1236 


 1254 




























































































































    8 
















































































   34 
















   34 




















































































  202 




  202 














 1210 















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/tomoyo.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/lsm_hooks.h>
#include <uapi/linux/lsm.h>
#include "common.h"

/**
 * tomoyo_domain - Get "struct tomoyo_domain_info" for current thread.
 *
 * Returns pointer to "struct tomoyo_domain_info" for current thread.
 */
struct tomoyo_domain_info *tomoyo_domain(void)
{
        struct tomoyo_task *s = tomoyo_task(current);

        if (s->old_domain_info && !current->in_execve) {
                atomic_dec(&s->old_domain_info->users);
                s->old_domain_info = NULL;
        }
        return s->domain_info;
}

/**
 * tomoyo_cred_prepare - Target for security_prepare_creds().
 *
 * @new: Pointer to "struct cred".
 * @old: Pointer to "struct cred".
 * @gfp: Memory allocation flags.
 *
 * Returns 0.
 */
static int tomoyo_cred_prepare(struct cred *new, const struct cred *old,
                               gfp_t gfp)
{
        struct tomoyo_task *task_blob = tomoyo_task(current);

        /* Nothing was saved, or the thread is still inside execve(). */
        if (!task_blob->old_domain_info || current->in_execve)
                return 0;

        /*
         * Restore old_domain_info saved by previous execve() request: drop
         * the reference on the current domain and switch back.
         */
        atomic_dec(&task_blob->domain_info->users);
        task_blob->domain_info = task_blob->old_domain_info;
        task_blob->old_domain_info = NULL;
        return 0;
}

/**
 * tomoyo_bprm_committed_creds - Target for security_bprm_committed_creds().
 *
 * @bprm: Pointer to "struct linux_binprm".
 */
static void tomoyo_bprm_committed_creds(const struct linux_binprm *bprm)
{
        /* Clear old_domain_info saved by execve() request. */
        struct tomoyo_task *s = tomoyo_task(current);

        atomic_dec(&s->old_domain_info->users);
        s->old_domain_info = NULL;
}

#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
/**
 * tomoyo_bprm_creds_for_exec - Target for security_bprm_creds_for_exec().
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * Returns 0.
 */
static int tomoyo_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        /* Policy has already been loaded: nothing to do. */
        if (tomoyo_policy_loaded)
                return 0;

        /*
         * Load policy if /sbin/tomoyo-init exists and /sbin/init is requested
         * for the first time.
         */
        tomoyo_load_policy(bprm->filename);
        return 0;
}
#endif

/**
 * tomoyo_bprm_check_security - Target for security_bprm_check().
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_bprm_check_security(struct linux_binprm *bprm)
{
        struct tomoyo_task *task_blob = tomoyo_task(current);
        int lock_idx;
        int rc;

        /*
         * Read permission is checked against interpreters using next domain.
         * This path is taken once an old domain has already been saved.
         */
        if (task_blob->old_domain_info)
                return tomoyo_check_open_permission(task_blob->domain_info,
                                                    &bprm->file->f_path,
                                                    O_RDONLY);

        /*
         * Execute permission is checked against pathname passed to execve()
         * using current domain.
         */
        lock_idx = tomoyo_read_lock();
        rc = tomoyo_find_next_domain(bprm);
        tomoyo_read_unlock(lock_idx);
        return rc;
}

/**
 * tomoyo_inode_getattr - Target for security_inode_getattr().
 *
 * @path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_inode_getattr(const struct path *path)
{
        /* Single-path check for the GETATTR operation; third argument is NULL. */
        return tomoyo_path_perm(TOMOYO_TYPE_GETATTR, path, NULL);
}

/**
 * tomoyo_path_truncate - Target for security_path_truncate().
 *
 * @path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_truncate(const struct path *path)
{
        /* Single-path check for the TRUNCATE operation; third argument is NULL. */
        return tomoyo_path_perm(TOMOYO_TYPE_TRUNCATE, path, NULL);
}

/**
 * tomoyo_file_truncate - Target for security_file_truncate().
 *
 * @file: Pointer to "struct file".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_truncate(struct file *file)
{
        /* Same check as the path-based hook, applied to the open file's path. */
        return tomoyo_path_truncate(&file->f_path);
}

/**
 * tomoyo_path_unlink - Target for security_path_unlink().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_unlink(const struct path *parent, struct dentry *dentry)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_UNLINK, &path, NULL);
}

/**
 * tomoyo_path_mkdir - Target for security_path_mkdir().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 * @mode:   DAC permission mode.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_mkdir(const struct path *parent, struct dentry *dentry,
                             umode_t mode)
{
        /* Path of the directory being created: parent's mount plus @dentry. */
        struct path target = { .dentry = dentry, .mnt = parent->mnt };
        /* Only the permission bits of @mode take part in the check. */
        const unsigned int perm = mode & S_IALLUGO;

        return tomoyo_path_number_perm(TOMOYO_TYPE_MKDIR, &target, perm);
}

/**
 * tomoyo_path_rmdir - Target for security_path_rmdir().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_rmdir(const struct path *parent, struct dentry *dentry)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_RMDIR, &path, NULL);
}

/**
 * tomoyo_path_symlink - Target for security_path_symlink().
 *
 * @parent:   Pointer to "struct path".
 * @dentry:   Pointer to "struct dentry".
 * @old_name: Symlink's content.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_symlink(const struct path *parent, struct dentry *dentry,
                               const char *old_name)
{
        struct path path = { .mnt = parent->mnt, .dentry = dentry };

        return tomoyo_path_perm(TOMOYO_TYPE_SYMLINK, &path, old_name);
}

/**
 * tomoyo_path_mknod - Target for security_path_mknod().
 *
 * @parent: Pointer to "struct path".
 * @dentry: Pointer to "struct dentry".
 * @mode:   DAC permission mode.
 * @dev:    Device attributes.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_mknod(const struct path *parent, struct dentry *dentry,
                             umode_t mode, unsigned int dev)
{
        struct path target = { .dentry = dentry, .mnt = parent->mnt };
        /* Only the permission bits of @mode take part in the check. */
        const unsigned int perm = mode & S_IALLUGO;

        switch (mode & S_IFMT) {
        case S_IFCHR:
                /* Character and block nodes are checked together with @dev. */
                return tomoyo_mkdev_perm(TOMOYO_TYPE_MKCHAR, &target, perm,
                                         dev);
        case S_IFBLK:
                return tomoyo_mkdev_perm(TOMOYO_TYPE_MKBLOCK, &target, perm,
                                         dev);
        case S_IFIFO:
                return tomoyo_path_number_perm(TOMOYO_TYPE_MKFIFO, &target,
                                               perm);
        case S_IFSOCK:
                return tomoyo_path_number_perm(TOMOYO_TYPE_MKSOCK, &target,
                                               perm);
        default:
                /* Every other file type falls back to TOMOYO_TYPE_CREATE. */
                return tomoyo_path_number_perm(TOMOYO_TYPE_CREATE, &target,
                                               perm);
        }
}

/**
 * tomoyo_path_link - Target for security_path_link().
 *
 * @old_dentry: Pointer to "struct dentry".
 * @new_dir:    Pointer to "struct path".
 * @new_dentry: Pointer to "struct dentry".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_link(struct dentry *old_dentry, const struct path *new_dir,
                            struct dentry *new_dentry)
{
        struct path path1 = { .mnt = new_dir->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_dir->mnt, .dentry = new_dentry };

        return tomoyo_path2_perm(TOMOYO_TYPE_LINK, &path1, &path2);
}

/**
 * tomoyo_path_rename - Target for security_path_rename().
 *
 * @old_parent: Pointer to "struct path".
 * @old_dentry: Pointer to "struct dentry".
 * @new_parent: Pointer to "struct path".
 * @new_dentry: Pointer to "struct dentry".
 * @flags: Rename options.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_rename(const struct path *old_parent,
                              struct dentry *old_dentry,
                              const struct path *new_parent,
                              struct dentry *new_dentry,
                              const unsigned int flags)
{
        struct path path1 = { .mnt = old_parent->mnt, .dentry = old_dentry };
        struct path path2 = { .mnt = new_parent->mnt, .dentry = new_dentry };

        if (flags & RENAME_EXCHANGE) {
                const int err = tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path2,
                                &path1);

                if (err)
                        return err;
        }
        return tomoyo_path2_perm(TOMOYO_TYPE_RENAME, &path1, &path2);
}

/**
 * tomoyo_file_fcntl - Target for security_file_fcntl().
 *
 * @file: Pointer to "struct file".
 * @cmd:  Command for fcntl().
 * @arg:  Argument for @cmd.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_fcntl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        /* Only F_SETFL is of interest here. */
        if (cmd != F_SETFL)
                return 0;
        /* ...and only when it toggles O_APPEND relative to the file's flags. */
        if (!((arg ^ file->f_flags) & O_APPEND))
                return 0;
        /* Checked as a write open carrying the requested O_APPEND state. */
        return tomoyo_check_open_permission(tomoyo_domain(), &file->f_path,
                                            O_WRONLY | (arg & O_APPEND));
}

/**
 * tomoyo_file_open - Target for security_file_open().
 *
 * @f: Pointer to "struct file".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_open(struct file *f)
{
        /*
         * Illogically, FMODE_EXEC is in f_flags, not f_mode.  Opens that do
         * not come from execve() are checked with the file's own open flags.
         */
        if (!(f->f_flags & __FMODE_EXEC))
                return tomoyo_check_open_permission(tomoyo_domain(),
                                                    &f->f_path, f->f_flags);

        /* Don't check read permission here if called from execve(). */
        return 0;
}

/**
 * tomoyo_file_ioctl - Target for security_file_ioctl().
 *
 * @file: Pointer to "struct file".
 * @cmd:  Command for ioctl().
 * @arg:  Argument for @cmd.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_file_ioctl(struct file *file, unsigned int cmd,
                             unsigned long arg)
{
        /* Only the command number is checked against the file's path; @arg is not consulted. */
        return tomoyo_path_number_perm(TOMOYO_TYPE_IOCTL, &file->f_path, cmd);
}

/**
 * tomoyo_path_chmod - Target for security_path_chmod().
 *
 * @path: Pointer to "struct path".
 * @mode: DAC permission mode.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chmod(const struct path *path, umode_t mode)
{
        /* Only the permission bits of @mode take part in the check. */
        const unsigned int perm = mode & S_IALLUGO;

        return tomoyo_path_number_perm(TOMOYO_TYPE_CHMOD, path, perm);
}

/**
 * tomoyo_path_chown - Target for security_path_chown().
 *
 * @path: Pointer to "struct path".
 * @uid:  Owner ID.
 * @gid:  Group ID.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        /* An invalid @uid means the owner is not part of this request. */
        if (uid_valid(uid)) {
                const int rc = tomoyo_path_number_perm(TOMOYO_TYPE_CHOWN, path,
                                                from_kuid(&init_user_ns, uid));

                /* A denied owner change ends the check before the group. */
                if (rc)
                        return rc;
        }
        /* Likewise, the group is only checked when @gid is valid. */
        if (gid_valid(gid))
                return tomoyo_path_number_perm(TOMOYO_TYPE_CHGRP, path,
                                               from_kgid(&init_user_ns, gid));
        return 0;
}

/**
 * tomoyo_path_chroot - Target for security_path_chroot().
 *
 * @path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_path_chroot(const struct path *path)
{
        /* Single-path check for the CHROOT operation; third argument is NULL. */
        return tomoyo_path_perm(TOMOYO_TYPE_CHROOT, path, NULL);
}

/**
 * tomoyo_sb_mount - Target for security_sb_mount().
 *
 * @dev_name: Name of device file. May be NULL.
 * @path:     Pointer to "struct path".
 * @type:     Name of filesystem type. May be NULL.
 * @flags:    Mount options.
 * @data:     Optional data. May be NULL.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_mount(const char *dev_name, const struct path *path,
                           const char *type, unsigned long flags, void *data)
{
        /* Pure pass-through: every argument is forwarded unchanged. */
        return tomoyo_mount_permission(dev_name, path, type, flags, data);
}

/**
 * tomoyo_sb_umount - Target for security_sb_umount().
 *
 * @mnt:   Pointer to "struct vfsmount".
 * @flags: Unmount options.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_umount(struct vfsmount *mnt, int flags)
{
        struct path path = { .mnt = mnt, .dentry = mnt->mnt_root };

        return tomoyo_path_perm(TOMOYO_TYPE_UMOUNT, &path, NULL);
}

/**
 * tomoyo_sb_pivotroot - Target for security_sb_pivotroot().
 *
 * @old_path: Pointer to "struct path".
 * @new_path: Pointer to "struct path".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_sb_pivotroot(const struct path *old_path, const struct path *new_path)
{
        /* Note the argument order: @new_path is passed first, @old_path second. */
        return tomoyo_path2_perm(TOMOYO_TYPE_PIVOT_ROOT, new_path, old_path);
}

/**
 * tomoyo_socket_listen - Check permission for listen().
 *
 * @sock:    Pointer to "struct socket".
 * @backlog: Backlog parameter.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_listen(struct socket *sock, int backlog)
{
        /* Only the socket is checked; @backlog is not forwarded. */
        return tomoyo_socket_listen_permission(sock);
}

/**
 * tomoyo_socket_connect - Check permission for connect().
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_connect(struct socket *sock, struct sockaddr *addr,
                                 int addr_len)
{
        /* Pure pass-through: every argument is forwarded unchanged. */
        return tomoyo_socket_connect_permission(sock, addr, addr_len);
}

/**
 * tomoyo_socket_bind - Check permission for bind().
 *
 * @sock:     Pointer to "struct socket".
 * @addr:     Pointer to "struct sockaddr".
 * @addr_len: Size of @addr.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_bind(struct socket *sock, struct sockaddr *addr,
                              int addr_len)
{
        /* Pure pass-through: every argument is forwarded unchanged. */
        return tomoyo_socket_bind_permission(sock, addr, addr_len);
}

/**
 * tomoyo_socket_sendmsg - Check permission for sendmsg().
 *
 * @sock: Pointer to "struct socket".
 * @msg:  Pointer to "struct msghdr".
 * @size: Size of message.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                 int size)
{
        /* Pure pass-through: every argument is forwarded unchanged. */
        return tomoyo_socket_sendmsg_permission(sock, msg, size);
}

/*
 * Blob size request referenced by the LSM definition at the end of this
 * file: one "struct tomoyo_task" per task.
 */
struct lsm_blob_sizes tomoyo_blob_sizes __ro_after_init = {
        .lbs_task = sizeof(struct tomoyo_task),
};

/**
 * tomoyo_task_alloc - Target for security_task_alloc().
 *
 * @task:        Pointer to "struct task_struct".
 * @clone_flags: clone() flags.
 *
 * Returns 0.
 */
static int tomoyo_task_alloc(struct task_struct *task,
                             unsigned long clone_flags)
{
        struct tomoyo_task *old = tomoyo_task(current);
        struct tomoyo_task *new = tomoyo_task(task);

        new->domain_info = old->domain_info;
        atomic_inc(&new->domain_info->users);
        new->old_domain_info = NULL;
        return 0;
}

/**
 * tomoyo_task_free - Target for security_task_free().
 *
 * @task: Pointer to "struct task_struct".
 */
static void tomoyo_task_free(struct task_struct *task)
{
        struct tomoyo_task *s = tomoyo_task(task);

        if (s->domain_info) {
                atomic_dec(&s->domain_info->users);
                s->domain_info = NULL;
        }
        if (s->old_domain_info) {
                atomic_dec(&s->old_domain_info->users);
                s->old_domain_info = NULL;
        }
}

/* LSM identity passed to security_add_hooks() in tomoyo_init(). */
static const struct lsm_id tomoyo_lsmid = {
        .name = "tomoyo",
        .id = LSM_ID_TOMOYO,
};

/*
 * tomoyo_hooks is used for registering TOMOYO.
 *
 * Each entry binds one LSM hook name to the handler defined in this file.
 * The whole table is registered by tomoyo_init().
 */
static struct security_hook_list tomoyo_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(cred_prepare, tomoyo_cred_prepare),
        LSM_HOOK_INIT(bprm_committed_creds, tomoyo_bprm_committed_creds),
        LSM_HOOK_INIT(task_alloc, tomoyo_task_alloc),
        LSM_HOOK_INIT(task_free, tomoyo_task_free),
        /* The policy-loading hook exists only when the userspace loader is kept. */
#ifndef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        LSM_HOOK_INIT(bprm_creds_for_exec, tomoyo_bprm_creds_for_exec),
#endif
        LSM_HOOK_INIT(bprm_check_security, tomoyo_bprm_check_security),
        LSM_HOOK_INIT(file_fcntl, tomoyo_file_fcntl),
        LSM_HOOK_INIT(file_open, tomoyo_file_open),
        LSM_HOOK_INIT(file_truncate, tomoyo_file_truncate),
        LSM_HOOK_INIT(path_truncate, tomoyo_path_truncate),
        LSM_HOOK_INIT(path_unlink, tomoyo_path_unlink),
        LSM_HOOK_INIT(path_mkdir, tomoyo_path_mkdir),
        LSM_HOOK_INIT(path_rmdir, tomoyo_path_rmdir),
        LSM_HOOK_INIT(path_symlink, tomoyo_path_symlink),
        LSM_HOOK_INIT(path_mknod, tomoyo_path_mknod),
        LSM_HOOK_INIT(path_link, tomoyo_path_link),
        LSM_HOOK_INIT(path_rename, tomoyo_path_rename),
        LSM_HOOK_INIT(inode_getattr, tomoyo_inode_getattr),
        /* Both ioctl hooks share the same handler. */
        LSM_HOOK_INIT(file_ioctl, tomoyo_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, tomoyo_file_ioctl),
        LSM_HOOK_INIT(path_chmod, tomoyo_path_chmod),
        LSM_HOOK_INIT(path_chown, tomoyo_path_chown),
        LSM_HOOK_INIT(path_chroot, tomoyo_path_chroot),
        LSM_HOOK_INIT(sb_mount, tomoyo_sb_mount),
        LSM_HOOK_INIT(sb_umount, tomoyo_sb_umount),
        LSM_HOOK_INIT(sb_pivotroot, tomoyo_sb_pivotroot),
        LSM_HOOK_INIT(socket_bind, tomoyo_socket_bind),
        LSM_HOOK_INIT(socket_connect, tomoyo_socket_connect),
        LSM_HOOK_INIT(socket_listen, tomoyo_socket_listen),
        LSM_HOOK_INIT(socket_sendmsg, tomoyo_socket_sendmsg),
};

/* Lock for GC. */
DEFINE_SRCU(tomoyo_ss);

/* Enable flag referenced by the LSM definition's .enabled field; defaults to 1. */
int tomoyo_enabled __ro_after_init = 1;

/**
 * tomoyo_init - Register TOMOYO Linux as a LSM module.
 *
 * Returns 0.
 */
static int __init tomoyo_init(void)
{
        /* Security blob of the task executing tomoyo_init(). */
        struct tomoyo_task *s = tomoyo_task(current);

        /* register ourselves with the security framework */
        security_add_hooks(tomoyo_hooks, ARRAY_SIZE(tomoyo_hooks),
                           &tomoyo_lsmid);
        pr_info("TOMOYO Linux initialized\n");
        /*
         * Place the current task in the kernel domain, take one reference
         * on that domain, and start with no saved old domain.
         */
        s->domain_info = &tomoyo_kernel_domain;
        atomic_inc(&tomoyo_kernel_domain.users);
        s->old_domain_info = NULL;
        tomoyo_mm_init();

        return 0;
}

/*
 * LSM descriptor for TOMOYO: ties together the module name, its enable
 * flag, the blob size request and the init routine defined above.
 */
DEFINE_LSM(tomoyo) = {
        .name = "tomoyo",
        .enabled = &tomoyo_enabled,
        .flags = LSM_FLAG_LEGACY_MAJOR,
        .blobs = &tomoyo_blob_sizes,
        .init = tomoyo_init,
};










  274 

















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/interval_tree.h>
#include <linux/interval_tree_generic.h>
#include <linux/compiler.h>
#include <linux/export.h>

/* Accessors for the first and last index of a node's interval. */
#define START(node) ((node)->start)
#define LAST(node)  ((node)->last)

/*
 * Instantiate the generic interval tree for "struct interval_tree_node" with
 * unsigned long indexes.  The generated functions carry the interval_tree
 * prefix (see the exports below).
 * NOTE(review): the empty argument before the prefix presumably leaves the
 * generated functions non-static so they can be exported - confirm against
 * interval_tree_generic.h.
 */
INTERVAL_TREE_DEFINE(struct interval_tree_node, rb,
                     unsigned long, __subtree_last,
                     START, LAST,, interval_tree)

EXPORT_SYMBOL_GPL(interval_tree_insert);
EXPORT_SYMBOL_GPL(interval_tree_remove);
EXPORT_SYMBOL_GPL(interval_tree_iter_first);
EXPORT_SYMBOL_GPL(interval_tree_iter_next);

#ifdef CONFIG_INTERVAL_TREE_SPAN_ITER
/*
 * Roll nodes[1] into nodes[0] by advancing nodes[1] to the end of a contiguous
 * span of nodes. This makes nodes[0]->last the end of that contiguous used span
 * of indexes that started at the original nodes[1]->start.
 *
 * If there is an interior hole, nodes[1] is now the first node starting the
 * next used span. A hole span is between nodes[0]->last and nodes[1]->start.
 *
 * If there is a trailing hole, nodes[1] is now NULL. A hole span is between
 * nodes[0]->last and last_index.
 *
 * If the contiguous used range spans to last_index, nodes[1] is set to NULL.
 */
static void
interval_tree_span_iter_next_gap(struct interval_tree_span_iter *state)
{
        /* Node with the largest ->last seen so far in the contiguous run. */
        struct interval_tree_node *furthest = state->nodes[1];
        struct interval_tree_node *next = furthest;

        for (;;) {
                if (next->last > furthest->last)
                        furthest = next;
                next = interval_tree_iter_next(next, state->first_index,
                                               state->last_index);
                /* Ran out of nodes intersecting the iterated range. */
                if (!next)
                        break;
                /*
                 * Stop at the first node that neither overlaps nor directly
                 * abuts the run collected so far: a hole precedes it.
                 */
                if (furthest->last < next->start &&
                    furthest->last + 1 != next->start)
                        break;
        }
        state->nodes[0] = furthest;
        state->nodes[1] = next;
}

void interval_tree_span_iter_first(struct interval_tree_span_iter *iter,
                                   struct rb_root_cached *itree,
                                   unsigned long first_index,
                                   unsigned long last_index)
{
        iter->first_index = first_index;
        iter->last_index = last_index;
        iter->nodes[0] = NULL;
        iter->nodes[1] =
                interval_tree_iter_first(itree, first_index, last_index);
        if (!iter->nodes[1]) {
                /* No nodes intersect the span, whole span is hole */
                iter->start_hole = first_index;
                iter->last_hole = last_index;
                iter->is_hole = 1;
                return;
        }
        if (iter->nodes[1]->start > first_index) {
                /* Leading hole on first iteration */
                iter->start_hole = first_index;
                iter->last_hole = iter->nodes[1]->start - 1;
                iter->is_hole = 1;
                interval_tree_span_iter_next_gap(iter);
                return;
        }

        /* Starting inside a used */
        iter->start_used = first_index;
        iter->is_hole = 0;
        interval_tree_span_iter_next_gap(iter);
        iter->last_used = iter->nodes[0]->last;
        if (iter->last_used >= last_index) {
                iter->last_used = last_index;
                iter->nodes[0] = NULL;
                iter->nodes[1] = NULL;
        }
}
EXPORT_SYMBOL_GPL(interval_tree_span_iter_first);

void interval_tree_span_iter_next(struct interval_tree_span_iter *iter)
{
        if (!iter->nodes[0] && !iter->nodes[1]) {
                iter->is_hole = -1;
                return;
        }

        if (iter->is_hole) {
                iter->start_used = iter->last_hole + 1;
                iter->last_used = iter->nodes[0]->last;
                if (iter->last_used >= iter->last_index) {
                        iter->last_used = iter->last_index;
                        iter->nodes[0] = NULL;
                        iter->nodes[1] = NULL;
                }
                iter->is_hole = 0;
                return;
        }

        if (!iter->nodes[1]) {
                /* Trailing hole */
                iter->start_hole = iter->nodes[0]->last + 1;
                iter->last_hole = iter->last_index;
                iter->nodes[0] = NULL;
                iter->is_hole = 1;
                return;
        }

        /* must have both nodes[0] and [1], interior hole */
        iter->start_hole = iter->nodes[0]->last + 1;
        iter->last_hole = iter->nodes[1]->start - 1;
        iter->is_hole = 1;
        interval_tree_span_iter_next_gap(iter);
}
EXPORT_SYMBOL_GPL(interval_tree_span_iter_next);

/*
 * Advance the iterator index to a specific position. The returned used/hole is
 * updated to start at new_index. This is faster than calling
 * interval_tree_span_iter_first() as it can avoid full searches in several
 * cases where the iterator is already set.
 */
void interval_tree_span_iter_advance(struct interval_tree_span_iter *iter,
                                     struct rb_root_cached *itree,
                                     unsigned long new_index)
{
        /* An iterator that has already finished stays finished. */
        if (iter->is_hole == -1)
                return;

        iter->first_index = new_index;
        /* Advancing past the end of the range finishes the iteration. */
        if (new_index > iter->last_index) {
                iter->is_hole = -1;
                return;
        }

        /*
         * Rely on the union aliasing hole/used: if new_index falls inside
         * the current span, only its start has to move.
         */
        if (new_index >= iter->start_hole && new_index <= iter->last_hole) {
                iter->start_hole = new_index;
                return;
        }

        /* Anywhere other than right after the current span: search again. */
        if (new_index != iter->last_hole + 1) {
                interval_tree_span_iter_first(iter, itree, new_index,
                                              iter->last_index);
                return;
        }

        /* Exactly one past the current span: a plain step is enough. */
        interval_tree_span_iter_next(iter);
}
EXPORT_SYMBOL_GPL(interval_tree_span_iter_advance);
#endif


















































   34 













































   34 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM signal

#if !defined(_TRACE_SIGNAL_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SIGNAL_H

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>

/*
 * TP_STORE_SIGINFO - record a signal's errno and code in a trace entry.
 * @__entry: pointer to a trace entry with 'errno' and 'code' fields.
 * @info:    siginfo pointer, or one of the SEND_SIG_NOINFO / SEND_SIG_PRIV
 *           sentinels, which are compared against but never dereferenced.
 *
 * SEND_SIG_NOINFO is recorded as errno 0 / SI_USER, SEND_SIG_PRIV as
 * errno 0 / SI_KERNEL; any other value is dereferenced and its si_errno and
 * si_code are copied.
 *
 * Both arguments are parenthesized at every use so that expression
 * arguments (e.g. "cond ? a : b") bind correctly against "==" and "->".
 * Note that @info is still evaluated more than once, so it must not have
 * side effects.
 */
#define TP_STORE_SIGINFO(__entry, info)                                \
        do {                                                        \
                if ((info) == SEND_SIG_NOINFO) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code        = SI_USER;        \
                } else if ((info) == SEND_SIG_PRIV) {                \
                        (__entry)->errno        = 0;                \
                        (__entry)->code        = SI_KERNEL;        \
                } else {                                        \
                        (__entry)->errno        = (info)->si_errno;        \
                        (__entry)->code        = (info)->si_code;        \
                }                                                \
        } while (0)

/* Skipped when this header is re-read with TRACE_HEADER_MULTI_READ defined. */
#ifndef TRACE_HEADER_MULTI_READ
/* Values for the 'result' argument of the signal_generate tracepoint. */
enum {
        TRACE_SIGNAL_DELIVERED,
        TRACE_SIGNAL_IGNORED,
        TRACE_SIGNAL_ALREADY_PENDING,
        TRACE_SIGNAL_OVERFLOW_FAIL,
        TRACE_SIGNAL_LOSE_INFO,
};
#endif

/**
 * signal_generate - called when a signal is generated
 * @sig: signal number
 * @info: pointer to struct siginfo
 * @task: pointer to struct task_struct
 * @group: shared or private
 * @result: TRACE_SIGNAL_*
 *
 * Current process sends a 'sig' signal to 'task' process with
 * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV,
 * 'info' is not a pointer and you can't access its field. Instead,
 * SEND_SIG_NOINFO means that si_code is SI_USER, and SEND_SIG_PRIV
 * means that si_code is SI_KERNEL.
 */
TRACE_EVENT(signal_generate,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct task_struct *task,
                        int group, int result),

        TP_ARGS(sig, info, task, group, result),

        /* Fields recorded for every signal_generate event. */
        TP_STRUCT__entry(
                __field(        int,        sig                        )
                __field(        int,        errno                        )
                __field(        int,        code                        )
                __array(        char,        comm,        TASK_COMM_LEN        )
                __field(        pid_t,        pid                        )
                __field(        int,        group                        )
                __field(        int,        result                        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* Fills errno and code; SEND_SIG_* sentinels are not dereferenced. */
                TP_STORE_SIGINFO(__entry, info);
                /* Fixed-size copy of the target task's comm. */
                memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
                __entry->pid        = task->pid;
                __entry->group        = group;
                __entry->result        = result;
        ),

        TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->comm, __entry->pid, __entry->group,
                  __entry->result)
);

/**
 * signal_deliver - called when a signal is delivered
 * @sig: signal number
 * @info: pointer to struct siginfo
 * @ka: pointer to struct k_sigaction
 *
 * A 'sig' signal is delivered to current process with 'info' siginfo,
 * and it will be handled by 'ka'. ka->sa.sa_handler can be SIG_IGN or
 * SIG_DFL.
 * Note that some signals reported by signal_generate tracepoint can be
 * lost, ignored or modified (by debugger) before hitting this tracepoint.
 * This means, this can show which signals are actually delivered, but
 * matching generated signals and delivered signals may not be correct.
 */
TRACE_EVENT(signal_deliver,

        TP_PROTO(int sig, struct kernel_siginfo *info, struct k_sigaction *ka),

        TP_ARGS(sig, info, ka),

        /* Fields recorded for every signal_deliver event. */
        TP_STRUCT__entry(
                __field(        int,                sig                )
                __field(        int,                errno                )
                __field(        int,                code                )
                __field(        unsigned long,        sa_handler        )
                __field(        unsigned long,        sa_flags        )
        ),

        TP_fast_assign(
                __entry->sig        = sig;
                /* Fills errno and code; SEND_SIG_* sentinels are not dereferenced. */
                TP_STORE_SIGINFO(__entry, info);
                /* The handler is stored as an integer and printed in hex below. */
                __entry->sa_handler        = (unsigned long)ka->sa.sa_handler;
                __entry->sa_flags        = ka->sa.sa_flags;
        ),

        TP_printk("sig=%d errno=%d code=%d sa_handler=%lx sa_flags=%lx",
                  __entry->sig, __entry->errno, __entry->code,
                  __entry->sa_handler, __entry->sa_flags)
);

#endif /* _TRACE_SIGNAL_H */

/* This part must be outside protection */
#include <trace/define_trace.h>














































































































































































































































    1 






    1 







































































    1 







    1 





























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
// SPDX-License-Identifier: GPL-2.0
/* Watch queue and general notification mechanism, built on pipes
 *
 * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 *
 * See Documentation/core-api/watch_queue.rst
 */

#define pr_fmt(fmt) "watchq: " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/printk.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/poll.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/file.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/sched/signal.h>
#include <linux/watch_queue.h>
#include <linux/pipe_fs_i.h>

MODULE_DESCRIPTION("Watch queue");
MODULE_AUTHOR("Red Hat, Inc.");

#define WATCH_QUEUE_NOTE_SIZE 128
#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)

/*
 * Take the watch queue lock and confirm the queue is still attached to a
 * pipe.
 *
 * The caller must hold the RCU read lock so that @wqueue itself cannot go
 * away underneath us.  Once the spinlock is held, a non-NULL ->pipe means
 * the queue has not been torn down, and hence that the notification pipe
 * still exists.
 *
 * Returns true with the lock held, or false with the lock already released
 * if the queue has been detached from its pipe.
 */
static inline bool lock_wqueue(struct watch_queue *wqueue)
{
        spin_lock_bh(&wqueue->lock);
        if (likely(wqueue->pipe))
                return true;

        /* Queue already cleared: back out so the caller holds nothing. */
        spin_unlock_bh(&wqueue->lock);
        return false;
}

/*
 * Release the watch queue lock taken by a successful lock_wqueue().
 */
static inline void unlock_wqueue(struct watch_queue *wqueue)
{
        spin_unlock_bh(&wqueue->lock);
}

/*
 * Release a pipe buffer that carried a notification: mark the note slot it
 * occupied as free again in the queue's bitmap, then hand the buffer to the
 * generic pipe release helper.
 */
static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
                                         struct pipe_buffer *buf)
{
        struct watch_queue *wqueue = (struct watch_queue *)buf->private;
        unsigned int end = buf->offset + buf->len;
        unsigned int slot;

        /* Work the slot out from the END of the record rather than by
         * masking the offset: a maximum-size note ends exactly on the next
         * slot boundary, so step back one slot in that case.  A note is
         * never zero-length, so the end always lies beyond the slot start.
         */
        if (!(end & (WATCH_QUEUE_NOTE_SIZE - 1)))
                end -= WATCH_QUEUE_NOTE_SIZE;

        /* page->private holds the index of the first note in this page. */
        slot = end / WATCH_QUEUE_NOTE_SIZE + buf->page->private;

        set_bit(slot, wqueue->notes_bitmap);
        generic_pipe_buf_release(pipe, buf);
}

// No try_steal function => no stealing
#define watch_queue_pipe_buf_try_steal NULL

/*
 * Operations for pipe buffers that carry watch notifications; installed on
 * each buffer by post_one_notification().
 *
 * New data written to a pipe may be appended to a buffer with this type.
 */
static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
        /* Returns the note slot to the free bitmap before releasing. */
        .release        = watch_queue_pipe_buf_release,
        .try_steal        = watch_queue_pipe_buf_try_steal,
        .get                = generic_pipe_buf_get,
};

/*
 * Post a notification to a watch queue.
 *
 * Must be called with the RCU lock for reading, and the
 * watch_queue lock held, which guarantees that the pipe
 * hasn't been released.
 *
 * The record is copied into a free preallocated note slot and a pipe buffer
 * referring to that slot is pushed at the head of the pipe ring.
 *
 * Returns true if the notification was queued.  Returns false if the ring
 * was full or no note slot was free; in that case the buffer just behind
 * the head is flagged with PIPE_BUF_FLAG_LOSS instead.
 */
static bool post_one_notification(struct watch_queue *wqueue,
                                  struct watch_notification *n)
{
        void *p;
        struct pipe_inode_info *pipe = wqueue->pipe;
        struct pipe_buffer *buf;
        struct page *page;
        unsigned int head, tail, note, offset, len;
        bool done = false;

        /* The ring indices are sampled and advanced under rd_wait.lock. */
        spin_lock_irq(&pipe->rd_wait.lock);

        head = pipe->head;
        tail = pipe->tail;
        if (pipe_full(head, tail, pipe->ring_size))
                goto lost;

        /* A set bit in notes_bitmap marks a free note slot. */
        note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
        if (note >= wqueue->nr_notes)
                goto lost;

        /* Translate the slot number into a page and a byte offset within it. */
        page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
        offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
        /* Extra page reference for the pipe buffer that will point at it. */
        get_page(page);
        /* The record length is carried in the info word. */
        len = n->info & WATCH_INFO_LENGTH;
        p = kmap_atomic(page);
        memcpy(p + offset, n, len);
        kunmap_atomic(p);

        /* Fill in the ring slot completely before publishing the new head. */
        buf = pipe_buf(pipe, head);
        buf->page = page;
        buf->private = (unsigned long)wqueue;
        buf->ops = &watch_queue_pipe_buf_ops;
        buf->offset = offset;
        buf->len = len;
        buf->flags = PIPE_BUF_FLAG_WHOLE;
        smp_store_release(&pipe->head, head + 1); /* vs pipe_read() */

        /* The slot was found free above under the same locks, so it must
         * still be free here; anything else is treated as fatal.
         */
        if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
                spin_unlock_irq(&pipe->rd_wait.lock);
                BUG();
        }
        wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        done = true;

out:
        spin_unlock_irq(&pipe->rd_wait.lock);
        /* SIGIO delivery happens only after rd_wait.lock has been dropped. */
        if (done)
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        return done;

lost:
        /* Could not queue: flag the buffer just behind the head as having
         * been followed by a lost notification.
         */
        buf = pipe_buf(pipe, head - 1);
        buf->flags |= PIPE_BUF_FLAG_LOSS;
        goto out;
}

/*
 * Decide whether a notification passes a watch queue's filter.
 *
 * The notification is accepted only if its type is enabled in the filter's
 * type bitmap and at least one rule matches its type, its subtype bit and
 * its masked info word.  Returns true to accept, false to reject.
 */
static bool filter_watch_notification(const struct watch_filter *wf,
                                      const struct watch_notification *n)
{
        const struct watch_type_filter *rule;
        unsigned int word_bits = sizeof(rule->subtype_filter[0]) * 8;
        /*
         * NOTE(review): 'word' is derived from n->subtype with no bound
         * check against the length of subtype_filter[]; confirm that the
         * subtype range always fits the array.
         */
        unsigned int word = n->subtype / word_bits;
        unsigned int mask = 1U << (n->subtype % word_bits);
        int idx;

        /* Cheap rejection: no rule at all covers this type. */
        if (!test_bit(n->type, wf->type_filter))
                return false;

        for (idx = 0; idx < wf->nr_filters; idx++) {
                rule = &wf->filters[idx];

                if (n->type != rule->type)
                        continue;
                if (!(rule->subtype_filter[word] & mask))
                        continue;
                if ((n->info & rule->info_mask) != rule->info_filter)
                        continue;
                return true;
        }

        /* A filter is installed and nothing matched, so reject. */
        return false;
}

/**
 * __post_watch_notification - Post an event notification
 * @wlist: The watch list to post the event to.
 * @n: The notification record to post.
 * @cred: The creds of the process that triggered the notification.
 * @id: The ID to match on the watch.
 *
 * Walk the watches on @wlist under the RCU read lock and, for each watch
 * whose ID equals @id, deliver @n to that watch's queue provided the queue's
 * filter (if any) and the security hook both allow it.
 *
 * The record length must be encoded in n->info under WATCH_INFO_LENGTH and
 * must be non-zero; a zero length triggers a warning and nothing is posted.
 * The WATCH_INFO_ID bits of n->info are overwritten for each matching watch.
 */
void __post_watch_notification(struct watch_list *wlist,
                               struct watch_notification *n,
                               const struct cred *cred,
                               u64 id)
{
        const struct watch_filter *filter;
        struct watch_queue *queue;
        struct watch *w;

        if (!((n->info & WATCH_INFO_LENGTH) >> WATCH_INFO_LENGTH__SHIFT)) {
                WARN_ON(1);
                return;
        }

        rcu_read_lock();

        hlist_for_each_entry_rcu(w, &wlist->watchers, list_node) {
                if (w->id != id)
                        continue;

                /* Stamp the record with this watch's own info ID. */
                n->info = (n->info & ~WATCH_INFO_ID) | w->info_id;

                queue = rcu_dereference(w->queue);
                filter = rcu_dereference(queue->filter);
                if (filter && !filter_watch_notification(filter, n))
                        continue;

                if (security_post_notification(w->cred, cred, n) < 0)
                        continue;

                /* Skip queues that have already lost their pipe. */
                if (!lock_wqueue(queue))
                        continue;
                post_one_notification(queue, n);
                unlock_wqueue(queue);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL(__post_watch_notification);

/*
 * Allocate sufficient pages for preallocation of the requested number of
 * notifications.
 *
 * @pipe: the pipe whose watch queue is being sized
 * @nr_notes: requested number of note slots (1..512); rounded up to a whole
 *            number of pages' worth of notes
 *
 * Returns 0 on success, -ENODEV if the pipe has no watch queue, -EBUSY if
 * the queue already has notes allocated, -EINVAL for an out-of-range
 * request, -EPERM if the pipe-buffer limits are exceeded for an unprivileged
 * user, -ENOMEM on allocation failure, or the error from pipe_resize_ring().
 */
long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
{
        struct watch_queue *wqueue = pipe->watch_queue;
        struct page **pages;
        unsigned long *bitmap;
        unsigned long user_bufs;
        int ret, i, nr_pages;

        if (!wqueue)
                return -ENODEV;
        if (wqueue->notes)
                return -EBUSY;

        if (nr_notes < 1 ||
            nr_notes > 512) /* TODO: choose a better hard limit */
                return -EINVAL;

        /* Round the request up to whole pages of notes. */
        nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1);
        nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE;
        /* Charge the user for nr_pages in place of what the pipe had accounted. */
        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages);

        if (nr_pages > pipe->max_usage &&
            (too_many_pipe_buffers_hard(user_bufs) ||
             too_many_pipe_buffers_soft(user_bufs)) &&
            pipe_is_unprivileged_user()) {
                ret = -EPERM;
                goto error;
        }

        /* From here on nr_notes is the page-rounded slot count. */
        nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE;
        ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes));
        if (ret < 0)
                goto error;

        /*
         * pipe_resize_ring() does not update nr_accounted for watch_queue
         * pipes, because the above vastly overprovisions. Set nr_accounted
         * and max_usage on this pipe to the number that was actually charged
         * to the user above via account_pipe_buffers.
         */
        pipe->max_usage = nr_pages;
        pipe->nr_accounted = nr_pages;

        ret = -ENOMEM;
        pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
        if (!pages)
                goto error;

        for (i = 0; i < nr_pages; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
                if (!pages[i])
                        goto error_p;
                /* Remember the index of the first note held in this page. */
                pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE;
        }

        bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
        if (!bitmap)
                goto error_p;

        /* Every slot starts out free (bit set). */
        bitmap_fill(bitmap, nr_notes);
        wqueue->notes = pages;
        wqueue->notes_bitmap = bitmap;
        wqueue->nr_pages = nr_pages;
        wqueue->nr_notes = nr_notes;
        return 0;

error_p:
        /* Free only the pages allocated so far; i is one past the last. */
        while (--i >= 0)
                __free_page(pages[i]);
        kfree(pages);
error:
        /* Swap the charge from nr_pages back to the pipe's current nr_accounted. */
        (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted);
        return ret;
}

/*
 * Set the filter on a watch queue.
 *
 * @pipe: the pipe whose watch queue gets the filter
 * @_filter: userspace filter specification, or NULL to remove the filter
 *
 * The user's rules are validated and copied, rules naming unknown types are
 * dropped, and the resulting filter replaces the current one under the pipe
 * lock.  The previous filter, if any, is freed after an RCU grace period.
 *
 * Returns 0 on success, -ENODEV if the pipe has no watch queue, -EFAULT if
 * the header cannot be copied, -EINVAL for a malformed specification,
 * -ENOMEM on allocation failure, or the error from memdup_array_user().
 */
long watch_queue_set_filter(struct pipe_inode_info *pipe,
                            struct watch_notification_filter __user *_filter)
{
        struct watch_notification_type_filter *tf;
        struct watch_notification_filter filter;
        struct watch_type_filter *q;
        struct watch_filter *wfilter;
        struct watch_queue *wqueue = pipe->watch_queue;
        int ret, nr_filter = 0, i;

        if (!wqueue)
                return -ENODEV;

        if (!_filter) {
                /* Remove the old filter */
                wfilter = NULL;
                goto set;
        }

        /* Grab the user's filter specification */
        if (copy_from_user(&filter, _filter, sizeof(filter)) != 0)
                return -EFAULT;
        if (filter.nr_filters == 0 ||
            filter.nr_filters > 16 ||
            filter.__reserved != 0)
                return -EINVAL;

        /* Copy in the array of per-type rules that follows the header. */
        tf = memdup_array_user(_filter->filters, filter.nr_filters, sizeof(*tf));
        if (IS_ERR(tf))
                return PTR_ERR(tf);

        ret = -EINVAL;
        for (i = 0; i < filter.nr_filters; i++) {
                /* Reject rules that test info bits outside their own mask,
                 * or that try to filter on the record length field.
                 */
                if ((tf[i].info_filter & ~tf[i].info_mask) ||
                    tf[i].info_mask & WATCH_INFO_LENGTH)
                        goto err_filter;
                /* Ignore any unknown types */
                if (tf[i].type >= WATCH_TYPE__NR)
                        continue;
                nr_filter++;
        }

        /* Now we need to build the internal filter from only the relevant
         * user-specified filters.
         */
        ret = -ENOMEM;
        wfilter = kzalloc(struct_size(wfilter, filters, nr_filter), GFP_KERNEL);
        if (!wfilter)
                goto err_filter;
        wfilter->nr_filters = nr_filter;

        q = wfilter->filters;
        for (i = 0; i < filter.nr_filters; i++) {
                if (tf[i].type >= WATCH_TYPE__NR)
                        continue;

                q->type                        = tf[i].type;
                q->info_filter                = tf[i].info_filter;
                q->info_mask                = tf[i].info_mask;
                /* Only the first word of the user's subtype mask is kept. */
                q->subtype_filter[0]        = tf[i].subtype_filter[0];
                __set_bit(q->type, wfilter->type_filter);
                q++;
        }

        kfree(tf);
set:
        /* Swap the filter in under the pipe lock; wfilter then holds the
         * old filter (or NULL), which is freed after a grace period.
         */
        pipe_lock(pipe);
        wfilter = rcu_replace_pointer(wqueue->filter, wfilter,
                                      lockdep_is_held(&pipe->mutex));
        pipe_unlock(pipe);
        if (wfilter)
                kfree_rcu(wfilter, rcu);
        return 0;

err_filter:
        kfree(tf);
        return ret;
}

/*
 * kref release callback for a watch queue: free the preallocated note pages,
 * the page array and the slot bitmap, then free the filter (if one is set)
 * and the queue itself after an RCU grace period.
 */
static void __put_watch_queue(struct kref *kref)
{
        struct watch_queue *wq = container_of(kref, struct watch_queue, usage);
        struct watch_filter *filter = rcu_access_pointer(wq->filter);
        int pg = 0;

        while (pg < wq->nr_pages) {
                __free_page(wq->notes[pg]);
                pg++;
        }
        kfree(wq->notes);
        bitmap_free(wq->notes_bitmap);

        if (filter)
                kfree_rcu(filter, rcu);
        kfree_rcu(wq, rcu);
}

/**
 * put_watch_queue - Dispose of a ref on a watchqueue.
 * @wqueue: The watch queue to unref.
 *
 * Drops one reference on @wqueue->usage; when the last reference goes,
 * __put_watch_queue() releases the queue's resources.
 */
void put_watch_queue(struct watch_queue *wqueue)
{
        kref_put(&wqueue->usage, __put_watch_queue);
}
EXPORT_SYMBOL(put_watch_queue);

static void free_watch(struct rcu_head *rcu)
{
        struct watch *watch = container_of(rcu, struct watch, rcu);

        put_watch_queue(rcu_access_pointer(watch->queue));
        atomic_dec(&watch->cred->user->nr_watches);
        put_cred(watch->cred);
        kfree(watch);
}

/*
 * kref release callback for a watch: defer the actual teardown to
 * free_watch() via call_rcu().
 */
static void __put_watch(struct kref *kref)
{
        struct watch *watch = container_of(kref, struct watch, usage);

        call_rcu(&watch->rcu, free_watch);
}

/*
 * Discard a watch.
 *
 * Drops one reference on @watch->usage; __put_watch() runs when the last
 * reference is dropped.
 */
static void put_watch(struct watch *watch)
{
        kref_put(&watch->usage, __put_watch);
}

/**
 * init_watch - Initialise a watch
 * @watch: The watch to initialise.
 * @wqueue: The queue to assign.
 *
 * Prepare @watch for use: both of its list nodes are initialised as
 * unhashed, its refcount is initialised, and @wqueue is published as its
 * queue.  No reference is taken on @wqueue here.
 */
void init_watch(struct watch *watch, struct watch_queue *wqueue)
{
        INIT_HLIST_NODE(&watch->queue_node);
        INIT_HLIST_NODE(&watch->list_node);
        kref_init(&watch->usage);
        rcu_assign_pointer(watch->queue, wqueue);
}

/*
 * Link a watch into both an object's watch list and a watch queue.
 *
 * Fails with -EBUSY if @wlist already holds a watch with the same ID for
 * the same queue, and with -EAGAIN if the current user would exceed
 * RLIMIT_NOFILE watches.  On success the watch records the current
 * credentials and its watch list, and extra references are taken on both
 * the queue and the watch.  Returns 0 on success.
 */
static int add_one_watch(struct watch *watch, struct watch_list *wlist, struct watch_queue *wqueue)
{
        const struct cred *cred = current_cred();
        struct watch *existing;

        /* Refuse a duplicate (queue, id) pair on this list. */
        hlist_for_each_entry(existing, &wlist->watchers, list_node) {
                if (watch->id != existing->id)
                        continue;
                if (rcu_access_pointer(existing->queue) == wqueue)
                        return -EBUSY;
        }

        /* Charge the watch to the user, undoing the charge if over limit. */
        if (atomic_inc_return(&cred->user->nr_watches) > task_rlimit(current, RLIMIT_NOFILE)) {
                atomic_dec(&cred->user->nr_watches);
                return -EAGAIN;
        }

        watch->cred = get_cred(cred);
        rcu_assign_pointer(watch->watch_list, wlist);

        kref_get(&wqueue->usage);
        kref_get(&watch->usage);
        hlist_add_head(&watch->queue_node, &wqueue->watches);
        hlist_add_head_rcu(&watch->list_node, &wlist->watchers);
        return 0;
}

/**
 * add_watch_to_object - Add a watch on an object to a watch list
 * @watch: The watch to add
 * @wlist: The watch list to add to
 *
 * @watch->queue must already point to the queue that notifications are to be
 * posted to.  The credentials and watch list are recorded on the watch by
 * add_one_watch() as part of adding it.
 *
 * The queue lock and then the list lock are taken around the addition.
 *
 * Return: 0 on success, -ENOENT if the queue is no longer attached to a
 * pipe, or the error returned by add_one_watch().
 */
int add_watch_to_object(struct watch *watch, struct watch_list *wlist)
{
        struct watch_queue *wqueue;
        int ret;

        rcu_read_lock();

        wqueue = rcu_access_pointer(watch->queue);
        if (!lock_wqueue(wqueue)) {
                ret = -ENOENT;
        } else {
                spin_lock(&wlist->lock);
                ret = add_one_watch(watch, wlist, wqueue);
                spin_unlock(&wlist->lock);
                unlock_wqueue(wqueue);
        }

        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(add_watch_to_object);

/**
 * remove_watch_from_object - Remove a watch or all watches from an object.
 * @wlist: The watch list to remove from
 * @wq: The watch queue of interest (ignored if @all is true)
 * @id: The ID of the watch to remove (ignored if @all is true)
 * @all: True to remove all objects
 *
 * Remove a specific watch or all watches from an object.  A notification is
 * sent to the watcher to tell them that this happened.
 *
 * Return: 0 if at least one watch was removed, -EBADSLT if no matching
 * watch was found.
 */
int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq,
                             u64 id, bool all)
{
        struct watch_notification_removal n;
        struct watch_queue *wqueue;
        struct watch *watch;
        int ret = -EBADSLT;

        rcu_read_lock();

again:
        /* Find the next victim under the list lock. */
        spin_lock(&wlist->lock);
        hlist_for_each_entry(watch, &wlist->watchers, list_node) {
                if (all ||
                    (watch->id == id && rcu_access_pointer(watch->queue) == wq))
                        goto found;
        }
        spin_unlock(&wlist->lock);
        goto out;

found:
        ret = 0;
        hlist_del_init_rcu(&watch->list_node);
        rcu_assign_pointer(watch->watch_list, NULL);
        spin_unlock(&wlist->lock);

        /* We now own the reference on watch that used to belong to wlist. */

        /* Build the removal record; it is sent in its short form (header
         * only) unless a non-zero ID is given, in which case the full
         * record including the ID is sent.
         */
        n.watch.type = WATCH_TYPE_META;
        n.watch.subtype = WATCH_META_REMOVAL_NOTIFICATION;
        n.watch.info = watch->info_id | watch_sizeof(n.watch);
        n.id = id;
        if (id != 0)
                n.watch.info = watch->info_id | watch_sizeof(n);

        wqueue = rcu_dereference(watch->queue);

        if (lock_wqueue(wqueue)) {
                post_one_notification(wqueue, &n.watch);

                /* If the watch is still linked to the queue, unlink it and
                 * drop the reference the queue's list held.
                 */
                if (!hlist_unhashed(&watch->queue_node)) {
                        hlist_del_init_rcu(&watch->queue_node);
                        put_watch(watch);
                }

                unlock_wqueue(wqueue);
        }

        if (wlist->release_watch) {
                void (*release_watch)(struct watch *);

                /* The callback is invoked outside the RCU read section. */
                release_watch = wlist->release_watch;
                rcu_read_unlock();
                (*release_watch)(watch);
                rcu_read_lock();
        }
        /* Drop the reference inherited from wlist above. */
        put_watch(watch);

        if (all && !hlist_empty(&wlist->watchers))
                goto again;
out:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(remove_watch_from_object);

/*
 * Remove all the watches that are contributory to a queue.  This has the
 * potential to race with removal of the watches by the destruction of the
 * objects being watched or with the distribution of notifications.
 *
 * The queue is first detached from its pipe (->pipe set to NULL under the
 * queue lock), after which lock_wqueue() fails for this queue.  Each watch
 * is then unlinked from the queue and, if still present there, from its
 * object's watch list.
 */
void watch_queue_clear(struct watch_queue *wqueue)
{
        struct watch_list *wlist;
        struct watch *watch;
        bool release;

        rcu_read_lock();
        spin_lock_bh(&wqueue->lock);

        /*
         * This pipe can be freed by callers like free_pipe_info().
         * Removing this reference also prevents new notifications.
         */
        wqueue->pipe = NULL;

        while (!hlist_empty(&wqueue->watches)) {
                watch = hlist_entry(wqueue->watches.first, struct watch, queue_node);
                hlist_del_init_rcu(&watch->queue_node);
                /* We now own a ref on the watch. */
                spin_unlock_bh(&wqueue->lock);

                /* We can't do the next bit under the queue lock as we need to
                 * get the list lock - which would cause a deadlock if someone
                 * was removing from the opposite direction at the same time or
                 * posting a notification.
                 */
                wlist = rcu_dereference(watch->watch_list);
                if (wlist) {
                        void (*release_watch)(struct watch *);

                        spin_lock(&wlist->lock);

                        /* Only unlink from the object's list if nobody else
                         * has already done so.
                         */
                        release = !hlist_unhashed(&watch->list_node);
                        if (release) {
                                hlist_del_init_rcu(&watch->list_node);
                                rcu_assign_pointer(watch->watch_list, NULL);

                                /* We now own a second ref on the watch. */
                        }

                        /* Sample the callback while the list lock is held. */
                        release_watch = wlist->release_watch;
                        spin_unlock(&wlist->lock);

                        if (release) {
                                if (release_watch) {
                                        rcu_read_unlock();
                                        /* This might need to call dput(), so
                                         * we have to drop all the locks.
                                         */
                                        (*release_watch)(watch);
                                        rcu_read_lock();
                                }
                                /* Drop the ref taken over from the watch list. */
                                put_watch(watch);
                        }
                }

                /* Drop the ref taken over from the queue's list. */
                put_watch(watch);
                spin_lock_bh(&wqueue->lock);
        }

        spin_unlock_bh(&wqueue->lock);
        rcu_read_unlock();
}

/**
 * get_watch_queue - Get a watch queue from its file descriptor.
 * @fd: The fd to query.
 *
 * Return: the watch queue attached to the pipe behind @fd with an extra
 * reference taken on it, or ERR_PTR(-EINVAL) if @fd is not open, is not a
 * pipe, or the pipe has no watch queue.
 */
struct watch_queue *get_watch_queue(int fd)
{
        struct pipe_inode_info *pipe;
        struct watch_queue *wqueue;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return ERR_PTR(-EINVAL);

        pipe = get_pipe_info(fd_file(f), false);
        if (!pipe)
                return ERR_PTR(-EINVAL);

        wqueue = pipe->watch_queue;
        if (!wqueue)
                return ERR_PTR(-EINVAL);

        kref_get(&wqueue->usage);
        return wqueue;
}
EXPORT_SYMBOL(get_watch_queue);

/*
 * Create a watch queue and attach it to a pipe.
 *
 * The new queue starts with a single reference, an empty list of watches
 * and no notes or filter.  Returns 0 on success or -ENOMEM if the queue
 * cannot be allocated.
 */
int watch_queue_init(struct pipe_inode_info *pipe)
{
        struct watch_queue *wq = kzalloc(sizeof(*wq), GFP_KERNEL);

        if (!wq)
                return -ENOMEM;

        spin_lock_init(&wq->lock);
        kref_init(&wq->usage);
        INIT_HLIST_HEAD(&wq->watches);
        wq->pipe = pipe;

        pipe->watch_queue = wq;
        return 0;
}







































































































































































































































































































































































































































































    7 








    5 







    2 





    2 















    1 

































































    7 



    7 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                The Internet Protocol (IP) module.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Donald Becker, <becker@super.org>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Richard Underwood
 *                Stefan Becker, <stefanb@yello.ping.de>
 *                Jorge Cwik, <jorge@laser.satlink.net>
 *                Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *
 * Fixes:
 *                Alan Cox        :        Commented a couple of minor bits of surplus code
 *                Alan Cox        :        Undefining IP_FORWARD doesn't include the code
 *                                        (just stops a compiler warning).
 *                Alan Cox        :        Frames with >=MAX_ROUTE record routes, strict routes or loose routes
 *                                        are junked rather than corrupting things.
 *                Alan Cox        :        Frames to bad broadcast subnets are dumped
 *                                        We used to process them non broadcast and
 *                                        boy could that cause havoc.
 *                Alan Cox        :        ip_forward sets the free flag on the
 *                                        new frame it queues. Still crap because
 *                                        it copies the frame but at least it
 *                                        doesn't eat memory too.
 *                Alan Cox        :        Generic queue code and memory fixes.
 *                Fred Van Kempen :        IP fragment support (borrowed from NET2E)
 *                Gerhard Koerting:        Forward fragmented frames correctly.
 *                Gerhard Koerting:         Fixes to my fix of the above 8-).
 *                Gerhard Koerting:        IP interface addressing fix.
 *                Linus Torvalds        :        More robustness checks
 *                Alan Cox        :        Even more checks: Still not as robust as it ought to be
 *                Alan Cox        :        Save IP header pointer for later
 *                Alan Cox        :        ip option setting
 *                Alan Cox        :        Use ip_tos/ip_ttl settings
 *                Alan Cox        :        Fragmentation bogosity removed
 *                                        (Thanks to Mark.Bush@prg.ox.ac.uk)
 *                Dmitry Gorodchanin :        Send of a raw packet crash fix.
 *                Alan Cox        :        Silly ip bug when an overlength
 *                                        fragment turns up. Now frees the
 *                                        queue.
 *                Linus Torvalds/ :        Memory leakage on fragmentation
 *                Alan Cox        :        handling.
 *                Gerhard Koerting:        Forwarding uses IP priority hints
 *                Teemu Rantanen        :        Fragment problems.
 *                Alan Cox        :        General cleanup, comments and reformat
 *                Alan Cox        :        SNMP statistics
 *                Alan Cox        :        BSD address rule semantics. Also see
 *                                        UDP as there is a nasty checksum issue
 *                                        if you do things the wrong way.
 *                Alan Cox        :        Always defrag, moved IP_FORWARD to the config.in file
 *                Alan Cox        :         IP options adjust sk->priority.
 *                Pedro Roque        :        Fix mtu/length error in ip_forward.
 *                Alan Cox        :        Avoid ip_chk_addr when possible.
 *        Richard Underwood        :        IP multicasting.
 *                Alan Cox        :        Cleaned up multicast handlers.
 *                Alan Cox        :        RAW sockets demultiplex in the BSD style.
 *                Gunther Mayer        :        Fix the SNMP reporting typo
 *                Alan Cox        :        Always in group 224.0.0.1
 *        Pauline Middelink        :        Fast ip_checksum update when forwarding
 *                                        Masquerading support.
 *                Alan Cox        :        Multicast loopback error for 224.0.0.1
 *                Alan Cox        :        IP_MULTICAST_LOOP option.
 *                Alan Cox        :        Use notifiers.
 *                Bjorn Ekwall        :        Removed ip_csum (from slhc.c too)
 *                Bjorn Ekwall        :        Moved ip_fast_csum to ip.h (inline!)
 *                Stefan Becker   :       Send out ICMP HOST REDIRECT
 *        Arnt Gulbrandsen        :        ip_build_xmit
 *                Alan Cox        :        Per socket routing cache
 *                Alan Cox        :        Fixed routing cache, added header cache.
 *                Alan Cox        :        Loopback didn't work right in original ip_build_xmit - fixed it.
 *                Alan Cox        :        Only send ICMP_REDIRECT if src/dest are the same net.
 *                Alan Cox        :        Incoming IP option handling.
 *                Alan Cox        :        Set saddr on raw output frames as per BSD.
 *                Alan Cox        :        Stopped broadcast source route explosions.
 *                Alan Cox        :        Can disable source routing
 *                Takeshi Sone    :        Masquerading didn't work.
 *        Dave Bonn,Alan Cox        :        Faster IP forwarding whenever possible.
 *                Alan Cox        :        Memory leaks, tramples, misc debugging.
 *                Alan Cox        :        Fixed multicast (by popular demand 8))
 *                Alan Cox        :        Fixed forwarding (by even more popular demand 8))
 *                Alan Cox        :        Fixed SNMP statistics [I think]
 *        Gerhard Koerting        :        IP fragmentation forwarding fix
 *                Alan Cox        :        Device lock against page fault.
 *                Alan Cox        :        IP_HDRINCL facility.
 *        Werner Almesberger        :        Zero fragment bug
 *                Alan Cox        :        RAW IP frame length bug
 *                Alan Cox        :        Outgoing firewall on build_xmit
 *                A.N.Kuznetsov        :        IP_OPTIONS support throughout the kernel
 *                Alan Cox        :        Multicast routing hooks
 *                Jos Vos                :        Do accounting *before* call_in_firewall
 *        Willy Konynenberg        :        Transparent proxying support
 *
 * To Fix:
 *                IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
 *                and could be made very efficient with the addition of some virtual memory hacks to permit
 *                the allocation of a buffer that can then be 'grown' by twiddling page tables.
 *                Output fragmentation wants updating along with the buffer management to use a single
 *                interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
 *                output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
 *                fragmentation anyway.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/slab.h>

#include <linux/net.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/indirect_call_wrapper.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/raw.h>
#include <net/checksum.h>
#include <net/inet_ecn.h>
#include <linux/netfilter_ipv4.h>
#include <net/xfrm.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <net/dst_metadata.h>

/*
 *        Router Alert (RFC 2113) delivery.
 *
 *        Walks the per-netns ra_chain and hands the packet to every
 *        registered socket whose inet_num equals the IP protocol of the
 *        packet and which is either unbound or bound to the receiving
 *        device.  Every matching socket but the last one gets a clone;
 *        the last one gets the original skb.
 *
 *        Returns true when the skb has been passed to raw_rcv() or when
 *        ip_defrag() returned non-zero for a fragment; false when no
 *        socket matched and the caller still has the skb.
 */
bool ip_call_ra_chain(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct net *net = dev_net(dev);
        u8 protocol = ip_hdr(skb)->protocol;
        struct sock *last = NULL;
        struct ip_ra_chain *ra;

        for (ra = rcu_dereference(net->ipv4.ra_chain); ra;
             ra = rcu_dereference(ra->next)) {
                struct sock *sk = ra->sk;
                struct sk_buff *clone;

                if (!sk || inet_sk(sk)->inet_num != protocol)
                        continue;

                /* A socket bound to an interface only sees packets that
                 * arrived on that interface.
                 */
                if (sk->sk_bound_dev_if &&
                    sk->sk_bound_dev_if != dev->ifindex)
                        continue;

                if (ip_is_fragment(ip_hdr(skb)) &&
                    ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
                        return true;

                /* Delivery lags one match behind so that the final
                 * matching socket can take the original skb.
                 */
                if (last) {
                        clone = skb_clone(skb, GFP_ATOMIC);
                        if (clone)
                                raw_rcv(last, clone);
                }
                last = sk;
        }

        if (!last)
                return false;

        raw_rcv(last, skb);
        return true;
}

INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
/*
 *        Hand an skb to the handler registered for @protocol.
 *
 *        Raw sockets are offered the packet first via raw_local_deliver().
 *        If a handler is registered in inet_protos[] it is invoked; a
 *        negative return value from the handler is taken as "resubmit as
 *        protocol -ret" and the whole lookup is repeated.  Without a
 *        handler the skb is either consumed (raw delivery reported a
 *        taker) or dropped, with an ICMP protocol-unreachable sent when
 *        the xfrm policy check passes.
 */
void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
        const struct net_protocol *ipprot;
        int raw, ret;

        for (;;) {
                raw = raw_local_deliver(skb, protocol);

                ipprot = rcu_dereference(inet_protos[protocol]);
                if (!ipprot)
                        break;

                if (!ipprot->no_policy) {
                        if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                                kfree_skb_reason(skb,
                                                 SKB_DROP_REASON_XFRM_POLICY);
                                return;
                        }
                        nf_reset_ct(skb);
                }

                ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
                                      skb);
                if (ret >= 0) {
                        __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
                        return;
                }

                /* Handler asked for the packet to be re-dispatched. */
                protocol = -ret;
        }

        /* No handler is registered for this protocol. */
        if (raw) {
                __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
                consume_skb(skb);
                return;
        }

        if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
                __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
                icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
        }
        kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
}

/*
 *        Continuation passed to the NF_INET_LOCAL_IN hook by
 *        ip_local_deliver().  Clears the delivery time, pulls the network
 *        header off the skb and dispatches on the IP protocol field inside
 *        an RCU read-side section.  @sk is not used.  Always returns 0.
 */
static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        int protocol;

        skb_clear_delivery_time(skb);
        __skb_pull(skb, skb_network_header_len(skb));

        rcu_read_lock();
        protocol = ip_hdr(skb)->protocol;
        ip_protocol_deliver_rcu(net, skb, protocol);
        rcu_read_unlock();

        return 0;
}

/*
 *        Local delivery entry point: pass an IP packet up towards the
 *        transport protocols.
 *
 *        A fragment is handed to ip_defrag() first; when that returns
 *        non-zero this function returns 0 without going any further.
 *        Otherwise the skb traverses the NF_INET_LOCAL_IN hook with
 *        ip_local_deliver_finish() as its continuation.
 */
int ip_local_deliver(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);

        if (ip_is_fragment(ip_hdr(skb)) &&
            ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
                return 0;

        return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, net, NULL, skb,
                       skb->dev, NULL, ip_local_deliver_finish);
}
EXPORT_SYMBOL(ip_local_deliver);

/*
 *        Parse the IP options of a received packet.
 *
 *        Returns true when the packet must be dropped (the caller frees
 *        it) and false when processing may continue.
 */
static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
        const struct iphdr *iph;
        struct in_device *in_dev;
        struct ip_options *opt;

        /* The header is un-shared unconditionally.  That is more than is
         * strictly needed, since not every option requires the packet to
         * be modified, but it is the simplest approach and packets that
         * both carry options and are being sniffed are extremely rare.
         *                                              --ANK (980813)
         */
        if (skb_cow(skb, skb_headroom(skb))) {
                __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
                return true;
        }

        /* Fetch the header pointer only after skb_cow(). */
        iph = ip_hdr(skb);
        opt = &(IPCB(skb)->opt);
        opt->optlen = iph->ihl*4 - sizeof(struct iphdr);

        if (ip_options_compile(dev_net(dev), opt, skb)) {
                __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
                return true;
        }

        if (unlikely(opt->srr)) {
                in_dev = __in_dev_get_rcu(dev);

                /* Source routing disabled on this device: drop, and log
                 * if martian logging is enabled.
                 */
                if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) {
                        if (IN_DEV_LOG_MARTIANS(in_dev))
                                net_info_ratelimited("source route option %pI4 -> %pI4\n",
                                                     &iph->saddr,
                                                     &iph->daddr);
                        return true;
                }

                if (ip_options_rcv_srr(skb, dev))
                        return true;
        }

        return false;
}

static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
                            const struct sk_buff *hint)
{
        return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
               ip_hdr(hint)->tos == iph->tos;
}

/* Early-demux entry points, defined outside this file. */
int tcp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);

/*
 *        Common post-PRE_ROUTING processing shared by the single-packet and
 *        the list receive paths: optional early demux, input route lookup,
 *        route-classid accounting, IP option handling and multicast/broadcast
 *        statistics.
 *
 *        @net:   network namespace the packet is processed in
 *        @skb:   packet being received
 *        @dev:   device passed to the route lookup and option handling
 *        @hint:  optional previously routed skb; may be NULL
 *
 *        Returns NET_RX_SUCCESS when the caller should go on with the skb,
 *        or NET_RX_DROP after the skb has been freed with kfree_skb_reason().
 */
static int ip_rcv_finish_core(struct net *net,
                              struct sk_buff *skb, struct net_device *dev,
                              const struct sk_buff *hint)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err, drop_reason;
        struct rtable *rt;

        /* Same daddr/tos as the hint and no dst yet: let
         * ip_route_use_hint() set up the route (presumably reusing the
         * hint's result - confirm against its definition).
         */
        if (ip_can_use_hint(skb, iph, hint)) {
                drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr,
                                                ip4h_dscp(iph), dev, hint);
                if (unlikely(drop_reason))
                        goto drop_error;
        }

        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        /* Early demux is attempted only for unfragmented packets that have
         * neither a dst nor a socket attached, and only when enabled by
         * the global and the per-protocol sysctl.
         */
        if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
            !skb_dst(skb) &&
            !skb->sk &&
            !ip_is_fragment(iph)) {
                switch (iph->protocol) {
                case IPPROTO_TCP:
                        if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
                                /* Return value is deliberately not checked. */
                                tcp_v4_early_demux(skb);

                                /* must reload iph, skb->head might have changed */
                                iph = ip_hdr(skb);
                        }
                        break;
                case IPPROTO_UDP:
                        if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
                                err = udp_v4_early_demux(skb);
                                /* err is not propagated: drop_reason is still
                                 * SKB_DROP_REASON_NOT_SPECIFIED on this path.
                                 */
                                if (unlikely(err))
                                        goto drop_error;

                                /* must reload iph, skb->head might have changed */
                                iph = ip_hdr(skb);
                        }
                        break;
                }
        }

        /*
         *        Initialise the virtual path cache for the packet. It describes
         *        how the packet travels inside Linux networking.
         */
        if (!skb_valid_dst(skb)) {
                drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
                                                   ip4h_dscp(iph), dev);
                if (unlikely(drop_reason))
                        goto drop_error;
                /* Lookup succeeded; restore the default for later drops. */
                drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        } else {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                /* A valid dst was already attached, so no lookup is done
                 * here; only the NOPOLICY device setting is copied into
                 * the skb control block.
                 */
                if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
                        IPCB(skb)->flags |= IPSKB_NOPOLICY;
        }

#ifdef CONFIG_IP_ROUTE_CLASSID
        /* Per-CPU accounting keyed by tclassid: bits 0-7 select the entry
         * whose o_* counters are bumped, bits 16-23 the entry whose i_*
         * counters are bumped.
         */
        if (unlikely(skb_dst(skb)->tclassid)) {
                struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
                u32 idx = skb_dst(skb)->tclassid;
                st[idx&0xFF].o_packets++;
                st[idx&0xFF].o_bytes += skb->len;
                st[(idx>>16)&0xFF].i_packets++;
                st[(idx>>16)&0xFF].i_bytes += skb->len;
        }
#endif

        /* ihl > 5 means the header carries options.  iph is not
         * dereferenced again after ip_rcv_options() has run.
         */
        if (iph->ihl > 5 && ip_rcv_options(skb, dev))
                goto drop;

        rt = skb_rtable(skb);
        if (rt->rt_type == RTN_MULTICAST) {
                __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
        } else if (rt->rt_type == RTN_BROADCAST) {
                __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
        } else if (skb->pkt_type == PACKET_BROADCAST ||
                   skb->pkt_type == PACKET_MULTICAST) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                /* RFC 1122 3.3.6:
                 *
                 *   When a host sends a datagram to a link-layer broadcast
                 *   address, the IP destination address MUST be a legal IP
                 *   broadcast or IP multicast address.
                 *
                 *   A host SHOULD silently discard a datagram that is received
                 *   via a link-layer broadcast (see Section 2.4) but does not
                 *   specify an IP multicast or broadcast destination address.
                 *
                 * This doesn't explicitly say L2 *broadcast*, but broadcast is
                 * in a way a form of multicast and the most common use case for
                 * this is 802.11 protecting against cross-station spoofing (the
                 * so-called "hole-196" attack) so do it for both.
                 */
                if (in_dev &&
                    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
                        drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
                        goto drop;
                }
        }

        return NET_RX_SUCCESS;

drop:
        kfree_skb_reason(skb, drop_reason);
        return NET_RX_DROP;

drop_error:
        /* Count reverse-path-filter drops, then take the common drop path. */
        if (drop_reason == SKB_DROP_REASON_IP_RPFILTER)
                __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
        goto drop;
}

/*
 *        Continuation of the NF_INET_PRE_ROUTING hook for a single packet.
 *        Runs the common finish logic and, unless the packet was dropped
 *        there, passes it on through dst_input().  @sk is not used.
 */
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* Sampled before the skb is handed to l3mdev_ip_rcv(). */
        struct net_device *dev = skb->dev;

        /* An ingress device enslaved to an L3 master device gets its skb
         * processed by the master's handler first; a NULL result means
         * there is nothing left to do here.
         */
        skb = l3mdev_ip_rcv(skb);
        if (!skb)
                return NET_RX_SUCCESS;

        if (ip_rcv_finish_core(net, skb, dev, NULL) == NET_RX_DROP)
                return NET_RX_DROP;

        return dst_input(skb);
}

/*
 *         Main IP Receive routine.
 *
 *         Validates the IPv4 header of @skb (length, version, checksum,
 *         total length), trims link-layer padding, sets the transport
 *         header offset and resets the IP control block.
 *
 *         Returns the skb to continue with - the pointer comes from
 *         skb_share_check() and so need not be the one passed in - or NULL
 *         when the packet was rejected.
 */
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
        const struct iphdr *iph;
        int drop_reason;
        u32 len;

        /* When the interface is in promisc. mode, drop all the crap
         * that it receives, do not try to analyse it.
         */
        if (skb->pkt_type == PACKET_OTHERHOST) {
                dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
                drop_reason = SKB_DROP_REASON_OTHERHOST;
                goto drop;
        }

        __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);

        /* On NULL the code jumps to "out", skipping kfree_skb_reason(). */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb) {
                __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
                goto out;
        }

        /* Default reason; inhdr_error turns it into IP_INHDR below. */
        drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
        if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                goto inhdr_error;

        iph = ip_hdr(skb);

        /*
         *        RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
         *
         *        Is the datagram acceptable?
         *
         *        1.        Length at least the size of an ip header
         *        2.        Version of 4
         *        3.        Checksums correctly. [Speed optimisation for later, skip loopback checksums]
         *        4.        Doesn't have a bogus length
         */

        if (iph->ihl < 5 || iph->version != 4)
                goto inhdr_error;

        /* The ECN counters must be laid out in ECN codepoint order so that
         * the codepoint can be used directly as an offset from NOECTPKTS.
         * The count added is gso_segs, but never less than 1.
         */
        BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
        BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
        BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
        __IP_ADD_STATS(net,
                       IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
                       max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));

        /* Now require the full header, options included. */
        if (!pskb_may_pull(skb, iph->ihl*4))
                goto inhdr_error;

        /* Re-read the header pointer after pskb_may_pull(). */
        iph = ip_hdr(skb);

        if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
                goto csum_error;

        /* Reject packets shorter than their claimed total length, and
         * total lengths smaller than the header itself.
         */
        len = iph_totlen(skb, iph);
        if (skb->len < len) {
                drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
                __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
                goto drop;
        } else if (len < (iph->ihl*4))
                goto inhdr_error;

        /* Our transport medium may have padded the buffer out. Now we know it
         * is IP we can trim to the true length of the frame.
         * Note this now means skb->len holds ntohs(iph->tot_len).
         */
        if (pskb_trim_rcsum(skb, len)) {
                /* Dropped with SKB_DROP_REASON_NOT_SPECIFIED. */
                __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        /* Re-read the header pointer after the trim. */
        iph = ip_hdr(skb);
        skb->transport_header = skb->network_header + iph->ihl*4;

        /* Remove any debris in the socket control block */
        memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
        IPCB(skb)->iif = skb->skb_iif;

        /* Must drop socket now because of tproxy. */
        if (!skb_sk_is_prefetched(skb))
                skb_orphan(skb);

        return skb;

        /* The error labels fall through into one another: a checksum
         * failure is also counted as a header error before the skb is freed.
         */
csum_error:
        drop_reason = SKB_DROP_REASON_IP_CSUM;
        __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
        if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
                drop_reason = SKB_DROP_REASON_IP_INHDR;
        __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
        kfree_skb_reason(skb, drop_reason);
out:
        return NULL;
}

/*
 * IP receive entry point for a single packet.
 *
 * The packet is validated by ip_rcv_core(); if that rejects it,
 * NET_RX_DROP is returned.  Otherwise the skb traverses the
 * NF_INET_PRE_ROUTING hook with ip_rcv_finish() as continuation.
 * @pt and @orig_dev are not used.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev)
{
        struct net *net = dev_net(dev);

        skb = ip_rcv_core(skb, net);
        if (!skb)
                return NET_RX_DROP;

        return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL, skb,
                       dev, NULL, ip_rcv_finish);
}

/*
 * Drain @head: unlink each skb from the list and pass it to dst_input().
 * The return value of dst_input() is ignored.
 */
static void ip_sublist_rcv_finish(struct list_head *head)
{
        struct sk_buff *pkt;
        struct sk_buff *tmp;

        list_for_each_entry_safe(pkt, tmp, head, list) {
                skb_list_del_init(pkt);
                dst_input(pkt);
        }
}

/*
 * Return @skb if it may serve as a routing hint for the packets that
 * follow it, or NULL if it may not.  No hint is produced when custom FIB
 * rules are in use, when the route type is broadcast, or when the skb is
 * flagged IPSKB_MULTIPATH.
 */
static struct sk_buff *ip_extract_route_hint(const struct net *net,
                                             struct sk_buff *skb, int rt_type)
{
        if (fib4_has_custom_rules(net))
                return NULL;

        if (rt_type == RTN_BROADCAST)
                return NULL;

        if (IPCB(skb)->flags & IPSKB_MULTIPATH)
                return NULL;

        return skb;
}

/*
 * List variant of ip_rcv_finish().
 *
 * Each skb on @head is unlinked and run through l3mdev_ip_rcv() and
 * ip_rcv_finish_core().  Survivors are collected into a sublist for as
 * long as consecutive packets resolve to the same dst; when the dst
 * changes, the collected sublist is dispatched and a new one is started.
 * The first packet of every run is also considered as the routing hint
 * for the packets after it.
 */
static void ip_list_rcv_finish(struct net *net, struct list_head *head)
{
        struct dst_entry *curr_dst = NULL;
        struct sk_buff *hint = NULL;
        struct sk_buff *entry, *tmp;
        LIST_HEAD(sublist);

        list_for_each_entry_safe(entry, tmp, head, list) {
                struct net_device *dev = entry->dev;
                struct dst_entry *dst;
                struct sk_buff *skb;

                skb_list_del_init(entry);

                /* An ingress device enslaved to an L3 master device gets
                 * its skb processed by the master's handler first.
                 */
                skb = l3mdev_ip_rcv(entry);
                if (!skb)
                        continue;

                if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
                        continue;

                dst = skb_dst(skb);
                if (dst != curr_dst) {
                        hint = ip_extract_route_hint(net, skb,
                                                     dst_rtable(dst)->rt_type);

                        /* Flush what was gathered for the previous dst ... */
                        if (!list_empty(&sublist))
                                ip_sublist_rcv_finish(&sublist);

                        /* ... and begin gathering for the new one. */
                        INIT_LIST_HEAD(&sublist);
                        curr_dst = dst;
                }

                list_add_tail(&skb->list, &sublist);
        }

        /* Flush whatever is left over (a no-op for an empty sublist). */
        ip_sublist_rcv_finish(&sublist);
}

/*
 * Process a sublist of packets that share @dev and @net: run the list
 * through the NF_INET_PRE_ROUTING hook, then finish whatever is still on
 * @head afterwards (presumably the packets the hook accepted - confirm
 * against NF_HOOK_LIST).
 */
static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
                           struct net *net)
{
        NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
                     net, NULL, head, dev, NULL,
                     ip_rcv_finish);

        ip_list_rcv_finish(net, head);
}

/* Receive a list of IP packets */
void ip_list_rcv(struct list_head *head, struct packet_type *pt,
                 struct net_device *orig_dev)
{
        struct net_device *curr_dev = NULL;
        struct net *curr_net = NULL;
        struct sk_buff *skb, *next;
        LIST_HEAD(sublist);

        list_for_each_entry_safe(skb, next, head, list) {
                struct net_device *dev = skb->dev;
                struct net *net = dev_net(dev);

                skb_list_del_init(skb);
                skb = ip_rcv_core(skb, net);
                if (skb == NULL)
                        continue;

                if (curr_dev != dev || curr_net != net) {
                        /* dispatch old sublist */
                        if (!list_empty(&sublist))
                                ip_sublist_rcv(&sublist, curr_dev, curr_net);
                        /* start new sublist */
                        INIT_LIST_HEAD(&sublist);
                        curr_dev = dev;
                        curr_net = net;
                }
                list_add_tail(&skb->list, &sublist);
        }
        /* dispatch final sublist */
        if (!list_empty(&sublist))
                ip_sublist_rcv(&sublist, curr_dev, curr_net);
}












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 











































































    3 





























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/dsa/user.c - user device handling
 * Copyright (c) 2008-2009 Marvell Semiconductor
 */

#include <linux/list.h>
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/phy.h>
#include <linux/phy_fixed.h>
#include <linux/phylink.h>
#include <linux/of_net.h>
#include <linux/of_mdio.h>
#include <linux/mdio.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/selftests.h>
#include <net/tc_act/tc_mirred.h>
#include <linux/if_bridge.h>
#include <linux/if_hsr.h>
#include <net/dcbnl.h>
#include <linux/netpoll.h>
#include <linux/string.h>

#include "conduit.h"
#include "dsa.h"
#include "netlink.h"
#include "port.h"
#include "switch.h"
#include "tag.h"
#include "user.h"

/* Deferred-work item describing a switchdev event.
 *
 * NOTE(review): the code that fills in and consumes this structure is not
 * part of the section reviewed here; the roles of @dev, @orig_dev, @event and
 * @host_addr are presumed from their names and types - confirm against users.
 */
struct dsa_switchdev_event_work {
        struct net_device *dev;
        struct net_device *orig_dev;
        /* Embedded work item. */
        struct work_struct work;
        unsigned long event;
        /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
         * SWITCHDEV_FDB_DEL_TO_DEVICE
         */
        unsigned char addr[ETH_ALEN];
        u16 vid;
        bool host_addr;
};

/* Host address operations that are queued as deferred work and dispatched by
 * dsa_user_standalone_event_work().
 */
enum dsa_standalone_event {
        /* Unicast address add: handled with dsa_port_standalone_host_fdb_add() */
        DSA_UC_ADD,
        /* Unicast address delete: handled with dsa_port_standalone_host_fdb_del() */
        DSA_UC_DEL,
        /* Multicast address add: handled with dsa_port_standalone_host_mdb_add() */
        DSA_MC_ADD,
        /* Multicast address delete: handled with dsa_port_standalone_host_mdb_del() */
        DSA_MC_DEL,
};

/* One queued host address operation for a user port.
 *
 * Allocated and filled in by dsa_user_schedule_standalone_work(); processed
 * and freed by dsa_user_standalone_event_work().
 */
struct dsa_standalone_event_work {
        /* Embedded work item; the handler recovers this struct from it. */
        struct work_struct work;
        /* User port network device the operation applies to. */
        struct net_device *dev;
        /* Which add/delete operation to perform. */
        enum dsa_standalone_event event;
        /* Private copy of the MAC address. */
        unsigned char addr[ETH_ALEN];
        /* VLAN ID the address is installed in or removed from. */
        u16 vid;
};

/* Argument bundle handed to dsa_user_host_vlan_rx_filtering() through
 * dsa_user_vlan_for_each(): the same device/address/operation is applied to
 * each VID the iterator visits.
 */
struct dsa_host_vlan_rx_filtering_ctx {
        /* User port network device. */
        struct net_device *dev;
        /* MAC address being synced or unsynced (not owned by this struct). */
        const unsigned char *addr;
        /* Operation to queue for every VID. */
        enum dsa_standalone_event event;
};

static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
{
        return ds->ops->port_fdb_add && ds->ops->port_fdb_del &&
               ds->fdb_isolation && !ds->vlan_filtering_is_global &&
               !ds->needs_standalone_vlan_filtering;
}

static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
{
        return ds->ops->port_mdb_add && ds->ops->port_mdb_del &&
               ds->fdb_isolation && !ds->vlan_filtering_is_global &&
               !ds->needs_standalone_vlan_filtering;
}

/* Work handler for a queued host address operation.
 *
 * @work: the work member embedded in a struct dsa_standalone_event_work
 *
 * Performs the add/delete requested by the item's event on the item's port,
 * logs an error on the switch device if the operation fails, and finally
 * frees the work item.
 */
static void dsa_user_standalone_event_work(struct work_struct *work)
{
        struct dsa_standalone_event_work *ev =
                container_of(work, struct dsa_standalone_event_work, work);
        struct dsa_port *dp = dsa_user_to_port(ev->dev);
        const unsigned char *mac = ev->addr;
        struct dsa_switch *ds = dp->ds;
        struct switchdev_obj_port_mdb mdb;
        u16 vid = ev->vid;
        int ret;

        switch (ev->event) {
        case DSA_UC_ADD:
                ret = dsa_port_standalone_host_fdb_add(dp, mac, vid);
                if (ret) {
                        dev_err(ds->dev,
                                "port %d failed to add %pM vid %d to fdb: %d\n",
                                dp->index, mac, vid, ret);
                }
                break;
        case DSA_UC_DEL:
                ret = dsa_port_standalone_host_fdb_del(dp, mac, vid);
                if (ret) {
                        dev_err(ds->dev,
                                "port %d failed to delete %pM vid %d from fdb: %d\n",
                                dp->index, mac, vid, ret);
                }
                break;
        case DSA_MC_ADD:
                /* The MDB helpers take the address and VID in an mdb object. */
                mdb.vid = vid;
                ether_addr_copy(mdb.addr, mac);

                ret = dsa_port_standalone_host_mdb_add(dp, &mdb);
                if (ret) {
                        dev_err(ds->dev,
                                "port %d failed to add %pM vid %d to mdb: %d\n",
                                dp->index, mac, vid, ret);
                }
                break;
        case DSA_MC_DEL:
                mdb.vid = vid;
                ether_addr_copy(mdb.addr, mac);

                ret = dsa_port_standalone_host_mdb_del(dp, &mdb);
                if (ret) {
                        dev_err(ds->dev,
                                "port %d failed to delete %pM vid %d from mdb: %d\n",
                                dp->index, mac, vid, ret);
                }
                break;
        }

        /* The item was allocated by the scheduler; the handler owns it. */
        kfree(ev);
}

/* Queue a host address operation for deferred execution.
 *
 * @dev: user port network device
 * @event: operation to perform
 * @addr: MAC address; copied into the work item
 * @vid: VLAN ID for the operation
 *
 * The work item is allocated with GFP_ATOMIC and is freed by
 * dsa_user_standalone_event_work() once it has run.
 *
 * Returns 0 on success or -ENOMEM if the work item cannot be allocated.
 */
static int dsa_user_schedule_standalone_work(struct net_device *dev,
                                             enum dsa_standalone_event event,
                                             const unsigned char *addr,
                                             u16 vid)
{
        struct dsa_standalone_event_work *ev;

        ev = kzalloc(sizeof(*ev), GFP_ATOMIC);
        if (!ev)
                return -ENOMEM;

        ev->dev = dev;
        ev->event = event;
        ev->vid = vid;
        ether_addr_copy(ev->addr, addr);

        INIT_WORK(&ev->work, dsa_user_standalone_event_work);
        dsa_schedule_work(&ev->work);

        return 0;
}

/* Per-VID callback for dsa_user_vlan_for_each().
 *
 * @arg: a struct dsa_host_vlan_rx_filtering_ctx
 * @vid: VLAN ID currently being visited
 *
 * Queues the operation described by @arg for @vid and returns the result of
 * dsa_user_schedule_standalone_work().
 */
static int dsa_user_host_vlan_rx_filtering(void *arg, int vid)
{
        struct dsa_host_vlan_rx_filtering_ctx *filter = arg;
        struct net_device *dev = filter->dev;

        return dsa_user_schedule_standalone_work(dev, filter->event,
                                                  filter->addr, vid);
}

/* Invoke @cb for VID 0 and then for each entry of the port's user_vlans list.
 *
 * @dev: user port network device; its addr_list_lock must be held
 * @cb: callback, called as cb(@arg, vid)
 * @arg: opaque pointer passed through to @cb
 *
 * Iteration stops at the first non-zero value returned by @cb, and that value
 * is returned. Returns 0 if every invocation succeeded.
 */
static int dsa_user_vlan_for_each(struct net_device *dev,
                                  int (*cb)(void *arg, int vid), void *arg)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_vlan *vlan;
        int ret;

        lockdep_assert_held(&dev->addr_list_lock);

        /* VID 0 is always visited first, before the listed VLANs. */
        ret = cb(arg, 0);
        if (ret)
                return ret;

        list_for_each_entry(vlan, &dp->user_vlans, list) {
                ret = cb(arg, vlan->vid);
                if (ret)
                        break;
        }

        return ret;
}

static int dsa_user_sync_uc(struct net_device *dev,
                            const unsigned char *addr)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_host_vlan_rx_filtering_ctx ctx = {
                .dev = dev,
                .addr = addr,
                .event = DSA_UC_ADD,
        };

        dev_uc_add(conduit, addr);

        if (!dsa_switch_supports_uc_filtering(dp->ds))
                return 0;

        return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
                                      &ctx);
}

static int dsa_user_unsync_uc(struct net_device *dev,
                              const unsigned char *addr)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_host_vlan_rx_filtering_ctx ctx = {
                .dev = dev,
                .addr = addr,
                .event = DSA_UC_DEL,
        };

        dev_uc_del(conduit, addr);

        if (!dsa_switch_supports_uc_filtering(dp->ds))
                return 0;

        return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
                                      &ctx);
}

static int dsa_user_sync_mc(struct net_device *dev,
                            const unsigned char *addr)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_host_vlan_rx_filtering_ctx ctx = {
                .dev = dev,
                .addr = addr,
                .event = DSA_MC_ADD,
        };

        dev_mc_add(conduit, addr);

        if (!dsa_switch_supports_mc_filtering(dp->ds))
                return 0;

        return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
                                      &ctx);
}

static int dsa_user_unsync_mc(struct net_device *dev,
                              const unsigned char *addr)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_host_vlan_rx_filtering_ctx ctx = {
                .dev = dev,
                .addr = addr,
                .event = DSA_MC_DEL,
        };

        dev_mc_del(conduit, addr);

        if (!dsa_switch_supports_mc_filtering(dp->ds))
                return 0;

        return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
                                      &ctx);
}

void dsa_user_sync_ha(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        struct netdev_hw_addr *ha;

        netif_addr_lock_bh(dev);

        netdev_for_each_synced_mc_addr(ha, dev)
                dsa_user_sync_mc(dev, ha->addr);

        netdev_for_each_synced_uc_addr(ha, dev)
                dsa_user_sync_uc(dev, ha->addr);

        netif_addr_unlock_bh(dev);

        if (dsa_switch_supports_uc_filtering(ds) ||
            dsa_switch_supports_mc_filtering(ds))
                dsa_flush_workqueue();
}

void dsa_user_unsync_ha(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        struct netdev_hw_addr *ha;

        netif_addr_lock_bh(dev);

        netdev_for_each_synced_uc_addr(ha, dev)
                dsa_user_unsync_uc(dev, ha->addr);

        netdev_for_each_synced_mc_addr(ha, dev)
                dsa_user_unsync_mc(dev, ha->addr);

        netif_addr_unlock_bh(dev);

        if (dsa_switch_supports_uc_filtering(ds) ||
            dsa_switch_supports_mc_filtering(ds))
                dsa_flush_workqueue();
}

/* user mii_bus handling ***************************************************/
/* Read callback of the DSA user MII bus.
 *
 * @bus: MII bus whose ->priv points to the owning struct dsa_switch
 * @addr: PHY address on the bus
 * @reg: register number to read
 *
 * Forwards the read to the switch driver's ->phy_read() when bit @addr is set
 * in ds->phys_mii_mask and returns its result. For any other address 0xffff
 * is returned without touching the driver.
 */
static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg)
{
        struct dsa_switch *ds = bus->priv;

        /* Use an unsigned constant for the mask: shifting a signed 1 into the
         * sign bit (addr == 31) is undefined behaviour in C.
         */
        if (ds->phys_mii_mask & (1U << addr))
                return ds->ops->phy_read(ds, addr, reg);

        return 0xffff;
}

/* Write callback of the DSA user MII bus.
 *
 * @bus: MII bus whose ->priv points to the owning struct dsa_switch
 * @addr: PHY address on the bus
 * @reg: register number to write
 * @val: value to write
 *
 * Forwards the write to the switch driver's ->phy_write() when bit @addr is
 * set in ds->phys_mii_mask and returns its result. For any other address the
 * write is dropped and 0 is returned.
 */
static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
{
        struct dsa_switch *ds = bus->priv;

        /* Use an unsigned constant for the mask: shifting a signed 1 into the
         * sign bit (addr == 31) is undefined behaviour in C.
         */
        if (ds->phys_mii_mask & (1U << addr))
                return ds->ops->phy_write(ds, addr, reg, val);

        return 0;
}

void dsa_user_mii_bus_init(struct dsa_switch *ds)
{
        ds->user_mii_bus->priv = (void *)ds;
        ds->user_mii_bus->name = "dsa user smi";
        ds->user_mii_bus->read = dsa_user_phy_read;
        ds->user_mii_bus->write = dsa_user_phy_write;
        snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
                 ds->dst->index, ds->index);
        ds->user_mii_bus->parent = ds->dev;
        ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
}


/* user device handling ****************************************************/
static int dsa_user_get_iflink(const struct net_device *dev)
{
        return READ_ONCE(dsa_user_to_conduit(dev)->ifindex);
}

/* Install @addr as a host unicast address for user port @dev.
 *
 * If the switch supports unicast filtering, @addr is first added to the
 * port's standalone host FDB with VID 0. Unless @addr equals the conduit's
 * own MAC address, it is then added to the conduit's unicast list; if that
 * fails, the FDB entry added above is removed again.
 *
 * Returns 0 on success or a negative error code.
 */
int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        int ret;

        if (dsa_switch_supports_uc_filtering(ds)) {
                ret = dsa_port_standalone_host_fdb_add(dp, addr, 0);
                if (ret)
                        return ret;
        }

        /* Nothing to add on the conduit when it already uses this address. */
        if (ether_addr_equal(addr, conduit->dev_addr))
                return 0;

        ret = dev_uc_add(conduit, addr);
        if (ret >= 0)
                return 0;

        /* Roll back the FDB entry installed above. */
        if (dsa_switch_supports_uc_filtering(ds))
                dsa_port_standalone_host_fdb_del(dp, addr, 0);

        return ret;
}

void dsa_user_host_uc_uninstall(struct net_device *dev)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ether_addr_equal(dev->dev_addr, conduit->dev_addr))
                dev_uc_del(conduit, dev->dev_addr);

        if (dsa_switch_supports_uc_filtering(ds))
                dsa_port_standalone_host_fdb_del(dp, dev->dev_addr, 0);
}

/* Open handler for a DSA user port.
 *
 * Opens the conduit device, installs the port's MAC address as a host unicast
 * address and enables the port. If enabling the port fails, the host unicast
 * address is uninstalled again.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int dsa_user_open(struct net_device *dev)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        int ret;

        ret = dev_open(conduit, NULL);
        if (ret < 0) {
                netdev_err(dev, "failed to open conduit %s\n", conduit->name);
                return ret;
        }

        ret = dsa_user_host_uc_install(dev, dev->dev_addr);
        if (ret)
                return ret;

        ret = dsa_port_enable_rt(dp, dev->phydev);
        if (ret)
                dsa_user_host_uc_uninstall(dev);

        return ret;
}

/* Close handler for a DSA user port: disables the port and then uninstalls
 * its host unicast address. Always returns 0.
 */
static int dsa_user_close(struct net_device *dev)
{
        dsa_port_disable_rt(dsa_user_to_port(dev));
        dsa_user_host_uc_uninstall(dev);

        return 0;
}

/* Derive the host flooding settings of user port @dev from its flags.
 *
 * Unicast flooding is requested when IFF_PROMISC is set; multicast flooding
 * when IFF_PROMISC or IFF_ALLMULTI is set.
 */
static void dsa_user_manage_host_flood(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        bool promisc = !!(dev->flags & IFF_PROMISC);
        bool allmulti = !!(dev->flags & IFF_ALLMULTI);

        dsa_port_set_host_flood(dp, promisc, promisc || allmulti);
}

static void dsa_user_change_rx_flags(struct net_device *dev, int change)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (change & IFF_ALLMULTI)
                dev_set_allmulti(conduit,
                                 dev->flags & IFF_ALLMULTI ? 1 : -1);
        if (change & IFF_PROMISC)
                dev_set_promiscuity(conduit,
                                    dev->flags & IFF_PROMISC ? 1 : -1);

        if (dsa_switch_supports_uc_filtering(ds) &&
            dsa_switch_supports_mc_filtering(ds))
                dsa_user_manage_host_flood(dev);
}

/* RX mode handler for a DSA user port.
 *
 * Passes the device's multicast list, then its unicast list, to
 * __dev_mc_sync() / __dev_uc_sync() with the DSA sync and unsync callbacks
 * defined in this file.
 */
static void dsa_user_set_rx_mode(struct net_device *dev)
{
        __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc);
        __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc);
}

/* Change the MAC address of a DSA user port.
 *
 * @dev: user port network device
 * @a: pointer to a struct sockaddr carrying the new address in sa_data
 *
 * The address is validated, offered to the driver's optional
 * ->port_set_mac_address() hook and, if the port is up, installed as a host
 * unicast address before the old one is uninstalled. Finally the device's
 * address is updated.
 *
 * Returns 0 on success, -EADDRNOTAVAIL for an invalid Ethernet address, or
 * the error returned by the driver hook or by dsa_user_host_uc_install().
 */
static int dsa_user_set_mac_address(struct net_device *dev, void *a)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        struct sockaddr *sa = a;
        int ret;

        if (!is_valid_ether_addr(sa->sa_data))
                return -EADDRNOTAVAIL;

        if (ds->ops->port_set_mac_address) {
                ret = ds->ops->port_set_mac_address(ds, dp->index,
                                                    sa->sa_data);
                if (ret)
                        return ret;
        }

        /* If the port is down, the address isn't synced yet to hardware or
         * to the DSA conduit, so only a port that is up needs the new address
         * installed and the old one removed.
         */
        if (dev->flags & IFF_UP) {
                ret = dsa_user_host_uc_install(dev, sa->sa_data);
                if (ret)
                        return ret;

                /* dev->dev_addr still holds the old address at this point. */
                dsa_user_host_uc_uninstall(dev);
        }

        eth_hw_addr_set(dev, sa->sa_data);

        return 0;
}

/* State shared between dsa_user_fdb_dump() and its per-entry callback. */
struct dsa_user_dump_ctx {
        /* Device whose ifindex is reported in each dumped entry */
        struct net_device *dev;
        /* Buffer the netlink messages are appended to */
        struct sk_buff *skb;
        /* Netlink dump callback; provides portid, sequence number and ctx */
        struct netlink_callback *cb;
        /* Running entry counter, compared against the resume index */
        int idx;
};

/* Per-entry callback handed to dsa_port_fdb_dump(): append one RTM_NEWNEIGH
 * message describing @addr (and @vid, when non-zero) to the dump skb.
 *
 * @data is the struct dsa_user_dump_ctx of the ongoing dump. Entries whose
 * running index is below ctx->fdb_idx are only counted, not emitted.
 *
 * Returns 0 when the entry was emitted or skipped (the index is advanced in
 * both cases), or -EMSGSIZE when the message header or one of its attributes
 * did not fit; the index is left untouched on failure.
 */
static int
dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid,
                          bool is_static, void *data)
{
        struct dsa_user_dump_ctx *dump = data;
        struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx;
        u32 portid = NETLINK_CB(dump->cb->skb).portid;
        u32 seq = dump->cb->nlh->nlmsg_seq;
        struct sk_buff *skb = dump->skb;
        struct nlmsghdr *nlh;
        struct ndmsg *ndm;

        if (dump->idx >= ctx->fdb_idx) {
                nlh = nlmsg_put(skb, portid, seq, RTM_NEWNEIGH,
                                sizeof(*ndm), NLM_F_MULTI);
                if (!nlh)
                        return -EMSGSIZE;

                ndm = nlmsg_data(nlh);
                ndm->ndm_family  = AF_BRIDGE;
                ndm->ndm_pad1    = 0;
                ndm->ndm_pad2    = 0;
                ndm->ndm_flags   = NTF_SELF;
                ndm->ndm_type    = 0;
                ndm->ndm_ifindex = dump->dev->ifindex;
                ndm->ndm_state   = is_static ? NUD_NOARP : NUD_REACHABLE;

                /* The VLAN attribute is only attached for a non-zero VID */
                if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr) ||
                    (vid && nla_put_u16(skb, NDA_VLAN, vid))) {
                        nlmsg_cancel(skb, nlh);
                        return -EMSGSIZE;
                }

                nlmsg_end(skb, nlh);
        }

        dump->idx++;

        return 0;
}

static int
dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                  struct net_device *dev, struct net_device *filter_dev,
                  int *idx)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_user_dump_ctx dump = {
                .dev = dev,
                .skb = skb,
                .cb = cb,
                .idx = *idx,
        };
        int err;

        err = dsa_port_fdb_dump(dp, dsa_user_port_fdb_do_dump, &dump);
        *idx = dump.idx;

        return err;
}

static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
        struct dsa_user_priv *p = netdev_priv(dev);
        struct dsa_switch *ds = p->dp->ds;
        int port = p->dp->index;

        /* Pass through to switch driver if it supports timestamping */
        switch (cmd) {
        case SIOCGHWTSTAMP:
                if (ds->ops->port_hwtstamp_get)
                        return ds->ops->port_hwtstamp_get(ds, port, ifr);
                break;
        case SIOCSHWTSTAMP:
                if (ds->ops->port_hwtstamp_set)
                        return ds->ops->port_hwtstamp_set(ds, port, ifr);
                break;
        }

        return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
}

static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx,
                                  const struct switchdev_attr *attr,
                                  struct netlink_ext_ack *extack)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        int ret;

        if (ctx && ctx != dp)
                return 0;

        switch (attr->id) {
        case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
                if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_set_state(dp, attr->u.stp_state, true);
                break;
        case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
                if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
                break;
        case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
                if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
                                              extack);
                break;
        case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
                if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_ageing_time(dp, attr->u.ageing_time);
                break;
        case SWITCHDEV_ATTR_ID_BRIDGE_MST:
                if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_mst_enable(dp, attr->u.mst, extack);
                break;
        case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
                if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
                                                extack);
                break;
        case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
                if (!dsa_port_offloads_bridge_port(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
                break;
        case SWITCHDEV_ATTR_ID_VLAN_MSTI:
                if (!dsa_port_offloads_bridge_dev(dp, attr->orig_dev))
                        return -EOPNOTSUPP;

                ret = dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
                break;
        default:
                ret = -EOPNOTSUPP;
                break;
        }

        return ret;
}

/* Look for a VLAN upper of @user that carries the same VID as @vlan.
 *
 * Returns -EBUSY when such an upper exists, 0 otherwise.
 *
 * Must be called under rcu_read_lock().
 */
static int
dsa_user_vlan_check_for_8021q_uppers(struct net_device *user,
                                     const struct switchdev_obj_port_vlan *vlan)
{
        struct net_device *upper_dev;
        struct list_head *iter;

        netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
                /* Only VLAN uppers can clash; other uppers are ignored */
                if (is_vlan_dev(upper_dev) &&
                    vlan_dev_vlan_id(upper_dev) == vlan->vid)
                        return -EBUSY;
        }

        return 0;
}

/* Add the bridge VLAN described by @obj to the port behind @dev.
 *
 * Returns 0 without touching hardware when VLAN configuration is skipped
 * for this port, the error from the 802.1Q upper check when the bridge is
 * VLAN-enabled and an upper already uses the VID, and otherwise the result
 * of dsa_port_vlan_add().
 */
static int dsa_user_vlan_add(struct net_device *dev,
                             const struct switchdev_obj *obj,
                             struct netlink_ext_ack *extack)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct switchdev_obj_port_vlan *vlan;
        int clash;

        if (dsa_port_skip_vlan_configuration(dp)) {
                NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
                return 0;
        }

        vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);

        /* Without VLAN filtering on the bridge there is no conflict to
         * detect, so the VLAN can be added right away.
         */
        if (!br_vlan_enabled(dsa_port_bridge_dev_get(dp)))
                return dsa_port_vlan_add(dp, vlan, extack);

        /* Deny adding a bridge VLAN when there is already an 802.1Q upper
         * with the same VID.
         */
        rcu_read_lock();
        clash = dsa_user_vlan_check_for_8021q_uppers(dev, vlan);
        rcu_read_unlock();
        if (clash) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Port already has a VLAN upper with this VID");
                return clash;
        }

        return dsa_port_vlan_add(dp, vlan, extack);
}

/* Offload a VLAN installed on the bridge or on a foreign interface by
 * installing it as a VLAN towards the CPU port.
 */
static int dsa_user_host_vlan_add(struct net_device *dev,
                                  const struct switchdev_obj *obj,
                                  struct netlink_ext_ack *extack)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct switchdev_obj_port_vlan vlan;

        /* Do nothing if this is a software bridge */
        if (!dp->bridge)
                return -EOPNOTSUPP;

        if (dsa_port_skip_vlan_configuration(dp)) {
                NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
                return 0;
        }

        vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);

        /* Even though drivers often handle CPU membership in special ways,
         * it doesn't make sense to program a PVID, so clear this flag.
         */
        vlan.flags &= ~BRIDGE_VLAN_INFO_PVID;

        return dsa_port_host_vlan_add(dp, &vlan, extack);
}

/* switchdev object-add handler for a user port.
 *
 * @ctx: when non-NULL and different from this port, the request is ignored
 *       and 0 is returned.
 *
 * MDB, host MDB and MRP objects are rejected with -EOPNOTSUPP unless
 * obj->orig_dev is the bridge port / bridge device this port offloads. A
 * port VLAN goes to dsa_user_vlan_add() when it targets the offloaded
 * bridge port and to dsa_user_host_vlan_add() otherwise. Unknown object
 * types yield -EOPNOTSUPP.
 */
static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx,
                                 const struct switchdev_obj *obj,
                                 struct netlink_ext_ack *extack)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        if (ctx && ctx != dp)
                return 0;

        switch (obj->id) {
        case SWITCHDEV_OBJ_ID_PORT_MDB:
                if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
        case SWITCHDEV_OBJ_ID_HOST_MDB:
                if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_bridge_host_mdb_add(dp,
                                                    SWITCHDEV_OBJ_PORT_MDB(obj));
        case SWITCHDEV_OBJ_ID_PORT_VLAN:
                if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
                        return dsa_user_host_vlan_add(dev, obj, extack);

                return dsa_user_vlan_add(dev, obj, extack);
        case SWITCHDEV_OBJ_ID_MRP:
                if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
        case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
                if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_mrp_add_ring_role(dp,
                                                  SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
        default:
                return -EOPNOTSUPP;
        }
}

/* Remove the bridge VLAN described by @obj from the port behind @dev.
 *
 * Returns 0 when VLAN configuration is skipped for this port, otherwise the
 * result of dsa_port_vlan_del().
 */
static int dsa_user_vlan_del(struct net_device *dev,
                             const struct switchdev_obj *obj)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        if (dsa_port_skip_vlan_configuration(dp))
                return 0;

        return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
}

/* Remove a host VLAN previously requested for the port behind @dev.
 *
 * Returns -EOPNOTSUPP when dp->bridge is unset, 0 when VLAN configuration
 * is skipped for this port, otherwise the result of
 * dsa_port_host_vlan_del().
 */
static int dsa_user_host_vlan_del(struct net_device *dev,
                                  const struct switchdev_obj *obj)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        /* Do nothing if this is a software bridge */
        if (!dp->bridge)
                return -EOPNOTSUPP;

        if (dsa_port_skip_vlan_configuration(dp))
                return 0;

        return dsa_port_host_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
}

/* switchdev object-delete handler for a user port.
 *
 * @ctx: when non-NULL and different from this port, the request is ignored
 *       and 0 is returned.
 *
 * MDB, host MDB and MRP objects are rejected with -EOPNOTSUPP unless
 * obj->orig_dev is the bridge port / bridge device this port offloads. A
 * port VLAN goes to dsa_user_vlan_del() when it targets the offloaded
 * bridge port and to dsa_user_host_vlan_del() otherwise. Unknown object
 * types yield -EOPNOTSUPP.
 */
static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx,
                                 const struct switchdev_obj *obj)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        if (ctx && ctx != dp)
                return 0;

        switch (obj->id) {
        case SWITCHDEV_OBJ_ID_PORT_MDB:
                if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
        case SWITCHDEV_OBJ_ID_HOST_MDB:
                if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_bridge_host_mdb_del(dp,
                                                    SWITCHDEV_OBJ_PORT_MDB(obj));
        case SWITCHDEV_OBJ_ID_PORT_VLAN:
                if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
                        return dsa_user_host_vlan_del(dev, obj);

                return dsa_user_vlan_del(dev, obj);
        case SWITCHDEV_OBJ_ID_MRP:
                if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
        case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
                if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
                        return -EOPNOTSUPP;

                return dsa_port_mrp_del_ring_role(dp,
                                                  SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
        default:
                return -EOPNOTSUPP;
        }
}

/* Send @skb through the netpoll instance stored in the user port's private
 * data. Without CONFIG_NET_POLL_CONTROLLER this path hits BUG().
 */
static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev,
                                             struct sk_buff *skb)
{
#ifndef CONFIG_NET_POLL_CONTROLLER
        BUG();
        return NETDEV_TX_OK;
#else
        struct dsa_user_priv *priv = netdev_priv(dev);

        return netpoll_send_skb(priv->netpoll, skb);
#endif
}

/* Give the switch driver a chance to timestamp @skb on transmit.
 *
 * The port_txtstamp hook is only invoked when the skb's tx_flags intersect
 * SKBTX_HW_TSTAMP_NOBPF and the driver implements the hook.
 */
static void dsa_skb_tx_timestamp(struct dsa_user_priv *p,
                                 struct sk_buff *skb)
{
        struct dsa_switch *ds = p->dp->ds;

        if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF) &&
            ds->ops->port_txtstamp)
                ds->ops->port_txtstamp(ds, p->dp->index, skb);
}

/* Hand an already tagged @skb over for transmission on behalf of user port
 * @dev: via netpoll when a netpoll transmission is running on @dev,
 * otherwise by queueing it on the conduit interface.
 */
netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
{
        if (likely(!netpoll_tx_running(dev))) {
                /* Queue the SKB for transmission on the parent interface,
                 * but do not modify its EtherType
                 */
                skb->dev = dsa_user_to_conduit(dev);
                dev_queue_xmit(skb);

                return NETDEV_TX_OK;
        }

        /* SKB for netpoll still need to be mangled with the
         * protocol-specific tag to be successfully transmitted
         */
        return dsa_user_netpoll_send_skb(dev, skb);
}
EXPORT_SYMBOL_GPL(dsa_enqueue_skb);

/* Transmit entry point of a user port.
 *
 * Accounts the packet, clears skb->cb, offers the skb for TX timestamping,
 * makes sure head/tail room is writable, pads when tailroom is needed, lets
 * the tagger (p->xmit) mangle the skb and finally enqueues the result.
 *
 * Always returns NETDEV_TX_OK on the local paths; an skb that cannot be
 * prepared or tagged is freed here.
 */
static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct dsa_user_priv *p = netdev_priv(dev);
        struct sk_buff *nskb;

        /* Counted up front, using the length before any padding or tagging */
        dev_sw_netstats_tx_add(dev, 1, skb->len);

        memset(skb->cb, 0, sizeof(skb->cb));

        /* Handle tx timestamp if any */
        dsa_skb_tx_timestamp(p, skb);

        if (skb_ensure_writable_head_tail(skb, dev)) {
                dev_kfree_skb_any(skb);
                return NETDEV_TX_OK;
        }

        /* needed_tailroom should still be 'warm' in the cache line from
         * skb_ensure_writable_head_tail(), which has also ensured that
         * padding is safe.
         */
        if (dev->needed_tailroom)
                eth_skb_pad(skb);

        /* Transmit function may have to reallocate the original SKB,
         * in which case it must have freed it. Only free it here on error.
         */
        nskb = p->xmit(skb, dev);
        if (!nskb) {
                kfree_skb(skb);
                return NETDEV_TX_OK;
        }

        return dsa_enqueue_skb(nskb, dev);
}

/* ethtool operations *******************************************************/

/* Fill in the fixed ethtool driver information reported for every user
 * port: driver name, firmware version and bus info.
 */
static void dsa_user_get_drvinfo(struct net_device *dev,
                                 struct ethtool_drvinfo *drvinfo)
{
        strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
        strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
        strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
}

/* Return the value of the switch driver's get_regs_len hook for this port,
 * or -EOPNOTSUPP when the driver does not implement it.
 */
static int dsa_user_get_regs_len(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_regs_len)
                return -EOPNOTSUPP;

        return ds->ops->get_regs_len(ds, dp->index);
}

/* Forward a register dump request to the switch driver's get_regs hook;
 * nothing is written when the hook is absent.
 */
static void
dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_regs)
                return;

        ds->ops->get_regs(ds, dp->index, regs, _p);
}

static int dsa_user_nway_reset(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        return phylink_ethtool_nway_reset(dp->pl);
}

static int dsa_user_get_eeprom_len(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->cd && ds->cd->eeprom_len)
                return ds->cd->eeprom_len;

        if (ds->ops->get_eeprom_len)
                return ds->ops->get_eeprom_len(ds);

        return 0;
}

/* Read the switch EEPROM through the driver's get_eeprom hook; returns
 * -EOPNOTSUPP when the driver does not implement it.
 */
static int dsa_user_get_eeprom(struct net_device *dev,
                               struct ethtool_eeprom *eeprom, u8 *data)
{
        struct dsa_switch *ds = dsa_user_to_port(dev)->ds;

        if (!ds->ops->get_eeprom)
                return -EOPNOTSUPP;

        return ds->ops->get_eeprom(ds, eeprom, data);
}

/* Write the switch EEPROM through the driver's set_eeprom hook; returns
 * -EOPNOTSUPP when the driver does not implement it.
 */
static int dsa_user_set_eeprom(struct net_device *dev,
                               struct ethtool_eeprom *eeprom, u8 *data)
{
        struct dsa_switch *ds = dsa_user_to_port(dev)->ds;

        if (!ds->ops->set_eeprom)
                return -EOPNOTSUPP;

        return ds->ops->set_eeprom(ds, eeprom, data);
}

/* Provide ethtool string tables for a user port.
 *
 * For ETH_SS_STATS the four software counter names come first, followed by
 * whatever the driver's get_strings hook appends. For ETH_SS_TEST the
 * generic selftest names are used. Other string sets are left untouched.
 */
static void dsa_user_get_strings(struct net_device *dev,
                                 uint32_t stringset, uint8_t *data)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        switch (stringset) {
        case ETH_SS_STATS:
                ethtool_puts(&data, "tx_packets");
                ethtool_puts(&data, "tx_bytes");
                ethtool_puts(&data, "rx_packets");
                ethtool_puts(&data, "rx_bytes");
                /* ethtool_puts() advanced data past the names above */
                if (ds->ops->get_strings)
                        ds->ops->get_strings(ds, dp->index, stringset, data);
                break;
        case ETH_SS_TEST:
                net_selftest_get_strings(data);
                break;
        default:
                break;
        }
}

/* Fill the ethtool statistics array of a user port.
 *
 * data[0..3] receive the per-CPU software tx_packets, tx_bytes, rx_packets
 * and rx_bytes counters summed over all possible CPUs. The sums are added
 * to the existing contents, so the caller presumably passes a zeroed
 * buffer. Driver statistics, if any, follow from data[4] onwards.
 */
static void dsa_user_get_ethtool_stats(struct net_device *dev,
                                       struct ethtool_stats *stats,
                                       uint64_t *data)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct pcpu_sw_netstats *pcpu = per_cpu_ptr(dev->tstats, cpu);
                u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
                unsigned int seq;

                /* Re-read until a consistent snapshot has been taken */
                do {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        tx_packets = u64_stats_read(&pcpu->tx_packets);
                        tx_bytes = u64_stats_read(&pcpu->tx_bytes);
                        rx_packets = u64_stats_read(&pcpu->rx_packets);
                        rx_bytes = u64_stats_read(&pcpu->rx_bytes);
                } while (u64_stats_fetch_retry(&pcpu->syncp, seq));

                data[0] += tx_packets;
                data[1] += tx_bytes;
                data[2] += rx_packets;
                data[3] += rx_bytes;
        }

        if (!ds->ops->get_ethtool_stats)
                return;

        ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
}

/* Report how many strings a given ethtool string set contains.
 *
 * ETH_SS_STATS: the driver's count (0 without a get_sset_count hook) plus
 * the four software counters; a negative driver result is returned as is.
 * ETH_SS_TEST: the generic selftest count. Anything else: -EOPNOTSUPP.
 */
static int dsa_user_get_sset_count(struct net_device *dev, int sset)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        int count = 0;

        switch (sset) {
        case ETH_SS_STATS:
                if (ds->ops->get_sset_count) {
                        count = ds->ops->get_sset_count(ds, dp->index, sset);
                        if (count < 0)
                                return count;
                }

                return count + 4;
        case ETH_SS_TEST:
                return net_selftest_get_count();
        default:
                return -EOPNOTSUPP;
        }
}

/* Forward an ethtool PHY statistics request to the switch driver's
 * get_eth_phy_stats hook; @phy_stats is left untouched without the hook.
 */
static void dsa_user_get_eth_phy_stats(struct net_device *dev,
                                       struct ethtool_eth_phy_stats *phy_stats)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_eth_phy_stats)
                return;

        ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
}

/* Forward an ethtool MAC statistics request to the switch driver's
 * get_eth_mac_stats hook; @mac_stats is left untouched without the hook.
 */
static void dsa_user_get_eth_mac_stats(struct net_device *dev,
                                       struct ethtool_eth_mac_stats *mac_stats)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_eth_mac_stats)
                return;

        ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
}

/* Forward an ethtool MAC control statistics request to the switch driver's
 * get_eth_ctrl_stats hook; @ctrl_stats is left untouched without the hook.
 */
static void
dsa_user_get_eth_ctrl_stats(struct net_device *dev,
                            struct ethtool_eth_ctrl_stats *ctrl_stats)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_eth_ctrl_stats)
                return;

        ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
}

/* Forward an ethtool RMON statistics request to the switch driver's
 * get_rmon_stats hook; @rmon_stats and @ranges are left untouched without
 * the hook.
 */
static void
dsa_user_get_rmon_stats(struct net_device *dev,
                        struct ethtool_rmon_stats *rmon_stats,
                        const struct ethtool_rmon_hist_range **ranges)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_rmon_stats)
                return;

        ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
}

/* Forward an ethtool timestamping statistics request to the switch driver's
 * get_ts_stats hook; @ts_stats is left untouched without the hook.
 */
static void dsa_user_get_ts_stats(struct net_device *dev,
                                  struct ethtool_ts_stats *ts_stats)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_ts_stats)
                return;

        ds->ops->get_ts_stats(ds, dp->index, ts_stats);
}

/* Run the ethtool selftest for a user port: the switch driver's self_test
 * hook when it exists, the generic net_selftest() otherwise.
 */
static void dsa_user_net_selftest(struct net_device *ndev,
                                  struct ethtool_test *etest, u64 *buf)
{
        struct dsa_port *dp = dsa_user_to_port(ndev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->self_test) {
                net_selftest(ndev, etest, buf);
                return;
        }

        ds->ops->self_test(ds, dp->index, etest, buf);
}

/* Query the MAC Merge state through the switch driver's get_mm hook;
 * returns -EOPNOTSUPP when the driver does not implement it.
 */
static int dsa_user_get_mm(struct net_device *dev,
                           struct ethtool_mm_state *state)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->get_mm)
                return ds->ops->get_mm(ds, dp->index, state);

        return -EOPNOTSUPP;
}

/* Apply a MAC Merge configuration through the switch driver's set_mm hook;
 * returns -EOPNOTSUPP when the driver does not implement it.
 */
static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
                           struct netlink_ext_ack *extack)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->set_mm)
                return ds->ops->set_mm(ds, dp->index, cfg, extack);

        return -EOPNOTSUPP;
}

/* Forward a MAC Merge statistics request to the switch driver's
 * get_mm_stats hook; @stats is left untouched without the hook.
 */
static void dsa_user_get_mm_stats(struct net_device *dev,
                                  struct ethtool_mm_stats *stats)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_mm_stats)
                return;

        ds->ops->get_mm_stats(ds, dp->index, stats);
}

/* Report Wake-on-LAN settings: phylink fills @w first, then the switch
 * driver's get_wol hook (when present) gets to update the same structure.
 */
static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        phylink_ethtool_get_wol(dp->pl, w);

        if (!ds->ops->get_wol)
                return;

        ds->ops->get_wol(ds, dp->index, w);
}

/* Apply Wake-on-LAN settings.
 *
 * phylink is always asked first, but its return value is not used: the
 * result is that of the switch driver's set_wol hook, or -EOPNOTSUPP when
 * the driver does not implement it.
 */
static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        phylink_ethtool_set_wol(dp->pl, w);

        if (!ds->ops->set_wol)
                return -EOPNOTSUPP;

        return ds->ops->set_wol(ds, dp->index, w);
}

/* Configure Energy Efficient Ethernet on a user port.
 *
 * Returns -EOPNOTSUPP when the switch does not report EEE support for the
 * port. When phylink does not manage LPI for this switch, a PHY (-ENODEV
 * otherwise) and a set_mac_eee hook (-EOPNOTSUPP otherwise) are mandatory.
 * An implemented set_mac_eee hook is always called and its error returned;
 * on success the request is completed by phylink_ethtool_set_eee().
 */
static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        int err;

        /* Check whether the switch supports EEE */
        if (!ds->ops->support_eee)
                return -EOPNOTSUPP;

        if (!ds->ops->support_eee(ds, dp->index))
                return -EOPNOTSUPP;

        /* If the port is using phylink managed EEE, then an unimplemented
         * set_mac_eee() is permissible. Otherwise the port's PHY and MAC
         * both need to be EEE capable.
         */
        if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) {
                if (!dev->phydev)
                        return -ENODEV;

                if (!ds->ops->set_mac_eee)
                        return -EOPNOTSUPP;
        }

        if (ds->ops->set_mac_eee) {
                err = ds->ops->set_mac_eee(ds, dp->index, e);
                if (err)
                        return err;
        }

        return phylink_ethtool_set_eee(dp->pl, e);
}

/* Read the Energy Efficient Ethernet state of a user port.
 *
 * Returns -EOPNOTSUPP when the switch does not report EEE support for the
 * port, -ENODEV when the netdev has no PHY, otherwise the result of
 * phylink_ethtool_get_eee().
 */
static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        /* Check whether the switch supports EEE */
        if (!ds->ops->support_eee)
                return -EOPNOTSUPP;

        if (!ds->ops->support_eee(ds, dp->index))
                return -EOPNOTSUPP;

        /* Port's PHY and MAC both need to be EEE capable */
        if (!dev->phydev)
                return -ENODEV;

        return phylink_ethtool_get_eee(dp->pl, e);
}

static int dsa_user_get_link_ksettings(struct net_device *dev,
                                       struct ethtool_link_ksettings *cmd)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        return phylink_ethtool_ksettings_get(dp->pl, cmd);
}

static int dsa_user_set_link_ksettings(struct net_device *dev,
                                       const struct ethtool_link_ksettings *cmd)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        return phylink_ethtool_ksettings_set(dp->pl, cmd);
}

/* Forward an ethtool pause statistics request to the switch driver's
 * get_pause_stats hook; @pause_stats is left untouched without the hook.
 */
static void dsa_user_get_pause_stats(struct net_device *dev,
                                     struct ethtool_pause_stats *pause_stats)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_pause_stats)
                return;

        ds->ops->get_pause_stats(ds, dp->index, pause_stats);
}

static void dsa_user_get_pauseparam(struct net_device *dev,
                                    struct ethtool_pauseparam *pause)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        phylink_ethtool_get_pauseparam(dp->pl, pause);
}

static int dsa_user_set_pauseparam(struct net_device *dev,
                                   struct ethtool_pauseparam *pause)
{
        struct dsa_port *dp = dsa_user_to_port(dev);

        return phylink_ethtool_set_pauseparam(dp->pl, pause);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Allocate a netpoll instance, set it up on the conduit device and store it
 * in the user port's private data.
 *
 * Returns 0 on success, -ENOMEM when the allocation fails, or the error
 * from __netpoll_setup(), in which case the allocation is released again.
 */
static int dsa_user_netpoll_setup(struct net_device *dev)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_user_priv *p = netdev_priv(dev);
        struct netpoll *np;
        int ret;

        np = kzalloc(sizeof(*np), GFP_KERNEL);
        if (!np)
                return -ENOMEM;

        ret = __netpoll_setup(np, conduit);
        if (ret) {
                kfree(np);
                return ret;
        }

        p->netpoll = np;

        return 0;
}

/* Detach and free the netpoll instance of a user port, if there is one.
 * The pointer in the private data is cleared before the instance is freed.
 */
static void dsa_user_netpoll_cleanup(struct net_device *dev)
{
        struct dsa_user_priv *p = netdev_priv(dev);
        struct netpoll *np = p->netpoll;

        if (np) {
                p->netpoll = NULL;
                __netpoll_free(np);
        }
}

/* Poll controller callback for user ports; the body is empty, so calling it
 * has no effect.
 */
static void dsa_user_poll_controller(struct net_device *dev)
{
}
#endif

static struct dsa_mall_tc_entry *
dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
{
        struct dsa_user_priv *p = netdev_priv(dev);
        struct dsa_mall_tc_entry *mall_tc_entry;

        list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
                if (mall_tc_entry->cookie == cookie)
                        return mall_tc_entry;

        return NULL;
}

/* Offload a matchall "mirred" filter as a port mirror.
 *
 * @ingress:        passed on to the driver and recorded in the mirror entry
 * @ingress_target: whether the mirror targets the ingress of act->dev
 *
 * Only the first action entry of the rule is examined. The mirror
 * destination is the user port behind act->dev, or this port's CPU port
 * when mirroring to the ingress of a DSA user port or to a foreign
 * interface. Both CPU-port cases are refused if the filter has skip_sw set.
 *
 * Returns 0 on success, with the new entry appended to the port's
 * mall_tc_list. Returns -EOPNOTSUPP for unsupported requests, -EINVAL when
 * the action has no target device, -ENOMEM on allocation failure, or the
 * error from the driver's port_mirror_add hook.
 */
static int
dsa_user_add_cls_matchall_mirred(struct net_device *dev,
                                 struct tc_cls_matchall_offload *cls,
                                 bool ingress, bool ingress_target)
{
        struct netlink_ext_ack *extack = cls->common.extack;
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_user_priv *p = netdev_priv(dev);
        struct dsa_mall_tc_entry *entry;
        struct dsa_switch *ds = dp->ds;
        struct flow_action_entry *act;
        struct dsa_port *to_dp;
        int err;

        if (cls->common.protocol != htons(ETH_P_ALL)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Can only offload \"protocol all\" matchall filter");
                return -EOPNOTSUPP;
        }

        if (!ds->ops->port_mirror_add) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Switch does not support mirroring operation");
                return -EOPNOTSUPP;
        }

        if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack))
                return -EOPNOTSUPP;

        act = &cls->rule->action.entries[0];

        if (!act->dev)
                return -EINVAL;

        if (!dsa_user_dev_check(act->dev)) {
                /* Handle mirroring to foreign target ports as a mirror towards
                 * the CPU. The software tc rule will take the packets from
                 * there.
                 */
                if (cls->common.skip_sw) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "Can only mirred to CPU if filter also runs in software");
                        return -EOPNOTSUPP;
                }
                to_dp = dp->cpu_dp;
        } else if (ingress_target) {
                /* We can only fulfill this using software assist */
                if (cls->common.skip_sw) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "Can only mirred to ingress of DSA user port if filter also runs in software");
                        return -EOPNOTSUPP;
                }
                to_dp = dp->cpu_dp;
        } else {
                to_dp = dsa_user_to_port(act->dev);
        }

        if (dp->ds != to_dp->ds) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Cross-chip mirroring not implemented");
                return -EOPNOTSUPP;
        }

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->cookie = cls->cookie;
        entry->type = DSA_PORT_MALL_MIRROR;
        entry->mirror.to_local_port = to_dp->index;
        entry->mirror.ingress = ingress;

        err = ds->ops->port_mirror_add(ds, dp->index, &entry->mirror, ingress,
                                       extack);
        if (err) {
                kfree(entry);
                return err;
        }

        /* Keep the entry on the port's list only once the driver accepted it */
        list_add_tail(&entry->list, &p->mall_tc_list);

        return 0;
}

/* Offload a matchall filter whose single action is "police" as the port
 * policer of the DSA user port @dev.
 *
 * The request is refused with -EOPNOTSUPP when the switch driver has no
 * port_policer_add operation, when the filter is not attached on ingress, or
 * when the basic hardware stats check on the action fails. A port may hold
 * only one policer entry; a second request fails with -EEXIST.
 *
 * On success the new entry, tagged with the filter cookie, is appended to the
 * port's mall_tc_list. Returns 0 or a negative errno.
 */
static int
dsa_user_add_cls_matchall_police(struct net_device *dev,
                                 struct tc_cls_matchall_offload *cls,
                                 bool ingress)
{
        struct netlink_ext_ack *extack = cls->common.extack;
        struct dsa_user_priv *priv = netdev_priv(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct flow_action_entry *police;
        struct dsa_mall_tc_entry *entry;
        struct dsa_switch *ds = dp->ds;
        int err;

        if (!ds->ops->port_policer_add) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Policing offload not implemented");
                return -EOPNOTSUPP;
        }

        if (!ingress) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Only supported on ingress qdisc");
                return -EOPNOTSUPP;
        }

        if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack))
                return -EOPNOTSUPP;

        /* Refuse a second policer on the same port */
        list_for_each_entry(entry, &priv->mall_tc_list, list) {
                if (entry->type != DSA_PORT_MALL_POLICER)
                        continue;

                NL_SET_ERR_MSG_MOD(extack,
                                   "Only one port policer allowed");
                return -EEXIST;
        }

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        police = &cls->rule->action.entries[0];

        entry->cookie = cls->cookie;
        entry->type = DSA_PORT_MALL_POLICER;
        entry->policer.rate_bytes_per_sec = police->police.rate_bytes_ps;
        entry->policer.burst = police->police.burst;

        err = ds->ops->port_policer_add(ds, dp->index, &entry->policer);
        if (err) {
                kfree(entry);
                return err;
        }

        list_add_tail(&entry->list, &priv->mall_tc_list);

        return 0;
}

/* Install a matchall filter on a DSA user port.
 *
 * Only filters with exactly one action are offloadable. Mirred actions
 * (egress or ingress target) and police actions are handed to their dedicated
 * helpers; anything else is rejected with -EOPNOTSUPP.
 */
static int dsa_user_add_cls_matchall(struct net_device *dev,
                                     struct tc_cls_matchall_offload *cls,
                                     bool ingress)
{
        const struct flow_action *action = &cls->rule->action;
        struct netlink_ext_ack *extack = cls->common.extack;
        const struct flow_action_entry *act;

        if (!flow_offload_has_one_action(action)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Cannot offload matchall filter with more than one action");
                return -EOPNOTSUPP;
        }

        act = &action->entries[0];

        if (act->id == FLOW_ACTION_MIRRED)
                return dsa_user_add_cls_matchall_mirred(dev, cls, ingress,
                                                        false);

        if (act->id == FLOW_ACTION_MIRRED_INGRESS)
                return dsa_user_add_cls_matchall_mirred(dev, cls, ingress,
                                                        true);

        if (act->id == FLOW_ACTION_POLICE)
                return dsa_user_add_cls_matchall_police(dev, cls, ingress);

        NL_SET_ERR_MSG_MOD(extack, "Unknown action");

        return -EOPNOTSUPP;
}

/* Remove the matchall offload identified by the filter cookie, if any.
 *
 * The entry is unlinked from the port's list, the matching driver teardown
 * operation is invoked when the driver provides one, and the entry is freed.
 * An unknown cookie is silently ignored.
 */
static void dsa_user_del_cls_matchall(struct net_device *dev,
                                      struct tc_cls_matchall_offload *cls)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_mall_tc_entry *entry;
        struct dsa_switch *ds = dp->ds;

        entry = dsa_user_mall_tc_entry_find(dev, cls->cookie);
        if (!entry)
                return;

        list_del(&entry->list);

        if (entry->type == DSA_PORT_MALL_MIRROR) {
                if (ds->ops->port_mirror_del)
                        ds->ops->port_mirror_del(ds, dp->index,
                                                 &entry->mirror);
        } else if (entry->type == DSA_PORT_MALL_POLICER) {
                if (ds->ops->port_policer_del)
                        ds->ops->port_policer_del(ds, dp->index);
        } else {
                /* No other entry type is expected on this list */
                WARN_ON(1);
        }

        kfree(entry);
}

/* Handle a matchall classifier offload command for a DSA user port.
 *
 * Only chain 0 is supported. REPLACE installs the filter, DESTROY removes it
 * and always reports success; every other command is -EOPNOTSUPP.
 */
static int dsa_user_setup_tc_cls_matchall(struct net_device *dev,
                                          struct tc_cls_matchall_offload *cls,
                                          bool ingress)
{
        if (cls->common.chain_index)
                return -EOPNOTSUPP;

        if (cls->command == TC_CLSMATCHALL_REPLACE)
                return dsa_user_add_cls_matchall(dev, cls, ingress);

        if (cls->command != TC_CLSMATCHALL_DESTROY)
                return -EOPNOTSUPP;

        dsa_user_del_cls_matchall(dev, cls);

        return 0;
}

/* Forward a flower filter installation to the switch driver, or report
 * -EOPNOTSUPP when the driver has no cls_flower_add operation.
 */
static int dsa_user_add_cls_flower(struct net_device *dev,
                                   struct flow_cls_offload *cls,
                                   bool ingress)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->cls_flower_add)
                return ds->ops->cls_flower_add(ds, dp->index, cls, ingress);

        return -EOPNOTSUPP;
}

/* Forward a flower filter removal to the switch driver, or report
 * -EOPNOTSUPP when the driver has no cls_flower_del operation.
 */
static int dsa_user_del_cls_flower(struct net_device *dev,
                                   struct flow_cls_offload *cls,
                                   bool ingress)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->cls_flower_del)
                return ds->ops->cls_flower_del(ds, dp->index, cls, ingress);

        return -EOPNOTSUPP;
}

/* Forward a flower filter statistics request to the switch driver, or report
 * -EOPNOTSUPP when the driver has no cls_flower_stats operation.
 */
static int dsa_user_stats_cls_flower(struct net_device *dev,
                                     struct flow_cls_offload *cls,
                                     bool ingress)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->cls_flower_stats)
                return ds->ops->cls_flower_stats(ds, dp->index, cls, ingress);

        return -EOPNOTSUPP;
}

/* Dispatch a flower classifier offload command (replace, destroy or stats)
 * to the matching helper; other commands are -EOPNOTSUPP.
 */
static int dsa_user_setup_tc_cls_flower(struct net_device *dev,
                                        struct flow_cls_offload *cls,
                                        bool ingress)
{
        if (cls->command == FLOW_CLS_REPLACE)
                return dsa_user_add_cls_flower(dev, cls, ingress);

        if (cls->command == FLOW_CLS_DESTROY)
                return dsa_user_del_cls_flower(dev, cls, ingress);

        if (cls->command == FLOW_CLS_STATS)
                return dsa_user_stats_cls_flower(dev, cls, ingress);

        return -EOPNOTSUPP;
}

/* Common flow block callback for both directions. @cb_priv is the user port
 * net_device; @ingress tells which hook the block is bound to.
 *
 * Nothing is offloaded unless tc_can_offload() allows it for the device.
 * Matchall and flower classifiers are supported, everything else is
 * -EOPNOTSUPP.
 */
static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
                                      void *cb_priv, bool ingress)
{
        struct net_device *dev = cb_priv;

        if (!tc_can_offload(dev))
                return -EOPNOTSUPP;

        if (type == TC_SETUP_CLSMATCHALL)
                return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress);

        if (type == TC_SETUP_CLSFLOWER)
                return dsa_user_setup_tc_cls_flower(dev, type_data, ingress);

        return -EOPNOTSUPP;
}

static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type,
                                         void *type_data, void *cb_priv)
{
        return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, true);
}

static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type,
                                         void *type_data, void *cb_priv)
{
        return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, false);
}

/* File-wide list of the flow block callbacks bound by DSA user ports. It is
 * published through flow_block_offload::driver_block_list and used for the
 * busy check when binding a block.
 */
static LIST_HEAD(dsa_user_block_cb_list);

/* Bind or unbind a tc flow block on a DSA user port.
 *
 * Only the clsact ingress and egress binder types are supported; each one
 * gets its own callback so the direction is known when filters arrive.
 *
 * Returns 0 on success, -EBUSY if the same callback is already bound for this
 * device, -ENOENT if there is nothing to unbind, the allocation error from
 * flow_block_cb_alloc(), or -EOPNOTSUPP for unsupported binder types and
 * commands.
 */
static int dsa_user_setup_tc_block(struct net_device *dev,
                                   struct flow_block_offload *f)
{
        struct flow_block_cb *block_cb;
        flow_setup_cb_t *cb;

        switch (f->binder_type) {
        case FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS:
                cb = dsa_user_setup_tc_block_cb_ig;
                break;
        case FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS:
                cb = dsa_user_setup_tc_block_cb_eg;
                break;
        default:
                return -EOPNOTSUPP;
        }

        f->driver_block_list = &dsa_user_block_cb_list;

        if (f->command == FLOW_BLOCK_BIND) {
                if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list))
                        return -EBUSY;

                block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
                if (IS_ERR(block_cb))
                        return PTR_ERR(block_cb);

                flow_block_cb_add(block_cb, f);
                list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list);

                return 0;
        }

        if (f->command == FLOW_BLOCK_UNBIND) {
                block_cb = flow_block_cb_lookup(f->block, cb, dev);
                if (!block_cb)
                        return -ENOENT;

                flow_block_cb_remove(block_cb, f);
                list_del(&block_cb->driver_list);

                return 0;
        }

        return -EOPNOTSUPP;
}

/* Relay a TC_SETUP_FT request to the conduit interface of @port. Returns
 * -EOPNOTSUPP when the conduit has no ndo_setup_tc operation.
 */
static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port,
                                   void *type_data)
{
        struct dsa_port *dp = dsa_to_port(ds, port);
        struct net_device *conduit = dsa_port_to_conduit(dp);

        if (conduit->netdev_ops->ndo_setup_tc)
                return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT,
                                                         type_data);

        return -EOPNOTSUPP;
}

/* tc setup entry point for a DSA user port.
 *
 * Flow block and flow table requests are handled here; every other setup type
 * goes to the switch driver's port_setup_tc operation, or is refused with
 * -EOPNOTSUPP when the driver does not have one.
 */
static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type,
                             void *type_data)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (type == TC_SETUP_BLOCK)
                return dsa_user_setup_tc_block(dev, type_data);

        if (type == TC_SETUP_FT)
                return dsa_user_setup_ft_block(ds, dp->index, type_data);

        if (ds->ops->port_setup_tc)
                return ds->ops->port_setup_tc(ds, dp->index, type, type_data);

        return -EOPNOTSUPP;
}

/* Pass an RX network flow classification query to the switch driver, or
 * return -EOPNOTSUPP when the driver has no get_rxnfc operation.
 */
static int dsa_user_get_rxnfc(struct net_device *dev,
                              struct ethtool_rxnfc *nfc, u32 *rule_locs)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->get_rxnfc)
                return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);

        return -EOPNOTSUPP;
}

/* Pass an RX network flow classification update to the switch driver, or
 * return -EOPNOTSUPP when the driver has no set_rxnfc operation.
 */
static int dsa_user_set_rxnfc(struct net_device *dev,
                              struct ethtool_rxnfc *nfc)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->set_rxnfc)
                return ds->ops->set_rxnfc(ds, dp->index, nfc);

        return -EOPNOTSUPP;
}

static int dsa_user_get_ts_info(struct net_device *dev,
                                struct kernel_ethtool_ts_info *ts)
{
        struct dsa_user_priv *p = netdev_priv(dev);
        struct dsa_switch *ds = p->dp->ds;

        if (!ds->ops->get_ts_info)
                return -EOPNOTSUPP;

        return ds->ops->get_ts_info(ds, p->dp->index, ts);
}

/* Add VLAN @vid to the RX filter of DSA user port @dev.
 *
 * The VLAN is installed on the user port and then on the CPU port. When the
 * switch supports unicast or multicast address filtering, the VLAN is also
 * recorded in the port's user_vlans list and a DSA_MC_ADD/DSA_UC_ADD
 * standalone work item is scheduled for every synced address, after which the
 * DSA workqueue is flushed. @proto is not used.
 *
 * Any failure after the user port VLAN has been installed removes what was
 * installed so far, so that an error return leaves no VLAN behind.
 *
 * Returns 0 on success or a negative errno.
 */
static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
                                    u16 vid)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct switchdev_obj_port_vlan vlan = {
                .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
                .vid = vid,
                /* This API only allows programming tagged, non-PVID VIDs */
                .flags = 0,
        };
        struct netlink_ext_ack extack = {0};
        struct dsa_switch *ds = dp->ds;
        struct netdev_hw_addr *ha;
        struct dsa_vlan *v;
        int ret;

        /* User port... */
        ret = dsa_port_vlan_add(dp, &vlan, &extack);
        if (ret) {
                if (extack._msg)
                        netdev_err(dev, "%s\n", extack._msg);
                return ret;
        }

        /* And CPU port... */
        ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
        if (ret) {
                if (extack._msg)
                        netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
                                   extack._msg);
                /* The user port VLAN is already in place; take it out again */
                goto rollback_user;
        }

        if (!dsa_switch_supports_uc_filtering(ds) &&
            !dsa_switch_supports_mc_filtering(ds))
                return 0;

        v = kzalloc(sizeof(*v), GFP_KERNEL);
        if (!v) {
                ret = -ENOMEM;
                goto rollback_host;
        }

        /* The VLAN list and the synced address lists are walked under the
         * device address lock.
         */
        netif_addr_lock_bh(dev);

        v->vid = vid;
        list_add_tail(&v->list, &dp->user_vlans);

        if (dsa_switch_supports_mc_filtering(ds)) {
                netdev_for_each_synced_mc_addr(ha, dev) {
                        dsa_user_schedule_standalone_work(dev, DSA_MC_ADD,
                                                          ha->addr, vid);
                }
        }

        if (dsa_switch_supports_uc_filtering(ds)) {
                netdev_for_each_synced_uc_addr(ha, dev) {
                        dsa_user_schedule_standalone_work(dev, DSA_UC_ADD,
                                                          ha->addr, vid);
                }
        }

        netif_addr_unlock_bh(dev);

        dsa_flush_workqueue();

        return 0;

rollback_host:
        dsa_port_host_vlan_del(dp, &vlan);
rollback_user:
        dsa_port_vlan_del(dp, &vlan);

        return ret;
}

/* Remove VLAN @vid from the RX filter of DSA user port @dev.
 *
 * The VLAN is deleted from the user port and then from the CPU port. When the
 * switch supports unicast or multicast address filtering, the VLAN is also
 * dropped from the port's user_vlans list and a DSA_MC_DEL/DSA_UC_DEL
 * standalone work item is scheduled for every synced address, after which the
 * DSA workqueue is flushed. @proto is not used.
 *
 * Returns 0 on success, the error from the port or host VLAN deletion, or
 * -ENOENT when the VLAN is not on the user_vlans list (at that point the port
 * and host VLAN deletions have already been carried out).
 */
static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
                                     u16 vid)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct switchdev_obj_port_vlan vlan = {
                .vid = vid,
                /* This API only allows programming tagged, non-PVID VIDs */
                .flags = 0,
        };
        struct dsa_switch *ds = dp->ds;
        struct netdev_hw_addr *ha;
        struct dsa_vlan *v;
        int err;

        /* User port first... */
        err = dsa_port_vlan_del(dp, &vlan);
        if (err)
                return err;

        /* ...then the CPU port */
        err = dsa_port_host_vlan_del(dp, &vlan);
        if (err)
                return err;

        /* Without address filtering there is no per-VLAN bookkeeping */
        if (!dsa_switch_supports_uc_filtering(ds) &&
            !dsa_switch_supports_mc_filtering(ds))
                return 0;

        /* The VLAN list and the synced address lists are walked under the
         * device address lock.
         */
        netif_addr_lock_bh(dev);

        v = dsa_vlan_find(&dp->user_vlans, &vlan);
        if (!v) {
                netif_addr_unlock_bh(dev);
                return -ENOENT;
        }

        list_del(&v->list);
        kfree(v);

        if (dsa_switch_supports_mc_filtering(ds)) {
                netdev_for_each_synced_mc_addr(ha, dev) {
                        dsa_user_schedule_standalone_work(dev, DSA_MC_DEL,
                                                          ha->addr, vid);
                }
        }

        if (dsa_switch_supports_uc_filtering(ds)) {
                netdev_for_each_synced_uc_addr(ha, dev) {
                        dsa_user_schedule_standalone_work(dev, DSA_UC_DEL,
                                                          ha->addr, vid);
                }
        }

        netif_addr_unlock_bh(dev);

        /* Flush the DSA workqueue after scheduling, outside the lock */
        dsa_flush_workqueue();

        return 0;
}

/* vlan_for_each() callback: add @vid to the user port passed in @arg. The
 * protocol comes from the VLAN upper @vdev when there is one, otherwise
 * 802.1Q is assumed.
 */
static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg)
{
        __be16 proto = htons(ETH_P_8021Q);

        if (vdev)
                proto = vlan_dev_vlan_proto(vdev);

        return dsa_user_vlan_rx_add_vid(arg, proto, vid);
}

/* vlan_for_each() callback: remove @vid from the user port passed in @arg.
 * The protocol comes from the VLAN upper @vdev when there is one, otherwise
 * 802.1Q is assumed.
 */
static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg)
{
        __be16 proto = htons(ETH_P_8021Q);

        if (vdev)
                proto = vlan_dev_vlan_proto(vdev);

        return dsa_user_vlan_rx_kill_vid(arg, proto, vid);
}

/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
 * filtering is enabled. The baseline is that only ports that offload a
 * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
 * but there are exceptions for quirky hardware.
 *
 * If ds->vlan_filtering_is_global = true, then standalone ports which share
 * the same switch with other ports that offload a VLAN-aware bridge are also
 * inevitably VLAN-aware.
 *
 * To summarize, a DSA switch port offloads:
 *
 * - If standalone (this includes software bridge, software LAG):
 *     - if ds->needs_standalone_vlan_filtering = true, OR if
 *       (ds->vlan_filtering_is_global = true AND there are bridges spanning
 *       this switch chip which have vlan_filtering=1)
 *         - the 8021q upper VLANs
 *     - else (standalone VLAN filtering is not needed, VLAN filtering is not
 *       global, or it is, but no port is under a VLAN-aware bridge):
 *         - no VLAN (any 8021q upper is a software VLAN)
 *
 * - If under a vlan_filtering=0 bridge which it offloads:
 *     - if ds->configure_vlan_while_not_filtering = true (default):
 *         - the bridge VLANs. These VLANs are committed to hardware but inactive.
 *     - else (deprecated):
 *         - no VLAN. The bridge VLANs are not restored when VLAN awareness is
 *           enabled, so this behavior is broken and discouraged.
 *
 * - If under a vlan_filtering=1 bridge which it offloads:
 *     - the bridge VLANs
 *     - the 8021q upper VLANs
 */
int dsa_user_manage_vlan_filtering(struct net_device *user,
                                   bool vlan_filtering)
{
        int err;

        if (!vlan_filtering) {
                /* Turning filtering off: drop the VLANs first, and only
                 * clear the feature flag once that has worked.
                 */
                err = vlan_for_each(user, dsa_user_clear_vlan, user);
                if (!err)
                        user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;

                return err;
        }

        /* Turning filtering on: advertise the feature, then replay the
         * VLANs into hardware.
         */
        user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;

        err = vlan_for_each(user, dsa_user_restore_vlan, user);
        if (!err)
                return 0;

        /* Replay failed part-way: undo it and withdraw the feature */
        vlan_for_each(user, dsa_user_clear_vlan, user);
        user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;

        return err;
}

/* One user port taking part in bridge MTU normalization */
struct dsa_hw_port {
        /* Link in the temporary list built by the normalization code */
        struct list_head list;
        /* The user port's net_device */
        struct net_device *dev;
        /* MTU of @dev when the entry was created; restored on rollback */
        int old_mtu;
};

/* Set @mtu on every device in @hw_port_list, skipping those that already have
 * it. If one device refuses, the devices visited before it are put back to
 * their recorded old MTU and the error is returned. Returns 0 on success.
 */
static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
{
        const struct dsa_hw_port *hw_port;
        int err;

        list_for_each_entry(hw_port, hw_port_list, list) {
                if (hw_port->dev->mtu != mtu) {
                        err = dev_set_mtu(hw_port->dev, mtu);
                        if (err)
                                goto rollback;
                }
        }

        return 0;

rollback:
        /* Walk back from the entry just before the one that failed */
        list_for_each_entry_continue_reverse(hw_port, hw_port_list, list) {
                if (hw_port->dev->mtu != hw_port->old_mtu &&
                    dev_set_mtu(hw_port->dev, hw_port->old_mtu))
                        netdev_err(hw_port->dev, "Failed to restore MTU\n");
        }

        return err;
}

static void dsa_hw_port_list_free(struct list_head *hw_port_list)
{
        struct dsa_hw_port *p, *n;

        list_for_each_entry_safe(p, n, hw_port_list, list)
                kfree(p);
}

/* Bring all hardware ports bridged together with @dp to one common MTU.
 *
 * Nothing is done unless @dp's switch enforces MTU on ingress and @dp is in a
 * bridge. The MTU of @dp is tried first on every port of that bridge; if that
 * fails, the smallest MTU currently found among those ports is tried instead.
 * This is best effort: no error is reported to the caller.
 */
static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
{
        struct list_head hw_port_list;
        struct dsa_switch_tree *dst;
        int min_mtu = ETH_MAX_MTU;
        struct dsa_port *other_dp;

        if (!dp->ds->mtu_enforcement_ingress || !dp->bridge)
                return;

        INIT_LIST_HEAD(&hw_port_list);

        /* Collect, across all switch trees, the user ports that sit in the
         * same bridge as @dp and whose switch also enforces MTU on ingress.
         */
        list_for_each_entry(dst, &dsa_tree_list, list) {
                list_for_each_entry(other_dp, &dst->ports, list) {
                        struct dsa_hw_port *hw_port;
                        struct net_device *user;

                        if (other_dp->type != DSA_PORT_TYPE_USER ||
                            !dsa_port_bridge_same(dp, other_dp) ||
                            !other_dp->ds->mtu_enforcement_ingress)
                                continue;

                        user = other_dp->user;

                        if (user->mtu < min_mtu)
                                min_mtu = user->mtu;

                        hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
                        if (!hw_port)
                                goto out;

                        hw_port->dev = user;
                        hw_port->old_mtu = user->mtu;

                        list_add(&hw_port->list, &hw_port_list);
                }
        }

        /* First choice is the MTU of the port that triggered this, whether
         * that raises or lowers the others. Should that be refused, fall back
         * to the minimum; if the fallback is refused too, the rollback in
         * dsa_hw_port_list_set_mtu() has already restored the old values.
         */
        if (dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu))
                dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);

out:
        dsa_hw_port_list_free(&hw_port_list);
}

/* Change the MTU of DSA user port @dev to @new_mtu.
 *
 * The conduit has to carry the largest MTU among the user ports of the tree
 * plus the tagging protocol overhead. When that required conduit MTU differs
 * from the current one, the conduit and then the CPU port are updated first;
 * the user port itself is updated last. A step that fails undoes the steps
 * applied before it.
 *
 * Returns 0 on success, -EOPNOTSUPP if the switch driver has no
 * port_change_mtu operation, -ERANGE if the required conduit MTU is above the
 * allowed limit, or the error of the step that failed.
 */
int dsa_user_change_mtu(struct net_device *dev, int new_mtu)
{
        struct net_device *conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_port *cpu_dp = dp->cpu_dp;
        struct dsa_switch *ds = dp->ds;
        struct dsa_port *other_dp;
        int largest_mtu = 0;
        int new_conduit_mtu;
        int old_conduit_mtu;
        int mtu_limit;
        int overhead;
        int cpu_mtu;
        int err;

        if (!ds->ops->port_change_mtu)
                return -EOPNOTSUPP;

        /* Find the largest user port MTU in the tree, as it would be once
         * the requested change is in effect.
         */
        dsa_tree_for_each_user_port(other_dp, ds->dst) {
                int user_mtu;

                /* During probe, this function will be called for each user
                 * device, while not all of them have been allocated. That's
                 * ok, it doesn't change what the maximum is, so ignore it.
                 */
                if (!other_dp->user)
                        continue;

                /* Pretend that we already applied the setting, which we
                 * actually haven't (still haven't done all integrity checks)
                 */
                if (dp == other_dp)
                        user_mtu = new_mtu;
                else
                        user_mtu = other_dp->user->mtu;

                if (largest_mtu < user_mtu)
                        largest_mtu = user_mtu;
        }

        overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
        /* The conduit may go no higher than its own maximum, nor higher than
         * this user port's maximum plus the tag overhead.
         */
        mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead);
        old_conduit_mtu = conduit->mtu;
        new_conduit_mtu = largest_mtu + overhead;
        if (new_conduit_mtu > mtu_limit)
                return -ERANGE;

        /* If the conduit MTU isn't over limit, there's no need to check the CPU
         * MTU, since that surely isn't either.
         */
        cpu_mtu = largest_mtu;

        /* Start applying stuff */
        if (new_conduit_mtu != old_conduit_mtu) {
                err = dev_set_mtu(conduit, new_conduit_mtu);
                if (err < 0)
                        goto out_conduit_failed;

                /* We only need to propagate the MTU of the CPU port to
                 * upstream switches, so emit a notifier which updates them.
                 */
                err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
                if (err)
                        goto out_cpu_failed;
        }

        err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
        if (err)
                goto out_port_failed;

        WRITE_ONCE(dev->mtu, new_mtu);

        dsa_bridge_mtu_normalization(dp);

        return 0;

        /* Unwind in reverse order of application. The conduit and CPU port
         * are only restored if they were touched; the previous CPU port MTU
         * is the previous conduit MTU minus the tag overhead.
         */
out_port_failed:
        if (new_conduit_mtu != old_conduit_mtu)
                dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead);
out_cpu_failed:
        if (new_conduit_mtu != old_conduit_mtu)
                dev_set_mtu(conduit, old_conduit_mtu);
out_conduit_failed:
        return err;
}

/* Hand the list of trusted APP selectors to the switch driver, or return
 * -EOPNOTSUPP when the driver has no port_set_apptrust operation.
 */
static int __maybe_unused
dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->port_set_apptrust)
                return ds->ops->port_set_apptrust(ds, dp->index, sel, nsel);

        return -EOPNOTSUPP;
}

/* Query the list of trusted APP selectors from the switch driver, or return
 * -EOPNOTSUPP when the driver has no port_get_apptrust operation.
 */
static int __maybe_unused
dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (ds->ops->port_get_apptrust)
                return ds->ops->port_get_apptrust(ds, dp->index, sel, nsel);

        return -EOPNOTSUPP;
}

/* Record a default-priority APP entry and program the resulting default
 * priority into the port.
 *
 * The entry is added to the DCB APP table first; the priority programmed is
 * the highest one set in the mask returned for @app. If the driver rejects
 * it, the APP entry is removed again. Returns 0 or a negative errno.
 */
static int __maybe_unused
dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        unsigned long prio_mask, prio;
        int err;

        if (!ds->ops->port_set_default_prio)
                return -EOPNOTSUPP;

        err = dcb_ieee_setapp(dev, app);
        if (err)
                return err;

        prio_mask = dcb_ieee_getapp_mask(dev, app);
        prio = __fls(prio_mask);

        err = ds->ops->port_set_default_prio(ds, dp->index, prio);
        if (err)
                dcb_ieee_delapp(dev, app);

        return err;
}

/* Replicate a DSCP APP entry change made on @dev onto all other user ports
 * of the same switch, for switches whose DSCP priority mapping is global
 * instead of per port.
 *
 * @del selects removal (true) or addition (false) of @app. When one port
 * fails, the opposite operation is run on the ports handled before it and the
 * original error is returned; failures during that restore are only logged.
 */
static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev,
                                                  struct dcb_app *app, bool del)
{
        int (*apply)(struct net_device *dev, struct dcb_app *app);
        int (*revert)(struct net_device *dev, struct dcb_app *app);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        struct dsa_port *other_dp;
        int err;

        apply = del ? dcb_ieee_delapp : dcb_ieee_setapp;
        /* The revert operation is simply the opposite one */
        revert = del ? dcb_ieee_setapp : dcb_ieee_delapp;

        dsa_switch_for_each_user_port(other_dp, ds) {
                struct net_device *user = other_dp->user;

                if (!user || user == dev)
                        continue;

                err = apply(user, app);
                if (err)
                        goto err_try_to_restore;
        }

        return 0;

err_try_to_restore:
        dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) {
                struct net_device *user = other_dp->user;

                if (user && user != dev && revert(user, app))
                        netdev_err(user, "Failed to restore DSCP prio entry configuration\n");
        }

        return err;
}

/* Add a DSCP to priority APP entry and program it into the port.
 *
 * DSCP values of 64 and above are rejected with -EINVAL. The entry is added
 * to the DCB APP table, then the highest priority set in the mask returned
 * for @app is programmed for this DSCP value. On switches with a global DSCP
 * mapping the APP entry is also replicated to the other user ports. Every
 * failure undoes the steps already taken. Returns 0 or a negative errno.
 */
static int __maybe_unused
dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        unsigned long mask, new_prio;
        u8 dscp = app->protocol;
        int port = dp->index;
        int err;

        if (!ds->ops->port_add_dscp_prio)
                return -EOPNOTSUPP;

        if (dscp >= 64) {
                netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
                           dscp);
                return -EINVAL;
        }

        err = dcb_ieee_setapp(dev, app);
        if (err)
                return err;

        mask = dcb_ieee_getapp_mask(dev, app);
        new_prio = __fls(mask);

        err = ds->ops->port_add_dscp_prio(ds, port, dscp, new_prio);
        if (err)
                goto err_del_app;

        if (!ds->dscp_prio_mapping_is_global)
                return 0;

        err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false);
        if (err)
                goto err_del_hw;

        return 0;

err_del_hw:
        if (ds->ops->port_del_dscp_prio)
                ds->ops->port_del_dscp_prio(ds, port, dscp, new_prio);
err_del_app:
        dcb_ieee_delapp(dev, app);

        return err;
}

/* Add a DCB APP entry. Supported are DSCP entries and the EtherType entry
 * with protocol 0 (the port default priority); everything else is
 * -EOPNOTSUPP.
 */
static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev,
                                                     struct dcb_app *app)
{
        if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP)
                return dsa_user_dcbnl_add_dscp_prio(dev, app);

        if (app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
            app->protocol == 0)
                return dsa_user_dcbnl_set_default_prio(dev, app);

        return -EOPNOTSUPP;
}

/* Remove a default-priority APP entry and reprogram the port's default
 * priority from what remains.
 *
 * After the entry is deleted, the new default is the highest priority still
 * set in the mask returned for @app, or 0 when the mask is empty. If the
 * driver rejects it, the APP entry is added back. Returns 0 or a negative
 * errno.
 */
static int __maybe_unused
dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        unsigned long new_prio = 0;
        unsigned long mask;
        int err;

        if (!ds->ops->port_set_default_prio)
                return -EOPNOTSUPP;

        err = dcb_ieee_delapp(dev, app);
        if (err)
                return err;

        mask = dcb_ieee_getapp_mask(dev, app);
        if (mask)
                new_prio = __fls(mask);

        err = ds->ops->port_set_default_prio(ds, dp->index, new_prio);
        if (err)
                dcb_ieee_setapp(dev, app);

        return err;
}

/* Remove a DSCP to priority APP entry and unprogram it from the port.
 *
 * The entry is deleted from the DCB APP table, then the driver is asked to
 * drop the mapping of this DSCP value to the entry's priority. On switches
 * with a global DSCP mapping the deletion is also replicated to the other
 * user ports. Every failure undoes the steps already taken. Returns 0 or a
 * negative errno.
 */
static int __maybe_unused
dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        u8 dscp = app->protocol;
        int port = dp->index;
        int err;

        if (!ds->ops->port_del_dscp_prio)
                return -EOPNOTSUPP;

        err = dcb_ieee_delapp(dev, app);
        if (err)
                return err;

        err = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
        if (err)
                goto err_restore_app;

        if (!ds->dscp_prio_mapping_is_global)
                return 0;

        err = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true);
        if (err)
                goto err_restore_hw;

        return 0;

err_restore_hw:
        if (ds->ops->port_add_dscp_prio)
                ds->ops->port_add_dscp_prio(ds, port, dscp,
                                            app->priority);
err_restore_app:
        dcb_ieee_setapp(dev, app);

        return err;
}

/* Delete a DCB APP entry. Supported are DSCP entries and the EtherType entry
 * with protocol 0 (the port default priority); everything else is
 * -EOPNOTSUPP.
 */
static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev,
                                                     struct dcb_app *app)
{
        if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP)
                return dsa_user_dcbnl_del_dscp_prio(dev, app);

        if (app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
            app->protocol == 0)
                return dsa_user_dcbnl_del_default_prio(dev, app);

        return -EOPNOTSUPP;
}

/* Seed the DCB application priority table of @dev with the priorities the
 * switch driver reports for this port.
 *
 * The port default priority is recorded when the driver can report it. Then
 * each of the 64 DSCP values is queried, when the driver supports that; DSCP
 * values for which the driver answers -EOPNOTSUPP are skipped.
 *
 * Returns 0 on success, or the first negative value reported by the driver
 * or by dcb_ieee_setapp().
 */
static int dsa_user_dcbnl_init(struct net_device *dev)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        int port = dp->index;
        int dscp, prio, err;

        if (ds->ops->port_get_default_prio) {
                struct dcb_app app = {
                        .selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
                        .protocol = 0,
                };

                prio = ds->ops->port_get_default_prio(ds, port);
                if (prio < 0)
                        return prio;

                app.priority = prio;

                err = dcb_ieee_setapp(dev, &app);
                if (err)
                        return err;
        }

        if (!ds->ops->port_get_dscp_prio)
                return 0;

        for (dscp = 0; dscp < 64; dscp++) {
                struct dcb_app app = {
                        .selector = IEEE_8021QAZ_APP_SEL_DSCP,
                        .protocol = dscp,
                };

                prio = ds->ops->port_get_dscp_prio(ds, port, dscp);
                if (prio == -EOPNOTSUPP)
                        continue;
                if (prio < 0)
                        return prio;

                app.priority = prio;

                err = dcb_ieee_setapp(dev, &app);
                if (err)
                        return err;
        }

        return 0;
}

static const struct ethtool_ops dsa_user_ethtool_ops = {
        .get_drvinfo                = dsa_user_get_drvinfo,
        .get_regs_len                = dsa_user_get_regs_len,
        .get_regs                = dsa_user_get_regs,
        .nway_reset                = dsa_user_nway_reset,
        .get_link                = ethtool_op_get_link,
        .get_eeprom_len                = dsa_user_get_eeprom_len,
        .get_eeprom                = dsa_user_get_eeprom,
        .set_eeprom                = dsa_user_set_eeprom,
        .get_strings                = dsa_user_get_strings,
        .get_ethtool_stats        = dsa_user_get_ethtool_stats,
        .get_sset_count                = dsa_user_get_sset_count,
        .get_eth_phy_stats        = dsa_user_get_eth_phy_stats,
        .get_eth_mac_stats        = dsa_user_get_eth_mac_stats,
        .get_eth_ctrl_stats        = dsa_user_get_eth_ctrl_stats,
        .get_rmon_stats                = dsa_user_get_rmon_stats,
        .get_ts_stats                = dsa_user_get_ts_stats,
        .set_wol                = dsa_user_set_wol,
        .get_wol                = dsa_user_get_wol,
        .set_eee                = dsa_user_set_eee,
        .get_eee                = dsa_user_get_eee,
        .get_link_ksettings        = dsa_user_get_link_ksettings,
        .set_link_ksettings        = dsa_user_set_link_ksettings,
        .get_pause_stats        = dsa_user_get_pause_stats,
        .get_pauseparam                = dsa_user_get_pauseparam,
        .set_pauseparam                = dsa_user_set_pauseparam,
        .get_rxnfc                = dsa_user_get_rxnfc,
        .set_rxnfc                = dsa_user_set_rxnfc,
        .get_ts_info                = dsa_user_get_ts_info,
        .self_test                = dsa_user_net_selftest,
        .get_mm                        = dsa_user_get_mm,
        .set_mm                        = dsa_user_set_mm,
        .get_mm_stats                = dsa_user_get_mm_stats,
};

/* dcbnl operations for DSA user net devices: APP table add/delete and
 * apptrust get/set.
 */
static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = {
        .ieee_setapp = dsa_user_dcbnl_ieee_setapp,
        .ieee_delapp = dsa_user_dcbnl_ieee_delapp,
        .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust,
        .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust,
};

/* ndo_get_stats64: use the driver's get_stats64() when it provides one,
 * otherwise fall back to dev_get_tstats64().
 */
static void dsa_user_get_stats64(struct net_device *dev,
                                 struct rtnl_link_stats64 *s)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;

        if (!ds->ops->get_stats64) {
                dev_get_tstats64(dev, s);
                return;
        }

        ds->ops->get_stats64(ds, dp->index, s);
}

/* ndo_fill_forward_path: describe this hop as a DSA path element (tagging
 * protocol of the CPU port plus this port's index) and redirect the context
 * to the conduit device. Always returns 0.
 */
static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx,
                                      struct net_device_path *path)
{
        struct dsa_port *dp = dsa_user_to_port(ctx->dev);
        struct net_device *conduit = dsa_port_to_conduit(dp);

        path->type = DEV_PATH_DSA;
        path->dev = ctx->dev;
        path->dsa.port = dp->index;
        path->dsa.proto = dp->cpu_dp->tag_ops->proto;

        /* The next hop is the conduit */
        ctx->dev = conduit;

        return 0;
}

static const struct net_device_ops dsa_user_netdev_ops = {
        .ndo_open                = dsa_user_open,
        .ndo_stop                = dsa_user_close,
        .ndo_start_xmit                = dsa_user_xmit,
        .ndo_change_rx_flags        = dsa_user_change_rx_flags,
        .ndo_set_rx_mode        = dsa_user_set_rx_mode,
        .ndo_set_mac_address        = dsa_user_set_mac_address,
        .ndo_fdb_dump                = dsa_user_fdb_dump,
        .ndo_eth_ioctl                = dsa_user_ioctl,
        .ndo_get_iflink                = dsa_user_get_iflink,
#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_netpoll_setup        = dsa_user_netpoll_setup,
        .ndo_netpoll_cleanup        = dsa_user_netpoll_cleanup,
        .ndo_poll_controller        = dsa_user_poll_controller,
#endif
        .ndo_setup_tc                = dsa_user_setup_tc,
        .ndo_get_stats64        = dsa_user_get_stats64,
        .ndo_vlan_rx_add_vid        = dsa_user_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid        = dsa_user_vlan_rx_kill_vid,
        .ndo_change_mtu                = dsa_user_change_mtu,
        .ndo_fill_forward_path        = dsa_user_fill_forward_path,
};

/* Device type attached to user net devices through SET_NETDEV_DEVTYPE() */
static const struct device_type dsa_type = { .name = "dsa" };

/* Forward a MAC link change of @port on @ds to phylink. Ports without a
 * phylink instance are silently ignored.
 */
void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
{
        const struct dsa_port *dp = dsa_to_port(ds, port);

        if (!dp->pl)
                return;

        phylink_mac_change(dp->pl, up);
}
EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);

/* phylink get_fixed_state callback: hand the query to the switch driver.
 *
 * dsa_user_phy_setup() only installs this callback when the driver
 * implements phylink_fixed_state, so the operation is not re-checked here.
 */
static void dsa_user_phylink_fixed_state(struct phylink_config *config,
                                         struct phylink_link_state *state)
{
        struct dsa_port *dp = dsa_phylink_to_port(config);

        dp->ds->ops->phylink_fixed_state(dp->ds, dp->index, state);
}

/* user device setup *******************************************************/
/* Look up the PHY at @addr on the switch's user MII bus, record it in
 * @user_dev, OR @flags into its dev_flags and connect it to the port's
 * phylink instance.
 *
 * Returns -ENODEV when no PHY exists at @addr, otherwise the result of
 * phylink_connect_phy().
 */
static int dsa_user_phy_connect(struct net_device *user_dev, int addr,
                                u32 flags)
{
        struct dsa_port *dp = dsa_user_to_port(user_dev);
        struct phy_device *phydev;

        phydev = mdiobus_get_phy(dp->ds->user_mii_bus, addr);
        user_dev->phydev = phydev;
        if (!phydev) {
                netdev_err(user_dev, "no phy at %d\n", addr);
                return -ENODEV;
        }

        phydev->dev_flags |= flags;

        return phylink_connect_phy(dp->pl, phydev);
}

/* Create the phylink instance of a user port and connect its PHY.
 *
 * The PHY described by the port's device tree node is tried first; if that
 * yields -ENODEV and the switch has a user MII bus, the PHY at the address
 * equal to the port index on that bus is used instead. On failure the
 * phylink instance is destroyed again and the error is returned.
 */
static int dsa_user_phy_setup(struct net_device *user_dev)
{
        struct dsa_port *dp = dsa_user_to_port(user_dev);
        struct device_node *np = dp->dn;
        struct dsa_switch *ds = dp->ds;
        u32 flags = 0;
        int err;

        dp->pl_config.type = PHYLINK_NETDEV;
        dp->pl_config.dev = &user_dev->dev;

        /* The get_fixed_state callback takes precedence over polling the
         * link GPIO in PHYLINK (see phylink_get_fixed_state), so install it
         * only when the switch actually provides such a callback.
         */
        if (ds->ops->phylink_fixed_state) {
                dp->pl_config.poll_fixed_state = true;
                dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state;
        }

        err = dsa_port_phylink_create(dp);
        if (err)
                return err;

        if (ds->ops->get_phy_flags)
                flags = ds->ops->get_phy_flags(ds, dp->index);

        err = phylink_of_phy_connect(dp->pl, np, flags);
        /* No designated PHY or SFP: fall back to the switch internal MDIO
         * bus, if there is one.
         */
        if (err == -ENODEV && ds->user_mii_bus)
                err = dsa_user_phy_connect(user_dev, dp->index, flags);

        if (!err)
                return 0;

        netdev_err(user_dev, "failed to connect to PHY: %pe\n",
                   ERR_PTR(err));
        dsa_port_phylink_destroy(dp);

        return err;
}

void dsa_user_setup_tagger(struct net_device *user)
{
        struct dsa_port *dp = dsa_user_to_port(user);
        struct net_device *conduit = dsa_port_to_conduit(dp);
        struct dsa_user_priv *p = netdev_priv(user);
        const struct dsa_port *cpu_dp = dp->cpu_dp;
        const struct dsa_switch *ds = dp->ds;

        user->needed_headroom = cpu_dp->tag_ops->needed_headroom;
        user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
        /* Try to save one extra realloc later in the TX path (in the conduit)
         * by also inheriting the conduit's needed headroom and tailroom.
         * The 8021q driver also does this.
         */
        user->needed_headroom += conduit->needed_headroom;
        user->needed_tailroom += conduit->needed_tailroom;

        p->xmit = cpu_dp->tag_ops->xmit;

        user->features = conduit->vlan_features | NETIF_F_HW_TC;
        user->hw_features |= NETIF_F_HW_TC;
        if (user->needed_tailroom)
                user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
        if (ds->needs_standalone_vlan_filtering)
                user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;

        user->lltx = true;
}

/* Suspend a user port: if the interface is running, detach it and stop
 * phylink under RTNL. Always returns 0.
 */
int dsa_user_suspend(struct net_device *user_dev)
{
        struct dsa_port *dp;

        if (netif_running(user_dev)) {
                dp = dsa_user_to_port(user_dev);

                netif_device_detach(user_dev);

                rtnl_lock();
                phylink_stop(dp->pl);
                rtnl_unlock();
        }

        return 0;
}

/* Resume a user port: if the interface is running, re-attach it and start
 * phylink under RTNL. Always returns 0.
 */
int dsa_user_resume(struct net_device *user_dev)
{
        struct dsa_port *dp;

        if (netif_running(user_dev)) {
                dp = dsa_user_to_port(user_dev);

                netif_device_attach(user_dev);

                rtnl_lock();
                phylink_start(dp->pl);
                rtnl_unlock();
        }

        return 0;
}

/* Allocate, configure and register the net device of user port @port.
 *
 * The device is named after port->name when set, otherwise enumerated as
 * "eth%d". After the PHY is set up, the device is registered, the DCB table
 * is seeded (when CONFIG_DCB is enabled) and the device is linked as an
 * upper of the conduit, all under RTNL.
 *
 * Returns 0 on success. On failure everything done so far is undone,
 * port->user is reset to NULL and a negative error code is returned.
 */
int dsa_user_create(struct dsa_port *port)
{
        struct net_device *conduit = dsa_port_to_conduit(port);
        struct dsa_switch *ds = port->ds;
        int assign_type = NET_NAME_ENUM;
        const char *name = "eth%d";
        struct net_device *user_dev;
        struct dsa_user_priv *p;
        int ret;

        if (!ds->num_tx_queues)
                ds->num_tx_queues = 1;

        /* An explicit port name overrides the enumerated default */
        if (port->name) {
                name = port->name;
                assign_type = NET_NAME_PREDICTABLE;
        }

        user_dev = alloc_netdev_mqs(sizeof(*p), name, assign_type,
                                    ether_setup, ds->num_tx_queues, 1);
        if (!user_dev)
                return -ENOMEM;

        p = netdev_priv(user_dev);

        user_dev->rtnl_link_ops = &dsa_link_ops;
        user_dev->ethtool_ops = &dsa_user_ethtool_ops;
#if IS_ENABLED(CONFIG_DCB)
        user_dev->dcbnl_ops = &dsa_user_dcbnl_ops;
#endif
        /* Fall back to the conduit's address when the port has none */
        if (is_zero_ether_addr(port->mac))
                eth_hw_addr_inherit(user_dev, conduit);
        else
                eth_hw_addr_set(user_dev, port->mac);

        user_dev->priv_flags |= IFF_NO_QUEUE;
        if (dsa_switch_supports_uc_filtering(ds))
                user_dev->priv_flags |= IFF_UNICAST_FLT;
        user_dev->netdev_ops = &dsa_user_netdev_ops;
        if (ds->ops->port_max_mtu)
                user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);

        SET_NETDEV_DEVTYPE(user_dev, &dsa_type);
        SET_NETDEV_DEV(user_dev, ds->dev);
        SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port);
        user_dev->dev.of_node = port->dn;
        user_dev->vlan_features = conduit->vlan_features;
        user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

        ret = gro_cells_init(&p->gcells, user_dev);
        if (ret)
                goto out_free;

        p->dp = port;
        INIT_LIST_HEAD(&p->mall_tc_list);
        port->user = user_dev;
        dsa_user_setup_tagger(user_dev);

        netif_carrier_off(user_dev);

        ret = dsa_user_phy_setup(user_dev);
        if (ret) {
                netdev_err(user_dev,
                           "error %d setting up PHY for tree %d, switch %d, port %d\n",
                           ret, ds->dst->index, ds->index, port->index);
                goto out_gcells;
        }

        rtnl_lock();

        /* A failure to set the default MTU is reported but not fatal */
        ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN);
        if (ret && ret != -EOPNOTSUPP)
                dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
                         ret, ETH_DATA_LEN, port->index);

        ret = register_netdevice(user_dev);
        if (ret) {
                netdev_err(conduit, "error %d registering interface %s\n",
                           ret, user_dev->name);
                rtnl_unlock();
                goto out_phy;
        }

        if (IS_ENABLED(CONFIG_DCB)) {
                ret = dsa_user_dcbnl_init(user_dev);
                if (ret)
                        netdev_err(user_dev,
                                   "failed to initialize DCB: %pe\n",
                                   ERR_PTR(ret));
        }

        /* Only link to the conduit if the DCB step did not fail */
        if (!ret)
                ret = netdev_upper_dev_link(conduit, user_dev, NULL);

        rtnl_unlock();

        if (ret)
                goto out_unregister;

        return 0;

out_unregister:
        unregister_netdev(user_dev);
out_phy:
        rtnl_lock();
        phylink_disconnect_phy(port->pl);
        rtnl_unlock();
        dsa_port_phylink_destroy(port);
out_gcells:
        gro_cells_destroy(&p->gcells);
out_free:
        free_netdev(user_dev);
        port->user = NULL;
        return ret;
}

/* Tear down a user net device: unlink it from its conduit, unregister it
 * and disconnect its PHY under RTNL, then destroy phylink and the GRO cells
 * and free the net device.
 */
void dsa_user_destroy(struct net_device *user_dev)
{
        struct dsa_user_priv *priv = netdev_priv(user_dev);
        struct dsa_port *dp = dsa_user_to_port(user_dev);
        struct net_device *conduit;

        conduit = dsa_user_to_conduit(user_dev);

        netif_carrier_off(user_dev);

        rtnl_lock();
        netdev_upper_dev_unlink(conduit, user_dev);
        unregister_netdevice(user_dev);
        phylink_disconnect_phy(dp->pl);
        rtnl_unlock();

        dsa_port_phylink_destroy(dp);
        gro_cells_destroy(&priv->gcells);
        free_netdev(user_dev);
}

/* Move user port @dev from its current conduit to @conduit.
 *
 * Returns 0 if @conduit already is the current conduit or the change
 * succeeded. Returns -EOPNOTSUPP (with an extack message) when the driver
 * cannot change conduits, when @conduit is not a DSA conduit, or when
 * @conduit has an upper that is neither a DSA user port nor a bridge.
 * Other failures return the error of the failing step, after the upper
 * link to the old conduit has been re-created.
 */
int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
                            struct netlink_ext_ack *extack)
{
        struct net_device *old_conduit = dsa_user_to_conduit(dev);
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch *ds = dp->ds;
        struct net_device *upper;
        struct list_head *iter;
        int err;

        if (conduit == old_conduit)
                return 0;

        if (!ds->ops->port_change_conduit) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Driver does not support changing DSA conduit");
                return -EOPNOTSUPP;
        }

        if (!netdev_uses_dsa(conduit)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Interface not eligible as DSA conduit");
                return -EOPNOTSUPP;
        }

        /* Only DSA user ports and bridges are tolerated as uppers */
        netdev_for_each_upper_dev_rcu(conduit, upper, iter) {
                if (dsa_user_dev_check(upper) ||
                    netif_is_bridge_master(upper))
                        continue;

                NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers");
                return -EOPNOTSUPP;
        }

        /* Since we allow live-changing the DSA conduit, plus we auto-open the
         * DSA conduit when the user port opens => we need to ensure that the
         * new DSA conduit is open too.
         */
        if (dev->flags & IFF_UP) {
                err = dev_open(conduit, extack);
                if (err)
                        return err;
        }

        netdev_upper_dev_unlink(old_conduit, dev);

        err = netdev_upper_dev_link(conduit, dev, extack);
        if (err)
                goto err_relink_old;

        err = dsa_port_change_conduit(dp, conduit, extack);
        if (err) {
                netdev_upper_dev_unlink(conduit, dev);
                goto err_relink_old;
        }

        /* Update the MTU of the new CPU port through cross-chip notifiers;
         * a failure here is only warned about.
         */
        err = dsa_user_change_mtu(dev, dev->mtu);
        if (err && err != -EOPNOTSUPP)
                netdev_warn(dev,
                            "nonfatal error updating MTU with new conduit: %pe\n",
                            ERR_PTR(err));

        return 0;

err_relink_old:
        netdev_upper_dev_link(old_conduit, dev, NULL);
        return err;
}

/* Tell whether @dev is a DSA user port, i.e. whether its netdev_ops is the
 * DSA user ops table.
 */
bool dsa_user_dev_check(const struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        return ops == &dsa_user_netdev_ops;
}
EXPORT_SYMBOL_GPL(dsa_user_dev_check);

/* Convert the result of a bridge/LAG/HSR join into a notifier value.
 * -EOPNOTSUPP is not treated as a failure: a weak extack message is set and
 * the result is reported as success.
 */
static int dsa_user_join_result_to_notifier(int err,
                                            struct netlink_ext_ack *extack)
{
        if (err == -EOPNOTSUPP) {
                NL_SET_ERR_MSG_WEAK_MOD(extack,
                                        "Offloading not supported");
                err = 0;
        }

        return notifier_from_errno(err);
}

/* Handle a changeupper event for a DSA user port: join or leave the bridge,
 * LAG or HSR upper named by @info.
 *
 * Returns NOTIFY_DONE when @dev is not a DSA user port or the upper is of no
 * interest, NOTIFY_OK after a leave, and the converted join result when
 * linking.
 */
static int dsa_user_changeupper(struct net_device *dev,
                                struct netdev_notifier_changeupper_info *info)
{
        struct netlink_ext_ack *extack;
        struct net_device *upper;
        struct dsa_port *dp;
        int err;

        if (!dsa_user_dev_check(dev))
                return NOTIFY_DONE;

        dp = dsa_user_to_port(dev);
        extack = netdev_notifier_info_to_extack(&info->info);
        upper = info->upper_dev;

        if (netif_is_bridge_master(upper)) {
                if (!info->linking) {
                        dsa_port_bridge_leave(dp, upper);
                        return NOTIFY_OK;
                }

                err = dsa_port_bridge_join(dp, upper, extack);
                if (!err)
                        dsa_bridge_mtu_normalization(dp);

                return dsa_user_join_result_to_notifier(err, extack);
        }

        if (netif_is_lag_master(upper)) {
                if (!info->linking) {
                        dsa_port_lag_leave(dp, upper);
                        return NOTIFY_OK;
                }

                err = dsa_port_lag_join(dp, upper, info->upper_info, extack);

                return dsa_user_join_result_to_notifier(err, extack);
        }

        if (is_hsr_master(upper)) {
                if (!info->linking) {
                        dsa_port_hsr_leave(dp, upper);
                        return NOTIFY_OK;
                }

                err = dsa_port_hsr_join(dp, upper, extack);

                return dsa_user_join_result_to_notifier(err, extack);
        }

        return NOTIFY_DONE;
}

/* Handle a prechangeupper event for a DSA user port: run the pre-leave hook
 * when the port is about to be unlinked from a bridge or a LAG. Always
 * returns NOTIFY_DONE.
 */
static int dsa_user_prechangeupper(struct net_device *dev,
                                   struct netdev_notifier_changeupper_info *info)
{
        struct net_device *upper = info->upper_dev;
        struct dsa_port *dp;

        /* Only unlinking from an upper is of interest here */
        if (!dsa_user_dev_check(dev) || info->linking)
                return NOTIFY_DONE;

        dp = dsa_user_to_port(dev);

        if (netif_is_bridge_master(upper))
                dsa_port_pre_bridge_leave(dp, upper);
        else if (netif_is_lag_master(upper))
                dsa_port_pre_lag_leave(dp, upper);
        /* dsa_port_pre_hsr_leave is not yet necessary since hsr devices
         * cannot be meaningfully placed under a bridge yet
         */

        return NOTIFY_DONE;
}

/* Replay a changeupper event received on a LAG device onto each of its DSA
 * user port lowers that has dp->lag set. Stops at the first lower whose
 * handler reports an error and returns the last handler result, or
 * NOTIFY_DONE when no lower was handled.
 */
static int
dsa_user_lag_changeupper(struct net_device *dev,
                         struct netdev_notifier_changeupper_info *info)
{
        struct net_device *lower;
        struct list_head *iter;
        int ret = NOTIFY_DONE;

        if (!netif_is_lag_master(dev))
                return NOTIFY_DONE;

        netdev_for_each_lower_dev(dev, lower, iter) {
                /* Skip foreign lowers and software LAGs (no dp->lag) */
                if (!dsa_user_dev_check(lower) ||
                    !dsa_user_to_port(lower)->lag)
                        continue;

                ret = dsa_user_changeupper(lower, info);
                if (notifier_to_errno(ret))
                        return ret;
        }

        return ret;
}

/* Counterpart of dsa_user_lag_changeupper() for the prechangeupper event:
 * each DSA user port lower of the LAG that has dp->lag set is passed to
 * dsa_user_prechangeupper(), stopping at the first reported error.
 */
static int
dsa_user_lag_prechangeupper(struct net_device *dev,
                            struct netdev_notifier_changeupper_info *info)
{
        struct net_device *lower;
        struct list_head *iter;
        int ret = NOTIFY_DONE;

        if (!netif_is_lag_master(dev))
                return NOTIFY_DONE;

        netdev_for_each_lower_dev(dev, lower, iter) {
                /* Skip foreign lowers and software LAGs (no dp->lag) */
                if (!dsa_user_dev_check(lower) ||
                    !dsa_user_to_port(lower)->lag)
                        continue;

                ret = dsa_user_prechangeupper(lower, info);
                if (notifier_to_errno(ret))
                        return ret;
        }

        return ret;
}

/* Sanity check for a VLAN device stacked on a DSA user port: when that port
 * is under a VLAN-aware bridge, refuse to let the VLAN device join a bridge.
 *
 * Returns an -EINVAL notifier value in that case and NOTIFY_DONE otherwise.
 */
static int
dsa_prevent_bridging_8021q_upper(struct net_device *dev,
                                 struct netdev_notifier_changeupper_info *info)
{
        struct netlink_ext_ack *ext_ack;
        struct net_device *real_dev, *br;

        ext_ack = netdev_notifier_info_to_extack(&info->info);

        if (!is_vlan_dev(dev))
                return NOTIFY_DONE;

        real_dev = vlan_dev_real_dev(dev);
        if (!dsa_user_dev_check(real_dev))
                return NOTIFY_DONE;

        br = dsa_port_bridge_dev_get(dsa_user_to_port(real_dev));
        if (!br)
                return NOTIFY_DONE;

        /* Nothing to deny unless a VLAN-aware bridge is involved and the
         * VLAN device is being linked under a bridge.
         */
        if (!br_vlan_enabled(br) ||
            !netif_is_bridge_master(info->upper_dev) || !info->linking)
                return NOTIFY_DONE;

        NL_SET_ERR_MSG_MOD(ext_ack,
                           "Cannot make VLAN device join VLAN-aware bridge");
        return notifier_from_errno(-EINVAL);
}

/* Refuse a VLAN upper on a user port whose VID is already configured in the
 * VLAN-aware bridge the port belongs to.
 *
 * Returns an -EBUSY notifier value on such a conflict, NOTIFY_DONE otherwise
 * (including when the port is not under a VLAN-aware bridge).
 */
static int
dsa_user_check_8021q_upper(struct net_device *dev,
                           struct netdev_notifier_changeupper_info *info)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct net_device *br = dsa_port_bridge_dev_get(dp);
        struct bridge_vlan_info br_info;
        struct netlink_ext_ack *extack;
        u16 vid;

        if (!br || !br_vlan_enabled(br))
                return NOTIFY_DONE;

        extack = netdev_notifier_info_to_extack(&info->info);
        vid = vlan_dev_vlan_id(info->upper_dev);

        /* br_vlan_get_info() returns -EINVAL or -ENOENT if the device,
         * respectively the VID, is not found. A non-zero result therefore
         * means there is no conflict, whereas 0 (found) is the failure
         * case for us.
         */
        if (br_vlan_get_info(br, vid, &br_info))
                return NOTIFY_DONE;

        NL_SET_ERR_MSG_MOD(extack,
                           "This VLAN is already configured by the bridge");
        return notifier_from_errno(-EBUSY);
}

/* Validate a prechangeupper event before it is acted upon.
 *
 * Devices that are not DSA user ports are handed to
 * dsa_prevent_bridging_8021q_upper(). For user ports, the driver's optional
 * port_prechangeupper() may veto the change, and VLAN uppers are checked
 * against the bridge VLAN configuration.
 */
static int
dsa_user_prechangeupper_sanity_check(struct net_device *dev,
                                     struct netdev_notifier_changeupper_info *info)
{
        struct dsa_switch *ds;
        struct dsa_port *dp;

        if (!dsa_user_dev_check(dev))
                return dsa_prevent_bridging_8021q_upper(dev, info);

        dp = dsa_user_to_port(dev);
        ds = dp->ds;

        if (ds->ops->port_prechangeupper) {
                int err = ds->ops->port_prechangeupper(ds, dp->index, info);

                if (err)
                        return notifier_from_errno(err);
        }

        if (!is_vlan_dev(info->upper_dev))
                return NOTIFY_DONE;

        return dsa_user_check_8021q_upper(dev, info);
}

/* To be eligible as a DSA conduit, a LAG must have all lower interfaces be
 * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of
 * switches in the same switch tree.
 */
static int dsa_lag_conduit_validate(struct net_device *lag_dev,
                                    struct netlink_ext_ack *extack)
{
        struct net_device *lower1, *lower2;
        struct list_head *iter1, *iter2;

        netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
                netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
                        if (!netdev_uses_dsa(lower1) ||
                            !netdev_uses_dsa(lower2)) {
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "All LAG ports must be eligible as DSA conduits");
                                return notifier_from_errno(-EINVAL);
                        }

                        if (lower1 == lower2)
                                continue;

                        if (!dsa_port_tree_same(lower1->dsa_ptr,
                                                lower2->dsa_ptr)) {
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "LAG contains DSA conduits of disjoint switch trees");
                                return notifier_from_errno(-EINVAL);
                        }
                }
        }

        return NOTIFY_DONE;
}

/* Restrict which uppers a DSA conduit may be linked under.
 *
 * DSA user ports and bridges are accepted here, LAG uppers are accepted if
 * dsa_lag_conduit_validate() agrees, and every other upper is rejected with
 * an -EBUSY notifier value. Events for devices that are not DSA conduits,
 * and unlink events, return NOTIFY_DONE.
 */
static int
dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit,
                                        struct netdev_notifier_changeupper_info *info)
{
        struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
        struct net_device *upper = info->upper_dev;

        if (!netdev_uses_dsa(conduit) || !info->linking)
                return NOTIFY_DONE;

        /* Allow DSA switch uppers, and bridge uppers of DSA conduits; the
         * latter are subject to further restrictions in
         * dsa_bridge_prechangelower_sanity_check()
         */
        if (dsa_user_dev_check(upper) || netif_is_bridge_master(upper))
                return NOTIFY_DONE;

        /* Allow LAG uppers, subject to further restrictions in
         * dsa_lag_conduit_prechangelower_sanity_check()
         */
        if (netif_is_lag_master(upper))
                return dsa_lag_conduit_validate(upper, extack);

        NL_SET_ERR_MSG_MOD(extack,
                           "DSA conduit cannot join unknown upper interfaces");
        return notifier_from_errno(-EBUSY);
}

/* Check a device that is about to join a LAG which is itself a DSA conduit:
 * the newcomer must be a DSA conduit of the same switch tree as the LAG's
 * first existing lower.
 *
 * Returns an -EINVAL notifier value on violation, NOTIFY_DONE otherwise
 * (including for unlink events and uppers that are not LAG DSA conduits).
 */
static int
dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev,
                                            struct netdev_notifier_changeupper_info *info)
{
        struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
        struct net_device *lag_dev = info->upper_dev;
        struct net_device *lower;
        struct list_head *iter;

        if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev) ||
            !info->linking)
                return NOTIFY_DONE;

        if (!netdev_uses_dsa(dev)) {
                NL_SET_ERR_MSG(extack,
                               "Only DSA conduits can join a LAG DSA conduit");
                return notifier_from_errno(-EINVAL);
        }

        netdev_for_each_lower_dev(lag_dev, lower, iter) {
                /* Only the first lower is compared; the walk ends here
                 * either way.
                 */
                if (dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr))
                        break;

                NL_SET_ERR_MSG(extack,
                               "Interface is DSA conduit for a different switch tree than this LAG");
                return notifier_from_errno(-EINVAL);
        }

        return NOTIFY_DONE;
}

/* Don't allow bridging of DSA conduits, since the bridge layer rx_handler
 * prevents the DSA fake ethertype handler from being invoked, so we don't
 * get the chance to strip off and parse the DSA switch tag protocol header
 * (the bridge layer just returns RX_HANDLER_CONSUMED, stopping RX processing
 * for these frames).
 * The only case where that would not be an issue is when bridging can already
 * be offloaded, such as when the DSA conduit is itself a DSA or plain
 * switchdev port, and is bridged only with other ports from the same hardware
 * device.
 *
 * Returns an -EINVAL notifier value when @new_lower or an existing bridge
 * port is a DSA conduit and the two do not share a parent ID; NOTIFY_DONE
 * otherwise.
 */
static int
dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
                                       struct netdev_notifier_changeupper_info *info)
{
        struct net_device *br = info->upper_dev;
        struct netlink_ext_ack *extack;
        struct net_device *lower;
        struct list_head *iter;

        if (!netif_is_bridge_master(br) || !info->linking)
                return NOTIFY_DONE;

        extack = netdev_notifier_info_to_extack(&info->info);

        netdev_for_each_lower_dev(br, lower, iter) {
                /* A pair matters only if at least one side is a DSA
                 * conduit; such a pair must share a parent ID.
                 */
                if ((netdev_uses_dsa(new_lower) || netdev_uses_dsa(lower)) &&
                    !netdev_port_same_parent_id(lower, new_lower)) {
                        NL_SET_ERR_MSG(extack,
                                       "Cannot do software bridging with a DSA conduit");
                        return notifier_from_errno(-EINVAL);
                }
        }

        return NOTIFY_DONE;
}

/* Move every user port of @dst that currently uses @lag_dev as its conduit
 * over to the conduit returned by dsa_tree_find_first_conduit(). Failures
 * are logged per port and do not stop the walk.
 */
static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst,
                                                    struct net_device *lag_dev)
{
        struct net_device *new_conduit = dsa_tree_find_first_conduit(dst);
        struct dsa_port *dp;

        dsa_tree_for_each_user_port(dp, dst) {
                int err;

                if (dsa_port_to_conduit(dp) != lag_dev)
                        continue;

                err = dsa_user_change_conduit(dp->user, new_conduit, NULL);
                if (!err)
                        continue;

                netdev_err(dp->user,
                           "failed to restore conduit to %s: %pe\n",
                           new_conduit->name, ERR_PTR(err));
        }
}

/* Make @lag_dev the conduit of every user port currently using @conduit.
 *
 * The CPU port behind @conduit is first set up for the LAG through
 * dsa_conduit_lag_setup(); then each user port whose conduit is @conduit is
 * switched to @lag_dev with dsa_user_change_conduit().
 *
 * If one of those switches fails, the ports that were already moved are
 * moved back to @conduit (failures there are only logged), the LAG setup is
 * torn down and the error that triggered the rollback is returned.
 *
 * Returns 0 on success or a negative errno.
 */
static int dsa_conduit_lag_join(struct net_device *conduit,
                                struct net_device *lag_dev,
                                struct netdev_lag_upper_info *uinfo,
                                struct netlink_ext_ack *extack)
{
        struct dsa_port *cpu_dp = conduit->dsa_ptr;
        struct dsa_switch_tree *dst = cpu_dp->dst;
        struct dsa_port *dp;
        int restore_err;
        int err;

        err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack);
        if (err)
                return err;

        dsa_tree_for_each_user_port(dp, dst) {
                if (dsa_port_to_conduit(dp) != conduit)
                        continue;

                err = dsa_user_change_conduit(dp->user, lag_dev, extack);
                if (err)
                        goto restore;
        }

        return 0;

restore:
        /* Keep the rollback status in its own variable: reusing @err here
         * would overwrite the failure being reported, and a successful
         * rollback would then make this function return 0 even though the
         * join did not take place.
         */
        dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
                if (dsa_port_to_conduit(dp) != lag_dev)
                        continue;

                restore_err = dsa_user_change_conduit(dp->user, conduit, NULL);
                if (restore_err) {
                        netdev_err(dp->user,
                                   "failed to restore conduit to %s: %pe\n",
                                   conduit->name, ERR_PTR(restore_err));
                }
        }

        dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);

        return err;
}

/* Handle @conduit leaving @lag_dev.
 *
 * If @lag_dev still has a lower device that uses DSA, the user ports under
 * the LAG and the LAG's own dsa_ptr are re-pointed at that device's CPU
 * port. Otherwise the user ports are migrated away from the LAG. In both
 * cases the LAG setup for @conduit's CPU port is torn down afterwards.
 */
static void dsa_conduit_lag_leave(struct net_device *conduit,
                                  struct net_device *lag_dev)
{
        struct dsa_port *lag_cpu_dp = lag_dev->dsa_ptr;
        struct dsa_switch_tree *dst = lag_cpu_dp->dst;
        struct dsa_port *next_cpu_dp = NULL;
        struct net_device *member;
        struct list_head *iter;
        struct dsa_port *dp;

        /* Pick the first remaining LAG member that uses DSA, if any */
        netdev_for_each_lower_dev(lag_dev, member, iter) {
                if (!netdev_uses_dsa(member))
                        continue;

                next_cpu_dp = member->dsa_ptr;
                break;
        }

        if (!next_cpu_dp) {
                /* No DSA conduit is left under the LAG: send all user ports
                 * back to the first physical CPU port
                 */
                dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev);
        } else {
                /* Re-point the user ports still under the LAG so that
                 * dsa_port_to_conduit() keeps working for them
                 */
                dsa_tree_for_each_user_port(dp, dst) {
                        if (dsa_port_to_conduit(dp) == lag_dev)
                                dp->cpu_dp = next_cpu_dp;
                }

                /* The virtual CPU port of the LAG now follows the chosen
                 * physical CPU port
                 */
                lag_dev->dsa_ptr = next_cpu_dp;
                wmb();
        }

        /* Whatever happened above, this conduit is no longer in the LAG, so
         * its CPU port leaves the hardware LAG too
         */
        dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
}

/* NETDEV_CHANGEUPPER handling for a device that uses DSA: join or leave a
 * LAG upper. Returns NOTIFY_DONE when @dev does not use DSA or the upper is
 * not a LAG master, NOTIFY_OK after a leave, and the notifier-encoded result
 * of dsa_conduit_lag_join() after a join.
 */
static int dsa_conduit_changeupper(struct net_device *dev,
                                   struct netdev_notifier_changeupper_info *info)
{
        struct net_device *upper = info->upper_dev;
        struct netlink_ext_ack *extack;
        int rc;

        if (!netdev_uses_dsa(dev) || !netif_is_lag_master(upper))
                return NOTIFY_DONE;

        if (!info->linking) {
                dsa_conduit_lag_leave(dev, upper);
                return NOTIFY_OK;
        }

        extack = netdev_notifier_info_to_extack(&info->info);
        rc = dsa_conduit_lag_join(dev, upper, info->upper_info, extack);

        return notifier_from_errno(rc);
}

/* Netdevice notifier callback (installed as dsa_user_nb.notifier_call).
 *
 * @nb:    notifier block; not used by this function
 * @event: NETDEV_* event code
 * @ptr:   notifier info; the affected net_device is obtained from it with
 *         netdev_notifier_info_to_dev(), and some cases reinterpret it as the
 *         event-specific info structure
 *
 * Returns a notifier value. For the (PRE)CHANGEUPPER events the first handler
 * whose result carries an errno (per notifier_to_errno()) ends the chain and
 * its value is returned as-is; events not listed in the switch return
 * NOTIFY_DONE.
 */
static int dsa_user_netdevice_event(struct notifier_block *nb,
                                    unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_PRECHANGEUPPER: {
                struct netdev_notifier_changeupper_info *info = ptr;
                int err;

                /* Run the sanity checks and pre-change handlers in order,
                 * stopping at the first one that reports an error.
                 */
                err = dsa_user_prechangeupper_sanity_check(dev, info);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_conduit_prechangeupper_sanity_check(dev, info);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_lag_conduit_prechangelower_sanity_check(dev, info);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_bridge_prechangelower_sanity_check(dev, info);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_user_prechangeupper(dev, ptr);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_user_lag_prechangeupper(dev, ptr);
                if (notifier_to_errno(err))
                        return err;

                /* No handler objected: fall out to the final NOTIFY_DONE */
                break;
        }
        case NETDEV_CHANGEUPPER: {
                int err;

                /* Same first-error-wins chaining as for PRECHANGEUPPER */
                err = dsa_user_changeupper(dev, ptr);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_user_lag_changeupper(dev, ptr);
                if (notifier_to_errno(err))
                        return err;

                err = dsa_conduit_changeupper(dev, ptr);
                if (notifier_to_errno(err))
                        return err;

                break;
        }
        case NETDEV_CHANGELOWERSTATE: {
                struct netdev_notifier_changelowerstate_info *info = ptr;
                struct dsa_port *dp;
                int err = 0;

                if (dsa_user_dev_check(dev)) {
                        dp = dsa_user_to_port(dev);

                        err = dsa_port_lag_change(dp, info->lower_state_info);
                }

                /* Mirror LAG port events on DSA conduits that are in
                 * a LAG towards their respective switch CPU ports
                 */
                if (netdev_uses_dsa(dev)) {
                        dp = dev->dsa_ptr;

                        /* NOTE(review): if both conditions hold for the same
                         * device, this assignment replaces the result of the
                         * call above.
                         */
                        err = dsa_port_lag_change(dp, info->lower_state_info);
                }

                return notifier_from_errno(err);
        }
        case NETDEV_CHANGE:
        case NETDEV_UP: {
                /* Track state of conduit port.
                 * DSA driver may require the conduit port (and indirectly
                 * the tagger) to be available for some special operation.
                 */
                if (netdev_uses_dsa(dev)) {
                        struct dsa_port *cpu_dp = dev->dsa_ptr;
                        struct dsa_switch_tree *dst = cpu_dp->ds->dst;

                        /* Track when the conduit port is UP */
                        dsa_tree_conduit_oper_state_change(dst, dev,
                                                           netif_oper_up(dev));

                        /* Track when the conduit port is ready and can accept
                         * packets.
                         * NETDEV_UP event is not enough to flag a port as ready.
                         * We also have to wait for linkwatch_do_dev to dev_activate
                         * and emit a NETDEV_CHANGE event.
                         * We check if a conduit port is ready by checking that the
                         * dev has a qdisc assigned and that it is not noop.
                         */
                        dsa_tree_conduit_admin_state_change(dst, dev,
                                                            !qdisc_tx_is_noop(dev));

                        return NOTIFY_OK;
                }

                return NOTIFY_DONE;
        }
        case NETDEV_GOING_DOWN: {
                struct dsa_port *dp, *cpu_dp;
                struct dsa_switch_tree *dst;
                LIST_HEAD(close_list);

                if (!netdev_uses_dsa(dev))
                        return NOTIFY_DONE;

                cpu_dp = dev->dsa_ptr;
                dst = cpu_dp->ds->dst;

                dsa_tree_conduit_admin_state_change(dst, dev, false);

                /* Collect the user ports whose CPU port is the one behind
                 * this device, then close them all in one call.
                 */
                list_for_each_entry(dp, &dst->ports, list) {
                        if (!dsa_port_is_user(dp))
                                continue;

                        if (dp->cpu_dp != cpu_dp)
                                continue;

                        list_add(&dp->user->close_list, &close_list);
                }

                dev_close_many(&close_list, true);

                return NOTIFY_OK;
        }
        default:
                break;
        }

        return NOTIFY_DONE;
}

static void
dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
{
        struct switchdev_notifier_fdb_info info = {};

        info.addr = switchdev_work->addr;
        info.vid = switchdev_work->vid;
        info.offloaded = true;
        call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
                                 switchdev_work->orig_dev, &info.info, NULL);
}

/* Work item handler for a queued FDB add/delete request.
 *
 * Picks the host, LAG or plain port variant of the FDB operation depending
 * on the request's host_addr flag and on whether the port is in a LAG. A
 * successful add is followed by an offload notification; failures are
 * logged. The request is freed in all cases.
 */
static void dsa_user_switchdev_event_work(struct work_struct *work)
{
        struct dsa_switchdev_event_work *req =
                container_of(work, struct dsa_switchdev_event_work, work);
        struct dsa_port *dp = dsa_user_to_port(req->dev);
        const unsigned char *mac = req->addr;
        struct dsa_switch *ds = dp->ds;
        u16 vid = req->vid;
        int rc;

        switch (req->event) {
        case SWITCHDEV_FDB_ADD_TO_DEVICE:
                if (req->host_addr)
                        rc = dsa_port_bridge_host_fdb_add(dp, mac, vid);
                else if (dp->lag)
                        rc = dsa_port_lag_fdb_add(dp, mac, vid);
                else
                        rc = dsa_port_fdb_add(dp, mac, vid);

                if (!rc) {
                        dsa_fdb_offload_notify(req);
                        break;
                }

                dev_err(ds->dev,
                        "port %d failed to add %pM vid %d to fdb: %d\n",
                        dp->index, mac, vid, rc);
                break;

        case SWITCHDEV_FDB_DEL_TO_DEVICE:
                if (req->host_addr)
                        rc = dsa_port_bridge_host_fdb_del(dp, mac, vid);
                else if (dp->lag)
                        rc = dsa_port_lag_fdb_del(dp, mac, vid);
                else
                        rc = dsa_port_fdb_del(dp, mac, vid);

                if (rc)
                        dev_err(ds->dev,
                                "port %d failed to delete %pM vid %d from fdb: %d\n",
                                dp->index, mac, vid, rc);
                break;
        }

        kfree(req);
}

static bool dsa_foreign_dev_check(const struct net_device *dev,
                                  const struct net_device *foreign_dev)
{
        const struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switch_tree *dst = dp->ds->dst;

        if (netif_is_bridge_master(foreign_dev))
                return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);

        if (netif_is_bridge_port(foreign_dev))
                return !dsa_tree_offloads_bridge_port(dst, foreign_dev);

        /* Everything else is foreign */
        return true;
}

/* Queue an FDB add/delete request for the DSA user port @dev.
 *
 * Returns 0 when the event is ignored or was queued, -EOPNOTSUPP when the
 * switch driver lacks the FDB operations the request would need, and
 * -ENOMEM when the request cannot be allocated.
 */
static int dsa_user_fdb_event(struct net_device *dev,
                              struct net_device *orig_dev,
                              unsigned long event, const void *ctx,
                              const struct switchdev_notifier_fdb_info *fdb_info)
{
        struct dsa_port *dp = dsa_user_to_port(dev);
        struct dsa_switchdev_event_work *req;
        bool as_host = fdb_info->is_local;
        struct dsa_switch *ds = dp->ds;
        bool have_ops;

        /* Not addressed to this port, or the port is not bridged */
        if ((ctx && ctx != dp) || !dp->bridge)
                return 0;

        /* Dynamically learned entries are skipped when they were learned on
         * a bridge port this port offloads; entries learned by the software
         * bridge or by foreign bridge ports are only installed (as host
         * addresses) if the driver requests assisted learning.
         */
        if (switchdev_fdb_is_dynamically_learned(fdb_info) &&
            (dsa_port_offloads_bridge_port(dp, orig_dev) ||
             !ds->assisted_learning_on_cpu_port))
                return 0;

        /* Entries on foreign interfaces bridged with us count as host
         * addresses too.
         */
        if (dsa_foreign_dev_check(dev, orig_dev))
                as_host = true;

        /* Bail out early if the work could not be carried out anyway.
         * Host addresses on LAG ports still go through the regular FDB ops,
         * since the CPU port isn't in a LAG.
         */
        if (dp->lag && !as_host)
                have_ops = ds->ops->lag_fdb_add && ds->ops->lag_fdb_del;
        else
                have_ops = ds->ops->port_fdb_add && ds->ops->port_fdb_del;

        if (!have_ops)
                return -EOPNOTSUPP;

        req = kzalloc(sizeof(*req), GFP_ATOMIC);
        if (!req)
                return -ENOMEM;

        netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
                   event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
                   orig_dev->name, fdb_info->addr, fdb_info->vid,
                   as_host ? " as host address" : "");

        INIT_WORK(&req->work, dsa_user_switchdev_event_work);
        req->event = event;
        req->dev = dev;
        req->orig_dev = orig_dev;
        req->vid = fdb_info->vid;
        req->host_addr = as_host;
        ether_addr_copy(req->addr, fdb_info->addr);

        dsa_schedule_work(&req->work);

        return 0;
}

/* Called under rcu_read_lock().
 *
 * Switchdev notifier callback: port attribute sets and FDB add/delete
 * events are forwarded to the matching switchdev_handle_*() helper and the
 * helper's result is returned notifier-encoded; every other event yields
 * NOTIFY_DONE.
 */
static int dsa_user_switchdev_event(struct notifier_block *unused,
                                    unsigned long event, void *ptr)
{
        struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
        int rc;

        switch (event) {
        case SWITCHDEV_PORT_ATTR_SET:
                rc = switchdev_handle_port_attr_set(dev, ptr,
                                                    dsa_user_dev_check,
                                                    dsa_user_port_attr_set);
                break;
        case SWITCHDEV_FDB_ADD_TO_DEVICE:
        case SWITCHDEV_FDB_DEL_TO_DEVICE:
                rc = switchdev_handle_fdb_event_to_device(dev, event, ptr,
                                                          dsa_user_dev_check,
                                                          dsa_foreign_dev_check,
                                                          dsa_user_fdb_event);
                break;
        default:
                return NOTIFY_DONE;
        }

        return notifier_from_errno(rc);
}

/* Blocking switchdev notifier callback: port object add/delete and port
 * attribute set events are forwarded to the matching switchdev_handle_*()
 * helper and the helper's result is returned notifier-encoded; every other
 * event yields NOTIFY_DONE.
 */
static int dsa_user_switchdev_blocking_event(struct notifier_block *unused,
                                             unsigned long event, void *ptr)
{
        struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
        int rc;

        switch (event) {
        case SWITCHDEV_PORT_OBJ_ADD:
                rc = switchdev_handle_port_obj_add_foreign(dev, ptr,
                                                           dsa_user_dev_check,
                                                           dsa_foreign_dev_check,
                                                           dsa_user_port_obj_add);
                break;
        case SWITCHDEV_PORT_OBJ_DEL:
                rc = switchdev_handle_port_obj_del_foreign(dev, ptr,
                                                           dsa_user_dev_check,
                                                           dsa_foreign_dev_check,
                                                           dsa_user_port_obj_del);
                break;
        case SWITCHDEV_PORT_ATTR_SET:
                rc = switchdev_handle_port_attr_set(dev, ptr,
                                                    dsa_user_dev_check,
                                                    dsa_user_port_attr_set);
                break;
        default:
                return NOTIFY_DONE;
        }

        return notifier_from_errno(rc);
}

/* Netdevice notifier; its events are handled by dsa_user_netdevice_event() */
static struct notifier_block dsa_user_nb __read_mostly = {
        .notifier_call  = dsa_user_netdevice_event,
};

/* Switchdev notifier; its events are handled by dsa_user_switchdev_event() */
struct notifier_block dsa_user_switchdev_notifier = {
        .notifier_call = dsa_user_switchdev_event,
};

/* Blocking switchdev notifier; its events are handled by
 * dsa_user_switchdev_blocking_event()
 */
struct notifier_block dsa_user_switchdev_blocking_notifier = {
        .notifier_call = dsa_user_switchdev_blocking_event,
};

/* Register the netdevice, switchdev and blocking switchdev notifiers, in
 * that order. If one registration fails, those already done are undone in
 * reverse order and the error is returned; returns 0 on success.
 */
int dsa_user_register_notifier(void)
{
        int rc;

        rc = register_netdevice_notifier(&dsa_user_nb);
        if (rc)
                return rc;

        rc = register_switchdev_notifier(&dsa_user_switchdev_notifier);
        if (rc)
                goto undo_netdevice;

        rc = register_switchdev_blocking_notifier(&dsa_user_switchdev_blocking_notifier);
        if (rc)
                goto undo_switchdev;

        return 0;

undo_switchdev:
        unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
undo_netdevice:
        unregister_netdevice_notifier(&dsa_user_nb);
        return rc;
}

/* Unregister the three notifiers in the reverse of their registration
 * order. Each failure is logged and does not prevent the remaining
 * notifiers from being unregistered.
 */
void dsa_user_unregister_notifier(void)
{
        int rc;

        rc = unregister_switchdev_blocking_notifier(&dsa_user_switchdev_blocking_notifier);
        if (rc)
                pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", rc);

        rc = unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
        if (rc)
                pr_err("DSA: failed to unregister switchdev notifier (%d)\n", rc);

        rc = unregister_netdevice_notifier(&dsa_user_nb);
        if (rc)
                pr_err("DSA: failed to unregister user notifier (%d)\n", rc);
}





















































































































  168 



  168 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 



    4 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/fcntl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/memfd.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/rw_hint.h>

#include <linux/poll.h>
#include <asm/siginfo.h>
#include <linux/uaccess.h>

#include "internal.h"

/* f_flags bits that setfl() takes from its argument; all other bits of
 * f_flags are left as they were.
 */
#define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)

/* Apply the SETFL_MASK bits of @arg to @filp->f_flags.
 *
 * Returns 0 on success, -EPERM or -EINVAL when a requested flag change is
 * refused, or the error reported by the file's ->check_flags() or
 * ->fasync() operation.
 */
static int setfl(int fd, struct file * filp, unsigned int arg)
{
        struct inode *inode = file_inode(filp);
        int rc;

        /* Changing O_APPEND is refused on an append-only inode */
        if (IS_APPEND(inode) && ((arg ^ filp->f_flags) & O_APPEND))
                return -EPERM;

        /* Turning O_NOATIME on is reserved to the owner or a capable task */
        if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME) &&
            !inode_owner_or_capable(file_mnt_idmap(filp), inode))
                return -EPERM;

        /* required for strict SunOS emulation */
        if (O_NONBLOCK != O_NDELAY && (arg & O_NDELAY))
                arg |= O_NONBLOCK;

        /* On a FIFO, O_DIRECT selects packetized mode; everywhere else the
         * file must be able to do O_DIRECT
         */
        if ((arg & O_DIRECT) && !S_ISFIFO(inode->i_mode) &&
            !(filp->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        if (filp->f_op->check_flags) {
                rc = filp->f_op->check_flags(arg);
                if (rc)
                        return rc;
        }

        /* The FASYNC bit itself is set by ->fasync(), not here */
        if (((arg ^ filp->f_flags) & FASYNC) && filp->f_op->fasync) {
                rc = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
                if (rc < 0)
                        return rc;
        }

        spin_lock(&filp->f_lock);
        filp->f_flags = (filp->f_flags & ~SETFL_MASK) | (arg & SETFL_MASK);
        filp->f_iocb_flags = iocb_flags(filp);
        spin_unlock(&filp->f_lock);

        return 0;
}

/*
 * Allocate a file->f_owner struct if it doesn't exist, handling racing
 * allocations correctly.
 */
int file_f_owner_allocate(struct file *file)
{
        struct fown_struct *fresh;

        /* Nothing to do when the file already has an owner struct */
        if (file_f_owner(file))
                return 0;

        fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        fresh->file = file;
        rwlock_init(&fresh->lock);

        /* Install ours only if the slot is still empty; if another
         * allocation got there first, ours is freed.
         */
        if (unlikely(cmpxchg(&file->f_owner, NULL, fresh)))
                kfree(fresh);

        return 0;
}
EXPORT_SYMBOL(file_f_owner_allocate);

/* Drop the pid reference held by @file's owner struct and free the struct.
 * Does nothing when the file has no owner struct.
 */
void file_f_owner_release(struct file *file)
{
        struct fown_struct *owner = file_f_owner(file);

        if (!owner)
                return;

        put_pid(owner->pid);
        kfree(owner);
}

/* Record @pid / @type as the owner of @filp.
 *
 * The owner struct must already exist; otherwise a one-time warning is
 * issued and nothing is changed. An owner that is already set is only
 * replaced when @force is non-zero. For a non-NULL @pid the current task's
 * uid and euid are stored as well.
 */
void __f_setown(struct file *filp, struct pid *pid, enum pid_type type,
                int force)
{
        struct fown_struct *owner = file_f_owner(filp);

        if (WARN_ON_ONCE(!owner))
                return;

        write_lock_irq(&owner->lock);

        if (!force && owner->pid)
                goto unlock;

        put_pid(owner->pid);
        owner->pid = get_pid(pid);
        owner->pid_type = type;

        if (pid) {
                const struct cred *cred = current_cred();

                security_file_set_fowner(filp);
                owner->uid = cred->uid;
                owner->euid = cred->euid;
        }

unlock:
        write_unlock_irq(&owner->lock);
}
EXPORT_SYMBOL(__f_setown);

int f_setown(struct file *filp, int who, int force)
{
        enum pid_type type;
        struct pid *pid = NULL;
        int ret = 0;

        might_sleep();

        type = PIDTYPE_TGID;
        if (who < 0) {
                /* avoid overflow below */
                if (who == INT_MIN)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                who = -who;
        }

        ret = file_f_owner_allocate(filp);
        if (ret)
                return ret;

        rcu_read_lock();
        if (who) {
                pid = find_vpid(who);
                if (!pid)
                        ret = -ESRCH;
        }

        if (!ret)
                __f_setown(filp, pid, type, force);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(f_setown);

/* Clear the owner of @filp: force-set it to a NULL pid of type PIDTYPE_TGID */
void f_delown(struct file *filp)
{
        __f_setown(filp, NULL, PIDTYPE_TGID, 1);
}

/* Return the owner of @filp as an integer: the pid number as given by
 * pid_vnr(), negated when the owner type is PIDTYPE_PGID. Returns 0 when
 * the file has no owner struct or pid_task() finds no task for the owner.
 */
pid_t f_getown(struct file *filp)
{
        struct fown_struct *owner = file_f_owner(filp);
        pid_t nr = 0;

        if (!owner)
                return 0;

        read_lock_irq(&owner->lock);
        rcu_read_lock();
        if (pid_task(owner->pid, owner->pid_type)) {
                pid_t vnr = pid_vnr(owner->pid);

                nr = (owner->pid_type == PIDTYPE_PGID) ? -vnr : vnr;
        }
        rcu_read_unlock();
        read_unlock_irq(&owner->lock);

        return nr;
}

static int f_setown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner;
        struct pid *pid;
        int type;
        int ret;

        ret = copy_from_user(&owner, owner_p, sizeof(owner));
        if (ret)
                return -EFAULT;

        switch (owner.type) {
        case F_OWNER_TID:
                type = PIDTYPE_PID;
                break;

        case F_OWNER_PID:
                type = PIDTYPE_TGID;
                break;

        case F_OWNER_PGRP:
                type = PIDTYPE_PGID;
                break;

        default:
                return -EINVAL;
        }

        ret = file_f_owner_allocate(filp);
        if (ret)
                return ret;

        rcu_read_lock();
        pid = find_vpid(owner.pid);
        if (owner.pid && !pid)
                ret = -ESRCH;
        else
                 __f_setown(filp, pid, type, 1);
        rcu_read_unlock();

        return ret;
}

static int f_getown_ex(struct file *filp, unsigned long arg)
{
        struct f_owner_ex __user *owner_p = (void __user *)arg;
        struct f_owner_ex owner = {};
        int ret = 0;
        struct fown_struct *f_owner;
        enum pid_type pid_type = PIDTYPE_PID;

        f_owner = file_f_owner(filp);
        if (f_owner) {
                read_lock_irq(&f_owner->lock);
                rcu_read_lock();
                if (pid_task(f_owner->pid, f_owner->pid_type))
                        owner.pid = pid_vnr(f_owner->pid);
                rcu_read_unlock();
                pid_type = f_owner->pid_type;
        }

        switch (pid_type) {
        case PIDTYPE_PID:
                owner.type = F_OWNER_TID;
                break;

        case PIDTYPE_TGID:
                owner.type = F_OWNER_PID;
                break;

        case PIDTYPE_PGID:
                owner.type = F_OWNER_PGRP;
                break;

        default:
                WARN_ON(1);
                ret = -EINVAL;
                break;
        }
        if (f_owner)
                read_unlock_irq(&f_owner->lock);

        if (!ret) {
                ret = copy_to_user(owner_p, &owner, sizeof(owner));
                if (ret)
                        ret = -EFAULT;
        }
        return ret;
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * F_GETOWNER_UIDS: store the owner's uid and euid, translated into the
 * caller's user namespace, into the two-element uid_t array at @arg.
 * Both values are 0 when the file has no owner state.
 *
 * Returns the OR of the two put_user() results.
 */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        struct user_namespace *ns = current_user_ns();
        uid_t __user *uids_out = (void __user *)arg;
        struct fown_struct *fown = file_f_owner(filp);
        uid_t uid = 0;
        uid_t euid = 0;
        int err;

        if (fown) {
                /* Sample both ids under the owner lock. */
                read_lock_irq(&fown->lock);
                uid = from_kuid(ns, fown->uid);
                euid = from_kuid(ns, fown->euid);
                read_unlock_irq(&fown->lock);
        }

        err = put_user(uid, &uids_out[0]);
        err |= put_user(euid, &uids_out[1]);
        return err;
}
#else
/* Without CONFIG_CHECKPOINT_RESTORE the command is rejected. */
static int f_getowner_uids(struct file *filp, unsigned long arg)
{
        return -EINVAL;
}
#endif

/*
 * Tell whether @hint is one of the RWH_WRITE_LIFE_* values accepted from
 * user space.  The build-time checks pin every in-kernel WRITE_LIFE_*
 * constant to its RWH_* counterpart.
 */
static bool rw_hint_valid(u64 hint)
{
        BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
        BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
        BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
        BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
        BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
        BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);

        return hint == RWH_WRITE_LIFE_NOT_SET ||
               hint == RWH_WRITE_LIFE_NONE ||
               hint == RWH_WRITE_LIFE_SHORT ||
               hint == RWH_WRITE_LIFE_MEDIUM ||
               hint == RWH_WRITE_LIFE_LONG ||
               hint == RWH_WRITE_LIFE_EXTREME;
}

static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        u64 hint = READ_ONCE(inode->i_write_hint);

        if (copy_to_user(argp, &hint, sizeof(*argp)))
                return -EFAULT;
        return 0;
}

static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        struct inode *inode = file_inode(file);
        u64 __user *argp = (u64 __user *)arg;
        u64 hint;

        if (!inode_owner_or_capable(file_mnt_idmap(file), inode))
                return -EPERM;

        if (copy_from_user(&hint, argp, sizeof(hint)))
                return -EFAULT;
        if (!rw_hint_valid(hint))
                return -EINVAL;

        WRITE_ONCE(inode->i_write_hint, hint);

        /*
         * file->f_mapping->host may differ from inode. As an example,
         * blkdev_open() modifies file->f_mapping.
         */
        if (file->f_mapping->host != inode)
                WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);

        return 0;
}

/* Is the file descriptor a dup of the file? */
static long f_dupfd_query(int fd, struct file *filp)
{
        CLASS(fd_raw, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        /*
         * We can do the 'fdput()' immediately, as the only thing that
         * matters is the pointer value which isn't changed by the fdput.
         *
         * Technically we didn't need a ref at all, and 'fdget()' was
         * overkill, but given our lockless file pointer lookup, the
         * alternatives are complicated.
         */
        return fd_file(f) == filp;
}

/* F_CREATED_QUERY: report 1 if FMODE_CREATED is set in @filp's mode, else 0. */
static long f_created_query(const struct file *filp)
{
        return (filp->f_mode & FMODE_CREATED) != 0;
}

/*
 * Backend for F_GETSIG and F_SETSIG.
 *
 * With @setsig false, returns the owner's signum, or 0 when the file has no
 * owner state.  With @setsig true, validates @signum, makes sure owner state
 * is allocated and stores @signum; returns 0, -EINVAL for an invalid signal,
 * or the error from file_f_owner_allocate().
 */
static int f_owner_sig(struct file *filp, int signum, bool setsig)
{
        struct fown_struct *fown;
        int err;

        might_sleep();

        if (!setsig) {
                fown = file_f_owner(filp);
                return fown ? fown->signum : 0;
        }

        if (!valid_signal(signum))
                return -EINVAL;

        err = file_f_owner_allocate(filp);
        if (err)
                return err;

        fown = file_f_owner(filp);
        fown->signum = signum;
        return 0;
}

/*
 * Dispatch one fcntl() command on @filp.
 *
 * @fd:   descriptor number supplied by the caller
 * @cmd:  F_* command
 * @arg:  raw third argument; depending on the command it is used as an int
 *        (argi), as a user pointer (argp) or passed on unchanged
 * @filp: file the command operates on
 *
 * Returns the selected handler's result.  Commands not listed in the switch
 * leave the initial -EINVAL in place.  The lock commands return -EFAULT
 * directly when the struct flock cannot be copied from or to user space.
 */
static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
                struct file *filp)
{
        void __user *argp = (void __user *)arg;
        int argi = (int)arg;
        struct flock flock;
        long err = -EINVAL;

        switch (cmd) {
        case F_CREATED_QUERY:
                err = f_created_query(filp);
                break;
        case F_DUPFD:
                err = f_dupfd(argi, filp, 0);
                break;
        case F_DUPFD_CLOEXEC:
                err = f_dupfd(argi, filp, O_CLOEXEC);
                break;
        case F_DUPFD_QUERY:
                err = f_dupfd_query(argi, filp);
                break;
        case F_GETFD:
                err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
                break;
        case F_SETFD:
                err = 0;
                set_close_on_exec(fd, argi & FD_CLOEXEC);
                break;
        case F_GETFL:
                err = filp->f_flags;
                break;
        case F_SETFL:
                err = setfl(fd, filp, argi);
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_GETLK:
#endif
        case F_GETLK:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_getlk(filp, cmd, &flock);
                /* Only a successful query is written back to user space. */
                if (!err && copy_to_user(argp, &flock, sizeof(flock)))
                        return -EFAULT;
                break;
#if BITS_PER_LONG != 32
        /* 32-bit arches must use fcntl64() */
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                fallthrough;
#endif
        case F_SETLK:
        case F_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_setlk(fd, filp, cmd, &flock);
                break;
        case F_GETOWN:
                /*
                 * XXX If f_owner is a process group, the
                 * negative return value will get converted
                 * into an error.  Oops.  If we keep the
                 * current syscall conventions, the only way
                 * to fix this will be in libc.
                 */
                err = f_getown(filp);
                force_successful_syscall_return();
                break;
        case F_SETOWN:
                err = f_setown(filp, argi, 1);
                break;
        case F_GETOWN_EX:
                err = f_getown_ex(filp, arg);
                break;
        case F_SETOWN_EX:
                err = f_setown_ex(filp, arg);
                break;
        case F_GETOWNER_UIDS:
                err = f_getowner_uids(filp, arg);
                break;
        case F_GETSIG:
                err = f_owner_sig(filp, 0, false);
                break;
        case F_SETSIG:
                err = f_owner_sig(filp, argi, true);
                break;
        case F_GETLEASE:
                err = fcntl_getlease(filp);
                break;
        case F_SETLEASE:
                err = fcntl_setlease(fd, filp, argi);
                break;
        case F_NOTIFY:
                err = fcntl_dirnotify(fd, filp, argi);
                break;
        case F_SETPIPE_SZ:
        case F_GETPIPE_SZ:
                err = pipe_fcntl(filp, cmd, argi);
                break;
        case F_ADD_SEALS:
        case F_GET_SEALS:
                err = memfd_fcntl(filp, cmd, argi);
                break;
        case F_GET_RW_HINT:
                err = fcntl_get_rw_hint(filp, cmd, arg);
                break;
        case F_SET_RW_HINT:
                err = fcntl_set_rw_hint(filp, cmd, arg);
                break;
        default:
                /* Unknown command: err still holds -EINVAL. */
                break;
        }
        return err;
}

/*
 * Return 1 if @cmd is one of the commands the fcntl entry points still allow
 * on a file opened with FMODE_PATH, 0 otherwise (callers answer -EBADF).
 */
static int check_fcntl_cmd(unsigned cmd)
{
        int allowed;

        switch (cmd) {
        case F_CREATED_QUERY:
        case F_DUPFD:
        case F_DUPFD_CLOEXEC:
        case F_DUPFD_QUERY:
        case F_GETFD:
        case F_SETFD:
        case F_GETFL:
                allowed = 1;
                break;
        default:
                allowed = 0;
                break;
        }
        return allowed;
}

/*
 * fcntl() entry point: resolve @fd, restrict FMODE_PATH files to the
 * commands accepted by check_fcntl_cmd(), consult the security hook and
 * hand over to do_fcntl().
 */
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        CLASS(fd_raw, f)(fd);
        struct file *filp;
        long err;

        if (fd_empty(f))
                return -EBADF;

        filp = fd_file(f);
        if (unlikely(filp->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd))
                return -EBADF;

        err = security_file_fcntl(filp, cmd, arg);
        if (err)
                return err;

        return do_fcntl(fd, cmd, arg, filp);
}

#if BITS_PER_LONG == 32
/*
 * fcntl64() entry point, built only when BITS_PER_LONG == 32.  It handles the
 * struct flock64 based lock commands itself and forwards every other command
 * to do_fcntl().
 */
SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                unsigned long, arg)
{
        void __user *argp = (void __user *)arg;
        CLASS(fd_raw, f)(fd);
        struct flock64 flock;
        struct file *filp;
        long err;

        if (fd_empty(f))
                return -EBADF;

        filp = fd_file(f);
        if (unlikely(filp->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd))
                return -EBADF;

        err = security_file_fcntl(filp, cmd, arg);
        if (err)
                return err;

        switch (cmd) {
        case F_GETLK64:
        case F_OFD_GETLK:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                err = fcntl_getlk64(filp, cmd, &flock);
                if (err)
                        return err;
                /* Write the result back only after a successful query. */
                return copy_to_user(argp, &flock, sizeof(flock)) ? -EFAULT : 0;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                if (copy_from_user(&flock, argp, sizeof(flock)))
                        return -EFAULT;
                return fcntl_setlk64(fd, filp, cmd, &flock);
        default:
                return do_fcntl(fd, cmd, arg, filp);
        }
}
#endif

#ifdef CONFIG_COMPAT
/*
 * Copy the five lock fields (l_type, l_whence, l_start, l_len, l_pid) from
 * *src to *dst one member at a time, so the two structures may be of
 * different types as long as they have these members.
 *
 * The body is wrapped in do { } while (0) so that an invocation followed by
 * ';' is exactly one statement, also under an unbraced if/else.  Both
 * arguments are expanded several times: pass side-effect-free expressions
 * only.  Still meant for the compat flock helpers only.
 */
#define copy_flock_fields(dst, src)                        \
        do {                                                \
                (dst)->l_type = (src)->l_type;                \
                (dst)->l_whence = (src)->l_whence;        \
                (dst)->l_start = (src)->l_start;        \
                (dst)->l_len = (src)->l_len;                \
                (dst)->l_pid = (src)->l_pid;                \
        } while (0)

/*
 * Read a struct compat_flock from user space and copy its lock fields into
 * the native struct flock @kfl.  Returns 0, or -EFAULT if the read fails.
 */
static int get_compat_flock(struct flock *kfl, const struct compat_flock __user *ufl)
{
        struct compat_flock tmp;

        if (copy_from_user(&tmp, ufl, sizeof(tmp)) != 0)
                return -EFAULT;

        copy_flock_fields(kfl, &tmp);
        return 0;
}

/*
 * Read a struct compat_flock64 from user space and copy its lock fields into
 * the native struct flock @kfl.  Returns 0, or -EFAULT if the read fails.
 */
static int get_compat_flock64(struct flock *kfl, const struct compat_flock64 __user *ufl)
{
        struct compat_flock64 tmp;

        if (copy_from_user(&tmp, ufl, sizeof(tmp)) != 0)
                return -EFAULT;

        copy_flock_fields(kfl, &tmp);
        return 0;
}

/*
 * Convert the native struct flock @kfl into a struct compat_flock and write
 * it to user space.  Returns 0, or -EFAULT if the write fails.
 */
static int put_compat_flock(const struct flock *kfl, struct compat_flock __user *ufl)
{
        struct compat_flock tmp;

        /* Zero the whole structure before filling in the named fields. */
        memset(&tmp, 0, sizeof(tmp));
        copy_flock_fields(&tmp, kfl);

        return copy_to_user(ufl, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}

/*
 * Convert the native struct flock @kfl into a struct compat_flock64 and write
 * it to user space.  The build-time checks make sure l_start and l_len of the
 * compat layout are at least as wide as the native ones.
 * Returns 0, or -EFAULT if the write fails.
 */
static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __user *ufl)
{
        struct compat_flock64 tmp;

        BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start));
        BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len));

        /* Zero the whole structure before filling in the named fields. */
        memset(&tmp, 0, sizeof(tmp));
        copy_flock_fields(&tmp, kfl);

        return copy_to_user(ufl, &tmp, sizeof(tmp)) ? -EFAULT : 0;
}
#undef copy_flock_fields

/*
 * Map the *64 lock commands onto their plain counterparts; every other
 * command is returned unchanged.
 */
static unsigned int
convert_fcntl_cmd(unsigned int cmd)
{
        unsigned int mapped = cmd;

        switch (cmd) {
        case F_GETLK64:
                mapped = F_GETLK;
                break;
        case F_SETLK64:
                mapped = F_SETLK;
                break;
        case F_SETLKW64:
                mapped = F_SETLKW;
                break;
        default:
                break;
        }

        return mapped;
}

/*
 * A GETLK request succeeded and its result has to be squeezed into the compat
 * structure.
 *
 * An l_start beyond COMPAT_OFF_T_MAX cannot be represented and is reported as
 * -EOVERFLOW; that should only happen when the original start + end already
 * exceeded COMPAT_OFF_T_MAX, i.e. the application was asking for trouble.
 * An l_len that is too large is simply clamped, so the application sees the
 * part of the conflicting lock that can make sense to it.
 */
static int fixup_compat_flock(struct flock *flock)
{
        int err = 0;

        if (flock->l_start > COMPAT_OFF_T_MAX)
                err = -EOVERFLOW;
        else if (flock->l_len > COMPAT_OFF_T_MAX)
                flock->l_len = COMPAT_OFF_T_MAX;

        return err;
}

/*
 * Common body of the compat fcntl()/fcntl64() entry points.
 *
 * The lock commands are handled here: the compat structure is converted to a
 * native struct flock, the regular lock code is called with the command
 * translated by convert_fcntl_cmd(), and for the GETLK variants the result is
 * converted back.  Everything else goes to do_fcntl().
 */
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
                             compat_ulong_t arg)
{
        CLASS(fd_raw, f)(fd);
        void __user *uarg = compat_ptr(arg);
        struct flock flock;
        struct file *filp;
        long err;

        if (fd_empty(f))
                return -EBADF;

        filp = fd_file(f);
        if (unlikely(filp->f_mode & FMODE_PATH) && !check_fcntl_cmd(cmd))
                return -EBADF;

        err = security_file_fcntl(filp, cmd, arg);
        if (err)
                return err;

        switch (cmd) {
        case F_GETLK:
                err = get_compat_flock(&flock, uarg);
                if (!err)
                        err = fcntl_getlk(filp, convert_fcntl_cmd(cmd), &flock);
                /* The 32-bit layout may not be able to hold the result. */
                if (!err)
                        err = fixup_compat_flock(&flock);
                if (!err)
                        err = put_compat_flock(&flock, uarg);
                break;
        case F_GETLK64:
        case F_OFD_GETLK:
                err = get_compat_flock64(&flock, uarg);
                if (!err)
                        err = fcntl_getlk(filp, convert_fcntl_cmd(cmd), &flock);
                if (!err)
                        err = put_compat_flock64(&flock, uarg);
                break;
        case F_SETLK:
        case F_SETLKW:
                err = get_compat_flock(&flock, uarg);
                if (!err)
                        err = fcntl_setlk(fd, filp, convert_fcntl_cmd(cmd), &flock);
                break;
        case F_SETLK64:
        case F_SETLKW64:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
                err = get_compat_flock64(&flock, uarg);
                if (!err)
                        err = fcntl_setlk(fd, filp, convert_fcntl_cmd(cmd), &flock);
                break;
        default:
                err = do_fcntl(fd, cmd, arg, filp);
                break;
        }
        return err;
}

/* Compat fcntl64(): accepts every command and defers to do_compat_fcntl64(). */
COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        return do_compat_fcntl64(fd, cmd, arg);
}

/*
 * Compat fcntl(): the *64 and OFD lock commands are refused with -EINVAL
 * here; all remaining commands are handled by do_compat_fcntl64().
 */
COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        bool refused = cmd == F_GETLK64 ||
                       cmd == F_SETLK64 ||
                       cmd == F_SETLKW64 ||
                       cmd == F_OFD_GETLK ||
                       cmd == F_OFD_SETLK ||
                       cmd == F_OFD_SETLKW;

        if (refused)
                return -EINVAL;

        return do_compat_fcntl64(fd, cmd, arg);
}
#endif

/*
 * Table to convert sigio signal codes into poll band bitmaps.
 *
 * send_sigio_to_task() indexes it with (reason - POLL_IN); the trailing
 * comment on each entry names the reason code that entry is meant for.
 */

static const __poll_t band_table[NSIGPOLL] = {
        EPOLLIN | EPOLLRDNORM,                        /* POLL_IN */
        EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND,        /* POLL_OUT */
        EPOLLIN | EPOLLRDNORM | EPOLLMSG,                /* POLL_MSG */
        EPOLLERR,                                /* POLL_ERR */
        EPOLLPRI | EPOLLRDBAND,                        /* POLL_PRI */
        EPOLLHUP | EPOLLERR                        /* POLL_HUP */
};

static inline int sigio_perm(struct task_struct *p,
                             struct fown_struct *fown, int sig)
{
        const struct cred *cred;
        int ret;

        rcu_read_lock();
        cred = __task_cred(p);
        ret = ((uid_eq(fown->euid, GLOBAL_ROOT_UID) ||
                uid_eq(fown->euid, cred->suid) || uid_eq(fown->euid, cred->uid) ||
                uid_eq(fown->uid,  cred->suid) || uid_eq(fown->uid,  cred->uid)) &&
               !security_file_send_sigiotask(p, fown, sig));
        rcu_read_unlock();
        return ret;
}

/*
 * Deliver the I/O notification signal configured in @fown to task @p.
 *
 * Nothing is sent when sigio_perm() refuses.  With a non-zero fown->signum a
 * siginfo carrying @fd and the poll band for @reason is sent; if
 * do_send_sig_info() returns non-zero for it, or when signum is 0, a plain
 * SIGIO is sent instead.
 */
static void send_sigio_to_task(struct task_struct *p,
                               struct fown_struct *fown,
                               int fd, int reason, enum pid_type type)
{
        /*
         * F_SETSIG can change ->signum lockless in parallel, make
         * sure we read it once and use the same value throughout.
         */
        int signum = READ_ONCE(fown->signum);

        if (!sigio_perm(p, fown, signum))
                return;

        /* "default" comes first so that it can fall through into "case 0". */
        switch (signum) {
                default: {
                        kernel_siginfo_t si;

                        /* Queue a rt signal with the appropriate fd as its
                           value.  We use SI_SIGIO as the source, not
                           SI_KERNEL, since kernel signals always get
                           delivered even if we can't queue.  Failure to
                           queue in this case _should_ be reported; we fall
                           back to SIGIO in that case. --sct */
                        clear_siginfo(&si);
                        si.si_signo = signum;
                        si.si_errno = 0;
                        si.si_code  = reason;
                        /*
                         * Posix defines POLL_IN and friends to be signal
                         * specific si_codes for SIG_POLL.  Linux extended
                         * these si_codes to other signals in a way that is
                         * ambiguous if other signals also have signal
                         * specific si_codes.  In that case use SI_SIGIO instead
                         * to remove the ambiguity.
                         */
                        if ((signum != SIGPOLL) && sig_specific_sicodes(signum))
                                si.si_code = SI_SIGIO;

                        /* Make sure we are called with one of the POLL_*
                           reasons, otherwise we could leak kernel stack into
                           userspace.  */
                        BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL));
                        if (reason - POLL_IN >= NSIGPOLL)
                                si.si_band  = ~0L;
                        else
                                si.si_band = mangle_poll(band_table[reason - POLL_IN]);
                        si.si_fd    = fd;
                        /* A zero return means the signal was sent: done. */
                        if (!do_send_sig_info(signum, &si, p, type))
                                break;
                }
                        fallthrough;        /* fall back on the old plain SIGIO signal */
                case 0:
                        do_send_sig_info(SIGIO, SEND_SIG_PRIV, p, type);
        }
}

/*
 * Send the I/O notification for @fown to its recorded owner.
 *
 * @fd and @band are handed to send_sigio_to_task() as its fd and reason
 * arguments.  Nothing is sent when no owner pid is recorded.  The owner
 * lock is held for reading, with interrupts saved, for the whole delivery.
 */
void send_sigio(struct fown_struct *fown, int fd, int band)
{
        struct task_struct *p;
        enum pid_type type;
        unsigned long flags;
        struct pid *pid;

        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        if (type <= PIDTYPE_TGID) {
                /* Pid types up to PIDTYPE_TGID: look up one task under RCU. */
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigio_to_task(p, fown, fd, band, type);
                rcu_read_unlock();
        } else {
                /* Other pid types: visit every task attached to the pid. */
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigio_to_task(p, fown, fd, band, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
}

/* Send SIGURG to task @p on behalf of @fown, provided sigio_perm() agrees. */
static void send_sigurg_to_task(struct task_struct *p,
                                struct fown_struct *fown, enum pid_type type)
{
        if (!sigio_perm(p, fown, SIGURG))
                return;

        do_send_sig_info(SIGURG, SEND_SIG_PRIV, p, type);
}

/*
 * Send SIGURG to the owner recorded for @file.
 *
 * Returns 0 when the file has no owner state or no owner pid is recorded,
 * and 1 once an owner pid was found, whether or not any task actually
 * received the signal.
 */
int send_sigurg(struct file *file)
{
        struct fown_struct *fown;
        struct task_struct *p;
        enum pid_type type;
        struct pid *pid;
        unsigned long flags;
        int ret = 0;

        fown = file_f_owner(file);
        if (!fown)
                return 0;

        read_lock_irqsave(&fown->lock, flags);

        type = fown->pid_type;
        pid = fown->pid;
        if (!pid)
                goto out_unlock_fown;

        ret = 1;

        if (type <= PIDTYPE_TGID) {
                /* Pid types up to PIDTYPE_TGID: look up one task under RCU. */
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        send_sigurg_to_task(p, fown, type);
                rcu_read_unlock();
        } else {
                /* Other pid types: visit every task attached to the pid. */
                read_lock(&tasklist_lock);
                do_each_pid_task(pid, type, p) {
                        send_sigurg_to_task(p, fown, type);
                } while_each_pid_task(pid, type, p);
                read_unlock(&tasklist_lock);
        }
 out_unlock_fown:
        read_unlock_irqrestore(&fown->lock, flags);
        return ret;
}

/*
 * Taken inside filp->f_lock by fasync_remove_entry() and
 * fasync_insert_entry() while they walk and modify a fasync list.
 */
static DEFINE_SPINLOCK(fasync_lock);
/* Cache behind fasync_alloc()/fasync_free(); created by fcntl_init(). */
static struct kmem_cache *fasync_cache __ro_after_init;

/*
 * Remove a fasync entry. If successfully removed, return
 * positive and clear the FASYNC flag. If no entry exists,
 * do nothing and return 0.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 *
 * Locking: filp->f_lock is taken first, fasync_lock inside it.
 */
int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *fa, **fp;
        int result = 0;

        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        /* fp always addresses the link that points at the current entry. */
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
                if (fa->fa_file != filp)
                        continue;

                /*
                 * kill_fasync_rcu() tests fa_file under the read side of
                 * fa_lock, so clear it under the write side.
                 */
                write_lock_irq(&fa->fa_lock);
                fa->fa_file = NULL;
                write_unlock_irq(&fa->fa_lock);

                /* Unlink the entry; the memory itself is released via RCU. */
                *fp = fa->fa_next;
                kfree_rcu(fa, fa_rcu);
                filp->f_flags &= ~FASYNC;
                result = 1;
                break;
        }
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return result;
}

/*
 * Allocate one fasync entry from fasync_cache with GFP_KERNEL and return it
 * as is; the fields are not filled in here.
 */
struct fasync_struct *fasync_alloc(void)
{
        struct fasync_struct *entry;

        entry = kmem_cache_alloc(fasync_cache, GFP_KERNEL);
        return entry;
}

/*
 * Give an entry obtained from fasync_alloc() straight back to fasync_cache.
 *
 * NOTE! This can be used only for unused fasync entries:
 * entries that actually got inserted on the fasync list
 * need to be released by rcu - see fasync_remove_entry.
 */
void fasync_free(struct fasync_struct *new)
{
        kmem_cache_free(fasync_cache, new);
}

/*
 * Insert a new entry into the fasync list.  Return the pointer to the
 * old one if we didn't use the new one.
 *
 * If the list already holds an entry for @filp, only that entry's fa_fd is
 * updated and the entry is returned; @new is left untouched and remains the
 * caller's to free.  Otherwise @new is initialised, linked in at the head of
 * the list and NULL is returned.
 *
 * NOTE! It is very important that the FASYNC flag always
 * match the state "is the filp on a fasync list".
 */
struct fasync_struct *fasync_insert_entry(int fd, struct file *filp, struct fasync_struct **fapp, struct fasync_struct *new)
{
        struct fasync_struct *fa, **fp;

        spin_lock(&filp->f_lock);
        spin_lock(&fasync_lock);
        for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) {
                if (fa->fa_file != filp)
                        continue;

                /* Existing entry: just record the new descriptor number. */
                write_lock_irq(&fa->fa_lock);
                fa->fa_fd = fd;
                write_unlock_irq(&fa->fa_lock);
                goto out;
        }

        /* No match: the loop ended with fa == NULL, which is returned below. */
        rwlock_init(&new->fa_lock);
        new->magic = FASYNC_MAGIC;
        new->fa_file = filp;
        new->fa_fd = fd;
        new->fa_next = *fapp;
        /* Publish only after every field of the new entry has been set. */
        rcu_assign_pointer(*fapp, new);
        filp->f_flags |= FASYNC;

out:
        spin_unlock(&fasync_lock);
        spin_unlock(&filp->f_lock);
        return fa;
}

/*
 * Add a fasync entry for @filp to the list at @fapp.
 *
 * Returns -ENOMEM if no entry could be allocated, 1 if a new entry was
 * added, and 0 if an existing entry was merely updated.
 */
static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp)
{
        struct fasync_struct *fresh = fasync_alloc();

        if (!fresh)
                return -ENOMEM;

        /* A NULL result means the freshly allocated entry is now on the list. */
        if (!fasync_insert_entry(fd, filp, fapp, fresh))
                return 1;

        /*
         * An older entry for this file was found and updated instead, so
         * the fresh one was never used: release it and report "no change".
         */
        fasync_free(fresh);
        return 0;
}

/*
 * fasync_helper() is what almost all character device drivers call to
 * maintain their fasync queue; the file lease code uses it for regular
 * files.  A non-zero @on adds (or updates) the entry for @filp, zero
 * removes it.
 *
 * Returns a negative value on error, 0 if nothing changed and a positive
 * value if an entry was added or deleted.
 */
int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp)
{
        if (on)
                return fasync_add_entry(fd, filp, fapp);

        return fasync_remove_entry(filp, fapp);
}

EXPORT_SYMBOL(fasync_helper);

/*
 * Walk the fasync list starting at @fa and call send_sigio() with @band for
 * every entry that still has a file with owner state.
 *
 * rcu_read_lock() is held
 *
 * The walk stops with an error message as soon as an entry with a wrong
 * magic number is met.
 */
static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band)
{
        while (fa) {
                struct fown_struct *fown;
                unsigned long flags;

                if (fa->magic != FASYNC_MAGIC) {
                        printk(KERN_ERR "kill_fasync: bad magic number in "
                               "fasync_struct!\n");
                        return;
                }
                /* fa_file is cleared under the write side of this lock. */
                read_lock_irqsave(&fa->fa_lock, flags);
                if (fa->fa_file) {
                        fown = file_f_owner(fa->fa_file);
                        if (!fown)
                                goto next;
                        /* Don't send SIGURG to processes which have not set a
                           queued signum: SIGURG has its own default signalling
                           mechanism. */
                        if (!(sig == SIGURG && fown->signum == 0))
                                send_sigio(fown, fa->fa_fd, band);
                }
next:
                read_unlock_irqrestore(&fa->fa_lock, flags);
                fa = rcu_dereference(fa->fa_next);
        }
}

/*
 * Notify every entry on the fasync list at @fp; @sig and @band are passed on
 * to kill_fasync_rcu().
 */
void kill_fasync(struct fasync_struct **fp, int sig, int band)
{
        /* Cheap unlocked test first: most of the time the list is empty. */
        if (!*fp)
                return;

        rcu_read_lock();
        kill_fasync_rcu(rcu_dereference(*fp), sig, band);
        rcu_read_unlock();
}
EXPORT_SYMBOL(kill_fasync);

/*
 * Init-time setup, registered through module_init(): check the number of
 * open-flag bits at build time and create the cache that fasync_alloc()
 * and fasync_free() use.  Always returns 0.
 */
static int __init fcntl_init(void)
{
        /*
         * Please add new bits here to ensure allocation uniqueness.
         * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
         * is defined as O_NONBLOCK on some platforms and not on others.
         */
        BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
                HWEIGHT32(
                        (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
                        __FMODE_EXEC));

        /*
         * The result is not checked here.
         * NOTE(review): presumably SLAB_PANIC turns a failed creation into
         * a panic, which would make the check unnecessary - confirm.
         */
        fasync_cache = kmem_cache_create("fasync_cache",
                                         sizeof(struct fasync_struct), 0,
                                         SLAB_PANIC | SLAB_ACCOUNT, NULL);
        return 0;
}

module_init(fcntl_init)






































   23 



    3 






















   26 




   24 
    2 

   26 

   26 























































































    8 








    8 









    8 




    8 




    8 
















    8 






    8 













































    8 


































    8 








    8 



































    8 


    8 
    8 
    8 








    8 



















    8 








    8 










































    8 






    8 

    8 



    8 


























    8 



    8 



    8 


















    8 





    8 















    8 



    8 







    8 


    8 














    8 




    8 




    8 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/attr.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  changes by Thomas Schoebel-Theuer
 */

#include <linux/export.h>
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/sched/signal.h>
#include <linux/capability.h>
#include <linux/fsnotify.h>
#include <linux/fcntl.h>
#include <linux/filelock.h>
#include <linux/security.h>

/**
 * setattr_should_drop_sgid - decide whether the setgid bit has to go
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 *
 * Nothing needs to be dropped when S_ISGID is not set.  For backwards
 * compatibility the bit is dropped unconditionally when S_IXGRP is set as
 * well.  Otherwise the same requirements as in setattr_prepare() and
 * setattr_copy() apply: the bit is dropped unless in_group_or_capable()
 * accepts the caller for the inode's (idmapped) group.
 *
 * Return: ATTR_KILL_SGID if setgid bit needs to be removed, 0 otherwise.
 */
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
                             const struct inode *inode)
{
        const umode_t mode = inode->i_mode;

        if ((mode & S_ISGID) == 0)
                return 0;

        if ((mode & S_IXGRP) ||
            !in_group_or_capable(idmap, inode, i_gid_into_vfsgid(idmap, inode)))
                return ATTR_KILL_SGID;

        return 0;
}
EXPORT_SYMBOL(setattr_should_drop_sgid);

/**
 * setattr_should_drop_suidgid - determine whether the set{g,u}id bit needs to
 *                               be dropped
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 *
 * This function determines whether the set{g,u}id bits need to be removed.
 * If the setuid bit needs to be removed ATTR_KILL_SUID is returned. If the
 * setgid bit needs to be removed ATTR_KILL_SGID is returned. If both
 * set{g,u}id bits need to be removed the corresponding mask of both flags is
 * returned.
 *
 * Return: A mask of ATTR_KILL_S{G,U}ID indicating which - if any - setid bits
 * to remove, 0 otherwise.
 */
int setattr_should_drop_suidgid(struct mnt_idmap *idmap,
                                struct inode *inode)
{
        const umode_t i_mode = inode->i_mode;
        int mask = 0;

        /* A set setuid bit is always a candidate for removal. */
        if (unlikely(i_mode & S_ISUID))
                mask |= ATTR_KILL_SUID;

        /* The setgid decision is delegated to setattr_should_drop_sgid(). */
        mask |= setattr_should_drop_sgid(idmap, inode);

        /* Neither bit needs to go: done. */
        if (!unlikely(mask))
                return 0;

        /* Callers holding CAP_FSETID keep their set{u,g}id bits. */
        if (capable(CAP_FSETID))
                return 0;

        /* The mask is only reported for regular files. */
        if (!S_ISREG(i_mode))
                return 0;

        return mask;
}
EXPORT_SYMBOL(setattr_should_drop_suidgid);

/**
 * chown_ok - verify permissions to chown inode
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check permissions on
 * @ia_vfsuid:        uid to chown @inode to
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
static bool chown_ok(struct mnt_idmap *idmap,
                     const struct inode *inode, vfsuid_t ia_vfsuid)
{
        vfsuid_t owner = i_uid_into_vfsuid(idmap, inode);

        /*
         * The owner (by fsuid) may "chown" the inode to the uid it already
         * has, i.e. a no-op change.
         */
        if (vfsuid_eq_kuid(owner, current_fsuid())) {
                if (vfsuid_eq(ia_vfsuid, owner))
                        return true;
        }

        /* CAP_CHOWN with respect to this inode allows any change. */
        if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN))
                return true;

        /*
         * Last resort: when the current owner has no valid mapping,
         * CAP_CHOWN in the superblock's user namespace is accepted.
         */
        if (vfsuid_valid(owner))
                return false;

        return ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN);
}

/**
 * chgrp_ok - verify permissions to chgrp inode
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check permissions on
 * @ia_vfsgid:        gid to chown @inode to
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
static bool chgrp_ok(struct mnt_idmap *idmap,
                     const struct inode *inode, vfsgid_t ia_vfsgid)
{
        vfsgid_t cur_gid = i_gid_into_vfsgid(idmap, inode);
        vfsuid_t cur_uid = i_uid_into_vfsuid(idmap, inode);

        /*
         * The owner (by fsuid) may keep the current group or switch to any
         * group vfsgid_in_group_p() reports the caller to be in.
         */
        if (vfsuid_eq_kuid(cur_uid, current_fsuid()) &&
            (vfsgid_eq(ia_vfsgid, cur_gid) || vfsgid_in_group_p(ia_vfsgid)))
                return true;

        /* CAP_CHOWN with respect to this inode allows any change. */
        if (capable_wrt_inode_uidgid(idmap, inode, CAP_CHOWN))
                return true;

        /*
         * Last resort: when the current group has no valid mapping,
         * CAP_CHOWN in the superblock's user namespace is accepted.
         */
        if (vfsgid_valid(cur_gid))
                return false;

        return ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN);
}

/**
 * setattr_prepare - check if attribute changes to a dentry are allowed
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        dentry to check
 * @attr:        attributes to change
 *
 * Check if we are allowed to change the attributes contained in @attr
 * in the given dentry.  This includes the normal unix access permission
 * checks, as well as checks for rlimits and others. The function also clears
 * SGID bit from mode if user is not allowed to set it. Also file capabilities
 * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Should be called as the first thing in ->setattr implementations,
 * possibly after taking additional locks.
 */
int setattr_prepare(struct mnt_idmap *idmap, struct dentry *dentry,
                    struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        unsigned int ia_valid = attr->ia_valid;

        /*
         * First check size constraints.  These can't be overridden using
         * ATTR_FORCE.
         */
        if (ia_valid & ATTR_SIZE) {
                int error = inode_newsize_ok(inode, attr->ia_size);
                if (error)
                        return error;
        }

        /*
         * If force is set do it anyway: all ownership, mode and timestamp
         * permission checks below are skipped, only the kill_priv step runs.
         */
        if (ia_valid & ATTR_FORCE)
                goto kill_priv;

        /* Make sure a caller can chown. */
        if ((ia_valid & ATTR_UID) &&
            !chown_ok(idmap, inode, attr->ia_vfsuid))
                return -EPERM;

        /* Make sure caller can chgrp. */
        if ((ia_valid & ATTR_GID) &&
            !chgrp_ok(idmap, inode, attr->ia_vfsgid))
                return -EPERM;

        /* Make sure a caller can chmod. */
        if (ia_valid & ATTR_MODE) {
                vfsgid_t vfsgid;

                if (!inode_owner_or_capable(idmap, inode))
                        return -EPERM;

                /*
                 * The setgid check is made against the group the inode will
                 * have after this request: the new gid if ATTR_GID is part of
                 * it, otherwise the inode's current (idmapped) gid.
                 */
                if (ia_valid & ATTR_GID)
                        vfsgid = attr->ia_vfsgid;
                else
                        vfsgid = i_gid_into_vfsgid(idmap, inode);

                /*
                 * Also check the setgid bit!  On failure the bit is silently
                 * stripped from the requested mode; the request itself is
                 * not rejected.
                 */
                if (!in_group_or_capable(idmap, inode, vfsgid))
                        attr->ia_mode &= ~S_ISGID;
        }

        /* Check for setting the inode time. */
        if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) {
                if (!inode_owner_or_capable(idmap, inode))
                        return -EPERM;
        }

        /* Reached after passing the checks above, or directly via ATTR_FORCE. */
kill_priv:
        /* User has permission for the change */
        if (ia_valid & ATTR_KILL_PRIV) {
                int error;

                error = security_inode_killpriv(idmap, dentry);
                if (error)
                        return error;
        }

        return 0;
}
EXPORT_SYMBOL(setattr_prepare);

/**
 * inode_newsize_ok - may this inode be truncated to a given size
 * @inode:        the inode to be truncated
 * @offset:        the new size to assign to the inode
 *
 * inode_newsize_ok must be called with i_mutex held.
 *
 * inode_newsize_ok will check filesystem limits and ulimits to check that the
 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
 * when necessary. Caller must not proceed with inode size change if failure is
 * returned. @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
 *
 * Return: 0 on success, -ve errno on failure
 */
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
        unsigned long limit;

        /* A negative size is never valid. */
        if (offset < 0)
                return -EINVAL;

        if (inode->i_size >= offset) {
                /*
                 * Not growing.  Truncation of in-use swapfiles is
                 * disallowed - it would cause subsequent swapout to
                 * scribble on the now-freed blocks.
                 */
                return IS_SWAPFILE(inode) ? -ETXTBSY : 0;
        }

        /* Growing: the RLIMIT_FSIZE resource limit is checked first ... */
        limit = rlimit(RLIMIT_FSIZE);
        if (limit != RLIM_INFINITY && offset > limit) {
                /* ... and exceeding it additionally raises SIGXFSZ. */
                send_sig(SIGXFSZ, current, 0);
                return -EFBIG;
        }

        /* The filesystem's maximum file size fails without a signal. */
        if (offset > inode->i_sb->s_maxbytes)
                return -EFBIG;

        return 0;
}
EXPORT_SYMBOL(inode_newsize_ok);

/**
 * setattr_copy_mgtime - update timestamps for mgtime inodes
 * @inode: inode timestamps to be updated
 * @attr: attrs for the update
 *
 * With multigrain timestamps, take more care to prevent races when
 * updating the ctime. Always update the ctime to the very latest using
 * the standard mechanism, and use that to populate the atime and mtime
 * appropriately (unless those are being set to specific values).
 */
static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
{
        unsigned int ia_valid = attr->ia_valid;
        struct timespec64 now;

        if (ia_valid & ATTR_CTIME) {
                /*
                 * In the case of an update for a write delegation, we must respect
                 * the value in ia_ctime and not use the current time.
                 */
                if (ia_valid & ATTR_DELEG)
                        now = inode_set_ctime_deleg(inode, attr->ia_ctime);
                else
                        now = inode_set_ctime_current(inode);
        } else {
                /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */
                WARN_ON_ONCE(ia_valid & ATTR_MTIME);
                now = current_time(inode);
        }

        if (ia_valid & ATTR_ATIME_SET)
                inode_set_atime_to_ts(inode, attr->ia_atime);
        else if (ia_valid & ATTR_ATIME)
                inode_set_atime_to_ts(inode, now);

        if (ia_valid & ATTR_MTIME_SET)
                inode_set_mtime_to_ts(inode, attr->ia_mtime);
        else if (ia_valid & ATTR_MTIME)
                inode_set_mtime_to_ts(inode, now);
}

/**
 * setattr_copy - copy simple metadata updates into the generic inode
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        the inode to be updated
 * @attr:        the new attributes
 *
 * setattr_copy must be called with i_mutex held.
 *
 * setattr_copy updates the inode's metadata with that specified
 * in attr on idmapped mounts. Necessary permission checks to determine
 * whether or not the S_ISGID property needs to be removed are performed with
 * the correct idmapped mount permission helpers.
 * Noticeably missing is inode size update, which is more complex
 * as it requires pagecache updates.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * The inode is not marked as dirty after this operation. The rationale is
 * that for "simple" filesystems, the struct inode is the inode storage.
 * The caller is free to mark the inode dirty afterwards if needed.
 */
void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
                  const struct iattr *attr)
{
        const unsigned int valid = attr->ia_valid;

        /*
         * Ownership goes first; the setgid check below runs afterwards and
         * therefore presumably sees the group as updated here.
         */
        i_uid_update(idmap, attr, inode);
        i_gid_update(idmap, attr, inode);

        if (valid & ATTR_MODE) {
                umode_t new_mode = attr->ia_mode;

                /* Strip setgid unless in_group_or_capable() allows it. */
                if (!in_group_or_capable(idmap, inode,
                                         i_gid_into_vfsgid(idmap, inode)))
                        new_mode &= ~S_ISGID;

                inode->i_mode = new_mode;
        }

        /* Multigrain-timestamp inodes have their own timestamp handling. */
        if (is_mgtime(inode)) {
                setattr_copy_mgtime(inode, attr);
                return;
        }

        if (valid & ATTR_ATIME)
                inode_set_atime_to_ts(inode, attr->ia_atime);

        if (valid & ATTR_MTIME)
                inode_set_mtime_to_ts(inode, attr->ia_mtime);

        if (!(valid & ATTR_CTIME))
                return;

        /* ATTR_DELEG selects the delegation-aware ctime setter. */
        if (valid & ATTR_DELEG)
                inode_set_ctime_deleg(inode, attr->ia_ctime);
        else
                inode_set_ctime_to_ts(inode, attr->ia_ctime);
}
EXPORT_SYMBOL(setattr_copy);

/*
 * Early permission filter for an attribute change described by @ia_valid.
 *
 * Returns -EPERM when the inode's immutable/append-only state forbids the
 * change, the error from inode_permission() when a "touch" is requested by
 * a caller who is neither owner nor capable and lacks write permission,
 * and 0 otherwise.
 */
int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                unsigned int ia_valid)
{
        const unsigned int guarded = ATTR_MODE | ATTR_UID | ATTR_GID |
                                     ATTR_TIMES_SET;

        /* Mode, ownership and explicit time changes need a mutable inode. */
        if ((ia_valid & guarded) &&
            (IS_IMMUTABLE(inode) || IS_APPEND(inode)))
                return -EPERM;

        /*
         * If utimes(2) and friends are called with times == NULL (or both
         * times are UTIME_NOW), then we need to check for write permission
         */
        if (!(ia_valid & ATTR_TOUCH))
                return 0;

        if (IS_IMMUTABLE(inode))
                return -EPERM;

        /* Owner or capable callers do not need write permission. */
        if (inode_owner_or_capable(idmap, inode))
                return 0;

        return inode_permission(idmap, inode, MAY_WRITE);
}
EXPORT_SYMBOL(may_setattr);

/**
 * notify_change - modify attributes of a filesystem object
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        object affected
 * @attr:        new attributes
 * @delegated_inode: returns inode, if the inode is delegated
 *
 * The caller must hold the i_mutex on the affected object.
 *
 * If notify_change discovers a delegation in need of breaking,
 * it will return -EWOULDBLOCK and return a reference to the inode in
 * delegated_inode.  The caller should then break the delegation and
 * retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.  Also, passing NULL is fine for callers holding
 * the file open for write, as there can be no conflicting delegation in
 * that case.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
                  struct iattr *attr, struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        /* Snapshot of the current mode, used for the ATTR_KILL_S*ID tests below. */
        umode_t mode = inode->i_mode;
        int error;
        struct timespec64 now;
        /* Local copy of attr->ia_valid; updated together with it below. */
        unsigned int ia_valid = attr->ia_valid;

        /* Warn (once) if the inode is not locked at this point. */
        WARN_ON_ONCE(!inode_is_locked(inode));

        /* Immutable/append-only and "touch" permission checks come first. */
        error = may_setattr(idmap, inode, ia_valid);
        if (error)
                return error;

        if ((ia_valid & ATTR_MODE)) {
                /*
                 * Don't allow changing the mode of symlinks:
                 *
                 * (1) The vfs doesn't take the mode of symlinks into account
                 *     during permission checking.
                 * (2) This has never worked correctly. Most major filesystems
                 *     did return EOPNOTSUPP due to interactions with POSIX ACLs
                 *     but still updated the mode of the symlink.
                 *     This inconsistency led system call wrapper providers such
                 *     as libc to block changing the mode of symlinks with
                 *     EOPNOTSUPP already.
                 * (3) To even do this in the first place one would have to use
                 *     specific file descriptors and quite some effort.
                 */
                if (S_ISLNK(inode->i_mode))
                        return -EOPNOTSUPP;

                /* Flag setting protected by i_mutex */
                if (is_sxid(attr->ia_mode))
                        inode->i_flags &= ~S_NOSEC;
        }

        now = current_time(inode);

        /*
         * ctime is always set to "now".  atime/mtime default to "now" as
         * well; explicitly supplied values (*_SET) are kept but passed
         * through timestamp_truncate() for this inode.
         */
        attr->ia_ctime = now;
        if (!(ia_valid & ATTR_ATIME_SET))
                attr->ia_atime = now;
        else
                attr->ia_atime = timestamp_truncate(attr->ia_atime, inode);
        if (!(ia_valid & ATTR_MTIME_SET))
                attr->ia_mtime = now;
        else
                attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode);

        /*
         * security_inode_need_killpriv(): a negative value is an error, 0
         * clears ATTR_KILL_PRIV from the request, a positive value leaves
         * the flag set.
         */
        if (ia_valid & ATTR_KILL_PRIV) {
                error = security_inode_need_killpriv(dentry);
                if (error < 0)
                        return error;
                if (error == 0)
                        ia_valid = attr->ia_valid &= ~ATTR_KILL_PRIV;
        }

        /*
         * We now pass ATTR_KILL_S*ID to the lower level setattr function so
         * that the function has the ability to reinterpret a mode change
         * that's due to these bits. This adds an implicit restriction that
         * no function will ever call notify_change with both ATTR_MODE and
         * ATTR_KILL_S*ID set.
         */
        if ((ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) &&
            (ia_valid & ATTR_MODE))
                BUG();

        /*
         * Translate the kill requests into an ATTR_MODE change, but only
         * when the corresponding bit is actually set in the mode snapshot.
         */
        if (ia_valid & ATTR_KILL_SUID) {
                if (mode & S_ISUID) {
                        ia_valid = attr->ia_valid |= ATTR_MODE;
                        attr->ia_mode = (inode->i_mode & ~S_ISUID);
                }
        }
        if (ia_valid & ATTR_KILL_SGID) {
                if (mode & S_ISGID) {
                        /*
                         * ATTR_MODE may already have been added by the
                         * setuid case above; then ia_mode is reused so both
                         * bits end up cleared.
                         */
                        if (!(ia_valid & ATTR_MODE)) {
                                ia_valid = attr->ia_valid |= ATTR_MODE;
                                attr->ia_mode = inode->i_mode;
                        }
                        attr->ia_mode &= ~S_ISGID;
                }
        }
        /* Nothing left besides (now moot) kill flags: nothing to do. */
        if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
                return 0;

        /*
         * Verify that uid/gid changes are valid in the target
         * namespace of the superblock.
         */
        if (ia_valid & ATTR_UID &&
            !vfsuid_has_fsmapping(idmap, inode->i_sb->s_user_ns,
                                  attr->ia_vfsuid))
                return -EOVERFLOW;
        if (ia_valid & ATTR_GID &&
            !vfsgid_has_fsmapping(idmap, inode->i_sb->s_user_ns,
                                  attr->ia_vfsgid))
                return -EOVERFLOW;

        /* Don't allow modifications of files with invalid uids or
         * gids unless those uids & gids are being made valid.
         */
        if (!(ia_valid & ATTR_UID) &&
            !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return -EOVERFLOW;
        if (!(ia_valid & ATTR_GID) &&
            !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        error = security_inode_setattr(idmap, dentry, attr);
        if (error)
                return error;

        /*
         * If ATTR_DELEG is set, then these attributes are being set on
         * behalf of the holder of a write delegation. We want to avoid
         * breaking the delegation in this case.
         */
        if (!(ia_valid & ATTR_DELEG)) {
                error = try_break_deleg(inode, delegated_inode);
                if (error)
                        return error;
        }

        /* Use the filesystem's ->setattr if present, else simple_setattr(). */
        if (inode->i_op->setattr)
                error = inode->i_op->setattr(idmap, dentry, attr);
        else
                error = simple_setattr(idmap, dentry, attr);

        /* Notifications are sent only when the change succeeded. */
        if (!error) {
                fsnotify_change(dentry, ia_valid);
                security_inode_post_setattr(idmap, dentry, ia_valid);
        }

        return error;
}
EXPORT_SYMBOL(notify_change);


































































































































































































































  131 
  132 
























  130 




  131 
  131 



  130 



  131 
    8 

    1 
  131 

























































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/migrate.h>
#include <linux/vmalloc.h>
#include <linux/huge_mm.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "swap.h"

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_folio_list.
 */
/*
 * Operations table for swap address spaces.
 * NOTE(review): presumably attached to the swapper_spaces entries further
 * down in this file - the assignment is not visible in this section.
 */
static const struct address_space_operations swap_aops = {
        .writepage        = swap_writepage,
        .dirty_folio        = noop_dirty_folio,
        /* The migrate_folio hook only exists when CONFIG_MIGRATION is set. */
#ifdef CONFIG_MIGRATION
        .migrate_folio        = migrate_folio,
#endif
};

/*
 * Per swap type (index < MAX_SWAPFILES) bookkeeping.
 * NOTE(review): nr_swapper_spaces presumably holds the number of address
 * spaces behind each swapper_spaces[] entry - confirm against the users
 * further down in the file.
 */
struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly;
static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly;
/* Consulted via READ_ONCE() in swap_use_vma_readahead(). */
static bool enable_vma_readahead __read_mostly = true;

/* NOTE(review): not used in this section; presumably caps a readahead order. */
#define SWAP_RA_ORDER_CEILING        5

/*
 * Layout of a packed readahead value, as encoded by SWAP_RA_VAL():
 *
 *   bits selected by PAGE_MASK                  : address
 *   bits of ~PAGE_MASK above SWAP_RA_WIN_SHIFT  : window ("win")
 *   lowest SWAP_RA_WIN_SHIFT bits               : hit count ("hits")
 *
 * SWAP_RA_WIN_SHIFT is half of PAGE_SHIFT, so the bits not covered by
 * PAGE_MASK are split evenly between the window and hits fields.
 */
#define SWAP_RA_WIN_SHIFT        (PAGE_SHIFT / 2)
#define SWAP_RA_HITS_MASK        ((1UL << SWAP_RA_WIN_SHIFT) - 1)
#define SWAP_RA_HITS_MAX        SWAP_RA_HITS_MASK
#define SWAP_RA_WIN_MASK        (~PAGE_MASK & ~SWAP_RA_HITS_MASK)

/* Field extractors for a packed readahead value. */
#define SWAP_RA_HITS(v)                ((v) & SWAP_RA_HITS_MASK)
#define SWAP_RA_WIN(v)                (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT)
#define SWAP_RA_ADDR(v)                ((v) & PAGE_MASK)

/* Pack address, window and hits; each field is masked to its own bits. */
#define SWAP_RA_VAL(addr, win, hits)                                \
        (((addr) & PAGE_MASK) |                                        \
         (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) |        \
         ((hits) & SWAP_RA_HITS_MASK))

/*
 * Initial readahead hits is 4 to start up with a small window.
 * A stored value of 0 in the vma's swap_readahead_info yields 4 (GNU "?:").
 */
#define GET_SWAP_RA_VAL(vma)                                        \
        (atomic_long_read(&(vma)->swap_readahead_info) ? : 4)

/* Global hit counter, also starting at 4; its users are outside this section. */
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);

/*
 * Print a three-line swap summary with printk(): the number of pages in the
 * swap cache, the free swap and the total swap.  The latter two are passed
 * through K() and labelled "kB" in the output.
 */
void show_swap_cache_info(void)
{
        printk("%lu pages in swap cache\n", total_swapcache_pages());
        /* Free swap is printed as a signed value (%ld), total swap as unsigned. */
        printk("Free swap  = %ldkB\n", K(get_nr_swap_pages()));
        printk("Total swap = %lukB\n", K(total_swap_pages));
}

/*
 * Look up the swap cache slot for @entry and return its content only if it
 * is an xarray value entry (a shadow); any other content yields NULL.
 */
void *get_shadow_from_swap_cache(swp_entry_t entry)
{
        struct address_space *mapping = swap_address_space(entry);
        void *slot = xa_load(&mapping->i_pages, swap_cache_index(entry));

        return xa_is_value(slot) ? slot : NULL;
}

/*
 * add_to_swap_cache resembles filemap_add_folio on swapper_space,
 * but sets SwapCache flag and 'swap' instead of mapping and index.
 */
int add_to_swap_cache(struct folio *folio, swp_entry_t entry,
                        gfp_t gfp, void **shadowp)
{
        struct address_space *address_space = swap_address_space(entry);
        pgoff_t idx = swap_cache_index(entry);
        XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio));
        unsigned long i, nr = folio_nr_pages(folio);
        void *old;

        xas_set_update(&xas, workingset_update_node);

        /* The folio must be locked, swap-backed and not yet in the swap cache. */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

        /*
         * Take one reference per page and mark the folio as swap cache up
         * front; both are undone at the bottom if the insertion fails.
         */
        folio_ref_add(folio, nr);
        folio_set_swapcache(folio);
        folio->swap = entry;

        /*
         * Insert under the xarray lock and repeat for as long as xas_nomem()
         * asks for it.
         * NOTE(review): xas_nomem() presumably allocates with @gfp outside
         * the lock after an out-of-memory error - confirm in the XArray docs.
         */
        do {
                xas_lock_irq(&xas);
                xas_create_range(&xas);
                if (xas_error(&xas))
                        goto unlock;
                for (i = 0; i < nr; i++) {
                        VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio);
                        /*
                         * Report a value (shadow) entry being replaced.  If
                         * several slots hold one, the last wins; if none
                         * does, *shadowp is left untouched.
                         */
                        if (shadowp) {
                                old = xas_load(&xas);
                                if (xa_is_value(old))
                                        *shadowp = old;
                        }
                        xas_store(&xas, folio);
                        xas_next(&xas);
                }
                /* Accounting is only updated when all slots were stored. */
                address_space->nrpages += nr;
                __node_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr);
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));

        if (!xas_error(&xas))
                return 0;

        /* Failure: roll back the flag and the references taken above. */
        folio_clear_swapcache(folio);
        folio_ref_sub(folio, nr);
        return xas_error(&xas);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct folio *folio,
                        swp_entry_t entry, void *shadow)
{
        struct address_space *mapping = swap_address_space(entry);
        pgoff_t first = swap_cache_index(entry);
        long nr = folio_nr_pages(folio);
        long i;
        XA_STATE(xas, &mapping->i_pages, first);

        xas_set_update(&xas, workingset_update_node);

        /* The folio must be locked, in the swap cache and not under writeback. */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);

        /*
         * Replace each of the folio's slots with @shadow; every slot is
         * expected to have held the folio itself.
         */
        for (i = 0; i < nr; i++) {
                void *prev = xas_store(&xas, shadow);

                VM_BUG_ON_PAGE(prev != folio, prev);
                xas_next(&xas);
        }

        /* Detach the folio from swap and undo the insertion accounting. */
        folio->swap.val = 0;
        folio_clear_swapcache(folio);
        mapping->nrpages -= nr;
        __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
        __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}

/*
 * This must be called only on folios that have
 * been verified to be in the swap cache and locked.
 * It will never put the folio into the free list,
 * the caller has a reference on the folio.
 */
void delete_from_swap_cache(struct folio *folio)
{
        swp_entry_t swap = folio->swap;
        struct address_space *mapping = swap_address_space(swap);

        /* Remove the folio from the xarray under the lock, leaving no shadow. */
        xa_lock_irq(&mapping->i_pages);
        __delete_from_swap_cache(folio, swap, NULL);
        xa_unlock_irq(&mapping->i_pages);

        /*
         * Outside the lock: hand the entry to put_swap_folio() and drop
         * one folio reference per page.
         */
        put_swap_folio(folio, swap);
        folio_ref_sub(folio, folio_nr_pages(folio));
}

/*
 * Erase value (shadow) entries for swap type @type, starting at offset
 * @begin and working towards @end, one swap address space at a time.
 * Slots that hold something other than a value entry are left alone.
 */
void clear_shadow_from_swap_cache(int type, unsigned long begin,
                                unsigned long end)
{
        unsigned long pos = begin;

        do {
                swp_entry_t entry = swp_entry(type, pos);
                unsigned long index = pos & SWAP_ADDRESS_SPACE_MASK;
                struct address_space *address_space = swap_address_space(entry);
                XA_STATE(xas, &address_space->i_pages, index);
                void *slot;

                xas_set_update(&xas, workingset_update_node);

                xa_lock_irq(&address_space->i_pages);
                xas_for_each(&xas, slot, min(index + (end - pos), SWAP_ADDRESS_SPACE_PAGES)) {
                        if (xa_is_value(slot))
                                xas_store(&xas, NULL);
                }
                xa_unlock_irq(&address_space->i_pages);

                /* search the next swapcache until we meet end */
                pos = ALIGN((pos + 1), SWAP_ADDRESS_SPACE_PAGES);
        } while (pos <= end);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's ok to check the swapcache flag without the folio lock
 * here because we are going to recheck again inside
 * folio_free_swap() _with_ the lock.
 *                                         - Marcelo
 */
void free_swap_cache(struct folio *folio)
{
        /* Only unmapped swap cache folios are candidates. */
        if (!folio_test_swapcache(folio))
                return;
        if (folio_mapped(folio))
                return;

        /* Best effort: give up if the folio lock cannot be taken right now. */
        if (!folio_trylock(folio))
                return;

        folio_free_swap(folio);
        folio_unlock(folio);
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
        struct folio *target = page_folio(page);

        free_swap_cache(target);

        /* No reference is dropped on the huge zero folio. */
        if (is_huge_zero_folio(target))
                return;

        folio_put(target);
}

/*
 * Given an array of @nr encoded pages, try to drop each one from the swap
 * cache and then release the references in batches.
 *
 * An entry flagged with ENCODED_PAGE_BIT_NR_PAGES_NEXT is followed by a
 * second array slot carrying the number of references to drop for that
 * folio; every other entry drops exactly one reference.
 */
void free_pages_and_swap_cache(struct encoded_page **pages, int nr)
{
        struct folio_batch folios;
        unsigned int refs[PAGEVEC_SIZE];
        int i = 0;

        folio_batch_init(&folios);
        while (i < nr) {
                struct encoded_page *enc = pages[i];
                struct folio *folio = page_folio(encoded_page_ptr(enc));
                unsigned int nr_refs = 1;

                free_swap_cache(folio);
                /* The reference count lives in the following slot. */
                if (unlikely(encoded_page_flags(enc) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                        nr_refs = encoded_nr_pages(pages[++i]);
                refs[folios.nr] = nr_refs;

                /* Flush the batch as soon as it has no room left. */
                if (!folio_batch_add(&folios, folio))
                        folios_put_refs(&folios, refs);
                i++;
        }
        if (folios.nr)
                folios_put_refs(&folios, refs);
}

static inline bool swap_use_vma_readahead(void)
{
        return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap);
}

/*
 * Look up @entry in the swap cache.
 *
 * Returns the folio with its refcount raised and unlocked, or NULL when
 * the entry is not cached.  For a small folio this also consumes the
 * folio's readahead flag and feeds the readahead statistics: the per-VMA
 * hit counter when @vma is given and VMA readahead is in use, the global
 * swapin_readahead_hits counter otherwise.
 *
 * Caller must lock the swap device or hold a reference to keep it valid.
 */
struct folio *swap_cache_get_folio(swp_entry_t entry,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct folio *folio;
        bool vma_ra, readahead;

        folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
        if (IS_ERR(folio))
                return NULL;

        vma_ra = swap_use_vma_readahead();

        /*
         * PG_readahead is not supported for anon THP at the moment, so
         * leave the readahead accounting alone for large folios.
         */
        if (unlikely(folio_test_large(folio)))
                return folio;

        readahead = folio_test_clear_readahead(folio);
        if (vma && vma_ra) {
                unsigned long ra_val = GET_SWAP_RA_VAL(vma);
                int win = SWAP_RA_WIN(ra_val);
                int hits = SWAP_RA_HITS(ra_val);

                if (readahead)
                        hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX);
                atomic_long_set(&vma->swap_readahead_info,
                                SWAP_RA_VAL(addr, win, hits));
        }

        if (!readahead)
                return folio;

        count_vm_event(SWAP_RA_HIT);
        if (!vma || !vma_ra)
                atomic_inc(&swapin_readahead_hits);

        return folio;
}

/**
 * filemap_get_incore_folio - Find and get a folio from the page or swap caches.
 * @mapping: The address_space to search.
 * @index: The page cache index.
 *
 * Unlike filemap_get_folio(), a value entry found in a shmem mapping is
 * treated as a swap entry and looked up in the swap cache.
 *
 * Return: The folio found in @mapping, or the result of the swap cache
 * lookup.  ERR_PTR(-ENOENT) is returned when there is no entry, when a
 * value entry is found in a non-shmem mapping, when the entry is not a
 * real swap entry, or when the swap device cannot be pinned.
 */
struct folio *filemap_get_incore_folio(struct address_space *mapping,
                pgoff_t index)
{
        struct folio *folio = filemap_get_entry(mapping, index);
        struct swap_info_struct *si;
        swp_entry_t swp;

        if (!folio)
                goto not_found;
        if (!xa_is_value(folio))
                return folio;
        if (!shmem_mapping(mapping))
                goto not_found;

        swp = radix_to_swp_entry(folio);
        /* shmem mappings may hold swapin error entries. */
        if (non_swap_entry(swp))
                goto not_found;

        /* Pin the device so that swapoff cannot race with the lookup. */
        si = get_swap_device(swp);
        if (!si)
                goto not_found;
        folio = filemap_get_folio(swap_address_space(swp), swap_cache_index(swp));
        put_swap_device(si);
        return folio;

not_found:
        return ERR_PTR(-ENOENT);
}

/*
 * __read_swap_cache_async - get the swap cache folio for @entry, allocating
 * and inserting a new one when it is not cached yet.
 * @entry: swap entry to look up
 * @gfp_mask: flags for the folio allocation and the memcg charge; masked
 *            with GFP_RECLAIM_MASK for the swap cache insertion
 * @mpol: NUMA policy handed to folio_alloc_mpol()
 * @ilx: interleave index handed to folio_alloc_mpol()
 * @new_page_allocated: set to true only when a new folio was allocated and
 *            added to the swap cache; false on every other return path
 * @skip_if_exists: return NULL instead of waiting and retrying when
 *            swapcache_prepare() reports -EEXIST
 *
 * Returns either the folio already present in the swap cache, or a new
 * folio that is locked, marked swapbacked, inserted into the swap cache and
 * put on the LRU; in the latter case the caller is expected to start the
 * read.  Returns NULL when the swap slot is unused, the folio allocation
 * fails, swapcache_prepare() fails with anything but -EEXIST,
 * @skip_if_exists cuts the retry short, or charging / inserting the new
 * folio fails.
 */
struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated,
                bool skip_if_exists)
{
        struct swap_info_struct *si = swp_swap_info(entry);
        struct folio *folio;
        struct folio *new_folio = NULL;
        struct folio *result = NULL;
        void *shadow = NULL;

        *new_page_allocated = false;
        for (;;) {
                int err;
                /*
                 * First check the swap cache.  Since this is normally
                 * called after swap_cache_get_folio() failed, re-calling
                 * that would confuse statistics.
                 */
                folio = filemap_get_folio(swap_address_space(entry),
                                          swap_cache_index(entry));
                if (!IS_ERR(folio))
                        goto got_folio;

                /*
                 * Just skip read ahead for unused swap slot.
                 */
                if (!swap_entry_swapped(si, entry))
                        goto put_and_return;

                /*
                 * Get a new folio to read into from swap.  Allocate it now if
                 * new_folio not exist, before marking swap_map SWAP_HAS_CACHE,
                 * when -EEXIST will cause any racers to loop around until we
                 * add it to cache.
                 */
                if (!new_folio) {
                        new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
                        if (!new_folio)
                                goto put_and_return;
                }

                /*
                 * Swap entry may have been freed since our caller observed it.
                 */
                err = swapcache_prepare(entry, 1);
                if (!err)
                        break;
                else if (err != -EEXIST)
                        goto put_and_return;

                /*
                 * Protect against a recursive call to __read_swap_cache_async()
                 * on the same entry waiting forever here because SWAP_HAS_CACHE
                 * is set but the folio is not the swap cache yet. This can
                 * happen today if mem_cgroup_swapin_charge_folio() below
                 * triggers reclaim through zswap, which may call
                 * __read_swap_cache_async() in the writeback path.
                 */
                if (skip_if_exists)
                        goto put_and_return;

                /*
                 * We might race against __delete_from_swap_cache(), and
                 * stumble across a swap_map entry whose SWAP_HAS_CACHE
                 * has not yet been cleared.  Or race against another
                 * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
                 * in swap_map, but not yet added its folio to swap cache.
                 */
                schedule_timeout_uninterruptible(1);
        }

        /*
         * The swap entry is ours to swap in. Prepare the new folio.
         */
        __folio_set_locked(new_folio);
        __folio_set_swapbacked(new_folio);

        if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
                goto fail_unlock;

        /* May fail (-ENOMEM) if XArray node allocation failed. */
        if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
                goto fail_unlock;

        memcg1_swapin(entry, 1);

        /* A shadow entry was found in the slot: report the refault. */
        if (shadow)
                workingset_refault(new_folio, shadow);

        /* Caller will initiate read into locked new_folio */
        folio_add_lru(new_folio);
        *new_page_allocated = true;
        folio = new_folio;
got_folio:
        result = folio;
        goto put_and_return;

fail_unlock:
        /* Undo swapcache_prepare() and release the lock taken above. */
        put_swap_folio(new_folio, entry);
        folio_unlock(new_folio);
put_and_return:
        /* Drop a folio that was allocated but never handed to the caller. */
        if (!(*new_page_allocated) && new_folio)
                folio_put(new_folio);
        return result;
}

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and starting the read from swap if it is not already cached.
 *
 * Returns NULL when the swap device cannot be pinned.  Otherwise returns
 * what __read_swap_cache_async() returns, where NULL means that either the
 * page allocation failed or the swap entry is no longer in use.
 *
 * The swap device is pinned with get_swap_device() for the duration of the
 * call and released again before returning.
 */
struct folio *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                struct vm_area_struct *vma, unsigned long addr,
                struct swap_iocb **plug)
{
        struct swap_info_struct *si = get_swap_device(entry);
        struct mempolicy *mpol;
        struct folio *folio;
        bool page_allocated;
        pgoff_t ilx;

        if (!si)
                return NULL;

        mpol = get_vma_policy(vma, addr, 0, &ilx);
        folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                        &page_allocated, false);
        mpol_cond_put(mpol);

        /* A freshly inserted folio still has to be filled from swap. */
        if (page_allocated)
                swap_read_folio(folio, plug);

        put_swap_device(si);
        return folio;
}

/*
 * Compute the readahead window size, in pages.
 *
 * @prev_offset: offset recorded by the caller for the previous request
 * @offset:      offset of the current request
 * @hits:        readahead hits seen since the previous request
 * @max_pages:   upper bound for the window
 * @prev_win:    window size used last time
 *
 * The window starts at @hits + 2 (a heuristic that has been found to work
 * well on both sequential and random loads; the "+ 2" has no deeper
 * meaning).  With no hits it collapses to a single page unless @offset is
 * adjacent to @prev_offset; with hits it is rounded up to a power of two
 * that is at least 4.  The result is capped at @max_pages and then kept at
 * or above half of @prev_win so that the window does not shrink too fast.
 */
static unsigned int __swapin_nr_pages(unsigned long prev_offset,
                                      unsigned long offset,
                                      int hits,
                                      int max_pages,
                                      int prev_win)
{
        unsigned int win = hits + 2;
        unsigned int min_win = prev_win / 2;

        if (win != 2) {
                unsigned int pow2;

                /* Smallest power of two that is >= win, starting at 4. */
                for (pow2 = 4; pow2 < win; pow2 <<= 1)
                        ;
                win = pow2;
        } else if (offset != prev_offset + 1 && offset != prev_offset - 1) {
                /*
                 * No hits to judge by: only keep reading ahead when the
                 * offsets are adjacent (swap type is not compared).
                 */
                win = 1;
        }

        if (win > max_pages)
                win = max_pages;

        /* Don't shrink readahead too fast */
        return win < min_win ? min_win : win;
}

/*
 * Window size for cluster based readahead around @offset.
 *
 * Consumes (resets to zero) the global swapin_readahead_hits counter and
 * remembers the resulting window in function-local static state.  The
 * previous offset is only updated when there were no hits.  Returns 1 when
 * page_cluster allows no more than a single page.
 */
static unsigned long swapin_nr_pages(unsigned long offset)
{
        static unsigned long prev_offset;
        static atomic_t last_readahead_pages;
        unsigned int max_pages = 1 << READ_ONCE(page_cluster);
        unsigned int hits, pages;

        if (max_pages <= 1)
                return 1;

        hits = atomic_xchg(&swapin_readahead_hits, 0);
        pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
                                  max_pages,
                                  atomic_read(&last_readahead_pages));
        if (hits == 0)
                WRITE_ONCE(prev_offset, offset);
        atomic_set(&last_readahead_pages, pages);

        return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 *
 * Returns the struct folio for @entry, after queueing swapin.
 *
 * Primitive swap readahead code. An aligned block of swap offsets around
 * @entry is read, its size chosen by swapin_nr_pages(). This method is
 * chosen because it doesn't cost us any seek time. The 'original' request
 * is queued together with the readahead ones.
 *
 * Note: it is intentional that the same NUMA policy and interleave index
 * are used for every page of the readahead: neighbouring pages on swap
 * are fairly likely to have been swapped out from the same node.
 */
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                    struct mempolicy *mpol, pgoff_t ilx)
{
        const unsigned long target = swp_offset(entry);
        struct swap_info_struct *si = swp_swap_info(entry);
        struct swap_iocb *splug = NULL;
        unsigned long first, last, cur;
        unsigned long mask;
        struct blk_plug plug;
        struct folio *folio;
        bool page_allocated;

        mask = swapin_nr_pages(target) - 1;
        if (mask == 0)
                goto skip;

        /* Aligned window around the target offset. */
        first = target & ~mask;
        last = target | mask;
        if (first == 0)         /* First page is swap header. */
                first = 1;
        if (last >= si->max)
                last = si->max - 1;

        blk_start_plug(&plug);
        for (cur = first; cur <= last; cur++) {
                swp_entry_t ra_entry = swp_entry(swp_type(entry), cur);

                /* Ok, do the async read-ahead now */
                folio = __read_swap_cache_async(ra_entry, gfp_mask, mpol, ilx,
                                                &page_allocated, false);
                if (!folio)
                        continue;
                if (page_allocated) {
                        swap_read_folio(folio, &splug);
                        /* Only the neighbours count as readahead. */
                        if (cur != target) {
                                folio_set_readahead(folio);
                                count_vm_event(SWAP_RA);
                        }
                }
                folio_put(folio);
        }
        blk_finish_plug(&plug);
        swap_read_unplug(splug);
        lru_add_drain();        /* Push any new pages onto the LRU now */
skip:
        /* The page was likely read above, so no need for plugging here */
        folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                        &page_allocated, false);
        if (unlikely(page_allocated))
                swap_read_folio(folio, NULL);
        return folio;
}

/*
 * Allocate and initialise the address_space array backing the swap cache
 * of swap device @type, one address_space per SWAP_ADDRESS_SPACE_PAGES
 * pages (rounded up).
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
        unsigned int nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
        struct address_space *spaces;
        unsigned int idx;

        spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
        if (!spaces)
                return -ENOMEM;

        for (idx = 0; idx < nr; idx++) {
                struct address_space *space = &spaces[idx];

                xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
                atomic_set(&space->i_mmap_writable, 0);
                space->a_ops = &swap_aops;
                /* swap cache doesn't use writeback related tags */
                mapping_set_no_writeback_tags(space);
        }

        /* Publish the array for this swap type. */
        nr_swapper_spaces[type] = nr;
        swapper_spaces[type] = spaces;

        return 0;
}

/*
 * Free the swap cache address_space array of swap device @type, warning
 * (once) if any of the mappings still holds entries.
 */
void exit_swap_address_space(unsigned int type)
{
        struct address_space *spaces = swapper_spaces[type];
        int i = 0;

        while (i < nr_swapper_spaces[type]) {
                VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
                i++;
        }

        kvfree(spaces);
        nr_swapper_spaces[type] = 0;
        swapper_spaces[type] = NULL;
}

/*
 * Work out the VMA readahead window for the fault described by @vmf.
 *
 * Returns the window size in pages.  When it is 1, @start and @end are not
 * written.  Otherwise they receive the virtual address range to scan,
 * clamped to the VMA and to the PMD-sized region containing the fault.
 * The new window is stored in vma->swap_readahead_info with the hit count
 * reset to zero.
 */
static int swap_vma_ra_win(struct vm_fault *vmf, unsigned long *start,
                           unsigned long *end)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long faddr, prev_faddr, ra_val;
        unsigned long lo, hi;
        unsigned int max_win, hits, prev_win, win;

        max_win = 1 << min(READ_ONCE(page_cluster), SWAP_RA_ORDER_CEILING);
        if (max_win == 1)
                return 1;

        faddr = vmf->address;
        ra_val = GET_SWAP_RA_VAL(vma);
        prev_faddr = SWAP_RA_ADDR(ra_val);
        prev_win = SWAP_RA_WIN(ra_val);
        hits = SWAP_RA_HITS(ra_val);
        win = __swapin_nr_pages(PFN_DOWN(prev_faddr), PFN_DOWN(faddr), hits,
                                max_win, prev_win);
        atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(faddr, win, 0));
        if (win == 1)
                return 1;

        if (faddr == prev_faddr + PAGE_SIZE) {
                /* Faulting one page above the previous one: look ahead. */
                lo = faddr;
        } else if (prev_faddr == faddr + PAGE_SIZE) {
                /* Faulting one page below: window ends at the fault. */
                lo = faddr - (win << PAGE_SHIFT) + PAGE_SIZE;
        } else {
                /* No direction: roughly centre the window on the fault. */
                lo = faddr - (((win - 1) / 2) << PAGE_SHIFT);
        }
        hi = lo + (win << PAGE_SHIFT);
        if ((long)lo < 0)
                lo = 0;
        *start = max3(lo, vma->vm_start, faddr & PMD_MASK);
        *end = min3(hi, vma->vm_end, (faddr & PMD_MASK) + PMD_SIZE);

        return win;
}

/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @targ_entry: swap entry of the targeted memory
 * @gfp_mask: memory allocation flags
 * @mpol: NUMA memory allocation policy to be applied
 * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
 * @vmf: fault information
 *
 * Returns the struct folio for @targ_entry, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma. The
 * range is chosen by swap_vma_ra_win(); each readahead page uses an
 * interleave index offset from @targ_ilx by its page distance from the
 * fault address.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
                struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf)
{
        struct blk_plug plug;
        struct swap_iocb *splug = NULL;
        struct folio *folio;
        pte_t *pte = NULL, pentry;
        int win;
        unsigned long start, end, addr;
        swp_entry_t entry;
        pgoff_t ilx;
        bool page_allocated;

        win = swap_vma_ra_win(vmf, &start, &end);
        if (win == 1)
                goto skip;

        /* Interleave index corresponding to the first address scanned. */
        ilx = targ_ilx - PFN_DOWN(vmf->address - start);

        blk_start_plug(&plug);
        for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
                /*
                 * pte is NULL on the first pass and after every unmap
                 * below, in which case the PTE for addr is mapped afresh.
                 * Otherwise the post-increment steps to the next PTE.
                 */
                if (!pte++) {
                        pte = pte_offset_map(vmf->pmd, addr);
                        if (!pte)
                                break;
                }
                pentry = ptep_get_lockless(pte);
                if (!is_swap_pte(pentry))
                        continue;
                entry = pte_to_swp_entry(pentry);
                if (unlikely(non_swap_entry(entry)))
                        continue;
                /*
                 * Drop the PTE mapping before the swap cache call.
                 * NOTE(review): presumably because that call may sleep -
                 * confirm.
                 */
                pte_unmap(pte);
                pte = NULL;
                folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
                                                &page_allocated, false);
                if (!folio)
                        continue;
                if (page_allocated) {
                        swap_read_folio(folio, &splug);
                        /* The faulting page itself is not readahead. */
                        if (addr != vmf->address) {
                                folio_set_readahead(folio);
                                count_vm_event(SWAP_RA);
                        }
                }
                folio_put(folio);
        }
        /* Still mapped if the last iterations skipped their PTEs. */
        if (pte)
                pte_unmap(pte);
        blk_finish_plug(&plug);
        swap_read_unplug(splug);
        lru_add_drain();
skip:
        /* The folio was likely read above, so no need for plugging here */
        folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
                                        &page_allocated, false);
        if (unlikely(page_allocated))
                swap_read_folio(folio, NULL);
        return folio;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct folio for @entry, after queueing swapin.
 *
 * Main entry point for swap readahead. Depending on
 * swap_use_vma_readahead(), the readahead is either vma based (virtual
 * addresses around the faulting address) or cluster based (neighbouring
 * offsets on the swap device).
 */
struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                                struct vm_fault *vmf)
{
        struct folio *folio;
        pgoff_t ilx;
        struct mempolicy *mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);

        if (swap_use_vma_readahead())
                folio = swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf);
        else
                folio = swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
        mpol_cond_put(mpol);

        return folio;
}

#ifdef CONFIG_SYSFS
/* sysfs read handler: report enable_vma_readahead as "true" or "false". */
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
                                     struct kobj_attribute *attr, char *buf)
{
        const char *state = str_true_false(enable_vma_readahead);

        return sysfs_emit(buf, "%s\n", state);
}
/*
 * sysfs write handler: parse a boolean from @buf into
 * enable_vma_readahead.  Returns @count on success or the kstrtobool()
 * error code.
 */
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
                                      struct kobj_attribute *attr,
                                      const char *buf, size_t count)
{
        int err = kstrtobool(buf, &enable_vma_readahead);

        return err ? (ssize_t)err : (ssize_t)count;
}
/* Read/write "vma_ra_enabled" attribute (see the _show/_store handlers). */
static struct kobj_attribute vma_ra_enabled_attr = __ATTR_RW(vma_ra_enabled);

/* NULL-terminated list of attributes exposed under the swap kobject. */
static struct attribute *swap_attrs[] = {
        &vma_ra_enabled_attr.attr,
        NULL,
};

/* Attribute group registered by swap_init_sysfs(). */
static const struct attribute_group swap_attr_group = {
        .attrs = swap_attrs,
};

/*
 * Create the "swap" kobject below mm_kobj and register the swap attribute
 * group on it.  Returns 0 on success, -ENOMEM if the kobject cannot be
 * created, or the sysfs_create_group() error (after dropping the kobject).
 */
static int __init swap_init_sysfs(void)
{
        struct kobject *swap_kobj = kobject_create_and_add("swap", mm_kobj);
        int err;

        if (!swap_kobj) {
                pr_err("failed to create swap kobject\n");
                return -ENOMEM;
        }

        err = sysfs_create_group(swap_kobj, &swap_attr_group);
        if (!err)
                return 0;

        pr_err("failed to register swap group\n");
        kobject_put(swap_kobj);
        return err;
}
subsys_initcall(swap_init_sysfs);
#endif





















































































































































































































































































   22 































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/kernel/setup.c
 *
 * Copyright (C) 1995-2001 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/stddef.h>
#include <linux/ioport.h>
#include <linux/delay.h>
#include <linux/initrd.h>
#include <linux/console.h>
#include <linux/cache.h>
#include <linux/screen_info.h>
#include <linux/init.h>
#include <linux/kexec.h>
#include <linux/root_dev.h>
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <linux/panic_notifier.h>
#include <linux/proc_fs.h>
#include <linux/memblock.h>
#include <linux/of_fdt.h>
#include <linux/efi.h>
#include <linux/psci.h>
#include <linux/sched/task.h>
#include <linux/scs.h>
#include <linux/mm.h>

#include <asm/acpi.h>
#include <asm/fixmap.h>
#include <asm/cpu.h>
#include <asm/cputype.h>
#include <asm/daifflags.h>
#include <asm/elf.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/kasan.h>
#include <asm/numa.h>
#include <asm/rsi.h>
#include <asm/scs.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <asm/smp_plat.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
#include <asm/efi.h>
#include <asm/xen/hypervisor.h>
#include <asm/mmu_context.h>

/*
 * Resource array built by request_standard_resources(), one entry per
 * memblock memory region, and the number of entries in it.
 */
static int num_standard_resources;
static struct resource *standard_resources;

/* Physical address of the device tree blob, consumed by setup_arch(). */
phys_addr_t __fdt_pointer __initdata;
/* NOTE(review): not referenced in this part of the file - set elsewhere. */
u64 mmu_enabled_at_boot __initdata;

/*
 * Standard memory resources
 *
 * The start/end fields are filled in by request_standard_resources().
 */
static struct resource mem_res[] = {
        {
                .name = "Kernel code",
                .start = 0,
                .end = 0,
                .flags = IORESOURCE_SYSTEM_RAM
        },
        {
                .name = "Kernel data",
                .start = 0,
                .end = 0,
                .flags = IORESOURCE_SYSTEM_RAM
        }
};

/* Convenience names for the two mem_res[] entries. */
#define kernel_code mem_res[0]
#define kernel_data mem_res[1]

/*
 * The recorded values of x0 .. x3 upon kernel entry.
 */
u64 __cacheline_aligned boot_args[4];

/*
 * Record the hardware id of the boot CPU as logical CPU 0 and announce it
 * together with the CPU id register value.
 */
void __init smp_setup_processor_id(void)
{
        u64 hwid = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;

        set_cpu_logical_map(0, hwid);
        pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n",
                (unsigned long)hwid, read_cpuid_id());
}

/* True when @phys_id is the hardware id recorded for logical CPU @cpu. */
bool arch_match_cpu_phys_id(int cpu, u64 phys_id)
{
        const u64 hwid = cpu_logical_map(cpu);

        return hwid == phys_id;
}

/* Shifts, mask and bit count filled in by smp_build_mpidr_hash(). */
struct mpidr_hash mpidr_hash;
/**
 * smp_build_mpidr_hash - Pre-compute shifts required at each affinity
 *                          level in order to build a linear index from an
 *                          MPIDR value. Resulting algorithm is a collision
 *                          free hash carried out through shifting and ORing
 *
 * Fills in the global mpidr_hash and warns when the resulting table would
 * have more than four times as many buckets as there are possible CPUs.
 */
static void __init smp_build_mpidr_hash(void)
{
        /*
         * Per affinity level: fs[] is the position of the lowest toggling
         * bit, bits[] the width from that bit up to the highest one.
         */
        u32 i, affinity, fs[4], bits[4], ls;
        u64 mask = 0;
        /*
         * Pre-scan the list of MPIDRS and filter out bits that do
         * not contribute to affinity levels, ie they never toggle.
         */
        for_each_possible_cpu(i)
                mask |= (cpu_logical_map(i) ^ cpu_logical_map(0));
        pr_debug("mask of set bits %#llx\n", mask);
        /*
         * Find and stash the last and first bit set at all affinity levels to
         * check how many bits are required to represent them.
         */
        for (i = 0; i < 4; i++) {
                affinity = MPIDR_AFFINITY_LEVEL(mask, i);
                /*
                 * Find the MSB bit and LSB bits position
                 * to determine how many bits are required
                 * to express the affinity level.
                 */
                ls = fls(affinity);
                /* A level that never toggles contributes zero bits. */
                fs[i] = affinity ? ffs(affinity) - 1 : 0;
                bits[i] = ls - fs[i];
        }
        /*
         * An index can be created from the MPIDR_EL1 by isolating the
         * significant bits at each affinity level and by shifting
         * them in order to compress the 32 bits values space to a
         * compressed set of values. This is equivalent to hashing
         * the MPIDR_EL1 through shifting and ORing. It is a collision free
         * hash though not minimal since some levels might contain a number
         * of CPUs that is not an exact power of 2 and their bit
         * representation might contain holes, eg MPIDR_EL1[7:0] = {0x2, 0x80}.
         */
        /* Each level is shifted down past the bits used by lower levels. */
        mpidr_hash.shift_aff[0] = MPIDR_LEVEL_SHIFT(0) + fs[0];
        mpidr_hash.shift_aff[1] = MPIDR_LEVEL_SHIFT(1) + fs[1] - bits[0];
        mpidr_hash.shift_aff[2] = MPIDR_LEVEL_SHIFT(2) + fs[2] -
                                                (bits[1] + bits[0]);
        mpidr_hash.shift_aff[3] = MPIDR_LEVEL_SHIFT(3) +
                                  fs[3] - (bits[2] + bits[1] + bits[0]);
        mpidr_hash.mask = mask;
        mpidr_hash.bits = bits[3] + bits[2] + bits[1] + bits[0];
        pr_debug("MPIDR hash: aff0[%u] aff1[%u] aff2[%u] aff3[%u] mask[%#llx] bits[%u]\n",
                mpidr_hash.shift_aff[0],
                mpidr_hash.shift_aff[1],
                mpidr_hash.shift_aff[2],
                mpidr_hash.shift_aff[3],
                mpidr_hash.mask,
                mpidr_hash.bits);
        /*
         * 4x is an arbitrary value used to warn on a hash table much bigger
         * than expected on most systems.
         */
        if (mpidr_hash_size() > 4 * num_possible_cpus())
                pr_warn("Large number of MPIDR hash buckets detected\n");
}

/*
 * Map the device tree blob at @dt_phys, reserve its memory, run the early
 * DT scan and report the machine model.  An invalid blob is fatal: the
 * error is printed and the CPU spins forever.
 */
static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
        const char *name;
        void *dt_virt;
        int size;

        dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL);
        if (dt_virt)
                memblock_reserve(dt_phys, size);

        /*
         * dt_virt is a fixmap address, so __pa(dt_virt) cannot be used;
         * the physical address is passed along explicitly.
         */
        if (!early_init_dt_scan(dt_virt, dt_phys)) {
                pr_crit("\n"
                        "Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n"
                        "The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
                        "\nPlease check your bootloader.",
                        &dt_phys, dt_virt);

                /*
                 * At this _really_ early stage we cannot even BUG() or
                 * oops, so the least terrible option is to spin in
                 * cpu_relax() rather than risk printing non-initialized
                 * data, etc.
                 */
                for (;;)
                        cpu_relax();
        }

        /* Early fixups are done, map the FDT as read-only now */
        fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);

        name = of_flat_dt_get_machine_name();
        if (name) {
                pr_info("Machine model: %s\n", name);
                dump_stack_set_arch_desc("%s (DT)", name);
        }
}

/*
 * Register the kernel code/data resources and one resource per memblock
 * memory region in the iomem resource tree.  Regions marked nomap are
 * registered as "reserved", all others as busy "System RAM".
 */
static void __init request_standard_resources(void)
{
        struct memblock_region *region;
        unsigned long idx = 0;
        size_t res_size;

        kernel_code.start   = __pa_symbol(_stext);
        kernel_code.end     = __pa_symbol(__init_begin - 1);
        kernel_data.start   = __pa_symbol(_sdata);
        kernel_data.end     = __pa_symbol(_end - 1);
        insert_resource(&iomem_resource, &kernel_code);
        insert_resource(&iomem_resource, &kernel_data);

        /* One resource slot per memblock memory region. */
        num_standard_resources = memblock.memory.cnt;
        res_size = num_standard_resources * sizeof(*standard_resources);
        standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES);

        for_each_mem_region(region) {
                struct resource *res = &standard_resources[idx++];

                if (!memblock_is_nomap(region)) {
                        res->name  = "System RAM";
                        res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
                        res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
                        res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
                } else {
                        res->name  = "reserved";
                        res->flags = IORESOURCE_MEM;
                        res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region));
                        res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1;
                }

                insert_resource(&iomem_resource, res);
        }
}

static int __init reserve_memblock_reserved_regions(void)
{
        u64 i, j;

        for (i = 0; i < num_standard_resources; ++i) {
                struct resource *mem = &standard_resources[i];
                phys_addr_t r_start, r_end, mem_size = resource_size(mem);

                if (!memblock_is_region_reserved(mem->start, mem_size))
                        continue;

                for_each_reserved_mem_range(j, &r_start, &r_end) {
                        resource_size_t start, end;

                        start = max(PFN_PHYS(PFN_DOWN(r_start)), mem->start);
                        end = min(PFN_PHYS(PFN_UP(r_end)) - 1, mem->end);

                        if (start > mem->end || end < mem->start)
                                continue;

                        reserve_region_with_split(mem, start, end, "reserved");
                }
        }

        return 0;
}
arch_initcall(reserve_memblock_reserved_regions);

/*
 * Table with one u64 entry per possible CPU (NR_CPUS entries), read via
 * cpu_logical_map().  Every entry is statically initialised to
 * INVALID_HWID; the code that fills in real values is not in this part
 * of the file.
 */
u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID };

/*
 * Return the __cpu_logical_map[] entry for logical CPU @cpu.
 *
 * No bounds check is performed here, so callers must pass
 * @cpu < NR_CPUS.  An entry that was never populated reads back as
 * INVALID_HWID (the array's static initialiser).
 */
u64 cpu_logical_map(unsigned int cpu)
{
        return __cpu_logical_map[cpu];
}

/*
 * setup_arch() - arm64 architecture-specific boot-time setup.
 * @cmdline_p: output; set to point at boot_command_line.
 *
 * Runs the early bring-up steps one after another: initial mm layout,
 * KASLR, fixmap/early ioremap, device tree handling, early parameters,
 * exception unmasking, EFI/Xen, memblock and paging, ACPI/DT discovery,
 * resource registration, PSCI and SMP enumeration.
 *
 * The calls are order-sensitive (several of the comments below spell out
 * why); do not reorder them without checking what each step relies on.
 */
void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
        setup_initial_init_mm(_stext, _etext, _edata, _end);

        /* Hand the boot command line back to the caller. */
        *cmdline_p = boot_command_line;

        kaslr_init();

        /*
         * setup_machine_fdt() remaps the FDT through the fixmap, so the
         * fixmap has to be up before it is called.
         */
        early_fixmap_init();
        early_ioremap_init();

        setup_machine_fdt(__fdt_pointer);

        /*
         * Initialise the static keys early as they may be enabled by the
         * cpufeature code and early parameters.
         */
        jump_label_init();
        parse_early_param();

        dynamic_scs_init();

        /*
         * The primary CPU enters the kernel with all DAIF exceptions masked.
         *
         * We must unmask Debug and SError before preemption or scheduling is
         * possible to ensure that these are consistently unmasked across
         * threads, and we want to unmask SError as soon as possible after
         * initializing earlycon so that we can report any SErrors immediately.
         *
         * IRQ and FIQ will be unmasked after the root irqchip has been
         * detected and initialized.
         */
        local_daif_restore(DAIF_PROCCTX_NOIRQ);

        /*
         * TTBR0 is only used for the identity mapping at this stage. Make it
         * point to zero page to avoid speculatively fetching new entries.
         */
        cpu_uninstall_idmap();

        xen_early_init();
        efi_init();

        /*
         * Boot-protocol sanity checks that only apply to non-EFI boots.
         * Both messages are newline-terminated so that each forms a
         * complete log line of its own.
         */
        if (!efi_enabled(EFI_BOOT)) {
                if ((u64)_text % MIN_KIMG_ALIGN)
                        pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!\n");
                WARN_TAINT(mmu_enabled_at_boot, TAINT_FIRMWARE_WORKAROUND,
                           FW_BUG "Booted with MMU enabled!\n");
        }

        arm64_memblock_init();

        paging_init();

        acpi_table_upgrade();

        /* Parse the ACPI tables for possible boot-time configuration */
        acpi_boot_table_init();

        /* The device tree is only unflattened when ACPI is not in use. */
        if (acpi_disabled)
                unflatten_device_tree();

        bootmem_init();

        kasan_init();

        request_standard_resources();

        early_ioremap_reset();

        /* Pick the PSCI init path matching the firmware description in use. */
        if (acpi_disabled)
                psci_dt_init();
        else
                psci_acpi_init();

        arm64_rsi_init();

        init_bootcpu_ops();
        smp_init_cpus();
        smp_build_mpidr_hash();

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
        /*
         * Make sure init_thread_info.ttbr0 always generates translation
         * faults in case uaccess_enable() is inadvertently called by the init
         * thread.
         */
        init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
#endif

        /* Complain if any of boot_args[1..3] (x1-x3 at entry) is non-zero. */
        if (boot_args[1] || boot_args[2] || boot_args[3]) {
                pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
                        "\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
                        "This indicates a broken bootloader or old kernel\n",
                        boot_args[1], boot_args[2], boot_args[3]);
        }
}

/*
 * Report whether @cpu may be taken offline.
 *
 * With CONFIG_HOTPLUG_CPU the decision is delegated to the CPU's
 * cpu_operations ->cpu_can_disable() hook; a CPU without operations or
 * without that hook cannot be disabled.  Without CONFIG_HOTPLUG_CPU the
 * answer is always false.
 */
static inline bool cpu_can_disable(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        const struct cpu_operations *cpu_ops = get_cpu_ops(cpu);

        if (!cpu_ops || !cpu_ops->cpu_can_disable)
                return false;

        return cpu_ops->cpu_can_disable(cpu);
#else
        return false;
#endif
}

/*
 * Return true if CPU @num can be hot-unplugged.
 *
 * Simply forwards to cpu_can_disable(); @num is converted to the
 * unsigned int that helper takes.
 */
bool arch_cpu_is_hotpluggable(int num)
{
        return cpu_can_disable(num);
}

/*
 * Log the KASLR displacement of the kernel image at emergency level.
 *
 * When CONFIG_RANDOMIZE_BASE is enabled and the offset is non-zero, the
 * offset is printed alongside KIMAGE_VADDR, followed by PHYS_OFFSET.
 * Otherwise a single "disabled" line is emitted.
 */
static void dump_kernel_offset(void)
{
        const unsigned long kaslr_off = kaslr_offset();

        if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE) || kaslr_off == 0) {
                pr_emerg("Kernel Offset: disabled\n");
                return;
        }

        pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n", kaslr_off, KIMAGE_VADDR);
        pr_emerg("PHYS_OFFSET: 0x%llx\n", PHYS_OFFSET);
}

/*
 * Notifier callback installed via arm64_panic_block.
 *
 * Dumps, in this order, the kernel offset, the CPU features and the
 * memory limit.  None of the notifier arguments (@self, @v, @p) is used.
 * Always returns 0.
 */
static int arm64_panic_block_dump(struct notifier_block *self,
                                  unsigned long v, void *p)
{
        dump_kernel_offset();
        dump_cpu_features();
        dump_mem_limit();
        return 0;
}

/*
 * Notifier block whose callback is arm64_panic_block_dump(); it is hooked
 * onto panic_notifier_list by register_arm64_panic_block().
 */
static struct notifier_block arm64_panic_block = {
        .notifier_call = arm64_panic_block_dump
};

/*
 * Initcall: add arm64_panic_block to panic_notifier_list so that
 * arm64_panic_block_dump() is invoked through that notifier chain.
 *
 * The return value of atomic_notifier_chain_register() is not checked;
 * this function always returns 0.
 */
static int __init register_arm64_panic_block(void)
{
        atomic_notifier_chain_register(&panic_notifier_list,
                                       &arm64_panic_block);
        return 0;
}
device_initcall(register_arm64_panic_block);

/*
 * Initcall: panic if this was a non-EFI boot and mmu_enabled_at_boot is
 * set.
 *
 * Returns 0 when the check passes (EFI boot, or the flag is clear).
 */
static int __init check_mmu_enabled_at_boot(void)
{
        /* EFI boots are exempt; so is any boot where the flag is clear. */
        if (efi_enabled(EFI_BOOT) || !mmu_enabled_at_boot)
                return 0;

        panic("Non-EFI boot detected with MMU and caches enabled");
        return 0;
}
device_initcall_sync(check_mmu_enabled_at_boot);
















































































   34 













   34 




   34 



   34 




















   34 





















































   37 
   37 
    3 
   34 






























































































































































































































   34 











   34 
   34 


   34 
   34 


















































   34 




































































































































































































































   34 


   34 

   34 
















   34 

   34 









   34 








































































































































   34 



















   34 










































   34 
























   34 



   34 








   34 





























   34 





























   34 










   34 





   34 





   34 


























   34 






   34 























   34 





















   34 



   34 





















   34 






















   34 









   34 




































































































   34 









   34 










   34 




   34 




























































































































































































































































































































































































   34 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   34 




















   34 




   34 
   34 
    6 












   34 



























































   34 































   34 
   34 





   34 

















   34 








































































































   34 






















   34 


   34 


   34 

   34 




   34 



   34 









































































































    3 























   38 






   35 



    3 












    4 




    2 
    4 









    4 



























































































































































































































   34 




























   34 

   34 

   34 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   34 

   34 
   34 
   34 





















































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/signal.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-11-02  Modified for POSIX.1b signals by Richard Henderson
 *
 *  2003-06-02  Jim Houston - Concurrent Computer Corp.
 *                Changes to use preallocated sigqueue structures
 *                to allow signals to be sent reliably.
 */

#include <linux/slab.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/proc_fs.h>
#include <linux/tty.h>
#include <linux/binfmts.h>
#include <linux/coredump.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
#include <linux/signalfd.h>
#include <linux/ratelimit.h>
#include <linux/task_work.h>
#include <linux/capability.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
#include <linux/compat.h>
#include <linux/cn_proc.h>
#include <linux/compiler.h>
#include <linux/posix-timers.h>
#include <linux/cgroup.h>
#include <linux/audit.h>
#include <linux/sysctl.h>
#include <uapi/linux/pidfd.h>

#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>

#include <asm/param.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
#include <asm/cacheflush.h>
#include <asm/syscall.h>        /* for syscall_get_* */

#include "time/posix-timers.h"

/*
 * SLAB caches for signal bits.
 */

/* Cache that sigqueue_alloc() allocates from and __sigqueue_free() returns to. */
static struct kmem_cache *sigqueue_cachep;

/*
 * While this is zero print_dropped_signal() stays silent.  Other users
 * may exist outside the part of the file visible here.
 */
int print_fatal_signals __read_mostly;

/*
 * Fetch the handler currently installed for @sig in @t's sighand table.
 * The action table is indexed by signal number minus one.
 */
static void __user *sig_handler(struct task_struct *t, int sig)
{
        struct k_sigaction *ka = &t->sighand->action[sig - 1];

        return ka->sa.sa_handler;
}

/*
 * Tell whether @handler means "ignore @sig", either explicitly or by
 * default disposition.
 */
static inline bool sig_handler_ignored(void __user *handler, int sig)
{
        /* Explicitly ignored. */
        if (handler == SIG_IGN)
                return true;

        /* Implicitly ignored: default action on a signal that defaults to ignore. */
        return handler == SIG_DFL && sig_kernel_ignore(sig);
}

/*
 * Decide whether @t would ignore @sig given its installed handler and
 * its special status (global init, SIGNAL_UNKILLABLE, kernel thread).
 * @force marks a signal that must get past the unkillable/kthread filters.
 */
static bool sig_task_ignored(struct task_struct *t, int sig, bool force)
{
        void __user *handler = sig_handler(t, sig);

        /* SIGKILL and SIGSTOP may not be sent to the global init */
        if (unlikely(is_global_init(t))) {
                if (sig_kernel_only(sig))
                        return true;
        }

        /*
         * An unkillable task with the default handler only lets a forced
         * kernel-only signal through.
         */
        if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && handler == SIG_DFL) {
                if (!force || !sig_kernel_only(sig))
                        return true;
        }

        /* Only allow kernel generated signals to this kthread */
        if (unlikely((t->flags & PF_KTHREAD) && !force &&
                     (handler == SIG_KTHREAD_KERNEL)))
                return true;

        return sig_handler_ignored(handler, sig);
}

/*
 * Decide whether @sig may be treated as ignored by @t.  @force is handed
 * through to sig_task_ignored().
 */
static bool sig_ignored(struct task_struct *t, int sig, bool force)
{
        /*
         * A blocked signal is never considered ignored: the handler
         * may be different by the time the signal gets unblocked.
         */
        if (sigismember(&t->blocked, sig))
                return false;
        if (sigismember(&t->real_blocked, sig))
                return false;

        /*
         * A tracer may want to see even ignored signals.  SIGKILL is the
         * exception: it can't be reported anyway, but it can be ignored
         * by a SIGNAL_UNKILLABLE task.
         */
        if (sig != SIGKILL && t->ptrace)
                return false;

        return sig_task_ignored(t, sig, force);
}

/*
 * Report whether at least one signal set in @signal is not masked by
 * @blocked.  Used to re-calculate the pending state from the locally
 * pending, globally pending and blocked sets.
 *
 * The word counts 1, 2 and 4 are spelled out as straight-line code;
 * any other _NSIG_WORDS value takes the generic loop.
 */
static inline bool has_pending_signals(sigset_t *signal, sigset_t *blocked)
{
        unsigned long ready = 0;
        long i;

        switch (_NSIG_WORDS) {
        case 1:
                ready = signal->sig[0] & ~blocked->sig[0];
                break;

        case 2:
                ready  = signal->sig[0] & ~blocked->sig[0];
                ready |= signal->sig[1] & ~blocked->sig[1];
                break;

        case 4:
                ready  = signal->sig[0] & ~blocked->sig[0];
                ready |= signal->sig[1] & ~blocked->sig[1];
                ready |= signal->sig[2] & ~blocked->sig[2];
                ready |= signal->sig[3] & ~blocked->sig[3];
                break;

        default:
                for (i = 0; i < _NSIG_WORDS; i++)
                        ready |= signal->sig[i] & ~blocked->sig[i];
                break;
        }

        return ready != 0;
}

/* Shorthand: does sigpending @pend hold a signal that @mask does not block? */
#define PENDING(pend, mask) has_pending_signals(&(pend)->signal, (mask))

/*
 * Set TIF_SIGPENDING on @t if it has job control work, an unblocked
 * private or shared signal, or is cgroup-frozen.
 *
 * Returns true if the flag was set, false if nothing called for it.
 */
static bool recalc_sigpending_tsk(struct task_struct *t)
{
        bool need_flag;

        need_flag = (t->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) ||
                    PENDING(&t->pending, &t->blocked) ||
                    PENDING(&t->signal->shared_pending, &t->blocked) ||
                    cgroup_task_frozen(t);

        if (!need_flag) {
                /*
                 * The flag must never be cleared in another thread, nor
                 * in current while the running syscall may be returning
                 * -ERESTART*.  Hence no clearing here; only callers that
                 * know it is safe do that.
                 */
                return false;
        }

        set_tsk_thread_flag(t, TIF_SIGPENDING);
        return true;
}

/*
 * Recompute TIF_SIGPENDING for current.  Unlike recalc_sigpending_tsk()
 * this also clears a stale flag, unless current is freezing.
 */
void recalc_sigpending(void)
{
        /* Something is pending: the flag has just been set, leave it. */
        if (recalc_sigpending_tsk(current))
                return;

        /* Keep the flag as it is while the task is freezing. */
        if (freezing(current))
                return;

        if (unlikely(test_thread_flag(TIF_SIGPENDING)))
                clear_thread_flag(TIF_SIGPENDING);
}
EXPORT_SYMBOL(recalc_sigpending);

void calculate_sigpending(void)
{
        /* Have any signals or users of TIF_SIGPENDING been delayed
         * until after fork?
         */
        spin_lock_irq(&current->sighand->siglock);
        set_tsk_thread_flag(current, TIF_SIGPENDING);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
}

/* Given the mask, find the first available signal that should be serviced. */

/*
 * Signals that next_signal() prefers over everything else pending in
 * the first word of the signal set.
 */
#define SYNCHRONOUS_MASK \
        (sigmask(SIGILL)  | sigmask(SIGTRAP) | sigmask(SIGBUS) | \
         sigmask(SIGFPE)  | sigmask(SIGSEGV) | sigmask(SIGSYS))

/*
 * Find the next signal in @pending that @mask does not block.
 *
 * Returns the signal number, or 0 when nothing deliverable is pending.
 */
int next_signal(struct sigpending *pending, sigset_t *mask)
{
        unsigned long *pend = pending->signal.sig;
        unsigned long *blk = mask->sig;
        unsigned long word, ready;

        /*
         * The first word gets special treatment: it holds the
         * synchronous signals, which have to be dequeued first.
         */
        ready = pend[0] & ~blk[0];
        if (ready) {
                unsigned long sync = ready & SYNCHRONOUS_MASK;

                if (sync)
                        ready = sync;
                return ffz(~ready) + 1;
        }

        switch (_NSIG_WORDS) {
        case 1:
                /* Nothing beyond the first word. */
                return 0;

        case 2:
                ready = pend[1] & ~blk[1];
                if (ready)
                        return ffz(~ready) + _NSIG_BPW + 1;
                return 0;

        default:
                for (word = 1; word < _NSIG_WORDS; word++) {
                        ready = pend[word] & ~blk[word];
                        if (ready)
                                return ffz(~ready) + word * _NSIG_BPW + 1;
                }
                return 0;
        }
}

static inline void print_dropped_signal(int sig)
{
        static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

        if (!print_fatal_signals)
                return;

        if (!__ratelimit(&ratelimit_state))
                return;

        pr_info("%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n",
                                current->comm, current->pid, sig);
}

/**
 * task_set_jobctl_pending - set jobctl pending bits
 * @task: target task
 * @mask: pending bits to set
 *
 * Set @mask in @task->jobctl.  @mask must be subset of
 * %JOBCTL_PENDING_MASK | %JOBCTL_STOP_CONSUME | %JOBCTL_STOP_SIGMASK |
 * %JOBCTL_TRAPPING, and %JOBCTL_TRAPPING may only be requested together
 * with a pending bit.  If stop signo is being set, the existing signo is
 * cleared first.  If @task is already being killed or exiting, this
 * function becomes noop.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if @mask is set, %false if made noop because @task was dying.
 */
bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        const unsigned long allowed = JOBCTL_PENDING_MASK | JOBCTL_STOP_CONSUME |
                                      JOBCTL_STOP_SIGMASK | JOBCTL_TRAPPING;

        BUG_ON(mask & ~allowed);
        BUG_ON((mask & JOBCTL_TRAPPING) && !(mask & JOBCTL_PENDING_MASK));

        /* Dying tasks take no new job control work. */
        if (unlikely(fatal_signal_pending(task)))
                return false;
        if (unlikely(task->flags & PF_EXITING))
                return false;

        /* A new stop signo replaces the old one rather than OR-ing into it. */
        if (mask & JOBCTL_STOP_SIGMASK)
                task->jobctl &= ~JOBCTL_STOP_SIGMASK;

        task->jobctl |= mask;
        return true;
}

/**
 * task_clear_jobctl_trapping - clear jobctl trapping bit
 * @task: target task
 *
 * A set JOBCTL_TRAPPING means a ptracer is waiting for us to enter TRACED.
 * Clear the bit and wake up the ptracer.  No further locking is needed:
 * @task->siglock guarantees that @task->parent points to the ptracer.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_trapping(struct task_struct *task)
{
        if (likely(!(task->jobctl & JOBCTL_TRAPPING)))
                return;

        task->jobctl &= ~JOBCTL_TRAPPING;
        smp_mb();        /* advised by wake_up_bit() */
        wake_up_bit(&task->jobctl, JOBCTL_TRAPPING_BIT);
}

/**
 * task_clear_jobctl_pending - clear jobctl pending bits
 * @task: target task
 * @mask: pending bits to clear
 *
 * Clear @mask from @task->jobctl.  @mask must be subset of
 * %JOBCTL_PENDING_MASK.  Clearing %JOBCTL_STOP_PENDING also clears
 * %JOBCTL_STOP_CONSUME and %JOBCTL_STOP_DEQUEUED.
 *
 * If no stop or trap remains pending afterwards, this function calls
 * task_clear_jobctl_trapping().
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
{
        unsigned long to_clear = mask;

        BUG_ON(mask & ~JOBCTL_PENDING_MASK);

        if (to_clear & JOBCTL_STOP_PENDING)
                to_clear |= JOBCTL_STOP_CONSUME | JOBCTL_STOP_DEQUEUED;

        task->jobctl &= ~to_clear;

        if (task->jobctl & JOBCTL_PENDING_MASK)
                return;

        task_clear_jobctl_trapping(task);
}

/**
 * task_participate_group_stop - participate in a group stop
 * @task: task participating in a group stop
 *
 * @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
 * Its group stop state is cleared, and if %JOBCTL_STOP_CONSUME was set the
 * group stop count is consumed.  When that consumption completes the group
 * stop, %SIGNAL_STOP_STOPPED is set on the signal struct.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 *
 * RETURNS:
 * %true if group stop completion should be notified to the parent, %false
 * otherwise.
 */
static bool task_participate_group_stop(struct task_struct *task)
{
        struct signal_struct *sig = task->signal;
        /* Sample before the clear below, which wipes JOBCTL_STOP_CONSUME. */
        const bool was_consumer = !!(task->jobctl & JOBCTL_STOP_CONSUME);

        WARN_ON_ONCE(!(task->jobctl & JOBCTL_STOP_PENDING));

        task_clear_jobctl_pending(task, JOBCTL_STOP_PENDING);

        if (!was_consumer)
                return false;

        if (!WARN_ON_ONCE(sig->group_stop_count == 0))
                sig->group_stop_count--;

        /*
         * Tell the caller to notify completion iff we are entering into a
         * fresh group stop.  Read comment in do_signal_stop() for details.
         */
        if (sig->group_stop_count || (sig->flags & SIGNAL_STOP_STOPPED))
                return false;

        signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
        return true;
}

/*
 * Make @task join the group stop of current's thread group, if one is
 * in progress or already complete.  The stop signo is copied from
 * current's jobctl.
 */
void task_join_group_stop(struct task_struct *task)
{
        struct signal_struct *sig = current->signal;
        unsigned long mask = JOBCTL_STOP_PENDING |
                             (current->jobctl & JOBCTL_STOP_SIGMASK);

        /* No stop in progress and none completed: nothing to join. */
        if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED))
                return;

        if (sig->group_stop_count) {
                /* The stop is still being counted down; count @task in. */
                sig->group_stop_count++;
                mask |= JOBCTL_STOP_CONSUME;
        }

        /* Have the new thread join an on-going signal group stop */
        task_set_jobctl_pending(task, mask);
}

/*
 * Charge one pending signal against @t's ucounts.
 *
 * Returns the charged ucounts, or NULL if the charge failed or would
 * exceed RLIMIT_SIGPENDING while @override_rlimit is not set.  In the
 * latter case the charge is undone and the drop of @sig is reported.
 */
static struct ucounts *sig_get_ucounts(struct task_struct *t, int sig,
                                       int override_rlimit)
{
        struct ucounts *ucounts;
        long sigpending;

        /*
         * Protect access to @t credentials. This can go away when all
         * callers hold rcu read lock.
         *
         * NOTE! A pending signal will hold on to the user refcount,
         * and we get/put the refcount only when the sigpending count
         * changes from/to zero.
         */
        rcu_read_lock();
        ucounts = task_ucounts(t);
        sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING,
                                            override_rlimit);
        rcu_read_unlock();

        if (!sigpending)
                return NULL;

        if (likely(override_rlimit ||
                   sigpending <= task_rlimit(t, RLIMIT_SIGPENDING)))
                return ucounts;

        /* Over the limit: undo the charge and report the drop. */
        dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
        print_dropped_signal(sig);
        return NULL;
}

/*
 * Initialise a sigqueue entry: record its flags and the ucounts it is
 * charged to, and make its list node an empty list.
 */
static void __sigqueue_init(struct sigqueue *q, struct ucounts *ucounts,
                            const unsigned int sigqueue_flags)
{
        q->ucounts = ucounts;
        q->flags = sigqueue_flags;
        INIT_LIST_HEAD(&q->list);
}

/*
 * Allocate a new signal queue record for @sig aimed at @t.
 * - may be called without locks if and only if t == current; otherwise an
 *   appropriate lock must be held to stop the target task from exiting
 *
 * Returns NULL if the ucounts charge or the allocation fails.
 */
static struct sigqueue *sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
                                       int override_rlimit)
{
        struct ucounts *ucounts;
        struct sigqueue *q;

        ucounts = sig_get_ucounts(t, sig, override_rlimit);
        if (!ucounts)
                return NULL;

        q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
        if (q) {
                __sigqueue_init(q, ucounts, 0);
                return q;
        }

        /* No memory: give the charge back. */
        dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
        return NULL;
}

/*
 * Release a sigqueue entry.  Preallocated entries only drop a reference
 * through posixtimer_sigqueue_putref(); all others give back their
 * ucounts charge and return to the slab cache.
 */
static void __sigqueue_free(struct sigqueue *q)
{
        struct ucounts *ucounts;

        if (q->flags & SIGQUEUE_PREALLOC) {
                posixtimer_sigqueue_putref(q);
                return;
        }

        ucounts = q->ucounts;
        if (ucounts) {
                dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
                q->ucounts = NULL;
        }

        kmem_cache_free(sigqueue_cachep, q);
}

/*
 * Empty @queue: clear its signal bitmap and release every queued entry.
 */
void flush_sigqueue(struct sigpending *queue)
{
        struct sigqueue *entry;

        sigemptyset(&queue->signal);

        /* Always take the current head, one entry at a time. */
        while (!list_empty(&queue->list)) {
                entry = list_first_entry(&queue->list, struct sigqueue, list);
                list_del_init(&entry->list);
                __sigqueue_free(entry);
        }
}

/*
 * Flush all pending signals for this kthread.
 */
void flush_signals(struct task_struct *t)
{
        unsigned long flags;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        clear_tsk_thread_flag(t, TIF_SIGPENDING);
        flush_sigqueue(&t->pending);
        flush_sigqueue(&t->signal->shared_pending);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);
}
EXPORT_SYMBOL(flush_signals);

void ignore_signals(struct task_struct *t)
{
        int i;

        for (i = 0; i < _NSIG; ++i)
                t->sighand->action[i].sa.sa_handler = SIG_IGN;

        flush_signals(t);
}

/*
 * Flush all handlers for a task.
 *
 * Every action gets its flags, mask and (where the architecture has one)
 * restorer wiped.  The handler is reset to SIG_DFL, except that SIG_IGN
 * is left in place unless @force_default is set.
 */
void
flush_signal_handlers(struct task_struct *t, int force_default)
{
        struct k_sigaction *ka = &t->sighand->action[0];
        int i;

        for (i = 0; i < _NSIG; i++, ka++) {
                if (force_default || ka->sa.sa_handler != SIG_IGN)
                        ka->sa.sa_handler = SIG_DFL;
                ka->sa.sa_flags = 0;
#ifdef __ARCH_HAS_SA_RESTORER
                ka->sa.sa_restorer = NULL;
#endif
                sigemptyset(&ka->sa.sa_mask);
        }
}

/*
 * Tell whether @sig would go unhandled in @tsk: no user handler is
 * installed, the task is not already dying, and no tracer is attached.
 * The global init is always reported as not handling the signal.
 */
bool unhandled_signal(struct task_struct *tsk, int sig)
{
        void __user *handler;

        if (is_global_init(tsk))
                return true;

        /* A real user handler is installed. */
        handler = tsk->sighand->action[sig - 1].sa.sa_handler;
        if (handler != SIG_IGN && handler != SIG_DFL)
                return false;

        /* If dying, we handle all new signals by ignoring them */
        if (fatal_signal_pending(tsk))
                return false;

        /* if ptraced, let the tracer determine */
        if (tsk->ptrace)
                return false;

        return true;
}

/*
 * Take one instance of @sig off @list and copy its siginfo to @info.
 *
 * The bit for @sig in the pending bitmap is cleared unless a second
 * queued entry for the same signal remains.  A preallocated SI_TIMER
 * entry is handed back through @timer_sigq instead of being freed.
 */
static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *info,
                           struct sigqueue **timer_sigq)
{
        struct sigqueue *q, *first = NULL;
        bool more_queued = false;

        /*
         * Find the siginfo belonging to this signal and check whether
         * another siginfo for the same signal is queued behind it.
         */
        list_for_each_entry(q, &list->list, list) {
                if (q->info.si_signo != sig)
                        continue;
                if (first) {
                        more_queued = true;
                        break;
                }
                first = q;
        }

        if (!more_queued)
                sigdelset(&list->signal, sig);

        if (!first) {
                /*
                 * Ok, it wasn't in the queue.  This must be
                 * a fast-pathed signal or we must have been
                 * out of queue space.  So zero out the info.
                 */
                clear_siginfo(info);
                info->si_signo = sig;
                info->si_errno = 0;
                info->si_code = SI_USER;
                info->si_pid = 0;
                info->si_uid = 0;
                return;
        }

        list_del_init(&first->list);
        copy_siginfo(info, &first->info);

        /*
         * posix-timer signals are preallocated and freed when the last
         * reference count is dropped in posixtimer_deliver_signal() or
         * immediately on timer deletion when the signal is not pending.
         * Spare the extra round through __sigqueue_free() which is
         * ignoring preallocated signals.
         */
        if (unlikely((first->flags & SIGQUEUE_PREALLOC) && (info->si_code == SI_TIMER)))
                *timer_sigq = first;
        else
                __sigqueue_free(first);
}

/*
 * Dequeue the next signal from @pending that @mask does not block.
 * Returns the signal number with @info filled in, or 0 if none is ready.
 */
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
                            kernel_siginfo_t *info, struct sigqueue **timer_sigq)
{
        int sig = next_signal(pending, mask);

        if (!sig)
                return 0;

        collect_signal(sig, pending, info, timer_sigq);
        return sig;
}

/*
 * Try to dequeue a signal. If a deliverable signal is found fill in the
 * caller provided siginfo and return the signal number. Otherwise return
 * 0.
 *
 * @mask:  signals that must not be dequeued
 * @info:  filled in with the siginfo of the dequeued signal
 * @type:  set to PIDTYPE_PID when the signal came from the task's private
 *         queue and to PIDTYPE_TGID when the shared queue was consulted
 *
 * Operates on current and must be called with current's siglock held.
 */
int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type)
{
        struct task_struct *tsk = current;
        struct sigqueue *timer_sigq;
        int signr;

        lockdep_assert_held(&tsk->sighand->siglock);

again:
        /* The private queue is tried first, the shared queue only if it is empty. */
        *type = PIDTYPE_PID;
        timer_sigq = NULL;
        signr = __dequeue_signal(&tsk->pending, mask, info, &timer_sigq);
        if (!signr) {
                *type = PIDTYPE_TGID;
                signr = __dequeue_signal(&tsk->signal->shared_pending,
                                         mask, info, &timer_sigq);

                /* Only a SIGALRM taken from the shared queue rearms the itimer. */
                if (unlikely(signr == SIGALRM))
                        posixtimer_rearm_itimer(tsk);
        }

        /* The pending sets may have changed, refresh TIF_SIGPENDING. */
        recalc_sigpending();
        if (!signr)
                return 0;

        if (unlikely(sig_kernel_stop(signr))) {
                /*
                 * Set a marker that we have dequeued a stop signal.  Our
                 * caller might release the siglock and then the pending
                 * stop signal it is about to process is no longer in the
                 * pending bitmasks, but must still be cleared by a SIGCONT
                 * (and overruled by a SIGKILL).  So those cases clear this
                 * shared flag after we've set it.  Note that this flag may
                 * remain set after the signal we return is ignored or
                 * handled.  That doesn't matter because its only purpose
                 * is to alert stop-signal processing code when another
                 * processor has come along and cleared the flag.
                 */
                current->jobctl |= JOBCTL_STOP_DEQUEUED;
        }

        /*
         * timer_sigq is set by collect_signal() when the dequeued entry
         * was a preallocated SI_TIMER one.  If posixtimer_deliver_signal()
         * returns false this signal is not returned and the dequeue
         * starts over.
         */
        if (IS_ENABLED(CONFIG_POSIX_TIMERS) && unlikely(timer_sigq)) {
                if (!posixtimer_deliver_signal(info, timer_sigq))
                        goto again;
        }

        return signr;
}
EXPORT_SYMBOL_GPL(dequeue_signal);

/*
 * Dequeue the first synchronous signal queued on current's private
 * pending list, if there is one, and copy its siginfo to @info.
 *
 * Returns the signal number, or 0 if no synchronous signal was found.
 */
static int dequeue_synchronous_signal(kernel_siginfo_t *info)
{
        struct task_struct *tsk = current;
        struct sigpending *pending = &tsk->pending;
        struct sigqueue *q, *sync = NULL;
        bool duplicate = false;

        /*
         * Might a synchronous signal be in the queue?
         */
        if (!(pending->signal.sig[0] & ~tsk->blocked.sig[0] & SYNCHRONOUS_MASK))
                return 0;

        /*
         * Pick the first synchronous signal in the queue.
         */
        list_for_each_entry(q, &pending->list, list) {
                /* Synchronous signals have a positive si_code */
                if (q->info.si_code <= SI_USER)
                        continue;
                if (!(sigmask(q->info.si_signo) & SYNCHRONOUS_MASK))
                        continue;
                sync = q;
                break;
        }
        if (!sync)
                return 0;

        /*
         * Continue past the chosen entry (q == sync here) and check if
         * there is another siginfo for the same signal.
         */
        list_for_each_entry_continue(q, &pending->list, list) {
                if (q->info.si_signo == sync->info.si_signo) {
                        duplicate = true;
                        break;
                }
        }

        /* Last instance of this signal: it is no longer pending. */
        if (!duplicate) {
                sigdelset(&pending->signal, sync->info.si_signo);
                recalc_sigpending();
        }

        list_del_init(&sync->list);
        copy_siginfo(info, &sync->info);
        __sigqueue_free(sync);
        return info->si_signo;
}

/*
 * Tell a process that it has a new active signal.
 *
 * NOTE! We rely on the previous spin_lock to lock interrupts for us!
 * This may only be called with "siglock" held, and local interrupts
 * must have been disabled when that lock was acquired.
 *
 * No need to set need_resched since signal event passing
 * goes through ->blocked
 */
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
        lockdep_assert_held(&t->sighand->siglock);

        set_tsk_thread_flag(t, TIF_SIGPENDING);

        /*
         * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
         * case. We don't check t->state here because there is a race with it
         * executing on another processor and just now entering stopped state.
         * By using wake_up_state, we ensure the process will wake up and
         * handle its death signal.
         */
        if (wake_up_state(t, state | TASK_INTERRUPTIBLE))
                return;

        /* Nothing was woken; kick the task instead. */
        kick_process(t);
}

static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q);

static void sigqueue_free_ignored(struct task_struct *tsk, struct sigqueue *q)
{
        if (likely(!(q->flags & SIGQUEUE_PREALLOC) || q->info.si_code != SI_TIMER))
                __sigqueue_free(q);
        else
                posixtimer_sig_ignore(tsk, q);
}

/*
 * Remove the signals in @mask from the pending set and queue of @s.
 * Must be called with @p's siglock held.
 */
static void flush_sigqueue_mask(struct task_struct *p, sigset_t *mask, struct sigpending *s)
{
        struct sigqueue *entry, *next;
        sigset_t hits;

        lockdep_assert_held(&p->sighand->siglock);

        /* Nothing to do if none of the masked signals is pending. */
        sigandsets(&hits, mask, &s->signal);
        if (sigisemptyset(&hits))
                return;

        sigandnsets(&s->signal, &s->signal, mask);

        list_for_each_entry_safe(entry, next, &s->list, list) {
                if (!sigismember(mask, entry->info.si_signo))
                        continue;
                list_del_init(&entry->list);
                sigqueue_free_ignored(p, entry);
        }
}

/*
 * Tell whether @info is one of the special marker values at or below
 * SEND_SIG_PRIV rather than a real siginfo pointer.
 */
static inline int is_si_special(const struct kernel_siginfo *info)
{
        if (info > SEND_SIG_PRIV)
                return 0;

        return 1;
}

static inline bool si_fromuser(const struct kernel_siginfo *info)
{
        return info == SEND_SIG_NOINFO ||
                (!is_si_special(info) && SI_FROMUSER(info));
}

/*
 * called with RCU read lock from check_kill_permission()
 */
static bool kill_ok_by_cred(struct task_struct *t)
{
        const struct cred *cred = current_cred();
        const struct cred *tcred = __task_cred(t);

        return uid_eq(cred->euid, tcred->suid) ||
               uid_eq(cred->euid, tcred->uid) ||
               uid_eq(cred->uid, tcred->suid) ||
               uid_eq(cred->uid, tcred->uid) ||
               ns_capable(tcred->user_ns, CAP_KILL);
}

/*
 * Check whether current may send @sig to @t.
 * - the caller must hold the RCU read lock
 *
 * Returns 0 if allowed, -EINVAL for an invalid signal number, -EPERM on
 * bad permissions, or whatever audit_signal_info() or
 * security_task_kill() report.
 */
static int check_kill_permission(int sig, struct kernel_siginfo *info,
                                 struct task_struct *t)
{
        struct pid *sid;
        int error;

        if (!valid_signal(sig))
                return -EINVAL;

        /* Signals that do not come from user space skip the checks below. */
        if (!si_fromuser(info))
                return 0;

        error = audit_signal_info(sig, t); /* Let audit system see the signal */
        if (error)
                return error;

        if (!same_thread_group(current, t) && !kill_ok_by_cred(t)) {
                /* Without credentials only SIGCONT can still get through. */
                if (sig != SIGCONT)
                        return -EPERM;

                sid = task_session(t);
                /*
                 * We don't return the error if sid == NULL. The
                 * task was unhashed, the caller must notice this.
                 */
                if (sid && sid != task_session(current))
                        return -EPERM;
        }

        return security_task_kill(t, info, sig, NULL);
}

/**
 * ptrace_trap_notify - schedule trap to notify ptracer
 * @t: tracee wanting to notify tracer
 *
 * This function schedules sticky ptrace trap which is cleared on the next
 * TRAP_STOP to notify ptracer of an event.  @t must have been seized by
 * ptracer.
 *
 * If @t is running, STOP trap will be taken.  If trapped for STOP and
 * ptracer is listening for events, tracee is woken up so that it can
 * re-trap for the new event.  If trapped otherwise, STOP trap will be
 * eventually taken without returning to userland after the existing traps
 * are finished by PTRACE_CONT.
 *
 * CONTEXT:
 * Must be called with @task->sighand->siglock held.
 */
static void ptrace_trap_notify(struct task_struct *t)
{
        WARN_ON_ONCE(!(t->ptrace & PT_SEIZED));
        lockdep_assert_held(&t->sighand->siglock);

        task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
        ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}

/*
 * Handle magic process-wide effects of stop/continue signals. Unlike
 * the signal actions, these happen immediately at signal-generation
 * time regardless of blocking, ignoring, or handling.  This does the
 * actual continuing for SIGCONT, but not the actual stopping for stop
 * signals. The process stop is done as a signal action for SIG_DFL.
 *
 * @sig:   signal being generated
 * @p:     task whose thread group (p->signal) is examined and updated
 * @force: passed through to sig_ignored() for the final verdict
 *
 * Returns true if the signal should be actually delivered, otherwise
 * it should be dropped.
 */
static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t;
        sigset_t flush;

        if (signal->flags & SIGNAL_GROUP_EXIT) {
                /* With ->core_state set, only SIGKILL is let through. */
                if (signal->core_state)
                        return sig == SIGKILL;
                /*
                 * The process is in the middle of dying, drop the signal.
                 */
                return false;
        } else if (sig_kernel_stop(sig)) {
                /*
                 * This is a stop signal.  Remove SIGCONT from all queues.
                 */
                siginitset(&flush, sigmask(SIGCONT));
                flush_sigqueue_mask(p, &flush, &signal->shared_pending);
                for_each_thread(p, t)
                        flush_sigqueue_mask(p, &flush, &t->pending);
        } else if (sig == SIGCONT) {
                unsigned int why;
                /*
                 * Remove all stop signals from all queues, wake all threads.
                 */
                siginitset(&flush, SIG_KERNEL_STOP_MASK);
                flush_sigqueue_mask(p, &flush, &signal->shared_pending);
                for_each_thread(p, t) {
                        flush_sigqueue_mask(p, &flush, &t->pending);
                        task_clear_jobctl_pending(t, JOBCTL_STOP_PENDING);
                        /*
                         * Threads not seized by a ptracer lose JOBCTL_STOPPED
                         * and are woken from __TASK_STOPPED; seized ones get
                         * a trap notification instead.
                         */
                        if (likely(!(t->ptrace & PT_SEIZED))) {
                                t->jobctl &= ~JOBCTL_STOPPED;
                                wake_up_state(t, __TASK_STOPPED);
                        } else
                                ptrace_trap_notify(t);
                }

                /*
                 * Notify the parent with CLD_CONTINUED if we were stopped.
                 *
                 * If we were in the middle of a group stop, we pretend it
                 * was already finished, and then continued. Since SIGCHLD
                 * doesn't queue we report only CLD_STOPPED, as if the next
                 * CLD_CONTINUED was dropped.
                 */
                why = 0;
                if (signal->flags & SIGNAL_STOP_STOPPED)
                        why |= SIGNAL_CLD_CONTINUED;
                else if (signal->group_stop_count)
                        why |= SIGNAL_CLD_STOPPED;

                if (why) {
                        /*
                         * The first thread which returns from do_signal_stop()
                         * will take ->siglock, notice SIGNAL_CLD_MASK, and
                         * notify its parent. See get_signal().
                         */
                        signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
                        signal->group_stop_count = 0;
                        signal->group_exit_code = 0;
                }
        }

        /* The side effects above are applied even if the signal is ignored. */
        return !sig_ignored(p, sig, force);
}

/*
 * Test if P wants to take SIG.  After we've checked all threads with this,
 * it's equivalent to finding no threads not blocking SIG.  Any threads not
 * blocking SIG were ruled out because they are not running and already
 * have pending signals.  Such threads will dequeue from the shared queue
 * as soon as they're available, so putting the signal on the shared queue
 * will be equivalent to sending it to one such thread.
 */
static inline bool wants_signal(int sig, struct task_struct *p)
{
        /* A thread that blocks @sig or is already exiting never takes it. */
        if (sigismember(&p->blocked, sig) || (p->flags & PF_EXITING))
                return false;

        /* SIGKILL is accepted without looking at the stopped/traced state. */
        if (sig == SIGKILL)
                return true;

        if (task_is_stopped_or_traced(p))
                return false;

        /* Currently on a CPU: take it regardless of what is pending. */
        if (task_curr(p))
                return true;

        /* Otherwise only a thread with nothing pending yet qualifies. */
        return !task_sigpending(p);
}

static void complete_signal(int sig, struct task_struct *p, enum pid_type type)
{
        struct signal_struct *signal = p->signal;
        struct task_struct *t = p;

        /*
         * Pick a thread to wake so that it takes the signal off the queue.
         * The suggested task (which may or may not be the main thread)
         * gets first refusal.
         */
        if (!wants_signal(sig, p)) {
                /*
                 * A thread-directed signal, or a lone thread: nobody else
                 * can take it and the target needs no wakeup, it will
                 * dequeue unblocked signals before it runs again.
                 */
                if ((type == PIDTYPE_PID) || thread_group_empty(p))
                        return;

                /* Walk the group once, starting from ->curr_target. */
                t = signal->curr_target;
                while (!wants_signal(sig, t)) {
                        t = next_thread(t);
                        /*
                         * Back where we started: no thread needs to be
                         * woken, any eligible one will see the signal in
                         * the queue soon.
                         */
                        if (t == signal->curr_target)
                                return;
                }
                signal->curr_target = t;
        }

        /*
         * A thread that can be woken was found.  If the signal is fatal to
         * the whole group and does not dump core, start the group exit
         * right away and wake everybody, so that no thread keeps running
         * while a slower one has the fatal signal pending.
         */
        if (sig_fatal(p, sig) &&
            (signal->core_state || !(signal->flags & SIGNAL_GROUP_EXIT)) &&
            !sigismember(&t->real_blocked, sig) &&
            (sig == SIGKILL || !p->ptrace) &&
            !sig_kernel_coredump(sig)) {
                signal->flags = SIGNAL_GROUP_EXIT;
                signal->group_exit_code = sig;
                signal->group_stop_count = 0;
                __for_each_thread(signal, t) {
                        task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                }
                return;
        }

        /*
         * The signal is already queued.  Tell the chosen thread to wake
         * up and dequeue it.
         */
        signal_wake_up(t, sig == SIGKILL);
}

static inline bool legacy_queue(struct sigpending *signals, int sig)
{
        return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}

/*
 * Queue @sig for @t and pick a thread to take it.
 *
 * @sig:   signal number to generate
 * @info:  SEND_SIG_NOINFO, SEND_SIG_PRIV, or a filled-in siginfo to copy
 * @t:     target task; its ->sighand->siglock must be held
 * @type:  PIDTYPE_PID queues on @t's private list, anything else on the
 *         thread group's shared list
 * @force: handed to prepare_signal() when deciding whether to drop @sig
 *
 * Returns 0, including when the signal was dropped, was already pending
 * or lost its siginfo; -EAGAIN only when no sigqueue entry could be
 * allocated for a real-time signal carrying caller-supplied info whose
 * si_code is not SI_USER.
 */
static int __send_signal_locked(int sig, struct kernel_siginfo *info,
                                struct task_struct *t, enum pid_type type, bool force)
{
        struct sigpending *pending;
        struct sigqueue *q;
        int override_rlimit;
        int ret = 0, result;

        lockdep_assert_held(&t->sighand->siglock);

        /* @result is only reported to the tracepoint at the ret: label. */
        result = TRACE_SIGNAL_IGNORED;
        if (!prepare_signal(sig, t, force))
                goto ret;

        pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending;
        /*
         * Short-circuit ignored signals and support queuing
         * exactly one non-rt signal, so that we can get more
         * detailed information about the cause of the signal.
         */
        result = TRACE_SIGNAL_ALREADY_PENDING;
        if (legacy_queue(pending, sig))
                goto ret;

        result = TRACE_SIGNAL_DELIVERED;
        /*
         * Skip useless siginfo allocation for SIGKILL and kernel threads.
         */
        if ((sig == SIGKILL) || (t->flags & PF_KTHREAD))
                goto out_set;

        /*
         * Real-time signals must be queued if sent by sigqueue, or
         * some other real-time mechanism.  It is implementation
         * defined whether kill() does so.  We attempt to do so, on
         * the principle of least surprise, but since kill is not
         * allowed to fail with EAGAIN when low on memory we just
         * make sure at least one signal gets delivered and don't
         * pass on the info struct.
         */
        if (sig < SIGRTMIN)
                override_rlimit = (is_si_special(info) || info->si_code >= 0);
        else
                override_rlimit = 0;

        q = sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);

        if (q) {
                list_add_tail(&q->list, &pending->list);
                /* The special markers are pointer constants, hence the casts. */
                switch ((unsigned long) info) {
                case (unsigned long) SEND_SIG_NOINFO:
                        /* Synthesize an SI_USER siginfo describing current. */
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_USER;
                        q->info.si_pid = task_tgid_nr_ns(current,
                                                        task_active_pid_ns(t));
                        rcu_read_lock();
                        q->info.si_uid =
                                from_kuid_munged(task_cred_xxx(t, user_ns),
                                                 current_uid());
                        rcu_read_unlock();
                        break;
                case (unsigned long) SEND_SIG_PRIV:
                        /* Synthesize an SI_KERNEL siginfo with no sender ids. */
                        clear_siginfo(&q->info);
                        q->info.si_signo = sig;
                        q->info.si_errno = 0;
                        q->info.si_code = SI_KERNEL;
                        q->info.si_pid = 0;
                        q->info.si_uid = 0;
                        break;
                default:
                        copy_siginfo(&q->info, info);
                        break;
                }
        } else if (!is_si_special(info) &&
                   sig >= SIGRTMIN && info->si_code != SI_USER) {
                /*
                 * Queue overflow, abort.  We may abort if the
                 * signal was rt and sent by user using something
                 * other than kill().
                 */
                result = TRACE_SIGNAL_OVERFLOW_FAIL;
                ret = -EAGAIN;
                goto ret;
        } else {
                /*
                 * This is a silent loss of information.  We still
                 * send the signal, but the *info bits are lost.
                 */
                result = TRACE_SIGNAL_LOSE_INFO;
        }

out_set:
        signalfd_notify(t, sig);
        sigaddset(&pending->signal, sig);

        /* Let multiprocess signals appear after on-going forks */
        if (type > PIDTYPE_TGID) {
                struct multiprocess_signals *delayed;
                hlist_for_each_entry(delayed, &t->signal->multiprocess, node) {
                        sigset_t *signal = &delayed->signal;
                        /* Can't queue both a stop and a continue signal */
                        if (sig == SIGCONT)
                                sigdelsetmask(signal, SIG_KERNEL_STOP_MASK);
                        else if (sig_kernel_stop(sig))
                                sigdelset(signal, SIGCONT);
                        sigaddset(signal, sig);
                }
        }

        complete_signal(sig, t, type);
ret:
        trace_signal_generate(sig, info, t, type != PIDTYPE_PID, result);
        return ret;
}

/*
 * Tell whether the siginfo layout selected by @info's si_signo/si_code
 * is one of those listed below as carrying si_pid and si_uid.
 */
static inline bool has_si_pid_and_uid(struct kernel_siginfo *info)
{
        switch (siginfo_layout(info->si_signo, info->si_code)) {
        case SIL_KILL:
        case SIL_CHLD:
        case SIL_RT:
                return true;
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_FAULT:
        case SIL_FAULT_TRAPNO:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_FAULT_PERF_EVENT:
        case SIL_SYS:
                return false;
        }

        /* Any layout not listed above is treated as having no ids. */
        return false;
}

/*
 * Work out whether the signal must be forced, fix up the sender ids in
 * @info for the target's namespaces where needed, then hand everything
 * to __send_signal_locked().
 */
int send_signal_locked(int sig, struct kernel_siginfo *info,
                       struct task_struct *t, enum pid_type type)
{
        /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */
        bool force;

        if (info == SEND_SIG_NOINFO) {
                /* Force if sent from an ancestor pid namespace */
                force = !task_pid_nr_ns(current, task_active_pid_ns(t));
        } else if (info == SEND_SIG_PRIV) {
                /* Don't ignore kernel generated signals */
                force = true;
        } else if (!has_si_pid_and_uid(info)) {
                /* No sender ids in this layout: nothing to fix up or force */
                force = false;
        } else {
                /* SIGKILL and SIGSTOP is special or has ids */
                struct user_namespace *t_user_ns;

                /* Re-express si_uid in the target's user namespace */
                rcu_read_lock();
                t_user_ns = task_cred_xxx(t, user_ns);
                if (current_user_ns() != t_user_ns) {
                        kuid_t uid = make_kuid(current_user_ns(), info->si_uid);
                        info->si_uid = from_kuid_munged(t_user_ns, uid);
                }
                rcu_read_unlock();

                /* A kernel generated signal? */
                force = (info->si_code == SI_KERNEL);

                /* From an ancestor pid namespace? */
                if (!task_pid_nr_ns(current, task_active_pid_ns(t))) {
                        info->si_pid = 0;
                        force = true;
                }
        }

        return __send_signal_locked(sig, info, t, type, force);
}

/*
 * Log that current received fatal signal @signr, then dump its user
 * registers (and, on i386, up to 16 code bytes at the saved ip).
 */
static void print_fatal_signal(int signr)
{
        struct pt_regs *regs = task_pt_regs(current);
        struct file *exe_file = get_task_exe_file(current);

        /* Include the executable in the message when it can be obtained. */
        if (!exe_file) {
                pr_info("%s: potentially unexpected fatal signal %d.\n",
                        current->comm, signr);
        } else {
                pr_info("%pD: %s: potentially unexpected fatal signal %d.\n",
                        exe_file, current->comm, signr);
                fput(exe_file);
        }

#if defined(__i386__) && !defined(__arch_um__)
        pr_info("code at %08lx: ", regs->ip);
        {
                int i;

                /* Stop at the first byte that cannot be read from user space. */
                for (i = 0; i < 16; i++) {
                        unsigned char insn;

                        if (get_user(insn, (unsigned char *)(regs->ip + i)))
                                break;
                        pr_cont("%02x ", insn);
                }
        }
        pr_cont("\n");
#endif
        preempt_disable();
        show_regs(regs);
        preempt_enable();
}

/* Parse the "print-fatal-signals=" boot parameter into print_fatal_signals. */
static int __init setup_print_fatal_signals(char *str)
{
        get_option(&str, &print_fatal_signals);

        /* 1 is returned unconditionally, whatever get_option() made of @str. */
        return 1;
}

__setup("print-fatal-signals=", setup_print_fatal_signals);

/*
 * Take @p's sighand lock and send @sig through send_signal_locked().
 * Returns -ESRCH when the sighand could not be locked.
 */
int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p,
                        enum pid_type type)
{
        unsigned long flags;
        int ret;

        if (!lock_task_sighand(p, &flags))
                return -ESRCH;

        ret = send_signal_locked(sig, info, p, type);
        unlock_task_sighand(p, &flags);

        return ret;
}

/* How force_sig_info_to_task() treats the currently installed action. */
enum sig_handler {
        HANDLER_CURRENT, /* If reachable use the current handler */
        HANDLER_SIG_DFL, /* Always use SIG_DFL handler semantics */
        HANDLER_EXIT,         /* Only visible as the process exit code */
};

/*
 * Force a signal that the process can't ignore: if necessary
 * we unblock the signal and change any SIG_IGN to SIG_DFL.
 *
 * Note: If we unblock the signal, we always reset it to SIG_DFL,
 * since we do not want to have a signal handler that was blocked
 * be invoked when user space had explicitly blocked it.
 *
 * We don't want to have recursive SIGSEGV's etc, for example,
 * that is why we also clear SIGNAL_UNKILLABLE.
 *
 * @info:    siginfo to send; info->si_signo selects the signal
 * @t:       target task
 * @handler: how the installed action is treated, see enum sig_handler
 *
 * Returns the result of send_signal_locked().
 */
static int
force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t,
        enum sig_handler handler)
{
        unsigned long int flags;
        int ret, blocked, ignored;
        struct k_sigaction *action;
        int sig = info->si_signo;

        spin_lock_irqsave(&t->sighand->siglock, flags);
        /* action[] is indexed by signal number minus one. */
        action = &t->sighand->action[sig-1];
        ignored = action->sa.sa_handler == SIG_IGN;
        blocked = sigismember(&t->blocked, sig);
        if (blocked || ignored || (handler != HANDLER_CURRENT)) {
                action->sa.sa_handler = SIG_DFL;
                /* HANDLER_EXIT additionally marks the action SA_IMMUTABLE. */
                if (handler == HANDLER_EXIT)
                        action->sa.sa_flags |= SA_IMMUTABLE;
                if (blocked)
                        sigdelset(&t->blocked, sig);
        }
        /*
         * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
         * debugging to leave init killable. But HANDLER_EXIT is always fatal.
         */
        if (action->sa.sa_handler == SIG_DFL &&
            (!t->ptrace || (handler == HANDLER_EXIT)))
                t->signal->flags &= ~SIGNAL_UNKILLABLE;
        /* Always sent as a thread-directed (PIDTYPE_PID) signal. */
        ret = send_signal_locked(sig, info, t, PIDTYPE_PID);
        /* This can happen if the signal was already pending and blocked */
        if (!task_sigpending(t))
                signal_wake_up(t, 0);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);

        return ret;
}

/*
 * Force the signal described by @info on current, using the currently
 * installed handler when it is reachable (HANDLER_CURRENT).
 * Returns the result of force_sig_info_to_task().
 */
int force_sig_info(struct kernel_siginfo *info)
{
        return force_sig_info_to_task(info, current, HANDLER_CURRENT);
}

/*
 * Nuke all other threads in the group.
 *
 * Returns the number of other threads visited, already dead ones included.
 */
int zap_other_threads(struct task_struct *p)
{
        struct task_struct *t;
        int count = 0;

        p->signal->group_stop_count = 0;

        for_other_threads(p, t) {
                count++;
                task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);

                /* Only threads that are not dead yet need the SIGKILL. */
                if (!t->exit_state) {
                        sigaddset(&t->pending.signal, SIGKILL);
                        signal_wake_up(t, 1);
                }
        }

        return count;
}

/*
 * Look up and lock @tsk's sighand.
 *
 * Returns the sighand with ->siglock held and the interrupt state saved
 * in *@flags, or NULL (nothing locked) once @tsk->sighand reads as NULL.
 */
struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
                                           unsigned long *flags)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        for (;;) {
                sighand = rcu_dereference(tsk->sighand);
                if (unlikely(sighand == NULL))
                        break;

                /*
                 * This sighand can be already freed and even reused, but
                 * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
                 * initializes ->siglock: this slab can't go away, it has
                 * the same object type, ->siglock can't be reinitialized.
                 *
                 * We need to ensure that tsk->sighand is still the same
                 * after we take the lock, we can race with de_thread() or
                 * __exit_signal(). In the latter case the next iteration
                 * must see ->sighand == NULL.
                 */
                spin_lock_irqsave(&sighand->siglock, *flags);
                /* Still the same sighand: leave the loop with the lock held. */
                if (likely(sighand == rcu_access_pointer(tsk->sighand)))
                        break;
                /* It changed under us: drop the stale lock and retry. */
                spin_unlock_irqrestore(&sighand->siglock, *flags);
        }
        rcu_read_unlock();

        return sighand;
}

#ifdef CONFIG_LOCKDEP
/*
 * Assert that @task's sighand->siglock is held.  A task without a
 * sighand triggers a one-time warning instead.
 */
void lockdep_assert_task_sighand_held(struct task_struct *task)
{
        struct sighand_struct *sighand;

        rcu_read_lock();
        sighand = rcu_dereference(task->sighand);
        if (!sighand)
                WARN_ON_ONCE(1);
        else
                lockdep_assert_held(&sighand->siglock);
        rcu_read_unlock();
}
#endif

/*
 * send signal info to all the members of a thread group or to the
 * individual thread if type == PIDTYPE_PID.
 */
int group_send_sig_info(int sig, struct kernel_siginfo *info,
                        struct task_struct *p, enum pid_type type)
{
        int ret;

        rcu_read_lock();
        ret = check_kill_permission(sig, info, p);
        rcu_read_unlock();

        /*
         * Stop here on a permission error, and also for signal 0, which
         * only performs the permission check and sends nothing.
         */
        if (ret || !sig)
                return ret;

        return do_send_sig_info(sig, info, p, type);
}

/*
 * __kill_pgrp_info() sends a signal to a process group: this is what the tty
 * control characters do (^C, ^Z etc)
 * - the caller must hold at least a readlock on tasklist_lock
 *
 * Returns 0 if at least one group_send_sig_info() call succeeded, the
 * last error otherwise, or -ESRCH if no task was visited at all.
 */
int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        struct task_struct *p = NULL;
        int ret = -ESRCH;

        /* Each visited task is signalled with type PIDTYPE_PGID. */
        do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
                int err = group_send_sig_info(sig, info, p, PIDTYPE_PGID);
                /*
                 * If group_send_sig_info() succeeds at least once ret
                 * becomes 0 and after that the code below has no effect.
                 * Otherwise we return the last err or -ESRCH if this
                 * process group is empty.
                 */
                if (ret)
                        ret = err;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, p);

        return ret;
}

static int kill_pid_info_type(int sig, struct kernel_siginfo *info,
                                struct pid *pid, enum pid_type type)
{
        int error = -ESRCH;
        struct task_struct *p;

        for (;;) {
                rcu_read_lock();
                p = pid_task(pid, PIDTYPE_PID);
                if (p)
                        error = group_send_sig_info(sig, info, p, type);
                rcu_read_unlock();
                if (likely(!p || error != -ESRCH))
                        return error;
                /*
                 * The task was unhashed in between, try again.  If it
                 * is dead, pid_task() will return NULL, if we race with
                 * de_thread() it will find the new leader.
                 */
        }
}

/* Send @sig to the task behind @pid, passing PIDTYPE_TGID as the type. */
int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
{
        return kill_pid_info_type(sig, info, pid, PIDTYPE_TGID);
}

/*
 * Resolve the numeric @pid with find_vpid() and hand the result to
 * kill_pid_info(), all inside one RCU read-side section.
 */
static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int error;
        rcu_read_lock();
        error = kill_pid_info(sig, info, find_vpid(pid));
        rcu_read_unlock();
        return error;
}

static inline bool kill_as_cred_perm(const struct cred *cred,
                                     struct task_struct *target)
{
        const struct cred *pcred = __task_cred(target);

        return uid_eq(cred->euid, pcred->suid) ||
               uid_eq(cred->euid, pcred->uid) ||
               uid_eq(cred->uid, pcred->suid) ||
               uid_eq(cred->uid, pcred->uid);
}

/*
 * The usb asyncio usage of siginfo is wrong.  The glibc support
 * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT.
 * AKA after the generic fields:
 *        kernel_pid_t        si_pid;
 *        kernel_uid32_t        si_uid;
 *        sigval_t        si_value;
 *
 * Unfortunately when usb generates SI_ASYNCIO it assumes the layout
 * after the generic fields is:
 *        void __user         *si_addr;
 *
 * This is a practical problem when there is a 64bit big endian kernel
 * and a 32bit userspace.  As the 32bit address will be encoded in the low
 * 32bits of the pointer.  Those low 32bits will be stored at higher
 * address than appear in a 32 bit pointer.  So userspace will not
 * see the address it was expecting for its completions.
 *
 * There is nothing in the encoding that can allow
 * copy_siginfo_to_user32 to detect this confusion of formats, so
 * handle this by requiring the caller of kill_pid_usb_asyncio to
 * notice when this situation takes place and to store the 32bit
 * pointer in sival_int, instead of sival_addr of the sigval_t addr
 * parameter.
 */
int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr,
                         struct pid *pid, const struct cred *cred)
{
        struct kernel_siginfo info;
        struct task_struct *p;
        unsigned long flags;
        int ret;

        if (!valid_signal(sig))
                return -EINVAL;

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = errno;
        info.si_code = SI_ASYNCIO;
        /* @addr is stored over the storage that starts at si_pid. */
        *((sigval_t *)&info.si_pid) = addr;

        rcu_read_lock();
        p = pid_task(pid, PIDTYPE_PID);
        if (!p) {
                ret = -ESRCH;
                goto out_unlock;
        }
        /* Permission is checked against the supplied @cred. */
        if (!kill_as_cred_perm(cred, p)) {
                ret = -EPERM;
                goto out_unlock;
        }
        ret = security_task_kill(p, &info, sig, cred);
        /* Done on a security veto, and for signal 0 which sends nothing. */
        if (ret || !sig)
                goto out_unlock;

        if (!lock_task_sighand(p, &flags)) {
                ret = -ESRCH;
                goto out_unlock;
        }
        ret = __send_signal_locked(sig, &info, p, PIDTYPE_TGID, false);
        unlock_task_sighand(p, &flags);
out_unlock:
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio);

/*
 * kill_something_info() interprets pid in interesting ways just like kill(2).
 *
 * POSIX specifies that kill(-1,sig) is unspecified, but what we have
 * is probably wrong.  Should make it like BSD or SYSV.
 */

static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
{
        int ret;

        /* A positive pid names a single process. */
        if (pid > 0)
                return kill_proc_info(sig, info, pid);

        /* -INT_MIN is undefined.  Exclude this case to avoid a UBSAN warning */
        if (pid == INT_MIN)
                return -ESRCH;

        read_lock(&tasklist_lock);
        if (pid == -1) {
                /*
                 * Every process except those with a pid of 1 or less in
                 * this namespace and the caller's own thread group.
                 */
                struct task_struct *p;
                int retval = 0, count = 0;

                for_each_process(p) {
                        if (task_pid_vnr(p) > 1 &&
                            !same_thread_group(p, current)) {
                                int err = group_send_sig_info(sig, info, p,
                                                              PIDTYPE_MAX);
                                ++count;
                                /* -EPERM never overwrites the recorded result. */
                                if (err != -EPERM)
                                        retval = err;
                        }
                }
                ret = count ? retval : -ESRCH;
        } else {
                /* 0 means the caller's process group, otherwise group -pid. */
                ret = __kill_pgrp_info(sig, info,
                                pid ? find_vpid(-pid) : task_pgrp(current));
        }
        read_unlock(&tasklist_lock);

        return ret;
}

/*
 * These are for backward compatibility with the rest of the kernel source.
 */

int send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p)
{
        /*
         * Legacy kernel users do not go through check_kill_permission(),
         * which validates the number on the normal paths, so bad values
         * are rejected here.
         */
        if (valid_signal(sig))
                return do_send_sig_info(sig, info, p, PIDTYPE_PID);

        return -EINVAL;
}
EXPORT_SYMBOL(send_sig_info);

/* Map a boolean-like @priv to the matching special siginfo marker. */
#define __si_special(priv) \
        ((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)

/*
 * Send @sig to @p without a caller-built siginfo: a non-zero @priv selects
 * SEND_SIG_PRIV, zero selects SEND_SIG_NOINFO.
 */
int
send_sig(int sig, struct task_struct *p, int priv)
{
        return send_sig_info(sig, __si_special(priv), p);
}
EXPORT_SYMBOL(send_sig);

/* Force a kernel-originated @sig through force_sig_info(). */
void force_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Sent by the kernel itself: SI_KERNEL, no sender pid/uid. */
        info.si_code = SI_KERNEL;
        info.si_pid = 0;
        info.si_uid = 0;

        info.si_signo = sig;
        info.si_errno = 0;

        force_sig_info(&info);
}
EXPORT_SYMBOL(force_sig);

/*
 * Force a kernel-originated @sig on current with HANDLER_SIG_DFL, i.e.
 * always with SIG_DFL handler semantics.
 */
void force_fatal_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Sent by the kernel itself: SI_KERNEL, no sender pid/uid. */
        info.si_code = SI_KERNEL;
        info.si_pid = 0;
        info.si_uid = 0;

        info.si_signo = sig;
        info.si_errno = 0;

        force_sig_info_to_task(&info, current, HANDLER_SIG_DFL);
}

/* Force a kernel-originated @sig on current with HANDLER_EXIT. */
void force_exit_sig(int sig)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Sent by the kernel itself: SI_KERNEL, no sender pid/uid. */
        info.si_code = SI_KERNEL;
        info.si_pid = 0;
        info.si_uid = 0;

        info.si_signo = sig;
        info.si_errno = 0;

        force_sig_info_to_task(&info, current, HANDLER_EXIT);
}

/*
 * When things go south during signal handling, we
 * will force a SIGSEGV. And if the signal that caused
 * the problem was already a SIGSEGV, we'll want to
 * make sure we don't even try to deliver the signal..
 */
void force_sigsegv(int sig)
{
        /* Any other failing signal: a regular forced SIGSEGV will do. */
        if (sig != SIGSEGV) {
                force_sig(SIGSEGV);
                return;
        }

        /* The failing signal already was SIGSEGV: use the fatal variant. */
        force_fatal_sig(SIGSEGV);
}

/*
 * Build a fault siginfo (@sig, @code, @addr) and force it on @t, keeping
 * the current handler when reachable (HANDLER_CURRENT).
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Fault description. */
        info.si_code  = code;
        info.si_addr  = addr;

        /* Generic header. */
        info.si_signo = sig;
        info.si_errno = 0;

        return force_sig_info_to_task(&info, t, HANDLER_CURRENT);
}

/* force_sig_fault_to_task() applied to current. */
int force_sig_fault(int sig, int code, void __user *addr)
{
        return force_sig_fault_to_task(sig, code, addr, current);
}

/*
 * Build a fault siginfo (@sig, @code, @addr) and send it to @t through
 * send_sig_info().
 */
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Fault description. */
        info.si_code  = code;
        info.si_addr  = addr;

        /* Generic header. */
        info.si_signo = sig;
        info.si_errno = 0;

        /* info.si_signo was just set to @sig, so pass @sig directly. */
        return send_sig_info(sig, &info, t);
}

/*
 * Force a SIGBUS carrying @code, @addr and @lsb on current.  Warns when
 * @code is neither BUS_MCEERR_AO nor BUS_MCEERR_AR.
 */
int force_sig_mceerr(int code, void __user *addr, short lsb)
{
        struct kernel_siginfo info;

        WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
        clear_siginfo(&info);

        /* Fault description. */
        info.si_code = code;
        info.si_addr = addr;
        info.si_addr_lsb = lsb;

        /* Generic header. */
        info.si_signo = SIGBUS;
        info.si_errno = 0;

        return force_sig_info(&info);
}

/*
 * Send a SIGBUS carrying @code, @addr and @lsb to @t.  Warns when @code
 * is neither BUS_MCEERR_AO nor BUS_MCEERR_AR.
 */
int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t)
{
        struct kernel_siginfo info;

        WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR));
        clear_siginfo(&info);

        /* Fault description. */
        info.si_code = code;
        info.si_addr = addr;
        info.si_addr_lsb = lsb;

        /* Generic header. */
        info.si_signo = SIGBUS;
        info.si_errno = 0;

        /* info.si_signo was just set to SIGBUS, so pass it directly. */
        return send_sig_info(SIGBUS, &info, t);
}
EXPORT_SYMBOL(send_sig_mceerr);

/*
 * Force a SIGSEGV with code SEGV_BNDERR on current, reporting the
 * faulting @addr together with the @lower and @upper values.
 */
int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Fault description. */
        info.si_code  = SEGV_BNDERR;
        info.si_addr  = addr;
        info.si_lower = lower;
        info.si_upper = upper;

        /* Generic header. */
        info.si_signo = SIGSEGV;
        info.si_errno = 0;

        return force_sig_info(&info);
}

#ifdef SEGV_PKUERR
/*
 * Force a SIGSEGV with code SEGV_PKUERR on current, reporting the
 * faulting @addr and the @pkey involved.
 */
int force_sig_pkuerr(void __user *addr, u32 pkey)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);

        /* Fault description. */
        info.si_code  = SEGV_PKUERR;
        info.si_addr  = addr;
        info.si_pkey  = pkey;

        /* Generic header. */
        info.si_signo = SIGSEGV;
        info.si_errno = 0;

        return force_sig_info(&info);
}
#endif

/*
 * Send current a SIGTRAP with code TRAP_PERF carrying @addr, @type and
 * @sig_data.
 */
int send_sig_perf(void __user *addr, u32 type, u64 sig_data)
{
        struct kernel_siginfo info;

        clear_siginfo(&info);
        info.si_signo     = SIGTRAP;
        info.si_errno     = 0;
        info.si_code      = TRAP_PERF;
        info.si_addr      = addr;
        info.si_perf_type = type;
        info.si_perf_data = sig_data;

        /*
         * Signals generated by perf events should not terminate the whole
         * process if SIGTRAP is blocked, however, delivering the signal
         * asynchronously is better than not delivering at all. But tell user
         * space if the signal was asynchronous, so it can clearly be
         * distinguished from normal synchronous ones.
         */
        if (sigismember(&current->blocked, info.si_signo))
                info.si_perf_flags = TRAP_PERF_FLAG_ASYNC;
        else
                info.si_perf_flags = 0;

        return send_sig_info(info.si_signo, &info, current);
}

/**
 * force_sig_seccomp - force a SIGSYS so the task can emulate a syscall in-process
 * @syscall: syscall number reported to userland in si_syscall
 * @reason: filter-supplied reason code reported to userland in si_errno
 * @force_coredump: true to trigger a coredump (selects HANDLER_EXIT instead
 *                  of HANDLER_CURRENT)
 *
 * Builds a SIGSYS siginfo with si_code SYS_SECCOMP, KSTK_EIP(current) in
 * si_call_addr and syscall_get_arch(current) in si_arch, then hands it to
 * force_sig_info_to_task() for current.
 *
 * Return: the value returned by force_sig_info_to_task().
 */
int force_sig_seccomp(int syscall, int reason, bool force_coredump)
{
        struct kernel_siginfo si;

        clear_siginfo(&si);

        /* Generic header fields */
        si.si_signo = SIGSYS;
        si.si_code = SYS_SECCOMP;
        si.si_errno = reason;

        /* SIGSYS specific payload */
        si.si_call_addr = (void __user *)KSTK_EIP(current);
        si.si_arch = syscall_get_arch(current);
        si.si_syscall = syscall;

        if (force_coredump)
                return force_sig_info_to_task(&si, current, HANDLER_EXIT);

        return force_sig_info_to_task(&si, current, HANDLER_CURRENT);
}

/*
 * Variant for the architectures which put trap information into the
 * errno field of the siginfo rather than a real errno value.
 *
 * Forces a SIGTRAP with si_code TRAP_HWBKPT, @errno in si_errno and @addr
 * in si_addr. Returns the value returned by force_sig_info().
 */
int force_sig_ptrace_errno_trap(int errno, void __user *addr)
{
        struct kernel_siginfo si;

        clear_siginfo(&si);

        si.si_signo = SIGTRAP;
        si.si_code  = TRAP_HWBKPT;
        si.si_errno = errno;
        si.si_addr  = addr;

        return force_sig_info(&si);
}

/*
 * Variant for the few architectures which report trap information via
 * si_trapno.
 *
 * Forces signal @sig with si_code @code, @addr in si_addr and @trapno in
 * si_trapno. Returns the value returned by force_sig_info().
 */
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno)
{
        struct kernel_siginfo si;

        clear_siginfo(&si);

        si.si_signo = sig;
        si.si_code  = code;
        si.si_errno = 0;

        si.si_addr  = addr;
        si.si_trapno = trapno;

        return force_sig_info(&si);
}

/*
 * Variant for the few architectures which report trap information via
 * si_trapno.
 *
 * Sends signal @sig to task @t with si_code @code, @addr in si_addr and
 * @trapno in si_trapno. Returns the value returned by send_sig_info().
 */
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                          struct task_struct *t)
{
        struct kernel_siginfo si;

        clear_siginfo(&si);

        si.si_signo = sig;
        si.si_code  = code;
        si.si_errno = 0;

        si.si_addr  = addr;
        si.si_trapno = trapno;

        return send_sig_info(si.si_signo, &si, t);
}

/*
 * Run __kill_pgrp_info() inside a tasklist_lock read-side section and
 * return its result.
 */
static int kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
{
        int rc;

        read_lock(&tasklist_lock);
        rc = __kill_pgrp_info(sig, info, pgrp);
        read_unlock(&tasklist_lock);

        return rc;
}

/*
 * Send @sig to the process group @pid. @priv is turned into the siginfo
 * argument by __si_special(). Returns the result of kill_pgrp_info().
 */
int kill_pgrp(struct pid *pid, int sig, int priv)
{
        struct kernel_siginfo *info = __si_special(priv);

        return kill_pgrp_info(sig, info, pid);
}
EXPORT_SYMBOL(kill_pgrp);

/*
 * Send @sig to @pid. @priv is turned into the siginfo argument by
 * __si_special(). Returns the result of kill_pid_info().
 */
int kill_pid(struct pid *pid, int sig, int priv)
{
        struct kernel_siginfo *info = __si_special(priv);

        return kill_pid_info(sig, info, pid);
}
EXPORT_SYMBOL(kill_pid);

#ifdef CONFIG_POSIX_TIMERS
/*
 * These functions handle POSIX timer signals. POSIX timers use
 * preallocated sigqueue structs for sending signals.
 */
/*
 * Remove every queued entry whose si_code is SI_TIMER from @pending and
 * rebuild the pending mask accordingly: the bit of a signal is dropped
 * when a timer entry for it is removed, unless a non-timer entry for the
 * same signal is still on the list.
 */
static void __flush_itimer_signals(struct sigpending *pending)
{
        struct sigqueue *cur, *nxt;
        sigset_t remaining, keep;

        sigemptyset(&keep);
        remaining = pending->signal;

        list_for_each_entry_safe(cur, nxt, &pending->list, list) {
                int signo = cur->info.si_signo;

                /* Not a timer signal: stays queued, remember its number */
                if (likely(cur->info.si_code != SI_TIMER)) {
                        sigaddset(&keep, signo);
                        continue;
                }

                /* Timer signal: clear its bit, unlink it and release it */
                sigdelset(&remaining, signo);
                list_del_init(&cur->list);
                __sigqueue_free(cur);
        }

        /* Bits of signals which still have a queued entry are put back */
        sigorsets(&pending->signal, &remaining, &keep);
}

/*
 * Run __flush_itimer_signals() on both the per-task and the shared pending
 * queue of current while holding current's siglock (irqsave variant).
 */
void flush_itimer_signals(void)
{
        struct task_struct *self = current;

        /* Scope based lock, dropped automatically on return */
        guard(spinlock_irqsave)(&self->sighand->siglock);

        __flush_itimer_signals(&self->pending);
        __flush_itimer_signals(&self->signal->shared_pending);
}

/*
 * Initialize the sigqueue @q as a SIGQUEUE_PREALLOC entry.
 *
 * Returns false, leaving @q untouched, when sig_get_ucounts() returns
 * NULL. Otherwise clears q->info, initializes @q and returns true.
 */
bool posixtimer_init_sigqueue(struct sigqueue *q)
{
        struct ucounts *uc = sig_get_ucounts(current, -1, 0);

        if (uc) {
                clear_siginfo(&q->info);
                __sigqueue_init(q, uc, SIGQUEUE_PREALLOC);
                return true;
        }

        return false;
}

/*
 * Append @q to the pending list of @t, set the matching bit in the pending
 * mask and let complete_signal() take it from there. The per-task queue is
 * used for PIDTYPE_PID, the shared queue for every other @type.
 */
static void posixtimer_queue_sigqueue(struct sigqueue *q, struct task_struct *t, enum pid_type type)
{
        int signo = q->info.si_signo;
        struct sigpending *queue;

        if (type == PIDTYPE_PID)
                queue = &t->pending;
        else
                queue = &t->signal->shared_pending;

        signalfd_notify(t, signo);

        list_add_tail(&q->list, &queue->list);
        sigaddset(&queue->signal, signo);

        complete_signal(signo, t, type);
}

/*
 * Pick the task a POSIX timer signal gets queued to.
 *
 * For timers with it_pid_type == PIDTYPE_PID (such as timers with
 * SIGEV_THREAD_ID set) the signal must reach that specific thread, so
 * the task looked up with pid_task() is returned as is.
 *
 * For every other pid type the signal is directed at the process. If
 * current is in the same thread group as the looked up task and has not
 * set exit_state, current is returned instead, which avoids waking up a
 * potentially idle task unnecessarily.
 *
 * The result of pid_task() is returned unchanged in all other cases,
 * including when it is NULL.
 */
static inline struct task_struct *posixtimer_get_target(struct k_itimer *tmr)
{
        struct task_struct *found = pid_task(tmr->it_pid, tmr->it_pid_type);

        /* No task at all, or a thread directed timer: nothing to choose */
        if (!found || tmr->it_pid_type == PIDTYPE_PID)
                return found;

        /* Process wide signal: prefer current when it is eligible */
        if (same_thread_group(found, current) && !current->exit_state)
                return current;

        return found;
}

/*
 * Queue the sigqueue embedded in @tmr (tmr->sigq) for the task selected by
 * posixtimer_get_target().
 *
 * The whole function runs inside an RCU read-side section (guard(rcu)) and
 * does its work with the target's sighand lock held. It returns without
 * doing anything when no target is found or lock_task_sighand() fails.
 *
 * Depending on prepare_signal() and on whether the sigqueue is already on
 * a pending list or on the ignored list, the sigqueue is left alone, put
 * on the ignored list, removed from the ignored list, or queued on the
 * pending list. The outcome is reported via trace_signal_generate().
 */
void posixtimer_send_sigqueue(struct k_itimer *tmr)
{
        struct sigqueue *q = &tmr->sigq;
        int sig = q->info.si_signo;
        struct task_struct *t;
        unsigned long flags;
        int result;

        guard(rcu)();

        t = posixtimer_get_target(tmr);
        if (!t)
                return;

        if (!likely(lock_task_sighand(t, &flags)))
                return;

        /*
         * Update @tmr::it_sigqueue_seq for posix timer signals with sighand
         * locked to prevent a race against dequeue_signal().
         */
        tmr->it_sigqueue_seq = tmr->it_signal_seq;

        /*
         * Set the signal delivery status under sighand lock, so that the
         * ignored signal handling can distinguish between a periodic and a
         * non-periodic timer.
         */
        tmr->it_sig_periodic = tmr->it_status == POSIX_TIMER_REQUEUE_PENDING;

        if (!prepare_signal(sig, t, false)) {
                result = TRACE_SIGNAL_IGNORED;

                if (!list_empty(&q->list)) {
                        /*
                         * The signal was ignored and blocked. The timer
                         * expiry queued it because blocked signals are
                         * queued independent of the ignored state.
                         *
                         * The unblocking set SIGPENDING, but the signal
                         * was not yet dequeued from the pending list.
                         * So prepare_signal() sees unblocked and ignored,
                         * which ends up here. Leave it queued like a
                         * regular signal.
                         *
                         * The same happens when the task group is exiting
                         * and the signal is already queued.
                         * prepare_signal() treats SIGNAL_GROUP_EXIT as
                         * ignored independent of its queued state. This
                         * gets cleaned up in __exit_signal().
                         */
                        goto out;
                }

                /* Periodic timers with SIG_IGN are queued on the ignored list */
                if (tmr->it_sig_periodic) {
                        /*
                         * Already queued means the timer was rearmed after
                         * the previous expiry got it on the ignore list.
                         * Nothing to do for that case.
                         */
                        if (hlist_unhashed(&tmr->ignored_list)) {
                                /*
                                 * Take a signal reference and queue it on
                                 * the ignored list.
                                 */
                                posixtimer_sigqueue_getref(q);
                                posixtimer_sig_ignore(t, q);
                        }
                } else if (!hlist_unhashed(&tmr->ignored_list)) {
                        /*
                         * Covers the case where a timer was periodic and
                         * then the signal was ignored. Later it was rearmed
                         * as oneshot timer. The previous signal is invalid
                         * now, and this oneshot signal has to be dropped.
                         * Remove it from the ignored list and drop the
                         * reference count as the signal is no longer
                         * queued.
                         */
                        hlist_del_init(&tmr->ignored_list);
                        posixtimer_putref(tmr);
                }
                goto out;
        }

        if (unlikely(!list_empty(&q->list))) {
                /* This holds a reference count already */
                result = TRACE_SIGNAL_ALREADY_PENDING;
                goto out;
        }

        /*
         * If the signal is on the ignore list, it got blocked after it was
         * ignored earlier. But nothing lifted the ignore. Move it back to
         * the pending list to be consistent with the regular signal
         * handling. This already holds a reference count.
         *
         * If it's not on the ignore list acquire a reference count.
         */
        if (likely(hlist_unhashed(&tmr->ignored_list)))
                posixtimer_sigqueue_getref(q);
        else
                hlist_del_init(&tmr->ignored_list);

        posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
        result = TRACE_SIGNAL_DELIVERED;
out:
        /* Every path which took the sighand lock reports its result and unlocks here */
        trace_signal_generate(sig, &q->info, t, tmr->it_pid_type != PIDTYPE_PID, result);
        unlock_task_sighand(t, &flags);
}

/*
 * Handle an ignored POSIX timer signal: @q is the sigqueue embedded in a
 * k_itimer. Either park the timer on @tsk's ignored_posix_timers list or
 * drop a timer reference.
 */
static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q)
{
        struct k_itimer *timer = container_of(q, struct k_itimer, sigq);

        /*
         * A timer which is no longer valid according to posixtimer_valid()
         * or a signal which stems from a non-periodic timer only needs its
         * reference dropped.
         */
        if (!posixtimer_valid(timer) || !timer->it_sig_periodic) {
                posixtimer_putref(timer);
                return;
        }

        /* Valid periodic timer: queue it on the ignored list */
        hlist_add_head(&timer->ignored_list, &tsk->signal->ignored_posix_timers);
}

/*
 * Move the timers on @tsk's ignored_posix_timers list whose signal number
 * equals @sig back to a pending list.
 *
 * Each matching timer is removed from the ignored list. Its sigqueue is
 * then queued via posixtimer_queue_sigqueue() on the task returned by
 * posixtimer_get_target(), or the timer reference is dropped when there is
 * no such task.
 *
 * NOTE(review): the comment below implies the caller holds sighand lock;
 * that is not visible in this function itself.
 */
static void posixtimer_sig_unignore(struct task_struct *tsk, int sig)
{
        struct hlist_head *head = &tsk->signal->ignored_posix_timers;
        struct hlist_node *tmp;
        struct k_itimer *tmr;

        /* Common case: nothing is parked on the ignored list */
        if (likely(hlist_empty(head)))
                return;

        /*
         * Rearming a timer with sighand lock held is not possible due to
         * lock ordering vs. tmr::it_lock. Just stick the sigqueue back and
         * let the signal delivery path deal with it whether it needs to be
         * rearmed or not. This cannot be decided here w/o dropping sighand
         * lock and creating a loop retry horror show.
         */
        hlist_for_each_entry_safe(tmr, tmp , head, ignored_list) {
                struct task_struct *target;

                /*
                 * tmr::sigq.info.si_signo is immutable, so accessing it
                 * without holding tmr::it_lock is safe.
                 */
                if (tmr->sigq.info.si_signo != sig)
                        continue;

                hlist_del_init(&tmr->ignored_list);

                /* This should never happen and leaks a reference count */
                if (WARN_ON_ONCE(!list_empty(&tmr->sigq.list)))
                        continue;

                /*
                 * Get the target for the signal. If target is a thread and
                 * has exited by now, drop the reference count.
                 *
                 * The RCU guard is declared inside the loop body, so it
                 * covers the rest of this iteration only.
                 */
                guard(rcu)();
                target = posixtimer_get_target(tmr);
                if (target)
                        posixtimer_queue_sigqueue(&tmr->sigq, target, tmr->it_pid_type);
                else
                        posixtimer_putref(tmr);
        }
}
#else /* CONFIG_POSIX_TIMERS */
/* Without CONFIG_POSIX_TIMERS both helpers are empty stubs. */
static inline void posixtimer_sig_ignore(struct task_struct *tsk, struct sigqueue *q) { }
static inline void posixtimer_sig_unignore(struct task_struct *tsk, int sig) { }
#endif /* !CONFIG_POSIX_TIMERS */

/*
 * Wake up the waiters on the wait_pidfd queue of @task's struct pid with
 * an EPOLLIN | EPOLLRDNORM key. Warns when @task has no exit_state set.
 */
void do_notify_pidfd(struct task_struct *task)
{
        struct pid *tpid;

        WARN_ON(task->exit_state == 0);

        tpid = task_pid(task);
        __wake_up(&tpid->wait_pidfd, TASK_NORMAL, 0,
                  poll_to_key(EPOLLIN | EPOLLRDNORM));
}

/*
 * Let a parent know about the death of a child.
 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
 *
 * @tsk: the task whose parent (tsk->parent) is notified
 * @sig: the signal to send; replaced by SIGCHLD when the parent's
 *       self_exec_id no longer matches tsk->parent_exec_id, and suppressed
 *       when the parent has SIGCHLD set to SIG_IGN and @tsk is not ptraced
 *
 * Returns true if our parent ignored us and so we've switched to
 * self-reaping.
 */
bool do_notify_parent(struct task_struct *tsk, int sig)
{
        struct kernel_siginfo info;
        unsigned long flags;
        struct sighand_struct *psig;
        bool autoreap = false;
        u64 utime, stime;

        WARN_ON_ONCE(sig == -1);

        /* do_notify_parent_cldstop should have been called instead.  */
        WARN_ON_ONCE(task_is_stopped_or_traced(tsk));

        WARN_ON_ONCE(!tsk->ptrace &&
               (tsk->group_leader != tsk || !thread_group_empty(tsk)));

        /* ptraced, or group-leader without sub-threads */
        do_notify_pidfd(tsk);

        if (sig != SIGCHLD) {
                /*
                 * This is only possible if parent == real_parent.
                 * Check if it has changed security domain.
                 */
                if (tsk->parent_exec_id != READ_ONCE(tsk->parent->self_exec_id))
                        sig = SIGCHLD;
        }

        clear_siginfo(&info);
        info.si_signo = sig;
        info.si_errno = 0;
        /*
         * We are under tasklist_lock here so our parent is tied to
         * us and cannot change.
         *
         * task_active_pid_ns will always return the same pid namespace
         * until a task passes through release_task.
         *
         * write_lock() currently calls preempt_disable() which is the
         * same as rcu_read_lock(), but according to Oleg, it is not
         * correct to rely on this.
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(tsk->parent, user_ns),
                                       task_uid(tsk));
        rcu_read_unlock();

        /* Report the task's own times plus the times accumulated in tsk->signal */
        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime + tsk->signal->utime);
        info.si_stime = nsec_to_clock_t(stime + tsk->signal->stime);

        /*
         * Decode tsk->exit_code: bit 0x80 set means CLD_DUMPED, otherwise
         * non-zero low seven bits mean CLD_KILLED; in both cases si_status
         * is the low seven bits. Otherwise it is CLD_EXITED and si_status
         * is exit_code >> 8.
         */
        info.si_status = tsk->exit_code & 0x7f;
        if (tsk->exit_code & 0x80)
                info.si_code = CLD_DUMPED;
        else if (tsk->exit_code & 0x7f)
                info.si_code = CLD_KILLED;
        else {
                info.si_code = CLD_EXITED;
                info.si_status = tsk->exit_code >> 8;
        }

        psig = tsk->parent->sighand;
        spin_lock_irqsave(&psig->siglock, flags);
        if (!tsk->ptrace && sig == SIGCHLD &&
            (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
             (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
                /*
                 * We are exiting and our parent doesn't care.  POSIX.1
                 * defines special semantics for setting SIGCHLD to SIG_IGN
                 * or setting the SA_NOCLDWAIT flag: we should be reaped
                 * automatically and not left for our parent's wait4 call.
                 * Rather than having the parent do it as a magic kind of
                 * signal handler, we just set this to tell do_exit that we
                 * can be cleaned up without becoming a zombie.  Note that
                 * we still call __wake_up_parent in this case, because a
                 * blocked sys_wait4 might now return -ECHILD.
                 *
                 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
                 * is implementation-defined: we do (if you don't want
                 * it, just use SIG_IGN instead).
                 */
                autoreap = true;
                /* A zero sig skips the send below */
                if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
                        sig = 0;
        }
        /*
         * Send with __send_signal_locked as si_pid and si_uid are in the
         * parent's namespaces.
         */
        if (valid_signal(sig) && sig)
                __send_signal_locked(sig, &info, tsk->parent, PIDTYPE_TGID, false);
        __wake_up_parent(tsk, tsk->parent);
        spin_unlock_irqrestore(&psig->siglock, flags);

        return autoreap;
}

/**
 * do_notify_parent_cldstop - notify parent of stopped/continued state change
 * @tsk: task reporting the state change
 * @for_ptracer: the notification is for ptracer
 * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
 *
 * Notify @tsk's parent that the stopped/continued state has changed.  If
 * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
 * If %true, @tsk reports to @tsk->parent which should be the ptracer.
 *
 * SIGCHLD is only sent when the parent neither ignores SIGCHLD nor has
 * SA_NOCLDSTOP set for it; the parent is woken via __wake_up_parent()
 * either way. Any other value of @why hits BUG().
 *
 * CONTEXT:
 * Must be called with tasklist_lock at least read locked.
 */
static void do_notify_parent_cldstop(struct task_struct *tsk,
                                     bool for_ptracer, int why)
{
        struct kernel_siginfo info;
        unsigned long flags;
        struct task_struct *parent;
        struct sighand_struct *sighand;
        u64 utime, stime;

        /* Select which task reports and which task gets notified */
        if (for_ptracer) {
                parent = tsk->parent;
        } else {
                tsk = tsk->group_leader;
                parent = tsk->real_parent;
        }

        clear_siginfo(&info);
        info.si_signo = SIGCHLD;
        info.si_errno = 0;
        /*
         * see comment in do_notify_parent() about the following 4 lines
         */
        rcu_read_lock();
        info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
        info.si_uid = from_kuid_munged(task_cred_xxx(parent, user_ns), task_uid(tsk));
        rcu_read_unlock();

        task_cputime(tsk, &utime, &stime);
        info.si_utime = nsec_to_clock_t(utime);
        info.si_stime = nsec_to_clock_t(stime);

         /* si_status depends on the reason being reported */
         info.si_code = why;
         switch (why) {
         case CLD_CONTINUED:
                 info.si_status = SIGCONT;
                 break;
         case CLD_STOPPED:
                 info.si_status = tsk->signal->group_exit_code & 0x7f;
                 break;
         case CLD_TRAPPED:
                 info.si_status = tsk->exit_code & 0x7f;
                 break;
         default:
                 BUG();
         }

        sighand = parent->sighand;
        spin_lock_irqsave(&sighand->siglock, flags);
        if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
            !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
                send_signal_locked(SIGCHLD, &info, parent, PIDTYPE_TGID);
        /*
         * Even if SIGCHLD is not generated, we must wake up wait4 calls.
         */
        __wake_up_parent(tsk, parent);
        spin_unlock_irqrestore(&sighand->siglock, flags);
}

/*
 * This must be called with current->sighand->siglock held.
 *
 * This should be the path for all ptrace stops.
 * We always set current->last_siginfo while stopped here.
 * That makes it a way to test a stopped process for
 * being ptrace-stopped vs being job-control-stopped.
 *
 * @exit_code: stored in current->exit_code for the duration of the stop
 * @why:       CLD_* reason handed to do_notify_parent_cldstop()
 * @message:   stored in current->ptrace_message for the duration of the stop
 * @info:      stored in current->last_siginfo for the duration of the stop,
 *             may be NULL
 *
 * The siglock is dropped around the actual stop and held again on return.
 *
 * Returns the signal the ptracer requested the code resume
 * with, which is current->exit_code as found after the stop.  If the
 * code did not stop because the tracer is gone or a fatal signal is
 * pending, @exit_code is returned unchanged.
 */
static int ptrace_stop(int exit_code, int why, unsigned long message,
                       kernel_siginfo_t *info)
        __releases(&current->sighand->siglock)
        __acquires(&current->sighand->siglock)
{
        bool gstop_done = false;

        if (arch_ptrace_stop_needed()) {
                /*
                 * The arch code has something special to do before a
                 * ptrace stop.  This is allowed to block, e.g. for faults
                 * on user stack pages.  We can't keep the siglock while
                 * calling arch_ptrace_stop, so we must release it now.
                 * To preserve proper semantics, we must do this before
                 * any signal bookkeeping like checking group_stop_count.
                 */
                spin_unlock_irq(&current->sighand->siglock);
                arch_ptrace_stop();
                spin_lock_irq(&current->sighand->siglock);
        }

        /*
         * After this point ptrace_signal_wake_up or signal_wake_up
         * will clear TASK_TRACED if ptrace_unlink happens or a fatal
         * signal comes in.  Handle previous ptrace_unlinks and fatal
         * signals here to prevent ptrace_stop sleeping in schedule.
         */
        if (!current->ptrace || __fatal_signal_pending(current))
                return exit_code;

        set_special_state(TASK_TRACED);
        current->jobctl |= JOBCTL_TRACED;

        /*
         * We're committing to trapping.  TRACED should be visible before
         * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
         * Also, transition to TRACED and updates to ->jobctl should be
         * atomic with respect to siglock and should be done after the arch
         * hook as siglock is released and regrabbed across it.
         *
         *     TRACER                                    TRACEE
         *
         *     ptrace_attach()
         * [L]   wait_on_bit(JOBCTL_TRAPPING)        [S] set_special_state(TRACED)
         *     do_wait()
         *       set_current_state()                smp_wmb();
         *       ptrace_do_wait()
         *         wait_task_stopped()
         *           task_stopped_code()
         * [L]         task_is_traced()                [S] task_clear_jobctl_trapping();
         */
        smp_wmb();

        current->ptrace_message = message;
        current->last_siginfo = info;
        current->exit_code = exit_code;

        /*
         * If @why is CLD_STOPPED, we're trapping to participate in a group
         * stop.  Do the bookkeeping.  Note that if SIGCONT was delivered
         * across siglock relocks since INTERRUPT was scheduled, PENDING
         * could be clear now.  We act as if SIGCONT is received after
         * TASK_TRACED is entered - ignore it.
         */
        if (why == CLD_STOPPED && (current->jobctl & JOBCTL_STOP_PENDING))
                gstop_done = task_participate_group_stop(current);

        /* any trap clears pending STOP trap, STOP trap clears NOTIFY */
        task_clear_jobctl_pending(current, JOBCTL_TRAP_STOP);
        if (info && info->si_code >> 8 == PTRACE_EVENT_STOP)
                task_clear_jobctl_pending(current, JOBCTL_TRAP_NOTIFY);

        /* entering a trap, clear TRAPPING */
        task_clear_jobctl_trapping(current);

        spin_unlock_irq(&current->sighand->siglock);
        read_lock(&tasklist_lock);
        /*
         * Notify parents of the stop.
         *
         * While ptraced, there are two parents - the ptracer and
         * the real_parent of the group_leader.  The ptracer should
         * know about every stop while the real parent is only
         * interested in the completion of group stop.  The states
         * for the two don't interact with each other.  Notify
         * separately unless they're gonna be duplicates.
         */
        if (current->ptrace)
                do_notify_parent_cldstop(current, true, why);
        if (gstop_done && (!current->ptrace || ptrace_reparented(current)))
                do_notify_parent_cldstop(current, false, why);

        /*
         * The previous do_notify_parent_cldstop() invocation woke ptracer.
         * On a PREEMPTION kernel this can result in preemption requirement
         * which will be fulfilled after read_unlock() and the ptracer will be
         * put on the CPU.
         * The ptracer is in wait_task_inactive(, __TASK_TRACED) waiting for
         * this task wait in schedule(). If this task gets preempted then it
         * remains enqueued on the runqueue. The ptracer will observe this and
         * then sleep for a delay of one HZ tick. In the meantime this task
         * gets scheduled, enters schedule() and will wait for the ptracer.
         *
         * This preemption point is not bad from a correctness point of
         * view but extends the runtime by one HZ tick time due to the
         * ptracer's sleep.  The preempt-disable section ensures that there
         * will be no preemption between unlock and schedule() and so
         * improving the performance since the ptracer will observe that
         * the tracee is scheduled out once it gets on the CPU.
         *
         * On PREEMPT_RT locking tasklist_lock does not disable preemption.
         * Therefore the task can be preempted after do_notify_parent_cldstop()
         * before unlocking tasklist_lock so there is no benefit in doing this.
         *
         * In fact disabling preemption is harmful on PREEMPT_RT because
         * the spinlock_t in cgroup_enter_frozen() must not be acquired
         * with preemption disabled due to the 'sleeping' spinlock
         * substitution of RT.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        read_unlock(&tasklist_lock);
        cgroup_enter_frozen();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable_no_resched();
        schedule();
        cgroup_leave_frozen(true);

        /*
         * We are back.  Now reacquire the siglock before touching
         * last_siginfo, so that we are sure to have synchronized with
         * any signal-sending on another CPU that wants to examine it.
         */
        spin_lock_irq(&current->sighand->siglock);
        exit_code = current->exit_code;
        current->last_siginfo = NULL;
        current->ptrace_message = 0;
        current->exit_code = 0;

        /* LISTENING can be set only during STOP traps, clear it */
        current->jobctl &= ~(JOBCTL_LISTENING | JOBCTL_PTRACE_FROZEN);

        /*
         * Queued signals ignored us while we were stopped for tracing.
         * So check for any that we should take before resuming user mode.
         * This sets TIF_SIGPENDING, but never clears it.
         */
        recalc_sigpending_tsk(current);
        return exit_code;
}

/*
 * Build an on-stack siginfo for a ptrace notification and enter
 * ptrace_stop() with it. si_signo is @signr, si_code is @exit_code, and
 * si_pid / si_uid are taken from current. Returns the value returned by
 * ptrace_stop().
 */
static int ptrace_do_notify(int signr, int exit_code, int why, unsigned long message)
{
        kernel_siginfo_t si;

        clear_siginfo(&si);

        si.si_signo = signr;
        si.si_code = exit_code;

        /* Identify current as the sender */
        si.si_uid = from_kuid_munged(current_user_ns(), current_uid());
        si.si_pid = task_pid_vnr(current);

        /* Let the debugger run.  */
        return ptrace_stop(exit_code, why, message, &si);
}

/*
 * Report a SIGTRAP based ptrace event with CLD_TRAPPED as reason.
 *
 * @exit_code must have SIGTRAP in its low seven bits and nothing set above
 * bit 15, otherwise BUG_ON() triggers. Pending task work is run first. The
 * notification itself happens with current's siglock taken.
 *
 * Returns the value returned by ptrace_do_notify().
 */
int ptrace_notify(int exit_code, unsigned long message)
{
        int resume_sig;

        BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);

        if (unlikely(task_work_pending(current)))
                task_work_run();

        spin_lock_irq(&current->sighand->siglock);
        resume_sig = ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED, message);
        spin_unlock_irq(&current->sighand->siglock);

        return resume_sig;
}

/**
 * do_signal_stop - handle group stop for SIGSTOP and other stop signals
 * @signr: signr causing group stop if initiating
 *
 * If %JOBCTL_STOP_PENDING is not set yet, initiate group stop with @signr
 * and participate in it.  If already set, participate in the existing
 * group stop.  If participated in a group stop (and thus slept), %true is
 * returned with siglock released.
 *
 * If ptraced, this function doesn't handle stop itself.  Instead,
 * %JOBCTL_TRAP_STOP is scheduled and %false is returned with siglock
 * untouched.  The caller must ensure that INTERRUPT trap handling takes
 * place afterwards.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which is released
 * on %true return.
 *
 * RETURNS:
 * %false if group stop is already cancelled or ptrace trap is scheduled.
 * %true if participated in group stop.
 */
static bool do_signal_stop(int signr)
        __releases(&current->sighand->siglock)
{
        struct signal_struct *sig = current->signal;

        if (!(current->jobctl & JOBCTL_STOP_PENDING)) {
                unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME;
                struct task_struct *t;

                /* signr will be recorded in task->jobctl for retries */
                WARN_ON_ONCE(signr & ~JOBCTL_STOP_SIGMASK);

                /*
                 * Do not initiate a group stop when the stop signal was not
                 * dequeued (JOBCTL_STOP_DEQUEUED clear), the group is
                 * exiting or group_exec_task is set.
                 */
                if (!likely(current->jobctl & JOBCTL_STOP_DEQUEUED) ||
                    unlikely(sig->flags & SIGNAL_GROUP_EXIT) ||
                    unlikely(sig->group_exec_task))
                        return false;
                /*
                 * There is no group stop already in progress.  We must
                 * initiate one now.
                 *
                 * While ptraced, a task may be resumed while group stop is
                 * still in effect and then receive a stop signal and
                 * initiate another group stop.  This deviates from the
                 * usual behavior as two consecutive stop signals can't
                 * cause two group stops when !ptraced.  That is why we
                 * also check !task_is_stopped(t) below.
                 *
                 * The condition can be distinguished by testing whether
                 * SIGNAL_STOP_STOPPED is already set.  Don't generate
                 * group_exit_code in such case.
                 *
                 * This is not necessary for SIGNAL_STOP_CONTINUED because
                 * an intervening stop signal is required to cause two
                 * continued events regardless of ptrace.
                 */
                if (!(sig->flags & SIGNAL_STOP_STOPPED))
                        sig->group_exit_code = signr;

                /* Recount: one for each task which newly got the stop pending */
                sig->group_stop_count = 0;
                if (task_set_jobctl_pending(current, signr | gstop))
                        sig->group_stop_count++;

                for_other_threads(current, t) {
                        /*
                         * Setting state to TASK_STOPPED for a group
                         * stop is always done with the siglock held,
                         * so this check has no races.
                         */
                        if (!task_is_stopped(t) &&
                            task_set_jobctl_pending(t, signr | gstop)) {
                                sig->group_stop_count++;
                                if (likely(!(t->ptrace & PT_SEIZED)))
                                        signal_wake_up(t, 0);
                                else
                                        ptrace_trap_notify(t);
                        }
                }
        }

        if (likely(!current->ptrace)) {
                int notify = 0;

                /*
                 * If there are no other threads in the group, or if there
                 * is a group stop in progress and we are the last to stop,
                 * report to the parent.
                 */
                if (task_participate_group_stop(current))
                        notify = CLD_STOPPED;

                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
                spin_unlock_irq(&current->sighand->siglock);

                /*
                 * Notify the parent of the group stop completion.  Because
                 * we're not holding either the siglock or tasklist_lock
                 * here, ptracer may attach in between; however, this is for
                 * group stop and should always be delivered to the real
                 * parent of the group leader.  The new ptracer will get
                 * its notification when this task transitions into
                 * TASK_TRACED.
                 */
                if (notify) {
                        read_lock(&tasklist_lock);
                        do_notify_parent_cldstop(current, false, notify);
                        read_unlock(&tasklist_lock);
                }

                /* Now we don't run again until woken by SIGCONT or SIGKILL */
                cgroup_enter_frozen();
                schedule();
                return true;
        } else {
                /*
                 * While ptraced, group stop is handled by STOP trap.
                 * Schedule it and let the caller deal with it.
                 */
                task_set_jobctl_pending(current, JOBCTL_TRAP_STOP);
                return false;
        }
}

/**
 * do_jobctl_trap - take care of ptrace jobctl traps
 *
 * With PT_SEIZED this serves both group stop and explicit SEIZE/INTERRUPT
 * traps.  Either one produces a PTRACE_EVENT_STOP trap accompanied by a
 * siginfo.  The lower eight bits of exit_code carry the stop signal when
 * stopped and %SIGTRAP otherwise.
 *
 * Without PT_SEIZED this serves the group stop trap only, with the stop
 * signal number as exit_code and without a siginfo.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held, which may be
 * released and re-acquired before returning with intervening sleep.
 */
static void do_jobctl_trap(void)
{
        struct signal_struct *grp = current->signal;
        int signr = current->jobctl & JOBCTL_STOP_SIGMASK;

        /* Classic attach: plain group stop trap, no siginfo */
        if (!(current->ptrace & PT_SEIZED)) {
                WARN_ON_ONCE(!signr);
                ptrace_stop(signr, CLD_STOPPED, 0, NULL);
                return;
        }

        /* Seized and no group stop pending or in effect: report SIGTRAP */
        if (!grp->group_stop_count &&
            !(grp->flags & SIGNAL_STOP_STOPPED))
                signr = SIGTRAP;

        WARN_ON_ONCE(!signr);
        ptrace_do_notify(signr, signr | (PTRACE_EVENT_STOP << 8),
                         CLD_STOPPED, 0);
}

/**
 * do_freezer_trap - handle the freezer jobctl trap
 *
 * Puts the task to sleep in a freezable, interruptible state unless some
 * other jobctl work is pending, in which case it only drops the siglock
 * and returns so that the caller can handle that work first.
 *
 * NOTE(review): an earlier version of this comment said the function
 * drops JOBCTL_TRAP_FREEZE when the task is about to quit; no such clearing
 * is visible in this function - confirm where the bit is cleared.
 *
 * CONTEXT:
 * Must be called with @current->sighand->siglock held,
 * which is always released before returning.
 */
static void do_freezer_trap(void)
        __releases(&current->sighand->siglock)
{
        /*
         * If there are other trap bits pending except JOBCTL_TRAP_FREEZE,
         * let's make another loop to give it a chance to be handled.
         * In any case, we'll return back.
         */
        if ((current->jobctl & (JOBCTL_PENDING_MASK | JOBCTL_TRAP_FREEZE)) !=
             JOBCTL_TRAP_FREEZE) {
                spin_unlock_irq(&current->sighand->siglock);
                return;
        }

        /*
         * Now we're sure that there is no pending fatal signal and no
         * pending traps. Clear TIF_SIGPENDING to not get out of schedule()
         * immediately (if there is a non-fatal signal pending), and
         * put the task into sleep.
         *
         * The task state is set while siglock is still held, before the
         * lock is dropped and schedule() is called.
         */
        __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        clear_thread_flag(TIF_SIGPENDING);
        spin_unlock_irq(&current->sighand->siglock);
        cgroup_enter_frozen();
        schedule();

        /*
         * We could've been woken by task_work, run it to clear
         * TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
         */
        clear_notify_signal();
        if (unlikely(task_work_pending(current)))
                task_work_run();
}

/*
 * Report a dequeued signal to the tracer via ptrace_stop() and apply
 * whatever the tracer decided.
 *
 * Returns the signal number to deliver, or 0 when the tracer cancelled the
 * signal or when the signal had to be queued again (it is now blocked, or a
 * fatal signal is pending).
 */
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
{
        /*
         * The marker is set unconditionally, without looking at
         * sig_kernel_stop(signr), because the debugger may replace signr.
         * It only matters if we end up stopping after ptrace_stop()
         * returns: do_signal_stop() checks it and we must stop only if
         * SIGCONT did not clear it while we slept.  See also the comment
         * in dequeue_signal().
         */
        current->jobctl |= JOBCTL_STOP_DEQUEUED;
        signr = ptrace_stop(signr, CLD_TRAPPED, 0, info);

        /* Back from the stop: a zero result means the signal was cancelled. */
        if (!signr)
                return 0;

        /*
         * The tracer substituted a different signal.  Rebuild the siginfo
         * as a user-sent signal from the tracer; a tracer that wants
         * specific siginfo contents is expected to have used
         * PTRACE_SETSIGINFO to update *info itself.
         */
        if (info->si_signo != signr) {
                clear_siginfo(info);
                info->si_signo = signr;
                info->si_errno = 0;
                info->si_code = SI_USER;
                rcu_read_lock();
                info->si_pid = task_pid_vnr(current->parent);
                info->si_uid = from_kuid_munged(current_user_ns(),
                                                task_uid(current->parent));
                rcu_read_unlock();
        }

        /* Deliverable right away: not blocked and nothing fatal pending. */
        if (!sigismember(&current->blocked, signr) &&
            !fatal_signal_pending(current))
                return signr;

        /* Otherwise put the (possibly new) signal back on the queue. */
        send_signal_locked(signr, info, current, type);
        return 0;
}

static void hide_si_addr_tag_bits(struct ksignal *ksig)
{
        switch (siginfo_layout(ksig->sig, ksig->info.si_code)) {
        case SIL_FAULT:
        case SIL_FAULT_TRAPNO:
        case SIL_FAULT_MCEERR:
        case SIL_FAULT_BNDERR:
        case SIL_FAULT_PKUERR:
        case SIL_FAULT_PERF_EVENT:
                ksig->info.si_addr = arch_untagged_si_addr(
                        ksig->info.si_addr, ksig->sig, ksig->info.si_code);
                break;
        case SIL_KILL:
        case SIL_TIMER:
        case SIL_POLL:
        case SIL_CHLD:
        case SIL_RT:
        case SIL_SYS:
                break;
        }
}

/**
 * get_signal - find the next signal that current has to act on
 * @ksig: receives the result; @ksig->info is handed to the dequeue helpers,
 *        @ksig->ka gets a copy of the handler's k_sigaction when a handler
 *        has to run, and @ksig->sig gets the signal number (or 0)
 *
 * Runs pending task_work first.  Then, under @current->sighand->siglock,
 * it loops: parent notifications encoded in SIGNAL_CLD_MASK are sent,
 * pending group stops and jobctl/freezer traps are serviced, and signals
 * are dequeued one at a time.  Ignored signals are skipped, stop signals
 * go through do_signal_stop(), and any other signal whose action is the
 * default ends in the fatal path (do_group_exit(), optionally after a
 * core dump).
 *
 * Return: the value of "signr > 0".  That is %true when a signal with a
 * user handler was dequeued, and also when a PF_USER_WORKER thread hit the
 * fatal path (in which case @ksig is not fully initialized).  It is %false
 * when nothing is left to deliver.
 */
bool get_signal(struct ksignal *ksig)
{
        struct sighand_struct *sighand = current->sighand;
        struct signal_struct *signal = current->signal;
        int signr;

        clear_notify_signal();
        if (unlikely(task_work_pending(current)))
                task_work_run();

        if (!task_sigpending(current))
                return false;

        if (unlikely(uprobe_deny_signal()))
                return false;

        /*
         * Do this once, we can't return to user-mode if freezing() == T.
         * do_signal_stop() and ptrace_stop() do freezable_schedule() and
         * thus do not need another check after return.
         */
        try_to_freeze();

        /* Every path that dropped siglock and needs a fresh look comes back here. */
relock:
        spin_lock_irq(&sighand->siglock);

        /*
         * Every stopped thread goes here after wakeup. Check to see if
         * we should notify the parent, prepare_signal(SIGCONT) encodes
         * the CLD_ si_code into SIGNAL_CLD_MASK bits.
         */
        if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
                int why;

                if (signal->flags & SIGNAL_CLD_CONTINUED)
                        why = CLD_CONTINUED;
                else
                        why = CLD_STOPPED;

                signal->flags &= ~SIGNAL_CLD_MASK;

                spin_unlock_irq(&sighand->siglock);

                /*
                 * Notify the parent that we're continuing.  This event is
                 * always per-process and doesn't make whole lot of sense
                 * for ptracers, who shouldn't consume the state via
                 * wait(2) either, but, for backward compatibility, notify
                 * the ptracer of the group leader too unless it's gonna be
                 * a duplicate.
                 */
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(current, false, why);

                if (ptrace_reparented(current->group_leader))
                        do_notify_parent_cldstop(current->group_leader,
                                                true, why);
                read_unlock(&tasklist_lock);

                goto relock;
        }

        for (;;) {
                struct k_sigaction *ka;
                enum pid_type type;

                /* Has this task already been marked for death? */
                if ((signal->flags & SIGNAL_GROUP_EXIT) ||
                     signal->group_exec_task) {
                        signr = SIGKILL;
                        sigdelset(&current->pending.signal, SIGKILL);
                        trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO,
                                             &sighand->action[SIGKILL-1]);
                        recalc_sigpending();
                        /*
                         * implies do_group_exit() or return to PF_USER_WORKER,
                         * no need to initialize ksig->info/etc.
                         */
                        goto fatal;
                }

                /* A true return from do_signal_stop() means we stopped and siglock was dropped. */
                if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
                    do_signal_stop(0))
                        goto relock;

                if (unlikely(current->jobctl &
                             (JOBCTL_TRAP_MASK | JOBCTL_TRAP_FREEZE))) {
                        if (current->jobctl & JOBCTL_TRAP_MASK) {
                                /* Returns with siglock held, so drop it here. */
                                do_jobctl_trap();
                                spin_unlock_irq(&sighand->siglock);
                        } else if (current->jobctl & JOBCTL_TRAP_FREEZE)
                                /* Releases siglock itself on every path. */
                                do_freezer_trap();

                        goto relock;
                }

                /*
                 * If the task is leaving the frozen state, let's update
                 * cgroup counters and reset the frozen bit.
                 */
                if (unlikely(cgroup_task_frozen(current))) {
                        spin_unlock_irq(&sighand->siglock);
                        cgroup_leave_frozen(false);
                        goto relock;
                }

                /*
                 * Signals generated by the execution of an instruction
                 * need to be delivered before any other pending signals
                 * so that the instruction pointer in the signal stack
                 * frame points to the faulting instruction.
                 */
                type = PIDTYPE_PID;
                signr = dequeue_synchronous_signal(&ksig->info);
                if (!signr)
                        signr = dequeue_signal(&current->blocked, &ksig->info, &type);

                if (!signr)
                        break; /* will return 0 */

                /*
                 * Traced tasks report the signal to the tracer first, except
                 * for SIGKILL and actions flagged SA_IMMUTABLE.  A zero
                 * result means the signal was cancelled or requeued.
                 */
                if (unlikely(current->ptrace) && (signr != SIGKILL) &&
                    !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
                        signr = ptrace_signal(signr, &ksig->info, type);
                        if (!signr)
                                continue;
                }

                ka = &sighand->action[signr-1];

                /* Trace actually delivered signals. */
                trace_signal_deliver(signr, &ksig->info, ka);

                if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
                        continue;
                if (ka->sa.sa_handler != SIG_DFL) {
                        /* Run the handler.  */
                        ksig->ka = *ka;

                        /* One-shot handlers revert to the default after this delivery. */
                        if (ka->sa.sa_flags & SA_ONESHOT)
                                ka->sa.sa_handler = SIG_DFL;

                        break; /* will return non-zero "signr" value */
                }

                /*
                 * Now we are doing the default action for this signal.
                 */
                if (sig_kernel_ignore(signr)) /* Default is nothing. */
                        continue;

                /*
                 * Global init gets no signals it doesn't want.
                 * Container-init gets no signals it doesn't want from same
                 * container.
                 *
                 * Note that if global/container-init sees a sig_kernel_only()
                 * signal here, the signal must have been generated internally
                 * or must have come from an ancestor namespace. In either
                 * case, the signal cannot be dropped.
                 */
                if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
                                !sig_kernel_only(signr))
                        continue;

                if (sig_kernel_stop(signr)) {
                        /*
                         * The default action is to stop all threads in
                         * the thread group.  The job control signals
                         * do nothing in an orphaned pgrp, but SIGSTOP
                         * always works.  Note that siglock needs to be
                         * dropped during the call to is_orphaned_pgrp()
                         * because of lock ordering with tasklist_lock.
                         * This allows an intervening SIGCONT to be posted.
                         * We need to check for that and bail out if necessary.
                         */
                        if (signr != SIGSTOP) {
                                spin_unlock_irq(&sighand->siglock);

                                /* signals can be posted during this window */

                                if (is_current_pgrp_orphaned())
                                        goto relock;

                                spin_lock_irq(&sighand->siglock);
                        }

                        if (likely(do_signal_stop(signr))) {
                                /* It released the siglock.  */
                                goto relock;
                        }

                        /*
                         * We didn't actually stop, due to a race
                         * with SIGCONT or something like that.
                         */
                        continue;
                }

        fatal:
                spin_unlock_irq(&sighand->siglock);
                if (unlikely(cgroup_task_frozen(current)))
                        cgroup_leave_frozen(true);

                /*
                 * Anything else is fatal, maybe with a core dump.
                 */
                current->flags |= PF_SIGNALED;

                if (sig_kernel_coredump(signr)) {
                        if (print_fatal_signals)
                                print_fatal_signal(signr);
                        proc_coredump_connector(current);
                        /*
                         * If it was able to dump core, this kills all
                         * other threads in the group and synchronizes with
                         * their demise.  If we lost the race with another
                         * thread getting here, it set group_exit_code
                         * first and our do_group_exit call below will use
                         * that value and ignore the one we pass it.
                         */
                        do_coredump(&ksig->info);
                }

                /*
                 * PF_USER_WORKER threads will catch and exit on fatal signals
                 * themselves. They have cleanup that must be performed, so we
                 * cannot call do_exit() on their behalf. Note that ksig won't
                 * be properly initialized, PF_USER_WORKER's shouldn't use it.
                 */
                if (current->flags & PF_USER_WORKER)
                        goto out;

                /*
                 * Death signals, no core dump.
                 */
                do_group_exit(signr);
                /* NOTREACHED */
        }
        /* Reached via "break": either no signal is left or a handler must run. */
        spin_unlock_irq(&sighand->siglock);

        ksig->sig = signr;

        if (signr && !(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS))
                hide_si_addr_tag_bits(ksig);
out:
        return signr > 0;
}

/**
 * signal_delivered - update the blocked mask after a successful delivery
 * @ksig:                kernel signal struct of the delivered signal
 * @stepping:                nonzero if debugger single-step or block-step in use
 *
 * To be called once a signal has been delivered successfully.  The new
 * blocked mask is the current one plus @ksig->ka.sa.sa_mask, and the
 * delivered signal itself is added as well unless %SA_NODEFER is set in
 * @ksig->ka.sa.sa_flags.  When @stepping is nonzero the tracer is notified
 * with %SIGTRAP.
 */
static void signal_delivered(struct ksignal *ksig, int stepping)
{
        sigset_t newmask;

        /*
         * The saved sigmask has been stored on the signal frame and
         * sigreturn will restore it from there, so the restore-sigmask
         * flag can simply be dropped.
         */
        clear_restore_sigmask();

        sigorsets(&newmask, &current->blocked, &ksig->ka.sa.sa_mask);
        if ((ksig->ka.sa.sa_flags & SA_NODEFER) == 0)
                sigaddset(&newmask, ksig->sig);
        set_current_blocked(&newmask);

        if (current->sas_ss_flags & SS_AUTODISARM)
                sas_ss_reset(current);

        if (stepping != 0)
                ptrace_notify(SIGTRAP, 0);
}

/*
 * Finish signal frame setup: on success do the post-delivery bookkeeping,
 * on failure force a SIGSEGV for the signal that could not be set up.
 */
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
{
        if (!failed) {
                signal_delivered(ksig, stepping);
                return;
        }

        force_sigsegv(ksig->sig);
}

/*
 * complete_signal() may have chosen us as the thread to be notified about
 * a group-wide signal.  Since we are not going to take the shared signals
 * in @which, wake up other threads so that they can take them instead.
 */
static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
{
        struct task_struct *thread;
        sigset_t remaining;

        /* Only shared pending signals that are also in @which matter. */
        sigandsets(&remaining, &tsk->signal->shared_pending.signal, which);
        if (sigisemptyset(&remaining))
                return;

        for_other_threads(tsk, thread) {
                /* Skip exiting threads and threads with nothing to take. */
                if ((thread->flags & PF_EXITING) ||
                    !has_pending_signals(&remaining, &thread->blocked))
                        continue;

                /*
                 * Drop the signals this thread can handle: what is left
                 * is the part of the set that it has blocked.
                 */
                sigandsets(&remaining, &remaining, &thread->blocked);

                if (!task_sigpending(thread))
                        signal_wake_up(thread, 0);

                /* Every signal has found a taker, no need to look further. */
                if (sigisemptyset(&remaining))
                        break;
        }
}

/*
 * Mark @tsk as PF_EXITING and, when it belongs to a multi-threaded group
 * that is not already exiting, hand its share of the pending shared
 * signals to the other threads and complete its part of a pending group
 * stop.
 */
void exit_signals(struct task_struct *tsk)
{
        int notify_why = 0;
        sigset_t unblocked;

        /*
         * @tsk is about to have PF_EXITING set - lock out users which
         * expect stable threadgroup.
         */
        cgroup_threadgroup_change_begin(tsk);

        /* Single thread, or the whole group is exiting: no siglock taken. */
        if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
                sched_mm_cid_exit_signals(tsk);
                tsk->flags |= PF_EXITING;
                cgroup_threadgroup_change_end(tsk);
                return;
        }

        spin_lock_irq(&tsk->sighand->siglock);
        /*
         * From now this task is not visible for group-wide signals,
         * see wants_signal(), do_signal_stop().
         */
        sched_mm_cid_exit_signals(tsk);
        tsk->flags |= PF_EXITING;

        cgroup_threadgroup_change_end(tsk);

        if (task_sigpending(tsk)) {
                /* Pass on every shared signal this thread had unblocked. */
                unblocked = tsk->blocked;
                signotset(&unblocked);
                retarget_shared_pending(tsk, &unblocked);

                if (unlikely(tsk->jobctl & JOBCTL_STOP_PENDING) &&
                    task_participate_group_stop(tsk))
                        notify_why = CLD_STOPPED;
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        /*
         * If group stop has completed, deliver the notification.  This
         * should always go to the real parent of the group leader.
         */
        if (unlikely(notify_why)) {
                read_lock(&tasklist_lock);
                do_notify_parent_cldstop(tsk, false, notify_why);
                read_unlock(&tasklist_lock);
        }
}

/*
 * System call entry points.
 */

/**
 *  sys_restart_syscall - restart a system call
 *
 *  Calls the ->fn callback stored in current's restart_block, passing the
 *  restart_block itself, and returns whatever the callback returns.
 */
SYSCALL_DEFINE0(restart_syscall)
{
        return current->restart_block.fn(&current->restart_block);
}

/*
 * Restart handler that never restarts: @param is ignored and -EINTR is
 * always returned.
 */
long do_no_restart_syscall(struct restart_block *param)
{
        return -EINTR;
}

/*
 * Install @newset as @tsk's blocked mask.  If @tsk has signals pending and
 * is not the only thread in its group, the signals that become blocked by
 * this change are first offered to the other threads.
 *
 * NOTE(review): the previous mask is read from current->blocked and
 * recalc_sigpending() takes no task argument, so this presumably expects
 * @tsk == current - confirm against callers.
 */
static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
{
        sigset_t newly_blocked;

        if (task_sigpending(tsk) && !thread_group_empty(tsk)) {
                /* Signals blocked by @newset that were not blocked before. */
                sigandnsets(&newly_blocked, newset, &current->blocked);
                retarget_shared_pending(tsk, &newly_blocked);
        }

        tsk->blocked = *newset;
        recalc_sigpending();
}

/**
 * set_current_blocked - change current->blocked mask
 * @newset: new mask; modified in place, SIGKILL and SIGSTOP are removed
 *          from it before it is installed
 *
 * It is wrong to change ->blocked directly, this helper should be used
 * to ensure the process can't miss a shared signal we are going to block.
 */
void set_current_blocked(sigset_t *newset)
{
        /* SIGKILL and SIGSTOP can never be blocked through this helper. */
        sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
        __set_current_blocked(newset);
}

/*
 * Install @newset as current's blocked mask, exactly as given.  The
 * siglock is only taken when the mask actually changes.
 */
void __set_current_blocked(const sigset_t *newset)
{
        struct task_struct *me = current;

        /*
         * current->blocked shouldn't be modified by any other task, so an
         * unlocked comparison is enough to detect the "nothing to do" case.
         */
        if (!sigequalsets(&me->blocked, newset)) {
                spin_lock_irq(&me->sighand->siglock);
                __set_task_blocked(me, newset);
                spin_unlock_irq(&me->sighand->siglock);
        }
}

/*
 * Kernel-internal way to change current's blocked mask; also useful for
 * kernel threads that want to block certain signals temporarily or
 * permanently.
 *
 * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
 * interface happily blocks "unblockable" signals like SIGKILL
 * and friends.
 *
 * @how selects SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK; any other value
 * yields -EINVAL.  When @oldset is non-NULL it receives the previous mask,
 * and this happens before @how is validated.
 */
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
        struct task_struct *me = current;
        sigset_t updated;

        /* Lockless, only current can change ->blocked, never from irq */
        if (oldset)
                *oldset = me->blocked;

        if (how == SIG_BLOCK)
                sigorsets(&updated, &me->blocked, set);
        else if (how == SIG_UNBLOCK)
                sigandnsets(&updated, &me->blocked, set);
        else if (how == SIG_SETMASK)
                updated = *set;
        else
                return -EINVAL;

        __set_current_blocked(&updated);
        return 0;
}
EXPORT_SYMBOL(sigprocmask);

/*
 * Install a signal mask supplied by the application.
 *
 * Meant for syscalls such as ppoll, pselect, io_pgetevents and epoll_pwait,
 * which receive a new sigmask from userland.
 *
 * set_restore_sigmask() is done here in advance, so every call must be
 * paired with restore_saved_sigmask_unless() before returning from the
 * syscall.
 *
 * Returns 0 on success (including when @umask is NULL, in which case
 * nothing is changed), -EINVAL when @sigsetsize is not sizeof(sigset_t),
 * and -EFAULT when the mask cannot be copied from userspace.
 */
int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize)
{
        sigset_t requested;

        if (!umask)
                return 0;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&requested, umask, sizeof(sigset_t)))
                return -EFAULT;

        /* Remember the old mask so that it can be put back later. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of set_user_sigmask(): the mask arrives as a
 * compat_sigset_t and is converted with get_compat_sigset().
 *
 * Returns 0 on success (including when @umask is NULL, in which case
 * nothing is changed), -EINVAL when @sigsetsize is not
 * sizeof(compat_sigset_t), and -EFAULT when the mask cannot be fetched.
 */
int set_compat_user_sigmask(const compat_sigset_t __user *umask,
                            size_t sigsetsize)
{
        sigset_t requested;

        if (!umask)
                return 0;

        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&requested, umask))
                return -EFAULT;

        /* Remember the old mask so that it can be put back later. */
        set_restore_sigmask();
        current->saved_sigmask = current->blocked;
        set_current_blocked(&requested);

        return 0;
}
#endif

/**
 *  sys_rt_sigprocmask - change the list of currently blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to apply according to @how; if null the mask is left
 *         unchanged
 *  @oset: previous value of signal mask if non-null
 *  @sigsetsize: size of sigset_t type
 *
 *  SIGKILL and SIGSTOP are removed from @nset before it is applied.
 */
SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
                sigset_t __user *, oset, size_t, sigsetsize)
{
        sigset_t prev_mask, requested;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        /* Snapshot the mask before any change so that @oset sees the old one. */
        prev_mask = current->blocked;

        if (nset) {
                if (copy_from_user(&requested, nset, sizeof(sigset_t)))
                        return -EFAULT;
                sigdelsetmask(&requested, sigmask(SIGKILL)|sigmask(SIGSTOP));

                ret = sigprocmask(how, &requested, NULL);
                if (ret)
                        return ret;
        }

        if (oset && copy_to_user(oset, &prev_mask, sizeof(sigset_t)))
                return -EFAULT;

        return 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigprocmask: the sets are exchanged with
 * userspace as compat_sigset_t, while @sigsetsize is still checked against
 * sizeof(sigset_t).
 */
COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
                compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
{
        /* Taken before any change so that @oset reports the old mask. */
        sigset_t prev_mask = current->blocked;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (nset) {
                sigset_t requested;
                int ret;

                if (get_compat_sigset(&requested, nset))
                        return -EFAULT;
                sigdelsetmask(&requested, sigmask(SIGKILL)|sigmask(SIGSTOP));

                ret = sigprocmask(how, &requested, NULL);
                if (ret)
                        return ret;
        }

        if (!oset)
                return 0;

        return put_compat_sigset(oset, &prev_mask, sizeof(*oset));
}
#endif

/*
 * Store in @set the signals that are pending for current (private or
 * shared) and currently blocked.
 */
static void do_sigpending(sigset_t *set)
{
        struct sighand_struct *sighand = current->sighand;

        /* Both pending sets are read under the siglock. */
        spin_lock_irq(&sighand->siglock);
        sigorsets(set, &current->pending.signal,
                  &current->signal->shared_pending.signal);
        spin_unlock_irq(&sighand->siglock);

        /* Outside the lock because only this thread touches it.  */
        sigandsets(set, &current->blocked, set);
}

/**
 *  sys_rt_sigpending - examine pending signals that have been raised
 *                        while blocked
 *  @uset: stores pending signals
 *  @sigsetsize: number of bytes to copy to @uset; must not be larger than
 *               the size of sigset_t type, otherwise -EINVAL is returned
 */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
{
        sigset_t pending;

        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return copy_to_user(uset, &pending, sigsetsize) ? -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for rt_sigpending: the blocked-and-pending set is
 * written out with put_compat_sigset().  A @sigsetsize larger than the
 * compat set is rejected with -EINVAL.
 */
COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
                compat_size_t, sigsetsize)
{
        sigset_t pending;

        if (sigsetsize > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return put_compat_sigset(uset, &pending, sigsetsize);
}
#endif

/*
 * Per-signal si_code information, indexed by signal number:
 *   limit  - largest si_code with a signal-specific meaning for the signal
 *   layout - siginfo layout used for those signal-specific codes
 * Signals without an entry are zero-initialized, so their limit is 0.
 */
static const struct {
        unsigned char limit, layout;
} sig_sicodes[] = {
        [SIGILL]  = { NSIGILL,  SIL_FAULT },
        [SIGFPE]  = { NSIGFPE,  SIL_FAULT },
        [SIGSEGV] = { NSIGSEGV, SIL_FAULT },
        [SIGBUS]  = { NSIGBUS,  SIL_FAULT },
        [SIGTRAP] = { NSIGTRAP, SIL_FAULT },
#if defined(SIGEMT)
        [SIGEMT]  = { NSIGEMT,  SIL_FAULT },
#endif
        [SIGCHLD] = { NSIGCHLD, SIL_CHLD },
        [SIGPOLL] = { NSIGPOLL, SIL_POLL },
        [SIGSYS]  = { NSIGSYS,  SIL_SYS },
};

/*
 * Tell whether the (@sig, @si_code) pair is one this file knows about.
 *
 * SI_KERNEL is always known.  Codes above SI_USER are known up to the
 * per-signal limit for signals with specific si_codes, and up to NSIGPOLL
 * for the rest.  Codes at or below SI_USER are known from SI_DETHREAD
 * upwards, plus SI_ASYNCNL.
 */
static bool known_siginfo_layout(unsigned sig, int si_code)
{
        if (si_code == SI_KERNEL)
                return true;

        if (si_code > SI_USER) {
                if (sig_specific_sicodes(sig))
                        return si_code <= sig_sicodes[sig].limit;

                return si_code <= NSIGPOLL;
        }

        return si_code >= SI_DETHREAD || si_code == SI_ASYNCNL;
}

/*
 * Map a (@sig, @si_code) pair to the siginfo layout that describes which
 * union members of the siginfo are in use.  SIL_KILL is the fallback.
 */
enum siginfo_layout siginfo_layout(unsigned sig, int si_code)
{
        /* Codes outside (SI_USER, SI_KERNEL) do not depend on the signal. */
        if (si_code <= SI_USER || si_code >= SI_KERNEL) {
                if (si_code == SI_TIMER)
                        return SIL_TIMER;
                if (si_code == SI_SIGIO)
                        return SIL_POLL;
                if (si_code < 0)
                        return SIL_RT;
                return SIL_KILL;
        }

        /* No table entry covers this code: only the SIGPOLL range is left. */
        if (sig >= ARRAY_SIZE(sig_sicodes) ||
            si_code > sig_sicodes[sig].limit) {
                if (si_code <= NSIGPOLL)
                        return SIL_POLL;
                return SIL_KILL;
        }

        /* Handle the exceptions to the per-signal table, first match wins. */
        if ((sig == SIGBUS) &&
            (si_code >= BUS_MCEERR_AR) && (si_code <= BUS_MCEERR_AO))
                return SIL_FAULT_MCEERR;
        if ((sig == SIGSEGV) && (si_code == SEGV_BNDERR))
                return SIL_FAULT_BNDERR;
#ifdef SEGV_PKUERR
        if ((sig == SIGSEGV) && (si_code == SEGV_PKUERR))
                return SIL_FAULT_PKUERR;
#endif
        if ((sig == SIGTRAP) && (si_code == TRAP_PERF))
                return SIL_FAULT_PERF_EVENT;
        if (IS_ENABLED(CONFIG_SPARC) &&
            (sig == SIGILL) && (si_code == ILL_ILLTRP))
                return SIL_FAULT_TRAPNO;
        if (IS_ENABLED(CONFIG_ALPHA) &&
            ((sig == SIGFPE) ||
             ((sig == SIGTRAP) && (si_code == TRAP_UNK))))
                return SIL_FAULT_TRAPNO;

        /* The common case: the layout recorded in the table. */
        return sig_sicodes[sig].layout;
}

/*
 * Return the user-space address of the bytes that follow the first
 * sizeof(struct kernel_siginfo) bytes of the user siginfo at @info.
 */
static inline char __user *si_expansion(const siginfo_t __user *info)
{
        return ((char __user *)info) + sizeof(struct kernel_siginfo);
}

/*
 * Copy a kernel siginfo to the user siginfo at @to and zero the
 * SI_EXPANSION_SIZE bytes of the user structure that follow it.
 *
 * Returns 0 on success, -EFAULT if either user access fails.
 */
int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from)
{
        if (copy_to_user(to, from, sizeof(struct kernel_siginfo)))
                return -EFAULT;

        if (clear_user(si_expansion(to), SI_EXPANSION_SIZE))
                return -EFAULT;

        return 0;
}

/*
 * Second half of copying a siginfo in from userspace.  When the
 * si_signo/si_code pair in @info is not a known one, the expansion area of
 * the user structure at @from must be all zeroes.
 *
 * Returns 0 if the siginfo is acceptable, -EFAULT if the expansion area
 * cannot be read, -E2BIG if it contains a non-zero byte.
 */
static int post_copy_siginfo_from_user(kernel_siginfo_t *info,
                                       const siginfo_t __user *from)
{
        char tail[SI_EXPANSION_SIZE];
        int pos;

        if (likely(known_siginfo_layout(info->si_signo, info->si_code)))
                return 0;

        /*
         * An unknown si_code might need more than
         * sizeof(struct kernel_siginfo) bytes.  Verify all of the
         * extra bytes are 0.  This guarantees copy_siginfo_to_user
         * will return this data to userspace exactly.
         */
        if (copy_from_user(tail, si_expansion(from), SI_EXPANSION_SIZE))
                return -EFAULT;

        for (pos = 0; pos < SI_EXPANSION_SIZE; pos++) {
                if (tail[pos])
                        return -E2BIG;
        }

        return 0;
}

/*
 * Copy a siginfo in from userspace, overriding whatever si_signo it
 * contained with @signo before the result is validated.
 *
 * Returns 0 on success, -EFAULT if the copy fails, or the error from
 * post_copy_siginfo_from_user().
 */
static int __copy_siginfo_from_user(int signo, kernel_siginfo_t *to,
                                    const siginfo_t __user *from)
{
        if (!copy_from_user(to, from, sizeof(struct kernel_siginfo))) {
                to->si_signo = signo;
                return post_copy_siginfo_from_user(to, from);
        }

        return -EFAULT;
}

/*
 * Copy a siginfo in from userspace and validate it.
 *
 * Returns 0 on success, -EFAULT if the copy fails, or the error from
 * post_copy_siginfo_from_user().
 */
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from)
{
        if (!copy_from_user(to, from, sizeof(struct kernel_siginfo)))
                return post_copy_siginfo_from_user(to, from);

        return -EFAULT;
}

#ifdef CONFIG_COMPAT
/**
 * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo
 * @to: compat siginfo destination
 * @from: kernel siginfo source
 *
 * Note: This function does not work properly for the SIGCHLD on x32, but
 * fortunately it doesn't have to.  The only valid callers for this function are
 * copy_siginfo_to_user32, which is overridden for x32 and the coredump code.
 * The latter does not care because SIGCHLD will never cause a coredump.
 */
void copy_siginfo_to_external32(struct compat_siginfo *to,
                const struct kernel_siginfo *from)
{
        memset(to, 0, sizeof(*to));

        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = ptr_to_compat(from->si_addr);
                break;
        case SIL_FAULT_TRAPNO:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_trapno = from->si_trapno;
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_lower = ptr_to_compat(from->si_lower);
                to->si_upper = ptr_to_compat(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_pkey = from->si_pkey;
                break;
        case SIL_FAULT_PERF_EVENT:
                to->si_addr = ptr_to_compat(from->si_addr);
                to->si_perf_data = from->si_perf_data;
                to->si_perf_type = from->si_perf_type;
                to->si_perf_flags = from->si_perf_flags;
                break;
        case SIL_CHLD:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_status = from->si_status;
                to->si_utime = from->si_utime;
                to->si_stime = from->si_stime;
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = ptr_to_compat(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
}

/*
 * Convert @from into a compat siginfo on the stack and copy it out to @to.
 * Returns 0 on success or -EFAULT when the copy to userspace fails.
 */
int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
                           const struct kernel_siginfo *from)
{
        struct compat_siginfo ext;

        copy_siginfo_to_external32(&ext, from);

        return copy_to_user(to, &ext, sizeof(ext)) ? -EFAULT : 0;
}

/*
 * Build a kernel siginfo in @to from an already copied-in compat siginfo
 * @from.  @to is cleared first, the three header fields are copied, and then
 * only the fields of the layout reported by siginfo_layout() are filled in;
 * compat pointers are widened with compat_ptr().  Always returns 0.
 */
static int post_copy_siginfo_from_user32(kernel_siginfo_t *to,
                                         const struct compat_siginfo *from)
{
        clear_siginfo(to);
        to->si_signo = from->si_signo;
        to->si_errno = from->si_errno;
        to->si_code  = from->si_code;
        switch(siginfo_layout(from->si_signo, from->si_code)) {
        case SIL_KILL:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                break;
        case SIL_TIMER:
                to->si_tid     = from->si_tid;
                to->si_overrun = from->si_overrun;
                to->si_int     = from->si_int;
                break;
        case SIL_POLL:
                to->si_band = from->si_band;
                to->si_fd   = from->si_fd;
                break;
        case SIL_FAULT:
                to->si_addr = compat_ptr(from->si_addr);
                break;
        case SIL_FAULT_TRAPNO:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_trapno = from->si_trapno;
                break;
        case SIL_FAULT_MCEERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_addr_lsb = from->si_addr_lsb;
                break;
        case SIL_FAULT_BNDERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_lower = compat_ptr(from->si_lower);
                to->si_upper = compat_ptr(from->si_upper);
                break;
        case SIL_FAULT_PKUERR:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_pkey = from->si_pkey;
                break;
        case SIL_FAULT_PERF_EVENT:
                to->si_addr = compat_ptr(from->si_addr);
                to->si_perf_data = from->si_perf_data;
                to->si_perf_type = from->si_perf_type;
                to->si_perf_flags = from->si_perf_flags;
                break;
        case SIL_CHLD:
                to->si_pid    = from->si_pid;
                to->si_uid    = from->si_uid;
                to->si_status = from->si_status;
#ifdef CONFIG_X86_X32_ABI
                /* In an x32 syscall the times are read from the x32 variant. */
                if (in_x32_syscall()) {
                        to->si_utime = from->_sifields._sigchld_x32._utime;
                        to->si_stime = from->_sifields._sigchld_x32._stime;
                } else
#endif
                {
                        to->si_utime = from->si_utime;
                        to->si_stime = from->si_stime;
                }
                break;
        case SIL_RT:
                to->si_pid = from->si_pid;
                to->si_uid = from->si_uid;
                to->si_int = from->si_int;
                break;
        case SIL_SYS:
                to->si_call_addr = compat_ptr(from->si_call_addr);
                to->si_syscall   = from->si_syscall;
                to->si_arch      = from->si_arch;
                break;
        }
        return 0;
}

/*
 * Copy a compat siginfo in from userspace, force its si_signo to @signo and
 * convert it into the kernel representation in @to.
 */
static int __copy_siginfo_from_user32(int signo, struct kernel_siginfo *to,
                                      const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo tmp;

        if (copy_from_user(&tmp, ufrom, sizeof(tmp)))
                return -EFAULT;

        /* The caller-supplied signal number replaces whatever was copied. */
        tmp.si_signo = signo;

        return post_copy_siginfo_from_user32(to, &tmp);
}

/*
 * Copy a compat siginfo in from userspace and convert it into the kernel
 * representation in @to, keeping the si_signo userspace supplied.
 */
int copy_siginfo_from_user32(struct kernel_siginfo *to,
                             const struct compat_siginfo __user *ufrom)
{
        struct compat_siginfo tmp;

        if (copy_from_user(&tmp, ufrom, sizeof(tmp)))
                return -EFAULT;

        return post_copy_siginfo_from_user32(to, &tmp);
}
#endif /* CONFIG_COMPAT */

/**
 *  do_sigtimedwait - wait for queued signals specified in @which
 *  @which: queued signals to wait for
 *  @info: if non-null, the signal's siginfo is returned here
 *  @ts: upper bound on process time suspension
 *
 *  Return: the dequeued signal number when one was obtained; -EINVAL when
 *  @ts is not a valid timespec; otherwise -EINTR when the sleep returned a
 *  non-zero value and -EAGAIN when it returned zero or no sleep took place.
 */
static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info,
                    const struct timespec64 *ts)
{
        /* With no @ts, 'to' stays NULL and 'timeout' stays KTIME_MAX. */
        ktime_t *to = NULL, timeout = KTIME_MAX;
        struct task_struct *tsk = current;
        sigset_t mask = *which;
        enum pid_type type;
        int sig, ret = 0;

        if (ts) {
                if (!timespec64_valid(ts))
                        return -EINVAL;
                timeout = timespec64_to_ktime(*ts);
                to = &timeout;
        }

        /*
         * Invert the set of allowed signals to get those we want to block.
         */
        sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
        signotset(&mask);

        spin_lock_irq(&tsk->sighand->siglock);
        sig = dequeue_signal(&mask, info, &type);
        /* A zero timeout means poll only: do not sleep. */
        if (!sig && timeout) {
                /*
                 * None ready, temporarily unblock those we're interested
                 * while we are sleeping in so that we'll be awakened when
                 * they arrive. Unblocking is always fine, we can avoid
                 * set_current_blocked().
                 */
                tsk->real_blocked = tsk->blocked;
                sigandsets(&tsk->blocked, &tsk->blocked, &mask);
                recalc_sigpending();
                spin_unlock_irq(&tsk->sighand->siglock);

                __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns,
                                               HRTIMER_MODE_REL);
                /* Restore the saved blocked set, then retry the dequeue. */
                spin_lock_irq(&tsk->sighand->siglock);
                __set_task_blocked(tsk, &tsk->real_blocked);
                sigemptyset(&tsk->real_blocked);
                sig = dequeue_signal(&mask, info, &type);
        }
        spin_unlock_irq(&tsk->sighand->siglock);

        if (sig)
                return sig;
        return ret ? -EINTR : -EAGAIN;
}

/**
 *  sys_rt_sigtimedwait - synchronously wait for queued signals specified
 *                        in @uthese
 *  @uthese: queued signals to wait for
 *  @uinfo: if non-null, the signal's siginfo is returned here
 *  @uts: upper bound on process time suspension
 *  @sigsetsize: size of sigset_t type
 */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct __kernel_timespec __user *, uts,
                size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 ts;
        kernel_siginfo_t info;
        sigset_t these;
        int sig;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&these, uthese, sizeof(these)))
                return -EFAULT;

        /* A NULL @uts means no upper bound is passed down. */
        if (uts) {
                if (get_timespec64(&ts, uts))
                        return -EFAULT;
                timeout = &ts;
        }

        sig = do_sigtimedwait(&these, &info, timeout);
        if (sig <= 0 || !uinfo)
                return sig;

        /* A signal was dequeued and the caller asked for its siginfo. */
        if (copy_siginfo_to_user(uinfo, &info))
                return -EFAULT;

        return sig;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Same flow as rt_sigtimedwait, except that the timeout is read from a
 * struct old_timespec32 via get_old_timespec32().
 */
SYSCALL_DEFINE4(rt_sigtimedwait_time32, const sigset_t __user *, uthese,
                siginfo_t __user *, uinfo,
                const struct old_timespec32 __user *, uts,
                size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 ts;
        kernel_siginfo_t info;
        sigset_t these;
        int sig;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (copy_from_user(&these, uthese, sizeof(these)))
                return -EFAULT;

        if (uts) {
                if (get_old_timespec32(&ts, uts))
                        return -EFAULT;
                timeout = &ts;
        }

        sig = do_sigtimedwait(&these, &info, timeout);
        if (sig <= 0 || !uinfo)
                return sig;

        if (copy_siginfo_to_user(uinfo, &info))
                return -EFAULT;

        return sig;
}
#endif

#ifdef CONFIG_COMPAT
/*
 * Compat entry point: the signal set is read with get_compat_sigset() and the
 * resulting siginfo is written back with copy_siginfo_to_user32().
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time64, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct __kernel_timespec __user *, uts, compat_size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 t;
        kernel_siginfo_t info;
        sigset_t s;
        long sig;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&s, uthese))
                return -EFAULT;

        if (uts) {
                if (get_timespec64(&t, uts))
                        return -EFAULT;
                timeout = &t;
        }

        sig = do_sigtimedwait(&s, &info, timeout);
        if (sig <= 0 || !uinfo)
                return sig;

        if (copy_siginfo_to_user32(uinfo, &info))
                return -EFAULT;

        return sig;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Compat entry point taking a struct old_timespec32 timeout; the signal set
 * is read with get_compat_sigset() and the siginfo is written back with
 * copy_siginfo_to_user32().
 */
COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
                struct compat_siginfo __user *, uinfo,
                struct old_timespec32 __user *, uts, compat_size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 t;
        kernel_siginfo_t info;
        sigset_t s;
        long sig;

        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&s, uthese))
                return -EFAULT;

        if (uts) {
                if (get_old_timespec32(&t, uts))
                        return -EFAULT;
                timeout = &t;
        }

        sig = do_sigtimedwait(&s, &info, timeout);
        if (sig <= 0 || !uinfo)
                return sig;

        if (copy_siginfo_to_user32(uinfo, &info))
                return -EFAULT;

        return sig;
}
#endif
#endif

/*
 * Fill @info for a signal sent by the current task: si_code is SI_TKILL for
 * PIDTYPE_PID and SI_USER for every other @type, and the sender's tgid and
 * uid are recorded in si_pid and si_uid.
 */
static void prepare_kill_siginfo(int sig, struct kernel_siginfo *info,
                                 enum pid_type type)
{
        clear_siginfo(info);

        info->si_signo = sig;
        info->si_errno = 0;
        if (type == PIDTYPE_PID)
                info->si_code = SI_TKILL;
        else
                info->si_code = SI_USER;
        info->si_pid = task_tgid_vnr(current);
        info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
}

/**
 *  sys_kill - send a signal to a process
 *  @pid: the PID of the process
 *  @sig: signal to be sent
 */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
        struct kernel_siginfo kinfo;

        /* The siginfo is prepared with thread-group scope (PIDTYPE_TGID). */
        prepare_kill_siginfo(sig, &kinfo, PIDTYPE_TGID);
        return kill_something_info(sig, &kinfo, pid);
}

/*
 * Check that the signaler's active pid namespace is either the signalee's
 * pid namespace or one of its ancestors.  Returns true when it is, false
 * when the parent chain ends without meeting it.
 */
static bool access_pidfd_pidns(struct pid *pid)
{
        struct pid_namespace *active = task_active_pid_ns(current);
        struct pid_namespace *ns;

        /* Walk from the signalee's namespace up through its parents. */
        for (ns = ns_of_pid(pid); ns; ns = ns->parent)
                if (ns == active)
                        return true;

        return false;
}

/*
 * Copy a siginfo in from userspace, choosing the compat or the native copy
 * routine depending on the kind of syscall currently being executed.
 */
static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
                siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
        /*
         * Avoid hooking up compat syscalls and instead handle necessary
         * conversions here. Note, this is a stop-gap measure and should not be
         * considered a generic solution.
         */
        if (in_compat_syscall()) {
                struct compat_siginfo __user *info32;

                info32 = (struct compat_siginfo __user *)info;
                return copy_siginfo_from_user32(kinfo, info32);
        }
#endif
        return copy_siginfo_from_user(kinfo, info);
}

/*
 * Resolve @file to a struct pid: try pidfd_pid() first and fall back to
 * tgid_pidfd_to_pid() when that yields an error pointer.
 */
static struct pid *pidfd_to_pid(const struct file *file)
{
        struct pid *pid = pidfd_pid(file);

        if (IS_ERR(pid))
                pid = tgid_pidfd_to_pid(file);

        return pid;
}

/*
 * All flag bits accepted by pidfd_send_signal().  Each one selects the scope
 * of the signal (thread, thread group or process group).
 */
#define PIDFD_SEND_SIGNAL_FLAGS                            \
        (PIDFD_SIGNAL_THREAD | PIDFD_SIGNAL_THREAD_GROUP | \
         PIDFD_SIGNAL_PROCESS_GROUP)

/*
 * Send @sig to @pid with the scope given by @type, unless @flags names an
 * explicit scope, in which case that scope wins.  When @info is non-NULL the
 * siginfo comes from userspace and is validated; otherwise a kill()-style
 * siginfo is prepared here.  Returns the result of the kill helper, or
 * -EINVAL / -EPERM / a copy error when the user siginfo is rejected.
 */
static int do_pidfd_send_signal(struct pid *pid, int sig, enum pid_type type,
                                siginfo_t __user *info, unsigned int flags)
{
        kernel_siginfo_t kinfo;

        /* No default case: any other @flags value keeps the caller's @type. */
        switch (flags) {
        case PIDFD_SIGNAL_THREAD:
                type = PIDTYPE_PID;
                break;
        case PIDFD_SIGNAL_THREAD_GROUP:
                type = PIDTYPE_TGID;
                break;
        case PIDFD_SIGNAL_PROCESS_GROUP:
                type = PIDTYPE_PGID;
                break;
        }

        if (info) {
                int ret;

                ret = copy_siginfo_from_user_any(&kinfo, info);
                if (unlikely(ret))
                        return ret;

                /* The siginfo must describe the same signal as @sig. */
                if (unlikely(sig != kinfo.si_signo))
                        return -EINVAL;

                /* Only allow sending arbitrary signals to yourself. */
                if ((task_pid(current) != pid || type > PIDTYPE_TGID) &&
                    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL))
                        return -EPERM;
        } else {
                prepare_kill_siginfo(sig, &kinfo, type);
        }

        /* Process-group scope is delivered by a separate helper. */
        if (type == PIDTYPE_PGID)
                return kill_pgrp_info(sig, &kinfo, pid);

        return kill_pid_info_type(sig, &kinfo, pid, type);
}

/**
 * sys_pidfd_send_signal - Signal a process through a pidfd
 * @pidfd:  file descriptor of the process, or one of the PIDFD_SELF_THREAD /
 *          PIDFD_SELF_THREAD_GROUP sentinels to refer to the caller itself
 * @sig:    signal to send
 * @info:   signal info
 * @flags:  zero, or exactly one of the PIDFD_SIGNAL_* scope flags
 *
 * Send the signal to the thread group or to the individual thread depending
 * on PIDFD_THREAD.  A scope flag in @flags overrides the scope inferred from
 * @pidfd.
 *
 * Return: 0 on success, negative errno on failure
 */
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
                siginfo_t __user *, info, unsigned int, flags)
{
        struct pid *pid;
        enum pid_type type;
        int ret;

        /* Reject any flag bit outside the supported scope flags. */
        if (flags & ~PIDFD_SEND_SIGNAL_FLAGS)
                return -EINVAL;

        /* Ensure that only a single signal scope determining flag is set. */
        if (hweight32(flags & PIDFD_SEND_SIGNAL_FLAGS) > 1)
                return -EINVAL;

        switch (pidfd) {
        case PIDFD_SELF_THREAD:
                pid = get_task_pid(current, PIDTYPE_PID);
                type = PIDTYPE_PID;
                break;
        case PIDFD_SELF_THREAD_GROUP:
                pid = get_task_pid(current, PIDTYPE_TGID);
                type = PIDTYPE_TGID;
                break;
        default: {
                CLASS(fd, f)(pidfd);
                if (fd_empty(f))
                        return -EBADF;

                /* Is this a pidfd? */
                pid = pidfd_to_pid(fd_file(f));
                if (IS_ERR(pid))
                        return PTR_ERR(pid);

                if (!access_pidfd_pidns(pid))
                        return -EINVAL;

                /* Infer scope from the type of pidfd. */
                if (fd_file(f)->f_flags & PIDFD_THREAD)
                        type = PIDTYPE_PID;
                else
                        type = PIDTYPE_TGID;

                /*
                 * No pid reference is taken on this path, so there is
                 * nothing to drop: return directly.
                 */
                return do_pidfd_send_signal(pid, sig, type, info, flags);
        }
        }

        /*
         * Only the PIDFD_SELF_* cases reach this point.  They obtained @pid
         * through get_task_pid(), which takes a reference that must be
         * dropped again or it is leaked on every call.
         */
        ret = do_pidfd_send_signal(pid, sig, type, info, flags);
        put_pid(pid);

        return ret;
}

/*
 * Send @sig with @info to the single task whose virtual pid is @pid.  When
 * @tgid is positive the task must also belong to that thread group.
 * Returns 0 on success, -ESRCH when no matching task is found, or the error
 * from the permission check or the send.
 */
static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{
        struct task_struct *p;
        int error = -ESRCH;

        /* The task lookup and its use stay inside one RCU read-side section. */
        rcu_read_lock();
        p = find_task_by_vpid(pid);
        if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
                error = check_kill_permission(sig, info, p);
                /*
                 * The null signal is a permissions and process existence
                 * probe.  No signal is actually delivered.
                 */
                if (!error && sig) {
                        error = do_send_sig_info(sig, info, p, PIDTYPE_PID);
                        /*
                         * If lock_task_sighand() failed we pretend the task
                         * dies after receiving the signal. The window is tiny,
                         * and the signal is private anyway.
                         */
                        if (unlikely(error == -ESRCH))
                                error = 0;
                }
        }
        rcu_read_unlock();

        return error;
}

/*
 * Common helper for tkill() and tgkill(): prepare a thread-scoped siginfo
 * for the current sender and hand it to do_send_specific().
 */
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
        struct kernel_siginfo kinfo;

        prepare_kill_siginfo(sig, &kinfo, PIDTYPE_PID);
        return do_send_specific(tgid, pid, sig, &kinfo);
}

/**
 *  sys_tgkill - send signal to one specific thread
 *  @tgid: the thread group ID of the thread
 *  @pid: the PID of the thread
 *  @sig: signal to be sent
 *
 *  This syscall also checks the @tgid and returns -ESRCH even if the PID
 *  exists but it's not belonging to the target process anymore. This
 *  method solves the problem of threads exiting and PIDs getting reused.
 */
SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
{
        /* Both ids must name a single task, so neither may be <= 0. */
        if (tgid <= 0 || pid <= 0)
                return -EINVAL;

        return do_tkill(tgid, pid, sig);
}

/**
 *  sys_tkill - send signal to one specific task
 *  @pid: the PID of the task
 *  @sig: signal to be sent
 *
 *  Send a signal to only one task, even if it's a CLONE_THREAD task.
 */
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
{
        /* Only a single task can be addressed, so @pid must be positive. */
        if (pid < 1)
                return -EINVAL;

        /* A tgid of 0 disables the thread-group check. */
        return do_tkill(0, pid, sig);
}

/*
 * Queue @sig with caller-supplied @info to @pid after checking that the
 * si_code is one the caller is allowed to use.  Returns -EPERM when it is
 * not, otherwise the result of kill_proc_info().
 */
static int do_rt_sigqueueinfo(pid_t pid, int sig, kernel_siginfo_t *info)
{
        /*
         * Not even root can pretend to send signals from the kernel.
         * Nor can they impersonate a kill()/tgkill(), which adds source info.
         */
        bool restricted_code = info->si_code >= 0 || info->si_code == SI_TKILL;

        /* Restricted codes are only accepted when the target is the caller. */
        if (restricted_code && task_pid_vnr(current) != pid)
                return -EPERM;

        /* POSIX.1b doesn't mention process groups.  */
        return kill_proc_info(sig, info, pid);
}

/**
 *  sys_rt_sigqueueinfo - queue a signal together with caller-supplied siginfo
 *  @pid: the PID of the thread
 *  @sig: signal to be sent
 *  @uinfo: signal info to be sent
 */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_sigqueueinfo(pid, sig, &info);
}

#ifdef CONFIG_COMPAT
/* Compat entry point: reads a compat siginfo, then shares the native path. */
COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user32(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_sigqueueinfo(pid, sig, &info);
}
#endif

/*
 * Queue @sig with caller-supplied @info to the thread @pid of thread group
 * @tgid.  Returns -EINVAL for non-positive ids, -EPERM for a si_code the
 * caller may not use, otherwise the result of do_send_specific().
 */
static int do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, kernel_siginfo_t *info)
{
        int code;

        /* This is only valid for single tasks */
        if (tgid <= 0 || pid <= 0)
                return -EINVAL;

        /*
         * Not even root can pretend to send signals from the kernel.
         * Nor can they impersonate a kill()/tgkill(), which adds source info.
         */
        code = info->si_code;
        if ((code >= 0 || code == SI_TKILL) && task_pid_vnr(current) != pid)
                return -EPERM;

        return do_send_specific(tgid, pid, sig, info);
}

/*
 * Copy the siginfo in from userspace (with si_signo forced to @sig) and
 * queue it to thread @pid of thread group @tgid.
 */
SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
                siginfo_t __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}

#ifdef CONFIG_COMPAT
/* Compat entry point: reads a compat siginfo, then shares the native path. */
COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
                        compat_pid_t, tgid,
                        compat_pid_t, pid,
                        int, sig,
                        struct compat_siginfo __user *, uinfo)
{
        kernel_siginfo_t info;
        int ret;

        ret = __copy_siginfo_from_user32(sig, &info, uinfo);
        if (unlikely(ret))
                return ret;

        return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
}
#endif

/*
 * For kthreads only, must not be used if cloned with CLONE_SIGHAND
 *
 * Sets the handler for @sig on the current task under siglock.  When the
 * new action is SIG_IGN, already queued instances of @sig are flushed from
 * both the shared and the per-task pending queues.
 * @sig is used unchecked as an array index (sig - 1).
 */
void kernel_sigaction(int sig, __sighandler_t action)
{
        spin_lock_irq(&current->sighand->siglock);
        current->sighand->action[sig - 1].sa.sa_handler = action;
        if (action == SIG_IGN) {
                sigset_t mask;

                /* Build a mask holding only @sig. */
                sigemptyset(&mask);
                sigaddset(&mask, sig);

                flush_sigqueue_mask(current, &mask, &current->signal->shared_pending);
                flush_sigqueue_mask(current, &mask, &current->pending);
                recalc_sigpending();
        }
        spin_unlock_irq(&current->sighand->siglock);
}
EXPORT_SYMBOL(kernel_sigaction);

/*
 * Weak default that does nothing; do_sigaction() calls it with the new and
 * old actions.  Being __weak, another definition can replace it at link time.
 */
void __weak sigaction_compat_abi(struct k_sigaction *act,
                struct k_sigaction *oact)
{
}

/*
 * Query and/or replace the action for @sig on the current task.
 * @sig:  signal number; must be valid and at least 1
 * @act:  new action to install, or NULL to leave the action unchanged.
 *        Note that act->sa.sa_flags and act->sa.sa_mask are modified in
 *        place (unknown flags cleared, SIGKILL/SIGSTOP removed from the mask).
 * @oact: if non-NULL, receives the previous action with unknown flags cleared
 *
 * Returns 0 on success, or -EINVAL when @sig is invalid, when @act targets a
 * kernel-only signal, or when the current action is marked SA_IMMUTABLE.
 */
int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
{
        struct task_struct *p = current, *t;
        struct k_sigaction *k;
        sigset_t mask;

        if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig)))
                return -EINVAL;

        k = &p->sighand->action[sig-1];

        spin_lock_irq(&p->sighand->siglock);
        if (k->sa.sa_flags & SA_IMMUTABLE) {
                spin_unlock_irq(&p->sighand->siglock);
                return -EINVAL;
        }
        /* Snapshot the old action before it may be overwritten below. */
        if (oact)
                *oact = *k;

        /*
         * Make sure that we never accidentally claim to support SA_UNSUPPORTED,
         * e.g. by having an architecture use the bit in their uapi.
         */
        BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED);

        /*
         * Clear unknown flag bits in order to allow userspace to detect missing
         * support for flag bits and to allow the kernel to use non-uapi bits
         * internally.
         */
        if (act)
                act->sa.sa_flags &= UAPI_SA_FLAGS;
        if (oact)
                oact->sa.sa_flags &= UAPI_SA_FLAGS;

        sigaction_compat_abi(act, oact);

        if (act) {
                /* Remember the old disposition; *k is overwritten just below. */
                bool was_ignored = k->sa.sa_handler == SIG_IGN;

                sigdelsetmask(&act->sa.sa_mask,
                              sigmask(SIGKILL) | sigmask(SIGSTOP));
                *k = *act;
                /*
                 * POSIX 3.3.1.3:
                 *  "Setting a signal action to SIG_IGN for a signal that is
                 *   pending shall cause the pending signal to be discarded,
                 *   whether or not it is blocked."
                 *
                 *  "Setting a signal action to SIG_DFL for a signal that is
                 *   pending and whose default action is to ignore the signal
                 *   (for example, SIGCHLD), shall cause the pending signal to
                 *   be discarded, whether or not it is blocked"
                 */
                if (sig_handler_ignored(sig_handler(p, sig), sig)) {
                        sigemptyset(&mask);
                        sigaddset(&mask, sig);
                        /* Flush the shared queue, then every thread's own queue. */
                        flush_sigqueue_mask(p, &mask, &p->signal->shared_pending);
                        for_each_thread(p, t)
                                flush_sigqueue_mask(p, &mask, &t->pending);
                } else if (was_ignored) {
                        posixtimer_sig_unignore(p, sig);
                }
        }

        spin_unlock_irq(&p->sighand->siglock);
        return 0;
}

/*
 * Locking helpers used by do_sigaltstack().  With CONFIG_DYNAMIC_SIGFRAME
 * they take and release the current task's siglock with interrupts disabled;
 * without it they are empty.  The __acquires/__releases annotations name the
 * lock that is taken and dropped.
 */
#ifdef CONFIG_DYNAMIC_SIGFRAME
static inline void sigaltstack_lock(void)
        __acquires(&current->sighand->siglock)
{
        spin_lock_irq(&current->sighand->siglock);
}

static inline void sigaltstack_unlock(void)
        __releases(&current->sighand->siglock)
{
        spin_unlock_irq(&current->sighand->siglock);
}
#else
static inline void sigaltstack_lock(void) { }
static inline void sigaltstack_unlock(void) { }
#endif

/*
 * Query and/or set the alternate signal stack of the current task.
 * @ss:          new stack description, or NULL to leave the setting unchanged
 * @oss:         if non-NULL, filled with the settings in effect on entry
 * @sp:          user stack pointer, used to tell whether the task is
 *               currently running on the alternate stack
 * @min_ss_size: smallest stack size accepted for an enabled stack
 *
 * Returns 0 on success, -EPERM when called while on the alternate stack,
 * -EINVAL for an unknown mode in ss_flags, and -ENOMEM when the requested
 * size is below @min_ss_size or rejected by sigaltstack_size_valid().
 * @oss is filled in before @ss is validated, so it is written even when an
 * error is returned.
 */
static int
do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp,
                size_t min_ss_size)
{
        struct task_struct *t = current;
        int ret = 0;

        if (oss) {
                memset(oss, 0, sizeof(stack_t));
                oss->ss_sp = (void __user *) t->sas_ss_sp;
                oss->ss_size = t->sas_ss_size;
                oss->ss_flags = sas_ss_flags(sp) |
                        (current->sas_ss_flags & SS_FLAG_BITS);
        }

        if (ss) {
                void __user *ss_sp = ss->ss_sp;
                size_t ss_size = ss->ss_size;
                unsigned ss_flags = ss->ss_flags;
                int ss_mode;

                if (unlikely(on_sig_stack(sp)))
                        return -EPERM;

                /* The mode is what remains once the SS_FLAG_BITS are masked off. */
                ss_mode = ss_flags & ~SS_FLAG_BITS;
                if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
                                ss_mode != 0))
                        return -EINVAL;

                /*
                 * Return before taking any locks if no actual
                 * sigaltstack changes were requested.
                 */
                if (t->sas_ss_sp == (unsigned long)ss_sp &&
                    t->sas_ss_size == ss_size &&
                    t->sas_ss_flags == ss_flags)
                        return 0;

                sigaltstack_lock();
                if (ss_mode == SS_DISABLE) {
                        /* Disabling stores an empty stack; the size is not checked. */
                        ss_size = 0;
                        ss_sp = NULL;
                } else {
                        if (unlikely(ss_size < min_ss_size))
                                ret = -ENOMEM;
                        if (!sigaltstack_size_valid(ss_size))
                                ret = -ENOMEM;
                }
                /* Commit the new settings only when both size checks passed. */
                if (!ret) {
                        t->sas_ss_sp = (unsigned long) ss_sp;
                        t->sas_ss_size = ss_size;
                        t->sas_ss_flags = ss_flags;
                }
                sigaltstack_unlock();
        }
        return ret;
}

/*
 * sigaltstack syscall: optionally read a new stack description from @uss,
 * apply it through do_sigaltstack(), and optionally write the previous
 * settings to @uoss.  Returns 0 or a negative errno.
 */
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
        stack_t requested, previous;
        stack_t *ss = NULL, *oss = NULL;
        int err;

        if (uss) {
                if (copy_from_user(&requested, uss, sizeof(stack_t)))
                        return -EFAULT;
                ss = &requested;
        }
        if (uoss)
                oss = &previous;

        err = do_sigaltstack(ss, oss, current_user_stack_pointer(),
                             MINSIGSTKSZ);
        /* The old settings are only reported on success and when asked for. */
        if (err || !uoss)
                return err;

        if (copy_to_user(uoss, &previous, sizeof(stack_t)))
                return -EFAULT;

        return 0;
}

/*
 * Re-apply an alternate stack description read from @uss.  Only a failure
 * to read @uss is reported (-EFAULT); the result of do_sigaltstack() is
 * deliberately discarded and 0 is returned.
 */
int restore_altstack(const stack_t __user *uss)
{
        stack_t restored;

        if (copy_from_user(&restored, uss, sizeof(stack_t)))
                return -EFAULT;

        /* squash all but EFAULT for now */
        (void)do_sigaltstack(&restored, NULL, current_user_stack_pointer(),
                             MINSIGSTKSZ);

        return 0;
}

/*
 * Write the current task's alternate stack pointer, flags and size to @uss.
 * Returns the OR of the three __put_user() results, so it is non-zero when
 * any store failed.  @sp is not used.
 */
int __save_altstack(stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err  = __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        return err;
}

#ifdef CONFIG_COMPAT
/*
 * Compat flavour of the sigaltstack operation: converts the compat stack
 * description to a native stack_t, calls do_sigaltstack() with the compat
 * stack pointer and minimum size, and converts the old settings back.
 * Returns the do_sigaltstack() result, or -EFAULT on a user copy failure.
 */
static int do_compat_sigaltstack(const compat_stack_t __user *uss_ptr,
                                 compat_stack_t __user *uoss_ptr)
{
        stack_t new_ss, old_ss;
        const stack_t *ss = NULL;
        compat_stack_t out;
        int ret;

        if (uss_ptr) {
                compat_stack_t in;

                if (copy_from_user(&in, uss_ptr, sizeof(compat_stack_t)))
                        return -EFAULT;
                new_ss.ss_sp = compat_ptr(in.ss_sp);
                new_ss.ss_flags = in.ss_flags;
                new_ss.ss_size = in.ss_size;
                ss = &new_ss;
        }

        /* The old settings are always collected, even if not reported. */
        ret = do_sigaltstack(ss, &old_ss,
                             compat_user_stack_pointer(),
                             COMPAT_MINSIGSTKSZ);
        if (ret < 0 || !uoss_ptr)
                return ret;

        /* Zero first so bytes not assigned below are not copied out stale. */
        memset(&out, 0, sizeof(out));
        out.ss_sp = ptr_to_compat(old_ss.ss_sp);
        out.ss_flags = old_ss.ss_flags;
        out.ss_size = old_ss.ss_size;
        if (copy_to_user(uoss_ptr, &out, sizeof(compat_stack_t)))
                return -EFAULT;

        return ret;
}

/* Compat sigaltstack syscall: all work is done by do_compat_sigaltstack(). */
COMPAT_SYSCALL_DEFINE2(sigaltstack,
                        const compat_stack_t __user *, uss_ptr,
                        compat_stack_t __user *, uoss_ptr)
{
        int ret = do_compat_sigaltstack(uss_ptr, uoss_ptr);

        return ret;
}

/*
 * Re-apply a compat alternate stack description read from @uss.  Only
 * -EFAULT is passed on to the caller; every other result becomes 0.
 */
int compat_restore_altstack(const compat_stack_t __user *uss)
{
        /* squash all but -EFAULT for now */
        if (do_compat_sigaltstack(uss, NULL) == -EFAULT)
                return -EFAULT;

        return 0;
}

/*
 * Write the current task's alternate stack pointer (converted with
 * ptr_to_compat()), flags and size to the compat structure @uss.  Returns
 * the OR of the three __put_user() results.  @sp is not used.
 */
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
        struct task_struct *t = current;
        int err;

        err  = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
                          &uss->ss_sp);
        err |= __put_user(t->sas_ss_flags, &uss->ss_flags);
        err |= __put_user(t->sas_ss_size, &uss->ss_size);

        return err;
}
#endif

#ifdef __ARCH_WANT_SYS_SIGPENDING

/**
 *  sys_sigpending - examine pending signals
 *  @uset: user buffer that receives the mask of pending signals
 *
 *  Only the leading sizeof(old_sigset_t) bytes of the pending set are
 *  copied out.  Returns 0 on success, -EFAULT if the copy to @uset fails.
 */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, uset)
{
        sigset_t pending;

        if (sizeof(old_sigset_t) > sizeof(*uset))
                return -EINVAL;

        do_sigpending(&pending);

        return copy_to_user(uset, &pending, sizeof(old_sigset_t)) ?
                -EFAULT : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sigpending(): fetch the pending set with do_sigpending() and store
 * its first word at @set32.  Returns the put_user() result.
 */
COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
{
        sigset_t pending;

        do_sigpending(&pending);

        return put_user(pending.sig[0], set32);
}
#endif

#endif

#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/**
 *  sys_sigprocmask - examine and change blocked signals
 *  @how: whether to add, remove, or set signals
 *  @nset: signals to add or remove (if non-null)
 *  @oset: previous value of signal mask if non-null
 *
 * Some platforms have their own version with special arguments;
 * others support only sys_rt_sigprocmask.
 *
 * Only the first word of the blocked set is read or replaced here.
 * Returns 0 on success, -EFAULT if a user copy fails, or -EINVAL when
 * @how is not SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK.
 */

SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
                old_sigset_t __user *, oset)
{
        /* Sample the mask before any change so @oset reports the old value. */
        old_sigset_t prev_mask = current->blocked.sig[0];

        if (nset) {
                old_sigset_t req_mask;
                sigset_t updated;

                if (copy_from_user(&req_mask, nset, sizeof(*nset)))
                        return -EFAULT;

                updated = current->blocked;

                if (how == SIG_BLOCK)
                        sigaddsetmask(&updated, req_mask);
                else if (how == SIG_UNBLOCK)
                        sigdelsetmask(&updated, req_mask);
                else if (how == SIG_SETMASK)
                        updated.sig[0] = req_mask;
                else
                        return -EINVAL;

                set_current_blocked(&updated);
        }

        if (oset && copy_to_user(oset, &prev_mask, sizeof(*oset)))
                return -EFAULT;

        return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */

#ifndef CONFIG_ODD_RT_SIGACTION
/**
 *  sys_rt_sigaction - alter an action taken by a process
 *  @sig: signal to be sent
 *  @act: new sigaction (may be NULL to leave the action unchanged)
 *  @oact: used to save the previous sigaction (may be NULL)
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns 0 on success, -EINVAL if @sigsetsize does not match
 *  sizeof(sigset_t), -EFAULT if a user copy fails, or the error
 *  returned by do_sigaction().
 */
SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct sigaction __user *, act,
                struct sigaction __user *, oact,
                size_t, sigsetsize)
{
        struct k_sigaction new_sa, old_sa;
        struct k_sigaction *newp = NULL;
        struct k_sigaction *oldp = NULL;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (act) {
                if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa)))
                        return -EFAULT;
                newp = &new_sa;
        }
        if (oact)
                oldp = &old_sa;

        ret = do_sigaction(sig, newp, oldp);
        if (ret)
                return ret;

        if (oldp && copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa)))
                return -EFAULT;

        return 0;
}
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigaction(): read the new action field by field from the compat
 * layout, call do_sigaction(), and write the previous action back field by
 * field.
 *
 * Returns -EINVAL if @sigsetsize is not sizeof(compat_sigset_t), -EFAULT if
 * reading @act fails, the do_sigaction() error if it fails, and otherwise
 * the OR of the results of the stores into @oact (0 when @oact is NULL).
 */
COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
                const struct compat_sigaction __user *, act,
                struct compat_sigaction __user *, oact,
                compat_size_t, sigsetsize)
{
        struct k_sigaction new_ka, old_ka;
        int ret;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(compat_sigset_t))
                return -EINVAL;

        if (act) {
                compat_uptr_t handler;
#ifdef __ARCH_HAS_SA_RESTORER
                compat_uptr_t restorer;
#endif

                /* Collect every failure first, then report a single -EFAULT. */
                ret = get_user(handler, &act->sa_handler);
                new_ka.sa.sa_handler = compat_ptr(handler);
#ifdef __ARCH_HAS_SA_RESTORER
                ret |= get_user(restorer, &act->sa_restorer);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
#endif
                ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
                ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
                if (ret)
                        return -EFAULT;
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        ret = put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler);
        ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
                                 sizeof(oact->sa_mask));
        ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
#ifdef __ARCH_HAS_SA_RESTORER
        ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                        &oact->sa_restorer);
#endif
        return ret;
}
#endif
#endif /* !CONFIG_ODD_RT_SIGACTION */

#ifdef CONFIG_OLD_SIGACTION
/*
 * Old-style sigaction(): the user structure carries a single-word signal
 * mask (old_sigset_t), which is widened with siginitset() on the way in and
 * reported as sig[0] of the previous mask on the way out.
 *
 * Returns 0 on success, -EFAULT if @act or @oact cannot be accessed, or the
 * error returned by do_sigaction().
 */
SYSCALL_DEFINE3(sigaction, int, sig,
                const struct old_sigaction __user *, act,
                struct old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        int ret;

        if (act) {
                old_sigset_t mask;

                if (!access_ok(act, sizeof(*act)))
                        return -EFAULT;
                if (__get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
                    __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;
#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                siginitset(&new_ka.sa.sa_mask, mask);
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)))
                return -EFAULT;
        if (__put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
            __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return ret;
}
#endif
#ifdef CONFIG_COMPAT_OLD_SIGACTION
/*
 * Compat variant of the old-style sigaction(): handler and restorer arrive
 * as compat_uptr_t values and are converted with compat_ptr() on the way in
 * and ptr_to_compat() on the way out; the mask is a single word.
 *
 * Returns 0 on success, -EFAULT if @act or @oact cannot be accessed, or the
 * error returned by do_sigaction().
 */
COMPAT_SYSCALL_DEFINE3(sigaction, int, sig,
                const struct compat_old_sigaction __user *, act,
                struct compat_old_sigaction __user *, oact)
{
        struct k_sigaction new_ka, old_ka;
        int ret;

        if (act) {
                compat_old_sigset_t mask;
                compat_uptr_t handler, restorer;

                if (!access_ok(act, sizeof(*act)))
                        return -EFAULT;
                if (__get_user(handler, &act->sa_handler) ||
                    __get_user(restorer, &act->sa_restorer) ||
                    __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
                    __get_user(mask, &act->sa_mask))
                        return -EFAULT;

#ifdef __ARCH_HAS_KA_RESTORER
                new_ka.ka_restorer = NULL;
#endif
                new_ka.sa.sa_handler = compat_ptr(handler);
                new_ka.sa.sa_restorer = compat_ptr(restorer);
                siginitset(&new_ka.sa.sa_mask, mask);
        }

        ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
        if (ret || !oact)
                return ret;

        if (!access_ok(oact, sizeof(*oact)))
                return -EFAULT;
        if (__put_user(ptr_to_compat(old_ka.sa.sa_handler),
                       &oact->sa_handler) ||
            __put_user(ptr_to_compat(old_ka.sa.sa_restorer),
                       &oact->sa_restorer) ||
            __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
            __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
                return -EFAULT;

        return ret;
}
#endif

#ifdef CONFIG_SGETMASK_SYSCALL

/*
 * For backwards compatibility.  Functionality superseded by sigprocmask.
 *
 * sgetmask returns the first word (sig[0]) of the calling task's
 * blocked-signal set.
 */
SYSCALL_DEFINE0(sgetmask)
{
        /* SMP safe */
        return current->blocked.sig[0];
}

/*
 * ssetmask: replace the calling task's blocked-signal set with one built
 * from @newmask by siginitset(), and return the previous first word of the
 * blocked set (truncated to int).
 */
SYSCALL_DEFINE1(ssetmask, int, newmask)
{
        sigset_t replacement;
        int previous = current->blocked.sig[0];

        siginitset(&replacement, newmask);
        set_current_blocked(&replacement);

        return previous;
}
#endif /* CONFIG_SGETMASK_SYSCALL */

#ifdef __ARCH_WANT_SYS_SIGNAL
/*
 * For backwards compatibility.  Functionality superseded by sigaction.
 *
 * Installs @handler for @sig with SA_ONESHOT | SA_NOMASK and an empty
 * mask.  Returns the do_sigaction() error on failure, otherwise the
 * previously installed handler cast to unsigned long.
 */
SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
{
        struct k_sigaction new_sa, old_sa;
        int ret;

        sigemptyset(&new_sa.sa.sa_mask);
        new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
        new_sa.sa.sa_handler = handler;

        ret = do_sigaction(sig, &new_sa, &old_sa);
        if (ret)
                return ret;

        return (unsigned long)old_sa.sa.sa_handler;
}
#endif /* __ARCH_WANT_SYS_SIGNAL */

#ifdef __ARCH_WANT_SYS_PAUSE

/*
 * pause(): sleep in TASK_INTERRUPTIBLE until a signal is pending for the
 * current task, then return -ERESTARTNOHAND.
 */
SYSCALL_DEFINE0(pause)
{
        for (;;) {
                if (signal_pending(current))
                        break;
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }

        return -ERESTARTNOHAND;
}

#endif

/*
 * Common helper for the sigsuspend family of system calls in this file.
 *
 * Saves the current blocked set in current->saved_sigmask, installs @set as
 * the blocked set, and sleeps in TASK_INTERRUPTIBLE until a signal is
 * pending.  It then calls set_restore_sigmask() and always returns
 * -ERESTARTNOHAND.
 */
static int sigsuspend(sigset_t *set)
{
        /* Remember the caller's mask before replacing it with @set. */
        current->saved_sigmask = current->blocked;
        set_current_blocked(set);

        while (!signal_pending(current)) {
                __set_current_state(TASK_INTERRUPTIBLE);
                schedule();
        }
        /* Presumably flags saved_sigmask for restoration later - confirm. */
        set_restore_sigmask();
        return -ERESTARTNOHAND;
}

/**
 *  sys_rt_sigsuspend - replace the signal mask for a value with the
 *        @unewset value until a signal is received
 *  @unewset: new signal mask value
 *  @sigsetsize: size of sigset_t type
 *
 *  Returns -EINVAL if @sigsetsize is not sizeof(sigset_t), -EFAULT if
 *  @unewset cannot be read, otherwise the result of sigsuspend().
 */
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
        sigset_t mask;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;
        if (copy_from_user(&mask, unewset, sizeof(mask)))
                return -EFAULT;

        return sigsuspend(&mask);
}
 
#ifdef CONFIG_COMPAT
/*
 * Compat rt_sigsuspend(): read the new mask from @unewset with
 * get_compat_sigset() and hand it to sigsuspend().
 *
 * Returns -EINVAL on a @sigsetsize mismatch, -EFAULT if @unewset cannot be
 * read, otherwise the result of sigsuspend().
 *
 * NOTE(review): @sigsetsize is compared against sizeof(sigset_t) here, while
 * the compat rt_sigaction() in this file compares against
 * sizeof(compat_sigset_t) - confirm the two sizes always agree.
 */
COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
{
        sigset_t newset;

        /* XXX: Don't preclude handling different sized sigset_t's.  */
        if (sigsetsize != sizeof(sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&newset, unewset))
                return -EFAULT;
        return sigsuspend(&newset);
}
#endif

#ifdef CONFIG_OLD_SIGSUSPEND
/*
 * Old single-argument sigsuspend(): widen the one-word @mask into a full
 * sigset_t and suspend with it.
 */
SYSCALL_DEFINE1(sigsuspend, old_sigset_t, mask)
{
        sigset_t wide_mask;

        siginitset(&wide_mask, mask);

        return sigsuspend(&wide_mask);
}
#endif
#ifdef CONFIG_OLD_SIGSUSPEND3
/*
 * Old three-argument sigsuspend(): the first two arguments are ignored;
 * the one-word @mask is widened into a full sigset_t and used to suspend.
 */
SYSCALL_DEFINE3(sigsuspend, int, unused1, int, unused2, old_sigset_t, mask)
{
        sigset_t wide_mask;

        siginitset(&wide_mask, mask);

        return sigsuspend(&wide_mask);
}
#endif

/*
 * Default definition of arch_vma_name(), marked __weak: it ignores @vma and
 * always returns NULL.
 */
__weak const char *arch_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/*
 * Compile-time layout checks for the siginfo structures.
 *
 * The body consists solely of BUILD_BUG_ON() assertions: the size of
 * struct siginfo, the agreement of field offsets between siginfo_t and
 * kernel_siginfo_t, and the overlay of si_addr on si_pid/si_uid.
 */
static inline void siginfo_buildtime_checks(void)
{
        BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE);

        /* Verify the offsets in the two siginfos match */
#define CHECK_OFFSET(field) \
        BUILD_BUG_ON(offsetof(siginfo_t, field) != offsetof(kernel_siginfo_t, field))

        /*
         * The checks below are grouped by kind of signal information, so a
         * field shared by several groups (e.g. si_pid) is checked repeatedly.
         */

        /* kill */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);

        /* timer */
        CHECK_OFFSET(si_tid);
        CHECK_OFFSET(si_overrun);
        CHECK_OFFSET(si_value);

        /* rt */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_value);

        /* sigchld */
        CHECK_OFFSET(si_pid);
        CHECK_OFFSET(si_uid);
        CHECK_OFFSET(si_status);
        CHECK_OFFSET(si_utime);
        CHECK_OFFSET(si_stime);

        /* sigfault */
        CHECK_OFFSET(si_addr);
        CHECK_OFFSET(si_trapno);
        CHECK_OFFSET(si_addr_lsb);
        CHECK_OFFSET(si_lower);
        CHECK_OFFSET(si_upper);
        CHECK_OFFSET(si_pkey);
        CHECK_OFFSET(si_perf_data);
        CHECK_OFFSET(si_perf_type);
        CHECK_OFFSET(si_perf_flags);

        /* sigpoll */
        CHECK_OFFSET(si_band);
        CHECK_OFFSET(si_fd);

        /* sigsys */
        CHECK_OFFSET(si_call_addr);
        CHECK_OFFSET(si_syscall);
        CHECK_OFFSET(si_arch);
#undef CHECK_OFFSET

        /* usb asyncio */
        /*
         * si_addr must start at the same offset as si_pid.  Where a pointer
         * is the size of an int, si_pid alone must be pointer-sized;
         * otherwise si_pid immediately followed by si_uid must together be
         * pointer-sized.
         */
        BUILD_BUG_ON(offsetof(struct siginfo, si_pid) !=
                     offsetof(struct siginfo, si_addr));
        if (sizeof(int) == sizeof(void __user *)) {
                BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) !=
                             sizeof(void __user *));
        } else {
                BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) +
                              sizeof_field(struct siginfo, si_uid)) !=
                             sizeof(void __user *));
                BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) !=
                             offsetof(struct siginfo, si_uid));
        }
#ifdef CONFIG_COMPAT
        /* Same overlay for the compat layout, with compat_uptr_t as pointer. */
        BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) !=
                     offsetof(struct compat_siginfo, si_addr));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof(compat_uptr_t));
        BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) !=
                     sizeof_field(struct siginfo, si_pid));
#endif
}

#if defined(CONFIG_SYSCTL)
/*
 * sysctl entries registered under "debug" by init_signal_sysctls().
 *
 * With CONFIG_SYSCTL_EXCEPTION_TRACE the table holds one entry,
 * "exception-trace", which exposes the int show_unhandled_signals with
 * mode 0644 through proc_dointvec; without it the table has no entries.
 */
static const struct ctl_table signal_debug_table[] = {
#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
        {
                .procname        = "exception-trace",
                .data                = &show_unhandled_signals,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
#endif
};

/*
 * Register signal_debug_table under the "debug" sysctl directory.
 * Always returns 0.
 */
static int __init init_signal_sysctls(void)
{
        register_sysctl_init("debug", signal_debug_table);
        return 0;
}
early_initcall(init_signal_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Boot-time initialisation for the signal code: instantiate the siginfo
 * layout checks and create the slab cache used for struct sigqueue.
 */
void __init signals_init(void)
{
        /* Compile-time assertions only; the call exists to instantiate them. */
        siginfo_buildtime_checks();

        sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC | SLAB_ACCOUNT);
}

#ifdef CONFIG_KGDB_KDB
#include <linux/kdb.h>
/*
 * kdb_send_sig - Allows kdb to send signals without exposing
 * signal internals.  This function checks if the required locks are
 * available before calling the main signal code, to avoid kdb
 * deadlocks.
 *
 * @t:   task that should receive the signal
 * @sig: signal number to send
 *
 * The outcome is reported only through kdb_printf(); nothing is returned.
 * A request aimed at a task that is not running is refused the first time
 * and carried out only if the same task is targeted again on the next call.
 */
void kdb_send_sig(struct task_struct *t, int sig)
{
        /* Target of the previous call; persists across invocations. */
        static struct task_struct *kdb_prev_t;
        int new_t, ret;
        /* Never wait for siglock: give up immediately if it is already held. */
        if (!spin_trylock(&t->sighand->siglock)) {
                kdb_printf("Can't do kill command now.\n"
                           "The sigmask lock is held somewhere else in "
                           "kernel, try again later\n");
                return;
        }
        /* new_t is non-zero when this task differs from the previous target. */
        new_t = kdb_prev_t != t;
        kdb_prev_t = t;
        if (!task_is_running(t) && new_t) {
                spin_unlock(&t->sighand->siglock);
                kdb_printf("Process is not RUNNING, sending a signal from "
                           "kdb risks deadlock\n"
                           "on the run queue locks. "
                           "The signal has _not_ been sent.\n"
                           "Reissue the kill command if you want to risk "
                           "the deadlock.\n");
                return;
        }
        /* siglock is held across the send, as the _locked suffix suggests. */
        ret = send_signal_locked(sig, SEND_SIG_PRIV, t, PIDTYPE_PID);
        spin_unlock(&t->sighand->siglock);
        if (ret)
                kdb_printf("Fail to deliver Signal %d to process %d.\n",
                           sig, t->pid);
        else
                kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
}
#endif        /* CONFIG_KGDB_KDB */




































































    3 





    3 









    3 


    3 












































































































































































































































































































































































    3 






















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kdebug.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/notifier.h>

/*
 *        Notifier chain core routines.  The exported routines below
 *        are layered on top of these, with appropriate locking added.
 */

/*
 * Insert @n into the chain at @nl, keeping the chain ordered by descending
 * priority; among equal priorities the new entry goes after existing ones.
 *
 * Returns 0 on success, -EEXIST (with a WARN) if @n is already on the
 * chain, or -EBUSY if @unique_priority is set and an entry with the same
 * priority is found first.
 */
static int notifier_chain_register(struct notifier_block **nl,
                                   struct notifier_block *n,
                                   bool unique_priority)
{
        struct notifier_block *cur;

        for (; (cur = *nl) != NULL; nl = &cur->next) {
                if (unlikely(cur == n)) {
                        WARN(1, "notifier callback %ps already registered",
                             n->notifier_call);
                        return -EEXIST;
                }
                /* First entry with a lower priority marks the insert slot. */
                if (n->priority > cur->priority)
                        break;
                if (unique_priority && n->priority == cur->priority)
                        return -EBUSY;
        }

        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        trace_notifier_register((void *)n->notifier_call);

        return 0;
}

/*
 * Unlink @n from the chain at @nl.
 *
 * Returns 0 if @n was found and removed, -ENOENT if it is not on the chain.
 */
static int notifier_chain_unregister(struct notifier_block **nl,
                struct notifier_block *n)
{
        for (; *nl != NULL; nl = &(*nl)->next) {
                if (*nl != n)
                        continue;

                rcu_assign_pointer(*nl, n->next);
                trace_notifier_unregister((void *)n->notifier_call);
                return 0;
        }

        return -ENOENT;
}

/**
 * notifier_call_chain - Informs the registered notifiers about an event.
 *        @nl:                Pointer to head of the notifier chain (used by the
 *                        atomic and blocking wrappers in this file)
 *        @val:                Value passed unmodified to notifier function
 *        @v:                Pointer passed unmodified to notifier function
 *        @nr_to_call:        Number of notifier functions to be called. Don't care
 *                        value of this parameter is -1. A value of 0 calls
 *                        nothing.
 *        @nr_calls:        Records the number of notifications sent. Don't care
 *                        value of this field is NULL. The counter is
 *                        incremented, not reset, and includes a call whose
 *                        return value stopped the walk.
 *        Return:                notifier_call_chain returns the value returned by the
 *                        last notifier function called, or NOTIFY_DONE if no
 *                        function was called.
 */
static int notifier_call_chain(struct notifier_block **nl,
                               unsigned long val, void *v,
                               int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;

        nb = rcu_dereference_raw(*nl);

        while (nb && nr_to_call) {
                /* Fetch the successor before the callback runs. */
                next_nb = rcu_dereference_raw(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        /* Skipped entries consume neither nr_to_call nor nr_calls. */
                        nb = next_nb;
                        continue;
                }
#endif
                trace_notifier_run((void *)nb->notifier_call);
                ret = nb->notifier_call(nb, val, v);

                if (nr_calls)
                        (*nr_calls)++;

                /* A stop request ends the walk; later notifiers are not called. */
                if (ret & NOTIFY_STOP_MASK)
                        break;
                nb = next_nb;
                nr_to_call--;
        }
        return ret;
}
NOKPROBE_SYMBOL(notifier_call_chain);

/**
 * notifier_call_chain_robust - Inform the registered notifiers about an event
 *                              and rollback on error.
 * @nl:                Pointer to head of the notifier chain
 * @val_up:        Value passed unmodified to the notifier function
 * @val_down:        Value passed unmodified to the notifier function when recovering
 *              from an error on @val_up
 * @v:                Pointer passed unmodified to the notifier function
 *
 * When the @val_up pass is stopped by a notifier, every notifier that was
 * called before the stopping one is called again with @val_down; the
 * stopping notifier itself is not.
 *
 * NOTE:        It is important the @nl chain doesn't change between the two
 *                invocations of notifier_call_chain() such that we visit the
 *                exact same notifier callbacks; this rules out any RCU usage.
 *
 * Return:        the return value of the @val_up call.
 */
static int notifier_call_chain_robust(struct notifier_block **nl,
                                     unsigned long val_up, unsigned long val_down,
                                     void *v)
{
        int nr_called = 0;
        int ret;

        ret = notifier_call_chain(nl, val_up, v, -1, &nr_called);
        if (!(ret & NOTIFY_STOP_MASK))
                return ret;

        /* nr_called counts the notifier that stopped the walk; leave it out. */
        notifier_call_chain(nl, val_down, v, nr_called - 1, NULL);

        return ret;
}

/*
 *        Atomic notifier chain routines.  Registration and unregistration
 *        use a spinlock, and call_chain is synchronized by RCU (no locks).
 */

/**
 *        atomic_notifier_chain_register - Add notifier to an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to an atomic notifier chain.  The chain is
 *        modified with @nh->lock held and interrupts disabled.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_register(&nh->head, n, false);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);

/**
 *        atomic_notifier_chain_register_unique_prio - Add notifier to an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to an atomic notifier chain if there is no other
 *        notifier registered using the same priority.  The chain is
 *        modified with @nh->lock held and interrupts disabled.
 *
 *        Returns 0 on success, %-EEXIST or %-EBUSY on error.
 */
int atomic_notifier_chain_register_unique_prio(struct atomic_notifier_head *nh,
                                               struct notifier_block *n)
{
        unsigned long irqflags;
        int err;

        spin_lock_irqsave(&nh->lock, irqflags);
        err = notifier_chain_register(&nh->head, n, true);
        spin_unlock_irqrestore(&nh->lock, irqflags);

        return err;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_register_unique_prio);

/**
 *        atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Removes a notifier from an atomic notifier chain.
 *        synchronize_rcu() is called before returning, whether or not
 *        @n was found.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *n)
{
        unsigned long flags;
        int ret;

        spin_lock_irqsave(&nh->lock, flags);
        ret = notifier_chain_unregister(&nh->head, n);
        spin_unlock_irqrestore(&nh->lock, flags);
        /*
         * Done after the spinlock is dropped.  Presumably this lets RCU
         * readers in atomic_notifier_call_chain() finish with @n before the
         * caller reuses it - confirm.
         */
        synchronize_rcu();
        return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);

/**
 *        atomic_notifier_call_chain - Call functions in an atomic notifier chain
 *        @nh: Pointer to head of the atomic notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in an atomic context, so they must not block.
 *        This routine uses RCU to synchronize with changes to the chain:
 *        the whole walk happens inside an RCU read-side critical section.
 *
 *        If the return value of the notifier can be and'ed
 *        with %NOTIFY_STOP_MASK then atomic_notifier_call_chain()
 *        will return immediately, with the return value of
 *        the notifier function which halted execution.
 *        Otherwise the return value is the return value
 *        of the last notifier function called.
 */
int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                               unsigned long val, void *v)
{
        int status;

        rcu_read_lock();
        /* -1: no limit on the number of calls; NULL: no call count wanted. */
        status = notifier_call_chain(&nh->head, val, v, -1, NULL);
        rcu_read_unlock();

        return status;
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
NOKPROBE_SYMBOL(atomic_notifier_call_chain);

/**
 *        atomic_notifier_call_chain_is_empty - Check whether notifier chain is empty
 *        @nh: Pointer to head of the atomic notifier chain
 *
 *        Checks whether notifier chain is empty.  Only the head pointer is
 *        inspected, and no lock is taken.
 *
 *        Returns true if notifier chain is empty, false otherwise.
 */
bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh)
{
        return !rcu_access_pointer(nh->head);
}

/*
 *        Blocking notifier chain routines.  All access to the chain is
 *        synchronized by an rwsem.
 */

/*
 * Common implementation behind blocking_notifier_chain_register() and
 * blocking_notifier_chain_register_unique_prio().
 *
 * Adds @n to @nh with the rwsem held for writing, except while
 * system_state is SYSTEM_BOOTING, when the chain is modified without it.
 * Returns whatever notifier_chain_register() returns.
 */
static int __blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                                              struct notifier_block *n,
                                              bool unique_priority)
{
        int ret;

        /*
         * This code gets used during boot-up, when task switching is
         * not yet working and interrupts must remain disabled.  At
         * such times we must not call down_write().
         */
        if (unlikely(system_state == SYSTEM_BOOTING))
                return notifier_chain_register(&nh->head, n, unique_priority);

        down_write(&nh->rwsem);
        ret = notifier_chain_register(&nh->head, n, unique_priority);
        up_write(&nh->rwsem);
        return ret;
}

/**
 *        blocking_notifier_chain_register - Add notifier to a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a blocking notifier chain.
 *        Must be called in process context.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        /* false: entries sharing a priority are allowed. */
        return __blocking_notifier_chain_register(nh, n, false);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register);

/**
 *        blocking_notifier_chain_register_unique_prio - Add notifier to a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: New entry in notifier chain
 *
 *        Adds a notifier to a blocking notifier chain if there is no other
 *        notifier registered using the same priority.
 *
 *        Returns 0 on success, %-EEXIST or %-EBUSY on error.
 */
int blocking_notifier_chain_register_unique_prio(struct blocking_notifier_head *nh,
                                                 struct notifier_block *n)
{
        /* true: refuse to register next to an entry of equal priority. */
        return __blocking_notifier_chain_register(nh, n, true);
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_register_unique_prio);

/**
 *        blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Removes a notifier from a blocking notifier chain.
 *        Must be called from process context.
 *
 *        The chain is modified with @nh->rwsem held for writing, except
 *        while system_state is %SYSTEM_BOOTING, when the rwsem is not taken.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *n)
{
        int ret;

        /*
         * This code gets used during boot-up, when task switching is
         * not yet working and interrupts must remain disabled.  At
         * such times we must not call down_write().
         */
        if (unlikely(system_state == SYSTEM_BOOTING))
                return notifier_chain_unregister(&nh->head, n);

        down_write(&nh->rwsem);
        ret = notifier_chain_unregister(&nh->head, n);
        up_write(&nh->rwsem);
        return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister);

/**
 *        blocking_notifier_call_chain_robust - Call a blocking notifier chain with rollback
 *        @nh: Pointer to head of the blocking notifier chain
 *        @val_up: Value passed unmodified to the notifier functions
 *        @val_down: Value passed to the already-called notifier functions
 *                   when the @val_up pass is stopped
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Runs notifier_call_chain_robust() on the chain with the rwsem
 *        held for reading.
 *
 *        Returns %NOTIFY_DONE if the chain is empty, otherwise the return
 *        value of the @val_up pass.
 */
int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        int ret;

        /*
         * The head is tested without the lock.  A racy answer is harmless
         * because the list is examined again once the lock is held.
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
        up_read(&nh->rwsem);

        return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain_robust);

/**
 *        blocking_notifier_call_chain - Call functions in a blocking notifier chain
 *        @nh: Pointer to head of the blocking notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Calls each function in a notifier chain in turn.  The functions
 *        run in a process context, so they are allowed to block.
 *
 *        If the return value of the notifier can be and'ed
 *        with %NOTIFY_STOP_MASK then blocking_notifier_call_chain()
 *        will return immediately, with the return value of
 *        the notifier function which halted execution.
 *        Otherwise the return value is the return value
 *        of the last notifier function called, or %NOTIFY_DONE
 *        when the chain is empty.
 */
int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v)
{
        int ret;

        /*
         * The head is tested without the lock.  A racy answer is harmless
         * because the list is examined again once the lock is held.
         */
        if (!rcu_access_pointer(nh->head))
                return NOTIFY_DONE;

        down_read(&nh->rwsem);
        ret = notifier_call_chain(&nh->head, val, v, -1, NULL);
        up_read(&nh->rwsem);

        return ret;
}
EXPORT_SYMBOL_GPL(blocking_notifier_call_chain);

/*
 *        Raw notifier chain routines.  There is no protection;
 *        the caller must provide it.  Use at your own risk!
 */

/**
 *        raw_notifier_chain_register - Add notifier to a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @n: New entry in notifier chain
 *
 *        Links @n into the raw chain by calling notifier_chain_register()
 *        directly; no lock is taken here, so the caller has to provide
 *        all locking.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        err = notifier_chain_register(&nh->head, n, false);
        return err;
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);

/**
 *        raw_notifier_chain_unregister - Remove notifier from a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Unlinks @n from the raw chain by calling
 *        notifier_chain_unregister() directly; no lock is taken here,
 *        so the caller has to provide all locking.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        err = notifier_chain_unregister(&nh->head, n);
        return err;
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister);

/**
 *        raw_notifier_call_chain_robust - Run a raw notifier chain through the robust helper
 *        @nh: Pointer to head of the raw notifier chain
 *        @val_up: Value handed unmodified to notifier_call_chain_robust()
 *        @val_down: Value handed unmodified to notifier_call_chain_robust()
 *        @v: Pointer handed unmodified to notifier_call_chain_robust()
 *
 *        No lock is taken here; as with the other raw notifier
 *        routines, protection is left to the caller.
 *
 *        Returns the value returned by notifier_call_chain_robust().
 *
 *        NOTE(review): presumably @val_up is the event to deliver and
 *        @val_down the event used to undo it on failure -- confirm against
 *        notifier_call_chain_robust().
 */
int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v)
{
        int rc;

        rc = notifier_call_chain_robust(&nh->head, val_up, val_down, v);
        return rc;
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain_robust);

/**
 *        raw_notifier_call_chain - Call functions in a raw notifier chain
 *        @nh: Pointer to head of the raw notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Invokes every function on the chain, one after another.  The
 *        functions run in an undefined context.
 *        All locking must be provided by the caller.
 *
 *        A notifier whose return value has %NOTIFY_STOP_MASK set ends
 *        the walk at once, and raw_notifier_call_chain() returns that
 *        notifier's value.  Otherwise the result is the return value
 *        of the last notifier function called.
 */
int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v)
{
        int rc;

        rc = notifier_call_chain(&nh->head, val, v, -1, NULL);
        return rc;
}
EXPORT_SYMBOL_GPL(raw_notifier_call_chain);

/*
 *        SRCU notifier chain routines.    Registration and unregistration
 *        use a mutex, and call_chain is synchronized by SRCU (no locks).
 */

/**
 *        srcu_notifier_chain_register - Add notifier to an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @n: New entry in notifier chain
 *
 *        Links @n into the SRCU notifier chain, serialised by the
 *        head's mutex except while the system is still booting.
 *        Must be called in process context.
 *
 *        Returns 0 on success, %-EEXIST on error.
 */
int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up path: task switching is not yet working and
                 * interrupts must remain disabled, so mutex_lock()
                 * must not be called.  Register without the mutex.
                 */
                err = notifier_chain_register(&nh->head, n, false);
        } else {
                mutex_lock(&nh->mutex);
                err = notifier_chain_register(&nh->head, n, false);
                mutex_unlock(&nh->mutex);
        }

        return err;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_register);

/**
 *        srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @n: Entry to remove from notifier chain
 *
 *        Unlinks @n from the SRCU notifier chain.  Outside of boot-up
 *        the removal is serialised by the head's mutex and followed by
 *        synchronize_srcu() on the head's SRCU domain before returning.
 *        Must be called from process context.
 *
 *        Returns zero on success or %-ENOENT on failure.
 */
int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *n)
{
        int err;

        if (unlikely(system_state == SYSTEM_BOOTING)) {
                /*
                 * Boot-up path: task switching is not yet working and
                 * interrupts must remain disabled, so mutex_lock()
                 * must not be called.  Unregister without the mutex
                 * (and without the synchronize_srcu() below).
                 */
                err = notifier_chain_unregister(&nh->head, n);
        } else {
                mutex_lock(&nh->mutex);
                err = notifier_chain_unregister(&nh->head, n);
                mutex_unlock(&nh->mutex);
                synchronize_srcu(&nh->srcu);
        }

        return err;
}
EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister);

/**
 *        srcu_notifier_call_chain - Call functions in an SRCU notifier chain
 *        @nh: Pointer to head of the SRCU notifier chain
 *        @val: Value passed unmodified to notifier function
 *        @v: Pointer passed unmodified to notifier function
 *
 *        Invokes every function on the chain, one after another, inside
 *        an SRCU read-side section on the head's SRCU domain.  The
 *        functions run in a process context, so they are allowed to block.
 *
 *        A notifier whose return value has %NOTIFY_STOP_MASK set ends
 *        the walk at once, and srcu_notifier_call_chain() returns that
 *        notifier's value.  Otherwise the result is the return value
 *        of the last notifier function called.
 */
int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v)
{
        int srcu_idx = srcu_read_lock(&nh->srcu);
        int rc = notifier_call_chain(&nh->head, val, v, -1, NULL);

        /* Hand back the index srcu_read_lock() gave us. */
        srcu_read_unlock(&nh->srcu, srcu_idx);

        return rc;
}
EXPORT_SYMBOL_GPL(srcu_notifier_call_chain);

/**
 *        srcu_init_notifier_head - Initialize an SRCU notifier head
 *        @nh: Pointer to head of the srcu notifier chain
 *
 *        SRCU notifier heads differ from the other kinds of notifier
 *        head in that they need dynamic initialization.  Call this
 *        routine before any other SRCU notifier routine is used on
 *        this head.
 *
 *        Before an SRCU notifier head is deallocated it must be cleaned
 *        up with srcu_cleanup_notifier_head(); otherwise the head's
 *        per-cpu data (used by the SRCU mechanism) will leak.
 */
void srcu_init_notifier_head(struct srcu_notifier_head *nh)
{
        int err;

        mutex_init(&nh->mutex);

        /* void function: a failed SRCU setup cannot be reported, so BUG(). */
        err = init_srcu_struct(&nh->srcu);
        if (err < 0)
                BUG();

        /* Start out with an empty chain. */
        nh->head = NULL;
}
EXPORT_SYMBOL_GPL(srcu_init_notifier_head);

/* Atomic notifier chain walked by notify_die(). */
static ATOMIC_NOTIFIER_HEAD(die_chain);

/*
 * notify_die - deliver a die event to every notifier on die_chain
 * @val: event code, passed to the chain as its value argument
 * @str, @regs, @err, @trap, @sig: packed into a struct die_args whose
 *        address is passed to the chain as its data pointer
 *
 * Emits an RCU lockdep warning when rcu_is_watching() reports false.
 *
 * Returns the value returned by atomic_notifier_call_chain().
 */
int notrace notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
{
        /* Designated initializer: fields not named here stay zeroed. */
        struct die_args info = {
                .str        = str,
                .regs        = regs,
                .err        = err,
                .trapnr        = trap,
                .signr        = sig,
        };
        int rc;

        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                           "notify_die called but RCU thinks we're quiescent");

        rc = atomic_notifier_call_chain(&die_chain, val, &info);
        return rc;
}
NOKPROBE_SYMBOL(notify_die);

/*
 * register_die_notifier - add @nb to the die notifier chain
 *
 * Returns the value returned by atomic_notifier_chain_register().
 */
int register_die_notifier(struct notifier_block *nb)
{
        int err;

        err = atomic_notifier_chain_register(&die_chain, nb);
        return err;
}
EXPORT_SYMBOL_GPL(register_die_notifier);

/*
 * unregister_die_notifier - remove @nb from the die notifier chain
 *
 * Returns the value returned by atomic_notifier_chain_unregister().
 */
int unregister_die_notifier(struct notifier_block *nb)
{
        int err;

        err = atomic_notifier_chain_unregister(&die_chain, nb);
        return err;
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);




































































































































































    3 




























































































































































































































































































































































































































































































































    3 



    3 



    3 





    3 


    3 










    3 









































































    3 





    3 















































































































































































































































































































































    3 
    3 










































































































































    3 




    3 




    3 




    3 

    3 

    3 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 
    3 



    3 





































































    3 





    3 




    3 

    3 

















































































    3 

    3 

    3 
















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Multicast support for IPv6
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
 */

/* Changes:
 *
 *        yoshfuji        : fix format of router-alert option
 *        YOSHIFUJI Hideaki @USAGI:
 *                Fixed source address for MLD message based on
 *                <draft-ietf-magma-mld-source-05.txt>.
 *        YOSHIFUJI Hideaki @USAGI:
 *                - Ignore Queries for invalid addresses.
 *                - MLD for link-local addresses.
 *        David L Stevens <dlstevens@us.ibm.com>:
 *                - MLDv2 support
 */

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/jiffies.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/pkt_sched.h>
#include <net/mld.h>
#include <linux/workqueue.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/if_inet6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/inet_common.h>

#include <net/ip6_checksum.h>

/*
 * Ensure that we have struct in6_addr aligned on 32bit word.
 *
 * Compile-time check only: each entry tests that the offset of the named
 * member is a multiple of 4.  The array itself is never read, hence the
 * __unused__ attribute.
 * NOTE(review): relies on BUILD_BUG_ON_ZERO() breaking the build when its
 * argument is non-zero -- confirm against its definition.
 */
static int __mld2_query_bugs[] __attribute__((__unused__)) = {
        BUILD_BUG_ON_ZERO(offsetof(struct mld2_query, mld2q_srcs) % 4),
        BUILD_BUG_ON_ZERO(offsetof(struct mld2_report, mld2r_grec) % 4),
        BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4)
};

/*
 * File-scope state.
 * NOTE(review): mld_wq is presumably the workqueue the MLD work items run
 * on, and mld2_all_mcr the "all MLDv2-capable routers" address -- neither
 * use is visible at this point in the file; confirm.
 */
static struct workqueue_struct *mld_wq;
static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;

/* Forward declarations of static helpers used before their definitions. */
static void igmp6_join_group(struct ifmcaddr6 *ma);
static void igmp6_leave_group(struct ifmcaddr6 *ma);
static void mld_mca_work(struct work_struct *work);

static void mld_ifc_event(struct inet6_dev *idev);
static bool mld_in_v1_mode(const struct inet6_dev *idev);
static int sf_setstate(struct ifmcaddr6 *pmc);
static void sf_markstate(struct ifmcaddr6 *pmc);
static void ip6_mc_clear_src(struct ifmcaddr6 *pmc);
static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
                          int sfmode, int sfcount, const struct in6_addr *psfsrc,
                          int delta);
static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
                          int sfmode, int sfcount, const struct in6_addr *psfsrc,
                          int delta);
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
                            struct inet6_dev *idev);
static int __ipv6_dev_mc_inc(struct net_device *dev,
                             const struct in6_addr *addr, unsigned int mode);

/* RFC3810, 9.1. Robustness Variable */
#define MLD_QRV_DEFAULT                2
/* RFC3810, 9.2. Query Interval */
#define MLD_QI_DEFAULT                (125 * HZ)
/* RFC3810, 9.3. Query Response Interval */
#define MLD_QRI_DEFAULT                (10 * HZ)

/* RFC3810, 8.1 Query Version Distinctions */
/* Length of an MLDv1 Query, and minimum length of an MLDv2 Query, in octets. */
#define MLD_V1_QUERY_LEN        24
#define MLD_V2_QUERY_LEN_MIN        28

/* Initial value of sysctl_mld_max_msf below. */
#define IPV6_MLD_MAX_MSF        64

/*
 * Tunables, initialised from the defaults above.
 * NOTE(review): presumably exported through sysctl, going by the names --
 * the registration is not in this part of the file; confirm.
 */
int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT;

/*
 *        socket join on multicast group
 */
/*
 * Dereference helpers for RCU-managed pointers that are read with a lock
 * held instead of inside an RCU read-side section:
 *   mc_dereference()   - the lockdep condition is (idev)->mc_lock being held
 *   sock_dereference() - the lockdep condition is lockdep_sock_is_held(sk)
 */
#define mc_dereference(e, idev) \
        rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock))

#define sock_dereference(e, sk) \
        rcu_dereference_protected(e, lockdep_sock_is_held(sk))

/*
 * List iterators.  The suffix names the dereference used for each step:
 *   _socklock       - sock_dereference()
 *   _mclock / _tomb - mc_dereference()
 *   _rcu            - rcu_dereference()
 *
 * Every macro argument that is dereferenced is parenthesized, so an
 * expression such as "&obj" can be passed as the container.  The iterator
 * argument is also assigned to and must therefore be an lvalue.
 */

/* Walk (np)->ipv6_mc_list via ->next. */
#define for_each_pmc_socklock(np, sk, pmc)                        \
        for (pmc = sock_dereference((np)->ipv6_mc_list, sk);        \
             pmc;                                                \
             pmc = sock_dereference((pmc)->next, sk))

#define for_each_pmc_rcu(np, pmc)                                \
        for (pmc = rcu_dereference((np)->ipv6_mc_list);                \
             pmc;                                                \
             pmc = rcu_dereference((pmc)->next))

/* Walk (mc)->mca_sources via ->sf_next. */
#define for_each_psf_mclock(mc, psf)                                \
        for (psf = mc_dereference((mc)->mca_sources, (mc)->idev);        \
             psf;                                                \
             psf = mc_dereference((psf)->sf_next, (mc)->idev))

#define for_each_psf_rcu(mc, psf)                                \
        for (psf = rcu_dereference((mc)->mca_sources);                \
             psf;                                                \
             psf = rcu_dereference((psf)->sf_next))

/* Walk (mc)->mca_tomb via ->sf_next. */
#define for_each_psf_tomb(mc, psf)                                \
        for (psf = mc_dereference((mc)->mca_tomb, (mc)->idev);        \
             psf;                                                \
             psf = mc_dereference((psf)->sf_next, (mc)->idev))

/* Walk (idev)->mc_list via ->next. */
#define for_each_mc_mclock(idev, mc)                                \
        for (mc = mc_dereference((idev)->mc_list, idev);        \
             mc;                                                \
             mc = mc_dereference((mc)->next, idev))

#define for_each_mc_rcu(idev, mc)                                \
        for (mc = rcu_dereference((idev)->mc_list);             \
             mc;                                                \
             mc = rcu_dereference((mc)->next))

/* Walk (idev)->mc_tomb via ->next. */
#define for_each_mc_tomb(idev, mc)                                \
        for (mc = mc_dereference((idev)->mc_tomb, idev);        \
             mc;                                                \
             mc = mc_dereference((mc)->next, idev))

/*
 * Return the configured unsolicited-report interval for @idev, choosing
 * the MLDv1 or MLDv2 setting according to mld_in_v1_mode().  Non-positive
 * configuration values are replaced by 1.
 */
static int unsolicited_report_interval(struct inet6_dev *idev)
{
        int interval;

        interval = mld_in_v1_mode(idev) ?
                   READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval) :
                   READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval);

        if (interval <= 0)
                return 1;

        return interval;
}

/*
 * Join multicast group @addr on behalf of socket @sk with filter @mode.
 *
 * @ifindex selects the device; 0 means "use the device of the route to
 * @addr".  Requires RTNL (asserted).  The membership list is walked with
 * the socket-lock flavour of dereference.
 *
 * Returns 0 on success, -EINVAL if @addr is not multicast, -EADDRINUSE if
 * the socket already has a matching membership, -ENOMEM if the membership
 * record cannot be allocated, -ENODEV if no device is found, or the error
 * from __ipv6_dev_mc_inc().
 */
static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
                               const struct in6_addr *addr, unsigned int mode)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6_mc_socklist *entry;
        struct net_device *dev = NULL;
        int err;

        ASSERT_RTNL();

        if (!ipv6_addr_is_multicast(addr))
                return -EINVAL;

        /* An ifindex of 0 matches a membership on any interface. */
        for_each_pmc_socklock(np, sk, entry) {
                if (ifindex != 0 && entry->ifindex != ifindex)
                        continue;
                if (ipv6_addr_equal(&entry->addr, addr))
                        return -EADDRINUSE;
        }

        entry = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->next = NULL;
        entry->addr = *addr;

        if (ifindex != 0) {
                dev = __dev_get_by_index(net, ifindex);
        } else {
                struct rt6_info *rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);

                if (rt) {
                        dev = rt->dst.dev;
                        ip6_rt_put(rt);
                }
        }

        if (!dev) {
                err = -ENODEV;
                goto free_entry;
        }

        entry->ifindex = dev->ifindex;
        entry->sfmode = mode;
        RCU_INIT_POINTER(entry->sflist, NULL);

        /* Add, or bump the user count of, the group on the device. */
        err = __ipv6_dev_mc_inc(dev, addr, mode);
        if (err)
                goto free_entry;

        /* Publish the new membership at the head of the socket list. */
        entry->next = np->ipv6_mc_list;
        rcu_assign_pointer(np->ipv6_mc_list, entry);

        return 0;

free_entry:
        sock_kfree_s(sk, entry, sizeof(*entry));
        return err;
}

/*
 * Join multicast group @addr on socket @sk in MCAST_EXCLUDE mode.
 * @ifindex == 0 lets __ipv6_sock_mc_join() pick the device from a route
 * lookup.  Returns whatever __ipv6_sock_mc_join() returns.
 */
int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
        return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE);
}
EXPORT_SYMBOL(ipv6_sock_mc_join);

/*
 * Same as ipv6_sock_mc_join(), but the caller supplies the filter @mode
 * that is recorded for the membership instead of MCAST_EXCLUDE.
 */
int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
                          const struct in6_addr *addr, unsigned int mode)
{
        return __ipv6_sock_mc_join(sk, ifindex, addr, mode);
}

/*
 *        socket leave on multicast group
 */
int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6_mc_socklist *mc_lst;
        struct ipv6_mc_socklist __rcu **lnk;
        struct net *net = sock_net(sk);

        ASSERT_RTNL();

        if (!ipv6_addr_is_multicast(addr))
                return -EINVAL;

        for (lnk = &np->ipv6_mc_list;
             (mc_lst = sock_dereference(*lnk, sk)) != NULL;
              lnk = &mc_lst->next) {
                if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
                    ipv6_addr_equal(&mc_lst->addr, addr)) {
                        struct net_device *dev;

                        *lnk = mc_lst->next;

                        dev = __dev_get_by_index(net, mc_lst->ifindex);
                        if (dev) {
                                struct inet6_dev *idev = __in6_dev_get(dev);

                                ip6_mc_leave_src(sk, mc_lst, idev);
                                if (idev)
                                        __ipv6_dev_mc_dec(idev, &mc_lst->addr);
                        } else {
                                ip6_mc_leave_src(sk, mc_lst, NULL);
                        }

                        atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
                        kfree_rcu(mc_lst, rcu);
                        return 0;
                }
        }

        return -EADDRNOTAVAIL;
}
EXPORT_SYMBOL(ipv6_sock_mc_drop);

/*
 * Resolve the inet6_dev to use for @group.
 *
 * With a non-zero @ifindex the device is looked up by index; otherwise the
 * device of the route towards @group is used.  Returns NULL when no device
 * is found, when the device has no inet6_dev, or when the inet6_dev is
 * marked dead.  No reference is taken on the returned inet6_dev.
 */
static struct inet6_dev *ip6_mc_find_dev_rtnl(struct net *net,
                                              const struct in6_addr *group,
                                              int ifindex)
{
        struct net_device *dev = NULL;
        struct inet6_dev *idev;

        if (ifindex != 0) {
                dev = __dev_get_by_index(net, ifindex);
        } else {
                struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0);

                if (rt) {
                        dev = rt->dst.dev;
                        ip6_rt_put(rt);
                }
        }

        if (!dev)
                return NULL;

        idev = __in6_dev_get(dev);
        if (!idev || idev->dead)
                return NULL;

        return idev;
}

/*
 * Drop every multicast membership held by @sk.
 *
 * Each record is popped from the head of the socket list, its source
 * filter is released, the device membership is dropped when the device
 * still has IPv6 state, and the record is freed after an RCU grace period.
 * Requires RTNL (asserted); the list is accessed with the socket-lock
 * flavour of dereference.
 */
void __ipv6_sock_mc_close(struct sock *sk)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct net *net = sock_net(sk);
        struct ipv6_mc_socklist *entry;

        ASSERT_RTNL();

        for (;;) {
                struct inet6_dev *idev;
                struct net_device *dev;

                entry = sock_dereference(np->ipv6_mc_list, sk);
                if (!entry)
                        break;

                np->ipv6_mc_list = entry->next;

                dev = __dev_get_by_index(net, entry->ifindex);
                idev = dev ? __in6_dev_get(dev) : NULL;

                ip6_mc_leave_src(sk, entry, idev);
                if (idev)
                        __ipv6_dev_mc_dec(idev, &entry->addr);

                atomic_sub(sizeof(*entry), &sk->sk_omem_alloc);
                kfree_rcu(entry, rcu);
        }
}

/*
 * Release all multicast memberships of @sk.  Nothing is locked when the
 * membership list is empty; otherwise RTNL and then the socket lock are
 * taken around __ipv6_sock_mc_close().
 */
void ipv6_sock_mc_close(struct sock *sk)
{
        struct ipv6_pinfo *np = inet6_sk(sk);

        if (rcu_access_pointer(np->ipv6_mc_list)) {
                rtnl_lock();
                lock_sock(sk);
                __ipv6_sock_mc_close(sk);
                release_sock(sk);
                rtnl_unlock();
        }
}

/*
 * Add (@add != 0) or delete (@add == 0) one source address in the source
 * filter of a group the socket has already joined, and mirror the change
 * into the interface filter.
 *
 * @omode: filter mode (MCAST_INCLUDE / MCAST_EXCLUDE) the request applies to
 * @pgsr:  group, source and interface index of the request
 *
 * Returns 0 on success, or:
 *   -EINVAL        group is not multicast, the socket has no matching
 *                  membership, or a non-empty filter exists in another mode
 *   -ENODEV        no usable device for the group/interface
 *   -EADDRNOTAVAIL delete of a source that is not in the filter, or add of
 *                  a source that already is
 *   -ENOBUFS       filter already holds sysctl_mld_max_msf sources, or the
 *                  list could not be (re)allocated
 * Deleting the last source of an INCLUDE filter leaves the group and
 * returns the result of ipv6_sock_mc_drop().
 *
 * NOTE(review): the list walk uses the socket-lock flavour of dereference
 * and the leave path asserts RTNL, so the caller presumably holds both --
 * confirm against the callers.
 */
int ip6_mc_source(int add, int omode, struct sock *sk,
        struct group_source_req *pgsr)
{
        struct in6_addr *source, *group;
        struct ipv6_mc_socklist *pmc;
        struct inet6_dev *idev;
        struct ipv6_pinfo *inet6 = inet6_sk(sk);
        struct ip6_sf_socklist *psl;
        struct net *net = sock_net(sk);
        int i, j, rv;
        int leavegroup = 0;
        int err;

        source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr;
        group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr;

        if (!ipv6_addr_is_multicast(group))
                return -EINVAL;

        idev = ip6_mc_find_dev_rtnl(net, group, pgsr->gsr_interface);
        if (!idev)
                return -ENODEV;

        /* default result for the "goto done" paths that do not set err */
        err = -EADDRNOTAVAIL;

        mutex_lock(&idev->mc_lock);
        /* find the membership; gsr_interface == 0 matches any interface */
        for_each_pmc_socklock(inet6, sk, pmc) {
                if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
                        continue;
                if (ipv6_addr_equal(&pmc->addr, group))
                        break;
        }
        if (!pmc) {                /* must have a prior join */
                err = -EINVAL;
                goto done;
        }
        /* if a source filter was set, must be the same mode as before */
        if (rcu_access_pointer(pmc->sflist)) {
                if (pmc->sfmode != omode) {
                        err = -EINVAL;
                        goto done;
                }
        } else if (pmc->sfmode != omode) {
                /* allow mode switches for empty-set filters */
                ip6_mc_add_src(idev, group, omode, 0, NULL, 0);
                ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
                pmc->sfmode = omode;
        }

        psl = sock_dereference(pmc->sflist, sk);
        if (!add) {
                if (!psl)
                        goto done;        /* err = -EADDRNOTAVAIL */
                /* rv stays non-zero unless the source is found at index i */
                rv = !0;
                for (i = 0; i < psl->sl_count; i++) {
                        rv = !ipv6_addr_equal(&psl->sl_addr[i], source);
                        if (rv == 0)
                                break;
                }
                if (rv)                /* source not found */
                        goto done;        /* err = -EADDRNOTAVAIL */

                /* special case - (INCLUDE, empty) == LEAVE_GROUP */
                if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
                        leavegroup = 1;
                        goto done;
                }

                /* update the interface filter */
                ip6_mc_del_src(idev, group, omode, 1, source, 1);

                /* close the gap left by the removed entry */
                for (j = i+1; j < psl->sl_count; j++)
                        psl->sl_addr[j-1] = psl->sl_addr[j];
                psl->sl_count--;
                err = 0;
                goto done;
        }
        /* else, add a new source to the filter */

        if (psl && psl->sl_count >= sysctl_mld_max_msf) {
                err = -ENOBUFS;
                goto done;
        }
        /* no list yet, or list full: allocate one IP6_SFBLOCK entries larger */
        if (!psl || psl->sl_count == psl->sl_max) {
                struct ip6_sf_socklist *newpsl;
                int count = IP6_SFBLOCK;

                if (psl)
                        count += psl->sl_max;
                newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
                                      GFP_KERNEL);
                if (!newpsl) {
                        err = -ENOBUFS;
                        goto done;
                }
                newpsl->sl_max = count;
                /* old capacity; 0 when there was no previous list */
                newpsl->sl_count = count - IP6_SFBLOCK;
                if (psl) {
                        for (i = 0; i < psl->sl_count; i++)
                                newpsl->sl_addr[i] = psl->sl_addr[i];
                        /* un-account the old list; it is freed below via RCU */
                        atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
                                   &sk->sk_omem_alloc);
                }
                rcu_assign_pointer(pmc->sflist, newpsl);
                kfree_rcu(psl, rcu);
                psl = newpsl;
        }
        rv = 1;        /* > 0 for insert logic below if sl_count is 0 */
        for (i = 0; i < psl->sl_count; i++) {
                rv = !ipv6_addr_equal(&psl->sl_addr[i], source);
                if (rv == 0) /* source already present; err = -EADDRNOTAVAIL */
                        goto done;
        }
        /* the scan above always ends with i == sl_count, so this shift is a
         * no-op and the new source is appended at the end
         */
        for (j = psl->sl_count-1; j >= i; j--)
                psl->sl_addr[j+1] = psl->sl_addr[j];
        psl->sl_addr[i] = *source;
        psl->sl_count++;
        err = 0;
        /* update the interface list */
        ip6_mc_add_src(idev, group, omode, 1, source, 1);
done:
        mutex_unlock(&idev->mc_lock);
        /* the group is left only after mc_lock has been released */
        if (leavegroup)
                err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
        return err;
}

/*
 * Replace the whole source filter of an already-joined group.
 *
 * @gsf:  group, interface index, filter mode and number of sources
 * @list: the gsf->gf_numsrc source addresses, one sockaddr_storage each
 *        (the function reads exactly gf_numsrc entries; the caller is
 *        presumably responsible for sizing the array -- confirm)
 *
 * An INCLUDE filter with zero sources is treated as leaving the group and
 * returns the result of ipv6_sock_mc_drop().
 *
 * Returns 0 on success, -EINVAL for a non-multicast group, an unknown
 * filter mode or a missing membership, -ENODEV if no usable device is
 * found, -ENOBUFS if the new list cannot be allocated, or the error from
 * ip6_mc_add_src().
 */
int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
                    struct sockaddr_storage *list)
{
        const struct in6_addr *group;
        struct ipv6_mc_socklist *pmc;
        struct inet6_dev *idev;
        struct ipv6_pinfo *inet6 = inet6_sk(sk);
        struct ip6_sf_socklist *newpsl, *psl;
        struct net *net = sock_net(sk);
        int leavegroup = 0;
        int i, err;

        group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;

        if (!ipv6_addr_is_multicast(group))
                return -EINVAL;
        if (gsf->gf_fmode != MCAST_INCLUDE &&
            gsf->gf_fmode != MCAST_EXCLUDE)
                return -EINVAL;

        idev = ip6_mc_find_dev_rtnl(net, group, gsf->gf_interface);
        if (!idev)
                return -ENODEV;

        err = 0;

        /* (INCLUDE, empty) is equivalent to leaving the group */
        if (gsf->gf_fmode == MCAST_INCLUDE && gsf->gf_numsrc == 0) {
                leavegroup = 1;
                goto done;
        }

        /* unlike ip6_mc_source(), the interface index must match exactly */
        for_each_pmc_socklock(inet6, sk, pmc) {
                if (pmc->ifindex != gsf->gf_interface)
                        continue;
                if (ipv6_addr_equal(&pmc->addr, group))
                        break;
        }
        if (!pmc) {                /* must have a prior join */
                err = -EINVAL;
                goto done;
        }
        /* install the new filter on the interface before removing the old one */
        if (gsf->gf_numsrc) {
                newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
                                                      gsf->gf_numsrc),
                                      GFP_KERNEL);
                if (!newpsl) {
                        err = -ENOBUFS;
                        goto done;
                }
                newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc;
                for (i = 0; i < newpsl->sl_count; ++i, ++list) {
                        struct sockaddr_in6 *psin6;

                        psin6 = (struct sockaddr_in6 *)list;
                        newpsl->sl_addr[i] = psin6->sin6_addr;
                }
                mutex_lock(&idev->mc_lock);
                err = ip6_mc_add_src(idev, group, gsf->gf_fmode,
                                     newpsl->sl_count, newpsl->sl_addr, 0);
                if (err) {
                        mutex_unlock(&idev->mc_lock);
                        sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr,
                                                             newpsl->sl_max));
                        goto done;
                }
                mutex_unlock(&idev->mc_lock);
        } else {
                newpsl = NULL;
                mutex_lock(&idev->mc_lock);
                ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
                mutex_unlock(&idev->mc_lock);
        }

        /* remove the old filter from the interface and swap the socket list */
        mutex_lock(&idev->mc_lock);
        psl = sock_dereference(pmc->sflist, sk);
        if (psl) {
                ip6_mc_del_src(idev, group, pmc->sfmode,
                               psl->sl_count, psl->sl_addr, 0);
                /* un-account the old list; it is freed below via RCU */
                atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
                           &sk->sk_omem_alloc);
        } else {
                ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
        }
        rcu_assign_pointer(pmc->sflist, newpsl);
        mutex_unlock(&idev->mc_lock);
        kfree_rcu(psl, rcu);
        pmc->sfmode = gsf->gf_fmode;
        err = 0;
done:
        if (leavegroup)
                err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
        return err;
}

/*
 * Report the source filter of a joined group.
 *
 * On entry gsf->gf_numsrc is the number of sockaddr_storage slots that may
 * be written at @optval starting at @ss_offset.  On return gsf->gf_fmode
 * holds the filter mode and gsf->gf_numsrc the total number of sources in
 * the filter; at most the smaller of the two counts is copied out.
 *
 * Returns 0 on success, -EINVAL if the group is not multicast,
 * -EADDRNOTAVAIL if the socket has no membership for the group on
 * gf_interface, or -EFAULT if copying to @optval fails.
 */
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
                  sockptr_t optval, size_t ss_offset)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        const struct in6_addr *group;
        struct ipv6_mc_socklist *pmc;
        struct ip6_sf_socklist *psl;
        unsigned int count = 0;
        int i, copycount;

        group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
        if (!ipv6_addr_is_multicast(group))
                return -EINVAL;

        /* Modifying ipv6_mc_list needs both the socket lock and RTNL; the
         * socket lock alone, which we hold, is enough to read it.
         */
        for_each_pmc_socklock(np, sk, pmc) {
                if (pmc->ifindex == gsf->gf_interface &&
                    ipv6_addr_equal(group, &pmc->addr))
                        break;
        }
        if (!pmc)                /* no prior join */
                return -EADDRNOTAVAIL;

        gsf->gf_fmode = pmc->sfmode;
        psl = sock_dereference(pmc->sflist, sk);
        if (psl)
                count = psl->sl_count;

        copycount = min(count, gsf->gf_numsrc);
        gsf->gf_numsrc = count;

        for (i = 0; i < copycount; i++) {
                struct sockaddr_storage ss;
                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;

                /* zero the whole slot before filling the IPv6 fields */
                memset(&ss, 0, sizeof(ss));
                sin6->sin6_family = AF_INET6;
                sin6->sin6_addr = psl->sl_addr[i];

                if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss)))
                        return -EFAULT;
                ss_offset += sizeof(ss);
        }

        return 0;
}

/*
 * Decide whether socket @sk accepts a packet sent to group @mc_addr from
 * @src_addr.
 *
 * If the socket has no membership for the group the MC6_ALL socket bit
 * decides.  With a membership but no source list, only EXCLUDE mode
 * accepts.  With a source list, INCLUDE mode accepts listed sources and
 * EXCLUDE mode accepts unlisted ones.  Runs under rcu_read_lock().
 */
bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
                    const struct in6_addr *src_addr)
{
        const struct ipv6_pinfo *np = inet6_sk(sk);
        const struct ipv6_mc_socklist *mc;
        const struct ip6_sf_socklist *psl;
        bool accept = true;
        int i;

        rcu_read_lock();
        for_each_pmc_rcu(np, mc) {
                if (ipv6_addr_equal(&mc->addr, mc_addr))
                        break;
        }
        if (!mc) {
                rcu_read_unlock();
                return inet6_test_bit(MC6_ALL, sk);
        }

        psl = rcu_dereference(mc->sflist);
        if (!psl) {
                accept = mc->sfmode == MCAST_EXCLUDE;
                goto out;
        }

        /* i ends up < sl_count exactly when the source is listed */
        for (i = 0; i < psl->sl_count; i++) {
                if (ipv6_addr_equal(&psl->sl_addr[i], src_addr))
                        break;
        }
        if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
                accept = false;
        if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
                accept = false;
out:
        rcu_read_unlock();

        return accept;
}

/* called with mc_lock */
static void igmp6_group_added(struct ifmcaddr6 *mc)
{
        struct net_device *dev = mc->idev->dev;
        char buf[MAX_ADDR_LEN];

        if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
            IPV6_ADDR_SCOPE_LINKLOCAL)
                return;

        if (!(mc->mca_flags&MAF_LOADED)) {
                mc->mca_flags |= MAF_LOADED;
                if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
                        dev_mc_add(dev, buf);
        }

        if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
                return;

        if (mld_in_v1_mode(mc->idev)) {
                igmp6_join_group(mc);
                return;
        }
        /* else v2 */

        /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we
         * should not send filter-mode change record as the mode
         * should be from IN() to IN(A).
         */
        if (mc->mca_sfmode == MCAST_EXCLUDE)
                mc->mca_crcount = mc->idev->mc_qrv;

        mld_ifc_event(mc->idev);
}

/*
 * A group was removed from the device: unprogram the link-layer multicast
 * address if it had been loaded and, for reportable groups, send the
 * leave (unless the inet6_dev is dead) and cancel any pending per-group
 * work, dropping the reference held for it.
 *
 * called with mc_lock
 */
static void igmp6_group_dropped(struct ifmcaddr6 *mc)
{
        struct net_device *dev = mc->idev->dev;
        char hwaddr[MAX_ADDR_LEN];

        /* nothing to do for scopes below link-local */
        if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
                return;

        if ((mc->mca_flags & MAF_LOADED) != 0) {
                mc->mca_flags &= ~MAF_LOADED;
                if (!ndisc_mc_map(&mc->mca_addr, hwaddr, dev, 0))
                        dev_mc_del(dev, hwaddr);
        }

        if (mc->mca_flags & MAF_NOREPORT)
                return;

        if (!mc->idev->dead)
                igmp6_leave_group(mc);

        if (!cancel_delayed_work(&mc->mca_work))
                return;
        refcount_dec(&mc->mca_refcnt);
}

/*
 * deleted ifmcaddr6 manipulation
 * called with mc_lock
 *
 * Record the deletion of group @im by pushing a freshly allocated copy of
 * its reporting state onto idev->mc_tomb.  For INCLUDE-mode groups the
 * source and tombstone lists are moved from @im to the record.  If the
 * allocation fails the function returns silently and no record is kept.
 */
static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
        struct ifmcaddr6 *pmc;

        /* this is an "ifmcaddr6" for convenience; only the fields below
         * are actually used. In particular, the refcnt and users are not
         * used for management of the delete list. Using the same structure
         * for deleted items allows change reports to use common code with
         * non-deleted or query-response MCA's.
         */
        pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
        if (!pmc)
                return;

        pmc->idev = im->idev;
        /* reference dropped again when the record is removed in
         * mld_del_delrec() or mld_clear_delrec()
         */
        in6_dev_hold(idev);
        pmc->mca_addr = im->mca_addr;
        pmc->mca_crcount = idev->mc_qrv;
        pmc->mca_sfmode = im->mca_sfmode;
        if (pmc->mca_sfmode == MCAST_INCLUDE) {
                struct ip6_sf_list *psf;

                /* hand both source lists over to the record ... */
                rcu_assign_pointer(pmc->mca_tomb,
                                   mc_dereference(im->mca_tomb, idev));
                rcu_assign_pointer(pmc->mca_sources,
                                   mc_dereference(im->mca_sources, idev));
                /* ... and detach them from the group being deleted */
                RCU_INIT_POINTER(im->mca_tomb, NULL);
                RCU_INIT_POINTER(im->mca_sources, NULL);

                /* give every moved source the record's change-record count */
                for_each_psf_mclock(pmc, psf)
                        psf->sf_crcount = pmc->mca_crcount;
        }

        /* push the record onto the head of the device tombstone list */
        rcu_assign_pointer(pmc->next, idev->mc_tomb);
        rcu_assign_pointer(idev->mc_tomb, pmc);
}

/* called with mc_lock */
static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
        struct ip6_sf_list *psf, *sources, *tomb;
        struct in6_addr *pmca = &im->mca_addr;
        struct ifmcaddr6 *pmc, *pmc_prev;

        pmc_prev = NULL;
        for_each_mc_tomb(idev, pmc) {
                if (ipv6_addr_equal(&pmc->mca_addr, pmca))
                        break;
                pmc_prev = pmc;
        }
        if (pmc) {
                if (pmc_prev)
                        rcu_assign_pointer(pmc_prev->next, pmc->next);
                else
                        rcu_assign_pointer(idev->mc_tomb, pmc->next);
        }

        if (pmc) {
                im->idev = pmc->idev;
                if (im->mca_sfmode == MCAST_INCLUDE) {
                        tomb = rcu_replace_pointer(im->mca_tomb,
                                                   mc_dereference(pmc->mca_tomb, pmc->idev),
                                                   lockdep_is_held(&im->idev->mc_lock));
                        rcu_assign_pointer(pmc->mca_tomb, tomb);

                        sources = rcu_replace_pointer(im->mca_sources,
                                                      mc_dereference(pmc->mca_sources, pmc->idev),
                                                      lockdep_is_held(&im->idev->mc_lock));
                        rcu_assign_pointer(pmc->mca_sources, sources);
                        for_each_psf_mclock(im, psf)
                                psf->sf_crcount = idev->mc_qrv;
                } else {
                        im->mca_crcount = idev->mc_qrv;
                }
                in6_dev_put(pmc->idev);
                ip6_mc_clear_src(pmc);
                kfree_rcu(pmc, rcu);
        }
}

/*
 * Throw away all deletion state of @idev: every record on idev->mc_tomb
 * and every tombstoned source of the groups still on idev->mc_list.
 *
 * called with mc_lock
 */
static void mld_clear_delrec(struct inet6_dev *idev)
{
        struct ifmcaddr6 *rec, *mc;

        /* detach the whole tombstone list, then release it entry by entry */
        rec = mc_dereference(idev->mc_tomb, idev);
        RCU_INIT_POINTER(idev->mc_tomb, NULL);

        while (rec) {
                struct ifmcaddr6 *next = mc_dereference(rec->next, idev);

                ip6_mc_clear_src(rec);
                in6_dev_put(rec->idev);
                kfree_rcu(rec, rcu);
                rec = next;
        }

        /* clear dead sources, too */
        for_each_mc_mclock(idev, mc) {
                struct ip6_sf_list *psf = mc_dereference(mc->mca_tomb, idev);

                RCU_INIT_POINTER(mc->mca_tomb, NULL);
                while (psf) {
                        struct ip6_sf_list *next;

                        next = mc_dereference(psf->sf_next, idev);
                        kfree_rcu(psf, rcu);
                        psf = next;
                }
        }
}

/* Free every skb queued on idev->mc_query_queue, under mc_query_lock. */
static void mld_clear_query(struct inet6_dev *idev)
{
        struct sk_buff *skb;

        spin_lock_bh(&idev->mc_query_lock);
        for (skb = __skb_dequeue(&idev->mc_query_queue);
             skb;
             skb = __skb_dequeue(&idev->mc_query_queue))
                kfree_skb(skb);
        spin_unlock_bh(&idev->mc_query_lock);
}

/* Free every skb queued on idev->mc_report_queue, under mc_report_lock. */
static void mld_clear_report(struct inet6_dev *idev)
{
        struct sk_buff *skb;

        spin_lock_bh(&idev->mc_report_lock);
        for (skb = __skb_dequeue(&idev->mc_report_queue);
             skb;
             skb = __skb_dequeue(&idev->mc_report_queue))
                kfree_skb(skb);
        spin_unlock_bh(&idev->mc_report_lock);
}

/* Take an additional reference on @mc; released with ma_put(). */
static void mca_get(struct ifmcaddr6 *mc)
{
        refcount_inc(&mc->mca_refcnt);
}

/*
 * Drop one reference on @mc.  When the last reference goes away the
 * inet6_dev reference stored in mc->idev is released and @mc is freed
 * after an RCU grace period.
 */
static void ma_put(struct ifmcaddr6 *mc)
{
        if (!refcount_dec_and_test(&mc->mca_refcnt))
                return;

        in6_dev_put(mc->idev);
        kfree_rcu(mc, rcu);
}

/*
 * Allocate and initialise a device multicast group entry for @addr with
 * one user, one reference and a filter count of 1 in @mode.  The caller
 * must already hold the inet6_dev reference that is stored in mc->idev.
 * Returns NULL if the allocation fails.
 *
 * called with mc_lock
 */
static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
                                   const struct in6_addr *addr,
                                   unsigned int mode)
{
        struct ifmcaddr6 *mc = kzalloc(sizeof(*mc), GFP_KERNEL);
        unsigned long now;

        if (!mc)
                return NULL;

        INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work);

        mc->idev = idev; /* reference taken by caller */
        mc->mca_addr = *addr;
        mc->mca_users = 1;

        /* mca_stamp should be updated upon changes */
        now = jiffies;
        mc->mca_tstamp = now;
        mc->mca_cstamp = now;

        refcount_set(&mc->mca_refcnt, 1);

        mc->mca_sfmode = mode;
        mc->mca_sfcount[mode] = 1;

        /* all-nodes and sub-link-local scope groups are never reported */
        if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
            IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
                mc->mca_flags |= MAF_NOREPORT;

        return mc;
}

/*
 * Send an rtnetlink notification of type @event for multicast address
 * @ifmca on @dev to the RTNLGRP_IPV6_MCADDR group.  If the message cannot
 * be allocated or filled, the error is recorded on the group's listeners
 * with rtnl_set_sk_err() instead.
 */
static void inet6_ifmcaddr_notify(struct net_device *dev,
                                  const struct ifmcaddr6 *ifmca, int event)
{
        struct inet6_fill_args fillargs = {
                .portid = 0,
                .seq = 0,
                .event = event,
                .flags = 0,
                .netnsid = -1,
                .force_rt_scope_universe = true,
        };
        struct net *net = dev_net(dev);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
                        nla_total_size(sizeof(struct in6_addr)) +
                        nla_total_size(sizeof(struct ifa_cacheinfo)),
                        GFP_KERNEL);
        if (skb) {
                err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs);
                if (err >= 0) {
                        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL,
                                    GFP_KERNEL);
                        return;
                }

                /* the size computed above should always be sufficient */
                WARN_ON_ONCE(err == -EMSGSIZE);
                nlmsg_free(skb);
        } else {
                err = -ENOMEM;
        }

        rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err);
}

/*
 *        device multicast group inc (add if not found)
 *
 * If @addr is already on the device list, its user count is bumped and the
 * filter @mode is added; otherwise a new entry is allocated, linked at the
 * head of idev->mc_list and announced.  Requires RTNL (asserted).
 *
 * Returns 0 on success, -EINVAL if @dev has no inet6_dev, -ENODEV if the
 * inet6_dev is dead, or -ENOMEM if a new entry cannot be allocated.
 */
static int __ipv6_dev_mc_inc(struct net_device *dev,
                             const struct in6_addr *addr, unsigned int mode)
{
        struct ifmcaddr6 *mc;
        struct inet6_dev *idev;

        ASSERT_RTNL();

        /* we need to take a reference on idev */
        idev = in6_dev_get(dev);

        if (!idev)
                return -EINVAL;

        if (idev->dead) {
                in6_dev_put(idev);
                return -ENODEV;
        }

        mutex_lock(&idev->mc_lock);
        for_each_mc_mclock(idev, mc) {
                if (ipv6_addr_equal(&mc->mca_addr, addr)) {
                        /* existing group: one more user, no new entry, so
                         * the idev reference taken above is not needed
                         */
                        mc->mca_users++;
                        ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
                        mutex_unlock(&idev->mc_lock);
                        in6_dev_put(idev);
                        return 0;
                }
        }

        mc = mca_alloc(idev, addr, mode);
        if (!mc) {
                mutex_unlock(&idev->mc_lock);
                in6_dev_put(idev);
                return -ENOMEM;
        }

        /* from here on the idev reference is owned by mc (stored in
         * mc->idev by mca_alloc() and released in ma_put())
         */
        rcu_assign_pointer(mc->next, idev->mc_list);
        rcu_assign_pointer(idev->mc_list, mc);

        /* extra reference held across the calls below; dropped by the
         * ma_put() at the end
         */
        mca_get(mc);

        mld_del_delrec(idev, mc);
        igmp6_group_added(mc);
        inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
        mutex_unlock(&idev->mc_lock);
        ma_put(mc);
        return 0;
}

/*
 * Add (or take another user on) multicast group @addr on @dev in
 * MCAST_EXCLUDE mode.  Returns whatever __ipv6_dev_mc_inc() returns.
 */
int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
{
        return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
}
EXPORT_SYMBOL(ipv6_dev_mc_inc);

/*
 * device multicast group del
 *
 * Drop one user of group @addr on @idev.  When the last user goes away
 * the entry is unlinked from idev->mc_list, the group is dropped on the
 * device, a RTM_DELMULTICAST notification is sent, its sources are cleared
 * and the list's reference is released.  Requires RTNL (asserted).
 *
 * Returns 0 if the group was found, -ENOENT otherwise.
 */
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
        struct ifmcaddr6 *ma, __rcu **map;

        ASSERT_RTNL();

        mutex_lock(&idev->mc_lock);
        for (map = &idev->mc_list;
             (ma = mc_dereference(*map, idev));
             map = &ma->next) {
                if (!ipv6_addr_equal(&ma->mca_addr, addr))
                        continue;

                /* other users remain: only the count changes */
                if (--ma->mca_users != 0) {
                        mutex_unlock(&idev->mc_lock);
                        return 0;
                }

                *map = ma->next;

                igmp6_group_dropped(ma);
                inet6_ifmcaddr_notify(idev->dev, ma, RTM_DELMULTICAST);
                ip6_mc_clear_src(ma);
                mutex_unlock(&idev->mc_lock);

                ma_put(ma);
                return 0;
        }

        mutex_unlock(&idev->mc_lock);
        return -ENOENT;
}

/*
 * Drop one user of multicast group @addr on @dev.  Requires RTNL
 * (asserted).  Returns -ENODEV if @dev has no inet6_dev, otherwise the
 * result of __ipv6_dev_mc_dec().
 */
int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        idev = __in6_dev_get(dev);
        if (!idev)
                return -ENODEV;

        return __ipv6_dev_mc_dec(idev, addr);
}
EXPORT_SYMBOL(ipv6_dev_mc_dec);

/*
 *        check if the interface/address pair is valid
 *
 * Returns true when @group is joined on @dev and traffic from @src_addr
 * passes the group's interface source filter.  A NULL or unspecified
 * @src_addr is never filtered.  Returns false when @dev has no inet6_dev
 * or the group is not on its list.  Runs entirely under rcu_read_lock().
 */
bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
                         const struct in6_addr *src_addr)
{
        struct inet6_dev *idev;
        struct ifmcaddr6 *mc;
        bool rv = false;

        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (!idev)
                goto unlock;
        for_each_mc_rcu(idev, mc) {
                if (ipv6_addr_equal(&mc->mca_addr, group))
                        break;
        }
        if (!mc)
                goto unlock;
        if (src_addr && !ipv6_addr_any(src_addr)) {
                struct ip6_sf_list *psf;

                for_each_psf_rcu(mc, psf) {
                        if (ipv6_addr_equal(&psf->sf_addr, src_addr))
                                break;
                }
                /* Listed source: accept if some INCLUDE filter names it, or
                 * if its EXCLUDE count differs from the group's EXCLUDE
                 * count.  Unlisted source: accept only if the group has any
                 * EXCLUDE-mode count.  Counters are read locklessly, hence
                 * READ_ONCE().
                 */
                if (psf)
                        rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) ||
                                READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) !=
                                READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]);
                else
                        rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0;
        } else {
                rv = true; /* don't filter unspecified source */
        }
unlock:
        rcu_read_unlock();
        return rv;
}

/*
 * (Re)arm the general-query work of @idev after a random delay below
 * idev->mc_maxdelay, plus 2 jiffies, and mark it running.  A device
 * reference is taken only when mod_delayed_work() returns false.
 *
 * called with mc_lock
 */
static void mld_gq_start_work(struct inet6_dev *idev)
{
        unsigned long tv = get_random_u32_below(idev->mc_maxdelay);

        idev->mc_gq_running = 1;
        if (mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2))
                return;

        in6_dev_hold(idev);
}

/*
 * Mark the general-query work as not running and cancel it; if a pending
 * instance was cancelled, drop the device reference taken when it was
 * armed.
 *
 * called with mc_lock
 */
static void mld_gq_stop_work(struct inet6_dev *idev)
{
        idev->mc_gq_running = 0;
        if (!cancel_delayed_work(&idev->mc_gq_work))
                return;

        __in6_dev_put(idev);
}

/*
 * (Re)arm the interface-change work of @idev after a random delay below
 * @delay, plus 2 jiffies.  A device reference is taken only when
 * mod_delayed_work() returns false.
 *
 * called with mc_lock
 */
static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay)
{
        unsigned long tv = get_random_u32_below(delay);

        if (mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2))
                return;

        in6_dev_hold(idev);
}

/*
 * Reset the interface-change retransmit count and cancel the
 * interface-change work; if a pending instance was cancelled, drop the
 * device reference taken when it was armed.
 *
 * called with mc_lock
 */
static void mld_ifc_stop_work(struct inet6_dev *idev)
{
        idev->mc_ifc_count = 0;
        if (!cancel_delayed_work(&idev->mc_ifc_work))
                return;

        __in6_dev_put(idev);
}

/* Arm the DAD work to fire after a random number of jiffies below @delay,
 * plus 2.
 * called with mc_lock
 */
static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay)
{
        unsigned long jitter = get_random_u32_below(delay);

        if (mod_delayed_work(mld_wq, &idev->mc_dad_work, jitter + 2))
                return;

        /* the work was not pending before: it now holds a reference */
        in6_dev_hold(idev);
}

static void mld_dad_stop_work(struct inet6_dev *idev)
{
        if (cancel_delayed_work(&idev->mc_dad_work))
                __in6_dev_put(idev);
}

static void mld_query_stop_work(struct inet6_dev *idev)
{
        spin_lock_bh(&idev->mc_query_lock);
        if (cancel_delayed_work(&idev->mc_query_work))
                __in6_dev_put(idev);
        spin_unlock_bh(&idev->mc_query_lock);
}

static void mld_report_stop_work(struct inet6_dev *idev)
{
        if (cancel_delayed_work_sync(&idev->mc_report_work))
                __in6_dev_put(idev);
}

/*
 * IGMP handling (alias multicast ICMPv6 messages)
 * called with mc_lock
 */
static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
{
        unsigned long delay = resptime;

        /* Do not start work for these addresses */
        if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) ||
            IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
                return;

        if (cancel_delayed_work(&ma->mca_work)) {
                refcount_dec(&ma->mca_refcnt);
                delay = ma->mca_work.timer.expires - jiffies;
        }

        if (delay >= resptime)
                delay = get_random_u32_below(resptime);

        if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
                refcount_inc(&ma->mca_refcnt);
        ma->mca_flags |= MAF_TIMER_RUNNING;
}

/* mark EXCLUDE-mode sources
 *
 * Count how many of the @nsrcs queried addresses in @srcs match an active
 * filter of @pmc (no INCLUDE count, and an EXCLUDE count equal to the
 * group's).  Always clears MAF_GSQUERY.
 *
 * Returns false when every queried source is excluded, true otherwise.
 *
 * called with mc_lock
 */
static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
                             const struct in6_addr *srcs)
{
        struct ip6_sf_list *psf;
        int matched = 0;
        int idx;

        for_each_psf_mclock(pmc, psf) {
                if (matched == nsrcs)
                        break;

                /* skip inactive filters */
                if (psf->sf_count[MCAST_INCLUDE] ||
                    pmc->mca_sfcount[MCAST_EXCLUDE] !=
                    psf->sf_count[MCAST_EXCLUDE])
                        continue;

                for (idx = 0; idx < nsrcs; idx++) {
                        if (!ipv6_addr_equal(&srcs[idx], &psf->sf_addr))
                                continue;
                        matched++;
                        break;
                }
        }

        pmc->mca_flags &= ~MAF_GSQUERY;

        /* matched == nsrcs means all sources excluded */
        return matched != nsrcs;
}

/* called with mc_lock */
static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
                            const struct in6_addr *srcs)
{
        struct ip6_sf_list *psf;
        int i, scount;

        if (pmc->mca_sfmode == MCAST_EXCLUDE)
                return mld_xmarksources(pmc, nsrcs, srcs);

        /* mark INCLUDE-mode sources */

        scount = 0;
        for_each_psf_mclock(pmc, psf) {
                if (scount == nsrcs)
                        break;
                for (i = 0; i < nsrcs; i++) {
                        if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) {
                                psf->sf_gsresp = 1;
                                scount++;
                                break;
                        }
                }
        }
        if (!scount) {
                pmc->mca_flags &= ~MAF_GSQUERY;
                return false;
        }
        pmc->mca_flags |= MAF_GSQUERY;
        return true;
}

/* Return the MLD version forced by configuration.
 *
 * Normally both settings are 0.  When a version is being enforced, the
 * 'all' device setting (.../conf/all/force_mld_version) takes precedence
 * over the individual device setting.
 */
static int mld_force_mld_version(const struct inet6_dev *idev)
{
        const struct net *net = dev_net(idev->dev);
        int forced_all = READ_ONCE(net->ipv6.devconf_all->force_mld_version);

        if (forced_all)
                return forced_all;

        return READ_ONCE(idev->cnf.force_mld_version);
}

/* True when configuration forces MLD version 2. */
static bool mld_in_v2_mode_only(const struct inet6_dev *idev)
{
        int forced = mld_force_mld_version(idev);

        return forced == 2;
}

/* True when configuration forces MLD version 1. */
static bool mld_in_v1_mode_only(const struct inet6_dev *idev)
{
        int forced = mld_force_mld_version(idev);

        return forced == 1;
}

/* Decide whether the interface currently operates in MLDv1 mode:
 * never when v2 is forced, always when v1 is forced, otherwise only while
 * the mc_v1_seen deadline is set and has not yet passed.
 */
static bool mld_in_v1_mode(const struct inet6_dev *idev)
{
        if (mld_in_v2_mode_only(idev))
                return false;

        return mld_in_v1_mode_only(idev) ||
               (idev->mc_v1_seen && time_before(jiffies, idev->mc_v1_seen));
}

/* Record the time until which the interface stays in MLDv1 mode:
 * now + (mc_qrv * mc_qi) + mc_qri.
 *
 * RFC3810, relevant sections:
 *  - 9.1. Robustness Variable
 *  - 9.2. Query Interval
 *  - 9.3. Query Response Interval
 *  - 9.12. Older Version Querier Present Timeout
 */
static void mld_set_v1_mode(struct inet6_dev *idev)
{
        unsigned long timeout;

        timeout = idev->mc_qri + (idev->mc_qrv * idev->mc_qi);
        idev->mc_v1_seen = jiffies + timeout;
}

/* Adopt the Querier's Robustness Variable carried in the MLDv2 query
 * @mlh2 when it is non-zero, then clamp idev->mc_qrv from below to
 * min(MLD_QRV_DEFAULT, sysctl_mld_qrv).
 *
 * RFC3810, relevant sections:
 *  - 5.1.8. QRV (Querier's Robustness Variable)
 *  - 9.1. Robustness Variable
 */
static void mld_update_qrv(struct inet6_dev *idev,
                           const struct mld2_query *mlh2)
{
        const int qrv_floor = min(MLD_QRV_DEFAULT, sysctl_mld_qrv);

        /* The value of the Robustness Variable MUST NOT be zero,
         * and SHOULD NOT be one. Catch this here if we ever run
         * into such a case in future.
         */
        WARN_ON(idev->mc_qrv == 0);

        if (mlh2->mld2q_qrv > 0)
                idev->mc_qrv = mlh2->mld2q_qrv;

        if (likely(!(idev->mc_qrv < qrv_floor)))
                return;

        net_warn_ratelimited("IPv6: MLD: clamping QRV from %u to %u!\n",
                             idev->mc_qrv, qrv_floor);
        idev->mc_qrv = qrv_floor;
}

/* Decode the QQIC field of the MLDv2 query @mlh2 and store the resulting
 * query interval, multiplied by HZ, in idev->mc_qi.
 *
 * Codes below 128 are used as-is; larger codes are expanded from their
 * mantissa/exponent parts as (mant | 0x10) << (exp + 3).
 *
 * RFC3810, relevant sections:
 *  - 5.1.9. QQIC (Querier's Query Interval Code)
 *  - 9.2. Query Interval
 *  - 9.12. Older Version Querier Present Timeout
 *    (the [Query Interval] in the last Query received)
 */
static void mld_update_qi(struct inet6_dev *idev,
                          const struct mld2_query *mlh2)
{
        unsigned long interval = mlh2->mld2q_qqic;

        if (mlh2->mld2q_qqic >= 128) {
                unsigned long exponent = MLDV2_QQIC_EXP(mlh2->mld2q_qqic);
                unsigned long mantissa = MLDV2_QQIC_MAN(mlh2->mld2q_qqic);

                interval = (mantissa | 0x10) << (exponent + 3);
        }

        idev->mc_qi = interval * HZ;
}

/* Store the query response interval of the MLDv2 query @mlh2 in
 * idev->mc_qri: mldv2_mrc() passed through msecs_to_jiffies().
 *
 * RFC3810, relevant sections:
 *  - 5.1.3. Maximum Response Code
 *  - 9.3. Query Response Interval
 */
static void mld_update_qri(struct inet6_dev *idev,
                           const struct mld2_query *mlh2)
{
        unsigned long qri = msecs_to_jiffies(mldv2_mrc(mlh2));

        idev->mc_qri = qri;
}

/* Handle a query that is to be treated as MLDv1.
 *
 * @v1_query is true when the packet itself is an MLDv1 query, false when
 * a longer query is being processed while the interface is in v1 mode.
 * The response delay, in jiffies and at least 1, is stored in @max_delay.
 *
 * Returns 0 on success, -EINVAL when MLDv2-only mode is forced.
 */
static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
                          unsigned long *max_delay, bool v1_query)
{
        unsigned long resp_ms;

        /* Ignore v1 queries */
        if (mld_in_v2_mode_only(idev))
                return -EINVAL;

        resp_ms = ntohs(mld->mld_maxdelay);

        /* When in MLDv1 fallback and a MLDv2 router start-up being
         * unaware of current MLDv1 operation, the MRC == MRD mapping
         * only works when the exponential algorithm is not being
         * used (as MLDv1 is unaware of such things).
         *
         * According to the RFC author, the MLDv2 implementations
         * he's aware of all use a MRC < 32768 on start up queries.
         *
         * Thus, should we *ever* encounter something else larger
         * than that, just assume the maximum possible within our
         * reach.
         */
        if (!v1_query)
                resp_ms = min(resp_ms, MLDV1_MRD_MAX_COMPAT);

        *max_delay = max(msecs_to_jiffies(resp_ms), 1UL);

        /* MLDv1 router present: we need to go into v1 mode *only*
         * when an MLDv1 query is received as per section 9.12. of
         * RFC3810! And we know from RFC2710 section 3.7 that MLDv1
         * queries MUST be of exactly 24 octets.
         */
        if (v1_query)
                mld_set_v1_mode(idev);

        /* drop all MLDv2 state in flight: the general-query report work,
         * the interface change work and the deleted report items
         */
        mld_gq_stop_work(idev);
        mld_ifc_stop_work(idev);
        mld_clear_delrec(idev);

        return 0;
}

/* Handle the header of an MLDv2 query: store the maximum response delay
 * (jiffies, at least 1) in @max_delay, refresh the interface's QRV, QI and
 * QRI from the query, and remember the delay in idev->mc_maxdelay.
 */
static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
                           unsigned long *max_delay)
{
        unsigned long resp_jiffies = msecs_to_jiffies(mldv2_mrc(mld));

        *max_delay = max(resp_jiffies, 1UL);

        mld_update_qrv(idev, mld);
        mld_update_qi(idev, mld);
        mld_update_qri(idev, mld);

        idev->mc_maxdelay = *max_delay;
}

/* Queue a received MLD query for deferred processing by mc_query_work.
 *
 * The skb is appended to idev->mc_query_queue unless the queue already
 * holds MLD_MAX_SKBS packets, the device has no inet6_dev, or the
 * inet6_dev is dead; in those cases the skb is freed here.
 *
 * called with rcu_read_lock()
 */
void igmp6_event_query(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get(skb->dev);

        if (idev && !idev->dead) {
                spin_lock_bh(&idev->mc_query_lock);
                if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) {
                        __skb_queue_tail(&idev->mc_query_queue, skb);
                        /* a newly queued work holds a reference on idev */
                        if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0))
                                in6_dev_hold(idev);
                        /* ownership passed to the queue */
                        skb = NULL;
                }
                spin_unlock_bh(&idev->mc_query_lock);
        }

        kfree_skb(skb);
}

/*
 * Process one queued MLD query.
 *
 * Validates the packet, updates the per-interface querier state and
 * (re)arms the report work of the queried group(s).  The skb is always
 * consumed, on every exit path.
 *
 * Invoked from mld_query_work() with idev->mc_lock held.
 */
static void __mld_query_work(struct sk_buff *skb)
{
        struct mld2_query *mlh2 = NULL;
        const struct in6_addr *group;
        unsigned long max_delay;
        struct inet6_dev *idev;
        struct ifmcaddr6 *ma;
        struct mld_msg *mld;
        int group_type;
        int mark = 0;
        int len, err;

        if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
                goto kfree_skb;

        /* compute payload length excluding extension headers */
        len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
        len -= skb_network_header_len(skb);

        /* RFC3810 6.2
         * Upon reception of an MLD message that contains a Query, the node
         * checks if the source address of the message is a valid link-local
         * address, if the Hop Limit is set to 1, and if the Router Alert
         * option is present in the Hop-By-Hop Options header of the IPv6
         * packet.  If any of these checks fails, the packet is dropped.
         */
        if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL) ||
            ipv6_hdr(skb)->hop_limit != 1 ||
            !(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) ||
            IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD))
                goto kfree_skb;

        idev = in6_dev_get(skb->dev);
        if (!idev)
                goto kfree_skb;

        mld = (struct mld_msg *)icmp6_hdr(skb);
        group = &mld->mld_mca;
        group_type = ipv6_addr_type(group);

        /* only the unspecified address or a multicast group may be queried */
        if (group_type != IPV6_ADDR_ANY &&
            !(group_type&IPV6_ADDR_MULTICAST))
                goto out;

        if (len < MLD_V1_QUERY_LEN) {
                goto out;
        } else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) {
                err = mld_process_v1(idev, mld, &max_delay,
                                     len == MLD_V1_QUERY_LEN);
                if (err < 0)
                        goto out;
        } else if (len >= MLD_V2_QUERY_LEN_MIN) {
                int srcs_offset = sizeof(struct mld2_query) -
                                  sizeof(struct icmp6hdr);

                if (!pskb_may_pull(skb, srcs_offset))
                        goto out;

                /* pskb_may_pull() may have reallocated the skb head, so
                 * every pointer taken into the old header is stale and
                 * must be re-derived before it is dereferenced again.
                 */
                mlh2 = (struct mld2_query *)skb_transport_header(skb);
                mld = (struct mld_msg *)icmp6_hdr(skb);
                group = &mld->mld_mca;

                mld_process_v2(idev, mlh2, &max_delay);

                if (group_type == IPV6_ADDR_ANY) { /* general query */
                        if (mlh2->mld2q_nsrcs)
                                goto out; /* no sources allowed */

                        mld_gq_start_work(idev);
                        goto out;
                }
                /* mark sources to include, if group & source-specific */
                if (mlh2->mld2q_nsrcs != 0) {
                        if (!pskb_may_pull(skb, srcs_offset +
                            ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
                                goto out;

                        /* same as above: refresh header pointers */
                        mlh2 = (struct mld2_query *)skb_transport_header(skb);
                        mld = (struct mld_msg *)icmp6_hdr(skb);
                        group = &mld->mld_mca;
                        mark = 1;
                }
        } else {
                goto out;
        }

        if (group_type == IPV6_ADDR_ANY) {
                for_each_mc_mclock(idev, ma) {
                        igmp6_group_queried(ma, max_delay);
                }
        } else {
                for_each_mc_mclock(idev, ma) {
                        if (!ipv6_addr_equal(group, &ma->mca_addr))
                                continue;
                        if (ma->mca_flags & MAF_TIMER_RUNNING) {
                                /* gsquery <- gsquery && mark */
                                if (!mark)
                                        ma->mca_flags &= ~MAF_GSQUERY;
                        } else {
                                /* gsquery <- mark */
                                if (mark)
                                        ma->mca_flags |= MAF_GSQUERY;
                                else
                                        ma->mca_flags &= ~MAF_GSQUERY;
                        }
                        /* MAF_GSQUERY is always clear here when !mark, so
                         * mlh2 (NULL on the v1 path) is only dereferenced
                         * for MLDv2 queries that carry sources.
                         */
                        if (!(ma->mca_flags & MAF_GSQUERY) ||
                            mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs))
                                igmp6_group_queried(ma, max_delay);
                        break;
                }
        }

out:
        in6_dev_put(idev);
kfree_skb:
        consume_skb(skb);
}

static void mld_query_work(struct work_struct *work)
{
        struct inet6_dev *idev = container_of(to_delayed_work(work),
                                              struct inet6_dev,
                                              mc_query_work);
        struct sk_buff_head q;
        struct sk_buff *skb;
        bool rework = false;
        int cnt = 0;

        skb_queue_head_init(&q);

        spin_lock_bh(&idev->mc_query_lock);
        while ((skb = __skb_dequeue(&idev->mc_query_queue))) {
                __skb_queue_tail(&q, skb);

                if (++cnt >= MLD_MAX_QUEUE) {
                        rework = true;
                        break;
                }
        }
        spin_unlock_bh(&idev->mc_query_lock);

        mutex_lock(&idev->mc_lock);
        while ((skb = __skb_dequeue(&q)))
                __mld_query_work(skb);
        mutex_unlock(&idev->mc_lock);

        if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0))
                return;

        in6_dev_put(idev);
}

/* Queue a received MLD report for deferred processing by mc_report_work.
 *
 * The skb is appended to idev->mc_report_queue unless the queue already
 * holds MLD_MAX_SKBS packets, the device has no inet6_dev, or the
 * inet6_dev is dead; in those cases the skb is freed here.
 *
 * called with rcu_read_lock()
 */
void igmp6_event_report(struct sk_buff *skb)
{
        struct inet6_dev *idev = __in6_dev_get(skb->dev);

        if (idev && !idev->dead) {
                spin_lock_bh(&idev->mc_report_lock);
                if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) {
                        __skb_queue_tail(&idev->mc_report_queue, skb);
                        /* a newly queued work holds a reference on idev */
                        if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0))
                                in6_dev_hold(idev);
                        /* ownership passed to the queue */
                        skb = NULL;
                }
                spin_unlock_bh(&idev->mc_report_lock);
        }

        kfree_skb(skb);
}

/* Process one queued MLD report heard from the link: if it is for a group
 * this interface has joined, cancel that group's pending report work and
 * clear its MAF_LAST_REPORTER and MAF_TIMER_RUNNING flags.
 *
 * The skb is always consumed.  Invoked from mld_report_work() with
 * idev->mc_lock held.
 */
static void __mld_report_work(struct sk_buff *skb)
{
        struct inet6_dev *idev;
        struct ifmcaddr6 *ma;
        struct mld_msg *mld;
        int saddr_type;

        /* Our own report looped back. Ignore it. */
        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        /* send our report if the MC router may not have heard this report */
        if (skb->pkt_type != PACKET_MULTICAST &&
            skb->pkt_type != PACKET_BROADCAST)
                goto drop;

        if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr)))
                goto drop;

        mld = (struct mld_msg *)icmp6_hdr(skb);

        /* Drop reports with not link local source */
        saddr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
        if (saddr_type != IPV6_ADDR_ANY &&
            !(saddr_type & IPV6_ADDR_LINKLOCAL))
                goto drop;

        idev = in6_dev_get(skb->dev);
        if (!idev)
                goto drop;

        /* Cancel the work for this group */
        for_each_mc_mclock(idev, ma) {
                if (!ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca))
                        continue;

                if (cancel_delayed_work(&ma->mca_work))
                        refcount_dec(&ma->mca_refcnt);
                ma->mca_flags &= ~(MAF_LAST_REPORTER |
                                   MAF_TIMER_RUNNING);
                break;
        }

        in6_dev_put(idev);
drop:
        consume_skb(skb);
}

static void mld_report_work(struct work_struct *work)
{
        struct inet6_dev *idev = container_of(to_delayed_work(work),
                                              struct inet6_dev,
                                              mc_report_work);
        struct sk_buff_head q;
        struct sk_buff *skb;
        bool rework = false;
        int cnt = 0;

        skb_queue_head_init(&q);
        spin_lock_bh(&idev->mc_report_lock);
        while ((skb = __skb_dequeue(&idev->mc_report_queue))) {
                __skb_queue_tail(&q, skb);

                if (++cnt >= MLD_MAX_QUEUE) {
                        rework = true;
                        break;
                }
        }
        spin_unlock_bh(&idev->mc_report_lock);

        mutex_lock(&idev->mc_lock);
        while ((skb = __skb_dequeue(&q)))
                __mld_report_work(skb);
        mutex_unlock(&idev->mc_lock);

        if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0))
                return;

        in6_dev_put(idev);
}

/* Decide whether source @psf of group @pmc belongs in an MLDv2 group
 * record of the given @type.
 *
 * @gdeleted and @sdeleted are passed through by the callers; presumably
 * they flag a deleted group / a deleted (tomb-list) source - confirm
 * against the callers.  Unknown record types yield false.
 */
static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
                  int gdeleted, int sdeleted)
{
        switch (type) {
        case MLD2_MODE_IS_INCLUDE:
        case MLD2_MODE_IS_EXCLUDE:
                if (gdeleted || sdeleted)
                        return false;
                /* with MAF_GSQUERY set, only sources marked via sf_gsresp
                 * are reported
                 */
                if ((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp)
                        return false;
                if (pmc->mca_sfmode == MCAST_INCLUDE)
                        return true;
                /* don't include if this source is excluded
                 * in all filters
                 */
                if (psf->sf_count[MCAST_INCLUDE])
                        return type == MLD2_MODE_IS_INCLUDE;
                return pmc->mca_sfcount[MCAST_EXCLUDE] ==
                        psf->sf_count[MCAST_EXCLUDE];

        case MLD2_CHANGE_TO_INCLUDE:
                return !gdeleted && !sdeleted &&
                        psf->sf_count[MCAST_INCLUDE] != 0;

        case MLD2_CHANGE_TO_EXCLUDE:
                return !gdeleted && !sdeleted &&
                        pmc->mca_sfcount[MCAST_EXCLUDE] != 0 &&
                        !psf->sf_count[MCAST_INCLUDE] &&
                        pmc->mca_sfcount[MCAST_EXCLUDE] ==
                        psf->sf_count[MCAST_EXCLUDE];

        case MLD2_ALLOW_NEW_SOURCES:
                if (gdeleted || !psf->sf_crcount)
                        return false;
                return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted;

        case MLD2_BLOCK_OLD_SOURCES:
                if (pmc->mca_sfmode != MCAST_INCLUDE)
                        return psf->sf_crcount && !gdeleted && !sdeleted;
                return gdeleted || (psf->sf_crcount && sdeleted);

        default:
                break;
        }
        return false;
}

/* Count the sources on @pmc's source list for which is_in() holds for the
 * given record @type and deletion flags.
 */
static int
mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
{
        struct ip6_sf_list *psf;
        int nsrcs = 0;

        for_each_psf_mclock(pmc, psf) {
                if (is_in(pmc, psf, type, gdeleted, sdeleted))
                        nsrcs++;
        }

        return nsrcs;
}

/* Put an IPv6 header on @skb for an outgoing multicast control packet.
 *
 * The network header is set at the current skb->data and room for the
 * header is added with skb_put().  The header gets ip6_flow_hdr(hdr, 0, 0),
 * payload length @len, next header @proto, the hop limit configured on
 * @sk, and the @saddr / @daddr addresses.  skb->dev and skb->protocol are
 * set as well.
 */
static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb,
                       struct net_device *dev, const struct in6_addr *saddr,
                       const struct in6_addr *daddr, int proto, int len)
{
        struct ipv6hdr *ip6h;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        skb_reset_network_header(skb);
        skb_put(skb, sizeof(*ip6h));
        ip6h = ipv6_hdr(skb);

        ip6_flow_hdr(ip6h, 0, 0);

        ip6h->payload_len = htons(len);
        ip6h->nexthdr = proto;
        ip6h->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit);

        ip6h->saddr = *saddr;
        ip6h->daddr = *daddr;
}

static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
{
        u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT,
                     2, 0, 0, IPV6_TLV_PADN, 0 };
        struct net_device *dev = idev->dev;
        int hlen = LL_RESERVED_SPACE(dev);
        int tlen = dev->needed_tailroom;
        const struct in6_addr *saddr;
        struct in6_addr addr_buf;
        struct mld2_report *pmr;
        struct sk_buff *skb;
        unsigned int size;
        struct sock *sk;
        struct net *net;

        /* we assume size > sizeof(ra) here
         * Also try to not allocate high-order pages for big MTU
         */
        size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen;
        skb = alloc_skb(size, GFP_KERNEL);
        if (!skb)
                return NULL;

        skb->priority = TC_PRIO_CONTROL;
        skb_reserve(skb, hlen);
        skb_tailroom_reserve(skb, mtu, tlen);

        rcu_read_lock();

        net = dev_net_rcu(dev);
        sk = net->ipv6.igmp_sk;
        skb_set_owner_w(skb, sk);

        if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
                /* <draft-ietf-magma-mld-source-05.txt>:
                 * use unspecified address as the source address
                 * when a valid link-local address is not available.
                 */
                saddr = &in6addr_any;
        } else
                saddr = &addr_buf;

        ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0);

        rcu_read_unlock();

        skb_put_data(skb, ra, sizeof(ra));

        skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data);
        skb_put(skb, sizeof(*pmr));
        pmr = (struct mld2_report *)skb_transport_header(skb);
        pmr->mld2r_type = ICMPV6_MLD2_REPORT;
        pmr->mld2r_resv1 = 0;
        pmr->mld2r_cksum = 0;
        pmr->mld2r_resv2 = 0;
        pmr->mld2r_ngrec = 0;
        return skb;
}

static void mld_sendpack(struct sk_buff *skb)
{
        struct ipv6hdr *pip6 = ipv6_hdr(skb);
        struct mld2_report *pmr =
                              (struct mld2_report *)skb_transport_header(skb);
        int payload_len, mldlen;
        struct inet6_dev *idev;
        struct net *net = dev_net(skb->dev);
        int err;
        struct flowi6 fl6;
        struct dst_entry *dst;

        rcu_read_lock();
        idev = __in6_dev_get(skb->dev);
        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

        payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) -
                sizeof(*pip6);
        mldlen = skb_tail_pointer(skb) - skb_transport_header(skb);
        pip6->payload_len = htons(payload_len);

        pmr->mld2r_cksum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen,
                                           IPPROTO_ICMPV6,
                                           csum_partial(skb_transport_header(skb),
                                                        mldlen, 0));

        icmpv6_flow_init(net->ipv6.igmp_sk, &fl6, ICMPV6_MLD2_REPORT,
                         &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
                         skb->dev->ifindex);
        dst = icmp6_dst_alloc(skb->dev, &fl6);

        err = 0;
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                dst = NULL;
        }
        skb_dst_set(skb, dst);
        if (err)
                goto err_out;

        err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                      net, net->ipv6.igmp_sk, skb, NULL, skb->dev,
                      dst_output);
out:
        if (!err) {
                ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        } else {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
        }

        rcu_read_unlock();
        return;

err_out:
        kfree_skb(skb);
        goto out;
}

/* Size in bytes of a group record for @pmc: the record header plus
 * 16 bytes for each source counted by mld_scount().
 */
static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
{
        int nsrcs = mld_scount(pmc, type, gdel, sdel);

        return sizeof(struct mld2_grec) + 16 * nsrcs;
}

/* Append an empty group record header of @type for group @pmc to @skb,
 * allocating a new report packet first when @skb is NULL, and bump the
 * report's group record count.
 *
 * On success *@ppgr points at the new record header and the (possibly
 * new) skb is returned.  Returns NULL, leaving *@ppgr untouched, when a
 * packet had to be allocated and allocation failed.
 */
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
        int type, struct mld2_grec **ppgr, unsigned int mtu)
{
        struct mld2_report *report;
        struct mld2_grec *rec;

        if (!skb) {
                skb = mld_newpack(pmc->idev, mtu);
                if (!skb)
                        return NULL;
        }

        rec = skb_put(skb, sizeof(*rec));
        rec->grec_type = type;
        rec->grec_auxwords = 0;
        rec->grec_nsrcs = 0;
        rec->grec_mca = pmc->mca_addr;        /* structure copy */

        report = (struct mld2_report *)skb_transport_header(skb);
        report->mld2r_ngrec = htons(ntohs(report->mld2r_ngrec) + 1);

        *ppgr = rec;
        return skb;
}

/* Room left in @skb as reported by skb_availroom(), or 0 when @skb is NULL.
 * Note: the argument is evaluated twice.
 */
#define AVAILABLE(skb)        ((skb) ? skb_availroom(skb) : 0)

/* Append a group record of @type for group @pmc to the report being built
 * in @skb, sending full packets and starting new ones as needed.
 *
 * @skb may be NULL; a packet is then allocated on demand.  Sources are
 * taken from pmc->mca_tomb when @sdeleted is set, otherwise from
 * pmc->mca_sources, and filtered through is_in() unless @crsend is set.
 * For ALLOW/BLOCK records each emitted source has its sf_crcount
 * decremented, and a source whose count reaches zero is unlinked and
 * freed when @sdeleted or @gdeleted is set.
 *
 * Returns the skb to continue with (possibly a different one than was
 * passed in, possibly NULL), or NULL when a packet could not be allocated.
 *
 * called with mc_lock
 */
static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
                                int type, int gdeleted, int sdeleted,
                                int crsend)
{
        struct ip6_sf_list *psf, *psf_prev, *psf_next;
        int scount, stotal, first, isquery, truncate;
        struct ip6_sf_list __rcu **psf_list;
        struct inet6_dev *idev = pmc->idev;
        struct net_device *dev = idev->dev;
        struct mld2_grec *pgr = NULL;
        struct mld2_report *pmr;
        unsigned int mtu;

        if (pmc->mca_flags & MAF_NOREPORT)
                return skb;

        /* nothing is added while the device MTU is below IPV6_MIN_MTU */
        mtu = READ_ONCE(dev->mtu);
        if (mtu < IPV6_MIN_MTU)
                return skb;

        /* isquery: current-state records; truncate: EXCLUDE-type records,
         * whose source list is cut short rather than split across packets
         */
        isquery = type == MLD2_MODE_IS_INCLUDE ||
                  type == MLD2_MODE_IS_EXCLUDE;
        truncate = type == MLD2_MODE_IS_EXCLUDE ||
                    type == MLD2_CHANGE_TO_EXCLUDE;

        /* scount: sources in the current record; stotal: sources overall */
        stotal = scount = 0;

        psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;

        if (!rcu_access_pointer(*psf_list))
                goto empty_source;

        pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL;

        /* EX and TO_EX get a fresh packet, if needed */
        if (truncate) {
                if (pmr && pmr->mld2r_ngrec &&
                    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
                        if (skb)
                                mld_sendpack(skb);
                        skb = mld_newpack(idev, mtu);
                }
        }
        /* first: the next source emitted must be preceded by a record header */
        first = 1;
        psf_prev = NULL;
        for (psf = mc_dereference(*psf_list, idev);
             psf;
             psf = psf_next) {
                struct in6_addr *psrc;

                /* fetched up front because psf may be freed below */
                psf_next = mc_dereference(psf->sf_next, idev);

                if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
                        psf_prev = psf;
                        continue;
                }

                /* Based on RFC3810 6.1. Should not send source-list change
                 * records when there is a filter mode change.
                 */
                if (((gdeleted && pmc->mca_sfmode == MCAST_EXCLUDE) ||
                     (!gdeleted && pmc->mca_crcount)) &&
                    (type == MLD2_ALLOW_NEW_SOURCES ||
                     type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount)
                        goto decrease_sf_crcount;

                /* clear marks on query responses */
                if (isquery)
                        psf->sf_gsresp = 0;

                /* not enough room for this source (plus a record header when
                 * one is still due): flush and continue in a new packet
                 */
                if (AVAILABLE(skb) < sizeof(*psrc) +
                    first*sizeof(struct mld2_grec)) {
                        if (truncate && !first)
                                break;         /* truncate these */
                        if (pgr)
                                pgr->grec_nsrcs = htons(scount);
                        if (skb)
                                mld_sendpack(skb);
                        skb = mld_newpack(idev, mtu);
                        first = 1;
                        scount = 0;
                }
                if (first) {
                        skb = add_grhead(skb, pmc, type, &pgr, mtu);
                        first = 0;
                }
                /* packet allocation failed somewhere above */
                if (!skb)
                        return NULL;
                psrc = skb_put(skb, sizeof(*psrc));
                *psrc = psf->sf_addr;
                scount++; stotal++;
                if ((type == MLD2_ALLOW_NEW_SOURCES ||
                     type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
                        /* also entered directly by the goto above, in which
                         * case the source was not added to the packet
                         */
decrease_sf_crcount:
                        psf->sf_crcount--;
                        if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
                                /* unlink psf from the list and free it after
                                 * a grace period; psf_prev stays unchanged
                                 */
                                if (psf_prev)
                                        rcu_assign_pointer(psf_prev->sf_next,
                                                           mc_dereference(psf->sf_next, idev));
                                else
                                        rcu_assign_pointer(*psf_list,
                                                           mc_dereference(psf->sf_next, idev));
                                kfree_rcu(psf, rcu);
                                continue;
                        }
                }
                psf_prev = psf;
        }

empty_source:
        if (!stotal) {
                /* ALLOW/BLOCK records with no sources are not sent at all */
                if (type == MLD2_ALLOW_NEW_SOURCES ||
                    type == MLD2_BLOCK_OLD_SOURCES)
                        return skb;
                if (pmc->mca_crcount || isquery || crsend) {
                        /* make sure we have room for group header */
                        if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)) {
                                mld_sendpack(skb);
                                skb = NULL; /* add_grhead will get a new one */
                        }
                        skb = add_grhead(skb, pmc, type, &pgr, mtu);
                }
        }
        /* finalise the source count of the last record written */
        if (pgr)
                pgr->grec_nsrcs = htons(scount);

        if (isquery)
                pmc->mca_flags &= ~MAF_GSQUERY;        /* clear query state */
        return skb;
}

/* Send a current-state report: for the single group @pmc when it is
 * non-NULL, otherwise for every group on @idev not flagged MAF_NOREPORT.
 * A group is reported as MODE_IS_EXCLUDE when it has a non-zero EXCLUDE
 * count, as MODE_IS_INCLUDE otherwise.
 *
 * called with mc_lock
 */
static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc)
{
        struct sk_buff *skb = NULL;
        int type;

        if (pmc) {
                type = pmc->mca_sfcount[MCAST_EXCLUDE] ?
                        MLD2_MODE_IS_EXCLUDE : MLD2_MODE_IS_INCLUDE;
                skb = add_grec(skb, pmc, type, 0, 0, 0);
        } else {
                for_each_mc_mclock(idev, pmc) {
                        if (pmc->mca_flags & MAF_NOREPORT)
                                continue;

                        type = pmc->mca_sfcount[MCAST_EXCLUDE] ?
                                MLD2_MODE_IS_EXCLUDE : MLD2_MODE_IS_INCLUDE;
                        skb = add_grec(skb, pmc, type, 0, 0, 0);
                }
        }

        /* flush whatever is left in the last packet */
        if (skb)
                mld_sendpack(skb);
}

/*
 * Unlink and free (via kfree_rcu) every entry of the source filter list
 * at @ppsf whose sf_crcount is zero; entries with a non-zero count stay
 * linked in their original order.
 * called with mc_lock
 */
static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev)
{
        struct ip6_sf_list *kept = NULL;        /* last entry left on the list */
        struct ip6_sf_list *cur, *following;

        cur = mc_dereference(*ppsf, idev);
        while (cur) {
                /* read the successor before cur can be handed to kfree_rcu() */
                following = mc_dereference(cur->sf_next, idev);

                if (cur->sf_crcount != 0) {
                        kept = cur;
                } else {
                        if (kept)
                                rcu_assign_pointer(kept->sf_next, following);
                        else
                                rcu_assign_pointer(*ppsf, following);
                        kfree_rcu(cur, rcu);
                }
                cur = following;
        }
}

/*
 * Build and send the pending change records for @idev: first for the
 * deleted groups kept on idev->mc_tomb, then for every group on the live
 * list.  Tomb entries that have nothing left to report are unlinked and
 * released along the way.  Nothing is sent if no record was produced.
 * called with mc_lock
 */
static void mld_send_cr(struct inet6_dev *idev)
{
        struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next;
        struct sk_buff *skb = NULL;
        int type, dtype;

        /* deleted MCA's */
        pmc_prev = NULL;
        for (pmc = mc_dereference(idev->mc_tomb, idev);
             pmc;
             pmc = pmc_next) {
                /* read the successor first: pmc may be unlinked and freed below */
                pmc_next = mc_dereference(pmc->next, idev);
                if (pmc->mca_sfmode == MCAST_INCLUDE) {
                        /* a deleted INCLUDE group uses BLOCK_OLD_SOURCES for both passes */
                        type = MLD2_BLOCK_OLD_SOURCES;
                        dtype = MLD2_BLOCK_OLD_SOURCES;
                        skb = add_grec(skb, pmc, type, 1, 0, 0);
                        skb = add_grec(skb, pmc, dtype, 1, 1, 0);
                }
                if (pmc->mca_crcount) {
                        if (pmc->mca_sfmode == MCAST_EXCLUDE) {
                                type = MLD2_CHANGE_TO_INCLUDE;
                                skb = add_grec(skb, pmc, type, 1, 0, 0);
                        }
                        pmc->mca_crcount--;
                        if (pmc->mca_crcount == 0) {
                                /* group count exhausted: drop sources whose sf_crcount is 0 */
                                mld_clear_zeros(&pmc->mca_tomb, idev);
                                mld_clear_zeros(&pmc->mca_sources, idev);
                        }
                }
                if (pmc->mca_crcount == 0 &&
                    !rcu_access_pointer(pmc->mca_tomb) &&
                    !rcu_access_pointer(pmc->mca_sources)) {
                        /* nothing left to report: unlink and release this tomb entry */
                        if (pmc_prev)
                                rcu_assign_pointer(pmc_prev->next, pmc_next);
                        else
                                rcu_assign_pointer(idev->mc_tomb, pmc_next);
                        in6_dev_put(pmc->idev);
                        kfree_rcu(pmc, rcu);
                } else
                        pmc_prev = pmc;
        }

        /* change recs */
        for_each_mc_mclock(idev, pmc) {
                /* the record type for live sources and for deleted sources are opposites */
                if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
                        type = MLD2_BLOCK_OLD_SOURCES;
                        dtype = MLD2_ALLOW_NEW_SOURCES;
                } else {
                        type = MLD2_ALLOW_NEW_SOURCES;
                        dtype = MLD2_BLOCK_OLD_SOURCES;
                }
                skb = add_grec(skb, pmc, type, 0, 0, 0);
                skb = add_grec(skb, pmc, dtype, 0, 1, 0);        /* deleted sources */

                /* filter mode changes */
                if (pmc->mca_crcount) {
                        if (pmc->mca_sfmode == MCAST_EXCLUDE)
                                type = MLD2_CHANGE_TO_EXCLUDE;
                        else
                                type = MLD2_CHANGE_TO_INCLUDE;
                        skb = add_grec(skb, pmc, type, 0, 0, 0);
                        pmc->mca_crcount--;
                }
        }
        if (!skb)
                return;
        /* the result of mld_sendpack() is deliberately discarded */
        (void) mld_sendpack(skb);
}

/*
 * Build and transmit a single mld_msg of ICMPv6 type @type for group @addr
 * on @dev.
 *
 * ICMPV6_MGM_REDUCTION messages are addressed to the link-local all-routers
 * group; every other type is addressed to @addr itself.  The packet carries
 * a hop-by-hop header holding a router-alert option (see ra[] below).
 * Failures are only reflected in the MIB counters; nothing is returned.
 */
static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
{
        const struct in6_addr *snd_addr, *saddr;
        int err, len, payload_len, full_len;
        struct in6_addr addr_buf;
        struct inet6_dev *idev;
        struct sk_buff *skb;
        struct mld_msg *hdr;
        int hlen = LL_RESERVED_SPACE(dev);
        int tlen = dev->needed_tailroom;
        /* 8-byte extension header: next header, length 0, router alert, PadN */
        u8 ra[8] = { IPPROTO_ICMPV6, 0,
                     IPV6_TLV_ROUTERALERT, 2, 0, 0,
                     IPV6_TLV_PADN, 0 };
        struct dst_entry *dst;
        struct flowi6 fl6;
        struct net *net;
        struct sock *sk;

        if (type == ICMPV6_MGM_REDUCTION)
                snd_addr = &in6addr_linklocal_allrouters;
        else
                snd_addr = addr;

        /* len: ICMPv6 part only; payload_len adds ra[]; full_len adds the IPv6 header */
        len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr);
        payload_len = len + sizeof(ra);
        full_len = sizeof(struct ipv6hdr) + payload_len;

        /* the GFP_KERNEL allocation is done before entering the RCU read section */
        skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL);

        rcu_read_lock();

        net = dev_net_rcu(dev);
        idev = __in6_dev_get(dev);
        /* the request is counted even when the allocation above failed */
        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
        if (!skb) {
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
                rcu_read_unlock();
                return;
        }
        sk = net->ipv6.igmp_sk;
        skb_set_owner_w(skb, sk);

        skb->priority = TC_PRIO_CONTROL;
        skb_reserve(skb, hlen);

        if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
                /* <draft-ietf-magma-mld-source-05.txt>:
                 * use unspecified address as the source address
                 * when a valid link-local address is not available.
                 */
                saddr = &in6addr_any;
        } else
                saddr = &addr_buf;

        ip6_mc_hdr(sk, skb, dev, saddr, snd_addr, NEXTHDR_HOP, payload_len);

        skb_put_data(skb, ra, sizeof(ra));

        hdr = skb_put_zero(skb, sizeof(struct mld_msg));
        hdr->mld_type = type;
        hdr->mld_mca = *addr;

        /* checksum covers the len bytes starting at hdr plus the pseudo-header */
        hdr->mld_cksum = csum_ipv6_magic(saddr, snd_addr, len,
                                         IPPROTO_ICMPV6,
                                         csum_partial(hdr, len, 0));

        icmpv6_flow_init(sk, &fl6, type,
                         &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
                         skb->dev->ifindex);
        dst = icmp6_dst_alloc(skb->dev, &fl6);
        if (IS_ERR(dst)) {
                err = PTR_ERR(dst);
                goto err_out;
        }

        skb_dst_set(skb, dst);
        err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                      net, sk, skb, NULL, skb->dev,
                      dst_output);
out:
        /* shared exit: account success or discard, then leave the RCU section */
        if (!err) {
                ICMP6MSGOUT_INC_STATS(net, idev, type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        } else
                IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);

        rcu_read_unlock();
        return;

err_out:
        /* skb was never handed to the output path, so it is freed here */
        kfree_skb(skb);
        goto out;
}

/*
 * Send one change record per group on @idev: CHANGE_TO_EXCLUDE for groups
 * with a non-zero exclude count, ALLOW_NEW_SOURCES for the others.
 * Does nothing while the interface is in MLDv1 mode.
 * called with mc_lock
 */
static void mld_send_initial_cr(struct inet6_dev *idev)
{
        struct sk_buff *skb = NULL;
        struct ifmcaddr6 *pmc;

        if (mld_in_v1_mode(idev))
                return;

        for_each_mc_mclock(idev, pmc) {
                int type = pmc->mca_sfcount[MCAST_EXCLUDE] ?
                        MLD2_CHANGE_TO_EXCLUDE : MLD2_ALLOW_NEW_SOURCES;

                skb = add_grec(skb, pmc, type, 0, 0, 1);
        }

        if (!skb)
                return;
        mld_sendpack(skb);
}

/*
 * Reload mc_dad_count from mc_qrv and, if it is non-zero, send the first
 * initial change report right away.  When further reports remain, the DAD
 * work is started with unsolicited_report_interval() as delay.
 * Takes and releases idev->mc_lock.
 */
void ipv6_mc_dad_complete(struct inet6_dev *idev)
{
        mutex_lock(&idev->mc_lock);

        idev->mc_dad_count = idev->mc_qrv;
        if (!idev->mc_dad_count)
                goto unlock;

        mld_send_initial_cr(idev);
        idev->mc_dad_count--;
        if (idev->mc_dad_count)
                mld_dad_start_work(idev, unsolicited_report_interval(idev));

unlock:
        mutex_unlock(&idev->mc_lock);
}

/*
 * Delayed-work handler for idev->mc_dad_work: send one more initial change
 * report, consume one unit of mc_dad_count and re-arm the work while the
 * count is still non-zero.  Drops one reference on the inet6_dev when done
 * (presumably the one taken when the work was queued -- confirm against
 * mld_dad_start_work()).
 */
static void mld_dad_work(struct work_struct *work)
{
        struct inet6_dev *idev;

        idev = container_of(to_delayed_work(work), struct inet6_dev, mc_dad_work);

        mutex_lock(&idev->mc_lock);

        /* the report is sent regardless of the current count */
        mld_send_initial_cr(idev);
        if (!idev->mc_dad_count)
                goto unlock;

        idev->mc_dad_count--;
        if (idev->mc_dad_count)
                mld_dad_start_work(idev, unsolicited_report_interval(idev));

unlock:
        mutex_unlock(&idev->mc_lock);
        in6_dev_put(idev);
}

/*
 * Drop one @sfmode reference on source @psfsrc of group @pmc.
 *
 * Returns -ESRCH when the source is not on the list or its @sfmode count
 * is already zero, 1 when the source lost its last reference and was moved
 * to pmc->mca_tomb so that a change record can be sent for it, and 0
 * otherwise (including the case where the source was simply freed).
 * called with mc_lock
 */
static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
        const struct in6_addr *psfsrc)
{
        struct ip6_sf_list *prev = NULL;
        struct ip6_sf_list *psf, *next;
        struct inet6_dev *idev;

        for_each_psf_mclock(pmc, psf) {
                if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
                        break;
                prev = psf;
        }

        /* source filter not found, or count wrong =>  bug */
        if (!psf || psf->sf_count[sfmode] == 0)
                return -ESRCH;

        WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1);
        if (psf->sf_count[MCAST_INCLUDE] || psf->sf_count[MCAST_EXCLUDE])
                return 0;

        /* no more filters for this source: take it off the active list */
        idev = pmc->idev;
        next = mc_dereference(psf->sf_next, idev);
        if (prev)
                rcu_assign_pointer(prev->sf_next, next);
        else
                rcu_assign_pointer(pmc->mca_sources, next);

        if (!psf->sf_oldin || (pmc->mca_flags & MAF_NOREPORT) ||
            mld_in_v1_mode(idev)) {
                /* nothing to report for it */
                kfree_rcu(psf, rcu);
                return 0;
        }

        /* park it at the head of the tomb list until it has been reported */
        psf->sf_crcount = idev->mc_qrv;
        rcu_assign_pointer(psf->sf_next,
                           mc_dereference(pmc->mca_tomb, idev));
        rcu_assign_pointer(pmc->mca_tomb, psf);
        return 1;
}

/*
 * Remove multicast source filters for group @pmca from the interface list.
 *
 * @idev:    interface; -ENODEV is returned when it is NULL
 * @pmca:    group address; -ESRCH is returned when it is not on @idev
 * @sfmode:  filter mode (index into the sf_count/mca_sfcount arrays)
 * @sfcount: number of entries in @psfsrc
 * @psfsrc:  sources to drop one @sfmode reference from
 * @delta:   when zero, the group's own @sfmode count is decremented as
 *           well; -EINVAL is returned if that count is already zero
 *
 * All @sfcount sources are processed even if some fail; the first negative
 * result of ip6_mc_del1_src() is returned, 0 otherwise.  An interface change
 * event is raised when the filter mode flips to INCLUDE or when any source
 * state changed.
 * called with mc_lock
 */
static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
                          int sfmode, int sfcount, const struct in6_addr *psfsrc,
                          int delta)
{
        struct ifmcaddr6 *pmc;
        int        changerec = 0;
        int        i, err;

        if (!idev)
                return -ENODEV;

        for_each_mc_mclock(idev, pmc) {
                if (ipv6_addr_equal(pmca, &pmc->mca_addr))
                        break;
        }
        if (!pmc)
                return -ESRCH;

        /* snapshot the per-source state before anything is modified */
        sf_markstate(pmc);
        if (!delta) {
                if (!pmc->mca_sfcount[sfmode])
                        return -EINVAL;

                /*
                 * mca_sfcount[] is read without mc_lock (ip6_mc_clear_src()
                 * documents the READ_ONCE() in ipv6_chk_mcast_addr()), and
                 * the other writers in this file use WRITE_ONCE(); mark this
                 * store the same way.
                 */
                WRITE_ONCE(pmc->mca_sfcount[sfmode],
                           pmc->mca_sfcount[sfmode] - 1);
        }
        err = 0;
        for (i = 0; i < sfcount; i++) {
                int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]);

                /* rv > 0: the source was moved to the tomb list */
                changerec |= rv > 0;
                /* remember only the first failure, keep deleting the rest */
                if (!err && rv < 0)
                        err = rv;
        }
        if (pmc->mca_sfmode == MCAST_EXCLUDE &&
            pmc->mca_sfcount[MCAST_EXCLUDE] == 0 &&
            pmc->mca_sfcount[MCAST_INCLUDE]) {
                struct ip6_sf_list *psf;

                /* filter mode change */
                pmc->mca_sfmode = MCAST_INCLUDE;
                pmc->mca_crcount = idev->mc_qrv;
                idev->mc_ifc_count = pmc->mca_crcount;
                for_each_psf_mclock(pmc, psf)
                        psf->sf_crcount = 0;
                mld_ifc_event(pmc->idev);
        } else if (sf_setstate(pmc) || changerec) {
                mld_ifc_event(pmc->idev);
        }

        return err;
}

/*
 * Add multicast single-source filter to the interface list
 * called with mc_lock
 */
static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
        const struct in6_addr *psfsrc)
{
        struct ip6_sf_list *psf, *psf_prev;

        psf_prev = NULL;
        for_each_psf_mclock(pmc, psf) {
                if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
                        break;
                psf_prev = psf;
        }
        if (!psf) {
                psf = kzalloc(sizeof(*psf), GFP_KERNEL);
                if (!psf)
                        return -ENOBUFS;

                psf->sf_addr = *psfsrc;
                if (psf_prev) {
                        rcu_assign_pointer(psf_prev->sf_next, psf);
                } else {
                        rcu_assign_pointer(pmc->mca_sources, psf);
                }
        }
        WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1);
        return 0;
}

/*
 * Record in sf_oldin, for every source of @pmc, whether the source counts
 * as "in" under the current counters:
 *  - group has exclude references: the source is excluded by all of them
 *    and has no include reference;
 *  - otherwise: the source has at least one include reference.
 * called with mc_lock
 */
static void sf_markstate(struct ifmcaddr6 *pmc)
{
        int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
        struct ip6_sf_list *psf;

        for_each_psf_mclock(pmc, psf) {
                if (!pmc->mca_sfcount[MCAST_EXCLUDE]) {
                        psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
                        continue;
                }
                psf->sf_oldin = !psf->sf_count[MCAST_INCLUDE] &&
                        mca_xcount == psf->sf_count[MCAST_EXCLUDE];
        }
}

/*
 * Compare every source's current "in" state with the sf_oldin value saved
 * by sf_markstate() and queue the matching change records:
 *  - newly in:     drop a pending tomb entry for the same address, if any,
 *                  and arm the source's sf_crcount with mc_qrv;
 *  - no longer in: clear the source's sf_crcount and make sure a tomb
 *                  entry with sf_crcount = mc_qrv exists for the address.
 * Returns the number of sources for which a record was queued.
 * called with mc_lock
 */
static int sf_setstate(struct ifmcaddr6 *pmc)
{
        struct ip6_sf_list *psf, *dpsf;
        int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
        int qrv = pmc->idev->mc_qrv;
        int new_in, rv;

        rv = 0;
        for_each_psf_mclock(pmc, psf) {
                /* same "in" test as sf_markstate(), on the updated counters */
                if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
                        new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
                                !psf->sf_count[MCAST_INCLUDE];
                } else
                        new_in = psf->sf_count[MCAST_INCLUDE] != 0;
                if (new_in) {
                        if (!psf->sf_oldin) {
                                struct ip6_sf_list *prev = NULL;

                                /* look for a tomb entry with the same address */
                                for_each_psf_tomb(pmc, dpsf) {
                                        if (ipv6_addr_equal(&dpsf->sf_addr,
                                            &psf->sf_addr))
                                                break;
                                        prev = dpsf;
                                }
                                if (dpsf) {
                                        /* unlink and free the now stale tomb entry */
                                        if (prev)
                                                rcu_assign_pointer(prev->sf_next,
                                                                   mc_dereference(dpsf->sf_next,
                                                                                  pmc->idev));
                                        else
                                                rcu_assign_pointer(pmc->mca_tomb,
                                                                   mc_dereference(dpsf->sf_next,
                                                                                  pmc->idev));
                                        kfree_rcu(dpsf, rcu);
                                }
                                psf->sf_crcount = qrv;
                                rv++;
                        }
                } else if (psf->sf_oldin) {
                        psf->sf_crcount = 0;
                        /*
                         * add or update "delete" records if an active filter
                         * is now inactive
                         */

                        for_each_psf_tomb(pmc, dpsf)
                                if (ipv6_addr_equal(&dpsf->sf_addr,
                                    &psf->sf_addr))
                                        break;
                        if (!dpsf) {
                                dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL);
                                /* allocation failure: this source is skipped and not counted */
                                if (!dpsf)
                                        continue;
                                /* copy the source, then push the copy on the tomb list head */
                                *dpsf = *psf;
                                rcu_assign_pointer(dpsf->sf_next,
                                                   mc_dereference(pmc->mca_tomb, pmc->idev));
                                rcu_assign_pointer(pmc->mca_tomb, dpsf);
                        }
                        dpsf->sf_crcount = qrv;
                        rv++;
                }
        }
        return rv;
}

/*
 * Add multicast source filter list to the interface list
 *
 * @idev:    interface; -ENODEV is returned when it is NULL
 * @pmca:    group address; -ESRCH is returned when it is not on @idev
 * @sfmode:  filter mode (index into the sf_count/mca_sfcount arrays)
 * @sfcount: number of entries in @psfsrc
 * @psfsrc:  sources to take one @sfmode reference on
 * @delta:   when zero, the group's own @sfmode count is incremented too
 *
 * Returns 0 on success.  If adding a source fails, the sources added so far
 * and the group count are rolled back and that error is returned.
 * called with mc_lock
 */
static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
                          int sfmode, int sfcount, const struct in6_addr *psfsrc,
                          int delta)
{
        struct ifmcaddr6 *pmc;
        int        isexclude;
        int        i, err;

        if (!idev)
                return -ENODEV;

        for_each_mc_mclock(idev, pmc) {
                if (ipv6_addr_equal(pmca, &pmc->mca_addr))
                        break;
        }
        if (!pmc)
                return -ESRCH;

        /* snapshot the per-source state and the mode before modifying anything */
        sf_markstate(pmc);
        isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
        if (!delta)
                WRITE_ONCE(pmc->mca_sfcount[sfmode],
                           pmc->mca_sfcount[sfmode] + 1);
        err = 0;
        for (i = 0; i < sfcount; i++) {
                err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]);
                if (err)
                        break;
        }
        if (err) {
                int j;

                /* undo the group count and the i sources that were added */
                if (!delta)
                        WRITE_ONCE(pmc->mca_sfcount[sfmode],
                                   pmc->mca_sfcount[sfmode] - 1);
                for (j = 0; j < i; j++)
                        ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
        } else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
                struct ip6_sf_list *psf;

                /* filter mode change */
                if (pmc->mca_sfcount[MCAST_EXCLUDE])
                        pmc->mca_sfmode = MCAST_EXCLUDE;
                else if (pmc->mca_sfcount[MCAST_INCLUDE])
                        pmc->mca_sfmode = MCAST_INCLUDE;
                /* else no filters; keep old mode for reports */

                pmc->mca_crcount = idev->mc_qrv;
                idev->mc_ifc_count = pmc->mca_crcount;
                /* per-source counters are cleared on a mode change */
                for_each_psf_mclock(pmc, psf)
                        psf->sf_crcount = 0;
                mld_ifc_event(idev);
        } else if (sf_setstate(pmc)) {
                /* same mode, but at least one source changed state */
                mld_ifc_event(idev);
        }
        return err;
}

/*
 * Free (via kfree_rcu) every entry on both source lists of @pmc, the tomb
 * list first, and reset the group to EXCLUDE mode with an include count
 * of 0 and an exclude count of 1.
 * called with mc_lock
 */
static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
{
        struct ip6_sf_list *cur, *following;

        /* deleted sources */
        cur = mc_dereference(pmc->mca_tomb, pmc->idev);
        while (cur) {
                following = mc_dereference(cur->sf_next, pmc->idev);
                kfree_rcu(cur, rcu);
                cur = following;
        }
        RCU_INIT_POINTER(pmc->mca_tomb, NULL);

        /* active sources */
        cur = mc_dereference(pmc->mca_sources, pmc->idev);
        while (cur) {
                following = mc_dereference(cur->sf_next, pmc->idev);
                kfree_rcu(cur, rcu);
                cur = following;
        }
        RCU_INIT_POINTER(pmc->mca_sources, NULL);

        pmc->mca_sfmode = MCAST_EXCLUDE;
        pmc->mca_sfcount[MCAST_INCLUDE] = 0;
        /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */
        WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1);
}

/*
 * Send an ICMPV6_MGM_REPORT for @ma immediately and (re)schedule its
 * per-group work for a follow-up.  Groups flagged MAF_NOREPORT are skipped.
 * called with mc_lock
 */
static void igmp6_join_group(struct ifmcaddr6 *ma)
{
        unsigned long delay;

        if (ma->mca_flags & MAF_NOREPORT)
                return;

        igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);

        /* default: a random delay below the unsolicited report interval */
        delay = get_random_u32_below(unsolicited_report_interval(ma->idev));

        /*
         * If the work was still pending, give back the reference it held
         * and keep its remaining time as the delay instead.
         */
        if (cancel_delayed_work(&ma->mca_work)) {
                refcount_dec(&ma->mca_refcnt);
                delay = ma->mca_work.timer.expires - jiffies;
        }

        /* take a reference for the queued work unless it was already pending */
        if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
                refcount_inc(&ma->mca_refcnt);
        ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
}

/*
 * Remove the source filter state that socket membership @iml contributed
 * to the interface and release the membership's own filter list.
 *
 * idev->mc_lock is taken around the update when @idev is non-NULL; with a
 * NULL @idev ip6_mc_del_src() reports -ENODEV, but the socket-side list is
 * detached and freed all the same.  Returns the ip6_mc_del_src() result.
 */
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
                            struct inet6_dev *idev)
{
        struct ip6_sf_socklist *psl = sock_dereference(iml->sflist, sk);
        int err;

        if (idev)
                mutex_lock(&idev->mc_lock);

        if (psl) {
                err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
                                     psl->sl_count, psl->sl_addr, 0);
                /* detach the list from the socket and give back its memory charge */
                RCU_INIT_POINTER(iml->sflist, NULL);
                atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
                           &sk->sk_omem_alloc);
                kfree_rcu(psl, rcu);
        } else {
                /* any-source empty exclude case */
                err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
        }

        if (idev)
                mutex_unlock(&idev->mc_lock);

        return err;
}

/*
 * Announce that group @ma is being left.
 *
 * Outside MLDv1 mode a delete record is queued and an interface change
 * event is raised.  In MLDv1 mode an ICMPV6_MGM_REDUCTION message is sent,
 * but only if MAF_LAST_REPORTER is set on the group.
 * called with mc_lock
 */
static void igmp6_leave_group(struct ifmcaddr6 *ma)
{
        if (!mld_in_v1_mode(ma->idev)) {
                mld_add_delrec(ma->idev, ma);
                mld_ifc_event(ma->idev);
                return;
        }

        if (ma->mca_flags & MAF_LAST_REPORTER)
                igmp6_send(&ma->mca_addr, ma->idev->dev,
                           ICMPV6_MGM_REDUCTION);
}

/*
 * Delayed-work handler for idev->mc_gq_work: send a report covering the
 * whole interface, clear mc_gq_running and drop one reference on the
 * inet6_dev.
 */
static void mld_gq_work(struct work_struct *work)
{
        struct inet6_dev *idev;

        idev = container_of(to_delayed_work(work), struct inet6_dev, mc_gq_work);

        mutex_lock(&idev->mc_lock);
        mld_send_report(idev, NULL);
        idev->mc_gq_running = 0;
        mutex_unlock(&idev->mc_lock);

        in6_dev_put(idev);
}

/*
 * Delayed-work handler for idev->mc_ifc_work: send the pending change
 * records, consume one unit of mc_ifc_count and re-arm the work while the
 * count is still non-zero.  Drops one reference on the inet6_dev when done.
 */
static void mld_ifc_work(struct work_struct *work)
{
        struct inet6_dev *idev;

        idev = container_of(to_delayed_work(work), struct inet6_dev, mc_ifc_work);

        mutex_lock(&idev->mc_lock);

        mld_send_cr(idev);
        if (!idev->mc_ifc_count)
                goto unlock;

        idev->mc_ifc_count--;
        if (idev->mc_ifc_count)
                mld_ifc_start_work(idev, unsolicited_report_interval(idev));

unlock:
        mutex_unlock(&idev->mc_lock);
        in6_dev_put(idev);
}

/*
 * Note an interface state change: reload mc_ifc_count from mc_qrv and
 * start the interface-change work with a delay of 1.  Ignored while the
 * interface is in MLDv1 mode.
 * called with mc_lock
 */
static void mld_ifc_event(struct inet6_dev *idev)
{
        if (!mld_in_v1_mode(idev)) {
                idev->mc_ifc_count = idev->mc_qrv;
                mld_ifc_start_work(idev, 1);
        }
}

static void mld_mca_work(struct work_struct *work)
{
        struct ifmcaddr6 *ma = container_of(to_delayed_work(work),
                                            struct ifmcaddr6, mca_work);

        mutex_lock(&ma->idev->mc_lock);
        if (mld_in_v1_mode(ma->idev))
                igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
        else
                mld_send_report(ma->idev, ma);
        ma->mca_flags |=  MAF_LAST_REPORTER;
        ma->mca_flags &= ~MAF_TIMER_RUNNING;
        mutex_unlock(&ma->idev->mc_lock);

        ma_put(ma);
}

/* Device changing type */

void ipv6_mc_unmap(struct inet6_dev *idev)
{
        struct ifmcaddr6 *i;

        /* Install multicast list, except for all-nodes (already installed) */

        mutex_lock(&idev->mc_lock);
        for_each_mc_mclock(idev, i)
                igmp6_group_dropped(i);
        mutex_unlock(&idev->mc_lock);
}

/*
 * Simply delegates to ipv6_mc_up().
 * NOTE(review): presumably the counterpart of ipv6_mc_unmap() after a
 * device type change -- confirm against the callers.
 */
void ipv6_mc_remap(struct inet6_dev *idev)
{
        ipv6_mc_up(idev);
}

/*
 * Device going down: drop every group on the multicast list, then stop
 * the query, report, interface-change, general-query and DAD works.
 */
void ipv6_mc_down(struct inet6_dev *idev)
{
        struct ifmcaddr6 *i;

        mutex_lock(&idev->mc_lock);
        /* Withdraw multicast list */
        for_each_mc_mclock(idev, i)
                igmp6_group_dropped(i);
        mutex_unlock(&idev->mc_lock);

        /* The works must be stopped only after the groups were dropped,
         * otherwise mld_ifc_event() would start work again.
         */
        mld_query_stop_work(idev);
        mld_report_stop_work(idev);

        /* only these two stop helpers are called with mc_lock held */
        mutex_lock(&idev->mc_lock);
        mld_ifc_stop_work(idev);
        mld_gq_stop_work(idev);
        mutex_unlock(&idev->mc_lock);

        mld_dad_stop_work(idev);
}

/*
 * Reset the per-interface MLD parameters: mc_qrv from sysctl_mld_qrv,
 * mc_qi and mc_qri from their MLD_*_DEFAULT constants, mc_v1_seen to 0 and
 * mc_maxdelay from unsolicited_report_interval().
 */
static void ipv6_mc_reset(struct inet6_dev *idev)
{
        idev->mc_qrv = sysctl_mld_qrv;
        idev->mc_qi = MLD_QI_DEFAULT;
        idev->mc_qri = MLD_QRI_DEFAULT;
        idev->mc_v1_seen = 0;
        /* NOTE(review): the helper may read fields set above -- keep this last */
        idev->mc_maxdelay = unsolicited_report_interval(idev);
}

/* Device going up */

/*
 * Reset the interface's MLD parameters, then, for every group on the
 * multicast list, call mld_del_delrec() followed by igmp6_group_added().
 * The reset is done before idev->mc_lock is taken.
 */
void ipv6_mc_up(struct inet6_dev *idev)
{
        struct ifmcaddr6 *mc;

        ipv6_mc_reset(idev);

        /* Install multicast list, except for all-nodes (already installed) */
        mutex_lock(&idev->mc_lock);
        for_each_mc_mclock(idev, mc) {
                mld_del_delrec(idev, mc);
                igmp6_group_added(mc);
        }
        mutex_unlock(&idev->mc_lock);
}

/* IPv6 device initialization. */

/*
 * Initialise the multicast part of a new inet6_dev: the five delayed
 * works and their handlers, the empty tomb list, the query/report packet
 * queues with their spinlocks, mc_lock, and finally the MLD parameters via
 * ipv6_mc_reset().
 */
void ipv6_mc_init_dev(struct inet6_dev *idev)
{
        idev->mc_gq_running = 0;
        INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work);
        RCU_INIT_POINTER(idev->mc_tomb, NULL);
        idev->mc_ifc_count = 0;
        INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work);
        INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work);
        INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work);
        INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work);
        /* queues for received query / report packets, one spinlock each */
        skb_queue_head_init(&idev->mc_query_queue);
        skb_queue_head_init(&idev->mc_report_queue);
        spin_lock_init(&idev->mc_query_lock);
        spin_lock_init(&idev->mc_report_lock);
        mutex_init(&idev->mc_lock);
        ipv6_mc_reset(idev);
}

/*
 *        Device is about to be destroyed: clean up.
 *
 *        Stops all multicast works, discards pending delete records and
 *        queued query/report packets, leaves the groups joined on behalf
 *        of the device itself and finally releases whatever is left on the
 *        multicast list.
 */

void ipv6_mc_destroy_dev(struct inet6_dev *idev)
{
        struct ifmcaddr6 *i;

        /* Deactivate works */
        ipv6_mc_down(idev);
        mutex_lock(&idev->mc_lock);
        mld_clear_delrec(idev);
        mutex_unlock(&idev->mc_lock);
        /* these two are called without mc_lock held */
        mld_clear_query(idev);
        mld_clear_report(idev);

        /* Delete all-nodes address. */
        /* We cannot call ipv6_dev_mc_dec() directly, our caller in
         * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will
         * fail.
         */
        __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allnodes);

        /* all-routers is only left when forwarding is enabled */
        if (idev->cnf.forwarding)
                __ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);

        mutex_lock(&idev->mc_lock);
        /* pop entries off the list head one by one until the list is empty */
        while ((i = mc_dereference(idev->mc_list, idev))) {
                rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev));

                ip6_mc_clear_src(i);
                ma_put(i);
        }
        mutex_unlock(&idev->mc_lock);
}

/*
 * Re-announce all memberships of @idev: one igmp6_join_group() per group
 * in MLDv1 mode, a single interface-wide mld_send_report() otherwise.
 * Asserts that RTNL is held; takes and releases idev->mc_lock.
 */
static void ipv6_mc_rejoin_groups(struct inet6_dev *idev)
{
        struct ifmcaddr6 *pmc;

        ASSERT_RTNL();

        mutex_lock(&idev->mc_lock);
        if (!mld_in_v1_mode(idev)) {
                mld_send_report(idev, NULL);
                goto unlock;
        }

        for_each_mc_mclock(idev, pmc)
                igmp6_join_group(pmc);

unlock:
        mutex_unlock(&idev->mc_lock);
}

/*
 * Netdevice notifier callback.  Only NETDEV_RESEND_IGMP is acted upon: the
 * device's groups are re-announced if it has an inet6_dev.  Every other
 * event is ignored.  Always returns NOTIFY_DONE.
 */
static int ipv6_mc_netdev_event(struct notifier_block *this,
                                unsigned long event,
                                void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct inet6_dev *idev = __in6_dev_get(dev);

        if (event == NETDEV_RESEND_IGMP && idev)
                ipv6_mc_rejoin_groups(idev);

        return NOTIFY_DONE;
}

/* Notifier block whose callback is ipv6_mc_netdev_event(). */
static struct notifier_block igmp6_netdev_notifier = {
        .notifier_call = ipv6_mc_netdev_event,
};

#ifdef CONFIG_PROC_FS
/* Iterator state kept in seq->private while walking the group list. */
struct igmp6_mc_iter_state {
        struct seq_net_private p;        /* NOTE(review): presumably must stay first -- confirm */
        struct net_device *dev;                /* device currently being walked */
        struct inet6_dev *idev;                /* its inet6_dev, NULL when the walk is over */
};

/* Fetch the iterator state stored in a seq_file's private pointer. */
#define igmp6_mc_seq_private(seq)        ((struct igmp6_mc_iter_state *)(seq)->private)

/*
 * Position the iterator on the first group of the first device in the
 * seq_file's namespace that has an inet6_dev with a non-empty multicast
 * list.  Returns that group, or NULL (with state->idev cleared) when no
 * device has one.  Uses RCU list accessors.
 */
static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
{
        struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
        struct net *net = seq_file_net(seq);
        struct inet6_dev *idev;
        struct ifmcaddr6 *im;

        state->idev = NULL;
        for_each_netdev_rcu(net, state->dev) {
                idev = __in6_dev_get(state->dev);
                if (!idev)
                        continue;

                im = rcu_dereference(idev->mc_list);
                if (!im)
                        continue;

                state->idev = idev;
                return im;
        }
        return NULL;
}

/*
 * Advance from group @im to the next one, moving on to the following
 * devices when the current list is exhausted.  Devices without an
 * inet6_dev or with an empty list are skipped.  Returns NULL, with
 * state->idev cleared, once the last device has been passed.
 */
static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im)
{
        struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
        struct ifmcaddr6 *next = rcu_dereference(im->next);

        while (!next) {
                state->dev = next_net_device_rcu(state->dev);
                if (!state->dev) {
                        state->idev = NULL;
                        return NULL;
                }

                state->idev = __in6_dev_get(state->dev);
                if (state->idev)
                        next = rcu_dereference(state->idev->mc_list);
        }
        return next;
}

/*
 * Return the group at zero-based position @pos of the overall walk, or
 * NULL when there are fewer than @pos + 1 groups.
 */
static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos)
{
        struct ifmcaddr6 *im = igmp6_mc_get_first(seq);

        /* step forward until pos entries were skipped or the walk ends */
        while (im && pos) {
                im = igmp6_mc_get_next(seq, im);
                if (im)
                        pos--;
        }

        return pos ? NULL : im;
}

/*
 * seq_file ->start: take the RCU read lock (released in
 * igmp6_mc_seq_stop()) and return the group at position *pos, or NULL.
 */
static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        rcu_read_lock();
        return igmp6_mc_get_idx(seq, *pos);
}

/*
 * seq_file ->next: step to the group following @v and bump *pos.  The
 * position is incremented even when the end of the walk is reached.
 */
static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ifmcaddr6 *following;

        following = igmp6_mc_get_next(seq, v);
        (*pos)++;

        return following;
}

/*
 * seq_file ->stop: forget the iterator's device pointers and release the
 * RCU read lock taken in igmp6_mc_seq_start().
 */
static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);

        /* idev is already NULL when the walk ran to its end */
        if (likely(state->idev))
                state->idev = NULL;
        state->dev = NULL;
        rcu_read_unlock();
}

/*
 * seq_file ->show: print one line for group @v -- interface index and
 * name, group address, user count, flags, and the time left on the group's
 * work (via jiffies_to_clock_t()), or 0 when MAF_TIMER_RUNNING is not set.
 */
static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
{
        struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
        struct ifmcaddr6 *im = v;
        long remaining = 0;

        if (im->mca_flags & MAF_TIMER_RUNNING)
                remaining = jiffies_to_clock_t(im->mca_work.timer.expires - jiffies);

        seq_printf(seq,
                   "%-4d %-15s %pi6 %5d %08X %ld\n",
                   state->dev->ifindex, state->dev->name,
                   &im->mca_addr,
                   im->mca_users, im->mca_flags,
                   remaining);
        return 0;
}

/* seq_file operations for the group listing; start/stop bracket an RCU read section. */
static const struct seq_operations igmp6_mc_seq_ops = {
        .start        =        igmp6_mc_seq_start,
        .next        =        igmp6_mc_seq_next,
        .stop        =        igmp6_mc_seq_stop,
        .show        =        igmp6_mc_seq_show,
};

/*
 * Per-open iterator state for the "mcfilter6" proc entry (its size is
 * handed to proc_create_net() in igmp6_proc_init()).
 */
struct igmp6_mcf_iter_state {
        /* NOTE(review): presumably must stay the first member for the seq_net helpers - confirm */
        struct seq_net_private p;
        /* Device currently being walked. */
        struct net_device *dev;
        /* IPv6 state of @dev, NULL when the walk is not positioned on a device. */
        struct inet6_dev *idev;
        /* Multicast address whose source list is currently being walked. */
        struct ifmcaddr6 *im;
};

/* Fetch the mcfilter6 iterator state stored in a seq_file's private pointer. */
#define igmp6_mcf_seq_private(seq)        ((struct igmp6_mcf_iter_state *)(seq)->private)

/*
 * Find the first source-filter entry to report in the mcfilter6 listing.
 *
 * Walks the devices of the seq_file's network namespace and, for each
 * device that has IPv6 state, walks its multicast address list looking for
 * the first address with a non-empty source list.  On success state->dev,
 * state->idev and state->im record where the entry lives so that
 * igmp6_mcf_get_next() can continue from there.
 *
 * Every address of a device is examined, not only the head of mc_list: a
 * source-less address at the head must not hide filtered addresses queued
 * behind it, which is also how igmp6_mcf_get_next() treats source-less
 * addresses.
 *
 * Called with the RCU read lock held (taken in igmp6_mcf_seq_start()).
 *
 * Returns the first ip6_sf_list found, or NULL when no address on any
 * device has sources; state->idev and state->im are NULL in that case.
 */
static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
{
        struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
        struct net *net = seq_file_net(seq);

        state->idev = NULL;
        state->im = NULL;
        for_each_netdev_rcu(net, state->dev) {
                struct inet6_dev *idev;
                struct ifmcaddr6 *im;

                idev = __in6_dev_get(state->dev);
                if (unlikely(idev == NULL))
                        continue;

                /* Scan the whole per-device list, skipping source-less addresses. */
                for (im = rcu_dereference(idev->mc_list); im;
                     im = rcu_dereference(im->next)) {
                        struct ip6_sf_list *psf;

                        psf = rcu_dereference(im->mca_sources);
                        if (psf) {
                                state->im = im;
                                state->idev = idev;
                                return psf;
                        }
                }
        }
        return NULL;
}

/*
 * Advance the mcfilter6 iterator past @psf.
 *
 * Tries, in order: the next source on the current address, the source
 * list of the next address on the current device, and the address list of
 * the following devices.  state->im, state->idev and state->dev are
 * updated as the walk moves on.
 *
 * Returns the next ip6_sf_list, or NULL once the last device has been
 * passed (state->dev and state->idev are NULL then).
 */
static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf)
{
        struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);

        /* Common case: another source on the current address. */
        psf = rcu_dereference(psf->sf_next);
        while (!psf) {
                /* Source list exhausted: step to the next address of this device. */
                state->im = rcu_dereference(state->im->next);
                while (!state->im) {
                        /* Address list exhausted as well: step to the next device. */
                        state->dev = next_net_device_rcu(state->dev);
                        if (!state->dev) {
                                /* No device left; psf is NULL here, which ends the walk. */
                                state->idev = NULL;
                                goto out;
                        }
                        state->idev = __in6_dev_get(state->dev);
                        /* No IPv6 state: state->im is still NULL, so try the next device. */
                        if (!state->idev)
                                continue;
                        state->im = rcu_dereference(state->idev->mc_list);
                }
                /* May be NULL, in which case the outer loop keeps searching. */
                psf = rcu_dereference(state->im->mca_sources);
        }
out:
        return psf;
}

/*
 * Position the mcfilter6 iterator on the source entry at index @pos.
 *
 * Starts at the first entry and steps forward @pos times.  Returns the
 * entry reached, or NULL if the walk runs out of entries before @pos
 * steps have been taken.
 */
static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos)
{
        struct ip6_sf_list *cur = igmp6_mcf_get_first(seq);

        while (cur && pos) {
                cur = igmp6_mcf_get_next(seq, cur);
                if (!cur)
                        return NULL;
                --pos;
        }

        return pos ? NULL : cur;
}

/*
 * seq_file ->start callback for the mcfilter6 listing.
 *
 * Enters an RCU read-side critical section (left again in
 * igmp6_mcf_seq_stop()).  Position 0 is the header row, represented by
 * SEQ_START_TOKEN; position N > 0 maps to source entry N - 1.
 */
static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        rcu_read_lock();

        if (!*pos)
                return SEQ_START_TOKEN;

        return igmp6_mcf_get_idx(seq, *pos - 1);
}

/*
 * seq_file ->next callback for the mcfilter6 listing.
 *
 * After the header token the first source entry follows; otherwise the
 * entry after @v.  *@pos is bumped by one in either case.  Returns NULL at
 * the end of the listing.
 */
static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ip6_sf_list *following = (v == SEQ_START_TOKEN) ?
                                        igmp6_mcf_get_first(seq) :
                                        igmp6_mcf_get_next(seq, v);

        (*pos)++;

        return following;
}

/*
 * seq_file ->stop callback for the mcfilter6 listing.
 *
 * Clears every pointer cached in the iterator state and leaves the RCU
 * read-side critical section entered by igmp6_mcf_seq_start().
 */
static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        struct igmp6_mcf_iter_state *iter = igmp6_mcf_seq_private(seq);

        iter->im = NULL;
        iter->idev = NULL;
        iter->dev = NULL;

        rcu_read_unlock();
}

/*
 * seq_file ->show callback for the mcfilter6 listing.
 *
 * Prints the column header for SEQ_START_TOKEN; for a source entry prints
 * interface index, interface name, the multicast address, the source
 * address and the include/exclude counters.  Always returns 0.
 */
static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
{
        struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
        struct ip6_sf_list *psf = v;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "Idx Device                Multicast Address                   Source Address    INC    EXC\n");
                return 0;
        }

        seq_printf(seq,
                   "%3d %6.6s %pi6 %pi6 %6lu %6lu\n",
                   state->dev->ifindex, state->dev->name,
                   &state->im->mca_addr,
                   &psf->sf_addr,
                   READ_ONCE(psf->sf_count[MCAST_INCLUDE]),
                   READ_ONCE(psf->sf_count[MCAST_EXCLUDE]));
        return 0;
}

/* seq_file callbacks backing the "mcfilter6" proc entry. */
static const struct seq_operations igmp6_mcf_seq_ops = {
        .start = igmp6_mcf_seq_start,
        .next  = igmp6_mcf_seq_next,
        .stop  = igmp6_mcf_seq_stop,
        .show  = igmp6_mcf_seq_show,
};

/*
 * Create the per-namespace "igmp6" and "mcfilter6" proc entries.
 *
 * Returns 0 on success or -ENOMEM if either entry cannot be created; when
 * the second creation fails, the already created "igmp6" entry is removed
 * again so nothing is left behind.
 */
static int __net_init igmp6_proc_init(struct net *net)
{
        if (!proc_create_net("igmp6", 0444, net->proc_net, &igmp6_mc_seq_ops,
                        sizeof(struct igmp6_mc_iter_state)))
                return -ENOMEM;

        if (!proc_create_net("mcfilter6", 0444, net->proc_net,
                        &igmp6_mcf_seq_ops,
                        sizeof(struct igmp6_mcf_iter_state))) {
                remove_proc_entry("igmp6", net->proc_net);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Remove the proc entries created by igmp6_proc_init(), in the reverse
 * order of their creation.
 */
static void __net_exit igmp6_proc_exit(struct net *net)
{
        remove_proc_entry("mcfilter6", net->proc_net);
        remove_proc_entry("igmp6", net->proc_net);
}
#else
/* Stub for the #else branch: creates no proc entries and reports success. */
static inline int igmp6_proc_init(struct net *net)
{
        return 0;
}
/* Stub for the #else branch: there are no proc entries to remove. */
static inline void igmp6_proc_exit(struct net *net)
{
}
#endif

/*
 * Per-namespace setup: create the two raw ICMPv6 control sockets
 * (net->ipv6.igmp_sk and net->ipv6.mc_autojoin_sk) and the proc entries.
 *
 * The igmp_sk socket gets a hop limit of 1 and GFP_KERNEL allocations.
 * On failure everything created so far is torn down again and the error
 * from the failing step is returned; returns 0 on success.
 */
static int __net_init igmp6_net_init(struct net *net)
{
        int err;

        err = inet_ctl_sock_create(&net->ipv6.igmp_sk, PF_INET6,
                                   SOCK_RAW, IPPROTO_ICMPV6, net);
        if (err < 0) {
                pr_err("Failed to initialize the IGMP6 control socket (err %d)\n",
                       err);
                return err;
        }

        inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
        net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL;

        err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6,
                                   SOCK_RAW, IPPROTO_ICMPV6, net);
        if (err < 0) {
                pr_err("Failed to initialize the IGMP6 autojoin socket (err %d)\n",
                       err);
                goto err_destroy_igmp_sk;
        }

        err = igmp6_proc_init(net);
        if (err)
                goto err_destroy_autojoin_sk;

        return 0;

err_destroy_autojoin_sk:
        inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk);
err_destroy_igmp_sk:
        inet_ctl_sock_destroy(net->ipv6.igmp_sk);
        return err;
}

/*
 * Per-namespace teardown: destroy both control sockets created by
 * igmp6_net_init() and remove the proc entries.
 */
static void __net_exit igmp6_net_exit(struct net *net)
{
        inet_ctl_sock_destroy(net->ipv6.igmp_sk);
        inet_ctl_sock_destroy(net->ipv6.mc_autojoin_sk);
        igmp6_proc_exit(net);
}

/* Per-namespace init/exit hooks, registered in igmp6_init(). */
static struct pernet_operations igmp6_net_ops = {
        .init = igmp6_net_init,
        .exit = igmp6_net_exit,
};

/*
 * Register the per-namespace operations and create the "mld" workqueue.
 *
 * Returns 0 on success, the error from register_pernet_subsys() if
 * registration fails, or -ENOMEM if the workqueue cannot be created (the
 * pernet registration is undone in that case).
 */
int __init igmp6_init(void)
{
        int err = register_pernet_subsys(&igmp6_net_ops);

        if (err)
                return err;

        mld_wq = create_workqueue("mld");
        if (mld_wq)
                return 0;

        unregister_pernet_subsys(&igmp6_net_ops);
        return -ENOMEM;
}

/*
 * Register igmp6_netdev_notifier; returns whatever
 * register_netdevice_notifier() returns.
 */
int __init igmp6_late_init(void)
{
        return register_netdevice_notifier(&igmp6_netdev_notifier);
}

/*
 * Undo igmp6_init(): unregister the per-namespace operations, then destroy
 * the mld_wq workqueue.
 */
void igmp6_cleanup(void)
{
        unregister_pernet_subsys(&igmp6_net_ops);
        destroy_workqueue(mld_wq);
}

/* Undo igmp6_late_init(): unregister igmp6_netdev_notifier. */
void igmp6_late_cleanup(void)
{
        unregister_netdevice_notifier(&igmp6_netdev_notifier);
}



































































































































































  435 
  437 










   82 

   82 














  118 

   82 
   74 


  128 



























   37 

  129 



  162 
























  163 




  163 



  162 




















  320 






























  217 

  155 




  321 




  214 









  163 

  163 
  163 





  321 











  321 






   86 
  283 





   81 







  313 
    3 


  124 

   35 
   91 








   18 
  321 


  158 



  158 














  159 






  108 





  108 




























  179 



  179 
  130 











































































































  324 





   74 



   74 

















































  306 




























































































































































































































































































































































































































  255 



  247 

   12 




  254 



  256 
























































































































































































































































































































































































  131 


























































   55 
   33 
   32 





   55 

   55 


  110 









   32 


  123 


  131 


























   16 





   16 



  131 
  131 

















  131 

















































































  179 




  133 
  135 
   87 




  134 

   10 




  133 






























   38 







   38 




  179 








  179 
















  179 



  179 







  177 
   32 

  179 



























  259 







  258 










  157 



  179 


   72 























   87 

  259 

  258 
  316 












  316 




  316 
  314 








  316 




  315 



  316 

  316 

  314 




































   62 

   24 




  316 




























  319 













  320 
  321 

  321 

  320 
  320 













    3 






    3 

    3 





    3 
    3 

    3 





































































































































































































































































































































































































































































































































































































































































































































































































































































































































  306 
  307 

  307 













  307 

  307 


  306 





  306 
















  307 
  307 















  307 

  307 

  307 













  306 
  306 
  305 










  306 




  307 



  307 













  306 
  307 
  306 















  307 



  306 







  307 












  307 





  307 




  307 



  306 













  307 

























































    1 


























































































  306 
  252 
















































   15 


   15 

   15 













   15 










   11 


    4 


    4 





















































  162 

   15 



















   26 



  220 

   70 
   26 




   70 
   26 



































   39 
    1 
   39 
   14 







   39 


    1 





























   39 


    1 



   39 











   39 













   39 

   39 






















    1 





   39 


   11 


   39 

   39 
   12 
   39 

    1 
    1 
































































































































































































































































   39 






   39 
























   39 







    1 





   39 





















    1 














    1 

   39 

    1 


   39 













    3 



    3 






































































    3 

























    6 

    3 









    3 



























































































  307 

  253 



















































































































































































































































































































































































































































































































































































































































































































































  231 



















































































  307 











  153 















  232 
   46 


  231 





  232 

















  230 


























  306 





  304 


   10 











  178 



















  169 
   16 





   11 



  168 
















   90 

   85 













































































































  249 








  249 




  236 




   15 

    8 




  249 






  232 

  363 

  153 


















  168 




  156 
   23 

















  159 

   12 



  138 
   46 






   46 




  168 
  168 






   92 


   82 




























  168 









  168 

  168 




  168 
















































































  135 
   39 




  160 



  160 








   80 



















   80 
  160 
  160 



  162 




    6 



  156 

  156 
    1 









    2 

   11 
   11 









    3 













    8 
    8 













   11 




    2 


   15 





   15 












    2 




   15 









































  237 

   11 

   17 


  251 
    7 















































































































































































































































































































  158 






















  390 



  389 





  445 




   43 




   43 



   13 

   39 




   17 


















   17 


















  443 








  445 



  444 



  431 
   45 

























  441 







  412 
  158 














  444 















  445 
























  469 







  469 








   50 



















  239 



    4 

  302 






  469 




























  353 




  272 


























  468 





  470 

  471 



  471 












  402 
  156 

  469 


   43 

  445 













  403 
  155 






  155 



  471 










  323 


   19 






















   13 
   13 
    3 




   13 























  320 


  323 



  214 






  149 


























   13 














  551 










  551 



  541 









  541 




  551 

















  329 


  541 






  318 
  317 
  317 









  318 

















  163 



  163 


   21 


  156 























  156 





   39 

























































   53 

   53 





   53 

   54 










































































































































































































































































































































































































































































































 1093 




 1093 













































































































































































































   53 






    7 




  278 
  250 
   41 



  279 
  250 
   41 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/memory.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus
 */

/*
 * Ok, demand-loading was easy, shared pages a little bit trickier. Shared
 * pages started 02.12.91, seems to work. - Linus.
 *
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 *
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.
 */

/*
 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *                Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.
 */

/*
 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (alex@cconcepts.co.uk)
 *
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *                (Gerhard.Wichert@pdb.siemens.de)
 *
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)
 */

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/kmsan.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/perf_event.h>
#include <linux/ptrace.h>
#include <linux/vmalloc.h>
#include <linux/sched/sysctl.h>

#include <trace/events/kmem.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#include "pgalloc-track.h"
#include "internal.h"
#include "swap.h"

/*
 * Build-time notice: emit a compiler warning when
 * LAST_CPUPID_NOT_IN_PAGE_FLAGS is defined, except in CONFIG_COMPILE_TEST
 * builds. As the warning text says, such a configuration grows the page
 * frame in order to hold last_cpupid.
 */
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
#endif

/*
 * Forward declarations of file-local (static) helpers, so that they can be
 * referenced ahead of their definitions.
 */
static vm_fault_t do_fault(struct vm_fault *vmf);
static vm_fault_t do_anonymous_page(struct vm_fault *vmf);
static bool vmf_pte_changed(struct vm_fault *vmf);

/*
 * Report whether the pte recorded in vmf->orig_pte was a uffd-wp pte marker
 * (in which case the pte was wr-protected).
 *
 * The answer is false when userfaultfd_wp() rejects the faulting VMA, or
 * when FAULT_FLAG_ORIG_PTE_VALID is not set in vmf->flags; vmf->orig_pte is
 * only inspected once both of those checks have passed.
 */
static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf)
{
        return userfaultfd_wp(vmf->vma) &&
               (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID) &&
               pte_marker_uffd_wp(vmf->orig_pte);
}

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * Compile-time default of the randomization level:
 *   1 - when CONFIG_COMPAT_BRK=y: brk is excluded from randomization,
 *       as ancient (libc5 based) binaries can segfault.
 *   2 - otherwise.
 *
 * The "norandmaps" boot parameter resets the value to 0
 * (see disable_randmaps()).
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
                                        1;
#else
                                        2;
#endif

/*
 * Fallback definition of arch_wants_old_prefaulted_pte(). It is compiled
 * only when no macro of that name has been defined already (presumably by
 * an architecture header that wants to supply its own version).
 *
 * Returns false unconditionally.
 */
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
        /*
         * Transitioning a PTE from 'old' to 'young' can be expensive on
         * some architectures, even if it's performed in hardware. By
         * default, "false" means prefaulted entries will be 'young'.
         */
        return false;
}
#endif

/*
 * Handler registered for the "norandmaps" boot parameter: switches address
 * space randomization off by setting randomize_va_space to 0.
 *
 * @s: argument string handed to the handler; not used here.
 *
 * Returns 1. NOTE(review): presumably this tells the __setup() machinery
 * that the option was consumed - confirm against the __setup() contract.
 */
static int __init disable_randmaps(char *s)
{
        randomize_va_space = 0;
        return 1;
}
__setup("norandmaps", disable_randmaps);

/* Page frame number of ZERO_PAGE(0); assigned by init_zero_pfn(). */
unsigned long zero_pfn __read_mostly;
EXPORT_SYMBOL(zero_pfn);

/*
 * NOTE(review): not assigned anywhere in this part of the file; presumably
 * the highest pfn that has a memmap entry - confirm where it is set.
 */
unsigned long highest_memmap_pfn __read_mostly;

/*
 * Cache the page frame number of ZERO_PAGE(0) in zero_pfn. Registered as an
 * early_initcall.
 *
 * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
 *
 * Always returns 0.
 */
static int __init init_zero_pfn(void)
{
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
}
early_initcall(init_zero_pfn);

/*
 * Non-inline wrapper that forwards @mm and @member unchanged to
 * trace_rss_stat() (presumably the rss_stat tracepoint declared through
 * <trace/events/kmem.h> - confirm).
 */
void mm_trace_rss_stat(struct mm_struct *mm, int member)
{
        trace_rss_stat(mm, member);
}

/*
 * Detach the PTE page table that *pmd points at and pass it to the
 * mmu_gather through pte_free_tlb(), then decrement the PTE-table count of
 * tlb->mm.
 *
 * The table is looked up before the pmd entry is cleared, because clearing
 * the entry loses the reference to it.
 *
 * Note: the pages mapped by that table are not freed here. That
 * has been handled earlier when unmapping all the memory regions.
 */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                           unsigned long addr)
{
        pgtable_t pgtable = pmd_pgtable(*pmd);

        pmd_clear(pmd);
        pte_free_tlb(tlb, pgtable, addr);
        mm_dec_nr_ptes(tlb->mm);
}

/*
 * Free the PTE tables referenced by the pmd entries covering [addr, end)
 * below @pud and, when the floor/ceiling limits allow it, the PMD table
 * itself.
 *
 * @tlb:     mmu_gather that receives the freed page-table pages
 * @pud:     pud entry whose PMD table is walked
 * @addr:    first address of the range
 * @end:     end of the range (exclusive)
 * @floor:   lower limit; the PMD table is kept if its PUD-aligned base
 *           lies below it
 * @ceiling: upper limit; 0 stands for the top of the address space
 */
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        /* PUD-aligned base of the range, taken before addr is advanced. */
        unsigned long start = addr & PUD_MASK;
        pmd_t *pmd = pmd_offset(pud, addr);

        /* Release the PTE table behind each pmd entry that is neither none nor bad. */
        do {
                unsigned long next = pmd_addr_end(addr, end);

                if (!pmd_none_or_clear_bad(pmd))
                        free_pte_range(tlb, pmd, addr);
                pmd++;
                addr = next;
        } while (addr != end);

        /* Keep the PMD table if it reaches below floor ... */
        if (start < floor)
                return;
        /* ... or if rounding a non-zero ceiling down lands on address 0 ... */
        if (ceiling) {
                ceiling &= PUD_MASK;
                if (!ceiling)
                        return;
        }
        /* ... or if the range ends above ceiling ("- 1" makes 0 the top). */
        if (end - 1 > ceiling - 1)
                return;

        /* Nothing else uses this PMD table: unhook it and hand it to the gather. */
        pmd = pmd_offset(pud, start);
        pud_clear(pud);
        pmd_free_tlb(tlb, pmd, start);
        mm_dec_nr_pmds(tlb->mm);
}

/*
 * Descend into the pud entries covering [addr, end) below @p4d, calling
 * free_pmd_range() on each usable entry, and, when the floor/ceiling limits
 * allow it, free the PUD table itself.
 *
 * @tlb:     mmu_gather that receives the freed page-table pages
 * @p4d:     p4d entry whose PUD table is walked
 * @addr:    first address of the range
 * @end:     end of the range (exclusive)
 * @floor:   lower limit; the PUD table is kept if its P4D-aligned base
 *           lies below it
 * @ceiling: upper limit; 0 stands for the top of the address space
 */
static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        /* P4D-aligned base of the range, taken before addr is advanced. */
        unsigned long start = addr & P4D_MASK;
        pud_t *pud = pud_offset(p4d, addr);

        /* Visit each pud entry that is neither none nor bad. */
        do {
                unsigned long next = pud_addr_end(addr, end);

                if (!pud_none_or_clear_bad(pud))
                        free_pmd_range(tlb, pud, addr, next, floor, ceiling);
                pud++;
                addr = next;
        } while (addr != end);

        /* Keep the PUD table if it reaches below floor ... */
        if (start < floor)
                return;
        /* ... or if rounding a non-zero ceiling down lands on address 0 ... */
        if (ceiling) {
                ceiling &= P4D_MASK;
                if (!ceiling)
                        return;
        }
        /* ... or if the range ends above ceiling ("- 1" makes 0 the top). */
        if (end - 1 > ceiling - 1)
                return;

        /* Nothing else uses this PUD table: unhook it and hand it to the gather. */
        pud = pud_offset(p4d, start);
        p4d_clear(p4d);
        pud_free_tlb(tlb, pud, start);
        mm_dec_nr_puds(tlb->mm);
}

/*
 * Descend into the p4d entries covering [addr, end) below @pgd, calling
 * free_pud_range() on each usable entry, and, when the floor/ceiling limits
 * allow it, free the P4D table itself.
 *
 * @tlb:     mmu_gather that receives the freed page-table pages
 * @pgd:     pgd entry whose P4D table is walked
 * @addr:    first address of the range
 * @end:     end of the range (exclusive)
 * @floor:   lower limit; the P4D table is kept if its PGDIR-aligned base
 *           lies below it
 * @ceiling: upper limit; 0 stands for the top of the address space
 */
static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        /* PGDIR-aligned base of the range, taken before addr is advanced. */
        unsigned long start = addr & PGDIR_MASK;
        p4d_t *p4d = p4d_offset(pgd, addr);

        /* Visit each p4d entry that is neither none nor bad. */
        do {
                unsigned long next = p4d_addr_end(addr, end);

                if (!p4d_none_or_clear_bad(p4d))
                        free_pud_range(tlb, p4d, addr, next, floor, ceiling);
                p4d++;
                addr = next;
        } while (addr != end);

        /* Keep the P4D table if it reaches below floor ... */
        if (start < floor)
                return;
        /* ... or if rounding a non-zero ceiling down lands on address 0 ... */
        if (ceiling) {
                ceiling &= PGDIR_MASK;
                if (!ceiling)
                        return;
        }
        /* ... or if the range ends above ceiling ("- 1" makes 0 the top). */
        if (end - 1 > ceiling - 1)
                return;

        /* Nothing else uses this P4D table: unhook it and hand it to the gather. */
        p4d = p4d_offset(pgd, start);
        pgd_clear(pgd);
        p4d_free_tlb(tlb, p4d, start);
}

/*
 * This function frees user-level page tables of a process.
 *
 * @tlb:     mmu_gather that receives the freed page-table pages; tlb->mm
 *           is the address space whose page tables are walked
 * @addr:    start of the range whose page tables are to be freed
 * @end:     end of that range
 * @floor:   lower limit used to decide whether a table has no other vmas
 *           using it, so can be freed; 0 means the bottom of the address
 *           space
 * @ceiling: upper limit used for the same decision; 0 means the top of the
 *           address space
 *
 * The range is first trimmed to PMD boundaries that lie within
 * [floor, ceiling); if nothing is left the function returns without
 * touching any table. Otherwise each usable pgd entry in the trimmed range
 * is passed down to free_p4d_range().
 */
void free_pgd_range(struct mmu_gather *tlb,
                        unsigned long addr, unsigned long end,
                        unsigned long floor, unsigned long ceiling)
{
        pgd_t *pgd;
        unsigned long next;

        /*
         * The next few lines have given us lots of grief...
         *
         * Why are we testing PMD* at this top level?  Because often
         * there will be no work to do at all, and we'd prefer not to
         * go all the way down to the bottom just to discover that.
         *
         * Why all these "- 1"s?  Because 0 represents both the bottom
         * of the address space and the top of it (using -1 for the
         * top wouldn't help much: the masks would do the wrong thing).
         * The rule is that addr 0 and floor 0 refer to the bottom of
         * the address space, but end 0 and ceiling 0 refer to the top.
         * Comparisons need to use "end - 1" and "ceiling - 1" (though
         * that end 0 case should be mythical).
         *
         * Wherever addr is brought up or ceiling brought down, we must
         * be careful to reject "the opposite 0" before it confuses the
         * subsequent tests.  But what about where end is brought down
         * by PMD_SIZE below? no, end can't go down to 0 there.
         *
         * Whereas we round start (addr) and ceiling down, by different
         * masks at different levels, in order to test whether a table
         * now has no other vmas using it, so can be freed, we don't
         * bother to round floor or end up - the tests don't need that.
         */

        /* Round addr down to a PMD boundary; if that drops below floor, step up one PMD. */
        addr &= PMD_MASK;
        if (addr < floor) {
                addr += PMD_SIZE;
                /* addr wrapped around to 0: nothing left to free. */
                if (!addr)
                        return;
        }
        /* Round a non-zero ceiling down; landing on 0 means nothing may be freed. */
        if (ceiling) {
                ceiling &= PMD_MASK;
                if (!ceiling)
                        return;
        }
        /* The range ends above ceiling: pull end down by one PMD. */
        if (end - 1 > ceiling - 1)
                end -= PMD_SIZE;
        /* The trimmed range is empty. */
        if (addr > end - 1)
                return;
        /*
         * Page-table pages are added to the gather with PAGE_SIZE
         * (see pte_free_tlb()), so switch the gather to that page size,
         * flushing the tlb if we need to.
         */
        tlb_change_page_size(tlb, PAGE_SIZE);
        pgd = pgd_offset(tlb->mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                free_p4d_range(tlb, pgd, addr, next, floor, ceiling);
        } while (pgd++, addr = next, addr != end);
}

/*
 * free_pgtables - unlink a run of vmas and free the page tables under them.
 * @tlb:          mmu_gather handed down to the *_free_pgd_range() helpers
 * @mas:          maple state used to look up each following vma, bounded by
 *                @ceiling - 1
 * @vma:          first vma to process
 * @floor:        lower bound passed through to the freeing helpers
 * @ceiling:      upper bound used when there is no following vma; may be 0
 * @mm_wr_locked: when true, every vma is write-locked via vma_start_write()
 *                before it is unlinked
 *
 * The loop runs until mas_find() reports no further vma.
 */
void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
{
        struct unlink_vma_file_batch vb;

        do {
                unsigned long addr = vma->vm_start;
                struct vm_area_struct *next;

                /*
                 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
                 * be 0.  This will underflow and is okay.
                 *
                 * The successor is looked up first: its vm_start (or ceiling
                 * when there is none) is the upper limit handed to the
                 * freeing helper below.
                 */
                next = mas_find(mas, ceiling - 1);
                /* A zero entry is treated the same as "no next vma". */
                if (unlikely(xa_is_zero(next)))
                        next = NULL;

                /*
                 * Hide vma from rmap and truncate_pagecache before freeing
                 * pgtables
                 */
                if (mm_wr_locked)
                        vma_start_write(vma);
                unlink_anon_vmas(vma);

                if (is_vm_hugetlb_page(vma)) {
                        /* hugetlb vmas are handled one at a time, unbatched. */
                        unlink_file_vma(vma);
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                } else {
                        unlink_file_vma_batch_init(&vb);
                        unlink_file_vma_batch_add(&vb, vma);

                        /*
                         * Optimization: gather nearby vmas into one call down.
                         * A following vma is absorbed while it starts no more
                         * than PMD_SIZE past the current end and is not
                         * hugetlb; each absorbed vma gets the same
                         * lock/unlink treatment as the first one.
                         */
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
                               && !is_vm_hugetlb_page(next)) {
                                vma = next;
                                next = mas_find(mas, ceiling - 1);
                                if (unlikely(xa_is_zero(next)))
                                        next = NULL;
                                if (mm_wr_locked)
                                        vma_start_write(vma);
                                unlink_anon_vmas(vma);
                                unlink_file_vma_batch_add(&vb, vma);
                        }
                        unlink_file_vma_batch_final(&vb);
                        /* One call covers [first vm_start, last vm_end). */
                        free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                }
                vma = next;
        } while (vma);
}

/*
 * Install the pte table *pte into *pmd under the pmd lock, unless the pmd
 * has already been populated.  When the table is consumed, *pte is set to
 * NULL so the caller can tell that it no longer owns it.
 */
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
        spinlock_t *pmd_ptl;

        pmd_ptl = pmd_lock(mm, pmd);

        /* Somebody else populated the pmd first: leave *pte untouched. */
        if (unlikely(!pmd_none(*pmd)))
                goto unlock;

        mm_inc_nr_ptes(mm);
        /*
         * All pte setup (eg. pte page lock and page clearing) has to be
         * visible before the pte table itself becomes visible to other
         * CPUs through the page tables.
         *
         * The reader side is the pointer chasing done by the page table
         * walking code, which mostly runs without locking.  Those accesses
         * form a chain of data-dependent loads, so most CPUs (alpha being
         * the notable exception) already observe them in order.  See the
         * alpha page table accessors for the smp_rmb() barriers used by
         * the page table walking code.
         */
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
        pmd_populate(mm, pmd, *pte);
        *pte = NULL;

unlock:
        spin_unlock(pmd_ptl);
}

/*
 * Allocate a pte table for @mm and try to install it at @pmd.
 * Returns -ENOMEM if the allocation fails, 0 otherwise (including when the
 * pmd turned out to be populated already and the new table is freed again).
 */
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t table;

        table = pte_alloc_one(mm);
        if (table == NULL)
                return -ENOMEM;

        /* pmd_install() clears the pointer when it consumes the table. */
        pmd_install(mm, pmd, &table);
        if (table != NULL)
                pte_free(mm, table);

        return 0;
}

/*
 * Allocate a kernel pte table and install it at @pmd under
 * init_mm.page_table_lock.  Returns -ENOMEM if the allocation fails and
 * 0 otherwise; a table that lost the race is freed after unlocking.
 */
int __pte_alloc_kernel(pmd_t *pmd)
{
        pte_t *table = pte_alloc_one_kernel(&init_mm);

        if (table == NULL)
                return -ENOMEM;

        spin_lock(&init_mm.page_table_lock);
        if (unlikely(!pmd_none(*pmd))) {
                /* Another populated it: drop the lock, then our table. */
                spin_unlock(&init_mm.page_table_lock);
                pte_free_kernel(&init_mm, table);
                return 0;
        }

        smp_wmb(); /* See comment in pmd_install() */
        pmd_populate_kernel(&init_mm, pmd, table);
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}

/* Zero all NR_MM_COUNTERS entries of the rss delta vector @rss. */
static inline void init_rss_vec(int *rss)
{
        memset(rss, 0, NR_MM_COUNTERS * sizeof(*rss));
}

/* Apply every non-zero entry of the rss delta vector @rss to @mm's counters. */
static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss)
{
        int member;

        for (member = 0; member < NR_MM_COUNTERS; member++) {
                if (!rss[member])
                        continue;
                add_mm_counter(mm, member, rss[member]);
        }
}

/*
 * Report a bad pte, for example a PFN-mapped pte found in a region that
 * does not allow it.  Output is rate limited, and the kernel is tainted
 * with TAINT_BAD_PAGE once a report has been printed.
 *
 * This only prints: the calling function must still handle the error.
 */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                          pte_t pte, struct page *page)
{
        static unsigned long quiet_until;
        static unsigned long shown;
        static unsigned long suppressed;
        struct address_space *mapping = NULL;
        pgoff_t index;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;

        /* Walk down to the pmd so its value can be printed with the pte. */
        pgd = pgd_offset(vma->vm_mm, addr);
        p4d = p4d_offset(pgd, addr);
        pud = pud_offset(p4d, addr);
        pmd = pmd_offset(pud, addr);

        /*
         * Rate limiting: up to 60 reports may be printed in a burst, after
         * which reports stay suppressed until a minute has passed since the
         * first one of that burst; alternatively a steady drip of one report
         * per second is allowed.
         */
        if (shown == 60) {
                if (time_before(jiffies, quiet_until)) {
                        suppressed++;
                        return;
                }
                if (suppressed) {
                        pr_alert("BUG: Bad page map: %lu messages suppressed\n",
                                 suppressed);
                        suppressed = 0;
                }
                shown = 0;
        }
        /* The first report of a burst starts the one minute window. */
        if (!shown)
                quiet_until = jiffies + 60 * HZ;
        shown++;

        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;
        index = linear_page_index(vma, addr);

        pr_alert("BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
                 current->comm,
                 (long long)pte_val(pte), (long long)pmd_val(*pmd));
        if (page)
                dump_page(page, "bad pte");
        pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
        pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
                 vma->vm_file,
                 vma->vm_ops ? vma->vm_ops->fault : NULL,
                 vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
                 mapping ? mapping->a_ops->read_folio : NULL);
        dump_stack();
        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}

/*
 * vm_normal_page -- This function gets the "struct page" associated with a pte.
 *
 * "Special" mappings do not wish to be associated with a "struct page" (either
 * it doesn't exist, or it exists but they don't want to touch it). In this
 * case, NULL is returned here. "Normal" mappings do have a struct page.
 *
 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
 * pte bit, in which case this function is trivial. Secondly, an architecture
 * may not have a spare pte bit, which requires a more complicated scheme,
 * described below.
 *
 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
 * special mapping (even if there are underlying and valid "struct pages").
 * COWed pages of a VM_PFNMAP are always normal.
 *
 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
 * mapping will always honor the rule
 *
 *        pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
 *
 * And for normal mappings this is false.
 *
 * This restricts such mappings to be a linear translation from virtual address
 * to pfn. To get around this restriction, we allow arbitrary mappings so long
 * as the vma is not a COW mapping; in that case, we know that all ptes are
 * special (because none can have been COWed).
 *
 *
 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
 *
 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
 * page" backing, however the difference is that _all_ pages with a struct
 * page (that is, those where pfn_valid is true) are refcounted and considered
 * normal pages by the VM. The only exceptions are zeropages, which are
 * *never* refcounted.
 *
 * The disadvantage is that pages are refcounted (which can be slower and
 * simply not an option for some PFNMAP users). The advantage is that we
 * don't have to follow the strict linearity rule of PFNMAP mappings in
 * order to support COWable mappings.
 *
 */
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);

        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
                /* Common case: not special, only the pfn range is checked. */
                if (likely(!pte_special(pte)))
                        goto check_pfn;
                /* A special pte: let the vma's own hook resolve it, if any. */
                if (vma->vm_ops && vma->vm_ops->find_special_page)
                        return vma->vm_ops->find_special_page(vma, addr);
                if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
                        return NULL;
                if (is_zero_pfn(pfn))
                        return NULL;
                /*
                 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
                 * and will have refcounts incremented on their struct pages
                 * when they are inserted into PTEs, thus they are safe to
                 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
                 * do not have refcounts. Example of legacy ZONE_DEVICE is
                 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
                 */
                if (pte_devmap(pte))
                        return NULL;

                /* A special pte that matches none of the cases above. */
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */

        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                if (vma->vm_flags & VM_MIXEDMAP) {
                        /*
                         * MIXEDMAP: a valid, non-zero pfn is a normal page.
                         * Jumping to out skips the highest_memmap_pfn check.
                         */
                        if (!pfn_valid(pfn))
                                return NULL;
                        if (is_zero_pfn(pfn))
                                return NULL;
                        goto out;
                } else {
                        /*
                         * PFNMAP: a pte obeying the linear pfn rule is
                         * special, and so is every pte of a non-COW mapping.
                         */
                        unsigned long off;
                        off = (addr - vma->vm_start) >> PAGE_SHIFT;
                        if (pfn == vma->vm_pgoff + off)
                                return NULL;
                        if (!is_cow_mapping(vma->vm_flags))
                                return NULL;
                }
        }

        if (is_zero_pfn(pfn))
                return NULL;

check_pfn:
        /* A pfn above highest_memmap_pfn is reported as a bad pte. */
        if (unlikely(pfn > highest_memmap_pfn)) {
                print_bad_pte(vma, addr, pte, NULL);
                return NULL;
        }

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
out:
        VM_WARN_ON_ONCE(is_zero_pfn(pfn));
        return pfn_to_page(pfn);
}

/*
 * Folio flavour of vm_normal_page(): returns the folio containing the
 * normal page mapped by @pte, or NULL when vm_normal_page() returns NULL.
 */
struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                            pte_t pte)
{
        struct page *normal = vm_normal_page(vma, addr, pte);

        if (!normal)
                return NULL;

        return page_folio(normal);
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/*
 * PMD-level counterpart of vm_normal_page(): returns the struct page for
 * the pfn held in @pmd, or NULL when the entry must not be treated as a
 * normal page.  Unlike vm_normal_page(), no bad-entry report is printed.
 */
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);

        /* Currently it's only used for huge pfnmaps */
        if (unlikely(pmd_special(pmd)))
                return NULL;

        if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
                if (vma->vm_flags & VM_MIXEDMAP) {
                        /* MIXEDMAP: a valid pfn skips the checks below. */
                        if (!pfn_valid(pfn))
                                return NULL;
                        goto out;
                } else {
                        /*
                         * PFNMAP: an entry obeying the linear pfn rule is
                         * special, and so is any entry of a non-COW mapping.
                         */
                        unsigned long off;
                        off = (addr - vma->vm_start) >> PAGE_SHIFT;
                        if (pfn == vma->vm_pgoff + off)
                                return NULL;
                        if (!is_cow_mapping(vma->vm_flags))
                                return NULL;
                }
        }

        /* devmap entries, the huge zero pmd and out-of-range pfns: NULL. */
        if (pmd_devmap(pmd))
                return NULL;
        if (is_huge_zero_pmd(pmd))
                return NULL;
        if (unlikely(pfn > highest_memmap_pfn))
                return NULL;

        /*
         * NOTE! We still have PageReserved() pages in the page tables.
         * eg. VDSO mappings can cause them to exist.
         */
out:
        return pfn_to_page(pfn);
}

/*
 * Folio flavour of vm_normal_page_pmd(): returns the folio containing the
 * page mapped by @pmd, or NULL when vm_normal_page_pmd() returns NULL.
 */
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd)
{
        struct page *normal = vm_normal_page_pmd(vma, addr, pmd);

        if (!normal)
                return NULL;

        return page_folio(normal);
}
#endif

/**
 * restore_exclusive_pte - Restore a device-exclusive entry
 * @vma: VMA covering @address
 * @folio: the mapped folio
 * @page: the mapped folio page
 * @address: the virtual address
 * @ptep: pte pointer into the locked page table mapping the folio page
 * @orig_pte: pte value at @ptep
 *
 * Restore a device-exclusive non-swap entry to an ordinary present pte.
 *
 * The folio and the page table must be locked, and MMU notifiers must have
 * been called to invalidate any (exclusive) device mappings.
 *
 * Locking the folio makes sure that anybody who just converted the pte to
 * a device-exclusive entry can map it into the device to make forward
 * progress without others converting it back until the folio was unlocked.
 *
 * If the folio lock ever becomes an issue, we can stop relying on the folio
 * lock; it might make some scenarios with heavy thrashing less likely to
 * make forward progress, but these scenarios might not be valid use cases.
 *
 * Note that the folio lock does not protect against all cases of concurrent
 * page table modifications (e.g., MADV_DONTNEED, mprotect), so device drivers
 * must use MMU notifiers to sync against any concurrent changes.
 */
static void restore_exclusive_pte(struct vm_area_struct *vma,
                struct folio *folio, struct page *page, unsigned long address,
                pte_t *ptep, pte_t orig_pte)
{
        pte_t pte;

        /* The caller is expected to hold the folio lock. */
        VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Start from an old pte built with the vma's current protection. */
        pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
        /* Carry the soft-dirty and uffd-wp bits over from the old entry. */
        if (pte_swp_soft_dirty(orig_pte))
                pte = pte_mksoft_dirty(pte);

        if (pte_swp_uffd_wp(orig_pte))
                pte = pte_mkuffd_wp(pte);

        /*
         * Make the pte writable only when the vma permits writes and
         * can_change_pte_writable() agrees for the pte built so far; a
         * dirty folio then also gets a dirty pte.
         */
        if ((vma->vm_flags & VM_WRITE) &&
            can_change_pte_writable(vma, address, pte)) {
                if (folio_test_dirty(folio))
                        pte = pte_mkdirty(pte);
                pte = pte_mkwrite(pte, vma);
        }
        set_pte_at(vma->vm_mm, address, ptep, pte);

        /*
         * No need to invalidate - it was non-present before. However
         * secondary CPUs may have mappings that need invalidating.
         */
        update_mmu_cache(vma, address, ptep);
}

/*
 * Restore a device-exclusive pte, but only if the folio lock can be taken
 * without sleeping.  Returns 0 once the pte has been restored, or -EBUSY
 * when the trylock fails and nothing was changed.
 */
static int try_restore_exclusive_pte(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep, pte_t orig_pte)
{
        swp_entry_t entry = pte_to_swp_entry(orig_pte);
        struct page *page = pfn_swap_entry_to_page(entry);
        struct folio *folio = page_folio(page);

        if (!folio_trylock(folio))
                return -EBUSY;

        restore_exclusive_pte(vma, folio, page, addr, ptep, orig_pte);
        folio_unlock(folio);

        return 0;
}

/*
 * Copy one non-present pte (swap, migration, device-private,
 * device-exclusive or marker entry) from @src_pte to @dst_pte at @addr,
 * adjusting the source entry where it has to become shared or read-only.
 * Counter changes for the destination are accumulated in @rss.
 *
 * Returns:
 *   0       - the entry was handled; nothing more to do for this pte.
 *   -EIO    - swap_duplicate() failed for a swap entry; nothing was copied.
 *   -EBUSY  - a device-exclusive entry could not be restored because the
 *             folio lock was not available.
 *   -ENOENT - a device-exclusive entry was restored to a present pte in the
 *             source; the caller has to copy it as a present pte.
 *
 * The return type is int because only 0 and negative errnos are returned.
 */
static int
copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
{
        unsigned long vm_flags = dst_vma->vm_flags;
        pte_t orig_pte = ptep_get(src_pte);
        pte_t pte = orig_pte;
        struct folio *folio;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(orig_pte);

        if (likely(!non_swap_entry(entry))) {
                /* Genuine swap entry: take another reference on the slot. */
                if (swap_duplicate(entry) < 0)
                        return -EIO;

                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
                        spin_lock(&mmlist_lock);
                        /* Recheck under the lock before adding. */
                        if (list_empty(&dst_mm->mmlist))
                                list_add(&dst_mm->mmlist,
                                                &src_mm->mmlist);
                        spin_unlock(&mmlist_lock);
                }
                /* Mark the swap entry as shared. */
                if (pte_swp_exclusive(orig_pte)) {
                        pte = pte_swp_clear_exclusive(orig_pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
                folio = pfn_swap_entry_folio(entry);

                rss[mm_counter(folio)]++;

                if (!is_readable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both parent and child
                         * to be set to read. A previously exclusive entry is
                         * now shared.
                         */
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(orig_pte))
                                pte = pte_swp_mksoft_dirty(pte);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
                page = pfn_swap_entry_to_page(entry);
                folio = page_folio(page);

                /*
                 * Update rss count even for unaddressable pages, as
                 * they should be treated just like normal pages in this
                 * respect.
                 *
                 * We will likely want to have some new rss counters
                 * for unaddressable pages, at some point. But for now
                 * keep things as they are.
                 */
                folio_get(folio);
                rss[mm_counter(folio)]++;
                /* Cannot fail as these pages cannot get pinned. */
                folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma);

                /*
                 * We do not preserve soft-dirty information, because so
                 * far, checkpoint/restore is the only feature that
                 * requires that. And checkpoint/restore does not work
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
                if (is_writable_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
                        entry = make_readable_device_private_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_exclusive_entry(entry)) {
                /*
                 * Make device exclusive entries present by restoring the
                 * original entry then copying as for a present pte. Device
                 * exclusive entries currently only support private writable
                 * (ie. COW) mappings.
                 */
                VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
                if (try_restore_exclusive_pte(src_vma, addr, src_pte, orig_pte))
                        return -EBUSY;
                return -ENOENT;
        } else if (is_pte_marker_entry(entry)) {
                pte_marker marker = copy_pte_marker(entry, dst_vma);

                /* Only install a marker when something is left to copy. */
                if (marker)
                        set_pte_at(dst_mm, addr, dst_pte,
                                   make_pte_marker(marker));
                return 0;
        }
        /* Drop the uffd-wp bit when the destination vma does not use it. */
        if (!userfaultfd_wp(dst_vma))
                pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
}

/*
 * Give the child its own copy of a present, normal page.
 *
 * NOTE! This is not the usual case: normally the caller just takes
 * another reference on the page and re-uses the pte the traditional way.
 *
 * The copy goes into the folio preallocated in *prealloc.  When there is
 * none yet, -EAGAIN is returned so that the caller can allocate one
 * outside the page table lock.  -EHWPOISON is returned when copying the
 * source page fails, in which case *prealloc is left in place.  On
 * success 0 is returned and *prealloc is set to NULL because the folio
 * has been consumed.
 */
static inline int
copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct folio **prealloc, struct page *page)
{
        struct folio *copy = *prealloc;
        pte_t child_pte;

        if (copy == NULL)
                return -EAGAIN;

        /* A preallocated folio is available: fill it from the source. */
        if (copy_mc_user_highpage(&copy->page, page, addr, src_vma))
                return -EHWPOISON;

        /* From here on the folio belongs to the child's mapping. */
        *prealloc = NULL;
        __folio_mark_uptodate(copy);
        folio_add_new_anon_rmap(copy, dst_vma, addr, RMAP_EXCLUSIVE);
        folio_add_lru_vma(copy, dst_vma);
        rss[MM_ANONPAGES] += 1;

        /* Build the child's pte: dirty, and writable if the vma allows. */
        child_pte = mk_pte(&copy->page, dst_vma->vm_page_prot);
        child_pte = pte_mkdirty(child_pte);
        child_pte = maybe_mkwrite(child_pte, dst_vma);

        /* Uffd-wp needs to be delivered to dest pte as well */
        if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
                child_pte = pte_mkuffd_wp(child_pte);

        set_pte_at(dst_vma->vm_mm, addr, dst_pte, child_pte);
        return 0;
}

/*
 * Install @nr consecutive ptes derived from @pte into the child at
 * @dst_pte, write-protecting the parent's ptes first where COW needs it.
 */
static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
                pte_t pte, unsigned long addr, int nr)
{
        pte_t child = pte;

        /* A writable pte in a COW mapping is write protected on both sides. */
        if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
                wrprotect_ptes(src_vma->vm_mm, addr, src_pte, nr);
                child = pte_wrprotect(child);
        }

        /* Shared mappings start out clean in the child. */
        if (src_vma->vm_flags & VM_SHARED)
                child = pte_mkclean(child);

        /* The child's pte always starts out old. */
        child = pte_mkold(child);

        /* Keep the uffd-wp bit only if the child vma uses uffd-wp. */
        if (!userfaultfd_wp(dst_vma))
                child = pte_clear_uffd_wp(child);

        set_ptes(dst_vma->vm_mm, addr, dst_pte, child, nr);
}

/*
 * Copy one present PTE, trying to batch-process subsequent PTEs that map
 * consecutive pages of the same folio by copying them as well.
 *
 * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
 * An error from copy_present_page() (it can return -EAGAIN or -EHWPOISON)
 * is passed through unchanged.
 * Otherwise, returns the number of copied PTEs (at least 1).
 */
static inline int
copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                 pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
                 int max_nr, int *rss, struct folio **prealloc)
{
        struct page *page;
        struct folio *folio;
        bool any_writable;
        fpb_t flags = 0;
        int err, nr;

        /* No normal page behind the pte: copy the pte only, no ref or rmap. */
        page = vm_normal_page(src_vma, addr, pte);
        if (unlikely(!page))
                goto copy_pte;

        folio = page_folio(page);

        /*
         * If we likely have to copy, just don't bother with batching. Make
         * sure that the common "small folio" case is as fast as possible
         * by keeping the batching logic separate.
         */
        if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
                if (src_vma->vm_flags & VM_SHARED)
                        flags |= FPB_IGNORE_DIRTY;
                if (!vma_soft_dirty_enabled(src_vma))
                        flags |= FPB_IGNORE_SOFT_DIRTY;

                nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
                                     &any_writable, NULL, NULL);
                /* One folio reference per pte in the batch. */
                folio_ref_add(folio, nr);
                if (folio_test_anon(folio)) {
                        if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
                                                                  nr, dst_vma, src_vma))) {
                                /* Undo the references and ask for a prealloc. */
                                folio_ref_sub(folio, nr);
                                return -EAGAIN;
                        }
                        rss[MM_ANONPAGES] += nr;
                        VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
                } else {
                        folio_dup_file_rmap_ptes(folio, page, nr, dst_vma);
                        rss[mm_counter_file(folio)] += nr;
                }
                /*
                 * If any pte of the batch was writable, mark the template
                 * writable so the pte_write() test in __copy_present_ptes()
                 * sees it.
                 */
                if (any_writable)
                        pte = pte_mkwrite(pte, src_vma);
                __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
                                    addr, nr);
                return nr;
        }

        /* Single pte path: one reference for the child's mapping. */
        folio_get(folio);
        if (folio_test_anon(folio)) {
                /*
                 * If this page may have been pinned by the parent process,
                 * copy the page immediately for the child so that we'll always
                 * guarantee the pinned page won't be randomly replaced in the
                 * future.
                 */
                if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, dst_vma, src_vma))) {
                        /* Page may be pinned, we have to copy. */
                        folio_put(folio);
                        err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
                                                addr, rss, prealloc, page);
                        /* Success counts as one copied pte. */
                        return err ? err : 1;
                }
                rss[MM_ANONPAGES]++;
                VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
        } else {
                folio_dup_file_rmap_pte(folio, page, dst_vma);
                rss[mm_counter_file(folio)]++;
        }

copy_pte:
        __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
        return 1;
}

/*
 * Allocate an order-0 movable folio for @addr in @vma (zeroed when
 * @need_zero is set) and charge it to @src_mm's memory cgroup.
 * Returns the folio, or NULL if the allocation or the charge failed.
 */
static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
                struct vm_area_struct *vma, unsigned long addr, bool need_zero)
{
        struct folio *folio;

        if (!need_zero)
                folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
        else
                folio = vma_alloc_zeroed_movable_folio(vma, addr);

        if (folio == NULL)
                return NULL;

        if (!mem_cgroup_charge(folio, src_mm, GFP_KERNEL)) {
                folio_throttle_swaprate(folio, GFP_KERNEL);
                return folio;
        }

        /* Charging failed: give the folio back. */
        folio_put(folio);
        return NULL;
}

/*
 * Copy the ptes covering [addr, end) below one pmd from @src_vma's mm into
 * @dst_vma's mm.
 *
 * Both pte locks are held while copying.  The inner loop is left early to
 * reschedule or relieve lock contention, to add a swap count continuation,
 * or to preallocate a folio; after that the function restarts at "again"
 * from the address it had reached.
 *
 * Returns 0 on success (also when the source pte table could not be
 * mapped), -ENOMEM when a pte table, swap count continuation or
 * preallocated folio cannot be allocated, and -EBUSY or -EHWPOISON when
 * those are reported by the per-pte helpers.
 */
static int
copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
        pmd_t dummy_pmdval;
        pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, max_nr, ret = 0;
        int rss[NR_MM_COUNTERS];
        swp_entry_t entry = (swp_entry_t){0};
        struct folio *prealloc = NULL;
        int nr;

again:
        /* Every pass starts with fresh progress and rss accounting. */
        progress = 0;
        init_rss_vec(rss);

        /*
         * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
         * error handling here, assume that exclusive mmap_lock on dst and src
         * protects anon from unexpected THP transitions; with shmem and file
         * protected by mmap_lock-less collapse skipping areas with anon_vma
         * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
         * can remove such assumptions later, but this is good enough for now.
         */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * We already hold the exclusive mmap_lock, the copy_pte_range() and
         * retract_page_tables() are using vma->anon_vma to be exclusive, so
         * the PTE page is stable, and there is no need to get pmdval and do
         * pmd_same() check.
         */
        src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval,
                                           &src_ptl);
        if (!src_pte) {
                pte_unmap_unlock(dst_pte, dst_ptl);
                /* ret == 0 */
                goto out;
        }
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        /* Remember the mapped bases: the cursors below advance. */
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
        arch_enter_lazy_mmu_mode();

        do {
                /* Default step; also used by the "continue" paths below. */
                nr = 1;

                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
                 */
                if (progress >= 32) {
                        progress = 0;
                        if (need_resched() ||
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
                ptent = ptep_get(src_pte);
                if (pte_none(ptent)) {
                        progress++;
                        continue;
                }
                if (unlikely(!pte_present(ptent))) {
                        ret = copy_nonpresent_pte(dst_mm, src_mm,
                                                  dst_pte, src_pte,
                                                  dst_vma, src_vma,
                                                  addr, rss);
                        if (ret == -EIO) {
                                /* Keep the entry for the continuation below. */
                                entry = pte_to_swp_entry(ptep_get(src_pte));
                                break;
                        } else if (ret == -EBUSY) {
                                break;
                        } else if (!ret) {
                                progress += 8;
                                continue;
                        }
                        /* Re-read: the source pte was changed by the helper. */
                        ptent = ptep_get(src_pte);
                        VM_WARN_ON_ONCE(!pte_present(ptent));

                        /*
                         * Device exclusive entry restored, continue by copying
                         * the now present pte.
                         */
                        WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_ptes() will clear `*prealloc' if consumed */
                max_nr = (end - addr) / PAGE_SIZE;
                ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
                                        ptent, addr, max_nr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
                 * If copy failed due to hwpoison in source page, break out.
                 */
                if (unlikely(ret == -EAGAIN || ret == -EHWPOISON))
                        break;
                if (unlikely(prealloc)) {
                        /*
                         * pre-alloc page cannot be reused by next time so as
                         * to strictly follow mempolicy (e.g., alloc_page_vma()
                         * will allocate page according to address).  This
                         * could only happen if one pinned pte changed.
                         */
                        folio_put(prealloc);
                        prealloc = NULL;
                }
                /* A non-negative return is the number of ptes copied. */
                nr = ret;
                progress += 8 * nr;
        } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
                 addr != end);

        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_src_pte, src_ptl);
        /* Fold this pass's counter deltas into the destination mm. */
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();

        /* Resolve, with the locks dropped, whatever ended the loop early. */
        if (ret == -EIO) {
                VM_WARN_ON_ONCE(!entry.val);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
        } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) {
                goto out;
        } else if (ret ==  -EAGAIN) {
                prealloc = folio_prealloc(src_mm, src_vma, addr, false);
                /* prealloc is NULL here, so nothing needs to be released. */
                if (!prealloc)
                        return -ENOMEM;
        } else if (ret < 0) {
                VM_WARN_ON_ONCE(1);
        }

        /* We've captured and resolved the error. Reset, try again. */
        ret = 0;

        if (addr != end)
                goto again;
out:
        /* Release a preallocated folio that was never consumed. */
        if (unlikely(prealloc))
                folio_put(prealloc);
        return ret;
}

/*
 * Copy the PMD-level entries covering [addr, end) from the source PUD to the
 * destination PUD, allocating the destination PMD table on demand.
 *
 * Huge/swap/devmap PMDs are handed to copy_huge_pmd(); everything else goes
 * through copy_pte_range().  Returns 0 on success and -ENOMEM on any failure.
 */
static inline int
copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        unsigned long next;
        pmd_t *dst_pmd;
        pmd_t *src_pmd;

        dst_pmd = pmd_alloc(dst_mm, dst_pud, addr);
        if (!dst_pmd)
                return -ENOMEM;

        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);

                if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) ||
                    pmd_devmap(*src_pmd)) {
                        int rc;

                        VM_BUG_ON_VMA(next - addr != HPAGE_PMD_SIZE, src_vma);
                        rc = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
                                           addr, dst_vma, src_vma);
                        /* Fully handled at PMD level: move to the next entry. */
                        if (!rc)
                                continue;
                        if (rc == -ENOMEM)
                                return rc;
                        /* Any other result falls through to the PTE path. */
                }

                /* Skip empty/bad source entries; otherwise copy the PTEs. */
                if (!pmd_none_or_clear_bad(src_pmd) &&
                    copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pmd++, src_pmd++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the PUD-level entries covering [addr, end) from the source P4D to the
 * destination P4D, allocating the destination PUD table on demand.
 *
 * Huge/devmap PUDs are handed to copy_huge_pud(); everything else descends
 * into copy_pmd_range().  Returns 0 on success and -ENOMEM on any failure.
 */
static inline int
copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr,
               unsigned long end)
{
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        unsigned long next;
        pud_t *dst_pud;
        pud_t *src_pud;

        dst_pud = pud_alloc(dst_mm, dst_p4d, addr);
        if (!dst_pud)
                return -ENOMEM;

        src_pud = pud_offset(src_p4d, addr);
        do {
                next = pud_addr_end(addr, end);

                if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) {
                        int rc;

                        VM_BUG_ON_VMA(next - addr != HPAGE_PUD_SIZE, src_vma);
                        rc = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud,
                                           addr, src_vma);
                        /* Fully handled at PUD level: move to the next entry. */
                        if (!rc)
                                continue;
                        if (rc == -ENOMEM)
                                return rc;
                        /* Any other result falls through to the PMD path. */
                }

                /* Skip empty/bad source entries; otherwise descend a level. */
                if (!pud_none_or_clear_bad(src_pud) &&
                    copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_pud++, src_pud++, addr = next, addr != end);

        return 0;
}

/*
 * Copy the P4D-level entries covering [addr, end) from the source PGD to the
 * destination PGD, allocating the destination P4D table on demand.
 *
 * Returns 0 on success and -ENOMEM if the allocation or any lower level fails.
 */
static inline int
copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
               pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr,
               unsigned long end)
{
        unsigned long next;
        p4d_t *dst_p4d;
        p4d_t *src_p4d;

        dst_p4d = p4d_alloc(dst_vma->vm_mm, dst_pgd, addr);
        if (!dst_p4d)
                return -ENOMEM;

        src_p4d = p4d_offset(src_pgd, addr);
        do {
                next = p4d_addr_end(addr, end);

                /* Skip empty/bad source entries; otherwise descend a level. */
                if (!p4d_none_or_clear_bad(src_p4d) &&
                    copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d,
                                   addr, next))
                        return -ENOMEM;
        } while (dst_p4d++, src_p4d++, addr = next, addr != end);

        return 0;
}

/*
 * Decide whether fork() has to copy this vma's page tables right away.
 *
 * Returns true when the copy is required, false when the child can instead
 * populate the range lazily through page faults on first access.
 */
static bool
vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        /*
         * An eager copy is forced in three cases, checked in this order:
         *
         *  - dst_vma has uffd-wp enabled.  This applies even to file-backed
         *    (e.g. shmem) mappings: the page tables carry the uffd-wp
         *    protection information, which cannot be recovered from the
         *    page cache, so skipping the copy would lose it.
         *  - src_vma is a VM_PFNMAP or VM_MIXEDMAP mapping.
         *  - src_vma has an anon_vma attached.
         *
         * In every other case the ptes are not copied, because a page fault
         * will fill them correctly.  That makes fork much lighter when there
         * are big shared or private read-only mappings; the tradeoff is that
         * copy_page_range is more efficient than faulting.
         */
        return userfaultfd_wp(dst_vma) ||
               (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) ||
               src_vma->anon_vma;
}

/*
 * Copy the page tables that back src_vma into dst_vma's mm.
 *
 * Returns 0 when vma_needs_copy() reports that nothing has to be copied or
 * when the walk completes, the result of copy_hugetlb_page_range() for
 * hugetlb vmas, the error from track_pfn_copy() if that call fails, and
 * -ENOMEM if copy_p4d_range() reports a failure for any part of the range.
 */
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        pgd_t *src_pgd, *dst_pgd;
        unsigned long addr = src_vma->vm_start;
        unsigned long end = src_vma->vm_end;
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        struct mm_struct *src_mm = src_vma->vm_mm;
        struct mmu_notifier_range range;
        unsigned long next, pfn = 0;
        bool is_cow;
        int ret;

        if (!vma_needs_copy(dst_vma, src_vma))
                return 0;

        /* hugetlb vmas are copied entirely by their own helper. */
        if (is_vm_hugetlb_page(src_vma))
                return copy_hugetlb_page_range(dst_mm, src_mm, dst_vma, src_vma);

        /* pfn is filled in here and only consumed by the error path below. */
        if (unlikely(src_vma->vm_flags & VM_PFNMAP)) {
                ret = track_pfn_copy(dst_vma, src_vma, &pfn);
                if (ret)
                        return ret;
        }

        /*
         * We need to invalidate the secondary MMU mappings only when
         * there could be a permission downgrade on the ptes of the
         * parent mm. And a permission downgrade will only happen if
         * is_cow_mapping() returns true.
         */
        is_cow = is_cow_mapping(src_vma->vm_flags);

        if (is_cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
                                        0, src_mm, addr, end);
                mmu_notifier_invalidate_range_start(&range);
                /*
                 * Disabling preemption is not needed for the write side, as
                 * the read side doesn't spin, but goes to the mmap_lock.
                 *
                 * Use the raw variant of the seqcount_t write API to avoid
                 * lockdep complaining about preemptibility.
                 */
                vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src_mm->write_protect_seq);
        }

        ret = 0;
        dst_pgd = pgd_offset(dst_mm, addr);
        src_pgd = pgd_offset(src_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(src_pgd))
                        continue;
                /*
                 * Any non-zero result from the lower levels is reported as
                 * -ENOMEM; break (rather than return) so the COW bracket
                 * opened above is always closed.
                 */
                if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd,
                                            addr, next))) {
                        ret = -ENOMEM;
                        break;
                }
        } while (dst_pgd++, src_pgd++, addr = next, addr != end);

        /* Close the seqcount section and notifier range in reverse order. */
        if (is_cow) {
                raw_write_seqcount_end(&src_mm->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        }
        /* Failed after track_pfn_copy(): pass the recorded pfn back. */
        if (ret && unlikely(src_vma->vm_flags & VM_PFNMAP))
                untrack_pfn_copy(dst_vma, pfn);
        return ret;
}

/*
 * Tell whether COWed (private) pages should be zapped as well.
 *
 * With no details, or with details->reclaim_pt set, everything is zapped;
 * otherwise the caller's even_cows setting decides.
 */
static inline bool should_zap_cows(struct zap_details *details)
{
        /* The caller supplied details and is not reclaiming page tables */
        if (details && !details->reclaim_pt)
                return details->even_cows;

        /* Default: zap all pages */
        return true;
}

/*
 * Decide whether the given folio should be zapped.
 *
 * If COWed pages are to be zapped the answer is yes without looking at the
 * folio; otherwise only non-anonymous folios are zapped.
 */
static inline bool should_zap_folio(struct zap_details *details,
                                    struct folio *folio)
{
        return should_zap_cows(details) || !folio_test_anon(folio);
}

/*
 * Report whether the caller asked for markers to be dropped, i.e. whether
 * ZAP_FLAG_DROP_MARKER is set in details->zap_flags.  A NULL details means
 * no such request.
 */
static inline bool zap_drop_markers(struct zap_details *details)
{
        if (details)
                return details->zap_flags & ZAP_FLAG_DROP_MARKER;

        return false;
}

/*
 * Replace just-cleared (none) ptes with uffd-wp swap special pte markers
 * where required.  The caller must hold the page table lock.
 *
 * @nr consecutive ptes starting at @pte / @addr are visited.  Nothing is
 * installed for anonymous vmas or when the caller asked to drop markers.
 *
 * Returns true if a uffd-wp marker was installed for at least one pte,
 * false otherwise.
 */
static inline bool
zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
                              unsigned long addr, pte_t *pte, int nr,
                              struct zap_details *details, pte_t pteval)
{
        bool installed = false;

#ifdef CONFIG_PTE_MARKER_UFFD_WP
        /* Zapping an anonymous vma always drops everything */
        if (vma_is_anonymous(vma))
                return false;

        if (zap_drop_markers(details))
                return false;

        /* The same pteval is used for every slot: its PFN is irrelevant. */
        for (;; pte++, addr += PAGE_SIZE) {
                if (pte_install_uffd_wp_if_needed(vma, addr, pte, pteval))
                        installed = true;
                if (--nr == 0)
                        break;
        }
#endif
        return installed;
}

/*
 * Zap @nr present PTEs, starting at @pte / @addr, that map consecutive pages
 * of @folio beginning with @page.
 *
 * Clears the PTEs, adjusts the @rss counters, queues the TLB entries for
 * invalidation, optionally installs uffd-wp markers, removes the rmap (unless
 * that is delayed) and hands the pages to the mmu_gather.
 *
 * Outputs:
 *  *force_flush - set when rmap removal is delayed for a dirty non-anon folio
 *                 or when __tlb_remove_folio_pages() returns non-zero.
 *  *force_break - set when __tlb_remove_folio_pages() returns non-zero.
 *  *any_skipped - overwritten with the result of
 *                 zap_install_uffd_wp_if_needed() when the PTE is uffd-wp.
 */
static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, struct folio *folio,
                struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
                unsigned long addr, struct zap_details *details, int *rss,
                bool *force_flush, bool *force_break, bool *any_skipped)
{
        struct mm_struct *mm = tlb->mm;
        bool delay_rmap = false;

        if (!folio_test_anon(folio)) {
                /* Re-read ptent while clearing: dirty/young bits are used below. */
                ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                if (pte_dirty(ptent)) {
                        folio_mark_dirty(folio);
                        if (tlb_delay_rmap(tlb)) {
                                delay_rmap = true;
                                *force_flush = true;
                        }
                }
                if (pte_young(ptent) && likely(vma_has_recency(vma)))
                        folio_mark_accessed(folio);
                rss[mm_counter(folio)] -= nr;
        } else {
                /* We don't need up-to-date accessed/dirty bits. */
                clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                rss[MM_ANONPAGES] -= nr;
        }
        /* Checking a single PTE in a batch is sufficient. */
        arch_check_zapped_pte(vma, ptent);
        tlb_remove_tlb_entries(tlb, pte, nr, addr);
        if (unlikely(userfaultfd_pte_wp(vma, ptent)))
                *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte,
                                                             nr, details, ptent);

        /* With delay_rmap set, rmap removal is left out of this function. */
        if (!delay_rmap) {
                folio_remove_rmap_ptes(folio, page, nr, vma);

                if (unlikely(folio_mapcount(folio) < 0))
                        print_bad_pte(vma, addr, ptent, page);
        }
        /* Non-zero return: ask the caller to flush and stop the current pass. */
        if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
                *force_flush = true;
                *force_break = true;
        }
}

/*
 * Zap or skip at least one present PTE, trying to batch-process subsequent
 * PTEs that map consecutive pages of the same folio.
 *
 * @ptent is the value already read from @pte; @max_nr bounds how many PTEs
 * starting at @pte may be batched.  *any_skipped is set when the folio is
 * deliberately left mapped, or to the uffd-wp marker result when one is
 * evaluated.
 *
 * Returns the number of processed (skipped or zapped) PTEs (at least 1).
 */
static inline int zap_present_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
                unsigned int max_nr, unsigned long addr,
                struct zap_details *details, int *rss, bool *force_flush,
                bool *force_break, bool *any_skipped)
{
        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
        struct mm_struct *mm = tlb->mm;
        struct folio *folio;
        struct page *page;
        int nr;

        page = vm_normal_page(vma, addr, ptent);
        if (!page) {
                /*
                 * No struct page returned by vm_normal_page(): clear just this
                 * PTE, with no rss or rmap accounting.
                 */
                /* We don't need up-to-date accessed/dirty bits. */
                ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
                arch_check_zapped_pte(vma, ptent);
                tlb_remove_tlb_entry(tlb, pte, addr);
                if (userfaultfd_pte_wp(vma, ptent))
                        *any_skipped = zap_install_uffd_wp_if_needed(vma, addr,
                                                pte, 1, details, ptent);
                ksm_might_unmap_zero_page(mm, ptent);
                return 1;
        }

        folio = page_folio(page);
        /* Leave the PTE in place but still count it as processed. */
        if (unlikely(!should_zap_folio(details, folio))) {
                *any_skipped = true;
                return 1;
        }

        /*
         * Make sure that the common "small folio" case is as fast as possible
         * by keeping the batching logic separate.
         */
        if (unlikely(folio_test_large(folio) && max_nr != 1)) {
                nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
                                     NULL, NULL, NULL);

                zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
                                       addr, details, rss, force_flush,
                                       force_break, any_skipped);
                return nr;
        }
        /* Single-PTE case: same helper, called with a literal nr of 1. */
        zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
                               details, rss, force_flush, force_break, any_skipped);
        return 1;
}

/*
 * Zap or skip one or more non-present PTEs starting at @pte / @addr.
 *
 * @ptent is decoded as a swap entry and handled by entry type.  Every early
 * "return 1" below leaves the PTE untouched with *any_skipped == true.  When
 * the PTEs are cleared, *any_skipped is replaced by the result of
 * zap_install_uffd_wp_if_needed().
 *
 * Only genuine swap entries are batched (up to @max_nr); every other type is
 * handled one PTE at a time.
 *
 * Returns the number of PTEs processed (at least 1).
 */
static inline int zap_nonpresent_ptes(struct mmu_gather *tlb,
                struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
                unsigned int max_nr, unsigned long addr,
                struct zap_details *details, int *rss, bool *any_skipped)
{
        swp_entry_t entry;
        int nr = 1;

        /* Assume "skipped" until the PTEs have actually been cleared. */
        *any_skipped = true;
        entry = pte_to_swp_entry(ptent);
        if (is_device_private_entry(entry) ||
                is_device_exclusive_entry(entry)) {
                struct page *page = pfn_swap_entry_to_page(entry);
                struct folio *folio = page_folio(page);

                if (unlikely(!should_zap_folio(details, folio)))
                        return 1;
                /*
                 * Both device private/exclusive mappings should only
                 * work with anonymous page so far, so we don't need to
                 * consider uffd-wp bit when zap. For more information,
                 * see zap_install_uffd_wp_if_needed().
                 */
                WARN_ON_ONCE(!vma_is_anonymous(vma));
                rss[mm_counter(folio)]--;
                folio_remove_rmap_pte(folio, page, vma);
                folio_put(folio);
        } else if (!non_swap_entry(entry)) {
                /* Genuine swap entries, hence a private anon pages */
                if (!should_zap_cows(details))
                        return 1;

                nr = swap_pte_batch(pte, max_nr, ptent);
                rss[MM_SWAPENTS] -= nr;
                free_swap_and_cache_nr(entry, nr);
        } else if (is_migration_entry(entry)) {
                struct folio *folio = pfn_swap_entry_folio(entry);

                if (!should_zap_folio(details, folio))
                        return 1;
                /* Only the rss counter is adjusted here; no rmap or put call. */
                rss[mm_counter(folio)]--;
        } else if (pte_marker_entry_uffd_wp(entry)) {
                /*
                 * For anon: always drop the marker; for file: only
                 * drop the marker if explicitly requested.
                 */
                if (!vma_is_anonymous(vma) && !zap_drop_markers(details))
                        return 1;
        } else if (is_guard_swp_entry(entry)) {
                /*
                 * Ordinary zapping should not remove guard PTE
                 * markers. Only do so if we should remove PTE markers
                 * in general.
                 */
                if (!zap_drop_markers(details))
                        return 1;
        } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) {
                if (!should_zap_cows(details))
                        return 1;
        } else {
                /* We should have covered all the swap entry types */
                pr_alert("unrecognized swap entry 0x%lx\n", entry.val);
                WARN_ON_ONCE(1);
        }
        /* Reached by every branch that did not return early, including the last. */
        clear_not_present_full_ptes(vma->vm_mm, addr, pte, nr, tlb->fullmm);
        *any_skipped = zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details, ptent);

        return nr;
}

/*
 * Handle one step of the PTE walk starting at @pte / @addr, bounded by @end.
 *
 * A leading run of none PTEs is skipped first; if anything remains, the next
 * present or non-present batch is passed to zap_present_ptes() or
 * zap_nonpresent_ptes().
 *
 * Returns the total number of PTEs consumed: the skipped none PTEs plus
 * whatever the zap helper reports.
 */
static inline int do_zap_pte_range(struct mmu_gather *tlb,
                                   struct vm_area_struct *vma, pte_t *pte,
                                   unsigned long addr, unsigned long end,
                                   struct zap_details *details, int *rss,
                                   bool *force_flush, bool *force_break,
                                   bool *any_skipped)
{
        pte_t ptent = ptep_get(pte);
        int max_nr = (end - addr) / PAGE_SIZE;
        int nr = 0;

        /* Skip all consecutive none ptes */
        if (pte_none(ptent)) {
                /*
                 * On break, ptent holds the first non-none PTE found, which
                 * is exactly the entry dispatched below.
                 */
                for (nr = 1; nr < max_nr; nr++) {
                        ptent = ptep_get(pte + nr);
                        if (!pte_none(ptent))
                                break;
                }
                max_nr -= nr;
                /* The whole remaining range was none: nothing left to zap. */
                if (!max_nr)
                        return nr;
                pte += nr;
                addr += nr * PAGE_SIZE;
        }

        if (pte_present(ptent))
                nr += zap_present_ptes(tlb, vma, pte, ptent, max_nr, addr,
                                       details, rss, force_flush, force_break,
                                       any_skipped);
        else
                nr += zap_nonpresent_ptes(tlb, vma, pte, ptent, max_nr, addr,
                                          details, rss, any_skipped);

        return nr;
}

/*
 * Zap the PTEs mapped through @pmd in [addr, end), holding the PTE lock.
 *
 * The walk stops early when need_resched() is true or when a callee sets
 * force_break.  In either case the lock is dropped, pending flushes are
 * performed and the walk restarts at "retry" from the address reached.
 *
 * When PTE-table reclaim is enabled for this range and no PTE was skipped,
 * the PTE page itself is freed at the end, either directly or through
 * try_to_free_pte().
 *
 * Returns the address reached: @end on completion, or the current address
 * if pte_offset_map_lock() fails.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        bool force_flush = false, force_break = false;
        struct mm_struct *mm = tlb->mm;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        pmd_t pmdval;
        unsigned long start = addr;
        bool can_reclaim_pt = reclaim_pt_is_enabled(start, end, details);
        bool direct_reclaim = true;
        int nr;

retry:
        tlb_change_page_size(tlb, PAGE_SIZE);
        /* rss is reset on every pass and folded into the mm before unlocking. */
        init_rss_vec(rss);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return addr;

        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
                bool any_skipped = false;

                /* The lock is about to be dropped: rule out the fast path. */
                if (need_resched()) {
                        direct_reclaim = false;
                        break;
                }

                nr = do_zap_pte_range(tlb, vma, pte, addr, end, details, rss,
                                      &force_flush, &force_break, &any_skipped);
                /* A PTE left behind means the PTE page cannot be reclaimed. */
                if (any_skipped)
                        can_reclaim_pt = false;
                if (unlikely(force_break)) {
                        /*
                         * break skips the loop-condition increments, so
                         * advance addr past the processed PTEs by hand.
                         */
                        addr += nr * PAGE_SIZE;
                        direct_reclaim = false;
                        break;
                }
        } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);

        /*
         * Fast path: try to hold the pmd lock and unmap the PTE page.
         *
         * If the pte lock was released midway (retry case), or if the attempt
         * to hold the pmd lock failed, then we need to recheck all pte entries
         * to ensure they are still none, thereby preventing the pte entries
         * from being repopulated by another thread.
         */
        if (can_reclaim_pt && direct_reclaim && addr == end)
                direct_reclaim = try_get_and_clear_pmd(mm, pmd, &pmdval);

        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();

        /* Do the actual TLB flush before dropping ptl */
        if (force_flush) {
                tlb_flush_mmu_tlbonly(tlb);
                tlb_flush_rmaps(tlb, vma);
        }
        pte_unmap_unlock(start_pte, ptl);

        /*
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
         * memory too. Come back again if we didn't do everything.
         */
        if (force_flush)
                tlb_flush_mmu(tlb);

        if (addr != end) {
                cond_resched();
                /*
                 * Clear the per-pass flags; direct_reclaim and
                 * can_reclaim_pt deliberately keep their values.
                 */
                force_flush = false;
                force_break = false;
                goto retry;
        }

        if (can_reclaim_pt) {
                /* pmdval is only valid when try_get_and_clear_pmd() succeeded. */
                if (direct_reclaim)
                        free_pte(mm, start, tlb, pmdval);
                else
                        try_to_free_pte(mm, pmd, start, tlb);
        }

        return addr;
}

/*
 * Zap every PMD entry under @pud that covers [addr, end).
 *
 * Huge/swap/devmap PMDs are zapped whole via zap_huge_pmd() when the range
 * covers them completely and are split otherwise; remaining non-empty
 * entries are passed to zap_pte_range().
 *
 * Returns the address reached (equal to @end when the loop finishes).
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        /* Partial coverage: split so the PTE path can handle it. */
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
                        else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
                                addr = next;
                                continue;
                        }
                        /* fall through */
                } else if (details && details->single_folio &&
                           folio_test_pmd_mappable(details->single_folio) &&
                           next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
                        spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
                        /*
                         * Take and drop THP pmd lock so that we cannot return
                         * prematurely, while zap_huge_pmd() has cleared *pmd,
                         * but not yet decremented compound_mapcount().
                         */
                        spin_unlock(ptl);
                }
                if (pmd_none(*pmd)) {
                        addr = next;
                        continue;
                }
                addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
                /*
                 * zap_pte_range() stopped short of next: cancel the pmd++ in
                 * the loop condition so the same entry is revisited from the
                 * returned addr.
                 */
                if (addr != next)
                        pmd--;
        } while (pmd++, cond_resched(), addr != end);

        return addr;
}

/*
 * Zap every PUD entry under @p4d that covers [addr, end).
 *
 * Huge/devmap PUDs are zapped whole via zap_huge_pud() when the range covers
 * them completely and are split otherwise; remaining non-empty entries are
 * passed to zap_pmd_range().
 *
 * Returns the address reached (equal to @end when the loop finishes).
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
                        if (next - addr != HPAGE_PUD_SIZE) {
                                mmap_assert_locked(tlb->mm);
                                split_huge_pud(vma, pud, addr);
                        } else if (zap_huge_pud(tlb, vma, pud, addr))
                                /* Zapped whole: next is still pud_addr_end(). */
                                goto next;
                        /* fall through */
                }
                /* continue goes to the loop condition, skipping cond_resched(). */
                if (pud_none_or_clear_bad(pud))
                        continue;
                /* The lower level reports how far it got; resume from there. */
                next = zap_pmd_range(tlb, vma, pud, addr, next, details);
next:
                cond_resched();
        } while (pud++, addr = next, addr != end);

        return addr;
}

/*
 * Zap every P4D entry under @pgd that covers [addr, end), descending into
 * zap_pud_range() for each entry that is neither empty nor bad.
 *
 * Returns the address reached (equal to @end when the loop finishes).
 */
static inline unsigned long zap_p4d_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
{
        unsigned long next;
        p4d_t *p4d;

        p4d = p4d_offset(pgd, addr);
        do {
                next = p4d_addr_end(addr, end);

                /* Empty/bad entries are stepped over using the boundary. */
                if (!p4d_none_or_clear_bad(p4d))
                        next = zap_pud_range(tlb, vma, p4d, addr, next,
                                             details);
        } while (p4d++, addr = next, addr != end);

        return addr;
}

/*
 * Zap the page-table entries of @vma covering [addr, end), walking from the
 * PGD level downwards between tlb_start_vma() and tlb_end_vma().
 *
 * The range must be non-empty: addr >= end triggers BUG_ON().
 */
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details)
{
        unsigned long next;
        pgd_t *pgd;

        BUG_ON(addr >= end);

        tlb_start_vma(tlb, vma);

        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);

                /* Empty/bad entries are stepped over using the boundary. */
                if (!pgd_none_or_clear_bad(pgd))
                        next = zap_p4d_range(tlb, vma, pgd, addr, next,
                                             details);
        } while (pgd++, addr = next, addr != end);

        tlb_end_vma(tlb, vma);
}


/*
 * Unmap the part of @vma that intersects [start_addr, end_addr).
 *
 * Does nothing when the clamped range falls outside the vma.  Otherwise
 * notifies uprobes for file-backed vmas, calls untrack_pfn() for VM_PFNMAP
 * vmas, and then zaps the range through the hugetlb or the regular
 * page-table path.
 */
static void unmap_single_vma(struct mmu_gather *tlb,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr,
                struct zap_details *details, bool mm_wr_locked)
{
        unsigned long start = max(vma->vm_start, start_addr);
        unsigned long end;

        if (start >= vma->vm_end)
                return;

        end = min(vma->vm_end, end_addr);
        if (end <= vma->vm_start)
                return;

        if (vma->vm_file)
                uprobe_munmap(vma, start, end);

        if (unlikely(vma->vm_flags & VM_PFNMAP))
                untrack_pfn(vma, 0, 0, mm_wr_locked);

        /* Nothing to zap for an empty range. */
        if (start == end)
                return;

        if (likely(!is_vm_hugetlb_page(vma))) {
                unmap_page_range(tlb, vma, start, end, details);
                return;
        }

        /*
         * Testing vma->vm_file here is undesirable, since a valid hugetlb
         * area should always have one.  It can be NULL, however, in the
         * error cleanup path of mmap_region(): when the hugetlbfs ->mmap
         * method fails, mmap_region() nullifies vma->vm_file before calling
         * this function to clean up.  No pte has been set up at that point,
         * so doing nothing is safe.
         */
        if (vma->vm_file) {
                zap_flags_t zap_flags = details ? details->zap_flags : 0;

                __unmap_hugepage_range(tlb, vma, start, end, NULL, zap_flags);
        }
}

/**
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlb: address of the caller's struct mmu_gather
 * @mas: the maple state, used with mas_find() to fetch each following vma
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @tree_end: The maximum index to check; the search stops at @tree_end - 1
 * @mm_wr_locked: lock flag, passed through to unmap_single_vma()
 *
 * Unmap all pages in the vma list, starting with @vma and continuing until
 * mas_find() returns NULL or a zero entry.
 *
 * Only addresses between @start_addr and @end_addr will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *vma, unsigned long start_addr,
                unsigned long end_addr, unsigned long tree_end,
                bool mm_wr_locked)
{
        struct zap_details details = {
                .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP,
                /* Careful - we need to zap private pages too! */
                .even_cows = true,
        };
        struct mmu_notifier_range range;

        mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm,
                                start_addr, end_addr);
        mmu_notifier_invalidate_range_start(&range);

        for (;;) {
                /* Fresh copies per vma: hugetlb_zap_begin() may adjust them. */
                unsigned long start = start_addr;
                unsigned long end = end_addr;

                hugetlb_zap_begin(vma, &start, &end);
                unmap_single_vma(tlb, vma, start, end, &details, mm_wr_locked);
                hugetlb_zap_end(vma, &details);

                vma = mas_find(mas, tree_end - 1);
                if (!vma || unlikely(xa_is_zero(vma)))
                        break;
        }

        mmu_notifier_invalidate_range_end(&range);
}

/**
 * zap_page_range_single - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of shared cache invalidation
 *
 * The range must fit into one VMA.
 *
 * Sets up its own mmu_gather and MMU_NOTIFY_CLEAR notifier range around a
 * single unmap_single_vma() call.
 */
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                unsigned long size, struct zap_details *details)
{
        const unsigned long end = address + size;
        struct mmu_notifier_range range;
        struct mmu_gather tlb;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, end);
        /* Only the notifier range is passed here; address/end stay as given. */
        hugetlb_zap_begin(vma, &range.start, &range.end);
        tlb_gather_mmu(&tlb, vma->vm_mm);
        update_hiwater_rss(vma->vm_mm);
        mmu_notifier_invalidate_range_start(&range);
        /*
         * unmap 'address-end' not 'range.start-range.end' as range
         * could have been expanded for hugetlb pmd sharing.
         */
        unmap_single_vma(&tlb, vma, address, end, details, false);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);
        hugetlb_zap_end(vma, details);
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma; if it is
 * not, or if the vma is not VM_PFNMAP, the call silently does nothing.
 *
 */
void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                unsigned long size)
{
        if (range_in_vma(vma, address, address + size) &&
            (vma->vm_flags & VM_PFNMAP))
                zap_page_range_single(vma, address, size, NULL);
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Walk from the PGD down to the PMD entry for @addr in @mm, allocating the
 * P4D, PUD and PMD levels as needed.
 *
 * Returns the PMD pointer, or NULL if any allocation fails.
 */
static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d = p4d_alloc(mm, pgd, addr);
        /* Each level is attempted only if the one above it was obtained. */
        pud_t *pud = p4d ? pud_alloc(mm, p4d, addr) : NULL;
        pmd_t *pmd = pud ? pmd_alloc(mm, pud, addr) : NULL;

        if (!pmd)
                return NULL;

        VM_BUG_ON(pmd_trans_huge(*pmd));
        return pmd;
}

/*
 * Look up (allocating page-table levels as needed) the PTE for @addr in @mm
 * through walk_to_pmd() and pte_alloc_map_lock(), passing @ptl to the latter.
 *
 * Returns NULL if the walk to the PMD fails; otherwise returns whatever
 * pte_alloc_map_lock() yields.
 */
pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                        spinlock_t **ptl)
{
        pmd_t *pmd = walk_to_pmd(mm, addr);

        return pmd ? pte_alloc_map_lock(mm, pmd, addr, ptl) : NULL;
}

/*
 * Decide whether the shared zeropage may be mapped into @vma by the page
 * insertion helpers. Not meant to be asked about VM_PFNMAP vmas (warns once).
 */
static bool vm_mixed_zeropage_allowed(struct vm_area_struct *vma)
{
        VM_WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP);

        /*
         * An mm-wide veto always wins. Whoever introduces such a veto after
         * zeropages might already be mapped has to scan the page tables and
         * bail out on any zeropage; in COW mappings they can be unshared
         * with FAULT_FLAG_UNSHARE faults.
         */
        if (mm_forbids_zeropage(vma->vm_mm))
                return false;

        /*
         * Unproblematic cases: COW mappings (zeropages are common there) and
         * mappings that can never hold a writable PTE.
         */
        if (is_cow_mapping(vma->vm_flags) ||
            !(vma->vm_flags & (VM_WRITE | VM_MAYWRITE)))
                return true;

        /*
         * Having vm_ops->pfn_mkwrite is necessary but not sufficient: GUP
         * could find the shared zeropage and longterm-pin it, and once
         * pfn_mkwrite replaces the zeropage with a different page, what is
         * mapped would differ from what GUP looked up. Only FSDAX
         * (incompatible with FOLL_LONGTERM) and VM_IO (incompatible with GUP
         * altogether, see check_vma_flags) are therefore let through.
         */
        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite)
                return false;

        return vma_is_fsdax(vma) || (vma->vm_flags & VM_IO);
}

/*
 * Sanity-check @page before it is mapped into @vma.
 *
 * Returns 0 if the page may be inserted and -EINVAL otherwise. For pages
 * other than the zeropage that pass the checks, the folio's dcache is
 * flushed as a side effect.
 */
static int validate_page_before_insert(struct vm_area_struct *vma,
                                       struct page *page)
{
        struct folio *folio = page_folio(page);

        /* A folio nobody holds a reference on must not be mapped. */
        if (!folio_ref_count(folio))
                return -EINVAL;

        /* The zeropage is judged purely by the vma's zeropage policy. */
        if (unlikely(is_zero_folio(folio)))
                return vm_mixed_zeropage_allowed(vma) ? 0 : -EINVAL;

        if (folio_test_anon(folio))
                return -EINVAL;
        if (folio_test_slab(folio))
                return -EINVAL;
        if (page_has_type(page))
                return -EINVAL;

        flush_dcache_folio(folio);
        return 0;
}

/*
 * Install @page at @addr through the already mapped and locked @pte.
 *
 * If the slot is already populated:
 *   - without @mkwrite the entry is left alone and -EBUSY is returned;
 *   - with @mkwrite the existing entry must reference the same pfn as @page
 *     (otherwise -EFAULT); it is then upgraded in place via maybe_mkwrite()
 *     and pte_mkyoung().
 *
 * If the slot is empty, a new entry is built from @page and @prot. The
 * zeropage is mapped as a special PTE with no refcount/rmap/counter
 * changes; any other page gets a folio reference, a file rmap and is
 * accounted in the mm's file counter.
 *
 * Return: 0 on success, -EBUSY or -EFAULT as described above.
 */
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                                unsigned long addr, struct page *page,
                                pgprot_t prot, bool mkwrite)
{
        struct folio *folio = page_folio(page);
        pte_t pteval = ptep_get(pte);

        if (!pte_none(pteval)) {
                if (!mkwrite)
                        return -EBUSY;

                /* see insert_pfn(). */
                if (pte_pfn(pteval) != page_to_pfn(page)) {
                        WARN_ON_ONCE(!is_zero_pfn(pte_pfn(pteval)));
                        return -EFAULT;
                }
                pteval = maybe_mkwrite(pteval, vma);
                pteval = pte_mkyoung(pteval);
                if (ptep_set_access_flags(vma, addr, pte, pteval, 1))
                        update_mmu_cache(vma, addr, pte);
                return 0;
        }

        /* Ok, finally just insert the thing.. */
        pteval = mk_pte(page, prot);
        if (unlikely(is_zero_folio(folio))) {
                pteval = pte_mkspecial(pteval);
        } else {
                /*
                 * pteval already holds mk_pte(page, prot) from above; there
                 * is no need to build it a second time here.
                 */
                folio_get(folio);
                if (mkwrite) {
                        pteval = pte_mkyoung(pteval);
                        pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
                }
                inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
                folio_add_file_rmap_pte(folio, page, vma);
        }
        set_pte_at(vma->vm_mm, addr, pte, pteval);
        return 0;
}

/*
 * Validate @page, take the locked PTE for @addr and install the page there.
 *
 * Return: 0 on success, the validation error, -ENOMEM if no locked PTE
 * could be obtained, or whatever insert_page_into_pte_locked() reports.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page, pgprot_t prot, bool mkwrite)
{
        spinlock_t *ptl;
        pte_t *pte;
        int retval;

        retval = validate_page_before_insert(vma, page);
        if (retval)
                return retval;

        pte = get_locked_pte(vma->vm_mm, addr, &ptl);
        if (!pte)
                return -ENOMEM;

        retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
                                        mkwrite);
        pte_unmap_unlock(pte, ptl);
        return retval;
}

/*
 * Per-page step of insert_pages(): validate @page, then install it through
 * the already locked @pte, never upgrading an existing entry
 * (mkwrite == false).
 */
static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
{
        int err = validate_page_before_insert(vma, page);

        return err ? err :
                insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
}

/*
 * insert_pages() amortizes the cost of spinlock operations
 * when inserting pages in a loop.
 *
 * Pages from @pages are mapped at consecutive page-sized steps starting at
 * @addr. The work is split per pmd, and within a pmd into batches of at
 * most 8 PTEs, each batch being written under a single PTE lock hold.
 *
 * @num is in/out: on entry the number of pages to map, on return the number
 * of pages that were NOT mapped (0 when everything was mapped).
 *
 * Return: 0 on success; -EFAULT if the pmd cannot be reached or the PTE
 * table cannot be mapped; -ENOMEM if the PTE table cannot be allocated; or
 * the error reported for the first page that failed to insert.
 */
static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num, pgprot_t prot)
{
        pmd_t *pmd = NULL;
        pte_t *start_pte, *pte;
        spinlock_t *pte_lock;
        struct mm_struct *const mm = vma->vm_mm;
        unsigned long curr_page_idx = 0;
        unsigned long remaining_pages_total = *num;
        unsigned long pages_to_write_in_pmd;
        int ret;
more:
        /* One pass of this label handles the pages that fall into one pmd. */
        ret = -EFAULT;
        pmd = walk_to_pmd(mm, addr);
        if (!pmd)
                goto out;

        /* Never run past the last PTE slot of the table under this pmd. */
        pages_to_write_in_pmd = min_t(unsigned long,
                remaining_pages_total, PTRS_PER_PTE - pte_index(addr));

        /* Allocate the PTE if necessary; takes PMD lock once only. */
        ret = -ENOMEM;
        if (pte_alloc(mm, pmd))
                goto out;

        while (pages_to_write_in_pmd) {
                int pte_idx = 0;
                /* Cap the number of PTEs written per lock acquisition. */
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);

                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
                if (!start_pte) {
                        ret = -EFAULT;
                        goto out;
                }
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
                        if (unlikely(err)) {
                                pte_unmap_unlock(start_pte, pte_lock);
                                ret = err;
                                /*
                                 * Account for the pages of this batch that
                                 * did get mapped before the failure.
                                 */
                                remaining_pages_total -= pte_idx;
                                goto out;
                        }
                        addr += PAGE_SIZE;
                        ++curr_page_idx;
                }
                pte_unmap_unlock(start_pte, pte_lock);
                pages_to_write_in_pmd -= batch_size;
                remaining_pages_total -= batch_size;
        }
        /* Pages left over belong to the next pmd: start another pass. */
        if (remaining_pages_total)
                goto more;
        ret = 0;
out:
        /* Report back how many pages were left unmapped. */
        *num = remaining_pages_total;
        return ret;
}

/**
 * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock.
 * @vma: user vma to map to
 * @addr: target start user address of these pages
 * @pages: source kernel pages
 * @num: in: number of pages to map. out: number of pages that were *not*
 * mapped. (0 means all pages were successfully mapped).
 *
 * Preferred over vm_insert_page() when inserting multiple pages.
 *
 * In case of error, we may have mapped a subset of the provided
 * pages. It is the caller's responsibility to account for this case.
 *
 * The same restrictions apply as in vm_insert_page().
 */
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num)
{
        /* Address of the last byte the request would cover. */
        const unsigned long last_byte = addr + (*num * PAGE_SIZE) - 1;

        if (addr < vma->vm_start)
                return -EFAULT;
        if (last_byte >= vma->vm_end)
                return -EFAULT;

        /*
         * First insertion into this vma: turn it into a mixed map. The
         * BUG_ON()s insist that mmap_read_trylock() fails here and that the
         * vma is not a VM_PFNMAP one.
         */
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }

        /* Per-page checks (refcount etc.) happen right before each mapping. */
        return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_insert_pages);

/**
 * vm_insert_page - insert single page into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @page: source kernel page
 *
 * This allows drivers to insert individual pages they've allocated
 * into a user vma. The zeropage is supported in some VMAs,
 * see vm_mixed_zeropage_allowed().
 *
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
 * (see split_page()).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
 * that. Your vma protection will have to be set up correctly, which
 * means that if you want a shared writable mapping, you'd better
 * ask for a shared writable mapping!
 *
 * The page does not need to be reserved.
 *
 * Usually this function is called from f_op->mmap() handler
 * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
 * Caller must set VM_MIXEDMAP on vma if it wants to call this
 * function from other places, for example from page-fault handler.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
                        struct page *page)
{
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
        if (!(vma->vm_flags & VM_MIXEDMAP)) {
                BUG_ON(mmap_read_trylock(vma->vm_mm));
                BUG_ON(vma->vm_flags & VM_PFNMAP);
                vm_flags_set(vma, VM_MIXEDMAP);
        }
        return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);

/*
 * __vm_map_pages - maps range of kernel pages into user vma
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 * @offset: user's requested vm_pgoff
 *
 * This allows drivers to map range of kernel pages into a user vma.
 * The zeropage is supported in some VMAs, see
 * vm_mixed_zeropage_allowed().
 *
 * Return: 0 on success and error code otherwise.
 */
static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num, unsigned long offset)
{
        unsigned long count = vma_pages(vma);
        unsigned long uaddr = vma->vm_start;
        /*
         * Same type as @count: a signed int index would overflow before
         * reaching a count larger than INT_MAX.
         */
        unsigned long i;
        int ret;

        /* Fail if the user requested offset is beyond the end of the object */
        if (offset >= num)
                return -ENXIO;

        /* Fail if the user requested size exceeds available object size */
        if (count > num - offset)
                return -ENXIO;

        /* Map one page per PAGE_SIZE step, stopping at the first failure. */
        for (i = 0; i < count; i++) {
                ret = vm_insert_page(vma, uaddr, pages[offset + i]);
                if (ret < 0)
                        return ret;
                uaddr += PAGE_SIZE;
        }

        return 0;
}

/**
 * vm_map_pages - map a range of kernel pages starting at a non-zero offset
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Maps an object consisting of @num pages, catering for the user's
 * requested vm_pgoff
 *
 * If we fail to insert any page into the vma, the function will return
 * immediately leaving any previously inserted pages present.  Callers
 * from the mmap handler may immediately return the error as their caller
 * will destroy the vma, removing any successfully inserted pages. Other
 * callers should make their own arrangements for calling unmap_region().
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        /* Start at the page offset recorded in the vma. */
        const unsigned long first_page = vma->vm_pgoff;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages);

/**
 * vm_map_pages_zero - map a range of kernel pages starting at offset zero
 * @vma: user vma to map to
 * @pages: pointer to array of source kernel pages
 * @num: number of pages in page array
 *
 * Similar to vm_map_pages(), except that it explicitly sets the offset
 * to 0. This function is intended for the drivers that did not consider
 * vm_pgoff.
 *
 * Context: Process context. Called by mmap handlers.
 * Return: 0 on success and error code otherwise.
 */
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num)
{
        /* vma->vm_pgoff is deliberately ignored: always start at page 0. */
        const unsigned long first_page = 0;

        return __vm_map_pages(vma, pages, num, first_page);
}
EXPORT_SYMBOL(vm_map_pages_zero);

/*
 * Install @pfn at @addr in @vma without going through a struct page.
 *
 * An empty PTE is filled with a devmap or special entry built from @pfn and
 * @prot (made young/dirty/maybe-writable when @mkwrite is set). A populated
 * PTE is left alone unless @mkwrite is set and it already maps @pfn, in
 * which case it is upgraded in place.
 *
 * Return: VM_FAULT_OOM if no locked PTE could be obtained, VM_FAULT_NOPAGE
 * in every other case (including when an existing entry was left untouched).
 */
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn, pgprot_t prot, bool mkwrite)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t *pte, entry;
        spinlock_t *ptl;

        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return VM_FAULT_OOM;
        entry = ptep_get(pte);
        if (!pte_none(entry)) {
                if (mkwrite) {
                        /*
                         * For read faults on private mappings the PFN passed
                         * in may not match the PFN we have mapped if the
                         * mapped PFN is a writeable COW page.  In the mkwrite
                         * case we are creating a writable PTE for a shared
                         * mapping and we expect the PFNs to match. If they
                         * don't match, we are likely racing with block
                         * allocation and mapping invalidation so just skip the
                         * update.
                         */
                        if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
                                goto out_unlock;
                        }
                        entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (ptep_set_access_flags(vma, addr, pte, entry, 1))
                                update_mmu_cache(vma, addr, pte);
                }
                /* Populated and no (further) upgrade wanted: nothing to do. */
                goto out_unlock;
        }

        /* Ok, finally just insert the thing.. */
        if (pfn_t_devmap(pfn))
                entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
        else
                entry = pte_mkspecial(pfn_t_pte(pfn, prot));

        if (mkwrite) {
                entry = pte_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        }

        set_pte_at(mm, addr, pte, entry);
        update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */

out_unlock:
        pte_unmap_unlock(pte, ptl);
        return VM_FAULT_NOPAGE;
}

/**
 * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 * @pgprot: pgprot flags for the inserted page
 *
 * This is exactly like vmf_insert_pfn(), except that it allows drivers
 * to override pgprot on a per-page basis.
 *
 * This only makes sense for IO mappings, and it makes no sense for
 * COW mappings.  In general, using multiple vmas is preferable;
 * vmf_insert_pfn_prot should only be used if using multiple VMAs is
 * impractical.
 *
 * pgprot typically only differs from @vma->vm_page_prot when drivers set
 * caching- and encryption bits different than those of @vma->vm_page_prot,
 * because the caching- or encryption mode may not be known at mmap() time.
 *
 * This is ok as long as @vma->vm_page_prot is not used by the core vm
 * to set caching and encryption bits for those vmas (except for COW pages).
 * This is ensured by core vm only modifying these page table entries using
 * functions that don't touch caching- or encryption bits, using pte_modify()
 * if needed. (See for example mprotect()).
 *
 * Also when new page-table entries are created, this is only done using the
 * fault() callback, and never using the value of vma->vm_page_prot,
 * except for page-table entries that point to anonymous pages as the result
 * of COW.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot)
{
        const unsigned long map_kind =
                vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP);
        pfn_t dev_pfn;

        /*
         * Architectures with pte_special could technically do without these
         * restrictions (the same holds for remap_pfn_range). They are kept
         * for everybody so that testing stays consistent and all
         * architectures have feature parity.
         *
         * The vma must be exactly one of VM_PFNMAP / VM_MIXEDMAP; a
         * VM_PFNMAP vma must not be a COW mapping; a VM_MIXEDMAP vma must
         * not be handed a pfn for which pfn_valid() holds.
         */
        BUG_ON(!map_kind);
        BUG_ON(map_kind == (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
        BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));

        if (addr < vma->vm_start || addr >= vma->vm_end ||
            !pfn_modify_allowed(pfn, pgprot))
                return VM_FAULT_SIGBUS;

        dev_pfn = __pfn_to_pfn_t(pfn, PFN_DEV);
        track_pfn_insert(vma, &pgprot, dev_pfn);

        return insert_pfn(vma, addr, dev_pfn, pgprot, false);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);

/**
 * vmf_insert_pfn - insert single pfn into user vma
 * @vma: user vma to map to
 * @addr: target user address of this page
 * @pfn: source kernel pfn
 *
 * Similar to vm_insert_page, this allows drivers to insert individual pages
 * they've allocated into a user vma. Same comments apply.
 *
 * This function should only be called from a vm_ops->fault handler, and
 * in that case the handler should return the result of this function.
 *
 * vma cannot be a COW mapping.
 *
 * As this is called only for pages that do not currently exist, we
 * do not need to flush old virtual caches or the TLB.
 *
 * Context: Process context.  May allocate using %GFP_KERNEL.
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn)
{
        return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
}
EXPORT_SYMBOL(vmf_insert_pfn);

/*
 * Tell whether @pfn may be inserted into @vma by the mixed-map helpers.
 * The zeropage is refused for mkwrite insertions and whenever the vma's
 * zeropage policy forbids it.
 */
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
{
        const bool zero_pfn = is_zero_pfn(pfn_t_to_pfn(pfn));

        if (unlikely(zero_pfn) &&
            (mkwrite || !vm_mixed_zeropage_allowed(vma)))
                return false;

        /* these checks mirror the abort conditions in vm_normal_page */
        return (vma->vm_flags & VM_MIXEDMAP) ||
               pfn_t_devmap(pfn) ||
               pfn_t_special(pfn) ||
               zero_pfn;
}

/*
 * Common backend of vmf_insert_mixed() and vmf_insert_mixed_mkwrite(): map
 * @pfn at @addr in @vma, through insert_page() when the page has to be
 * refcounted and through insert_pfn() otherwise.
 */
static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn, bool mkwrite)
{
        pgprot_t pgprot = vma->vm_page_prot;
        int err;

        if (!vm_mixed_ok(vma, pfn, mkwrite) ||
            addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
                return VM_FAULT_SIGBUS;

        /*
         * Without pte special we depend on the pfn_valid() based VM_MIXEDMAP
         * scheme (see vm_normal_page), so a pfn_valid page *must* be
         * refcounted and therefore has to go through insert_page(). A
         * zero_pfn inserted into a VM_MIXEDMAP without pte special would be
         * refcounted there like a normal page. Everything else takes the
         * raw insert_pfn() route.
         */
        if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) ||
            pfn_t_devmap(pfn) || !pfn_t_valid(pfn))
                return insert_pfn(vma, addr, pfn, pgprot, mkwrite);

        /*
         * From here on we are committed to insert_page(), regardless of
         * whether the caller specified flags that result in
         * pfn_t_has_page() == false.
         */
        err = insert_page(vma, addr, pfn_to_page(pfn_t_to_pfn(pfn)), pgprot,
                          mkwrite);
        if (err == -ENOMEM)
                return VM_FAULT_OOM;

        /* -EBUSY (slot already populated) is reported like success. */
        return (err < 0 && err != -EBUSY) ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
}

/*
 * Fault-path wrapper around insert_page(): map @page at the faulting address
 * of @vmf, with @write selecting the mkwrite behaviour, and translate the
 * errno result into a vm_fault_t.
 */
vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
                        bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        const unsigned long fault_addr = vmf->address;
        int err;

        if (fault_addr < vma->vm_start || fault_addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        err = insert_page(vma, fault_addr, page, vma->vm_page_prot, write);
        if (err == -ENOMEM)
                return VM_FAULT_OOM;

        /* -EBUSY (slot already populated) is reported like success. */
        return (err < 0 && err != -EBUSY) ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_page_mkwrite);

/* Insert @pfn at @addr without requesting a writable upgrade. */
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                pfn_t pfn)
{
        const bool mkwrite = false;

        return __vm_insert_mixed(vma, addr, pfn, mkwrite);
}
EXPORT_SYMBOL(vmf_insert_mixed);

/*
 * If the insertion of the PTE failed because someone else already added a
 * different entry in the meantime, we treat that as success as we assume
 * the same entry was actually inserted.
 */
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn)
{
        /* Same path as vmf_insert_mixed(), but with mkwrite requested. */
        const bool mkwrite = true;

        return __vm_insert_mixed(vma, addr, pfn, mkwrite);
}

/*
 * Map the physical page frames starting at @pfn into the PTEs that cover
 * [@addr, @end) below @pmd, one frame per page.
 *
 * Every target PTE must still be empty: a populated entry trips BUG_ON()
 * instead of being replaced. Each entry is written as a special PTE built
 * with pfn_pte().
 *
 * NOTE(review): this comment used to say that old mappings are removed and
 * that references to nonexistent pages result in null mappings ("currently
 * treated as copy-on-access"); nothing in this function does either, so
 * that wording looks stale. Confirm against history before relying on it.
 *
 * Return: 0 on success, -ENOMEM if pte_alloc_map_lock() fails, -EACCES if
 * pfn_modify_allowed() rejects a frame (entries written before that point
 * are left in place by this function).
 */
static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        pte_t *pte, *mapped_pte;
        spinlock_t *ptl;
        int err = 0;

        /* Keep the first PTE pointer around: it is what gets unmapped. */
        mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
        if (!pte)
                return -ENOMEM;
        arch_enter_lazy_mmu_mode();
        do {
                BUG_ON(!pte_none(ptep_get(pte)));
                if (!pfn_modify_allowed(pfn, prot)) {
                        err = -EACCES;
                        break;
                }
                set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
                pfn++;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * pmd level of the remap walk: split [@addr, @end) at pmd boundaries and
 * hand every piece to remap_pte_range() with the matching starting pfn.
 *
 * Return: 0 on success, -ENOMEM if the pmd cannot be allocated, or the
 * first error reported by remap_pte_range().
 */
static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /*
         * pfn minus the page index of the starting address; adding the page
         * index of any later address yields the pfn to map there.
         */
        const unsigned long pfn_bias = pfn - (addr >> PAGE_SHIFT);
        pmd_t *pmd = pmd_alloc(mm, pud, addr);
        unsigned long next;
        int err;

        if (!pmd)
                return -ENOMEM;
        VM_BUG_ON(pmd_trans_huge(*pmd));

        for (;; pmd++, addr = next) {
                next = pmd_addr_end(addr, end);
                err = remap_pte_range(mm, pmd, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                if (next == end)
                        break;
        }
        return 0;
}

/*
 * pud level of the remap walk: split [@addr, @end) at pud boundaries and
 * hand every piece to remap_pmd_range() with the matching starting pfn.
 *
 * Return: 0 on success, -ENOMEM if the pud cannot be allocated, or the
 * first error reported by remap_pmd_range().
 */
static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /*
         * pfn minus the page index of the starting address; adding the page
         * index of any later address yields the pfn to map there.
         */
        const unsigned long pfn_bias = pfn - (addr >> PAGE_SHIFT);
        pud_t *pud = pud_alloc(mm, p4d, addr);
        unsigned long next;
        int err;

        if (!pud)
                return -ENOMEM;

        for (;; pud++, addr = next) {
                next = pud_addr_end(addr, end);
                err = remap_pmd_range(mm, pud, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                if (next == end)
                        break;
        }
        return 0;
}

/*
 * p4d level of the remap walk: split [@addr, @end) at p4d boundaries and
 * hand every piece to remap_pud_range() with the matching starting pfn.
 *
 * Return: 0 on success, -ENOMEM if the p4d cannot be allocated, or the
 * first error reported by remap_pud_range().
 */
static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                        unsigned long addr, unsigned long end,
                        unsigned long pfn, pgprot_t prot)
{
        /*
         * pfn minus the page index of the starting address; adding the page
         * index of any later address yields the pfn to map there.
         */
        const unsigned long pfn_bias = pfn - (addr >> PAGE_SHIFT);
        p4d_t *p4d = p4d_alloc(mm, pgd, addr);
        unsigned long next;
        int err;

        if (!p4d)
                return -ENOMEM;

        for (;; p4d++, addr = next) {
                next = p4d_addr_end(addr, end);
                err = remap_pud_range(mm, p4d, addr, next,
                                pfn_bias + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
                if (next == end)
                        break;
        }
        return 0;
}

/*
 * Top level of the remap walk: map the physical frames starting at @pfn to
 * the user range [@addr, @addr + PAGE_ALIGN(@size)) of @vma with protection
 * @prot, and flag the vma as a raw PFN mapping.
 *
 * Return: 0 on success; -EINVAL if @addr is not page aligned or if a COW
 * mapping is not remapped in full; otherwise the first error coming out of
 * the page-table walk. On a walk failure, entries written so far are not
 * undone here.
 */
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
        int err;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
                return -EINVAL;

        /*
         * Physically remapped pages are special. Tell the
         * rest of the world about it:
         *   VM_IO tells people not to look at these pages
         *        (accesses can have side effects).
         *   VM_PFNMAP tells the core MM that the base pages are just
         *        raw PFN mappings, and do not have a "struct page" associated
         *        with them.
         *   VM_DONTEXPAND
         *      Disable vma merging and expanding with mremap().
         *   VM_DONTDUMP
         *      Omit vma from core dump, even when VM_IO turned off.
         *
         * There's a horrible special case to handle copy-on-write
         * behaviour that some programs depend on. We mark the "original"
         * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         * See vm_normal_page() for details.
         */
        if (is_cow_mapping(vma->vm_flags)) {
                /* A COW vma may only be remapped as a whole. */
                if (addr != vma->vm_start || end != vma->vm_end)
                        return -EINVAL;
                vma->vm_pgoff = pfn;
        }

        vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);

        BUG_ON(addr >= end);
        /*
         * Bias pfn by the page index of the start address so that
         * "pfn + (addr >> PAGE_SHIFT)" below is the frame for any addr.
         */
        pfn -= addr >> PAGE_SHIFT;
        pgd = pgd_offset(mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
                        return err;
        } while (pgd++, addr = next, addr != end);

        return 0;
}

/*
 * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
 * must have pre-validated the caching bits of the pgprot_t.
 */
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot)
{
        int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);

        /*
         * Leaving a partially established pfn mapping behind is dangerous:
         * no page reference counts are maintained for it, and the caller
         * may well free the pages in response to the error. Zap whatever
         * was mapped right away.
         */
        if (error)
                zap_page_range_single(vma, addr, size, NULL);

        return error;
}

/**
 * remap_pfn_range - remap kernel memory to userspace
 * @vma: user vma to map to
 * @addr: target page aligned user address to start at
 * @pfn: page frame number of kernel physical memory address
 * @size: size of mapping area
 * @prot: page protection flags for this mapping
 *
 * Note: this is only safe if the mm semaphore is held when called.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                    unsigned long pfn, unsigned long size, pgprot_t prot)
{
        const unsigned long aligned_size = PAGE_ALIGN(size);
        int err;

        /* Any tracking failure is reported to the caller as -EINVAL. */
        if (track_pfn_remap(vma, &prot, pfn, addr, aligned_size))
                return -EINVAL;

        err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
        if (!err)
                return 0;

        /* The mapping failed: drop the tracking set up above. */
        untrack_pfn(vma, pfn, aligned_size, true);
        return err;
}
EXPORT_SYMBOL(remap_pfn_range);

/**
 * vm_iomap_memory - remap memory to userspace
 * @vma: user vma to map to
 * @start: start of the physical memory to be mapped
 * @len: size of area
 *
 * This is a simplified io_remap_pfn_range() for common driver use. The
 * driver just needs to give us the physical memory range to be mapped,
 * we'll figure out the rest from the vma information.
 *
 * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
 * whatever write-combining details or similar.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
        unsigned long vm_len, pfn, pages;

        /* Check that the physical memory area passed in looks valid */
        if (start + len < start)
                return -EINVAL;
        /*
         * You *really* shouldn't map things that aren't page-aligned,
         * but we've historically allowed it because IO memory might
         * just have smaller alignment.
         */
        /* Grow len by start's offset within its page ... */
        len += start & ~PAGE_MASK;
        pfn = start >> PAGE_SHIFT;
        /* ... and round up to the number of pages the area touches. */
        pages = (len + ~PAGE_MASK) >> PAGE_SHIFT;
        /* Reject areas whose page-frame range would wrap around. */
        if (pfn + pages < pfn)
                return -EINVAL;

        /* We start the mapping 'vm_pgoff' pages into the area */
        if (vma->vm_pgoff > pages)
                return -EINVAL;
        pfn += vma->vm_pgoff;
        pages -= vma->vm_pgoff;

        /* Can we fit all of the mapping? */
        vm_len = vma->vm_end - vma->vm_start;
        if (vm_len >> PAGE_SHIFT > pages)
                return -EINVAL;

        /* Ok, let it rip */
        return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot);
}
EXPORT_SYMBOL(vm_iomap_memory);

/*
 * PTE level of the apply walk: call @fn(pte, addr, @data) for the PTEs that
 * cover [@addr, @end) below @pmd.
 *
 * With @create set, the PTE table is allocated if needed and @fn is called
 * for every PTE; without it, only an existing table is looked up and @fn is
 * called only for PTEs that are not pte_none(). For init_mm the kernel
 * variants are used and no PTE lock is taken; for any other mm the table is
 * mapped and locked for the duration of the loop.
 *
 * PGTBL_PTE_MODIFIED is OR-ed into @mask once the table has been obtained,
 * whether or not @fn is set.
 *
 * Return: 0 on success, -ENOMEM if the table cannot be allocated (@create),
 * -EINVAL if it cannot be looked up (!@create), or the first non-zero value
 * returned by @fn.
 */
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pte_t *pte, *mapped_pte;
        int err = 0;
        spinlock_t *ptl;

        if (create) {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_alloc_kernel_track(pmd, addr, mask) :
                        pte_alloc_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -ENOMEM;
        } else {
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
                if (!pte)
                        return -EINVAL;
        }

        arch_enter_lazy_mmu_mode();

        if (fn) {
                do {
                        if (create || !pte_none(ptep_get(pte))) {
                                err = fn(pte, addr, data);
                                if (err)
                                        break;
                        }
                        /* pte advances in step with addr, callback or not. */
                } while (pte++, addr += PAGE_SIZE, addr != end);
        }
        *mask |= PGTBL_PTE_MODIFIED;

        arch_leave_lazy_mmu_mode();

        /* ptl was only taken (and is only valid) for non-init_mm. */
        if (mm != &init_mm)
                pte_unmap_unlock(mapped_pte, ptl);
        return err;
}

/*
 * Walk the PMD entries covering [@addr, @end) beneath @pud and hand each
 * usable one to apply_to_pte_range().
 *
 * With @create set the PMD table is allocated if needed (-ENOMEM on
 * failure) and bad entries are cleared before descending.  Without it,
 * empty and bad entries are skipped.  A leaf PMD triggers a one-time
 * warning and -EINVAL.  Otherwise returns 0 or the first error reported
 * by apply_to_pte_range().
 */
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pmd_t *pmd;
        unsigned long next;
        int err = 0;

        /* The PUD entry handed in must not be a leaf entry. */
        BUG_ON(pud_leaf(*pud));

        if (create) {
                pmd = pmd_alloc_track(mm, pud, addr, mask);
                if (!pmd)
                        return -ENOMEM;
        } else {
                pmd = pmd_offset(pud, addr);
        }
        do {
                next = pmd_addr_end(addr, end);
                /*
                 * Nothing to visit and nothing to create.  Note that
                 * "continue" in a do-while still evaluates the loop
                 * condition, so pmd and addr are advanced.
                 */
                if (pmd_none(*pmd) && !create)
                        continue;
                /* This walker only descends to PTE tables. */
                if (WARN_ON_ONCE(pmd_leaf(*pmd)))
                        return -EINVAL;
                /* Bad entry: skip it, or clear it so it can be repopulated. */
                if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
                        if (!create)
                                continue;
                        pmd_clear_bad(pmd);
                }
                err = apply_to_pte_range(mm, pmd, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (pmd++, addr = next, addr != end);

        return err;
}

/*
 * Walk the PUD entries covering [@addr, @end) beneath @p4d and hand each
 * usable one to apply_to_pmd_range().
 *
 * With @create set the PUD table is allocated if needed (-ENOMEM on
 * failure) and bad entries are cleared before descending.  Without it,
 * empty and bad entries are skipped.  A leaf PUD triggers a one-time
 * warning and -EINVAL.  Otherwise returns 0 or the first error reported
 * by apply_to_pmd_range().
 */
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        pud_t *pud;
        unsigned long next;
        int err = 0;

        if (create) {
                pud = pud_alloc_track(mm, p4d, addr, mask);
                if (!pud)
                        return -ENOMEM;
        } else {
                pud = pud_offset(p4d, addr);
        }
        do {
                next = pud_addr_end(addr, end);
                /*
                 * Nothing to visit and nothing to create.  "continue"
                 * still evaluates the loop condition, advancing pud/addr.
                 */
                if (pud_none(*pud) && !create)
                        continue;
                /* Leaf entries cannot be descended into. */
                if (WARN_ON_ONCE(pud_leaf(*pud)))
                        return -EINVAL;
                /* Bad entry: skip it, or clear it so it can be repopulated. */
                if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
                        if (!create)
                                continue;
                        pud_clear_bad(pud);
                }
                err = apply_to_pmd_range(mm, pud, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (pud++, addr = next, addr != end);

        return err;
}

/*
 * Walk the P4D entries covering [@addr, @end) beneath @pgd and hand each
 * usable one to apply_to_pud_range().
 *
 * With @create set the P4D table is allocated if needed (-ENOMEM on
 * failure) and bad entries are cleared before descending.  Without it,
 * empty and bad entries are skipped.  A leaf P4D triggers a one-time
 * warning and -EINVAL.  Otherwise returns 0 or the first error reported
 * by apply_to_pud_range().
 */
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long addr, unsigned long end,
                                     pte_fn_t fn, void *data, bool create,
                                     pgtbl_mod_mask *mask)
{
        p4d_t *p4d;
        unsigned long next;
        int err = 0;

        if (create) {
                p4d = p4d_alloc_track(mm, pgd, addr, mask);
                if (!p4d)
                        return -ENOMEM;
        } else {
                p4d = p4d_offset(pgd, addr);
        }
        do {
                next = p4d_addr_end(addr, end);
                /*
                 * Nothing to visit and nothing to create.  "continue"
                 * still evaluates the loop condition, advancing p4d/addr.
                 */
                if (p4d_none(*p4d) && !create)
                        continue;
                /* Leaf entries cannot be descended into. */
                if (WARN_ON_ONCE(p4d_leaf(*p4d)))
                        return -EINVAL;
                /* Bad entry: skip it, or clear it so it can be repopulated. */
                if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
                        if (!create)
                                continue;
                        p4d_clear_bad(p4d);
                }
                err = apply_to_pud_range(mm, p4d, addr, next,
                                         fn, data, create, mask);
                if (err)
                        break;
        } while (p4d++, addr = next, addr != end);

        return err;
}

/*
 * Common implementation behind apply_to_page_range() and
 * apply_to_existing_page_range(): walk the page tables of @mm over
 * [@addr, @addr + @size) starting from the PGD level and call @fn, with
 * @data, on the PTE slots found.
 *
 * @create selects whether missing intermediate tables are allocated
 * (and bad entries cleared) or simply skipped.
 *
 * Returns -EINVAL (with a warning) if the range is empty or wraps, or if
 * a leaf PGD entry is encountered; otherwise 0 or the first error
 * propagated from the lower levels.  arch_sync_kernel_mappings() is
 * called for the whole range, even after an error, when the
 * accumulated modification mask intersects ARCH_PAGE_TABLE_SYNC_MASK.
 */
static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn,
                                 void *data, bool create)
{
        pgd_t *pgd;
        unsigned long start = addr, next;
        unsigned long end = addr + size;
        pgtbl_mod_mask mask = 0;
        int err = 0;

        /* Catches both size == 0 and addr + size wrapping around. */
        if (WARN_ON(addr >= end))
                return -EINVAL;

        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
                /*
                 * Nothing to visit and nothing to create.  "continue"
                 * still evaluates the loop condition, advancing pgd/addr.
                 */
                if (pgd_none(*pgd) && !create)
                        continue;
                /* Break rather than return so the sync below still runs. */
                if (WARN_ON_ONCE(pgd_leaf(*pgd))) {
                        err = -EINVAL;
                        break;
                }
                /* Bad entry: skip it, or clear it so it can be repopulated. */
                if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
                        if (!create)
                                continue;
                        pgd_clear_bad(pgd);
                }
                err = apply_to_p4d_range(mm, pgd, addr, next,
                                         fn, data, create, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);

        /* addr has been advanced by the walk, hence the saved start. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, start + size);

        return err;
}

/*
 * Walk [@addr, @addr + @size) in @mm and invoke @fn, with @data, on each
 * leaf page table entry, allocating any page tables that are missing
 * along the way.
 */
int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
                        unsigned long size, pte_fn_t fn, void *data)
{
        const bool create = true;        /* populate absent page tables */

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}
EXPORT_SYMBOL_GPL(apply_to_page_range);

/*
 * Walk [@addr, @addr + @size) in @mm and invoke @fn, with @data, on each
 * leaf page table entry that already exists.
 *
 * In contrast to apply_to_page_range(), page tables that are absent are
 * left absent: nothing is allocated by this walk.
 */
int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr,
                                 unsigned long size, pte_fn_t fn, void *data)
{
        const bool create = false;        /* never populate page tables */

        return __apply_to_page_range(mm, addr, size, fn, data, create);
}

/*
 * handle_pte_fault picks the fault handler based on a pte value that was
 * read non-atomically.  On architectures or configurations where such a
 * read may yield a mix of unmatched halves (e.g. i386 with PAE),
 * do_swap_page has to re-check the entry under the lock before it unmaps
 * the pte and commits to anything.  (do_wp_page is only reached after such
 * a check has been made already, and do_anonymous_page can safely check
 * later on.)
 *
 * Unmaps vmf->pte and clears the pointer in all cases.  Returns non-zero
 * if the entry still equals vmf->orig_pte, or if no re-check was needed.
 */
static inline int pte_unmap_same(struct vm_fault *vmf)
{
        int unchanged = 1;

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        /* Only a pte wider than a machine word needs the locked re-read. */
        if (sizeof(pte_t) > sizeof(unsigned long)) {
                spin_lock(vmf->ptl);
                unchanged = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
                spin_unlock(vmf->ptl);
        }
#endif
        pte_unmap(vmf->pte);
        vmf->pte = NULL;

        return unchanged;
}

/*
 * Fill @dst with the data currently mapped at the faulting address.
 *
 * When @src is non-NULL the copy goes through copy_mc_user_highpage().
 * When @src is NULL the data is read through the user address with page
 * faults disabled, and @dst is zero-filled if that read cannot be done.
 * On return vmf->pte is either NULL or has been unmapped and unlocked.
 *
 * Return:
 *        0:                copy succeeded (or @dst was zero-filled)
 *        -EHWPOISON:        copy failed due to hwpoison in source page
 *        -EAGAIN:        copy failed because the PTE could not be mapped
 *                        or no longer matches vmf->orig_pte
 */
static inline int __wp_page_copy_user(struct page *dst, struct page *src,
                                      struct vm_fault *vmf)
{
        int ret;
        void *kaddr;
        void __user *uaddr;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;

        if (likely(src)) {
                if (copy_mc_user_highpage(dst, src, addr, vma))
                        return -EHWPOISON;
                return 0;
        }

        /*
         * If the source page was a PFN mapping, we don't have
         * a "struct page" for it. We do a best-effort copy by
         * just copying from the original user address. If that
         * fails, we just zero-fill it. Live with it.
         */
        kaddr = kmap_local_page(dst);
        pagefault_disable();
        uaddr = (void __user *)(addr & PAGE_MASK);

        /*
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
        /* From here on, vmf->pte != NULL means "PTL is held". */
        vmf->pte = NULL;
        if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;

                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /*
                         * Other thread has already handled the fault
                         * and update local tlb only
                         */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                entry = pte_mkyoung(vmf->orig_pte);
                if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
                        update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1);
        }

        /*
         * This really shouldn't fail, because the page is there
         * in the page tables. But it might just be unreadable,
         * in which case we just give up and fill the result with
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                /*
                 * The PTE was already validated under the PTL above and
                 * the lock is still held, so skip the re-validation and
                 * go straight to warning and zero-filling.
                 */
                if (vmf->pte)
                        goto warn;

                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
                if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
                        if (vmf->pte)
                                update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }

                /*
                 * The same page can be mapped back since last copy attempt.
                 * Try to copy again under PTL.
                 */
                if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
                        /*
                         * Give a warn in case there can be some obscure
                         * use-case
                         */
warn:
                        WARN_ON_ONCE(1);
                        clear_page(kaddr);
                }
        }

        ret = 0;

pte_unlock:
        /* Common exit: undo the lock, pagefault_disable() and kmap in turn. */
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        pagefault_enable();
        kunmap_local(kaddr);
        flush_dcache_page(dst);

        return ret;
}

/*
 * Pick the gfp mask to use while handling a fault in @vma: the mapping's
 * own mask with __GFP_FS and __GFP_IO added for file-backed VMAs, or
 * GFP_KERNEL when the VMA has no file.
 */
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;

        /*
         * Special mappings (e.g. VDSO) do not have any file so fake
         * a default GFP_KERNEL for them.
         */
        if (!file)
                return GFP_KERNEL;

        return mapping_gfp_mask(file->f_mapping) | __GFP_FS | __GFP_IO;
}

/*
 * Notify the address space that the page is about to become writable so that
 * it can prohibit this or wait for the page to get into an appropriate state.
 *
 * We do this without the lock held, so that it can sleep if it needs to.
 *
 * vmf->flags is replaced by FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE only for the
 * duration of the ->page_mkwrite() call; on every return path the caller
 * sees the flags it passed in.
 *
 * Return:
 *   VM_FAULT_SIGBUS if the VMA's file is a swapfile (->page_mkwrite() is not
 *   called), any VM_FAULT_ERROR / VM_FAULT_NOPAGE result of ->page_mkwrite()
 *   unchanged, 0 if the folio lost its mapping before it could be locked
 *   (caller should retry), otherwise a value with VM_FAULT_LOCKED set and
 *   @folio locked.
 */
static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio)
{
        vm_fault_t ret;
        unsigned int old_flags = vmf->flags;

        /*
         * Check for a swapfile before touching vmf->flags, so that this
         * early return does not hand modified flags back to the caller.
         */
        if (vmf->vma->vm_file &&
            IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
                return VM_FAULT_SIGBUS;

        vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;

        ret = vmf->vma->vm_ops->page_mkwrite(vmf);
        /* Restore original flags so that caller is not surprised */
        vmf->flags = old_flags;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
                return ret;
        if (unlikely(!(ret & VM_FAULT_LOCKED))) {
                /* The handler left the folio unlocked: lock and re-check it. */
                folio_lock(folio);
                if (!folio->mapping) {
                        folio_unlock(folio);
                        return 0; /* retry */
                }
                ret |= VM_FAULT_LOCKED;
        } else
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        return ret;
}

/*
 * Handle dirtying of a page in shared file mapping on a write fault.
 *
 * The function expects the page to be locked and unlocks it.
 *
 * Marks the folio of vmf->page dirty, updates the file time when the VMA
 * has no ->page_mkwrite() handler, and throttles via
 * balance_dirty_pages_ratelimited() when the folio has a mapping and was
 * either newly dirtied or the VMA has ->page_mkwrite().
 *
 * Return: VM_FAULT_COMPLETED if maybe_unlock_mmap_for_io() returned a
 * pinned file (which is released here), 0 otherwise.
 */
static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct address_space *mapping;
        struct folio *folio = page_folio(vmf->page);
        bool dirtied;
        bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;

        dirtied = folio_mark_dirty(folio);
        VM_BUG_ON_FOLIO(folio_test_anon(folio), folio);
        /*
         * Take a local copy of the address_space - folio.mapping may be zeroed
         * by truncate after folio_unlock().   The address_space itself remains
         * pinned by vma->vm_file's reference.  We rely on folio_unlock()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
        mapping = folio_raw_mapping(folio);
        folio_unlock(folio);

        /*
         * NOTE(review): presumably a ->page_mkwrite() handler is expected
         * to update the file time itself - confirm against implementations.
         */
        if (!page_mkwrite)
                file_update_time(vma->vm_file);

        /*
         * Throttle page dirtying rate down to writeback speed.
         *
         * mapping may be NULL here because some device drivers do not
         * set page.mapping but still dirty their pages
         *
         * Drop the mmap_lock before waiting on IO, if we can. The file
         * is pinning the mapping, as per above.
         */
        if ((dirtied || page_mkwrite) && mapping) {
                struct file *fpin;

                fpin = maybe_unlock_mmap_for_io(vmf, NULL);
                balance_dirty_pages_ratelimited(mapping);
                if (fpin) {
                        fput(fpin);
                        return VM_FAULT_COMPLETED;
                }
        }

        return 0;
}

/*
 * Resolve a write fault by reusing the page that is already mapped.
 *
 * This applies when the mapping is VM_SHARED, or when we hold the last
 * reference to the page.  Either way no copy is needed: the existing PTE
 * is made young, dirty and (if the vma permits) writable, and the related
 * book-keeping is updated.
 *
 * @folio may be NULL.  Called with the PTE mapped and vmf->ptl held;
 * both are released before returning.
 */
static inline void wp_page_reuse(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t new_pte;

        VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
        VM_WARN_ON(is_zero_pfn(pte_pfn(vmf->orig_pte)));

        if (folio) {
                VM_BUG_ON(folio_test_anon(folio) &&
                          !PageAnonExclusive(vmf->page));
                /*
                 * The recorded cpupid may belong to a process that is by
                 * now completely unrelated, so reset it.
                 */
                folio_xchg_last_cpupid(folio, (1 << LAST_CPUPID_SHIFT) - 1);
        }

        flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));

        /* Build the upgraded entry one attribute at a time. */
        new_pte = pte_mkyoung(vmf->orig_pte);
        new_pte = pte_mkdirty(new_pte);
        new_pte = maybe_mkwrite(new_pte, vma);

        if (ptep_set_access_flags(vma, vmf->address, vmf->pte, new_pte, 1))
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        count_vm_event(PGREUSE);
}

/*
 * We could add a bitflag somewhere, but for now, we know that all
 * vm_ops that have a ->map_pages have been audited and don't need
 * the mmap_lock to be held.
 */
static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
                return 0;
        vma_end_read(vma);
        return VM_FAULT_RETRY;
}

/**
 * __vmf_anon_prepare - Prepare to handle an anonymous fault.
 * @vmf: The vm_fault descriptor passed from the fault handler.
 *
 * When preparing to insert an anonymous page into a VMA from a
 * fault handler, call this function rather than anon_vma_prepare().
 * If this vma does not already have an associated anon_vma and we are
 * only protected by the per-VMA lock, the caller must retry with the
 * mmap_lock held.  __anon_vma_prepare() will look at adjacent VMAs to
 * determine if this VMA can share its anon_vma, and that's not safe to
 * do with only the per-VMA lock held for this VMA.
 *
 * Return: 0 if fault handling can proceed.  Any other value should be
 * returned to the caller.
 */
vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;

        if (likely(vma->anon_vma))
                return 0;
        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                if (!mmap_read_trylock(vma->vm_mm))
                        return VM_FAULT_RETRY;
        }
        if (__anon_vma_prepare(vma))
                ret = VM_FAULT_OOM;
        if (vmf->flags & FAULT_FLAG_VMA_LOCK)
                mmap_read_unlock(vma->vm_mm);
        return ret;
}

/*
 * Handle the case of a page which we actually need to copy to a new page,
 * either due to COW or unsharing.
 *
 * Called with mmap_lock locked and the old page referenced, but
 * without the ptl held.
 *
 * High level logic flow:
 *
 * - Allocate a page, copy the content of the old page to the new one.
 * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc.
 * - Take the PTL. If the pte changed, bail out and release the allocated page
 * - If the pte is still the way we remember it, update the page table and all
 *   relevant references. This includes dropping the reference the page-table
 *   held to the old page, as well as updating the rmap.
 * - In any case, unlock the PTL and drop the reference we took to the old page.
 *
 * Return: 0 when the copy was installed, when the pte had changed, or when
 * the user copy failed for a reason other than hwpoison; VM_FAULT_HWPOISON
 * when the copy hit -EHWPOISON; VM_FAULT_OOM when no new folio could be
 * allocated; or the non-zero result of vmf_anon_prepare().
 */
static vm_fault_t wp_page_copy(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct folio *old_folio = NULL;
        struct folio *new_folio = NULL;
        pte_t entry;
        int page_copied = 0;
        struct mmu_notifier_range range;
        vm_fault_t ret;
        bool pfn_is_zero;

        delayacct_wpcopy_start();

        /* vmf->page may be NULL, in which case there is no old folio. */
        if (vmf->page)
                old_folio = page_folio(vmf->page);
        ret = vmf_anon_prepare(vmf);
        if (unlikely(ret))
                goto out;

        /*
         * NOTE(review): presumably folio_prealloc() hands back a zeroed
         * folio when pfn_is_zero is set, which is why the copy below is
         * skipped in that case - confirm.
         */
        pfn_is_zero = is_zero_pfn(pte_pfn(vmf->orig_pte));
        new_folio = folio_prealloc(mm, vma, vmf->address, pfn_is_zero);
        if (!new_folio)
                goto oom;

        if (!pfn_is_zero) {
                int err;

                err = __wp_page_copy_user(&new_folio->page, vmf->page, vmf);
                if (err) {
                        /*
                         * COW failed, if the fault was solved by other,
                         * it's fine. If not, userspace would re-fault on
                         * the same address and we will handle the fault
                         * from the second attempt.
                         * The -EHWPOISON case will not be retried.
                         */
                        folio_put(new_folio);
                        if (old_folio)
                                folio_put(old_folio);

                        delayacct_wpcopy_end();
                        return err == -EHWPOISON ? VM_FAULT_HWPOISON : 0;
                }
                kmsan_copy_page_meta(&new_folio->page, vmf->page);
        }

        __folio_mark_uptodate(new_folio);

        /* Notifier range covers exactly the page containing the fault. */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                /*
                 * The new page is accounted as anonymous: move the count
                 * over from the file counter, or add it when there was no
                 * old folio.  An anonymous old folio leaves the counters
                 * untouched.
                 */
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
                                dec_mm_counter(mm, mm_counter_file(old_folio));
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
                        ksm_might_unmap_zero_page(mm, vmf->orig_pte);
                        inc_mm_counter(mm, MM_ANONPAGES);
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(&new_folio->page, vma->vm_page_prot);
                entry = pte_sw_mkyoung(entry);
                if (unlikely(unshare)) {
                        /* Unsharing: carry over the soft-dirty/uffd-wp bits. */
                        if (pte_soft_dirty(vmf->orig_pte))
                                entry = pte_mksoft_dirty(entry);
                        if (pte_uffd_wp(vmf->orig_pte))
                                entry = pte_mkuffd_wp(entry);
                } else {
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                }

                /*
                 * Clear the pte entry and flush it first, before updating the
                 * pte with the new entry, to keep TLBs on different CPUs in
                 * sync. This code used to set the new PTE then flush TLBs, but
                 * that left a window where the new PTE could be loaded into
                 * some TLBs while the old PTE remains in others.
                 */
                ptep_clear_flush(vma, vmf->address, vmf->pte);
                folio_add_new_anon_rmap(new_folio, vma, vmf->address, RMAP_EXCLUSIVE);
                folio_add_lru_vma(new_folio, vma);
                /* The unshare branch above never makes the entry writable. */
                BUG_ON(unshare && pte_write(entry));
                set_pte_at(mm, vmf->address, vmf->pte, entry);
                update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1);
                if (old_folio) {
                        /*
                         * Only after switching the pte to the new page may
                         * we remove the mapcount here. Otherwise another
                         * process may come and find the rmap count decremented
                         * before the pte is switched to the new page, and
                         * "reuse" the old page writing into it while our pte
                         * here still points into it and can be read by other
                         * threads.
                         *
                         * The critical issue is to order this
                         * folio_remove_rmap_pte() with the ptep_clear_flush
                         * above. Those stores are ordered by (if nothing else,)
                         * the barrier present in the atomic_add_negative
                         * in folio_remove_rmap_pte();
                         *
                         * Then the TLB flush in ptep_clear_flush ensures that
                         * no process can access the old page before the
                         * decremented mapcount is visible. And the old page
                         * cannot be reused until after the decremented
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
                        folio_remove_rmap_pte(old_folio, vmf->page, vma);
                }

                /*
                 * Free the old page..
                 *
                 * new_folio now aliases old_folio, so the folio_put(new_folio)
                 * below drops a reference on the old folio instead of on the
                 * freshly installed one; the later folio_put(old_folio) then
                 * drops a second one.
                 */
                new_folio = old_folio;
                page_copied = 1;
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        } else if (vmf->pte) {
                /* The pte changed under us: new_folio is released below. */
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        }

        mmu_notifier_invalidate_range_end(&range);

        if (new_folio)
                folio_put(new_folio);
        if (old_folio) {
                if (page_copied)
                        free_swap_cache(old_folio);
                folio_put(old_folio);
        }

        delayacct_wpcopy_end();
        return 0;
oom:
        ret = VM_FAULT_OOM;
out:
        /* Error exits taken before a new folio exists: drop only the old one. */
        if (old_folio)
                folio_put(old_folio);

        delayacct_wpcopy_end();
        return ret;
}

/**
 * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE
 *                          writeable once the page is prepared
 *
 * @vmf: structure describing the fault
 * @folio: the folio of vmf->page
 *
 * This function handles all that is needed to finish a write page fault in a
 * shared mapping due to PTE being read-only once the mapped page is prepared.
 * It handles locking of PTE and modifying it.
 *
 * The function expects the page to be locked or other protection against
 * concurrent faults / writeback (such as DAX radix tree locks).
 *
 * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
 * we acquired PTE lock.
 */
static vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf, struct folio *folio)
{
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
        if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
        }
        wp_page_reuse(vmf, folio);
        return 0;
}

/*
 * Write fault on a VM_SHARED mapping that is VM_MIXEDMAP or VM_PFNMAP.
 *
 * Without a ->pfn_mkwrite() handler the PTE is simply made writable.
 * With one, the PTE lock is dropped, the handler is consulted, and the
 * PTE is upgraded afterwards if it has not changed in the meantime.
 */
static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t status;

        /* Nobody to notify: reuse the mapping right away. */
        if (!vma->vm_ops || !vma->vm_ops->pfn_mkwrite) {
                wp_page_reuse(vmf, NULL);
                return 0;
        }

        pte_unmap_unlock(vmf->pte, vmf->ptl);

        status = vmf_can_call_fault(vmf);
        if (status)
                return status;

        vmf->flags |= FAULT_FLAG_MKWRITE;
        status = vma->vm_ops->pfn_mkwrite(vmf);
        if (status & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
                return status;

        return finish_mkwrite_fault(vmf, NULL);
}

/*
 * Write fault on a page of a shared mapping for which a folio exists.
 *
 * A folio reference is held for the duration of the call.  If the VMA has
 * a ->page_mkwrite() handler, the PTE lock is dropped, the handler is run
 * through do_page_mkwrite() and the PTE is then upgraded by
 * finish_mkwrite_fault(); otherwise the PTE is upgraded directly with
 * wp_page_reuse().  On success the folio is dirtied via
 * fault_dirty_shared_page().
 *
 * Entered with the PTE mapped and vmf->ptl held; the lock is released on
 * every path.
 *
 * Return: the result of fault_dirty_shared_page() on success, otherwise
 * the value that made an earlier step bail out (which may be 0 when
 * do_page_mkwrite() asks for a retry).
 */
static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
        __releases(vmf->ptl)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;

        folio_get(folio);

        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
                vm_fault_t tmp;

                pte_unmap_unlock(vmf->pte, vmf->ptl);
                tmp = vmf_can_call_fault(vmf);
                if (tmp) {
                        folio_put(folio);
                        return tmp;
                }

                tmp = do_page_mkwrite(vmf, folio);
                /* 0 means "retry"; the folio is not locked on these paths. */
                if (unlikely(!tmp || (tmp &
                                      (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        folio_put(folio);
                        return tmp;
                }
                /* do_page_mkwrite() succeeded, so the folio is locked here. */
                tmp = finish_mkwrite_fault(vmf, folio);
                if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return tmp;
                }
        } else {
                wp_page_reuse(vmf, folio);
                /* fault_dirty_shared_page() expects a locked folio. */
                folio_lock(folio);
        }
        ret |= fault_dirty_shared_page(vmf);
        folio_put(folio);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Decide whether a large anonymous folio is exclusive to @vma's MM, so that
 * a write fault may reuse it instead of copying.
 *
 * Returns true only if, with the large-mapcount lock held, the folio is not
 * flagged as maybe-mapped-shared and its mapcount equals its refcount.
 * Returns false in every other case, including when only a single page of
 * the folio is mapped or when the folio lock needed to drop the swapcache
 * cannot be taken without blocking.
 */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
                struct vm_area_struct *vma)
{
        bool exclusive = false;

        /* Let's just free up a large folio if only a single page is mapped. */
        if (folio_large_mapcount(folio) <= 1)
                return false;

        /*
         * The assumption for anonymous folios is that each page can only get
         * mapped once into each MM. The only exception are KSM folios, which
         * are always small.
         *
         * Each taken mapcount must be paired with exactly one taken reference,
         * whereby the refcount must be incremented before the mapcount when
         * mapping a page, and the refcount must be decremented after the
         * mapcount when unmapping a page.
         *
         * If all folio references are from mappings, and all mappings are in
         * the page tables of this MM, then this folio is exclusive to this MM.
         */
        if (folio_test_large_maybe_mapped_shared(folio))
                return false;

        VM_WARN_ON_ONCE(folio_test_ksm(folio));

        if (unlikely(folio_test_swapcache(folio))) {
                /*
                 * Note: freeing up the swapcache will fail if some PTEs are
                 * still swap entries.
                 */
                if (!folio_trylock(folio))
                        return false;
                folio_free_swap(folio);
                folio_unlock(folio);
        }

        /* Cheap unlocked pre-check; repeated under the lock below. */
        if (folio_large_mapcount(folio) != folio_ref_count(folio))
                return false;

        /* Stabilize the mapcount vs. refcount and recheck. */
        folio_lock_large_mapcount(folio);
        /*
         * Every mapping holds a reference (see above), so the mapcount can
         * never exceed the refcount.  The opposite, refcount > mapcount, is
         * perfectly legitimate whenever someone else holds a reference and
         * must not trigger a warning.
         */
        VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_ref_count(folio), folio);

        if (folio_test_large_maybe_mapped_shared(folio))
                goto unlock;
        if (folio_large_mapcount(folio) != folio_ref_count(folio))
                goto unlock;

        VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
                        folio_mm_id(folio, 1) != vma->vm_mm->mm_id);

        /*
         * Do we need the folio lock? Likely not. If there would have been
         * references from page migration/swapout, we would have detected
         * an additional folio reference and never ended up here.
         */
        exclusive = true;
unlock:
        folio_unlock_large_mapcount(folio);
        return exclusive;
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * Without THP support this must never be reachable: any call that the
 * compiler cannot eliminate breaks the build.
 */
static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
                struct vm_area_struct *vma)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Decide whether the anonymous @folio can be reused in place for a write
 * fault in @vma instead of being copied.
 *
 * Large folios are delegated to __wp_can_reuse_large_anon_folio() when
 * CONFIG_TRANSPARENT_HUGEPAGE is enabled.  For all other folios, reuse is
 * allowed only if, under the folio lock and after any swapcache entry has
 * been dropped, the folio is not a KSM folio and its refcount is exactly 1.
 * In that case folio_move_anon_rmap() is called for @vma before returning
 * true.  The folio lock is only tried, never waited for.
 */
static bool wp_can_reuse_anon_folio(struct folio *folio,
                                    struct vm_area_struct *vma)
{
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && folio_test_large(folio))
                return __wp_can_reuse_large_anon_folio(folio, vma);

        /*
         * We have to verify under folio lock: these early checks are
         * just an optimization to avoid locking the folio and freeing
         * the swapcache if there is little hope that we can reuse.
         *
         * KSM doesn't necessarily raise the folio refcount.
         *
         * NOTE(review): the bound of 3 presumably allows for swapcache and
         * LRU-cache references on top of our mapping - confirm.
         */
        if (folio_test_ksm(folio) || folio_ref_count(folio) > 3)
                return false;
        if (!folio_test_lru(folio))
                /*
                 * We cannot easily detect+handle references from
                 * remote LRU caches or references to LRU folios.
                 */
                lru_add_drain();
        /* One extra reference is tolerated while the folio is in swapcache. */
        if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
                return false;
        if (!folio_trylock(folio))
                return false;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        /* Authoritative check, now that the folio is locked. */
        if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) {
                folio_unlock(folio);
                return false;
        }
        /*
         * Ok, we've got the only folio reference from our mapping
         * and the folio is locked, it's dark out, and we're wearing
         * sunglasses. Hit it.
         */
        folio_move_anon_rmap(folio, vma);
        folio_unlock(folio);
        return true;
}

/*
 * This routine handles present pages, when
 * * users try to write to a shared page (FAULT_FLAG_WRITE)
 * * GUP wants to take a R/O pin on a possibly shared anonymous page
 *   (FAULT_FLAG_UNSHARE)
 *
 * It is done by copying the page to a new address and decrementing the
 * shared-page counter for the old page.
 *
 * Note that this routine assumes that the protection checks have been
 * done by the caller (the low-level page fault routine in most cases).
 * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
 * done any necessary COW.
 *
 * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
 * though the page will change only once the write actually happens. This
 * avoids a few races, and potentially makes it more efficient.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), with pte both mapped and locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 */
static vm_fault_t do_wp_page(struct vm_fault *vmf)
        __releases(vmf->ptl)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        pte_t pte;

        /* The userfaultfd-wp handling below is skipped for unshare faults. */
        if (likely(!unshare)) {
                if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
                        /*
                         * Unless uffd-wp operates in async mode, release the
                         * PTE lock and hand the fault over to userfaultfd.
                         */
                        if (!userfaultfd_wp_async(vma)) {
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                                return handle_userfault(vmf, VM_UFFD_WP);
                        }

                        /*
                         * Nothing needed (cache flush, TLB invalidations,
                         * etc.) because we're only removing the uffd-wp bit,
                         * which is completely invisible to the user.
                         */
                        pte = pte_clear_uffd_wp(ptep_get(vmf->pte));

                        set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
                        /*
                         * Update this to be prepared for following up CoW
                         * handling
                         */
                        vmf->orig_pte = pte;
                }

                /*
                 * Userfaultfd write-protect can defer flushes. Ensure the TLB
                 * is flushed in this case before copying.
                 */
                if (unlikely(userfaultfd_wp(vmf->vma) &&
                             mm_tlb_flush_pending(vmf->vma->vm_mm)))
                        flush_tlb_page(vmf->vma, vmf->address);
        }

        /*
         * Look up the page behind the faulting PTE. Both vmf->page and folio
         * remain NULL when vm_normal_page() does not return a page.
         */
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);

        if (vmf->page)
                folio = page_folio(vmf->page);

        /*
         * Shared mapping: we are guaranteed to have VM_WRITE and
         * FAULT_FLAG_WRITE set at this point.
         */
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) {
                /*
                 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
                 * VM_PFNMAP VMA. FS DAX also wants ops->pfn_mkwrite called.
                 *
                 * We should not cow pages in a shared writeable mapping.
                 * Just mark the pages writable and/or call ops->pfn_mkwrite.
                 */
                if (!vmf->page || is_fsdax_page(vmf->page)) {
                        /* wp_pfn_shared() is always entered with vmf->page cleared. */
                        vmf->page = NULL;
                        return wp_pfn_shared(vmf);
                }
                return wp_page_shared(vmf, folio);
        }

        /*
         * Private mapping: create an exclusive anonymous page copy if reuse
         * is impossible. We might miss VM_WRITE for FOLL_FORCE handling.
         *
         * If we encounter a page that is marked exclusive, we must reuse
         * the page without further checks.
         */
        if (folio && folio_test_anon(folio) &&
            (PageAnonExclusive(vmf->page) || wp_can_reuse_anon_folio(folio, vma))) {
                /* Reusable but not yet marked exclusive: mark it now. */
                if (!PageAnonExclusive(vmf->page))
                        SetPageAnonExclusive(vmf->page);
                /*
                 * For an unshare fault there is nothing left to do once the
                 * page is exclusive: release the PTE lock without touching
                 * the PTE.
                 */
                if (unlikely(unshare)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return 0;
                }
                wp_page_reuse(vmf, folio);
                return 0;
        }
        /*
         * Ok, we need to copy. Oh, well..
         *
         * Take a reference on the folio (if there is one) while the PTE lock
         * is still held, then drop the lock before calling wp_page_copy().
         */
        if (folio)
                folio_get(folio);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
#ifdef CONFIG_KSM
        /* Account copy-on-write faults that hit a KSM folio. */
        if (folio && folio_test_ksm(folio))
                count_vm_event(COW_KSM);
#endif
        return wp_page_copy(vmf);
}

/* Zap the address range [start_addr, end_addr) of a single VMA. */
static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
{
        unsigned long size = end_addr - start_addr;

        zap_page_range_single(vma, start_addr, size, details);
}

/*
 * For every VMA in @root that overlaps the file page range
 * [first_index, last_index], unmap the part of the VMA that falls inside
 * that range.
 */
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
                                            pgoff_t first_index,
                                            pgoff_t last_index,
                                            struct zap_details *details)
{
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, first_index, last_index) {
                /* First and last file page index mapped by this VMA. */
                pgoff_t vma_first = vma->vm_pgoff;
                pgoff_t vma_last = vma_first + vma_pages(vma) - 1;
                /* Clamp the requested range to what the VMA covers. */
                pgoff_t zap_first = max(first_index, vma_first);
                pgoff_t zap_last = min(last_index, vma_last);
                unsigned long start, end;

                /* Translate page indices back into user virtual addresses. */
                start = vma->vm_start +
                        ((zap_first - vma_first) << PAGE_SHIFT);
                end = vma->vm_start +
                        ((zap_last - vma_first + 1) << PAGE_SHIFT);

                unmap_mapping_range_vma(vma, start, end, details);
        }
}

/**
 * unmap_mapping_folio() - Unmap single folio from processes.
 * @folio: The locked folio to be unmapped.
 *
 * Unmap this folio from any userspace process which still has it mmaped.
 * Typically, for efficiency, the range of nearby pages has already been
 * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
 * truncation or invalidation holds the lock on a folio, it may find that
 * the page has been remapped again: and then uses unmap_mapping_folio()
 * to unmap it finally.
 */
void unmap_mapping_folio(struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct zap_details details = { };
        pgoff_t        first_index;
        pgoff_t        last_index;

        VM_BUG_ON(!folio_test_locked(folio));

        first_index = folio->index;
        last_index = folio_next_index(folio) - 1;

        details.even_cows = false;
        details.single_folio = folio;
        details.zap_flags = ZAP_FLAG_DROP_MARKER;

        i_mmap_lock_read(mapping);
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}

/**
 * unmap_mapping_pages() - Unmap pages from processes.
 * @mapping: The address space containing pages to be unmapped.
 * @start: Index of first page to be unmapped.
 * @nr: Number of pages to be unmapped.  0 to unmap to end of file.
 * @even_cows: Whether to unmap even private COWed pages.
 *
 * Unmap the pages in this address space from any userspace process which
 * has them mmaped.  Generally, you want to remove COWed pages as well when
 * a file is being truncated, but not when invalidating pages from the page
 * cache.
 */
void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
                pgoff_t nr, bool even_cows)
{
        struct zap_details details = { .even_cows = even_cows };
        pgoff_t first_index = start;
        pgoff_t last_index = start + nr - 1;

        /*
         * nr == 0 (or a sum that wrapped around) leaves last_index below
         * first_index: treat that as "everything up to the last index".
         */
        if (last_index < first_index)
                last_index = ULONG_MAX;

        i_mmap_lock_read(mapping);
        /* Only walk the interval tree when something maps this file. */
        if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
                unmap_mapping_range_tree(&mapping->i_mmap, first_index,
                                         last_index, &details);
        i_mmap_unlock_read(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);

/**
 * unmap_mapping_range - unmap the portion of all mmaps in the specified
 * address_space corresponding to the specified byte range in the underlying
 * file.
 *
 * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from truncate_pagecache(), which
 * must keep the partial page.  In contrast, we must get rid of
 * partial pages.
 * @holelen: size of prospective hole in bytes.  This will be rounded
 * up to a PAGE_SIZE boundary.  A holelen of zero truncates to the
 * end of the file.
 * @even_cows: 1 when truncating a file, unmap even private COWed pages;
 * but 0 when invalidating pagecache, don't throw away private data.
 */
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows)
{
        /*
         * Convert the byte offset/length into page units.
         *
         * The shift is done in an unsigned 64-bit type and only the result
         * is narrowed to pgoff_t:
         *  - unsigned, so that an offset with the top bit set is not
         *    sign-extended by the right shift;
         *  - 64-bit, so that on configurations where pgoff_t is narrower
         *    than loff_t the upper bits of the byte value still contribute
         *    to the page index instead of being discarded before the shift.
         * When pgoff_t is as wide as loff_t the result is unchanged.
         */
        pgoff_t hba = (pgoff_t)((unsigned long long)holebegin >> PAGE_SHIFT);
        pgoff_t hlen = (pgoff_t)(((unsigned long long)holelen +
                                  PAGE_SIZE - 1) >> PAGE_SHIFT);

        /*
         * Check for overflow: only relevant when loff_t is wider than
         * pgoff_t. If the page index of the end of the hole does not fit
         * into an unsigned long, clamp the length so that the range ends at
         * the last representable page index.
         */
        if (sizeof(holelen) > sizeof(hlen)) {
                long long holeend =
                        (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
                if (holeend & ~(long long)ULONG_MAX)
                        hlen = ULONG_MAX - hba + 1;
        }

        /* A resulting hlen of 0 makes unmap_mapping_pages() unmap to the end. */
        unmap_mapping_pages(mapping, hba, hlen, even_cows);
}
EXPORT_SYMBOL(unmap_mapping_range);

/*
 * Restore a potential device exclusive pte to a working pte entry
 */
static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
{
        /* vmf->page must have been set by the caller before calling this. */
        struct folio *folio = page_folio(vmf->page);
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
        vm_fault_t ret;

        /*
         * We need a reference to lock the folio because we don't hold
         * the PTL so a racing thread can remove the device-exclusive
         * entry and unmap it. If the folio is free the entry must
         * have been removed already. If it happens to have already
         * been re-allocated after being freed all we do is lock and
         * unlock it.
         */
        if (!folio_try_get(folio))
                return 0;

        /*
         * A non-zero result means the folio lock was not obtained: drop the
         * reference taken above and pass the result on to the caller.
         */
        ret = folio_lock_or_retry(folio, vmf);
        if (ret) {
                folio_put(folio);
                return ret;
        }
        /* The notifier range covers exactly the page containing the fault. */
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_CLEAR, 0,
                                vma->vm_mm, vmf->address & PAGE_MASK,
                                (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Re-take the PTE lock and only restore the entry if the PTE is
         * still the one this fault was raised for.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                restore_exclusive_pte(vma, folio, vmf->page, vmf->address,
                                      vmf->pte, vmf->orig_pte);

        /* vmf->pte is NULL when mapping the page table failed: nothing to unlock. */
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
        folio_put(folio);

        mmu_notifier_invalidate_range_end(&range);
        return 0;
}

/*
 * Heuristic used on swap-in: is it worth trying to free the swapcache entry
 * of @folio right away?
 */
static inline bool should_try_to_free_swap(struct folio *folio,
                                           struct vm_area_struct *vma,
                                           unsigned int fault_flags)
{
        /* Nothing to free if the folio is not in the swapcache. */
        if (!folio_test_swapcache(folio))
                return false;

        /* Swap is full for the memcg, or the memory is locked: always try. */
        if (mem_cgroup_swap_full(folio))
                return true;
        if (vma->vm_flags & VM_LOCKED)
                return true;
        if (folio_test_mlocked(folio))
                return true;

        /*
         * If we want to map a page that's in the swapcache writable, we
         * have to detect via the refcount if we're really the exclusive
         * user. Try freeing the swapcache to get rid of the swapcache
         * reference only in case it's likely that we'll be the exclusive user.
         */
        if (!(fault_flags & FAULT_FLAG_WRITE))
                return false;
        if (folio_test_ksm(folio))
                return false;

        /* Presumably: our own reference plus one per page from the swapcache. */
        return folio_ref_count(folio) == (1 + folio_nr_pages(folio));
}

/*
 * Clear the PTE at vmf->address, but only if it still holds the value the
 * fault was raised for. Always returns 0.
 */
static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
{
        struct mm_struct *mm = vmf->vma->vm_mm;

        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
        if (!vmf->pte)
                return 0;

        /*
         * Only turn the marker into a none pte when the pte is unchanged
         * since the fault; a different value means it was modified
         * concurrently and the fault should simply be retried.
         *
         * Comparing the whole pte (rather than just testing for "is a
         * marker") also covers a quick switch from PTE_MARKER_UFFD_WP to
         * PTE_MARKER_POISONED, which must not be dropped here.
         */
        if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
                pte_clear(mm, vmf->address, vmf->pte);

        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * Handle a fault on a missing pte: anonymous VMAs go to
 * do_anonymous_page(), everything else to do_fault().
 */
static vm_fault_t do_pte_missing(struct vm_fault *vmf)
{
        if (!vma_is_anonymous(vmf->vma))
                return do_fault(vmf);

        return do_anonymous_page(vmf);
}

/*
 * This is actually a page-missing access, but with uffd-wp special pte
 * installed.  It means this pte was wr-protected before being unmapped.
 */
static vm_fault_t pte_marker_handle_uffd_wp(struct vm_fault *vmf)
{
        /* Common case: uffd-wp is still active, treat as a missing page. */
        if (likely(userfaultfd_wp(vmf->vma)))
                return do_pte_missing(vmf);

        /*
         * The region is no longer registered for uffd-wp but a special pte
         * was left behind: it can simply be cleared.
         */
        return pte_marker_clear(vmf);
}

/*
 * Dispatch a fault on a PTE marker entry according to the marker bits
 * stored in vmf->orig_pte.
 */
static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
{
        const swp_entry_t swp = pte_to_swp_entry(vmf->orig_pte);
        const unsigned long marker_bits = pte_marker_get(swp);

        /*
         * A marker without any bit set should never exist. If it does,
         * something went badly wrong; killing the process together with
         * its mm is the best we can do.
         */
        if (WARN_ON_ONCE(!marker_bits))
                return VM_FAULT_SIGBUS;

        /* Corrupted data takes priority over uffd-wp. */
        if (marker_bits & PTE_MARKER_POISONED)
                return VM_FAULT_HWPOISON;

        /* Touching a guard page is always fatal. */
        if (marker_bits & PTE_MARKER_GUARD)
                return VM_FAULT_SIGSEGV;

        /* Anything that is not uffd-wp at this point is an unknown marker. */
        if (!pte_marker_entry_uffd_wp(swp))
                return VM_FAULT_SIGBUS;

        return pte_marker_handle_uffd_wp(vmf);
}

/*
 * Allocate an order-0 folio for swapping in the entry stored in
 * vmf->orig_pte and charge it via mem_cgroup_swapin_charge_folio().
 *
 * Returns the folio, or NULL if either the allocation or the charge failed.
 */
static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *new_folio;

        new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
                                    vmf->address);
        if (!new_folio)
                return NULL;

        /* A zero return from the charge helper means success. */
        if (!mem_cgroup_swapin_charge_folio(new_folio, vma->vm_mm, GFP_KERNEL,
                                            pte_to_swp_entry(vmf->orig_pte)))
                return new_folio;

        /* Charging failed: give the freshly allocated folio back. */
        folio_put(new_folio);
        return NULL;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Count how many consecutive swap slots, starting at @entry and looking at
 * no more than @max_nr of them, do not have SWAP_HAS_CACHE set in the swap
 * map.
 */
static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
{
        struct swap_info_struct *si = swp_swap_info(entry);
        pgoff_t offset = swp_offset(entry);
        int i = 0;

        /*
         * This is used when a large folio is allocated and read with
         * swap_read_folio() because the faulting pte has no swapcache.
         * All the other PTEs covered must have no swapcache either,
         * otherwise we might read from the swap device although the
         * content lives in the swapcache.
         */
        while (i < max_nr && !(si->swap_map[offset + i] & SWAP_HAS_CACHE))
                i++;

        return i;
}

/*
 * Check if the PTEs within a range are contiguous swap entries
 * and have consistent swapcache, zeromap.
 */
static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
{
        /* Start of the naturally aligned range containing the fault. */
        unsigned long base = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
        /* Index of the faulting page inside that range. */
        int idx = (vmf->address - base) / PAGE_SIZE;
        pte_t first_pte = ptep_get(ptep);
        swp_entry_t entry;

        /*
         * The first pte of the range must be the faulting swap pte moved
         * back by idx slots ...
         */
        if (!pte_same(first_pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
                return false;

        /* ... and the whole range must form one batch of swap ptes. */
        if (swap_pte_batch(ptep, nr_pages, first_pte) != nr_pages)
                return false;

        entry = pte_to_swp_entry(first_pte);

        /*
         * swap_read_folio() can't handle a large folio whose pages come
         * from a mix of different backends. Such mixes are likely corner
         * cases. Similar checks might be added once zswap supports large
         * folios.
         */
        if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
                return false;
        if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
                return false;

        return true;
}

/*
 * Walk the candidate @orders, starting with the one reported by
 * highest_order(), and stop at the first order for which the virtual page
 * number of @addr and @swp_offset have the same remainder modulo the number
 * of pages of that order.
 *
 * Returns the order mask as left by next_order() at that point, or 0 when no
 * candidate order qualified.
 */
static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
                                                     unsigned long addr,
                                                     unsigned long orders)
{
        const unsigned long vpage = addr >> PAGE_SHIFT;
        int order;

        /*
         * To swap in a THP with nr pages, its first swap_offset has to be
         * aligned with that number, just like it was when the THP got
         * swapped out. This filters out most invalid entries.
         */
        for (order = highest_order(orders); orders;
             order = next_order(&orders, order)) {
                int nr = 1 << order;

                if (vpage % nr == swp_offset % nr)
                        break;
        }

        return orders;
}

/*
 * Allocate and memcg-charge a folio for swapping in the entry stored in
 * vmf->orig_pte. A large folio is tried first, from the highest order that
 * passes all suitability checks downwards; if none is suitable or none can
 * be allocated and charged, fall back to __alloc_swap_folio().
 *
 * Returns the folio, or whatever __alloc_swap_folio() returns on fallback.
 */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long orders;
        struct folio *folio;
        unsigned long addr;
        swp_entry_t entry;
        spinlock_t *ptl;
        pte_t *pte;
        gfp_t gfp;
        int order;

        /*
         * If uffd is active for the vma we need per-page fault fidelity to
         * maintain the uffd semantics.
         */
        if (unlikely(userfaultfd_armed(vma)))
                goto fallback;

        /*
         * A large swapped out folio could be partially or fully in zswap. We
         * lack handling for such cases, so fallback to swapping in order-0
         * folio.
         */
        if (!zswap_never_enabled())
                goto fallback;

        entry = pte_to_swp_entry(vmf->orig_pte);
        /*
         * Get a list of all the (large) orders below PMD_ORDER that are enabled
         * and suitable for swapping THP.
         */
        orders = thp_vma_allowable_orders(vma, vma->vm_flags,
                        TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
        orders = thp_vma_suitable_orders(vma, vmf->address, orders);
        orders = thp_swap_suitable_orders(swp_offset(entry),
                                          vmf->address, orders);

        if (!orders)
                goto fallback;

        /*
         * Map and lock the page table at the start of the PMD range, so that
         * pte + pte_index(addr) below addresses the pte of any aligned
         * candidate range.
         */
        pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                  vmf->address & PMD_MASK, &ptl);
        if (unlikely(!pte))
                goto fallback;

        /*
         * For do_swap_page, find the highest order where the aligned range is
         * completely swap entries with contiguous swap offsets.
         */
        order = highest_order(orders);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
                        break;
                order = next_order(&orders, order);
        }

        /* The page table lock is released before any allocation is attempted. */
        pte_unmap_unlock(pte, ptl);

        /* Try allocating the highest of the remaining orders. */
        gfp = vma_thp_gfp_mask(vma);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, addr);
                if (folio) {
                        /* A zero return from the charge helper means success. */
                        if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
                                                            gfp, entry))
                                return folio;
                        /* Charge failed: account it and free the folio. */
                        count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
                        folio_put(folio);
                }
                /* Reached for both allocation and charge failures. */
                count_mthp_stat(order, MTHP_STAT_SWPIN_FALLBACK);
                order = next_order(&orders, order);
        }

fallback:
        return __alloc_swap_folio(vmf);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE no large folio is attempted: swap-in
 * always uses the order-0 folio allocated by __alloc_swap_folio().
 */
static struct folio *alloc_swap_folio(struct vm_fault *vmf)
{
        return __alloc_swap_folio(vmf);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Wait queue used by do_swap_page(): when swapcache_prepare() fails on the
 * swapcache-bypassing swap-in path, the faulting task queues itself here and
 * sleeps uninterruptibly with a timeout of 1 before backing out of the fault.
 */
static DECLARE_WAIT_QUEUE_HEAD(swapcache_wq);

/*
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with pte unmapped and unlocked.
 *
 * We return with the mmap_lock locked or unlocked in the same cases
 * as does filemap_fault().
 */
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *swapcache, *folio = NULL;
        DECLARE_WAITQUEUE(wait, current);
        struct page *page;
        struct swap_info_struct *si = NULL;
        rmap_t rmap_flags = RMAP_NONE;
        bool need_clear_cache = false;
        bool exclusive = false;
        swp_entry_t entry;
        pte_t pte;
        vm_fault_t ret = 0;
        void *shadow = NULL;
        int nr_pages;
        unsigned long page_idx;
        unsigned long address;
        pte_t *ptep;

        if (!pte_unmap_same(vmf))
                goto out;

        entry = pte_to_swp_entry(vmf->orig_pte);
        if (unlikely(non_swap_entry(entry))) {
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
                } else if (is_device_exclusive_entry(entry)) {
                        vmf->page = pfn_swap_entry_to_page(entry);
                        ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
                        if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
                                /*
                                 * migrate_to_ram is not yet ready to operate
                                 * under VMA lock.
                                 */
                                vma_end_read(vma);
                                ret = VM_FAULT_RETRY;
                                goto out;
                        }

                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (unlikely(!vmf->pte ||
                                     !pte_same(ptep_get(vmf->pte),
                                                        vmf->orig_pte)))
                                goto unlock;

                        /*
                         * Get a page reference while we know the page can't be
                         * freed.
                         */
                        if (trylock_page(vmf->page)) {
                                struct dev_pagemap *pgmap;

                                get_page(vmf->page);
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                                pgmap = page_pgmap(vmf->page);
                                ret = pgmap->ops->migrate_to_ram(vmf);
                                unlock_page(vmf->page);
                                put_page(vmf->page);
                        } else {
                                pte_unmap_unlock(vmf->pte, vmf->ptl);
                        }
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
                } else if (is_pte_marker_entry(entry)) {
                        ret = handle_pte_marker(vmf);
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }

        /* Prevent swapoff from happening to us. */
        si = get_swap_device(entry);
        if (unlikely(!si))
                goto out;

        folio = swap_cache_get_folio(entry, vma, vmf->address);
        if (folio)
                page = folio_file_page(folio, swp_offset(entry));
        swapcache = folio;

        if (!folio) {
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
                        folio = alloc_swap_folio(vmf);
                        if (folio) {
                                __folio_set_locked(folio);
                                __folio_set_swapbacked(folio);

                                nr_pages = folio_nr_pages(folio);
                                if (folio_test_large(folio))
                                        entry.val = ALIGN_DOWN(entry.val, nr_pages);
                                /*
                                 * Prevent parallel swapin from proceeding with
                                 * the cache flag. Otherwise, another thread
                                 * may finish swapin first, free the entry, and
                                 * swapout reusing the same entry. It's
                                 * undetectable as pte_same() returns true due
                                 * to entry reuse.
                                 */
                                if (swapcache_prepare(entry, nr_pages)) {
                                        /*
                                         * Relax a bit to prevent rapid
                                         * repeated page faults.
                                         */
                                        add_wait_queue(&swapcache_wq, &wait);
                                        schedule_timeout_uninterruptible(1);
                                        remove_wait_queue(&swapcache_wq, &wait);
                                        goto out_page;
                                }
                                need_clear_cache = true;

                                memcg1_swapin(entry, nr_pages);

                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(folio, shadow);

                                folio_add_lru(folio);

                                /* To provide entry to swap_read_folio() */
                                folio->swap = entry;
                                swap_read_folio(folio, NULL);
                                folio->private = NULL;
                        }
                } else {
                        folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                vmf);
                        swapcache = folio;
                }

                if (!folio) {
                        /*
                         * Back out if somebody else faulted in this pte
                         * while we released the pte lock.
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
                        if (likely(vmf->pte &&
                                   pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        goto unlock;
                }

                /* Had to read the page from swap area: Major fault */
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
                page = folio_file_page(folio, swp_offset(entry));
        } else if (PageHWPoison(page)) {
                /*
                 * hwpoisoned dirty swapcache pages are kept for killing
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
                goto out_release;
        }

        ret |= folio_lock_or_retry(folio, vmf);
        if (ret & VM_FAULT_RETRY)
                goto out_release;

        if (swapcache) {
                /*
                 * Make sure folio_free_swap() or swapoff did not release the
                 * swapcache from under us.  The page pin, and pte_same test
                 * below, are not enough to exclude that.  Even if it is still
                 * swapcache, we need to check that the page's swap has not
                 * changed.
                 */
                if (unlikely(!folio_test_swapcache(folio) ||
                             page_swap_entry(page).val != entry.val))
                        goto out_page;

                /*
                 * KSM sometimes has to copy on read faults, for example, if
                 * page->index of !PageKSM() pages would be nonlinear inside the
                 * anon VMA -- PageKSM() is lost on actual swapout.
                 */
                folio = ksm_might_need_to_copy(folio, vma, vmf->address);
                if (unlikely(!folio)) {
                        ret = VM_FAULT_OOM;
                        folio = swapcache;
                        goto out_page;
                } else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                        ret = VM_FAULT_HWPOISON;
                        folio = swapcache;
                        goto out_page;
                }
                if (folio != swapcache)
                        page = folio_page(folio, 0);

                /*
                 * If we want to map a page that's in the swapcache writable, we
                 * have to detect via the refcount if we're really the exclusive
                 * owner. Try removing the extra reference from the local LRU
                 * caches if required.
                 */
                if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
                    !folio_test_ksm(folio) && !folio_test_lru(folio))
                        lru_add_drain();
        }

        folio_throttle_swaprate(folio, GFP_KERNEL);

        /*
         * Back out if somebody else already faulted in this pte.
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
        if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                goto out_nomap;

        if (unlikely(!folio_test_uptodate(folio))) {
                ret = VM_FAULT_SIGBUS;
                goto out_nomap;
        }

        /* allocated large folios for SWP_SYNCHRONOUS_IO */
        if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
                unsigned long nr = folio_nr_pages(folio);
                unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
                unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
                pte_t *folio_ptep = vmf->pte - idx;
                pte_t folio_pte = ptep_get(folio_ptep);

                if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
                    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
                        goto out_nomap;

                page_idx = idx;
                address = folio_start;
                ptep = folio_ptep;
                goto check_folio;
        }

        nr_pages = 1;
        page_idx = 0;
        address = vmf->address;
        ptep = vmf->pte;
        if (folio_test_large(folio) && folio_test_swapcache(folio)) {
                int nr = folio_nr_pages(folio);
                unsigned long idx = folio_page_idx(folio, page);
                unsigned long folio_start = address - idx * PAGE_SIZE;
                unsigned long folio_end = folio_start + nr * PAGE_SIZE;
                pte_t *folio_ptep;
                pte_t folio_pte;

                if (unlikely(folio_start < max(address & PMD_MASK, vma->vm_start)))
                        goto check_folio;
                if (unlikely(folio_end > pmd_addr_end(address, vma->vm_end)))
                        goto check_folio;

                folio_ptep = vmf->pte - idx;
                folio_pte = ptep_get(folio_ptep);
                if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
                    swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
                        goto check_folio;

                page_idx = idx;
                address = folio_start;
                ptep = folio_ptep;
                nr_pages = nr;
                entry = folio->swap;
                page = &folio->page;
        }

check_folio:
        /*
         * PG_anon_exclusive reuses PG_mappedtodisk for anon pages. A swap pte
         * must never point at an anonymous page in the swapcache that is
         * PG_anon_exclusive. Sanity check that this holds and especially, that
         * no filesystem set PG_mappedtodisk on a page in the swapcache. Sanity
         * check after taking the PT lock and making sure that nobody
         * concurrently faulted in this page and set PG_anon_exclusive.
         */
        BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio));
        BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page));

        /*
         * Check under PT lock (to protect against concurrent fork() sharing
         * the swap entry concurrently) for certainly exclusive pages.
         */
        if (!folio_test_ksm(folio)) {
                exclusive = pte_swp_exclusive(vmf->orig_pte);
                if (folio != swapcache) {
                        /*
                         * We have a fresh page that is not exposed to the
                         * swapcache -> certainly exclusive.
                         */
                        exclusive = true;
                } else if (exclusive && folio_test_writeback(folio) &&
                          data_race(si->flags & SWP_STABLE_WRITES)) {
                        /*
                         * This is tricky: not all swap backends support
                         * concurrent page modifications while under writeback.
                         *
                         * So if we stumble over such a page in the swapcache
                         * we must not set the page exclusive, otherwise we can
                         * map it writable without further checks and modify it
                         * while still under writeback.
                         *
                         * For these problematic swap backends, simply drop the
                         * exclusive marker: this is perfectly fine as we start
                         * writeback only if we fully unmapped the page and
                         * there are no unexpected references on the page after
                         * unmapping succeeded. After fully unmapped, no
                         * further GUP references (FOLL_GET and FOLL_PIN) can
                         * appear, so dropping the exclusive marker and mapping
                         * it only R/O is fine.
                         */
                        exclusive = false;
                }
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        /*
         * Remove the swap entry and conditionally try to free up the swapcache.
         * We're already holding a reference on the page but haven't mapped it
         * yet.
         */
        swap_free_nr(entry, nr_pages);
        if (should_try_to_free_swap(folio, vma, vmf->flags))
                folio_free_swap(folio);

        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
        add_mm_counter(vma->vm_mm, MM_SWAPENTS, -nr_pages);
        pte = mk_pte(page, vma->vm_page_prot);
        if (pte_swp_soft_dirty(vmf->orig_pte))
                pte = pte_mksoft_dirty(pte);
        if (pte_swp_uffd_wp(vmf->orig_pte))
                pte = pte_mkuffd_wp(pte);

        /*
         * Same logic as in do_wp_page(); however, optimize for pages that are
         * certainly not shared either because we just allocated them without
         * exposing them to the swapcache or because the swap entry indicates
         * exclusivity.
         */
        if (!folio_test_ksm(folio) &&
            (exclusive || folio_ref_count(folio) == 1)) {
                if ((vma->vm_flags & VM_WRITE) && !userfaultfd_pte_wp(vma, pte) &&
                    !pte_needs_soft_dirty_wp(vma, pte)) {
                        pte = pte_mkwrite(pte, vma);
                        if (vmf->flags & FAULT_FLAG_WRITE) {
                                pte = pte_mkdirty(pte);
                                vmf->flags &= ~FAULT_FLAG_WRITE;
                        }
                }
                rmap_flags |= RMAP_EXCLUSIVE;
        }
        folio_ref_add(folio, nr_pages - 1);
        flush_icache_pages(vma, page, nr_pages);
        vmf->orig_pte = pte_advance_pfn(pte, page_idx);

        /* ksm created a completely new copy */
        if (unlikely(folio != swapcache && swapcache)) {
                folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, vma);
        } else if (!folio_test_anon(folio)) {
                /*
                 * We currently only expect small !anon folios which are either
                 * fully exclusive or fully shared, or new allocated large
                 * folios which are fully exclusive. If we ever get large
                 * folios within swapcache here, we have to be careful.
                 */
                VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
                VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
                folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
        } else {
                folio_add_anon_rmap_ptes(folio, page, nr_pages, vma, address,
                                        rmap_flags);
        }

        VM_BUG_ON(!folio_test_anon(folio) ||
                        (pte_write(pte) && !PageAnonExclusive(page)));
        set_ptes(vma->vm_mm, address, ptep, pte, nr_pages);
        arch_do_swap_page_nr(vma->vm_mm, vma, address,
                        pte, pte, nr_pages);

        folio_unlock(folio);
        if (folio != swapcache && swapcache) {
                /*
                 * Hold the lock to avoid the swap entry to be reused
                 * until we take the PT lock for the pte_same() check
                 * (to avoid false positives from pte_same). For
                 * further safety release the lock after the swap_free
                 * so that the swap count won't change under a
                 * parallel locked swapcache.
                 */
                folio_unlock(swapcache);
                folio_put(swapcache);
        }

        if (vmf->flags & FAULT_FLAG_WRITE) {
                ret |= do_wp_page(vmf);
                if (ret & VM_FAULT_ERROR)
                        ret &= VM_FAULT_ERROR;
                goto out;
        }

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, address, ptep, nr_pages);
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
        /* Clear the swap cache pin for direct swapin after PTL unlock */
        if (need_clear_cache) {
                swapcache_clear(si, entry, nr_pages);
                if (waitqueue_active(&swapcache_wq))
                        wake_up(&swapcache_wq);
        }
        if (si)
                put_swap_device(si);
        return ret;
out_nomap:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
        folio_unlock(folio);
out_release:
        folio_put(folio);
        if (folio != swapcache && swapcache) {
                folio_unlock(swapcache);
                folio_put(swapcache);
        }
        if (need_clear_cache) {
                swapcache_clear(si, entry, nr_pages);
                if (waitqueue_active(&swapcache_wq))
                        wake_up(&swapcache_wq);
        }
        if (si)
                put_swap_device(si);
        return ret;
}

/*
 * Check whether @nr_pages consecutive page table entries, starting at @pte,
 * are all pte_none(). Each entry is sampled with ptep_get_lockless().
 */
static bool pte_range_none(pte_t *pte, int nr_pages)
{
        int idx = 0;

        while (idx < nr_pages) {
                /* The first populated entry decides the answer. */
                if (!pte_none(ptep_get_lockless(&pte[idx])))
                        return false;
                idx++;
        }

        return true;
}

/*
 * Allocate the folio that will back an anonymous fault at vmf->address.
 *
 * With CONFIG_TRANSPARENT_HUGEPAGE, the large orders below PMD_ORDER that are
 * enabled and suitable for the vma are tried from the highest downwards,
 * restricted to orders whose naturally aligned PTE range around the faulting
 * address is still entirely pte_none(). A large folio returned from here has
 * been charged via mem_cgroup_charge() and, if user_alloc_needs_zeroing(),
 * zeroed with folio_zero_user().
 *
 * In every other case (uffd armed on the vma, no usable order, or all large
 * allocations/charges failed) the result of folio_prealloc() is returned.
 *
 * Return: the folio, ERR_PTR(-EAGAIN) if the PTE table could not be mapped,
 * or whatever folio_prealloc() returns (do_anonymous_page() treats NULL as
 * OOM).
 */
static struct folio *alloc_anon_folio(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unsigned long orders;
        struct folio *folio;
        unsigned long addr;
        pte_t *pte;
        gfp_t gfp;
        int order;

        /*
         * If uffd is active for the vma we need per-page fault fidelity to
         * maintain the uffd semantics.
         */
        if (unlikely(userfaultfd_armed(vma)))
                goto fallback;

        /*
         * Get a list of all the (large) orders below PMD_ORDER that are enabled
         * for this vma. Then filter out the orders that can't be allocated over
         * the faulting address and still be fully contained in the vma.
         */
        orders = thp_vma_allowable_orders(vma, vma->vm_flags,
                        TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
        orders = thp_vma_suitable_orders(vma, vmf->address, orders);

        if (!orders)
                goto fallback;

        /*
         * Map the PTE table without taking the PT lock; the range check below
         * is only a hint. do_anonymous_page() repeats it under the lock.
         */
        pte = pte_offset_map(vmf->pmd, vmf->address & PMD_MASK);
        if (!pte)
                return ERR_PTR(-EAGAIN);

        /*
         * Find the highest order where the aligned range is completely
         * pte_none(). Note that all remaining orders will be completely
         * pte_none().
         */
        order = highest_order(orders);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                if (pte_range_none(pte + pte_index(addr), 1 << order))
                        break;
                order = next_order(&orders, order);
        }

        pte_unmap(pte);

        /* Every candidate order overlapped an already populated entry. */
        if (!orders)
                goto fallback;

        /* Try allocating the highest of the remaining orders. */
        gfp = vma_thp_gfp_mask(vma);
        while (orders) {
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, addr);
                if (folio) {
                        /* A failed charge drops the folio and tries a smaller order. */
                        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                                folio_put(folio);
                                goto next;
                        }
                        folio_throttle_swaprate(folio, gfp);
                        /*
                         * When a folio is not zeroed during allocation
                         * (__GFP_ZERO not used) or user folios require special
                         * handling, folio_zero_user() is used to make sure
                         * that the page corresponding to the faulting address
                         * will be hot in the cache after zeroing.
                         */
                        if (user_alloc_needs_zeroing())
                                folio_zero_user(folio, vmf->address);
                        return folio;
                }
next:
                /* Reached for both allocation failure and charge failure. */
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
                order = next_order(&orders, order);
        }

fallback:
#endif
        return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
}

/*
 * Handle a fault on a not-yet-populated anonymous address.
 *
 * Read faults install the shared zero page (unless the mm forbids it); all
 * other faults allocate a private folio via alloc_anon_folio(), which may be
 * a large folio covering several PTEs around the faulting address.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults), and pte mapped but not yet locked.
 * We return with mmap_lock still held, but pte unmapped and unlocked.
 *
 * Return: 0 when the fault was handled or should simply be retried,
 * VM_FAULT_SIGBUS for a VM_SHARED vma, VM_FAULT_OOM on allocation failure,
 * or the value returned by vmf_anon_prepare(), check_stable_address_space()
 * or handle_userfault().
 */
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long addr = vmf->address;
        struct folio *folio;
        vm_fault_t ret = 0;
        int nr_pages = 1;
        pte_t entry;

        /* File mapping without ->vm_ops ? */
        if (vma->vm_flags & VM_SHARED)
                return VM_FAULT_SIGBUS;

        /*
         * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
         * be distinguished from a transient failure of pte_offset_map().
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;

        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
                entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address),
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
                /* Could not map the PTE table: return 0 (ret is still 0). */
                if (!vmf->pte)
                        goto unlock;
                /* Someone else populated the entry while we were unlocked. */
                if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
                }
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock;
                /* Deliver the page fault to userland, check inside PT lock */
                if (userfaultfd_missing(vma)) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_MISSING);
                }
                /* nr_pages is still 1 and addr is still vmf->address here. */
                goto setpte;
        }

        /* Allocate our own private page. */
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        /* Returns NULL on OOM or ERR_PTR(-EAGAIN) if we must retry the fault */
        folio = alloc_anon_folio(vmf);
        if (IS_ERR(folio))
                return 0;
        if (!folio)
                goto oom;

        /* Map the whole folio: start at its naturally aligned base address. */
        nr_pages = folio_nr_pages(folio);
        addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);

        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __folio_mark_uptodate(folio);

        entry = mk_pte(&folio->page, vma->vm_page_prot);
        entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry), vma);

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte)
                goto release;
        /*
         * Re-check under the PT lock: a single page must see an unchanged
         * entry, a large folio needs its whole range to still be empty.
         */
        if (nr_pages == 1 && vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, addr, vmf->pte);
                goto release;
        } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
                update_mmu_tlb_range(vma, addr, vmf->pte, nr_pages);
                goto release;
        }

        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;

        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                folio_put(folio);
                return handle_userfault(vmf, VM_UFFD_MISSING);
        }

        /* One reference per mapped page; one is already held by us. */
        folio_ref_add(folio, nr_pages - 1);
        add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
        count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
        folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
        folio_add_lru_vma(folio, vma);
setpte:
        if (vmf_orig_pte_uffd_wp(vmf))
                entry = pte_mkuffd_wp(entry);
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr_pages);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr_pages);
unlock:
        if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
release:
        /* Drop the unused folio, then share the unlock path. */
        folio_put(folio);
        goto unlock;
oom:
        return VM_FAULT_OOM;
}

/*
 * Call the vma's ->fault() handler and post-process its result.
 *
 * The mmap_lock must have been held on entry, and may have been
 * released depending on flags and vma->vm_ops->fault() return value.
 * See filemap_fault() and __lock_page_retry().
 *
 * When ->fault() reports an error, NOPAGE, RETRY or DONE_COW, its return
 * value is passed straight back. Otherwise the folio of vmf->page is locked
 * on return, unless the page is hardware-poisoned, in which case the
 * reference is dropped, vmf->page is cleared and VM_FAULT_HWPOISON (or
 * VM_FAULT_NOPAGE if the folio could be evicted) is returned.
 */
static vm_fault_t __do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Preallocate pte before we take page_lock because this might lead to
         * deadlocks for memcg reclaim which waits for pages under writeback:
         *                                lock_page(A)
         *                                SetPageWriteback(A)
         *                                unlock_page(A)
         * lock_page(B)
         *                                lock_page(B)
         * pte_alloc_one
         *   shrink_folio_list
         *     wait_on_page_writeback(A)
         *                                SetPageWriteback(B)
         *                                unlock_page(B)
         *                                # flush A, B to clear the writeback
         */
        if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        ret = vma->vm_ops->fault(vmf);
        /* In these cases there is no page for us to post-process. */
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
                            VM_FAULT_DONE_COW)))
                return ret;

        folio = page_folio(vmf->page);
        if (unlikely(PageHWPoison(vmf->page))) {
                vm_fault_t poisonret = VM_FAULT_HWPOISON;
                /* Unmapping/evicting is only attempted when we hold the folio lock. */
                if (ret & VM_FAULT_LOCKED) {
                        if (page_mapped(vmf->page))
                                unmap_mapping_folio(folio);
                        /* Retry if a clean folio was removed from the cache. */
                        if (mapping_evict_folio(folio->mapping, folio))
                                poisonret = VM_FAULT_NOPAGE;
                        folio_unlock(folio);
                }
                folio_put(folio);
                vmf->page = NULL;
                return poisonret;
        }

        /* Callers expect a locked folio: take the lock if ->fault() did not. */
        if (unlikely(!(ret & VM_FAULT_LOCKED)))
                folio_lock(folio);
        else
                VM_BUG_ON_PAGE(!folio_test_locked(folio), vmf->page);

        return ret;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Deposit vmf->prealloc_pte at vmf->pmd and give up ownership of it.
 */
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
        pgtable_trans_huge_deposit(vmf->vma->vm_mm, vmf->pmd,
                                   vmf->prealloc_pte);

        /* The preallocated table is consumed here: count it in nr_ptes. */
        mm_inc_nr_ptes(vmf->vma->vm_mm);
        vmf->prealloc_pte = NULL;
}

/*
 * Try to map the PMD-order folio containing @page with a single PMD entry
 * at the PMD-aligned address around vmf->address.
 *
 * Return: 0 if the PMD mapping was installed, VM_FAULT_FALLBACK if a PMD
 * mapping is not possible here (finish_fault() then maps with PTEs), or
 * VM_FAULT_OOM if the page table needed for the deposit could not be
 * allocated.
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        struct folio *folio = page_folio(page);
        struct vm_area_struct *vma = vmf->vma;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t entry;
        vm_fault_t ret = VM_FAULT_FALLBACK;

        /*
         * It is too late to allocate a small folio, we already have a large
         * folio in the pagecache: especially s390 KVM cannot tolerate any
         * PMD mappings, but PTE-mapped THP are fine. So let's simply refuse any
         * PMD mappings if THPs are disabled.
         */
        if (thp_disabled_by_hw() || vma_thp_disabled(vma, vma->vm_flags))
                return ret;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                return ret;

        /* Only a folio of exactly PMD order can be mapped by one PMD entry. */
        if (folio_order(folio) != HPAGE_PMD_ORDER)
                return ret;
        /* From here on, work with the folio's first page. */
        page = &folio->page;

        /*
         * Just back off if any subpage of a THP is corrupted, otherwise
         * the corrupted page may be mapped by PMD silently and escape the
         * check.  This kind of THP can only be PTE mapped.  Access to
         * the corrupted subpage should trigger SIGBUS as expected.
         */
        if (unlikely(folio_test_has_hwpoisoned(folio)))
                return ret;

        /*
         * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
                vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* The PMD got populated meanwhile: leave with VM_FAULT_FALLBACK. */
        if (unlikely(!pmd_none(*vmf->pmd)))
                goto out;

        flush_icache_pages(vma, page, HPAGE_PMD_NR);

        entry = mk_huge_pmd(page, vma->vm_page_prot);
        if (write)
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);

        add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
        folio_add_file_rmap_pmd(folio, page, vma);

        /*
         * deposit and withdraw with pmd lock held
         */
        if (arch_needs_pgtable_deposit())
                deposit_prealloc_pte(vmf);

        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);

        update_mmu_cache_pmd(vma, haddr, vmf->pmd);

        /* fault is handled */
        ret = 0;
        count_vm_event(THP_FILE_MAPPED);
out:
        spin_unlock(vmf->ptl);
        return ret;
}
#else
/*
 * Without CONFIG_TRANSPARENT_HUGEPAGE there are no PMD mappings: always ask
 * the caller to fall back.
 */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
{
        return VM_FAULT_FALLBACK;
}
#endif

/**
 * set_pte_range - Set a range of PTEs to point to pages in a folio.
 * @vmf: Fault description.
 * @folio: The folio that contains @page.
 * @page: The first page to create a PTE for.
 * @nr: The number of PTEs to create.
 * @addr: The first address to create a PTE for.
 */
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr)
{
        struct vm_area_struct *vma = vmf->vma;
        bool is_write = vmf->flags & FAULT_FLAG_WRITE;
        /* True when the faulting address lies inside [addr, addr + nr pages). */
        bool covers_fault = in_range(vmf->address, addr, nr * PAGE_SIZE);
        pte_t pteval;

        flush_icache_pages(vma, page, nr);
        pteval = mk_pte(page, vma->vm_page_prot);

        /*
         * Entries installed purely as prefault may be created old if the
         * architecture asks for that; everything else is marked young.
         */
        if (covers_fault || !arch_wants_old_prefaulted_pte())
                pteval = pte_sw_mkyoung(pteval);
        else
                pteval = pte_mkold(pteval);

        if (is_write)
                pteval = maybe_mkwrite(pte_mkdirty(pteval), vma);
        if (unlikely(vmf_orig_pte_uffd_wp(vmf)))
                pteval = pte_mkuffd_wp(pteval);

        if (!is_write || (vma->vm_flags & VM_SHARED)) {
                folio_add_file_rmap_ptes(folio, page, nr, vma);
        } else {
                /* copy-on-write page: a write fault on a private mapping */
                VM_BUG_ON_FOLIO(nr != 1, folio);
                folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, vma);
        }

        set_ptes(vma->vm_mm, addr, vmf->pte, pteval, nr);

        /* no need to invalidate: a not-present page won't be cached */
        update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr);
}

static bool vmf_pte_changed(struct vm_fault *vmf)
{
        if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
                return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);

        return !pte_none(ptep_get(vmf->pte));
}

/**
 * finish_fault - finish page fault once we have prepared the page to fault
 *
 * @vmf: structure describing the fault
 *
 * This function handles all that is needed to finish a page fault once the
 * page to fault in is prepared. It handles locking of PTEs, inserts PTE for
 * given page, adds reverse page mapping, handles memcg charges and LRU
 * addition.
 *
 * The function expects the page to be locked and on success it consumes a
 * reference of a page being mapped (for the PTE which maps it).
 *
 * Return: %0 on success, %VM_FAULT_ code in case of error.
 */
vm_fault_t finish_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        struct folio *folio;
        vm_fault_t ret;
        /* A write fault on a private mapping maps vmf->cow_page. */
        bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
                      !(vma->vm_flags & VM_SHARED);
        int type, nr_pages;
        unsigned long addr;
        /* Set once a whole-folio mapping attempt found populated PTEs. */
        bool needs_fallback = false;

fallback:
        /* Restart point: everything below is recomputed for a single page. */
        addr = vmf->address;

        /* Did we COW the page? */
        if (is_cow)
                page = vmf->cow_page;
        else
                page = vmf->page;

        /*
         * check even for read faults because we might have lost our CoWed
         * page
         */
        if (!(vma->vm_flags & VM_SHARED)) {
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        return ret;
        }

        if (pmd_none(*vmf->pmd)) {
                /* Try a PMD mapping first; anything but FALLBACK is final. */
                if (PageTransCompound(page)) {
                        ret = do_set_pmd(vmf, page);
                        if (ret != VM_FAULT_FALLBACK)
                                return ret;
                }

                /* Install a PTE table, preferring the preallocated one. */
                if (vmf->prealloc_pte)
                        pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
                else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
                        return VM_FAULT_OOM;
        }

        folio = page_folio(page);
        nr_pages = folio_nr_pages(folio);

        /*
         * Using per-page fault to maintain the uffd semantics, and same
         * approach also applies to non-anonymous-shmem faults to avoid
         * inflating the RSS of the process.
         */
        if (!vma_is_anon_shmem(vma) || unlikely(userfaultfd_armed(vma)) ||
            unlikely(needs_fallback)) {
                nr_pages = 1;
        } else if (nr_pages > 1) {
                pgoff_t idx = folio_page_idx(folio, page);
                /* The page offset of vmf->address within the VMA. */
                pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
                /* The index of the entry in the pagetable for fault page. */
                pgoff_t pte_off = pte_index(vmf->address);

                /*
                 * Fall back to a per-page fault if the folio would extend
                 * beyond the VMA or beyond the PTE table that covers the
                 * faulting address.
                 */
                if (unlikely(vma_off < idx ||
                            vma_off + (nr_pages - idx) > vma_pages(vma) ||
                            pte_off < idx ||
                            pte_off + (nr_pages - idx)  > PTRS_PER_PTE)) {
                        nr_pages = 1;
                } else {
                        /* Now we can set mappings for the whole large folio. */
                        addr = vmf->address - idx * PAGE_SIZE;
                        page = &folio->page;
                }
        }

        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                       addr, &vmf->ptl);
        if (!vmf->pte)
                return VM_FAULT_NOPAGE;

        /* Re-check under ptl */
        if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
                update_mmu_tlb(vma, addr, vmf->pte);
                ret = VM_FAULT_NOPAGE;
                goto unlock;
        } else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
                /* Part of the range is populated: retry with a single page. */
                needs_fallback = true;
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto fallback;
        }

        /* One folio reference per PTE; the caller's reference covers one. */
        folio_ref_add(folio, nr_pages - 1);
        set_pte_range(vmf, folio, page, nr_pages, addr);
        type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
        add_mm_counter(vma->vm_mm, type, nr_pages);
        ret = 0;

unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
}

/*
 * Number of pages do_fault_around() tries to map per fault. Defaults to
 * 65536 bytes' worth of pages.
 */
static unsigned long fault_around_pages __read_mostly =
        65536 >> PAGE_SHIFT;

#ifdef CONFIG_DEBUG_FS
/* debugfs read hook: report the fault-around window in bytes. */
static int fault_around_bytes_get(void *data, u64 *val)
{
        u64 bytes = fault_around_pages << PAGE_SHIFT;

        *val = bytes;
        return 0;
}

/*
 * debugfs write hook for the fault-around window.
 *
 * The byte count is rounded down to a power of two before being stored as a
 * page count, because that is what do_fault_around() expects to see. Values
 * covering more than PTRS_PER_PTE pages are rejected with -EINVAL.
 */
static int fault_around_bytes_set(void *data, u64 val)
{
        u64 bytes;

        if (val / PAGE_SIZE > PTRS_PER_PTE)
                return -EINVAL;

        /*
         * Clamp to at least one page. A one-page window means no
         * fault-around at all, see should_fault_around().
         */
        bytes = max(val, PAGE_SIZE);
        fault_around_pages = rounddown_pow_of_two(bytes) >> PAGE_SHIFT;

        return 0;
}
/*
 * File operations for the fault-around debugfs knob, built from the get/set
 * hooks above; the value is formatted as "%llu\n".
 */
DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops,
                fault_around_bytes_get, fault_around_bytes_set, "%llu\n");

/*
 * Create the "fault_around_bytes" debugfs file (mode 0644) backed by
 * fault_around_bytes_fops. Registered as a late initcall; always returns 0.
 */
static int __init fault_around_debugfs(void)
{
        debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
                                   &fault_around_bytes_fops);
        return 0;
}
late_initcall(fault_around_debugfs);
#endif

/*
 * do_fault_around() tries to map few pages around the fault address. The hope
 * is that the pages will be needed soon and this will lower the number of
 * faults to handle.
 *
 * It uses vm_ops->map_pages() to map the pages, which skips the page if it's
 * not ready to be mapped: not up-to-date, locked, etc.
 *
 * This function doesn't cross VMA or page table boundaries, in order to call
 * map_pages() and acquire a PTE lock only once.
 *
 * fault_around_pages defines how many pages we'll try to map.
 * do_fault_around() expects it to be set to a power of two less than or equal
 * to PTRS_PER_PTE.
 *
 * The virtual address of the area that we map is naturally aligned to
 * fault_around_pages * PAGE_SIZE rounded down to the machine page size
 * (and therefore to page order).  This way it's easier to guarantee
 * that we don't cross page table boundaries.
 */
static vm_fault_t do_fault_around(struct vm_fault *vmf)
{
        pgoff_t nr_pages = READ_ONCE(fault_around_pages);
        pgoff_t pte_off = pte_index(vmf->address);
        /* The page offset of vmf->address within the VMA. */
        pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
        pgoff_t from_pte, to_pte;
        vm_fault_t ret;

        /* The PTE offset of the start address, clamped to the VMA. */
        from_pte = max(ALIGN_DOWN(pte_off, nr_pages),
                       pte_off - min(pte_off, vma_off));

        /* The PTE offset of the end address, clamped to the VMA and PTE. */
        to_pte = min3(from_pte + nr_pages, (pgoff_t)PTRS_PER_PTE,
                      pte_off + vma_pages(vmf->vma) - vma_off) - 1;

        if (pmd_none(*vmf->pmd)) {
                vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
                if (!vmf->prealloc_pte)
                        return VM_FAULT_OOM;
        }

        rcu_read_lock();
        ret = vmf->vma->vm_ops->map_pages(vmf,
                        vmf->pgoff + from_pte - pte_off,
                        vmf->pgoff + to_pte - pte_off);
        rcu_read_unlock();

        return ret;
}

/* Decide whether a read fault should attempt fault-around first. */
static inline bool should_fault_around(struct vm_fault *vmf)
{
        /* Needs a ->map_pages() handler and must not be disabled by uffd. */
        if (!vmf->vma->vm_ops->map_pages ||
            uffd_disable_fault_around(vmf->vma))
                return false;

        /* A one-page window leaves nothing to map 'around' the fault. */
        return fault_around_pages > 1;
}

/*
 * Handle a read fault on a mapping with ->fault(): optionally try
 * fault-around, then fetch the page with __do_fault() and map it with
 * finish_fault().
 */
static vm_fault_t do_read_fault(struct vm_fault *vmf)
{
        const vm_fault_t bail = VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                                VM_FAULT_RETRY;
        struct folio *folio;
        vm_fault_t ret;

        /*
         * Give ->map_pages() the first shot; ->fault() is the fallback when
         * the page at this offset is not ready to be mapped (cold cache or
         * something).
         */
        if (should_fault_around(vmf)) {
                ret = do_fault_around(vmf);
                if (ret)
                        return ret;
        }

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & bail))
                return ret;

        ret |= finish_fault(vmf);

        /* __do_fault() handed us a locked folio; unlock it in every case. */
        folio = page_folio(vmf->page);
        folio_unlock(folio);

        /* The mapping did not take over our reference: drop it. */
        if (unlikely(ret & bail))
                folio_put(folio);

        return ret;
}

/*
 * Handle a write fault on a non-shared VMA that provides ->fault().
 *
 * A fresh folio is preallocated and published as vmf->cow_page, the source
 * page is obtained through __do_fault(), its contents are copied into the
 * new folio, and the new folio is then installed with finish_fault().
 */
static vm_fault_t do_cow_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *cow_folio;
        vm_fault_t ret;

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;

        cow_folio = folio_prealloc(vma->vm_mm, vma, vmf->address, false);
        if (!cow_folio)
                return VM_FAULT_OOM;

        vmf->cow_page = &cow_folio->page;

        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                goto drop_cow_folio;
        /* Nothing left to do here when the COW was already completed. */
        if (ret & VM_FAULT_DONE_COW)
                return ret;

        if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) {
                /* The copy failed: report it and skip installing the copy. */
                ret = VM_FAULT_HWPOISON;
        } else {
                __folio_mark_uptodate(cow_folio);
                ret |= finish_fault(vmf);
        }

        /* The source page is released on success and failure alike. */
        unlock_page(vmf->page);
        put_page(vmf->page);
        if (likely(!(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))))
                return ret;

drop_cow_folio:
        folio_put(cow_folio);
        return ret;
}

/*
 * Handle a write fault on a shared VMA that provides ->fault().
 *
 * The page comes from __do_fault(); ->page_mkwrite(), when the VMA has one,
 * is consulted before the page is installed with finish_fault().
 */
static vm_fault_t do_shared_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        vm_fault_t ret, mkwrite_ret;

        ret = vmf_can_call_fault(vmf);
        if (ret)
                return ret;

        ret = __do_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
                return ret;

        folio = page_folio(vmf->page);

        /*
         * Let the backing address space know, if it asked to be told, that
         * the page is about to become writable.
         */
        if (vma->vm_ops->page_mkwrite) {
                folio_unlock(folio);
                mkwrite_ret = do_page_mkwrite(vmf, folio);
                if (unlikely(!mkwrite_ret ||
                             (mkwrite_ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                        folio_put(folio);
                        return mkwrite_ret;
                }
        }

        ret |= finish_fault(vmf);
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) {
                folio_unlock(folio);
                folio_put(folio);
                return ret;
        }

        return ret | fault_dirty_shared_page(vmf);
}

/*
 * Dispatch a fault to the read, COW or shared-write handler.
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes,
 * but allow concurrent faults).
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 * If mmap_lock is released, vma may become invalid (for example
 * by other thread calling munmap()).
 */
static vm_fault_t do_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *vm_mm = vma->vm_mm;
        vm_fault_t ret;

        if (vma->vm_ops->fault) {
                if (!(vmf->flags & FAULT_FLAG_WRITE))
                        ret = do_read_fault(vmf);
                else if (vma->vm_flags & VM_SHARED)
                        ret = do_shared_fault(vmf);
                else
                        ret = do_cow_fault(vmf);
        } else {
                /*
                 * No ->fault(): the VMA was not fully populated on mmap()
                 * or is missing VM_DONTEXPAND.
                 */
                vmf->pte = pte_offset_map_lock(vm_mm, vmf->pmd,
                                               vmf->address, &vmf->ptl);
                if (likely(vmf->pte)) {
                        /*
                         * Holding ptl, look at the pte once more to make
                         * sure we did not merely observe a temporary
                         * clearing.  A R/M/W update of a pte takes ptl,
                         * clears the pte so that hardware cannot modify it
                         * concurrently, and then writes the update.
                         */
                        ret = unlikely(pte_none(ptep_get(vmf->pte))) ?
                                        VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;

                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                } else {
                        ret = VM_FAULT_SIGBUS;
                }
        }

        /* A page table preallocated for this fault went unused: free it. */
        if (vmf->prealloc_pte) {
                pte_free(vm_mm, vmf->prealloc_pte);
                vmf->prealloc_pte = NULL;
        }
        return ret;
}

/*
 * Do the bookkeeping for a NUMA hinting fault on @folio and ask the memory
 * policy where the folio should live.
 *
 * @flags accumulates TNF_* bits, @last_cpupid receives the cpupid to report
 * for the folio, and the return value is whatever mpol_misplaced() yields.
 */
int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int *flags,
                      bool writable, int *last_cpupid)
{
        struct vm_area_struct *vma = vmf->vma;

        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
         * the case where a mapping is writable but the process never writes
         * to it but pte_write gets cleared during protection updates and
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
        if (!writable)
                *flags |= TNF_NO_GROUP;

        /*
         * Note when the folio is shared between multiple address spaces;
         * the flag is consulted later when deciding whether to group tasks
         * together.
         */
        if (folio_maybe_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
                *flags |= TNF_SHARED;

        /*
         * In memory tiering mode the cpupid of a slow memory page records
         * the page access time instead, so report the default value.
         */
        *last_cpupid = folio_use_access_time(folio) ?
                        (-1 & LAST_CPUPID_MASK) : folio_last_cpupid(folio);

        /* Record the current PID accessing the VMA */
        vma_set_access_pid_bit(vma);

        count_vm_numa_event(NUMA_HINT_FAULTS);
#ifdef CONFIG_NUMA_BALANCING
        count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
#endif
        /* The folio already sits on the node that took the fault. */
        if (folio_nid(folio) == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                *flags |= TNF_FAULT_LOCAL;
        }

        return mpol_misplaced(folio, vmf, addr);
}

/*
 * Rewrite one PTE with the VMA's page protections, marked young and,
 * when @writable is set, writable.
 */
static void numa_rebuild_single_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                        unsigned long fault_addr, pte_t *fault_pte,
                                        bool writable)
{
        pte_t old_pte, new_pte;

        old_pte = ptep_modify_prot_start(vma, fault_addr, fault_pte);

        new_pte = pte_mkyoung(pte_modify(old_pte, vma->vm_page_prot));
        if (writable)
                new_pte = pte_mkwrite(new_pte, vma);

        ptep_modify_prot_commit(vma, fault_addr, fault_pte, old_pte, new_pte);
        update_mmu_cache_range(vmf, vma, fault_addr, fault_pte, 1);
}

/*
 * Restore every protnone PTE that maps @folio, limited to the part of the
 * folio that lies within both the VMA and the page table of the fault.
 */
static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_struct *vma,
                                       struct folio *folio, pte_t fault_pte,
                                       bool ignore_writable, bool pte_write_upgrade)
{
        /* Index of the faulting page within the folio. */
        int page_idx = pte_pfn(fault_pte) - folio_pfn(folio);
        unsigned long start, end, addr = vmf->address;
        unsigned long folio_start = addr - (page_idx << PAGE_SHIFT);
        unsigned long table_start = ALIGN_DOWN(addr, PMD_SIZE);
        pte_t *ptep;

        /* Stay within the VMA and within the page table. */
        start = max3(folio_start, table_start, vma->vm_start);
        end = min3(folio_start + folio_size(folio), table_start + PMD_SIZE,
                   vma->vm_end);
        ptep = vmf->pte - ((addr - start) >> PAGE_SHIFT);

        /* Restore all PTEs' mapping of the large folio */
        for (addr = start; addr != end; ptep++, addr += PAGE_SIZE) {
                pte_t ptent = ptep_get(ptep);
                bool writable = false;

                /* Leave alone anything that is not a protnone PTE of @folio. */
                if (!pte_present(ptent) || !pte_protnone(ptent) ||
                    pfn_folio(pte_pfn(ptent)) != folio)
                        continue;

                if (!ignore_writable) {
                        ptent = pte_modify(ptent, vma->vm_page_prot);
                        writable = pte_write(ptent);
                        if (!writable && pte_write_upgrade &&
                            can_change_pte_writable(vma, addr, ptent))
                                writable = true;
                }

                numa_rebuild_single_mapping(vmf, vma, addr, ptep, writable);
        }
}

/*
 * Handle a fault on a PTE that handle_pte_fault() found to be protnone in
 * an accessible VMA.
 *
 * Under the page table lock the PTE is revalidated against vmf->orig_pte.
 * numa_migrate_check() then picks a target node; if one is chosen and the
 * folio can be prepared for migration, the lock is dropped and the folio is
 * migrated.  Whenever the folio stays where it is (no folio, no target
 * node, or migration failed), the mapping is made accessible again under
 * the lock.
 *
 * Every path returns 0.  task_numa_fault() is called with the node the
 * folio ended up on, unless the function bailed out before a folio was
 * identified or because the PTE changed.
 */
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = NULL;
        int nid = NUMA_NO_NODE;
        bool writable = false, ignore_writable = false;
        bool pte_write_upgrade = vma_wants_manual_pte_write_upgrade(vma);
        int last_cpupid;
        int target_nid;
        pte_t pte, old_pte;
        int flags = 0, nr_pages;

        /*
         * The pte cannot be used safely until we verify, while holding the page
         * table lock, that its contents have not changed during fault handling.
         */
        spin_lock(vmf->ptl);
        /* Read the live PTE from the page tables: */
        old_pte = ptep_get(vmf->pte);

        /* The PTE changed since it was sampled: nothing to do for this fault. */
        if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }

        /* The PTE as it looks with the VMA's page protections applied. */
        pte = pte_modify(old_pte, vma->vm_page_prot);

        /*
         * Detect now whether the PTE could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pte_write(pte);
        if (!writable && pte_write_upgrade &&
            can_change_pte_writable(vma, vmf->address, pte))
                writable = true;

        folio = vm_normal_folio(vma, vmf->address, pte);
        if (!folio || folio_is_zone_device(folio))
                goto out_map;

        /* From here on nid is valid, so the fault is reported on exit. */
        nid = folio_nid(folio);
        nr_pages = folio_nr_pages(folio);

        target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
                                        writable, &last_cpupid);
        if (target_nid == NUMA_NO_NODE)
                goto out_map;
        if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
                flags |= TNF_MIGRATE_FAIL;
                goto out_map;
        }
        /* The folio is isolated and isolation code holds a folio reference. */
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * The PT lock is gone, so the writability determined above can no
         * longer be relied upon if the mapping has to be rebuilt below.
         */
        writable = false;
        ignore_writable = true;

        /* Migrate to the requested node */
        if (!migrate_misplaced_folio(folio, target_nid)) {
                nid = target_nid;
                flags |= TNF_MIGRATED;
                task_numa_fault(last_cpupid, nid, nr_pages, flags);
                return 0;
        }

        /*
         * Migration failed: retake the PT lock and restore the mapping, but
         * only if the PTE is still the one this fault was raised for.
         */
        flags |= TNF_MIGRATE_FAIL;
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                       vmf->address, &vmf->ptl);
        if (unlikely(!vmf->pte))
                return 0;
        if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return 0;
        }
out_map:
        /*
         * Make it present again, depending on how arch implements
         * non-accessible ptes, some can allow access by kernel mode.
         */
        if (folio && folio_test_large(folio))
                numa_rebuild_large_mapping(vmf, vma, folio, pte, ignore_writable,
                                           pte_write_upgrade);
        else
                numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte,
                                            writable);
        pte_unmap_unlock(vmf->pte, vmf->ptl);

        /* nid is still NUMA_NO_NODE when no usable folio was found. */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, nr_pages, flags);
        return 0;
}

/*
 * Try to satisfy the fault at PMD level: anonymous VMAs use the anonymous
 * huge-page path, other VMAs their ->huge_fault() handler if they have one.
 * VM_FAULT_FALLBACK is returned when no such handler exists.
 */
static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;

        if (vma_is_anonymous(vma))
                return do_huge_pmd_anonymous_page(vmf);

        if (!vma->vm_ops->huge_fault)
                return VM_FAULT_FALLBACK;

        return vma->vm_ops->huge_fault(vmf, PMD_ORDER);
}

/*
 * Handle a write-protect (or unshare) fault on a huge PMD.  If the fault is
 * not resolved at PMD level the PMD is split and VM_FAULT_FALLBACK returned.
 *
 * `inline' is required to avoid gcc 4.1.2 build error
 */
static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        vm_fault_t ret;

        if (vma_is_anonymous(vma)) {
                if (unlikely(unshare) ||
                    !userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd))
                        return do_huge_pmd_wp_page(vmf);

                if (!userfaultfd_wp_async(vma))
                        return handle_userfault(vmf, VM_UFFD_WP);

                /* Async uffd-wp: continue below and split the pmd. */
        } else if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
                   vma->vm_ops->huge_fault) {
                ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify handled on pte level: split pmd. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);

        return VM_FAULT_FALLBACK;
}

/*
 * Try to satisfy the fault at PUD level through the VMA's ->huge_fault()
 * handler.  VM_FAULT_FALLBACK is returned when that is not possible.
 */
static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;

        /* No support for anonymous transparent PUD pages yet */
        if (!vma_is_anonymous(vma) && vma->vm_ops->huge_fault)
                return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
        return VM_FAULT_FALLBACK;
}

/*
 * Handle a write fault on a huge PUD.  If the VMA's ->huge_fault() handler
 * does not resolve it, the PUD is split and VM_FAULT_FALLBACK returned.
 */
static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&                        \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret;

        /*
         * No support for anonymous transparent PUD pages yet, so those go
         * straight to the split below.
         */
        if (!vma_is_anonymous(vma) &&
            (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
            vma->vm_ops->huge_fault) {
                ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        }

        /* COW or write-notify not handled on PUD level: split pud.*/
        __split_huge_pud(vma, vmf->pud, vmf->address);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
        return VM_FAULT_FALLBACK;
}

/*
 * Handle a fault at PTE level.
 *
 * The PTE is sampled without the page table lock and the fault is then
 * dispatched on what was seen: no PTE at all goes to do_pte_missing(), a
 * non-present PTE to do_swap_page(), a protnone PTE in an accessible VMA
 * to do_numa_page().  Any other PTE is handled here under the page table
 * lock: write/unshare faults on a read-only PTE go to do_wp_page(),
 * everything else just updates the young/dirty bits.
 *
 * These routines also need to handle stuff like marking pages dirty
 * and/or accessed for architectures that don't do it in hardware (most
 * RISC architectures).  The early dirtying is also good on the i386.
 *
 * There is also a hook called "update_mmu_cache()" that architectures
 * with external mmu caches can use to update those (ie the Sparc or
 * PowerPC hashed page tables that act as extended TLBs).
 *
 * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
 * concurrent faults).
 *
 * The mmap_lock may have been released depending on flags and our return value.
 * See filemap_fault() and __folio_lock_or_retry().
 */
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
        pte_t entry;

        if (unlikely(pmd_none(*vmf->pmd))) {
                /*
                 * Leave __pte_alloc() until later: because vm_ops->fault may
                 * want to allocate huge page, and if we expose page table
                 * for an instant, it will be difficult to retract from
                 * concurrent faults and from rmap lookups.
                 */
                vmf->pte = NULL;
                vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
        } else {
                pmd_t dummy_pmdval;

                /*
                 * A regular pmd is established and it can't morph into a huge
                 * pmd by anon khugepaged, since that takes mmap_lock in write
                 * mode; but shmem or file collapse to THP could still morph
                 * it into a huge pmd: just retry later if so.
                 *
                 * Use the maywrite version to indicate that vmf->pte may be
                 * modified, but since we will use pte_same() to detect the
                 * change of the !pte_none() entry, there is no need to recheck
                 * the pmdval. Here we choose to pass a dummy variable instead
                 * of NULL, which helps new users think about why this place is
                 * special.
                 */
                vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd,
                                                    vmf->address, &dummy_pmdval,
                                                    &vmf->ptl);
                if (unlikely(!vmf->pte))
                        return 0;
                /* Lockless snapshot; it is revalidated under ptl before use. */
                vmf->orig_pte = ptep_get_lockless(vmf->pte);
                vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;

                /* An empty PTE is treated like a missing page table below. */
                if (pte_none(vmf->orig_pte)) {
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
                }
        }

        if (!vmf->pte)
                return do_pte_missing(vmf);

        if (!pte_present(vmf->orig_pte))
                return do_swap_page(vmf);

        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);

        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
        /* The PTE changed after the lockless read: leave it to a later fault. */
        if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
        if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                /* do_wp_page() is entered with ptl held and owns it from here. */
                if (!pte_write(entry))
                        return do_wp_page(vmf);
                else if (likely(vmf->flags & FAULT_FLAG_WRITE))
                        entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
                                vmf->flags & FAULT_FLAG_WRITE)) {
                update_mmu_cache_range(vmf, vmf->vma, vmf->address,
                                vmf->pte, 1);
        } else {
                /* Skip spurious TLB flush for retried page fault */
                if (vmf->flags & FAULT_FLAG_TRIED)
                        goto unlock;
                /*
                 * This is needed only for protection faults but the arch code
                 * is not yet telling us if this is a protection fault or not.
                 * This still avoids useless tlb flushes for .text page faults
                 * with threads.
                 */
                if (vmf->flags & FAULT_FLAG_WRITE)
                        flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
                                                     vmf->pte);
        }
unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
}

/*
 * Walk (allocating where needed) the page tables for @address and resolve
 * the fault at the highest level that can take it: huge PUD, huge PMD, or
 * finally the PTE level via handle_pte_fault().  A VM_FAULT_FALLBACK result
 * from a huge-page handler means "continue at the next lower level".
 *
 * Returns VM_FAULT_OOM if a page-table level cannot be allocated, otherwise
 * the result of whichever handler resolved the fault.
 *
 * On entry, we hold either the VMA lock or the mmap_lock
 * (FAULT_FLAG_VMA_LOCK tells you which).  If VM_FAULT_RETRY is set in
 * the result, the mmap_lock is not held on exit.  See filemap_fault()
 * and __folio_lock_or_retry().
 */
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags)
{
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
                .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
        };
        struct mm_struct *mm = vma->vm_mm;
        unsigned long vm_flags = vma->vm_flags;
        pgd_t *pgd;
        p4d_t *p4d;
        vm_fault_t ret;

        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return VM_FAULT_OOM;

        vmf.pud = pud_alloc(mm, p4d, address);
        if (!vmf.pud)
                return VM_FAULT_OOM;
retry_pud:
        /* Empty PUD and a PUD-sized mapping is allowed here: try to create one. */
        if (pud_none(*vmf.pud) &&
            thp_vma_allowable_order(vma, vm_flags,
                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PUD_ORDER)) {
                ret = create_huge_pud(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                /* Work on a snapshot of the PUD entry. */
                pud_t orig_pud = *vmf.pud;

                barrier();
                if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {

                        /*
                         * TODO once we support anonymous PUDs: NUMA case and
                         * FAULT_FLAG_UNSHARE handling.
                         */
                        if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
                                ret = wp_huge_pud(&vmf, orig_pud);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pud_set_accessed(&vmf, orig_pud);
                                return 0;
                        }
                }
        }

        vmf.pmd = pmd_alloc(mm, vmf.pud, address);
        if (!vmf.pmd)
                return VM_FAULT_OOM;

        /* Huge pud page fault raced with pmd_alloc? */
        if (pud_trans_unstable(vmf.pud))
                goto retry_pud;

        /* Same scheme one level down: create a huge PMD or handle an existing one. */
        if (pmd_none(*vmf.pmd) &&
            thp_vma_allowable_order(vma, vm_flags,
                                TVA_IN_PF | TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                ret = create_huge_pmd(&vmf);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
                vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);

                /*
                 * A swap-type PMD: wait if it is a migration entry, then
                 * return 0 either way.
                 */
                if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(vmf.orig_pmd));
                        if (is_pmd_migration_entry(vmf.orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
                if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
                        if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
                                return do_huge_pmd_numa_page(&vmf);

                        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
                            !pmd_write(vmf.orig_pmd)) {
                                ret = wp_huge_pmd(&vmf);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
                                huge_pmd_set_accessed(&vmf);
                                return 0;
                        }
                }
        }

        /* Not resolved at PUD or PMD level: handle the fault on the PTE. */
        return handle_pte_fault(&vmf);
}

/**
 * mm_account_fault - Do page fault accounting
 * @mm: mm from which memcg should be extracted. It can be NULL.
 * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
 *        of perf event counters, but we'll still do the per-task accounting to
 *        the task who triggered this page fault.
 * @address: the faulted address.
 * @flags: the fault flags.
 * @ret: the fault retcode.
 *
 * This will take care of most of the page fault accounting.  Meanwhile, it
 * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
 * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
 * still be in per-arch page fault handlers at the entry of page fault.
 */
static inline void mm_account_fault(struct mm_struct *mm, struct pt_regs *regs,
                                    unsigned long address, unsigned int flags,
                                    vm_fault_t ret)
{
        /* A fault that is going to be retried is accounted once it completes. */
        if (ret & VM_FAULT_RETRY)
                return;

        /*
         * PGFAULT keeps the behavior of older kernels and counts successful
         * as well as failed faults; the perf counters below only count the
         * successful ones.
         */
        count_vm_event(PGFAULT);
        count_memcg_event_mm(mm, PGFAULT);

        /*
         * Unsuccessful faults (e.g. an invalid address, or
         * arch_vma_access_permitted() failing before we got here) are not
         * accounted any further.  So this is not a "this many hardware page
         * faults" counter; use the hw profiling for that.
         */
        if (ret & VM_FAULT_ERROR)
                return;

        /*
         * A fault is major when the final successful attempt reported
         * VM_FAULT_MAJOR, or when it had to be retried (which implies that
         * it could not be handled immediately the first time).
         *
         * The per-task counters are always updated.  The perf events need
         * @regs, which is NULL when the fault was done for GUP.
         */
        if ((ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED)) {
                current->maj_flt++;
                if (regs)
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
        } else {
                current->min_flt++;
                if (regs)
                        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
        }
}

#ifdef CONFIG_LRU_GEN
/* Flag on the current task whether the fault being handled has recency. */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
        bool has_recency = vma_has_recency(vma);

        /* the LRU algorithm only applies to accesses with recency */
        current->in_lru_fault = has_recency;
}

/* Clear the flag set by lru_gen_enter_fault(). */
static void lru_gen_exit_fault(void)
{
        current->in_lru_fault = false;
}
#else /* !CONFIG_LRU_GEN */
/* Without CONFIG_LRU_GEN both hooks are empty. */
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
}

static void lru_gen_exit_fault(void)
{
}
#endif /* CONFIG_LRU_GEN */

/*
 * Validate the fault flags against the VMA before the fault is handled.
 *
 * Returns 0 when the combination is acceptable (possibly after clearing
 * FAULT_FLAG_UNSHARE in *flags) and VM_FAULT_SIGSEGV, with a one-time
 * warning, when it is not.
 */
static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma,
                                       unsigned int *flags)
{
        const unsigned int requested = *flags;

        if (unlikely(requested & FAULT_FLAG_UNSHARE)) {
                /* Unshare and write must not be requested together. */
                if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE))
                        return VM_FAULT_SIGSEGV;
                /*
                 * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's
                 * just treat it like an ordinary read-fault otherwise.
                 */
                if (!is_cow_mapping(vma->vm_flags))
                        *flags = requested & ~FAULT_FLAG_UNSHARE;
        } else if (requested & FAULT_FLAG_WRITE) {
                /* Write faults on read-only mappings are impossible ... */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE)))
                        return VM_FAULT_SIGSEGV;
                /* ... and FOLL_FORCE only applies to COW mappings. */
                if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) &&
                                 !is_cow_mapping(vma->vm_flags)))
                        return VM_FAULT_SIGSEGV;
        }
#ifdef CONFIG_PER_VMA_LOCK
        /*
         * FAULT_FLAG_RETRY_NOWAIT cannot be combined with per-VMA locks,
         * which assume the lock is dropped on VM_FAULT_RETRY.
         */
        if (WARN_ON_ONCE((*flags &
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) ==
                        (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)))
                return VM_FAULT_SIGSEGV;
#endif

        return 0;
}

/*
 * Entry point for handling a page fault on @vma at @address.
 *
 * By the time we get here, we already hold either the VMA lock or the
 * mmap_lock (FAULT_FLAG_VMA_LOCK tells you which).
 *
 * The mmap_lock may have been released depending on flags and our
 * return value.  See filemap_fault() and __folio_lock_or_retry().
 */
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                           unsigned int flags, struct pt_regs *regs)
{
        /* Grab the mm up front: once the mmap_lock is dropped, vma may be freed */
        struct mm_struct *mm = vma->vm_mm;
        bool is_droppable, user_fault;
        vm_fault_t ret;

        __set_current_state(TASK_RUNNING);

        ret = sanitize_fault_flags(vma, &flags);
        if (ret)
                goto out;

        if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
                                            flags & FAULT_FLAG_INSTRUCTION,
                                            flags & FAULT_FLAG_REMOTE)) {
                ret = VM_FAULT_SIGSEGV;
                goto out;
        }

        /* Everything needed from the vma later on is sampled before the fault. */
        is_droppable = !!(vma->vm_flags & VM_DROPPABLE);
        user_fault = flags & FAULT_FLAG_USER;

        /*
         * Faults triggered in user space get memcg OOM handling.  Kernel
         * faults are handled more gracefully.
         */
        if (user_fault)
                mem_cgroup_enter_user_fault();

        lru_gen_enter_fault(vma);

        ret = unlikely(is_vm_hugetlb_page(vma)) ?
                        hugetlb_fault(mm, vma, address, flags) :
                        __handle_mm_fault(vma, address, flags);

        /*
         * Warning: It is no longer safe to dereference vma-> after this point,
         * because mmap_lock might have been dropped by __handle_mm_fault(), so
         * vma might be destroyed from underneath us.
         */

        lru_gen_exit_fault();

        /* OOM errors are not fatal for a droppable mapping. */
        if (is_droppable)
                ret &= ~VM_FAULT_OOM;

        if (user_fault) {
                mem_cgroup_exit_user_fault();
                /*
                 * The task may have entered a memcg OOM situation but
                 * if the allocation error was handled gracefully (no
                 * VM_FAULT_OOM), there is no need to kill anything.
                 * Just clean up the OOM state peacefully.
                 */
                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
                        mem_cgroup_oom_synchronize(false);
        }
out:
        mm_account_fault(mm, regs, address, flags, ret);

        return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);

#ifdef CONFIG_LOCK_MM_AND_FIND_VMA
#include <linux/extable.h>

/*
 * Take the mmap lock for reading.  Returns true with the lock held, false
 * without it.
 */
static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        /* Uncontended case: no need to look at where the fault came from. */
        if (likely(mmap_read_trylock(mm)))
                return true;

        /*
         * For a fault that did not come from user mode, only go on to block
         * on the lock when the faulting instruction has an exception table
         * entry.
         */
        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        return !mmap_read_lock_killable(mm);
}

/*
 * Placeholder for an atomic read-to-write upgrade of the mmap lock.
 *
 * We don't have this operation yet.
 *
 * It should be easy enough to do: it's basically a
 *    atomic_long_try_cmpxchg_acquire()
 * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
 * it also needs the proper lockdep magic etc.
 *
 * Until then the upgrade always reports failure.
 */
static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
{
        return false;
}

/*
 * Swap a held mmap read lock for the write lock by dropping and retaking
 * it.  Returns true with the write lock held; on false no lock is held.
 */
static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
{
        mmap_read_unlock(mm);

        /*
         * For a fault that did not come from user mode, only block on the
         * write lock when the faulting instruction has an exception table
         * entry.
         */
        if (regs && !user_mode(regs) &&
            !search_exception_tables(exception_ip(regs)))
                return false;

        return !mmap_write_lock_killable(mm);
}

/*
 * Helper for page fault handling.
 *
 * This is kind of equivalent to "mmap_read_lock()" followed
 * by "find_extend_vma()", except it's a lot more careful about
 * the locking (and will drop the lock on failure).
 *
 * For example, if we have a kernel bug that causes a page
 * fault, we don't want to just use mmap_read_lock() to get
 * the mm lock, because that would deadlock if the bug were
 * to happen while we're holding the mm lock for writing.
 *
 * So this checks the exception tables on kernel faults in
 * order to only do this all for instructions that are actually
 * expected to fault.
 *
 * We can also actually take the mm lock for writing if we
 * need to extend the vma, which helps the VM layer a lot.
 *
 * Returns the vma covering @addr with the mmap lock held for reading
 * (a write lock taken for stack expansion is downgraded before return),
 * or NULL with no mmap lock held.
 */
struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                        unsigned long addr, struct pt_regs *regs)
{
        struct vm_area_struct *vma;

        if (!get_mmap_lock_carefully(mm, regs))
                return NULL;

        /* Common case: a vma already covers the address. */
        vma = find_vma(mm, addr);
        if (likely(vma && (vma->vm_start <= addr)))
                return vma;

        /*
         * Well, dang. We might still be successful, but only
         * if we can extend a vma to do so.
         */
        if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
                mmap_read_unlock(mm);
                return NULL;
        }

        /*
         * We can try to upgrade the mmap lock atomically,
         * in which case we can continue to use the vma
         * we already looked up.
         *
         * Otherwise we'll have to drop the mmap lock and
         * re-take it, and also look up the vma again,
         * re-checking it.
         */
        if (!mmap_upgrade_trylock(mm)) {
                /* On failure here the read lock is already gone: no unlock needed. */
                if (!upgrade_mmap_lock_carefully(mm, regs))
                        return NULL;

                /* The lock was dropped in between, so the old lookup is stale. */
                vma = find_vma(mm, addr);
                if (!vma)
                        goto fail;
                if (vma->vm_start <= addr)
                        goto success;
                if (!(vma->vm_flags & VM_GROWSDOWN))
                        goto fail;
        }

        /* The write lock is held from here on. */
        if (expand_stack_locked(vma, addr))
                goto fail;

success:
        mmap_write_downgrade(mm);
        return vma;

fail:
        mmap_write_unlock(mm);
        return NULL;
}
#endif

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Announce a writer on @vma and wait for the reference count to settle.
 *
 * VMA_LOCK_OFFSET is added to vma->vm_refcnt unless the count is zero, in
 * which case nothing is changed and false is returned.  Otherwise the
 * caller waits (TASK_UNINTERRUPTIBLE) on vma->vm_mm->vma_writer_wait until
 * vm_refcnt equals the target: VMA_LOCK_OFFSET when @detaching is true,
 * VMA_LOCK_OFFSET + 1 when it is false.
 *
 * Returns true once the target count has been observed.
 */
static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching)
{
        unsigned int tgt_refcnt = VMA_LOCK_OFFSET;

        /* Additional refcnt if the vma is attached. */
        if (!detaching)
                tgt_refcnt++;

        /*
         * If vma is detached then only vma_mark_attached() can raise the
         * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
         */
        if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt))
                return false;

        /* Lockdep annotation brackets the wait for the count to drain. */
        rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_);
        rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
                   refcount_read(&vma->vm_refcnt) == tgt_refcnt,
                   TASK_UNINTERRUPTIBLE);
        lock_acquired(&vma->vmlock_dep_map, _RET_IP_);

        return true;
}

/*
 * Counterpart of __vma_enter_locked(): subtract VMA_LOCK_OFFSET from
 * vma->vm_refcnt and drop the lockdep annotation.
 *
 * *@detached receives the result of refcount_sub_and_test(), i.e. whether
 * the subtraction brought the count down to zero.
 */
static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached)
{
        *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt);
        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);
}

/*
 * Mark @vma as write-locked by storing @mm_lock_seq in vma->vm_lock_seq.
 *
 * If the vma is attached, the store is done while the vma is held via
 * __vma_enter_locked(), i.e. after the refcount shows no readers.  The
 * sequence number is stored even when __vma_enter_locked() returns false.
 */
void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq)
{
        bool locked;

        /*
         * __vma_enter_locked() returns false immediately if the vma is not
         * attached, otherwise it waits until refcnt is indicating that vma
         * is attached with no readers.
         */
        locked = __vma_enter_locked(vma, false);

        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);

        /* Only undo the refcount bias if it was actually applied above. */
        if (locked) {
                bool detached;

                __vma_exit_locked(vma, &detached);
                WARN_ON_ONCE(detached); /* vma should remain attached */
        }
}
EXPORT_SYMBOL_GPL(__vma_start_write);

/*
 * Drop the reference that marks @vma as attached.
 *
 * The vma must be write-locked and attached (both are asserted).  If the
 * decrement does not bring vm_refcnt to zero, wait via
 * __vma_enter_locked(vma, true) until it can, and warn if the vma still
 * is not reported as detached afterwards.
 */
void vma_mark_detached(struct vm_area_struct *vma)
{
        vma_assert_write_locked(vma);
        vma_assert_attached(vma);

        /*
         * We are the only writer, so no need to use vma_refcount_put().
         * The condition below is unlikely because the vma has been already
         * write-locked and readers can increment vm_refcnt only temporarily
         * before they check vm_lock_seq, realize the vma is locked and drop
         * back the vm_refcnt. That is a narrow window for observing a raised
         * vm_refcnt.
         */
        if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) {
                /* Wait until vma is detached with no readers. */
                if (__vma_enter_locked(vma, true)) {
                        bool detached;

                        __vma_exit_locked(vma, &detached);
                        WARN_ON_ONCE(!detached);
                }
        }
}

/*
 * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
 * stable and not isolated. If the VMA is not found or is being modified the
 * function returns NULL.
 */
struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address)
{
        /* Maple tree cursor over mm->mm_mt, positioned at @address. */
        MA_STATE(mas, &mm->mm_mt, address, address);
        struct vm_area_struct *vma;

        rcu_read_lock();
retry:
        vma = mas_walk(&mas);
        if (!vma)
                goto inval;

        /* vma_start_read() yields the locked vma, NULL, or an ERR_PTR. */
        vma = vma_start_read(mm, vma);
        if (IS_ERR_OR_NULL(vma)) {
                /* Check if the VMA got isolated after we found it */
                if (PTR_ERR(vma) == -EAGAIN) {
                        count_vm_vma_lock_event(VMA_LOCK_MISS);
                        /* The area was replaced with another one */
                        goto retry;
                }

                /* Failed to lock the VMA */
                goto inval;
        }
        /*
         * At this point, we have a stable reference to a VMA: The VMA is
         * locked and we know it hasn't already been isolated.
         * From here on, we can access the VMA without worrying about which
         * fields are accessible for RCU readers.
         */

        /* Check if the vma we locked is the right one. */
        if (unlikely(vma->vm_mm != mm ||
                     address < vma->vm_start || address >= vma->vm_end))
                goto inval_end_read;

        /* Success: the vma stays read-locked after leaving the RCU section. */
        rcu_read_unlock();
        return vma;

inval_end_read:
        /* Wrong vma: release the read lock taken above before bailing out. */
        vma_end_read(vma);
inval:
        rcu_read_unlock();
        count_vm_vma_lock_event(VMA_LOCK_ABORT);
        return NULL;
}
#endif /* CONFIG_PER_VMA_LOCK */

#ifndef __PAGETABLE_P4D_FOLDED
/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
        p4d_t *p4d_table;

        p4d_table = p4d_alloc_one(mm, address);
        if (p4d_table == NULL)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (!pgd_present(*pgd)) {
                smp_wmb(); /* See comment in pmd_install() */
                pgd_populate(mm, pgd, p4d_table);
        } else {
                /* Somebody else populated the pgd first; drop our table. */
                p4d_free(mm, p4d_table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_P4D_FOLDED */

#ifndef __PAGETABLE_PUD_FOLDED
/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
        pud_t *pud_table;

        pud_table = pud_alloc_one(mm, address);
        if (pud_table == NULL)
                return -ENOMEM;

        spin_lock(&mm->page_table_lock);
        if (p4d_present(*p4d)) {
                /* Somebody else populated the p4d first; drop our table. */
                pud_free(mm, pud_table);
        } else {
                mm_inc_nr_puds(mm);
                smp_wmb(); /* See comment in pmd_install() */
                p4d_populate(mm, p4d, pud_table);
        }
        spin_unlock(&mm->page_table_lock);

        return 0;
}
#endif /* __PAGETABLE_PUD_FOLDED */

#ifndef __PAGETABLE_PMD_FOLDED
/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        pmd_t *pmd_table;
        spinlock_t *pud_ptl;

        pmd_table = pmd_alloc_one(mm, address);
        if (pmd_table == NULL)
                return -ENOMEM;

        pud_ptl = pud_lock(mm, pud);
        if (pud_present(*pud)) {
                /* Somebody else populated the pud first; drop our table. */
                pmd_free(mm, pmd_table);
        } else {
                mm_inc_nr_pmds(mm);
                smp_wmb(); /* See comment in pmd_install() */
                pud_populate(mm, pud, pmd_table);
        }
        spin_unlock(pud_ptl);

        return 0;
}
#endif /* __PAGETABLE_PMD_FOLDED */

/*
 * Fill in the output fields of @args for a successful pfnmap lookup.
 *
 * The reported pfn is @pfn_base plus the page index of args->address
 * inside the mapping described by @addr_mask.
 */
static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
                                     spinlock_t *lock, pte_t *ptep,
                                     pgprot_t pgprot, unsigned long pfn_base,
                                     unsigned long addr_mask, bool writable,
                                     bool special)
{
        /* Offset of the target address within the (possibly huge) mapping. */
        unsigned long page_idx = (args->address & ~addr_mask) >> PAGE_SHIFT;

        args->lock = lock;
        args->ptep = ptep;
        args->pfn = pfn_base + page_idx;
        args->addr_mask = addr_mask;
        args->pgprot = pgprot;
        args->writable = writable;
        args->special = special;
}

/*
 * Lockdep check for pfnmap walkers: the mmap lock must be held, or, for a
 * file-backed vma with a mapping, alternatively that mapping's
 * i_mmap_rwsem.  Compiles to nothing without CONFIG_LOCKDEP.
 */
static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
{
#ifdef CONFIG_LOCKDEP
        struct address_space *mapping = NULL;

        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;

        if (!mapping) {
                lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
                return;
        }

        lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) ||
                       lockdep_is_held(&vma->vm_mm->mmap_lock));
#endif
}

/**
 * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
 * @args: Pointer to struct @follow_pfnmap_args
 *
 * The caller needs to setup args->vma and args->address to point to the
 * virtual address as the target of such lookup.  On a successful return,
 * the results will be put into other output fields.
 *
 * After the caller has finished using the fields, the caller must invoke
 * follow_pfnmap_end() to properly release the locks and resources
 * of such a lookup request.
 *
 * During the start() and end() calls, the results in @args will be valid
 * as proper locks will be held.  After the end() is called, all the fields
 * in @follow_pfnmap_args will be invalid to be further accessed.  Further
 * use of such information after end() may require proper synchronizations
 * by the caller with page table updates, otherwise it can create a
 * security bug.
 *
 * If the PTE maps a refcounted page, callers are responsible to protect
 * against invalidation with MMU notifiers; otherwise access to the PFN at
 * a later point in time can trigger use-after-free.
 *
 * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
 * should be taken for read, and the mmap semaphore cannot be released
 * before the end() is invoked.
 *
 * This function must not be used to modify PTE content.
 *
 * Return: zero on success, negative otherwise.
 */
int follow_pfnmap_start(struct follow_pfnmap_args *args)
{
        struct vm_area_struct *vma = args->vma;
        unsigned long address = args->address;
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *lock;
        pgd_t *pgdp;
        p4d_t *p4dp, p4d;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;

        pfnmap_lockdep_assert(vma);

        /* The address must lie inside the vma the caller handed us. */
        if (unlikely(address < vma->vm_start || address >= vma->vm_end))
                goto out;

        /* Only IO mappings and raw PFN mappings are allowed. */
        if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
                goto out;
retry:
        pgdp = pgd_offset(mm, address);
        if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
                goto out;

        p4dp = p4d_offset(pgdp, address);
        p4d = READ_ONCE(*p4dp);
        if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
                goto out;

        pudp = pud_offset(p4dp, address);
        pud = READ_ONCE(*pudp);
        if (pud_none(pud))
                goto out;
        if (pud_leaf(pud)) {
                lock = pud_lock(mm, pudp);
                /*
                 * The value read above was a lockless snapshot.  Re-read
                 * the entry now that the lock is held so that both the
                 * leaf check and the values reported to the caller reflect
                 * what is in the page table while the lock is held.
                 */
                pud = READ_ONCE(*pudp);
                if (unlikely(!pud_leaf(pud))) {
                        spin_unlock(lock);
                        goto retry;
                }
                /* On success the lock stays held until follow_pfnmap_end(). */
                pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
                                  pud_pfn(pud), PUD_MASK, pud_write(pud),
                                  pud_special(pud));
                return 0;
        }

        pmdp = pmd_offset(pudp, address);
        pmd = pmdp_get_lockless(pmdp);
        if (pmd_leaf(pmd)) {
                lock = pmd_lock(mm, pmdp);
                /* Same as for the pud: validate a fresh copy under the lock. */
                pmd = pmdp_get_lockless(pmdp);
                if (unlikely(!pmd_leaf(pmd))) {
                        spin_unlock(lock);
                        goto retry;
                }
                pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
                                  pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
                                  pmd_special(pmd));
                return 0;
        }

        /* PTE level: the entry is read after the pte lock has been taken. */
        ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
        if (!ptep)
                goto out;
        pte = ptep_get(ptep);
        if (!pte_present(pte))
                goto unlock;
        pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
                          pte_pfn(pte), PAGE_MASK, pte_write(pte),
                          pte_special(pte));
        return 0;
unlock:
        pte_unmap_unlock(ptep, lock);
out:
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(follow_pfnmap_start);

/**
 * follow_pfnmap_end(): End a follow_pfnmap_start() process
 * @args: Pointer to struct @follow_pfnmap_args
 *
 * Must be used in pair of follow_pfnmap_start().  See the start() function
 * above for more information.
 */
void follow_pfnmap_end(struct follow_pfnmap_args *args)
{
        if (args->lock)
                spin_unlock(args->lock);
        if (args->ptep)
                pte_unmap(args->ptep);
}
EXPORT_SYMBOL_GPL(follow_pfnmap_end);

#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
 * generic_access_phys - generic implementation for iomem mmap access
 * @vma: the vma to access
 * @addr: userspace address, not relative offset within @vma
 * @buf: buffer to read/write
 * @len: length of transfer
 * @write: set to FOLL_WRITE when writing, otherwise reading
 *
 * This is a generic implementation for &vm_operations_struct.access for an
 * iomem mapping. This callback is used by access_process_vm() when the @vma is
 * not page based.
 */
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write)
{
        resource_size_t phys_addr;
        pgprot_t prot = __pgprot(0);
        void __iomem *maddr;
        int offset = offset_in_page(addr);
        int ret = -EINVAL;
        bool writable;
        struct follow_pfnmap_args args = { .vma = vma, .address = addr };

retry:
        if (follow_pfnmap_start(&args))
                return -EINVAL;
        prot = args.pgprot;
        phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
        writable = args.writable;
        follow_pfnmap_end(&args);

        if ((write & FOLL_WRITE) && !writable)
                return -EINVAL;

        maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
        if (!maddr)
                return -ENOMEM;

        if (follow_pfnmap_start(&args))
                goto out_unmap;

        if ((pgprot_val(prot) != pgprot_val(args.pgprot)) ||
            (phys_addr != (args.pfn << PAGE_SHIFT)) ||
            (writable != args.writable)) {
                follow_pfnmap_end(&args);
                iounmap(maddr);
                goto retry;
        }

        if (write)
                memcpy_toio(maddr + offset, buf, len);
        else
                memcpy_fromio(buf, maddr + offset, len);
        ret = len;
        follow_pfnmap_end(&args);
out_unmap:
        iounmap(maddr);

        return ret;
}
EXPORT_SYMBOL_GPL(generic_access_phys);
#endif

/*
 * Access another process' address space as given in mm.
 *
 * Copies up to @len bytes between @buf and the address range starting at
 * @addr in @mm; the direction is a write into @mm when FOLL_WRITE is set in
 * @gup_flags, a read otherwise.  The mmap lock is taken for reading here.
 *
 * Returns the number of bytes transferred, which may be less than @len
 * (including 0) when the lock cannot be taken or a page cannot be accessed.
 */
static int __access_remote_vm(struct mm_struct *mm, unsigned long addr,
                              void *buf, int len, unsigned int gup_flags)
{
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;

        if (mmap_read_lock_killable(mm))
                return 0;

        /* Untag the address before looking up the VMA */
        addr = untagged_addr_remote(mm, addr);

        /* Avoid triggering the temporary warning in __get_user_pages */
        if (!vma_lookup(mm, addr) && !expand_stack(mm, addr))
                return 0;

        /* ignore errors, just check how much was successfully transferred */
        while (len) {
                int bytes, offset;
                void *maddr;
                struct vm_area_struct *vma = NULL;
                struct page *page = get_user_page_vma_remote(mm, addr,
                                                             gup_flags, &vma);

                if (IS_ERR(page)) {
                        /* We might need to expand the stack to access it */
                        vma = vma_lookup(mm, addr);
                        if (!vma) {
                                vma = expand_stack(mm, addr);

                                /* mmap_lock was dropped on failure */
                                if (!vma)
                                        return buf - old_buf;

                                /* Try again if stack expansion worked */
                                continue;
                        }

                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
                        bytes = 0;
#ifdef CONFIG_HAVE_IOREMAP_PROT
                        if (vma->vm_ops && vma->vm_ops->access)
                                bytes = vma->vm_ops->access(vma, addr, buf,
                                                            len, write);
#endif
                        /* No ->access handler, or it made no progress: stop. */
                        if (bytes <= 0)
                                break;
                } else {
                        /* Clamp this step to the end of the current page. */
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);
                        if (bytes > PAGE_SIZE-offset)
                                bytes = PAGE_SIZE-offset;

                        maddr = kmap_local_page(page);
                        if (write) {
                                copy_to_user_page(vma, page, addr,
                                                  maddr + offset, buf, bytes);
                                set_page_dirty_lock(page);
                        } else {
                                copy_from_user_page(vma, page, addr,
                                                    buf, maddr + offset, bytes);
                        }
                        unmap_and_put_page(page, maddr);
                }
                len -= bytes;
                buf += bytes;
                addr += bytes;
        }
        mmap_read_unlock(mm);

        return buf - old_buf;
}

/**
 * access_remote_vm - access another process' address space
 * @mm:                the mm_struct of the target address space
 * @addr:        start address to access
 * @buf:        source or destination buffer
 * @len:        number of bytes to transfer
 * @gup_flags:        flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from source to destination.
 */
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        int copied;

        /* Thin public wrapper around the static worker. */
        copied = __access_remote_vm(mm, addr, buf, len, gup_flags);

        return copied;
}

/*
 * Access another process' address space.
 * Source/target buffer must be kernel space,
 * Do not walk the page table directly, use get_user_pages
 */
int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm = get_task_mm(tsk);
        int copied = 0;

        /* No mm to access: report that nothing was transferred. */
        if (mm) {
                copied = __access_remote_vm(mm, addr, buf, len, gup_flags);
                mmput(mm);
        }

        return copied;
}
EXPORT_SYMBOL_GPL(access_process_vm);

#ifdef CONFIG_BPF_SYSCALL
/*
 * Copy a string from another process's address space as given in mm.
 * If there is any error return -EFAULT.
 *
 * The copy proceeds page by page with the mmap lock held for reading.
 * On success the return value is the distance @buf was advanced
 * (buf - old_buf).  @buf is written with a leading NUL before anything
 * else, so the caller must pass @len >= 1.
 */
static int __copy_remote_vm_str(struct mm_struct *mm, unsigned long addr,
                                void *buf, int len, unsigned int gup_flags)
{
        void *old_buf = buf;
        int err = 0;

        /* Start out with an empty string in case of an early failure. */
        *(char *)buf = '\0';

        if (mmap_read_lock_killable(mm))
                return -EFAULT;

        addr = untagged_addr_remote(mm, addr);

        /* Avoid triggering the temporary warning in __get_user_pages */
        if (!vma_lookup(mm, addr)) {
                err = -EFAULT;
                goto out;
        }

        while (len) {
                int bytes, offset, retval;
                void *maddr;
                struct page *page;
                struct vm_area_struct *vma = NULL;

                page = get_user_page_vma_remote(mm, addr, gup_flags, &vma);
                if (IS_ERR(page)) {
                        /*
                         * Treat as a total failure for now until we decide how
                         * to handle the CONFIG_HAVE_IOREMAP_PROT case and
                         * stack expansion.
                         */
                        *(char *)buf = '\0';
                        err = -EFAULT;
                        goto out;
                }

                /* Clamp this step to the end of the current page. */
                bytes = len;
                offset = addr & (PAGE_SIZE - 1);
                if (bytes > PAGE_SIZE - offset)
                        bytes = PAGE_SIZE - offset;

                maddr = kmap_local_page(page);
                retval = strscpy(buf, maddr + offset, bytes);
                if (retval >= 0) {
                        /* Found the end of the string */
                        buf += retval;
                        unmap_and_put_page(page, maddr);
                        break;
                }

                /* Negative return: step past all but the final byte of the chunk. */
                buf += bytes - 1;
                /*
                 * Because strscpy always NUL terminates we need to
                 * copy the last byte in the page if we are going to
                 * load more pages
                 */
                if (bytes != len) {
                        addr += bytes - 1;
                        copy_from_user_page(vma, page, addr, buf, maddr + (PAGE_SIZE - 1), 1);
                        buf += 1;
                        addr += 1;
                }
                len -= bytes;

                unmap_and_put_page(page, maddr);
        }

out:
        mmap_read_unlock(mm);
        if (err)
                return err;
        return buf - old_buf;
}

/**
 * copy_remote_vm_str - copy a string from another process's address space.
 * @tsk:        the task of the target address space
 * @addr:        start address to read from
 * @buf:        destination buffer
 * @len:        number of bytes to copy
 * @gup_flags:        flags modifying lookup behaviour
 *
 * The caller must hold a reference on @mm.
 *
 * Return: number of bytes copied from @addr (source) to @buf (destination);
 * not including the trailing NUL. Always guaranteed to leave NUL-terminated
 * buffer. On any error, return -EFAULT.
 */
int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
                       void *buf, int len, unsigned int gup_flags)
{
        struct mm_struct *mm;
        int ret;

        if (unlikely(len == 0))
                return 0;

        mm = get_task_mm(tsk);
        if (!mm) {
                *(char *)buf = '\0';
                return -EFAULT;
        }

        ret = __copy_remote_vm_str(mm, addr, buf, len, gup_flags);

        mmput(mm);

        return ret;
}
EXPORT_SYMBOL_GPL(copy_remote_vm_str);
#endif /* CONFIG_BPF_SYSCALL */

/*
 * Print the name of a VMA.
 */
void print_vma_addr(char *prefix, unsigned long ip)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start, size, file_off;

        /* We might be in atomic context, so never sleep on the lock. */
        if (!mmap_read_trylock(mm))
                return;

        vma = vma_lookup(mm, ip);
        if (!vma || !vma->vm_file)
                goto unlock;

        /* Translate ip into an offset within the backing file. */
        start = vma->vm_start;
        size = vma->vm_end - start;
        file_off = ip - start + (vma->vm_pgoff << PAGE_SHIFT);

        printk("%s%pD[%lx,%lx+%lx]", prefix, vma->vm_file, file_off,
                        start, size);
unlock:
        mmap_read_unlock(mm);
}

#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
void __might_fault(const char *file, int line)
{
        if (pagefault_disabled())
                return;
        __might_sleep(file, line);
        if (current->mm)
                might_lock_read(&current->mm->mmap_lock);
}
EXPORT_SYMBOL(__might_fault);
#endif

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
/*
 * Process all subpages of the specified huge page with the specified
 * operation.  The target subpage will be processed last to keep its
 * cache lines hot.
 */
/*
 * @addr_hint:       address inside the huge page that will be touched
 * @nr_pages:        number of subpages in the huge page
 * @process_subpage: callback invoked once per subpage with (address, index, @arg)
 * @arg:             opaque pointer passed through to @process_subpage
 *
 * Returns 0 if every callback returned 0, otherwise the first non-zero
 * callback return value (processing stops there).
 */
static inline int process_huge_page(
        unsigned long addr_hint, unsigned int nr_pages,
        int (*process_subpage)(unsigned long addr, int idx, void *arg),
        void *arg)
{
        int i, n, base, l, ret;
        /* Round the hint down to the start of the huge page. */
        unsigned long addr = addr_hint &
                ~(((unsigned long)nr_pages << PAGE_SHIFT) - 1);

        /* Process target subpage last to keep its cache lines hot */
        might_sleep();
        /* n is the index of the target subpage within the huge page. */
        n = (addr_hint - addr) / PAGE_SIZE;
        if (2 * n <= nr_pages) {
                /* If target subpage in first half of huge page */
                base = 0;
                l = n;
                /* Process subpages at the end of huge page */
                for (i = nr_pages - 1; i >= 2 * n; i--) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        } else {
                /* If target subpage in second half of huge page */
                base = nr_pages - 2 * (nr_pages - n);
                l = nr_pages - n;
                /* Process subpages at the begin of huge page */
                for (i = 0; i < base; i++) {
                        cond_resched();
                        ret = process_subpage(addr + i * PAGE_SIZE, i, arg);
                        if (ret)
                                return ret;
                }
        }
        /*
         * Process remaining subpages in left-right-left-right pattern
         * towards the target subpage
         */
        for (i = 0; i < l; i++) {
                /* The remaining window is [base, base + 2 * l). */
                int left_idx = base + i;
                int right_idx = base + 2 * l - 1 - i;

                cond_resched();
                ret = process_subpage(addr + left_idx * PAGE_SIZE, left_idx, arg);
                if (ret)
                        return ret;
                cond_resched();
                ret = process_subpage(addr + right_idx * PAGE_SIZE, right_idx, arg);
                if (ret)
                        return ret;
        }
        return 0;
}

/*
 * Zero every subpage of @folio in ascending order, rescheduling between
 * pages.  User addresses are derived from @addr_hint aligned down to the
 * folio size.
 */
static void clear_gigantic_page(struct folio *folio, unsigned long addr_hint,
                                unsigned int nr_pages)
{
        unsigned long base = ALIGN_DOWN(addr_hint, folio_size(folio));
        int idx = 0;

        might_sleep();
        while (idx < nr_pages) {
                cond_resched();
                clear_user_highpage(folio_page(folio, idx), base + idx * PAGE_SIZE);
                idx++;
        }
}

/* process_huge_page() callback: zero subpage @idx of the folio passed in @arg. */
static int clear_subpage(unsigned long addr, int idx, void *arg)
{
        struct folio *target = arg;
        struct page *subpage = folio_page(target, idx);

        clear_user_highpage(subpage, addr);

        return 0;
}

/**
 * folio_zero_user - Zero a folio which will be mapped to userspace.
 * @folio: The folio to zero.
 * @addr_hint: The address that will be accessed, or the base address if
 *             unclear.
 */
void folio_zero_user(struct folio *folio, unsigned long addr_hint)
{
        unsigned int nr_pages = folio_nr_pages(folio);

        if (likely(nr_pages <= MAX_ORDER_NR_PAGES)) {
                process_huge_page(addr_hint, nr_pages, clear_subpage, folio);
                return;
        }

        /* Folios above MAX_ORDER_NR_PAGES pages are cleared linearly. */
        clear_gigantic_page(folio, addr_hint, nr_pages);
}

/*
 * Copy @nr_pages subpages from @src to @dst in ascending order.
 *
 * Returns 0 on success, or -EHWPOISON as soon as copy_mc_user_highpage()
 * reports a failure for a subpage.
 */
static int copy_user_gigantic_page(struct folio *dst, struct folio *src,
                                   unsigned long addr_hint,
                                   struct vm_area_struct *vma,
                                   unsigned int nr_pages)
{
        unsigned long base = ALIGN_DOWN(addr_hint, folio_size(dst));
        int idx;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page *to = folio_page(dst, idx);
                struct page *from = folio_page(src, idx);

                cond_resched();
                if (copy_mc_user_highpage(to, from, base + idx * PAGE_SIZE, vma))
                        return -EHWPOISON;
        }

        return 0;
}

/* Context handed to copy_subpage() through process_huge_page()'s @arg. */
struct copy_subpage_arg {
        struct folio *dst;              /* folio being written */
        struct folio *src;              /* folio being read */
        struct vm_area_struct *vma;     /* passed on to copy_mc_user_highpage() */
};

/*
 * process_huge_page() callback: copy subpage @idx from the source folio to
 * the destination folio described by the struct copy_subpage_arg in @arg.
 * Returns 0, or -EHWPOISON if the copy reports a failure.
 */
static int copy_subpage(unsigned long addr, int idx, void *arg)
{
        struct copy_subpage_arg *ctx = arg;
        struct page *to = folio_page(ctx->dst, idx);
        struct page *from = folio_page(ctx->src, idx);
        int failed;

        failed = copy_mc_user_highpage(to, from, addr, ctx->vma);

        return failed ? -EHWPOISON : 0;
}

int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint, struct vm_area_struct *vma)
{
        unsigned int nr_pages = folio_nr_pages(dst);
        struct copy_subpage_arg arg = {
                .dst = dst,
                .src = src,
                .vma = vma,
        };

        if (unlikely(nr_pages > MAX_ORDER_NR_PAGES))
                return copy_user_gigantic_page(dst, src, addr_hint, vma, nr_pages);

        return process_huge_page(addr_hint, nr_pages, copy_subpage, &arg);
}

/*
 * Fill @dst_folio from the user buffer @usr_src, one page at a time.
 *
 * When @allow_pagefault is false each copy runs with page faults disabled.
 * Copying stops at the first page that is not copied completely.
 *
 * Returns the number of bytes that were NOT copied (0 on full success).
 */
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault)
{
        unsigned int nr_pages = folio_nr_pages(dst_folio);
        unsigned long remaining = nr_pages * PAGE_SIZE;
        unsigned long idx;

        for (idx = 0; idx < nr_pages; idx++) {
                struct page *subpage = folio_page(dst_folio, idx);
                void *kaddr = kmap_local_page(subpage);
                unsigned long uncopied;

                if (!allow_pagefault)
                        pagefault_disable();
                uncopied = copy_from_user(kaddr, usr_src + idx * PAGE_SIZE, PAGE_SIZE);
                if (!allow_pagefault)
                        pagefault_enable();
                kunmap_local(kaddr);

                /* Account for whatever part of this page did get copied. */
                remaining -= PAGE_SIZE - uncopied;
                if (uncopied)
                        break;

                flush_dcache_page(subpage);

                cond_resched();
        }

        return remaining;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS

/* Slab cache backing the separately allocated page table spinlocks. */
static struct kmem_cache *page_ptl_cachep;

/*
 * Create the "page->ptl" cache of spinlock_t-sized objects.  SLAB_PANIC is
 * passed, so the result is not checked here.
 */
void __init ptlock_cache_init(void)
{
        page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0,
                        SLAB_PANIC, NULL);
}

bool ptlock_alloc(struct ptdesc *ptdesc)
{
        spinlock_t *ptl;

        ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL);
        if (!ptl)
                return false;
        ptdesc->ptl = ptl;
        return true;
}

void ptlock_free(struct ptdesc *ptdesc)
{
        if (ptdesc->ptl)
                kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
}
#endif

/* Take the hugetlb vma lock for reading if @vma is a hugetlb vma. */
void vma_pgtable_walk_begin(struct vm_area_struct *vma)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        hugetlb_vma_lock_read(vma);
}

/*
 * Called after walking the page tables of @vma: drops the hugetlb VMA read
 * lock when @vma is a hugetlb mapping, and is a no-op otherwise.
 */
void vma_pgtable_walk_end(struct vm_area_struct *vma)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        hugetlb_vma_unlock_read(vma);
}











































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2012 Novell Inc.
 * Copyright (c) 2012-2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 * Copyright (c) 2012-2019 Linux Foundation
 *
 * Core driver model functions and structures that should not be
 * shared outside of the drivers/base/ directory.
 *
 */
#include <linux/notifier.h>

/**
 * struct subsys_private - structure to hold the private to the driver core
 *                         portions of the bus_type/class structure.
 *
 * @subsys: the struct kset that defines this subsystem
 * @devices_kset: the subsystem's 'devices' directory
 * @interfaces: list of subsystem interfaces associated
 * @mutex: protect the devices, and interfaces lists.
 *
 * @drivers_kset: the list of drivers associated
 * @klist_devices: the klist to iterate over the @devices_kset
 * @klist_drivers: the klist to iterate over the @drivers_kset
 * @bus_notifier: the bus notifier list for anything that cares about things
 *                on this bus.
 * @drivers_autoprobe: single-bit flag; presumably selects whether drivers on
 *                     this bus are probed automatically (TODO: confirm
 *                     against the bus code, not visible from this header).
 * @bus: pointer back to the struct bus_type that this structure is associated
 *       with.
 * @dev_root: Default device to use as the parent.
 *
 * @glue_dirs: "glue" directory to put in-between the parent device to
 *             avoid namespace conflicts
 * @class: pointer back to the struct class that this structure is associated
 *         with.
 * @lock_key: Lock class key for use by the lock validator
 *
 * This structure is the one that is the actual kobject allowing struct
 * bus_type/class to be statically allocated safely.  Nothing outside of the
 * driver core should ever touch these fields.
 */
struct subsys_private {
        struct kset subsys;
        struct kset *devices_kset;
        struct list_head interfaces;
        struct mutex mutex;

        struct kset *drivers_kset;
        struct klist klist_devices;
        struct klist klist_drivers;
        struct blocking_notifier_head bus_notifier;
        unsigned int drivers_autoprobe:1;
        const struct bus_type *bus;
        struct device *dev_root;

        struct kset glue_dirs;
        const struct class *class;

        struct lock_class_key lock_key;
};
/* Map a pointer to the embedded subsys.kobj back to its struct subsys_private. */
#define to_subsys_private(obj) container_of_const(obj, struct subsys_private, subsys.kobj)

/*
 * Take a reference on @sp through its embedded kset and hand @sp back to
 * the caller.  A NULL @sp is passed through untouched.
 */
static inline struct subsys_private *subsys_get(struct subsys_private *sp)
{
        if (!sp)
                return sp;

        kset_get(&sp->subsys);
        return sp;
}

/*
 * Drop a reference on @sp through its embedded kset.  A NULL @sp is
 * silently ignored.
 */
static inline void subsys_put(struct subsys_private *sp)
{
        if (!sp)
                return;

        kset_put(&sp->subsys);
}

struct subsys_private *bus_to_subsys(const struct bus_type *bus);
struct subsys_private *class_to_subsys(const struct class *class);

/*
 * struct driver_private - driver-core private data attached to a driver.
 *
 * NOTE(review): the member descriptions below are inferred from the member
 * names and types only; confirm against the users in drivers/base/.
 *
 * kobj:          embedded kobject; to_driver() maps it back to this struct
 * klist_devices: presumably the devices bound to this driver
 * knode_bus:     presumably this driver's node in its bus's driver klist
 * mkobj:         module kobject pointer (presumably of the owning module)
 * driver:        pointer to a struct device_driver (presumably the one this
 *                private data belongs to)
 */
struct driver_private {
        struct kobject kobj;
        struct klist klist_devices;
        struct klist_node knode_bus;
        struct module_kobject *mkobj;
        struct device_driver *driver;
};
/* Map a pointer to the embedded kobj back to its struct driver_private. */
#define to_driver(obj) container_of(obj, struct driver_private, kobj)

/**
 * struct device_private - structure to hold the private to the driver core
 *                         portions of the device structure.
 *
 * @klist_children: klist containing all children of this device
 * @knode_parent: node in sibling list
 * @knode_driver: node in driver list
 * @knode_bus: node in bus list
 * @knode_class: node in class list
 * @deferred_probe: entry in deferred_probe_list which is used to retry the
 *        binding of drivers which were unable to get all the resources needed by
 *        the device; typically because it depends on another driver getting
 *        probed first.
 * @async_driver: pointer to device driver awaiting probe via async_probe
 * @deferred_probe_reason: string pointer; presumably a description of why the
 *        last probe was deferred (TODO: confirm against the code that sets
 *        it, not visible from this header).
 * @device: pointer back to the struct device that this structure is
 *        associated with.
 * @dead: This device is currently either in the process of or has been
 *        removed from the system. Any asynchronous events scheduled for this
 *        device should exit without taking any action.
 *
 * Nothing outside of the driver core should ever touch these fields.
 */
struct device_private {
        struct klist klist_children;
        struct klist_node knode_parent;
        struct klist_node knode_driver;
        struct klist_node knode_bus;
        struct klist_node knode_class;
        struct list_head deferred_probe;
        const struct device_driver *async_driver;
        char *deferred_probe_reason;
        struct device *device;
        u8 dead:1;
};
/*
 * Each helper below maps a pointer to one of the embedded klist nodes back
 * to the struct device_private that contains it.
 */
#define to_device_private_parent(obj)        \
        container_of(obj, struct device_private, knode_parent)
#define to_device_private_driver(obj)        \
        container_of(obj, struct device_private, knode_driver)
#define to_device_private_bus(obj)        \
        container_of(obj, struct device_private, knode_bus)
#define to_device_private_class(obj)        \
        container_of(obj, struct device_private, knode_class)

/* initialisation functions */
int devices_init(void);
int buses_init(void);
int classes_init(void);
int firmware_init(void);
#ifdef CONFIG_SYS_HYPERVISOR
int hypervisor_init(void);
#else
/* Built without CONFIG_SYS_HYPERVISOR: nothing to initialise, report success. */
static inline int hypervisor_init(void)
{
        return 0;
}
#endif
int platform_bus_init(void);
int faux_bus_init(void);
void cpu_dev_init(void);
void container_dev_init(void);
#ifdef CONFIG_AUXILIARY_BUS
void auxiliary_bus_init(void);
#else
/* Built without CONFIG_AUXILIARY_BUS: initialisation is a no-op. */
static inline void auxiliary_bus_init(void)
{
}
#endif

struct kobject *virtual_device_parent(void);

int bus_add_device(struct device *dev);
void bus_probe_device(struct device *dev);
void bus_remove_device(struct device *dev);
void bus_notify(struct device *dev, enum bus_notifier_event value);
bool bus_is_registered(const struct bus_type *bus);

int bus_add_driver(struct device_driver *drv);
void bus_remove_driver(struct device_driver *drv);
void device_release_driver_internal(struct device *dev, const struct device_driver *drv,
                                    struct device *parent);

void driver_detach(const struct device_driver *drv);
void driver_deferred_probe_del(struct device *dev);
void device_set_deferred_probe_reason(const struct device *dev, struct va_format *vaf);
/*
 * Ask @drv's bus whether @drv can handle @dev.
 *
 * Returns whatever the bus ->match() callback returns; a bus that provides
 * no ->match() callback is treated as matching everything (returns 1).
 */
static inline int driver_match_device(const struct device_driver *drv,
                                      struct device *dev)
{
        if (!drv->bus->match)
                return 1;

        return drv->bus->match(dev, drv);
}

/*
 * Invoke the sync_state() callback for @dev.  The bus-level callback takes
 * precedence; the driver-level callback is only used when the bus has none
 * and @dev has a driver that provides one.
 */
static inline void dev_sync_state(struct device *dev)
{
        if (dev->bus->sync_state) {
                dev->bus->sync_state(dev);
                return;
        }

        if (dev->driver && dev->driver->sync_state)
                dev->driver->sync_state(dev);
}

int driver_add_groups(const struct device_driver *drv, const struct attribute_group **groups);
void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups);
void device_driver_detach(struct device *dev);

/*
 * Store @drv in @dev->driver with a single WRITE_ONCE() so that lockless
 * readers never observe a torn pointer.
 */
static inline void device_set_driver(struct device *dev, const struct device_driver *drv)
{
        /*
         * The majority of (all?) read accesses to dev->driver happen either
         * while holding the device lock or in bus/driver code that is only
         * invoked when the device is bound to a driver, where there is no
         * concern of the pointer being changed while it is being read.
         * However, when reading a device's uevent file we read the driver
         * pointer without taking the device lock (so we do not block there
         * for an arbitrary amount of time). We use WRITE_ONCE() here to
         * prevent tearing so that READ_ONCE() can safely be used in the
         * uevent code.
         */
        // FIXME - this cast should not be needed "soon"
        WRITE_ONCE(dev->driver, (struct device_driver *)drv);
}

int devres_release_all(struct device *dev);
void device_block_probing(void);
void device_unblock_probing(void);
void deferred_probe_extend_timeout(void);
void driver_deferred_probe_trigger(void);
const char *device_get_devnode(const struct device *dev, umode_t *mode,
                               kuid_t *uid, kgid_t *gid, const char **tmp);

/* /sys/devices directory */
extern struct kset *devices_kset;
void devices_kset_move_last(struct device *dev);

/*
 * Module <-> driver linkage helpers.  The real implementations only exist
 * when both CONFIG_MODULES and CONFIG_SYSFS are enabled; otherwise the
 * inline stubs below are used.
 *
 * The stubs take the same const-qualified driver pointer as the real
 * prototypes so that a caller holding a "const struct device_driver *"
 * compiles identically in every configuration.
 */
#if defined(CONFIG_MODULES) && defined(CONFIG_SYSFS)
int module_add_driver(struct module *mod, const struct device_driver *drv);
void module_remove_driver(const struct device_driver *drv);
#else
/* Stub: nothing to link, always reports success (0). */
static inline int module_add_driver(struct module *mod,
                                    const struct device_driver *drv)
{
        return 0;
}
/* Stub: nothing to unlink. */
static inline void module_remove_driver(const struct device_driver *drv) { }
#endif

#ifdef CONFIG_DEVTMPFS
int devtmpfs_init(void);
#else
/* Built without CONFIG_DEVTMPFS: nothing to initialise, report success. */
static inline int devtmpfs_init(void)
{
        return 0;
}
#endif

#ifdef CONFIG_BLOCK
extern const struct class block_class;
/* True when @dev's class pointer is &block_class. */
static inline bool is_blockdev(struct device *dev)
{
        const struct class *cls = dev->class;

        return cls == &block_class;
}
#else
/* Built without CONFIG_BLOCK: always reports false. */
static inline bool is_blockdev(struct device *dev)
{
        return false;
}
#endif

/* Device links support */
int device_links_read_lock(void);
void device_links_read_unlock(int idx);
int device_links_read_lock_held(void);
int device_links_check_suppliers(struct device *dev);
void device_links_force_bind(struct device *dev);
void device_links_driver_bound(struct device *dev);
void device_links_driver_cleanup(struct device *dev);
void device_links_no_driver(struct device *dev);
bool device_links_busy(struct device *dev);
void device_links_unbind_consumers(struct device *dev);
void fw_devlink_drivers_done(void);
void fw_devlink_probing_done(void);

/* device pm support */
void device_pm_move_to_tail(struct device *dev);

#ifdef CONFIG_DEVTMPFS
int devtmpfs_create_node(struct device *dev);
int devtmpfs_delete_node(struct device *dev);
#else
/* Built without CONFIG_DEVTMPFS: node creation is a no-op reporting success. */
static inline int devtmpfs_create_node(struct device *dev)
{
        return 0;
}

/* Built without CONFIG_DEVTMPFS: node deletion is a no-op reporting success. */
static inline int devtmpfs_delete_node(struct device *dev)
{
        return 0;
}
#endif

void software_node_notify(struct device *dev);
void software_node_notify_remove(struct device *dev);















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   31 




   30 
   31 






   31 
   31 


































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/mmu.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/cache.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kexec.h>
#include <linux/libfdt.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/memory.h>
#include <linux/fs.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/set_memory.h>
#include <linux/kfence.h>
#include <linux/pkeys.h>

#include <asm/barrier.h>
#include <asm/cputype.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/ptdump.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
#include <asm/kfence.h>

/*
 * Bits for the "flags" argument of the mapping helpers in this file:
 *
 *  NO_BLOCK_MAPPINGS - never install a PMD/PUD block (huge) entry, even when
 *                      the range is suitably aligned
 *  NO_CONT_MAPPINGS  - never add PTE_CONT to the entries being written
 *  NO_EXEC_MAPPINGS  - set the PXN bit in every table entry that is created
 *                      while building the mapping
 */
#define NO_BLOCK_MAPPINGS        BIT(0)
#define NO_CONT_MAPPINGS        BIT(1)
#define NO_EXEC_MAPPINGS        BIT(2)        /* assumes FEAT_HPDS is not used */

u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);

u32 __boot_cpu_mode[] = { BOOT_CPU_MODE_EL2, BOOT_CPU_MODE_EL1 };

/*
 * While this is true, set_swapper_pgd() writes the pgd entry directly
 * instead of going through a temporary fixmap mapping.
 */
static bool rodata_is_rw __ro_after_init = true;

/*
 * The booting CPU updates the failed status @__early_cpu_boot_status,
 * with MMU turned off.
 */
long __section(".mmuoff.data.write") __early_cpu_boot_status;

/*
 * Empty_zero_page is a special page that is used for zero-initialized data
 * and COW.
 */
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);

/* Serialises the fixmap-based update path in set_swapper_pgd(). */
static DEFINE_SPINLOCK(swapper_pgdir_lock);
static DEFINE_MUTEX(fixmap_lock);

/*
 * Write @pgd into the swapper page directory slot @pgdp.
 *
 * While rodata_is_rw is set the slot is written in place, followed by the
 * barriers that publish it.  Otherwise the slot is written through a
 * temporary fixmap alias, under swapper_pgdir_lock.
 */
void noinstr set_swapper_pgd(pgd_t *pgdp, pgd_t pgd)
{
        pgd_t *alias;

        if (!rodata_is_rw) {
                spin_lock(&swapper_pgdir_lock);
                alias = pgd_set_fixmap(__pa_symbol(pgdp));
                WRITE_ONCE(*alias, pgd);
                /*
                 * We need dsb(ishst) here to ensure the page-table-walker
                 * sees our new entry before set_p?d() returns. The fixmap's
                 * flush_tlb_kernel_range() via clear_fixmap() does this for
                 * us.
                 */
                pgd_clear_fixmap();
                spin_unlock(&swapper_pgdir_lock);
                return;
        }

        /*
         * swapper_pg_dir is still mapped writable in the kernel mapping,
         * so don't bother with the fixmap.
         */
        WRITE_ONCE(*pgdp, pgd);
        dsb(ishst);
        isb();
}

/*
 * Choose the page protection for a physical-memory mapping of @pfn.
 *
 * - @pfn is not map memory (per pfn_is_map_memory()): non-cached variant of
 *   @vma_prot
 * - otherwise, @file opened with O_SYNC: write-combine variant of @vma_prot
 * - otherwise: @vma_prot unchanged
 *
 * @size is not consulted.
 */
pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                              unsigned long size, pgprot_t vma_prot)
{
        if (!pfn_is_map_memory(pfn))
                return pgprot_noncached(vma_prot);

        if (file->f_flags & O_SYNC)
                return pgprot_writecombine(vma_prot);

        return vma_prot;
}
EXPORT_SYMBOL(phys_mem_access_prot);

/*
 * Allocate one page-aligned page from memblock for use as a page table and
 * return its physical address.  Panics if the allocation fails.
 *
 * @shift is not used by this allocator.
 */
static phys_addr_t __init early_pgtable_alloc(int shift)
{
        phys_addr_t table_pa = memblock_phys_alloc_range(PAGE_SIZE, PAGE_SIZE,
                                                         0,
                                                         MEMBLOCK_ALLOC_NOLEAKTRACE);

        if (!table_pa)
                panic("Failed to allocate page table page\n");

        return table_pa;
}

/*
 * Decide whether the descriptor value @old may be replaced by @new in a
 * live kernel mapping without going through break-before-make.
 *
 * Returns true when the change is safe, false otherwise.
 */
bool pgattr_change_is_safe(pteval_t old, pteval_t new)
{
        /*
         * Attribute bits that may be updated in live kernel mappings
         * without the need for break-before-make.
         */
        pteval_t changeable = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG |
                              PTE_SWBITS_MASK;
        const pteval_t old_attr = old & PTE_ATTRINDX_MASK;
        const pteval_t new_attr = new & PTE_ATTRINDX_MASK;
        bool old_is_normal, new_is_normal;

        /* Creating or taking down a mapping is always safe. */
        if (!(pte_valid(__pte(old)) && pte_valid(__pte(new))))
                return true;

        /* The pfn of a live entry must stay the same. */
        if (pte_pfn(__pte(old)) != pte_pfn(__pte(new)))
                return false;

        /* Live contiguous mappings may not be manipulated at all. */
        if ((old | new) & PTE_CONT)
                return false;

        /* Going from Non-Global to Global is unsafe. */
        if ((old & PTE_NG & ~new) != 0)
                return false;

        /*
         * Switching the memory type between Normal and Normal-Tagged is
         * safe, since Tagged is considered a permission attribute from the
         * mismatched attribute aliases perspective.
         */
        old_is_normal = old_attr == PTE_ATTRINDX(MT_NORMAL) ||
                        old_attr == PTE_ATTRINDX(MT_NORMAL_TAGGED);
        new_is_normal = new_attr == PTE_ATTRINDX(MT_NORMAL) ||
                        new_attr == PTE_ATTRINDX(MT_NORMAL_TAGGED);
        if (old_is_normal && new_is_normal)
                changeable |= PTE_ATTRINDX_MASK;

        /* Safe only if every differing bit is one of the changeable ones. */
        return !((old ^ new) & ~changeable);
}

/*
 * Zero the page at @table (via clear_page()) and then issue dsb(ishst), so
 * the zeroing is ordered before whatever the caller does next with the
 * table.
 */
static void init_clear_pgtable(void *table)
{
        clear_page(table);

        /* Ensure the zeroing is observed by page table walks. */
        dsb(ishst);
}

/*
 * Write consecutive PTEs covering the virtual range [@addr, @end), mapping
 * it to the physical range starting at @phys with protection @prot.
 *
 * @ptep: first PTE slot to write; advanced by one slot per page
 * @addr: start virtual address; advanced by PAGE_SIZE per iteration
 * @end:  end virtual address (exclusive)
 * @phys: physical address backing @addr
 * @prot: protection bits applied to every entry
 *
 * Each write is checked with pgattr_change_is_safe() against the previous
 * contents of the slot; an unsafe change triggers BUG_ON().  Because this is
 * a do/while loop, at least one entry is always written, so @addr must not
 * equal @end on entry.
 */
static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
                     phys_addr_t phys, pgprot_t prot)
{
        do {
                /* Snapshot the slot so the change can be validated below. */
                pte_t old_pte = __ptep_get(ptep);

                /*
                 * Required barriers to make this visible to the table walker
                 * are deferred to the end of alloc_init_cont_pte().
                 */
                __set_pte_nosync(ptep, pfn_pte(__phys_to_pfn(phys), prot));

                /*
                 * After the PTE entry has been populated once, we
                 * only allow updates to the permission attributes.
                 */
                BUG_ON(!pgattr_change_is_safe(pte_val(old_pte),
                                              pte_val(__ptep_get(ptep))));

                phys += PAGE_SIZE;
        } while (ptep++, addr += PAGE_SIZE, addr != end);
}

/*
 * Populate the PTE level beneath @pmdp for the virtual range [@addr, @end),
 * allocating the PTE table first if the PMD entry is empty.
 *
 * @pmdp:          PMD entry that points (or will point) at the PTE table
 * @addr, @end:    virtual range to map (@end exclusive)
 * @phys:          physical address backing @addr
 * @prot:          protection bits for the entries
 * @pgtable_alloc: allocator returning the physical address of a new table;
 *                 must be non-NULL if a table has to be allocated
 * @flags:         NO_EXEC_MAPPINGS adds PMD_TABLE_PXN to a newly created
 *                 table entry; NO_CONT_MAPPINGS suppresses PTE_CONT
 *
 * The PTE table is accessed through the PTE fixmap slot, which is released
 * before returning.  BUGs if the PMD entry is a section mapping, or is
 * present but bad.
 */
static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
                                unsigned long end, phys_addr_t phys,
                                pgprot_t prot,
                                phys_addr_t (*pgtable_alloc)(int),
                                int flags)
{
        unsigned long next;
        pmd_t pmd = READ_ONCE(*pmdp);
        pte_t *ptep;

        BUG_ON(pmd_sect(pmd));
        if (pmd_none(pmd)) {
                pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
                phys_addr_t pte_phys;

                if (flags & NO_EXEC_MAPPINGS)
                        pmdval |= PMD_TABLE_PXN;
                BUG_ON(!pgtable_alloc);
                pte_phys = pgtable_alloc(PAGE_SHIFT);
                ptep = pte_set_fixmap(pte_phys);
                /* Zero the new table before it is hooked into the PMD. */
                init_clear_pgtable(ptep);
                ptep += pte_index(addr);
                __pmd_populate(pmdp, pte_phys, pmdval);
        } else {
                BUG_ON(pmd_bad(pmd));
                ptep = pte_set_fixmap_offset(pmdp, addr);
        }

        /* Walk the range one contiguous-PTE-sized chunk at a time. */
        do {
                pgprot_t __prot = prot;

                next = pte_cont_addr_end(addr, end);

                /* use a contiguous mapping if the range is suitably aligned */
                if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) &&
                    (flags & NO_CONT_MAPPINGS) == 0)
                        __prot = __pgprot(pgprot_val(prot) | PTE_CONT);

                init_pte(ptep, addr, next, phys, __prot);

                ptep += pte_index(next) - pte_index(addr);
                phys += next - addr;
        } while (addr = next, addr != end);

        /*
         * Note: barriers and maintenance necessary to clear the fixmap slot
         * ensure that all previous pgtable writes are visible to the table
         * walker.
         */
        pte_clear_fixmap();
}

/*
 * Fill consecutive PMD entries covering the virtual range [@addr, @end).
 *
 * For each PMD-sized step, a block (section) entry is installed when @addr,
 * the step end and @phys are all PMD-aligned and NO_BLOCK_MAPPINGS is not
 * set in @flags; otherwise the step is mapped at PTE level through
 * alloc_init_cont_pte().
 *
 * @pmdp:          first PMD entry to fill; advanced once per step
 * @phys:          physical address backing @addr
 * @prot:          protection bits for the entries
 * @pgtable_alloc: table allocator handed down to alloc_init_cont_pte()
 * @flags:         NO_*_MAPPINGS bits
 *
 * BUGs if a block entry is changed in a way pgattr_change_is_safe() rejects,
 * or if an already-populated entry has a different value after the PTE-level
 * path ran.
 */
static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
                     phys_addr_t phys, pgprot_t prot,
                     phys_addr_t (*pgtable_alloc)(int), int flags)
{
        unsigned long next;

        do {
                /* Snapshot the entry so the change can be validated below. */
                pmd_t old_pmd = READ_ONCE(*pmdp);

                next = pmd_addr_end(addr, end);

                /* try section mapping first */
                if (((addr | next | phys) & ~PMD_MASK) == 0 &&
                    (flags & NO_BLOCK_MAPPINGS) == 0) {
                        pmd_set_huge(pmdp, phys, prot);

                        /*
                         * After the PMD entry has been populated once, we
                         * only allow updates to the permission attributes.
                         */
                        BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd),
                                                      READ_ONCE(pmd_val(*pmdp))));
                } else {
                        alloc_init_cont_pte(pmdp, addr, next, phys, prot,
                                            pgtable_alloc, flags);

                        /* An existing table entry must not have been replaced. */
                        BUG_ON(pmd_val(old_pmd) != 0 &&
                               pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp)));
                }
                phys += next - addr;
        } while (pmdp++, addr = next, addr != end);
}

/*
 * Populate the PMD level beneath @pudp for the virtual range [@addr, @end),
 * allocating the PMD table first if the PUD entry is empty.
 *
 * @pudp:          PUD entry that points (or will point) at the PMD table
 * @addr, @end:    virtual range to map (@end exclusive)
 * @phys:          physical address backing @addr
 * @prot:          protection bits for the entries
 * @pgtable_alloc: allocator returning the physical address of a new table;
 *                 must be non-NULL if a table has to be allocated
 * @flags:         NO_EXEC_MAPPINGS adds PUD_TABLE_PXN to a newly created
 *                 table entry; NO_CONT_MAPPINGS suppresses PTE_CONT
 *
 * The PMD table is accessed through the PMD fixmap slot, which is released
 * before returning.  BUGs if the PUD entry is a section mapping, or is
 * present but bad.
 */
static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
                                unsigned long end, phys_addr_t phys,
                                pgprot_t prot,
                                phys_addr_t (*pgtable_alloc)(int), int flags)
{
        unsigned long next;
        pud_t pud = READ_ONCE(*pudp);
        pmd_t *pmdp;

        /*
         * Check for initial section mappings in the pgd/pud.
         */
        BUG_ON(pud_sect(pud));
        if (pud_none(pud)) {
                pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
                phys_addr_t pmd_phys;

                if (flags & NO_EXEC_MAPPINGS)
                        pudval |= PUD_TABLE_PXN;
                BUG_ON(!pgtable_alloc);
                pmd_phys = pgtable_alloc(PMD_SHIFT);
                pmdp = pmd_set_fixmap(pmd_phys);
                /* Zero the new table before it is hooked into the PUD. */
                init_clear_pgtable(pmdp);
                pmdp += pmd_index(addr);
                __pud_populate(pudp, pmd_phys, pudval);
        } else {
                BUG_ON(pud_bad(pud));
                pmdp = pmd_set_fixmap_offset(pudp, addr);
        }

        /* Walk the range one contiguous-PMD-sized chunk at a time. */
        do {
                pgprot_t __prot = prot;

                next = pmd_cont_addr_end(addr, end);

                /* use a contiguous mapping if the range is suitably aligned */
                if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) &&
                    (flags & NO_CONT_MAPPINGS) == 0)
                        __prot = __pgprot(pgprot_val(prot) | PTE_CONT);

                init_pmd(pmdp, addr, next, phys, __prot, pgtable_alloc, flags);

                pmdp += pmd_index(next) - pmd_index(addr);
                phys += next - addr;
        } while (addr = next, addr != end);

        pmd_clear_fixmap();
}

/*
 * Populate the PUD level beneath @p4dp for the virtual range [@addr, @end),
 * allocating the PUD table first if the P4D entry is empty.
 *
 * For each PUD-sized step, a block entry is installed when
 * pud_sect_supported() is true, @addr, the step end and @phys are all
 * PUD-aligned and NO_BLOCK_MAPPINGS is not set in @flags; otherwise the step
 * is mapped at PMD level through alloc_init_cont_pmd().
 *
 * @p4dp:          P4D entry that points (or will point) at the PUD table
 * @phys:          physical address backing @addr
 * @prot:          protection bits for the entries
 * @pgtable_alloc: allocator returning the physical address of a new table;
 *                 must be non-NULL if a table has to be allocated
 * @flags:         NO_*_MAPPINGS bits; NO_EXEC_MAPPINGS adds P4D_TABLE_PXN to
 *                 a newly created table entry
 *
 * The PUD table is accessed through the PUD fixmap slot, which is released
 * before returning.  BUGs if the P4D entry is present but bad, if a block
 * entry is changed in a way pgattr_change_is_safe() rejects, or if an
 * already-populated entry has a different value after the PMD-level path
 * ran.
 */
static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
                           phys_addr_t phys, pgprot_t prot,
                           phys_addr_t (*pgtable_alloc)(int),
                           int flags)
{
        unsigned long next;
        p4d_t p4d = READ_ONCE(*p4dp);
        pud_t *pudp;

        if (p4d_none(p4d)) {
                p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
                phys_addr_t pud_phys;

                if (flags & NO_EXEC_MAPPINGS)
                        p4dval |= P4D_TABLE_PXN;
                BUG_ON(!pgtable_alloc);
                pud_phys = pgtable_alloc(PUD_SHIFT);
                pudp = pud_set_fixmap(pud_phys);
                /* Zero the new table before it is hooked into the P4D. */
                init_clear_pgtable(pudp);
                pudp += pud_index(addr);
                __p4d_populate(p4dp, pud_phys, p4dval);
        } else {
                BUG_ON(p4d_bad(p4d));
                pudp = pud_set_fixmap_offset(p4dp, addr);
        }

        do {
                /* Snapshot the entry so the change can be validated below. */
                pud_t old_pud = READ_ONCE(*pudp);

                next = pud_addr_end(addr, end);

                /*
                 * For 4K granule only, attempt to put down a 1GB block
                 */
                if (pud_sect_supported() &&
                   ((addr | next | phys) & ~PUD_MASK) == 0 &&
                    (flags & NO_BLOCK_MAPPINGS) == 0) {
                        pud_set_huge(pudp, phys, prot);

                        /*
                         * After the PUD entry has been populated once, we
                         * only allow updates to the permission attributes.
                         */
                        BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
                                                      READ_ONCE(pud_val(*pudp))));
                } else {
                        alloc_init_cont_pmd(pudp, addr, next, phys, prot,
                                            pgtable_alloc, flags);

                        /* An existing table entry must not have been replaced. */
                        BUG_ON(pud_val(old_pud) != 0 &&
                               pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
                }
                phys += next - addr;
        } while (pudp++, addr = next, addr != end);

        pud_clear_fixmap();
}

/*
 * Populate the P4D level for the virtual range [addr, end) below *pgdp.
 *
 * When the PGD entry is empty, a new table is obtained from pgtable_alloc()
 * (which must then be non-NULL), cleared through its fixmap mapping and
 * installed in the PGD entry with table attributes; PGD_TABLE_PXN is added
 * when NO_EXEC_MAPPINGS is set in flags. Otherwise the existing entry must
 * pass the pgd_bad() check and its table is reached through the fixmap.
 *
 * Every P4D-sized piece of the range is then handed to alloc_init_pud(),
 * with phys advanced by the same amount as addr.
 */
static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
                           phys_addr_t phys, pgprot_t prot,
                           phys_addr_t (*pgtable_alloc)(int),
                           int flags)
{
        unsigned long next;
        pgd_t pgd = READ_ONCE(*pgdp);
        p4d_t *p4dp;

        if (pgd_none(pgd)) {
                pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
                phys_addr_t p4d_phys;

                if (flags & NO_EXEC_MAPPINGS)
                        pgdval |= PGD_TABLE_PXN;
                BUG_ON(!pgtable_alloc);
                p4d_phys = pgtable_alloc(P4D_SHIFT);
                /* Clear the new table before the PGD entry points at it. */
                p4dp = p4d_set_fixmap(p4d_phys);
                init_clear_pgtable(p4dp);
                p4dp += p4d_index(addr);
                __pgd_populate(pgdp, p4d_phys, pgdval);
        } else {
                BUG_ON(pgd_bad(pgd));
                p4dp = p4d_set_fixmap_offset(pgdp, addr);
        }

        do {
                p4d_t old_p4d = READ_ONCE(*p4dp);

                next = p4d_addr_end(addr, end);

                alloc_init_pud(p4dp, addr, next, phys, prot,
                               pgtable_alloc, flags);

                /* An entry that was already populated must be left unchanged. */
                BUG_ON(p4d_val(old_p4d) != 0 &&
                       p4d_val(old_p4d) != READ_ONCE(p4d_val(*p4dp)));

                phys += next - addr;
        } while (p4dp++, addr = next, addr != end);

        p4d_clear_fixmap();
}

/*
 * Create a mapping of [virt, virt + size) onto phys in the page tables
 * rooted at pgdir, one PGD entry at a time via alloc_init_p4d().
 *
 * The range is widened to whole pages. This function takes no lock itself.
 */
static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys,
                                        unsigned long virt, phys_addr_t size,
                                        pgprot_t prot,
                                        phys_addr_t (*pgtable_alloc)(int),
                                        int flags)
{
        pgd_t *pgdp = pgd_offset_pgd(pgdir, virt);
        unsigned long va, va_end, va_next;

        /*
         * The caller's request only makes sense when the physical and the
         * virtual address share the same offset within a page.
         */
        if (WARN_ON((phys ^ virt) & ~PAGE_MASK))
                return;

        /* Round the start down and the end up to page boundaries. */
        va = virt & PAGE_MASK;
        va_end = PAGE_ALIGN(virt + size);
        phys &= PAGE_MASK;

        for (;;) {
                va_next = pgd_addr_end(va, va_end);
                alloc_init_p4d(pgdp, va, va_next, phys, prot, pgtable_alloc,
                               flags);
                phys += va_next - va;

                pgdp++;
                va = va_next;
                if (va == va_end)
                        break;
        }
}

/*
 * Wrapper around __create_pgd_mapping_locked() that holds fixmap_lock for
 * the duration of the call. All arguments are passed through unchanged.
 */
static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
                                 unsigned long virt, phys_addr_t size,
                                 pgprot_t prot,
                                 phys_addr_t (*pgtable_alloc)(int),
                                 int flags)
{
        mutex_lock(&fixmap_lock);
        __create_pgd_mapping_locked(pgdir, phys, virt, size, prot,
                                    pgtable_alloc, flags);
        mutex_unlock(&fixmap_lock);
}

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
 * create_kpti_ng_temp_pgd() is an alias of __create_pgd_mapping_locked(),
 * i.e. the variant that does not take fixmap_lock.
 */
extern __alias(__create_pgd_mapping_locked)
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
                             phys_addr_t size, pgprot_t prot,
                             phys_addr_t (*pgtable_alloc)(int), int flags);
#endif

/*
 * Allocate one page for use as a page table and return its physical
 * address. Allocation failure is fatal (BUG). The shift argument is not
 * used by this function.
 */
static phys_addr_t __pgd_pgtable_alloc(int shift)
{
        /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
        void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO);

        BUG_ON(!ptr);
        return __pa(ptr);
}

/*
 * Allocate a page table page via __pgd_pgtable_alloc() and, for PTE and
 * PMD level tables, run the matching page table constructor on it.
 * Returns the physical address of the new table.
 */
static phys_addr_t pgd_pgtable_alloc(int shift)
{
        phys_addr_t table_pa = __pgd_pgtable_alloc(shift);
        struct ptdesc *desc = page_ptdesc(phys_to_page(table_pa));

        /*
         * Core mm helpers such as apply_to_page_range() may later be
         * used on this pre-allocated table, so it has to go through the
         * proper constructor.
         *
         * ARCH_ENABLE_SPLIT_PMD_PTLOCK is not selected when the pmd is
         * folded, in which case pagetable_pte_ctor() is a nop.
         */
        if (shift == PAGE_SHIFT) {
                BUG_ON(!pagetable_pte_ctor(desc));
        } else if (shift == PMD_SHIFT) {
                BUG_ON(!pagetable_pmd_ctor(desc));
        }

        return table_pa;
}

/*
 * Map [virt, virt + size) onto phys in init_mm without ever allocating a
 * new level of table (no allocator is passed down). Existing table
 * entries may be modified, which still allows new section or page
 * entries to be created. Requests below PAGE_OFFSET are refused with a
 * warning.
 */
void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
                                   phys_addr_t size, pgprot_t prot)
{
        if (virt >= PAGE_OFFSET) {
                __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
                                     NO_CONT_MAPPINGS);
                return;
        }

        pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n",
                &phys, virt);
}

/*
 * Map [virt, virt + size) onto phys in the page tables of mm, which must
 * not be init_mm. Tables are allocated with pgd_pgtable_alloc(). When
 * page_mappings_only is set, block and contiguous mappings are disabled.
 */
void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
                               unsigned long virt, phys_addr_t size,
                               pgprot_t prot, bool page_mappings_only)
{
        int flags;

        BUG_ON(mm == &init_mm);

        flags = page_mappings_only ? (NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS) : 0;

        __create_pgd_mapping(mm->pgd, phys, virt, size, prot,
                             pgd_pgtable_alloc, flags);
}

/*
 * Re-create the init_mm mapping of [virt, virt + size) with new
 * protections, without allocating tables, then flush the kernel TLB for
 * that range. Requests below PAGE_OFFSET are refused with a warning.
 */
static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
                                phys_addr_t size, pgprot_t prot)
{
        if (virt >= PAGE_OFFSET) {
                __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
                                     NO_CONT_MAPPINGS);

                /* live kernel mappings were changed, so flush the TLBs */
                flush_tlb_kernel_range(virt, virt + size);
                return;
        }

        pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n",
                &phys, virt);
}

/*
 * Map the physical range [start, end) at the virtual address given by
 * __phys_to_virt(start), in the tables rooted at pgdp. Page tables are
 * allocated with early_pgtable_alloc().
 */
static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start,
                                  phys_addr_t end, pgprot_t prot, int flags)
{
        __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start,
                             prot, early_pgtable_alloc, flags);
}

/*
 * Make the linear alias of [_stext, __init_begin) read-only, which drops
 * the write permission from the alias of .text/.rodata.
 */
void __init mark_linear_text_alias_ro(void)
{
        unsigned long alias_va = (unsigned long)lm_alias(_stext);
        unsigned long span = (unsigned long)__init_begin - (unsigned long)_stext;

        update_mapping_prot(__pa_symbol(_stext), alias_va, span,
                            PAGE_KERNEL_RO);
}

#ifdef CONFIG_KFENCE

/*
 * True when the KFENCE pool should be set up during early boot. Defaults to
 * whether CONFIG_KFENCE_SAMPLE_INTERVAL is non-zero.
 */
bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;

/*
 * Handler for the "kfence.sample_interval" early parameter: a value that
 * parses successfully sets kfence_early_init to whether it is non-zero.
 * early_param() handlers are parsed before map_mem() below runs.
 */
static int __init parse_kfence_early_init(char *arg)
{
        int interval;

        if (!get_option(&arg, &interval))
                return 0;

        kfence_early_init = (interval != 0);
        return 0;
}
early_param("kfence.sample_interval", parse_kfence_early_init);

/*
 * Allocate the KFENCE pool from memblock when early KFENCE init is
 * requested. Returns the physical address of the pool, or 0 when early
 * init is disabled or the allocation failed (which also turns early init
 * off).
 */
static phys_addr_t __init arm64_kfence_alloc_pool(void)
{
        phys_addr_t pool_pa;

        if (!kfence_early_init)
                return 0;

        pool_pa = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
        if (pool_pa) {
                /* NOMAP is only temporary, see arm64_kfence_map_pool(). */
                memblock_mark_nomap(pool_pa, KFENCE_POOL_SIZE);
                return pool_pa;
        }

        pr_err("failed to allocate kfence pool\n");
        kfence_early_init = false;
        return 0;
}

/*
 * Map a previously allocated KFENCE pool (no-op when kfence_pool is 0),
 * clear its NOMAP marking and publish its virtual address in
 * __kfence_pool.
 */
static void __init arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp)
{
        if (kfence_pool) {
                /* The pool has to be mapped at page granularity. */
                __map_memblock(pgdp, kfence_pool,
                               kfence_pool + KFENCE_POOL_SIZE,
                               pgprot_tagged(PAGE_KERNEL),
                               NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS);
                memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
                __kfence_pool = phys_to_virt(kfence_pool);
        }
}
#else /* CONFIG_KFENCE */

/* Without CONFIG_KFENCE there is no pool: allocation yields 0, mapping is a no-op. */
static inline phys_addr_t arm64_kfence_alloc_pool(void) { return 0; }
static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) { }

#endif /* CONFIG_KFENCE */

/*
 * Create the linear mapping of all memblock memory ranges in the tables
 * rooted at pgdp.
 *
 * The kernel image range [_stext, __init_begin) and the early KFENCE pool
 * are temporarily marked NOMAP so that the generic loop skips them; they
 * are then mapped separately with their own flags and the NOMAP marking is
 * cleared again.
 */
static void __init map_mem(pgd_t *pgdp)
{
        static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
        phys_addr_t kernel_start = __pa_symbol(_stext);
        phys_addr_t kernel_end = __pa_symbol(__init_begin);
        phys_addr_t start, end;
        phys_addr_t early_kfence_pool;
        int flags = NO_EXEC_MAPPINGS;
        u64 i;

        /*
         * Setting hierarchical PXNTable attributes on table entries covering
         * the linear region is only possible if it is guaranteed that no table
         * entries at any level are being shared between the linear region and
         * the vmalloc region. Check whether this is true for the PGD level, in
         * which case it is guaranteed to be true for all other levels as well.
         * (Unless we are running with support for LPA2, in which case the
         * entire reduced VA space is covered by a single pgd_t which will have
         * been populated without the PXNTable attribute by the time we get here.)
         */
        BUILD_BUG_ON(pgd_index(direct_map_end - 1) == pgd_index(direct_map_end) &&
                     pgd_index(_PAGE_OFFSET(VA_BITS_MIN)) != PTRS_PER_PGD - 1);

        early_kfence_pool = arm64_kfence_alloc_pool();

        /* Restrict the linear map to page mappings when can_set_direct_map(). */
        if (can_set_direct_map())
                flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

        /*
         * Take care not to create a writable alias for the
         * read-only text and rodata sections of the kernel image.
         * So temporarily mark them as NOMAP to skip mappings in
         * the following for-loop
         */
        memblock_mark_nomap(kernel_start, kernel_end - kernel_start);

        /* map all the memory banks */
        for_each_mem_range(i, &start, &end) {
                if (start >= end)
                        break;
                /*
                 * The linear map must allow allocation tags reading/writing
                 * if MTE is present. Otherwise, it has the same attributes as
                 * PAGE_KERNEL.
                 */
                __map_memblock(pgdp, start, end, pgprot_tagged(PAGE_KERNEL),
                               flags);
        }

        /*
         * Map the linear alias of the [_stext, __init_begin) interval
         * as non-executable now, and remove the write permission in
         * mark_linear_text_alias_ro() below (which will be called after
         * alternative patching has completed). This makes the contents
         * of the region accessible to subsystems such as hibernate,
         * but protects it from inadvertent modification or execution.
         * Note that contiguous mappings cannot be remapped in this way,
         * so we should avoid them here.
         */
        __map_memblock(pgdp, kernel_start, kernel_end,
                       PAGE_KERNEL, NO_CONT_MAPPINGS);
        memblock_clear_nomap(kernel_start, kernel_end - kernel_start);
        /* Map the KFENCE pool last; this is a no-op if none was allocated. */
        arm64_kfence_map_pool(early_kfence_pool, pgdp);
}

/*
 * Make [__start_rodata, __init_begin) read-only. The range runs up to
 * __init_begin rather than __end_rodata so that NOTES and EXCEPTION_TABLE
 * are covered too. rodata_is_rw is cleared before the mapping is updated.
 */
void mark_rodata_ro(void)
{
        const unsigned long rodata_va = (unsigned long)__start_rodata;
        const unsigned long rodata_len = (unsigned long)__init_begin - rodata_va;

        WRITE_ONCE(rodata_is_rw, false);
        update_mapping_prot(__pa_symbol(__start_rodata), rodata_va,
                            rodata_len, PAGE_KERNEL_RO);
}

/*
 * Fill in *vma to describe the kernel image segment [va_start, va_end)
 * and register it with vm_area_add_early(). Both the physical start and
 * the size must be page aligned. Unless VM_NO_GUARD is set in vm_flags,
 * the recorded size is extended by one page.
 */
static void __init declare_vma(struct vm_struct *vma,
                               void *va_start, void *va_end,
                               unsigned long vm_flags)
{
        phys_addr_t seg_pa = __pa_symbol(va_start);
        unsigned long seg_len = va_end - va_start;

        BUG_ON(!PAGE_ALIGNED(seg_pa));
        BUG_ON(!PAGE_ALIGNED(seg_len));

        if ((vm_flags & VM_NO_GUARD) == 0)
                seg_len += PAGE_SIZE;

        vma->addr = va_start;
        vma->phys_addr = seg_pa;
        vma->size = seg_len;
        vma->flags = VM_MAP | vm_flags;
        vma->caller = __builtin_return_address(0);

        vm_area_add_early(vma);
}

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
 * Protection used for executable kernel mappings: read-only executable
 * when rodata_enabled is set, plain executable otherwise.
 */
static pgprot_t kernel_exec_prot(void)
{
        if (rodata_enabled)
                return PAGE_KERNEL_ROX;

        return PAGE_KERNEL_EXEC;
}

/*
 * core_initcall that maps the entry trampoline text, both into
 * tramp_pg_dir at TRAMP_VALIAS and into the kernel fixmap. Does nothing
 * unless arm64_kernel_unmapped_at_el0() is true. Always returns 0.
 */
static int __init map_entry_trampoline(void)
{
        int i;

        if (!arm64_kernel_unmapped_at_el0())
                return 0;

        pgprot_t prot = kernel_exec_prot();
        phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start);

        /* The trampoline is always mapped and can therefore be global */
        pgprot_val(prot) &= ~PTE_NG;

        /* Map only the text into the trampoline page table */
        memset(tramp_pg_dir, 0, PGD_SIZE);
        __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS,
                             entry_tramp_text_size(), prot,
                             __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS);

        /* Map both the text and data into the kernel page table */
        /* Text page i goes into fixmap slot FIX_ENTRY_TRAMP_TEXT1 - i. */
        for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++)
                __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
                             pa_start + i * PAGE_SIZE, prot);

        /*
         * i now equals the number of text pages, so this maps the page
         * that follows the text, read-only, into the next lower slot.
         */
        if (IS_ENABLED(CONFIG_RELOCATABLE))
                __set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
                             pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);

        return 0;
}
core_initcall(map_entry_trampoline);
#endif

/*
 * Declare the VMA areas for the kernel
 *
 * One vm_struct is registered per kernel image segment. Only the last
 * one is declared without VM_NO_GUARD, so only it has its size extended
 * by declare_vma().
 */
static void __init declare_kernel_vmas(void)
{
        static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];

        /* [_stext, _etext) */
        declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
        /* [__start_rodata, __inittext_begin) */
        declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
        /* [__inittext_begin, __inittext_end) */
        declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
        /* [__initdata_begin, __initdata_end) */
        declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
        /* [_data, _end) */
        declare_vma(&vmlinux_seg[4], _data, _end, 0);
}

/* Prototype only; the definition is not in this part of the file. */
void __pi_map_range(u64 *pgd, u64 start, u64 end, u64 pa, pgprot_t prot,
                    int level, pte_t *tbl, bool may_use_cont, u64 va_offset);

/*
 * Page-aligned backing storage, IDMAP_LEVELS - 1 pages each, whose
 * addresses create_idmap() hands to __pi_map_range().
 * NOTE(review): presumably used for the lower-level ID map tables - confirm
 * against the __pi_map_range() implementation.
 */
static u8 idmap_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init,
          kpti_ptes[IDMAP_LEVELS - 1][PAGE_SIZE] __aligned(PAGE_SIZE) __ro_after_init;

/*
 * Populate idmap_pg_dir with an identity mapping (the virtual start and
 * the physical address passed to __pi_map_range() are the same) of
 * [__idmap_text_start, __idmap_text_end) using PAGE_KERNEL_ROX.
 *
 * With CONFIG_UNMAP_KERNEL_AT_EL0 and !arm64_use_ng_mappings, the
 * __idmap_kpti_flag word is additionally identity-mapped with PAGE_KERNEL.
 */
static void __init create_idmap(void)
{
        u64 start = __pa_symbol(__idmap_text_start);
        u64 end   = __pa_symbol(__idmap_text_end);
        u64 ptep  = __pa_symbol(idmap_ptes);

        /* The last argument is the virtual-minus-physical offset of ptep. */
        __pi_map_range(&ptep, start, end, start, PAGE_KERNEL_ROX,
                       IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
                       __phys_to_virt(ptep) - ptep);

        if (IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && !arm64_use_ng_mappings) {
                extern u32 __idmap_kpti_flag;
                u64 pa = __pa_symbol(&__idmap_kpti_flag);

                /*
                 * The KPTI G-to-nG conversion code needs a read-write mapping
                 * of its synchronization flag in the ID map.
                 */
                ptep = __pa_symbol(kpti_ptes);
                __pi_map_range(&ptep, pa, pa + sizeof(u32), pa, PAGE_KERNEL,
                               IDMAP_ROOT_LEVEL, (pte_t *)idmap_pg_dir, false,
                               __phys_to_virt(ptep) - ptep);
        }
}

/*
 * Set up the kernel page tables: create the linear map in swapper_pg_dir,
 * allow memblock to resize its arrays, build the ID map and declare the
 * kernel image VMAs.
 */
void __init paging_init(void)
{
        map_mem(swapper_pg_dir);

        memblock_allow_resize();

        create_idmap();
        declare_kernel_vmas();
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Release size bytes of memory starting at page. With an altmap the pages
 * are handed back to it; otherwise they are freed with free_pages(),
 * after warning if the first page is marked reserved.
 */
static void free_hotplug_page_range(struct page *page, size_t size,
                                    struct vmem_altmap *altmap)
{
        if (!altmap) {
                WARN_ON(PageReserved(page));
                free_pages((unsigned long)page_address(page), get_order(size));
                return;
        }

        vmem_altmap_free(altmap, size >> PAGE_SHIFT);
}

/* Free a single page table page; an altmap is never involved here. */
static void free_hotplug_pgtable_page(struct page *page)
{
        free_hotplug_page_range(page, PAGE_SIZE, NULL);
}

/*
 * Check that the table covering [start, end) lies inside [floor, ceiling).
 *
 * start is rounded down with mask and must not fall below floor. A
 * non-zero ceiling is rounded down with mask too and must stay non-zero.
 * Finally end must not exceed the (rounded) ceiling; the comparison is
 * done on "value - 1" so that a ceiling of 0 acts as "no upper limit".
 *
 * Returns true when all checks pass.
 */
static bool pgtable_range_aligned(unsigned long start, unsigned long end,
                                  unsigned long floor, unsigned long ceiling,
                                  unsigned long mask)
{
        const unsigned long table_start = start & mask;

        if (table_start < floor)
                return false;

        if (ceiling != 0) {
                ceiling &= mask;
                if (ceiling == 0)
                        return false;
        }

        return end - 1 <= ceiling - 1;
}

/*
 * Clear every populated PTE of init_mm in [addr, end) below *pmdp. Each
 * cleared entry is followed by a TLB flush of its page and, when
 * free_mapped is set, the page it mapped is released (to altmap if one
 * is given).
 */
static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
                                    unsigned long end, bool free_mapped,
                                    struct vmem_altmap *altmap)
{
        pte_t *ptep, pte;

        do {
                ptep = pte_offset_kernel(pmdp, addr);
                pte = __ptep_get(ptep);
                /* 'continue' jumps to the loop condition, which advances addr. */
                if (pte_none(pte))
                        continue;

                WARN_ON(!pte_present(pte));
                __pte_clear(&init_mm, addr, ptep);
                flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
                if (free_mapped)
                        free_hotplug_page_range(pte_page(pte),
                                                PAGE_SIZE, altmap);
        } while (addr += PAGE_SIZE, addr < end);
}

/*
 * Unmap [addr, end) at the PMD level below *pudp. Section (block) entries
 * are cleared, flushed and optionally freed as PMD_SIZE ranges; table
 * entries are descended into with unmap_hotplug_pte_range(). Empty
 * entries are skipped.
 */
static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
                                    unsigned long end, bool free_mapped,
                                    struct vmem_altmap *altmap)
{
        unsigned long next;
        pmd_t *pmdp, pmd;

        do {
                next = pmd_addr_end(addr, end);
                pmdp = pmd_offset(pudp, addr);
                pmd = READ_ONCE(*pmdp);
                /* 'continue' jumps to the loop condition, which sets addr = next. */
                if (pmd_none(pmd))
                        continue;

                WARN_ON(!pmd_present(pmd));
                if (pmd_sect(pmd)) {
                        pmd_clear(pmdp);

                        /*
                         * One TLBI should be sufficient here as the PMD_SIZE
                         * range is mapped with a single block entry.
                         */
                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
                        if (free_mapped)
                                free_hotplug_page_range(pmd_page(pmd),
                                                        PMD_SIZE, altmap);
                        continue;
                }
                WARN_ON(!pmd_table(pmd));
                unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
        } while (addr = next, addr < end);
}

/*
 * Unmap [addr, end) at the PUD level below *p4dp. Section (block) entries
 * are cleared, flushed and optionally freed as PUD_SIZE ranges; table
 * entries are descended into with unmap_hotplug_pmd_range(). Empty
 * entries are skipped.
 */
static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
                                    unsigned long end, bool free_mapped,
                                    struct vmem_altmap *altmap)
{
        unsigned long next;
        pud_t *pudp, pud;

        do {
                next = pud_addr_end(addr, end);
                pudp = pud_offset(p4dp, addr);
                pud = READ_ONCE(*pudp);
                /* 'continue' jumps to the loop condition, which sets addr = next. */
                if (pud_none(pud))
                        continue;

                WARN_ON(!pud_present(pud));
                if (pud_sect(pud)) {
                        pud_clear(pudp);

                        /*
                         * One TLBI should be sufficient here as the PUD_SIZE
                         * range is mapped with a single block entry.
                         */
                        flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
                        if (free_mapped)
                                free_hotplug_page_range(pud_page(pud),
                                                        PUD_SIZE, altmap);
                        continue;
                }
                WARN_ON(!pud_table(pud));
                unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
        } while (addr = next, addr < end);
}

/*
 * Unmap [addr, end) at the P4D level below *pgdp by descending into every
 * populated entry with unmap_hotplug_pud_range(). Empty entries are
 * skipped.
 */
static void unmap_hotplug_p4d_range(pgd_t *pgdp, unsigned long addr,
                                    unsigned long end, bool free_mapped,
                                    struct vmem_altmap *altmap)
{
        unsigned long next;
        p4d_t *p4dp, p4d;

        do {
                next = p4d_addr_end(addr, end);
                p4dp = p4d_offset(pgdp, addr);
                p4d = READ_ONCE(*p4dp);

                if (!p4d_none(p4d)) {
                        WARN_ON(!p4d_present(p4d));
                        unmap_hotplug_pud_range(p4dp, addr, next,
                                                free_mapped, altmap);
                }

                addr = next;
        } while (addr < end);
}

/*
 * Unmap the kernel virtual range [addr, end), walking the kernel page
 * tables from the PGD level down. When free_mapped is set, the memory
 * that was mapped is released as well (to altmap if one is given).
 */
static void unmap_hotplug_range(unsigned long addr, unsigned long end,
                                bool free_mapped, struct vmem_altmap *altmap)
{
        unsigned long next;
        pgd_t *pgdp, pgd;

        /*
         * An altmap only ever backs vmemmap mappings. It has no meaning
         * when the backing memory is not being freed, so warn about
         * that combination.
         */
        WARN_ON(!free_mapped && altmap);

        do {
                next = pgd_addr_end(addr, end);
                pgdp = pgd_offset_k(addr);
                pgd = READ_ONCE(*pgdp);

                if (!pgd_none(pgd)) {
                        WARN_ON(!pgd_present(pgd));
                        unmap_hotplug_p4d_range(pgdp, addr, next,
                                                free_mapped, altmap);
                }

                addr = next;
        } while (addr < end);
}

/*
 * Free the PTE table referenced by *pmdp if it is completely empty.
 *
 * All PTEs in [addr, end) are expected to be clear already (a warning is
 * raised otherwise). The table is only released when the range passes the
 * floor/ceiling check against PMD_MASK and every one of its PTRS_PER_PTE
 * entries is empty; the PMD entry is then cleared and the TLB flushed
 * before the page is freed.
 */
static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr,
                                 unsigned long end, unsigned long floor,
                                 unsigned long ceiling)
{
        pte_t *ptep, pte;
        unsigned long i, start = addr;

        do {
                ptep = pte_offset_kernel(pmdp, addr);
                pte = __ptep_get(ptep);

                /*
                 * This is just a sanity check here which verifies that
                 * pte clearing has been done by earlier unmap loops.
                 */
                WARN_ON(!pte_none(pte));
        } while (addr += PAGE_SIZE, addr < end);

        if (!pgtable_range_aligned(start, end, floor, ceiling, PMD_MASK))
                return;

        /*
         * Check whether we can free the pte page if the rest of the
         * entries are empty. Overlap with other regions have been
         * handled by the floor/ceiling check.
         */
        ptep = pte_offset_kernel(pmdp, 0UL);
        for (i = 0; i < PTRS_PER_PTE; i++) {
                if (!pte_none(__ptep_get(&ptep[i])))
                        return;
        }

        /* Unhook the table and flush before handing the page back. */
        pmd_clear(pmdp);
        __flush_tlb_kernel_pgtable(start);
        free_hotplug_pgtable_page(virt_to_page(ptep));
}

/*
 * Free empty PTE tables below the PMD entries covering [addr, end), then
 * free the PMD table referenced by *pudp itself if it ended up empty.
 *
 * Nothing is freed at this level when CONFIG_PGTABLE_LEVELS <= 2, or when
 * the range fails the floor/ceiling check against PUD_MASK.
 */
static void free_empty_pmd_table(pud_t *pudp, unsigned long addr,
                                 unsigned long end, unsigned long floor,
                                 unsigned long ceiling)
{
        pmd_t *pmdp, pmd;
        unsigned long i, next, start = addr;

        do {
                next = pmd_addr_end(addr, end);
                pmdp = pmd_offset(pudp, addr);
                pmd = READ_ONCE(*pmdp);
                if (pmd_none(pmd))
                        continue;

                /* Only table entries are expected to remain at this point. */
                WARN_ON(!pmd_present(pmd) || !pmd_table(pmd) || pmd_sect(pmd));
                free_empty_pte_table(pmdp, addr, next, floor, ceiling);
        } while (addr = next, addr < end);

        if (CONFIG_PGTABLE_LEVELS <= 2)
                return;

        if (!pgtable_range_aligned(start, end, floor, ceiling, PUD_MASK))
                return;

        /*
         * Check whether we can free the pmd page if the rest of the
         * entries are empty. Overlap with other regions have been
         * handled by the floor/ceiling check.
         */
        pmdp = pmd_offset(pudp, 0UL);
        for (i = 0; i < PTRS_PER_PMD; i++) {
                if (!pmd_none(READ_ONCE(pmdp[i])))
                        return;
        }

        /* Unhook the table and flush before handing the page back. */
        pud_clear(pudp);
        __flush_tlb_kernel_pgtable(start);
        free_hotplug_pgtable_page(virt_to_page(pmdp));
}

/*
 * Free empty PMD tables below the PUD entries covering [addr, end), then
 * free the PUD table referenced by *p4dp itself if it ended up empty.
 *
 * Nothing is freed at this level when pgtable_l4_enabled() is false, or
 * when the range fails the floor/ceiling check against P4D_MASK.
 */
static void free_empty_pud_table(p4d_t *p4dp, unsigned long addr,
                                 unsigned long end, unsigned long floor,
                                 unsigned long ceiling)
{
        pud_t *pudp, pud;
        unsigned long i, next, start = addr;

        do {
                next = pud_addr_end(addr, end);
                pudp = pud_offset(p4dp, addr);
                pud = READ_ONCE(*pudp);
                if (pud_none(pud))
                        continue;

                /* Only table entries are expected to remain at this point. */
                WARN_ON(!pud_present(pud) || !pud_table(pud) || pud_sect(pud));
                free_empty_pmd_table(pudp, addr, next, floor, ceiling);
        } while (addr = next, addr < end);

        if (!pgtable_l4_enabled())
                return;

        if (!pgtable_range_aligned(start, end, floor, ceiling, P4D_MASK))
                return;

        /*
         * Check whether we can free the pud page if the rest of the
         * entries are empty. Overlap with other regions have been
         * handled by the floor/ceiling check.
         */
        pudp = pud_offset(p4dp, 0UL);
        for (i = 0; i < PTRS_PER_PUD; i++) {
                if (!pud_none(READ_ONCE(pudp[i])))
                        return;
        }

        /* Unhook the table and flush before handing the page back. */
        p4d_clear(p4dp);
        __flush_tlb_kernel_pgtable(start);
        free_hotplug_pgtable_page(virt_to_page(pudp));
}

/*
 * Free empty PUD tables below the P4D entries covering [addr, end), then
 * free the P4D table referenced by *pgdp itself if it ended up empty.
 *
 * Nothing is freed at this level when pgtable_l5_enabled() is false, or
 * when the range fails the floor/ceiling check against PGDIR_MASK.
 */
static void free_empty_p4d_table(pgd_t *pgdp, unsigned long addr,
                                 unsigned long end, unsigned long floor,
                                 unsigned long ceiling)
{
        p4d_t *p4dp, p4d;
        unsigned long i, next, start = addr;

        do {
                next = p4d_addr_end(addr, end);
                p4dp = p4d_offset(pgdp, addr);
                p4d = READ_ONCE(*p4dp);
                if (p4d_none(p4d))
                        continue;

                WARN_ON(!p4d_present(p4d));
                free_empty_pud_table(p4dp, addr, next, floor, ceiling);
        } while (addr = next, addr < end);

        if (!pgtable_l5_enabled())
                return;

        if (!pgtable_range_aligned(start, end, floor, ceiling, PGDIR_MASK))
                return;

        /*
         * Check whether we can free the p4d page if the rest of the
         * entries are empty. Overlap with other regions have been
         * handled by the floor/ceiling check.
         */
        p4dp = p4d_offset(pgdp, 0UL);
        for (i = 0; i < PTRS_PER_P4D; i++) {
                if (!p4d_none(READ_ONCE(p4dp[i])))
                        return;
        }

        /* Unhook the table and flush before handing the page back. */
        pgd_clear(pgdp);
        __flush_tlb_kernel_pgtable(start);
        free_hotplug_pgtable_page(virt_to_page(p4dp));
}

/*
 * Walk the kernel page tables over [addr, end) and free every table that
 * has become empty. floor and ceiling are passed down unchanged and bound
 * the region within which tables may be freed.
 */
static void free_empty_tables(unsigned long addr, unsigned long end,
                              unsigned long floor, unsigned long ceiling)
{
        unsigned long next;
        pgd_t *pgdp, pgd;

        do {
                next = pgd_addr_end(addr, end);
                pgdp = pgd_offset_k(addr);
                pgd = READ_ONCE(*pgdp);

                if (!pgd_none(pgd)) {
                        WARN_ON(!pgd_present(pgd));
                        free_empty_p4d_table(pgdp, addr, next, floor, ceiling);
                }

                addr = next;
        } while (addr < end);
}
#endif

/*
 * Install a section mapping for the memory at p in *pmdp, using
 * PROT_SECT_NORMAL. The node, addr and next arguments are unused here.
 */
void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
                               unsigned long addr, unsigned long next)
{
        pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL));
}

/*
 * Run vmemmap_verify() on the entry and report whether *pmdp currently
 * holds a section mapping (non-zero) or not (zero).
 */
int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
                                unsigned long addr, unsigned long next)
{
        vmemmap_verify((pte_t *)pmdp, node, addr, next);

        return pmd_sect(READ_ONCE(*pmdp));
}

/*
 * Populate the vmemmap for [start, end). Huge pages are used only with
 * 4K pages and when the range spans at least a full section's worth of
 * struct page; base pages are used otherwise. Returns the result of the
 * chosen populate helper.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap)
{
        WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));
        /* the range is expected to stay within a single section */
        WARN_ON_ONCE(end - start > PAGES_PER_SECTION * sizeof(struct page));

        if (IS_ENABLED(CONFIG_ARM64_4K_PAGES) &&
            (end - start >= PAGES_PER_SECTION * sizeof(struct page)))
                return vmemmap_populate_hugepages(start, end, node, altmap);

        return vmemmap_populate_basepages(start, end, node, altmap);
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Tear down the vmemmap for [start, end): unmap it and free the backing
 * memory (to altmap if given), then free page tables that became empty
 * within [VMEMMAP_START, VMEMMAP_END).
 */
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap)
{
        WARN_ON((start < VMEMMAP_START) || (end > VMEMMAP_END));

        unmap_hotplug_range(start, end, true, altmap);
        free_empty_tables(start, end, VMEMMAP_START, VMEMMAP_END);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/*
 * Try to install a PUD-level section mapping of phys with prot in *pudp.
 * Returns 1 if the entry was written, or 0 if replacing the current entry
 * with the new one is not considered a safe attribute change.
 */
int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot)
{
        pud_t new_pud = pfn_pud(__phys_to_pfn(phys), mk_pud_sect_prot(prot));

        /* For now only permission changes are accepted. */
        if (pgattr_change_is_safe(READ_ONCE(pud_val(*pudp)),
                                  pud_val(new_pud))) {
                VM_BUG_ON(phys & ~PUD_MASK);
                set_pud(pudp, new_pud);
                return 1;
        }

        return 0;
}

/*
 * Try to install a PMD-level section mapping of phys with prot in *pmdp.
 * Returns 1 if the entry was written, or 0 if replacing the current entry
 * with the new one is not considered a safe attribute change.
 */
int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot)
{
        pmd_t new_pmd = pfn_pmd(__phys_to_pfn(phys), mk_pmd_sect_prot(prot));

        /* For now only permission changes are accepted. */
        if (pgattr_change_is_safe(READ_ONCE(pmd_val(*pmdp)),
                                  pmd_val(new_pmd))) {
                VM_BUG_ON(phys & ~PMD_MASK);
                set_pmd(pmdp, new_pmd);
                return 1;
        }

        return 0;
}

#ifndef __PAGETABLE_P4D_FOLDED
/* Intentionally empty: this implementation does nothing for P4D entries. */
void p4d_clear_huge(p4d_t *p4dp)
{
}
#endif

/*
 * Clear *pudp if it holds a section mapping. Returns 1 when the entry was
 * cleared and 0 when it was not a section mapping.
 */
int pud_clear_huge(pud_t *pudp)
{
        if (pud_sect(READ_ONCE(*pudp))) {
                pud_clear(pudp);
                return 1;
        }

        return 0;
}

/*
 * Clear *pmdp if it holds a section mapping. Returns 1 when the entry was
 * cleared and 0 when it was not a section mapping.
 */
int pmd_clear_huge(pmd_t *pmdp)
{
        if (pmd_sect(READ_ONCE(*pmdp))) {
                pmd_clear(pmdp);
                return 1;
        }

        return 0;
}

/*
 * Detach and free the PTE table referenced by *pmdp. If the entry is not
 * a table, nothing is freed and VM_WARN_ON() fires. Always returns 1.
 */
int pmd_free_pte_page(pmd_t *pmdp, unsigned long addr)
{
        pmd_t pmd = READ_ONCE(*pmdp);
        pte_t *table;

        if (pmd_table(pmd)) {
                /* Look up the table before the entry pointing at it is cleared. */
                table = pte_offset_kernel(pmdp, addr);
                pmd_clear(pmdp);
                __flush_tlb_kernel_pgtable(addr);
                pte_free_kernel(NULL, table);
        } else {
                VM_WARN_ON(1);
        }

        return 1;
}

/*
 * Detach and free the PMD table referenced by *pudp, first freeing the
 * PTE table behind each of its entries across the PUD_SIZE range that
 * starts at addr. If the entry is not a table, nothing is freed and
 * VM_WARN_ON() fires. Always returns 1.
 */
int pud_free_pmd_page(pud_t *pudp, unsigned long addr)
{
        pmd_t *table;
        pmd_t *pmdp;
        pud_t pud = READ_ONCE(*pudp);
        unsigned long next = addr;
        unsigned long end = addr + PUD_SIZE;

        if (!pud_table(pud)) {
                VM_WARN_ON(1);
                return 1;
        }

        /* Look up the table before the entry pointing at it is cleared. */
        table = pmd_offset(pudp, addr);
        pmdp = table;
        do {
                pmd_free_pte_page(pmdp++, next);
                next += PMD_SIZE;
        } while (next != end);

        pud_clear(pudp);
        __flush_tlb_kernel_pgtable(addr);
        pmd_free(NULL, table);
        return 1;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Remove the linear mapping of [start, start + size) from the kernel page
 * tables without freeing the memory it mapped, then free page tables that
 * became empty within [PAGE_OFFSET, PAGE_END). Warns when pgdir is not
 * init_mm.pgd or when the range leaves the linear region.
 */
static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size)
{
        const unsigned long range_end = start + size;

        WARN_ON(pgdir != init_mm.pgd);
        WARN_ON((start < PAGE_OFFSET) || (range_end > PAGE_END));

        unmap_hotplug_range(start, range_end, false, NULL);
        free_empty_tables(start, range_end, PAGE_OFFSET, PAGE_END);
}

struct range arch_get_mappable_range(void)
{
        struct range mhp_range;
        u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual));
        u64 end_linear_pa = __pa(PAGE_END - 1);

        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
                /*
                 * Check for a wrap, it is possible because of randomized linear
                 * mapping the start physical address is actually bigger than
                 * the end physical address. In this case set start to zero
                 * because [0, end_linear_pa] range must still be able to cover
                 * all addressable physical addresses.
                 */
                if (start_linear_pa > end_linear_pa)
                        start_linear_pa = 0;
        }

        WARN_ON(start_linear_pa > end_linear_pa);

        /*
         * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)]
         * accommodating both its ends but excluding PAGE_END. Max physical
         * range which can be mapped inside this linear mapping range, must
         * also be derived from its end points.
         */
        mhp_range.start = start_linear_pa;
        mhp_range.end =  end_linear_pa;

        return mhp_range;
}

/*
 * arch_add_memory - map and register a hot-added physical memory range.
 * @nid:    node id passed through to __add_pages()
 * @start:  physical start address
 * @size:   length in bytes
 * @params: hotplug parameters; params->pgprot is used for the mapping
 *
 * Creates the linear mapping for the range in swapper_pg_dir, clears the
 * memblock nomap flag for it and hands the pages to __add_pages(). If
 * that fails the mapping is removed again; on success max_pfn and
 * max_low_pfn are updated.
 *
 * Returns the result of __add_pages().
 */
int arch_add_memory(int nid, u64 start, u64 size,
                    struct mhp_params *params)
{
        int map_flags = NO_EXEC_MAPPINGS;
        int err;

        VM_BUG_ON(!mhp_range_allowed(start, size, true));

        if (can_set_direct_map())
                map_flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;

        __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
                             size, params->pgprot, __pgd_pgtable_alloc,
                             map_flags);

        memblock_clear_nomap(start, size);

        err = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
                          params);
        if (err) {
                /* Undo the mapping created above. */
                __remove_pgd_mapping(swapper_pg_dir,
                                     __phys_to_virt(start), size);
                return err;
        }

        /* The new range may lie below the current max_pfn, hence max(). */
        max_pfn = max(max_pfn, PFN_UP(start + size));
        max_low_pfn = max_pfn;

        return 0;
}

/*
 * arch_remove_memory - unregister and unmap a physical memory range.
 * @start:  physical start address
 * @size:   length in bytes
 * @altmap: passed through to __remove_pages()
 *
 * The pages are removed first, then the linear mapping of the range is
 * torn down from swapper_pg_dir.
 */
void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
{
        unsigned long first_pfn = start >> PAGE_SHIFT;
        unsigned long pfn_count = size >> PAGE_SHIFT;
        unsigned long vaddr = __phys_to_virt(start);

        __remove_pages(first_pfn, pfn_count, altmap);
        __remove_pgd_mapping(swapper_pg_dir, vaddr, size);
}

/*
 * Memory hotplug notifier that keeps boot memory from being removed by
 * rejecting the offlining step in __offline_pages(). Because boot memory
 * is initially always online, blocking offlining also blocks removal.
 * If boot memory removal is ever supported, this notifier should go away
 * and free_hotplug_page_range() should deal with reserved pages that
 * were allocated during boot.
 *
 * Only MEM_GOING_OFFLINE and MEM_OFFLINE are examined; every other
 * action returns NOTIFY_OK. The pfn range is walked one section at a
 * time and the first early section found decides the result:
 * NOTIFY_BAD for MEM_GOING_OFFLINE, NOTIFY_DONE for MEM_OFFLINE.
 */
static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
                                           unsigned long action, void *data)
{
        struct memory_notify *notify = data;
        unsigned long last_pfn = notify->start_pfn + notify->nr_pages;
        unsigned long cur_pfn;

        if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
                return NOTIFY_OK;

        for (cur_pfn = notify->start_pfn; cur_pfn < last_pfn;
             cur_pfn += PAGES_PER_SECTION) {
                unsigned long start = PFN_PHYS(cur_pfn);
                unsigned long end = start + (1UL << PA_SECTION_SHIFT);

                /* Sections that are not early sections are not our concern. */
                if (!early_section(__pfn_to_section(cur_pfn)))
                        continue;

                if (action == MEM_GOING_OFFLINE) {
                        /*
                         * Removing boot memory is unsupported: veto the
                         * offline request and report the attempt.
                         */
                        pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end);
                        return NOTIFY_BAD;
                }

                /*
                 * action == MEM_OFFLINE: this notifier should already have
                 * vetoed the request, so reaching here hints at a change in
                 * the removal procedure that needs debugging. The core does
                 * not act on a MEM_OFFLINE return code, so just report the
                 * condition and return as if ignored.
                 */
                pr_err("Boot memory [%lx %lx] offlined\n", start, end);
                return NOTIFY_DONE;
        }

        return NOTIFY_OK;
}

/*
 * Notifier block wrapping prevent_bootmem_remove_notifier(); registered
 * with register_memory_notifier() from prevent_bootmem_remove_init().
 */
static struct notifier_block prevent_bootmem_remove_nb = {
        .notifier_call = prevent_bootmem_remove_notifier,
};

/*
 * Sanity-check that every boot memory section is an early section and is
 * online. The notifier above can only veto offlining of sections that
 * are online to begin with, so this validates the assumption it relies
 * on. Nothing is changed; violations are only reported.
 */
static void validate_bootmem_online(void)
{
        phys_addr_t range_start, range_end, pa;
        struct mem_section *section;
        u64 idx;

        /*
         * Walking all of memblock can be costly on large systems, so the
         * check is only performed when DEBUG_VM is enabled.
         */
        if (!IS_ENABLED(CONFIG_DEBUG_VM))
                return;

        for_each_mem_range(idx, &range_start, &range_end) {
                pa = range_start;
                while (pa < range_end) {
                        section = __pfn_to_section(PHYS_PFN(pa));

                        /* Every memory range present now should be an early section. */
                        WARN_ON(!early_section(section));

                        /*
                         * An early section that is already offline would be
                         * missed by the notifier and could be removed
                         * entirely, so call it out.
                         */
                        if (!online_section(section))
                                pr_err("Boot memory [%llx %llx] is offline, can be removed\n",
                                        pa, pa + (1UL << PA_SECTION_SHIFT));

                        pa += (1UL << PA_SECTION_SHIFT);
                }
        }
}

/*
 * Early initcall: validate boot memory sections and register the
 * notifier that vetoes boot memory offlining. Does nothing and returns 0
 * when CONFIG_MEMORY_HOTREMOVE is disabled; otherwise returns the result
 * of register_memory_notifier(), logging an error if it is non-zero.
 */
static int __init prevent_bootmem_remove_init(void)
{
        int err;

        if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
                return 0;

        validate_bootmem_online();

        err = register_memory_notifier(&prevent_bootmem_remove_nb);
        if (err)
                pr_err("%s: Notifier registration failed %d\n", __func__, err);

        return err;
}
early_initcall(prevent_bootmem_remove_init);
#endif

/*
 * ptep_modify_prot_start - remove a PTE ahead of a protection change.
 * @vma:  VMA the mapping belongs to
 * @addr: user virtual address of the mapping
 * @ptep: the PTE to clear
 *
 * Returns the PTE value that was in place before it was cleared.
 */
pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
        /*
         * On CPUs affected by erratum #2645198, break-before-make (BBM) is
         * required for user space mappings whose permission changes from
         * executable to non-executable, so clear and flush in that case.
         */
        if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198) &&
            pte_user_exec(ptep_get(ptep)))
                return ptep_clear_flush(vma, addr, ptep);

        return ptep_get_and_clear(vma->vm_mm, addr, ptep);
}

/*
 * ptep_modify_prot_commit - install the new PTE after a protection change.
 * @vma:     VMA the mapping belongs to; only vma->vm_mm is used
 * @addr:    virtual address of the mapping
 * @ptep:    the PTE slot to write
 * @old_pte: previous PTE value; not used by this implementation
 * @pte:     new PTE value, written with set_pte_at()
 */
void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
                             pte_t old_pte, pte_t pte)
{
        set_pte_at(vma->vm_mm, addr, ptep, pte);
}

/*
 * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD,
 * avoiding the possibility of conflicting TLB entries being allocated.
 */
/*
 * @pgdp: new page directory, given by its kernel virtual address
 * @cnp:  when true, TTBR_CNP_BIT is set in the value handed to the
 *        replacement routine
 */
void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp)
{
        /* Signature of the replacement routine: takes the new TTBR1 value. */
        typedef void (ttbr_replace_func)(phys_addr_t);
        extern ttbr_replace_func idmap_cpu_replace_ttbr1;
        ttbr_replace_func *replace_phys;
        unsigned long daif;

        /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */
        phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp));

        if (cnp)
                ttbr1 |= TTBR_CNP_BIT;

        /*
         * The routine is called through its physical address; presumably it
         * executes from the identity map installed just below.
         */
        replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1);

        cpu_install_idmap();

        /*
         * We really don't want to take *any* exceptions while TTBR1 is
         * in the process of being replaced so mask everything.
         */
        daif = local_daif_save();
        replace_phys(ttbr1);
        local_daif_restore(daif);

        /* Drop the identity map again now that the switch is done. */
        cpu_uninstall_idmap();
}

#ifdef CONFIG_ARCH_HAS_PKEYS
/*
 * arch_set_user_pkey_access - program POR_EL0 permissions for one pkey.
 * @tsk:      not used by this implementation
 * @pkey:     protection key index; must be below arch_max_pkey()
 * @init_val: PKEY_DISABLE_* bits selecting which permissions to drop
 *
 * Starts from POE_RWX, removes the permissions selected by @init_val and
 * writes the result into the @pkey field of POR_EL0, leaving the other
 * fields as they were.
 *
 * Returns 0 on success, -ENOSPC when POE is not supported and -EINVAL
 * (with a one-time warning) when @pkey is out of range.
 */
int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val)
{
        u64 perms = POE_RWX;
        u64 field;
        u64 por;

        if (!system_supports_poe())
                return -ENOSPC;

        /*
         * Only in-kernel users with a valid pkey are expected to get here,
         * so complain about anything else.
         */
        if (WARN_ON_ONCE(pkey >= arch_max_pkey()))
                return -EINVAL;

        /* Strip every permission the caller asked to disable. */
        if (init_val & PKEY_DISABLE_WRITE)
                perms &= ~POE_W;
        if (init_val & PKEY_DISABLE_ACCESS)
                perms &= ~POE_RW;
        if (init_val & PKEY_DISABLE_READ)
                perms &= ~POE_R;
        if (init_val & PKEY_DISABLE_EXECUTE)
                perms &= ~POE_X;

        /* Move the permission bits into this pkey's position in POR. */
        field = POR_ELx_PERM_PREP(pkey, perms);

        /* Read the current POR and blank out this pkey's old bits. */
        por = read_sysreg_s(SYS_POR_EL0);
        por &= ~(POE_MASK << POR_ELx_PERM_SHIFT(pkey));

        /* Merge and write back. */
        write_sysreg_s(por | field, SYS_POR_EL0);

        return 0;
}
#endif





























  681 












  251 





























  585 













 1259 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for atomic bit
 * operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H

#include <linux/instrumented.h>

/**
 * set_bit - Atomically set a bit in memory
 * @nr: index of the bit to set
 * @addr: base address the bit index is counted from
 *
 * Relaxed atomic operation: no memory barriers are implied.
 *
 * @nr may be almost arbitrarily large; the operation is not limited to
 * the first word at @addr. The word containing the bit is reported to
 * the instrumentation before arch_set_bit() is invoked.
 */
static __always_inline void set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(*word));
        arch_set_bit(nr, addr);
}

/**
 * clear_bit - Atomically clear a bit in memory
 * @nr: index of the bit to clear
 * @addr: base address the bit index is counted from
 *
 * Relaxed atomic operation: no memory barriers are implied. The word
 * containing the bit is reported to the instrumentation before
 * arch_clear_bit() is invoked.
 */
static __always_inline void clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(*word));
        arch_clear_bit(nr, addr);
}

/**
 * change_bit - Atomically toggle a bit in memory
 * @nr: index of the bit to toggle
 * @addr: base address the bit index is counted from
 *
 * Relaxed atomic operation: no memory barriers are implied.
 *
 * @nr may be almost arbitrarily large; the operation is not limited to
 * the first word at @addr. The word containing the bit is reported to
 * the instrumentation before arch_change_bit() is invoked.
 */
static __always_inline void change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        instrument_atomic_write(word, sizeof(*word));
        arch_change_bit(nr, addr);
}

/**
 * test_and_set_bit - Atomically set a bit and return its previous value
 * @nr: index of the bit to set
 * @addr: base address the bit index is counted from
 *
 * Fully-ordered atomic operation (a full memory barrier is implied),
 * which is why kcsan_mb() is issued before the instrumented access.
 */
static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        kcsan_mb();
        instrument_atomic_read_write(word, sizeof(*word));

        return arch_test_and_set_bit(nr, addr);
}

/**
 * test_and_clear_bit - Atomically clear a bit and return its previous value
 * @nr: index of the bit to clear
 * @addr: base address the bit index is counted from
 *
 * Fully-ordered atomic operation (a full memory barrier is implied),
 * which is why kcsan_mb() is issued before the instrumented access.
 */
static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        kcsan_mb();
        instrument_atomic_read_write(word, sizeof(*word));

        return arch_test_and_clear_bit(nr, addr);
}

/**
 * test_and_change_bit - Atomically toggle a bit and return its previous value
 * @nr: index of the bit to toggle
 * @addr: base address the bit index is counted from
 *
 * Fully-ordered atomic operation (a full memory barrier is implied),
 * which is why kcsan_mb() is issued before the instrumented access.
 */
static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr)
{
        volatile unsigned long *word = addr + BIT_WORD(nr);

        kcsan_mb();
        instrument_atomic_read_write(word, sizeof(*word));

        return arch_test_and_change_bit(nr, addr);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_ATOMIC_H */










































































































































    3 







    3 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/etherdevice.h>
#include "ipvlan.h"
#include <linux/if_vlan.h>
#include <linux/if_tap.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>
#include <linux/uio.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>

/* Offload feature set assigned to tap.tap_features in ipvtap_newlink(). */
#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
                      NETIF_F_TSO6)

/*
 * Both are handed to tap_create_cdev() in ipvtap_init() and to
 * tap_destroy_cdev() on teardown; MAJOR(ipvtap_major) is combined with a
 * per-device minor to build each tap device number.
 */
static dev_t ipvtap_major;
static struct cdev ipvtap_cdev;

/*
 * Namespace callback for ipvtap_class: returns the network namespace of
 * the net_device that is the parent of class device @d.
 */
static const void *ipvtap_net_namespace(const struct device *d)
{
        const struct net_device *parent_netdev = to_net_dev(d->parent);

        return dev_net(parent_netdev);
}

/*
 * Device class for ipvtap character device nodes. The namespace callback
 * resolves each node to the net namespace of its parent net_device.
 */
static struct class ipvtap_class = {
         .name = "ipvtap",
         .ns_type = &net_ns_type_operations,
         .namespace = ipvtap_net_namespace,
};

/*
 * Private data of an ipvtap net_device (see .priv_size in
 * ipvtap_link_ops). The tap callbacks recover this structure from the
 * embedded @tap member via container_of().
 */
struct ipvtap_dev {
        struct ipvl_dev vlan;   /* ipvlan device state */
        struct tap_dev          tap;    /* tap state */
};

/* tap callback: bump the per-CPU tx_drps counter of the owning ipvlan device. */
static void ipvtap_count_tx_dropped(struct tap_dev *tap)
{
        struct ipvtap_dev *ipvtap = container_of(tap, struct ipvtap_dev, tap);

        this_cpu_inc(ipvtap->vlan.pcpu_stats->tx_drps);
}

/*
 * tap callback: record a dropped receive by calling ipvlan_count_rx() on
 * the owning ipvlan device with all three trailing arguments set to 0.
 */
static void ipvtap_count_rx_dropped(struct tap_dev *tap)
{
        struct ipvtap_dev *ipvtap = container_of(tap, struct ipvtap_dev, tap);

        ipvlan_count_rx(&ipvtap->vlan, 0, 0, 0);
}

/*
 * tap callback: store @features as the ipvlan device's sfeatures and ask
 * the networking core to re-evaluate the device's feature set.
 */
static void ipvtap_update_features(struct tap_dev *tap,
                                   netdev_features_t features)
{
        struct ipvtap_dev *ipvtap = container_of(tap, struct ipvtap_dev, tap);
        struct ipvl_dev *ipvlan = &ipvtap->vlan;

        ipvlan->sfeatures = features;
        netdev_update_features(ipvlan->dev);
}

static int ipvtap_newlink(struct net_device *dev,
                          struct rtnl_newlink_params *params,
                          struct netlink_ext_ack *extack)
{
        struct ipvtap_dev *vlantap = netdev_priv(dev);
        int err;

        INIT_LIST_HEAD(&vlantap->tap.queue_list);

        /* Since macvlan supports all offloads by default, make
         * tap support all offloads also.
         */
        vlantap->tap.tap_features = TUN_OFFLOADS;
        vlantap->tap.count_tx_dropped = ipvtap_count_tx_dropped;
        vlantap->tap.update_features =        ipvtap_update_features;
        vlantap->tap.count_rx_dropped = ipvtap_count_rx_dropped;

        err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
        if (err)
                return err;

        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        err = ipvlan_link_new(dev, params, extack);
        if (err) {
                netdev_rx_handler_unregister(dev);
                return err;
        }

        vlantap->tap.dev = vlantap->vlan.dev;

        return err;
}

/*
 * rtnl dellink handler: unregister the rx handler, delete the tap queues
 * and finally delete the underlying ipvlan link.
 */
static void ipvtap_dellink(struct net_device *dev,
                           struct list_head *head)
{
        struct ipvtap_dev *ipvtap = netdev_priv(dev);
        struct tap_dev *tap = &ipvtap->tap;

        netdev_rx_handler_unregister(dev);
        tap_del_queues(tap);
        ipvlan_link_delete(dev, head);
}

/*
 * rtnl setup handler: apply the common ipvlan setup, then clear
 * IFF_NO_QUEUE and set the tx queue length to TUN_READQ_SIZE.
 */
static void ipvtap_setup(struct net_device *dev)
{
        ipvlan_link_setup(dev);

        dev->priv_flags &= ~IFF_NO_QUEUE;
        dev->tx_queue_len = TUN_READQ_SIZE;
}

/*
 * rtnl link operations for the "ipvtap" link kind. Registered from
 * ipvtap_init(); ipvtap_device_event() also compares a device's
 * rtnl_link_ops against this object to recognise ipvtap devices.
 */
static struct rtnl_link_ops ipvtap_link_ops __read_mostly = {
        .kind                = "ipvtap",
        .setup                = ipvtap_setup,
        .newlink        = ipvtap_newlink,
        .dellink        = ipvtap_dellink,
        .priv_size        = sizeof(struct ipvtap_dev),
};

static int ipvtap_device_event(struct notifier_block *unused,
                               unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct ipvtap_dev *vlantap;
        struct device *classdev;
        dev_t devt;
        int err;
        char tap_name[IFNAMSIZ];

        if (dev->rtnl_link_ops != &ipvtap_link_ops)
                return NOTIFY_DONE;

        snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
        vlantap = netdev_priv(dev);

        switch (event) {
        case NETDEV_REGISTER:
                /* Create the device node here after the network device has
                 * been registered but before register_netdevice has
                 * finished running.
                 */
                err = tap_get_minor(ipvtap_major, &vlantap->tap);
                if (err)
                        return notifier_from_errno(err);

                devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor);
                classdev = device_create(&ipvtap_class, &dev->dev, devt,
                                         dev, "%s", tap_name);
                if (IS_ERR(classdev)) {
                        tap_free_minor(ipvtap_major, &vlantap->tap);
                        return notifier_from_errno(PTR_ERR(classdev));
                }
                err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
                                        tap_name);
                if (err)
                        return notifier_from_errno(err);
                break;
        case NETDEV_UNREGISTER:
                /* vlan->minor == 0 if NETDEV_REGISTER above failed */
                if (vlantap->tap.minor == 0)
                        break;
                sysfs_remove_link(&dev->dev.kobj, tap_name);
                devt = MKDEV(MAJOR(ipvtap_major), vlantap->tap.minor);
                device_destroy(&ipvtap_class, devt);
                tap_free_minor(ipvtap_major, &vlantap->tap);
                break;
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                if (tap_queue_resize(&vlantap->tap))
                        return NOTIFY_BAD;
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to ipvtap_device_event(). */
static struct notifier_block ipvtap_notifier_block __read_mostly = {
        .notifier_call        = ipvtap_device_event,
};

/*
 * Module init: create the character device, register the device class,
 * the netdevice notifier and the rtnl link ops, in that order. On any
 * failure, everything set up so far is torn down in reverse order.
 *
 * Returns 0 on success or the error of the step that failed.
 */
static int __init ipvtap_init(void)
{
        int rc;

        rc = tap_create_cdev(&ipvtap_cdev, &ipvtap_major, "ipvtap",
                             THIS_MODULE);
        if (rc)
                return rc;

        rc = class_register(&ipvtap_class);
        if (rc)
                goto err_destroy_cdev;

        rc = register_netdevice_notifier(&ipvtap_notifier_block);
        if (rc)
                goto err_unregister_class;

        rc = ipvlan_link_register(&ipvtap_link_ops);
        if (rc)
                goto err_unregister_notifier;

        return 0;

err_unregister_notifier:
        unregister_netdevice_notifier(&ipvtap_notifier_block);
err_unregister_class:
        class_unregister(&ipvtap_class);
err_destroy_cdev:
        tap_destroy_cdev(ipvtap_major, &ipvtap_cdev);
        return rc;
}
module_init(ipvtap_init);

/*
 * Module exit: tear down everything ipvtap_init() set up, in the reverse
 * order of its registration steps.
 */
static void __exit ipvtap_exit(void)
{
        rtnl_link_unregister(&ipvtap_link_ops);
        unregister_netdevice_notifier(&ipvtap_notifier_block);
        class_unregister(&ipvtap_class);
        tap_destroy_cdev(ipvtap_major, &ipvtap_cdev);
}
module_exit(ipvtap_exit);
MODULE_ALIAS_RTNL_LINK("ipvtap");
MODULE_AUTHOR("Sainath Grandhi <sainath.grandhi@intel.com>");
MODULE_DESCRIPTION("IP-VLAN based tap driver");
MODULE_LICENSE("GPL");


















































































































































































































































































  289 









  265 





  265 





























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * net/dst.h        Protocol independent destination cache definitions.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#ifndef _NET_DST_H
#define _NET_DST_H

#include <net/dst_ops.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/rcupdate.h>
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
#include <linux/rcuref.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>

struct sk_buff;

/*
 * Protocol independent destination cache entry.
 *
 * Member order is deliberate: see the in-line notes on __rcuref and
 * lwtstate, and the BUILD_BUG_ON() in dst_hold() that checks the
 * placement of __rcuref.
 */
struct dst_entry {
        struct net_device       *dev;
        struct  dst_ops                *ops;
        /* Metrics pointer with DST_METRICS_FLAGS in its low bits. */
        unsigned long                _metrics;
        unsigned long           expires;
#ifdef CONFIG_XFRM
        struct xfrm_state        *xfrm;
#else
        void                        *__pad1;
#endif
        int                        (*input)(struct sk_buff *);
        int                        (*output)(struct net *net, struct sock *sk, struct sk_buff *skb);

        unsigned short                flags;
#define DST_NOXFRM                0x0002
#define DST_NOPOLICY                0x0004
#define DST_NOCOUNT                0x0008
#define DST_FAKE_RTABLE                0x0010
#define DST_XFRM_TUNNEL                0x0020
#define DST_XFRM_QUEUE                0x0040
#define DST_METADATA                0x0080

        /* A non-zero value of dst->obsolete forces by-hand validation
         * of the route entry.  Positive values are set by the generic
         * dst layer to indicate that the entry has been forcefully
         * destroyed.
         *
         * Negative values are used by the implementation layer code to
         * force invocation of the dst_ops->check() method.
         */
        short                        obsolete;
#define DST_OBSOLETE_NONE        0
#define DST_OBSOLETE_DEAD        2
#define DST_OBSOLETE_FORCE_CHK        -1
#define DST_OBSOLETE_KILL        -2
        unsigned short                header_len;        /* more space at head required */
        unsigned short                trailer_len;        /* space to reserve at tail */

        /*
         * __rcuref wants to be on a different cache line from
         * input/output/ops or performance tanks badly
         */
#ifdef CONFIG_64BIT
        rcuref_t                __rcuref;        /* 64-bit offset 64 */
#endif
        /* Use counter and timestamp maintained by dst_use_noref(). */
        int                        __use;
        unsigned long                lastuse;
        struct rcu_head                rcu_head;
        short                        error;
        short                        __pad;
        __u32                        tclassid;
#ifndef CONFIG_64BIT
        struct lwtunnel_state   *lwtstate;
        rcuref_t                __rcuref;        /* 32-bit offset 64 */
#endif
        netdevice_tracker        dev_tracker;

        /*
         * Used by rtable and rt6_info. Moves lwtstate into the next cache
         * line on 64bit so that lwtstate does not cause false sharing with
         * __rcuref under contention of __rcuref. This also puts the
         * frequently accessed members of rtable and rt6_info out of the
         * __rcuref cache line.
         */
        struct list_head        rt_uncached;
        struct uncached_list        *rt_uncached_list;
#ifdef CONFIG_64BIT
        struct lwtunnel_state   *lwtstate;
#endif
};

/*
 * Metrics array plus a reference count. The 4-byte alignment keeps the
 * two low bits of a pointer to this structure free, which is where
 * dst_entry::_metrics stores its DST_METRICS_* flag bits.
 */
struct dst_metrics {
        u32                metrics[RTAX_MAX];
        refcount_t        refcnt;
} __aligned(4);                /* Low pointer bits contain DST_METRICS_FLAGS */
/* Constant metrics object; defined outside this header. */
extern const struct dst_metrics dst_default_metrics;

/*
 * Defined outside this header. NOTE(review): by its name this is
 * presumably a generic copy-on-write helper usable as
 * dst_ops->cow_metrics, with @old being the previous _metrics value -
 * confirm against the definition.
 */
u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Flag bits stored in the low bits of dst_entry::_metrics, and helpers
 * that strip them to recover the u32 metrics array pointer.
 * DST_METRICS_FLAGS is the mask covering both flag bits.
 */
#define DST_METRICS_READ_ONLY                0x1UL
#define DST_METRICS_REFCOUNTED                0x2UL
#define DST_METRICS_FLAGS                0x3UL
/* Y is a raw _metrics value. */
#define __DST_METRICS_PTR(Y)        \
        ((u32 *)((Y) & ~DST_METRICS_FLAGS))
/* X is a pointer to a struct dst_entry. */
#define DST_METRICS_PTR(X)        __DST_METRICS_PTR((X)->_metrics)

/* Returns true when the DST_METRICS_READ_ONLY flag is set in dst->_metrics. */
static inline bool dst_metrics_read_only(const struct dst_entry *dst)
{
        unsigned long ro_flag = dst->_metrics & DST_METRICS_READ_ONLY;

        return ro_flag != 0;
}

void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old);

/*
 * Hand the metrics of @dst to __dst_destroy_metrics_generic(), unless
 * they are flagged read-only, in which case nothing is done.
 */
static inline void dst_destroy_metrics_generic(struct dst_entry *dst)
{
        unsigned long metrics = dst->_metrics;

        if (metrics & DST_METRICS_READ_ONLY)
                return;

        __dst_destroy_metrics_generic(dst, metrics);
}

/*
 * Return a pointer through which the metrics of @dst may be written.
 *
 * BUGs if _metrics is zero. When the metrics are flagged read-only the
 * result of dst->ops->cow_metrics() is returned; otherwise the existing
 * array pointer with the flag bits masked off.
 */
static inline u32 *dst_metrics_write_ptr(struct dst_entry *dst)
{
        unsigned long raw = dst->_metrics;

        BUG_ON(!raw);

        return (raw & DST_METRICS_READ_ONLY) ?
                dst->ops->cow_metrics(dst, raw) :
                __DST_METRICS_PTR(raw);
}

/* Set the metrics pointer of @dst to @src_metrics, flagging it read-only
 * when @read_only is true.
 *
 * This may only be invoked before the entry has reached global
 * visibility.
 */
static inline void dst_init_metrics(struct dst_entry *dst,
                                    const u32 *src_metrics,
                                    bool read_only)
{
        unsigned long raw = (unsigned long) src_metrics;

        if (read_only)
                raw |= DST_METRICS_READ_ONLY;

        dst->_metrics = raw;
}

/*
 * Copy all RTAX_MAX metrics from @src into @dest. Nothing is copied when
 * no writable metrics pointer could be obtained for @dest.
 */
static inline void dst_copy_metrics(struct dst_entry *dest, const struct dst_entry *src)
{
        u32 *to = dst_metrics_write_ptr(dest);
        u32 *from;

        if (!to)
                return;

        from = DST_METRICS_PTR(src);
        memcpy(to, from, RTAX_MAX * sizeof(u32));
}

/* Return the metrics array of @dst, i.e. _metrics with the flag bits masked off. */
static inline u32 *dst_metrics_ptr(struct dst_entry *dst)
{
        u32 *metrics = DST_METRICS_PTR(dst);

        return metrics;
}

static inline u32
dst_metric_raw(const struct dst_entry *dst, const int metric)
{
        u32 *p = DST_METRICS_PTR(dst);

        return p[metric-1];
}

/*
 * Read one metric of @dst. Warns once when asked for RTAX_HOPLIMIT,
 * RTAX_ADVMSS or RTAX_MTU, but still returns the raw value.
 * NOTE(review): presumably those three are meant to be read through
 * dedicated accessors such as dst_metric_advmss() / dst_mtu() - confirm.
 */
static inline u32
dst_metric(const struct dst_entry *dst, const int metric)
{
        const bool unexpected = metric == RTAX_HOPLIMIT ||
                                metric == RTAX_ADVMSS ||
                                metric == RTAX_MTU;

        WARN_ON_ONCE(unexpected);

        return dst_metric_raw(dst, metric);
}

/*
 * Return the RTAX_ADVMSS metric of @dst, falling back to
 * dst->ops->default_advmss() when the stored value is zero.
 */
static inline u32
dst_metric_advmss(const struct dst_entry *dst)
{
        u32 stored = dst_metric_raw(dst, RTAX_ADVMSS);

        if (stored)
                return stored;

        return dst->ops->default_advmss(dst);
}

/*
 * Store @val as metric @metric (1-based) of @dst. Silently does nothing
 * when no writable metrics pointer could be obtained.
 */
static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val)
{
        u32 *writable = dst_metrics_write_ptr(dst);

        if (!writable)
                return;

        writable[metric - 1] = val;
}

/* Kernel-internal feature bits that are unallocated in user space. */
#define DST_FEATURE_ECN_CA        (1U << 31)

/* All kernel-internal feature bits (currently only DST_FEATURE_ECN_CA). */
#define DST_FEATURE_MASK        (DST_FEATURE_ECN_CA)
/* Kernel-internal ECN bit combined with the RTAX_FEATURE_ECN bit. */
#define DST_FEATURE_ECN_MASK        (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN)

/* Return the bits of @feature that are set in the RTAX_FEATURES metric. */
static inline u32
dst_feature(const struct dst_entry *dst, u32 feature)
{
        const u32 features = dst_metric(dst, RTAX_FEATURES);

        return features & feature;
}

INDIRECT_CALLABLE_DECLARE(unsigned int ip6_mtu(const struct dst_entry *));
INDIRECT_CALLABLE_DECLARE(unsigned int ipv4_mtu(const struct dst_entry *));

/*
 * Return the MTU reported by the ops->mtu() callback of @dst. The call
 * goes through INDIRECT_CALL_INET() with ip6_mtu/ipv4_mtu as candidates.
 */
static inline u32 dst_mtu(const struct dst_entry *dst)
{
        u32 mtu;

        mtu = INDIRECT_CALL_INET(dst->ops->mtu, ip6_mtu, ipv4_mtu, dst);

        return mtu;
}

/* RTT metrics are stored in milliseconds for user ABI, but used as jiffies */
static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metric)
{
        return msecs_to_jiffies(dst_metric(dst, metric));
}

static inline int
dst_metric_locked(const struct dst_entry *dst, int metric)
{
        return dst_metric(dst, RTAX_LOCK) & (1 << metric);
}

/*
 * Take a reference on @dst. WARNs when rcuref_get() reports that the
 * reference could not be acquired.
 */
static inline void dst_hold(struct dst_entry *dst)
{
        bool acquired;

        /*
         * If your kernel compilation stops here, please check
         * the placement of __rcuref in struct dst_entry: its offset
         * must be a multiple of 64.
         */
        BUILD_BUG_ON(offsetof(struct dst_entry, __rcuref) & 63);

        acquired = rcuref_get(&dst->__rcuref);
        WARN_ON(!acquired);
}

/*
 * Record a use of @dst at @time: when @time differs from the stored
 * lastuse, bump __use and remember the new timestamp. No reference is
 * taken here.
 */
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
{
        if (likely(time == dst->lastuse))
                return;

        dst->__use++;
        dst->lastuse = time;
}

/*
 * Take a reference on @dst via dst_hold() when it is non-NULL.
 * Returns @dst unchanged (NULL stays NULL).
 */
static inline struct dst_entry *dst_clone(struct dst_entry *dst)
{
        if (!dst)
                return NULL;

        dst_hold(dst);
        return dst;
}

/* Defined out of line; refdst_drop() below calls it to drop a dst reference. */
void dst_release(struct dst_entry *dst);

/*
 * Defined out of line.
 * NOTE(review): presumably a variant of dst_release() that does not defer
 * the final teardown — confirm against the implementation.
 */
void dst_release_immediate(struct dst_entry *dst);

/*
 * Release the dst encoded in @refdst, unless the SKB_DST_NOREF bit marks
 * it as not reference-counted. The pointer is recovered by masking with
 * SKB_DST_PTRMASK.
 */
static inline void refdst_drop(unsigned long refdst)
{
        struct dst_entry *dst;

        if (refdst & SKB_DST_NOREF)
                return;

        dst = (struct dst_entry *)(refdst & SKB_DST_PTRMASK);
        dst_release(dst);
}

/**
 * skb_dst_drop - drops skb dst
 * @skb: buffer
 *
 * Drops dst reference count if a reference was taken, then clears the
 * skb's _skb_refdst word. Does nothing when the word is already zero.
 */
static inline void skb_dst_drop(struct sk_buff *skb)
{
        if (!skb->_skb_refdst)
                return;

        refdst_drop(skb->_skb_refdst);
        skb->_skb_refdst = 0UL;
}

/*
 * Install @refdst as the dst word of @nskb. slow_gro is raised when
 * @refdst is non-zero, and a reference is taken on the dst unless the
 * SKB_DST_NOREF bit is set in @refdst.
 */
static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst)
{
        nskb->slow_gro |= (refdst != 0UL);
        nskb->_skb_refdst = refdst;

        if (refdst & SKB_DST_NOREF)
                return;

        /* Must run after the store above: skb_dst() reads nskb->_skb_refdst. */
        dst_clone(skb_dst(nskb));
}

/* Give @nskb the same dst word as @oskb; see __skb_dst_copy(). */
static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb)
{
        const unsigned long refdst = oskb->_skb_refdst;

        __skb_dst_copy(nskb, refdst);
}

/**
 * dst_hold_safe - Take a reference on a dst if possible
 * @dst: pointer to dst entry
 *
 * Returns the result of rcuref_get() on the entry's reference count:
 * false means a reference could not safely be taken.
 */
static inline bool dst_hold_safe(struct dst_entry *dst)
{
        const bool acquired = rcuref_get(&dst->__rcuref);

        return acquired;
}

/**
 * skb_dst_force - makes sure skb dst is refcounted
 * @skb: buffer
 *
 * If dst is not yet refcounted and not destroyed, grab a ref on it.
 * When the reference cannot be taken, the skb's dst is cleared instead.
 * Returns: true if dst is refcounted.
 */
static inline bool skb_dst_force(struct sk_buff *skb)
{
        struct dst_entry *dst;

        /* Already refcounted (or no dst at all): nothing to convert. */
        if (!skb_dst_is_noref(skb))
                return skb->_skb_refdst != 0UL;

        dst = skb_dst(skb);

        WARN_ON(!rcu_read_lock_held());
        if (!dst_hold_safe(dst))
                dst = NULL;

        /* Storing the bare pointer drops the SKB_DST_NOREF marker. */
        skb->_skb_refdst = (unsigned long)dst;
        skb->slow_gro |= !!dst;

        return skb->_skb_refdst != 0UL;
}


/**
 *        __skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups. (no accounting done)
 */
static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                   struct net *net)
{
        bool xnet;

        skb->dev = dev;

        /*
         * Clear hash so that we can recalculate the hash for the
         * encapsulated packet, unless we have already determined the hash
         * over the L4 4-tuple.
         */
        skb_clear_hash_if_not_l4(skb);
        skb_set_queue_mapping(skb, 0);

        /* Scrub as cross-netns when @net differs from the device's netns. */
        xnet = !net_eq(net, dev_net(dev));
        skb_scrub_packet(skb, xnet);
}

/**
 *        skb_tunnel_rx - prepare skb for rx reinsert
 *        @skb: buffer
 *        @dev: tunnel device
 *        @net: netns for packet i/o
 *
 *        After decapsulation, packet is going to re-enter (netif_rx()) our stack,
 *        so make some cleanups, and perform accounting: rx_packets is
 *        incremented and rx_bytes grows by the skb length.
 *        Note: this accounting is not SMP safe.
 */
static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev,
                                 struct net *net)
{
        /* Account first, using the length before any cleanup is applied. */
        DEV_STATS_INC(dev, rx_packets);
        DEV_STATS_ADD(dev, rx_bytes, skb->len);

        __skb_tunnel_rx(skb, dev, net);
}

/*
 * Return the tclassid of the dst attached to @skb. Yields 0 when the skb
 * has no dst or when CONFIG_IP_ROUTE_CLASSID is not set.
 */
static inline u32 dst_tclassid(const struct sk_buff *skb)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        const struct dst_entry *route = skb_dst(skb);

        return route ? route->tclassid : 0;
#else
        return 0;
#endif
}

int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);

/* Hand @skb to dst_discard_out() with &init_net and the skb's own socket. */
static inline int dst_discard(struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        return dst_discard_out(&init_net, owner, skb);
}
/*
 * dst entry allocation/initialisation helpers, defined out of line.
 * NOTE(review): dst_alloc() returns void *, presumably so callers can
 * assign the result to a protocol-specific route structure — confirm.
 */
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
                int initial_obsolete, unsigned short flags);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
              struct net_device *dev, int initial_obsolete,
              unsigned short flags);
void dst_dev_put(struct dst_entry *dst);

/*
 * Intentionally empty: calling this has no effect on @dst.
 * NOTE(review): presumably kept so existing callers still compile — confirm.
 */
static inline void dst_confirm(struct dst_entry *dst)
{
}

/*
 * Look up the neighbour for @daddr through ops->neigh_lookup() (no skb
 * is passed). An ERR_PTR result is reported to the caller as NULL.
 */
static inline struct neighbour *dst_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
        struct neighbour *neigh;

        neigh = dst->ops->neigh_lookup(dst, NULL, daddr);
        if (IS_ERR(neigh))
                return NULL;

        return neigh;
}

/*
 * Look up the neighbour for @skb through ops->neigh_lookup() (no explicit
 * address is passed). Warns once and returns NULL when the callback is
 * missing; an ERR_PTR result is also reported as NULL.
 */
static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst,
                                                     struct sk_buff *skb)
{
        struct neighbour *neigh;

        if (WARN_ON_ONCE(!dst->ops->neigh_lookup))
                return NULL;

        neigh = dst->ops->neigh_lookup(dst, skb, NULL);
        if (IS_ERR(neigh))
                return NULL;

        return neigh;
}

/* Invoke ops->confirm_neigh(@dst, @daddr) when the callback is provided. */
static inline void dst_confirm_neigh(const struct dst_entry *dst,
                                     const void *daddr)
{
        if (!dst->ops->confirm_neigh)
                return;

        dst->ops->confirm_neigh(dst, daddr);
}

/*
 * Call the link_failure() callback of the dst attached to @skb. Silently
 * does nothing when the skb has no dst, the dst has no ops, or the
 * callback is not set.
 */
static inline void dst_link_failure(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops)
                return;
        if (!dst->ops->link_failure)
                return;

        dst->ops->link_failure(skb);
}

/*
 * Arm or shorten the expiry of @dst to jiffies + @timeout.
 *
 * A computed value of 0 is bumped to 1, since dst->expires == 0 is
 * treated below as "no expiry set". An existing expiry is only replaced
 * by an earlier one.
 */
static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
        unsigned long deadline = jiffies + timeout;

        if (!deadline)
                deadline = 1;

        if (dst->expires != 0 && !time_before(deadline, dst->expires))
                return;

        dst->expires = deadline;
}

/*
 * Return LL_RESERVED_SPACE() of the dst's device, or the skb's mac_len
 * when @dst is NULL.
 */
static inline unsigned int dst_dev_overhead(struct dst_entry *dst,
                                            struct sk_buff *skb)
{
        if (unlikely(!dst))
                return skb->mac_len;

        return LL_RESERVED_SPACE(dst->dev);
}

INDIRECT_CALLABLE_DECLARE(int ip6_output(struct net *, struct sock *,
                                         struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *,
                                         struct sk_buff *));
/* Output packet to network from transport.  */
static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(skb_dst(skb)->output,
                                  ip6_output, ip_output,
                                  net, sk, skb);
}

INDIRECT_CALLABLE_DECLARE(int ip6_input(struct sk_buff *));
INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *));
/* Input packet from network to transport.  */
static inline int dst_input(struct sk_buff *skb)
{
        return INDIRECT_CALL_INET(skb_dst(skb)->input,
                                  ip6_input, ip_local_deliver, skb);
}

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));

/*
 * Return @dst as-is while its obsolete field is zero; otherwise return
 * whatever the ops->check() callback yields for (@dst, @cookie).
 */
static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie)
{
        if (!dst->obsolete)
                return dst;

        return INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check,
                                  ipv4_dst_check, dst, cookie);
}

/*
 * Flags for xfrm_lookup flags argument.
 * Each value occupies its own bit, so several may be OR-ed together.
 */
enum {
        XFRM_LOOKUP_ICMP = 1 << 0,
        XFRM_LOOKUP_QUEUE = 1 << 1,
        XFRM_LOOKUP_KEEP_DST_REF = 1 << 2,
};

struct flowi;
#ifndef CONFIG_XFRM
/*
 * Without CONFIG_XFRM the lookup helpers below are pass-through stubs:
 * each returns @dst_orig untouched and ignores its other arguments.
 */
static inline struct dst_entry *xfrm_lookup(struct net *net,
                                            struct dst_entry *dst_orig,
                                            const struct flowi *fl,
                                            const struct sock *sk,
                                            int flags)
{
        return dst_orig;
}

static inline struct dst_entry *
xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig,
                      const struct flowi *fl, const struct sock *sk,
                      int flags, u32 if_id)
{
        return dst_orig;
}

static inline struct dst_entry *xfrm_lookup_route(struct net *net,
                                                  struct dst_entry *dst_orig,
                                                  const struct flowi *fl,
                                                  const struct sock *sk,
                                                  int flags)
{
        return dst_orig;
}

/* Without CONFIG_XFRM a dst never carries an xfrm state. */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return NULL;
}

#else
/* With CONFIG_XFRM the lookup helpers are implemented out of line. */
struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
                              const struct flowi *fl, const struct sock *sk,
                              int flags);

struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
                                        struct dst_entry *dst_orig,
                                        const struct flowi *fl,
                                        const struct sock *sk, int flags,
                                        u32 if_id);

struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
                                    const struct flowi *fl, const struct sock *sk,
                                    int flags);

/* skb attached with this dst needs transformation if dst->xfrm is valid */
static inline struct xfrm_state *dst_xfrm(const struct dst_entry *dst)
{
        return dst->xfrm;
}
#endif

/*
 * Report a new path MTU @mtu to the dst attached to @skb, passing true
 * as the final update_pmtu() argument (neighbour confirmation). Does
 * nothing when the skb has no dst or the dst lacks update_pmtu().
 */
static inline void skb_dst_update_pmtu(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, true);
}

/*
 * Update dst pmtu without doing neighbour confirmation: same as
 * skb_dst_update_pmtu() except that false is passed as the final
 * update_pmtu() argument.
 */
static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu)
{
        struct dst_entry *dst = skb_dst(skb);

        if (!dst || !dst->ops->update_pmtu)
                return;

        dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}

/*
 * "Blackhole" dst helpers, defined out of line.
 * NOTE(review): the signatures line up with the dst_ops callbacks used in
 * this header (check, update_pmtu, cow_metrics, neigh_lookup, mtu), so
 * these are presumably meant to populate a blackhole dst_ops — confirm.
 */
struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie);
void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu, bool confirm_neigh);
void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
                            struct sk_buff *skb);
u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old);
struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
                                             struct sk_buff *skb,
                                             const void *daddr);
unsigned int dst_blackhole_mtu(const struct dst_entry *dst);

#endif /* _NET_DST_H */









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020 - Google LLC
 * Author: Quentin Perret <qperret@google.com>
 */
#ifndef __ARM64_KVM_PKVM_H__
#define __ARM64_KVM_PKVM_H__

#include <linux/arm_ffa.h>
#include <linux/memblock.h>
#include <linux/scatterlist.h>
#include <asm/kvm_pgtable.h>

/* Maximum number of VMs that can co-exist under pKVM. */
#define KVM_MAX_PVMS 255

/*
 * NOTE(review): presumably the capacity of the hyp_memory[] region array
 * declared further down — confirm against its definition.
 */
#define HYP_MEMBLOCK_REGIONS 128

/*
 * pKVM VM / vCPU setup and teardown entry points, defined out of line.
 * NOTE(review): the int-returning ones presumably follow the usual
 * 0 / negative-errno convention — confirm.
 */
int pkvm_init_host_vm(struct kvm *kvm);
int pkvm_create_hyp_vm(struct kvm *kvm);
void pkvm_destroy_hyp_vm(struct kvm *kvm);
int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu);

/*
 * Allow-list of protected VM capabilities.
 *
 * Returns true only for the capabilities enumerated in the switch;
 * anything not explicitly listed is denied.
 */
static inline bool kvm_pvm_ext_allowed(long ext)
{
        bool allowed;

        switch (ext) {
        case KVM_CAP_IRQCHIP:
        case KVM_CAP_ARM_PSCI:
        case KVM_CAP_ARM_PSCI_0_2:
        case KVM_CAP_NR_VCPUS:
        case KVM_CAP_MAX_VCPUS:
        case KVM_CAP_MAX_VCPU_ID:
        case KVM_CAP_MSI_DEVID:
        case KVM_CAP_ARM_VM_IPA_SIZE:
        case KVM_CAP_ARM_PMU_V3:
        case KVM_CAP_ARM_SVE:
        case KVM_CAP_ARM_PTRAUTH_ADDRESS:
        case KVM_CAP_ARM_PTRAUTH_GENERIC:
                allowed = true;
                break;
        default:
                allowed = false;
                break;
        }

        return allowed;
}

/*
 * Region array and its entry count, defined elsewhere. The sizing helpers
 * in this header walk entries [0, hyp_memblock_nr).
 */
extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);

/*
 * Size in bytes, rounded out to page boundaries, of the vmemmap span
 * covering region @reg when each page needs @vmemmap_entry_size bytes.
 *
 * The span starts at (first pfn * entry size) and covers one entry per
 * page of the region; its start is aligned down and its end aligned up
 * to PAGE_SIZE before the difference is taken.
 */
static inline unsigned long
hyp_vmemmap_memblock_size(struct memblock_region *reg, size_t vmemmap_entry_size)
{
        unsigned long pages = reg->size >> PAGE_SHIFT;
        unsigned long lo = (reg->base >> PAGE_SHIFT) * vmemmap_entry_size;
        unsigned long hi = lo + pages * vmemmap_entry_size;

        return ALIGN(hi, PAGE_SIZE) - ALIGN_DOWN(lo, PAGE_SIZE);
}

/*
 * Number of pages needed for the vmemmap of every hyp_memory[] region,
 * i.e. the sum of hyp_vmemmap_memblock_size() over all regions, in pages.
 */
static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size)
{
        unsigned long bytes = 0;
        unsigned long idx;

        for (idx = 0; idx < kvm_nvhe_sym(hyp_memblock_nr); idx++) {
                struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[idx];

                bytes += hyp_vmemmap_memblock_size(reg, vmemmap_entry_size);
        }

        return bytes >> PAGE_SHIFT;
}

/*
 * Pages needed for a table of KVM_MAX_PVMS pointers, rounded up to a
 * whole number of pages.
 */
static inline unsigned long hyp_vm_table_pages(void)
{
        const size_t table_bytes = KVM_MAX_PVMS * sizeof(void *);

        return PAGE_ALIGN(table_bytes) >> PAGE_SHIFT;
}

/*
 * Upper bound on the page-table pages needed to map @nr_pages pages.
 *
 * For each level from KVM_PGTABLE_FIRST_LEVEL to KVM_PGTABLE_LAST_LEVEL
 * the running count is divided by PTRS_PER_PTE (rounding up) and the
 * quotient is added to the total.
 */
static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
{
        unsigned long tables = nr_pages;
        unsigned long sum = 0;
        int level;

        /* Provision the worst case scenario */
        for (level = KVM_PGTABLE_FIRST_LEVEL;
             level <= KVM_PGTABLE_LAST_LEVEL;
             level++) {
                tables = DIV_ROUND_UP(tables, PTRS_PER_PTE);
                sum += tables;
        }

        return sum;
}

/*
 * Sum of __hyp_pgtable_max_pages() over every hyp_memory[] region, each
 * region being counted at page granularity.
 */
static inline unsigned long __hyp_pgtable_total_pages(void)
{
        unsigned long total = 0;
        unsigned long idx;

        /* Cover all of memory with page-granularity */
        for (idx = 0; idx < kvm_nvhe_sym(hyp_memblock_nr); idx++) {
                struct memblock_region *region = &kvm_nvhe_sym(hyp_memory)[idx];
                unsigned long region_pages = region->size >> PAGE_SHIFT;

                total += __hyp_pgtable_max_pages(region_pages);
        }

        return total;
}

/*
 * Page-table pages budgeted for the hypervisor stage-1: enough to cover
 * all hyp_memory[] regions plus 1 GiB of private mappings.
 */
static inline unsigned long hyp_s1_pgtable_pages(void)
{
        const unsigned long memory_tables = __hyp_pgtable_total_pages();
        /* Allow 1 GiB for private mappings */
        const unsigned long private_tables =
                __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);

        return memory_tables + private_tables;
}

/*
 * Page-table pages budgeted for the host stage-2: enough to cover all
 * hyp_memory[] regions, 16 extra pages, and 1 GiB of MMIO mappings.
 */
static inline unsigned long host_s2_pgtable_pages(void)
{
        /*
         * Include an extra 16 pages to safely upper-bound the worst case of
         * concatenated pgds.
         */
        const unsigned long memory_tables = __hyp_pgtable_total_pages() + 16;
        /* Allow 1 GiB for MMIO mappings */
        const unsigned long mmio_tables =
                __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT);

        return memory_tables + mmio_tables;
}

/*
 * Pages set aside for the pKVM selftest: 32 when CONFIG_NVHE_EL2_DEBUG is
 * enabled, none otherwise.
 */
static inline unsigned long pkvm_selftest_pages(void)
{
#ifdef CONFIG_NVHE_EL2_DEBUG
        return 32;
#else
        return 0;
#endif
}

#define KVM_FFA_MBOX_NR_PAGES        1

/*
 * Pages needed by the hypervisor FFA proxy: the RX and TX mailboxes plus
 * a buffer large enough for the biggest descriptor computed below.
 */
static inline unsigned long hyp_ffa_proxy_pages(void)
{
        /*
         * The hypervisor FFA proxy needs enough memory to buffer a fragmented
         * descriptor returned from EL3 in response to a RETRIEVE_REQ call.
         */
        const size_t desc_max =
                sizeof(struct ffa_mem_region) +
                sizeof(struct ffa_mem_region_attributes) +
                sizeof(struct ffa_composite_mem_region) +
                SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range);
        /* A page each for the hypervisor's RX and TX mailboxes. */
        const unsigned long mailbox_pages = 2 * KVM_FFA_MBOX_NR_PAGES;

        return mailbox_pages + DIV_ROUND_UP(desc_max, PAGE_SIZE);
}

/*
 * Bytes needed for one host SVE state: struct cpu_sve_state plus the
 * register area for kvm_host_sve_max_vl, added with size_add().
 * Returns 0 when the system does not support SVE.
 */
static inline size_t pkvm_host_sve_state_size(void)
{
        size_t regs_size;

        if (!system_supports_sve())
                return 0;

        regs_size = SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl));

        return size_add(sizeof(struct cpu_sve_state), regs_size);
}

/*
 * One node of an rb-tree based interval tree (see __subtree_last).
 * NOTE(review): gfn / pfn / nr_pages presumably describe a run of
 * nr_pages pages starting at guest frame gfn and backed from host frame
 * pfn — confirm against the code that populates these nodes.
 */
struct pkvm_mapping {
        struct rb_node node;
        u64 gfn;
        u64 pfn;
        u64 nr_pages;
        u64 __subtree_last;        /* Internal member for interval tree */
};

/*
 * pKVM stage-2 page-table operations, defined out of line.
 * NOTE(review): the names and signatures look like pKVM counterparts of
 * the generic kvm_pgtable_stage2_*() API — confirm before relying on
 * identical semantics.
 */
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                             struct kvm_pgtable_mm_ops *mm_ops);
void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                            enum kvm_pgtable_prot prot, void *mc,
                            enum kvm_pgtable_walk_flags flags);
int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold);
int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
                                    enum kvm_pgtable_walk_flags flags);
void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
                                 enum kvm_pgtable_walk_flags flags);
int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
                              struct kvm_mmu_memory_cache *mc);
void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level);
kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
                                               enum kvm_pgtable_prot prot, void *mc,
                                               bool force_pte);
#endif        /* __ARM64_KVM_PKVM_H__ */














































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GFP_H
#define __LINUX_GFP_H

#include <linux/gfp_types.h>

#include <linux/mmzone.h>
#include <linux/topology.h>
#include <linux/alloc_tag.h>
#include <linux/sched.h>

struct vm_area_struct;
struct mempolicy;

/* Convert GFP flags to their corresponding migrate type */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT 3

static inline int gfp_migratetype(const gfp_t gfp_flags)
{
        VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
        BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
        BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
        BUILD_BUG_ON((___GFP_RECLAIMABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_RECLAIMABLE);
        BUILD_BUG_ON(((___GFP_MOVABLE | ___GFP_RECLAIMABLE) >>
                      GFP_MOVABLE_SHIFT) != MIGRATE_HIGHATOMIC);

        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}
#undef GFP_MOVABLE_MASK
#undef GFP_MOVABLE_SHIFT

/* True when @gfp_flags has __GFP_DIRECT_RECLAIM set. */
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
        const gfp_t direct_reclaim = gfp_flags & __GFP_DIRECT_RECLAIM;

        return direct_reclaim != 0;
}

/* True when @gfp_flags has at least one of the __GFP_RECLAIM bits set. */
static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
{
        /*
         * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
         * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
         * All GFP_* flags including GFP_NOWAIT use one or both flags.
         * try_alloc_pages() is the only API that doesn't specify either flag.
         *
         * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
         * those are guaranteed to never block on a sleeping lock.
         * Here we are enforcing that the allocation doesn't ever spin
         * on any locks (i.e. only trylocks). There is no high level
         * GFP_$FOO flag for this use in try_alloc_pages() as the
         * regular page allocator doesn't fully support this
         * allocation mode.
         */
        const gfp_t reclaim_bits = gfp_flags & __GFP_RECLAIM;

        return reclaim_bits != 0;
}

/*
 * OPT_ZONE_*: the optional zone when its config option is enabled,
 * otherwise ZONE_NORMAL.
 */
#ifdef CONFIG_HIGHMEM
#define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
#else
#define OPT_ZONE_HIGHMEM ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA
#define OPT_ZONE_DMA ZONE_DMA
#else
#define OPT_ZONE_DMA ZONE_NORMAL
#endif

#ifdef CONFIG_ZONE_DMA32
#define OPT_ZONE_DMA32 ZONE_DMA32
#else
#define OPT_ZONE_DMA32 ZONE_NORMAL
#endif

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT
 * bits long and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 or NORMAL (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

/* Width in bits of one GFP_ZONE_TABLE entry. */
#if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4
/* ZONE_DEVICE is not a valid GFP zone specifier */
#define GFP_ZONES_SHIFT 2
#else
#define GFP_ZONES_SHIFT ZONES_SHIFT
#endif

/* All 16 table entries must fit in a single long. */
#if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG
#error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

/*
 * Each term places a zone number at bit offset
 * (zone-flag combination) * GFP_ZONES_SHIFT; gfp_zone() extracts it with
 * the matching shift and mask.
 */
#define GFP_ZONE_TABLE ( \
        (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT)                                       \
        | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT)               \
        | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT)                       \
        | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT)                       \
        | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT)    \
        | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\
        | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\
)

/*
 * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32,
 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
 * entry starting with bit 0. Bit is set if the combination is not
 * allowed. gfp_zone() tests the bit indexed by the masked zone flags.
 */
#define GFP_ZONE_BAD ( \
        1 << (___GFP_DMA | ___GFP_HIGHMEM)                                      \
        | 1 << (___GFP_DMA | ___GFP_DMA32)                                      \
        | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)                                      \
        | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)                      \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
)

/*
 * Map the zone-selecting bits of @flags to a zone by looking up the
 * matching GFP_ZONE_TABLE entry. A combination flagged in GFP_ZONE_BAD
 * trips VM_BUG_ON().
 */
static inline enum zone_type gfp_zone(gfp_t flags)
{
        const int combo = (__force int) (flags & GFP_ZONEMASK);
        const int entry_shift = combo * GFP_ZONES_SHIFT;
        enum zone_type zone;

        zone = (GFP_ZONE_TABLE >> entry_shift) & ((1 << GFP_ZONES_SHIFT) - 1);
        VM_BUG_ON((GFP_ZONE_BAD >> combo) & 1);

        return zone;
}

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

/*
 * Pick the zonelist index for @flags: ZONELIST_NOFALLBACK for
 * __GFP_THISNODE requests on NUMA builds, ZONELIST_FALLBACK otherwise.
 */
static inline int gfp_zonelist(gfp_t flags)
{
        int index = ZONELIST_FALLBACK;

#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                index = ZONELIST_NOFALLBACK;
#endif

        return index;
}

/*
 * gfp flag masking for nested internal allocations.
 *
 * Code that allocates from inside the public allocation API (e.g. memory
 * allocation tracking code) has to obey the caller's allocation context
 * constraints, so that context mismatches (e.g. GFP_KERNEL allocations in
 * GFP_NOFS contexts) cannot lead to deadlocks.
 *
 * These nested allocations are also assumed to be for internal kernel
 * object storage only, never for DMA and the like. Hence all zone
 * information is stripped and only the context information is kept.
 *
 * Further, internal allocations must fail before the higher level
 * allocation can fail, so they are made to fail faster and silently, and
 * they must not deplete emergency reserves. Callers of nested allocations
 * must therefore be prepared for them to fail.
 */
static inline gfp_t gfp_nested_mask(gfp_t flags)
{
        /* Context bits inherited from the caller. */
        const gfp_t inherited = flags & (GFP_KERNEL | GFP_ATOMIC | __GFP_NOLOCKDEP);
        /* Always added: fail fast, stay out of reserves, stay quiet. */
        const gfp_t forced = __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN;

        return inherited | forced;
}

/*
 * Return the zonelist of node @nid that matches @flags.
 *
 * This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
 * There are two zonelists per node, one for all zones with memory and
 * one containing just zones from the node the zonelist belongs to;
 * gfp_zonelist() selects between them.
 *
 * For the case of non-NUMA systems the NODE_DATA() gets optimized to
 * &contig_page_data at compile-time.
 */
static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        const int which = gfp_zonelist(flags);

        return &NODE_DATA(nid)->node_zonelists[which];
}

/*
 * Default no-op hooks, used unless the architecture defines
 * HAVE_ARCH_FREE_PAGE / HAVE_ARCH_ALLOC_PAGE (and, presumably, supplies
 * its own implementation — confirm per architecture).
 */
#ifndef HAVE_ARCH_FREE_PAGE
static inline void arch_free_page(struct page *page, int order) { }
#endif
#ifndef HAVE_ARCH_ALLOC_PAGE
static inline void arch_alloc_page(struct page *page, int order) { }
#endif

/*
 * Out-of-line allocator entry points. Each *_noprof() function is paired
 * with a macro of the un-suffixed name that wraps the call in
 * alloc_hooks().
 */
struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __alloc_pages(...)                        alloc_hooks(__alloc_pages_noprof(__VA_ARGS__))

struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid,
                nodemask_t *nodemask);
#define __folio_alloc(...)                        alloc_hooks(__folio_alloc_noprof(__VA_ARGS__))

unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
                                nodemask_t *nodemask, int nr_pages,
                                struct page **page_array);
#define __alloc_pages_bulk(...)                        alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))

unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
                                unsigned long nr_pages,
                                struct page **page_array);
#define  alloc_pages_bulk_mempolicy(...)                                \
        alloc_hooks(alloc_pages_bulk_mempolicy_noprof(__VA_ARGS__))

/* Bulk allocate order-0 pages, preferring numa_mem_id() with no nodemask */
#define alloc_pages_bulk(_gfp, _nr_pages, _page_array)                \
        __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array)

/*
 * Bulk-allocate pages preferring node @nid; NUMA_NO_NODE is replaced by
 * numa_mem_id(). No nodemask is passed on.
 */
static inline unsigned long
alloc_pages_bulk_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
                                   struct page **page_array)
{
        const int node = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return alloc_pages_bulk_noprof(gfp, node, NULL, nr_pages, page_array);
}

#define alloc_pages_bulk_node(...)                                \
        alloc_hooks(alloc_pages_bulk_node_noprof(__VA_ARGS__))

/*
 * Report (pr_warn() plus a stack dump) an allocation that targets an offline
 * node. Nothing is reported unless @gfp_mask has both __GFP_THISNODE and
 * __GFP_NOWARN set and @this_node is not online.
 *
 * NOTE(review): requiring __GFP_NOWARN to be set before warning looks
 * surprising - confirm this is the intended condition.
 */
static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask)
{
        const gfp_t required = __GFP_THISNODE | __GFP_NOWARN;

        if ((gfp_mask & required) != required || node_online(this_node))
                return;

        pr_warn("%pGg allocation from offline node %d\n", &gfp_mask, this_node);
        dump_stack();
}

/*
 * Allocate pages with @nid as the preferred node and no nodemask.
 *
 * @nid has to be a valid node id (enforced with VM_BUG_ON()) and should be
 * online; an offline node is only reported through warn_if_node_offline().
 * alloc_pages_node() is the more general interface.
 */
static inline struct page *
__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order)
{
        struct page *page;

        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        warn_if_node_offline(nid, gfp_mask);

        page = __alloc_pages_noprof(gfp_mask, order, nid, NULL);

        return page;
}

#define  __alloc_pages_node(...)                alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__))

/*
 * Folio flavour of the node-preferring allocation: @nid must be a valid node
 * id (enforced with VM_BUG_ON()), an offline node is reported through
 * warn_if_node_offline(), and the allocation is made without a nodemask.
 */
static inline
struct folio *__folio_alloc_node_noprof(gfp_t gfp, unsigned int order, int nid)
{
        struct folio *folio;

        VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES);
        warn_if_node_offline(nid, gfp);

        folio = __folio_alloc_noprof(gfp, order, nid, NULL);

        return folio;
}

#define  __folio_alloc_node(...)                alloc_hooks(__folio_alloc_node_noprof(__VA_ARGS__))

/*
 * Allocate pages, preferring node @nid. Passing NUMA_NO_NODE selects
 * numa_mem_id() as the preferred node; any other @nid must be valid and
 * online.
 */
static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask,
                                                   unsigned int order)
{
        const int preferred = (nid == NUMA_NO_NODE) ? numa_mem_id() : nid;

        return __alloc_pages_node_noprof(preferred, gfp_mask, order);
}

#define  alloc_pages_node(...)                        alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__))

/*
 * With CONFIG_NUMA these allocators are only declared here. Without it they
 * are provided as the inline wrappers / macro in the #else branch.
 */
#ifdef CONFIG_NUMA
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid);
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr);
#else
/* !CONFIG_NUMA: allocate through alloc_pages_node_noprof() on numa_node_id(). */
static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
{
        return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order);
}
/* !CONFIG_NUMA: allocate through __folio_alloc_node_noprof() on numa_node_id(). */
static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return __folio_alloc_node_noprof(gfp, order, numa_node_id());
}
/* !CONFIG_NUMA: @mpol, @ilx and @nid are ignored. */
static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *mpol, pgoff_t ilx, int nid)
{
        return folio_alloc_noprof(gfp, order);
}
/* !CONFIG_NUMA: the vma and addr arguments are dropped by the macro. */
#define vma_alloc_folio_noprof(gfp, order, vma, addr)                \
        folio_alloc_noprof(gfp, order)
#endif

/* Each macro forwards its arguments to the *_noprof() function via alloc_hooks(). */
#define alloc_pages(...)                        alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
#define folio_alloc(...)                        alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
#define folio_alloc_mpol(...)                        alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__))
#define vma_alloc_folio(...)                        alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))

/* Shorthand for alloc_pages() with order 0. */
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)

/*
 * Allocate an order-0 folio with vma_alloc_folio_noprof() and hand back the
 * address of its embedded page member.
 *
 * NOTE(review): the folio pointer is not NULL-checked before &->page is
 * taken; presumably this relies on 'page' sitting at offset 0 of struct folio
 * so that a failed allocation still yields NULL - confirm.
 */
static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
                struct vm_area_struct *vma, unsigned long addr)
{
        struct folio *new_folio;

        new_folio = vma_alloc_folio_noprof(gfp, 0, vma, addr);

        return &new_folio->page;
}
#define alloc_page_vma(...)                        alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__))

/* Takes only a node id and an order; there is no gfp_t argument. */
struct page *try_alloc_pages_noprof(int nid, unsigned int order);
#define try_alloc_pages(...)                        alloc_hooks(try_alloc_pages_noprof(__VA_ARGS__))

/* Returns an unsigned long rather than a struct page pointer. */
extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order);
#define __get_free_pages(...)                        alloc_hooks(get_free_pages_noprof(__VA_ARGS__))

/* No order argument; returns an unsigned long. */
extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask);
#define get_zeroed_page(...)                        alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__))

/* Size-based allocation; __alloc_size(1) marks @size as the allocation size. */
void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1);
#define alloc_pages_exact(...)                        alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__))

/* Takes a pointer and a size; presumably the release side of alloc_pages_exact() - confirm. */
void free_pages_exact(void *virt, size_t size);

/* Variant with an explicit node id; here the size is the second parameter. */
__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2);
#define alloc_pages_exact_nid(...)                                        \
        alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__))

/* __get_free_pages() with order 0. */
#define __get_free_page(gfp_mask)                                        \
        __get_free_pages((gfp_mask), 0)

/* __get_free_pages() with GFP_DMA OR-ed into the mask. */
#define __get_dma_pages(gfp_mask, order)                                \
        __get_free_pages((gfp_mask) | GFP_DMA, (order))

/*
 * Freeing: __free_pages() and free_pages_nolock() take a struct page,
 * free_pages() takes an unsigned long address.
 */
extern void __free_pages(struct page *page, unsigned int order);
extern void free_pages_nolock(struct page *page, unsigned int order);
extern void free_pages(unsigned long addr, unsigned int order);

/* Order-0 shorthands for the two freeing functions. */
#define __free_page(page) __free_pages((page), 0)
#define free_page(addr) free_pages((addr), 0)

/*
 * Page allocator init and per-zone / per_cpu_pages helpers; only declared
 * in this header.
 */
void page_alloc_init_cpuhp(void);
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
void drain_all_pages(struct zone *zone);
void drain_local_pages(struct zone *zone);

void page_alloc_init_late(void);
void setup_pcp_cacheinfo(unsigned int cpu);

/*
 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
 * GFP flags are used before interrupts are enabled. Once interrupts are
 * enabled, it is set to __GFP_BITS_MASK while the system is running. During
 * hibernation, it is used by PM to avoid I/O during memory allocation while
 * devices are suspended.
 */
extern gfp_t gfp_allowed_mask;

/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */
bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);

/* True only when @gfp has both __GFP_IO and __GFP_FS set. */
static inline bool gfp_has_io_fs(gfp_t gfp)
{
        const gfp_t io_fs = __GFP_IO | __GFP_FS;

        return (gfp & io_fs) == io_fs;
}

/*
 * Tell whether the gfp flags permit compaction. GFP_NOIO is a really tricky
 * context because the migration might require IO, so __GFP_IO has to be set;
 * without CONFIG_COMPACTION the answer is always false.
 */
static inline bool gfp_compaction_allowed(gfp_t gfp_mask)
{
        if (!IS_ENABLED(CONFIG_COMPACTION))
                return false;

        return !!(gfp_mask & __GFP_IO);
}

/* Returns a gfp_t for the given vma; only declared in this header. */
extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);

#ifdef CONFIG_CONTIG_ALLOC
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range_noprof(unsigned long start, unsigned long end,
                              unsigned migratetype, gfp_t gfp_mask);
#define alloc_contig_range(...)                        alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__))

extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
                                              int nid, nodemask_t *nodemask);
#define alloc_contig_pages(...)                        alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__))

#endif
/* Declared regardless of CONFIG_CONTIG_ALLOC. */
void free_contig_range(unsigned long pfn, unsigned long nr_pages);

#ifdef CONFIG_CONTIG_ALLOC
/*
 * Allocate 1 << @order pages through alloc_contig_pages_noprof() and return
 * them as a folio.
 *
 * @order must be non-zero and @gfp must contain __GFP_COMP; otherwise a
 * WARN_ON() fires and NULL is returned. NULL is also returned when the
 * underlying allocation fails.
 */
static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
                                                        int nid, nodemask_t *node)
{
        struct page *head;

        if (WARN_ON(!order || !(gfp & __GFP_COMP)))
                return NULL;

        head = alloc_contig_pages_noprof(1 << order, gfp, nid, node);
        if (!head)
                return NULL;

        return page_folio(head);
}
#else
/* Stub for kernels without CONFIG_CONTIG_ALLOC: always returns NULL. */
static inline struct folio *folio_alloc_gigantic_noprof(int order, gfp_t gfp,
                                                        int nid, nodemask_t *node)
{
        return NULL;
}
#endif
/* Release the result with folio_put(), not with free_contig_range(). */
#define folio_alloc_gigantic(...) alloc_hooks(folio_alloc_gigantic_noprof(__VA_ARGS__))

#endif /* __LINUX_GFP_H */

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  165 




























  164 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
 * Copyright (C) 2005-2006 Thomas Gleixner
 *
 * This file contains driver APIs to the irq subsystem.
 */

#define pr_fmt(fmt) "genirq: " fmt

#include <linux/irq.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/irqdomain.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include <linux/task_work.h>

#include "internals.h"

/*
 * "threadirqs" early parameter support. Only built when
 * CONFIG_IRQ_FORCED_THREADING is set and CONFIG_PREEMPT_RT is not.
 */
#if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT)
DEFINE_STATIC_KEY_FALSE(force_irqthreads_key);

/*
 * Early-param handler: enables the force_irqthreads_key static key.
 * @arg is not used. Always returns 0.
 */
static int __init setup_forced_irqthreads(char *arg)
{
        static_branch_enable(&force_irqthreads_key);
        return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
#endif

/* Forward declaration; used by __synchronize_hardirq(). */
static int __irq_get_irqchip_state(struct irq_data *d, enum irqchip_irq_state which, bool *state);

/*
 * Wait until no hard interrupt handler of @desc is in progress.
 *
 * The in-progress flag is first polled without the lock and then re-read
 * under desc->lock. If @sync_chip is set and the flag is clear, the irq chip
 * is additionally queried for IRQCHIP_STATE_ACTIVE. The whole sequence is
 * repeated for as long as either check reports the interrupt in progress.
 */
static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip)
{
        struct irq_data *irqd = irq_desc_get_irq_data(desc);
        bool inprogress;

        do {
                unsigned long flags;

                /*
                 * Wait until we're out of the critical section.  This might
                 * give the wrong answer due to the lack of memory barriers.
                 */
                while (irqd_irq_inprogress(&desc->irq_data))
                        cpu_relax();

                /* Ok, that indicated we're done: double-check carefully. */
                raw_spin_lock_irqsave(&desc->lock, flags);
                inprogress = irqd_irq_inprogress(&desc->irq_data);

                /*
                 * If requested and supported, check at the chip whether it
                 * is in flight at the hardware level, i.e. already pending
                 * in a CPU and waiting for service and acknowledge.
                 */
                if (!inprogress && sync_chip) {
                        /*
                         * Ignore the return code. inprogress is only updated
                         * when the chip supports it.
                         */
                        __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE,
                                                &inprogress);
                }
                raw_spin_unlock_irqrestore(&desc->lock, flags);

                /* Oops, that failed? */
        } while (inprogress);
}

/**
 * synchronize_hardirq - wait for pending hard IRQ handlers (on other CPUs)
 * @irq: interrupt number to wait for
 *
 * Waits until every pending hard IRQ handler of @irq has completed. Threaded
 * handlers associated with the interrupt are not taken into account, so do
 * not use this for shutdown scenarios where both the hardirq and the
 * threaded part must have completed.
 *
 * Calling this while holding a resource which the IRQ handler may need
 * results in a deadlock. The function may be called - with care - from IRQ
 * context.
 *
 * An interrupt that is in flight at the hardware level, but not serviced
 * yet, is deliberately not checked for: that check might deadlock when
 * called with interrupts disabled and the target CPU of the interrupt is the
 * current CPU.
 *
 * Returns: false if a threaded handler is active, true otherwise (including
 * the case where no descriptor exists for @irq).
 */
bool synchronize_hardirq(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc)
                return true;

        __synchronize_hardirq(desc, false);

        return !atomic_read(&desc->threads_active);
}
EXPORT_SYMBOL(synchronize_hardirq);

/*
 * Wait for the hard interrupt handler of @desc (including the chip-level
 * in-flight check) and then for all of its threaded handlers to finish.
 * Blocks in wait_event() until desc->threads_active reads zero.
 */
static void __synchronize_irq(struct irq_desc *desc)
{
        __synchronize_hardirq(desc, true);
        /*
         * We made sure that no hardirq handler is running. Now verify that no
         * threaded handlers are active.
         */
        wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
}

/**
 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
 * @irq: interrupt number to wait for
 *
 * Waits until all pending IRQ handlers of @irq have completed. Calling this
 * while holding a resource which the IRQ handler may need results in a
 * deadlock.
 *
 * Only callable from preemptible code, because it might sleep when an
 * interrupt thread is associated to @irq.
 *
 * When the irq chip supports that method, it additionally makes sure that
 * the interrupt is not pending in any CPU and waiting for service.
 *
 * Does nothing when no descriptor exists for @irq.
 */
void synchronize_irq(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc)
                return;

        __synchronize_irq(desc);
}
EXPORT_SYMBOL(synchronize_irq);

#ifdef CONFIG_SMP
cpumask_var_t irq_default_affinity;

static bool __irq_can_set_affinity(struct irq_desc *desc)
{
        if (!desc || !irqd_can_balance(&desc->irq_data) ||
            !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
                return false;
        return true;
}

/**
 * irq_can_set_affinity - Check if the affinity of a given irq can be set
 * @irq:        Interrupt to check
 *
 * Return: non-zero if the descriptor of @irq passes
 * __irq_can_set_affinity(), zero otherwise.
 */
int irq_can_set_affinity(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);

        return __irq_can_set_affinity(desc);
}

/**
 * irq_can_set_affinity_usr - Check if affinity of a irq can be set from user space
 * @irq:        Interrupt to check
 *
 * Like irq_can_set_affinity() above, but additionally checks for the
 * AFFINITY_MANAGED flag.
 */
bool irq_can_set_affinity_usr(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);

        return __irq_can_set_affinity(desc) &&
                !irqd_affinity_is_managed(&desc->irq_data);
}

/**
 * irq_set_thread_affinity - Notify irq threads to adjust affinity
 * @desc:        irq descriptor which has affinity changed
 *
 * Only IRQTF_AFFINITY is set and the thread woken up here; adjusting the
 * affinity is delegated to the interrupt thread itself.
 * set_cpus_allowed_ptr() can not be called from this function because
 * desc->lock is held and the code can run in hard interrupt context.
 */
static void irq_set_thread_affinity(struct irq_desc *desc)
{
        struct irqaction *action;

        for_each_action_of_desc(desc, action) {
                struct irqaction *secondary;

                /* Primary handler thread of this action, if any. */
                if (action->thread) {
                        set_bit(IRQTF_AFFINITY, &action->thread_flags);
                        wake_up_process(action->thread);
                }

                /* Secondary action's thread, if both exist. */
                secondary = action->secondary;
                if (!secondary || !secondary->thread)
                        continue;

                set_bit(IRQTF_AFFINITY, &secondary->thread_flags);
                wake_up_process(secondary->thread);
        }
}

#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
/*
 * Emit a one-time warning naming the irq chip and the interrupt number when
 * the effective affinity mask of @data is empty.
 */
static void irq_validate_effective_affinity(struct irq_data *data)
{
        const struct cpumask *eff_mask = irq_data_get_effective_affinity_mask(data);
        struct irq_chip *chip = irq_data_get_irq_chip(data);

        if (cpumask_empty(eff_mask)) {
                pr_warn_once("irq_chip %s did not update eff. affinity mask of irq %u\n",
                             chip->name, data->irq);
        }
}
#else
/* No effective affinity mask support configured: nothing to validate. */
static inline void irq_validate_effective_affinity(struct irq_data *data) { }
#endif

/*
 * Per-CPU scratch cpumask used by irq_do_set_affinity() to build the mask
 * which is handed to the irq chip.
 */
static DEFINE_PER_CPU(struct cpumask, __tmp_mask);

/*
 * irq_do_set_affinity - Hand an affinity mask to the irq chip
 * @data:        irq_data of the interrupt
 * @mask:        Requested affinity mask
 * @force:        Pass @mask to the chip unmodified, without restricting it
 *                to online CPUs
 *
 * Returns -EINVAL when the interrupt has no chip or the chip has no
 * irq_set_affinity() callback, and also when @force is false and the
 * mask to program contains no online CPU. Otherwise the chip callback's
 * return value decides: IRQ_SET_MASK_OK, IRQ_SET_MASK_OK_DONE and
 * IRQ_SET_MASK_OK_NOCOPY are turned into 0, anything else is returned
 * unchanged.
 */
int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask,
                        bool force)
{
        struct cpumask *tmp_mask = this_cpu_ptr(&__tmp_mask);
        struct irq_desc *desc = irq_data_to_desc(data);
        struct irq_chip *chip = irq_data_get_irq_chip(data);
        const struct cpumask  *prog_mask;
        int ret;

        if (!chip || !chip->irq_set_affinity)
                return -EINVAL;

        /*
         * If this is a managed interrupt and housekeeping is enabled on
         * it check whether the requested affinity mask intersects with
         * a housekeeping CPU. If so, then remove the isolated CPUs from
         * the mask and just keep the housekeeping CPU(s). This prevents
         * the affinity setter from routing the interrupt to an isolated
         * CPU to avoid that I/O submitted from a housekeeping CPU causes
         * interrupts on an isolated one.
         *
         * If the masks do not intersect, or the intersection does not
         * include any online CPU, then keep the requested mask. The
         * isolated target CPUs are only receiving interrupts when the
         * I/O operation was submitted directly from them.
         *
         * If all housekeeping CPUs in the affinity mask are offline, the
         * interrupt will be migrated by the CPU hotplug code once a
         * housekeeping CPU which belongs to the affinity mask comes
         * online.
         */
        if (irqd_affinity_is_managed(data) &&
            housekeeping_enabled(HK_TYPE_MANAGED_IRQ)) {
                const struct cpumask *hk_mask;

                hk_mask = housekeeping_cpumask(HK_TYPE_MANAGED_IRQ);

                cpumask_and(tmp_mask, mask, hk_mask);
                if (!cpumask_intersects(tmp_mask, cpu_online_mask))
                        prog_mask = mask;
                else
                        prog_mask = tmp_mask;
        } else {
                prog_mask = mask;
        }

        /*
         * Make sure we only provide online CPUs to the irqchip,
         * unless we are being asked to force the affinity (in which
         * case we do as we are told).
         *
         * Note that prog_mask may point at tmp_mask itself, in which
         * case the scratch mask is narrowed in place. In the forced
         * case the chip gets the caller's @mask, not prog_mask.
         */
        cpumask_and(tmp_mask, prog_mask, cpu_online_mask);
        if (!force && !cpumask_empty(tmp_mask))
                ret = chip->irq_set_affinity(data, tmp_mask, force);
        else if (force)
                ret = chip->irq_set_affinity(data, mask, force);
        else
                ret = -EINVAL;

        /* Any return value not listed below is passed back unchanged */
        switch (ret) {
        case IRQ_SET_MASK_OK:
        case IRQ_SET_MASK_OK_DONE:
                /* Record the requested mask, not the one programmed */
                cpumask_copy(desc->irq_common_data.affinity, mask);
                fallthrough;
        case IRQ_SET_MASK_OK_NOCOPY:
                irq_validate_effective_affinity(data);
                irq_set_thread_affinity(desc);
                ret = 0;
        }

        return ret;
}

#ifdef CONFIG_GENERIC_PENDING_IRQ
/*
 * Mark the interrupt as having a move pending and remember @dest as the
 * pending affinity mask. Always returns 0.
 */
static inline int irq_set_affinity_pending(struct irq_data *data,
                                           const struct cpumask *dest)
{
        irqd_set_move_pending(data);
        irq_copy_pending(irq_data_to_desc(data), dest);

        return 0;
}
#else
/* No generic pending mechanism available: report the request as busy */
static inline int irq_set_affinity_pending(struct irq_data *data,
                                           const struct cpumask *dest)
{
        return -EBUSY;
}
#endif

/*
 * Try to set the affinity right away. A non-forced request which fails
 * with -EBUSY is handed over to irq_set_affinity_pending().
 */
static int irq_try_set_affinity(struct irq_data *data,
                                const struct cpumask *dest, bool force)
{
        int ret = irq_do_set_affinity(data, dest, force);

        if (force || ret != -EBUSY)
                return ret;

        /*
         * In case that the underlying vector management is busy and the
         * architecture supports the generic pending mechanism then utilize
         * this to avoid returning an error to user space.
         */
        return irq_set_affinity_pending(data, dest);
}

static bool irq_set_affinity_deactivated(struct irq_data *data,
                                         const struct cpumask *mask)
{
        struct irq_desc *desc = irq_data_to_desc(data);

        /*
         * Handle irq chips which can handle affinity only in activated
         * state correctly
         *
         * If the interrupt is not yet activated, just store the affinity
         * mask and do not call the chip driver at all. On activation the
         * driver has to make sure anyway that the interrupt is in a
         * usable state so startup works.
         */
        if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
            irqd_is_activated(data) || !irqd_affinity_on_activate(data))
                return false;

        cpumask_copy(desc->irq_common_data.affinity, mask);
        irq_data_update_effective_affinity(data, mask);
        irqd_set(data, IRQD_AFFINITY_SET);
        return true;
}

/*
 * Set or queue a new affinity mask for an interrupt and kick the
 * affinity notifier, if one is installed. __irq_set_affinity() invokes
 * this with desc->lock held.
 *
 * Returns -EINVAL if the chip cannot set affinity, 0 if the mask was
 * only stored or queued as pending, otherwise the result of
 * irq_try_set_affinity().
 */
int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask,
                            bool force)
{
        struct irq_chip *chip = irq_data_get_irq_chip(data);
        struct irq_desc *desc = irq_data_to_desc(data);
        struct irq_affinity_notify *notify;
        int ret = 0;

        if (!chip || !chip->irq_set_affinity)
                return -EINVAL;

        if (irq_set_affinity_deactivated(data, mask))
                return 0;

        if (!irq_can_move_pcntxt(data) || irqd_is_setaffinity_pending(data)) {
                /* Cannot be changed right now: queue it as pending */
                irqd_set_move_pending(data);
                irq_copy_pending(desc, mask);
        } else {
                ret = irq_try_set_affinity(data, mask, force);
        }

        notify = desc->affinity_notify;
        if (notify) {
                /* The scheduled work holds its own reference */
                kref_get(&notify->kref);
                /* Work was already scheduled, drop our extra ref */
                if (!schedule_work(&notify->work))
                        kref_put(&notify->kref, notify->release);
        }
        irqd_set(data, IRQD_AFFINITY_SET);

        return ret;
}

/**
 * irq_update_affinity_desc - Update affinity management for an interrupt
 * @irq:        The interrupt number to update
 * @affinity:        Pointer to the affinity descriptor
 *
 * This interface can be used to configure the affinity management of
 * interrupts which have been allocated already.
 *
 * There are certain limitations on when it may be used - attempts to use it
 * for when the kernel is configured for generic IRQ reservation mode (in
 * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with
 * managed/non-managed interrupt accounting. In addition, attempts to use it on
 * an interrupt which is already started or which has already been configured
 * as managed will also fail, as these mean invalid init state or double init.
 *
 * Returns: 0 on success, -EOPNOTSUPP in reservation mode, -EINVAL if no
 * descriptor exists for @irq, -EBUSY if the interrupt is started or
 * already managed.
 */
int irq_update_affinity_desc(unsigned int irq,
                             struct irq_affinity_desc *affinity)
{
        struct irq_data *irqd;
        struct irq_desc *desc;
        unsigned long flags;
        bool was_active;
        int ret = -EBUSY;

        /*
         * Supporting this with the reservation scheme used by x86 needs
         * some more thought. Fail it for now.
         */
        if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE))
                return -EOPNOTSUPP;

        desc = irq_get_desc_buslock(irq, &flags, 0);
        if (!desc)
                return -EINVAL;

        irqd = &desc->irq_data;

        /*
         * The interrupt is required to be shut down, and interrupts
         * which are already managed cannot be modified.
         */
        if (irqd_is_started(irqd) || irqd_affinity_is_managed(irqd))
                goto out_unlock;

        /*
         * Deactivate the interrupt. That's required to undo
         * anything an earlier activation has established.
         */
        was_active = irqd_is_activated(irqd);
        if (was_active)
                irq_domain_deactivate_irq(irqd);

        if (affinity->is_managed) {
                irqd_set(irqd, IRQD_AFFINITY_MANAGED);
                irqd_set(irqd, IRQD_MANAGED_SHUTDOWN);
        }

        cpumask_copy(desc->irq_common_data.affinity, &affinity->mask);

        /* Restore the activation state */
        if (was_active)
                irq_domain_activate_irq(irqd, false);

        ret = 0;

out_unlock:
        irq_put_desc_busunlock(desc, flags);
        return ret;
}

/*
 * Look up the descriptor of @irq and run irq_set_affinity_locked() on it
 * with desc->lock held. Returns -EINVAL if there is no descriptor.
 */
static int __irq_set_affinity(unsigned int irq, const struct cpumask *mask,
                              bool force)
{
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
        int ret = -EINVAL;

        if (desc) {
                raw_spin_lock_irqsave(&desc->lock, flags);
                ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc),
                                              mask, force);
                raw_spin_unlock_irqrestore(&desc->lock, flags);
        }

        return ret;
}

/**
 * irq_set_affinity - Set the irq affinity of a given irq
 * @irq:        Interrupt to set affinity
 * @cpumask:        cpumask
 *
 * Fails if cpumask does not contain an online CPU
 */
int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
{
        return __irq_set_affinity(irq, cpumask, false);
}
EXPORT_SYMBOL_GPL(irq_set_affinity);

/**
 * irq_force_affinity - Force the irq affinity of a given irq
 * @irq:        Interrupt to set affinity
 * @cpumask:        cpumask
 *
 * Same as irq_set_affinity, but without checking the mask against
 * online cpus.
 *
 * Solely for low level cpu hotplug code, where we need to make per
 * cpu interrupts affine before the cpu becomes online.
 */
int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
{
        return __irq_set_affinity(irq, cpumask, true);
}
EXPORT_SYMBOL_GPL(irq_force_affinity);

/*
 * Store @m as the affinity hint of @irq and, if @setaffinity is true and
 * @m is not NULL, also try to apply it as the (non-forced) affinity.
 *
 * Returns -EINVAL if the descriptor cannot be obtained, 0 otherwise. The
 * result of applying the affinity is not reported to the caller.
 */
int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
                              bool setaffinity)
{
        struct irq_desc *desc;
        unsigned long flags;

        desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
        if (!desc)
                return -EINVAL;

        desc->affinity_hint = m;
        irq_put_desc_unlock(desc, flags);

        if (!m || !setaffinity)
                return 0;

        /* Done after dropping the lock: __irq_set_affinity() takes desc->lock */
        __irq_set_affinity(irq, m, false);
        return 0;
}
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);

/*
 * Work function of an affinity notifier: snapshot the pending (if a move
 * is pending) or current affinity mask under desc->lock, pass it to the
 * notify callback and drop the reference which was taken for the work.
 */
static void irq_affinity_notify(struct work_struct *work)
{
        struct irq_affinity_notify *notify =
                container_of(work, struct irq_affinity_notify, work);
        struct irq_desc *desc = irq_to_desc(notify->irq);
        cpumask_var_t cpumask;
        unsigned long flags;

        /* Without descriptor or scratch mask only the reference is dropped */
        if (desc && alloc_cpumask_var(&cpumask, GFP_KERNEL)) {
                raw_spin_lock_irqsave(&desc->lock, flags);
                if (!irq_move_pending(&desc->irq_data))
                        cpumask_copy(cpumask, desc->irq_common_data.affinity);
                else
                        irq_get_pending(cpumask, desc);
                raw_spin_unlock_irqrestore(&desc->lock, flags);

                notify->notify(notify, cpumask);

                free_cpumask_var(cpumask);
        }

        kref_put(&notify->kref, notify->release);
}

/**
 * irq_set_affinity_notifier - control notification of IRQ affinity changes
 * @irq:        Interrupt for which to enable/disable notification
 * @notify:        Context for notification, or %NULL to disable
 *                notification.  Function pointers must be initialised;
 *                the other fields will be initialised by this function.
 *
 * Must be called in process context.  Notification may only be enabled
 * after the IRQ is allocated and must be disabled before the IRQ is
 * freed using free_irq().
 *
 * Returns: 0 on success, -EINVAL if @irq has no descriptor or is an NMI.
 */
int
irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
{
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_affinity_notify *prev;
        unsigned long flags;

        /* The release function is promised process context */
        might_sleep();

        if (!desc)
                return -EINVAL;
        if (irq_is_nmi(desc))
                return -EINVAL;

        /* Complete initialisation of *notify */
        if (notify) {
                notify->irq = irq;
                kref_init(&notify->kref);
                INIT_WORK(&notify->work, irq_affinity_notify);
        }

        /* Swap the notifier under the descriptor lock */
        raw_spin_lock_irqsave(&desc->lock, flags);
        prev = desc->affinity_notify;
        desc->affinity_notify = notify;
        raw_spin_unlock_irqrestore(&desc->lock, flags);

        if (!prev)
                return 0;

        /* Pending work had a ref, put that one too */
        if (cancel_work_sync(&prev->work))
                kref_put(&prev->kref, prev->release);

        /* Drop the reference the descriptor held on the old notifier */
        kref_put(&prev->kref, prev->release);

        return 0;
}
EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);

#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
 * Generic version of the affinity autoselector.
 *
 * Starts from the default affinity (or from the existing affinity of a
 * managed or user-configured interrupt), restricts it to online CPUs and,
 * where possible, to the CPUs of the descriptor's node, then programs the
 * result. Returns 0 for interrupts whose affinity cannot be set,
 * otherwise the result of irq_do_set_affinity().
 */
int irq_setup_affinity(struct irq_desc *desc)
{
        /* One shared scratch mask, serialized by mask_lock */
        static DEFINE_RAW_SPINLOCK(mask_lock);
        static struct cpumask mask;
        struct cpumask *wanted = irq_default_affinity;
        struct irq_data *irqd = &desc->irq_data;
        int node = irq_desc_get_node(desc);
        int ret;

        /* Excludes PER_CPU and NO_BALANCE interrupts */
        if (!__irq_can_set_affinity(desc))
                return 0;

        raw_spin_lock(&mask_lock);

        /*
         * Preserve the managed affinity setting and a userspace affinity
         * setup, but make sure that one of the targets is online.
         */
        if (irqd_affinity_is_managed(irqd) ||
            irqd_has_set(irqd, IRQD_AFFINITY_SET)) {
                struct cpumask *cur = desc->irq_common_data.affinity;

                if (!cpumask_intersects(cur, cpu_online_mask))
                        irqd_clear(irqd, IRQD_AFFINITY_SET);
                else
                        wanted = cur;
        }

        /* Fall back to all online CPUs if nothing is left */
        cpumask_and(&mask, cpu_online_mask, wanted);
        if (cpumask_empty(&mask))
                cpumask_copy(&mask, cpu_online_mask);

        if (node != NUMA_NO_NODE) {
                const struct cpumask *nodemask = cpumask_of_node(node);

                /* make sure at least one of the cpus in nodemask is online */
                if (cpumask_intersects(&mask, nodemask))
                        cpumask_and(&mask, &mask, nodemask);
        }

        ret = irq_do_set_affinity(irqd, &mask, false);
        raw_spin_unlock(&mask_lock);

        return ret;
}
#else
/* Wrapper for ALPHA specific affinity selector magic */
int irq_setup_affinity(struct irq_desc *desc)
{
        unsigned int irq = irq_desc_get_irq(desc);

        return irq_select_affinity(irq);
}
#endif /* CONFIG_AUTO_IRQ_AFFINITY */
#endif /* CONFIG_SMP */


/**
 * irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
 * @irq:        interrupt number to set affinity
 * @vcpu_info:        vCPU specific data or pointer to a percpu array of vCPU
 *                specific data for percpu_devid interrupts
 *
 * This function uses the vCPU specific data to set the vCPU
 * affinity for an irq. The vCPU specific data is passed from
 * outside, such as KVM. One example code path is as below:
 * KVM -> IOMMU -> irq_set_vcpu_affinity().
 *
 * Returns: -EINVAL if the descriptor cannot be obtained, -ENOSYS if no
 * chip in the hierarchy provides irq_set_vcpu_affinity(), otherwise the
 * return value of that callback.
 */
int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
{
        struct irq_desc *desc;
        struct irq_data *data;
        struct irq_chip *chip;
        unsigned long flags;
        int ret = -ENOSYS;

        desc = irq_get_desc_lock(irq, &flags, 0);
        if (!desc)
                return -EINVAL;

        /* Walk up the hierarchy to the first chip with the callback */
        data = irq_desc_get_irq_data(desc);
        while (data) {
                chip = irq_data_get_irq_chip(data);
                if (chip && chip->irq_set_vcpu_affinity) {
                        ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
                        break;
                }
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
                data = data->parent_data;
#else
                data = NULL;
#endif
        }

        irq_put_desc_unlock(desc, flags);

        return ret;
}
EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);

void __disable_irq(struct irq_desc *desc)
{
        if (!desc->depth++)
                irq_disable(desc);
}

/*
 * Disable @irq under the descriptor bus lock without waiting for running
 * handlers. Returns -EINVAL if the descriptor cannot be obtained, else 0.
 */
static int __disable_irq_nosync(unsigned int irq)
{
        struct irq_desc *desc;
        unsigned long flags;

        desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
        if (!desc)
                return -EINVAL;

        __disable_irq(desc);
        irq_put_desc_busunlock(desc, flags);

        return 0;
}

/**
 * disable_irq_nosync - disable an irq without waiting
 * @irq:        Interrupt to disable
 *
 * Disable the selected interrupt line.  Disables and Enables are
 * nested.
 * Unlike disable_irq(), this function does not ensure existing
 * instances of the IRQ handler have completed before returning.
 *
 * This function may be called from IRQ context.
 */
void disable_irq_nosync(unsigned int irq)
{
        /* A lookup failure is intentionally not reported to the caller */
        (void)__disable_irq_nosync(irq);
}
EXPORT_SYMBOL(disable_irq_nosync);

/**
 * disable_irq - disable an irq and wait for completion
 * @irq:        Interrupt to disable
 *
 * Disable the selected interrupt line.  Enables and Disables are
 * nested.
 * This function waits for any pending IRQ handlers for this interrupt
 * to complete before returning. If you use this function while
 * holding a resource the IRQ handler may need you will deadlock.
 *
 * Can only be called from preemptible code as it might sleep when
 * an interrupt thread is associated to @irq.
 */
void disable_irq(unsigned int irq)
{
        might_sleep();

        /* Nothing to wait for when the interrupt could not be disabled */
        if (__disable_irq_nosync(irq))
                return;

        synchronize_irq(irq);
}
EXPORT_SYMBOL(disable_irq);

/**
 * disable_hardirq - disables an irq and waits for hardirq completion
 * @irq:        Interrupt to disable
 *
 * Disable the selected interrupt line.  Enables and Disables are
 * nested.
 * This function waits for any pending hard IRQ handlers for this
 * interrupt to complete before returning. If you use this function while
 * holding a resource the hard IRQ handler may need you will deadlock.
 *
 * When used to optimistically disable an interrupt from atomic context
 * the return value must be checked.
 *
 * Returns: false if a threaded handler is active. false is also returned
 * when the interrupt could not be disabled at all.
 *
 * This function may be called - with care - from IRQ context.
 */
bool disable_hardirq(unsigned int irq)
{
        if (__disable_irq_nosync(irq))
                return false;

        return synchronize_hardirq(irq);
}
EXPORT_SYMBOL_GPL(disable_hardirq);

/**
 *        disable_nmi_nosync - disable an nmi without waiting
 *        @irq: Interrupt to disable
 *
 *        Disable the selected interrupt line. Disables and enables are
 *        nested.
 *        The interrupt to disable must have been requested through request_nmi.
 *        Unlike disable_nmi(), this function does not ensure existing
 *        instances of the IRQ handler have completed before returning.
 *
 *        This is a plain wrapper around disable_irq_nosync().
 */
void disable_nmi_nosync(unsigned int irq)
{
        disable_irq_nosync(irq);
}

/*
 * Undo one level of __disable_irq(). For nested disables only the depth
 * is decremented. At depth 1 the interrupt is started up again. An enable
 * at depth 0, or at depth 1 while the interrupt is marked suspended, is
 * unbalanced and triggers a warning.
 */
void __enable_irq(struct irq_desc *desc)
{
        unsigned int depth = desc->depth;

        /* Still nested: just drop one level */
        if (depth > 1) {
                desc->depth--;
                return;
        }

        if (depth == 0 || (desc->istate & IRQS_SUSPENDED)) {
                WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
                     irq_desc_get_irq(desc));
                return;
        }

        /* Prevent probing on this irq: */
        irq_settings_set_noprobe(desc);

        /*
         * Call irq_startup() not irq_enable() here because the
         * interrupt might be marked NOAUTOEN so irq_startup()
         * needs to be invoked when it gets enabled the first time.
         * This is also required when __enable_irq() is invoked for
         * a managed and shutdown interrupt from the S3 resume
         * path.
         *
         * If it was already started up, then irq_startup() will
         * invoke irq_enable() under the hood.
         */
        irq_startup(desc, IRQ_RESEND, IRQ_START_FORCE);
}

/**
 * enable_irq - enable handling of an irq
 * @irq:        Interrupt to enable
 *
 * Undoes the effect of one call to disable_irq().  If this
 * matches the last disable, processing of interrupts on this
 * IRQ line is re-enabled.
 *
 * This function may be called from IRQ context only when
 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
 */
void enable_irq(unsigned int irq)
{
        struct irq_desc *desc;
        unsigned long flags;

        desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
        if (!desc)
                return;

        /* An interrupt without a chip has not been set up yet */
        if (!WARN(!desc->irq_data.chip,
                  KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
                __enable_irq(desc);

        irq_put_desc_busunlock(desc, flags);
}
EXPORT_SYMBOL(enable_irq);

/**
 *        enable_nmi - enable handling of an nmi
 *        @irq: Interrupt to enable
 *
 *        The interrupt to enable must have been requested through request_nmi.
 *        Undoes the effect of one call to disable_nmi(). If this
 *        matches the last disable, processing of interrupts on this
 *        IRQ line is re-enabled.
 *
 *        This is a plain wrapper around enable_irq().
 */
void enable_nmi(unsigned int irq)
{
        enable_irq(irq);
}

static int set_irq_wake_real(unsigned int irq, unsigned int on)
{
        struct irq_desc *desc = irq_to_desc(irq);
        int ret = -ENXIO;

        if (irq_desc_get_chip(desc)->flags &  IRQCHIP_SKIP_SET_WAKE)
                return 0;

        if (desc->irq_data.chip->irq_set_wake)
                ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);

        return ret;
}

/**
 * irq_set_irq_wake - control irq power management wakeup
 * @irq:        interrupt to control
 * @on:        enable/disable power management wakeup
 *
 * Enable/disable power management wakeup mode, which is
 * disabled by default.  Enables and disables must match,
 * just as they match for non-wakeup mode support.
 *
 * Wakeup mode lets this IRQ wake the system from sleep
 * states like "suspend to RAM".
 *
 * Note: irq enable/disable state is completely orthogonal
 * to the enable/disable state of irq wake. An irq can be
 * disabled with disable_irq() and still wake the system as
 * long as the irq has wake enabled. If this does not hold,
 * then the underlying irq chip and the related driver need
 * to be investigated.
 *
 * Returns: 0 on success, -EINVAL if the descriptor cannot be obtained
 * or the interrupt is an NMI, otherwise the error reported by the chip.
 */
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
        struct irq_desc *desc;
        unsigned long flags;
        int ret = 0;

        desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
        if (!desc)
                return -EINVAL;

        /* Don't use NMIs as wake up interrupts please */
        if (irq_is_nmi(desc)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        /*
         * wakeup-capable irqs can be shared between drivers that
         * don't need to have the same sleep mode behaviors, so only
         * the first enable and the last disable reach the chip.
         */
        if (on) {
                if (desc->wake_depth++ != 0)
                        goto out_unlock;

                ret = set_irq_wake_real(irq, on);
                if (ret)
                        desc->wake_depth = 0;
                else
                        irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
                goto out_unlock;
        }

        if (desc->wake_depth == 0) {
                WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
                goto out_unlock;
        }

        if (--desc->wake_depth != 0)
                goto out_unlock;

        ret = set_irq_wake_real(irq, on);
        if (ret)
                desc->wake_depth = 1;
        else
                irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);

out_unlock:
        irq_put_desc_busunlock(desc, flags);
        return ret;
}
EXPORT_SYMBOL(irq_set_irq_wake);

/*
 * Internal function that tells the architecture code whether a
 * particular irq has been exclusively allocated or is available
 * for driver use.
 *
 * Returns 1 if the interrupt can be requested with @irqflags, 0 if not
 * (including the case that the descriptor cannot be obtained).
 */
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
        struct irq_desc *desc;
        unsigned long flags;
        int canrequest = 0;

        desc = irq_get_desc_lock(irq, &flags, 0);
        if (!desc)
                return 0;

        /* Free, or both the installed action and the request are shared */
        if (irq_settings_can_request(desc) &&
            (!desc->action ||
             (irqflags & desc->action->flags & IRQF_SHARED)))
                canrequest = 1;

        irq_put_desc_unlock(desc, flags);
        return canrequest;
}

/*
 * __irq_set_trigger - Program the trigger type of an interrupt
 * @desc:        Interrupt descriptor
 * @flags:        Flags containing the requested trigger type; everything
 *                outside IRQ_TYPE_SENSE_MASK is ignored
 *
 * Returns 0 if the chip has no irq_set_type() callback or if the
 * callback reported one of the IRQ_SET_MASK_OK* values. Any other
 * callback return value is logged and returned unchanged.
 */
int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
{
        struct irq_chip *chip = desc->irq_data.chip;
        int ret, unmask = 0;

        if (!chip || !chip->irq_set_type) {
                /*
                 * IRQF_TRIGGER_* but the PIC does not support multiple
                 * flow-types?
                 */
                pr_debug("No set_type function for IRQ %d (%s)\n",
                         irq_desc_get_irq(desc),
                         chip ? (chip->name ? : "unknown") : "unknown");
                return 0;
        }

        /*
         * For chips flagged IRQCHIP_SET_TYPE_MASKED, mask the line before
         * changing the type and remember to unmask it afterwards unless
         * the interrupt is disabled.
         */
        if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
                if (!irqd_irq_masked(&desc->irq_data))
                        mask_irq(desc);
                if (!irqd_irq_disabled(&desc->irq_data))
                        unmask = 1;
        }

        /* Mask all flags except trigger mode */
        flags &= IRQ_TYPE_SENSE_MASK;
        ret = chip->irq_set_type(&desc->irq_data, flags);

        switch (ret) {
        case IRQ_SET_MASK_OK:
        case IRQ_SET_MASK_OK_DONE:
                /* Store the requested trigger type in the irq_data state */
                irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
                irqd_set(&desc->irq_data, flags);
                fallthrough;

        case IRQ_SET_MASK_OK_NOCOPY:
                /*
                 * Read the trigger type back from irq_data and bring the
                 * descriptor settings and the level flag in line with it.
                 */
                flags = irqd_get_trigger_type(&desc->irq_data);
                irq_settings_set_trigger_mask(desc, flags);
                irqd_clear(&desc->irq_data, IRQD_LEVEL);
                irq_settings_clr_level(desc);
                if (flags & IRQ_TYPE_LEVEL_MASK) {
                        irq_settings_set_level(desc);
                        irqd_set(&desc->irq_data, IRQD_LEVEL);
                }

                ret = 0;
                break;
        default:
                pr_err("Setting trigger mode %lu for irq %u failed (%pS)\n",
                       flags, irq_desc_get_irq(desc), chip->irq_set_type);
        }
        /* Unmask again, on success as well as on failure */
        if (unmask)
                unmask_irq(desc);
        return ret;
}

#ifdef CONFIG_HARDIRQS_SW_RESEND
/*
 * Record @parent_irq as the parent interrupt of @irq. Returns -EINVAL if
 * the descriptor of @irq cannot be obtained, 0 otherwise.
 */
int irq_set_parent(int irq, int parent_irq)
{
        struct irq_desc *desc;
        unsigned long flags;

        desc = irq_get_desc_lock(irq, &flags, 0);
        if (!desc)
                return -EINVAL;

        desc->parent_irq = parent_irq;
        irq_put_desc_unlock(desc, flags);

        return 0;
}
EXPORT_SYMBOL_GPL(irq_set_parent);
#endif

/*
 * Default primary interrupt handler for threaded interrupts. Is
 * assigned as primary handler when request_threaded_irq is called
 * with handler == NULL. Useful for oneshot interrupts.
 *
 * Both arguments are unused; the handler unconditionally returns
 * IRQ_WAKE_THREAD.
 */
static irqreturn_t irq_default_primary_handler(int irq, void *dev_id)
{
        return IRQ_WAKE_THREAD;
}

/*
 * Primary handler for nested threaded interrupts. Should never be
 * called.
 *
 * If it is called nevertheless, it warns with the interrupt number and
 * returns IRQ_NONE.
 */
static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
{
        WARN(1, "Primary handler called for nested irq %d\n", irq);
        return IRQ_NONE;
}

/*
 * Handler of secondary actions. Warns with the interrupt number and
 * returns IRQ_NONE when invoked. irq_finalize_oneshot() compares
 * action->handler against this function to recognize such actions.
 */
static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
{
        WARN(1, "Secondary action handler called for irq %d\n", irq);
        return IRQ_NONE;
}

#ifdef CONFIG_SMP
/*
 * Check whether we need to change the affinity of the interrupt thread.
 *
 * Acts only when IRQTF_AFFINITY is set for @action: the flag is consumed
 * and the current task is restricted to the effective affinity mask of
 * the interrupt.
 */
static void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
{
        cpumask_var_t mask;
        bool valid;

        if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags))
                return;

        __set_current_state(TASK_RUNNING);

        /*
         * In case we are out of memory we set IRQTF_AFFINITY again and
         * try again next time
         */
        if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
                set_bit(IRQTF_AFFINITY, &action->thread_flags);
                return;
        }

        raw_spin_lock_irq(&desc->lock);
        /*
         * This code is triggered unconditionally. Check the affinity
         * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out.
         */
        valid = cpumask_available(desc->irq_common_data.affinity);
        if (valid)
                cpumask_copy(mask,
                             irq_data_get_effective_affinity_mask(&desc->irq_data));
        raw_spin_unlock_irq(&desc->lock);

        /* Applied after dropping desc->lock, from the snapshot taken above */
        if (valid)
                set_cpus_allowed_ptr(current, mask);

        free_cpumask_var(mask);
}
#else
static inline void irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
#endif

static int irq_wait_for_interrupt(struct irq_desc *desc,
                                  struct irqaction *action)
{
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                irq_thread_check_affinity(desc, action);

                if (kthread_should_stop()) {
                        /* may need to run one last time */
                        if (test_and_clear_bit(IRQTF_RUNTHREAD,
                                               &action->thread_flags)) {
                                __set_current_state(TASK_RUNNING);
                                return 0;
                        }
                        __set_current_state(TASK_RUNNING);
                        return -1;
                }

                if (test_and_clear_bit(IRQTF_RUNTHREAD,
                                       &action->thread_flags)) {
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }
                schedule();
        }
}

/*
 * Oneshot interrupts keep the irq line masked until the threaded
 * handler finished. unmask if the interrupt has not been disabled and
 * is marked MASKED.
 *
 * @desc:        Interrupt descriptor
 * @action:        Action whose thread has finished
 *
 * Nothing is done for interrupts which are not marked IRQS_ONESHOT and
 * for actions whose handler is irq_forced_secondary_handler.
 */
static void irq_finalize_oneshot(struct irq_desc *desc,
                                 struct irqaction *action)
{
        if (!(desc->istate & IRQS_ONESHOT) ||
            action->handler == irq_forced_secondary_handler)
                return;
again:
        /* The chip bus lock is taken before desc->lock */
        chip_bus_lock(desc);
        raw_spin_lock_irq(&desc->lock);

        /*
         * Implausible though it may be we need to protect us against
         * the following scenario:
         *
         * The thread is faster done than the hard interrupt handler
         * on the other CPU. If we unmask the irq line then the
         * interrupt can come in again and masks the line, leaves due
         * to IRQS_INPROGRESS and the irq line is masked forever.
         *
         * This also serializes the state of shared oneshot handlers
         * versus "desc->threads_oneshot |= action->thread_mask;" in
         * irq_wake_thread(). See the comment there which explains the
         * serialization.
         */
        if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
                /* Drop both locks, relax and retry from the top */
                raw_spin_unlock_irq(&desc->lock);
                chip_bus_sync_unlock(desc);
                cpu_relax();
                goto again;
        }

        /*
         * Now check again, whether the thread should run. Otherwise
         * we would clear the threads_oneshot bit of this thread which
         * was just set.
         */
        if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
                goto out_unlock;

        /* This thread is done: remove its bit from the oneshot mask */
        desc->threads_oneshot &= ~action->thread_mask;

        /*
         * Unmask only when no thread bit is left, the interrupt is not
         * disabled and the line is actually masked.
         */
        if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
            irqd_irq_masked(&desc->irq_data))
                unmask_threaded_irq(desc);

out_unlock:
        raw_spin_unlock_irq(&desc->lock);
        chip_bus_sync_unlock(desc);
}

/*
 * Interrupts explicitly requested as threaded interrupts want to be
 * preemptible - many of them need to sleep and wait for slow busses to
 * complete.
 *
 * Runs the thread function of @action, accounts a handled interrupt in
 * desc->threads_handled and finalizes oneshot handling. Returns the
 * result of the thread function.
 */
static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
        irqreturn_t ret;

        ret = action->thread_fn(action->irq, action->dev_id);
        if (ret == IRQ_HANDLED)
                atomic_inc(&desc->threads_handled);

        irq_finalize_oneshot(desc, action);

        return ret;
}

/*
 * Interrupts which are not explicitly requested as threaded
 * interrupts rely on the implicit bh/preempt disable of the hard irq
 * context. So we need to disable bh here to avoid deadlocks and other
 * side effects.
 */
static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
        irqreturn_t ret;

        local_bh_disable();
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                local_irq_disable();
        ret = irq_thread_fn(desc, action);
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                local_irq_enable();
        local_bh_enable();
        return ret;
}

/*
 * Drop one reference on desc->threads_active and, when the counter reaches
 * zero, wake everybody waiting on desc->wait_for_threads.
 */
void wake_threads_waitq(struct irq_desc *desc)
{
        if (!atomic_dec_and_test(&desc->threads_active))
                return;

        wake_up(&desc->wait_for_threads);
}

/*
 * Task work callback installed by irq_thread(). It complains about an
 * interrupt thread which exits while still active and cleans up the
 * per-descriptor thread accounting for it.
 */
static void irq_thread_dtor(struct callback_head *unused)
{
        struct task_struct *tsk = current;
        struct irqaction *action;
        struct irq_desc *desc;

        /* Only meaningful while the task is on its way out */
        if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))
                return;

        action = kthread_data(tsk);

        pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
               tsk->comm, tsk->pid, action->irq);

        desc = irq_to_desc(action->irq);

        /*
         * A pending IRQTF_RUNTHREAD means desc->threads_active still
         * accounts for this thread: drop it and wake possible waiters.
         */
        if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))
                wake_threads_waitq(desc);

        /* Do not leave a stale bit behind in desc->threads_oneshot */
        irq_finalize_oneshot(desc, action);
}

/*
 * Wake the thread of the secondary action attached to @action. Warns once
 * and does nothing when there is no secondary action. The wakeup is done
 * with desc->lock held.
 */
static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
{
        struct irqaction *secondary = action->secondary;

        if (!WARN_ON_ONCE(!secondary)) {
                raw_spin_lock_irq(&desc->lock);
                __irq_wake_thread(desc, secondary);
                raw_spin_unlock_irq(&desc->lock);
        }
}

/*
 * Internal function to notify that an interrupt thread is ready.
 *
 * Sets IRQTF_READY in the action's thread_flags and then wakes the waiters
 * on desc->wait_for_threads, which is what
 * wake_up_and_wait_for_irq_thread_ready() sleeps on.
 */
static void irq_thread_set_ready(struct irq_desc *desc,
                                 struct irqaction *action)
{
        set_bit(IRQTF_READY, &action->thread_flags);
        wake_up(&desc->wait_for_threads);
}

/*
 * Internal function to wake up an interrupt thread and wait until it has
 * flagged itself ready via IRQTF_READY.
 *
 * A NULL @action or an action without a thread is silently ignored.
 */
static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
                                                  struct irqaction *action)
{
        if (!action)
                return;
        if (!action->thread)
                return;

        wake_up_process(action->thread);
        wait_event(desc->wait_for_threads,
                   test_bit(IRQTF_READY, &action->thread_flags));
}

/*
 * Interrupt handler thread
 *
 * @data is the irqaction this thread serves. The thread announces itself
 * as ready, switches to FIFO scheduling and then handles interrupts until
 * irq_wait_for_interrupt() returns non-zero. Always returns 0.
 */
static int irq_thread(void *data)
{
        irqreturn_t (*handler_fn)(struct irq_desc *desc,
                                  struct irqaction *action) = irq_thread_fn;
        struct irqaction *action = data;
        struct irq_desc *desc = irq_to_desc(action->irq);
        struct callback_head on_exit_work;

        irq_thread_set_ready(desc, action);

        sched_set_fifo(current);

        /* Force threaded actions need the bh/irq disabling wrapper */
        if (force_irqthreads() &&
            test_bit(IRQTF_FORCED_THREAD, &action->thread_flags))
                handler_fn = irq_forced_thread_fn;

        /* Have irq_thread_dtor() run if this task exits unexpectedly */
        init_task_work(&on_exit_work, irq_thread_dtor);
        task_work_add(current, &on_exit_work, TWA_NONE);

        while (!irq_wait_for_interrupt(desc, action)) {
                if (handler_fn(desc, action) == IRQ_WAKE_THREAD)
                        irq_wake_secondary(desc, action);

                wake_threads_waitq(desc);
        }

        /*
         * Regular exit path: __free_irq() stops the thread through
         * kthread_stop() after it called synchronize_hardirq(). Hence
         * neither IRQTF_RUNTHREAD nor the oneshot mask bit can be set and
         * the exit work is not needed.
         */
        task_work_cancel_func(current, irq_thread_dtor);
        return 0;
}

/**
 * irq_wake_thread - wake the irq thread for the action identified by dev_id
 * @irq:        Interrupt line
 * @dev_id:     Device identity for which the thread should be woken
 *
 * Does nothing when @irq has no descriptor, warns and does nothing for
 * per CPU devid interrupts. Only the first action matching @dev_id is
 * considered, and it is woken only if it has a thread.
 */
void irq_wake_thread(unsigned int irq, void *dev_id)
{
        struct irq_desc *desc = irq_to_desc(irq);
        struct irqaction *action;
        unsigned long flags;

        if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
                return;

        raw_spin_lock_irqsave(&desc->lock, flags);
        for_each_action_of_desc(desc, action) {
                if (action->dev_id != dev_id)
                        continue;

                if (action->thread)
                        __irq_wake_thread(desc, action);
                break;
        }
        raw_spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL_GPL(irq_wake_thread);

/*
 * Convert @new into a force threaded action when forced interrupt
 * threading is active and the action qualifies for it.
 *
 * Returns 0 on success or when nothing had to be done, -ENOMEM when the
 * secondary action could not be allocated. In the -ENOMEM case
 * IRQF_ONESHOT has already been set in new->flags.
 */
static int irq_setup_forced_threading(struct irqaction *new)
{
        struct irqaction *sec;

        if (!force_irqthreads())
                return 0;

        /* These flags exclude the action from forced threading */
        if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
                return 0;

        /*
         * Interrupts which were requested as threaded interrupts already
         * need no further treatment.
         */
        if (new->handler == irq_default_primary_handler)
                return 0;

        new->flags |= IRQF_ONESHOT;

        /*
         * A real primary handler combined with a thread handler is force
         * threaded too: the thread handler moves into a secondary action.
         */
        if (new->handler && new->thread_fn) {
                sec = kzalloc(sizeof(*sec), GFP_KERNEL);
                new->secondary = sec;
                if (!sec)
                        return -ENOMEM;

                sec->handler = irq_forced_secondary_handler;
                sec->thread_fn = new->thread_fn;
                sec->dev_id = new->dev_id;
                sec->irq = new->irq;
                sec->name = new->name;
        }

        /* The former primary handler now runs in the thread */
        set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
        new->thread_fn = new->handler;
        new->handler = irq_default_primary_handler;

        return 0;
}

/*
 * Invoke the chip's optional irq_request_resources() callback.
 * Returns the callback's result, or 0 when the chip does not provide one.
 */
static int irq_request_resources(struct irq_desc *desc)
{
        struct irq_data *data = &desc->irq_data;
        struct irq_chip *chip = data->chip;

        if (!chip->irq_request_resources)
                return 0;

        return chip->irq_request_resources(data);
}

/* Invoke the chip's optional irq_release_resources() callback. */
static void irq_release_resources(struct irq_desc *desc)
{
        struct irq_data *data = &desc->irq_data;
        struct irq_chip *chip = data->chip;

        if (!chip->irq_release_resources)
                return;

        chip->irq_release_resources(data);
}

static bool irq_supports_nmi(struct irq_desc *desc)
{
        struct irq_data *d = irq_desc_get_irq_data(desc);

#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
        /* Only IRQs directly managed by the root irqchip can be set as NMI */
        if (d->parent_data)
                return false;
#endif
        /* Don't support NMIs for chips behind a slow bus */
        if (d->chip->irq_bus_lock || d->chip->irq_bus_sync_unlock)
                return false;

        return d->chip->flags & IRQCHIP_SUPPORTS_NMI;
}

/*
 * Invoke the chip's irq_nmi_setup() callback.
 * Returns the callback's result, or -EINVAL when the chip has none.
 */
static int irq_nmi_setup(struct irq_desc *desc)
{
        struct irq_data *data = irq_desc_get_irq_data(desc);
        struct irq_chip *chip = data->chip;

        if (!chip->irq_nmi_setup)
                return -EINVAL;

        return chip->irq_nmi_setup(data);
}

/* Invoke the chip's optional irq_nmi_teardown() callback. */
static void irq_nmi_teardown(struct irq_desc *desc)
{
        struct irq_data *data = irq_desc_get_irq_data(desc);
        struct irq_chip *chip = data->chip;

        if (!chip->irq_nmi_teardown)
                return;

        chip->irq_nmi_teardown(data);
}

/*
 * Create (but do not yet wake) the handler thread for @new.
 *
 * The thread is named "irq/<irq>-<name>", or "irq/<irq>-s-<name>" when
 * @secondary is true. Returns 0 on success or the error encoded in the
 * kthread_create() result.
 */
static int
setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
{
        struct task_struct *tsk;

        if (secondary) {
                tsk = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
                                     new->name);
        } else {
                tsk = kthread_create(irq_thread, new, "irq/%d-%s", irq,
                                     new->name);
        }

        if (IS_ERR(tsk))
                return PTR_ERR(tsk);

        /*
         * Hold a reference on the task struct even if the thread dies,
         * so the interrupt code never references an already freed
         * task_struct.
         */
        new->thread = get_task_struct(tsk);

        /*
         * Ask the thread to set its affinity. That matters for shared
         * interrupt handlers because setup_affinity() is not invoked for
         * the secondary handlers as everything is already set up. It is
         * correct even for interrupts marked with IRQF_NOBALANCING as the
         * thread should move to the cpu(s) on which the requesting code
         * placed the interrupt.
         */
        set_bit(IRQTF_AFFINITY, &new->thread_flags);

        return 0;
}

/*
 * Internal function to register an irqaction - typically used to
 * allocate special interrupts that are part of the architecture.
 *
 * @irq:  Interrupt number, stored in new->irq
 * @desc: Interrupt descriptor for @irq, may be NULL (-EINVAL)
 * @new:  The action to install; its flags, handler and thread_fn may be
 *        adjusted here (default trigger type, forced threading, nested
 *        handler replacement, ONESHOT handling)
 *
 * Returns 0 on success. Otherwise:
 *   -EINVAL  no descriptor, nested interrupt without thread_fn, attempt to
 *            share an NMI line, or handler == NULL without ONESHOT
 *   -ENOSYS  the descriptor has no irq chip assigned
 *   -ENODEV  the owning module could not be pinned
 *   -EBUSY   sharing flags mismatch or no free oneshot thread_mask bit
 *   or the error returned by one of the invoked helpers.
 *
 * On failure the threads created here are stopped again and the module
 * reference is dropped. new->secondary is not freed by this function.
 *
 * Locking rules:
 *
 * desc->request_mutex        Provides serialization against a concurrent free_irq()
 *   chip_bus_lock        Provides serialization for slow bus operations
 *     desc->lock        Provides serialization against hard interrupts
 *
 * chip_bus_lock and desc->lock are sufficient for all other management and
 * interrupt related functions. desc->request_mutex solely serializes
 * request/free_irq().
 */
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
        struct irqaction *old, **old_ptr;
        unsigned long flags, thread_mask = 0;
        int ret, nested, shared = 0;

        if (!desc)
                return -EINVAL;

        if (desc->irq_data.chip == &no_irq_chip)
                return -ENOSYS;
        /* Pin the owning module; dropped again at out_mput on failure */
        if (!try_module_get(desc->owner))
                return -ENODEV;

        new->irq = irq;

        /*
         * If the trigger type is not specified by the caller,
         * then use the default for this interrupt.
         */
        if (!(new->flags & IRQF_TRIGGER_MASK))
                new->flags |= irqd_get_trigger_type(&desc->irq_data);

        /*
         * Check whether the interrupt nests into another interrupt
         * thread.
         */
        nested = irq_settings_is_nested_thread(desc);
        if (nested) {
                if (!new->thread_fn) {
                        ret = -EINVAL;
                        goto out_mput;
                }
                /*
                 * Replace the primary handler which was provided from
                 * the driver for non nested interrupt handling by the
                 * dummy function which warns when called.
                 */
                new->handler = irq_nested_primary_handler;
        } else {
                if (irq_settings_can_thread(desc)) {
                        ret = irq_setup_forced_threading(new);
                        if (ret)
                                goto out_mput;
                }
        }

        /*
         * Create a handler thread when a thread function is supplied
         * and the interrupt does not nest into another interrupt
         * thread.
         */
        if (new->thread_fn && !nested) {
                ret = setup_irq_thread(new, irq, false);
                if (ret)
                        goto out_mput;
                if (new->secondary) {
                        ret = setup_irq_thread(new->secondary, irq, true);
                        if (ret)
                                goto out_thread;
                }
        }

        /*
         * Drivers are often written to work w/o knowledge about the
         * underlying irq chip implementation, so a request for a
         * threaded irq without a primary hard irq context handler
         * requires the ONESHOT flag to be set. Some irq chips like
         * MSI based interrupts are per se one shot safe. Check the
         * chip flags, so we can avoid the unmask dance at the end of
         * the threaded handler for those.
         */
        if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
                new->flags &= ~IRQF_ONESHOT;

        /*
         * Protects against a concurrent __free_irq() call which might wait
         * for synchronize_hardirq() to complete without holding the optional
         * chip bus lock and desc->lock. Also protects against handing out
         * a recycled oneshot thread_mask bit while it's still in use by
         * its previous owner.
         */
        mutex_lock(&desc->request_mutex);

        /*
         * Acquire bus lock as the irq_request_resources() callback below
         * might rely on the serialization or the magic power management
         * functions which are abusing the irq_bus_lock() callback.
         */
        chip_bus_lock(desc);

        /* First installed action requests resources. */
        if (!desc->action) {
                ret = irq_request_resources(desc);
                if (ret) {
                        pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
                               new->name, irq, desc->irq_data.chip->name);
                        goto out_bus_unlock;
                }
        }

        /*
         * The following block of code has to be executed atomically
         * protected against a concurrent interrupt and any of the other
         * management calls which are not serialized via
         * desc->request_mutex or the optional bus lock.
         */
        raw_spin_lock_irqsave(&desc->lock, flags);
        old_ptr = &desc->action;
        old = *old_ptr;
        if (old) {
                /*
                 * Can't share interrupts unless both agree to and are
                 * the same type (level, edge, polarity). So both flag
                 * fields must have IRQF_SHARED set and the bits which
                 * set the trigger type must match. Also all must
                 * agree on ONESHOT.
                 * Interrupt lines used for NMIs cannot be shared.
                 */
                unsigned int oldtype;

                if (irq_is_nmi(desc)) {
                        pr_err("Invalid attempt to share NMI for %s (irq %d) on irqchip %s.\n",
                                new->name, irq, desc->irq_data.chip->name);
                        ret = -EINVAL;
                        goto out_unlock;
                }

                /*
                 * If nobody did set the configuration before, inherit
                 * the one provided by the requester.
                 */
                if (irqd_trigger_type_was_set(&desc->irq_data)) {
                        oldtype = irqd_get_trigger_type(&desc->irq_data);
                } else {
                        oldtype = new->flags & IRQF_TRIGGER_MASK;
                        irqd_set_trigger_type(&desc->irq_data, oldtype);
                }

                if (!((old->flags & new->flags) & IRQF_SHARED) ||
                    (oldtype != (new->flags & IRQF_TRIGGER_MASK)))
                        goto mismatch;

                /*
                 * A requester with IRQF_COND_ONESHOT adopts ONESHOT when
                 * the already installed action uses it. Otherwise both
                 * sides have to agree on ONESHOT.
                 */
                if ((old->flags & IRQF_ONESHOT) &&
                    (new->flags & IRQF_COND_ONESHOT))
                        new->flags |= IRQF_ONESHOT;
                else if ((old->flags ^ new->flags) & IRQF_ONESHOT)
                        goto mismatch;

                /* All handlers must agree on per-cpuness */
                if ((old->flags & IRQF_PERCPU) !=
                    (new->flags & IRQF_PERCPU))
                        goto mismatch;

                /* add new interrupt at end of irq queue */
                do {
                        /*
                         * Or all existing action->thread_mask bits,
                         * so we can find the next zero bit for this
                         * new action.
                         */
                        thread_mask |= old->thread_mask;
                        old_ptr = &old->next;
                        old = *old_ptr;
                } while (old);
                shared = 1;
        }

        /*
         * Setup the thread mask for this irqaction for ONESHOT. For
         * !ONESHOT irqs the thread mask is 0 so we can avoid a
         * conditional in irq_wake_thread().
         */
        if (new->flags & IRQF_ONESHOT) {
                /*
                 * Unlikely to have 32 resp 64 irqs sharing one line,
                 * but who knows.
                 */
                if (thread_mask == ~0UL) {
                        ret = -EBUSY;
                        goto out_unlock;
                }
                /*
                 * The thread_mask for the action is or'ed to
                 * desc->threads_oneshot to indicate that the
                 * IRQF_ONESHOT thread handler has been woken, but not
                 * yet finished. The bit is cleared when a thread
                 * completes. When all threads of a shared interrupt
                 * line have completed desc->threads_oneshot becomes
                 * zero and the interrupt line is unmasked. See
                 * handle.c:irq_wake_thread() for further information.
                 *
                 * If no thread is woken by primary (hard irq context)
                 * interrupt handlers, then desc->threads_oneshot is
                 * also checked for zero to unmask the irq line in the
                 * affected hard irq flow handlers
                 * (handle_[fasteoi|level]_irq).
                 *
                 * The new action gets the first zero bit of
                 * thread_mask assigned. See the loop above which or's
                 * all existing action->thread_mask bits.
                 */
                new->thread_mask = 1UL << ffz(thread_mask);

        } else if (new->handler == irq_default_primary_handler &&
                   !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
                /*
                 * The interrupt was requested with handler = NULL, so
                 * we use the default primary handler for it. But it
                 * does not have the oneshot flag set. In combination
                 * with level interrupts this is deadly, because the
                 * default primary handler just wakes the thread, then
                 * the irq line is reenabled, but the device still
                 * has the level irq asserted. Rinse and repeat....
                 *
                 * While this works for edge type interrupts, we play
                 * it safe and reject unconditionally because we can't
                 * say for sure which type this interrupt really
                 * has. The type flags are unreliable as the
                 * underlying chip implementation can override them.
                 */
                pr_err("Threaded irq requested with handler=NULL and !ONESHOT for %s (irq %d)\n",
                       new->name, irq);
                ret = -EINVAL;
                goto out_unlock;
        }

        if (!shared) {
                /* Setup the type (level, edge polarity) if configured: */
                if (new->flags & IRQF_TRIGGER_MASK) {
                        ret = __irq_set_trigger(desc,
                                                new->flags & IRQF_TRIGGER_MASK);

                        if (ret)
                                goto out_unlock;
                }

                /*
                 * Activate the interrupt. That activation must happen
                 * independently of IRQ_NOAUTOEN. request_irq() can fail
                 * and the callers are supposed to handle
                 * that. enable_irq() of an interrupt requested with
                 * IRQ_NOAUTOEN is not supposed to fail. The activation
                 * keeps it in shutdown mode, it merely associates
                 * resources if necessary and if that's not possible it
                 * fails. Interrupts which are in managed shutdown mode
                 * will simply ignore that activation request.
                 */
                ret = irq_activate(desc);
                if (ret)
                        goto out_unlock;

                /* First action on this line: start from a clean state */
                desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
                                  IRQS_ONESHOT | IRQS_WAITING);
                irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);

                if (new->flags & IRQF_PERCPU) {
                        irqd_set(&desc->irq_data, IRQD_PER_CPU);
                        irq_settings_set_per_cpu(desc);
                        if (new->flags & IRQF_NO_DEBUG)
                                irq_settings_set_no_debug(desc);
                }

                if (noirqdebug)
                        irq_settings_set_no_debug(desc);

                if (new->flags & IRQF_ONESHOT)
                        desc->istate |= IRQS_ONESHOT;

                /* Exclude IRQ from balancing if requested */
                if (new->flags & IRQF_NOBALANCING) {
                        irq_settings_set_no_balancing(desc);
                        irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
                }

                if (!(new->flags & IRQF_NO_AUTOEN) &&
                    irq_settings_can_autoenable(desc)) {
                        irq_startup(desc, IRQ_RESEND, IRQ_START_COND);
                } else {
                        /*
                         * Shared interrupts do not go well with disabling
                         * auto enable. The sharing interrupt might request
                         * it while it's still disabled and then wait for
                         * interrupts forever.
                         */
                        WARN_ON_ONCE(new->flags & IRQF_SHARED);
                        /* Undo nested disables: */
                        desc->depth = 1;
                }

        } else if (new->flags & IRQF_TRIGGER_MASK) {
                unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
                unsigned int omsk = irqd_get_trigger_type(&desc->irq_data);

                if (nmsk != omsk)
                        /* hope the handler works with current trigger mode */
                        pr_warn("irq %d uses trigger mode %u; requested %u\n",
                                irq, omsk, nmsk);
        }

        /* Link the new action in: first entry or end of the shared list */
        *old_ptr = new;

        irq_pm_install_action(desc, new);

        /* Reset broken irq detection when installing new handler */
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;

        /*
         * Check whether we disabled the irq via the spurious handler
         * before. Reenable it and give it another chance.
         */
        if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
                desc->istate &= ~IRQS_SPURIOUS_DISABLED;
                __enable_irq(desc);
        }

        raw_spin_unlock_irqrestore(&desc->lock, flags);
        chip_bus_sync_unlock(desc);
        mutex_unlock(&desc->request_mutex);

        irq_setup_timings(desc, new);

        /*
         * Start the threads only now, after all locks were dropped, and
         * wait until each of them has flagged itself ready.
         */
        wake_up_and_wait_for_irq_thread_ready(desc, new);
        wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);

        register_irq_proc(irq, desc);
        new->dir = NULL;
        register_handler_proc(irq, new);
        return 0;

mismatch:
        /* Reached with desc->lock held and @old pointing to the first action */
        if (!(new->flags & IRQF_PROBE_SHARED)) {
                pr_err("Flags mismatch irq %d. %08x (%s) vs. %08x (%s)\n",
                       irq, new->flags, new->name, old->flags, old->name);
#ifdef CONFIG_DEBUG_SHIRQ
                dump_stack();
#endif
        }
        ret = -EBUSY;

out_unlock:
        raw_spin_unlock_irqrestore(&desc->lock, flags);

        /* Give the resources back if no action is installed on the line */
        if (!desc->action)
                irq_release_resources(desc);
out_bus_unlock:
        chip_bus_sync_unlock(desc);
        mutex_unlock(&desc->request_mutex);

out_thread:
        /* Stop the threads created above and drop the task references */
        if (new->thread) {
                struct task_struct *t = new->thread;

                new->thread = NULL;
                kthread_stop_put(t);
        }
        if (new->secondary && new->secondary->thread) {
                struct task_struct *t = new->secondary->thread;

                new->secondary->thread = NULL;
                kthread_stop_put(t);
        }
out_mput:
        module_put(desc->owner);
        return ret;
}

/*
 * Internal function to unregister an irqaction - used to free
 * regular and special interrupts that are part of the architecture.
 *
 * @desc:   Interrupt descriptor to remove the action from
 * @dev_id: Device identity which selects the action to remove
 *
 * Returns the removed action, which the caller has to free, or NULL (after
 * a warning) when no action with a matching dev_id is installed. The
 * secondary action, if any, is freed here, so action->secondary must not
 * be dereferenced by the caller.
 *
 * desc->request_mutex is held from the lookup until the resources were
 * released; the bus lock and desc->lock are dropped in between.
 */
static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id)
{
        unsigned irq = desc->irq_data.irq;
        struct irqaction *action, **action_ptr;
        unsigned long flags;

        WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);

        mutex_lock(&desc->request_mutex);
        chip_bus_lock(desc);
        raw_spin_lock_irqsave(&desc->lock, flags);

        /*
         * There can be multiple actions per IRQ descriptor, find the right
         * one based on the dev_id:
         */
        action_ptr = &desc->action;
        for (;;) {
                action = *action_ptr;

                if (!action) {
                        /* End of list reached without a match: unwind all locks */
                        WARN(1, "Trying to free already-free IRQ %d\n", irq);
                        raw_spin_unlock_irqrestore(&desc->lock, flags);
                        chip_bus_sync_unlock(desc);
                        mutex_unlock(&desc->request_mutex);
                        return NULL;
                }

                if (action->dev_id == dev_id)
                        break;
                action_ptr = &action->next;
        }

        /* Found it - now remove it from the list of entries: */
        *action_ptr = action->next;

        irq_pm_remove_action(desc, action);

        /* If this was the last handler, shut down the IRQ line: */
        if (!desc->action) {
                irq_settings_clr_disable_unlazy(desc);
                /* Only shutdown. Deactivate after synchronize_hardirq() */
                irq_shutdown(desc);
        }

#ifdef CONFIG_SMP
        /* make sure affinity_hint is cleaned up */
        if (WARN_ON_ONCE(desc->affinity_hint))
                desc->affinity_hint = NULL;
#endif

        raw_spin_unlock_irqrestore(&desc->lock, flags);
        /*
         * Drop bus_lock here so the changes which were done in the chip
         * callbacks above are synced out to the irq chips which hang
         * behind a slow bus (I2C, SPI) before calling synchronize_hardirq().
         *
         * Aside of that the bus_lock can also be taken from the threaded
         * handler in irq_finalize_oneshot() which results in a deadlock
         * because kthread_stop() would wait forever for the thread to
         * complete, which is blocked on the bus lock.
         *
         * The still held desc->request_mutex() protects against a
         * concurrent request_irq() of this irq so the release of resources
         * and timing data is properly serialized.
         */
        chip_bus_sync_unlock(desc);

        unregister_handler_proc(irq, action);

        /*
         * Make sure it's not being used on another CPU and if the chip
         * supports it also make sure that there is no (not yet serviced)
         * interrupt in flight at the hardware level.
         */
        __synchronize_irq(desc);

#ifdef CONFIG_DEBUG_SHIRQ
        /*
         * It's a shared IRQ -- the driver ought to be prepared for an IRQ
         * event to happen even now it's being freed, so let's make sure that
         * is so by doing an extra call to the handler ....
         *
         * ( We do this after actually deregistering it, to make sure that a
         *   'real' IRQ doesn't run in parallel with our fake. )
         */
        if (action->flags & IRQF_SHARED) {
                local_irq_save(flags);
                action->handler(irq, dev_id);
                local_irq_restore(flags);
        }
#endif

        /*
         * The action has already been removed above, but the thread writes
         * its oneshot mask bit when it completes. Though request_mutex is
         * held across this which prevents __setup_irq() from handing out
         * the same bit to a newly requested action.
         */
        if (action->thread) {
                kthread_stop_put(action->thread);
                /* The secondary thread is only looked at when a primary thread exists */
                if (action->secondary && action->secondary->thread)
                        kthread_stop_put(action->secondary->thread);
        }

        /* Last action releases resources */
        if (!desc->action) {
                /*
                 * Reacquire bus lock as irq_release_resources() might
                 * require it to deallocate resources over the slow bus.
                 */
                chip_bus_lock(desc);
                /*
                 * There is no interrupt on the fly anymore. Deactivate it
                 * completely.
                 */
                raw_spin_lock_irqsave(&desc->lock, flags);
                irq_domain_deactivate_irq(&desc->irq_data);
                raw_spin_unlock_irqrestore(&desc->lock, flags);

                irq_release_resources(desc);
                chip_bus_sync_unlock(desc);
                irq_remove_timings(desc);
        }

        mutex_unlock(&desc->request_mutex);

        irq_chip_pm_put(&desc->irq_data);
        module_put(desc->owner);
        /* The action itself is handed back to the caller, only the secondary is freed */
        kfree(action->secondary);
        return action;
}

/**
 * free_irq - free an interrupt allocated with request_irq
 * @irq:        Interrupt line to free
 * @dev_id:     Device identity to free
 *
 * Remove an interrupt handler. The handler is removed and if the
 * interrupt line is no longer in use by any driver it is disabled.
 * On a shared IRQ the caller must ensure the interrupt is disabled
 * on the card it drives before calling this function. The function
 * does not return until any executing interrupts for this IRQ
 * have completed.
 *
 * This function must not be called from interrupt context.
 *
 * Returns the devname argument passed to request_irq, or NULL when
 * @irq has no descriptor, is a per CPU devid interrupt, or no action
 * matching @dev_id was installed.
 */
const void *free_irq(unsigned int irq, void *dev_id)
{
        struct irq_desc *desc = irq_to_desc(irq);
        const char *devname = NULL;
        struct irqaction *action;

        if (!desc)
                return NULL;
        if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
                return NULL;

#ifdef CONFIG_SMP
        /* A leftover affinity notifier is a caller bug: warn and clear it */
        if (WARN_ON(desc->affinity_notify))
                desc->affinity_notify = NULL;
#endif

        action = __free_irq(desc, dev_id);
        if (action) {
                /* Fetch the name before the action is released */
                devname = action->name;
                kfree(action);
        }

        return devname;
}
EXPORT_SYMBOL(free_irq);

/* This function must be called with desc->lock held */
static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc)
{
        const char *devname = NULL;

        desc->istate &= ~IRQS_NMI;

        if (!WARN_ON(desc->action == NULL)) {
                irq_pm_remove_action(desc, desc->action);
                devname = desc->action->name;
                unregister_handler_proc(irq, desc->action);

                kfree(desc->action);
                desc->action = NULL;
        }

        irq_settings_clr_disable_unlazy(desc);
        irq_shutdown_and_deactivate(desc);

        irq_release_resources(desc);

        irq_chip_pm_put(&desc->irq_data);
        module_put(desc->owner);

        return devname;
}

/*
 * free_nmi - release an interrupt line that was set up for NMI delivery
 * @irq: Interrupt line to free
 * @dev_id: Device identity (not examined by this function)
 *
 * Warns and returns NULL if the line is not marked as NMI or is a per-CPU
 * devid interrupt. If the NMI is still enabled (depth == 0), a warning is
 * issued and the NMI is disabled before the teardown runs.
 *
 * Returns the name of the released action, or NULL.
 */
const void *free_nmi(unsigned int irq, void *dev_id)
{
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
        const void *name;

        if (!desc)
                return NULL;

        if (WARN_ON(!irq_is_nmi(desc)))
                return NULL;

        if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
                return NULL;

        /* NMI still enabled */
        if (WARN_ON(desc->depth == 0))
                disable_nmi_nosync(irq);

        /* __cleanup_nmi() requires desc->lock to be held. */
        raw_spin_lock_irqsave(&desc->lock, flags);
        irq_nmi_teardown(desc);
        name = __cleanup_nmi(irq, desc);
        raw_spin_unlock_irqrestore(&desc->lock, flags);

        return name;
}

/**
 * request_threaded_irq - allocate an interrupt line
 * @irq: Interrupt line to allocate
 * @handler: Function to be called when the IRQ occurs.
 *           Primary handler for threaded interrupts.
 *           If handler is NULL and thread_fn != NULL
 *           the default primary handler is installed.
 * @thread_fn: Function called from the irq handler thread.
 *             If NULL, no irq thread is created.
 * @irqflags: Interrupt type flags
 * @devname: An ascii name for the claiming device
 * @dev_id: A cookie passed back to the handler function
 *
 * This call allocates interrupt resources and enables the interrupt line
 * and IRQ handling. From the point this call is made your handler function
 * may be invoked. Since your handler function must clear any interrupt the
 * board raises, you must take care both to initialise your hardware and to
 * set up the interrupt handler in the right order.
 *
 * If you want to set up a threaded irq handler for your device then you
 * need to supply @handler and @thread_fn. @handler is still called in hard
 * interrupt context and has to check whether the interrupt originates from
 * the device. If yes it needs to disable the interrupt on the device and
 * return IRQ_WAKE_THREAD which will wake up the handler thread and run
 * @thread_fn. This split handler design is necessary to support shared
 * interrupts.
 *
 * Dev_id must be globally unique. Normally the address of the device data
 * structure is used as the cookie. Since the handler receives this value
 * it makes sense to use it.
 *
 * If your interrupt is shared you must pass a non NULL dev_id as this is
 * required when freeing the interrupt.
 *
 * Flags:
 *
 * IRQF_SHARED          Interrupt is shared
 * IRQF_TRIGGER_*       Specify active edge(s) or level
 * IRQF_ONESHOT         Run thread_fn with interrupt line masked
 *
 * Returns 0 on success, -ENOTCONN if @irq is IRQ_NOTCONNECTED, -EINVAL for
 * an invalid flag combination, a missing descriptor, a line that cannot be
 * requested or missing handlers, -ENOMEM if the action cannot be allocated,
 * or the error reported by irq_chip_pm_get() or __setup_irq().
 */
int request_threaded_irq(unsigned int irq, irq_handler_t handler,
                         irq_handler_t thread_fn, unsigned long irqflags,
                         const char *devname, void *dev_id)
{
        const bool shared = !!(irqflags & IRQF_SHARED);
        const bool cond_suspend = !!(irqflags & IRQF_COND_SUSPEND);
        struct irqaction *action;
        struct irq_desc *desc;
        int retval;

        if (irq == IRQ_NOTCONNECTED)
                return -ENOTCONN;

        /*
         * Sanity-check: shared interrupts must pass in a real dev-ID,
         * otherwise we'll have trouble later trying to figure out
         * which interrupt is which (messes up the interrupt freeing
         * logic etc).
         *
         * Also shared interrupts do not go well with disabling auto enable.
         * The sharing interrupt might request it while it's still disabled
         * and then wait for interrupts forever.
         */
        if (shared && (!dev_id || (irqflags & IRQF_NO_AUTOEN)))
                return -EINVAL;

        /*
         * IRQF_COND_SUSPEND only makes sense for shared interrupts and
         * it cannot be set along with IRQF_NO_SUSPEND.
         */
        if (cond_suspend && (!shared || (irqflags & IRQF_NO_SUSPEND)))
                return -EINVAL;

        desc = irq_to_desc(irq);
        if (!desc)
                return -EINVAL;

        if (!irq_settings_can_request(desc))
                return -EINVAL;

        if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
                return -EINVAL;

        /* At least one of the two handlers has to be supplied. */
        if (!handler && !thread_fn)
                return -EINVAL;

        if (!handler)
                handler = irq_default_primary_handler;

        action = kzalloc(sizeof(*action), GFP_KERNEL);
        if (!action)
                return -ENOMEM;

        action->name = devname;
        action->dev_id = dev_id;
        action->flags = irqflags;
        action->handler = handler;
        action->thread_fn = thread_fn;

        retval = irq_chip_pm_get(&desc->irq_data);
        if (retval < 0) {
                kfree(action);
                return retval;
        }

        retval = __setup_irq(irq, desc, action);
        if (retval) {
                /* Undo the PM reference and release everything we own. */
                irq_chip_pm_put(&desc->irq_data);
                kfree(action->secondary);
                kfree(action);
                return retval;
        }

#ifdef CONFIG_DEBUG_SHIRQ_FIXME
        if (shared) {
                /*
                 * It's a shared IRQ -- the driver ought to be prepared for it
                 * to happen immediately, so let's make sure....
                 * We disable the irq to make sure that a 'real' IRQ doesn't
                 * run in parallel with our fake.
                 */
                unsigned long flags;

                disable_irq(irq);
                local_irq_save(flags);

                handler(irq, dev_id);

                local_irq_restore(flags);
                enable_irq(irq);
        }
#endif
        return 0;
}
EXPORT_SYMBOL(request_threaded_irq);

/**
 * request_any_context_irq - allocate an interrupt line
 * @irq: Interrupt line to allocate
 * @handler: Function to be called when the IRQ occurs.
 *           Threaded handler for threaded interrupts.
 * @flags: Interrupt type flags
 * @name: An ascii name for the claiming device
 * @dev_id: A cookie passed back to the handler function
 *
 * This call allocates interrupt resources and enables the interrupt line
 * and IRQ handling. It selects either a hardirq or threaded handling
 * method depending on the context: if the line is configured as a nested
 * thread, @handler is installed as the thread function, otherwise it is
 * requested via request_irq().
 *
 * On failure, it returns a negative value. On success, it returns either
 * IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
 */
int request_any_context_irq(unsigned int irq, irq_handler_t handler,
                            unsigned long flags, const char *name, void *dev_id)
{
        struct irq_desc *desc;
        int context, ret;

        if (irq == IRQ_NOTCONNECTED)
                return -ENOTCONN;

        desc = irq_to_desc(irq);
        if (!desc)
                return -EINVAL;

        if (irq_settings_is_nested_thread(desc)) {
                ret = request_threaded_irq(irq, NULL, handler,
                                           flags, name, dev_id);
                context = IRQC_IS_NESTED;
        } else {
                ret = request_irq(irq, handler, flags, name, dev_id);
                context = IRQC_IS_HARDIRQ;
        }

        /* Pass an error through unchanged; report the context on success. */
        return ret ? ret : context;
}
EXPORT_SYMBOL_GPL(request_any_context_irq);

/**
 * request_nmi - allocate an interrupt line for NMI delivery
 * @irq: Interrupt line to allocate
 * @handler: Function to be called when the NMI occurs. Must not be NULL.
 * @irqflags: Interrupt type flags
 * @name: An ascii name for the claiming device
 * @dev_id: A cookie passed back to the handler function
 *
 * This call allocates interrupt resources and enables the interrupt line
 * and IRQ handling. It sets up the IRQ line to be handled as an NMI.
 *
 * An interrupt line delivering NMIs cannot be shared and IRQ handling
 * cannot be threaded.
 *
 * Interrupt lines requested for NMI delivering must produce per cpu
 * interrupts and have auto enabling setting disabled.
 *
 * Dev_id must be globally unique. Normally the address of the device data
 * structure is used as the cookie. Since the handler receives this value
 * it makes sense to use it.
 *
 * If the interrupt line cannot be used to deliver NMIs, function will fail
 * and return a negative value.
 */
int request_nmi(unsigned int irq, irq_handler_t handler,
                unsigned long irqflags, const char *name, void *dev_id)
{
        struct irqaction *action;
        struct irq_desc *desc;
        unsigned long flags;
        int retval;

        if (irq == IRQ_NOTCONNECTED)
                return -ENOTCONN;

        /* NMI cannot be shared, used for Polling */
        if (irqflags & (IRQF_SHARED | IRQF_COND_SUSPEND | IRQF_IRQPOLL))
                return -EINVAL;

        /* A handler and IRQF_PERCPU are both mandatory. */
        if (!handler || !(irqflags & IRQF_PERCPU))
                return -EINVAL;

        desc = irq_to_desc(irq);
        if (!desc)
                return -EINVAL;

        /* Auto enabling must be off, either on the line or via the flag. */
        if (irq_settings_can_autoenable(desc) && !(irqflags & IRQF_NO_AUTOEN))
                return -EINVAL;

        if (!irq_settings_can_request(desc) ||
            WARN_ON(irq_settings_is_per_cpu_devid(desc)) ||
            !irq_supports_nmi(desc))
                return -EINVAL;

        action = kzalloc(sizeof(*action), GFP_KERNEL);
        if (!action)
                return -ENOMEM;

        action->name = name;
        action->dev_id = dev_id;
        action->handler = handler;
        action->flags = irqflags | IRQF_NO_THREAD | IRQF_NOBALANCING;

        retval = irq_chip_pm_get(&desc->irq_data);
        if (retval < 0)
                goto free_action;

        retval = __setup_irq(irq, desc, action);
        if (retval)
                goto put_pm;

        raw_spin_lock_irqsave(&desc->lock, flags);

        /* Setup NMI state */
        desc->istate |= IRQS_NMI;
        if (irq_nmi_setup(desc)) {
                /*
                 * __cleanup_nmi() releases the installed action and drops
                 * the PM reference, so no further unwinding happens here.
                 */
                __cleanup_nmi(irq, desc);
                retval = -EINVAL;
        }

        raw_spin_unlock_irqrestore(&desc->lock, flags);

        return retval;

put_pm:
        irq_chip_pm_put(&desc->irq_data);
free_action:
        kfree(action);

        return retval;
}

/*
 * enable_percpu_irq - enable a per-CPU interrupt on the current CPU
 * @irq: Interrupt line to enable
 * @type: Requested trigger type; IRQ_TYPE_NONE selects the line's default
 *
 * If a trigger type ends up being selected and programming it fails, a
 * warning is emitted and the interrupt is left disabled.
 */
void enable_percpu_irq(unsigned int irq, unsigned int type)
{
        unsigned int cpu = smp_processor_id();
        unsigned long flags;
        struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);

        if (!desc)
                return;

        /*
         * If the trigger type is not specified by the caller, then
         * use the default for this interrupt.
         */
        type &= IRQ_TYPE_SENSE_MASK;
        if (type == IRQ_TYPE_NONE)
                type = irqd_get_trigger_type(&desc->irq_data);

        if (type != IRQ_TYPE_NONE && __irq_set_trigger(desc, type)) {
                WARN(1, "failed to set type for IRQ%d\n", irq);
        } else {
                irq_percpu_enable(desc, cpu);
        }

        irq_put_desc_unlock(desc, flags);
}
EXPORT_SYMBOL_GPL(enable_percpu_irq);

/*
 * enable_percpu_nmi - NMI-named entry point for enabling a per-CPU line
 * @irq: Interrupt line to enable
 * @type: Trigger type, forwarded unchanged
 *
 * Simply forwards both arguments to enable_percpu_irq().
 */
void enable_percpu_nmi(unsigned int irq, unsigned int type)
{
        enable_percpu_irq(irq, type);
}

/**
 * irq_percpu_is_enabled - Check whether the per cpu irq is enabled
 * @irq:        Linux irq number to check for
 *
 * Must be called from a non migratable context.
 *
 * Returns: the enable state of the per cpu interrupt on the current cpu,
 * or false if no matching per cpu descriptor could be obtained.
 */
bool irq_percpu_is_enabled(unsigned int irq)
{
        unsigned int this_cpu = smp_processor_id();
        struct irq_desc *desc;
        unsigned long flags;
        bool enabled = false;

        desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
        if (desc) {
                enabled = cpumask_test_cpu(this_cpu, desc->percpu_enabled);
                irq_put_desc_unlock(desc, flags);
        }

        return enabled;
}
EXPORT_SYMBOL_GPL(irq_percpu_is_enabled);

/*
 * disable_percpu_irq - disable a per-CPU interrupt on the current CPU
 * @irq: Interrupt line to disable
 *
 * Does nothing if no matching per-CPU descriptor can be obtained.
 */
void disable_percpu_irq(unsigned int irq)
{
        unsigned int cpu = smp_processor_id();
        unsigned long flags;
        struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);

        if (desc) {
                irq_percpu_disable(desc, cpu);
                irq_put_desc_unlock(desc, flags);
        }
}
EXPORT_SYMBOL_GPL(disable_percpu_irq);

/*
 * disable_percpu_nmi - NMI-named entry point for disabling a per-CPU line
 * @irq: Interrupt line to disable
 *
 * Simply forwards @irq to disable_percpu_irq().
 */
void disable_percpu_nmi(unsigned int irq)
{
        disable_percpu_irq(irq);
}

/*
 * Internal function to unregister a percpu irqaction.
 *
 * Detaches the action installed on @irq when its percpu_dev_id matches
 * @dev_id and the interrupt is no longer enabled on any CPU. On success the
 * detached action is returned (the caller decides whether to free it);
 * otherwise a warning is emitted and NULL is returned.
 */
static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
        struct irq_desc *desc = irq_to_desc(irq);
        struct irqaction *action = NULL;
        unsigned long flags;

        WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);

        if (!desc)
                return NULL;

        raw_spin_lock_irqsave(&desc->lock, flags);

        if (!desc->action || desc->action->percpu_dev_id != dev_id) {
                WARN(1, "Trying to free already-free IRQ %d\n", irq);
        } else if (!cpumask_empty(desc->percpu_enabled)) {
                WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
                     irq, cpumask_first(desc->percpu_enabled));
        } else {
                /* Found it - now remove it from the list of entries: */
                action = desc->action;
                desc->action = NULL;

                desc->istate &= ~IRQS_NMI;
        }

        raw_spin_unlock_irqrestore(&desc->lock, flags);

        /* One of the checks above failed; nothing was detached. */
        if (!action)
                return NULL;

        unregister_handler_proc(irq, action);

        irq_chip_pm_put(&desc->irq_data);
        module_put(desc->owner);

        return action;
}

/**
 * remove_percpu_irq - free a per-cpu interrupt
 * @irq: Interrupt line to free
 * @act: irqaction for the interrupt
 *
 * Used to remove interrupts statically setup by the early boot process.
 * The action is detached but not freed. Nothing happens if @irq has no
 * descriptor or is not a per-cpu devid interrupt.
 */
void remove_percpu_irq(unsigned int irq, struct irqaction *act)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc || !irq_settings_is_per_cpu_devid(desc))
                return;

        __free_percpu_irq(irq, act->percpu_dev_id);
}

/**
 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
 * @irq: Interrupt line to free
 * @dev_id: Device identity to free
 *
 * Remove a percpu interrupt handler. The handler is removed, but the
 * interrupt line is not disabled. This must be done on each CPU before
 * calling this function. The function does not return until any executing
 * interrupts for this IRQ have completed.
 *
 * This function must not be called from interrupt context.
 */
void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
{
        struct irq_desc *desc = irq_to_desc(irq);
        struct irqaction *action;

        if (!desc)
                return;

        if (!irq_settings_is_per_cpu_devid(desc))
                return;

        chip_bus_lock(desc);
        action = __free_percpu_irq(irq, dev_id);
        /* action is NULL when nothing was detached; kfree() is still called. */
        kfree(action);
        chip_bus_sync_unlock(desc);
}
EXPORT_SYMBOL_GPL(free_percpu_irq);

/*
 * free_percpu_nmi - free a per-CPU interrupt line used for NMI delivery
 * @irq: Interrupt line to free
 * @dev_id: Per-CPU device identity the action was installed with
 *
 * Silently returns if @irq has no descriptor or is not a per-CPU devid
 * interrupt; warns and returns if the line is not marked as NMI.
 */
void free_percpu_nmi(unsigned int irq, void __percpu *dev_id)
{
        struct irq_desc *desc = irq_to_desc(irq);
        struct irqaction *action;

        if (!desc)
                return;

        if (!irq_settings_is_per_cpu_devid(desc))
                return;

        if (WARN_ON(!irq_is_nmi(desc)))
                return;

        action = __free_percpu_irq(irq, dev_id);
        kfree(action);
}

/**
 * setup_percpu_irq - setup a per-cpu interrupt
 * @irq: Interrupt line to setup
 * @act: irqaction for the interrupt
 *
 * Used to statically setup per-cpu interrupts in the early boot process.
 *
 * Returns 0 on success, -EINVAL if @irq has no descriptor or is not a
 * per-cpu devid interrupt, or the error reported by irq_chip_pm_get() or
 * __setup_irq().
 */
int setup_percpu_irq(unsigned int irq, struct irqaction *act)
{
        struct irq_desc *desc = irq_to_desc(irq);
        int err;

        if (!desc)
                return -EINVAL;

        if (!irq_settings_is_per_cpu_devid(desc))
                return -EINVAL;

        err = irq_chip_pm_get(&desc->irq_data);
        if (err < 0)
                return err;

        err = __setup_irq(irq, desc, act);
        /* Give the PM reference back if the action could not be installed. */
        if (err)
                irq_chip_pm_put(&desc->irq_data);

        return err;
}

/**
 * __request_percpu_irq - allocate a percpu interrupt line
 * @irq: Interrupt line to allocate
 * @handler: Function to be called when the IRQ occurs.
 * @flags: Interrupt type flags (IRQF_TIMER only)
 * @devname: An ascii name for the claiming device
 * @dev_id: A percpu cookie passed back to the handler function
 *
 * This call allocates interrupt resources and enables the interrupt on the
 * local CPU. If the interrupt is supposed to be enabled on other CPUs, it
 * has to be done on each CPU using enable_percpu_irq().
 *
 * Dev_id must be globally unique. It is a per-cpu variable, and the handler
 * gets called with the interrupted CPU's instance of that variable.
 *
 * Returns 0 on success, -EINVAL on invalid arguments or an unsuitable
 * line, -ENOMEM if the action cannot be allocated, or the error reported
 * by irq_chip_pm_get() or __setup_irq().
 */
int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
                         unsigned long flags, const char *devname,
                         void __percpu *dev_id)
{
        struct irqaction *action;
        struct irq_desc *desc;
        int retval;

        if (!dev_id)
                return -EINVAL;

        desc = irq_to_desc(irq);
        if (!desc)
                return -EINVAL;

        if (!irq_settings_can_request(desc) ||
            !irq_settings_is_per_cpu_devid(desc))
                return -EINVAL;

        /* The only flag a caller may pass is IRQF_TIMER. */
        if (flags && flags != IRQF_TIMER)
                return -EINVAL;

        action = kzalloc(sizeof(*action), GFP_KERNEL);
        if (!action)
                return -ENOMEM;

        action->name = devname;
        action->handler = handler;
        action->percpu_dev_id = dev_id;
        action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;

        retval = irq_chip_pm_get(&desc->irq_data);
        if (retval < 0)
                goto free_action;

        retval = __setup_irq(irq, desc, action);
        if (!retval)
                return 0;

        irq_chip_pm_put(&desc->irq_data);
free_action:
        kfree(action);

        return retval;
}
EXPORT_SYMBOL_GPL(__request_percpu_irq);

/**
 * request_percpu_nmi - allocate a percpu interrupt line for NMI delivery
 * @irq: Interrupt line to allocate
 * @handler: Function to be called when the IRQ occurs. Must not be NULL.
 * @name: An ascii name for the claiming device
 * @dev_id: A percpu cookie passed back to the handler function
 *
 * This call allocates interrupt resources for a per CPU NMI. Per CPU NMIs
 * have to be setup on each CPU by calling prepare_percpu_nmi() before
 * being enabled on the same CPU by using enable_percpu_nmi().
 *
 * Dev_id must be globally unique. It is a per-cpu variable, and the handler
 * gets called with the interrupted CPU's instance of that variable.
 *
 * Interrupt lines requested for NMI delivering should have auto enabling
 * setting disabled.
 *
 * If the interrupt line cannot be used to deliver NMIs, function will fail
 * returning a negative value.
 */
int request_percpu_nmi(unsigned int irq, irq_handler_t handler,
                       const char *name, void __percpu *dev_id)
{
        struct irqaction *action;
        struct irq_desc *desc;
        unsigned long flags;
        int retval;

        if (!handler)
                return -EINVAL;

        desc = irq_to_desc(irq);
        if (!desc)
                return -EINVAL;

        if (!irq_settings_can_request(desc) ||
            !irq_settings_is_per_cpu_devid(desc) ||
            irq_settings_can_autoenable(desc) ||
            !irq_supports_nmi(desc))
                return -EINVAL;

        /* The line cannot already be NMI */
        if (irq_is_nmi(desc))
                return -EINVAL;

        action = kzalloc(sizeof(*action), GFP_KERNEL);
        if (!action)
                return -ENOMEM;

        action->name = name;
        action->handler = handler;
        action->percpu_dev_id = dev_id;
        action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND | IRQF_NO_THREAD
                | IRQF_NOBALANCING;

        retval = irq_chip_pm_get(&desc->irq_data);
        if (retval < 0)
                goto free_action;

        retval = __setup_irq(irq, desc, action);
        if (retval)
                goto put_pm;

        /* Mark the line as NMI once the action is installed. */
        raw_spin_lock_irqsave(&desc->lock, flags);
        desc->istate |= IRQS_NMI;
        raw_spin_unlock_irqrestore(&desc->lock, flags);

        return 0;

put_pm:
        irq_chip_pm_put(&desc->irq_data);
free_action:
        kfree(action);

        return retval;
}

/**
 * prepare_percpu_nmi - performs CPU local setup for NMI delivery
 * @irq: Interrupt line to prepare for NMI delivery
 *
 * This call prepares an interrupt line to deliver NMI on the current CPU,
 * before that interrupt line gets enabled with enable_percpu_nmi().
 *
 * As a CPU local operation, this should be called from non-preemptible
 * context.
 *
 * If the interrupt line cannot be used to deliver NMIs, function will fail
 * returning a negative value.
 */
int prepare_percpu_nmi(unsigned int irq)
{
        struct irq_desc *desc;
        unsigned long flags;
        int ret;

        WARN_ON(preemptible());

        desc = irq_get_desc_lock(irq, &flags,
                                 IRQ_GET_DESC_CHECK_PERCPU);
        if (!desc)
                return -EINVAL;

        if (WARN(!irq_is_nmi(desc),
                 KERN_ERR "prepare_percpu_nmi called for a non-NMI interrupt: irq %u\n",
                 irq)) {
                ret = -EINVAL;
        } else {
                ret = irq_nmi_setup(desc);
                if (ret)
                        pr_err("Failed to setup NMI delivery: irq %u\n", irq);
        }

        irq_put_desc_unlock(desc, flags);
        return ret;
}

/**
 * teardown_percpu_nmi - undoes NMI setup of IRQ line
 * @irq: Interrupt line from which CPU local NMI configuration should be
 *       removed
 *
 * This call undoes the setup done by prepare_percpu_nmi().
 *
 * IRQ line should not be enabled for the current CPU.
 *
 * As a CPU local operation, this should be called from non-preemptible
 * context.
 */
void teardown_percpu_nmi(unsigned int irq)
{
        struct irq_desc *desc;
        unsigned long flags;

        WARN_ON(preemptible());

        desc = irq_get_desc_lock(irq, &flags,
                                 IRQ_GET_DESC_CHECK_PERCPU);
        if (!desc)
                return;

        /* Only lines marked as NMI are torn down; anything else just warns. */
        if (!WARN_ON(!irq_is_nmi(desc)))
                irq_nmi_teardown(desc);

        irq_put_desc_unlock(desc, flags);
}

/*
 * Query the irqchip state @which for @data, storing the result in @state.
 *
 * Starts at @data and, with CONFIG_IRQ_DOMAIN_HIERARCHY, follows
 * parent_data until a chip providing irq_get_irqchip_state() is found.
 *
 * Returns the callback's result, -ENODEV (with a one-time warning) if a
 * level has no chip, or -EINVAL if no chip provides the callback.
 */
static int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, bool *state)
{
        struct irq_chip *chip;

        do {
                chip = irq_data_get_irq_chip(data);
                if (WARN_ON_ONCE(!chip))
                        return -ENODEV;

                /* First chip that implements the callback answers. */
                if (chip->irq_get_irqchip_state)
                        return chip->irq_get_irqchip_state(data, which, state);

#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
                data = data->parent_data;
#else
                data = NULL;
#endif
        } while (data);

        return -EINVAL;
}

/**
 * irq_get_irqchip_state - returns the irqchip state of an interrupt.
 * @irq: Interrupt line that is forwarded to a VM
 * @which: One of IRQCHIP_STATE_* the caller wants to know about
 * @state: a pointer to a boolean where the state is to be stored
 *
 * This call snapshots the internal irqchip state of an interrupt,
 * returning into @state the bit corresponding to state @which.
 *
 * This function should be called with preemption disabled if the
 * interrupt controller has per-cpu registers.
 *
 * Returns -EINVAL if no descriptor can be obtained for @irq, otherwise
 * the result of the irqchip state lookup.
 */
int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
                          bool *state)
{
        struct irq_desc *desc;
        unsigned long flags;
        int err;

        desc = irq_get_desc_buslock(irq, &flags, 0);
        if (!desc)
                return -EINVAL;

        err = __irq_get_irqchip_state(irq_desc_get_irq_data(desc), which, state);

        irq_put_desc_busunlock(desc, flags);
        return err;
}
EXPORT_SYMBOL_GPL(irq_get_irqchip_state);

/**
 * irq_set_irqchip_state - set the state of a forwarded interrupt.
 * @irq: Interrupt line that is forwarded to a VM
 * @which: State to be restored (one of IRQCHIP_STATE_*)
 * @val: Value corresponding to @which
 *
 * This call sets the internal irqchip state of an interrupt, depending on
 * the value of @which.
 *
 * This function should be called with migration disabled if the interrupt
 * controller has per-cpu registers.
 *
 * Returns the callback's result, -ENODEV (with a one-time warning) if a
 * level has no chip, or -EINVAL if no descriptor can be obtained or no
 * chip provides irq_set_irqchip_state().
 */
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
                          bool val)
{
        struct irq_desc *desc;
        struct irq_data *data;
        struct irq_chip *chip;
        unsigned long flags;
        int err = -EINVAL;

        desc = irq_get_desc_buslock(irq, &flags, 0);
        if (!desc)
                return err;

        data = irq_desc_get_irq_data(desc);

        do {
                chip = irq_data_get_irq_chip(data);
                if (WARN_ON_ONCE(!chip)) {
                        err = -ENODEV;
                        break;
                }

                /* First chip that implements the callback handles it. */
                if (chip->irq_set_irqchip_state) {
                        err = chip->irq_set_irqchip_state(data, which, val);
                        break;
                }

#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
                data = data->parent_data;
#else
                data = NULL;
#endif
        } while (data);

        irq_put_desc_busunlock(desc, flags);
        return err;
}
EXPORT_SYMBOL_GPL(irq_set_irqchip_state);

/**
 * irq_has_action - Check whether an interrupt is requested
 * @irq:        The linux irq number
 *
 * The descriptor lookup and the check both run inside an RCU read-side
 * critical section.
 *
 * Returns: A snapshot of the current state
 */
bool irq_has_action(unsigned int irq)
{
        struct irq_desc *desc;
        bool requested;

        rcu_read_lock();
        desc = irq_to_desc(irq);
        requested = irq_desc_has_action(desc);
        rcu_read_unlock();

        return requested;
}
EXPORT_SYMBOL_GPL(irq_has_action);

/**
 * irq_check_status_bit - Check whether bits in the irq descriptor status are set
 * @irq:        The linux irq number
 * @bitmask:        The bitmask to evaluate
 *
 * Returns: True if one of the bits in @bitmask is set
 */
bool irq_check_status_bit(unsigned int irq, unsigned int bitmask)
{
        struct irq_desc *desc;
        bool res = false;

        rcu_read_lock();
        desc = irq_to_desc(irq);
        if (desc)
                res = !!(desc->status_use_accessors & bitmask);
        rcu_read_unlock();
        return res;
}
EXPORT_SYMBOL_GPL(irq_check_status_bit);




























    5 


















    5 


    3 
    5 

    5 
    1 

    1 








    2 
    5 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ratelimit.c - Do something with rate limit.
 *
 * Isolated from kernel/printk.c by Dave Young <hidave.darkstar@gmail.com>
 *
 * 2008-05-01 rewrite the function and use a ratelimit_state data struct as
 * parameter. Now every user can use their own standalone ratelimit_state.
 */

#include <linux/ratelimit.h>
#include <linux/jiffies.h>
#include <linux/export.h>

/*
 * ___ratelimit - rate limiting
 * @rs: ratelimit_state data
 * @func: name of calling function
 *
 * This enforces a rate limit: not more than @rs->burst callbacks
 * in every @rs->interval
 *
 * RETURNS:
 * 0 means callbacks will be suppressed.
 * 1 means go ahead and do it.
 */
int ___ratelimit(struct ratelimit_state *rs, const char *func)
{
        /*
         * Paired with WRITE_ONCE() in .proc_handler().
         * Changing two values separately could be inconsistent
         * and some message could be lost.  (See: net_ratelimit_state).
         */
        int interval = READ_ONCE(rs->interval);
        int burst = READ_ONCE(rs->burst);
        unsigned long flags;
        int allowed;

        /* A zero interval means nothing is ever suppressed. */
        if (!interval)
                return 1;

        /*
         * If we contend on this state's lock then almost
         * by definition we are too busy to print a message,
         * in addition to the one that will be printed by
         * the entity that is holding the lock already:
         */
        if (!raw_spin_trylock_irqsave(&rs->lock, flags))
                return 0;

        if (!rs->begin)
                rs->begin = jiffies;

        /* The current interval has elapsed: report misses and start over. */
        if (time_is_before_jiffies(rs->begin + interval)) {
                if (rs->missed && !(rs->flags & RATELIMIT_MSG_ON_RELEASE)) {
                        printk_deferred(KERN_WARNING
                                        "%s: %d callbacks suppressed\n",
                                        func, rs->missed);
                        rs->missed = 0;
                }
                rs->begin = jiffies;
                rs->printed = 0;
        }

        /* Allow the callback only while the burst budget is not used up. */
        allowed = burst && burst > rs->printed;
        if (allowed)
                rs->printed++;
        else
                rs->missed++;

        raw_spin_unlock_irqrestore(&rs->lock, flags);

        return allowed;
}
EXPORT_SYMBOL(___ratelimit);



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 







    3 















    3 



    3 

    3 








    3 







    3 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 
































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux INET6 implementation
 *        FIB front-end.
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 */

/*        Changes:
 *
 *        YOSHIFUJI Hideaki @USAGI
 *                reworked default router selection.
 *                - respect outgoing interface
 *                - select from (probably) reachable routers (i.e.
 *                routers in REACHABLE, STALE, DELAY or PROBE states).
 *                - always select the same router if it is (probably)
 *                reachable.  otherwise, round-robin the list.
 *        Ville Nuorvala
 *                Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <linux/siphash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
#include <linux/btf_ids.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

/*
 * Result of a next-hop neighbour check, produced by rt6_check_neigh() and
 * passed through by rt6_score_route().  Failure codes are negative so that
 * callers can tell them apart from a valid (non-negative) route score.
 */
enum rt6_nud_state {
        RT6_NUD_FAIL_HARD = -3,        /* find_match() rejects the nexthop outright */
        RT6_NUD_FAIL_PROBE = -2,        /* neighbour entry exists but is NUD_FAILED */
        RT6_NUD_FAIL_DO_RR = -1,        /* find_match() scores it 0 and asks for round-robin */
        RT6_NUD_SUCCEED = 1
};

/* Forward declarations: dst_ops callbacks referenced by the ops tables below. */
INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int         ip6_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int                ip6_mtu(const struct dst_entry *dst);
static void                ip6_negative_advice(struct sock *sk,
                                            struct dst_entry *dst);
static void                ip6_dst_destroy(struct dst_entry *);
static void                ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev);
static void                 ip6_dst_gc(struct dst_ops *ops);

/* Forward declarations: input/output handlers used by the reject-route templates. */
static int                ip6_pkt_discard(struct sk_buff *skb);
static int                ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int                ip6_pkt_prohibit(struct sk_buff *skb);
static int                ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void                ip6_link_failure(struct sk_buff *skb);
static void                ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void                rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
/* Forward declarations: route scoring, netlink dump and cache helpers. */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
                           int strict);
static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Forward declarations, only built when CONFIG_IPV6_ROUTE_INFO is set. */
static struct fib6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref);
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev);
#endif

/*
 * List of rt6_info entries linked through dst.rt_uncached.  Entries are
 * added by rt6_uncached_list_add() and removed by rt6_uncached_list_del()
 * or rt6_uncached_list_flush_dev(); all list modifications happen with
 * @lock held.
 */
struct uncached_list {
        spinlock_t                lock;        /* taken with _bh around list updates */
        struct list_head        head;        /* rt6_info entries, via dst.rt_uncached */
};

/* One uncached list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

/*
 * Queue @rt at the tail of the current CPU's uncached list and record
 * that list in rt->dst.rt_uncached_list, which is where
 * rt6_uncached_list_del() later finds the lock to take.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
        struct uncached_list *list;

        list = raw_cpu_ptr(&rt6_uncached_list);
        rt->dst.rt_uncached_list = list;

        spin_lock_bh(&list->lock);
        list_add_tail(&rt->dst.rt_uncached, &list->head);
        spin_unlock_bh(&list->lock);
}

/*
 * Unlink @rt from the uncached list it was queued on, if any.
 * A route whose rt_uncached node is empty is left alone.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
        struct uncached_list *list;

        /* Not queued (or already unlinked): nothing to do. */
        if (list_empty(&rt->dst.rt_uncached))
                return;

        list = rt->dst.rt_uncached_list;

        spin_lock_bh(&list->lock);
        list_del_init(&rt->dst.rt_uncached);
        spin_unlock_bh(&list->lock);
}

/*
 * Detach every uncached route that still references @dev.
 *
 * Walks the uncached list of each possible CPU.  For every entry whose
 * inet6_dev or dst device is @dev, that reference is switched over to
 * blackhole_netdev, and the entry is then unlinked from the list.
 */
static void rt6_uncached_list_flush_dev(struct net_device *dev)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
                struct rt6_info *rt, *safe;

                /* Unlocked peek: skip CPUs whose list looks empty. */
                if (list_empty(&ul->head))
                        continue;

                spin_lock_bh(&ul->lock);
                /* _safe variant because entries are unlinked while iterating. */
                list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
                        struct inet6_dev *rt_idev = rt->rt6i_idev;
                        struct net_device *rt_dev = rt->dst.dev;
                        bool handled = false;

                        /* Swap the inet6_dev reference: take the new one
                         * before dropping the old one.
                         */
                        if (rt_idev && rt_idev->dev == dev) {
                                rt->rt6i_idev = in6_dev_get(blackhole_netdev);
                                in6_dev_put(rt_idev);
                                handled = true;
                        }

                        /* Re-point the dst at blackhole_netdev and move the
                         * tracked device reference along with it.
                         */
                        if (rt_dev == dev) {
                                rt->dst.dev = blackhole_netdev;
                                netdev_ref_replace(rt_dev, blackhole_netdev,
                                                   &rt->dst.dev_tracker,
                                                   GFP_ATOMIC);
                                handled = true;
                        }
                        /* Only entries that referenced @dev leave the list. */
                        if (handled)
                                list_del_init(&rt->dst.rt_uncached);
                }
                spin_unlock_bh(&ul->lock);
        }
}

/*
 * Pick the address to use as neighbour lookup key, in order of preference:
 * @p if it is not the unspecified address, else the destination address
 * from @skb's IPv6 header when an skb is supplied, else @daddr as passed
 * by the caller (which may be NULL).
 */
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
                                             struct sk_buff *skb,
                                             const void *daddr)
{
        if (ipv6_addr_any(p))
                return skb ? (const void *)&ipv6_hdr(skb)->daddr : daddr;

        return (const void *)p;
}

/*
 * Find the neighbour entry on @dev for the address chosen by
 * choose_neigh_daddr(@gw, @skb, @daddr), creating one in nd_tbl when the
 * lookup misses.
 *
 * Returns the neighbour, or NULL if creation failed.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
                                   struct net_device *dev,
                                   struct sk_buff *skb,
                                   const void *daddr)
{
        struct neighbour *neigh;

        daddr = choose_neigh_daddr(gw, skb, daddr);

        neigh = __ipv6_neigh_lookup(dev, daddr);
        if (!neigh) {
                /* Miss: create an entry; errors are reported as NULL. */
                neigh = neigh_create(&nd_tbl, daddr, dev);
                if (IS_ERR(neigh))
                        neigh = NULL;
        }

        return neigh;
}

/*
 * dst_ops->neigh_lookup callback: resolve the neighbour for @dst on
 * dst->dev, keyed on the route's next hop as returned by rt6_nexthop()
 * (with @skb / @daddr as fallbacks, see choose_neigh_daddr()).
 */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
                                              struct sk_buff *skb,
                                              const void *daddr)
{
        const struct rt6_info *rt = dst_rt6_info(dst);
        const struct in6_addr *gw;

        gw = rt6_nexthop(rt, &in6addr_any);

        return ip6_neigh_lookup(gw, dst->dev, skb, daddr);
}

/*
 * dst_ops->confirm_neigh callback: confirm the neighbour entry on
 * dst->dev for the route's next hop (or @daddr when rt6_nexthop() yields
 * the unspecified address).
 *
 * Nothing is done when no address is available, when the device has
 * IFF_NOARP or IFF_LOOPBACK set, or when the address is multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rt6_info *rt = dst_rt6_info(dst);
        struct net_device *dev = dst->dev;
        const void *key;

        key = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);

        if (key &&
            !(dev->flags & (IFF_NOARP | IFF_LOOPBACK)) &&
            !ipv6_addr_is_multicast((const struct in6_addr *)key))
                __ipv6_confirm_neigh(dev, key);
}

/*
 * dst_ops callbacks for regular IPv6 routes.
 * NOTE(review): named a template; presumably copied into the per-netns
 * net->ipv6.ip6_dst_ops that ip6_dst_alloc() allocates from - confirm
 * against the netns init code.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                        =        AF_INET6,
        .gc                        =        ip6_dst_gc,
        .gc_thresh                =        1024,
        .check                        =        ip6_dst_check,
        .default_advmss                =        ip6_default_advmss,
        .mtu                        =        ip6_mtu,
        .cow_metrics                =        dst_cow_metrics_generic,
        .destroy                =        ip6_dst_destroy,
        .ifdown                        =        ip6_dst_ifdown,
        .negative_advice        =        ip6_negative_advice,
        .link_failure                =        ip6_link_failure,
        .update_pmtu                =        ip6_rt_update_pmtu,
        .redirect                =        rt6_do_redirect,
        .local_out                =        __ip6_local_out,
        .neigh_lookup                =        ip6_dst_neigh_lookup,
        .confirm_neigh                =        ip6_confirm_neigh,
};

/*
 * dst_ops for blackhole IPv6 dsts.  Shares the check/destroy/advmss/
 * neigh_lookup callbacks with the regular ops, but uses the generic
 * dst_blackhole_* helpers for PMTU updates, redirects and MTU, and sets
 * no gc, ifdown or link_failure callback.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                        = AF_INET6,
        .default_advmss                = ip6_default_advmss,
        .neigh_lookup                = ip6_dst_neigh_lookup,
        .check                        = ip6_dst_check,
        .destroy                = ip6_dst_destroy,
        .cow_metrics                = dst_cow_metrics_generic,
        .update_pmtu                = dst_blackhole_update_pmtu,
        .redirect                = dst_blackhole_redirect,
        .mtu                        = dst_blackhole_mtu,
};

/*
 * All-zero metrics array; the hop limit slot (index RTAX_HOPLIMIT - 1)
 * is spelled out explicitly, every other slot is zero by default
 * initialisation.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 0,
};

/*
 * Template for the "no route" FIB entry: an unreachable reject route with
 * no nexthop, the largest possible metric and the default metrics block.
 * NOTE(review): presumably duplicated into net->ipv6.fib6_null_entry at
 * netns init - confirm.
 */
static const struct fib6_info fib6_null_entry_template = {
        .fib6_flags        = (RTF_REJECT | RTF_NONEXTHOP),
        .fib6_protocol  = RTPROT_KERNEL,
        .fib6_metric        = ~(u32)0,
        .fib6_ref        = REFCOUNT_INIT(1),
        .fib6_type        = RTN_UNREACHABLE,
        /* const is cast away here; the field type is non-const */
        .fib6_metrics        = (struct dst_metrics *)&dst_default_metrics,
};

/*
 * Template dst for unreachable destinations: packets are handed to
 * ip6_pkt_discard()/ip6_pkt_discard_out() and the dst carries
 * -ENETUNREACH as its error.
 */
static const struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__rcuref        = RCUREF_INIT(1),
                .__use                = 1,
                .obsolete        = DST_OBSOLETE_FORCE_CHK,
                .error                = -ENETUNREACH,
                .input                = ip6_pkt_discard,
                .output                = ip6_pkt_discard_out,
        },
        .rt6i_flags        = (RTF_REJECT | RTF_NONEXTHOP),
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template dst for prohibited destinations (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): packets go to
 * ip6_pkt_prohibit()/ip6_pkt_prohibit_out() and the dst error is -EACCES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__rcuref        = RCUREF_INIT(1),
                .__use                = 1,
                .obsolete        = DST_OBSOLETE_FORCE_CHK,
                .error                = -EACCES,
                .input                = ip6_pkt_prohibit,
                .output                = ip6_pkt_prohibit_out,
        },
        .rt6i_flags        = (RTF_REJECT | RTF_NONEXTHOP),
};

/*
 * Template dst for blackhole routes (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): packets go to the generic
 * dst_discard()/dst_discard_out() handlers and the dst error is -EINVAL.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__rcuref        = RCUREF_INIT(1),
                .__use                = 1,
                .obsolete        = DST_OBSOLETE_FORCE_CHK,
                .error                = -EINVAL,
                .input                = dst_discard,
                .output                = dst_discard_out,
        },
        .rt6i_flags        = (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

/*
 * Zero the rt6_info fields that follow the embedded @dst member; the
 * dst_entry itself is left as the allocator set it up.
 */
static void rt6_info_init(struct rt6_info *rt)
{
        memset_after(rt, 0, dst);
}

/* allocate dst with ip6_dst_ops */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
                               int flags)
{
        struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
                                        DST_OBSOLETE_FORCE_CHK, flags);

        if (rt) {
                rt6_info_init(rt);
                atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
        }

        return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

/*
 * dst_ops->destroy callback: release everything an rt6_info holds -
 * its metrics, its slot on the uncached list, its inet6_dev reference
 * and the fib6_info it was created from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
        struct rt6_info *rt = dst_rt6_info(dst);
        struct inet6_dev *old_idev;
        struct fib6_info *origin;

        ip_dst_metrics_put(dst);
        rt6_uncached_list_del(rt);

        /* Clear the pointer before dropping the reference. */
        old_idev = rt->rt6i_idev;
        if (old_idev) {
                rt->rt6i_idev = NULL;
                in6_dev_put(old_idev);
        }

        /* Atomically take ownership of rt->from, then release it. */
        origin = unrcu_pointer(xchg(&rt->from, NULL));
        fib6_info_release(origin);
}

/*
 * dst_ops->ifdown callback: move the route's inet6_dev reference over to
 * blackhole_netdev (unless it already points there, or no blackhole
 * inet6_dev could be obtained) and drop the link to the originating
 * fib6_info.  @dev is not consulted.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
        struct rt6_info *rt = dst_rt6_info(dst);
        struct inet6_dev *cur_idev = rt->rt6i_idev;
        struct inet6_dev *bh_idev;
        struct fib6_info *origin;

        if (cur_idev && cur_idev->dev != blackhole_netdev) {
                bh_idev = in6_dev_get(blackhole_netdev);
                /* Keep the old reference if no replacement is available. */
                if (bh_idev) {
                        rt->rt6i_idev = bh_idev;
                        in6_dev_put(cur_idev);
                }
        }

        origin = unrcu_pointer(xchg(&rt->from, NULL));
        fib6_info_release(origin);
}

/*
 * True when @rt carries RTF_EXPIRES and its expiry time has passed.
 * Routes without RTF_EXPIRES never count as expired here.
 */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
        if (!(rt->rt6i_flags & RTF_EXPIRES))
                return false;

        return time_after(jiffies, rt->dst.expires);
}

/*
 * Decide whether @rt should be treated as expired.
 *
 * With RTF_EXPIRES set, only the route's own expiry time counts.
 * Otherwise, a route that still has an originating fib6_info is expired
 * when its dst is no longer DST_OBSOLETE_FORCE_CHK or when that
 * fib6_info itself has expired.  A route with neither is never expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
        struct fib6_info *origin = rcu_dereference(rt->from);

        if (rt->rt6i_flags & RTF_EXPIRES)
                return time_after(jiffies, rt->dst.expires);

        if (!origin)
                return false;

        if (rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
                return true;

        return fib6_check_expired(origin);
}

/*
 * Starting from the leaf of @rt's fib6_node, return the first route in
 * the fib6_next chain that has the same metric as @rt and qualifies for
 * ECMP.  Returns NULL when @rt has no node, the node has no leaf, or no
 * such route is found.  Pointers are fetched with rcu_dereference().
 */
static struct fib6_info *
rt6_multipath_first_sibling_rcu(const struct fib6_info *rt)
{
        struct fib6_node *node = rcu_dereference(rt->fib6_node);
        struct fib6_info *cur;

        if (!node)
                return NULL;

        for (cur = rcu_dereference(node->leaf);
             cur;
             cur = rcu_dereference(cur->fib6_next)) {
                if (cur->fib6_metric == rt->fib6_metric &&
                    rt6_qualify_for_ecmp(cur))
                        return cur;
        }

        return NULL;
}

/*
 * Choose one path of a multipath route for the flow @fl6.
 *
 * @res:            in: route found by the lookup (res->f6i); out: selected
 *                  route and nexthop (res->f6i / res->nh)
 * @fl6:            flow; fl6->mp_hash is computed here if still zero
 * @oif:            requested output interface, passed to rt6_score_route()
 * @have_oif_match: caller already matched the route against @oif
 * @skb:            optional packet; flagged IP6SKB_MULTIPATH when selection
 *                  goes past the early exits
 * @strict:         RT6_LOOKUP_F_* flags passed to rt6_score_route()
 *
 * Routes using a nexthop object are delegated to
 * nexthop_path_fib6_result().  For sibling-based ECMP the flow hash is
 * compared against each sibling's fib_nh_upper_bound; the first sibling
 * whose bound covers the hash is used if it scores >= 0, otherwise the
 * originally looked-up route is kept.
 */
void fib6_select_path(const struct net *net, struct fib6_result *res,
                      struct flowi6 *fl6, int oif, bool have_oif_match,
                      const struct sk_buff *skb, int strict)
{
        struct fib6_info *first, *match = res->f6i;
        struct fib6_info *sibling;
        int hash;

        /* Legacy route with nothing to choose between: keep it as is. */
        if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
                goto out;

        /* Nexthop object already resolved against oif: res stays untouched. */
        if (match->nh && have_oif_match && res->nh)
                return;

        if (skb)
                IP6CB(skb)->flags |= IP6SKB_MULTIPATH;

        /* We might have already computed the hash for ICMPv6 errors. In such
         * case it will always be non-zero. Otherwise now is the time to do it.
         */
        if (!fl6->mp_hash &&
            (!match->nh || nexthop_is_multipath(match->nh)))
                fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

        if (unlikely(match->nh)) {
                nexthop_path_fib6_result(res, fl6->mp_hash);
                return;
        }

        first = rt6_multipath_first_sibling_rcu(match);
        if (!first)
                goto out;

        hash = fl6->mp_hash;
        /* Hash falls into the first sibling's range. */
        if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) {
                if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
                                    strict) >= 0)
                        match = first;
                goto out;
        }

        list_for_each_entry_rcu(sibling, &first->fib6_siblings,
                                fib6_siblings) {
                const struct fib6_nh *nh = sibling->fib6_nh;
                int nh_upper_bound;

                nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
                if (hash > nh_upper_bound)
                        continue;
                /* Sibling owning this hash is unusable: keep 'match'. */
                if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
                        break;
                match = sibling;
                break;
        }

out:
        res->f6i = match;
        res->nh = match->fib6_nh;
}

/*
 *        Route lookup. rcu_read_lock() should be held.
 */

/*
 * Does nexthop @nh satisfy the device constraint of a lookup?
 *
 * A nexthop flagged RTNH_F_DEAD never matches.  With a non-zero @oif the
 * nexthop device's ifindex must equal it; without one, @saddr must be an
 * address accepted by ipv6_chk_addr() for the nexthop device (strictly
 * on that device when RT6_LOOKUP_F_IFACE is set in @flags).
 */
static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
                               const struct in6_addr *saddr, int oif, int flags)
{
        const struct net_device *nh_dev;

        if (nh->fib_nh_flags & RTNH_F_DEAD)
                return false;

        nh_dev = nh->fib_nh_dev;

        if (oif)
                return nh_dev->ifindex == oif;

        return ipv6_chk_addr(net, saddr, nh_dev,
                             flags & RT6_LOOKUP_F_IFACE) != 0;
}

/*
 * Argument bundle handed to __rt6_nh_dev_match() through
 * nexthop_for_each_fib6_nh(): the lookup constraints on the way in, the
 * most recently visited fib6_nh on the way out.
 */
struct fib6_nh_dm_arg {
        struct net                *net;
        const struct in6_addr        *saddr;
        int                        oif;
        int                        flags;
        struct fib6_nh                *nh;        /* out: last fib6_nh examined */
};

static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_dm_arg *arg = _arg;

        arg->nh = nh;
        return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
                                  arg->flags);
}

/* returns fib6_nh from nexthop or NULL */
static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
                                        struct fib6_result *res,
                                        const struct in6_addr *saddr,
                                        int oif, int flags)
{
        struct fib6_nh_dm_arg arg = {
                .net   = net,
                .saddr = saddr,
                .oif   = oif,
                .flags = flags,
        };

        if (nexthop_is_blackhole(nh))
                return NULL;

        if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
                return arg.nh;

        return NULL;
}

/*
 * Narrow a lookup result down to a route/nexthop that fits the requested
 * output interface @oif or source address @saddr.
 *
 * On entry res->f6i is the first candidate; the fib6_next chain starting
 * there is searched.  On return res->nh, res->fib6_type and
 * res->fib6_flags are filled in, and res->f6i may have been replaced by
 * a later route in the chain or by the netns fib6_null_entry.
 *
 * Blackhole nexthop objects take the out_blackhole exit, which marks the
 * result RTF_REJECT / RTN_BLACKHOLE without touching res->f6i.
 */
static void rt6_device_match(struct net *net, struct fib6_result *res,
                             const struct in6_addr *saddr, int oif, int flags)
{
        struct fib6_info *f6i = res->f6i;
        struct fib6_info *spf6i;
        struct fib6_nh *nh;

        /* No constraint at all: take the first route unless its nexthop is dead. */
        if (!oif && ipv6_addr_any(saddr)) {
                if (unlikely(f6i->nh)) {
                        nh = nexthop_fib6_nh(f6i->nh);
                        if (nexthop_is_blackhole(f6i->nh))
                                goto out_blackhole;
                } else {
                        nh = f6i->fib6_nh;
                }
                if (!(nh->fib_nh_flags & RTNH_F_DEAD))
                        goto out;
        }

        /* Search the chain for a route whose nexthop fits the constraint. */
        for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
                bool matched = false;

                if (unlikely(spf6i->nh)) {
                        nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
                                              oif, flags);
                        if (nh)
                                matched = true;
                } else {
                        nh = spf6i->fib6_nh;
                        if (__rt6_device_match(net, nh, saddr, oif, flags))
                                matched = true;
                }
                if (matched) {
                        res->f6i = spf6i;
                        goto out;
                }
        }

        /* Strict interface match requested and nothing fit: no route. */
        if (oif && flags & RT6_LOOKUP_F_IFACE) {
                res->f6i = net->ipv6.fib6_null_entry;
                nh = res->f6i->fib6_nh;
                goto out;
        }

        /* Non-strict lookup: fall back to the first route's nexthop. */
        if (unlikely(f6i->nh)) {
                nh = nexthop_fib6_nh(f6i->nh);
                if (nexthop_is_blackhole(f6i->nh))
                        goto out_blackhole;
        } else {
                nh = f6i->fib6_nh;
        }

        if (nh->fib_nh_flags & RTNH_F_DEAD) {
                res->f6i = net->ipv6.fib6_null_entry;
                nh = res->f6i->fib6_nh;
        }
out:
        res->nh = nh;
        res->fib6_type = res->f6i->fib6_type;
        res->fib6_flags = res->f6i->fib6_flags;
        return;

out_blackhole:
        res->fib6_flags |= RTF_REJECT;
        res->fib6_type = RTN_BLACKHOLE;
        res->nh = nh;
}

#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred router probe: allocated by rt6_probe(), consumed and freed by
 * rt6_probe_deferred().
 */
struct __rt6_probe_work {
        struct work_struct work;
        struct in6_addr target;                /* gateway address to solicit */
        struct net_device *dev;                /* device to send on; held via dev_tracker */
        netdevice_tracker dev_tracker;
};

static void rt6_probe_deferred(struct work_struct *w)
{
        struct in6_addr mcaddr;
        struct __rt6_probe_work *work =
                container_of(w, struct __rt6_probe_work, work);

        addrconf_addr_solict_mult(&work->target, &mcaddr);
        ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
        netdev_put(work->dev, &work->dev_tracker);
        kfree(work);
}

/*
 * Schedule a reachability probe (neighbour solicitation) for the gateway
 * of @fib6_nh, rate-limited by the device's rtr_probe_interval.
 *
 * Nothing happens for nexthops without a gateway, for devices without an
 * inet6_dev, or when the gateway's neighbour entry is already NUD_VALID.
 * The probe itself is sent later from rt6_probe_deferred() via the
 * workqueue.
 */
static void rt6_probe(struct fib6_nh *fib6_nh)
{
        struct __rt6_probe_work *work = NULL;
        const struct in6_addr *nh_gw;
        unsigned long last_probe;
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;

        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        if (!fib6_nh->fib_nh_gw_family)
                return;

        nh_gw = &fib6_nh->fib_nh_gw6;
        dev = fib6_nh->fib_nh_dev;
        rcu_read_lock();
        /* Snapshot used below as the expected value for cmpxchg(). */
        last_probe = READ_ONCE(fib6_nh->last_probe);
        idev = __in6_dev_get(dev);
        if (!idev)
                goto out;
        neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
        if (neigh) {
                /* Lockless fast path: a valid neighbour needs no probe. */
                if (READ_ONCE(neigh->nud_state) & NUD_VALID)
                        goto out;

                write_lock_bh(&neigh->lock);
                /* Re-check under the lock, and rate-limit on neigh->updated. */
                if (!(neigh->nud_state & NUD_VALID) &&
                    time_after(jiffies,
                               neigh->updated +
                               READ_ONCE(idev->cnf.rtr_probe_interval))) {
                        work = kmalloc(sizeof(*work), GFP_ATOMIC);
                        if (work)
                                __neigh_set_probe_once(neigh);
                }
                write_unlock_bh(&neigh->lock);
        } else if (time_after(jiffies, last_probe +
                                       READ_ONCE(idev->cnf.rtr_probe_interval))) {
                /* No neighbour entry: rate-limit on the nexthop's last_probe. */
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
        }

        /* Only the caller that wins the last_probe update queues the work;
         * on allocation failure or a lost race the work item is freed.
         */
        if (!work || cmpxchg(&fib6_nh->last_probe,
                             last_probe, jiffies) != last_probe) {
                kfree(work);
        } else {
                INIT_WORK(&work->work, rt6_probe_deferred);
                work->target = *nh_gw;
                /* Reference is dropped in rt6_probe_deferred(). */
                netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
                work->dev = dev;
                schedule_work(&work->work);
        }

out:
        rcu_read_unlock();
}
#else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF: no-op. */
static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif

/*
 * Default Router Selection (RFC 2461 6.3.6)
 */
/*
 * Classify the reachability of @fib6_nh's gateway from its neighbour
 * cache entry on the nexthop device.
 *
 * Neighbour found:
 *   - NUD_VALID                       -> RT6_NUD_SUCCEED
 *   - with CONFIG_IPV6_ROUTER_PREF:
 *       not NUD_FAILED                -> RT6_NUD_SUCCEED
 *       NUD_FAILED                    -> RT6_NUD_FAIL_PROBE
 *   - without CONFIG_IPV6_ROUTER_PREF -> RT6_NUD_FAIL_HARD (initial value)
 * No neighbour entry:
 *   - RT6_NUD_SUCCEED with CONFIG_IPV6_ROUTER_PREF, else RT6_NUD_FAIL_DO_RR
 */
static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
        enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
        struct neighbour *neigh;

        rcu_read_lock();
        neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
                                          &fib6_nh->fib_nh_gw6);
        if (neigh) {
                /* Single lockless read so all tests see the same state. */
                u8 nud_state = READ_ONCE(neigh->nud_state);

                if (nud_state & NUD_VALID)
                        ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
                else if (!(nud_state & NUD_FAILED))
                        ret = RT6_NUD_SUCCEED;
                else
                        ret = RT6_NUD_FAIL_PROBE;
#endif
        } else {
                ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
                      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
        }
        rcu_read_unlock();

        return ret;
}

/*
 * Score nexthop @nh for a lookup.
 *
 * The base score is 2 when no @oif was requested or the nexthop device
 * matches it, 0 otherwise; with CONFIG_IPV6_ROUTER_PREF the decoded
 * router preference from @fib6_flags is OR-ed in, shifted left by two.
 *
 * Returns the score (>= 0), RT6_NUD_FAIL_HARD when the device does not
 * match and RT6_LOOKUP_F_IFACE is set in @strict, or the negative
 * rt6_check_neigh() result when RT6_LOOKUP_F_REACHABLE is set and the
 * gateway check fails.
 */
static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
                           int strict)
{
        bool dev_ok = !oif || nh->fib_nh_dev->ifindex == oif;
        int score = dev_ok ? 2 : 0;
        int nud;

        if (!dev_ok && (strict & RT6_LOOKUP_F_IFACE))
                return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
        score |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
        /* Reachability only matters when asked for ... */
        if (!(strict & RT6_LOOKUP_F_REACHABLE))
                return score;

        /* ... and only for routes that actually go through a gateway. */
        if ((fib6_flags & RTF_NONEXTHOP) || !nh->fib_nh_gw_family)
                return score;

        nud = rt6_check_neigh(nh);

        return nud < 0 ? nud : score;
}

/* Decide whether @nh beats the best score seen so far (*@mpri).
 *
 * Dead nexthops are skipped, as are link-down ones when the device is
 * configured to ignore them and RT6_LOOKUP_F_IGNORE_LINKSTATE is not set.
 * On a better score, *@mpri is raised, *@do_rr records whether the score
 * came from RT6_NUD_FAIL_DO_RR, and true is returned.
 */
static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
                       int oif, int strict, int *mpri, bool *do_rr)
{
        bool want_rr = false;
        int score;

        if (nh->fib_nh_flags & RTNH_F_DEAD)
                return false;

        if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
            (nh->fib_nh_flags & RTNH_F_LINKDOWN) &&
            !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
                return false;

        score = rt6_score_route(nh, fib6_flags, oif, strict);
        if (score == RT6_NUD_FAIL_HARD)
                return false;

        if (score == RT6_NUD_FAIL_DO_RR) {
                want_rr = true;
                score = 0; /* lowest valid score */
        }

        if (strict & RT6_LOOKUP_F_REACHABLE)
                rt6_probe(nh);

        /* note that score can be RT6_NUD_FAIL_PROBE at this point */
        if (score <= *mpri)
                return false;

        *do_rr = want_rr;
        *mpri = score;

        return true;
}

/* Argument bundle handed to rt6_nh_find_match() through
 * nexthop_for_each_fib6_nh(); it carries the find_match() parameters and
 * records the nexthop most recently examined.
 */
struct fib6_nh_frl_arg {
        u32                flags;	/* fib6_flags of the route being scored */
        int                oif;	/* oif forwarded to find_match() */
        int                strict;	/* strict flags forwarded to find_match() */
        int                *mpri;	/* best score so far, updated by find_match() */
        bool                *do_rr;	/* round-robin request, updated by find_match() */
        struct fib6_nh        *nh;	/* set by rt6_nh_find_match() to the nexthop it was called with */
};

static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_frl_arg *arg = _arg;

        arg->nh = nh;
        return find_match(nh, arg->flags, arg->oif, arg->strict,
                          arg->mpri, arg->do_rr);
}

/* Walk the route list starting at @f6i_start, stopping before @nomatch,
 * and record in @res the best nexthop according to find_match().
 *
 * When @cont is non-NULL the walk stops at the first entry whose metric
 * differs from @metric and stores that entry in *@cont.  @mpri and @do_rr
 * are handed to find_match(), which updates them on a better score.  A
 * blackhole nexthop object ends the walk at once with a reject result.
 */
static void __find_rr_leaf(struct fib6_info *f6i_start,
                           struct fib6_info *nomatch, u32 metric,
                           struct fib6_result *res, struct fib6_info **cont,
                           int oif, int strict, bool *do_rr, int *mpri)
{
        struct fib6_info *f6i;

        for (f6i = f6i_start;
             f6i && f6i != nomatch;
             f6i = rcu_dereference(f6i->fib6_next)) {
                bool matched = false;
                struct fib6_nh *nh;

                /* Different metric: hand the entry back and stop here */
                if (cont && f6i->fib6_metric != metric) {
                        *cont = f6i;
                        return;
                }

                if (fib6_check_expired(f6i))
                        continue;

                if (unlikely(f6i->nh)) {
                        /* Route uses a nexthop object: score each fib6_nh */
                        struct fib6_nh_frl_arg arg = {
                                .flags  = f6i->fib6_flags,
                                .oif    = oif,
                                .strict = strict,
                                .mpri   = mpri,
                                .do_rr  = do_rr
                        };

                        if (nexthop_is_blackhole(f6i->nh)) {
                                res->fib6_flags = RTF_REJECT;
                                res->fib6_type = RTN_BLACKHOLE;
                                res->f6i = f6i;
                                res->nh = nexthop_fib6_nh(f6i->nh);
                                return;
                        }
                        if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
                                                     &arg)) {
                                matched = true;
                                /* arg.nh was set by rt6_nh_find_match() */
                                nh = arg.nh;
                        }
                } else {
                        nh = f6i->fib6_nh;
                        if (find_match(nh, f6i->fib6_flags, oif, strict,
                                       mpri, do_rr))
                                matched = true;
                }
                /* A later entry with a better score overwrites this result */
                if (matched) {
                        res->f6i = f6i;
                        res->nh = nh;
                        res->fib6_flags = f6i->fib6_flags;
                        res->fib6_type = f6i->fib6_type;
                }
        }
}

/* Search the routes of a node for the best match, in up to three passes:
 * from @rr_head to the end of the list, then from @leaf up to @rr_head,
 * and finally, if nothing matched, from the first entry that had a
 * different metric than @rr_head.
 */
static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
                         struct fib6_info *rr_head, int oif, int strict,
                         bool *do_rr, struct fib6_result *res)
{
        const u32 rr_metric = rr_head->fib6_metric;
        struct fib6_info *other_metric = NULL;
        int best = -1;

        /* Pass 1: round-robin head onwards */
        __find_rr_leaf(rr_head, NULL, rr_metric, res, &other_metric,
                       oif, strict, do_rr, &best);

        /* Pass 2: entries in front of the round-robin head */
        __find_rr_leaf(leaf, rr_head, rr_metric, res, &other_metric,
                       oif, strict, do_rr, &best);

        /* Pass 3: only when still unmatched and another metric exists */
        if (!res->f6i && other_metric)
                __find_rr_leaf(other_metric, NULL, rr_metric, res, NULL,
                               oif, strict, do_rr, &best);
}

/* Select the best route stored at node @fn for @oif / @strict and fill
 * in @res.  If nothing matches, @res is set to the namespace's
 * fib6_null_entry.  When find_rr_leaf() requests round-robin, fn->rr_ptr
 * is advanced to the next route under the table lock.
 */
static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
                       struct fib6_result *res, int strict)
{
        struct fib6_info *leaf = rcu_dereference(fn->leaf);
        struct fib6_info *rt0;
        bool do_rr = false;
        int key_plen;

        /* make sure this function or its helpers sets f6i */
        res->f6i = NULL;

        if (!leaf || leaf == net->ipv6.fib6_null_entry)
                goto out;

        /* Start from the round-robin pointer, or the leaf if it is unset */
        rt0 = rcu_dereference(fn->rr_ptr);
        if (!rt0)
                rt0 = leaf;

        /* Double check to make sure fn is not an intermediate node
         * and fn->leaf does not point to its child's leaf
         * (This might happen if all routes under fn are deleted from
         * the tree and fib6_repair_tree() is called on the node.)
         */
        key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
        if (rt0->fib6_src.plen)
                key_plen = rt0->fib6_src.plen;
#endif
        if (fn->fn_bit != key_plen)
                goto out;

        find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
        if (do_rr) {
                struct fib6_info *next = rcu_dereference(rt0->fib6_next);

                /* no entries matched; do round-robin */
                if (!next || next->fib6_metric != rt0->fib6_metric)
                        next = leaf;

                if (next != rt0) {
                        spin_lock_bh(&leaf->fib6_table->tb6_lock);
                        /* make sure next is not being deleted from the tree */
                        if (next->fib6_node)
                                rcu_assign_pointer(fn->rr_ptr, next);
                        spin_unlock_bh(&leaf->fib6_table->tb6_lock);
                }
        }

out:
        /* Nothing selected: fall back to the null entry */
        if (!res->f6i) {
                res->f6i = net->ipv6.fib6_null_entry;
                res->nh = res->f6i->fib6_nh;
                res->fib6_flags = res->f6i->fib6_flags;
                res->fib6_type = res->f6i->fib6_type;
        }
}

static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
        return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
               res->nh->fib_nh_gw_family;
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Handle a received route information option @opt of @len bytes that
 * arrived on @dev from gateway @gwaddr.
 *
 * After validating the option, the matching route is looked up and then
 * deleted (zero lifetime), updated, or created, and its expiry is set
 * from the lifetime.  Returns 0 on success or -EINVAL for a malformed
 * option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct route_info *rinfo = (struct route_info *) opt;
        struct net *net = dev_net(dev);
        struct in6_addr prefix_buf, *prefix;
        struct fib6_table *table;
        unsigned long lifetime;
        struct fib6_info *rt;
        unsigned int pref;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length != 3) {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        } else {
                prefix = (struct in6_addr *)rinfo->prefix;
        }

        if (rinfo->prefix_len != 0)
                rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
                                        gwaddr, dev);
        else
                rt = rt6_get_dflt_router(net, gwaddr, dev);

        if (rt) {
                if (!lifetime) {
                        /* Zero lifetime withdraws an existing route */
                        ip6_del_rt(net, rt, false);
                        rt = NULL;
                } else {
                        rt->fib6_flags = RTF_ROUTEINFO |
                                         (rt->fib6_flags & ~RTF_PREF_MASK) |
                                         RTF_PREF(pref);
                }
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                        dev, pref);
        }

        if (!rt)
                return 0;

        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (addrconf_finite_timeout(lifetime)) {
                fib6_set_expires(rt, jiffies + HZ * lifetime);
                fib6_add_gc_list(rt);
        } else {
                fib6_clean_expires(rt);
                fib6_remove_gc_list(rt);
        }

        spin_unlock_bh(&table->tb6_lock);

        fib6_info_release(rt);

        return 0;
}
#endif

/*
 *        Misc support functions
 */

/* Pick the device a dst created from @res should use.
 * Called with rcu_lock held.
 */
static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
        struct net_device *dev = res->nh->fib_nh_dev;

        if (!(res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)))
                return dev;

        /* for copies of local routes, dst->dev needs to be the
         * device if it is a master device, the master device if
         * device is enslaved, and the loopback as the default
         */
        if (netif_is_l3_slave(dev) &&
            !rt6_need_strict(&res->f6i->fib6_dst.addr))
                return l3mdev_master_dev_rcu(dev);

        /* an l3 master device is returned as-is */
        if (netif_is_l3_master(dev))
                return dev;

        return dev_net(dev)->loopback_dev;
}

/* Error code associated with each route type (RTN_*), indexed by type.
 * Looked up through ip6_rt_type_to_error(); the types mapped to 0 carry
 * no error.
 */
static const int fib6_prop[RTN_MAX + 1] = {
        [RTN_UNSPEC]        = 0,
        [RTN_UNICAST]        = 0,
        [RTN_LOCAL]        = 0,
        [RTN_BROADCAST]        = 0,
        [RTN_ANYCAST]        = 0,
        [RTN_MULTICAST]        = 0,
        [RTN_BLACKHOLE]        = -EINVAL,
        [RTN_UNREACHABLE] = -EHOSTUNREACH,
        [RTN_PROHIBIT]        = -EACCES,
        [RTN_THROW]        = -EAGAIN,
        [RTN_NAT]        = -EINVAL,
        [RTN_XRESOLVE]        = -EINVAL,
};

/* Translate route type @fib6_type into its fib6_prop[] error code.
 * The index is not range-checked here.
 */
static int ip6_rt_type_to_error(u8 fib6_type)
{
        const int *entry = &fib6_prop[fib6_type];

        return *entry;
}

/* Build the DST_* flag mask implied by @rt's dst_nocount and
 * dst_nopolicy settings.
 */
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
{
        unsigned short dst_flags = rt->dst_nocount ? DST_NOCOUNT : 0;

        if (rt->dst_nopolicy)
                dst_flags |= DST_NOPOLICY;

        return dst_flags;
}

/* Set up @rt as a reject dst for route type @fib6_type: store the
 * matching error and install the input/output handlers for that type.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
        rt->dst.error = ip6_rt_type_to_error(fib6_type);

        if (fib6_type == RTN_BLACKHOLE) {
                rt->dst.output = dst_discard_out;
                rt->dst.input = dst_discard;
        } else if (fib6_type == RTN_PROHIBIT) {
                rt->dst.output = ip6_pkt_prohibit_out;
                rt->dst.input = ip6_pkt_prohibit;
        } else {
                /* RTN_THROW, RTN_UNREACHABLE and every other type */
                rt->dst.output = ip6_pkt_discard_out;
                rt->dst.input = ip6_pkt_discard;
        }
}

/* Initialise the dst part of @rt from lookup result @res.
 *
 * Reject routes are delegated to ip6_rt_init_dst_reject().  Otherwise
 * the output handler is ip6_output and the input handler depends on the
 * route type and on whether the destination is multicast.  A lightweight
 * tunnel state on the nexthop, if any, is attached to the dst.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
        struct fib6_info *f6i = res->f6i;

        if (res->fib6_flags & RTF_REJECT) {
                ip6_rt_init_dst_reject(rt, res->fib6_type);
                return;
        }

        rt->dst.error = 0;
        rt->dst.output = ip6_output;

        switch (res->fib6_type) {
        case RTN_LOCAL:
        case RTN_ANYCAST:
                rt->dst.input = ip6_input;
                break;
        default:
                if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST)
                        rt->dst.input = ip6_mc_input;
                else
                        rt->dst.input = ip6_forward;
                break;
        }

        if (res->nh->fib_nh_lws) {
                rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
                lwtunnel_set_redirect(&rt->dst);
        }

        rt->dst.lastuse = jiffies;
}

/* Link @rt to the fib6_info @from it was created from.
 * Caller must already hold reference to @from; no reference is taken
 * here.
 */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
        /* Clear RTF_EXPIRES on the copy */
        rt->rt6i_flags &= ~RTF_EXPIRES;
        /* Publish the back-pointer for RCU readers of rt->from */
        rcu_assign_pointer(rt->from, from);
        /* Initialise the dst metrics from @from's metrics */
        ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}

/* Fill a freshly allocated @rt from lookup result @res: dst handlers,
 * destination (and, with subtrees, source) key, inet6 device, flags and
 * gateway, and the back-pointer to the originating fib6_info.
 * Caller must already hold reference to f6i in result.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
        struct fib6_info *origin = res->f6i;
        const struct fib6_nh *nexthop = res->nh;
        const struct net_device *nh_dev = nexthop->fib_nh_dev;

        ip6_rt_init_dst(rt, res);

        rt->rt6i_dst = origin->fib6_dst;
        if (nh_dev)
                rt->rt6i_idev = in6_dev_get(nh_dev);
        else
                rt->rt6i_idev = NULL;

        rt->rt6i_flags = res->fib6_flags;
        if (nexthop->fib_nh_gw_family) {
                rt->rt6i_flags |= RTF_GATEWAY;
                rt->rt6i_gateway = nexthop->fib_nh_gw6;
        }

        rt6_set_from(rt, origin);
#ifdef CONFIG_IPV6_SUBTREES
        rt->rt6i_src = origin->fib6_src;
#endif
}

/* Climb from @fn towards the root until a node flagged RTN_RTINFO is
 * found.  When the parent has a source subtree other than @fn, the
 * subtree is searched for @saddr instead of stepping to the parent.
 * Returns NULL once a node flagged RTN_TL_ROOT is reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
                                        struct in6_addr *saddr)
{
        for (;;) {
                struct fib6_node *parent, *sub;

                if (fn->fn_flags & RTN_TL_ROOT)
                        return NULL;

                parent = rcu_dereference(fn->parent);
                sub = FIB6_SUBTREE(parent);
                fn = (sub && sub != fn) ?
                     fib6_node_lookup(sub, NULL, saddr) : parent;

                if (fn->fn_flags & RTN_RTINFO)
                        return fn;
        }
}

/* Try to take a reference on the route in *@prt.
 *
 * Returns true when the reference was obtained.  Otherwise returns false
 * and replaces *@prt with @net's ip6_null_entry (with a reference held),
 * or with NULL when @net is NULL.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
        struct rt6_info *fallback = NULL;

        if (dst_hold_safe(&(*prt)->dst))
                return true;

        if (net) {
                fallback = net->ipv6.ip6_null_entry;
                dst_hold(&fallback->dst);
        }
        *prt = fallback;

        return false;
}

/* Allocate a new rt6_info for lookup result @res.
 *
 * If the fib6_info cannot be held or the allocation fails, the
 * namespace's ip6_null_entry is returned instead, with a reference held.
 * Called with rcu_lock held.
 */
static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
        struct net_device *dev = res->nh->fib_nh_dev;
        struct fib6_info *f6i = res->f6i;
        struct rt6_info *nrt;

        if (fib6_info_hold_safe(f6i)) {
                unsigned short dst_flags = fib6_info_dst_flags(f6i);

                nrt = ip6_dst_alloc(dev_net(dev), dev, dst_flags);
                if (nrt) {
                        ip6_rt_copy_init(nrt, res);
                        return nrt;
                }

                /* Allocation failed: drop the reference taken above */
                fib6_info_release(f6i);
        }

        nrt = dev_net(dev)->ipv6.ip6_null_entry;
        dst_hold(&nrt->dst);

        return nrt;
}

/* Look up a route for @fl6 in @table under rcu_read_lock().
 *
 * Starting at the node found for the flow's addresses, the function
 * backtracks through the tree until rt6_device_match() yields something
 * other than the null entry.  The result is a cached exception route if
 * one exists, otherwise a newly created rt6_info; the namespace's
 * ip6_null_entry is returned when backtracking reaches the top.  The
 * returned route carries a reference.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct fib6_result res = {};
        struct fib6_node *fn;
        struct rt6_info *rt;

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        res.f6i = rcu_dereference(fn->leaf);
        if (!res.f6i)
                res.f6i = net->ipv6.fib6_null_entry;
        else
                rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
                                 flags);

        if (res.f6i == net->ipv6.fib6_null_entry) {
                /* No usable route at this node: retry higher up the tree */
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;

                rt = net->ipv6.ip6_null_entry;
                dst_hold(&rt->dst);
                goto out;
        } else if (res.fib6_flags & RTF_REJECT) {
                /* Reject routes skip path selection and the exception table */
                goto do_create;
        }

        fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
                         fl6->flowi6_oif != 0, skb, flags);

        /* Search through exception table */
        rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
        if (rt) {
                if (ip6_hold_safe(net, &rt))
                        dst_use_noref(&rt->dst, jiffies);
        } else {
do_create:
                rt = ip6_create_rt_rcu(&res);
        }

out:
        trace_fib6_table_lookup(net, &res, table, fl6);

        rcu_read_unlock();

        return rt;
}

/* Run a policy-rule lookup for @fl6 using ip6_pol_route_lookup() as the
 * per-table lookup function and return the resulting dst.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags)
{
        struct dst_entry *dst;

        dst = fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int strict)
{
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .daddr = *daddr,
        };
        struct dst_entry *dst;
        int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

        if (saddr) {
                memcpy(&fl6.saddr, saddr, sizeof(*saddr));
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        }

        dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
        if (dst->error == 0)
                return dst_rt6_info(dst);

        dst_release(dst);

        return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason,
 * the route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its own table (rt->fib6_table) with the table lock
 * held around fib6_add().  Returns the fib6_add() result.
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
                        struct netlink_ext_ack *extack)
{
        struct fib6_table *tb = rt->fib6_table;
        int rc;

        spin_lock_bh(&tb->tb6_lock);
        rc = fib6_add(&tb->tb6_root, rt, info, extack);
        spin_unlock_bh(&tb->tb6_lock);

        return rc;
}

int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
        struct nl_info info = {        .nl_net = net, };

        return __ip6_ins_rt(rt, &info, NULL);
}

/* Clone the route in @res into an RTF_CACHE host route (/128) for
 * @daddr.
 *
 * For routes without a gateway that are not RTF_NONEXTHOP, the clone is
 * marked RTF_ANYCAST when @daddr equals a non-/128 route prefix, and
 * (with subtrees) the source key is narrowed to @saddr.  Returns NULL if
 * the fib6_info cannot be held or the allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        struct fib6_info *f6i = res->f6i;
        struct net_device *dev;
        struct rt6_info *clone;

        if (!fib6_info_hold_safe(f6i))
                return NULL;

        dev = ip6_rt_get_dev_rcu(res);
        clone = ip6_dst_alloc(dev_net(dev), dev, 0);
        if (!clone) {
                fib6_info_release(f6i);
                return NULL;
        }

        ip6_rt_copy_init(clone, res);
        clone->rt6i_flags |= RTF_CACHE;
        clone->rt6i_dst.addr = *daddr;
        clone->rt6i_dst.plen = 128;

        if (rt6_is_gw_or_nonexthop(res))
                return clone;

        if (f6i->fib6_dst.plen != 128 &&
            ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
                clone->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
        if (clone->rt6i_src.plen && saddr) {
                clone->rt6i_src.addr = *saddr;
                clone->rt6i_src.plen = 128;
        }
#endif

        return clone;
}

/* Allocate an RTF_PCPU copy of the route in @res.  The dst is always
 * allocated with DST_NOCOUNT.  For routes using a nexthop object the
 * current IPv6 genid is stored in sernum.  Returns NULL if the fib6_info
 * cannot be held or the allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
        struct fib6_info *f6i = res->f6i;
        unsigned short dst_flags = fib6_info_dst_flags(f6i);
        struct rt6_info *copy;
        struct net_device *dev;

        if (!fib6_info_hold_safe(f6i))
                return NULL;

        rcu_read_lock();
        dev = ip6_rt_get_dev_rcu(res);
        copy = ip6_dst_alloc(dev_net(dev), dev, dst_flags | DST_NOCOUNT);
        rcu_read_unlock();

        if (!copy) {
                fib6_info_release(f6i);
                return NULL;
        }

        ip6_rt_copy_init(copy, res);
        copy->rt6i_flags |= RTF_PCPU;

        if (f6i->nh)
                copy->sernum = rt_genid_ipv6(dev_net(dev));

        return copy;
}

/* True when @rt6's stored sernum still equals the IPv6 genid of the
 * namespace its device belongs to.
 */
static bool rt6_is_valid(const struct rt6_info *rt6)
{
        struct net *net = dev_net(rt6->dst.dev);

        return rt_genid_ipv6(net) == rt6->sernum;
}

/* Return this CPU's cached route for the nexthop in @res, or NULL.
 *
 * A cached entry with a non-zero sernum that is no longer valid is
 * removed from the per-cpu slot and released, and NULL is returned.
 * It should be called with rcu_read_lock() acquired.
 */
static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
        struct rt6_info *cached, *stale, **slot;

        cached = this_cpu_read(*res->nh->rt6i_pcpu);
        if (!cached || !cached->sernum || rt6_is_valid(cached))
                return cached;

        slot = this_cpu_ptr(res->nh->rt6i_pcpu);
        /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
        stale = xchg(slot, NULL);
        if (stale) {
                dst_dev_put(&stale->dst);
                dst_release(&stale->dst);
        }

        return NULL;
}

/* Allocate a per-cpu route for @res and install it in this CPU's slot of
 * the nexthop.  Returns the new route, or NULL if allocation failed.
 * The slot is expected to be empty; a non-empty slot triggers BUG_ON().
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
                                            const struct fib6_result *res)
{
        struct rt6_info *pcpu_rt, *prev, **p;

        pcpu_rt = ip6_rt_pcpu_alloc(res);
        if (!pcpu_rt)
                return NULL;

        p = this_cpu_ptr(res->nh->rt6i_pcpu);
        /* Install only if the slot is still NULL */
        prev = cmpxchg(p, NULL, pcpu_rt);
        BUG_ON(prev);

        /* The fib6_info is being destroyed: detach the new route from it
         * and drop the reference held through pcpu_rt->from.
         */
        if (res->f6i->fib6_destroying) {
                struct fib6_info *from;

                from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
                fib6_info_release(from);
        }

        return pcpu_rt;
}

/* Exception hash table implementation.
 *
 * rt6_exception_lock is taken with spin_lock_bh() by the code below that
 * inserts into or flushes the per-nexthop exception buckets.
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory.
 * Does nothing when either @bucket or @rt6_ex is NULL.
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
                                 struct rt6_exception *rt6_ex)
{
        struct net *net;

        if (!bucket || !rt6_ex)
                return;

        /* Account for the removal in the namespace's cache counter */
        net = dev_net(rt6_ex->rt6i->dst.dev);
        net->ipv6.rt6_stats->fib_rt_cache--;

        /* purge completely the exception to allow releasing the held resources:
         * some [sk] cache may keep the dst around for unlimited time
         */
        dst_dev_put(&rt6_ex->rt6i->dst);

        /* Unlink, drop the reference on the cached route, then free the
         * exception entry through kfree_rcu().
         */
        hlist_del_rcu(&rt6_ex->hlist);
        dst_release(&rt6_ex->rt6i->dst);
        kfree_rcu(rt6_ex, rcu);
        WARN_ON_ONCE(!bucket->depth);
        bucket->depth--;
}

/* Find the entry with the earliest stamp in @bucket and remove it via
 * rt6_remove_exception().  A NULL @bucket is ignored.
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
        struct rt6_exception *victim = NULL;
        struct rt6_exception *cur;

        if (!bucket)
                return;

        hlist_for_each_entry(cur, &bucket->chain, hlist) {
                /* Keep the current candidate unless cur is strictly older */
                if (victim && !time_before(cur->stamp, victim->stamp))
                        continue;
                victim = cur;
        }

        rt6_remove_exception(bucket, victim);
}

/* Hash a (@dst, @src) address pair to an exception bucket index.
 *
 * The addresses are hashed with siphash using a key that is filled with
 * random bytes on first use.  @src only takes part in the hash when
 * CONFIG_IPV6_SUBTREES is enabled and @src is non-NULL; otherwise the
 * source half of the hashed buffer stays zero.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
                              const struct in6_addr *src)
{
        static siphash_aligned_key_t rt6_exception_key;
        struct {
                struct in6_addr dst;
                struct in6_addr src;
        } __aligned(SIPHASH_ALIGNMENT) combined = {
                .dst = *dst,
        };
        u64 val;

        /* One-time initialisation of the hash key */
        net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));

#ifdef CONFIG_IPV6_SUBTREES
        if (src)
                combined.src = *src;
#endif
        val = siphash(&combined, sizeof(combined), &rt6_exception_key);

        /* Reduce the 64-bit hash to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits */
        return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Look up the exception entry for the (daddr, saddr) pair.
 *
 * On entry *@bucket points at the start of the bucket array; it is
 * advanced to the bucket the pair hashes to.  Returns the matching
 * entry, or NULL when there is none or when *@bucket or @daddr is NULL.
 * @saddr is compared only with CONFIG_IPV6_SUBTREES and when non-NULL.
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
                              const struct in6_addr *daddr,
                              const struct in6_addr *saddr)
{
        struct rt6_exception *entry;

        if (!(*bucket) || !daddr)
                return NULL;

        *bucket = *bucket + rt6_exception_hash(daddr, saddr);

        hlist_for_each_entry(entry, &(*bucket)->chain, hlist) {
                const struct rt6_info *cached = entry->rt6i;

                if (!ipv6_addr_equal(daddr, &cached->rt6i_dst.addr))
                        continue;
#ifdef CONFIG_IPV6_SUBTREES
                if (saddr && !ipv6_addr_equal(saddr, &cached->rt6i_src.addr))
                        continue;
#endif
                return entry;
        }

        return NULL;
}

/* RCU-reader variant of the exception lookup for a (daddr, saddr) pair.
 *
 * On entry *@bucket points at the start of the bucket array; it is
 * advanced to the bucket the pair hashes to.  Returns the matching
 * entry, or NULL when there is none or when *@bucket or @daddr is NULL.
 * @saddr is compared only with CONFIG_IPV6_SUBTREES and when non-NULL.
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
                         const struct in6_addr *daddr,
                         const struct in6_addr *saddr)
{
        struct rt6_exception *entry;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!(*bucket) || !daddr)
                return NULL;

        *bucket = *bucket + rt6_exception_hash(daddr, saddr);

        hlist_for_each_entry_rcu(entry, &(*bucket)->chain, hlist) {
                const struct rt6_info *cached = entry->rt6i;

                if (!ipv6_addr_equal(daddr, &cached->rt6i_dst.addr))
                        continue;
#ifdef CONFIG_IPV6_SUBTREES
                if (saddr && !ipv6_addr_equal(saddr, &cached->rt6i_src.addr))
                        continue;
#endif
                return entry;
        }

        return NULL;
}

/* Compute the MTU for the route in @res.
 *
 * The route's own path MTU is used when set; otherwise the mtu6 setting
 * of the nexthop device's inet6 device is read under rcu_read_lock().
 * The value is capped at IP6_MAX_MTU and reduced by the lightweight
 * tunnel headroom of the nexthop.
 */
static unsigned int fib6_mtu(const struct fib6_result *res)
{
        const struct fib6_nh *nh = res->nh;
        unsigned int mtu;

        if (!res->f6i->fib6_pmtu) {
                struct net_device *dev = nh->fib_nh_dev;
                struct inet6_dev *idev;

                rcu_read_lock();
                idev = __in6_dev_get(dev);
                mtu = READ_ONCE(idev->cnf.mtu6);
                rcu_read_unlock();
        } else {
                mtu = res->f6i->fib6_pmtu;
        }

        mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
        mtu -= lwtunnel_headroom(nh->fib_nh_lws, mtu);

        return mtu;
}

/* Marker bit OR-ed into the nh->rt6i_exception_bucket pointer value to
 * record that the exception buckets of a nexthop have been flushed.
 */
#define FIB6_EXCEPTION_BUCKET_FLUSHED  0x1UL

/* Return @nh's exception bucket array with the flushed marker bit
 * stripped, or NULL when there is none.  Meant for users that only need
 * access to the buckets and do not care about the flushed bit (i.e. all
 * bucket users except rt6_insert_exception).
 *
 * With a non-NULL @lock the pointer is read as protected by that lock;
 * otherwise it is read with rcu_dereference().
 *
 * called under rcu lock; sometimes called with rt6_exception_lock held
 */
static
struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
                                                       spinlock_t *lock)
{
        struct rt6_exception_bucket *bucket;
        unsigned long raw;

        if (!lock)
                bucket = rcu_dereference(nh->rt6i_exception_bucket);
        else
                bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                                   lockdep_is_held(lock));

        if (!bucket)
                return NULL;

        /* remove bucket flushed bit if set */
        raw = (unsigned long)bucket & ~FIB6_EXCEPTION_BUCKET_FLUSHED;

        return (struct rt6_exception_bucket *)raw;
}

/* True when the flushed marker bit is set in the raw @bucket pointer. */
static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
{
        return ((unsigned long)bucket & FIB6_EXCEPTION_BUCKET_FLUSHED) != 0;
}

/* Set the flushed marker bit in @nh's exception bucket pointer and
 * publish the tagged pointer.
 * called with rt6_exception_lock held
 */
static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
                                              spinlock_t *lock)
{
        struct rt6_exception_bucket *tagged;

        tagged = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                           lockdep_is_held(lock));
        tagged = (struct rt6_exception_bucket *)
                 ((unsigned long)tagged | FIB6_EXCEPTION_BUCKET_FLUSHED);

        rcu_assign_pointer(nh->rt6i_exception_bucket, tagged);
}

/* Insert cached route @nrt into the exception table of the nexthop in
 * @res, replacing any existing entry for the same key.
 *
 * The bucket array is allocated on first use.  Insertion is refused with
 * -EINVAL when the buckets are marked flushed or when @nrt's MTU is not
 * below fib6_mtu(res), and with -ENOMEM on allocation failure.  On
 * success the sernum of the fib6_info is updated, the fib6_info is put
 * on the gc list and gc is started.  Returns 0 on success.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
                                const struct fib6_result *res)
{
        struct net *net = dev_net(nrt->dst.dev);
        struct rt6_exception_bucket *bucket;
        struct fib6_info *f6i = res->f6i;
        struct in6_addr *src_key = NULL;
        struct rt6_exception *rt6_ex;
        struct fib6_nh *nh = res->nh;
        int max_depth;
        int err = 0;

        spin_lock_bh(&rt6_exception_lock);

        /* Raw pointer: the flushed bit, if any, is still present here */
        bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
                                          lockdep_is_held(&rt6_exception_lock));
        if (!bucket) {
                bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
                                 GFP_ATOMIC);
                if (!bucket) {
                        err = -ENOMEM;
                        goto out;
                }
                rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
        } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
                err = -EINVAL;
                goto out;
        }

#ifdef CONFIG_IPV6_SUBTREES
        /* fib6_src.plen != 0 indicates f6i is in subtree
         * and exception table is indexed by a hash of
         * both fib6_dst and fib6_src.
         * Otherwise, the exception table is indexed by
         * a hash of only fib6_dst.
         */
        if (f6i->fib6_src.plen)
                src_key = &nrt->rt6i_src.addr;
#endif
        /* rt6_mtu_change() might lower mtu on f6i.
         * Only insert this exception route if its mtu
         * is less than f6i's mtu value.
         */
        if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
                err = -EINVAL;
                goto out;
        }

        /* The lookup also moves bucket to the chain this key hashes to */
        rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
                                               src_key);
        if (rt6_ex)
                rt6_remove_exception(bucket, rt6_ex);

        rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
        if (!rt6_ex) {
                err = -ENOMEM;
                goto out;
        }
        rt6_ex->rt6i = nrt;
        rt6_ex->stamp = jiffies;
        hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
        bucket->depth++;
        net->ipv6.rt6_stats->fib_rt_cache++;

        /* Randomize max depth to avoid some side channels attacks. */
        max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
        while (bucket->depth > max_depth)
                rt6_exception_remove_oldest(bucket);

out:
        spin_unlock_bh(&rt6_exception_lock);

        /* Update fn->fn_sernum to invalidate all cached dst */
        if (!err) {
                spin_lock_bh(&f6i->fib6_table->tb6_lock);
                fib6_update_sernum(net, f6i);
                fib6_add_gc_list(f6i);
                spin_unlock_bh(&f6i->fib6_table->tb6_lock);
                fib6_force_start_gc(net);
        }

        return err;
}

/* Remove cached exception routes stored under @nh.
 *
 * When @from is NULL every exception is removed and the bucket list is
 * flagged through fib6_nh_excptn_bucket_set_flushed(); otherwise only the
 * exceptions whose rt6i->from equals @from are removed.
 * rt6_exception_lock is held (BH disabled) for the whole walk.
 */
static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
        struct rt6_exception_bucket *bkt;
        struct rt6_exception *ex;
        struct hlist_node *next;
        int idx;

        spin_lock_bh(&rt6_exception_lock);

        bkt = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (bkt) {
                /* Prevent rt6_insert_exception() to recreate the bucket list */
                if (!from)
                        fib6_nh_excptn_bucket_set_flushed(nh,
                                                          &rt6_exception_lock);

                for (idx = 0; idx < FIB6_EXCEPTION_BUCKET_SIZE; idx++, bkt++) {
                        hlist_for_each_entry_safe(ex, next, &bkt->chain,
                                                  hlist) {
                                /* selective flush: skip foreign entries */
                                if (from &&
                                    rcu_access_pointer(ex->rt6i->from) != from)
                                        continue;

                                rt6_remove_exception(bkt, ex);
                        }
                        /* a full flush must leave the chain empty */
                        WARN_ON_ONCE(!from && bkt->depth);
                }
        }

        spin_unlock_bh(&rt6_exception_lock);
}

/* Callback for nexthop_for_each_fib6_nh(): @arg carries the fib6_info whose
 * exceptions should be flushed from @nh.  Always returns 0.
 */
static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
{
        fib6_nh_flush_exceptions(nh, (struct fib6_info *)arg);
        return 0;
}

/* Flush the exceptions that were created from @f6i, either from its own
 * fib6_nh or, when it references a nexthop object, from each fib6_nh
 * visited by nexthop_for_each_fib6_nh().
 */
void rt6_flush_exceptions(struct fib6_info *f6i)
{
        if (!f6i->nh) {
                fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
                return;
        }

        nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
}

/* Look up a non-expired cached route for @daddr (and optionally @saddr)
 * in the exception table of res->nh.  Returns NULL when nothing usable
 * is found.
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
                                           const struct in6_addr *daddr,
                                           const struct in6_addr *saddr)
{
        const struct in6_addr *src_key = NULL;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;

#ifdef CONFIG_IPV6_SUBTREES
        /* A non-zero fib6_src.plen means f6i lives in a subtree, so the
         * exception table is hashed on both fib6_dst and fib6_src.
         * The source used when the entry was hashed is not necessarily
         * the /128 flow address passed in as saddr, so a miss with saddr
         * is retried below with f6i->fib6_src.
         * (See the logic in ip6_rt_cache_alloc() on how
         * rt->rt6i_src is updated.)
         */
        if (res->f6i->fib6_src.plen)
                src_key = saddr;
#endif

        for (;;) {
                bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
                rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

                if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
                        return rt6_ex->rt6i;

#ifdef CONFIG_IPV6_SUBTREES
                /* Use fib6_src as src_key and redo lookup */
                if (src_key && src_key != &res->f6i->fib6_src.addr) {
                        src_key = &res->f6i->fib6_src.addr;
                        continue;
                }
#endif
                return NULL;
        }
}

/* Remove the cached route @rt from the exception table of @nh.
 *
 * Returns 0 when an entry was found and removed, -ENOENT when @nh has no
 * exception bucket or no matching entry exists.
 */
static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
                                    const struct rt6_info *rt)
{
        const struct in6_addr *saddr_key = NULL;
        struct rt6_exception_bucket *bkt;
        struct rt6_exception *ex;
        int ret = -ENOENT;

        if (!rcu_access_pointer(nh->rt6i_exception_bucket))
                return ret;

#ifdef CONFIG_IPV6_SUBTREES
        /* A non-zero plen means 'from' is in a subtree: the table is then
         * hashed on rt6i_dst and rt6i_src, otherwise on rt6i_dst alone.
         */
        if (plen)
                saddr_key = &rt->rt6i_src.addr;
#endif

        spin_lock_bh(&rt6_exception_lock);

        bkt = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        ex = __rt6_find_exception_spinlock(&bkt, &rt->rt6i_dst.addr,
                                           saddr_key);
        if (ex) {
                rt6_remove_exception(bkt, ex);
                ret = 0;
        }

        spin_unlock_bh(&rt6_exception_lock);

        return ret;
}

/* Argument bundle passed to rt6_nh_remove_exception_rt() via
 * nexthop_for_each_fib6_nh().
 */
struct fib6_nh_excptn_arg {
        /* cached route to remove from the exception table */
        struct rt6_info        *rt;
        /* fib6_src.plen of the route's 'from' entry; non-zero selects the
         * source address as an additional lookup key
         */
        int                plen;
};

static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_excptn_arg *arg = _arg;
        int err;

        err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
        if (err == 0)
                return 1;

        return 0;
}

/* Remove the RTF_CACHE route @rt from the exception table it was inserted
 * into.
 *
 * Returns -EINVAL when @rt has no 'from' entry or is not RTF_CACHE,
 * -ENOENT when no matching exception exists, 0 on success.
 * rt->from is read with rcu_dereference().
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
        struct fib6_info *origin = rcu_dereference(rt->from);
        struct fib6_nh_excptn_arg arg;

        if (!origin || !(rt->rt6i_flags & RTF_CACHE))
                return -EINVAL;

        if (!origin->nh)
                return fib6_nh_remove_exception(origin->fib6_nh,
                                                origin->fib6_src.plen, rt);

        arg.rt = rt;
        arg.plen = origin->fib6_src.plen;

        /* a non-zero walk result means an entry was found */
        if (nexthop_for_each_fib6_nh(origin->nh, rt6_nh_remove_exception_rt,
                                     &arg))
                return 0;

        return -ENOENT;
}

/* Locate the exception entry of @nh that holds the cached route @rt and
 * set its stamp to the current jiffies.  Does nothing when no entry
 * matches.
 */
static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
                                     const struct rt6_info *rt)
{
        const struct in6_addr *saddr_key = NULL;
        struct rt6_exception_bucket *bkt;
        struct rt6_exception *ex;

#ifdef CONFIG_IPV6_SUBTREES
        /* A non-zero plen means 'from' is in a subtree: the table is then
         * hashed on rt6i_dst and rt6i_src, otherwise on rt6i_dst alone.
         */
        if (plen)
                saddr_key = &rt->rt6i_src.addr;
#endif

        bkt = fib6_nh_get_excptn_bucket(nh, NULL);
        ex = __rt6_find_exception_rcu(&bkt, &rt->rt6i_dst.addr, saddr_key);
        if (!ex)
                return;

        ex->stamp = jiffies;
}

/* Search criteria and result slot used by fib6_nh_find_match(). */
struct fib6_nh_match_arg {
        /* device the nexthop must use */
        const struct net_device *dev;
        /* gateway the nexthop must have; NULL matches only nexthops
         * without a gateway family
         */
        const struct in6_addr        *gw;
        /* set by fib6_nh_find_match() to the matching nexthop */
        struct fib6_nh                *match;
};

/* Callback for nexthop_for_each_fib6_nh(): check whether @nh uses the
 * device and gateway requested in @_arg.  On a match the nexthop is stored
 * in arg->match and 1 is returned; otherwise 0 is returned.
 */
static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_match_arg *arg = _arg;

        if (arg->dev != nh->fib_nh_dev)
                return 0;

        /* exactly one side has a gateway */
        if (!arg->gw != !nh->fib_nh_gw_family)
                return 0;

        if (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6))
                return 0;

        arg->match = nh;

        /* found a match, break the loop */
        return 1;
}

/* Refresh the stamp of the exception entry that holds the RTF_CACHE route
 * @rt.  When the route's 'from' entry references a nexthop object, the
 * fib6_nh matching rt's device and gateway is searched for first; nothing
 * is updated if none matches.  Runs under rcu_read_lock().
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
        struct fib6_info *origin;

        rcu_read_lock();

        origin = rcu_dereference(rt->from);
        if (origin && (rt->rt6i_flags & RTF_CACHE)) {
                if (!origin->nh) {
                        fib6_nh_update_exception(origin->fib6_nh,
                                                 origin->fib6_src.plen, rt);
                } else {
                        struct fib6_nh_match_arg arg = {
                                .dev = rt->dst.dev,
                                .gw = &rt->rt6i_gateway,
                        };

                        nexthop_for_each_fib6_nh(origin->nh,
                                                 fib6_nh_find_match, &arg);
                        if (arg.match)
                                fib6_nh_update_exception(arg.match,
                                                         origin->fib6_src.plen,
                                                         rt);
                }
        }

        rcu_read_unlock();
}

/* Decide whether the PMTU of @rt may be replaced by @mtu.
 *
 * Lowering is always permitted: a new MTU that is not above the route
 * PMTU becomes the smallest MTU on the path.
 *
 * Raising is permitted only when the route PMTU equals the local MTU
 * (idev->cnf.mtu6): the old local MTU was then the path minimum, and if
 * other nodes now have lower MTUs, PMTU discovery will handle this.
 */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
                                         struct rt6_info *rt, int mtu)
{
        return dst_mtu(&rt->dst) >= mtu ||
               dst_mtu(&rt->dst) == idev->cnf.mtu6;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
                                       const struct fib6_nh *nh, int mtu)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i;

        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (!bucket)
                return;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                        struct rt6_info *entry = rt6_ex->rt6i;

                        /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
                         * route), the metrics of its rt->from have already
                         * been updated.
                         */
                        if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
                            rt6_mtu_change_route_allowed(idev, entry, mtu))
                                dst_metric_set(&entry->dst, RTAX_MTU, mtu);
                }
                bucket++;
        }
}

/* Flag combination identifying a cached route that goes through a gateway */
#define RTF_CACHE_GATEWAY        (RTF_GATEWAY | RTF_CACHE)

static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
                                            const struct in6_addr *gateway)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        if (!rcu_access_pointer(nh->rt6i_exception_bucket))
                return;

        spin_lock_bh(&rt6_exception_lock);
        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                struct rt6_info *entry = rt6_ex->rt6i;

                                if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
                                    RTF_CACHE_GATEWAY &&
                                    ipv6_addr_equal(gateway,
                                                    &entry->rt6i_gateway)) {
                                        rt6_remove_exception(bucket, rt6_ex);
                                }
                        }
                        bucket++;
                }
        }

        spin_unlock_bh(&rt6_exception_lock);
}

/* Garbage-collection check for one exception entry.
 *
 * The entry is removed when it has aged out, has expired, or is a gateway
 * route whose neighbour is missing or not flagged NTF_ROUTER.  Entries
 * that survive are counted in gc_args->more.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
                                      struct rt6_exception *rt6_ex,
                                      struct fib6_gc_args *gc_args,
                                      unsigned long now)
{
        struct rt6_info *rt = rt6_ex->rt6i;
        struct neighbour *neigh;

        /* we are pruning and obsoleting aged-out and non gateway exceptions
         * even if others have still references to them, so that on next
         * dst_check() such references can be dropped.
         * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
         * expired, independently from their aging, as per RFC 8201 section 4
         */
        if (rt->rt6i_flags & RTF_EXPIRES) {
                if (time_after(jiffies, rt->dst.expires)) {
                        pr_debug("purging expired route %p\n", rt);
                        goto remove;
                }
        } else if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
                pr_debug("aging clone %p\n", rt);
                goto remove;
        }

        if (rt->rt6i_flags & RTF_GATEWAY) {
                neigh = __ipv6_neigh_lookup_noref(rt->dst.dev,
                                                  &rt->rt6i_gateway);

                if (!neigh || !(neigh->flags & NTF_ROUTER)) {
                        pr_debug("purging route %p via non-router but gateway\n",
                                 rt);
                        goto remove;
                }
        }

        /* entry stays in the table */
        gc_args->more++;
        return;

remove:
        rt6_remove_exception(bucket, rt6_ex);
}

static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
                                   struct fib6_gc_args *gc_args,
                                   unsigned long now)
{
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        struct hlist_node *tmp;
        int i;

        if (!rcu_access_pointer(nh->rt6i_exception_bucket))
                return;

        rcu_read_lock_bh();
        spin_lock(&rt6_exception_lock);
        bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
        if (bucket) {
                for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                        hlist_for_each_entry_safe(rt6_ex, tmp,
                                                  &bucket->chain, hlist) {
                                rt6_age_examine_exception(bucket, rt6_ex,
                                                          gc_args, now);
                        }
                        bucket++;
                }
        }
        spin_unlock(&rt6_exception_lock);
        rcu_read_unlock_bh();
}

/* Argument bundle passed to rt6_nh_age_exceptions() via
 * nexthop_for_each_fib6_nh().
 */
struct fib6_nh_age_excptn_arg {
        /* garbage-collection parameters forwarded to the ageing walk */
        struct fib6_gc_args        *gc_args;
        /* timestamp the ageing decisions are made against */
        unsigned long                now;
};

static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_age_excptn_arg *arg = _arg;

        fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
        return 0;
}

/* Age the exceptions attached to @f6i: either those of its own fib6_nh or,
 * when it references a nexthop object, those of each fib6_nh visited by
 * nexthop_for_each_fib6_nh().
 */
void rt6_age_exceptions(struct fib6_info *f6i,
                        struct fib6_gc_args *gc_args,
                        unsigned long now)
{
        struct fib6_nh_age_excptn_arg arg;

        if (!f6i->nh) {
                fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
                return;
        }

        arg.gc_args = gc_args;
        arg.now = now;

        nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions, &arg);
}

/* Select a route for @fl6 in @table and store it in @res.
 *
 * While rt6_select() yields the null entry the search backtracks through
 * the tree; once backtracking is exhausted and RT6_LOOKUP_F_REACHABLE was
 * requested, the search restarts from the first node without that flag.
 * Always returns 0.
 *
 * must be called with rcu lock held
 */
int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
                      struct flowi6 *fl6, struct fib6_result *res, int strict)
{
        struct fib6_node *fn, *first_fn;

        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
        first_fn = fn;

        for (;;) {
                rt6_select(net, fn, oif, res, strict);
                if (res->f6i != net->ipv6.fib6_null_entry)
                        break;

                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        continue;

                if (!(strict & RT6_LOOKUP_F_REACHABLE))
                        break;

                /* also consider unreachable route */
                strict &= ~RT6_LOOKUP_F_REACHABLE;
                fn = first_fn;
        }

        trace_fib6_table_lookup(net, res, table, fl6);

        return 0;
}

/* Resolve the route for flow @fl6 in @table.
 *
 * The result is taken, in order of preference, from the exception table,
 * from a freshly allocated uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH
 * on a nexthop without gateway), or from the per-cpu copy of the selected
 * fib entry.  When none of these yields a route, net->ipv6.ip6_null_entry
 * is returned.
 *
 * Unless RT6_LOOKUP_F_DST_NOREF is set in @flags, ip6_hold_safe() is
 * called on the result before returning; the uncached clone path always
 * returns with the reference taken at allocation.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int oif, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags)
{
        struct fib6_result res = {};
        struct rt6_info *rt = NULL;
        int strict = 0;

        /* a NOREF result is only usable while the caller stays in RCU */
        WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
                     !rcu_read_lock_held());

        strict |= flags & RT6_LOOKUP_F_IFACE;
        strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
        /* with forwarding disabled, ask for reachable routes first */
        if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
                strict |= RT6_LOOKUP_F_REACHABLE;

        rcu_read_lock();

        fib6_table_lookup(net, table, oif, fl6, &res, strict);
        if (res.f6i == net->ipv6.fib6_null_entry)
                goto out;

        fib6_select_path(net, &res, fl6, oif, false, skb, strict);

        /* Search through the exception table */
        rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
        if (rt) {
                goto out;
        } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
                            !res.nh->fib_nh_gw_family)) {
                /* Create a RTF_CACHE clone which will not be
                 * owned by the fib6 tree.  It is for the special case where
                 * the daddr in the skb during the neighbor look-up is different
                 * from the fl6->daddr used to look-up route here.
                 */
                rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);

                if (rt) {
                        /* 1 refcnt is taken during ip6_rt_cache_alloc().
                         * As rt6_uncached_list_add() does not consume refcnt,
                         * this refcnt is always returned to the caller even
                         * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
                         */
                        rt6_uncached_list_add(rt);
                        rcu_read_unlock();

                        return rt;
                }
                /* allocation failed: fall through to 'out' with rt == NULL */
        } else {
                /* Get a percpu copy */
                local_bh_disable();
                rt = rt6_get_pcpu_route(&res);

                if (!rt)
                        rt = rt6_make_pcpu_route(net, &res);

                local_bh_enable();
        }
out:
        /* no usable route: hand back the null entry */
        if (!rt)
                rt = net->ipv6.ip6_null_entry;
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                ip6_hold_safe(net, &rt);
        rcu_read_unlock();

        return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

/* Input-path wrapper around ip6_pol_route(): the interface index used for
 * the lookup is the flow's incoming interface (fl6->flowi6_iif).
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
                                            struct fib6_table *table,
                                            struct flowi6 *fl6,
                                            const struct sk_buff *skb,
                                            int flags)
{
        int iif = fl6->flowi6_iif;

        return ip6_pol_route(net, table, iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
                                         struct flowi6 *fl6,
                                         const struct sk_buff *skb,
                                         int flags)
{
        if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
                flags |= RT6_LOOKUP_F_IFACE;

        return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

static void ip6_multipath_l3_keys(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  struct flow_keys *flkeys)
{
        const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
        const struct ipv6hdr *key_iph = outer_iph;
        struct flow_keys *_flkeys = flkeys;
        const struct ipv6hdr *inner_iph;
        const struct icmp6hdr *icmph;
        struct ipv6hdr _inner_iph;
        struct icmp6hdr _icmph;

        if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
                goto out;

        icmph = skb_header_pointer(skb, skb_transport_offset(skb),
                                   sizeof(_icmph), &_icmph);
        if (!icmph)
                goto out;

        if (!icmpv6_is_err(icmph->icmp6_type))
                goto out;

        inner_iph = skb_header_pointer(skb,
                                       skb_transport_offset(skb) + sizeof(*icmph),
                                       sizeof(_inner_iph), &_inner_iph);
        if (!inner_iph)
                goto out;

        key_iph = inner_iph;
        _flkeys = NULL;
out:
        if (_flkeys) {
                keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
                keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
                keys->tags.flow_label = _flkeys->tags.flow_label;
                keys->basic.ip_proto = _flkeys->basic.ip_proto;
        } else {
                keys->addrs.v6addrs.src = key_iph->saddr;
                keys->addrs.v6addrs.dst = key_iph->daddr;
                keys->tags.flow_label = ip6_flowlabel(key_iph);
                keys->basic.ip_proto = key_iph->nexthdr;
        }
}

/* Hash the outer-header fields of @skb selected by the namespace's
 * multipath hash field mask.
 *
 * Returns 0 without touching *@p_has_inner when no outer field is
 * selected.  Otherwise *@p_has_inner reports whether the dissector saw an
 * encapsulation (FLOW_DIS_ENCAPSULATION).
 */
static u32 rt6_multipath_custom_hash_outer(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool *p_has_inner)
{
        u32 fields = ip6_multipath_hash_fields(net);
        struct flow_keys dissected, picked;

        if (!(fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
                return 0;

        memset(&picked, 0, sizeof(picked));
        skb_flow_dissect_flow_keys(skb, &dissected,
                                   FLOW_DISSECTOR_F_STOP_AT_ENCAP);
        *p_has_inner = !!(dissected.control.flags & FLOW_DIS_ENCAPSULATION);

        /* copy only the fields the administrator selected */
        picked.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                picked.addrs.v6addrs.src = dissected.addrs.v6addrs.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                picked.addrs.v6addrs.dst = dissected.addrs.v6addrs.dst;
        if (fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                picked.basic.ip_proto = dissected.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
                picked.tags.flow_label = dissected.tags.flow_label;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                picked.ports.src = dissected.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                picked.ports.dst = dissected.ports.dst;

        return fib_multipath_hash_from_keys(net, &picked);
}

/* Hash the inner-header fields of @skb selected by the namespace's
 * multipath hash field mask.
 *
 * Returns 0 when @has_inner is false, when no inner field is selected, or
 * when the full dissection finds no encapsulation.
 */
static u32 rt6_multipath_custom_hash_inner(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool has_inner)
{
        u32 fields = ip6_multipath_hash_fields(net);
        struct flow_keys dissected, picked;

        /* We assume the packet carries an encapsulation, but if none was
         * encountered during dissection of the outer flow, then there is no
         * point in calling the flow dissector again.
         */
        if (!has_inner || !(fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
                return 0;

        memset(&picked, 0, sizeof(picked));
        skb_flow_dissect_flow_keys(skb, &dissected, 0);

        if (!(dissected.control.flags & FLOW_DIS_ENCAPSULATION))
                return 0;

        switch (dissected.control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                picked.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        picked.addrs.v4addrs.src = dissected.addrs.v4addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        picked.addrs.v4addrs.dst = dissected.addrs.v4addrs.dst;
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                picked.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        picked.addrs.v6addrs.src = dissected.addrs.v6addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        picked.addrs.v6addrs.dst = dissected.addrs.v6addrs.dst;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
                        picked.tags.flow_label = dissected.tags.flow_label;
                break;
        default:
                /* other address types contribute no address fields */
                break;
        }

        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
                picked.basic.ip_proto = dissected.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
                picked.ports.src = dissected.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
                picked.ports.dst = dissected.ports.dst;

        return fib_multipath_hash_from_keys(net, &picked);
}

/* Custom-policy hash for a packet: combine the outer-field hash and the
 * inner-field hash of @skb with jhash_2words().
 */
static u32 rt6_multipath_custom_hash_skb(const struct net *net,
                                         const struct sk_buff *skb)
{
        /* stays true when the outer pass returns early without setting it */
        bool has_inner = true;
        u32 outer_hash, inner_hash;

        outer_hash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
        inner_hash = rt6_multipath_custom_hash_inner(net, skb, has_inner);

        return jhash_2words(outer_hash, inner_hash, 0);
}

static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
                                         const struct flowi6 *fl6)
{
        u32 hash_fields = ip6_multipath_hash_fields(net);
        struct flow_keys hash_keys;

        if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
                return 0;

        memset(&hash_keys, 0, sizeof(hash_keys));
        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                hash_keys.addrs.v6addrs.src = fl6->saddr;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                hash_keys.addrs.v6addrs.dst = fl6->daddr;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                hash_keys.basic.ip_proto = fl6->flowi6_proto;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
                hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                hash_keys.ports.src = fl6->fl6_sport;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                hash_keys.ports.dst = fl6->fl6_dport;

        return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        struct flow_keys hash_keys;
        u32 mhash = 0;

        switch (ip6_multipath_hash_policy(net)) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                } else {
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 1:
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                        hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.ports.src = fl6->fl6_sport;
                        hash_keys.ports.dst = fl6->fl6_dport;
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 2:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (skb) {
                        struct flow_keys keys;

                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, 0);
                                flkeys = &keys;
                        }

                        /* Inner can be v4 or v6 */
                        if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
                                hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
                        } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
                                hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
                                hash_keys.tags.flow_label = flkeys->tags.flow_label;
                                hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                        } else {
                                /* Same as case 0 */
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
                        }
                } else {
                        /* Same as case 0 */
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        hash_keys.addrs.v6addrs.src = fl6->saddr;
                        hash_keys.addrs.v6addrs.dst = fl6->daddr;
                        hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
                        hash_keys.basic.ip_proto = fl6->flowi6_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 3:
                if (skb)
                        mhash = rt6_multipath_custom_hash_skb(net, skb);
                else
                        mhash = rt6_multipath_custom_hash_fl6(net, fl6);
                break;
        }

        return mhash >> 1;
}

/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(skb->dev);
        int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
        struct ip_tunnel_info *tun_info;
        struct flowi6 fl6 = {
                .flowi6_iif = skb->dev->ifindex,
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowlabel = ip6_flowinfo(iph),
                .flowi6_mark = skb->mark,
                .flowi6_proto = iph->nexthdr,
        };
        struct flow_keys *flkeys = NULL, _flkeys;

        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

        if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
                flkeys = &_flkeys;

        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
                fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
        skb_dst_drop(skb);
        skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
                                                      &fl6, skb, flags));
}

/* Per-table lookup used for output routes: forwards to ip6_pol_route()
 * with the flow's outgoing interface index as the interface argument.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        int oif = fl6->flowi6_oif;

        return ip6_pol_route(net, table, oif, fl6, skb, flags);
}

/* Output route lookup that requests a dst without a reference
 * (RT6_LOOKUP_F_DST_NOREF is always added to @flags).
 *
 * Multicast and link-local destinations are first offered to
 * l3mdev_link_scope_lookup(); a non-NULL answer is returned directly.
 * Otherwise the flow's input interface is set to loopback and the lookup
 * flags are derived from the socket binding, the destination and whether a
 * source address was supplied.
 */
static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
                                                      const struct sock *sk,
                                                      struct flowi6 *fl6,
                                                      int flags)
{
        const int link_scope = IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL;
        bool src_unspecified;

        if (ipv6_addr_type(&fl6->daddr) & link_scope) {
                /* This function does not take refcnt on the dst */
                struct dst_entry *l3_dst = l3mdev_link_scope_lookup(net, fl6);

                if (l3_dst)
                        return l3_dst;
        }

        fl6->flowi6_iif = LOOPBACK_IFINDEX;
        flags |= RT6_LOOKUP_F_DST_NOREF;

        src_unspecified = ipv6_addr_any(&fl6->saddr);

        /* Any one of these conditions requests RT6_LOOKUP_F_IFACE. */
        if (sk && sk->sk_bound_dev_if)
                flags |= RT6_LOOKUP_F_IFACE;
        else if (rt6_need_strict(&fl6->daddr))
                flags |= RT6_LOOKUP_F_IFACE;
        else if (fl6->flowi6_oif && src_unspecified)
                flags |= RT6_LOOKUP_F_IFACE;

        if (src_unspecified) {
                /* No source given: use the socket's source preferences. */
                if (sk)
                        flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
        } else {
                flags |= RT6_LOOKUP_F_HAS_SADDR;
        }

        return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}

/* Output route lookup returning a dst the caller holds a reference on.
 *
 * Runs the no-reference lookup under rcu_read_lock() and then pins the
 * result. If the reference cannot be taken safely, the namespace's null
 * entry is returned (with a reference) instead.
 */
struct dst_entry *ip6_route_output_flags(struct net *net,
                                         const struct sock *sk,
                                         struct flowi6 *fl6,
                                         int flags)
{
        struct rt6_info *rt6;
        struct dst_entry *dst;

        rcu_read_lock();

        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
        rt6 = dst_rt6_info(dst);

        /* For dst cached in uncached_list, refcnt is already taken. */
        if (list_empty(&rt6->dst.rt_uncached)) {
                if (!dst_hold_safe(dst)) {
                        dst = &net->ipv6.ip6_null_entry->dst;
                        dst_hold(dst);
                }
        }

        rcu_read_unlock();

        return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

/* Build a discard-everything copy of @dst_orig bound to the loopback device.
 *
 * The new dst copies the metrics, gateway, flags (minus RTF_PCPU) and keys
 * of the original and discards all input and output. @dst_orig is released
 * in every case. Returns the new dst, or ERR_PTR(-ENOMEM) when the
 * allocation fails.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rt6_info *src = dst_rt6_info(dst_orig);
        struct net_device *lo = net->loopback_dev;
        struct rt6_info *blackhole;
        struct dst_entry *new;

        blackhole = dst_alloc(&ip6_dst_blackhole_ops, lo,
                              DST_OBSOLETE_DEAD, 0);
        if (!blackhole) {
                dst_release(dst_orig);
                return ERR_PTR(-ENOMEM);
        }

        rt6_info_init(blackhole);
        atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

        new = &blackhole->dst;
        new->__use = 1;
        new->input = dst_discard;
        new->output = dst_discard_out;

        dst_copy_metrics(new, &src->dst);

        blackhole->rt6i_idev = in6_dev_get(lo);
        blackhole->rt6i_gateway = src->rt6i_gateway;
        blackhole->rt6i_flags = src->rt6i_flags & ~RTF_PCPU;

        memcpy(&blackhole->rt6i_dst, &src->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
        memcpy(&blackhole->rt6i_src, &src->rt6i_src, sizeof(struct rt6key));
#endif

        dst_release(dst_orig);
        return new;
}

/*
 *        Destination cache support functions
 */

/* A fib entry passes when its cookie can be read, equals @cookie, and the
 * entry has not expired.
 */
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
        u32 current_cookie = 0;
        bool cookie_ok;

        cookie_ok = fib6_get_cookie_safe(f6i, &current_cookie) &&
                    current_cookie == cookie;

        return cookie_ok && !fib6_check_expired(f6i);
}

/* Validate @rt against its originating fib entry @from.
 *
 * Returns &rt->dst when @from exists, its cookie is readable and equal to
 * @cookie, and @rt has not expired; NULL otherwise.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt,
                                   struct fib6_info *from,
                                   u32 cookie)
{
        u32 current_cookie = 0;

        if (!from)
                return NULL;
        if (!fib6_get_cookie_safe(from, &current_cookie))
                return NULL;
        if (current_cookie != cookie)
                return NULL;

        return rt6_check_expired(rt) ? NULL : &rt->dst;
}

/* Validate @rt through its fib entry @from.
 *
 * The dst is returned only if @rt itself is not expired, is still marked
 * DST_OBSOLETE_FORCE_CHK, and @from passes fib6_check() for @cookie.
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
                                            struct fib6_info *from,
                                            u32 cookie)
{
        if (__rt6_check_expired(rt))
                return NULL;
        if (rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
                return NULL;
        if (!fib6_check(from, cookie))
                return NULL;

        return &rt->dst;
}

/* dst validation callback: returns @dst (or &rt->dst) while the cached
 * route is still usable for @cookie, NULL when it must be looked up again.
 */
INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
                                                        u32 cookie)
{
        struct rt6_info *rt = dst_rt6_info(dst);
        struct dst_entry *valid;
        struct fib6_info *from;
        bool check_via_from;

        /* Routes carrying a serial number are validated by it alone. */
        if (rt->sernum) {
                if (!rt6_is_valid(rt))
                        return NULL;
                return dst;
        }

        rcu_read_lock();

        /* All IPV6 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         */
        from = rcu_dereference(rt->from);

        /* Per-cpu copies and dsts on the uncached list are checked through
         * their fib entry; everything else uses rt6_check().
         */
        check_via_from = from && (rt->rt6i_flags & RTF_PCPU ||
                                  unlikely(!list_empty(&rt->dst.rt_uncached)));
        if (check_via_from)
                valid = rt6_dst_from_check(rt, from, cookie);
        else
                valid = rt6_check(rt, from, cookie);

        rcu_read_unlock();

        return valid;
}
EXPORT_INDIRECT_CALLABLE(ip6_dst_check);

/* Negative-advice handler for a socket's cached route.
 *
 * Routes without RTF_CACHE simply have the socket's dst reset. RTF_CACHE
 * routes are touched only once expired: the socket's dst is reset and the
 * exception route removed, all under rcu_read_lock().
 */
static void ip6_negative_advice(struct sock *sk,
                                struct dst_entry *dst)
{
        struct rt6_info *rt = dst_rt6_info(dst);

        if (!(rt->rt6i_flags & RTF_CACHE)) {
                sk_dst_reset(sk);
                return;
        }

        rcu_read_lock();
        if (rt6_check_expired(rt)) {
                /* rt/dst can not be destroyed yet,
                 * because of rcu_read_lock()
                 */
                sk_dst_reset(sk);
                rt6_remove_exception_rt(rt);
        }
        rcu_read_unlock();
}

/* Link-failure handler: report address-unreachable to the sender and
 * invalidate the route attached to @skb.
 *
 * An RTF_CACHE route is removed from the exception table. For other
 * routes flagged RTF_DEFAULT, the serial number of the fib node holding
 * the originating entry is set to -1.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
        struct fib6_info *from;
        struct fib6_node *fn;
        struct rt6_info *rt;

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

        rt = dst_rt6_info(skb_dst(skb));
        if (!rt)
                return;

        rcu_read_lock();

        if (rt->rt6i_flags & RTF_CACHE) {
                rt6_remove_exception_rt(rt);
                goto unlock;
        }

        from = rcu_dereference(rt->from);
        if (!from)
                goto unlock;

        fn = rcu_dereference(from->fib6_node);
        if (fn && (rt->rt6i_flags & RTF_DEFAULT))
                WRITE_ONCE(fn->fn_sernum, -1);

unlock:
        rcu_read_unlock();
}

/* Apply @timeout to @rt0 via dst_set_expires() and mark it RTF_EXPIRES.
 *
 * If the route had no expiry yet, it is first seeded with the expiry of
 * its originating fib entry (when one is still attached).
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
        struct fib6_info *parent;

        if (rt0->rt6i_flags & RTF_EXPIRES)
                goto set_expiry;

        rcu_read_lock();
        parent = rcu_dereference(rt0->from);
        if (parent)
                rt0->dst.expires = parent->expires;
        rcu_read_unlock();

set_expiry:
        dst_set_expires(&rt0->dst, timeout);
        rt0->rt6i_flags |= RTF_EXPIRES;
}

/* Record @mtu in the route's RTAX_MTU metric, flag the route as modified
 * and arm its expiry from the namespace's ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);

        dst_metric_set(dst, RTAX_MTU, mtu);
        rt->rt6i_flags |= RTF_MODIFIED;

        rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
        return !(rt->rt6i_flags & RTF_CACHE) &&
                (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}

/* Apply a path-MTU update to @dst.
 *
 * @dst:           route the update applies to
 * @sk:            optional socket; supplies the addresses when @iph is NULL
 * @iph:           optional IPv6 header; supplies the addresses when present
 * @mtu:           new path MTU, host byte order
 * @confirm_neigh: when true, confirm the neighbour for the destination
 *
 * Values below IPV6_MIN_MTU or not smaller than the current dst MTU are
 * ignored. Depending on rt6_cache_allowed_for_pmtu(), the MTU is either
 * written to @dst itself or to a newly allocated route that is inserted as
 * an exception under the originating fib entry.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu,
                                 bool confirm_neigh)
{
        const struct in6_addr *daddr, *saddr;
        struct rt6_info *rt6 = dst_rt6_info(dst);

        /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
         * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
         * [see also comment in rt6_mtu_change_route()]
         */

        /* Address source priority: packet header, then socket, else none. */
        if (iph) {
                daddr = &iph->daddr;
                saddr = &iph->saddr;
        } else if (sk) {
                daddr = &sk->sk_v6_daddr;
                saddr = &inet6_sk(sk)->saddr;
        } else {
                daddr = NULL;
                saddr = NULL;
        }

        if (confirm_neigh)
                dst_confirm_neigh(dst, daddr);

        /* Only accept a decrease that stays at or above the IPv6 minimum. */
        if (mtu < IPV6_MIN_MTU)
                return;
        if (mtu >= dst_mtu(dst))
                return;

        if (!rt6_cache_allowed_for_pmtu(rt6)) {
                /* Update this route in place. */
                rt6_do_update_pmtu(rt6, mtu);
                /* update rt6_ex->stamp for cache */
                if (rt6->rt6i_flags & RTF_CACHE)
                        rt6_update_exception_stamp_rt(rt6);
        } else if (daddr) {
                /* Create an exception route; without a destination address
                 * nothing is done on this path.
                 */
                struct fib6_result res = {};
                struct rt6_info *nrt6;

                rcu_read_lock();
                res.f6i = rcu_dereference(rt6->from);
                if (!res.f6i)
                        goto out_unlock;

                res.fib6_flags = res.f6i->fib6_flags;
                res.fib6_type = res.f6i->fib6_type;

                if (res.f6i->nh) {
                        /* Nexthop object: find the fib6_nh matching this
                         * dst's device and gateway.
                         */
                        struct fib6_nh_match_arg arg = {
                                .dev = dst->dev,
                                .gw = &rt6->rt6i_gateway,
                        };

                        nexthop_for_each_fib6_nh(res.f6i->nh,
                                                 fib6_nh_find_match, &arg);

                        /* fib6_info uses a nexthop that does not have fib6_nh
                         * using the dst->dev + gw. Should be impossible.
                         */
                        if (!arg.match)
                                goto out_unlock;

                        res.nh = arg.match;
                } else {
                        res.nh = res.f6i->fib6_nh;
                }

                nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
                if (nrt6) {
                        rt6_do_update_pmtu(nrt6, mtu);
                        /* A non-zero return means the insert failed; release
                         * the new route right away.
                         */
                        if (rt6_insert_exception(nrt6, &res))
                                dst_release_immediate(&nrt6->dst);
                }
out_unlock:
                rcu_read_unlock();
        }
}

/* PMTU update entry point taking an optional skb: the skb's IPv6 header,
 * when an skb is given, is handed to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                               struct sk_buff *skb, u32 mtu,
                               bool confirm_neigh)
{
        const struct ipv6hdr *hdr = NULL;

        if (skb)
                hdr = ipv6_hdr(skb);

        __ip6_rt_update_pmtu(dst, sk, hdr, mtu, confirm_neigh);
}

/* Update the path MTU for the flow described by the IPv6 header found at
 * skb->data.
 *
 * @mtu is in network byte order. When @mark is zero the flow mark comes
 * from IP6_REPLY_MARK(). The route is looked up on the output path and
 * updated only if the lookup did not yield an error route.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
                     int oif, u32 mark, kuid_t uid)
{
        const struct ipv6hdr *hdr = (struct ipv6hdr *) skb->data;
        u32 flow_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
        struct flowi6 fl6 = {
                .flowi6_oif = oif,
                .flowi6_mark = flow_mark,
                .daddr = hdr->daddr,
                .saddr = hdr->saddr,
                .flowlabel = ip6_flowinfo(hdr),
                .flowi6_uid = uid,
        };
        struct dst_entry *route;

        route = ip6_route_output(net, NULL, &fl6);
        if (route->error == 0)
                __ip6_rt_update_pmtu(route, NULL, hdr, ntohl(mtu), true);
        dst_release(route);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket flavour of ip6_update_pmtu().
 *
 * The output interface is the socket's bound device or, failing that, the
 * l3mdev master of the skb's device. After the update, if the socket's
 * cached dst is obsolete and no longer passes its check, the datagram dst
 * is refreshed (skipped for sockets owned by user context and for
 * v4-mapped peers).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
        struct dst_entry *cached;
        int oif;

        oif = sk->sk_bound_dev_if;
        if (!oif && skb->dev)
                oif = l3mdev_master_ifindex(skb->dev);

        ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
                        sk->sk_uid);

        cached = __sk_dst_get(sk);
        if (!cached)
                return;
        if (!cached->obsolete)
                return;
        if (cached->ops->check(cached, inet6_sk(sk)->dst_cookie))
                return;

        bh_lock_sock(sk);
        if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
                ip6_datagram_dst_update(sk, false);
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Store @dst on @sk, passing along the socket's own addresses when they
 * match the flow.
 *
 * The destination is passed if it equals the flow's daddr. The source is
 * passed only with CONFIG_IPV6_SUBTREES and if it equals the flow's saddr.
 * NULL is passed in all other cases.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6)
{
        struct in6_addr *daddr = NULL;
        struct in6_addr *saddr = NULL;
#ifdef CONFIG_IPV6_SUBTREES
        struct ipv6_pinfo *np = inet6_sk(sk);

        if (ipv6_addr_equal(&fl6->saddr, &np->saddr))
                saddr = &np->saddr;
#endif

        if (ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr))
                daddr = &sk->sk_v6_daddr;

        ip6_dst_store(sk, dst, daddr, saddr);
}

/* Decide whether the nexthop in @res could have sent a redirect from @gw.
 *
 * The nexthop must be alive, have a gateway, and use the flow's output
 * interface. It matches directly when its gateway equals @gw. Otherwise
 * the exception table is consulted, and a cached route whose gateway
 * equals @gw is a match that is also handed back through @ret.
 */
static bool ip6_redirect_nh_match(const struct fib6_result *res,
                                  struct flowi6 *fl6,
                                  const struct in6_addr *gw,
                                  struct rt6_info **ret)
{
        const struct fib6_nh *nh = res->nh;
        struct rt6_info *cached;

        if (nh->fib_nh_flags & RTNH_F_DEAD)
                return false;
        if (!nh->fib_nh_gw_family)
                return false;
        if (fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
                return false;

        if (ipv6_addr_equal(gw, &nh->fib_nh_gw6))
                return true;

        /* rt_cache's gateway might be different from its 'parent'
         * in the case of an ip redirect.
         * So we keep searching in the exception table if the gateway
         * is different.
         */
        cached = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
        if (!cached || !ipv6_addr_equal(gw, &cached->rt6i_gateway))
                return false;

        *ret = cached;
        return true;
}

/* Argument bundle handed to fib6_nh_redirect_match() when walking the
 * fib6_nh entries of a nexthop object during redirect validation.
 */
struct fib6_nh_rd_arg {
        /* lookup result; its ->nh is overwritten for each nexthop visited */
        struct fib6_result        *res;
        /* flow being matched */
        struct flowi6                *fl6;
        /* gateway address the nexthop is compared against */
        const struct in6_addr        *gw;
        /* receives a matching cached route, if one is found */
        struct rt6_info                **ret;
};

static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_rd_arg *arg = _arg;

        arg->res->nh = nh;
        return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
}

/* Handle redirects */
/* Flow key extended with the redirecting gateway's address.
 *
 * fl6 must stay the first member: __ip6_route_redirect() receives a
 * pointer to it and casts that pointer back to struct ip6rd_flowi to
 * reach the gateway.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;
};

/* Per-table lookup used to validate a redirect.
 *
 * @fl6 is the first member of a struct ip6rd_flowi (see
 * ip6_route_redirect()), so the redirecting gateway is recovered by
 * casting it back. The routes for the destination are scanned for a
 * nexthop that matches that gateway and the flow's output interface.
 *
 * Returns a held cached route when the match came from the exception
 * table, otherwise a route created from the selected fib entry. That
 * entry is the match itself or, when nothing matched, the entry on which
 * the scan ended (the null entry if the scan ran out). A reject route
 * hit during the scan yields the namespace's null entry.
 */
INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
        struct rt6_info *ret = NULL;
        struct fib6_result res = {};
        struct fib6_nh_rd_arg arg = {
                .res = &res,
                .fl6 = fl6,
                .gw  = &rdfl->gateway,
                .ret = &ret
        };
        struct fib6_info *rt;
        struct fib6_node *fn;

        /* Get the "current" route for this destination and
         * check if the redirect has come from appropriate router.
         *
         * RFC 4861 specifies that redirects should only be
         * accepted if they come from the nexthop to the target.
         * Due to the way the routes are chosen, this notion
         * is a bit fuzzy and one might need to check all possible
         * routes.
         */

        rcu_read_lock();
        fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        /* rt is assigned by the iterator; it is not set anywhere else
         * before being read below.
         */
        for_each_fib6_node_rt_rcu(fn) {
                res.f6i = rt;
                if (fib6_check_expired(rt))
                        continue;
                if (rt->fib6_flags & RTF_REJECT)
                        break;
                if (unlikely(rt->nh)) {
                        if (nexthop_is_blackhole(rt->nh))
                                continue;
                        /* on match, res->nh is filled in and potentially ret */
                        if (nexthop_for_each_fib6_nh(rt->nh,
                                                     fib6_nh_redirect_match,
                                                     &arg))
                                goto out;
                } else {
                        res.nh = rt->fib6_nh;
                        if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
                                                  &ret))
                                goto out;
                }
        }

        /* No nexthop matched in this node. */
        if (!rt)
                rt = net->ipv6.fib6_null_entry;
        else if (rt->fib6_flags & RTF_REJECT) {
                ret = net->ipv6.ip6_null_entry;
                goto out;
        }

        /* Nothing usable here: backtrack and retry if another node exists. */
        if (rt == net->ipv6.fib6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
                        goto restart;
        }

        res.f6i = rt;
        res.nh = rt->fib6_nh;
out:
        if (ret) {
                ip6_hold_safe(net, &ret);
        } else {
                res.fib6_flags = res.f6i->fib6_flags;
                res.fib6_type = res.f6i->fib6_type;
                ret = ip6_create_rt_rcu(&res);
        }

        rcu_read_unlock();

        trace_fib6_table_lookup(net, &res, table, fl6);
        return ret;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
                                            const struct flowi6 *fl6,
                                            const struct sk_buff *skb,
                                            const struct in6_addr *gateway)
{
        int flags = RT6_LOOKUP_F_HAS_SADDR;
        struct ip6rd_flowi rdfl;

        rdfl.fl6 = *fl6;
        rdfl.gateway = *gateway;

        return fib6_rule_lookup(net, &rdfl.fl6, skb,
                                flags, __ip6_route_redirect);
}

/* Process a redirect for the flow described by the IPv6 header found at
 * skb->data.
 *
 * The redirecting gateway is the source address of the skb's network
 * header. The matching route is looked up, passed to rt6_do_redirect()
 * and then released.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid)
{
        const struct ipv6hdr *hdr = (struct ipv6hdr *) skb->data;
        const struct in6_addr *gw = &ipv6_hdr(skb)->saddr;
        struct flowi6 fl6 = {
                .flowi6_iif = LOOPBACK_IFINDEX,
                .flowi6_oif = oif,
                .flowi6_mark = mark,
                .daddr = hdr->daddr,
                .saddr = hdr->saddr,
                .flowlabel = ip6_flowinfo(hdr),
                .flowi6_uid = uid,
        };
        struct dst_entry *route;

        route = ip6_route_redirect(net, &fl6, skb, gw);
        rt6_do_redirect(route, NULL, skb);
        dst_release(route);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

/* Process a redirect using only the redirect message itself.
 *
 * The flow's destination is the message's dest field and its source is
 * the destination address of the skb's IPv6 header. The gateway is that
 * header's source address.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
        const struct rd_msg *rd = (struct rd_msg *)icmp6_hdr(skb);
        const struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct flowi6 fl6 = {
                .flowi6_iif = LOOPBACK_IFINDEX,
                .flowi6_oif = oif,
                .daddr = rd->dest,
                .saddr = hdr->daddr,
                .flowi6_uid = sock_net_uid(net, NULL),
        };
        struct dst_entry *route;

        route = ip6_route_redirect(net, &fl6, skb, &hdr->saddr);
        rt6_do_redirect(route, NULL, skb);
        dst_release(route);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
                     READ_ONCE(sk->sk_mark), sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

/* Default advertised MSS for @dst: the dst MTU minus the IPv6 and TCP
 * header sizes, raised to the namespace's ip6_rt_min_advmss sysctl when
 * it falls below it.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
        struct net_device *dev = dst->dev;
        unsigned int advmss = dst_mtu(dst);
        struct net *net;

        advmss -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

        rcu_read_lock();

        net = dev_net_rcu(dev);
        if (advmss < net->ipv6.sysctl.ip6_rt_min_advmss)
                advmss = net->ipv6.sysctl.ip6_rt_min_advmss;

        rcu_read_unlock();

        /*
         * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
         * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
         * IPV6_MAXPLEN is also valid and means: "any MSS,
         * rely only on pmtu discovery"
         */
        return advmss > IPV6_MAXPLEN - sizeof(struct tcphdr) ?
               IPV6_MAXPLEN : advmss;
}

/* MTU callback for @dst: ip6_dst_mtu_maybe_forward() with its second
 * argument set to false.
 */
INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
{
        unsigned int mtu = ip6_dst_mtu_maybe_forward(dst, false);

        return mtu;
}
EXPORT_INDIRECT_CALLABLE(ip6_mtu);

/* Pick the MTU for a fib lookup result, in this order:
 * 1. a locked, non-zero MTU metric on the route
 * 2. the MTU of a cached exception route for daddr/saddr
 * 3. the egress device's IPv6 MTU, but never below IPV6_MIN_MTU
 *
 * Choices 2 and 3 are capped at IP6_MAX_MTU. The lightweight tunnel
 * headroom of the nexthop is subtracted from whichever value is chosen.
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(const struct fib6_result *res,
                      const struct in6_addr *daddr,
                      const struct in6_addr *saddr)
{
        const struct fib6_nh *nh = res->nh;
        struct fib6_info *f6i = res->f6i;
        u32 mtu = 0;

        if (unlikely(fib6_metric_locked(f6i, RTAX_MTU)))
                mtu = f6i->fib6_pmtu;

        /* mtu is still zero unless a locked, non-zero metric was found. */
        if (!mtu) {
                struct rt6_info *cached = rt6_find_cached_rt(res, daddr, saddr);

                if (unlikely(cached)) {
                        mtu = dst_metric_raw(&cached->dst, RTAX_MTU);
                } else {
                        struct inet6_dev *idev = __in6_dev_get(nh->fib_nh_dev);

                        mtu = IPV6_MIN_MTU;
                        if (idev)
                                mtu = max_t(u32, mtu,
                                            READ_ONCE(idev->cnf.mtu6));
                }

                mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
        }

        return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}

/* Allocate a host route (/128) to the flow's destination on @dev and run
 * it through xfrm_lookup().
 *
 * Returns the result of xfrm_lookup(), ERR_PTR(-ENODEV) when @dev has no
 * inet6 device, or ERR_PTR(-ENOMEM) when the route cannot be allocated.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
                                  struct flowi6 *fl6)
{
        struct inet6_dev *idev = in6_dev_get(dev);
        struct net *net = dev_net(dev);
        struct rt6_info *rt;

        if (unlikely(!idev))
                return ERR_PTR(-ENODEV);

        rt = ip6_dst_alloc(net, dev, 0);
        if (unlikely(!rt)) {
                /* Give back the inet6 device reference taken above. */
                in6_dev_put(idev);
                return ERR_PTR(-ENOMEM);
        }

        rt->dst.input = ip6_input;
        rt->dst.output = ip6_output;
        rt->rt6i_gateway = fl6->daddr;
        rt->rt6i_dst.addr = fl6->daddr;
        rt->rt6i_dst.plen = 128;
        rt->rt6i_idev = idev;
        dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

        /* Add this dst into uncached_list so that rt6_disable_ip() can
         * do proper release of the net_device
         */
        rt6_uncached_list_add(rt);

        return xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
}

/* dst garbage-collection callback for the IPv6 dst ops of a namespace.
 *
 * A gc pass runs only if the minimum interval since the last pass has
 * elapsed. Every call, whether or not a pass ran, decays the gc expire
 * value by val >> elasticity.
 */
static void ip6_dst_gc(struct dst_ops *ops)
{
        struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
        int min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
        int elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
        int gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
        unsigned long last_gc = net->ipv6.ip6_rt_last_gc;
        unsigned int expire;
        int entries;

        if (!time_after(last_gc + min_interval, jiffies)) {
                fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire),
                            net, true);

                /* Below the threshold: reset expire to half the gc timeout. */
                entries = dst_entries_get_slow(ops);
                if (entries < ops->gc_thresh)
                        atomic_set(&net->ipv6.ip6_rt_gc_expire,
                                   gc_timeout >> 1);
        }

        expire = atomic_read(&net->ipv6.ip6_rt_gc_expire);
        atomic_set(&net->ipv6.ip6_rt_gc_expire,
                   expire - (expire >> elasticity));
}

/* Look up gateway address @gw_addr in FIB table @tbid, filling @res.
 *
 * The lookup ignores link state and treats the configured preferred
 * source, when set, as the flow's source address. Returns -EINVAL if the
 * table does not exist, otherwise the result of fib6_table_lookup(). On
 * a successful lookup of anything but the null entry, a path is selected
 * as well.
 */
static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
                               const struct in6_addr *gw_addr, u32 tbid,
                               int flags, struct fib6_result *res)
{
        struct flowi6 fl6 = {
                .flowi6_oif = cfg->fc_ifindex,
                .daddr = *gw_addr,
                .saddr = cfg->fc_prefsrc,
        };
        struct fib6_table *tb;
        int rc;

        tb = fib6_get_table(net, tbid);
        if (!tb)
                return -EINVAL;

        flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
        if (!ipv6_addr_any(&cfg->fc_prefsrc))
                flags |= RT6_LOOKUP_F_HAS_SADDR;

        rc = fib6_table_lookup(net, tb, cfg->fc_ifindex, &fl6, res, flags);
        if (rc)
                return rc;

        if (res->f6i != net->ipv6.fib6_null_entry)
                fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
                                 cfg->fc_ifindex != 0, NULL, flags);

        return 0;
}

/* Validate an onlink gateway against the table associated with @dev (the
 * l3mdev table when there is one, otherwise the main table).
 *
 * A lookup error is returned as is. A hit on a reject route or on the
 * default route is accepted. Any other hit must be a unicast route using
 * @dev, otherwise -EINVAL is returned with an extack message.
 */
static int ip6_route_check_nh_onlink(struct net *net,
                                     struct fib6_config *cfg,
                                     const struct net_device *dev,
                                     struct netlink_ext_ack *extack)
{
        u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
        struct fib6_result res = {};
        int rc;

        rc = ip6_nh_lookup_table(net, cfg, &cfg->fc_gateway, tbid, 0, &res);
        if (rc)
                return rc;

        if (res.fib6_flags & RTF_REJECT)
                return 0;

        /* ignore match if it is the default route */
        if (ipv6_addr_any(&res.f6i->fib6_dst.addr))
                return 0;

        if (res.fib6_type == RTN_UNICAST && dev == res.nh->fib_nh_dev)
                return 0;

        NL_SET_ERR_MSG(extack,
                       "Nexthop has invalid gateway or device mismatch");
        return -EINVAL;
}

/* Check that the configured gateway is directly reachable and resolve the
 * egress device if the caller did not supply one.
 *
 * @_dev:        in/out device pointer; when it holds NULL on entry, the
 *               device of the resolved nexthop is stored through it
 * @dev_tracker: tracker used for the reference taken on a resolved device
 * @idev:        receives the inet6 device of a resolved device
 *
 * The gateway is first looked up in cfg->fc_table, if one is set. If
 * that lookup is not acceptable, or no table was given, a full
 * fib6_lookup() is tried. Returns 0 on success or -EHOSTUNREACH.
 */
static int ip6_route_check_nh(struct net *net,
                              struct fib6_config *cfg,
                              struct net_device **_dev,
                              netdevice_tracker *dev_tracker,
                              struct inet6_dev **idev)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        struct net_device *dev = _dev ? *_dev : NULL;
        int flags = RT6_LOOKUP_F_IFACE;
        struct fib6_result res = {};
        /* Stays negative when no table is configured, which forces the
         * fallback lookup below.
         */
        int err = -EHOSTUNREACH;

        if (cfg->fc_table) {
                err = ip6_nh_lookup_table(net, cfg, gw_addr,
                                          cfg->fc_table, flags, &res);
                /* gw_addr can not require a gateway or resolve to a reject
                 * route. If a device is given, it must match the result.
                 */
                if (err || res.fib6_flags & RTF_REJECT ||
                    res.nh->fib_nh_gw_family ||
                    (dev && dev != res.nh->fib_nh_dev))
                        err = -EHOSTUNREACH;
        }

        if (err < 0) {
                /* Fallback: full lookup, without the preferred source. */
                struct flowi6 fl6 = {
                        .flowi6_oif = cfg->fc_ifindex,
                        .daddr = *gw_addr,
                };

                err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
                if (err || res.fib6_flags & RTF_REJECT ||
                    res.nh->fib_nh_gw_family)
                        err = -EHOSTUNREACH;

                if (err)
                        return err;

                fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
                                 cfg->fc_ifindex != 0, NULL, flags);
        }

        err = 0;
        if (dev) {
                /* A caller-supplied device must match the resolved nexthop. */
                if (dev != res.nh->fib_nh_dev)
                        err = -EHOSTUNREACH;
        } else {
                /* No device supplied: adopt the nexthop's device and take
                 * references on it and on its inet6 device.
                 */
                *_dev = dev = res.nh->fib_nh_dev;
                netdev_hold(dev, dev_tracker, GFP_ATOMIC);
                *idev = in6_dev_get(dev);
        }

        return err;
}

/* Validate the gateway of a route/nexthop configuration.
 *
 * Rejects gateways that are local addresses, that are neither unicast nor
 * IPv4-mapped, or that fail the reachability checks. A gateway that is
 * not link-local unicast is checked with ip6_route_check_nh_onlink() or
 * ip6_route_check_nh(), the latter of which may also resolve *_dev. After
 * that an egress device must be known and must not be loopback.
 *
 * Returns 0 on success or a negative errno. The local checks fail with
 * -EINVAL and an extack message.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
                           struct net_device **_dev,
                           netdevice_tracker *dev_tracker,
                           struct inet6_dev **idev,
                           struct netlink_ext_ack *extack)
{
        const struct in6_addr *gw_addr = &cfg->fc_gateway;
        int gwa_type = ipv6_addr_type(gw_addr);
        bool skip_dev = !(gwa_type & IPV6_ADDR_LINKLOCAL);
        const struct net_device *dev = *_dev;
        /* With no device yet, the local-address test runs after resolution. */
        bool need_addr_check = !dev;
        int err;

        /* if gw_addr is local we will fail to detect this in case
         * address is still TENTATIVE (DAD in progress). rt6_lookup()
         * will return already-added prefix route via interface that
         * prefix route was assigned to, which might be non-loopback.
         */
        if (dev &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                return -EINVAL;
        }

        if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
                /* IPv6 normally forbids nexthop addresses that are not
                 * link-local, since a router could otherwise not send
                 * redirects. In some rare circumstances (SIT, PtP, NBMA
                 * NOARP links) it is handy to allow exceptions. --ANK
                 * IPv4-mapped nexthops are allowed to support RFC4798-type
                 * addressing.
                 */
                if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
                        NL_SET_ERR_MSG(extack, "Invalid gateway address");
                        return -EINVAL;
                }

                rcu_read_lock();

                if (cfg->fc_flags & RTNH_F_ONLINK)
                        err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
                else
                        err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
                                                 idev);

                rcu_read_unlock();

                if (err)
                        return err;
        }

        /* reload in case device was changed */
        dev = *_dev;

        if (!dev) {
                NL_SET_ERR_MSG(extack, "Egress device not specified");
                return -EINVAL;
        }

        if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG(extack,
                               "Egress device can not be loopback device for this route");
                return -EINVAL;
        }

        /* if we did not check gw_addr above, do so now that the
         * egress device has been resolved.
         */
        if (need_addr_check &&
            ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
                NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
                return -EINVAL;
        }

        return 0;
}

/* Decide whether a route described by @flags, @dev and @addr_type must be
 * handled as a reject route: either RTF_REJECT was requested explicitly, or
 * the route would leave through a loopback device without being a loopback
 * destination, an anycast route or a local route.
 */
static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
        /* explicit reject request */
        if (flags & RTF_REJECT)
                return true;

        /* everything below only concerns routes through a loopback device */
        if (!dev || !(dev->flags & IFF_LOOPBACK))
                return false;

        /* loopback destinations may legitimately use the loopback device */
        if (addr_type & IPV6_ADDR_LOOPBACK)
                return false;

        return !(flags & (RTF_ANYCAST | RTF_LOCAL));
}

/* fib6_nh_init - set up an IPv6 nexthop from a route configuration
 * @net: network namespace used for device lookup and the loopback device
 * @fib6_nh: nexthop to initialize
 * @cfg: route configuration (ifindex, flags, gateway, encap, ...)
 * @gfp_flags: allocation flags for device tracking and per-cpu memory
 * @extack: netlink extended ack that receives a human readable error
 *
 * Returns 0 on success or a negative errno.  Apart from the fc_is_fdb
 * shortcut, success leaves fib_nh_dev/fib_nh_oif pointing at the egress
 * device, with the device reference tracked by fib_nh_dev_tracker kept.
 * On failure the common nexthop state is released and netdev_put() is
 * called on the device before returning.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
                 struct fib6_config *cfg, gfp_t gfp_flags,
                 struct netlink_ext_ack *extack)
{
        netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
        struct net_device *dev = NULL;
        struct inet6_dev *idev = NULL;
        int addr_type;
        int err;

        fib6_nh->fib_nh_family = AF_INET6;
#ifdef CONFIG_IPV6_ROUTER_PREF
        fib6_nh->last_probe = jiffies;
#endif
        /* FDB nexthops only record the gateway; no device is resolved */
        if (cfg->fc_is_fdb) {
                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
                return 0;
        }

        /* default error for the device lookups and the onlink check below */
        err = -ENODEV;
        if (cfg->fc_ifindex) {
                dev = netdev_get_by_index(net, cfg->fc_ifindex,
                                          dev_tracker, gfp_flags);
                if (!dev)
                        goto out;
                idev = in6_dev_get(dev);
                if (!idev)
                        goto out;
        }

        if (cfg->fc_flags & RTNH_F_ONLINK) {
                /* err is still -ENODEV on this path */
                if (!dev) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop device required for onlink");
                        goto out;
                }

                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        err = -ENETDOWN;
                        goto out;
                }

                fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
        }

        fib6_nh->fib_nh_weight = 1;

        /* We cannot add true routes via loopback here,
         * they would result in kernel looping; promote them to reject routes
         */
        addr_type = ipv6_addr_type(&cfg->fc_dst);
        if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
                /* hold loopback dev/idev if we haven't done so. */
                if (dev != net->loopback_dev) {
                        /* swap the references over to the loopback device */
                        if (dev) {
                                netdev_put(dev, dev_tracker);
                                in6_dev_put(idev);
                        }
                        dev = net->loopback_dev;
                        netdev_hold(dev, dev_tracker, gfp_flags);
                        idev = in6_dev_get(dev);
                        if (!idev) {
                                err = -ENODEV;
                                goto out;
                        }
                }
                /* reject routes skip gateway validation, the device state
                 * checks and fib_nh_common_init()
                 */
                goto pcpu_alloc;
        }

        if (cfg->fc_flags & RTF_GATEWAY) {
                /* dev and idev are passed by reference and are re-checked
                 * below once validation returns
                 */
                err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
                                      &idev, extack);
                if (err)
                        goto out;

                fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
                fib6_nh->fib_nh_gw_family = AF_INET6;
        }

        err = -ENODEV;
        if (!dev)
                goto out;

        if (!idev || idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
                err = -EACCES;
                goto out;
        }

        /* a down device is tolerated only when the caller asked for it */
        if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                err = -ENETDOWN;
                goto out;
        }

        /* local and anycast routes are not marked linkdown on carrier loss */
        if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
            !netif_carrier_ok(dev))
                fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

        err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
                                 cfg->fc_encap_type, cfg, gfp_flags, extack);
        if (err)
                goto out;

pcpu_alloc:
        fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
        if (!fib6_nh->rt6i_pcpu) {
                err = -ENOMEM;
                goto out;
        }

        fib6_nh->fib_nh_dev = dev;
        fib6_nh->fib_nh_oif = dev->ifindex;
        err = 0;
out:
        /* idev was only needed for validation; drop it on every path */
        if (idev)
                in6_dev_put(idev);

        if (err) {
                /* undo whatever fib_nh_common_init() may have set up and
                 * clear the pointers so that they are not released twice
                 */
                fib_nh_common_release(&fib6_nh->nh_common);
                fib6_nh->nh_common.nhc_pcpu_rth_output = NULL;
                fib6_nh->fib_nh_lws = NULL;
                netdev_put(dev, dev_tracker);
        }

        return err;
}

/* Tear down the state hanging off @fib6_nh: cached exception routes and
 * their bucket, the per-cpu cached routes, the per-cpu pointer array and
 * finally the common nexthop state.
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
        struct rt6_exception_bucket *excptn_bucket;

        rcu_read_lock();

        fib6_nh_flush_exceptions(fib6_nh, NULL);

        excptn_bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
        if (excptn_bucket) {
                /* clear the published pointer before freeing the bucket */
                rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
                kfree(excptn_bucket);
        }

        rcu_read_unlock();

        /* release the per-cpu cached routes, then the array holding them */
        fib6_nh_release_dsts(fib6_nh);
        free_percpu(fib6_nh->rt6i_pcpu);

        fib_nh_common_release(&fib6_nh->nh_common);
}

/* Detach and release every per-cpu cached route of @fib6_nh.  The per-cpu
 * pointer array itself stays allocated.
 */
void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
{
        int cpu;

        /* no per-cpu array, nothing to release */
        if (!fib6_nh->rt6i_pcpu)
                return;

        for_each_possible_cpu(cpu) {
                struct rt6_info **slot = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
                /* take the cached entry out of the slot in one step */
                struct rt6_info *cached = xchg(slot, NULL);

                if (!cached)
                        continue;

                dst_dev_put(&cached->dst);
                dst_release(&cached->dst);
        }
}

/* ip6_route_info_create - build a fib6_info from a route configuration
 * @cfg: route configuration to translate
 * @gfp_flags: allocation flags for the entry and its nexthop
 * @extack: netlink extended ack that receives a human readable error
 *
 * Validates the request, looks up or creates the FIB table, allocates the
 * entry and attaches either an existing nexthop object (cfg->fc_nh_id) or
 * an embedded fib6_nh set up by fib6_nh_init().  The entry is not inserted
 * into the table here.
 *
 * Returns the new entry, or an ERR_PTR() encoded negative errno.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
                                              gfp_t gfp_flags,
                                              struct netlink_ext_ack *extack)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        struct fib6_info *rt = NULL;
        struct nexthop *nh = NULL;
        struct fib6_table *table;
        struct fib6_nh *fib6_nh;
        int err = -EINVAL;
        int addr_type;

        /* RTF_PCPU is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_PCPU) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
                goto out;
        }

        /* RTF_CACHE is an internal flag; can not be set by userspace */
        if (cfg->fc_flags & RTF_CACHE) {
                NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
                goto out;
        }

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                goto out;
        }

        if (cfg->fc_dst_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                goto out;
        }
        if (cfg->fc_src_len > 128) {
                NL_SET_ERR_MSG(extack, "Invalid source address length");
                goto out;
        }
#ifndef CONFIG_IPV6_SUBTREES
        if (cfg->fc_src_len) {
                NL_SET_ERR_MSG(extack,
                               "Specifying source address requires IPV6_SUBTREES to be enabled");
                goto out;
        }
#endif
        /* a nexthop id selects an existing nexthop object instead of an
         * embedded fib6_nh
         */
        if (cfg->fc_nh_id) {
                nh = nexthop_find_by_id(net, cfg->fc_nh_id);
                if (!nh) {
                        NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                        goto out;
                }
                err = fib6_check_nexthop(nh, cfg, extack);
                if (err)
                        goto out;
        }

        err = -ENOBUFS;
        if (cfg->fc_nlinfo.nlh &&
            !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
                table = fib6_get_table(net, cfg->fc_table);
                if (!table) {
                        /* the table is still created after the warning */
                        pr_warn("NLM_F_CREATE should be specified when creating new route\n");
                        table = fib6_new_table(net, cfg->fc_table);
                }
        } else {
                table = fib6_new_table(net, cfg->fc_table);
        }

        if (!table)
                goto out;

        err = -ENOMEM;
        /* the second argument asks for an embedded fib6_nh only when no
         * nexthop object is used
         */
        rt = fib6_info_alloc(gfp_flags, !nh);
        if (!rt)
                goto out;

        rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
                                               extack);
        if (IS_ERR(rt->fib6_metrics)) {
                err = PTR_ERR(rt->fib6_metrics);
                /* Do not leave garbage there. */
                rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
                goto out_free;
        }

        if (cfg->fc_flags & RTF_ADDRCONF)
                rt->dst_nocount = true;

        if (cfg->fc_flags & RTF_EXPIRES)
                fib6_set_expires(rt, jiffies +
                                clock_t_to_jiffies(cfg->fc_expires));

        /* note: this writes the defaulted protocol back into @cfg */
        if (cfg->fc_protocol == RTPROT_UNSPEC)
                cfg->fc_protocol = RTPROT_BOOT;
        rt->fib6_protocol = cfg->fc_protocol;

        rt->fib6_table = table;
        rt->fib6_metric = cfg->fc_metric;
        rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
        /* RTF_GATEWAY is not stored in the entry's flags */
        rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

        ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
        rt->fib6_dst.plen = cfg->fc_dst_len;

#ifdef CONFIG_IPV6_SUBTREES
        ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
        rt->fib6_src.plen = cfg->fc_src_len;
#endif
        if (nh) {
                if (rt->fib6_src.plen) {
                        NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
                        err = -EINVAL;
                        goto out_free;
                }
                if (!nexthop_get(nh)) {
                        NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
                        err = -ENOENT;
                        goto out_free;
                }
                rt->nh = nh;
                fib6_nh = nexthop_fib6_nh(rt->nh);
        } else {
                err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
                if (err)
                        goto out;

                fib6_nh = rt->fib6_nh;

                /* We cannot add true routes via loopback here, they would
                 * result in kernel looping; promote them to reject routes
                 */
                addr_type = ipv6_addr_type(&cfg->fc_dst);
                if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
                                   addr_type))
                        rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
        }

        /* a preferred source address must pass ipv6_chk_addr() against the
         * nexthop device
         */
        if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
                struct net_device *dev = fib6_nh->fib_nh_dev;

                if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
                        NL_SET_ERR_MSG(extack, "Invalid source address");
                        err = -EINVAL;
                        goto out;
                }
                rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
                rt->fib6_prefsrc.plen = 128;
        } else
                rt->fib6_prefsrc.plen = 0;

        return rt;
out:
        /* common error exit: the entry (possibly still NULL) is handed to
         * fib6_info_release()
         */
        fib6_info_release(rt);
        return ERR_PTR(err);
out_free:
        /* error exit used before a nexthop is attached: put the metrics
         * and free the entry directly
         */
        ip_fib_metrics_put(rt->fib6_metrics);
        kfree(rt);
        return ERR_PTR(err);
}

/* Create a fib entry from @cfg and insert it into its table.
 *
 * Returns 0 on success or a negative errno, either from building the entry
 * or from the insertion itself.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
                  struct netlink_ext_ack *extack)
{
        struct fib6_info *f6i = ip6_route_info_create(cfg, gfp_flags, extack);
        int ret;

        if (IS_ERR(f6i))
                return PTR_ERR(f6i);

        ret = __ip6_ins_rt(f6i, &cfg->fc_nlinfo, extack);

        /* drop the local reference whether or not the insertion worked */
        fib6_info_release(f6i);

        return ret;
}

/* Delete @rt from its table under the table lock and release the caller's
 * reference on it.  The per-namespace null entry is never deleted; -ENOENT
 * is returned for it instead.
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
        struct net *net = info->nl_net;
        int err = -ENOENT;

        if (rt != net->ipv6.fib6_null_entry) {
                struct fib6_table *table = rt->fib6_table;

                spin_lock_bh(&table->tb6_lock);
                err = fib6_del(rt, info);
                spin_unlock_bh(&table->tb6_lock);
        }

        /* the reference is released on both paths */
        fib6_info_release(rt);
        return err;
}

/* Delete @rt on behalf of a caller that has no netlink request at hand;
 * @skip_notify is passed on through the nl_info given to __ip6_del_rt().
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
        struct nl_info del_info = {
                .skip_notify = skip_notify,
                .nl_net = net,
        };

        return __ip6_del_rt(rt, &del_info);
}

/* Delete @rt and, when cfg->fc_delete_all_nh is set and @rt has siblings,
 * all of its sibling routes as well.
 *
 * In the multi-sibling case one RTM_DELROUTE message describing the route
 * is built up front and sent after the table lock has been dropped; the
 * per-route notifications are suppressed when that message could be built.
 *
 * The caller's reference on @rt is released.  Returns 0 on success or a
 * negative errno (-ENOENT for the null entry).
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
        struct nl_info *info = &cfg->fc_nlinfo;
        struct net *net = info->nl_net;
        struct sk_buff *skb = NULL;
        struct fib6_table *table;
        int err = -ENOENT;

        if (rt == net->ipv6.fib6_null_entry)
                goto out_put;
        table = rt->fib6_table;
        spin_lock_bh(&table->tb6_lock);

        if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
                struct fib6_info *sibling, *next_sibling;
                struct fib6_node *fn;

                /* prefer to send a single notification with all hops */
                skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
                if (skb) {
                        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

                        if (rt6_fill_node(net, skb, rt, NULL,
                                          NULL, NULL, 0, RTM_DELROUTE,
                                          info->portid, seq, 0) < 0) {
                                /* fall back to per-route notifications */
                                kfree_skb(skb);
                                skb = NULL;
                        } else
                                info->skip_notify = 1;
                }

                /* 'rt' points to the first sibling route. If it is not the
                 * leaf, then we do not need to send a notification. Otherwise,
                 * we need to check if the last sibling has a next route or not
                 * and emit a replace or delete notification, respectively.
                 */
                info->skip_notify_kernel = 1;
                fn = rcu_dereference_protected(rt->fib6_node,
                                            lockdep_is_held(&table->tb6_lock));
                if (rcu_access_pointer(fn->leaf) == rt) {
                        struct fib6_info *last_sibling, *replace_rt;

                        last_sibling = list_last_entry(&rt->fib6_siblings,
                                                       struct fib6_info,
                                                       fib6_siblings);
                        replace_rt = rcu_dereference_protected(
                                            last_sibling->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                        if (replace_rt)
                                call_fib6_entry_notifiers_replace(net,
                                                                  replace_rt);
                        else
                                call_fib6_multipath_entry_notifiers(net,
                                                       FIB_EVENT_ENTRY_DEL,
                                                       rt, rt->fib6_nsiblings,
                                                       NULL);
                }
                /* the _safe iterator is used because each sibling is
                 * deleted while walking the list
                 */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings,
                                         fib6_siblings) {
                        err = fib6_del(sibling, info);
                        if (err)
                                goto out_unlock;
                }
        }

        err = fib6_del(rt, info);
out_unlock:
        spin_unlock_bh(&table->tb6_lock);
out_put:
        fib6_info_release(rt);

        /* the combined notification goes out after the lock is dropped;
         * note that it is sent even if one of the deletions failed
         */
        if (skb) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
        }
        return err;
}

/* Remove the cached exception route @rt if it matches the device and
 * gateway constraints given in @cfg.  Returns -ESRCH when @rt does not
 * match, otherwise the result of removing the exception.
 */
static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
        /* an ifindex in the request must match the route's device */
        if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
                return -ESRCH;

        /* a gateway in the request must match the route's gateway */
        if ((cfg->fc_flags & RTF_GATEWAY) &&
            !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
                return -ESRCH;

        return rt6_remove_exception_rt(rt);
}

static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
                             struct fib6_nh *nh)
{
        struct fib6_result res = {
                .f6i = rt,
                .nh = nh,
        };
        struct rt6_info *rt_cache;

        rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
        if (rt_cache)
                return __ip6_del_cached_rt(rt_cache, cfg);

        return 0;
}

/* Argument bundle passed as the opaque pointer to fib6_nh_del_cached_rt()
 * when it is invoked through nexthop_for_each_fib6_nh().
 */
struct fib6_nh_del_cached_rt_arg {
        struct fib6_config *cfg;        /* delete request being processed */
        struct fib6_info *f6i;          /* fib entry whose cached routes are searched */
};

/* Per-nexthop callback: try to delete the cached route below @nh.  A
 * "no match" result (-ESRCH) is reported as 0; any other result is
 * returned unchanged.
 */
static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_del_cached_rt_arg *arg = _arg;
        int rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);

        if (rc == -ESRCH)
                return 0;

        return rc;
}

static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
{
        struct fib6_nh_del_cached_rt_arg arg = {
                .cfg = cfg,
                .f6i = f6i
        };

        return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
}

/* ip6_route_del - delete the route or cached exception matching @cfg
 * @cfg: delete request (table, prefix, and optional nexthop id, metric,
 *       protocol, ifindex and gateway filters)
 * @extack: netlink extended ack that receives a human readable error
 *
 * Walks the routes of the fib node located for the requested prefix under
 * rcu_read_lock() and acts on the first entry that passes all filters.
 *
 * Returns -ESRCH when the table does not exist or nothing matches,
 * otherwise the result of the delete helper that was invoked.
 */
static int ip6_route_del(struct fib6_config *cfg,
                         struct netlink_ext_ack *extack)
{
        struct fib6_table *table;
        struct fib6_info *rt;
        struct fib6_node *fn;
        int err = -ESRCH;

        table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
        if (!table) {
                NL_SET_ERR_MSG(extack, "FIB table does not exist");
                return err;
        }

        rcu_read_lock();

        /* the last argument is false for RTF_CACHE requests, true otherwise */
        fn = fib6_locate(&table->tb6_root,
                         &cfg->fc_dst, cfg->fc_dst_len,
                         &cfg->fc_src, cfg->fc_src_len,
                         !(cfg->fc_flags & RTF_CACHE));

        if (fn) {
                for_each_fib6_node_rt_rcu(fn) {
                        struct fib6_nh *nh;

                        /* a requested nexthop id must match the entry's */
                        if (rt->nh && cfg->fc_nh_id &&
                            rt->nh->id != cfg->fc_nh_id)
                                continue;

                        /* RTF_CACHE: delete a cached exception route rather
                         * than the fib entry itself
                         */
                        if (cfg->fc_flags & RTF_CACHE) {
                                int rc = 0;

                                if (rt->nh) {
                                        rc = ip6_del_cached_rt_nh(cfg, rt);
                                } else if (cfg->fc_nh_id) {
                                        continue;
                                } else {
                                        nh = rt->fib6_nh;
                                        rc = ip6_del_cached_rt(cfg, rt, nh);
                                }
                                /* anything but "no match" ends the search */
                                if (rc != -ESRCH) {
                                        rcu_read_unlock();
                                        return rc;
                                }
                                continue;
                        }

                        if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
                                continue;
                        if (cfg->fc_protocol &&
                            cfg->fc_protocol != rt->fib6_protocol)
                                continue;

                        /* entries using a nexthop object are not filtered
                         * by device or gateway
                         */
                        if (rt->nh) {
                                if (!fib6_info_hold_safe(rt))
                                        continue;
                                rcu_read_unlock();

                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);
                        }
                        /* a nexthop id was requested but this entry has none */
                        if (cfg->fc_nh_id)
                                continue;

                        nh = rt->fib6_nh;
                        if (cfg->fc_ifindex &&
                            (!nh->fib_nh_dev ||
                             nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
                                continue;
                        if (cfg->fc_flags & RTF_GATEWAY &&
                            !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
                                continue;
                        /* take a reference before leaving the RCU section;
                         * the delete helpers release it
                         */
                        if (!fib6_info_hold_safe(rt))
                                continue;
                        rcu_read_unlock();

                        /* if gateway was specified only delete the one hop */
                        if (cfg->fc_flags & RTF_GATEWAY)
                                return __ip6_del_rt(rt, &cfg->fc_nlinfo);

                        return __ip6_del_rt_siblings(rt, cfg);
                }
        }
        rcu_read_unlock();

        return err;
}

/* rt6_do_redirect - process a received ICMPv6 Redirect for @dst
 * @dst: route the redirect applies to
 * @sk: socket argument; not used in this function
 * @skb: the received redirect message
 *
 * Validates the message, updates the neighbour entry of the redirect
 * target, installs a cached exception route (RTF_DYNAMIC | RTF_CACHE) for
 * the redirected destination and raises NETEVENT_REDIRECT.  Invalid or
 * unwanted redirects are dropped silently, apart from rate limited debug
 * messages.
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct netevent_redirect netevent;
        struct rt6_info *rt, *nrt = NULL;
        struct fib6_result res = {};
        struct ndisc_options ndopts;
        struct inet6_dev *in6_dev;
        struct neighbour *neigh;
        struct rd_msg *msg;
        int optlen, on_link;
        u8 *lladdr;

        /* length of the ND options that follow the fixed redirect header */
        optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
        optlen -= sizeof(*msg);

        if (optlen < 0) {
                net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
                return;
        }

        msg = (struct rd_msg *)icmp6_hdr(skb);

        if (ipv6_addr_is_multicast(&msg->dest)) {
                net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
                return;
        }

        /* target == destination means the destination is on-link;
         * otherwise the target must be a link-local unicast address
         */
        on_link = 0;
        if (ipv6_addr_equal(&msg->dest, &msg->target)) {
                on_link = 1;
        } else if (ipv6_addr_type(&msg->target) !=
                   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
                return;
        }

        in6_dev = __in6_dev_get(skb->dev);
        if (!in6_dev)
                return;
        /* ignored when forwarding is enabled or accept_redirects is off */
        if (READ_ONCE(in6_dev->cnf.forwarding) ||
            !READ_ONCE(in6_dev->cnf.accept_redirects))
                return;

        /* RFC2461 8.1:
         *        The IP source address of the Redirect MUST be the same as the current
         *        first-hop router for the specified ICMP Destination Address.
         */

        if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
                net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
                return;
        }

        /* the target link-layer address option is optional */
        lladdr = NULL;
        if (ndopts.nd_opts_tgt_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
                                             skb->dev);
                if (!lladdr) {
                        net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
                        return;
                }
        }

        rt = dst_rt6_info(dst);
        if (rt->rt6i_flags & RTF_REJECT) {
                net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
                return;
        }

        /* Redirect received -> path was valid.
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
        dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

        /* NOTE(review): the final argument (1) presumably asks for the
         * entry to be created when missing - confirm against
         * __neigh_lookup().  The reference is dropped at 'out'.
         */
        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
                return;

        /*
         *        We have finally decided to accept it.
         */

        /* the router flags are only set when the target is not on-link */
        ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                                     NEIGH_UPDATE_F_ISROUTER)),
                     NDISC_REDIRECT, &ndopts);

        rcu_read_lock();
        /* nothing to do when the route no longer has a fib entry behind it */
        res.f6i = rcu_dereference(rt->from);
        if (!res.f6i)
                goto out;

        if (res.f6i->nh) {
                struct fib6_nh_match_arg arg = {
                        .dev = dst->dev,
                        .gw = &rt->rt6i_gateway,
                };

                nexthop_for_each_fib6_nh(res.f6i->nh,
                                         fib6_nh_find_match, &arg);

                /* fib6_info uses a nexthop that does not have fib6_nh
                 * using the dst->dev. Should be impossible
                 */
                if (!arg.match)
                        goto out;
                res.nh = arg.match;
        } else {
                res.nh = res.f6i->fib6_nh;
        }

        res.fib6_flags = res.f6i->fib6_flags;
        res.fib6_type = res.f6i->fib6_type;
        nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
        if (!nrt)
                goto out;

        /* an on-link redirect yields a route without RTF_GATEWAY */
        nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
        if (on_link)
                nrt->rt6i_flags &= ~RTF_GATEWAY;

        nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

        /* rt6_insert_exception() will take care of duplicated exceptions */
        if (rt6_insert_exception(nrt, &res)) {
                dst_release_immediate(&nrt->dst);
                goto out;
        }

        netevent.old = &rt->dst;
        netevent.new = &nrt->dst;
        netevent.daddr = &msg->dest;
        netevent.neigh = neigh;
        call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
        rcu_read_unlock();
        neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
/* Find the RTF_ROUTEINFO route for @prefix/@prefixlen that goes through
 * gateway @gwaddr on @dev.  Returns the entry with a reference held, or
 * NULL when the table, the fib node or a matching route does not exist.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
        int ifindex = dev->ifindex;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;
        struct fib6_node *fn;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
        if (fn) {
                for_each_fib6_node_rt_rcu(fn) {
                        struct fib6_nh *nh;

                        /* these routes do not use nexthops */
                        if (rt->nh)
                                continue;

                        nh = rt->fib6_nh;
                        if (nh->fib_nh_dev->ifindex != ifindex)
                                continue;
                        if (!(rt->fib6_flags & RTF_ROUTEINFO))
                                continue;
                        if (!nh->fib_nh_gw_family)
                                continue;
                        if (!ipv6_addr_equal(&nh->fib_nh_gw6, gwaddr))
                                continue;

                        /* stop at the first match a reference can be
                         * taken on
                         */
                        if (fib6_info_hold_safe(rt))
                                break;
                }
        }
        rcu_read_unlock();

        return rt;
}

static struct fib6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev,
                                           unsigned int pref)
{
        struct fib6_config cfg = {
                .fc_metric        = IP6_RT_PRIO_USER,
                .fc_ifindex        = dev->ifindex,
                .fc_dst_len        = prefixlen,
                .fc_flags        = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
                                  RTF_UP | RTF_PREF(pref),
                .fc_protocol = RTPROT_RA,
                .fc_type = RTN_UNICAST,
                .fc_nlinfo.portid = 0,
                .fc_nlinfo.nlh = NULL,
                .fc_nlinfo.nl_net = net,
        };

        cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
        cfg.fc_dst = *prefix;
        cfg.fc_gateway = *gwaddr;

        /* We should treat it as a default route if prefix length is 0. */
        if (!prefixlen)
                cfg.fc_flags |= RTF_DEFAULT;

        ip6_route_add(&cfg, GFP_ATOMIC, NULL);

        return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

struct fib6_info *rt6_get_dflt_router(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
        struct fib6_info *rt;
        struct fib6_table *table;

        table = fib6_get_table(net, tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                struct fib6_nh *nh;

                /* RA routes do not use nexthops */
                if (rt->nh)
                        continue;

                nh = rt->fib6_nh;
                if (dev == nh->fib_nh_dev &&
                    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
                    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
                        break;
        }
        if (rt && !fib6_info_hold_safe(rt))
                rt = NULL;
        rcu_read_unlock();
        return rt;
}

/* Add an expiring default route via @gwaddr on @dev with RTPROT_RA as its
 * protocol, and mark the table as holding a default router when the add
 * succeeds.  Returns whatever rt6_get_dflt_router() finds afterwards,
 * whether or not the add succeeded.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
                                     const struct in6_addr *gwaddr,
                                     struct net_device *dev,
                                     unsigned int pref,
                                     u32 defrtr_usr_metric,
                                     int lifetime)
{
        struct fib6_config cfg = {
                .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
                .fc_metric = defrtr_usr_metric,
                .fc_ifindex = dev->ifindex,
                .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
                            RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
                .fc_protocol = RTPROT_RA,
                .fc_type = RTN_UNICAST,
                .fc_expires = jiffies_to_clock_t(lifetime * HZ),
                .fc_gateway = *gwaddr,
                /* portid and nlh stay zero/NULL */
                .fc_nlinfo.nl_net = net,
        };
        struct fib6_table *table;

        if (ip6_route_add(&cfg, GFP_ATOMIC, NULL) == 0) {
                table = fib6_get_table(dev_net(dev), cfg.fc_table);
                if (table)
                        table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
        }

        return rt6_get_dflt_router(net, gwaddr, dev);
}

/* Delete every route in the root node of @table that carries RTF_DEFAULT
 * or RTF_ADDRCONF, except routes on devices whose accept_ra setting is 2,
 * then clear the table's RT6_TABLE_HAS_DFLT_ROUTER flag.
 */
static void __rt6_purge_dflt_routers(struct net *net,
                                     struct fib6_table *table)
{
        struct fib6_info *rt;

restart:
        rcu_read_lock();
        for_each_fib6_node_rt_rcu(&table->tb6_root) {
                struct net_device *dev = fib6_info_nh_dev(rt);
                struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

                if (!(rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF)))
                        continue;

                /* devices with accept_ra == 2 keep their routes */
                if (idev && idev->cnf.accept_ra == 2)
                        continue;

                if (!fib6_info_hold_safe(rt))
                        continue;

                /* the delete happens outside the RCU section, so the
                 * walk has to start over afterwards
                 */
                rcu_read_unlock();
                ip6_del_rt(net, rt, false);
                goto restart;
        }
        rcu_read_unlock();

        table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/* Walk all FIB tables of @net and purge the default routers of every table
 * flagged RT6_TABLE_HAS_DFLT_ROUTER.
 */
void rt6_purge_dflt_routers(struct net *net)
{
        unsigned int slot;

        rcu_read_lock();

        for (slot = 0; slot < FIB6_TABLE_HASHSZ; slot++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[slot];
                struct fib6_table *table;

                hlist_for_each_entry_rcu(table, chain, tb6_hlist) {
                        if (!(table->flags & RT6_TABLE_HAS_DFLT_ROUTER))
                                continue;

                        __rt6_purge_dflt_routers(net, table);
                }
        }

        rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
                                 struct in6_rtmsg *rtmsg,
                                 struct fib6_config *cfg)
{
        *cfg = (struct fib6_config){
                .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
                         : RT6_TABLE_MAIN,
                .fc_ifindex = rtmsg->rtmsg_ifindex,
                .fc_metric = rtmsg->rtmsg_metric,
                .fc_expires = rtmsg->rtmsg_info,
                .fc_dst_len = rtmsg->rtmsg_dst_len,
                .fc_src_len = rtmsg->rtmsg_src_len,
                .fc_flags = rtmsg->rtmsg_flags,
                .fc_type = rtmsg->rtmsg_type,

                .fc_nlinfo.nl_net = net,

                .fc_dst = rtmsg->rtmsg_dst,
                .fc_src = rtmsg->rtmsg_src,
                .fc_gateway = rtmsg->rtmsg_gateway,
        };
}

/*
 * Handle the SIOCADDRT / SIOCDELRT route ioctls.
 *
 * @net:   namespace the ioctl was issued in
 * @cmd:   ioctl command; anything other than SIOCADDRT or SIOCDELRT is
 *         rejected with -EINVAL
 * @rtmsg: route description
 *
 * The caller needs CAP_NET_ADMIN in the namespace's user namespace
 * (-EPERM otherwise). The add/delete itself runs under rtnl_lock() and its
 * result is returned.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
        struct fib6_config cfg;
        int err;

        if (cmd != SIOCADDRT && cmd != SIOCDELRT)
                return -EINVAL;
        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        rtmsg_to_fib6_config(net, rtmsg, &cfg);

        rtnl_lock();
        if (cmd == SIOCADDRT) {
                /* A zero metric is replaced by the default only when adding */
                if (!cfg.fc_metric)
                        cfg.fc_metric = IP6_RT_PRIO_USER;
                err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
        } else {
                /* cmd was validated above, so this is SIOCDELRT */
                err = ip6_route_del(&cfg, NULL);
        }
        rtnl_unlock();

        return err;
}

/*
 *        Drop the packet on the floor
 *
 * @skb:                  packet to discard; it is freed here
 * @code:                 code sent in the ICMPv6 destination-unreachable
 *                        error
 * @ipstats_mib_noroutes: IPSTATS_MIB_INNOROUTES or IPSTATS_MIB_OUTNOROUTES;
 *                        selects which statistic is incremented
 *
 * Bumps the chosen IPv6 statistic against the selected inet6_dev, sends an
 * ICMPV6_DEST_UNREACH error with @code and frees @skb with the drop reason
 * picked below. Always returns 0.
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net *net = dev_net(dst->dev);
        struct inet6_dev *idev;
        SKB_DR(reason);
        int type;

        /* For an L3 master device, or when the dst device is the loopback
         * device, account against the device identified by the input
         * interface index stored in the skb control block; otherwise use
         * the inet6_dev attached to the dst.
         */
        if (netif_is_l3_master(skb->dev) ||
            dst->dev == net->loopback_dev)
                idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
        else
                idev = ip6_dst_idev(dst);

        switch (ipstats_mib_noroutes) {
        case IPSTATS_MIB_INNOROUTES:
                type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
                /* A destination of type IPV6_ADDR_ANY is counted as an
                 * address error rather than as a missing route.
                 */
                if (type == IPV6_ADDR_ANY) {
                        SKB_DR_SET(reason, IP_INADDRERRORS);
                        IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
                        break;
                }
                SKB_DR_SET(reason, IP_INNOROUTES);
                fallthrough;
        case IPSTATS_MIB_OUTNOROUTES:
                /* NOTE(review): SKB_DR_OR presumably only sets the reason
                 * when none was chosen yet, so the input case above keeps
                 * IP_INNOROUTES - confirm against the macro definition.
                 */
                SKB_DR_OR(reason, IP_OUTNOROUTES);
                IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
                break;
        }

        /* Start over by dropping the dst for l3mdev case */
        if (netif_is_l3_master(skb->dev))
                skb_dst_drop(skb);

        icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
        kfree_skb_reason(skb, reason);
        return 0;
}

/* Input-path discard: drop @skb via ip6_pkt_drop() with ICMPv6 code
 * ICMPV6_NOROUTE, counted as IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_discard(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

/* Output-path discard: drop @skb via ip6_pkt_drop() with ICMPv6 code
 * ICMPV6_NOROUTE, counted as IPSTATS_MIB_OUTNOROUTES. @net and @sk are not
 * used by this function.
 */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* point the skb at the device of its dst before dropping it */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

/* Input-path prohibit: drop @skb via ip6_pkt_drop() with ICMPv6 code
 * ICMPV6_ADM_PROHIBITED, counted as IPSTATS_MIB_INNOROUTES.
 */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

/* Output-path prohibit: drop @skb via ip6_pkt_drop() with ICMPv6 code
 * ICMPV6_ADM_PROHIBITED, counted as IPSTATS_MIB_OUTNOROUTES. @net and @sk
 * are not used by this function.
 */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        /* point the skb at the device of its dst before dropping it */
        skb->dev = skb_dst(skb)->dev;
        return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *        Allocate a fib6_info for a local (unicast / anycast) address.
 *
 * @net:       namespace recorded in the route's netlink info
 * @idev:      inet6 device owning the address
 * @addr:      the address; the route is a /128 for it
 * @anycast:   true selects RTN_ANYCAST/RTF_ANYCAST, false selects
 *             RTN_LOCAL/RTF_LOCAL
 * @gfp_flags: allocation flags handed to ip6_route_info_create()
 * @extack:    extended ack for error reporting
 *
 * Returns whatever ip6_route_info_create() returned, i.e. the new entry or
 * an ERR_PTR(). On success dst_nocount is set, and for non-anycast
 * addresses dst_nopolicy is set when disable_policy is enabled either for
 * all devices or for @idev.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
                                     struct inet6_dev *idev,
                                     const struct in6_addr *addr,
                                     bool anycast, gfp_t gfp_flags,
                                     struct netlink_ext_ack *extack)
{
        struct fib6_config cfg = {
                .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
                .fc_ifindex = idev->dev->ifindex,
                .fc_type = anycast ? RTN_ANYCAST : RTN_LOCAL,
                .fc_flags = RTF_UP | RTF_NONEXTHOP |
                            (anycast ? RTF_ANYCAST : RTF_LOCAL),
                .fc_dst = *addr,
                .fc_dst_len = 128,
                .fc_protocol = RTPROT_KERNEL,
                .fc_nlinfo.nl_net = net,
                .fc_ignore_dev_down = true,
        };
        struct fib6_info *f6i;

        f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
        if (IS_ERR(f6i))
                return f6i;

        f6i->dst_nocount = true;

        /* the policy bypass only applies to local (non-anycast) addresses */
        if (anycast)
                return f6i;

        if (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
            READ_ONCE(idev->cnf.disable_policy))
                f6i->dst_nopolicy = true;

        return f6i;
}

/* Argument bundle handed to fib6_remove_prefsrc(), which removes a deleted
 * ip from prefsrc entries.
 */
struct arg_dev_net_ip {
        struct net *net;        /* namespace the address belonged to */
        struct in6_addr *addr;  /* the address that was deleted */
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
        struct net *net = ((struct arg_dev_net_ip *)arg)->net;
        struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

        if (!rt->nh &&
            rt != net->ipv6.fib6_null_entry &&
            ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) &&
            !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) {
                spin_lock_bh(&rt6_exception_lock);
                /* remove prefsrc entry */
                rt->fib6_prefsrc.plen = 0;
                spin_unlock_bh(&rt6_exception_lock);
        }
        return 0;
}

/*
 * Run fib6_remove_prefsrc() over the routes of the namespace that owns
 * @ifp's device, passing @ifp's address as the one being removed.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
        struct arg_dev_net_ip adni;

        adni.net = dev_net(ifp->idev->dev);
        adni.addr = &ifp->addr;

        fib6_clean_all(adni.net, fib6_remove_prefsrc, &adni);
}

/* Both flags must be set for a route to count as an RA router route. */
#define RTF_RA_ROUTER                (RTF_ADDRCONF | RTF_DEFAULT)

/*
 * fib6_clean_all() callback used when a gateway turns into a host: remove
 * router routes through it and update dst entries.
 *
 * @rt:  route being visited
 * @arg: struct in6_addr of the former gateway
 *
 * Returns -1 for a route that has both RTF_RA_ROUTER flags and whose
 * nexthop gateway equals the given address, 0 otherwise.
 */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
        struct in6_addr *gw = arg;
        struct fib6_nh *nh;
        int is_ra_router;

        /* RA routes do not use nexthops */
        if (rt->nh)
                return 0;

        nh = rt->fib6_nh;
        is_ra_router = (rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER;

        if (is_ra_router && nh->fib_nh_gw_family &&
            ipv6_addr_equal(gw, &nh->fib_nh_gw6))
                return -1;

        /* Cached routes in the exception table need cleaning too: after an
         * ip redirect a cached route may point at a different gateway than
         * its 'parent' route does.
         */
        fib6_nh_exceptions_clean_tohost(nh, gw);

        return 0;
}

/* Run fib6_clean_tohost() over the routes of @net via fib6_clean_all(),
 * with @gateway as the address of the router that became a host.
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
        fib6_clean_all(net, fib6_clean_tohost, gateway);
}

/* Argument for the fib6_ifup() / fib6_ifdown() tree walkers. Only one
 * member of the union is meaningful per walk: fib6_ifup() reads nh_flags,
 * fib6_ifdown() reads event.
 */
struct arg_netdev_event {
        const struct net_device *dev;   /* device the event is about */
        union {
                unsigned char nh_flags; /* nexthop flags to clear (up path) */
                unsigned long event;    /* NETDEV_* event (down path) */
        };
};

/*
 * Find the first route in @rt's fib6 node that has the same metric as @rt
 * and qualifies for ECMP.
 *
 * The node's leaf list is followed through fib6_next. The dereferences are
 * annotated as protected by the table's tb6_lock. Returns NULL when no
 * route matches.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
        struct fib6_node *fn;
        struct fib6_info *cand;

        fn = rcu_dereference_protected(rt->fib6_node,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));

        for (cand = rcu_dereference_protected(fn->leaf,
                        lockdep_is_held(&rt->fib6_table->tb6_lock));
             cand;
             cand = rcu_dereference_protected(cand->fib6_next,
                        lockdep_is_held(&rt->fib6_table->tb6_lock))) {
                if (cand->fib6_metric != rt->fib6_metric)
                        continue;
                if (rt6_qualify_for_ecmp(cand))
                        return cand;
        }

        return NULL;
}

/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
        if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
            (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
             ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
                return true;

        return false;
}

/*
 * Sum the nexthop weights of @rt and all of its siblings, leaving out
 * every route for which rt6_is_dead() is true.
 */
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
        int sum = rt6_is_dead(rt) ? 0 : rt->fib6_nh->fib_nh_weight;
        struct fib6_info *sib;

        list_for_each_entry(sib, &rt->fib6_siblings, fib6_siblings) {
                if (rt6_is_dead(sib))
                        continue;
                sum += sib->fib6_nh->fib_nh_weight;
        }

        return sum;
}

static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
        int upper_bound = -1;

        if (!rt6_is_dead(rt)) {
                *weight += rt->fib6_nh->fib_nh_weight;
                upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
                                                    total) - 1;
        }
        atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
        struct fib6_info *iter;
        int weight = 0;

        rt6_upper_bound_set(rt, &weight, total);

        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                rt6_upper_bound_set(iter, &weight, total);
}

/*
 * Recompute the upper bounds of the multipath route that @rt belongs to.
 * Nothing is done for a route without siblings or for one that is marked
 * should_flush.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
        struct fib6_info *head;

        if (!rt->fib6_nsiblings)
                return;

        /* When the whole multipath route is marked for flushing there is
         * no point in rebalancing after each sibling is removed.
         */
        if (rt->should_flush)
                return;

        /* Lookup evaluates the routes in order, so upper bounds have to be
         * handed out starting at the first sibling.
         */
        head = rt6_multipath_first_sibling(rt);
        if (WARN_ON_ONCE(!head))
                return;

        rt6_multipath_upper_bound_set(head, rt6_multipath_total_weight(head));
}

/*
 * fib6_clean_all() callback for a device coming up.
 *
 * @rt:    route being visited
 * @p_arg: struct arg_netdev_event; dev and nh_flags are used
 *
 * For a route with a builtin nexthop on the event's device (the null
 * entry and routes using nexthop objects are skipped), the given nexthop
 * flags are cleared, fib6_update_sernum_upto_root() is called and the
 * multipath route is rebalanced. Always returns 0.
 */
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
        const struct arg_netdev_event *ev = p_arg;
        struct net *net = dev_net(ev->dev);

        if (rt == net->ipv6.fib6_null_entry || rt->nh)
                return 0;

        if (rt->fib6_nh->fib_nh_dev != ev->dev)
                return 0;

        rt->fib6_nh->fib_nh_flags &= ~ev->nh_flags;
        fib6_update_sernum_upto_root(net, rt);
        rt6_multipath_rebalance(rt);

        return 0;
}

/*
 * Clear @nh_flags on the nexthops of all routes using @dev by running
 * fib6_ifup() through fib6_clean_all(). When RTNH_F_DEAD is among the
 * flags and the device has carrier, RTNH_F_LINKDOWN is cleared as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
        struct arg_netdev_event arg = {
                .dev = dev,
        };

        arg.nh_flags = nh_flags;
        if ((nh_flags & RTNH_F_DEAD) && netif_carrier_ok(dev))
                arg.nh_flags |= RTNH_F_LINKDOWN;

        fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
                                   const struct net_device *dev)
{
        struct fib6_info *iter;

        if (rt->fib6_nh->fib_nh_dev == dev)
                return true;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh->fib_nh_dev == dev)
                        return true;

        return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
        struct fib6_info *iter;

        rt->should_flush = 1;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
                                             const struct net_device *down_dev)
{
        struct fib6_info *iter;
        unsigned int dead = 0;

        if (rt->fib6_nh->fib_nh_dev == down_dev ||
            rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
                dead++;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh->fib_nh_dev == down_dev ||
                    iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
                        dead++;

        return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
                                       const struct net_device *dev,
                                       unsigned char nh_flags)
{
        struct fib6_info *iter;

        if (rt->fib6_nh->fib_nh_dev == dev)
                rt->fib6_nh->fib_nh_flags |= nh_flags;
        list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
                if (iter->fib6_nh->fib_nh_dev == dev)
                        iter->fib6_nh->fib_nh_flags |= nh_flags;
}

/*
 * Tree-walk callback for a device going away or changing state.
 *
 * @rt:    route being visited
 * @p_arg: struct arg_netdev_event; dev and event are used
 *
 * Return values as produced here (NOTE(review): their meaning is defined
 * by the fib6 walker, presumably -1 = delete the route - confirm):
 *   0  the null entry, a route using a nexthop object, a route not on the
 *      device, an unknown event, or after NETDEV_CHANGE handling
 *  -1  NETDEV_UNREGISTER/NETDEV_DOWN for a route on the device, a route
 *      already marked should_flush, or a multipath route whose members
 *      are all dead or on the device
 *  -2  NETDEV_DOWN for any other route that has siblings
 *
 * called with write lock held for table with rt
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
        const struct arg_netdev_event *ev = p_arg;
        const struct net_device *dev = ev->dev;
        struct net *net = dev_net(dev);
        int on_dev;

        if (rt == net->ipv6.fib6_null_entry || rt->nh)
                return 0;

        on_dev = rt->fib6_nh->fib_nh_dev == dev;

        switch (ev->event) {
        case NETDEV_UNREGISTER:
                return on_dev ? -1 : 0;
        case NETDEV_DOWN:
                if (rt->should_flush)
                        return -1;
                if (!rt->fib6_nsiblings)
                        return on_dev ? -1 : 0;
                if (!rt6_multipath_uses_dev(rt, dev))
                        return -2;

                /* every member dead or on this device: flush them all */
                if (rt6_multipath_dead_count(rt, dev) ==
                    rt->fib6_nsiblings + 1) {
                        rt6_multipath_flush(rt);
                        return -1;
                }

                rt6_multipath_nh_flags_set(rt, dev,
                                           RTNH_F_DEAD | RTNH_F_LINKDOWN);
                fib6_update_sernum(net, rt);
                rt6_multipath_rebalance(rt);
                return -2;
        case NETDEV_CHANGE:
                /* local and anycast routes are not marked linkdown */
                if (on_dev &&
                    !(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))) {
                        rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
                        rt6_multipath_rebalance(rt);
                }
                break;
        }

        return 0;
}

/*
 * Run fib6_ifdown() for @event on @dev over the routes of the device's
 * namespace. The skip_notify_on_dev_down sysctl selects the
 * fib6_clean_all_skip_notify() walker instead of fib6_clean_all().
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
        struct net *net = dev_net(dev);
        struct arg_netdev_event arg = {
                .dev = dev,
                {
                        .event = event,
                },
        };

        if (!net->ipv6.sysctl.skip_notify_on_dev_down) {
                fib6_clean_all(net, fib6_ifdown, &arg);
                return;
        }

        fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
}

/*
 * Take down IPv6 routing state for @dev in response to @event, in three
 * steps: sync the FIB with rt6_sync_down_dev(), call
 * rt6_uncached_list_flush_dev() for the device, and call neigh_ifdown()
 * on the nd_tbl neighbour table for it.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
        rt6_sync_down_dev(dev, event);
        rt6_uncached_list_flush_dev(dev);
        neigh_ifdown(&nd_tbl, dev);
}

/* Argument passed through the tree walk started by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
        struct net_device *dev; /* device whose MTU changed */
        unsigned int mtu;       /* the new MTU */
        /* route currently being processed; set by rt6_mtu_change_route()
         * before it calls fib6_nh_mtu_change()
         */
        struct fib6_info *f6i;
};

/*
 * Apply a device MTU change to one nexthop.
 *
 * @nh:   nexthop to examine
 * @_arg: struct rt6_mtu_change_arg (device, new MTU, owning route)
 *
 * Nexthops on other devices are ignored. Otherwise the route's RTAX_MTU
 * metric is set to the new MTU when the current path MTU is at least the
 * new MTU, or when it equals the device's mtu6 setting; the nexthop's
 * exception entries are then updated under rt6_exception_lock.
 * Always returns 0.
 */
static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
{
        struct rt6_mtu_change_arg *ctx = _arg;
        struct fib6_info *f6i = ctx->f6i;
        struct inet6_dev *idev;
        u32 cur_mtu;

        if (nh->fib_nh_dev != ctx->dev)
                return 0;

        /* An administrative MTU increase cannot be discovered through
         * IPv6 PMTU discovery, so the PMTU increase has to be applied
         * here. RFC 1981 does not cover administrative MTU increases,
         * hence updating the PMTU on increase is a MUST (e.g. jumbo
         * frames).
         */
        idev = __in6_dev_get(ctx->dev);
        cur_mtu = f6i->fib6_pmtu;

        if (cur_mtu >= ctx->mtu || cur_mtu == idev->cnf.mtu6)
                fib6_metric_set(f6i, RTAX_MTU, ctx->mtu);

        spin_lock_bh(&rt6_exception_lock);
        rt6_exceptions_update_pmtu(idev, nh, ctx->mtu);
        spin_unlock_bh(&rt6_exception_lock);

        return 0;
}

/*
 * fib6_clean_all() callback for a device MTU change.
 *
 * @f6i:   route being visited
 * @p_arg: struct rt6_mtu_change_arg; its f6i member is set to @f6i here
 *
 * Does nothing when the device has no inet6_dev or when the route's
 * RTAX_MTU metric is locked. Otherwise fib6_nh_mtu_change() is run on the
 * route's builtin nexthop, or on each fib6_nh of its nexthop object.
 */
static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
        struct rt6_mtu_change_arg *ctx = p_arg;

        if (!__in6_dev_get(ctx->dev))
                return 0;

        /* PMTU discovery is mandatory in IPv6, so a locked RTAX_MTU cannot
         * switch it off. The lock is still honoured here to block changes
         * caused by addrconf/ndisc.
         */
        if (fib6_metric_locked(f6i, RTAX_MTU))
                return 0;

        ctx->f6i = f6i;

        if (!f6i->nh)
                return fib6_nh_mtu_change(f6i->fib6_nh, ctx);

        /* fib6_nh_mtu_change only returns 0, so this is safe */
        return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change, ctx);
}

/*
 * Propagate a new MTU of @dev to the routing tables: runs
 * rt6_mtu_change_route() through fib6_clean_all() in the device's
 * namespace with @dev and @mtu as argument.
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
        /* .f6i is left zero here; rt6_mtu_change_route() fills it per route */
        struct rt6_mtu_change_arg arg = {
                .dev = dev,
                .mtu = mtu,
        };

        fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

/*
 * Netlink attribute policy for IPv6 route messages; rtm_to_fib6_config()
 * passes it to nlmsg_parse_deprecated().
 *
 * Attributes that are not listed (for example RTA_DST and RTA_SRC) have an
 * all-zero policy entry, so code consuming them must do its own length
 * checks.
 * NOTE(review): for the entries that only give .len, it is presumably the
 * minimum payload length - confirm against the nla_policy documentation.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
        [RTA_UNSPEC]                = { .strict_start_type = RTA_DPORT + 1 },
        [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
        [RTA_PREFSRC]                = { .len = sizeof(struct in6_addr) },
        [RTA_OIF]               = { .type = NLA_U32 },
        [RTA_IIF]                = { .type = NLA_U32 },
        [RTA_PRIORITY]          = { .type = NLA_U32 },
        [RTA_METRICS]           = { .type = NLA_NESTED },
        [RTA_MULTIPATH]                = { .len = sizeof(struct rtnexthop) },
        [RTA_PREF]              = { .type = NLA_U8 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]                = { .type = NLA_NESTED },
        [RTA_EXPIRES]                = { .type = NLA_U32 },
        [RTA_UID]                = { .type = NLA_U32 },
        [RTA_MARK]                = { .type = NLA_U32 },
        [RTA_TABLE]                = { .type = NLA_U32 },
        [RTA_IP_PROTO]                = { .type = NLA_U8 },
        [RTA_SPORT]                = { .type = NLA_U16 },
        [RTA_DPORT]                = { .type = NLA_U16 },
        [RTA_NH_ID]                = { .type = NLA_U32 },
        [RTA_FLOWLABEL]                = { .type = NLA_BE32 },
};

/*
 * Translate an rtnetlink route request into a struct fib6_config.
 *
 * @skb:    request skb; supplies the sender's portid and the socket's
 *          network namespace
 * @nlh:    netlink header of the request, followed by a struct rtmsg
 * @cfg:    output; overwritten once the basic header checks have passed
 * @extack: extended ack used to report why a request was rejected
 *
 * Returns 0 on success or a negative errno. On failure *cfg may be
 * untouched or only partially filled in and must not be used.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct fib6_config *cfg,
                              struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        struct nlattr *tb[RTA_MAX+1];
        unsigned int pref;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                     rtm_ipv6_policy, extack);
        if (err < 0)
                goto errout;

        /* default for every "goto errout" up to the multipath checks */
        err = -EINVAL;
        rtm = nlmsg_data(nlh);

        if (rtm->rtm_tos) {
                NL_SET_ERR_MSG(extack,
                               "Invalid dsfield (tos): option not available for IPv6");
                goto errout;
        }

        if (tb[RTA_FLOWLABEL]) {
                NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
                                    "Flow label cannot be specified for this operation");
                goto errout;
        }

        /* everything not set explicitly below starts out as zero */
        *cfg = (struct fib6_config){
                .fc_table = rtm->rtm_table,
                .fc_dst_len = rtm->rtm_dst_len,
                .fc_src_len = rtm->rtm_src_len,
                .fc_flags = RTF_UP,
                .fc_protocol = rtm->rtm_protocol,
                .fc_type = rtm->rtm_type,

                .fc_nlinfo.portid = NETLINK_CB(skb).portid,
                .fc_nlinfo.nlh = nlh,
                .fc_nlinfo.nl_net = sock_net(skb->sk),
        };

        if (rtm->rtm_type == RTN_UNREACHABLE ||
            rtm->rtm_type == RTN_BLACKHOLE ||
            rtm->rtm_type == RTN_PROHIBIT ||
            rtm->rtm_type == RTN_THROW)
                cfg->fc_flags |= RTF_REJECT;

        if (rtm->rtm_type == RTN_LOCAL)
                cfg->fc_flags |= RTF_LOCAL;

        if (rtm->rtm_flags & RTM_F_CLONED)
                cfg->fc_flags |= RTF_CACHE;

        cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

        if (tb[RTA_NH_ID]) {
                if (tb[RTA_GATEWAY]   || tb[RTA_OIF] ||
                    tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop specification and nexthop id are mutually exclusive");
                        goto errout;
                }
                cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
        }

        if (tb[RTA_GATEWAY]) {
                cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
                cfg->fc_flags |= RTF_GATEWAY;
        }
        if (tb[RTA_VIA]) {
                NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
                goto errout;
        }

        if (tb[RTA_DST]) {
                /* prefix length in bits, rounded up to whole bytes */
                int plen = (rtm->rtm_dst_len + 7) >> 3;

                /* rtm_dst_len comes straight from the request and RTA_DST
                 * has no policy entry. A prefix longer than the address
                 * would make nla_memcpy() write past cfg->fc_dst, so
                 * refuse it before copying.
                 */
                if (plen > (int)sizeof(cfg->fc_dst)) {
                        NL_SET_ERR_MSG(extack, "Invalid prefix length");
                        goto errout;
                }

                if (nla_len(tb[RTA_DST]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
        }

        if (tb[RTA_SRC]) {
                int plen = (rtm->rtm_src_len + 7) >> 3;

                /* same bound as for RTA_DST, protecting cfg->fc_src */
                if (plen > (int)sizeof(cfg->fc_src)) {
                        NL_SET_ERR_MSG(extack,
                                       "Invalid source address length");
                        goto errout;
                }

                if (nla_len(tb[RTA_SRC]) < plen)
                        goto errout;

                nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
        }

        if (tb[RTA_PREFSRC])
                cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

        if (tb[RTA_OIF])
                cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_PRIORITY])
                cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

        if (tb[RTA_METRICS]) {
                cfg->fc_mx = nla_data(tb[RTA_METRICS]);
                cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
        }

        /* RTA_TABLE overrides the table id taken from the rtmsg header */
        if (tb[RTA_TABLE])
                cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

        if (tb[RTA_MULTIPATH]) {
                /* the config points into the request, nothing is copied */
                cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
                cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

                err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
                                                     cfg->fc_mp_len,
                                                     extack, true);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_PREF]) {
                pref = nla_get_u8(tb[RTA_PREF]);
                /* anything other than LOW or HIGH is treated as MEDIUM */
                if (pref != ICMPV6_ROUTER_PREF_LOW &&
                    pref != ICMPV6_ROUTER_PREF_HIGH)
                        pref = ICMPV6_ROUTER_PREF_MEDIUM;
                cfg->fc_flags |= RTF_PREF(pref);
        }

        if (tb[RTA_ENCAP])
                cfg->fc_encap = tb[RTA_ENCAP];

        if (tb[RTA_ENCAP_TYPE]) {
                cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

                err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
                                                extack, true);
                if (err < 0)
                        goto errout;
        }

        if (tb[RTA_EXPIRES]) {
                unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

                /* RTF_EXPIRES is only set for a finite timeout */
                if (addrconf_finite_timeout(timeout)) {
                        cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
                        cfg->fc_flags |= RTF_EXPIRES;
                }
        }

        err = 0;
errout:
        return err;
}

/* One entry of the temporary nexthop list built by
 * ip6_route_multipath_add().
 */
struct rt6_nh {
        struct fib6_info *fib6_info;    /* route created for this nexthop */
        struct fib6_config r_cfg;       /* copy of the config used to create it */
        struct list_head next;          /* linkage in the rt6_nh list */
};

static int ip6_route_info_append(struct net *net,
                                 struct list_head *rt6_nh_list,
                                 struct fib6_info *rt,
                                 struct fib6_config *r_cfg)
{
        struct rt6_nh *nh;
        int err = -EEXIST;

        list_for_each_entry(nh, rt6_nh_list, next) {
                /* check if fib6_info already exists */
                if (rt6_duplicate_nexthop(nh->fib6_info, rt))
                        return err;
        }

        nh = kzalloc(sizeof(*nh), GFP_KERNEL);
        if (!nh)
                return -ENOMEM;
        nh->fib6_info = rt;
        memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
        list_add_tail(&nh->next, rt6_nh_list);

        return 0;
}

/*
 * Send the RTM_NEWROUTE notification for a multipath add/replace/append.
 *
 * @rt:      first route inserted (may be NULL)
 * @rt_last: last route inserted (may be NULL)
 * @info:    netlink info of the request
 * @nlflags: NLM_F_* flags to report
 *
 * Nothing is sent when no route ends up being selected.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
                                   struct fib6_info *rt_last,
                                   struct nl_info *info,
                                   __u16 nlflags)
{
        struct fib6_info *first = rt;

        rcu_read_lock();

        /* For an APPEND, rt is the first route inserted and rt_last the
         * last one. Userspace wants a consistent dump that starts at the
         * first nexthop, and siblings are always added at the end of the
         * list, so report from the first sibling of the last route
         * appended.
         */
        if (rt_last && rt_last->fib6_nsiblings && (nlflags & NLM_F_APPEND))
                first = list_first_or_null_rcu(&rt_last->fib6_siblings,
                                               struct fib6_info,
                                               fib6_siblings);

        if (first)
                inet6_rt_notify(RTM_NEWROUTE, first, info, nlflags);

        rcu_read_unlock();
}

/*
 * Decide whether an in-kernel notification is due for @rt.
 *
 * True when @rt is the leaf of its fib6 node, or when @rt and the leaf
 * both qualify for ECMP and have the same metric. False when @rt has no
 * node or the node has no leaf. The node and leaf are read under
 * rcu_read_lock().
 */
static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
{
        const bool rt_ecmp = rt6_qualify_for_ecmp(rt);
        bool notify = false;
        struct fib6_node *fn;

        rcu_read_lock();

        fn = rcu_dereference(rt->fib6_node);
        if (fn) {
                struct fib6_info *leaf = rcu_dereference(fn->leaf);

                if (leaf == rt)
                        notify = true;
                else if (leaf && rt_ecmp &&
                         rt->fib6_metric == leaf->fib6_metric &&
                         rt6_qualify_for_ecmp(leaf))
                        notify = true;
        }

        rcu_read_unlock();

        return notify;
}

/*
 * Read an IPv6 gateway address from a netlink attribute.
 *
 * @gw:     output address; written only on success
 * @nla:    attribute holding the address
 * @extack: receives an error message when the attribute is too short
 *
 * Returns 0 on success, -EINVAL when the payload is shorter than an IPv6
 * address.
 */
static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        if (nla_len(nla) >= sizeof(*gw)) {
                *gw = nla_get_in6_addr(nla);
                return 0;
        }

        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
        return -EINVAL;
}

/*
 * Install the multipath route described by cfg->fc_mp.
 *
 * @cfg:    route configuration; fc_mp/fc_mp_len hold the rtnexthop entries
 * @extack: extended ack for error reporting
 *
 * Each rtnexthop entry is turned into its own fib6_info and queued on a
 * local list. The queued routes are then inserted one by one with
 * __ip6_ins_rt() while the per-route notifications are suppressed, and a
 * single notification is sent at the end. If an insertion or the
 * in-kernel notifier fails, the routes inserted so far are removed again
 * with ip6_route_del().
 *
 * Side effects on the request: info->skip_notify and
 * info->skip_notify_kernel are set, and after the first successful insert
 * NLM_F_EXCL/NLM_F_REPLACE are cleared and NLM_F_CREATE is set in the
 * request's nlmsg_flags.
 *
 * Returns 0 on success or a negative errno.
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct fib6_info *rt_notif = NULL, *rt_last = NULL;
        struct nl_info *info = &cfg->fc_nlinfo;
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
        struct fib6_info *rt;
        struct rt6_nh *err_nh;
        struct rt6_nh *nh, *nh_safe;
        __u16 nlflags;
        int remaining;
        int attrlen;
        /* placeholder; every path that returns err assigns it first */
        int err = 1;
        /* number of routes inserted so far */
        int nhn = 0;
        int replace = (cfg->fc_nlinfo.nlh &&
                       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
        LIST_HEAD(rt6_nh_list);

        nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
        if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
                nlflags |= NLM_F_APPEND;

        remaining = cfg->fc_mp_len;
        rtnh = (struct rtnexthop *)cfg->fc_mp;

        /* Parse a Multipath Entry and build a list (rt6_nh_list) of
         * fib6_info structs per nexthop
         */
        while (rtnh_ok(rtnh, remaining)) {
                /* start from the shared config, then apply the per-nexthop
                 * overrides
                 */
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla) {
                                err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
                                                        extack);
                                if (err)
                                        goto cleanup;

                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                        r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);

                        /* RTA_ENCAP_TYPE length checked in
                         * lwtunnel_valid_encap_type_attr
                         */
                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
                        if (nla)
                                r_cfg.fc_encap_type = nla_get_u16(nla);
                }

                r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
                rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        rt = NULL;
                        goto cleanup;
                }
                if (!rt6_qualify_for_ecmp(rt)) {
                        err = -EINVAL;
                        NL_SET_ERR_MSG(extack,
                                       "Device only routes can not be added for IPv6 using the multipath API.");
                        fib6_info_release(rt);
                        goto cleanup;
                }

                /* the request stores the weight minus one in rtnh_hops */
                rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;

                err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
                                            rt, &r_cfg);
                if (err) {
                        /* rt was not queued, so release it here */
                        fib6_info_release(rt);
                        goto cleanup;
                }

                rtnh = rtnh_next(rtnh, &remaining);
        }

        /* nothing was queued, so there is nothing to clean up either */
        if (list_empty(&rt6_nh_list)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid nexthop configuration - no valid nexthops");
                return -EINVAL;
        }

        /* for add and replace send one notification with all nexthops.
         * Skip the notification in fib6_add_rt2node and send one with
         * the full route when done
         */
        info->skip_notify = 1;

        /* For add and replace, send one notification with all nexthops. For
         * append, send one notification with all appended nexthops.
         */
        info->skip_notify_kernel = 1;

        err_nh = NULL;
        list_for_each_entry(nh, &rt6_nh_list, next) {
                err = __ip6_ins_rt(nh->fib6_info, info, extack);

                if (err) {
                        if (replace && nhn)
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "multipath route replace failed (check consistency of installed routes)");
                        /* remember where to stop when undoing below */
                        err_nh = nh;
                        goto add_errout;
                }
                /* save reference to last route successfully inserted */
                rt_last = nh->fib6_info;

                /* save reference to first route for notification */
                if (!rt_notif)
                        rt_notif = nh->fib6_info;

                /* Because each route is added like a single route we remove
                 * these flags after the first nexthop: if there is a collision,
                 * we have already failed to add the first nexthop:
                 * fib6_add_rt2node() has rejected it; when replacing, old
                 * nexthops have been replaced by first new, the rest should
                 * be added to it.
                 */
                if (cfg->fc_nlinfo.nlh) {
                        cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
                                                             NLM_F_REPLACE);
                        cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
                }
                nhn++;
        }

        /* An in-kernel notification should only be sent in case the new
         * multipath route is added as the first route in the node, or if
         * it was appended to it. We pass 'rt_notif' since it is the first
         * sibling and might allow us to skip some checks in the replace case.
         */
        if (ip6_route_mpath_should_notify(rt_notif)) {
                enum fib_event_type fib_event;

                /* more siblings than were just inserted means the new
                 * nexthops were appended to an existing route
                 */
                if (rt_notif->fib6_nsiblings != nhn - 1)
                        fib_event = FIB_EVENT_ENTRY_APPEND;
                else
                        fib_event = FIB_EVENT_ENTRY_REPLACE;

                err = call_fib6_multipath_entry_notifiers(info->nl_net,
                                                          fib_event, rt_notif,
                                                          nhn - 1, extack);
                if (err) {
                        /* Delete all the siblings that were just added */
                        err_nh = NULL;
                        goto add_errout;
                }
        }

        /* success ... tell user about new route */
        ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
        goto cleanup;

add_errout:
        /* send notification for routes that were added so that
         * the delete notifications sent by ip6_route_del are
         * coherent
         */
        if (rt_notif)
                ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

        /* Delete routes that were already added */
        list_for_each_entry(nh, &rt6_nh_list, next) {
                /* err_nh is the entry that failed to insert; with
                 * err_nh == NULL the whole list is walked
                 */
                if (err_nh == nh)
                        break;
                ip6_route_del(&nh->r_cfg, extack);
        }

cleanup:
        /* release every queued fib6_info and free its list entry */
        list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
                fib6_info_release(nh->fib6_info);
                list_del(&nh->next);
                kfree(nh);
        }

        return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct fib6_config r_cfg;
        struct rtnexthop *rtnh;
        int last_err = 0;
        int remaining;
        int attrlen;
        int err;

        remaining = cfg->fc_mp_len;
        rtnh = (struct rtnexthop *)cfg->fc_mp;

        /* Parse a Multipath Entry */
        while (rtnh_ok(rtnh, remaining)) {
                memcpy(&r_cfg, cfg, sizeof(*cfg));
                if (rtnh->rtnh_ifindex)
                        r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla) {
                                err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
                                                        extack);
                                if (err) {
                                        last_err = err;
                                        goto next_rtnh;
                                }

                                r_cfg.fc_flags |= RTF_GATEWAY;
                        }
                }
                err = ip6_route_del(&r_cfg, extack);
                if (err)
                        last_err = err;

next_rtnh:
                rtnh = rtnh_next(rtnh, &remaining);
        }

        return last_err;
}

/* Handle a route delete request.
 *
 * Converts the message into a fib6_config, rejects a nexthop id that
 * nexthop_find_by_id() cannot resolve, then deletes either the listed
 * multipath nexthops or the route as a whole (fc_delete_all_nh set).
 *
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct fib6_config cfg;
        int rc = rtm_to_fib6_config(skb, nlh, &cfg, extack);

        if (rc < 0)
                return rc;

        if (cfg.fc_nh_id) {
                struct net *net = sock_net(skb->sk);

                if (!nexthop_find_by_id(net, cfg.fc_nh_id)) {
                        NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                        return -EINVAL;
                }
        }

        if (cfg.fc_mp)
                return ip6_route_multipath_del(&cfg, extack);

        /* No explicit nexthop list: remove the route with all its nexthops */
        cfg.fc_delete_all_nh = 1;
        return ip6_route_del(&cfg, extack);
}

/* Handle a route add request.
 *
 * Converts the message into a fib6_config, substitutes IP6_RT_PRIO_USER
 * for a zero metric, and dispatches to the multipath or single-route add
 * path depending on whether a multipath payload (fc_mp) was supplied.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct fib6_config cfg;
        int rc = rtm_to_fib6_config(skb, nlh, &cfg, extack);

        if (rc < 0)
                return rc;

        if (!cfg.fc_metric)
                cfg.fc_metric = IP6_RT_PRIO_USER;

        return cfg.fc_mp ? ip6_route_multipath_add(&cfg, extack)
                         : ip6_route_add(&cfg, GFP_KERNEL, extack);
}

/* Add the netlink space needed for one fib6_nh to the running total.
 *
 * @arg points to the int accumulator. Always returns 0, so it can also be
 * used as a nexthop_for_each_fib6_nh() callback without stopping the walk.
 */
static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
        int *total = arg;
        int need;

        need = nla_total_size(0)         /* RTA_MULTIPATH */
             + NLA_ALIGN(sizeof(struct rtnexthop))
             + nla_total_size(16);       /* RTA_GATEWAY */

        /* A nexthop with tunnel state also needs room for the
         * RTA_ENCAP / RTA_ENCAP_TYPE pair: the state's own encap size
         * plus one attribute with a 2-byte payload.
         */
        if (nh->fib_nh_lws)
                need += lwtunnel_get_encap_size(nh->fib_nh_lws)
                      + nla_total_size(2);

        *total += need;

        return 0;
}

static size_t rt6_nlmsg_size(struct fib6_info *f6i)
{
        int nexthop_len;

        if (f6i->nh) {
                nexthop_len = nla_total_size(4); /* RTA_NH_ID */
                nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
                                         &nexthop_len);
        } else {
                struct fib6_nh *nh = f6i->fib6_nh;
                struct fib6_info *sibling;

                nexthop_len = 0;
                if (f6i->fib6_nsiblings) {
                        rt6_nh_nlmsg_size(nh, &nexthop_len);

                        rcu_read_lock();

                        list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
                                                fib6_siblings) {
                                rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
                        }

                        rcu_read_unlock();
                }
                nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
        }

        return NLMSG_ALIGN(sizeof(struct rtmsg))
               + nla_total_size(16) /* RTA_SRC */
               + nla_total_size(16) /* RTA_DST */
               + nla_total_size(16) /* RTA_GATEWAY */
               + nla_total_size(16) /* RTA_PREFSRC */
               + nla_total_size(4) /* RTA_TABLE */
               + nla_total_size(4) /* RTA_IIF */
               + nla_total_size(4) /* RTA_OIF */
               + nla_total_size(4) /* RTA_PRIORITY */
               + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
               + nla_total_size(sizeof(struct rta_cacheinfo))
               + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
               + nla_total_size(1) /* RTA_PREF */
               + nexthop_len;
}

/* Emit the nexthop details of nexthop object @nh into @skb.
 *
 * A multipath nexthop is written as a nested RTA_MULTIPATH attribute;
 * any other nexthop is written through fib_nexthop_info(), which may
 * update *@flags.
 *
 * Returns 0 on success, -EMSGSIZE if anything failed to fit.
 */
static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
                                 unsigned char *flags)
{
        struct nlattr *nest;

        if (!nexthop_is_multipath(nh)) {
                struct fib6_nh *fib6_nh = nexthop_fib6_nh(nh);

                if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
                                     flags, false) < 0)
                        return -EMSGSIZE;

                return 0;
        }

        nest = nla_nest_start_noflag(skb, RTA_MULTIPATH);
        if (!nest)
                return -EMSGSIZE;

        if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
                return -EMSGSIZE;

        nla_nest_end(skb, nest);

        return 0;
}

/* Build one route netlink message of type @type for @rt in @skb.
 *
 * @dst:  optional dst entry. When the rt6_info derived from it is
 *        non-NULL, the destination/source keys and flags are taken from
 *        it instead of from @rt, and the gateway, output device and encap
 *        are reported from it rather than from @rt's nexthop(s).
 *        NOTE(review): rt6 is presumably NULL exactly when @dst is NULL;
 *        the code tests both 'rt6' and 'dst' - confirm dst_rt6_info().
 * @dest: optional address reported as a /128 RTA_DST instead of the
 *        route's own destination prefix.
 * @src:  same as @dest for RTA_SRC (only with CONFIG_IPV6_SUBTREES).
 * @iif:  when non-zero, reported as RTA_IIF (or, for a multicast
 *        destination with CONFIG_IPV6_MROUTE, handed to ip6mr_get_route()).
 * @portid, @seq, @flags: passed to nlmsg_put() for the message header.
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header or any
 * attribute cannot be added; in that case everything added after the
 * header was started is cancelled with nlmsg_cancel().
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
                         struct fib6_info *rt, struct dst_entry *dst,
                         struct in6_addr *dest, struct in6_addr *src,
                         int iif, int type, u32 portid, u32 seq,
                         unsigned int flags)
{
        struct rt6_info *rt6 = dst_rt6_info(dst);
        struct rt6key *rt6_dst, *rt6_src;
        u32 *pmetrics, table, rt6_flags;
        unsigned char nh_flags = 0;
        struct nlmsghdr *nlh;
        struct rtmsg *rtm;
        long expires = 0;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* Pick the key/flag source: the dst-derived rt6_info when there
         * is one, the FIB entry otherwise.
         */
        if (rt6) {
                rt6_dst = &rt6->rt6i_dst;
                rt6_src = &rt6->rt6i_src;
                rt6_flags = rt6->rt6i_flags;
        } else {
                rt6_dst = &rt->fib6_dst;
                rt6_src = &rt->fib6_src;
                rt6_flags = rt->fib6_flags;
        }

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET6;
        rtm->rtm_dst_len = rt6_dst->plen;
        rtm->rtm_src_len = rt6_src->plen;
        rtm->rtm_tos = 0;
        if (rt->fib6_table)
                table = rt->fib6_table->tb6_id;
        else
                table = RT6_TABLE_UNSPEC;
        /* Table ids of 256 and above are reported as RT_TABLE_COMPAT in
         * the header; the full id always goes into RTA_TABLE.
         */
        rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table))
                goto nla_put_failure;

        rtm->rtm_type = rt->fib6_type;
        rtm->rtm_flags = 0;
        rtm->rtm_scope = RT_SCOPE_UNIVERSE;
        rtm->rtm_protocol = rt->fib6_protocol;

        if (rt6_flags & RTF_CACHE)
                rtm->rtm_flags |= RTM_F_CLONED;

        /* An explicit @dest overrides the route prefix and is always /128 */
        if (dest) {
                if (nla_put_in6_addr(skb, RTA_DST, dest))
                        goto nla_put_failure;
                rtm->rtm_dst_len = 128;
        } else if (rtm->rtm_dst_len)
                if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
                        goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
        if (src) {
                if (nla_put_in6_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
                rtm->rtm_src_len = 128;
        } else if (rtm->rtm_src_len &&
                   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
                goto nla_put_failure;
#endif
        if (iif) {
#ifdef CONFIG_IPV6_MROUTE
                if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
                        int err = ip6mr_get_route(net, skb, rtm, portid);

                        /* 0: return right away, without reaching the
                         * nlmsg_end() at the bottom of this function.
                         * > 0: carry on below, without adding RTA_IIF.
                         */
                        if (err == 0)
                                return 0;
                        if (err < 0)
                                goto nla_put_failure;
                } else
#endif
                        if (nla_put_u32(skb, RTA_IIF, iif))
                                goto nla_put_failure;
        } else if (dest) {
                /* No input interface but an explicit destination: report
                 * the source address selected for it, if one is found.
                 */
                struct in6_addr saddr_buf;
                if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 &&
                    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        /* The route's configured preferred source, when it has one */
        if (rt->fib6_prefsrc.plen) {
                struct in6_addr saddr_buf;
                saddr_buf = rt->fib6_prefsrc.addr;
                if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
                        goto nla_put_failure;
        }

        pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
        if (rtnetlink_put_metrics(skb, pmetrics) < 0)
                goto nla_put_failure;

        if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
                goto nla_put_failure;

        /* Nexthop information, one of four forms:
         *  - dst supplied: gateway, output device and encap of that dst;
         *  - route with siblings: walk the siblings list and add each
         *    as a nexthop within RTA_MULTIPATH;
         *  - route using a nexthop object: RTA_NH_ID, plus the expanded
         *    nexthop details when sysctl_nexthop_compat_mode is set;
         *  - otherwise: the route's single nexthop.
         */
        if (rt6) {
                if (rt6_flags & RTF_GATEWAY &&
                    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
                        goto nla_put_failure;

                if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
                        goto nla_put_failure;

                if (dst->lwtstate &&
                    lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
                        goto nla_put_failure;
        } else if (rt->fib6_nsiblings) {
                struct fib6_info *sibling;
                struct nlattr *mp;

                mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
                if (!mp)
                        goto nla_put_failure;

                /* The route itself is the first nexthop of the set */
                if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
                                    rt->fib6_nh->fib_nh_weight, AF_INET6,
                                    0) < 0)
                        goto nla_put_failure;

                rcu_read_lock();

                list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
                                        fib6_siblings) {
                        if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
                                            sibling->fib6_nh->fib_nh_weight,
                                            AF_INET6, 0) < 0) {
                                /* Drop the RCU read lock before bailing out */
                                rcu_read_unlock();

                                goto nla_put_failure;
                        }
                }

                rcu_read_unlock();

                nla_nest_end(skb, mp);
        } else if (rt->nh) {
                if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
                        goto nla_put_failure;

                if (nexthop_is_blackhole(rt->nh))
                        rtm->rtm_type = RTN_BLACKHOLE;

                if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
                    rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
                        goto nla_put_failure;

                rtm->rtm_flags |= nh_flags;
        } else {
                if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
                                     &nh_flags, false) < 0)
                        goto nla_put_failure;

                rtm->rtm_flags |= nh_flags;
        }

        /* Expiry is passed on relative to the current jiffies value */
        if (rt6_flags & RTF_EXPIRES) {
                expires = dst ? dst->expires : rt->expires;
                expires -= jiffies;
        }

        /* Hardware offload state is only reported for the FIB entry
         * itself, i.e. when no dst was supplied.
         */
        if (!dst) {
                if (READ_ONCE(rt->offload))
                        rtm->rtm_flags |= RTM_F_OFFLOAD;
                if (READ_ONCE(rt->trap))
                        rtm->rtm_flags |= RTM_F_TRAP;
                if (READ_ONCE(rt->offload_failed))
                        rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
        }

        if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
                goto nla_put_failure;

        if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
                goto nla_put_failure;


        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* Undo everything added since nlmsg_put() */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Per-nexthop callback: returns 1 when @nh's fib_nh_dev is the
 * net_device passed in @arg, 0 otherwise.
 */
static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
{
        const struct net_device *wanted = arg;

        return nh->fib_nh_dev == wanted;
}

/* Tell whether any nexthop of @f6i has @dev as its fib_nh_dev.
 *
 * For a route using a nexthop object, every fib6_nh of that object is
 * checked. Otherwise the route's own nexthop is checked first and then,
 * if the route has siblings, each sibling's nexthop.
 */
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
                               const struct net_device *dev)
{
        struct fib6_info *sib, *tmp;

        if (f6i->nh)
                /* The callback takes a plain void *, hence the cast
                 * dropping const.
                 */
                return !!nexthop_for_each_fib6_nh(f6i->nh,
                                                  fib6_info_nh_uses_dev,
                                                  (struct net_device *)dev);

        if (f6i->fib6_nh->fib_nh_dev == dev)
                return true;

        if (!f6i->fib6_nsiblings)
                return false;

        list_for_each_entry_safe(sib, tmp, &f6i->fib6_siblings,
                                 fib6_siblings) {
                if (sib->fib6_nh->fib_nh_dev == dev)
                        return true;
        }

        return false;
}

/* State shared across rt6_nh_dump_exceptions() calls while dumping the
 * exception entries of one route.
 */
struct fib6_nh_exception_dump_walker {
        struct rt6_rtnl_dump_arg *dump; /* dump context: net, skb, cb */
        struct fib6_info *rt;           /* route the exceptions are filled for */
        unsigned int flags;             /* flags passed to rt6_fill_node() */
        unsigned int skip;              /* entries still to skip before dumping */
        unsigned int count;             /* entries handled (dumped or expired) */
};

static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
{
        struct fib6_nh_exception_dump_walker *w = arg;
        struct rt6_rtnl_dump_arg *dump = w->dump;
        struct rt6_exception_bucket *bucket;
        struct rt6_exception *rt6_ex;
        int i, err;

        bucket = fib6_nh_get_excptn_bucket(nh, NULL);
        if (!bucket)
                return 0;

        for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
                hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
                        if (w->skip) {
                                w->skip--;
                                continue;
                        }

                        /* Expiration of entries doesn't bump sernum, insertion
                         * does. Removal is triggered by insertion, so we can
                         * rely on the fact that if entries change between two
                         * partial dumps, this node is scanned again completely,
                         * see rt6_insert_exception() and fib6_dump_table().
                         *
                         * Count expired entries we go through as handled
                         * entries that we'll skip next time, in case of partial
                         * node dump. Otherwise, if entries expire meanwhile,
                         * we'll skip the wrong amount.
                         */
                        if (rt6_check_expired(rt6_ex->rt6i)) {
                                w->count++;
                                continue;
                        }

                        err = rt6_fill_node(dump->net, dump->skb, w->rt,
                                            &rt6_ex->rt6i->dst, NULL, NULL, 0,
                                            RTM_NEWROUTE,
                                            NETLINK_CB(dump->cb->skb).portid,
                                            dump->cb->nlh->nlmsg_seq, w->flags);
                        if (err)
                                return err;

                        w->count++;
                }
                bucket++;
        }

        return 0;
}

/* Return -1 if done with node, number of handled routes on partial dump */
int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
        struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
        struct fib_dump_filter *filter = &arg->filter;
        unsigned int flags = NLM_F_MULTI;
        struct net *net = arg->net;
        int count = 0;

        if (rt == net->ipv6.fib6_null_entry)
                return -1;

        if ((filter->flags & RTM_F_PREFIX) &&
            !(rt->fib6_flags & RTF_PREFIX_RT)) {
                /* success since this is not a prefix route */
                return -1;
        }
        if (filter->filter_set &&
            ((filter->rt_type  && rt->fib6_type != filter->rt_type) ||
             (filter->dev      && !fib6_info_uses_dev(rt, filter->dev)) ||
             (filter->protocol && rt->fib6_protocol != filter->protocol))) {
                return -1;
        }

        if (filter->filter_set ||
            !filter->dump_routes || !filter->dump_exceptions) {
                flags |= NLM_F_DUMP_FILTERED;
        }

        if (filter->dump_routes) {
                if (skip) {
                        skip--;
                } else {
                        if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
                                          0, RTM_NEWROUTE,
                                          NETLINK_CB(arg->cb->skb).portid,
                                          arg->cb->nlh->nlmsg_seq, flags)) {
                                return 0;
                        }
                        count++;
                }
        }

        if (filter->dump_exceptions) {
                struct fib6_nh_exception_dump_walker w = { .dump = arg,
                                                           .rt = rt,
                                                           .flags = flags,
                                                           .skip = skip,
                                                           .count = 0 };
                int err;

                rcu_read_lock();
                if (rt->nh) {
                        err = nexthop_for_each_fib6_nh(rt->nh,
                                                       rt6_nh_dump_exceptions,
                                                       &w);
                } else {
                        err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
                }
                rcu_read_unlock();

                if (err)
                        return count + w.count;
        }

        return -1;
}

static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
                                        const struct nlmsghdr *nlh,
                                        struct nlattr **tb,
                                        struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        int i, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Invalid header for get route request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                              rtm_ipv6_policy, extack);

        rtm = nlmsg_data(nlh);
        if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
            (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
            rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
            rtm->rtm_type) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
                return -EINVAL;
        }
        if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Invalid flags for get route request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
                                            rtm_ipv6_policy, extack);
        if (err)
                return err;

        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
                NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
                return -EINVAL;
        }

        if (tb[RTA_FLOWLABEL] &&
            (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) {
                NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
                                    "Invalid flow label");
                return -EINVAL;
        }

        for (i = 0; i <= RTA_MAX; i++) {
                if (!tb[i])
                        continue;

                switch (i) {
                case RTA_SRC:
                case RTA_DST:
                case RTA_IIF:
                case RTA_OIF:
                case RTA_MARK:
                case RTA_UID:
                case RTA_SPORT:
                case RTA_DPORT:
                case RTA_IP_PROTO:
                case RTA_FLOWLABEL:
                        break;
                default:
                        NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
                        return -EINVAL;
                }
        }

        return 0;
}

/* Handle a get route request: look up the route for the flow described
 * by the request attributes and unicast the answer to the requester.
 *
 * The request is validated by inet6_rtm_valid_getroute_req(), its
 * attributes are copied into a flowi6, and the lookup is done on the
 * input path when RTA_IIF was given, on the output path otherwise.
 * With RTM_F_FIB_MATCH set the reply describes the matched FIB entry;
 * without it the reply describes the lookup result for the requested
 * addresses.
 *
 * Returns the rtnl_unicast() result on success or a negative errno.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        int err, iif = 0, oif = 0;
        struct fib6_info *from;
        struct dst_entry *dst;
        struct rt6_info *rt;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi6 fl6 = {};
        __be32 flowlabel;
        bool fibmatch;

        err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                goto errout;

        /* Default error for the attribute length checks below */
        err = -EINVAL;
        rtm = nlmsg_data(nlh);
        fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

        if (tb[RTA_SRC]) {
                if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
        }

        if (tb[RTA_DST]) {
                if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
                        goto errout;

                fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
        }

        if (tb[RTA_IIF])
                iif = nla_get_u32(tb[RTA_IIF]);

        if (tb[RTA_OIF])
                oif = nla_get_u32(tb[RTA_OIF]);

        if (tb[RTA_MARK])
                fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

        /* Without RTA_UID: no valid uid for an input lookup, the
         * caller's uid for an output lookup.
         */
        if (tb[RTA_UID])
                fl6.flowi6_uid = make_kuid(current_user_ns(),
                                           nla_get_u32(tb[RTA_UID]));
        else
                fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

        if (tb[RTA_SPORT])
                fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &fl6.flowi6_proto, AF_INET6,
                                                  extack);
                if (err)
                        goto errout;
        }

        /* A missing RTA_FLOWLABEL is treated as flow label 0 */
        flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0);
        fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel);

        if (iif) {
                struct net_device *dev;
                int flags = 0;

                /* The device from dev_get_by_index_rcu() is only used
                 * inside this RCU read-side section.
                 */
                rcu_read_lock();

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        rcu_read_unlock();
                        err = -ENODEV;
                        goto errout;
                }

                fl6.flowi6_iif = iif;

                if (!ipv6_addr_any(&fl6.saddr))
                        flags |= RT6_LOOKUP_F_HAS_SADDR;

                dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

                rcu_read_unlock();
        } else {
                fl6.flowi6_oif = oif;

                dst = ip6_route_output(net, NULL, &fl6);
        }


        /* From here on the lookup result must be released with
         * ip6_rt_put() on every error path, until it is attached to the
         * reply skb below.
         */
        rt = dst_rt6_info(dst);
        if (rt->dst.error) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        if (rt == net->ipv6.ip6_null_entry) {
                err = rt->dst.error;
                ip6_rt_put(rt);
                goto errout;
        }

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                ip6_rt_put(rt);
                err = -ENOBUFS;
                goto errout;
        }

        /* NOTE(review): the skb presumably takes over the dst reference
         * here, which is why later error paths only free the skb -
         * confirm against skb_dst_set().
         */
        skb_dst_set(skb, &rt->dst);

        /* rt->from is read under RCU; no FIB entry behind the result
         * means the destination is reported as unreachable.
         */
        rcu_read_lock();
        from = rcu_dereference(rt->from);
        if (from) {
                if (fibmatch)
                        err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
                                            iif, RTM_NEWROUTE,
                                            NETLINK_CB(in_skb).portid,
                                            nlh->nlmsg_seq, 0);
                else
                        err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
                                            &fl6.saddr, iif, RTM_NEWROUTE,
                                            NETLINK_CB(in_skb).portid,
                                            nlh->nlmsg_seq, 0);
        } else {
                err = -ENETUNREACH;
        }
        rcu_read_unlock();

        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        return err;
}

/* Send a route notification of type @event for @rt to the
 * RTNLGRP_IPV6_ROUTE group.
 *
 * The message is allocated with GFP_ATOMIC. If it cannot be allocated or
 * filled, the error is recorded on the group's listeners with
 * rtnl_set_sk_err() instead.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
                     unsigned int nlm_flags)
{
        struct net *net = info->nl_net;
        struct sk_buff *skb;
        u32 seq = 0;
        int err;

        if (info->nlh)
                seq = info->nlh->nlmsg_seq;

        skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
                            event, info->portid, seq, nlm_flags);
        if (err >= 0) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, GFP_ATOMIC);
                return;
        }

        /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
errout:
        rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/* Send an RTM_NEWROUTE notification flagged NLM_F_REPLACE for @rt to the
 * RTNLGRP_IPV6_ROUTE group.
 *
 * Allocation flags come from gfp_any(). If the message cannot be
 * allocated or filled, the error is recorded on the group's listeners
 * with rtnl_set_sk_err() instead.
 */
void fib6_rt_update(struct net *net, struct fib6_info *rt,
                    struct nl_info *info)
{
        struct sk_buff *skb;
        u32 seq = 0;
        int err;

        if (info->nlh)
                seq = info->nlh->nlmsg_seq;

        skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
                            RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
        if (err >= 0) {
                rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
                            info->nlh, gfp_any());
                return;
        }

        /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
errout:
        rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

/* Update the hardware flags (offload, trap, offload_failed) of @f6i and,
 * depending on the fib_notify_on_flag_change sysctl, notify the
 * RTNLGRP_IPV6_ROUTE group about the change.
 *
 * Nothing is done when all three flags already have the requested values.
 * The notification is an RTM_NEWROUTE message with portid, sequence
 * number and flags all zero. If it cannot be allocated or filled, the
 * error is recorded on the group's listeners with rtnl_set_sk_err().
 */
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
                            bool offload, bool trap, bool offload_failed)
{
        struct sk_buff *skb;
        int err;

        if (READ_ONCE(f6i->offload) == offload &&
            READ_ONCE(f6i->trap) == trap &&
            READ_ONCE(f6i->offload_failed) == offload_failed)
                return;

        WRITE_ONCE(f6i->offload, offload);
        WRITE_ONCE(f6i->trap, trap);

        /* 2 means send notifications only if offload_failed was changed. */
        if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
            READ_ONCE(f6i->offload_failed) == offload_failed)
                return;

        /* offload_failed is stored only after the check above, which
         * compares the old value against the requested one.
         */
        WRITE_ONCE(f6i->offload_failed, offload_failed);

        if (!rcu_access_pointer(f6i->fib6_node))
                /* The route was removed from the tree, do not send
                 * notification.
                 */
                return;

        /* 0 means the flags are updated silently */
        if (!net->ipv6.sysctl.fib_notify_on_flag_change)
                return;

        skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
                            0, 0);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }

        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
        return;

errout:
        rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
EXPORT_SYMBOL(fib6_info_hw_flags_set);

/* Netdevice notifier: bind the per-netns special route entries to the
 * loopback device.
 *
 * Events for non-loopback devices are ignored. On NETDEV_REGISTER the
 * null entry (and, with CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and
 * blackhole entries) are pointed at @dev and each takes an inet6_dev
 * reference through in6_dev_get(); on NETDEV_UNREGISTER those references
 * are dropped again. Always returns NOTIFY_OK.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        if (!(dev->flags & IFF_LOOPBACK))
                return NOTIFY_OK;

        switch (event) {
        case NETDEV_REGISTER:
                net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
                net->ipv6.ip6_null_entry->dst.dev = dev;
                net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                net->ipv6.ip6_prohibit_entry->dst.dev = dev;
                net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
                net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
                net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
                break;
        case NETDEV_UNREGISTER:
                /* NETDEV_UNREGISTER could be fired for multiple times by
                 * netdev_wait_allrefs(). Make sure we only call this once.
                 */
                if (dev->reg_state == NETREG_UNREGISTERED)
                        break;

                in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
                in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
                in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
                break;
        default:
                break;
        }

        return NOTIFY_OK;
}

/*
 *        /proc
 */

#ifdef CONFIG_PROC_FS
/* seq_file show handler printing the IPv6 routing statistics of the
 * netns stored in seq->private.
 *
 * Output is a single line of seven hexadecimal fields, in this order:
 * fib_nodes, fib_route_nodes, fib_rt_alloc, fib_rt_entries, fib_rt_cache,
 * the slow count of ip6_dst_ops dst entries, fib_discarded_routes.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
        struct net *net = seq->private;

        seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
                   net->ipv6.rt6_stats->fib_nodes,
                   net->ipv6.rt6_stats->fib_route_nodes,
                   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
                   net->ipv6.rt6_stats->fib_rt_entries,
                   net->ipv6.rt6_stats->fib_rt_cache,
                   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
                   net->ipv6.rt6_stats->fib_discarded_routes);

        return 0;
}
#endif        /* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/* Handler for the write-only "flush" sysctl.
 *
 * Reads are rejected with -EINVAL. On a write the integer is parsed by
 * proc_dointvec(), then garbage collection is run for the netns found in
 * ctl->extra1 using that netns' flush_delay: a positive delay is passed
 * as the expiry with the third fib6_run_gc() argument set, any other
 * value passes 0 for both.
 */
static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long expires = 0;
        struct net *net;
        int delay;
        int rc;

        if (!write)
                return -EINVAL;

        rc = proc_dointvec(ctl, write, buffer, lenp, ppos);
        if (rc)
                return rc;

        net = ctl->extra1;
        delay = net->ipv6.sysctl.flush_delay;
        if (delay > 0)
                expires = delay;

        fib6_run_gc(expires, net, delay > 0);

        return 0;
}

/*
 * Template for the per-namespace IPv6 route sysctl table.
 *
 * The .data pointers below reference init_net (or the dst ops template);
 * ipv6_route_sysctl_init() duplicates this array and rewrites each entry's
 * .data BY INDEX to point at the target namespace.  The order of entries
 * here must therefore stay in sync with the indices used there.
 */
static struct ctl_table ipv6_route_table_template[] = {
        {
                .procname        =        "max_size",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_max_size,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                /* Backed by the dst ops, not by the sysctl structure. */
                .procname        =        "gc_thresh",
                .data                =        &ip6_dst_ops_template.gc_thresh,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                /* Write-only trigger; extra1 is set to the owning netns. */
                .procname        =        "flush",
                .data                =        &init_net.ipv6.sysctl.flush_delay,
                .maxlen                =        sizeof(int),
                .mode                =        0200,
                .proc_handler        =        ipv6_sysctl_rtcache_flush
        },
        {
                .procname        =        "gc_min_interval",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "gc_timeout",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "gc_interval",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_interval,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "gc_elasticity",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                .procname        =        "mtu_expires",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_jiffies,
        },
        {
                .procname        =        "min_adv_mss",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_min_advmss,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec,
        },
        {
                /*
                 * Same backing variable as "gc_min_interval"; only the
                 * handler (ms <-> jiffies conversion) differs.
                 */
                .procname        =        "gc_min_interval_ms",
                .data                =        &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
                .maxlen                =        sizeof(int),
                .mode                =        0644,
                .proc_handler        =        proc_dointvec_ms_jiffies,
        },
        {
                /* Boolean knob: clamped to [0, 1] by the u8 min/max handler. */
                .procname        =        "skip_notify_on_dev_down",
                .data                =        &init_net.ipv6.sysctl.skip_notify_on_dev_down,
                .maxlen                =        sizeof(u8),
                .mode                =        0644,
                .proc_handler        =        proc_dou8vec_minmax,
                .extra1                =        SYSCTL_ZERO,
                .extra2                =        SYSCTL_ONE,
        },
};

/*
 * Build the route sysctl table for @net.
 *
 * Duplicates ipv6_route_table_template and redirects every entry's .data
 * (and, for the "flush" entry, .extra1) at the per-namespace storage.
 * The indices used here mirror the entry order of the template.
 *
 * Returns the new table, or NULL if the allocation failed.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
        struct ctl_table *tbl = kmemdup(ipv6_route_table_template,
                                        sizeof(ipv6_route_table_template),
                                        GFP_KERNEL);

        if (!tbl)
                return NULL;

        tbl[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
        tbl[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;

        /* "flush": the handler finds its namespace through extra1. */
        tbl[2].data = &net->ipv6.sysctl.flush_delay;
        tbl[2].extra1 = net;

        tbl[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
        tbl[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
        tbl[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
        tbl[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
        tbl[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
        tbl[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
        /* "gc_min_interval_ms" aliases the storage of entry 3. */
        tbl[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
        tbl[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

        return tbl;
}

/*
 * Number of route sysctl entries to register for @net.
 *
 * Namespaces owned by the initial user namespace get the full template;
 * every other namespace gets a size of 1 so that the sysctls are not
 * exported to unprivileged users.
 */
size_t ipv6_route_sysctl_table_size(struct net *net)
{
        return net->user_ns == &init_user_ns ?
                ARRAY_SIZE(ipv6_route_table_template) : 1;
}
#endif

/*
 * Per-namespace initialisation of the IPv6 routing state.
 *
 * Copies the dst ops template, allocates the namespace's private copies of
 * the special route entries (fib6 null, ip6 null and, with multiple tables,
 * prohibit and blackhole) from their templates, and seeds the routing
 * sysctl defaults.
 *
 * Returns 0 on success or -ENOMEM if any allocation fails, in which case
 * everything allocated so far is released again.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
        memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
               sizeof(net->ipv6.ip6_dst_ops));

        if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
                return -ENOMEM;

        /* fib6 null entry */
        net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
        if (!net->ipv6.fib6_null_entry)
                goto out_ip6_dst_entries;
        memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
               sizeof(*net->ipv6.fib6_null_entry));

        /* ip6 null entry */
        net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
                                           sizeof(*net->ipv6.ip6_null_entry),
                                           GFP_KERNEL);
        if (!net->ipv6.ip6_null_entry)
                goto out_fib6_null_entry;
        net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
                         ip6_template_metrics, true);
        INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        net->ipv6.fib6_has_custom_rules = false;

        /* prohibit entry */
        net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
                                               sizeof(*net->ipv6.ip6_prohibit_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_prohibit_entry)
                goto out_ip6_null_entry;
        net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
                         ip6_template_metrics, true);
        INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);

        /* blackhole entry */
        net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
                                               sizeof(*net->ipv6.ip6_blk_hole_entry),
                                               GFP_KERNEL);
        if (!net->ipv6.ip6_blk_hole_entry)
                goto out_ip6_prohibit_entry;
        net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
        dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
                         ip6_template_metrics, true);
        INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_SUBTREES
        net->ipv6.fib6_routes_require_src = 0;
#endif
#endif

        /* Default values of the routing sysctls. */
        net->ipv6.sysctl.flush_delay = 0;
        net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
        net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
        net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
        net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
        net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
        net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
        net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
        net->ipv6.sysctl.skip_notify_on_dev_down = 0;

        atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);

        return 0;

        /* Unwind in reverse order of allocation. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
        kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
        kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
        kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
        return -ENOMEM;
}

/*
 * Per-namespace teardown: free the special route entries allocated by
 * ip6_route_net_init() and destroy the namespace's dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        kfree(net->ipv6.ip6_blk_hole_entry);
        kfree(net->ipv6.ip6_prohibit_entry);
#endif
        kfree(net->ipv6.ip6_null_entry);
        kfree(net->ipv6.fib6_null_entry);

        dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/*
 * Late per-namespace init: create the "ipv6_route" and "rt6_stats" entries
 * under the namespace's proc_net directory (only with CONFIG_PROC_FS).
 *
 * Returns 0 on success, or -ENOMEM if either entry cannot be created; a
 * half-created state is rolled back before returning.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        if (!proc_create_net("ipv6_route", 0, net->proc_net,
                             &ipv6_route_seq_ops,
                             sizeof(struct ipv6_route_iter)))
                return -ENOMEM;

        if (proc_create_net_single("rt6_stats", 0444, net->proc_net,
                                   rt6_stats_seq_show, NULL))
                return 0;

        /* Second entry failed: undo the first one. */
        remove_proc_entry("ipv6_route", net->proc_net);
        return -ENOMEM;
#else
        return 0;
#endif
}

/*
 * Late per-namespace teardown: remove the proc entries created by
 * ip6_route_net_init_late().  A no-op without CONFIG_PROC_FS.
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ipv6_route", net->proc_net);
        remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

/* Per-namespace hooks allocating/freeing the core routing state. */
static struct pernet_operations ip6_route_net_ops = {
        .init = ip6_route_net_init,
        .exit = ip6_route_net_exit,
};

/*
 * Allocate and initialise the namespace's IPv6 inet_peer base and publish
 * it in net->ipv6.peers.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
        struct inet_peer_base *base;

        base = kmalloc(sizeof(*base), GFP_KERNEL);
        if (base) {
                inet_peer_base_init(base);
                net->ipv6.peers = base;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Tear down the namespace's IPv6 inet_peer base: detach it from the
 * namespace first, then invalidate its tree and free the base itself.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *base;

        base = net->ipv6.peers;
        net->ipv6.peers = NULL;

        inetpeer_invalidate_tree(base);
        kfree(base);
}

/* Per-namespace hooks managing net->ipv6.peers. */
static struct pernet_operations ipv6_inetpeer_ops = {
        .init        =        ipv6_inetpeer_init,
        .exit        =        ipv6_inetpeer_exit,
};

/*
 * Per-namespace hooks for the proc entries; registered by ip6_route_init()
 * after the FIB, xfrm6 and FIB rules have been initialised.
 */
static struct pernet_operations ip6_route_net_late_ops = {
        .init = ip6_route_net_init_late,
        .exit = ip6_route_net_exit_late,
};

/*
 * Netdevice notifier for ip6_route_dev_notify().
 *
 * NOTE(review): the priority is set 10 below ADDRCONF_NOTIFY_PRIORITY,
 * presumably so that this callback runs after addrconf's own notifier --
 * confirm against the addrconf notifier registration.
 */
static struct notifier_block ip6_route_dev_notifier = {
        .notifier_call = ip6_route_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

/*
 * Attach init_net's special route entries to its loopback device.
 *
 * The loopback device is registered before this code runs, so the
 * loopback reference in rt6_info is not taken automatically for init_net;
 * take it by hand here.
 */
void __init ip6_route_init_special_entries(void)
{
        struct net_device *lo = init_net.loopback_dev;

        init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = lo;

        init_net.ipv6.ip6_null_entry->dst.dev = lo;
        init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(lo);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        init_net.ipv6.ip6_prohibit_entry->dst.dev = lo;
        init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(lo);

        init_net.ipv6.ip6_blk_hole_entry->dst.dev = lo;
        init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(lo);
#endif
}

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/*
 * BPF iterator hook for "ipv6_route": the program context carries the
 * iterator metadata and the current struct fib6_info.
 */
DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt)

/* BTF id of struct fib6_info; consumed by bpf_iter_register(). */
BTF_ID_LIST(btf_fib6_info_id)
BTF_ID(struct, fib6_info)

/*
 * seq_file description used by the "ipv6_route" BPF iterator: it reuses
 * ipv6_route_seq_ops and a private area sized for struct ipv6_route_iter,
 * the same pair that backs the /proc "ipv6_route" entry.
 */
static const struct bpf_iter_seq_info ipv6_route_seq_info = {
        .seq_ops                = &ipv6_route_seq_ops,
        .init_seq_private        = bpf_iter_init_seq_net,
        .fini_seq_private        = bpf_iter_fini_seq_net,
        .seq_priv_size                = sizeof(struct ipv6_route_iter),
};

/*
 * Registration record for the "ipv6_route" BPF iterator target.
 *
 * It declares a single context argument, the 'rt' member, typed as a
 * BTF pointer that may be NULL.  The btf_id of that argument is not set
 * here; bpf_iter_register() fills it in before registering.
 */
static struct bpf_iter_reg ipv6_route_reg_info = {
        .target                        = "ipv6_route",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__ipv6_route, rt),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &ipv6_route_seq_info,
};

/*
 * Fill in the BTF id of the iterator's 'rt' context argument and register
 * the "ipv6_route" BPF iterator target.
 *
 * Returns the result of bpf_iter_reg_target().
 */
static int __init bpf_iter_register(void)
{
        ipv6_route_reg_info.ctx_arg_info[0].btf_id = btf_fib6_info_id[0];

        return bpf_iter_reg_target(&ipv6_route_reg_info);
}

/* Unregister the "ipv6_route" BPF iterator target. */
static void bpf_iter_unregister(void)
{
        bpf_iter_unreg_target(&ipv6_route_reg_info);
}
#endif
#endif

/*
 * rtnetlink handlers for PF_INET6 route messages, registered in one go by
 * ip6_route_init().  Only RTM_GETROUTE is flagged RTNL_FLAG_DOIT_UNLOCKED.
 */
static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = {
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
         .doit = inet6_rtm_newroute},
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
         .doit = inet6_rtm_delroute},
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
         .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
};

/*
 * ip6_route_init - bring up the IPv6 routing core.
 *
 * Creates the dst slab cache, initialises the blackhole dst accounting,
 * registers the per-namespace subsystems, the FIB, xfrm6, FIB rules, the
 * rtnetlink handlers, the netdevice notifier and (when built in with BPF
 * and procfs) the BPF route iterator, and finally initialises the per-CPU
 * uncached route lists.
 *
 * Returns 0 on success or a negative errno.  On failure every step that
 * had already succeeded is undone in reverse order.
 */
int __init ip6_route_init(void)
{
        int ret;
        int cpu;

        ret = -ENOMEM;
        ip6_dst_ops_template.kmem_cachep =
                kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
                                  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
        if (!ip6_dst_ops_template.kmem_cachep)
                goto out;

        ret = dst_entries_init(&ip6_dst_blackhole_ops);
        if (ret)
                goto out_kmem_cache;

        ret = register_pernet_subsys(&ipv6_inetpeer_ops);
        if (ret)
                goto out_dst_entries;

        ret = register_pernet_subsys(&ip6_route_net_ops);
        if (ret)
                goto out_register_inetpeer;

        /* The blackhole ops share the slab cache of the regular dst ops. */
        ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

        ret = fib6_init();
        if (ret)
                goto out_register_subsys;

        ret = xfrm6_init();
        if (ret)
                goto out_fib6_init;

        ret = fib6_rules_init();
        if (ret)
                goto xfrm6_init;

        ret = register_pernet_subsys(&ip6_route_net_late_ops);
        if (ret)
                goto fib6_rules_init;

        ret = rtnl_register_many(ip6_route_rtnl_msg_handlers);
        if (ret < 0)
                goto out_register_late_subsys;

        ret = register_netdevice_notifier(&ip6_route_dev_notifier);
        if (ret)
                goto out_register_late_subsys;

#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        ret = bpf_iter_register();
        if (ret)
                goto out_register_notifier;
#endif
#endif

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }

out:
        return ret;

        /*
         * The notifier is live once registered; it must be removed before
         * the per-namespace state it dereferences is torn down below.
         * The label only exists when something can jump to it.
         */
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
out_register_notifier:
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
#endif
#endif
out_register_late_subsys:
        rtnl_unregister_all(PF_INET6);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
        fib6_rules_cleanup();
xfrm6_init:
        xfrm6_fini();
out_fib6_init:
        fib6_gc_cleanup();
out_register_subsys:
        unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
        dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
        goto out;
}

/*
 * ip6_route_cleanup - undo ip6_route_init().
 *
 * The BPF iterator (if it was registered) and the netdevice notifier go
 * first, then the subsystems, and finally the blackhole dst accounting and
 * the dst slab cache.
 *
 * NOTE(review): the rtnetlink handlers registered by ip6_route_init() are
 * not unregistered here; presumably a caller does that -- confirm.
 */
void ip6_route_cleanup(void)
{
#if IS_BUILTIN(CONFIG_IPV6)
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        bpf_iter_unregister();
#endif
#endif
        unregister_netdevice_notifier(&ip6_route_dev_notifier);
        unregister_pernet_subsys(&ip6_route_net_late_ops);
        fib6_rules_cleanup();
        xfrm6_fini();
        fib6_gc_cleanup();
        unregister_pernet_subsys(&ipv6_inetpeer_ops);
        unregister_pernet_subsys(&ip6_route_net_ops);
        dst_entries_destroy(&ip6_dst_blackhole_ops);
        /* The cache is shared with ip6_dst_blackhole_ops; destroy it last. */
        kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}



















































  157 


  157 







































  157 

















  156 


  156 






























































  157 











  157 
  157 



  157 











  156 


  156 

  157 













  157 
  156 




  155 
    1 














  156 














  164 
  166 
  166 






  127 
  127 
  127 



  165 



  166 
  165 



  127 
  127 











































   23 












































































































































   23 































   22 




















   23 




















   23 
   23 


   23 


   22 


   23 


   23 







































  157 







  157 

















  156 


  157 














  157 







  157 











  157 





  157 



  157 









  157 















  157 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <hyp/switch.h>

#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <uapi/linux/psci.h>

#include <kvm/arm_psci.h>

#include <asm/barrier.h>
#include <asm/cpufeature.h>
#include <asm/kprobes.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/vectors.h>

/*
 * VHE specific context.
 *
 * kvm_hyp_vector holds, per CPU, the vector base address that
 * __activate_traps() installs in VBAR_EL1 on guest entry.
 */
DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data);
DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);
DEFINE_PER_CPU(unsigned long, kvm_hyp_vector);

/*
 * HCR_EL2 bits that the NV guest can freely change (no RES0/RES1
 * semantics, irrespective of the configuration), but that cannot be
 * applied to the actual HW as things would otherwise break badly.
 *
 * - TGE: we want the guest to use EL1, which is incompatible with
 *   this bit being set
 *
 * - API/APK: they are already accounted for by vcpu_load(), and can
 *   only take effect across a load/put cycle (such as ERET)
 *
 * __compute_hcr() masks these bits out of the guest's view of HCR_EL2
 * before merging it into the value programmed into the hardware.
 */
#define NV_HCR_GUEST_EXCLUDE        (HCR_TGE | HCR_API | HCR_APK)

/*
 * Compute the HCR_EL2 value to use while running @vcpu.
 *
 * Without nested virtualisation this is simply vcpu->arch.hcr_el2.  With
 * NV, the host value is extended depending on whether the vcpu is in
 * hypervisor context, VNCR_EL2 is programmed as a side effect, and the
 * guest's own view of HCR_EL2 (minus NV_HCR_GUEST_EXCLUDE) is merged in.
 *
 * Side effects (NV only): updates the VCPU_IN_HYP_CONTEXT host data flag
 * and may write SYS_VNCR_EL2.
 */
static u64 __compute_hcr(struct kvm_vcpu *vcpu)
{
        u64 guest_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
        u64 hcr = vcpu->arch.hcr_el2;

        if (!vcpu_has_nv(vcpu))
                return hcr;

        /*
         * We rely on the invariant that a vcpu entered from HYP
         * context must also exit in the same context, as only an ERET
         * instruction can kick us out of it, and we obviously trap
         * that sucker. PSTATE.M will get fixed-up on exit.
         */
        if (is_hyp_ctxt(vcpu)) {
                host_data_set_flag(VCPU_IN_HYP_CONTEXT);

                hcr |= HCR_NV | HCR_NV2 | HCR_AT | HCR_TTLB;

                /* NV1 is only added when the guest's E2H is clear */
                if (!vcpu_el2_e2h_is_set(vcpu))
                        hcr |= HCR_NV1;

                /* Point VNCR_EL2 at the vcpu's own VNCR array */
                write_sysreg_s(vcpu->arch.ctxt.vncr_array, SYS_VNCR_EL2);
        } else {
                host_data_clear_flag(VCPU_IN_HYP_CONTEXT);

                if (guest_hcr & HCR_NV) {
                        /* Per-CPU fixmap slot used for the VNCR page */
                        u64 va = __fix_to_virt(vncr_fixmap(smp_processor_id()));

                        /* Inherit the low bits from the actual register */
                        va |= __vcpu_sys_reg(vcpu, VNCR_EL2) & GENMASK(PAGE_SHIFT - 1, 0);
                        write_sysreg_s(va, SYS_VNCR_EL2);

                        /* Force NV2 in case the guest is forgetful... */
                        guest_hcr |= HCR_NV2;
                }
        }

        /* Both flags being set at the same time is treated as fatal */
        BUG_ON(host_data_test_flag(VCPU_IN_HYP_CONTEXT) &&
               host_data_test_flag(L1_VNCR_MAPPED));

        return hcr | (guest_hcr & ~NV_HCR_GUEST_EXCLUDE);
}

/*
 * Program the coprocessor trap configuration for running @vcpu, written
 * through CPACR_EL1.
 *
 * The base value always has TTA and TAM set.  FP (and SVE, if the vcpu has
 * it) accesses are enabled only when the guest owns the FP registers.  For
 * a nested guest that is not in hypervisor context, the guest hypervisor's
 * sanitised CPTR_EL2 is layered on top.
 */
static void __activate_cptr_traps(struct kvm_vcpu *vcpu)
{
        u64 cptr;

        /*
         * With VHE (HCR.E2H == 1), accesses to CPACR_EL1 are routed to
         * CPTR_EL2. In general, CPACR_EL1 has the same layout as CPTR_EL2,
         * except for some missing controls, such as TAM.
         * In this case, CPTR_EL2.TAM has the same position with or without
         * VHE (HCR.E2H == 1) which allows us to use here the CPTR_EL2.TAM
         * shift value for trapping the AMU accesses.
         */
        u64 val = CPACR_EL1_TTA | CPTR_EL2_TAM;

        if (guest_owns_fp_regs()) {
                val |= CPACR_EL1_FPEN;
                if (vcpu_has_sve(vcpu))
                        val |= CPACR_EL1_ZEN;
        } else {
                __activate_traps_fpsimd32(vcpu);
        }

        /* Nothing more to layer on without nested virtualisation */
        if (!vcpu_has_nv(vcpu))
                goto write;

        /*
         * The architecture is a bit crap (what a surprise): an EL2 guest
         * writing to CPTR_EL2 via CPACR_EL1 can't set any of TCPAC or TTA,
         * as they are RES0 in the guest's view. To work around it, trap the
         * sucker using the very same bit it can't set...
         */
        if (vcpu_el2_e2h_is_set(vcpu) && is_hyp_ctxt(vcpu))
                val |= CPTR_EL2_TCPAC;

        /*
         * Layer the guest hypervisor's trap configuration on top of our own if
         * we're in a nested context.
         */
        if (is_hyp_ctxt(vcpu))
                goto write;

        cptr = vcpu_sanitised_cptr_el2(vcpu);

        /*
         * Pay attention, there's some interesting detail here.
         *
         * The CPTR_EL2.xEN fields are 2 bits wide, although there are only two
         * meaningful trap states when HCR_EL2.TGE = 0 (running a nested guest):
         *
         *  - CPTR_EL2.xEN = x0, traps are enabled
         *  - CPTR_EL2.xEN = x1, traps are disabled
         *
         * In other words, bit[0] determines if guest accesses trap or not. In
         * the interest of simplicity, clear the entire field if the guest
         * hypervisor has traps enabled to dispel any illusion of something more
         * complicated taking place.
         */
        if (!(SYS_FIELD_GET(CPACR_EL1, FPEN, cptr) & BIT(0)))
                val &= ~CPACR_EL1_FPEN;
        if (!(SYS_FIELD_GET(CPACR_EL1, ZEN, cptr) & BIT(0)))
                val &= ~CPACR_EL1_ZEN;

        /* E0POE is only propagated when the VM has S2POE */
        if (kvm_has_feat(vcpu->kvm, ID_AA64MMFR3_EL1, S2POE, IMP))
                val |= cptr & CPACR_EL1_E0POE;

        /* Honour the guest hypervisor's TCPAC setting */
        val |= cptr & CPTR_EL2_TCPAC;

write:
        write_sysreg(val, cpacr_el1);
}

/*
 * Restore the coprocessor access configuration used outside the guest:
 * FPEN and ZEN_EL1EN are always set, and SMEN_EL1EN is added when the
 * system has the ARM64_SME capability.  The value is written through
 * CPACR_EL1.
 */
static void __deactivate_cptr_traps(struct kvm_vcpu *vcpu)
{
        u64 cpacr;

        cpacr = CPACR_EL1_FPEN | CPACR_EL1_ZEN_EL1EN;
        if (cpus_have_final_cap(ARM64_SME))
                cpacr |= CPACR_EL1_SMEN_EL1EN;

        write_sysreg(cpacr, cpacr_el1);
}

/*
 * Switch the trap configuration over to the guest's on entry.
 *
 * Applies the HCR_EL2 value computed by __compute_hcr(), reloads the
 * directly-mapped physical timer's CVAL when CNTPOFF is available,
 * programs the coprocessor traps and installs this CPU's kvm_hyp_vector
 * in VBAR_EL1.
 */
static void __activate_traps(struct kvm_vcpu *vcpu)
{
        u64 val;

        ___activate_traps(vcpu, __compute_hcr(vcpu));

        if (has_cntpoff()) {
                struct timer_map map;

                get_timer_map(vcpu, &map);

                /*
                 * We're entering the guest. Reload the correct
                 * values from memory now that TGE is clear.
                 */
                if (map.direct_ptimer == vcpu_ptimer(vcpu))
                        val = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
                if (map.direct_ptimer == vcpu_hptimer(vcpu))
                        val = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);

                /*
                 * NOTE(review): val is only assigned above when
                 * direct_ptimer matches one of the two timers; this
                 * assumes a non-NULL direct_ptimer is always one of
                 * them -- confirm against get_timer_map().
                 */
                if (map.direct_ptimer) {
                        write_sysreg_el0(val, SYS_CNTP_CVAL);
                        isb();
                }
        }

        __activate_cptr_traps(vcpu);

        write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el1);
}
NOKPROBE_SYMBOL(__activate_traps);

/*
 * Switch the trap configuration back to the host's on guest exit.
 *
 * Restores the host HCR_EL2 flags, saves the directly-mapped physical
 * timer's CVAL (re-applying CNTPOFF to the hardware value when needed),
 * restores the host coprocessor access configuration and reinstalls the
 * host exception vectors in VBAR_EL1.
 */
static void __deactivate_traps(struct kvm_vcpu *vcpu)
{
        /* Default vectors; possibly replaced by the per-CPU ones below */
        const char *host_vectors = vectors;

        ___deactivate_traps(vcpu);

        write_sysreg_hcr(HCR_HOST_VHE_FLAGS);

        if (has_cntpoff()) {
                struct timer_map map;
                u64 val, offset;

                get_timer_map(vcpu, &map);

                /*
                 * We're exiting the guest. Save the latest CVAL value
                 * to memory and apply the offset now that TGE is set.
                 */
                val = read_sysreg_el0(SYS_CNTP_CVAL);
                if (map.direct_ptimer == vcpu_ptimer(vcpu))
                        __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0) = val;
                if (map.direct_ptimer == vcpu_hptimer(vcpu))
                        __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2) = val;

                offset = read_sysreg_s(SYS_CNTPOFF_EL2);

                /* With a zero offset the hardware value is already right */
                if (map.direct_ptimer && offset) {
                        write_sysreg_el0(val + offset, SYS_CNTP_CVAL);
                        isb();
                }
        }

        /*
         * ARM errata 1165522 and 1530923 require the actual execution of the
         * above before we can switch to the EL2/EL0 translation regime used by
         * the host.
         */
        asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));

        __deactivate_cptr_traps(vcpu);

        /* Use this CPU's vector unless the kernel is unmapped at EL0 */
        if (!arm64_kernel_unmapped_at_el0())
                host_vectors = __this_cpu_read(this_cpu_vector);
        write_sysreg(host_vectors, vbar_el1);
}
NOKPROBE_SYMBOL(__deactivate_traps);

/*
 * Disable IRQs in __vcpu_{load,put}_{activate,deactivate}_traps() to
 * prevent a race condition between context switching of PMUSERENR_EL0
 * in __{activate,deactivate}_traps_common() and IPIs that attempts to
 * update PMUSERENR_EL0. See also kvm_set_pmuserenr().
 */
/* Run __activate_traps_common() for @vcpu with local IRQs disabled. */
static void __vcpu_load_activate_traps(struct kvm_vcpu *vcpu)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        __activate_traps_common(vcpu);
        local_irq_restore(irqflags);
}

/* Run __deactivate_traps_common() for @vcpu with local IRQs disabled. */
static void __vcpu_put_deactivate_traps(struct kvm_vcpu *vcpu)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        __deactivate_traps_common(vcpu);
        local_irq_restore(irqflags);
}

/*
 * VHE vcpu load: record @vcpu as the vcpu running on this CPU's host
 * context, then switch in its system registers, activate the load-time
 * traps and load its stage-2 MMU configuration.
 */
void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu)
{
        host_data_ptr(host_ctxt)->__hyp_running_vcpu = vcpu;

        __vcpu_load_switch_sysregs(vcpu);
        __vcpu_load_activate_traps(vcpu);
        __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch);
}

/*
 * VHE vcpu put: deactivate the load-time traps, switch the vcpu's system
 * registers back out, and clear the running-vcpu pointer in this CPU's
 * host context.
 */
void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu)
{
        __vcpu_put_deactivate_traps(vcpu);
        __vcpu_put_switch_sysregs(vcpu);

        host_data_ptr(host_ctxt)->__hyp_running_vcpu = NULL;
}

/*
 * Compute the emulated value of a timer control register.
 *
 * @reg selects the timer: CNTP_CTL_EL0 (physical) or CNTV_CTL_EL0
 * (virtual); any other value is a BUG().  The saved control value is
 * returned with its ISTATUS bit set if the saved compare value is less
 * than or equal to the timer's current counter value, and cleared
 * otherwise.
 */
static u64 compute_emulated_cntx_ctl_el0(struct kvm_vcpu *vcpu,
                                         enum vcpu_sysreg reg)
{
        unsigned long ctl;
        u64 deadline, now;

        switch (reg) {
        case CNTP_CTL_EL0:
                deadline = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
                ctl      = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
                now      = compute_counter_value(vcpu_ptimer(vcpu));
                break;
        case CNTV_CTL_EL0:
                deadline = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
                ctl      = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
                now      = compute_counter_value(vcpu_vtimer(vcpu));
                break;
        default:
                BUG();
        }

        /* The timer condition is met once the counter reaches the deadline */
        __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, deadline <= now);

        return ctl;
}

/*
 * Fast-path emulation of trapped timer/counter register reads.
 *
 * Having FEAT_ECV allows for a better quality of timer emulation, but it
 * comes at a huge cost in terms of traps. Reads issued from the guest's
 * hypervisor context are therefore satisfied here, without returning to
 * the kernel, whenever possible.
 *
 * Returns true when the read was emulated: the destination register has
 * been written and the trapping instruction skipped. Returns false when
 * the vcpu is not in hyp context, the access is not a read, or the
 * register is not one handled here.
 */
static bool kvm_hyp_handle_timer(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u64 syndrome, result;

        if (!is_hyp_ctxt(vcpu))
                return false;

        syndrome = kvm_vcpu_get_esr(vcpu);
        if ((syndrome & ESR_ELx_SYS64_ISS_DIR_MASK) != ESR_ELx_SYS64_ISS_DIR_READ)
                return false;

        /*
         * For the *_EL0 encodings the source depends on the guest's E2H:
         * when set the value comes from read_sysreg_el0(), otherwise from
         * the state stored for the vcpu. The *_EL02 encodings always use
         * the stored state.
         */
        switch (esr_sys64_to_sysreg(syndrome)) {
        case SYS_CNTP_CTL_EL02:
                result = compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0);
                break;
        case SYS_CNTP_CTL_EL0:
                result = vcpu_el2_e2h_is_set(vcpu) ?
                         read_sysreg_el0(SYS_CNTP_CTL) :
                         compute_emulated_cntx_ctl_el0(vcpu, CNTP_CTL_EL0);
                break;
        case SYS_CNTP_CVAL_EL02:
                result = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
                break;
        case SYS_CNTP_CVAL_EL0:
                if (!vcpu_el2_e2h_is_set(vcpu)) {
                        result = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
                        break;
                }

                result = read_sysreg_el0(SYS_CNTP_CVAL);

                /* Without CNTPOFF the timer offset is removed by hand. */
                if (!has_cntpoff())
                        result -= timer_get_offset(vcpu_hptimer(vcpu));
                break;
        case SYS_CNTPCT_EL0:
        case SYS_CNTPCTSS_EL0:
                result = compute_counter_value(vcpu_hptimer(vcpu));
                break;
        case SYS_CNTV_CTL_EL02:
                result = compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0);
                break;
        case SYS_CNTV_CTL_EL0:
                result = vcpu_el2_e2h_is_set(vcpu) ?
                         read_sysreg_el0(SYS_CNTV_CTL) :
                         compute_emulated_cntx_ctl_el0(vcpu, CNTV_CTL_EL0);
                break;
        case SYS_CNTV_CVAL_EL02:
                result = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
                break;
        case SYS_CNTV_CVAL_EL0:
                result = vcpu_el2_e2h_is_set(vcpu) ?
                         read_sysreg_el0(SYS_CNTV_CVAL) :
                         __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
                break;
        case SYS_CNTVCT_EL0:
        case SYS_CNTVCTSS_EL0:
                result = compute_counter_value(vcpu_hvtimer(vcpu));
                break;
        default:
                return false;
        }

        /* Deliver the value and step over the trapped instruction. */
        vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), result);
        __kvm_skip_instr(vcpu);

        return true;
}

/*
 * Fast-path handling of a trapped ERET/ERETAx.
 *
 * Returns true when the exception return was emulated in place (SPSR and
 * ELR written through write_sysreg_el2()), false when the trap has to
 * take the slow path.
 */
static bool kvm_hyp_handle_eret(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);
        u64 spsr, elr, mode;

        /*
         * Going through the whole put/load motions is a waste of time
         * if this is a VHE guest hypervisor returning to its own
         * userspace, or the hypervisor performing a local exception
         * return. No need to save/restore registers, no need to
         * switch S2 MMU. Just do the canonical ERET.
         *
         * Unless the trap has to be forwarded further down the line,
         * of course...
         */
        if ((__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV) ||
            (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_ERET))
                return false;

        spsr = read_sysreg_el1(SYS_SPSR);
        mode = spsr & (PSR_MODE_MASK | PSR_MODE32_BIT);

        /*
         * Only three target modes are handled here: EL0t (and only when
         * the guest has both E2H and TGE set), and EL2t/EL2h, which are
         * rewritten to EL1t/EL1h. Everything else goes to the slow path.
         */
        switch (mode) {
        case PSR_MODE_EL0t:
                if (!(vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)))
                        return false;
                break;
        case PSR_MODE_EL2t:
                mode = PSR_MODE_EL1t;
                break;
        case PSR_MODE_EL2h:
                mode = PSR_MODE_EL1h;
                break;
        default:
                return false;
        }

        /* If ERETAx fails, take the slow path */
        if (esr_iss_is_eretax(esr)) {
                if (!(vcpu_has_ptrauth(vcpu) && kvm_auth_eretax(vcpu, &elr)))
                        return false;
        } else {
                elr = read_sysreg_el1(SYS_ELR);
        }

        /* Replace the mode field of SPSR with the (possibly rewritten) mode. */
        spsr = (spsr & ~(PSR_MODE_MASK | PSR_MODE32_BIT)) | mode;

        write_sysreg_el2(spsr, SYS_SPSR);
        write_sysreg_el2(elr, SYS_ELR);

        return true;
}

/*
 * Fast-path handling of trapped TLB invalidation instructions issued from
 * the guest's hypervisor context.
 *
 * Returns true when the invalidation was performed and the instruction
 * skipped, false when the trap must be handled on the slow path.
 */
static bool kvm_hyp_handle_tlbi_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        /* Stays -EINVAL unless a supported operation is found below. */
        int ret = -EINVAL;
        u32 instr;
        u64 val;

        /*
         * Ideally, we would never trap on EL2 S1 TLB invalidations using
         * the EL1 instructions when the guest's HCR_EL2.{E2H,TGE}=={1,1}.
         * But "thanks" to FEAT_NV2, we don't trap writes to HCR_EL2,
         * meaning that we can't track changes to the virtual TGE bit. So we
         * have to leave HCR_EL2.TTLB set on the host. Oopsie...
         *
         * Try and handle these invalidations as quickly as possible, without
         * fully exiting. Note that we don't need to consider any forwarding
         * here, as having E2H+TGE set is the very definition of being
         * InHost.
         *
         * For the lesser hypervisors out there that have failed to get on
         * with the VHE program, we can also handle the nVHE style of EL2
         * invalidation.
         */
        if (!(is_hyp_ctxt(vcpu)))
                return false;

        instr = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
        val = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));

        /*
         * Accept either a supported S1E1 operation while the guest has
         * E2H+TGE set, or a supported S1E2 operation.
         */
        if ((kvm_supported_tlbi_s1e1_op(vcpu, instr) &&
             vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) ||
            kvm_supported_tlbi_s1e2_op (vcpu, instr))
                ret = __kvm_tlbi_s1e2(NULL, val, instr);

        /* Unsupported operation, or the invalidation itself failed. */
        if (ret)
                return false;

        /*
         * If we have to check for any VNCR mapping being invalidated,
         * go back to the slow path for further processing.
         */
        if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu) &&
            atomic_read(&vcpu->kvm->arch.vncr_map_count))
                return false;

        __kvm_skip_instr(vcpu);

        return true;
}

/*
 * Fast-path emulation of a trapped CPACR_EL1 access made from the guest's
 * hypervisor context.
 *
 * A read returns the vcpu's CPTR_EL2 value in the destination register.
 * A write stores the source register into the vcpu's CPTR_EL2 and then
 * calls __activate_cptr_traps(). Either way the instruction is skipped.
 *
 * Returns true when the access was emulated, false when the vcpu is not
 * in hyp context or the trapped register is not CPACR_EL1.
 */
static bool kvm_hyp_handle_cpacr_el1(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u64 syndrome = kvm_vcpu_get_esr(vcpu);
        bool is_write;
        int reg;

        if (!is_hyp_ctxt(vcpu))
                return false;

        if (esr_sys64_to_sysreg(syndrome) != SYS_CPACR_EL1)
                return false;

        reg = kvm_vcpu_sys_get_rt(vcpu);
        is_write = (syndrome & ESR_ELx_SYS64_ISS_DIR_MASK) !=
                   ESR_ELx_SYS64_ISS_DIR_READ;

        if (is_write) {
                vcpu_write_sys_reg(vcpu, vcpu_get_reg(vcpu, reg), CPTR_EL2);
                __activate_cptr_traps(vcpu);
        } else {
                vcpu_set_reg(vcpu, reg, __vcpu_sys_reg(vcpu, CPTR_EL2));
        }

        __kvm_skip_instr(vcpu);

        return true;
}

/*
 * Pre-processing for trapped ZCR_EL2 accesses.
 *
 * ZCR_EL2 traps are handled in the slow path, with the expectation that
 * the guest's FP context has already been loaded onto the CPU. When the
 * vcpu has NV, the trapped register is ZCR_EL2 and the guest does not
 * currently own the FP registers, the FP context is loaded here through
 * kvm_hyp_handle_fpsimd().
 *
 * Always returns false so that the trap is forwarded to the slow path.
 */
static bool kvm_hyp_handle_zcr_el2(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u32 trapped_reg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));

        if (vcpu_has_nv(vcpu) &&
            trapped_reg == SYS_ZCR_EL2 &&
            !guest_owns_fp_regs())
                kvm_hyp_handle_fpsimd(vcpu, exit_code);

        return false;
}

/*
 * System register trap handler for VHE.
 *
 * Tries each specialised fast-path handler in turn (TLBI, timer,
 * CPACR_EL1, ZCR_EL2) and stops at the first one that reports the trap as
 * handled. If none does, the result of kvm_hyp_handle_sysreg() is
 * returned.
 */
static bool kvm_hyp_handle_sysreg_vhe(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        return kvm_hyp_handle_tlbi_el2(vcpu, exit_code) ||
               kvm_hyp_handle_timer(vcpu, exit_code) ||
               kvm_hyp_handle_cpacr_el1(vcpu, exit_code) ||
               kvm_hyp_handle_zcr_el2(vcpu, exit_code) ||
               kvm_hyp_handle_sysreg(vcpu, exit_code);
}

/*
 * Handler for the IMPDEF exception class on CPUs that have the
 * ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS capability.
 *
 * Rewrites the vcpu's recorded ESR_EL2 into a synthetic sysreg-trap
 * syndrome. Always returns false, so the exit is never consumed here.
 */
static bool kvm_hyp_handle_impdef(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        u64 iss, synth_esr;

        if (!cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
                return false;

        /*
         * Conveniently, AFSR1_EL2 is populated with a correct ISS for a
         * sysreg trap. These fruity parts are 64bit only, so IL is set
         * unconditionally.
         */
        iss = ESR_ELx_ISS(read_sysreg_s(SYS_AFSR1_EL2));

        synth_esr  = FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SYS64);
        synth_esr |= FIELD_PREP(ESR_ELx_ISS_MASK, iss);
        synth_esr |= ESR_ELx_IL;

        vcpu->arch.fault.esr_el2 = synth_esr;

        return false;
}

/*
 * Exit handlers indexed by exception class. Every slot defaults to NULL
 * and only the classes listed below get a handler. The table is passed to
 * __fixup_guest_exit() by fixup_guest_exit().
 */
static const exit_handler_fn hyp_exit_handlers[] = {
        [0 ... ESR_ELx_EC_MAX]                = NULL,
        [ESR_ELx_EC_CP15_32]                = kvm_hyp_handle_cp15_32,
        [ESR_ELx_EC_SYS64]                = kvm_hyp_handle_sysreg_vhe,
        [ESR_ELx_EC_SVE]                = kvm_hyp_handle_fpsimd,
        [ESR_ELx_EC_FP_ASIMD]                = kvm_hyp_handle_fpsimd,
        [ESR_ELx_EC_IABT_LOW]                = kvm_hyp_handle_iabt_low,
        [ESR_ELx_EC_DABT_LOW]                = kvm_hyp_handle_dabt_low,
        [ESR_ELx_EC_WATCHPT_LOW]        = kvm_hyp_handle_watchpt_low,
        [ESR_ELx_EC_ERET]                = kvm_hyp_handle_eret,
        [ESR_ELx_EC_MOPS]                = kvm_hyp_handle_mops,

        /* Apple shenanigans: class 0x3F is routed to kvm_hyp_handle_impdef() */
        [0x3F]                                = kvm_hyp_handle_impdef,
};

/*
 * Post-process a guest exit before the run loop decides what to do next.
 *
 * Synchronizes the vcpu's PSTATE, fixes up the PSTATE mode for NV guests
 * that were in hyp context on entry, and then hands over to
 * __fixup_guest_exit() with the VHE handler table. The return value is
 * that of __fixup_guest_exit(); __kvm_vcpu_run_vhe() re-enters the guest
 * for as long as it is true.
 */
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        synchronize_vcpu_pstate(vcpu, exit_code);

        /*
         * If we were in HYP context on entry, adjust the PSTATE view
         * so that the usual helpers work correctly. This enforces our
         * invariant that the guest's HYP context status is preserved
         * across a run.
         */
        if (vcpu_has_nv(vcpu) &&
            unlikely(host_data_test_flag(VCPU_IN_HYP_CONTEXT))) {
                u64 mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);

                /* EL1t/EL1h become EL2t/EL2h; other modes are left alone. */
                switch (mode) {
                case PSR_MODE_EL1t:
                        mode = PSR_MODE_EL2t;
                        break;
                case PSR_MODE_EL1h:
                        mode = PSR_MODE_EL2h;
                        break;
                }

                *vcpu_cpsr(vcpu) &= ~(PSR_MODE_MASK | PSR_MODE32_BIT);
                *vcpu_cpsr(vcpu) |= mode;
        }

        /*
         * Apply extreme paranoia! For NV guests the recorded
         * VCPU_IN_HYP_CONTEXT flag must agree with is_hyp_ctxt().
         */
        BUG_ON(vcpu_has_nv(vcpu) &&
               !!host_data_test_flag(VCPU_IN_HYP_CONTEXT) != is_hyp_ctxt(vcpu));

        return __fixup_guest_exit(vcpu, exit_code, hyp_exit_handlers);
}

/*
 * Switch to the guest for VHE systems running in EL2.
 *
 * Saves the host state, switches traps, sysregs, FP and debug state over
 * to the guest, runs the guest until fixup_guest_exit() reports an exit
 * that cannot be handled here, then undoes the switch.
 *
 * Returns the exit code produced by the last __guest_enter().
 */
static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
        u64 exit_code;

        host_ctxt = host_data_ptr(host_ctxt);
        guest_ctxt = &vcpu->arch.ctxt;

        sysreg_save_host_state_vhe(host_ctxt);

        fpsimd_lazy_switch_to_guest(vcpu);

        /*
         * Note that ARM erratum 1165522 requires us to configure both stage 1
         * and stage 2 translation for the guest context before we clear
         * HCR_EL2.TGE. The stage 1 and stage 2 guest context has already been
         * loaded on the CPU in kvm_vcpu_load_vhe().
         */
        __activate_traps(vcpu);

        __kvm_adjust_pc(vcpu);

        sysreg_restore_guest_state_vhe(guest_ctxt);
        __debug_switch_to_guest(vcpu);

        /* Keep re-entering the guest while exits are fixed up in place. */
        do {
                /* Jump in the fire! */
                exit_code = __guest_enter(vcpu);

                /* And we're baaack! */
        } while (fixup_guest_exit(vcpu, &exit_code));

        sysreg_save_guest_state_vhe(guest_ctxt);

        __deactivate_traps(vcpu);

        fpsimd_lazy_switch_to_host(vcpu);

        sysreg_restore_host_state_vhe(host_ctxt);

        if (guest_owns_fp_regs())
                __fpsimd_save_fpexc32(vcpu);

        __debug_switch_to_host(vcpu);

        return exit_code;
}
NOKPROBE_SYMBOL(__kvm_vcpu_run_vhe);

/*
 * Run @vcpu on a VHE system.
 *
 * Masks DAIF around __kvm_vcpu_run_vhe() and restores it to
 * DAIF_PROCCTX_NOIRQ afterwards. Returns whatever __kvm_vcpu_run_vhe()
 * returned.
 */
int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
{
        int ret;

        local_daif_mask();

        /*
         * Having IRQs masked via PMR when entering the guest means the GIC
         * will not signal the CPU of interrupts of lower priority, and the
         * only way to get out will be via guest exceptions.
         * Naturally, we want to avoid this.
         *
         * local_daif_mask() already sets GIC_PRIO_PSR_I_SET, we just need a
         * dsb to ensure the redistributor forwards EL2 IRQs to the CPU.
         */
        pmr_sync();

        ret = __kvm_vcpu_run_vhe(vcpu);

        /*
         * local_daif_restore() takes care to properly restore PSTATE.DAIF
         * and the GIC PMR if the host is using IRQ priorities.
         */
        local_daif_restore(DAIF_PROCCTX_NOIRQ);

        /*
         * When we exit from the guest we change a number of CPU configuration
         * parameters, such as traps.  We rely on the isb() in kvm_call_hyp*()
         * to make sure these changes take effect before running the host or
         * additional guests.
         */
        return ret;
}

/*
 * Final stage of a HYP panic: put the host back in control and panic().
 *
 * @spsr, @elr and @par are the values captured by the caller. Traps are
 * deactivated and the host sysreg state restored before panic() is
 * called, so the remaining registers in the message (ESR, FAR, HPFAR) are
 * read at that point.
 *
 * NOTE(review): kvm_vcpu_put_vhe() sets __hyp_running_vcpu to NULL, so
 * vcpu may be NULL here if no vcpu is loaded; confirm that
 * __deactivate_traps() is never reached in that state.
 */
static void __noreturn __hyp_call_panic(u64 spsr, u64 elr, u64 par)
{
        struct kvm_cpu_context *host_ctxt;
        struct kvm_vcpu *vcpu;

        host_ctxt = host_data_ptr(host_ctxt);
        vcpu = host_ctxt->__hyp_running_vcpu;

        __deactivate_traps(vcpu);
        sysreg_restore_host_state_vhe(host_ctxt);

        panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n",
              spsr, elr,
              read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR),
              read_sysreg(hpfar_el2), par, vcpu);
}
NOKPROBE_SYMBOL(__hyp_call_panic);

/*
 * Entry point for a HYP panic. Captures SPSR, ELR and PAR first, in that
 * order, and passes the captured values to __hyp_call_panic(), which does
 * not return.
 */
void __noreturn hyp_panic(void)
{
        u64 saved_spsr = read_sysreg_el2(SYS_SPSR);
        u64 saved_elr = read_sysreg_el2(SYS_ELR);
        u64 saved_par = read_sysreg_par();

        __hyp_call_panic(saved_spsr, saved_elr, saved_par);
}

/*
 * asmlinkage entry point that simply forwards to
 * __kvm_unexpected_el2_exception().
 */
asmlinkage void kvm_unexpected_el2_exception(void)
{
        __kvm_unexpected_el2_exception();
}















  179 













  179 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementations of the security context functions.
 *
 * Author: Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2020 Red Hat, Inc.
 */

#include <linux/jhash.h>

#include "context.h"
#include "mls.h"

/*
 * Compute the hash of security context @c.
 *
 * A context with a non-zero len is hashed over its string form only;
 * otherwise the hash covers the user, role and type fields followed by
 * the MLS range.
 */
u32 context_compute_hash(const struct context *c)
{
        u32 seed;

        /*
         * If a context is invalid, it will always be represented by a
         * context struct with only the len & str set (and vice versa)
         * under a given policy. Since context structs from different
         * policies should never meet, it is safe to hash valid and
         * invalid contexts differently. The context_equal() function
         * already operates under the same assumption.
         */
        if (c->len)
                return full_name_hash(NULL, c->str, c->len);

        seed = jhash_3words(c->user, c->role, c->type, 0);

        return mls_range_hash(&c->range, seed);
}


































































































































































































































































































































































































































































   72 






    5 







   66 









   72 















































































































   61 
   61 




   12 
   46 






































































   74 


    1 

    1 

   73 







   71 

   10 

   62 


   13 
   55 



   67 

































   75 



   23 

    5 
   69 



   17 
   52 



   69 

   76 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   27 








   27 







   28 






    2 


    6 
   21 

   27 




   27 











   28 







   28 









    2 






    2 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/read_write.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

/*
 * Exported file_operations table that provides only llseek, read_iter,
 * a read-only mmap and splice_read; every other operation is left unset.
 */
const struct file_operations generic_ro_fops = {
        .llseek                = generic_file_llseek,
        .read_iter        = generic_file_read_iter,
        .mmap                = generic_file_readonly_mmap,
        .splice_read        = filemap_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

/* True when @file's operations have FOP_UNSIGNED_OFFSET set in fop_flags. */
static inline bool unsigned_offsets(struct file *file)
{
        return (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET) != 0;
}

/**
 * vfs_setpos_cookie - update the file offset for lseek and reset cookie
 * @file:        file structure in question
 * @offset:        file offset to seek to
 * @maxsize:        maximum file size
 * @cookie:        cookie to reset, may be NULL
 *
 * Validate @offset and, if it differs from the current file position,
 * store it in @file and zero *@cookie (when a cookie was supplied) to
 * record that a seek happened. A negative @offset is only accepted for
 * files with unsigned offsets; an @offset above @maxsize is never
 * accepted.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
static loff_t vfs_setpos_cookie(struct file *file, loff_t offset,
                                loff_t maxsize, u64 *cookie)
{
        if ((offset < 0 && !unsigned_offsets(file)) || offset > maxsize)
                return -EINVAL;

        /* Nothing to update when the position is unchanged. */
        if (offset == file->f_pos)
                return offset;

        file->f_pos = offset;
        if (cookie)
                *cookie = 0;

        return offset;
}

/**
 * vfs_setpos - update the file offset for lseek
 * @file:        file structure in question
 * @offset:        file offset to seek to
 * @maxsize:        maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * This is vfs_setpos_cookie() called without a cookie.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
        return vfs_setpos_cookie(file, offset, maxsize, NULL);
}
EXPORT_SYMBOL(vfs_setpos);

/**
 * must_set_pos - check whether f_pos has to be updated
 * @file: file to seek on
 * @offset: offset to use
 * @whence: type of seek operation
 * @eof: end of file
 *
 * Check whether f_pos needs to be updated and update @offset according
 * to @whence.
 *
 * Return: 0 if f_pos doesn't need to be updated, 1 if f_pos has to be
 * updated, and negative error code on failure.
 */
static int must_set_pos(struct file *file, loff_t *offset, int whence, loff_t eof)
{
        if (whence == SEEK_END) {
                /* Offset is relative to the end of file. */
                *offset += eof;
        } else if (whence == SEEK_CUR) {
                /*
                 * lseek(fd, 0, SEEK_CUR) is a pure position query.  Report
                 * f_pos without writing the "same" value back, because a
                 * concurrent read(), write() or lseek() might have altered it.
                 */
                if (*offset == 0) {
                        *offset = file->f_pos;
                        return 0;
                }
        } else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                /*
                 * In the generic case the entire file is data and there is a
                 * virtual hole at the end of the file.  Both operations fail
                 * once the offset is at or beyond the end of the file.
                 */
                if ((unsigned long long)*offset >= eof)
                        return -ENXIO;
                /* SEEK_DATA keeps the offset; SEEK_HOLE moves to the end. */
                if (whence == SEEK_HOLE)
                        *offset = eof;
        }

        return 1;
}

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @maxsize:        max size of this file in file system
 * @eof:        offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence,
                                loff_t maxsize, loff_t eof)
{
        int ret = must_set_pos(file, &offset, whence, eof);

        if (ret < 0)
                return ret;
        /* must_set_pos() already produced the answer; f_pos stays untouched. */
        if (!ret)
                return offset;

        /* Everything except SEEK_CUR is an absolute position by now. */
        if (whence != SEEK_CUR)
                return vfs_setpos(file, offset, maxsize);

        /*
         * If the file requires locking via f_pos_lock we know
         * that mutual exclusion for SEEK_CUR on the same file
         * is guaranteed and no extra lock is taken here.
         */
        if (!file_seek_cur_needs_f_lock(file))
                return vfs_setpos(file, file->f_pos + offset, maxsize);

        /*
         * Otherwise take f_lock to protect against f_pos races with
         * other SEEK_CURs.
         */
        guard(spinlock)(&file->f_lock);
        return vfs_setpos(file, file->f_pos + offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

/**
 * generic_llseek_cookie - versioned llseek implementation
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @cookie:        cookie to update
 *
 * See generic_file_llseek for a general description and locking assumptions.
 *
 * In contrast to generic_file_llseek, this function also resets a
 * specified cookie to indicate a seek took place.
 */
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
                             u64 *cookie)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxsize = inode->i_sb->s_maxbytes;
        loff_t eof = i_size_read(inode);
        loff_t target;
        int ret;

        /* A cookie is mandatory for this variant. */
        if (WARN_ON_ONCE(!cookie))
                return -EINVAL;

        /*
         * Require that this is only used for directories that guarantee
         * synchronization between readdir and seek so that an update to
         * @cookie is correctly synchronized with concurrent readdir.
         */
        if (WARN_ON_ONCE(!(file->f_mode & FMODE_ATOMIC_POS)))
                return -EINVAL;

        ret = must_set_pos(file, &offset, whence, eof);
        if (ret < 0)
                return ret;
        if (!ret)
                return offset;

        /* No need to hold f_lock because we know that f_pos_lock is held. */
        target = offset;
        if (whence == SEEK_CUR)
                target += file->f_pos;

        return vfs_setpos_cookie(file, target, maxsize, cookie);
}
EXPORT_SYMBOL(generic_llseek_cookie);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * This is a generic implementation of ->llseek usable for all normal local
 * filesystems.  It just updates the file offset to the value specified by
 * @offset and @whence.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        /* Limits come from the superblock and the inode's current size. */
        loff_t maxsize = inode->i_sb->s_maxbytes;
        loff_t eof = i_size_read(inode);

        return generic_file_llseek_size(file, offset, whence, maxsize, eof);
}
EXPORT_SYMBOL(generic_file_llseek);

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        size of the file
 *
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
        /* Only the three classic whence values are accepted. */
        if (whence != SEEK_SET && whence != SEEK_CUR && whence != SEEK_END)
                return -EINVAL;

        /* @size acts both as the maximum offset and as the SEEK_END base. */
        return generic_file_llseek_size(file, offset, whence, size, size);
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation supporting only SEEK_SET and SEEK_CUR
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
        /* SEEK_END and every other whence value are rejected. */
        if (whence != SEEK_SET && whence != SEEK_CUR)
                return -EINVAL;

        /* Offsets up to OFFSET_MAX are allowed; the eof argument is 0. */
        return generic_file_llseek_size(file, offset, whence, OFFSET_MAX, 0);
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - SEEK_SET/SEEK_CUR-only llseek with a maximal offset
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 * @size:        maximal offset allowed
 *
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
        /* SEEK_END and every other whence value are rejected. */
        if (whence != SEEK_SET && whence != SEEK_CUR)
                return -EINVAL;

        /* @size is the maximal offset; the eof argument is 0. */
        return generic_file_llseek_size(file, offset, whence, size, 0);
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file:        file structure to seek on
 * @offset:        file offset to seek to
 * @whence:        type of seek
 *
 * This is an implementation of ->llseek useable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
        /* @offset and @whence are ignored; the stored position is reported. */
        loff_t current_pos = file->f_pos;

        return current_pos;
}
EXPORT_SYMBOL(noop_llseek);

/*
 * llseek implementation that runs entirely under the inode lock.
 *
 * Returns the resulting offset, -ENXIO when SEEK_DATA/SEEK_HOLE is asked
 * for an offset at or beyond i_size, or -EINVAL when the resulting offset
 * is negative and the file does not use unsigned offsets.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file_inode(file);
        loff_t retval = -EINVAL;

        inode_lock(inode);

        if (whence == SEEK_END) {
                offset += i_size_read(inode);
        } else if (whence == SEEK_CUR) {
                /* A zero offset is a position query: f_pos is not rewritten. */
                if (offset == 0) {
                        retval = file->f_pos;
                        goto out;
                }
                offset += file->f_pos;
        } else if (whence == SEEK_DATA || whence == SEEK_HOLE) {
                /*
                 * In the generic case the entire file is data and there is
                 * a virtual hole at the end of the file, so both operations
                 * fail once offset is i_size or larger.
                 */
                if (offset >= inode->i_size) {
                        retval = -ENXIO;
                        goto out;
                }
                /* SEEK_DATA keeps the offset; SEEK_HOLE returns i_size. */
                if (whence == SEEK_HOLE)
                        offset = inode->i_size;
        }

        if (offset >= 0 || unsigned_offsets(file)) {
                if (offset != file->f_pos)
                        file->f_pos = offset;
                retval = offset;
        }
out:
        inode_unlock(inode);
        return retval;
}
EXPORT_SYMBOL(default_llseek);

/*
 * Dispatch a seek to the file's ->llseek method.
 * Files without FMODE_LSEEK get -ESPIPE.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
        if (file->f_mode & FMODE_LSEEK)
                return file->f_op->llseek(file, offset, whence);

        return -ESPIPE;
}
EXPORT_SYMBOL(vfs_llseek);

/*
 * Common body of the lseek system calls.
 *
 * Returns the new offset, -EBADF for an empty descriptor, -EINVAL for a
 * whence above SEEK_MAX, or -EOVERFLOW when the result does not fit off_t.
 */
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
        CLASS(fd_pos, f)(fd);
        loff_t res;
        off_t retval;

        if (fd_empty(f))
                return -EBADF;
        if (whence > SEEK_MAX)
                return -EINVAL;

        res = vfs_llseek(fd_file(f), offset, whence);
        retval = res;
        /* LFS: should only happen on 32 bit platforms */
        if ((loff_t)retval != res)
                return -EOVERFLOW;

        return retval;
}

/* lseek system call entry point: forwards all arguments to ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}

#ifdef CONFIG_COMPAT
/*
 * Compat lseek entry point: takes the offset as compat_off_t and forwards
 * to the same ksys_lseek() helper as the native call.
 */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
        return ksys_lseek(fd, offset, whence);
}
#endif

#if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
        defined(__ARCH_WANT_SYS_LLSEEK)
/*
 * llseek system call: the offset arrives as two halves and the resulting
 * position is written to the user pointer @result.
 *
 * Returns 0 on success, -EBADF for an empty descriptor, -EINVAL for a
 * whence above SEEK_MAX, -EFAULT when @result cannot be written, or the
 * negative value returned by vfs_llseek().
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
                unsigned long, offset_low, loff_t __user *, result,
                unsigned int, whence)
{
        CLASS(fd_pos, f)(fd);
        loff_t requested;
        loff_t offset;

        if (fd_empty(f))
                return -EBADF;

        if (whence > SEEK_MAX)
                return -EINVAL;

        /* Combine the two halves into one offset. */
        requested = ((loff_t) offset_high << 32) | offset_low;
        offset = vfs_llseek(fd_file(f), requested, whence);

        if (offset < 0)
                return (int)offset;

        if (copy_to_user(result, &offset, sizeof(offset)))
                return -EFAULT;

        return 0;
}
#endif

/*
 * Validate the position and length of a read or write, then run the
 * permission hooks.
 *
 * Returns -EINVAL or -EOVERFLOW when the range is rejected, otherwise the
 * result of security_file_permission() if non-zero, else the result of
 * fsnotify_file_area_perm().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
        int mask;
        int ret;

        if (read_write == READ)
                mask = MAY_READ;
        else
                mask = MAY_WRITE;

        if (unlikely((ssize_t) count < 0))
                return -EINVAL;

        if (ppos) {
                loff_t pos = *ppos;

                /*
                 * A negative start, or a start + count that turns negative,
                 * is only acceptable on files using unsigned offsets.
                 */
                if (unlikely(pos < 0 || (loff_t) (pos + count) < 0)) {
                        if (!unsigned_offsets(file))
                                return -EINVAL;
                        /* both values are in 0..LLONG_MAX */
                        if (pos < 0 && count >= -pos)
                                return -EOVERFLOW;
                }
        }

        ret = security_file_permission(file, mask);
        if (ret)
                return ret;

        return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);

/*
 * Synchronous read of a user buffer through ->read_iter.
 * When @ppos is given it is updated from the kiocb position afterwards,
 * whatever ->read_iter returned.
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
        struct iov_iter iter;
        struct kiocb kiocb;
        ssize_t nread;

        init_sync_kiocb(&kiocb, filp);
        if (ppos)
                kiocb.ki_pos = *ppos;
        else
                kiocb.ki_pos = 0;
        iov_iter_ubuf(&iter, ITER_DEST, buf, len);

        nread = filp->f_op->read_iter(&kiocb, &iter);
        BUG_ON(nread == -EIOCBQUEUED);

        if (ppos)
                *ppos = kiocb.ki_pos;

        return nread;
}

static int warn_unsupported(struct file *file, const char *op)
{
        pr_warn_ratelimited(
                "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
                op, file, current->pid, current->comm);
        return -EINVAL;
}

/*
 * Read into a kernel buffer through ->read_iter.
 *
 * At most MAX_RW_COUNT bytes are requested. @pos may be NULL, in which case
 * the read starts at position 0; otherwise it is updated only when bytes
 * were read.
 */
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        const size_t len = min_t(size_t, count, MAX_RW_COUNT);
        struct kvec iov = {
                .iov_base        = buf,
                .iov_len        = len,
        };
        struct iov_iter iter;
        struct kiocb kiocb;
        ssize_t nread;

        if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
                return -EINVAL;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        /*
         * Also fail if ->read_iter and ->read are both wired up as that
         * implies very convoluted semantics.
         */
        if (unlikely(!file->f_op->read_iter || file->f_op->read))
                return warn_unsupported(file, "read");

        init_sync_kiocb(&kiocb, file);
        if (pos)
                kiocb.ki_pos = *pos;
        else
                kiocb.ki_pos = 0;
        iov_iter_kvec(&iter, ITER_DEST, &iov, 1, len);

        nread = file->f_op->read_iter(&kiocb, &iter);
        if (nread > 0) {
                if (pos)
                        *pos = kiocb.ki_pos;
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        /* The read-call counter is bumped whatever the outcome. */
        inc_syscr(current);

        return nread;
}

/*
 * Checked variant of __kernel_read(): the range is validated with
 * rw_verify_area() first and any non-zero result is returned as is.
 */
ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
{
        ssize_t err = rw_verify_area(READ, file, pos, count);

        if (err != 0)
                return err;

        return __kernel_read(file, buf, count, pos);
}
EXPORT_SYMBOL(kernel_read);

/*
 * Read from @file into the user buffer @buf.
 *
 * Uses ->read when the file provides it, otherwise ->read_iter via
 * new_sync_read(); with neither the result is -EINVAL. The request is
 * capped at MAX_RW_COUNT bytes.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
        ssize_t nread;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        nread = rw_verify_area(READ, file, pos, count);
        if (nread)
                return nread;

        count = min_t(size_t, count, MAX_RW_COUNT);

        if (file->f_op->read) {
                nread = file->f_op->read(file, buf, count, pos);
        } else if (file->f_op->read_iter) {
                nread = new_sync_read(file, buf, count, pos);
        } else {
                nread = -EINVAL;
        }

        /* Notification and byte accounting happen only when data was read. */
        if (nread > 0) {
                fsnotify_access(file);
                add_rchar(current, nread);
        }
        inc_syscr(current);

        return nread;
}

/*
 * Synchronous write of a user buffer through ->write_iter.
 * When @ppos is given it is updated only if bytes were written.
 */
static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
        struct iov_iter iter;
        struct kiocb kiocb;
        ssize_t nwritten;

        init_sync_kiocb(&kiocb, filp);
        if (ppos)
                kiocb.ki_pos = *ppos;
        else
                kiocb.ki_pos = 0;
        iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);

        nwritten = filp->f_op->write_iter(&kiocb, &iter);
        BUG_ON(nwritten == -EIOCBQUEUED);

        if (ppos && nwritten > 0)
                *ppos = kiocb.ki_pos;

        return nwritten;
}

/*
 * Write the contents of @from through ->write_iter.
 *
 * @pos may be NULL, in which case the write starts at position 0; otherwise
 * it is updated only when bytes were written.
 *
 * caller is responsible for file_start_write/file_end_write
 */
ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
{
        struct kiocb kiocb;
        ssize_t nwritten;

        if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        /*
         * Also fail if ->write_iter and ->write are both wired up as that
         * implies very convoluted semantics.
         */
        if (unlikely(!file->f_op->write_iter || file->f_op->write))
                return warn_unsupported(file, "write");

        init_sync_kiocb(&kiocb, file);
        if (pos)
                kiocb.ki_pos = *pos;
        else
                kiocb.ki_pos = 0;

        nwritten = file->f_op->write_iter(&kiocb, from);
        if (nwritten > 0) {
                if (pos)
                        *pos = kiocb.ki_pos;
                fsnotify_modify(file);
                add_wchar(current, nwritten);
        }
        /* The write-call counter is bumped whatever the outcome. */
        inc_syscw(current);

        return nwritten;
}

/*
 * Write a kernel buffer by wrapping it in a single-segment kvec iterator
 * and handing it to __kernel_write_iter(). At most MAX_RW_COUNT bytes are
 * submitted.
 *
 * caller is responsible for file_start_write/file_end_write
 */
ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
{
        const size_t len = min_t(size_t, count, MAX_RW_COUNT);
        struct kvec iov = {
                .iov_base        = (void *)buf,
                .iov_len        = len,
        };
        struct iov_iter iter;

        iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, len);

        return __kernel_write_iter(file, &iter, pos);
}
/*
 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
 * but autofs is one of the few internal kernel users that actually
 * wants this _and_ can be built as a module. So we need to export
 * this symbol for autofs, even though it really isn't appropriate
 * for any other kernel modules.
 */
EXPORT_SYMBOL_GPL(__kernel_write);

/*
 * Checked variant of __kernel_write(): validates the range with
 * rw_verify_area() and brackets the write with
 * file_start_write()/file_end_write().
 */
ssize_t kernel_write(struct file *file, const void *buf, size_t count,
                     loff_t *pos)
{
        ssize_t nwritten = rw_verify_area(WRITE, file, pos, count);

        if (nwritten != 0)
                return nwritten;

        file_start_write(file);
        nwritten = __kernel_write(file, buf, count, pos);
        file_end_write(file);

        return nwritten;
}
EXPORT_SYMBOL(kernel_write);

/*
 * Write the user buffer @buf to @file.
 *
 * Uses ->write when the file provides it, otherwise ->write_iter via
 * new_sync_write(); with neither the result is -EINVAL. The request is
 * capped at MAX_RW_COUNT bytes and runs between
 * file_start_write()/file_end_write().
 */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
        ssize_t nwritten;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (unlikely(!access_ok(buf, count)))
                return -EFAULT;

        nwritten = rw_verify_area(WRITE, file, pos, count);
        if (nwritten)
                return nwritten;

        count = min_t(size_t, count, MAX_RW_COUNT);

        file_start_write(file);
        if (file->f_op->write) {
                nwritten = file->f_op->write(file, buf, count, pos);
        } else if (file->f_op->write_iter) {
                nwritten = new_sync_write(file, buf, count, pos);
        } else {
                nwritten = -EINVAL;
        }

        /* Notification and byte accounting happen only when data was written. */
        if (nwritten > 0) {
                fsnotify_modify(file);
                add_wchar(current, nwritten);
        }
        inc_syscw(current);
        file_end_write(file);

        return nwritten;
}

/* file_ppos returns &file->f_pos, or NULL if FMODE_STREAM is set on the file */
static inline loff_t *file_ppos(struct file *file)
{
        if (file->f_mode & FMODE_STREAM)
                return NULL;

        return &file->f_pos;
}

/*
 * Common body of the read system call.
 *
 * The read works on a local copy of the file position, which is stored
 * back into f_pos only when vfs_read() did not fail. Files for which
 * file_ppos() returns NULL are read without a position.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
        CLASS(fd_pos, f)(fd);
        loff_t pos, *ppos;
        ssize_t ret;

        if (fd_empty(f))
                return -EBADF;

        ppos = file_ppos(fd_file(f));
        if (ppos) {
                pos = *ppos;
                ppos = &pos;
        }

        ret = vfs_read(fd_file(f), buf, count, ppos);
        if (ppos && ret >= 0)
                fd_file(f)->f_pos = pos;

        return ret;
}

/* read system call entry point: forwards all arguments to ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
        return ksys_read(fd, buf, count);
}

/*
 * Common body of the write system call.
 *
 * The write works on a local copy of the file position, which is stored
 * back into f_pos only when vfs_write() did not fail. Files for which
 * file_ppos() returns NULL are written without a position.
 */
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
        CLASS(fd_pos, f)(fd);
        loff_t pos, *ppos;
        ssize_t ret;

        if (fd_empty(f))
                return -EBADF;

        ppos = file_ppos(fd_file(f));
        if (ppos) {
                pos = *ppos;
                ppos = &pos;
        }

        ret = vfs_write(fd_file(f), buf, count, ppos);
        if (ppos && ret >= 0)
                fd_file(f)->f_pos = pos;

        return ret;
}

/* write system call entry point: forwards all arguments to ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
                size_t, count)
{
        return ksys_write(fd, buf, count);
}

/*
 * Common body of the pread64 system calls: read at the explicit position
 * @pos. The position is a by-value local, so f_pos is not written here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF for an empty descriptor and
 * -ESPIPE when the file lacks FMODE_PREAD.
 */
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
                     loff_t pos)
{
        if (pos < 0)
                return -EINVAL;

        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return -EBADF;

        if (!(fd_file(f)->f_mode & FMODE_PREAD))
                return -ESPIPE;

        return vfs_read(fd_file(f), buf, count, &pos);
}

/* pread64 system call entry point: forwards all arguments to ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
                        size_t, count, loff_t, pos)
{
        return ksys_pread64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
/*
 * Compat pread64 entry point: the position is declared with
 * compat_arg_u64_dual() and reassembled with compat_arg_u64_glue() before
 * calling ksys_pread64().
 */
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
                       size_t, count, compat_arg_u64_dual(pos))
{
        return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Common body of the pwrite64 system calls: write at the explicit position
 * @pos. The position is a by-value local, so f_pos is not written here.
 *
 * Returns -EINVAL for a negative @pos, -EBADF for an empty descriptor and
 * -ESPIPE when the file lacks FMODE_PWRITE.
 */
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
                      size_t count, loff_t pos)
{
        if (pos < 0)
                return -EINVAL;

        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return -EBADF;

        if (!(fd_file(f)->f_mode & FMODE_PWRITE))
                return -ESPIPE;

        return vfs_write(fd_file(f), buf, count, &pos);
}

/* pwrite64 system call entry point: forwards all arguments to ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
                         size_t, count, loff_t, pos)
{
        return ksys_pwrite64(fd, buf, count, pos);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
/*
 * Compat pwrite64 entry point: the position is declared with
 * compat_arg_u64_dual() and reassembled with compat_arg_u64_glue() before
 * calling ksys_pwrite64().
 */
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
                       size_t, count, compat_arg_u64_dual(pos))
{
        return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos));
}
#endif

/*
 * Run one synchronous ->read_iter or ->write_iter call, selected by @type,
 * with the per-call @flags applied to the kiocb.
 *
 * When @ppos is given it is updated from the kiocb position afterwards,
 * whatever the method returned.
 */
static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        struct kiocb kiocb;
        ssize_t done;
        int err;

        init_sync_kiocb(&kiocb, filp);
        err = kiocb_set_rw_flags(&kiocb, flags, type);
        if (err)
                return err;

        if (ppos)
                kiocb.ki_pos = *ppos;
        else
                kiocb.ki_pos = 0;

        if (type == READ)
                done = filp->f_op->read_iter(&kiocb, iter);
        else
                done = filp->f_op->write_iter(&kiocb, iter);
        BUG_ON(done == -EIOCBQUEUED);

        if (ppos)
                *ppos = kiocb.ki_pos;

        return done;
}

/*
 * Do it by hand, with file-ops: walk the iterator and issue one ->read or
 * ->write call per step, for files that do not provide the *_iter methods.
 *
 * Returns the number of bytes transferred, or the first error if nothing
 * was transferred before it. Any flag other than RWF_HIPRI is rejected
 * with -EOPNOTSUPP.
 */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
                loff_t *ppos, int type, rwf_t flags)
{
        ssize_t ret = 0;

        if (flags & ~RWF_HIPRI)
                return -EOPNOTSUPP;

        while (iov_iter_count(iter)) {
                ssize_t nr;

                if (type == READ) {
                        nr = filp->f_op->read(filp, iter_iov_addr(iter),
                                                iter_iov_len(iter), ppos);
                } else {
                        nr = filp->f_op->write(filp, iter_iov_addr(iter),
                                                iter_iov_len(iter), ppos);
                }

                if (nr < 0) {
                        /* Report the error only if no data has moved yet. */
                        if (!ret)
                                ret = nr;
                        break;
                }
                ret += nr;
                /* A short transfer ends the loop without advancing further. */
                if (nr != iter_iov_len(iter))
                        break;
                iov_iter_advance(iter, nr);
        }

        return ret;
}

/*
 * Read through ->read_iter using a caller-supplied kiocb.
 *
 * An empty iterator skips the range check and the ->read_iter call and
 * yields 0. fsnotify_access() is called for every non-negative result,
 * including that one.
 */
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter)
{
        ssize_t ret = 0;
        size_t tot_len;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        tot_len = iov_iter_count(iter);
        if (tot_len) {
                ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
                if (ret < 0)
                        return ret;

                ret = file->f_op->read_iter(iocb, iter);
        }

        if (ret >= 0)
                fsnotify_access(file);
        return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_read);

/*
 * Read through ->read_iter at the position in @ppos, with per-call @flags.
 *
 * An empty iterator skips the range check and the read and yields 0.
 * fsnotify_access() is called for every non-negative result, including
 * that one.
 */
ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                      rwf_t flags)
{
        ssize_t ret = 0;
        size_t tot_len;

        if (!file->f_op->read_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        tot_len = iov_iter_count(iter);
        if (tot_len) {
                ret = rw_verify_area(READ, file, ppos, tot_len);
                if (ret < 0)
                        return ret;

                ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
        }

        if (ret >= 0)
                fsnotify_access(file);
        return ret;
}
EXPORT_SYMBOL(vfs_iter_read);

/*
 * Write through ->write_iter using a caller-supplied kiocb.
 *
 * Caller is responsible for calling kiocb_end_write() on completion
 * if async iocb was queued.
 */
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter)
{
        size_t tot_len;
        ssize_t ret;

        if (!file->f_op->write_iter)
                return -EINVAL;
        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        /* Nothing to write: succeed without touching the file. */
        tot_len = iov_iter_count(iter);
        if (tot_len == 0)
                return 0;

        ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
        if (ret < 0)
                return ret;

        kiocb_start_write(iocb);
        ret = file->f_op->write_iter(iocb, iter);
        /* A queued request is ended later by the caller, not here. */
        if (ret != -EIOCBQUEUED)
                kiocb_end_write(iocb);
        if (ret > 0)
                fsnotify_modify(file);

        return ret;
}
EXPORT_SYMBOL(vfs_iocb_iter_write);

/*
 * Write through ->write_iter at the position in @ppos, with per-call
 * @flags. The write runs between file_start_write()/file_end_write().
 * An empty iterator returns 0 without touching the file.
 */
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                       rwf_t flags)
{
        size_t tot_len;
        ssize_t nwritten;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;
        if (!file->f_op->write_iter)
                return -EINVAL;

        tot_len = iov_iter_count(iter);
        if (tot_len == 0)
                return 0;

        nwritten = rw_verify_area(WRITE, file, ppos, tot_len);
        if (nwritten < 0)
                return nwritten;

        file_start_write(file);
        nwritten = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
        if (nwritten > 0)
                fsnotify_modify(file);
        file_end_write(file);

        return nwritten;
}
EXPORT_SYMBOL(vfs_iter_write);

/*
 * Vectored read from @file into the user iovec array @vec.
 *
 * The iovec is imported first. The read then uses ->read_iter when the
 * file provides it and the per-segment ->read loop otherwise.
 * fsnotify_access() is called for every non-negative result, including an
 * empty request.
 */
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
                         unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        size_t tot_len;
        ssize_t ret;

        if (!(file->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_READ))
                return -EINVAL;

        ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
                           &iter);
        if (ret < 0)
                return ret;

        tot_len = iov_iter_count(&iter);
        if (tot_len) {
                ret = rw_verify_area(READ, file, pos, tot_len);
                if (ret >= 0) {
                        if (file->f_op->read_iter)
                                ret = do_iter_readv_writev(file, &iter, pos,
                                                           READ, flags);
                        else
                                ret = do_loop_readv_writev(file, &iter, pos,
                                                           READ, flags);
                }
        }

        if (ret >= 0)
                fsnotify_access(file);
        /* The iovec pointer import_iovec() left in @iov is freed on every path. */
        kfree(iov);
        return ret;
}

/*
 * Vectored write of the user iovec array @vec to @file.
 *
 * The iovec is imported first. The write then uses ->write_iter when the
 * file provides it and the per-segment ->write loop otherwise, between
 * file_start_write()/file_end_write().
 */
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
                          unsigned long vlen, loff_t *pos, rwf_t flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        size_t tot_len;
        ssize_t ret;

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;
        if (!(file->f_mode & FMODE_CAN_WRITE))
                return -EINVAL;

        ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
                           &iter);
        if (ret < 0)
                return ret;

        tot_len = iov_iter_count(&iter);
        if (tot_len) {
                ret = rw_verify_area(WRITE, file, pos, tot_len);
                if (ret >= 0) {
                        file_start_write(file);
                        if (file->f_op->write_iter)
                                ret = do_iter_readv_writev(file, &iter, pos,
                                                           WRITE, flags);
                        else
                                ret = do_loop_readv_writev(file, &iter, pos,
                                                           WRITE, flags);
                        if (ret > 0)
                                fsnotify_modify(file);
                        file_end_write(file);
                }
        }

        /* The iovec pointer import_iovec() left in @iov is freed on every path. */
        kfree(iov);
        return ret;
}

/*
 * Vectored read at the current file position.
 *
 * Works on a local copy of the position, which is stored back into f_pos
 * only when vfs_readv() did not fail. The read-call counter is bumped even
 * for an empty descriptor.
 */
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
                        unsigned long vlen, rwf_t flags)
{
        CLASS(fd_pos, f)(fd);
        ssize_t ret;

        if (fd_empty(f)) {
                ret = -EBADF;
        } else {
                loff_t *ppos = file_ppos(fd_file(f));
                loff_t pos;

                if (ppos) {
                        pos = *ppos;
                        ppos = &pos;
                }
                ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
                if (ppos && ret >= 0)
                        fd_file(f)->f_pos = pos;
        }

        if (ret > 0)
                add_rchar(current, ret);
        inc_syscr(current);
        return ret;
}

/*
 * Vectored write at the current file position.
 *
 * Works on a local copy of the position, which is stored back into f_pos
 * only when vfs_writev() did not fail. The write-call counter is bumped
 * even for an empty descriptor.
 */
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, rwf_t flags)
{
        CLASS(fd_pos, f)(fd);
        ssize_t ret;

        if (fd_empty(f)) {
                ret = -EBADF;
        } else {
                loff_t *ppos = file_ppos(fd_file(f));
                loff_t pos;

                if (ppos) {
                        pos = *ppos;
                        ppos = &pos;
                }
                ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
                if (ppos && ret >= 0)
                        fd_file(f)->f_pos = pos;
        }

        if (ret > 0)
                add_wchar(current, ret);
        inc_syscw(current);
        return ret;
}

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_LONG_BITS (BITS_PER_LONG / 2)
        return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
}

/*
 * Vectored read at the explicit position @pos.
 *
 * A negative @pos fails with -EINVAL before any accounting. After that the
 * read-call counter is bumped for every outcome, including -EBADF (empty
 * descriptor) and -ESPIPE (file lacks FMODE_PREAD).
 */
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
                         unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t ret;

        if (pos < 0)
                return -EINVAL;

        CLASS(fd, f)(fd);
        if (fd_empty(f))
                ret = -EBADF;
        else if (!(fd_file(f)->f_mode & FMODE_PREAD))
                ret = -ESPIPE;
        else
                ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);

        if (ret > 0)
                add_rchar(current, ret);
        inc_syscr(current);
        return ret;
}

/*
 * Vectored write at an explicit offset; the file's own position is not used.
 *
 * Returns -EINVAL for a negative @pos (before any accounting is done),
 * -EBADF when @fd does not resolve to a file, -ESPIPE when the file lacks
 * FMODE_PWRITE, and otherwise whatever vfs_writev() returns.  Bytes written
 * are accounted with add_wchar(); inc_syscw() is bumped once the offset
 * check has passed.
 */
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
                          unsigned long vlen, loff_t pos, rwf_t flags)
{
        ssize_t nwritten;

        if (pos < 0)
                return -EINVAL;

        CLASS(fd, f)(fd);
        if (fd_empty(f))
                nwritten = -EBADF;
        else if (!(fd_file(f)->f_mode & FMODE_PWRITE))
                nwritten = -ESPIPE;
        else
                nwritten = vfs_writev(fd_file(f), vec, vlen, &pos, flags);

        if (nwritten > 0)
                add_wchar(current, nwritten);
        inc_syscw(current);
        return nwritten;
}

/*
 * readv(2) entry point: forwards to do_readv() with no RWF flags set.
 */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_readv(fd, vec, vlen, 0);
}

/*
 * writev(2) entry point: forwards to do_writev() with no RWF flags set.
 */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen)
{
        return do_writev(fd, vec, vlen, 0);
}

/*
 * preadv(2) entry point.  The offset arrives split into @pos_l and @pos_h
 * and is reassembled with pos_from_hilo(); no RWF flags are passed on.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * preadv2(2) entry point.  An assembled offset of -1 routes the request to
 * do_readv(); any other offset goes to do_preadv().  @flags is passed
 * through in both cases.
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        loff_t offset = pos_from_hilo(pos_h, pos_l);

        return offset == -1 ? do_readv(fd, vec, vlen, flags)
                            : do_preadv(fd, vec, vlen, offset, flags);
}

/*
 * pwritev(2) entry point.  The offset arrives split into @pos_l and @pos_h
 * and is reassembled with pos_from_hilo(); no RWF flags are passed on.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
        return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}

/*
 * pwritev2(2) entry point.  An assembled offset of -1 routes the request to
 * do_writev(); any other offset goes to do_pwritev().  @flags is passed
 * through in both cases.
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
                unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
                rwf_t, flags)
{
        loff_t offset = pos_from_hilo(pos_h, pos_l);

        return offset == -1 ? do_writev(fd, vec, vlen, flags)
                            : do_pwritev(fd, vec, vlen, offset, flags);
}

/*
 * Various compat syscalls.  Note that they all pretend to take a native
 * iovec - import_iovec will properly treat those as compat_iovecs based on
 * in_compat_syscall().
 */
#ifdef CONFIG_COMPAT
#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64: the offset arrives as a single loff_t argument, so it is
 * handed to do_preadv() unchanged, with no RWF flags.
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_preadv(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat preadv: rebuild the 64-bit offset from its two u32 halves
 * (@pos_high in the upper 32 bits, @pos_low in the lower) and call
 * do_preadv() with no RWF flags.
 */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t offset = (loff_t)pos_high;

        offset <<= 32;
        offset |= pos_low;

        return do_preadv(fd, vec, vlen, offset, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: the offset arrives as a single loff_t.  An offset of -1
 * routes the request to do_readv(); anything else goes to do_preadv().
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        return pos == -1 ? do_readv(fd, vec, vlen, flags)
                         : do_preadv(fd, vec, vlen, pos, flags);
}
#endif

/*
 * Compat preadv2: rebuild the 64-bit offset from its two u32 halves.  An
 * assembled offset of -1 routes the request to do_readv(); anything else
 * goes to do_preadv().  @flags is passed through in both cases.
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
                rwf_t, flags)
{
        loff_t offset = (loff_t)pos_high;

        offset <<= 32;
        offset |= pos_low;

        if (offset != -1)
                return do_preadv(fd, vec, vlen, offset, flags);
        return do_readv(fd, vec, vlen, flags);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64: the offset arrives as a single loff_t argument, so it is
 * handed to do_pwritev() unchanged, with no RWF flags.
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos)
{
        return do_pwritev(fd, vec, vlen, pos, 0);
}
#endif

/*
 * Compat pwritev: rebuild the 64-bit offset from its two u32 halves
 * (@pos_high in the upper 32 bits, @pos_low in the lower) and call
 * do_pwritev() with no RWF flags.
 */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
        loff_t offset = (loff_t)pos_high;

        offset <<= 32;
        offset |= pos_low;

        return do_pwritev(fd, vec, vlen, offset, 0);
}

#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: the offset arrives as a single loff_t.  An offset of
 * -1 routes the request to do_writev(); anything else goes to do_pwritev().
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
                const struct iovec __user *, vec,
                unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
        return pos == -1 ? do_writev(fd, vec, vlen, flags)
                         : do_pwritev(fd, vec, vlen, pos, flags);
}
#endif

/*
 * Compat pwritev2: rebuild the 64-bit offset from its two u32 halves.  An
 * assembled offset of -1 routes the request to do_writev(); anything else
 * goes to do_pwritev().  @flags is passed through in both cases.
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
                const struct iovec __user *, vec,
                compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
        loff_t offset = (loff_t)pos_high;

        offset <<= 32;
        offset |= pos_low;

        if (offset != -1)
                return do_pwritev(fd, vec, vlen, offset, flags);
        return do_writev(fd, vec, vlen, flags);
}
#endif /* CONFIG_COMPAT */

/*
 * Common implementation behind the sendfile syscall variants.
 *
 * @out_fd: descriptor to write to; must be open with FMODE_WRITE.
 * @in_fd:  descriptor to read from; must be open with FMODE_READ.
 * @ppos:   if non-NULL, in/out read offset for the input file (requires
 *          FMODE_PREAD, else -ESPIPE).  If NULL, the input file's f_pos is
 *          used and updated instead.
 * @count:  number of bytes requested; clamped to MAX_RW_COUNT and to the
 *          room left below @max.
 * @max:    upper bound for the input offset; 0 selects the smaller
 *          s_maxbytes of the two superblocks involved.
 *
 * The output side always starts at the output file's f_pos.  Positions are
 * written back only when at least one byte was transferred.
 *
 * Returns the number of bytes transferred or a negative errno.  Validation
 * failures return before the syscall counters are incremented.
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
                           size_t count, loff_t max)
{
        struct inode *in_inode, *out_inode;
        struct pipe_inode_info *opipe;
        loff_t pos;
        loff_t out_pos;
        ssize_t retval;
        int fl;

        /*
         * Get input file, and verify that it is ok..
         */
        CLASS(fd, in)(in_fd);
        if (fd_empty(in))
                return -EBADF;
        if (!(fd_file(in)->f_mode & FMODE_READ))
                return -EBADF;
        if (!ppos) {
                pos = fd_file(in)->f_pos;
        } else {
                pos = *ppos;
                /* An explicit offset is only allowed on pread-capable files. */
                if (!(fd_file(in)->f_mode & FMODE_PREAD))
                        return -ESPIPE;
        }
        retval = rw_verify_area(READ, fd_file(in), &pos, count);
        if (retval < 0)
                return retval;
        if (count > MAX_RW_COUNT)
                count =  MAX_RW_COUNT;

        /*
         * Get output file, and verify that it is ok..
         */
        CLASS(fd, out)(out_fd);
        if (fd_empty(out))
                return -EBADF;
        if (!(fd_file(out)->f_mode & FMODE_WRITE))
                return -EBADF;
        in_inode = file_inode(fd_file(in));
        out_inode = file_inode(fd_file(out));
        out_pos = fd_file(out)->f_pos;

        /* No caller-imposed bound: use the tighter of the two sb limits. */
        if (!max)
                max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

        /* Shorten the request so it ends at @max; fail if already there. */
        if (unlikely(pos + count > max)) {
                if (pos >= max)
                        return -EOVERFLOW;
                count = max - pos;
        }

        fl = 0;
#if 0
        /*
         * We need to debate whether we can enable this or not. The
         * man page documents EAGAIN return for the output at least,
         * and the application is arguably buggy if it doesn't expect
         * EAGAIN on a non-blocking file descriptor.
         */
        if (fd_file(in)->f_flags & O_NONBLOCK)
                fl = SPLICE_F_NONBLOCK;
#endif
        /* Pipe outputs take the file-to-pipe path; others splice directly. */
        opipe = get_pipe_info(fd_file(out), true);
        if (!opipe) {
                retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
                if (retval < 0)
                        return retval;
                retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
                                          count, fl);
        } else {
                if (fd_file(out)->f_flags & O_NONBLOCK)
                        fl |= SPLICE_F_NONBLOCK;

                retval = splice_file_to_pipe(fd_file(in), opipe, &pos, count, fl);
        }

        /* Account, notify and commit positions only if data actually moved. */
        if (retval > 0) {
                add_rchar(current, retval);
                add_wchar(current, retval);
                fsnotify_access(fd_file(in));
                fsnotify_modify(fd_file(out));
                fd_file(out)->f_pos = out_pos;
                if (ppos)
                        *ppos = pos;
                else
                        fd_file(in)->f_pos = pos;
        }

        inc_syscr(current);
        inc_syscw(current);
        /* A final offset beyond @max overrides the result with -EOVERFLOW. */
        if (pos > max)
                retval = -EOVERFLOW;
        return retval;
}

/*
 * sendfile(2) entry point using an off_t offset.
 *
 * With a NULL @offset the input file's own position is used and no size
 * bound is passed.  Otherwise the offset is read from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting offset is written
 * back to user space whether or not the transfer succeeded.  A fault on
 * either user access yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
        loff_t pos;
        off_t user_off;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(user_off, offset)))
                return -EFAULT;

        pos = user_off;
        ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

        if (unlikely(put_user(pos, offset)))
                return -EFAULT;
        return ret;
}

/*
 * sendfile64(2) entry point using a loff_t offset.
 *
 * With a NULL @offset the input file's own position is used.  Otherwise the
 * offset is copied in from user space and the resulting offset is written
 * back whether or not the transfer succeeded.  No explicit size bound is
 * passed to do_sendfile() in either case.  A fault on either user access
 * yields -EFAULT.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
        loff_t pos;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
                return -EFAULT;

        ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

        if (unlikely(put_user(pos, offset)))
                return -EFAULT;
        return ret;
}

#ifdef CONFIG_COMPAT
/*
 * Compat sendfile entry point; @offset points at a compat_off_t.
 *
 * With a NULL @offset the input file's own position is used and no size
 * bound is passed.  Otherwise the offset is read from user space, the
 * transfer is bounded by MAX_NON_LFS, and the resulting offset is written
 * back whether or not the transfer succeeded.  A fault on either user
 * access yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
                compat_off_t __user *, offset, compat_size_t, count)
{
        loff_t pos;
        off_t user_off;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(get_user(user_off, offset)))
                return -EFAULT;

        pos = user_off;
        ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

        if (unlikely(put_user(pos, offset)))
                return -EFAULT;
        return ret;
}

/*
 * Compat sendfile64 entry point; @offset points at a compat_loff_t.
 *
 * With a NULL @offset the input file's own position is used.  Otherwise the
 * offset is copied in from user space and the resulting offset is written
 * back whether or not the transfer succeeded.  No explicit size bound is
 * passed to do_sendfile() in either case.  A fault on either user access
 * yields -EFAULT.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
                compat_loff_t __user *, offset, compat_size_t, count)
{
        loff_t pos;
        ssize_t ret;

        if (!offset)
                return do_sendfile(out_fd, in_fd, NULL, count, 0);

        if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
                return -EFAULT;

        ret = do_sendfile(out_fd, in_fd, &pos, count, 0);

        if (unlikely(put_user(pos, offset)))
                return -EFAULT;
        return ret;
}
#endif

/*
 * Performs necessary checks before doing a file copy
 *
 * @file_in/@pos_in:   source file and starting offset.
 * @file_out/@pos_out: destination file and starting offset.
 * @req_count:         in/out byte count; may be shortened to the source's
 *                     EOF and to the destination's write limits.
 * @flags:             only COPY_FILE_SPLICE is examined here.
 *
 * Returns zero in case the copy should be allowed, otherwise the error code
 * the caller should return: the result of generic_file_rw_checks() or
 * generic_write_check_limits(), -EXDEV for an unsupported file pairing,
 * -EPERM for an immutable destination, -ETXTBSY if either inode is a swap
 * file, -EOVERFLOW if an offset would wrap, or -EINVAL for an overlapping
 * copy within one inode.
 */
static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    size_t *req_count, unsigned int flags)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
        uint64_t count = *req_count;
        loff_t size_in;
        int ret;

        ret = generic_file_rw_checks(file_in, file_out);
        if (ret)
                return ret;

        /*
         * We allow some filesystems to handle cross sb copy, but passing
         * a file of the wrong filesystem type to filesystem driver can result
         * in an attempt to dereference the wrong type of ->private_data, so
         * avoid doing that until we really have a good reason.
         *
         * nfs and cifs define several different file_system_type structures
         * and several different sets of file_operations, but they all end up
         * using the same ->copy_file_range() function pointer.
         */
        if (flags & COPY_FILE_SPLICE) {
                /* cross sb splice is allowed */
        } else if (file_out->f_op->copy_file_range) {
                /* Both files must share the same ->copy_file_range() method. */
                if (file_in->f_op->copy_file_range !=
                    file_out->f_op->copy_file_range)
                        return -EXDEV;
        } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
                /* No copy method at all: only same-superblock copies pass. */
                return -EXDEV;
        }

        /* Don't touch certain kinds of inodes */
        if (IS_IMMUTABLE(inode_out))
                return -EPERM;

        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
                return -ETXTBSY;

        /* Ensure offsets don't wrap. */
        if (pos_in + count < pos_in || pos_out + count < pos_out)
                return -EOVERFLOW;

        /* Shorten the copy to EOF */
        size_in = i_size_read(inode_in);
        if (pos_in >= size_in)
                count = 0;
        else
                count = min(count, size_in - (uint64_t)pos_in);

        /* Apply the destination's size limits, possibly shortening further. */
        ret = generic_write_check_limits(file_out, pos_out, &count);
        if (ret)
                return ret;

        /* Don't allow overlapped copying within the same file. */
        if (inode_in == inode_out &&
            pos_out + count > pos_in &&
            pos_out < pos_in + count)
                return -EINVAL;

        /* Report the possibly shortened length back to the caller. */
        *req_count = count;
        return 0;
}

/*
 * copy_file_range() differs from regular file read and write in that it
 * specifically allows returning partial success.  When it does so is up to
 * the copy_file_range method.
 *
 * @file_in/@pos_in:   source file and starting offset.
 * @file_out/@pos_out: destination file and starting offset.
 * @len:               number of bytes requested; may be shortened by
 *                     generic_copy_file_checks().
 * @flags:             only COPY_FILE_SPLICE is accepted; any other bit
 *                     yields -EINVAL.
 *
 * Returns the number of bytes copied (0 if the checks reduce @len to zero)
 * or a negative errno.  The file positions passed in are by value, so the
 * caller is responsible for advancing any offsets it tracks.
 */
ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            size_t len, unsigned int flags)
{
        ssize_t ret;
        bool splice = flags & COPY_FILE_SPLICE;
        bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;

        if (flags & ~COPY_FILE_SPLICE)
                return -EINVAL;

        ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
                                       flags);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(READ, file_in, &pos_in, len);
        if (unlikely(ret))
                return ret;

        ret = rw_verify_area(WRITE, file_out, &pos_out, len);
        if (unlikely(ret))
                return ret;

        /* Nothing left to copy after the checks shortened the request. */
        if (len == 0)
                return 0;

        file_start_write(file_out);

        /*
         * Cloning is supported by more file systems, so we implement copy on
         * same sb using clone, but for filesystems where both clone and copy
         * are supported (e.g. nfs,cifs), we only call the copy method.
         */
        if (!splice && file_out->f_op->copy_file_range) {
                ret = file_out->f_op->copy_file_range(file_in, pos_in,
                                                      file_out, pos_out,
                                                      len, flags);
        } else if (!splice && file_in->f_op->remap_file_range && samesb) {
                ret = file_in->f_op->remap_file_range(file_in, pos_in,
                                file_out, pos_out,
                                min_t(loff_t, MAX_RW_COUNT, len),
                                REMAP_FILE_CAN_SHORTEN);
                /* fallback to splice */
                if (ret <= 0)
                        splice = true;
        } else if (samesb) {
                /* Fallback to splice for same sb copy for backward compat */
                splice = true;
        }

        file_end_write(file_out);

        if (!splice)
                goto done;

        /*
         * We can get here for same sb copy of filesystems that do not implement
         * ->copy_file_range() in case filesystem does not support clone or in
         * case filesystem supports clone but rejected the clone request (e.g.
         * because it was not block aligned).
         *
         * In both cases, fall back to kernel copy so we are able to maintain a
         * consistent story about which filesystems support copy_file_range()
         * and which filesystems do not, that will allow userspace tools to
         * make consistent decisions w.r.t using copy_file_range().
         *
         * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
         * for server-side-copy between any two sb.
         *
         * In any case, we call do_splice_direct() and not splice_file_range(),
         * without file_start_write() held, to avoid possible deadlocks related
         * to splicing from input file, while file_start_write() is held on
         * the output file on a different sb.
         */
        ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
                               min_t(size_t, len, MAX_RW_COUNT), 0);
done:
        /* Notify and account only when data was actually copied. */
        if (ret > 0) {
                fsnotify_access(file_in);
                add_rchar(current, ret);
                fsnotify_modify(file_out);
                add_wchar(current, ret);
        }

        inc_syscr(current);
        inc_syscw(current);

        return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);

/*
 * copy_file_range(2) entry point.
 *
 * For each side, a non-NULL user offset pointer supplies the starting
 * position and receives the advanced position afterwards; a NULL pointer
 * means the file's own f_pos is used and updated instead.  Positions are
 * only advanced when at least one byte was copied.
 *
 * Returns the vfs_copy_file_range() result, -EBADF for an unusable
 * descriptor, -EFAULT if a user offset cannot be read or written back, or
 * -EINVAL for non-zero @flags.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
                int, fd_out, loff_t __user *, off_out,
                size_t, len, unsigned int, flags)
{
        loff_t pos_in;
        loff_t pos_out;
        ssize_t copied;

        CLASS(fd, f_in)(fd_in);
        if (fd_empty(f_in))
                return -EBADF;

        CLASS(fd, f_out)(fd_out);
        if (fd_empty(f_out))
                return -EBADF;

        if (!off_in)
                pos_in = fd_file(f_in)->f_pos;
        else if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
                return -EFAULT;

        if (!off_out)
                pos_out = fd_file(f_out)->f_pos;
        else if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
                return -EFAULT;

        if (flags != 0)
                return -EINVAL;

        copied = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out),
                                     pos_out, len, flags);
        if (copied <= 0)
                return copied;

        pos_in += copied;
        pos_out += copied;

        /* Publish the new input position to whoever owns it. */
        if (!off_in)
                fd_file(f_in)->f_pos = pos_in;
        else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
                copied = -EFAULT;

        /* Likewise for the output position. */
        if (!off_out)
                fd_file(f_out)->f_pos = pos_out;
        else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
                copied = -EFAULT;

        return copied;
}

/*
 * Clamp a write so it respects both the RLIMIT_FSIZE resource limit and the
 * largest file size allowed for this file: the superblock's s_maxbytes, or
 * MAX_NON_LFS when the file was opened without O_LARGEFILE.
 *
 * A write starting below a limit is shortened via @count.  A write starting
 * at or beyond a limit fails with -EFBIG; for the resource limit, SIGXFSZ is
 * sent to the current task as well.  Returns 0 otherwise.
 */
int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
{
        struct inode *inode = file->f_mapping->host;
        loff_t fs_max = inode->i_sb->s_maxbytes;
        loff_t fsize_limit = rlimit(RLIMIT_FSIZE);
        loff_t ceiling;

        if (fsize_limit != RLIM_INFINITY) {
                if (pos >= fsize_limit) {
                        send_sig(SIGXFSZ, current, 0);
                        return -EFBIG;
                }
                *count = min(*count, fsize_limit - pos);
        }

        if (file->f_flags & O_LARGEFILE)
                ceiling = fs_max;
        else
                ceiling = MAX_NON_LFS;

        if (unlikely(pos >= ceiling))
                return -EFBIG;

        *count = min(*count, ceiling - pos);
        return 0;
}
EXPORT_SYMBOL_GPL(generic_write_check_limits);

/*
 * Variant of generic_write_checks() that works on a byte count rather than
 * an iov_iter.
 *
 * Returns -ETXTBSY for a swap file and 0 straight away for a zero-length
 * write.  For IOCB_APPEND the kiocb position is moved to the current inode
 * size.  IOCB_NOWAIT is rejected with -EINVAL unless the write is
 * IOCB_DIRECT or the file operations advertise FOP_BUFFER_WASYNC.  The final
 * result comes from generic_write_check_limits(), which may shorten @count.
 */
int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
{
        struct file *filp = iocb->ki_filp;
        struct inode *host = filp->f_mapping->host;

        if (IS_SWAPFILE(host))
                return -ETXTBSY;

        if (*count == 0)
                return 0;

        if (iocb->ki_flags & IOCB_APPEND)
                iocb->ki_pos = i_size_read(host);

        if ((iocb->ki_flags & IOCB_NOWAIT) &&
            !(iocb->ki_flags & IOCB_DIRECT) &&
            !(filp->f_op->fop_flags & FOP_BUFFER_WASYNC))
                return -EINVAL;

        return generic_write_check_limits(filp, iocb->ki_pos, count);
}
EXPORT_SYMBOL(generic_write_checks_count);

/*
 * Run the pre-write checks for @iocb against the data described by @from.
 *
 * The write position (via @iocb) and the amount of data (by truncating
 * @from) may both be adjusted.  Returns the error from
 * generic_write_checks_count() if there is one, otherwise the number of
 * bytes left in @from after truncation.
 */
ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
        loff_t allowed = iov_iter_count(from);
        int err = generic_write_checks_count(iocb, &allowed);

        if (err)
                return err;

        iov_iter_truncate(from, allowed);
        return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);

/*
 * Common sanity checks shared by file copy and clone operations going from
 * @file_in to @file_out.
 *
 * Returns -EISDIR if either inode is a directory, -EINVAL if either is not
 * a regular file, -EBADF if the source is not readable or the destination
 * is not writable or was opened with O_APPEND, and 0 otherwise.
 */
int generic_file_rw_checks(struct file *file_in, struct file *file_out)
{
        struct inode *src = file_inode(file_in);
        struct inode *dst = file_inode(file_out);

        /* Directories get their own error code. */
        if (S_ISDIR(src->i_mode))
                return -EISDIR;
        if (S_ISDIR(dst->i_mode))
                return -EISDIR;

        /* Anything else that is not a regular file is refused. */
        if (!S_ISREG(src->i_mode))
                return -EINVAL;
        if (!S_ISREG(dst->i_mode))
                return -EINVAL;

        if (!(file_in->f_mode & FMODE_READ))
                return -EBADF;
        if (!(file_out->f_mode & FMODE_WRITE))
                return -EBADF;
        if (file_out->f_flags & O_APPEND)
                return -EBADF;

        return 0;
}

/*
 * Validate the shape of an atomic write request.
 *
 * Returns -EINVAL unless @iter is a ubuf iterator, its length is a power of
 * two, and the kiocb position is aligned to that length.  Returns
 * -EOPNOTSUPP when those hold but IOCB_DIRECT is not set, and 0 when every
 * check passes.
 */
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
        size_t len = iov_iter_count(iter);

        if (!iter_is_ubuf(iter) ||
            !is_power_of_2(len) ||
            !IS_ALIGNED(iocb->ki_pos, len))
                return -EINVAL;

        return (iocb->ki_flags & IOCB_DIRECT) ? 0 : -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(generic_atomic_write_valid);




















































  539 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Authors:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm.h
 */

#ifndef __INTEGRITY_EVM_H
#define __INTEGRITY_EVM_H

#include <linux/xattr.h>
#include <linux/security.h>

#include "../integrity.h"

/*
 * Bit flags describing EVM initialisation state.
 * NOTE(review): presumably these are the bits kept in evm_initialized -
 * confirm against the users of these masks.
 */
#define EVM_INIT_HMAC        0x0001
#define EVM_INIT_X509        0x0002
#define EVM_ALLOW_METADATA_WRITES        0x0004
#define EVM_SETUP_COMPLETE 0x80000000 /* userland has signaled key load */

/* EVM_INIT_HMAC and EVM_INIT_X509 combined. */
#define EVM_KEY_MASK (EVM_INIT_HMAC | EVM_INIT_X509)
/* All four flag bits defined above combined. */
#define EVM_INIT_MASK (EVM_INIT_HMAC | EVM_INIT_X509 | EVM_SETUP_COMPLETE | \
                       EVM_ALLOW_METADATA_WRITES)

/*
 * One entry in a linked list of extended attribute names.
 * NOTE(review): presumably the element type of evm_config_xattrnames -
 * confirm.
 */
struct xattr_list {
        struct list_head list;  /* list linkage */
        char *name;             /* xattr name string */
        bool enabled;           /* per-entry on/off switch */
};

/*
 * NOTE(review): presumably bit values for evm_iint_cache.flags - confirm
 * against the code that sets and tests them.
 */
#define EVM_NEW_FILE                        0x00000001
#define EVM_IMMUTABLE_DIGSIG                0x00000002

/* EVM integrity metadata associated with an inode */
struct evm_iint_cache {
        unsigned long flags;
        /* Stored in a 4-bit bitfield. */
        enum integrity_status evm_status:4;
        struct integrity_inode_attributes metadata_inode;
};

extern struct lsm_blob_sizes evm_blob_sizes;

/*
 * Locate EVM's per-inode data inside the inode's security blob.
 *
 * Returns NULL when the inode has no security blob; otherwise returns the
 * blob address advanced by evm_blob_sizes.lbs_inode.
 */
static inline struct evm_iint_cache *evm_iint_inode(const struct inode *inode)
{
        void *blob = inode->i_security;

        if (unlikely(!blob))
                return NULL;

        return blob + evm_blob_sizes.lbs_inode;
}

extern int evm_initialized;

#define EVM_ATTR_FSUUID                0x0001

extern int evm_hmac_attrs;

/* List of EVM protected security xattrs */
extern struct list_head evm_config_xattrnames;

/*
 * A digest header followed by storage for up to IMA_MAX_DIGEST_SIZE digest
 * bytes.  The structure is packed, so no padding is inserted between the
 * header and the digest buffer.
 */
struct evm_digest {
        struct ima_digest_data_hdr hdr;
        char digest[IMA_MAX_DIGEST_SIZE];
} __packed;

int evm_protected_xattr(const char *req_xattr_name);

int evm_init_key(void);
int evm_update_evmxattr(struct dentry *dentry,
                        const char *req_xattr_name,
                        const char *req_xattr_value,
                        size_t req_xattr_value_len);
int evm_calc_hmac(struct dentry *dentry, const char *req_xattr_name,
                  const char *req_xattr_value,
                  size_t req_xattr_value_len, struct evm_digest *data,
                  struct evm_iint_cache *iint);
int evm_calc_hash(struct dentry *dentry, const char *req_xattr_name,
                  const char *req_xattr_value,
                  size_t req_xattr_value_len, char type,
                  struct evm_digest *data, struct evm_iint_cache *iint);
int evm_init_hmac(struct inode *inode, const struct xattr *xattrs,
                  char *hmac_val);
int evm_init_secfs(void);

#endif

























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 */
#ifndef _LINUX_RADIX_TREE_H
#define _LINUX_RADIX_TREE_H

#include <linux/bitops.h>
#include <linux/gfp_types.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/math.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <linux/local_lock.h>

/* Keep unconverted code working */
#define radix_tree_root                xarray
#define radix_tree_node                xa_node

/*
 * Pool of preallocated tree nodes; one instance exists per CPU (see the
 * DECLARE_PER_CPU() that follows this definition).
 */
struct radix_tree_preload {
        local_lock_t lock;
        /* NOTE(review): presumably the number of nodes on @nodes - confirm */
        unsigned nr;
        /* nodes->parent points to next preallocated node */
        struct radix_tree_node *nodes;
};
DECLARE_PER_CPU(struct radix_tree_preload, radix_tree_preloads);

/*
 * The bottom two bits of the slot determine how the remaining bits in the
 * slot are interpreted:
 *
 * 00 - data pointer
 * 10 - internal entry
 * x1 - value entry
 *
 * The internal entry may be a pointer to the next level in the tree, a
 * sibling entry, or an indicator that the entry in this slot has been moved
 * to another location in the tree and the lookup should be restarted.  While
 * NULL fits the 'data pointer' pattern, it means that there is no entry in
 * the tree for this index (no matter what level of the tree it is found at).
 * This means that storing a NULL entry in the tree is the same as deleting
 * the entry from the tree.
 */
/* Mask selecting the two low tag bits of a slot value. */
#define RADIX_TREE_ENTRY_MASK                3UL
/* Value of those two bits (binary 10) that marks an internal entry. */
#define RADIX_TREE_INTERNAL_NODE        2UL

/*
 * Report whether @ptr carries the internal-entry tag, i.e. whether its two
 * low bits are exactly RADIX_TREE_INTERNAL_NODE.
 */
static inline bool radix_tree_is_internal_node(void *ptr)
{
        unsigned long tag_bits = (unsigned long)ptr & RADIX_TREE_ENTRY_MASK;

        return tag_bits == RADIX_TREE_INTERNAL_NODE;
}

/*** radix-tree API starts here ***/

/* The per-level shift is taken directly from the XArray chunk shift. */
#define RADIX_TREE_MAP_SHIFT        XA_CHUNK_SHIFT
/* 2^RADIX_TREE_MAP_SHIFT, and the matching low-bit mask. */
#define RADIX_TREE_MAP_SIZE        (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK        (RADIX_TREE_MAP_SIZE-1)

/* Tag limits are aliases for the XArray mark limits. */
#define RADIX_TREE_MAX_TAGS        XA_MAX_MARKS
#define RADIX_TREE_TAG_LONGS        XA_MARK_LONGS

/* Width of an index in bits: the size of unsigned long. */
#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
/* Index bits divided by the per-level shift, rounded up. */
#define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
                                          RADIX_TREE_MAP_SHIFT))

/* The IDR tag is stored in the low bits of xa_flags */
#define ROOT_IS_IDR        ((__force gfp_t)4)
/* The top bits of xa_flags are used to store the root tags */
#define ROOT_TAG_SHIFT        (__GFP_BITS_SHIFT)

/* Static initializer for a tree; expands to XARRAY_INIT(). */
#define RADIX_TREE_INIT(name, mask)        XARRAY_INIT(name, mask)

/* Define a struct radix_tree_root called @name, statically initialized. */
#define RADIX_TREE(name, mask) \
        struct radix_tree_root name = RADIX_TREE_INIT(name, mask)

/* Run-time initialization of @root; expands to xa_init_flags(). */
#define INIT_RADIX_TREE(root, mask) xa_init_flags(root, mask)

/* Return true when @root has no head entry, i.e. its xa_head is NULL. */
static inline bool radix_tree_empty(const struct radix_tree_root *root)
{
        return !root->xa_head;
}

/**
 * struct radix_tree_iter - radix tree iterator state
 *
 * @index:        index of current slot
 * @next_index:        one beyond the last index for this chunk
 * @tags:        bit-mask for tag-iterating
 * @node:        node that contains current slot
 *
 * This radix tree iterator works in terms of "chunks" of slots.  A chunk is a
 * subinterval of slots contained within one radix tree leaf node.  It is
 * described by a pointer to its first slot and a struct radix_tree_iter
 * which holds the chunk's position in the tree and its size.  For tagged
 * iteration radix_tree_iter also holds the slots' bit-mask for one chosen
 * radix tree tag.
 *
 * The pointer to the chunk's first slot is not a member of this structure;
 * only the position (@index, @next_index), the tag bits and the containing
 * node are kept here.
 */
struct radix_tree_iter {
        unsigned long        index;
        unsigned long        next_index;
        unsigned long        tags;
        struct radix_tree_node *node;
};

/**
 * Radix-tree synchronization
 *
 * The radix-tree API requires that users provide all synchronisation (with
 * specific exceptions, noted below).
 *
 * Synchronization of access to the data items being stored in the tree, and
 * management of their lifetimes must be completely managed by API users.
 *
 * For API usage, in general,
 * - any function _modifying_ the tree or tags (inserting or deleting
 *   items, setting or clearing tags) must exclude other modifications, and
 *   exclude any functions reading the tree.
 * - any function _reading_ the tree or tags (looking up items or tags,
 *   gang lookups) must exclude modifications to the tree, but may occur
 *   concurrently with other readers.
 *
 * The notable exceptions to this rule are the following functions:
 * __radix_tree_lookup
 * radix_tree_lookup
 * radix_tree_lookup_slot
 * radix_tree_tag_get
 * radix_tree_gang_lookup
 * radix_tree_gang_lookup_tag
 * radix_tree_gang_lookup_tag_slot
 * radix_tree_tagged
 *
 * The first 7 functions are able to be called locklessly, using RCU. The
 * caller must ensure calls to these functions are made within rcu_read_lock()
 * regions. Other readers (lock-free or otherwise) and modifications may be
 * running concurrently.
 *
 * It is still required that the caller manage the synchronization and lifetimes
 * of the items. So if RCU lock-free lookups are used, typically this would mean
 * that the items have their own locks, or are amenable to lock-free access; and
 * that the items are freed by RCU (or only freed after having been deleted from
 * the radix tree *and* a synchronize_rcu() grace period).
 *
 * (Note, rcu_assign_pointer and rcu_dereference are not needed to control
 * access to data items when inserting into or looking up from the radix tree)
 *
 * Note that the value returned by radix_tree_tag_get() may not be relied upon
 * if only the RCU read lock is held.  Functions to set/clear tags and to
 * delete nodes running concurrently with it may affect its result such that
 * two consecutive reads in the same locked section may return different
 * values.  If reliability is required, modification functions must also be
 * excluded from concurrency.
 *
 * radix_tree_tagged is able to be called without locking or RCU.
 */

/**
 * radix_tree_deref_slot - read the entry held in a slot
 * @slot: slot pointer, as returned by radix_tree_lookup_slot()
 *
 * Pairs with radix_tree_lookup_slot().  The caller must keep the tree at
 * least read locked from the slot lookup until after this dereference.  That
 * is not required if the write lock is held (ie. items cannot be concurrently
 * inserted).
 *
 * If only the read lock is held, the result must be validated with
 * radix_tree_deref_retry() before it is used.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot(void __rcu **slot)
{
        void *entry = rcu_dereference(*slot);

        return entry;
}

/**
 * radix_tree_deref_slot_protected - read a slot while holding the tree lock
 * @slot: slot pointer, as returned by radix_tree_lookup_slot()
 * @treelock: the tree lock; passed to lockdep_is_held() as the condition
 *            for the protected dereference
 *
 * Like radix_tree_deref_slot(), except that the caller does not hold the RCU
 * read lock; it must hold the tree lock instead so that no parallel update
 * can happen.
 *
 * Return: entry stored in that slot.
 */
static inline void *radix_tree_deref_slot_protected(void __rcu **slot,
                                                        spinlock_t *treelock)
{
        void *entry;

        entry = rcu_dereference_protected(*slot, lockdep_is_held(treelock));
        return entry;
}

/**
 * radix_tree_deref_retry        - check radix_tree_deref_slot
 * @arg:        pointer returned by radix_tree_deref_slot
 * Returns:        0 if retry is not required, otherwise retry is required
 *
 * radix_tree_deref_retry must be used with radix_tree_deref_slot.
 */
static inline int radix_tree_deref_retry(void *arg)
{
        return unlikely(radix_tree_is_internal_node(arg));
}

/**
 * radix_tree_exception        - did radix_tree_deref_slot return an exception?
 * @arg:        value returned by radix_tree_deref_slot
 * Returns:        0 if well-aligned pointer, non-0 if either kind of exception.
 *
 * The result is non-0 when any bit of @arg selected by RADIX_TREE_ENTRY_MASK
 * is set.
 */
static inline int radix_tree_exception(void *arg)
{
        unsigned long low_bits = (unsigned long)arg & RADIX_TREE_ENTRY_MASK;

        return unlikely(low_bits);
}

/*
 * Out-of-line radix tree operations; they are only declared in this header.
 * The "Radix-tree synchronization" comment earlier in this file lists which
 * of them may be called locklessly inside an rcu_read_lock() region and which
 * need the caller to exclude concurrent modification.
 */
int radix_tree_insert(struct radix_tree_root *, unsigned long index,
                        void *);
void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index,
                          struct radix_tree_node **nodep, void __rcu ***slotp);
void *radix_tree_lookup(const struct radix_tree_root *, unsigned long);
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *,
                                        unsigned long index);
void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *,
                          void __rcu **slot, void *entry);
void radix_tree_iter_replace(struct radix_tree_root *,
                const struct radix_tree_iter *, void __rcu **slot, void *entry);
void radix_tree_replace_slot(struct radix_tree_root *,
                             void __rcu **slot, void *entry);
void radix_tree_iter_delete(struct radix_tree_root *,
                        struct radix_tree_iter *iter, void __rcu **slot);
void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
void *radix_tree_delete(struct radix_tree_root *, unsigned long);
unsigned int radix_tree_gang_lookup(const struct radix_tree_root *,
                        void **results, unsigned long first_index,
                        unsigned int max_items);
int radix_tree_preload(gfp_t gfp_mask);
int radix_tree_maybe_preload(gfp_t gfp_mask);
void radix_tree_init(void);
void *radix_tree_tag_set(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void *radix_tree_tag_clear(struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
int radix_tree_tag_get(const struct radix_tree_root *,
                        unsigned long index, unsigned int tag);
void radix_tree_iter_tag_clear(struct radix_tree_root *,
                const struct radix_tree_iter *iter, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag(const struct radix_tree_root *,
                void **results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
unsigned int radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag);
int radix_tree_tagged(const struct radix_tree_root *, unsigned int tag);

/*
 * Release the local lock embedded in radix_tree_preloads.
 * NOTE(review): presumably pairs with a successful radix_tree_preload() or
 * radix_tree_maybe_preload(), which are only declared in this header -
 * confirm against their implementation.
 */
static inline void radix_tree_preload_end(void)
{
        local_unlock(&radix_tree_preloads.lock);
}

void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max);

/*
 * Bits of the @flags argument taken by radix_tree_next_chunk() and
 * radix_tree_next_slot().
 */
enum {
        RADIX_TREE_ITER_TAG_MASK = 0x0f,        /* tag index in lower nybble */
        RADIX_TREE_ITER_TAGGED   = 0x10,        /* lookup tagged slots */
        RADIX_TREE_ITER_CONTIG   = 0x20,        /* stop at first hole */
};

/**
 * radix_tree_iter_init - prepare an iterator for a new walk
 *
 * @iter:        pointer to iterator state
 * @start:        iteration starting index
 * Returns:        NULL
 */
static __always_inline void __rcu **
radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
{
        /*
         * radix_tree_next_chunk() starts its lookup from ->next_index, so
         * that is where the starting index is stored.
         */
        iter->next_index = start;

        /*
         * A zero ->index bypasses the next_index overflow protection.
         * See the comment in radix_tree_next_chunk() for details.
         */
        iter->index = 0;

        /*
         * ->tags is deliberately left uninitialized: radix_tree_next_chunk()
         * fills it in after a successful tagged chunk lookup, and nobody
         * cares about it after an unsuccessful or non-tagged lookup.
         */
        return NULL;
}

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if there are no more left
 *
 * This function looks up the next chunk in the radix tree starting from
 * @iter->next_index.  It returns a pointer to the chunk's first slot.
 * Also it fills @iter with data about chunk: position in the tree (index),
 * its end (next_index), and constructs a bit mask for tagged iterating (tags).
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *,
                             struct radix_tree_iter *iter, unsigned flags);

/**
 * radix_tree_iter_lookup - find the slot for one index
 * @root: radix tree root
 * @iter: iterator state
 * @index: key to look up
 *
 * Re-initializes @iter to start at @index and asks radix_tree_next_chunk()
 * for a RADIX_TREE_ITER_CONTIG chunk.  If @index is present in the radix
 * tree, the slot containing it is returned and @iter describes the entry.
 * If @index is not present, NULL is returned.
 */
static inline void __rcu **
radix_tree_iter_lookup(const struct radix_tree_root *root,
                        struct radix_tree_iter *iter, unsigned long index)
{
        void __rcu **slot;

        radix_tree_iter_init(iter, index);
        slot = radix_tree_next_chunk(root, iter, RADIX_TREE_ITER_CONTIG);
        return slot;
}

/**
 * radix_tree_iter_retry - restart the current chunk of the iteration
 * @iter:        iterator state
 *
 * When a tree is walked under nothing but the RCU lock, a race against
 * deletion or creation may expose a slot for which radix_tree_deref_retry()
 * returns true.  In that case call this function and continue the iteration.
 *
 * Returns NULL, which the caller is expected to use as the new slot pointer.
 */
static inline __must_check
void __rcu **radix_tree_iter_retry(struct radix_tree_iter *iter)
{
        /* Forget any pending tag bits and make the next chunk begin at ->index. */
        iter->tags = 0;
        iter->next_index = iter->index;
        return NULL;
}

/* Return @iter->index advanced by @slots; @iter itself is left unchanged. */
static inline unsigned long
__radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
{
        unsigned long advanced = iter->index;

        advanced += slots;
        return advanced;
}

/**
 * radix_tree_iter_resume - resume iterating when the chunk may be invalid
 * @slot: pointer to current slot
 * @iter: iterator state
 * Returns: New slot pointer
 *
 * If the iterator needs to release then reacquire a lock, the chunk may
 * have been invalidated by an insertion or deletion.  Call this function
 * before releasing the lock to continue the iteration from the next index.
 */
void __rcu **__must_check radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter);

/**
 * radix_tree_chunk_size - number of slots in the current chunk
 *
 * @iter:        pointer to radix tree iterator
 * Returns:        current chunk size, i.e. @iter->next_index - @iter->index
 */
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
        unsigned long span = iter->next_index - iter->index;

        return span;
}

/**
 * radix_tree_next_slot - advance to the next slot within the current chunk
 *
 * @slot:        pointer to current slot
 * @iter:        pointer to iterator state
 * @flags:        RADIX_TREE_ITER_*, should be constant
 * Returns:        pointer to next slot, or NULL if there are no more left
 *
 * On a successful lookup @iter->index is updated to the index of the slot
 * that is returned.  A tagged lookup additionally consumes bits from
 * @iter->tags.
 *
 * 'slot' may legitimately be NULL on entry after radix_tree_iter_resume() or
 * radix_tree_iter_retry().  It is never dereferenced in those cases because
 * either:
 * a) the iteration is tagged and iter->tags has been set to 0, or
 * b) the iteration is non-tagged, and iter->index and iter->next_index have
 *    been set up so that radix_tree_chunk_size() returns 1 or 0.
 */
static __always_inline void __rcu **radix_tree_next_slot(void __rcu **slot,
                                struct radix_tree_iter *iter, unsigned flags)
{
        long left;

        if (flags & RADIX_TREE_ITER_TAGGED) {
                unsigned skip;

                /* Drop the bit belonging to the slot we are leaving. */
                iter->tags >>= 1;
                if (unlikely(!iter->tags))
                        return NULL;

                /* Common case: the very next slot is tagged as well. */
                if (likely(iter->tags & 1ul)) {
                        iter->index = __radix_tree_iter_add(iter, 1);
                        return slot + 1;
                }

                /* A contiguous walk ends at the first untagged slot. */
                if (flags & RADIX_TREE_ITER_CONTIG)
                        return NULL;

                /*
                 * Jump over the untagged slots: shift the mask so the next
                 * set bit lands in bit 0, then move one slot further than
                 * the number of bits skipped.
                 */
                skip = __ffs(iter->tags);
                iter->tags >>= skip;
                skip++;
                iter->index = __radix_tree_iter_add(iter, skip);
                return slot + skip;
        }

        /* Non-tagged walk: visit the remaining slots of this chunk. */
        for (left = radix_tree_chunk_size(iter) - 1; left > 0; left--) {
                slot++;
                iter->index = __radix_tree_iter_add(iter, 1);

                if (likely(*slot))
                        return slot;

                if (flags & RADIX_TREE_ITER_CONTIG) {
                        /* forbid switching to the next chunk */
                        iter->next_index = 0;
                        break;
                }
        }

        return NULL;
}

/**
 * radix_tree_for_each_slot - iterate over non-empty slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * The loop begins with @slot set to NULL (the return value of
 * radix_tree_iter_init()).  Whenever @slot is NULL, radix_tree_next_chunk()
 * is called with flags 0; the loop ends when that also returns NULL.
 * Within a chunk, radix_tree_next_slot() advances @slot.  The @slot, @root
 * and @iter arguments are expanded more than once.
 */
#define radix_tree_for_each_slot(slot, root, iter, start)                \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter, 0)) ;        \
             slot = radix_tree_next_slot(slot, iter, 0))

/**
 * radix_tree_for_each_tagged - iterate over tagged slots
 *
 * @slot:        the void** variable for pointer to slot
 * @root:        the struct radix_tree_root pointer
 * @iter:        the struct radix_tree_iter pointer
 * @start:        iteration starting index
 * @tag:        tag index
 *
 * @slot points to radix tree slot, @iter->index contains its index.
 *
 * Same loop shape as radix_tree_for_each_slot, except that both
 * radix_tree_next_chunk() and radix_tree_next_slot() are passed
 * RADIX_TREE_ITER_TAGGED | @tag as flags.  @tag is pasted unparenthesized
 * into that expression and, like @slot and @iter, is expanded more than
 * once.
 */
#define radix_tree_for_each_tagged(slot, root, iter, start, tag)        \
        for (slot = radix_tree_iter_init(iter, start) ;                        \
             slot || (slot = radix_tree_next_chunk(root, iter,                \
                              RADIX_TREE_ITER_TAGGED | tag)) ;                \
             slot = radix_tree_next_slot(slot, iter,                        \
                                RADIX_TREE_ITER_TAGGED | tag))

#endif /* _LINUX_RADIX_TREE_H */




































































































































































































































  499 

















  417 
  122 












































  499 
















  497 































  498 




  496 

    3 




  498 














  145 

  147 



    2 
  145 
  145 



  146 

  146 


  147 









   25 

  120 


















   25 
   25 

   23 
    2 












  238 


  237 



































  244 
  244 






























   21 























  345 
  347 




   46 
   46 































   46 































  147 
  147 




























  494 








    2 

  145 









































































































  125 





















  146 

















  146 





  144 


  147 


  147 











  125 


   23 

  146 


  146 















  144 















































































































































































































































































  414 




  415 

   21 







  415 


  415 

















  495 




  494 






  243 
  241 















































































































































































































































































































































































































































































































































































































































































































   50 





















  146 




  108 

   51 





















  147 
















  162 




   42 
  147 






  147 








































  148 






    4 





































   41 








   41 


   41 


   41 





   41 


  149 






















  149 



   41 















  175 














  152 














  152 




  152 




  152 










  152 



   51 


  149 













  149 


  149 










    8 




    6 






    2 


















   27 
   27 




    4 






   24 







   24 














   27 


















  179 





  157 




   36 








  179 















   41 


   33 
   32 


   41 












   41 





















   41 



   41 
    3 


   41 




























































































































































































































  243 

  243 



























   23 




   23 



























































    5 
    5 
    5 










  442 
  443 






















  537 



  538 








  442 




















  537 




  537 













  162 









  162 


  162 
    6 





  162 


  162 
  162 











































































































    3 
    3 























   36 
   34 

    2 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
// SPDX-License-Identifier: GPL-2.0-only
/*
 * (C) 1997 Linus Torvalds
 * (C) 1999 Andrea Arcangeli <andrea@suse.de> (dynamic inode allocation)
 */
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hash.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/cdev.h>
#include <linux/memblock.h>
#include <linux/fsnotify.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <trace/events/writeback.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timestamp.h>

#include "internal.h"

/*
 * Inode locking rules:
 *
 * inode->i_lock protects:
 *   inode->i_state, inode->i_hash, __iget(), inode->i_io_list
 * Inode LRU list locks protect:
 *   inode->i_sb->s_inode_lru, inode->i_lru
 * inode->i_sb->s_inode_list_lock protects:
 *   inode->i_sb->s_inodes, inode->i_sb_list
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
 *
 * Lock ordering:
 *
 * inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *     Inode LRU list locks
 *
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
 *   inode->i_sb->s_inode_list_lock
 *   inode->i_lock
 *
 * iunique_lock
 *   inode_hash_lock
 */

/* Mask applied by hash() to turn a mixed hash value into a bucket index. */
static unsigned int i_hash_mask __ro_after_init;
/* Right-shift distance used by hash() when folding the hash value. */
static unsigned int i_hash_shift __ro_after_init;
/* Base of the global inode hash table; buckets are indexed by hash(). */
static struct hlist_head *inode_hashtable __ro_after_init;
/* Protects inode_hashtable and inode->i_hash (see "Inode locking rules"). */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);

/*
 * Empty address_space operations: every method pointer is NULL.
 * Can be used for the cases where the user does not define any of the
 * address_space operations; inode_init_always_gfp() installs it as the
 * default ->a_ops of a freshly initialised inode's mapping.
 */
const struct address_space_operations empty_aops = {
};
EXPORT_SYMBOL(empty_aops);

/*
 * Per-CPU inode count: incremented in inode_init_always_gfp() and
 * decremented in __destroy_inode(). Summed by get_nr_inodes().
 */
static DEFINE_PER_CPU(unsigned long, nr_inodes);
/*
 * Per-CPU count of inodes sitting on a superblock inode LRU: adjusted
 * whenever an inode is added to or removed from the LRU. Summed by
 * get_nr_inodes_unused().
 */
static DEFINE_PER_CPU(unsigned long, nr_unused);

/*
 * Slab cache used by alloc_inode() when the filesystem supplies no
 * ->alloc_inode, and by free_inode_nonrcu() to release such inodes.
 */
static struct kmem_cache *inode_cachep __ro_after_init;

static long get_nr_inodes(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_inodes, i);
        return sum < 0 ? 0 : sum;
}

static inline long get_nr_inodes_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_unused, i);
        return sum < 0 ? 0 : sum;
}

/*
 * Estimate the number of dirty inodes as "all inodes minus unused inodes".
 * Never returns a negative value.
 */
long get_nr_dirty_inodes(void)
{
        /* not actually dirty inodes, but a wild approximation */
        long in_use = get_nr_inodes() - get_nr_inodes_unused();

        if (in_use > 0)
                return in_use;
        return 0;
}

#ifdef CONFIG_DEBUG_FS
/*
 * Per-CPU multigrain timestamp statistics. They are summed by the
 * get_mg_*() helpers below and reported by mgts_show(); increments are
 * expected to go through mgtime_counter_inc().
 */
static DEFINE_PER_CPU(long, mg_ctime_updates);
static DEFINE_PER_CPU(long, mg_fine_stamps);
static DEFINE_PER_CPU(long, mg_ctime_swaps);

static unsigned long get_mg_ctime_updates(void)
{
        unsigned long sum = 0;
        int i;

        for_each_possible_cpu(i)
                sum += data_race(per_cpu(mg_ctime_updates, i));
        return sum;
}

static unsigned long get_mg_fine_stamps(void)
{
        unsigned long sum = 0;
        int i;

        for_each_possible_cpu(i)
                sum += data_race(per_cpu(mg_fine_stamps, i));
        return sum;
}

static unsigned long get_mg_ctime_swaps(void)
{
        unsigned long sum = 0;
        int i;

        for_each_possible_cpu(i)
                sum += data_race(per_cpu(mg_ctime_swaps, i));
        return sum;
}

/* CONFIG_DEBUG_FS build: bump the named per-CPU statistics counter. */
#define mgtime_counter_inc(__var)        this_cpu_inc(__var)

/*
 * seq_file show callback for the multigrain timestamp statistics.
 *
 * Emits one line with four unsigned decimal values, in this order:
 * ctime updates, ctime swaps, fine-grained stamps, floor swaps.
 * Always returns 0.
 */
static int mgts_show(struct seq_file *s, void *p)
{
        unsigned long updates, swaps, fine, floor;

        updates = get_mg_ctime_updates();
        swaps = get_mg_ctime_swaps();
        fine = get_mg_fine_stamps();
        floor = timekeeping_get_mg_floor_swaps();

        seq_printf(s, "%lu %lu %lu %lu\n", updates, swaps, fine, floor);
        return 0;
}

DEFINE_SHOW_ATTRIBUTE(mgts);

/*
 * Create the read-only "multigrain_timestamps" debugfs file (NULL parent,
 * no private data) backed by mgts_fops. Runs as a late initcall and
 * always returns 0.
 */
static int __init mg_debugfs_init(void)
{
        const umode_t mode = S_IFREG | S_IRUGO;

        debugfs_create_file("multigrain_timestamps", mode, NULL, NULL,
                            &mgts_fops);
        return 0;
}
late_initcall(mg_debugfs_init);

#else /* ! CONFIG_DEBUG_FS */

/* Without CONFIG_DEBUG_FS the statistics counters do not exist: no-op. */
#define mgtime_counter_inc(__var)        do { } while (0)

#endif /* CONFIG_DEBUG_FS */

/*
 * Handle nr_inode sysctl
 */
#ifdef CONFIG_SYSCTL
/*
 * Statistics gathering: backing store for the "inode-nr" and "inode-state"
 * sysctls. The nr_inodes and nr_unused members are refreshed by
 * proc_nr_inodes() on every access.
 */
static struct inodes_stat_t inodes_stat;

/*
 * sysctl handler shared by "inode-nr" and "inode-state".
 *
 * Refreshes the inode totals in inodes_stat from the per-CPU counters and
 * then hands the request to proc_doulongvec_minmax(), whose result is
 * returned unchanged.
 */
static int proc_nr_inodes(const struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        struct inodes_stat_t *stat = &inodes_stat;
        int ret;

        stat->nr_inodes = get_nr_inodes();
        stat->nr_unused = get_nr_inodes_unused();

        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
        return ret;
}

/*
 * Read-only (0444) sysctl entries, registered under "fs" by
 * init_fs_inode_sysctls(). Both entries point at inodes_stat and use
 * proc_nr_inodes() as their handler; they differ only in how many longs
 * of the structure they expose.
 */
static const struct ctl_table inodes_sysctls[] = {
        {
                /* First two longs of inodes_stat. */
                .procname        = "inode-nr",
                .data                = &inodes_stat,
                .maxlen                = 2*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_inodes,
        },
        {
                /* Seven longs starting at the beginning of inodes_stat. */
                .procname        = "inode-state",
                .data                = &inodes_stat,
                .maxlen                = 7*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_inodes,
        },
};

/*
 * Register inodes_sysctls under the "fs" sysctl directory.
 * Runs as an early initcall and always returns 0.
 */
static int __init init_fs_inode_sysctls(void)
{
        register_sysctl_init("fs", inodes_sysctls);
        return 0;
}
early_initcall(init_fs_inode_sysctls);
#endif

/*
 * ->open method of the default file_operations that
 * inode_init_always_gfp() installs on every inode (no_open_fops):
 * unconditionally fails with -ENXIO.
 */
static int no_open(struct inode *inode, struct file *file)
{
        return -ENXIO;
}

/**
 * inode_init_always_gfp - perform inode structure initialisation
 * @sb: superblock inode belongs to
 * @inode: inode to initialise
 * @gfp: allocation flags
 *
 * These are initializations that need to be done on every inode
 * allocation as the fields are not initialised by slab allocation.
 * If there are additional allocations required @gfp is used.
 *
 * On success the inode holds one reference (i_count == 1), has a link
 * count of 1, uses the empty inode operations and the always-failing
 * no_open file operations, and the per-CPU nr_inodes counter has been
 * incremented.
 *
 * Return: 0 on success, -ENOMEM if security_inode_alloc() fails (in which
 * case nr_inodes is left untouched).
 */
int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp)
{
        static const struct inode_operations empty_iops;
        static const struct file_operations no_open_fops = {.open = no_open};
        struct address_space *const mapping = &inode->i_data;

        inode->i_sb = sb;
        inode->i_blkbits = sb->s_blocksize_bits;
        inode->i_flags = 0;
        inode->i_state = 0;
        atomic64_set(&inode->i_sequence, 0);
        atomic_set(&inode->i_count, 1);
        inode->i_op = &empty_iops;
        inode->i_fop = &no_open_fops;
        inode->i_ino = 0;
        inode->__i_nlink = 1;
        /* Operation flags are derived from superblock / fs-type properties. */
        inode->i_opflags = 0;
        if (sb->s_xattr)
                inode->i_opflags |= IOP_XATTR;
        if (sb->s_type->fs_flags & FS_MGTIME)
                inode->i_opflags |= IOP_MGTIME;
        i_uid_write(inode, 0);
        i_gid_write(inode, 0);
        atomic_set(&inode->i_writecount, 0);
        inode->i_size = 0;
        inode->i_write_hint = WRITE_LIFE_NOT_SET;
        inode->i_blocks = 0;
        inode->i_bytes = 0;
        inode->i_generation = 0;
        inode->i_pipe = NULL;
        inode->i_cdev = NULL;
        inode->i_link = NULL;
        inode->i_dir_seq = 0;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;

#ifdef CONFIG_CGROUP_WRITEBACK
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
#endif

        /* Locks get per-filesystem-type lockdep classes. */
        spin_lock_init(&inode->i_lock);
        lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);

        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);

        atomic_set(&inode->i_dio_count, 0);

        /* Reset the embedded address_space (inode->i_data). */
        mapping->a_ops = &empty_aops;
        mapping->host = inode;
        mapping->flags = 0;
        mapping->wb_err = 0;
        atomic_set(&mapping->i_mmap_writable, 0);
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        atomic_set(&mapping->nr_thps, 0);
#endif
        mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
        mapping->i_private_data = NULL;
        mapping->writeback_index = 0;
        init_rwsem(&mapping->invalidate_lock);
        lockdep_set_class_and_name(&mapping->invalidate_lock,
                                   &sb->s_type->invalidate_lock_key,
                                   "mapping.invalidate_lock");
        if (sb->s_iflags & SB_I_STABLE_WRITES)
                mapping_set_stable_writes(mapping);
        inode->i_private = NULL;
        /* By default the inode's mapping is its own embedded i_data. */
        inode->i_mapping = mapping;
        INIT_HLIST_HEAD(&inode->i_dentry);        /* buggered by rcu freeing */
#ifdef CONFIG_FS_POSIX_ACL
        inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
#endif

#ifdef CONFIG_FSNOTIFY
        inode->i_fsnotify_mask = 0;
#endif
        inode->i_flctx = NULL;

        /* The only step that can fail; any non-zero result becomes -ENOMEM. */
        if (unlikely(security_inode_alloc(inode, gfp)))
                return -ENOMEM;

        this_cpu_inc(nr_inodes);

        return 0;
}
EXPORT_SYMBOL(inode_init_always_gfp);

/*
 * Return @inode's memory to inode_cachep right away. Also used by
 * i_callback() for inodes that have no ->free_inode method.
 */
void free_inode_nonrcu(struct inode *inode)
{
        kmem_cache_free(inode_cachep, inode);
}
EXPORT_SYMBOL(free_inode_nonrcu);

/*
 * Final release of an inode, given its embedded i_rcu head.
 *
 * Uses the ->free_inode method stashed in the inode when there is one,
 * otherwise hands the memory back with free_inode_nonrcu(). Queued via
 * call_rcu() by destroy_inode() and called directly by alloc_inode() on
 * its failure path.
 */
static void i_callback(struct rcu_head *head)
{
        struct inode *inode = container_of(head, struct inode, i_rcu);

        if (!inode->free_inode) {
                free_inode_nonrcu(inode);
                return;
        }

        inode->free_inode(inode);
}

/**
 *        alloc_inode         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for the given superblock, using the
 *        filesystem's ->alloc_inode method when it has one and the generic
 *        inode cache otherwise, then runs inode_init_always() on it.
 *
 *        The inode is not chained on the superblock's s_inodes list.
 *        This means:
 *        - the fs can't be unmounted
 *        - quotas, fsnotify, writeback can't work
 *
 *        Returns the new inode, or NULL if allocation or initialisation
 *        failed (in which case the inode has already been released).
 */
struct inode *alloc_inode(struct super_block *sb)
{
        const struct super_operations *ops = sb->s_op;
        struct inode *inode;

        if (!ops->alloc_inode)
                inode = alloc_inode_sb(sb, inode_cachep, GFP_KERNEL);
        else
                inode = ops->alloc_inode(sb);

        if (!inode)
                return NULL;

        if (likely(!inode_init_always(sb, inode)))
                return inode;

        /*
         * Initialisation failed: release the inode the same way
         * destroy_inode() would, but without waiting for an RCU grace
         * period.
         */
        if (ops->destroy_inode) {
                ops->destroy_inode(inode);
                /* Without ->free_inode, ->destroy_inode did all the work. */
                if (!ops->free_inode)
                        return NULL;
        }
        inode->free_inode = ops->free_inode;
        i_callback(&inode->i_rcu);

        return NULL;
}

/*
 * Tear down the generic state attached to @inode without freeing the
 * inode itself: detach it from writeback, release its security, fsnotify
 * and file-lock context state, drop cached POSIX ACLs, and decrement the
 * per-CPU nr_inodes counter.
 *
 * The inode must not have buffers attached (BUG otherwise).
 */
void __destroy_inode(struct inode *inode)
{
        BUG_ON(inode_has_buffers(inode));
        inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode);
        /*
         * An inode with zero links was counted in s_remove_count when its
         * link count dropped (see drop_nlink()/clear_nlink()); undo that.
         */
        if (!inode->i_nlink) {
                WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

#ifdef CONFIG_FS_POSIX_ACL
        /* Only real, cached ACL objects hold a reference to drop. */
        if (inode->i_acl && !is_uncached_acl(inode->i_acl))
                posix_acl_release(inode->i_acl);
        if (inode->i_default_acl && !is_uncached_acl(inode->i_default_acl))
                posix_acl_release(inode->i_default_acl);
#endif
        this_cpu_dec(nr_inodes);
}
EXPORT_SYMBOL(__destroy_inode);

/*
 * Destroy an inode that is no longer on any LRU list.
 *
 * Runs the generic teardown, then the filesystem's ->destroy_inode if it
 * has one. Unless the filesystem has ->destroy_inode but no ->free_inode,
 * the final free is queued through call_rcu() using i_callback().
 */
static void destroy_inode(struct inode *inode)
{
        /* Fetch the ops up front; they are still needed after teardown. */
        const struct super_operations *sops = inode->i_sb->s_op;

        BUG_ON(!list_empty(&inode->i_lru));
        __destroy_inode(inode);

        if (sops->destroy_inode) {
                sops->destroy_inode(inode);
                /* Without ->free_inode, ->destroy_inode did all the work. */
                if (!sops->free_inode)
                        return;
        }

        inode->free_inode = sops->free_inode;
        call_rcu(&inode->i_rcu, i_callback);
}

/**
 * drop_nlink - directly drop an inode's link count
 * @inode: inode
 *
 * Low-level filesystem helper that replaces direct manipulation of
 * i_nlink by filesystems. Where writes to the filesystem are being
 * tracked, a decrement to zero signals an imminent write: the file is
 * about to be truncated and actually unlinked on the filesystem. That
 * transition is therefore counted in the superblock's s_remove_count.
 *
 * Warns if the link count is already zero.
 */
void drop_nlink(struct inode *inode)
{
        WARN_ON(!inode->i_nlink);

        --inode->__i_nlink;
        if (inode->i_nlink)
                return;

        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(drop_nlink);

/**
 * clear_nlink - directly zero an inode's link count
 * @inode: inode
 *
 * Low-level filesystem helper that replaces direct manipulation of
 * i_nlink by filesystems. See drop_nlink() for why the link count
 * reaching zero matters. Does nothing if the count is already zero.
 */
void clear_nlink(struct inode *inode)
{
        if (!inode->i_nlink)
                return;

        inode->__i_nlink = 0;
        atomic_long_inc(&inode->i_sb->s_remove_count);
}
EXPORT_SYMBOL(clear_nlink);

/**
 * set_nlink - directly set an inode's link count
 * @inode: inode
 * @nlink: new nlink (should be non-zero)
 *
 * Low-level filesystem helper that replaces direct manipulation of
 * i_nlink by filesystems. A zero @nlink is forwarded to clear_nlink().
 */
void set_nlink(struct inode *inode, unsigned int nlink)
{
        if (nlink == 0) {
                clear_nlink(inode);
                return;
        }

        /* Yes, some filesystems do change nlink from zero to one */
        if (!inode->i_nlink)
                atomic_long_dec(&inode->i_sb->s_remove_count);

        inode->__i_nlink = nlink;
}
EXPORT_SYMBOL(set_nlink);

/**
 * inc_nlink - directly increment an inode's link count
 * @inode: inode
 *
 * Low-level filesystem helper that replaces direct manipulation of
 * i_nlink by filesystems. Currently, it is only here for parity with
 * drop_nlink().
 *
 * Raising the count from zero removes the inode from the superblock's
 * s_remove_count accounting, and warns unless the inode is I_LINKABLE.
 */
void inc_nlink(struct inode *inode)
{
        if (unlikely(!inode->i_nlink)) {
                WARN_ON((inode->i_state & I_LINKABLE) == 0);
                atomic_long_dec(&inode->i_sb->s_remove_count);
        }

        inode->__i_nlink += 1;
}
EXPORT_SYMBOL(inc_nlink);

/*
 * Initialise the parts of an address_space that are set up only once:
 * the i_pages xarray, the i_mmap rwsem and (empty) tree, and the private
 * list with its lock. Does not zero the structure; callers do that first.
 */
static void __address_space_init_once(struct address_space *mapping)
{
        xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
        init_rwsem(&mapping->i_mmap_rwsem);
        INIT_LIST_HEAD(&mapping->i_private_list);
        spin_lock_init(&mapping->i_private_lock);
        mapping->i_mmap = RB_ROOT_CACHED;
}

/*
 * Zero an entire address_space and then perform its once-only
 * initialisation (see __address_space_init_once()).
 */
void address_space_init_once(struct address_space *mapping)
{
        memset(mapping, 0, sizeof(*mapping));
        __address_space_init_once(mapping);
}
EXPORT_SYMBOL(address_space_init_once);

/*
 * These are initializations that only need to be done once, because the
 * fields are left in their initialised state across uses of the inode;
 * this lets the slab allocator take advantage of that.
 *
 * Zeroes the whole inode, then initialises its hash node, its list heads
 * and the embedded address_space (i_data).
 */
void inode_init_once(struct inode *inode)
{
        memset(inode, 0, sizeof(*inode));
        INIT_HLIST_NODE(&inode->i_hash);
        INIT_LIST_HEAD(&inode->i_devices);
        INIT_LIST_HEAD(&inode->i_io_list);
        INIT_LIST_HEAD(&inode->i_wb_list);
        INIT_LIST_HEAD(&inode->i_lru);
        INIT_LIST_HEAD(&inode->i_sb_list);
        /* The memset above already zeroed i_data, so skip the zeroing wrapper. */
        __address_space_init_once(&inode->i_data);
        i_size_ordered_init(inode);
}
EXPORT_SYMBOL(inode_init_once);

/*
 * void-pointer adapter around inode_init_once().
 * NOTE(review): presumably the constructor passed when inode_cachep is
 * created -- confirm at the kmem_cache creation site.
 */
static void init_once(void *foo)
{
        inode_init_once(foo);
}

/*
 * Take an additional reference on an inode. The caller must already hold
 * one: a resulting count below 2 triggers a warning.
 */
void ihold(struct inode *inode)
{
        int refs = atomic_inc_return(&inode->i_count);

        WARN_ON(refs < 2);
}
EXPORT_SYMBOL(ihold);

/*
 * Put @inode on its superblock's inode LRU if it is eligible.
 *
 * An inode is skipped when it is dirty, under sync, being (or about to be)
 * freed, still referenced, belongs to an inactive superblock, or has a
 * mapping that is not shrinkable.
 *
 * If the inode could not be added (list_lru_add_obj() returned false) and
 * @rotate is true, it is marked I_REFERENCED instead.
 */
static void __inode_add_lru(struct inode *inode, bool rotate)
{
        if ((inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) ||
            atomic_read(&inode->i_count) ||
            !(inode->i_sb->s_flags & SB_ACTIVE) ||
            !mapping_shrinkable(&inode->i_data))
                return;

        if (!list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru)) {
                if (rotate)
                        inode->i_state |= I_REFERENCED;
                return;
        }

        this_cpu_inc(nr_unused);
}

/*
 * Prepare to wait on an inode state bit.
 *
 * Looks up the wait address for @bit of @inode, initialises @wqe as a
 * wait entry for that address, and returns the wait queue head associated
 * with it. The caller does the actual queueing and sleeping (see
 * inode_wait_for_lru_isolating() for an example).
 */
struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
                                            struct inode *inode, u32 bit)
{
        void *bit_address;

        bit_address = inode_state_wait_address(inode, bit);
        init_wait_var_entry(wqe, bit_address, 0);
        return __var_waitqueue(bit_address);
}
EXPORT_SYMBOL(inode_bit_waitqueue);

/*
 * Add inode to LRU if needed (inode is unused and clean).
 *
 * Needs inode->i_lock held.
 *
 * This is the non-rotating variant: an inode that cannot be added is not
 * marked I_REFERENCED.
 */
void inode_add_lru(struct inode *inode)
{
        __inode_add_lru(inode, false);
}

/*
 * Take @inode off its superblock's inode LRU. The nr_unused counter is
 * decremented only if the inode was actually removed.
 */
static void inode_lru_list_del(struct inode *inode)
{
        if (!list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
                return;

        this_cpu_dec(nr_unused);
}

/*
 * Mark @inode as being isolated from the LRU by setting I_LRU_ISOLATING.
 *
 * Caller must hold inode->i_lock. Warns if the flag is already set or the
 * inode is being (or about to be) freed. Paired with
 * inode_unpin_lru_isolating().
 */
static void inode_pin_lru_isolating(struct inode *inode)
{
        lockdep_assert_held(&inode->i_lock);
        WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE));
        inode->i_state |= I_LRU_ISOLATING;
}

/*
 * Clear I_LRU_ISOLATING on @inode and wake anyone waiting for it in
 * inode_wait_for_lru_isolating().
 *
 * Takes and releases inode->i_lock itself, so the caller must not hold
 * it. Warns if the flag was not set.
 */
static void inode_unpin_lru_isolating(struct inode *inode)
{
        spin_lock(&inode->i_lock);
        WARN_ON(!(inode->i_state & I_LRU_ISOLATING));
        inode->i_state &= ~I_LRU_ISOLATING;
        /* Called with inode->i_lock which ensures memory ordering. */
        inode_wake_up_bit(inode, __I_LRU_ISOLATING);
        spin_unlock(&inode->i_lock);
}

/*
 * Sleep (uninterruptibly) until I_LRU_ISOLATING is clear on @inode.
 *
 * Caller must hold inode->i_lock. The lock is dropped around each
 * schedule() and re-taken afterwards, so it is held again on return.
 * Returns immediately if the flag is not set.
 */
static void inode_wait_for_lru_isolating(struct inode *inode)
{
        struct wait_bit_queue_entry wqe;
        struct wait_queue_head *wq_head;

        lockdep_assert_held(&inode->i_lock);
        if (!(inode->i_state & I_LRU_ISOLATING))
                return;

        wq_head = inode_bit_waitqueue(&wqe, inode, __I_LRU_ISOLATING);
        for (;;) {
                /* Queue ourselves before re-checking so a wake-up is not lost. */
                prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
                /*
                 * Checking I_LRU_ISOLATING with inode->i_lock guarantees
                 * memory ordering.
                 */
                if (!(inode->i_state & I_LRU_ISOLATING))
                        break;
                spin_unlock(&inode->i_lock);
                schedule();
                spin_lock(&inode->i_lock);
        }
        finish_wait(wq_head, &wqe.wq_entry);
        WARN_ON(inode->i_state & I_LRU_ISOLATING);
}

/**
 * inode_sb_list_add - add inode to the superblock list of inodes
 * @inode: inode to add
 */
void inode_sb_list_add(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        spin_lock(&sb->s_inode_list_lock);
        list_add(&inode->i_sb_list, &sb->s_inodes);
        spin_unlock(&sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);

/*
 * Unlink @inode from its superblock's s_inodes list, if it is on it.
 * The emptiness check is made without the list lock; the removal itself
 * happens under s_inode_list_lock.
 */
static inline void inode_sb_list_del(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (list_empty(&inode->i_sb_list))
                return;

        spin_lock(&sb->s_inode_list_lock);
        list_del_init(&inode->i_sb_list);
        spin_unlock(&sb->s_inode_list_lock);
}

/*
 * Map a (superblock, hash value) pair to a bucket index in
 * inode_hashtable. The superblock's address is mixed in so that equal
 * hash values on different superblocks can land in different buckets.
 */
static unsigned long hash(struct super_block *sb, unsigned long hashval)
{
        unsigned long mixed;

        /*
         * '/' binds tighter than '^': only the (GOLDEN_RATIO_PRIME +
         * hashval) term is divided by L1_CACHE_BYTES.
         */
        mixed = (hashval * (unsigned long)sb) ^
                ((GOLDEN_RATIO_PRIME + hashval) / L1_CACHE_BYTES);
        mixed ^= (mixed ^ GOLDEN_RATIO_PRIME) >> i_hash_shift;

        return mixed & i_hash_mask;
}

/**
 *        __insert_inode_hash - hash an inode
 *        @inode: unhashed inode
 *        @hashval: unsigned long value used to locate this object in the
 *                inode_hashtable.
 *
 *        Add an inode to the inode hash for this superblock. The bucket is
 *        chosen from the inode's superblock and @hashval; the insertion is
 *        done under inode_hash_lock and inode->i_lock, taken in that order.
 */
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
{
        struct hlist_head *bucket;

        bucket = inode_hashtable + hash(inode->i_sb, hashval);

        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_add_head_rcu(&inode->i_hash, bucket);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__insert_inode_hash);

/**
 *        __remove_inode_hash - remove an inode from the hash
 *        @inode: inode to unhash
 *
 *        Remove an inode from the inode hash table and reinitialise its
 *        hash node. Done under inode_hash_lock and inode->i_lock, taken in
 *        that order.
 */
void __remove_inode_hash(struct inode *inode)
{
        spin_lock(&inode_hash_lock);
        spin_lock(&inode->i_lock);
        hlist_del_init_rcu(&inode->i_hash);
        spin_unlock(&inode->i_lock);
        spin_unlock(&inode_hash_lock);
}
EXPORT_SYMBOL(__remove_inode_hash);

/*
 * Print diagnostic information about @mapping with pr_warn().
 *
 * @mapping and everything reached through it may be invalid, so every
 * dereference goes through a *_nofault accessor. The function reports as
 * much as it could read (aops, inode number, name of the first dentry on
 * the host inode's alias list) and stops at the first thing it cannot
 * read.
 */
void dump_mapping(const struct address_space *mapping)
{
        struct inode *host;
        const struct address_space_operations *a_ops;
        struct hlist_node *dentry_first;
        struct dentry *dentry_ptr;
        struct dentry dentry;
        char fname[64] = {};
        unsigned long ino;

        /*
         * If mapping is an invalid pointer, we don't want to crash
         * accessing it, so probe everything depending on it carefully.
         */
        if (get_kernel_nofault(host, &mapping->host) ||
            get_kernel_nofault(a_ops, &mapping->a_ops)) {
                pr_warn("invalid mapping:%px\n", mapping);
                return;
        }

        /* No host inode: the aops are all there is to report. */
        if (!host) {
                pr_warn("aops:%ps\n", a_ops);
                return;
        }

        if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
            get_kernel_nofault(ino, &host->i_ino)) {
                pr_warn("aops:%ps invalid inode:%px\n", a_ops, host);
                return;
        }

        /* Inode has no dentry on its alias list: no name to print. */
        if (!dentry_first) {
                pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
                return;
        }

        /* Work on a local copy of the dentry rather than the live object. */
        dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
        if (get_kernel_nofault(dentry, dentry_ptr) ||
            !dentry.d_parent || !dentry.d_name.name) {
                pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
                                a_ops, ino, dentry_ptr);
                return;
        }

        /* Copy at most 63 bytes; fname is zero-initialised, so it stays terminated. */
        if (strncpy_from_kernel_nofault(fname, dentry.d_name.name, 63) < 0)
                strscpy(fname, "<invalid>");
        /*
         * Even if strncpy_from_kernel_nofault() succeeded,
         * the fname could be unreliable
         */
        pr_warn("aops:%ps ino:%lx dentry name(?):\"%s\"\n",
                a_ops, ino, fname);
}

/*
 * Final sanity checks on an inode that is being freed, after which its
 * state is set to exactly I_FREEING | I_CLEAR.
 *
 * BUGs if the mapping still has pages, if the mapping's private list or
 * the inode's i_wb_list is not empty, if I_FREEING is not set, or if
 * I_CLEAR is already set. evict() calls this when the filesystem has no
 * ->evict_inode method.
 */
void clear_inode(struct inode *inode)
{
        /*
         * We have to cycle the i_pages lock here because reclaim can be in the
         * process of removing the last page (in __filemap_remove_folio())
         * and we must not free the mapping under it.
         */
        xa_lock_irq(&inode->i_data.i_pages);
        BUG_ON(inode->i_data.nrpages);
        /*
         * Almost always, mapping_empty(&inode->i_data) here; but there are
         * two known and long-standing ways in which nodes may get left behind
         * (when deep radix-tree node allocation failed partway; or when THP
         * collapse_file() failed). Until those two known cases are cleaned up,
         * or a cleanup function is called here, do not BUG_ON(!mapping_empty),
         * nor even WARN_ON(!mapping_empty).
         */
        xa_unlock_irq(&inode->i_data.i_pages);
        BUG_ON(!list_empty(&inode->i_data.i_private_list));
        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(inode->i_state & I_CLEAR);
        BUG_ON(!list_empty(&inode->i_wb_list));
        /* don't need i_lock here, no concurrent mods to i_state */
        inode->i_state = I_FREEING | I_CLEAR;
}
EXPORT_SYMBOL(clear_inode);

/*
 * Free the inode passed in, removing it from the lists it is still connected
 * to. We remove any pages still attached to the inode and wait for any IO that
 * is still in progress before finally destroying the inode.
 *
 * An inode must already be marked I_FREEING so that we avoid the inode being
 * moved back onto lists if we race with other code that manipulates the lists
 * (e.g. writeback_single_inode). The caller is responsible for setting this.
 *
 * An inode must already be removed from the LRU list before being evicted from
 * the cache. This should occur atomically with setting the I_FREEING state
 * flag, so no inodes here should ever be on the LRU when being evicted.
 *
 * Both preconditions are enforced with BUG_ON(). By the time the inode is
 * handed to destroy_inode() its state must be exactly I_FREEING | I_CLEAR,
 * which is what clear_inode() leaves behind.
 */
static void evict(struct inode *inode)
{
        const struct super_operations *op = inode->i_sb->s_op;

        BUG_ON(!(inode->i_state & I_FREEING));
        BUG_ON(!list_empty(&inode->i_lru));

        /* Detach from the writeback and superblock lists. */
        if (!list_empty(&inode->i_io_list))
                inode_io_list_del(inode);

        inode_sb_list_del(inode);

        spin_lock(&inode->i_lock);
        /* Let a concurrent LRU isolation (inode_lru_isolate()) finish first. */
        inode_wait_for_lru_isolating(inode);

        /*
         * Wait for flusher thread to be done with the inode so that filesystem
         * does not start destroying it while writeback is still running. Since
         * the inode has I_FREEING set, flusher thread won't start new work on
         * the inode.  We just have to wait for running writeback to finish.
         */
        inode_wait_for_writeback(inode);
        spin_unlock(&inode->i_lock);

        /*
         * Filesystem-specific eviction if provided; otherwise the generic
         * path: drop the page cache and mark the inode clear.
         */
        if (op->evict_inode) {
                op->evict_inode(inode);
        } else {
                truncate_inode_pages_final(&inode->i_data);
                clear_inode(inode);
        }
        if (S_ISCHR(inode->i_mode) && inode->i_cdev)
                cd_forget(inode);

        remove_inode_hash(inode);

        /*
         * Wake up waiters in __wait_on_freeing_inode().
         *
         * It is an invariant that any thread we need to wake up is already
         * accounted for before remove_inode_hash() acquires ->i_lock -- both
         * sides take the lock and sleep is aborted if the inode is found
         * unhashed. Thus either the sleeper wins and goes off CPU, or removal
         * wins and the sleeper aborts after testing with the lock.
         *
         * This also means we don't need any fences for the call below.
         */
        inode_wake_up_bit(inode, __I_NEW);
        BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));

        destroy_inode(inode);
}

/*
 * dispose_list - dispose of the contents of a local list
 * @head: the head of the list to free
 *
 * The list is private to the caller and links inodes through i_lru, so no
 * locking is needed to walk it. Each inode is unlinked and evicted in
 * turn, with a rescheduling point after every eviction.
 */
static void dispose_list(struct list_head *head)
{
        struct inode *victim;

        while (!list_empty(head)) {
                victim = list_first_entry(head, struct inode, i_lru);
                list_del_init(&victim->i_lru);

                evict(victim);
                cond_resched();
        }
}

/**
 * evict_inodes        - evict all evictable inodes for a superblock
 * @sb:                superblock to operate on
 *
 * Make sure that no inodes with zero refcount are retained.  This is
 * called by superblock shutdown after having SB_ACTIVE flag removed,
 * so any inode reaching zero refcount during or after that call will
 * be immediately evicted.
 *
 * Inodes that are still referenced, or that are I_NEW, I_FREEING or
 * I_WILL_FREE, are skipped.
 */
void evict_inodes(struct super_block *sb)
{
        struct inode *inode, *next;
        LIST_HEAD(dispose);

again:
        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
                /* Cheap unlocked check first; repeated under i_lock below. */
                if (atomic_read(&inode->i_count))
                        continue;

                spin_lock(&inode->i_lock);
                if (atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * Claim the inode for freeing and take it off the LRU under
                 * i_lock; i_lru is then reused to link it on the local
                 * dispose list.
                 */
                inode->i_state |= I_FREEING;
                inode_lru_list_del(inode);
                spin_unlock(&inode->i_lock);
                list_add(&inode->i_lru, &dispose);

                /*
                 * We can have a ton of inodes to evict at unmount time given
                 * enough memory, check to see if we need to go to sleep for a
                 * bit so we don't livelock.
                 */
                if (need_resched()) {
                        spin_unlock(&sb->s_inode_list_lock);
                        cond_resched();
                        dispose_list(&dispose);
                        /* The list lock was dropped: restart the walk from the top. */
                        goto again;
                }
        }
        spin_unlock(&sb->s_inode_list_lock);

        dispose_list(&dispose);
}
EXPORT_SYMBOL_GPL(evict_inodes);

/*
 * Isolate the inode from the LRU in preparation for freeing it.
 *
 * If the inode has the I_REFERENCED flag set, then it means that it has been
 * used recently - the flag is set in iput_final(). When we encounter such an
 * inode, clear the flag and move it to the back of the LRU so it gets another
 * pass through the LRU before it gets reclaimed. This is necessary because of
 * the fact we are doing lazy LRU updates to minimise lock contention so the
 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
 * with this flag set because they are the inodes that are out of order.
 *
 * @item is the inode's i_lru entry, @lru the list it is on, and @arg the
 * list head on which freeable inodes are collected.
 *
 * Returns:
 *   LRU_SKIP    - inode->i_lock could not be taken; inode left alone
 *   LRU_REMOVED - inode taken off the LRU, either because it is not
 *                 reclaimable right now or because it was marked I_FREEING
 *                 and moved to the freeable list
 *   LRU_ROTATE  - inode was recently referenced; I_REFERENCED cleared
 *   LRU_RETRY   - buffers/page cache were dropped with the LRU lock released
 */
static enum lru_status inode_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, void *arg)
{
        struct list_head *freeable = arg;
        struct inode        *inode = container_of(item, struct inode, i_lru);

        /*
         * We are inverting the lru lock/inode->i_lock here, so use a
         * trylock. If we fail to get the lock, just skip it.
         */
        if (!spin_trylock(&inode->i_lock))
                return LRU_SKIP;

        /*
         * Inodes can get referenced, redirtied, or repopulated while
         * they're already on the LRU, and this can make them
         * unreclaimable for a while. Remove them lazily here; iput,
         * sync, or the last page cache deletion will requeue them.
         */
        if (atomic_read(&inode->i_count) ||
            (inode->i_state & ~I_REFERENCED) ||
            !mapping_shrinkable(&inode->i_data)) {
                list_lru_isolate(lru, &inode->i_lru);
                spin_unlock(&inode->i_lock);
                this_cpu_dec(nr_unused);
                return LRU_REMOVED;
        }

        /* Recently referenced inodes get one more pass */
        if (inode->i_state & I_REFERENCED) {
                inode->i_state &= ~I_REFERENCED;
                spin_unlock(&inode->i_lock);
                return LRU_ROTATE;
        }

        /*
         * On highmem systems, mapping_shrinkable() permits dropping
         * page cache in order to free up struct inodes: lowmem might
         * be under pressure before the cache inside the highmem zone.
         */
        if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) {
                /*
                 * Pin the inode with I_LRU_ISOLATING before dropping both
                 * locks; evict() waits for this flag to clear before it
                 * proceeds.
                 */
                inode_pin_lru_isolating(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&lru->lock);
                if (remove_inode_buffers(inode)) {
                        unsigned long reap;
                        reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
                        /* Account reclaimed pages to kswapd or to direct reclaim. */
                        if (current_is_kswapd())
                                __count_vm_events(KSWAPD_INODESTEAL, reap);
                        else
                                __count_vm_events(PGINODESTEAL, reap);
                        mm_account_reclaimed_pages(reap);
                }
                inode_unpin_lru_isolating(inode);
                return LRU_RETRY;
        }

        /* Nothing left attached: claim the inode for freeing. */
        WARN_ON(inode->i_state & I_NEW);
        inode->i_state |= I_FREEING;
        list_lru_isolate_move(lru, &inode->i_lru, freeable);
        spin_unlock(&inode->i_lock);

        this_cpu_dec(nr_unused);
        return LRU_REMOVED;
}

/*
 * Walk the superblock inode LRU for freeable inodes and attempt to free them.
 * This is called from the superblock shrinker function with a shrink_control
 * describing how much of the LRU to scan. Inodes selected by
 * inode_lru_isolate() are collected on a temporary list and disposed of by
 * dispose_list() once the walk has returned.
 *
 * Returns the value reported by list_lru_shrink_walk().
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(freeable);
        long nr_freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
                                             inode_lru_isolate, &freeable);

        dispose_list(&freeable);

        return nr_freed;
}

static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked);
/*
 * Look up an inode of @sb in hash chain @head for which @test(inode, @data)
 * returns non-zero.
 *
 * @is_inode_hash_locked must state whether the caller holds
 * inode_hash_lock; lockdep asserts that it matches. The chain is walked
 * under rcu_read_lock() and each candidate's i_lock is taken before its
 * state is examined.
 *
 * Returns:
 *   the inode, with a reference taken via __iget(), on success;
 *   ERR_PTR(-ESTALE) if the matching inode is in the I_CREATING state;
 *   NULL if no matching inode is on the chain.
 */
static struct inode *find_inode(struct super_block *sb,
                                struct hlist_head *head,
                                int (*test)(struct inode *, void *),
                                void *data, bool is_inode_hash_locked)
{
        struct inode *inode = NULL;

        if (is_inode_hash_locked)
                lockdep_assert_held(&inode_hash_lock);
        else
                lockdep_assert_not_held(&inode_hash_lock);

        rcu_read_lock();
repeat:
        hlist_for_each_entry_rcu(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                if (!test(inode, data))
                        continue;
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /*
                         * The match is on its way out: wait for it, then
                         * rescan the chain from the start.
                         * NOTE(review): __wait_on_freeing_inode() is expected
                         * to release i_lock -- confirm in its definition.
                         */
                        __wait_on_freeing_inode(inode, is_inode_hash_locked);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        rcu_read_unlock();
                        return ERR_PTR(-ESTALE);
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                rcu_read_unlock();
                return inode;
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * find_inode_fast is the fast path version of find_inode, see the comment at
 * iget_locked for details.
 */
/*
 * Same lookup as find_inode(), but matching on the inode number @ino instead
 * of calling a test callback.
 *
 * Returns the matching inode with a reference taken via __iget(), NULL if
 * nothing matched, or ERR_PTR(-ESTALE) if the match still has I_CREATING set.
 */
static struct inode *find_inode_fast(struct super_block *sb,
                                struct hlist_head *head, unsigned long ino,
                                bool is_inode_hash_locked)
{
        struct inode *inode = NULL;

        /* Let lockdep verify what the caller claims about inode_hash_lock */
        if (is_inode_hash_locked)
                lockdep_assert_held(&inode_hash_lock);
        else
                lockdep_assert_not_held(&inode_hash_lock);

        rcu_read_lock();
repeat:
        hlist_for_each_entry_rcu(inode, head, i_hash) {
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                /* i_state is only examined with i_lock held */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
                        /*
                         * The match is being freed: wait for it and rescan the
                         * chain from the start.  NOTE(review): i_lock is not
                         * released here, so __wait_on_freeing_inode()
                         * presumably drops it -- confirm at its definition.
                         */
                        __wait_on_freeing_inode(inode, is_inode_hash_locked);
                        goto repeat;
                }
                if (unlikely(inode->i_state & I_CREATING)) {
                        spin_unlock(&inode->i_lock);
                        rcu_read_unlock();
                        return ERR_PTR(-ESTALE);
                }
                /* Live match: take a reference before dropping the locks */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                rcu_read_unlock();
                return inode;
        }
        rcu_read_unlock();
        return NULL;
}

/*
 * Each cpu owns a range of LAST_INO_BATCH numbers.
 * 'shared_last_ino' is dirtied only once out of LAST_INO_BATCH allocations,
 * to renew the exhausted range.
 *
 * This does not significantly increase overflow rate because every CPU can
 * consume at most LAST_INO_BATCH-1 unused inode numbers. So there is
 * NR_CPUS*(LAST_INO_BATCH-1) wastage. At 4096 and 1024, this is ~0.1% of the
 * 2^32 range, and is a worst-case. Even a 50% wastage would only increase
 * overflow rate by 2x, which does not seem too significant.
 *
 * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
 * error if st_ino won't fit in target struct field. Use 32bit counter
 * here to attempt to avoid that.
 */
#define LAST_INO_BATCH 1024
static DEFINE_PER_CPU(unsigned int, last_ino);

unsigned int get_next_ino(void)
{
        unsigned int *slot = &get_cpu_var(last_ino);
        unsigned int ino = *slot;

#ifdef CONFIG_SMP
        /*
         * Low bits all zero means this CPU's current batch is exhausted (or
         * was never handed out): reserve the next LAST_INO_BATCH numbers
         * from the shared counter and restart from the base of that range.
         */
        if (unlikely(!(ino & (LAST_INO_BATCH - 1)))) {
                static atomic_t shared_last_ino;
                int batch_end = atomic_add_return(LAST_INO_BATCH,
                                                  &shared_last_ino);

                ino = batch_end - LAST_INO_BATCH;
        }
#endif

        /* get_next_ino should not provide a 0 inode number: step over it */
        if (unlikely(++ino == 0))
                ino++;

        *slot = ino;
        put_cpu_var(last_ino);
        return ino;
}
EXPORT_SYMBOL(get_next_ino);

/**
 *        new_inode         - obtain an inode
 *        @sb: superblock
 *
 *        Allocates a new inode for given superblock. The default gfp_mask
 *        for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE.
 *        If HIGHMEM pages are unsuitable or it is known that pages allocated
 *        for the page cache are not reclaimable or migratable,
 *        mapping_set_gfp_mask() must be called with suitable flags on the
 *        newly created inode's mapping
 *
 */
struct inode *new_inode(struct super_block *sb)
{
        struct inode *inode;

        inode = alloc_inode(sb);
        if (inode)
                inode_sb_list_add(inode);
        return inode;
}
EXPORT_SYMBOL(new_inode);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Give the i_rwsem of a directory inode the filesystem type's directory
 * lock class, unless the lock no longer carries the default class.
 */
void lockdep_annotate_inode_mutex_key(struct inode *inode)
{
        struct file_system_type *type;

        /* Only directories are re-annotated */
        if (!S_ISDIR(inode->i_mode))
                return;

        type = inode->i_sb->s_type;

        /* Set new key only if filesystem hasn't already changed it */
        if (!lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key))
                return;

        /* i_rwsem is re-initialised here, so nobody may be holding it */
        init_rwsem(&inode->i_rwsem);
        lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key);
}
EXPORT_SYMBOL(lockdep_annotate_inode_mutex_key);
#endif

/**
 * unlock_new_inode - clear the I_NEW state and wake up any waiters
 * @inode:        new inode to unlock
 *
 * Called when the inode is fully initialised to clear the new state of the
 * inode and wake up anyone waiting for the inode to finish initialisation.
 */
void unlock_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        /* Only an inode that is still marked I_NEW should get here */
        WARN_ON(!(inode->i_state & I_NEW));
        /* Drop both I_NEW and I_CREATING in one update, under i_lock */
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /*
         * Pairs with the barrier in prepare_to_wait_event() to make sure
         * ___wait_var_event() either sees the bit cleared or
         * waitqueue_active() check in wake_up_var() sees the waiter.
         */
        smp_mb();
        /* Wake anyone sleeping on the __I_NEW bit of this inode */
        inode_wake_up_bit(inode, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(unlock_new_inode);

/**
 * discard_new_inode - clear the I_NEW state, wake up waiters and put the inode
 * @inode:        new inode to discard
 *
 * Clears I_NEW (I_CREATING, unlike in unlock_new_inode(), is left untouched),
 * wakes up anyone waiting on the I_NEW bit and then drops a reference on the
 * inode with iput().
 */
void discard_new_inode(struct inode *inode)
{
        lockdep_annotate_inode_mutex_key(inode);
        spin_lock(&inode->i_lock);
        /* Only an inode that is still marked I_NEW should get here */
        WARN_ON(!(inode->i_state & I_NEW));
        inode->i_state &= ~I_NEW;
        /*
         * Pairs with the barrier in prepare_to_wait_event() to make sure
         * ___wait_var_event() either sees the bit cleared or
         * waitqueue_active() check in wake_up_var() sees the waiter.
         */
        smp_mb();
        inode_wake_up_bit(inode, __I_NEW);
        spin_unlock(&inode->i_lock);
        /* Drop a reference only after i_lock has been released */
        iput(inode);
}
EXPORT_SYMBOL(discard_new_inode);

/**
 * lock_two_nondirectories - take two i_mutexes on non-directory objects
 *
 * Lock any non-NULL argument. Passed objects must not be directories.
 * Zero, one or two objects may be locked by this function.
 *
 * @inode1: first inode to lock
 * @inode2: second inode to lock
 */
void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        struct inode *first, *second;

        if (inode1)
                WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
        if (inode2)
                WARN_ON_ONCE(S_ISDIR(inode2->i_mode));

        /*
         * Order the pair by address so that the lower pointer (NULL
         * included) always comes first, whatever order the caller used.
         */
        if (inode1 > inode2) {
                first = inode2;
                second = inode1;
        } else {
                first = inode1;
                second = inode2;
        }

        if (first)
                inode_lock(first);
        /* The same inode passed twice is locked only once */
        if (second && second != first)
                inode_lock_nested(second, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);

/**
 * unlock_two_nondirectories - release locks from lock_two_nondirectories()
 * @inode1: first inode to unlock
 * @inode2: second inode to unlock
 */
void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
{
        if (inode1) {
                WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
                inode_unlock(inode1);
        }

        /* Nothing more to do if @inode2 is absent or is the same inode */
        if (!inode2 || inode2 == inode1)
                return;

        WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
        inode_unlock(inode2);
}
EXPORT_SYMBOL(unlock_two_nondirectories);

/**
 * inode_insert5 - obtain an inode from a mounted file system
 * @inode:        pre-allocated inode to use for insert to cache
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * variant of iget5_locked() that doesn't allocate an inode.
 *
 * If the inode is not present in the cache, insert the pre-allocated inode and
 * return it locked, hashed, and with the I_NEW flag set. The file system gets
 * to fill it in before unlocking it via unlock_new_inode().
 *
 * Note that both @test and @set are called with the inode_hash_lock held, so
 * they can't sleep.
 */
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                            int (*test)(struct inode *, void *),
                            int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
        struct inode *old;

again:
        spin_lock(&inode_hash_lock);
        old = find_inode(inode->i_sb, head, test, data, true);
        if (unlikely(old)) {
                /*
                 * Uhhuh, somebody else created the same inode under us.
                 * Use the old inode instead of the preallocated one.
                 */
                spin_unlock(&inode_hash_lock);
                /* find_inode() returns an error pointer for I_CREATING */
                if (IS_ERR(old))
                        return NULL;
                wait_on_inode(old);
                /* Unhashed while we waited: drop it and search again */
                if (unlikely(inode_unhashed(old))) {
                        iput(old);
                        goto again;
                }
                return old;
        }

        /* @set is optional; a non-zero return from it aborts the insert */
        if (set && unlikely(set(inode, data))) {
                spin_unlock(&inode_hash_lock);
                return NULL;
        }

        /*
         * Return the locked inode with I_NEW set, the
         * caller is responsible for filling in the contents
         */
        spin_lock(&inode->i_lock);
        inode->i_state |= I_NEW;
        hlist_add_head_rcu(&inode->i_hash, head);
        spin_unlock(&inode->i_lock);

        spin_unlock(&inode_hash_lock);

        /*
         * Add inode to the sb list if it's not already. It has I_NEW at this
         * point, so it should be safe to test i_sb_list locklessly.
         */
        if (list_empty(&inode->i_sb_list))
                inode_sb_list_add(inode);

        return inode;
}
EXPORT_SYMBOL(inode_insert5);

/**
 * iget5_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if present return it with an increased reference count. This is a
 * generalized version of iget_locked() for file systems where the inode
 * number is not sufficient for unique identification of an inode.
 *
 * If the inode is not present in the cache, allocate and insert a new inode
 * and return it locked, hashed, and with the I_NEW flag set. The file system
 * gets to fill it in before unlocking it via unlock_new_inode().
 *
 * Note that both @test and @set are called with the inode_hash_lock held, so
 * they can't sleep.
 */
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct inode *inode = ilookup5(sb, hashval, test, data);

        if (!inode) {
                struct inode *new = alloc_inode(sb);

                if (new) {
                        inode = inode_insert5(new, hashval, test, set, data);
                        if (unlikely(inode != new))
                                destroy_inode(new);
                }
        }
        return inode;
}
EXPORT_SYMBOL(iget5_locked);

/**
 * iget5_locked_rcu - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @hashval:        hash value (usually inode number) to get
 * @test:        callback used for comparisons between inodes
 * @set:        callback used to initialize a new struct inode
 * @data:        opaque data pointer to pass to @test and @set
 *
 * This is equivalent to iget5_locked, except the @test callback must
 * tolerate the inode not being stable, including being mid-teardown.
 */
struct inode *iget5_locked_rcu(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *inode, *new;

again:
        /* Lookup without taking inode_hash_lock */
        inode = find_inode(sb, head, test, data, false);
        if (!inode) {
                /* Not cached: allocate a fresh inode and try to hash it */
                new = alloc_inode(sb);
                if (!new)
                        return NULL;

                inode = inode_insert5(new, hashval, test, set, data);
                if (unlikely(inode != new))
                        destroy_inode(new);
                return inode;
        }

        /* find_inode() hands back an error pointer for I_CREATING inodes */
        if (IS_ERR(inode))
                return NULL;

        wait_on_inode(inode);
        if (likely(!inode_unhashed(inode)))
                return inode;

        /* Unhashed while we waited: drop it and start over */
        iput(inode);
        goto again;
}
EXPORT_SYMBOL_GPL(iget5_locked_rcu);

/**
 * iget_locked - obtain an inode from a mounted file system
 * @sb:                super block of file system
 * @ino:        inode number to get
 *
 * Search for the inode specified by @ino in the inode cache and if present
 * return it with an increased reference count. This is for file systems
 * where the inode number is sufficient for unique identification of an inode.
 *
 * If the inode is not in cache, allocate a new inode and return it locked,
 * hashed, and with the I_NEW flag set.  The file system gets to fill it in
 * before unlocking it via unlock_new_inode().
 */
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;
again:
        /* First attempt: look up without holding inode_hash_lock */
        inode = find_inode_fast(sb, head, ino, false);
        if (inode) {
                /* find_inode_fast() returns an error pointer for I_CREATING */
                if (IS_ERR(inode))
                        return NULL;
                wait_on_inode(inode);
                /* Unhashed while we waited: drop it and search again */
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
                return inode;
        }

        inode = alloc_inode(sb);
        if (inode) {
                struct inode *old;

                spin_lock(&inode_hash_lock);
                /* We released the lock, so.. */
                old = find_inode_fast(sb, head, ino, true);
                if (!old) {
                        /* Still absent: hash the new inode with I_NEW set */
                        inode->i_ino = ino;
                        spin_lock(&inode->i_lock);
                        inode->i_state = I_NEW;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        inode_sb_list_add(inode);

                        /* Return the locked inode with I_NEW set, the
                         * caller is responsible for filling in the contents
                         */
                        return inode;
                }

                /*
                 * Uhhuh, somebody else created the same inode under
                 * us. Use the old inode instead of the one we just
                 * allocated.
                 */
                spin_unlock(&inode_hash_lock);
                destroy_inode(inode);
                if (IS_ERR(old))
                        return NULL;
                inode = old;
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        /* NULL here if alloc_inode() failed, otherwise the existing inode */
        return inode;
}
EXPORT_SYMBOL(iget_locked);

/*
 * search the inode cache for a matching inode number.
 * If we find one, then the inode number we are trying to
 * allocate is not unique and so we should not use it.
 *
 * Returns 1 if the inode number is unique, 0 if it is not.
 */
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *bucket = inode_hashtable + hash(sb, ino);
        struct inode *cur;

        hlist_for_each_entry_rcu(cur, bucket, i_hash) {
                if (cur->i_ino != ino)
                        continue;
                if (cur->i_sb != sb)
                        continue;
                /* Number already in use on this superblock */
                return 0;
        }
        return 1;
}

/**
 *        iunique - get a unique inode number
 *        @sb: superblock
 *        @max_reserved: highest reserved inode number
 *
 *        Obtain an inode number that is unique on the system for a given
 *        superblock. This is used by file systems that have no natural
 *        permanent inode numbering system. An inode number is returned that
 *        is higher than the reserved limit but unique.
 *
 *        BUGS:
 *        With a large number of inodes live on the file system this function
 *        currently becomes quite slow.
 */
ino_t iunique(struct super_block *sb, ino_t max_reserved)
{
        /*
         * On a 32bit, non LFS stat() call, glibc will generate an EOVERFLOW
         * error if st_ino won't fit in target struct field. Use 32bit counter
         * here to attempt to avoid that.
         */
        static DEFINE_SPINLOCK(iunique_lock);
        static unsigned int counter;
        ino_t candidate;

        rcu_read_lock();
        spin_lock(&iunique_lock);
        for (;;) {
                /* Keep the counter above the reserved range */
                if (counter <= max_reserved)
                        counter = max_reserved + 1;
                candidate = counter++;
                /* Stop at the first number not found in the inode hash */
                if (test_inode_iunique(sb, candidate))
                        break;
        }
        spin_unlock(&iunique_lock);
        rcu_read_unlock();

        return candidate;
}
EXPORT_SYMBOL(iunique);

/**
 * igrab - take a reference on an inode unless it is being freed
 * @inode:        inode to grab
 *
 * Returns @inode with an extra reference taken via __iget(), or NULL if the
 * inode is marked I_FREEING or I_WILL_FREE.
 */
struct inode *igrab(struct inode *inode)
{
        struct inode *grabbed = NULL;

        spin_lock(&inode->i_lock);
        /*
         * Handle the case where s_op->clear_inode has not been called yet
         * and somebody is calling igrab while the inode is getting freed:
         * such an inode is not handed out.
         */
        if (!(inode->i_state & (I_FREEING|I_WILL_FREE))) {
                __iget(inode);
                grabbed = inode;
        }
        spin_unlock(&inode->i_lock);

        return grabbed;
}
EXPORT_SYMBOL(igrab);

/**
 * ilookup5_nowait - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache.
 * If the inode is in the cache, the inode is returned with an incremented
 * reference count.
 *
 * Note: I_NEW is not waited upon so you have to be very careful what you do
 * with the returned inode.  You probably should be using ilookup5() instead.
 *
 * Note2: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *found;

        spin_lock(&inode_hash_lock);
        found = find_inode(sb, head, test, data, true);
        spin_unlock(&inode_hash_lock);

        /* An error pointer from find_inode() is reported as "not found" */
        if (IS_ERR(found))
                return NULL;
        return found;
}
EXPORT_SYMBOL(ilookup5_nowait);

/**
 * ilookup5 - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @test:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @test
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * and if the inode is in the cache, return the inode with an incremented
 * reference count.  Waits on I_NEW before returning the inode.
 *
 * This is a generalized version of ilookup() for file systems where the
 * inode number is not sufficient for unique identification of an inode.
 *
 * Note: @test is called with the inode_hash_lock held, so can't sleep.
 */
struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *inode;
again:
        inode = ilookup5_nowait(sb, hashval, test, data);
        if (inode) {
                wait_on_inode(inode);
                if (unlikely(inode_unhashed(inode))) {
                        iput(inode);
                        goto again;
                }
        }
        return inode;
}
EXPORT_SYMBOL(ilookup5);

/**
 * ilookup - search for an inode in the inode cache
 * @sb:                super block of file system to search
 * @ino:        inode number to search for
 *
 * Search for the inode @ino in the inode cache, and if the inode is in the
 * cache, the inode is returned with an incremented reference count.
 */
struct inode *ilookup(struct super_block *sb, unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;

again:
        inode = find_inode_fast(sb, head, ino, false);
        if (!inode)
                return NULL;
        /* find_inode_fast() returns an error pointer for I_CREATING inodes */
        if (IS_ERR(inode))
                return NULL;

        wait_on_inode(inode);
        /* Unhashed while we waited: drop it and look again */
        if (unlikely(inode_unhashed(inode))) {
                iput(inode);
                goto again;
        }
        return inode;
}
EXPORT_SYMBOL(ilookup);

/**
 * find_inode_nowait - find an inode in the inode cache
 * @sb:                super block of file system to search
 * @hashval:        hash value (usually inode number) to search for
 * @match:        callback used for comparisons between inodes
 * @data:        opaque data pointer to pass to @match
 *
 * Search for the inode specified by @hashval and @data in the inode
 * cache, where the helper function @match will return 0 if the inode
 * does not match, 1 if the inode does match, and -1 if the search
 * should be stopped.  The @match function must be responsible for
 * taking the i_lock spin_lock and checking i_state for an inode being
 * freed or being initialized, and incrementing the reference count
 * before returning 1.  It also must not sleep, since it is called with
 * the inode_hash_lock spinlock held.
 *
 * This is an even more generalized version of ilookup5() for when the
 * function must never block --- find_inode() can block in
 * __wait_on_freeing_inode() --- or when the caller cannot increment
 * the reference count because the resulting iput() might cause an
 * inode eviction.  The tradeoff is that the @match function must be
 * very carefully implemented.
 */
struct inode *find_inode_nowait(struct super_block *sb,
                                unsigned long hashval,
                                int (*match)(struct inode *, unsigned long,
                                             void *),
                                void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *inode, *ret_inode = NULL;
        int mval;

        spin_lock(&inode_hash_lock);
        hlist_for_each_entry(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;

                mval = match(inode, hashval, data);
                if (mval == 0)
                        continue;

                /*
                 * Any non-zero answer ends the walk; only an answer of 1
                 * yields an inode to return.
                 */
                if (mval == 1)
                        ret_inode = inode;
                break;
        }
        spin_unlock(&inode_hash_lock);

        return ret_inode;
}
EXPORT_SYMBOL(find_inode_nowait);

/**
 * find_inode_rcu - find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @hashval:        Key to hash
 * @test:        Function to test match on an inode
 * @data:        Data for test function
 *
 * Search for the inode specified by @hashval and @data in the inode cache,
 * where the helper function @test will return 0 if the inode does not match
 * and 1 if it does.  The @test function must be responsible for taking the
 * i_lock spin_lock and checking i_state for an inode being freed or being
 * initialized.
 *
 * If successful, this will return the inode for which the @test function
 * returned 1 and NULL otherwise.
 *
 * The @test function is not permitted to take a ref on any inode presented.
 * It is also not permitted to sleep.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
                             int (*test)(struct inode *, void *), void *data)
{
        struct hlist_head *head = inode_hashtable + hash(sb, hashval);
        struct inode *inode;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_rcu() usage");

        hlist_for_each_entry_rcu(inode, head, i_hash) {
                if (inode->i_sb != sb)
                        continue;
                /* Lockless peek at i_state: skip inodes being torn down */
                if (READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                if (test(inode, data))
                        return inode;
        }
        return NULL;
}
EXPORT_SYMBOL(find_inode_rcu);

/**
 * find_inode_by_ino_rcu - Find an inode in the inode cache
 * @sb:                Super block of file system to search
 * @ino:        The inode number to match
 *
 * Search the inode cache for an inode on @sb whose inode number is @ino,
 * skipping inodes that are marked I_FREEING or I_WILL_FREE.
 *
 * If successful, this will return the matching inode and NULL otherwise.
 * No reference is taken on the returned inode.
 *
 * The caller must hold the RCU read lock.
 */
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
                                    unsigned long ino)
{
        struct hlist_head *head = inode_hashtable + hash(sb, ino);
        struct inode *inode;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "suspicious find_inode_by_ino_rcu() usage");

        hlist_for_each_entry_rcu(inode, head, i_hash) {
                if (inode->i_ino != ino)
                        continue;
                if (inode->i_sb != sb)
                        continue;
                /* Lockless peek at i_state: skip inodes being torn down */
                if (READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))
                        continue;
                return inode;
        }
        return NULL;
}
EXPORT_SYMBOL(find_inode_by_ino_rcu);

/*
 * Hash @inode under its own i_ino, marking it I_NEW | I_CREATING.
 *
 * Returns 0 once @inode has been added to the hash.  Returns -EBUSY if an
 * inode with the same number on the same superblock is found that is not
 * being freed and is either still I_CREATING or still hashed after waiting
 * on it.  If the conflicting inode turned out to be unhashed after the wait,
 * the search is repeated.
 */
int insert_inode_locked(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        ino_t ino = inode->i_ino;
        struct hlist_head *head = inode_hashtable + hash(sb, ino);

        while (1) {
                struct inode *old = NULL;
                spin_lock(&inode_hash_lock);
                hlist_for_each_entry(old, head, i_hash) {
                        if (old->i_ino != ino)
                                continue;
                        if (old->i_sb != sb)
                                continue;
                        spin_lock(&old->i_lock);
                        /* Inodes on their way out do not count as a clash */
                        if (old->i_state & (I_FREEING|I_WILL_FREE)) {
                                spin_unlock(&old->i_lock);
                                continue;
                        }
                        /* Live clash: leave the loop with old->i_lock held */
                        break;
                }
                /*
                 * The code relies on the iterator leaving @old NULL when the
                 * walk ends without hitting the break above.
                 */
                if (likely(!old)) {
                        spin_lock(&inode->i_lock);
                        inode->i_state |= I_NEW | I_CREATING;
                        hlist_add_head_rcu(&inode->i_hash, head);
                        spin_unlock(&inode->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return 0;
                }
                if (unlikely(old->i_state & I_CREATING)) {
                        spin_unlock(&old->i_lock);
                        spin_unlock(&inode_hash_lock);
                        return -EBUSY;
                }
                /* Pin the clashing inode so it can be waited on unlocked */
                __iget(old);
                spin_unlock(&old->i_lock);
                spin_unlock(&inode_hash_lock);
                wait_on_inode(old);
                /* Still hashed after the wait: the number really is taken */
                if (unlikely(!inode_unhashed(old))) {
                        iput(old);
                        return -EBUSY;
                }
                /* It was unhashed in the meantime: drop it and retry */
                iput(old);
        }
}
EXPORT_SYMBOL(insert_inode_locked);

/*
 * Mark @inode I_CREATING and try to hash it through inode_insert5() with no
 * @set callback.  Returns 0 if @inode itself was inserted, -EBUSY otherwise.
 */
int insert_inode_locked4(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data)
{
        struct inode *found;

        inode->i_state |= I_CREATING;
        found = inode_insert5(inode, hashval, test, NULL, data);
        if (found == inode)
                return 0;

        /*
         * Something else (possibly NULL, which iput() tolerates) came back:
         * drop the reference we were handed and report the clash.
         */
        iput(found);
        return -EBUSY;
}
EXPORT_SYMBOL(insert_inode_locked4);


/*
 * Unconditionally returns 1; @inode is not examined.
 *
 * NOTE(review): presumably intended as a ->drop_inode() implementation
 * (iput_final() evicts the inode when drop_inode returns non-zero) --
 * confirm against the callers.
 */
int generic_delete_inode(struct inode *inode)
{
        return 1;
}
EXPORT_SYMBOL(generic_delete_inode);

/*
 * Called when we're dropping the last reference
 * to an inode.
 *
 * Call the FS "drop_inode()" function, defaulting to
 * the legacy UNIX filesystem behaviour.  If it tells
 * us to evict inode, do so.  Otherwise, retain inode
 * in cache if fs is alive, sync and evict if fs is
 * shutting down.
 */
static void iput_final(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        const struct super_operations *op = inode->i_sb->s_op;
        unsigned long state;
        int drop;

        /* Entered with inode->i_lock held; every path below releases it */
        WARN_ON(inode->i_state & I_NEW);

        if (op->drop_inode)
                drop = op->drop_inode(inode);
        else
                drop = generic_drop_inode(inode);

        /*
         * Keep the inode cached: the filesystem did not ask for it to be
         * dropped, I_DONTCACHE is not set and the superblock is still active.
         */
        if (!drop &&
            !(inode->i_state & I_DONTCACHE) &&
            (sb->s_flags & SB_ACTIVE)) {
                __inode_add_lru(inode, true);
                spin_unlock(&inode->i_lock);
                return;
        }

        state = inode->i_state;
        if (!drop) {
                /*
                 * Not dropped by the filesystem but not cacheable either:
                 * flag I_WILL_FREE and write the inode out with i_lock
                 * released, then retake the lock and clear the flag again.
                 */
                WRITE_ONCE(inode->i_state, state | I_WILL_FREE);
                spin_unlock(&inode->i_lock);

                write_inode_now(inode, 1);

                spin_lock(&inode->i_lock);
                state = inode->i_state;
                WARN_ON(state & I_NEW);
                state &= ~I_WILL_FREE;
        }

        /* Mark the inode I_FREEING and take it off the LRU before evicting */
        WRITE_ONCE(inode->i_state, state | I_FREEING);
        if (!list_empty(&inode->i_lru))
                inode_lru_list_del(inode);
        spin_unlock(&inode->i_lock);

        evict(inode);
}

/**
 *        iput        - put an inode
 *        @inode: inode to put
 *
 *        Puts an inode, dropping its usage count. If the inode use count hits
 *        zero, the inode is then freed and may also be destroyed.
 *
 *        Consequently, iput() can sleep.
 *
 *        A NULL @inode is accepted and ignored.
 */
void iput(struct inode *inode)
{
        if (!inode)
                return;
        /* Putting an inode that is already flagged I_CLEAR is a fatal bug. */
        BUG_ON(inode->i_state & I_CLEAR);
retry:
        /*
         * The body below runs only when the count was dropped and i_lock is
         * held; the lock is released either in the lazytime branch or by
         * iput_final().
         */
        if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
                /*
                 * Still-linked inode with I_DIRTY_TIME set: take the
                 * reference back, release the lock, mark the inode dirty
                 * and then retry the put.
                 */
                if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
                        atomic_inc(&inode->i_count);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_lazytime_iput(inode);
                        mark_inode_dirty_sync(inode);
                        goto retry;
                }
                iput_final(inode);
        }
}
EXPORT_SYMBOL(iput);

#ifdef CONFIG_BLOCK
/**
 * bmap - translate a file block number into a device block number
 * @inode: inode that owns the block being looked up
 * @block: in: block number within the file; out: result of the lookup
 *
 * Hands ``*block`` to the ->bmap() operation of the inode's mapping and
 * stores the value that operation returns back into ``*block``.  For
 * example, when asked for block 4 of inode 1, the 4 in ``*block`` is
 * replaced by the number of the disk block, relative to the start of the
 * disk, that holds that block of the file.
 *
 * Return: -EINVAL when the mapping has no ->bmap() operation, 0 otherwise.
 * If the mapping falls into a hole, returns 0 and ``*block`` is set to 0.
 */
int bmap(struct inode *inode, sector_t *block)
{
        if (inode->i_mapping->a_ops->bmap) {
                *block = inode->i_mapping->a_ops->bmap(inode->i_mapping,
                                                       *block);
                return 0;
        }

        /* No ->bmap() operation available for this mapping. */
        return -EINVAL;
}
EXPORT_SYMBOL(bmap);
#endif

/*
 * With relative atime, only update atime if the previous atime is
 * earlier than or equal to either the ctime or mtime,
 * or if at least a day has passed since the last atime update.
 */
static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode,
                             struct timespec64 now)
{
        struct timespec64 atime, mtime, ctime;

        if (!(mnt->mnt_flags & MNT_RELATIME))
                return true;
        /*
         * Is mtime younger than or equal to atime? If yes, update atime:
         */
        atime = inode_get_atime(inode);
        mtime = inode_get_mtime(inode);
        if (timespec64_compare(&mtime, &atime) >= 0)
                return true;
        /*
         * Is ctime younger than or equal to atime? If yes, update atime:
         */
        ctime = inode_get_ctime(inode);
        if (timespec64_compare(&ctime, &atime) >= 0)
                return true;

        /*
         * Is the previous atime value older than a day? If yes,
         * update atime:
         */
        if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
                return true;
        /*
         * Good, we can skip the atime update:
         */
        return false;
}

/**
 * inode_update_timestamps - update the timestamps on the inode
 * @inode: inode to be updated
 * @flags: S_* flags that needed to be updated
 *
 * The update_time function is called when an inode's timestamps need to be
 * updated for a read or write operation. This function handles updating the
 * actual timestamps. It's up to the caller to ensure that the inode is marked
 * dirty appropriately.
 *
 * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated,
 * attempt to update all three of them. S_ATIME updates can be handled
 * independently of the rest.
 *
 * Returns a set of S_* flags indicating which values changed.
 */
int inode_update_timestamps(struct inode *inode, int flags)
{
        int updated = 0;
        struct timespec64 now;

        if (flags & (S_MTIME|S_CTIME|S_VERSION)) {
                struct timespec64 ctime = inode_get_ctime(inode);
                struct timespec64 mtime = inode_get_mtime(inode);

                now = inode_set_ctime_current(inode);
                if (!timespec64_equal(&now, &ctime))
                        updated |= S_CTIME;
                if (!timespec64_equal(&now, &mtime)) {
                        inode_set_mtime_to_ts(inode, now);
                        updated |= S_MTIME;
                }
                if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated))
                        updated |= S_VERSION;
        } else {
                now = current_time(inode);
        }

        if (flags & S_ATIME) {
                struct timespec64 atime = inode_get_atime(inode);

                if (!timespec64_equal(&now, &atime)) {
                        inode_set_atime_to_ts(inode, now);
                        updated |= S_ATIME;
                }
        }
        return updated;
}
EXPORT_SYMBOL(inode_update_timestamps);

/**
 * generic_update_time - update the timestamps on the inode and dirty it
 * @inode: inode to be updated
 * @flags: S_* flags naming the timestamps that need updating
 *
 * Updates the timestamps through inode_update_timestamps() and then marks
 * the inode dirty according to what actually changed: a changed atime,
 * mtime or ctime is recorded as I_DIRTY_TIME on SB_LAZYTIME superblocks and
 * as I_DIRTY_SYNC otherwise, and a changed i_version always adds
 * I_DIRTY_SYNC.
 *
 * Returns a S_* mask indicating which fields were updated.
 */
int generic_update_time(struct inode *inode, int flags)
{
        const int changed = inode_update_timestamps(inode, flags);
        int dirty = 0;

        if (changed & (S_ATIME|S_MTIME|S_CTIME)) {
                if (inode->i_sb->s_flags & SB_LAZYTIME)
                        dirty = I_DIRTY_TIME;
                else
                        dirty = I_DIRTY_SYNC;
        }
        if (changed & S_VERSION)
                dirty |= I_DIRTY_SYNC;

        __mark_inode_dirty(inode, dirty);
        return changed;
}
EXPORT_SYMBOL(generic_update_time);

/*
 * Update an inode's time or version.  The filesystem's ->update_time()
 * method is used, and its result returned, when there is one; otherwise
 * generic_update_time() does the work and 0 is returned.  The caller must
 * already have called mnt_want_write().
 */
int inode_update_time(struct inode *inode, int flags)
{
        if (!inode->i_op->update_time) {
                generic_update_time(inode, flags);
                return 0;
        }

        return inode->i_op->update_time(inode, flags);
}
EXPORT_SYMBOL(inode_update_time);

/**
 *        atime_needs_update        -        update the access time
 *        @path: the &struct path to update
 *        @inode: inode to update
 *
 *        Update the accessed time on an inode and mark it for writeback.
 *        This function automatically handles read only file systems and media,
 *        as well as the "noatime" flag and inode specific "noatime" markers.
 */
bool atime_needs_update(const struct path *path, struct inode *inode)
{
        struct vfsmount *mnt = path->mnt;
        struct timespec64 now, atime;

        if (inode->i_flags & S_NOATIME)
                return false;

        /* Atime updates will likely cause i_uid and i_gid to be written
         * back improprely if their true value is unknown to the vfs.
         */
        if (HAS_UNMAPPED_ID(mnt_idmap(mnt), inode))
                return false;

        if (IS_NOATIME(inode))
                return false;
        if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
                return false;

        if (mnt->mnt_flags & MNT_NOATIME)
                return false;
        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
                return false;

        now = current_time(inode);

        if (!relatime_need_update(mnt, inode, now))
                return false;

        atime = inode_get_atime(inode);
        if (timespec64_equal(&atime, &now))
                return false;

        return true;
}

/**
 * touch_atime - update the access time of the inode behind a path
 * @path: path whose dentry's inode was accessed
 *
 * Best effort: returns silently when no update is needed, when
 * sb_start_write_trylock() fails, or when write access to the mount cannot
 * be obtained.
 */
void touch_atime(const struct path *path)
{
        struct vfsmount *mnt = path->mnt;
        struct inode *inode = d_inode(path->dentry);

        if (!atime_needs_update(path, inode) ||
            !sb_start_write_trylock(inode->i_sb))
                return;

        if (mnt_get_write_access(mnt) == 0) {
                /*
                 * File systems can error out when updating inodes if they
                 * need to allocate new space to modify an inode (such is
                 * the case for Btrfs), but since we touch atime while
                 * walking down the path we really don't care if we failed
                 * to update the atime of the file, so the return value is
                 * deliberately ignored.
                 * We may also fail on filesystems that have the ability to
                 * make parts of the fs read only, e.g. subvolumes in Btrfs.
                 */
                inode_update_time(inode, S_ATIME);
                mnt_put_write_access(mnt);
        }

        sb_end_write(inode->i_sb);
}
EXPORT_SYMBOL(touch_atime);

/*
 * Compute the notify_change() mask needed in response to a write or
 * truncate.  Returns 0 if nothing has to be changed (always the case for
 * IS_NOSEC() inodes), a mask of ATTR_* bits otherwise, or a negative value
 * on error (the change should be denied).
 */
int dentry_needs_remove_privs(struct mnt_idmap *idmap,
                              struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        int kill_mask;
        int lsm;

        if (IS_NOSEC(inode))
                return 0;

        kill_mask = setattr_should_drop_suidgid(idmap, inode);

        /* A negative answer is an error; a positive one adds ATTR_KILL_PRIV. */
        lsm = security_inode_need_killpriv(dentry);
        if (lsm < 0)
                return lsm;

        return lsm ? (kill_mask | ATTR_KILL_PRIV) : kill_mask;
}

static int __remove_privs(struct mnt_idmap *idmap,
                          struct dentry *dentry, int kill)
{
        struct iattr newattrs;

        newattrs.ia_valid = ATTR_FORCE | kill;
        /*
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
        return notify_change(idmap, dentry, &newattrs, NULL);
}

/**
 * file_remove_privs_flags - remove special file privileges, honouring flags
 * @file: file to remove privileges from
 * @flags: kiocb flags; only IOCB_NOWAIT is looked at
 *
 * Does nothing for IS_NOSEC() inodes and for anything that is not a regular
 * file. When privileges have to be removed and IOCB_NOWAIT is set, nothing
 * is changed and -EAGAIN is returned.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_remove_privs_flags(struct file *file, unsigned int flags)
{
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = file_inode(file);
        int kill;

        if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
                return 0;

        kill = dentry_needs_remove_privs(file_mnt_idmap(file), dentry);
        if (kill < 0)
                return kill;

        if (kill != 0) {
                int error;

                /* Removing privileges is not attempted for nowait callers. */
                if (flags & IOCB_NOWAIT)
                        return -EAGAIN;

                error = __remove_privs(file_mnt_idmap(file), dentry, kill);
                if (error)
                        return error;
        }

        /*
         * Nothing left to strip, or stripping succeeded.
         * NOTE(review): inode_has_no_xattr() presumably flags the inode so
         * that later writes can take the IS_NOSEC() shortcut above --
         * confirm against its definition.
         */
        inode_has_no_xattr(inode);
        return 0;
}
EXPORT_SYMBOL_GPL(file_remove_privs_flags);

/**
 * file_remove_privs - remove special file privileges (suid, capabilities)
 * @file: file to remove privileges from
 *
 * Ensures that special file privileges are removed when a file is modified
 * by a write or truncation. Equivalent to file_remove_privs_flags() with no
 * flags set.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_remove_privs(struct file *file)
{
        const unsigned int no_flags = 0;

        return file_remove_privs_flags(file, no_flags);
}
EXPORT_SYMBOL(file_remove_privs);

/**
 * current_time - Return FS time (possibly fine-grained)
 * @inode: inode.
 *
 * Return the current time truncated to the time granularity supported by
 * the fs, as suitable for a ctime/mtime change. A coarse-grained timestamp
 * is used unless this is a multigrain inode whose ctime is flagged as
 * QUERIED and whose nanoseconds field matches the coarse value; in that
 * case a fine-grained timestamp is fetched instead. The floor is not
 * updated.
 *
 * For a multigrain inode, this is effectively an estimate of the timestamp
 * that a file would receive. An actual update must go through
 * inode_set_ctime_current().
 */
struct timespec64 current_time(struct inode *inode)
{
        struct timespec64 now;

        ktime_get_coarse_real_ts64_mg(&now);

        if (is_mgtime(inode)) {
                u32 cns = smp_load_acquire(&inode->i_ctime_nsec);

                /*
                 * Coarse time is fine unless the ctime has been queried and
                 * the coarse nanoseconds show no apparent change.
                 */
                if ((cns & I_CTIME_QUERIED) &&
                    now.tv_nsec == (cns & ~I_CTIME_QUERIED))
                        ktime_get_real_ts64(&now);
        }

        return timestamp_truncate(now, inode);
}
EXPORT_SYMBOL(current_time);

static int inode_needs_update_time(struct inode *inode)
{
        struct timespec64 now, ts;
        int sync_it = 0;

        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
                return 0;

        now = current_time(inode);

        ts = inode_get_mtime(inode);
        if (!timespec64_equal(&ts, &now))
                sync_it |= S_MTIME;

        ts = inode_get_ctime(inode);
        if (!timespec64_equal(&ts, &now))
                sync_it |= S_CTIME;

        if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
                sync_it |= S_VERSION;

        return sync_it;
}

/*
 * Update the timestamps selected by @sync_mode on @file's inode, provided
 * write access to the mount can be obtained.  Failing to get write access
 * is not reported as an error: 0 is returned and nothing is updated.
 * Otherwise the result of inode_update_time() is returned.
 */
static int __file_update_time(struct file *file, int sync_mode)
{
        struct inode *inode = file_inode(file);
        int err;

        if (mnt_get_write_access_file(file))
                return 0;

        err = inode_update_time(inode, sync_mode);
        mnt_put_write_access_file(file);

        return err;
}

/**
 * file_update_time - update mtime and ctime time
 * @file: file accessed
 *
 * Update the mtime and ctime members of an inode and mark the inode for
 * writeback. Note that this function is meant exclusively for usage in
 * the file write path of filesystems, and filesystems may choose to
 * explicitly ignore updates via this function with the _NOCMTIME inode
 * flag, e.g. for network filesystems where these timestamps are handled
 * by the server. This can return an error for file systems that need to
 * allocate space in order to update an inode.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_update_time(struct file *file)
{
        struct inode *inode = file_inode(file);
        int todo = inode_needs_update_time(inode);

        /* Nothing pending: hand the value back unchanged. */
        if (todo <= 0)
                return todo;

        return __file_update_time(file, todo);
}
EXPORT_SYMBOL(file_update_time);

/**
 * file_modified_flags - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 * @flags: kiocb flags
 *
 * When file has been modified ensure that special
 * file privileges are removed and time settings are updated.
 *
 * If IOCB_NOWAIT is set and either step would actually have work to do,
 * that work is not performed and -EAGAIN is returned instead.
 *
 * Files opened with FMODE_NOCMTIME only get their privileges removed; their
 * timestamps are left alone.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
static int file_modified_flags(struct file *file, int flags)
{
        struct inode *inode = file_inode(file);
        int rc;

        /* First strip special privileges; any failure ends the call. */
        rc = file_remove_privs_flags(file, flags);
        if (rc)
                return rc;

        if (unlikely(file->f_mode & FMODE_NOCMTIME))
                return 0;

        /* From here on rc is the mask of timestamps that need updating. */
        rc = inode_needs_update_time(inode);
        if (rc <= 0)
                return rc;

        /* An update is due, but a nowait caller will not perform it here. */
        if (flags & IOCB_NOWAIT)
                return -EAGAIN;

        return __file_update_time(file, rc);
}

/**
 * file_modified - handle mandated vfs changes when modifying a file
 * @file: file that was modified
 *
 * Ensures that special file privileges are removed and time settings are
 * updated once a file has been modified. Same as file_modified_flags() with
 * no kiocb flags set.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_modified(struct file *file)
{
        const int no_flags = 0;

        return file_modified_flags(file, no_flags);
}
EXPORT_SYMBOL(file_modified);

/**
 * kiocb_modified - handle mandated vfs changes when modifying a file
 * @iocb: iocb describing the modification; its ki_filp is the file that was
 *        modified and its ki_flags are honoured
 *
 * Ensures that special file privileges are removed and time settings are
 * updated once the file has been modified.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int kiocb_modified(struct kiocb *iocb)
{
        struct file *filp = iocb->ki_filp;

        return file_modified_flags(filp, iocb->ki_flags);
}
EXPORT_SYMBOL_GPL(kiocb_modified);

/*
 * Returns 1 when the inode is IS_SYNC(), or when it is a directory that is
 * IS_DIRSYNC(); 0 otherwise.
 */
int inode_needs_sync(struct inode *inode)
{
        return IS_SYNC(inode) ||
               (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode));
}
EXPORT_SYMBOL(inode_needs_sync);

/*
 * If we try to find an inode in the inode hash while it is being
 * deleted, we have to wait until the filesystem completes its
 * deletion before reporting that it isn't found.  This function waits
 * until the deletion _might_ have completed.  Callers are responsible
 * to recheck inode state.
 *
 * It doesn't matter if I_NEW is not set initially, a call to
 * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
 * will DTRT.
 *
 * Locking, as far as this function itself shows: inode->i_lock is released
 * on every path and is NOT retaken.  On the waiting path the RCU read-side
 * section and, when @is_inode_hash_locked is true, inode_hash_lock are
 * dropped around schedule() and re-acquired before returning.
 */
static void __wait_on_freeing_inode(struct inode *inode, bool is_inode_hash_locked)
{
        struct wait_bit_queue_entry wqe;
        struct wait_queue_head *wq_head;

        /*
         * Handle racing against evict(), see that routine for more details.
         *
         * Note that this early return only drops i_lock; the RCU section is
         * left untouched.
         */
        if (unlikely(inode_unhashed(inode))) {
                WARN_ON(is_inode_hash_locked);
                spin_unlock(&inode->i_lock);
                return;
        }

        /* Queue ourselves on the __I_NEW bit waitqueue before unlocking. */
        wq_head = inode_bit_waitqueue(&wqe, inode, __I_NEW);
        prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
        spin_unlock(&inode->i_lock);
        rcu_read_unlock();
        if (is_inode_hash_locked)
                spin_unlock(&inode_hash_lock);
        schedule();
        finish_wait(wq_head, &wqe.wq_entry);
        /* Restore the locking state the caller had, except for i_lock. */
        if (is_inode_hash_locked)
                spin_lock(&inode_hash_lock);
        rcu_read_lock();
}

/* Value of the "ihash_entries=" boot parameter; stays 0 when not given. */
static __initdata unsigned long ihash_entries;

/*
 * Parse the "ihash_entries=" boot parameter into ihash_entries.  The base
 * is auto-detected (base argument 0).  Returns 1 when a value string was
 * supplied and 0 when @str is NULL.
 */
static int __init set_ihash_entries(char *str)
{
        if (str) {
                ihash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }

        return 0;
}
__setup("ihash_entries=", set_ihash_entries);

/*
 * Early setup of the inode hash table.  Nothing is allocated here when
 * hashdist is set; inode_init() performs the allocation in that case.
 */
void __init inode_init_early(void)
{
        /*
         * If hashes are distributed across NUMA nodes, hash allocation is
         * deferred until vmalloc space is available.
         */
        if (!hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_EARLY | HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
}

/*
 * Create the inode slab cache and, when hashdist is set, allocate the inode
 * hash table whose allocation inode_init_early() skipped.
 */
void __init inode_init(void)
{
        /* inode slab cache */
        inode_cachep = kmem_cache_create("inode_cache",
                                         sizeof(struct inode),
                                         0,
                                         (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
                                         SLAB_ACCOUNT),
                                         init_once);

        /* Without hashdist the hash was already set up in inode_init_early */
        if (hashdist)
                inode_hashtable =
                        alloc_large_system_hash("Inode-cache",
                                                sizeof(struct hlist_head),
                                                ihash_entries,
                                                14,
                                                HASH_ZERO,
                                                &i_hash_shift,
                                                &i_hash_mask,
                                                0,
                                                0);
}

/*
 * Set up @inode as a special file of type @mode: store the mode and, per
 * file type, pick the file operations and record @rdev.
 *
 *  - character device: def_chr_fops, i_rdev = @rdev
 *  - block device:     def_blk_fops (only with CONFIG_BLOCK), i_rdev = @rdev
 *  - FIFO:             pipefifo_fops
 *  - socket:           i_fop is left untouched
 *
 * Any other mode is reported with a KERN_DEBUG message.
 */
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
        inode->i_mode = mode;

        if (S_ISCHR(mode)) {
                inode->i_fop = &def_chr_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISBLK(mode)) {
                if (IS_ENABLED(CONFIG_BLOCK))
                        inode->i_fop = &def_blk_fops;
                inode->i_rdev = rdev;
                return;
        }

        if (S_ISFIFO(mode)) {
                inode->i_fop = &pipefifo_fops;
                return;
        }

        /* leave it no_open_fops */
        if (S_ISSOCK(mode))
                return;

        printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
                          " inode %s:%lu\n", mode, inode->i_sb->s_id,
                          inode->i_ino);
}
EXPORT_SYMBOL(init_special_inode);

/**
 * inode_init_owner - Init uid,gid,mode for new inode according to posix standards
 * @idmap: idmap of the mount the inode was created from
 * @inode: New inode
 * @dir: Directory inode (may be NULL)
 * @mode: mode of the new inode
 *
 * The uid always comes from inode_fsuid_set(). The gid is inherited from
 * @dir when @dir has S_ISGID set (and a new directory then also gets
 * S_ISGID); otherwise it comes from inode_fsgid_set().
 *
 * If the inode has been created through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions
 * and initializing i_uid and i_gid. On non-idmapped mounts or if permission
 * checking is to be performed on the raw inode simply pass @nop_mnt_idmap.
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode)
{
        inode_fsuid_set(inode, idmap);

        if (!dir || !(dir->i_mode & S_ISGID)) {
                inode_fsgid_set(inode, idmap);
        } else {
                inode->i_gid = dir->i_gid;

                /* Directories are special, and always inherit S_ISGID */
                if (S_ISDIR(mode))
                        mode |= S_ISGID;
        }

        inode->i_mode = mode;
}
EXPORT_SYMBOL(inode_init_owner);

/**
 * inode_owner_or_capable - check current task permissions to inode
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode being checked
 *
 * Return true if current either owns the file, or has CAP_FOWNER in its
 * user namespace and the inode owner uid has a mapping there.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode)
{
        struct user_namespace *ns;
        vfsuid_t owner = i_uid_into_vfsuid(idmap, inode);

        /* The owner needs no capability. */
        if (vfsuid_eq_kuid(owner, current_fsuid()))
                return true;

        ns = current_user_ns();
        return vfsuid_has_mapping(ns, owner) && ns_capable(ns, CAP_FOWNER);
}
EXPORT_SYMBOL(inode_owner_or_capable);

/*
 * Direct i/o helper functions
 */
/* Returns true when the inode's i_dio_count reads as zero. */
bool inode_dio_finished(const struct inode *inode)
{
        return !atomic_read(&inode->i_dio_count);
}
EXPORT_SYMBOL(inode_dio_finished);

/**
 * inode_dio_wait - wait for outstanding DIO requests to finish
 * @inode: inode to wait for
 *
 * Waits for all pending direct I/O requests to finish so that we can
 * proceed with a truncate or equivalent operation.
 *
 * Must be called under a lock that serializes taking new references
 * to i_dio_count, usually by inode->i_mutex.
 *
 * The wait is keyed on &inode->i_dio_count and ends once
 * inode_dio_finished() reports true.
 */
void inode_dio_wait(struct inode *inode)
{
        wait_var_event(&inode->i_dio_count, inode_dio_finished(inode));
}
EXPORT_SYMBOL(inode_dio_wait);

/*
 * Variant of inode_dio_wait() built on wait_var_event_interruptible(): it
 * waits on &inode->i_dio_count for inode_dio_finished() to become true.
 *
 * NOTE(review): the function returns void and discards whatever the wait
 * macro evaluates to, so a caller cannot tell a completed wait from an
 * interrupted one and presumably has to re-check inode_dio_finished()
 * itself -- confirm against the callers.
 */
void inode_dio_wait_interruptible(struct inode *inode)
{
        wait_var_event_interruptible(&inode->i_dio_count,
                                     inode_dio_finished(inode));
}
EXPORT_SYMBOL(inode_dio_wait_interruptible);

/*
 * inode_set_flags - atomically set some inode flags
 *
 * Within inode->i_flags, the bits selected by @mask are replaced by @flags.
 * Passing a bit in @flags that is not also in @mask triggers a one-time
 * warning.
 *
 * Note: the caller should be holding i_mutex, or else be sure that
 * they have exclusive access to the inode structure (i.e., while the
 * inode is being instantiated).  An atomic update (set_mask_bits()) is
 * used anyway --- it wouldn't be necessary if all code paths which modify
 * i_flags actually followed this rule, but there is at least one code
 * path which doesn't today, so it is kept out of an abundance of caution.
 *
 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!
 */
void inode_set_flags(struct inode *inode, unsigned int flags,
                     unsigned int mask)
{
        const unsigned int outside_mask = flags & ~mask;

        WARN_ON_ONCE(outside_mask);
        set_mask_bits(&inode->i_flags, mask, flags);
}
EXPORT_SYMBOL(inode_set_flags);

/*
 * Set the gfp mask of @inode's mapping to GFP_USER.
 *
 * NOTE(review): going by the name, the point is presumably to keep the
 * mapping's pages out of highmem -- confirm against the GFP_USER definition.
 */
void inode_nohighmem(struct inode *inode)
{
        mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
}
EXPORT_SYMBOL(inode_nohighmem);

/*
 * Set the ctime of @inode to @ts.
 *
 * @ts is traced as passed in, then normalized with
 * set_normalized_timespec64(), and the normalized seconds and nanoseconds
 * are stored in i_ctime_sec and i_ctime_nsec with plain assignments.
 *
 * Returns the normalized timestamp that was stored.
 */
struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
{
        trace_inode_set_ctime_to_ts(inode, &ts);
        set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
        inode->i_ctime_sec = ts.tv_sec;
        inode->i_ctime_nsec = ts.tv_nsec;
        return ts;
}
EXPORT_SYMBOL(inode_set_ctime_to_ts);

/**
 * timestamp_truncate - Truncate timespec to a granularity
 * @t: Timespec
 * @inode: inode being updated
 *
 * Clamp the seconds of @t to the range supported by the fs containing the
 * inode (zeroing the nanoseconds when a limit is hit) and truncate the
 * nanoseconds to the fs time granularity. Always rounds down. The
 * granularity must not be 0 nor greater than a second (NSEC_PER_SEC, or
 * 10^9 ns); anything else triggers a warning and leaves the nanoseconds
 * as they are.
 *
 * Return: the clamped and truncated timestamp.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
{
        struct super_block *super = inode->i_sb;
        unsigned int granularity = super->s_time_gran;

        t.tv_sec = clamp(t.tv_sec, super->s_time_min, super->s_time_max);
        if (unlikely(t.tv_sec == super->s_time_max || t.tv_sec == super->s_time_min))
                t.tv_nsec = 0;

        /* The common cases 1 ns and 1 s are handled without a division. */
        if (granularity == 1)
                return t;

        if (granularity == NSEC_PER_SEC) {
                t.tv_nsec = 0;
                return t;
        }

        if (granularity > 1 && granularity < NSEC_PER_SEC)
                t.tv_nsec -= t.tv_nsec % granularity;
        else
                WARN(1, "invalid file time granularity: %u", granularity);

        return t;
}
EXPORT_SYMBOL(timestamp_truncate);

/**
 * inode_set_ctime_current - set the ctime to current_time
 * @inode: inode
 *
 * Set the inode's ctime to the current value for the inode. Returns the
 * current value that was assigned. If this is not a multigrain inode, then we
 * set it to the later of the coarse time and floor value.
 *
 * If it is multigrain, then we first see if the coarse-grained timestamp is
 * distinct from what is already there. If so, then use that. Otherwise, get a
 * fine-grained timestamp.
 *
 * After that, try to swap the new value into i_ctime_nsec. Accept the
 * resulting ctime, regardless of the outcome of the swap. If it has
 * already been replaced, then that timestamp is later than the earlier
 * unacceptable one, and is thus acceptable.
 *
 * Return: the timestamp the inode's ctime holds on exit: either the value
 * stamped here or, when the swap lost a race, the value found in the inode.
 */
struct timespec64 inode_set_ctime_current(struct inode *inode)
{
        struct timespec64 now;
        u32 cns, cur;

        /* Start from the coarse clock, truncated to the fs granularity. */
        ktime_get_coarse_real_ts64_mg(&now);
        now = timestamp_truncate(now, inode);

        /* Just return that if this is not a multigrain fs */
        if (!is_mgtime(inode)) {
                inode_set_ctime_to_ts(inode, now);
                goto out;
        }

        /*
         * A fine-grained time is only needed if someone has queried
         * for timestamps, and the current coarse grained time isn't
         * later than what's already there.
         */
        cns = smp_load_acquire(&inode->i_ctime_nsec);
        if (cns & I_CTIME_QUERIED) {
                /* Stored ctime with the QUERIED marker masked out. */
                struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
                                            .tv_nsec = cns & ~I_CTIME_QUERIED };

                if (timespec64_compare(&now, &ctime) <= 0) {
                        ktime_get_real_ts64_mg(&now);
                        now = timestamp_truncate(now, inode);
                        mgtime_counter_inc(mg_fine_stamps);
                }
        }
        mgtime_counter_inc(mg_ctime_updates);

        /* No need to cmpxchg if it's exactly the same */
        if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) {
                trace_ctime_xchg_skip(inode, &now);
                goto out;
        }
        /* @cur is the value the swap expects to find in i_ctime_nsec. */
        cur = cns;
retry:
        /* Try to swap the nsec value into place. */
        if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
                /* If swap occurred, then we're (mostly) done */
                inode->i_ctime_sec = now.tv_sec;
                trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
                mgtime_counter_inc(mg_ctime_swaps);
        } else {
                /*
                 * Was the change due to someone marking the old ctime QUERIED?
                 * If so then retry the swap. This can only happen once since
                 * the only way to clear I_CTIME_QUERIED is to stamp the inode
                 * with a new ctime.
                 */
                if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) {
                        /* @cur already holds the flagged value; sync @cns. */
                        cns = cur;
                        goto retry;
                }
                /* Otherwise, keep the existing ctime */
                now.tv_sec = inode->i_ctime_sec;
                now.tv_nsec = cur & ~I_CTIME_QUERIED;
        }
out:
        return now;
}
EXPORT_SYMBOL(inode_set_ctime_current);

/**
 * inode_set_ctime_deleg - try to update the ctime on a delegated inode
 * @inode: inode to update
 * @update: timespec64 to set the ctime
 *
 * Attempt to atomically update the ctime on behalf of a delegation holder.
 *
 * The nfs server can call back the holder of a delegation to get updated
 * inode attributes, including the mtime. When updating the mtime, update
 * the ctime to a value at least equal to that.
 *
 * This can race with concurrent updates to the inode, in which
 * case the update is skipped.
 *
 * Note that this works even when multigrain timestamps are not enabled,
 * so it is used in either case.
 *
 * Return: the ctime in effect on exit: @update (clamped to the current
 * coarse time and truncated to the fs granularity) when it was stored,
 * otherwise the ctime found in the inode.
 */
struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update)
{
        struct timespec64 now, cur_ts;
        u32 cur, old;

        /* pairs with try_cmpxchg below */
        cur = smp_load_acquire(&inode->i_ctime_nsec);
        cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
        cur_ts.tv_sec = inode->i_ctime_sec;

        /* If the update is not newer than the existing value, skip it. */
        if (timespec64_compare(&update, &cur_ts) <= 0)
                return cur_ts;

        ktime_get_coarse_real_ts64_mg(&now);

        /* Clamp the update to "now" if it's in the future */
        if (timespec64_compare(&update, &now) > 0)
                update = now;

        update = timestamp_truncate(update, inode);

        /* No need to update if the values are already the same */
        if (timespec64_equal(&update, &cur_ts))
                return cur_ts;

        /*
         * Try to swap the nsec value into place. If it fails, that means
         * it raced with an update due to a write or similar activity. That
         * stamp takes precedence, so just skip the update.
         */
retry:
        /* Remember what the swap expected, for the QUERIED check below. */
        old = cur;
        if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
                inode->i_ctime_sec = update.tv_sec;
                mgtime_counter_inc(mg_ctime_swaps);
                return update;
        }

        /*
         * Was the change due to another task marking the old ctime QUERIED?
         *
         * If so, then retry the swap. This can only happen once since
         * the only way to clear I_CTIME_QUERIED is to stamp the inode
         * with a new ctime.
         */
        if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED)))
                goto retry;

        /* Otherwise, it was a new timestamp. */
        cur_ts.tv_sec = inode->i_ctime_sec;
        cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
        return cur_ts;
}
EXPORT_SYMBOL(inode_set_ctime_deleg);

/**
 * in_group_or_capable - check whether caller is CAP_FSETID privileged
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        inode to check
 * @vfsgid:        the new/current vfsgid of @inode
 *
 * Check whether @vfsgid is in the caller's group list or if the caller is
 * privileged with CAP_FSETID over @inode. This can be used to determine
 * whether the setgid bit can be kept or must be dropped. The capability is
 * only consulted when the group check fails.
 *
 * Return: true if the caller is sufficiently privileged, false if not.
 */
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid)
{
        return vfsgid_in_group_p(vfsgid) ||
               capable_wrt_inode_uidgid(idmap, inode, CAP_FSETID);
}
EXPORT_SYMBOL(in_group_or_capable);

/**
 * mode_strip_sgid - decide whether a new non-directory keeps S_ISGID
 * @idmap: idmap of the mount the inode was created from
 * @dir: parent directory inode
 * @mode: mode of the file to be created in @dir
 *
 * S_ISGID is removed from @mode only when all of the following hold:
 * @mode has both S_ISGID and S_IXGRP set, @mode does not describe a
 * directory, @dir is non-NULL and has S_ISGID set, and the caller is
 * neither in the group of @dir nor CAP_FSETID-privileged over it (see
 * in_group_or_capable()). In every other case @mode is returned as is.
 *
 * Return: the new mode to use for the file
 */
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode)
{
        const umode_t sgid_exec = S_ISGID | S_IXGRP;

        /*
         * The checks short-circuit left to right, so @dir is only
         * dereferenced once it is known to be non-NULL and the privilege
         * check only runs for a setgid parent.
         */
        if ((mode & sgid_exec) == sgid_exec && !S_ISDIR(mode) &&
            dir && (dir->i_mode & S_ISGID) &&
            !in_group_or_capable(idmap, dir, i_gid_into_vfsgid(idmap, dir)))
                mode &= ~S_ISGID;

        return mode;
}
EXPORT_SYMBOL(mode_strip_sgid);

#ifdef CONFIG_DEBUG_VFS
/*
 * Dump an inode.
 *
 * @inode:  inode the message is about; only its address is printed (%px,
 *          i.e. the raw pointer value).
 * @reason: short text placed at the start of the message.
 *
 * TODO: add a proper inode dumping routine, this is a stub to get debug off the
 * ground.
 */
void dump_inode(struct inode *inode, const char *reason)
{
        /*
         * Terminate the message with a newline so the record is complete
         * and a later printk cannot be appended to this line.
         */
        pr_warn("%s encountered for inode %px\n", reason, inode);
}

EXPORT_SYMBOL(dump_inode);
#endif










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions of the Internet Protocol.
 *
 * Version:        @(#)in.h        1.0.1        04/21/93
 *
 * Authors:        Original taken from the GNU Project <netinet/in.h> file.
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IN_H
#define _LINUX_IN_H


#include <linux/errno.h>
#include <uapi/linux/in.h>

/*
 * Map an IP protocol number to an offset value:
 *   - 4 for IPPROTO_AH,
 *   - 0 for TCP, UDP, DCCP, ESP, SCTP and UDP-Lite,
 *   - -EINVAL for any other protocol.
 * For ESP and AH the original code annotates the field as the SPI.
 */
static inline int proto_ports_offset(int proto)
{
        if (proto == IPPROTO_AH)        /* SPI */
                return 4;

        if (proto == IPPROTO_TCP || proto == IPPROTO_UDP ||
            proto == IPPROTO_DCCP || proto == IPPROTO_ESP /* SPI */ ||
            proto == IPPROTO_SCTP || proto == IPPROTO_UDPLITE)
                return 0;

        return -EINVAL;
}

/* True when @addr (network byte order) lies in 127.0.0.0/8. */
static inline bool ipv4_is_loopback(__be32 addr)
{
        const __be32 mask = htonl(0xff000000);
        const __be32 prefix = htonl(0x7f000000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 224.0.0.0/4. */
static inline bool ipv4_is_multicast(__be32 addr)
{
        const __be32 mask = htonl(0xf0000000);
        const __be32 prefix = htonl(0xe0000000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 224.0.0.0/24. */
static inline bool ipv4_is_local_multicast(__be32 addr)
{
        const __be32 mask = htonl(0xffffff00);
        const __be32 prefix = htonl(0xe0000000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) is the limited broadcast address. */
static inline bool ipv4_is_lbcast(__be32 addr)
{
        const __be32 limited_bcast = htonl(INADDR_BROADCAST);

        return addr == limited_bcast;
}

/* True when @addr (network byte order) equals INADDR_ALLSNOOPERS_GROUP. */
static inline bool ipv4_is_all_snoopers(__be32 addr)
{
        const __be32 all_snoopers = htonl(INADDR_ALLSNOOPERS_GROUP);

        return addr == all_snoopers;
}

/* True only for the all-zero address; byte order is irrelevant for 0. */
static inline bool ipv4_is_zeronet(__be32 addr)
{
        return !addr;
}

/* Special-Use IPv4 Addresses (RFC3330) */

/* True when @addr (network byte order) lies in 10.0.0.0/8. */
static inline bool ipv4_is_private_10(__be32 addr)
{
        const __be32 mask = htonl(0xff000000);
        const __be32 prefix = htonl(0x0a000000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 172.16.0.0/12. */
static inline bool ipv4_is_private_172(__be32 addr)
{
        const __be32 mask = htonl(0xfff00000);
        const __be32 prefix = htonl(0xac100000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 192.168.0.0/16. */
static inline bool ipv4_is_private_192(__be32 addr)
{
        const __be32 mask = htonl(0xffff0000);
        const __be32 prefix = htonl(0xc0a80000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 169.254.0.0/16. */
static inline bool ipv4_is_linklocal_169(__be32 addr)
{
        const __be32 mask = htonl(0xffff0000);
        const __be32 prefix = htonl(0xa9fe0000);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 192.88.99.0/24. */
static inline bool ipv4_is_anycast_6to4(__be32 addr)
{
        const __be32 mask = htonl(0xffffff00);
        const __be32 prefix = htonl(0xc0586300);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 192.0.2.0/24. */
static inline bool ipv4_is_test_192(__be32 addr)
{
        const __be32 mask = htonl(0xffffff00);
        const __be32 prefix = htonl(0xc0000200);

        return (addr & mask) == prefix;
}

/* True when @addr (network byte order) lies in 198.18.0.0/15. */
static inline bool ipv4_is_test_198(__be32 addr)
{
        const __be32 mask = htonl(0xfffe0000);
        const __be32 prefix = htonl(0xc6120000);

        return (addr & mask) == prefix;
}
#endif        /* _LINUX_IN_H */










   63 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/* SPDX-License-Identifier: GPL-2.0 */
#if !defined(_TRACE_VGIC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VGIC_H

#include <linux/tracepoint.h>

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm

/*
 * vgic_update_irq_pending - trace event recording a vcpu id, an interrupt
 * number and a level flag exactly as passed in by the caller.
 */
TRACE_EVENT(vgic_update_irq_pending,
        TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
        TP_ARGS(vcpu_id, irq, level),

        TP_STRUCT__entry(
                __field(        unsigned long,        vcpu_id        )
                __field(        __u32,                irq        )
                __field(        bool,                level        )
        ),

        TP_fast_assign(
                __entry->vcpu_id        = vcpu_id;
                __entry->irq                = irq;
                __entry->level                = level;
        ),

        /*
         * vcpu_id is unsigned long and irq is __u32, so print both with
         * unsigned conversions; level is a bool and promotes to int.
         */
        TP_printk("VCPU: %lu, IRQ %u, level: %d",
                  __entry->vcpu_id, __entry->irq, __entry->level)
);

#endif /* _TRACE_VGIC_H */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../arch/arm64/kvm/vgic
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace

/* This part must be outside protection */
#include <trace/define_trace.h>













































































































































































































































































































































































































































































































































































































































































































  546 




  550 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  316 


  317 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
// SPDX-License-Identifier: GPL-2.0-only
/* Kernel thread helper functions.
 *   Copyright (C) 2004 IBM Corporation, Rusty Russell.
 *   Copyright (C) 2009 Red Hat, Inc.
 *
 * Creation is done via kthreadd, so that we get a clean environment
 * even if we're invoked from userspace (think modprobe, hotplug cpu,
 * etc.).
 */
#include <uapi/linux/sched/types.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/err.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/unistd.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <linux/numa.h>
#include <linux/sched/isolation.h>
#include <trace/events/sched.h>


/* Taken by __kthread_create_on_node() while queueing onto kthread_create_list. */
static DEFINE_SPINLOCK(kthread_create_lock);
/* Queue of pending struct kthread_create_info requests. */
static LIST_HEAD(kthread_create_list);
/* The kthreadd task; woken after a creation request has been queued. */
struct task_struct *kthreadd_task;

/*
 * kthreads that went through kthread_affine_node(); entries are added and
 * removed with kthreads_hotplug_lock held.
 */
static LIST_HEAD(kthreads_hotplug);
static DEFINE_MUTEX(kthreads_hotplug_lock);

/*
 * One kthread creation request. It is allocated and filled in by
 * __kthread_create_on_node(), queued on kthread_create_list and passed to
 * kthread() as its argument by create_kthread().
 */
struct kthread_create_info
{
        /* Information passed to kthread() from kthreadd. */
        /* Formatted thread name; handed over to struct kthread by kthread(). */
        char *full_name;
        int (*threadfn)(void *data);
        void *data;
        /* Stored in current->pref_node_fork by create_kthread() (CONFIG_NUMA). */
        int node;

        /* Result passed back to kthread_create() from kthreadd. */
        struct task_struct *result;
        /*
         * Points at the creator's on-stack completion. Each side claims it
         * with xchg(..., NULL); whoever reads NULL knows the other side has
         * given up or already taken over, which decides who frees this
         * structure.
         */
        struct completion *done;

        /* Link on kthread_create_list. */
        struct list_head list;
};

/*
 * Per-kthread state, allocated by set_kthread_struct() and reachable via
 * task_struct::worker_private (see to_kthread()).
 */
struct kthread {
        /* Bit field indexed by enum KTHREAD_BITS. */
        unsigned long flags;
        unsigned int cpu;
        /* Set from tsk_fork_get_node() in set_kthread_struct(). */
        unsigned int node;
        /* Set to 1 by kthread() after its first wakeup. */
        int started;
        /* Value handed to kthread_exit(). */
        int result;
        int (*threadfn)(void *);
        void *data;
        /* Completed by __kthread_parkme() each time the thread parks. */
        struct completion parked;
        /* Installed as the task's vfork_done by set_kthread_struct(). */
        struct completion exited;
#ifdef CONFIG_BLK_CGROUP
        struct cgroup_subsys_state *blkcg_css;
#endif
        /* To store the full name if task comm is truncated. */
        char *full_name;
        struct task_struct *task;
        /* Link on kthreads_hotplug; an empty node means "not queued". */
        struct list_head hotplug_node;
        /* When set, used by kthread_fetch_affinity() instead of the node mask. */
        struct cpumask *preferred_affinity;
};

/*
 * Bit numbers within kthread::flags, used with test_bit() and BIT().
 */
enum KTHREAD_BITS {
        KTHREAD_IS_PER_CPU = 0,
        /* Tested by kthread_should_stop() and by kthread() before threadfn runs. */
        KTHREAD_SHOULD_STOP,
        /* Tested by kthread_should_park() and __kthread_parkme(). */
        KTHREAD_SHOULD_PARK,
};

/*
 * Fetch the struct kthread attached to @k. Warns if @k does not have
 * PF_KTHREAD set, but returns k->worker_private regardless (it may be NULL).
 */
static inline struct kthread *to_kthread(struct task_struct *k)
{
        struct kthread *kthread;

        WARN_ON(!(k->flags & PF_KTHREAD));
        kthread = k->worker_private;

        return kthread;
}

/*
 * Variant of to_kthread() that doesn't assume @p is a kthread.
 *
 * Per construction; when:
 *
 *   (p->flags & PF_KTHREAD) && p->worker_private
 *
 * the task is both a kthread and struct kthread is persistent. However
 * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
 * begin_new_exec()).
 *
 * Returns NULL when either half of that condition does not hold.
 */
static inline struct kthread *__to_kthread(struct task_struct *p)
{
        struct kthread *kthread = p->worker_private;

        if (!kthread || !(p->flags & PF_KTHREAD))
                return NULL;

        return kthread;
}

/*
 * Copy the name of kthread @tsk into @buf (at most @buf_size bytes).
 *
 * The stored full name is preferred and copied with strscpy_pad(); when no
 * struct kthread or no full name is available, tsk->comm is copied with
 * strscpy() instead.
 */
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
        struct kthread *kthread = to_kthread(tsk);

        if (kthread && kthread->full_name)
                strscpy_pad(buf, kthread->full_name, buf_size);
        else
                strscpy(buf, tsk->comm, buf_size);
}

/*
 * Allocate a zeroed struct kthread and attach it to @p.
 *
 * Returns false (after a one-time warning) if @p already has one, or false
 * if the allocation fails; true once @p->worker_private points at the new
 * structure and @p->vfork_done at its "exited" completion.
 */
bool set_kthread_struct(struct task_struct *p)
{
        struct kthread *kt;

        if (WARN_ON_ONCE(to_kthread(p)))
                return false;

        kt = kzalloc(sizeof(*kt), GFP_KERNEL);
        if (!kt)
                return false;

        /* Fill in the new structure before it becomes reachable from @p. */
        kt->task = p;
        kt->node = tsk_fork_get_node(current);
        INIT_LIST_HEAD(&kt->hotplug_node);
        init_completion(&kt->parked);
        init_completion(&kt->exited);

        p->vfork_done = &kt->exited;
        p->worker_private = kt;
        return true;
}

/*
 * Detach and free the struct kthread of @k together with its full name.
 * Does nothing when @k has no struct kthread attached.
 */
void free_kthread_struct(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /* NULL here means the allocation in set_kthread_struct() failed. */
        if (kthread) {
#ifdef CONFIG_BLK_CGROUP
                WARN_ON_ONCE(kthread->blkcg_css);
#endif
                k->worker_private = NULL;
                kfree(kthread->full_name);
                kfree(kthread);
        }
}

/**
 * kthread_should_stop - should this kthread return now?
 *
 * When someone calls kthread_stop() on your kthread, it will be woken
 * and this will return true.  You should then return, and your return
 * value will be passed through to kthread_stop().
 */
bool kthread_should_stop(void)
{
        return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);

static bool __kthread_should_park(struct task_struct *k)
{
        return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags);
}

/**
 * kthread_should_park - should this kthread park now?
 *
 * Reports whether KTHREAD_SHOULD_PARK is set for the current kthread.
 * Once someone calls kthread_park() on your kthread it is woken and this
 * returns true; you should then do the necessary cleanup and call
 * kthread_parkme().
 *
 * Similar to kthread_should_stop(), but this keeps the thread alive
 * and in a park position. kthread_unpark() "restarts" the thread and
 * calls the thread function again.
 */
bool kthread_should_park(void)
{
        bool park = __kthread_should_park(current);

        return park;
}
EXPORT_SYMBOL_GPL(kthread_should_park);

/*
 * Report whether the current task has either KTHREAD_SHOULD_STOP or
 * KTHREAD_SHOULD_PARK set. Unlike kthread_should_stop(), this uses
 * __to_kthread() and simply returns false for a task without a
 * struct kthread.
 */
bool kthread_should_stop_or_park(void)
{
        const unsigned long mask = BIT(KTHREAD_SHOULD_STOP) |
                                   BIT(KTHREAD_SHOULD_PARK);
        struct kthread *kthread = __to_kthread(current);

        return kthread && (kthread->flags & mask);
}

/**
 * kthread_freezable_should_stop - should this freezable kthread return now?
 * @was_frozen: optional out parameter, indicates whether %current was frozen
 *
 * kthread_should_stop() for freezable kthreads, which will enter
 * refrigerator if necessary.  This function is safe from kthread_stop() /
 * freezer deadlock and freezable kthreads should use this function instead
 * of calling try_to_freeze() directly.
 *
 * May sleep. @was_frozen, when non-NULL, is always written: false if the
 * refrigerator was not entered, otherwise the value __refrigerator()
 * returned.
 */
bool kthread_freezable_should_stop(bool *was_frozen)
{
        bool frozen;

        might_sleep();

        frozen = unlikely(freezing(current)) ? __refrigerator(true) : false;
        if (was_frozen)
                *was_frozen = frozen;

        return kthread_should_stop();
}
EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);

/**
 * kthread_func - return the function specified on kthread creation
 * @task: kthread task in question
 *
 * Looks @task up with __to_kthread(), so it is safe to call on a task that
 * is not a kthread.
 *
 * Returns NULL if the task is not a kthread.
 */
void *kthread_func(struct task_struct *task)
{
        struct kthread *kthread = __to_kthread(task);

        return kthread ? kthread->threadfn : NULL;
}
EXPORT_SYMBOL_GPL(kthread_func);

/**
 * kthread_data - return data value specified on kthread creation
 * @task: kthread task in question
 *
 * Return the data value specified when kthread @task was created.
 * The caller is responsible for ensuring the validity of @task when
 * calling this function.
 */
void *kthread_data(struct task_struct *task)
{
        return to_kthread(task)->data;
}
EXPORT_SYMBOL_GPL(kthread_data);

/**
 * kthread_probe_data - speculative version of kthread_data()
 * @task: possible kthread task in question
 *
 * @task could be a kthread task.  Return the data value specified when it
 * was created if accessible.  If @task isn't a kthread task or its data is
 * inaccessible for any reason, %NULL is returned.  This function requires
 * that @task itself is safe to dereference.
 */
void *kthread_probe_data(struct task_struct *task)
{
        void *data = NULL;
        struct kthread *kthread = __to_kthread(task);

        if (!kthread)
                return NULL;

        /* On a faulting read @data is expected to keep its NULL initialiser. */
        copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
        return data;
}

/*
 * Park the calling kthread for as long as KTHREAD_SHOULD_PARK is set in
 * @self->flags.
 *
 * Each pass marks the task TASK_PARKED, re-checks the flag, signals
 * @self->parked and schedules away. Once the flag is found clear the loop
 * ends and the task state is put back to TASK_RUNNING.
 */
static void __kthread_parkme(struct kthread *self)
{
        for (;;) {
                /*
                 * TASK_PARKED is a special state; we must serialize against
                 * possible pending wakeups to avoid store-store collisions on
                 * task->state.
                 *
                 * Such a collision might possibly result in the task state
                 * changing from TASK_PARKED and us failing the
                 * wait_task_inactive() in kthread_park().
                 */
                set_special_state(TASK_PARKED);
                if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                        break;

                /*
                 * Thread is going to call schedule(), do not preempt it,
                 * or the caller of kthread_park() may spend more time in
                 * wait_task_inactive().
                 */
                preempt_disable();
                complete(&self->parked);
                schedule_preempt_disabled();
                preempt_enable();
        }
        __set_current_state(TASK_RUNNING);
}

/* Park the current kthread; see __kthread_parkme() for the details. */
void kthread_parkme(void)
{
        struct kthread *self = to_kthread(current);

        __kthread_parkme(self);
}
EXPORT_SYMBOL_GPL(kthread_parkme);

/**
 * kthread_exit - Make the current kthread return @result to kthread_stop().
 * @result: The integer value to return to kthread_stop().
 *
 * While kthread_exit can be called directly, it exists so that
 * functions which do some additional work in non-modular code such as
 * module_put_and_kthread_exit can be implemented.
 *
 * @result is stored in the struct kthread of the current task. If the
 * thread is queued on the hotplug list it is unlinked under
 * kthreads_hotplug_lock and its preferred affinity mask is freed.
 *
 * Does not return.
 */
void __noreturn kthread_exit(long result)
{
        struct kthread *self = to_kthread(current);
        bool on_hotplug_list;

        self->result = result;

        on_hotplug_list = !list_empty(&self->hotplug_node);
        if (on_hotplug_list) {
                mutex_lock(&kthreads_hotplug_lock);
                list_del(&self->hotplug_node);
                mutex_unlock(&kthreads_hotplug_lock);

                /* kfree() accepts NULL, so no guard is needed here. */
                kfree(self->preferred_affinity);
                self->preferred_affinity = NULL;
        }
        do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);

/**
 * kthread_complete_and_exit - Exit the current kthread.
 * @comp: Completion to complete
 * @code: The integer value to return to kthread_stop().
 *
 * If @comp is non-NULL it is completed first; then @code is passed on to
 * kthread_exit().
 *
 * A kernel thread whose module may be removed after the completion of
 * @comp can use this function to exit safely.
 *
 * Does not return.
 */
void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
{
        if (comp != NULL)
                complete(comp);

        kthread_exit(code);
}
EXPORT_SYMBOL(kthread_complete_and_exit);

/*
 * Compute the CPU mask @kthread should run on and store it in @cpumask.
 *
 * The preferred mask is kthread->preferred_affinity when set, otherwise the
 * CPUs of kthread->node. It is intersected with the HK_TYPE_KTHREAD
 * housekeeping mask; if nothing is left, the housekeeping mask alone is
 * used. With neither a preferred affinity nor a node, a one-time warning is
 * raised and @cpumask is left untouched.
 */
static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
{
        const struct cpumask *pref = kthread->preferred_affinity;

        if (!pref) {
                if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
                        return;
                pref = cpumask_of_node(kthread->node);
        }

        cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
        if (!cpumask_empty(cpumask))
                return;

        cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
}

/*
 * Apply the default affinity to the current kthread.
 *
 * Without a node the thread is simply affined to the HK_TYPE_KTHREAD
 * housekeeping CPUs. Otherwise it is queued on kthreads_hotplug and
 * restricted to the mask computed by kthread_fetch_affinity(), both under
 * kthreads_hotplug_lock. If the temporary cpumask cannot be allocated a
 * one-time warning is raised and the affinity is left unchanged.
 */
static void kthread_affine_node(void)
{
        struct kthread *kthread = to_kthread(current);
        cpumask_var_t mask;

        WARN_ON_ONCE(kthread_is_per_cpu(current));

        if (kthread->node == NUMA_NO_NODE) {
                housekeeping_affine(current, HK_TYPE_KTHREAD);
                return;
        }

        if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
                WARN_ON_ONCE(1);
                return;
        }

        mutex_lock(&kthreads_hotplug_lock);
        WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
        list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
        /*
         * Reading the node cpumask from kthread() is racy, which is fine:
         * - if a CPU goes down concurrently, either the following
         *   set_cpus_allowed_ptr() fails or the scheduler migrates the
         *   thread to the housekeepers afterwards.
         * - if a CPU comes up concurrently, kthreads_online_cpu() takes
         *   care of it.
         */
        kthread_fetch_affinity(kthread, mask);
        set_cpus_allowed_ptr(current, mask);
        mutex_unlock(&kthreads_hotplug_lock);

        free_cpumask_var(mask);
}

/*
 * Entry point of every thread spawned by create_kthread().
 *
 * @_create: the struct kthread_create_info describing the request.
 *
 * Claims the creator's completion, records the thread function, data and
 * name in the thread's struct kthread, reports itself through
 * create->result and then sleeps until first woken. After that wakeup it
 * applies the default affinity when permitted and, unless a stop was
 * already requested, runs threadfn(data). The thread always leaves through
 * kthread_exit(), with -EINTR when threadfn was never invoked.
 */
static int kthread(void *_create)
{
        static const struct sched_param param = { .sched_priority = 0 };
        /* Copy data: it's on kthread's stack */
        struct kthread_create_info *create = _create;
        int (*threadfn)(void *data) = create->threadfn;
        void *data = create->data;
        struct completion *done;
        struct kthread *self;
        int ret;

        self = to_kthread(current);

        /* Release the structure when caller killed by a fatal signal. */
        done = xchg(&create->done, NULL);
        if (!done) {
                /* The creator already gave up: nobody else will free these. */
                kfree(create->full_name);
                kfree(create);
                kthread_exit(-EINTR);
        }

        /* The name is now owned by struct kthread (see free_kthread_struct()). */
        self->full_name = create->full_name;
        self->threadfn = threadfn;
        self->data = data;

        /*
         * The new thread inherited kthreadd's priority and CPU mask. Reset
         * back to default in case they have been changed.
         */
        sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);

        /* OK, tell user we're spawned, wait for stop or wakeup */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        create->result = current;
        /*
         * Thread is going to call schedule(), do not preempt it,
         * or the creator may spend more time in wait_task_inactive().
         */
        preempt_disable();
        /*
         * @create is not touched past this point: the creator frees it once
         * the completion has fired (see __kthread_create_on_node()).
         */
        complete(done);
        schedule_preempt_disabled();
        preempt_enable();

        self->started = 1;

        /* Skip the default affinity when the creator pinned or preferred CPUs. */
        if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
                kthread_affine_node();

        ret = -EINTR;
        if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
                cgroup_kthread_ready();
                __kthread_parkme(self);
                ret = threadfn(data);
        }
        kthread_exit(ret);
}

/*
 * Called from kernel_clone() to get node information for the task about to
 * be created; set_kthread_struct() also calls it with current.
 *
 * Returns @tsk->pref_node_fork when CONFIG_NUMA is enabled and @tsk is
 * kthreadd_task, NUMA_NO_NODE in every other case.
 */
int tsk_fork_get_node(struct task_struct *tsk)
{
        int node = NUMA_NO_NODE;

#ifdef CONFIG_NUMA
        if (tsk == kthreadd_task)
                node = tsk->pref_node_fork;
#endif
        return node;
}

/*
 * Spawn the kernel thread described by @create, with kthread() as its entry
 * point.
 *
 * On success nothing more is done here; the new thread reports back itself.
 * If kernel_thread() fails, the name is freed and either the error is
 * reported to the waiting creator via create->result, or, when the creator
 * has already given up, @create itself is freed.
 */
static void create_kthread(struct kthread_create_info *create)
{
        int pid;

#ifdef CONFIG_NUMA
        /* Read back by tsk_fork_get_node() when asked about kthreadd_task. */
        current->pref_node_fork = create->node;
#endif
        /* We want our own signal handler (we take no signals by default). */
        pid = kernel_thread(kthread, create, create->full_name,
                            CLONE_FS | CLONE_FILES | SIGCHLD);
        if (pid < 0) {
                /* Release the structure when caller killed by a fatal signal. */
                struct completion *done = xchg(&create->done, NULL);

                kfree(create->full_name);
                if (!done) {
                        kfree(create);
                        return;
                }
                /* The creator is still waiting: hand it the error and wake it. */
                create->result = ERR_PTR(pid);
                complete(done);
        }
}

/*
 * Queue a kthread creation request and wait for its outcome.
 *
 * @threadfn: function the new thread will run.
 * @data:     argument passed to @threadfn.
 * @node:     node recorded in the request (see create_kthread()).
 * @namefmt:  printf-style format for the thread's full name.
 * @args:     arguments for @namefmt.
 *
 * Returns create->result as filled in by the other side (the new task, or
 * an ERR_PTR() from create_kthread()), ERR_PTR(-ENOMEM) if the request or
 * its name cannot be allocated, or ERR_PTR(-EINTR) if the caller is killed
 * before the request has been picked up.
 */
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                                    void *data, int node,
                                                    const char namefmt[],
                                                    va_list args)
{
        DECLARE_COMPLETION_ONSTACK(done);
        struct task_struct *task;
        struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                     GFP_KERNEL);

        if (!create)
                return ERR_PTR(-ENOMEM);
        create->threadfn = threadfn;
        create->data = data;
        create->node = node;
        create->done = &done;
        create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
        if (!create->full_name) {
                task = ERR_PTR(-ENOMEM);
                goto free_create;
        }

        /* Publish the request, then wake kthreadd to service it. */
        spin_lock(&kthread_create_lock);
        list_add_tail(&create->list, &kthread_create_list);
        spin_unlock(&kthread_create_lock);

        wake_up_process(kthreadd_task);
        /*
         * Wait for completion in killable state, for I might be chosen by
         * the OOM killer while kthreadd is trying to allocate memory for
         * new kernel thread.
         */
        if (unlikely(wait_for_completion_killable(&done))) {
                /*
                 * If I was killed by a fatal signal before kthreadd (or new
                 * kernel thread) calls complete(), leave the cleanup of this
                 * structure to that thread.
                 */
                if (xchg(&create->done, NULL))
                        return ERR_PTR(-EINTR);
                /*
                 * kthreadd (or new kernel thread) will call complete()
                 * shortly.
                 */
                wait_for_completion(&done);
        }
        task = create->result;
free_create:
        /* Only the request is freed here; full_name is owned elsewhere by now. */
        kfree(create);
        return task;
}

/**
 * kthread_create_on_node - create a kthread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @node: task and thread structures for the thread are allocated on this node
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread.  The thread will be stopped: use wake_up_process() to start
 * it.  See also kthread_run().  The new thread has SCHED_NORMAL policy and
 * is affine to all CPUs.
 *
 * If thread is going to be bound on a particular cpu, give its node
 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either return directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
 * return when 'kthread_should_stop()' is true (which means
 * kthread_stop() has been called).  The return value should be zero
 * or a negative error number; it will be passed to kthread_stop().
 *
 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
        struct task_struct *created;
        va_list ap;

        /* Collect the name arguments and defer to the va_list variant. */
        va_start(ap, namefmt);
        created = __kthread_create_on_node(threadfn, data, node, namefmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL(kthread_create_on_node);

/*
 * Restrict the inactive task @p to @mask and mark it PF_NO_SETAFFINITY.
 *
 * @state is the task state handed to wait_task_inactive(). If that wait
 * fails, a warning is emitted and @p is left untouched.
 */
static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state)
{
        unsigned long irqflags;

        if (wait_task_inactive(p, state)) {
                /* It's safe because the task is inactive. */
                raw_spin_lock_irqsave(&p->pi_lock, irqflags);
                do_set_cpus_allowed(p, mask);
                p->flags |= PF_NO_SETAFFINITY;
                raw_spin_unlock_irqrestore(&p->pi_lock, irqflags);
                return;
        }

        WARN_ON(1);
}

/* Bind @p to the single CPU @cpu; @state is passed to __kthread_bind_mask(). */
static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state)
{
        const struct cpumask *single = cpumask_of(cpu);

        __kthread_bind_mask(p, single, state);
}

/*
 * Bind the not-yet-started kthread @p to the CPUs in @mask.
 *
 * The bind itself is done by __kthread_bind_mask() with
 * TASK_UNINTERRUPTIBLE as the expected task state. A warning is emitted
 * (once) if the kthread has already been started.
 */
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
        struct kthread *kthread = to_kthread(p);
        __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
        /* ->started is set by the thread itself once it has been woken. */
        WARN_ON_ONCE(kthread->started);
}

/**
 * kthread_bind - bind a just-created kthread to a cpu.
 * @p: thread created by kthread_create().
 * @cpu: cpu (might not be online, must be possible) for @p to run on.
 *
 * Description: This function is equivalent to set_cpus_allowed(),
 * except that @cpu doesn't need to be online, and the thread must be
 * stopped (i.e., just returned from kthread_create()).
 */
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
        /* Looked up before the bind so that ->started can be checked after it. */
        struct kthread *kthread = to_kthread(p);
        /* Pins @p to @cpu; @p is expected to be inactive in TASK_UNINTERRUPTIBLE. */
        __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
        /* Binding a kthread that has already started running is flagged once. */
        WARN_ON_ONCE(kthread->started);
}
EXPORT_SYMBOL(kthread_bind);

/**
 * kthread_create_on_cpu - Create a cpu bound kthread
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @cpu: The cpu on which the thread should be bound.
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Description: This helper function creates and names a kernel thread
 */
struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                          void *data, unsigned int cpu,
                                          const char *namefmt)
{
        struct task_struct *task;

        /* @cpu is also the single format argument consumed by @namefmt. */
        task = kthread_create_on_node(threadfn, data, cpu_to_node(cpu),
                                      namefmt, cpu);
        if (!IS_ERR(task)) {
                kthread_bind(task, cpu);
                /* CPU hotplug needs to bind once again when unparking the thread. */
                to_kthread(task)->cpu = cpu;
        }

        return task;
}
EXPORT_SYMBOL(kthread_create_on_cpu);

/*
 * Mark or unmark @k as a per-CPU kthread.
 *
 * A non-negative @cpu records the CPU and sets KTHREAD_IS_PER_CPU; a
 * negative @cpu clears the flag. Warns once if @k is not
 * PF_NO_SETAFFINITY.
 */
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
        struct kthread *kthread = to_kthread(k);

        /* Nothing to mark when @k has no kthread data. */
        if (!kthread)
                return;

        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));

        if (cpu >= 0) {
                kthread->cpu = cpu;
                set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        } else {
                clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
        }
}

/* Report whether @p carries the KTHREAD_IS_PER_CPU flag. */
bool kthread_is_per_cpu(struct task_struct *p)
{
        struct kthread *kt = __to_kthread(p);

        /* Without kthread data, @p cannot be a per-CPU kthread. */
        return kt && test_bit(KTHREAD_IS_PER_CPU, &kt->flags);
}

/**
 * kthread_unpark - unpark a thread created by kthread_create().
 * @k:                thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return false, wakes it, and
 * waits for it to return. If the thread is marked percpu then it is
 * bound to the cpu again.
 */
void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /* No park request pending: nothing to undo. */
        if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))
                return;
        /*
         * Newly created kthread was parked when the CPU was offline.
         * The binding was lost and we need to set it again.
         *
         * The re-bind uses TASK_PARKED as the expected task state.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        /* Clear the request before the wakeup so the thread sees it gone. */
        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /*
         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
         */
        wake_up_state(k, TASK_PARKED);
}
EXPORT_SYMBOL_GPL(kthread_unpark);

/**
 * kthread_park - park a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_park() for @k to return true, wakes it, and
 * waits for it to return. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will park without
 * calling threadfn().
 *
 * Returns 0 if the thread is parked, -ENOSYS if the thread exited,
 * -EBUSY if a park request is already pending for the thread.
 * If called by the kthread itself just the park bit is set.
 */
int kthread_park(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /* Parking a thread that is already exiting is a caller bug. */
        if (WARN_ON(k->flags & PF_EXITING))
                return -ENOSYS;

        /* A second park request while one is still pending is rejected. */
        if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
                return -EBUSY;

        set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        /* When called by the kthread itself, only the bit is set (no wait). */
        if (k != current) {
                wake_up_process(k);
                /*
                 * Wait for __kthread_parkme() to complete(), this means we
                 * _will_ have TASK_PARKED and are about to call schedule().
                 */
                wait_for_completion(&kthread->parked);
                /*
                 * Now wait for that schedule() to complete and the task to
                 * get scheduled out.
                 */
                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kthread_park);

/**
 * kthread_stop - stop a thread created by kthread_create().
 * @k: thread created by kthread_create().
 *
 * Sets kthread_should_stop() for @k to return true, wakes it, and
 * waits for it to exit. This can also be called after kthread_create()
 * instead of calling wake_up_process(): the thread will exit without
 * calling threadfn().
 *
 * If threadfn() may call kthread_exit() itself, the caller must ensure
 * task_struct can't go away.
 *
 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
 * was never called.
 */
int kthread_stop(struct task_struct *k)
{
        struct kthread *kthread;
        int ret;

        trace_sched_kthread_stop(k);

        /* Hold a reference on @k for the whole stop sequence. */
        get_task_struct(k);
        kthread = to_kthread(k);
        /* Publish the stop request before any wakeup below. */
        set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
        /* A parked thread has to be released so that it can see the request. */
        kthread_unpark(k);
        /*
         * NOTE(review): presumably set so that signal_pending()-style checks
         * in the thread also notice the stop request -- confirm.
         */
        set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL);
        wake_up_process(k);
        /* Block until the thread has exited, then pick up its result. */
        wait_for_completion(&kthread->exited);
        ret = kthread->result;
        /* Drop the reference taken above. */
        put_task_struct(k);

        trace_sched_kthread_stop_ret(ret);
        return ret;
}
EXPORT_SYMBOL(kthread_stop);

/**
 * kthread_stop_put - stop a thread and put its task struct
 * @k: thread created by kthread_create().
 *
 * Stops a thread created by kthread_create() and put its task_struct.
 * Only use when holding an extra task struct reference obtained by
 * calling get_task_struct().
 */
int kthread_stop_put(struct task_struct *k)
{
        /* Stop the thread first and keep the value kthread_stop() returned. */
        const int exit_code = kthread_stop(k);

        /* Then drop the extra reference the caller was holding. */
        put_task_struct(k);

        return exit_code;
}
EXPORT_SYMBOL(kthread_stop_put);

/*
 * Main loop of the kthreadd task: sleep until creation requests show up on
 * kthread_create_list, then spawn one kernel thread per request via
 * create_kthread().
 *
 * @unused: not used.
 *
 * The loop never terminates; the trailing return is not reached.
 */
int kthreadd(void *unused)
{
        static const char comm[TASK_COMM_LEN] = "kthreadd";
        struct task_struct *tsk = current;

        /* Setup a clean context for our children to inherit. */
        set_task_comm(tsk, comm);
        ignore_signals(tsk);
        set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
        set_mems_allowed(node_states[N_MEMORY]);

        current->flags |= PF_NOFREEZE;
        cgroup_init_kthreadd();

        for (;;) {
                /*
                 * Set the sleeping state before checking the list, so a
                 * request queued (and a wakeup sent) in between is not lost.
                 */
                set_current_state(TASK_INTERRUPTIBLE);
                if (list_empty(&kthread_create_list))
                        schedule();
                __set_current_state(TASK_RUNNING);

                spin_lock(&kthread_create_lock);
                while (!list_empty(&kthread_create_list)) {
                        struct kthread_create_info *create;

                        /* Detach the oldest request while holding the lock. */
                        create = list_entry(kthread_create_list.next,
                                            struct kthread_create_info, list);
                        list_del_init(&create->list);
                        /* The lock is dropped around the actual thread creation. */
                        spin_unlock(&kthread_create_lock);

                        create_kthread(create);

                        spin_lock(&kthread_create_lock);
                }
                spin_unlock(&kthread_create_lock);
        }

        return 0;
}

/*
 * Record @mask as the preferred affinity of the not-yet-started kthread @p
 * and apply the resulting affinity right away.
 *
 * The kthread is also added to the kthreads_hotplug list, which
 * kthreads_online_cpu() walks to re-apply affinities.
 *
 * Returns 0 on success, -EINVAL (with a warning) if @p is not inactive or
 * has already started, -ENOMEM if an allocation fails.
 */
int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
{
        struct kthread *kthread = to_kthread(p);
        cpumask_var_t affinity;
        unsigned long flags;
        int ret = 0;

        if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
                WARN_ON(1);
                return -EINVAL;
        }

        /* A preferred affinity is expected to be set at most once. */
        WARN_ON_ONCE(kthread->preferred_affinity);

        /* Scratch mask for the computed affinity; freed at "out". */
        if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
                return -ENOMEM;

        kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
        if (!kthread->preferred_affinity) {
                ret = -ENOMEM;
                goto out;
        }

        /* List membership and the preferred mask are updated under the hotplug lock. */
        mutex_lock(&kthreads_hotplug_lock);
        cpumask_copy(kthread->preferred_affinity, mask);
        WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
        list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
        kthread_fetch_affinity(kthread, affinity);

        /* It's safe because the task is inactive. */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        do_set_cpus_allowed(p, affinity);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);

        mutex_unlock(&kthreads_hotplug_lock);
out:
        free_cpumask_var(affinity);

        return ret;
}

/*
 * Re-affine kthreads according to their preferences
 * and the newly online CPU. The CPU down part is handled
 * by select_fallback_rq() which default re-affines to
 * housekeepers from other nodes in case the preferred
 * affinity doesn't apply anymore.
 */
static int kthreads_online_cpu(unsigned int cpu)
{
        cpumask_var_t affinity;
        struct kthread *k;
        int ret = 0;

        /* Scope guard: the mutex stays held until the function returns. */
        guard(mutex)(&kthreads_hotplug_lock);

        /* No kthread registered a preferred affinity: nothing to do. */
        if (list_empty(&kthreads_hotplug))
                return ret;

        if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
                return -ENOMEM;

        list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
                if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
                                 kthread_is_per_cpu(k->task))) {
                        /* Remember the error but keep handling the others. */
                        ret = -EINVAL;
                } else {
                        kthread_fetch_affinity(k, affinity);
                        set_cpus_allowed_ptr(k->task, affinity);
                }
        }

        free_cpumask_var(affinity);

        return ret;
}

/* Register kthreads_online_cpu() as the CPUHP_AP_KTHREADS_ONLINE callback. */
static int kthreads_init(void)
{
        int rc;

        /* Only a startup callback is installed; the teardown slot is NULL. */
        rc = cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
                               kthreads_online_cpu, NULL);
        return rc;
}
early_initcall(kthreads_init);

/*
 * Initialise @worker: zero it, set up both work lists, and initialise its
 * lock with the lockdep class @key and name @name.
 */
void __kthread_init_worker(struct kthread_worker *worker,
                                const char *name,
                                struct lock_class_key *key)
{
        /* Start from an all-zero worker before setting up its members. */
        memset(worker, 0, sizeof(*worker));

        INIT_LIST_HEAD(&worker->work_list);
        INIT_LIST_HEAD(&worker->delayed_work_list);

        raw_spin_lock_init(&worker->lock);
        lockdep_set_class_and_name(&worker->lock, key, name);
}
EXPORT_SYMBOL_GPL(__kthread_init_worker);

/**
 * kthread_worker_fn - kthread function to process kthread_worker
 * @worker_ptr: pointer to initialized kthread_worker
 *
 * This function implements the main cycle of kthread worker. It processes
 * work_list until it is stopped with kthread_stop(). It sleeps when the queue
 * is empty.
 *
 * The works are not allowed to keep any locks held, or to leave preemption
 * or interrupts disabled, when they finish. A safe point for freezing is
 * defined after one work finishes and before the next one is started.
 *
 * Also the works must not be handled by more than one worker at the same time,
 * see also kthread_queue_work().
 */
int kthread_worker_fn(void *worker_ptr)
{
        struct kthread_worker *worker = worker_ptr;
        struct kthread_work *work;

        /*
         * FIXME: Update the check and remove the assignment when all kthread
         * worker users are created using kthread_create_worker*() functions.
         */
        WARN_ON(worker->task && worker->task != current);
        worker->task = current;

        if (worker->flags & KTW_FREEZABLE)
                set_freezable();

repeat:
        /*
         * The sleeping state is set before the stop and work-list checks so
         * that a wakeup arriving in between is not lost.
         */
        set_current_state(TASK_INTERRUPTIBLE);        /* mb paired w/ kthread_stop */

        if (kthread_should_stop()) {
                __set_current_state(TASK_RUNNING);
                /* Detach the task under the lock; queuers test worker->task. */
                raw_spin_lock_irq(&worker->lock);
                worker->task = NULL;
                raw_spin_unlock_irq(&worker->lock);
                return 0;
        }

        /* Dequeue the first pending work, if any, and publish it as current. */
        work = NULL;
        raw_spin_lock_irq(&worker->lock);
        if (!list_empty(&worker->work_list)) {
                work = list_first_entry(&worker->work_list,
                                        struct kthread_work, node);
                list_del_init(&work->node);
        }
        worker->current_work = work;
        raw_spin_unlock_irq(&worker->lock);

        if (work) {
                /* Saved up front: @work must not be dereferenced after it ran. */
                kthread_work_func_t func = work->func;
                __set_current_state(TASK_RUNNING);
                trace_sched_kthread_work_execute_start(work);
                work->func(work);
                /*
                 * Avoid dereferencing work after this point.  The trace
                 * event only cares about the address.
                 */
                trace_sched_kthread_work_execute_end(work, func);
        } else if (!freezing(current)) {
                /* Nothing queued: sleep until woken. */
                schedule();
        } else {
                /*
                 * Handle the case where the current remains
                 * TASK_INTERRUPTIBLE. try_to_freeze() expects
                 * the current to be TASK_RUNNING.
                 */
                __set_current_state(TASK_RUNNING);
        }

        /* Safe point between two works: freeze and/or yield here. */
        try_to_freeze();
        cond_resched();
        goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);

/*
 * Allocate and initialise a kthread_worker, then create the kthread that
 * runs kthread_worker_fn() on it.
 *
 * @flags: stored in worker->flags.
 * @node: passed through to __kthread_create_on_node().
 * @namefmt, @args: printf-style name of the kthread.
 *
 * Returns the new worker, ERR_PTR(-ENOMEM) if the worker cannot be
 * allocated, or the error reported by __kthread_create_on_node().
 */
static __printf(3, 0) struct kthread_worker *
__kthread_create_worker_on_node(unsigned int flags, int node,
                                const char namefmt[], va_list args)
{
        struct kthread_worker *worker = kzalloc(sizeof(*worker), GFP_KERNEL);
        struct task_struct *task;

        if (!worker)
                return ERR_PTR(-ENOMEM);

        kthread_init_worker(worker);

        task = __kthread_create_on_node(kthread_worker_fn, worker,
                                        node, namefmt, args);
        if (IS_ERR(task)) {
                /* No thread was created, so the worker has no user yet. */
                kfree(worker);
                return ERR_CAST(task);
        }

        worker->flags = flags;
        worker->task = task;

        return worker;
}

/**
 * kthread_create_worker_on_node - create a kthread worker
 * @flags: flags modifying the default behavior of the worker
 * @node: task structure for the thread is allocated on this node
 * @namefmt: printf-style name for the kthread worker (task).
 *
 * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...)
{
        struct kthread_worker *created;
        va_list ap;

        /* Collect the name arguments and defer to the va_list variant. */
        va_start(ap, namefmt);
        created = __kthread_create_worker_on_node(flags, node, namefmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL(kthread_create_worker_on_node);

/**
 * kthread_create_worker_on_cpu - create a kthread worker and bind it
 *        to a given CPU and the associated NUMA node.
 * @cpu: CPU number
 * @flags: flags modifying the default behavior of the worker
 * @namefmt: printf-style name for the thread. Format is restricted
 *             to "name.*%u". Code fills in cpu number.
 *
 * Use a valid CPU number if you want to bind the kthread worker
 * to the given CPU and the associated NUMA node.
 *
 * A good practice is to add the cpu number also into the worker name.
 * For example, use kthread_create_worker_on_cpu(cpu, 0, "helper/%u");
 * the cpu number is filled in for the format specifier by this function.
 *
 * CPU hotplug:
 * The kthread worker API is simple and generic. It just provides a way
 * to create, use, and destroy workers.
 *
 * It is up to the API user how to handle CPU hotplug. They have to decide
 * how to handle pending work items, prevent queuing new ones, and
 * restore the functionality when the CPU goes off and on. There are a
 * few catches:
 *
 *    - CPU affinity gets lost when it is scheduled on an offline CPU.
 *
 *    - The worker might not exist when the CPU was off when the user
 *      created the workers.
 *
 * Good practice is to implement two CPU hotplug callbacks and to
 * destroy/create the worker when the CPU goes down/up.
 *
 * Return:
 * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
 * when the needed structures could not get allocated, and ERR_PTR(-EINTR)
 * when the caller was killed by a fatal signal.
 */
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
                             const char namefmt[])
{
        struct kthread_worker *worker;

        /* @cpu is also the single format argument consumed by @namefmt. */
        worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu);
        if (IS_ERR(worker))
                return worker;

        /* Pin the freshly created worker thread to @cpu. */
        kthread_bind(worker->task, cpu);
        return worker;
}
EXPORT_SYMBOL(kthread_create_worker_on_cpu);

/*
 * Returns true when the work could not be queued at the moment.
 * It happens when it is already pending in a worker list
 * or when it is being cancelled.
 */
static inline bool queuing_blocked(struct kthread_worker *worker,
                                   struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);

        return !list_empty(&work->node) || work->canceling;
}

/*
 * Debug checks run before @work is linked into one of @worker's lists.
 * Must be called with worker->lock held. Only warns; never rejects.
 */
static void kthread_insert_work_sanity_check(struct kthread_worker *worker,
                                             struct kthread_work *work)
{
        lockdep_assert_held(&worker->lock);
        /* The work must not be linked on any list yet. */
        WARN_ON_ONCE(!list_empty(&work->node));
        /* Do not use a work with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker && work->worker != worker);
}

/* insert @work before @pos in @worker */
static void kthread_insert_work(struct kthread_worker *worker,
                                struct kthread_work *work,
                                struct list_head *pos)
{
        kthread_insert_work_sanity_check(worker, work);

        trace_sched_kthread_work_queue_work(worker, work);

        /* Link @work in front of @pos and record which worker owns it. */
        list_add_tail(&work->node, pos);
        work->worker = worker;

        /* No wakeup is sent while the worker has a current work item. */
        if (worker->current_work)
                return;

        /* worker->task may be NULL, in which case there is nobody to wake. */
        if (likely(worker->task))
                wake_up_process(worker->task);
}

/**
 * kthread_queue_work - queue a kthread_work
 * @worker: target kthread_worker
 * @work: kthread_work to queue
 *
 * Queue @work to work processor @worker for async execution.  @worker
 * must have been created with kthread_create_worker().  Returns %true
 * if @work was successfully queued, %false if it was already pending
 * or is being canceled.
 *
 * Reinitialize the work if it needs to be used by another worker.
 * For example, when the worker was stopped and started again.
 */
bool kthread_queue_work(struct kthread_worker *worker,
                        struct kthread_work *work)
{
        unsigned long irqflags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irqflags);

        /* Append to the tail of the work list unless queuing is blocked. */
        queued = !queuing_blocked(worker, work);
        if (queued)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, irqflags);

        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_work);

/**
 * kthread_delayed_work_timer_fn - callback that queues the associated kthread
 *        delayed work when the timer expires.
 * @t: pointer to the expired timer
 *
 * The format of the function is defined by struct timer_list.
 * It should have been called from irqsafe timer with irq already off.
 */
void kthread_delayed_work_timer_fn(struct timer_list *t)
{
        /* Recover the delayed work that embeds the expired timer. */
        struct kthread_delayed_work *dwork = from_timer(dwork, t, timer);
        struct kthread_work *work = &dwork->work;
        struct kthread_worker *worker = work->worker;
        unsigned long flags;

        /*
         * This might happen when a pending work is reinitialized.
         * It means that it is used a wrong way.
         */
        if (WARN_ON_ONCE(!worker))
                return;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        /* Move the work from worker->delayed_work_list. */
        WARN_ON_ONCE(list_empty(&work->node));
        list_del_init(&work->node);
        /* A work that is being canceled is only unlinked, not queued. */
        if (!work->canceling)
                kthread_insert_work(worker, work, &worker->work_list);

        raw_spin_unlock_irqrestore(&worker->lock, flags);
}
EXPORT_SYMBOL(kthread_delayed_work_timer_fn);

/*
 * Queue @dwork on @worker, either immediately (@delay == 0) or by arming
 * its timer to fire after @delay jiffies.
 *
 * The callers in this file hold worker->lock and have already checked
 * that queuing is allowed.
 */
static void __kthread_queue_delayed_work(struct kthread_worker *worker,
                                         struct kthread_delayed_work *dwork,
                                         unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;

        WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);

        if (delay) {
                /* Be paranoid and try to detect possible races already now. */
                kthread_insert_work_sanity_check(worker, work);

                /* Park the work on the delayed list until the timer fires. */
                list_add(&work->node, &worker->delayed_work_list);
                work->worker = worker;
                timer->expires = jiffies + delay;
                add_timer(timer);
                return;
        }

        /*
         * If @delay is 0, queue @dwork->work immediately.  This is for
         * both optimization and correctness.  The earliest @timer can
         * expire is on the closest next tick and delayed_work users depend
         * on that there's no such delay when @delay is 0.
         */
        kthread_insert_work(worker, work, &worker->work_list);
}

/**
 * kthread_queue_delayed_work - queue the associated kthread work
 *        after a delay.
 * @worker: target kthread_worker
 * @dwork: kthread_delayed_work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If the work has not been pending it starts a timer that will queue
 * the work after the given @delay. If @delay is zero, it queues the
 * work immediately.
 *
 * Return: %false if the @work has already been pending. It means that
 * either the timer was running or the work was queued. It returns %true
 * otherwise.
 */
bool kthread_queue_delayed_work(struct kthread_worker *worker,
                                struct kthread_delayed_work *dwork,
                                unsigned long delay)
{
        unsigned long irqflags;
        bool queued;

        raw_spin_lock_irqsave(&worker->lock, irqflags);

        /* Only an idle work (not pending, not being canceled) is queued. */
        queued = !queuing_blocked(worker, &dwork->work);
        if (queued)
                __kthread_queue_delayed_work(worker, dwork, delay);

        raw_spin_unlock_irqrestore(&worker->lock, irqflags);
        return queued;
}
EXPORT_SYMBOL_GPL(kthread_queue_delayed_work);

/*
 * Helper item used by kthread_flush_work(): a work whose callback,
 * kthread_flush_work_fn(), completes @done when the worker runs it.
 */
struct kthread_flush_work {
        /* The work item that gets inserted into the worker's list. */
        struct kthread_work        work;
        /* Completed by kthread_flush_work_fn(); the flusher waits on it. */
        struct completion        done;
};

/* Work callback: complete the flush item that embeds @work. */
static void kthread_flush_work_fn(struct kthread_work *work)
{
        complete(&container_of(work, struct kthread_flush_work, work)->done);
}

/**
 * kthread_flush_work - flush a kthread_work
 * @work: work to flush
 *
 * If @work is queued or executing, wait for it to finish execution.
 */
void kthread_flush_work(struct kthread_work *work)
{
        /* On-stack marker work; its callback completes fwork.done. */
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };
        struct kthread_worker *worker;
        bool noop = false;

        /* A work with no worker recorded has nothing to flush. */
        worker = work->worker;
        if (!worker)
                return;

        raw_spin_lock_irq(&worker->lock);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (!list_empty(&work->node))
                /* Still linked: place the marker right after @work. */
                kthread_insert_work(worker, &fwork.work, work->node.next);
        else if (worker->current_work == work)
                /* Currently executing: place the marker at the list head. */
                kthread_insert_work(worker, &fwork.work,
                                    worker->work_list.next);
        else
                /* Neither queued nor running: nothing to wait for. */
                noop = true;

        raw_spin_unlock_irq(&worker->lock);

        /* fwork lives on this stack, so wait here until the marker has run. */
        if (!noop)
                wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_work);

/*
 * Make sure that the timer is neither set nor running and could
 * not manipulate the work list_head any longer.
 *
 * The function is called under worker->lock. The lock is temporarily
 * released but the timer can't be set again in the meantime.
 */
static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
                                              unsigned long *flags)
{
        /* @work is embedded in a kthread_delayed_work; recover the container. */
        struct kthread_delayed_work *dwork =
                container_of(work, struct kthread_delayed_work, work);
        struct kthread_worker *worker = work->worker;

        /*
         * timer_delete_sync() must be called to make sure that the timer
         * callback is not running. The lock must be temporarily released
         * to avoid a deadlock with the callback. In the meantime,
         * any queuing is blocked by setting the canceling counter.
         *
         * @flags holds the caller's saved IRQ state for worker->lock; it is
         * used for the unlock and refreshed by the re-lock below.
         */
        work->canceling++;
        raw_spin_unlock_irqrestore(&worker->lock, *flags);
        timer_delete_sync(&dwork->timer);
        raw_spin_lock_irqsave(&worker->lock, *flags);
        work->canceling--;
}

/*
 * This function removes the work from the worker queue.
 *
 * It is called under worker->lock. The caller must make sure that
 * the timer used by delayed work is not running, e.g. by calling
 * kthread_cancel_delayed_work_timer().
 *
 * The work might still be in use when this function finishes. See the
 * current_work processed by the worker.
 *
 * Return: %true if @work was pending and successfully canceled,
 *        %false if @work was not pending
 */
static bool __kthread_cancel_work(struct kthread_work *work)
{
        /* Not linked on any list: there is nothing pending to cancel. */
        if (list_empty(&work->node))
                return false;

        /*
         * Unlink the work from whichever worker list it is on. It might
         * either be worker->work_list or worker->delayed_work_list.
         */
        list_del_init(&work->node);
        return true;
}

/**
 * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work
 * @worker: kthread worker to use
 * @dwork: kthread delayed work to queue
 * @delay: number of jiffies to wait before queuing
 *
 * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise,
 * modify @dwork's timer so that it expires after @delay. If @delay is zero,
 * @dwork is guaranteed to be queued immediately.
 *
 * Return: %false if @dwork was idle and queued, %true otherwise.
 *
 * A special case is when the work is being canceled in parallel.
 * It might be caused either by the real kthread_cancel_delayed_work_sync()
 * or yet another kthread_mod_delayed_work() call. We let the other command
 * win and return %true here. The return value can be used for reference
 * counting and the number of queued works stays the same. Anyway, the caller
 * is supposed to synchronize these operations a reasonable way.
 *
 * This function is safe to call from any context including IRQ handler.
 * See __kthread_cancel_work() and kthread_delayed_work_timer_fn()
 * for details.
 */
bool kthread_mod_delayed_work(struct kthread_worker *worker,
                              struct kthread_delayed_work *dwork,
                              unsigned long delay)
{
        struct kthread_work *work = &dwork->work;
        unsigned long flags;
        int ret;

        raw_spin_lock_irqsave(&worker->lock, flags);

        /* Do not bother with canceling when never queued. */
        if (!work->worker) {
                ret = false;
                goto fast_queue;
        }

        /* Work must not be used with >1 worker, see kthread_queue_work() */
        WARN_ON_ONCE(work->worker != worker);

        /*
         * Temporary cancel the work but do not fight with another command
         * that is canceling the work as well.
         *
         * It is a bit tricky because of possible races with another
         * mod_delayed_work() and cancel_delayed_work() callers.
         *
         * The timer must be canceled first because worker->lock is released
         * when doing so. But the work can be removed from the queue (list)
         * only when it can be queued again so that the return value can
         * be used for reference counting.
         */
        kthread_cancel_delayed_work_timer(work, &flags);
        if (work->canceling) {
                /* The number of works in the queue does not change. */
                ret = true;
                goto out;
        }
        ret = __kthread_cancel_work(work);

fast_queue:
        __kthread_queue_delayed_work(worker, dwork, delay);
out:
        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return ret;
}
EXPORT_SYMBOL_GPL(kthread_mod_delayed_work);

/*
 * Common implementation of the synchronous cancel helpers.
 *
 * Removes @work from its worker's list and, if the worker is executing
 * it right now, waits for that execution to finish. @is_dwork selects
 * whether the delayed-work timer has to be canceled first.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork)
{
        struct kthread_worker *worker = work->worker;
        unsigned long flags;
        bool was_pending;

        /* No worker recorded: nothing to cancel and nothing to wait for. */
        if (!worker)
                return false;

        raw_spin_lock_irqsave(&worker->lock, flags);
        /* Work must not be used with >1 worker, see kthread_queue_work(). */
        WARN_ON_ONCE(work->worker != worker);

        if (is_dwork)
                kthread_cancel_delayed_work_timer(work, &flags);

        was_pending = __kthread_cancel_work(work);

        if (worker->current_work == work) {
                /*
                 * The work is in progress, so we must wait with the lock
                 * released. Meanwhile, the raised canceling counter blocks
                 * any queuing.
                 */
                work->canceling++;
                raw_spin_unlock_irqrestore(&worker->lock, flags);
                kthread_flush_work(work);
                raw_spin_lock_irqsave(&worker->lock, flags);
                work->canceling--;
        }

        raw_spin_unlock_irqrestore(&worker->lock, flags);
        return was_pending;
}

/**
 * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
 * @work: the kthread work to cancel
 *
 * Cancel @work and wait until its execution has finished. This works even
 * for a work that re-queues itself. Once this function returns, @work is
 * guaranteed to be neither pending nor executing on any CPU.
 *
 * Do not call kthread_cancel_work_sync(&delayed_work->work) for
 * delayed_work's; use kthread_cancel_delayed_work_sync() instead.
 *
 * The caller must ensure that the worker on which @work was last
 * queued can't be destroyed before this function returns.
 *
 * Return: %true if @work was pending, %false otherwise.
 */
bool kthread_cancel_work_sync(struct kthread_work *work)
{
        /* A plain work has no timer to cancel. */
        const bool is_dwork = false;

        return __kthread_cancel_work_sync(work, is_dwork);
}
EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);

/**
 * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and
 *        wait for it to finish.
 * @dwork: the kthread delayed work to cancel
 *
 * The delayed-work counterpart of kthread_cancel_work_sync().
 *
 * Return: %true if @dwork was pending, %false otherwise.
 */
bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork)
{
        struct kthread_work *work = &dwork->work;

        /* Ask the common helper to cancel the timer as well. */
        return __kthread_cancel_work_sync(work, true);
}
EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync);

/**
 * kthread_flush_worker - flush all current works on a kthread_worker
 * @worker: worker to flush
 *
 * Wait until all currently executing or pending works on @worker are
 * finished.
 *
 * This is done by queuing an on-stack flush work on @worker and sleeping
 * until its completion is signalled.
 */
void kthread_flush_worker(struct kthread_worker *worker)
{
        /*
         * The flush work lives on this stack frame; this function does not
         * return before wait_for_completion() on fwork.done returns.
         * NOTE(review): kthread_flush_work_fn() presumably completes
         * fwork.done - confirm against its definition.
         */
        struct kthread_flush_work fwork = {
                KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn),
                COMPLETION_INITIALIZER_ONSTACK(fwork.done),
        };

        kthread_queue_work(worker, &fwork.work);
        wait_for_completion(&fwork.done);
}
EXPORT_SYMBOL_GPL(kthread_flush_worker);

/**
 * kthread_destroy_worker - destroy a kthread worker
 * @worker: worker to be destroyed
 *
 * Flush and destroy @worker. A simple flush is sufficient because the
 * kthread worker API is used only in trivial scenarios; no multi-step
 * state machines are needed.
 *
 * Delayed work is not handled here: the caller is responsible for
 * queuing or canceling all delayed work items before invoking this
 * function.
 */
void kthread_destroy_worker(struct kthread_worker *worker)
{
        struct task_struct *worker_task = worker->task;

        /* Without a task there is nothing to flush or stop. */
        if (WARN_ON(!worker_task))
                return;

        kthread_flush_worker(worker);
        kthread_stop(worker_task);

        /* Both lists are expected to be empty by now. */
        WARN_ON(!list_empty(&worker->delayed_work_list));
        WARN_ON(!list_empty(&worker->work_list));

        kfree(worker);
}
EXPORT_SYMBOL(kthread_destroy_worker);

/**
 * kthread_use_mm - make the calling kthread operate on an address space
 * @mm: address space to operate on
 *
 * Installs @mm as both ->mm and ->active_mm of the current task and
 * switches to it. A reference on @mm is taken with mmgrab(), and the
 * lazy-TLB reference on the previous active_mm is dropped at the end.
 *
 * Warns once if the caller is not a kthread (PF_KTHREAD clear) or if it
 * already has an mm.
 */
void kthread_use_mm(struct mm_struct *mm)
{
        struct mm_struct *active_mm;
        struct task_struct *tsk = current;

        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(tsk->mm);

        /*
         * It is possible for mm to be the same as tsk->active_mm, but
         * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm),
         * because these references are not equivalent.
         */
        mmgrab(mm);

        task_lock(tsk);
        /* Hold off tlb flush IPIs while switching mm's */
        local_irq_disable();
        /* Remember the old active_mm so its reference can be dropped below. */
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        membarrier_update_current_mm(mm);
        switch_mm_irqs_off(active_mm, mm, tsk);
        local_irq_enable();
        task_unlock(tsk);
        /* Only invoked when finish_arch_post_lock_switch is defined as a macro. */
#ifdef finish_arch_post_lock_switch
        finish_arch_post_lock_switch();
#endif

        /*
         * When a kthread starts operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after storing to tsk->mm, before accessing
         * user-space memory. A full memory barrier for membarrier
         * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
         * mmdrop_lazy_tlb().
         */
        mmdrop_lazy_tlb(active_mm);
}
EXPORT_SYMBOL_GPL(kthread_use_mm);

/**
 * kthread_unuse_mm - reverse the effect of kthread_use_mm()
 * @mm: address space to operate on
 *
 * Clears ->mm of the current task while leaving ->active_mm pointing at
 * @mm, now held through a lazy-TLB reference, and finally drops a
 * reference on @mm with mmdrop().
 *
 * Warns once if the caller is not a kthread (PF_KTHREAD clear) or if it
 * has no mm.
 */
void kthread_unuse_mm(struct mm_struct *mm)
{
        struct task_struct *tsk = current;

        WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
        WARN_ON_ONCE(!tsk->mm);

        task_lock(tsk);
        /*
         * When a kthread stops operating on an address space, the loop
         * in membarrier_{private,global}_expedited() may not observe
         * that tsk->mm, and not issue an IPI. Membarrier requires a
         * memory barrier after accessing user-space memory, before
         * clearing tsk->mm.
         */
        smp_mb__after_spinlock();
        local_irq_disable();
        tsk->mm = NULL;
        membarrier_update_current_mm(NULL);
        /* Take the lazy-TLB reference before the mmdrop() below. */
        mmgrab_lazy_tlb(mm);
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
        local_irq_enable();
        task_unlock(tsk);

        mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);

#ifdef CONFIG_BLK_CGROUP
/**
 * kthread_associate_blkcg - associate blkcg to current kthread
 * @css: the cgroup info
 *
 * Current thread must be a kthread. The thread runs jobs on behalf of
 * other threads, and in some cases those jobs should carry the cgroup
 * info of the original thread rather than that of the current thread.
 * This function stores the original thread's cgroup info in the current
 * kthread context for later retrieval.
 *
 * A previously stored css has its reference dropped. A NULL @css only
 * clears the association.
 */
void kthread_associate_blkcg(struct cgroup_subsys_state *css)
{
        struct kthread *self = NULL;

        if (current->flags & PF_KTHREAD)
                self = to_kthread(current);
        if (!self)
                return;

        /* Release the old association, if there is one. */
        if (self->blkcg_css) {
                css_put(self->blkcg_css);
                self->blkcg_css = NULL;
        }

        if (!css)
                return;

        css_get(css);
        self->blkcg_css = css;
}
EXPORT_SYMBOL(kthread_associate_blkcg);

/**
 * kthread_blkcg - get associated blkcg css of current kthread
 *
 * Current thread must be a kthread.
 *
 * Return: the css stored in the current kthread's context, or NULL when
 * the current task is not a kthread or has no kthread context.
 */
struct cgroup_subsys_state *kthread_blkcg(void)
{
        struct kthread *self;

        if (!(current->flags & PF_KTHREAD))
                return NULL;

        self = to_kthread(current);
        return self ? self->blkcg_css : NULL;
}
#endif













  280 















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ARM64_KVM_NESTED_H
#define __ARM64_KVM_NESTED_H

#include <linux/bitfield.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_pgtable.h>

static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu)
{
        return (!__is_defined(__KVM_NVHE_HYPERVISOR__) &&
                cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) &&
                vcpu_has_feature(vcpu, KVM_ARM_VCPU_HAS_EL2));
}

/* Translation helpers from non-VHE EL2 to EL1 */
/* Extract the PS field of @tcr_el2 and place it at the TCR_EL1 IPS position. */
static inline u64 tcr_el2_ps_to_tcr_el1_ips(u64 tcr_el2)
{
        u64 ps = FIELD_GET(TCR_EL2_PS_MASK, tcr_el2);

        return ps << TCR_IPS_SHIFT;
}

/*
 * Build a TCR_EL1 value from a (non-VHE) TCR_EL2 value: TTBR1_EL1 walks
 * are disabled, TBI and PS are moved to their EL1 positions, and the
 * TG0/ORGN0/IRGN0/T0SZ fields are carried over unchanged.
 */
static inline u64 translate_tcr_el2_to_tcr_el1(u64 tcr)
{
        u64 tcr_el1 = TCR_EPD1_MASK;        /* disable TTBR1_EL1 */

        if (tcr & TCR_EL2_TBI)
                tcr_el1 |= TCR_TBI0;

        tcr_el1 |= tcr_el2_ps_to_tcr_el1_ips(tcr);
        tcr_el1 |= tcr & TCR_EL2_TG0_MASK;
        tcr_el1 |= tcr & TCR_EL2_ORGN0_MASK;
        tcr_el1 |= tcr & TCR_EL2_IRGN0_MASK;
        tcr_el1 |= tcr & TCR_EL2_T0SZ_MASK;

        return tcr_el1;
}

/*
 * Build a CPACR_EL1 value from @cptr_el2. TTA is carried over, FPEN and
 * ZEN are set when the matching CPTR_EL2 trap bit (TFP, TZ) is clear,
 * and TCPAC/TAM are copied verbatim, on top of CPACR_EL1_RES1.
 */
static inline u64 translate_cptr_el2_to_cpacr_el1(u64 cptr_el2)
{
        u64 cpacr_el1 = CPACR_EL1_RES1;

        cpacr_el1 |= (cptr_el2 & CPTR_EL2_TTA) ? CPACR_EL1_TTA : 0;
        cpacr_el1 |= (cptr_el2 & CPTR_EL2_TFP) ? 0 : CPACR_EL1_FPEN;
        cpacr_el1 |= (cptr_el2 & CPTR_EL2_TZ) ? 0 : CPACR_EL1_ZEN;
        cpacr_el1 |= cptr_el2 & (CPTR_EL2_TCPAC | CPTR_EL2_TAM);

        return cpacr_el1;
}

/*
 * Build an SCTLR_EL1 value from an SCTLR_EL2 value: only the minimal set
 * of supported bits is preserved, and SCTLR_EL1_RES1 is forced on.
 */
static inline u64 translate_sctlr_el2_to_sctlr_el1(u64 val)
{
        const u64 supported = SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C |
                              SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB |
                              SCTLR_ELx_WXN | SCTLR_ELx_EE;

        return (val & supported) | SCTLR_EL1_RES1;
}

/* Return @ttbr0 with the ASID field, bits [63:48], cleared. */
static inline u64 translate_ttbr0_el2_to_ttbr0_el1(u64 ttbr0)
{
        const u64 asid_field = GENMASK_ULL(63, 48);

        return ttbr0 & ~asid_field;
}

extern bool forward_smc_trap(struct kvm_vcpu *vcpu);
extern bool forward_debug_exception(struct kvm_vcpu *vcpu);
extern void kvm_init_nested(struct kvm *kvm);
extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);

union tlbi_info;

extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
                                       const union tlbi_info *info,
                                       void (*)(struct kvm_s2_mmu *,
                                                const union tlbi_info *));
extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);

extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu);

/*
 * Result of a nested stage-2 walk, as filled in through the @result
 * argument of kvm_walk_nested_s2(). Most fields are read through the
 * kvm_s2_trans_*() accessors.
 */
struct kvm_s2_trans {
        /* Returned by kvm_s2_trans_output(). */
        phys_addr_t output;
        /* Returned by kvm_s2_trans_size(). */
        unsigned long block_size;
        /* Returned by kvm_s2_trans_writable(). */
        bool writable;
        /* Returned by kvm_s2_trans_readable(). */
        bool readable;
        /* Encoded into KVM_NV_GUEST_MAP_SZ by kvm_encode_nested_level(). */
        int level;
        /* Returned by kvm_s2_trans_esr(). */
        u32 esr;
        /* Bit 54 is tested by kvm_s2_trans_executable(). */
        u64 desc;
};

/* Accessor for the ->output field of @trans. */
static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
{
        phys_addr_t output = trans->output;

        return output;
}

/* Accessor for the ->block_size field of @trans. */
static inline unsigned long kvm_s2_trans_size(struct kvm_s2_trans *trans)
{
        unsigned long size = trans->block_size;

        return size;
}

/* Accessor for the ->esr field of @trans. */
static inline u32 kvm_s2_trans_esr(struct kvm_s2_trans *trans)
{
        u32 esr = trans->esr;

        return esr;
}

/* Accessor for the ->readable flag of @trans. */
static inline bool kvm_s2_trans_readable(struct kvm_s2_trans *trans)
{
        bool readable = trans->readable;

        return readable;
}

/* Accessor for the ->writable flag of @trans. */
static inline bool kvm_s2_trans_writable(struct kvm_s2_trans *trans)
{
        bool writable = trans->writable;

        return writable;
}

/* A translation is executable when bit 54 of its descriptor is clear. */
static inline bool kvm_s2_trans_executable(struct kvm_s2_trans *trans)
{
        return (trans->desc & BIT(54)) == 0;
}

extern int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
                              struct kvm_s2_trans *result);
extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
                                    struct kvm_s2_trans *trans);
extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
extern void kvm_nested_s2_wp(struct kvm *kvm);
extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block);
extern void kvm_nested_s2_flush(struct kvm *kvm);

unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val);

/*
 * Tell whether @instr encodes an EL1 TLBI operation that the VM supports.
 *
 * The encoding must use TLBI_Op0 and TLBI_Op1_EL1. The nXS CRn space
 * requires the XS feature, TLBI_CRm_nROS requires the TLB OS feature,
 * and the RIS/ROS/RNS CRm values require the TLB RANGE feature.
 */
static inline bool kvm_supported_tlbi_s1e1_op(struct kvm_vcpu *vpcu, u32 instr)
{
        struct kvm *kvm = vpcu->kvm;
        u8 CRm = sys_reg_CRm(instr);
        bool needs_os, needs_range;

        if (sys_reg_Op0(instr) != TLBI_Op0 ||
            sys_reg_Op1(instr) != TLBI_Op1_EL1)
                return false;

        if (sys_reg_CRn(instr) != TLBI_CRn_XS &&
            (sys_reg_CRn(instr) != TLBI_CRn_nXS ||
             !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))
                return false;

        needs_os = (CRm == TLBI_CRm_nROS);
        if (needs_os && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
                return false;

        needs_range = (CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS ||
                       CRm == TLBI_CRm_RNS);
        if (needs_range && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
                return false;

        return true;
}

/*
 * Tell whether @instr encodes an EL2 TLBI operation that the VM supports.
 *
 * The encoding must use TLBI_Op0 and TLBI_Op1_EL2. The nXS CRn space
 * requires the XS feature, the IPAIS/IPAONS CRm values are always
 * rejected, TLBI_CRm_nROS requires the TLB OS feature, and the
 * RIS/ROS/RNS CRm values require the TLB RANGE feature.
 */
static inline bool kvm_supported_tlbi_s1e2_op(struct kvm_vcpu *vpcu, u32 instr)
{
        struct kvm *kvm = vpcu->kvm;
        u8 CRm = sys_reg_CRm(instr);
        bool needs_os, needs_range;

        if (sys_reg_Op0(instr) != TLBI_Op0 ||
            sys_reg_Op1(instr) != TLBI_Op1_EL2)
                return false;

        if (sys_reg_CRn(instr) != TLBI_CRn_XS &&
            (sys_reg_CRn(instr) != TLBI_CRn_nXS ||
             !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP)))
                return false;

        switch (CRm) {
        case TLBI_CRm_IPAIS:
        case TLBI_CRm_IPAONS:
                return false;
        }

        needs_os = (CRm == TLBI_CRm_nROS);
        if (needs_os && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
                return false;

        needs_range = (CRm == TLBI_CRm_RIS || CRm == TLBI_CRm_ROS ||
                       CRm == TLBI_CRm_RNS);
        if (needs_range && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
                return false;

        return true;
}

int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu);
u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val);

#ifdef CONFIG_ARM64_PTR_AUTH
bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr);
#else
/*
 * Stub used when CONFIG_ARM64_PTR_AUTH is disabled. It is not expected
 * to run: it warns once, stores a poison value in @elr and reports
 * failure.
 */
static inline bool kvm_auth_eretax(struct kvm_vcpu *vcpu, u64 *elr)
{
        const u64 poison = 0xbad9acc0debadbad;

        /* We really should never execute this... */
        WARN_ON_ONCE(1);
        *elr = poison;

        return false;
}
#endif

#define KVM_NV_GUEST_MAP_SZ        (KVM_PGTABLE_PROT_SW1 | KVM_PGTABLE_PROT_SW0)

/* Encode the level of @trans into the KVM_NV_GUEST_MAP_SZ bits. */
static inline u64 kvm_encode_nested_level(struct kvm_s2_trans *trans)
{
        const int level = trans->level;

        return FIELD_PREP(KVM_NV_GUEST_MAP_SZ, level);
}

/*
 * Adjust alignment for the contiguous bit as per StageOA().
 *
 * Evaluates to a u8. The result is 0 when PTE_CONT is clear in @d, or
 * when BIT(@wi->pgshift) is none of SZ_4K/SZ_16K/SZ_64K. Otherwise it is
 * a granule-dependent shift; only the 16K case also depends on the
 * level @l.
 */
#define contiguous_bit_shift(d, wi, l)                                  \
        ({                                                              \
                u8 __cont_shift = 0;                                    \
                                                                        \
                if ((d) & PTE_CONT) {                                   \
                        switch (BIT((wi)->pgshift)) {                   \
                        case SZ_4K:                                     \
                                __cont_shift = 4;                       \
                                break;                                  \
                        case SZ_16K:                                    \
                                if ((l) == 2)                           \
                                        __cont_shift = 5;               \
                                else                                    \
                                        __cont_shift = 7;               \
                                break;                                  \
                        case SZ_64K:                                    \
                                __cont_shift = 5;                       \
                                break;                                  \
                        }                                               \
                }                                                       \
                                                                        \
                __cont_shift;                                           \
        })

/*
 * Decode the operand @val of a range TLBI.
 *
 * The TG field, bits [47:46], selects the shift applied to the address
 * and range: 12, 14 or 16. The range length is derived from the NUM
 * (bits [43:39]) and SCALE (bits [45:44]) fields and stored in @range.
 * When @asid is non-NULL, the ASID field is stored there.
 *
 * Return: the base address, i.e. bits [36:0] of @val scaled by the shift.
 */
static inline u64 decode_range_tlbi(u64 val, u64 *range, u16 *asid)
{
        u64 scale, num;
        int granule_shift;

        switch (FIELD_GET(GENMASK(47, 46), val)) {
        case 1:
                granule_shift = 12;
                break;
        case 2:
                granule_shift = 14;
                break;
        default:
                /* tg == 3; IMPDEF: tg == 0 is handled as 64k too */
                granule_shift = 16;
                break;
        }

        if (asid)
                *asid = FIELD_GET(TLBIR_ASID_MASK, val);

        scale = FIELD_GET(GENMASK(45, 44), val);
        num = FIELD_GET(GENMASK(43, 39), val);
        *range = __TLBI_RANGE_PAGES(num, scale) << granule_shift;

        return (val & GENMASK(36, 0)) << granule_shift;
}

/*
 * Map a PS encoding to an output address size in bits. Encodings 0-4
 * give 32, 36, 40, 42 and 44 bits; 5 and anything larger give 48 bits.
 */
static inline unsigned int ps_to_output_size(unsigned int ps)
{
        static const unsigned char oa_bits[] = { 32, 36, 40, 42, 44 };

        if (ps < sizeof(oa_bits) / sizeof(oa_bits[0]))
                return oa_bits[ps];

        return 48;
}

/*
 * Translation regime selector, stored in struct s1_walk_info.
 * NOTE(review): going by the names, these presumably stand for the
 * EL1&0, EL2&0 and EL2 translation regimes - confirm against the users.
 */
enum trans_regime {
        TR_EL10,
        TR_EL20,
        TR_EL2,
};

/*
 * Parameters of a stage-1 walk, passed as @wi to __kvm_translate_va().
 *
 * ->regime holds an enum trans_regime value.
 * NOTE(review): ->pgshift is presumably the log2 of the granule size;
 * contiguous_bit_shift() compares BIT(pgshift) of its @wi argument
 * against SZ_4K/SZ_16K/SZ_64K - confirm that it is handed this struct.
 * The meaning of the remaining fields is not visible in this header;
 * see the walker implementation.
 */
struct s1_walk_info {
        u64                             baddr;
        enum trans_regime        regime;
        unsigned int                max_oa_bits;
        unsigned int                pgshift;
        unsigned int                txsz;
        int                              sl;
        bool                        as_el0;
        bool                             hpd;
        bool                        e0poe;
        bool                        poe;
        bool                        pan;
        bool                             be;
        bool                             s2;
};

/*
 * Result of a stage-1 walk, passed as @wr to __kvm_translate_va().
 *
 * The two anonymous structs share storage through the union.
 * NOTE(review): presumably ->failed selects which one is meaningful,
 * the first (descriptor, PA, permissions) on success and the second
 * (fault details) on failure - confirm against the walker.
 */
struct s1_walk_result {
        union {
                struct {
                        u64        desc;
                        u64        pa;
                        s8        level;
                        u8        APTable;
                        bool        nG;
                        u16        asid;
                        bool        UXNTable;
                        bool        PXNTable;
                        bool        uwxn;
                        bool        uov;
                        bool        ur;
                        bool        uw;
                        bool        ux;
                        bool        pwxn;
                        bool        pov;
                        bool        pr;
                        bool        pw;
                        bool        px;
                };
                struct {
                        u8        fst;
                        bool        ptw;
                        bool        s2;
                };
        };
        bool        failed;
};

int __kvm_translate_va(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
                       struct s1_walk_result *wr, u64 va);

/* VNCR management */
int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu);
int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu);
void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val);

/*
 * Evaluate to FIX_VNCR minus the index @c, which is evaluated exactly
 * once. BUG()s when @c is not below NR_CPUS.
 */
#define vncr_fixmap(c)                                          \
        ({                                                      \
                u32 __cpu = (c);                                \
                BUG_ON(NR_CPUS <= __cpu);                       \
                (FIX_VNCR - __cpu);                             \
        })

#endif /* __ARM64_KVM_NESTED_H */
































  647 
  649 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
// SPDX-License-Identifier: GPL-2.0-only
/*
 * A generic implementation of binary search for the Linux kernel
 *
 * Copyright (C) 2008-2009 Ksplice, Inc.
 * Author: Tim Abbott <tabbott@ksplice.com>
 */

#include <linux/export.h>
#include <linux/bsearch.h>
#include <linux/kprobes.h>

/*
 * bsearch - binary search an array of elements
 * @key: pointer to item being searched for
 * @base: pointer to first element to search
 * @num: number of elements
 * @size: size of each element
 * @cmp: pointer to comparison function
 *
 * Performs a binary search on the given array, whose contents must
 * already be in ascending sorted order under the provided comparison
 * function.
 *
 * The key does not need to have the same type as the array elements:
 * it could be a string, for instance, with a comparison function that
 * compares it against the name field of a struct. When key and elements
 * do share a type, one comparison function can serve both sort() and
 * bsearch().
 *
 * This is the out-of-line entry point; the search itself is done by
 * __inline_bsearch().
 */
void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        void *match = __inline_bsearch(key, base, num, size, cmp);

        return match;
}
EXPORT_SYMBOL(bsearch);
NOKPROBE_SYMBOL(bsearch);




























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/traps.h
 *
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_TRAP_H
#define __ASM_TRAP_H

#include <linux/list.h>
#include <asm/esr.h>
#include <asm/ptrace.h>
#include <asm/sections.h>

#ifdef CONFIG_ARMV8_DEPRECATED
bool try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn);
#else
/*
 * Stub used when CONFIG_ARMV8_DEPRECATED is disabled: nothing is ever
 * emulated, so this always returns false and ignores both arguments.
 */
static inline bool
try_emulate_armv8_deprecated(struct pt_regs *regs, u32 insn)
{
        return false;
}
#endif /* CONFIG_ARMV8_DEPRECATED */

void force_signal_inject(int signal, int code, unsigned long address, unsigned long err);
void arm64_notify_segfault(unsigned long addr);
void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str);
void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey);
void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str);
void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str);

int early_brk64(unsigned long addr, unsigned long esr, struct pt_regs *regs);

/*
 * Move regs->pc to next instruction and do necessary setup before it
 * is executed.
 */
void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size);

/*
 * Return non-zero when @ptr lies within
 * [__irqentry_text_start, __irqentry_text_end).
 */
static inline int __in_irqentry_text(unsigned long ptr)
{
        const unsigned long start = (unsigned long)&__irqentry_text_start;
        const unsigned long end = (unsigned long)&__irqentry_text_end;

        if (ptr < start)
                return 0;

        return ptr < end;
}

/*
 * Return non-zero when @ptr lies within
 * [__entry_text_start, __entry_text_end).
 */
static inline int in_entry_text(unsigned long ptr)
{
        const unsigned long start = (unsigned long)&__entry_text_start;
        const unsigned long end = (unsigned long)&__entry_text_end;

        if (ptr < start)
                return 0;

        return ptr < end;
}

/*
 * CPUs with the RAS extensions have an Implementation-Defined-Syndrome
 * bit to indicate whether this ESR has a RAS encoding. CPUs without this
 * feature have an ISS-Valid bit in the same position.
 * If this bit is set, we know it's not a RAS SError.
 * If it's clear, we need to know whether the CPU supports RAS:
 * Uncategorized RAS errors share the same encoding as an all-zeros
 * encoding from a CPU that doesn't support RAS.
 *
 * Warns when called from preemptible context.
 */
static inline bool arm64_is_ras_serror(unsigned long esr)
{
        WARN_ON(preemptible());

        /* IDS set: definitely not a RAS encoding. */
        if (esr & ESR_ELx_IDS)
                return false;

        return this_cpu_has_cap(ARM64_HAS_RAS_EXTN);
}

/*
 * Return the AET bits from a RAS SError's ESR.
 *
 * Whether Uncategorized errors are containable is implementation
 * defined; we treat them as Uncontainable.
 * Non-RAS SErrors are reported as Uncontained/Uncategorized.
 */
static inline unsigned long arm64_ras_serror_get_severity(unsigned long esr)
{
        /* Not a RAS error, we can't interpret the ESR. */
        if (!arm64_is_ras_serror(esr))
                return ESR_ELx_AET_UC;

        /*
         * AET is RES0 if 'the value returned in the DFSC field is not
         * [ESR_ELx_FSC_SERROR]': no severity information, so report
         * Uncategorized.
         */
        if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_SERROR)
                return ESR_ELx_AET_UC;

        return esr & ESR_ELx_AET;
}

bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr);
void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr);

/*
 * Rewind the registers of a faulting MOPS (SET or CPY) sequence so that
 * it can be restarted from its prologue instruction.
 *
 * The destination, source and size register numbers, the instruction
 * kind and the option flags are all decoded from @esr. The PC is moved
 * back by 8 bytes when the exception came from the epilogue instruction
 * and by 4 bytes otherwise.
 */
static inline void arm64_mops_reset_regs(struct user_pt_regs *regs, unsigned long esr)
{
        const bool wrong_option = esr & ESR_ELx_MOPS_ISS_WRONG_OPTION;
        const bool option_a = esr & ESR_ELx_MOPS_ISS_OPTION_A;
        /* The state is in Option A format iff exactly one flag is set. */
        const bool format_a = (option_a != wrong_option);
        const int dstreg = ESR_ELx_MOPS_ISS_DESTREG(esr);
        const int srcreg = ESR_ELx_MOPS_ISS_SRCREG(esr);
        const int sizereg = ESR_ELx_MOPS_ISS_SIZEREG(esr);
        unsigned long dst = regs->regs[dstreg];
        unsigned long size = regs->regs[sizereg];
        unsigned long src;

        /*
         * Put the registers back in the original format suitable for a
         * prologue instruction, using the generic return routine from the
         * Arm ARM (DDI 0487I.a) rules CNTMJ and MWFQH.
         */
        if (esr & ESR_ELx_MOPS_ISS_MEM_INST) {
                /* SET* instruction */
                if (format_a) {
                        /* Format is from Option A; forward set */
                        regs->regs[dstreg] = dst + size;
                        regs->regs[sizereg] = -size;
                }
        } else {
                /* CPY* instruction */
                src = regs->regs[srcreg];

                if (format_a) {
                        /* Format is from Option A */
                        if (size & BIT(63)) {
                                /* Forward copy */
                                regs->regs[dstreg] = dst + size;
                                regs->regs[srcreg] = src + size;
                                regs->regs[sizereg] = -size;
                        }
                } else if (regs->pstate & PSR_N_BIT) {
                        /* Format is from Option B; backward copy */
                        regs->regs[dstreg] = dst - size;
                        regs->regs[srcreg] = src - size;
                }
        }

        regs->pc -= (esr & ESR_ELx_MOPS_ISS_FROM_EPILOGUE) ? 8 : 4;
}
#endif






















































    1 
    1 












































  137 
  137 



















































  163 




  165 



   14 









































  106 


    1 
  106 


  165 



  102 



   76 







































   33 


   33 



   76 











    4 


























    4 

































    1 




    2 






    1 
    1 

    1 












    1 

    1 




















































    1 

























































    3 



    1 























    2 







































    1 
    1 


    2 











    2 








    2 





    1 



    1 






    1 



    1 





    2 




    1 


    1 


















    2 
    2 

































    1 
























    1 








    1 







    1 





    1 





    1 





    1 




   72 










   72 




   72 






   72 






   68 




















   21 







   72 







   72 

   72 



   72 

   72 

   72 



   72 

   58 



   15 

   15 



   72 

   72 

   72 



   72 


   72 






   72 

   72 




    6 


    6 







    6 




    1 




    1 




    2 




    4 


    4 










    4 














    1 
    1 



    1 




































    1 








    4 


    4 












    1 


    1 













    2 












    1 












    1 












    2 












    2 


    2 




    2 












    1 



    1 









    1 


    1 
    1 





    1 


    1 





    1 


    1 







    1 


    1 











    4 


    4 







    3 


    4 









    1 


    1 








    1 









    1 





    1 


























    1 






    2 











    2 

    1 















    2 


    2 
    2 












































    9 
    9 



































































































































































































    4 

    1 









    2 



































    7 
    3 







    7 




    6 






    5 


    7 



































   72 




   66 

   65 


   65 


   63 
















   64 


   64 






   64 



   64 



   64 



   65 


   64 



   64 



   69 







   72 
   72 




   13 




   72 
   72 














   72 






   71 








   72 











   82 


   83 

   73 









   74 





   74 


   74 




   74 








    2 



    2 







   11 
   11 







    7 








    8 
    8 







   65 












































   54 
   12 














    2 

















    1 






    1 







   65 




   54 

   12 










    3 














    1 










    1 


























































    1 





    1 








    1 















    2 













    1 





    1 














    6 





    2 




    5 







   11 










    2 








   10 


















   66 
   66 

   65 




   66 




    1 



































   68 













   68 




























   68 


   68 

   68 




    3 

    3 

    1 


    2 


































    9 
    8 















    9 
   11 







































































































































































   72 





   72 







    9 




    9 
    9 




    9 
    9 



















































































    8 








    8 
    8 




    9 








    8 
    8 




    9 








    8 
    8 
























































    1 



















    1 




















   64 


   64 

   64 

   64 








    3 





    3 



    3 






























   72 
   72 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   21 
   21 


   20 
    1 











    3 
   17 






















    2 
















    2 





































































































































































































    2 


    2 

    2 





    2 









    2 










    2 

















































































































































  244 











    9 


   64 





    3 


   68 










   72 



   72 


   72 


   72 
   72 
   72 
   71 



   72 
   72 


   72 

   58 
   15 








   23 






   23 

    2 



   21 










   12 

    9 








































    1 





   92 







   92 



    1 


   91 


   90 











    1 









    1 


    3 











    1 












    5 


    3 







   19 






   19 





    2 



   11 
   13 





   23 



    5 

   17 





   75 





   76 



   55 


   19 



   27 

   40 







   80 



    6 

   74 














    4 
    4 



















    8 


    5 


    5 












    8 


    8 


    8 








    8 







    8 
    8 







    8 





    4 



    1 



    4 










    3 
    3 



    2 




    1 



























  166 



  165 





  164 


  165 





  149 
   16 

  166 







  165 




  165 



  166 

  166 

    6 










  164 













  166 




   59 
  107 





  165 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/coproc.c:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Authors: Rusty Russell <rusty@rustcorp.com.au>
 *          Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bitfield.h>
#include <linux/bsearch.h>
#include <linux/cacheinfo.h>
#include <linux/debugfs.h>
#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/uaccess.h>
#include <linux/irqchip/arm-gic-v3.h>

#include <asm/arm_pmuv3.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/perf_event.h>
#include <asm/sysreg.h>

#include <trace/events/kvm.h>

#include "sys_regs.h"
#include "vgic/vgic.h"

#include "trace.h"

/*
 * For AArch32, we only take care of what is being trapped. Anything
 * that has to do with init and userspace access has to go via the
 * 64bit interface.
 */

static u64 sys_reg_to_index(const struct sys_reg_desc *reg);
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                      u64 val);

/*
 * Generic "this access is not allowed" handler: inject an UNDEFINED
 * exception into the guest and return false. @p and @r are unused.
 */
static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                         const struct sys_reg_desc *r)
{
        const bool handled = false;

        kvm_inject_undefined(vcpu);

        return handled;
}

/*
 * Report a trap that should never have happened: warn once with @msg,
 * dump the offending instruction, then fall back to undef_access() and
 * return its result.
 */
static bool bad_trap(struct kvm_vcpu *vcpu,
                     struct sys_reg_params *params,
                     const struct sys_reg_desc *r,
                     const char *msg)
{
        bool ret;

        WARN_ONCE(1, "Unexpected %s\n", msg);
        print_sys_reg_instr(params);

        ret = undef_access(vcpu, params, r);
        return ret;
}

/* Guest read of a write-only register: handled as a bad trap. */
static bool read_from_write_only(struct kvm_vcpu *vcpu,
                                 struct sys_reg_params *params,
                                 const struct sys_reg_desc *r)
{
        const char *what = "sys_reg read to write-only register";

        return bad_trap(vcpu, params, r, what);
}

/* Guest write to a read-only register: handled as a bad trap. */
static bool write_to_read_only(struct kvm_vcpu *vcpu,
                               struct sys_reg_params *params,
                               const struct sys_reg_desc *r)
{
        const char *what = "sys_reg write to read-only register";

        return bad_trap(vcpu, params, r, what);
}

/*
 * Switch-case generator for get_el2_to_el1_mapping(): an EL2 register
 * that is reported as mapping onto itself. The expansion writes through
 * 'el1r' and returns true from the enclosing function, so it can only be
 * used where a pointer named 'el1r' is in scope.
 */
#define PURE_EL2_SYSREG(el2)                                                \
        case el2: {                                                        \
                *el1r = el2;                                                \
                return true;                                                \
        }

/*
 * Switch-case generator for get_el2_to_el1_mapping(): an EL2 register
 * that maps onto the EL1 register 'el1'. 'fn' is the value translation
 * helper stored in *xlate (NULL when none is given). The expansion
 * writes through 'xlate' and 'el1r' and returns true from the enclosing
 * function, so both names must be in scope at the expansion site.
 */
#define MAPPED_EL2_SYSREG(el2, el1, fn)                                        \
        case el2: {                                                        \
                *xlate = fn;                                                \
                *el1r = el1;                                                \
                return true;                                                \
        }

/*
 * Look up how an EL2 system register is mapped.
 *
 * @reg:   register index to look up.
 * @el1r:  output; set to @reg itself for PURE_EL2_SYSREG entries, or to the
 *         EL1 counterpart for MAPPED_EL2_SYSREG entries.
 * @xlate: output; only written for MAPPED_EL2_SYSREG entries, where it
 *         receives the translation function or NULL. Left untouched for
 *         PURE_EL2_SYSREG entries, so callers must pre-initialise it.
 *
 * Returns true if @reg is one of the EL2 registers listed below, false
 * otherwise (in which case neither output is written).
 */
static bool get_el2_to_el1_mapping(unsigned int reg,
                                   unsigned int *el1r, u64 (**xlate)(u64))
{
        switch (reg) {
                /* EL2 registers reported as mapping onto themselves */
                PURE_EL2_SYSREG(  VPIDR_EL2        );
                PURE_EL2_SYSREG(  VMPIDR_EL2        );
                PURE_EL2_SYSREG(  ACTLR_EL2        );
                PURE_EL2_SYSREG(  HCR_EL2        );
                PURE_EL2_SYSREG(  MDCR_EL2        );
                PURE_EL2_SYSREG(  HSTR_EL2        );
                PURE_EL2_SYSREG(  HACR_EL2        );
                PURE_EL2_SYSREG(  VTTBR_EL2        );
                PURE_EL2_SYSREG(  VTCR_EL2        );
                PURE_EL2_SYSREG(  RVBAR_EL2        );
                PURE_EL2_SYSREG(  TPIDR_EL2        );
                PURE_EL2_SYSREG(  HPFAR_EL2        );
                PURE_EL2_SYSREG(  HCRX_EL2        );
                PURE_EL2_SYSREG(  HFGRTR_EL2        );
                PURE_EL2_SYSREG(  HFGWTR_EL2        );
                PURE_EL2_SYSREG(  HFGITR_EL2        );
                PURE_EL2_SYSREG(  HDFGRTR_EL2        );
                PURE_EL2_SYSREG(  HDFGWTR_EL2        );
                PURE_EL2_SYSREG(  HAFGRTR_EL2        );
                PURE_EL2_SYSREG(  CNTVOFF_EL2        );
                PURE_EL2_SYSREG(  CNTHCTL_EL2        );
                /* EL2 registers with an EL1 counterpart (and optional translation) */
                MAPPED_EL2_SYSREG(SCTLR_EL2,   SCTLR_EL1,
                                  translate_sctlr_el2_to_sctlr_el1             );
                MAPPED_EL2_SYSREG(CPTR_EL2,    CPACR_EL1,
                                  translate_cptr_el2_to_cpacr_el1             );
                MAPPED_EL2_SYSREG(TTBR0_EL2,   TTBR0_EL1,
                                  translate_ttbr0_el2_to_ttbr0_el1             );
                MAPPED_EL2_SYSREG(TTBR1_EL2,   TTBR1_EL1,   NULL             );
                MAPPED_EL2_SYSREG(TCR_EL2,     TCR_EL1,
                                  translate_tcr_el2_to_tcr_el1                     );
                MAPPED_EL2_SYSREG(VBAR_EL2,    VBAR_EL1,    NULL             );
                MAPPED_EL2_SYSREG(AFSR0_EL2,   AFSR0_EL1,   NULL             );
                MAPPED_EL2_SYSREG(AFSR1_EL2,   AFSR1_EL1,   NULL             );
                MAPPED_EL2_SYSREG(ESR_EL2,     ESR_EL1,     NULL             );
                MAPPED_EL2_SYSREG(FAR_EL2,     FAR_EL1,     NULL             );
                MAPPED_EL2_SYSREG(MAIR_EL2,    MAIR_EL1,    NULL             );
                MAPPED_EL2_SYSREG(TCR2_EL2,    TCR2_EL1,    NULL             );
                MAPPED_EL2_SYSREG(PIR_EL2,     PIR_EL1,     NULL             );
                MAPPED_EL2_SYSREG(PIRE0_EL2,   PIRE0_EL1,   NULL             );
                MAPPED_EL2_SYSREG(POR_EL2,     POR_EL1,     NULL             );
                MAPPED_EL2_SYSREG(AMAIR_EL2,   AMAIR_EL1,   NULL             );
                MAPPED_EL2_SYSREG(ELR_EL2,     ELR_EL1,            NULL             );
                MAPPED_EL2_SYSREG(SPSR_EL2,    SPSR_EL1,    NULL             );
                MAPPED_EL2_SYSREG(ZCR_EL2,     ZCR_EL1,     NULL             );
                MAPPED_EL2_SYSREG(CONTEXTIDR_EL2, CONTEXTIDR_EL1, NULL             );
        default:
                return false;
        }
}

/*
 * Read the current value of a guest system register.
 *
 * The value comes from the in-memory vcpu context unless SYSREGS_ON_CPU
 * is set, in which case the CPU may hold the live copy:
 *
 *  - For a register known to get_el2_to_el1_mapping() while the vcpu is
 *    in hyp context, the value is read from the EL1 counterpart on the
 *    CPU, except for CNTHCTL_EL2 with E2H set (merged from CNTKCTL_EL1
 *    and memory), registers that map onto themselves, and registers that
 *    need translation when E2H is clear (both read from memory).
 *  - For any other register, the CPU is tried only when the vcpu is not
 *    in hyp context; if the CPU read helper reports failure, memory is
 *    used.
 *
 * @vcpu: the vcpu whose register is read.
 * @reg:  register index.
 *
 * Returns the register value.
 */
u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
        /* Recognisable placeholder; replaced by every successful read below */
        u64 val = 0x8badf00d8badf00d;
        u64 (*xlate)(u64) = NULL;
        unsigned int el1r;

        if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
                goto memory_read;

        if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
                /* EL2 register, but the vcpu is not in hyp context */
                if (!is_hyp_ctxt(vcpu))
                        goto memory_read;

                /*
                 * CNTHCTL_EL2 requires some special treatment to
                 * account for the bits that can be set via CNTKCTL_EL1.
                 */
                switch (reg) {
                case CNTHCTL_EL2:
                        if (vcpu_el2_e2h_is_set(vcpu)) {
                                val = read_sysreg_el1(SYS_CNTKCTL);
                                val &= CNTKCTL_VALID_BITS;
                                val |= __vcpu_sys_reg(vcpu, reg) & ~CNTKCTL_VALID_BITS;
                                return val;
                        }
                        break;
                }

                /*
                 * If this register does not have an EL1 counterpart,
                 * then read the stored EL2 version.
                 */
                if (reg == el1r)
                        goto memory_read;

                /*
                 * If we have a non-VHE guest and that the sysreg
                 * requires translation to be used at EL1, use the
                 * in-memory copy instead.
                 */
                if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
                        goto memory_read;

                /* Get the current version of the EL1 counterpart. */
                WARN_ON(!__vcpu_read_sys_reg_from_cpu(el1r, &val));
                /* Registers at or past __SANITISED_REG_START__ get masked */
                if (reg >= __SANITISED_REG_START__)
                        val = kvm_vcpu_apply_reg_masks(vcpu, reg, val);

                return val;
        }

        /* EL1 register can't be on the CPU if the guest is in vEL2. */
        if (unlikely(is_hyp_ctxt(vcpu)))
                goto memory_read;

        if (__vcpu_read_sys_reg_from_cpu(reg, &val))
                return val;

memory_read:
        return __vcpu_sys_reg(vcpu, reg);
}

/*
 * Write a guest system register.
 *
 * Without SYSREGS_ON_CPU the value only goes to the in-memory vcpu
 * context. Otherwise:
 *
 *  - For a register known to get_el2_to_el1_mapping() while the vcpu is
 *    in hyp context, the value is always stored in memory and, when the
 *    register has an EL1 counterpart, also written to that counterpart on
 *    the CPU (after translation if E2H is clear and a translation
 *    function exists). CNTHCTL_EL2 is special-cased.
 *  - For any other register, the CPU is written only when the vcpu is
 *    not in hyp context; if the CPU write helper reports failure, the
 *    value is stored in memory instead.
 *
 * @vcpu: the vcpu whose register is written.
 * @val:  value to write.
 * @reg:  register index.
 */
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
        u64 (*xlate)(u64) = NULL;
        unsigned int el1r;

        if (!vcpu_get_flag(vcpu, SYSREGS_ON_CPU))
                goto memory_write;

        if (unlikely(get_el2_to_el1_mapping(reg, &el1r, &xlate))) {
                /* EL2 register, but the vcpu is not in hyp context */
                if (!is_hyp_ctxt(vcpu))
                        goto memory_write;

                /*
                 * Always store a copy of the write to memory to avoid having
                 * to reverse-translate virtual EL2 system registers for a
                 * non-VHE guest hypervisor.
                 */
                __vcpu_sys_reg(vcpu, reg) = val;

                switch (reg) {
                case CNTHCTL_EL2:
                        /*
                         * If E2H=0, CNTHCTL_EL2 is a pure shadow register.
                         * Otherwise, some of the bits are backed by
                         * CNTKCTL_EL1, while the rest is kept in memory.
                         * Yes, this is fun stuff.
                         */
                        if (vcpu_el2_e2h_is_set(vcpu))
                                write_sysreg_el1(val, SYS_CNTKCTL);
                        return;
                }

                /* No EL1 counterpart? We're done here. */
                if (reg == el1r)
                        return;

                /* The memory copy above keeps the untranslated value */
                if (!vcpu_el2_e2h_is_set(vcpu) && xlate)
                        val = xlate(val);

                /* Redirect this to the EL1 version of the register. */
                WARN_ON(!__vcpu_write_sys_reg_to_cpu(val, el1r));
                return;
        }

        /* EL1 register can't be on the CPU if the guest is in vEL2. */
        if (unlikely(is_hyp_ctxt(vcpu)))
                goto memory_write;

        if (__vcpu_write_sys_reg_to_cpu(val, reg))
                return;

memory_write:
         __vcpu_sys_reg(vcpu, reg) = val;
}

/*
 * CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR.
 * Also the number of entries in the per-vcpu CCSIDR override table
 * allocated by set_ccsidr().
 */
#define CSSELR_MAX 14

/*
 * Minimum cache line size, as Log2(bytes), taken from the sanitised
 * CTR_EL0: the IminLine field when @icache is true, DminLine otherwise.
 */
static u8 get_min_cache_line_size(bool icache)
{
        u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
        u8 log2_words;

        if (!icache)
                log2_words = SYS_FIELD_GET(CTR_EL0, DminLine, ctr_el0);
        else
                log2_words = SYS_FIELD_GET(CTR_EL0, IminLine, ctr_el0);

        /*
         * CTR_EL0 encodes the line size as Log2(words), a word being
         * 4 bytes:
         *
         * Log2(bytes) = Log2(words * 4) = Log2(words) + 2
         */
        return log2_words + 2;
}

/*
 * Return the CCSIDR value exposed to the guest for a given CSSELR value
 * (which cache CCSIDR represents depends on CSSELR).
 *
 * If an override table exists for this vcpu, the entry for @csselr is
 * returned as-is; @csselr is not range-checked here.
 *
 * Otherwise a value is fabricated. The real CCSIDR value is not used as
 * it can vary by the physical CPU which the vcpu currently resides in:
 *
 *  - The line size comes from get_min_cache_line_size(), which should be
 *    valid for all CPUs even if they have different cache configuration.
 *
 *  - The associativity bits are cleared, meaning the geometry of all data
 *    and unified caches (which are guaranteed to be PIPT and thus
 *    non-aliasing) are 1 set and 1 way.
 *    Guests should not be doing cache operations by set/way at all, and
 *    for this reason, we trap them and attempt to infer the intent, so
 *    that we can flush the entire guest's address space at the appropriate
 *    time. The exposed geometry minimizes the number of the traps.
 *    [If guests should attempt to infer aliasing properties from the
 *    geometry (which is not permitted by the architecture), they would
 *    only do so for virtually indexed caches.]
 *
 *  - We don't check if the cache level exists as it is allowed to return
 *    an UNKNOWN value if not.
 */
static u32 get_ccsidr(struct kvm_vcpu *vcpu, u32 csselr)
{
        u32 *overrides = vcpu->arch.ccsidr;
        u8 log2_bytes;

        if (overrides)
                return overrides[csselr];

        log2_bytes = get_min_cache_line_size(csselr & CSSELR_EL1_InD);

        /* The LineSize field holds Log2(bytes) - 4 */
        return SYS_FIELD_PREP(CCSIDR_EL1, LineSize, log2_bytes - 4);
}

/*
 * Override the CCSIDR value exposed for @csselr.
 *
 * The value is rejected with -EINVAL if it has RES0 bits set or if its
 * line size is smaller than the minimum line size of the selected cache.
 * The per-vcpu override table is allocated lazily: nothing is allocated
 * when @val matches what get_ccsidr() already returns. On first
 * allocation every entry is seeded from get_ccsidr().
 *
 * @csselr is used as an index into a CSSELR_MAX-entry table without
 * being range-checked here (presumably validated by the caller - confirm).
 *
 * Returns 0 on success, -EINVAL for an invalid value, -ENOMEM if the
 * table cannot be allocated.
 */
static int set_ccsidr(struct kvm_vcpu *vcpu, u32 csselr, u32 val)
{
        u8 line_size = FIELD_GET(CCSIDR_EL1_LineSize, val) + 4;
        u32 *table = vcpu->arch.ccsidr;
        u32 sel;

        if (val & CCSIDR_EL1_RES0)
                return -EINVAL;

        if (line_size < get_min_cache_line_size(csselr & CSSELR_EL1_InD))
                return -EINVAL;

        /* Table already exists: just update the entry */
        if (table) {
                table[csselr] = val;
                return 0;
        }

        /* No table and nothing to override: stay on the fabricated values */
        if (val == get_ccsidr(vcpu, csselr))
                return 0;

        table = kmalloc_array(CSSELR_MAX, sizeof(u32), GFP_KERNEL_ACCOUNT);
        if (!table)
                return -ENOMEM;

        for (sel = 0; sel < CSSELR_MAX; sel++)
                table[sel] = get_ccsidr(vcpu, sel);

        vcpu->arch.ccsidr = table;
        table[csselr] = val;

        return 0;
}

/*
 * Plain read/write accessor: forward the access to
 * vcpu_read_sys_reg()/vcpu_write_sys_reg() for r->reg. Always returns true.
 */
static bool access_rw(struct kvm_vcpu *vcpu,
                      struct sys_reg_params *p,
                      const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = vcpu_read_sys_reg(vcpu, r->reg);
                return true;
        }

        vcpu_write_sys_reg(vcpu, p->regval, r->reg);
        return true;
}

/*
 * Handler for data cache maintenance by set/way.
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Reads are rejected through read_from_write_only(). Writes return true.
 */
static bool access_dcsw(struct kvm_vcpu *vcpu,
                        struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
{
        if (p->is_write) {
                /*
                 * Only track S/W ops if we don't have FWB. It still
                 * indicates that the guest is a bit broken (S/W operations
                 * should only be done by firmware, knowing that there is
                 * only a single CPU left in the system, and certainly not
                 * from non-secure software).
                 */
                if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
                        kvm_set_way_flush(vcpu);

                return true;
        }

        return read_from_write_only(vcpu, p, r);
}

/*
 * Handler for the MTE flavour of the set/way operations: UNDEF when the
 * VM has no MTE, otherwise handled exactly like access_dcsw().
 */
static bool access_dcgsw(struct kvm_vcpu *vcpu,
                         struct sys_reg_params *p,
                         const struct sys_reg_desc *r)
{
        /* Treat MTE S/W ops as we treat the classic ones: with contempt */
        if (kvm_has_mte(vcpu->kvm))
                return access_dcsw(vcpu, p, r);

        return undef_access(vcpu, p, r);
}

/*
 * Work out which part of the 64bit register an access covers, based on
 * r->aarch32_map:
 *   AA32_LO        -> bits [31:0],  shift 0
 *   AA32_HI        -> bits [63:32], shift 32
 *   anything else  -> bits [63:0],  shift 0
 */
static void get_access_mask(const struct sys_reg_desc *r, u64 *mask, u64 *shift)
{
        if (r->aarch32_map == AA32_LO) {
                *mask = GENMASK_ULL(31, 0);
                *shift = 0;
        } else if (r->aarch32_map == AA32_HI) {
                *mask = GENMASK_ULL(63, 32);
                *shift = 32;
        } else {
                *mask = GENMASK_ULL(63, 0);
                *shift = 0;
        }
}

/*
 * Generic accessor for VM registers. Only called as long as HCR_TVM
 * is set. If the guest enables the MMU, we stop trapping the VM
 * sys_regs and leave it in complete control of the caches.
 *
 * Write-only (BUG on read). The written value is merged into the part
 * of the register selected by get_access_mask(), then kvm_toggle_cache()
 * is called with the cache state sampled before the write.
 */
static bool access_vm_reg(struct kvm_vcpu *vcpu,
                          struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        bool was_enabled = vcpu_has_cache_enabled(vcpu);
        u64 mask, shift;
        u64 merged = 0;

        BUG_ON(!p->is_write);

        get_access_mask(r, &mask, &shift);

        /* Partial write: keep the bits outside of the accessed half */
        if (~mask)
                merged = vcpu_read_sys_reg(vcpu, r->reg) & ~mask;

        merged |= (p->regval & (mask >> shift)) << shift;
        vcpu_write_sys_reg(vcpu, merged, r->reg);

        kvm_toggle_cache(vcpu, was_enabled);

        return true;
}

/*
 * ACTLR accessor: writes are ignored, reads return the part of the
 * stored register selected by get_access_mask().
 */
static bool access_actlr(struct kvm_vcpu *vcpu,
                         struct sys_reg_params *p,
                         const struct sys_reg_desc *r)
{
        u64 mask, shift, actlr;

        if (p->is_write)
                return ignore_write(vcpu, p);

        get_access_mask(r, &mask, &shift);

        actlr = vcpu_read_sys_reg(vcpu, r->reg);
        p->regval = (actlr & mask) >> shift;

        return true;
}

/*
 * Trap handler for the GICv3 SGI generation system register.
 * Forward the request to the VGIC emulation.
 * The cp15_64 code makes sure this automatically works
 * for both AArch64 and AArch32 accesses.
 *
 * UNDEF without a GICv3; reads are rejected as the registers are
 * write-only.
 */
static bool access_gic_sgi(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        bool g1;

        if (!kvm_has_gicv3(vcpu->kvm))
                return undef_access(vcpu, p, r);

        if (!p->is_write)
                return read_from_write_only(vcpu, p, r);

        /*
         * In a system where GICD_CTLR.DS=1, a ICC_SGI0R_EL1 access generates
         * Group0 SGIs only, while ICC_SGI1R_EL1 can generate either group,
         * depending on the SGI configuration. ICC_ASGI1R_EL1 is effectively
         * equivalent to ICC_SGI0R_EL1, as there is no "alternative" secure
         * group.
         *
         * Hence only the ASGI1R/SGI0R encodings clear g1; every other
         * encoding (SGI1R included) sets it.
         */
        if (p->Op0 == 0) {
                /* AArch32: Op1 == 1 is ICC_ASGI1R, Op1 == 2 is ICC_SGI0R */
                g1 = !(p->Op1 == 1 || p->Op1 == 2);
        } else {
                /* AArch64: Op2 == 6 is ICC_ASGI1R_EL1, Op2 == 7 is ICC_SGI0R_EL1 */
                g1 = !(p->Op2 == 6 || p->Op2 == 7);
        }

        vgic_v3_dispatch_sgi(vcpu, p->regval, g1);

        return true;
}

/*
 * ICC_SRE_EL1/ICC_SRE_EL2 accessor: UNDEF without a GICv3, writes are
 * ignored. Reads of ICC_SRE_EL2 (Op1 == 4) return a fixed value, reads
 * of ICC_SRE_EL1 return the value kept in the vgic CPU interface state.
 */
static bool access_gic_sre(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        if (!kvm_has_gicv3(vcpu->kvm))
                return undef_access(vcpu, p, r);

        if (p->is_write)
                return ignore_write(vcpu, p);

        if (p->Op1 != 4) {
                /* ICC_SRE_EL1 */
                p->regval = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;
                return true;
        }

        /* ICC_SRE_EL2 */
        p->regval = (ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE |
                     ICC_SRE_EL1_DIB | ICC_SRE_EL1_DFB);

        return true;
}

/* Read-As-Zero / Write-Ignored accessor. */
static bool trap_raz_wi(struct kvm_vcpu *vcpu,
                        struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
{
        return p->is_write ? ignore_write(vcpu, p) : read_zero(vcpu, p);
}

/*
 * ARMv8.1 mandates at least a trivial LORegion implementation, where all the
 * RW registers are RES0 (which we can implement as RAZ/WI). On an ARMv8.0
 * system, these registers should UNDEF. LORID_EL1 being a RO register, we
 * treat it separately.
 */
static bool trap_loregion(struct kvm_vcpu *vcpu,
                          struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        u32 encoding = reg_to_encoding(r);

        /* No LORegion support exposed to this VM: UNDEF */
        if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, LO, IMP))
                return undef_access(vcpu, p, r);

        /* LORID_EL1 is the only read-only register of the group */
        if (encoding == SYS_LORID_EL1 && p->is_write)
                return write_to_read_only(vcpu, p, r);

        return trap_raz_wi(vcpu, p, r);
}

/*
 * OSLAR_EL1 accessor (write-only): writes are passed on to
 * kvm_debug_handle_oslar(), reads are rejected.
 */
static bool trap_oslar_el1(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        if (p->is_write) {
                kvm_debug_handle_oslar(vcpu, p->regval);
                return true;
        }

        return read_from_write_only(vcpu, p, r);
}

/*
 * OSLSR_EL1 accessor (read-only): reads return the in-memory copy of
 * the register, writes are rejected.
 */
static bool trap_oslsr_el1(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, r->reg);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Userspace write to OSLSR_EL1.
 *
 * Returns 0 on success, or -EINVAL if @val differs from the descriptor
 * value (rd->val) in any bit other than OSLK.
 */
static int set_oslsr_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                         u64 val)
{
        u64 changed = val ^ rd->val;

        /*
         * The only modifiable bit is the OSLK bit. Refuse the write if
         * userspace attempts to change any other bit in the register.
         */
        if (changed & ~OSLSR_EL1_OSLK)
                return -EINVAL;

        __vcpu_sys_reg(vcpu, rd->reg) = val;

        return 0;
}

/*
 * DBGAUTHSTATUS_EL1 accessor: writes are ignored, reads return the
 * value of the host's dbgauthstatus_el1 register.
 */
static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu,
                                   struct sys_reg_params *p,
                                   const struct sys_reg_desc *r)
{
        if (p->is_write)
                return ignore_write(vcpu, p);

        p->regval = read_sysreg(dbgauthstatus_el1);

        return true;
}

/*
 * Debug register accessor: perform the plain read/write, then call
 * kvm_debug_set_guest_ownership(). Always returns true.
 */
static bool trap_debug_regs(struct kvm_vcpu *vcpu,
                            struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        /* The result of access_rw() is deliberately not looked at */
        (void)access_rw(vcpu, p, r);
        kvm_debug_set_guest_ownership(vcpu);

        return true;
}

/*
 * reg_to_dbg/dbg_to_reg
 *
 * A 32 bit write to a debug register leaves the top bits alone.
 * A 32 bit read from a debug register only returns the bottom bits.
 *
 * reg_to_dbg() merges p->regval into *dbg_reg, touching only the bits
 * selected by get_access_mask(). @vcpu is unused.
 */
static void reg_to_dbg(struct kvm_vcpu *vcpu,
                       struct sys_reg_params *p,
                       const struct sys_reg_desc *rd,
                       u64 *dbg_reg)
{
        u64 mask, shift, incoming;

        get_access_mask(rd, &mask, &shift);

        incoming = (p->regval & (mask >> shift)) << shift;
        *dbg_reg = (*dbg_reg & ~mask) | incoming;
}

/*
 * Extract the part of *dbg_reg selected by get_access_mask() and return
 * it, shifted down, in p->regval. @vcpu is unused.
 */
static void dbg_to_reg(struct kvm_vcpu *vcpu,
                       struct sys_reg_params *p,
                       const struct sys_reg_desc *rd,
                       u64 *dbg_reg)
{
        u64 mask, shift, selected;

        get_access_mask(rd, &mask, &shift);

        selected = *dbg_reg & mask;
        p->regval = selected >> shift;
}

/*
 * Find the storage backing a breakpoint/watchpoint register.
 *
 * Op2 selects the array (0b100: dbg_bvr, 0b101: dbg_bcr, 0b110: dbg_wvr,
 * 0b111: dbg_wcr) and CRm the index within it.
 *
 * Returns a pointer into the vcpu debug state, or NULL (after flagging a
 * KVM bug on the VM) for any other Op2 value.
 */
static u64 *demux_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
        struct kvm_guest_debug_arch *dbg = &vcpu->arch.vcpu_debug_state;
        u64 *slot = NULL;

        switch (rd->Op2) {
        case 0b100:
                slot = &dbg->dbg_bvr[rd->CRm];
                break;
        case 0b101:
                slot = &dbg->dbg_bcr[rd->CRm];
                break;
        case 0b110:
                slot = &dbg->dbg_wvr[rd->CRm];
                break;
        case 0b111:
                slot = &dbg->dbg_wcr[rd->CRm];
                break;
        default:
                KVM_BUG_ON(1, vcpu->kvm);
                break;
        }

        return slot;
}

/*
 * Trap handler for the breakpoint/watchpoint registers: access the
 * storage returned by demux_wb_reg(), then call
 * kvm_debug_set_guest_ownership(). Returns false if no storage is found.
 */
static bool trap_dbg_wb_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                            const struct sys_reg_desc *rd)
{
        u64 *slot = demux_wb_reg(vcpu, rd);

        if (slot == NULL)
                return false;

        if (!p->is_write)
                dbg_to_reg(vcpu, p, rd, slot);
        else
                reg_to_dbg(vcpu, p, rd, slot);

        kvm_debug_set_guest_ownership(vcpu);

        return true;
}

/*
 * Userspace write to a breakpoint/watchpoint register.
 * Returns 0 on success, -EINVAL if no storage exists for the register.
 */
static int set_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                          u64 val)
{
        u64 *slot = demux_wb_reg(vcpu, rd);

        if (slot) {
                *slot = val;
                return 0;
        }

        return -EINVAL;
}

/*
 * Userspace read of a breakpoint/watchpoint register.
 * Returns 0 with the value in *val, or -EINVAL (leaving *val untouched)
 * if no storage exists for the register.
 */
static int get_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                          u64 *val)
{
        u64 *slot = demux_wb_reg(vcpu, rd);

        if (slot) {
                *val = *slot;
                return 0;
        }

        return -EINVAL;
}

/*
 * Reset a breakpoint/watchpoint register to the descriptor's value.
 * Returns the value written, or 0 if no storage exists for the register.
 */
static u64 reset_dbg_wb_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd)
{
        u64 *slot = demux_wb_reg(vcpu, rd);

        if (slot) {
                *slot = rd->val;
                return rd->val;
        }

        /*
         * We couldn't find storage for the register; the KVM_BUG_ON()
         * in demux_wb_reg() will prevent this VM from ever being run.
         */
        return 0;
}

/* Reset AMAIR_EL1 to the host's current amair_el1 value and return it. */
static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 host_val;

        host_val = read_sysreg(amair_el1);
        vcpu_write_sys_reg(vcpu, host_val, AMAIR_EL1);

        return host_val;
}

/* Reset ACTLR_EL1 to the host's current actlr_el1 value and return it. */
static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 host_val;

        host_val = read_sysreg(actlr_el1);
        vcpu_write_sys_reg(vcpu, host_val, ACTLR_EL1);

        return host_val;
}

/*
 * Reset MPIDR_EL1 to a value derived from the vcpu_id and return it.
 *
 * The vcpu_id is mapped into the first three affinity level fields of
 * the MPIDR: bits [3:0] to level 0, bits [11:4] to level 1 and bits
 * [19:12] to level 2. We limit the number of VCPUs in level 0 due to a
 * limitation to 16 CPUs in that level in the ICC_SGIxR registers of the
 * GICv3 to be able to address each CPU directly when sending IPIs.
 * Bit 31 is set unconditionally.
 */
static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 aff0 = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0);
        u64 aff1 = ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1);
        u64 aff2 = ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2);
        u64 mpidr = aff0 | aff1 | aff2 | (1ULL << 31);

        vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1);

        return mpidr;
}

/* PMU registers are REG_HIDDEN unless the vcpu has a PMU. */
static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *r)
{
        return kvm_vcpu_has_pmu(vcpu) ? 0 : REG_HIDDEN;
}

/*
 * Reset a per-counter PMU bitmap register: reset it with
 * reset_unknown(), then keep only the bit of the cycle counter and the
 * bits of the VM's nr_pmu_counters event counters. Returns the resulting
 * value.
 */
static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u8 nr_counters = vcpu->kvm->arch.nr_pmu_counters;
        u64 implemented = BIT(ARMV8_PMU_CYCLE_IDX);

        /* GENMASK(n - 1, 0) is only meaningful with at least one counter */
        if (nr_counters != 0)
                implemented |= GENMASK(nr_counters - 1, 0);

        reset_unknown(vcpu, r);
        __vcpu_sys_reg(vcpu, r->reg) &= implemented;

        return __vcpu_sys_reg(vcpu, r->reg);
}

/*
 * Reset an event counter register with reset_unknown(), keeping only
 * its low 32 bits. Returns the resulting value.
 */
static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        const u64 low32 = GENMASK(31, 0);

        reset_unknown(vcpu, r);
        __vcpu_sys_reg(vcpu, r->reg) &= low32;

        return __vcpu_sys_reg(vcpu, r->reg);
}

/*
 * Reset an event type register with reset_unknown(), restricted to the
 * bits of kvm_pmu_evtyper_mask(). Without a PMU nothing is written and
 * 0 is returned.
 */
static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        if (kvm_vcpu_has_pmu(vcpu)) {
                reset_unknown(vcpu, r);
                __vcpu_sys_reg(vcpu, r->reg) &= kvm_pmu_evtyper_mask(vcpu->kvm);

                return __vcpu_sys_reg(vcpu, r->reg);
        }

        /* This thing will UNDEF, who cares about the reset value? */
        return 0;
}

/*
 * Reset PMSELR with reset_unknown(), keeping only the SEL field.
 * Returns the resulting value.
 */
static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        const u64 sel_bits = PMSELR_EL0_SEL_MASK;

        reset_unknown(vcpu, r);
        __vcpu_sys_reg(vcpu, r->reg) &= sel_bits;

        return __vcpu_sys_reg(vcpu, r->reg);
}

/*
 * Reset PMCR: everything cleared, except for LC which is set when
 * 32bit EL0 is not supported. Returns the resulting value.
 */
static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 pmcr = kvm_supports_32bit_el0() ? 0 : ARMV8_PMU_PMCR_LC;

        /*
         * The value of PMCR.N field is included when the
         * vCPU register is read via kvm_vcpu_read_pmcr().
         */
        __vcpu_sys_reg(vcpu, r->reg) = pmcr;

        return __vcpu_sys_reg(vcpu, r->reg);
}

/*
 * Check whether the current PMU access must be denied.
 *
 * The access is allowed if any of the PMUSERENR_EL0 bits in @flags is
 * set, or if the vcpu is in a privileged mode. Otherwise an UNDEFINED
 * exception is injected.
 *
 * Returns true if the access is denied (and UNDEF was injected), false
 * if it may proceed.
 */
static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags)
{
        u64 userenr = __vcpu_sys_reg(vcpu, PMUSERENR_EL0);

        if ((userenr & flags) || vcpu_mode_priv(vcpu))
                return false;

        kvm_inject_undefined(vcpu);

        return true;
}

static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu)
{
        return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN);
}

static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu)
{
        return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN);
}

static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu)
{
        return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN);
}

static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu)
{
        return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN);
}

/*
 * PMCR accessor.
 *
 * Returns false if the access is denied by pmu_access_el0_disabled()
 * (UNDEF already injected), true otherwise.
 */
static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
{
        u64 val;

        if (pmu_access_el0_disabled(vcpu))
                return false;

        if (!p->is_write) {
                /* PMCR.P & PMCR.C are RAZ */
                val = kvm_vcpu_read_pmcr(vcpu);
                val &= ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
                p->regval = val;

                return true;
        }

        /*
         * Only update writeable bits of PMCR (continuing into
         * kvm_pmu_handle_pmcr() as well)
         */
        val = kvm_vcpu_read_pmcr(vcpu);
        val = (val & ~ARMV8_PMU_PMCR_MASK) | (p->regval & ARMV8_PMU_PMCR_MASK);

        /* LC is forced on when 32bit EL0 is not supported */
        if (!kvm_supports_32bit_el0())
                val |= ARMV8_PMU_PMCR_LC;

        kvm_pmu_handle_pmcr(vcpu, val);

        return true;
}

/*
 * PMSELR accessor: writes store the value as given, reads return only
 * the SEL field. Returns false if the access is denied by
 * pmu_access_event_counter_el0_disabled() (UNDEF already injected).
 */
static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        if (pmu_access_event_counter_el0_disabled(vcpu))
                return false;

        if (!p->is_write) {
                /* return PMSELR.SEL field */
                p->regval = __vcpu_sys_reg(vcpu, PMSELR_EL0)
                            & PMSELR_EL0_SEL_MASK;
                return true;
        }

        __vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;

        return true;
}

/*
 * PMCEID0/PMCEID1 accessor (read-only, BUG on write). Op2 bit 0 selects
 * which of the two registers is read; the part of the value selected by
 * get_access_mask() is returned. Returns false if the access is denied
 * by pmu_access_el0_disabled() (UNDEF already injected).
 */
static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        u64 events, mask, shift;

        BUG_ON(p->is_write);

        if (pmu_access_el0_disabled(vcpu))
                return false;

        get_access_mask(r, &mask, &shift);

        events = kvm_pmu_get_pmceid(vcpu, (p->Op2 & 1));
        p->regval = (events & mask) >> shift;

        return true;
}

/*
 * Check that @idx designates a counter the guest can use: either an
 * event counter below PMCR.N, or the cycle counter.
 *
 * Returns true if valid; otherwise injects an UNDEFINED exception and
 * returns false.
 */
static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
{
        u64 pmcr = kvm_vcpu_read_pmcr(vcpu);
        u64 nr_counters = FIELD_GET(ARMV8_PMU_PMCR_N, pmcr);

        if (idx < nr_counters || idx == ARMV8_PMU_CYCLE_IDX)
                return true;

        kvm_inject_undefined(vcpu);

        return false;
}

/*
 * Userspace read of a PMU counter. The counter index is the cycle
 * counter for PMCCNTR_EL0 (CRn 9, CRm 13, Op2 0), and is otherwise
 * decoded from CRm[1:0]:Op2[2:0] (PMEVCNTRn_EL0). Always returns 0.
 */
static int get_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 *val)
{
        bool is_pmccntr = (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0);
        u64 idx;

        if (!is_pmccntr)
                /* PMEVCNTRn_EL0 */
                idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
        else
                /* PMCCNTR_EL0 */
                idx = ARMV8_PMU_CYCLE_IDX;

        *val = kvm_pmu_get_counter_value(vcpu, idx);

        return 0;
}

/*
 * Userspace write of a PMU counter. The counter index is the cycle
 * counter for PMCCNTR_EL0 (CRn 9, CRm 13, Op2 0), and is otherwise
 * decoded from CRm[1:0]:Op2[2:0] (PMEVCNTRn_EL0). Always returns 0.
 */
static int set_pmu_evcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 val)
{
        bool is_pmccntr = (r->CRn == 9 && r->CRm == 13 && r->Op2 == 0);
        u64 idx;

        if (!is_pmccntr)
                /* PMEVCNTRn_EL0 */
                idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
        else
                /* PMCCNTR_EL0 */
                idx = ARMV8_PMU_CYCLE_IDX;

        kvm_pmu_set_counter_value_user(vcpu, idx, val);

        return 0;
}

/*
 * Trap handler for the PMU counter registers.
 *
 * The register encoding selects the counter and the EL0 permission check:
 *
 *   CRn 9,  CRm 13, Op2 2   PMXEVCNTR_EL0: counter selected by PMSELR_EL0.SEL,
 *                           event counter check
 *   CRn 9,  CRm 13, Op2 0   PMCCNTR_EL0: cycle counter, cycle counter check
 *   CRn 0,  CRm 9           PMCCNTR (AArch32 64bit access): cycle counter,
 *                           cycle counter check
 *   CRn 14, CRm 0b10xx      PMEVCNTRn_EL0: counter CRm[1:0]:Op2[2:0],
 *                           event counter check
 *
 * On top of that, writes additionally go through pmu_access_el0_disabled().
 *
 * Returns false if the access is denied or the counter index is not
 * valid (an UNDEFINED exception has been injected in both cases), true
 * once the access has been performed.
 */
static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
                              struct sys_reg_params *p,
                              const struct sys_reg_desc *r)
{
        u64 idx = ~0UL;

        if (r->CRn == 9 && r->CRm == 13) {
                if (r->Op2 == 2) {
                        /* PMXEVCNTR_EL0 */
                        if (pmu_access_event_counter_el0_disabled(vcpu))
                                return false;

                        idx = SYS_FIELD_GET(PMSELR_EL0, SEL,
                                            __vcpu_sys_reg(vcpu, PMSELR_EL0));
                } else if (r->Op2 == 0) {
                        /* PMCCNTR_EL0 */
                        if (pmu_access_cycle_counter_el0_disabled(vcpu))
                                return false;

                        idx = ARMV8_PMU_CYCLE_IDX;
                }
        } else if (r->CRn == 0 && r->CRm == 9) {
                /*
                 * PMCCNTR: this is the cycle counter, so EL0 access is
                 * gated by the cycle counter check, exactly like the
                 * PMCCNTR_EL0 case above (and not by the event counter
                 * check).
                 */
                if (pmu_access_cycle_counter_el0_disabled(vcpu))
                        return false;

                idx = ARMV8_PMU_CYCLE_IDX;
        } else if (r->CRn == 14 && (r->CRm & 12) == 8) {
                /* PMEVCNTRn_EL0 */
                if (pmu_access_event_counter_el0_disabled(vcpu))
                        return false;

                idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
        }

        /*
         * Catch any decoding mistake. An undecoded index is then rejected
         * by pmu_counter_idx_valid() below, as it is neither below PMCR.N
         * nor the cycle counter.
         */
        WARN_ON(idx == ~0UL);

        if (!pmu_counter_idx_valid(vcpu, idx))
                return false;

        if (p->is_write) {
                if (pmu_access_el0_disabled(vcpu))
                        return false;

                kvm_pmu_set_counter_value(vcpu, idx, p->regval);
        } else {
                p->regval = kvm_pmu_get_counter_value(vcpu, idx);
        }

        return true;
}

/*
 * Trap handler for the PMU event type registers: PMXEVTYPER_EL0 (indirect,
 * via PMSELR_EL0.SEL) and the direct PMEVTYPERn_EL0 / PMCCFILTR_EL0
 * encodings. Any other encoding reaching this handler is a BUG().
 *
 * Returns true when the access has been emulated, false otherwise.
 */
static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                               const struct sys_reg_desc *r)
{
        u64 idx, reg;

        if (pmu_access_el0_disabled(vcpu))
                return false;

        if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) {
                /* PMXEVTYPER_EL0: the counter is picked by PMSELR_EL0.SEL */
                idx = SYS_FIELD_GET(PMSELR_EL0, SEL, __vcpu_sys_reg(vcpu, PMSELR_EL0));
                reg = PMEVTYPER0_EL0 + idx;
        } else if (r->CRn == 14 && (r->CRm & 12) == 12) {
                /* PMEVTYPERn_EL0, or PMCCFILTR_EL0 for the cycle counter */
                idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
                reg = (idx == ARMV8_PMU_CYCLE_IDX) ? PMCCFILTR_EL0
                                                   : PMEVTYPER0_EL0 + idx;
        } else {
                BUG();
        }

        if (!pmu_counter_idx_valid(vcpu, idx))
                return false;

        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, reg);
                return true;
        }

        kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
        kvm_vcpu_pmu_restore_guest(vcpu);

        return true;
}

/*
 * Userspace write to a PMU bitmap register: only the bits covered by
 * kvm_pmu_accessible_counter_mask() are stored in r->reg, and a PMU
 * reload is requested for the vCPU. Always returns 0.
 */
static int set_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 val)
{
        u64 accessible = kvm_pmu_accessible_counter_mask(vcpu);

        val &= accessible;
        __vcpu_sys_reg(vcpu, r->reg) = val;

        kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
        return 0;
}

/*
 * Userspace read of a PMU bitmap register: report the stored value of
 * r->reg restricted to kvm_pmu_accessible_counter_mask(). Always returns 0.
 */
static int get_pmreg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r, u64 *val)
{
        u64 accessible = kvm_pmu_accessible_counter_mask(vcpu);
        u64 stored = __vcpu_sys_reg(vcpu, r->reg);

        *val = stored & accessible;

        return 0;
}

/*
 * Trap handler for PMCNTENSET_EL0 / PMCNTENCLR_EL0. Both views are backed
 * by the PMCNTENSET_EL0 shadow register; Op2 bit 0 tells a "set" access
 * from a "clear" one. Writes only affect accessible counters and trigger
 * a reprogramming of the counters that were written.
 *
 * Returns true when the access has been emulated, false otherwise.
 */
static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        u64 bits, accessible;

        if (pmu_access_el0_disabled(vcpu))
                return false;

        accessible = kvm_pmu_accessible_counter_mask(vcpu);

        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
                return true;
        }

        bits = p->regval & accessible;

        if (r->Op2 & 0x1) {
                /* accessing PMCNTENSET_EL0 */
                __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= bits;
        } else {
                /* accessing PMCNTENCLR_EL0 */
                __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~bits;
        }

        kvm_pmu_reprogram_counter_mask(vcpu, bits);

        return true;
}

/*
 * Trap handler for PMINTENSET_EL1 / PMINTENCLR_EL1. Both views are backed
 * by the PMINTENSET_EL1 shadow register; Op2 bit 0 tells a "set" access
 * from a "clear" one. Writes only affect accessible counters.
 *
 * Returns true when the access has been emulated, false otherwise.
 */
static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        u64 accessible = kvm_pmu_accessible_counter_mask(vcpu);
        u64 bits;

        if (check_pmu_access_disabled(vcpu, 0))
                return false;

        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1);
                return true;
        }

        bits = p->regval & accessible;

        if (r->Op2 & 0x1) {
                /* accessing PMINTENSET_EL1 */
                __vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= bits;
        } else {
                /* accessing PMINTENCLR_EL1 */
                __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~bits;
        }

        return true;
}

/*
 * Trap handler for PMOVSSET_EL0 / PMOVSCLR_EL0. Both views are backed by
 * the PMOVSSET_EL0 shadow register; CRm bit 1 tells a "set" access from a
 * "clear" one. Writes only affect accessible counters.
 *
 * Returns true when the access has been emulated, false otherwise.
 */
static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                         const struct sys_reg_desc *r)
{
        u64 accessible = kvm_pmu_accessible_counter_mask(vcpu);
        u64 bits;

        if (pmu_access_el0_disabled(vcpu))
                return false;

        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
                return true;
        }

        bits = p->regval & accessible;

        if (r->CRm & 0x2) {
                /* accessing PMOVSSET_EL0 */
                __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= bits;
        } else {
                /* accessing PMOVSCLR_EL0 */
                __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~bits;
        }

        return true;
}

/*
 * Trap handler for the software increment register. Reads are handed to
 * read_from_write_only(); writes increment the accessible counters
 * selected by the written value, provided the access is not disabled.
 */
static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        u64 accessible;

        if (p->is_write) {
                if (pmu_write_swinc_el0_disabled(vcpu))
                        return false;

                accessible = kvm_pmu_accessible_counter_mask(vcpu);
                kvm_pmu_software_increment(vcpu, p->regval & accessible);
                return true;
        }

        return read_from_write_only(vcpu, p, r);
}

/*
 * Trap handler for PMUSERENR_EL0. Reads return the stored value masked
 * with ARMV8_PMU_USERENR_MASK. Writes are only accepted from a privileged
 * mode (anything else goes to undef_access()) and are stored with the
 * same mask applied.
 */
static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                             const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, PMUSERENR_EL0)
                            & ARMV8_PMU_USERENR_MASK;
                return true;
        }

        if (!vcpu_mode_priv(vcpu))
                return undef_access(vcpu, p, r);

        __vcpu_sys_reg(vcpu, PMUSERENR_EL0) =
                       p->regval & ARMV8_PMU_USERENR_MASK;

        return true;
}

/*
 * Userspace read of PMCR_EL0: report what kvm_vcpu_read_pmcr() returns
 * for this vCPU. Always returns 0.
 */
static int get_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                    u64 *val)
{
        u64 pmcr = kvm_vcpu_read_pmcr(vcpu);

        *val = pmcr;

        return 0;
}

/*
 * Userspace write to PMCR_EL0.
 *
 * PMCR_EL0.N may update the VM-wide number of PMU counters (under
 * kvm->arch.config_lock), and the remaining bits are filtered before
 * being stored in the vCPU's copy of the register. A PMU reload is then
 * requested. Always returns 0.
 */
static int set_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                    u64 val)
{
        struct kvm *kvm = vcpu->kvm;
        u8 nr_counters = FIELD_GET(ARMV8_PMU_PMCR_N, val);

        mutex_lock(&kvm->arch.config_lock);

        /*
         * A vCPU cannot be given more counters than the PMU hardware
         * implements. An out-of-range request is silently dropped rather
         * than reported, to stay compatible with the existing KVM
         * behavior. Nothing changes either once the VM has run, or for
         * a vCPU with NV.
         */
        if (!kvm_vm_has_ran_once(kvm) && !vcpu_has_nv(vcpu)) {
                if (nr_counters <= kvm_arm_pmu_get_max_counters(kvm))
                        kvm->arch.nr_pmu_counters = nr_counters;
        }

        mutex_unlock(&kvm->arch.config_lock);

        /*
         * Drop writes to RES0 bits, to read-only bits that are cleared on
         * vCPU reset, and to writable bits KVM doesn't support yet
         * (i.e. only PMCR.N and bits [7:0] are mutable from userspace).
         * The LP bit is RES0 when FEAT_PMUv3p5 is not supported on the
         * vCPU, but it is left untouched here since the vCPU's PMUver
         * might still change (NOTE: the bit will be cleared on first
         * vCPU run if necessary).
         */
        val &= ARMV8_PMU_PMCR_MASK;

        /* The LC bit is RES1 when AArch32 is not supported */
        if (!kvm_supports_32bit_el0())
                val |= ARMV8_PMU_PMCR_LC;

        __vcpu_sys_reg(vcpu, r->reg) = val;
        kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);

        return 0;
}

/*
 * Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go.
 *
 * Emits four descriptor initialisers for index n, in the order DBGBVRn_EL1,
 * DBGBCRn_EL1, DBGWVRn_EL1, DBGWCRn_EL1. All four use the same
 * trap_dbg_wb_reg / reset_dbg_wb_reg / get_dbg_wb_reg / set_dbg_wb_reg
 * handlers. The last initialiser carries no trailing comma, so the user of
 * the macro supplies it.
 */
#define DBG_BCR_BVR_WCR_WVR_EL1(n)                                        \
        { SYS_DESC(SYS_DBGBVRn_EL1(n)),                                        \
          trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0,                        \
          get_dbg_wb_reg, set_dbg_wb_reg },                                \
        { SYS_DESC(SYS_DBGBCRn_EL1(n)),                                        \
          trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0,                        \
          get_dbg_wb_reg, set_dbg_wb_reg },                                \
        { SYS_DESC(SYS_DBGWVRn_EL1(n)),                                        \
          trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0,                        \
          get_dbg_wb_reg, set_dbg_wb_reg },                                \
        { SYS_DESC(SYS_DBGWCRn_EL1(n)),                                        \
          trap_dbg_wb_reg, reset_dbg_wb_reg, 0, 0,                        \
          get_dbg_wb_reg, set_dbg_wb_reg }

/*
 * Common part of a PMU register descriptor: the encoding of SYS_<name>,
 * reset_pmu_reg as the reset handler and pmu_visibility as the visibility
 * callback. This is a list of initialisers without enclosing braces, so
 * users can append further designated initialisers after it.
 */
#define PMU_SYS_REG(name)                                                \
        SYS_DESC(SYS_##name), .reset = reset_pmu_reg,                        \
        .visibility = pmu_visibility

/*
 * Macro to expand the PMEVCNTRn_EL0 register.
 *
 * Builds on PMU_SYS_REG() and then names reset_pmevcntr as .reset (after
 * the one already supplied by PMU_SYS_REG()), wires the userspace
 * accessors and the trap handler, and backs the register with
 * PMEVCNTR0_EL0 + n.
 */
#define PMU_PMEVCNTR_EL0(n)                                                \
        { PMU_SYS_REG(PMEVCNTRn_EL0(n)),                                \
          .reset = reset_pmevcntr, .get_user = get_pmu_evcntr,                \
          .set_user = set_pmu_evcntr,                                        \
          .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), }

/*
 * Macro to expand the PMEVTYPERn_EL0 register.
 *
 * Builds on PMU_SYS_REG() and then names reset_pmevtyper as .reset (after
 * the one already supplied by PMU_SYS_REG()), uses access_pmu_evtyper as
 * the trap handler, and backs the register with PMEVTYPER0_EL0 + n.
 */
#define PMU_PMEVTYPER_EL0(n)                                                \
        { PMU_SYS_REG(PMEVTYPERn_EL0(n)),                                \
          .reset = reset_pmevtyper,                                        \
          .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), }

/*
 * Macros to expand the AMU counter and type registers. Every one of them
 * produces a descriptor whose handler is undef_access.
 */
#define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), undef_access }
#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), undef_access }
#define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), undef_access }
#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), undef_access }

/*
 * Visibility callback for the pointer authentication registers: they are
 * hidden unless the vCPU has PtrAuth.
 */
static unsigned int ptrauth_visibility(const struct kvm_vcpu *vcpu,
                        const struct sys_reg_desc *rd)
{
        if (!vcpu_has_ptrauth(vcpu))
                return REG_HIDDEN;

        return 0;
}

/*
 * If we land here on a PtrAuth access, that is because we didn't
 * fixup the access on exit by allowing the PtrAuth sysregs. The only
 * way this happens is when the guest does not have PtrAuth support
 * enabled.
 *
 * __PTRAUTH_KEY(k) describes a single key register SYS_<k>, handled by
 * undef_access, reset with reset_unknown and gated by ptrauth_visibility.
 * PTRAUTH_KEY(k) expands to the <k>KEYLO_EL1 / <k>KEYHI_EL1 pair.
 */
#define __PTRAUTH_KEY(k)                                                \
        { SYS_DESC(SYS_## k), undef_access, reset_unknown, k,                \
        .visibility = ptrauth_visibility}

#define PTRAUTH_KEY(k)                                                        \
        __PTRAUTH_KEY(k ## KEYLO_EL1),                                        \
        __PTRAUTH_KEY(k ## KEYHI_EL1)

/*
 * Trap handler for the architected timer registers.
 *
 * The register encoding is decoded into a (timer, timer register) pair,
 * which is then handed to kvm_arm_timer_{read,write}_sysreg():
 *
 *  - the EL0 TVAL/CTL/CVAL encodings target the EL2 (hyp) timers when the
 *    vCPU is in hyp context with E2H set, and the guest timers otherwise;
 *  - the EL0 counter encodings target the hyp timers whenever the vCPU is
 *    in hyp context;
 *  - the AArch32, _EL02 and _EL2 encodings always map to a fixed timer.
 *
 * Unknown encodings are logged and handled by undef_access().
 */
static bool access_arch_timer(struct kvm_vcpu *vcpu,
                              struct sys_reg_params *p,
                              const struct sys_reg_desc *r)
{
        u64 encoding = reg_to_encoding(r);
        enum kvm_arch_timer_regs treg;
        enum kvm_arch_timers tmr;

        switch (encoding) {
        /* Timer value (TVAL) */
        case SYS_CNTP_TVAL_EL0:
                tmr = (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
                        TIMER_HPTIMER : TIMER_PTIMER;
                treg = TIMER_REG_TVAL;
                break;

        case SYS_CNTV_TVAL_EL0:
                tmr = (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
                        TIMER_HVTIMER : TIMER_VTIMER;
                treg = TIMER_REG_TVAL;
                break;

        case SYS_AARCH32_CNTP_TVAL:
        case SYS_CNTP_TVAL_EL02:
                tmr = TIMER_PTIMER;
                treg = TIMER_REG_TVAL;
                break;

        case SYS_CNTV_TVAL_EL02:
                tmr = TIMER_VTIMER;
                treg = TIMER_REG_TVAL;
                break;

        case SYS_CNTHP_TVAL_EL2:
                tmr = TIMER_HPTIMER;
                treg = TIMER_REG_TVAL;
                break;

        case SYS_CNTHV_TVAL_EL2:
                tmr = TIMER_HVTIMER;
                treg = TIMER_REG_TVAL;
                break;

        /* Control (CTL) */
        case SYS_CNTP_CTL_EL0:
                tmr = (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
                        TIMER_HPTIMER : TIMER_PTIMER;
                treg = TIMER_REG_CTL;
                break;

        case SYS_CNTV_CTL_EL0:
                tmr = (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
                        TIMER_HVTIMER : TIMER_VTIMER;
                treg = TIMER_REG_CTL;
                break;

        case SYS_AARCH32_CNTP_CTL:
        case SYS_CNTP_CTL_EL02:
                tmr = TIMER_PTIMER;
                treg = TIMER_REG_CTL;
                break;

        case SYS_CNTV_CTL_EL02:
                tmr = TIMER_VTIMER;
                treg = TIMER_REG_CTL;
                break;

        case SYS_CNTHP_CTL_EL2:
                tmr = TIMER_HPTIMER;
                treg = TIMER_REG_CTL;
                break;

        case SYS_CNTHV_CTL_EL2:
                tmr = TIMER_HVTIMER;
                treg = TIMER_REG_CTL;
                break;

        /* Compare value (CVAL) */
        case SYS_CNTP_CVAL_EL0:
                tmr = (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
                        TIMER_HPTIMER : TIMER_PTIMER;
                treg = TIMER_REG_CVAL;
                break;

        case SYS_CNTV_CVAL_EL0:
                tmr = (is_hyp_ctxt(vcpu) && vcpu_el2_e2h_is_set(vcpu)) ?
                        TIMER_HVTIMER : TIMER_VTIMER;
                treg = TIMER_REG_CVAL;
                break;

        case SYS_AARCH32_CNTP_CVAL:
        case SYS_CNTP_CVAL_EL02:
                tmr = TIMER_PTIMER;
                treg = TIMER_REG_CVAL;
                break;

        case SYS_CNTV_CVAL_EL02:
                tmr = TIMER_VTIMER;
                treg = TIMER_REG_CVAL;
                break;

        case SYS_CNTHP_CVAL_EL2:
                tmr = TIMER_HPTIMER;
                treg = TIMER_REG_CVAL;
                break;

        case SYS_CNTHV_CVAL_EL2:
                tmr = TIMER_HVTIMER;
                treg = TIMER_REG_CVAL;
                break;

        /* Counters (CNT) */
        case SYS_CNTPCT_EL0:
        case SYS_CNTPCTSS_EL0:
                tmr = is_hyp_ctxt(vcpu) ? TIMER_HPTIMER : TIMER_PTIMER;
                treg = TIMER_REG_CNT;
                break;

        case SYS_AARCH32_CNTPCT:
        case SYS_AARCH32_CNTPCTSS:
                tmr = TIMER_PTIMER;
                treg = TIMER_REG_CNT;
                break;

        case SYS_CNTVCT_EL0:
        case SYS_CNTVCTSS_EL0:
                tmr = is_hyp_ctxt(vcpu) ? TIMER_HVTIMER : TIMER_VTIMER;
                treg = TIMER_REG_CNT;
                break;

        case SYS_AARCH32_CNTVCT:
        case SYS_AARCH32_CNTVCTSS:
                tmr = TIMER_VTIMER;
                treg = TIMER_REG_CNT;
                break;

        default:
                print_sys_reg_msg(p, "%s", "Unhandled trapped timer register");
                return undef_access(vcpu, p, r);
        }

        if (!p->is_write)
                p->regval = kvm_arm_timer_read_sysreg(vcpu, tmr, treg);
        else
                kvm_arm_timer_write_sysreg(vcpu, tmr, treg, p->regval);

        return true;
}

/*
 * Trap handler for timer registers that are only handled when E2H is set
 * for the vCPU: in that case defer to access_arch_timer(), otherwise the
 * access goes to undef_access().
 */
static bool access_hv_timer(struct kvm_vcpu *vcpu,
                            struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        if (vcpu_el2_e2h_is_set(vcpu))
                return access_arch_timer(vcpu, p, r);

        return undef_access(vcpu, p, r);
}

/*
 * KVM flavour of arm64_ftr_safe_value().
 *
 * Some features have a different safe value type in KVM than they have as
 * host features: ID_AA64DFR0_EL1.{PMUVer,DebugVer} and ID_DFR0_EL1.PerfMon
 * are evaluated as FTR_LOWER_SAFE. The override is applied to a local
 * copy of the field description, @ftrp itself is left untouched.
 */
static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp,
                                    s64 new, s64 cur)
{
        struct arm64_ftr_bits kvm_ftr = *ftrp;
        bool force_lower_safe = false;

        if (id == SYS_ID_AA64DFR0_EL1)
                force_lower_safe =
                        kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT ||
                        kvm_ftr.shift == ID_AA64DFR0_EL1_DebugVer_SHIFT;
        else if (id == SYS_ID_DFR0_EL1)
                force_lower_safe = kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT;

        if (force_lower_safe)
                kvm_ftr.type = FTR_LOWER_SAFE;

        return arm64_ftr_safe_value(&kvm_ftr, new, cur);
}

/*
 * arm64_check_features() - Check if a feature register value constitutes
 * a subset of features indicated by the idreg's KVM sanitised limit.
 *
 * Each writable feature field of @val is checked against the same field
 * of the KVM sanitised limit returned by the reset() callback. A field
 * that is equal to the limit is always considered safe; otherwise it must
 * be the "safe" value as computed by kvm_arm64_ftr_safe_value(). For
 * fields that are not writable, only the value found in the limit is
 * considered safe.
 *
 * Return: 0 if all the fields are safe. Otherwise, return negative errno.
 */
static int arm64_check_features(struct kvm_vcpu *vcpu,
                                const struct sys_reg_desc *rd,
                                u64 val)
{
        u64 limit = rd->reset(vcpu, rd);
        u64 writable_mask = rd->val;
        u32 id = reg_to_encoding(rd);
        const struct arm64_ftr_reg *ftr_reg;
        const struct arm64_ftr_bits *ftrp;
        u64 checked = 0;

        /*
         * Hidden and unallocated ID registers may not have a corresponding
         * struct arm64_ftr_reg. Of course, if the register is RAZ we know the
         * only safe value is 0.
         */
        if (sysreg_visible_as_raz(vcpu, rd)) {
                if (val)
                        return -E2BIG;
                return 0;
        }

        ftr_reg = get_arm64_ftr_reg(id);
        if (!ftr_reg)
                return -EINVAL;

        for (ftrp = ftr_reg->ftr_bits; ftrp && ftrp->width; ftrp++) {
                u64 field_mask = arm64_ftr_mask(ftrp);
                s64 requested, allowed;

                /* Only fields that are writable in full are checked here */
                if ((field_mask & writable_mask) != field_mask)
                        continue;

                requested = arm64_ftr_value(ftrp, val);
                allowed = arm64_ftr_value(ftrp, limit);
                checked |= field_mask;

                /* Matching the limit is always fine */
                if (requested == allowed)
                        continue;

                if (kvm_arm64_ftr_safe_value(id, ftrp, requested, allowed) != requested)
                        return -E2BIG;
        }

        /* For fields that are not writable, values in limit are the safe values. */
        if ((val & ~checked) != (limit & ~checked))
                return -E2BIG;

        return 0;
}

/*
 * Convert an ID_AA64DFR0_EL1.PMUVer value into the matching
 * ID_DFR0_EL1.PerfMon value.
 */
static u8 pmuver_to_perfmon(u8 pmuver)
{
        if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP)
                return ID_DFR0_EL1_PerfMon_PMUv3;

        if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
                return ID_DFR0_EL1_PerfMon_IMPDEF;

        /* Anything ARMv8.1+ and NI have the same value. For now. */
        return pmuver;
}

/*
 * Forward declarations: both helpers are defined further down in this
 * file but are already needed by __kvm_read_sanitised_id_reg() below.
 */
static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val);
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val);

/*
 * Read a sanitised cpufeature ID register by sys_reg_desc.
 *
 * Starts from the host's sanitised value and removes (or overrides) the
 * fields that are not exposed for this vCPU. Registers that are visible
 * as RAZ read as 0. For a vCPU with NV, the result is further restricted
 * by limit_nv_id_reg().
 */
static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu,
                                       const struct sys_reg_desc *r)
{
        u32 id = reg_to_encoding(r);
        u64 val;

        if (sysreg_visible_as_raz(vcpu, r))
                return 0;

        val = read_sanitised_ftr_reg(id);

        switch (id) {
        case SYS_ID_AA64DFR0_EL1:
                val = sanitise_id_aa64dfr0_el1(vcpu, val);
                break;
        case SYS_ID_AA64PFR0_EL1:
                val = sanitise_id_aa64pfr0_el1(vcpu, val);
                break;
        case SYS_ID_AA64PFR1_EL1: {
                /* Fields that are never exposed */
                u64 hidden = ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR) |
                             ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MPAM_frac);

                /* The MTE fields go as well when the VM has no MTE */
                if (!kvm_has_mte(vcpu->kvm))
                        hidden |= ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE) |
                                  ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac);

                val &= ~hidden;
                break;
        }
        case SYS_ID_AA64PFR2_EL1:
                /* We only expose FPMR */
                val &= ID_AA64PFR2_EL1_FPMR;
                break;
        case SYS_ID_AA64ISAR1_EL1:
                if (!vcpu_has_ptrauth(vcpu)) {
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA);
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API);
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA);
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI);
                }
                break;
        case SYS_ID_AA64ISAR2_EL1:
                if (!vcpu_has_ptrauth(vcpu)) {
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_APA3);
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_GPA3);
                }
                if (!cpus_have_final_cap(ARM64_HAS_WFXT) ||
                    has_broken_cntvoff())
                        val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT);
                break;
        case SYS_ID_AA64ISAR3_EL1:
                /* Only FPRCVT and FAMINMAX are kept */
                val &= ID_AA64ISAR3_EL1_FPRCVT | ID_AA64ISAR3_EL1_FAMINMAX;
                break;
        case SYS_ID_AA64MMFR2_EL1:
                val &= ~(ID_AA64MMFR2_EL1_CCIDX_MASK | ID_AA64MMFR2_EL1_NV);
                break;
        case SYS_ID_AA64MMFR3_EL1:
                /* Only TCRX, S1POE and S1PIE are kept */
                val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE |
                        ID_AA64MMFR3_EL1_S1PIE;
                break;
        case SYS_ID_MMFR4_EL1:
                val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX);
                break;
        }

        if (!vcpu_has_nv(vcpu))
                return val;

        return limit_nv_id_reg(vcpu->kvm, id, val);
}

/*
 * Variant of __kvm_read_sanitised_id_reg() that takes a non-const vCPU
 * pointer; the work is done by that function.
 */
static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu,
                                     const struct sys_reg_desc *r)
{
        u64 val = __kvm_read_sanitised_id_reg(vcpu, r);

        return val;
}

static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        return kvm_read_vm_id_reg(vcpu->kvm, reg_to_encoding(r));
}

/*
 * Return true if @encoding is (Op0, Op1, CRn, CRm) == (3, {0, 1, 3}, 0, 0..7),
 * whatever Op2 is.
 */
static bool is_feature_id_reg(u32 encoding)
{
        if (sys_reg_Op0(encoding) != 3)
                return false;

        if (sys_reg_Op1(encoding) >= 2 && sys_reg_Op1(encoding) != 3)
                return false;

        if (sys_reg_CRn(encoding) != 0)
                return false;

        return sys_reg_CRm(encoding) <= 7;
}

/*
 * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is
 * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8, which is the range of ID
 * registers KVM maintains on a per-VM basis.
 *
 * Additionally, the implementation ID registers (MIDR_EL1, REVIDR_EL1,
 * AIDR_EL1) and CTR_EL0 are handled as per-VM registers.
 */
static inline bool is_vm_ftr_id_reg(u32 id)
{
        if (id == SYS_CTR_EL0 || id == SYS_MIDR_EL1 ||
            id == SYS_REVIDR_EL1 || id == SYS_AIDR_EL1)
                return true;

        if (sys_reg_Op0(id) != 3 || sys_reg_Op1(id) != 0 ||
            sys_reg_CRn(id) != 0)
                return false;

        return sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) < 8;
}

/*
 * Return true for feature ID registers that are not part of the set
 * maintained on a per-VM basis.
 */
static inline bool is_vcpu_ftr_id_reg(u32 id)
{
        if (!is_feature_id_reg(id))
                return false;

        return !is_vm_ftr_id_reg(id);
}

/*
 * Return true if @id is (Op0, Op1, CRn, CRm) == (3, 0, 0, 1..3), whatever
 * Op2 is.
 */
static inline bool is_aa32_id_reg(u32 id)
{
        if (sys_reg_Op0(id) != 3 || sys_reg_Op1(id) != 0 ||
            sys_reg_CRn(id) != 0)
                return false;

        return sys_reg_CRm(id) >= 1 && sys_reg_CRm(id) <= 3;
}

/*
 * Visibility callback for ID registers: ID_AA64ZFR0_EL1 is RAZ for a vCPU
 * without SVE, everything else is fully visible.
 */
static unsigned int id_visibility(const struct kvm_vcpu *vcpu,
                                  const struct sys_reg_desc *r)
{
        u32 id = reg_to_encoding(r);

        if (id == SYS_ID_AA64ZFR0_EL1 && !vcpu_has_sve(vcpu))
                return REG_RAZ;

        return 0;
}

/* Visibility callback for the AArch32 ID registers. */
static unsigned int aa32_id_visibility(const struct kvm_vcpu *vcpu,
                                       const struct sys_reg_desc *r)
{
        if (kvm_supports_32bit_el0())
                return id_visibility(vcpu, r);

        /*
         * AArch32 ID registers are UNKNOWN if AArch32 isn't implemented at any
         * EL. Promote to RAZ/WI in order to guarantee consistency between
         * systems.
         */
        return REG_RAZ | REG_USER_WI;
}

/*
 * Visibility callback for registers that are unconditionally RAZ: neither
 * the vCPU nor the descriptor is consulted.
 */
static unsigned int raz_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *r)
{
        return REG_RAZ;
}

/* cpufeature ID register access trap handlers */

/*
 * Trap handler for ID registers: reads return the VM's view of the
 * register, writes are handed to write_to_read_only().
 */
static bool access_id_reg(struct kvm_vcpu *vcpu,
                          struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = read_id_reg(vcpu, r);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Visibility override for SVE-specific control registers: hidden unless
 * the vCPU has SVE.
 */
static unsigned int sve_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd)
{
        return vcpu_has_sve(vcpu) ? 0 : REG_HIDDEN;
}

/*
 * Visibility override for SME-specific registers: hidden unless the VM
 * has ID_AA64PFR1_EL1.SME at the IMP level.
 */
static unsigned int sme_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd)
{
        return kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SME, IMP) ? 0 : REG_HIDDEN;
}

/*
 * Visibility override for FP8-specific registers: hidden unless
 * kvm_has_fpmr() is true for the VM.
 */
static unsigned int fp8_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd)
{
        return kvm_has_fpmr(vcpu->kvm) ? 0 : REG_HIDDEN;
}

/*
 * Turn a host ID_AA64PFR0_EL1 value into the one used for @vcpu: hide
 * what is not exposed (SVE when the vCPU lacks it, AMU, MPAM) and
 * override CSV2, CSV3 and GIC where applicable.
 */
static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
        /*
         * AMU is never exposed.
         *
         * MPAM is disabled by default as KVM also needs a set of PARTID to
         * program the MPAMVPMx_EL2 PARTID remapping registers with. But some
         * older kernels let the guest see the ID bit.
         */
        u64 hidden = ID_AA64PFR0_EL1_AMU_MASK | ID_AA64PFR0_EL1_MPAM_MASK;

        if (!vcpu_has_sve(vcpu))
                hidden |= ID_AA64PFR0_EL1_SVE_MASK;

        val &= ~hidden;

        /*
         * The default is to expose CSV2 == 1 if the HW isn't affected.
         * Although this is a per-CPU feature, we make it global because
         * asymmetric systems are just a nuisance.
         *
         * Userspace can override this as long as it doesn't promise
         * the impossible.
         */
        if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED)
                val = (val & ~ID_AA64PFR0_EL1_CSV2_MASK) |
                      SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP);

        if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED)
                val = (val & ~ID_AA64PFR0_EL1_CSV3_MASK) |
                      SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP);

        if (kvm_vgic_global_state.type == VGIC_V3)
                val = (val & ~ID_AA64PFR0_EL1_GIC_MASK) |
                      SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);

        return val;
}

/*
 * Turn a host ID_AA64DFR0_EL1 value into the one used for @vcpu: DebugVer
 * is capped at V8P8, PMUVer reflects the vCPU's PMU configuration, and
 * SPE and BRBE are hidden.
 */
static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu, u64 val)
{
        u64 pmuver = 0;

        val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, V8P8);

        /*
         * Only initialize the PMU version if the vCPU was configured with one.
         */
        if (kvm_vcpu_has_pmu(vcpu))
                pmuver = SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer,
                                        kvm_arm_pmu_get_pmuver_limit());

        /* Drop the host PMUVer, and hide SPE and BRBE from guests */
        val &= ~(ID_AA64DFR0_EL1_PMUVer_MASK |
                 ID_AA64DFR0_EL1_PMSVer_MASK |
                 ID_AA64DFR0_EL1_BRBE_MASK);

        return val | pmuver;
}

/*
 * Userspace write to ID_AA64DFR0_EL1.
 *
 * Rejects a DebugVer below IMP with -EINVAL, turns an IMP_DEF PMUVer into
 * NI, and hands the result to set_id_reg().
 */
static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu,
                               const struct sys_reg_desc *rd,
                               u64 val)
{
        u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val);
        u8 debugver = SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, val);

        /*
         * ID_AA64DFR0_EL1.DebugVer is one of those awkward fields with a
         * nonzero minimum safe value.
         */
        if (debugver < ID_AA64DFR0_EL1_DebugVer_IMP)
                return -EINVAL;

        /*
         * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the
         * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously
         * exposed an IMP_DEF PMU to userspace and the guest on systems w/
         * non-architectural PMUs. Of course, PMUv3 is the only game in town for
         * PMU virtualization, so the IMP_DEF value was rather user-hostile.
         *
         * At minimum, we're on the hook to allow values that were given to
         * userspace by KVM. Cover our tracks here and replace the IMP_DEF value
         * with a more sensible NI. The value of an ID register changing under
         * the nose of the guest is unfortunate, but is certainly no more
         * surprising than an ill-guided PMU driver poking at impdef system
         * registers that end in an UNDEF...
         */
        if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
                val &= ~ID_AA64DFR0_EL1_PMUVer_MASK;

        return set_id_reg(vcpu, rd, val);
}

/*
 * Compute the sanitised ID_DFR0_EL1 value for @vcpu: PerfMon is derived
 * from the PMUVer limit when the vCPU has a PMU (and is 0 otherwise), and
 * CopDbg is capped at Debugv8p8.
 */
static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu,
                                      const struct sys_reg_desc *rd)
{
        u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1);

        /* Start from no PMU, then advertise one only if the vCPU has it */
        val &= ~ID_DFR0_EL1_PerfMon_MASK;
        if (kvm_vcpu_has_pmu(vcpu)) {
                u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit());

                val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon);
        }

        return ID_REG_LIMIT_FIELD_ENUM(val, ID_DFR0_EL1, CopDbg, Debugv8p8);
}

/*
 * Userspace write to ID_DFR0_EL1.
 *
 * An IMPDEF PerfMon is turned into 0; any other nonzero PerfMon below
 * PMUv3, or a CopDbg below Armv8, is rejected with -EINVAL. The result is
 * handed to set_id_reg().
 */
static int set_id_dfr0_el1(struct kvm_vcpu *vcpu,
                           const struct sys_reg_desc *rd,
                           u64 val)
{
        u8 copdbg = SYS_FIELD_GET(ID_DFR0_EL1, CopDbg, val);
        u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val);

        if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) {
                /* Treated as "no PMU", which is always acceptable */
                val &= ~ID_DFR0_EL1_PerfMon_MASK;
        } else if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) {
                /*
                 * Allow DFR0_EL1.PerfMon to be set from userspace as long as
                 * it doesn't promise more than what the HW gives us on the
                 * AArch64 side (as everything is emulated with that), and
                 * that this is a PMUv3.
                 */
                return -EINVAL;
        }

        if (copdbg < ID_DFR0_EL1_CopDbg_Armv8)
                return -EINVAL;

        return set_id_reg(vcpu, rd, val);
}

/*
 * Userspace write to ID_AA64PFR0_EL1: a MPAM field equal to the host's
 * sanitised value is accepted but cleared before the value is handed to
 * set_id_reg().
 */
static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu,
                               const struct sys_reg_desc *rd, u64 user_val)
{
        u64 mpam_mask = ID_AA64PFR0_EL1_MPAM_MASK;
        u64 hw_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);

        /*
         * Commit 011e5f5bf529f ("arm64/cpufeature: Add remaining feature bits
         * in ID_AA64PFR0 register") exposed the MPAM field of AA64PFR0_EL1 to
         * guests, but didn't add trap handling. KVM doesn't support MPAM and
         * always returns an UNDEF for these registers. The guest must see 0
         * for this field.
         *
         * But KVM must also accept values from user-space that were provided
         * by KVM. On CPUs that support MPAM, permit user-space to write
         * the sanitised value to ID_AA64PFR0_EL1.MPAM, but ignore this field.
         */
        if (!((hw_val ^ user_val) & mpam_mask))
                user_val &= ~mpam_mask;

        return set_id_reg(vcpu, rd, user_val);
}

/*
 * Userspace write handler for ID_AA64PFR1_EL1.
 *
 * Two fields get special treatment before the value is passed on to
 * set_id_reg(): MPAM_frac is cleared when it matches the host value, and
 * MTE_frac is replaced by the host value in one legacy combination (see
 * below).
 */
static int set_id_aa64pfr1_el1(struct kvm_vcpu *vcpu,
                               const struct sys_reg_desc *rd, u64 user_val)
{
        const u64 mpam_frac = ID_AA64PFR1_EL1_MPAM_frac_MASK;
        const u64 mte_frac = ID_AA64PFR1_EL1_MTE_frac_MASK;
        u64 host_val = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
        u8 host_mte = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE, host_val);
        u8 host_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, host_val);
        u8 user_frac = SYS_FIELD_GET(ID_AA64PFR1_EL1, MTE_frac, user_val);

        /* Same reasoning as for the MPAM field in set_id_aa64pfr0_el1(). */
        if (!((host_val ^ user_val) & mpam_frac))
                user_val &= ~mpam_frac;

        /*
         * MTE_frac used to be hidden from the guest. However, if the
         * hardware supports MTE2 but not MTE_ASYM_FAULT, a value of 0 in
         * this field indicates that the hardware supports MTE_ASYNC,
         * whereas 0xf indicates that MTE_ASYNC is not supported.
         *
         * KVM must keep accepting the values it previously handed to
         * userspace, so when ID_AA64PFR1_EL1.MTE is 2 userspace may write
         * MTE_frac as 0. The written field is then discarded in favour of
         * the host value, to avoid incorrectly advertising MTE_ASYNC to
         * the guest.
         */
        if (host_mte == ID_AA64PFR1_EL1_MTE_MTE2 &&
            host_frac == ID_AA64PFR1_EL1_MTE_frac_NI &&
            user_frac == ID_AA64PFR1_EL1_MTE_frac_ASYNC)
                user_val = (user_val & ~mte_frac) | (host_val & mte_frac);

        return set_id_reg(vcpu, rd, user_val);
}

/*
 * Userspace write handler for ID_AA64MMFR0_EL1.
 *
 * For a vcpu with nested virtualisation, the three TGRANx_2 fields must
 * be written with exactly the KVM-sanitised value; any difference yields
 * -EINVAL. Otherwise the write is forwarded to set_id_reg().
 */
static int set_id_aa64mmfr0_el1(struct kvm_vcpu *vcpu,
                                const struct sys_reg_desc *rd, u64 user_val)
{
        const u64 s2_granules = ID_AA64MMFR0_EL1_TGRAN4_2_MASK |
                                ID_AA64MMFR0_EL1_TGRAN16_2_MASK |
                                ID_AA64MMFR0_EL1_TGRAN64_2_MASK;
        u64 kvm_val = kvm_read_sanitised_id_reg(vcpu, rd);

        if (vcpu_has_nv(vcpu) && ((kvm_val ^ user_val) & s2_granules))
                return -EINVAL;

        return set_id_reg(vcpu, rd, user_val);
}

/*
 * Userspace write handler for ID_AA64MMFR2_EL1.
 *
 * The NV field is cleared from the written value when it matches the
 * host's sanitised value; the result is then passed to set_id_reg().
 */
static int set_id_aa64mmfr2_el1(struct kvm_vcpu *vcpu,
                                const struct sys_reg_desc *rd, u64 user_val)
{
        const u64 nv = ID_AA64MMFR2_EL1_NV_MASK;
        u64 host_val = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);

        /*
         * The now deprecated NV field was exposed by mistake, so userspace
         * is still allowed to write it back, but it is silently ignored.
         */
        if (!((host_val ^ user_val) & nv))
                user_val &= ~nv;

        return set_id_reg(vcpu, rd, user_val);
}

/*
 * Userspace write handler for CTR_EL0.
 *
 * Returns -EINVAL for the reserved L1Ip encodings, the result of
 * set_id_reg() for VIPT and PIPT, and -ENOENT for anything else.
 */
static int set_ctr_el0(struct kvm_vcpu *vcpu,
                       const struct sys_reg_desc *rd, u64 user_val)
{
        u8 l1ip = SYS_FIELD_GET(CTR_EL0, L1Ip, user_val);

        /*
         * Both AIVIVT (0b01) and VPIPT (0b00) are documented as reserved,
         * so only VIPT (0b10) or PIPT (0b11) may be set for L1Ip, depending
         * on what the hardware reports.
         */
        if (l1ip == CTR_EL0_L1Ip_RESERVED_VPIPT ||
            l1ip == CTR_EL0_L1Ip_RESERVED_AIVIVT)
                return -EINVAL;

        /*
         * Using a VIPT software model on PIPT leads to over-invalidation
         * but remains correct, so downgrading PIPT to VIPT is fine while
         * the opposite is not. This is enforced by arm64_ftr_safe_value(),
         * as the CTR_EL0 ftr_bits describe L1Ip as FTR_EXACT with VIPT as
         * the safe value.
         */
        if (l1ip == CTR_EL0_L1Ip_VIPT || l1ip == CTR_EL0_L1Ip_PIPT)
                return set_id_reg(vcpu, rd, user_val);

        return -ENOENT;
}

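/*
 * cpufeature ID register user accessors
 *
 * get_id_reg() returns the current value of the register. set_id_reg()
 * validates a userspace write with arm64_check_features() and stores it
 * in the VM-wide copy, but only until the VM has run once; after that
 * the registers are immutable and only a write that matches the current
 * value is accepted.
 */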
/*
 * Userspace read accessor for an ID register: stores the value returned
 * by read_id_reg() in *val and always returns 0.
 */
static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                      u64 *val)
{
        /*
         * The lock is only needed while the VM has not started yet: once it
         * has run, the ID registers are guaranteed to be invariant.
         */
        if (!kvm_vm_has_ran_once(vcpu->kvm)) {
                mutex_lock(&vcpu->kvm->arch.config_lock);
                *val = read_id_reg(vcpu, rd);
                mutex_unlock(&vcpu->kvm->arch.config_lock);
        } else {
                *val = read_id_reg(vcpu, rd);
        }

        return 0;
}

/*
 * Userspace write accessor for an ID register.
 *
 * Runs entirely under kvm->arch.config_lock. Returns 0 on success,
 * -EBUSY when the VM has already run and the value differs from the
 * current one, or the error reported by arm64_check_features() (with
 * -E2BIG translated to -EINVAL).
 */
static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                      u64 val)
{
        u32 encoding = reg_to_encoding(rd);
        int err;

        mutex_lock(&vcpu->kvm->arch.config_lock);

        if (kvm_vm_has_ran_once(vcpu->kvm)) {
                /*
                 * Once the VM has started the ID registers are immutable:
                 * only a write matching the final value is tolerated.
                 */
                err = (val == read_id_reg(vcpu, rd)) ? 0 : -EBUSY;
        } else {
                err = arm64_check_features(vcpu, rd, val);
                if (!err)
                        kvm_set_vm_id_reg(vcpu->kvm, encoding, val);
        }

        mutex_unlock(&vcpu->kvm->arch.config_lock);

        /*
         * arm64_check_features() uses -E2BIG to signal that the requested
         * feature set is a superset of the maximally-allowed register
         * value. Describing this precisely to userspace would be nice, but
         * the existing KVM_SET_ONE_REG UAPI reports invalid register writes
         * as -EINVAL.
         */
        return err == -E2BIG ? -EINVAL : err;
}

/*
 * Store @val in the VM-wide copy of the ID register encoded by @reg.
 *
 * The caller must hold kvm->arch.config_lock. Calling this after the VM
 * has run, or for an encoding that __vm_id_reg() cannot resolve, trips
 * KVM_BUG_ON() and leaves the stored value untouched.
 */
void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
        u64 *slot = __vm_id_reg(&kvm->arch, reg);

        lockdep_assert_held(&kvm->arch.config_lock);

        if (!KVM_BUG_ON(kvm_vm_has_ran_once(kvm) || !slot, kvm))
                *slot = val;
}

/*
 * Userspace read accessor for a read-as-zero register: always reports 0
 * in *val and returns 0. @vcpu and @rd are unused.
 */
static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                       u64 *val)
{
        *val = 0;
        return 0;
}

/*
 * Userspace write accessor for a write-ignored register: the value is
 * discarded and 0 (success) is returned. All arguments are unused.
 */
static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                      u64 val)
{
        return 0;
}

/*
 * Trap handler for CTR_EL0: reads return the VM-wide CTR_EL0 value,
 * writes are handed to write_to_read_only().
 */
static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                       const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = kvm_read_vm_id_reg(vcpu->kvm, SYS_CTR_EL0);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Trap handler for the register described by @r (CLIDR_EL1, judging by
 * the name): reads return the per-vcpu shadow value, writes are handed
 * to write_to_read_only().
 */
static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                         const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, r->reg);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary
 * by the physical CPU which the vcpu currently resides in.
 *
 * The fabricated hierarchy is derived from the IDC and DIC bits of the
 * sanitised CTR_EL0, plus a tag cache type when the VM has MTE. The result
 * is stored in the vcpu's shadow copy of r->reg and returned.
 */
static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0);
        u64 clidr;
        u8 loc;        /* cache level reported as the Level of Coherence */

        if ((ctr_el0 & CTR_EL0_IDC)) {
                /*
                 * Data cache clean to the PoU is not required so LoUU and LoUIS
                 * will not be set and a unified cache, which will be marked as
                 * LoC, will be added.
                 *
                 * If not DIC, let the unified cache L2 so that an instruction
                 * cache can be added as L1 later.
                 */
                loc = (ctr_el0 & CTR_EL0_DIC) ? 1 : 2;
                clidr = CACHE_TYPE_UNIFIED << CLIDR_CTYPE_SHIFT(loc);
        } else {
                /*
                 * Data cache clean to the PoU is required so let L1 have a data
                 * cache and mark it as LoUU and LoUIS. As L1 has a data cache,
                 * it can be marked as LoC too.
                 */
                loc = 1;
                clidr = 1 << CLIDR_LOUU_SHIFT;
                clidr |= 1 << CLIDR_LOUIS_SHIFT;
                clidr |= CACHE_TYPE_DATA << CLIDR_CTYPE_SHIFT(1);
        }

        /*
         * Instruction cache invalidation to the PoU is required so let L1 have
         * an instruction cache. If L1 already has a data cache, it will be
         * CACHE_TYPE_SEPARATE.
         */
        if (!(ctr_el0 & CTR_EL0_DIC))
                clidr |= CACHE_TYPE_INST << CLIDR_CTYPE_SHIFT(1);

        /* Record the level chosen above as the Level of Coherence. */
        clidr |= loc << CLIDR_LOC_SHIFT;

        /*
         * Add tag cache unified to data cache. Allocation tags and data are
         * unified in a cache line so that it looks valid even if there is only
         * one cache line.
         */
        if (kvm_has_mte(vcpu->kvm))
                clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc);

        __vcpu_sys_reg(vcpu, r->reg) = clidr;

        return __vcpu_sys_reg(vcpu, r->reg);
}

/*
 * Userspace write accessor for the register described by @rd (CLIDR_EL1,
 * judging by the name).
 *
 * Returns -EINVAL if any RES0 bit is set, or if the value describes a
 * hierarchy with no LoC, or with neither LoUIS nor LoUU, while the
 * sanitised CTR_EL0 does not have IDC set. Otherwise the value is stored
 * in the vcpu's shadow register and 0 is returned.
 */
static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                      u64 val)
{
        u64 host_ctr = read_sanitised_ftr_reg(SYS_CTR_EL0);
        bool needs_idc;

        if (val & CLIDR_EL1_RES0)
                return -EINVAL;

        needs_idc = !CLIDR_LOC(val) ||
                    (!CLIDR_LOUIS(val) && !CLIDR_LOUU(val));
        if (needs_idc && !(host_ctr & CTR_EL0_IDC))
                return -EINVAL;

        __vcpu_sys_reg(vcpu, rd->reg) = val;

        return 0;
}

/*
 * Trap handler for the register described by @r: a plain read/write of
 * the corresponding vcpu system register. Always returns true.
 */
static bool access_csselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = vcpu_read_sys_reg(vcpu, r->reg);
                return true;
        }

        vcpu_write_sys_reg(vcpu, p->regval, r->reg);
        return true;
}

/*
 * Trap handler for CCSIDR reads. Writes are handed to
 * write_to_read_only(). On a read, the cache is selected by the Level
 * and InD fields of the vcpu's CSSELR_EL1; when that selector is not
 * below CSSELR_MAX, p->regval is left as it was.
 */
static bool access_ccsidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        u32 sel;

        if (p->is_write)
                return write_to_read_only(vcpu, p, r);

        sel = vcpu_read_sys_reg(vcpu, CSSELR_EL1);
        sel &= CSSELR_EL1_Level | CSSELR_EL1_InD;

        if (sel >= CSSELR_MAX)
                return true;

        p->regval = get_ccsidr(vcpu, sel);
        return true;
}

/*
 * Visibility callback: the register is visible (0) when the VM has MTE,
 * and REG_HIDDEN otherwise.
 */
static unsigned int mte_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd)
{
        return kvm_has_mte(vcpu->kvm) ? 0 : REG_HIDDEN;
}

/*
 * sys_reg_desc initialiser for an MTE register: accesses go to
 * undef_access, reset uses reset_unknown, and the register is only
 * visible when mte_visibility() says so.
 */
#define MTE_REG(name) {                                \
        SYS_DESC(SYS_##name),                        \
        .access = undef_access,                        \
        .reset = reset_unknown,                        \
        .reg = name,                                \
        .visibility = mte_visibility,                \
}

/*
 * Visibility callback for EL2 registers: visible (0) when the vcpu has
 * nested virtualisation, REG_HIDDEN otherwise.
 */
static unsigned int el2_visibility(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd)
{
        return vcpu_has_nv(vcpu) ? 0 : REG_HIDDEN;
}

/*
 * Access handler for registers that are expected to be backed by the
 * VNCR page and nothing else. Getting here most likely means that a trap
 * has been misconfigured, so report it through bad_trap().
 */
static bool bad_vncr_trap(struct kvm_vcpu *vcpu,
                          struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        return bad_trap(vcpu, p, r, "trap of VNCR-backed register");
}

/*
 * Access handler for EL2 registers that are expected to be redirected to
 * their EL1 counterpart and nothing else. Getting here most likely means
 * that a trap has been misconfigured, so report it through bad_trap().
 */
static bool bad_redir_trap(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        return bad_trap(vcpu, p, r, "trap of EL2 register redirected to EL1");
}

/*
 * sys_reg_desc initialiser for an EL2 register.
 *   name:   register name, used both for SYS_##name and as the .reg index
 *   acc:    .access handler
 *   rst:    .reset handler
 *   v:      stored in .val
 *   filter: .visibility callback
 */
#define EL2_REG_FILTERED(name, acc, rst, v, filter) {        \
        SYS_DESC(SYS_##name),                        \
        .access = acc,                                \
        .reset = rst,                                \
        .reg = name,                                \
        .visibility = filter,                        \
        .val = v,                                \
}

/* EL2 register using the default el2_visibility() filter. */
#define EL2_REG(name, acc, rst, v)                        \
        EL2_REG_FILTERED(name, acc, rst, v, el2_visibility)

/* EL2 registers whose access handler reports an unexpected trap. */
#define EL2_REG_VNCR(name, rst, v)        EL2_REG(name, bad_vncr_trap, rst, v)
#define EL2_REG_REDIR(name, rst, v)        EL2_REG(name, bad_redir_trap, rst, v)

/*
 * Since reset() callback and field val are not used for idregs, they will be
 * used for specific purposes for idregs.
 * The reset() would return KVM sanitised register value. The value would be the
 * same as the host kernel sanitised value if there is no KVM sanitisation.
 * The val would be used as a mask indicating writable fields for the idreg.
 * Only bits with 1 are writable from userspace. This mask might not be
 * necessary in the future whenever all ID registers are enabled as writable
 * from userspace.
 */

/*
 * Callbacks shared by the ID register initialisers below. A macro that
 * lists one of these fields again after ID_DESC_DEFAULT_CALLBACKS
 * replaces the default given here.
 */
#define ID_DESC_DEFAULT_CALLBACKS                \
        .access        = access_id_reg,                \
        .get_user = get_id_reg,                        \
        .set_user = set_id_reg,                        \
        .visibility = id_visibility,                \
        .reset = kvm_read_sanitised_id_reg

/* Encoding of SYS_##name followed by the default ID register callbacks. */
#define ID_DESC(name)                                \
        SYS_DESC(SYS_##name),                        \
        ID_DESC_DEFAULT_CALLBACKS

/*
 * sys_reg_desc initialiser for known cpufeature ID registers. The
 * writable mask (.val) is 0.
 */
#define ID_SANITISED(name) {                        \
        ID_DESC(name),                                \
        .val = 0,                                \
}

/*
 * sys_reg_desc initialiser for known cpufeature ID registers that use
 * aa32_id_visibility instead of the default id_visibility (the later
 * .visibility initialiser takes precedence over the one from ID_DESC()).
 * The writable mask (.val) is 0.
 */
#define AA32_ID_SANITISED(name) {                \
        ID_DESC(name),                                \
        .visibility = aa32_id_visibility,        \
        .val = 0,                                \
}

/*
 * sys_reg_desc initialiser for writable ID registers; @mask becomes the
 * writable-field mask stored in .val.
 */
#define ID_WRITABLE(name, mask) {                \
        ID_DESC(name),                                \
        .val = mask,                                \
}

/*
 * sys_reg_desc initialiser for cpufeature ID registers that need filtering:
 * userspace writes go through set_##name instead of the default
 * set_id_reg (the later .set_user initialiser takes precedence), and
 * @mask becomes the writable-field mask stored in .val.
 */
#define ID_FILTERED(sysreg, name, mask) {        \
        ID_DESC(sysreg),                                \
        .set_user = set_##name,                                \
        .val = (mask),                                        \
}

/*
 * sys_reg_desc initialiser for architecturally unallocated cpufeature ID
 * register with encoding Op0=3, Op1=0, CRn=0, CRm=crm, Op2=op2
 * (1 <= crm < 8, 0 <= Op2 < 8).
 *
 * The name is synthesised from the encoding, and raz_visibility replaces
 * the default visibility callback.
 */
#define ID_UNALLOCATED(crm, op2) {                        \
        .name = "S3_0_0_" #crm "_" #op2,                \
        Op0(3), Op1(0), CRn(0), CRm(crm), Op2(op2),        \
        ID_DESC_DEFAULT_CALLBACKS,                        \
        .visibility = raz_visibility,                        \
        .val = 0,                                        \
}

/*
 * sys_reg_desc initialiser for known ID registers that we hide from guests.
 * For now, these are exposed just like unallocated ID regs: they appear
 * RAZ for the guest (raz_visibility replaces the default visibility
 * callback, and the writable mask is 0).
 */
#define ID_HIDDEN(name) {                        \
        ID_DESC(name),                                \
        .visibility = raz_visibility,                \
        .val = 0,                                \
}

/*
 * Access handler backed by the vcpu's SP_EL1 shadow register: reads
 * return it, writes update it. Always returns true.
 */
static bool access_sp_el1(struct kvm_vcpu *vcpu,
                          struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, SP_EL1);
                return true;
        }

        __vcpu_sys_reg(vcpu, SP_EL1) = p->regval;
        return true;
}

/*
 * Access handler backed by ELR_EL1, going through the
 * vcpu_{read,write}_sys_reg() accessors. Always returns true.
 */
static bool access_elr(struct kvm_vcpu *vcpu,
                       struct sys_reg_params *p,
                       const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = vcpu_read_sys_reg(vcpu, ELR_EL1);
                return true;
        }

        vcpu_write_sys_reg(vcpu, p->regval, ELR_EL1);
        return true;
}

/*
 * Access handler backed by the vcpu's SPSR_EL1 shadow register: reads
 * return it, writes update it. Always returns true.
 */
static bool access_spsr(struct kvm_vcpu *vcpu,
                        struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, SPSR_EL1);
                return true;
        }

        __vcpu_sys_reg(vcpu, SPSR_EL1) = p->regval;
        return true;
}

/*
 * Access handler backed by the vcpu's CNTKCTL_EL1 shadow register: reads
 * return it, writes update it. Always returns true.
 */
static bool access_cntkctl_el12(struct kvm_vcpu *vcpu,
                                struct sys_reg_params *p,
                                const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = __vcpu_sys_reg(vcpu, CNTKCTL_EL1);
                return true;
        }

        __vcpu_sys_reg(vcpu, CNTKCTL_EL1) = p->regval;
        return true;
}

/*
 * Reset handler: start from the descriptor's .val, additionally set
 * HCR_E2H when the ARM64_HAS_HCR_NV1 capability is absent, store the
 * result in the vcpu's shadow register and return it.
 */
static u64 reset_hcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 hcr = r->val;

        if (!cpus_have_final_cap(ARM64_HAS_HCR_NV1))
                hcr |= HCR_E2H;

        __vcpu_sys_reg(vcpu, r->reg) = hcr;

        return hcr;
}

/*
 * Combine el2_visibility() with a feature-specific visibility callback:
 * a non-zero result from el2_visibility() is returned as is, otherwise
 * the decision is left to @fn.
 */
static unsigned int __el2_visibility(const struct kvm_vcpu *vcpu,
                                     const struct sys_reg_desc *rd,
                                     unsigned int (*fn)(const struct kvm_vcpu *,
                                                        const struct sys_reg_desc *))
{
        unsigned int el2 = el2_visibility(vcpu, rd);

        if (el2)
                return el2;

        return fn(vcpu, rd);
}

/* Visibility of an EL2 register that additionally depends on sve_visibility(). */
static unsigned int sve_el2_visibility(const struct kvm_vcpu *vcpu,
                                       const struct sys_reg_desc *rd)
{
        return __el2_visibility(vcpu, rd, sve_visibility);
}

/*
 * Visibility callback: the register is visible (0) only when
 * el2_visibility() allows it and the VM passes the kvm_has_feat() check
 * for ID_AA64MMFR4_EL1.NV_frac at the NV2_ONLY level; REG_HIDDEN in all
 * other cases.
 */
static unsigned int vncr_el2_visibility(const struct kvm_vcpu *vcpu,
                                        const struct sys_reg_desc *rd)
{
        if (el2_visibility(vcpu, rd) != 0)
                return REG_HIDDEN;

        if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
                return REG_HIDDEN;

        return 0;
}

/*
 * Trap handler for ZCR_EL2.
 *
 * If the guest hypervisor has SVE traps enabled, a nested SVE trap is
 * injected instead of performing the access. Reads return the stored
 * ZCR_EL2. On writes, only the LEN field is kept, clamped so that the
 * resulting vector quantum count does not exceed vcpu_sve_max_vq().
 * Always returns true.
 */
static bool access_zcr_el2(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        if (guest_hyp_sve_traps_enabled(vcpu)) {
                kvm_inject_nested_sve_trap(vcpu);
                return true;
        }

        if (p->is_write) {
                /* LEN encodes (vq - 1); clamp in vq units, then re-encode. */
                unsigned int req_vq = SYS_FIELD_GET(ZCR_ELx, LEN, p->regval) + 1;

                req_vq = min(req_vq, vcpu_sve_max_vq(vcpu));
                vcpu_write_sys_reg(vcpu, req_vq - 1, ZCR_EL2);
        } else {
                p->regval = vcpu_read_sys_reg(vcpu, ZCR_EL2);
        }

        return true;
}

static bool access_gic_vtr(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        if (p->is_write)
                return write_to_read_only(vcpu, p, r);

        p->regval = kvm_vgic_global_state.ich_vtr_el2;
        p->regval &= ~(ICH_VTR_EL2_DVIM         |
                       ICH_VTR_EL2_A3V                |
                       ICH_VTR_EL2_IDbits);
        p->regval |= ICH_VTR_EL2_nV4;

        return true;
}

/*
 * Read-only trap handler returning vgic_v3_get_misr(); writes are handed
 * to write_to_read_only().
 */
static bool access_gic_misr(struct kvm_vcpu *vcpu,
                            struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = vgic_v3_get_misr(vcpu);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Read-only trap handler returning vgic_v3_get_eisr(); writes are handed
 * to write_to_read_only().
 */
static bool access_gic_eisr(struct kvm_vcpu *vcpu,
                            struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = vgic_v3_get_eisr(vcpu);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Read-only trap handler returning vgic_v3_get_elrsr(); writes are
 * handed to write_to_read_only().
 */
static bool access_gic_elrsr(struct kvm_vcpu *vcpu,
                             struct sys_reg_params *p,
                             const struct sys_reg_desc *r)
{
        if (!p->is_write) {
                p->regval = vgic_v3_get_elrsr(vcpu);
                return true;
        }

        return write_to_read_only(vcpu, p, r);
}

/*
 * Visibility callback: visible (0) when kvm_has_s1poe() holds for the
 * VM, REG_HIDDEN otherwise.
 */
static unsigned int s1poe_visibility(const struct kvm_vcpu *vcpu,
                                     const struct sys_reg_desc *rd)
{
        return kvm_has_s1poe(vcpu->kvm) ? 0 : REG_HIDDEN;
}

/* Visibility of an EL2 register that additionally depends on s1poe_visibility(). */
static unsigned int s1poe_el2_visibility(const struct kvm_vcpu *vcpu,
                                         const struct sys_reg_desc *rd)
{
        return __el2_visibility(vcpu, rd, s1poe_visibility);
}

/*
 * Visibility callback: visible (0) when kvm_has_tcr2() holds for the VM,
 * REG_HIDDEN otherwise.
 */
static unsigned int tcr2_visibility(const struct kvm_vcpu *vcpu,
                                    const struct sys_reg_desc *rd)
{
        return kvm_has_tcr2(vcpu->kvm) ? 0 : REG_HIDDEN;
}

/* Visibility of an EL2 register that additionally depends on tcr2_visibility(). */
static unsigned int tcr2_el2_visibility(const struct kvm_vcpu *vcpu,
                                    const struct sys_reg_desc *rd)
{
        return __el2_visibility(vcpu, rd, tcr2_visibility);
}

/*
 * Visibility callback: visible (0) when kvm_has_s1pie() holds for the
 * VM, REG_HIDDEN otherwise.
 */
static unsigned int s1pie_visibility(const struct kvm_vcpu *vcpu,
                                     const struct sys_reg_desc *rd)
{
        return kvm_has_s1pie(vcpu->kvm) ? 0 : REG_HIDDEN;
}

/* Visibility of an EL2 register that additionally depends on s1pie_visibility(). */
static unsigned int s1pie_el2_visibility(const struct kvm_vcpu *vcpu,
                                         const struct sys_reg_desc *rd)
{
        return __el2_visibility(vcpu, rd, s1pie_visibility);
}

/*
 * Trap handler for guest accesses to MDCR_EL2.
 *
 * Reads return the stored value. Writes store the new value after
 * clamping the HPMN field to the number of PMU counters of the VM, and
 * request a PMU reload when the HPME bit changes. Always returns true.
 */
static bool access_mdcr(struct kvm_vcpu *vcpu,
                        struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
{
        u64 hpmn, val, old = __vcpu_sys_reg(vcpu, MDCR_EL2);

        if (!p->is_write) {
                p->regval = old;
                return true;
        }

        val = p->regval;
        hpmn = FIELD_GET(MDCR_EL2_HPMN, val);

        /*
         * If HPMN is out of bounds, limit it to what we actually
         * support. This matches the UNKNOWN definition of the field
         * in that case, and keeps the emulation simple. Sort of.
         *
         * u64_replace_bits() returns the updated value rather than
         * modifying its first argument, so the result must be assigned
         * back for the clamp to take effect.
         */
        if (hpmn > vcpu->kvm->arch.nr_pmu_counters) {
                hpmn = vcpu->kvm->arch.nr_pmu_counters;
                val = u64_replace_bits(val, hpmn, MDCR_EL2_HPMN);
        }

        __vcpu_sys_reg(vcpu, MDCR_EL2) = val;

        /*
         * Request a reload of the PMU to enable/disable the counters
         * affected by HPME.
         */
        if ((old ^ val) & MDCR_EL2_HPME)
                kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);

        return true;
}

/*
 * For historical (ahem ABI) reasons, KVM treated MIDR_EL1, REVIDR_EL1, and
 * AIDR_EL1 as "invariant" registers, meaning userspace cannot change them.
 * The values made visible to userspace were the register values of the boot
 * CPU.
 *
 * At the same time, reads from these registers at EL1 previously were not
 * trapped, allowing the guest to read the actual hardware value. On big-little
 * machines, this means the VM can see different values depending on where a
 * given vCPU got scheduled.
 *
 * These registers are now trapped as collateral damage from SME, and what
 * follows attempts to give a user / guest view consistent with the existing
 * ABI.
 */
static bool access_imp_id_reg(struct kvm_vcpu *vcpu,
                              struct sys_reg_params *p,
                              const struct sys_reg_desc *r)
{
        if (p->is_write)
                return write_to_read_only(vcpu, p, r);

        /*
         * Return the VM-scoped implementation ID register values if userspace
         * has made them writable.
         */
        if (test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &vcpu->kvm->arch.flags))
                return access_id_reg(vcpu, p, r);

        /*
         * Otherwise, fall back to the old behavior of returning the value of
         * the current CPU.
         */
        switch (reg_to_encoding(r)) {
        case SYS_REVIDR_EL1:
                p->regval = read_sysreg(revidr_el1);
                break;
        case SYS_AIDR_EL1:
                p->regval = read_sysreg(aidr_el1);
                break;
        default:
                WARN_ON_ONCE(1);
        }

        return true;
}

/*
 * Snapshots of MIDR_EL1, REVIDR_EL1 and AIDR_EL1 taken by
 * init_imp_id_regs() and handed out by reset_imp_id_reg().
 */
static u64 __ro_after_init boot_cpu_midr_val;
static u64 __ro_after_init boot_cpu_revidr_val;
static u64 __ro_after_init boot_cpu_aidr_val;

/*
 * Record the implementation ID registers of the CPU this runs on
 * (presumably the boot CPU, going by the variable names; the caller is
 * not visible here).
 */
static void init_imp_id_regs(void)
{
        boot_cpu_midr_val = read_sysreg(midr_el1);
        boot_cpu_revidr_val = read_sysreg(revidr_el1);
        boot_cpu_aidr_val = read_sysreg(aidr_el1);
}

/*
 * Reset handler for the implementation ID registers: returns the recorded
 * boot_cpu_* value matching the descriptor's encoding. Any other encoding
 * trips KVM_BUG_ON() and yields 0.
 */
static u64 reset_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 val = 0;

        switch (reg_to_encoding(r)) {
        case SYS_MIDR_EL1:
                val = boot_cpu_midr_val;
                break;
        case SYS_REVIDR_EL1:
                val = boot_cpu_revidr_val;
                break;
        case SYS_AIDR_EL1:
                val = boot_cpu_aidr_val;
                break;
        default:
                KVM_BUG_ON(1, vcpu->kvm);
                break;
        }

        return val;
}

/*
 * Userspace write accessor for the implementation ID registers. Runs
 * under kvm->arch.config_lock.
 *
 * Returns 0 when the value matches the current one or has been stored,
 * -EINVAL when the registers have not been made writable or the value
 * has bits outside the descriptor's writable mask, and -EBUSY when the
 * VM has already run.
 */
static int set_imp_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 val)
{
        struct kvm *kvm = vcpu->kvm;

        guard(mutex)(&kvm->arch.config_lock);

        /* Writing back the current value is always fine. */
        if (read_id_reg(vcpu, r) == val)
                return 0;

        if (!test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags))
                return -EINVAL;

        /*
         * The ID registers become immutable once the VM has started, so an
         * attempt to change the value at that point is refused.
         */
        if (kvm_vm_has_ran_once(kvm))
                return -EBUSY;

        /*
         * The implementation ID registers accept any value, provided that
         * it does not stray outside the writable mask.
         */
        if (val & ~r->val)
                return -EINVAL;

        kvm_set_vm_id_reg(kvm, reg_to_encoding(r), val);
        return 0;
}

/*
 * sys_reg_desc initialiser for an implementation ID register; @mask is
 * the writable mask checked by set_imp_id_reg().
 */
#define IMPLEMENTATION_ID(reg, mask) {                        \
        SYS_DESC(SYS_##reg),                                \
        .access = access_imp_id_reg,                        \
        .get_user = get_id_reg,                                \
        .set_user = set_imp_id_reg,                        \
        .reset = reset_imp_id_reg,                        \
        .val = mask,                                        \
        }

/*
 * Reset handler: initialise the register to the VM's number of PMU
 * counters and return that value.
 */
static u64 reset_mdcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        u64 nr_counters = vcpu->kvm->arch.nr_pmu_counters;

        __vcpu_sys_reg(vcpu, r->reg) = nr_counters;

        return nr_counters;
}

/*
 * Architected system registers.
 * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
 *
 * Debug handling: We do trap most, if not all debug related system
 * registers. The implementation is good enough to ensure that a guest
 * can use these with minimal performance degradation. The drawback is
 * that we don't implement any of the external debug architecture.
 * This should be revisited if we ever encounter a more demanding
 * guest...
 *
 * Entries written positionally after SYS_DESC() appear to follow the
 * order: access handler, reset handler, backing register, reset value
 * (compare with the entries below that spell out .access, .reset, .reg
 * and .val) -- TODO confirm against struct sys_reg_desc.
 */
static const struct sys_reg_desc sys_reg_descs[] = {
        DBG_BCR_BVR_WCR_WVR_EL1(0),
        DBG_BCR_BVR_WCR_WVR_EL1(1),
        { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 },
        { SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 },
        DBG_BCR_BVR_WCR_WVR_EL1(2),
        DBG_BCR_BVR_WCR_WVR_EL1(3),
        DBG_BCR_BVR_WCR_WVR_EL1(4),
        DBG_BCR_BVR_WCR_WVR_EL1(5),
        DBG_BCR_BVR_WCR_WVR_EL1(6),
        DBG_BCR_BVR_WCR_WVR_EL1(7),
        DBG_BCR_BVR_WCR_WVR_EL1(8),
        DBG_BCR_BVR_WCR_WVR_EL1(9),
        DBG_BCR_BVR_WCR_WVR_EL1(10),
        DBG_BCR_BVR_WCR_WVR_EL1(11),
        DBG_BCR_BVR_WCR_WVR_EL1(12),
        DBG_BCR_BVR_WCR_WVR_EL1(13),
        DBG_BCR_BVR_WCR_WVR_EL1(14),
        DBG_BCR_BVR_WCR_WVR_EL1(15),

        { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_OSLAR_EL1), trap_oslar_el1 },
        { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1, reset_val, OSLSR_EL1,
                OSLSR_EL1_OSLM_IMPLEMENTED, .set_user = set_oslsr_el1, },
        { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi },
        { SYS_DESC(SYS_DBGCLAIMCLR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_DBGAUTHSTATUS_EL1), trap_dbgauthstatus_el1 },

        { SYS_DESC(SYS_MDCCSR_EL0), trap_raz_wi },
        { SYS_DESC(SYS_DBGDTR_EL0), trap_raz_wi },
        // DBGDTR[TR]X_EL0 share the same encoding
        { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi },

        { SYS_DESC(SYS_DBGVCR32_EL2), undef_access, reset_val, DBGVCR32_EL2, 0 },

        /* Implementation ID registers; the second argument is the writable mask */
        IMPLEMENTATION_ID(MIDR_EL1, GENMASK_ULL(31, 0)),
        { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 },
        IMPLEMENTATION_ID(REVIDR_EL1, GENMASK_ULL(63, 0)),

        /*
         * ID regs: all ID_SANITISED() entries here must have corresponding
         * entries in arm64_ftr_regs[].
         */

        /* AArch64 mappings of the AArch32 ID registers */
        /* CRm=1 */
        AA32_ID_SANITISED(ID_PFR0_EL1),
        AA32_ID_SANITISED(ID_PFR1_EL1),
        { SYS_DESC(SYS_ID_DFR0_EL1),
          .access = access_id_reg,
          .get_user = get_id_reg,
          .set_user = set_id_dfr0_el1,
          .visibility = aa32_id_visibility,
          .reset = read_sanitised_id_dfr0_el1,
          .val = ID_DFR0_EL1_PerfMon_MASK |
                 ID_DFR0_EL1_CopDbg_MASK, },
        ID_HIDDEN(ID_AFR0_EL1),
        AA32_ID_SANITISED(ID_MMFR0_EL1),
        AA32_ID_SANITISED(ID_MMFR1_EL1),
        AA32_ID_SANITISED(ID_MMFR2_EL1),
        AA32_ID_SANITISED(ID_MMFR3_EL1),

        /* CRm=2 */
        AA32_ID_SANITISED(ID_ISAR0_EL1),
        AA32_ID_SANITISED(ID_ISAR1_EL1),
        AA32_ID_SANITISED(ID_ISAR2_EL1),
        AA32_ID_SANITISED(ID_ISAR3_EL1),
        AA32_ID_SANITISED(ID_ISAR4_EL1),
        AA32_ID_SANITISED(ID_ISAR5_EL1),
        AA32_ID_SANITISED(ID_MMFR4_EL1),
        AA32_ID_SANITISED(ID_ISAR6_EL1),

        /* CRm=3 */
        AA32_ID_SANITISED(MVFR0_EL1),
        AA32_ID_SANITISED(MVFR1_EL1),
        AA32_ID_SANITISED(MVFR2_EL1),
        ID_UNALLOCATED(3,3),
        AA32_ID_SANITISED(ID_PFR2_EL1),
        ID_HIDDEN(ID_DFR1_EL1),
        AA32_ID_SANITISED(ID_MMFR5_EL1),
        ID_UNALLOCATED(3,7),

        /* AArch64 ID registers */
        /* CRm=4 */
        ID_FILTERED(ID_AA64PFR0_EL1, id_aa64pfr0_el1,
                    ~(ID_AA64PFR0_EL1_AMU |
                      ID_AA64PFR0_EL1_MPAM |
                      ID_AA64PFR0_EL1_SVE |
                      ID_AA64PFR0_EL1_RAS |
                      ID_AA64PFR0_EL1_AdvSIMD |
                      ID_AA64PFR0_EL1_FP)),
        ID_FILTERED(ID_AA64PFR1_EL1, id_aa64pfr1_el1,
                                     ~(ID_AA64PFR1_EL1_PFAR |
                                       ID_AA64PFR1_EL1_DF2 |
                                       ID_AA64PFR1_EL1_MTEX |
                                       ID_AA64PFR1_EL1_THE |
                                       ID_AA64PFR1_EL1_GCS |
                                       ID_AA64PFR1_EL1_MTE_frac |
                                       ID_AA64PFR1_EL1_NMI |
                                       ID_AA64PFR1_EL1_RNDR_trap |
                                       ID_AA64PFR1_EL1_SME |
                                       ID_AA64PFR1_EL1_RES0 |
                                       ID_AA64PFR1_EL1_MPAM_frac |
                                       ID_AA64PFR1_EL1_RAS_frac |
                                       ID_AA64PFR1_EL1_MTE)),
        ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR),
        ID_UNALLOCATED(4,3),
        ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
        ID_HIDDEN(ID_AA64SMFR0_EL1),
        ID_UNALLOCATED(4,6),
        ID_WRITABLE(ID_AA64FPFR0_EL1, ~ID_AA64FPFR0_EL1_RES0),

        /* CRm=5 */
        /*
         * Prior to FEAT_Debugv8.9, the architecture defines context-aware
         * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs).
         * KVM does not trap + emulate the breakpoint registers, and as such
         * cannot support a layout that misaligns with the underlying hardware.
         * While it may be possible to describe a subset that aligns with
         * hardware, just prevent changes to BRPs and CTX_CMPs altogether for
         * simplicity.
         *
         * See DDI0487K.a, section D2.8.3 Breakpoint types and linking
         * of breakpoints for more details.
         */
        ID_FILTERED(ID_AA64DFR0_EL1, id_aa64dfr0_el1,
                    ID_AA64DFR0_EL1_DoubleLock_MASK |
                    ID_AA64DFR0_EL1_WRPs_MASK |
                    ID_AA64DFR0_EL1_PMUVer_MASK |
                    ID_AA64DFR0_EL1_DebugVer_MASK),
        ID_SANITISED(ID_AA64DFR1_EL1),
        ID_UNALLOCATED(5,2),
        ID_UNALLOCATED(5,3),
        ID_HIDDEN(ID_AA64AFR0_EL1),
        ID_HIDDEN(ID_AA64AFR1_EL1),
        ID_UNALLOCATED(5,6),
        ID_UNALLOCATED(5,7),

        /* CRm=6 */
        ID_WRITABLE(ID_AA64ISAR0_EL1, ~ID_AA64ISAR0_EL1_RES0),
        ID_WRITABLE(ID_AA64ISAR1_EL1, ~(ID_AA64ISAR1_EL1_GPI |
                                        ID_AA64ISAR1_EL1_GPA |
                                        ID_AA64ISAR1_EL1_API |
                                        ID_AA64ISAR1_EL1_APA)),
        ID_WRITABLE(ID_AA64ISAR2_EL1, ~(ID_AA64ISAR2_EL1_RES0 |
                                        ID_AA64ISAR2_EL1_APA3 |
                                        ID_AA64ISAR2_EL1_GPA3)),
        ID_WRITABLE(ID_AA64ISAR3_EL1, (ID_AA64ISAR3_EL1_FPRCVT |
                                       ID_AA64ISAR3_EL1_FAMINMAX)),
        ID_UNALLOCATED(6,4),
        ID_UNALLOCATED(6,5),
        ID_UNALLOCATED(6,6),
        ID_UNALLOCATED(6,7),

        /* CRm=7 */
        ID_FILTERED(ID_AA64MMFR0_EL1, id_aa64mmfr0_el1,
                                      ~(ID_AA64MMFR0_EL1_RES0 |
                                        ID_AA64MMFR0_EL1_ASIDBITS)),
        ID_WRITABLE(ID_AA64MMFR1_EL1, ~(ID_AA64MMFR1_EL1_RES0 |
                                        ID_AA64MMFR1_EL1_HCX |
                                        ID_AA64MMFR1_EL1_TWED |
                                        ID_AA64MMFR1_EL1_XNX |
                                        ID_AA64MMFR1_EL1_VH |
                                        ID_AA64MMFR1_EL1_VMIDBits)),
        ID_FILTERED(ID_AA64MMFR2_EL1,
                    id_aa64mmfr2_el1, ~(ID_AA64MMFR2_EL1_RES0 |
                                        ID_AA64MMFR2_EL1_EVT |
                                        ID_AA64MMFR2_EL1_FWB |
                                        ID_AA64MMFR2_EL1_IDS |
                                        ID_AA64MMFR2_EL1_NV |
                                        ID_AA64MMFR2_EL1_CCIDX)),
        ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX        |
                                       ID_AA64MMFR3_EL1_S1PIE   |
                                       ID_AA64MMFR3_EL1_S1POE)),
        ID_WRITABLE(ID_AA64MMFR4_EL1, ID_AA64MMFR4_EL1_NV_frac),
        ID_UNALLOCATED(7,5),
        ID_UNALLOCATED(7,6),
        ID_UNALLOCATED(7,7),

        { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 },
        { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 },
        { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 },

        MTE_REG(RGSR_EL1),
        MTE_REG(GCR_EL1),

        { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility },
        { SYS_DESC(SYS_TRFCR_EL1), undef_access },
        { SYS_DESC(SYS_SMPRI_EL1), undef_access },
        { SYS_DESC(SYS_SMCR_EL1), undef_access },
        { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 },
        { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 },
        { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 },
        { SYS_DESC(SYS_TCR2_EL1), access_vm_reg, reset_val, TCR2_EL1, 0,
          .visibility = tcr2_visibility },

        PTRAUTH_KEY(APIA),
        PTRAUTH_KEY(APIB),
        PTRAUTH_KEY(APDA),
        PTRAUTH_KEY(APDB),
        PTRAUTH_KEY(APGA),

        { SYS_DESC(SYS_SPSR_EL1), access_spsr},
        { SYS_DESC(SYS_ELR_EL1), access_elr},

        { SYS_DESC(SYS_ICC_PMR_EL1), undef_access },

        { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 },
        { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 },
        { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 },

        { SYS_DESC(SYS_ERRIDR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERRSELR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERXFR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERXCTLR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERXSTATUS_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERXADDR_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERXMISC0_EL1), trap_raz_wi },
        { SYS_DESC(SYS_ERXMISC1_EL1), trap_raz_wi },

        MTE_REG(TFSR_EL1),
        MTE_REG(TFSRE0_EL1),

        { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 },
        { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 },

        { SYS_DESC(SYS_PMSCR_EL1), undef_access },
        { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access },
        { SYS_DESC(SYS_PMSICR_EL1), undef_access },
        { SYS_DESC(SYS_PMSIRR_EL1), undef_access },
        { SYS_DESC(SYS_PMSFCR_EL1), undef_access },
        { SYS_DESC(SYS_PMSEVFR_EL1), undef_access },
        { SYS_DESC(SYS_PMSLATFR_EL1), undef_access },
        { SYS_DESC(SYS_PMSIDR_EL1), undef_access },
        { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access },
        { SYS_DESC(SYS_PMBPTR_EL1), undef_access },
        { SYS_DESC(SYS_PMBSR_EL1), undef_access },
        /* PMBIDR_EL1 is not trapped */

        /* The SET and CLR encodings are both backed by PMINTENSET_EL1 */
        { PMU_SYS_REG(PMINTENSET_EL1),
          .access = access_pminten, .reg = PMINTENSET_EL1,
          .get_user = get_pmreg, .set_user = set_pmreg },
        { PMU_SYS_REG(PMINTENCLR_EL1),
          .access = access_pminten, .reg = PMINTENSET_EL1,
          .get_user = get_pmreg, .set_user = set_pmreg },
        { SYS_DESC(SYS_PMMIR_EL1), trap_raz_wi },

        { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 },
        { SYS_DESC(SYS_PIRE0_EL1), NULL, reset_unknown, PIRE0_EL1,
          .visibility = s1pie_visibility },
        { SYS_DESC(SYS_PIR_EL1), NULL, reset_unknown, PIR_EL1,
          .visibility = s1pie_visibility },
        { SYS_DESC(SYS_POR_EL1), NULL, reset_unknown, POR_EL1,
          .visibility = s1poe_visibility },
        { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 },

        { SYS_DESC(SYS_LORSA_EL1), trap_loregion },
        { SYS_DESC(SYS_LOREA_EL1), trap_loregion },
        { SYS_DESC(SYS_LORN_EL1), trap_loregion },
        { SYS_DESC(SYS_LORC_EL1), trap_loregion },
        { SYS_DESC(SYS_MPAMIDR_EL1), undef_access },
        { SYS_DESC(SYS_LORID_EL1), trap_loregion },

        { SYS_DESC(SYS_MPAM1_EL1), undef_access },
        { SYS_DESC(SYS_MPAM0_EL1), undef_access },
        { SYS_DESC(SYS_VBAR_EL1), access_rw, reset_val, VBAR_EL1, 0 },
        { SYS_DESC(SYS_DISR_EL1), NULL, reset_val, DISR_EL1, 0 },

        { SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
        { SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
        { SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
        { SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
        { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
        { SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
        { SYS_DESC(SYS_ICC_SGI0R_EL1), access_gic_sgi },
        { SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
        { SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
        { SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
        { SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
        { SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
        { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
        { SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
        { SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },

        { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 },
        { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 },

        { SYS_DESC(SYS_ACCDATA_EL1), undef_access },

        { SYS_DESC(SYS_SCXTNUM_EL1), undef_access },

        { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0},

        { SYS_DESC(SYS_CCSIDR_EL1), access_ccsidr },
        { SYS_DESC(SYS_CLIDR_EL1), access_clidr, reset_clidr, CLIDR_EL1,
          .set_user = set_clidr, .val = ~CLIDR_EL1_RES0 },
        { SYS_DESC(SYS_CCSIDR2_EL1), undef_access },
        { SYS_DESC(SYS_SMIDR_EL1), undef_access },
        IMPLEMENTATION_ID(AIDR_EL1, GENMASK_ULL(63, 0)),
        { SYS_DESC(SYS_CSSELR_EL1), access_csselr, reset_unknown, CSSELR_EL1 },
        ID_FILTERED(CTR_EL0, ctr_el0,
                    CTR_EL0_DIC_MASK |
                    CTR_EL0_IDC_MASK |
                    CTR_EL0_DminLine_MASK |
                    CTR_EL0_L1Ip_MASK |
                    CTR_EL0_IminLine_MASK),
        { SYS_DESC(SYS_SVCR), undef_access, reset_val, SVCR, 0, .visibility = sme_visibility  },
        { SYS_DESC(SYS_FPMR), undef_access, reset_val, FPMR, 0, .visibility = fp8_visibility },

        { PMU_SYS_REG(PMCR_EL0), .access = access_pmcr, .reset = reset_pmcr,
          .reg = PMCR_EL0, .get_user = get_pmcr, .set_user = set_pmcr },
        /* The SET and CLR encodings are both backed by PMCNTENSET_EL0 */
        { PMU_SYS_REG(PMCNTENSET_EL0),
          .access = access_pmcnten, .reg = PMCNTENSET_EL0,
          .get_user = get_pmreg, .set_user = set_pmreg },
        { PMU_SYS_REG(PMCNTENCLR_EL0),
          .access = access_pmcnten, .reg = PMCNTENSET_EL0,
          .get_user = get_pmreg, .set_user = set_pmreg },
        /* PMOVSCLR_EL0 is backed by PMOVSSET_EL0, like PMOVSSET_EL0 below */
        { PMU_SYS_REG(PMOVSCLR_EL0),
          .access = access_pmovs, .reg = PMOVSSET_EL0,
          .get_user = get_pmreg, .set_user = set_pmreg },
        /*
         * PMSWINC_EL0 is exposed to userspace as RAZ/WI, as it was
         * (pointlessly) advertised in the past...
         */
        { PMU_SYS_REG(PMSWINC_EL0),
          .get_user = get_raz_reg, .set_user = set_wi_reg,
          .access = access_pmswinc, .reset = NULL },
        { PMU_SYS_REG(PMSELR_EL0),
          .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 },
        { PMU_SYS_REG(PMCEID0_EL0),
          .access = access_pmceid, .reset = NULL },
        { PMU_SYS_REG(PMCEID1_EL0),
          .access = access_pmceid, .reset = NULL },
        { PMU_SYS_REG(PMCCNTR_EL0),
          .access = access_pmu_evcntr, .reset = reset_unknown,
          .reg = PMCCNTR_EL0, .get_user = get_pmu_evcntr,
          .set_user = set_pmu_evcntr },
        { PMU_SYS_REG(PMXEVTYPER_EL0),
          .access = access_pmu_evtyper, .reset = NULL },
        { PMU_SYS_REG(PMXEVCNTR_EL0),
          .access = access_pmu_evcntr, .reset = NULL },
        /*
         * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero
         * in 32bit mode. Here we choose to reset it as zero for consistency.
         */
        { PMU_SYS_REG(PMUSERENR_EL0), .access = access_pmuserenr,
          .reset = reset_val, .reg = PMUSERENR_EL0, .val = 0 },
        { PMU_SYS_REG(PMOVSSET_EL0),
          .access = access_pmovs, .reg = PMOVSSET_EL0,
          .get_user = get_pmreg, .set_user = set_pmreg },

        { SYS_DESC(SYS_POR_EL0), NULL, reset_unknown, POR_EL0,
          .visibility = s1poe_visibility },
        { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 },
        { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 },
        { SYS_DESC(SYS_TPIDR2_EL0), undef_access },

        { SYS_DESC(SYS_SCXTNUM_EL0), undef_access },

        /* AM* registers: the ones spelled out here all use undef_access */
        { SYS_DESC(SYS_AMCR_EL0), undef_access },
        { SYS_DESC(SYS_AMCFGR_EL0), undef_access },
        { SYS_DESC(SYS_AMCGCR_EL0), undef_access },
        { SYS_DESC(SYS_AMUSERENR_EL0), undef_access },
        { SYS_DESC(SYS_AMCNTENCLR0_EL0), undef_access },
        { SYS_DESC(SYS_AMCNTENSET0_EL0), undef_access },
        { SYS_DESC(SYS_AMCNTENCLR1_EL0), undef_access },
        { SYS_DESC(SYS_AMCNTENSET1_EL0), undef_access },
        AMU_AMEVCNTR0_EL0(0),
        AMU_AMEVCNTR0_EL0(1),
        AMU_AMEVCNTR0_EL0(2),
        AMU_AMEVCNTR0_EL0(3),
        AMU_AMEVCNTR0_EL0(4),
        AMU_AMEVCNTR0_EL0(5),
        AMU_AMEVCNTR0_EL0(6),
        AMU_AMEVCNTR0_EL0(7),
        AMU_AMEVCNTR0_EL0(8),
        AMU_AMEVCNTR0_EL0(9),
        AMU_AMEVCNTR0_EL0(10),
        AMU_AMEVCNTR0_EL0(11),
        AMU_AMEVCNTR0_EL0(12),
        AMU_AMEVCNTR0_EL0(13),
        AMU_AMEVCNTR0_EL0(14),
        AMU_AMEVCNTR0_EL0(15),
        AMU_AMEVTYPER0_EL0(0),
        AMU_AMEVTYPER0_EL0(1),
        AMU_AMEVTYPER0_EL0(2),
        AMU_AMEVTYPER0_EL0(3),
        AMU_AMEVTYPER0_EL0(4),
        AMU_AMEVTYPER0_EL0(5),
        AMU_AMEVTYPER0_EL0(6),
        AMU_AMEVTYPER0_EL0(7),
        AMU_AMEVTYPER0_EL0(8),
        AMU_AMEVTYPER0_EL0(9),
        AMU_AMEVTYPER0_EL0(10),
        AMU_AMEVTYPER0_EL0(11),
        AMU_AMEVTYPER0_EL0(12),
        AMU_AMEVTYPER0_EL0(13),
        AMU_AMEVTYPER0_EL0(14),
        AMU_AMEVTYPER0_EL0(15),
        AMU_AMEVCNTR1_EL0(0),
        AMU_AMEVCNTR1_EL0(1),
        AMU_AMEVCNTR1_EL0(2),
        AMU_AMEVCNTR1_EL0(3),
        AMU_AMEVCNTR1_EL0(4),
        AMU_AMEVCNTR1_EL0(5),
        AMU_AMEVCNTR1_EL0(6),
        AMU_AMEVCNTR1_EL0(7),
        AMU_AMEVCNTR1_EL0(8),
        AMU_AMEVCNTR1_EL0(9),
        AMU_AMEVCNTR1_EL0(10),
        AMU_AMEVCNTR1_EL0(11),
        AMU_AMEVCNTR1_EL0(12),
        AMU_AMEVCNTR1_EL0(13),
        AMU_AMEVCNTR1_EL0(14),
        AMU_AMEVCNTR1_EL0(15),
        AMU_AMEVTYPER1_EL0(0),
        AMU_AMEVTYPER1_EL0(1),
        AMU_AMEVTYPER1_EL0(2),
        AMU_AMEVTYPER1_EL0(3),
        AMU_AMEVTYPER1_EL0(4),
        AMU_AMEVTYPER1_EL0(5),
        AMU_AMEVTYPER1_EL0(6),
        AMU_AMEVTYPER1_EL0(7),
        AMU_AMEVTYPER1_EL0(8),
        AMU_AMEVTYPER1_EL0(9),
        AMU_AMEVTYPER1_EL0(10),
        AMU_AMEVTYPER1_EL0(11),
        AMU_AMEVTYPER1_EL0(12),
        AMU_AMEVTYPER1_EL0(13),
        AMU_AMEVTYPER1_EL0(14),
        AMU_AMEVTYPER1_EL0(15),

        /* EL0 counter/timer registers, all routed to access_arch_timer */
        { SYS_DESC(SYS_CNTPCT_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTVCT_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTPCTSS_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTVCTSS_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTP_CVAL_EL0), access_arch_timer },

        { SYS_DESC(SYS_CNTV_TVAL_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTV_CTL_EL0), access_arch_timer },
        { SYS_DESC(SYS_CNTV_CVAL_EL0), access_arch_timer },

        /* PMEVCNTRn_EL0 */
        PMU_PMEVCNTR_EL0(0),
        PMU_PMEVCNTR_EL0(1),
        PMU_PMEVCNTR_EL0(2),
        PMU_PMEVCNTR_EL0(3),
        PMU_PMEVCNTR_EL0(4),
        PMU_PMEVCNTR_EL0(5),
        PMU_PMEVCNTR_EL0(6),
        PMU_PMEVCNTR_EL0(7),
        PMU_PMEVCNTR_EL0(8),
        PMU_PMEVCNTR_EL0(9),
        PMU_PMEVCNTR_EL0(10),
        PMU_PMEVCNTR_EL0(11),
        PMU_PMEVCNTR_EL0(12),
        PMU_PMEVCNTR_EL0(13),
        PMU_PMEVCNTR_EL0(14),
        PMU_PMEVCNTR_EL0(15),
        PMU_PMEVCNTR_EL0(16),
        PMU_PMEVCNTR_EL0(17),
        PMU_PMEVCNTR_EL0(18),
        PMU_PMEVCNTR_EL0(19),
        PMU_PMEVCNTR_EL0(20),
        PMU_PMEVCNTR_EL0(21),
        PMU_PMEVCNTR_EL0(22),
        PMU_PMEVCNTR_EL0(23),
        PMU_PMEVCNTR_EL0(24),
        PMU_PMEVCNTR_EL0(25),
        PMU_PMEVCNTR_EL0(26),
        PMU_PMEVCNTR_EL0(27),
        PMU_PMEVCNTR_EL0(28),
        PMU_PMEVCNTR_EL0(29),
        PMU_PMEVCNTR_EL0(30),
        /* PMEVTYPERn_EL0 */
        PMU_PMEVTYPER_EL0(0),
        PMU_PMEVTYPER_EL0(1),
        PMU_PMEVTYPER_EL0(2),
        PMU_PMEVTYPER_EL0(3),
        PMU_PMEVTYPER_EL0(4),
        PMU_PMEVTYPER_EL0(5),
        PMU_PMEVTYPER_EL0(6),
        PMU_PMEVTYPER_EL0(7),
        PMU_PMEVTYPER_EL0(8),
        PMU_PMEVTYPER_EL0(9),
        PMU_PMEVTYPER_EL0(10),
        PMU_PMEVTYPER_EL0(11),
        PMU_PMEVTYPER_EL0(12),
        PMU_PMEVTYPER_EL0(13),
        PMU_PMEVTYPER_EL0(14),
        PMU_PMEVTYPER_EL0(15),
        PMU_PMEVTYPER_EL0(16),
        PMU_PMEVTYPER_EL0(17),
        PMU_PMEVTYPER_EL0(18),
        PMU_PMEVTYPER_EL0(19),
        PMU_PMEVTYPER_EL0(20),
        PMU_PMEVTYPER_EL0(21),
        PMU_PMEVTYPER_EL0(22),
        PMU_PMEVTYPER_EL0(23),
        PMU_PMEVTYPER_EL0(24),
        PMU_PMEVTYPER_EL0(25),
        PMU_PMEVTYPER_EL0(26),
        PMU_PMEVTYPER_EL0(27),
        PMU_PMEVTYPER_EL0(28),
        PMU_PMEVTYPER_EL0(29),
        PMU_PMEVTYPER_EL0(30),
        /*
         * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero
         * in 32bit mode. Here we choose to reset it as zero for consistency.
         */
        { PMU_SYS_REG(PMCCFILTR_EL0), .access = access_pmu_evtyper,
          .reset = reset_val, .reg = PMCCFILTR_EL0, .val = 0 },

        /* EL2 registers */
        EL2_REG_VNCR(VPIDR_EL2, reset_unknown, 0),
        EL2_REG_VNCR(VMPIDR_EL2, reset_unknown, 0),
        EL2_REG(SCTLR_EL2, access_rw, reset_val, SCTLR_EL2_RES1),
        EL2_REG(ACTLR_EL2, access_rw, reset_val, 0),
        EL2_REG_VNCR(HCR_EL2, reset_hcr, 0),
        EL2_REG(MDCR_EL2, access_mdcr, reset_mdcr, 0),
        EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1),
        EL2_REG_VNCR(HSTR_EL2, reset_val, 0),
        EL2_REG_VNCR(HFGRTR_EL2, reset_val, 0),
        EL2_REG_VNCR(HFGWTR_EL2, reset_val, 0),
        EL2_REG_VNCR(HFGITR_EL2, reset_val, 0),
        EL2_REG_VNCR(HACR_EL2, reset_val, 0),

        EL2_REG_FILTERED(ZCR_EL2, access_zcr_el2, reset_val, 0,
                         sve_el2_visibility),

        EL2_REG_VNCR(HCRX_EL2, reset_val, 0),

        EL2_REG(TTBR0_EL2, access_rw, reset_val, 0),
        EL2_REG(TTBR1_EL2, access_rw, reset_val, 0),
        EL2_REG(TCR_EL2, access_rw, reset_val, TCR_EL2_RES1),
        EL2_REG_FILTERED(TCR2_EL2, access_rw, reset_val, TCR2_EL2_RES1,
                         tcr2_el2_visibility),
        EL2_REG_VNCR(VTTBR_EL2, reset_val, 0),
        EL2_REG_VNCR(VTCR_EL2, reset_val, 0),
        EL2_REG_FILTERED(VNCR_EL2, bad_vncr_trap, reset_val, 0,
                         vncr_el2_visibility),

        { SYS_DESC(SYS_DACR32_EL2), undef_access, reset_unknown, DACR32_EL2 },
        EL2_REG_VNCR(HDFGRTR_EL2, reset_val, 0),
        EL2_REG_VNCR(HDFGWTR_EL2, reset_val, 0),
        EL2_REG_VNCR(HAFGRTR_EL2, reset_val, 0),
        EL2_REG_REDIR(SPSR_EL2, reset_val, 0),
        EL2_REG_REDIR(ELR_EL2, reset_val, 0),
        { SYS_DESC(SYS_SP_EL1), access_sp_el1},

        /* AArch32 SPSR_* are RES0 if trapped from a NV guest */
        { SYS_DESC(SYS_SPSR_irq), .access = trap_raz_wi },
        { SYS_DESC(SYS_SPSR_abt), .access = trap_raz_wi },
        { SYS_DESC(SYS_SPSR_und), .access = trap_raz_wi },
        { SYS_DESC(SYS_SPSR_fiq), .access = trap_raz_wi },

        { SYS_DESC(SYS_IFSR32_EL2), undef_access, reset_unknown, IFSR32_EL2 },
        EL2_REG(AFSR0_EL2, access_rw, reset_val, 0),
        EL2_REG(AFSR1_EL2, access_rw, reset_val, 0),
        EL2_REG_REDIR(ESR_EL2, reset_val, 0),
        { SYS_DESC(SYS_FPEXC32_EL2), undef_access, reset_val, FPEXC32_EL2, 0x700 },

        EL2_REG_REDIR(FAR_EL2, reset_val, 0),
        EL2_REG(HPFAR_EL2, access_rw, reset_val, 0),

        EL2_REG(MAIR_EL2, access_rw, reset_val, 0),
        EL2_REG_FILTERED(PIRE0_EL2, access_rw, reset_val, 0,
                         s1pie_el2_visibility),
        EL2_REG_FILTERED(PIR_EL2, access_rw, reset_val, 0,
                         s1pie_el2_visibility),
        EL2_REG_FILTERED(POR_EL2, access_rw, reset_val, 0,
                         s1poe_el2_visibility),
        EL2_REG(AMAIR_EL2, access_rw, reset_val, 0),
        { SYS_DESC(SYS_MPAMHCR_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPMV_EL2), undef_access },
        { SYS_DESC(SYS_MPAM2_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM0_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM1_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM2_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM3_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM4_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM5_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM6_EL2), undef_access },
        { SYS_DESC(SYS_MPAMVPM7_EL2), undef_access },

        EL2_REG(VBAR_EL2, access_rw, reset_val, 0),
        EL2_REG(RVBAR_EL2, access_rw, reset_val, 0),
        { SYS_DESC(SYS_RMR_EL2), undef_access },

        EL2_REG_VNCR(ICH_AP0R0_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP0R1_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP0R2_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP0R3_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP1R0_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP1R1_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP1R2_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_AP1R3_EL2, reset_val, 0),

        { SYS_DESC(SYS_ICC_SRE_EL2), access_gic_sre },

        EL2_REG_VNCR(ICH_HCR_EL2, reset_val, 0),
        { SYS_DESC(SYS_ICH_VTR_EL2), access_gic_vtr },
        { SYS_DESC(SYS_ICH_MISR_EL2), access_gic_misr },
        { SYS_DESC(SYS_ICH_EISR_EL2), access_gic_eisr },
        { SYS_DESC(SYS_ICH_ELRSR_EL2), access_gic_elrsr },
        EL2_REG_VNCR(ICH_VMCR_EL2, reset_val, 0),

        EL2_REG_VNCR(ICH_LR0_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR1_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR2_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR3_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR4_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR5_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR6_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR7_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR8_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR9_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR10_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR11_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR12_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR13_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR14_EL2, reset_val, 0),
        EL2_REG_VNCR(ICH_LR15_EL2, reset_val, 0),

        EL2_REG(CONTEXTIDR_EL2, access_rw, reset_val, 0),
        EL2_REG(TPIDR_EL2, access_rw, reset_val, 0),

        EL2_REG_VNCR(CNTVOFF_EL2, reset_val, 0),
        EL2_REG(CNTHCTL_EL2, access_rw, reset_val, 0),
        { SYS_DESC(SYS_CNTHP_TVAL_EL2), access_arch_timer },
        EL2_REG(CNTHP_CTL_EL2, access_arch_timer, reset_val, 0),
        EL2_REG(CNTHP_CVAL_EL2, access_arch_timer, reset_val, 0),

        { SYS_DESC(SYS_CNTHV_TVAL_EL2), access_hv_timer },
        EL2_REG(CNTHV_CTL_EL2, access_hv_timer, reset_val, 0),
        EL2_REG(CNTHV_CVAL_EL2, access_hv_timer, reset_val, 0),

        { SYS_DESC(SYS_CNTKCTL_EL12), access_cntkctl_el12 },

        { SYS_DESC(SYS_CNTP_TVAL_EL02), access_arch_timer },
        { SYS_DESC(SYS_CNTP_CTL_EL02), access_arch_timer },
        { SYS_DESC(SYS_CNTP_CVAL_EL02), access_arch_timer },

        { SYS_DESC(SYS_CNTV_TVAL_EL02), access_arch_timer },
        { SYS_DESC(SYS_CNTV_CTL_EL02), access_arch_timer },
        { SYS_DESC(SYS_CNTV_CVAL_EL02), access_arch_timer },

        EL2_REG(SP_EL2, NULL, reset_unknown, 0),
};

/*
 * Handler that rebuilds the instruction encoding from the decoded fields in
 * @p and passes it, together with the operand in @p->regval, to
 * __kvm_at_s1e01(). @r is unused. Always returns true.
 */
static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        u32 encoding;

        encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
        __kvm_at_s1e01(vcpu, encoding, p->regval);

        return true;
}

/*
 * Handler that rebuilds the instruction encoding from the decoded fields in
 * @p and passes it, together with the operand in @p->regval, to
 * __kvm_at_s1e2().
 *
 * When the encoding is OP_AT_S1E2A and the VM does not have
 * ID_AA64ISAR2_EL1.ATS1A at the IMP level, an undefined exception is
 * injected and false is returned instead. Returns true otherwise.
 */
static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        u32 encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
        bool is_s1e2a = (encoding == OP_AT_S1E2A);

        /*
         * AT S1E2A has no FGT associated with it :-(, so the feature has to
         * be checked by hand.
         */
        if (is_s1e2a && !kvm_has_feat(vcpu->kvm, ID_AA64ISAR2_EL1, ATS1A, IMP)) {
                kvm_inject_undefined(vcpu);
                return false;
        }

        __kvm_at_s1e2(vcpu, encoding, p->regval);

        return true;
}

/*
 * Handler that rebuilds the instruction encoding from the decoded fields in
 * @p and passes it, together with the operand in @p->regval, to
 * __kvm_at_s12(). @r is unused. Always returns true.
 */
static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                          const struct sys_reg_desc *r)
{
        u32 encoding;

        encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
        __kvm_at_s12(vcpu, encoding, p->regval);

        return true;
}

/*
 * Tell whether the TLBI encoding @instr may be used by the VM owning @vpcu.
 *
 * Returns false when:
 *  - CRn is TLBI_CRn_nXS and the VM lacks ID_AA64ISAR1_EL1.XS == IMP, or
 *  - CRm is TLBI_CRm_nROS and the VM lacks ID_AA64ISAR0_EL1.TLB == OS.
 * Returns true in every other case.
 */
static bool kvm_supported_tlbi_s12_op(struct kvm_vcpu *vpcu, u32 instr)
{
        struct kvm *kvm = vpcu->kvm;
        u8 crm = sys_reg_CRm(instr);
        bool is_nxs = (sys_reg_CRn(instr) == TLBI_CRn_nXS);
        bool is_os = (crm == TLBI_CRm_nROS);

        if (is_nxs && !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
                return false;

        if (is_os && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
                return false;

        return true;
}

static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                           const struct sys_reg_desc *r)
{
        u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);

        if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
                return undef_access(vcpu, p, r);

        write_lock(&vcpu->kvm->mmu_lock);

        /*
         * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the
         * corresponding VMIDs.
         */
        kvm_nested_s2_unmap(vcpu->kvm, true);

        write_unlock(&vcpu->kvm->mmu_lock);

        return true;
}

/*
 * Tell whether the TLBI encoding @instr may be used by the VM owning @vpcu.
 *
 * Returns false when the encoding depends on a feature the VM lacks:
 *  - CRn == TLBI_CRn_nXS needs ID_AA64ISAR1_EL1.XS == IMP;
 *  - CRm == TLBI_CRm_IPAIS with Op2 2 or 6, and CRm == TLBI_CRm_IPAONS with
 *    Op2 3 or 7, need ID_AA64ISAR0_EL1.TLB == RANGE;
 *  - CRm == TLBI_CRm_IPAONS with Op2 0 or 4 needs ID_AA64ISAR0_EL1.TLB == OS.
 * Returns true in every other case.
 *
 * NOTE(review): CRm == TLBI_CRm_IPAONS with Op2 2 or 6 is not gated on
 * TLB RANGE here -- confirm against the TLBI encodings whether that is
 * intentional.
 */
static bool kvm_supported_tlbi_ipas2_op(struct kvm_vcpu *vpcu, u32 instr)
{
        struct kvm *kvm = vpcu->kvm;
        u8 crm = sys_reg_CRm(instr);
        u8 op2 = sys_reg_Op2(instr);
        bool needs_range, needs_os;

        if (sys_reg_CRn(instr) == TLBI_CRn_nXS &&
            !kvm_has_feat(kvm, ID_AA64ISAR1_EL1, XS, IMP))
                return false;

        /* The two requirements below can never both hold for one encoding. */
        needs_range = (crm == TLBI_CRm_IPAIS && (op2 == 2 || op2 == 6)) ||
                      (crm == TLBI_CRm_IPAONS && (op2 == 3 || op2 == 7));
        needs_os = (crm == TLBI_CRm_IPAONS && (op2 == 0 || op2 == 4));

        if (needs_range && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, RANGE))
                return false;

        if (needs_os && !kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
                return false;

        return true;
}

/*
 * Only defined here as this is an internal "abstraction"
 *
 * Parameter bundle handed by pointer to the TLBI helpers in this file;
 * being a union, only one of the members below is meaningful at a time.
 */
union tlbi_info {
        /* Range given as (start, size); read by s2_mmu_unmap_range() */
        struct {
                u64        start;
                u64        size;
        } range;

        /* Single address; presumably an IPA, going by the name -- TODO confirm */
        struct {
                u64        addr;
        } ipa;

        /*
         * Address plus a 32-bit encoding; presumably a VA and the encoding
         * of the TLBI instruction being handled -- TODO confirm
         */
        struct {
                u64        addr;
                u32        encoding;
        } va;
};

static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu,
                               const union tlbi_info *info)
{
        /*
         * The unmap operation is allowed to drop the MMU lock and block, which
         * means that @mmu could be used for a different context than the one
         * currently being invalidated.
         *
         * This behavior is still safe, as:
         *
         *  1) The vCPU(s) that recycled the MMU are responsible for invalidating
         *     the entire MMU before reusing it, which still honors the intent
         *     of a TLBI.
         *
         *  2) Until the guest TLBI instruction is 'retired' (i.e. increment PC
         *     and ERET to the guest), other vCPUs are allowed to use stale
         *     translations.
         *
         *  3) Accidentally unmapping an unrelated MMU context is nonfatal, and
         *     at worst may cause more aborts for shadow stage-2 fills.
         *
         * Dropping the MMU lock also implies that shadow stage-2 fills could
         * happen behind the back of the TLBI. This is still safe, though, as
         * the L1 needs to put its stage-2 in a consistent state before doing
         * the TLBI.
         */
        kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true);
}

static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                                const struct sys_reg_desc *r)
{
        u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
        u64 limit, vttbr;

        if (!kvm_supported_tlbi_s12_op(vcpu, sys_encoding))
                return undef_access(vcpu, p, r);

        vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
        limit = BIT_ULL(kvm_get_pa_bits(vcpu->kvm));

        kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
                                   &(union tlbi_info) {
                                           .range = {
                                                   .start = 0,
                                                   .size = limit,
                                           },
                                   },
                                   s2_mmu_unmap_range);

        return true;
}

static bool handle_ripas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                              const struct sys_reg_desc *r)
{
        u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
        u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
        u64 base, range;

        if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
                return undef_access(vcpu, p, r);

        /*
         * Because the shadow S2 structure doesn't necessarily reflect that
         * of the guest's S2 (different base granule size, for example), we
         * decide to ignore TTL and only use the described range.
         */
        base = decode_range_tlbi(p->regval, &range, NULL);

        kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
                                   &(union tlbi_info) {
                                           .range = {
                                                   .start = base,
                                                   .size = range,
                                           },
                                   },
                                   s2_mmu_unmap_range);

        return true;
}

/*
 * Per-MMU callback: unmap the naturally aligned block of the shadow
 * stage-2 @mmu that contains the IPA described by the raw TLBI operand
 * in info->ipa.addr. The block size is whatever
 * compute_tlb_inval_range() returns for that operand.
 */
static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu,
                             const union tlbi_info *info)
{
        const u64 operand = info->ipa.addr;
        unsigned long span;
        u64 ipa;

        /*
         * Only bits [35:0] of the operand are kept, which drops:
         *
         * - the NS bit: we're non-secure only.
         *
         * - IPA[51:48]: 52bit IPAs are not supported just yet...
         *
         * The result is shifted left by 12 to turn it into an actual
         * address.
         */
        ipa = (operand & GENMASK_ULL(35, 0)) << 12;

        /* Align the base down to the size being invalidated */
        span = compute_tlb_inval_range(mmu, operand);
        ipa &= ~(span - 1);

        /*
         * See comment in s2_mmu_unmap_range() for why this is allowed to
         * reschedule.
         */
        kvm_stage2_unmap_range(mmu, ipa, span, true);
}

static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                             const struct sys_reg_desc *r)
{
        u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
        u64 vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);

        if (!kvm_supported_tlbi_ipas2_op(vcpu, sys_encoding))
                return undef_access(vcpu, p, r);

        kvm_s2_mmu_iterate_by_vmid(vcpu->kvm, get_vmid(vttbr),
                                   &(union tlbi_info) {
                                           .ipa = {
                                                   .addr = p->regval,
                                           },
                                   },
                                   s2_mmu_unmap_ipa);

        return true;
}

static void s2_mmu_tlbi_s1e1(struct kvm_s2_mmu *mmu,
                             const union tlbi_info *info)
{
        WARN_ON(__kvm_tlbi_s1e2(mmu, info->va.addr, info->va.encoding));
}

/*
 * Handler shared by the EL2 TLBI entries of sys_insn_descs[].
 *
 * Encodings rejected by kvm_supported_tlbi_s1e2_op() go to undef_access();
 * the others are passed, with their operand, to kvm_handle_s1e2_tlbi().
 */
static bool handle_tlbi_el2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        u32 insn = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);

        if (kvm_supported_tlbi_s1e2_op(vcpu, insn)) {
                kvm_handle_s1e2_tlbi(vcpu, insn, p->regval);
                return true;
        }

        return undef_access(vcpu, p, r);
}

static bool handle_tlbi_el1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
                            const struct sys_reg_desc *r)
{
        u32 sys_encoding = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);

        /*
         * If we're here, this is because we've trapped on a EL1 TLBI
         * instruction that affects the EL1 translation regime while
         * we're running in a context that doesn't allow us to let the
         * HW do its thing (aka vEL2):
         *
         * - HCR_EL2.E2H == 0 : a non-VHE guest
         * - HCR_EL2.{E2H,TGE} == { 1, 0 } : a VHE guest in guest mode
         *
         * Another possibility is that we are invalidating the EL2 context
         * using EL1 instructions, but that we landed here because we need
         * additional invalidation for structures that are not held in the
         * CPU TLBs (such as the VNCR pseudo-TLB and its EL2 mapping). In
         * that case, we are guaranteed that HCR_EL2.{E2H,TGE} == { 1, 1 }
         * as we don't allow an NV-capable L1 in a nVHE configuration.
         *
         * We don't expect these helpers to ever be called when running
         * in a vEL1 context.
         */

        WARN_ON(!vcpu_is_el2(vcpu));

        if (!kvm_supported_tlbi_s1e1_op(vcpu, sys_encoding))
                return undef_access(vcpu, p, r);

        if (vcpu_el2_e2h_is_set(vcpu) && vcpu_el2_tge_is_set(vcpu)) {
                kvm_handle_s1e2_tlbi(vcpu, sys_encoding, p->regval);
                return true;
        }

        kvm_s2_mmu_iterate_by_vmid(vcpu->kvm,
                                   get_vmid(__vcpu_sys_reg(vcpu, VTTBR_EL2)),
                                   &(union tlbi_info) {
                                           .va = {
                                                   .addr = p->regval,
                                                   .encoding = sys_encoding,
                                           },
                                   },
                                   s2_mmu_tlbi_s1e1);

        return true;
}

/*
 * Build a descriptor for the system instruction named OP_<insn>, with
 * access_fn installed as its .access handler.
 */
#define SYS_INSN(insn, access_fn)                                        \
        {                                                                \
                SYS_DESC(OP_##insn),                                        \
                .access = (access_fn),                                        \
        }

/*
 * Descriptors for the trapped system instructions: DC set/way cache
 * maintenance, AT address translation and the TLBI families, each paired
 * with the handler invoked for it.
 *
 * NOTE(review): unlike the cp14/cp15 tables further down, this array is
 * not const; the reason is not visible from here. The entries are
 * presumably expected to stay sorted by encoding (check_sysreg_table()
 * rejects out-of-order tables) -- confirm before inserting a new one.
 */
static struct sys_reg_desc sys_insn_descs[] = {
        { SYS_DESC(SYS_DC_ISW), access_dcsw },
        { SYS_DESC(SYS_DC_IGSW), access_dcgsw },
        { SYS_DESC(SYS_DC_IGDSW), access_dcgsw },

        SYS_INSN(AT_S1E1R, handle_at_s1e01),
        SYS_INSN(AT_S1E1W, handle_at_s1e01),
        SYS_INSN(AT_S1E0R, handle_at_s1e01),
        SYS_INSN(AT_S1E0W, handle_at_s1e01),
        SYS_INSN(AT_S1E1RP, handle_at_s1e01),
        SYS_INSN(AT_S1E1WP, handle_at_s1e01),

        { SYS_DESC(SYS_DC_CSW), access_dcsw },
        { SYS_DESC(SYS_DC_CGSW), access_dcgsw },
        { SYS_DESC(SYS_DC_CGDSW), access_dcgsw },
        { SYS_DESC(SYS_DC_CISW), access_dcsw },
        { SYS_DESC(SYS_DC_CIGSW), access_dcgsw },
        { SYS_DESC(SYS_DC_CIGDSW), access_dcgsw },

        SYS_INSN(TLBI_VMALLE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_ASIDE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAAE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_VALE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAALE1OS, handle_tlbi_el1),

        SYS_INSN(TLBI_RVAE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAAE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVALE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAALE1IS, handle_tlbi_el1),

        SYS_INSN(TLBI_VMALLE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_ASIDE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAAE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_VALE1IS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAALE1IS, handle_tlbi_el1),

        SYS_INSN(TLBI_RVAE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAAE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVALE1OS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAALE1OS, handle_tlbi_el1),

        SYS_INSN(TLBI_RVAE1, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAAE1, handle_tlbi_el1),
        SYS_INSN(TLBI_RVALE1, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAALE1, handle_tlbi_el1),

        SYS_INSN(TLBI_VMALLE1, handle_tlbi_el1),
        SYS_INSN(TLBI_VAE1, handle_tlbi_el1),
        SYS_INSN(TLBI_ASIDE1, handle_tlbi_el1),
        SYS_INSN(TLBI_VAAE1, handle_tlbi_el1),
        SYS_INSN(TLBI_VALE1, handle_tlbi_el1),
        SYS_INSN(TLBI_VAALE1, handle_tlbi_el1),

        SYS_INSN(TLBI_VMALLE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_ASIDE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAAE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VALE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAALE1OSNXS, handle_tlbi_el1),

        SYS_INSN(TLBI_RVAE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAAE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVALE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAALE1ISNXS, handle_tlbi_el1),

        SYS_INSN(TLBI_VMALLE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_ASIDE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAAE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VALE1ISNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAALE1ISNXS, handle_tlbi_el1),

        SYS_INSN(TLBI_RVAE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAAE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVALE1OSNXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAALE1OSNXS, handle_tlbi_el1),

        SYS_INSN(TLBI_RVAE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAAE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVALE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_RVAALE1NXS, handle_tlbi_el1),

        SYS_INSN(TLBI_VMALLE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_ASIDE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAAE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VALE1NXS, handle_tlbi_el1),
        SYS_INSN(TLBI_VAALE1NXS, handle_tlbi_el1),

        SYS_INSN(AT_S1E2R, handle_at_s1e2),
        SYS_INSN(AT_S1E2W, handle_at_s1e2),
        SYS_INSN(AT_S12E1R, handle_at_s12),
        SYS_INSN(AT_S12E1W, handle_at_s12),
        SYS_INSN(AT_S12E0R, handle_at_s12),
        SYS_INSN(AT_S12E0W, handle_at_s12),
        SYS_INSN(AT_S1E2A, handle_at_s1e2),

        SYS_INSN(TLBI_IPAS2E1IS, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2E1IS, handle_ripas2e1is),
        SYS_INSN(TLBI_IPAS2LE1IS, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2LE1IS, handle_ripas2e1is),

        SYS_INSN(TLBI_ALLE2OS, handle_tlbi_el2),
        SYS_INSN(TLBI_VAE2OS, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE1OS, handle_alle1is),
        SYS_INSN(TLBI_VALE2OS, handle_tlbi_el2),
        SYS_INSN(TLBI_VMALLS12E1OS, handle_vmalls12e1is),

        SYS_INSN(TLBI_RVAE2IS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVALE2IS, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE2IS, handle_tlbi_el2),
        SYS_INSN(TLBI_VAE2IS, handle_tlbi_el2),

        SYS_INSN(TLBI_ALLE1IS, handle_alle1is),

        SYS_INSN(TLBI_VALE2IS, handle_tlbi_el2),

        SYS_INSN(TLBI_VMALLS12E1IS, handle_vmalls12e1is),
        SYS_INSN(TLBI_IPAS2E1OS, handle_ipas2e1is),
        SYS_INSN(TLBI_IPAS2E1, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2E1, handle_ripas2e1is),
        SYS_INSN(TLBI_RIPAS2E1OS, handle_ripas2e1is),
        SYS_INSN(TLBI_IPAS2LE1OS, handle_ipas2e1is),
        SYS_INSN(TLBI_IPAS2LE1, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2LE1, handle_ripas2e1is),
        SYS_INSN(TLBI_RIPAS2LE1OS, handle_ripas2e1is),
        SYS_INSN(TLBI_RVAE2OS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVALE2OS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVAE2, handle_tlbi_el2),
        SYS_INSN(TLBI_RVALE2, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE2, handle_tlbi_el2),
        SYS_INSN(TLBI_VAE2, handle_tlbi_el2),

        SYS_INSN(TLBI_ALLE1, handle_alle1is),

        SYS_INSN(TLBI_VALE2, handle_tlbi_el2),

        SYS_INSN(TLBI_VMALLS12E1, handle_vmalls12e1is),

        SYS_INSN(TLBI_IPAS2E1ISNXS, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2E1ISNXS, handle_ripas2e1is),
        SYS_INSN(TLBI_IPAS2LE1ISNXS, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2LE1ISNXS, handle_ripas2e1is),

        SYS_INSN(TLBI_ALLE2OSNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_VAE2OSNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE1OSNXS, handle_alle1is),
        SYS_INSN(TLBI_VALE2OSNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_VMALLS12E1OSNXS, handle_vmalls12e1is),

        SYS_INSN(TLBI_RVAE2ISNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVALE2ISNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE2ISNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_VAE2ISNXS, handle_tlbi_el2),

        SYS_INSN(TLBI_ALLE1ISNXS, handle_alle1is),
        SYS_INSN(TLBI_VALE2ISNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_VMALLS12E1ISNXS, handle_vmalls12e1is),
        SYS_INSN(TLBI_IPAS2E1OSNXS, handle_ipas2e1is),
        SYS_INSN(TLBI_IPAS2E1NXS, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2E1NXS, handle_ripas2e1is),
        SYS_INSN(TLBI_RIPAS2E1OSNXS, handle_ripas2e1is),
        SYS_INSN(TLBI_IPAS2LE1OSNXS, handle_ipas2e1is),
        SYS_INSN(TLBI_IPAS2LE1NXS, handle_ipas2e1is),
        SYS_INSN(TLBI_RIPAS2LE1NXS, handle_ripas2e1is),
        SYS_INSN(TLBI_RIPAS2LE1OSNXS, handle_ripas2e1is),
        SYS_INSN(TLBI_RVAE2OSNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVALE2OSNXS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVAE2NXS, handle_tlbi_el2),
        SYS_INSN(TLBI_RVALE2NXS, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE2NXS, handle_tlbi_el2),
        SYS_INSN(TLBI_VAE2NXS, handle_tlbi_el2),
        SYS_INSN(TLBI_ALLE1NXS, handle_alle1is),
        SYS_INSN(TLBI_VALE2NXS, handle_tlbi_el2),
        SYS_INSN(TLBI_VMALLS12E1NXS, handle_vmalls12e1is),
};

/*
 * Trap handler for DBGDIDR (see cp14_regs[]).
 *
 * Writes are handed to ignore_write(). Reads return a value synthesised
 * from the VM's ID_AA64DFR0_EL1: WRPs at bit 28, BRPs at bit 24, CTX_CMPs
 * at bit 20 and DebugVer at bit 16. Bit 15 is always set, while bits 14
 * and 12 are set when kvm_has_feat() reports EL3 for the VM.
 */
static bool trap_dbgdidr(struct kvm_vcpu *vcpu,
                        struct sys_reg_params *p,
                        const struct sys_reg_desc *r)
{
        u64 dfr;
        u32 el3;

        if (p->is_write)
                return ignore_write(vcpu, p);

        dfr = kvm_read_vm_id_reg(vcpu->kvm, SYS_ID_AA64DFR0_EL1);
        el3 = kvm_has_feat(vcpu->kvm, ID_AA64PFR0_EL1, EL3, IMP);

        p->regval = ((SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr) << 28) |
                     (SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr) << 24) |
                     (SYS_FIELD_GET(ID_AA64DFR0_EL1, CTX_CMPs, dfr) << 20) |
                     (SYS_FIELD_GET(ID_AA64DFR0_EL1, DebugVer, dfr) << 16) |
                     (1 << 15) | (el3 << 14) | (el3 << 12));

        return true;
}

/*
 * AArch32 debug register mappings
 *
 * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0]
 * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32]
 *
 * None of the other registers share their location, so treat them as
 * if they were 64bit.
 */
/*
 * Expands to the four descriptors of breakpoint/watchpoint pair n, in the
 * order DBGBVRn, DBGBCRn, DBGWVRn, DBGWCRn (CRn 0, CRm n, Op2 4 to 7).
 * All of them use trap_dbg_wb_reg as their handler; only the DBGBVRn
 * entry carries the AA32(LO) marking.
 */
#define DBG_BCR_BVR_WCR_WVR(n)                                                        \
        /* DBGBVRn */                                                                \
        { AA32(LO), Op1( 0), CRn( 0), CRm((n)), Op2( 4),                        \
          trap_dbg_wb_reg, NULL, n },                                                \
        /* DBGBCRn */                                                                \
        { Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_dbg_wb_reg, NULL, n },        \
        /* DBGWVRn */                                                                \
        { Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_dbg_wb_reg, NULL, n },        \
        /* DBGWCRn */                                                                \
        { Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_dbg_wb_reg, NULL, n }

/*
 * Descriptor for DBGBXVRn (CRn 1, CRm n, Op2 1), marked AA32(HI) and
 * handled by trap_dbg_wb_reg.
 */
#define DBGBXVR(n)                                                                \
        { AA32(HI), Op1( 0), CRn( 1), CRm((n)), Op2( 1),                        \
          trap_dbg_wb_reg, NULL, n }

/*
 * Trapped cp14 registers. We generally ignore most of the external
 * debug registers, on the principle that they don't really make sense
 * to a guest. Revisit this one day, should this principle change.
 *
 * NOTE(review): entries appear to be ordered by (Op1, CRn, CRm, Op2),
 * which is why the per-index macros are interleaved with the individual
 * registers -- presumably required by the table lookup; confirm before
 * reordering.
 */
static const struct sys_reg_desc cp14_regs[] = {
        /* DBGDIDR */
        { Op1( 0), CRn( 0), CRm( 0), Op2( 0), trap_dbgdidr },
        /* DBGDTRRXext */
        { Op1( 0), CRn( 0), CRm( 0), Op2( 2), trap_raz_wi },

        DBG_BCR_BVR_WCR_WVR(0),
        /* DBGDSCRint */
        { Op1( 0), CRn( 0), CRm( 1), Op2( 0), trap_raz_wi },
        DBG_BCR_BVR_WCR_WVR(1),
        /* DBGDCCINT */
        { Op1( 0), CRn( 0), CRm( 2), Op2( 0), trap_debug_regs, NULL, MDCCINT_EL1 },
        /* DBGDSCRext */
        { Op1( 0), CRn( 0), CRm( 2), Op2( 2), trap_debug_regs, NULL, MDSCR_EL1 },
        DBG_BCR_BVR_WCR_WVR(2),
        /* DBGDTR[RT]Xint */
        { Op1( 0), CRn( 0), CRm( 3), Op2( 0), trap_raz_wi },
        /* DBGDTR[RT]Xext */
        { Op1( 0), CRn( 0), CRm( 3), Op2( 2), trap_raz_wi },
        DBG_BCR_BVR_WCR_WVR(3),
        DBG_BCR_BVR_WCR_WVR(4),
        DBG_BCR_BVR_WCR_WVR(5),
        /* DBGWFAR */
        { Op1( 0), CRn( 0), CRm( 6), Op2( 0), trap_raz_wi },
        /* DBGOSECCR */
        { Op1( 0), CRn( 0), CRm( 6), Op2( 2), trap_raz_wi },
        DBG_BCR_BVR_WCR_WVR(6),
        /* DBGVCR */
        { Op1( 0), CRn( 0), CRm( 7), Op2( 0), trap_debug_regs, NULL, DBGVCR32_EL2 },
        DBG_BCR_BVR_WCR_WVR(7),
        DBG_BCR_BVR_WCR_WVR(8),
        DBG_BCR_BVR_WCR_WVR(9),
        DBG_BCR_BVR_WCR_WVR(10),
        DBG_BCR_BVR_WCR_WVR(11),
        DBG_BCR_BVR_WCR_WVR(12),
        DBG_BCR_BVR_WCR_WVR(13),
        DBG_BCR_BVR_WCR_WVR(14),
        DBG_BCR_BVR_WCR_WVR(15),

        /* DBGDRAR (32bit) */
        { Op1( 0), CRn( 1), CRm( 0), Op2( 0), trap_raz_wi },

        DBGBXVR(0),
        /* DBGOSLAR */
        { Op1( 0), CRn( 1), CRm( 0), Op2( 4), trap_oslar_el1 },
        DBGBXVR(1),
        /* DBGOSLSR */
        { Op1( 0), CRn( 1), CRm( 1), Op2( 4), trap_oslsr_el1, NULL, OSLSR_EL1 },
        DBGBXVR(2),
        DBGBXVR(3),
        /* DBGOSDLR */
        { Op1( 0), CRn( 1), CRm( 3), Op2( 4), trap_raz_wi },
        DBGBXVR(4),
        /* DBGPRCR */
        { Op1( 0), CRn( 1), CRm( 4), Op2( 4), trap_raz_wi },
        DBGBXVR(5),
        DBGBXVR(6),
        DBGBXVR(7),
        DBGBXVR(8),
        DBGBXVR(9),
        DBGBXVR(10),
        DBGBXVR(11),
        DBGBXVR(12),
        DBGBXVR(13),
        DBGBXVR(14),
        DBGBXVR(15),

        /* DBGDSAR (32bit) */
        { Op1( 0), CRn( 2), CRm( 0), Op2( 0), trap_raz_wi },

        /* DBGDEVID2 */
        { Op1( 0), CRn( 7), CRm( 0), Op2( 7), trap_raz_wi },
        /* DBGDEVID1 */
        { Op1( 0), CRn( 7), CRm( 1), Op2( 7), trap_raz_wi },
        /* DBGDEVID */
        { Op1( 0), CRn( 7), CRm( 2), Op2( 7), trap_raz_wi },
        /* DBGCLAIMSET */
        { Op1( 0), CRn( 7), CRm( 8), Op2( 6), trap_raz_wi },
        /* DBGCLAIMCLR */
        { Op1( 0), CRn( 7), CRm( 9), Op2( 6), trap_raz_wi },
        /* DBGAUTHSTATUS */
        { Op1( 0), CRn( 7), CRm(14), Op2( 6), trap_dbgauthstatus_el1 },
};

/*
 * Trapped cp14 64bit registers. These entries only specify Op1 and CRm;
 * both registers are handled by trap_raz_wi.
 */
static const struct sys_reg_desc cp14_64_regs[] = {
        /* DBGDRAR (64bit) */
        { Op1( 0), CRm( 1), .access = trap_raz_wi },

        /* DBGDSAR (64bit) */
        { Op1( 0), CRm( 2), .access = trap_raz_wi },
};

/*
 * Common initialiser fragment for the AArch32 PMU descriptors: the AA32()
 * mapping, the register encoding, and pmu_visibility as the visibility
 * callback. The user supplies the enclosing braces and the .access
 * handler.
 */
#define CP15_PMU_SYS_REG(_map, _Op1, _CRn, _CRm, _Op2)                        \
        AA32(_map),                                                        \
        Op1(_Op1), CRn(_CRn), CRm(_CRm), Op2(_Op2),                        \
        .visibility = pmu_visibility

/*
 * Macro to expand the PMEVCNTRn register.
 *
 * The encoding is Op1 0, CRn 0b1110, with n split across CRm
 * (0b1000 | n[4:3]) and Op2 (n[2:0]). Handled by access_pmu_evcntr.
 */
#define PMU_PMEVCNTR(n)                                                        \
        { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110,                                \
          (0b1000 | (((n) >> 3) & 0x3)), ((n) & 0x7)),                        \
          .access = access_pmu_evcntr }

/*
 * Macro to expand the PMEVTYPERn register.
 *
 * The encoding is Op1 0, CRn 0b1110, with n split across CRm
 * (0b1100 | n[4:3]) and Op2 (n[2:0]). Handled by access_pmu_evtyper.
 */
#define PMU_PMEVTYPER(n)                                                \
        { CP15_PMU_SYS_REG(DIRECT, 0, 0b1110,                                \
          (0b1100 | (((n) >> 3) & 0x3)), ((n) & 0x7)),                        \
          .access = access_pmu_evtyper }
/*
 * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding,
 * depending on the way they are accessed (as a 32bit or a 64bit
 * register).
 *
 * Entries marked AA32(LO)/AA32(HI) come in pairs that name the same
 * backing register (e.g. TTBCR/TTBCR2 both name TCR_EL1).
 *
 * NOTE(review): entries appear to be ordered by (Op1, CRn, CRm, Op2) --
 * presumably required by the table lookup; confirm before reordering.
 */
static const struct sys_reg_desc cp15_regs[] = {
        { Op1( 0), CRn( 0), CRm( 0), Op2( 1), access_ctr },
        { Op1( 0), CRn( 1), CRm( 0), Op2( 0), access_vm_reg, NULL, SCTLR_EL1 },
        /* ACTLR */
        { AA32(LO), Op1( 0), CRn( 1), CRm( 0), Op2( 1), access_actlr, NULL, ACTLR_EL1 },
        /* ACTLR2 */
        { AA32(HI), Op1( 0), CRn( 1), CRm( 0), Op2( 3), access_actlr, NULL, ACTLR_EL1 },
        { Op1( 0), CRn( 2), CRm( 0), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
        { Op1( 0), CRn( 2), CRm( 0), Op2( 1), access_vm_reg, NULL, TTBR1_EL1 },
        /* TTBCR */
        { AA32(LO), Op1( 0), CRn( 2), CRm( 0), Op2( 2), access_vm_reg, NULL, TCR_EL1 },
        /* TTBCR2 */
        { AA32(HI), Op1( 0), CRn( 2), CRm( 0), Op2( 3), access_vm_reg, NULL, TCR_EL1 },
        { Op1( 0), CRn( 3), CRm( 0), Op2( 0), access_vm_reg, NULL, DACR32_EL2 },
        { CP15_SYS_DESC(SYS_ICC_PMR_EL1), undef_access },
        /* DFSR */
        { Op1( 0), CRn( 5), CRm( 0), Op2( 0), access_vm_reg, NULL, ESR_EL1 },
        { Op1( 0), CRn( 5), CRm( 0), Op2( 1), access_vm_reg, NULL, IFSR32_EL2 },
        /* ADFSR */
        { Op1( 0), CRn( 5), CRm( 1), Op2( 0), access_vm_reg, NULL, AFSR0_EL1 },
        /* AIFSR */
        { Op1( 0), CRn( 5), CRm( 1), Op2( 1), access_vm_reg, NULL, AFSR1_EL1 },
        /* DFAR */
        { AA32(LO), Op1( 0), CRn( 6), CRm( 0), Op2( 0), access_vm_reg, NULL, FAR_EL1 },
        /* IFAR */
        { AA32(HI), Op1( 0), CRn( 6), CRm( 0), Op2( 2), access_vm_reg, NULL, FAR_EL1 },

        /*
         * DC{C,I,CI}SW operations:
         */
        { Op1( 0), CRn( 7), CRm( 6), Op2( 2), access_dcsw },
        { Op1( 0), CRn( 7), CRm(10), Op2( 2), access_dcsw },
        { Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },

        /* PMU */
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 0), .access = access_pmcr },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 1), .access = access_pmcnten },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 2), .access = access_pmcnten },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 3), .access = access_pmovs },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 4), .access = access_pmswinc },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 12, 5), .access = access_pmselr },
        { CP15_PMU_SYS_REG(LO,     0, 9, 12, 6), .access = access_pmceid },
        { CP15_PMU_SYS_REG(LO,     0, 9, 12, 7), .access = access_pmceid },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 0), .access = access_pmu_evcntr },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 1), .access = access_pmu_evtyper },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 13, 2), .access = access_pmu_evcntr },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 0), .access = access_pmuserenr },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 1), .access = access_pminten },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 2), .access = access_pminten },
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 3), .access = access_pmovs },
        { CP15_PMU_SYS_REG(HI,     0, 9, 14, 4), .access = access_pmceid },
        { CP15_PMU_SYS_REG(HI,     0, 9, 14, 5), .access = access_pmceid },
        /* PMMIR */
        { CP15_PMU_SYS_REG(DIRECT, 0, 9, 14, 6), .access = trap_raz_wi },

        /* PRRR/MAIR0 */
        { AA32(LO), Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, MAIR_EL1 },
        /* NMRR/MAIR1 */
        { AA32(HI), Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, MAIR_EL1 },
        /* AMAIR0 */
        { AA32(LO), Op1( 0), CRn(10), CRm( 3), Op2( 0), access_vm_reg, NULL, AMAIR_EL1 },
        /* AMAIR1 */
        { AA32(HI), Op1( 0), CRn(10), CRm( 3), Op2( 1), access_vm_reg, NULL, AMAIR_EL1 },

        { CP15_SYS_DESC(SYS_ICC_IAR0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_EOIR0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_HPPIR0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_BPR0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP0R0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP0R1_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP0R2_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP0R3_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP1R0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_HPPIR1_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_BPR1_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_CTLR_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre },
        { CP15_SYS_DESC(SYS_ICC_IGRPEN0_EL1), undef_access },
        { CP15_SYS_DESC(SYS_ICC_IGRPEN1_EL1), undef_access },

        { Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, CONTEXTIDR_EL1 },

        /* Arch Timers */
        { SYS_DESC(SYS_AARCH32_CNTP_TVAL), access_arch_timer },
        { SYS_DESC(SYS_AARCH32_CNTP_CTL), access_arch_timer },

        /* PMEVCNTRn */
        PMU_PMEVCNTR(0),
        PMU_PMEVCNTR(1),
        PMU_PMEVCNTR(2),
        PMU_PMEVCNTR(3),
        PMU_PMEVCNTR(4),
        PMU_PMEVCNTR(5),
        PMU_PMEVCNTR(6),
        PMU_PMEVCNTR(7),
        PMU_PMEVCNTR(8),
        PMU_PMEVCNTR(9),
        PMU_PMEVCNTR(10),
        PMU_PMEVCNTR(11),
        PMU_PMEVCNTR(12),
        PMU_PMEVCNTR(13),
        PMU_PMEVCNTR(14),
        PMU_PMEVCNTR(15),
        PMU_PMEVCNTR(16),
        PMU_PMEVCNTR(17),
        PMU_PMEVCNTR(18),
        PMU_PMEVCNTR(19),
        PMU_PMEVCNTR(20),
        PMU_PMEVCNTR(21),
        PMU_PMEVCNTR(22),
        PMU_PMEVCNTR(23),
        PMU_PMEVCNTR(24),
        PMU_PMEVCNTR(25),
        PMU_PMEVCNTR(26),
        PMU_PMEVCNTR(27),
        PMU_PMEVCNTR(28),
        PMU_PMEVCNTR(29),
        PMU_PMEVCNTR(30),
        /* PMEVTYPERn */
        PMU_PMEVTYPER(0),
        PMU_PMEVTYPER(1),
        PMU_PMEVTYPER(2),
        PMU_PMEVTYPER(3),
        PMU_PMEVTYPER(4),
        PMU_PMEVTYPER(5),
        PMU_PMEVTYPER(6),
        PMU_PMEVTYPER(7),
        PMU_PMEVTYPER(8),
        PMU_PMEVTYPER(9),
        PMU_PMEVTYPER(10),
        PMU_PMEVTYPER(11),
        PMU_PMEVTYPER(12),
        PMU_PMEVTYPER(13),
        PMU_PMEVTYPER(14),
        PMU_PMEVTYPER(15),
        PMU_PMEVTYPER(16),
        PMU_PMEVTYPER(17),
        PMU_PMEVTYPER(18),
        PMU_PMEVTYPER(19),
        PMU_PMEVTYPER(20),
        PMU_PMEVTYPER(21),
        PMU_PMEVTYPER(22),
        PMU_PMEVTYPER(23),
        PMU_PMEVTYPER(24),
        PMU_PMEVTYPER(25),
        PMU_PMEVTYPER(26),
        PMU_PMEVTYPER(27),
        PMU_PMEVTYPER(28),
        PMU_PMEVTYPER(29),
        PMU_PMEVTYPER(30),
        /* PMCCFILTR */
        { CP15_PMU_SYS_REG(DIRECT, 0, 14, 15, 7), .access = access_pmu_evtyper },

        { Op1(1), CRn( 0), CRm( 0), Op2(0), access_ccsidr },
        { Op1(1), CRn( 0), CRm( 0), Op2(1), access_clidr },

        /* CCSIDR2 */
        { Op1(1), CRn( 0), CRm( 0),  Op2(2), undef_access },

        { Op1(2), CRn( 0), CRm( 0), Op2(0), access_csselr, NULL, CSSELR_EL1 },
};

/*
 * Trapped cp15 64bit registers: the 64bit views of TTBR0/TTBR1, a PMU
 * event counter access, the GIC SGI generation registers and the
 * architected timer/counter registers.
 */
static const struct sys_reg_desc cp15_64_regs[] = {
        { Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR0_EL1 },
        { CP15_PMU_SYS_REG(DIRECT, 0, 0, 9, 0), .access = access_pmu_evcntr },
        { Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI1R */
        { SYS_DESC(SYS_AARCH32_CNTPCT),              access_arch_timer },
        { Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, TTBR1_EL1 },
        { Op1( 1), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_ASGI1R */
        { SYS_DESC(SYS_AARCH32_CNTVCT),              access_arch_timer },
        { Op1( 2), CRn( 0), CRm(12), Op2( 0), access_gic_sgi }, /* ICC_SGI0R */
        { SYS_DESC(SYS_AARCH32_CNTP_CVAL),    access_arch_timer },
        { SYS_DESC(SYS_AARCH32_CNTPCTSS),     access_arch_timer },
        { SYS_DESC(SYS_AARCH32_CNTVCTSS),     access_arch_timer },
};

/*
 * check_sysreg_table -- sanity-check a table of sys_reg descriptors
 *
 * @table: descriptors to check
 * @n: number of entries in @table
 * @is_32: when true, the "lacks reset" check is skipped
 *
 * Two properties are verified, and the first violation is reported with
 * kvm_err():
 *
 *  - unless @is_32 is set, an entry with a non-zero .reg must also have a
 *    .reset callback;
 *  - each entry must compare strictly greater than its predecessor
 *    according to cmp_sys_reg(), i.e. the table is sorted and free of
 *    duplicates.
 *
 * Return true if the whole table is consistent, false otherwise.
 */
static bool check_sysreg_table(const struct sys_reg_desc *table, unsigned int n,
                               bool is_32)
{
        unsigned int i;

        for (i = 0; i < n; i++) {
                /* The index is unsigned, hence %u in both messages */
                if (!is_32 && table[i].reg && !table[i].reset) {
                        kvm_err("sys_reg table %pS entry %u (%s) lacks reset\n",
                                &table[i], i, table[i].name);
                        return false;
                }

                /* Nothing to compare the very first entry against */
                if (i && cmp_sys_reg(&table[i-1], &table[i]) >= 0) {
                        kvm_err("sys_reg table %pS entry %u (%s -> %s) out of order\n",
                                &table[i], i, table[i - 1].name, table[i].name);
                        return false;
                }
        }

        return true;
}

/*
 * kvm_handle_cp14_load_store -- handle a trapped cp14 load/store access
 * @vcpu: The VCPU pointer
 *
 * No emulation is attempted: an undefined exception is injected into the
 * guest, and 1 is always returned.
 */
int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu)
{
        kvm_inject_undefined(vcpu);
        return 1;
}

static void perform_access(struct kvm_vcpu *vcpu,
                           struct sys_reg_params *params,
                           const struct sys_reg_desc *r)
{
        trace_kvm_sys_access(*vcpu_pc(vcpu), params, r);

        /* Check for regs disabled by runtime config */
        if (sysreg_hidden(vcpu, r)) {
                kvm_inject_undefined(vcpu);
                return;
        }

        /*
         * Not having an accessor means that we have configured a trap
         * that we don't know how to handle. This certainly qualifies
         * as a gross bug that should be fixed right away.
         */
        BUG_ON(!r->access);

        /* Skip instruction if instructed so */
        if (likely(r->access(vcpu, params, r)))
                kvm_incr_pc(vcpu);
}

/*
 * emulate_cp --  look a sys_reg access up in a handling table and, on a
 *                match, run the corresponding trap handler.
 *
 * @vcpu: The VCPU pointer
 * @params: pointer to the descriptor of the access
 * @table: array of trap descriptors (may be NULL)
 * @num: size of the trap descriptor array
 *
 * Return true if the access has been handled, false if not (no table, or
 * no matching entry).
 */
static bool emulate_cp(struct kvm_vcpu *vcpu,
                       struct sys_reg_params *params,
                       const struct sys_reg_desc *table,
                       size_t num)
{
        const struct sys_reg_desc *desc;

        /* Without a table there is nothing to match against */
        if (!table)
                return false;

        desc = find_reg(params, table, num);
        if (!desc)
                return false;        /* Not handled */

        perform_access(vcpu, params, desc);
        return true;
}

/*
 * unhandled_cp_access - report and reject an unsupported coprocessor access
 * @vcpu:   vCPU that performed the access
 * @params: decoded access parameters, used for the log message
 *
 * The coprocessor number is derived from the exception class (15 for the
 * CP15 classes, 14 for the CP14 ones, -1 plus a WARN for anything else).
 * The access is logged and an UNDEFINED exception is injected.
 */
static void unhandled_cp_access(struct kvm_vcpu *vcpu,
                                struct sys_reg_params *params)
{
        u8 esr_ec = kvm_vcpu_trap_get_class(vcpu);
        int cp;

        if (esr_ec == ESR_ELx_EC_CP15_32 || esr_ec == ESR_ELx_EC_CP15_64) {
                cp = 15;
        } else if (esr_ec == ESR_ELx_EC_CP14_MR || esr_ec == ESR_ELx_EC_CP14_64) {
                cp = 14;
        } else {
                /* Not a coprocessor trap class: we should never get here */
                cp = -1;
                WARN_ON(1);
        }

        print_sys_reg_msg(params,
                          "Unsupported guest CP%d access at: %08lx [%08lx]\n",
                          cp, *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
        kvm_inject_undefined(vcpu);
}

/**
 * kvm_handle_cp_64 -- handles a mrrc/mcrr trap on a guest CP14/CP15 access
 * @vcpu: The VCPU pointer
 * @global: &struct sys_reg_desc table to search
 * @nr_global: number of entries in the @global array
 *
 * The access is decoded from the ESR, emulated through @global if a
 * matching descriptor exists, and rejected via unhandled_cp_access()
 * otherwise. Always returns 1.
 */
static int kvm_handle_cp_64(struct kvm_vcpu *vcpu,
                            const struct sys_reg_desc *global,
                            size_t nr_global)
{
        struct sys_reg_params params;
        u64 esr = kvm_vcpu_get_esr(vcpu);
        int Rt = kvm_vcpu_sys_get_rt(vcpu);
        int Rt2 = (esr >> 10) & 0x1f;

        /* Only Op1 and CRm are taken from the ESR, the rest is zero */
        params.Op0 = 0;
        params.Op1 = (esr >> 16) & 0xf;
        params.CRn = 0;
        params.CRm = (esr >> 1) & 0xf;
        params.Op2 = 0;
        params.is_write = ((esr & 1) == 0);

        /*
         * Build a 64-bit value from Rt (low half) and Rt2 (high half).
         * The trap backends are shared between AArch32 and AArch64,
         * which is what lets us get away with this.
         */
        if (params.is_write) {
                params.regval = vcpu_get_reg(vcpu, Rt) & 0xffffffff;
                params.regval |= vcpu_get_reg(vcpu, Rt2) << 32;
        }

        if (!emulate_cp(vcpu, &params, global, nr_global)) {
                unhandled_cp_access(vcpu, &params);
                return 1;
        }

        /* For a read, split the result back across the two registers */
        if (!params.is_write) {
                vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval));
                vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval));
        }

        return 1;
}

/*
 * Forward declaration: emulate_sys_reg() is defined further down in this
 * file, but the AArch32 ID register helpers that follow already call it.
 */
static bool emulate_sys_reg(struct kvm_vcpu *vcpu, struct sys_reg_params *params);

/*
 * The CP10 ID registers are architecturally mapped to AArch64 feature
 * registers. Abuse that fact so that accesses from AArch32 can be served by
 * the AArch64 handler.
 *
 * @esr: syndrome of the trapped access
 * @params: filled in with the equivalent AArch64 encoding (Op0=3, Op1=0,
 *          CRn=0, CRm=3, Op2 selecting MVFR0/1/2) and the access direction
 *
 * Return true for a read of MVFR0, MVFR1 or MVFR2. Writes and any other
 * register are logged as unimplemented and false is returned.
 */
static bool kvm_esr_cp10_id_to_sys64(u64 esr, struct sys_reg_params *params)
{
        u8 reg_id = (esr >> 10) & 0xf;
        bool known = true;

        params->is_write = ((esr & 1) == 0);
        params->Op0 = 3;
        params->Op1 = 0;
        params->CRn = 0;
        params->CRm = 3;

        switch (reg_id) {
        case 0b0111:        /* MVFR0 */
                params->Op2 = 0;
                break;
        case 0b0110:        /* MVFR1 */
                params->Op2 = 1;
                break;
        case 0b0101:        /* MVFR2 */
                params->Op2 = 2;
                break;
        default:
                known = false;
        }

        /* CP10 ID registers are read-only */
        if (known && !params->is_write)
                return true;

        kvm_pr_unimpl("Unhandled cp10 register %s: %u\n",
                      params->is_write ? "write" : "read", reg_id);
        return false;
}

/**
 * kvm_handle_cp10_id() - Handles a VMRS trap on guest access to a 'Media and
 *                          VFP Register' from AArch32.
 * @vcpu: The vCPU pointer
 *
 * MVFR{0-2} are architecturally mapped to the AArch64 MVFR{0-2}_EL1 registers.
 * The access is translated to the matching AArch64 system register encoding
 * and handed to the AArch64 system register emulation. Anything that cannot
 * be translated results in an UNDEFINED exception.
 *
 * Always returns 1.
 */
int kvm_handle_cp10_id(struct kvm_vcpu *vcpu)
{
        int Rt = kvm_vcpu_sys_get_rt(vcpu);
        u64 esr = kvm_vcpu_get_esr(vcpu);
        struct sys_reg_params params;

        if (kvm_esr_cp10_id_to_sys64(esr, &params)) {
                /* Only reads get this far, so hand the value back in Rt */
                if (emulate_sys_reg(vcpu, &params))
                        vcpu_set_reg(vcpu, Rt, params.regval);
        } else {
                /* UNDEF on any unhandled register access */
                kvm_inject_undefined(vcpu);
        }

        return 1;
}

/**
 * kvm_emulate_cp15_id_reg() - Handles an MRC trap on a guest CP15 access where
 *                               CRn=0, which corresponds to the AArch32 feature
 *                               registers.
 * @vcpu: the vCPU pointer
 * @params: the system register access parameters.
 *
 * The cp15 system register tables do not enumerate the AArch32 feature
 * registers, but the AArch64 table does. For these registers the AArch32
 * encoding maps trivially onto the AArch64 one: set op0=3 and keep op1, CRn,
 * CRm and op2 as they are.
 *
 * According to DDI0487G.b G7.3.1, paragraph "Behavior of VMSAv8-32 32-bit
 * System registers with (coproc=0b1111, CRn==c0)", read accesses from this
 * range are either UNKNOWN or RES0. Rerouting remains architectural as we
 * treat undefined registers in this range as RAZ.
 *
 * Always returns 1.
 */
static int kvm_emulate_cp15_id_reg(struct kvm_vcpu *vcpu,
                                   struct sys_reg_params *params)
{
        int Rt = kvm_vcpu_sys_get_rt(vcpu);

        /* Treat impossible writes to RO registers as UNDEFINED */
        if (params->is_write) {
                unhandled_cp_access(vcpu, params);
                return 1;
        }

        params->Op0 = 3;

        if (params->CRm <= 3) {
                /* Nothing to write back if the emulation itself failed */
                if (!emulate_sys_reg(vcpu, params))
                        return 1;
        } else {
                /*
                 * All registers where CRm > 3 are known to be UNKNOWN/RAZ
                 * from AArch32. Avoid conflicting with future expansion of
                 * the AArch64 feature registers and simply read as zero.
                 */
                params->regval = 0;
        }

        vcpu_set_reg(vcpu, Rt, params->regval);
        return 1;
}

/**
 * kvm_handle_cp_32 -- handles a mrc/mcr trap on a guest CP14/CP15 access
 * @vcpu: The VCPU pointer
 * @params: &struct sys_reg_params, already decoded by the caller
 * @global: &struct sys_reg_desc table to search
 * @nr_global: number of entries in the @global array
 *
 * Rt is loaded into @params->regval before the emulation is attempted. On a
 * successfully emulated read, the result is written back to Rt. Accesses
 * without a descriptor go through unhandled_cp_access(). Always returns 1.
 */
static int kvm_handle_cp_32(struct kvm_vcpu *vcpu,
                            struct sys_reg_params *params,
                            const struct sys_reg_desc *global,
                            size_t nr_global)
{
        int Rt = kvm_vcpu_sys_get_rt(vcpu);

        params->regval = vcpu_get_reg(vcpu, Rt);

        if (!emulate_cp(vcpu, params, global, nr_global)) {
                unhandled_cp_access(vcpu, params);
                return 1;
        }

        if (!params->is_write)
                vcpu_set_reg(vcpu, Rt, params->regval);

        return 1;
}

/*
 * Entry point for a trapped mrrc/mcrr access to CP15: forwards to
 * kvm_handle_cp_64() with the cp15_64_regs descriptor table.
 */
int kvm_handle_cp15_64(struct kvm_vcpu *vcpu)
{
        return kvm_handle_cp_64(vcpu, cp15_64_regs, ARRAY_SIZE(cp15_64_regs));
}

int kvm_handle_cp15_32(struct kvm_vcpu *vcpu)
{
        struct sys_reg_params params;

        params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu));

        /*
         * Certain AArch32 ID registers are handled by rerouting to the AArch64
         * system register table. Registers in the ID range where CRm=0 are
         * excluded from this scheme as they do not trivially map into AArch64
         * system register encodings, except for AIDR/REVIDR.
         */
        if (params.Op1 == 0 && params.CRn == 0 &&
            (params.CRm || params.Op2 == 6 /* REVIDR */))
                return kvm_emulate_cp15_id_reg(vcpu, &params);
        if (params.Op1 == 1 && params.CRn == 0 &&
            params.CRm == 0 && params.Op2 == 7 /* AIDR */)
                return kvm_emulate_cp15_id_reg(vcpu, &params);

        return kvm_handle_cp_32(vcpu, &params, cp15_regs, ARRAY_SIZE(cp15_regs));
}

/*
 * Entry point for a trapped mrrc/mcrr access to CP14: forwards to
 * kvm_handle_cp_64() with the cp14_64_regs descriptor table.
 */
int kvm_handle_cp14_64(struct kvm_vcpu *vcpu)
{
        return kvm_handle_cp_64(vcpu, cp14_64_regs, ARRAY_SIZE(cp14_64_regs));
}

int kvm_handle_cp14_32(struct kvm_vcpu *vcpu)
{
        struct sys_reg_params params;

        params = esr_cp1x_32_to_params(kvm_vcpu_get_esr(vcpu));

        return kvm_handle_cp_32(vcpu, &params, cp14_regs, ARRAY_SIZE(cp14_regs));
}

/**
 * emulate_sys_reg - Emulate a guest access to an AArch64 system register
 * @vcpu: The VCPU pointer
 * @params: Decoded system register parameters
 *
 * The access is looked up in sys_reg_descs. Without a matching descriptor,
 * the access is logged and an UNDEFINED exception is injected.
 *
 * Return: true if a descriptor was found and the access was dispatched,
 * false otherwise.
 */
static bool emulate_sys_reg(struct kvm_vcpu *vcpu,
                            struct sys_reg_params *params)
{
        const struct sys_reg_desc *desc;

        desc = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
        if (unlikely(!desc)) {
                print_sys_reg_msg(params,
                                  "Unsupported guest sys_reg access at: %lx [%08lx]\n",
                                  *vcpu_pc(vcpu), *vcpu_cpsr(vcpu));
                kvm_inject_undefined(vcpu);
                return false;
        }

        perform_access(vcpu, params, desc);
        return true;
}

/*
 * idregs_debug_find - find the @pos'th VM-wide feature ID register
 * @kvm: the VM (not used by the lookup itself)
 * @pos: zero-based position among the sys_reg_descs entries for which
 *       is_vm_ftr_id_reg() is true
 *
 * Return: the matching descriptor, or NULL if there are not that many.
 */
static const struct sys_reg_desc *idregs_debug_find(struct kvm *kvm, u8 pos)
{
        unsigned long idx, nr_seen = 0;

        for (idx = 0; idx < ARRAY_SIZE(sys_reg_descs); idx++) {
                const struct sys_reg_desc *desc = &sys_reg_descs[idx];

                if (is_vm_ftr_id_reg(reg_to_encoding(desc))) {
                        /* Count the match only after comparing against @pos */
                        if (nr_seen++ == pos)
                                return desc;
                }
        }

        return NULL;
}

/*
 * seq_file ->start() callback for the "idregs" debugfs file.
 *
 * Under the config_lock, the VM-wide iterator is claimed if the ID registers
 * have been initialised and the iterator holds its idle value (all ones).
 * It is then set to *@pos.
 *
 * Return: a pointer to the iterator, NULL if *@pos is past the last ID
 * register, or ERR_PTR(-EBUSY) if the iterator could not be claimed.
 */
static void *idregs_debug_start(struct seq_file *s, loff_t *pos)
{
        struct kvm *kvm = s->private;
        u8 *iter;

        mutex_lock(&kvm->arch.config_lock);

        iter = &kvm->arch.idreg_debugfs_iter;
        if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags) ||
            *iter != (u8)~0) {
                iter = ERR_PTR(-EBUSY);
        } else {
                *iter = *pos;
                if (!idregs_debug_find(kvm, *iter))
                        iter = NULL;
        }

        mutex_unlock(&kvm->arch.config_lock);

        return iter;
}

/*
 * seq_file ->next() callback for the "idregs" debugfs file.
 *
 * *@pos is always advanced. The VM-wide iterator only moves forward when
 * there is another ID register to show.
 *
 * Return: a pointer to the iterator, or NULL once the last ID register has
 * been reached.
 */
static void *idregs_debug_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct kvm *kvm = s->private;

        (*pos)++;

        if (!idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter + 1))
                return NULL;

        kvm->arch.idreg_debugfs_iter++;

        return &kvm->arch.idreg_debugfs_iter;
}

/*
 * seq_file ->stop() callback for the "idregs" debugfs file.
 *
 * Unless @v is an error pointer (in which case ->start() did not claim the
 * iterator), put the VM-wide iterator back to its idle, all-ones value
 * while holding the config_lock.
 */
static void idregs_debug_stop(struct seq_file *s, void *v)
{
        struct kvm *kvm = s->private;

        if (!IS_ERR(v)) {
                mutex_lock(&kvm->arch.config_lock);

                kvm->arch.idreg_debugfs_iter = ~0;

                mutex_unlock(&kvm->arch.config_lock);
        }
}

/*
 * seq_file ->show() callback for the "idregs" debugfs file.
 *
 * Prints the name and the VM-wide value of the ID register designated by
 * the iterator. Descriptors without a name are silently skipped.
 *
 * Always returns 0.
 */
static int idregs_debug_show(struct seq_file *s, void *v)
{
        struct kvm *kvm = s->private;
        const struct sys_reg_desc *desc;

        desc = idregs_debug_find(kvm, kvm->arch.idreg_debugfs_iter);

        if (desc->name)
                seq_printf(s, "%20s:\t%016llx\n",
                           desc->name,
                           kvm_read_vm_id_reg(kvm, reg_to_encoding(desc)));

        return 0;
}

/* seq_file iterator callbacks backing the "idregs" debugfs file */
static const struct seq_operations idregs_debug_sops = {
        .start        = idregs_debug_start,
        .next        = idregs_debug_next,
        .stop        = idregs_debug_stop,
        .show        = idregs_debug_show,
};

/* Provides idregs_debug_fops, used by kvm_sys_regs_create_debugfs() */
DEFINE_SEQ_ATTRIBUTE(idregs_debug);

/*
 * Create the read-only (0444) "idregs" file in the VM's debugfs directory.
 * The iterator is first set to all ones, which is the value that
 * idregs_debug_start() requires before it will claim the iterator.
 */
void kvm_sys_regs_create_debugfs(struct kvm *kvm)
{
        kvm->arch.idreg_debugfs_iter = ~0;

        debugfs_create_file("idregs", 0444, kvm->debugfs_dentry, kvm,
                            &idregs_debug_fops);
}

/*
 * Reset a VM-wide feature ID register: store the value returned by the
 * descriptor's ->reset() callback, unless the VM's ID registers have
 * already been flagged as initialised.
 */
static void reset_vm_ftr_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *reg)
{
        struct kvm *kvm = vcpu->kvm;
        u32 encoding = reg_to_encoding(reg);

        if (!test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags))
                kvm_set_vm_id_reg(kvm, encoding, reg->reset(vcpu, reg));
}

/*
 * Reset a per-vCPU feature ID register: the descriptor's ->reset() callback
 * only runs if the vCPU has not been initialised yet.
 */
static void reset_vcpu_ftr_id_reg(struct kvm_vcpu *vcpu,
                                  const struct sys_reg_desc *reg)
{
        if (!kvm_vcpu_initialized(vcpu))
                reg->reset(vcpu, reg);
}

/**
 * kvm_reset_sys_regs - sets system registers to reset value
 * @vcpu: The VCPU pointer
 *
 * This function finds the right table above and sets the registers on the
 * virtual CPU struct to their architecturally defined reset values.
 */
void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long i;

        for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
                const struct sys_reg_desc *r = &sys_reg_descs[i];

                if (!r->reset)
                        continue;

                if (is_vm_ftr_id_reg(reg_to_encoding(r)))
                        reset_vm_ftr_id_reg(vcpu, r);
                else if (is_vcpu_ftr_id_reg(reg_to_encoding(r)))
                        reset_vcpu_ftr_id_reg(vcpu, r);
                else
                        r->reset(vcpu, r);

                if (r->reg >= __SANITISED_REG_START__ && r->reg < NR_SYS_REGS)
                        (void)__vcpu_sys_reg(vcpu, r->reg);
        }

        set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags);

        if (kvm_vcpu_has_pmu(vcpu))
                kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
}

/**
 * kvm_handle_sys_reg -- handles a system instruction or mrs/msr instruction
 *                         trap on a guest execution
 * @vcpu: The VCPU pointer
 *
 * If triage_sysreg_trap() reports the trap as dealt with, nothing else is
 * done. Otherwise the descriptor index it produced selects an entry of
 * either sys_reg_descs or sys_insn_descs, whose handler is then run.
 *
 * Always returns 1.
 */
int kvm_handle_sys_reg(struct kvm_vcpu *vcpu)
{
        const struct sys_reg_desc *desc = NULL;
        struct sys_reg_params params;
        unsigned long esr = kvm_vcpu_get_esr(vcpu);
        int Rt = kvm_vcpu_sys_get_rt(vcpu);
        int sr_idx;

        trace_kvm_handle_sys_reg(esr);

        if (triage_sysreg_trap(vcpu, &sr_idx))
                return 1;

        params = esr_sys64_to_params(esr);
        params.regval = vcpu_get_reg(vcpu, Rt);

        /* System registers have Op0=={2,3}, as per DDI487 J.a C5.1.2 */
        desc = (params.Op0 == 2 || params.Op0 == 3) ? &sys_reg_descs[sr_idx]
                                                    : &sys_insn_descs[sr_idx];

        perform_access(vcpu, &params, desc);

        /* Only a read from a system register has a result to hand back */
        if (!params.is_write) {
                if (params.Op0 == 2 || params.Op0 == 3)
                        vcpu_set_reg(vcpu, Rt, params.regval);
        }

        return 1;
}

/******************************************************************************
 * Userspace API
 *****************************************************************************/

/*
 * index_to_params - decode a userspace register index into its encoding
 * @id:     register index
 * @params: receives Op0, Op1, CRn, CRm and Op2 on success
 *
 * Only 64-bit wide indices with no bit set outside of the architecture,
 * size, coprocessor and encoding fields are accepted.
 *
 * Return: true if @id was decoded, false if it is not a valid index.
 */
static bool index_to_params(u64 id, struct sys_reg_params *params)
{
        const u64 known_bits = KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK
                               | KVM_REG_ARM_COPROC_MASK
                               | KVM_REG_ARM64_SYSREG_OP0_MASK
                               | KVM_REG_ARM64_SYSREG_OP1_MASK
                               | KVM_REG_ARM64_SYSREG_CRN_MASK
                               | KVM_REG_ARM64_SYSREG_CRM_MASK
                               | KVM_REG_ARM64_SYSREG_OP2_MASK;

        if ((id & KVM_REG_SIZE_MASK) != KVM_REG_SIZE_U64)
                return false;

        /* Any unused index bits means it's not valid. */
        if (id & ~known_bits)
                return false;

        params->Op0 = (id & KVM_REG_ARM64_SYSREG_OP0_MASK) >>
                      KVM_REG_ARM64_SYSREG_OP0_SHIFT;
        params->Op1 = (id & KVM_REG_ARM64_SYSREG_OP1_MASK) >>
                      KVM_REG_ARM64_SYSREG_OP1_SHIFT;
        params->CRn = (id & KVM_REG_ARM64_SYSREG_CRN_MASK) >>
                      KVM_REG_ARM64_SYSREG_CRN_SHIFT;
        params->CRm = (id & KVM_REG_ARM64_SYSREG_CRM_MASK) >>
                      KVM_REG_ARM64_SYSREG_CRM_SHIFT;
        params->Op2 = (id & KVM_REG_ARM64_SYSREG_OP2_MASK) >>
                      KVM_REG_ARM64_SYSREG_OP2_SHIFT;

        return true;
}

/*
 * get_reg_by_id - find the descriptor matching a userspace register index
 * @id:    register index
 * @table: descriptor table to search
 * @num:   number of entries in @table
 *
 * Return: the matching descriptor, or NULL if @id cannot be decoded or has
 * no entry in @table.
 */
const struct sys_reg_desc *get_reg_by_id(u64 id,
                                         const struct sys_reg_desc table[],
                                         unsigned int num)
{
        struct sys_reg_params params;

        return index_to_params(id, &params) ? find_reg(&params, table, num)
                                            : NULL;
}

/*
 * Decode an index value, and find the sys_reg_desc entry.
 *
 * NULL is returned if @id is not a sys_reg index, if no descriptor matches,
 * if the register is neither saved in the sys_reg array nor readable via a
 * ->get_user() accessor, or if it is hidden from @vcpu.
 */
static const struct sys_reg_desc *
id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id,
                   const struct sys_reg_desc table[], unsigned int num)

{
        const struct sys_reg_desc *desc;

        /* We only do sys_reg for now. */
        if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG)
                return NULL;

        desc = get_reg_by_id(id, table, num);
        if (!desc)
                return NULL;

        /* Not saved in the sys_reg array and not otherwise accessible? */
        if (!desc->reg && !desc->get_user)
                return NULL;

        if (sysreg_hidden(vcpu, desc))
                return NULL;

        return desc;
}

/*
 * demux_c15_get - read a demultiplexed register on behalf of userspace
 * @vcpu:  the vCPU
 * @id:    register index
 * @uaddr: userspace buffer receiving the 32-bit value
 *
 * CCSIDR is the only demux ID handled here.
 *
 * Return: the result of put_user() on success, -ENOENT if @id has unknown
 * bits set, is not a 4-byte CCSIDR index, or selects a value that is not
 * below CSSELR_MAX.
 */
static int demux_c15_get(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
        u32 __user *uval = uaddr;
        u32 csselr;

        /* Fail if we have unknown bits set. */
        if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK
                   | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1)))
                return -ENOENT;

        if ((id & KVM_REG_ARM_DEMUX_ID_MASK) != KVM_REG_ARM_DEMUX_ID_CCSIDR)
                return -ENOENT;

        if (KVM_REG_SIZE(id) != 4)
                return -ENOENT;

        csselr = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT;
        if (csselr >= CSSELR_MAX)
                return -ENOENT;

        return put_user(get_ccsidr(vcpu, csselr), uval);
}

/*
 * demux_c15_set - write a demultiplexed register on behalf of userspace
 * @vcpu:  the vCPU
 * @id:    register index
 * @uaddr: userspace buffer holding the new 32-bit value
 *
 * CCSIDR is the only demux ID handled here.
 *
 * Return: the result of set_ccsidr() on success, -ENOENT if @id has unknown
 * bits set, is not a 4-byte CCSIDR index, or selects a value that is not
 * below CSSELR_MAX, and -EFAULT if the new value cannot be read from
 * userspace.
 */
static int demux_c15_set(struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
{
        u32 __user *uval = uaddr;
        u32 csselr, newval;

        /* Fail if we have unknown bits set. */
        if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK
                   | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1)))
                return -ENOENT;

        if ((id & KVM_REG_ARM_DEMUX_ID_MASK) != KVM_REG_ARM_DEMUX_ID_CCSIDR)
                return -ENOENT;

        if (KVM_REG_SIZE(id) != 4)
                return -ENOENT;

        csselr = (id & KVM_REG_ARM_DEMUX_VAL_MASK) >> KVM_REG_ARM_DEMUX_VAL_SHIFT;
        if (csselr >= CSSELR_MAX)
                return -ENOENT;

        if (get_user(newval, uval))
                return -EFAULT;

        return set_ccsidr(vcpu, csselr, newval);
}

/*
 * kvm_sys_reg_get_user - copy a system register value to userspace
 * @vcpu:  the vCPU
 * @reg:   userspace request (register index and destination address)
 * @table: descriptor table to search
 * @num:   number of entries in @table
 *
 * The value comes from the descriptor's ->get_user() accessor when there is
 * one, and from the vCPU's sys_reg storage otherwise.
 *
 * Return: 0 on success, -ENOENT if the register is unknown or hidden, or
 * the error reported by ->get_user() or put_user().
 */
int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
                         const struct sys_reg_desc table[], unsigned int num)
{
        u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;
        const struct sys_reg_desc *desc;
        u64 val;
        int ret;

        desc = id_to_sys_reg_desc(vcpu, reg->id, table, num);
        if (!desc || sysreg_hidden(vcpu, desc))
                return -ENOENT;

        if (desc->get_user) {
                ret = (desc->get_user)(vcpu, desc, &val);
                if (ret)
                        return ret;
        } else {
                val = __vcpu_sys_reg(vcpu, desc->reg);
        }

        return put_user(val, uaddr);
}

/*
 * Read one register on behalf of userspace: demux indices are served by
 * demux_c15_get(), everything else is looked up in sys_reg_descs.
 */
int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        void __user *uaddr = (void __user *)(unsigned long)reg->addr;

        if ((reg->id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_DEMUX)
                return kvm_sys_reg_get_user(vcpu, reg, sys_reg_descs,
                                            ARRAY_SIZE(sys_reg_descs));

        return demux_c15_get(vcpu, reg->id, uaddr);
}

/*
 * kvm_sys_reg_set_user - set a system register from a userspace value
 * @vcpu:  the vCPU
 * @reg:   userspace request (register index and source address)
 * @table: descriptor table to search
 * @num:   number of entries in @table
 *
 * The new value is fetched from userspace before the register is looked up.
 * It is then passed to the descriptor's ->set_user() accessor when there is
 * one, and stored in the vCPU's sys_reg storage otherwise.
 *
 * Return: 0 on success or when writes to this register are ignored, -EFAULT
 * if the value cannot be read, -ENOENT if the register is unknown or hidden,
 * or the error reported by ->set_user().
 */
int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
                         const struct sys_reg_desc table[], unsigned int num)
{
        u64 __user *uaddr = (u64 __user *)(unsigned long)reg->addr;
        const struct sys_reg_desc *desc;
        u64 val;

        if (get_user(val, uaddr))
                return -EFAULT;

        desc = id_to_sys_reg_desc(vcpu, reg->id, table, num);
        if (!desc || sysreg_hidden(vcpu, desc))
                return -ENOENT;

        /* The write is accepted, but deliberately has no effect */
        if (sysreg_user_write_ignore(vcpu, desc))
                return 0;

        if (desc->set_user)
                return (desc->set_user)(vcpu, desc, val);

        __vcpu_sys_reg(vcpu, desc->reg) = val;
        return 0;
}

/*
 * Write one register on behalf of userspace: demux indices are served by
 * demux_c15_set(), everything else is looked up in sys_reg_descs.
 */
int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        void __user *uaddr = (void __user *)(unsigned long)reg->addr;

        if ((reg->id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM_DEMUX)
                return kvm_sys_reg_set_user(vcpu, reg, sys_reg_descs,
                                            ARRAY_SIZE(sys_reg_descs));

        return demux_c15_set(vcpu, reg->id, uaddr);
}

/*
 * Number of demux register indices exposed to userspace. This matches the
 * CSSELR_MAX entries emitted by write_demux_regids().
 */
static unsigned int num_demux_regs(void)
{
        return CSSELR_MAX;
}

/*
 * Write the CSSELR_MAX demux (CCSIDR) register indices to @uindices.
 *
 * Return: 0 on success, -EFAULT if userspace memory cannot be written.
 */
static int write_demux_regids(u64 __user *uindices)
{
        const u64 base = KVM_REG_ARM64 | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX |
                         KVM_REG_ARM_DEMUX_ID_CCSIDR;
        unsigned int i;

        for (i = 0; i < CSSELR_MAX; i++) {
                /* The selector value sits in the low bits of the index */
                if (put_user(base | i, uindices + i))
                        return -EFAULT;
        }

        return 0;
}

/*
 * Build the userspace index of a system register: the arm64 architecture,
 * 64-bit size and sysreg coprocessor markers, combined with the
 * descriptor's Op0/Op1/CRn/CRm/Op2 shifted into their respective fields.
 * This is the reverse of the decoding done by index_to_params().
 */
static u64 sys_reg_to_index(const struct sys_reg_desc *reg)
{
        return (KVM_REG_ARM64 | KVM_REG_SIZE_U64 |
                KVM_REG_ARM64_SYSREG |
                (reg->Op0 << KVM_REG_ARM64_SYSREG_OP0_SHIFT) |
                (reg->Op1 << KVM_REG_ARM64_SYSREG_OP1_SHIFT) |
                (reg->CRn << KVM_REG_ARM64_SYSREG_CRN_SHIFT) |
                (reg->CRm << KVM_REG_ARM64_SYSREG_CRM_SHIFT) |
                (reg->Op2 << KVM_REG_ARM64_SYSREG_OP2_SHIFT));
}

/*
 * Store the index of @reg at *@uind and advance *@uind to the next slot.
 * A NULL *@uind means the caller is only counting: nothing is written.
 *
 * Return: false if the write to userspace failed, true otherwise.
 */
static bool copy_reg_to_user(const struct sys_reg_desc *reg, u64 __user **uind)
{
        if (*uind) {
                if (put_user(sys_reg_to_index(reg), *uind))
                        return false;

                (*uind)++;
        }

        return true;
}

/*
 * walk_one_sys_reg - account for one descriptor during an index walk
 * @vcpu:  the vCPU
 * @rd:    descriptor being visited
 * @uind:  cursor into the userspace index array (see copy_reg_to_user())
 * @total: running count of registers exposed to userspace
 *
 * Return: 0 if the register was skipped or recorded, -EFAULT if its index
 * could not be written to userspace.
 */
static int walk_one_sys_reg(const struct kvm_vcpu *vcpu,
                            const struct sys_reg_desc *rd,
                            u64 __user **uind,
                            unsigned int *total)
{
        /*
         * Skip registers we trap but don't save and for which no custom
         * user accessor is provided, as well as hidden registers.
         */
        if ((!rd->reg && !rd->get_user) || sysreg_hidden(vcpu, rd))
                return 0;

        if (!copy_reg_to_user(rd, uind))
                return -EFAULT;

        (*total)++;
        return 0;
}

/*
 * Visit every entry of sys_reg_descs, writing the indices of the registers
 * exposed to userspace to @uind (or merely counting them if @uind is NULL).
 * Assumed ordered tables, see kvm_sys_reg_table_init.
 *
 * Return: the number of registers exposed, or a negative error code.
 */
static int walk_sys_regs(struct kvm_vcpu *vcpu, u64 __user *uind)
{
        unsigned int total = 0;
        unsigned long idx;
        int err;

        for (idx = 0; idx < ARRAY_SIZE(sys_reg_descs); idx++) {
                err = walk_one_sys_reg(vcpu, &sys_reg_descs[idx], &uind, &total);
                if (err)
                        return err;
        }

        return total;
}

/*
 * Number of register indices that kvm_arm_copy_sys_reg_indices() emits for
 * this vCPU: the demux registers plus the system registers counted by a
 * walk with a NULL destination (nothing is written to userspace).
 */
unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu)
{
        return num_demux_regs()
                + walk_sys_regs(vcpu, (u64 __user *)NULL);
}

/*
 * Fill @uindices with the indices of all the registers exposed to
 * userspace: the system registers first, the demux registers right after.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
        int nr_written = walk_sys_regs(vcpu, uindices);

        if (nr_written < 0)
                return nr_written;

        /* The demux indices follow the system register ones */
        return write_demux_regids(uindices + nr_written);
}

/*
 * Split the register encoding @r into its Op0/Op1/CRn/CRm/Op2 fields and
 * pass them to KVM_ARM_FEATURE_ID_RANGE_IDX(). The result is used as an
 * index into the writable-masks array handed back to userspace.
 */
#define KVM_ARM_FEATURE_ID_RANGE_INDEX(r)                        \
        KVM_ARM_FEATURE_ID_RANGE_IDX(sys_reg_Op0(r),                \
                sys_reg_Op1(r),                                        \
                sys_reg_CRn(r),                                        \
                sys_reg_CRm(r),                                        \
                sys_reg_Op2(r))

/*
 * kvm_vm_ioctl_get_reg_writable_masks - report the writable ID register bits
 * @kvm:   the VM
 * @range: userspace request (range selector, reserved fields, destination)
 *
 * The destination array is zeroed, then the ->val mask of every feature ID
 * register that has a ->set_user() accessor and a non-zero mask is stored
 * in its slot. AArch32 ID registers are left at zero when 32-bit EL0 is not
 * supported.
 *
 * Return: 0 on success, -EINVAL if the range selector or a reserved field
 * is non-zero, -EFAULT if userspace memory cannot be written.
 */
int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm, struct reg_mask_range *range)
{
        const void *zero_page = page_to_virt(ZERO_PAGE(0));
        u64 __user *masks = (u64 __user *)range->addr;

        /* Only feature id range is supported, reserved[13] must be zero. */
        if (range->range ||
            memcmp(range->reserved, zero_page, sizeof(range->reserved)))
                return -EINVAL;

        /* Wipe the whole thing first */
        if (clear_user(masks, KVM_ARM_FEATURE_ID_RANGE_SIZE * sizeof(__u64)))
                return -EFAULT;

        for (int i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
                const struct sys_reg_desc *desc = &sys_reg_descs[i];
                u32 encoding = reg_to_encoding(desc);
                u64 mask;

                /* Leave the slot at zero for anything that is not writable */
                if (!is_feature_id_reg(encoding) || !desc->set_user ||
                    !desc->val ||
                    (is_aa32_id_reg(encoding) && !kvm_supports_32bit_el0()))
                        continue;

                mask = desc->val;

                if (put_user(mask, (masks + KVM_ARM_FEATURE_ID_RANGE_INDEX(encoding))))
                        return -EFAULT;
        }

        return 0;
}

/*
 * Adjust the vCPU's HCR_EL2 shadow value (vcpu->arch.hcr_el2) according to
 * host capabilities and the VM's configuration. Bits are only ORed in, with
 * the exception of HCR_RW, which is cleared for a 32-bit EL1.
 */
static void vcpu_set_hcr(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        if (has_vhe() || has_hvhe())
                vcpu->arch.hcr_el2 |= HCR_E2H;
        if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) {
                /* route synchronous external abort exceptions to EL2 */
                vcpu->arch.hcr_el2 |= HCR_TEA;
                /* trap error record accesses */
                vcpu->arch.hcr_el2 |= HCR_TERR;
        }

        if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
                vcpu->arch.hcr_el2 |= HCR_FWB;

        /*
         * HCR_TID4 is only used when the host has the EVT capability, has
         * no mismatched cache type, and the VM's CTR_EL0 is identical to
         * the sanitised host value. Otherwise, fall back to HCR_TID2.
         */
        if (cpus_have_final_cap(ARM64_HAS_EVT) &&
            !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE) &&
            kvm_read_vm_id_reg(kvm, SYS_CTR_EL0) == read_sanitised_ftr_reg(SYS_CTR_EL0))
                vcpu->arch.hcr_el2 |= HCR_TID4;
        else
                vcpu->arch.hcr_el2 |= HCR_TID2;

        /* A 32-bit EL1 guest must not have HCR_RW set */
        if (vcpu_el1_is_32bit(vcpu))
                vcpu->arch.hcr_el2 &= ~HCR_RW;

        if (kvm_has_mte(vcpu->kvm))
                vcpu->arch.hcr_el2 |= HCR_ATA;

        /*
         * In the absence of FGT, we cannot independently trap TLBI
         * Range instructions. This isn't great, but trapping all
         * TLBIs would be far worse. Live with it...
         */
        if (!kvm_has_feat(kvm, ID_AA64ISAR0_EL1, TLB, OS))
                vcpu->arch.hcr_el2 |= HCR_TTLBOS;
}

/*
 * Compute the trap configuration of a vCPU while holding the config_lock.
 *
 * The per-vCPU HCR, ICH_HCR and HCRX values are set on every call. The
 * fine-grained UNDEF state is VM-wide and is only computed by the first
 * caller, which then sets KVM_ARCH_FLAG_FGU_INITIALIZED.
 */
void kvm_calculate_traps(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        mutex_lock(&kvm->arch.config_lock);

        vcpu_set_hcr(vcpu);
        vcpu_set_ich_hcr(vcpu);
        vcpu_set_hcrx(vcpu);

        if (!test_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags)) {
                compute_fgu(kvm, HFGRTR_GROUP);
                compute_fgu(kvm, HFGITR_GROUP);
                compute_fgu(kvm, HDFGRTR_GROUP);
                compute_fgu(kvm, HAFGRTR_GROUP);
                compute_fgu(kvm, HFGRTR2_GROUP);
                compute_fgu(kvm, HFGITR2_GROUP);
                compute_fgu(kvm, HDFGRTR2_GROUP);

                set_bit(KVM_ARCH_FLAG_FGU_INITIALIZED, &kvm->arch.flags);
        }

        mutex_unlock(&kvm->arch.config_lock);
}

/*
 * Apply the final adjustments to the ID registers that follow from
 * configuration living outside of the ID registers themselves, and perform
 * any initialisation that directly depends on these ID registers (such as
 * RES0/RES1 behaviours). Traps are not to be configured from here.
 *
 * This can be called once per CPU, so every change must be idempotent.
 *
 * Returns 0 on success, or the error reported by kvm_init_nv_sysregs().
 */
int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        guard(mutex)(&kvm->arch.config_lock);

        /* Without an in-kernel GICv3, hide the GIC field from the guest */
        if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) &&
              irqchip_in_kernel(kvm) &&
              kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) {
                kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK;
                kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK;
        }

        if (vcpu_has_nv(vcpu))
                return kvm_init_nv_sysregs(vcpu);

        return 0;
}

/*
 * One-off validation and setup of the system register tables.
 *
 * All six descriptor tables are checked, even after a failure, so that
 * every broken table gets reported. The implementation ID registers and the
 * NV trap configuration are then set up, and each sys_reg_descs and
 * sys_insn_descs entry is passed to populate_sysreg_config() until the
 * first error.
 *
 * Returns 0 on success, -EINVAL if a table is invalid, or the first error
 * reported by the trap configuration helpers.
 */
int __init kvm_sys_reg_table_init(void)
{
        bool valid = true;
        unsigned int i;
        int ret = 0;

        /* Make sure tables are unique and in order. */
        if (!check_sysreg_table(sys_reg_descs, ARRAY_SIZE(sys_reg_descs), false))
                valid = false;
        if (!check_sysreg_table(cp14_regs, ARRAY_SIZE(cp14_regs), true))
                valid = false;
        if (!check_sysreg_table(cp14_64_regs, ARRAY_SIZE(cp14_64_regs), true))
                valid = false;
        if (!check_sysreg_table(cp15_regs, ARRAY_SIZE(cp15_regs), true))
                valid = false;
        if (!check_sysreg_table(cp15_64_regs, ARRAY_SIZE(cp15_64_regs), true))
                valid = false;
        if (!check_sysreg_table(sys_insn_descs, ARRAY_SIZE(sys_insn_descs), false))
                valid = false;

        if (!valid)
                return -EINVAL;

        init_imp_id_regs();

        ret = populate_nv_trap_config();

        check_feature_map();

        /* Both loops are skipped entirely once an error has been recorded */
        for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) {
                if (ret)
                        break;
                ret = populate_sysreg_config(sys_reg_descs + i, i);
        }

        for (i = 0; i < ARRAY_SIZE(sys_insn_descs); i++) {
                if (ret)
                        break;
                ret = populate_sysreg_config(sys_insn_descs + i, i);
        }

        return ret;
}
































  166 






  166 









  162 

    3 




    3 
  163 



  165 

  166 



































































  165 



  166 














  166 







    2 



    2 


    2 

    2 





    1 

  162 






  127 
  124 







    1 

    1 














    3 



    3 




    1 

    1 






    1 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Debug and Guest Debug support
 *
 * Copyright (C) 2015 - Linaro Ltd
 * Authors: Alex Bennée <alex.bennee@linaro.org>
 *             Oliver Upton <oliver.upton@linux.dev>
 */

#include <linux/kvm_host.h>
#include <linux/hw_breakpoint.h>

#include <asm/debug-monitors.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>

/**
 * kvm_arm_setup_mdcr_el2 - compute and install the vcpu's MDCR_EL2 value
 *
 * @vcpu:        the vcpu pointer
 *
 * The resulting value always traps guest accesses to:
 *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
 *  - Debug ROM Address (MDCR_EL2_TDRA)
 *  - OS related registers (MDCR_EL2_TDOSA)
 *  - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB)
 *  - Self-hosted Trace Filter controls (MDCR_EL2_TTRF)
 *  - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB)
 *
 * MDCR_EL2_TDE is added while userspace debugs the guest, and MDCR_EL2_TDA
 * is added whenever the guest does not own the debug registers.
 */
static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu)
{
        /* nr_event_counters is host data: keep preemption off while using it */
        preempt_disable();

        /*
         * Only HPMN is seeded here, so MDCR_EL2_E2PB_MASK and
         * MDCR_EL2_E2TB_MASK stay clear and the guest has no access to the
         * profiling and trace buffers.
         */
        vcpu->arch.mdcr_el2 = FIELD_PREP(MDCR_EL2_HPMN,
                                         *host_data_ptr(nr_event_counters));

        /* Unconditional traps */
        vcpu->arch.mdcr_el2 |= MDCR_EL2_TPM | MDCR_EL2_TPMCR;
        vcpu->arch.mdcr_el2 |= MDCR_EL2_TPMS | MDCR_EL2_TTRF;
        vcpu->arch.mdcr_el2 |= MDCR_EL2_TDRA | MDCR_EL2_TDOSA;

        /* Userspace debugging: route all software debug exceptions to EL2 */
        if (vcpu->guest_debug)
                vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;

        /* Debug registers are trapped unless the guest owns them */
        if (!kvm_guest_owns_debug_regs(vcpu))
                vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;

        /* Already at EL2 with VHE: program the register right away */
        if (has_vhe())
                write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);

        preempt_enable();
}

/*
 * Probe ID_AA64DFR0_EL1 on the current CPU and record the debug-related
 * capabilities in the host data: the number of PMU event counters (only when
 * a PMU is reported), the breakpoint/watchpoint count fields, and - when not
 * running with VHE - the HAS_SPE, HAS_TRBE and EL1_TRACING_CONFIGURED flags.
 */
void kvm_init_host_debug_data(void)
{
        u64 dfr0 = read_sysreg(id_aa64dfr0_el1);

        /* PMCR_EL0.N is only sampled when the signed PMUVer field is > 0 */
        if (cpuid_feature_extract_signed_field(dfr0, ID_AA64DFR0_EL1_PMUVer_SHIFT) > 0)
                *host_data_ptr(nr_event_counters) = FIELD_GET(ARMV8_PMU_PMCR_N,
                                                              read_sysreg(pmcr_el0));

        /* Raw BRPs/WRPs field values from ID_AA64DFR0_EL1 */
        *host_data_ptr(debug_brps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, BRPs, dfr0);
        *host_data_ptr(debug_wrps) = SYS_FIELD_GET(ID_AA64DFR0_EL1, WRPs, dfr0);

        /* The remaining flags are only evaluated for non-VHE configurations */
        if (has_vhe())
                return;

        /* SPE: PMSVer is non-zero and PMBIDR_EL1.P is clear */
        if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_PMSVer_SHIFT) &&
            !(read_sysreg_s(SYS_PMBIDR_EL1) & PMBIDR_EL1_P))
                host_data_set_flag(HAS_SPE);

        if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceFilt_SHIFT)) {
                /* Force disable trace in protected mode in case of no TRBE */
                if (is_protected_kvm_enabled())
                        host_data_set_flag(EL1_TRACING_CONFIGURED);

                /* TRBE: TraceBuffer is non-zero and TRBIDR_EL1.P is clear */
                if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_EL1_TraceBuffer_SHIFT) &&
                    !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_P))
                        host_data_set_flag(HAS_TRBE);
        }
}

/*
 * Builds the 'external' MDSCR_EL1 value for the guest, i.e. the value used
 * when the host has taken over MDSCR_EL1. That happens when:
 *
 *  - Userspace is single-stepping the guest: MDSCR_EL1.SS is forced to 1.
 *
 *  - Userspace is using the breakpoint/watchpoint registers to debug the
 *    guest: MDSCR_EL1.MDE and MDSCR_EL1.KDE are forced to 1.
 *
 *  - The guest has enabled the OS Lock: KVM forces MDSCR_EL1.MDE to 0,
 *    masking all debug exceptions affected by the OS Lock.
 */
static void setup_external_mdscr(struct kvm_vcpu *vcpu)
{
        /*
         * The guest's own MDSCR_EL1 is the baseline, since it controls
         * several other features that are of no interest to the host.
         */
        u64 mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);

        /*
         * Drop every bit KVM may drive itself. Leaving MDE clear is also
         * what emulating the OS Lock requires.
         */
        mdscr &= ~(MDSCR_EL1_SS | MDSCR_EL1_MDE | MDSCR_EL1_KDE);

        /* Hardware breakpoints/watchpoints requested by userspace */
        if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
                mdscr |= MDSCR_EL1_MDE | MDSCR_EL1_KDE;

        /* Single-step requested by userspace */
        if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                mdscr |= MDSCR_EL1_SS;

        vcpu->arch.external_mdscr_el1 = mdscr;
}

/*
 * Decide who owns the debug registers for this vcpu load (host, guest, or
 * nobody), prepare the external MDSCR_EL1 and single-step state when the host
 * owns them, and then recompute MDCR_EL2 to match.
 */
void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu)
{
        u64 mdscr;

        /* Must be called before kvm_vcpu_load_vhe() */
        KVM_BUG_ON(vcpu_get_flag(vcpu, SYSREGS_ON_CPU), vcpu->kvm);

        /*
         * Determine which of the possible debug states we're in:
         *
         *  - VCPU_DEBUG_HOST_OWNED: KVM has taken ownership of the guest's
         *    breakpoint/watchpoint registers, or needs to use MDSCR_EL1 to do
         *    software step or emulate the effects of the OS Lock being enabled.
         *
         *  - VCPU_DEBUG_GUEST_OWNED: The guest has debug exceptions enabled, and
         *    the breakpoint/watchpoint registers need to be loaded eagerly.
         *
         *  - VCPU_DEBUG_FREE: Neither of the above apply, no breakpoint/watchpoint
         *    context needs to be loaded on the CPU.
         */
        if (vcpu->guest_debug || kvm_vcpu_os_lock_enabled(vcpu)) {
                vcpu->arch.debug_owner = VCPU_DEBUG_HOST_OWNED;
                setup_external_mdscr(vcpu);

                /*
                 * Steal the guest's single-step state machine if userspace
                 * wants to single-step the guest.
                 */
                if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
                        /*
                         * Remember the guest's own SS bit: the flag is set
                         * when the guest's PSTATE.SS is currently clear.
                         */
                        if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
                                vcpu_clear_flag(vcpu, GUEST_SS_ACTIVE_PENDING);
                        else
                                vcpu_set_flag(vcpu, GUEST_SS_ACTIVE_PENDING);

                        /*
                         * Install the host's step state: SS is set unless
                         * HOST_SS_ACTIVE_PENDING was recorded earlier.
                         */
                        if (!vcpu_get_flag(vcpu, HOST_SS_ACTIVE_PENDING))
                                *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
                        else
                                *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
                }
        } else {
                mdscr = vcpu_read_sys_reg(vcpu, MDSCR_EL1);

                /* Guest-enabled debug exceptions make the guest the owner */
                if (mdscr & (MDSCR_EL1_KDE | MDSCR_EL1_MDE))
                        vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED;
                else
                        vcpu->arch.debug_owner = VCPU_DEBUG_FREE;
        }

        /* The trap configuration depends on the owner chosen above */
        kvm_arm_setup_mdcr_el2(vcpu);
}

/*
 * Undo the single-step state swap when userspace is single-stepping the
 * guest; a no-op otherwise.
 */
void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu)
{
        if (likely(!(vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
                return;

        /*
         * Before potentially returning to userspace, stash the host's
         * software step state: the flag is set when PSTATE.SS is clear.
         */
        if (*vcpu_cpsr(vcpu) & DBG_SPSR_SS)
                vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING);
        else
                vcpu_set_flag(vcpu, HOST_SS_ACTIVE_PENDING);

        /* ...and put the guest's own SS bit back in place. */
        if (!vcpu_get_flag(vcpu, GUEST_SS_ACTIVE_PENDING))
                *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
        else
                *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
}

/*
 * Called after a trapped guest access to a breakpoint/watchpoint register to
 * hand the debug registers over to the guest. Host ownership has strictly
 * higher priority: if the host owns the registers nothing changes, and it is
 * the VMM's responsibility to emulate guest debug exceptions in that
 * configuration.
 */
void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu)
{
        if (!kvm_host_owns_debug_regs(vcpu)) {
                vcpu->arch.debug_owner = VCPU_DEBUG_GUEST_OWNED;
                /* Refresh MDCR_EL2 now that the owner has changed */
                kvm_arm_setup_mdcr_el2(vcpu);
        }
}

/*
 * Mirror the OSLK bit of a value written to OSLAR_EL1 into the vcpu's
 * OSLSR_EL1, then put and reload the vcpu on the current CPU.
 */
void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val)
{
        if (!(val & OSLAR_EL1_OSLK))
                __vcpu_sys_reg(vcpu, OSLSR_EL1) &= ~OSLSR_EL1_OSLK;
        else
                __vcpu_sys_reg(vcpu, OSLSR_EL1) |= OSLSR_EL1_OSLK;

        /*
         * NOTE(review): the put/load cycle presumably exists so the debug
         * state is recomputed with the new OS Lock value - confirm against
         * kvm_arch_vcpu_load(). Preemption stays off so smp_processor_id()
         * remains valid for the reload.
         */
        preempt_disable();
        kvm_arch_vcpu_put(vcpu);
        kvm_arch_vcpu_load(vcpu, smp_processor_id());
        preempt_enable();
}

/*
 * Set the TRBE_ENABLED host data flag. Does nothing with VHE or protected
 * KVM, and warns (once) and bails out if called from preemptible context.
 */
void kvm_enable_trbe(void)
{
        if (has_vhe())
                return;

        if (is_protected_kvm_enabled())
                return;

        if (WARN_ON_ONCE(preemptible()))
                return;

        host_data_set_flag(TRBE_ENABLED);
}
EXPORT_SYMBOL_GPL(kvm_enable_trbe);

/*
 * Clear the TRBE_ENABLED host data flag. Does nothing with VHE or protected
 * KVM, and warns (once) and bails out if called from preemptible context.
 */
void kvm_disable_trbe(void)
{
        if (has_vhe())
                return;

        if (is_protected_kvm_enabled())
                return;

        if (WARN_ON_ONCE(preemptible()))
                return;

        host_data_clear_flag(TRBE_ENABLED);
}
EXPORT_SYMBOL_GPL(kvm_disable_trbe);

/*
 * Record the TRFCR value to use while a guest is running.
 *
 * With VHE the value is written straight to TRFCR_EL12. Otherwise it is
 * stored in the host data, and EL1_TRACING_CONFIGURED is set only when it
 * differs from the current TRFCR_EL1. Nothing is done under protected KVM or
 * when called from preemptible context (the latter also warns once).
 */
void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest)
{
        if (is_protected_kvm_enabled())
                return;

        if (WARN_ON_ONCE(preemptible()))
                return;

        if (has_vhe()) {
                write_sysreg_s(trfcr_while_in_guest, SYS_TRFCR_EL12);
                return;
        }

        *host_data_ptr(trfcr_while_in_guest) = trfcr_while_in_guest;

        /* The flag is only raised when guest and host values differ */
        if (read_sysreg_s(SYS_TRFCR_EL1) == trfcr_while_in_guest)
                host_data_clear_flag(EL1_TRACING_CONFIGURED);
        else
                host_data_set_flag(EL1_TRACING_CONFIGURED);
}
EXPORT_SYMBOL_GPL(kvm_tracing_set_el1_configuration);




























































































































































































































































































































































































































































































































































































    7 





    7 
















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
/* SPDX-License-Identifier: GPL-2.0-only */
/* Authors: Karl MacMillan <kmacmillan@tresys.com>
 *            Frank Mayer <mayerf@tresys.com>
 *          Copyright (C) 2003 - 2004 Tresys Technology, LLC
 */

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

#include "security.h"
#include "conditional.h"
#include "services.h"

/*
 * cond_evaluate_expr - evaluate a conditional expression stored in reverse
 * polish notation.
 *
 * Returns the value left at the bottom of the evaluation stack (true (1) or
 * false (0) for a well-formed expression), or -1 (undefined) when the
 * expression is empty, would exceed COND_EXPR_MAXDEPTH stack entries, pops
 * more operands than are available, or contains an unknown node type.
 */
static int cond_evaluate_expr(struct policydb *p, struct cond_expr *expr)
{
        int stack[COND_EXPR_MAXDEPTH];
        int top = -1;
        u32 idx;

        if (!expr->len)
                return -1;

        for (idx = 0; idx < expr->len; idx++) {
                struct cond_expr_node *cur = &expr->nodes[idx];
                int rhs;

                /* Operand and unary operator are handled on the spot */
                switch (cur->expr_type) {
                case COND_BOOL:
                        if (top == (COND_EXPR_MAXDEPTH - 1))
                                return -1;
                        top++;
                        /* boolean values are 1-based */
                        stack[top] = p->bool_val_to_struct[cur->boolean - 1]->state;
                        continue;
                case COND_NOT:
                        if (top < 0)
                                return -1;
                        stack[top] = !stack[top];
                        continue;
                case COND_OR:
                case COND_AND:
                case COND_XOR:
                case COND_EQ:
                case COND_NEQ:
                        break;
                default:
                        return -1;
                }

                /* Binary operators: pop the right operand, fold into the left */
                if (top < 1)
                        return -1;
                rhs = stack[top];
                top--;

                switch (cur->expr_type) {
                case COND_OR:
                        stack[top] |= rhs;
                        break;
                case COND_AND:
                        stack[top] &= rhs;
                        break;
                case COND_XOR:
                        stack[top] ^= rhs;
                        break;
                case COND_EQ:
                        stack[top] = (stack[top] == rhs);
                        break;
                case COND_NEQ:
                        stack[top] = (stack[top] != rhs);
                        break;
                }
        }

        return stack[0];
}

/*
 * evaluate_cond_node - re-evaluate the expression of a struct cond_node and,
 * when the result differs from the node's current state, switch the rules of
 * its true and false lists on or off accordingly. An undefined result (-1)
 * disables every rule of the true list and of the false list for safety.
 */
static void evaluate_cond_node(struct policydb *p, struct cond_node *node)
{
        struct avtab_node *rule;
        int state;
        u32 idx;

        state = cond_evaluate_expr(p, &node->expr);
        if (state == node->cur_state)
                return;

        node->cur_state = state;
        if (state == -1)
                pr_err("SELinux: expression result was undefined - disabling all rules.\n");

        /* true list: enabled only for a strictly positive result */
        for (idx = 0; idx < node->true_list.len; idx++) {
                rule = node->true_list.nodes[idx];
                if (state > 0)
                        rule->key.specified |= AVTAB_ENABLED;
                else
                        rule->key.specified &= ~AVTAB_ENABLED;
        }

        /* false list: enabled only for 0; both -1 and 1 disable it */
        for (idx = 0; idx < node->false_list.len; idx++) {
                rule = node->false_list.nodes[idx];
                if (!state)
                        rule->key.specified |= AVTAB_ENABLED;
                else
                        rule->key.specified &= ~AVTAB_ENABLED;
        }
}

/* Re-evaluate every conditional node of the policy, in list order. */
void evaluate_cond_nodes(struct policydb *p)
{
        u32 idx = 0;

        while (idx < p->cond_list_len) {
                evaluate_cond_node(p, &p->cond_list[idx]);
                idx++;
        }
}

/*
 * Put the conditional part of a policydb into its empty state: no boolean
 * index, no conditional list, and an initialised conditional avtab.
 */
void cond_policydb_init(struct policydb *p)
{
        avtab_init(&p->te_cond_avtab);

        p->cond_list_len = 0;
        p->cond_list = NULL;
        p->bool_val_to_struct = NULL;
}

/*
 * Release the arrays owned by one conditional node. Only the pointer arrays
 * of the rule lists are freed here: the avtab nodes they point to are
 * destroyed by the avtab.
 */
static void cond_node_destroy(struct cond_node *node)
{
        kfree(node->false_list.nodes);
        kfree(node->true_list.nodes);
        kfree(node->expr.nodes);
}

/*
 * Destroy every node of the conditional list, free the list itself and leave
 * the policydb with an empty list.
 */
static void cond_list_destroy(struct policydb *p)
{
        u32 idx = 0;

        while (idx < p->cond_list_len) {
                cond_node_destroy(&p->cond_list[idx]);
                idx++;
        }

        kfree(p->cond_list);
        p->cond_list_len = 0;
        p->cond_list = NULL;
}

/*
 * Free everything the conditional part of a policydb owns: the conditional
 * list, the conditional avtab and the boolean index array.
 */
void cond_policydb_destroy(struct policydb *p)
{
        /* Only frees the per-node arrays; avtab nodes are not dereferenced */
        cond_list_destroy(p);
        avtab_destroy(&p->te_cond_avtab);
        kfree(p->bool_val_to_struct);
}

/*
 * (Re)allocate the value-to-datum index for booleans, sized for
 * p_bools.nprim entries. Any previous index is freed first.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
int cond_init_bool_indexes(struct policydb *p)
{
        kfree(p->bool_val_to_struct);
        p->bool_val_to_struct = kmalloc_array(p->p_bools.nprim,
                                              sizeof(*p->bool_val_to_struct),
                                              GFP_KERNEL);
        if (p->bool_val_to_struct) {
                avtab_hash_eval(&p->te_cond_avtab, "conditional_rules");
                return 0;
        }

        return -ENOMEM;
}

/*
 * Free a boolean's key and datum. The third argument is unused. Always
 * returns 0.
 */
int cond_destroy_bool(void *key, void *datum, void *p)
{
        kfree(datum);
        kfree(key);

        return 0;
}

/*
 * Record one boolean in the policydb's value-indexed tables.
 *
 * @key is stored as the boolean's name and @datum as its structure, both at
 * slot (value - 1). Returns -EINVAL when the value is 0 or larger than
 * p_bools.nprim, 0 otherwise.
 */
int cond_index_bool(void *key, void *datum, void *datap)
{
        struct cond_bool_datum *booldatum = datum;
        struct policydb *p = datap;

        if (booldatum->value == 0)
                return -EINVAL;
        if (booldatum->value > p->p_bools.nprim)
                return -EINVAL;

        p->sym_val_to_name[SYM_BOOLS][booldatum->value - 1] = key;
        p->bool_val_to_struct[booldatum->value - 1] = booldatum;

        return 0;
}

/* Returns 1 when the boolean's state is exactly 0 or 1, 0 otherwise. */
static int bool_isvalid(struct cond_bool_datum *b)
{
        return (b->state == 0 || b->state == 1) ? 1 : 0;
}

/*
 * Read one boolean (value, state, name length, name) from the policy file
 * and insert it into symbol table @s.
 *
 * Returns 0 on success. On any failure the key and datum read so far are
 * freed and the error is returned: -ENOMEM, -EINVAL for an invalid state, or
 * whatever next_entry()/str_read()/symtab_insert() reported.
 */
int cond_read_bool(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        struct cond_bool_datum *booldatum;
        char *key = NULL;
        __le32 buf[3];
        u32 len;
        int rc;

        booldatum = kzalloc(sizeof(*booldatum), GFP_KERNEL);
        if (!booldatum)
                return -ENOMEM;

        rc = next_entry(buf, fp, sizeof(buf));
        if (rc)
                goto err;

        booldatum->value = le32_to_cpu(buf[0]);
        booldatum->state = le32_to_cpu(buf[1]);
        if (!bool_isvalid(booldatum)) {
                rc = -EINVAL;
                goto err;
        }

        len = le32_to_cpu(buf[2]);
        rc = str_read(&key, GFP_KERNEL, fp, len);
        if (rc)
                goto err;

        rc = symtab_insert(s, key, booldatum);
        if (!rc)
                return 0;
err:
        cond_destroy_bool(key, booldatum, NULL);
        return rc;
}

/* Context handed to cond_insertf() through avtab_read_item(). */
struct cond_insertf_data {
        /* policy database whose te_avtab/te_cond_avtab are consulted */
        struct policydb *p;
        /* slot that receives the newly inserted avtab node */
        struct avtab_node **dst;
        /* true list while reading a false list, NULL while reading a true list */
        struct cond_av_list *other;
};

/*
 * Insert callback used while reading a conditional rule list.
 *
 * Type rules are first checked for conflicts against the unconditional
 * te_avtab and the conditional te_cond_avtab; the rule is then inserted into
 * te_cond_avtab and the new node is stored through data->dst.
 *
 * @a is not used by this function; @ptr is a struct cond_insertf_data.
 *
 * Returns 0 on success, -EINVAL on a conflicting type rule, or -ENOMEM when
 * the insertion fails.
 */
static int cond_insertf(struct avtab *a, const struct avtab_key *k,
                        const struct avtab_datum *d, void *ptr)
{
        struct cond_insertf_data *data = ptr;
        struct policydb *p = data->p;
        struct cond_av_list *other = data->other;
        struct avtab_node *node_ptr;
        u32 i;
        bool found;

        /*
         * For type rules we have to make certain there aren't any
         * conflicting rules by searching the te_avtab and the
         * cond_te_avtab.
         */
        if (k->specified & AVTAB_TYPE) {
                if (avtab_search_node(&p->te_avtab, k)) {
                        pr_err("SELinux: type rule already exists outside of a conditional.\n");
                        return -EINVAL;
                }
                /*
                 * If we are reading the false list other will be a pointer to
                 * the true list. We can have duplicate entries if there is only
                 * 1 other entry and it is in our true list.
                 *
                 * If we are reading the true list (other == NULL) there shouldn't
                 * be any other entries.
                 */
                if (other) {
                        node_ptr = avtab_search_node(&p->te_cond_avtab, k);
                        if (node_ptr) {
                                /* more than one existing entry is always an error */
                                if (avtab_search_node_next(node_ptr,
                                                           k->specified)) {
                                        pr_err("SELinux: too many conflicting type rules.\n");
                                        return -EINVAL;
                                }
                                /* the single existing entry must belong to our true list */
                                found = false;
                                for (i = 0; i < other->len; i++) {
                                        if (other->nodes[i] == node_ptr) {
                                                found = true;
                                                break;
                                        }
                                }
                                if (!found) {
                                        pr_err("SELinux: conflicting type rules.\n");
                                        return -EINVAL;
                                }
                        }
                } else {
                        if (avtab_search_node(&p->te_cond_avtab, k)) {
                                pr_err("SELinux: conflicting type rules when adding type rule for true.\n");
                                return -EINVAL;
                        }
                }
        }

        node_ptr = avtab_insert_nonunique(&p->te_cond_avtab, k, d);
        if (!node_ptr) {
                pr_err("SELinux: could not insert rule.\n");
                return -ENOMEM;
        }

        /* hand the inserted node back to the list being read */
        *data->dst = node_ptr;
        return 0;
}

/*
 * Read one conditional rule list from the policy file into @list.
 *
 * @other is NULL when reading a true list and points to the already-read
 * true list when reading a false list; it is forwarded to cond_insertf() for
 * conflict detection.
 *
 * An empty list leaves @list untouched. On a read/insert failure the node
 * array is freed, list->nodes is reset to NULL and the error is returned.
 */
static int cond_read_av_list(struct policydb *p, struct policy_file *fp,
                             struct cond_av_list *list,
                             struct cond_av_list *other)
{
        struct cond_insertf_data data;
        __le32 buf[1];
        u32 idx, count;
        int rc;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;

        count = le32_to_cpu(buf[0]);
        if (!count)
                return 0;

        list->nodes = kcalloc(count, sizeof(*list->nodes), GFP_KERNEL);
        if (!list->nodes)
                return -ENOMEM;

        data.p = p;
        data.other = other;
        for (idx = 0; idx < count; idx++) {
                /* each item lands in its own slot of the node array */
                data.dst = &list->nodes[idx];
                rc = avtab_read_item(&p->te_cond_avtab, fp, p, cond_insertf,
                                     &data, true);
                if (rc)
                        goto free_nodes;
        }

        /* the length is only published once every item was read */
        list->len = count;
        return 0;

free_nodes:
        kfree(list->nodes);
        list->nodes = NULL;
        return rc;
}

static int expr_node_isvalid(struct policydb *p, struct cond_expr_node *expr)
{
        if (expr->expr_type <= 0 || expr->expr_type > COND_LAST) {
                pr_err("SELinux: conditional expressions uses unknown operator.\n");
                return 0;
        }

        if (expr->boolean > p->p_bools.nprim) {
                pr_err("SELinux: conditional expressions uses unknown bool.\n");
                return 0;
        }
        return 1;
}

/*
 * Read one conditional node from the policy file: its current state, its
 * expression, then its true and false rule lists.
 *
 * Returns 0 on success or a negative error. Nothing is freed here on error;
 * cond_read_list() tears the whole list down through cond_list_destroy().
 */
static int cond_read_node(struct policydb *p, struct cond_node *node, struct policy_file *fp)
{
        __le32 buf[2];
        u32 idx, count;
        int rc;

        /* node header: current state, then number of expression nodes */
        rc = next_entry(buf, fp, sizeof(u32) * 2);
        if (rc)
                return rc;

        node->cur_state = le32_to_cpu(buf[0]);
        count = le32_to_cpu(buf[1]);

        node->expr.nodes = kcalloc(count, sizeof(*node->expr.nodes), GFP_KERNEL);
        if (!node->expr.nodes)
                return -ENOMEM;
        node->expr.len = count;

        /* expression nodes: (operator, boolean) pairs */
        for (idx = 0; idx < count; idx++) {
                struct cond_expr_node *expr = &node->expr.nodes[idx];

                rc = next_entry(buf, fp, sizeof(u32) * 2);
                if (rc)
                        return rc;

                expr->expr_type = le32_to_cpu(buf[0]);
                expr->boolean = le32_to_cpu(buf[1]);

                if (!expr_node_isvalid(p, expr))
                        return -EINVAL;
        }

        /* the false list is checked against the true list just read */
        rc = cond_read_av_list(p, fp, &node->true_list, NULL);
        if (!rc)
                rc = cond_read_av_list(p, fp, &node->false_list,
                                       &node->true_list);
        return rc;
}

/*
 * Read the complete conditional list from the policy file into @p.
 *
 * Returns 0 on success. If the avtab allocation or any node read fails, the
 * partially built list is destroyed and the error is returned.
 */
int cond_read_list(struct policydb *p, struct policy_file *fp)
{
        __le32 buf[1];
        u32 idx, count;
        int rc;

        rc = next_entry(buf, fp, sizeof(buf));
        if (rc)
                return rc;

        count = le32_to_cpu(buf[0]);

        p->cond_list = kcalloc(count, sizeof(*p->cond_list), GFP_KERNEL);
        if (!p->cond_list)
                return -ENOMEM;

        rc = avtab_alloc(&(p->te_cond_avtab), p->te_avtab.nel);
        if (rc)
                goto err;

        /* set before reading so that the error path destroys every node */
        p->cond_list_len = count;

        for (idx = 0; !rc && idx < count; idx++)
                rc = cond_read_node(p, &p->cond_list[idx], fp);

        if (!rc)
                return 0;
err:
        cond_list_destroy(p);
        return rc;
}

/*
 * Write one boolean to the policy file: value, state and name length as
 * three little-endian 32-bit words, followed by the name bytes (no
 * terminating NUL).
 *
 * @ptr is a struct policy_data. Returns 0 or the put_entry() error.
 */
int cond_write_bool(void *vkey, void *datum, void *ptr)
{
        struct cond_bool_datum *booldatum = datum;
        struct policy_data *pd = ptr;
        struct policy_file *fp = pd->fp;
        char *key = vkey;
        u32 len = strlen(key);
        __le32 buf[3];
        int rc;

        buf[0] = cpu_to_le32(booldatum->value);
        buf[1] = cpu_to_le32(booldatum->state);
        buf[2] = cpu_to_le32(len);

        rc = put_entry(buf, sizeof(u32), 3, fp);
        if (rc)
                return rc;

        return put_entry(key, 1, len, fp);
}

/*
 * cond_write_av_list doesn't write out the av_list nodes themselves.
 * Instead it writes out the key/value pairs from the avtab. This is
 * necessary because there is no way to uniquely identify rules in the
 * avtab, so it is not possible to associate individual rules in the avtab
 * with a conditional without saving them as part of the conditional. This
 * means that the avtab with the conditional rules will not be saved but
 * will be rebuilt on policy load.
 *
 * Returns 0 on success or the first write error.
 */
static int cond_write_av_list(struct policydb *p, struct cond_av_list *list,
                              struct policy_file *fp)
{
        __le32 buf[1];
        u32 idx;
        int rc;

        /* list length first, then one avtab item per rule */
        buf[0] = cpu_to_le32(list->len);
        rc = put_entry(buf, sizeof(u32), 1, fp);

        for (idx = 0; !rc && idx < list->len; idx++)
                rc = avtab_write_item(p, list->nodes[idx], fp);

        return rc;
}

static int cond_write_node(struct policydb *p, struct cond_node *node,
                           struct policy_file *fp)
{
        __le32 buf[2];
        int rc;
        u32 i;

        buf[0] = cpu_to_le32(node->cur_state);
        rc = put_entry(buf, sizeof(u32), 1, fp);
        if (rc)
                return rc;

        buf[0] = cpu_to_le32(node->expr.len);
        rc = put_entry(buf, sizeof(u32), 1, fp);
        if (rc)
                return rc;

        for (i = 0; i < node->expr.len; i++) {
                buf[0] = cpu_to_le32(node->expr.nodes[i].expr_type);
                buf[1] = cpu_to_le32(node->expr.nodes[i].boolean);
                rc = put_entry(buf, sizeof(u32), 2, fp);
                if (rc)
                        return rc;
        }

        rc = cond_write_av_list(p, &node->true_list, fp);
        if (rc)
                return rc;
        rc = cond_write_av_list(p, &node->false_list, fp);
        if (rc)
                return rc;

        return 0;
}

/*
 * Write the conditional list to the policy file: the number of nodes
 * followed by every node in list order.
 *
 * Returns 0 on success or the first write error.
 */
int cond_write_list(struct policydb *p, struct policy_file *fp)
{
        __le32 buf[1];
        u32 idx;
        int rc;

        buf[0] = cpu_to_le32(p->cond_list_len);
        rc = put_entry(buf, sizeof(u32), 1, fp);

        for (idx = 0; !rc && idx < p->cond_list_len; idx++)
                rc = cond_write_node(p, &p->cond_list[idx], fp);

        return rc;
}

/*
 * Feed every enabled conditional avtab node matching @key into the extended
 * permissions decision @xpermd. Does nothing if any argument is NULL.
 */
void cond_compute_xperms(struct avtab *ctab, struct avtab_key *key,
                         struct extended_perms_decision *xpermd)
{
        struct avtab_node *node;

        if (!ctab || !key || !xpermd)
                return;

        node = avtab_search_node(ctab, key);
        while (node) {
                if (node->key.specified & AVTAB_ENABLED)
                        services_compute_xperms_decision(xpermd, node);
                node = avtab_search_node_next(node, key->specified);
        }
}
/* Determine whether additional permissions are granted by the conditional
 * av table, and if so, add them to the result
 */
void cond_compute_av(struct avtab *ctab, struct avtab_key *key,
                     struct av_decision *avd, struct extended_perms *xperms)
{
        struct avtab_node *node;

        if (!ctab || !key || !avd)
                return;

        for (node = avtab_search_node(ctab, key); node;
             node = avtab_search_node_next(node, key->specified)) {
                if ((u16)(AVTAB_ALLOWED | AVTAB_ENABLED) ==
                    (node->key.specified & (AVTAB_ALLOWED | AVTAB_ENABLED)))
                        avd->allowed |= node->datum.u.data;
                if ((u16)(AVTAB_AUDITDENY | AVTAB_ENABLED) ==
                    (node->key.specified & (AVTAB_AUDITDENY | AVTAB_ENABLED)))
                        /* Since a '0' in an auditdeny mask represents a
                         * permission we do NOT want to audit (dontaudit), we use
                         * the '&' operand to ensure that all '0's in the mask
                         * are retained (much unlike the allow and auditallow cases).
                         */
                        avd->auditdeny &= node->datum.u.data;
                if ((u16)(AVTAB_AUDITALLOW | AVTAB_ENABLED) ==
                    (node->key.specified & (AVTAB_AUDITALLOW | AVTAB_ENABLED)))
                        avd->auditallow |= node->datum.u.data;
                if (xperms && (node->key.specified & AVTAB_ENABLED) &&
                    (node->key.specified & AVTAB_XPERMS))
                        services_compute_xperms_drivers(xperms, node);
        }
}

static int cond_dup_av_list(struct cond_av_list *new,
                            const struct cond_av_list *orig,
                            struct avtab *avtab)
{
        u32 i;

        memset(new, 0, sizeof(*new));

        new->nodes = kcalloc(orig->len, sizeof(*new->nodes), GFP_KERNEL);
        if (!new->nodes)
                return -ENOMEM;

        for (i = 0; i < orig->len; i++) {
                new->nodes[i] = avtab_insert_nonunique(
                        avtab, &orig->nodes[i]->key, &orig->nodes[i]->datum);
                if (!new->nodes[i])
                        return -ENOMEM;
                new->len++;
        }

        return 0;
}

/*
 * Build in @newp a copy of @origp's conditional node list, together with
 * the conditional av table entries referenced by each node's true and
 * false lists.
 *
 * Returns 0 on success, the avtab_alloc_dup() error if that first step
 * fails, or -ENOMEM for any later failure, after destroying what had been
 * built in @newp so far.
 */
static int duplicate_policydb_cond_list(struct policydb *newp,
                                        const struct policydb *origp)
{
        int rc;
        u32 i;

        rc = avtab_alloc_dup(&newp->te_cond_avtab, &origp->te_cond_avtab);
        if (rc)
                return rc;

        newp->cond_list_len = 0;
        newp->cond_list = kcalloc(origp->cond_list_len,
                                  sizeof(*newp->cond_list), GFP_KERNEL);
        if (!newp->cond_list)
                goto error;

        for (i = 0; i < origp->cond_list_len; i++) {
                struct cond_node *newn = &newp->cond_list[i];
                const struct cond_node *orign = &origp->cond_list[i];

                /*
                 * The node is counted before it is filled in, so a failure
                 * below leaves it inside cond_list_len.
                 * NOTE(review): presumably so cond_list_destroy() also
                 * releases a half-built node - confirm.
                 */
                newp->cond_list_len++;

                newn->cur_state = orign->cur_state;
                newn->expr.nodes =
                        kmemdup(orign->expr.nodes,
                                orign->expr.len * sizeof(*orign->expr.nodes),
                                GFP_KERNEL);
                if (!newn->expr.nodes)
                        goto error;

                newn->expr.len = orign->expr.len;

                /* Both lists insert their nodes into the new cond avtab. */
                rc = cond_dup_av_list(&newn->true_list, &orign->true_list,
                                      &newp->te_cond_avtab);
                if (rc)
                        goto error;

                rc = cond_dup_av_list(&newn->false_list, &orign->false_list,
                                      &newp->te_cond_avtab);
                if (rc)
                        goto error;
        }

        return 0;

error:
        /* Every path to this label reports -ENOMEM, whatever rc holds. */
        avtab_destroy(&newp->te_cond_avtab);
        cond_list_destroy(newp);
        return -ENOMEM;
}

/*
 * Per-entry destructor for a duplicated booleans table: frees @datum and
 * leaves @key alone.  @args is unused.  Always returns 0.
 */
static int cond_bools_destroy(void *key, void *datum, void *args)
{
        /* key was not copied so no need to free here */
        kfree(datum);
        return 0;
}

/*
 * Per-entry copy callback: give @new its own kmemdup()'d copy of @orig's
 * cond_bool_datum while sharing @orig's key pointer.  @args is unused.
 * Returns 0 on success or -ENOMEM if the datum cannot be duplicated.
 */
static int cond_bools_copy(struct hashtab_node *new,
                           const struct hashtab_node *orig, void *args)
{
        struct cond_bool_datum *copy =
                kmemdup(orig->datum, sizeof(*copy), GFP_KERNEL);

        if (!copy)
                return -ENOMEM;

        new->datum = copy;
        new->key = orig->key; /* shared: the key is never modified */
        return 0;
}

/*
 * Per-entry callback that records @datum (a cond_bool_datum) in the
 * pointer array passed through @args, at slot value - 1.  @key is unused.
 * Always returns 0.
 */
static int cond_bools_index(void *key, void *datum, void *args)
{
        struct cond_bool_datum **by_value = args;
        struct cond_bool_datum *entry = datum;

        by_value[entry->value - 1] = entry;
        return 0;
}

static int duplicate_policydb_bools(struct policydb *newdb,
                                    const struct policydb *orig)
{
        struct cond_bool_datum **cond_bool_array;
        int rc;

        cond_bool_array = kmalloc_array(orig->p_bools.nprim,
                                        sizeof(*orig->bool_val_to_struct),
                                        GFP_KERNEL);
        if (!cond_bool_array)
                return -ENOMEM;

        rc = hashtab_duplicate(&newdb->p_bools.table, &orig->p_bools.table,
                               cond_bools_copy, cond_bools_destroy, NULL);
        if (rc) {
                kfree(cond_bool_array);
                return -ENOMEM;
        }

        hashtab_map(&newdb->p_bools.table, cond_bools_index, cond_bool_array);
        newdb->bool_val_to_struct = cond_bool_array;

        newdb->p_bools.nprim = orig->p_bools.nprim;

        return 0;
}

/*
 * Tear down the conditional state of @p: run cond_bools_destroy() on every
 * entry of the booleans table, destroy that table, then call
 * cond_policydb_destroy().  Used by cond_policydb_dup() to undo a partial
 * duplication.
 */
void cond_policydb_destroy_dup(struct policydb *p)
{
        hashtab_map(&p->p_bools.table, cond_bools_destroy, NULL);
        hashtab_destroy(&p->p_bools.table);
        cond_policydb_destroy(p);
}

/*
 * Initialise @new and duplicate @orig's booleans and conditional list
 * into it.  Returns 0 on success or -ENOMEM; if the conditional list
 * cannot be duplicated, the already copied booleans are released through
 * cond_policydb_destroy_dup() before returning.
 */
int cond_policydb_dup(struct policydb *new, const struct policydb *orig)
{
        cond_policydb_init(new);

        if (duplicate_policydb_bools(new, orig) != 0)
                return -ENOMEM;

        if (duplicate_policydb_cond_list(new, orig) == 0)
                return 0;

        cond_policydb_destroy_dup(new);
        return -ENOMEM;
}




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_SYSCALL_H
#define __ASM_SYSCALL_H

#include <uapi/linux/audit.h>
#include <linux/compat.h>
#include <linux/err.h>

typedef long (*syscall_fn_t)(const struct pt_regs *regs);

extern const syscall_fn_t sys_call_table[];

#ifdef CONFIG_COMPAT
extern const syscall_fn_t compat_sys_call_table[];
#endif

/* Return the syscallno field stored in @regs.  @task is not used. */
static inline int syscall_get_nr(struct task_struct *task,
                                 struct pt_regs *regs)
{
        return regs->syscallno;
}

/*
 * Put the saved orig_x0 value back into regs[0] of @regs.
 * @task is not used.
 */
static inline void syscall_rollback(struct task_struct *task,
                                    struct pt_regs *regs)
{
        regs->regs[0] = regs->orig_x0;
}

/*
 * Return the value held in regs[0] of @regs.  For a compat thread the
 * value is sign-extended from bit 31 first.
 */
static inline long syscall_get_return_value(struct task_struct *task,
                                            struct pt_regs *regs)
{
        unsigned long ret = regs->regs[0];

        if (!is_compat_thread(task_thread_info(task)))
                return ret;

        ret = sign_extend64(ret, 31);
        return ret;
}

/*
 * Return the syscall return value if IS_ERR_VALUE() classifies it as an
 * error, otherwise 0.
 */
static inline long syscall_get_error(struct task_struct *task,
                                     struct pt_regs *regs)
{
        unsigned long ret = syscall_get_return_value(task, regs);

        if (IS_ERR_VALUE(ret))
                return ret;

        return 0;
}

/*
 * Store a syscall result in regs[0] of @regs.  A non-zero @error takes
 * precedence over @val.  For a compat thread only the lower 32 bits of
 * the chosen value are stored.
 */
static inline void syscall_set_return_value(struct task_struct *task,
                                            struct pt_regs *regs,
                                            int error, long val)
{
        long ret = error ? error : val;

        if (is_compat_thread(task_thread_info(task)))
                ret = lower_32_bits(ret);

        regs->regs[0] = ret;
}

#define SYSCALL_MAX_ARGS 6

/*
 * Copy the system call arguments from @regs into @args, which must have
 * room for SYSCALL_MAX_ARGS entries.  The first argument is taken from
 * orig_x0 rather than regs[0]; the remaining ones are copied from regs[1]
 * onwards.  @task is not used.
 */
static inline void syscall_get_arguments(struct task_struct *task,
                                         struct pt_regs *regs,
                                         unsigned long *args)
{
        args[0] = regs->orig_x0;

        /* Derive the count from SYSCALL_MAX_ARGS instead of a bare 5. */
        memcpy(&args[1], &regs->regs[1],
               (SYSCALL_MAX_ARGS - 1) * sizeof(args[0]));
}

/*
 * Report the audit architecture of @task: AUDIT_ARCH_ARM for a compat
 * thread, AUDIT_ARCH_AARCH64 otherwise.
 *
 * We don't care about endianness (__AUDIT_ARCH_LE bit) here because
 * AArch64 has the same system calls both on little- and big- endian.
 */
static inline int syscall_get_arch(struct task_struct *task)
{
        return is_compat_thread(task_thread_info(task)) ?
                AUDIT_ARCH_ARM : AUDIT_ARCH_AARCH64;
}

int syscall_trace_enter(struct pt_regs *regs);
void syscall_trace_exit(struct pt_regs *regs);

#endif        /* __ASM_SYSCALL_H */





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  787 






































































































































  787 








































  724 














  738 
  738 








  739 
  738 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMZONE_H
#define _LINUX_MMZONE_H

#ifndef __ASSEMBLY__
#ifndef __GENERATING_BOUNDS_H

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/threads.h>
#include <linux/numa.h>
#include <linux/init.h>
#include <linux/seqlock.h>
#include <linux/nodemask.h>
#include <linux/pageblock-flags.h>
#include <linux/page-flags-layout.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/local_lock.h>
#include <linux/zswap.h>
#include <asm/page.h>

/* Free memory management - zoned buddy allocator.  */
/* MAX_PAGE_ORDER defaults to 10 unless CONFIG_ARCH_FORCE_MAX_ORDER is set. */
#ifndef CONFIG_ARCH_FORCE_MAX_ORDER
#define MAX_PAGE_ORDER 10
#else
#define MAX_PAGE_ORDER CONFIG_ARCH_FORCE_MAX_ORDER
#endif
/* Number of pages in a block of order MAX_PAGE_ORDER. */
#define MAX_ORDER_NR_PAGES (1 << MAX_PAGE_ORDER)

/* True when @pfn is a multiple of MAX_ORDER_NR_PAGES. */
#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)

/* Number of distinct orders, 0 .. MAX_PAGE_ORDER inclusive. */
#define NR_PAGE_ORDERS (MAX_PAGE_ORDER + 1)

/*
 * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
 * costly to service.  That is between allocation orders which should
 * coalesce naturally under reasonable reclaim pressure and those which
 * will not.
 */
#define PAGE_ALLOC_COSTLY_ORDER 3

/*
 * Pageblock migration types.
 *
 * The entries before MIGRATE_PCPTYPES are the types kept on the pcp lists;
 * MIGRATE_HIGHATOMIC deliberately shares the numeric value of
 * MIGRATE_PCPTYPES.  MIGRATE_CMA and MIGRATE_ISOLATE only exist when their
 * config option is enabled, so the value of MIGRATE_TYPES (the number of
 * types) depends on the configuration.
 */
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,        /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works.  Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks and page allocator never
         * implicitly change migration type of MIGRATE_CMA pageblock.
         *
         * The way to use it is to change migratetype of a range of
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.
         */
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES                /* number of migrate types; keep last */
};

/* In mm/page_alloc.c; keep in sync also with show_migration_types() there */
extern const char * const migratetype_names[MIGRATE_TYPES];

/*
 * CMA migratetype predicates.  Without CONFIG_CMA they all collapse to the
 * constant false, so callers need no #ifdef of their own.
 *
 * The @folio argument is parenthesized before '->' is applied so that any
 * pointer-valued expression can be passed, not just a plain identifier.
 */
#ifdef CONFIG_CMA
#  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
#  define is_migrate_cma_folio(folio, pfn)        (MIGRATE_CMA ==                \
        get_pfnblock_flags_mask(&(folio)->page, pfn, MIGRATETYPE_MASK))
#else
#  define is_migrate_cma(migratetype) false
#  define is_migrate_cma_page(_page) false
#  define is_migrate_cma_folio(folio, pfn) false
#endif

/* True when @mt is MIGRATE_MOVABLE or is_migrate_cma() accepts it. */
static inline bool is_migrate_movable(int mt)
{
        if (is_migrate_cma(mt))
                return true;

        return mt == MIGRATE_MOVABLE;
}

/*
 * Check whether a migratetype can be merged with another migratetype.
 *
 * It is only mergeable when it can fall back to other migratetypes for
 * allocation. See fallbacks[MIGRATE_TYPES][3] in page_alloc.c.
 *
 * The test is simply mt < MIGRATE_PCPTYPES, so it relies on the mergeable
 * types being the ones listed before MIGRATE_PCPTYPES in enum migratetype.
 */
static inline bool migratetype_is_mergeable(int mt)
{
        return mt < MIGRATE_PCPTYPES;
}

/*
 * Iterate over every (order, migratetype) pair: @order runs over
 * 0 .. NR_PAGE_ORDERS - 1 and, for each order, @type runs over
 * 0 .. MIGRATE_TYPES - 1.  Both arguments must be modifiable lvalues.
 * They are parenthesized so that an argument such as *p is incremented
 * as (*p)++ rather than being parsed as *p++.
 */
#define for_each_migratetype_order(order, type) \
        for ((order) = 0; (order) < NR_PAGE_ORDERS; (order)++) \
                for ((type) = 0; (type) < MIGRATE_TYPES; (type)++)

extern int page_group_by_mobility_disabled;

/* Mask covering the low PB_migratetype_bits bits. */
#define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1)

/*
 * Read the migratetype bits for @page via get_pfnblock_flags_mask().
 * Note: @page is evaluated twice, so avoid arguments with side effects.
 */
#define get_pageblock_migratetype(page)                                        \
        get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK)

/*
 * Same lookup for a folio, using &folio->page and folio_pfn(folio).
 * @folio is parenthesized before '->' so that any pointer expression can
 * be passed; it is evaluated twice.
 */
#define folio_migratetype(folio)                                \
        get_pfnblock_flags_mask(&(folio)->page, folio_pfn(folio),                \
                        MIGRATETYPE_MASK)
/*
 * A set of free lists, one list head per migratetype, plus a counter.
 * NOTE(review): nr_free presumably counts the entries across all of the
 * lists - confirm against the allocator code.
 */
struct free_area {
        struct list_head        free_list[MIGRATE_TYPES];
        unsigned long                nr_free;
};

struct pglist_data;

#ifdef CONFIG_NUMA
enum numa_stat_item {
        NUMA_HIT,                /* allocated in intended node */
        NUMA_MISS,                /* allocated in non intended node */
        NUMA_FOREIGN,                /* was intended here, hit elsewhere */
        NUMA_INTERLEAVE_HIT,        /* interleaver preferred this zone */
        NUMA_LOCAL,                /* allocation from local node */
        NUMA_OTHER,                /* allocation from other node */
        NR_VM_NUMA_EVENT_ITEMS
};
#else
#define NR_VM_NUMA_EVENT_ITEMS 0
#endif

/*
 * Zone statistics items.  NR_VM_ZONE_STAT_ITEMS is the terminating count.
 * NR_ZSPAGES and NR_UNACCEPTED are config-dependent, so the numeric values
 * of the entries after them (and the count) vary with the configuration.
 */
enum zone_stat_item {
        /* First 128 byte cacheline (assuming 64 bit words) */
        NR_FREE_PAGES,
        NR_FREE_PAGES_BLOCKS,
        NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
        NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
        NR_ZONE_ACTIVE_ANON,
        NR_ZONE_INACTIVE_FILE,
        NR_ZONE_ACTIVE_FILE,
        NR_ZONE_UNEVICTABLE,
        NR_ZONE_WRITE_PENDING,        /* Count of dirty, writeback and unstable pages */
        NR_MLOCK,                /* mlock()ed pages found and moved off LRU */
        /* Second 128 byte cacheline */
        NR_BOUNCE,
#if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,                /* allocated in zsmalloc */
#endif
        NR_FREE_CMA_PAGES,
#ifdef CONFIG_UNACCEPTED_MEMORY
        NR_UNACCEPTED,
#endif
        NR_VM_ZONE_STAT_ITEMS };

/*
 * Node statistics items.  NR_VM_NODE_STAT_ITEMS is the terminating count.
 *
 * The *_BASE entries are aliases: each one has the same value as the entry
 * defined right after it (e.g. NR_LRU_BASE == NR_INACTIVE_ANON).  Several
 * entries are config-dependent, so the numeric values of later entries
 * (and the count) vary with the configuration.
 */
enum node_stat_item {
        NR_LRU_BASE,
        NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
        NR_ACTIVE_ANON,                /*  "     "     "   "       "         */
        NR_INACTIVE_FILE,        /*  "     "     "   "       "         */
        NR_ACTIVE_FILE,                /*  "     "     "   "       "         */
        NR_UNEVICTABLE,                /*  "     "     "   "       "         */
        NR_SLAB_RECLAIMABLE_B,
        NR_SLAB_UNRECLAIMABLE_B,
        NR_ISOLATED_ANON,        /* Temporary isolated pages from anon lru */
        NR_ISOLATED_FILE,        /* Temporary isolated pages from file lru */
        WORKINGSET_NODES,
        WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE,
        WORKINGSET_REFAULT_FILE,
        WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE,
        WORKINGSET_ACTIVATE_FILE,
        WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE,
        WORKINGSET_RESTORE_FILE,
        WORKINGSET_NODERECLAIM,
        NR_ANON_MAPPED,        /* Mapped anonymous pages */
        NR_FILE_MAPPED,        /* pagecache pages mapped into pagetables.
                           only modified from process context */
        NR_FILE_PAGES,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        NR_WRITEBACK_TEMP,        /* Writeback using temporary buffers */
        NR_SHMEM,                /* shmem pages (included tmpfs/GEM pages) */
        NR_SHMEM_THPS,
        NR_SHMEM_PMDMAPPED,
        NR_FILE_THPS,
        NR_FILE_PMDMAPPED,
        NR_ANON_THPS,
        NR_VMSCAN_WRITE,
        NR_VMSCAN_IMMEDIATE,        /* Prioritise for reclaim when writeback ends */
        NR_DIRTIED,                /* page dirtyings since bootup */
        NR_WRITTEN,                /* page writings since bootup */
        NR_THROTTLED_WRITTEN,        /* NR_WRITTEN while reclaim throttled */
        NR_KERNEL_MISC_RECLAIMABLE,        /* reclaimable non-slab kernel pages */
        NR_FOLL_PIN_ACQUIRED,        /* via: pin_user_page(), gup flag: FOLL_PIN */
        NR_FOLL_PIN_RELEASED,        /* pages returned via unpin_user_page() */
        NR_KERNEL_STACK_KB,        /* measured in KiB */
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
        NR_KERNEL_SCS_KB,        /* measured in KiB */
#endif
        NR_PAGETABLE,                /* used for pagetables */
        NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */
#ifdef CONFIG_IOMMU_SUPPORT
        NR_IOMMU_PAGES,                /* # of pages allocated by IOMMU */
#endif
#ifdef CONFIG_SWAP
        NR_SWAPCACHE,
#endif
#ifdef CONFIG_NUMA_BALANCING
        PGPROMOTE_SUCCESS,        /* promote successfully */
        PGPROMOTE_CANDIDATE,        /* candidate pages to promote */
#endif
        /* PGDEMOTE_*: pages demoted */
        PGDEMOTE_KSWAPD,
        PGDEMOTE_DIRECT,
        PGDEMOTE_KHUGEPAGED,
        PGDEMOTE_PROACTIVE,
#ifdef CONFIG_HUGETLB_PAGE
        NR_HUGETLB,
#endif
        NR_BALLOON_PAGES,
        NR_VM_NODE_STAT_ITEMS        /* number of items; keep last */
};

/*
 * Tell whether @item is one of the counters that should be printed in
 * units of THPs (/proc/vmstat currently prints the number of anon, file
 * and shmem THPs, although the items themselves are charged in pages).
 * Always false when CONFIG_TRANSPARENT_HUGEPAGE is disabled.
 */
static __always_inline bool vmstat_item_print_in_thp(enum node_stat_item item)
{
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return false;

        switch (item) {
        case NR_ANON_THPS:
        case NR_FILE_THPS:
        case NR_SHMEM_THPS:
        case NR_SHMEM_PMDMAPPED:
        case NR_FILE_PMDMAPPED:
                return true;
        default:
                return false;
        }
}

/*
 * Tell whether the counter @idx is measured in bytes (most vmstat values
 * are measured in pages).  This describes the API; the internal
 * representation might be different.  Only the two slab counters qualify.
 */
static __always_inline bool vmstat_item_in_bytes(int idx)
{
        /*
         * Global and per-node slab counters track slab pages.
         * It's expected that changes are multiples of PAGE_SIZE.
         * Internally values are stored in pages.
         *
         * Per-memcg and per-lruvec counters track memory, consumed
         * by individual slab objects. These counters are actually
         * byte-precise.
         */
        switch (idx) {
        case NR_SLAB_RECLAIMABLE_B:
        case NR_SLAB_UNRECLAIMABLE_B:
                return true;
        default:
                return false;
        }
}

/*
 * We do arithmetic on the LRU lists in various places in the code,
 * so it is important to keep the active lists LRU_ACTIVE higher in
 * the array than the corresponding inactive lists, and to keep
 * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists.
 *
 * This has to be kept in sync with the statistics in zone_stat_item
 * above and the descriptions in vmstat_text in mm/vmstat.c
 */
#define LRU_BASE 0
#define LRU_ACTIVE 1        /* offset from an inactive list to its active list */
#define LRU_FILE 2        /* offset from an anon list to the matching file list */

enum lru_list {
        LRU_INACTIVE_ANON = LRU_BASE,                                /* 0 */
        LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,                /* 1 */
        LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,                /* 2 */
        LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,        /* 3 */
        LRU_UNEVICTABLE,                                        /* 4 */
        NR_LRU_LISTS                                                /* 5: number of lists */
};

enum vmscan_throttle_state {
        VMSCAN_THROTTLE_WRITEBACK,
        VMSCAN_THROTTLE_ISOLATED,
        VMSCAN_THROTTLE_NOPROGRESS,
        VMSCAN_THROTTLE_CONGESTED,
        NR_VMSCAN_THROTTLE,
};

/*
 * Loop headers over the LRU list indices; @lru must be a modifiable
 * lvalue.  It is parenthesized so that an argument such as *p is
 * incremented as (*p)++ rather than being parsed as *p++.
 */

/* Visit every list index: 0 .. NR_LRU_LISTS - 1. */
#define for_each_lru(lru) for ((lru) = 0; (lru) < NR_LRU_LISTS; (lru)++)

/* Visit the list indices 0 .. LRU_ACTIVE_FILE inclusive. */
#define for_each_evictable_lru(lru) for ((lru) = 0; (lru) <= LRU_ACTIVE_FILE; (lru)++)

/* True for the two file lists, LRU_INACTIVE_FILE and LRU_ACTIVE_FILE. */
static inline bool is_file_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_INACTIVE_FILE:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

/* True for the two active lists, LRU_ACTIVE_ANON and LRU_ACTIVE_FILE. */
static inline bool is_active_lru(enum lru_list lru)
{
        switch (lru) {
        case LRU_ACTIVE_ANON:
        case LRU_ACTIVE_FILE:
                return true;
        default:
                return false;
        }
}

#define WORKINGSET_ANON 0
#define WORKINGSET_FILE 1
#define ANON_AND_FILE 2

enum lruvec_flags {
        /*
         * An lruvec has many dirty pages backed by a congested BDI:
         * 1. LRUVEC_CGROUP_CONGESTED is set by cgroup-level reclaim.
         *    It can be cleared by cgroup reclaim or kswapd.
         * 2. LRUVEC_NODE_CONGESTED is set by kswapd node-level reclaim.
         *    It can only be cleared by kswapd.
         *
         * Essentially, kswapd can unthrottle an lruvec throttled by cgroup
         * reclaim, but not vice versa. This only applies to the root cgroup.
         * The goal is to prevent cgroup reclaim on the root cgroup (e.g.
         * memory.reclaim) to unthrottle an unbalanced node (that was throttled
         * by kswapd).
         */
        LRUVEC_CGROUP_CONGESTED,
        LRUVEC_NODE_CONGESTED,
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Evictable folios are divided into multiple generations. The youngest and the
 * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
 * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
 * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
 * corresponding generation. The gen counter in folio->flags stores gen+1 while
 * a folio is on one of lrugen->folios[]. Otherwise it stores 0.
 *
 * After a folio is faulted in, the aging needs to check the accessed bit at
 * least twice before handing this folio over to the eviction. The first check
 * clears the accessed bit from the initial fault; the second check makes sure
 * this folio hasn't been used since then. This process, AKA second chance,
 * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI
 * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two
 * generations are considered active; the rest of generations, if they exist,
 * are considered inactive. See lru_gen_is_active().
 *
 * PG_active is always cleared while a folio is on one of lrugen->folios[] so
 * that the sliding window need not worry about it. And it's set again when
 * a folio considered active is isolated for non-reclaiming purposes, e.g.,
 * migration. See lru_gen_add_folio() and lru_gen_del_folio().
 *
 * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
 * in folio->flags, masked by LRU_GEN_MASK.
 */
/* Lower and upper bounds of the generation window; both are unsigned. */
#define MIN_NR_GENS                2U
#define MAX_NR_GENS                4U

/*
 * Each generation is divided into multiple tiers. A folio accessed N times
 * through file descriptors is in tier order_base_2(N). A folio in the first
 * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page
 * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by
 * PG_workingset. A folio in any other tier (1<N<5) between the first and last
 * is marked by additional bits of LRU_REFS_WIDTH in folio->flags.
 *
 * In contrast to moving across generations which requires the LRU lock, moving
 * across tiers only involves atomic operations on folio->flags and therefore
 * has a negligible cost in the buffered access path. In the eviction path,
 * comparisons of refaulted/(evicted+protected) from the first tier and the rest
 * infer whether folios accessed multiple times through file descriptors are
 * statistically hot and thus worth protecting.
 *
 * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the
 * number of categories of the active/inactive LRU when keeping track of
 * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in
 * folio->flags, masked by LRU_REFS_MASK.
 */
/* Number of tiers per generation; sizes the per-tier arrays in lru_gen_folio. */
#define MAX_NR_TIERS                4U

#ifndef __GENERATING_BOUNDS_H

/*
 * In-place masks for the generation and refs fields of folio->flags: a run of
 * LRU_GEN_WIDTH (resp. LRU_REFS_WIDTH) set bits, shifted up to the field's
 * offset LRU_GEN_PGOFF (resp. LRU_REFS_PGOFF).
 */
#define LRU_GEN_MASK                ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK                ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

/*
 * For folios accessed multiple times through file descriptors,
 * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags
 * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its
 * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily
 * promoted into the second oldest generation in the eviction path. And when
 * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
 * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is
 * only valid when PG_referenced is set.
 *
 * For folios accessed multiple times through page tables, folio_update_gen()
 * from a page table walk or lru_gen_set_refs() from a rmap walk sets
 * PG_referenced after the accessed bit is cleared for the first time.
 * Thereafter, those two paths set PG_workingset and promote folios to the
 * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears
 * PG_referenced. Note that for this case, LRU_REFS_MASK is not used.
 *
 * For both cases above, after PG_workingset is set on a folio, it remains until
 * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It
 * can be set again if lru_gen_test_recent() returns true upon a refault.
 */
/* The LRU_REFS_MASK bits together with the PG_referenced bit. */
#define LRU_REFS_FLAGS                (LRU_REFS_MASK | BIT(PG_referenced))

/*
 * Forward declarations: the multi-gen LRU declarations that follow only use
 * pointers to these types.
 */
struct lruvec;
struct page_vma_mapped_walk;

#ifdef CONFIG_LRU_GEN

/*
 * NOTE(review): presumably the multi-gen LRU type indices (anon = 0, file = 1)
 * matching the [ANON_AND_FILE] array dimension; confirm against the users.
 */
enum {
        LRU_GEN_ANON,
        LRU_GEN_FILE,
};

/*
 * NOTE(review): going by NR_LRU_GEN_CAPS, these enumerate multi-gen LRU
 * capabilities; how they are consumed is not visible in this section.
 * NR_LRU_GEN_CAPS is the number of entries and must stay last.
 */
enum {
        LRU_GEN_CORE,
        LRU_GEN_MM_WALK,
        LRU_GEN_NONLEAF_YOUNG,
        NR_LRU_GEN_CAPS
};

/* Batch size bounds: the maximum is 64 times the minimum (BITS_PER_LONG). */
#define MIN_LRU_BATCH                BITS_PER_LONG
#define MAX_LRU_BATCH                (MIN_LRU_BATCH * 64)

/*
 * whether to keep historical stats from evicted generations: with
 * CONFIG_LRU_GEN_STATS the history arrays get MAX_NR_GENS slots, otherwise a
 * single slot
 */
#ifdef CONFIG_LRU_GEN_STATS
#define NR_HIST_GENS                MAX_NR_GENS
#else
#define NR_HIST_GENS                1U
#endif

/*
 * The youngest generation number is stored in max_seq for both anon and file
 * types as they are aged on an equal footing. The oldest generation numbers are
 * stored in min_seq[] separately for anon and file types so that they can be
 * incremented independently. Ideally min_seq[] are kept in sync when both anon
 * and file types are evictable. However, to adapt to situations like extreme
 * swappiness, they are allowed to be out of sync by at most
 * MAX_NR_GENS-MIN_NR_GENS-1.
 *
 * The number of pages in each generation is eventually consistent and therefore
 * can be transiently negative when reset_batch_size() is pending.
 */
struct lru_gen_folio {
        /* the aging increments the youngest generation number */
        unsigned long max_seq;
        /* the eviction increments the oldest generation numbers; one per type */
        unsigned long min_seq[ANON_AND_FILE];
        /* the birth time of each generation in jiffies */
        unsigned long timestamps[MAX_NR_GENS];
        /*
         * the multi-gen LRU lists, lazily sorted on eviction;
         * indexed by [gen][type][zone]
         */
        struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /*
         * the multi-gen LRU sizes, eventually consistent (signed because they
         * can be transiently negative); indexed by [gen][type][zone]
         */
        long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* the exponential moving average of refaulted; indexed by [type][tier] */
        unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
        /* the exponential moving average of evicted+protected; [type][tier] */
        unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
        /*
         * can only be modified under the LRU lock;
         * indexed by [hist gen][type][tier], like the two arrays below
         */
        unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* can be modified without holding the LRU lock */
        atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
        /* whether the multi-gen LRU is enabled */
        bool enabled;
        /* the memcg generation this lru_gen_folio belongs to */
        u8 gen;
        /* the list segment this lru_gen_folio belongs to */
        u8 seg;
        /* per-node lru_gen_folio list for global reclaim */
        struct hlist_nulls_node list;
};

/* Indices into the mm stats arrays (lru_gen_mm_state::stats[][], mm_stats[]). */
enum {
        MM_LEAF_TOTAL,                /* total leaf entries */
        MM_LEAF_YOUNG,                /* young leaf entries */
        MM_NONLEAF_FOUND,        /* non-leaf entries found in Bloom filters */
        MM_NONLEAF_ADDED,        /* non-leaf entries added to Bloom filters */
        NR_MM_STATS                /* number of stats above; must stay last */
};

/* double-buffering Bloom filters; sizes lru_gen_mm_state::filters[] */
#define NR_BLOOM_FILTERS        2

/*
 * Iteration state over lru_gen_mm_list. struct lruvec embeds one of these as
 * mm_state when CONFIG_LRU_GEN_WALKS_MMU is set.
 */
struct lru_gen_mm_state {
        /* synced with max_seq after each iteration */
        unsigned long seq;
        /* where the current iteration continues after */
        struct list_head *head;
        /* where the last iteration ended before */
        struct list_head *tail;
        /* Bloom filters flip after each iteration */
        unsigned long *filters[NR_BLOOM_FILTERS];
        /* the mm stats for debugging; indexed by [hist gen][stat] */
        unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
};

/*
 * Working state of one mm walk: which lruvec it is for, how far it has got,
 * and counters that are batched up during the walk.
 */
struct lru_gen_mm_walk {
        /* the lruvec under reclaim */
        struct lruvec *lruvec;
        /* max_seq from lru_gen_folio: can be out of date */
        unsigned long seq;
        /* the next address within an mm to scan */
        unsigned long next_addr;
        /* to batch promoted pages; indexed by [gen][type][zone] */
        int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
        /* to batch the mm stats; indexed by the MM_* stat enumerators */
        int mm_stats[NR_MM_STATS];
        /* total batched items */
        int batched;
        /* NOTE(review): presumably the swappiness in effect for this walk - confirm */
        int swappiness;
        /* NOTE(review): meaning not visible in this section - confirm with the walker */
        bool force_scan;
};

/*
 * For each node, memcgs are divided into two generations: the old and the
 * young. For each generation, memcgs are randomly sharded into multiple bins
 * to improve scalability. For each bin, the hlist_nulls is virtually divided
 * into three segments: the head, the tail and the default.
 *
 * An onlining memcg is added to the tail of a random bin in the old generation.
 * The eviction starts at the head of a random bin in the old generation. The
 * per-node memcg generation counter, whose remainder (mod MEMCG_NR_GENS) indexes
 * the old generation, is incremented when all its bins become empty.
 *
 * There are four operations:
 * 1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in its
 *    current generation (old or young) and updates its "seg" to "head";
 * 2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in its
 *    current generation (old or young) and updates its "seg" to "tail";
 * 3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in the old
 *    generation, updates its "gen" to "old" and resets its "seg" to "default";
 * 4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin in the
 *    young generation, updates its "gen" to "young" and resets its "seg" to
 *    "default".
 *
 * The events that trigger the above operations are:
 * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
 * 2. The first attempt to reclaim a memcg below low, which triggers
 *    MEMCG_LRU_TAIL;
 * 3. The first attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_TAIL;
 * 4. The second attempt to reclaim a memcg offlined or below reclaimable size
 *    threshold, which triggers MEMCG_LRU_YOUNG;
 * 5. Attempting to reclaim a memcg below min, which triggers MEMCG_LRU_YOUNG;
 * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
 * 7. Offlining a memcg, which triggers MEMCG_LRU_OLD.
 *
 * Notes:
 * 1. Memcg LRU only applies to global reclaim, and the round-robin incrementing
 *    of their max_seq counters ensures the eventual fairness to all eligible
 *    memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
 * 2. There are only two valid generations: old (seq) and young (seq+1).
 *    MEMCG_NR_GENS is set to three so that when reading the generation counter
 *    locklessly, a stale value (seq-1) does not wraparound to young.
 */
/* Generation slots: one more than the two valid generations (old and young). */
#define MEMCG_NR_GENS        3
/* Number of bins each generation is sharded into. */
#define MEMCG_NR_BINS        8

/*
 * Memcg LRU bookkeeping: a generation counter, a per-generation count and the
 * binned per-generation lists, all guarded by one spinlock.
 */
struct lru_gen_memcg {
        /* the per-node memcg generation counter */
        unsigned long seq;
        /* each memcg has one lru_gen_folio per node; indexed by generation slot */
        unsigned long nr_memcgs[MEMCG_NR_GENS];
        /* per-node lru_gen_folio list for global reclaim; indexed by [gen][bin] */
        struct hlist_nulls_head        fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
        /* protects the above */
        spinlock_t lock;
};

/*
 * Multi-gen LRU entry points, declared here and defined outside this section.
 * The #else branch below provides inline no-op stubs with the same signatures
 * for builds without CONFIG_LRU_GEN.
 */
void lru_gen_init_pgdat(struct pglist_data *pgdat);
void lru_gen_init_lruvec(struct lruvec *lruvec);
bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

/* memcg lifecycle hooks */
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
void lru_gen_online_memcg(struct mem_cgroup *memcg);
void lru_gen_offline_memcg(struct mem_cgroup *memcg);
void lru_gen_release_memcg(struct mem_cgroup *memcg);
void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid);

#else /* !CONFIG_LRU_GEN */

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
{
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}

/* Stub used when CONFIG_LRU_GEN is not set: does nothing, always returns false. */
static inline bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
{
        return false;
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
{
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
{
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
{
}

/* No-op stub used when CONFIG_LRU_GEN is not set. */
static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
{
}

#endif /* CONFIG_LRU_GEN */

/*
 * A set of LRU lists together with the reclaim bookkeeping that belongs to
 * them. Optional members are compiled in according to CONFIG_LRU_GEN,
 * CONFIG_LRU_GEN_WALKS_MMU and CONFIG_MEMCG.
 */
struct lruvec {
        /* one list per enum lru_list value */
        struct list_head                lists[NR_LRU_LISTS];
        /* per lruvec lru_lock for memcg */
        spinlock_t                        lru_lock;
        /*
         * These track the cost of reclaiming one LRU - file or anon -
         * over the other. As the observed cost of reclaiming one LRU
         * increases, the reclaim scan balance tips toward the other.
         */
        unsigned long                        anon_cost;
        unsigned long                        file_cost;
        /* Non-resident age, driven by LRU movement */
        atomic_long_t                        nonresident_age;
        /* Refaults at the time of last reclaim cycle */
        unsigned long                        refaults[ANON_AND_FILE];
        /* Various lruvec state flags (enum lruvec_flags) */
        unsigned long                        flags;
#ifdef CONFIG_LRU_GEN
        /* evictable pages divided into generations */
        struct lru_gen_folio                lrugen;
#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* to concurrently iterate lru_gen_mm_list */
        struct lru_gen_mm_state                mm_state;
#endif
#endif /* CONFIG_LRU_GEN */
#ifdef CONFIG_MEMCG
        /* NOTE(review): presumably the node this lruvec belongs to - confirm */
        struct pglist_data *pgdat;
#endif
        /* zswap's per-lruvec state; the type is defined outside this section */
        struct zswap_lruvec_state zswap_lruvec_state;
};

/* Isolate for asynchronous migration */
#define ISOLATE_ASYNC_MIGRATE        ((__force isolate_mode_t)0x4)
/* Isolate unevictable pages */
#define ISOLATE_UNEVICTABLE        ((__force isolate_mode_t)0x8)

/*
 * LRU Isolation modes.
 *
 * The ISOLATE_* macros above cast to this type before it is declared; that is
 * fine because a macro body is only compiled where the macro is expanded.
 */
typedef unsigned __bitwise isolate_mode_t;

/*
 * Indices into zone->_watermark[] (see wmark_pages() and its wrappers).
 * NR_WMARK is the number of watermarks and sizes that array.
 */
enum zone_watermarks {
        WMARK_MIN,
        WMARK_LOW,
        WMARK_HIGH,
        WMARK_PROMO,
        NR_WMARK
};

/*
 * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. Two additional lists
 * are added for THP. One PCP list is used by GFP_MOVABLE, and the other PCP list
 * is used by GFP_UNMOVABLE and GFP_RECLAIMABLE.
 */
/* Extra PCP lists for THP: two when THP is configured, none otherwise. */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define NR_PCP_THP 2
#else
#define NR_PCP_THP 0
#endif
/* MIGRATE_PCPTYPES lists for each of the orders 0..PAGE_ALLOC_COSTLY_ORDER. */
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
/* Total number of PCP lists; sizes per_cpu_pages::lists[]. */
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

/*
 * Flags used in pcp->flags field.
 *
 * PCPF_PREV_FREE_HIGH_ORDER: a high-order page was freed by the
 * previous page free.  Used to avoid draining the PCP because of a
 * single, accidental high-order page free.
 *
 * PCPF_FREE_HIGH_BATCH: preserve "pcp->batch" pages in the PCP before
 * draining it for consecutive high-order page frees without any
 * allocation, if the data cache slice of the CPU is large enough.  This
 * reduces zone lock contention and keeps cache-hot pages being reused.
 */
#define        PCPF_PREV_FREE_HIGH_ORDER        BIT(0)
#define        PCPF_FREE_HIGH_BATCH                BIT(1)

/*
 * Per-CPU page lists of a zone; struct zone refers to it through the __percpu
 * pointer per_cpu_pageset.
 */
struct per_cpu_pages {
        spinlock_t lock;        /* Protects lists field */
        int count;                /* number of pages in the list */
        int high;                /* high watermark, emptying needed */
        int high_min;                /* min high watermark */
        int high_max;                /* max high watermark */
        int batch;                /* chunk size for buddy add/remove */
        u8 flags;                /* PCPF_* flags; protected by pcp->lock */
        u8 alloc_factor;        /* batch scaling factor during allocate */
#ifdef CONFIG_NUMA
        u8 expire;                /* When 0, remote pagesets are drained */
#endif
        short free_count;        /* consecutive free count */

        /* Lists of pages, one per migrate type stored on the pcp-lists */
        struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;

/*
 * Per-CPU zone statistics state; struct zone refers to it through the __percpu
 * pointer per_cpu_zonestats. Every member is conditional, so the struct is
 * empty when neither CONFIG_SMP nor CONFIG_NUMA is set.
 */
struct per_cpu_zonestat {
#ifdef CONFIG_SMP
        /*
         * NOTE(review): presumably small per-CPU deltas and the threshold at
         * which they are folded into zone->vm_stat[] - confirm in mm/vmstat.c
         */
        s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
        s8 stat_threshold;
#endif
#ifdef CONFIG_NUMA
        /*
         * Low priority inaccurate counters that are only folded
         * on demand. Use a large type to avoid the overhead of
         * folding during refresh_cpu_vm_stats.
         */
        unsigned long vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
#endif
};

/*
 * NOTE(review): by its name and members, the node-level analogue of
 * struct per_cpu_zonestat; its users are outside this section. Unlike the
 * zone variant, its members are not guarded by CONFIG_SMP.
 */
struct per_cpu_nodestat {
        s8 stat_threshold;
        s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS];
};

#endif /* !__GENERATING_BOUNDS_H */

/*
 * Zone types, in enumeration order. ZONE_NORMAL and ZONE_MOVABLE always
 * exist; ZONE_DMA, ZONE_DMA32, ZONE_HIGHMEM and ZONE_DEVICE are only defined
 * when the corresponding config option is set, so the numeric values depend
 * on the configuration.
 */
enum zone_type {
        /*
         * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able
         * to DMA to all of the addressable memory (ZONE_NORMAL).
         * On architectures where this area covers the whole 32 bit address
         * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller
         * DMA addressing constraints. This distinction is important as a 32bit
         * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit
         * platforms may need both zones as they support peripherals with
         * different DMA addressing limitations.
         */
#ifdef CONFIG_ZONE_DMA
        ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
        ZONE_DMA32,
#endif
        /*
         * Normal addressable memory is in ZONE_NORMAL. DMA operations can be
         * performed on pages in ZONE_NORMAL if the DMA devices support
         * transfers to all addressable memory.
         */
        ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
        /*
         * A memory area that is only addressable by the kernel through
         * mapping portions into its own address space. This is for example
         * used by i386 to allow the kernel to address the memory beyond
         * 900MB. The kernel will set up special mappings (page
         * table entries on i386) for each page that the kernel needs to
         * access.
         */
        ZONE_HIGHMEM,
#endif
        /*
         * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains
         * movable pages with few exceptional cases described below. Main use
         * cases for ZONE_MOVABLE are to make memory offlining/unplug more
         * likely to succeed, and to locally limit unmovable allocations - e.g.,
         * to increase the number of THP/huge pages. Notable special cases are:
         *
         * 1. Pinned pages: (long-term) pinning of movable pages might
         *    essentially turn such pages unmovable. Therefore, we do not allow
         *    pinning long-term pages in ZONE_MOVABLE. When pages are pinned and
         *    faulted, they come from the right zone right away. However, it is
         *    still possible that address space already has pages in
         *    ZONE_MOVABLE at the time when pages are pinned (i.e. user has
         *    touched that memory before pinning). In such case we migrate them
         *    to a different zone. When migration fails - pinning fails.
         * 2. memblock allocations: kernelcore/movablecore setups might create
         *    situations where ZONE_MOVABLE contains unmovable allocations
         *    after boot. Memory offlining and allocations fail early.
         * 3. Memory holes: kernelcore/movablecore setups might create very rare
         *    situations where ZONE_MOVABLE contains memory holes after boot,
         *    for example, if we have sections that are only partially
         *    populated. Memory offlining and allocations fail early.
         * 4. PG_hwpoison pages: while poisoned pages can be skipped during
         *    memory offlining, such pages cannot be allocated.
         * 5. Unmovable PG_offline pages: in paravirtualized environments,
         *    hotplugged memory blocks might only partially be managed by the
         *    buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The
         *    parts not managed by the buddy are unmovable PG_offline pages. In
         *    some cases (virtio-mem), such pages can be skipped during
         *    memory offlining, however, cannot be moved/allocated. These
         *    techniques might use alloc_contig_range() to hide previously
         *    exposed pages from the buddy again (e.g., to implement some sort
         *    of memory unplug in virtio-mem).
         * 6. ZERO_PAGE(0), kernelcore/movablecore setups might create
         *    situations where ZERO_PAGE(0) which is allocated differently
         *    on different platforms may end up in a movable zone. ZERO_PAGE(0)
         *    cannot be migrated.
         * 7. Memory-hotplug: when using memmap_on_memory and onlining the
         *    memory to the MOVABLE zone, the vmemmap pages are also placed in
         *    such zone. Such pages cannot be really moved around as they are
         *    self-stored in the range, but they are treated as movable when
         *    the range they describe is about to be offlined.
         *
         * In general, no unmovable allocations that degrade memory offlining
         * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range())
         * have to expect that migrating pages in ZONE_MOVABLE can fail (even
         * if has_unmovable_pages() states that there are no unmovable pages,
         * there can be false negatives).
         */
        ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
        ZONE_DEVICE,
#endif
        /* number of zone types compiled in; not a zone itself, must stay last */
        __MAX_NR_ZONES

};

#ifndef __GENERATING_BOUNDS_H

/* Number of compaction modes; sizes zone::compact_cached_migrate_pfn[]. */
#define ASYNC_AND_SYNC 2

/*
 * Per-zone state. Members are grouped by access pattern and the groups are
 * separated with CACHELINE_PADDING(): read-mostly fields first, then fields
 * written heavily by the page allocator, then fields written by compaction and
 * vmstats, and finally the statistics counters.
 */
struct zone {
        /* Read-mostly fields */

        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long _watermark[NR_WMARK];
        /* added to every watermark by wmark_pages() */
        unsigned long watermark_boost;

        unsigned long nr_reserved_highatomic;
        unsigned long nr_free_highatomic;

        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones).  This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         */
        long lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
        int node;
#endif
        struct pglist_data        *zone_pgdat;
        struct per_cpu_pages        __percpu *per_cpu_pageset;
        struct per_cpu_zonestat        __percpu *per_cpu_zonestats;
        /*
         * the high and batch values are copied to individual pagesets for
         * faster access
         */
        int pageset_high_min;
        int pageset_high_max;
        int pageset_batch;

#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long                *pageblock_flags;
#endif /* !CONFIG_SPARSEMEM */

        /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
        unsigned long                zone_start_pfn;

        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *         spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *        present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * present_early_pages is present pages existing within the zone
         * located on memory available since early boot, excluding hotplugged
         * memory.
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *        managed_pages = present_pages - reserved_pages;
         *
         * cma pages is present pages that are assigned for CMA use
         * (MIGRATE_CMA).
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path.  But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock.  It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/done(). Any reader who can't tolerate drift of
         * present_pages should use get_online_mems() to get a stable value.
         */
        atomic_long_t                managed_pages;
        unsigned long                spanned_pages;
        unsigned long                present_pages;
#if defined(CONFIG_MEMORY_HOTPLUG)
        unsigned long                present_early_pages;
#endif
#ifdef CONFIG_CMA
        unsigned long                cma_pages;
#endif

        const char                *name;

#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long                nr_isolate_pageblock;
#endif

#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t                span_seqlock;
#endif

        /* tested by zone_is_initialized() */
        int initialized;

        /* Write-intensive fields used from the page allocator */
        CACHELINE_PADDING(_pad1_);

        /* free areas of different sizes */
        struct free_area        free_area[NR_PAGE_ORDERS];

#ifdef CONFIG_UNACCEPTED_MEMORY
        /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */
        struct list_head        unaccepted_pages;

        /* To be called once the last page in the zone is accepted */
        struct work_struct        unaccepted_cleanup;
#endif

        /* zone flags, see below */
        unsigned long                flags;

        /* Primarily protects free_area */
        spinlock_t                lock;

        /* Pages to be freed when next trylock succeeds */
        struct llist_head        trylock_free_pages;

        /* Write-intensive fields used by compaction and vmstats. */
        CACHELINE_PADDING(_pad2_);

        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached
         */
        unsigned long percpu_drift_mark;

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long                compact_cached_free_pfn;
        /* pfn where compaction migration scanner should start */
        unsigned long                compact_cached_migrate_pfn[ASYNC_AND_SYNC];
        unsigned long                compact_init_migrate_pfn;
        unsigned long                compact_init_free_pfn;
#endif

#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         * compact_order_failed is the minimum compaction failed order.
         */
        unsigned int                compact_considered;
        unsigned int                compact_defer_shift;
        int                        compact_order_failed;
#endif

#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool                        compact_blockskip_flush;
#endif

        bool                        contiguous;

        CACHELINE_PADDING(_pad3_);
        /* Zone statistics */
        atomic_long_t                vm_stat[NR_VM_ZONE_STAT_ITEMS];
        atomic_long_t                vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];
} ____cacheline_internodealigned_in_smp;

/*
 * NOTE(review): presumably bit numbers for a flags word in struct pglist_data;
 * that field is outside this section, so confirm there.
 */
enum pgdat_flags {
        PGDAT_DIRTY,                        /* reclaim scanning has recently found
                                         * many dirty file pages at the tail
                                         * of the LRU.
                                         */
        PGDAT_WRITEBACK,                /* reclaim scanning has recently found
                                         * many pages under writeback
                                         */
        PGDAT_RECLAIM_LOCKED,                /* prevents concurrent reclaim */
};

/*
 * Flags for zone->flags.
 * NOTE(review): the enumerators start at 0, so they are presumably bit numbers
 * rather than masks; confirm against the users.
 */
enum zone_flags {
        ZONE_BOOSTED_WATERMARK,                /* zone recently boosted watermarks.
                                         * Cleared when kswapd is woken.
                                         */
        ZONE_RECLAIM_ACTIVE,                /* kswapd may be scanning the zone. */
        ZONE_BELOW_HIGH,                /* zone is below high watermark. */
};

/*
 * Return watermark @w of zone @z with the zone's current watermark boost
 * added on top.
 */
static inline unsigned long wmark_pages(const struct zone *z,
                                        enum zone_watermarks w)
{
        const unsigned long base = z->_watermark[w];

        return base + z->watermark_boost;
}

static inline unsigned long min_wmark_pages(const struct zone *z)
{
        return wmark_pages(z, WMARK_MIN);
}

static inline unsigned long low_wmark_pages(const struct zone *z)
{
        return wmark_pages(z, WMARK_LOW);
}

static inline unsigned long high_wmark_pages(const struct zone *z)
{
        return wmark_pages(z, WMARK_HIGH);
}

static inline unsigned long promo_wmark_pages(const struct zone *z)
{
        return wmark_pages(z, WMARK_PROMO);
}

static inline unsigned long zone_managed_pages(struct zone *zone)
{
        return (unsigned long)atomic_long_read(&zone->managed_pages);
}

/*
 * Return @zone's cma_pages count, or 0 when CONFIG_CMA is not set (the field
 * does not exist in that configuration).
 *
 * @zone is only read, so it is taken as a pointer to const, matching
 * zone_end_pfn() and zone_spans_pfn(); callers holding a non-const pointer
 * are unaffected.
 */
static inline unsigned long zone_cma_pages(const struct zone *zone)
{
#ifdef CONFIG_CMA
        return zone->cma_pages;
#else
        return 0;
#endif
}

/*
 * Return the pfn one past the end of @zone's span, i.e. zone_start_pfn plus
 * spanned_pages.
 */
static inline unsigned long zone_end_pfn(const struct zone *zone)
{
        const unsigned long first_pfn = zone->zone_start_pfn;
        const unsigned long nr_spanned = zone->spanned_pages;

        return first_pfn + nr_spanned;
}

/*
 * Return true if @pfn lies within @zone's span, i.e. in the half-open range
 * [zone_start_pfn, zone_end_pfn(zone)).
 */
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
        if (pfn < zone->zone_start_pfn)
                return false;

        return pfn < zone_end_pfn(zone);
}

static inline bool zone_is_initialized(struct zone *zone)
{
        return zone->initialized;
}

static inline bool zone_is_empty(struct zone *zone)
{
        return zone->spanned_pages == 0;
}

#ifndef BUILD_VDSO32_64
/*
 * The zone field is never updated after free_area_init_core()
 * sets it, so none of the operations on it need to be atomic.
 */

/*
 * Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS |
 *
 * Bit offsets of the fields packed into the upper end of the flags word.
 * The first field starts at (bits in an unsigned long) - SECTIONS_WIDTH and
 * every following offset is the previous offset minus that field's own
 * width, so the fields are laid out back to back from the top bit down.
 * A field whose width is 0 takes no space and shares the next offset.
 */
#define SECTIONS_PGOFF                ((sizeof(unsigned long)*8) - SECTIONS_WIDTH)
#define NODES_PGOFF                (SECTIONS_PGOFF - NODES_WIDTH)
#define ZONES_PGOFF                (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF        (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF                (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
#define LRU_GEN_PGOFF                (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
#define LRU_REFS_PGOFF                (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
 * Define the bit shifts to access each section.  For non-existent
 * sections we define the shift as 0; that plus a 0 mask ensures
 * the compiler will optimise away reference to them.
 *
 * Each shift is the field's offset multiplied by (WIDTH != 0), i.e. the
 * offset itself when the field exists and 0 when it does not.
 */
#define SECTIONS_PGSHIFT        (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0))
#define NODES_PGSHIFT                (NODES_PGOFF * (NODES_WIDTH != 0))
#define ZONES_PGSHIFT                (ZONES_PGOFF * (ZONES_WIDTH != 0))
#define LAST_CPUPID_PGSHIFT        (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0))
#define KASAN_TAG_PGSHIFT        (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0))

/*
 * NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator.
 *
 * ZONEID_SHIFT is the combined width of the two fields making up the ID and
 * ZONEID_PGOFF is the lower of their two bit offsets. When
 * NODE_NOT_IN_PAGE_FLAGS is defined the section field takes the place of the
 * node field.
 */
#ifdef NODE_NOT_IN_PAGE_FLAGS
#define ZONEID_SHIFT                (SECTIONS_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((SECTIONS_PGOFF < ZONES_PGOFF) ? \
                                                SECTIONS_PGOFF : ZONES_PGOFF)
#else
#define ZONEID_SHIFT                (NODES_SHIFT + ZONES_SHIFT)
#define ZONEID_PGOFF                ((NODES_PGOFF < ZONES_PGOFF) ? \
                                                NODES_PGOFF : ZONES_PGOFF)
#endif

/* As for the other *_PGSHIFT values: 0 when the zone ID occupies no bits. */
#define ZONEID_PGSHIFT                (ZONEID_PGOFF * (ZONEID_SHIFT != 0))

/*
 * Right-aligned masks for each field: apply after shifting the flags word
 * down by the matching *_PGSHIFT (see page_zonenum()). A zero-width field
 * yields a mask of 0.
 *
 * NOTE(review): LAST_CPUPID_MASK is built from LAST_CPUPID_SHIFT while the
 * neighbouring masks use *_WIDTH; presumably intentional - confirm before
 * changing.
 */
#define ZONES_MASK                ((1UL << ZONES_WIDTH) - 1)
#define NODES_MASK                ((1UL << NODES_WIDTH) - 1)
#define SECTIONS_MASK                ((1UL << SECTIONS_WIDTH) - 1)
#define LAST_CPUPID_MASK        ((1UL << LAST_CPUPID_SHIFT) - 1)
#define KASAN_TAG_MASK                ((1UL << KASAN_TAG_WIDTH) - 1)
#define ZONEID_MASK                ((1UL << ZONEID_SHIFT) - 1)

/* Extract the zone number stored in the ZONES field of @page's flags. */
static inline enum zone_type page_zonenum(const struct page *page)
{
        unsigned long shifted;

        ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT);

        /* Bring the ZONES field down to bit 0, then drop everything above it. */
        shifted = page->flags >> ZONES_PGSHIFT;
        return shifted & ZONES_MASK;
}

/* Zone number of @folio, taken from the struct page embedded in it. */
static inline enum zone_type folio_zonenum(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return page_zonenum(page);
}

#ifdef CONFIG_ZONE_DEVICE
static inline bool is_zone_device_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_DEVICE;
}

static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
        VM_WARN_ON_ONCE_PAGE(!is_zone_device_page(page), page);
        return page_folio(page)->pgmap;
}

/*
 * Consecutive zone device pages should not be merged into the same sgl
 * or bvec segment with other types of pages or if they belong to different
 * pgmaps. Otherwise getting the pgmap of a given segment is not possible
 * without scanning the entire segment. This helper returns true either if
 * both pages are not zone device pages or both pages are zone device pages
 * with the same pgmap.
 */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        const bool a_is_dev = is_zone_device_page(a);
        const bool b_is_dev = is_zone_device_page(b);

        /* One device page and one ordinary page never match. */
        if (a_is_dev != b_is_dev)
                return false;

        /* Both are device pages: they match only if the pgmap is shared. */
        if (a_is_dev)
                return page_pgmap(a) == page_pgmap(b);

        /* Neither is a device page. */
        return true;
}

extern void memmap_init_zone_device(struct zone *, unsigned long,
                                    unsigned long, struct dev_pagemap *);
#else
/*
 * !CONFIG_ZONE_DEVICE stubs: none of these look at their arguments.
 */

/* No page is reported as a zone device page in this configuration. */
static inline bool is_zone_device_page(const struct page *page)
{
        return false;
}
/* Always true: the pgmap constraint never applies here. */
static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
                                                     const struct page *b)
{
        return true;
}
/* There is no pgmap to return; callers always get NULL. */
static inline struct dev_pagemap *page_pgmap(const struct page *page)
{
        return NULL;
}
#endif

/* Folio flavour of is_zone_device_page(), applied to the embedded page. */
static inline bool folio_is_zone_device(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_zone_device_page(page);
}

static inline bool is_zone_movable_page(const struct page *page)
{
        return page_zonenum(page) == ZONE_MOVABLE;
}

static inline bool folio_is_zone_movable(const struct folio *folio)
{
        return folio_zonenum(folio) == ZONE_MOVABLE;
}
#endif

/*
 * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty
 * intersection with the given zone
 */
static inline bool zone_intersects(struct zone *zone,
                unsigned long start_pfn, unsigned long nr_pages)
{
        /*
         * The ranges overlap only if the zone spans something, the range
         * starts before the zone ends, and the range ends after the zone
         * starts.
         */
        return !zone_is_empty(zone) &&
               start_pfn < zone_end_pfn(zone) &&
               start_pfn + nr_pages > zone->zone_start_pfn;
}

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12

/* Maximum number of zones on a zonelist */
#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)

/* Indices into pglist_data::node_zonelists. */
enum {
        ZONELIST_FALLBACK,        /* zonelist with fallback */
#ifdef CONFIG_NUMA
        /*
         * The NUMA zonelists are doubled because we need zonelists that
         * restrict the allocations to a single node for __GFP_THISNODE.
         */
        ZONELIST_NOFALLBACK,        /* zonelist without fallback (__GFP_THISNODE) */
#endif
        MAX_ZONELISTS                /* number of zonelists kept per node */
};

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */
/* One zonelist entry: a zone pointer with that zone's index cached next to it. */
struct zoneref {
        struct zone *zone;        /* Pointer to actual zone */
        int zone_idx;                /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()        - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()        - Return the index of the zone for an entry
 * zonelist_node_idx()        - Return the index of the node for an entry
 */
struct zonelist {
        /*
         * Room for every zone of every node plus one extra entry.
         * NOTE(review): the extra slot is presumably the terminating entry
         * with a NULL zone that the for_each_zone_zonelist*() iterators stop
         * on - confirm against the code that builds zonelists.
         */
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

/*
 * The array of struct pages for flatmem.
 * It must be declared for SPARSEMEM as well because there are configurations
 * that rely on that.
 */
extern struct page *mem_map;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * A lock-protected list with an element count. Embedded in pglist_data as
 * deferred_split_queue when CONFIG_TRANSPARENT_HUGEPAGE is enabled.
 * NOTE(review): presumably queues huge pages whose split was deferred -
 * confirm against the THP code.
 */
struct deferred_split {
        spinlock_t split_queue_lock;        /* presumably guards the two fields below */
        struct list_head split_queue;
        unsigned long split_queue_len;        /* presumably the length of split_queue */
};
#endif

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Per NUMA node memory failure handling statistics.
 */
struct memory_failure_stats {
        /*
         * Number of raw pages poisoned.
         * Cases not accounted: memory outside kernel control, offline page,
         * arch-specific memory_failure (SGX), hwpoison_filter() filtered
         * error events, and unpoison actions from hwpoison_unpoison.
         */
        unsigned long total;
        /*
         * Recovery results of poisoned raw pages handled by memory_failure,
         * in sync with mf_result.
         * total = ignored + failed + delayed + recovered.
         * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
         */
        unsigned long ignored;
        unsigned long failed;
        unsigned long delayed;
        unsigned long recovered;
};
#endif

/*
 * On NUMA machines, each NUMA node has a pg_data_t to describe
 * its memory layout. On UMA machines there is a single pglist_data which
 * describes the whole memory.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
typedef struct pglist_data {
        /*
         * node_zones contains just the zones for THIS node. Not all of the
         * zones may be populated, but it is the full list. It is referenced by
         * this node's node_zonelists as well as other nodes' node_zonelists.
         */
        struct zone node_zones[MAX_NR_ZONES];

        /*
         * node_zonelists contains references to all zones in all nodes.
         * Generally the first zones will be references to this node's
         * node_zones.
         */
        struct zonelist node_zonelists[MAX_ZONELISTS];

        int nr_zones; /* number of populated zones in this node */
#ifdef CONFIG_FLATMEM        /* means !SPARSEMEM */
        struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT)
        /*
         * Must be held any time you expect node_start_pfn,
         * node_present_pages, node_spanned_pages or nr_zones to stay constant.
         * Also synchronizes pgdat->first_deferred_pfn during deferred page
         * init.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG
         * or CONFIG_DEFERRED_STRUCT_PAGE_INIT.
         *
         * Nests above zone->lock and zone->span_seqlock
         */
        spinlock_t node_size_lock;
#endif
        /* First pfn of the node; pgdat_end_pfn() adds node_spanned_pages. */
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages */
        unsigned long node_spanned_pages; /* total size of physical page
                                             range, including holes */
        int node_id;
        wait_queue_head_t kswapd_wait;
        wait_queue_head_t pfmemalloc_wait;

        /* Wait queues for throttling reclaim, one per throttle reason. */
        wait_queue_head_t reclaim_wait[NR_VMSCAN_THROTTLE];

        atomic_t nr_writeback_throttled;/* nr of writeback-throttled tasks */
        unsigned long nr_reclaim_start;        /* nr pages written while throttled
                                         * when throttling started. */
#ifdef CONFIG_MEMORY_HOTPLUG
        struct mutex kswapd_lock;
#endif
        struct task_struct *kswapd;        /* Protected by kswapd_lock */
        int kswapd_order;
        enum zone_type kswapd_highest_zoneidx;

        int kswapd_failures;                /* Number of 'reclaimed == 0' runs */

#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_highest_zoneidx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
        bool proactive_compact_trigger;
#endif
        /*
         * This is a per-node reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long                totalreserve_pages;

#ifdef CONFIG_NUMA
        /*
         * node reclaim becomes active if more unmapped pages exist.
         */
        unsigned long                min_unmapped_pages;
        unsigned long                min_slab_pages;
#endif /* CONFIG_NUMA */

        /* Write-intensive fields used by page reclaim */
        CACHELINE_PADDING(_pad1_);

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_NUMA_BALANCING
        /* start time in ms of current promote rate limit period */
        unsigned int nbp_rl_start;
        /* number of promote candidate pages at start time of current rate limit period */
        unsigned long nbp_rl_nr_cand;
        /* promote threshold in ms */
        unsigned int nbp_threshold;
        /* start time in ms of current promote threshold adjustment period */
        unsigned int nbp_th_start;
        /*
         * number of promote candidate pages at start time of current promote
         * threshold adjustment period
         */
        unsigned long nbp_th_nr_cand;
#endif
        /* Fields commonly accessed by the page reclaim scanner */

        /*
         * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED.
         *
         * Use mem_cgroup_lruvec() to look up lruvecs.
         *
         * Without CONFIG_MEMCG, lruvec_pgdat() recovers the pglist_data from
         * a pointer to this member via container_of().
         */
        struct lruvec                __lruvec;

        unsigned long                flags;

#ifdef CONFIG_LRU_GEN
        /* kswap mm walk data */
        struct lru_gen_mm_walk mm_walk;
        /* lru_gen_folio list */
        struct lru_gen_memcg memcg_lru;
#endif

        CACHELINE_PADDING(_pad2_);

        /* Per-node vmstats */
        struct per_cpu_nodestat __percpu *per_cpu_nodestats;
        atomic_long_t                vm_stat[NR_VM_NODE_STAT_ITEMS];
#ifdef CONFIG_NUMA
        struct memory_tier __rcu *memtier;
#endif
#ifdef CONFIG_MEMORY_FAILURE
        /* Per-node memory failure statistics; see struct memory_failure_stats. */
        struct memory_failure_stats mf_stats;
#endif
} pg_data_t;

/*
 * Convenience accessors taking a node id: each one looks the node up with
 * NODE_DATA(nid) and reads the pglist_data field of the same name.
 */
#define node_present_pages(nid)        (NODE_DATA(nid)->node_present_pages)
#define node_spanned_pages(nid)        (NODE_DATA(nid)->node_spanned_pages)

#define node_start_pfn(nid)        (NODE_DATA(nid)->node_start_pfn)
/* First pfn past the node's spanned range; see pgdat_end_pfn(). */
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))

/* First pfn past @pgdat's range: node_start_pfn + node_spanned_pages. */
static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
        const unsigned long start = pgdat->node_start_pfn;
        const unsigned long span = pgdat->node_spanned_pages;

        return start + span;
}

#include <linux/memory_hotplug.h>

void build_all_zonelists(pg_data_t *pgdat);
void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order,
                   enum zone_type highest_zoneidx);
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
                         int highest_zoneidx, unsigned int alloc_flags,
                         long free_pages);
bool zone_watermark_ok(struct zone *z, unsigned int order,
                unsigned long mark, int highest_zoneidx,
                unsigned int alloc_flags);
/*
 * Memory initialization context, use to differentiate memory added by
 * the platform statically or via memory hotplug interface.
 */
enum meminit_context {
        MEMINIT_EARLY,
        MEMINIT_HOTPLUG,
};

extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn,
                                     unsigned long size);

extern void lruvec_init(struct lruvec *lruvec);

/*
 * Find the node that @lruvec belongs to. With CONFIG_MEMCG the lruvec
 * carries a pgdat pointer; without it the lruvec is the __lruvec member of
 * a pglist_data, so the container is computed from the member address.
 */
static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec)
{
        struct pglist_data *pgdat;

#ifdef CONFIG_MEMCG
        pgdat = lruvec->pgdat;
#else
        pgdat = container_of(lruvec, struct pglist_data, __lruvec);
#endif
        return pgdat;
}

/*
 * local_memory_node - translate a node id into the node to take memory from
 * @node_id: node id to translate
 *
 * With CONFIG_HAVE_MEMORYLESS_NODES the implementation lives out of line.
 * Without it this is the identity mapping: @node_id is returned unchanged.
 */
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
int local_memory_node(int node_id);
#else
static inline int local_memory_node(int node_id)
{
        return node_id;
}
#endif

/*
 * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
 */
#define zone_idx(zone)                ((zone) - (zone)->zone_pgdat->node_zones)

#ifdef CONFIG_ZONE_DEVICE
/* True when @zone sits at index ZONE_DEVICE in its node's node_zones[]. */
static inline bool zone_is_zone_device(struct zone *zone)
{
        const bool is_device = (zone_idx(zone) == ZONE_DEVICE);

        return is_device;
}
#else
/* Without CONFIG_ZONE_DEVICE no zone is ever a device zone. */
static inline bool zone_is_zone_device(struct zone *zone)
{
        return false;
}
#endif

/*
 * Returns true if a zone has pages managed by the buddy allocator.
 * All the reclaim decisions have to use this function rather than
 * populated_zone(). If the whole zone is reserved then we can easily
 * end up with populated_zone() && !managed_zone().
 */
static inline bool managed_zone(struct zone *zone)
{
        /* Any non-zero managed page count means the zone is managed. */
        return zone_managed_pages(zone) != 0;
}

/* Returns true if a zone has memory */
static inline bool populated_zone(struct zone *zone)
{
        /* Populated means present_pages is non-zero. */
        return zone->present_pages != 0;
}

#ifdef CONFIG_NUMA
/* Return the node id stored in @zone. */
static inline int zone_to_nid(struct zone *zone)
{
        return zone->node;
}

/* Record @nid as the node id of @zone. */
static inline void zone_set_nid(struct zone *zone, int nid)
{
        zone->node = nid;
}
#else
/* !CONFIG_NUMA: every zone reports node 0. */
static inline int zone_to_nid(struct zone *zone)
{
        return 0;
}

/* !CONFIG_NUMA: nothing is stored, so setting the node id is a no-op. */
static inline void zone_set_nid(struct zone *zone, int nid) {}
#endif

extern int movable_zone;

/*
 * Return 1 when zone index @idx denotes highmem, 0 otherwise. With
 * CONFIG_HIGHMEM that is ZONE_HIGHMEM itself, or ZONE_MOVABLE while
 * movable_zone equals ZONE_HIGHMEM. Without CONFIG_HIGHMEM it is always 0.
 */
static inline int is_highmem_idx(enum zone_type idx)
{
#ifdef CONFIG_HIGHMEM
        if (idx == ZONE_HIGHMEM)
                return 1;

        return idx == ZONE_MOVABLE && movable_zone == ZONE_HIGHMEM;
#else
        return 0;
#endif
}

/**
 * is_highmem - helper function to quickly check if a struct zone is a
 *              highmem zone or not.  This is an attempt to keep references
 *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
 * @zone: pointer to struct zone variable
 * Return: 1 for a highmem zone, 0 otherwise
 */
static inline int is_highmem(struct zone *zone)
{
        return is_highmem_idx(zone_idx(zone));
}

/*
 * has_managed_dma() is implemented out of line when CONFIG_ZONE_DMA is set.
 * Without CONFIG_ZONE_DMA it always returns false.
 */
#ifdef CONFIG_ZONE_DMA
bool has_managed_dma(void);
#else
static inline bool has_managed_dma(void)
{
        return false;
}
#endif


#ifndef CONFIG_NUMA

extern struct pglist_data contig_page_data;
/*
 * Without NUMA there is a single pglist_data, contig_page_data. @nid is
 * ignored and that one descriptor is returned for every node id.
 */
static inline struct pglist_data *NODE_DATA(int nid)
{
        return &contig_page_data;
}

#else /* CONFIG_NUMA */

/* NOTE(review): NODE_DATA() is presumably supplied by this header - confirm. */
#include <asm/mmzone.h>

#endif /* !CONFIG_NUMA */

extern struct pglist_data *first_online_pgdat(void);
extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
extern struct zone *next_zone(struct zone *zone);

/**
 * for_each_online_pgdat - helper macro to iterate over all online nodes
 * @pgdat: pointer to a pg_data_t variable
 */
#define for_each_online_pgdat(pgdat)                        \
        for (pgdat = first_online_pgdat();                \
             pgdat;                                        \
             pgdat = next_online_pgdat(pgdat))
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone: pointer to struct zone variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in.
 */
#define for_each_zone(zone)                                \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))

/*
 * for_each_populated_zone - like for_each_zone(), but the loop body only
 * runs for zones where populated_zone() is true.
 *
 * The macro ends in a dangling "else", so the statement the caller writes
 * after it becomes the else-branch of the populated_zone() test. Zones that
 * are not populated take the empty "if" branch and the loop moves on.
 */
#define for_each_populated_zone(zone)                        \
        for (zone = (first_online_pgdat())->node_zones; \
             zone;                                        \
             zone = next_zone(zone))                        \
                if (!populated_zone(zone))                \
                        ; /* do nothing */                \
                else

/* Return the zone pointer held in @zoneref. */
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        struct zone *zone = zoneref->zone;

        return zone;
}

/* Return the zone index cached in @zoneref. */
static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
        const int idx = zoneref->zone_idx;

        return idx;
}

/* Return the node id of the zone that @zoneref points at. */
static inline int zonelist_node_idx(struct zoneref *zoneref)
{
        struct zone *zone = zoneref->zone;

        return zone_to_nid(zone);
}

struct zoneref *__next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes);

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z: The cursor used as a starting point for the search
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the next zone at or below a given zone index that is
 * within the allowed nodemask using a cursor as the starting point for the
 * search. The zoneref returned is a cursor that represents the current zone
 * being examined. It should be advanced by one before calling
 * next_zones_zonelist again.
 *
 * Return: the next zone at or below highest_zoneidx within the allowed
 * nodemask using a cursor within a zonelist as a starting point
 */
static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /*
         * Slow path: a nodemask filter is in effect, or the cursor sits on a
         * zone above the requested limit. Let the out-of-line helper search.
         */
        if (unlikely(nodes || zonelist_zone_idx(z) > highest_zoneidx))
                return __next_zones_zonelist(z, highest_zoneidx, nodes);

        /* Fast path: the cursor is already on an acceptable entry. */
        return z;
}

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist: The zonelist to search for a suitable zone
 * @highest_zoneidx: The zone index of the highest zone to return
 * @nodes: An optional nodemask to filter the zonelist with
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 *
 * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is
 * never NULL). This may happen either genuinely, or due to concurrent nodemask
 * update due to cpuset modification.
 *
 * Return: Zoneref pointer for the first suitable zone found
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /* Start the search with the cursor on the very first zoneref. */
        struct zoneref *head = zonelist->_zonerefs;

        return next_zones_zonelist(head, highest_zoneidx, nodes);
}

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 * @nodemask: Nodemask allowed by the allocator
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))

/*
 * for_next_zone_zonelist_nodemask - continue iterating a zonelist from an
 * existing cursor
 *
 * Same stepping as for_each_zone_zonelist_nodemask(), except that iteration
 * starts at the zoneref already held in @z rather than at the head of a
 * zonelist. The loop ends when the current zoneref holds a NULL zone.
 */
#define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \
        for (zone = zonelist_zone(z);        \
                zone;                                                        \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))


/**
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone: The current zone in the iterator
 * @z: The current pointer within zonelist->_zonerefs being iterated
 * @zlist: The zonelist being iterated
 * @highidx: The zone index of the highest zone to return
 *
 * This iterator iterates through all zones at or below a given zone index.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

/* Whether the 'nodes' are all movable nodes */
static inline bool movable_only_nodes(nodemask_t *nodes)
{
        struct zonelist *fallback;
        struct zoneref *first;
        int nid;

        /* An empty nodemask is never "movable only". */
        if (nodes_empty(*nodes))
                return false;

        /*
         * Zonelists are interlinked, so the fallback zonelist of any node in
         * the mask will do; take the first one. All that matters is whether
         * at least one zone able to satisfy kernel allocations exists.
         */
        nid = first_node(*nodes);
        fallback = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
        first = first_zones_zonelist(fallback, ZONE_NORMAL, nodes);

        /* No zone at or below ZONE_NORMAL in these nodes: movable only. */
        return !zonelist_zone(first);
}


#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#endif

#ifdef CONFIG_FLATMEM
#define pfn_to_nid(pfn)                (0)
#endif

#ifdef CONFIG_SPARSEMEM

/*
 * PA_SECTION_SHIFT                physical address to/from section number
 * PFN_SECTION_SHIFT                pfn to/from section number
 */
#define PA_SECTION_SHIFT        (SECTION_SIZE_BITS)
#define PFN_SECTION_SHIFT        (SECTION_SIZE_BITS - PAGE_SHIFT)

#define NR_MEM_SECTIONS                (1UL << SECTIONS_SHIFT)

#define PAGES_PER_SECTION       (1UL << PFN_SECTION_SHIFT)
#define PAGE_SECTION_MASK        (~(PAGES_PER_SECTION-1))

#define SECTION_BLOCKFLAGS_BITS \
        ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS)

#if (MAX_PAGE_ORDER + PAGE_SHIFT) > SECTION_SIZE_BITS
#error Allocator MAX_PAGE_ORDER exceeds SECTION_SIZE
#endif

/* Section number containing @pfn: drop the PFN_SECTION_SHIFT low bits. */
static inline unsigned long pfn_to_section_nr(unsigned long pfn)
{
        const unsigned long section_nr = pfn >> PFN_SECTION_SHIFT;

        return section_nr;
}
/* First pfn of section @sec: the inverse shift of pfn_to_section_nr(). */
static inline unsigned long section_nr_to_pfn(unsigned long sec)
{
        const unsigned long first_pfn = sec << PFN_SECTION_SHIFT;

        return first_pfn;
}

#define SECTION_ALIGN_UP(pfn)        (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK)
#define SECTION_ALIGN_DOWN(pfn)        ((pfn) & PAGE_SECTION_MASK)

#define SUBSECTION_SHIFT 21
#define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT)

#define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT)
#define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT)
#define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1))

#if SUBSECTION_SHIFT > SECTION_SIZE_BITS
#error Subsection size exceeds section size
#else
#define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT))
#endif

#define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION)
#define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK)

/*
 * Usage data of a memory section, reached through mem_section::usage.
 */
struct mem_section_usage {
        /* NOTE(review): presumably lets the structure be freed via RCU - confirm. */
        struct rcu_head rcu;
#ifdef CONFIG_SPARSEMEM_VMEMMAP
        /* One bit per subsection; pfn_section_valid() tests the pfn's bit. */
        DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION);
#endif
        /*
         * See declaration of similar field in struct zone.
         * section_to_usemap() returns a pointer to this trailing array.
         */
        unsigned long pageblock_flags[0];
};

void subsection_map_init(unsigned long pfn, unsigned long nr_pages);

struct page;
struct page_ext;
struct mem_section {
        /*
         * This is, logically, a pointer to an array of struct
         * pages.  However, it is stored with some other magic.
         * (see sparse.c::sparse_init_one_section())
         *
         * Additionally during early boot we encode node id of
         * the location of the section here to guide allocation.
         * (see sparse.c::memory_present())
         *
         * Making it a UL at least makes someone do a cast
         * before using it wrong.
         *
         * The low bits hold the SECTION_* flags; __section_mem_map_addr()
         * masks them off with SECTION_MAP_MASK to recover the pointer.
         */
        unsigned long section_mem_map;

        /* Per-section usage data; may be NULL (pfn_section_valid() checks). */
        struct mem_section_usage *usage;
#ifdef CONFIG_PAGE_EXTENSION
        /*
         * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use
         * section. (see page_ext.h about this.)
         */
        struct page_ext *page_ext;
        /* NOTE(review): presumably keeps the size a power of 2 - confirm. */
        unsigned long pad;
#endif
        /*
         * WARNING: mem_section must be a power-of-2 in size for the
         * calculation and use of SECTION_ROOT_MASK to make sense.
         */
};

#ifdef CONFIG_SPARSEMEM_EXTREME
#define SECTIONS_PER_ROOT       (PAGE_SIZE / sizeof (struct mem_section))
#else
#define SECTIONS_PER_ROOT        1
#endif

#define SECTION_NR_TO_ROOT(sec)        ((sec) / SECTIONS_PER_ROOT)
#define NR_SECTION_ROOTS        DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT)
#define SECTION_ROOT_MASK        (SECTIONS_PER_ROOT - 1)

#ifdef CONFIG_SPARSEMEM_EXTREME
extern struct mem_section **mem_section;
#else
extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT];
#endif

static inline unsigned long *section_to_usemap(struct mem_section *ms)
{
        return ms->usage->pageblock_flags;
}

/*
 * Look up the mem_section for section number @nr.
 *
 * Returns NULL when @nr maps to a root beyond NR_SECTION_ROOTS or, with
 * CONFIG_SPARSEMEM_EXTREME, when the root table or the selected root is
 * not there (NULL).
 */
static inline struct mem_section *__nr_to_section(unsigned long nr)
{
        const unsigned long root_idx = SECTION_NR_TO_ROOT(nr);
        const unsigned long slot = nr & SECTION_ROOT_MASK;

        if (unlikely(root_idx >= NR_SECTION_ROOTS))
                return NULL;

#ifdef CONFIG_SPARSEMEM_EXTREME
        if (!mem_section)
                return NULL;
        if (!mem_section[root_idx])
                return NULL;
#endif
        return &mem_section[root_idx][slot];
}
extern size_t mem_section_usage_size(void);

/*
 * We use the lower bits of the mem_map pointer to store
 * a little bit of information.  The pointer is calculated
 * as mem_map - section_nr_to_pfn(pnum).  The result is
 * aligned to the minimum alignment of the two values:
 *   1. All mem_map arrays are page-aligned.
 *   2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT
 *      lowest bits.  PFN_SECTION_SHIFT is arch-specific
 *      (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the
 *      worst combination is powerpc with 256k pages,
 *      which results in PFN_SECTION_SHIFT equal 6.
 * To sum it up, at least 6 bits are available on all architectures.
 * However, we can exceed 6 bits on some other architectures except
 * powerpc (e.g. 15 bits are available on x86_64, 13 bits are available
 * with the worst case of 64K pages on arm64) if we make sure the
 * exceeded bit is not applicable to powerpc.
 */
enum {
        SECTION_MARKED_PRESENT_BIT,
        SECTION_HAS_MEM_MAP_BIT,
        SECTION_IS_ONLINE_BIT,
        SECTION_IS_EARLY_BIT,
#ifdef CONFIG_ZONE_DEVICE
        SECTION_TAINT_ZONE_DEVICE_BIT,
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
        SECTION_IS_VMEMMAP_PREINIT_BIT,
#endif
        SECTION_MAP_LAST_BIT,
};

/*
 * Flag masks for the low bits of mem_section::section_mem_map, one per
 * SECTION_*_BIT enumerator.
 */
#define SECTION_MARKED_PRESENT                BIT(SECTION_MARKED_PRESENT_BIT)
#define SECTION_HAS_MEM_MAP                BIT(SECTION_HAS_MEM_MAP_BIT)
#define SECTION_IS_ONLINE                BIT(SECTION_IS_ONLINE_BIT)
#define SECTION_IS_EARLY                BIT(SECTION_IS_EARLY_BIT)
#ifdef CONFIG_ZONE_DEVICE
#define SECTION_TAINT_ZONE_DEVICE        BIT(SECTION_TAINT_ZONE_DEVICE_BIT)
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
#define SECTION_IS_VMEMMAP_PREINIT        BIT(SECTION_IS_VMEMMAP_PREINIT_BIT)
#endif
/* All bits at or above SECTION_MAP_LAST_BIT: everything except the flags. */
#define SECTION_MAP_MASK                (~(BIT(SECTION_MAP_LAST_BIT) - 1))
/* NOTE(review): presumably the shift for the early-boot node id encoding - confirm. */
#define SECTION_NID_SHIFT                SECTION_MAP_LAST_BIT

/* Strip the flag bits from section_mem_map and return it as a page pointer. */
static inline struct page *__section_mem_map_addr(struct mem_section *section)
{
        const unsigned long encoded = section->section_mem_map;

        return (struct page *)(encoded & SECTION_MAP_MASK);
}

/* 1 if @section is non-NULL and has SECTION_MARKED_PRESENT set, else 0. */
static inline int present_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_MARKED_PRESENT);
}

/* present_section() for the section with number @nr. */
static inline int present_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return present_section(ms);
}

/* 1 if @section is non-NULL and has SECTION_HAS_MEM_MAP set, else 0. */
static inline int valid_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_HAS_MEM_MAP);
}

/* 1 if @section is non-NULL and has SECTION_IS_EARLY set, else 0. */
static inline int early_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_EARLY);
}

/* valid_section() for the section with number @nr. */
static inline int valid_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return valid_section(ms);
}

/* 1 if @section is non-NULL and has SECTION_IS_ONLINE set, else 0. */
static inline int online_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_ONLINE);
}

#ifdef CONFIG_ZONE_DEVICE
/*
 * 1 if @section is non-NULL and has both SECTION_IS_ONLINE and
 * SECTION_TAINT_ZONE_DEVICE set, else 0.
 */
static inline int online_device_section(struct mem_section *section)
{
        const unsigned long wanted = SECTION_IS_ONLINE | SECTION_TAINT_ZONE_DEVICE;

        if (!section)
                return 0;

        /* Both bits must be set, not just one of them. */
        return (section->section_mem_map & wanted) == wanted;
}
#else
/* Without CONFIG_ZONE_DEVICE no section qualifies. */
static inline int online_device_section(struct mem_section *section)
{
        return 0;
}
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
/* 1 if @section is non-NULL and has SECTION_IS_VMEMMAP_PREINIT set, else 0. */
static inline int preinited_vmemmap_section(struct mem_section *section)
{
        if (!section)
                return 0;

        return !!(section->section_mem_map & SECTION_IS_VMEMMAP_PREINIT);
}

void sparse_vmemmap_init_nid_early(int nid);
void sparse_vmemmap_init_nid_late(int nid);

#else
/* Without CONFIG_SPARSEMEM_VMEMMAP_PREINIT no section is pre-initialised. */
static inline int preinited_vmemmap_section(struct mem_section *section)
{
        return 0;
}

/* No-op stubs for configurations without vmemmap pre-initialisation. */
static inline void sparse_vmemmap_init_nid_early(int nid)
{
}

static inline void sparse_vmemmap_init_nid_late(int nid)
{
}
#endif

/* online_section() for the section with number @nr. */
static inline int online_section_nr(unsigned long nr)
{
        struct mem_section *ms = __nr_to_section(nr);

        return online_section(ms);
}

#ifdef CONFIG_MEMORY_HOTPLUG
void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn);
#endif

/* Look up the mem_section covering @pfn; NULL as for __nr_to_section(). */
static inline struct mem_section *__pfn_to_section(unsigned long pfn)
{
        const unsigned long section_nr = pfn_to_section_nr(pfn);

        return __nr_to_section(section_nr);
}

extern unsigned long __highest_present_section_nr;

/* Index of the subsection, within its section, that contains @pfn. */
static inline int subsection_map_index(unsigned long pfn)
{
        /* Offset of the pfn from the start of its section, in pages. */
        const unsigned long pfn_in_section = pfn & ~(PAGE_SECTION_MASK);

        return pfn_in_section / PAGES_PER_SUBSECTION;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Test the subsection_map bit for the subsection of @ms that contains @pfn.
 * Returns 0 when the section has no usage data attached.
 */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        struct mem_section_usage *usage = READ_ONCE(ms->usage);

        if (!usage)
                return 0;

        return test_bit(subsection_map_index(pfn), usage->subsection_map);
}
#else
/* Without CONFIG_SPARSEMEM_VMEMMAP there is no subsection map to consult. */
static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn)
{
        return 1;
}
#endif

/*
 * Declaration only; the definition is not in this header.
 * NOTE(review): judging by the name and parameters this initialises
 * section @pnum on node @nid with memory map @map and initial @flags --
 * confirm against the definition.
 */
void sparse_init_early_section(int nid, struct page *map, unsigned long pnum,
                               unsigned long flags);

#ifndef CONFIG_HAVE_ARCH_PFN_VALID
/**
 * pfn_valid - check if there is a valid memory map entry for a PFN
 * @pfn: the page frame number to check
 *
 * Check if there is a valid memory map entry aka struct page for the @pfn.
 * Note, that availability of the memory map entry does not imply that
 * there is actual usable memory at that @pfn. The struct page may
 * represent a hole or an unusable page frame.
 *
 * The section state is inspected inside a sched-RCU read-side critical
 * section, which is left again on every return path that entered it.
 *
 * Return: 1 for PFNs that have memory map entries and 0 otherwise
 */
static inline int pfn_valid(unsigned long pfn)
{
        struct mem_section *ms;
        int ret;

        /*
         * Ensure the upper PAGE_SHIFT bits are clear in the
         * pfn. Else it might lead to false positives when
         * some of the upper bits are set, but the lower bits
         * match a valid pfn.
         */
        if (PHYS_PFN(PFN_PHYS(pfn)) != pfn)
                return 0;

        /* Section number out of range: there is no mem_section to consult. */
        if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
                return 0;
        ms = __pfn_to_section(pfn);
        /*
         * NOTE(review): presumably the RCU section keeps the data read by
         * pfn_section_valid() alive against a concurrent section removal
         * -- confirm against the hot-remove path.
         */
        rcu_read_lock_sched();
        if (!valid_section(ms)) {
                rcu_read_unlock_sched();
                return 0;
        }
        /*
         * Traditionally early sections always returned pfn_valid() for
         * the entire section-sized span.
         */
        ret = early_section(ms) || pfn_section_valid(ms, pfn);
        rcu_read_unlock_sched();

        return ret;
}
#endif

/*
 * Report whether @pfn lies in a present section.  PFNs whose section
 * number is NR_MEM_SECTIONS or larger are rejected without a lookup.
 */
static inline int pfn_in_present_section(unsigned long pfn)
{
        if (pfn_to_section_nr(pfn) < NR_MEM_SECTIONS)
                return present_section(__pfn_to_section(pfn));

        return 0;
}

/*
 * Return the first present section number strictly greater than
 * @section_nr and not above __highest_present_section_nr, or -1
 * (converted to unsigned long) when there is none.
 */
static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
        for (section_nr++;
             section_nr <= __highest_present_section_nr;
             section_nr++) {
                if (present_section_nr(section_nr))
                        return section_nr;
        }

        return -1;
}

/*
 * for_each_present_section_nr - iterate over present section numbers
 * @start: first section number to consider
 * @section_nr: lvalue that receives each present section number in turn
 *
 * The walk ends when next_present_section_nr() returns -1.  @start is
 * parenthesised so that an expression argument (for example "a ? b : c")
 * is decremented as a whole rather than having "- 1" bind to its last
 * operand.
 */
#define for_each_present_section_nr(start, section_nr)                \
        for (section_nr = next_present_section_nr((start) - 1);        \
             section_nr != -1;                                        \
             section_nr = next_present_section_nr(section_nr))

/*
 * These are _only_ used during initialisation, therefore they
 * can use __initdata ...  They could have names to indicate
 * this restriction.
 */
#ifdef CONFIG_NUMA
/*
 * pfn_to_nid(pfn): node id of the struct page that @pfn maps to, obtained
 * via page_to_nid(pfn_to_page(pfn)).  The argument is copied into a local
 * first, so it is evaluated exactly once.
 */
#define pfn_to_nid(pfn)                                                        \
({                                                                        \
        unsigned long __pfn_to_nid_pfn = (pfn);                                \
        page_to_nid(pfn_to_page(__pfn_to_nid_pfn));                        \
})
#else
/* Without CONFIG_NUMA the result is always 0; @pfn is not evaluated. */
#define pfn_to_nid(pfn)                (0)
#endif

/* Declaration for CONFIG_SPARSEMEM builds; the definition is not in this header. */
void sparse_init(void);
#else
/*
 * !CONFIG_SPARSEMEM: the sparse helpers compile away to nothing.
 * sparse_vmemmap_init_nid_early() takes a single argument here so that it
 * matches the one-parameter function of the same name used in the
 * CONFIG_SPARSEMEM branch; a one-argument call then builds in both
 * configurations.
 */
#define sparse_init()        do {} while (0)
#define sparse_index_init(_sec, _nid)  do {} while (0)
#define sparse_vmemmap_init_nid_early(_nid) do {} while (0)
#define sparse_vmemmap_init_nid_late(_nid) do {} while (0)
#define pfn_in_present_section pfn_valid
#define subsection_map_init(_pfn, _nr_pages) do {} while (0)
#endif /* CONFIG_SPARSEMEM */

#endif /* !__GENERATING_BOUNDS.H */
#endif /* !__ASSEMBLY__ */
#endif /* _LINUX_MMZONE_H */











































































































































































































































































































































































































  265 




















































































































































































































































































































































































  186 

















































  189 
  186 
















































































































































































































































































































































  265 




  265 












  265 



  265 
  265 







































































































































































































































































































































































































   20 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Copyright (C) 2005 SGI, Christoph Lameter
 * Copyright (C) 2006 Nick Piggin
 * Copyright (C) 2012 Konstantin Khlebnikov
 * Copyright (C) 2016 Intel, Matthew Wilcox
 * Copyright (C) 2016 Intel, Ross Zwisler
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kmemleak.h>
#include <linux/percpu.h>
#include <linux/preempt.h>                /* in_interrupt() */
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Radix tree node cache.
 *
 * Slab cache that every radix_tree_node in this file is allocated from
 * (kmem_cache_alloc) and returned to (kmem_cache_free).
 */
struct kmem_cache *radix_tree_node_cachep;

/*
 * The radix tree is variable-height, so an insert operation not only has
 * to build the branch to its corresponding item, it also has to build the
 * branch to existing items if the size has to be increased (by
 * radix_tree_extend).
 *
 * The worst case is a zero height tree with just a single item at index 0,
 * and then inserting an item at index ULONG_MAX. This requires 2 new branches
 * of RADIX_TREE_MAX_PATH size to be created, with only the root node shared.
 * Hence:
 *
 * (This is the node count radix_tree_preload() and
 * radix_tree_maybe_preload() pass to __radix_tree_preload().)
 */
#define RADIX_TREE_PRELOAD_SIZE (RADIX_TREE_MAX_PATH * 2 - 1)

/*
 * The IDR does not have to be as high as the radix tree since it uses
 * signed integers, not unsigned longs.
 *
 * IDR_INDEX_BITS:   number of bits in an int, minus one.
 * IDR_MAX_PATH:     IDR_INDEX_BITS divided by RADIX_TREE_MAP_SHIFT,
 *                   rounded up.
 * IDR_PRELOAD_SIZE: same "two branches minus the shared node" formula as
 *                   RADIX_TREE_PRELOAD_SIZE, applied to IDR_MAX_PATH.
 */
#define IDR_INDEX_BITS                (8 /* CHAR_BIT */ * sizeof(int) - 1)
#define IDR_MAX_PATH                (DIV_ROUND_UP(IDR_INDEX_BITS, \
                                                RADIX_TREE_MAP_SHIFT))
#define IDR_PRELOAD_SIZE        (IDR_MAX_PATH * 2 - 1)

/*
 * Per-cpu pool of preloaded nodes
 *
 * Filled by __radix_tree_preload() and drained by radix_tree_node_alloc().
 * Pooled nodes are chained through their ->parent pointer, with ->nodes
 * as the list head and ->nr as the count.  The embedded local lock is the
 * one taken by the preload functions.
 */
DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = {
        .lock = INIT_LOCAL_LOCK(lock),
};
EXPORT_PER_CPU_SYMBOL_GPL(radix_tree_preloads);

/* Strip the RADIX_TREE_INTERNAL_NODE bit from @ptr to recover the node pointer. */
static inline struct radix_tree_node *entry_to_node(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        bits &= ~RADIX_TREE_INTERNAL_NODE;
        return (void *)bits;
}

/* Set the RADIX_TREE_INTERNAL_NODE bit in @ptr to form an internal entry. */
static inline void *node_to_entry(void *ptr)
{
        unsigned long bits = (unsigned long)ptr;

        bits |= RADIX_TREE_INTERNAL_NODE;
        return (void *)bits;
}

/*
 * Marker entry; radix_tree_shrink() stores it in slot 0 of a node it is
 * about to free so that lockless readers holding that slot retry.
 */
#define RADIX_TREE_RETRY        XA_RETRY_ENTRY

/*
 * Position of @slot within @parent's slots[] array.
 * With no parent (NULL) the offset is 0.
 */
static inline unsigned long
get_slot_offset(const struct radix_tree_node *parent, void __rcu **slot)
{
        if (!parent)
                return 0;

        return slot - parent->slots;
}

/*
 * Step one level down from @parent towards @index.
 *
 * The slot is selected from the bits of @index at @parent's shift; its
 * content is stored in *@nodep and the slot offset is returned.
 */
static unsigned int radix_tree_descend(const struct radix_tree_node *parent,
                        struct radix_tree_node **nodep, unsigned long index)
{
        unsigned int offset = (index >> parent->shift) & RADIX_TREE_MAP_MASK;

        *nodep = (void *)rcu_dereference_raw(parent->slots[offset]);

        return offset;
}

/*
 * Extract the allocation flags kept in @root->xa_flags: the bits inside
 * __GFP_BITS_MASK, with the GFP_ZONEMASK bits removed.
 */
static inline gfp_t root_gfp_mask(const struct radix_tree_root *root)
{
        const gfp_t keep = __GFP_BITS_MASK & ~GFP_ZONEMASK;

        return root->xa_flags & keep;
}

/* Set bit @offset in @node's bitmap for @tag (non-atomic __set_bit). */
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bits = node->tags[tag];

        __set_bit(offset, bits);
}

/* Clear bit @offset in @node's bitmap for @tag (non-atomic __clear_bit). */
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        unsigned long *bits = node->tags[tag];

        __clear_bit(offset, bits);
}

/* Test bit @offset in @node's bitmap for @tag. */
static inline int tag_get(const struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        const unsigned long *bits = node->tags[tag];

        return test_bit(offset, bits);
}

/* Set the root-level bit for @tag, stored in xa_flags at ROOT_TAG_SHIFT + @tag. */
static inline void root_tag_set(struct radix_tree_root *root, unsigned tag)
{
        const unsigned bit = tag + ROOT_TAG_SHIFT;

        root->xa_flags |= (__force gfp_t)(1 << bit);
}

static inline void root_tag_clear(struct radix_tree_root *root, unsigned tag)
{
        root->xa_flags &= (__force gfp_t)~(1 << (tag + ROOT_TAG_SHIFT));
}

/*
 * Drop every root-level tag: keep only the xa_flags bits that lie below
 * ROOT_TAG_SHIFT.
 */
static inline void root_tag_clear_all(struct radix_tree_root *root)
{
        root->xa_flags = root->xa_flags &
                         (__force gfp_t)((1 << ROOT_TAG_SHIFT) - 1);
}

static inline int root_tag_get(const struct radix_tree_root *root, unsigned tag)
{
        return (__force int)root->xa_flags & (1 << (tag + ROOT_TAG_SHIFT));
}

static inline unsigned root_tags_get(const struct radix_tree_root *root)
{
        return (__force unsigned)root->xa_flags >> ROOT_TAG_SHIFT;
}

static inline bool is_idr(const struct radix_tree_root *root)
{
        return !!(root->xa_flags & ROOT_IS_IDR);
}

/*
 * Scan the RADIX_TREE_TAG_LONGS words of @node's bitmap for @tag.
 * Returns 1 as soon as a non-zero word is found, 0 if all are zero.
 */
static inline int any_tag_set(const struct radix_tree_node *node,
                                                        unsigned int tag)
{
        const unsigned long *word = node->tags[tag];
        const unsigned long *end = word + RADIX_TREE_TAG_LONGS;

        while (word < end) {
                if (*word)
                        return 1;
                word++;
        }

        return 0;
}

/* Set all RADIX_TREE_MAP_SIZE bits of @node's bitmap for @tag. */
static inline void all_tag_set(struct radix_tree_node *node, unsigned int tag)
{
        unsigned long *bits = node->tags[tag];

        bitmap_fill(bits, RADIX_TREE_MAP_SIZE);
}

/**
 * radix_tree_find_next_bit - find the next set bit in a node's tag bitmap
 *
 * @node: node whose tag bitmap is searched
 * @tag: the tag index
 * @offset: the bitnumber to start searching at
 *
 * Unrollable variant of find_next_bit() for constant size arrays.
 * Tail bits starting from size to roundup(size, BITS_PER_LONG) must be zero.
 * Returns next bit offset, or size (RADIX_TREE_MAP_SIZE) if nothing found.
 */
static __always_inline unsigned long
radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag,
                         unsigned long offset)
{
        const unsigned long *word;
        unsigned long bits;

        if (offset >= RADIX_TREE_MAP_SIZE)
                return RADIX_TREE_MAP_SIZE;

        /* First word: discard the bits below @offset. */
        word = node->tags[tag] + offset / BITS_PER_LONG;
        bits = *word >> (offset % BITS_PER_LONG);
        if (bits)
                return __ffs(bits) + offset;

        /* Remaining words, starting at the next word boundary. */
        for (offset = (offset + BITS_PER_LONG) & ~(BITS_PER_LONG - 1);
             offset < RADIX_TREE_MAP_SIZE;
             offset += BITS_PER_LONG) {
                bits = *++word;
                if (bits)
                        return __ffs(bits) + offset;
        }

        return RADIX_TREE_MAP_SIZE;
}

/* Low RADIX_TREE_MAP_MASK bits of the iterator's current index. */
static unsigned int iter_offset(const struct radix_tree_iter *iter)
{
        unsigned long idx = iter->index;

        return idx & RADIX_TREE_MAP_MASK;
}

/*
 * The maximum index which can be stored in a radix tree whose top node
 * has the given @shift: (RADIX_TREE_MAP_SIZE << shift) - 1.
 */
static inline unsigned long shift_maxindex(unsigned int shift)
{
        unsigned long span = RADIX_TREE_MAP_SIZE << shift;

        return span - 1;
}

/* shift_maxindex() applied to @node's own shift. */
static inline unsigned long node_maxindex(const struct radix_tree_node *node)
{
        unsigned int shift = node->shift;

        return shift_maxindex(shift);
}

/*
 * Index of slot @offset in @node: @index with the bits covered by @node
 * cleared, plus @offset scaled by the node's shift.
 */
static unsigned long next_index(unsigned long index,
                                const struct radix_tree_node *node,
                                unsigned long offset)
{
        unsigned long base = index & ~node_maxindex(node);

        return base + (offset << node->shift);
}

/*
 * Allocate a radix_tree_node and fill in its bookkeeping fields.
 *
 * @gfp_mask:  allocation flags
 * @parent:    stored in the new node's ->parent
 * @root:      stored in the new node's ->array
 * @shift, @offset, @count, @nr_values: stored in the fields of the same name
 *
 * Returns the node, or NULL if neither the slab cache nor (on the
 * non-blocking path) the per-CPU preload pool could supply one.
 *
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 */
static struct radix_tree_node *
radix_tree_node_alloc(gfp_t gfp_mask, struct radix_tree_node *parent,
                        struct radix_tree_root *root,
                        unsigned int shift, unsigned int offset,
                        unsigned int count, unsigned int nr_values)
{
        struct radix_tree_node *ret = NULL;

        /*
         * Preload code isn't irq safe and it doesn't make sense to use
         * preloading during an interrupt anyway as all the allocations have
         * to be atomic. So just do normal allocation when in interrupt.
         */
        if (!gfpflags_allow_blocking(gfp_mask) && !in_interrupt()) {
                struct radix_tree_preload *rtp;

                /*
                 * Even if the caller has preloaded, try to allocate from the
                 * cache first for the new node to get accounted to the memory
                 * cgroup.
                 */
                ret = kmem_cache_alloc(radix_tree_node_cachep,
                                       gfp_mask | __GFP_NOWARN);
                if (ret)
                        goto out;

                /*
                 * Provided the caller has preloaded here, we will always
                 * succeed in getting a node here (and never reach
                 * kmem_cache_alloc)
                 *
                 * The pool is a list chained through ->parent: pop its head.
                 * If the pool is empty, ret stays NULL.
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr) {
                        ret = rtp->nodes;
                        rtp->nodes = ret->parent;
                        rtp->nr--;
                }
                /*
                 * Update the allocation stack trace as this is more useful
                 * for debugging.
                 */
                kmemleak_update_trace(ret);
                goto out;
        }
        /* Blocking allowed, or in interrupt: plain slab allocation. */
        ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
out:
        BUG_ON(radix_tree_is_internal_node(ret));
        if (ret) {
                ret->shift = shift;
                ret->offset = offset;
                ret->count = count;
                ret->nr_values = nr_values;
                ret->parent = parent;
                ret->array = root;
        }
        return ret;
}

/*
 * RCU callback: scrub a retired node and return it to the slab cache.
 *
 * Only zeroed nodes may go back into radix_tree_node_cachep, and
 * radix_tree_free_nodes can leave non-NULL entries behind, so the slots
 * and tags are wiped and private_list is re-initialised before freeing.
 */
void radix_tree_node_rcu_free(struct rcu_head *head)
{
        struct radix_tree_node *victim;

        victim = container_of(head, struct radix_tree_node, rcu_head);

        memset(victim->tags, 0, sizeof(victim->tags));
        memset(victim->slots, 0, sizeof(victim->slots));
        INIT_LIST_HEAD(&victim->private_list);

        kmem_cache_free(radix_tree_node_cachep, victim);
}

/* Queue @node for freeing via call_rcu(); radix_tree_node_rcu_free() does the work. */
static inline void
radix_tree_node_free(struct radix_tree_node *node)
{
        struct rcu_head *rcu = &node->rcu_head;

        call_rcu(rcu, radix_tree_node_rcu_free);
}

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * In terms of the code below: on success radix_tree_preloads.lock is held
 * on return; on failure it has been released.
 *
 * @gfp_mask: allocation flags; __GFP_ACCOUNT is stripped before use.
 * @nr:       number of nodes the per-CPU pool must hold on return.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
static __must_check int __radix_tree_preload(gfp_t gfp_mask, unsigned nr)
{
        struct radix_tree_preload *rtp;
        struct radix_tree_node *node;
        int ret = -ENOMEM;

        /*
         * Nodes preloaded by one cgroup can be used by another cgroup, so
         * they should never be accounted to any particular memory cgroup.
         */
        gfp_mask &= ~__GFP_ACCOUNT;

        local_lock(&radix_tree_preloads.lock);
        rtp = this_cpu_ptr(&radix_tree_preloads);
        while (rtp->nr < nr) {
                /* The allocation is done with the lock dropped. */
                local_unlock(&radix_tree_preloads.lock);
                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
                if (node == NULL)
                        goto out;
                /*
                 * Re-take the lock and re-read the per-CPU pointer and the
                 * pool level: if the pool no longer needs the node, give
                 * it straight back to the cache.
                 */
                local_lock(&radix_tree_preloads.lock);
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr < nr) {
                        node->parent = rtp->nodes;
                        rtp->nodes = node;
                        rtp->nr++;
                } else {
                        kmem_cache_free(radix_tree_node_cachep, node);
                }
        }
        ret = 0;
out:
        return ret;
}

/*
 * Fill this CPU's node pool with RADIX_TREE_PRELOAD_SIZE nodes so that a
 * following single insertion cannot fail for lack of memory.
 *
 * Returns what __radix_tree_preload() returns: zero on success (with
 * preemption disabled), -ENOMEM on failure (with preemption not disabled).
 * A @gfp_mask that does not allow blocking triggers a one-time warning.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
        int err;

        /* Warn on non-sensical use... */
        WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));

        err = __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
        return err;
}
EXPORT_SYMBOL(radix_tree_preload);

/*
 * Like radix_tree_preload(), except that preloading is only attempted
 * when @gfp_mask allows blocking.  Otherwise just the preload lock is
 * taken, so the caller ends up in the same locked state either way.
 * On success, return zero with preemption disabled. On error, return
 * -ENOMEM with preemption not disabled.
 */
int radix_tree_maybe_preload(gfp_t gfp_mask)
{
        if (!gfpflags_allow_blocking(gfp_mask)) {
                /* Preloading doesn't help anything with this gfp mask, skip it */
                local_lock(&radix_tree_preloads.lock);
                return 0;
        }

        return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
}
EXPORT_SYMBOL(radix_tree_maybe_preload);

/*
 * Read the root entry of @root.
 *
 * *@nodep receives the raw root entry.  If that entry is an internal
 * node, *@maxindex is the node's maximum index and the return value is
 * the node's shift plus RADIX_TREE_MAP_SHIFT; otherwise *@maxindex is 0
 * and 0 is returned.
 */
static unsigned radix_tree_load_root(const struct radix_tree_root *root,
                struct radix_tree_node **nodep, unsigned long *maxindex)
{
        struct radix_tree_node *head = rcu_dereference_raw(root->xa_head);
        unsigned long max = 0;
        unsigned height = 0;

        *nodep = head;

        if (likely(radix_tree_is_internal_node(head))) {
                head = entry_to_node(head);
                max = node_maxindex(head);
                height = head->shift + RADIX_TREE_MAP_SHIFT;
        }

        *maxindex = max;
        return height;
}

/*
 *        Extend a radix tree so it can store key @index.
 *
 *        @root:  tree to grow
 *        @gfp:   allocation flags passed to radix_tree_node_alloc()
 *        @index: key that must become representable
 *        @shift: shift to give the first new top-level node
 *
 *        New nodes are stacked on top of the current root entry, which
 *        always ends up in slot 0 of the node above it.
 *
 *        Returns -ENOMEM if a node allocation fails, otherwise the final
 *        shift plus RADIX_TREE_MAP_SHIFT.
 */
static int radix_tree_extend(struct radix_tree_root *root, gfp_t gfp,
                                unsigned long index, unsigned int shift)
{
        void *entry;
        unsigned int maxshift;
        int tag;

        /* Figure out what the shift should be.  */
        maxshift = shift;
        while (index > shift_maxindex(maxshift))
                maxshift += RADIX_TREE_MAP_SHIFT;

        /*
         * Empty tree: no nodes are added, except for an IDR whose root
         * IDR_FREE tag is clear.
         */
        entry = rcu_dereference_raw(root->xa_head);
        if (!entry && (!is_idr(root) || root_tag_get(root, IDR_FREE)))
                goto out;

        do {
                struct radix_tree_node *node = radix_tree_node_alloc(gfp, NULL,
                                                        root, shift, 0, 1, 0);
                if (!node)
                        return -ENOMEM;

                if (is_idr(root)) {
                        /*
                         * Mark every slot of the new node free; slot 0
                         * inherits "not free" from the root tag, which is
                         * then set because the other slots are free.
                         */
                        all_tag_set(node, IDR_FREE);
                        if (!root_tag_get(root, IDR_FREE)) {
                                tag_clear(node, IDR_FREE, 0);
                                root_tag_set(root, IDR_FREE);
                        }
                } else {
                        /* Propagate the aggregated tag info to the new child */
                        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                                if (root_tag_get(root, tag))
                                        tag_set(node, tag, 0);
                        }
                }

                BUG_ON(shift > BITS_PER_LONG);
                if (radix_tree_is_internal_node(entry)) {
                        entry_to_node(entry)->parent = node;
                } else if (xa_is_value(entry)) {
                        /* Moving a value entry root->xa_head to a node */
                        node->nr_values = 1;
                }
                /*
                 * entry was already in the radix tree, so we do not need
                 * rcu_assign_pointer here
                 */
                node->slots[0] = (void __rcu *)entry;
                entry = node_to_entry(node);
                /* Publish the fully initialised node as the new root. */
                rcu_assign_pointer(root->xa_head, entry);
                shift += RADIX_TREE_MAP_SHIFT;
        } while (shift <= maxshift);
out:
        return maxshift + RADIX_TREE_MAP_SHIFT;
}

/**
 *        radix_tree_shrink    -    shrink radix tree to minimum height
 *        @root:                radix tree root
 *
 *        Repeatedly replaces a root node that has a single child in slot 0
 *        by that child, freeing the old root node through
 *        radix_tree_node_free().
 *
 *        Return: true if at least one node was removed, false otherwise.
 */
static inline bool radix_tree_shrink(struct radix_tree_root *root)
{
        bool shrunk = false;

        for (;;) {
                struct radix_tree_node *node = rcu_dereference_raw(root->xa_head);
                struct radix_tree_node *child;

                /* Root is a direct entry (or empty): nothing to shrink. */
                if (!radix_tree_is_internal_node(node))
                        break;
                node = entry_to_node(node);

                /*
                 * The candidate node has more than one child, or its child
                 * is not at the leftmost slot, we cannot shrink.
                 */
                if (node->count != 1)
                        break;
                child = rcu_dereference_raw(node->slots[0]);
                if (!child)
                        break;

                /*
                 * For an IDR, we must not shrink entry 0 into the root in
                 * case somebody calls idr_replace() with a pointer that
                 * appears to be an internal entry
                 */
                if (!node->shift && is_idr(root))
                        break;

                /* The child becomes the new top of the tree. */
                if (radix_tree_is_internal_node(child))
                        entry_to_node(child)->parent = NULL;

                /*
                 * We don't need rcu_assign_pointer(), since we are simply
                 * moving the node from one part of the tree to another: if it
                 * was safe to dereference the old pointer to it
                 * (node->slots[0]), it will be safe to dereference the new
                 * one (root->xa_head) as far as dependent read barriers go.
                 */
                root->xa_head = (void __rcu *)child;
                if (is_idr(root) && !tag_get(node, IDR_FREE, 0))
                        root_tag_clear(root, IDR_FREE);

                /*
                 * We have a dilemma here. The node's slot[0] must not be
                 * NULLed in case there are concurrent lookups expecting to
                 * find the item. However if this was a bottom-level node,
                 * then it may be subject to the slot pointer being visible
                 * to callers dereferencing it. If item corresponding to
                 * slot[0] is subsequently deleted, these callers would expect
                 * their slot to become empty sooner or later.
                 *
                 * For example, lockless pagecache will look up a slot, deref
                 * the page pointer, and if the page has 0 refcount it means it
                 * was concurrently deleted from pagecache so try the deref
                 * again. Fortunately there is already a requirement for logic
                 * to retry the entire slot lookup -- the indirect pointer
                 * problem (replacing direct root node with an indirect pointer
                 * also results in a stale slot). So tag the slot as indirect
                 * to force callers to retry.
                 */
                node->count = 0;
                if (!radix_tree_is_internal_node(child)) {
                        node->slots[0] = (void __rcu *)RADIX_TREE_RETRY;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                shrunk = true;
        }

        return shrunk;
}

/*
 * Free @node if it has become empty, then repeat for every ancestor that
 * becomes empty as a result.  The walk stops at the first node whose count
 * is non-zero; if that node is the one root->xa_head refers to, the tree
 * is handed to radix_tree_shrink().
 *
 * Returns true if at least one node was freed here or radix_tree_shrink()
 * reported that it shrank the tree.
 */
static bool delete_node(struct radix_tree_root *root,
                        struct radix_tree_node *node)
{
        bool changed = false;

        for (;;) {
                struct radix_tree_node *up;

                /* A node that still holds entries ends the walk. */
                if (node->count)
                        break;

                up = node->parent;
                if (!up) {
                        /*
                         * Shouldn't the tags already have all been cleared
                         * by the caller?
                         */
                        if (!is_idr(root))
                                root_tag_clear_all(root);
                        root->xa_head = NULL;
                } else {
                        /* Unhook the empty node from its parent. */
                        up->slots[node->offset] = NULL;
                        up->count--;
                }

                WARN_ON_ONCE(!list_empty(&node->private_list));
                radix_tree_node_free(node);
                changed = true;

                /* The top node was just freed: nothing left to inspect. */
                if (!up)
                        return changed;
                node = up;
        }

        if (node_to_entry(node) == rcu_dereference_raw(root->xa_head))
                changed |= radix_tree_shrink(root);
        return changed;
}

/**
 * __radix_tree_create - create a slot in a radix tree
 * @root:  radix tree root
 * @index: index key
 * @nodep: returns node (may be NULL if the caller does not need it)
 * @slotp: returns slot (may be NULL if the caller does not need it)
 *
 * Create, if necessary, and return the node and slot for an item
 * at position @index in the radix tree @root.
 *
 * Until there is more than one item in the tree, no nodes are
 * allocated and @root->xa_head is used as a direct slot instead of
 * pointing to a node, in which case *@nodep will be NULL.
 *
 * Returns 0 for success, -ENOMEM if a node could not be allocated, or the
 * negative error reported by radix_tree_extend().
 */
static int __radix_tree_create(struct radix_tree_root *root,
                unsigned long index, struct radix_tree_node **nodep,
                void __rcu ***slotp)
{
        void __rcu **slot = (void __rcu **)&root->xa_head;
        struct radix_tree_node *parent = NULL;
        struct radix_tree_node *child;
        gfp_t gfp = root_gfp_mask(root);
        unsigned int shift, offset = 0;
        unsigned long maxindex;

        shift = radix_tree_load_root(root, &child, &maxindex);

        /* Grow the tree first if @index lies beyond what it covers. */
        if (index > maxindex) {
                int ret = radix_tree_extend(root, gfp, index, shift);

                if (ret < 0)
                        return ret;
                /* A non-negative result is the new shift of the tree. */
                shift = ret;
                child = rcu_dereference_raw(root->xa_head);
        }

        while (shift) {
                shift -= RADIX_TREE_MAP_SHIFT;

                if (!child) {
                        /* Empty slot on the path: hang a new node here. */
                        child = radix_tree_node_alloc(gfp, parent, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return -ENOMEM;
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (parent)
                                parent->count++;
                } else if (!radix_tree_is_internal_node(child)) {
                        /* Reached a non-node entry: stop descending. */
                        break;
                }

                /* Step one level down towards @index. */
                parent = entry_to_node(child);
                offset = radix_tree_descend(parent, &child, index);
                slot = &parent->slots[offset];
        }

        if (nodep)
                *nodep = parent;
        if (slotp)
                *slotp = slot;
        return 0;
}

/*
 * Free the node referred to by the entry @node together with every node
 * below it.  The tree is presumed to not need shrinking, and any user
 * data in the tree is presumed to not need a destructor called on it; a
 * destructor can be added later if one is ever needed.
 *
 * Tags and slots are deliberately left untouched, because an RCU walker
 * may still hold a pointer into this subtree.  The entries could be
 * replaced with RADIX_TREE_RETRY, but they would still have to be cleared
 * in rcu_free.
 */
static void radix_tree_free_nodes(struct radix_tree_node *node)
{
        struct radix_tree_node *top = entry_to_node(node);
        struct radix_tree_node *cur = top;
        unsigned idx = 0;

        for (;;) {
                void *entry = rcu_dereference_raw(cur->slots[idx]);

                /* Dive into child nodes before touching the next slot. */
                if (xa_is_node(entry) && cur->shift) {
                        cur = entry_to_node(entry);
                        idx = 0;
                        continue;
                }

                if (++idx != RADIX_TREE_MAP_SIZE)
                        continue;

                /*
                 * Every slot of @cur has been visited: free it and resume
                 * in the parent at the slot after the one it occupied,
                 * repeating while that also exhausts the parent.
                 */
                do {
                        struct radix_tree_node *done = cur;

                        idx = done->offset + 1;
                        cur = done->parent;
                        WARN_ON_ONCE(!list_empty(&done->private_list));
                        radix_tree_node_free(done);
                        if (done == top)
                                return;
                } while (idx == RADIX_TREE_MAP_SIZE);
        }
}

/*
 * Publish @item in the empty slot @slot and account for it in @node.
 * @node is NULL when there is no node to account against.
 *
 * Returns 1 on success, or -EEXIST if the slot is already occupied.
 */
static inline int insert_entries(struct radix_tree_node *node,
                void __rcu **slot, void *item)
{
        if (*slot)
                return -EEXIST;

        rcu_assign_pointer(*slot, item);

        /* No node, no accounting. */
        if (!node)
                return 1;

        node->count++;
        if (xa_is_value(item))
                node->nr_values++;
        return 1;
}

/**
 * radix_tree_insert - insert into a radix tree
 * @root:  radix tree root
 * @index: index key
 * @item:  item to insert
 *
 * Insert an item into the radix tree at position @index.  @item must not
 * look like an internal node entry.
 *
 * Return: 0 on success, -EEXIST if the slot at @index is already
 * occupied, or the error returned by __radix_tree_create().
 */
int radix_tree_insert(struct radix_tree_root *root, unsigned long index,
                        void *item)
{
        struct radix_tree_node *node;
        void __rcu **slot;
        unsigned offset;
        int ret;

        BUG_ON(radix_tree_is_internal_node(item));

        ret = __radix_tree_create(root, index, &node, &slot);
        if (ret)
                return ret;

        ret = insert_entries(node, slot, item);
        if (ret < 0)
                return ret;

        /* A freshly filled slot must not carry any tag yet. */
        if (!node) {
                BUG_ON(root_tags_get(root));
                return 0;
        }

        offset = get_slot_offset(node, slot);
        BUG_ON(tag_get(node, 0, offset));
        BUG_ON(tag_get(node, 1, offset));
        BUG_ON(tag_get(node, 2, offset));

        return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

/**
 * __radix_tree_lookup - lookup an item in a radix tree
 * @root:  radix tree root
 * @index: index key
 * @nodep: returns node (may be NULL if the caller does not need it)
 * @slotp: returns slot (may be NULL if the caller does not need it)
 *
 * Lookup and return the item at position @index in the radix
 * tree @root.
 *
 * Until there is more than one item in the tree, no nodes are
 * allocated and @root->xa_head is used as a direct slot instead of
 * pointing to a node, in which case *@nodep will be NULL.
 *
 * If @index lies beyond the current range of the tree, NULL is returned
 * and neither *@nodep nor *@slotp is written.
 */
void *__radix_tree_lookup(const struct radix_tree_root *root,
                          unsigned long index, struct radix_tree_node **nodep,
                          void __rcu ***slotp)
{
        struct radix_tree_node *entry, *owner;
        unsigned long maxindex;
        void __rcu **slot;

 restart:
        owner = NULL;
        slot = (void __rcu **)&root->xa_head;
        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return NULL;

        while (radix_tree_is_internal_node(entry)) {
                unsigned slot_idx;

                owner = entry_to_node(entry);
                slot_idx = radix_tree_descend(owner, &entry, index);
                slot = &owner->slots[slot_idx];

                /* A retry entry means the walk must start over. */
                if (entry == RADIX_TREE_RETRY)
                        goto restart;
                /* Bottom-level node: whatever sits in the slot is the item. */
                if (!owner->shift)
                        break;
        }

        if (nodep)
                *nodep = owner;
        if (slotp)
                *slotp = slot;
        return entry;
}

/**
 * radix_tree_lookup_slot - lookup a slot in a radix tree
 * @root:  radix tree root
 * @index: index key
 *
 * Returns: the slot corresponding to the position @index in the
 * radix tree @root, or NULL if the lookup finds no item there.  This is
 * useful for update-if-exists operations.
 *
 * This function can be called under rcu_read_lock iff the slot is not
 * modified by radix_tree_replace_slot, otherwise it must be called
 * exclusive from other writers. Any dereference of the slot must be done
 * using radix_tree_deref_slot.
 */
void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *root,
                                unsigned long index)
{
        void __rcu **slot;
        void *found = __radix_tree_lookup(root, index, NULL, &slot);

        /* @slot is only meaningful when the lookup produced an item. */
        return found ? slot : NULL;
}
EXPORT_SYMBOL(radix_tree_lookup_slot);

/**
 * radix_tree_lookup - perform lookup operation on a radix tree
 * @root:  radix tree root
 * @index: index key
 *
 * Lookup the item at the position @index in the radix tree @root.
 *
 * This function can be called under rcu_read_lock, however the caller
 * must manage lifetimes of leaf nodes (eg. RCU may also be used to free
 * them safely). No RCU barriers are required to access or modify the
 * returned item, however.
 *
 * Return: whatever __radix_tree_lookup() finds at @index; that is NULL
 * when @index is beyond the tree's current range or the slot is empty.
 */
void *radix_tree_lookup(const struct radix_tree_root *root, unsigned long index)
{
        /* Neither the node nor the slot is of interest to the caller. */
        return __radix_tree_lookup(root, index, NULL, NULL);
}
EXPORT_SYMBOL(radix_tree_lookup);

/*
 * Store @item in @slot.  When a node is supplied and either delta is
 * non-zero, @count and @values are added to the node's count and
 * nr_values before the new pointer is published.
 */
static void replace_slot(void __rcu **slot, void *item,
                struct radix_tree_node *node, int count, int values)
{
        bool needs_accounting = node && (count || values);

        if (needs_accounting) {
                node->count += count;
                node->nr_values += values;
        }

        rcu_assign_pointer(*slot, item);
}

/*
 * Test @tag for the slot at @offset in @node, or for the root itself when
 * there is no node.
 */
static bool node_tag_get(const struct radix_tree_root *root,
                                const struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        return node ? tag_get(node, tag, offset) : root_tag_get(root, tag);
}

/*
 * Work out by how much a node's count changes when @old in @slot is
 * replaced by @item.
 *
 * IDR users want to be able to store NULL in the tree, so if the slot isn't
 * free, don't adjust the count, even if it's transitioning between NULL and
 * non-NULL.  For the IDA, we mark slots as being IDR_FREE while they still
 * have empty bits, but it only stores NULL in slots when they're being
 * deleted.
 */
static int calculate_count(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot,
                                void *item, void *old)
{
        if (is_idr(root)) {
                unsigned slot_idx = get_slot_offset(node, slot);

                /* An allocated (not free) IDR slot never changes the count. */
                if (!node_tag_get(root, node, IDR_FREE, slot_idx))
                        return 0;
                /* Free and empty: storing anything here claims the slot. */
                if (!old)
                        return 1;
        }

        return (item ? 1 : 0) - (old ? 1 : 0);
}

/**
 * __radix_tree_replace - replace item in a slot
 * @root: radix tree root
 * @node: pointer to tree node, or NULL
 * @slot: pointer to slot in @node
 * @item: new item to store in the slot.
 *
 * For use with __radix_tree_lookup().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * When @node is given, delete_node() is run on it afterwards, so a node
 * left empty by the replacement is freed.
 */
void __radix_tree_replace(struct radix_tree_root *root,
                          struct radix_tree_node *node,
                          void __rcu **slot, void *item)
{
        void *prev = rcu_dereference_raw(*slot);
        int value_delta = (xa_is_value(item) ? 1 : 0) -
                          (xa_is_value(prev) ? 1 : 0);
        int count_delta = calculate_count(root, node, slot, item, prev);
        bool is_root_slot = slot == (void __rcu **)&root->xa_head;

        /*
         * This function supports replacing value entries and
         * deleting entries, but that needs accounting against the
         * node unless the slot is root->xa_head.
         */
        WARN_ON_ONCE(!node && !is_root_slot && (count_delta || value_delta));
        replace_slot(slot, item, node, count_delta, value_delta);

        if (node)
                delete_node(root, node);
}

/**
 * radix_tree_replace_slot - replace item in a slot
 * @root: radix tree root
 * @slot: pointer to slot
 * @item: new item to store in the slot.
 *
 * For use with radix_tree_lookup_slot() and
 * radix_tree_gang_lookup_tag_slot().  Caller must hold tree write locked
 * across slot lookup and replacement.
 *
 * NOTE: This cannot be used to switch between non-entries (empty slots),
 * regular entries, and value entries, as that requires accounting
 * inside the radix tree node. When switching from one type of entry to
 * another, or when deleting, use __radix_tree_lookup() and
 * __radix_tree_replace() or radix_tree_iter_replace().
 */
void radix_tree_replace_slot(struct radix_tree_root *root,
                             void __rcu **slot, void *item)
{
        /* No node is passed, so no per-node accounting can take place. */
        __radix_tree_replace(root, NULL, slot, item);
}
EXPORT_SYMBOL(radix_tree_replace_slot);

/**
 * radix_tree_iter_replace - replace item in a slot
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 * @item: new item to store in the slot.
 *
 * For use with radix_tree_for_each_slot().
 * Caller must hold tree write locked.
 *
 * The node recorded in @iter is handed to __radix_tree_replace(), so the
 * replacement is accounted against that node.
 */
void radix_tree_iter_replace(struct radix_tree_root *root,
                                const struct radix_tree_iter *iter,
                                void __rcu **slot, void *item)
{
        __radix_tree_replace(root, iter->node, slot, item);
}

/*
 * Set @tag on the slot at @offset in @node and on the corresponding slot
 * of each ancestor, stopping early at the first level where the tag is
 * already set.  If the walk runs off the top of the tree, the root's tag
 * is set as well.
 */
static void node_tag_set(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                /* Already tagged here, hence already tagged above too. */
                if (tag_get(node, tag, offset))
                        return;
                tag_set(node, tag, offset);
        }

        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);
}

/**
 * radix_tree_tag_set - set a tag on a radix tree node
 * @root:  radix tree root
 * @index: index key
 * @tag:   tag index
 *
 * Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree, from the root all the way
 * down to the leaf node.
 *
 * Returns the address of the tagged item.  Setting a tag on a not-present
 * item is a bug.
 */
void *radix_tree_tag_set(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry, *cur;
        unsigned long maxindex;

        radix_tree_load_root(root, &entry, &maxindex);
        BUG_ON(index > maxindex);

        while (radix_tree_is_internal_node(entry)) {
                unsigned slot_idx;

                cur = entry_to_node(entry);
                slot_idx = radix_tree_descend(cur, &entry, index);
                BUG_ON(!entry);

                /* Only write the tag when it is not set yet. */
                if (tag_get(cur, tag, slot_idx))
                        continue;
                tag_set(cur, tag, slot_idx);
        }

        /* set the root's tag bit */
        if (!root_tag_get(root, tag))
                root_tag_set(root, tag);

        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_set);

/*
 * Clear @tag on the slot at @offset in @node.  Whenever that leaves a
 * node with no slot carrying @tag, the tag is also cleared on the node's
 * own slot in its parent, and so on upwards; if the walk runs off the top
 * of the tree, the root's tag is cleared as well.
 */
static void node_tag_clear(struct radix_tree_root *root,
                                struct radix_tree_node *node,
                                unsigned int tag, unsigned int offset)
{
        for (; node; offset = node->offset, node = node->parent) {
                /* Not set here: nothing to undo at this level or above. */
                if (!tag_get(node, tag, offset))
                        return;
                tag_clear(node, tag, offset);
                /* Another slot still carries the tag: ancestors keep it. */
                if (any_tag_set(node, tag))
                        return;
        }

        /* clear the root's tag bit */
        if (root_tag_get(root, tag))
                root_tag_clear(root, tag);
}

/**
 * radix_tree_tag_clear - clear a tag on a radix tree node
 * @root:  radix tree root
 * @index: index key
 * @tag:   tag index
 *
 * Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 * corresponding to @index in the radix tree.  If this causes
 * the leaf node to have no tags set then clear the tag in the
 * next-to-leaf node, etc.
 *
 * Returns the address of the tagged item on success, else NULL.  ie:
 * has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry, *owner = NULL;
        unsigned long maxindex;
        int slot_idx = 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return NULL;

        /* Walk down to the slot for @index, remembering its node. */
        while (radix_tree_is_internal_node(entry)) {
                owner = entry_to_node(entry);
                slot_idx = radix_tree_descend(owner, &entry, index);
        }

        /* Nothing stored at @index: leave the tags alone. */
        if (!entry)
                return NULL;

        node_tag_clear(root, owner, tag, slot_idx);
        return entry;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

/**
 * radix_tree_iter_tag_clear - clear a tag on the current iterator entry
 * @root: radix tree root
 * @iter: iterator state
 * @tag: tag to clear
 *
 * The tag is cleared through node_tag_clear() on the node recorded in
 * @iter, at the slot offset given by iter_offset().
 */
void radix_tree_iter_tag_clear(struct radix_tree_root *root,
                        const struct radix_tree_iter *iter, unsigned int tag)
{
        node_tag_clear(root, iter->node, tag, iter_offset(iter));
}

/**
 * radix_tree_tag_get - get a tag on a radix tree node
 * @root:  radix tree root
 * @index: index key
 * @tag:   tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Return values:
 *
 *  0: tag not present or not set
 *  1: tag set
 *
 * Note that the return value of this function may not be relied on, even if
 * the RCU lock is held, unless tag modification and node deletion are excluded
 * from concurrency.
 */
int radix_tree_tag_get(const struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *entry, *cur;
        unsigned long maxindex;

        /* If the root does not carry the tag, no item does. */
        if (!root_tag_get(root, tag))
                return 0;

        radix_tree_load_root(root, &entry, &maxindex);
        if (index > maxindex)
                return 0;

        while (radix_tree_is_internal_node(entry)) {
                unsigned slot_idx;

                cur = entry_to_node(entry);
                slot_idx = radix_tree_descend(cur, &entry, index);

                /* The tag must be set at every level on the path. */
                if (!tag_get(cur, tag, slot_idx))
                        return 0;
                if (entry == RADIX_TREE_RETRY)
                        break;
        }

        return 1;
}
EXPORT_SYMBOL(radix_tree_tag_get);

/*
 * Construct the iter->tags bit-mask from the node->tags[tag] array, with
 * the bit for @offset ending up in bit 0.  Without a node the mask is
 * simply 1.
 */
static void set_iter_tags(struct radix_tree_iter *iter,
                                struct radix_tree_node *node, unsigned offset,
                                unsigned tag)
{
        unsigned word = offset / BITS_PER_LONG;
        unsigned bit = offset % BITS_PER_LONG;

        if (!node) {
                iter->tags = 1;
                return;
        }

        iter->tags = node->tags[tag][word] >> bit;

        /* Last word of the tag array: nothing more to merge in. */
        /* This is always the case if RADIX_TREE_TAG_LONGS == 1 */
        if (word >= RADIX_TREE_TAG_LONGS - 1)
                return;

        /* Pick tags from next element */
        if (bit)
                iter->tags |= node->tags[tag][word + 1] <<
                                        (BITS_PER_LONG - bit);
        /* Clip chunk size, here only BITS_PER_LONG tags */
        iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG);
}

/*
 * Move the iterator one index past its current position and discard its
 * cached chunk state (next_index is made equal to index and tags is
 * zeroed).  @slot is not used.  Always returns NULL.
 */
void __rcu **radix_tree_iter_resume(void __rcu **slot,
                                        struct radix_tree_iter *iter)
{
        unsigned long resume_at = __radix_tree_iter_add(iter, 1);

        iter->index = resume_at;
        iter->next_index = resume_at;
        iter->tags = 0;
        return NULL;
}
EXPORT_SYMBOL(radix_tree_iter_resume);

/**
 * radix_tree_next_chunk - find next chunk of slots for iteration
 *
 * @root:        radix tree root
 * @iter:        iterator state
 * @flags:        RADIX_TREE_ITER_* flags and tag index
 * Returns:        pointer to chunk first slot, or NULL if iteration is over
 *
 * The search starts at @iter->next_index.  On success @iter->index,
 * @iter->next_index and @iter->node describe the chunk found, and for
 * tagged iteration @iter->tags is filled in as well.
 */
void __rcu **radix_tree_next_chunk(const struct radix_tree_root *root,
                             struct radix_tree_iter *iter, unsigned flags)
{
        /* The low bits of @flags select the tag for tagged iteration. */
        unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK;
        struct radix_tree_node *node, *child;
        unsigned long index, offset, maxindex;

        /* Tagged walk, but the root does not carry the tag: nothing to find. */
        if ((flags & RADIX_TREE_ITER_TAGGED) && !root_tag_get(root, tag))
                return NULL;

        /*
         * Catch next_index overflow after ~0UL. iter->index never overflows
         * during iterating; it can be zero only at the beginning.
         * And we cannot overflow iter->next_index in a single step,
         * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG.
         *
         * This condition also used by radix_tree_next_slot() to stop
         * contiguous iterating, and forbid switching to the next chunk.
         */
        index = iter->next_index;
        if (!index && iter->index)
                return NULL;

 restart:
        /* Every (re)start reloads the root and walks down from the top. */
        radix_tree_load_root(root, &child, &maxindex);
        if (index > maxindex)
                return NULL;
        if (!child)
                return NULL;

        if (!radix_tree_is_internal_node(child)) {
                /* Single-slot tree */
                iter->index = index;
                iter->next_index = maxindex + 1;
                iter->tags = 1;
                iter->node = NULL;
                return (void __rcu **)&root->xa_head;
        }

        do {
                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, index);

                /*
                 * A hole is an untagged slot for tagged iteration, and an
                 * empty slot otherwise.
                 */
                if ((flags & RADIX_TREE_ITER_TAGGED) ?
                                !tag_get(node, tag, offset) : !child) {
                        /* Hole detected */
                        if (flags & RADIX_TREE_ITER_CONTIG)
                                return NULL;

                        /* Skip forward to the next candidate slot in @node. */
                        if (flags & RADIX_TREE_ITER_TAGGED)
                                offset = radix_tree_find_next_bit(node, tag,
                                                offset + 1);
                        else
                                while (++offset        < RADIX_TREE_MAP_SIZE) {
                                        void *slot = rcu_dereference_raw(
                                                        node->slots[offset]);
                                        if (slot)
                                                break;
                                }
                        /* Recompute @index so it matches the new @offset. */
                        index &= ~node_maxindex(node);
                        index += offset << node->shift;
                        /* Overflow after ~0UL */
                        if (!index)
                                return NULL;
                        /* Ran off the end of @node: retry from the root. */
                        if (offset == RADIX_TREE_MAP_SIZE)
                                goto restart;
                        child = rcu_dereference_raw(node->slots[offset]);
                }

                if (!child)
                        goto restart;
                if (child == RADIX_TREE_RETRY)
                        break;
        } while (node->shift && radix_tree_is_internal_node(child));

        /* Update the iterator state */
        iter->index = (index &~ node_maxindex(node)) | offset;
        iter->next_index = (index | node_maxindex(node)) + 1;
        iter->node = node;

        if (flags & RADIX_TREE_ITER_TAGGED)
                set_iter_tags(iter, node, offset, tag);

        return node->slots + offset;
}
EXPORT_SYMBOL(radix_tree_next_chunk);

/**
 * radix_tree_gang_lookup - perform multiple lookups on a radix tree
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 *
 * Performs an index-ascending scan of the tree for present items.  Places
 * them at *@results and returns the number of items which were placed at
 * *@results.
 *
 * The implementation is naive.
 *
 * Like radix_tree_lookup, radix_tree_gang_lookup may be called under
 * rcu_read_lock. In this case, rather than the returned results being
 * an atomic snapshot of the tree at a single point in time, the
 * semantics of an RCU protected gang lookup are as though multiple
 * radix_tree_lookups have been issued in individual locks, and results
 * stored in 'results'.
 */
unsigned int
radix_tree_gang_lookup(const struct radix_tree_root *root, void **results,
                        unsigned long first_index, unsigned int max_items)
{
        struct radix_tree_iter iter;
        unsigned int found = 0;
        void __rcu **slot;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_slot(slot, root, &iter, first_index) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored unconditionally; only kept if @found advances. */
                results[found] = entry;
                if (!entry)
                        continue;
                /* A node entry instead of an item: have the iterator retry. */
                if (radix_tree_is_internal_node(entry)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                if (++found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup);

/**
 * radix_tree_gang_lookup_tag - perform multiple lookups on a radix tree
 *                              based on a tag
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 * @tag:         the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Performs an index-ascending scan of the tree for present items which
 * have the tag indexed by @tag set.  Places the items at *@results and
 * returns the number of items which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag(const struct radix_tree_root *root, void **results,
                unsigned long first_index, unsigned int max_items,
                unsigned int tag)
{
        struct radix_tree_iter iter;
        unsigned int found = 0;
        void __rcu **slot;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                void *entry = rcu_dereference_raw(*slot);

                /* Stored unconditionally; only kept if @found advances. */
                results[found] = entry;
                if (!entry)
                        continue;
                /* A node entry instead of an item: have the iterator retry. */
                if (radix_tree_is_internal_node(entry)) {
                        slot = radix_tree_iter_retry(&iter);
                        continue;
                }
                if (++found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag);

/**
 * radix_tree_gang_lookup_tag_slot - perform multiple slot lookups on a
 *                                   radix tree based on a tag
 * @root:        radix tree root
 * @results:     where the results of the lookup are placed
 * @first_index: start the lookup from this key
 * @max_items:   place up to this many items at *results
 * @tag:         the tag index (< RADIX_TREE_MAX_TAGS)
 *
 * Performs an index-ascending scan of the tree for present items which
 * have the tag indexed by @tag set.  Places the slots at *@results and
 * returns the number of slots which were placed at *@results.
 */
unsigned int
radix_tree_gang_lookup_tag_slot(const struct radix_tree_root *root,
                void __rcu ***results, unsigned long first_index,
                unsigned int max_items, unsigned int tag)
{
        struct radix_tree_iter iter;
        unsigned int found = 0;
        void __rcu **slot;

        if (unlikely(!max_items))
                return 0;

        radix_tree_for_each_tagged(slot, root, &iter, first_index, tag) {
                /* Slots are recorded as-is, without being dereferenced. */
                results[found++] = slot;
                if (found == max_items)
                        break;
        }

        return found;
}
EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot);

/*
 * Remove the entry in @slot of @node (@node may be NULL).  For an IDR the
 * slot is marked IDR_FREE; for any other tree every tag on the slot is
 * cleared.  The slot is then set to NULL with the node's counters
 * adjusted, and the node is passed to delete_node().
 *
 * Returns the result of delete_node(), or false when there is no node.
 */
static bool __radix_tree_delete(struct radix_tree_root *root,
                                struct radix_tree_node *node, void __rcu **slot)
{
        void *victim = rcu_dereference_raw(*slot);
        int value_delta = xa_is_value(victim) ? -1 : 0;
        unsigned slot_idx = get_slot_offset(node, slot);
        int tag;

        if (!is_idr(root)) {
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
                        node_tag_clear(root, node, tag, slot_idx);
        } else {
                node_tag_set(root, node, IDR_FREE, slot_idx);
        }

        replace_slot(slot, NULL, node, -1, value_delta);

        if (!node)
                return false;
        return delete_node(root, node);
}

/**
 * radix_tree_iter_delete - delete the entry at this iterator position
 * @root: radix tree root
 * @iter: iterator state
 * @slot: pointer to slot
 *
 * Delete the entry at the position currently pointed to by the iterator.
 * This may result in the current node being freed; if it is, the iterator
 * is advanced so that it will not reference the freed memory.  This
 * function may be called without any locking if there are no other threads
 * which can access this tree.
 */
void radix_tree_iter_delete(struct radix_tree_root *root,
                                struct radix_tree_iter *iter, void __rcu **slot)
{
        bool node_freed = __radix_tree_delete(root, iter->node, slot);

        /* Jump to the next chunk rather than keep using a freed node. */
        if (node_freed)
                iter->index = iter->next_index;
}
EXPORT_SYMBOL(radix_tree_iter_delete);

/**
 * radix_tree_delete_item - delete an item from a radix tree
 * @root: radix tree root
 * @index: index key
 * @item: expected item, or %NULL to delete whatever is stored at @index
 *
 * Remove @item at @index from the radix tree rooted at @root.
 *
 * Return: the deleted entry, or %NULL if it was not present
 * or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
{
        struct radix_tree_node *node = NULL;
        void __rcu **slot = NULL;
        void *found;

        found = __radix_tree_lookup(root, index, &node, &slot);
        if (!slot)
                return NULL;

        if (!found) {
                /* Only an IDR can hold a NULL entry in an allocated slot. */
                if (!is_idr(root))
                        return NULL;
                if (node_tag_get(root, node, IDR_FREE,
                                 get_slot_offset(node, slot)))
                        return NULL;
        }

        /* The caller asked for a specific item and this is not it. */
        if (item && found != item)
                return NULL;

        __radix_tree_delete(root, node, slot);

        return found;
}
EXPORT_SYMBOL(radix_tree_delete_item);

/**
 * radix_tree_delete - delete an entry from a radix tree
 * @root: radix tree root
 * @index: index key
 *
 * Remove the entry at @index from the radix tree rooted at @root.
 *
 * Return: The deleted entry, or %NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
        /* A NULL expected item means "delete whatever is stored there". */
        return radix_tree_delete_item(root, index, NULL);
}
EXPORT_SYMBOL(radix_tree_delete);

/**
 *        radix_tree_tagged - test whether any items in the tree are tagged
 *        @root:                radix tree root
 *        @tag:                tag to test
 */
int radix_tree_tagged(const struct radix_tree_root *root, unsigned int tag)
{
        /* The answer is read directly from the root's own tag state. */
        int tagged = root_tag_get(root, tag);

        return tagged;
}
EXPORT_SYMBOL(radix_tree_tagged);

/**
 * idr_preload - preload for idr_alloc()
 * @gfp_mask: allocation mask to use for preloading
 *
 * Preallocate memory to use for the next call to idr_alloc().  This function
 * returns with preemption disabled.  It will be enabled by idr_preload_end().
 */
void idr_preload(gfp_t gfp_mask)
{
        /* A zero return from the preload helper needs no further action here. */
        if (!__radix_tree_preload(gfp_mask, IDR_PRELOAD_SIZE))
                return;

        /*
         * Non-zero return: take the per-CPU preload lock ourselves.
         * NOTE(review): presumably the helper leaves the lock held on success
         * and released on failure, so the caller ends up holding it either
         * way - confirm against __radix_tree_preload().
         */
        local_lock(&radix_tree_preloads.lock);
}
EXPORT_SYMBOL(idr_preload);

/*
 * idr_get_free - find a slot tagged IDR_FREE at or above iter->next_index
 * @root: tree to search (and grow, if required)
 * @iter: supplies the starting index in ->next_index; filled in on success
 * @gfp:  allocation mask passed on when nodes have to be allocated
 * @max:  highest index the caller will accept
 *
 * Descends from the root, allocating missing interior nodes on the way, until
 * it reaches a slot whose IDR_FREE tag is set.  On success @iter->index,
 * @iter->next_index, @iter->node and the iterator tags are updated and the
 * slot pointer is returned.
 *
 * Return: the slot, ERR_PTR(-ENOSPC) if no candidate index <= @max exists,
 * ERR_PTR(-ENOMEM) if a node could not be allocated, or the negative error
 * reported by radix_tree_extend() wrapped in ERR_PTR().
 */
void __rcu **idr_get_free(struct radix_tree_root *root,
                              struct radix_tree_iter *iter, gfp_t gfp,
                              unsigned long max)
{
        struct radix_tree_node *node = NULL, *child;
        void __rcu **slot = (void __rcu **)&root->xa_head;
        unsigned long maxindex, start = iter->next_index;
        unsigned int shift, offset = 0;

        /* Re-entered from the descent loop when the walk climbs past the root. */
 grow:
        shift = radix_tree_load_root(root, &child, &maxindex);
        /* Root not tagged IDR_FREE: start beyond everything the tree covers. */
        if (!radix_tree_tagged(root, IDR_FREE))
                start = max(start, maxindex + 1);
        if (start > max)
                return ERR_PTR(-ENOSPC);

        /* The tree is too small to hold @start: extend it first. */
        if (start > maxindex) {
                int error = radix_tree_extend(root, gfp, start, shift);
                if (error < 0)
                        return ERR_PTR(error);
                /* A non-negative return value is used as the new shift. */
                shift = error;
                child = rcu_dereference_raw(root->xa_head);
        }
        /* Make sure the loop below runs once for index 0 with a zero shift. */
        if (start == 0 && shift == 0)
                shift = RADIX_TREE_MAP_SHIFT;

        while (shift) {
                shift -= RADIX_TREE_MAP_SHIFT;
                if (child == NULL) {
                        /* Have to add a child node.  */
                        child = radix_tree_node_alloc(gfp, node, root, shift,
                                                        offset, 0, 0);
                        if (!child)
                                return ERR_PTR(-ENOMEM);
                        /* Every slot of a freshly added node starts out free. */
                        all_tag_set(child, IDR_FREE);
                        rcu_assign_pointer(*slot, node_to_entry(child));
                        if (node)
                                node->count++;
                } else if (!radix_tree_is_internal_node(child))
                        break;

                node = entry_to_node(child);
                offset = radix_tree_descend(node, &child, start);
                if (!tag_get(node, IDR_FREE, offset)) {
                        /* This slot is not free: move to the next free one. */
                        offset = radix_tree_find_next_bit(node, IDR_FREE,
                                                        offset + 1);
                        start = next_index(start, node, offset);
                        /* NOTE(review): start == 0 presumably means the index wrapped. */
                        if (start > max || start == 0)
                                return ERR_PTR(-ENOSPC);
                        /* No free slot left in this node: climb to the parent. */
                        while (offset == RADIX_TREE_MAP_SIZE) {
                                offset = node->offset + 1;
                                node = node->parent;
                                if (!node)
                                        goto grow;
                                shift = node->shift;
                        }
                        child = rcu_dereference_raw(node->slots[offset]);
                }
                slot = &node->slots[offset];
        }

        /* Publish the result through the iterator. */
        iter->index = start;
        if (node)
                iter->next_index = 1 + min(max, (start | node_maxindex(node)));
        else
                iter->next_index = 1;
        iter->node = node;
        set_iter_tags(iter, node, offset, IDR_FREE);

        return slot;
}

/**
 * idr_destroy - release all internal memory from an IDR
 * @idr: idr handle
 *
 * After this function is called, the IDR is empty, and may be reused or
 * the data structure containing it may be freed.
 *
 * A typical clean-up sequence for objects stored in an idr tree will use
 * idr_for_each() to free all objects, if necessary, then idr_destroy() to
 * free the memory used to keep track of those objects.
 */
void idr_destroy(struct idr *idr)
{
        struct radix_tree_node *node = rcu_dereference_raw(idr->idr_rt.xa_head);
        if (radix_tree_is_internal_node(node))
                radix_tree_free_nodes(node);
        idr->idr_rt.xa_head = NULL;
        root_tag_set(&idr->idr_rt, IDR_FREE);
}
EXPORT_SYMBOL(idr_destroy);

/* Slab constructor: hand out nodes zeroed and with an empty private_list. */
static void radix_tree_node_ctor(void *arg)
{
        struct radix_tree_node *fresh = arg;

        memset(fresh, 0, sizeof(struct radix_tree_node));
        INIT_LIST_HEAD(&fresh->private_list);
}

/*
 * CPU hotplug callback: release every node still held in @cpu's preload
 * pool.  Always returns 0.
 */
static int radix_tree_cpu_dead(unsigned int cpu)
{
        struct radix_tree_preload *pool = &per_cpu(radix_tree_preloads, cpu);

        /* The pooled nodes are chained through ->parent; pop and free each one. */
        for (; pool->nr; pool->nr--) {
                struct radix_tree_node *victim = pool->nodes;

                pool->nodes = victim->parent;
                kmem_cache_free(radix_tree_node_cachep, victim);
        }

        return 0;
}

/*
 * Boot-time setup: create the node slab cache and register the CPU-dead
 * hotplug callback that drains per-CPU preload pools.
 */
void __init radix_tree_init(void)
{
        int err;

        /* Compile-time sanity checks on constants this file relies on. */
        BUILD_BUG_ON(XA_CHUNK_SIZE > 255);
        BUILD_BUG_ON(ROOT_IS_IDR & ~GFP_ZONEMASK);
        BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32);

        /* SLAB_PANIC is requested; the returned pointer is not checked here. */
        radix_tree_node_cachep = kmem_cache_create("radix_tree_node",
                                                   sizeof(struct radix_tree_node),
                                                   0,
                                                   SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
                                                   radix_tree_node_ctor);

        /* No startup callback is given, only radix_tree_cpu_dead() for teardown. */
        err = cpuhp_setup_state_nocalls(CPUHP_RADIX_DEAD, "lib/radix:dead",
                                        NULL, radix_tree_cpu_dead);
        WARN_ON(err < 0);
}






























































































































































  201 




  166 










  169 
  169 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ASM_ARM_KVM_ARCH_TIMER_H
#define __ASM_ARM_KVM_ARCH_TIMER_H

#include <linux/clocksource.h>
#include <linux/hrtimer.h>

/*
 * Index of each per-vCPU timer context.  The values double as indices into
 * arch_timer_cpu::timers[] and arch_timer_vm_data::ppi[] (both sized by
 * NR_KVM_TIMERS), so the order must not change.
 */
enum kvm_arch_timers {
        TIMER_PTIMER,
        TIMER_VTIMER,
        /* Number of entries above this line (2). */
        NR_KVM_EL0_TIMERS,
        /* Reuses the count value, so numbering stays contiguous. */
        TIMER_HVTIMER = NR_KVM_EL0_TIMERS,
        TIMER_HPTIMER,
        /* Total number of timers (4). */
        NR_KVM_TIMERS
};

/*
 * Selector for one register of a timer; passed as the @treg argument of
 * kvm_arm_timer_read_sysreg() / kvm_arm_timer_write_sysreg().
 */
enum kvm_arch_timer_regs {
        TIMER_REG_CNT,
        TIMER_REG_CVAL,
        TIMER_REG_TVAL,
        TIMER_REG_CTL,
        TIMER_REG_VOFF,
};

/*
 * Describes the counter offset of one timer context as up to two optional
 * components; timer_get_offset() adds together whichever pointers are set.
 */
struct arch_timer_offset {
        /*
         * If set, pointer to one of the offsets in the kvm's offset
         * structure. If NULL, assume a zero offset.
         */
        u64        *vm_offset;
        /*
         * If set, pointer to one of the offsets in the vcpu's sysreg
         * array. If NULL, assume a zero offset.
         */
        u64        *vcpu_offset;
};

/* Timer state shared by all vCPUs of a VM (reached via timer_vm_data()). */
struct arch_timer_vm_data {
        /* Offset applied to the virtual timer/counter */
        u64        voffset;
        /* Offset applied to the physical timer/counter */
        u64        poffset;

        /* The PPI for each timer, global to the VM */
        /* Indexed by enum kvm_arch_timers; see timer_irq(). */
        u8        ppi[NR_KVM_TIMERS];
};

/*
 * State of a single timer belonging to a vCPU.  Instances live in
 * arch_timer_cpu::timers[], which is what lets arch_timer_ctx_index()
 * recover the timer's index by pointer subtraction.
 */
struct arch_timer_context {
        /* Owning vCPU; used by arch_timer_ctx_index() and timer_vm_data(). */
        struct kvm_vcpu                        *vcpu;

        /* Emulated Timer (may be unused) */
        struct hrtimer                        hrtimer;
        u64                                ns_frac;

        /* Offset for this counter/timer */
        struct arch_timer_offset        offset;
        /*
         * We have multiple paths which can save/restore the timer state onto
         * the hardware, so we need some way of keeping track of where the
         * latest state is.
         */
        bool                                loaded;

        /* Output level of the timer IRQ */
        struct {
                bool                        level;
        } irq;

        /* Duplicated state from arch_timer.c for convenience */
        u32                                host_timer_irq;
};

/*
 * Set of pointers to a vCPU's timer contexts, grouped by the field names
 * into "direct" and "emul" virtual/physical timers.  Filled in by
 * get_timer_map().
 * NOTE(review): which contexts end up in which field is decided by
 * get_timer_map(), whose body is not part of this header.
 */
struct timer_map {
        struct arch_timer_context *direct_vtimer;
        struct arch_timer_context *direct_ptimer;
        struct arch_timer_context *emul_vtimer;
        struct arch_timer_context *emul_ptimer;
};

/* Populate @map for @vcpu. */
void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map);

/* Per-vCPU timer state; accessed through vcpu_timer(). */
struct arch_timer_cpu {
        /* One context per timer, indexed by enum kvm_arch_timers. */
        struct arch_timer_context timers[NR_KVM_TIMERS];

        /* Background timer used when the guest is not running */
        struct hrtimer                        bg_timer;

        /* Is the timer enabled */
        bool                        enabled;
};

/*
 * Timer entry points.  Only the declarations live in this header; the
 * implementations are elsewhere.
 */

/* Global init and per-vCPU lifecycle / synchronisation hooks. */
int __init kvm_timer_hyp_init(bool has_gic);
int kvm_timer_enable(struct kvm_vcpu *vcpu);
void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu);
void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
void kvm_timer_sync_nested(struct kvm_vcpu *vcpu);
void kvm_timer_sync_user(struct kvm_vcpu *vcpu);
bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu);
void kvm_timer_update_run(struct kvm_vcpu *vcpu);
void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);

/* Per-VM initialisation. */
void kvm_timer_init_vm(struct kvm *kvm);

/* Register accessors keyed by a 64-bit register id. */
u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);

/* Device-attribute accessors for the timer. */
int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);

u64 kvm_phys_timer_read(void);

/* Called around a vCPU being loaded onto / put off a CPU. */
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu);
void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu);

void kvm_timer_init_vhe(void);

/* Address of the vCPU's struct arch_timer_cpu. */
#define vcpu_timer(v)        (&(v)->arch.timer_cpu)
/* Address of timer context @t (an enum kvm_arch_timers index) of vCPU @v. */
#define vcpu_get_timer(v,t)        (&vcpu_timer(v)->timers[(t)])
/* Shorthands for each of the four fixed timer contexts. */
#define vcpu_vtimer(v)        (&(v)->arch.timer_cpu.timers[TIMER_VTIMER])
#define vcpu_ptimer(v)        (&(v)->arch.timer_cpu.timers[TIMER_PTIMER])
#define vcpu_hvtimer(v)        (&(v)->arch.timer_cpu.timers[TIMER_HVTIMER])
#define vcpu_hptimer(v)        (&(v)->arch.timer_cpu.timers[TIMER_HPTIMER])

/*
 * Index of @ctx within its vCPU's timers[] array, obtained by pointer
 * subtraction; @ctx must therefore point into that array.
 */
#define arch_timer_ctx_index(ctx)        ((ctx) - vcpu_timer((ctx)->vcpu)->timers)

/* VM-wide timer data of the VM that owns @ctx. */
#define timer_vm_data(ctx)                (&(ctx)->vcpu->kvm->arch.timer_data)
/* PPI entry of the VM-wide ppi[] table that corresponds to @ctx. */
#define timer_irq(ctx)                        (timer_vm_data(ctx)->ppi[arch_timer_ctx_index(ctx)])

/* Read register @treg of timer @tmr for @vcpu. */
u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
                              enum kvm_arch_timers tmr,
                              enum kvm_arch_timer_regs treg);
/* Write @val to register @treg of timer @tmr for @vcpu. */
void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
                                enum kvm_arch_timers tmr,
                                enum kvm_arch_timer_regs treg,
                                u64 val);

/* Needed for tracing */
u32 timer_get_ctl(struct arch_timer_context *ctxt);
u64 timer_get_cval(struct arch_timer_context *ctxt);

/* CPU HP callbacks */
void kvm_timer_cpu_up(void);
void kvm_timer_cpu_down(void);

/* CNTKCTL_EL1 valid bits as of DDI0487J.a */
/* Bit 17 plus bits 9..0. */
#define CNTKCTL_VALID_BITS        (BIT(17) | GENMASK_ULL(9, 0))

/* Static key tested by has_broken_cntvoff(); declared with a false default. */
DECLARE_STATIC_KEY_FALSE(broken_cntvoff_key);

/* Report whether the broken_cntvoff_key static key is enabled. */
static inline bool has_broken_cntvoff(void)
{
        bool broken = static_branch_unlikely(&broken_cntvoff_key);

        return broken;
}

/* True only when running with VHE and the ARM64_HAS_ECV_CNTPOFF cap is set. */
static inline bool has_cntpoff(void)
{
        bool vhe = has_vhe();

        /* The capability is only consulted when VHE is in use. */
        return vhe && cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF);
}

/*
 * Total counter offset for @ctxt: the sum of the VM-wide and per-vCPU
 * components, each counted as zero when its pointer is NULL.  A NULL
 * @ctxt yields 0.
 */
static inline u64 timer_get_offset(struct arch_timer_context *ctxt)
{
        u64 vm_part, vcpu_part;

        if (!ctxt)
                return 0;

        vm_part = ctxt->offset.vm_offset ? *ctxt->offset.vm_offset : 0;
        vcpu_part = ctxt->offset.vcpu_offset ? *ctxt->offset.vcpu_offset : 0;

        return vm_part + vcpu_part;
}

#endif

























































































    3 





















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
 *
 * Author(s):
 *        2011-2014 Arvid Brodin, arvid.brodin@alten.se
 *
 * Frame handler and other utility functions for HSR and PRP.
 */

#include "hsr_slave.h"
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include "hsr_main.h"
#include "hsr_device.h"
#include "hsr_forward.h"
#include "hsr_framereg.h"

/* A frame is invalid here unless its EtherType is ETH_P_PRP or ETH_P_HSR. */
bool hsr_invalid_dan_ingress_frame(__be16 protocol)
{
        return !(protocol == htons(ETH_P_PRP) ||
                 protocol == htons(ETH_P_HSR));
}

/*
 * rx_handler installed on HSR/PRP slave devices.  Returns RX_HANDLER_PASS
 * for frames this layer does not handle and RX_HANDLER_CONSUMED once the
 * skb has been freed or handed to hsr_forward_skb().
 */
static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct hsr_port *port;
        struct hsr_priv *hsr;
        __be16 protocol;

        /* Packets from dev_loopback_xmit() do not have L2 header, bail out */
        if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
                return RX_HANDLER_PASS;

        if (!skb_mac_header_was_set(skb)) {
                WARN_ONCE(1, "%s: skb invalid", __func__);
                return RX_HANDLER_PASS;
        }

        port = hsr_port_get_rcu(skb->dev);
        if (!port)
                return RX_HANDLER_PASS;
        hsr = port->hsr;

        /* Directly kill frames sent by ourselves */
        if (hsr_addr_is_self(hsr, eth_hdr(skb)->h_source)) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED;
        }

        /* For HSR, only tagged frames are expected (unless the device offloads
         * HSR tag removal), but for PRP there could be non tagged frames as
         * well from Single attached nodes (SANs).
         */
        protocol = eth_hdr(skb)->h_proto;

        if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
            port->type != HSR_PT_INTERLINK &&
            hsr->proto_ops->invalid_dan_ingress_frame &&
            hsr->proto_ops->invalid_dan_ingress_frame(protocol))
                return RX_HANDLER_PASS;

        /* Re-expose the Ethernet header and recompute the header offsets. */
        skb_push(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        if (protocol == htons(ETH_P_HSR) ||
            (protocol == htons(ETH_P_PRP) && !hsr->prot_version))
                skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
        skb_reset_mac_len(skb);

        /* Only the frames received over the interlink port will assign a
         * sequence number and require synchronisation vs other sender.
         */
        if (port->type != HSR_PT_INTERLINK) {
                hsr_forward_skb(skb, port);
                return RX_HANDLER_CONSUMED;
        }

        spin_lock_bh(&hsr->seqnr_lock);
        hsr_forward_skb(skb, port);
        spin_unlock_bh(&hsr->seqnr_lock);

        return RX_HANDLER_CONSUMED;
}

/* A device counts as an HSR port when its rx_handler is hsr_handle_frame(). */
bool hsr_port_exists(const struct net_device *dev)
{
        return hsr_handle_frame == rcu_access_pointer(dev->rx_handler);
}

/*
 * Check whether @dev may be enslaved.  Returns 0 if so; otherwise sets an
 * extack message and returns -EINVAL (or -EOPNOTSUPP for devices flagged
 * IFF_DONT_BRIDGE).
 */
static int hsr_check_dev_ok(struct net_device *dev,
                            struct netlink_ext_ack *extack)
{
        int err = -EINVAL;

        if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
            dev->addr_len != ETH_ALEN) {
                /* Don't allow HSR on non-ethernet like devices */
                NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave.");
        } else if (is_hsr_master(dev)) {
                /* Don't allow enslaving hsr devices */
                NL_SET_ERR_MSG_MOD(extack,
                                   "Cannot create trees of HSR devices.");
        } else if (hsr_port_exists(dev)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "This device is already a HSR slave.");
        } else if (is_vlan_dev(dev)) {
                NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver.");
        } else if (dev->priv_flags & IFF_DONT_BRIDGE) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "This device does not support bridging.");
                err = -EOPNOTSUPP;
        } else {
                /* HSR over bonded devices has not been tested, but I'm not
                 * sure it won't work...
                 */
                err = 0;
        }

        return err;
}

/* Prepare @dev for use as a slave port: promiscuity (unless forwarding is
 * offloaded), upper-device link to the master, and rx_handler registration.
 * Each step already taken is undone if a later one fails.
 */
static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
                             struct hsr_port *port,
                             struct netlink_ext_ack *extack)

{
        struct net_device *hsr_dev;
        int res;

        /* Don't use promiscuous mode for offload since L2 frame forward
         * happens at the offloaded hardware.
         */
        if (!port->hsr->fwd_offloaded) {
                res = dev_set_promiscuity(dev, 1);
                if (res)
                        return res;
        }

        hsr_dev = hsr_port_get_hsr(hsr, HSR_PT_MASTER)->dev;

        res = netdev_upper_dev_link(dev, hsr_dev, extack);
        if (!res) {
                res = netdev_rx_handler_register(dev, hsr_handle_frame, port);
                if (!res) {
                        dev_disable_lro(dev);
                        return 0;
                }

                /* Handler registration failed: drop the link again. */
                netdev_upper_dev_unlink(dev, hsr_dev);
        }

        /* Failure path: give back the promiscuity reference taken above. */
        if (!port->hsr->fwd_offloaded)
                dev_set_promiscuity(dev, -1);

        return res;
}

/*
 * Create a port of @type on @hsr backed by @dev and append it to the port
 * list.  Non-master ports are validated and set up as slaves first.
 *
 * Returns 0 on success, -EBUSY if a port of @type already exists, -ENOMEM
 * on allocation failure, or the error from the device check / setup.
 */
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
                 enum hsr_port_type type, struct netlink_ext_ack *extack)
{
        struct hsr_port *port, *master;
        int res;

        if (type != HSR_PT_MASTER) {
                res = hsr_check_dev_ok(dev, extack);
                if (res)
                        return res;
        }

        /* This port already exists */
        if (hsr_port_get_hsr(hsr, type))
                return -EBUSY;

        port = kzalloc(sizeof(*port), GFP_KERNEL);
        if (!port)
                return -ENOMEM;

        port->type = type;
        port->dev = dev;
        port->hsr = hsr;

        if (type != HSR_PT_MASTER) {
                res = hsr_portdev_setup(hsr, dev, port, extack);
                if (res) {
                        kfree(port);
                        return res;
                }
        }

        list_add_tail_rcu(&port->port_list, &hsr->ports);

        /* Refresh the master's features and MTU now that the port set changed. */
        master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
        netdev_update_features(master->dev);
        dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));

        return 0;
}

void hsr_del_port(struct hsr_port *port)
{
        struct hsr_priv *hsr;
        struct hsr_port *master;

        hsr = port->hsr;
        master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
        list_del_rcu(&port->port_list);

        if (port != master) {
                netdev_update_features(master->dev);
                dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
                netdev_rx_handler_unregister(port->dev);
                if (!port->hsr->fwd_offloaded)
                        dev_set_promiscuity(port->dev, -1);
                netdev_upper_dev_unlink(port->dev, master->dev);
        }

        kfree_rcu(port, rcu);
}















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  400 










  401 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include "internal.h"

struct backing_dev_info noop_backing_dev_info;
EXPORT_SYMBOL_GPL(noop_backing_dev_info);

static const char *bdi_unknown_name = "(unknown)";

/*
 * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
 * reader side locking.
 */
DEFINE_SPINLOCK(bdi_lock);
static u64 bdi_id_cursor;
static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);

/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>

/*
 * Accumulator for the debugfs statistics files.  collect_wb_stats() adds
 * one bdi_writeback's figures into it; the caller sets @dirty_thresh
 * beforehand.
 */
struct wb_stats {
        /* Number of inodes found on wb->b_dirty. */
        unsigned long nr_dirty;
        /* Number of inodes found on wb->b_io. */
        unsigned long nr_io;
        /* Number of inodes found on wb->b_more_io. */
        unsigned long nr_more_io;
        /* Inodes on wb->b_dirty_time that have I_DIRTY_TIME set. */
        unsigned long nr_dirty_time;
        /* Sums of the WB_WRITEBACK / WB_RECLAIMABLE / WB_DIRTIED / WB_WRITTEN counters. */
        unsigned long nr_writeback;
        unsigned long nr_reclaimable;
        unsigned long nr_dirtied;
        unsigned long nr_written;
        /* Input: threshold handed to wb_calc_thresh(). */
        unsigned long dirty_thresh;
        /* Sum of wb_calc_thresh() results. */
        unsigned long wb_thresh;
};

static struct dentry *bdi_debug_root;

/* Create the top-level "bdi" debugfs directory and remember its dentry. */
static void bdi_debug_init(void)
{
        struct dentry *dir = debugfs_create_dir("bdi", NULL);

        bdi_debug_root = dir;
}

static void collect_wb_stats(struct wb_stats *stats,
                             struct bdi_writeback *wb)
{
        struct inode *inode;

        spin_lock(&wb->list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_io_list)
                stats->nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_io_list)
                stats->nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_io_list)
                stats->nr_more_io++;
        list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
                if (inode->i_state & I_DIRTY_TIME)
                        stats->nr_dirty_time++;
        spin_unlock(&wb->list_lock);

        stats->nr_writeback += wb_stat(wb, WB_WRITEBACK);
        stats->nr_reclaimable += wb_stat(wb, WB_RECLAIMABLE);
        stats->nr_dirtied += wb_stat(wb, WB_DIRTIED);
        stats->nr_written += wb_stat(wb, WB_WRITTEN);
        stats->wb_thresh += wb_calc_thresh(wb, stats->dirty_thresh);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Sum the statistics of every writeback structure on @bdi's wb_list.
 * Entries for which wb_tryget() fails are skipped.
 */
static void bdi_collect_stats(struct backing_dev_info *bdi,
                              struct wb_stats *stats)
{
        struct bdi_writeback *cur;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &bdi->wb_list, bdi_node) {
                if (wb_tryget(cur)) {
                        collect_wb_stats(stats, cur);
                        wb_put(cur);
                }
        }
        rcu_read_unlock();
}
#else
/* Without cgroup writeback only the embedded bdi->wb is collected. */
static void bdi_collect_stats(struct backing_dev_info *bdi,
                              struct wb_stats *stats)
{
        struct bdi_writeback *only = &bdi->wb;

        collect_wb_stats(stats, only);
}
#endif

static int bdi_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        struct wb_stats stats;
        unsigned long tot_bw;

        global_dirty_limits(&background_thresh, &dirty_thresh);

        memset(&stats, 0, sizeof(stats));
        stats.dirty_thresh = dirty_thresh;
        bdi_collect_stats(bdi, &stats);
        tot_bw = atomic_long_read(&bdi->tot_write_bandwidth);

        seq_printf(m,
                   "BdiWriteback:       %10lu kB\n"
                   "BdiReclaimable:     %10lu kB\n"
                   "BdiDirtyThresh:     %10lu kB\n"
                   "DirtyThresh:        %10lu kB\n"
                   "BackgroundThresh:   %10lu kB\n"
                   "BdiDirtied:         %10lu kB\n"
                   "BdiWritten:         %10lu kB\n"
                   "BdiWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:            %10lu\n"
                   "b_io:               %10lu\n"
                   "b_more_io:          %10lu\n"
                   "b_dirty_time:       %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
                   K(stats.nr_writeback),
                   K(stats.nr_reclaimable),
                   K(stats.wb_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
                   K(stats.nr_dirtied),
                   K(stats.nr_written),
                   K(tot_bw),
                   stats.nr_dirty,
                   stats.nr_io,
                   stats.nr_more_io,
                   stats.nr_dirty_time,
                   !list_empty(&bdi->bdi_list), bdi->wb.state);

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);

/*
 * Print one record for @wb to @m using the figures already gathered in
 * @stats.  The first field is the cgroup inode number when
 * CONFIG_CGROUP_WRITEBACK is enabled and the constant 1 otherwise; the
 * bandwidth and state fields are read from @wb itself.
 */
static void wb_stats_show(struct seq_file *m, struct bdi_writeback *wb,
                          struct wb_stats *stats)
{

        seq_printf(m,
                   "WbCgIno:           %10lu\n"
                   "WbWriteback:       %10lu kB\n"
                   "WbReclaimable:     %10lu kB\n"
                   "WbDirtyThresh:     %10lu kB\n"
                   "WbDirtied:         %10lu kB\n"
                   "WbWritten:         %10lu kB\n"
                   "WbWriteBandwidth:  %10lu kBps\n"
                   "b_dirty:           %10lu\n"
                   "b_io:              %10lu\n"
                   "b_more_io:         %10lu\n"
                   "b_dirty_time:      %10lu\n"
                   "state:             %10lx\n\n",
#ifdef CONFIG_CGROUP_WRITEBACK
                   cgroup_ino(wb->memcg_css->cgroup),
#else
                   1ul,
#endif
                   K(stats->nr_writeback),
                   K(stats->nr_reclaimable),
                   K(stats->wb_thresh),
                   K(stats->nr_dirtied),
                   K(stats->nr_written),
                   K(wb->avg_write_bandwidth),
                   stats->nr_dirty,
                   stats->nr_io,
                   stats->nr_more_io,
                   stats->nr_dirty_time,
                   wb->state);
}

/*
 * seq_file show callback used for the per-bdi debugfs "wb_stats" file.
 *
 * Walks every writeback context linked on the bdi's wb_list (the bdi is
 * taken from @m->private) and prints one wb_stats_show() record for each.
 * A wb whose reference cannot be obtained with wb_tryget() is skipped.
 *
 * Always returns 0.
 */
static int cgwb_debug_stats_show(struct seq_file *m, void *v)
{
        struct backing_dev_info *bdi = m->private;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        struct bdi_writeback *wb;

        /* Only dirty_thresh is consumed below; background_thresh is unused. */
        global_dirty_limits(&background_thresh, &dirty_thresh);

        rcu_read_lock();
        list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
                /* Fresh per-wb accumulator, seeded with the global threshold. */
                struct wb_stats stats = { .dirty_thresh = dirty_thresh };

                /* Pin the wb so it stays valid while RCU is dropped below. */
                if (!wb_tryget(wb))
                        continue;

                collect_wb_stats(&stats, wb);

                /*
                 * Calculate thresh of wb in writeback cgroup which is min of
                 * thresh in global domain and thresh in cgroup domain. Drop
                 * rcu lock because cgwb_calc_thresh may sleep in
                 * cgroup_rstat_flush. We can do so here because we have a ref.
                 */
                if (mem_cgroup_wb_domain(wb)) {
                        rcu_read_unlock();
                        stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
                        rcu_read_lock();
                }

                wb_stats_show(m, wb, &stats);

                /* Drop the reference taken by wb_tryget() above. */
                wb_put(wb);
        }
        rcu_read_unlock();

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);

/*
 * Create the debugfs directory @name for @bdi below bdi_debug_root and
 * add the read-only (0444) "stats" and "wb_stats" files to it.  The
 * directory is recorded in bdi->debug_dir so that bdi_debug_unregister()
 * can remove it again.
 */
static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
        struct dentry *dir = debugfs_create_dir(name, bdi_debug_root);

        bdi->debug_dir = dir;

        /* Both files receive the bdi itself as their private data. */
        debugfs_create_file("stats", 0444, dir, bdi,
                            &bdi_debug_stats_fops);
        debugfs_create_file("wb_stats", 0444, dir, bdi,
                            &cgwb_debug_stats_fops);
}

/*
 * Remove @bdi's debugfs directory (bdi->debug_dir) together with
 * everything beneath it.
 */
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
        debugfs_remove_recursive(bdi->debug_dir);
}
#else /* CONFIG_DEBUG_FS */
/*
 * CONFIG_DEBUG_FS is disabled: the three bdi debug hooks below are empty
 * inline stubs so that callers need no #ifdef of their own.
 */
static inline void bdi_debug_init(void)
{
}
static inline void bdi_debug_register(struct backing_dev_info *bdi,
                                      const char *name)
{
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
}
#endif /* CONFIG_DEBUG_FS */

/*
 * sysfs store handler for "read_ahead_kb".
 *
 * Parses @buf as a base-10 unsigned long number of kilobytes, converts it
 * to pages and stores the result in bdi->ra_pages.
 *
 * Returns @count on success or the negative error from kstrtoul().
 */
static ssize_t read_ahead_kb_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned long kb;
        ssize_t err;

        err = kstrtoul(buf, 10, &kb);
        if (err < 0)
                return err;

        /* kB -> pages: one page holds 2^(PAGE_SHIFT - 10) kB. */
        bdi->ra_pages = kb >> (PAGE_SHIFT - 10);
        return count;
}

/*
 * BDI_SHOW(name, expr) - generate the show side of a bdi sysfs attribute.
 *
 * Expands to name##_show(), which fetches the bdi from the device's
 * drvdata and prints @expr as a "%lld" decimal followed by a newline,
 * and then declares the attribute with DEVICE_ATTR_RW(name).  @expr may
 * refer to the local variable `bdi`.
 *
 * Every use in this file is preceded by a matching name##_store()
 * definition; DEVICE_ATTR_RW() is expected to pick that up as the store
 * handler.
 */
#define BDI_SHOW(name, expr)                                                \
static ssize_t name##_show(struct device *dev,                                \
                           struct device_attribute *attr, char *buf)        \
{                                                                        \
        struct backing_dev_info *bdi = dev_get_drvdata(dev);                \
                                                                        \
        return sysfs_emit(buf, "%lld\n", (long long)expr);                \
}                                                                        \
static DEVICE_ATTR_RW(name);

/* read_ahead_kb shows bdi->ra_pages passed through K(). */
BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))

/*
 * sysfs store handler for "min_ratio".
 *
 * Parses @buf as a base-10 unsigned int and passes it to
 * bdi_set_min_ratio().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtouint() or the non-zero result of bdi_set_min_ratio().
 */
static ssize_t min_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_ratio(bdi, val);
        if (err)
                return err;

        return count;
}
/* The shown value is bdi->min_ratio divided by BDI_RATIO_SCALE. */
BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store handler for "min_ratio_fine".
 *
 * Parses @buf as a base-10 unsigned int and passes it to
 * bdi_set_min_ratio_no_scale().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtouint() or the non-zero result of bdi_set_min_ratio_no_scale().
 */
static ssize_t min_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_ratio_no_scale(bdi, val);
        if (err)
                return err;

        return count;
}
/* The shown value is bdi->min_ratio as stored, without rescaling. */
BDI_SHOW(min_ratio_fine, bdi->min_ratio)

/*
 * sysfs store handler for "max_ratio".
 *
 * Parses @buf as a base-10 unsigned int and passes it to
 * bdi_set_max_ratio().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtouint() or the non-zero result of bdi_set_max_ratio().
 */
static ssize_t max_ratio_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_ratio(bdi, val);
        if (err)
                return err;

        return count;
}
/* The shown value is bdi->max_ratio divided by BDI_RATIO_SCALE. */
BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE)

/*
 * sysfs store handler for "max_ratio_fine".
 *
 * Parses @buf as a base-10 unsigned int and passes it to
 * bdi_set_max_ratio_no_scale().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtouint() or the non-zero result of bdi_set_max_ratio_no_scale().
 */
static ssize_t max_ratio_fine_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_ratio_no_scale(bdi, val);
        if (err)
                return err;

        return count;
}
/* The shown value is bdi->max_ratio as stored, without rescaling. */
BDI_SHOW(max_ratio_fine, bdi->max_ratio)

static ssize_t min_bytes_show(struct device *dev,
                              struct device_attribute *attr,
                              char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);

        return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi));
}

/*
 * sysfs store handler for "min_bytes".
 *
 * Parses @buf as a base-10 u64 and passes it to bdi_set_min_bytes().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtoull() or the non-zero result of bdi_set_min_bytes().
 */
static ssize_t min_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 val;
        ssize_t err;

        err = kstrtoull(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_min_bytes(bdi, val);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_RW(min_bytes);

static ssize_t max_bytes_show(struct device *dev,
                              struct device_attribute *attr,
                              char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);

        return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi));
}

/*
 * sysfs store handler for "max_bytes".
 *
 * Parses @buf as a base-10 u64 and passes it to bdi_set_max_bytes().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtoull() or the non-zero result of bdi_set_max_bytes().
 */
static ssize_t max_bytes_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        u64 val;
        ssize_t err;

        err = kstrtoull(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_max_bytes(bdi, val);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_RW(max_bytes);

/*
 * sysfs show handler for the retired "stable_pages_required" attribute.
 *
 * The file is still present but always reads back "0"; the first read
 * additionally logs a one-time warning pointing at the replacement
 * attribute.
 */
static ssize_t stable_pages_required_show(struct device *dev,
                                          struct device_attribute *attr,
                                          char *buf)
{
        dev_warn_once(dev,
                "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
        return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);

/*
 * sysfs store handler for "strict_limit".
 *
 * Parses @buf as a base-10 unsigned int and passes it to
 * bdi_set_strict_limit().
 *
 * Returns @count on success, otherwise the negative error from
 * kstrtouint() or the non-zero result of bdi_set_strict_limit().
 */
static ssize_t strict_limit_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        unsigned int val;
        ssize_t err;

        err = kstrtouint(buf, 10, &val);
        if (err < 0)
                return err;

        err = bdi_set_strict_limit(bdi, val);
        if (err)
                return err;

        return count;
}

/*
 * sysfs show handler for "strict_limit": prints 1 when
 * BDI_CAP_STRICTLIMIT is set in bdi->capabilities and 0 otherwise.
 */
static ssize_t strict_limit_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        struct backing_dev_info *bdi = dev_get_drvdata(dev);
        int strict = (bdi->capabilities & BDI_CAP_STRICTLIMIT) ? 1 : 0;

        return sysfs_emit(buf, "%d\n", strict);
}
static DEVICE_ATTR_RW(strict_limit);

/*
 * NULL-terminated list of the sysfs attributes defined above.
 * ATTRIBUTE_GROUPS(bdi_dev) builds bdi_dev_groups from it, which
 * bdi_class installs as its dev_groups.
 */
static struct attribute *bdi_dev_attrs[] = {
        &dev_attr_read_ahead_kb.attr,
        &dev_attr_min_ratio.attr,
        &dev_attr_min_ratio_fine.attr,
        &dev_attr_max_ratio.attr,
        &dev_attr_max_ratio_fine.attr,
        &dev_attr_min_bytes.attr,
        &dev_attr_max_bytes.attr,
        &dev_attr_stable_pages_required.attr,
        &dev_attr_strict_limit.attr,
        NULL,
};
ATTRIBUTE_GROUPS(bdi_dev);

/*
 * Device class "bdi".  bdi_register_va() creates one device of this
 * class per registered bdi; each such device carries the bdi_dev
 * attribute groups.
 */
static const struct class bdi_class = {
        .name                = "bdi",
        .dev_groups        = bdi_dev_groups,
};

/*
 * postcore initcall: register the "bdi" device class and, if that
 * succeeded, run bdi_debug_init().
 *
 * Returns 0 on success or the error from class_register().
 */
static __init int bdi_class_init(void)
{
        int err = class_register(&bdi_class);

        if (!err)
                bdi_debug_init();

        return err;
}
postcore_initcall(bdi_class_init);

/*
 * subsys initcall: allocate bdi_wq, the "writeback" workqueue.
 *
 * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
 */
static int __init default_bdi_init(void)
{
        bdi_wq = alloc_workqueue("writeback",
                                 WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0);

        return bdi_wq ? 0 : -ENOMEM;
}
subsys_initcall(default_bdi_init);

/*
 * Work function of wb->bw_dwork: recover the owning bdi_writeback from
 * the embedded delayed work and run wb_update_bandwidth() on it.
 */
static void wb_update_bandwidth_workfn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct bdi_writeback *wb;

        wb = container_of(dwork, struct bdi_writeback, bw_dwork);
        wb_update_bandwidth(wb);
}

/*
 * Initial write bandwidth: 100 MB/s
 *
 * The value is expressed in pages: 100 << 20 bytes shifted right by
 * PAGE_SHIFT.  wb_init() uses it to seed both the bandwidth and the
 * dirty ratelimit fields of a new wb.
 */
#define INIT_BW                (100 << (20 - PAGE_SHIFT))

/*
 * Initialise @wb as a writeback context belonging to @bdi.
 *
 * @wb:  structure to initialise; its previous contents are discarded
 * @bdi: backing device the context belongs to
 * @gfp: allocation mask for the per-cpu state
 *
 * Returns 0 on success or the error from fprop_local_init_percpu() /
 * percpu_counter_init_many().  On failure no per-cpu state is left
 * allocated.
 */
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
                   gfp_t gfp)
{
        int err;

        /* Start from all-zero; everything not set below stays 0/NULL. */
        memset(wb, 0, sizeof(*wb));
        wb->bdi = bdi;

        /* Inode lists and list_lock. */
        INIT_LIST_HEAD(&wb->b_dirty);
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        INIT_LIST_HEAD(&wb->b_dirty_time);
        spin_lock_init(&wb->list_lock);
        atomic_set(&wb->writeback_inodes, 0);

        /* Timestamps start at "now". */
        wb->last_old_flush = jiffies;
        wb->bw_time_stamp = jiffies;

        /* Every rate estimate starts out at INIT_BW. */
        wb->write_bandwidth = INIT_BW;
        wb->avg_write_bandwidth = INIT_BW;
        wb->dirty_ratelimit = INIT_BW;
        wb->balanced_dirty_ratelimit = INIT_BW;

        /* Work list plus the two delayed work items. */
        spin_lock_init(&wb->work_lock);
        INIT_LIST_HEAD(&wb->work_list);
        INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
        INIT_DELAYED_WORK(&wb->bw_dwork, wb_update_bandwidth_workfn);

        /* Per-cpu state last, so the error paths only have it to undo. */
        err = fprop_local_init_percpu(&wb->completions, gfp);
        if (err)
                return err;

        err = percpu_counter_init_many(wb->stat, 0, gfp, NR_WB_STAT_ITEMS);
        if (!err)
                return 0;

        fprop_local_destroy_percpu(&wb->completions);
        return err;
}

static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb);

/*
 * Unregister @wb and quiesce its work items.
 *
 * Clears WB_registered under wb->work_lock; if the bit was already clear
 * the function returns without doing anything else.  Otherwise @wb is
 * unlinked from its bdi's wb_list and both delayed work items are flushed.
 */
static void wb_shutdown(struct bdi_writeback *wb)
{
        /* Make sure nobody queues further work */
        spin_lock_irq(&wb->work_lock);
        if (!test_and_clear_bit(WB_registered, &wb->state)) {
                spin_unlock_irq(&wb->work_lock);
                return;
        }
        spin_unlock_irq(&wb->work_lock);

        cgwb_remove_from_bdi_list(wb);
        /*
         * Drain work list and shutdown the delayed_work.  !WB_registered
         * tells wb_workfn() that @wb is dying and its work_list needs to
         * be drained no matter what.
         */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
        flush_delayed_work(&wb->dwork);
        /* After the flush above the work list is expected to be empty. */
        WARN_ON(!list_empty(&wb->work_list));
        flush_delayed_work(&wb->bw_dwork);
}

/*
 * Release the per-cpu state that wb_init() allocated for @wb (the stat
 * counters and the completions fraction).  Warns if wb->dwork is still
 * pending at this point.
 */
static void wb_exit(struct bdi_writeback *wb)
{
        WARN_ON(delayed_work_pending(&wb->dwork));
        percpu_counter_destroy_many(wb->stat, NR_WB_STAT_ITEMS);
        fprop_local_destroy_percpu(&wb->completions);
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/memcontrol.h>

/*
 * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, offline_cgwbs and
 * memcg->cgwb_list.  bdi->cgwb_tree is also RCU protected.
 */
static DEFINE_SPINLOCK(cgwb_lock);
/* Workqueue on which cgwb_release() queues release work; set up in cgwb_init(). */
static struct workqueue_struct *cgwb_release_wq;

/*
 * wbs that cgwb_kill() has taken offline; an entry is removed again in
 * cgwb_release_workfn().
 */
static LIST_HEAD(offline_cgwbs);
static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
/* Queued from wb_memcg_offline() to try to release wbs on offline_cgwbs. */
static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);

/*
 * RCU callback scheduled by cgwb_release_workfn(): tear down the wb's
 * percpu refcount and free the wb itself.
 */
static void cgwb_free_rcu(struct rcu_head *rcu_head)
{
        struct bdi_writeback *wb = container_of(rcu_head,
                        struct bdi_writeback, rcu);

        percpu_ref_exit(&wb->refcnt);
        kfree(wb);
}

/*
 * Work function queued by cgwb_release() once a cgroup wb's refcount has
 * been released: shuts the wb down, drops the references and pins taken
 * in cgwb_create(), unlinks it from offline_cgwbs and frees it after an
 * RCU grace period.
 */
static void cgwb_release_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
                                                release_work);
        /* Cached so the bdi reference can be dropped via a local below. */
        struct backing_dev_info *bdi = wb->bdi;

        /* cgwb_bdi_unregister() holds the same mutex around its wb_shutdown() loop. */
        mutex_lock(&wb->bdi->cgwb_release_mutex);
        wb_shutdown(wb);

        css_put(wb->memcg_css);
        css_put(wb->blkcg_css);
        mutex_unlock(&wb->bdi->cgwb_release_mutex);

        /* triggers blkg destruction if no online users left */
        blkcg_unpin_online(wb->blkcg_css);

        fprop_local_destroy_percpu(&wb->memcg_completions);

        /* The wb was put on offline_cgwbs by cgwb_kill(). */
        spin_lock_irq(&cgwb_lock);
        list_del(&wb->offline_node);
        spin_unlock_irq(&cgwb_lock);

        wb_exit(wb);
        /* Pairs with the bdi_get() in cgwb_create(). */
        bdi_put(bdi);
        WARN_ON_ONCE(!list_empty(&wb->b_attached));
        /* Freeing is deferred to cgwb_free_rcu(). */
        call_rcu(&wb->rcu, cgwb_free_rcu);
}

/*
 * percpu_ref release callback registered in cgwb_create().  It only
 * queues wb->release_work on cgwb_release_wq; the actual teardown is
 * done by cgwb_release_workfn().
 */
static void cgwb_release(struct percpu_ref *refcnt)
{
        struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
                                                refcnt);
        queue_work(cgwb_release_wq, &wb->release_work);
}

/*
 * Take the cgroup wb @wb offline.  The caller must hold cgwb_lock.
 *
 * Removes @wb from its bdi's cgwb_tree and from the memcg and blkcg
 * lists, adds it to offline_cgwbs and kills its percpu refcount.
 */
static void cgwb_kill(struct bdi_writeback *wb)
{
        lockdep_assert_held(&cgwb_lock);

        /* The wb is keyed by its memcg css id; it must be present in the tree. */
        WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
        list_del(&wb->memcg_node);
        list_del(&wb->blkcg_node);
        list_add(&wb->offline_node, &offline_cgwbs);
        percpu_ref_kill(&wb->refcnt);
}

/*
 * Unlink @wb from its bdi's wb_list.  The removal is done with
 * list_del_rcu() under cgwb_lock; the list is walked under RCU elsewhere
 * in this file.
 */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
        spin_lock_irq(&cgwb_lock);
        list_del_rcu(&wb->bdi_node);
        spin_unlock_irq(&cgwb_lock);
}

/*
 * Make sure a cgroup wb for @memcg_css exists on @bdi.
 *
 * @bdi:       target bdi
 * @memcg_css: css of the memcg the wb is keyed by
 * @gfp:       allocation mask
 *
 * An existing wb whose blkcg no longer matches is killed and replaced.
 * The wb itself is not returned; wb_get_create() looks it up again
 * after a 0 return.
 *
 * Returns 0 if a matching wb already exists, was inserted here, or was
 * inserted concurrently by someone else (-EEXIST from the tree insert).
 * Returns -ENOMEM if the wb cannot be allocated, -ENODEV if @bdi's root
 * wb is not registered or either cgroup list has been marked offline, or
 * another error from one of the init helpers or radix_tree_insert().
 */
static int cgwb_create(struct backing_dev_info *bdi,
                       struct cgroup_subsys_state *memcg_css, gfp_t gfp)
{
        struct mem_cgroup *memcg;
        struct cgroup_subsys_state *blkcg_css;
        struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
        struct bdi_writeback *wb;
        unsigned long flags;
        int ret = 0;

        memcg = mem_cgroup_from_css(memcg_css);
        /* The blkcg css obtained here is put again at out_put. */
        blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
        memcg_cgwb_list = &memcg->cgwb_list;
        blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);

        /* look up again under lock and discard on blkcg mismatch */
        spin_lock_irqsave(&cgwb_lock, flags);
        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
        if (wb && wb->blkcg_css != blkcg_css) {
                cgwb_kill(wb);
                wb = NULL;
        }
        spin_unlock_irqrestore(&cgwb_lock, flags);
        /* A matching wb already exists: nothing to do, ret is still 0. */
        if (wb)
                goto out_put;

        /* need to create a new one */
        wb = kmalloc(sizeof(*wb), gfp);
        if (!wb) {
                ret = -ENOMEM;
                goto out_put;
        }

        ret = wb_init(wb, bdi, gfp);
        if (ret)
                goto err_free;

        /* cgwb_release() runs once the refcount is released. */
        ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
        if (ret)
                goto err_wb_exit;

        ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
        if (ret)
                goto err_ref_exit;

        wb->memcg_css = memcg_css;
        wb->blkcg_css = blkcg_css;
        INIT_LIST_HEAD(&wb->b_attached);
        INIT_WORK(&wb->release_work, cgwb_release_workfn);
        set_bit(WB_registered, &wb->state);
        /* Dropped by cgwb_release_workfn(), or at err_fprop_exit on failure. */
        bdi_get(bdi);

        /*
         * The root wb determines the registered state of the whole bdi and
         * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
         * whether they're still online.  Don't link @wb if any is dead.
         * See wb_memcg_offline() and wb_blkcg_offline().
         */
        ret = -ENODEV;
        spin_lock_irqsave(&cgwb_lock, flags);
        if (test_bit(WB_registered, &bdi->wb.state) &&
            blkcg_cgwb_list->next && memcg_cgwb_list->next) {
                /* we might have raced another instance of this function */
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        blkcg_pin_online(blkcg_css);
                        /* These references are put in cgwb_release_workfn(). */
                        css_get(memcg_css);
                        css_get(blkcg_css);
                }
        }
        spin_unlock_irqrestore(&cgwb_lock, flags);
        if (ret) {
                /* Losing the insertion race is not an error for the caller. */
                if (ret == -EEXIST)
                        ret = 0;
                goto err_fprop_exit;
        }
        goto out_put;

        /* Unwind in reverse order of setup. */
err_fprop_exit:
        bdi_put(bdi);
        fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
        percpu_ref_exit(&wb->refcnt);
err_wb_exit:
        wb_exit(wb);
err_free:
        kfree(wb);
out_put:
        css_put(blkcg_css);
        return ret;
}

/**
 * wb_get_lookup - get wb for a given memcg
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 *
 * Try to get the wb for @memcg_css on @bdi.  The returned wb has its
 * refcount incremented.
 *
 * This function uses css_get() on @memcg_css and thus expects its refcnt
 * to be positive on invocation.  IOW, rcu_read_lock() protection on
 * @memcg_css isn't enough.  try_get it before calling this function.
 *
 * A wb is keyed by its associated memcg.  As blkcg implicitly enables
 * memcg on the default hierarchy, memcg association is guaranteed to be
 * more specific (equal or descendant to the associated blkcg) and thus can
 * identify both the memcg and blkcg associations.
 *
 * Because the blkcg associated with a memcg may change as blkcg is enabled
 * and disabled closer to root in the hierarchy, each wb keeps track of
 * both the memcg and blkcg associated with it and verifies the blkcg on
 * each lookup.  On mismatch, the existing wb is discarded and a new one is
 * created.
 *
 * Return: &bdi->wb when @memcg_css has no parent, the matching cgroup wb
 * when one is found and wb_tryget() succeeds, otherwise %NULL.
 */
struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css)
{
        struct bdi_writeback *wb;

        /* A css without a parent maps to the bdi's embedded wb. */
        if (!memcg_css->parent)
                return &bdi->wb;

        rcu_read_lock();
        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
        if (wb) {
                struct cgroup_subsys_state *blkcg_css;

                /* see whether the blkcg association has changed */
                blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
                /* wb_tryget() is only attempted when the blkcg still matches. */
                if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
                        wb = NULL;
                css_put(blkcg_css);
        }
        rcu_read_unlock();

        return wb;
}

/**
 * wb_get_create - get wb for a given memcg, create if necessary
 * @bdi: target bdi
 * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
 * @gfp: allocation mask to use
 *
 * Look up the wb for @memcg_css on @bdi and, when there is none, ask
 * cgwb_create() to make one and look again.  See wb_get_lookup() for
 * more details.
 *
 * Return: the wb, or %NULL once cgwb_create() reports an error.
 */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp)
{
        struct bdi_writeback *wb;

        might_alloc(gfp);

        for (;;) {
                wb = wb_get_lookup(bdi, memcg_css);
                if (wb)
                        break;

                /* Creation failed: give up and return NULL. */
                if (cgwb_create(bdi, memcg_css, gfp))
                        break;
        }

        return wb;
}

/*
 * Set up the cgroup-writeback parts of @bdi: the cgwb radix tree, the
 * release mutex, the switch rwsem and the embedded root wb, which is
 * attached to the root memcg and root blkcg css.
 *
 * Returns 0 on success or the error from wb_init().
 */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
        int err;

        INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
        mutex_init(&bdi->cgwb_release_mutex);
        init_rwsem(&bdi->wb_switch_rwsem);

        err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
        if (err)
                return err;

        bdi->wb.memcg_css = &root_mem_cgroup->css;
        bdi->wb.blkcg_css = blkcg_root_css;
        return 0;
}

/*
 * Tear down all cgroup wbs of @bdi while the bdi is being unregistered.
 *
 * Warns if the root wb is still marked WB_registered.  Every wb in
 * bdi->cgwb_tree is killed first; afterwards every wb still linked on
 * bdi->wb_list is shut down with cgwb_release_mutex held.
 */
static void cgwb_bdi_unregister(struct backing_dev_info *bdi)
{
        struct radix_tree_iter iter;
        void **slot;
        struct bdi_writeback *wb;

        WARN_ON(test_bit(WB_registered, &bdi->wb.state));

        spin_lock_irq(&cgwb_lock);
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
        spin_unlock_irq(&cgwb_lock);

        /* cgwb_release_workfn() takes the same mutex around wb_shutdown(). */
        mutex_lock(&bdi->cgwb_release_mutex);
        spin_lock_irq(&cgwb_lock);
        while (!list_empty(&bdi->wb_list)) {
                wb = list_first_entry(&bdi->wb_list, struct bdi_writeback,
                                      bdi_node);
                /*
                 * cgwb_lock is dropped around wb_shutdown(), which flushes
                 * work and takes cgwb_lock itself to unlink the wb from
                 * wb_list.
                 */
                spin_unlock_irq(&cgwb_lock);
                wb_shutdown(wb);
                spin_lock_irq(&cgwb_lock);
        }
        spin_unlock_irq(&cgwb_lock);
        mutex_unlock(&bdi->cgwb_release_mutex);
}

/*
 * cleanup_offline_cgwbs_workfn - try to release dying cgwbs
 *
 * Try to release dying cgwbs by switching attached inodes to the nearest
 * living ancestor's writeback. Processed wbs are placed at the end
 * of the list to guarantee the forward progress.
 */
static void cleanup_offline_cgwbs_workfn(struct work_struct *work)
{
        struct bdi_writeback *wb;
        /* Entries already visited in this run; spliced back at the end. */
        LIST_HEAD(processed);

        spin_lock_irq(&cgwb_lock);

        while (!list_empty(&offline_cgwbs)) {
                wb = list_first_entry(&offline_cgwbs, struct bdi_writeback,
                                      offline_node);
                /* Move first, so each entry is visited once even if skipped. */
                list_move(&wb->offline_node, &processed);

                /*
                 * If wb is dirty, cleaning up the writeback by switching
                 * attached inodes will result in an effective removal of any
                 * bandwidth restrictions, which isn't the goal.  Instead,
                 * it can be postponed until the next time, when all io
                 * will be likely completed.  If in the meantime some inodes
                 * will get re-dirtied, they should be eventually switched to
                 * a new cgwb.
                 */
                if (wb_has_dirty_io(wb))
                        continue;

                /* Hold a reference while cgwb_lock is dropped below. */
                if (!wb_tryget(wb))
                        continue;

                spin_unlock_irq(&cgwb_lock);
                /* Repeat for as long as cleanup_offline_cgwb() returns true. */
                while (cleanup_offline_cgwb(wb))
                        cond_resched();
                spin_lock_irq(&cgwb_lock);

                wb_put(wb);
        }

        if (!list_empty(&processed))
                list_splice_tail(&processed, &offline_cgwbs);

        spin_unlock_irq(&cgwb_lock);
}

/**
 * wb_memcg_offline - kill all wb's associated with a memcg being offlined
 * @memcg: memcg being offlined
 *
 * Also prevents creation of any new wb's associated with @memcg, and
 * queues cleanup_offline_cgwbs_work on system_unbound_wq.
 */
void wb_memcg_offline(struct mem_cgroup *memcg)
{
        struct list_head *memcg_cgwb_list = &memcg->cgwb_list;
        struct bdi_writeback *wb, *next;

        spin_lock_irq(&cgwb_lock);
        /* The _safe iterator is needed: cgwb_kill() unlinks memcg_node. */
        list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
                cgwb_kill(wb);
        /* cgwb_create() refuses to link a new wb once ->next is NULL. */
        memcg_cgwb_list->next = NULL;        /* prevent new wb's */
        spin_unlock_irq(&cgwb_lock);

        queue_work(system_unbound_wq, &cleanup_offline_cgwbs_work);
}

/**
 * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
 * @css: css of the blkcg being offlined
 *
 * Also prevents creation of any new wb's associated with @css.
 */
void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
        struct bdi_writeback *wb, *next;
        struct list_head *list = blkcg_get_cgwb_list(css);

        spin_lock_irq(&cgwb_lock);
        /* The _safe iterator is needed: cgwb_kill() unlinks blkcg_node. */
        list_for_each_entry_safe(wb, next, list, blkcg_node)
                cgwb_kill(wb);
        /* cgwb_create() refuses to link a new wb once ->next is NULL. */
        list->next = NULL;        /* prevent new wb's */
        spin_unlock_irq(&cgwb_lock);
}

/*
 * Link @bdi's embedded root wb onto bdi->wb_list.  The insertion is done
 * with list_add_tail_rcu() under cgwb_lock.
 */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
        spin_lock_irq(&cgwb_lock);
        list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
        spin_unlock_irq(&cgwb_lock);
}

/*
 * subsys initcall: allocate the workqueue used for cgroup wb release work.
 *
 * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
 */
static int __init cgwb_init(void)
{
        /*
         * Release work items can arrive in large bursts and would
         * overwhelm system_wq.  They get a queue of their own with
         * concurrency limited to 1, since running many of them in
         * parallel gains nothing.
         */
        cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1);

        return cgwb_release_wq ? 0 : -ENOMEM;
}
subsys_initcall(cgwb_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * !CONFIG_CGROUP_WRITEBACK variants: the bdi only has its embedded wb,
 * so initialisation reduces to wb_init() on it.
 */
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
        return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}

/* No cgroup wbs exist in this configuration; nothing to tear down. */
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }

/* Link the embedded wb onto bdi->wb_list (no cgwb_lock in this build). */
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
        list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
}

/* Unlink @wb from its bdi's wb_list (no cgwb_lock in this build). */
static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb)
{
        list_del_rcu(&wb->bdi_node);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Initialise the generic fields of @bdi and then its writeback state
 * through cgwb_bdi_init().
 *
 * The bdi starts with no device, a fresh reference count, a min ratio
 * of 0 and a max ratio of 100 * BDI_RATIO_SCALE.
 *
 * Returns the result of cgwb_bdi_init().
 */
int bdi_init(struct backing_dev_info *bdi)
{
        /* Identity and lifetime. */
        kref_init(&bdi->refcnt);
        bdi->dev = NULL;

        /* Dirty-limit tuning defaults. */
        bdi->min_ratio = 0;
        bdi->max_ratio = 100 * BDI_RATIO_SCALE;
        bdi->max_prop_frac = FPROP_FRAC_BASE;

        /* List linkage and wait queue. */
        INIT_LIST_HEAD(&bdi->bdi_list);
        INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);

        bdi->last_bdp_sleep = jiffies;

        return cgwb_bdi_init(bdi);
}

/*
 * Allocate a zeroed backing_dev_info on NUMA node @node_id, initialise it
 * with bdi_init() and apply the default capabilities, readahead/io sizes
 * and laptop-mode timer.
 *
 * Returns the new bdi, or NULL if allocation or bdi_init() fails.
 */
struct backing_dev_info *bdi_alloc(int node_id)
{
        struct backing_dev_info *bdi;

        bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
        if (!bdi)
                goto fail;

        if (bdi_init(bdi))
                goto fail_free;

        bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT;
        bdi->ra_pages = VM_READAHEAD_PAGES;
        bdi->io_pages = VM_READAHEAD_PAGES;
        timer_setup(&bdi->laptop_mode_wb_timer, laptop_mode_timer_fn, 0);
        return bdi;

fail_free:
        kfree(bdi);
fail:
        return NULL;
}
EXPORT_SYMBOL(bdi_alloc);

/*
 * Find the bdi_tree link for @id.  The caller must hold bdi_lock.
 *
 * @id:      bdi id to search for
 * @parentp: if non-NULL, receives the last node visited (the matching
 *           node itself on a hit, NULL for an empty tree)
 *
 * Returns the link that points at the node with @id, or, when no such
 * node exists, the empty link where a node with @id would be inserted.
 */
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
        struct rb_node **link = &bdi_tree.rb_node;
        struct rb_node *parent = NULL;

        lockdep_assert_held(&bdi_lock);

        for (;;) {
                struct rb_node *node = *link;
                struct backing_dev_info *cur;

                if (!node)
                        break;

                parent = node;
                cur = rb_entry(node, struct backing_dev_info, rb_node);
                if (cur->id == id)
                        break;

                /* Larger ids live to the right, smaller ones to the left. */
                if (cur->id > id)
                        link = &node->rb_left;
                else
                        link = &node->rb_right;
        }

        if (parentp)
                *parentp = parent;
        return link;
}

/**
 * bdi_get_by_id - lookup and get bdi from its id
 * @id: bdi id to lookup
 *
 * Search bdi_tree for @id under bdi_lock and take a reference on the bdi
 * that is found.  Returns NULL if the matching bdi doesn't exist or is
 * already unregistered.
 */
struct backing_dev_info *bdi_get_by_id(u64 id)
{
        struct backing_dev_info *found = NULL;
        struct rb_node *node;

        spin_lock_bh(&bdi_lock);
        node = *bdi_lookup_rb_node(id, NULL);
        if (node) {
                found = rb_entry(node, struct backing_dev_info, rb_node);
                bdi_get(found);
        }
        spin_unlock_bh(&bdi_lock);

        return found;
}

/*
 * Register @bdi: create its "bdi" class device, named by formatting
 * @fmt/@args, and make the bdi visible in debugfs, bdi_tree and bdi_list.
 *
 * A bdi that already has a device is left untouched and 0 is returned.
 *
 * Returns 0 on success or the error from device_create().
 */
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
        struct device *dev;
        struct rb_node *parent, **p;

        if (bdi->dev)        /* The driver needs to use separate queues per device */
                return 0;

        vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
        /* The bdi becomes the device's drvdata, as read by the sysfs handlers. */
        dev = device_create(&bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        cgwb_bdi_register(bdi);
        bdi->dev = dev;

        bdi_debug_register(bdi, dev_name(dev));
        set_bit(WB_registered, &bdi->wb.state);

        spin_lock_bh(&bdi_lock);

        /* Ids are handed out from a monotonically incremented cursor. */
        bdi->id = ++bdi_id_cursor;

        p = bdi_lookup_rb_node(bdi->id, &parent);
        rb_link_node(&bdi->rb_node, parent, p);
        rb_insert_color(&bdi->rb_node, &bdi_tree);

        list_add_tail_rcu(&bdi->bdi_list, &bdi_list);

        spin_unlock_bh(&bdi_lock);

        trace_writeback_bdi_register(bdi);
        return 0;
}

/*
 * printf-style front end to bdi_register_va(): packs the variable
 * arguments into a va_list and returns bdi_register_va()'s result.
 */
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
        va_list args;
        int ret;

        va_start(args, fmt);
        ret = bdi_register_va(bdi, fmt, args);
        va_end(args);
        return ret;
}
EXPORT_SYMBOL(bdi_register);

/*
 * Record @owner as the owning device of @bdi and take a device reference
 * on it; bdi_unregister() drops that reference.  Warns once if an owner
 * was already set.
 */
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
        WARN_ON_ONCE(bdi->owner);
        bdi->owner = owner;
        get_device(owner);
}

/*
 * Remove bdi from bdi_list, and ensure that it is no longer visible
 *
 * The bdi is also erased from bdi_tree under the same bdi_lock section.
 * The function then waits for an expedited RCU grace period before
 * returning.
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
        spin_lock_bh(&bdi_lock);
        rb_erase(&bdi->rb_node, &bdi_tree);
        list_del_rcu(&bdi->bdi_list);
        spin_unlock_bh(&bdi_lock);

        synchronize_rcu_expedited();
}

/*
 * Undo bdi_register(): stop the laptop-mode timer, hide @bdi from lookups,
 * shut down the root wb and all cgroup wbs, reset a non-zero min ratio,
 * and release the class device and the owner reference if present.
 */
void bdi_unregister(struct backing_dev_info *bdi)
{
        timer_delete_sync(&bdi->laptop_mode_wb_timer);

        /* make sure nobody finds us on the bdi_list anymore */
        bdi_remove_from_list(bdi);
        /*
         * The root wb goes first: cgwb_bdi_unregister() warns if it is
         * still marked WB_registered.
         */
        wb_shutdown(&bdi->wb);
        cgwb_bdi_unregister(bdi);

        /*
         * If this BDI's min ratio has been set, use bdi_set_min_ratio() to
         * update the global bdi_min_ratio.
         */
        if (bdi->min_ratio)
                bdi_set_min_ratio(bdi, 0);

        if (bdi->dev) {
                bdi_debug_unregister(bdi);
                device_unregister(bdi->dev);
                bdi->dev = NULL;
        }

        /* Pairs with the get_device() in bdi_set_owner(). */
        if (bdi->owner) {
                put_device(bdi->owner);
                bdi->owner = NULL;
        }
}
EXPORT_SYMBOL(bdi_unregister);

/*
 * kref release callback passed to kref_put() by bdi_put(): frees the bdi
 * once its last reference is gone.  Warns once if the root wb is still
 * registered or the class device is still attached.
 */
static void release_bdi(struct kref *ref)
{
        struct backing_dev_info *bdi =
                        container_of(ref, struct backing_dev_info, refcnt);

        WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
        WARN_ON_ONCE(bdi->dev);
        wb_exit(&bdi->wb);
        kfree(bdi);
}

/*
 * Drop one reference on @bdi; release_bdi() runs when the count reaches
 * zero.
 */
void bdi_put(struct backing_dev_info *bdi)
{
        kref_put(&bdi->refcnt, release_bdi);
}
EXPORT_SYMBOL(bdi_put);

/*
 * Return the backing_dev_info that @inode belongs to.
 *
 * A NULL @inode maps to noop_backing_dev_info.  With CONFIG_BLOCK, an
 * inode on the block device superblock maps to the bdi of its disk.
 * Every other inode uses its superblock's s_bdi.
 */
struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
        if (!inode)
                return &noop_backing_dev_info;

#ifdef CONFIG_BLOCK
        if (sb_is_blkdev_sb(inode->i_sb))
                return I_BDEV(inode)->bd_disk->bdi;
#endif
        return inode->i_sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

/*
 * Return the device name stored in @bdi, or bdi_unknown_name when @bdi is
 * NULL or has no device attached.
 */
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
        if (bdi && bdi->dev)
                return bdi->dev_name;

        return bdi_unknown_name;
}
EXPORT_SYMBOL_GPL(bdi_dev_name);



















































































































































































































































































































































































































































































































































































































































































    3 





    3 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <net/flow_offload.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_offload.h>
#include <net/pkt_cls.h>

/*
 * Allocate a zero-initialised nft_flow_rule plus a flow_rule requested with
 * @num_actions actions, and point the flow_rule's match at the dissector,
 * mask and key storage embedded in the nft_flow_rule.
 *
 * Returns the new rule, or NULL if either allocation fails.
 */
static struct nft_flow_rule *nft_flow_rule_alloc(int num_actions)
{
        struct nft_flow_rule *nft_rule;

        nft_rule = kzalloc(sizeof(*nft_rule), GFP_KERNEL);
        if (!nft_rule)
                return NULL;

        nft_rule->rule = flow_rule_alloc(num_actions);
        if (nft_rule->rule) {
                /* The flow_rule only references the match data; it lives in nft_rule. */
                nft_rule->rule->match.dissector = &nft_rule->match.dissector;
                nft_rule->rule->match.mask = &nft_rule->match.mask;
                nft_rule->rule->match.key = &nft_rule->match.key;

                return nft_rule;
        }

        kfree(nft_rule);
        return NULL;
}

/*
 * Set the control key of @flow to match @addr_type with a full 0xffff
 * mask and register FLOW_DISSECTOR_KEY_CONTROL in the dissector.
 *
 * Does nothing if the control key is already marked as used.
 */
void nft_flow_rule_set_addr_type(struct nft_flow_rule *flow,
                                 enum flow_dissector_key_id addr_type)
{
        struct nft_flow_match *m = &flow->match;

        if (m->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL))
                return;

        m->dissector.offset[FLOW_DISSECTOR_KEY_CONTROL] =
                offsetof(struct nft_flow_key, control);
        m->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CONTROL);

        m->key.control.addr_type = addr_type;
        m->mask.control.addr_type = 0xffff;
}

/*
 * Ethertype value/mask pair. nft_flow_rule_transfer_vlan() uses it to hold
 * the basic.n_proto key and mask while the protocol fields are shuffled.
 */
struct nft_offload_ethertype {
        __be16 value;
        __be16 mask;
};

/*
 * Rearrange the ethertype/TPID fields of @flow's match when VLAN
 * ethertypes (ETH_P_8021Q or ETH_P_8021AD) are involved.
 *
 * The basic.n_proto key and mask are saved first, then one of two
 * rotations is applied:
 *
 *  - VLAN key in use and vlan.vlan_tpid is a VLAN ethertype:
 *      basic.n_proto   <- cvlan.vlan_tpid
 *      cvlan.vlan_tpid <- vlan.vlan_tpid
 *      vlan.vlan_tpid  <- saved basic.n_proto
 *    and the CVLAN key is registered in the dissector.
 *
 *  - otherwise, BASIC key in use and basic.n_proto is a VLAN ethertype:
 *      basic.n_proto   <- vlan.vlan_tpid
 *      vlan.vlan_tpid  <- saved basic.n_proto
 *    and the VLAN key is registered in the dissector.
 *
 * In every other case the match is left untouched. @ctx is not used.
 */
static void nft_flow_rule_transfer_vlan(struct nft_offload_ctx *ctx,
                                        struct nft_flow_rule *flow)
{
        struct nft_flow_match *match = &flow->match;
        /* Snapshot taken before any field below is overwritten. */
        struct nft_offload_ethertype ethertype = {
                .value        = match->key.basic.n_proto,
                .mask        = match->mask.basic.n_proto,
        };

        if (match->dissector.used_keys & BIT_ULL(FLOW_DISSECTOR_KEY_VLAN) &&
            (match->key.vlan.vlan_tpid == htons(ETH_P_8021Q) ||
             match->key.vlan.vlan_tpid == htons(ETH_P_8021AD))) {
                /* Three-way rotation: cvlan -> basic, vlan -> cvlan, saved basic -> vlan. */
                match->key.basic.n_proto = match->key.cvlan.vlan_tpid;
                match->mask.basic.n_proto = match->mask.cvlan.vlan_tpid;
                match->key.cvlan.vlan_tpid = match->key.vlan.vlan_tpid;
                match->mask.cvlan.vlan_tpid = match->mask.vlan.vlan_tpid;
                match->key.vlan.vlan_tpid = ethertype.value;
                match->mask.vlan.vlan_tpid = ethertype.mask;
                match->dissector.offset[FLOW_DISSECTOR_KEY_CVLAN] =
                        offsetof(struct nft_flow_key, cvlan);
                match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_CVLAN);
        } else if (match->dissector.used_keys &
                   BIT_ULL(FLOW_DISSECTOR_KEY_BASIC) &&
                   (match->key.basic.n_proto == htons(ETH_P_8021Q) ||
                    match->key.basic.n_proto == htons(ETH_P_8021AD))) {
                /* Two-way swap: vlan -> basic, saved basic -> vlan. */
                match->key.basic.n_proto = match->key.vlan.vlan_tpid;
                match->mask.basic.n_proto = match->mask.vlan.vlan_tpid;
                match->key.vlan.vlan_tpid = ethertype.value;
                match->mask.vlan.vlan_tpid = ethertype.mask;
                match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
                        offsetof(struct nft_flow_key, vlan);
                match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_VLAN);
        }
}

/*
 * Build the offload representation of @rule.
 *
 * A first pass over the rule's expressions counts those whose
 * ->offload_action() hook exists and returns true; that count sizes the
 * flow rule. A second pass calls every expression's ->offload() hook with
 * a temporary offload context, after which the VLAN fields are fixed up
 * and the context's l3num is stored as the flow's protocol.
 *
 * Returns the new flow rule, or an ERR_PTR():
 *   -EOPNOTSUPP  no expression reported an action, or an expression has
 *                no ->offload() hook
 *   -ENOMEM      an allocation failed
 *   <other>      negative value returned by an ->offload() hook
 */
struct nft_flow_rule *nft_flow_rule_create(struct net *net,
                                           const struct nft_rule *rule)
{
        struct nft_offload_ctx *ctx;
        struct nft_flow_rule *flow;
        struct nft_expr *expr;
        int num_actions = 0;
        int err;

        /* Pass 1: count the actions so the flow rule can be sized. */
        for (expr = nft_expr_first(rule); nft_expr_more(rule, expr);
             expr = nft_expr_next(expr)) {
                if (!expr->ops->offload_action)
                        continue;
                if (expr->ops->offload_action(expr))
                        num_actions++;
        }

        if (!num_actions)
                return ERR_PTR(-EOPNOTSUPP);

        flow = nft_flow_rule_alloc(num_actions);
        if (!flow)
                return ERR_PTR(-ENOMEM);

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx) {
                err = -ENOMEM;
                goto err_out;
        }
        ctx->net = net;
        ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;

        /* Pass 2: every expression must be able to offload itself. */
        for (expr = nft_expr_first(rule); nft_expr_more(rule, expr);
             expr = nft_expr_next(expr)) {
                if (!expr->ops->offload) {
                        err = -EOPNOTSUPP;
                        goto err_out;
                }

                err = expr->ops->offload(ctx, flow, expr);
                if (err < 0)
                        goto err_out;
        }

        nft_flow_rule_transfer_vlan(ctx, flow);
        flow->proto = ctx->dep.l3num;

        kfree(ctx);
        return flow;

err_out:
        /* ctx is NULL here when its own allocation was what failed. */
        kfree(ctx);
        nft_flow_rule_destroy(flow);

        return ERR_PTR(err);
}

/*
 * Free @flow and the flow_rule it owns.
 *
 * Before freeing, dev_put() is called on the device of every
 * FLOW_ACTION_REDIRECT and FLOW_ACTION_MIRRED action entry; other action
 * types need no per-entry cleanup here.
 */
void nft_flow_rule_destroy(struct nft_flow_rule *flow)
{
        struct flow_action_entry *act;
        int idx;

        flow_action_for_each(idx, act, &flow->rule->action) {
                if (act->id == FLOW_ACTION_REDIRECT ||
                    act->id == FLOW_ACTION_MIRRED)
                        dev_put(act->dev);
        }

        kfree(flow->rule);
        kfree(flow);
}

/*
 * Store @type as the pending dependency type in @ctx. It is consumed (and
 * reset to NFT_OFFLOAD_DEP_UNSPEC) by nft_offload_update_dependency().
 */
void nft_offload_set_dependency(struct nft_offload_ctx *ctx,
                                enum nft_offload_dep_type type)
{
        ctx->dep.type = type;
}

/*
 * Consume the pending dependency type of @ctx using @data.
 *
 * For NFT_OFFLOAD_DEP_NETWORK, sizeof(__u16) bytes of @data are copied
 * into dep.l3num. For NFT_OFFLOAD_DEP_TRANSPORT, sizeof(__u8) bytes are
 * copied into dep.protonum. A mismatching @len only triggers a warning;
 * the fixed-size copy is still performed. Any other type copies nothing.
 *
 * The dependency type is always reset to NFT_OFFLOAD_DEP_UNSPEC.
 */
void nft_offload_update_dependency(struct nft_offload_ctx *ctx,
                                   const void *data, u32 len)
{
        if (ctx->dep.type == NFT_OFFLOAD_DEP_NETWORK) {
                WARN_ON(len != sizeof(__u16));
                memcpy(&ctx->dep.l3num, data, sizeof(__u16));
        } else if (ctx->dep.type == NFT_OFFLOAD_DEP_TRANSPORT) {
                WARN_ON(len != sizeof(__u8));
                memcpy(&ctx->dep.protonum, data, sizeof(__u8));
        }

        ctx->dep.type = NFT_OFFLOAD_DEP_UNSPEC;
}

/*
 * Fill in the protocol, priority and extack fields of @common. No other
 * field of @common is touched.
 */
static void nft_flow_offload_common_init(struct flow_cls_common_offload *common,
                                         __be16 proto, int priority,
                                         struct netlink_ext_ack *extack)
{
        common->extack = extack;
        common->prio = priority;
        common->protocol = proto;
}

/*
 * Invoke every flow block callback on @cb_list with @type and @type_data.
 *
 * Stops at the first callback that returns a negative value and returns
 * it; callbacks later in the list are not invoked. Returns 0 otherwise.
 */
static int nft_setup_cb_call(enum tc_setup_type type, void *type_data,
                             struct list_head *cb_list)
{
        struct flow_block_cb *cb;

        list_for_each_entry(cb, cb_list, list) {
                int ret = cb->cb(type, type_data, cb->cb_priv);

                if (ret < 0)
                        return ret;
        }

        return 0;
}

/*
 * Check that the hook priority of @basechain lies in 1..USHRT_MAX.
 *
 * Returns 0 if it does, -1 otherwise.
 */
static int nft_chain_offload_priority(const struct nft_base_chain *basechain)
{
        if (basechain->ops.priority > 0 &&
            basechain->ops.priority <= USHRT_MAX)
                return 0;

        return -1;
}

/*
 * Report whether @basechain can be offloaded.
 *
 * The chain priority must pass nft_chain_offload_priority(), and every
 * hook on the chain must be a netdev-family ingress hook whose device
 * either implements ndo_setup_tc or, failing that, for which
 * flow_indr_dev_exists() reports true.
 */
bool nft_chain_offload_support(const struct nft_base_chain *basechain)
{
        struct nft_hook *hook;

        if (nft_chain_offload_priority(basechain) < 0)
                return false;

        list_for_each_entry(hook, &basechain->hook_list, list) {
                if (hook->ops.pf != NFPROTO_NETDEV)
                        return false;
                if (hook->ops.hooknum != NF_NETDEV_INGRESS)
                        return false;

                /* A device with its own ndo_setup_tc needs no further check. */
                if (hook->ops.dev->netdev_ops->ndo_setup_tc)
                        continue;

                if (!flow_indr_dev_exists())
                        return false;
        }

        return true;
}

/*
 * Prepare @cls_flow for a classifier offload request.
 *
 * The structure is zeroed, then filled with @command, a cookie derived
 * from the @rule pointer, and the common fields (protocol, the chain's
 * hook priority, @extack). When @flow is given, its protocol and flow_rule
 * are used; otherwise the protocol is ETH_P_ALL and the rule pointer stays
 * NULL.
 */
static void nft_flow_cls_offload_setup(struct flow_cls_offload *cls_flow,
                                       const struct nft_base_chain *basechain,
                                       const struct nft_rule *rule,
                                       const struct nft_flow_rule *flow,
                                       struct netlink_ext_ack *extack,
                                       enum flow_cls_command command)
{
        /*
         * NOTE(review): ETH_P_ALL is stored in a __be16 without htons();
         * confirm this is what the receiving callbacks expect.
         */
        __be16 proto = ETH_P_ALL;

        memset(cls_flow, 0, sizeof(*cls_flow));

        cls_flow->command = command;
        cls_flow->cookie = (unsigned long) rule;

        if (flow) {
                proto = flow->proto;
                cls_flow->rule = flow->rule;
        }

        nft_flow_offload_common_init(&cls_flow->common, proto,
                                     basechain->ops.priority, extack);
}

/*
 * Set up @cls_flow for @command on @rule and pass it to every callback
 * bound to the base chain's flow block as TC_SETUP_CLSFLOWER.
 *
 * Returns -EOPNOTSUPP if @chain is not a base chain, otherwise the result
 * of nft_setup_cb_call(). The extack handed to the callbacks is local to
 * this function.
 */
static int nft_flow_offload_cmd(const struct nft_chain *chain,
                                const struct nft_rule *rule,
                                struct nft_flow_rule *flow,
                                enum flow_cls_command command,
                                struct flow_cls_offload *cls_flow)
{
        struct netlink_ext_ack extack = {};
        struct nft_base_chain *base;

        if (!nft_is_base_chain(chain))
                return -EOPNOTSUPP;

        base = nft_base_chain(chain);

        nft_flow_cls_offload_setup(cls_flow, base, rule, flow, &extack,
                                   command);

        return nft_setup_cb_call(TC_SETUP_CLSFLOWER, cls_flow,
                                 &base->flow_block.cb_list);
}

/*
 * Issue @command for @rule on @chain using a throw-away flow_cls_offload.
 *
 * Thin wrapper around nft_flow_offload_cmd() for callers that do not need
 * to inspect the request structure afterwards. Returns that function's
 * result.
 */
static int nft_flow_offload_rule(const struct nft_chain *chain,
                                 struct nft_rule *rule,
                                 struct nft_flow_rule *flow,
                                 enum flow_cls_command command)
{
        /* Fully initialised by nft_flow_cls_offload_setup() further down. */
        struct flow_cls_offload cls_flow;

        return nft_flow_offload_cmd(chain, rule, flow, command, &cls_flow);
}

/*
 * Issue a FLOW_CLS_STATS request for @rule and hand the resulting stats
 * to each expression of the rule that provides an ->offload_stats() hook.
 *
 * Returns 0 on success, or the negative error from the stats request, in
 * which case no expression hook is called.
 */
int nft_flow_rule_stats(const struct nft_chain *chain,
                        const struct nft_rule *rule)
{
        struct flow_cls_offload cls_flow = {};
        struct nft_expr *expr, *last;
        int ret;

        ret = nft_flow_offload_cmd(chain, rule, NULL, FLOW_CLS_STATS,
                                   &cls_flow);
        if (ret < 0)
                return ret;

        nft_rule_for_each_expr(expr, last, rule) {
                if (!expr->ops->offload_stats)
                        continue;

                expr->ops->offload_stats(expr, &cls_flow.stats);
        }

        return 0;
}

/*
 * Bind step: splice the callbacks collected on @bo->cb_list into the flow
 * block callback list of @basechain. Always returns 0.
 */
static int nft_flow_offload_bind(struct flow_block_offload *bo,
                                 struct nft_base_chain *basechain)
{
        list_splice(&bo->cb_list, &basechain->flow_block.cb_list);
        return 0;
}

/*
 * Unbind step: for every rule on @basechain send a FLOW_CLS_DESTROY
 * request to the callbacks on @bo->cb_list, then unlink and free each of
 * those callbacks.
 *
 * Results of the destroy requests are ignored. Always returns 0.
 */
static int nft_flow_offload_unbind(struct flow_block_offload *bo,
                                   struct nft_base_chain *basechain)
{
        struct flow_cls_offload cls_flow;
        struct flow_block_cb *cb, *tmp;
        struct netlink_ext_ack extack;
        struct nft_rule *rule;

        list_for_each_entry(rule, &basechain->chain.rules, list) {
                /* Start each request with a clean extack. */
                memset(&extack, 0, sizeof(extack));
                nft_flow_cls_offload_setup(&cls_flow, basechain, rule, NULL,
                                           &extack, FLOW_CLS_DESTROY);
                nft_setup_cb_call(TC_SETUP_CLSFLOWER, &cls_flow, &bo->cb_list);
        }

        list_for_each_entry_safe(cb, tmp, &bo->cb_list, list) {
                list_del(&cb->list);
                flow_block_cb_free(cb);
        }

        return 0;
}

/*
 * Dispatch @cmd to the bind or unbind handler.
 *
 * Any other command triggers a one-time warning and yields -EOPNOTSUPP.
 */
static int nft_block_setup(struct nft_base_chain *basechain,
                           struct flow_block_offload *bo,
                           enum flow_block_command cmd)
{
        switch (cmd) {
        case FLOW_BLOCK_BIND:
                return nft_flow_offload_bind(bo, basechain);
        case FLOW_BLOCK_UNBIND:
                return nft_flow_offload_unbind(bo, basechain);
        default:
                break;
        }

        WARN_ON_ONCE(1);
        return -EOPNOTSUPP;
}

/*
 * Initialise @bo for a block request with command @cmd on @basechain.
 *
 * The structure is zeroed first. The binder type is always
 * FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS, the block and cb_list_head refer
 * to the chain's flow block, and bo->cb_list starts out empty.
 */
static void nft_flow_block_offload_init(struct flow_block_offload *bo,
                                        struct net *net,
                                        enum flow_block_command cmd,
                                        struct nft_base_chain *basechain,
                                        struct netlink_ext_ack *extack)
{
        memset(bo, 0, sizeof(*bo));
        INIT_LIST_HEAD(&bo->cb_list);

        bo->net = net;
        bo->command = cmd;
        bo->extack = extack;
        bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;

        bo->block = &basechain->flow_block;
        bo->cb_list_head = &basechain->flow_block.cb_list;
}

/*
 * Send a TC_SETUP_BLOCK request with @cmd to @dev through its
 * ndo_setup_tc callback, then apply the bind/unbind bookkeeping.
 *
 * Returns the negative error from the driver, or the result of
 * nft_block_setup().
 */
static int nft_block_offload_cmd(struct nft_base_chain *chain,
                                 struct net_device *dev,
                                 enum flow_block_command cmd)
{
        struct netlink_ext_ack extack = {};
        struct flow_block_offload bo;
        int ret;

        nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack);

        ret = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo);
        if (ret >= 0)
                ret = nft_block_setup(chain, &bo, cmd);

        return ret;
}

/*
 * Cleanup callback handed to flow_indr_dev_setup_offload().
 *
 * Under the per-netns commit mutex, @block_cb is taken off its driver
 * list, moved onto a temporary unbind request and released through
 * nft_flow_offload_unbind(). The base chain and device come from the
 * callback's indr data.
 */
static void nft_indr_block_cleanup(struct flow_block_cb *block_cb)
{
        struct nft_base_chain *basechain = block_cb->indr.data;
        struct net *net = dev_net(block_cb->indr.dev);
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct netlink_ext_ack extack = {};
        struct flow_block_offload bo;

        nft_flow_block_offload_init(&bo, net, FLOW_BLOCK_UNBIND, basechain,
                                    &extack);

        mutex_lock(&nft_net->commit_mutex);
        list_del(&block_cb->driver_list);
        list_move(&block_cb->list, &bo.cb_list);
        nft_flow_offload_unbind(&bo, basechain);
        mutex_unlock(&nft_net->commit_mutex);
}

/*
 * Send a TC_SETUP_BLOCK request with @cmd for @dev through the indirect
 * flow block infrastructure, then apply the bind/unbind bookkeeping.
 *
 * Returns the negative error from flow_indr_dev_setup_offload(),
 * -EOPNOTSUPP if no callback was added to the request, or the result of
 * nft_block_setup().
 */
static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain,
                                      struct net_device *dev,
                                      enum flow_block_command cmd)
{
        struct netlink_ext_ack extack = {};
        struct flow_block_offload bo;
        int ret;

        nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack);

        ret = flow_indr_dev_setup_offload(dev, NULL, TC_SETUP_BLOCK, basechain,
                                          &bo, nft_indr_block_cleanup);
        if (ret < 0)
                return ret;

        /* Nobody registered a callback for this request. */
        if (list_empty(&bo.cb_list))
                return -EOPNOTSUPP;

        return nft_block_setup(basechain, &bo, cmd);
}

/*
 * Issue block command @cmd for @dev: directly when the device implements
 * ndo_setup_tc, through the indirect path otherwise.
 */
static int nft_chain_offload_cmd(struct nft_base_chain *basechain,
                                 struct net_device *dev,
                                 enum flow_block_command cmd)
{
        if (!dev->netdev_ops->ndo_setup_tc)
                return nft_indr_block_offload_cmd(basechain, dev, cmd);

        return nft_block_offload_cmd(basechain, dev, cmd);
}

/*
 * Issue @cmd for the hooks of @basechain.
 *
 * With a non-NULL @this_dev only hooks attached to that device are
 * processed; with NULL every hook is.
 *
 * A failing FLOW_BLOCK_BIND ends the walk. If @this_dev was given the
 * error is returned as is; otherwise the hooks handled before the failure
 * are unbound again first. Errors from any other command are ignored.
 *
 * Returns 0, or the negative error of the failing bind.
 */
static int nft_flow_block_chain(struct nft_base_chain *basechain,
                                const struct net_device *this_dev,
                                enum flow_block_command cmd)
{
        struct net_device *dev;
        struct nft_hook *hook;
        int err, i = 0;

        list_for_each_entry(hook, &basechain->hook_list, list) {
                dev = hook->ops.dev;
                if (this_dev && this_dev != dev)
                        continue;

                err = nft_chain_offload_cmd(basechain, dev, cmd);
                if (err < 0 && cmd == FLOW_BLOCK_BIND) {
                        if (!this_dev)
                                goto err_flow_block;

                        return err;
                }
                /* Number of hooks handled without bailing out. */
                i++;
        }

        return 0;

err_flow_block:
        /* Roll back: unbind the first i hooks, i.e. those handled before the failure. */
        list_for_each_entry(hook, &basechain->hook_list, list) {
                if (i-- <= 0)
                        break;

                dev = hook->ops.dev;
                nft_chain_offload_cmd(basechain, dev, FLOW_BLOCK_UNBIND);
        }
        return err;
}

/*
 * Issue block command @cmd for all hooks of @chain.
 *
 * The effective policy is *@ppolicy when @ppolicy is given, else the base
 * chain's current policy.
 *
 * Returns -EOPNOTSUPP if @chain is not a base chain or if a bind is
 * requested with an NF_DROP policy; otherwise the result of
 * nft_flow_block_chain().
 */
static int nft_flow_offload_chain(struct nft_chain *chain, u8 *ppolicy,
                                  enum flow_block_command cmd)
{
        struct nft_base_chain *basechain;
        u8 policy;

        if (!nft_is_base_chain(chain))
                return -EOPNOTSUPP;

        basechain = nft_base_chain(chain);

        if (ppolicy)
                policy = *ppolicy;
        else
                policy = basechain->policy;

        /* Only default policy to accept is supported for now. */
        if (cmd == FLOW_BLOCK_BIND && policy == NF_DROP)
                return -EOPNOTSUPP;

        return nft_flow_block_chain(basechain, NULL, cmd);
}

/*
 * Undo the offload operations of the transactions that precede @trans on
 * the commit list.
 *
 * The list is walked backwards, starting with the entry before @trans
 * (@trans itself is not revisited). For netdev-family transactions on
 * chains flagged NFT_CHAIN_HW_OFFLOAD the inverse operation is issued:
 *
 *   NFT_MSG_NEWCHAIN (not an update) -> FLOW_BLOCK_UNBIND
 *   NFT_MSG_DELCHAIN                 -> FLOW_BLOCK_BIND
 *   NFT_MSG_NEWRULE                  -> FLOW_CLS_DESTROY
 *   NFT_MSG_DELRULE                  -> FLOW_CLS_REPLACE with the
 *                                       transaction's flow rule
 *
 * The walk stops, with a one-time warning, at the first operation that
 * reports an error.
 */
static void nft_flow_rule_offload_abort(struct net *net,
                                        struct nft_trans *trans)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        int err = 0;

        list_for_each_entry_continue_reverse(trans, &nft_net->commit_list, list) {
                if (trans->table->family != NFPROTO_NETDEV)
                        continue;

                switch (trans->msg_type) {
                case NFT_MSG_NEWCHAIN:
                        if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
                            nft_trans_chain_update(trans))
                                continue;

                        err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
                                                     FLOW_BLOCK_UNBIND);
                        break;
                case NFT_MSG_DELCHAIN:
                        if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        err = nft_flow_offload_chain(nft_trans_chain(trans), NULL,
                                                     FLOW_BLOCK_BIND);
                        break;
                case NFT_MSG_NEWRULE:
                        if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
                                                    nft_trans_rule(trans),
                                                    NULL, FLOW_CLS_DESTROY);
                        break;
                case NFT_MSG_DELRULE:
                        if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
                                                    nft_trans_rule(trans),
                                                    nft_trans_flow_rule(trans),
                                                    FLOW_CLS_REPLACE);
                        break;
                }

                /* A failed undo cannot be recovered from here; warn and give up. */
                if (WARN_ON_ONCE(err))
                        break;
        }
}

/*
 * Apply the offload side of every pending transaction on @net's commit
 * list, in list order.
 *
 * Only netdev-family transactions on chains flagged NFT_CHAIN_HW_OFFLOAD
 * are considered:
 *
 *   NFT_MSG_NEWCHAIN (not an update) -> FLOW_BLOCK_BIND, using the policy
 *                                       carried by the transaction
 *   NFT_MSG_DELCHAIN                 -> FLOW_BLOCK_UNBIND
 *   NFT_MSG_NEWRULE                  -> FLOW_CLS_REPLACE with the
 *                                       transaction's flow rule; rejected
 *                                       with -EOPNOTSUPP when NLM_F_REPLACE
 *                                       is set or NLM_F_APPEND is not
 *   NFT_MSG_DELRULE                  -> FLOW_CLS_DESTROY
 *
 * On the first non-zero result the transactions already processed are
 * rolled back through nft_flow_rule_offload_abort() and the walk stops.
 *
 * Returns 0 on success or the error of the failing operation.
 */
int nft_flow_rule_offload_commit(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans;
        int err = 0;
        u8 policy;

        list_for_each_entry(trans, &nft_net->commit_list, list) {
                if (trans->table->family != NFPROTO_NETDEV)
                        continue;

                switch (trans->msg_type) {
                case NFT_MSG_NEWCHAIN:
                        if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD) ||
                            nft_trans_chain_update(trans))
                                continue;

                        policy = nft_trans_chain_policy(trans);
                        err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
                                                     FLOW_BLOCK_BIND);
                        break;
                case NFT_MSG_DELCHAIN:
                        if (!(nft_trans_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        policy = nft_trans_chain_policy(trans);
                        err = nft_flow_offload_chain(nft_trans_chain(trans), &policy,
                                                     FLOW_BLOCK_UNBIND);
                        break;
                case NFT_MSG_NEWRULE:
                        if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        /* Only plain appends are accepted for offloaded chains. */
                        if (trans->flags & NLM_F_REPLACE ||
                            !(trans->flags & NLM_F_APPEND)) {
                                err = -EOPNOTSUPP;
                                break;
                        }
                        err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
                                                    nft_trans_rule(trans),
                                                    nft_trans_flow_rule(trans),
                                                    FLOW_CLS_REPLACE);
                        break;
                case NFT_MSG_DELRULE:
                        if (!(nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        err = nft_flow_offload_rule(nft_trans_rule_chain(trans),
                                                    nft_trans_rule(trans),
                                                    NULL, FLOW_CLS_DESTROY);
                        break;
                }

                /* Undo everything that came before the failing transaction. */
                if (err) {
                        nft_flow_rule_offload_abort(net, trans);
                        break;
                }
        }

        return err;
}

/*
 * Find the first hardware-offloaded base chain in a netdev-family table of
 * @nft_net that has a hook attached to @dev.
 *
 * Returns the chain, or NULL if there is none.
 */
static struct nft_chain *__nft_offload_get_chain(const struct nftables_pernet *nft_net,
                                                 struct net_device *dev)
{
        struct nft_base_chain *basechain;
        const struct nft_table *table;
        struct nft_chain *chain;
        struct nft_hook *hook;

        list_for_each_entry(table, &nft_net->tables, list) {
                if (table->family != NFPROTO_NETDEV)
                        continue;

                list_for_each_entry(chain, &table->chains, list) {
                        if (!nft_is_base_chain(chain))
                                continue;
                        if (!(chain->flags & NFT_CHAIN_HW_OFFLOAD))
                                continue;

                        basechain = nft_base_chain(chain);
                        list_for_each_entry(hook, &basechain->hook_list, list) {
                                if (hook->ops.dev == dev)
                                        return chain;
                        }
                }
        }

        return NULL;
}

/*
 * Netdevice notifier callback.
 *
 * On NETDEV_UNREGISTER, and under the per-netns commit mutex, the
 * offloaded chain hooked to the departing device (if any) gets its flow
 * block unbound for that device. All other events are ignored.
 *
 * Always returns NOTIFY_DONE.
 */
static int nft_offload_netdev_event(struct notifier_block *this,
                                    unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct nftables_pernet *nft_net;
        struct nft_chain *chain;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        nft_net = nft_pernet(dev_net(dev));

        mutex_lock(&nft_net->commit_mutex);

        chain = __nft_offload_get_chain(nft_net, dev);
        if (chain)
                nft_flow_block_chain(nft_base_chain(chain), dev,
                                     FLOW_BLOCK_UNBIND);

        mutex_unlock(&nft_net->commit_mutex);

        return NOTIFY_DONE;
}

/*
 * Notifier block routing netdevice events to nft_offload_netdev_event().
 * Registered in nft_offload_init() and removed in nft_offload_exit().
 */
static struct notifier_block nft_offload_netdev_notifier = {
        .notifier_call        = nft_offload_netdev_event,
};

/*
 * Register the offload netdevice notifier. Returns the result of
 * register_netdevice_notifier().
 */
int nft_offload_init(void)
{
        return register_netdevice_notifier(&nft_offload_netdev_notifier);
}

/* Unregister the offload netdevice notifier set up by nft_offload_init(). */
void nft_offload_exit(void)
{
        unregister_netdevice_notifier(&nft_offload_netdev_notifier);
}













































































  130 


  124 














   14 











    9 







  129 






  124 



    3 


    2 


   90 







    2 


   39 


   13 


   39 


    2 


    2 


    2 


    1 


    1 


    1 




    2 



    1 


    2 





















    2 




    3 


   38 
















    2 




    2 





   82 




   66 

   12 

    1 








    5 




    2 





    3 










   54 






















   16 
   16 
















    1 








    1 

    1 

    1 













    1 









    2 



    1 


















    1 










































    7 





























    4 


    1 





    2 













    1 


















    2 



    3 


    1 


















    3 



    3 













    1 




    1 





    8 



    8 



    8 


    8 






    8 



    7 






    8 
    5 

    5 


























    9 













    5 
    5 













    5 



    5 








    4 






    7 



    1 













    4 



    1 






    1 




    1 

    1 




    1 

    1 





    1 














    8 



    8 















    5 







    5 









    5 


    5 





   86 

    1 


   10 
   43 



    6 


    4 
    4 

   22 



  162 

    1 


   28 
   86 



    6 


    5 
    5 

   80 




    1 





    1 





    3 



    2 
    1 












    8 




    2 



    1 
    3 



    2 


    3 
    4 



































    1 




    1 





    1 















    4 
    4 

    1 




    1 






    1 
    1 






   37 


    1 

   25 




    4 


    7 











   10 


    1 

    3 


    3 


    3 











    6 


    1 

    3 


    1 


    1 











    2 







    2 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/guest.c:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bits.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/nospec.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <kvm/arm_hypercalls.h>
#include <asm/cputype.h>
#include <linux/uaccess.h>
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>
#include <asm/sigcontext.h>

#include "trace.h"

/*
 * Descriptor table for the VM-scope statistics. Only the generic entries
 * expanded from KVM_GENERIC_VM_STATS() are listed; nothing is added on top
 * of them here.
 */
const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
        KVM_GENERIC_VM_STATS()
};

/*
 * Header for the VM statistics. The offsets place the sections back to
 * back: the header itself, then KVM_STATS_NAME_SIZE bytes for the id, then
 * the kvm_vm_stats_desc table, then the data.
 */
const struct kvm_stats_header kvm_vm_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
        .num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
        .id_offset =  sizeof(struct kvm_stats_header),
        .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
        .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
                       sizeof(kvm_vm_stats_desc),
};

/*
 * Descriptor table for the vCPU-scope statistics: the generic entries from
 * KVM_GENERIC_VCPU_STATS() followed by the per-vCPU exit counters declared
 * below.
 */
const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
        KVM_GENERIC_VCPU_STATS(),
        STATS_DESC_COUNTER(VCPU, hvc_exit_stat),
        STATS_DESC_COUNTER(VCPU, wfe_exit_stat),
        STATS_DESC_COUNTER(VCPU, wfi_exit_stat),
        STATS_DESC_COUNTER(VCPU, mmio_exit_user),
        STATS_DESC_COUNTER(VCPU, mmio_exit_kernel),
        STATS_DESC_COUNTER(VCPU, signal_exits),
        STATS_DESC_COUNTER(VCPU, exits)
};

/*
 * Header for the vCPU statistics. The offsets place the sections back to
 * back: the header itself, then KVM_STATS_NAME_SIZE bytes for the id, then
 * the kvm_vcpu_stats_desc table, then the data.
 */
const struct kvm_stats_header kvm_vcpu_stats_header = {
        .name_size = KVM_STATS_NAME_SIZE,
        .num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
        .id_offset = sizeof(struct kvm_stats_header),
        .desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
        .data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
                       sizeof(kvm_vcpu_stats_desc),
};

/*
 * Tell whether core register index @off lies inside the fp_regs.vregs
 * array, i.e. at or after its first word and before fp_regs.fpsr.
 */
static bool core_reg_offset_is_vreg(u64 off)
{
        const u64 vregs_first = KVM_REG_ARM_CORE_REG(fp_regs.vregs);
        const u64 vregs_end = KVM_REG_ARM_CORE_REG(fp_regs.fpsr);

        if (off < vregs_first)
                return false;

        return off < vregs_end;
}

/*
 * Strip the architecture, size and core-register class bits from a
 * register ID, leaving the index into struct kvm_regs.
 */
static u64 core_reg_offset_from_id(u64 id)
{
        const u64 non_offset_bits = KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK |
                                    KVM_REG_ARM_CORE;

        return id & ~non_offset_bits;
}

/*
 * Look up the size, in bytes, of the core register that starts at index
 * @off (counted in 32-bit words) of struct kvm_regs.
 *
 * Returns the register size, or -EINVAL when @off does not name a known
 * register, is not aligned to the register's own size, or names an FPSIMD
 * V-register on a vcpu that has SVE.
 */
static int core_reg_size_from_offset(const struct kvm_vcpu *vcpu, u64 off)
{
        int nbytes;

        switch (off) {
        case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
        case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
                nbytes = sizeof(__u32);
                break;

        case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
             KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
                nbytes = sizeof(__uint128_t);
                break;

        case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
             KVM_REG_ARM_CORE_REG(regs.regs[30]):
        case KVM_REG_ARM_CORE_REG(regs.sp):
        case KVM_REG_ARM_CORE_REG(regs.pc):
        case KVM_REG_ARM_CORE_REG(regs.pstate):
        case KVM_REG_ARM_CORE_REG(sp_el1):
        case KVM_REG_ARM_CORE_REG(elr_el1):
        case KVM_REG_ARM_CORE_REG(spsr[0]) ...
             KVM_REG_ARM_CORE_REG(spsr[KVM_NR_SPSR - 1]):
                nbytes = sizeof(__u64);
                break;

        default:
                return -EINVAL;
        }

        /* The index must be a multiple of the register size in words. */
        if (!IS_ALIGNED(off, nbytes / sizeof(__u32)))
                return -EINVAL;

        /*
         * On SVE-enabled vcpus the FPSIMD V-registers are only reachable
         * through the KVM_REG_ARM64_SVE registers, never through
         * KVM_REG_ARM_CORE.
         */
        if (vcpu_has_sve(vcpu) && core_reg_offset_is_vreg(off))
                return -EINVAL;

        return nbytes;
}

/*
 * Translate a core register ID into the address of its backing storage in
 * the vcpu context.
 *
 * Returns NULL when the index encoded in @reg->id is not a valid core
 * register for this vcpu, or when the size encoded in the ID does not match
 * the register's actual size.
 */
static void *core_reg_addr(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        u64 idx = core_reg_offset_from_id(reg->id);
        int expected = core_reg_size_from_offset(vcpu, idx);

        if (expected < 0 || KVM_REG_SIZE(reg->id) != expected)
                return NULL;

        switch (idx) {
        case KVM_REG_ARM_CORE_REG(regs.regs[0]) ...
             KVM_REG_ARM_CORE_REG(regs.regs[30]):
                /* Each 64-bit register covers two 32-bit index slots. */
                idx = (idx - KVM_REG_ARM_CORE_REG(regs.regs[0])) / 2;
                return &vcpu->arch.ctxt.regs.regs[idx];

        case KVM_REG_ARM_CORE_REG(regs.sp):
                return &vcpu->arch.ctxt.regs.sp;

        case KVM_REG_ARM_CORE_REG(regs.pc):
                return &vcpu->arch.ctxt.regs.pc;

        case KVM_REG_ARM_CORE_REG(regs.pstate):
                return &vcpu->arch.ctxt.regs.pstate;

        case KVM_REG_ARM_CORE_REG(sp_el1):
                return __ctxt_sys_reg(&vcpu->arch.ctxt, SP_EL1);

        case KVM_REG_ARM_CORE_REG(elr_el1):
                return __ctxt_sys_reg(&vcpu->arch.ctxt, ELR_EL1);

        case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_EL1]):
                return __ctxt_sys_reg(&vcpu->arch.ctxt, SPSR_EL1);

        case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_ABT]):
                return &vcpu->arch.ctxt.spsr_abt;

        case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_UND]):
                return &vcpu->arch.ctxt.spsr_und;

        case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_IRQ]):
                return &vcpu->arch.ctxt.spsr_irq;

        case KVM_REG_ARM_CORE_REG(spsr[KVM_SPSR_FIQ]):
                return &vcpu->arch.ctxt.spsr_fiq;

        case KVM_REG_ARM_CORE_REG(fp_regs.vregs[0]) ...
             KVM_REG_ARM_CORE_REG(fp_regs.vregs[31]):
                /* Each 128-bit register covers four 32-bit index slots. */
                idx = (idx - KVM_REG_ARM_CORE_REG(fp_regs.vregs[0])) / 4;
                return &vcpu->arch.ctxt.fp_regs.vregs[idx];

        case KVM_REG_ARM_CORE_REG(fp_regs.fpsr):
                return &vcpu->arch.ctxt.fp_regs.fpsr;

        case KVM_REG_ARM_CORE_REG(fp_regs.fpcr):
                return &vcpu->arch.ctxt.fp_regs.fpcr;

        default:
                return NULL;
        }
}

static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        /*
         * Because the kvm_regs structure is a mix of 32, 64 and
         * 128bit fields, we index it as if it was a 32bit
         * array. Hence below, nr_regs is the number of entries, and
         * off the index in the "array".
         */
        __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr;
        int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32);
        void *addr;
        u32 off;

        /* Our ID is an index into the kvm_regs struct. */
        off = core_reg_offset_from_id(reg->id);
        if (off >= nr_regs ||
            (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
                return -ENOENT;

        addr = core_reg_addr(vcpu, reg);
        if (!addr)
                return -EINVAL;

        if (copy_to_user(uaddr, addr, KVM_REG_SIZE(reg->id)))
                return -EFAULT;

        return 0;
}

static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        __u32 __user *uaddr = (__u32 __user *)(unsigned long)reg->addr;
        int nr_regs = sizeof(struct kvm_regs) / sizeof(__u32);
        __uint128_t tmp;
        void *valp = &tmp, *addr;
        u64 off;
        int err = 0;

        /* Our ID is an index into the kvm_regs struct. */
        off = core_reg_offset_from_id(reg->id);
        if (off >= nr_regs ||
            (off + (KVM_REG_SIZE(reg->id) / sizeof(__u32))) >= nr_regs)
                return -ENOENT;

        addr = core_reg_addr(vcpu, reg);
        if (!addr)
                return -EINVAL;

        if (KVM_REG_SIZE(reg->id) > sizeof(tmp))
                return -EINVAL;

        if (copy_from_user(valp, uaddr, KVM_REG_SIZE(reg->id))) {
                err = -EFAULT;
                goto out;
        }

        if (off == KVM_REG_ARM_CORE_REG(regs.pstate)) {
                u64 mode = (*(u64 *)valp) & PSR_AA32_MODE_MASK;
                switch (mode) {
                case PSR_AA32_MODE_USR:
                        if (!kvm_supports_32bit_el0())
                                return -EINVAL;
                        break;
                case PSR_AA32_MODE_FIQ:
                case PSR_AA32_MODE_IRQ:
                case PSR_AA32_MODE_SVC:
                case PSR_AA32_MODE_ABT:
                case PSR_AA32_MODE_UND:
                case PSR_AA32_MODE_SYS:
                        if (!vcpu_el1_is_32bit(vcpu))
                                return -EINVAL;
                        break;
                case PSR_MODE_EL2h:
                case PSR_MODE_EL2t:
                        if (!vcpu_has_nv(vcpu))
                                return -EINVAL;
                        fallthrough;
                case PSR_MODE_EL0t:
                case PSR_MODE_EL1t:
                case PSR_MODE_EL1h:
                        if (vcpu_el1_is_32bit(vcpu))
                                return -EINVAL;
                        break;
                default:
                        err = -EINVAL;
                        goto out;
                }
        }

        memcpy(addr, valp, KVM_REG_SIZE(reg->id));

        if (*vcpu_cpsr(vcpu) & PSR_MODE32_BIT) {
                int i, nr_reg;

                switch (*vcpu_cpsr(vcpu) & PSR_AA32_MODE_MASK) {
                /*
                 * Either we are dealing with user mode, and only the
                 * first 15 registers (+ PC) must be narrowed to 32bit.
                 * AArch32 r0-r14 conveniently map to AArch64 x0-x14.
                 */
                case PSR_AA32_MODE_USR:
                case PSR_AA32_MODE_SYS:
                        nr_reg = 15;
                        break;

                /*
                 * Otherwise, this is a privileged mode, and *all* the
                 * registers must be narrowed to 32bit.
                 */
                default:
                        nr_reg = 31;
                        break;
                }

                for (i = 0; i < nr_reg; i++)
                        vcpu_set_reg(vcpu, i, (u32)vcpu_get_reg(vcpu, i));

                *vcpu_pc(vcpu) = (u32)*vcpu_pc(vcpu);
        }
out:
        return err;
}

/*
 * Helpers for a bitmap of vector-length quadword counts held in an array
 * of 64-bit words, where bit 0 of word 0 stands for SVE_VQ_MIN:
 *   vq_word()    - index of the word holding the bit for @vq
 *   vq_mask()    - mask selecting the bit for @vq within that word
 *   vq_present() - 1 if the bit for @vq is set in @vqs, else 0
 */
#define vq_word(vq) (((vq) - SVE_VQ_MIN) / 64)
#define vq_mask(vq) ((u64)1 << ((vq) - SVE_VQ_MIN) % 64)
#define vq_present(vqs, vq) (!!((vqs)[vq_word(vq)] & vq_mask(vq)))

/*
 * Report the set of vector lengths usable by the vcpu as a bitmap written
 * to the user buffer at @reg->addr. A bit is set for every vq from
 * SVE_VQ_MIN up to the vcpu's maximum for which sve_vq_available() is true.
 *
 * Returns 0 on success, -ENOENT when the vcpu has no SVE, -EINVAL when the
 * vcpu's maximum vector length is not valid, and -EFAULT when the copy to
 * userspace fails.
 */
static int get_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        u64 bitmap[KVM_ARM64_SVE_VLS_WORDS];
        unsigned int top_vq;
        unsigned int vq;

        if (!vcpu_has_sve(vcpu))
                return -ENOENT;

        if (WARN_ON(!sve_vl_valid(vcpu->arch.sve_max_vl)))
                return -EINVAL;

        memset(bitmap, 0, sizeof(bitmap));

        top_vq = vcpu_sve_max_vq(vcpu);
        for (vq = SVE_VQ_MIN; vq <= top_vq; vq++) {
                if (!sve_vq_available(vq))
                        continue;

                bitmap[vq_word(vq)] |= vq_mask(vq);
        }

        if (copy_to_user((void __user *)reg->addr, bitmap, sizeof(bitmap)))
                return -EFAULT;

        return 0;
}

/*
 * Configure the vcpu's maximum SVE vector length from the bitmap of
 * requested vector lengths found at @reg->addr.
 *
 * Returns 0 on success, -ENOENT when the vcpu has no SVE, -EPERM once SVE
 * has been finalized, -EFAULT when the copy from userspace fails, and
 * -EINVAL when sve_state is unexpectedly already set or the requested set
 * cannot be honoured.
 */
static int set_sve_vls(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        u64 requested[KVM_ARM64_SVE_VLS_WORDS];
        unsigned int top_vq = 0;
        unsigned int vq;

        if (!vcpu_has_sve(vcpu))
                return -ENOENT;

        if (kvm_arm_vcpu_sve_finalized(vcpu))
                return -EPERM; /* too late! */

        if (WARN_ON(vcpu->arch.sve_state))
                return -EINVAL;

        if (copy_from_user(requested, (const void __user *)reg->addr,
                           sizeof(requested)))
                return -EFAULT;

        /* The largest vq whose bit is set becomes the requested maximum. */
        for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
                if (vq_present(requested, vq))
                        top_vq = vq;
        }

        if (top_vq > sve_vq_from_vl(kvm_sve_max_vl))
                return -EINVAL;

        /*
         * Vector lengths supported by the host can't currently be
         * hidden from the guest individually: instead we can only set a
         * maximum via ZCR_EL2.LEN.  So, make sure the available vector
         * lengths match the set requested exactly up to the requested
         * maximum:
         */
        for (vq = SVE_VQ_MIN; vq <= top_vq; vq++) {
                if (vq_present(requested, vq) != sve_vq_available(vq))
                        return -EINVAL;
        }

        /* Can't run with no vector lengths at all: */
        if (top_vq < SVE_VQ_MIN)
                return -EINVAL;

        /* vcpu->arch.sve_state will be alloc'd by kvm_vcpu_finalize_sve() */
        vcpu->arch.sve_max_vl = sve_vl_from_vq(top_vq);

        return 0;
}

/*
 * Layout of the low bits of an SVE register ID: a slice index in the
 * bottom SVE_REG_SLICE_BITS bits, with the register number in the
 * SVE_REG_ID_BITS bits directly above it.
 */
#define SVE_REG_SLICE_SHIFT        0
#define SVE_REG_SLICE_BITS        5
#define SVE_REG_ID_SHIFT        (SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS)
#define SVE_REG_ID_BITS                5

/* Masks selecting the slice index and the register number, respectively */
#define SVE_REG_SLICE_MASK                                        \
        GENMASK(SVE_REG_SLICE_SHIFT + SVE_REG_SLICE_BITS - 1,        \
                SVE_REG_SLICE_SHIFT)
#define SVE_REG_ID_MASK                                                        \
        GENMASK(SVE_REG_ID_SHIFT + SVE_REG_ID_BITS - 1, SVE_REG_ID_SHIFT)

/* Number of slice indices that fit in the slice field of an ID */
#define SVE_NUM_SLICES (1 << SVE_REG_SLICE_BITS)

/* Access sizes encoded in the Z- and P-register IDs */
#define KVM_SVE_ZREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_ZREG(0, 0))
#define KVM_SVE_PREG_SIZE KVM_REG_SIZE(KVM_REG_ARM64_SVE_PREG(0, 0))

/*
 * Number of register slices required to cover each whole SVE register.
 * NOTE: Only the first slice ever exists, for now.
 * If you are tempted to modify this, you must also rework sve_reg_to_region()
 * to match:
 */
#define vcpu_sve_slices(vcpu) 1

/*
 * Bounds of a single SVE register slice within vcpu->arch.sve_state.
 * Filled in by sve_reg_to_region(); klen bytes are copied to or from kernel
 * memory at koffset, and upad is the remainder of the user-visible register
 * size that has no kernel backing.
 */
struct sve_state_reg_region {
        unsigned int koffset;        /* offset into sve_state in kernel memory */
        unsigned int klen;        /* length in kernel memory */
        unsigned int upad;        /* extra trailing padding in user memory */
};

/*
 * Check an SVE register ID and work out which part of
 * vcpu->arch.sve_state a user/kernel copy for it may touch.
 *
 * On success @region holds the sanitised kernel offset and length plus the
 * amount of trailing user-side padding, and 0 is returned. Returns -EINVAL
 * when the ID is neither a Z-register nor a P-register/FFR ID (or the SVE
 * state size is unexpectedly zero), and -ENOENT when the vcpu has no SVE or
 * the ID names any slice other than slice 0.
 */
static int sve_reg_to_region(struct sve_state_reg_region *region,
                             struct kvm_vcpu *vcpu,
                             const struct kvm_one_reg *reg)
{
        /* ID range covering every slice of every Z-register */
        const u64 zreg_id_min = KVM_REG_ARM64_SVE_ZREG(0, 0);
        const u64 zreg_id_max = KVM_REG_ARM64_SVE_ZREG(SVE_NUM_ZREGS - 1,
                                                       SVE_NUM_SLICES - 1);

        /* ID range covering the P-registers and FFR, which are contiguous */
        const u64 preg_id_min = KVM_REG_ARM64_SVE_PREG(0, 0);
        const u64 preg_id_max = KVM_REG_ARM64_SVE_FFR(SVE_NUM_SLICES - 1);

        const u64 last_preg_id = KVM_REG_ARM64_SVE_PREG(SVE_NUM_PREGS - 1,
                                                        SVE_NUM_SLICES - 1);

        bool is_zreg, is_preg;
        unsigned int vq;
        unsigned int reg_num;
        unsigned int reqoffset, reqlen; /* what userspace asked for */
        unsigned int maxlen;            /* what the vcpu can back */
        size_t sve_state_size;

        /* Verify that the P-regs and FFR really do have contiguous IDs: */
        BUILD_BUG_ON(KVM_REG_ARM64_SVE_FFR(0) != last_preg_id + 1);

        /* Verify that we match the UAPI header: */
        BUILD_BUG_ON(SVE_NUM_SLICES != KVM_ARM64_SVE_MAX_SLICES);

        is_zreg = reg->id >= zreg_id_min && reg->id <= zreg_id_max;
        is_preg = reg->id >= preg_id_min && reg->id <= preg_id_max;
        if (!is_zreg && !is_preg)
                return -EINVAL;

        /* Only slice 0 exists, and only on vcpus that have SVE. */
        if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0)
                return -ENOENT;

        vq = vcpu_sve_max_vq(vcpu);
        reg_num = (reg->id & SVE_REG_ID_MASK) >> SVE_REG_ID_SHIFT;

        if (is_zreg) {
                reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
                reqlen = KVM_SVE_ZREG_SIZE;
                maxlen = SVE_SIG_ZREG_SIZE(vq);
        } else {
                reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) -
                                SVE_SIG_REGS_OFFSET;
                reqlen = KVM_SVE_PREG_SIZE;
                maxlen = SVE_SIG_PREG_SIZE(vq);
        }

        sve_state_size = vcpu_sve_state_size(vcpu);
        if (WARN_ON(!sve_state_size))
                return -EINVAL;

        region->koffset = array_index_nospec(reqoffset, sve_state_size);
        region->klen = min(maxlen, reqlen);
        region->upad = reqlen - region->klen;

        return 0;
}

static int get_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        int ret;
        struct sve_state_reg_region region;
        char __user *uptr = (char __user *)reg->addr;

        /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */
        if (reg->id == KVM_REG_ARM64_SVE_VLS)
                return get_sve_vls(vcpu, reg);

        /* Try to interpret reg ID as an architectural SVE register... */
        ret = sve_reg_to_region(&region, vcpu, reg);
        if (ret)
                return ret;

        if (!kvm_arm_vcpu_sve_finalized(vcpu))
                return -EPERM;

        if (copy_to_user(uptr, vcpu->arch.sve_state + region.koffset,
                         region.klen) ||
            clear_user(uptr + region.klen, region.upad))
                return -EFAULT;

        return 0;
}

static int set_sve_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        int ret;
        struct sve_state_reg_region region;
        const char __user *uptr = (const char __user *)reg->addr;

        /* Handle the KVM_REG_ARM64_SVE_VLS pseudo-reg as a special case: */
        if (reg->id == KVM_REG_ARM64_SVE_VLS)
                return set_sve_vls(vcpu, reg);

        /* Try to interpret reg ID as an architectural SVE register... */
        ret = sve_reg_to_region(&region, vcpu, reg);
        if (ret)
                return ret;

        if (!kvm_arm_vcpu_sve_finalized(vcpu))
                return -EPERM;

        if (copy_from_user(vcpu->arch.sve_state + region.koffset, uptr,
                           region.klen))
                return -EFAULT;

        return 0;
}

/*
 * Bulk register read hook. Not supported by this implementation: it
 * touches neither @vcpu nor @regs and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
        return -EINVAL;
}

/*
 * Bulk register write hook. Not supported by this implementation: it
 * touches neither @vcpu nor @regs and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
{
        return -EINVAL;
}

/*
 * Enumerate the core register IDs valid for @vcpu. Every 32-bit word index
 * of struct kvm_regs is probed; indices that start a register yield an ID
 * with the matching size encoding.
 *
 * When @uindices is non-NULL each ID is stored to consecutive user slots;
 * when it is NULL the registers are only counted.
 *
 * Returns the number of registers found, or -EFAULT when a user store
 * fails.
 */
static int copy_core_reg_indices(const struct kvm_vcpu *vcpu,
                                 u64 __user *uindices)
{
        unsigned int word;
        int count = 0;

        for (word = 0; word < sizeof(struct kvm_regs) / sizeof(__u32); word++) {
                u64 id = KVM_REG_ARM64 | KVM_REG_ARM_CORE | word;
                int size = core_reg_size_from_offset(vcpu, word);

                /* Not the start of a register usable on this vcpu. */
                if (size < 0)
                        continue;

                if (size == sizeof(__u32)) {
                        id |= KVM_REG_SIZE_U32;
                } else if (size == sizeof(__u64)) {
                        id |= KVM_REG_SIZE_U64;
                } else if (size == sizeof(__uint128_t)) {
                        id |= KVM_REG_SIZE_U128;
                } else {
                        WARN_ON(1);
                        continue;
                }

                if (uindices && put_user(id, &uindices[count]))
                        return -EFAULT;

                count++;
        }

        return count;
}

/*
 * Count the core registers exposed for @vcpu by running the enumeration
 * in copy_core_reg_indices() without a destination buffer.
 */
static unsigned long num_core_regs(const struct kvm_vcpu *vcpu)
{
        int count = copy_core_reg_indices(vcpu, NULL);

        return count;
}

/*
 * IDs of the timer registers, in the order copy_timer_indices() reports
 * them. is_timer_reg() recognises the same set of IDs.
 */
static const u64 timer_reg_list[] = {
        KVM_REG_ARM_TIMER_CTL,
        KVM_REG_ARM_TIMER_CNT,
        KVM_REG_ARM_TIMER_CVAL,
        KVM_REG_ARM_PTIMER_CTL,
        KVM_REG_ARM_PTIMER_CNT,
        KVM_REG_ARM_PTIMER_CVAL,
};

/* Number of entries in timer_reg_list */
#define NUM_TIMER_REGS ARRAY_SIZE(timer_reg_list)

/* Tell whether @index is the ID of one of the six timer registers. */
static bool is_timer_reg(u64 index)
{
        return index == KVM_REG_ARM_TIMER_CTL ||
               index == KVM_REG_ARM_TIMER_CNT ||
               index == KVM_REG_ARM_TIMER_CVAL ||
               index == KVM_REG_ARM_PTIMER_CTL ||
               index == KVM_REG_ARM_PTIMER_CNT ||
               index == KVM_REG_ARM_PTIMER_CVAL;
}

/*
 * Store every ID from timer_reg_list to consecutive user slots starting at
 * @uindices. Returns 0 on success or -EFAULT when a user store fails.
 */
static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
        unsigned int slot;

        for (slot = 0; slot < NUM_TIMER_REGS; slot++, uindices++) {
                if (put_user(timer_reg_list[slot], uindices))
                        return -EFAULT;
        }

        return 0;
}

static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        void __user *uaddr = (void __user *)(long)reg->addr;
        u64 val;
        int ret;

        ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id));
        if (ret != 0)
                return -EFAULT;

        return kvm_arm_timer_set_reg(vcpu, reg->id, val);
}

static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        void __user *uaddr = (void __user *)(long)reg->addr;
        u64 val;

        val = kvm_arm_timer_get_reg(vcpu, reg->id);
        return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
}

/*
 * Number of SVE register IDs exposed for @vcpu: zero without SVE,
 * otherwise every Z-register, P-register and FFR for each slice, plus the
 * KVM_REG_ARM64_SVE_VLS pseudo-register.
 */
static unsigned long num_sve_regs(const struct kvm_vcpu *vcpu)
{
        const unsigned int slices = vcpu_sve_slices(vcpu);
        const unsigned int per_slice = SVE_NUM_PREGS + SVE_NUM_ZREGS +
                                       1 /* FFR */;

        if (!vcpu_has_sve(vcpu))
                return 0;

        /* Policed by KVM_GET_REG_LIST: */
        WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu));

        return slices * per_slice + 1; /* + KVM_REG_ARM64_SVE_VLS */
}

/*
 * Store the SVE register IDs for @vcpu to consecutive user slots starting
 * at @uindices: KVM_REG_ARM64_SVE_VLS first, then for each slice the
 * Z-registers, the P-registers and FFR.
 *
 * Returns the number of IDs stored (0 when the vcpu has no SVE), or
 * -EFAULT when a user store fails.
 */
static int copy_sve_reg_indices(const struct kvm_vcpu *vcpu,
                                u64 __user *uindices)
{
        const unsigned int slices = vcpu_sve_slices(vcpu);
        unsigned int slice, regno;
        int stored = 0;
        u64 id;

        if (!vcpu_has_sve(vcpu))
                return 0;

        /* Policed by KVM_GET_REG_LIST: */
        WARN_ON(!kvm_arm_vcpu_sve_finalized(vcpu));

        /*
         * Enumerate this first, so that userspace can save/restore in
         * the order reported by KVM_GET_REG_LIST:
         */
        id = KVM_REG_ARM64_SVE_VLS;
        if (put_user(id, &uindices[stored]))
                return -EFAULT;
        stored++;

        for (slice = 0; slice < slices; slice++) {
                for (regno = 0; regno < SVE_NUM_ZREGS; regno++) {
                        id = KVM_REG_ARM64_SVE_ZREG(regno, slice);
                        if (put_user(id, &uindices[stored]))
                                return -EFAULT;
                        stored++;
                }

                for (regno = 0; regno < SVE_NUM_PREGS; regno++) {
                        id = KVM_REG_ARM64_SVE_PREG(regno, slice);
                        if (put_user(id, &uindices[stored]))
                                return -EFAULT;
                        stored++;
                }

                id = KVM_REG_ARM64_SVE_FFR(slice);
                if (put_user(id, &uindices[stored]))
                        return -EFAULT;
                stored++;
        }

        return stored;
}

/**
 * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG
 * @vcpu: the vCPU pointer
 *
 * Sums the core, SVE, system, firmware and timer register counts, so the
 * result covers all registers.
 */
unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
{
        unsigned long total;

        total = num_core_regs(vcpu);
        total += num_sve_regs(vcpu);
        total += kvm_arm_num_sys_reg_descs(vcpu);
        total += kvm_arm_get_fw_num_regs(vcpu);

        return total + NUM_TIMER_REGS;
}

/**
 * kvm_arm_copy_reg_indices - get indices of all registers.
 * @vcpu: the vCPU pointer
 * @uindices: register list to copy
 *
 * The list is written in this order: core registers, SVE registers,
 * firmware registers, timer registers and finally the system registers.
 * The first negative result from any stage is returned immediately;
 * otherwise the result of the system register stage is returned.
 */
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
        int rc;

        /* Core and SVE stages report how many slots they consumed. */
        rc = copy_core_reg_indices(vcpu, uindices);
        if (rc < 0)
                return rc;
        uindices += rc;

        rc = copy_sve_reg_indices(vcpu, uindices);
        if (rc < 0)
                return rc;
        uindices += rc;

        /* Firmware and timer stages consume a count obtained separately. */
        rc = kvm_arm_copy_fw_reg_indices(vcpu, uindices);
        if (rc < 0)
                return rc;
        uindices += kvm_arm_get_fw_num_regs(vcpu);

        rc = copy_timer_indices(vcpu, uindices);
        if (rc < 0)
                return rc;
        uindices += NUM_TIMER_REGS;

        return kvm_arm_copy_sys_reg_indices(vcpu, uindices);
}

int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        /* We currently use nothing arch-specific in upper 32 bits */
        if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32)
                return -EINVAL;

        switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
        case KVM_REG_ARM_CORE:        return get_core_reg(vcpu, reg);
        case KVM_REG_ARM_FW:
        case KVM_REG_ARM_FW_FEAT_BMAP:
                return kvm_arm_get_fw_reg(vcpu, reg);
        case KVM_REG_ARM64_SVE:        return get_sve_reg(vcpu, reg);
        }

        if (is_timer_reg(reg->id))
                return get_timer_reg(vcpu, reg);

        return kvm_arm_sys_reg_get_reg(vcpu, reg);
}

int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        /* We currently use nothing arch-specific in upper 32 bits */
        if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM64 >> 32)
                return -EINVAL;

        switch (reg->id & KVM_REG_ARM_COPROC_MASK) {
        case KVM_REG_ARM_CORE:        return set_core_reg(vcpu, reg);
        case KVM_REG_ARM_FW:
        case KVM_REG_ARM_FW_FEAT_BMAP:
                return kvm_arm_set_fw_reg(vcpu, reg);
        case KVM_REG_ARM64_SVE:        return set_sve_reg(vcpu, reg);
        }

        if (is_timer_reg(reg->id))
                return set_timer_reg(vcpu, reg);

        return kvm_arm_sys_reg_set_reg(vcpu, reg);
}

/*
 * Special-register read hook. Not supported by this implementation: it
 * touches neither @vcpu nor @sregs and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
{
        return -EINVAL;
}

/*
 * Special-register write hook. Not supported by this implementation: it
 * touches neither @vcpu nor @sregs and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs)
{
        return -EINVAL;
}

/*
 * Fill in the SError part of @events from the vcpu state: whether an
 * SError is pending (HCR_VSE set in hcr_el2), whether the host has the RAS
 * extension capability, and - only when both hold - the ESR value from
 * vcpu_get_vsesr(). Always returns 0.
 */
int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events)
{
        events->exception.serror_pending =
                (vcpu->arch.hcr_el2 & HCR_VSE) ? 1 : 0;
        events->exception.serror_has_esr =
                cpus_have_final_cap(ARM64_HAS_RAS_EXTN);

        /*
         * We never return a pending ext_dabt here because we deliver it to
         * the virtual CPU directly when setting the event and it's no longer
         * 'pending' at this point.
         */

        if (!events->exception.serror_pending)
                return 0;

        if (!events->exception.serror_has_esr)
                return 0;

        events->exception.serror_esr = vcpu_get_vsesr(vcpu);

        return 0;
}

/*
 * Apply the exception state described by @events to the vcpu.
 *
 * A pending SError with an ESR is set through kvm_set_sei_esr(); one
 * without an ESR is injected with kvm_inject_vabt(). A pending external
 * data abort is injected with kvm_inject_dabt() afterwards.
 *
 * Returns 0 on success, or -EINVAL when an ESR is supplied but the host
 * lacks the RAS extension capability or the ESR has bits set outside
 * ESR_ELx_ISS_MASK.
 */
int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events)
{
        bool serror_pending = events->exception.serror_pending;
        bool has_esr = events->exception.serror_has_esr;
        bool ext_dabt_pending = events->exception.ext_dabt_pending;

        if (serror_pending) {
                if (has_esr) {
                        if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
                                return -EINVAL;

                        /* Only the ISS field of the ESR may be populated. */
                        if ((events->exception.serror_esr) & ~ESR_ELx_ISS_MASK)
                                return -EINVAL;

                        kvm_set_sei_esr(vcpu, events->exception.serror_esr);
                } else {
                        kvm_inject_vabt(vcpu);
                }
        }

        if (ext_dabt_pending)
                kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));

        return 0;
}

/*
 * Map the CPU implementor and part number read from the CPU ID to a KVM
 * target. The implementor/part pairs listed below get their dedicated
 * target; everything else gets KVM_ARM_TARGET_GENERIC_V8.
 */
u32 __attribute_const__ kvm_target_cpu(void)
{
        unsigned long implementor = read_cpuid_implementor();
        unsigned long part_number = read_cpuid_part_number();

        if (implementor == ARM_CPU_IMP_ARM) {
                switch (part_number) {
                case ARM_CPU_PART_AEM_V8:
                        return KVM_ARM_TARGET_AEM_V8;
                case ARM_CPU_PART_FOUNDATION:
                        return KVM_ARM_TARGET_FOUNDATION_V8;
                case ARM_CPU_PART_CORTEX_A53:
                        return KVM_ARM_TARGET_CORTEX_A53;
                case ARM_CPU_PART_CORTEX_A57:
                        return KVM_ARM_TARGET_CORTEX_A57;
                }
        } else if (implementor == ARM_CPU_IMP_APM) {
                if (part_number == APM_CPU_PART_XGENE)
                        return KVM_ARM_TARGET_XGENE_POTENZA;
        }

        /* Return a default generic target */
        return KVM_ARM_TARGET_GENERIC_V8;
}

/*
 * FPU state read hook. Not supported by this implementation: it touches
 * neither @vcpu nor @fpu and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
        return -EINVAL;
}

/*
 * FPU state write hook. Not supported by this implementation: it touches
 * neither @vcpu nor @fpu and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
        return -EINVAL;
}

/*
 * Address translation hook. Not supported by this implementation: it
 * touches neither @vcpu nor @tr and always returns -EINVAL.
 */
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                  struct kvm_translation *tr)
{
        return -EINVAL;
}

/**
 * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
 * @vcpu: the vCPU pointer
 * @dbg: the ioctl data buffer
 *
 * Userspace passes a control word selecting the debug types to enable,
 * plus architecture specific data in the rest of the structure. Control
 * bits outside KVM_GUESTDBG_VALID_MASK are rejected with -EINVAL. Without
 * KVM_GUESTDBG_ENABLE, guest debugging is switched off; otherwise the
 * control word is recorded and, when hardware debugging is requested, the
 * supplied debug register state is saved. Returns 0 on success.
 */
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg)
{
        trace_kvm_set_guest_debug(vcpu, dbg->control);

        if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
                return -EINVAL;

        if (dbg->control & KVM_GUESTDBG_ENABLE) {
                vcpu->guest_debug = dbg->control;

                /* Hardware assisted Break and Watch points */
                if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW)
                        vcpu->arch.external_debug_state = dbg->arch;
        } else {
                vcpu->guest_debug = 0;
                vcpu_clear_flag(vcpu, HOST_SS_ACTIVE_PENDING);
        }

        return 0;
}

/*
 * Route a vCPU device-attribute write to the handler matching attr->group.
 * The PMUv3 handler is invoked with kvm->arch.config_lock held; the timer
 * and pvtime handlers are called without it. Groups that are not listed
 * are refused with -ENXIO. Otherwise the handler's result is returned.
 */
int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_ARM_VCPU_PMU_V3_CTRL: {
                int err;

                mutex_lock(&vcpu->kvm->arch.config_lock);
                err = kvm_arm_pmu_v3_set_attr(vcpu, attr);
                mutex_unlock(&vcpu->kvm->arch.config_lock);
                return err;
        }
        case KVM_ARM_VCPU_TIMER_CTRL:
                return kvm_arm_timer_set_attr(vcpu, attr);
        case KVM_ARM_VCPU_PVTIME_CTRL:
                return kvm_arm_pvtime_set_attr(vcpu, attr);
        default:
                return -ENXIO;
        }
}

/*
 * Route a vCPU device-attribute read to the handler matching attr->group
 * (PMUv3, timer or pvtime) and hand back that handler's result. Groups
 * that are not listed are refused with -ENXIO.
 */
int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_ARM_VCPU_PMU_V3_CTRL:
                return kvm_arm_pmu_v3_get_attr(vcpu, attr);
        case KVM_ARM_VCPU_TIMER_CTRL:
                return kvm_arm_timer_get_attr(vcpu, attr);
        case KVM_ARM_VCPU_PVTIME_CTRL:
                return kvm_arm_pvtime_get_attr(vcpu, attr);
        default:
                return -ENXIO;
        }
}

/*
 * Route a vCPU device-attribute presence query to the handler matching
 * attr->group (PMUv3, timer or pvtime) and hand back that handler's
 * result. Groups that are not listed are refused with -ENXIO.
 */
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_ARM_VCPU_PMU_V3_CTRL:
                return kvm_arm_pmu_v3_has_attr(vcpu, attr);
        case KVM_ARM_VCPU_TIMER_CTRL:
                return kvm_arm_timer_has_attr(vcpu, attr);
        case KVM_ARM_VCPU_PVTIME_CTRL:
                return kvm_arm_pvtime_has_attr(vcpu, attr);
        default:
                return -ENXIO;
        }
}

/**
 * kvm_vm_ioctl_mte_copy_tags - copy MTE tags between userspace and guest pages
 * @kvm: the VM whose guest memory is accessed
 * @copy_tags: request block: guest_ipa, length, user buffer address, flags
 *             and two reserved words
 *
 * Walks the guest range [guest_ipa, guest_ipa + length) one page at a time
 * under kvm->slots_lock. With KVM_ARM_TAGS_FROM_GUEST set, each page's tags
 * are copied out to the user buffer (zeros are written for pages that carry
 * no tags); without it, tags are read from the user buffer and stored into
 * the guest page, which is then marked as tagged.
 *
 * Return: if at least one page was fully processed, the number of guest
 * bytes handled so far (copy_tags->length on complete success). Otherwise 0
 * for an empty request, or a negative errno: -EINVAL for a VM without MTE,
 * non-zero reserved fields, unknown flags, a misaligned guest_ipa/length or
 * a length above INT_MAX; -EBUSY when writing tags while a memslot has dirty
 * logging enabled; -EFAULT when a page cannot be obtained, is not online
 * memory, or the user buffer access comes up short.
 */
int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
                               struct kvm_arm_copy_mte_tags *copy_tags)
{
        gpa_t guest_ipa = copy_tags->guest_ipa;
        size_t length = copy_tags->length;
        void __user *tags = copy_tags->addr;
        gpa_t gfn;
        /* "write" means the guest page's tags are being written (user -> guest) */
        bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
        int ret = 0;

        if (!kvm_has_mte(kvm))
                return -EINVAL;

        if (copy_tags->reserved[0] || copy_tags->reserved[1])
                return -EINVAL;

        /* KVM_ARM_TAGS_FROM_GUEST is the only flag accepted */
        if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
                return -EINVAL;

        /* Both the start address and the length must be page aligned */
        if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
                return -EINVAL;

        /* Lengths above INT_MAX cannot be represented in the return value */
        if (length > INT_MAX)
                return -EINVAL;

        gfn = gpa_to_gfn(guest_ipa);

        mutex_lock(&kvm->slots_lock);

        /* Refuse to modify guest tags while any memslot is dirty-logging */
        if (write && atomic_read(&kvm->nr_memslots_dirty_logging)) {
                ret = -EBUSY;
                goto out;
        }

        /* One iteration per guest page; "length" counts the bytes still to do */
        while (length > 0) {
                /*
                 * NOTE(review): presumably returns the page with a reference
                 * held, which the kvm_release_page_*() calls below drop --
                 * confirm against __gfn_to_page().
                 */
                struct page *page = __gfn_to_page(kvm, gfn, write);
                void *maddr;
                unsigned long num_tags;
                struct folio *folio;

                if (!page) {
                        ret = -EFAULT;
                        goto out;
                }

                if (!pfn_to_online_page(page_to_pfn(page))) {
                        /* Reject ZONE_DEVICE memory */
                        kvm_release_page_unused(page);
                        ret = -EFAULT;
                        goto out;
                }
                folio = page_folio(page);
                maddr = page_address(page);

                if (!write) {
                        /* Guest -> user: real tags if the page has any, else zeros */
                        if ((folio_test_hugetlb(folio) &&
                             folio_test_hugetlb_mte_tagged(folio)) ||
                             page_mte_tagged(page))
                                num_tags = mte_copy_tags_to_user(tags, maddr,
                                                        MTE_GRANULES_PER_PAGE);
                        else
                                /* No tags in memory, so write zeros */
                                num_tags = MTE_GRANULES_PER_PAGE -
                                        clear_user(tags, MTE_GRANULES_PER_PAGE);
                        kvm_release_page_clean(page);
                } else {
                        /*
                         * Only locking to serialise with a concurrent
                         * __set_ptes() in the VMM but still overriding the
                         * tags, hence ignoring the return value.
                         */
                        if (folio_test_hugetlb(folio))
                                folio_try_hugetlb_mte_tagging(folio);
                        else
                                try_page_mte_tagging(page);
                        num_tags = mte_copy_tags_from_user(maddr, tags,
                                                        MTE_GRANULES_PER_PAGE);

                        /* uaccess failed, don't leave stale tags */
                        if (num_tags != MTE_GRANULES_PER_PAGE)
                                mte_clear_page_tags(maddr);
                        /* Marked as tagged even after a short copy (tags were cleared) */
                        if (folio_test_hugetlb(folio))
                                folio_set_hugetlb_mte_tagged(folio);
                        else
                                set_page_mte_tagged(page);

                        kvm_release_page_dirty(page);
                }

                /* A short transfer ends the walk; this page is not counted */
                if (num_tags != MTE_GRANULES_PER_PAGE) {
                        ret = -EFAULT;
                        goto out;
                }

                gfn++;
                /* The user buffer advances by the number of tags just transferred */
                tags += num_tags;
                length -= PAGE_SIZE;
        }

out:
        mutex_unlock(&kvm->slots_lock);
        /* If some data has been copied report the number of bytes copied */
        if (length != copy_tags->length)
                return copy_tags->length - length;
        return ret;
}


















































































































































































































  324 




  170 



  170 





















































































  333 











    2 





































































  330 




    8 
























































































  551 




















   71 





   71 
   71 






























   63 






  552 












  320 













































  353 










































    3 
















  905 

  898 



















   71 










































  354 

  121 














   63 























  903 

  898 












  317 





  900 
   54 





  900 





  358 












  358 













  572 










  905 




  346 










  138 





  860 

















  333 



















    7 






























  860 








































    7 




    2 



















   71 




    7 










  334 







  333 
    3 

  335 
   71 

  335 






























  552 




















  550 
  550 









  551 


































































  333 






   12 




   71 





  334 





















    3 







    2 










    3 









  336 







  337 



  334 







  337 
















  336 








   71 




































  318 

    3 













  336 















  146 
















  902 
  357 







  905 






  901 








  326 





  318 































   34 



  860 

  652 























  335 
  335 







  335 

  335 












  326 










  327 






















   60 
  325 






  327 




  327 








  327 

  296 


  326 















































  327 







  327 




























    4 




    4 









    4 

























  348 




   70 


   98 





  320 


  321 
    4 










   71 


   71 




    3 

   71 

   71 
   71 















   71 






  318 
















  318 
  318 
  318 









   71 










   71 

    3 
   71 
   71 

   71 


















    9 




    9 



































    1 
   59 




    2 

   27 

















































   49 







   49 

   23 
    1 
















   71 









   71 
   70 

   12 

   70 



   71 




   71 



   69 




   71 

   71 
   71 














   67 
  333 












   71 



   71 








   71 



   71 
   71 




   71 



   71 










   71 



   71 












   70 






















   71 











   70 



    7 

   42 


   47 
    5 









   31 




   50 
   24 
    3 

   23 
    2 



   18 
    9 




    4 



   65 















    3 


   56 


   14 















    3 




   63 
   63 


   63 











































  358 
  357 
  111 


  157 



  357 









    2 
































    2 








    2 






    2 





























   34 







   11 
   31 


   34 





   34 

















   71 






















   34 




   29 


   24 
   24 


   23 
    1 
















   33 



    3 
   71 


















    8 



    9 




















































   34 


    9 





    9 


    9 


    9 















   71 






   71 

    2 




























   71 






   71 


   71 



   71 







   71 


   71 








    3 










   71 


   71 
   71 


   71 

    7 







   71 


   71 


   71 
   71 


   71 

   71 
   71 

   71 











   71 

   71 




   70 






















   24 
   28 





   34 
    1 





   29 
   24 













   29 


    8 












   34 
































  874 

















  876 
  874 


  855 






  629 









  875 
  876 


  872 

  874 









    4 





















   34 



















   34 

    2 














    9 


   34 

   34 

   34 










   34 



   34 

   34 
   34 
   34 


   34 


    9 






   28 









   28 




   28 


   22 
   19 

   28 


   33 





















































    9 














































































































































   63 



   63 









   63 


















   63 





    3 



   63 


   59 
    7 


   63 

   63 



    3 
   63 















   63 



   63 

   63 


   63 























   63 







   63 

   61 


   63 









   39 





    9 



   46 















   51 

   46 
    9 


















































   38 

   38 




   63 
   63 








    9 


   46 


   38 































   63 
    9 

   63 








  137 
  138 






  138 







    9 
  129 










  136 























  138 



  318 






















  290 


   31 

  328 



   23 




   27 








  357 

  358 
  356 




   84 



















  358 


  358 




   61 






   31 




   31 



   31 

   31 













   21 
    2 

   19 
    2 
   23 

    4 

    1 






   12 

   11 













  258 































  167 





  166 

  167 
  167 


  165 





























































   31 


























   30 















   31 









    8 
   23 






   31 









   31 
































  286 
  235 
   70 



  319 

   69 




  331 
  331 





  266 
  266 




    1 
  332 






   15 










   65 
  326 


  331 

  319 




   69 


















    9 
    9 




    3 



    6 






















    9 










   59 



   74 





    7 






    6 


  126 



    6 







  196 








































    3 
   52 












   51 





    1 






   52 




















   65 







  352 



   65 







  327 



  200 
   63 


   52 


    9 


  332 


   31 



   65 










  327 





















   23 


  355 


   85 






















  281 







  136 










   63 


    9 





















  355 



  326 


  358 



  196 
  357 
  132 

  358 


  353 





   70 




   63 


   52 


  330 













  341 



  342 

  202 
  327 


  146 





































    9 







   44 





























   33 









   33 





   33 



   33 
   33 





   33 






































  334 








  334 




































   38 
   38 
   38 



   36 
    9 

   38 



   38 






  334 













  342 











  343 



   46 
  336 




  337 







  342 















   38 



  344 
  343 




  343 


   41 
   65 


























  552 









  335 















  551 
    3 





  551 

  552 



  551 













  551 





  551 
  550 



  551 







  333 













  643 











  642 



  641 

  551 






  552 
   81 





  643 


  551 
  642 




















  552 



  643 
  644 



  642 




  550 
  551 


























































































































   33 



   33 
   33 
   33 



   33 




   33 





   33 
































  219 





  258 
  642 





  257 




  257 









































































   33 


















































   33 












   33 












   33 







   33 
   33 
   33 


   33 














































































    3 





    3 
    3 























































































    3 










    3 
    3 
    3 











    3 




  129 

  130 

















    3 


    3 























  130 


    7 


















  130 






































































  317 







  317 










  318 













  307 







  307 
    8 

  307 
   89 





  307 













  334 




  333 

  242 
  319 


  325 








  326 













  357 









  358 













  353 
  117 


  117 













































































  334 



  317 



  156 




























































  317 



  317 


  128 















   51 






   51 




























  344 





  343 


























   13 
   13 





































  342 





  344 
















  133 





  133 

























































  552 















































  552 

  290 


  552 
  551 




    1 











  257 


















  551 


  552 


































































































































































































































  339 
































  165 




  167 












  167 
  167 


  167 





































































   11 











   11 
    9 



    9 
    5 
























































































   33 








   33 






























































































































































































































































































































































  338 
  337 


  277 
  130 












   24 























  595 











  594 


  594 


  597 





   34 

  159 






  595 
   34 


  593 















































































































  335 











  333 


  334 











  333 



























































































































































































































  335 
















  335 



  335 
  335 

  335 


  335 

  333 











  334 




  335 
  335 

  335 




  335 





  335 
  335 







  335 


  335 

  335 
  335 















  334 





  333 
  335 



  335 
  335 

  334 


  335 
  335 










  335 




  335 



  335 

  333 





  335 






  334 






  335 














  335 


  335 




  333 





  334 





  333 




  334 









  335 





  335 
  335 

  335 





  334 


  334 




















  335 




  335 
  334 



  335 

  335 

  333 


  335 


  335 


  333 









  334 







  334 


  335 
  335 
  335 
  335 





  335 
  335 
  335 

  335 
  335 

  335 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
// SPDX-License-Identifier: GPL-2.0+
/*
 * Maple Tree implementation
 * Copyright (c) 2018-2022 Oracle Corporation
 * Authors: Liam R. Howlett <Liam.Howlett@oracle.com>
 *            Matthew Wilcox <willy@infradead.org>
 * Copyright (c) 2023 ByteDance
 * Author: Peng Zhang <zhangpeng.00@bytedance.com>
 */

/*
 * DOC: Interesting implementation details of the Maple Tree
 *
 * Each node type has a number of slots for entries and a number of slots for
 * pivots.  In the case of dense nodes, the pivots are implied by the position
 * and are simply the slot index + the minimum of the node.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 *
 *
 * The following illustrates the layout of a range64 node's slots and pivots.
 *
 *
 *  Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 |
 *           ┬   ┬   ┬   ┬     ┬    ┬    ┬    ┬    ┬
 *           │   │   │   │     │    │    │    │    └─ Implied maximum
 *           │   │   │   │     │    │    │    └─ Pivot 14
 *           │   │   │   │     │    │    └─ Pivot 13
 *           │   │   │   │     │    └─ Pivot 12
 *           │   │   │   │     └─ Pivot 11
 *           │   │   │   └─ Pivot 2
 *           │   │   └─ Pivot 1
 *           │   └─ Pivot 0
 *           └─  Implied minimum
 *
 * Slot contents:
 *  Internal (non-leaf) nodes contain pointers to other nodes.
 *  Leaf nodes contain entries.
 *
 * The location of interest is often referred to as an offset.  All offsets have
 * a slot, but the last offset has an implied pivot from the node above (or
 * ULONG_MAX for the root node).
 *
 * Ranges complicate certain write activities.  When modifying any of
 * the B-tree variants, it is known that one entry will either be added or
 * deleted.  When modifying the Maple Tree, one store operation may overwrite
 * the entire data set, or one half of the tree, or the middle half of the tree.
 *
 */


#include <linux/maple_tree.h>
#include <linux/xarray.h>
#include <linux/types.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/limits.h>
#include <asm/barrier.h>

#define CREATE_TRACE_POINTS
#include <trace/events/maple_tree.h>

/*
 * Kernel pointer hashing renders much of the maple tree dump useless as tagged
 * pointers get hashed to arbitrary values.
 *
 * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is
 * permissible to bypass this. Otherwise remain cautious and retain the hashing.
 *
 * Userland doesn't know about %px so also use %p there.
 */
#if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Kernel build with maple tree VM debugging: print raw, unhashed pointers. */
#define PTR_FMT "%px"
#else
/* Every other build (including userland): fall back to plain %p. */
#define PTR_FMT "%p"
#endif

/* Tag bit OR'd into the tree pointer by mas_tree_parent(). */
#define MA_ROOT_PARENT 1

/*
 * Maple state flags
 * * MA_STATE_BULK      - Bulk insert mode
 * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert
 * * MA_STATE_PREALLOC  - Preallocated nodes, WARN_ON allocation
 */
#define MA_STATE_BULK                1
#define MA_STATE_REBALANCE        2
#define MA_STATE_PREALLOC        4

/* Plain pointer casts between the different views of a node pointer. */
#define ma_parent_ptr(x) ((struct maple_pnode *)(x))
/*
 * Build the tagged parent value from the tree a maple state points at.  The
 * argument is parenthesized so that any pointer expression (e.g. &state or
 * ptr + 0) expands correctly, not just a bare identifier.
 */
#define mas_tree_parent(x) ((unsigned long)((x)->tree) | MA_ROOT_PARENT)
#define ma_mnode_ptr(x) ((struct maple_node *)(x))
#define ma_enode_ptr(x) ((struct maple_enode *)(x))
/* Slab cache backing the node allocation and free helpers in this file. */
static struct kmem_cache *maple_node_cache;

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Per-node-type maximum, indexed by enum maple_type.  Only built when
 * CONFIG_DEBUG_MAPLE_TREE is set.
 * NOTE(review): presumably the largest index range a node of that type can
 * cover - confirm against the users of mt_node_max().
 */
static const unsigned long mt_max[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = ULONG_MAX,
        [maple_range_64]        = ULONG_MAX,
        [maple_arange_64]        = ULONG_MAX,
};
/* Look up mt_max[] by the type encoded in the maple encoded node @x. */
#define mt_node_max(x) mt_max[mte_node_type(x)]
#endif

/* Number of slots in a node of each type, indexed by enum maple_type. */
static const unsigned char mt_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS,
};
/* Slot count for the node type encoded in the maple encoded node @x. */
#define mt_slot_count(x) mt_slots[mte_node_type(x)]

/*
 * Number of stored pivots in a node of each type, indexed by enum maple_type.
 * Dense nodes store none (their pivots are implied by slot position); the
 * other types store one pivot fewer than they have slots.
 */
static const unsigned char mt_pivots[] = {
        [maple_dense]                = 0,
        [maple_leaf_64]                = MAPLE_RANGE64_SLOTS - 1,
        [maple_range_64]        = MAPLE_RANGE64_SLOTS - 1,
        [maple_arange_64]        = MAPLE_ARANGE64_SLOTS - 1,
};
/* Pivot count for the node type encoded in the maple encoded node @x. */
#define mt_pivot_count(x) mt_pivots[mte_node_type(x)]

/*
 * Minimum slot count for each node type, indexed by enum maple_type.
 * NOTE(review): presumably the occupancy below which a node is considered
 * deficient and must be rebalanced - confirm against the users of
 * mt_min_slot_count().
 */
static const unsigned char mt_min_slots[] = {
        [maple_dense]                = MAPLE_NODE_SLOTS / 2,
        [maple_leaf_64]                = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_range_64]        = (MAPLE_RANGE64_SLOTS / 2) - 2,
        [maple_arange_64]        = (MAPLE_ARANGE64_SLOTS / 2) - 1,
};
/* Minimum slot count for the node type encoded in the maple encoded node @x. */
#define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)]

/* Big node capacity: two range64 nodes' worth of slots plus two extra. */
#define MAPLE_BIG_NODE_SLOTS        (MAPLE_RANGE64_SLOTS * 2 + 2)
/* Big node gap capacity: two arange64 nodes' worth of slots plus one extra. */
#define MAPLE_BIG_NODE_GAPS        (MAPLE_ARANGE64_SLOTS * 2 + 1)

/*
 * struct maple_big_node - An oversized, non-tree node image.
 *
 * NOTE(review): presumably a scratch buffer used to assemble the contents of
 * more than one node before splitting them back into real nodes - confirm
 * against the write paths that use it.
 *
 * The slot array shares storage with a padding/gap pair: gap[] starts
 * MAPLE_BIG_NODE_GAPS words into the union, after the padding that overlays
 * the leading slot[] entries.
 */
struct maple_big_node {
        /* One pivot fewer than slots; the last slot's pivot is implied */
        unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1];
        union {
                struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS];
                struct {
                        unsigned long padding[MAPLE_BIG_NODE_GAPS];
                        unsigned long gap[MAPLE_BIG_NODE_GAPS];
                };
        };
        /* NOTE(review): looks like the end offset of the used data - confirm */
        unsigned char b_end;
        /* Node type the data in this big node belongs to */
        enum maple_type type;
};

/*
 * The maple_subtree_state is used to build a tree to replace a segment of an
 * existing tree in a more atomic way.  Any walkers of the older tree will hit a
 * dead node and restart on updates.
 *
 * The orig_* states refer to the existing tree; l, m and r refer to the
 * replacement subtree being built.
 */
struct maple_subtree_state {
        struct ma_state *orig_l;        /* Original left side of subtree */
        struct ma_state *orig_r;        /* Original right side of subtree */
        struct ma_state *l;                /* New left side of subtree */
        struct ma_state *m;                /* New middle of subtree (rare) */
        struct ma_state *r;                /* New right side of subtree */
        struct ma_topiary *free;        /* nodes to be freed */
        struct ma_topiary *destroy;        /* Nodes to be destroyed (walked and freed) */
        struct maple_big_node *bn;        /* Big node used with this state; NOTE(review): confirm exact role */
};

/*
 * noinline_for_kasan: function annotation that expands to noinline_for_stack
 * when CONFIG_KASAN_STACK is enabled and to plain inline otherwise.
 */
#ifdef CONFIG_KASAN_STACK
/* Prevent mas_wr_bnode() from exceeding the stack frame limit */
#define noinline_for_kasan noinline_for_stack
#else
#define noinline_for_kasan inline
#endif

/* Functions */
/*
 * mt_alloc_one() - Allocate a single maple node from the node cache.
 * @gfp: The allocation flags passed through to kmem_cache_alloc()
 *
 * Return: whatever kmem_cache_alloc() returned for the maple node cache.
 */
static inline struct maple_node *mt_alloc_one(gfp_t gfp)
{
        struct maple_node *node = kmem_cache_alloc(maple_node_cache, gfp);

        return node;
}

/*
 * mt_alloc_bulk() - Allocate several maple nodes in one call.
 * @gfp: The allocation flags passed through to the slab allocator
 * @size: The number of nodes requested
 * @nodes: Array that receives the allocated node pointers
 *
 * Return: the result of kmem_cache_alloc_bulk() on the maple node cache
 * (presumably the number of nodes allocated - confirm against the slab API).
 */
static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes)
{
        int allocated;

        allocated = kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes);
        return allocated;
}

/*
 * mt_free_one() - Return a single maple node to the node cache.
 * @node: The node to free
 *
 * The node is handed straight to kmem_cache_free(); this helper does not go
 * through call_rcu().
 */
static inline void mt_free_one(struct maple_node *node)
{
        kmem_cache_free(maple_node_cache, node);
}

/*
 * mt_free_bulk() - Return an array of maple nodes to the node cache.
 * @size: The number of entries in @nodes
 * @nodes: The array of node pointers to free
 */
static inline void mt_free_bulk(size_t size, void __rcu **nodes)
{
        void **objs = (void **)nodes;

        kmem_cache_free_bulk(maple_node_cache, size, objs);
}

/*
 * mt_free_rcu() - RCU callback that frees the maple node embedding @head.
 * @head: The rcu_head stored in the node's rcu member
 */
static void mt_free_rcu(struct rcu_head *head)
{
        mt_free_one(container_of(head, struct maple_node, rcu));
}

/*
 * ma_free_rcu() - Queue a maple node to be freed by an RCU callback.
 * @node: The node to free
 *
 * A node that is no longer in use has its parent pointer set to the node
 * itself; warn if @node has not been marked that way before it is queued.
 */
static void ma_free_rcu(struct maple_node *node)
{
        struct maple_pnode *self = ma_parent_ptr(node);

        WARN_ON(node->parent != self);
        call_rcu(&node->rcu, mt_free_rcu);
}

/*
 * mas_set_height() - Record the maple state depth as the tree height.
 * @mas: The maple state
 *
 * Clears the height bits of the tree flags and replaces them with mas->depth.
 * All other flag bits are left as they were.
 */
static void mas_set_height(struct ma_state *mas)
{
        unsigned int flags = mas->tree->ma_flags & ~MT_FLAGS_HEIGHT_MASK;

        MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX);
        mas->tree->ma_flags = flags | (mas->depth << MT_FLAGS_HEIGHT_OFFSET);
}

/*
 * mas_mt_height() - Get the height of the tree a maple state operates on.
 * @mas: The maple state
 *
 * Return: the result of mt_height() for mas->tree.
 */
static unsigned int mas_mt_height(struct ma_state *mas)
{
        unsigned int height = mt_height(mas->tree);

        return height;
}

/*
 * mt_attr() - Get the tree flags without the height information.
 * @mt: The maple tree
 *
 * Return: mt->ma_flags with the height bits masked out.
 */
static inline unsigned int mt_attr(struct maple_tree *mt)
{
        unsigned int attr = mt->ma_flags;

        attr &= ~MT_FLAGS_HEIGHT_MASK;
        return attr;
}

/*
 * mte_node_type() - Decode the node type from a maple encoded node.
 * @entry: The maple encoded node
 *
 * The type lives in the pointer bits selected by MAPLE_NODE_TYPE_SHIFT and
 * MAPLE_NODE_TYPE_MASK.
 *
 * Return: the decoded maple node type.
 */
static __always_inline enum maple_type mte_node_type(
                const struct maple_enode *entry)
{
        unsigned long bits = (unsigned long)entry;

        return (bits >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK;
}

/*
 * ma_is_dense() - Check whether a node type is a dense type.
 * @type: The maple node type
 *
 * Return: true when @type compares less than maple_leaf_64, false otherwise.
 */
static __always_inline bool ma_is_dense(const enum maple_type type)
{
        if (type >= maple_leaf_64)
                return false;

        return true;
}

/*
 * ma_is_leaf() - Check whether a node type is a leaf type.
 * @type: The maple node type
 *
 * Return: true when @type compares less than maple_range_64, false otherwise.
 */
static __always_inline bool ma_is_leaf(const enum maple_type type)
{
        if (type >= maple_range_64)
                return false;

        return true;
}

/*
 * mte_is_leaf() - Check if an encoded maple node is a leaf.
 * @entry: The encoded maple node
 *
 * Return: ma_is_leaf() of the type encoded in @entry.
 */
static __always_inline bool mte_is_leaf(const struct maple_enode *entry)
{
        return ma_is_leaf(mte_node_type(entry));
}

/*
 * mt_is_reserved() - Check if an entry is a reserved value.
 * @entry: The entry to check
 *
 * We also reserve values with the bottom two bits set to '10' which are
 * below 4096
 *
 * Return: true if @entry is below MAPLE_RESERVED_RANGE and xa_is_internal()
 * accepts it, false otherwise.
 */
static __always_inline bool mt_is_reserved(const void *entry)
{
        if ((unsigned long)entry >= MAPLE_RESERVED_RANGE)
                return false;

        return xa_is_internal(entry);
}

/*
 * mas_set_err() - Put the maple state into the error state.
 * @mas: The maple state
 * @err: The error code to record
 *
 * The error is encoded into mas->node with MA_ERROR() and the status is set
 * to ma_error.
 */
static __always_inline void mas_set_err(struct ma_state *mas, long err)
{
        mas->node = MA_ERROR(err);
        mas->status = ma_error;
}

/*
 * mas_is_ptr() - Check if the maple state status is ma_root.
 * @mas: The maple state
 *
 * Return: true if mas->status == ma_root, false otherwise.
 */
static __always_inline bool mas_is_ptr(const struct ma_state *mas)
{
        return mas->status == ma_root;
}

/*
 * mas_is_start() - Check if the maple state status is ma_start.
 * @mas: The maple state
 *
 * Return: true if mas->status == ma_start, false otherwise.
 */
static __always_inline bool mas_is_start(const struct ma_state *mas)
{
        return mas->status == ma_start;
}

/*
 * mas_is_none() - Check if the maple state status is ma_none.
 * @mas: The maple state
 *
 * Return: true if mas->status == ma_none, false otherwise.
 */
static __always_inline bool mas_is_none(const struct ma_state *mas)
{
        return mas->status == ma_none;
}

/*
 * mas_is_paused() - Check if the maple state status is ma_pause.
 * @mas: The maple state
 *
 * Return: true if mas->status == ma_pause, false otherwise.
 */
static __always_inline bool mas_is_paused(const struct ma_state *mas)
{
        return mas->status == ma_pause;
}

/*
 * mas_is_overflow() - Check if the maple state status is ma_overflow.
 * @mas: The maple state (only read, hence const like the sibling predicates)
 *
 * Return: true if mas->status == ma_overflow, false otherwise.
 */
static __always_inline bool mas_is_overflow(const struct ma_state *mas)
{
        return mas->status == ma_overflow;
}

static inline bool mas_is_underflow(struct ma_state *mas)
{
        return mas->status == ma_underflow;
}

/*
 * mte_to_node() - Convert a maple encoded node to a maple node.
 * @entry: The maple encoded node
 *
 * Return: @entry with the MAPLE_NODE_MASK bits cleared, as a maple node
 * pointer.
 */
static __always_inline struct maple_node *mte_to_node(
                const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr &= ~MAPLE_NODE_MASK;
        return (struct maple_node *)addr;
}

/*
 * mte_to_mat() - Convert a maple encoded node to a maple topiary node.
 * @entry: The maple encoded node
 *
 * Return: @entry with the MAPLE_NODE_MASK bits cleared, as a maple topiary
 * pointer.
 */
static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr &= ~MAPLE_NODE_MASK;
        return (struct maple_topiary *)addr;
}

/*
 * mas_mn() - Get the maple state node.
 * @mas: The maple state
 *
 * Decodes mas->node with mte_to_node().
 *
 * Return: the maple node (not encoded - bare pointer).
 */
static inline struct maple_node *mas_mn(const struct ma_state *mas)
{
        return mte_to_node(mas->node);
}

/*
 * mte_set_node_dead() - Set a maple encoded node as dead.
 * @mn: The maple encoded node.
 */
static inline void mte_set_node_dead(struct maple_enode *mn)
{
        mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn));
        smp_wmb(); /* Needed for RCU */
}

/* Bit 1 (0x02) indicates the root is a node */
#define MAPLE_ROOT_NODE                        0x02
/* maple_type stored bit 3-6, i.e. shifted left by 3 in an encoded node */
#define MAPLE_ENODE_TYPE_SHIFT                0x03
/* Bit 2 (0x04) means a NULL somewhere below */
#define MAPLE_ENODE_NULL                0x04

/*
 * mt_mk_node() - Encode a maple node pointer together with its type.
 * @node: The maple node
 * @type: The maple node type
 *
 * Return: @node with @type stored at MAPLE_ENODE_TYPE_SHIFT and the
 * MAPLE_ENODE_NULL bit set.
 */
static inline struct maple_enode *mt_mk_node(const struct maple_node *node,
                                             enum maple_type type)
{
        unsigned long encoded = (unsigned long)node;

        encoded |= (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL;
        return (void *)encoded;
}

/*
 * mte_mk_root() - Mark an encoded node as a root node pointer.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ROOT_NODE bit set.
 */
static inline void *mte_mk_root(const struct maple_enode *node)
{
        unsigned long val = (unsigned long)node;

        val |= MAPLE_ROOT_NODE;
        return (void *)val;
}

/*
 * mte_safe_root() - Strip the root marker from an encoded root pointer.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ROOT_NODE bit cleared.
 */
static inline void *mte_safe_root(const struct maple_enode *node)
{
        unsigned long val = (unsigned long)node;

        val &= ~MAPLE_ROOT_NODE;
        return (void *)val;
}

/*
 * mte_set_full() - Mark an encoded node as having no NULL below it.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ENODE_NULL bit cleared.
 */
static inline void __maybe_unused *mte_set_full(const struct maple_enode *node)
{
        unsigned long val = (unsigned long)node;

        val &= ~MAPLE_ENODE_NULL;
        return (void *)val;
}

/*
 * mte_clear_full() - Mark an encoded node as having a NULL somewhere below.
 * @node: The encoded maple node
 *
 * Return: @node with the MAPLE_ENODE_NULL bit set.
 */
static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node)
{
        unsigned long val = (unsigned long)node;

        val |= MAPLE_ENODE_NULL;
        return (void *)val;
}

/*
 * mte_has_null() - Check if the MAPLE_ENODE_NULL bit is set in an encoded node.
 * @node: The encoded maple node
 *
 * Return: true if the bit is set, false otherwise.
 */
static inline bool __maybe_unused mte_has_null(const struct maple_enode *node)
{
        return ((unsigned long)node & MAPLE_ENODE_NULL) != 0;
}

/*
 * ma_is_root() - Check if a maple node is the root node.
 * @node: The maple node
 *
 * Return: true if the MA_ROOT_PARENT bit is set in node->parent, false
 * otherwise.
 */
static __always_inline bool ma_is_root(struct maple_node *node)
{
        unsigned long parent = (unsigned long)node->parent;

        return (parent & MA_ROOT_PARENT) != 0;
}

/*
 * mte_is_root() - Check if an encoded maple node is the root node.
 * @node: The encoded maple node
 *
 * Return: ma_is_root() of the decoded node.
 */
static __always_inline bool mte_is_root(const struct maple_enode *node)
{
        return ma_is_root(mte_to_node(node));
}

static inline bool mas_is_root_limits(const struct ma_state *mas)
{
        return !mas->min && mas->max == ULONG_MAX;
}

/*
 * mt_is_alloc() - Check if the tree has the MT_FLAGS_ALLOC_RANGE flag set.
 * @mt: The maple tree
 *
 * Return: true if the flag is set in mt->ma_flags, false otherwise.
 */
static __always_inline bool mt_is_alloc(struct maple_tree *mt)
{
        return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE) != 0;
}

/*
 * The Parent Pointer
 * Excluding root, the parent pointer is 256B aligned like all other tree nodes.
 * When storing 32 or 64 bit values, the offset can fit into 5 bits.  The 16
 * bit values need an extra bit to store the offset.  This extra bit comes from
 * a reuse of the last bit in the node type.  This is possible by using bit 1 to
 * indicate if bit 2 is part of the type or the slot.
 *
 * Node types (low bits of the parent pointer, in binary):
 *  0b??1 = Root
 *  0b?00 = 16 bit nodes
 *  0b010 = 32 bit nodes
 *  0b110 = 64 bit nodes
 *
 * Slot size and alignment
 *  0b??1 : Root
 *  0b?00 : 16 bit values, type in 0-1, slot in 2-7
 *  0b010 : 32 bit values, type in 0-2, slot in 3-7
 *  0b110 : 64 bit values, type in 0-2, slot in 3-7
 */

/* Bit 0 set in the parent pointer: the node is the root */
#define MAPLE_PARENT_ROOT                0x01

/* Slot location in the parent pointer when the parent is not a 16 bit node */
#define MAPLE_PARENT_SLOT_SHIFT                0x03
#define MAPLE_PARENT_SLOT_MASK                0xF8

/* Slot location in the parent pointer when the parent is a 16 bit node */
#define MAPLE_PARENT_16B_SLOT_SHIFT        0x02
#define MAPLE_PARENT_16B_SLOT_MASK        0xFC

/*
 * Parent type values kept in the low bits of the parent pointer.
 * NOTE(review): MAPLE_PARENT_RANGE32 is 0b100 here while the parent pointer
 * description lists 0b010 for 32 bit nodes - confirm which one is intended.
 */
#define MAPLE_PARENT_RANGE64                0x06
#define MAPLE_PARENT_RANGE32                0x04
/* Bit 1: clear means the parent is a 16 bit node */
#define MAPLE_PARENT_NOT_RANGE16        0x02

/*
 * mte_parent_shift() - Get the parent shift for the slot storage.
 * @parent: The parent pointer cast as an unsigned long
 *
 * Return: The shift into that pointer to the start of the slot.
 */
static inline unsigned long mte_parent_shift(unsigned long parent)
{
        /* Note bit 1 == 0 means 16B */
        return likely(parent & MAPLE_PARENT_NOT_RANGE16) ?
                MAPLE_PARENT_SLOT_SHIFT : MAPLE_PARENT_16B_SLOT_SHIFT;
}

/*
 * mte_parent_slot_mask() - Get the slot mask for the parent.
 * @parent: The parent pointer cast as an unsigned long.
 *
 * Return: MAPLE_PARENT_SLOT_MASK when bit 1 of @parent is set,
 * MAPLE_PARENT_16B_SLOT_MASK otherwise.
 */
static inline unsigned long mte_parent_slot_mask(unsigned long parent)
{
        /* Note bit 1 == 0 means 16B */
        return likely(parent & MAPLE_PARENT_NOT_RANGE16) ?
                MAPLE_PARENT_SLOT_MASK : MAPLE_PARENT_16B_SLOT_MASK;
}

/*
 * mas_parent_type() - Return the maple_type of the parent from the stored
 * parent type.
 * @mas: The maple state
 * @enode: The maple_enode to extract the parent's enum
 *
 * Warns and returns 0 if @enode is marked as root.  Only the
 * MAPLE_PARENT_RANGE64 encoding is recognised; it maps to maple_arange_64 for
 * an allocation tree and to maple_range_64 otherwise.  Any other encoding
 * yields 0.
 *
 * Return: The node->parent maple_type
 */
static inline
enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode)
{
        unsigned long bits = (unsigned long)mte_to_node(enode)->parent;

        if (WARN_ON(bits & MAPLE_PARENT_ROOT))
                return 0;

        /* Keep only the metadata bits, then strip the slot out of them. */
        bits &= MAPLE_NODE_MASK;
        bits &= ~mte_parent_slot_mask(bits);

        /* MAPLE_PARENT_RANGE64 also stands for MAPLE_PARENT_ARANGE64 */
        if (bits != MAPLE_PARENT_RANGE64)
                return 0;

        if (mt_is_alloc(mas->tree))
                return maple_arange_64;

        return maple_range_64;
}

/*
 * mas_set_parent() - Set the parent node and encode the slot
 * @mas: The maple state
 * @enode: The encoded maple node.
 * @parent: The encoded maple node that is the parent of @enode.
 * @slot: The slot that @enode resides in @parent.
 *
 * The parent pointer stored in @enode is @parent with its node metadata
 * cleared and replaced by the slot and the parent type.  For maple_range_64
 * and maple_arange_64 parents the slot is stored starting at
 * MAPLE_PARENT_SLOT_SHIFT and the type is MAPLE_PARENT_RANGE64; for any other
 * parent type both the shift and the type are 0.  A maple_dense or
 * maple_leaf_64 parent trips MAS_BUG_ON().
 */
static inline
void mas_set_parent(struct ma_state *mas, struct maple_enode *enode,
                    const struct maple_enode *parent, unsigned char slot)
{
        enum maple_type p_type = mte_node_type(parent);
        unsigned long p_val = (unsigned long)parent;
        unsigned long shift = 0;
        unsigned long type = 0;

        MAS_BUG_ON(mas, p_type == maple_dense);
        MAS_BUG_ON(mas, p_type == maple_leaf_64);

        if (p_type == maple_range_64 || p_type == maple_arange_64) {
                shift = MAPLE_PARENT_SLOT_SHIFT;
                type = MAPLE_PARENT_RANGE64;
        }

        p_val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */
        p_val |= (slot << shift) | type;
        mte_to_node(enode)->parent = ma_parent_ptr(p_val);
}

/*
 * mte_parent_slot() - get the parent slot of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The slot in the parent node where @enode resides, or 0 when @enode
 * is marked as root.
 */
static __always_inline
unsigned int mte_parent_slot(const struct maple_enode *enode)
{
        unsigned long p_val = (unsigned long)mte_to_node(enode)->parent;
        unsigned long slot_bits;

        if (unlikely(p_val & MA_ROOT_PARENT))
                return 0;

        /*
         * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost
         * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT
         */
        slot_bits = p_val & MAPLE_PARENT_16B_SLOT_MASK;
        return slot_bits >> mte_parent_shift(p_val);
}

/*
 * mte_parent() - Get the parent of @enode.
 * @enode: The encoded maple node.
 *
 * Return: The parent maple node, i.e. the stored parent pointer with the
 * MAPLE_NODE_MASK bits cleared.
 */
static __always_inline
struct maple_node *mte_parent(const struct maple_enode *enode)
{
        unsigned long p_val = (unsigned long)mte_to_node(enode)->parent;

        p_val &= ~MAPLE_NODE_MASK;
        return (void *)p_val;
}

/*
 * ma_dead_node() - check if the @node is dead.
 * @node: The maple node (bare pointer, not encoded)
 *
 * A node is considered dead when its parent pointer, with the
 * MAPLE_NODE_MASK bits cleared, points back at the node itself.
 *
 * Return: true if dead, false otherwise.
 */
static __always_inline bool ma_dead_node(const struct maple_node *node)
{
        struct maple_node *parent;

        /* Do not reorder reads from the node prior to the parent check */
        smp_rmb();
        parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
        return (parent == node);
}

/*
 * mte_dead_node() - check if the @enode is dead.
 * @enode: The encoded maple node
 *
 * Return: true if the decoded node is dead according to ma_dead_node(), false
 * otherwise.
 */
static __always_inline bool mte_dead_node(const struct maple_enode *enode)
{
        return ma_dead_node(mte_to_node(enode));
}

/*
 * mas_allocated() - Get the number of nodes allocated in a maple state.
 * @mas: The maple state
 *
 * The ma_state alloc member is overloaded to hold a pointer to the first
 * allocated node or to the number of requested nodes to allocate.  If bit 0 is
 * set, then the alloc contains the number of requested nodes.  If there is an
 * allocated node, then the total allocated nodes is in that node.
 *
 * Return: The total number of nodes allocated
 */
static inline unsigned long mas_allocated(const struct ma_state *mas)
{
        /* Nothing allocated at all. */
        if (!mas->alloc)
                return 0;

        /* Bit 0 set: alloc holds an encoded request, not a node. */
        if ((unsigned long)mas->alloc & 0x1)
                return 0;

        return mas->alloc->total;
}

/*
 * mas_set_alloc_req() - Set the requested number of allocations.
 * @mas: the maple state
 * @count: the number of allocations.
 *
 * The requested number of allocations is either in the first allocated node,
 * located in @mas->alloc->request_count, or directly in @mas->alloc if there is
 * no allocated node.  Set the request either in the node or do the necessary
 * encoding to store in @mas->alloc directly.
 */
static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count)
{
        if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) {
                if (!count)
                        mas->alloc = NULL;
                else
                        mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U);
                return;
        }

        mas->alloc->request_count = count;
}

/*
 * mas_alloc_req() - get the requested number of allocations.
 * @mas: The maple state
 *
 * The alloc count is either stored directly in @mas, or in
 * @mas->alloc->request_count if there is at least one node allocated.  Decode
 * the request count if it's stored directly in @mas->alloc.
 *
 * Return: The allocation request count.
 */
static inline unsigned int mas_alloc_req(const struct ma_state *mas)
{
        unsigned long raw = (unsigned long)mas->alloc;

        /* Encoded request: the count lives above bit 0. */
        if (raw & 0x1)
                return raw >> 1;

        if (!raw)
                return 0;

        return mas->alloc->request_count;
}

/*
 * ma_pivots() - Get a pointer to the maple node pivots.
 * @node: the maple node
 * @type: the node type
 *
 * In the event of a dead node, this array may be %NULL
 *
 * Return: A pointer to the maple node pivots, or NULL for maple_dense and any
 * other type without a pivot array.
 */
static inline unsigned long *ma_pivots(struct maple_node *node,
                                           enum maple_type type)
{
        if (type == maple_arange_64)
                return node->ma64.pivot;

        if (type == maple_range_64 || type == maple_leaf_64)
                return node->mr64.pivot;

        return NULL;
}

/*
 * ma_gaps() - Get a pointer to the maple node gaps.
 * @node: the maple node
 * @type: the node type
 *
 * Return: A pointer to the maple node gaps for maple_arange_64, NULL for every
 * other type.
 */
static inline unsigned long *ma_gaps(struct maple_node *node,
                                     enum maple_type type)
{
        if (type == maple_arange_64)
                return node->ma64.gap;

        return NULL;
}

/*
 * mas_safe_pivot() - get the pivot at @piv or mas->max.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @piv: The pivot to fetch
 * @type: The maple node type
 *
 * Return: The pivot at @piv within the limit of the @pivots array, @mas->max
 * otherwise.
 */
static __always_inline unsigned long
mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots,
               unsigned char piv, enum maple_type type)
{
        if (piv < mt_pivots[type])
                return pivots[piv];

        /* Past the last pivot of this node type: bounded by the node max. */
        return mas->max;
}

/*
 * mas_safe_min() - Return the minimum for a given offset.
 * @mas: The maple state
 * @pivots: The pointer to the maple node pivots
 * @offset: The offset into the pivot array
 *
 * Return: The minimum range value that is contained in @offset: one past the
 * previous pivot, or @mas->min for offset 0.
 */
static inline unsigned long
mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset)
{
        if (unlikely(!offset))
                return mas->min;

        return pivots[offset - 1] + 1;
}

/*
 * mte_set_pivot() - Set a pivot to a value in an encoded maple node.
 * @mn: The encoded maple node
 * @piv: The pivot offset
 * @val: The value of the pivot
 *
 * A @piv outside the pivot count of the node type trips BUG_ON().  Node types
 * without a pivot array (maple_dense) are left untouched.
 */
static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv,
                                unsigned long val)
{
        enum maple_type type = mte_node_type(mn);
        struct maple_node *node = mte_to_node(mn);

        BUG_ON(piv >= mt_pivots[type]);

        if (type == maple_arange_64)
                node->ma64.pivot[piv] = val;
        else if (type == maple_range_64 || type == maple_leaf_64)
                node->mr64.pivot[piv] = val;
}

/*
 * ma_slots() - Get a pointer to the maple node slots.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: A pointer to the maple node slots, or NULL for an unhandled type.
 */
static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt)
{
        if (mt == maple_arange_64)
                return mn->ma64.slot;

        if (mt == maple_range_64 || mt == maple_leaf_64)
                return mn->mr64.slot;

        if (mt == maple_dense)
                return mn->slot;

        return NULL;
}

/*
 * mt_write_locked() - Check if the tree is locked for writing.
 * @mt: The maple tree
 *
 * Return: mt_write_lock_is_held() when the tree uses an external lock,
 * lockdep_is_held() on the internal ma_lock otherwise.
 */
static inline bool mt_write_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_write_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_locked() - Check if the tree lock is held.
 * @mt: The maple tree
 *
 * Return: mt_lock_is_held() when the tree uses an external lock,
 * lockdep_is_held() on the internal ma_lock otherwise.
 */
static __always_inline bool mt_locked(const struct maple_tree *mt)
{
        if (mt_external_lock(mt))
                return mt_lock_is_held(mt);

        return lockdep_is_held(&mt->ma_lock);
}

/*
 * mt_slot() - Get the slot value, allowing either RCU or the tree lock.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Uses rcu_dereference_check() with mt_locked() as the lock condition.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mt_slot(const struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        return rcu_dereference_check(slots[offset], mt_locked(mt));
}

/*
 * mt_slot_locked() - Get the slot value when holding the tree write lock.
 * @mt: The maple tree
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Uses rcu_dereference_protected() with mt_write_locked() as the condition.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mt_slot_locked(struct maple_tree *mt,
                void __rcu **slots, unsigned char offset)
{
        return rcu_dereference_protected(slots[offset], mt_write_locked(mt));
}
/*
 * mas_slot_locked() - Get the slot value when holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Wrapper around mt_slot_locked() for the tree in @mas.
 *
 * Return: The entry stored in @slots at the @offset.
 */
static __always_inline void *mas_slot_locked(struct ma_state *mas,
                void __rcu **slots, unsigned char offset)
{
        return mt_slot_locked(mas->tree, slots, offset);
}

/*
 * mas_slot() - Get the slot value when not holding the maple tree lock.
 * @mas: The maple state
 * @slots: The pointer to the slots
 * @offset: The offset into the slots array to fetch
 *
 * Wrapper around mt_slot() for the tree in @mas, which dereferences with
 * rcu_dereference_check().
 *
 * Return: The entry stored in @slots at the @offset
 */
static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots,
                unsigned char offset)
{
        return mt_slot(mas->tree, slots, offset);
}

/*
 * mas_root() - Get the maple tree root.
 * @mas: The maple state.
 *
 * Dereferences ma_root with rcu_dereference_check(), using mt_locked() as the
 * lock condition.
 *
 * Return: The pointer to the root of the tree
 */
static __always_inline void *mas_root(struct ma_state *mas)
{
        return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree));
}

/*
 * mt_root_locked() - Get the maple tree root when holding the write lock.
 * @mt: The maple tree
 *
 * Dereferences ma_root with rcu_dereference_protected(), using
 * mt_write_locked() as the condition.
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mt_root_locked(struct maple_tree *mt)
{
        return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt));
}

/*
 * mas_root_locked() - Get the maple tree root when holding the maple tree lock.
 * @mas: The maple state.
 *
 * Wrapper around mt_root_locked() for the tree in @mas.
 *
 * Return: The pointer to the root of the tree
 */
static inline void *mas_root_locked(struct ma_state *mas)
{
        return mt_root_locked(mas->tree);
}

/*
 * ma_meta() - Get a pointer to the metadata of a maple node.
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The ma64 metadata for maple_arange_64, the mr64 metadata for every
 * other type.
 */
static inline struct maple_metadata *ma_meta(struct maple_node *mn,
                                             enum maple_type mt)
{
        if (mt == maple_arange_64)
                return &mn->ma64.meta;

        return &mn->mr64.meta;
}

/*
 * ma_set_meta() - Set the metadata information of a node.
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The offset of the highest sub-gap in this node.
 * @end: The end of the data in this node.
 *
 * Writes both fields of the metadata selected by ma_meta() for @mt.
 */
static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
                               unsigned char offset, unsigned char end)
{
        struct maple_metadata *meta = ma_meta(mn, mt);

        meta->gap = offset;
        meta->end = end;
}

/*
 * mt_clear_meta() - clear the metadata information of a node, if it exists
 * @mt: The maple tree
 * @mn: The maple node
 * @type: The maple node type
 *
 * Zeroes the gap and end metadata of maple_range_64 and maple_arange_64 nodes.
 * For maple_range_64, the metadata is left alone when the last pivot is set
 * and the last slot looks like an encoded node.  Other node types are not
 * touched.
 */
static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
                                  enum maple_type type)
{
        struct maple_metadata *meta;
        unsigned long *pivots;
        void __rcu **slots;
        void *next;

        switch (type) {
        case maple_range_64:
                pivots = mn->mr64.pivot;
                /* A non-zero last pivot means the last slot may be in use. */
                if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
                        slots = mn->mr64.slot;
                        next = mt_slot_locked(mt, slots,
                                              MAPLE_RANGE64_SLOTS - 1);
                        if (unlikely((mte_to_node(next) &&
                                      mte_node_type(next))))
                                return; /* no metadata, could be node */
                }
                fallthrough;
        case maple_arange_64:
                meta = ma_meta(mn, type);
                break;
        default:
                /* No metadata handled for any other node type. */
                return;
        }

        meta->gap = 0;
        meta->end = 0;
}

/*
 * ma_meta_end() - Get the data end of a node from the metadata
 * @mn: The maple node
 * @mt: The maple node type
 *
 * Return: The end field of the metadata selected by ma_meta() for @mt.
 */
static inline unsigned char ma_meta_end(struct maple_node *mn,
                                        enum maple_type mt)
{
        return ma_meta(mn, mt)->end;
}

/*
 * ma_meta_gap() - Get the largest gap location of a node from the metadata
 * @mn: The maple node
 *
 * Always reads the ma64 (maple_arange_64) metadata layout; no node type is
 * taken.
 *
 * Return: The gap field of the ma64 metadata.
 */
static inline unsigned char ma_meta_gap(struct maple_node *mn)
{
        return mn->ma64.meta.gap;
}

/*
 * ma_set_meta_gap() - Set the largest gap location in a nodes metadata
 * @mn: The maple node
 * @mt: The maple node type
 * @offset: The location of the largest gap.
 *
 * Only the gap field is written; the end field is left as it is.
 */
static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt,
                                   unsigned char offset)
{
        ma_meta(mn, mt)->gap = offset;
}

/*
 * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes.
 * @mat: the ma_topiary, a linked list of dead nodes.
 * @dead_enode: the node to be marked as dead and added to the tail of the list
 *
 * Marks @dead_enode dead, terminates its next link and appends it to the
 * linked list in @mat.  An empty list gets @dead_enode as both head and tail.
 */
static inline void mat_add(struct ma_topiary *mat,
                           struct maple_enode *dead_enode)
{
        mte_set_node_dead(dead_enode);
        mte_to_mat(dead_enode)->next = NULL;

        if (mat->tail) {
                mte_to_mat(mat->tail)->next = dead_enode;
                mat->tail = dead_enode;
                return;
        }

        /* First entry of the list. */
        mat->head = dead_enode;
        mat->tail = dead_enode;
}

/* Forward declarations; both are used by mas_mat_destroy() below. */
static void mt_free_walk(struct rcu_head *head);
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free);
/*
 * mas_mat_destroy() - Free all nodes and subtrees in a dead list.
 * @mas: the maple state
 * @mat: the ma_topiary linked list of dead nodes to free.
 *
 * Destroy walk a dead list.
 */
static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat)
{
        struct maple_enode *next;
        struct maple_node *node;
        bool in_rcu = mt_in_rcu(mas->tree);

        while (mat->head) {
                next = mte_to_mat(mat->head)->next;
                node = mte_to_node(mat->head);
                mt_destroy_walk(mat->head, mas->tree, !in_rcu);
                if (in_rcu)
                        call_rcu(&node->rcu, mt_free_walk);
                mat->head = next;
        }
}
/*
 * mas_descend() - Descend into the slot stored in the ma_state.
 * @mas: the maple state.
 *
 * Note: Not RCU safe, only use in write side or debug code.
 */
static inline void mas_descend(struct ma_state *mas)
{
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        void __rcu **slots;

        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);

        if (mas->offset)
                mas->min = pivots[mas->offset - 1] + 1;
        mas->max = mas_safe_pivot(mas, pivots, mas->offset, type);
        mas->node = mas_slot(mas, slots, mas->offset);
}

/*
 * mte_set_gap() - Set a maple node gap.
 * @mn: The encoded maple node
 * @gap: The offset of the gap to set
 * @val: The gap value
 *
 * Only maple_arange_64 nodes carry gaps; every other node type is ignored.
 */
static inline void mte_set_gap(const struct maple_enode *mn,
                                 unsigned char gap, unsigned long val)
{
        if (mte_node_type(mn) != maple_arange_64)
                return;

        mte_to_node(mn)->ma64.gap[gap] = val;
}

/*
 * mas_ascend() - Walk up a level of the tree.
 * @mas: The maple state
 *
 * Sets the @mas->max and @mas->min to the correct values when walking up.  This
 * may cause several levels of walking up to find the correct min and max.
 * May find a dead node which will cause a premature return.
 *
 * On success @mas->node is the parent and @mas->offset is the slot the old
 * node occupied in it.  If @mas is already at the root, only @mas->offset is
 * reset to 0.
 *
 * Return: 1 on dead node, 0 otherwise
 */
static int mas_ascend(struct ma_state *mas)
{
        struct maple_enode *p_enode; /* parent enode. */
        struct maple_enode *a_enode; /* ancestor enode. */
        struct maple_node *a_node; /* ancestor node. */
        struct maple_node *p_node; /* parent node. */
        unsigned char a_slot;
        enum maple_type a_type;
        unsigned long min, max;
        unsigned long *pivots;
        bool set_max = false, set_min = false;

        a_node = mas_mn(mas);
        if (ma_is_root(a_node)) {
                mas->offset = 0;
                return 0;
        }

        p_node = mte_parent(mas->node);
        /* A node whose parent is itself is dead. */
        if (unlikely(a_node == p_node))
                return 1;

        a_type = mas_parent_type(mas, mas->node);
        mas->offset = mte_parent_slot(mas->node);
        a_enode = mt_mk_node(p_node, a_type);

        /* Check to make sure all parent information is still accurate */
        if (p_node != mte_parent(mas->node))
                return 1;

        mas->node = a_enode;

        /* The root always covers the full index range. */
        if (mte_is_root(a_enode)) {
                mas->max = ULONG_MAX;
                mas->min = 0;
                return 0;
        }

        min = 0;
        max = ULONG_MAX;
        /* Slot 0 shares its minimum with the parent. */
        if (!mas->offset) {
                min = mas->min;
                set_min = true;
        }

        /* A maximum of ULONG_MAX cannot grow any further. */
        if (mas->max == ULONG_MAX)
                set_max = true;

        /* Climb until both limits are found or the root is reached. */
        do {
                p_enode = a_enode;
                a_type = mas_parent_type(mas, p_enode);
                a_node = mte_parent(p_enode);
                a_slot = mte_parent_slot(p_enode);
                a_enode = mt_mk_node(a_node, a_type);
                pivots = ma_pivots(a_node, a_type);

                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                /* The pivot before a non-zero slot bounds the minimum. */
                if (!set_min && a_slot) {
                        set_min = true;
                        min = pivots[a_slot - 1] + 1;
                }

                /* A slot that has a pivot of its own bounds the maximum. */
                if (!set_max && a_slot < mt_pivots[a_type]) {
                        set_max = true;
                        max = pivots[a_slot];
                }

                /* Re-check after reading the pivots from the ancestor. */
                if (unlikely(ma_dead_node(a_node)))
                        return 1;

                if (unlikely(ma_is_root(a_node)))
                        break;

        } while (!set_min || !set_max);

        mas->max = max;
        mas->min = min;
        return 0;
}

/*
 * mas_pop_node() - Get a previously allocated maple node from the maple state.
 * @mas: The maple state
 *
 * Takes one node off the allocation list in @mas->alloc.  If a request count
 * is pending it is incremented by one.  The returned node is zeroed.
 *
 * Return: A pointer to a maple node, or NULL (with a warning) if nothing is
 * allocated.
 */
static inline struct maple_node *mas_pop_node(struct ma_state *mas)
{
        struct maple_alloc *ret, *node = mas->alloc;
        unsigned long total = mas_allocated(mas);
        unsigned int req = mas_alloc_req(mas);

        /* nothing or a request pending. */
        if (WARN_ON(!total))
                return NULL;

        if (total == 1) {
                /* single allocation in this ma_state */
                mas->alloc = NULL;
                ret = node;
                goto single_node;
        }

        if (node->node_count == 1) {
                /* Single allocation in this node. */
                mas->alloc = node->slot[0];
                mas->alloc->total = node->total - 1;
                ret = node;
                goto new_head;
        }
        /* Take the last node held in the head's slot array. */
        node->total--;
        ret = node->slot[--node->node_count];
        node->slot[node->node_count] = NULL;

single_node:
new_head:
        /* Only an already pending request is bumped. */
        if (req) {
                req++;
                mas_set_alloc_req(mas, req);
        }

        memset(ret, 0, sizeof(*ret));
        return (struct maple_node *)ret;
}

/*
 * mas_push_node() - Push a node back on the maple state allocation.
 * @mas: The maple state
 * @used: The used maple node
 *
 * Stores the maple node back into @mas->alloc for reuse.  Updates allocated and
 * requested node count as necessary.  The node is placed in a free slot of the
 * current head if there is one; otherwise it becomes the new head.
 */
static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)
{
        struct maple_alloc *reuse = (struct maple_alloc *)used;
        struct maple_alloc *head = mas->alloc;
        unsigned long count;
        unsigned int requested = mas_alloc_req(mas);

        count = mas_allocated(mas);

        reuse->request_count = 0;
        reuse->node_count = 0;
        if (count) {
                /* Room left in the head: store the node in its slots. */
                if (head->node_count < MAPLE_ALLOC_SLOTS) {
                        head->slot[head->node_count++] = reuse;
                        head->total++;
                        goto done;
                }
                /* Head is full: chain it behind the reused node. */
                reuse->slot[0] = head;
                reuse->node_count = 1;
        }

        /* The reused node becomes the new head of the allocations. */
        reuse->total = count + 1;
        mas->alloc = reuse;
done:
        /* A request of 0 or 1 is left as it was. */
        if (requested > 1)
                mas_set_alloc_req(mas, requested - 1);
}

/*
 * mas_alloc_nodes() - Allocate nodes into a maple state
 * @mas: The maple state
 * @gfp: The GFP Flags
 *
 * Allocates the number of nodes currently requested in @mas and links them
 * into @mas->alloc.  On allocation failure the remaining request count is
 * stored back in @mas and the state is set to the -ENOMEM error.
 */
static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
{
        struct maple_alloc *node;
        unsigned long allocated = mas_allocated(mas);
        unsigned int requested = mas_alloc_req(mas);
        unsigned int count;
        void **slots = NULL;
        unsigned int max_req = 0;

        if (!requested)
                return;

        /* The request is being served; clear it until something fails. */
        mas_set_alloc_req(mas, 0);
        if (mas->mas_flags & MA_STATE_PREALLOC) {
                if (allocated)
                        return;
                /* Only reached with nothing allocated, so this always warns. */
                WARN_ON(!allocated);
        }

        /* A new head is needed when there is none or the current one is full. */
        if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) {
                node = (struct maple_alloc *)mt_alloc_one(gfp);
                if (!node)
                        goto nomem_one;

                if (allocated) {
                        node->slot[0] = mas->alloc;
                        node->node_count = 1;
                } else {
                        node->node_count = 0;
                }

                mas->alloc = node;
                node->total = ++allocated;
                node->request_count = 0;
                requested--;
        }

        node = mas->alloc;
        while (requested) {
                /* Fill the free slots of this node, bounded by the request. */
                max_req = MAPLE_ALLOC_SLOTS - node->node_count;
                slots = (void **)&node->slot[node->node_count];
                max_req = min(requested, max_req);
                count = mt_alloc_bulk(gfp, max_req, slots);
                if (!count)
                        goto nomem_bulk;

                /* slot[0] may be used as a list node later; initialise it. */
                if (node->node_count == 0) {
                        node->slot[0]->node_count = 0;
                        node->slot[0]->request_count = 0;
                }

                node->node_count += count;
                allocated += count;
                /* find a non-full node*/
                do {
                        node = node->slot[0];
                } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS));
                requested -= count;
        }
        mas->alloc->total = allocated;
        return;

nomem_bulk:
        /* Clean up potential freed allocations on bulk failure */
        memset(slots, 0, max_req * sizeof(unsigned long));
        mas->alloc->total = allocated;
nomem_one:
        /* Remember what is still missing and flag the failure. */
        mas_set_alloc_req(mas, requested);
        mas_set_err(mas, -ENOMEM);
}

/*
 * mas_free() - Free an encoded maple node
 * @mas: The maple state
 * @used: The encoded maple node to free.
 *
 * Uses rcu free if the tree is in RCU mode, pushes @used back on the maple
 * state allocations otherwise.
 */
static inline void mas_free(struct ma_state *mas, struct maple_enode *used)
{
        struct maple_node *node = mte_to_node(used);

        if (!mt_in_rcu(mas->tree)) {
                mas_push_node(mas, node);
                return;
        }

        ma_free_rcu(node);
}

/*
 * mas_node_count_gfp() - Check if enough nodes are allocated and request more
 * if there is not enough nodes.
 * @mas: The maple state
 * @count: The number of nodes needed
 * @gfp: the gfp flags
 *
 * Nothing is done when at least @count nodes are already allocated; otherwise
 * the shortfall is requested and allocated with @gfp.
 */
static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp)
{
        unsigned long have = mas_allocated(mas);

        if (have >= count)
                return;

        mas_set_alloc_req(mas, count - have);
        mas_alloc_nodes(mas, gfp);
}

/*
 * mas_node_count() - Check if enough nodes are allocated and request more if
 * there is not enough nodes.
 * @mas: The maple state
 * @count: The number of nodes needed
 *
 * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags.
 */
static void mas_node_count(struct ma_state *mas, int count)
{
        /* Plain call: a void function must not return an expression. */
        mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN);
}

/*
 * mas_start() - Sets up maple state for operations.
 * @mas: The maple state.
 *
 * Only acts when mas_is_start() is true for @mas.  In that case min and max
 * are reset to 0 and ULONG_MAX, and node, status, depth and offset are set
 * from the current tree root.  A root node that is found to be dead causes
 * the root to be read again.
 *
 * Return:
 * - If @mas is not in the start state:  NULL, @mas is left untouched.
 * - If it's an empty tree:     NULL & mas->status == ma_none
 * - If it's a single entry:    The entry (NULL when mas->index > 0) &
 *                              mas->status == ma_root
 * - If it's a tree:            NULL & mas->status == ma_active
 */
static inline struct maple_enode *mas_start(struct ma_state *mas)
{
        struct maple_enode *root;

        if (unlikely(!mas_is_start(mas)))
                return NULL;

        mas->min = 0;
        mas->max = ULONG_MAX;

        /* Keep re-reading the root for as long as it is a dead node. */
        for (;;) {
                mas->depth = 0;
                root = mas_root(mas);
                if (unlikely(!xa_is_node(root)))
                        break;

                /* Tree with nodes */
                mas->depth = 1;
                mas->status = ma_active;
                mas->node = mte_safe_root(root);
                mas->offset = 0;
                if (!mte_dead_node(mas->node))
                        return NULL;
        }

        /* The root is not a node: either nothing at all or one entry. */
        mas->node = NULL;
        mas->offset = MAPLE_NODE_SLOTS;

        if (unlikely(!root)) {
                /* empty tree */
                mas->status = ma_none;
                return NULL;
        }

        /* Single entry tree: the entry is only returned for index 0. */
        mas->status = ma_root;
        if (mas->index > 0)
                return NULL;

        return root;
}

/*
 * ma_data_end() - Find the end of the data in a node.
 * @node: The maple node
 * @type: The maple node type
 * @pivots: The array of pivots in the node
 * @max: The maximum value in the node
 *
 * maple_arange_64 nodes, and nodes whose final pivot is zero, take the end
 * from the node metadata.  Otherwise the final pivot is compared with @max to
 * decide between the last pivot index and the slot after it.
 *
 * Return: The zero indexed last slot with data (may be null).  0 when @pivots
 * is NULL.
 */
static __always_inline unsigned char ma_data_end(struct maple_node *node,
                enum maple_type type, unsigned long *pivots, unsigned long max)
{
        unsigned char last_piv_idx;
        unsigned long last_piv;

        if (!pivots)
                return 0;

        if (type == maple_arange_64)
                return ma_meta_end(node, type);

        last_piv_idx = mt_pivots[type] - 1;
        last_piv = pivots[last_piv_idx];

        /* Final pivot unset: the end is recorded in the metadata. */
        if (likely(!last_piv))
                return ma_meta_end(node, type);

        /* Final pivot already reaches @max: data ends at that pivot. */
        if (likely(last_piv == max))
                return last_piv_idx;

        /* Otherwise the slot after the final pivot is in use as well. */
        return mt_pivots[type];
}

/*
 * mas_data_end() - Find the end of the data (slot).
 * @mas: the maple state
 *
 * This method is optimized to check the metadata of a node if the node type
 * supports data end metadata.
 *
 * Return: The zero indexed last slot with data (may be null).
 */
static inline unsigned char mas_data_end(struct ma_state *mas)
{
        enum maple_type type;
        struct maple_node *node;
        unsigned char offset;
        unsigned long *pivots;

        type = mte_node_type(mas->node);
        node = mas_mn(mas);
        if (type == maple_arange_64)
                return ma_meta_end(node, type);

        pivots = ma_pivots(node, type);
        if (unlikely(ma_dead_node(node)))
                return 0;

        offset = mt_pivots[type] - 1;
        if (likely(!pivots[offset]))
                return ma_meta_end(node, type);

        if (likely(pivots[offset] == mas->max))
                return offset;

        return mt_pivots[type];
}

/*
 * mas_leaf_max_gap() - Returns the largest gap in a leaf node
 * @mas: the maple state
 *
 * A gap is a range whose slot is NULL.  Dense nodes are scanned slot by slot,
 * counting runs of empty slots.  For other node types the size of each empty
 * slot is derived from the surrounding pivots; the first slot (bounded below
 * by @mas->min) and the last slot (only checked when @mas->max is ULONG_MAX)
 * are handled before the main loop.
 *
 * Return: The maximum gap in the leaf.
 */
static unsigned long mas_leaf_max_gap(struct ma_state *mas)
{
        enum maple_type mt;
        unsigned long pstart, gap, max_gap;
        struct maple_node *mn;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char i;
        unsigned char max_piv;

        mt = mte_node_type(mas->node);
        mn = mas_mn(mas);
        slots = ma_slots(mn, mt);
        max_gap = 0;
        if (unlikely(ma_is_dense(mt))) {
                /* Each empty slot adds one to the current run of empties. */
                gap = 0;
                for (i = 0; i < mt_slots[mt]; i++) {
                        if (slots[i]) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                        } else {
                                gap++;
                        }
                }
                /* A run that reaches the final slot is not yet accounted. */
                if (gap > max_gap)
                        max_gap = gap;
                return max_gap;
        }

        /*
         * Check the first implied pivot optimizes the loop below and slot 1 may
         * be skipped if there is a gap in slot 0.
         */
        pivots = ma_pivots(mn, mt);
        if (likely(!slots[0])) {
                /* Slot 0 covers mas->min through pivots[0], inclusive. */
                max_gap = pivots[0] - mas->min + 1;
                i = 2;
        } else {
                i = 1;
        }

        /* reduce max_piv as the special case is checked before the loop */
        max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1;
        /*
         * Check end implied pivot which can only be a gap on the right most
         * node.
         */
        if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) {
                /* The last slot covers pivots[max_piv] + 1 up to ULONG_MAX. */
                gap = ULONG_MAX - pivots[max_piv];
                if (gap > max_gap)
                        max_gap = gap;

                /*
                 * Everything before the last slot spans at most
                 * pivots[max_piv] - mas->min + 1 indices, so no remaining
                 * gap can be larger than max_gap: stop early.
                 */
                if (max_gap > pivots[max_piv] - mas->min)
                        return max_gap;
        }

        for (; i <= max_piv; i++) {
                /* data == no gap. */
                if (likely(slots[i]))
                        continue;

                /* Slot i covers pivots[i - 1] + 1 through pivots[i]. */
                pstart = pivots[i - 1];
                gap = pivots[i] - pstart;
                if (gap > max_gap)
                        max_gap = gap;

                /* There cannot be two gaps in a row. */
                i++;
        }
        return max_gap;
}

/*
 * ma_max_gap() - Get the maximum gap in a maple node (non-leaf)
 * @node: The maple node
 * @gaps: The pointer to the gaps
 * @mt: The maple node type
 * @off: Pointer to store the offset location of the gap.
 *
 * Walks @gaps from the metadata data end down to index 0.  Because only a
 * strictly larger value replaces the current best, the highest index wins
 * among equal gaps, and @off is the data end when every gap is zero.
 *
 * Return: The maximum gap value
 */
static inline unsigned long
ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt,
            unsigned char *off)
{
        unsigned char end = ma_meta_end(node, mt);
        unsigned char best = end;
        unsigned long largest = 0;
        int idx;

        for (idx = end; idx >= 0; idx--) {
                if (gaps[idx] <= largest)
                        continue;

                largest = gaps[idx];
                best = idx;
        }

        *off = best;
        return largest;
}

/*
 * mas_max_gap() - find the largest gap in a non-leaf node and set the slot.
 * @mas: The maple state.
 *
 * Return: The gap value.
 */
static inline unsigned long mas_max_gap(struct ma_state *mas)
{
        unsigned long *gaps;
        unsigned char offset;
        enum maple_type mt;
        struct maple_node *node;

        mt = mte_node_type(mas->node);
        if (ma_is_leaf(mt))
                return mas_leaf_max_gap(mas);

        node = mas_mn(mas);
        MAS_BUG_ON(mas, mt != maple_arange_64);
        offset = ma_meta_gap(node);
        gaps = ma_gaps(node, mt);
        return gaps[offset];
}

/*
 * mas_parent_gap() - Set the parent gap and any gaps above, as needed
 * @mas: The maple state
 * @offset: The gap offset in the parent to set
 * @new: The new gap value.
 *
 * Set the parent gap then continue to set the gap upwards, using the metadata
 * of the parent to see if it is necessary to check the node above.  The walk
 * stops as soon as a node's largest gap is unaffected by the update, or once
 * the root node has been updated.
 */
static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset,
                unsigned long new)
{
        unsigned long meta_gap = 0;
        struct maple_node *pnode;
        struct maple_enode *penode;
        unsigned long *pgaps;
        unsigned char meta_offset;
        enum maple_type pmt;

        pnode = mte_parent(mas->node);
        pmt = mas_parent_type(mas, mas->node);
        penode = mt_mk_node(pnode, pmt);
        pgaps = ma_gaps(pnode, pmt);

ascend:
        MAS_BUG_ON(mas, pmt != maple_arange_64);
        /* Remember which gap the metadata points at, and its old value. */
        meta_offset = ma_meta_gap(pnode);
        meta_gap = pgaps[meta_offset];

        pgaps[offset] = new;

        /* The value the metadata referred to is unchanged: nothing to do. */
        if (meta_gap == new)
                return;

        if (offset != meta_offset) {
                /* A different slot still holds a larger gap. */
                if (meta_gap > new)
                        return;

                /* The updated slot now holds the largest gap. */
                ma_set_meta_gap(pnode, pmt, offset);
        } else if (new < meta_gap) {
                /* The slot the metadata pointed at shrank: rescan the node. */
                new = ma_max_gap(pnode, pgaps, pmt, &meta_offset);
                ma_set_meta_gap(pnode, pmt, meta_offset);
        }

        if (ma_is_root(pnode))
                return;

        /*
         * Go to the parent node.  @new is the value stored for this node in
         * the next level up, at this node's slot in its parent.
         */
        pnode = mte_parent(penode);
        pmt = mas_parent_type(mas, penode);
        pgaps = ma_gaps(pnode, pmt);
        offset = mte_parent_slot(penode);
        penode = mt_mk_node(pnode, pmt);
        goto ascend;
}

/*
 * mas_update_gap() - Update a node's gap in its parent and propagate the
 * change upwards if necessary.
 * @mas: the maple state.
 *
 * Does nothing unless the tree is an allocation tree (mt_is_alloc()) and the
 * node in @mas is not the root.  The parent is only touched when the gap it
 * stores for this node differs from the node's current largest gap.
 */
static inline void mas_update_gap(struct ma_state *mas)
{
        unsigned long *parent_gaps;
        unsigned long gap;
        unsigned char slot;

        if (!mt_is_alloc(mas->tree) || mte_is_root(mas->node))
                return;

        gap = mas_max_gap(mas);
        slot = mte_parent_slot(mas->node);
        parent_gaps = ma_gaps(mte_parent(mas->node),
                              mas_parent_type(mas, mas->node));

        if (parent_gaps[slot] == gap)
                return;

        mas_parent_gap(mas, slot, gap);
}

/*
 * mas_adopt_children() - Point every child held in @parent back at @parent,
 * with the child's slot encoded.
 * @mas: the maple state (for the tree)
 * @parent: the maple encoded node containing the children.
 *
 * Children are visited from the data end of @parent down to slot 0.
 */
static inline void mas_adopt_children(struct ma_state *mas,
                struct maple_enode *parent)
{
        enum maple_type type = mte_node_type(parent);
        struct maple_node *node = mte_to_node(parent);
        void __rcu **slots = ma_slots(node, type);
        unsigned long *pivots = ma_pivots(node, type);
        int slot;

        for (slot = ma_data_end(node, type, pivots, mas->max); slot >= 0;
             slot--) {
                struct maple_enode *child;

                child = mas_slot_locked(mas, slots, slot);
                mas_set_parent(mas, child, parent, slot);
        }
}

/*
 * mas_put_in_tree() - Publish the new node held in @mas and mark the old
 * node as dead.
 * @mas: the maple state with the new node
 * @old_enode: The old maple encoded node to replace.
 *
 * A root node is published through the tree's ma_root pointer (after setting
 * its parent pointer) and the tree height is updated; any other node is
 * published in the parent slot encoded in the node.  Both cases use
 * rcu_assign_pointer().  @old_enode is marked dead afterwards.
 */
static inline void mas_put_in_tree(struct ma_state *mas,
                struct maple_enode *old_enode)
        __must_hold(mas->tree->ma_lock)
{
        if (!mte_is_root(mas->node)) {
                unsigned char slot = mte_parent_slot(mas->node);
                void __rcu **parent_slots;

                parent_slots = ma_slots(mte_parent(mas->node),
                                        mas_parent_type(mas, mas->node));
                rcu_assign_pointer(parent_slots[slot], mas->node);
        } else {
                mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas));
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
                mas_set_height(mas);
        }

        mte_set_node_dead(old_enode);
}

/*
 * mas_replace_node() - Replace a node by putting the new node in the tree,
 * marking the old node dead, and freeing the old node.
 * @mas: the ma_state with @mas->node pointing to the new node.
 * @old_enode: The old maple encoded node.
 *
 * The new node is placed with mas_put_in_tree(), which locates the position
 * from the parent encoding of @mas->node and marks @old_enode dead.
 * @old_enode is then released with mas_free().
 */
static inline void mas_replace_node(struct ma_state *mas,
                struct maple_enode *old_enode)
        __must_hold(mas->tree->ma_lock)
{
        mas_put_in_tree(mas, old_enode);
        mas_free(mas, old_enode);
}

/*
 * mas_find_child() - Find the next child whose parent pointer is @mas->node.
 * @mas: the maple state with the parent.
 * @child: the maple state to store the child.
 *
 * The search starts at @mas->offset and runs to the data end of the node.  On
 * a match, @child becomes a copy of @mas descended into that slot with its
 * offset reset to 0, and @mas->offset is moved to the slot after the match so
 * that a later call continues from there.
 *
 * Return: true if a child was found, false otherwise.
 */
static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child)
        __must_hold(mas->tree->ma_lock)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *parent = mas_mn(mas);
        void __rcu **slots = ma_slots(parent, type);
        unsigned long *pivots = ma_pivots(parent, type);
        unsigned char last = ma_data_end(parent, type, pivots, mas->max);
        unsigned char slot;

        for (slot = mas->offset; slot <= last; slot++) {
                struct maple_enode *candidate;

                candidate = mas_slot_locked(mas, slots, slot);
                if (mte_parent(candidate) != parent)
                        continue;

                *child = *mas;
                mas->offset = slot + 1;
                child->offset = slot;
                mas_descend(child);
                child->offset = 0;
                return true;
        }

        return false;
}

/*
 * mab_shift_right() - Move the first b_end entries of the big node @shift
 * positions to the right.  Note, does not clean out the old data or set
 * b_node->b_end.
 * @b_node: the maple_big_node
 * @shift: the shift count
 *
 * Pivots and slots are always moved; gaps are moved only for maple_arange_64.
 * The same byte count, b_end * sizeof(unsigned long), is used for all arrays.
 */
static inline void mab_shift_right(struct maple_big_node *b_node,
                                 unsigned char shift)
{
        const unsigned long bytes = sizeof(unsigned long) * b_node->b_end;

        memmove(&b_node->pivot[shift], &b_node->pivot[0], bytes);
        memmove(&b_node->slot[shift], &b_node->slot[0], bytes);
        if (b_node->type != maple_arange_64)
                return;

        memmove(&b_node->gap[shift], &b_node->gap[0], bytes);
}

/*
 * mab_middle_node() - Check if a middle node is needed (unlikely)
 * @b_node: the maple_big_node that contains the data.
 * @split: the potential split location
 * @slot_count: the size that can be stored in a single node being considered.
 *
 * A middle node is needed when the data fills two nodes' worth of slots or
 * more, or when it is one short of that and the slot at @split is NULL.
 *
 * Return: true if a middle node is required.
 */
static inline bool mab_middle_node(struct maple_big_node *b_node, int split,
                                   unsigned char slot_count)
{
        const int two_nodes = 2 * slot_count;
        const unsigned char used = b_node->b_end;

        if (used >= two_nodes)
                return true;

        return !b_node->slot[split] && used >= two_nodes - 1;
}

/*
 * mab_no_null_split() - ensure the split doesn't fall on a NULL
 * @b_node: the maple_big_node with the data
 * @split: the suggested split location
 * @slot_count: the number of slots in the node being considered.
 *
 * A split that lands on a non-NULL slot is returned unchanged.  Otherwise it
 * moves one slot to the right when it is below the last slot and more than
 * mt_min_slots[] entries would remain to the right; if not, it moves one slot
 * to the left.
 *
 * Return: the split location.
 */
static inline int mab_no_null_split(struct maple_big_node *b_node,
                                    unsigned char split, unsigned char slot_count)
{
        bool move_right;

        if (b_node->slot[split])
                return split;

        move_right = (split < slot_count - 1) &&
                     (b_node->b_end - split) > (mt_min_slots[b_node->type]);
        if (move_right)
                split++;
        else
                split--;

        return split;
}

/*
 * mab_calc_split() - Calculate the split location and if there needs to be two
 * splits.
 * @mas: The maple state
 * @bn: The maple_big_node with the data
 * @mid_split: The second split, if required.  0 otherwise.
 *
 * When MA_STATE_BULK is set in @mas->mas_flags the split is placed
 * mt_min_slots[] entries before the end of the data and no middle split is
 * used; for leaf types MA_STATE_REBALANCE is also set in @mas->mas_flags.
 * Otherwise the data is split in half, or in thirds when mab_middle_node()
 * reports that two nodes are not enough.
 *
 * Return: The first split location.  The middle split is set in @mid_split.
 */
static inline int mab_calc_split(struct ma_state *mas,
         struct maple_big_node *bn, unsigned char *mid_split)
{
        unsigned char b_end = bn->b_end;
        int split = b_end / 2; /* Assume equal split. */
        unsigned char slot_count = mt_slots[bn->type];

        /*
         * To support gap tracking, all NULL entries are kept together and a node cannot
         * end on a NULL entry, with the exception of the left-most leaf.  The
         * limitation means that the split of a node must be checked for this condition
         * and be able to put more data in one direction or the other.
         */
        if (unlikely((mas->mas_flags & MA_STATE_BULK))) {
                *mid_split = 0;
                /* Leave mt_min_slots[] entries to the right of the split. */
                split = b_end - mt_min_slots[bn->type];

                if (!ma_is_leaf(bn->type))
                        return split;

                mas->mas_flags |= MA_STATE_REBALANCE;
                /* Step back one slot if the split would land on a NULL. */
                if (!bn->slot[split])
                        split--;
                return split;
        }

        /*
         * Although extremely rare, it is possible to enter what is known as the 3-way
         * split scenario.  The 3-way split comes about by means of a store of a range
         * that overwrites the end and beginning of two full nodes.  The result is a set
         * of entries that cannot be stored in 2 nodes.  Sometimes, these two nodes can
         * also be located in different parent nodes which are also full.  This can
         * carry upwards all the way to the root in the worst case.
         */
        if (unlikely(mab_middle_node(bn, split, slot_count))) {
                /* Thirds: first split at 1/3, middle split at 2/3. */
                split = b_end / 3;
                *mid_split = split * 2;
        } else {
                *mid_split = 0;
        }

        /* Avoid ending a node on a NULL entry */
        split = mab_no_null_split(bn, split, slot_count);

        if (unlikely(*mid_split))
                *mid_split = mab_no_null_split(bn, *mid_split, slot_count);

        return split;
}

/*
 * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node
 * and set @b_node->b_end to the next free slot.
 * @mas: The maple state
 * @mas_start: The starting slot to copy
 * @mas_end: The end slot to copy (inclusively)
 * @b_node: The maple_big_node to place the data
 * @mab_start: The starting location in maple_big_node to store the data.
 *
 * Pivots are copied first; the copy ends early when a zero pivot or a pivot
 * equal to @mas->max is reached.  Slots, and for non-leaf nodes of allocation
 * trees also gaps, are then copied in bulk for the number of entries that
 * were actually taken.
 */
static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start,
                        unsigned char mas_end, struct maple_big_node *b_node,
                        unsigned char mab_start)
{
        enum maple_type mt;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots, *gaps;
        int i = mas_start, j = mab_start;
        unsigned char piv_end;

        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        /*
         * Pivot 0 is copied without the zero-pivot check that the loop below
         * applies to every later pivot.
         */
        if (!i) {
                b_node->pivot[j] = pivots[i++];
                if (unlikely(i > mas_end))
                        goto complete;
                j++;
        }

        /*
         * Copy pivots directly up to mas_end or mt_pivots[mt], whichever is
         * smaller; the pivot after the loop comes from mas_safe_pivot().
         */
        piv_end = min(mas_end, mt_pivots[mt]);
        for (; i < piv_end; i++, j++) {
                b_node->pivot[j] = pivots[i];
                if (unlikely(!b_node->pivot[j]))
                        goto complete;

                if (unlikely(mas->max == b_node->pivot[j]))
                        goto complete;
        }

        b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt);

complete:
        /* j is the last index written; b_end is the slot after it. */
        b_node->b_end = ++j;
        /* From here on j is the number of entries that were copied. */
        j -= mab_start;
        slots = ma_slots(node, mt);
        memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j);
        if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) {
                gaps = ma_gaps(node, mt);
                memcpy(b_node->gap + mab_start, gaps + mas_start,
                       sizeof(unsigned long) * j);
        }
}

/*
 * mas_leaf_set_meta() - Set the metadata of a leaf if possible.
 * @node: The maple node
 * @mt: The maple type
 * @end: The node end
 *
 * The metadata (gap offset 0, end @end) is only written when @end is below
 * the index of the last slot for @mt; otherwise the node is left untouched.
 * NOTE(review): presumably the metadata shares storage with the final slot,
 * confirm against the node layout.
 */
static inline void mas_leaf_set_meta(struct maple_node *node,
                enum maple_type mt, unsigned char end)
{
        if (end >= mt_slots[mt] - 1)
                return;

        ma_set_meta(node, mt, 0, end);
}

/*
 * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node.
 * @b_node: the maple_big_node that has the data
 * @mab_start: the start location in @b_node.
 * @mab_end: The end location in @b_node (inclusively)
 * @mas: The maple state with the maple encoded node.
 * @new_max: When true, set @mas->max to the last pivot that was copied.
 *
 * Pivots and slots are copied into the node in @mas->node starting at index
 * 0.  The node metadata is then written: for non-leaf nodes of allocation
 * trees the gaps are copied as well and the offset of the largest gap is
 * recorded together with the end; otherwise mas_leaf_set_meta() is used.
 */
static inline void mab_mas_cp(struct maple_big_node *b_node,
                              unsigned char mab_start, unsigned char mab_end,
                              struct ma_state *mas, bool new_max)
{
        int i, j = 0;
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node *node = mte_to_node(mas->node);
        void __rcu **slots = ma_slots(node, mt);
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned long *gaps = NULL;
        unsigned char end;

        /* Trim the range by one when it spans more than mt_pivots[mt]. */
        if (mab_end - mab_start > mt_pivots[mt])
                mab_end--;

        /*
         * With the final pivot unset, clear the slot that follows it.
         * NOTE(review): presumably to avoid leaving stale data there, confirm.
         */
        if (!pivots[mt_pivots[mt] - 1])
                slots[mt_pivots[mt]] = NULL;

        /* Copy pivots until mab_end is passed or a zero pivot is next. */
        i = mab_start;
        do {
                pivots[j++] = b_node->pivot[i++];
        } while (i <= mab_end && likely(b_node->pivot[i]));

        /* i - mab_start entries were taken; copy the matching slots. */
        memcpy(slots, b_node->slot + mab_start,
               sizeof(void *) * (i - mab_start));

        if (new_max)
                mas->max = b_node->pivot[i - 1];

        end = j - 1;
        if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) {
                unsigned long max_gap = 0;
                unsigned char offset = 0;

                /*
                 * Copy the gaps from the last entry back to the first while
                 * tracking where the largest one is.
                 */
                gaps = ma_gaps(node, mt);
                do {
                        gaps[--j] = b_node->gap[--i];
                        if (gaps[j] > max_gap) {
                                offset = j;
                                max_gap = gaps[j];
                        }
                } while (j);

                ma_set_meta(node, mt, offset, end);
        } else {
                mas_leaf_set_meta(node, mt, end);
        }
}

/*
 * mas_bulk_rebalance() - Clear the pending rebalance of a bulk insert when
 * the node holds enough data.
 * @mas: The maple state
 * @end: The maple node end
 * @mt: The maple node type
 *
 * Only acts when MA_STATE_BULK is set and the node in @mas is not the root.
 * MA_STATE_REBALANCE is cleared when @end is above mt_min_slots[@mt].
 */
static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end,
                                      enum maple_type mt)
{
        if (!(mas->mas_flags & MA_STATE_BULK) || mte_is_root(mas->node))
                return;

        if (end > mt_min_slots[mt])
                mas->mas_flags &= ~MA_STATE_REBALANCE;
}

/*
 * mas_store_b_node() - Store an @entry into the b_node while also copying the
 * data from a maple encoded node.
 * @wr_mas: the maple write state
 * @b_node: the maple_big_node to fill with data
 * @offset_end: the offset to end copying
 *
 * The big node is filled in up to four steps: the existing entries before
 * the write offset, the remainder of the old range in front of the new one
 * (if any), the new entry itself, and finally the remainder of the old range
 * behind the new one plus the rest of the node (if any).
 *
 * Nothing is returned; the end of the stored data is left in @b_node->b_end
 * and @mas->offset is set to the position of the new entry in @b_node.
 */
static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas,
                struct maple_big_node *b_node, unsigned char offset_end)
{
        unsigned char slot;
        unsigned char b_end;
        /* Possible underflow of piv will wrap back to 0 before use. */
        unsigned long piv;
        struct ma_state *mas = wr_mas->mas;

        b_node->type = wr_mas->type;
        b_end = 0;
        slot = mas->offset;
        if (slot) {
                /* Copy start data up to insert. */
                mas_mab_cp(mas, 0, slot - 1, b_node, 0);
                b_end = b_node->b_end;
                piv = b_node->pivot[b_end - 1];
        } else
                piv = mas->min - 1;

        /* piv is now the last index that lies before the slot written to. */
        if (piv + 1 < mas->index) {
                /* Handle range starting after old range */
                b_node->slot[b_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[b_end] = mas->index - 1 - piv;
                b_node->pivot[b_end++] = mas->index - 1;
        }

        /* Store the new entry. */
        mas->offset = b_end;
        b_node->slot[b_end] = wr_mas->entry;
        b_node->pivot[b_end] = mas->last;

        /* Appended. */
        if (mas->last >= mas->max)
                goto b_end;

        /* Handle new range ending before old range ends */
        piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type);
        if (piv > mas->last) {
                if (piv == ULONG_MAX)
                        mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type);

                /* The tail belongs to the entry found at offset_end. */
                if (offset_end != slot)
                        wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                          offset_end);

                b_node->slot[++b_end] = wr_mas->content;
                if (!wr_mas->content)
                        b_node->gap[b_end] = piv - mas->last + 1;
                b_node->pivot[b_end] = piv;
        }

        slot = offset_end + 1;
        if (slot > mas->end)
                goto b_end;

        /* Copy end data to the end of the node. */
        mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end);
        /* mas_mab_cp() leaves b_end one past the data; step back onto it. */
        b_node->b_end--;
        return;

b_end:
        b_node->b_end = b_end;
}

/*
 * mas_prev_sibling() - Move to the previous node with the same parent.
 * @mas: the maple state
 *
 * On success @mas has been moved up to the parent and back down into the slot
 * before the one it started in.  @mas is not modified when false is returned.
 *
 * Return: True if there is a previous sibling, false otherwise.
 */
static inline bool mas_prev_sibling(struct ma_state *mas)
{
        unsigned int slot = mte_parent_slot(mas->node);

        /* For root node, the slot is set to 0 by mte_parent_slot(). */
        if (slot == 0)
                return false;

        mas_ascend(mas);
        mas->offset = slot - 1;
        mas_descend(mas);

        return true;
}

/*
 * mas_next_sibling() - Find the next node with the same parent.
 * @mas: the maple state
 *
 * Return: true if there is a next sibling, false otherwise.
 */
static inline bool mas_next_sibling(struct ma_state *mas)
{
        MA_STATE(parent, mas->tree, mas->index, mas->last);

        if (mte_is_root(mas->node))
                return false;

        parent = *mas;
        mas_ascend(&parent);
        parent.offset = mte_parent_slot(mas->node) + 1;
        if (parent.offset > mas_data_end(&parent))
                return false;

        *mas = parent;
        mas_descend(mas);
        return true;
}

/*
 * mas_node_or_none() - Set the enode and state.
 * @mas: the maple state
 * @enode: The encoded maple node.
 *
 * Stores @enode in @mas->node.  The status becomes ma_active for a non-NULL
 * @enode and ma_none for a NULL one.
 */
static inline void mas_node_or_none(struct ma_state *mas,
                struct maple_enode *enode)
{
        /* A NULL @enode leaves mas->node NULL as well. */
        mas->node = enode;
        mas->status = enode ? ma_active : ma_none;
}

/*
 * mas_wr_node_walk() - Find the correct offset for the index in the @mas.
 *                      If @mas->index cannot be found within the containing
 *                      node, we traverse to the last entry in the node.
 * @wr_mas: The maple write state
 *
 * Besides the offset, the walk fills in @wr_mas->node, @wr_mas->pivots,
 * @wr_mas->r_min, @wr_mas->r_max and @wr_mas->offset_end, and records the
 * data end of the node in @mas->end.  Dense nodes are handled separately and
 * only have r_min, r_max, index and offset set.
 *
 * Uses mas_slot_locked() and does not need to worry about dead nodes.
 */
static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned long *pivots;
        unsigned char end, slot;

        if (unlikely(ma_is_dense(wr_mas->type))) {
                wr_mas->r_min = mas->index;
                wr_mas->r_max = wr_mas->r_min;
                mas->index = mas->min;
                mas->offset = mas->index;
                return;
        }

        wr_mas->node = mas_mn(mas);
        wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type);
        pivots = wr_mas->pivots;
        mas->end = ma_data_end(wr_mas->node, wr_mas->type, pivots, mas->max);
        end = mas->end;

        /* Advance to the first slot whose pivot is not below the index. */
        for (slot = mas->offset; slot < end; slot++) {
                if (mas->index <= pivots[slot])
                        break;
        }

        /* The last slot has no pivot of its own; it ends at mas->max. */
        if (slot < end)
                wr_mas->r_max = pivots[slot];
        else
                wr_mas->r_max = mas->max;

        wr_mas->r_min = mas_safe_min(mas, pivots, slot);
        mas->offset = slot;
        wr_mas->offset_end = mas->offset;
}

/*
 * mast_rebalance_next() - Rebalance against the next node
 * @mast: The maple subtree state
 */
static inline void mast_rebalance_next(struct maple_subtree_state *mast)
{
        unsigned char b_end = mast->bn->b_end;

        mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node),
                   mast->bn, b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_rebalance_prev() - Rebalance against the previous node
 * @mast: The maple subtree state
 */
static inline void mast_rebalance_prev(struct maple_subtree_state *mast)
{
        unsigned char end = mas_data_end(mast->orig_l) + 1;
        unsigned char b_end = mast->bn->b_end;

        mab_shift_right(mast->bn, end);
        mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0);
        mast->l->min = mast->orig_l->min;
        mast->orig_l->index = mast->orig_l->min;
        mast->bn->b_end = end + b_end;
        mast->l->offset += end;
}

/*
 * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring
 * the node to the right.  Checking the nodes to the right then the left at each
 * level upwards until root is reached.
 * Data is copied into the @mast->bn.
 * @mast: The maple_subtree_state.
 *
 * When a neighbour is found on the right, @mast->orig_r is left pointing at
 * it and @mast->orig_l is restored to its value on entry; when found on the
 * left, it is the other way around.  When nothing is found both are restored.
 *
 * Return: true if a neighbour was found and its data copied into @mast->bn,
 * false if the root was reached without finding one.
 */
static inline
bool mast_spanning_rebalance(struct maple_subtree_state *mast)
{
        /* Saved so that the states can be put back after the search. */
        struct ma_state r_tmp = *mast->orig_r;
        struct ma_state l_tmp = *mast->orig_l;
        unsigned char depth = 0;

        do {
                mas_ascend(mast->orig_r);
                mas_ascend(mast->orig_l);
                /* depth counts the levels climbed, to descend as many. */
                depth++;
                if (mast->orig_r->offset < mas_data_end(mast->orig_r)) {
                        /* Step right, then follow slot 0 back down. */
                        mast->orig_r->offset++;
                        do {
                                mas_descend(mast->orig_r);
                                mast->orig_r->offset = 0;
                        } while (--depth);

                        mast_rebalance_next(mast);
                        *mast->orig_l = l_tmp;
                        return true;
                } else if (mast->orig_l->offset != 0) {
                        /* Step left, then follow the last slot back down. */
                        mast->orig_l->offset--;
                        do {
                                mas_descend(mast->orig_l);
                                mast->orig_l->offset =
                                        mas_data_end(mast->orig_l);
                        } while (--depth);

                        mast_rebalance_prev(mast);
                        *mast->orig_r = r_tmp;
                        return true;
                }
        } while (!mte_is_root(mast->orig_r->node));

        *mast->orig_r = r_tmp;
        *mast->orig_l = l_tmp;
        return false;
}

/*
 * mast_ascend() - Ascend the original left and right maple states.
 * @mast: the maple subtree state.
 *
 * Ascend the original left and right sides.  Set the offsets to point to the
 * data already in the new tree (@mast->l and @mast->r).
 *
 * The offsets are found with mas_wr_node_walk(), using one write state that
 * is pointed first at @mast->orig_r and then at @mast->orig_l.  The type of
 * @mast->bn is set to the node type of the ascended @mast->orig_l.
 */
static inline void mast_ascend(struct maple_subtree_state *mast)
{
        MA_WR_STATE(wr_mas, mast->orig_r,  NULL);
        mas_ascend(mast->orig_l);
        mas_ascend(mast->orig_r);

        /* Walk the right side from slot 0 towards the new right max. */
        mast->orig_r->offset = 0;
        mast->orig_r->index = mast->r->max;
        /* last should be larger than or equal to index */
        if (mast->orig_r->last < mast->orig_r->index)
                mast->orig_r->last = mast->orig_r->index;

        wr_mas.type = mte_node_type(mast->orig_r->node);
        mas_wr_node_walk(&wr_mas);
        /* Set up the left side of things */
        mast->orig_l->offset = 0;
        mast->orig_l->index = mast->l->min;
        /* Reuse the write state for the left side. */
        wr_mas.mas = mast->orig_l;
        wr_mas.type = mte_node_type(mast->orig_l->node);
        mas_wr_node_walk(&wr_mas);

        mast->bn->type = wr_mas.type;
}

/*
 * mas_new_ma_node() - Create and return a new maple node.  Helper function.
 * @mas: the maple state with the allocations.
 * @b_node: the maple_big_node with the type encoding.
 *
 * Pops a node from the allocations in @mas and encodes it with the node type
 * taken from @b_node.  This function exists mainly for code readability.
 *
 * Return: A new maple encoded node
 */
static inline struct maple_enode
*mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_node *fresh = ma_mnode_ptr(mas_pop_node(mas));

        return mt_mk_node(fresh, b_node->type);
}

/*
 * mas_mab_to_node() - Allocate the left node and, when needed, the right and
 * middle nodes for the data held in a big node.
 *
 * @mas: the maple state that contains the allocations.
 * @b_node: the node which contains the data.
 * @left: set to the newly allocated left node (always)
 * @right: set to a new right node when the data does not fit in one node,
 *         otherwise NULL
 * @middle: set to a new middle node when a middle split is calculated (rare),
 *          otherwise NULL
 * @mid_split: set to the split location for the middle node, or 0 if unused
 *
 * Return: the split of left.
 */
static inline unsigned char mas_mab_to_node(struct ma_state *mas,
        struct maple_big_node *b_node, struct maple_enode **left,
        struct maple_enode **right, struct maple_enode **middle,
        unsigned char *mid_split)
{
        unsigned char capacity = mt_slots[b_node->type];
        unsigned char split;

        *mid_split = 0;
        *middle = NULL;
        *right = NULL;
        *left = mas_new_ma_node(mas, b_node);

        if (b_node->b_end >= capacity) {
                /* Too much data for a single node; a right node is needed. */
                split = mab_calc_split(mas, b_node, mid_split);
                *right = mas_new_ma_node(mas, b_node);
        } else {
                /* Everything fits in the left node. */
                split = b_node->b_end;
        }

        if (*mid_split)
                *middle = mas_new_ma_node(mas, b_node);

        return split;
}

/*
 * mab_set_b_end() - Append an entry to the big node and advance its end.
 * @b_node: the big node receiving the entry
 * @mas: the maple state providing the pivot (mas->max) and, for allocation
 *       trees, the gap
 * @entry: the entry to add, if NULL nothing happens.
 *
 * The entry is stored at @b_node->b_end, after which b_end is incremented.
 */
static inline void mab_set_b_end(struct maple_big_node *b_node,
                                 struct ma_state *mas,
                                 void *entry)
{
        if (!entry)
                return;

        b_node->slot[b_node->b_end] = entry;

        /* Gaps are only recorded for allocation trees. */
        if (mt_is_alloc(mas->tree))
                b_node->gap[b_node->b_end] = mas_max_gap(mas);

        b_node->pivot[b_node->b_end] = mas->max;
        b_node->b_end++;
}

/*
 * mas_set_split_parent() - combine_then_separate helper function.  Picks the
 * parent of @mas->node from @left and @right based on @slot and @split.
 *
 * @mas: the maple state with the node that needs a parent
 * @left: possible parent 1, used when *@slot <= @split
 * @right: possible parent 2 (may be NULL), used when *@slot > @split
 * @slot: the slot the mas->node was placed; incremented on return unless @mas
 *        is none
 * @split: the split location between @left and @right
 *
 * Slots past the split are renumbered relative to @right.  Nothing is done,
 * and @slot is left alone, when @mas is none.
 */
static inline void mas_set_split_parent(struct ma_state *mas,
                                        struct maple_enode *left,
                                        struct maple_enode *right,
                                        unsigned char *slot, unsigned char split)
{
        unsigned char pos;

        if (mas_is_none(mas))
                return;

        pos = *slot;
        if (pos > split) {
                if (right)
                        mas_set_parent(mas, mas->node, right, pos - split - 1);
        } else {
                mas_set_parent(mas, mas->node, left, pos);
        }

        (*slot)++;
}

/*
 * mte_mid_split_check() - Check if the next node passes the mid-split
 * @l: Pointer to the encoded node currently used as the left parent.
 * @r: Pointer to the encoded node currently used as the right parent.
 * @right: The right-most encoded maple node.
 * @slot: The offset
 * @split: Pointer to the split location; replaced by @mid_split on a switch.
 * @mid_split: The middle split.
 *
 * When *@r is not yet @right and @slot has reached @mid_split, move the pair
 * along by one: *@l becomes the old *@r, *@r becomes @right and *@split
 * becomes @mid_split.  Otherwise nothing is changed.
 */
static inline void mte_mid_split_check(struct maple_enode **l,
                                       struct maple_enode **r,
                                       struct maple_enode *right,
                                       unsigned char slot,
                                       unsigned char *split,
                                       unsigned char mid_split)
{
        /* Already on the final pair, or not yet past the middle split. */
        if (*r == right || slot < mid_split)
                return;

        *l = *r;
        *r = right;
        *split = mid_split;
}

/*
 * mast_set_split_parents() - Helper function to set three nodes parents.  Slot
 * is taken from @mast->l.
 * @mast: the maple subtree state
 * @left: the left node
 * @middle: the middle node, or NULL if there is none
 * @right: the right node
 * @split: the split location.
 * @mid_split: the split location between the middle and right nodes
 *
 * The nodes in @mast->l, @mast->m and @mast->r occupy consecutive slots
 * starting at @mast->l->offset.  Each one is given whichever of @left,
 * @middle or @right its slot falls into.  Nothing is done when @mast->l is
 * none.
 */
static inline void mast_set_split_parents(struct maple_subtree_state *mast,
                                          struct maple_enode *left,
                                          struct maple_enode *middle,
                                          struct maple_enode *right,
                                          unsigned char split,
                                          unsigned char mid_split)
{
        struct ma_state *children[3] = { mast->l, mast->m, mast->r };
        struct maple_enode *l = left;
        struct maple_enode *r = middle ? middle : right;
        unsigned char slot;
        int i;

        if (mas_is_none(mast->l))
                return;

        slot = mast->l->offset;
        for (i = 0; i < 3; i++) {
                /* Move on to the (middle, right) pair once past mid_split. */
                mte_mid_split_check(&l, &r, right, slot, &split, mid_split);
                mas_set_split_parent(children[i], l, r, &slot, split);
        }
}

/*
 * mas_topiary_node() - Dispose of a single node
 * @mas: The maple state for pushing nodes
 * @tmp_mas: The maple state pointing at the node to dispose of; ignored when
 *           it is none
 * @in_rcu: If the tree is in rcu mode
 *
 * The node is marked dead and then either RCU freed or pushed back on the
 * maple state.
 */
static inline void mas_topiary_node(struct ma_state *mas,
                struct ma_state *tmp_mas, bool in_rcu)
{
        struct maple_enode *dead;
        struct maple_node *node;

        if (mas_is_none(tmp_mas))
                return;

        dead = tmp_mas->node;
        node = mte_to_node(dead);
        mte_set_node_dead(dead);

        if (!in_rcu) {
                mas_push_node(mas, node);
                return;
        }

        ma_free_rcu(node);
}

/*
 * mas_topiary_replace() - Replace the data with new data, then repair the
 * parent links within the new tree.  Iterate over the dead sub-tree and collect
 * the dead subtrees and topiary the nodes that are no longer of use.
 * @mas: The maple state pointing at the new data
 * @old_enode: The maple encoded node being replaced
 *
 * The new tree will have up to three children with the correct parent.  Keep
 * track of the new entries as they need to be followed to find the next level
 * of new entries.
 *
 * The old tree will have up to three children with the old parent.  Keep track
 * of the old entries as they may have more nodes below replaced.  Nodes within
 * [index, last] are dead subtrees, others need to be freed and followed.
 *
 * Both walks proceed one level at a time, tracking at most three maple states
 * per level in tmp[] and building the next level in tmp_next[].
 */
static inline void mas_topiary_replace(struct ma_state *mas,
                struct maple_enode *old_enode)
{
        struct ma_state tmp[3], tmp_next[3];
        MA_TOPIARY(subtrees, mas->tree);
        bool in_rcu;
        int i, n;

        /* Place data in tree & then mark node as old */
        mas_put_in_tree(mas, old_enode);

        /*
         * Pass 1: Update the parent pointers in the tree.  Start from the new
         * node and descend level by level until the leaves are reached.
         */
        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        while (!mte_is_leaf(tmp[0].node)) {
                /* n counts the states gathered for the next level. */
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        /* Gather children until none is found or all 3 are used. */
                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;
                                n++;
                        }

                        mas_adopt_children(&tmp[i], tmp[i].node);
                }

                /* A non-leaf level that yields no child is unexpected. */
                if (MAS_WARN_ON(mas, n == 0))
                        break;

                /* Mark the unused next-level states as none. */
                while (n < 3)
                        tmp_next[n++].status = ma_none;

                for (i = 0; i < 3; i++)
                        tmp[i] = tmp_next[i];
        }

        /* Collect the old nodes that need to be discarded */
        if (mte_is_leaf(old_enode))
                return mas_free(mas, old_enode);

        /*
         * Pass 2: walk the old tree from @old_enode.  Children entirely inside
         * [index, last] are queued on @subtrees; the others are kept so they
         * can be followed on the next level and disposed of individually.
         */
        tmp[0] = *mas;
        tmp[0].offset = 0;
        tmp[0].node = old_enode;
        tmp[1].status = ma_none;
        tmp[2].status = ma_none;
        in_rcu = mt_in_rcu(mas->tree);
        do {
                n = 0;
                for (i = 0; i < 3; i++) {
                        if (mas_is_none(&tmp[i]))
                                continue;

                        while (n < 3) {
                                if (!mas_find_child(&tmp[i], &tmp_next[n]))
                                        break;

                                /*
                                 * index/last are read from tmp_next[0].
                                 * A child fully within that range is a dead
                                 * subtree; its tmp_next[n] entry is reused for
                                 * the next child since n is not advanced.
                                 */
                                if ((tmp_next[n].min >= tmp_next->index) &&
                                    (tmp_next[n].max <= tmp_next->last)) {
                                        mat_add(&subtrees, tmp_next[n].node);
                                        tmp_next[n].status = ma_none;
                                } else {
                                        n++;
                                }
                        }
                }

                /* Nothing left to follow on this level is unexpected. */
                if (MAS_WARN_ON(mas, n == 0))
                        break;

                while (n < 3)
                        tmp_next[n++].status = ma_none;

                /* Dispose of this level's nodes, then step down a level. */
                for (i = 0; i < 3; i++) {
                        mas_topiary_node(mas, &tmp[i], in_rcu);
                        tmp[i] = tmp_next[i];
                }
        } while (!mte_is_leaf(tmp[0].node));

        /* Dispose of whatever remains in tmp[] after the loop. */
        for (i = 0; i < 3; i++)
                mas_topiary_node(mas, &tmp[i], in_rcu);

        /* Finally release the collected dead subtrees. */
        mas_mat_destroy(mas, &subtrees);
}

/*
 * mas_wmb_replace() - Write memory barrier and replace
 * @mas: The maple state
 * @old_enode: The old maple encoded node that is being replaced.
 *
 * Replaces @old_enode with the new data in @mas.  The gap is then updated,
 * unless @mas->node is a leaf.
 */
static inline void mas_wmb_replace(struct ma_state *mas,
                struct maple_enode *old_enode)
{
        /* Insert the new data in the tree */
        mas_topiary_replace(mas, old_enode);

        if (!mte_is_leaf(mas->node))
                mas_update_gap(mas);
}

/*
 * mast_cp_to_nodes() - Copy data out to nodes.
 * @mast: The maple subtree state
 * @left: The left encoded maple node
 * @middle: The middle encoded maple node
 * @right: The right encoded maple node
 * @split: The location to split between left and (middle ? middle : right)
 * @mid_split: The location to split between middle and right.
 */
static inline void mast_cp_to_nodes(struct maple_subtree_state *mast,
        struct maple_enode *left, struct maple_enode *middle,
        struct maple_enode *right, unsigned char split, unsigned char mid_split)
{
        bool new_lmax = true;

        mas_node_or_none(mast->l, left);
        mas_node_or_none(mast->m, middle);
        mas_node_or_none(mast->r, right);

        mast->l->min = mast->orig_l->min;
        if (split == mast->bn->b_end) {
                mast->l->max = mast->orig_r->max;
                new_lmax = false;
        }

        mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax);

        if (middle) {
                mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true);
                mast->m->min = mast->bn->pivot[split] + 1;
                split = mid_split;
        }

        mast->r->max = mast->orig_r->max;
        if (right) {
                mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false);
                mast->r->min = mast->bn->pivot[split] + 1;
        }
}

/*
 * mast_combine_cp_left - Copy in the original left side of the tree into the
 * combined data set in the maple subtree state big node.
 * @mast: The maple subtree state
 *
 * Copies the slots before @mast->orig_l->offset to the start of @mast->bn.
 * Nothing is copied when the offset is zero.
 */
static inline void mast_combine_cp_left(struct maple_subtree_state *mast)
{
        unsigned char offset = mast->orig_l->offset;

        if (offset)
                mas_mab_cp(mast->orig_l, 0, offset - 1, mast->bn, 0);
}

/*
 * mast_combine_cp_right: Copy in the original right side of the tree into the
 * combined data set in the maple subtree state big node.
 * @mast: The maple subtree state
 */
static inline void mast_combine_cp_right(struct maple_subtree_state *mast)
{
        if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max)
                return;

        mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1,
                   mt_slot_count(mast->orig_r->node), mast->bn,
                   mast->bn->b_end);
        mast->orig_r->last = mast->orig_r->max;
}

/*
 * mast_sufficient: Check if the maple subtree state has enough data in the big
 * node to create at least one sufficient node
 * @mast: the maple subtree state
 *
 * Return: true if the big node end is above the minimum slot count of the
 * original left node, false otherwise.
 */
static inline bool mast_sufficient(struct maple_subtree_state *mast)
{
        return mast->bn->b_end > mt_min_slot_count(mast->orig_l->node);
}

/*
 * mast_overflow: Check if there is too much data in the subtree state for a
 * single node.
 * @mast: The maple subtree state
 *
 * Return: true if the big node end is at or beyond the slot count of the
 * original left node, false otherwise.
 */
static inline bool mast_overflow(struct maple_subtree_state *mast)
{
        return mast->bn->b_end >= mt_slot_count(mast->orig_l->node);
}

/*
 * mtree_range_walk() - Walk down from mas->node to the leaf slot covering
 * mas->index.
 * @mas: The maple state
 *
 * Starts at @mas->node with the limits @mas->min and @mas->max and descends
 * through the slot whose pivot range contains @mas->index until a leaf is
 * reached.  On success @mas is updated as follows: node is the leaf, offset is
 * the slot found, end is the data end of the leaf, index and last are the
 * range of that slot, and min and max are the limits of the leaf.
 *
 * If a dead node is seen, @mas is reset.
 *
 * Return: the contents of the leaf slot, or NULL when a dead node was hit.
 */
static inline void *mtree_range_walk(struct ma_state *mas)
{
        struct maple_enode *cur, *child;
        struct maple_node *node;
        enum maple_type type;
        unsigned long *pivots;
        void __rcu **slots;
        unsigned char offset, end;
        unsigned long min, max;
        unsigned long node_min, node_max;

        child = mas->node;
        min = mas->min;
        max = mas->max;
        do {
                cur = child;
                node = mte_to_node(cur);
                type = mte_node_type(cur);
                pivots = ma_pivots(node, type);
                end = ma_data_end(node, type, pivots, max);

                /* Remember this node's limits before narrowing to a slot. */
                node_min = min;
                node_max = max;

                if (pivots[0] >= mas->index) {
                        /* First slot: the lower limit is unchanged. */
                        offset = 0;
                        max = pivots[0];
                } else {
                        /*
                         * Search the remaining pivots.  If none matches,
                         * offset ends at the data end and max is unchanged.
                         */
                        for (offset = 1; offset < end; offset++) {
                                if (pivots[offset] >= mas->index) {
                                        max = pivots[offset];
                                        break;
                                }
                        }

                        min = pivots[offset - 1] + 1;
                }

                slots = ma_slots(node, type);
                child = mt_slot(mas->tree, slots, offset);
                /* Checked after the reads above; a dead node aborts the walk. */
                if (unlikely(ma_dead_node(node)))
                        goto dead_node;
        } while (!ma_is_leaf(type));

        mas->end = end;
        mas->offset = offset;
        mas->index = min;
        mas->last = max;
        mas->min = node_min;
        mas->max = node_max;
        mas->node = cur;
        return (void *)child;

dead_node:
        mas_reset(mas);
        return NULL;
}

/*
 * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers.
 * @mas: The starting maple state.  Used to obtain allocations; on return it
 *       has been moved onto the new sub-tree and re-walked.
 * @mast: The maple_subtree_state, keeps track of 4 maple states.  orig_l,
 *        orig_r and bn are supplied by the caller; l, m and r are pointed at
 *        local maple states by this function.
 * @count: The estimated count of iterations needed.
 *
 * Follow the tree upwards from @mast->orig_l and @mast->orig_r for @count, or
 * until the root is hit.  On each iteration the data in @mast->bn is split
 * into up to three new nodes (left, middle, right).  After ascending, the big
 * node is refilled with the untouched data to the left, the new nodes, and
 * the untouched data to the right, ready for the next iteration.
 *
 * l_mas.depth counts the levels built for the new sub-tree and is copied to
 * @mas->depth, in case the sub-tree becomes the full tree.  Once the loop
 * ends, the new sub-tree replaces the node at @mast->orig_l via
 * mas_wmb_replace().
 */
static void mas_spanning_rebalance(struct ma_state *mas,
                struct maple_subtree_state *mast, unsigned char count)
{
        unsigned char split, mid_split;
        unsigned char slot = 0;
        struct maple_enode *left = NULL, *middle = NULL, *right = NULL;
        struct maple_enode *old_enode;

        MA_STATE(l_mas, mas->tree, mas->index, mas->index);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(m_mas, mas->tree, mas->index, mas->index);

        /*
         * The tree needs to be rebalanced and leaves need to be kept at the same level.
         * Rebalancing is done by use of the ``struct maple_topiary``.
         */
        mast->l = &l_mas;
        mast->m = &m_mas;
        mast->r = &r_mas;
        l_mas.status = r_mas.status = m_mas.status = ma_none;

        /* Check if this is not root and has sufficient data.  */
        if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) &&
            unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type]))
                mast_spanning_rebalance(mast);

        l_mas.depth = 0;

        /*
         * Each level of the tree is examined and balanced, pushing data to the left or
         * right, or rebalancing against left or right nodes is employed to avoid
         * rippling up the tree to limit the amount of churn.  Once a new sub-section of
         * the tree is created, there may be a mix of new and old nodes.  The old nodes
         * will have the incorrect parent pointers and currently be in two trees: the
         * original tree and the partially new tree.  To remedy the parent pointers in
         * the old tree, the new data is swapped into the active tree and a walk down
         * the tree is performed and the parent pointers are updated.
         * See mas_topiary_replace() for more information.
         */
        while (count--) {
                /* b_end is decremented before the big node is split. */
                mast->bn->b_end--;
                mast->bn->type = mte_node_type(mast->orig_l->node);
                /* Allocate left (and maybe right/middle) for this level. */
                split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle,
                                        &mid_split);
                /* Re-parent the previous level's nodes onto the new ones. */
                mast_set_split_parents(mast, left, middle, right, split,
                                       mid_split);
                mast_cp_to_nodes(mast, left, middle, right, split, mid_split);

                /*
                 * Copy data from next level in the tree to mast->bn from next
                 * iteration
                 */
                memset(mast->bn, 0, sizeof(struct maple_big_node));
                mast->bn->type = mte_node_type(left);
                l_mas.depth++;

                /* Root already stored in l->node. */
                if (mas_is_root_limits(mast->l))
                        goto new_root;

                /*
                 * Refill the big node for the parent level: data left of the
                 * rebuilt range, the new node(s), then data right of it.
                 */
                mast_ascend(mast);
                mast_combine_cp_left(mast);
                l_mas.offset = mast->bn->b_end;
                mab_set_b_end(mast->bn, &l_mas, left);
                mab_set_b_end(mast->bn, &m_mas, middle);
                mab_set_b_end(mast->bn, &r_mas, right);

                /* Copy anything necessary out of the right node. */
                mast_combine_cp_right(mast);
                mast->orig_l->last = mast->orig_l->max;

                if (mast_sufficient(mast))
                        continue;

                if (mast_overflow(mast))
                        continue;

                /* May be a new root stored in mast->bn */
                if (mas_is_root_limits(mast->orig_l))
                        break;

                mast_spanning_rebalance(mast);

                /* rebalancing from other nodes may require another loop. */
                if (!count)
                        count++;
        }

        /*
         * The loop ended with data still in mast->bn: place it in one final
         * node and make that node the parent of the last level's new nodes.
         */
        l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)),
                                mte_node_type(mast->orig_l->node));
        l_mas.depth++;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true);
        mas_set_parent(mas, left, l_mas.node, slot);
        if (middle)
                mas_set_parent(mas, middle, l_mas.node, ++slot);

        if (right)
                mas_set_parent(mas, right, l_mas.node, ++slot);

        if (mas_is_root_limits(mast->l)) {
                /* Also entered by goto from inside the loop above. */
new_root:
                mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas));
                /* Ascend the original states to the root being replaced. */
                while (!mte_is_root(mast->orig_l->node))
                        mast_ascend(mast);
        } else {
                /* Not a new root: inherit the parent of the replaced node. */
                mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent;
        }

        /* Point @mas at the new sub-tree, then swap it in for the old node. */
        old_enode = mast->orig_l->node;
        mas->depth = l_mas.depth;
        mas->node = l_mas.node;
        mas->min = l_mas.min;
        mas->max = l_mas.max;
        mas->offset = l_mas.offset;
        mas_wmb_replace(mas, old_enode);
        /* Walk @mas down again from the new node; the result is unused. */
        mtree_range_walk(mas);
        return;
}

/*
 * mas_rebalance() - Rebalance a given node.
 * @mas: The maple state
 * @b_node: The big maple node holding the data of the insufficient node.
 *
 * Rebalance two nodes into a single node or two new nodes that are sufficient.
 * Continue upwards until tree is sufficient.
 */
static inline void mas_rebalance(struct ma_state *mas,
                                struct maple_big_node *b_node)
{
        char levels = mas_mt_height(mas);
        struct maple_subtree_state mast;
        unsigned char shift, end = ++b_node->b_end;

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(__func__, mas);

        /*
         * Rebalancing occurs if a node is insufficient.  Data is rebalanced
         * against the node to the right if it exists, otherwise the node to the
         * left of this node is rebalanced against this node.  If rebalancing
         * causes just one node to be produced instead of two, then the parent
         * is also examined and rebalanced if it is insufficient.  Every level
         * tries to combine the data in the same way.  If one node contains the
         * entire range of the tree, then that node is used as a new root node.
         */

        mast.bn = b_node;
        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        b_node->type = mte_node_type(mas->node);

        r_mas = *mas;
        l_mas = r_mas;

        if (!mas_next_sibling(&r_mas)) {
                /*
                 * No right sibling: shift the big node data right and copy
                 * the left sibling's data in front of it.
                 */
                mas_prev_sibling(&l_mas);
                shift = mas_data_end(&l_mas) + 1;
                mab_shift_right(b_node, shift);
                mas->offset += shift;
                mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0);
                b_node->b_end = shift + end;
                l_mas.index = l_mas.last = l_mas.min;
        } else {
                /* Append the right sibling's data after the existing data. */
                mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, end);
                r_mas.last = r_mas.index = r_mas.max;
        }

        mas_spanning_rebalance(mas, &mast, levels);
}

/*
 * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple
 * state.
 * @mas: The maple state
 * @end: The end of the left-most node.
 *
 * During a mass-insert event (such as forking), it may be necessary to
 * rebalance the left-most node when it is not sufficient.
 *
 * Data is moved from the end of the previous sibling into the front of the
 * node at @mas.  Outside of RCU mode both nodes and the parent pivot are
 * rewritten in place.  In RCU mode the node, the sibling and the parent are
 * each rebuilt in newly popped nodes and the parent is then replaced.
 */
static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end)
{
        enum maple_type mt = mte_node_type(mas->node);
        struct maple_node reuse, *newnode, *parent, *new_left, *left, *node;
        struct maple_enode *eparent, *old_eparent;
        unsigned char offset, tmp, split = mt_slots[mt] / 2;
        void __rcu **l_slots, **slots;
        unsigned long *l_pivs, *pivs, gap;
        bool in_rcu = mt_in_rcu(mas->tree);

        MA_STATE(l_mas, mas->tree, mas->index, mas->last);

        /* l_mas tracks the previous sibling that donates data. */
        l_mas = *mas;
        mas_prev_sibling(&l_mas);

        /* set up node. */
        if (in_rcu) {
                newnode = mas_pop_node(mas);
        } else {
                /* Build in an on-stack scratch node, copied over @node later. */
                newnode = &reuse;
        }

        node = mas_mn(mas);
        newnode->parent = node->parent;
        slots = ma_slots(newnode, mt);
        pivs = ma_pivots(newnode, mt);
        left = mas_mn(&l_mas);
        l_slots = ma_slots(left, mt);
        l_pivs = ma_pivots(left, mt);
        /* Move the split up by one if the slot at the split is empty. */
        if (!l_slots[split])
                split++;
        /* Number of entries after the split in the left node. */
        tmp = mas_data_end(&l_mas) - split;

        /* New node contents: the tail of left, followed by the old node. */
        memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp);
        memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp);
        /*
         * NOTE(review): pivs[tmp] is overwritten by the memcpy to pivs + tmp
         * below whenever end > 0 - confirm the intended index.
         */
        pivs[tmp] = l_mas.max;
        memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end);
        memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end);

        /* The left node now ends at the split; @mas starts right after it. */
        l_mas.max = l_pivs[split];
        mas->min = l_mas.max + 1;
        old_eparent = mt_mk_node(mte_parent(l_mas.node),
                             mas_parent_type(&l_mas, l_mas.node));
        tmp += end;
        if (!in_rcu) {
                unsigned char max_p = mt_pivots[mt];
                unsigned char max_s = mt_slots[mt];

                /* Zero the unused remainder of the scratch node. */
                if (tmp < max_p)
                        memset(pivs + tmp, 0,
                               sizeof(unsigned long) * (max_p - tmp));

                if (tmp < mt_slots[mt])
                        memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp));

                /* Overwrite the existing node with the rebuilt contents. */
                memcpy(node, newnode, sizeof(struct maple_node));
                ma_set_meta(node, mt, 0, tmp - 1);
                /* Parent pivot for the left node becomes the split pivot. */
                mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node),
                              l_pivs[split]);

                /* Remove data from l_pivs. */
                tmp = split + 1;
                memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp));
                memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp));
                ma_set_meta(left, mt, 0, split);
                /* The parent is modified in place; no replacement needed. */
                eparent = old_eparent;

                goto done;
        }

        /* RCU requires replacing both l_mas, mas, and parent. */
        mas->node = mt_mk_node(newnode, mt);
        ma_set_meta(newnode, mt, 0, tmp);

        /* Copy the part of left that is kept into a fresh node. */
        new_left = mas_pop_node(mas);
        new_left->parent = left->parent;
        mt = mte_node_type(l_mas.node);
        slots = ma_slots(new_left, mt);
        pivs = ma_pivots(new_left, mt);
        memcpy(slots, l_slots, sizeof(void *) * split);
        memcpy(pivs, l_pivs, sizeof(unsigned long) * split);
        ma_set_meta(new_left, mt, 0, split);
        l_mas.node = mt_mk_node(new_left, mt);

        /* replace parent. */
        offset = mte_parent_slot(mas->node);
        mt = mas_parent_type(&l_mas, l_mas.node);
        parent = mas_pop_node(mas);
        slots = ma_slots(parent, mt);
        pivs = ma_pivots(parent, mt);
        /* Start from a copy of the old parent, then patch the two children. */
        memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node));
        rcu_assign_pointer(slots[offset], mas->node);
        rcu_assign_pointer(slots[offset - 1], l_mas.node);
        pivs[offset - 1] = l_mas.max;
        eparent = mt_mk_node(parent, mt);
done:
        /* Record the new leaf gaps of both children in the parent. */
        gap = mas_leaf_max_gap(mas);
        mte_set_gap(eparent, mte_parent_slot(mas->node), gap);
        gap = mas_leaf_max_gap(&l_mas);
        mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap);
        mas_ascend(mas);

        if (in_rcu) {
                /* Replace the old parent, then adopt its children. */
                mas_replace_node(mas, old_eparent);
                mas_adopt_children(mas, mas->node);
        }

        mas_update_gap(mas);
}

/*
 * mas_split_final_node() - Split the final node in a subtree operation.
 * @mast: the maple subtree state
 * @mas: The maple state
 * @height: The height of the tree in case it's a new root.
 */
static inline void mas_split_final_node(struct maple_subtree_state *mast,
                                        struct ma_state *mas, int height)
{
        struct maple_enode *ancestor;

        if (mte_is_root(mas->node)) {
                if (mt_is_alloc(mas->tree))
                        mast->bn->type = maple_arange_64;
                else
                        mast->bn->type = maple_range_64;
                mas->depth = height;
        }
        /*
         * Only a single node is used here, could be root.
         * The Big_node data should just fit in a single node.
         */
        ancestor = mas_new_ma_node(mas, mast->bn);
        mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset);
        mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset);
        mte_to_node(ancestor)->parent = mas_mn(mas)->parent;

        mast->l->node = ancestor;
        mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true);
        mas->offset = mast->bn->b_end - 1;
}

/*
 * mast_fill_bnode() - Copy data into the big node in the subtree state
 * @mast: The maple subtree state
 * @mas: the maple state
 * @skip: The number of entries to skip for new nodes insertion.
 */
static inline void mast_fill_bnode(struct maple_subtree_state *mast,
                                         struct ma_state *mas,
                                         unsigned char skip)
{
        bool cp = true;
        unsigned char split;

        memset(mast->bn, 0, sizeof(struct maple_big_node));

        if (mte_is_root(mas->node)) {
                cp = false;
        } else {
                mas_ascend(mas);
                mas->offset = mte_parent_slot(mas->node);
        }

        if (cp && mast->l->offset)
                mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0);

        split = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->l, mast->l->node);
        mast->r->offset = mast->bn->b_end;
        mab_set_b_end(mast->bn, mast->r, mast->r->node);
        if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max)
                cp = false;

        if (cp)
                mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1,
                           mast->bn, mast->bn->b_end);

        mast->bn->b_end--;
        mast->bn->type = mte_node_type(mas->node);
}

/*
 * mast_split_data() - Split the data in the subtree state big node into regular
 * nodes.
 * @mast: The maple subtree state
 * @mas: The maple state
 * @split: The location to split the big node
 */
static inline void mast_split_data(struct maple_subtree_state *mast,
           struct ma_state *mas, unsigned char split)
{
        unsigned char p_slot;

        mab_mas_cp(mast->bn, 0, split, mast->l, true);
        mte_set_pivot(mast->r->node, 0, mast->r->max);
        mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false);
        mast->l->offset = mte_parent_slot(mas->node);
        mast->l->max = mast->bn->pivot[split];
        mast->r->min = mast->l->max + 1;
        if (mte_is_leaf(mas->node))
                return;

        p_slot = mast->orig_l->offset;
        mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node,
                             &p_slot, split);
        mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node,
                             &p_slot, split);
}

/*
 * mas_push_data() - Instead of splitting a node, it is beneficial to push the
 * data to the right or left node if there is room.
 * @mas: The maple state
 * @height: The current height of the maple state
 * @mast: The maple subtree state
 * @left: Push left or not.
 *
 * Keeping the height of the tree low means faster lookups.
 *
 * The sibling on the requested side is looked up in a copy of @mas.  If there
 * is no such sibling, or the combined data would not leave enough room, nothing
 * is modified.  Otherwise the sibling's data is merged into @mast->bn, the
 * result is split across @mast->l and @mast->r, and the parent is rebuilt.
 *
 * Return: True if pushed, false otherwise.
 */
static inline bool mas_push_data(struct ma_state *mas, int height,
                                 struct maple_subtree_state *mast, bool left)
{
        unsigned char slot_total = mast->bn->b_end;
        unsigned char end, space, split;

        MA_STATE(tmp_mas, mas->tree, mas->index, mas->last);
        /* Work on a copy so @mas is untouched if the push is abandoned. */
        tmp_mas = *mas;
        tmp_mas.depth = mast->l->depth;

        if (left && !mas_prev_sibling(&tmp_mas))
                return false;
        else if (!left && !mas_next_sibling(&tmp_mas))
                return false;

        end = mas_data_end(&tmp_mas);
        slot_total += end;
        /* -2 instead of -1 to ensure there isn't a triple split */
        space = 2 * mt_slot_count(mas->node) - 2;
        /* One slot less is allowed for a leaf ... */
        if (ma_is_leaf(mast->bn->type))
                space--;

        /* ... and one less again for a node ending at ULONG_MAX. */
        if (mas->max == ULONG_MAX)
                space--;

        if (slot_total >= space)
                return false;

        /* Get the data; Fill mast->bn */
        mast->bn->b_end++;
        if (left) {
                /* Make room at the front, then copy the left sibling in. */
                mab_shift_right(mast->bn, end + 1);
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0);
                mast->bn->b_end = slot_total + 1;
        } else {
                /* Append the right sibling after the existing data. */
                mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end);
        }

        /* Configure mast for splitting of mast->bn */
        split = mt_slots[mast->bn->type] - 2;
        if (left) {
                /*  Switch mas to prev node  */
                *mas = tmp_mas;
                /* Start using mast->l for the left side. */
                tmp_mas.node = mast->l->node;
                *mast->l = tmp_mas;
        } else {
                /* mast->r takes the sibling's state but keeps its own node. */
                tmp_mas.node = mast->r->node;
                *mast->r = tmp_mas;
                split = slot_total - split;
        }
        split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]);
        /* Update parent slot for split calculation. */
        if (left)
                mast->orig_l->offset += end + 1;

        /* Distribute the data, refill the big node, build the parent node. */
        mast_split_data(mast, mas, split);
        mast_fill_bnode(mast, mas, 2);
        mas_split_final_node(mast, mas, height + 1);
        return true;
}

/*
 * mas_split() - Split data that is too big for one node into two.
 * @mas: The maple state
 * @b_node: The maple big node
 *
 * Works from the current node upwards, one level per loop iteration.  At each
 * level the contents of @b_node are either handed to mas_split_final_node()
 * (when they fit in a single node), taken care of by mas_push_data(), or
 * divided between two freshly allocated nodes, after which @b_node is refilled
 * for the next level up.  When the loop ends the original node is replaced by
 * the new one and @mas is re-walked with mtree_range_walk().
 */
static void mas_split(struct ma_state *mas, struct maple_big_node *b_node)
{
        struct maple_subtree_state mast;
        int height = 0;
        /* mid_split is only an output of mab_calc_split(); it is not read here. */
        unsigned char mid_split, split = 0;
        struct maple_enode *old;

        /*
         * Splitting is handled differently from any other B-tree; the Maple
         * Tree splits upwards.  Splitting up means that the split operation
         * occurs when the walk of the tree hits the leaves and not on the way
         * down.  The reason for splitting up is that it is impossible to know
         * how much space will be needed until the leaf is (or leaves are)
         * reached.  Since overwriting data is allowed and a range could
         * overwrite more than one range or result in changing one entry into 3
         * entries, it is impossible to know if a split is required until the
         * data is examined.
         *
         * Splitting is a balancing act between keeping allocations to a minimum
         * and avoiding a 'jitter' event where a tree is expanded to make room
         * for an entry followed by a contraction when the entry is removed.  To
         * accomplish the balance, there are empty slots remaining in both left
         * and right nodes after a split.
         */
        MA_STATE(l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(r_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last);
        MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last);

        trace_ma_op(__func__, mas);
        mas->depth = mas_mt_height(mas);

        /*
         * l/r describe the nodes being built at the current level; orig_l and
         * orig_r hold copies of l/r taken at the end of the previous iteration.
         */
        mast.l = &l_mas;
        mast.r = &r_mas;
        mast.orig_l = &prev_l_mas;
        mast.orig_r = &prev_r_mas;
        mast.bn = b_node;

        /* height is post-incremented, so it is 1 during the first iteration. */
        while (height++ <= mas->depth) {
                /* The data fits in one node of this type: finish here. */
                if (mt_slots[b_node->type] > b_node->b_end) {
                        mas_split_final_node(&mast, mas, height);
                        break;
                }

                /* Start both sides from a copy of mas, each with a new node. */
                l_mas = r_mas = *mas;
                l_mas.node = mas_new_ma_node(mas, b_node);
                r_mas.node = mas_new_ma_node(mas, b_node);
                /*
                 * Another way that 'jitter' is avoided is to terminate a split up early if the
                 * left or right node has space to spare.  This is referred to as "pushing left"
                 * or "pushing right" and is similar to the B* tree, except the nodes left or
                 * right can rarely be reused due to RCU, but the ripple upwards is halted which
                 * is a significant savings.
                 */
                /* Try to push left. */
                if (mas_push_data(mas, height, &mast, true))
                        break;
                /* Try to push right. */
                if (mas_push_data(mas, height, &mast, false))
                        break;

                /* No push was possible: divide b_node between l and r. */
                split = mab_calc_split(mas, b_node, &mid_split);
                mast_split_data(&mast, mas, split);
                /*
                 * Usually correct, mab_mas_cp in the above call overwrites
                 * r->max.
                 */
                mast.r->max = mas->max;
                /* Refill b_node for the next level up. */
                mast_fill_bnode(&mast, mas, 1);
                /* Remember this level's nodes for the next iteration. */
                prev_l_mas = *mast.l;
                prev_r_mas = *mast.r;
        }

        /* Set the original node as dead */
        old = mas->node;
        mas->node = l_mas.node;
        mas_wmb_replace(mas, old);
        /* Re-walk so mas points at the location of mas->index again. */
        mtree_range_walk(mas);
        return;
}

/*
 * mas_commit_b_node() - Commit the big node into the tree.
 * @wr_mas: The maple write state
 * @b_node: The maple big node
 */
static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas,
                            struct maple_big_node *b_node)
{
        enum store_type type = wr_mas->mas->store_type;

        WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store);

        if (type == wr_rebalance)
                return mas_rebalance(wr_mas->mas, b_node);

        return mas_split(wr_mas->mas, b_node);
}

/*
 * mas_root_expand() - Expand a root to a node
 * @mas: The maple state
 * @entry: The entry to store into the tree
 *
 * Builds a maple_leaf_64 node holding the current root contents (if any) plus
 * @entry over [@mas->index, @mas->last], then installs that node as the tree
 * root.  The node is taken from @mas with mas_pop_node().  On return @mas is
 * active, points at the new node, and @mas->offset is the slot of @entry.
 */
static inline void mas_root_expand(struct ma_state *mas, void *entry)
{
        /* Whatever the root currently holds; may be NULL. */
        void *contents = mas_root_locked(mas);
        enum maple_type type = maple_leaf_64;
        struct maple_node *node;
        void __rcu **slots;
        unsigned long *pivots;
        int slot = 0;

        node = mas_pop_node(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        node->parent = ma_parent_ptr(mas_tree_parent(mas));
        mas->node = mt_mk_node(node, type);
        mas->status = ma_active;

        /* Something precedes the new range: lay out [0, index - 1] first. */
        if (mas->index) {
                if (contents) {
                        /* The previous root value goes into slot 0. */
                        rcu_assign_pointer(slots[slot], contents);
                        /*
                         * With index > 1 a separate slot follows slot 0 and
                         * gets the index - 1 pivot below; pivots[0] itself is
                         * not written on this path.
                         * NOTE(review): presumably relies on the popped node
                         * being zeroed so that slot 0 ends at 0 - confirm.
                         */
                        if (likely(mas->index > 1))
                                slot++;
                }
                pivots[slot++] = mas->index - 1;
        }

        /* Place the new entry and remember where it went. */
        rcu_assign_pointer(slots[slot], entry);
        mas->offset = slot;
        pivots[slot] = mas->last;
        /* Cover the remainder up to ULONG_MAX with one more (unset) slot. */
        if (mas->last != ULONG_MAX)
                pivots[++slot] = ULONG_MAX;

        mas->depth = 1;
        mas_set_height(mas);
        /* slot is now the index of the last slot in use. */
        ma_set_meta(node, maple_leaf_64, 0, slot);
        /* swap the new root into the tree */
        rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        return;
}

/*
 * mas_store_root() - Store a value while the tree has no root node.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * Three outcomes are possible:
 *  - @entry is NULL: the root pointer is cleared, but only when the range
 *    starts at index 0; otherwise nothing is written.
 *  - The range is anything other than [0, 0], or the low two bits of @entry
 *    are 0b10: the root is expanded into a node via mas_root_expand().
 *  - Otherwise @entry is stored directly in the root pointer and the maple
 *    state is put back to ma_start.
 */
static inline void mas_store_root(struct ma_state *mas, void *entry)
{
        if (!entry) {
                if (mas->index == 0)
                        rcu_assign_pointer(mas->tree->ma_root, NULL);
                return;
        }

        if (likely((mas->last != 0) || (mas->index != 0)) ||
            ((unsigned long)entry & 3) == 2) {
                mas_root_expand(mas, entry);
                return;
        }

        /* A plain pointer covering only index 0 can live in the root itself. */
        rcu_assign_pointer(mas->tree->ma_root, entry);
        mas->status = ma_start;
}

/*
 * mas_is_span_wr() - Check if the write needs to be treated as a write that
 * spans the node.
 * @wr_mas: The maple write state
 *
 * Spanning writes are writes that start in one node and end in another OR if
 * the write of a %NULL will cause the node to end with a %NULL.
 *
 * Return: True if this is a spanning write, false otherwise.
 */
static bool mas_is_span_wr(struct ma_wr_state *wr_mas)
{
        unsigned long max = wr_mas->r_max;
        unsigned long last = wr_mas->mas->last;
        enum maple_type type = wr_mas->type;
        void *entry = wr_mas->entry;

        /* Contained in this pivot, fast path */
        if (last < max)
                return false;

        if (ma_is_leaf(type)) {
                max = wr_mas->mas->max;
                if (last < max)
                        return false;
        }

        if (last == max) {
                /*
                 * The last entry of leaf node cannot be NULL unless it is the
                 * rightmost node (writing ULONG_MAX), otherwise it spans slots.
                 */
                if (entry || last == ULONG_MAX)
                        return false;
        }

        trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry);
        return true;
}

static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas)
{
        wr_mas->type = mte_node_type(wr_mas->mas->node);
        mas_wr_node_walk(wr_mas);
        wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type);
}

static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas)
{
        wr_mas->mas->max = wr_mas->r_max;
        wr_mas->mas->min = wr_mas->r_min;
        wr_mas->mas->node = wr_mas->content;
        wr_mas->mas->offset = 0;
        wr_mas->mas->depth++;
}
/*
 * mas_wr_walk() - Walk the tree for a write.
 * @wr_mas: The maple write state
 *
 * Descends one node at a time until a leaf is reached, stopping early as soon
 * as mas_is_span_wr() reports that the write does not fit at the current
 * level.  wr_mas->content is refreshed with the slot contents at each level.
 *
 * Uses mas_slot_locked() and does not need to worry about dead nodes.
 *
 * Return: True if it's contained in a node, false on spanning write.
 */
static bool mas_wr_walk(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                if (unlikely(mas_is_span_wr(wr_mas)))
                        return false;

                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                mas_wr_walk_traverse(wr_mas);
        }

        return true;
}

/*
 * mas_wr_walk_index() - Walk down to a leaf without checking for spanning
 * writes.
 * @wr_mas: The maple write state
 *
 * Same descent as mas_wr_walk() minus the mas_is_span_wr() test, so it always
 * ends on a leaf with wr_mas->content set from the slot that was found there.
 */
static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        for (;;) {
                mas_wr_walk_descend(wr_mas);
                wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
                                                  mas->offset);
                if (ma_is_leaf(wr_mas->type))
                        break;

                mas_wr_walk_traverse(wr_mas);
        }
}
/*
 * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
 * @l_wr_mas: The left maple write state
 * @r_wr_mas: The right maple write state
 *
 * Widens the left state's index/offset downwards and the right state's
 * last/offset upwards so that %NULL slots adjacent to the written range become
 * part of it.  At most one extra slot is absorbed on each side.
 */
static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas,
                                            struct ma_wr_state *r_wr_mas)
{
        struct ma_state *r_mas = r_wr_mas->mas;
        struct ma_state *l_mas = l_wr_mas->mas;
        unsigned char l_slot;

        l_slot = l_mas->offset;
        /* The slot holding index is already NULL: start at its beginning. */
        if (!l_wr_mas->content)
                l_mas->index = l_wr_mas->r_min;

        /*
         * The write starts on a slot boundary and the previous slot is NULL:
         * pull index back to the start of that previous slot.
         */
        if ((l_mas->index == l_wr_mas->r_min) &&
                 (l_slot &&
                  !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) {
                /* Start of slot l_slot - 1 is the pivot before it plus one... */
                if (l_slot > 1)
                        l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1;
                else
                        /* ...or the node minimum when it is the first slot. */
                        l_mas->index = l_mas->min;

                l_mas->offset = l_slot - 1;
        }

        if (!r_wr_mas->content) {
                /* The right slot is NULL: cover it completely and step past it. */
                if (r_mas->last < r_wr_mas->r_max)
                        r_mas->last = r_wr_mas->r_max;
                r_mas->offset++;
        } else if ((r_mas->last == r_wr_mas->r_max) &&
            (r_mas->last < r_mas->max) &&
            !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) {
                /*
                 * The write ends on a slot boundary that is not the end of the
                 * node and the following slot is NULL: absorb that slot too.
                 */
                r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots,
                                             r_wr_mas->type, r_mas->offset + 1);
                r_mas->offset++;
        }
}

/*
 * mas_state_walk() - Start the maple state and walk to mas->index.
 * @mas: The maple state
 *
 * Return: %NULL when mas_start() leaves the state as "none", the value
 * returned by mas_start() when the state is a root pointer, otherwise the
 * result of mtree_range_walk().
 */
static inline void *mas_state_walk(struct ma_state *mas)
{
        void *root_entry = mas_start(mas);

        if (mas_is_none(mas))
                return NULL;

        return mas_is_ptr(mas) ? root_entry : mtree_range_walk(mas);
}

/*
 * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up
 * to date.
 *
 * @mas: The maple state.
 *
 * Note: Leaves mas in undesirable state.
 * Return: The entry for @mas->index or %NULL on dead node.
 */
static inline void *mtree_lookup_walk(struct ma_state *mas)
{
        unsigned long *pivots;
        unsigned char offset;
        struct maple_node *node;
        struct maple_enode *next;
        enum maple_type type;
        void __rcu **slots;
        unsigned char end;

        next = mas->node;
        do {
                node = mte_to_node(next);
                type = mte_node_type(next);
                pivots = ma_pivots(node, type);
                end = mt_pivots[type];
                offset = 0;
                do {
                        if (pivots[offset] >= mas->index)
                                break;
                } while (++offset < end);

                slots = ma_slots(node, type);
                next = mt_slot(mas->tree, slots, offset);
                if (unlikely(ma_dead_node(node)))
                        goto dead_node;
        } while (!ma_is_leaf(type));

        return (void *)next;

dead_node:
        mas_reset(mas);
        return NULL;
}

static void mte_destroy_walk(struct maple_enode *, struct maple_tree *);
/*
 * mas_new_root() - Replace the whole tree with a root that only contains the
 * entry passed in.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * A %NULL @entry empties the tree: height 0, %NULL root pointer, state back to
 * ma_start.  Any other @entry is placed in slot 0 of a new maple_leaf_64 node
 * which becomes the root.  In both cases the previous root, if it was a node,
 * is handed to mte_destroy_walk() afterwards.
 *
 * Only valid when the index == 0 and the last == ULONG_MAX
 */
static inline void mas_new_root(struct ma_state *mas, void *entry)
{
        struct maple_enode *old_root = mas_root_locked(mas);

        WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX);

        if (entry) {
                enum maple_type type = maple_leaf_64;
                struct maple_node *node = mas_pop_node(mas);
                unsigned long *pivots = ma_pivots(node, type);
                void __rcu **slots = ma_slots(node, type);

                node->parent = ma_parent_ptr(mas_tree_parent(mas));
                mas->node = mt_mk_node(node, type);
                mas->status = ma_active;
                rcu_assign_pointer(slots[0], entry);
                pivots[0] = mas->last;
                mas->depth = 1;
                mas_set_height(mas);
                /* Publish the fully initialised node as the new root. */
                rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
        } else {
                mas->depth = 0;
                mas_set_height(mas);
                rcu_assign_pointer(mas->tree->ma_root, entry);
                mas->status = ma_start;
        }

        /* Tear down the tree that used to hang off the root. */
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mas->tree);
}
/*
 * mas_wr_spanning_store() - Create a subtree with the store operation completed
 * and new nodes where necessary, then place the sub-tree in the actual tree.
 * Note that mas is expected to point to the node which caused the store to
 * span.
 * @wr_mas: The maple write state
 *
 * A write covering [0, ULONG_MAX] - either as requested or after %NULL
 * extension - is turned into mas_new_root().  Everything else is gathered into
 * a big node from the left and right leaves and passed on to
 * mas_spanning_rebalance().
 */
static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas)
{
        struct maple_subtree_state mast;
        struct maple_big_node b_node;
        struct ma_state *mas;
        unsigned char height;

        /* Left and Right side of spanning store */
        MA_STATE(l_mas, NULL, 0, 0);
        MA_STATE(r_mas, NULL, 0, 0);
        MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry);
        MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry);

        /*
         * A store operation that spans multiple nodes is called a spanning
         * store and is handled early in the store call stack by the function
         * mas_is_span_wr().  When a spanning store is identified, the maple
         * state is duplicated.  The first maple state walks the left tree path
         * to ``index``, the duplicate walks the right tree path to ``last``.
         * The data in the two nodes are combined into a single node, two nodes,
         * or possibly three nodes (see the 3-way split above).  A ``NULL``
         * written to the last entry of a node is considered a spanning store as
         * a rebalance is required for the operation to complete and an overflow
         * of data may happen.
         */
        mas = wr_mas->mas;
        trace_ma_op(__func__, mas);

        /* Overwriting the entire range: just build a new root. */
        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                return mas_new_root(mas, wr_mas->entry);
        /*
         * Node rebalancing may occur due to this store, so there may be three new
         * entries per level plus a new root.
         */
        height = mas_mt_height(mas);

        /*
         * Set up right side.  Need to get to the next offset after the spanning
         * store to ensure it's not NULL and to combine both the next node and
         * the node with the start together.
         */
        r_mas = *mas;
        /* Avoid overflow, walk to next slot in the tree. */
        if (r_mas.last + 1)
                r_mas.last++;

        /* Walk to the single index just past the store (or ULONG_MAX). */
        r_mas.index = r_mas.last;
        mas_wr_walk_index(&r_wr_mas);
        /* Walk done; put the real end of the store back into r_mas. */
        r_mas.last = r_mas.index = mas->last;

        /* Set up left side. */
        l_mas = *mas;
        mas_wr_walk_index(&l_wr_mas);

        if (!wr_mas->entry) {
                /* Storing NULL: absorb neighbouring NULLs and mirror into mas. */
                mas_extend_spanning_null(&l_wr_mas, &r_wr_mas);
                mas->offset = l_mas.offset;
                mas->index = l_mas.index;
                mas->last = l_mas.last = r_mas.last;
        }

        /* expanding NULLs may make this cover the entire range */
        if (!l_mas.index && r_mas.last == ULONG_MAX) {
                mas_set_range(mas, 0, ULONG_MAX);
                return mas_new_root(mas, wr_mas->entry);
        }

        memset(&b_node, 0, sizeof(struct maple_big_node));
        /* Copy l_mas and store the value in b_node. */
        mas_store_b_node(&l_wr_mas, &b_node, l_mas.end);
        /* Copy r_mas into b_node if there is anything to copy. */
        if (r_mas.max > r_mas.last)
                mas_mab_cp(&r_mas, r_mas.offset, r_mas.end,
                           &b_node, b_node.b_end + 1);
        else
                /* Nothing to copy from the right; only advance b_end. */
                b_node.b_end++;

        /* Stop spanning searches by searching for just index. */
        l_mas.index = l_mas.last = mas->index;

        mast.bn = &b_node;
        mast.orig_l = &l_mas;
        mast.orig_r = &r_mas;
        /* Combine l_mas and r_mas and split them up evenly again. */
        return mas_spanning_rebalance(mas, &mast, height + 1);
}

/*
 * mas_wr_node_store() - Attempt to store the value in a node
 * @wr_mas: The maple write state
 * @new_end: The offset of the last used slot once the store is complete
 *
 * Rebuilds the node contents with the new entry in place.  When the tree is in
 * RCU mode the rebuild happens in a node taken from mas_pop_node() which then
 * replaces the old node; otherwise it happens in an on-stack node that is
 * copied back over the existing node.
 *
 * Attempts to reuse the node, but may allocate.
 */
static inline void mas_wr_node_store(struct ma_wr_state *wr_mas,
                                     unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **dst_slots;
        unsigned long *dst_pivots;
        unsigned char dst_offset, offset_end = wr_mas->offset_end;
        /* reuse is the on-stack scratch node for the non-RCU case. */
        struct maple_node reuse, *newnode;
        unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type];
        bool in_rcu = mt_in_rcu(mas->tree);

        if (mas->last == wr_mas->end_piv)
                offset_end++; /* don't copy this offset */
        else if (unlikely(wr_mas->r_max == ULONG_MAX))
                mas_bulk_rebalance(mas, mas->end, wr_mas->type);

        /* set up node. */
        if (in_rcu) {
                newnode = mas_pop_node(mas);
        } else {
                memset(&reuse, 0, sizeof(struct maple_node));
                newnode = &reuse;
        }

        newnode->parent = mas_mn(mas)->parent;
        dst_pivots = ma_pivots(newnode, wr_mas->type);
        dst_slots = ma_slots(newnode, wr_mas->type);
        /* Copy from start to insert point */
        memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset);
        memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset);

        /* Handle insert of new range starting after old range */
        if (wr_mas->r_min < mas->index) {
                /* Keep the head of the old range, now ending at index - 1. */
                rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content);
                dst_pivots[mas->offset++] = mas->index - 1;
        }

        /* Store the new entry and range end. */
        /* The slot past the last pivot has no pivot to write. */
        if (mas->offset < node_pivots)
                dst_pivots[mas->offset] = mas->last;
        rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry);

        /*
         * this range wrote to the end of the node or it overwrote the rest of
         * the data
         */
        if (offset_end > mas->end)
                goto done;

        dst_offset = mas->offset + 1;
        /* Copy to the end of node if necessary. */
        copy_size = mas->end - offset_end + 1;
        memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end,
               sizeof(void *) * copy_size);
        /* One pivot fewer than slots is copied; the last one is set below. */
        memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end,
               sizeof(unsigned long) * (copy_size - 1));

        if (new_end < node_pivots)
                dst_pivots[new_end] = mas->max;

done:
        mas_leaf_set_meta(newnode, maple_leaf_64, new_end);
        if (in_rcu) {
                struct maple_enode *old_enode = mas->node;

                /* Swap the rebuilt node in for the old one. */
                mas->node = mt_mk_node(newnode, wr_mas->type);
                mas_replace_node(mas, old_enode);
        } else {
                /* Not in RCU mode: overwrite the existing node in place. */
                memcpy(wr_mas->node, newnode, sizeof(struct maple_node));
        }
        trace_ma_write(__func__, mas, 0, wr_mas->entry);
        mas_update_gap(mas);
        mas->end = new_end;
        return;
}

/*
 * mas_wr_slot_store: Attempt to store a value in a slot.
 * @wr_mas: the maple write state
 *
 * Writes the entry directly into the existing node by changing one slot and
 * one or two pivots; the number of slots in use is not changed here.  The
 * variant that rewrites two pivots warns if the tree is in RCU mode.
 */
static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char offset = mas->offset;
        void __rcu **slots = wr_mas->slots;
        /* Set when any slot touched by the store is currently empty. */
        bool gap = false;

        gap |= !mt_slot_locked(mas->tree, slots, offset);
        gap |= !mt_slot_locked(mas->tree, slots, offset + 1);

        /* The store touches exactly two neighbouring slots. */
        if (wr_mas->offset_end - offset == 1) {
                if (mas->index == wr_mas->r_min) {
                        /* Overwriting the range and a part of the next one */
                        rcu_assign_pointer(slots[offset], wr_mas->entry);
                        wr_mas->pivots[offset] = mas->last;
                } else {
                        /* Overwriting a part of the range and the next one */
                        rcu_assign_pointer(slots[offset + 1], wr_mas->entry);
                        wr_mas->pivots[offset] = mas->index - 1;
                        mas->offset++; /* Keep mas accurate. */
                }
        } else {
                /* Two pivots change below; not expected in RCU mode. */
                WARN_ON_ONCE(mt_in_rcu(mas->tree));
                /*
                 * Expand the range, only partially overwriting the previous and
                 * next ranges
                 */
                gap |= !mt_slot_locked(mas->tree, slots, offset + 2);
                rcu_assign_pointer(slots[offset + 1], wr_mas->entry);
                wr_mas->pivots[offset] = mas->index - 1;
                wr_mas->pivots[offset + 1] = mas->last;
                mas->offset++; /* Keep mas accurate. */
        }

        trace_ma_write(__func__, mas, 0, wr_mas->entry);
        /*
         * Only update gap when the new entry is empty or there is an empty
         * entry in the original two ranges.
         */
        if (!wr_mas->entry || gap)
                mas_update_gap(mas);

        return;
}

/*
 * mas_wr_extend_null() - Widen a write so it swallows adjacent empty slots.
 * @wr_mas: The maple write state
 *
 * Called from mas_wr_store_type() when the entry being stored is NULL.  Moves
 * mas->last (with wr_mas->offset_end and wr_mas->end_piv) forwards and
 * mas->index (with mas->offset, wr_mas->r_min and wr_mas->r_max) backwards
 * over empty slots bordering the write.
 */
static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        if (!wr_mas->slots[wr_mas->offset_end]) {
                /* If this one is null, the next and prev are not */
                mas->last = wr_mas->end_piv;
        } else {
                /* Check next slot(s) if we are overwriting the end */
                if ((mas->last == wr_mas->end_piv) &&
                    (mas->end != wr_mas->offset_end) &&
                    !wr_mas->slots[wr_mas->offset_end + 1]) {
                        /* Take in the empty slot that follows. */
                        wr_mas->offset_end++;
                        /* The node's last slot ends at mas->max, not a pivot. */
                        if (wr_mas->offset_end == mas->end)
                                mas->last = mas->max;
                        else
                                mas->last = wr_mas->pivots[wr_mas->offset_end];
                        wr_mas->end_piv = mas->last;
                }
        }

        if (!wr_mas->content) {
                /* If this one is null, the next and prev are not */
                mas->index = wr_mas->r_min;
        } else {
                /* Check prev slot if we are overwriting the start */
                if (mas->index == wr_mas->r_min && mas->offset &&
                    !wr_mas->slots[mas->offset - 1]) {
                        /* Step back onto the empty slot and adopt its range. */
                        mas->offset--;
                        wr_mas->r_min = mas->index =
                                mas_safe_min(mas, wr_mas->pivots, mas->offset);
                        wr_mas->r_max = wr_mas->pivots[mas->offset];
                }
        }
}

static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas)
{
        while ((wr_mas->offset_end < wr_mas->mas->end) &&
               (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end]))
                wr_mas->offset_end++;

        if (wr_mas->offset_end < wr_mas->mas->end)
                wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end];
        else
                wr_mas->end_piv = wr_mas->mas->max;
}

/*
 * mas_wr_new_end() - Compute the node's last used offset after the write.
 * @wr_mas: The maple write state
 *
 * Starts from the current end plus two, subtracts the distance between the
 * first and last slot touched by the write, and subtracts one for each side of
 * the write that lines up exactly with an existing range boundary.
 *
 * Return: The offset of the last slot once the write has been applied.
 */
static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char replaced = wr_mas->offset_end - mas->offset;
        unsigned char result = mas->end + 2 - replaced;

        /* No leftover piece in front of the new range. */
        if (mas->index == wr_mas->r_min)
                result--;

        /* No leftover piece behind the new range. */
        if (mas->last == wr_mas->end_piv)
                result--;

        return result;
}

/*
 * mas_wr_append: Attempt to append
 * @wr_mas: the maple write state
 * @new_end: The end of the node after the modification
 *
 * Splits the last range of the node in place.  Depending on where the new
 * range sits inside the old last range, the new entry ends up in the last
 * slot, in the old last slot with the old contents moved behind it, or in the
 * middle with the old contents on both sides.
 *
 * This is currently unsafe in rcu mode since the end of the node may be cached
 * by readers while the node contents may be updated which could result in
 * inaccurate information.
 */
static inline void mas_wr_append(struct ma_wr_state *wr_mas,
                unsigned char new_end)
{
        struct ma_state *mas = wr_mas->mas;
        void __rcu **slots;
        unsigned char end = mas->end;

        /*
         * Carry the old final pivot over to the new final slot and record the
         * new end.  Skipped when new_end has no pivot of its own.
         */
        if (new_end < mt_pivots[wr_mas->type]) {
                wr_mas->pivots[new_end] = wr_mas->pivots[end];
                ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end);
        }

        slots = wr_mas->slots;
        /* Exactly one slot is being added. */
        if (new_end == end + 1) {
                if (mas->last == wr_mas->r_max) {
                        /* Append to end of range */
                        rcu_assign_pointer(slots[new_end], wr_mas->entry);
                        wr_mas->pivots[end] = mas->index - 1;
                        mas->offset = new_end;
                } else {
                        /* Append to start of range */
                        rcu_assign_pointer(slots[new_end], wr_mas->content);
                        wr_mas->pivots[end] = mas->last;
                        rcu_assign_pointer(slots[end], wr_mas->entry);
                }
        } else {
                /* Append to the range without touching any boundaries. */
                /* Old contents stay in slot end and are repeated at new_end. */
                rcu_assign_pointer(slots[new_end], wr_mas->content);
                wr_mas->pivots[end + 1] = mas->last;
                rcu_assign_pointer(slots[end + 1], wr_mas->entry);
                wr_mas->pivots[end] = mas->index - 1;
                mas->offset = end + 1;
        }

        /* Gaps only change when an empty range is involved. */
        if (!wr_mas->content || !wr_mas->entry)
                mas_update_gap(mas);

        mas->end = new_end;
        trace_ma_write(__func__, mas, new_end, wr_mas->entry);
        return;
}

/*
 * mas_wr_bnode() - Slow path for a modification.
 * @wr_mas: The write maple state
 *
 * This is where split, rebalance end up.
 */
static void mas_wr_bnode(struct ma_wr_state *wr_mas)
{
        struct maple_big_node b_node;

        trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry);
        memset(&b_node, 0, sizeof(struct maple_big_node));
        mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end);
        mas_commit_b_node(wr_mas, &b_node);
}

/*
 * mas_wr_store_entry() - Internal call to store a value
 * @wr_mas: The maple write state
 *
 * Carries out the store according to mas->store_type, which must have been
 * determined beforehand.  wr_invalid is treated as a bug.
 */
static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        void *entry = wr_mas->entry;

        switch (mas->store_type) {
        case wr_invalid:
                MT_BUG_ON(mas->tree, 1);
                return;
        case wr_new_root:
                mas_new_root(mas, entry);
                return;
        case wr_store_root:
                mas_store_root(mas, entry);
                return;
        case wr_exact_fit:
                rcu_assign_pointer(wr_mas->slots[mas->offset], entry);
                /* Gaps only change when empty <-> occupied flips. */
                if (!entry != !wr_mas->content)
                        mas_update_gap(mas);
                return;
        case wr_append:
                mas_wr_append(wr_mas, mas_wr_new_end(wr_mas));
                return;
        case wr_slot_store:
                mas_wr_slot_store(wr_mas);
                return;
        case wr_node_store:
                mas_wr_node_store(wr_mas, mas_wr_new_end(wr_mas));
                return;
        case wr_spanning_store:
                mas_wr_spanning_store(wr_mas);
                return;
        case wr_split_store:
        case wr_rebalance:
                mas_wr_bnode(wr_mas);
                return;
        }
}

static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;

        if (!mas_is_active(mas)) {
                if (mas_is_start(mas))
                        goto set_content;

                if (unlikely(mas_is_paused(mas)))
                        goto reset;

                if (unlikely(mas_is_none(mas)))
                        goto reset;

                if (unlikely(mas_is_overflow(mas)))
                        goto reset;

                if (unlikely(mas_is_underflow(mas)))
                        goto reset;
        }

        /*
         * A less strict version of mas_is_span_wr() where we allow spanning
         * writes within this node.  This is to stop partial walks in
         * mas_prealloc() from being reset.
         */
        if (mas->last > mas->max)
                goto reset;

        if (wr_mas->entry)
                goto set_content;

        if (mte_is_leaf(mas->node) && mas->last == mas->max)
                goto reset;

        goto set_content;

reset:
        mas_reset(mas);
set_content:
        wr_mas->content = mas_start(mas);
}

/**
 * mas_prealloc_calc() - Calculate the number of nodes needed for a given
 * store operation
 * @mas: The maple state
 * @entry: The entry to store into the tree
 *
 * The count depends on mas->store_type.  Stores that modify a node in place
 * need none, a node store needs one only in RCU mode, and the restructuring
 * stores scale with the tree height.
 *
 * Return: Number of nodes required for preallocation.
 */
static inline int mas_prealloc_calc(struct ma_state *mas, void *entry)
{
        switch (mas->store_type) {
        case wr_invalid:
                WARN_ON_ONCE(1);
                /* Fall back to the largest estimate. */
                return mas_mt_height(mas) * 3 + 1;
        case wr_new_root:
                return 1;
        case wr_store_root:
                /* Mirrors mas_store_root(): a node is needed on expansion. */
                if (likely((mas->last != 0) || (mas->index != 0)))
                        return 1;
                if (((unsigned long)entry & 3) == 2)
                        return 1;
                return 0;
        case wr_spanning_store:
                return mas_mt_height(mas) * 3 + 1;
        case wr_split_store:
                return mas_mt_height(mas) * 2 + 1;
        case wr_rebalance:
                return mas_mt_height(mas) * 2 - 1;
        case wr_node_store:
                return mt_in_rcu(mas->tree) ? 1 : 0;
        case wr_append:
        case wr_exact_fit:
        case wr_slot_store:
                return 0;
        }

        /* Store type outside the enumeration: use the largest estimate. */
        return mas_mt_height(mas) * 3 + 1;
}

/*
 * mas_wr_store_type() - Determine the store type for a given
 * store operation.
 * @wr_mas: The maple write state
 *
 * Return: the type of store needed for the operation
 */
static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas)
{
        struct ma_state *mas = wr_mas->mas;
        unsigned char new_end;

        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                return wr_store_root;

        if (unlikely(!mas_wr_walk(wr_mas)))
                return wr_spanning_store;

        /* At this point, we are at the leaf node that needs to be altered. */
        mas_wr_end_piv(wr_mas);
        if (!wr_mas->entry)
                mas_wr_extend_null(wr_mas);

        if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last))
                return wr_exact_fit;

        if (unlikely(!mas->index && mas->last == ULONG_MAX))
                return wr_new_root;

        new_end = mas_wr_new_end(wr_mas);
        /* Potential spanning rebalance collapsing a node */
        if (new_end < mt_min_slots[wr_mas->type]) {
                if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK))
                        return  wr_rebalance;
                return wr_node_store;
        }

        if (new_end >= mt_slots[wr_mas->type])
                return wr_split_store;

        if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end))
                return wr_append;

        if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) ||
                (wr_mas->offset_end - mas->offset == 1)))
                return wr_slot_store;

        return wr_node_store;
}

/**
 * mas_wr_preallocate() - Preallocate enough nodes for a store operation
 * @wr_mas: The maple write state
 * @entry: The entry that will be stored
 *
 * Prepares the maple state, records the resulting store type in
 * mas->store_type, and requests the number of nodes that store type calls for.
 * Nothing is requested when that number is zero.
 */
static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry)
{
        struct ma_state *mas = wr_mas->mas;
        int nodes;

        mas_wr_prealloc_setup(wr_mas);
        mas->store_type = mas_wr_store_type(wr_mas);

        nodes = mas_prealloc_calc(mas, entry);
        if (nodes)
                mas_node_count(mas, nodes);
}

/**
 * mas_insert() - Internal call to insert a value
 * @mas: The maple state
 * @entry: The entry to store
 *
 * Inserting a new range adds 0, 1, or 2 pivots to the tree:
 *  - 0 when the range exactly fills an existing NULL gap, so only the slot
 *    is written;
 *  - 1 when the range is adjacent to another range;
 *  - 2 when the range sits inside a gap without touching a neighbour (the
 *    start - 1 and the end).
 * The entry itself is always written.  In RCU mode most operations allocate
 * a new node to replace the existing one; the exception is appending at the
 * end of a node, which, when done carefully, can reuse the node in place.
 *
 * Return: %NULL or the contents that already exists at the requested index
 * otherwise.  The maple state needs to be checked for error conditions.
 */
static inline void *mas_insert(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        wr_mas.content = mas_start(mas);
        if (wr_mas.content)
                goto exists;

        mas_wr_preallocate(&wr_mas, entry);
        if (mas_is_err(mas))
                return NULL;

        switch (mas->store_type) {
        case wr_spanning_store:
                /* spanning writes always overwrite something */
                goto exists;
        case wr_new_root:
        case wr_store_root:
                break;
        default:
                /* At this point, we are at the leaf node that needs to be altered. */
                wr_mas.offset_end = mas->offset;
                wr_mas.end_piv = wr_mas.r_max;

                /* Occupied, or the new range runs past the found range. */
                if (wr_mas.content || mas->last > wr_mas.r_max)
                        goto exists;
                break;
        }

        mas_wr_store_entry(&wr_mas);
        return wr_mas.content;

exists:
        mas_set_err(mas, -EEXIST);
        return wr_mas.content;
}

/**
 * mas_alloc_cyclic() - Internal call to find somewhere to store an entry
 * @mas: The maple state.
 * @startp: Pointer to ID.
 * @entry: The entry to store.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * The search starts at the larger of @range_lo and *@next.  If nothing is
 * free there and the search did not start at @range_lo, it is retried from
 * @range_lo and a success is reported as wrapped.  On success *@startp is
 * the index used and *@next is that index plus one.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, or a negative error code: the result
 * of the failed mas_empty_area() search (for example -EBUSY if there are no
 * free entries) or the error left in @mas by the insert.
 */
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        unsigned long lowest = range_lo;
        int ret;

        range_lo = max(lowest, *next);
        ret = mas_empty_area(mas, range_lo, range_hi, 1);

        /* A previous allocation wrapped *next to 0; report that now. */
        if (ret == 0 && (mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED)) {
                mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED;
                ret = 1;
        }

        if (ret < 0) {
                /* Nothing below the starting point is left to try. */
                if (range_lo <= lowest)
                        return ret;

                /* Wrap around and search from the real lower bound. */
                mas_reset(mas);
                ret = mas_empty_area(mas, lowest, range_hi, 1);
                if (ret < 0)
                        return ret;

                if (ret == 0)
                        ret = 1;
        }

        /* Retry the insert for as long as mas_nomem() asks for it. */
        do {
                mas_insert(mas, entry);
        } while (mas_nomem(mas, gfp));

        if (mas_is_err(mas))
                return xa_err(mas->node);

        *startp = mas->index;
        *next = *startp + 1;

        /* *next overflowed to 0: remember it for the next call. */
        if (!*next)
                mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED;

        mas_destroy(mas);
        return ret;
}
EXPORT_SYMBOL(mas_alloc_cyclic);

/*
 * mas_rewalk() - Restart the walk to @index.
 * @mas: The maple state
 * @index: The index to walk to
 *
 * Resets the state to @index and walks again, repeating for as long as the
 * walk leaves the state in ma_start.
 */
static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index)
{
        do {
                mas_set(mas, index);
                mas_state_walk(mas);
        } while (mas_is_start(mas));
}

/*
 * mas_rewalk_if_dead() - Rewalk to @index when @node has been marked dead.
 * @mas: The maple state
 * @node: The node to test
 * @index: The index to rewalk to
 *
 * Return: true if the node was dead and the state was rewalked, false
 * otherwise.
 */
static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas,
                struct maple_node *node, const unsigned long index)
{
        if (likely(!ma_dead_node(node)))
                return false;

        mas_rewalk(mas, index);
        return true;
}

/*
 * mas_prev_node() - Move the maple state to the previous node at the same
 * level in the tree.
 * @mas: The maple state
 * @min: The lower limit to search
 *
 * On success mas->node is the previous node, mas->max is the old mas->min - 1,
 * mas->min is taken from the parent pivot (when the parent offset is not 0)
 * and mas->offset and mas->end are both set to the data end of the new node.
 * If there is no previous node (the current node starts at 0, the previous
 * range lies below @min, or the walk up reaches the root), the status is set
 * to ma_underflow instead.
 *
 * Return: 1 if the node is dead, 0 otherwise.
 */
static int mas_prev_node(struct ma_state *mas, unsigned long min)
{
        enum maple_type mt;
        int offset, level;
        void __rcu **slots;
        struct maple_node *node;
        unsigned long *pivots;
        unsigned long max;

        node = mas_mn(mas);
        /* Nothing can precede a node whose range starts at 0. */
        if (!mas->min)
                goto no_entry;

        /* The previous node ends right before this one starts. */
        max = mas->min - 1;
        if (max < min)
                goto no_entry;

        /* Climb until we arrive from a slot that has a left sibling. */
        level = 0;
        do {
                if (ma_is_root(node))
                        goto no_entry;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;
                offset = mas->offset;
                level++;
                node = mas_mn(mas);
        } while (!offset);

        /* Step to the left sibling, then descend along the last used slots. */
        offset--;
        mt = mte_node_type(mas->node);
        while (level > 1) {
                level--;
                slots = ma_slots(node, mt);
                mas->node = mas_slot(mas, slots, offset);
                /* Re-check the node the slot was just read from. */
                if (unlikely(ma_dead_node(node)))
                        return 1;

                mt = mte_node_type(mas->node);
                node = mas_mn(mas);
                pivots = ma_pivots(node, mt);
                offset = ma_data_end(node, mt, pivots, max);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* node is now the parent of the target; offset selects the target. */
        slots = ma_slots(node, mt);
        mas->node = mas_slot(mas, slots, offset);
        pivots = ma_pivots(node, mt);
        if (unlikely(ma_dead_node(node)))
                return 1;

        /* With offset 0 there is no lower pivot; mas->min is left as is. */
        if (likely(offset))
                mas->min = pivots[offset - 1] + 1;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        if (unlikely(mte_dead_node(mas->node)))
                return 1;

        mas->end = mas->offset;
        return 0;

no_entry:
        /* A dead node takes precedence over reporting underflow. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_underflow;
        return 0;
}

/*
 * mas_prev_slot() - Get the entry in the previous slot
 *
 * @mas: The maple state
 * @min: The minimum starting range
 * @empty: Can be empty
 *
 * Steps back one slot, moving to the previous node when the current offset
 * is 0, and updates mas->index and mas->last to the range of the slot found.
 * When @empty is false, NULL slots are skipped until an entry is found or
 * @min is reached.  Whenever a dead node is detected the state is rewalked
 * to the index the function was entered with and the operation restarts.
 * When the lower limit is hit, mas->status is set to ma_underflow.
 *
 * Return: The entry in the previous slot which is possibly NULL
 */
static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty)
{
        void *entry;
        void __rcu **slots;
        unsigned long pivot;
        enum maple_type type;
        unsigned long *pivots;
        struct maple_node *node;
        /* Where to restart from if a dead node is encountered. */
        unsigned long save_point = mas->index;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* The node reaches down to the limit: is the current slot already there? */
        if (mas->min <= min) {
                pivot = mas_safe_min(mas, pivots, mas->offset);

                /* Validate the node after reading from it. */
                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot <= min)
                        goto underflow;
        }

again:
        if (likely(mas->offset)) {
                /* Previous slot in the same node. */
                mas->offset--;
                mas->last = mas->index - 1;
                mas->index = mas_safe_min(mas, pivots, mas->offset);
        } else  {
                /* First slot of the node: move to the previous node. */
                if (mas->index <= min)
                        goto underflow;

                if (mas_prev_node(mas, min)) {
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_underflow(mas)))
                        return NULL;

                /* mas_prev_node() left mas->offset at the end of the new node. */
                mas->last = mas->max;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                /* NOTE(review): indexes pivots[offset - 1]; assumes offset is not 0 here - confirm. */
                mas->index = pivots[mas->offset - 1] + 1;
        }

        slots = ma_slots(node, type);
        entry = mas_slot(mas, slots, mas->offset);
        /* The entry is only trustworthy if the node is still alive. */
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;


        if (likely(entry))
                return entry;

        /* NULL slot: keep going backwards unless the caller accepts empty. */
        if (!empty) {
                if (mas->index <= min) {
                        mas->status = ma_underflow;
                        return NULL;
                }

                goto again;
        }

        return entry;

underflow:
        mas->status = ma_underflow;
        return NULL;
}

/*
 * mas_next_node() - Get the next node at the same level in the tree.
 * @mas: The maple state
 * @node: The maple node
 * @max: The maximum pivot value to check.
 *
 * On success mas->node is the next node, mas->min is the old mas->max + 1,
 * mas->max is taken from the parent pivot and mas->end is the data end of
 * the new node.  mas->offset is reset to 0 only when more than one level was
 * climbed; mas_next_slot() sets it to 0 itself after calling this function.
 * If the current node already reaches @max, or the walk up reaches the root,
 * the status is set to ma_overflow instead.
 *
 * Return: 1 on dead node, 0 otherwise.
 */
static int mas_next_node(struct ma_state *mas, struct maple_node *node,
                unsigned long max)
{
        unsigned long min;
        unsigned long *pivots;
        struct maple_enode *enode;
        struct maple_node *tmp;
        int level = 0;
        unsigned char node_end;
        enum maple_type mt;
        void __rcu **slots;

        /* The current node already covers the search limit. */
        if (mas->max >= max)
                goto overflow;

        /* The next node starts right after this one ends. */
        min = mas->max + 1;
        /* level already holds 0 from its initialiser. */
        level = 0;
        /* Climb until we arrive from a slot that has a right sibling. */
        do {
                if (ma_is_root(node))
                        goto overflow;

                /* Walk up. */
                if (unlikely(mas_ascend(mas)))
                        return 1;

                level++;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                pivots = ma_pivots(node, mt);
                node_end = ma_data_end(node, mt, pivots, mas->max);
                if (unlikely(ma_dead_node(node)))
                        return 1;

        } while (unlikely(mas->offset == node_end));

        /* Step to the right sibling. */
        slots = ma_slots(node, mt);
        mas->offset++;
        enode = mas_slot(mas, slots, mas->offset);
        if (unlikely(ma_dead_node(node)))
                return 1;

        /* Descending below the sibling always follows slot 0. */
        if (level > 1)
                mas->offset = 0;

        while (unlikely(level > 1)) {
                level--;
                mas->node = enode;
                node = mas_mn(mas);
                mt = mte_node_type(mas->node);
                slots = ma_slots(node, mt);
                enode = mas_slot(mas, slots, 0);
                if (unlikely(ma_dead_node(node)))
                        return 1;
        }

        /* After a descent, pivots must be re-read from the new parent. */
        if (!mas->offset)
                pivots = ma_pivots(node, mt);

        mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt);
        tmp = mte_to_node(enode);
        mt = mte_node_type(enode);
        pivots = ma_pivots(tmp, mt);
        mas->end = ma_data_end(tmp, mt, pivots, mas->max);
        /* node is the parent that enode and the pivot were read from. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->node = enode;
        mas->min = min;
        return 0;

overflow:
        /* A dead node takes precedence over reporting overflow. */
        if (unlikely(ma_dead_node(node)))
                return 1;

        mas->status = ma_overflow;
        return 0;
}

/*
 * mas_next_slot() - Get the entry in the next slot
 *
 * @mas: The maple state
 * @max: The maximum starting range
 * @empty: Can be empty
 *
 * Steps forward one slot, moving to the next node when the current offset is
 * the last one, and updates mas->index and mas->last to the range of the
 * slot found.  When @empty is false, NULL slots are skipped until an entry
 * is found or @max is reached.  Whenever a dead node is detected the state
 * is rewalked to the mas->last value the function was entered with and the
 * operation restarts.  When the upper limit is hit, mas->status is set to
 * ma_overflow.
 *
 * Return: The entry in the next slot which is possibly NULL
 */
static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty)
{
        void __rcu **slots;
        unsigned long *pivots;
        unsigned long pivot;
        enum maple_type type;
        struct maple_node *node;
        /* Where to restart from if a dead node is encountered. */
        unsigned long save_point = mas->last;
        void *entry;

retry:
        node = mas_mn(mas);
        type = mte_node_type(mas->node);
        pivots = ma_pivots(node, type);
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        /* The node reaches up to the limit: is the current slot already there? */
        if (mas->max >= max) {
                if (likely(mas->offset < mas->end))
                        pivot = pivots[mas->offset];
                else
                        pivot = mas->max;

                /* Validate the node after reading from it. */
                if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                        goto retry;

                if (pivot >= max) { /* Was at the limit, next will extend beyond */
                        mas->status = ma_overflow;
                        return NULL;
                }
        }

        if (likely(mas->offset < mas->end)) {
                /* Next slot in the same node. */
                mas->index = pivots[mas->offset] + 1;
again:
                /* Also entered from below when skipping a NULL slot. */
                mas->offset++;
                if (likely(mas->offset < mas->end))
                        mas->last = pivots[mas->offset];
                else
                        mas->last = mas->max;
        } else  {
                /* Last slot of the node: move to the next node. */
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                if (mas_next_node(mas, node, max)) {
                        mas_rewalk(mas, save_point);
                        goto retry;
                }

                if (WARN_ON_ONCE(mas_is_overflow(mas)))
                        return NULL;

                /* Start at the first slot of the new node. */
                mas->offset = 0;
                mas->index = mas->min;
                node = mas_mn(mas);
                type = mte_node_type(mas->node);
                pivots = ma_pivots(node, type);
                mas->last = pivots[0];
        }

        slots = ma_slots(node, type);
        entry = mt_slot(mas->tree, slots, mas->offset);
        /* The entry is only trustworthy if the node is still alive. */
        if (unlikely(mas_rewalk_if_dead(mas, node, save_point)))
                goto retry;

        if (entry)
                return entry;


        /* NULL slot: keep going forwards unless the caller accepts empty. */
        if (!empty) {
                if (mas->last >= max) {
                        mas->status = ma_overflow;
                        return NULL;
                }

                mas->index = mas->last + 1;
                goto again;
        }

        return entry;
}

/*
 * mas_rev_awalk() - Internal function.  Reverse allocation walk.  Find the
 * highest gap address of a given size in a given node and descend.
 * @mas: The maple state
 * @size: The needed size.
 * @gap_min: Set to the start of the gap when one is found in a leaf.
 * @gap_max: Set to the end of the gap when one is found in a leaf.
 *
 * Scans the slots of the current node from mas->offset downwards, looking
 * for a gap of at least @size inside mas->index - mas->last.  In a non-leaf
 * node a match descends into that child; running out of slots leaves the
 * state untouched so the caller can rewind, unless the node is the root, in
 * which case -EBUSY is set.
 *
 * Return: True if found in a leaf, if the node is dense, or if @mas already
 * holds an error; false otherwise (descended, needs to ascend, or -EBUSY was
 * set).
 */
static bool mas_rev_awalk(struct ma_state *mas, unsigned long size,
                unsigned long *gap_min, unsigned long *gap_max)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *node = mas_mn(mas);
        unsigned long *pivots, *gaps;
        void __rcu **slots;
        unsigned long gap = 0;
        unsigned long max, min;
        unsigned char offset;

        /* Stop the caller's loop when an error is already recorded. */
        if (unlikely(mas_is_err(mas)))
                return true;

        if (ma_is_dense(type)) {
                /* dense nodes. */
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        /* Skip out of bounds. */
        while (mas->last < min)
                min = mas_safe_min(mas, pivots, --offset);

        max = mas_safe_pivot(mas, pivots, offset, type);
        while (mas->index <= max) {
                /* Gap size comes from the gaps array, or from a NULL slot's range. */
                gap = 0;
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = max - min + 1;

                if (gap) {
                        /* Big enough, and enough of it lies at or below mas->last. */
                        if ((size <= gap) && (size <= mas->last - min + 1))
                                break;

                        if (!gaps) {
                                /* Skip the next slot, it cannot be a gap. */
                                if (offset < 2)
                                        goto ascend;

                                offset -= 2;
                                max = pivots[offset];
                                min = mas_safe_min(mas, pivots, offset);
                                continue;
                        }
                }

                if (!offset)
                        goto ascend;

                /* Move one slot down; its range ends right below the current one. */
                offset--;
                max = min - 1;
                min = mas_safe_min(mas, pivots, offset);
        }

        /* The loop ended below mas->index, or too little room above mas->index. */
        if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
                goto no_space;

        if (unlikely(ma_is_leaf(type))) {
                mas->offset = offset;
                *gap_min = min;
                *gap_max = min + gap - 1;
                return true;
        }

        /* descend, only happens under lock. */
        mas->node = mas_slot(mas, slots, offset);
        mas->min = min;
        mas->max = max;
        mas->offset = mas_data_end(mas);
        return false;

ascend:
        /* Not the root: return with mas->node unchanged so the caller rewinds. */
        if (!mte_is_root(mas->node))
                return false;

no_space:
        mas_set_err(mas, -EBUSY);
        return false;
}

/*
 * mas_anode_descend() - Internal function.  Forward allocation walk step.
 * @mas: The maple state
 * @size: The needed size.
 *
 * Scans the slots of the current node from mas->offset upwards, looking for
 * a gap of at least @size.  In a leaf a match is reported with mas->offset
 * set to the slot; in a non-leaf node a match descends into that child with
 * mas->offset reset to 0.  If a slot's pivot reaches mas->last without a
 * match, -EBUSY is set.  If the slots run out, mas->offset is left past the
 * data end and mas->node is unchanged.
 *
 * Return: true if the gap was found in a leaf, if the node is dense, or if
 * -EBUSY was set; false if the walk descended or ran out of slots.
 */
static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size)
{
        enum maple_type type = mte_node_type(mas->node);
        unsigned long pivot, min, gap = 0;
        unsigned char offset, data_end;
        unsigned long *gaps, *pivots;
        void __rcu **slots;
        struct maple_node *node;
        bool found = false;

        if (ma_is_dense(type)) {
                mas->offset = (unsigned char)(mas->index - mas->min);
                return true;
        }

        node = mas_mn(mas);
        pivots = ma_pivots(node, type);
        slots = ma_slots(node, type);
        gaps = ma_gaps(node, type);
        offset = mas->offset;
        min = mas_safe_min(mas, pivots, offset);
        data_end = ma_data_end(node, type, pivots, mas->max);
        for (; offset <= data_end; offset++) {
                pivot = mas_safe_pivot(mas, pivots, offset, type);

                /* Not within lower bounds */
                if (mas->index > pivot)
                        goto next_slot;

                /*
                 * Gap size comes from the gaps array, or from the part of a
                 * NULL slot's range that lies inside mas->index - mas->last.
                 */
                if (gaps)
                        gap = gaps[offset];
                else if (!mas_slot(mas, slots, offset))
                        gap = min(pivot, mas->last) - max(mas->index, min) + 1;
                else
                        goto next_slot;

                if (gap >= size) {
                        if (ma_is_leaf(type)) {
                                found = true;
                                break;
                        }

                        /* Descend into the child and start at its first slot. */
                        mas->node = mas_slot(mas, slots, offset);
                        mas->min = min;
                        mas->max = pivot;
                        offset = 0;
                        break;
                }
next_slot:
                /* The next slot starts right after this pivot. */
                min = pivot + 1;
                /* This slot already reaches the upper limit of the search. */
                if (mas->last <= pivot) {
                        mas_set_err(mas, -EBUSY);
                        return true;
                }
        }

        mas->offset = offset;
        return found;
}

/**
 * mas_walk() - Search for @mas->index in the tree.
 * @mas: The maple state.
 *
 * mas->index and mas->last will be set to the range if there is a value.
 * The status is reset to ma_start before walking.  If the walk ends in
 * ma_none the range becomes 0 - ULONG_MAX.  If the walk ends in the ptr
 * state, index 0 yields the entry with a range of 0 - 0, and any other index
 * yields %NULL with a range of 1 - ULONG_MAX and a status of ma_none.
 *
 * Return: the entry at the location or %NULL.
 */
void *mas_walk(struct ma_state *mas)
{
        void *entry;

        if (!(mas_is_active(mas) && mas_is_start(mas)))
                mas->status = ma_start;

        /* Walk again for as long as the walk leaves the state at start. */
        do {
                entry = mas_state_walk(mas);
        } while (mas_is_start(mas));

        if (mas_is_none(mas)) {
                mas->index = 0;
                mas->last = ULONG_MAX;
                return entry;
        }

        if (!mas_is_ptr(mas))
                return entry;

        /* ptr state: the entry occupies index 0 only. */
        if (mas->index == 0) {
                mas->last = 0;
                return entry;
        }

        mas->index = 1;
        mas->last = ULONG_MAX;
        mas->status = ma_none;
        return NULL;
}
EXPORT_SYMBOL_GPL(mas_walk);

/*
 * mas_rewind_node() - Internal function.  Step back to the previous slot of
 * the closest ancestor that has one.
 * @mas: The maple state.
 *
 * Ascends until mas->offset is not 0, then decrements it.  At the root no
 * ascent is made and the root's own offset is used.
 *
 * Return: true if a previous slot exists, false if the root is reached with
 * an offset of 0.
 */
static inline bool mas_rewind_node(struct ma_state *mas)
{
        unsigned char slot;

        for (;;) {
                if (mte_is_root(mas->node)) {
                        if (!mas->offset)
                                return false;
                } else {
                        mas_ascend(mas);
                }

                slot = mas->offset;
                if (slot)
                        break;
        }

        mas->offset = slot - 1;
        return true;
}

/*
 * mas_skip_node() - Internal function.  Skip over a node.
 * @mas: The maple state.
 *
 * Ascends until mas->offset is below the data end of the node, then moves to
 * the next offset.  If the root is reached with no further offset, -EBUSY is
 * set in @mas.
 *
 * Return: true if there is another node, false otherwise.
 */
static inline bool mas_skip_node(struct ma_state *mas)
{
        if (mas_is_err(mas))
                return false;

        for (;;) {
                if (!mte_is_root(mas->node)) {
                        mas_ascend(mas);
                } else if (mas->offset >= mas_data_end(mas)) {
                        /* Nothing left to the right of the root's last slot. */
                        mas_set_err(mas, -EBUSY);
                        return false;
                }

                if (mas->offset < mas_data_end(mas))
                        break;
        }

        mas->offset++;
        return true;
}

/*
 * mas_awalk() - Allocation walk.  Search from low address to high, for a gap of
 * @size
 * @mas: The maple state
 * @size: The size of the gap required
 *
 * Search between @mas->index and @mas->last for a gap of @size.
 *
 * Each step has one of four outcomes:
 *  - go to a child (descend)
 *  - go back to the parent (ascend)
 *  - no gap found (return, error == -EBUSY)
 *  - found the gap (return)
 */
static inline void mas_awalk(struct ma_state *mas, unsigned long size)
{
        struct maple_enode *prev = NULL;

        for (;;) {
                if (mas_is_err(mas))
                        return;

                if (mas_anode_descend(mas, size))
                        return;

                /* The step moved to a different node: try that one next. */
                if (mas->node != prev) {
                        prev = mas->node;
                        continue;
                }

                /* Same node twice in a row: it is exhausted, skip it. */
                mas_skip_node(mas);
        }
}

/*
 * mas_sparse_area() - Internal function.  Return upper or lower limit when
 * searching for a gap in an empty tree.
 * @mas: The maple state
 * @min: the minimum range
 * @max: The maximum range
 * @size: The size of the gap
 * @fwd: Searching forward or back
 *
 * Return: 0 with mas->index and mas->last set to the chosen range, or -EBUSY
 * if @size no longer fits once index 0 has been excluded.
 */
static inline int mas_sparse_area(struct ma_state *mas, unsigned long min,
                                unsigned long max, unsigned long size, bool fwd)
{
        /* Not none (mas_is_ptr): index 0 is taken, so start the range at 1. */
        if (likely(!mas_is_none(mas)) && !min) {
                min = 1;
                /* The range shrank; make sure the size still fits. */
                if (max < min || max - min + 1 < size)
                        return -EBUSY;
        }

        if (!fwd) {
                /* Highest range of @size that ends at @max. */
                mas->last = max;
                mas->index = max - size + 1;
                return 0;
        }

        /* Lowest range of @size that starts at @min. */
        mas->index = min;
        mas->last = min + size - 1;
        return 0;
}

/*
 * mas_empty_area() - Get the lowest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 */
int mas_empty_area(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        unsigned char offset;
        unsigned long *pivots;
        enum maple_type mt;
        struct maple_node *node;

        if (min > max)
                return -EINVAL;

        if (size == 0 || max - min < size - 1)
                return -EINVAL;

        if (mas_is_start(mas))
                mas_start(mas);
        else if (mas->offset >= 2)
                mas->offset -= 2;
        else if (!mas_skip_node(mas))
                return -EBUSY;

        /* Empty set */
        if (mas_is_none(mas) || mas_is_ptr(mas))
                return mas_sparse_area(mas, min, max, size, true);

        /* The start of the window can only be within these values */
        mas->index = min;
        mas->last = max;
        mas_awalk(mas, size);

        if (unlikely(mas_is_err(mas)))
                return xa_err(mas->node);

        offset = mas->offset;
        node = mas_mn(mas);
        mt = mte_node_type(mas->node);
        pivots = ma_pivots(node, mt);
        min = mas_safe_min(mas, pivots, offset);
        if (mas->index < min)
                mas->index = min;
        mas->last = mas->index + size - 1;
        mas->end = ma_data_end(node, mt, pivots, mas->max);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area);

/*
 * mas_empty_area_rev() - Get the highest address within the range that is
 * sufficient for the size requested.
 * @mas: The maple state
 * @min: The lowest value of the range
 * @max: The highest value of the range
 * @size: The size needed
 *
 * Return: 0 with mas->index and mas->last set to the range found, -EINVAL if
 * the arguments cannot describe a range of @size, -EBUSY if no gap is
 * available, or the error recorded in @mas by the walk.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                unsigned long max, unsigned long size)
{
        struct maple_enode *prev = mas->node;

        if (min > max || size == 0 || max - min < size - 1)
                return -EINVAL;

        if (mas_is_start(mas)) {
                mas_start(mas);
        } else if (mas->offset < 2) {
                if (!mas_rewind_node(mas))
                        return -EBUSY;
        }

        /* Empty set. */
        if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
                return mas_sparse_area(mas, min, max, size, false);

        if (mas->offset < 2)
                mas->offset = mas_data_end(mas);
        else
                mas->offset -= 2;

        /* The start of the window can only be within these values. */
        mas->index = min;
        mas->last = max;

        /* On success, min and max are overwritten with the gap found. */
        while (!mas_rev_awalk(mas, size, &min, &max)) {
                /* The step moved to a different node: try that one next. */
                if (mas->node != prev) {
                        prev = mas->node;
                        continue;
                }

                /* Same node twice in a row: it is exhausted, go back. */
                if (!mas_rewind_node(mas))
                        return -EBUSY;
        }

        if (mas_is_err(mas))
                return xa_err(mas->node);

        if (unlikely(mas->offset == MAPLE_NODE_SLOTS))
                return -EBUSY;

        /* Trim the upper limit to the max. */
        if (mas->last > max)
                mas->last = max;

        mas->index = mas->last - size + 1;
        mas->end = mas_data_end(mas);
        return 0;
}
EXPORT_SYMBOL_GPL(mas_empty_area_rev);

/*
 * mte_dead_leaves() - Mark all leaves of a node as dead.
 * @enode: the encoded node
 * @mt: the maple tree
 * @slots: Pointer to the slot array
 *
 * Walks the slots of @enode from 0 and stops at the first slot that does not
 * decode to both a node pointer and a node type.  Each child found is marked
 * dead, has its type saved in node->type, and has its slot rewritten to hold
 * the plain node pointer.
 *
 * Must hold the write lock.
 *
 * Return: The number of leaves marked as dead.
 */
static inline
unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
                              void __rcu **slots)
{
        struct maple_node *node;
        enum maple_type type;
        void *entry;
        int offset;

        for (offset = 0; offset < mt_slot_count(enode); offset++) {
                entry = mt_slot(mt, slots, offset);
                type = mte_node_type(entry);
                node = mte_to_node(entry);
                /* Use both node and type to catch LE & BE metadata */
                if (!node || !type)
                        break;

                mte_set_node_dead(entry);
                /* Keep the type in the node itself; the slot loses its encoding below. */
                node->type = type;
                rcu_assign_pointer(slots[offset], node);
        }

        return offset;
}

/**
 * mte_dead_walk() - Walk down a dead tree to just before the leaves
 * @enode: The maple encoded node
 * @offset: The starting offset
 *
 * Follows slot @offset of the starting node and slot 0 of every node below
 * it, until the next node's stored type is a leaf type.  @enode is updated
 * to the last node visited.
 *
 * Note: This can only be used from the RCU callback context.
 *
 * Return: The slot array of the last node visited.
 */
static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
{
        struct maple_node *node, *next;
        void __rcu **slots = NULL;

        next = mte_to_node(*enode);
        do {
                *enode = ma_enode_ptr(next);
                node = mte_to_node(*enode);
                /* The type is read from the node itself, not from the pointer. */
                slots = ma_slots(node, node->type);
                next = rcu_dereference_protected(slots[offset],
                                        lock_is_held(&rcu_callback_map));
                /* Only the first step uses the caller's offset. */
                offset = 0;
        } while (!ma_is_leaf(next->type));

        return slots;
}

/**
 * mt_free_walk() - Walk & free a tree in the RCU callback context
 * @head: The RCU head that's within the node.
 *
 * Frees the children of each node with mt_free_bulk(), using the slot_len,
 * parent_slot and piv_parent values stored in the nodes to move between
 * them, and finally frees the starting node itself.
 *
 * Note: This can only be used from the RCU callback context.
 */
static void mt_free_walk(struct rcu_head *head)
{
        void __rcu **slots;
        struct maple_node *node, *start;
        struct maple_enode *enode;
        unsigned char offset;
        enum maple_type type;

        node = container_of(head, struct maple_node, rcu);

        /* A leaf has no children to free. */
        if (ma_is_leaf(node->type))
                goto free_leaf;

        start = node;
        enode = mt_mk_node(node, node->type);
        /* Go down to the lowest non-leaf node along the first slots. */
        slots = mte_dead_walk(&enode, 0);
        node = mte_to_node(enode);
        do {
                /* Free this node's children, then look at its next sibling. */
                mt_free_bulk(node->slot_len, slots);
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* The recorded parent is the node itself: nothing above it. */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* Descend into the next sibling if the parent has one. */
                if ((offset < mt_slots[type]) &&
                    rcu_dereference_protected(slots[offset],
                                              lock_is_held(&rcu_callback_map)))
                        slots = mte_dead_walk(&enode, offset);
                node = mte_to_node(enode);
        } while ((node != start) || (node->slot_len < offset));

        /* Back at the starting node: free its children. */
        slots = ma_slots(node, node->type);
        mt_free_bulk(node->slot_len, slots);

free_leaf:
        mt_free_rcu(&node->rcu);
}

/*
 * mte_destroy_descend() - Descend to just above the leaves, marking every
 * node on the way as dead.
 * @enode: The encoded node to start from; updated to the last node visited
 * @mt: The maple tree
 * @prev: The encoded node to record as the parent of the starting node
 * @offset: The slot of @prev to record for the starting node
 *
 * Each node visited is marked dead and has its type, the node it was reached
 * from (piv_parent) and the slot it was reached through (parent_slot) saved
 * in the node.  The descent follows slot 0, or slot 1 when slot 0 holds a
 * dead node, and stops when the next node is a leaf.
 *
 * Return: The slot array of the last node visited.
 */
static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
        struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
{
        struct maple_node *node;
        struct maple_enode *next = *enode;
        void __rcu **slots = NULL;
        enum maple_type type;
        unsigned char next_offset = 0;

        do {
                *enode = next;
                node = mte_to_node(*enode);
                type = mte_node_type(*enode);
                slots = ma_slots(node, type);
                next = mt_slot_locked(mt, slots, next_offset);
                /* Slot 0 is already dead: continue through the following slot. */
                if ((mte_dead_node(next)))
                        next = mt_slot_locked(mt, slots, ++next_offset);

                mte_set_node_dead(*enode);
                /* Bookkeeping that lets the walk find its way back up. */
                node->type = type;
                node->piv_parent = prev;
                node->parent_slot = offset;
                /* The child will be recorded as reached through next_offset. */
                offset = next_offset;
                next_offset = 0;
                prev = *enode;
        } while (!mte_is_leaf(next));

        return slots;
}

/*
 * mt_destroy_walk() - Walk a sub-tree, marking the nodes it descends through
 * as dead and optionally freeing them.
 * @enode: The encoded node at the top of the sub-tree
 * @mt: The maple tree
 * @free: true to free the nodes during the walk, false to leave them allocated
 *
 * When @free is false nothing is freed here and the top node only has
 * mt_clear_meta() applied; mte_destroy_walk() uses that mode and queues
 * mt_free_walk() through call_rcu() afterwards.
 */
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
                            bool free)
{
        void __rcu **slots;
        struct maple_node *node = mte_to_node(enode);
        struct maple_enode *start;

        /* A lone leaf has no children to visit */
        if (mte_is_leaf(enode)) {
                node->type = mte_node_type(enode);
                goto free_leaf;
        }

        start = enode;
        /*
         * @start is passed as its own parent, so the top node ends up with
         * piv_parent pointing at itself; the loop below uses that to detect
         * that it has climbed back to the top.
         */
        slots = mte_destroy_descend(&enode, mt, start, 0);
        node = mte_to_node(enode); // Updated in the above call.
        do {
                enum maple_type type;
                unsigned char offset;
                struct maple_enode *parent, *tmp;

                /*
                 * The count returned by mte_dead_leaves() is kept in slot_len
                 * and is the number of slots handed to mt_free_bulk().
                 */
                node->slot_len = mte_dead_leaves(enode, mt, slots);
                if (free)
                        mt_free_bulk(node->slot_len, slots);
                /* Climb to the parent and look at the next sibling slot */
                offset = node->parent_slot + 1;
                enode = node->piv_parent;
                /* The node is its own parent: this is the top of the walk */
                if (mte_to_node(enode) == node)
                        goto free_leaf;

                type = mte_node_type(enode);
                slots = ma_slots(mte_to_node(enode), type);
                /* No sibling slots left in the parent */
                if (offset >= mt_slots[type])
                        goto next;

                tmp = mt_slot_locked(mt, slots, offset);
                /* Only descend when the slot holds a node */
                if (mte_node_type(tmp) && mte_to_node(tmp)) {
                        parent = enode;
                        enode = tmp;
                        slots = mte_destroy_descend(&enode, mt, parent, offset);
                }
next:
                node = mte_to_node(enode);
        } while (start != enode);

        /* Handle the children of the top node */
        node = mte_to_node(enode);
        node->slot_len = mte_dead_leaves(enode, mt, slots);
        if (free)
                mt_free_bulk(node->slot_len, slots);

free_leaf:
        if (free)
                mt_free_rcu(&node->rcu);
        else
                mt_clear_meta(mt, node, node->type);
}

/*
 * mte_destroy_walk() - Free a tree or sub-tree.
 * @enode: the encoded maple node (maple_enode) at the top of the sub-tree
 * @mt: the tree being freed - needed for node types.
 *
 * Outside of RCU mode the nodes are freed during the walk.  In RCU mode the
 * walk frees nothing itself and mt_free_walk() is queued with call_rcu() on
 * the top node instead.
 *
 * Must hold the write lock.
 */
static inline void mte_destroy_walk(struct maple_enode *enode,
                                    struct maple_tree *mt)
{
        struct maple_node *top = mte_to_node(enode);

        if (!mt_in_rcu(mt)) {
                mt_destroy_walk(enode, mt, true);
                return;
        }

        mt_destroy_walk(enode, mt, false);
        call_rcu(&top->rcu, mt_free_walk);
}
/* Interface */

/**
 * mas_store() - Store an @entry.
 * @mas: The maple state.
 * @entry: The entry to store.
 *
 * The range written is @mas->index to @mas->last.  If the maple state carries
 * MA_STATE_PREALLOC the store is done with the nodes already held and the
 * state is not destroyed; otherwise any nodes needed are requested here and
 * mas_destroy() is called after the store.
 *
 * Return: the first entry between mas->index and mas->last or %NULL.
 */
void *mas_store(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);
        int nodes_needed;

        trace_ma_write(__func__, mas, 0, entry);
#ifdef CONFIG_DEBUG_MAPLE_TREE
        if (MAS_WARN_ON(mas, mas->index > mas->last))
                pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last,
                       entry);

        /* Debug builds reject an inverted range instead of storing it */
        if (mas->index > mas->last) {
                mas_set_err(mas, -EINVAL);
                return NULL;
        }

#endif

        /*
         * A store is an insert that is allowed to overwrite.  One store may
         * replace several existing entries, which is what makes the operation
         * interesting in a self-balancing B-Tree.
         */
        mas_wr_prealloc_setup(&wr_mas);
        mas->store_type = mas_wr_store_type(&wr_mas);

        if (mas->mas_flags & MA_STATE_PREALLOC) {
                /* The nodes were set aside earlier, so the store must succeed */
                mas_wr_store_entry(&wr_mas);
                MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
                return wr_mas.content;
        }

        nodes_needed = mas_prealloc_calc(mas, entry);
        if (nodes_needed) {
                mas_node_count(mas, nodes_needed);
                if (mas_is_err(mas))
                        return NULL;
        }

        mas_wr_store_entry(&wr_mas);
        mas_destroy(mas);
        return wr_mas.content;
}
EXPORT_SYMBOL_GPL(mas_store);

/**
 * mas_store_gfp() - Store a value into the tree.
 * @mas: The maple state
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations if necessary.
 *
 * Preallocation is repeated for as long as mas_nomem() reports that it
 * allocated after a failure.  The maple state is always destroyed before
 * returning.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);
        const unsigned long first = mas->index;
        const unsigned long end = mas->last;
        int err = 0;

        for (;;) {
                mas_wr_preallocate(&wr_mas, entry);
                if (likely(!mas_nomem(mas, gfp)))
                        break;

                /*
                 * NOTE(review): the caller's range is restored only for NULL
                 * stores, presumably because preallocating those can change
                 * index/last - confirm against mas_wr_preallocate().
                 */
                if (!entry)
                        __mas_set_range(mas, first, end);
        }

        if (mas_is_err(mas))
                err = xa_err(mas->node);
        else
                mas_wr_store_entry(&wr_mas);

        mas_destroy(mas);
        return err;
}
EXPORT_SYMBOL_GPL(mas_store_gfp);

/**
 * mas_store_prealloc() - Store a value into the tree using memory
 * preallocated in the maple state.
 * @mas: The maple state
 * @entry: The entry to store.
 *
 * Relies on @mas->store_type already being set.  The store is not allowed to
 * fail, and the maple state is destroyed afterwards.
 */
void mas_store_prealloc(struct ma_state *mas, void *entry)
{
        MA_WR_STATE(wr_mas, mas, entry);

        if (mas->store_type == wr_store_root) {
                mas_wr_prealloc_setup(&wr_mas);
        } else {
                mas_wr_walk_descend(&wr_mas);
                if (mas->store_type != wr_spanning_store) {
                        /* point wr_mas.content at the slot being written */
                        wr_mas.content = mas_slot_locked(mas, wr_mas.slots,
                                                         mas->offset);
                        mas_wr_end_piv(&wr_mas);
                }
        }

        trace_ma_write(__func__, mas, 0, entry);
        mas_wr_store_entry(&wr_mas);
        MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
        mas_destroy(mas);
}
EXPORT_SYMBOL_GPL(mas_store_prealloc);

/**
 * mas_preallocate() - Preallocate enough nodes for a store operation
 * @mas: The maple state
 * @entry: The entry that will be stored
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * On success the maple state is always flagged MA_STATE_PREALLOC, including
 * when the store needs no new nodes, so that a following mas_store() takes its
 * preallocated path.  On failure the allocation request is cleared and the
 * maple state is destroyed and reset.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
{
        MA_WR_STATE(wr_mas, mas, entry);
        int ret = 0;
        int request;

        mas_wr_prealloc_setup(&wr_mas);
        mas->store_type = mas_wr_store_type(&wr_mas);
        request = mas_prealloc_calc(mas, entry);
        /* Nothing to allocate still counts as a completed preallocation */
        if (!request)
                goto set_flag;

        /*
         * Drop a flag left over from an earlier preallocation while nodes are
         * requested explicitly.
         * NOTE(review): this assumes the allocator treats MA_STATE_PREALLOC as
         * "do not allocate again" - confirm against mas_alloc_nodes().
         */
        mas->mas_flags &= ~MA_STATE_PREALLOC;
        mas_node_count_gfp(mas, request, gfp);
        if (mas_is_err(mas)) {
                mas_set_alloc_req(mas, 0);
                ret = xa_err(mas->node);
                mas_destroy(mas);
                mas_reset(mas);
                return ret;
        }

set_flag:
        mas->mas_flags |= MA_STATE_PREALLOC;
        return ret;
}
EXPORT_SYMBOL_GPL(mas_preallocate);

/*
 * mas_destroy() - destroy a maple state.
 * @mas: The maple state
 *
 * If a rebalance was requested through MA_STATE_REBALANCE, the node found by
 * walking to @mas->index is checked and rebalanced when it holds too few
 * entries.  The bulk and preallocation flags are cleared and every node still
 * held by the maple state is freed.
 */
void mas_destroy(struct ma_state *mas)
{
        unsigned long remaining;

        /*
         * A bulk insert through mas_for_each() may store fewer elements than
         * were announced, which can leave the final node invalid.  Rebalance
         * it against the previous node when that happens.
         */
        if (mas->mas_flags & MA_STATE_REBALANCE) {
                unsigned char end;

                if (mas_is_err(mas))
                        mas_reset(mas);
                mas_start(mas);
                mtree_range_walk(mas);
                end = mas->end + 1;
                if (end < mt_min_slot_count(mas->node) - 1)
                        mas_destroy_rebalance(mas, end);

                mas->mas_flags &= ~MA_STATE_REBALANCE;
        }
        mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC);

        /*
         * Each list element is itself a node and carries node_count - 1 more
         * nodes in slot[1] onwards; slot[0] links to the next element.
         */
        for (remaining = mas_allocated(mas); remaining; remaining--) {
                struct maple_alloc *head = mas->alloc;

                mas->alloc = head->slot[0];
                if (head->node_count > 1) {
                        size_t extra = head->node_count - 1;

                        mt_free_bulk(extra, (void __rcu **)&head->slot[1]);
                        remaining -= extra;
                }
                mt_free_one(ma_mnode_ptr(head));
        }

        mas->alloc = NULL;
}
EXPORT_SYMBOL_GPL(mas_destroy);

/*
 * mas_expected_entries() - Set the expected number of entries that will be inserted.
 * @mas: The maple state
 * @nr_entries: The number of expected entries.
 *
 * Puts the maple state into bulk mode and tries to allocate, up front, the
 * nodes estimated for @nr_entries entries.  Call mas_destroy() on @mas once
 * the entries have been inserted so that unused nodes are freed.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated.
 */
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries)
{
        struct maple_enode *saved_node = mas->node;
        int nonleaf_cap;
        int nr_nodes;
        int err;

        /*
         * Bulk loading suits cases such as duplicating the VMAs of a process
         * on fork: the new tree is not used until it is fully populated, so
         * it can be filled with RCU disabled, splitting in favour of the left
         * and reusing nodes along the way.
         */

        /* Optimize splitting for bulk insert in-order */
        mas->mas_flags |= MA_STATE_BULK;

        /*
         * Assume a gap after every entry plus a trailing NULL; max() guards
         * against the multiplication wrapping.  An underestimate only means
         * that allocation can happen while the entries are inserted.
         */
        nr_nodes = max(nr_entries, nr_entries * 2 + 1);
        nonleaf_cap = mt_is_alloc(mas->tree) ? MAPLE_ARANGE64_SLOTS - 2 :
                                               MAPLE_RANGE64_SLOTS - 2;

        /* Leaves; two slots per node are held back for later expansion */
        nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2);
        /* Internal nodes */
        nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap);
        /* Add working room for split (2 nodes) + new parents */
        mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL);

        /* Detect if allocations run out */
        mas->mas_flags |= MA_STATE_PREALLOC;

        if (mas_is_err(mas)) {
                err = xa_err(mas->node);
                /* Put back the node that the error value replaced */
                mas->node = saved_node;
                mas_destroy(mas);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(mas_expected_entries);

/*
 * mas_next_setup() - Prepare the maple state for a step to the next entry.
 * @mas: The maple state
 * @max: The maximum index to check
 * @entry: Set to the result when it is already known here
 *
 * Deals with every status other than ma_active so that the caller only has to
 * advance by one slot when false is returned.
 *
 * Return: true if the caller should return *@entry as is, false if the caller
 * still has to step to the next slot.
 */
static bool mas_next_setup(struct ma_state *mas, unsigned long max,
                void **entry)
{
        /* Sampled first because the status is rewritten below */
        bool was_none = mas_is_none(mas);

        /* The current range already reaches the limit */
        if (unlikely(mas->last >= max)) {
                mas->status = ma_overflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                return false;
        case ma_none:
                fallthrough;
        case ma_pause:
                mas->status = ma_start;
                fallthrough;
        case ma_start:
                mas_walk(mas); /* Retries on dead nodes handled by mas_walk */
                break;
        case ma_overflow:
                /* Overflowed before, but the max changed */
                mas->status = ma_active;
                break;
        case ma_underflow:
                /* The user expects the mas to be one before where it is */
                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (likely(mas_is_active(mas))) /* Fast path */
                return false;

        /* Pointer state: there is no entry after the current one to report */
        if (mas_is_ptr(mas)) {
                *entry = NULL;
                if (was_none && mas->index == 0) {
                        mas->index = mas->last = 0;
                        return true;
                }
                /* Describe the range above index 0 and mark it as none */
                mas->index = 1;
                mas->last = ULONG_MAX;
                mas->status = ma_none;
                return true;
        }

        if (mas_is_none(mas))
                return true;

        return false;
}

/**
 * mas_next() - Get the next entry.
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Returns the next entry after @mas->index.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_next_setup(mas, max, &entry))
                entry = mas_next_slot(mas, max, false);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_next);

/**
 * mas_next_range() - Advance the maple state to the next range
 * @mas: The maple state
 * @max: The maximum index to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Can return the zero entry.
 *
 * Return: The next entry or %NULL
 */
void *mas_next_range(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_next_setup(mas, max, &entry))
                entry = mas_next_slot(mas, max, true);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_next_range);

/**
 * mt_next() - get the next value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @max: The maximum index to check
 *
 * The RCU read lock is taken around the search only; the returned pointer is
 * not protected once the lock has been dropped.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry higher than @index or %NULL if nothing is found.
 */
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max)
{
        MA_STATE(mas, mt, index, index);
        void *found;

        rcu_read_lock();
        found = mas_next(&mas, max);
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(mt_next);

/*
 * mas_prev_setup() - Prepare the maple state for a step to the previous entry.
 * @mas: The maple state
 * @min: The minimum index to check
 * @entry: Set to the result when it is already known here
 *
 * Deals with every status other than ma_active so that the caller only has to
 * step back by one slot when false is returned.
 *
 * Return: true if the caller should return *@entry as is, false if the caller
 * still has to step to the previous slot.
 */
static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry)
{
        /* The current range already reaches the limit */
        if (unlikely(mas->index <= min)) {
                mas->status = ma_underflow;
                return true;
        }

        switch (mas->status) {
        case ma_active:
                return false;
        case ma_start:
                break;
        case ma_none:
                fallthrough;
        case ma_pause:
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* underflowed before but the min changed */
                mas->status = ma_active;
                break;
        case ma_overflow:
                /* User expects mas to be one after where it is */
                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        /* Covers ma_start itself as well as ma_none/ma_pause converted above */
        if (mas_is_start(mas))
                mas_walk(mas);

        if (unlikely(mas_is_ptr(mas))) {
                /* Already at index 0: nothing lies before it */
                if (!mas->index) {
                        mas->status = ma_none;
                        return true;
                }
                /* Otherwise the previous entry is the root entry at 0 */
                mas->index = mas->last = 0;
                *entry = mas_root(mas);
                return true;
        }

        if (mas_is_none(mas)) {
                if (mas->index) {
                        /* Walked to out-of-range pointer? */
                        mas->index = mas->last = 0;
                        mas->status = ma_root;
                        *entry = mas_root(mas);
                        return true;
                }
                return true;
        }

        return false;
}

/**
 * mas_prev() - Get the previous entry
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        if (!mas_prev_setup(mas, min, &entry))
                entry = mas_prev_slot(mas, min, false);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_prev);

/**
 * mas_prev_range() - Advance to the previous range
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Sets @mas->index and @mas->last to the range.
 * Must hold rcu_read_lock or the write lock.
 * Will reset mas to ma_start if the status is ma_none.  Will stop on not
 * searchable nodes.
 *
 * Return: the previous value or %NULL.
 */
void *mas_prev_range(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        if (!mas_prev_setup(mas, min, &entry))
                entry = mas_prev_slot(mas, min, true);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_prev_range);

/**
 * mt_prev() - get the previous value in the maple tree
 * @mt: The maple tree
 * @index: The start index
 * @min: The minimum index to check
 *
 * The RCU read lock is taken around the search only; the returned pointer is
 * not protected once the lock has been dropped.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * Return: The entry before @index or %NULL if nothing is found.
 */
void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min)
{
        MA_STATE(mas, mt, index, index);
        void *found;

        rcu_read_lock();
        found = mas_prev(&mas, min);
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(mt_prev);

/**
 * mas_pause() - Pause a mas_find/mas_for_each to drop the lock.
 * @mas: The maple state to pause
 *
 * Call this before dropping the lock in the middle of a walk, for example to
 * yield to a higher priority thread or to operate on an entry.  The @mas is
 * left in a state from which the next iteration can continue once the lock
 * has been taken again.  If most entries found during a walk need
 * mas_pause(), the mt_for_each() iterator may be more appropriate.
 *
 */
void mas_pause(struct ma_state *mas)
{
        /* Drop the node reference and mark the state as paused */
        mas->node = NULL;
        mas->status = ma_pause;
}
EXPORT_SYMBOL_GPL(mas_pause);

/**
 * mas_find_setup() - Internal function to set up mas_find*().
 * @mas: The maple state
 * @max: The maximum index
 * @entry: Pointer to the entry
 *
 * Converts the status left by a previous operation into either a finished
 * answer in *@entry or a state from which the caller can advance one slot.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry)
{
        switch (mas->status) {
        case ma_active:
                /* Only keep going while the current range ends below max */
                if (mas->last < max)
                        return false;
                return true;
        case ma_start:
                break;
        case ma_pause:
                if (unlikely(mas->last >= max))
                        return true;

                /* Resume from the index right after the paused range */
                mas->index = ++mas->last;
                mas->status = ma_start;
                break;
        case ma_none:
                if (unlikely(mas->last >= max))
                        return true;

                /* Restart the search at the end of the recorded range */
                mas->index = mas->last;
                mas->status = ma_start;
                break;
        case ma_underflow:
                /* mas is pointing at entry before unable to go lower */
                if (unlikely(mas->index >= max)) {
                        mas->status = ma_overflow;
                        return true;
                }

                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_overflow:
                if (unlikely(mas->last >= max))
                        return true;

                mas->status = ma_active;
                *entry = mas_walk(mas);
                if (*entry)
                        return true;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index > max)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;

        }

        if (unlikely(mas_is_ptr(mas)))
                goto ptr_out_of_range;

        if (unlikely(mas_is_none(mas)))
                return true;

        /* The walk stopped exactly on the limit: nothing further to search */
        if (mas->index == max)
                return true;

        return false;

ptr_out_of_range:
        /* Pointer state with no entry found: describe everything above 0 */
        mas->status = ma_none;
        mas->index = 1;
        mas->last = ULONG_MAX;
        return true;
}

/**
 * mas_find() - On the first call, find the entry at or after mas->index up to
 * %max.  Otherwise, find the entry after mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        if (!mas_find_setup(mas, max, &entry)) {
                /* Retries on dead nodes handled by mas_next_slot */
                entry = mas_next_slot(mas, max, false);
                /* Ignore overflow */
                mas->status = ma_active;
        }

        return entry;
}
EXPORT_SYMBOL_GPL(mas_find);

/**
 * mas_find_range() - On the first call, find the entry at or after
 * mas->index up to %max.  Otherwise, advance to the next slot mas->index.
 * @mas: The maple state
 * @max: The maximum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_overflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range(struct ma_state *mas, unsigned long max)
{
        void *entry = NULL;

        /* Retries on dead nodes handled by mas_next_slot */
        if (!mas_find_setup(mas, max, &entry))
                entry = mas_next_slot(mas, max, true);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_find_range);

/**
 * mas_find_rev_setup() - Internal function to set up mas_find_*_rev()
 * @mas: The maple state
 * @min: The minimum index
 * @entry: Pointer to the entry
 *
 * Converts the status left by a previous operation into either a finished
 * answer in *@entry or a state from which the caller can step back one slot.
 *
 * Returns: True if entry is the answer, false otherwise.
 */
static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min,
                void **entry)
{

        switch (mas->status) {
        case ma_active:
                goto active;
        case ma_start:
                break;
        case ma_pause:
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }
                /* Resume from the index right before the paused range */
                mas->last = --mas->index;
                mas->status = ma_start;
                break;
        case ma_none:
                if (mas->index <= min)
                        goto none;

                /* Restart the search at the start of the recorded range */
                mas->last = mas->index;
                mas->status = ma_start;
                break;
        case ma_overflow: /* user expects the mas to be one after where it is */
                if (unlikely(mas->index <= min)) {
                        mas->status = ma_underflow;
                        return true;
                }

                mas->status = ma_active;
                break;
        case ma_underflow: /* user expects the mas to be one before where it is */
                if (unlikely(mas->index <= min))
                        return true;

                mas->status = ma_active;
                break;
        case ma_root:
                break;
        case ma_error:
                return true;
        }

        if (mas_is_start(mas)) {
                /* First run or continue */
                if (mas->index < min)
                        return true;

                *entry = mas_walk(mas);
                if (*entry)
                        return true;
        }

        if (unlikely(mas_is_ptr(mas)))
                goto none;

        if (unlikely(mas_is_none(mas))) {
                /*
                 * Walked to the location, and there was nothing so the previous
                 * location is 0.
                 */
                mas->last = mas->index = 0;
                mas->status = ma_root;
                *entry = mas_root(mas);
                return true;
        }

active:
        /* The current range starts below the limit: stop here */
        if (mas->index < min)
                return true;

        return false;

none:
        mas->status = ma_none;
        return true;
}

/**
 * mas_find_rev() - On the first call, find the first non-null entry at or
 * below mas->index down to %min.  Otherwise find the first non-null entry
 * below mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_rev(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        /* Retries on dead nodes handled by mas_prev_slot */
        if (!mas_find_rev_setup(mas, min, &entry))
                entry = mas_prev_slot(mas, min, false);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_find_rev);

/**
 * mas_find_range_rev() - On the first call, find the first non-null entry at
 * or below mas->index down to %min.  Otherwise advance to the previous slot
 * after mas->index down to %min.
 * @mas: The maple state
 * @min: The minimum value to check.
 *
 * Must hold rcu_read_lock or the write lock.
 * If an entry exists, last and index are updated accordingly.
 * May set @mas->status to ma_underflow.
 *
 * Return: The entry or %NULL.
 */
void *mas_find_range_rev(struct ma_state *mas, unsigned long min)
{
        void *entry = NULL;

        /* Retries on dead nodes handled by mas_prev_slot */
        if (!mas_find_rev_setup(mas, min, &entry))
                entry = mas_prev_slot(mas, min, true);

        return entry;
}
EXPORT_SYMBOL_GPL(mas_find_range_rev);

/**
 * mas_erase() - Find the range in which index resides and erase the entire
 * range.
 * @mas: The maple state
 *
 * Must hold the write lock.
 * Searches for @mas->index, sets @mas->index and @mas->last to the range and
 * erases that range.
 *
 * If nothing is stored at @mas->index the function returns %NULL without
 * writing.  The entry that was found is also returned when the write itself
 * fails; check @mas for an error in that case.
 *
 * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated.
 */
void *mas_erase(struct ma_state *mas)
{
        void *entry;
        unsigned long index = mas->index;
        MA_WR_STATE(wr_mas, mas, NULL);

        /*
         * Restart the state on every call.  The status can never be both
         * ma_active and ma_start, so a test of
         * !mas_is_active() || !mas_is_start() is always true; assign directly
         * rather than hide that behind a condition.
         */
        mas->status = ma_start;

write_retry:
        entry = mas_state_walk(mas);
        if (!entry)
                return NULL;

        /* Must reset to ensure spanning writes of last slot are detected */
        mas_reset(mas);
        mas_wr_preallocate(&wr_mas, NULL);
        if (mas_nomem(mas, GFP_KERNEL)) {
                /* in case the range of entry changed when unlocked */
                mas->index = mas->last = index;
                goto write_retry;
        }

        if (mas_is_err(mas))
                goto out;

        mas_wr_store_entry(&wr_mas);
out:
        mas_destroy(mas);
        return entry;
}
EXPORT_SYMBOL_GPL(mas_erase);

/**
 * mas_nomem() - Retry a failed node allocation.
 * @mas: The maple state
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Does nothing unless @mas holds the -ENOMEM error.  Otherwise the nodes are
 * allocated again with @gfp; the tree lock is dropped around the allocation
 * when @gfp allows blocking and the tree does not use an external lock.  When
 * nodes were obtained the maple state is set back to ma_start so the caller
 * can retry its operation.
 *
 * Return: true on allocation, false otherwise.
 */
bool mas_nomem(struct ma_state *mas, gfp_t gfp)
        __must_hold(mas->tree->ma_lock)
{
        if (likely(mas->node != MA_ERROR(-ENOMEM)))
                return false;

        if (!gfpflags_allow_blocking(gfp) || mt_external_lock(mas->tree)) {
                mas_alloc_nodes(mas, gfp);
        } else {
                mtree_unlock(mas->tree);
                mas_alloc_nodes(mas, gfp);
                mtree_lock(mas->tree);
        }

        if (mas_allocated(mas)) {
                mas->status = ma_start;
                return true;
        }

        return false;
}

/*
 * maple_tree_init() - Create the slab cache that maple nodes are allocated
 * from and store it in maple_node_cache.
 *
 * The cache is named "maple_node".  sizeof(struct maple_node) is passed both
 * as the object size and as the following argument, which is the alignment in
 * the kmem_cache_create() API.  SLAB_PANIC is passed as the flags, so the
 * result is not checked here.
 */
void __init maple_tree_init(void)
{
        maple_node_cache = kmem_cache_create("maple_node",
                        sizeof(struct maple_node), sizeof(struct maple_node),
                        SLAB_PANIC, NULL);
}

/**
 * mtree_load() - Load a value stored in a maple tree
 * @mt: The maple tree
 * @index: The index to load
 *
 * The lookup runs under the RCU read lock and is restarted when the walk
 * finds nothing and leaves the maple state back in ma_start.  A zero entry is
 * reported as %NULL.
 *
 * Return: the entry or %NULL
 */
void *mtree_load(struct maple_tree *mt, unsigned long index)
{
        MA_STATE(mas, mt, index, index);
        void *entry;

        trace_ma_read(__func__, &mas);
        rcu_read_lock();
        for (;;) {
                entry = mas_start(&mas);
                if (unlikely(mas_is_none(&mas)))
                        break;

                if (unlikely(mas_is_ptr(&mas))) {
                        /* In pointer state only index 0 holds the entry */
                        if (index)
                                entry = NULL;
                        break;
                }

                entry = mtree_lookup_walk(&mas);
                if (entry || likely(!mas_is_start(&mas)))
                        break;
        }
        rcu_read_unlock();

        return xa_is_zero(entry) ? NULL : entry;
}
EXPORT_SYMBOL(mtree_load);

/**
 * mtree_store_range() - Store an entry at a given range.
 * @mt: The maple tree
 * @index: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Takes the tree lock around the store.  Advanced entries and ranges with
 * @index above @last are rejected before the lock is taken.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store_range(struct maple_tree *mt, unsigned long index,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(mas, mt, index, last);
        int err;

        trace_ma_write(__func__, &mas, 0, entry);
        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        if (last < index)
                return -EINVAL;

        mtree_lock(mt);
        err = mas_store_gfp(&mas, entry, gfp);
        mtree_unlock(mt);

        return err;
}
EXPORT_SYMBOL(mtree_store_range);

/**
 * mtree_store() - Store an entry at a given index.
 * @mt: The maple tree
 * @index: The index to store the value
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * Equivalent to mtree_store_range() with a range of exactly @index.
 *
 * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not
 * be allocated.
 */
int mtree_store(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        return mtree_store_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_store);

/**
 * mtree_insert_range() - Insert an entry at a given range if there is no value.
 * @mt: The maple tree
 * @first: The start of the range
 * @last: The end of the range
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes the tree lock around the insert and repeats the insert for as long as
 * mas_nomem() reports that it allocated after a failure.
 *
 * Return: 0 on success, -EEXIST if the range is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        MA_STATE(ms, mt, first, last);
        int err = 0;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;

        if (last < first)
                return -EINVAL;

        mtree_lock(mt);
        do {
                mas_insert(&ms, entry);
        } while (mas_nomem(&ms, gfp));
        mtree_unlock(mt);

        if (mas_is_err(&ms))
                err = xa_err(ms.node);

        mas_destroy(&ms);
        return err;
}
EXPORT_SYMBOL(mtree_insert_range);

/**
 * mtree_insert() - Insert an entry at a single index only if it is unoccupied.
 * @mt: The maple tree
 * @index: The index at which to store the entry
 * @entry: The entry to store
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Convenience wrapper around mtree_insert_range() for a one-index range.
 *
 * Return: 0 on success, -EEXIST if the index is occupied, -EINVAL on invalid
 * request, -ENOMEM if memory could not be allocated.
 */
int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry,
                 gfp_t gfp)
{
        /* A single index is the range whose start and end are both @index. */
        return mtree_insert_range(mt, index, index, entry, gfp);
}
EXPORT_SYMBOL(mtree_insert);

/**
 * mtree_alloc_range() - Find an empty area and insert an entry into it.
 * @mt: The maple tree; must pass mt_is_alloc().
 * @startp: On success, set to the index at which the entry was inserted.
 * @entry: The entry to store.
 * @size: Passed to mas_empty_area() as the size of the area to look for.
 * @min: Passed to mas_empty_area() as the lower bound of the search.
 * @max: Passed to mas_empty_area() as the upper bound of the search.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes and releases the tree lock.  The search uses mas_empty_area(); the
 * matching reverse variant is mtree_alloc_rrange().
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry is
 * a reserved value, otherwise the error reported by mas_empty_area() or by
 * the insert.
 */
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        int err = 0;
        MA_STATE(mas, mt, 0, 0);

        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        for (;;) {
                err = mas_empty_area(&mas, min, max, size);
                if (err)
                        break;

                mas_insert(&mas, entry);
                /*
                 * mas_nomem() may release the lock, causing the allocated area
                 * to be unavailable, so search for a free area again.
                 */
                if (mas_nomem(&mas, gfp))
                        continue;

                if (mas_is_err(&mas))
                        err = xa_err(mas.node);
                else
                        *startp = mas.index;
                break;
        }
        mtree_unlock(mt);

        mas_destroy(&mas);
        return err;
}
EXPORT_SYMBOL(mtree_alloc_range);

/**
 * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree.
 * @mt: The maple tree.
 * @startp: Pointer to ID.
 * @range_lo: Lower bound of range to search.
 * @range_hi: Upper bound of range to search.
 * @entry: The entry to store.
 * @next: Pointer to next ID to allocate.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Finds an empty entry in @mt after @next, stores the new index into
 * the @startp pointer, stores the entry at that index, then updates @next.
 * The work itself is done by mas_alloc_cyclic() under the tree lock.
 *
 * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag.
 *
 * Context: Any context.  Takes and releases the mt.lock.  May sleep if
 * the @gfp flags permit.
 *
 * Return: 0 if the allocation succeeded without wrapping, 1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no
 * free entries.
 */
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp)
{
        int rc;
        MA_STATE(mas, mt, 0, 0);

        /* Only allocation trees and non-reserved entries are accepted. */
        if (!mt_is_alloc(mt) || WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        rc = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi,
                              next, gfp);
        mtree_unlock(mt);

        return rc;
}
EXPORT_SYMBOL(mtree_alloc_cyclic);

/**
 * mtree_alloc_rrange() - Find an empty area with the reverse search and
 * insert an entry into it.
 * @mt: The maple tree; must pass mt_is_alloc().
 * @startp: On success, set to the index at which the entry was inserted.
 * @entry: The entry to store.
 * @size: Passed to mas_empty_area_rev() as the size of the area to look for.
 * @min: Passed to mas_empty_area_rev() as the lower bound of the search.
 * @max: Passed to mas_empty_area_rev() as the upper bound of the search.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Takes and releases the tree lock.  Identical to mtree_alloc_range() except
 * that the search is done with mas_empty_area_rev().
 *
 * Return: 0 on success, -EINVAL if @mt is not an allocation tree or @entry is
 * a reserved value, otherwise the error reported by mas_empty_area_rev() or by
 * the insert.
 */
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp)
{
        int err = 0;
        MA_STATE(mas, mt, 0, 0);

        if (!mt_is_alloc(mt))
                return -EINVAL;

        if (WARN_ON_ONCE(mt_is_reserved(entry)))
                return -EINVAL;

        mtree_lock(mt);
        for (;;) {
                err = mas_empty_area_rev(&mas, min, max, size);
                if (err)
                        break;

                mas_insert(&mas, entry);
                /*
                 * mas_nomem() may release the lock, causing the allocated area
                 * to be unavailable, so search for a free area again.
                 */
                if (mas_nomem(&mas, gfp))
                        continue;

                if (mas_is_err(&mas))
                        err = xa_err(mas.node);
                else
                        *startp = mas.index;
                break;
        }
        mtree_unlock(mt);

        mas_destroy(&mas);
        return err;
}
EXPORT_SYMBOL(mtree_alloc_rrange);

/**
 * mtree_erase() - Find an index and erase the entire range.
 * @mt: The maple tree
 * @index: The index to erase
 *
 * Erasing is the same as a walk to an entry followed by a store of NULL over
 * that ENTIRE range.  It is implemented with the advanced API (mas_erase())
 * while holding the tree lock.
 *
 * Return: The entry stored at the @index or %NULL
 */
void *mtree_erase(struct maple_tree *mt, unsigned long index)
{
        void *old;
        MA_STATE(mas, mt, index, index);

        trace_ma_op(__func__, &mas);

        mtree_lock(mt);
        old = mas_erase(&mas);
        mtree_unlock(mt);

        return old;
}
EXPORT_SYMBOL(mtree_erase);

/*
 * mas_dup_free() - Free an incomplete duplication of a tree.
 * @mas: The maple state of an incomplete tree.
 *
 * The parameter @mas->node passed in indicates that the allocation failed on
 * this node. This function frees all nodes starting from @mas->node in the
 * reverse order of mas_dup_build(). There is no need to hold the source tree
 * lock at this time.
 *
 * Each pass of the loop frees the children of one node in bulk; the root node
 * itself is freed last, on its own.
 */
static void mas_dup_free(struct ma_state *mas)
{
        struct maple_node *node;
        enum maple_type type;
        void __rcu **slots;
        unsigned char count, i;

        /* Maybe the first node allocation failed. */
        if (mas_is_none(mas))
                return;

        while (!mte_is_root(mas->node)) {
                mas_ascend(mas);
                if (mas->offset) {
                        /*
                         * Not the first child: step to the previous sibling
                         * and follow the last slot at every level down to a
                         * leaf, then go back up to that leaf's parent.
                         */
                        mas->offset--;
                        do {
                                mas_descend(mas);
                                mas->offset = mas_data_end(mas);
                        } while (!mte_is_leaf(mas->node));

                        mas_ascend(mas);
                }

                node = mte_to_node(mas->node);
                type = mte_node_type(mas->node);
                slots = ma_slots(node, type);
                count = mas_data_end(mas) + 1;
                /*
                 * Clear the MAPLE_NODE_MASK bits so the slots hold plain node
                 * pointers before they are handed to the bulk free.
                 */
                for (i = 0; i < count; i++)
                        ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK;
                mt_free_bulk(count, slots);
        }

        /* Only the root is left. */
        node = mte_to_node(mas->node);
        mt_free_one(node);
}

/*
 * mas_copy_node() - Copy a maple node and replace the parent.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @parent: The parent of the new node.
 *
 * Copy the whole of @mas->node over @new_mas->node, then point the copy at
 * @parent while keeping the MAPLE_NODE_MASK bits of the source node's parent
 * field. Nothing is allocated here; @new_mas->node must already exist.
 */
static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas,
                struct maple_pnode *parent)
{
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        unsigned long low_bits;

        /* Byte-for-byte copy of the source node. */
        memcpy(dst, src, sizeof(*dst));

        /* Keep the low bits of the old parent field, swap in the new parent. */
        low_bits = (unsigned long)src->parent & MAPLE_NODE_MASK;
        dst->parent = ma_parent_ptr(low_bits | (unsigned long)parent);
}

/*
 * mas_dup_alloc() - Allocate child nodes for a maple node.
 * @mas: The maple state of source tree.
 * @new_mas: The maple state of new tree.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * Bulk-allocates one node for every used slot of @mas->node and places the
 * new pointers in the slot array of @new_mas->node, each tagged with the
 * MAPLE_NODE_MASK bits of the corresponding source slot. If fewer nodes than
 * requested are obtained, the requested slots of the new node are zeroed and
 * @mas is set to -ENOMEM.
 */
static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        enum maple_type type = mte_node_type(mas->node);
        struct maple_node *src = mte_to_node(mas->node);
        struct maple_node *dst = mte_to_node(new_mas->node);
        void __rcu **src_slots = ma_slots(src, type);
        void __rcu **dst_slots = ma_slots(dst, type);
        unsigned char wanted = mas_data_end(mas) + 1;
        unsigned char got, i;
        unsigned long tag;

        /* One new child per used slot of the source node. */
        got = mt_alloc_bulk(gfp, wanted, (void **)dst_slots);
        if (unlikely(got < wanted)) {
                memset(dst_slots, 0, wanted * sizeof(void *));
                mas_set_err(mas, -ENOMEM);
                return;
        }

        /* Carry the node type bits of each source slot over to the new slot. */
        for (i = 0; i < got; i++) {
                tag = (unsigned long)mt_slot_locked(mas->tree, src_slots, i);
                ((unsigned long *)dst_slots)[i] |= tag & MAPLE_NODE_MASK;
        }
}

/*
 * mas_dup_build() - Build a new maple tree from a source tree
 * @mas: The maple state of source tree, need to be in MAS_START state.
 * @new_mas: The maple state of new tree, need to be in MAS_START state.
 * @gfp: The GFP_FLAGS to use for allocations.
 *
 * This function builds a new tree in DFS preorder. If the memory allocation
 * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the
 * last node. mas_dup_free() will free the incomplete duplication of a tree.
 *
 * Note that the attributes of the two trees need to be exactly the same, and the
 * new tree needs to be empty, otherwise -EINVAL will be set in @mas.
 *
 * The two maple states are moved in lockstep: every ascend, descend and offset
 * change applied to @mas is mirrored on @new_mas. The new root is published
 * in @new_mas->tree only after the whole copy has succeeded.
 */
static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas,
                gfp_t gfp)
{
        struct maple_node *node;
        struct maple_pnode *parent = NULL;
        struct maple_enode *root;
        enum maple_type type;

        if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) ||
            unlikely(!mtree_empty(new_mas->tree))) {
                mas_set_err(mas, -EINVAL);
                return;
        }

        root = mas_start(mas);
        /* No nodes to copy: install what mas_start() returned as the root. */
        if (mas_is_ptr(mas) || mas_is_none(mas))
                goto set_new_tree;

        node = mt_alloc_one(gfp);
        if (!node) {
                /* Lets mas_dup_free() see that nothing was allocated. */
                new_mas->status = ma_none;
                mas_set_err(mas, -ENOMEM);
                return;
        }

        type = mte_node_type(mas->node);
        root = mt_mk_node(node, type);
        new_mas->node = root;
        new_mas->min = 0;
        new_mas->max = ULONG_MAX;
        root = mte_mk_root(root);
        while (1) {
                mas_copy_node(mas, new_mas, parent);
                if (!mte_is_leaf(mas->node)) {
                        /* Only allocate child nodes for non-leaf nodes. */
                        mas_dup_alloc(mas, new_mas, gfp);
                        if (unlikely(mas_is_err(mas)))
                                return;
                } else {
                        /*
                         * This is the last leaf node and duplication is
                         * completed.
                         */
                        if (mas->max == ULONG_MAX)
                                goto done;

                        /* This is not the last leaf node and needs to go up. */
                        do {
                                mas_ascend(mas);
                                mas_ascend(new_mas);
                        } while (mas->offset == mas_data_end(mas));

                        /* Move to the next subtree. */
                        mas->offset++;
                        new_mas->offset++;
                }

                mas_descend(mas);
                /* The node being left becomes the parent of the next copy. */
                parent = ma_parent_ptr(mte_to_node(new_mas->node));
                mas_descend(new_mas);
                mas->offset = 0;
                new_mas->offset = 0;
        }
done:
        /* Specially handle the parent of the root node. */
        mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas));
set_new_tree:
        /* Make them the same height */
        new_mas->tree->ma_flags = mas->tree->ma_flags;
        rcu_assign_pointer(new_mas->tree->ma_root, root);
}

/**
 * __mt_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocates
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 * Note that the user needs to manually lock the source tree and the new tree.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        int err;
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);

        mas_dup_build(&mas, &new_mas, gfp);
        if (likely(!mas_is_err(&mas)))
                return 0;

        err = xa_err(mas.node);
        /* Only an allocation failure leaves a partial copy to tear down. */
        if (err == -ENOMEM)
                mas_dup_free(&new_mas);

        return err;
}
EXPORT_SYMBOL(__mt_dup);

/**
 * mtree_dup(): Duplicate an entire maple tree
 * @mt: The source maple tree
 * @new: The new maple tree
 * @gfp: The GFP_FLAGS to use for allocations
 *
 * This function duplicates a maple tree in Depth-First Search (DFS) pre-order
 * traversal. It uses memcpy() to copy nodes in the source tree and allocates
 * new child nodes in non-leaf nodes. The new node is exactly the same as the
 * source node except for all the addresses stored in it. It will be faster than
 * traversing all elements in the source tree and inserting them one by one into
 * the new tree.
 * The user needs to ensure that the attributes of the source tree and the new
 * tree are the same, and the new tree needs to be an empty tree, otherwise
 * -EINVAL will be returned.
 *
 * Both trees are locked here: the new tree first, then the source tree as a
 * nested lock. The source lock is dropped as soon as the build finishes; the
 * new tree stays locked through any cleanup.
 *
 * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL if
 * the attributes of the two trees are different or the new tree is not an empty
 * tree.
 */
int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp)
{
        int err = 0;
        MA_STATE(mas, mt, 0, 0);
        MA_STATE(new_mas, new, 0, 0);

        mas_lock(&new_mas);
        mas_lock_nested(&mas, SINGLE_DEPTH_NESTING);
        mas_dup_build(&mas, &new_mas, gfp);
        mas_unlock(&mas);

        if (likely(!mas_is_err(&mas)))
                goto out_unlock;

        err = xa_err(mas.node);
        /* Only an allocation failure leaves a partial copy to tear down. */
        if (err == -ENOMEM)
                mas_dup_free(&new_mas);

out_unlock:
        mas_unlock(&new_mas);
        return err;
}
EXPORT_SYMBOL(mtree_dup);

/**
 * __mt_destroy() - Walk and free all nodes of a locked maple tree.
 * @mt: The maple tree
 *
 * The root pointer is cleared before the old nodes are walked and freed, and
 * the tree flags are then reduced to what mt_attr() reports.
 *
 * Note: Does not handle locking.
 */
void __mt_destroy(struct maple_tree *mt)
{
        void *old_root = mt_root_locked(mt);

        /* Unpublish the root first, then tear down what it pointed to. */
        rcu_assign_pointer(mt->ma_root, NULL);
        if (xa_is_node(old_root))
                mte_destroy_walk(old_root, mt);

        mt->ma_flags = mt_attr(mt);
}
EXPORT_SYMBOL_GPL(__mt_destroy);

/**
 * mtree_destroy() - Destroy a maple tree
 * @mt: The maple tree
 *
 * Frees all resources used by the tree.  This is the locking variant: it
 * wraps __mt_destroy() in the tree lock.
 */
void mtree_destroy(struct maple_tree *mt)
{
        mtree_lock(mt);
        __mt_destroy(mt);
        mtree_unlock(mt);
}
EXPORT_SYMBOL(mtree_destroy);

/**
 * mt_find() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value of the search range
 *
 * Takes RCU read lock internally to protect the search, which does not
 * protect the returned pointer after dropping RCU read lock.
 * See also: Documentation/core-api/maple_tree.rst
 *
 * In case that an entry is found @index is updated to point to the next
 * possible entry independent whether the found entry is occupying a
 * single index or a range of indices.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max)
{
        MA_STATE(mas, mt, *index, *index);
        void *entry;
#ifdef CONFIG_DEBUG_MAPLE_TREE
        /* Remember the starting index for the sanity check at the end. */
        unsigned long copy = *index;
#endif

        trace_ma_read(__func__, &mas);

        /* Nothing to search when the start is already past @max. */
        if ((*index) > max)
                return NULL;

        rcu_read_lock();
retry:
        entry = mas_state_walk(&mas);
        /* The walk left the state at start again: walk once more. */
        if (mas_is_start(&mas))
                goto retry;

        /* Zero entries are reported to the caller as NULL. */
        if (unlikely(xa_is_zero(entry)))
                entry = NULL;

        if (entry)
                goto unlock;

        /* Nothing at *index: step through following slots up to @max. */
        while (mas_is_active(&mas) && (mas.last < max)) {
                entry = mas_next_slot(&mas, max, false);
                if (likely(entry && !xa_is_zero(entry)))
                        break;
        }

        /* The loop may have stopped on a zero entry; hide that one as well. */
        if (unlikely(xa_is_zero(entry)))
                entry = NULL;
unlock:
        rcu_read_unlock();
        if (likely(entry)) {
                /* Continue after the end of the range that was found. */
                *index = mas.last + 1;
#ifdef CONFIG_DEBUG_MAPLE_TREE
                /* A new index of 0 (wrap around) is exempt from this check. */
                if (MT_WARN_ON(mt, (*index) && ((*index) <= copy)))
                        pr_err("index not increased! %lx <= %lx\n",
                               *index, copy);
#endif
        }

        return entry;
}
EXPORT_SYMBOL(mt_find);

/**
 * mt_find_after() - Search from the start up until an entry is found.
 * @mt: The maple tree
 * @index: Pointer which contains the start location of the search
 * @max: The maximum value to check
 *
 * Same as mt_find() except that @index is checked for 0 before searching.
 * If @index == 0, the search is aborted. This covers a wrap around of
 * @index to 0 in an iterator loop.
 *
 * Return: The entry at or after the @index or %NULL
 */
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max)
{
        /* A zero index means the previous step wrapped: nothing more to find. */
        return *index ? mt_find(mt, index, max) : NULL;
}
EXPORT_SYMBOL(mt_find_after);

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Test bookkeeping counters, exported (GPL-only) so that code outside this
 * file can reference them.
 * NOTE(review): presumably updated by the MT_BUG_ON()/MT_WARN_ON() style test
 * macros -- confirm against their definitions.
 */
atomic_t maple_tree_tests_run;
EXPORT_SYMBOL_GPL(maple_tree_tests_run);
atomic_t maple_tree_tests_passed;
EXPORT_SYMBOL_GPL(maple_tree_tests_passed);

#ifndef __KERNEL__
/*
 * Hooks for builds without __KERNEL__.  Each mt_*() helper below forwards to
 * the kmem_cache_*() function of the same purpose, always acting on
 * maple_node_cache.  The kmem_cache_*() functions are defined elsewhere.
 */
extern void kmem_cache_set_non_kernel(struct kmem_cache *cachep,
                unsigned int val);
extern void kmem_cache_set_callback(struct kmem_cache *cachep,
                void (*callback)(void *));
extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private);
extern unsigned long kmem_cache_get_alloc(struct kmem_cache *cachep);
extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *cachep);
extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *cachep);
extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *cachep);

/* Forward @val to kmem_cache_set_non_kernel() for the maple node cache. */
void mt_set_non_kernel(unsigned int val)
{
        kmem_cache_set_non_kernel(maple_node_cache, val);
}

/* Register @callback on the maple node cache. */
void mt_set_callback(void (*callback)(void *))
{
        kmem_cache_set_callback(maple_node_cache, callback);
}

/* Attach @private to the maple node cache. */
void mt_set_private(void *private)
{
        kmem_cache_set_private(maple_node_cache, private);
}

/* Return what kmem_cache_get_alloc() reports for the maple node cache. */
unsigned long mt_get_alloc_size(void)
{
        return kmem_cache_get_alloc(maple_node_cache);
}

/* Reset the "tallocated" count of the maple node cache. */
void mt_zero_nr_tallocated(void)
{
        kmem_cache_zero_nr_tallocated(maple_node_cache);
}

/* Return the "tallocated" count of the maple node cache. */
unsigned int mt_nr_tallocated(void)
{
        return kmem_cache_nr_tallocated(maple_node_cache);
}

/* Return the allocated count of the maple node cache. */
unsigned int mt_nr_allocated(void)
{
        return kmem_cache_nr_allocated(maple_node_cache);
}

/* Nothing to shrink in this build; kept so callers link either way. */
void mt_cache_shrink(void)
{
}
#else
/*
 * mt_cache_shrink() - For testing, don't use this.
 *
 * Certain testcases can trigger an OOM when combined with other memory
 * debugging configuration options.  This function is used to reduce the
 * possibility of an out of memory event due to kmem_cache objects remaining
 * around for longer than usual.  It simply shrinks the maple node cache.
 */
void mt_cache_shrink(void)
{
        kmem_cache_shrink(maple_node_cache);
}
EXPORT_SYMBOL_GPL(mt_cache_shrink);

#endif /* not defined __KERNEL__ */
/*
 * mas_get_slot() - Get the entry in the maple state node stored at @offset.
 * @mas: The maple state
 * @offset: The offset into the slot array to fetch.
 *
 * Return: The entry stored at @offset.
 */
static inline struct maple_enode *mas_get_slot(struct ma_state *mas,
                unsigned char offset)
{
        return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)),
                        offset);
}

/*
 * Depth first search, post-order
 *
 * Advance @mas by one step of the traversal, with @max handed to
 * mas_next_node() as its limit.  The function returns right away if
 * mas_next_node() did not flag an overflow, or if the starting node is the
 * root.  Otherwise it goes up to the parent and keeps calling
 * mas_prev_node() until an underflow is flagged, leaving @mas on the last
 * node seen before that.
 * NOTE(review): presumably overflow/underflow mean "no further node in that
 * direction" -- confirm against mas_next_node()/mas_prev_node().
 */
static void mas_dfs_postorder(struct ma_state *mas, unsigned long max)
{

        struct maple_enode *p, *mn = mas->node;
        unsigned long p_min, p_max;

        mas_next_node(mas, mas_mn(mas), max);
        if (!mas_is_overflow(mas))
                return;

        if (mte_is_root(mn))
                return;

        /* Go back to the starting node, then up to its parent. */
        mas->node = mn;
        mas_ascend(mas);
        do {
                /* Remember the current node and range before stepping back. */
                p = mas->node;
                p_min = mas->min;
                p_max = mas->max;
                mas_prev_node(mas, 0);
        } while (!mas_is_underflow(mas));

        /* Restore the last position recorded before the underflow. */
        mas->node = p;
        mas->max = p_max;
        mas->min = p_min;
}

/* Tree validations */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format);
static void mt_dump_range(unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        static const char spaces[] = "                                ";

        switch(format) {
        case mt_dump_hex:
                if (min == max)
                        pr_info("%.*s%lx: ", depth * 2, spaces, min);
                else
                        pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max);
                break;
        case mt_dump_dec:
                if (min == max)
                        pr_info("%.*s%lu: ", depth * 2, spaces, min);
                else
                        pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max);
        }
}

/*
 * mt_dump_entry() - Print one entry with its range prefix.
 * @entry: The entry to describe.
 * @min: First index covered by the entry.
 * @max: Last index covered by the entry.
 * @depth: Nesting depth used for indentation.
 * @format: Number format used for the range prefix.
 *
 * The entry is classified in this order: value entry, zero entry, reserved
 * entry, and finally anything else, which is printed as a plain pointer.
 */
static void mt_dump_entry(void *entry, unsigned long min, unsigned long max,
                          unsigned int depth, enum mt_dump_format format)
{
        mt_dump_range(min, max, depth, format);

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry),
                        xa_to_value(entry), entry);
                return;
        }

        if (xa_is_zero(entry)) {
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
                return;
        }

        if (mt_is_reserved(entry)) {
                pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry);
                return;
        }

        pr_cont(PTR_FMT "\n", entry);
}

/*
 * mt_dump_range64() - Dump a node laid out as struct maple_range_64.
 * @mt: The maple tree the node belongs to.
 * @entry: The encoded node.
 * @min: Lowest index covered by the node.
 * @max: Highest index covered by the node.
 * @depth: Nesting depth of the node, used for indentation.
 * @format: mt_dump_hex or mt_dump_dec.
 *
 * First prints the raw slot/pivot contents on one line, then walks the slots
 * and dumps each one (as an entry for leaves, as a child node otherwise)
 * together with the range it covers.
 */
static void mt_dump_range64(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_range_64 *node = &mte_to_node(entry)->mr64;
        bool leaf = mte_is_leaf(entry);
        unsigned long first = min;
        int i;

        pr_cont(" contents: ");
        /* Every slot but the last is printed together with its pivot. */
        for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) {
                switch(format) {
                case mt_dump_hex:
                        pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]);
                }
        }
        /* The final slot is printed on its own, without a pivot. */
        pr_cont(PTR_FMT "\n", node->slot[i]);
        for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) {
                /* The last slot ends at the node's max; others at their pivot. */
                unsigned long last = max;

                if (i < (MAPLE_RANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i] && max != mt_node_max(entry))
                        break;
                /* A zero pivot past the first slot ends the walk. */
                if (last == 0 && i > 0)
                        break;
                if (leaf)
                        mt_dump_entry(mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);
                else if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                /* The node's maximum has been reached. */
                if (last == max)
                        break;
                /* A pivot beyond the node's maximum is reported as an error. */
                if (last > max) {
                        switch(format) {
                        case mt_dump_hex:
                                pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                }
                /* The next slot's range starts right after this one. */
                first = last + 1;
        }
}

/*
 * mt_dump_arange64() - Dump a node laid out as struct maple_arange_64.
 * @mt: The maple tree the node belongs to.
 * @entry: The encoded node.
 * @min: Lowest index covered by the node.
 * @max: Highest index covered by the node.
 * @depth: Nesting depth of the node, used for indentation.
 * @format: mt_dump_hex or mt_dump_dec.
 *
 * Prints the gap array, the node metadata (end and gap offset) and the raw
 * slot/pivot contents on one line, then dumps every populated child together
 * with the range it covers.
 */
static void mt_dump_arange64(const struct maple_tree *mt, void *entry,
        unsigned long min, unsigned long max, unsigned int depth,
        enum mt_dump_format format)
{
        struct maple_arange_64 *node = &mte_to_node(entry)->ma64;
        unsigned long first = min;
        int i;

        pr_cont(" contents: ");
        /* The gap recorded for each slot. */
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont("%lx ", node->gap[i]);
                        break;
                case mt_dump_dec:
                        pr_cont("%lu ", node->gap[i]);
                }
        }
        /* The node metadata: end and gap offset. */
        pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap);
        /* Every slot but the last is printed together with its pivot. */
        for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) {
                switch (format) {
                case mt_dump_hex:
                        pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]);
                        break;
                case mt_dump_dec:
                        pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]);
                }
        }
        /* The final slot is printed on its own, without a pivot. */
        pr_cont(PTR_FMT "\n", node->slot[i]);
        for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) {
                /* The last slot ends at the node's max; others at their pivot. */
                unsigned long last = max;

                if (i < (MAPLE_ARANGE64_SLOTS - 1))
                        last = node->pivot[i];
                else if (!node->slot[i])
                        break;
                /* A zero pivot past the first slot ends the walk. */
                if (last == 0 && i > 0)
                        break;
                if (node->slot[i])
                        mt_dump_node(mt, mt_slot(mt, node->slot, i),
                                        first, last, depth + 1, format);

                /* The node's maximum has been reached. */
                if (last == max)
                        break;
                /* A pivot beyond the node's maximum is reported as an error. */
                if (last > max) {
                        switch(format) {
                        case mt_dump_hex:
                                pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n",
                                        node, last, max, i);
                                break;
                        case mt_dump_dec:
                                pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n",
                                        node, last, max, i);
                        }
                }
                /* The next slot's range starts right after this one. */
                first = last + 1;
        }
}

/*
 * mt_dump_node() - Dump one node and, through the per-type helpers, its
 * subtree.
 * @mt: The maple tree the node belongs to.
 * @entry: The encoded node.
 * @min: Lowest index covered by the node.
 * @max: Highest index covered by the node.
 * @depth: Nesting depth of the node, used for indentation.
 * @format: mt_dump_hex or mt_dump_dec.
 *
 * Prints a header line with the node address, depth, type and parent, then
 * dispatches on the node type.  Unrecognised types are reported as such.
 */
static void mt_dump_node(const struct maple_tree *mt, void *entry,
                unsigned long min, unsigned long max, unsigned int depth,
                enum mt_dump_format format)
{
        struct maple_node *mn = mte_to_node(entry);
        unsigned int node_type = mte_node_type(entry);
        unsigned int slot;

        mt_dump_range(min, max, depth, format);

        pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, mn,
                depth, node_type, mn ? mn->parent : NULL);

        if (node_type == maple_dense) {
                pr_cont("\n");
                /* One index per slot; flag the ones lying beyond @max. */
                for (slot = 0; slot < MAPLE_NODE_SLOTS; slot++) {
                        if (min + slot > max)
                                pr_cont("OUT OF RANGE: ");
                        mt_dump_entry(mt_slot(mt, mn->slot, slot),
                                        min + slot, min + slot, depth, format);
                }
        } else if (node_type == maple_leaf_64 || node_type == maple_range_64) {
                mt_dump_range64(mt, entry, min, max, depth, format);
        } else if (node_type == maple_arange_64) {
                mt_dump_arange64(mt, entry, min, max, depth, format);
        } else {
                pr_cont(" UNKNOWN TYPE\n");
        }
}

/*
 * mt_dump() - Print the contents of a maple tree to the kernel log.
 * @mt: The maple tree to dump.
 * @format: mt_dump_hex or mt_dump_dec, selecting how indices are printed.
 *
 * Prints a summary line (tree address, flags, height, root), then either the
 * whole node hierarchy, the single root entry, or "(empty)".
 */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format)
{
        void *root = rcu_dereference_check(mt->ma_root, mt_locked(mt));

        pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n",
                 mt, mt->ma_flags, mt_height(mt), root);

        if (xa_is_node(root)) {
                mt_dump_node(mt, root, 0, mt_node_max(root), 0, format);
                return;
        }

        /* No nodes: the root is either a lone entry or nothing at all. */
        if (root)
                mt_dump_entry(root, 0, 0, 0, format);
        else
                pr_info("(empty)\n");
}
EXPORT_SYMBOL_GPL(mt_dump);

/*
 * mas_validate_gaps() - Check the gap tracking of the node in @mas.
 * @mas: The maple state positioned on the node to validate.
 *
 * Calculate the maximum gap in a node and check if that's what is reported in
 * the parent (unless root).  For maple_arange_64 nodes, also check that every
 * slot is populated, that no recorded gap is larger than the range of its
 * slot, that the gap offset in the metadata refers to the largest gap, and
 * that the gaps beyond the last used slot are zero.
 */
static void mas_validate_gaps(struct ma_state *mas)
{
        struct maple_enode *mte = mas->node;
        struct maple_node *p_mn, *node = mte_to_node(mte);
        enum maple_type mt = mte_node_type(mas->node);
        unsigned long gap = 0, max_gap = 0;
        unsigned long p_end, p_start = mas->min;
        unsigned char p_slot, offset;
        unsigned long *gaps = NULL;
        unsigned long *pivots = ma_pivots(node, mt);
        unsigned int i;

        if (ma_is_dense(mt)) {
                /* Dense node: a gap is a run of consecutive empty slots. */
                for (i = 0; i < mt_slot_count(mte); i++) {
                        if (mas_get_slot(mas, i)) {
                                if (gap > max_gap)
                                        max_gap = gap;
                                gap = 0;
                                continue;
                        }
                        gap++;
                }
                /*
                 * A run of empty slots that reaches the end of the node is
                 * never closed by an occupied slot, so account for it here.
                 */
                if (gap > max_gap)
                        max_gap = gap;
                goto counted;
        }

        gaps = ma_gaps(node, mt);
        for (i = 0; i < mt_slot_count(mte); i++) {
                p_end = mas_safe_pivot(mas, pivots, i, mt);

                if (!gaps) {
                        /* No gap array: an empty slot is a gap of its range. */
                        if (!mas_get_slot(mas, i))
                                gap = p_end - p_start + 1;
                } else {
                        void *entry = mas_get_slot(mas, i);

                        gap = gaps[i];
                        MT_BUG_ON(mas->tree, !entry);

                        /* A gap cannot be larger than the range of its slot. */
                        if (gap > p_end - p_start + 1) {
                                pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n",
                                       mas_mn(mas), i, gap, p_end, p_start,
                                       p_end - p_start + 1);
                                MT_BUG_ON(mas->tree, gap > p_end - p_start + 1);
                        }
                }

                if (gap > max_gap)
                        max_gap = gap;

                p_start = p_end + 1;
                /* The last slot in use has been reached. */
                if (p_end >= mas->max)
                        break;
        }

counted:
        if (mt == maple_arange_64) {
                MT_BUG_ON(mas->tree, !gaps);
                offset = ma_meta_gap(node);
                /* The recorded gap offset must lie within the slots in use. */
                if (offset > i) {
                        pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset);
                        MT_BUG_ON(mas->tree, 1);
                }

                if (gaps[offset] != max_gap) {
                        pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n",
                               node, offset, max_gap);
                        MT_BUG_ON(mas->tree, 1);
                }

                /* Gaps after the last used slot must be zero. */
                for (i++ ; i < mt_slot_count(mte); i++) {
                        if (gaps[i] != 0) {
                                pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n",
                                       node, i);
                                MT_BUG_ON(mas->tree, 1);
                        }
                }
        }

        if (mte_is_root(mte))
                return;

        /* The parent must record this node's largest gap in this node's slot. */
        p_slot = mte_parent_slot(mas->node);
        p_mn = mte_parent(mte);
        MT_BUG_ON(mas->tree, max_gap > mas->max);
        if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) {
                pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap);
                mt_dump(mas->tree, mt_dump_hex);
                MT_BUG_ON(mas->tree, 1);
        }
}

/*
 * Check that the parent of the current node references it exactly once, and
 * in the slot recorded in the node's own parent pointer.  The root node has
 * no parent and is skipped.
 */
static void mas_validate_parent_slot(struct ma_state *mas)
{
        struct maple_enode *self = mas->node;
        struct maple_node *parent;
        enum maple_type p_type;
        unsigned char p_slot;
        void __rcu **slots;
        int idx;

        if (mte_is_root(self))
                return;

        p_slot = mte_parent_slot(self);
        p_type = mas_parent_type(mas, self);
        parent = mte_parent(self);
        slots = ma_slots(parent, p_type);

        /* A node must never be its own parent. */
        MT_BUG_ON(mas->tree, mas_mn(mas) == parent);

        /* Walk every parent slot looking for a missing or duplicate entry */
        for (idx = 0; idx < mt_slots[p_type]; idx++) {
                struct maple_enode *entry = mas_slot(mas, slots, idx);

                if (idx != p_slot) {
                        /* Any other slot pointing at this node is a duplicate. */
                        if (entry != self)
                                continue;

                        pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n",
                               mas_mn(mas), parent, idx, p_slot);
                        MT_BUG_ON(mas->tree, entry == self);
                        continue;
                }

                /* The recorded parent slot has to hold this node. */
                if (entry != self)
                        pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n",
                                parent, idx, mas_mn(mas));
                MT_BUG_ON(mas->tree, entry != self);
        }
}

/*
 * For a non-leaf node, check that every occupied slot holds a child whose
 * parent pointer refers back to this node and whose recorded parent slot
 * matches the slot it was found in.  The walk stops at the slot whose pivot
 * equals mas->max.  Leaf nodes are skipped.
 */
static void mas_validate_child_slot(struct ma_state *mas)
{
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type);
        struct maple_enode *child;
        unsigned char i;

        if (mte_is_leaf(mas->node))
                return;

        for (i = 0; i < mt_slots[type]; i++) {
                child = mas_slot(mas, slots, i);

                if (!child) {
                        pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n",
                               mas_mn(mas), i);
                        MT_BUG_ON(mas->tree, 1);
                        /*
                         * NOTE(review): MT_BUG_ON() may only report and
                         * return (its definition is not visible here), so do
                         * not hand the NULL child to mte_parent_slot() or
                         * mte_parent() below.
                         */
                } else {
                        if (mte_parent_slot(child) != i) {
                                pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n",
                                       mas_mn(mas), i, mte_to_node(child),
                                       mte_parent_slot(child));
                                MT_BUG_ON(mas->tree, 1);
                        }

                        if (mte_parent(child) != mte_to_node(mas->node)) {
                                pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n",
                                       mte_to_node(child), mte_parent(child),
                                       mte_to_node(mas->node));
                                MT_BUG_ON(mas->tree, 1);
                        }
                }

                /* The slot whose pivot is the node maximum is the last one in use. */
                if (i < mt_pivots[type] && pivots[i] == mas->max)
                        break;
        }
}

/*
 * Validate all pivots are within mas->min and mas->max, check metadata ends
 * where the maximum ends and ensure there are no slots or pivots set outside
 * of the end of the data.
 *
 * The function works in three steps:
 * 1. Walk the pivots up to the one equal to mas->max, warning about a zero
 *    pivot past slot 0, a decreasing pivot, or a pivot outside
 *    [mas->min, mas->max].
 * 2. Check that mas_data_end() matches the offset where that walk stopped.
 * 3. Check that the remaining slots and pivots are unused.
 */
static void mas_validate_limits(struct ma_state *mas)
{
        int i;
        unsigned long prev_piv = 0;
        enum maple_type type = mte_node_type(mas->node);
        void __rcu **slots = ma_slots(mte_to_node(mas->node), type);
        unsigned long *pivots = ma_pivots(mas_mn(mas), type);

        for (i = 0; i < mt_slots[type]; i++) {
                unsigned long piv;

                piv = mas_safe_pivot(mas, pivots, i, type);

                /* Only the first pivot may legitimately be zero. */
                if (!piv && (i != 0)) {
                        pr_err("Missing node limit pivot at " PTR_FMT "[%u]\n",
                               mas_mn(mas), i);
                        MAS_WARN_ON(mas, 1);
                }

                if (prev_piv > piv) {
                        pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n",
                                mas_mn(mas), i, piv, prev_piv);
                        MAS_WARN_ON(mas, piv < prev_piv);
                }

                if (piv < mas->min) {
                        pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i,
                                piv, mas->min);
                        MAS_WARN_ON(mas, piv < mas->min);
                }
                if (piv > mas->max) {
                        pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i,
                                piv, mas->max);
                        MAS_WARN_ON(mas, piv > mas->max);
                }
                prev_piv = piv;
                /* Reaching the node maximum marks the last slot in use. */
                if (piv == mas->max)
                        break;
        }

        if (mas_data_end(mas) != i) {
                pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n",
                       mas_mn(mas), mas_data_end(mas), i);
                MT_BUG_ON(mas->tree, 1);
        }

        /* Everything after the last used slot should be empty. */
        for (i += 1; i < mt_slots[type]; i++) {
                void *entry = mas_slot(mas, slots, i);

                /*
                 * An entry in the very last slot is tolerated.
                 * NOTE(review): presumably because that slot can hold node
                 * metadata rather than data - confirm.
                 */
                if (entry && (i != mt_slots[type] - 1)) {
                        pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n",
                               mas_mn(mas), i, entry);
                        MT_BUG_ON(mas->tree, entry != NULL);
                }

                if (i < mt_pivots[type]) {
                        unsigned long piv = pivots[i];

                        if (!piv)
                                continue;

                        /*
                         * A stale pivot is always reported, but only counted
                         * as a warning when it is not the last pivot.
                         */
                        pr_err(PTR_FMT "[%u] should not have piv %lu\n",
                               mas_mn(mas), i, piv);
                        MAS_WARN_ON(mas, i < mt_pivots[type] - 1);
                }
        }
}

/*
 * Walk every leaf slot of the tree from the leftmost leaf and complain when
 * two consecutive slots are both NULL, including across a leaf boundary.
 * Trees that are empty or hold only a root pointer are skipped.
 */
static void mt_validate_nulls(struct maple_tree *mt)
{
        /* Start with a non-NULL sentinel so the first slot cannot trip. */
        void *prev = (void *)1;
        unsigned char offset = 0;
        void __rcu **slots;
        MA_STATE(mas, mt, 0, 0);

        mas_start(&mas);
        if (mas_is_none(&mas) || (mas_is_ptr(&mas)))
                return;

        /* Descend to the first leaf. */
        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node));
        for (;;) {
                void *entry = mas_slot(&mas, slots, offset);
                int double_null = !prev && !entry;

                if (double_null) {
                        pr_err("Sequential nulls end at " PTR_FMT "[%u]\n",
                                mas_mn(&mas), offset);
                }
                MT_BUG_ON(mt, double_null);
                prev = entry;

                if (offset != mas_data_end(&mas)) {
                        offset++;
                } else {
                        /* Last slot of this leaf: move on to the next node. */
                        mas_next_node(&mas, mas_mn(&mas), ULONG_MAX);
                        if (mas_is_overflow(&mas))
                                return;
                        offset = 0;
                        slots = ma_slots(mte_to_node(mas.node),
                                         mte_node_type(mas.node));
                }

                if (mas_is_overflow(&mas))
                        break;
        }
}

/*
 * Validate a maple tree.  Starting at the leftmost leaf, every node reached
 * by mas_dfs_postorder() is checked for:
 * 1. Not being marked dead.
 * 2. Holding at least the minimum number of entries, unless it is the root.
 * 3. Being referenced correctly from its parent (mas_validate_parent_slot()).
 * 4. Having its pivots within mas->min to mas->max (mas_validate_limits()).
 * 5. Having children that point back at it (mas_validate_child_slot()).
 * 6. Having gaps correctly set, for allocation trees (mas_validate_gaps()).
 * Finally the leaves are scanned for sequential NULL entries.
 *
 * Nothing is checked unless mas_start() leaves the state active.
 */
void mt_validate(struct maple_tree *mt)
        __must_hold(mas->tree->ma_lock)
{
        MA_STATE(mas, mt, 0, 0);

        mas_start(&mas);
        if (!mas_is_active(&mas))
                return;

        while (!mte_is_leaf(mas.node))
                mas_descend(&mas);

        for (; !mas_is_overflow(&mas); mas_dfs_postorder(&mas, ULONG_MAX)) {
                unsigned char end;
                int undersized;

                MAS_WARN_ON(&mas, mte_dead_node(mas.node));

                end = mas_data_end(&mas);
                undersized = (end < mt_min_slot_count(mas.node)) &&
                             (!mte_is_root(mas.node));
                if (MAS_WARN_ON(&mas, undersized)) {
                        pr_err("Invalid size %u of " PTR_FMT "\n",
                               end, mas_mn(&mas));
                }

                mas_validate_parent_slot(&mas);
                mas_validate_limits(&mas);
                mas_validate_child_slot(&mas);
                if (mt_is_alloc(mt))
                        mas_validate_gaps(&mas);
        }

        mt_validate_nulls(mt);
}
EXPORT_SYMBOL_GPL(mt_validate);

/*
 * mas_dump() - Print the contents of a maple state with pr_err().
 * @mas: The maple state to print
 *
 * Prints the tree and encoded node pointers, the status and store type by
 * name, then offset/end, index/last, min/max, alloc, depth and flags.  A
 * status or store type that matches no case below prints no name.
 */
void mas_dump(const struct ma_state *mas)
{
        pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ",
               mas->tree, mas->node);
        /* Name of the current state-machine status. */
        switch (mas->status) {
        case ma_active:
                pr_err("(ma_active)");
                break;
        case ma_none:
                pr_err("(ma_none)");
                break;
        case ma_root:
                pr_err("(ma_root)");
                break;
        case ma_start:
                pr_err("(ma_start) ");
                break;
        case ma_pause:
                pr_err("(ma_pause) ");
                break;
        case ma_overflow:
                pr_err("(ma_overflow) ");
                break;
        case ma_underflow:
                pr_err("(ma_underflow) ");
                break;
        case ma_error:
                pr_err("(ma_error) ");
                break;
        }

        /* Name of the store type recorded in the state. */
        pr_err("Store Type: ");
        switch (mas->store_type) {
        case wr_invalid:
                pr_err("invalid store type\n");
                break;
        case wr_new_root:
                pr_err("new_root\n");
                break;
        case wr_store_root:
                pr_err("store_root\n");
                break;
        case wr_exact_fit:
                pr_err("exact_fit\n");
                break;
        case wr_split_store:
                pr_err("split_store\n");
                break;
        case wr_slot_store:
                pr_err("slot_store\n");
                break;
        case wr_append:
                pr_err("append\n");
                break;
        case wr_node_store:
                pr_err("node_store\n");
                break;
        case wr_spanning_store:
                pr_err("spanning_store\n");
                break;
        case wr_rebalance:
                pr_err("rebalance\n");
                break;
        }

        pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end,
               mas->index, mas->last);
        pr_err("     min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n",
               mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags);
        /* An index beyond last is an inverted range; flag it for the reader. */
        if (mas->index > mas->last)
                pr_err("Check index & last\n");
}
EXPORT_SYMBOL_GPL(mas_dump);

void mas_wr_dump(const struct ma_wr_state *wr_mas)
{
        pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n",
               wr_mas->node, wr_mas->r_min, wr_mas->r_max);
        pr_err("        type=%u off_end=%u, node_end=%u, end_piv=%lx\n",
               wr_mas->type, wr_mas->offset_end, wr_mas->mas->end,
               wr_mas->end_piv);
}
EXPORT_SYMBOL_GPL(mas_wr_dump);

#endif /* CONFIG_DEBUG_MAPLE_TREE */



















   22 


















   22 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ipi

#if !defined(_TRACE_IPI_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_IPI_H

#include <linux/tracepoint.h>

/**
 * ipi_raise - called when a smp cross call is made
 *
 * @mask: mask of recipient CPUs for the IPI
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string.
 *
 * The recipient mask is copied into the event as a bitmask of
 * nr_cpumask_bits bits.  Only the @reason pointer is recorded, and it is
 * dereferenced with %s when the event is printed.
 */
TRACE_EVENT(ipi_raise,

        TP_PROTO(const struct cpumask *mask, const char *reason),

        TP_ARGS(mask, reason),

        TP_STRUCT__entry(
                __bitmask(target_cpus, nr_cpumask_bits)
                /* Pointer only, the string itself is not copied. */
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __assign_bitmask(target_cpus, cpumask_bits(mask), nr_cpumask_bits);
                __entry->reason = reason;
        ),

        TP_printk("target_mask=%s (%s)", __get_bitmask(target_cpus), __entry->reason)
);

/**
 * ipi_send_cpu - trace event recording a target CPU, a callsite and a callback
 *
 * @cpu: CPU number stored in the event
 * @callsite: address stored as the callsite; printed symbolically with %pS
 * @callback: pointer stored as the callback; printed symbolically with %pS
 *
 * NOTE(review): by its name this is emitted when an IPI is sent to a single
 * CPU; the call sites are not visible here - confirm.
 */
TRACE_EVENT(ipi_send_cpu,

        TP_PROTO(const unsigned int cpu, unsigned long callsite, void *callback),

        TP_ARGS(cpu, callsite, callback),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __entry->cpu = cpu;
                /* Stored as a pointer so that %pS can resolve it. */
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpu=%u callsite=%pS callback=%pS",
                  __entry->cpu, __entry->callsite, __entry->callback)
);

/**
 * ipi_send_cpumask - trace event recording a CPU mask, a callsite and a callback
 *
 * @cpumask: mask of CPUs, copied into the event
 * @callsite: address stored as the callsite; printed symbolically with %pS
 * @callback: pointer stored as the callback; printed symbolically with %pS
 *
 * NOTE(review): by its name this is emitted when an IPI is sent to a set of
 * CPUs; the call sites are not visible here - confirm.
 */
TRACE_EVENT(ipi_send_cpumask,

        TP_PROTO(const struct cpumask *cpumask, unsigned long callsite, void *callback),

        TP_ARGS(cpumask, callsite, callback),

        TP_STRUCT__entry(
                __cpumask(cpumask)
                __field(void *, callsite)
                __field(void *, callback)
        ),

        TP_fast_assign(
                __assign_cpumask(cpumask, cpumask_bits(cpumask));
                /* Stored as a pointer so that %pS can resolve it. */
                __entry->callsite = (void *)callsite;
                __entry->callback = callback;
        ),

        TP_printk("cpumask=%s callsite=%pS callback=%pS",
                  __get_cpumask(cpumask), __entry->callsite, __entry->callback)
);

/*
 * ipi_handler - event class shared by the ipi_entry and ipi_exit events
 *
 * Records only the @reason pointer and prints it as "(%s)".  The string is
 * not copied into the event.
 */
DECLARE_EVENT_CLASS(ipi_handler,

        TP_PROTO(const char *reason),

        TP_ARGS(reason),

        TP_STRUCT__entry(
                __field(const char *, reason)
        ),

        TP_fast_assign(
                __entry->reason = reason;
        ),

        TP_printk("(%s)", __entry->reason)
);

/**
 * ipi_entry - called immediately before the IPI handler
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise
 * for that IPI.
 *
 * This is an instance of the ipi_handler event class, so it records and
 * prints @reason only.
 */
DEFINE_EVENT(ipi_handler, ipi_entry,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

/**
 * ipi_exit - called immediately after the IPI handler returns
 *
 * @reason: string identifying the IPI purpose
 *
 * It is necessary for @reason to be a static string declared with
 * __tracepoint_string, ideally the same as used with trace_ipi_raise for
 * that IPI.
 *
 * This is an instance of the ipi_handler event class, so it records and
 * prints @reason only.
 */
DEFINE_EVENT(ipi_handler, ipi_exit,

        TP_PROTO(const char *reason),

        TP_ARGS(reason)
);

#endif /* _TRACE_IPI_H */

/* This part must be outside protection */
#include <trace/define_trace.h>







































































































































 1041 






 1041 















































































   22 




























































  208 















































































































































































































































































































 1041 











































































































































































































































































































































































































































































































































































 1041 






























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPUs in a system, one bit position per CPU number.  In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/cleanup.h>
#include <linux/kernel.h>
#include <linux/bitmap.h>
#include <linux/cpumask_types.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/gfp_types.h>
#include <linux/numa.h>

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 *
 * Expands to two comma-separated arguments: nr_cpu_ids as the field width,
 * followed by the bitmap of @maskp.
 */
#define cpumask_pr_args(maskp)                nr_cpu_ids, cpumask_bits(maskp)

/*
 * nr_cpu_ids is a compile-time constant equal to NR_CPUS on uniprocessor
 * builds and when CONFIG_FORCE_NR_CPUS is set; otherwise it is a variable
 * defined elsewhere.
 */
#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif

/**
 * set_nr_cpu_ids - set the number of CPU IDs
 * @nr: the requested value of nr_cpu_ids
 *
 * When NR_CPUS is 1 or CONFIG_FORCE_NR_CPUS is set, nothing is stored and a
 * warning is raised if @nr differs from nr_cpu_ids.  Otherwise @nr is
 * assigned to nr_cpu_ids.
 */
#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
static __always_inline void set_nr_cpu_ids(unsigned int nr)
{
        WARN_ON(nr != nr_cpu_ids);
}
#else
static __always_inline void set_nr_cpu_ids(unsigned int nr)
{
        nr_cpu_ids = nr;
}
#endif

/*
 * We have several different "preferred sizes" for the cpumask
 * operations, depending on operation.
 *
 * For example, the bitmap scanning and operating operations have
 * optimized routines that work for the single-word case, but only when
 * the size is constant. So if NR_CPUS fits in one single word, we are
 * better off using that small constant, in order to trigger the
 * optimized bit finding. That is 'small_cpumask_size'.
 *
 * The clearing and copying operations will similarly perform better
 * with a constant size, but we limit that size arbitrarily to four
 * words. We call this 'large_cpumask_size'.
 *
 * Finally, some operations just want the exact limit, either because
 * they set bits or just don't have any faster fixed-sized versions. We
 * call this just 'nr_cpumask_bits'.
 *
 * Note that these optional constants are always guaranteed to be at
 * least as big as 'nr_cpu_ids' itself is, and all our cpumask
 * allocations are at least that size (see cpumask_size()). The
 * optimization comes from being able to potentially use a compile-time
 * constant instead of a run-time generated exact number of CPUs.
 *
 * The macros defined below carry a '_bits' suffix: small_cpumask_bits,
 * large_cpumask_bits and nr_cpumask_bits.
 */
#if NR_CPUS <= BITS_PER_LONG
  /* One word is enough: both sizes are the constant NR_CPUS. */
  #define small_cpumask_bits ((unsigned int)NR_CPUS)
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#elif NR_CPUS <= 4*BITS_PER_LONG
  /* Up to four words: only the large size stays constant. */
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits ((unsigned int)NR_CPUS)
#else
  /* More than four words: both sizes follow nr_cpu_ids. */
  #define small_cpumask_bits nr_cpu_ids
  #define large_cpumask_bits nr_cpu_ids
#endif
#define nr_cpumask_bits nr_cpu_ids

/*
 * The following particular system cpumasks and operations manage
 * possible, present, active and online cpus.
 *
 *     cpu_possible_mask- has bit 'cpu' set iff cpu is populatable
 *     cpu_present_mask - has bit 'cpu' set iff cpu is populated
 *     cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online
 *     cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
 *     cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
 *
 *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
 *
 *  The cpu_possible_mask is fixed at boot time, as the set of CPU IDs
 *  that could possibly be plugged in at any time during the
 *  life of that system boot.  The cpu_present_mask is dynamic(*),
 *  representing which CPUs are currently plugged in.  And
 *  cpu_online_mask is the dynamic subset of cpu_present_mask,
 *  indicating those CPUs available for scheduling.
 *
 *  If HOTPLUG is enabled, then cpu_present_mask varies dynamically,
 *  depending on what ACPI reports as currently plugged in, otherwise
 *  cpu_present_mask is just a copy of cpu_possible_mask.
 *
 *  (*) Well, cpu_present_mask is dynamic in the hotplug case.  If not
 *      hotplug, it's a copy of cpu_possible_mask, hence fixed at boot.
 *
 * Subtleties:
 * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode
 *    assumption that their single CPU is online.  The UP
 *    cpu_{online,possible,present}_masks are placebos.  Changing them
 *    will have no useful effect on the following num_*_cpus()
 *    and cpu_*() macros in the UP case.  This ugliness is a UP
 *    optimization - don't waste any instructions or memory references
 *    asking if you're online or how many CPUs there are if there is
 *    only one CPU.
 */

/* Backing storage for the system cpumasks; defined elsewhere. */
extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_enabled_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
/*
 * Accessors: each macro yields a pointer-to-const to the matching
 * __cpu_*_mask, so the masks cannot be modified through these names.
 */
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_enabled_mask   ((const struct cpumask *)&__cpu_enabled_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)

/* NOTE(review): presumably a count of the CPUs in __cpu_online_mask - confirm. */
extern atomic_t __num_online_cpus;

/* NOTE(review): by its name, the CPUs that have been booted at least once - confirm. */
extern cpumask_t cpus_booted_once_mask;

/**
 * cpu_max_bits_warn - warn once when a cpu number is out of range
 * @cpu: the cpu number to check
 * @bits: the number of valid bits
 *
 * With CONFIG_DEBUG_PER_CPU_MAPS, warns once if @cpu >= @bits.  Without it,
 * this does nothing.
 */
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
        WARN_ON_ONCE(cpu >= bits);
}
#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
}
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */

/*
 * verify cpu argument to cpumask_* operators
 *
 * Passes @cpu to cpu_max_bits_warn() with small_cpumask_bits as the limit and
 * returns @cpu unchanged, so the call can be used inline in an expression.
 */
static __always_inline unsigned int cpumask_check(unsigned int cpu)
{
        cpu_max_bits_warn(cpu, small_cpumask_bits);
        return cpu;
}

/**
 * cpumask_first - find the lowest-numbered cpu set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the first set cpu, or a value >= nr_cpu_ids if the mask is empty.
 */
static __always_inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_first_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_first_zero - find the lowest-numbered cpu not set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the first unset cpu, or a value >= nr_cpu_ids if every cpu is set.
 */
static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_first_zero_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_first_and - find the first cpu set in both *srcp1 and *srcp2
 * @srcp1: the first cpumask
 * @srcp2: the second cpumask
 *
 * Return: the first common cpu, or a value >= nr_cpu_ids if the masks share
 * no cpu.  See also cpumask_next_and().
 */
static __always_inline
unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return find_first_and_bit(bits1, bits2, small_cpumask_bits);
}

/**
 * cpumask_first_and_and - find the first cpu set in *srcp1, *srcp2 and *srcp3
 * @srcp1: the first cpumask
 * @srcp2: the second cpumask
 * @srcp3: the third cpumask
 *
 * Return: the first cpu common to all three masks, or a value >= nr_cpu_ids
 * if there is none.
 */
static __always_inline
unsigned int cpumask_first_and_and(const struct cpumask *srcp1,
                                   const struct cpumask *srcp2,
                                   const struct cpumask *srcp3)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        const unsigned long *bits3 = cpumask_bits(srcp3);

        return find_first_and_and_bit(bits1, bits2, bits3, small_cpumask_bits);
}

/**
 * cpumask_last - find the highest-numbered cpu set in a cpumask
 * @srcp: the cpumask to search
 *
 * Return: the last set cpu, or a value >= nr_cpu_ids if the mask is empty.
 */
static __always_inline unsigned int cpumask_last(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_last_bit(bits, small_cpumask_bits);
}

/**
 * cpumask_next - find the next cpu set in a cpumask
 * @n: the cpu before the start of the search (the result is > @n); -1 starts
 *     the search at cpu 0
 * @srcp: the cpumask to search
 *
 * Return: the next set cpu, or a value >= nr_cpu_ids if there is none.
 */
static __always_inline
unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* Only real cpu numbers are range-checked; -1 is a legal start. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit(bits, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_zero - find the next cpu not set in a cpumask
 * @n: the cpu before the start of the search (the result is > @n); -1 starts
 *     the search at cpu 0
 * @srcp: the cpumask to search
 *
 * Return: the next unset cpu, or a value >= nr_cpu_ids if there is none.
 */
static __always_inline
unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        /* Only real cpu numbers are range-checked; -1 is a legal start. */
        if (n != -1)
                cpumask_check(n);

        return find_next_zero_bit(bits, small_cpumask_bits, n + 1);
}

/*
 * With NR_CPUS == 1 the three helpers below are trivial inlines; otherwise
 * they are only declared here and implemented out of line.
 */
#if NR_CPUS == 1
/* Uniprocessor: there is only one valid CPU */
static __always_inline
unsigned int cpumask_local_spread(unsigned int i, int node)
{
        /* Both arguments are ignored: CPU 0 is the only possible answer. */
        return 0;
}

/* Uniprocessor: same as the first cpu set in both masks. */
static __always_inline
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                                        const struct cpumask *src2p)
{
        return cpumask_first_and(src1p, src2p);
}

/* Uniprocessor: same as the first cpu set in the mask. */
static __always_inline
unsigned int cpumask_any_distribute(const struct cpumask *srcp)
{
        return cpumask_first(srcp);
}
#else
unsigned int cpumask_local_spread(unsigned int i, int node);
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p);
unsigned int cpumask_any_distribute(const struct cpumask *srcp);
#endif /* NR_CPUS */

/**
 * cpumask_next_and - find the next cpu set in both *src1p and *src2p
 * @n: the cpu before the start of the search (the result is > @n); -1 starts
 *     the search at cpu 0
 * @src1p: the first cpumask
 * @src2p: the second cpumask
 *
 * Return: the next common cpu, or a value >= nr_cpu_ids if there is none.
 */
static __always_inline
unsigned int cpumask_next_and(int n, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* Only real cpu numbers are range-checked; -1 is a legal start. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit(bits1, bits2, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_and_wrap - find the next cpu set in both *src1p and *src2p,
 *                         wrapping around to the beginning if needed
 * @n: the cpu before the start of the search (the search starts at @n+1);
 *     -1 starts the search at cpu 0
 * @src1p: the first cpumask
 * @src2p: the second cpumask
 *
 * Return: the next common cpu, wrapped if needed, or a value >= nr_cpu_ids if
 * @src1p & @src2p is empty.
 */
static __always_inline
unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p,
                              const struct cpumask *src2p)
{
        const unsigned long *bits1 = cpumask_bits(src1p);
        const unsigned long *bits2 = cpumask_bits(src2p);

        /* Only real cpu numbers are range-checked; -1 is a legal start. */
        if (n != -1)
                cpumask_check(n);

        return find_next_and_bit_wrap(bits1, bits2, small_cpumask_bits, n + 1);
}

/**
 * cpumask_next_wrap - find the next cpu set in *src, searching from @n+1 and
 *                       wrapping around to the beginning when nothing is found
 * @n: the cpu just before the first candidate (search starts from @n+1);
 *     -1 starts the search at cpu 0
 * @src: cpumask pointer
 *
 * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty.
 */
static __always_inline
unsigned int cpumask_next_wrap(int n, const struct cpumask *src)
{
        const unsigned long *bits = cpumask_bits(src);

        /* Only real cpu numbers are range-checked; -1 is the legal "start" value. */
        if (n != -1)
                cpumask_check(n);

        return find_next_bit_wrap(bits, small_cpumask_bits, n + 1);
}

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * Expands to for_each_set_bit() over the bitmap of @mask, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                                \
        for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits)

/**
 * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 * @start: the start location
 *
 * Expands to for_each_set_bit_wrap() over the bitmap of @mask, bounded by
 * small_cpumask_bits.
 *
 * The implementation does not assume any bit in @mask is set (including @start).
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_wrap(cpu, mask, start)                                \
        for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start)

/**
 * for_each_cpu_and - iterate over every cpu in both masks
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_and(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Expands to for_each_and_bit() over the two bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_and(cpu, mask1, mask2)                                \
        for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
 *                         those present in another.
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer (cpus to visit)
 * @mask2: the second cpumask pointer (cpus to exclude)
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_andnot(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Expands to for_each_andnot_bit() over the two bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_andnot(cpu, mask1, mask2)                                \
        for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_or - iterate over every cpu present in either mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask1: the first cpumask pointer
 * @mask2: the second cpumask pointer
 *
 * This saves a temporary CPU mask in many places.  It is equivalent to:
 *        struct cpumask tmp;
 *        cpumask_or(&tmp, &mask1, &mask2);
 *        for_each_cpu(cpu, &tmp)
 *                ...
 *
 * Expands to for_each_or_bit() over the two bitmaps, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_or(cpu, mask1, mask2)                                \
        for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)

/**
 * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask.
 * @cpu: the (optionally unsigned) integer iterator; its value on entry is
 *       where the iteration starts
 * @mask: the cpumask pointer
 *
 * Expands to for_each_set_bit_from() over the bitmap of @mask, bounded by
 * small_cpumask_bits.
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu_from(cpu, mask)                                \
        for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits)

/**
 * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one.
 * @mask: the cpumask to search
 * @cpu: the cpu to ignore.
 *
 * Often used to find any cpu but smp_processor_id() in a mask.
 * Return: >= nr_cpu_ids if no cpus set.
 */
static __always_inline
unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
{
        unsigned int i;

        cpumask_check(cpu);
        for_each_cpu(i, mask)
                if (i != cpu)
                        break;
        return i;
}

/**
 * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, skipping one cpu.
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 * @cpu: the cpu that must not be returned
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
static __always_inline
unsigned int cpumask_any_and_but(const struct cpumask *mask1,
                                 const struct cpumask *mask2,
                                 unsigned int cpu)
{
        unsigned int first;

        cpumask_check(cpu);
        first = cpumask_first_and(mask1, mask2);

        /*
         * Any cpu other than the excluded one is acceptable; if the first
         * hit is the excluded cpu, continue the search right after it.
         */
        return first != cpu ? first : cpumask_next_and(cpu, mask1, mask2);
}

/**
 * cpumask_nth - look up the Nth set cpu of a cpumask
 * @cpu: which set cpu to find (N), counting from 0
 * @srcp: the cpumask pointer
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return find_nth_bit(bits, small_cpumask_bits, cpumask_check(cpu));
}

/**
 * cpumask_nth_and - look up the Nth cpu that is set in both cpumasks
 * @cpu: which matching cpu to find (N), counting from 0
 * @srcp1: the first cpumask pointer
 * @srcp2: the second cpumask pointer
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);

        return find_nth_and_bit(bits1, bits2, small_cpumask_bits,
                                cpumask_check(cpu));
}

/**
 * cpumask_nth_andnot - look up the Nth cpu set in the 1st cpumask and clear in the 2nd.
 * @cpu: which matching cpu to find (N), counting from 0
 * @srcp1: the cpumask in which the cpu must be set
 * @srcp2: the cpumask in which the cpu must be clear
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2)
{
        const unsigned long *wanted = cpumask_bits(srcp1);
        const unsigned long *excluded = cpumask_bits(srcp2);

        return find_nth_andnot_bit(wanted, excluded, small_cpumask_bits,
                                   cpumask_check(cpu));
}

/**
 * cpumask_nth_and_andnot - look up the Nth cpu set in the 1st and 2nd cpumask, and clear in the 3rd.
 * @cpu: which matching cpu to find (N), counting from 0
 * @srcp1: the first cpumask in which the cpu must be set
 * @srcp2: the second cpumask in which the cpu must be set
 * @srcp3: the cpumask in which the cpu must be clear
 *
 * Return: >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
                                                        const struct cpumask *srcp2,
                                                        const struct cpumask *srcp3)
{
        const unsigned long *bits1 = cpumask_bits(srcp1);
        const unsigned long *bits2 = cpumask_bits(srcp2);
        const unsigned long *excluded = cpumask_bits(srcp3);

        return find_nth_and_andnot_bit(bits1, bits2, excluded,
                                       small_cpumask_bits, cpumask_check(cpu));
}

/*
 * Brace initializer for an NR_CPUS-bit bitmap (an array of
 * BITS_TO_LONGS(NR_CPUS) longs) with every word explicitly zeroed.
 */
#define CPU_BITS_NONE                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL                        \
}

/*
 * Brace initializer for a bitmap with only bit 0 (cpu 0) set; the words
 * that are not named are zero-initialized by the usual C initializer rules.
 */
#define CPU_BITS_CPU0                                                \
{                                                                \
        [0] =  1UL                                                \
}

/**
 * cpumask_set_cpu - mark a cpu as set in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * set_bit() wrapper for cpumasks.
 */
static __always_inline
void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        set_bit(cpumask_check(cpu), bits);
}

/*
 * __cpumask_set_cpu - same as cpumask_set_cpu(), but built on __set_bit()
 * instead of set_bit().
 */
static __always_inline
void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        __set_bit(cpumask_check(cpu), bits);
}


/**
 * cpumask_clear_cpu - mark a cpu as clear in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 *
 * clear_bit() wrapper for cpumasks.
 */
static __always_inline
void cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        clear_bit(cpumask_check(cpu), bits);
}

/*
 * __cpumask_clear_cpu - same as cpumask_clear_cpu(), but built on
 * __clear_bit() instead of clear_bit().
 */
static __always_inline
void __cpumask_clear_cpu(int cpu, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        __clear_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_assign_cpu - set or clear a cpu in a cpumask according to a flag
 * @cpu: cpu number (< nr_cpu_ids)
 * @dstp: the cpumask pointer
 * @value: the value to assign
 *
 * assign_bit() wrapper for cpumasks.
 */
static __always_inline
void cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
{
        unsigned long *bits = cpumask_bits(dstp);

        assign_bit(cpumask_check(cpu), bits, value);
}

/*
 * __cpumask_assign_cpu - same as cpumask_assign_cpu(), but built on
 * __assign_bit() instead of assign_bit().
 */
static __always_inline
void __cpumask_assign_cpu(int cpu, struct cpumask *dstp, bool value)
{
        unsigned long *bits = cpumask_bits(dstp);

        __assign_bit(cpumask_check(cpu), bits, value);
}

/**
 * cpumask_test_cpu - check whether a cpu is set in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_bit() wrapper for cpumasks.
 *
 * Return: true if @cpu is set in @cpumask, else returns false
 */
static __always_inline
bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask)
{
        const unsigned long *bits = cpumask_bits(cpumask);

        return test_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_and_set_cpu - atomically set a cpu in a cpumask and report
 *                            its previous state
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_set_bit wrapper for cpumasks.
 *
 * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
 */
static __always_inline
bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned long *bits = cpumask_bits(cpumask);

        return test_and_set_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_test_and_clear_cpu - atomically clear a cpu in a cpumask and report
 *                              its previous state
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * test_and_clear_bit wrapper for cpumasks.
 *
 * Return: true if @cpu is set in old bitmap of @cpumask, else returns false
 */
static __always_inline
bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask)
{
        unsigned long *bits = cpumask_bits(cpumask);

        return test_and_clear_bit(cpumask_check(cpu), bits);
}

/**
 * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * When small_const_nbits(small_cpumask_bits) holds, only word 0 is written,
 * with the last-word mask for nr_cpumask_bits; otherwise bitmap_fill() sets
 * nr_cpumask_bits bits.
 */
static __always_inline void cpumask_setall(struct cpumask *dstp)
{
        if (!small_const_nbits(small_cpumask_bits)) {
                /* General case: let the bitmap library fill the mask. */
                bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits);
                return;
        }

        /* Small constant-size case: one masked store is enough. */
        cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits);
}

/**
 * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * Zeroes large_cpumask_bits bits of the mask.
 */
static __always_inline void cpumask_clear(struct cpumask *dstp)
{
        const unsigned int nbits = large_cpumask_bits;

        bitmap_zero(cpumask_bits(dstp), nbits);
}

/**
 * cpumask_and - store the intersection: *dstp = *src1p & *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: false if *@dstp is empty, else returns true
 */
static __always_inline
bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p,
                 const struct cpumask *src2p)
{
        unsigned long *out = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_and(out, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_or - store the union: *dstp = *src1p | *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 */
static __always_inline
void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p,
                const struct cpumask *src2p)
{
        unsigned long *out = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_or(out, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_xor - store the symmetric difference: *dstp = *src1p ^ *src2p
 * @dstp: the cpumask result
 * @src1p: the first input
 * @src2p: the second input
 */
static __always_inline
void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p,
                 const struct cpumask *src2p)
{
        unsigned long *out = cpumask_bits(dstp);
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        bitmap_xor(out, lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_andnot - store the difference: *dstp = *src1p & ~*src2p
 * @dstp: the cpumask result
 * @src1p: the input whose cpus are kept
 * @src2p: the input whose cpus are removed
 *
 * Return: false if *@dstp is empty, else returns true
 */
static __always_inline
bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p,
                    const struct cpumask *src2p)
{
        unsigned long *out = cpumask_bits(dstp);
        const unsigned long *keep = cpumask_bits(src1p);
        const unsigned long *drop = cpumask_bits(src2p);

        return bitmap_andnot(out, keep, drop, small_cpumask_bits);
}

/**
 * cpumask_equal - compare two cpumasks: *src1p == *src2p
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if the cpumasks are equal, false if not
 */
static __always_inline
bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_equal(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_or_equal - test whether (*src1p | *src2p) == *src3p
 * @src1p: the first input
 * @src2p: the second input
 * @src3p: the mask the union is compared against
 *
 * Return: true if first cpumask ORed with second cpumask == third cpumask,
 *           otherwise false
 */
static __always_inline
bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p,
                      const struct cpumask *src3p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);
        const unsigned long *expected = cpumask_bits(src3p);

        return bitmap_or_equal(lhs, rhs, expected, small_cpumask_bits);
}

/**
 * cpumask_intersects - test whether (*src1p & *src2p) != 0
 * @src1p: the first input
 * @src2p: the second input
 *
 * Return: true if first cpumask ANDed with second cpumask is non-empty,
 *           otherwise false
 */
static __always_inline
bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p)
{
        const unsigned long *lhs = cpumask_bits(src1p);
        const unsigned long *rhs = cpumask_bits(src2p);

        return bitmap_intersects(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_subset - test whether (*src1p & ~*src2p) == 0
 * @src1p: the candidate subset
 * @src2p: the candidate superset
 *
 * Return: true if *@src1p is a subset of *@src2p, else returns false
 */
static __always_inline
bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p)
{
        const unsigned long *sub = cpumask_bits(src1p);
        const unsigned long *super = cpumask_bits(src2p);

        return bitmap_subset(sub, super, small_cpumask_bits);
}

/**
 * cpumask_empty - test whether *srcp == 0
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are clear.
 *
 * Return: true if srcp is empty (has no bits set), else false
 */
static __always_inline bool cpumask_empty(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_empty(bits, small_cpumask_bits);
}

/**
 * cpumask_full - test whether *srcp == 0xFFFFFFFF...
 * @srcp: the cpumask to check that all cpus < nr_cpu_ids are set.
 *
 * The check covers nr_cpumask_bits bits.
 *
 * Return: true if srcp is full (has all bits set), else false
 */
static __always_inline bool cpumask_full(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_full(bits, nr_cpumask_bits);
}

/**
 * cpumask_weight - number of cpus set in *srcp
 * @srcp: the cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in *srcp
 */
static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
        const unsigned long *bits = cpumask_bits(srcp);

        return bitmap_weight(bits, small_cpumask_bits);
}

/**
 * cpumask_weight_and - number of cpus set in (*srcp1 & *srcp2)
 * @srcp1: the first cpumask to count bits (< nr_cpu_ids) in.
 * @srcp2: the second cpumask to count bits (< nr_cpu_ids) in.
 *
 * Return: count of bits set in both *srcp1 and *srcp2
 */
static __always_inline
unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2)
{
        const unsigned long *lhs = cpumask_bits(srcp1);
        const unsigned long *rhs = cpumask_bits(srcp2);

        return bitmap_weight_and(lhs, rhs, small_cpumask_bits);
}

/**
 * cpumask_weight_andnot - number of cpus set in (*srcp1 & ~*srcp2)
 * @srcp1: the cpumask whose set bits (< nr_cpu_ids) are counted.
 * @srcp2: the cpumask whose set bits (< nr_cpu_ids) are excluded from the count.
 *
 * Return: count of bits that are set in *srcp1 and clear in *srcp2
 */
static __always_inline
unsigned int cpumask_weight_andnot(const struct cpumask *srcp1,
                                   const struct cpumask *srcp2)
{
        const unsigned long *counted = cpumask_bits(srcp1);
        const unsigned long *excluded = cpumask_bits(srcp2);

        return bitmap_weight_andnot(counted, excluded, small_cpumask_bits);
}

/**
 * cpumask_shift_right - shift a cpumask right: *dstp = *srcp >> n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * The shift operates on small_cpumask_bits bits.
 */
static __always_inline
void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n)
{
        unsigned long *out = cpumask_bits(dstp);
        const unsigned long *in = cpumask_bits(srcp);

        bitmap_shift_right(out, in, n, small_cpumask_bits);
}

/**
 * cpumask_shift_left - shift a cpumask left: *dstp = *srcp << n
 * @dstp: the cpumask result
 * @srcp: the input to shift
 * @n: the number of bits to shift by
 *
 * The shift operates on nr_cpumask_bits bits.
 */
static __always_inline
void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n)
{
        unsigned long *out = cpumask_bits(dstp);
        const unsigned long *in = cpumask_bits(srcp);

        bitmap_shift_left(out, in, n, nr_cpumask_bits);
}

/**
 * cpumask_copy - copy a cpumask: *dstp = *srcp
 * @dstp: the result
 * @srcp: the input cpumask
 *
 * Copies large_cpumask_bits bits.
 */
static __always_inline
void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp)
{
        const unsigned int nbits = large_cpumask_bits;

        bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nbits);
}

/**
 * cpumask_any - pick an arbitrary cpu from *srcp
 * @srcp: the input cpumask
 *
 * Currently an alias for cpumask_first(), i.e. the cpu picked is the first
 * one set in the mask.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any(srcp) cpumask_first(srcp)

/**
 * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2
 * @mask1: the first input cpumask
 * @mask2: the second input cpumask
 *
 * Currently an alias for cpumask_first_and(), i.e. the cpu picked is the
 * first one set in both masks.
 *
 * Return: >= nr_cpu_ids if no cpus set.
 */
#define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2))

/**
 * cpumask_of - the cpumask containing just a given cpu
 * @cpu: the cpu (<= nr_cpu_ids)
 *
 * Thin alias for get_cpu_mask(); the result is a pointer to a constant mask.
 */
#define cpumask_of(cpu) (get_cpu_mask(cpu))

/**
 * cpumask_parse_user - parse a cpumask out of a user-space string
 * @buf: the (user) buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * bitmap_parse_user() wrapper operating on nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline
int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parselist_user - parse a cpumask out of a user-space list string
 * @buf: the (user) buffer to extract from
 * @len: the length of the buffer
 * @dstp: the cpumask to set.
 *
 * bitmap_parselist_user() wrapper operating on nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline
int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist_user(buf, len, bits, nr_cpumask_bits);
}

/**
 * cpumask_parse - parse a cpumask out of a string
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * bitmap_parse() wrapper operating on nr_cpumask_bits bits; UINT_MAX is
 * passed as the buffer length.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parse(buf, UINT_MAX, bits, nr_cpumask_bits);
}

/**
 * cpulist_parse - parse a cpumask out of a string of ranges
 * @buf: the buffer to extract from
 * @dstp: the cpumask to set.
 *
 * bitmap_parselist() wrapper operating on nr_cpumask_bits bits.
 *
 * Return: -errno, or 0 for success.
 */
static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp)
{
        unsigned long *bits = cpumask_bits(dstp);

        return bitmap_parselist(buf, bits, nr_cpumask_bits);
}

/**
 * cpumask_size - number of bytes needed to hold a 'struct cpumask'
 *
 * Computed as bitmap_size() of large_cpumask_bits.
 *
 * Return: size to allocate for a &struct cpumask in bytes
 */
static __always_inline unsigned int cpumask_size(void)
{
        const unsigned int bytes = bitmap_size(large_cpumask_bits);

        return bytes;
}

#ifdef CONFIG_CPUMASK_OFFSTACK

/* Off-stack configuration: the per-cpu variable is read with this_cpu_read(). */
#define this_cpu_cpumask_var_ptr(x)        this_cpu_read(x)
#define __cpumask_var_read_mostly        __read_mostly

bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node);

/* Like alloc_cpumask_var_node(), with __GFP_ZERO added to @flags. */
static __always_inline
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return alloc_cpumask_var_node(mask, zeroing, node);
}

/**
 * alloc_cpumask_var - allocate a struct cpumask
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>).
 *
 * Equivalent to alloc_cpumask_var_node() with NUMA_NO_NODE as the node.
 *
 * Return: %true if allocation succeeded, %false if not
 */
static __always_inline
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        const int any_node = NUMA_NO_NODE;

        return alloc_cpumask_var_node(mask, flags, any_node);
}

/* Like alloc_cpumask_var(), with __GFP_ZERO added to @flags. */
static __always_inline
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return alloc_cpumask_var(mask, zeroing);
}

void alloc_bootmem_cpumask_var(cpumask_var_t *mask);
void free_cpumask_var(cpumask_var_t mask);
void free_bootmem_cpumask_var(cpumask_var_t mask);

/* A mask is usable once it is a non-NULL pointer. */
static __always_inline bool cpumask_available(cpumask_var_t mask)
{
        if (mask == NULL)
                return false;

        return true;
}

#else

/* No off-stack masks: the per-cpu variable is accessed with this_cpu_ptr(). */
#define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x)
#define __cpumask_var_read_mostly

/* Nothing to allocate in this configuration: always reports success. */
static __always_inline
bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        return true;
}

/* Nothing to allocate in this configuration: always reports success. */
static __always_inline
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        return true;
}

/* Nothing to allocate; just clears the mask and reports success. */
static __always_inline
bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
{
        struct cpumask *storage = *mask;

        cpumask_clear(storage);
        return true;
}

/* Nothing to allocate; just clears the mask and reports success. */
static __always_inline
bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        struct cpumask *storage = *mask;

        cpumask_clear(storage);
        return true;
}

/* No-op in this configuration. */
static __always_inline
void alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
}

/* No-op in this configuration. */
static __always_inline
void free_cpumask_var(cpumask_var_t mask)
{
}

/* No-op in this configuration. */
static __always_inline
void free_bootmem_cpumask_var(cpumask_var_t mask)
{
}

/* The mask is always usable in this configuration. */
static __always_inline
bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

/*
 * Cleanup-helper definition named "free_cpumask_var" for struct cpumask
 * pointers: its cleanup expression calls free_cpumask_var() only when the
 * pointer (_T) is non-NULL.
 * NOTE(review): presumably consumed via __free(free_cpumask_var) - confirm
 * against the DEFINE_FREE() definition.
 */
DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T));

/*
 * It's common to want to use cpu_all_mask in struct member initializers,
 * so it has to refer to an address rather than a pointer: cpu_all_bits is
 * an NR_CPUS-bit bitmap object and cpu_all_mask is its address viewed as a
 * struct cpumask pointer.
 */
extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
#define cpu_all_mask to_cpumask(cpu_all_bits)

/*
 * Row 0 of cpu_bit_bitmap has no bits set, so it doubles as the constant
 * empty mask.
 */
#define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])

#if NR_CPUS == 1
/*
 * Uniprocessor: the possible/online/enabled/present masks are always "1",
 * so every iterator visits cpu 0 exactly once and leaves @cpu == 1.
 *
 * for_each_enabled_cpu() is provided here as well so that callers build
 * the same way on UP and SMP configurations.
 */
#define for_each_possible_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_enabled_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_present_cpu(cpu)        for ((cpu) = 0; (cpu) < 1; (cpu)++)

/* @start is evaluated (for side effects and warnings) but otherwise ignored. */
#define for_each_possible_cpu_wrap(cpu, start)        \
        for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++)
#define for_each_online_cpu_wrap(cpu, start)        \
        for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++)
#else
/* Multiprocessor: iterate over the corresponding global cpumask. */
#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
#define for_each_enabled_cpu(cpu)   for_each_cpu((cpu), cpu_enabled_mask)
#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)

/* Same, but starting at @start and wrapping around the mask. */
#define for_each_possible_cpu_wrap(cpu, start)        \
        for_each_cpu_wrap((cpu), cpu_possible_mask, (start))
#define for_each_online_cpu_wrap(cpu, start)        \
        for_each_cpu_wrap((cpu), cpu_online_mask, (start))
#endif

/*
 * Wrappers for arch boot code to manipulate normally-constant masks.
 * Only the prototypes are given here; each takes the source mask @src.
 * NOTE(review): presumably @src is copied into the corresponding global
 * mask - confirm against the implementation.
 */
void init_cpu_present(const struct cpumask *src);
void init_cpu_possible(const struct cpumask *src);

/*
 * assign_cpu - set (@val true) or clear (@val false) @cpu in @mask.
 * @cpu goes through cpumask_check() before being used as the bit number.
 */
#define assign_cpu(cpu, mask, val)        \
        assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val))

/* Per-mask setters: each one updates the matching __cpu_*_mask via assign_cpu(). */
#define set_cpu_possible(cpu, possible)        assign_cpu((cpu), &__cpu_possible_mask, (possible))
#define set_cpu_enabled(cpu, enabled)        assign_cpu((cpu), &__cpu_enabled_mask, (enabled))
#define set_cpu_present(cpu, present)        assign_cpu((cpu), &__cpu_present_mask, (present))
#define set_cpu_active(cpu, active)        assign_cpu((cpu), &__cpu_active_mask, (active))
#define set_cpu_dying(cpu, dying)        assign_cpu((cpu), &__cpu_dying_mask, (dying))

/* Unlike the setters above, the online setter is a real function, not an assign_cpu() macro. */
void set_cpu_online(unsigned int cpu, bool online);

/**
 * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask *
 * @bitmap: the bitmap
 *
 * There are a few places where cpumask_var_t isn't appropriate and
 * static cpumasks must be used (eg. very early boot), yet we don't
 * expose the definition of 'struct cpumask'.
 *
 * This does the conversion, and can be used as a constant initializer.
 *
 * The "1 ? ... : ..." always selects @bitmap; the other arm only exists so
 * that the compiler type-checks @bitmap against __check_is_bitmap()'s
 * parameter. Because the call sits inside sizeof(), it is never executed.
 */
#define to_cpumask(bitmap)                                                \
        ((struct cpumask *)(1 ? (bitmap)                                \
                            : (void *)sizeof(__check_is_bitmap(bitmap))))

/*
 * Type-check helper for to_cpumask(): accepts only a pointer to
 * (const) unsigned long. The return value itself carries no meaning.
 */
static __always_inline int __check_is_bitmap(const unsigned long *bitmap)
{
        return 1;
}

/*
 * Special-case data structure for "single bit set only" constant CPU masks.
 *
 * We pre-generate all the 64 (or 32) possible bit positions, with enough
 * padding to the left and the right, and return the constant pointer
 * appropriately offset.
 *
 * The table has BITS_PER_LONG+1 rows of BITS_TO_LONGS(NR_CPUS) words each;
 * get_cpu_mask() picks a row from the bit position within a word and then
 * steps back by whole words to place that bit at the requested cpu.
 */
extern const unsigned long
        cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)];

/*
 * get_cpu_mask - constant cpumask with only @cpu set.
 *
 * Row (1 + @cpu % BITS_PER_LONG) of cpu_bit_bitmap is selected by the bit
 * position inside a word; the pointer is then moved back by
 * @cpu / BITS_PER_LONG words so that the bit lands in the word that
 * belongs to @cpu.
 */
static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        const unsigned int word = cpu / BITS_PER_LONG;
        const unsigned int bit = cpu % BITS_PER_LONG;
        const unsigned long *row = cpu_bit_bitmap[1 + bit];

        row -= word;
        return to_cpumask(row);
}

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * Although __num_online_cpus is an atomic_t, the value read here is only a
 * momentary snapshot: it is not protected against concurrent CPU hotplug
 * operations unless the caller is inside a cpuhp_lock held region.
 *
 * Return: momentary snapshot of the number of online CPUs
 */
static __always_inline unsigned int num_online_cpus(void)
{
        return raw_atomic_read(&__num_online_cpus);
}

/* The remaining counters are computed as the weight of the matching mask. */
#define num_possible_cpus()        cpumask_weight(cpu_possible_mask)
#define num_enabled_cpus()        cpumask_weight(cpu_enabled_mask)
#define num_present_cpus()        cpumask_weight(cpu_present_mask)
#define num_active_cpus()        cpumask_weight(cpu_active_mask)

/* True if @cpu is set in cpu_online_mask. */
static __always_inline bool cpu_online(unsigned int cpu)
{
        const struct cpumask *online = cpu_online_mask;

        return cpumask_test_cpu(cpu, online);
}

/* True if @cpu is set in cpu_enabled_mask. */
static __always_inline bool cpu_enabled(unsigned int cpu)
{
        const struct cpumask *enabled = cpu_enabled_mask;

        return cpumask_test_cpu(cpu, enabled);
}

/* True if @cpu is set in cpu_possible_mask. */
static __always_inline bool cpu_possible(unsigned int cpu)
{
        const struct cpumask *possible = cpu_possible_mask;

        return cpumask_test_cpu(cpu, possible);
}

/* True if @cpu is set in cpu_present_mask. */
static __always_inline bool cpu_present(unsigned int cpu)
{
        const struct cpumask *present = cpu_present_mask;

        return cpumask_test_cpu(cpu, present);
}

/* True if @cpu is set in cpu_active_mask. */
static __always_inline bool cpu_active(unsigned int cpu)
{
        const struct cpumask *active = cpu_active_mask;

        return cpumask_test_cpu(cpu, active);
}

/* True if @cpu is set in cpu_dying_mask. */
static __always_inline bool cpu_dying(unsigned int cpu)
{
        const struct cpumask *dying = cpu_dying_mask;

        return cpumask_test_cpu(cpu, dying);
}

#else

/* Uniprocessor: every counter is the constant 1. */
#define num_online_cpus()        1U
#define num_possible_cpus()        1U
#define num_enabled_cpus()        1U
#define num_present_cpus()        1U
#define num_active_cpus()        1U

/* Uniprocessor: only cpu 0 is online. */
static __always_inline bool cpu_online(unsigned int cpu)
{
        return !cpu;
}

/* Uniprocessor: only cpu 0 is possible. */
static __always_inline bool cpu_possible(unsigned int cpu)
{
        return !cpu;
}

/* Uniprocessor: only cpu 0 is enabled. */
static __always_inline bool cpu_enabled(unsigned int cpu)
{
        return !cpu;
}

/* Uniprocessor: only cpu 0 is present. */
static __always_inline bool cpu_present(unsigned int cpu)
{
        return !cpu;
}

/* Uniprocessor: only cpu 0 is active. */
static __always_inline bool cpu_active(unsigned int cpu)
{
        return !cpu;
}

/* Uniprocessor: no cpu is ever reported as dying. */
static __always_inline bool cpu_dying(unsigned int cpu)
{
        return false;
}

#endif /* NR_CPUS > 1 */

/* Negation of cpu_online(), annotated with unlikely() as a branch hint. */
#define cpu_is_offline(cpu)        unlikely(!cpu_online(cpu))

/*
 * CPU_BITS_ALL - brace initializer for a bitmap with all NR_CPUS bits set.
 *
 * The last word is initialized with BITMAP_LAST_WORD_MASK(NR_CPUS); when
 * the bitmap spans several words, all preceding words are ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_BITS_ALL                                                \
{                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}

#else /* NR_CPUS > BITS_PER_LONG */

#define CPU_BITS_ALL                                                \
{                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
}
#endif /* NR_CPUS > BITS_PER_LONG */

/**
 * cpumap_print_to_pagebuf  - print a cpumask into a buffer, either as a
 *        comma-separated list of cpus or as hex values of the cpumask
 * @list: indicates whether the cpumap must be list
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 *
 * bitmap_print_to_pagebuf() wrapper covering nr_cpu_ids bits.
 *
 * Return: the length of the (null-terminated) @buf string, zero if
 * nothing is copied.
 */
static __always_inline ssize_t
cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_to_pagebuf(list, buf, bits, nr_cpu_ids);
}

/**
 * cpumap_print_bitmask_to_buf  - print part of a cpumask into a buffer as
 *        hex values of the cpumask
 *
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: in the string from which we are copying, we copy to @buf
 * @count: the maximum number of bytes to print
 *
 * The cpumask is formatted as hex values and copied into the buffer;
 * this is typically used by bin_attribute to export the cpumask bitmask
 * ABI. The value returned is the result of bitmap_print_bitmask_to_buf()
 * minus one.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static __always_inline
ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask,
                                    loff_t off, size_t count)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_bitmask_to_buf(buf, bits, nr_cpu_ids, off, count) - 1;
}

/**
 * cpumap_print_list_to_buf  - print part of a cpumask into a buffer as a
 *        comma-separated list of cpus
 * @buf: the buffer to copy into
 * @mask: the cpumask to copy
 * @off: in the string from which we are copying, we copy to @buf
 * @count: the maximum number of bytes to print
 *
 * Identical to cpumap_print_bitmask_to_buf() apart from the print format.
 * The value returned is the result of bitmap_print_list_to_buf() minus one.
 *
 * Return: the length of how many bytes have been copied, excluding
 * terminating '\0'.
 */
static __always_inline
ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask,
                                 loff_t off, size_t count)
{
        const unsigned long *bits = cpumask_bits(mask);

        return bitmap_print_list_to_buf(buf, bits, nr_cpu_ids, off, count) - 1;
}

/*
 * CPU_MASK_ALL - cpumask_t compound literal with all NR_CPUS bits set.
 *
 * Same word layout as CPU_BITS_ALL: the last word gets
 * BITMAP_LAST_WORD_MASK(NR_CPUS) and, for multi-word masks, every
 * preceding word is ~0UL.
 */
#if NR_CPUS <= BITS_PER_LONG
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#else
#define CPU_MASK_ALL                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL,                        \
        [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS)        \
} }
#endif /* NR_CPUS > BITS_PER_LONG */

#define CPU_MASK_NONE                                                        \
(cpumask_t) { {                                                                \
        [0 ... BITS_TO_LONGS(NR_CPUS)-1] =  0UL                                \
} }

/*
 * CPU_MASK_CPU0: cpumask_t compound literal whose first word is 1UL (only
 * bit 0 set); the remaining words are implicitly zero-initialized.
 */
#define CPU_MASK_CPU0                                                        \
(cpumask_t) { {                                                                \
        [0] =  1UL                                                        \
} }

/*
 * Provide a valid theoretical maximum size for the cpumap and cpulist sysfs
 * files, to avoid breaking userspace which may allocate a buffer based on
 * the size reported by e.g. fstat.
 *
 * For cpumap, NR_CPUS * 9/32 - 1 should be an exact length.
 *
 * For cpulist, 7 is (ceil(log10(NR_CPUS)) + 1), allowing for NR_CPUS to be
 * up to 2 orders of magnitude larger than 8192. We then divide by 2 to
 * cover the worst case of every other cpu being on one of two nodes for a
 * very large NR_CPUS.
 *
 * PAGE_SIZE is used as a minimum for smaller configurations. The comparison
 * is made before subtracting 1, which avoids an unsigned comparison to -1.
 */
#define CPUMAP_FILE_MAX_BYTES  (((NR_CPUS * 9)/32 > PAGE_SIZE) \
                                        ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE)
#define CPULIST_FILE_MAX_BYTES  (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE)

#endif /* __LINUX_CPUMASK_H */























































































































































































































































































    2 







    2 


































































































































   24 




























































































































    3 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Operations on the network namespace
 */
#ifndef __NET_NET_NAMESPACE_H
#define __NET_NET_NAMESPACE_H

#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <linux/sysctl.h>
#include <linux/uidgid.h>

#include <net/flow.h>
#include <net/netns/core.h>
#include <net/netns/mib.h>
#include <net/netns/unix.h>
#include <net/netns/packet.h>
#include <net/netns/ipv4.h>
#include <net/netns/ipv6.h>
#include <net/netns/nexthop.h>
#include <net/netns/ieee802154_6lowpan.h>
#include <net/netns/sctp.h>
#include <net/netns/netfilter.h>
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
#include <net/netns/conntrack.h>
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
#include <net/netns/flow_table.h>
#endif
#include <net/netns/nftables.h>
#include <net/netns/xfrm.h>
#include <net/netns/mpls.h>
#include <net/netns/can.h>
#include <net/netns/xdp.h>
#include <net/netns/smc.h>
#include <net/netns/bpf.h>
#include <net/netns/mctp.h>
#include <net/net_trackers.h>
#include <linux/ns_common.h>
#include <linux/idr.h>
#include <linux/skbuff.h>
#include <linux/notifier.h>
#include <linux/xarray.h>

struct user_namespace;
struct proc_dir_entry;
struct net_device;
struct sock;
struct ctl_table_header;
struct net_generic;
struct uevent_sock;
struct netns_ipvs;
struct bpf_prog;


/*
 * NETDEV_HASHENTRIES is 2^NETDEV_HASHBITS (256).
 * NOTE(review): presumably the bucket count of the per-net device hash
 * tables (dev_name_head / dev_index_head) - confirm at the users.
 */
#define NETDEV_HASHBITS    8
#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)

/*
 * struct net - state kept for one network namespace.
 *
 * Groups the namespace's own bookkeeping (reference counts, list linkage,
 * device lookup tables, control sockets) with one embedded netns_* member
 * per protocol or subsystem. Many members exist only when the matching
 * CONFIG_* option is enabled, and the layout is subject to
 * __randomize_layout, so no code may rely on member order.
 */
struct net {
        /* First cache line can be often dirtied.
         * Do not place here read-mostly fields.
         */
        refcount_t                passive;        /* To decide when the network
                                                 * namespace should be freed.
                                                 */
        spinlock_t                rules_mod_lock;

        unsigned int                dev_base_seq;        /* protected by rtnl_mutex */
        u32                        ifindex;

        spinlock_t                nsid_lock;
        atomic_t                fnhe_genid;

        struct list_head        list;                /* list of network namespaces */
        struct list_head        exit_list;        /* Linked in to call pernet exit
                                                 * methods on a dead net (
                                                 * pernet_ops_rwsem read locked),
                                                 * or to unregister pernet ops
                                                 * (pernet_ops_rwsem write locked).
                                                 */
        struct llist_node        defer_free_list;
        struct llist_node        cleanup_list;        /* namespaces on death row */

        struct list_head ptype_all;
        struct list_head ptype_specific;

#ifdef CONFIG_KEYS
        struct key_tag                *key_domain;        /* Key domain of operation tag */
#endif
        struct user_namespace   *user_ns;        /* Owning user namespace */
        struct ucounts                *ucounts;
        struct idr                netns_ids;

        struct ns_common        ns;
        struct ref_tracker_dir  refcnt_tracker;
        struct ref_tracker_dir  notrefcnt_tracker; /* tracker for objects not
                                                    * refcounted against netns
                                                    */
        struct list_head         dev_base_head;
        struct proc_dir_entry         *proc_net;
        struct proc_dir_entry         *proc_net_stat;

#ifdef CONFIG_SYSCTL
        struct ctl_table_set        sysctls;
#endif

        struct sock                 *rtnl;                        /* rtnetlink socket */
        struct sock                *genl_sock;

        struct uevent_sock        *uevent_sock;                /* uevent socket */

        struct hlist_head         *dev_name_head;
        struct hlist_head        *dev_index_head;
        struct xarray                dev_by_index;
        struct raw_notifier_head        netdev_chain;

        /* Note that @hash_mix can be read millions times per second,
         * it is critical that it is on a read_mostly cache line.
         */
        u32                        hash_mix;

        struct net_device       *loopback_dev;          /* The loopback */

        /* core fib_rules */
        struct list_head        rules_ops;

        struct netns_core        core;
        struct netns_mib        mib;
        struct netns_packet        packet;
#if IS_ENABLED(CONFIG_UNIX)
        struct netns_unix        unx;
#endif
        struct netns_nexthop        nexthop;
        struct netns_ipv4        ipv4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netns_ipv6        ipv6;
#endif
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
        struct netns_ieee802154_lowpan        ieee802154_lowpan;
#endif
#if defined(CONFIG_IP_SCTP) || defined(CONFIG_IP_SCTP_MODULE)
        struct netns_sctp        sctp;
#endif
#ifdef CONFIG_NETFILTER
        struct netns_nf                nf;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct netns_ct                ct;
#endif
#if defined(CONFIG_NF_TABLES) || defined(CONFIG_NF_TABLES_MODULE)
        struct netns_nftables        nft;
#endif
#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
        struct netns_ft ft;
#endif
#endif
#ifdef CONFIG_WEXT_CORE
        struct sk_buff_head        wext_nlevents;
#endif
        struct net_generic __rcu        *gen;

        /* Used to store attached BPF programs */
        struct netns_bpf        bpf;

        /* Note : following structs are cache line aligned */
#ifdef CONFIG_XFRM
        struct netns_xfrm        xfrm;
#endif

        u64                        net_cookie; /* written once */

#if IS_ENABLED(CONFIG_IP_VS)
        struct netns_ipvs        *ipvs;
#endif
#if IS_ENABLED(CONFIG_MPLS)
        struct netns_mpls        mpls;
#endif
#if IS_ENABLED(CONFIG_CAN)
        struct netns_can        can;
#endif
#ifdef CONFIG_XDP_SOCKETS
        struct netns_xdp        xdp;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct netns_mctp        mctp;
#endif
#if IS_ENABLED(CONFIG_CRYPTO_USER)
        struct sock                *crypto_nlsk;
#endif
        struct sock                *diag_nlsk;
#if IS_ENABLED(CONFIG_SMC)
        struct netns_smc        smc;
#endif
#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
        /* Move to a better place when the config guard is removed. */
        struct mutex                rtnl_mutex;
#endif
} __randomize_layout;

#include <linux/seq_file_net.h>

/* Init's network namespace */
extern struct net init_net;

#ifdef CONFIG_NET_NS
/*
 * Out-of-line namespace helpers, declared only when CONFIG_NET_NS is
 * enabled. Inline fallbacks with the same signatures are provided in the
 * #else branch.
 */
struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns,
                        struct net *old_net);

void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);

void net_ns_barrier(void);

struct ns_common *get_net_ns(struct ns_common *ns);
struct net *get_net_ns_by_fd(int fd);
extern struct task_struct *cleanup_net_task;

#else /* CONFIG_NET_NS */
#include <linux/sched.h>
#include <linux/nsproxy.h>
/*
 * !CONFIG_NET_NS fallback: a request carrying CLONE_NEWNET is rejected with
 * ERR_PTR(-EINVAL); any other request simply gets @old_net back.
 */
static inline struct net *copy_net_ns(unsigned long flags,
        struct user_namespace *user_ns, struct net *old_net)
{
        return (flags & CLONE_NEWNET) ? ERR_PTR(-EINVAL) : old_net;
}

/*
 * !CONFIG_NET_NS fallback: always reports GLOBAL_ROOT_UID / GLOBAL_ROOT_GID
 * through @uid and @gid; @net is not consulted.
 */
static inline void net_ns_get_ownership(const struct net *net,
                                        kuid_t *uid, kgid_t *gid)
{
        *uid = GLOBAL_ROOT_UID;
        *gid = GLOBAL_ROOT_GID;
}

/* !CONFIG_NET_NS fallback: nothing to do. */
static inline void net_ns_barrier(void) {}

/* !CONFIG_NET_NS fallback: always fails with ERR_PTR(-EINVAL). */
static inline struct ns_common *get_net_ns(struct ns_common *ns)
{
        return ERR_PTR(-EINVAL);
}

/* !CONFIG_NET_NS fallback: always fails with ERR_PTR(-EINVAL). */
static inline struct net *get_net_ns_by_fd(int fd)
{
        return ERR_PTR(-EINVAL);
}
#endif /* CONFIG_NET_NS */


/* List head walked by the for_each_net*() iterators (via struct net::list). */
extern struct list_head net_namespace_list;

/* Declared regardless of CONFIG_NET_NS. */
struct net *get_net_ns_by_pid(pid_t pid);

/*
 * IPX sysctl registration hooks: real functions when CONFIG_SYSCTL is
 * enabled, otherwise macros that expand to nothing.
 */
#ifdef CONFIG_SYSCTL
void ipx_register_sysctl(void);
void ipx_unregister_sysctl(void);
#else
#define ipx_register_sysctl()
#define ipx_unregister_sysctl()
#endif

#ifdef CONFIG_NET_NS
/* Invoked by put_net() when refcount_dec_and_test() on ns.count succeeds. */
void __put_net(struct net *net);

/*
 * Increment net->ns.count and return @net so the call can be chained.
 *
 * Try using get_net_track() instead
 */
static inline struct net *get_net(struct net *net)
{
        refcount_inc(&net->ns.count);
        return net;
}

/*
 * For callers that know the struct net still exists but cannot be sure a
 * reference is already held on it. The count is bumped only when it is
 * non-zero; if it is zero the attempt fails and NULL is returned.
 */
static inline struct net *maybe_get_net(struct net *net)
{
        if (refcount_inc_not_zero(&net->ns.count))
                return net;

        return NULL;
}

/*
 * Decrement net->ns.count; __put_net() runs only when
 * refcount_dec_and_test() returns true.
 *
 * Try using put_net_track() instead
 */
static inline void put_net(struct net *net)
{
        if (!refcount_dec_and_test(&net->ns.count))
                return;

        __put_net(net);
}

/* Returns 1 when both pointers refer to the same struct net, else 0. */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        if (net1 != net2)
                return 0;

        return 1;
}

/* Returns 1 while net->ns.count reads as non-zero, 0 otherwise. */
static inline int check_net(const struct net *net)
{
        return !!refcount_read(&net->ns.count);
}

/* Out-of-line with CONFIG_NET_NS; the #else branch provides fallbacks. */
void net_drop_ns(void *);
void net_passive_dec(struct net *net);

#else

/* !CONFIG_NET_NS fallback: no reference counting, just returns @net. */
static inline struct net *get_net(struct net *net)
{
        return net;
}

/* !CONFIG_NET_NS fallback: nothing to release. */
static inline void put_net(struct net *net)
{
}

/* !CONFIG_NET_NS fallback: never fails, just returns @net. */
static inline struct net *maybe_get_net(struct net *net)
{
        return net;
}

/* !CONFIG_NET_NS fallback: always reports equality (returns 1). */
static inline
int net_eq(const struct net *net1, const struct net *net2)
{
        return 1;
}

/* !CONFIG_NET_NS fallback: always returns 1. */
static inline int check_net(const struct net *net)
{
        return 1;
}

/* !CONFIG_NET_NS fallback: there is no function, the name expands to NULL. */
#define net_drop_ns NULL

/* !CONFIG_NET_NS fallback: plain decrement of net->passive. */
static inline void net_passive_dec(struct net *net)
{
        refcount_dec(&net->passive);
}
#endif

/* Increment net->passive (defined for both CONFIG_NET_NS settings). */
static inline void net_passive_inc(struct net *net)
{
        refcount_inc(&net->passive);
}

/* Returns true if the netns initialization is completed successfully */
static inline bool net_initialized(const struct net *net)
{
        return READ_ONCE(net->list.next);
}

/*
 * Register @tracker with one of the two per-net tracker directories:
 * refcnt_tracker when @refcounted is true, notrefcnt_tracker otherwise.
 * Compiles to nothing without CONFIG_NET_NS_REFCNT_TRACKER.
 */
static inline void __netns_tracker_alloc(struct net *net,
                                         netns_tracker *tracker,
                                         bool refcounted,
                                         gfp_t gfp)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        struct ref_tracker_dir *dir;

        if (refcounted)
                dir = &net->refcnt_tracker;
        else
                dir = &net->notrefcnt_tracker;

        ref_tracker_alloc(dir, tracker, gfp);
#endif
}

/* Shorthand for __netns_tracker_alloc() with refcounted == true. */
static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker,
                                       gfp_t gfp)
{
        __netns_tracker_alloc(net, tracker, true, gfp);
}

/*
 * Release @tracker from the per-net tracker directory selected by
 * @refcounted (refcnt_tracker when true, notrefcnt_tracker otherwise).
 * Compiles to nothing without CONFIG_NET_NS_REFCNT_TRACKER.
 */
static inline void __netns_tracker_free(struct net *net,
                                        netns_tracker *tracker,
                                        bool refcounted)
{
#ifdef CONFIG_NET_NS_REFCNT_TRACKER
        struct ref_tracker_dir *dir;

        if (refcounted)
                dir = &net->refcnt_tracker;
        else
                dir = &net->notrefcnt_tracker;

        ref_tracker_free(dir, tracker);
#endif
}

/*
 * Tracked counterpart of get_net(): take the reference first, then record
 * it in @tracker. Returns @net.
 */
static inline struct net *get_net_track(struct net *net,
                                        netns_tracker *tracker, gfp_t gfp)
{
        struct net *held = get_net(net);

        netns_tracker_alloc(held, tracker, gfp);
        return held;
}

/*
 * Tracked counterpart of put_net(): the tracker entry is released before
 * the reference itself is dropped.
 */
static inline void put_net_track(struct net *net, netns_tracker *tracker)
{
        __netns_tracker_free(net, tracker, true);
        put_net(net);
}

/*
 * Holder for a struct net pointer that only has a member when CONFIG_NET_NS
 * is enabled; without it the struct is empty. Access it through
 * write_pnet(), read_pnet() and read_pnet_rcu().
 */
typedef struct {
#ifdef CONFIG_NET_NS
        struct net __rcu *net;
#endif
} possible_net_t;

/*
 * Store @net in @pnet with rcu_assign_pointer(); does nothing without
 * CONFIG_NET_NS.
 */
static inline void write_pnet(possible_net_t *pnet, struct net *net)
{
#ifdef CONFIG_NET_NS
        rcu_assign_pointer(pnet->net, net);
#endif
}

/*
 * Fetch the struct net held in @pnet. Without CONFIG_NET_NS there is
 * nothing stored and &init_net is returned; otherwise the pointer is read
 * with rcu_dereference_protected() using an always-true condition.
 */
static inline struct net *read_pnet(const possible_net_t *pnet)
{
#ifndef CONFIG_NET_NS
        return &init_net;
#else
        return rcu_dereference_protected(pnet->net, true);
#endif
}

/*
 * Variant of read_pnet() that reads the pointer with rcu_dereference().
 * Without CONFIG_NET_NS it returns &init_net.
 */
static inline struct net *read_pnet_rcu(const possible_net_t *pnet)
{
#ifndef CONFIG_NET_NS
        return &init_net;
#else
        return rcu_dereference(pnet->net);
#endif
}

/*
 * Iterators over net_namespace_list, linked through struct net::list.
 * VAR is the struct net * cursor.
 *
 * Protected by net_rwsem
 */
#define for_each_net(VAR)                                \
        list_for_each_entry(VAR, &net_namespace_list, list)
#define for_each_net_continue_reverse(VAR)                \
        list_for_each_entry_continue_reverse(VAR, &net_namespace_list, list)
#define for_each_net_rcu(VAR)                                \
        list_for_each_entry_rcu(VAR, &net_namespace_list, list)

/*
 * Section annotations for per-net init/exit code and data. With
 * CONFIG_NET_NS they expand to nothing; without it they map onto __init,
 * __ref, __initdata and __initconst respectively.
 */
#ifdef CONFIG_NET_NS
#define __net_init
#define __net_exit
#define __net_initdata
#define __net_initconst
#else
#define __net_init        __init
#define __net_exit        __ref
#define __net_initdata        __initdata
#define __net_initconst        __initconst
#endif

/*
 * Peer-namespace id helpers, implemented out of line.
 * NOTE(review): judging by the names these map between a peer struct net
 * and an integer id as seen from @net - confirm against the definitions.
 */
int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp);
int peernet2id(const struct net *net, struct net *peer);
bool peernet_has_id(const struct net *net, struct net *peer);
struct net *get_net_ns_by_id(const struct net *net, int id);

/*
 * struct pernet_operations - callbacks invoked per network namespace.
 *
 * Instances are handed to register_pernet_subsys() or
 * register_pernet_device(). See the comment inside the struct for the
 * locking rules the methods must follow.
 */
struct pernet_operations {
        struct list_head list;
        /*
         * Below methods are called without any exclusive locks.
         * More than one net may be constructed and destructed
         * in parallel on several cpus. Every pernet_operations
         * have to keep in mind all other pernet_operations and
         * to introduce a locking, if they share common resources.
         *
         * The only time they are called with exclusive lock is
         * from register_pernet_subsys(), unregister_pernet_subsys()
         * register_pernet_device() and unregister_pernet_device().
         *
         * Exit methods using blocking RCU primitives, such as
         * synchronize_rcu(), should be implemented via exit_batch.
         * Then, destruction of a group of net requires single
         * synchronize_rcu() related to these pernet_operations,
         * instead of separate synchronize_rcu() for every net.
         * Please, avoid synchronize_rcu() at all, where it's possible.
         *
         * Note that a combination of pre_exit() and exit() can
         * be used, since a synchronize_rcu() is guaranteed between
         * the calls.
         */
        int (*init)(struct net *net);
        void (*pre_exit)(struct net *net);
        void (*exit)(struct net *net);
        /* Batched exit: receives a list of nets instead of a single one. */
        void (*exit_batch)(struct list_head *net_exit_list);
        /* Following method is called with RTNL held. */
        void (*exit_batch_rtnl)(struct list_head *net_exit_list,
                                struct list_head *dev_kill_list);
        unsigned int * const id;
        const size_t size;
};

/*
 * Use these carefully.  If you implement a network device and it
 * needs per network namespace operations use device pernet operations,
 * otherwise use pernet subsys operations.
 *
 * Network interfaces need to be removed from a dying netns _before_
 * subsys notifiers can be called, as most of the network code cleanup
 * (which is done from subsys notifiers) runs with the assumption that
 * dev_remove_pack has been called so no new packets will arrive during
 * and after the cleanup functions have been called.  dev_remove_pack
 * is not per namespace so instead the guarantee of no more packets
 * arriving in a network namespace is provided by ensuring that all
 * network devices and all sockets have left the network namespace
 * before the cleanup methods are called.
 *
 * For the longest time the ipv4 icmp code was registered as a pernet
 * device, which caused kernel oopses and panics during network
 * namespace cleanup.  So please don't get this wrong.
 */
int register_pernet_subsys(struct pernet_operations *);
void unregister_pernet_subsys(struct pernet_operations *);
int register_pernet_device(struct pernet_operations *);
void unregister_pernet_device(struct pernet_operations *);

struct ctl_table;

/*
 * Convenience wrapper around register_net_sysctl_sz() that passes
 * ARRAY_SIZE(table) as the table size, so @table must be a real array,
 * not a pointer.
 */
#define register_net_sysctl(net, path, table)        \
        register_net_sysctl_sz(net, path, table, ARRAY_SIZE(table))
/*
 * Per-net sysctl registration. With CONFIG_SYSCTL these are out-of-line
 * functions. Without it, net_sysctl_init() returns 0,
 * register_net_sysctl_sz() returns NULL and
 * unregister_net_sysctl_table() does nothing.
 */
#ifdef CONFIG_SYSCTL
int net_sysctl_init(void);
struct ctl_table_header *register_net_sysctl_sz(struct net *net, const char *path,
                                             struct ctl_table *table, size_t table_size);
void unregister_net_sysctl_table(struct ctl_table_header *header);
#else
static inline int net_sysctl_init(void) { return 0; }
static inline struct ctl_table_header *register_net_sysctl_sz(struct net *net,
        const char *path, struct ctl_table *table, size_t table_size)
{
        return NULL;
}
static inline void unregister_net_sysctl_table(struct ctl_table_header *header)
{
}
#endif

/* Atomically read the current value of net->ipv4.rt_genid. */
static inline int rt_genid_ipv4(const struct net *net)
{
        return atomic_read(&net->ipv4.rt_genid);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Atomically read net->ipv6.fib6_sernum. Only defined when CONFIG_IPV6 is
 * enabled.
 */
static inline int rt_genid_ipv6(const struct net *net)
{
        return atomic_read(&net->ipv6.fib6_sernum);
}
#endif

/* Atomically increment net->ipv4.rt_genid. */
static inline void rt_genid_bump_ipv4(struct net *net)
{
        atomic_inc(&net->ipv4.rt_genid);
}

/* Hook that may be left NULL; rt_genid_bump_ipv6() checks before calling. */
extern void (*__fib6_flush_trees)(struct net *net);

/* Call the __fib6_flush_trees hook for @net when one is installed. */
static inline void rt_genid_bump_ipv6(struct net *net)
{
        if (!__fib6_flush_trees)
                return;

        __fib6_flush_trees(net);
}

#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
/* Accessor returning the address of the embedded net->ieee802154_lowpan. */
static inline struct netns_ieee802154_lowpan *
net_ieee802154_lowpan(struct net *net)
{
        return &net->ieee802154_lowpan;
}
#endif

/*
 * For callers who don't really care about whether it's IPv4 or IPv6:
 * bumps the IPv4 generation id, then runs the IPv6 bump.
 */
static inline void rt_genid_bump_all(struct net *net)
{
        rt_genid_bump_ipv4(net);
        rt_genid_bump_ipv6(net);
}

/* Atomically read the current value of net->fnhe_genid. */
static inline int fnhe_genid(const struct net *net)
{
        return atomic_read(&net->fnhe_genid);
}

/* Atomically increment net->fnhe_genid. */
static inline void fnhe_genid_bump(struct net *net)
{
        atomic_inc(&net->fnhe_genid);
}

/* Out-of-line with CONFIG_NET; an empty inline stub otherwise. */
#ifdef CONFIG_NET
void net_ns_init(void);
#else
static inline void net_ns_init(void) {}
#endif

#endif /* __NET_NET_NAMESPACE_H */

































  248 


















   34 





















































































    3 






















































   72 




























  255 






























































































  255 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timer

#if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMER_H

#include <linux/tracepoint.h>
#include <linux/hrtimer.h>
#include <linux/timer.h>

/*
 * Event class for timer events that record nothing but the timer pointer.
 * It is instantiated by timer_init, timer_expire_exit and timer_cancel.
 * The pointer is stored as-is and never dereferenced here.
 */
DECLARE_EVENT_CLASS(timer_class,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer),

        TP_STRUCT__entry(
                __field( void *,        timer        )
        ),

        TP_fast_assign(
                __entry->timer        = timer;
        ),

        TP_printk("timer=%p", __entry->timer)
);

/**
 * timer_init - called when the timer is initialized
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the pointer value is recorded.
 */
DEFINE_EVENT(timer_class, timer_init,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/*
 * Format timer flag bits through __print_flags() with "|" as delimiter,
 * using one-letter names: M = TIMER_MIGRATING, D = TIMER_DEFERRABLE,
 * P = TIMER_PINNED, I = TIMER_IRQSAFE.
 */
#define decode_timer_flags(flags)                        \
        __print_flags(flags, "|",                        \
                {  TIMER_MIGRATING,        "M" },                \
                {  TIMER_DEFERRABLE,        "D" },                \
                {  TIMER_PINNED,        "P" },                \
                {  TIMER_IRQSAFE,        "I" })

/**
 * timer_start - called when the timer is started
 * @timer:                pointer to struct timer_list
 * @bucket_expiry:        the bucket expiry time
 *
 * Besides the arguments, the event snapshots timer->function,
 * timer->expires, timer->flags and the current jiffies value. In the
 * printed output "timeout" is expires minus that jiffies snapshot, "cpu" is
 * flags & TIMER_CPUMASK, "idx" is flags >> TIMER_ARRAYSHIFT, and "flags"
 * decodes the bits selected by TIMER_TRACE_FLAGMASK.
 */
TRACE_EVENT(timer_start,

        TP_PROTO(struct timer_list *timer,
                unsigned long bucket_expiry),

        TP_ARGS(timer, bucket_expiry),

        TP_STRUCT__entry(
                __field( void *,        timer                )
                __field( void *,        function        )
                __field( unsigned long,        expires                )
                __field( unsigned long,        bucket_expiry        )
                __field( unsigned long,        now                )
                __field( unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->function        = timer->function;
                __entry->expires        = timer->expires;
                __entry->bucket_expiry        = bucket_expiry;
                __entry->now                = jiffies;
                __entry->flags                = timer->flags;
        ),

        TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
                  __entry->timer, __entry->function, __entry->expires,
                  (long)__entry->expires - __entry->now,
                  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
                  __entry->flags >> TIMER_ARRAYSHIFT,
                  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);

/**
 * timer_expire_entry - called immediately before the timer callback
 * @timer:        pointer to struct timer_list
 * @baseclk:        value of timer_base::clk when timer expires
 *
 * Allows to determine the timer latency.
 *
 * The event also records timer->function and the jiffies value at the time
 * of the trace (printed as "now").
 */
TRACE_EVENT(timer_expire_entry,

        TP_PROTO(struct timer_list *timer, unsigned long baseclk),

        TP_ARGS(timer, baseclk),

        TP_STRUCT__entry(
                __field( void *,        timer        )
                __field( unsigned long,        now        )
                __field( void *,        function)
                __field( unsigned long,        baseclk        )
        ),

        TP_fast_assign(
                __entry->timer                = timer;
                __entry->now                = jiffies;
                __entry->function        = timer->function;
                __entry->baseclk        = baseclk;
        ),

        TP_printk("timer=%p function=%ps now=%lu baseclk=%lu",
                  __entry->timer, __entry->function, __entry->now,
                  __entry->baseclk)
);

/**
 * timer_expire_exit - called immediately after the timer callback returns
 * @timer:        pointer to struct timer_list
 *
 * When used in combination with the timer_expire_entry tracepoint we can
 * determine the runtime of the timer callback function.
 *
 * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might
 * be invalid. We solely track the pointer, which is why this event is an
 * instance of timer_class.
 */
DEFINE_EVENT(timer_class, timer_expire_exit,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_cancel - called when the timer is canceled
 * @timer:        pointer to struct timer_list
 *
 * Instance of timer_class: only the pointer value is recorded.
 */
DEFINE_EVENT(timer_class, timer_cancel,

        TP_PROTO(struct timer_list *timer),

        TP_ARGS(timer)
);

/**
 * timer_base_idle - records an idle flag together with a cpu number
 * @is_idle:        idle flag to record
 * @cpu:        cpu number to record
 *
 * NOTE(review): presumably emitted when a timer base changes its idle
 * state - confirm at the call sites.
 */
TRACE_EVENT(timer_base_idle,

        TP_PROTO(bool is_idle, unsigned int cpu),

        TP_ARGS(is_idle, cpu),

        TP_STRUCT__entry(
                __field( bool,                is_idle        )
                __field( unsigned int,        cpu        )
        ),

        TP_fast_assign(
                __entry->is_idle        = is_idle;
                __entry->cpu                = cpu;
        ),

        /* cpu is an unsigned int, so print it with %u as timer_start does */
        TP_printk("is_idle=%d cpu=%u",
                  __entry->is_idle, __entry->cpu)
);

/*
 * Map a clock id to its symbolic name through __print_symbolic(). Only
 * CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI are listed.
 */
#define decode_clockid(type)                                                \
        __print_symbolic(type,                                                \
                { CLOCK_REALTIME,        "CLOCK_REALTIME"        },        \
                { CLOCK_MONOTONIC,        "CLOCK_MONOTONIC"        },        \
                { CLOCK_BOOTTIME,        "CLOCK_BOOTTIME"        },        \
                { CLOCK_TAI,                "CLOCK_TAI"                })

/*
 * decode_hrtimer_mode - translate an hrtimer mode value into a short
 * symbolic string for TP_printk() output via __print_symbolic(). Each
 * HRTIMER_MODE_* combination listed below maps to a "|"-separated name
 * built from ABS/REL, PINNED, SOFT and HARD.
 */
#define decode_hrtimer_mode(mode)                                        \
        __print_symbolic(mode,                                                \
                { HRTIMER_MODE_ABS,                "ABS"                },        \
                { HRTIMER_MODE_REL,                "REL"                },        \
                { HRTIMER_MODE_ABS_PINNED,        "ABS|PINNED"        },        \
                { HRTIMER_MODE_REL_PINNED,        "REL|PINNED"        },        \
                { HRTIMER_MODE_ABS_SOFT,        "ABS|SOFT"        },        \
                { HRTIMER_MODE_REL_SOFT,        "REL|SOFT"        },        \
                { HRTIMER_MODE_ABS_PINNED_SOFT,        "ABS|PINNED|SOFT" },        \
                { HRTIMER_MODE_REL_PINNED_SOFT,        "REL|PINNED|SOFT" },        \
                { HRTIMER_MODE_ABS_HARD,        "ABS|HARD" },                \
                { HRTIMER_MODE_REL_HARD,        "REL|HARD" },                \
                { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" },        \
                { HRTIMER_MODE_REL_PINNED_HARD,        "REL|PINNED|HARD" })

/**
 * hrtimer_setup - called when the hrtimer is initialized
 * @hrtimer:        pointer to struct hrtimer
 * @clockid:        the hrtimers clock
 * @mode:        the hrtimers mode
 *
 * The hrtimer pointer is stored as an opaque value and printed with %p.
 * The clock id and mode are stored as passed in and printed symbolically
 * through decode_clockid() and decode_hrtimer_mode().
 */
TRACE_EVENT(hrtimer_setup,

        TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid,
                 enum hrtimer_mode mode),

        TP_ARGS(hrtimer, clockid, mode),

        TP_STRUCT__entry(
                __field( void *,                hrtimer                )
                __field( clockid_t,                clockid                )
                __field( enum hrtimer_mode,        mode                )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->clockid        = clockid;
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer,
                  decode_clockid(__entry->clockid),
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_start - called when the hrtimer is started
 * @hrtimer:        pointer to struct hrtimer
 * @mode:        the hrtimers mode
 *
 * Besides the pointer and the mode, the event snapshots the timer's
 * callback (read with ACCESS_PRIVATE()) and the values returned by
 * hrtimer_get_expires() and hrtimer_get_softexpires(). Both expiry values
 * are stored as s64 and cast to unsigned long long for printing.
 */
TRACE_EVENT(hrtimer_start,

        TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode),

        TP_ARGS(hrtimer, mode),

        TP_STRUCT__entry(
                __field( void *,        hrtimer                )
                __field( void *,        function        )
                __field( s64,                expires                )
                __field( s64,                softexpires        )
                __field( enum hrtimer_mode,        mode        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->function        = ACCESS_PRIVATE(hrtimer, function);
                __entry->expires        = hrtimer_get_expires(hrtimer);
                __entry->softexpires        = hrtimer_get_softexpires(hrtimer);
                __entry->mode                = mode;
        ),

        TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu "
                  "mode=%s", __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->expires,
                  (unsigned long long) __entry->softexpires,
                  decode_hrtimer_mode(__entry->mode))
);

/**
 * hrtimer_expire_entry - called immediately before the hrtimer callback
 * @hrtimer:        pointer to struct hrtimer
 * @now:        pointer to variable which contains current time of the
 *                timers base.
 *
 * Allows to determine the timer latency.
 *
 * @now is dereferenced when the event is recorded, so it must point to a
 * valid ktime_t. The callback is read with ACCESS_PRIVATE() and printed
 * with %ps.
 */
TRACE_EVENT(hrtimer_expire_entry,

        TP_PROTO(struct hrtimer *hrtimer, ktime_t *now),

        TP_ARGS(hrtimer, now),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
                __field( s64,                now        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
                __entry->now                = *now;
                __entry->function        = ACCESS_PRIVATE(hrtimer, function);
        ),

        TP_printk("hrtimer=%p function=%ps now=%llu",
                  __entry->hrtimer, __entry->function,
                  (unsigned long long) __entry->now)
);

/*
 * hrtimer_class - event class for hrtimer events that record nothing but
 * the hrtimer pointer, which is stored as an opaque value and printed
 * with %p. Instantiated by the DEFINE_EVENT() users that follow.
 */
DECLARE_EVENT_CLASS(hrtimer_class,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer),

        TP_STRUCT__entry(
                __field( void *,        hrtimer        )
        ),

        TP_fast_assign(
                __entry->hrtimer        = hrtimer;
        ),

        TP_printk("hrtimer=%p", __entry->hrtimer)
);

/**
 * hrtimer_expire_exit - called immediately after the hrtimer callback returns
 * @hrtimer:        pointer to struct hrtimer
 *
 * When used in combination with the hrtimer_expire_entry tracepoint we can
 * determine the runtime of the callback function.
 *
 * This event is an instance of the hrtimer_class event class.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * hrtimer_cancel - called when the hrtimer is canceled
 * @hrtimer:        pointer to struct hrtimer
 *
 * This event is an instance of the hrtimer_class event class.
 */
DEFINE_EVENT(hrtimer_class, hrtimer_cancel,

        TP_PROTO(struct hrtimer *hrtimer),

        TP_ARGS(hrtimer)
);

/**
 * itimer_state - called when itimer is started or canceled
 * @which:        name of the interval timer
 * @value:        the itimers value, itimer is canceled if value->it_value is
 *                zero, otherwise it is started
 * @expires:        the itimers expiry time
 *
 * @value is dereferenced when the event is recorded. it_value and
 * it_interval are each split into a seconds and a nanoseconds field of
 * type long, and printed as "seconds.microseconds" (the nanoseconds part
 * is divided by NSEC_PER_USEC at print time).
 */
TRACE_EVENT(itimer_state,

        TP_PROTO(int which, const struct itimerspec64 *const value,
                 unsigned long long expires),

        TP_ARGS(which, value, expires),

        TP_STRUCT__entry(
                __field(        int,                        which                )
                __field(        unsigned long long,        expires                )
                __field(        long,                        value_sec        )
                __field(        long,                        value_nsec        )
                __field(        long,                        interval_sec        )
                __field(        long,                        interval_nsec        )
        ),

        TP_fast_assign(
                __entry->which                = which;
                __entry->expires        = expires;
                __entry->value_sec        = value->it_value.tv_sec;
                __entry->value_nsec        = value->it_value.tv_nsec;
                __entry->interval_sec        = value->it_interval.tv_sec;
                __entry->interval_nsec        = value->it_interval.tv_nsec;
        ),

        TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld",
                  __entry->which, __entry->expires,
                  __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC,
                  __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC)
);

/**
 * itimer_expire - called when itimer expires
 * @which:        type of the interval timer
 * @pid:        pid of the process which owns the timer
 * @now:        current time, used to calculate the latency of itimer
 *
 * The struct pid is converted to a numeric pid with pid_nr() when the
 * event is recorded; only that number is stored in the event.
 */
TRACE_EVENT(itimer_expire,

        TP_PROTO(int which, struct pid *pid, unsigned long long now),

        TP_ARGS(which, pid, now),

        TP_STRUCT__entry(
                __field( int ,                        which        )
                __field( pid_t,                        pid        )
                __field( unsigned long long,        now        )
        ),

        TP_fast_assign(
                __entry->which        = which;
                __entry->now        = now;
                __entry->pid        = pid_nr(pid);
        ),

        TP_printk("which=%d pid=%d now=%llu", __entry->which,
                  (int) __entry->pid, __entry->now)
);

#ifdef CONFIG_NO_HZ_COMMON

/*
 * TICK_DEP_NAMES lists every tick dependency exactly once. The
 * tick_dep_name(), tick_dep_mask_name() and tick_dep_name_end() helpers are
 * redefined twice below so that the same list expands first to
 * TRACE_DEFINE_ENUM() declarations and then to { mask, "name" } entries
 * for __print_symbolic().
 */
#define TICK_DEP_NAMES                                        \
                tick_dep_mask_name(NONE)                \
                tick_dep_name(POSIX_TIMER)                \
                tick_dep_name(PERF_EVENTS)                \
                tick_dep_name(SCHED)                        \
                tick_dep_name(CLOCK_UNSTABLE)                \
                tick_dep_name(RCU)                        \
                tick_dep_name_end(RCU_EXP)

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/* The MASK will convert to their bits and they need to be processed too */
#define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
#define tick_dep_name_end(sdep)  TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \
        TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);
/* NONE only has a mask defined for it */
#define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep);

/* First expansion: emit the TRACE_DEFINE_ENUM() declarations. */
TICK_DEP_NAMES

#undef tick_dep_name
#undef tick_dep_mask_name
#undef tick_dep_name_end

/*
 * Second set of definitions: each entry becomes a { mask, "name" } pair.
 * Only the _end variant omits the trailing comma, so it must stay last in
 * TICK_DEP_NAMES.
 */
#define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep },
#define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep }

/* Print a TICK_DEP_MASK_* value by name using the table generated above. */
#define show_tick_dep_name(val)                                \
        __print_symbolic(val, TICK_DEP_NAMES)

/**
 * tick_stop - records a success flag and a tick dependency
 * @success:        value stored and printed as "success"
 * @dependency:        TICK_DEP_MASK_* value, printed by name through
 *                show_tick_dep_name()
 *
 * NOTE(review): presumably emitted when the kernel tries to stop the tick -
 * confirm against the call sites.
 */
TRACE_EVENT(tick_stop,

        TP_PROTO(int success, int dependency),

        TP_ARGS(success, dependency),

        TP_STRUCT__entry(
                __field( int ,                success        )
                __field( int ,                dependency )
        ),

        TP_fast_assign(
                __entry->success        = success;
                __entry->dependency        = dependency;
        ),

        /* NOTE(review): the trailing backslash below is not needed here. */
        TP_printk("success=%d dependency=%s",  __entry->success, \
                        show_tick_dep_name(__entry->dependency))
);
#endif

#endif /*  _TRACE_TIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






























    4 










    3 





    3 





    4 


    4 







    3 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/* SPDX-License-Identifier: GPL-2.0-or-later */

#ifndef _NET_NETDEV_LOCK_H
#define _NET_NETDEV_LOCK_H

#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

/*
 * Try to take @dev's instance lock without blocking.
 * Returns the result of mutex_trylock() on dev->lock, converted to bool.
 */
static inline bool netdev_trylock(struct net_device *dev)
{
        bool acquired = mutex_trylock(&dev->lock);

        return acquired;
}

/* Lockdep assertion that @dev's instance lock (dev->lock) is held. */
static inline void netdev_assert_locked(const struct net_device *dev)
{
        lockdep_assert_held(&dev->lock);
}

/*
 * Assert that @dev's instance lock is held, but only while the device is in
 * the NETREG_REGISTERED or NETREG_UNREGISTERING state. In any other
 * registration state no assertion is made.
 */
static inline void
netdev_assert_locked_or_invisible(const struct net_device *dev)
{
        switch (dev->reg_state) {
        case NETREG_REGISTERED:
        case NETREG_UNREGISTERING:
                netdev_assert_locked(dev);
                break;
        default:
                break;
        }
}

/*
 * Tell whether operations on @dev are protected by the instance lock.
 *
 * True when the device sets request_ops_lock, provides queue_mgmt_ops or,
 * with CONFIG_NET_SHAPER enabled, provides net_shaper_ops in its
 * netdev_ops.
 */
static inline bool netdev_need_ops_lock(const struct net_device *dev)
{
        bool needed = false;

        if (dev->request_ops_lock)
                needed = true;
        else if (dev->queue_mgmt_ops)
                needed = true;

#if IS_ENABLED(CONFIG_NET_SHAPER)
        if (dev->netdev_ops->net_shaper_ops)
                needed = true;
#endif

        return needed;
}

/* Take @dev's instance lock, but only if netdev_need_ops_lock() says so. */
static inline void netdev_lock_ops(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev))
                return;

        netdev_lock(dev);
}

/* Release @dev's instance lock, but only if netdev_need_ops_lock() says so. */
static inline void netdev_unlock_ops(struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev))
                return;

        netdev_unlock(dev);
}

/*
 * Assert that whichever lock protects @dev's operations is held: RTNL for
 * devices that do not need the ops lock, the instance lock otherwise.
 */
static inline void netdev_ops_assert_locked(const struct net_device *dev)
{
        if (!netdev_need_ops_lock(dev)) {
                ASSERT_RTNL();
                return;
        }

        lockdep_assert_held(&dev->lock);
}

/*
 * Like netdev_ops_assert_locked(), but the check is skipped unless the
 * device is in the NETREG_REGISTERED or NETREG_UNREGISTERING state.
 */
static inline void
netdev_ops_assert_locked_or_invisible(const struct net_device *dev)
{
        if (dev->reg_state != NETREG_REGISTERED &&
            dev->reg_state != NETREG_UNREGISTERING)
                return;

        netdev_ops_assert_locked(dev);
}

/*
 * Lockdep comparison callback for the netdev instance lock.
 *
 * Returns 0 when @a and @b are the same lockdep map and -1 otherwise.
 *
 * Only lower devices currently grab the instance lock, so no real ordering
 * issues can occur. In the near future, only hardware devices will grab
 * the instance lock, which also does not involve any ordering. Lockdep
 * ordering warnings are therefore suppressed until (if) the instance lock
 * starts being taken on pure SW devices (bond/team/veth/etc).
 */
static inline int netdev_lock_cmp_fn(const struct lockdep_map *a,
                                     const struct lockdep_map *b)
{
        return (a == b) ? 0 : -1;
}

/*
 * netdev_lockdep_set_classes - assign lockdep classes to a net_device's locks
 * @dev: net_device pointer expression. It is parenthesized at every use,
 *       so any pointer expression is accepted, but it is evaluated more
 *       than once and must therefore be free of side effects.
 *
 * Expands to a brace-enclosed block that declares one static lock_class_key
 * per lock kind, so every expansion site gets its own set of keys. It sets
 * the qdisc busylock key pointer, the classes of addr_list_lock, the
 * instance lock and each TX queue's _xmit_lock, and installs
 * netdev_lock_cmp_fn() as the instance lock's lockdep comparison function.
 */
#define netdev_lockdep_set_classes(dev)                                \
{                                                                \
        static struct lock_class_key qdisc_tx_busylock_key;        \
        static struct lock_class_key qdisc_xmit_lock_key;        \
        static struct lock_class_key dev_addr_list_lock_key;        \
        static struct lock_class_key dev_instance_lock_key;        \
        unsigned int i;                                                \
                                                                \
        (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key;        \
        lockdep_set_class(&(dev)->addr_list_lock,                \
                          &dev_addr_list_lock_key);                \
        lockdep_set_class(&(dev)->lock,                                \
                          &dev_instance_lock_key);                \
        lock_set_cmp_fn(&(dev)->lock, netdev_lock_cmp_fn, NULL);        \
        for (i = 0; i < (dev)->num_tx_queues; i++)                \
                lockdep_set_class(&(dev)->_tx[i]._xmit_lock,        \
                                  &qdisc_xmit_lock_key);        \
}

/*
 * Declaration only; the definition lives outside this header.
 * NOTE(review): the signature (notifier_block, event code, opaque pointer)
 * looks like a notifier callback - confirm against the definition.
 */
int netdev_debug_event(struct notifier_block *nb, unsigned long event,
                       void *ptr);

#endif


























 1513 
















 1507 






















 1509 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_STACKTRACE_H
#define __ASM_STACKTRACE_H

#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/llist.h>

#include <asm/memory.h>
#include <asm/pointer_auth.h>
#include <asm/ptrace.h>
#include <asm/sdei.h>

#include <asm/stacktrace/common.h>

/*
 * Declaration only; defined outside this header.
 * NOTE(review): presumably prints a backtrace for @tsk (starting from @regs
 * when given) at printk level @loglvl - confirm against the definition.
 */
extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
                           const char *loglvl);

DECLARE_PER_CPU(unsigned long *, irq_stack_ptr);

static inline struct stack_info stackinfo_get_irq(void)
{
        unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr);
        unsigned long high = low + IRQ_STACK_SIZE;

        return (struct stack_info) {
                .low = low,
                .high = high,
        };
}

static inline bool on_irq_stack(unsigned long sp, unsigned long size)
{
        struct stack_info info = stackinfo_get_irq();
        return stackinfo_on_stack(&info, sp, size);
}

static inline struct stack_info stackinfo_get_task(const struct task_struct *tsk)
{
        unsigned long low = (unsigned long)task_stack_page(tsk);
        unsigned long high = low + THREAD_SIZE;

        return (struct stack_info) {
                .low = low,
                .high = high,
        };
}

static inline bool on_task_stack(const struct task_struct *tsk,
                                 unsigned long sp, unsigned long size)
{
        struct stack_info info = stackinfo_get_task(tsk);
        return stackinfo_on_stack(&info, sp, size);
}

/*
 * True when the current stack pointer (checked with a size of one byte)
 * lies on the kernel stack of the current task.
 */
#define on_thread_stack()        (on_task_stack(current, current_stack_pointer, 1))

#ifdef CONFIG_VMAP_STACK
DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack);

/*
 * Describe the current CPU's overflow stack: the per-CPU overflow_stack
 * array, OVERFLOW_STACK_SIZE bytes long.
 */
static inline struct stack_info stackinfo_get_overflow(void)
{
        unsigned long base = (unsigned long)raw_cpu_ptr(overflow_stack);
        struct stack_info overflow = {
                .low = base,
                .high = base + OVERFLOW_STACK_SIZE,
        };

        return overflow;
}
#else
/* No overflow stack without CONFIG_VMAP_STACK: report an unknown stack. */
#define stackinfo_get_overflow()        stackinfo_get_unknown()
#endif

#if defined(CONFIG_ARM_SDE_INTERFACE) && defined(CONFIG_VMAP_STACK)
DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr);
DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr);

/*
 * Describe the current CPU's SDEI "normal" stack: it starts at this CPU's
 * sdei_stack_normal_ptr value and spans SDEI_STACK_SIZE bytes.
 */
static inline struct stack_info stackinfo_get_sdei_normal(void)
{
        unsigned long base = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr);
        struct stack_info sdei_normal = {
                .low = base,
                .high = base + SDEI_STACK_SIZE,
        };

        return sdei_normal;
}

/*
 * Describe the current CPU's SDEI "critical" stack: it starts at this CPU's
 * sdei_stack_critical_ptr value and spans SDEI_STACK_SIZE bytes.
 */
static inline struct stack_info stackinfo_get_sdei_critical(void)
{
        unsigned long base = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr);
        struct stack_info sdei_critical = {
                .low = base,
                .high = base + SDEI_STACK_SIZE,
        };

        return sdei_critical;
}
#else
/* Without SDEI and VMAP_STACK both lookups report an unknown stack. */
#define stackinfo_get_sdei_normal()        stackinfo_get_unknown()
#define stackinfo_get_sdei_critical()        stackinfo_get_unknown()
#endif

#ifdef CONFIG_EFI
extern u64 *efi_rt_stack_top;

/*
 * Describe the EFI runtime stack: the THREAD_SIZE bytes that end at
 * efi_rt_stack_top.
 */
static inline struct stack_info stackinfo_get_efi(void)
{
        unsigned long top = (u64)efi_rt_stack_top;
        struct stack_info efi_stack = {
                .low = top - THREAD_SIZE,
                .high = top,
        };

        return efi_stack;
}
#endif

#endif        /* __ASM_STACKTRACE_H */
























































































    8 




































    1 




    3 
    1 

    1 





    1 






  179 
  179 






   12 



   11 
    1 









    1 





















   72 











   72 



   63 
    8 

    1 


    8 

   64 




















   71 



















   72 

   72 









  385 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/reset.c
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/hw_breakpoint.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

#include <kvm/arm_arch_timer.h>

#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/fpsimd.h>
#include <asm/ptrace.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/virt.h>

/*
 * Maximum phys_shift supported for any VM on this host.
 * Written by kvm_set_ipa_limit() and read-only after init.
 */
static u32 __ro_after_init kvm_ipa_limit;
/*
 * Value of sve_max_vl() recorded by kvm_arm_init_sve(), which also copies
 * it into the nVHE symbol of the same name.
 */
unsigned int __ro_after_init kvm_host_sve_max_vl;

/*
 * ARMv8 Reset Values
 *
 * PSTATE values installed by kvm_reset_vcpu(): the SVC variant for a vCPU
 * whose EL1 is 32-bit, the EL2 variant for a vCPU with nested
 * virtualization, and the EL1 variant otherwise. Each combines a mode with
 * the mask bits listed in its definition.
 */
#define VCPU_RESET_PSTATE_EL1        (PSR_MODE_EL1h | PSR_A_BIT | PSR_I_BIT | \
                                 PSR_F_BIT | PSR_D_BIT)

#define VCPU_RESET_PSTATE_EL2        (PSR_MODE_EL2h | PSR_A_BIT | PSR_I_BIT | \
                                 PSR_F_BIT | PSR_D_BIT)

#define VCPU_RESET_PSTATE_SVC        (PSR_AA32_MODE_SVC | PSR_AA32_A_BIT | \
                                 PSR_AA32_I_BIT | PSR_AA32_F_BIT)

unsigned int __ro_after_init kvm_sve_max_vl;

/*
 * Record the SVE vector length limits used by KVM.
 *
 * Does nothing when the system has no SVE support. Otherwise it stores the
 * maximum virtualisable vector length in kvm_sve_max_vl (clamped to
 * VL_ARCH_MAX) and the host maximum in kvm_host_sve_max_vl and its nVHE
 * counterpart. Always returns 0.
 */
int __init kvm_arm_init_sve(void)
{
        if (!system_supports_sve())
                return 0;

        kvm_sve_max_vl = sve_max_virtualisable_vl();
        kvm_host_sve_max_vl = sve_max_vl();
        kvm_nvhe_sym(kvm_host_sve_max_vl) = kvm_host_sve_max_vl;

        /*
         * Vector lengths above VL_ARCH_MAX would require the
         * get_sve_reg()/set_sve_reg() ioctl interface to grow support
         * for multiple register slices, so clamp to that limit:
         */
        if (WARN_ON(kvm_sve_max_vl > VL_ARCH_MAX))
                kvm_sve_max_vl = VL_ARCH_MAX;

        /*
         * For now, vector lengths that are not available on every CPU
         * are not offered to guests; tell the user when that reduces
         * the limit:
         */
        if (kvm_sve_max_vl < sve_max_vl())
                pr_warn("KVM: SVE vector length for guests limited to %u bytes\n",
                        kvm_sve_max_vl);

        return 0;
}

static void kvm_vcpu_enable_sve(struct kvm_vcpu *vcpu)
{
        vcpu->arch.sve_max_vl = kvm_sve_max_vl;

        /*
         * Userspace can still customize the vector lengths by writing
         * KVM_REG_ARM64_SVE_VLS.  Allocation is deferred until
         * kvm_arm_vcpu_finalize(), which freezes the configuration.
         */
        set_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &vcpu->kvm->arch.flags);
}

/*
 * Freeze the vcpu's maximum SVE vector length and allocate the buffer that
 * backs vcpu->arch.sve_state, sharing it with the hypervisor.
 *
 * Returns 0 on success, -EIO when the configured vector length fails the
 * sanity check, -ENOMEM when the allocation fails, or the error reported
 * by kvm_share_hyp().
 */
static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
{
        unsigned int vl = vcpu->arch.sve_max_vl;
        size_t state_size;
        void *state;
        int err;

        /*
         * kvm_arm_init_sve(), kvm_vcpu_enable_sve() and set_sve_vls()
         * share responsibility for these properties; verify them once
         * more before committing:
         */
        if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl() ||
                    vl > VL_ARCH_MAX))
                return -EIO;

        state_size = vcpu_sve_state_size(vcpu);
        state = kzalloc(state_size, GFP_KERNEL_ACCOUNT);
        if (!state)
                return -ENOMEM;

        err = kvm_share_hyp(state, state + state_size);
        if (err) {
                /* Sharing failed, so the buffer is not published. */
                kfree(state);
                return err;
        }

        vcpu->arch.sve_state = state;
        vcpu_set_flag(vcpu, VCPU_SVE_FINALIZED);

        return 0;
}

/*
 * Finalize the configuration of @feature on @vcpu.
 *
 * Only KVM_ARM_VCPU_SVE is handled. Returns -EINVAL for any other feature
 * or when the vcpu has no SVE, -EPERM when SVE was already finalized, and
 * otherwise the result of kvm_vcpu_finalize_sve().
 */
int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature)
{
        if (feature != KVM_ARM_VCPU_SVE)
                return -EINVAL;

        if (!vcpu_has_sve(vcpu))
                return -EINVAL;

        if (kvm_arm_vcpu_sve_finalized(vcpu))
                return -EPERM;

        return kvm_vcpu_finalize_sve(vcpu);
}

/*
 * A vcpu counts as finalized unless it has SVE and that SVE configuration
 * has not been finalized yet.
 */
bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu)
{
        return !vcpu_has_sve(vcpu) || kvm_arm_vcpu_sve_finalized(vcpu);
}

/*
 * Release the resources this file associates with @vcpu: unshare the vcpu
 * structure and any SVE state from the hypervisor, then free the SVE
 * state, the VNCR page, the VNCR TLB and the CCSIDR array.
 */
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        void *sve_buf = vcpu->arch.sve_state;

        kvm_unshare_hyp(vcpu, vcpu + 1);

        /* kfree(NULL) is a no-op, so freeing inside the check is equivalent. */
        if (sve_buf) {
                kvm_unshare_hyp(sve_buf, sve_buf + vcpu_sve_state_size(vcpu));
                kfree(sve_buf);
        }

        free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
        kfree(vcpu->arch.vncr_tlb);
        kfree(vcpu->arch.ccsidr);
}

/* Zero the whole SVE state buffer of @vcpu; a no-op for vcpus without SVE. */
static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu)
{
        if (!vcpu_has_sve(vcpu))
                return;

        memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu));
}

/**
 * kvm_reset_vcpu - sets core registers and sys_regs to reset value
 * @vcpu: The VCPU pointer
 *
 * This function sets the registers on the virtual CPU struct to their
 * architecturally defined reset values, except for registers whose reset is
 * deferred until kvm_arm_vcpu_finalize().
 *
 * Note: This function can be called from two paths: The KVM_ARM_VCPU_INIT
 * ioctl or as part of handling a request issued by another VCPU in the PSCI
 * handling code.  In the first case, the VCPU will not be loaded, and in the
 * second case the VCPU will be loaded.  Because this function operates purely
 * on the memory-backed values of system registers, we want to do a full put if
 * we were loaded (handling a request) and load the values back at the end of
 * the function.  Otherwise we leave the state alone.  In both cases, we
 * disable preemption around the vcpu reset as we would otherwise race with
 * preempt notifiers which also call put/load.
 */
void kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
        struct vcpu_reset_state reset_state;
        bool loaded;
        u32 pstate;

        /*
         * Take a private copy of the pending reset request and clear it
         * under mp_state_lock, so the request is consumed exactly once.
         */
        spin_lock(&vcpu->arch.mp_state_lock);
        reset_state = vcpu->arch.reset_state;
        vcpu->arch.reset_state.reset = false;
        spin_unlock(&vcpu->arch.mp_state_lock);

        preempt_disable();
        /* A vcpu->cpu value other than -1 is treated as "currently loaded". */
        loaded = (vcpu->cpu != -1);
        if (loaded)
                kvm_arch_vcpu_put(vcpu);

        /*
         * Before SVE is finalized, only (re-)enable it when the feature was
         * requested; once finalized, clear the existing SVE state instead.
         */
        if (!kvm_arm_vcpu_sve_finalized(vcpu)) {
                if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE))
                        kvm_vcpu_enable_sve(vcpu);
        } else {
                kvm_vcpu_reset_sve(vcpu);
        }

        /* Pick the reset PSTATE: 32-bit EL1, nested virt (EL2), or plain EL1. */
        if (vcpu_el1_is_32bit(vcpu))
                pstate = VCPU_RESET_PSTATE_SVC;
        else if (vcpu_has_nv(vcpu))
                pstate = VCPU_RESET_PSTATE_EL2;
        else
                pstate = VCPU_RESET_PSTATE_EL1;

        /* Reset core registers */
        memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu)));
        memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs));
        vcpu->arch.ctxt.spsr_abt = 0;
        vcpu->arch.ctxt.spsr_und = 0;
        vcpu->arch.ctxt.spsr_irq = 0;
        vcpu->arch.ctxt.spsr_fiq = 0;
        vcpu_gp_regs(vcpu)->pstate = pstate;

        /* Reset system registers */
        kvm_reset_sys_regs(vcpu);

        /*
         * Additional reset state handling that PSCI may have imposed on us.
         * Must be done after all the sys_reg reset.
         */
        if (reset_state.reset) {
                unsigned long target_pc = reset_state.pc;

                /* Gracefully handle Thumb2 entry point */
                if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) {
                        target_pc &= ~1UL;
                        vcpu_set_thumb(vcpu);
                }

                /* Propagate caller endianness */
                if (reset_state.be)
                        kvm_vcpu_set_be(vcpu);

                /* Install the requested entry point and first argument. */
                *vcpu_pc(vcpu) = target_pc;
                vcpu_set_reg(vcpu, 0, reset_state.r0);
        }

        /* Reset timer */
        kvm_timer_vcpu_reset(vcpu);

        /* Undo the put above so a loaded vCPU is loaded again on this CPU. */
        if (loaded)
                kvm_arch_vcpu_load(vcpu, smp_processor_id());
        preempt_enable();
}

/*
 * Return the number of physical address bits for @kvm. The @kvm argument
 * is currently unused: every VM gets the host-wide kvm_ipa_limit.
 */
u32 kvm_get_pa_bits(struct kvm *kvm)
{
        /* Fixed limit until we can configure ID_AA64MMFR0.PARange */
        return kvm_ipa_limit;
}

/* Return the host-wide IPA size limit computed by kvm_set_ipa_limit(). */
u32 get_kvm_ipa_limit(void)
{
        return kvm_ipa_limit;
}

/*
 * Compute kvm_ipa_limit from the sanitised ID_AA64MMFR0_EL1 register.
 *
 * Returns 0 on success, or -EINVAL when the TGRAN_2 field says PAGE_SIZE
 * is not supported at Stage-2 or holds a value that is not recognised.
 */
int __init kvm_set_ipa_limit(void)
{
        const char *note = "";
        unsigned int parange, tgran_2;
        u64 mmfr0;

        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        parange = cpuid_feature_extract_unsigned_field(mmfr0,
                                ID_AA64MMFR0_EL1_PARANGE_SHIFT);

        /*
         * With 4K and 16K pages an IPA size above 48 bits needs LPA2.
         * Without LPA2, cap the range at 48 bits even if the system
         * reports more.
         */
        if (!kvm_lpa2_is_enabled() && PAGE_SIZE != SZ_64K)
                parange = min(parange, (unsigned int)ID_AA64MMFR0_EL1_PARANGE_48);

        /*
         * Use ARMv8.5-GTG to check that PAGE_SIZE is supported at
         * Stage-2. If it is not, things will stop very quickly.
         */
        tgran_2 = cpuid_feature_extract_unsigned_field(mmfr0,
                                ID_AA64MMFR0_EL1_TGRAN_2_SHIFT);
        switch (tgran_2) {
        case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_NONE:
                kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n");
                return -EINVAL;
        case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_DEFAULT:
                kvm_debug("PAGE_SIZE supported at Stage-2 (default)\n");
                break;
        case ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MIN ... ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_MAX:
                kvm_debug("PAGE_SIZE supported at Stage-2 (advertised)\n");
                break;
        default:
                kvm_err("Unsupported value for TGRAN_2, giving up\n");
                return -EINVAL;
        }

        kvm_ipa_limit = id_aa64mmfr0_parange_to_phys_shift(parange);

        /* Flag limits below KVM_PHYS_SHIFT in the boot log. */
        if (kvm_ipa_limit < KVM_PHYS_SHIFT)
                note = " (Reduced IPA size, limited VM/VMM compatibility)";

        kvm_info("IPA Size Limit: %d bits%s\n", kvm_ipa_limit, note);

        return 0;
}
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fscrypt.h: declarations for per-file encryption
 *
 * Filesystems that implement per-file encryption must include this header
 * file.
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Written by Michael Halcrow, 2015.
 * Modified by Jaegeuk Kim, 2015.
 */
#ifndef _LINUX_FSCRYPT_H
#define _LINUX_FSCRYPT_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <uapi/linux/fscrypt.h>

/*
 * The length in bytes of every file contents block must be divisible by this
 * value.  This is needed to ensure that all contents encryption modes will
 * work, as some of the supported modes don't support arbitrarily byte-aligned
 * messages.
 *
 * Since the needed alignment is 16 bytes, most filesystems will meet this
 * requirement naturally, as typical block sizes are powers of 2.  However, if a
 * filesystem can generate arbitrarily byte-aligned block lengths (e.g., via
 * compression), then it will need to pad to this alignment before encryption.
 */
#define FSCRYPT_CONTENTS_ALIGNMENT 16

/*
 * Forward declarations, so that the declarations below can refer to these
 * types through pointers without needing their full definitions.
 */
union fscrypt_policy;
struct fscrypt_inode_info;
struct fs_parameter;
struct seq_file;

/*
 * A counted byte string: a pointer plus an explicit length.  FSTR_TO_QSTR()
 * converts one of these to a struct qstr using the same two fields.
 */
struct fscrypt_str {
        unsigned char *name;    /* start of the string's bytes */
        u32 len;                /* number of bytes at @name */
};

/*
 * A filename as handled by fscrypt_setup_filename(), fscrypt_match_name() and
 * fscrypt_free_filename().
 */
struct fscrypt_name {
        /* the name as passed in to fscrypt_setup_filename() */
        const struct qstr *usr_fname;
        /* the name that is compared against directory entries */
        struct fscrypt_str disk_name;
        /* NOTE(review): presumably hashes carried by a no-key name - confirm */
        u32 hash;
        u32 minor_hash;
        /* buffer whose ->name is kfree()d by fscrypt_free_filename() */
        struct fscrypt_str crypto_buf;
        /* NOTE(review): presumably set for names looked up without the key */
        bool is_nokey_name;
};

/* Initializer for a struct fscrypt_str with the given name pointer and length */
#define FSTR_INIT(n, l)                { .name = n, .len = l }
/* Build a struct qstr initializer from a pointer to a struct fscrypt_str */
#define FSTR_TO_QSTR(f)                QSTR_INIT((f)->name, (f)->len)
/* Accessors for the disk_name of a struct fscrypt_name pointer */
#define fname_name(p)                ((p)->disk_name.name)
#define fname_len(p)                ((p)->disk_name.len)

/*
 * Maximum value for the third parameter (@len) of
 * fscrypt_operations.set_context().
 */
#define FSCRYPT_SET_CONTEXT_MAX_SIZE        40

#ifdef CONFIG_FS_ENCRYPTION

/*
 * Crypto operations for filesystems.
 *
 * A filesystem describes its fscrypt support with one of these tables: three
 * single-bit capability flags, a legacy key description prefix, and callbacks
 * for reading and writing per-inode fscrypt contexts and for querying
 * filesystem properties.  fscrypt_set_ops() stores a pointer to the table in
 * the super_block's ->s_cop.
 */
struct fscrypt_operations {

        /*
         * If set, then fs/crypto/ will allocate a global bounce page pool the
         * first time an encryption key is set up for a file.  The bounce page
         * pool is required by the following functions:
         *
         * - fscrypt_encrypt_pagecache_blocks()
         * - fscrypt_zeroout_range() for files not using inline crypto
         *
         * If the filesystem doesn't use those, it doesn't need to set this.
         */
        unsigned int needs_bounce_pages : 1;

        /*
         * If set, then fs/crypto/ will allow the use of encryption settings
         * that assume inode numbers fit in 32 bits (i.e.
         * FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64}), provided that the other
         * prerequisites for these settings are also met.  This is only useful
         * if the filesystem wants to support inline encryption hardware that is
         * limited to 32-bit or 64-bit data unit numbers and where programming
         * keyslots is very slow.
         */
        unsigned int has_32bit_inodes : 1;

        /*
         * If set, then fs/crypto/ will allow users to select a crypto data unit
         * size that is less than the filesystem block size.  This is done via
         * the log2_data_unit_size field of the fscrypt policy.  This flag is
         * not compatible with filesystems that encrypt variable-length blocks
         * (i.e. blocks that aren't all equal to filesystem's block size), for
         * example as a result of compression.  It's also not compatible with
         * the fscrypt_encrypt_block_inplace() and
         * fscrypt_decrypt_block_inplace() functions.
         */
        unsigned int supports_subblock_data_units : 1;

        /*
         * This field exists only for backwards compatibility reasons and should
         * only be set by the filesystems that are setting it already.  It
         * contains the filesystem-specific key description prefix that is
         * accepted for "logon" keys for v1 fscrypt policies.  This
         * functionality is deprecated in favor of the generic prefix
         * "fscrypt:", which itself is deprecated in favor of the filesystem
         * keyring ioctls such as FS_IOC_ADD_ENCRYPTION_KEY.  Filesystems that
         * are newly adding fscrypt support should not set this field.
         */
        const char *legacy_key_prefix;

        /*
         * Get the fscrypt context of the given inode.
         *
         * @inode: the inode whose context to get
         * @ctx: the buffer into which to get the context
         * @len: length of the @ctx buffer in bytes
         *
         * Return: On success, returns the length of the context in bytes; this
         *           may be less than @len.  On failure, returns -ENODATA if the
         *           inode doesn't have a context, -ERANGE if the context is
         *           longer than @len, or another -errno code.
         */
        int (*get_context)(struct inode *inode, void *ctx, size_t len);

        /*
         * Set an fscrypt context on the given inode.
         *
         * @inode: the inode whose context to set.  The inode won't already have
         *           an fscrypt context.
         * @ctx: the context to set
         * @len: length of @ctx in bytes (at most FSCRYPT_SET_CONTEXT_MAX_SIZE)
         * @fs_data: If called from fscrypt_set_context(), this will be the
         *             value the filesystem passed to fscrypt_set_context().
         *             Otherwise (i.e. when called from
         *             FS_IOC_SET_ENCRYPTION_POLICY) this will be NULL.
         *
         * i_rwsem will be held for write.
         *
         * Return: 0 on success, -errno on failure.
         */
        int (*set_context)(struct inode *inode, const void *ctx, size_t len,
                           void *fs_data);

        /*
         * Get the dummy fscrypt policy in use on the filesystem (if any).
         *
         * Filesystems only need to implement this function if they support the
         * test_dummy_encryption mount option.
         *
         * @sb: the super_block of the filesystem to query
         *
         * Return: A pointer to the dummy fscrypt policy, if the filesystem is
         *           mounted with test_dummy_encryption; otherwise NULL.
         */
        const union fscrypt_policy *(*get_dummy_policy)(struct super_block *sb);

        /*
         * Check whether a directory is empty.  i_rwsem will be held for write.
         *
         * @inode: the directory inode to check
         */
        bool (*empty_dir)(struct inode *inode);

        /*
         * Check whether the filesystem's inode numbers and UUID are stable,
         * meaning that they will never be changed even by offline operations
         * such as filesystem shrinking and therefore can be used in the
         * encryption without the possibility of files becoming unreadable.
         *
         * Filesystems only need to implement this function if they want to
         * support the FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{32,64} flags.  These
         * flags are designed to work around the limitations of UFS and eMMC
         * inline crypto hardware, and they shouldn't be used in scenarios where
         * such hardware isn't being used.
         *
         * Leaving this NULL is equivalent to always returning false.
         */
        bool (*has_stable_inodes)(struct super_block *sb);

        /*
         * Return an array of pointers to the block devices to which the
         * filesystem may write encrypted file contents, NULL if the filesystem
         * only has a single such block device, or an ERR_PTR() on error.
         *
         * On successful non-NULL return, *num_devs is set to the number of
         * devices in the returned array.  The caller must free the returned
         * array using kfree().
         *
         * If the filesystem can use multiple block devices (other than block
         * devices that aren't used for encrypted file contents, such as
         * external journal devices), and wants to support inline encryption,
         * then it must implement this function.  Otherwise it's not needed.
         */
        struct block_device **(*get_devices)(struct super_block *sb,
                                             unsigned int *num_devs);
};

/*
 * fscrypt's ->d_revalidate implementation.  It is declared ahead of the inline
 * helpers below because they compare a dentry's ->d_op->d_revalidate against
 * it to tell whether revalidation is being done only on behalf of fscrypt.
 */
int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
                         struct dentry *dentry, unsigned int flags);

/*
 * Return the inode's ->i_crypt_info pointer, read with acquire semantics.
 */
static inline struct fscrypt_inode_info *
fscrypt_get_inode_info(const struct inode *inode)
{
        struct fscrypt_inode_info *ci;

        /*
         * ->i_crypt_info can be published by another task at any time via the
         * cmpxchg_release() in fscrypt_setup_encryption_info().  That RELEASE
         * must be matched by an ACQUIRE on the reading side, hence
         * smp_load_acquire(): it makes the memory published by the other task
         * safe to access through the returned pointer.
         */
        ci = smp_load_acquire(&inode->i_crypt_info);

        return ci;
}

/**
 * fscrypt_needs_contents_encryption() - check whether an inode needs
 *                                         contents encryption
 * @inode: the inode to check
 *
 * Return: %true iff the inode is an encrypted regular file and the kernel was
 * built with fscrypt support.
 *
 * If you need to know whether the encrypt bit is set even when the kernel was
 * built without fscrypt support, you must use IS_ENCRYPTED() directly instead.
 */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode);
}

/*
 * Adding an encryption key can cause d_splice_alias() to move a directory's
 * no-key alias to its plaintext alias.  When that happens DCACHE_NOKEY_NAME
 * has to be cleared, and d_revalidate can possibly be disabled as well.  The
 * opposite transition needs no support: fscrypt never allows a no-key name to
 * be the source or target of a rename().
 */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
        /*
         * VFS calls fscrypt_handle_d_move even for non-fscrypt filesystems,
         * so most dentries seen here have nothing to clear.
         */
        if (!(dentry->d_flags & DCACHE_NOKEY_NAME))
                return;

        dentry->d_flags &= ~DCACHE_NOKEY_NAME;

        /*
         * Revalidation may also be in use by some other filesystem feature;
         * only switch it off when fscrypt's callback is the one installed.
         */
        if (dentry->d_op->d_revalidate != fscrypt_d_revalidate)
                return;

        dentry->d_flags &= ~DCACHE_OP_REVALIDATE;
}

/**
 * fscrypt_is_nokey_name() - test whether a dentry is a no-key name
 * @dentry: the dentry to check
 *
 * A no-key dentry is one that was created in an encrypted directory whose
 * encryption key had not been added yet.  It may be positive or negative.
 *
 * Filesystems must fail with ENOKEY any attempt to create a new filename in an
 * encrypted directory when the new filename's dentry is a no-key dentry.  That
 * applies to ->create(), ->mkdir(), ->mknod(), ->symlink(), ->rename(), and
 * ->link(); for the last two, fscrypt_prepare_rename() and
 * fscrypt_prepare_link() already take care of it.
 *
 * The check is needed because creating a filename requires the directory's
 * encryption key, and finding the key on the directory inode during the final
 * filesystem operation does not prove that it was available during the
 * preceding dentry lookup.  Only a lookup done with the key can have checked
 * whether the filename already exists in the directory, and only then is the
 * new file's dentry free of a stale no-key flag that would get it invalidated.
 *
 * Return: %true if the dentry is a no-key name
 */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return (dentry->d_flags & DCACHE_NOKEY_NAME) != 0;
}

/*
 * Set up a dentry's fscrypt-related flags: mark it as a no-key name when
 * @is_nokey_name is true, otherwise drop DCACHE_OP_REVALIDATE if the only
 * revalidation in use is fscrypt's.
 *
 * ->d_lock is only taken when ->d_flags actually has to be written.  Peeking
 * at d_flags for DCACHE_OP_REVALIDATE without the lock is not strictly proper,
 * but in the unlikely case of a race the worst that can happen is that
 * DCACHE_OP_REVALIDATE stays set and an extra d_revalidate call is paid for.
 */
static inline void fscrypt_prepare_dentry(struct dentry *dentry,
                                          bool is_nokey_name)
{
        if (is_nokey_name) {
                spin_lock(&dentry->d_lock);
                dentry->d_flags |= DCACHE_NOKEY_NAME;
                spin_unlock(&dentry->d_lock);
                return;
        }

        if (!(dentry->d_flags & DCACHE_OP_REVALIDATE))
                return;
        if (dentry->d_op->d_revalidate != fscrypt_d_revalidate)
                return;

        /*
         * From fscrypt's point of view, unencrypted dentries and encrypted
         * dentries whose key is available are always valid, so there is no
         * reason to keep paying for fscrypt_d_revalidate calls.
         */
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_OP_REVALIDATE;
        spin_unlock(&dentry->d_lock);
}

/*
 * crypto.c
 *
 * Entry points for encrypting and decrypting file contents.  When
 * CONFIG_FS_ENCRYPTION is disabled, the stubs in the #else branch of this
 * header replace them: the enqueue helper becomes a no-op and the
 * encrypt/decrypt functions fail with -EOPNOTSUPP.
 */
void fscrypt_enqueue_decrypt_work(struct work_struct *);

/* Encryption: one variant takes a pagecache folio, the other a single page. */
struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio,
                size_t len, size_t offs, gfp_t gfp_flags);
int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num, gfp_t gfp_flags);

/* Decryption counterparts of the two functions above. */
int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len,
                                     size_t offs);
int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
                                  unsigned int len, unsigned int offs,
                                  u64 lblk_num);

/* A page is treated as a bounce page when it has no ->mapping. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return !page->mapping;
}

/*
 * Return the page pointer that is stored in a bounce page's private field.
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        unsigned long priv = page_private(bounce_page);

        return (struct page *)priv;
}

/* A folio is treated as a bounce folio when it has no ->mapping. */
static inline bool fscrypt_is_bounce_folio(struct folio *folio)
{
        return !folio->mapping;
}

/* Return the folio pointer that is stored in a bounce folio's ->private. */
static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
{
        void *priv = bounce_folio->private;

        return priv;
}

/*
 * Release a bounce page.
 * NOTE(review): presumably a page handed out by
 * fscrypt_encrypt_pagecache_blocks() - confirm against crypto.c.
 */
void fscrypt_free_bounce_page(struct page *bounce_page);

/*
 * policy.c
 *
 * The fscrypt_ioctl_*() functions each take a struct file and a userspace
 * pointer (@arg).  The remaining functions operate on inodes and their
 * fscrypt contexts.
 */
int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg);
int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg);
int fscrypt_has_permitted_context(struct inode *parent, struct inode *child);
int fscrypt_context_for_new_inode(void *ctx, struct inode *inode);
/* @fs_data is handed through to fscrypt_operations.set_context() */
int fscrypt_set_context(struct inode *inode, void *fs_data);

/*
 * Holder for a dummy fscrypt policy.  ->policy is NULL when no dummy policy is
 * set (see fscrypt_is_dummy_policy_set()); otherwise it points to memory that
 * fscrypt_free_dummy_policy() releases with kfree().
 */
struct fscrypt_dummy_policy {
        const union fscrypt_policy *policy;
};

/*
 * Helpers for handling a struct fscrypt_dummy_policy: parse one from a
 * filesystem parameter, compare two of them, and show the setting through a
 * seq_file.
 */
int fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
                                    struct fscrypt_dummy_policy *dummy_policy);
bool fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
                                  const struct fscrypt_dummy_policy *p2);
void fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
                                        struct super_block *sb);
/* Return true if @dummy_policy currently holds a (non-NULL) policy. */
static inline bool
fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy)
{
        return !!dummy_policy->policy;
}
/*
 * Free the policy held by @dummy_policy, if any, and reset the pointer to NULL
 * so that the holder reads as "not set" afterwards.
 */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
        const union fscrypt_policy *old = dummy_policy->policy;

        kfree(old);
        dummy_policy->policy = NULL;
}

/*
 * keyring.c
 *
 * Teardown of a super_block's keyring, plus the key management ioctl handlers.
 * Each handler takes a struct file and a userspace pointer (@arg).
 */
void fscrypt_destroy_keyring(struct super_block *sb);
int fscrypt_ioctl_add_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key(struct file *filp, void __user *arg);
int fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg);
int fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg);

/*
 * keysetup.c
 *
 * Per-inode encryption info setup and teardown.
 */
/* @encrypt_ret is an output parameter (a bool written through the pointer). */
int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
                              bool *encrypt_ret);
void fscrypt_put_encryption_info(struct inode *inode);
void fscrypt_free_inode(struct inode *inode);
int fscrypt_drop_inode(struct inode *inode);

/*
 * fname.c
 *
 * Filename encryption and the setup of struct fscrypt_name.
 */
int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname,
                          u8 *out, unsigned int olen);
/* The encrypted length is returned through @encrypted_len_ret. */
bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len,
                                  u32 max_len, u32 *encrypted_len_ret);
/* Fills in @fname; release it with fscrypt_free_filename() when done. */
int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname,
                           int lookup, struct fscrypt_name *fname);

static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
        kfree(fname->crypto_buf.name);
}

/* Allocate and free the buffer described by a struct fscrypt_str. */
int fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                               struct fscrypt_str *crypto_str);
void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str);
/* Convert the on-disk name @iname to its user-visible form in @oname. */
int fscrypt_fname_disk_to_usr(const struct inode *inode,
                              u32 hash, u32 minor_hash,
                              const struct fscrypt_str *iname,
                              struct fscrypt_str *oname);
/* Test whether the directory entry name @de_name matches @fname. */
bool fscrypt_match_name(const struct fscrypt_name *fname,
                        const u8 *de_name, u32 de_name_len);
u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name);

/*
 * bio.c
 *
 * bio-level helpers.  Without CONFIG_FS_ENCRYPTION, fscrypt_decrypt_bio()
 * always returns true and fscrypt_zeroout_range() fails with -EOPNOTSUPP.
 */
bool fscrypt_decrypt_bio(struct bio *bio);
int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
                          sector_t pblk, unsigned int len);

/*
 * hooks.c
 *
 * Hooks for filesystems to call from their inode and file operations.
 * NOTE(review): the double-underscore variants are presumably reached through
 * inline wrappers defined further down in this header (e.g. the
 * fscrypt_prepare_rename() and fscrypt_prepare_link() mentioned in the
 * fscrypt_is_nokey_name() documentation) - confirm before calling them
 * directly.
 */
int fscrypt_file_open(struct inode *inode, struct file *filp);
int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                           struct dentry *dentry);
int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags);
int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                             struct fscrypt_name *fname);
int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry);
int __fscrypt_prepare_readdir(struct inode *dir);
int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr);
int fscrypt_prepare_setflags(struct inode *inode,
                             unsigned int oldflags, unsigned int flags);
/* Symlink support: prepare, encrypt, read back and stat symlink targets. */
int fscrypt_prepare_symlink(struct inode *dir, const char *target,
                            unsigned int len, unsigned int max_len,
                            struct fscrypt_str *disk_link);
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
                              unsigned int len, struct fscrypt_str *disk_link);
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
                                unsigned int max_size,
                                struct delayed_call *done);
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat);
/* Install @s_cop as the fscrypt operations table of super_block @sb. */
static inline void
fscrypt_set_ops(struct super_block *sb, const struct fscrypt_operations *s_cop)
{
        sb->s_cop = s_cop;
}
#else  /* !CONFIG_FS_ENCRYPTION */

/* !CONFIG_FS_ENCRYPTION stub: there is never any encryption info. */
static inline struct fscrypt_inode_info *
fscrypt_get_inode_info(const struct inode *inode)
{
        return NULL;    /* fscrypt support not built in */
}

/*
 * !CONFIG_FS_ENCRYPTION stub: always false, whatever the inode's flags are.
 */
static inline bool fscrypt_needs_contents_encryption(const struct inode *inode)
{
        return false;   /* fscrypt support not built in */
}

/* !CONFIG_FS_ENCRYPTION stub: no dentry flags to adjust. */
static inline void fscrypt_handle_d_move(struct dentry *dentry)
{
        /* nothing to do */
}

/* !CONFIG_FS_ENCRYPTION stub: no dentry is ever reported as a no-key name. */
static inline bool fscrypt_is_nokey_name(const struct dentry *dentry)
{
        return false;   /* fscrypt support not built in */
}

/* !CONFIG_FS_ENCRYPTION stub: leaves the dentry untouched. */
static inline void
fscrypt_prepare_dentry(struct dentry *dentry, bool is_nokey_name)
{
        /* nothing to do */
}

/* crypto.c */
/* !CONFIG_FS_ENCRYPTION stub: the work item is ignored. */
static inline void fscrypt_enqueue_decrypt_work(struct work_struct *work)
{
        /* nothing to do */
}

/*
 * !CONFIG_FS_ENCRYPTION stub: encryption is unavailable, so an
 * ERR_PTR(-EOPNOTSUPP) is returned instead of a page.
 */
static inline struct page *
fscrypt_encrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs,
                                 gfp_t gfp_flags)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page,
                              unsigned int len, unsigned int offs,
                              u64 lblk_num, gfp_t gfp_flags)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page,
                              unsigned int len, unsigned int offs,
                              u64 lblk_num)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: no page is ever reported as a bounce page. */
static inline bool fscrypt_is_bounce_page(struct page *page)
{
        return false;   /* fscrypt support not built in */
}

/*
 * !CONFIG_FS_ENCRYPTION stub: warns (once) when called and returns
 * ERR_PTR(-EINVAL).
 */
static inline struct page *fscrypt_pagecache_page(struct page *bounce_page)
{
        /* fscrypt_is_bounce_page() is always false in this configuration */
        WARN_ON_ONCE(1);

        return ERR_PTR(-EINVAL);
}

/* !CONFIG_FS_ENCRYPTION stub: no folio is ever reported as a bounce folio. */
static inline bool fscrypt_is_bounce_folio(struct folio *folio)
{
        return false;   /* fscrypt support not built in */
}

/*
 * !CONFIG_FS_ENCRYPTION stub: warns (once) when called and returns
 * ERR_PTR(-EINVAL).
 */
static inline struct folio *fscrypt_pagecache_folio(struct folio *bounce_folio)
{
        /* fscrypt_is_bounce_folio() is always false in this configuration */
        WARN_ON_ONCE(1);

        return ERR_PTR(-EINVAL);
}

/* !CONFIG_FS_ENCRYPTION stub: nothing is freed. */
static inline void fscrypt_free_bounce_page(struct page *bounce_page)
{
        /* nothing to do */
}

/* policy.c */
/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 0 without looking at the inodes. */
static inline int
fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
{
        return 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
fscrypt_set_context(struct inode *inode, void *fs_data)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_ENCRYPTION: deliberately empty placeholder, so that code which
 * embeds or passes a struct fscrypt_dummy_policy still compiles.
 */
struct fscrypt_dummy_policy {
};

/*
 * !CONFIG_FS_ENCRYPTION stub: parsing always fails with -EINVAL and
 * @dummy_policy is not touched.
 */
static inline int
fscrypt_parse_test_dummy_encryption(const struct fs_parameter *param,
                                    struct fscrypt_dummy_policy *dummy_policy)
{
        /* note: -EINVAL here, unlike the -EOPNOTSUPP used by most stubs */
        return -EINVAL;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: struct fscrypt_dummy_policy is empty in this
 * configuration, and any two of them are reported as equal.
 */
static inline bool
fscrypt_dummy_policies_equal(const struct fscrypt_dummy_policy *p1,
                             const struct fscrypt_dummy_policy *p2)
{
        return true;    /* nothing to compare */
}

/* !CONFIG_FS_ENCRYPTION stub: writes nothing to @seq. */
static inline void
fscrypt_show_test_dummy_encryption(struct seq_file *seq, char sep,
                                   struct super_block *sb)
{
        /* nothing to do */
}

/* !CONFIG_FS_ENCRYPTION stub: a dummy policy is never reported as set. */
static inline bool
fscrypt_is_dummy_policy_set(const struct fscrypt_dummy_policy *dummy_policy)
{
        return false;   /* fscrypt support not built in */
}

/* !CONFIG_FS_ENCRYPTION stub: the empty placeholder holds nothing to free. */
static inline void
fscrypt_free_dummy_policy(struct fscrypt_dummy_policy *dummy_policy)
{
        /* nothing to do */
}

/* keyring.c */
/* !CONFIG_FS_ENCRYPTION stub: does nothing with @sb. */
static inline void fscrypt_destroy_keyring(struct super_block *sb)
{
        /* nothing to do */
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_add_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_remove_key(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_remove_key_all_users(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: the ioctl always fails with -EOPNOTSUPP. */
static inline int
fscrypt_ioctl_get_key_status(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/* keysetup.c */

/*
 * !CONFIG_FS_ENCRYPTION stub: creating an inode in an encrypted directory is
 * refused with -EOPNOTSUPP; any other directory is fine (0).  @encrypt_ret is
 * not written.
 */
static inline int
fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
                          bool *encrypt_ret)
{
        return IS_ENCRYPTED(dir) ? -EOPNOTSUPP : 0;
}

/* !CONFIG_FS_ENCRYPTION stub: there is no encryption info to release. */
static inline void fscrypt_put_encryption_info(struct inode *inode)
{
        /* nothing to do */
}

/* !CONFIG_FS_ENCRYPTION stub: does nothing with @inode. */
static inline void fscrypt_free_inode(struct inode *inode)
{
        /* nothing to do */
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 0. */
static inline int fscrypt_drop_inode(struct inode *inode)
{
        return 0;       /* fscrypt support not built in */
}

/* fname.c */
static inline int fscrypt_setup_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         int lookup, struct fscrypt_name *fname)
{
        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        memset(fname, 0, sizeof(*fname));
        fname->usr_fname = iname;
        fname->disk_name.name = (unsigned char *)iname->name;
        fname->disk_name.len = iname->len;
        return 0;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: the stub fscrypt_setup_filename() allocates
 * nothing, so there is nothing to free here.
 */
static inline void fscrypt_free_filename(struct fscrypt_name *fname)
{
        /* nothing to do */
}

/*
 * !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP; @crypto_str is
 * not touched.
 */
static inline int
fscrypt_fname_alloc_buffer(u32 max_encrypted_len,
                           struct fscrypt_str *crypto_str)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: nothing is freed. */
static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str)
{
        /* nothing to do */
}

/*
 * !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP; @oname is not
 * written.
 */
static inline int
fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash,
                          const struct fscrypt_str *iname,
                          struct fscrypt_str *oname)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_ENCRYPTION variant: with encryption support disabled this is a
 * plain comparison.  The directory entry name matches iff it has the same
 * length as @fname's disk name and the same bytes.
 */
static inline bool fscrypt_match_name(const struct fscrypt_name *fname,
                                      const u8 *de_name, u32 de_name_len)
{
        const struct fscrypt_str *disk = &fname->disk_name;

        return de_name_len == disk->len &&
               memcmp(de_name, disk->name, disk->len) == 0;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: warns (once) when called and returns 0.
 */
static inline u64
fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name)
{
        WARN_ON_ONCE(1);

        return 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always returns 1, whatever the dentry is. */
static inline int
fscrypt_d_revalidate(struct inode *dir, const struct qstr *name,
                     struct dentry *dentry, unsigned int flags)
{
        return 1;
}

/* bio.c */
/* !CONFIG_FS_ENCRYPTION stub: leaves the bio alone and returns true. */
static inline bool fscrypt_decrypt_bio(struct bio *bio)
{
        return true;    /* nothing to decrypt */
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk,
                      unsigned int len)
{
        return -EOPNOTSUPP;
}

/* hooks.c */

/*
 * !CONFIG_FS_ENCRYPTION stub: opening an encrypted inode is refused with
 * -EOPNOTSUPP; any other inode may be opened (0).
 */
static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
{
        return IS_ENCRYPTED(inode) ? -EOPNOTSUPP : 0;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
__fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                       struct dentry *dentry)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
__fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP; @fname is not
 * filled in.
 */
static inline int
__fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                         struct fscrypt_name *fname)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int __fscrypt_prepare_readdir(struct inode *dir)
{
        return -EOPNOTSUPP;     /* fscrypt support not built in */
}

/* !CONFIG_FS_ENCRYPTION stub: always fails with -EOPNOTSUPP. */
static inline int
__fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_ENCRYPTION stub: always returns 0, without looking at the old
 * or new flags.
 */
static inline int
fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags,
                         unsigned int flags)
{
        return 0;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION is not enabled.
 *
 * A symlink in a directory flagged as encrypted is refused with -EOPNOTSUPP.
 * Otherwise @disk_link is pointed straight at @target and its length is set
 * to @len + 1; -ENAMETOOLONG is returned if that exceeds @max_len, else 0.
 * Note that @disk_link has already been filled in when -ENAMETOOLONG is
 * returned.
 */
static inline int fscrypt_prepare_symlink(struct inode *dir,
                                          const char *target,
                                          unsigned int len,
                                          unsigned int max_len,
                                          struct fscrypt_str *disk_link)
{
        if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;

        /* the on-disk link is the caller's buffer; nothing is copied */
        disk_link->name = (unsigned char *)target;
        disk_link->len = len + 1;

        return disk_link->len > max_len ? -ENAMETOOLONG : 0;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION is not enabled: always fails with
 * -EOPNOTSUPP and leaves @disk_link untouched.
 */
static inline int __fscrypt_encrypt_symlink(struct inode *inode,
                                            const char *target,
                                            unsigned int len,
                                            struct fscrypt_str *disk_link)
{
        return -EOPNOTSUPP;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION is not enabled: never yields a target
 * string, only the error pointer encoding -EOPNOTSUPP.
 */
static inline const char *fscrypt_get_symlink(struct inode *inode,
                                              const void *caddr,
                                              unsigned int max_size,
                                              struct delayed_call *done)
{
        return ERR_PTR(-EOPNOTSUPP);
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION is not enabled: always fails with
 * -EOPNOTSUPP and leaves @stat untouched.
 */
static inline int fscrypt_symlink_getattr(const struct path *path,
                                          struct kstat *stat)
{
        return -EOPNOTSUPP;
}

/*
 * Stub used when CONFIG_FS_ENCRYPTION is not enabled: deliberately a no-op,
 * @s_cop is not recorded anywhere.
 */
static inline void fscrypt_set_ops(struct super_block *sb,
                                   const struct fscrypt_operations *s_cop)
{
        /* nothing to do */
}

#endif        /* !CONFIG_FS_ENCRYPTION */

/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT

/*
 * Out-of-line functions, declared only when
 * CONFIG_FS_ENCRYPTION_INLINE_CRYPT is enabled.  The #else branch that
 * follows supplies static inline fallbacks with the same signatures.
 */
bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode);

void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                               const struct inode *inode, u64 first_lblk,
                               gfp_t gfp_mask);

void fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
                                  const struct buffer_head *first_bh,
                                  gfp_t gfp_mask);

bool fscrypt_mergeable_bio(struct bio *bio, const struct inode *inode,
                           u64 next_lblk);

bool fscrypt_mergeable_bio_bh(struct bio *bio,
                              const struct buffer_head *next_bh);

bool fscrypt_dio_supported(struct inode *inode);

u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks);

#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: no inode
 * ever uses inline crypto.
 */
static inline bool
__fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        return false;
}

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: the bio is
 * left unchanged.
 */
static inline void fscrypt_set_bio_crypt_ctx(struct bio *bio,
                                             const struct inode *inode,
                                             u64 first_lblk, gfp_t gfp_mask)
{
        /* nothing to do */
}

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: the bio is
 * left unchanged.
 */
static inline void
fscrypt_set_bio_crypt_ctx_bh(struct bio *bio,
                             const struct buffer_head *first_bh,
                             gfp_t gfp_mask)
{
        /* nothing to do */
}

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: merging is
 * never vetoed, so the answer is always true.
 */
static inline bool fscrypt_mergeable_bio(struct bio *bio,
                                         const struct inode *inode,
                                         u64 next_lblk)
{
        return true;
}

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: merging is
 * never vetoed, so the answer is always true.
 */
static inline bool
fscrypt_mergeable_bio_bh(struct bio *bio, const struct buffer_head *next_bh)
{
        return true;
}

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: direct I/O
 * is reported as supported exactly when the inode does not need contents
 * encryption.
 */
static inline bool fscrypt_dio_supported(struct inode *inode)
{
        if (fscrypt_needs_contents_encryption(inode))
                return false;
        return true;
}

/*
 * Fallback for builds without CONFIG_FS_ENCRYPTION_INLINE_CRYPT: no limit is
 * applied, @nr_blocks is handed back unchanged.
 */
static inline u64 fscrypt_limit_io_blocks(const struct inode *inode,
                                          u64 lblk, u64 nr_blocks)
{
        return nr_blocks;
}
#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/**
 * fscrypt_inode_uses_inline_crypto() - test whether an inode uses inline
 *                                        encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and if the
 *           encryption should be done in the block layer via blk-crypto rather
 *           than in the filesystem layer.
 */
static inline bool fscrypt_inode_uses_inline_crypto(const struct inode *inode)
{
        /* without contents encryption there is nothing to do inline */
        if (!fscrypt_needs_contents_encryption(inode))
                return false;

        return __fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_inode_uses_fs_layer_crypto() - test whether an inode uses fs-layer
 *                                          encryption
 * @inode: an inode. If encrypted, its key must be set up.
 *
 * Return: true if the inode requires file contents encryption and if the
 *           encryption should be done in the filesystem layer rather than in the
 *           block layer via blk-crypto.
 */
static inline bool fscrypt_inode_uses_fs_layer_crypto(const struct inode *inode)
{
        /* without contents encryption no layer has to encrypt anything */
        if (!fscrypt_needs_contents_encryption(inode))
                return false;

        /* fs-layer crypto is the complement of inline crypto */
        return !__fscrypt_inode_uses_inline_crypto(inode);
}

/**
 * fscrypt_has_encryption_key() - check whether an inode has had its key set up
 * @inode: the inode to check
 *
 * Return: %true if the inode has had its encryption key set up, else %false.
 *
 * Usually this should be preceded by fscrypt_get_encryption_info() to try to
 * set up the key first.
 */
static inline bool fscrypt_has_encryption_key(const struct inode *inode)
{
        /* the key counts as set up once the inode has fscrypt info attached */
        if (fscrypt_get_inode_info(inode))
                return true;
        return false;
}

/**
 * fscrypt_prepare_link() - prepare to link an inode into a possibly-encrypted
 *                            directory
 * @old_dentry: an existing dentry for the inode being linked
 * @dir: the target directory
 * @dentry: negative dentry for the target filename
 *
 * A new link can only be added to an encrypted directory if the directory's
 * encryption key is available --- since otherwise we'd have no way to encrypt
 * the filename.
 *
 * We also verify that the link will not violate the constraint that all files
 * in an encrypted directory tree use the same encryption policy.
 *
 * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
 * -EXDEV if the link would result in an inconsistent encryption policy, or
 * another -errno code.
 */
static inline int fscrypt_prepare_link(struct dentry *old_dentry,
                                       struct inode *dir,
                                       struct dentry *dentry)
{
        /* unencrypted target directory: nothing to check */
        if (!IS_ENCRYPTED(dir))
                return 0;

        return __fscrypt_prepare_link(d_inode(old_dentry), dir, dentry);
}

/**
 * fscrypt_prepare_rename() - prepare for a rename between possibly-encrypted
 *                              directories
 * @old_dir: source directory
 * @old_dentry: dentry for source file
 * @new_dir: target directory
 * @new_dentry: dentry for target location (may be negative unless exchanging)
 * @flags: rename flags (we care at least about %RENAME_EXCHANGE)
 *
 * Prepare for ->rename() where the source and/or target directories may be
 * encrypted.  A new link can only be added to an encrypted directory if the
 * directory's encryption key is available --- since otherwise we'd have no way
 * to encrypt the filename.  A rename to an existing name, on the other hand,
 * *is* cryptographically possible without the key.  However, we take the more
 * conservative approach and just forbid all no-key renames.
 *
 * We also verify that the rename will not violate the constraint that all files
 * in an encrypted directory tree use the same encryption policy.
 *
 * Return: 0 on success, -ENOKEY if an encryption key is missing, -EXDEV if the
 * rename would cause inconsistent encryption policies, or another -errno code.
 */
static inline int fscrypt_prepare_rename(struct inode *old_dir,
                                         struct dentry *old_dentry,
                                         struct inode *new_dir,
                                         struct dentry *new_dentry,
                                         unsigned int flags)
{
        /* neither directory encrypted: nothing to check */
        if (!IS_ENCRYPTED(old_dir) && !IS_ENCRYPTED(new_dir))
                return 0;

        return __fscrypt_prepare_rename(old_dir, old_dentry,
                                        new_dir, new_dentry, flags);
}

/**
 * fscrypt_prepare_lookup() - prepare to lookup a name in a possibly-encrypted
 *                              directory
 * @dir: directory being searched
 * @dentry: filename being looked up
 * @fname: (output) the name to use to search the on-disk directory
 *
 * Prepare for ->lookup() in a directory which may be encrypted by determining
 * the name that will actually be used to search the directory on-disk.  If the
 * directory's encryption policy is supported by this kernel and its encryption
 * key is available, then the lookup is assumed to be by plaintext name;
 * otherwise, it is assumed to be by no-key name.
 *
 * This will set DCACHE_NOKEY_NAME on the dentry if the lookup is by no-key
 * name.  In this case the filesystem must assign the dentry a dentry_operations
 * which contains fscrypt_d_revalidate (or contains a d_revalidate method that
 * calls fscrypt_d_revalidate), so that the dentry will be invalidated if the
 * directory's encryption key is later added.
 *
 * Return: 0 on success; -ENOENT if the directory's key is unavailable but the
 * filename isn't a valid no-key name, so a negative dentry should be created;
 * or another -errno code.
 */
static inline int fscrypt_prepare_lookup(struct inode *dir,
                                         struct dentry *dentry,
                                         struct fscrypt_name *fname)
{
        if (IS_ENCRYPTED(dir))
                return __fscrypt_prepare_lookup(dir, dentry, fname);

        memset(fname, 0, sizeof(*fname));
        fname->usr_fname = &dentry->d_name;
        fname->disk_name.name = (unsigned char *)dentry->d_name.name;
        fname->disk_name.len = dentry->d_name.len;

        fscrypt_prepare_dentry(dentry, false);

        return 0;
}

/**
 * fscrypt_prepare_readdir() - prepare to read a possibly-encrypted directory
 * @dir: the directory inode
 *
 * If the directory is encrypted and it doesn't already have its encryption key
 * set up, try to set it up so that the filenames will be listed in plaintext
 * form rather than in no-key form.
 *
 * Return: 0 on success; -errno on error.  Note that the encryption key being
 *           unavailable is not considered an error.  It is also not an error if
 *           the encryption policy is unsupported by this kernel; that is treated
 *           like the key being unavailable, so that files can still be deleted.
 */
static inline int fscrypt_prepare_readdir(struct inode *dir)
{
        /* unencrypted directory: nothing to set up */
        if (!IS_ENCRYPTED(dir))
                return 0;

        return __fscrypt_prepare_readdir(dir);
}

/**
 * fscrypt_prepare_setattr() - prepare to change a possibly-encrypted inode's
 *                               attributes
 * @dentry: dentry through which the inode is being changed
 * @attr: attributes to change
 *
 * Prepare for ->setattr() on a possibly-encrypted inode.  On an encrypted file,
 * most attribute changes are allowed even without the encryption key.  However,
 * without the encryption key we do have to forbid truncates.  This is needed
 * because the size being truncated to may not be a multiple of the filesystem
 * block size, and in that case we'd have to decrypt the final block, zero the
 * portion past i_size, and re-encrypt it.  (We *could* allow truncating to a
 * filesystem block boundary, but it's simpler to just forbid all truncates ---
 * and we already forbid all other contents modifications without the key.)
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_prepare_setattr(struct dentry *dentry,
                                          struct iattr *attr)
{
        /* unencrypted inode: every attribute change is allowed here */
        if (!IS_ENCRYPTED(d_inode(dentry)))
                return 0;

        return __fscrypt_prepare_setattr(dentry, attr);
}

/**
 * fscrypt_encrypt_symlink() - encrypt the symlink target if needed
 * @inode: symlink inode
 * @target: plaintext symlink target
 * @len: length of @target excluding null terminator
 * @disk_link: (in/out) the on-disk symlink target being prepared
 *
 * If the symlink target needs to be encrypted, then this function encrypts it
 * into @disk_link->name.  fscrypt_prepare_symlink() must have been called
 * previously to compute @disk_link->len.  If the filesystem did not allocate a
 * buffer for @disk_link->name after calling fscrypt_prepare_symlink(), then
 * one will be kmalloc()'ed and the filesystem will be responsible for freeing
 * it.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fscrypt_encrypt_symlink(struct inode *inode,
                                          const char *target,
                                          unsigned int len,
                                          struct fscrypt_str *disk_link)
{
        /* unencrypted symlink: @disk_link is left exactly as it is */
        if (!IS_ENCRYPTED(inode))
                return 0;

        return __fscrypt_encrypt_symlink(inode, target, len, disk_link);
}

/*
 * If *pagep refers to a bounce page, replace it with the corresponding
 * pagecache page and free the bounce page.  Any other page is left alone.
 */
static inline void fscrypt_finalize_bounce_page(struct page **pagep)
{
        struct page *bounce = *pagep;

        if (!fscrypt_is_bounce_page(bounce))
                return;

        /* look up the pagecache page before the bounce page goes away */
        *pagep = fscrypt_pagecache_page(bounce);
        fscrypt_free_bounce_page(bounce);
}

#endif        /* _LINUX_FSCRYPT_H */









































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 



















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  Generic netlink support functions to configure an SMC-R PNET table
 *
 *  Copyright IBM Corp. 2016
 *
 *  Author(s):  Thomas Richter <tmricht@linux.vnet.ibm.com>
 */

#include <linux/module.h>
#include <linux/list.h>
#include <linux/ctype.h>
#include <linux/mutex.h>
#include <net/netlink.h>
#include <net/genetlink.h>

#include <uapi/linux/if.h>
#include <uapi/linux/smc.h>

#include <rdma/ib_verbs.h>

#include <net/netns/generic.h>
#include "smc_netns.h"

#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_ism.h"
#include "smc_core.h"

/* Forward declarations of static helpers that are used before the point in
 * this file where they are defined.
 */
static struct net_device *__pnet_find_base_ndev(struct net_device *ndev);
static struct net_device *pnet_find_base_ndev(struct net_device *ndev);

/* Netlink attribute policy, indexed by SMC_PNETID_* attribute: the three name
 * attributes are NUL-terminated strings with a maximum length, the ib port is
 * a u8.
 */
static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = {
        [SMC_PNETID_NAME]    = { .type = NLA_NUL_STRING,
                                 .len = SMC_MAX_PNETID_LEN },
        [SMC_PNETID_ETHNAME] = { .type = NLA_NUL_STRING,
                                 .len = IFNAMSIZ - 1 },
        [SMC_PNETID_IBNAME]  = { .type = NLA_NUL_STRING,
                                 .len = IB_DEVICE_NAME_MAX - 1 },
        [SMC_PNETID_IBPORT]  = { .type = NLA_U8 },
};

/* Declared early so code below can refer to the family.
 * NOTE(review): presumably initialized further down in this file - confirm.
 */
static struct genl_family smc_pnet_nl_family;

/* Discriminator telling which kind of device name a pnet table entry holds */
enum smc_pnet_nametype {
        SMC_PNET_ETH = 1,        /* entry carries eth_name and ndev */
        SMC_PNET_IB = 2,        /* presumably: entry carries ib_name and ib_port */
};

/* pnet entry stored in pnet table
 *
 * One entry binds a pnetid to a device name. @type selects which arm of the
 * anonymous union is meaningful, so it has to be checked before any of the
 * union members is read.
 */
struct smc_pnetentry {
        struct list_head list;                /* linkage in pnettable->pnetlist */
        char pnet_name[SMC_MAX_PNETID_LEN + 1];        /* pnetid of this entry */
        enum smc_pnet_nametype type;        /* which union arm is in use */
        union {
                struct {        /* used when type == SMC_PNET_ETH */
                        char eth_name[IFNAMSIZ + 1];
                        /* NULL while no matching net device is attached */
                        struct net_device *ndev;
                        /* tracker passed to netdev_hold()/netdev_put() */
                        netdevice_tracker dev_tracker;
                };
                struct {        /* presumably used when type == SMC_PNET_IB */
                        char ib_name[IB_DEVICE_NAME_MAX + 1];
                        u8 ib_port;
                };
        };
};

/* Tell whether a pnetid holds a value. Only the first byte is inspected: a
 * pnetid starting with a zero byte or with the value _S counts as not set.
 */
bool smc_pnet_is_pnetid_set(u8 *pnetid)
{
        u8 first = pnetid[0];

        return first != 0 && first != _S;
}

/* Compare two pnetids byte by byte over at most SMC_MAX_PNETID_LEN bytes.
 * A zero byte or a byte with the value _S ends a pnetid; the ids match if
 * both end at the same position without an earlier difference, or if all
 * SMC_MAX_PNETID_LEN bytes are equal.
 */
static bool smc_pnet_match(u8 *pnetid1, u8 *pnetid2)
{
        int pos = 0;

        while (pos < SMC_MAX_PNETID_LEN) {
                u8 a = pnetid1[pos];
                u8 b = pnetid2[pos];
                bool a_ends = a == 0 || a == _S;
                bool b_ends = b == 0 || b == _S;

                if (a_ends && b_ends)
                        return true;
                if (a != b)
                        return false;
                pos++;
        }
        return true;
}

/* Remove a pnetid from the pnet table.
 *
 * Every entry of the namespace's pnet table that matches @pnet_name is
 * unlinked and freed; a NULL @pnet_name matches all entries. For the initial
 * namespace only, user defined pnetids matching @pnet_name are additionally
 * cleared from the ib devices and the smcd devices.
 *
 * Returns 0 if at least one pnetid was removed or cleared, -ENOENT otherwise.
 */
static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnettable *pnettable = &sn->pnettable;
        struct smc_pnetentry *entry, *next;
        struct smc_ib_device *ibdev;
        struct smcd_dev *smcd;
        int rc = -ENOENT;
        int port;

        /* remove matching entries from the namespace's table */
        mutex_lock(&pnettable->lock);
        list_for_each_entry_safe(entry, next, &pnettable->pnetlist, list) {
                if (pnet_name && !smc_pnet_match(entry->pnet_name, pnet_name))
                        continue;

                list_del(&entry->list);
                /* an ETH entry with an attached device holds a reference */
                if (entry->type == SMC_PNET_ETH && entry->ndev) {
                        netdev_put(entry->ndev, &entry->dev_tracker);
                        pr_warn_ratelimited("smc: net device %s "
                                            "erased user defined "
                                            "pnetid %.16s\n",
                                            entry->eth_name,
                                            entry->pnet_name);
                }
                kfree(entry);
                rc = 0;
        }
        mutex_unlock(&pnettable->lock);

        /* devices are only handled for the initial namespace */
        if (net != &init_net)
                return rc;

        /* clear user defined pnetids of ib device ports */
        mutex_lock(&smc_ib_devices.mutex);
        list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
                for (port = 0; port < SMC_MAX_PORTS; port++) {
                        if (!ibdev->pnetid_by_user[port])
                                continue;
                        if (pnet_name &&
                            !smc_pnet_match(pnet_name, ibdev->pnetid[port]))
                                continue;

                        pr_warn_ratelimited("smc: ib device %s ibport "
                                            "%d erased user defined "
                                            "pnetid %.16s\n",
                                            ibdev->ibdev->name,
                                            port + 1,
                                            ibdev->pnetid[port]);
                        memset(ibdev->pnetid[port], 0, SMC_MAX_PNETID_LEN);
                        ibdev->pnetid_by_user[port] = false;
                        rc = 0;
                }
        }
        mutex_unlock(&smc_ib_devices.mutex);

        /* clear user defined pnetids of smcd devices */
        mutex_lock(&smcd_dev_list.mutex);
        list_for_each_entry(smcd, &smcd_dev_list.list, list) {
                if (!smcd->pnetid_by_user)
                        continue;
                if (pnet_name && !smc_pnet_match(pnet_name, smcd->pnetid))
                        continue;

                pr_warn_ratelimited("smc: smcd device %s "
                                    "erased user defined pnetid "
                                    "%.16s\n",
                                    dev_name(smcd->ops->get_dev(smcd)),
                                    smcd->pnetid);
                memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN);
                smcd->pnetid_by_user = false;
                rc = 0;
        }
        mutex_unlock(&smcd_dev_list.mutex);
        return rc;
}

/* Add the reference to a given network device to the pnet table.
 *
 * The first ETH entry in the device's namespace that has no device attached
 * yet and whose eth_name equals the device name gets @ndev attached, with a
 * tracked reference taken on the device.
 *
 * Returns 0 if an entry was updated, -ENOENT if no such entry exists.
 */
static int smc_pnet_add_by_ndev(struct net_device *ndev)
{
        struct smc_net *sn = net_generic(dev_net(ndev), smc_net_id);
        struct smc_pnettable *pnettable = &sn->pnettable;
        struct smc_pnetentry *entry, *next;
        int rc = -ENOENT;

        mutex_lock(&pnettable->lock);
        list_for_each_entry_safe(entry, next, &pnettable->pnetlist, list) {
                /* only ETH entries still waiting for their device qualify */
                if (entry->type != SMC_PNET_ETH || entry->ndev)
                        continue;
                if (strncmp(entry->eth_name, ndev->name, IFNAMSIZ))
                        continue;

                netdev_hold(ndev, &entry->dev_tracker, GFP_ATOMIC);
                entry->ndev = ndev;
                rc = 0;
                pr_warn_ratelimited("smc: adding net device %s with "
                                    "user defined pnetid %.16s\n",
                                    entry->eth_name,
                                    entry->pnet_name);
                break;
        }
        mutex_unlock(&pnettable->lock);
        return rc;
}

/* Remove the reference to a given network device from the pnet table.
 *
 * The first ETH entry in the device's namespace that has @ndev attached drops
 * its tracked reference and is marked as having no device. The entry itself
 * stays in the table.
 *
 * Returns 0 if an entry was updated, -ENOENT if no entry refers to @ndev.
 */
static int smc_pnet_remove_by_ndev(struct net_device *ndev)
{
        struct smc_net *sn = net_generic(dev_net(ndev), smc_net_id);
        struct smc_pnettable *pnettable = &sn->pnettable;
        struct smc_pnetentry *entry, *next;
        int rc = -ENOENT;

        mutex_lock(&pnettable->lock);
        list_for_each_entry_safe(entry, next, &pnettable->pnetlist, list) {
                if (entry->type != SMC_PNET_ETH || entry->ndev != ndev)
                        continue;

                netdev_put(entry->ndev, &entry->dev_tracker);
                entry->ndev = NULL;
                rc = 0;
                pr_warn_ratelimited("smc: removing net device %s with "
                                    "user defined pnetid %.16s\n",
                                    entry->eth_name,
                                    entry->pnet_name);
                break;
        }
        mutex_unlock(&pnettable->lock);
        return rc;
}

/* Apply pnetid to ib device when no pnetid is set.
 *
 * @ib_port is 1-based and is not range-checked here. If the port has no
 * pnetid yet, SMC_MAX_PNETID_LEN bytes of @pnet_name are copied in and the
 * pnetid is flagged as user defined.
 *
 * Returns true if the pnetid was applied, false if one was already set.
 */
static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port,
                              char *pnet_name)
{
        int idx = ib_port - 1;
        bool applied;

        mutex_lock(&smc_ib_devices.mutex);
        applied = !smc_pnet_is_pnetid_set(ib_dev->pnetid[idx]);
        if (applied) {
                memcpy(ib_dev->pnetid[idx], pnet_name, SMC_MAX_PNETID_LEN);
                ib_dev->pnetid_by_user[idx] = true;
        }
        mutex_unlock(&smc_ib_devices.mutex);
        return applied;
}

/* Apply pnetid to smcd device when no pnetid is set.
 *
 * If the device has no pnetid yet, SMC_MAX_PNETID_LEN bytes of @pnet_name are
 * copied in and the pnetid is flagged as user defined.
 *
 * Returns true if the pnetid was applied, false if one was already set.
 */
static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name)
{
        bool applied;

        mutex_lock(&smcd_dev_list.mutex);
        applied = !smc_pnet_is_pnetid_set(smcd_dev->pnetid);
        if (applied) {
                memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN);
                smcd_dev->pnetid_by_user = true;
        }
        mutex_unlock(&smcd_dev_list.mutex);
        return applied;
}

/* The limit for pnetid is 16 characters.
 * Valid characters should be (single-byte character set) a-z, A-Z, 0-9.
 * Lower case letters are converted to upper case.
 * Interior blanks should not be used.
 *
 * Leading and trailing white space of @pnet_name is ignored. On success the
 * converted, NUL-terminated pnetid is stored in @pnetid and true is returned.
 * On failure false is returned; @pnetid may then have been written partially
 * and is not terminated.
 */
static bool smc_pnetid_valid(const char *pnet_name, char *pnetid)
{
        char *start = skip_spaces(pnet_name);
        size_t len = strlen(start);
        char *last;
        char *cur;

        if (!len)
                return false;

        /* step back over trailing white space to the last significant char */
        last = start + len - 1;
        while (last >= start && isspace(*last))
                last--;

        if (last - start >= SMC_MAX_PNETID_LEN)
                return false;

        for (cur = start; cur <= last; cur++) {
                char c = *cur;

                if (!isalnum(c))
                        return false;
                if (islower(c))
                        c = toupper(c);
                *pnetid++ = c;
        }
        *pnetid = '\0';
        return true;
}

/* Find an infiniband device by a given name. The device might not exist.
 *
 * @ib_name is compared against the ib device name and, if the device has a
 * parent, against the parent's device name. The first match in
 * smc_ib_devices.list is returned, NULL if there is none. The list mutex is
 * dropped again before returning and no reference is taken here.
 */
static struct smc_ib_device *smc_pnet_find_ib(char *ib_name)
{
        struct smc_ib_device *found = NULL;
        struct smc_ib_device *cand;

        mutex_lock(&smc_ib_devices.mutex);
        list_for_each_entry(cand, &smc_ib_devices.list, list) {
                /* match on the ib device name itself ... */
                if (!strncmp(cand->ibdev->name, ib_name,
                             sizeof(cand->ibdev->name))) {
                        found = cand;
                        break;
                }
                /* ... or on the name of its parent device, if any */
                if (cand->ibdev->dev.parent &&
                    !strncmp(dev_name(cand->ibdev->dev.parent), ib_name,
                             IB_DEVICE_NAME_MAX - 1)) {
                        found = cand;
                        break;
                }
        }
        mutex_unlock(&smc_ib_devices.mutex);
        return found;
}

/* Find an smcd device by a given name. The device might not exist.
 *
 * @smcd_name is compared against the name of the device returned by the
 * smcd's get_dev() operation. The first match in smcd_dev_list.list is
 * returned, NULL if there is none. The list mutex is dropped again before
 * returning and no reference is taken here.
 */
static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
{
        struct smcd_dev *found = NULL;
        struct smcd_dev *cand;

        mutex_lock(&smcd_dev_list.mutex);
        list_for_each_entry(cand, &smcd_dev_list.list, list) {
                if (strncmp(dev_name(cand->ops->get_dev(cand)),
                            smcd_name, IB_DEVICE_NAME_MAX - 1))
                        continue;
                found = cand;
                break;
        }
        mutex_unlock(&smcd_dev_list.mutex);
        return found;
}

static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net,
                            char *eth_name, char *pnet_name)
{
        struct smc_pnetentry *tmp_pe, *new_pe;
        struct net_device *ndev, *base_ndev;
        u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
        bool new_netdev;
        int rc;

        /* check if (base) netdev already has a pnetid. If there is one, we do
         * not want to add a pnet table entry
         */
        rc = -EEXIST;
        ndev = dev_get_by_name(net, eth_name);        /* dev_hold() */
        if (ndev) {
                base_ndev = pnet_find_base_ndev(ndev);
                if (!smc_pnetid_by_dev_port(base_ndev->dev.parent,
                                            base_ndev->dev_port, ndev_pnetid))
                        goto out_put;
        }

        /* add a new netdev entry to the pnet table if there isn't one */
        rc = -ENOMEM;
        new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL);
        if (!new_pe)
                goto out_put;
        new_pe->type = SMC_PNET_ETH;
        memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
        strncpy(new_pe->eth_name, eth_name, IFNAMSIZ);
        rc = -EEXIST;
        new_netdev = true;
        mutex_lock(&pnettable->lock);
        list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
                if (tmp_pe->type == SMC_PNET_ETH &&
                    !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) {
                        new_netdev = false;
                        break;
                }
        }
        if (new_netdev) {
                if (ndev) {
                        new_pe->ndev = ndev;
                        netdev_tracker_alloc(ndev, &new_pe->dev_tracker,
                                             GFP_ATOMIC);
                }
                list_add_tail(&new_pe->list, &pnettable->pnetlist);
                mutex_unlock(&pnettable->lock);
        } else {
                mutex_unlock(&pnettable->lock);
                kfree(new_pe);
                goto out_put;
        }
        if (ndev)
                pr_warn_ratelimited("smc: net device %s "
                                    "applied user defined pnetid %.16s\n",
                                    new_pe->eth_name, new_pe->pnet_name);
        return 0;

out_put:
        dev_put(ndev);
        return rc;
}

/* Apply the pnetid to matching active ib/smcd devices and add an ib entry
 * for ib_name to the pnet table.
 * Returns 0 on success, -EEXIST if applying failed or the table already
 * holds an entry for ib_name, -ENOMEM on allocation failure.
 */
static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
                           u8 ib_port, char *pnet_name)
{
        struct smc_pnetentry *entry, *cur;
        struct smc_ib_device *ib_dev;
        struct smcd_dev *smcd;
        bool duplicate = false;
        bool smcd_ok = true;
        bool ib_ok = true;

        /* try to apply the pnetid to active devices */
        ib_dev = smc_pnet_find_ib(ib_name);
        if (ib_dev) {
                ib_ok = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name);
                if (ib_ok)
                        pr_warn_ratelimited("smc: ib device %s ibport %d "
                                            "applied user defined pnetid "
                                            "%.16s\n", ib_dev->ibdev->name,
                                            ib_port,
                                            ib_dev->pnetid[ib_port - 1]);
        }
        smcd = smc_pnet_find_smcd(ib_name);
        if (smcd) {
                smcd_ok = smc_pnet_apply_smcd(smcd, pnet_name);
                if (smcd_ok) {
                        struct device *dev = smcd->ops->get_dev(smcd);

                        pr_warn_ratelimited("smc: smcd device %s "
                                            "applied user defined pnetid "
                                            "%.16s\n", dev_name(dev),
                                            smcd->pnetid);
                }
        }
        /* Apply fails when a device has a hardware-defined pnetid set, do not
         * add a pnet table entry in that case.
         */
        if (!(ib_ok && smcd_ok))
                return -EEXIST;

        /* prepare the new ib entry before taking the table lock */
        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;
        entry->type = SMC_PNET_IB;
        memcpy(entry->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
        strncpy(entry->ib_name, ib_name, IB_DEVICE_NAME_MAX);
        entry->ib_port = ib_port;

        mutex_lock(&pnettable->lock);
        list_for_each_entry(cur, &pnettable->pnetlist, list) {
                if (cur->type != SMC_PNET_IB)
                        continue;
                if (strncmp(cur->ib_name, ib_name, IB_DEVICE_NAME_MAX))
                        continue;
                duplicate = true;
                break;
        }
        if (!duplicate)
                list_add_tail(&entry->list, &pnettable->pnetlist);
        mutex_unlock(&pnettable->lock);

        if (duplicate) {
                /* an entry for this ib name exists, drop ours */
                kfree(entry);
                return -EEXIST;
        }
        return 0;
}

/* Append a pnetid to the end of the pnet table if not already on this list.
 */
static int smc_pnet_enter(struct net *net, struct nlattr *tb[])
{
        char pnet_name[SMC_MAX_PNETID_LEN + 1];
        struct smc_pnettable *pnettable;
        bool added_eth = false;
        bool added_ib = false;
        struct smc_net *sn;
        char *ib_name;
        u8 ibport = 1;
        int rc;

        /* get pnettable for namespace */
        sn = net_generic(net, smc_net_id);
        pnettable = &sn->pnettable;

        /* a valid pnetid is mandatory */
        if (!tb[SMC_PNETID_NAME])
                return -EINVAL;
        if (!smc_pnetid_valid((char *)nla_data(tb[SMC_PNETID_NAME]),
                              pnet_name))
                return -EINVAL;

        if (tb[SMC_PNETID_ETHNAME]) {
                rc = smc_pnet_add_eth(pnettable, net,
                                      (char *)nla_data(tb[SMC_PNETID_ETHNAME]),
                                      pnet_name);
                /* -EEXIST is not fatal here, anything else is */
                if (rc && rc != -EEXIST)
                        return rc;
                added_eth = !rc;
        }

        /* if this is not the initial namespace, stop here */
        if (net != &init_net)
                return added_eth ? 0 : -EEXIST;

        if (tb[SMC_PNETID_IBNAME]) {
                ib_name = strim((char *)nla_data(tb[SMC_PNETID_IBNAME]));
                if (tb[SMC_PNETID_IBPORT]) {
                        ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]);
                        if (ibport < 1 || ibport > SMC_MAX_PORTS)
                                return -EINVAL;
                }
                rc = smc_pnet_add_ib(pnettable, ib_name, ibport, pnet_name);
                if (rc && rc != -EEXIST)
                        return rc;
                added_ib = !rc;
        }

        /* succeed if at least one entry was newly added */
        if (added_eth || added_ib)
                return 0;
        return -EEXIST;
}

/* Convert an smc_pnetentry to a netlink attribute sequence */
static int smc_pnet_set_nla(struct sk_buff *msg,
                            struct smc_pnetentry *pnetelem)
{
        if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name))
                return -1;
        if (pnetelem->type == SMC_PNET_ETH) {
                if (nla_put_string(msg, SMC_PNETID_ETHNAME,
                                   pnetelem->eth_name))
                        return -1;
        } else {
                if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a"))
                        return -1;
        }
        if (pnetelem->type == SMC_PNET_IB) {
                if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) ||
                    nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port))
                        return -1;
        } else {
                if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") ||
                    nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff))
                        return -1;
        }

        return 0;
}

/* SMC_PNETID_ADD handler: enter the request attributes into the pnet table
 * of the requesting namespace.
 */
static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info)
{
        return smc_pnet_enter(genl_info_net(info), info->attrs);
}

/* SMC_PNETID_DEL handler: remove the pnet table entries of the pnetid named
 * in the request; the pnetid attribute is mandatory.
 */
static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *name_attr = info->attrs[SMC_PNETID_NAME];
        struct net *net = genl_info_net(info);

        if (name_attr)
                return smc_pnet_remove_by_pnetid(net,
                                                 (char *)nla_data(name_attr));
        return -EINVAL;
}

/* Start of a dump: reset the position that smc_pnet_dump() keeps in
 * cb->args[0].
 */
static int smc_pnet_dump_start(struct netlink_callback *cb)
{
        cb->args[0] = 0;

        return 0;
}

/* Put one pnet table entry as SMC_PNETID_GET message into skb.
 * Returns 0 on success, -ENOMEM if no message header could be added and
 * -EMSGSIZE if the attributes could not be added.
 */
static int smc_pnet_dumpinfo(struct sk_buff *skb,
                             u32 portid, u32 seq, u32 flags,
                             struct smc_pnetentry *pnetelem)
{
        void *hdr = genlmsg_put(skb, portid, seq, &smc_pnet_nl_family,
                                flags, SMC_PNETID_GET);

        if (!hdr)
                return -ENOMEM;
        if (smc_pnet_set_nla(skb, pnetelem) >= 0) {
                genlmsg_end(skb, hdr);
                return 0;
        }
        /* attributes did not go in, take the started message back */
        genlmsg_cancel(skb, hdr);
        return -EMSGSIZE;
}

/* Dump pnet table entries of the namespace into skb, optionally restricted
 * to one pnetid, skipping the first start_idx candidates.
 * Returns the index at which a following dump has to continue.
 */
static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid,
                          u32 seq, u8 *pnetid, int start_idx)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnettable *pnettable = &sn->pnettable;
        struct smc_pnetentry *entry;
        int idx = 0;
        int pos;

        mutex_lock(&pnettable->lock);
        list_for_each_entry(entry, &pnettable->pnetlist, list) {
                /* with a pnetid filter, only matching entries are counted */
                if (pnetid && !smc_pnet_match(entry->pnet_name, pnetid))
                        continue;
                pos = idx;
                idx++;
                if (pos < start_idx)
                        continue;
                /* if this is not the initial namespace, dump only netdev */
                if (net != &init_net && entry->type != SMC_PNET_ETH)
                        continue;
                if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, entry)) {
                        /* entry was not dumped, do not count it */
                        idx--;
                        break;
                }
        }
        mutex_unlock(&pnettable->lock);
        return idx;
}

/* Dump callback: continue at the position kept in cb->args[0] and store the
 * new position there for the next invocation.
 */
static int smc_pnet_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        int next_idx;

        next_idx = _smc_pnet_dump(sock_net(skb->sk), skb,
                                  NETLINK_CB(cb->skb).portid,
                                  cb->nlh->nlmsg_seq, NULL, cb->args[0]);
        cb->args[0] = next_idx;

        return skb->len;
}

/* Retrieve one PNETID entry */
static int smc_pnet_get(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *name_attr = info->attrs[SMC_PNETID_NAME];
        struct net *net = genl_info_net(info);
        struct sk_buff *reply;

        /* the pnetid to look up is mandatory */
        if (!name_attr)
                return -EINVAL;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        /* collect all entries of the requested pnetid */
        _smc_pnet_dump(net, reply, info->snd_portid, info->snd_seq,
                       nla_data(name_attr), 0);

        /* finish multi part message and send it */
        if (nlmsg_put(reply, info->snd_portid, info->snd_seq, NLMSG_DONE, 0,
                      NLM_F_MULTI))
                return genlmsg_reply(reply, info);

        /* no room left for the terminating message */
        nlmsg_free(reply);
        return -EMSGSIZE;
}

/* Remove and delete all pnetids from pnet table.
 */
static int smc_pnet_flush(struct sk_buff *skb, struct genl_info *info)
{
        /* a NULL pnetid is passed instead of a specific one; the result of
         * the removal is not reported to the requester
         */
        smc_pnet_remove_by_pnetid(genl_info_net(info), NULL);

        return 0;
}

/* SMC_PNETID generic netlink operation definition */
static const struct genl_ops smc_pnet_ops[] = {
        {
                /* can be retrieved by unprivileged users (no .flags set) */
                .cmd = SMC_PNETID_GET,
                .doit = smc_pnet_get,
                .start = smc_pnet_dump_start,
                .dumpit = smc_pnet_dump,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
        },
        {
                /* modifies the pnet table: GENL_ADMIN_PERM is set */
                .cmd = SMC_PNETID_ADD,
                .doit = smc_pnet_add,
                .flags = GENL_ADMIN_PERM,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
        },
        {
                /* modifies the pnet table: GENL_ADMIN_PERM is set */
                .cmd = SMC_PNETID_DEL,
                .doit = smc_pnet_del,
                .flags = GENL_ADMIN_PERM,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
        },
        {
                /* modifies the pnet table: GENL_ADMIN_PERM is set */
                .cmd = SMC_PNETID_FLUSH,
                .doit = smc_pnet_flush,
                .flags = GENL_ADMIN_PERM,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
        },
};

/* SMC_PNETID family definition */
static struct genl_family smc_pnet_nl_family __ro_after_init = {
        /* identification */
        .name = SMCR_GENL_FAMILY_NAME,
        .version = SMCR_GENL_FAMILY_VERSION,
        .module = THIS_MODULE,
        /* message layout and attribute validation */
        .hdrsize = 0,
        .maxattr = SMC_PNETID_MAX,
        .policy = smc_pnet_policy,
        .netnsok = true,
        /* operations */
        .ops = smc_pnet_ops,
        .n_ops = ARRAY_SIZE(smc_pnet_ops),
        .resv_start_op = SMC_PNETID_FLUSH + 1,
};

/* Check whether pnetid is on the namespace's list of netdevice pnetids */
bool smc_pnet_is_ndev_pnetid(struct net *net, u8 *pnetid)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnetids_ndev_entry *cur;
        bool found = false;

        read_lock(&sn->pnetids_ndev.lock);
        list_for_each_entry(cur, &sn->pnetids_ndev.list, list) {
                if (!smc_pnet_match(pnetid, cur->pnetid))
                        continue;
                found = true;
                break;
        }
        read_unlock(&sn->pnetids_ndev.lock);

        return found;
}

/* Take a reference on pnetid in the namespace's list of netdevice pnetids;
 * a list entry is created if the pnetid is not on the list yet.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int smc_pnet_add_pnetid(struct net *net, u8 *pnetid)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnetids_ndev_entry *fresh, *cur;

        /* allocate up front, before the lock is taken */
        fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        write_lock(&sn->pnetids_ndev.lock);
        list_for_each_entry(cur, &sn->pnetids_ndev.list, list) {
                if (!smc_pnet_match(pnetid, cur->pnetid))
                        continue;
                /* already listed: count the user, new entry not needed */
                refcount_inc(&cur->refcnt);
                kfree(fresh);
                fresh = NULL;
                break;
        }
        if (fresh) {
                refcount_set(&fresh->refcnt, 1);
                memcpy(fresh->pnetid, pnetid, SMC_MAX_PNETID_LEN);
                list_add_tail(&fresh->list, &sn->pnetids_ndev.list);
        }
        write_unlock(&sn->pnetids_ndev.lock);

        return 0;
}

/* Drop one reference on pnetid in the namespace's list of netdevice pnetids;
 * the list entry is freed when the last reference is gone.
 */
static void smc_pnet_remove_pnetid(struct net *net, u8 *pnetid)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnetids_ndev_entry *cur, *next;

        write_lock(&sn->pnetids_ndev.lock);
        list_for_each_entry_safe(cur, next, &sn->pnetids_ndev.list, list) {
                if (!smc_pnet_match(pnetid, cur->pnetid))
                        continue;
                if (refcount_dec_and_test(&cur->refcnt)) {
                        list_del(&cur->list);
                        kfree(cur);
                }
                /* only the first matching entry is handled */
                break;
        }
        write_unlock(&sn->pnetids_ndev.lock);
}

/* Add the pnetid of dev's base device to the namespace's PNETIDs list if
 * the base device is up and a pnetid can be determined for it.
 * ndev_pnetid is a caller-provided buffer receiving that pnetid.
 */
static void smc_pnet_add_base_pnetid(struct net *net, struct net_device *dev,
                                     u8 *ndev_pnetid)
{
        struct net_device *base_dev = __pnet_find_base_ndev(dev);

        if (!(base_dev->flags & IFF_UP))
                return;
        if (smc_pnetid_by_dev_port(base_dev->dev.parent, base_dev->dev_port,
                                   ndev_pnetid))
                return;

        /* add to PNETIDs list */
        smc_pnet_add_pnetid(net, ndev_pnetid);
}

/* create initial list of netdevice pnetids */
static void smc_pnet_create_pnetids_list(struct net *net)
{
        u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
        struct net_device *dev;

        /* Newly created netns do not have devices.
         * Do not even acquire rtnl.
         */
        if (!list_empty(&net->dev_base_head)) {
                /* Note: This might not be needed, because
                 * smc_pnet_netdev_event() is also calling
                 * smc_pnet_add_base_pnetid() when handling
                 * NETDEV_UP event.
                 */
                rtnl_lock();
                for_each_netdev(net, dev)
                        smc_pnet_add_base_pnetid(net, dev, ndev_pnetid);
                rtnl_unlock();
        }
}

/* clean up list of netdevice pnetids */
static void smc_pnet_destroy_pnetids_list(struct net *net)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnetids_ndev_entry *cur, *next;

        /* entries are freed regardless of their reference count */
        write_lock(&sn->pnetids_ndev.lock);
        list_for_each_entry_safe(cur, next, &sn->pnetids_ndev.list, list) {
                list_del(&cur->list);
                kfree(cur);
        }
        write_unlock(&sn->pnetids_ndev.lock);
}

/* Netdevice notifier callback: keeps the pnet table entries and the list of
 * netdevice pnetids in step with netdevice events.
 * Returns NOTIFY_OK for handled events and NOTIFY_DONE for all others.
 */
static int smc_pnet_netdev_event(struct notifier_block *this,
                                 unsigned long event, void *ptr)
{
        struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(event_dev);
        u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
        struct net_device *base_dev;
        int verdict = NOTIFY_OK;

        switch (event) {
        case NETDEV_REBOOT:
        case NETDEV_UNREGISTER:
                smc_pnet_remove_by_ndev(event_dev);
                smc_ib_ndev_change(event_dev, event);
                break;
        case NETDEV_REGISTER:
                smc_pnet_add_by_ndev(event_dev);
                smc_ib_ndev_change(event_dev, event);
                break;
        case NETDEV_UP:
                smc_pnet_add_base_pnetid(net, event_dev, ndev_pnetid);
                break;
        case NETDEV_DOWN:
                base_dev = __pnet_find_base_ndev(event_dev);
                /* remove from PNETIDs list if the base device has a pnetid */
                if (!smc_pnetid_by_dev_port(base_dev->dev.parent,
                                            base_dev->dev_port, ndev_pnetid))
                        smc_pnet_remove_pnetid(net, ndev_pnetid);
                break;
        default:
                verdict = NOTIFY_DONE;
                break;
        }
        return verdict;
}

/* notifier block registered by smc_pnet_init(), removed by smc_pnet_exit() */
static struct notifier_block smc_netdev_notifier = {
        .notifier_call = smc_pnet_netdev_event,
};

/* init network namespace */
int smc_pnet_net_init(struct net *net)
{
        struct smc_net *sn = net_generic(net, smc_net_id);
        struct smc_pnetids_ndev *pnetids_ndev = &sn->pnetids_ndev;
        struct smc_pnettable *pnettable = &sn->pnettable;

        /* pnet table: empty list, protected by a mutex */
        INIT_LIST_HEAD(&pnettable->pnetlist);
        mutex_init(&pnettable->lock);

        /* netdevice pnetids: empty list, protected by a rwlock */
        INIT_LIST_HEAD(&pnetids_ndev->list);
        rwlock_init(&pnetids_ndev->lock);

        /* pick up the pnetids of netdevices that exist already */
        smc_pnet_create_pnetids_list(net);

        return 0;
}

/* Register the generic netlink family and the netdevice notifier.
 * Returns 0 on success, otherwise the error of the failing registration.
 */
int __init smc_pnet_init(void)
{
        int rc = genl_register_family(&smc_pnet_nl_family);

        if (!rc) {
                rc = register_netdevice_notifier(&smc_netdev_notifier);
                /* do not leave the family registered without notifier */
                if (rc)
                        genl_unregister_family(&smc_pnet_nl_family);
        }
        return rc;
}

/* exit network namespace */
void smc_pnet_net_exit(struct net *net)
{
        /* flush pnet table */
        smc_pnet_remove_by_pnetid(net, NULL);

        /* free the list of netdevice pnetids */
        smc_pnet_destroy_pnetids_list(net);
}

/* Undo smc_pnet_init(): unregister notifier and netlink family in reverse
 * order of their registration.
 */
void smc_pnet_exit(void)
{
        unregister_netdevice_notifier(&smc_netdev_notifier);

        genl_unregister_family(&smc_pnet_nl_family);
}

/* Walk down the lower devices of ndev, always following the first lower
 * device, for at most ndev->lower_level steps. Caller must hold rtnl.
 */
static struct net_device *__pnet_find_base_ndev(struct net_device *ndev)
{
        int remaining;

        ASSERT_RTNL();
        /* the step limit is taken from the device the walk starts at */
        for (remaining = ndev->lower_level; remaining > 0; remaining--) {
                struct list_head *iter;

                if (list_empty(&ndev->adj_list.lower))
                        break;
                iter = ndev->adj_list.lower.next;
                ndev = netdev_lower_get_next(ndev, &iter);
        }
        return ndev;
}

/* Determine one base device for stacked net devices.
 * If the lower device level contains more than one device
 * (for instance with bonding slaves), just the first device
 * is used to reach a base device.
 */
static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
{
        struct net_device *base_ndev;

        /* the lookup itself requires rtnl to be held */
        rtnl_lock();
        base_ndev = __pnet_find_base_ndev(ndev);
        rtnl_unlock();

        return base_ndev;
}

/* Look up the pnetid of ndev in the pnet table of ndev's namespace.
 * Returns 0 and copies the pnetid if an ethernet entry refers to ndev,
 * -ENOENT otherwise.
 */
static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
                                              u8 *pnetid)
{
        struct smc_net *sn = net_generic(dev_net(ndev), smc_net_id);
        struct smc_pnettable *pnettable = &sn->pnettable;
        struct smc_pnetentry *entry;
        int rc = -ENOENT;

        mutex_lock(&pnettable->lock);
        list_for_each_entry(entry, &pnettable->pnetlist, list) {
                if (entry->type != SMC_PNET_ETH || entry->ndev != ndev)
                        continue;
                /* get pnetid of netdev device */
                memcpy(pnetid, entry->pnet_name, SMC_MAX_PNETID_LEN);
                rc = 0;
                break;
        }
        mutex_unlock(&pnettable->lock);

        return rc;
}

/* Try to determine a gid for port i of ibdev and, on success, record device
 * and port in ini: in the smcrv2 fields if ini->check_smcrv2 is set, in the
 * ib_dev/ib_port fields otherwise.
 * Returns 0 on success, -ENODEV if no gid could be determined.
 */
static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i,
                                  struct smc_init_info *ini)
{
        if (ini->check_smcrv2) {
                if (smc_ib_determine_gid(ibdev, i, ini->vlan_id,
                                         ini->smcrv2.ib_gid_v2, NULL,
                                         &ini->smcrv2))
                        return -ENODEV;
                ini->smcrv2.ib_dev_v2 = ibdev;
                ini->smcrv2.ib_port_v2 = i;
                return 0;
        }

        if (smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL,
                                 NULL))
                return -ENODEV;
        ini->ib_dev = ibdev;
        ini->ib_port = i;
        return 0;
}

/* find a roce device for the given pnetid */
static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
                                          struct smc_init_info *ini,
                                          struct smc_ib_device *known_dev,
                                          struct net *net)
{
        struct smc_ib_device *cand;
        int port;

        mutex_lock(&smc_ib_devices.mutex);
        list_for_each_entry(cand, &smc_ib_devices.list, list) {
                /* skip the excluded device */
                if (cand == known_dev)
                        continue;
                /* skip devices not accessible from the namespace */
                if (!rdma_dev_access_netns(cand->ibdev, net))
                        continue;
                for (port = 1; port <= SMC_MAX_PORTS; port++) {
                        if (!rdma_is_port_valid(cand->ibdev, port))
                                continue;
                        if (!smc_pnet_match(cand->pnetid[port - 1], pnet_id))
                                continue;
                        if (!smc_ib_port_active(cand, port))
                                continue;
                        if (test_bit(port - 1, cand->ports_going_away))
                                continue;
                        /* first port with a usable gid ends the search */
                        if (!smc_pnet_determine_gid(cand, port, ini))
                                goto out;
                }
        }
out:
        mutex_unlock(&smc_ib_devices.mutex);
}

/* find alternate roce device with same pnet_id, vlan_id and net namespace */
void smc_pnet_find_alt_roce(struct smc_link_group *lgr,
                            struct smc_init_info *ini,
                            struct smc_ib_device *known_dev)
{
        /* pnetid and namespace are those of the link group */
        _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev, lgr->net);
}

/* if handshake network device belongs to a roce device, return its
 * IB device and port
 */
static void smc_pnet_find_rdma_dev(struct net_device *netdev,
                                   struct smc_init_info *ini)
{
        struct net *net = dev_net(netdev);
        struct smc_ib_device *cand;

        mutex_lock(&smc_ib_devices.mutex);
        list_for_each_entry(cand, &smc_ib_devices.list, list) {
                struct net_device *port_ndev;
                int port;

                /* check rdma net namespace */
                if (!rdma_dev_access_netns(cand->ibdev, net))
                        continue;

                for (port = 1; port <= SMC_MAX_PORTS; port++) {
                        if (!rdma_is_port_valid(cand->ibdev, port))
                                continue;
                        port_ndev = ib_device_get_netdev(cand->ibdev, port);
                        if (!port_ndev)
                                continue;
                        /* only the pointer value is compared below */
                        dev_put(port_ndev);
                        if (port_ndev != netdev)
                                continue;
                        if (!smc_ib_port_active(cand, port))
                                continue;
                        if (test_bit(port - 1, cand->ports_going_away))
                                continue;
                        /* NOTE(review): this leaves the port loop only, the
                         * remaining ib devices are still examined
                         */
                        if (!smc_pnet_determine_gid(cand, port, ini))
                                break;
                }
        }
        mutex_unlock(&smc_ib_devices.mutex);
}

/* Determine the corresponding IB device port based on the hardware PNETID.
 * Searching stops at the first matching active IB device port with vlan_id
 * configured.
 * If nothing found, check pnetid table.
 * If nothing found, try to use handshake device
 */
static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
                                         struct smc_init_info *ini)
{
        struct net_device *base_ndev = pnet_find_base_ndev(ndev);
        u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
        struct net *net = dev_net(ndev);

        /* pnetid sources in order of preference: hardware pnetid of the base
         * device, pnet table entry of the base device, pnet table entry of
         * the device itself
         */
        if (!smc_pnetid_by_dev_port(base_ndev->dev.parent,
                                    base_ndev->dev_port, ndev_pnetid) ||
            !smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) ||
            !smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) {
                _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net);
                return;
        }

        /* pnetid could not be determined, try the handshake device */
        smc_pnet_find_rdma_dev(base_ndev, ini);
}

/* Find an smcd device with the pnetid of ndev's base device and store it in
 * ini->ism_dev[0]. Nothing is stored if no pnetid can be determined or no
 * device qualifies.
 */
static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
                                        struct smc_init_info *ini)
{
        u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
        struct smcd_dev *cand;

        ndev = pnet_find_base_ndev(ndev);
        if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port,
                                   ndev_pnetid) &&
            smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid))
                return; /* pnetid could not be determined */

        mutex_lock(&smcd_dev_list.mutex);
        list_for_each_entry(cand, &smcd_dev_list.list, list) {
                if (!smc_pnet_match(cand->pnetid, ndev_pnetid))
                        continue;
                if (cand->going_away)
                        continue;
                /* with a peer gid given, smc_ism_cantalk() must return 0 */
                if (ini->ism_peer_gid[0].gid &&
                    smc_ism_cantalk(&ini->ism_peer_gid[0], ini->vlan_id,
                                    cand))
                        continue;
                ini->ism_dev[0] = cand;
                break;
        }
        mutex_unlock(&smcd_dev_list.mutex);
}

/* PNET table analysis for a given sock:
 * determine ib_device and port belonging to used internal TCP socket
 * ethernet interface.
 */
void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
{
        struct dst_entry *dst = sk_dst_get(sk);

        /* without a dst there is no device to start from */
        if (!dst)
                return;

        if (dst->dev)
                smc_pnet_find_roce_by_pnetid(dst->dev, ini);

        dst_release(dst);
}

/* Determine the smcd device for the device of the socket's dst and store it
 * in ini->ism_dev[0]; the field is NULL if nothing was found.
 */
void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
{
        struct dst_entry *dst = sk_dst_get(sk);

        /* start out with "no device found" */
        ini->ism_dev[0] = NULL;
        if (!dst)
                return;

        if (dst->dev)
                smc_pnet_find_ism_by_pnetid(dst->dev, ini);

        dst_release(dst);
}

/* Lookup and apply a pnet table entry to the given ib device.
 */
int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
{
        /* get pnettable for init namespace */
        struct smc_net *sn = net_generic(&init_net, smc_net_id);
        struct smc_pnettable *pnettable = &sn->pnettable;
        char *name = smcibdev->ibdev->name;
        struct smc_pnetentry *entry;
        int rc = -ENOENT;

        mutex_lock(&pnettable->lock);
        list_for_each_entry(entry, &pnettable->pnetlist, list) {
                if (entry->type != SMC_PNET_IB)
                        continue;
                if (strncmp(entry->ib_name, name, IB_DEVICE_NAME_MAX))
                        continue;
                if (entry->ib_port != ib_port)
                        continue;
                /* entry for this device and port found; the result of
                 * applying it does not influence the return code
                 */
                smc_pnet_apply_ib(smcibdev, ib_port, entry->pnet_name);
                rc = 0;
                break;
        }
        mutex_unlock(&pnettable->lock);

        return rc;
}

/* Lookup and apply a pnet table entry to the given smcd device.
 */
int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
{
        /* get pnettable for init namespace */
        struct smc_net *sn = net_generic(&init_net, smc_net_id);
        const char *name = dev_name(smcddev->ops->get_dev(smcddev));
        struct smc_pnettable *pnettable = &sn->pnettable;
        struct smc_pnetentry *entry;
        int rc = -ENOENT;

        mutex_lock(&pnettable->lock);
        list_for_each_entry(entry, &pnettable->pnetlist, list) {
                /* smcd devices are looked up among the ib type entries */
                if (entry->type != SMC_PNET_IB)
                        continue;
                if (strncmp(entry->ib_name, name, IB_DEVICE_NAME_MAX))
                        continue;
                /* the result of applying the entry does not influence the
                 * return code
                 */
                smc_pnet_apply_smcd(smcddev, entry->pnet_name);
                rc = 0;
                break;
        }
        mutex_unlock(&pnettable->lock);

        return rc;
}
















































  112 



































  112 































  904 
  222 


  177 




  142 


  611 
  991 






































  938 
  177 
  142 
 1001 





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/cmpxchg.h
 *
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_CMPXCHG_H
#define __ASM_CMPXCHG_H

#include <linux/build_bug.h>
#include <linux/compiler.h>

#include <asm/barrier.h>
#include <asm/lse.h>

/*
 * __XCHG_CASE() - generate one __xchg_case_<name><sz>() helper.
 *
 * Each expansion defines a static inline function that stores x to *ptr and
 * returns the value loaded from *ptr. Two instruction sequences are passed to
 * ARM64_LSE_ATOMIC_INSN(): an LL/SC sequence (prfm, exclusive load, exclusive
 * store, cbnz back to the load while the store status in %w1 is non-zero,
 * followed by mb) and an LSE sequence (swp, __nops(3), followed by nop_lse).
 *
 * w:       register width modifier pasted in front of operands 0 and 3
 * sfx:     suffix appended to the load/store/swp mnemonics
 * name:    text placed between "__xchg_case_" and sz in the function name
 * sz:      bit width; selects the u<sz> type of x and of the return value
 * mb:      text emitted after the LL/SC loop
 * nop_lse: text emitted after the LSE sequence
 * acq:     text pasted into the LL/SC load mnemonic ("ld" acq "xr")
 * acq_lse: text pasted into the LSE swp mnemonic
 * rel:     text pasted into the LL/SC store and the LSE swp mnemonics
 * cl:      clobber list of the asm statement
 *
 * We need separate acquire parameters for ll/sc and lse, since the full
 * barrier case is generated as release+dmb for the former and
 * acquire+release for the latter.
 */
#define __XCHG_CASE(w, sfx, name, sz, mb, nop_lse, acq, acq_lse, rel, cl)        \
static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr)                \
{                                                                                \
        u##sz ret;                                                                \
        unsigned long tmp;                                                        \
                                                                                \
        asm volatile(ARM64_LSE_ATOMIC_INSN(                                        \
        /* LL/SC */                                                                \
        "        prfm        pstl1strm, %2\n"                                        \
        "1:        ld" #acq "xr" #sfx "\t%" #w "0, %2\n"                                \
        "        st" #rel "xr" #sfx "\t%w1, %" #w "3, %2\n"                        \
        "        cbnz        %w1, 1b\n"                                                \
        "        " #mb,                                                                \
        /* LSE atomics */                                                        \
        "        swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n"                \
                __nops(3)                                                        \
        "        " #nop_lse)                                                        \
        : "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr)                        \
        : "r" (x)                                                                \
        : cl);                                                                        \
                                                                                \
        return ret;                                                                \
}

__XCHG_CASE(w, b,     ,  8,        ,    ,  ,  ,  ,         )
__XCHG_CASE(w, h,     , 16,        ,    ,  ,  ,  ,         )
__XCHG_CASE(w,  ,     , 32,        ,    ,  ,  ,  ,         )
__XCHG_CASE( ,  ,     , 64,        ,    ,  ,  ,  ,         )
__XCHG_CASE(w, b, acq_,  8,        ,    , a, a,  , "memory")
__XCHG_CASE(w, h, acq_, 16,        ,    , a, a,  , "memory")
__XCHG_CASE(w,  , acq_, 32,        ,    , a, a,  , "memory")
__XCHG_CASE( ,  , acq_, 64,        ,    , a, a,  , "memory")
__XCHG_CASE(w, b, rel_,  8,        ,    ,  ,  , l, "memory")
__XCHG_CASE(w, h, rel_, 16,        ,    ,  ,  , l, "memory")
__XCHG_CASE(w,  , rel_, 32,        ,    ,  ,  , l, "memory")
__XCHG_CASE( ,  , rel_, 64,        ,    ,  ,  , l, "memory")
__XCHG_CASE(w, b,  mb_,  8, dmb ish, nop,  , a, l, "memory")
__XCHG_CASE(w, h,  mb_, 16, dmb ish, nop,  , a, l, "memory")
__XCHG_CASE(w,  ,  mb_, 32, dmb ish, nop,  , a, l, "memory")
__XCHG_CASE( ,  ,  mb_, 64, dmb ish, nop,  , a, l, "memory")

#undef __XCHG_CASE

/*
 * __XCHG_GEN() emits __arch_xchg<sfx>(), a size dispatcher: @size (in bytes)
 * selects the matching __xchg_case<sfx>_<bits>() helper. Any size other than
 * 1, 2, 4 or 8 hits BUILD_BUG(), so it is rejected at build time rather than
 * at run time.
 */
#define __XCHG_GEN(sfx)                                                        \
static __always_inline unsigned long                                        \
__arch_xchg##sfx(unsigned long x, volatile void *ptr, int size)                \
{                                                                        \
        switch (size) {                                                        \
        case 1:                                                                \
                return __xchg_case##sfx##_8(x, ptr);                        \
        case 2:                                                                \
                return __xchg_case##sfx##_16(x, ptr);                        \
        case 4:                                                                \
                return __xchg_case##sfx##_32(x, ptr);                        \
        case 8:                                                                \
                return __xchg_case##sfx##_64(x, ptr);                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
                                                                        \
        unreachable();                                                        \
}

/* Relaxed, acquire, release and full-barrier dispatchers. */
__XCHG_GEN()
__XCHG_GEN(_acq)
__XCHG_GEN(_rel)
__XCHG_GEN(_mb)

#undef __XCHG_GEN

/*
 * Type-preserving front end for __arch_xchg<sfx>(): the new value is passed
 * as an unsigned long, the operand size is taken from sizeof(*(ptr)), and the
 * result is cast back to the pointed-to type.
 */
#define __xchg_wrapper(sfx, ptr, x)                                        \
({                                                                        \
        __typeof__(*(ptr)) __ret;                                        \
        __ret = (__typeof__(*(ptr)))                                        \
                __arch_xchg##sfx((unsigned long)(x), (ptr), sizeof(*(ptr))); \
        __ret;                                                                \
})

/* xchg: arguments are (ptr, x); the suffix selects the ordering variant. */
#define arch_xchg_relaxed(...)        __xchg_wrapper(    , __VA_ARGS__)
#define arch_xchg_acquire(...)        __xchg_wrapper(_acq, __VA_ARGS__)
#define arch_xchg_release(...)        __xchg_wrapper(_rel, __VA_ARGS__)
#define arch_xchg(...)                __xchg_wrapper( _mb, __VA_ARGS__)

/*
 * __CMPXCHG_CASE() emits __cmpxchg_case_<name><sz>(), a thin wrapper that
 * forwards (ptr, old, new) to __lse_ll_sc_body() for the operation named
 * _cmpxchg_case_<name><sz>.
 *
 * NOTE(review): __lse_ll_sc_body() is defined outside this header (see
 * asm/lse.h); it presumably picks between the LSE and LL/SC implementations -
 * confirm there.
 */
#define __CMPXCHG_CASE(name, sz)                        \
static inline u##sz __cmpxchg_case_##name##sz(volatile void *ptr,        \
                                              u##sz old,                \
                                              u##sz new)                \
{                                                                        \
        return __lse_ll_sc_body(_cmpxchg_case_##name##sz,                \
                                ptr, old, new);                                \
}

/* One wrapper per (ordering, size) pair. */
__CMPXCHG_CASE(    ,  8)
__CMPXCHG_CASE(    , 16)
__CMPXCHG_CASE(    , 32)
__CMPXCHG_CASE(    , 64)
__CMPXCHG_CASE(acq_,  8)
__CMPXCHG_CASE(acq_, 16)
__CMPXCHG_CASE(acq_, 32)
__CMPXCHG_CASE(acq_, 64)
__CMPXCHG_CASE(rel_,  8)
__CMPXCHG_CASE(rel_, 16)
__CMPXCHG_CASE(rel_, 32)
__CMPXCHG_CASE(rel_, 64)
__CMPXCHG_CASE(mb_,  8)
__CMPXCHG_CASE(mb_, 16)
__CMPXCHG_CASE(mb_, 32)
__CMPXCHG_CASE(mb_, 64)

#undef __CMPXCHG_CASE

/*
 * __CMPXCHG128() emits __cmpxchg128<name>(), the 128-bit counterpart of the
 * __cmpxchg_case_*() wrappers: it forwards (ptr, old, new) to
 * __lse_ll_sc_body() for the operation named _cmpxchg128<name>. Only the
 * relaxed (empty suffix) and full-barrier (_mb) variants are generated.
 */
#define __CMPXCHG128(name)                                                \
static inline u128 __cmpxchg128##name(volatile u128 *ptr,                \
                                      u128 old, u128 new)                \
{                                                                        \
        return __lse_ll_sc_body(_cmpxchg128##name,                        \
                                ptr, old, new);                                \
}

__CMPXCHG128(   )
__CMPXCHG128(_mb)

#undef __CMPXCHG128

/*
 * __CMPXCHG_GEN() emits __cmpxchg<sfx>(), a size dispatcher: @size (in bytes)
 * selects the matching __cmpxchg_case<sfx>_<bits>() wrapper. Any size other
 * than 1, 2, 4 or 8 hits BUILD_BUG(), so it is rejected at build time.
 */
#define __CMPXCHG_GEN(sfx)                                                \
static __always_inline unsigned long __cmpxchg##sfx(volatile void *ptr,        \
                                           unsigned long old,                \
                                           unsigned long new,                \
                                           int size)                        \
{                                                                        \
        switch (size) {                                                        \
        case 1:                                                                \
                return __cmpxchg_case##sfx##_8(ptr, old, new);                \
        case 2:                                                                \
                return __cmpxchg_case##sfx##_16(ptr, old, new);                \
        case 4:                                                                \
                return __cmpxchg_case##sfx##_32(ptr, old, new);                \
        case 8:                                                                \
                return __cmpxchg_case##sfx##_64(ptr, old, new);                \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
                                                                        \
        unreachable();                                                        \
}

/* Relaxed, acquire, release and full-barrier dispatchers. */
__CMPXCHG_GEN()
__CMPXCHG_GEN(_acq)
__CMPXCHG_GEN(_rel)
__CMPXCHG_GEN(_mb)

#undef __CMPXCHG_GEN

/*
 * Type-preserving front end for __cmpxchg<sfx>(): the old and new values are
 * passed as unsigned long, the operand size is taken from sizeof(*(ptr)), and
 * the result is cast back to the pointed-to type.
 */
#define __cmpxchg_wrapper(sfx, ptr, o, n)                                \
({                                                                        \
        __typeof__(*(ptr)) __ret;                                        \
        __ret = (__typeof__(*(ptr)))                                        \
                __cmpxchg##sfx((ptr), (unsigned long)(o),                \
                                (unsigned long)(n), sizeof(*(ptr)));        \
        __ret;                                                                \
})

/* cmpxchg: arguments are (ptr, old, new); the _local form is the relaxed one. */
#define arch_cmpxchg_relaxed(...)        __cmpxchg_wrapper(    , __VA_ARGS__)
#define arch_cmpxchg_acquire(...)        __cmpxchg_wrapper(_acq, __VA_ARGS__)
#define arch_cmpxchg_release(...)        __cmpxchg_wrapper(_rel, __VA_ARGS__)
#define arch_cmpxchg(...)                __cmpxchg_wrapper( _mb, __VA_ARGS__)
#define arch_cmpxchg_local                arch_cmpxchg_relaxed

/* cmpxchg64: plain aliases of the size-generic cmpxchg macros above. */
#define arch_cmpxchg64_relaxed                arch_cmpxchg_relaxed
#define arch_cmpxchg64_acquire                arch_cmpxchg_acquire
#define arch_cmpxchg64_release                arch_cmpxchg_release
#define arch_cmpxchg64                        arch_cmpxchg
#define arch_cmpxchg64_local                arch_cmpxchg_local

/* cmpxchg128 */
/* Unconditionally reports 128-bit cmpxchg as available. */
#define system_has_cmpxchg128()                1

/* Full-barrier 128-bit compare-and-exchange; evaluates to __cmpxchg128_mb()'s result. */
#define arch_cmpxchg128(ptr, o, n)                                                \
({                                                                                \
        __cmpxchg128_mb((ptr), (o), (n));                                        \
})

/* Relaxed 128-bit compare-and-exchange; evaluates to __cmpxchg128()'s result. */
#define arch_cmpxchg128_local(ptr, o, n)                                        \
({                                                                                \
        __cmpxchg128((ptr), (o), (n));                                                \
})

/*
 * __CMPWAIT_CASE() emits __cmpwait_case_<sz>(), which waits at most once for
 * *@ptr to stop being equal to @val:
 *
 *   sevl; wfe   - set the local event and consume it, so this first wfe does
 *                 not block
 *   ldxr        - exclusive load of *@ptr
 *   eor/cbnz    - if the loaded value differs from @val, skip to the end
 *   wfe         - otherwise wait for an event
 *
 * The function returns nothing and does not re-check the value after the
 * final wfe; callers must re-read *@ptr themselves.
 *
 * NOTE(review): the wake-up on a remote store presumably relies on the
 * exclusive monitor armed by ldxr - confirm against the Arm ARM.
 *
 * Macro parameters: @w is the register-width modifier ("w" or empty), @sfx
 * the ldxr size suffix ("b", "h" or empty) and @sz the operand size in bits.
 */
#define __CMPWAIT_CASE(w, sfx, sz)                                        \
static inline void __cmpwait_case_##sz(volatile void *ptr,                \
                                       unsigned long val)                \
{                                                                        \
        unsigned long tmp;                                                \
                                                                        \
        asm volatile(                                                        \
        "        sevl\n"                                                        \
        "        wfe\n"                                                        \
        "        ldxr" #sfx "\t%" #w "[tmp], %[v]\n"                        \
        "        eor        %" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"        \
        "        cbnz        %" #w "[tmp], 1f\n"                                \
        "        wfe\n"                                                        \
        "1:"                                                                \
        : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr)                        \
        : [val] "r" (val));                                                \
}

__CMPWAIT_CASE(w, b, 8);
__CMPWAIT_CASE(w, h, 16);
__CMPWAIT_CASE(w,  , 32);
__CMPWAIT_CASE( ,  , 64);

#undef __CMPWAIT_CASE

/*
 * __CMPWAIT_GEN() emits __cmpwait<sfx>(), a size dispatcher: @size (in bytes)
 * selects the matching __cmpwait_case<sfx>_<bits>() helper. For the 1- and
 * 2-byte cases @val is truncated to u8/u16 before being passed on. Any other
 * size than 1, 2, 4 or 8 hits BUILD_BUG().
 */
#define __CMPWAIT_GEN(sfx)                                                \
static __always_inline void __cmpwait##sfx(volatile void *ptr,                \
                                  unsigned long val,                        \
                                  int size)                                \
{                                                                        \
        switch (size) {                                                        \
        case 1:                                                                \
                return __cmpwait_case##sfx##_8(ptr, (u8)val);                \
        case 2:                                                                \
                return __cmpwait_case##sfx##_16(ptr, (u16)val);                \
        case 4:                                                                \
                return __cmpwait_case##sfx##_32(ptr, val);                \
        case 8:                                                                \
                return __cmpwait_case##sfx##_64(ptr, val);                \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
                                                                        \
        unreachable();                                                        \
}

/* Only the suffix-less variant is generated. */
__CMPWAIT_GEN()

#undef __CMPWAIT_GEN

/* Front end: passes @val as unsigned long and derives the size from *(ptr). */
#define __cmpwait_relaxed(ptr, val) \
        __cmpwait((ptr), (unsigned long)(val), sizeof(*(ptr)))

#endif        /* __ASM_CMPXCHG_H */



















































































































































































































  247 

















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2020 - Google LLC
 * Author: Quentin Perret <qperret@google.com>
 */

#include <linux/init.h>
#include <linux/interval_tree_generic.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>

#include <asm/kvm_pkvm.h>

#include "hyp_constants.h"

/* Static key, initially false; enabled by pkvm_drop_host_privileges(). */
DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);

/*
 * Host-side aliases of the nVHE hyp symbols: the memblock region array and
 * its element count, both filled in by register_memblock_regions().
 */
static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory);
static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr);

/* Physical base and size of the region allocated by kvm_hyp_reserve(). */
phys_addr_t hyp_mem_base;
phys_addr_t hyp_mem_size;

/*
 * Copy every memblock memory region into the hyp_memory array, bumping the
 * shared region counter for each one.
 *
 * Return 0 on success, -ENOMEM once the array (HYP_MEMBLOCK_REGIONS entries)
 * is full; regions copied before the failure are left in place.
 */
static int __init register_memblock_regions(void)
{
        struct memblock_region *region;

        for_each_mem_region(region) {
                unsigned int idx = *hyp_memblock_nr_ptr;

                if (idx >= HYP_MEMBLOCK_REGIONS)
                        return -ENOMEM;

                hyp_memory[idx] = *region;
                *hyp_memblock_nr_ptr = idx + 1;
        }

        return 0;
}

/*
 * Set aside the memory pool used by the hypervisor when KVM runs in
 * protected mode. Does nothing unless hyp mode is available, the kernel is
 * not itself running in hyp mode, and the KVM mode is KVM_MODE_PROTECTED.
 *
 * On success hyp_mem_base/hyp_mem_size describe the reserved region.
 */
void __init kvm_hyp_reserve(void)
{
        u64 nr_pages;
        int err;

        if (!(is_hyp_mode_available() && !is_kernel_in_hyp_mode()))
                return;

        if (kvm_get_mode() != KVM_MODE_PROTECTED)
                return;

        err = register_memblock_regions();
        if (err) {
                /* Drop whatever was registered before the failure. */
                *hyp_memblock_nr_ptr = 0;
                kvm_err("Failed to register hyp memblocks: %d\n", err);
                return;
        }

        /* Sum the page requirements of each hyp-side consumer. */
        nr_pages = hyp_s1_pgtable_pages();
        nr_pages += host_s2_pgtable_pages();
        nr_pages += hyp_vm_table_pages();
        nr_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);
        nr_pages += pkvm_selftest_pages();
        nr_pages += hyp_ffa_proxy_pages();

        hyp_mem_size = nr_pages << PAGE_SHIFT;

        /*
         * Prefer a PMD-aligned (and PMD-sized-multiple) region to reduce TLB
         * pressure once this is unmapped from the host stage-2; if that
         * allocation fails, retry with the exact size at PAGE_SIZE alignment.
         */
        hyp_mem_base = memblock_phys_alloc(ALIGN(hyp_mem_size, PMD_SIZE),
                                           PMD_SIZE);
        if (hyp_mem_base)
                hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE);
        else
                hyp_mem_base = memblock_phys_alloc(hyp_mem_size, PAGE_SIZE);

        if (!hyp_mem_base) {
                kvm_err("Failed to reserve hyp memory\n");
                return;
        }

        kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
                 hyp_mem_base);
}

static void __pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
        if (host_kvm->arch.pkvm.handle) {
                WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
                                          host_kvm->arch.pkvm.handle));
        }

        host_kvm->arch.pkvm.handle = 0;
        free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
        free_hyp_memcache(&host_kvm->arch.pkvm.stage2_teardown_mc);
}

/*
 * Allocate the page-aligned backing memory for a hyp vCPU and hand it to the
 * hypervisor via __pkvm_init_vcpu.
 *
 * On success the vCPU is flagged VCPU_PKVM_FINALIZED; on failure the memory
 * is freed again. Return 0 on success, -ENOMEM if the allocation fails, or
 * the hypercall's error code.
 */
static int __pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
        pkvm_handle_t handle = vcpu->kvm->arch.pkvm.handle;
        size_t nr_bytes = PAGE_ALIGN(PKVM_HYP_VCPU_SIZE);
        void *vcpu_mem;
        int err;

        vcpu->arch.pkvm_memcache.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;

        vcpu_mem = alloc_pages_exact(nr_bytes, GFP_KERNEL_ACCOUNT);
        if (!vcpu_mem)
                return -ENOMEM;

        err = kvm_call_hyp_nvhe(__pkvm_init_vcpu, handle, vcpu, vcpu_mem);
        if (err) {
                free_pages_exact(vcpu_mem, nr_bytes);
                return err;
        }

        vcpu_set_flag(vcpu, VCPU_PKVM_FINALIZED);
        return 0;
}

/*
 * Allocates and donates memory for hypervisor VM structs at EL2.
 *
 * Two allocations are made: the stage-2 PGD, and the VM state, which covers
 * the hyp vm plus one pointer per created vCPU.
 *
 * Stores an opaque handle in the kvm struct for future reference.
 *
 * Return 0 on success, negative error code on failure (-EINVAL when no vCPU
 * has been created yet, -ENOMEM on allocation failure, or the error returned
 * by the __pkvm_init_vm hypercall).
 */
static int __pkvm_create_hyp_vm(struct kvm *host_kvm)
{
        size_t pgd_bytes, vm_bytes;
        void *pgd_mem, *vm_mem;
        int rc;

        if (host_kvm->created_vcpus < 1)
                return -EINVAL;

        /*
         * The PGD pages will be reclaimed using a hyp_memcache which implies
         * page granularity. So, use alloc_pages_exact() to get individual
         * refcounts.
         */
        pgd_bytes = kvm_pgtable_stage2_pgd_size(host_kvm->arch.mmu.vtcr);
        pgd_mem = alloc_pages_exact(pgd_bytes, GFP_KERNEL_ACCOUNT);
        if (!pgd_mem)
                return -ENOMEM;

        /* Memory to donate to hyp for the vm and the vcpu pointers. */
        vm_bytes = size_mul(sizeof(void *), host_kvm->created_vcpus);
        vm_bytes = PAGE_ALIGN(size_add(PKVM_HYP_VM_SIZE, vm_bytes));
        vm_mem = alloc_pages_exact(vm_bytes, GFP_KERNEL_ACCOUNT);
        if (!vm_mem) {
                rc = -ENOMEM;
                goto out_free_pgd;
        }

        /* Donate the VM memory to hyp and let hyp initialize it. */
        rc = kvm_call_hyp_nvhe(__pkvm_init_vm, host_kvm, vm_mem, pgd_mem);
        if (rc < 0)
                goto out_free_vm;

        /* A non-negative return value is the new VM's handle. */
        host_kvm->arch.pkvm.handle = rc;
        host_kvm->arch.pkvm.stage2_teardown_mc.flags |= HYP_MEMCACHE_ACCOUNT_STAGE2;
        kvm_account_pgtable_pages(pgd_mem, pgd_bytes / PAGE_SIZE);

        return 0;

out_free_vm:
        free_pages_exact(vm_mem, vm_bytes);
out_free_pgd:
        free_pages_exact(pgd_mem, pgd_bytes);
        return rc;
}

/*
 * Create the hyp VM for @host_kvm unless it already has a handle. Runs under
 * the VM's config_lock. Return 0 if the VM already exists or was created,
 * otherwise the error from __pkvm_create_hyp_vm().
 */
int pkvm_create_hyp_vm(struct kvm *host_kvm)
{
        int ret;

        mutex_lock(&host_kvm->arch.config_lock);
        if (host_kvm->arch.pkvm.handle)
                ret = 0;
        else
                ret = __pkvm_create_hyp_vm(host_kvm);
        mutex_unlock(&host_kvm->arch.config_lock);

        return ret;
}

int pkvm_create_hyp_vcpu(struct kvm_vcpu *vcpu)
{
        int ret = 0;

        mutex_lock(&vcpu->kvm->arch.config_lock);
        if (!vcpu_get_flag(vcpu, VCPU_PKVM_FINALIZED))
                ret = __pkvm_create_hyp_vcpu(vcpu);
        mutex_unlock(&vcpu->kvm->arch.config_lock);

        return ret;
}

/* Locked wrapper: runs __pkvm_destroy_hyp_vm() under the VM's config_lock. */
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
{
        struct mutex *lock = &host_kvm->arch.config_lock;

        mutex_lock(lock);
        __pkvm_destroy_hyp_vm(host_kvm);
        mutex_unlock(lock);
}

/* Currently a no-op: @host_kvm is not touched and 0 is always returned. */
int pkvm_init_host_vm(struct kvm *host_kvm)
{
        return 0;
}

/*
 * Per-CPU callback: issue the __pkvm_prot_finalize hypercall. If it fails,
 * WARN and store -EINVAL through @arg, which points at the caller's int
 * status shared between all CPUs.
 */
static void __init _kvm_host_prot_finalize(void *arg)
{
        int *status = arg;

        if (!WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)))
                return;

        WRITE_ONCE(*status, -EINVAL);
}

/*
 * Run _kvm_host_prot_finalize() on every CPU and wait for completion.
 * Return 0 if all CPUs succeeded, -EINVAL if any of them failed.
 */
static int __init pkvm_drop_host_privileges(void)
{
        int status = 0;

        /*
         * The static key has to be flipped first: that may no longer be
         * possible once the host stage 2 is installed.
         */
        static_branch_enable(&kvm_protected_mode_initialized);

        on_each_cpu(_kvm_host_prot_finalize, &status, 1);

        return status;
}

/*
 * Late init step for protected KVM: hide the hyp sections and the reserved
 * hyp pool from kmemleak, then drop host privileges on every CPU. Does
 * nothing (returns 0) when protected KVM is not enabled or KVM/arm has not
 * been initialised.
 */
static int __init finalize_pkvm(void)
{
        int err;

        if (!is_protected_kvm_enabled())
                return 0;

        if (!is_kvm_arm_initialised())
                return 0;

        /*
         * kmemleak must not peek at the HYP sections: that would end badly
         * once they become inaccessible.
         */
        kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start);
        kmemleak_free_part(__hyp_data_start, __hyp_data_end - __hyp_data_start);
        kmemleak_free_part(__hyp_rodata_start, __hyp_rodata_end - __hyp_rodata_start);
        kmemleak_free_part_phys(hyp_mem_base, hyp_mem_size);

        err = pkvm_drop_host_privileges();
        if (err)
                pr_err("Failed to finalize Hyp protection: %d\n", err);

        return err;
}
device_initcall_sync(finalize_pkvm);

/* First byte address covered by @mapping. */
static u64 __pkvm_mapping_start(struct pkvm_mapping *mapping)
{
        return mapping->gfn * PAGE_SIZE;
}

/* Last byte address covered by @mapping (inclusive). */
static u64 __pkvm_mapping_end(struct pkvm_mapping *mapping)
{
        return (mapping->gfn + mapping->nr_pages) * PAGE_SIZE - 1;
}

/*
 * Generate the static pkvm_mapping_*() interval tree helpers, keyed on the
 * inclusive [start, end] byte range computed by the two accessors above.
 */
INTERVAL_TREE_DEFINE(struct pkvm_mapping, node, u64, __subtree_last,
                     __pkvm_mapping_start, __pkvm_mapping_end, static,
                     pkvm_mapping);

/*
 * Iterate over every pkvm_mapping in @__pgt that overlaps the byte range
 * [@__start, @__end), assigning each one to @__map in turn.
 *
 * __tmp starts at iter_first(pkvm_mappings) and is advanced to the next
 * overlapping mapping *before* entering the body of the loop to allow
 * freeing of __map inline.
 *
 * @__start and @__end are evaluated more than once, so they must be free of
 * side effects. All arguments are parenthesized so that compound expressions
 * (e.g. a conditional) can be passed safely.
 */
#define for_each_mapping_in_range_safe(__pgt, __start, __end, __map)                                \
        for (struct pkvm_mapping *__tmp = pkvm_mapping_iter_first(&(__pgt)->pkvm_mappings,        \
                                                                  (__start), (__end) - 1);        \
             __tmp && ({                                                                        \
                                (__map) = __tmp;                                                \
                                __tmp = pkvm_mapping_iter_next((__map), (__start),                \
                                                               (__end) - 1);                        \
                                true;                                                                \
                       });                                                                        \
            )

/*
 * Initialise @pgt for pKVM use: start with an empty mapping tree and record
 * the owning @mmu. @mm_ops is not used here. Always returns 0.
 */
int pkvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                             struct kvm_pgtable_mm_ops *mm_ops)
{
        pgt->pkvm_mappings        = RB_ROOT_CACHED;
        pgt->mmu                = mmu;

        return 0;
}

/*
 * Unshare and forget every mapping overlapping [@start, @end).
 *
 * Each mapping is unshared from the guest through the
 * __pkvm_host_unshare_guest hypercall, removed from the tree and freed. The
 * walk stops at the first hypercall failure (with a WARN), leaving that
 * mapping and any later ones in place.
 *
 * Return 0 on success, or when the VM has no hyp handle yet; otherwise the
 * hypercall's error code.
 */
static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 end)
{
        pkvm_handle_t handle = kvm_s2_mmu_to_kvm(pgt->mmu)->arch.pkvm.handle;
        struct pkvm_mapping *cur;
        int err;

        if (!handle)
                return 0;

        for_each_mapping_in_range_safe(pgt, start, end, cur) {
                err = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle,
                                        cur->gfn, cur->nr_pages);
                if (WARN_ON(err))
                        return err;

                pkvm_mapping_remove(cur, &pgt->pkvm_mappings);
                kfree(cur);
        }

        return 0;
}

/*
 * Drop all mappings of @pgt by unmapping the range [0, ~0ULL). The return
 * value of the unmap helper is ignored.
 */
void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
        __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
}

/*
 * Share @size bytes at physical address @phys with the guest at IPA @addr and
 * record the result as a pkvm_mapping in @pgt.
 *
 * @size must be PAGE_SIZE or PMD_SIZE. @mc is a struct kvm_hyp_memcache whose
 * ->mapping supplies the tracking structure; it is consumed (set to NULL) on
 * success. @flags is not used. Must be called with the mmu_lock held for
 * write.
 *
 * Return 0 on success, -EINVAL for an unsupported size, -EAGAIN when a
 * mapping of the same size already overlaps the range, or the error from the
 * unmap helper / share hypercall.
 */
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
                           void *mc, enum kvm_pgtable_walk_flags flags)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
        struct kvm_hyp_memcache *cache = mc;
        struct pkvm_mapping *existing, *fresh;
        u64 gfn = addr >> PAGE_SHIFT;
        u64 pfn = phys >> PAGE_SHIFT;
        u64 nr_pages = size / PAGE_SIZE;
        int ret;

        if (!(size == PAGE_SIZE || size == PMD_SIZE))
                return -EINVAL;

        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * Finding an existing mapping under the requested range means either
         * a race with another vCPU, or a switch between page and block
         * mappings. As per user_mem_abort(), same-size permission faults are
         * handled in the relax_perms() path.
         */
        existing = pkvm_mapping_iter_first(&pgt->pkvm_mappings, addr, addr + size - 1);
        if (existing) {
                if (existing->nr_pages * PAGE_SIZE == size)
                        return -EAGAIN;

                /* Remove _any_ pkvm_mapping overlapping with the range, bigger or smaller. */
                ret = __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
                if (ret)
                        return ret;
        }

        ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, nr_pages, prot);
        if (WARN_ON(ret))
                return ret;

        /* Take ownership of the tracking structure held by the cache. */
        fresh = cache->mapping;
        cache->mapping = NULL;

        fresh->gfn = gfn;
        fresh->pfn = pfn;
        fresh->nr_pages = nr_pages;
        pkvm_mapping_insert(fresh, &pgt->pkvm_mappings);

        return ret;
}

/*
 * Unmap [@addr, @addr + @size) from @pgt. The mmu_lock must be held for
 * write. Returns whatever __pkvm_pgtable_stage2_unmap() returns.
 */
int pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
        u64 end = addr + size;

        lockdep_assert_held_write(&kvm->mmu_lock);

        return __pkvm_pgtable_stage2_unmap(pgt, addr, end);
}

/*
 * Write-protect every mapping overlapping [@addr, @addr + @size) through the
 * __pkvm_host_wrprotect_guest hypercall. The mmu_lock must be held.
 *
 * Stops at the first failing hypercall (with a WARN) and returns its error
 * code; returns 0 if all calls succeed or no mapping overlaps the range.
 */
int pkvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
        pkvm_handle_t handle = kvm->arch.pkvm.handle;
        struct pkvm_mapping *cur;

        lockdep_assert_held(&kvm->mmu_lock);

        for_each_mapping_in_range_safe(pgt, addr, addr + size, cur) {
                int err = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle,
                                            cur->gfn, cur->nr_pages);

                if (WARN_ON(err))
                        return err;
        }

        return 0;
}

/*
 * Clean the data cache for every mapping overlapping [@addr, @addr + @size).
 * Each overlapping mapping is cleaned in full, via its kernel address, even
 * if only part of it lies in the range. The mmu_lock must be held.
 * Always returns 0.
 */
int pkvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
        struct pkvm_mapping *cur;

        lockdep_assert_held(&kvm->mmu_lock);

        for_each_mapping_in_range_safe(pgt, addr, addr + size, cur) {
                __clean_dcache_guest_page(pfn_to_kaddr(cur->pfn),
                                          PAGE_SIZE * cur->nr_pages);
        }

        return 0;
}

/*
 * Issue the __pkvm_host_test_clear_young_guest hypercall, passing @mkold
 * through, for every mapping overlapping [@addr, @addr + @size). The mmu_lock
 * must be held.
 *
 * Return true if the hypercall returned non-zero for at least one mapping.
 * Every overlapping mapping is visited; there is no early exit.
 */
bool pkvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(pgt->mmu);
        pkvm_handle_t handle = kvm->arch.pkvm.handle;
        struct pkvm_mapping *cur;
        bool any_young = false;

        lockdep_assert_held(&kvm->mmu_lock);

        for_each_mapping_in_range_safe(pgt, addr, addr + size, cur) {
                if (kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle,
                                      cur->gfn, cur->nr_pages, mkold))
                        any_young = true;
        }

        return any_young;
}

/*
 * Forward a permission change for the page containing @addr to the
 * hypervisor (__pkvm_host_relax_perms_guest) with the requested @prot.
 * @pgt and @flags are not used. Returns the hypercall's result.
 */
int pkvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
                                    enum kvm_pgtable_walk_flags flags)
{
        u64 gfn = addr >> PAGE_SHIFT;

        return kvm_call_hyp_nvhe(__pkvm_host_relax_perms_guest, gfn, prot);
}

/*
 * Issue the __pkvm_host_mkyoung_guest hypercall for the page containing
 * @addr. A failing hypercall only triggers a WARN. @pgt and @flags are not
 * used.
 */
void pkvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
                                 enum kvm_pgtable_walk_flags flags)
{
        u64 gfn = addr >> PAGE_SHIFT;

        WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, gfn));
}

/*
 * Stub: does no work. Any call triggers a one-time warning and the
 * arguments are ignored.
 */
void pkvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
        WARN_ON_ONCE(1);
}

/*
 * Stub: does no work. Any call triggers a one-time warning, ignores its
 * arguments and returns NULL.
 */
kvm_pte_t *pkvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
                                        enum kvm_pgtable_prot prot, void *mc, bool force_pte)
{
        WARN_ON_ONCE(1);
        return NULL;
}

/*
 * Stub: does no work. Any call triggers a one-time warning, ignores its
 * arguments and returns -EINVAL.
 */
int pkvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
                              struct kvm_mmu_memory_cache *mc)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}


















































































  270 




  270 

















































































































































    9 


  319 













  156 









   61 








  205 
  205 



  205 



































































  321 





  320 

    3 








  320 







  319 



































  320 






    1 


  309 
   19 



  225 
  131 


  318 







    1 






   91 








  123 


    1 

  118 
    2 

   24 







  134 



  132 
    2 


    3 

  195 





    1 

    1 















   72 

    2 





   90 












    1 

    1 














    4 
















  217 


  116 


   24 
   94 



   65 
   90 

  203 






  321 




   69 

   12 



   69 






  154 
    1 















  160 

  153 





































































































































  319 



  318 





  316 












































  318 








  152 

  254 







  319 









  319 





   92 











  206 


   77 





  132 


    1 


  318 




  316 







   97 

   97 
















  270 


  274 













  517 


  519 







































































   13 
































































































   91 

































































































































































































































































































  301 
    1 


  302 




















  272 



   23 

   34 
  189 
   62 

































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/mmap.c
 *
 * Written by obz.
 *
 * Address space accounting code        <alan@lxorguk.ukuu.org.uk>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/shm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/capability.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/profile.h>
#include <linux/export.h>
#include <linux/mount.h>
#include <linux/mempolicy.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>
#include <linux/mmdebug.h>
#include <linux/perf_event.h>
#include <linux/audit.h>
#include <linux/khugepaged.h>
#include <linux/uprobes.h>
#include <linux/notifier.h>
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/memfd.h>

#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>

#define CREATE_TRACE_POINTS
#include <trace/events/mmap.h>

#include "internal.h"

/*
 * Fallback for arch_mmap_check(): when nothing has defined it before this
 * point, the check expands to a constant 0.
 */
#ifndef arch_mmap_check
#define arch_mmap_check(addr, len, flags)        (0)
#endif

/*
 * Lower bound, upper bound and current value of mmap_rnd_bits, seeded from
 * the CONFIG_ARCH_MMAP_RND_BITS* build-time options.
 */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
int mmap_rnd_bits_max __ro_after_init = CONFIG_ARCH_MMAP_RND_BITS_MAX;
int mmap_rnd_bits __read_mostly = CONFIG_ARCH_MMAP_RND_BITS;
#endif
/*
 * Same triple for mmap_rnd_compat_bits, seeded from the
 * CONFIG_ARCH_MMAP_RND_COMPAT_BITS* build-time options.
 */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
const int mmap_rnd_compat_bits_min = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN;
const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif

/*
 * Boolean core parameter registered with mode 0644.
 * NOTE(review): not referenced in this part of the file; presumably consulted
 * by the RLIMIT_DATA accounting code - confirm against its users.
 */
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);

/*
 * Recompute vma->vm_page_prot from vma->vm_flags and publish the result.
 *
 * When vma_wants_writenotify() says so, the protection is derived a second
 * time from a copy of the flags that has VM_SHARED cleared.
 */
void vma_set_page_prot(struct vm_area_struct *vma)
{
        unsigned long flags = vma->vm_flags;
        pgprot_t prot = vm_pgprot_modify(vma->vm_page_prot, flags);

        if (vma_wants_writenotify(vma, prot)) {
                flags &= ~VM_SHARED;
                prot = vm_pgprot_modify(prot, flags);
        }

        /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
        WRITE_ONCE(vma->vm_page_prot, prot);
}

/*
 * check_brk_limits() - Validate a brk expansion range.
 * @addr: Start address of the range to check.
 * @len: Number of bytes the break is about to grow by.
 *
 * Asks get_unmapped_area() for exactly this range (MAP_FIXED) and then
 * verifies that the growth is acceptable to mlock_future_ok() given the
 * default flags of the current mm.
 *
 * Return: 0 on success, the error reported by get_unmapped_area(), or
 * -EAGAIN when mlock_future_ok() refuses the range.
 */
static int check_brk_limits(unsigned long addr, unsigned long len)
{
        unsigned long ret = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);

        if (IS_ERR_VALUE(ret))
                return ret;

        if (!mlock_future_ok(current->mm, current->mm->def_flags, len))
                return -EAGAIN;

        return 0;
}

/*
 * brk() system call: move the end of the data segment (mm->brk) to @brk.
 *
 * Returns @brk when the break was updated. On every failure after the lock
 * has been taken, mm->brk is reset to its value on entry and that value is
 * returned. -EINTR is returned only when mmap_write_lock_killable() fails.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
        unsigned long newbrk, oldbrk, origbrk;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *brkvma, *next = NULL;
        unsigned long min_brk;
        bool populate = false;
        LIST_HEAD(uf);
        struct vma_iterator vmi;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        /* Remembered so that the "out" path can restore and return it. */
        origbrk = mm->brk;

#ifdef CONFIG_COMPAT_BRK
        /*
         * CONFIG_COMPAT_BRK can still be overridden by setting
         * randomize_va_space to 2, which will still cause mm->start_brk
         * to be arbitrarily shifted
         */
        if (current->brk_randomized)
                min_brk = mm->start_brk;
        else
                min_brk = mm->end_data;
#else
        min_brk = mm->start_brk;
#endif
        if (brk < min_brk)
                goto out;

        /*
         * Check against rlimit here. If this check is done later, after the
         * test of oldbrk with newbrk, then it can escape the test and let the
         * data segment grow beyond its set limit in the case where the limit
         * is not page aligned -Ram Gupta
         */
        if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
                              mm->end_data, mm->start_data))
                goto out;

        /* Same page before and after: only the recorded break changes. */
        newbrk = PAGE_ALIGN(brk);
        oldbrk = PAGE_ALIGN(mm->brk);
        if (oldbrk == newbrk) {
                mm->brk = brk;
                goto success;
        }

        /* Always allow shrinking brk. */
        if (brk <= mm->brk) {
                /* Search one past newbrk */
                vma_iter_init(&vmi, mm, newbrk);
                brkvma = vma_find(&vmi, oldbrk);
                if (!brkvma || brkvma->vm_start >= oldbrk)
                        goto out; /* mapping intersects with an existing non-brk vma. */
                /*
                 * mm->brk must be protected by write mmap_lock.
                 * do_vmi_align_munmap() will drop the lock on success, so
                 * update mm->brk before calling it.
                 */
                mm->brk = brk;
                if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf,
                                        /* unlock = */ true))
                        goto out;

                goto success_unlocked;
        }

        if (check_brk_limits(oldbrk, newbrk - oldbrk))
                goto out;

        /*
         * Only check if the next VMA is within the stack_guard_gap of the
         * expansion area
         */
        vma_iter_init(&vmi, mm, oldbrk);
        next = vma_find(&vmi, newbrk + PAGE_SIZE + stack_guard_gap);
        if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
                goto out;

        brkvma = vma_prev_limit(&vmi, mm->start_brk);
        /* Ok, looks good - let it rip. */
        if (do_brk_flags(&vmi, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
                goto out;

        mm->brk = brk;
        /* The new range is populated below when the mm defaults to VM_LOCKED. */
        if (mm->def_flags & VM_LOCKED)
                populate = true;

success:
        mmap_write_unlock(mm);
success_unlocked:
        /* Reached directly by the shrink path, which does not unlock here. */
        userfaultfd_unmap_complete(mm, &uf);
        if (populate)
                mm_populate(oldbrk, newbrk - oldbrk);
        return brk;

out:
        /* Undo any early update of mm->brk and report the old break. */
        mm->brk = origbrk;
        mmap_write_unlock(mm);
        return origbrk;
}

/*
 * Page-align a hint address downwards. A non-zero hint that then lies below
 * mmap_min_addr is replaced by the page-aligned mmap_min_addr, i.e. the
 * lowest usable address; every other hint is returned as aligned.
 */
static inline unsigned long round_hint_to_min(unsigned long hint)
{
        hint &= PAGE_MASK;

        if (!hint || hint >= mmap_min_addr)
                return hint;

        return PAGE_ALIGN(mmap_min_addr);
}

/*
 * Decide whether @bytes more locked memory may be charged to @mm.
 *
 * Always true when @flags lacks VM_LOCKED or the caller has CAP_IPC_LOCK.
 * Otherwise the page count of @bytes plus mm->locked_vm must not exceed
 * RLIMIT_MEMLOCK expressed in pages.
 */
bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                        unsigned long bytes)
{
        unsigned long wanted, allowed;

        if (!(flags & VM_LOCKED))
                return true;
        if (capable(CAP_IPC_LOCK))
                return true;

        wanted = (bytes >> PAGE_SHIFT) + mm->locked_vm;
        allowed = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        return wanted <= allowed;
}

/*
 * Largest size, in bytes, that file_mmap_ok() accepts for a mapping of
 * @file. A return value of 0 makes file_mmap_ok() skip its length check.
 */
static inline u64 file_mmap_size_max(struct file *file, struct inode *inode)
{
        /* Regular files, block devices and sockets share the same limit. */
        if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode) ||
            S_ISSOCK(inode->i_mode))
                return MAX_LFS_FILESIZE;

        /* Special "we do even unsigned file positions" case */
        if (file->f_op->fop_flags & FOP_UNSIGNED_OFFSET)
                return 0;

        /* Yes, random drivers might want more. But I'm tired of buggy drivers */
        return ULONG_MAX;
}

/*
 * Check that a mapping of @len bytes starting at page offset @pgoff stays
 * within the limit file_mmap_size_max() reports for this file.
 */
static inline bool file_mmap_ok(struct file *file, struct inode *inode,
                                unsigned long pgoff, unsigned long len)
{
        u64 limit = file_mmap_size_max(file, inode);

        /* A zero limit skips the length check. */
        if (limit && len > limit)
                return false;

        /*
         * Room left after the mapping itself; with a zero limit this
         * unsigned subtraction wraps around.
         */
        limit -= len;

        return pgoff <= limit >> PAGE_SHIFT;
}

/**
 * do_mmap() - Perform a userland memory mapping into the current process
 * address space of length @len with protection bits @prot, mmap flags @flags
 * (from which VMA flags will be inferred), and any additional VMA flags to
 * apply @vm_flags. If this is a file-backed mapping then the file is specified
 * in @file and page offset into the file via @pgoff.
 *
 * This function does not perform security checks on the file and assumes, if
 * @uf is non-NULL, the caller has provided a list head to track unmap events
 * for userfaultfd @uf.
 *
 * It also simply indicates whether memory population is required by setting
 * @populate, which must be non-NULL, expecting the caller to actually perform
 * this task itself if appropriate.
 *
 * This function will invoke architecture-specific (and if provided and
 * relevant, file system-specific) logic to determine the most appropriate
 * unmapped area in which to place the mapping if not MAP_FIXED.
 *
 * Callers which require userland mmap() behaviour should invoke vm_mmap(),
 * which is also exported for module use.
 *
 * Those which require this behaviour less security checks, userfaultfd and
 * populate behaviour, and who handle the mmap write lock themselves, should
 * call this function.
 *
 * Note that the returned address may reside within a merged VMA if an
 * appropriate merge were to take place, so it doesn't necessarily specify the
 * start of a VMA, rather only the start of a valid mapped range of length
 * @len bytes, rounded down to the nearest page size.
 *
 * The caller must write-lock current->mm->mmap_lock.
 *
 * @file: An optional struct file pointer describing the file which is to be
 * mapped, if a file-backed mapping.
 * @addr: If non-zero, hints at (or if @flags has MAP_FIXED set, specifies) the
 * address at which to perform this mapping. See mmap (2) for details. Must be
 * page-aligned.
 * @len: The length of the mapping. Will be page-aligned and must be at least 1
 * page in size.
 * @prot: Protection bits describing access required to the mapping. See mmap
 * (2) for details.
 * @flags: Flags specifying how the mapping should be performed, see mmap (2)
 * for details.
 * @vm_flags: VMA flags which should be set by default, or 0 otherwise.
 * @pgoff: Page offset into the @file if file-backed, should be 0 otherwise.
 * @populate: A pointer to a value which will be set to 0 if no population of
 * the range is required, or the number of bytes to populate if it is. Must be
 * non-NULL. See mmap (2) for details as to under what circumstances population
 * of the range occurs.
 * @uf: An optional pointer to a list head to track userfaultfd unmap events
 * should unmapping events arise. If provided, it is up to the caller to manage
 * this.
 *
 * Returns: Either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, vm_flags_t vm_flags,
                        unsigned long pgoff, unsigned long *populate,
                        struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        int pkey = 0;

        /* Default to "nothing to populate"; only set at the very end. */
        *populate = 0;

        mmap_assert_write_locked(mm);

        if (!len)
                return -EINVAL;

        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
         *
         * (the exception is when the underlying filesystem is noexec
         *  mounted, in which case we don't add PROT_EXEC.)
         */
        if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
                if (!(file && path_noexec(&file->f_path)))
                        prot |= PROT_EXEC;

        /* force arch specific MAP_FIXED handling in get_unmapped_area */
        if (flags & MAP_FIXED_NOREPLACE)
                flags |= MAP_FIXED;

        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);

        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
        if (!len)
                return -ENOMEM;

        /* offset overflow? */
        if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
                return -EOVERFLOW;

        /* Too many mappings? */
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /*
         * addr is returned from get_unmapped_area,
         * There are two cases:
         * 1> MAP_FIXED == false
         *        unallocated memory, no need to check sealing.
         * 2> MAP_FIXED == true
         *        sealing is checked inside mmap_region when
         *        do_vmi_munmap is called.
         */

        /*
         * A request for exactly PROT_EXEC asks execute_only_pkey() for a
         * key; a negative result falls back to key 0.
         */
        if (prot == PROT_EXEC) {
                pkey = execute_only_pkey(mm);
                if (pkey < 0)
                        pkey = 0;
        }

        /* Do simple checking here so the lower-level routines won't have
         * to. we assume access permissions have been handled by the open
         * of the memory object, so we don't do any here.
         */
        vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) |
                        mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
        addr = __get_unmapped_area(file, addr, len, pgoff, flags, vm_flags);
        if (IS_ERR_VALUE(addr))
                return addr;

        /* MAP_FIXED_NOREPLACE must not land on top of an existing mapping. */
        if (flags & MAP_FIXED_NOREPLACE) {
                if (find_vma_intersection(mm, addr, addr + len))
                        return -EEXIST;
        }

        if (flags & MAP_LOCKED)
                if (!can_do_mlock())
                        return -EPERM;

        if (!mlock_future_ok(mm, vm_flags, len))
                return -EAGAIN;

        if (file) {
                struct inode *inode = file_inode(file);
                unsigned long flags_mask;
                int err;

                if (!file_mmap_ok(file, inode, pgoff, len))
                        return -EOVERFLOW;

                /* Flags accepted by MAP_SHARED_VALIDATE for this file. */
                flags_mask = LEGACY_MAP_MASK;
                if (file->f_op->fop_flags & FOP_MMAP_SYNC)
                        flags_mask |= MAP_SYNC;

                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        /*
                         * Force use of MAP_SHARED_VALIDATE with non-legacy
                         * flags. E.g. MAP_SYNC is dangerous to use with
                         * MAP_SHARED as you don't know which consistency model
                         * you will get. We silently ignore unsupported flags
                         * with MAP_SHARED to preserve backward compatibility.
                         */
                        flags &= LEGACY_MAP_MASK;
                        fallthrough;
                case MAP_SHARED_VALIDATE:
                        if (flags & ~flags_mask)
                                return -EOPNOTSUPP;
                        if (prot & PROT_WRITE) {
                                if (!(file->f_mode & FMODE_WRITE))
                                        return -EACCES;
                                if (IS_SWAPFILE(file->f_mapping->host))
                                        return -ETXTBSY;
                        }

                        /*
                         * Make sure we don't allow writing to an append-only
                         * file..
                         */
                        if (IS_APPEND(inode) && (file->f_mode & FMODE_WRITE))
                                return -EACCES;

                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        /* Not opened for writing: drop VM_MAYWRITE and VM_SHARED. */
                        if (!(file->f_mode & FMODE_WRITE))
                                vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
                        fallthrough;
                case MAP_PRIVATE:
                        if (!(file->f_mode & FMODE_READ))
                                return -EACCES;
                        if (path_noexec(&file->f_path)) {
                                if (vm_flags & VM_EXEC)
                                        return -EPERM;
                                vm_flags &= ~VM_MAYEXEC;
                        }

                        if (!file->f_op->mmap)
                                return -ENODEV;
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        break;

                default:
                        return -EINVAL;
                }

                /*
                 * Check to see if we are violating any seals and update VMA
                 * flags if necessary to avoid future seal violations.
                 */
                err = memfd_check_seals_mmap(file, &vm_flags);
                if (err)
                        return (unsigned long)err;
        } else {
                switch (flags & MAP_TYPE) {
                case MAP_SHARED:
                        if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
                                return -EINVAL;
                        /*
                         * Ignore pgoff.
                         */
                        pgoff = 0;
                        vm_flags |= VM_SHARED | VM_MAYSHARE;
                        break;
                case MAP_DROPPABLE:
                        if (VM_DROPPABLE == VM_NONE)
                                return -ENOTSUPP;
                        /*
                         * A locked or stack area makes no sense to be droppable.
                         *
                         * Also, since droppable pages can just go away at any time
                         * it makes no sense to copy them on fork or dump them.
                         *
                         * And don't attempt to combine with hugetlb for now.
                         */
                        if (flags & (MAP_LOCKED | MAP_HUGETLB))
                                return -EINVAL;
                        if (vm_flags & (VM_GROWSDOWN | VM_GROWSUP))
                                return -EINVAL;

                        vm_flags |= VM_DROPPABLE;

                        /*
                         * If the pages can be dropped, then it doesn't make
                         * sense to reserve them.
                         */
                        vm_flags |= VM_NORESERVE;

                        /*
                         * Likewise, they're volatile enough that they
                         * shouldn't survive forks or coredumps.
                         */
                        vm_flags |= VM_WIPEONFORK | VM_DONTDUMP;
                        fallthrough;
                case MAP_PRIVATE:
                        /*
                         * Set pgoff according to addr for anon_vma.
                         */
                        pgoff = addr >> PAGE_SHIFT;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /*
         * Set 'VM_NORESERVE' if we should not account for the
         * memory use of this mapping.
         */
        if (flags & MAP_NORESERVE) {
                /* We honor MAP_NORESERVE if allowed to overcommit */
                if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
                        vm_flags |= VM_NORESERVE;

                /* hugetlb applies strict overcommit unless MAP_NORESERVE */
                if (file && is_file_hugepages(file))
                        vm_flags |= VM_NORESERVE;
        }

        addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
        /*
         * On success, ask the caller to populate the whole range when it is
         * locked, or when MAP_POPULATE was given without MAP_NONBLOCK.
         */
        if (!IS_ERR_VALUE(addr) &&
            ((vm_flags & VM_LOCKED) ||
             (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
                *populate = len;
        return addr;
}

/*
 * Resolve the file backing an mmap request, then hand over to
 * vm_mmap_pgoff().
 *
 * Without MAP_ANONYMOUS the file is looked up from @fd. With MAP_ANONYMOUS
 * and MAP_HUGETLB, a hugetlb file is set up for the mapping. In both cases
 * @len is rounded up to the huge page size when the file is a hugetlb file.
 * Any file reference obtained here is dropped before returning.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                              unsigned long prot, unsigned long flags,
                              unsigned long fd, unsigned long pgoff)
{
        struct file *file = NULL;
        unsigned long retval;

        if (flags & MAP_ANONYMOUS) {
                if (flags & MAP_HUGETLB) {
                        struct hstate *hs;

                        hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                        if (!hs)
                                return -EINVAL;

                        len = ALIGN(len, huge_page_size(hs));
                        /*
                         * VM_NORESERVE is used because the reservations will be
                         * taken when vm_ops->mmap() is called
                         */
                        file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                        VM_NORESERVE,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                        if (IS_ERR(file))
                                return PTR_ERR(file);
                }
        } else {
                audit_mmap_fd(fd, flags);
                file = fget(fd);
                if (!file)
                        return -EBADF;

                if (is_file_hugepages(file)) {
                        len = ALIGN(len, huge_page_size(hstate_file(file)));
                } else if (unlikely(flags & MAP_HUGETLB)) {
                        /* MAP_HUGETLB on a file that is not a hugetlb file. */
                        fput(file);
                        return -EINVAL;
                }
        }

        retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        if (file)
                fput(file);

        return retval;
}

/*
 * mmap_pgoff() system call: forwards all six arguments unchanged to
 * ksys_mmap_pgoff() and returns its result.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
{
        return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

#ifdef __ARCH_WANT_SYS_OLD_MMAP
/* Argument block that old_mmap() copies in from user space in one piece. */
struct mmap_arg_struct {
        unsigned long addr;
        unsigned long len;
        unsigned long prot;
        unsigned long flags;
        unsigned long fd;
        unsigned long offset;
};

/*
 * old_mmap() system call: takes its arguments through a user-space struct,
 * converts the byte offset into a page offset and calls ksys_mmap_pgoff().
 *
 * Returns -EFAULT if the struct cannot be copied in, and -EINVAL if
 * offset_in_page() is non-zero for the offset.
 */
SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
{
        struct mmap_arg_struct args;

        if (copy_from_user(&args, arg, sizeof(args)))
                return -EFAULT;

        if (offset_in_page(args.offset))
                return -EINVAL;

        return ksys_mmap_pgoff(args.addr, args.len, args.prot, args.flags,
                               args.fd, args.offset >> PAGE_SHIFT);
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */

/*
 * Determine if the allocation needs to ensure that there is no
 * existing mapping within its guard gaps, for use as start_gap:
 * PAGE_SIZE for VM_SHADOW_STACK mappings, 0 for everything else.
 */
static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
{
        return (vm_flags & VM_SHADOW_STACK) ? PAGE_SIZE : 0;
}

/*
 * Search for an unmapped address range.
 *
 * The range being looked for:
 * - does not intersect with any VMA;
 * - is contained within the [low_limit, high_limit) interval;
 * - is at least the desired size;
 * - satisfies (begin_addr & align_mask) == (align_offset & align_mask).
 *
 * VM_UNMAPPED_AREA_TOPDOWN in info->flags selects unmapped_area_topdown()
 * over unmapped_area(). The result is traced before it is returned.
 */
unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long found;

        found = (info->flags & VM_UNMAPPED_AREA_TOPDOWN) ?
                        unmapped_area_topdown(info) : unmapped_area(info);

        trace_vm_unmapped_area(found, info);
        return found;
}

/*
 * Pick an address range that is not mapped at the moment.
 * For shmat() with addr=0.
 *
 * Ugly calling convention alert:
 * a return value with any of the low (sub-page) bits set is an error code,
 * i.e. callers do
 *        if (ret & ~PAGE_MASK)
 *                error = ret;
 *
 * This function "knows" that -ENOMEM has those bits set.
 *
 * MAP_FIXED requests get @addr back untouched. A non-zero @addr is treated
 * as a hint and used, page aligned, when the range fits and keeps clear of
 * the guard gaps of both neighbouring VMAs. Otherwise the search runs from
 * mm->mmap_base up to the arch-provided mmap end.
 */
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags, vm_flags_t vm_flags)
{
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
        struct vm_unmapped_area_info info = {};
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *next, *prev;

        /* The request cannot fit between mmap_min_addr and mmap_end at all. */
        if (mmap_end - mmap_min_addr < len)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                next = find_vma_prev(mm, addr, &prev);

                /* Take the hint if it is in bounds and clear on both sides. */
                if (addr <= mmap_end - len && addr >= mmap_min_addr &&
                    !(next && addr + len > vm_start_gap(next)) &&
                    !(prev && addr < vm_end_gap(prev)))
                        return addr;
        }

        info.length = len;
        info.low_limit = mm->mmap_base;
        info.high_limit = mmap_end;
        info.start_gap = stack_guard_placement(vm_flags);
        if (filp && is_file_hugepages(filp))
                info.align_mask = huge_page_mask_align(filp);

        return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA
/*
 * Default arch_get_unmapped_area(), built only when the architecture does
 * not define HAVE_ARCH_UNMAPPED_AREA. It forwards every argument to
 * generic_get_unmapped_area() and returns its result.
 */
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags, vm_flags_t vm_flags)
{
        return generic_get_unmapped_area(filp, addr, len, pgoff, flags,
                                         vm_flags);
}
#endif

/*
 * This mmap-allocator hands out new areas top-down, starting below the
 * stack's low limit (the base).
 *
 * MAP_FIXED requests get @addr back untouched, and a usable non-zero @addr
 * hint is honoured after page alignment. Otherwise a top-down search is run
 * between PAGE_SIZE and the arch mmap base; if that fails, a bottom-up search
 * from TASK_UNMAPPED_BASE to the arch mmap end is tried as a last resort.
 */
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags, vm_flags_t vm_flags)
{
        const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
        struct vm_unmapped_area_info info = {};
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *next, *prev;

        /* The requested length is too big for the entire address space. */
        if (mmap_end - mmap_min_addr < len)
                return -ENOMEM;

        if (flags & MAP_FIXED)
                return addr;

        /* The caller asked for a specific address. */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                next = find_vma_prev(mm, addr, &prev);

                /* Take the hint if it is in bounds and clear on both sides. */
                if (addr <= mmap_end - len && addr >= mmap_min_addr &&
                    !(next && addr + len > vm_start_gap(next)) &&
                    !(prev && addr < vm_end_gap(prev)))
                        return addr;
        }

        info.flags = VM_UNMAPPED_AREA_TOPDOWN;
        info.length = len;
        info.low_limit = PAGE_SIZE;
        info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
        info.start_gap = stack_guard_placement(vm_flags);
        if (filp && is_file_hugepages(filp))
                info.align_mask = huge_page_mask_align(filp);

        addr = vm_unmapped_area(&info);
        if (!offset_in_page(addr))
                return addr;

        /*
         * A failed mmap() very likely makes the application fail, so retry
         * with the bottom-up search. This can happen with large stack limits
         * combined with large mmap() allocations.
         */
        VM_BUG_ON(addr != -ENOMEM);
        info.flags = 0;
        info.low_limit = TASK_UNMAPPED_BASE;
        info.high_limit = mmap_end;

        return vm_unmapped_area(&info);
}

#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
/*
 * Default arch_get_unmapped_area_topdown(), built only when the architecture
 * does not define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN. It forwards every argument
 * to generic_get_unmapped_area_topdown() and returns its result.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags, vm_flags_t vm_flags)
{
        return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags,
                                                 vm_flags);
}
#endif

/*
 * Dispatch an unmapped-area search to the arch helper matching @mm's layout:
 * the top-down variant when MMF_TOPDOWN is set in mm->flags, the regular one
 * otherwise. @vm_flags is passed through to the chosen helper.
 */
unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *filp,
                                           unsigned long addr, unsigned long len,
                                           unsigned long pgoff, unsigned long flags,
                                           vm_flags_t vm_flags)
{
        unsigned long area;

        if (test_bit(MMF_TOPDOWN, &mm->flags))
                area = arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
                                                      flags, vm_flags);
        else
                area = arch_get_unmapped_area(filp, addr, len, pgoff, flags,
                                              vm_flags);

        return area;
}

/*
 * Choose the address for a new mapping of @len bytes.
 *
 * The search is delegated, in order of preference, to the file's own
 * get_unmapped_area operation, to shmem's for anonymous shared mappings, to
 * the THP-aware helper for hint-less PMD-aligned anonymous mappings, or to
 * the mm's default helper. The chosen address is then range-checked and
 * passed through security_mmap_addr().
 *
 * Returns the address on success or a negative error code cast to
 * unsigned long.
 */
unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
{
        unsigned long (*get_area)(struct file *, unsigned long,
                                  unsigned long, unsigned long, unsigned long);
        unsigned long err;

        err = arch_mmap_check(addr, len, flags);
        if (err)
                return err;

        /* Careful about overflows.. */
        if (len > TASK_SIZE)
                return -ENOMEM;

        if (file) {
                /* May be NULL, in which case the generic paths below apply. */
                get_area = file->f_op->get_unmapped_area;
        } else {
                /* Always treat pgoff as zero for anonymous memory. */
                pgoff = 0;

                /*
                 * mmap_region() will call shmem_zero_setup() to create a file,
                 * so use shmem's get_unmapped_area in case it can be huge.
                 */
                if (flags & MAP_SHARED)
                        get_area = shmem_get_unmapped_area;
                else
                        get_area = NULL;
        }

        if (get_area)
                addr = get_area(file, addr, len, pgoff, flags);
        else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && !file
                 && !addr /* no hint */
                 && IS_ALIGNED(len, PMD_SIZE))
                /* Ensures that larger anonymous mappings are THP aligned. */
                addr = thp_get_unmapped_area_vmflags(file, addr, len,
                                                     pgoff, flags, vm_flags);
        else
                addr = mm_get_unmapped_area_vmflags(current->mm, file, addr, len,
                                                    pgoff, flags, vm_flags);

        if (IS_ERR_VALUE(addr))
                return addr;

        if (addr > TASK_SIZE - len)
                return -ENOMEM;
        if (offset_in_page(addr))
                return -EINVAL;

        err = security_mmap_addr(addr);
        if (err)
                return err;

        return addr;
}

/*
 * Same dispatch as the vm_flags-aware variant, but always with vm_flags of 0:
 * the top-down arch helper is used when MMF_TOPDOWN is set in mm->flags, the
 * regular arch helper otherwise.
 */
unsigned long
mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
                     unsigned long addr, unsigned long len,
                     unsigned long pgoff, unsigned long flags)
{
        unsigned long area;

        if (test_bit(MMF_TOPDOWN, &mm->flags))
                area = arch_get_unmapped_area_topdown(file, addr, len, pgoff,
                                                      flags, 0);
        else
                area = arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);

        return area;
}
EXPORT_SYMBOL(mm_get_unmapped_area);

/**
 * find_vma_intersection() - Look up the first VMA which intersects the interval
 * @mm: The process address space.
 * @start_addr: The inclusive start user address.
 * @end_addr: The exclusive end user address.
 *
 * The caller must hold the mmap lock; this is asserted. Assumes
 * start_addr < end_addr.
 *
 * Returns: The first VMA within the provided range, %NULL otherwise.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                                             unsigned long start_addr,
                                             unsigned long end_addr)
{
        /* mt_find() takes an inclusive upper bound and a cursor it may update. */
        const unsigned long last = end_addr - 1;
        unsigned long cursor = start_addr;

        mmap_assert_locked(mm);

        return mt_find(&mm->mm_mt, &cursor, last);
}
EXPORT_SYMBOL(find_vma_intersection);

/**
 * find_vma() - Find the VMA for a given address, or the next VMA.
 * @mm: The mm_struct to check
 * @addr: The address
 *
 * The caller must hold the mmap lock; this is asserted.
 *
 * Returns: The VMA associated with addr, or the next VMA.
 * May return %NULL in the case of no VMA at addr or above.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
        unsigned long cursor = addr;

        mmap_assert_locked(mm);

        /* Search from @addr up to the very top of the address space. */
        return mt_find(&mm->mm_mt, &cursor, ULONG_MAX);
}
EXPORT_SYMBOL(find_vma);

/**
 * find_vma_prev() - Find the VMA for a given address, or the next vma and
 * set %pprev to the previous VMA, if any.
 * @mm: The mm_struct to check
 * @addr: The address
 * @pprev: The pointer to set to the previous VMA
 *
 * Note that RCU lock is missing here since the external mmap_lock() is used
 * instead.
 *
 * Returns: The VMA associated with @addr, or the next vma.
 * May return %NULL in the case of no vma at addr or above.
 */
struct vm_area_struct *
find_vma_prev(struct mm_struct *mm, unsigned long addr,
                        struct vm_area_struct **pprev)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, addr);

        /* Whatever is stored at @addr itself, if anything. */
        vma = vma_iter_load(&vmi);
        /* @pprev is always written, even when no VMA covers @addr. */
        *pprev = vma_prev(&vmi);
        /*
         * Nothing at @addr: fall back to the following VMA.
         * NOTE(review): relies on vma_next() stepping past @addr after the
         * vma_prev() call above - confirm against the iterator helpers.
         */
        if (!vma)
                vma = vma_next(&vmi);
        return vma;
}

/*
 * Enforced gap between the expanding stack and other mappings.
 * Defaults to 256 pages; the "stack_guard_gap=" boot parameter replaces it
 * with a caller-supplied page count.
 */
unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;

/*
 * Handler for the "stack_guard_gap=" boot parameter.
 *
 * @p is parsed as a decimal page count. The new gap (count << PAGE_SHIFT) is
 * stored only when the parse stopped at the end of the string; otherwise
 * stack_guard_gap keeps its current value. Always returns 1.
 */
static int __init cmdline_parse_stack_guard_gap(char *p)
{
        char *end;
        unsigned long pages = simple_strtoul(p, &end, 10);

        if (*end == '\0')
                stack_guard_gap = pages << PAGE_SHIFT;

        return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);

#ifdef CONFIG_STACK_GROWSUP
/*
 * CONFIG_STACK_GROWSUP variant: grow @vma to cover @address by delegating to
 * expand_upwards(), returning its result.
 * NOTE(review): the "_locked" suffix suggests the caller already holds the
 * mmap lock - confirm against callers.
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        return expand_upwards(vma, address);
}

/*
 * CONFIG_STACK_GROWSUP variant: return the VMA containing @addr (rounded down
 * to a page boundary), growing the preceding VMA upwards when no VMA covers
 * the address yet.
 *
 * Returns NULL when there is no preceding VMA or it cannot be expanded. When
 * the expanded VMA is VM_LOCKED, the range from @addr to its end is populated.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *covering, *below;

        addr &= PAGE_MASK;
        covering = find_vma_prev(mm, addr, &below);
        if (covering && covering->vm_start <= addr)
                return covering;

        /* Nothing covers @addr: the only option is to grow the VMA below it. */
        if (!below || expand_stack_locked(below, addr))
                return NULL;

        if (below->vm_flags & VM_LOCKED)
                populate_vma_page_range(below, addr, below->vm_end, NULL);

        return below;
}
#else
/*
 * Downward-growing stack variant (CONFIG_STACK_GROWSUP not set): grow @vma to
 * cover @address by delegating to expand_downwards(), returning its result.
 * NOTE(review): the "_locked" suffix suggests the caller already holds the
 * mmap lock - confirm against callers.
 */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address)
{
        return expand_downwards(vma, address);
}

/*
 * Downward-growing stack variant: return the VMA containing @addr (rounded
 * down to a page boundary), growing the next VMA downwards when the address
 * falls just below it.
 *
 * Returns NULL when no VMA lies at or above @addr, or when the expansion
 * fails. When the expanded VMA is VM_LOCKED, the newly added range between
 * @addr and the VMA's previous start is populated.
 */
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma;
        unsigned long old_start;

        addr &= PAGE_MASK;
        vma = find_vma(mm, addr);

        /* Either nothing was found (NULL) or @addr is already covered. */
        if (!vma || vma->vm_start <= addr)
                return vma;

        /* Remember where the VMA began before it is grown. */
        old_start = vma->vm_start;
        if (expand_stack_locked(vma, addr))
                return NULL;

        if (vma->vm_flags & VM_LOCKED)
                populate_vma_page_range(vma, addr, old_start, NULL);

        return vma;
}
#endif

/*
 * Direction helpers for expand_stack(): exactly one of them performs a real
 * expansion, the other always evaluates to -EFAULT, depending on which way
 * the stack grows in this configuration.
 */
#if defined(CONFIG_STACK_GROWSUP)

#define vma_expand_up(vma,addr) expand_upwards(vma, addr)
#define vma_expand_down(vma, addr) (-EFAULT)

#else

#define vma_expand_up(vma,addr) (-EFAULT)
#define vma_expand_down(vma, addr) expand_downwards(vma, addr)

#endif

/*
 * expand_stack(): legacy interface for page faulting. Don't use unless
 * you have to.
 *
 * This is called with the mm locked for reading, drops the lock, takes
 * the lock for writing, tries to look up a vma again, expands it if
 * necessary, and downgrades the lock to reading again.
 *
 * If no vma is found or it can't be expanded, it returns NULL and has
 * dropped the lock. NULL is also returned, with no lock held, when taking
 * the write lock fails.
 */
struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma, *prev;

        /* Swap the caller's read lock for the write lock. */
        mmap_read_unlock(mm);
        if (mmap_write_lock_killable(mm))
                return NULL;

        /* The lock was released in between, so the lookup is redone here. */
        vma = find_vma_prev(mm, addr, &prev);
        if (vma && vma->vm_start <= addr)
                goto success;

        /* Only one of the two expand helpers below can ever return 0. */
        if (prev && !vma_expand_up(prev, addr)) {
                vma = prev;
                goto success;
        }

        if (vma && !vma_expand_down(vma, addr))
                goto success;

        mmap_write_unlock(mm);
        return NULL;

success:
        mmap_write_downgrade(mm);
        return vma;
}

/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length to be munmapped.
 * @uf: The userfaultfd list_head
 *
 * Sets up a VMA iterator positioned at @start and hands the work to
 * do_vmi_munmap(), passing false as its final argument.
 *
 * Return: 0 on success, error otherwise.
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
              struct list_head *uf)
{
        VMA_ITERATOR(vmi, mm, start);

        return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}

/*
 * Exported unmap helper: calls __vm_munmap() for @len bytes at @start and
 * returns its result. The last argument is false here, whereas the munmap()
 * system call passes true; its meaning is defined by __vm_munmap().
 */
int vm_munmap(unsigned long start, size_t len)
{
        return __vm_munmap(start, len, false);
}
EXPORT_SYMBOL(vm_munmap);

/*
 * munmap() system call: passes @addr through untagged_addr(), then unmaps
 * @len bytes via __vm_munmap() and returns its result.
 */
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
        addr = untagged_addr(addr);
        return __vm_munmap(addr, len, true);
}


/*
 * Emulation of deprecated remap_file_pages() syscall.
 *
 * The request is turned into a MAP_SHARED | MAP_FIXED | MAP_POPULATE
 * do_mmap() of the file already mapped at @start, at page offset @pgoff.
 * @prot must be zero on entry; the protection actually used is derived from
 * the existing VMA's flags. Of the caller's @flags only MAP_NONBLOCK is kept.
 * @start and @size are rounded down to page boundaries.
 *
 * Returns 0 on success or a negative error code: -EINVAL for bad arguments,
 * a missing or non-shared VMA, or a mapping that changed or is not uniform
 * across the range; -EINTR if taking the mmap lock failed; otherwise the
 * error from security_mmap_file() or do_mmap().
 */
SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
                unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
{

        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long populate = 0;
        unsigned long ret = -EINVAL;
        struct file *file;
        vm_flags_t vm_flags;

        pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
                     current->comm, current->pid);

        if (prot)
                return ret;
        start = start & PAGE_MASK;
        size = size & PAGE_MASK;

        /* Rejects both a zero size and a range that wraps around. */
        if (start + size <= start)
                return ret;

        /* Does pgoff wrap? */
        if (pgoff + (size >> PAGE_SHIFT) < pgoff)
                return ret;

        if (mmap_read_lock_killable(mm))
                return -EINTR;

        /*
         * Look up VMA under read lock first so we can perform the security
         * without holding locks (which can be problematic). We reacquire a
         * write lock later and check nothing changed underneath us.
         */
        vma = vma_lookup(mm, start);

        if (!vma || !(vma->vm_flags & VM_SHARED)) {
                mmap_read_unlock(mm);
                return -EINVAL;
        }

        /* prot is known to be zero here; rebuild it from the VMA's flags. */
        prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
        prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
        prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;

        flags &= MAP_NONBLOCK;
        flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;

        /* Save vm_flags used to calculate prot and flags, and recheck later. */
        vm_flags = vma->vm_flags;
        /* Hold a file reference across the unlocked window; dropped via fput(). */
        file = get_file(vma->vm_file);

        mmap_read_unlock(mm);

        /* Call outside mmap_lock to be consistent with other callers. */
        ret = security_mmap_file(file, prot, flags);
        if (ret) {
                fput(file);
                return ret;
        }

        /* Default result for every "goto out" below. */
        ret = -EINVAL;

        /* OK security check passed, take write lock + let it rip. */
        if (mmap_write_lock_killable(mm)) {
                fput(file);
                return -EINTR;
        }

        vma = vma_lookup(mm, start);

        if (!vma)
                goto out;

        /* Make sure things didn't change under us. */
        if (vma->vm_flags != vm_flags)
                goto out;
        if (vma->vm_file != file)
                goto out;

        /*
         * The range extends past the first VMA: every following VMA up to
         * start + size must be adjacent and share the same file and flags.
         */
        if (start + size > vma->vm_end) {
                VMA_ITERATOR(vmi, mm, vma->vm_end);
                struct vm_area_struct *next, *prev = vma;

                for_each_vma_range(vmi, next, start + size) {
                        /* hole between vmas ? */
                        if (next->vm_start != prev->vm_end)
                                goto out;

                        if (next->vm_file != vma->vm_file)
                                goto out;

                        if (next->vm_flags != vma->vm_flags)
                                goto out;

                        if (start + size <= next->vm_end)
                                break;

                        prev = next;
                }

                /* The walk ended before a VMA reaching start + size was found. */
                if (!next)
                        goto out;
        }

        ret = do_mmap(vma->vm_file, start, size,
                        prot, flags, 0, pgoff, &populate, NULL);
out:
        mmap_write_unlock(mm);
        fput(file);
        /* populate stays 0 unless do_mmap() set it. */
        if (populate)
                mm_populate(ret, populate);
        /* On success ret holds the mapped address; the syscall reports 0. */
        if (!IS_ERR_VALUE(ret))
                ret = 0;
        return ret;
}

/*
 * vm_brk_flags() - set up a brk-style mapping of @request bytes at @addr.
 * @addr: start address of the range
 * @request: requested length in bytes, rounded up to whole pages
 * @flags: extra vm flags; anything other than VM_EXEC is refused
 *
 * Takes the mmap write lock, unmaps whatever currently occupies the range
 * and then calls do_brk_flags(). When VM_LOCKED is set in mm->def_flags and
 * the mapping succeeded, the range is populated after the lock is dropped.
 *
 * Return: 0 on success (including a zero-length request), -ENOMEM if rounding
 * @request up overflows, -EINVAL for unsupported @flags, -EINTR if the write
 * lock could not be taken, or the error reported by check_brk_limits(),
 * do_vmi_munmap() or do_brk_flags().
 */
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        unsigned long len;
        int ret;
        bool populate;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, addr);

        len = PAGE_ALIGN(request);
        /* PAGE_ALIGN() wrapped around. */
        if (len < request)
                return -ENOMEM;
        if (!len)
                return 0;

        /* Until we need other flags, refuse anything except VM_EXEC. */
        if ((flags & (~VM_EXEC)) != 0)
                return -EINVAL;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = check_brk_limits(addr, len);
        if (ret)
                goto limits_failed;

        ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
        if (ret)
                goto munmap_failed;

        /* The VMA preceding the iterator position is handed to do_brk_flags(). */
        vma = vma_prev(&vmi);
        ret = do_brk_flags(&vmi, vma, addr, len, flags);
        /* Sample def_flags while the lock is still held. */
        populate = ((mm->def_flags & VM_LOCKED) != 0);
        mmap_write_unlock(mm);
        userfaultfd_unmap_complete(mm, &uf);
        if (populate && !ret)
                mm_populate(addr, len);
        return ret;

        /* Both failure paths only need to drop the lock. */
munmap_failed:
limits_failed:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(vm_brk_flags);

/*
 * Release all mmaps.
 *
 * Tears down every VMA of @mm in three passes: unmap the pages under the
 * mmap read lock, free the page tables under the write lock, then remove the
 * VMAs one by one. Finally the maple tree is destroyed and the memory that
 * was accounted via VM_ACCOUNT is released with vm_unacct_memory().
 */
void exit_mmap(struct mm_struct *mm)
{
        struct mmu_gather tlb;
        struct vm_area_struct *vma;
        unsigned long nr_accounted = 0;
        VMA_ITERATOR(vmi, mm, 0);
        int count = 0;

        /* mm's last user has gone, and it's about to be pulled down */
        mmu_notifier_release(mm);

        mmap_read_lock(mm);
        arch_exit_mmap(mm);

        vma = vma_next(&vmi);
        if (!vma || unlikely(xa_is_zero(vma))) {
                /* Can happen if dup_mmap() received an OOM */
                mmap_read_unlock(mm);
                /* The destroy path below expects the write lock to be held. */
                mmap_write_lock(mm);
                goto destroy;
        }

        flush_cache_mm(mm);
        tlb_gather_mmu_fullmm(&tlb, mm);
        /* update_hiwater_rss(mm) here? but nobody should be looking */
        /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
        unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
        mmap_read_unlock(mm);

        /*
         * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper
         * because the memory has been already freed.
         */
        set_bit(MMF_OOM_SKIP, &mm->flags);
        mmap_write_lock(mm);
        mt_clear_in_rcu(&mm->mm_mt);
        /* Reposition the iterator just past the first VMA before each walk. */
        vma_iter_set(&vmi, vma->vm_end);
        free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
                      USER_PGTABLES_CEILING, true);
        tlb_finish_mmu(&tlb);

        /*
         * Walk the list again, actually closing and freeing it, with preemption
         * enabled, without holding any MM locks besides the unreachable
         * mmap_write_lock.
         */
        vma_iter_set(&vmi, vma->vm_end);
        do {
                if (vma->vm_flags & VM_ACCOUNT)
                        nr_accounted += vma_pages(vma);
                vma_mark_detached(vma);
                remove_vma(vma);
                count++;
                cond_resched();
                vma = vma_next(&vmi);
        } while (vma && likely(!xa_is_zero(vma)));

        /* Every VMA counted in map_count must have been visited. */
        BUG_ON(count != mm->map_count);

        trace_exit_mmap(mm);
destroy:
        __mt_destroy(&mm->mm_mt);
        mmap_write_unlock(mm);
        vm_unacct_memory(nr_accounted);
}

/*
 * Insert vm structure into process list sorted by address
 * and into the inode's i_mmap tree.  If vm_file is non-NULL
 * then i_mmap_rwsem is taken here.
 *
 * Returns 0 on success, or -ENOMEM when the range overlaps an existing VMA,
 * the memory accounting check fails, or linking the VMA fails.
 */
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
{
        unsigned long nr_pages = vma_pages(vma);

        /* The new VMA must not overlap anything already mapped. */
        if (find_vma_intersection(mm, vma->vm_start, vma->vm_end))
                return -ENOMEM;

        /* Accounted mappings are charged up front. */
        if (vma->vm_flags & VM_ACCOUNT) {
                if (security_vm_enough_memory_mm(mm, nr_pages))
                        return -ENOMEM;
        }

        /*
         * For a purely anonymous vma, vm_pgoff should not matter until the
         * first write fault sets the page's anon_vma and index. Set it now
         * to the value it will almost certainly end up with (unless mremap
         * moves the vma before that first fault), so that /proc/pid/maps
         * tells a consistent story.
         *
         * Making it mirror the virtual start address lets merges and splits
         * reuse the existing file pgoff checks and manipulations unchanged.
         * Similarly in do_mmap and in do_brk_flags.
         */
        if (vma_is_anonymous(vma)) {
                BUG_ON(vma->anon_vma);
                vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT;
        }

        if (!vma_link(mm, vma))
                return 0;

        /* Linking failed: give back the charge taken above. */
        if (vma->vm_flags & VM_ACCOUNT)
                vm_unacct_memory(nr_pages);

        return -ENOMEM;
}

/*
 * Tell whether the calling process may grow its vm space by @npages pages.
 *
 * The total is checked against RLIMIT_AS. Data mappings are additionally
 * checked against RLIMIT_DATA; exceeding it triggers a one-time warning and
 * a refusal unless ignore_rlimit_data is set.
 */
bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
{
        if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
                return false;

        /* Only data mappings are subject to the RLIMIT_DATA check. */
        if (!is_data_mapping(flags))
                return true;

        if (mm->data_vm + npages <= rlimit(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        /* Workaround for Valgrind */
        if (rlimit(RLIMIT_DATA) == 0 &&
            mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
                return true;

        pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits%s.\n",
                     current->comm, current->pid,
                     (mm->data_vm + npages) << PAGE_SHIFT,
                     rlimit(RLIMIT_DATA),
                     ignore_rlimit_data ? "" : " or use boot option ignore_rlimit_data");

        /* Over the limit: allowed only when the limit is being ignored. */
        if (ignore_rlimit_data)
                return true;

        return false;
}

/*
 * Add @npages (which may be negative) to mm->total_vm and to at most one of
 * the exec/stack/data counters, chosen from @flags in that order of priority.
 */
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
        WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);

        if (is_exec_mapping(flags)) {
                mm->exec_vm += npages;
                return;
        }

        if (is_stack_mapping(flags)) {
                mm->stack_vm += npages;
                return;
        }

        if (is_data_mapping(flags))
                mm->data_vm += npages;
}

/* Declared early: special_mapping_vmops refers to it before its definition. */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);

/*
 * Close hook, called for unmap() and on the old vma for mremap().
 *
 * Having a close hook prevents vma merging regardless of flags.
 *
 * Forwards to the special mapping's own close callback when it has one.
 */
static void special_mapping_close(struct vm_area_struct *vma)
{
        const struct vm_special_mapping *spec = vma->vm_private_data;

        if (!spec->close)
                return;

        spec->close(spec, vma);
}

static const char *special_mapping_name(struct vm_area_struct *vma)
{
        return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}

/*
 * mremap hook for special mappings.
 *
 * Refuses with -EFAULT (and warns once) when @new_vma does not belong to the
 * current task's mm. Otherwise defers to the mapping's own mremap callback if
 * it has one, and returns 0 if it does not.
 */
static int special_mapping_mremap(struct vm_area_struct *new_vma)
{
        struct vm_special_mapping *spec = new_vma->vm_private_data;

        if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
                return -EFAULT;

        return spec->mremap ? spec->mremap(spec, new_vma) : 0;
}

/*
 * may_split hook for special mappings: unconditionally returns -EINVAL,
 * ignoring both @vma and @addr.
 */
static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
{
        /*
         * Forbid splitting special mappings - kernel has expectations over
         * the number of pages in mapping. Together with VM_DONTEXPAND
         * the size of vma should stay the same over the special mapping's
         * lifetime.
         */
        return -EINVAL;
}

/*
 * vm_operations shared by all special mappings. Each hook finds the
 * mapping's struct vm_special_mapping through vma->vm_private_data.
 */
static const struct vm_operations_struct special_mapping_vmops = {
        .close = special_mapping_close,
        .fault = special_mapping_fault,
        .mremap = special_mapping_mremap,
        .name = special_mapping_name,
        /* vDSO code relies that VVAR can't be accessed remotely */
        .access = NULL,
        .may_split = special_mapping_split,
};

/*
 * Fault hook for special mappings.
 *
 * A mapping with its own fault callback handles the fault itself. Otherwise
 * the page is looked up by walking the mapping's NULL-terminated pages array
 * to index vmf->pgoff; a fault beyond the last supplied page yields
 * VM_FAULT_SIGBUS. On success a reference is taken on the page, it is stored
 * in vmf->page and 0 is returned.
 */
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct vm_special_mapping *spec = vma->vm_private_data;
        struct page **slot;
        pgoff_t remaining;

        if (spec->fault)
                return spec->fault(spec, vmf->vma, vmf);

        /* Step forward one slot per page offset, stopping at the terminator. */
        slot = spec->pages;
        remaining = vmf->pgoff;
        while (remaining && *slot) {
                remaining--;
                ++slot;
        }

        if (!*slot)
                return VM_FAULT_SIGBUS;

        get_page(*slot);
        vmf->page = *slot;
        return 0;
}

/*
 * Allocate a VMA covering [addr, addr + len) in @mm, wire it up with @ops and
 * @priv, and insert it into the address space.
 *
 * The VMA's flags are @vm_flags plus mm->def_flags, VM_DONTEXPAND and
 * VM_SOFTDIRTY, with the VM_LOCKED_MASK bits cleared.
 *
 * Returns the new VMA, or an ERR_PTR(): -ENOMEM if the allocation fails,
 * otherwise the error from insert_vm_struct(), in which case the VMA has been
 * freed again.
 */
static struct vm_area_struct *__install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, void *priv,
        const struct vm_operations_struct *ops)
{
        struct vm_area_struct *vma = vm_area_alloc(mm);
        int err;

        if (unlikely(!vma))
                return ERR_PTR(-ENOMEM);

        vma_set_range(vma, addr, addr + len, 0);
        vm_flags_init(vma, (vm_flags | mm->def_flags |
                      VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        vma->vm_private_data = priv;
        vma->vm_ops = ops;

        err = insert_vm_struct(mm, vma);
        if (err) {
                vm_area_free(vma);
                return ERR_PTR(err);
        }

        vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
        perf_event_mmap(vma);

        return vma;
}

/*
 * True when @vma is the special mapping described by @sm: it must use the
 * special-mapping vm_ops and carry @sm as its private data.
 */
bool vma_is_special_mapping(const struct vm_area_struct *vma,
        const struct vm_special_mapping *sm)
{
        return vma->vm_ops == &special_mapping_vmops &&
                vma->vm_private_data == sm;
}

/*
 * Called with mm->mmap_lock held for writing.
 * Insert a new vma covering the given region, with the given flags.
 * Its pages are supplied by the given array of struct page *.
 * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
 * The region past the last page supplied will always produce SIGBUS.
 * The array pointer and the pages it points to are assumed to stay alive
 * for as long as this mapping might exist.
 *
 * Thin wrapper: @spec becomes the vma's private data and the shared
 * special_mapping_vmops are installed. Returns what
 * __install_special_mapping() returns, i.e. the new vma or an ERR_PTR().
 */
struct vm_area_struct *_install_special_mapping(
        struct mm_struct *mm,
        unsigned long addr, unsigned long len,
        unsigned long vm_flags, const struct vm_special_mapping *spec)
{
        /* The cast only drops const so @spec fits the void * priv parameter. */
        return __install_special_mapping(mm, addr, len, vm_flags, (void *)spec,
                                        &special_mapping_vmops);
}

#ifdef CONFIG_SYSCTL
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
                defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
int sysctl_legacy_va_layout;
#endif

static const struct ctl_table mmap_table[] = {
                {
                                .procname       = "max_map_count",
                                .data           = &sysctl_max_map_count,
                                .maxlen         = sizeof(sysctl_max_map_count),
                                .mode           = 0644,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = SYSCTL_ZERO,
                },
#if defined(HAVE_ARCH_PICK_MMAP_LAYOUT) || \
                defined(CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT)
                {
                                .procname       = "legacy_va_layout",
                                .data           = &sysctl_legacy_va_layout,
                                .maxlen         = sizeof(sysctl_legacy_va_layout),
                                .mode           = 0644,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = SYSCTL_ZERO,
                },
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
                {
                                .procname       = "mmap_rnd_bits",
                                .data           = &mmap_rnd_bits,
                                .maxlen         = sizeof(mmap_rnd_bits),
                                .mode           = 0600,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = (void *)&mmap_rnd_bits_min,
                                .extra2         = (void *)&mmap_rnd_bits_max,
                },
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
                {
                                .procname       = "mmap_rnd_compat_bits",
                                .data           = &mmap_rnd_compat_bits,
                                .maxlen         = sizeof(mmap_rnd_compat_bits),
                                .mode           = 0600,
                                .proc_handler   = proc_dointvec_minmax,
                                .extra1         = (void *)&mmap_rnd_compat_bits_min,
                                .extra2         = (void *)&mmap_rnd_compat_bits_max,
                },
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * initialise the percpu counter for VM
 *
 * Sets up vm_committed_as starting at 0 (a failure trips VM_BUG_ON) and,
 * with CONFIG_SYSCTL, registers mmap_table under the "vm" sysctl directory.
 */
void __init mmap_init(void)
{
        int ret;

        ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
        VM_BUG_ON(ret);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", mmap_table);
#endif
}

/*
 * Initialise sysctl_user_reserve_kbytes.
 *
 * The reserve exists so that a user cannot start a single memory hogging
 * process and then be unable to recover (kill the hog) in OVERCOMMIT_NEVER
 * mode.
 *
 * The default value is min(1/32, i.e. about 3%, of free memory, 128MB).
 * 128MB is enough to recover with sshd/login, bash, and top/kill.
 *
 * Always returns 0.
 */
static int init_user_reserve(void)
{
        unsigned long free_kb = K(global_zone_page_state(NR_FREE_PAGES));
        unsigned long share_kb = free_kb / 32;

        /* The value is in kbytes, so SZ_128K here stands for 128MB. */
        sysctl_user_reserve_kbytes = min(share_kb, SZ_128K);
        return 0;
}
subsys_initcall(init_user_reserve);

/*
 * Initialise sysctl_admin_reserve_kbytes.
 *
 * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin
 * to log in and kill a memory hogging process.
 *
 * Systems with more than 256MB will reserve 8MB, enough to recover
 * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will
 * only reserve 3% of free pages by default.
 */
static int init_admin_reserve(void)
{
        /* Free page count expressed in kilobytes (via K()). */
        const unsigned long free_kb = K(global_zone_page_state(NR_FREE_PAGES));
        /* 1/32 of the free memory, i.e. roughly 3%. */
        const unsigned long share_kb = free_kb / 32;

        /* The value is in kilobytes, so SZ_8K caps the reserve at 8MB. */
        sysctl_admin_reserve_kbytes = min(share_kb, SZ_8K);

        /* Always succeeds; the int return is what subsys_initcall() expects. */
        return 0;
}
subsys_initcall(init_admin_reserve);

/*
 * Reinitialise user and admin reserves if memory is added or removed.
 *
 * The default user reserve max is 128MB, and the default max for the
 * admin reserve is 8MB. These are usually, but not always, enough to
 * enable recovery from a memory hogging process using login/sshd, a shell,
 * and tools like top. It may make sense to increase or even disable the
 * reserve depending on the existence of swap or variations in the recovery
 * tools. So, the admin may have changed them.
 *
 * If memory is added and the reserves have been eliminated or increased above
 * the default max, then we'll trust the admin.
 *
 * If memory is removed and there isn't enough free memory, then we
 * need to reset the reserves.
 *
 * Otherwise keep the reserve set by the admin.
 */
static int reserve_mem_notifier(struct notifier_block *nb,
                             unsigned long action, void *data)
{
        unsigned long current_kb, free_kb;

        if (action == MEM_ONLINE) {
                /*
                 * Recompute the user reserve only while it is non-zero and
                 * below the 128MB default max; anything else is left alone.
                 */
                current_kb = sysctl_user_reserve_kbytes;
                if (current_kb > 0 && current_kb < SZ_128K)
                        init_user_reserve();

                /* Same policy for the admin reserve, with its 8MB default max. */
                current_kb = sysctl_admin_reserve_kbytes;
                if (current_kb > 0 && current_kb < SZ_8K)
                        init_admin_reserve();
        } else if (action == MEM_OFFLINE) {
                free_kb = K(global_zone_page_state(NR_FREE_PAGES));

                /* A reserve larger than what is free is recomputed and logged. */
                if (sysctl_user_reserve_kbytes > free_kb) {
                        init_user_reserve();
                        pr_info("vm.user_reserve_kbytes reset to %lu\n",
                                sysctl_user_reserve_kbytes);
                }

                if (sysctl_admin_reserve_kbytes > free_kb) {
                        init_admin_reserve();
                        pr_info("vm.admin_reserve_kbytes reset to %lu\n",
                                sysctl_admin_reserve_kbytes);
                }
        }

        /* Every action, handled or not, reports NOTIFY_OK. */
        return NOTIFY_OK;
}

/*
 * Hook reserve_mem_notifier() into memory hotplug notifications. A failed
 * registration is only logged; the initcall itself still returns 0.
 */
static int __meminit init_reserve_notifier(void)
{
        const int err = hotplug_memory_notifier(reserve_mem_notifier,
                                                DEFAULT_CALLBACK_PRI);

        if (err)
                pr_err("Failed registering memory add/remove notifier for admin reserve\n");

        return 0;
}
subsys_initcall(init_reserve_notifier);

/*
 * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
 * this VMA and its relocated range, which will now reside at [vma->vm_start -
 * shift, vma->vm_end - shift).
 *
 * This function is almost certainly NOT what you want for anything other than
 * early executable temporary stack relocation.
 */
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
{
        /*
         * The process proceeds as follows:
         *
         * 1) Use shift to calculate the new vma endpoints.
         * 2) Extend vma to cover both the old and new ranges.  This ensures the
         *    arguments passed to subsequent functions are consistent.
         * 3) Move vma's page tables to the new range.
         * 4) Free up any cleared pgd range.
         * 5) Shrink the vma to cover only the new range.
         *
         * Returns -EFAULT if the first VMA found from new_start is not vma,
         * -ENOMEM if expanding the VMA or moving the page tables fails, and
         * otherwise whatever vma_shrink() returns.
         */

        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_start = vma->vm_start;
        unsigned long old_end = vma->vm_end;
        unsigned long length = old_end - old_start;
        unsigned long new_start = old_start - shift;
        unsigned long new_end = old_end - shift;
        /* Iterator positioned at the start of the destination range. */
        VMA_ITERATOR(vmi, mm, new_start);
        /* Merge state describing the combined range [new_start, old_end). */
        VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
        struct vm_area_struct *next;
        struct mmu_gather tlb;
        /* Page-table move descriptor: same VMA as source and destination. */
        PAGETABLE_MOVE(pmc, vma, vma, old_start, new_start, length);

        /*
         * Trips when only new_start wrapped around, i.e. shift is larger than
         * old_start but not larger than old_end.
         */
        BUG_ON(new_start > new_end);

        /*
         * ensure there are no vmas between where we want to go
         * and where we are
         */
        if (vma != vma_next(&vmi))
                return -EFAULT;

        vma_iter_prev_range(&vmi);
        /*
         * cover the whole range: [new_start, old_end)
         */
        vmg.middle = vma;
        if (vma_expand(&vmg))
                return -ENOMEM;

        /*
         * move the page tables downwards, on failure we rely on
         * process cleanup to remove whatever mess we made.
         */
        pmc.for_stack = true;
        /* Anything short of the full length counts as a failed move. */
        if (length != move_page_tables(&pmc))
                return -ENOMEM;

        tlb_gather_mmu(&tlb, mm);
        /* The following VMA, if any, bounds the ceiling for page-table freeing. */
        next = vma_next(&vmi);
        if (new_end > old_start) {
                /*
                 * when the old and new regions overlap clear from new_end.
                 */
                free_pgd_range(&tlb, new_end, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        } else {
                /*
                 * otherwise, clean from old_start; this is done to not touch
                 * the address space in [new_end, old_start) some architectures
                 * have constraints on va-space that make this illegal (IA64) -
                 * for the others it's just a little faster.
                 */
                free_pgd_range(&tlb, old_start, old_end, new_end,
                        next ? next->vm_start : USER_PGTABLES_CEILING);
        }
        tlb_finish_mmu(&tlb);

        /* Step the iterator back before shrinking. */
        vma_prev(&vmi);
        /* Shrink the vma to just the new range */
        return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
}

#ifdef CONFIG_MMU
/*
 * Obtain a read lock on mm->mmap_lock, if the specified address is below the
 * start of the VMA, the intent is to perform a write, and it is a
 * downward-growing stack, then attempt to expand the stack to contain it.
 *
 * This function is intended only for obtaining an argument page from an ELF
 * image, and is almost certainly NOT what you want to use for any other
 * purpose.
 *
 * IMPORTANT - VMA fields are accessed without an mmap lock being held, so the
 * VMA referenced must not be linked in any user-visible tree, i.e. it must be a
 * new VMA being mapped.
 *
 * The function assumes that addr is either contained within the VMA or below
 * it, and makes no attempt to validate this value beyond that.
 *
 * Returns true if the read lock was obtained and a stack was perhaps expanded,
 * false if the stack expansion failed.
 *
 * On stack expansion the function temporarily acquires an mmap write lock
 * before downgrading it.
 */
bool mmap_read_lock_maybe_expand(struct mm_struct *mm,
                                 struct vm_area_struct *new_vma,
                                 unsigned long addr, bool write)
{
        /* Expansion is only considered for a write below the VMA start. */
        const bool below_for_write = write && addr < new_vma->vm_start;
        int expand_err;

        if (!below_for_write) {
                mmap_read_lock(mm);
                return true;
        }

        /* Only a downward-growing stack may be expanded; no lock is taken. */
        if ((new_vma->vm_flags & VM_GROWSDOWN) == 0)
                return false;

        /* The expansion runs under the write lock. */
        mmap_write_lock(mm);
        expand_err = expand_downwards(new_vma, addr);
        if (expand_err) {
                /* Failure: release the lock completely and report it. */
                mmap_write_unlock(mm);
                return false;
        }

        /* Success: the caller is handed a read lock, as documented. */
        mmap_write_downgrade(mm);
        return true;
}
#else
/*
 * !CONFIG_MMU variant: takes no lock and always reports failure.
 */
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
                                 unsigned long addr, bool write)
{
        return false;
}
#endif












































































































































    3 














































    3 






    3 







    3 

















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2018, Intel Corporation. */

/* A common module to handle registrations and notifications for paravirtual
 * drivers to enable accelerated datapath and support VF live migration.
 *
 * The notifier and event handling code is based on netvsc driver.
 */

#include <linux/module.h>
#include <linux/etherdevice.h>
#include <uapi/linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/if_vlan.h>
#include <net/failover.h>

static LIST_HEAD(failover_list);
static DEFINE_SPINLOCK(failover_lock);

/*
 * Look up the registered failover instance whose netdev has a permanent
 * address equal to @mac. On a match the instance's ops are stored in *@ops
 * and its netdev is returned; otherwise NULL is returned and *@ops is left
 * untouched. The list walk is done under failover_lock.
 */
static struct net_device *failover_get_bymac(u8 *mac, struct failover_ops **ops)
{
        struct net_device *match = NULL;
        struct failover *entry;

        spin_lock(&failover_lock);
        list_for_each_entry(entry, &failover_list, list) {
                struct net_device *candidate =
                        rtnl_dereference(entry->failover_dev);

                if (!ether_addr_equal(candidate->perm_addr, mac))
                        continue;

                *ops = rtnl_dereference(entry->ops);
                match = candidate;
                break;
        }
        spin_unlock(&failover_lock);

        return match;
}

/**
 * failover_slave_register - Register a slave netdev
 *
 * @slave_dev: slave netdev that is being registered
 *
 * Registers a slave device to a failover instance. Only ethernet devices
 * are supported.
 */
static int failover_slave_register(struct net_device *slave_dev)
{
        struct netdev_lag_upper_info lag_upper_info;
        struct net_device *failover_dev;
        struct failover_ops *fops;
        int err;

        if (slave_dev->type != ARPHRD_ETHER)
                goto done;

        ASSERT_RTNL();

        failover_dev = failover_get_bymac(slave_dev->perm_addr, &fops);
        if (!failover_dev)
                goto done;

        if (fops && fops->slave_pre_register &&
            fops->slave_pre_register(slave_dev, failover_dev))
                goto done;

        err = netdev_rx_handler_register(slave_dev, fops->slave_handle_frame,
                                         failover_dev);
        if (err) {
                netdev_err(slave_dev, "can not register failover rx handler (err = %d)\n",
                           err);
                goto done;
        }

        lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
        err = netdev_master_upper_dev_link(slave_dev, failover_dev, NULL,
                                           &lag_upper_info, NULL);
        if (err) {
                netdev_err(slave_dev, "can not set failover device %s (err = %d)\n",
                           failover_dev->name, err);
                goto err_upper_link;
        }

        slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);

        if (fops && fops->slave_register &&
            !fops->slave_register(slave_dev, failover_dev))
                return NOTIFY_OK;

        netdev_upper_dev_unlink(slave_dev, failover_dev);
        slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
err_upper_link:
        netdev_rx_handler_unregister(slave_dev);
done:
        return NOTIFY_DONE;
}

/**
 * failover_slave_unregister - Unregister a slave netdev
 *
 * @slave_dev: slave netdev that is being unregistered
 *
 * Detaches a slave device from the failover instance that matches its
 * permanent address: the rx handler is removed, the upper link is dropped
 * and the failover slave flags are cleared.
 *
 * Return: NOTIFY_OK when the slave_unregister callback exists and returns
 * 0, NOTIFY_DONE in every other case.
 */
int failover_slave_unregister(struct net_device *slave_dev)
{
        struct failover_ops *ops;
        struct net_device *master;

        if (!netif_is_failover_slave(slave_dev))
                return NOTIFY_DONE;

        ASSERT_RTNL();

        master = failover_get_bymac(slave_dev->perm_addr, &ops);
        if (!master)
                return NOTIFY_DONE;

        /* A non-zero pre-unregister result aborts before anything is undone. */
        if (ops && ops->slave_pre_unregister) {
                if (ops->slave_pre_unregister(slave_dev, master))
                        return NOTIFY_DONE;
        }

        netdev_rx_handler_unregister(slave_dev);
        netdev_upper_dev_unlink(slave_dev, master);
        slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);

        if (!ops || !ops->slave_unregister)
                return NOTIFY_DONE;

        return ops->slave_unregister(slave_dev, master) ? NOTIFY_DONE :
                                                          NOTIFY_OK;
}
EXPORT_SYMBOL_GPL(failover_slave_unregister);

/*
 * Forward a link state event of a failover slave to the slave_link_change
 * callback of its failover instance. Returns NOTIFY_OK only when that
 * callback exists and returns 0; NOTIFY_DONE otherwise.
 */
static int failover_slave_link_change(struct net_device *slave_dev)
{
        struct failover_ops *ops;
        struct net_device *master;

        if (!netif_is_failover_slave(slave_dev))
                return NOTIFY_DONE;

        ASSERT_RTNL();

        master = failover_get_bymac(slave_dev->perm_addr, &ops);
        /* Nothing to do without a failover netdev that is running. */
        if (!master || !netif_running(master))
                return NOTIFY_DONE;

        if (!ops || !ops->slave_link_change)
                return NOTIFY_DONE;

        if (ops->slave_link_change(slave_dev, master))
                return NOTIFY_DONE;

        return NOTIFY_OK;
}

/*
 * Forward a rename event of a failover slave to the slave_name_change
 * callback of its failover instance. Returns NOTIFY_OK only when that
 * callback exists and returns 0; NOTIFY_DONE otherwise.
 */
static int failover_slave_name_change(struct net_device *slave_dev)
{
        struct failover_ops *ops;
        struct net_device *master;

        if (!netif_is_failover_slave(slave_dev))
                return NOTIFY_DONE;

        ASSERT_RTNL();

        master = failover_get_bymac(slave_dev->perm_addr, &ops);
        /* Nothing to do without a failover netdev that is running. */
        if (!master || !netif_running(master))
                return NOTIFY_DONE;

        if (!ops || !ops->slave_name_change)
                return NOTIFY_DONE;

        if (ops->slave_name_change(slave_dev, master))
                return NOTIFY_DONE;

        return NOTIFY_OK;
}

/*
 * Netdevice notifier callback: dispatches register, unregister, link and
 * rename events of non-failover devices to the matching slave helper and
 * returns that helper's result. All other events yield NOTIFY_DONE.
 */
static int
failover_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        int verdict = NOTIFY_DONE;

        /* Events raised for the failover (parent) device itself are skipped. */
        if (netif_is_failover(dev))
                return verdict;

        switch (event) {
        case NETDEV_REGISTER:
                verdict = failover_slave_register(dev);
                break;
        case NETDEV_UNREGISTER:
                verdict = failover_slave_unregister(dev);
                break;
        case NETDEV_UP:
        case NETDEV_DOWN:
        case NETDEV_CHANGE:
                verdict = failover_slave_link_change(dev);
                break;
        case NETDEV_CHANGENAME:
                verdict = failover_slave_name_change(dev);
                break;
        default:
                break;
        }

        return verdict;
}

/* Notifier block that routes netdevice events to failover_event(). */
static struct notifier_block failover_notifier = {
        .notifier_call = failover_event,
};

/*
 * Walk the netdevs in the failover device's network namespace and try to
 * register, as a slave, every non-failover device whose permanent address
 * equals that of @failover_dev. The walk is done under RTNL.
 */
static void
failover_existing_slave_register(struct net_device *failover_dev)
{
        struct net *ns = dev_net(failover_dev);
        struct net_device *candidate;

        rtnl_lock();
        for_each_netdev(ns, candidate) {
                if (!netif_is_failover(candidate) &&
                    ether_addr_equal(failover_dev->perm_addr,
                                     candidate->perm_addr))
                        failover_slave_register(candidate);
        }
        rtnl_unlock();
}

/**
 * failover_register - Register a failover instance
 *
 * @dev: failover netdev
 * @ops: failover ops
 *
 * Allocates a failover instance for @dev, takes a tracked reference on
 * @dev, flags it IFF_FAILOVER and appends the instance to the global list.
 * Already existing netdevs with a matching permanent address are then
 * offered for slave registration. @ops provides the handlers for slave
 * device register/unregister/link change/name change events.
 *
 * Return: pointer to the failover instance, ERR_PTR(-EINVAL) if @dev is not
 * an ethernet device, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct failover *failover_register(struct net_device *dev,
                                   struct failover_ops *ops)
{
        struct failover *inst;

        if (dev->type != ARPHRD_ETHER)
                return ERR_PTR(-EINVAL);

        inst = kzalloc(sizeof(*inst), GFP_KERNEL);
        if (!inst)
                return ERR_PTR(-ENOMEM);

        rcu_assign_pointer(inst->ops, ops);
        netdev_hold(dev, &inst->dev_tracker, GFP_KERNEL);
        dev->priv_flags |= IFF_FAILOVER;
        rcu_assign_pointer(inst->failover_dev, dev);

        /* Publish the fully initialised instance on the global list. */
        spin_lock(&failover_lock);
        list_add_tail(&inst->list, &failover_list);
        spin_unlock(&failover_lock);

        netdev_info(dev, "failover master:%s registered\n", dev->name);

        /* Pick up slaves that were registered before this instance existed. */
        failover_existing_slave_register(dev);

        return inst;
}
EXPORT_SYMBOL_GPL(failover_register);

/**
 * failover_unregister - Unregister a failover instance
 *
 * @failover: pointer to failover instance
 *
 * Unregisters and frees a failover instance.
 */
void failover_unregister(struct failover *failover)
{
        struct net_device *failover_dev;

        failover_dev = rcu_dereference(failover->failover_dev);

        netdev_info(failover_dev, "failover master:%s unregistered\n",
                    failover_dev->name);

        failover_dev->priv_flags &= ~IFF_FAILOVER;
        netdev_put(failover_dev, &failover->dev_tracker);

        spin_lock(&failover_lock);
        list_del(&failover->list);
        spin_unlock(&failover_lock);

        kfree(failover);
}
EXPORT_SYMBOL_GPL(failover_unregister);

/*
 * Module init: hook failover_event() into the netdevice notifier chain.
 * The registration result is returned so that a failure is reported as a
 * module init failure instead of leaving the module loaded without its
 * notifier.
 */
static __init int
failover_init(void)
{
        return register_netdevice_notifier(&failover_notifier);
}
module_init(failover_init);

/* Module exit: remove failover_notifier from the netdevice notifier chain. */
static __exit
void failover_exit(void)
{
        unregister_netdevice_notifier(&failover_notifier);
}
module_exit(failover_exit);

MODULE_DESCRIPTION("Generic failover infrastructure/interface");
MODULE_LICENSE("GPL v2");

























































































































































































































































































































































































































































































































































































































































































































































































    3 




















    3 








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
/*
 * Copyright (c) 2015, Mellanox Technologies inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "core_priv.h"

#include <linux/in.h>
#include <linux/in6.h>

/* For in6_dev_get/in6_dev_put */
#include <net/addrconf.h>
#include <net/bonding.h>

#include <rdma/ib_cache.h>
#include <rdma/ib_addr.h>

static struct workqueue_struct *gid_cache_wq;

/* Operation selector for update_gid(): delete or add a GID cache entry. */
enum gid_op_type {
        GID_DEL = 0,
        GID_ADD
};

/*
 * Work item bundling one GID operation: the GID, its attributes and the
 * operation to apply (gid_op).
 * NOTE(review): the code that queues and consumes this work is outside this
 * view - confirm how the fields are populated.
 */
struct update_gid_event_work {
        struct work_struct work;
        union ib_gid       gid;
        struct ib_gid_attr gid_attr;
        enum gid_op_type gid_op;
};

/* Number of command slots carried by one struct netdev_event_work. */
#define ROCE_NETDEV_CALLBACK_SZ                3
/*
 * One command of a netdev event work item: a callback, a filter, and the
 * two netdev pointers that go with them.
 * NOTE(review): how cb/filter consume ndev and filter_ndev is decided by
 * code outside this view - confirm there.
 */
struct netdev_event_work_cmd {
        roce_netdev_callback        cb;
        roce_netdev_filter        filter;
        struct net_device        *ndev;
        struct net_device        *filter_ndev;
};

/* Work item holding up to ROCE_NETDEV_CALLBACK_SZ commands. */
struct netdev_event_work {
        struct work_struct                work;
        struct netdev_event_work_cmd        cmds[ROCE_NETDEV_CALLBACK_SZ];
};

/*
 * Maps a port capability predicate to the GID type it enables. Used by
 * roce_gid_type_mask_support() to build the supported GID type mask.
 */
static const struct {
        bool (*is_supported)(const struct ib_device *device, u32 port_num);
        enum ib_gid_type gid_type;
} PORT_CAP_TO_GID_TYPE[] = {
        {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE},
        {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP},
};

/* Number of entries in PORT_CAP_TO_GID_TYPE. */
#define CAP_TO_GID_TABLE_SIZE        ARRAY_SIZE(PORT_CAP_TO_GID_TYPE)

/**
 * roce_gid_type_mask_support - Build the mask of GID types a port supports
 * @ib_dev:        IB device to query
 * @port:        Port number on @ib_dev
 *
 * Return: a bitmask with bit (1UL << gid_type) set for each supported GID
 * type. A port that is not RoCE reports only IB_GID_TYPE_IB; otherwise each
 * PORT_CAP_TO_GID_TYPE entry whose is_supported() callback accepts the port
 * contributes its gid_type bit.
 */
unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port)
{
        /* Same type as the return value and the 1UL shifts OR-ed into it. */
        unsigned long ret_flags = 0;
        /* Unsigned to match the ARRAY_SIZE()-based loop bound. */
        unsigned int i;

        if (!rdma_protocol_roce(ib_dev, port))
                return 1UL << IB_GID_TYPE_IB;

        for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++)
                if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port))
                        ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type;

        return ret_flags;
}
EXPORT_SYMBOL(roce_gid_type_mask_support);

/*
 * Apply @gid_op (add or delete) to the GID cache of @ib_dev/@port once for
 * every GID type the port supports. @gid_attr->gid_type is overwritten with
 * the type being processed before each cache call.
 */
static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev,
                       u32 port, union ib_gid *gid,
                       struct ib_gid_attr *gid_attr)
{
        const unsigned long supported =
                roce_gid_type_mask_support(ib_dev, port);
        int type;

        for (type = 0; type < IB_GID_TYPE_SIZE; type++) {
                /* Skip GID types the port does not advertise. */
                if (!(supported & (1UL << type)))
                        continue;

                gid_attr->gid_type = type;
                if (gid_op == GID_ADD)
                        ib_cache_gid_add(ib_dev, port, gid, gid_attr);
                else if (gid_op == GID_DEL)
                        ib_cache_gid_del(ib_dev, port, gid, gid_attr);
        }
}

/*
 * Result of is_eth_active_slave_of_bonding_rcu(). The values are distinct
 * bits so that several states can be tested with one mask (see
 * REQUIRED_BOND_STATES).
 */
enum bonding_slave_state {
        BONDING_SLAVE_STATE_ACTIVE        = 1UL << 0,
        BONDING_SLAVE_STATE_INACTIVE        = 1UL << 1,
        /* No primary slave or the device isn't a slave in bonding */
        BONDING_SLAVE_STATE_NA                = 1UL << 2,
};

/*
 * Classify @dev relative to the bond master @upper. Returns
 * BONDING_SLAVE_STATE_NA when @upper is NULL, is not a bond master, or the
 * bond reports no active slave; otherwise ACTIVE if @dev is that active
 * slave and INACTIVE if it is not.
 */
static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_device *dev,
                                                                   struct net_device *upper)
{
        struct net_device *active;

        if (!upper || !netif_is_bond_master(upper))
                return BONDING_SLAVE_STATE_NA;

        active = bond_option_active_slave_get_rcu(netdev_priv(upper));
        if (!active)
                return BONDING_SLAVE_STATE_NA;

        if (active == dev)
                return BONDING_SLAVE_STATE_ACTIVE;

        return BONDING_SLAVE_STATE_INACTIVE;
}

/* Bonding states for which is_eth_port_of_netdev_filter() accepts a port. */
#define REQUIRED_BOND_STATES                (BONDING_SLAVE_STATE_ACTIVE |        \
                                         BONDING_SLAVE_STATE_NA)
/*
 * Filter: true when the cookie netdev is an upper device of @rdma_ndev and
 * @rdma_ndev's bonding state is one of REQUIRED_BOND_STATES, or when the
 * real device behind the cookie (the cookie itself if
 * rdma_vlan_dev_real_dev() returns NULL) is @rdma_ndev. A NULL @rdma_ndev
 * never matches.
 */
static bool
is_eth_port_of_netdev_filter(struct ib_device *ib_dev, u32 port,
                             struct net_device *rdma_ndev, void *cookie)
{
        struct net_device *base_dev;
        bool match;

        if (!rdma_ndev)
                return false;

        rcu_read_lock();

        /* Fall back to the cookie when no real device is reported for it. */
        base_dev = rdma_vlan_dev_real_dev(cookie);
        if (!base_dev)
                base_dev = cookie;

        if (rdma_is_upper_dev_rcu(rdma_ndev, cookie) &&
            (is_eth_active_slave_of_bonding_rcu(rdma_ndev, base_dev) &
             REQUIRED_BOND_STATES))
                match = true;
        else
                match = (base_dev == rdma_ndev);

        rcu_read_unlock();

        return match;
}

/*
 * Filter: true when @rdma_ndev is an inactive slave of its master upper
 * device, as classified by is_eth_active_slave_of_bonding_rcu(). A NULL
 * @rdma_ndev never matches. @cookie is not used.
 */
static bool
is_eth_port_inactive_slave_filter(struct ib_device *ib_dev, u32 port,
                                  struct net_device *rdma_ndev, void *cookie)
{
        enum bonding_slave_state state;
        struct net_device *master;

        if (!rdma_ndev)
                return false;

        rcu_read_lock();
        master = netdev_master_upper_dev_get_rcu(rdma_ndev);
        state = is_eth_active_slave_of_bonding_rcu(rdma_ndev, master);
        rcu_read_unlock();

        return state == BONDING_SLAVE_STATE_INACTIVE;
}

/**
 * is_ndev_for_default_gid_filter - Check if a given netdevice
 * can be considered for default GIDs or not.
 * @ib_dev:                IB device to check
 * @port:                Port to consider for adding default GID
 * @rdma_ndev:                rdma netdevice pointer
 * @cookie:             Netdevice to consider to form a default GID
 *
 * is_ndev_for_default_gid_filter() returns true if a given netdevice can be
 * considered for deriving default RoCE GID, returns false otherwise.
 */
static bool
is_ndev_for_default_gid_filter(struct ib_device *ib_dev, u32 port,
                               struct net_device *rdma_ndev, void *cookie)
{
        struct net_device *event_ndev = cookie;
        bool eligible;

        if (!rdma_ndev)
                return false;

        rcu_read_lock();

        /*
         * When the rdma netdevice is used in bonding, the bonding master
         * netdevice should be considered for default GIDs, so a slave rdma
         * netdevice is ignored.
         * Additionally, when the event (cookie) netdevice is a bond master,
         * make sure that it is the upper netdevice of the rdma netdevice.
         */
        if (event_ndev == rdma_ndev && !netif_is_bond_slave(rdma_ndev))
                eligible = true;
        else if (netif_is_bond_master(event_ndev) &&
                 rdma_is_upper_dev_rcu(rdma_ndev, event_ndev))
                eligible = true;
        else
                eligible = false;

        rcu_read_unlock();

        return eligible;
}

/* Filter that accepts every port: ignores all arguments and returns true. */
static bool pass_all_filter(struct ib_device *ib_dev, u32 port,
                            struct net_device *rdma_ndev, void *cookie)
{
        return true;
}

/*
 * Filter: true when the cookie netdev is @rdma_ndev itself or one of its
 * upper devices. A NULL @rdma_ndev never matches.
 */
static bool upper_device_filter(struct ib_device *ib_dev, u32 port,
                                struct net_device *rdma_ndev, void *cookie)
{
        bool is_upper;

        if (!rdma_ndev)
                return false;

        /* The upper-device lookup is only needed for a different netdev. */
        if (rdma_ndev != cookie) {
                rcu_read_lock();
                is_upper = rdma_is_upper_dev_rcu(rdma_ndev, cookie);
                rcu_read_unlock();

                return is_upper;
        }

        return true;
}

/**
 * is_upper_ndev_bond_master_filter - Check whether a given netdevice is the
 * bond master of the netdevice attached to an RDMA port.
 * @ib_dev:                IB device to check
 * @port:                Port to consider for adding default GID
 * @rdma_ndev:                Pointer to rdma netdevice; may be NULL
 * @cookie:                Netdevice to consider to form a default GID
 *
 * Returns true when the cookie netdevice is a bond master and is an upper
 * device of @rdma_ndev, false otherwise (including when @rdma_ndev is NULL).
 * @rdma_ndev might not have been established as a slave device yet.
 */
static bool
is_upper_ndev_bond_master_filter(struct ib_device *ib_dev, u32 port,
                                 struct net_device *rdma_ndev,
                                 void *cookie)
{
        struct net_device *bond_ndev = cookie;
        bool is_master_of_rdma;

        if (!rdma_ndev)
                return false;

        /* The upper-device lookup must run under the RCU read lock. */
        rcu_read_lock();
        is_master_of_rdma = netif_is_bond_master(bond_ndev) &&
                            rdma_is_upper_dev_rcu(rdma_ndev, bond_ndev);
        rcu_read_unlock();

        return is_master_of_rdma;
}

/*
 * update_gid_ip - Apply a GID operation for an IP address of a netdevice.
 * @gid_op:        Operation to hand to update_gid()
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @ndev:        Netdevice recorded in the GID attribute
 * @addr:        IP address that is converted to a GID with rdma_ip2gid()
 *
 * Builds a zeroed GID attribute that carries only @ndev, converts @addr to
 * a GID and passes both to update_gid().
 */
static void update_gid_ip(enum gid_op_type gid_op,
                          struct ib_device *ib_dev,
                          u32 port, struct net_device *ndev,
                          struct sockaddr *addr)
{
        struct ib_gid_attr attr;
        union ib_gid ip_gid;

        memset(&attr, 0, sizeof(attr));
        attr.ndev = ndev;
        rdma_ip2gid(addr, &ip_gid);

        update_gid(gid_op, ib_dev, port, &ip_gid, &attr);
}

/*
 * bond_delete_netdev_default_gids - Delete the default GIDs of @rdma_ndev
 * when it is related to the event netdevice and not an inactive bond slave.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Netdevice of the port; nothing is done when NULL
 * @event_ndev:        Netdevice the event refers to
 *
 * Nothing is deleted when either of the following holds:
 *  - @rdma_ndev is neither @event_ndev nor has @event_ndev as an upper
 *    device, or
 *  - is_eth_active_slave_of_bonding_rcu() reports
 *    BONDING_SLAVE_STATE_INACTIVE for @rdma_ndev against the real device of
 *    @event_ndev (or @event_ndev itself when rdma_vlan_dev_real_dev()
 *    returns NULL).
 */
static void bond_delete_netdev_default_gids(struct ib_device *ib_dev,
                                            u32 port,
                                            struct net_device *rdma_ndev,
                                            struct net_device *event_ndev)
{
        struct net_device *real_dev = rdma_vlan_dev_real_dev(event_ndev);
        bool skip;

        if (!rdma_ndev)
                return;

        if (!real_dev)
                real_dev = event_ndev;

        /* Both relationship checks are RCU variants; evaluate them together. */
        rcu_read_lock();
        skip = (rdma_ndev != event_ndev &&
                !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) ||
               is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) ==
                        BONDING_SLAVE_STATE_INACTIVE;
        rcu_read_unlock();

        if (skip)
                return;

        ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev,
                                     roce_gid_type_mask_support(ib_dev, port),
                                     IB_CACHE_GID_DEFAULT_MODE_DELETE);
}

/*
 * enum_netdev_ipv4_ips - Add a GID for every IPv4 address of a netdevice.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @ndev:        Netdevice whose IPv4 addresses are enumerated
 *
 * Does nothing when @ndev is at or past NETREG_UNREGISTERING. The addresses
 * are first copied into a private list under the RCU read lock and the GID
 * updates are issued only after the lock has been dropped. An address whose
 * list entry cannot be allocated is silently skipped.
 */
static void enum_netdev_ipv4_ips(struct ib_device *ib_dev,
                                 u32 port, struct net_device *ndev)
{
        struct ipv4_entry {
                struct list_head        node;
                struct sockaddr_in        addr;
        };
        const struct in_ifaddr *ifa;
        struct ipv4_entry *cur, *next;
        struct in_device *in_dev;
        LIST_HEAD(pending);

        if (ndev->reg_state >= NETREG_UNREGISTERING)
                return;

        /* Phase 1: snapshot the addresses while inside the RCU section. */
        rcu_read_lock();
        in_dev = __in_dev_get_rcu(ndev);
        if (in_dev) {
                in_dev_for_each_ifa_rcu(ifa, in_dev) {
                        cur = kzalloc(sizeof(*cur), GFP_ATOMIC);
                        if (!cur)
                                continue;

                        cur->addr.sin_family = AF_INET;
                        cur->addr.sin_addr.s_addr = ifa->ifa_address;
                        list_add_tail(&cur->node, &pending);
                }
        }
        rcu_read_unlock();

        /* Phase 2: issue the updates and release the snapshot. */
        list_for_each_entry_safe(cur, next, &pending, node) {
                update_gid_ip(GID_ADD, ib_dev, port, ndev,
                              (struct sockaddr *)&cur->addr);
                list_del(&cur->node);
                kfree(cur);
        }
}

/*
 * enum_netdev_ipv6_ips - Add a GID for every IPv6 address of a netdevice.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @ndev:        Netdevice whose IPv6 addresses are enumerated
 *
 * Does nothing when @ndev is at or past NETREG_UNREGISTERING or has no
 * inet6 device. The addresses are copied into a private list while holding
 * the inet6 device's lock for reading; the GID updates are issued after the
 * lock and the inet6 device reference have been released. An address whose
 * list entry cannot be allocated is silently skipped.
 */
static void enum_netdev_ipv6_ips(struct ib_device *ib_dev,
                                 u32 port, struct net_device *ndev)
{
        struct ipv6_entry {
                struct list_head        node;
                struct sockaddr_in6        addr;
        };
        struct ib_gid_attr gid_attr = {.ndev = ndev};
        struct ipv6_entry *cur, *next;
        struct inet6_dev *in6_dev;
        struct inet6_ifaddr *ifp;
        union ib_gid gid;
        LIST_HEAD(pending);

        if (ndev->reg_state >= NETREG_UNREGISTERING)
                return;

        in6_dev = in6_dev_get(ndev);
        if (!in6_dev)
                return;

        /* Phase 1: snapshot the address list under the read lock. */
        read_lock_bh(&in6_dev->lock);
        list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
                cur = kzalloc(sizeof(*cur), GFP_ATOMIC);
                if (!cur)
                        continue;

                cur->addr.sin6_family = AF_INET6;
                cur->addr.sin6_addr = ifp->addr;
                list_add_tail(&cur->node, &pending);
        }
        read_unlock_bh(&in6_dev->lock);

        in6_dev_put(in6_dev);

        /* Phase 2: issue the updates and release the snapshot. */
        list_for_each_entry_safe(cur, next, &pending, node) {
                rdma_ip2gid((struct sockaddr *)&cur->addr, &gid);
                update_gid(GID_ADD, ib_dev, port, &gid, &gid_attr);
                list_del(&cur->node);
                kfree(cur);
        }
}

/*
 * _add_netdev_ips - Add GIDs for all IP addresses of a netdevice.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @ndev:        Netdevice whose addresses are enumerated
 *
 * IPv4 addresses are always enumerated; IPv6 addresses only when
 * CONFIG_IPV6 is enabled.
 */
static void _add_netdev_ips(struct ib_device *ib_dev, u32 port,
                            struct net_device *ndev)
{
        enum_netdev_ipv4_ips(ib_dev, port, ndev);

        if (!IS_ENABLED(CONFIG_IPV6))
                return;

        enum_netdev_ipv6_ips(ib_dev, port, ndev);
}

/*
 * add_netdev_ips - Callback adding the IP based GIDs of the cookie netdevice.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Unused
 * @cookie:        Netdevice whose IP addresses are added
 */
static void add_netdev_ips(struct ib_device *ib_dev, u32 port,
                           struct net_device *rdma_ndev, void *cookie)
{
        struct net_device *event_ndev = cookie;

        _add_netdev_ips(ib_dev, port, event_ndev);
}

/*
 * del_netdev_ips - Callback deleting all GIDs of the cookie netdevice.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Unused
 * @cookie:        Netdevice handed to ib_cache_gid_del_all_netdev_gids()
 */
static void del_netdev_ips(struct ib_device *ib_dev, u32 port,
                           struct net_device *rdma_ndev, void *cookie)
{
        struct net_device *event_ndev = cookie;

        ib_cache_gid_del_all_netdev_gids(ib_dev, port, event_ndev);
}

/**
 * del_default_gids - Delete default GIDs of the event/cookie netdevice
 * @ib_dev:        RDMA device pointer
 * @port:        Port of the RDMA device whose GID table to consider
 * @rdma_ndev:        Unused rdma netdevice
 * @cookie:        Pointer to event netdevice
 *
 * Calls ib_cache_gid_set_default_gid() in IB_CACHE_GID_DEFAULT_MODE_DELETE
 * mode for the event/cookie netdevice, using the GID type mask reported by
 * roce_gid_type_mask_support() for @ib_dev and @port.
 */
static void del_default_gids(struct ib_device *ib_dev, u32 port,
                             struct net_device *rdma_ndev, void *cookie)
{
        struct net_device *event_ndev = cookie;
        unsigned long type_mask = roce_gid_type_mask_support(ib_dev, port);

        ib_cache_gid_set_default_gid(ib_dev, port, event_ndev, type_mask,
                                     IB_CACHE_GID_DEFAULT_MODE_DELETE);
}

/*
 * add_default_gids - Set default GIDs of the event/cookie netdevice
 * @ib_dev:        RDMA device pointer
 * @port:        Port of the RDMA device whose GID table to consider
 * @rdma_ndev:        Unused rdma netdevice
 * @cookie:        Pointer to event netdevice
 *
 * Calls ib_cache_gid_set_default_gid() in IB_CACHE_GID_DEFAULT_MODE_SET
 * mode for the event/cookie netdevice, using the GID type mask reported by
 * roce_gid_type_mask_support() for @ib_dev and @port.
 */
static void add_default_gids(struct ib_device *ib_dev, u32 port,
                             struct net_device *rdma_ndev, void *cookie)
{
        struct net_device *ndev = cookie;
        unsigned long type_mask = roce_gid_type_mask_support(ib_dev, port);

        ib_cache_gid_set_default_gid(ib_dev, port, ndev, type_mask,
                                     IB_CACHE_GID_DEFAULT_MODE_SET);
}

/*
 * enum_all_gids_of_dev_cb - Walk every netdevice of every network namespace
 * and add the GIDs that belong to the given RDMA port.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Netdevice of the port
 * @cookie:        Unused
 *
 * For each netdevice, default GIDs are added when
 * is_ndev_for_default_gid_filter() accepts it and IP based GIDs are added
 * when is_eth_port_of_netdev_filter() accepts it.
 */
static void enum_all_gids_of_dev_cb(struct ib_device *ib_dev,
                                    u32 port,
                                    struct net_device *rdma_ndev,
                                    void *cookie)
{
        struct net_device *cur_ndev;
        struct net *cur_net;

        /*
         * Hold the rtnl lock so that the netdevs do not move under our
         * feet while the lists are walked.
         */
        rtnl_lock();
        down_read(&net_rwsem);
        for_each_net(cur_net) {
                for_each_netdev(cur_net, cur_ndev) {
                        /*
                         * Default GIDs come from the primary netdevice when
                         * not in bonding mode, or from the bond master
                         * device when in bonding mode.
                         */
                        if (is_ndev_for_default_gid_filter(ib_dev, port,
                                                           rdma_ndev,
                                                           cur_ndev))
                                add_default_gids(ib_dev, port, rdma_ndev,
                                                 cur_ndev);

                        if (is_eth_port_of_netdev_filter(ib_dev, port,
                                                         rdma_ndev,
                                                         cur_ndev))
                                _add_netdev_ips(ib_dev, port, cur_ndev);
                }
        }
        up_read(&net_rwsem);
        rtnl_unlock();
}

/**
 * rdma_roce_rescan_device - Rescan all of the network devices in the system
 * and add their gids, as needed, to the relevant RoCE devices.
 *
 * @ib_dev:         the rdma device
 *
 * Runs enum_all_gids_of_dev_cb() through ib_enum_roce_netdev() with a filter
 * that accepts every port; no cookies are supplied.
 */
void rdma_roce_rescan_device(struct ib_device *ib_dev)
{
        ib_enum_roce_netdev(ib_dev,
                            pass_all_filter, NULL,
                            enum_all_gids_of_dev_cb, NULL);
}
EXPORT_SYMBOL(rdma_roce_rescan_device);

/**
 * rdma_roce_rescan_port - Rescan all of the network devices in the system
 * and add their gids if relevant to the port of the RoCE device.
 *
 * @ib_dev: IB device
 * @port: Port number
 *
 * Does nothing when @port is not a RoCE port or when no netdevice is
 * returned for it. The netdevice reference obtained from
 * ib_device_get_netdev() is released with dev_put() after the scan.
 */
void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port)
{
        struct net_device *port_ndev;

        if (!rdma_protocol_roce(ib_dev, port))
                return;

        port_ndev = ib_device_get_netdev(ib_dev, port);
        if (!port_ndev)
                return;

        enum_all_gids_of_dev_cb(ib_dev, port, port_ndev, port_ndev);
        dev_put(port_ndev);
}
EXPORT_SYMBOL(rdma_roce_rescan_port);

/*
 * callback_for_addr_gid_device_scan - Apply a queued GID update to one port.
 * @device:        RDMA device being visited
 * @port:        Port of @device being visited
 * @rdma_ndev:        Unused
 * @cookie:        Pointer to the struct update_gid_event_work describing the update
 *
 * Forwards the operation, GID and GID attribute stored in the work item to
 * update_gid() for the given device and port.
 */
static void callback_for_addr_gid_device_scan(struct ib_device *device,
                                              u32 port,
                                              struct net_device *rdma_ndev,
                                              void *cookie)
{
        struct update_gid_event_work *parsed = cookie;

        /*
         * A function returning void must not use "return <expr>;"
         * (ISO C 6.8.6.4), so update_gid() is called as a plain statement.
         */
        update_gid(parsed->gid_op, device,
                   port, &parsed->gid,
                   &parsed->gid_attr);
}

/* One upper netdevice collected by netdev_upper_walk(). */
struct upper_list {
        /* Link in the list that handle_netdev_upper() drains. */
        struct list_head list;
        /* Upper netdevice; netdev_upper_walk() takes a reference with dev_hold(). */
        struct net_device *upper;
};

/*
 * netdev_upper_walk - Record one upper netdevice in the caller's list.
 * @upper:        Upper netdevice being visited
 * @priv:        Walk context; @priv->data points to the list head to append to
 *
 * Takes a reference on @upper and appends a new struct upper_list entry for
 * it. When the entry cannot be allocated the device is silently skipped.
 * Always returns 0.
 */
static int netdev_upper_walk(struct net_device *upper,
                             struct netdev_nested_priv *priv)
{
        struct list_head *collected = (struct list_head *)priv->data;
        struct upper_list *node;

        node = kmalloc(sizeof(*node), GFP_ATOMIC);
        if (node) {
                dev_hold(upper);
                node->upper = upper;
                list_add_tail(&node->list, collected);
        }

        return 0;
}

/*
 * handle_netdev_upper - Run a handler on a netdevice and on all of its upper
 * netdevices.
 * @ib_dev:                RDMA device passed through to @handle_netdev
 * @port:                Port passed through to @handle_netdev
 * @cookie:                Netdevice to start from
 * @handle_netdev:        Handler invoked once per netdevice
 *
 * The upper devices are collected under the RCU read lock by
 * netdev_upper_walk(), which holds a reference on each. The handler is then
 * called outside the RCU section, first for the cookie netdevice and then
 * for every collected upper device, whose reference and list entry are
 * released right after it has been handled.
 */
static void handle_netdev_upper(struct ib_device *ib_dev, u32 port,
                                void *cookie,
                                void (*handle_netdev)(struct ib_device *ib_dev,
                                                      u32 port,
                                                      struct net_device *ndev))
{
        struct net_device *base_ndev = cookie;
        struct upper_list *cur, *next;
        struct netdev_nested_priv walk_ctx;
        LIST_HEAD(uppers);

        walk_ctx.data = &uppers;

        rcu_read_lock();
        netdev_walk_all_upper_dev_rcu(base_ndev, netdev_upper_walk, &walk_ctx);
        rcu_read_unlock();

        handle_netdev(ib_dev, port, base_ndev);

        list_for_each_entry_safe(cur, next, &uppers, list) {
                handle_netdev(ib_dev, port, cur->upper);
                dev_put(cur->upper);
                list_del(&cur->list);
                kfree(cur);
        }
}

/*
 * roce_del_all_netdev_gids - Exported wrapper around
 * ib_cache_gid_del_all_netdev_gids().
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @ndev:        Netdevice whose GIDs are deleted
 */
void
roce_del_all_netdev_gids(struct ib_device *ib_dev, u32 port,
                         struct net_device *ndev)
{
        ib_cache_gid_del_all_netdev_gids(ib_dev, port, ndev);
}
EXPORT_SYMBOL(roce_del_all_netdev_gids);

/*
 * del_netdev_upper_ips - Callback deleting the GIDs of the cookie netdevice
 * and of all of its upper netdevices.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Unused
 * @cookie:        Netdevice to start from
 */
static void
del_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
                     struct net_device *rdma_ndev, void *cookie)
{
        handle_netdev_upper(ib_dev, port, cookie,
                            roce_del_all_netdev_gids);
}

/*
 * add_netdev_upper_ips - Callback adding the IP based GIDs of the cookie
 * netdevice and of all of its upper netdevices.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Unused
 * @cookie:        Netdevice to start from
 */
static void
add_netdev_upper_ips(struct ib_device *ib_dev, u32 port,
                     struct net_device *rdma_ndev, void *cookie)
{
        handle_netdev_upper(ib_dev, port, cookie,
                            _add_netdev_ips);
}

/*
 * del_netdev_default_ips_join - Callback deleting the default GIDs of
 * @rdma_ndev with respect to its master upper device.
 * @ib_dev:        RDMA device whose GID table is targeted
 * @port:        Port of @ib_dev
 * @rdma_ndev:        Netdevice of the port
 * @cookie:        Unused
 *
 * Looks up the master upper device of @rdma_ndev under the RCU read lock and
 * holds a reference on it so that it can be used after the lock is dropped.
 * When a master exists, bond_delete_netdev_default_gids() is called with it
 * as the event netdevice and the reference is released afterwards.
 */
static void del_netdev_default_ips_join(struct ib_device *ib_dev, u32 port,
                                        struct net_device *rdma_ndev,
                                        void *cookie)
{
        struct net_device *bond_master;

        rcu_read_lock();
        bond_master = netdev_master_upper_dev_get_rcu(rdma_ndev);
        /*
         * NOTE(review): dev_hold() runs before the NULL test below, so it is
         * presumably NULL-tolerant on the target kernel - confirm.
         */
        dev_hold(bond_master);
        rcu_read_unlock();

        if (!bond_master)
                return;

        bond_delete_netdev_default_gids(ib_dev, port, rdma_ndev, bond_master);
        dev_put(bond_master);
}

/* The following functions operate on all IB devices. netdevice_event and
 * addr_event execute ib_enum_all_roce_netdevs through a work.
 * ib_enum_all_roce_netdevs iterates through all IB devices.
 */

static void netdevice_event_work_handler(struct work_struct *_work)
{
        struct netdev_event_work *work =
                container_of(_work, struct netdev_event_work, work);
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(work->cmds) && work->cmds[i].cb; i++) {
                ib_enum_all_roce_netdevs(work->cmds[i].filter,
                                         work->cmds[i].filter_ndev,
                                         work->cmds[i].cb,
                                         work->cmds[i].ndev);
                dev_put(work->cmds[i].ndev);
                dev_put(work->cmds[i].filter_ndev);
        }

        kfree(work);
}

static int netdevice_queue_work(struct netdev_event_work_cmd *cmds,
                                struct net_device *ndev)
{
        unsigned int i;
        struct netdev_event_work *ndev_work =
                kmalloc(sizeof(*ndev_work), GFP_KERNEL);

        if (!ndev_work)
                return NOTIFY_DONE;

        memcpy(ndev_work->cmds, cmds, sizeof(ndev_work->cmds));
        for (i = 0; i < ARRAY_SIZE(ndev_work->cmds) && ndev_work->cmds[i].cb; i++) {
                if (!ndev_work->cmds[i].ndev)
                        ndev_work->cmds[i].ndev = ndev;
                if (!ndev_work->cmds[i].filter_ndev)
                        ndev_work->cmds[i].filter_ndev = ndev;
                dev_hold(ndev_work->cmds[i].ndev);
                dev_hold(ndev_work->cmds[i].filter_ndev);
        }
        INIT_WORK(&ndev_work->work, netdevice_event_work_handler);

        queue_work(gid_cache_wq, &ndev_work->work);

        return NOTIFY_DONE;
}

/* Add the IP based GIDs of a netdevice on the ports it belongs to. */
static const struct netdev_event_work_cmd add_cmd = {
        .filter        = is_eth_port_of_netdev_filter,
        .cb        = add_netdev_ips,
};

/*
 * Add the IP based GIDs of a netdevice and of its upper devices on the
 * ports it belongs to.
 */
static const struct netdev_event_work_cmd add_cmd_upper_ips = {
        .filter        = is_eth_port_of_netdev_filter,
        .cb        = add_netdev_upper_ips,
};

static void
ndev_event_unlink(struct netdev_notifier_changeupper_info *changeupper_info,
                  struct netdev_event_work_cmd *cmds)
{
        static const struct netdev_event_work_cmd
                        upper_ips_del_cmd = {
                                .cb        = del_netdev_upper_ips,
                                .filter        = upper_device_filter
        };

        cmds[0] = upper_ips_del_cmd;
        cmds[0].ndev = changeupper_info->upper_dev;
        cmds[1] = add_cmd;
}

/* Add default GIDs on ports whose netdevice has the bond master as upper. */
static const struct netdev_event_work_cmd bonding_default_add_cmd = {
        .filter        = is_upper_ndev_bond_master_filter,
        .cb        = add_default_gids,
};

static void
ndev_event_link(struct net_device *event_ndev,
                struct netdev_notifier_changeupper_info *changeupper_info,
                struct netdev_event_work_cmd *cmds)
{
        static const struct netdev_event_work_cmd
                        bonding_default_del_cmd = {
                                .cb        = del_default_gids,
                                .filter        = is_upper_ndev_bond_master_filter
                        };
        /*
         * When a lower netdev is linked to its upper bonding
         * netdev, delete lower slave netdev's default GIDs.
         */
        cmds[0] = bonding_default_del_cmd;
        cmds[0].ndev = event_ndev;
        cmds[0].filter_ndev = changeupper_info->upper_dev;

        /* Now add bonding upper device default GIDs */
        cmds[1] = bonding_default_add_cmd;
        cmds[1].ndev = changeupper_info->upper_dev;
        cmds[1].filter_ndev = changeupper_info->upper_dev;

        /* Now add bonding upper device IP based GIDs */
        cmds[2] = add_cmd_upper_ips;
        cmds[2].ndev = changeupper_info->upper_dev;
        cmds[2].filter_ndev = changeupper_info->upper_dev;
}

/*
 * netdevice_event_changeupper - Fill the command array for a CHANGEUPPER
 * event.
 * @event_ndev:                Netdevice the event was raised for
 * @changeupper_info:        Notifier payload of the event
 * @cmds:                Command array to fill
 *
 * Dispatches to ndev_event_link() when the event reports linking and to
 * ndev_event_unlink() otherwise.
 */
static void netdevice_event_changeupper(struct net_device *event_ndev,
                struct netdev_notifier_changeupper_info *changeupper_info,
                struct netdev_event_work_cmd *cmds)
{
        if (!changeupper_info->linking) {
                ndev_event_unlink(changeupper_info, cmds);
                return;
        }

        ndev_event_link(event_ndev, changeupper_info, cmds);
}

/* Add default GIDs on ports accepted by is_ndev_for_default_gid_filter(). */
static const struct netdev_event_work_cmd add_default_gid_cmd = {
        .filter        = is_ndev_for_default_gid_filter,
        .cb        = add_default_gids,
};

static int netdevice_event(struct notifier_block *this, unsigned long event,
                           void *ptr)
{
        static const struct netdev_event_work_cmd del_cmd = {
                .cb = del_netdev_ips, .filter = pass_all_filter};
        static const struct netdev_event_work_cmd
                        bonding_default_del_cmd_join = {
                                .cb        = del_netdev_default_ips_join,
                                .filter        = is_eth_port_inactive_slave_filter
                        };
        static const struct netdev_event_work_cmd
                        netdev_del_cmd = {
                                .cb        = del_netdev_ips,
                                .filter = is_eth_port_of_netdev_filter
                        };
        static const struct netdev_event_work_cmd bonding_event_ips_del_cmd = {
                .cb = del_netdev_upper_ips, .filter = upper_device_filter};
        struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
        struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ] = { {NULL} };

        if (ndev->type != ARPHRD_ETHER)
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_REGISTER:
        case NETDEV_UP:
                cmds[0] = bonding_default_del_cmd_join;
                cmds[1] = add_default_gid_cmd;
                cmds[2] = add_cmd;
                break;

        case NETDEV_UNREGISTER:
                if (ndev->reg_state < NETREG_UNREGISTERED)
                        cmds[0] = del_cmd;
                else
                        return NOTIFY_DONE;
                break;

        case NETDEV_CHANGEADDR:
                cmds[0] = netdev_del_cmd;
                if (ndev->reg_state == NETREG_REGISTERED) {
                        cmds[1] = add_default_gid_cmd;
                        cmds[2] = add_cmd;
                }
                break;

        case NETDEV_CHANGEUPPER:
                netdevice_event_changeupper(ndev,
                        container_of(ptr, struct netdev_notifier_changeupper_info, info),
                        cmds);
                break;

        case NETDEV_BONDING_FAILOVER:
                cmds[0] = bonding_event_ips_del_cmd;
                /* Add default GIDs of the bond device */
                cmds[1] = bonding_default_add_cmd;
                /* Add IP based GIDs of the bond device */
                cmds[2] = add_cmd_upper_ips;
                break;

        default:
                return NOTIFY_DONE;
        }

        return netdevice_queue_work(cmds, ndev);
}

/*
 * update_gid_event_work_handler - Run a queued address GID update.
 * @_work:        Work item embedded in a struct update_gid_event_work
 *
 * Applies the update through ib_enum_all_roce_netdevs() on the ports
 * accepted by is_eth_port_of_netdev_filter() for the work item's netdevice,
 * then drops the netdevice reference taken by addr_event() and frees the
 * work item.
 */
static void update_gid_event_work_handler(struct work_struct *_work)
{
        struct update_gid_event_work *gid_work;

        gid_work = container_of(_work, struct update_gid_event_work, work);

        ib_enum_all_roce_netdevs(is_eth_port_of_netdev_filter,
                                 gid_work->gid_attr.ndev,
                                 callback_for_addr_gid_device_scan,
                                 gid_work);

        dev_put(gid_work->gid_attr.ndev);
        kfree(gid_work);
}

static int addr_event(struct notifier_block *this, unsigned long event,
                      struct sockaddr *sa, struct net_device *ndev)
{
        struct update_gid_event_work *work;
        enum gid_op_type gid_op;

        if (ndev->type != ARPHRD_ETHER)
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_UP:
                gid_op = GID_ADD;
                break;

        case NETDEV_DOWN:
                gid_op = GID_DEL;
                break;

        default:
                return NOTIFY_DONE;
        }

        work = kmalloc(sizeof(*work), GFP_ATOMIC);
        if (!work)
                return NOTIFY_DONE;

        INIT_WORK(&work->work, update_gid_event_work_handler);

        rdma_ip2gid(sa, &work->gid);
        work->gid_op = gid_op;

        memset(&work->gid_attr, 0, sizeof(work->gid_attr));
        dev_hold(ndev);
        work->gid_attr.ndev   = ndev;

        queue_work(gid_cache_wq, &work->work);

        return NOTIFY_DONE;
}

static int inetaddr_event(struct notifier_block *this, unsigned long event,
                          void *ptr)
{
        struct sockaddr_in        in;
        struct net_device        *ndev;
        struct in_ifaddr        *ifa = ptr;

        in.sin_family = AF_INET;
        in.sin_addr.s_addr = ifa->ifa_address;
        ndev = ifa->ifa_dev->dev;

        return addr_event(this, event, (struct sockaddr *)&in, ndev);
}

static int inet6addr_event(struct notifier_block *this, unsigned long event,
                           void *ptr)
{
        struct sockaddr_in6        in6;
        struct net_device        *ndev;
        struct inet6_ifaddr        *ifa6 = ptr;

        in6.sin6_family = AF_INET6;
        in6.sin6_addr = ifa6->addr;
        ndev = ifa6->idev->dev;

        return addr_event(this, event, (struct sockaddr *)&in6, ndev);
}

/* Notifier block that routes netdevice events to netdevice_event(). */
static struct notifier_block nb_netdevice = {
        .notifier_call = netdevice_event,
};

/* Notifier block that routes IPv4 address events to inetaddr_event(). */
static struct notifier_block nb_inetaddr = {
        .notifier_call = inetaddr_event,
};

/* Notifier block that routes IPv6 address events to inet6addr_event(). */
static struct notifier_block nb_inet6addr = {
        .notifier_call = inet6addr_event,
};

/*
 * roce_gid_mgmt_init - Create the GID cache workqueue and register the
 * address and netdevice notifiers.
 *
 * The IPv6 address notifier is registered only when CONFIG_IPV6 is enabled.
 * The return values of the notifier registrations are not checked.
 *
 * Returns 0 on success or -ENOMEM when the workqueue cannot be allocated.
 */
int __init roce_gid_mgmt_init(void)
{
        gid_cache_wq = alloc_ordered_workqueue("gid-cache-wq", 0);
        if (!gid_cache_wq)
                return -ENOMEM;

        register_inetaddr_notifier(&nb_inetaddr);
        if (IS_ENABLED(CONFIG_IPV6))
                register_inet6addr_notifier(&nb_inet6addr);

        /*
         * We rely on the netdevice notifier to enumerate all existing
         * devices in the system. It is registered last so that no IP
         * add/del callbacks are missed.
         */
        register_netdevice_notifier(&nb_netdevice);

        return 0;
}

/*
 * roce_gid_mgmt_cleanup - Unregister the notifiers and destroy the GID cache
 * workqueue.
 *
 * The IPv6 address notifier is unregistered only when CONFIG_IPV6 is
 * enabled, mirroring roce_gid_mgmt_init().
 */
void __exit roce_gid_mgmt_cleanup(void)
{
        if (IS_ENABLED(CONFIG_IPV6))
                unregister_inet6addr_notifier(&nb_inet6addr);

        unregister_inetaddr_notifier(&nb_inetaddr);
        unregister_netdevice_notifier(&nb_netdevice);

        /*
         * Make sure every gid deletion task has completed before going
         * down, so that nothing references freed memory. By the time
         * ib-core is removed all physical devices are gone, so there is no
         * issue with remaining hardware contexts.
         */
        destroy_workqueue(gid_cache_wq);
}





































































































































































































































   76 
   76 














































































  235 



  233 














  235 
  229 
   21 
  235 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/exit.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/tty.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/cpu.h>
#include <linux/acct.h>
#include <linux/tsacct_kern.h>
#include <linux/file.h>
#include <linux/freezer.h>
#include <linux/binfmts.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/profile.h>
#include <linux/mount.h>
#include <linux/proc_fs.h>
#include <linux/kthread.h>
#include <linux/mempolicy.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
#include <linux/cgroup.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
#include <linux/posix-timers.h>
#include <linux/cn_proc.h>
#include <linux/mutex.h>
#include <linux/futex.h>
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/task_work.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/kmsan.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#include <linux/uaccess.h>
#include <linux/pidfs.h>

#include <uapi/linux/wait.h>

#include <asm/unistd.h>
#include <asm/mmu_context.h>

#include "exit.h"

/*
 * Upper bound on the number of oopses tolerated before make_task_dead()
 * panics; a value of 0 disables the check.  Tunable as the "oops_limit"
 * entry of the "kernel" sysctl table when CONFIG_SYSCTL is enabled.
 *
 * The default value should be high enough to not crash a system that randomly
 * crashes its kernel from time to time, but low enough to at least not permit
 * overflowing 32-bit refcounts or the ldsem writer count.
 */
static unsigned int oops_limit = 10000;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl entries owned by this file.  The single entry maps the
 * "oops_limit" name onto the oops_limit variable, mode 0644, handled
 * by proc_douintvec.
 */
static const struct ctl_table kern_exit_table[] = {
        {
                .procname       = "oops_limit",
                .data           = &oops_limit,
                .maxlen         = sizeof(oops_limit),
                .mode           = 0644,
                .proc_handler   = proc_douintvec,
        },
};

/*
 * Register kern_exit_table under the "kernel" sysctl directory.
 * Runs as a late initcall and always returns 0.
 */
static __init int kernel_exit_sysctls_init(void)
{
        register_sysctl_init("kernel", kern_exit_table);
        return 0;
}
late_initcall(kernel_exit_sysctls_init);
#endif

/* Incremented in make_task_dead() and compared against oops_limit there. */
static atomic_t oops_count = ATOMIC_INIT(0);

#ifdef CONFIG_SYSFS
static ssize_t oops_count_show(struct kobject *kobj, struct kobj_attribute *attr,
                               char *page)
{
        return sysfs_emit(page, "%d\n", atomic_read(&oops_count));
}

/*
 * Read-only attribute for the oops counter; added to kernel_kobj by
 * kernel_exit_sysfs_init().  NOTE(review): __ATTR_RO is expected to bind
 * oops_count_show() by name - confirm against the sysfs headers.
 */
static struct kobj_attribute oops_count_attr = __ATTR_RO(oops_count);

/*
 * Add the oops_count attribute file to kernel_kobj (no named group).
 * The return value of sysfs_add_file_to_group() is not checked, so this
 * late initcall reports success (0) unconditionally.
 */
static __init int kernel_exit_sysfs_init(void)
{
        sysfs_add_file_to_group(kernel_kobj, &oops_count_attr.attr, NULL);
        return 0;
}
late_initcall(kernel_exit_sysfs_init);
#endif

/*
 * For things release_task() would like to do *after* tasklist_lock is released.
 */
struct release_task_post {
        /*
         * One slot per pid type.  Zeroed by release_task(), passed to
         * detach_pid() from __unhash_process() while the lock is held, and
         * handed to free_pids() once the lock has been dropped.
         */
        struct pid *pids[PIDTYPE_MAX];
};

/*
 * Unlink @p from the pid links and the task lists.
 *
 * @post:       its pids[] array is passed to every detach_pid() call
 * @p:          the task being unhashed
 * @group_dead: when true, the TGID/PGID/SID links, the global ->tasks list
 *              entry and the ->sibling entry are removed as well, and the
 *              per-CPU process_counts is decremented
 *
 * Called from __exit_signal(), which expects tasklist_lock write-locked.
 */
static void __unhash_process(struct release_task_post *post, struct task_struct *p,
                             bool group_dead)
{
        nr_threads--;
        detach_pid(post->pids, p, PIDTYPE_PID);
        if (group_dead) {
                /* Process-wide links: remaining pid types and task lists. */
                detach_pid(post->pids, p, PIDTYPE_TGID);
                detach_pid(post->pids, p, PIDTYPE_PGID);
                detach_pid(post->pids, p, PIDTYPE_SID);

                list_del_rcu(&p->tasks);
                list_del_init(&p->sibling);
                __this_cpu_dec(process_counts);
        }
        /* Every thread, leader or not, leaves the thread list. */
        list_del_rcu(&p->thread_node);
}

/*
 * Detach @tsk from its signal handling state: run the posix CPU timer exit
 * hooks, fold the task's accounting counters into the shared signal_struct,
 * unhash the task via __unhash_process() (with @post passed through), clear
 * tsk->sighand and drop the sighand reference.
 *
 * "group_dead" here means that @tsk is the thread group leader.
 *
 * This function expects the tasklist_lock write-locked.
 */
static void __exit_signal(struct release_task_post *post, struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        bool group_dead = thread_group_leader(tsk);
        struct sighand_struct *sighand;
        struct tty_struct *tty;
        u64 utime, stime;

        sighand = rcu_dereference_check(tsk->sighand,
                                        lockdep_tasklist_lock_is_held());
        spin_lock(&sighand->siglock);

#ifdef CONFIG_POSIX_TIMERS
        posix_cpu_timers_exit(tsk);
        if (group_dead)
                posix_cpu_timers_exit_group(tsk);
#endif

        if (group_dead) {
                /*
                 * Take the tty pointer out of the signal_struct now; the
                 * reference is put at the end, after siglock is dropped.
                 */
                tty = sig->tty;
                sig->tty = NULL;
        } else {
                /*
                 * If there is any task waiting for the group exit
                 * then notify it:
                 */
                if (sig->notify_count > 0 && !--sig->notify_count)
                        wake_up_process(sig->group_exec_task);

                /* Do not leave curr_target pointing at the departing thread. */
                if (tsk == sig->curr_target)
                        sig->curr_target = next_thread(tsk);
        }

        /*
         * Accumulate here the counters for all threads as they die. We could
         * skip the group leader because it is the last user of signal_struct,
         * but we want to avoid the race with thread_group_cputime() which can
         * see the empty ->thread_head list.
         */
        task_cputime(tsk, &utime, &stime);
        write_seqlock(&sig->stats_lock);
        sig->utime += utime;
        sig->stime += stime;
        sig->gtime += task_gtime(tsk);
        sig->min_flt += tsk->min_flt;
        sig->maj_flt += tsk->maj_flt;
        sig->nvcsw += tsk->nvcsw;
        sig->nivcsw += tsk->nivcsw;
        sig->inblock += task_io_get_inblock(tsk);
        sig->oublock += task_io_get_oublock(tsk);
        task_io_accounting_add(&sig->ioac, &tsk->ioac);
        sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
        sig->nr_threads--;
        /* Unhashing happens inside the stats_lock write section. */
        __unhash_process(post, tsk, group_dead);
        write_sequnlock(&sig->stats_lock);

        tsk->sighand = NULL;
        spin_unlock(&sighand->siglock);

        __cleanup_sighand(sighand);
        if (group_dead)
                tty_kref_put(tty);
}

/*
 * RCU callback queued by put_task_struct_rcu_user().  Recovers the task
 * from its embedded rcu head, flushes its kprobe and rethook state, lets
 * perf and the sched_process_free tracepoint see it one last time, and
 * then drops a task_struct reference.
 */
static void delayed_put_task_struct(struct rcu_head *rhp)
{
        struct task_struct *task;

        task = container_of(rhp, struct task_struct, rcu);

        kprobe_flush_task(task);
        rethook_flush_task(task);
        perf_event_delayed_put(task);
        trace_sched_process_free(task);
        put_task_struct(task);
}

/*
 * Drop one ->rcu_users reference on @task.  When the count reaches zero,
 * queue delayed_put_task_struct() via call_rcu() on the task's rcu head.
 */
void put_task_struct_rcu_user(struct task_struct *task)
{
        if (!refcount_dec_and_test(&task->rcu_users))
                return;

        call_rcu(&task->rcu, delayed_put_task_struct);
}

/*
 * Default, empty release_thread().  Declared __weak so that a non-weak
 * definition elsewhere takes precedence (presumably architecture code -
 * TODO confirm).  Called by release_task() for each released task.
 */
void __weak release_thread(struct task_struct *dead_task)
{
}

/*
 * Final teardown of dead task @p: drop its NPROC ucount, notify pidfs and
 * cgroup, unhash it under tasklist_lock via __exit_signal(), flush its
 * pending signal queues and drop its rcu_users reference.
 *
 * If @p was the last non-leader thread of a group whose leader is already
 * a zombie, the leader's parent is notified here; when do_notify_parent()
 * returns nonzero the leader is marked EXIT_DEAD and the whole sequence is
 * repeated for the leader.
 */
void release_task(struct task_struct *p)
{
        struct release_task_post post;
        struct task_struct *leader;
        struct pid *thread_pid;
        int zap_leader;
repeat:
        /* Fresh, empty post-work state for each task handled by the loop. */
        memset(&post, 0, sizeof(post));

        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        rcu_read_unlock();

        pidfs_exit(p);
        cgroup_release(p);

        /* Hold a pid reference across the unhash; put after proc_flush_pid(). */
        thread_pid = get_pid(p->thread_pid);

        write_lock_irq(&tasklist_lock);
        ptrace_release_task(p);
        __exit_signal(&post, p);

        /*
         * If we are the last non-leader member of the thread
         * group, and the leader is zombie, then notify the
         * group leader's parent process. (if it wants notification.)
         */
        zap_leader = 0;
        leader = p->group_leader;
        if (leader != p && thread_group_empty(leader)
                        && leader->exit_state == EXIT_ZOMBIE) {
                /* for pidfs_exit() and do_notify_parent() */
                if (leader->signal->flags & SIGNAL_GROUP_EXIT)
                        leader->exit_code = leader->signal->group_exit_code;
                /*
                 * If we were the last child thread and the leader has
                 * exited already, and the leader's parent ignores SIGCHLD,
                 * then we are the one who should release the leader.
                 */
                zap_leader = do_notify_parent(leader, leader->exit_signal);
                if (zap_leader)
                        leader->exit_state = EXIT_DEAD;
        }

        write_unlock_irq(&tasklist_lock);
        /* Everything below runs without tasklist_lock. */
        proc_flush_pid(thread_pid);
        put_pid(thread_pid);
        add_device_randomness(&p->se.sum_exec_runtime,
                              sizeof(p->se.sum_exec_runtime));
        free_pids(post.pids);
        release_thread(p);
        /*
         * This task was already removed from the process/thread/pid lists
         * and lock_task_sighand(p) can't succeed. Nobody else can touch
         * ->pending or, if group dead, signal->shared_pending. We can call
         * flush_sigqueue() lockless.
         */
        flush_sigqueue(&p->pending);
        if (thread_group_leader(p))
                flush_sigqueue(&p->signal->shared_pending);

        put_task_struct_rcu_user(p);

        /* Go around again for the leader if we are the one to release it. */
        p = leader;
        if (unlikely(zap_leader))
                goto repeat;
}

/*
 * Wake up the task recorded in @w, if there is one.
 *
 * Returns the result of wake_up_process() for that task, or 0 when
 * w->task is NULL.
 */
int rcuwait_wake_up(struct rcuwait *w)
{
        int ret = 0;
        struct task_struct *task;

        rcu_read_lock();

        /*
         * Order condition vs @task, such that everything prior to the load
         * of @task is visible. This is the condition as to why the user called
         * rcuwait_wake_up() in the first place. Pairs with set_current_state()
         * barrier (A) in rcuwait_wait_event().
         *
         *    WAIT                      WAKE
         *    [S] tsk = current         [S] cond = true
         *        MB (A)                    MB (B)
         *    [L] cond                  [L] tsk
         */
        smp_mb(); /* (B) */

        task = rcu_dereference(w->task);
        if (task)
                ret = wake_up_process(task);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(rcuwait_wake_up);

/*
 * Determine if a process group is "orphaned", according to the POSIX
 * definition in 2.2.2.52.  Orphaned process groups are not to be affected
 * by terminal-generated stop signals.  Newly orphaned process groups are
 * to receive a SIGHUP and a SIGCONT.
 *
 * Walks every task in @pgrp.  A member is skipped when it is
 * @ignored_task, when it has exited and its thread group is empty, or
 * when its real parent is the global init.  The group is not orphaned
 * (returns 0) as soon as a remaining member has a real parent that is
 * outside @pgrp but in the member's own session; otherwise returns 1.
 *
 * "I ask you, have you ever known what it is to be an orphan?"
 */
static int will_become_orphaned_pgrp(struct pid *pgrp,
                                        struct task_struct *ignored_task)
{
        struct task_struct *member;

        do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
                if (member == ignored_task)
                        continue;
                if (member->exit_state && thread_group_empty(member))
                        continue;
                if (is_global_init(member->real_parent))
                        continue;

                /* A parent inside the group is no link to the outside. */
                if (task_pgrp(member->real_parent) == pgrp)
                        continue;
                if (task_session(member->real_parent) == task_session(member))
                        return 0;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, member);

        return 1;
}

/*
 * Report whether the calling task's process group is orphaned.  Takes
 * tasklist_lock for reading around the check, ignores no task, and
 * returns the value of will_become_orphaned_pgrp() (1 orphaned, 0 not).
 */
int is_current_pgrp_orphaned(void)
{
        int orphaned;

        read_lock(&tasklist_lock);
        orphaned = will_become_orphaned_pgrp(task_pgrp(current), NULL);
        read_unlock(&tasklist_lock);

        return orphaned;
}

/*
 * Return true if any task in @pgrp has SIGNAL_STOP_STOPPED set in its
 * signal flags, false otherwise.
 */
static bool has_stopped_jobs(struct pid *pgrp)
{
        struct task_struct *member;

        do_each_pid_task(pgrp, PIDTYPE_PGID, member) {
                if (!(member->signal->flags & SIGNAL_STOP_STOPPED))
                        continue;
                return true;
        } while_each_pid_task(pgrp, PIDTYPE_PGID, member);

        return false;
}

/*
 * Check to see if any process groups have become orphaned as
 * a result of our exiting, and if they have any stopped jobs,
 * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
 *
 * @tsk:    task whose process group is examined
 * @parent: NULL on the exit path, in which case @tsk's real parent is
 *          used and @tsk itself is left out of the orphan check;
 *          non-NULL on the reparent path, where no task is left out
 */
static void
kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
{
        struct pid *pgrp = task_pgrp(tsk);
        struct task_struct *skip;

        if (parent) {
                /* reparent: our child is in a different pgrp than
                 * we are, and it was the only connection outside.
                 */
                skip = NULL;
        } else {
                /* exit: our father is in a different pgrp than
                 * we are and we were the only connection outside.
                 */
                skip = tsk;
                parent = tsk->real_parent;
        }

        /* Only a parent outside the group but in the same session counts. */
        if (task_pgrp(parent) == pgrp)
                return;
        if (task_session(parent) != task_session(tsk))
                return;
        if (!will_become_orphaned_pgrp(pgrp, skip))
                return;
        if (!has_stopped_jobs(pgrp))
                return;

        __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
        __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp);
}

/*
 * Mark @tsk as past the coredump point (PF_POSTCOREDUMP) and, if a
 * coredump is in progress for its signal_struct (core_state is set),
 * report in to the dumper and sleep until released.
 *
 * Called from do_exit() with tsk == current.
 */
static void coredump_task_exit(struct task_struct *tsk)
{
        struct core_state *core_state;

        /*
         * Serialize with any possible pending coredump.
         * We must hold siglock around checking core_state
         * and setting PF_POSTCOREDUMP.  The core-inducing thread
         * will increment ->nr_threads for each thread in the
         * group without PF_POSTCOREDUMP set.
         */
        spin_lock_irq(&tsk->sighand->siglock);
        tsk->flags |= PF_POSTCOREDUMP;
        core_state = tsk->signal->core_state;
        spin_unlock_irq(&tsk->sighand->siglock);
        if (core_state) {
                struct core_thread self;

                /*
                 * Only PF_SIGNALED tasks link themselves onto the dumper's
                 * list.  Everyone else clears self.task, so the wait loop
                 * below exits on its first pass.
                 */
                self.task = current;
                if (self.task->flags & PF_SIGNALED)
                        self.next = xchg(&core_state->dumper.next, &self);
                else
                        self.task = NULL;
                /*
                 * Implies mb(), the result of xchg() must be visible
                 * to core_state->dumper.
                 */
                if (atomic_dec_and_test(&core_state->nr_threads))
                        complete(&core_state->startup);

                /* Sleep until self.task has been cleared. */
                for (;;) {
                        set_current_state(TASK_IDLE|TASK_FREEZABLE);
                        if (!self.task) /* see coredump_finish() */
                                break;
                        schedule();
                }
                __set_current_state(TASK_RUNNING);
        }
}

#ifdef CONFIG_MEMCG
/*
 * Try to make @tsk the owner of @mm.  Succeeds only if @tsk still uses
 * @mm when checked under task_lock(); on success the tasklist_lock read
 * lock is dropped before the owner is written.
 *
 * Returns true on success (tasklist_lock released), false otherwise
 * (tasklist_lock still held).
 */
static bool __try_to_set_owner(struct task_struct *tsk, struct mm_struct *mm)
{
        bool owned = false;

        task_lock(tsk);
        if (unlikely(tsk->mm != mm))
                goto unlock;

        /* tsk can't pass exit_mm/exec_mmap and exit */
        read_unlock(&tasklist_lock);
        WRITE_ONCE(mm->owner, tsk);
        lru_gen_migrate_mm(mm);
        owned = true;
unlock:
        task_unlock(tsk);
        return owned;
}

/*
 * Scan the threads of @g for one that can take ownership of @mm.
 *
 * Threads without an mm are passed over.  The scan stops at the first
 * thread that uses some other mm.  Returns true as soon as
 * __try_to_set_owner() succeeds for a thread (which also drops
 * tasklist_lock), false if no thread could be made the owner.
 */
static bool try_to_set_owner(struct task_struct *g, struct mm_struct *mm)
{
        struct task_struct *thread;

        for_each_thread(g, thread) {
                struct mm_struct *thread_mm = READ_ONCE(thread->mm);

                if (thread_mm == mm) {
                        if (__try_to_set_owner(thread, mm))
                                return true;
                        continue;
                }
                /* A different, non-NULL mm ends the scan of this group. */
                if (thread_mm)
                        break;
        }

        return false;
}

/*
 * A task is exiting.   If it owned this mm, find a new owner for the mm.
 *
 * Candidates are tried in this order: current's children, the children of
 * current's real parent (siblings), then every process.  If none can take
 * over, mm->owner is set to NULL.
 */
void mm_update_next_owner(struct mm_struct *mm)
{
        struct task_struct *g, *p = current;

        /*
         * If the exiting or execing task is not the owner, it's
         * someone else's problem.
         */
        if (mm->owner != p)
                return;
        /*
         * The current owner is exiting/execing and there are no other
         * candidates.  Do not leave the mm pointing to a possibly
         * freed task structure.
         */
        if (atomic_read(&mm->mm_users) <= 1) {
                WRITE_ONCE(mm->owner, NULL);
                return;
        }

        read_lock(&tasklist_lock);
        /*
         * Search in the children
         */
        list_for_each_entry(g, &p->children, sibling) {
                if (try_to_set_owner(g, mm))
                        goto ret;
        }
        /*
         * Search in the siblings
         */
        list_for_each_entry(g, &p->real_parent->children, sibling) {
                if (try_to_set_owner(g, mm))
                        goto ret;
        }
        /*
         * Search through everything else, we should not get here often.
         */
        for_each_process(g) {
                /* Stop early once no other user of the mm remains. */
                if (atomic_read(&mm->mm_users) <= 1)
                        break;
                if (g->flags & PF_KTHREAD)
                        continue;
                if (try_to_set_owner(g, mm))
                        goto ret;
        }
        read_unlock(&tasklist_lock);
        /*
         * We found no owner yet mm_users > 1: this implies that we are
         * most likely racing with swapoff (try_to_unuse()) or /proc or
         * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
         */
        WRITE_ONCE(mm->owner, NULL);
        /*
         * Success path: tasklist_lock was already dropped inside
         * __try_to_set_owner(), so there is nothing left to unlock here.
         */
 ret:
        return;

}
#endif /* CONFIG_MEMCG */

/*
 * Turn us into a lazy TLB process if we
 * aren't already..
 *
 * Detaches current from its mm: exit_mm_release() is called first (even
 * when there is no mm), then current->mm is cleared under task_lock() with
 * interrupts disabled, a new mm owner is looked for, and the mm reference
 * is dropped with mmput().
 */
static void exit_mm(void)
{
        struct mm_struct *mm = current->mm;

        exit_mm_release(current, mm);
        if (!mm)
                return;
        mmap_read_lock(mm);
        /* Take the lazy-TLB reference before current->mm is cleared. */
        mmgrab_lazy_tlb(mm);
        BUG_ON(mm != current->active_mm);
        /* more a memory barrier than a real lock */
        task_lock(current);
        /*
         * When a thread stops operating on an address space, the loop
         * in membarrier_private_expedited() may not observe that
         * tsk->mm, and the loop in membarrier_global_expedited() may
         * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
         * rq->membarrier_state, so those would not issue an IPI.
         * Membarrier requires a memory barrier after accessing
         * user-space memory, before clearing tsk->mm or the
         * rq->membarrier_state.
         */
        smp_mb__after_spinlock();
        local_irq_disable();
        current->mm = NULL;
        membarrier_update_current_mm(NULL);
        enter_lazy_tlb(mm, current);
        local_irq_enable();
        task_unlock(current);
        mmap_read_unlock(mm);
        mm_update_next_owner(mm);
        mmput(mm);
        if (test_thread_flag(TIF_MEMDIE))
                exit_oom_victim();
}

/*
 * Return the first thread in @p's thread group that does not have
 * PF_EXITING set, or NULL if every thread is exiting.
 */
static struct task_struct *find_alive_thread(struct task_struct *p)
{
        struct task_struct *thread;

        for_each_thread(p, thread) {
                if (thread->flags & PF_EXITING)
                        continue;
                return thread;
        }
        return NULL;
}

/*
 * Return the child reaper of @father's active pid namespace, dealing with
 * the case where @father itself is that reaper.
 *
 * - If the namespace reaper is some other task, it is returned as is.
 * - If @father is the reaper and has a thread that is not exiting, that
 *   thread becomes the namespace's child_reaper and is returned.
 * - Otherwise tasklist_lock is dropped, every task queued on @dead is
 *   released, zap_pid_ns_processes() is run for the namespace, the lock is
 *   retaken and @father is returned.
 *
 * Entered and left with tasklist_lock write-locked (see the annotations).
 */
static struct task_struct *find_child_reaper(struct task_struct *father,
                                                struct list_head *dead)
        __releases(&tasklist_lock)
        __acquires(&tasklist_lock)
{
        struct pid_namespace *pid_ns = task_active_pid_ns(father);
        struct task_struct *reaper = pid_ns->child_reaper;
        struct task_struct *p, *n;

        if (likely(reaper != father))
                return reaper;

        reaper = find_alive_thread(father);
        if (reaper) {
                pid_ns->child_reaper = reaper;
                return reaper;
        }

        write_unlock_irq(&tasklist_lock);

        /* Drain @dead now: release_task() takes tasklist_lock itself. */
        list_for_each_entry_safe(p, n, dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }

        zap_pid_ns_processes(pid_ns);
        write_lock_irq(&tasklist_lock);

        return father;
}

/*
 * When we die, we re-parent all our children, and try to:
 * 1. give them to another thread in our thread group, if such a member exists
 * 2. give it to the first ancestor process which prctl'd itself as a
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 *
 * @father:       the exiting task
 * @child_reaper: fallback returned when neither 1. nor 2. yields a task
 */
static struct task_struct *find_new_reaper(struct task_struct *father,
                                           struct task_struct *child_reaper)
{
        struct task_struct *candidate, *ancestor;
        unsigned int ns_level;

        candidate = find_alive_thread(father);
        if (candidate)
                return candidate;

        if (!father->signal->has_child_subreaper)
                return child_reaper;

        /*
         * Find the first ->is_child_subreaper ancestor in our pid_ns.
         * We can't check reaper != child_reaper to ensure we do not
         * cross the namespaces, the exiting parent could be injected
         * by setns() + fork().
         * We check pid->level, this is slightly more efficient than
         * task_active_pid_ns(reaper) != task_active_pid_ns(father).
         */
        ns_level = task_pid(father)->level;
        for (ancestor = father->real_parent;
             task_pid(ancestor)->level == ns_level;
             ancestor = ancestor->real_parent) {
                if (ancestor == &init_task)
                        break;
                if (ancestor->signal->is_child_subreaper) {
                        candidate = find_alive_thread(ancestor);
                        if (candidate)
                                return candidate;
                }
        }

        return child_reaper;
}

/*
 * Finish reparenting thread group leader @p away from the exiting @father.
 *
 * Does nothing for a task that is already EXIT_DEAD.  Otherwise the exit
 * signal is forced to SIGCHLD; an untraced zombie with an empty thread
 * group has its parent notified, and when do_notify_parent() returns
 * nonzero the task is marked EXIT_DEAD and queued for release.  Last, the
 * orphaned process group check is run for @p.
 *
 * Any that need to be release_task'd are put on the @dead list.
 */
static void reparent_leader(struct task_struct *father, struct task_struct *p,
                                struct list_head *dead)
{
        if (unlikely(p->exit_state == EXIT_DEAD))
                return;

        /* We don't want people slaying init. */
        p->exit_signal = SIGCHLD;

        /* If it has exited notify the new parent about this child's death. */
        if (!p->ptrace && p->exit_state == EXIT_ZOMBIE &&
            thread_group_empty(p) && do_notify_parent(p, p->exit_signal)) {
                p->exit_state = EXIT_DEAD;
                list_add(&p->ptrace_entry, dead);
        }

        kill_orphaned_pgrp(p, father);
}

/*
 * This does two things:
 *
 * A.  Hand all of @father's children over to a new reaper, chosen by
 *     find_child_reaper() and find_new_reaper()
 * B.  Check to see if any process groups have become orphaned
 *        as a result of our exiting, and if they have any stopped
 *        jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
 *
 * Tasks that must be passed to release_task() afterwards are collected on
 * @dead.  Called from exit_notify() with tasklist_lock write-locked.
 */
static void forget_original_parent(struct task_struct *father,
                                        struct list_head *dead)
{
        struct task_struct *p, *t, *reaper;

        if (unlikely(!list_empty(&father->ptraced)))
                exit_ptrace(father, dead);

        /* Can drop and reacquire tasklist_lock */
        reaper = find_child_reaper(father, dead);
        if (list_empty(&father->children))
                return;

        reaper = find_new_reaper(father, reaper);
        list_for_each_entry(p, &father->children, sibling) {
                for_each_thread(p, t) {
                        RCU_INIT_POINTER(t->real_parent, reaper);
                        /* Untraced threads must have had @father as ->parent. */
                        BUG_ON((!t->ptrace) != (rcu_access_pointer(t->parent) == father));
                        if (likely(!t->ptrace))
                                t->parent = t->real_parent;
                        if (t->pdeath_signal)
                                group_send_sig_info(t->pdeath_signal,
                                                    SEND_SIG_NOINFO, t,
                                                    PIDTYPE_TGID);
                }
                /*
                 * If this is a threaded reparent there is no need to
                 * notify anyone anything has happened.
                 */
                if (!same_thread_group(reaper, father))
                        reparent_leader(father, p, dead);
        }
        /* Move the whole children list over to the reaper in one go. */
        list_splice_tail_init(&father->children, &reaper->children);
}

/*
 * Send signals to all our closest relatives so that they know
 * to properly mourn us..
 *
 * @tsk:        the exiting task
 * @group_dead: nonzero when the whole thread group is gone; only then is
 *              the orphaned process group check run for the leader
 *
 * Under tasklist_lock: reparents the children, marks @tsk EXIT_ZOMBIE and
 * decides whether it is auto-reaped.  Auto-reaped tasks are marked
 * EXIT_DEAD and, together with anything the reparenting queued, released
 * after the lock has been dropped.
 */
static void exit_notify(struct task_struct *tsk, int group_dead)
{
        bool autoreap;
        struct task_struct *p, *n;
        LIST_HEAD(dead);

        write_lock_irq(&tasklist_lock);
        forget_original_parent(tsk, &dead);

        if (group_dead)
                kill_orphaned_pgrp(tsk->group_leader, NULL);

        tsk->exit_state = EXIT_ZOMBIE;

        if (unlikely(tsk->ptrace)) {
                /*
                 * Traced: use the task's own exit_signal only for a lone
                 * leader that was not ptrace-reparented, SIGCHLD otherwise.
                 */
                int sig = thread_group_leader(tsk) &&
                                thread_group_empty(tsk) &&
                                !ptrace_reparented(tsk) ?
                        tsk->exit_signal : SIGCHLD;
                autoreap = do_notify_parent(tsk, sig);
        } else if (thread_group_leader(tsk)) {
                /* A leader with live threads left is neither notified nor reaped here. */
                autoreap = thread_group_empty(tsk) &&
                        do_notify_parent(tsk, tsk->exit_signal);
        } else {
                autoreap = true;
                /* untraced sub-thread */
                do_notify_pidfd(tsk);
        }

        if (autoreap) {
                tsk->exit_state = EXIT_DEAD;
                list_add(&tsk->ptrace_entry, &dead);
        }

        /* mt-exec, de_thread() is waiting for group leader */
        if (unlikely(tsk->signal->notify_count < 0))
                wake_up_process(tsk->signal->group_exec_task);
        write_unlock_irq(&tasklist_lock);

        /* release_task() takes tasklist_lock itself, so reap only now. */
        list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
                list_del_init(&p->ptrace_entry);
                release_task(p);
        }
}

#ifdef CONFIG_DEBUG_STACK_USAGE
/*
 * Return the number of bytes between the end of @p's stack and the first
 * non-zero word found when scanning away from that end.  The word at the
 * end itself (the canary) is stepped over before the scan starts; the
 * scan direction depends on CONFIG_STACK_GROWSUP.
 */
unsigned long stack_not_used(struct task_struct *p)
{
        unsigned long *end = end_of_stack(p);
        unsigned long *probe = end;

        do {        /* Skip over canary */
# ifdef CONFIG_STACK_GROWSUP
                probe--;
# else
                probe++;
# endif
        } while (*probe == 0);

# ifdef CONFIG_STACK_GROWSUP
        return (unsigned long)end - (unsigned long)probe;
# else
        return (unsigned long)probe - (unsigned long)end;
# endif
}

/*
 * Count the maximum pages reached in kernel stacks
 *
 * Bumps the vm event whose bucket (1K, 2K, ... 64K, REST) covers
 * @used_stack bytes.  Buckets that cannot occur for the configured
 * THREAD_SIZE are compiled out, which means a value above the last
 * compiled-in bucket is simply not counted.  The whole body is empty
 * without CONFIG_VM_EVENT_COUNTERS.
 */
static inline void kstack_histogram(unsigned long used_stack)
{
#ifdef CONFIG_VM_EVENT_COUNTERS
        if (used_stack <= 1024)
                count_vm_event(KSTACK_1K);
#if THREAD_SIZE > 1024
        else if (used_stack <= 2048)
                count_vm_event(KSTACK_2K);
#endif
#if THREAD_SIZE > 2048
        else if (used_stack <= 4096)
                count_vm_event(KSTACK_4K);
#endif
#if THREAD_SIZE > 4096
        else if (used_stack <= 8192)
                count_vm_event(KSTACK_8K);
#endif
#if THREAD_SIZE > 8192
        else if (used_stack <= 16384)
                count_vm_event(KSTACK_16K);
#endif
#if THREAD_SIZE > 16384
        else if (used_stack <= 32768)
                count_vm_event(KSTACK_32K);
#endif
#if THREAD_SIZE > 32768
        else if (used_stack <= 65536)
                count_vm_event(KSTACK_64K);
#endif
#if THREAD_SIZE > 65536
        else
                count_vm_event(KSTACK_REST);
#endif
#endif /* CONFIG_VM_EVENT_COUNTERS */
}

/*
 * Record the exiting task's stack usage: feed the used amount into the
 * stack histogram and, if this task left less free stack than any task
 * seen so far, log it and remember the new low-water mark.
 */
static void check_stack_usage(void)
{
        static DEFINE_SPINLOCK(low_water_lock);
        static int lowest_to_date = THREAD_SIZE;
        unsigned long unused = stack_not_used(current);

        kstack_histogram(THREAD_SIZE - unused);

        /* Unlocked pre-check; the comparison is repeated under the lock. */
        if (unused < lowest_to_date) {
                spin_lock(&low_water_lock);
                if (unused < lowest_to_date) {
                        pr_info("%s (%d) used greatest stack depth: %lu bytes left\n",
                                current->comm, task_pid_nr(current), unused);
                        lowest_to_date = unused;
                }
                spin_unlock(&low_water_lock);
        }
}
#else
/* No-op stand-in used when CONFIG_DEBUG_STACK_USAGE is not set. */
static inline void check_stack_usage(void) {}
#endif

/*
 * Account for @tsk leaving its thread group: decrement
 * signal->quick_threads under siglock.  If that was the last one and no
 * group exit has been flagged yet, flag one now with @code as the group
 * exit code and reset the group stop count.
 */
static void synchronize_group_exit(struct task_struct *tsk, long code)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *sighand = tsk->sighand;

        spin_lock_irq(&sighand->siglock);
        if (--sig->quick_threads == 0 &&
            !(sig->flags & SIGNAL_GROUP_EXIT)) {
                sig->flags = SIGNAL_GROUP_EXIT;
                sig->group_exit_code = code;
                sig->group_stop_count = 0;
        }
        spin_unlock_irq(&sighand->siglock);
}

/*
 * Terminate the current task with exit code @code.
 *
 * Tears down the task's resources in a fixed order (coredump sync, signals,
 * mm, IPC, files, fs, namespaces, perf, cgroup, ...), notifies relatives
 * through exit_notify() and finally calls do_task_dead().  Does not return.
 *
 * "group_dead" is true when this task was the last live thread of its
 * group (signal->live dropped to zero); several steps run only then.
 */
void __noreturn do_exit(long code)
{
        struct task_struct *tsk = current;
        int group_dead;

        WARN_ON(irqs_disabled());

        synchronize_group_exit(tsk, code);

        WARN_ON(tsk->plug);

        kcov_task_exit(tsk);
        kmsan_task_exit(tsk);

        coredump_task_exit(tsk);
        ptrace_event(PTRACE_EVENT_EXIT, code);
        user_events_exit(tsk);

        io_uring_files_cancel();
        exit_signals(tsk);  /* sets PF_EXITING */

        seccomp_filter_release(tsk);

        acct_update_integrals(tsk);
        group_dead = atomic_dec_and_test(&tsk->signal->live);
        if (group_dead) {
                /*
                 * If the last thread of global init has exited, panic
                 * immediately to get a useable coredump.
                 */
                if (unlikely(is_global_init(tsk)))
                        panic("Attempted to kill init! exitcode=0x%08x\n",
                                tsk->signal->group_exit_code ?: (int)code);

#ifdef CONFIG_POSIX_TIMERS
                hrtimer_cancel(&tsk->signal->real_timer);
                exit_itimers(tsk);
#endif
                /* Must happen before exit_mm() below clears tsk->mm. */
                if (tsk->mm)
                        setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
        }
        acct_collect(code, group_dead);
        if (group_dead)
                tty_audit_exit();
        audit_free(tsk);

        tsk->exit_code = code;
        taskstats_exit(tsk, group_dead);

        exit_mm();

        if (group_dead)
                acct_process();
        trace_sched_process_exit(tsk);

        exit_sem(tsk);
        exit_shm(tsk);
        exit_files(tsk);
        exit_fs(tsk);
        if (group_dead)
                disassociate_ctty(1);
        exit_task_namespaces(tsk);
        exit_task_work(tsk);
        exit_thread(tsk);

        /*
         * Flush inherited counters to the parent - before the parent
         * gets woken up by child-exit notifications.
         *
         * because of cgroup mode, must be called before cgroup_exit()
         */
        perf_event_exit_task(tsk);

        sched_autogroup_exit_task(tsk);
        cgroup_exit(tsk);

        /*
         * FIXME: do that only when needed, using sched_exit tracepoint
         */
        flush_ptrace_hw_breakpoint(tsk);

        exit_tasks_rcu_start();
        exit_notify(tsk, group_dead);
        proc_exit_connector(tsk);
        mpol_put_task_policy(tsk);
#ifdef CONFIG_FUTEX
        if (unlikely(current->pi_state_cache))
                kfree(current->pi_state_cache);
#endif
        /*
         * Make sure we are holding no locks:
         */
        debug_check_no_locks_held();

        if (tsk->io_context)
                exit_io_context(tsk);

        if (tsk->splice_pipe)
                free_pipe_info(tsk->splice_pipe);

        if (tsk->task_frag.page)
                put_page(tsk->task_frag.page);

        exit_task_stack_account(tsk);

        check_stack_usage();
        /* Preemption stays disabled from here until do_task_dead(). */
        preempt_disable();
        if (tsk->nr_dirtied)
                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
        exit_rcu();
        exit_tasks_rcu_finish();

        lockdep_free_task(tsk);
        do_task_dead();
}

/*
 * make_task_dead - take the current task off the CPU after a fatal error.
 * @signr: exit code forwarded to do_exit() on the normal path.
 *
 * Panics outright when called from interrupt context or for the idle task
 * (pid 0).  Otherwise it re-enables interrupts and resets the preempt count
 * if the caller left them in a bad state, enforces the oops limit, and then
 * either parks the task (recursive fault while already exiting) or hands it
 * to do_exit().  This function never returns.
 */
void __noreturn make_task_dead(int signr)
{
        /*
         * Take the task off the cpu after something catastrophic has
         * happened.
         *
         * We can get here from a kernel oops, sometimes with preemption off.
         * Start by checking for critical errors.
         * Then fix up important state like USER_DS and preemption.
         * Then do everything else.
         */
        struct task_struct *tsk = current;
        unsigned int limit;

        /* Unrecoverable contexts: there is no task we could safely kill. */
        if (unlikely(in_interrupt()))
                panic("Aiee, killing interrupt handler!");
        if (unlikely(!tsk->pid))
                panic("Attempted to kill the idle task!");

        /* Repair IRQ and preemption state before running the exit path. */
        if (unlikely(irqs_disabled())) {
                pr_info("note: %s[%d] exited with irqs disabled\n",
                        current->comm, task_pid_nr(current));
                local_irq_enable();
        }
        if (unlikely(in_atomic())) {
                pr_info("note: %s[%d] exited with preempt_count %d\n",
                        current->comm, task_pid_nr(current),
                        preempt_count());
                preempt_count_set(PREEMPT_ENABLED);
        }

        /*
         * Every time the system oopses, if the oops happens while a reference
         * to an object was held, the reference leaks.
         * If the oops doesn't also leak memory, repeated oopsing can cause
         * reference counters to wrap around (if they're not using refcount_t).
         * This means that repeated oopsing can make unexploitable-looking bugs
         * exploitable through repeated oopsing.
         * To make sure this can't happen, place an upper bound on how often the
         * kernel may oops without panic().
         *
         * A limit of zero disables the check.  @limit is unsigned, so it is
         * printed with %u.
         */
        limit = READ_ONCE(oops_limit);
        if (atomic_inc_return(&oops_count) >= limit && limit)
                panic("Oopsed too often (kernel.oops_limit is %u)", limit);

        /*
         * We're taking recursive faults here in make_task_dead. Safest is to just
         * leave this task alone and wait for reboot.
         */
        if (unlikely(tsk->flags & PF_EXITING)) {
                pr_alert("Fixing recursive fault but reboot is needed!\n");
                futex_exit_recursive(tsk);
                tsk->exit_state = EXIT_DEAD;
                /* Extra rcu_users reference taken before parking the task. */
                refcount_inc(&tsk->rcu_users);
                do_task_dead();
        }

        do_exit(signr);
}

/*
 * exit(2) entry point.  Only the low eight bits of @error_code are kept;
 * they are shifted into bits 8-15 of the code handed to do_exit().
 */
SYSCALL_DEFINE1(exit, int, error_code)
{
        int code = (error_code & 0xff) << 8;

        do_exit(code);
}

/*
 * Bring down every thread of the current thread group and then exit.
 * Used for fatal signals and by sys_exit_group (below).
 *
 * If a group exit is already in progress its recorded code wins; if the
 * group has an exec task set (->group_exec_task) the code becomes 0.
 * Otherwise this thread records @exit_code, marks the group as exiting
 * and zaps the other threads, all under ->siglock.
 */
void __noreturn
do_group_exit(int exit_code)
{
        struct signal_struct *group = current->signal;

        if (group->flags & SIGNAL_GROUP_EXIT) {
                exit_code = group->group_exit_code;
        } else if (group->group_exec_task) {
                exit_code = 0;
        } else {
                struct sighand_struct *const sh = current->sighand;

                spin_lock_irq(&sh->siglock);
                if (group->flags & SIGNAL_GROUP_EXIT) {
                        /* Lost the race: another thread started the exit. */
                        exit_code = group->group_exit_code;
                } else if (group->group_exec_task) {
                        exit_code = 0;
                } else {
                        /* We are the one initiating the group exit. */
                        group->group_exit_code = exit_code;
                        group->flags = SIGNAL_GROUP_EXIT;
                        zap_other_threads(current);
                }
                spin_unlock_irq(&sh->siglock);
        }

        do_exit(exit_code);
        /* NOTREACHED */
}

/*
 * exit_group(2): terminate every thread in the caller's thread group.
 * An external wait4()-er sees the correct exit code even when the caller
 * is not the thread group leader.  Only the low eight bits of @error_code
 * are used, placed in bits 8-15 of the exit code.
 */
SYSCALL_DEFINE1(exit_group, int, error_code)
{
        int code = (error_code & 0xff) << 8;

        do_group_exit(code);
        /* NOTREACHED */
        return 0;
}

/*
 * Does @p match the pid selector in @wo?  PIDTYPE_MAX matches every task;
 * otherwise @p's pid of type ->wo_type must equal ->wo_pid.
 */
static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
{
        if (wo->wo_type == PIDTYPE_MAX)
                return 1;

        return task_pid_type(p, wo->wo_type) == wo->wo_pid;
}

/*
 * Decide whether @p is a child this wait call may report on.
 * Returns 1 if eligible, 0 otherwise.
 */
static int
eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
{
        bool is_clone_child, wants_clone;

        if (!eligible_pid(wo, p))
                return 0;

        /*
         * __WALL, or a task traced by us, matches both clone and
         * non-clone children.
         */
        if (ptrace || (wo->wo_flags & __WALL))
                return 1;

        /*
         * A "clone" child is one that reports to its parent with a signal
         * other than SIGCHLD, or a non-leader thread which we can only see
         * if it is traced by us.  __WCLONE selects *only* clone children;
         * without it *only* non-clone children are selected.
         */
        is_clone_child = p->exit_signal != SIGCHLD;
        wants_clone = wo->wo_flags & __WCLONE;

        return is_clone_child == wants_clone;
}

/*
 * Handle sys_wait4 work for one task in state EXIT_ZOMBIE.  We hold
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * On success the return value is @p's pid as seen from the caller's pid
 * namespace; the exit status is stored in wo->wo_stat (non-WNOWAIT path
 * only) and, if wo->wo_info is set, decoded into it.
 */
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
        int state, status;
        /* Sample pid/uid up front: @p may be released before we report them. */
        pid_t pid = task_pid_vnr(p);
        uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
        struct waitid_info *infop;

        /* Caller did not ask for exited children. */
        if (!likely(wo->wo_flags & WEXITED))
                return 0;

        /*
         * WNOWAIT: report the status but leave the zombie in place.  A task
         * reference keeps @p valid across getrusage() once tasklist_lock is
         * dropped.  Note that wo->wo_stat is not written on this path.
         */
        if (unlikely(wo->wo_flags & WNOWAIT)) {
                status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                        ? p->signal->group_exit_code : p->exit_code;
                get_task_struct(p);
                read_unlock(&tasklist_lock);
                sched_annotate_sleep();
                if (wo->wo_rusage)
                        getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
                put_task_struct(p);
                goto out_info;
        }
        /*
         * Move the task's state to DEAD/TRACE, only one thread can do this.
         * Losing the cmpxchg means somebody else is already reaping @p.
         */
        state = (ptrace_reparented(p) && thread_group_leader(p)) ?
                EXIT_TRACE : EXIT_DEAD;
        if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE)
                return 0;
        /*
         * We own this thread, nobody else can reap it.
         */
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();

        /*
         * Check thread_group_leader() to exclude the traced sub-threads.
         */
        if (state == EXIT_DEAD && thread_group_leader(p)) {
                struct signal_struct *sig = p->signal;
                struct signal_struct *psig = current->signal;
                unsigned long maxrss;
                u64 tgutime, tgstime;

                /*
                 * The resource counters for the group leader are in its
                 * own task_struct.  Those for dead threads in the group
                 * are in its signal_struct, as are those for the child
                 * processes it has previously reaped.  All these
                 * accumulate in the parent's signal_struct c* fields.
                 *
                 * We don't bother to take a lock here to protect these
                 * p->signal fields because the whole thread group is dead
                 * and nobody can change them.
                 *
                 * psig->stats_lock also protects us from our sub-threads
                 * which can reap other children at the same time.
                 *
                 * We use thread_group_cputime_adjusted() to get times for
                 * the thread group, which consolidates times for all threads
                 * in the group including the group leader.
                 */
                thread_group_cputime_adjusted(p, &tgutime, &tgstime);
                write_seqlock_irq(&psig->stats_lock);
                psig->cutime += tgutime + sig->cutime;
                psig->cstime += tgstime + sig->cstime;
                psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
                psig->cmin_flt +=
                        p->min_flt + sig->min_flt + sig->cmin_flt;
                psig->cmaj_flt +=
                        p->maj_flt + sig->maj_flt + sig->cmaj_flt;
                psig->cnvcsw +=
                        p->nvcsw + sig->nvcsw + sig->cnvcsw;
                psig->cnivcsw +=
                        p->nivcsw + sig->nivcsw + sig->cnivcsw;
                psig->cinblock +=
                        task_io_get_inblock(p) +
                        sig->inblock + sig->cinblock;
                psig->coublock +=
                        task_io_get_oublock(p) +
                        sig->oublock + sig->coublock;
                /* cmaxrss is a high-water mark, not a sum. */
                maxrss = max(sig->maxrss, sig->cmaxrss);
                if (psig->cmaxrss < maxrss)
                        psig->cmaxrss = maxrss;
                task_io_accounting_add(&psig->ioac, &p->ioac);
                task_io_accounting_add(&psig->ioac, &sig->ioac);
                write_sequnlock_irq(&psig->stats_lock);
        }

        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        /* A group exit code takes precedence over the task's own code. */
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
        wo->wo_stat = status;

        if (state == EXIT_TRACE) {
                write_lock_irq(&tasklist_lock);
                /* We dropped tasklist, ptracer could die and untrace */
                ptrace_unlink(p);

                /* If parent wants a zombie, don't release it now */
                state = EXIT_ZOMBIE;
                if (do_notify_parent(p, p->exit_signal))
                        state = EXIT_DEAD;
                p->exit_state = state;
                write_unlock_irq(&tasklist_lock);
        }
        if (state == EXIT_DEAD)
                release_task(p);

out_info:
        /*
         * Decode the wait status for waitid(): low 7 bits zero means a
         * normal exit with the code in bits 8+, otherwise the low 7 bits
         * hold the signal number and bit 0x80 flags a core dump.
         */
        infop = wo->wo_info;
        if (infop) {
                if ((status & 0x7f) == 0) {
                        infop->cause = CLD_EXITED;
                        infop->status = status >> 8;
                } else {
                        infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
                        infop->status = status & 0x7f;
                }
                infop->pid = pid;
                infop->uid = uid;
        }

        return pid;
}

/*
 * Return a pointer to the field holding @p's stop code, or NULL if @p is
 * not in the relevant stopped state.  For a ptrace wait that is
 * ->exit_code of a traced task that is not JOBCTL_LISTENING; otherwise it
 * is the group's ->group_exit_code while SIGNAL_STOP_STOPPED is set.
 */
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
        if (!ptrace) {
                if (p->signal->flags & SIGNAL_STOP_STOPPED)
                        return &p->signal->group_exit_code;
                return NULL;
        }

        if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING))
                return &p->exit_code;

        return NULL;
}

/**
 * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
 * @wo: wait options
 * @ptrace: is the wait for ptrace
 * @p: task to wait for
 *
 * Handle sys_wait4() work for @p in state %TASK_STOPPED or %TASK_TRACED.
 *
 * Unless %WNOWAIT is set, the stop code is consumed (reset to 0) and
 * @wo->wo_stat is filled with the encoded stop status.
 *
 * CONTEXT:
 * read_lock(&tasklist_lock), which is released if return value is
 * non-zero.  Also, grabs and releases @p->sighand->siglock.
 *
 * RETURNS:
 * 0 if wait condition didn't exist and search for other wait conditions
 * should continue.  Non-zero return, -errno on failure and @p's pid on
 * success, implies that tasklist_lock is released and wait condition
 * search should terminate.
 */
static int wait_task_stopped(struct wait_opts *wo,
                                int ptrace, struct task_struct *p)
{
        struct waitid_info *infop;
        int exit_code, *p_code, why;
        uid_t uid = 0; /* unneeded, required by compiler */
        pid_t pid;

        /*
         * Traditionally we see ptrace'd stopped tasks regardless of options.
         */
        if (!ptrace && !(wo->wo_flags & WUNTRACED))
                return 0;

        /* Cheap unlocked check first; it is repeated under siglock below. */
        if (!task_stopped_code(p, ptrace))
                return 0;

        exit_code = 0;
        spin_lock_irq(&p->sighand->siglock);

        p_code = task_stopped_code(p, ptrace);
        if (unlikely(!p_code))
                goto unlock_sig;

        /* A zero code means the stop was already reported. */
        exit_code = *p_code;
        if (!exit_code)
                goto unlock_sig;

        /* Consume the stop code unless the caller only wants to peek. */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                *p_code = 0;

        uid = from_kuid_munged(current_user_ns(), task_uid(p));
unlock_sig:
        spin_unlock_irq(&p->sighand->siglock);
        if (!exit_code)
                return 0;

        /*
         * Now we are pretty sure this task is interesting.
         * Make sure it doesn't get reaped out from under us while we
         * give up the lock and then examine it below.  We don't want to
         * keep holding onto the tasklist_lock while we call getrusage and
         * possibly take page faults for user memory.
         */
        get_task_struct(p);
        pid = task_pid_vnr(p);
        why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        /* Stopped status encoding: stop code in bits 8+, 0x7f in the low byte. */
        if (likely(!(wo->wo_flags & WNOWAIT)))
                wo->wo_stat = (exit_code << 8) | 0x7f;

        infop = wo->wo_info;
        if (infop) {
                infop->cause = why;
                infop->status = exit_code;
                infop->pid = pid;
                infop->uid = uid;
        }
        return pid;
}

/*
 * Handle do_wait work for one task in a live, non-stopped state.
 * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
 * the lock and this task is uninteresting.  If we return nonzero, we have
 * released the lock and the system call should return.
 *
 * Reports a pending SIGNAL_STOP_CONTINUED event on @p's thread group.  The
 * flag is cleared unless WNOWAIT is set.  On success @p's pid is returned
 * and either wo->wo_info is filled in or, when there is no info buffer,
 * wo->wo_stat is set to 0xffff.
 */
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
        struct waitid_info *infop;
        pid_t pid;
        uid_t uid;

        /* Caller is not interested in continued children. */
        if (!unlikely(wo->wo_flags & WCONTINUED))
                return 0;

        /* Unlocked fast-path check; repeated under siglock below. */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
                return 0;

        spin_lock_irq(&p->sighand->siglock);
        /* Re-check with the lock held.  */
        if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) {
                spin_unlock_irq(&p->sighand->siglock);
                return 0;
        }
        /* Consume the event unless the caller only wants to peek. */
        if (!unlikely(wo->wo_flags & WNOWAIT))
                p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
        uid = from_kuid_munged(current_user_ns(), task_uid(p));
        spin_unlock_irq(&p->sighand->siglock);

        /* Pin @p so it stays valid for getrusage() after the lock is dropped. */
        pid = task_pid_vnr(p);
        get_task_struct(p);
        read_unlock(&tasklist_lock);
        sched_annotate_sleep();
        if (wo->wo_rusage)
                getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
        put_task_struct(p);

        infop = wo->wo_info;
        if (!infop) {
                wo->wo_stat = 0xffff;
        } else {
                infop->cause = CLD_CONTINUED;
                infop->pid = pid;
                infop->uid = uid;
                infop->status = SIGCONT;
        }
        return pid;
}

/*
 * Consider @p for a wait by @parent.
 *
 * @ptrace is nonzero when @p is being examined as a tracee of the caller
 * rather than as a natural child.
 *
 * -ECHILD should be in ->notask_error before the first call.
 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
 * Returns zero if the search for a child should continue;
 * then ->notask_error is 0 if @p is an eligible child,
 * or still -ECHILD.
 */
static int wait_consider_task(struct wait_opts *wo, int ptrace,
                                struct task_struct *p)
{
        /*
         * We can race with wait_task_zombie() from another thread.
         * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
         * can't confuse the checks below.
         */
        int exit_state = READ_ONCE(p->exit_state);
        int ret;

        /* Already being released: nothing left to report. */
        if (unlikely(exit_state == EXIT_DEAD))
                return 0;

        ret = eligible_child(wo, ptrace, p);
        if (!ret)
                return ret;

        if (unlikely(exit_state == EXIT_TRACE)) {
                /*
                 * ptrace == 0 means we are the natural parent. In this case
                 * we should clear notask_error, debugger will notify us.
                 */
                if (likely(!ptrace))
                        wo->notask_error = 0;
                return 0;
        }

        if (likely(!ptrace) && unlikely(p->ptrace)) {
                /*
                 * If it is traced by its real parent's group, just pretend
                 * the caller is ptrace_do_wait() and reap this child if it
                 * is zombie.
                 *
                 * This also hides group stop state from real parent; otherwise
                 * a single stop can be reported twice as group and ptrace stop.
                 * If a ptracer wants to distinguish these two events for its
                 * own children it should create a separate process which takes
                 * the role of real parent.
                 */
                if (!ptrace_reparented(p))
                        ptrace = 1;
        }

        /* slay zombie? */
        if (exit_state == EXIT_ZOMBIE) {
                /* we don't reap group leaders with subthreads */
                if (!delay_group_leader(p)) {
                        /*
                         * A zombie ptracee is only visible to its ptracer.
                         * Notification and reaping will be cascaded to the
                         * real parent when the ptracer detaches.
                         */
                        if (unlikely(ptrace) || likely(!p->ptrace))
                                return wait_task_zombie(wo, p);
                }

                /*
                 * Allow access to stopped/continued state via zombie by
                 * falling through.  Clearing of notask_error is complex.
                 *
                 * When !@ptrace:
                 *
                 * If WEXITED is set, notask_error should naturally be
                 * cleared.  If not, subset of WSTOPPED|WCONTINUED is set,
                 * so, if there are live subthreads, there are events to
                 * wait for.  If all subthreads are dead, it's still safe
                 * to clear - this function will be called again in finite
                 * amount time once all the subthreads are released and
                 * will then return without clearing.
                 *
                 * When @ptrace:
                 *
                 * Stopped state is per-task and thus can't change once the
                 * target task dies.  Only continued and exited can happen.
                 * Clear notask_error if WCONTINUED | WEXITED.
                 */
                if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
                        wo->notask_error = 0;
        } else {
                /*
                 * @p is alive and it's gonna stop, continue or exit, so
                 * there always is something to wait for.
                 */
                wo->notask_error = 0;
        }

        /*
         * Wait for stopped.  Depending on @ptrace, different stopped state
         * is used and the two don't interact with each other.
         */
        ret = wait_task_stopped(wo, ptrace, p);
        if (ret)
                return ret;

        /*
         * Wait for continued.  There's only one continued state and the
         * ptracer can consume it which can confuse the real parent.  Don't
         * use WCONTINUED from ptracer.  You don't need or want it.
         */
        return wait_task_continued(wo, p);
}

/*
 * Run the do_wait() checks over the natural children of one thread, @tsk.
 *
 * ->notask_error must hold -ECHILD before the first call.
 * A nonzero return is final and means tasklist_lock has been unlocked.
 * A zero return means the search should go on; ->notask_error is then 0
 * if any eligible child was seen, or still -ECHILD.
 */
static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *child;
        int ret = 0;

        list_for_each_entry(child, &tsk->children, sibling) {
                ret = wait_consider_task(wo, 0, child);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Same as do_wait_thread(), but walks the tasks on @tsk's ->ptraced list
 * and considers each of them as a tracee (ptrace == 1).
 */
static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
{
        struct task_struct *tracee;
        int ret = 0;

        list_for_each_entry(tracee, &tsk->ptraced, ptrace_entry) {
                ret = wait_consider_task(wo, 1, tracee);
                if (ret)
                        break;
        }

        return ret;
}

/*
 * Should a state change of @p wake the waiter described by @wo?
 * @p must match the waiter's pid selector, and with __WNOTHREAD set the
 * waiting task (->child_wait.private) must be @p's ->parent.
 */
bool pid_child_should_wake(struct wait_opts *wo, struct task_struct *p)
{
        if (!eligible_pid(wo, p))
                return false;

        return !(wo->wo_flags & __WNOTHREAD) ||
               wo->child_wait.private == p->parent;
}

/*
 * Wait-queue callback installed by do_wait().  @key is the task whose
 * state changed; the waiter is woken through default_wake_function() only
 * if pid_child_should_wake() accepts that task, otherwise 0 is returned.
 */
static int child_wait_callback(wait_queue_entry_t *wait, unsigned mode,
                                int sync, void *key)
{
        struct task_struct *child = key;
        struct wait_opts *wo;

        wo = container_of(wait, struct wait_opts, child_wait);
        if (!pid_child_should_wake(wo, child))
                return 0;

        return default_wake_function(wait, mode, sync, key);
}

void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
{
        __wake_up_sync_key(&parent->signal->wait_chldexit,
                           TASK_INTERRUPTIBLE, p);
}

/*
 * Is @target waitable by the caller?  The relevant parent is ->parent for
 * a ptrace wait and ->real_parent otherwise.  It must be the current task
 * itself or, unless __WNOTHREAD is set, any thread in current's group.
 */
static bool is_effectively_child(struct wait_opts *wo, bool ptrace,
                                 struct task_struct *target)
{
        struct task_struct *parent;

        if (ptrace)
                parent = target->parent;
        else
                parent = target->real_parent;

        if (current == parent)
                return true;
        if (wo->wo_flags & __WNOTHREAD)
                return false;

        return same_thread_group(current, parent);
}

/*
 * Fast path for PIDTYPE_PID waits: look the target up directly instead of
 * walking the child and tracee lists.
 *
 * The pid is first tried as a thread-group leader that is a natural child,
 * then as an individual task that is being ptraced.  Returns the nonzero
 * result of wait_consider_task() if either attempt is final, else 0.
 */
static int do_wait_pid(struct wait_opts *wo)
{
        struct task_struct *tsk;
        int ret;

        /* Natural child, identified by its TGID. */
        tsk = pid_task(wo->wo_pid, PIDTYPE_TGID);
        if (tsk && is_effectively_child(wo, false, tsk)) {
                ret = wait_consider_task(wo, false, tsk);
                if (ret)
                        return ret;
        }

        /* Tracee, identified by its own PID. */
        tsk = pid_task(wo->wo_pid, PIDTYPE_PID);
        if (tsk && tsk->ptrace && is_effectively_child(wo, true, tsk)) {
                ret = wait_consider_task(wo, true, tsk);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * One pass over the caller's waitable tasks.
 *
 * Returns the nonzero result of the per-task handlers when an event was
 * reported (or failed); the callee has already dropped tasklist_lock in
 * that case, which is why those paths return without unlocking here.
 * Otherwise returns ->notask_error (-ECHILD if nothing can match), except
 * that -ERESTARTSYS is returned when something may match later and
 * WNOHANG is not set, so that the caller can sleep and retry.
 */
long __do_wait(struct wait_opts *wo)
{
        long retval;

        /*
         * If there is nothing that can match our criteria, just get out.
         * We will clear ->notask_error to zero if we see any child that
         * might later match our criteria, even if we are not able to reap
         * it yet.
         */
        wo->notask_error = -ECHILD;
        if ((wo->wo_type < PIDTYPE_MAX) &&
           (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
                goto notask;

        read_lock(&tasklist_lock);

        if (wo->wo_type == PIDTYPE_PID) {
                /* Single target: direct lookup instead of list walks. */
                retval = do_wait_pid(wo);
                if (retval)
                        return retval;
        } else {
                struct task_struct *tsk = current;

                /*
                 * Check the children and tracees of every thread in our
                 * group, or of the current thread only with __WNOTHREAD.
                 */
                do {
                        retval = do_wait_thread(wo, tsk);
                        if (retval)
                                return retval;

                        retval = ptrace_do_wait(wo, tsk);
                        if (retval)
                                return retval;

                        if (wo->wo_flags & __WNOTHREAD)
                                break;
                } while_each_thread(current, tsk);
        }
        read_unlock(&tasklist_lock);

notask:
        retval = wo->notask_error;
        /* Something may match later and the caller is willing to block. */
        if (!retval && !(wo->wo_flags & WNOHANG))
                return -ERESTARTSYS;

        return retval;
}

/*
 * Blocking front end for __do_wait().  The caller is queued on its own
 * ->wait_chldexit with child_wait_callback() as the wake filter, then
 * __do_wait() is retried for as long as it returns -ERESTARTSYS and no
 * signal is pending, sleeping in between.
 */
static long do_wait(struct wait_opts *wo)
{
        int retval;

        trace_sched_process_wait(wo->wo_pid);

        init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
        wo->child_wait.private = current;
        add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

        for (;;) {
                /* Set the state before scanning so a wake-up is not lost. */
                set_current_state(TASK_INTERRUPTIBLE);
                retval = __do_wait(wo);
                if (retval != -ERESTARTSYS || signal_pending(current))
                        break;
                schedule();
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
        return retval;
}

/*
 * Validate waitid() arguments and fill in @wo.
 *
 * Returns 0 on success, -EINVAL for unknown option bits, for a missing
 * WEXITED/WSTOPPED/WCONTINUED selector, for an out-of-range @upid or for
 * an unknown @which, or the error from pidfd_get_pid() for P_PIDFD.
 * On success @wo->wo_pid may hold a pid reference (or NULL) that the
 * caller is expected to drop with put_pid().  A pidfd opened with
 * O_NONBLOCK adds WNOHANG to the stored flags.
 */
int kernel_waitid_prepare(struct wait_opts *wo, int which, pid_t upid,
                          struct waitid_info *infop, int options,
                          struct rusage *ru)
{
        const unsigned int known = WNOHANG | WNOWAIT | WEXITED | WSTOPPED |
                                   WCONTINUED | __WNOTHREAD | __WCLONE |
                                   __WALL;
        unsigned int f_flags = 0;
        struct pid *pid = NULL;
        enum pid_type type;

        if (options & ~known)
                return -EINVAL;
        if (!(options & (WEXITED | WSTOPPED | WCONTINUED)))
                return -EINVAL;

        switch (which) {
        case P_ALL:
                type = PIDTYPE_MAX;
                break;

        case P_PID:
                if (upid <= 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
                break;

        case P_PGID:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PGID;
                /* A zero id selects the caller's own process group. */
                if (!upid)
                        pid = get_task_pid(current, PIDTYPE_PGID);
                else
                        pid = find_get_pid(upid);
                break;

        case P_PIDFD:
                if (upid < 0)
                        return -EINVAL;

                type = PIDTYPE_PID;
                pid = pidfd_get_pid(upid, &f_flags);
                if (IS_ERR(pid))
                        return PTR_ERR(pid);
                break;

        default:
                return -EINVAL;
        }

        wo->wo_type = type;
        wo->wo_pid = pid;
        wo->wo_info = infop;
        wo->wo_rusage = ru;
        wo->wo_flags = options;
        if (f_flags & O_NONBLOCK)
                wo->wo_flags |= WNOHANG;

        return 0;
}

/*
 * In-kernel waitid(): prepare the wait options, run do_wait() and drop
 * the pid reference.  When the wait came back empty only because
 * kernel_waitid_prepare() added WNOHANG on its own (the caller did not
 * pass it in @options), the result is turned into -EAGAIN.
 */
static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
                          int options, struct rusage *ru)
{
        struct wait_opts wo;
        long ret = kernel_waitid_prepare(&wo, which, upid, infop, options, ru);

        if (ret)
                return ret;

        ret = do_wait(&wo);
        if (ret == 0 && (wo.wo_flags & WNOHANG) && !(options & WNOHANG))
                ret = -EAGAIN;

        put_pid(wo.wo_pid);
        return ret;
}

/*
 * waitid(2) entry point.
 *
 * A positive result from kernel_waitid() (a child was reported) becomes a
 * return value of 0 with si_signo set to SIGCHLD; the rusage, if
 * requested, is copied out at that point.  When @infop is non-NULL it is
 * always written, so with no reported child the user sees the
 * zero-initialised fields of @info and si_signo == 0.
 */
SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
                infop, int, options, struct rusage __user *, ru)
{
        struct rusage r;
        /* Designated initialiser zeroes every field of info. */
        struct waitid_info info = {.status = 0};
        long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
        int signo = 0;

        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
                        return -EFAULT;
        }
        if (!infop)
                return err;

        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        /* Each unsafe_put_user() jumps to Efault on failure. */
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;
Efault:
        /* The user-access window must be closed on the error path too. */
        user_write_access_end();
        return -EFAULT;
}

/*
 * In-kernel wait4().
 *
 * @upid selects the target: -1 any child, > 0 that pid, 0 the caller's
 * process group, < -1 the process group -@upid.  WEXITED is always added
 * to @options.  Returns the result of do_wait(), -EINVAL for unknown
 * option bits, -ESRCH for INT_MIN, or -EFAULT if the status cannot be
 * stored to @stat_addr.
 */
long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
                  struct rusage *ru)
{
        struct wait_opts wo;
        struct pid *pid = NULL;
        enum pid_type type;
        long ret;

        if (options & ~(WNOHANG | WUNTRACED | WCONTINUED |
                        __WNOTHREAD | __WCLONE | __WALL))
                return -EINVAL;

        /* -INT_MIN is not defined */
        if (upid == INT_MIN)
                return -ESRCH;

        if (upid == -1) {
                type = PIDTYPE_MAX;
        } else if (upid > 0) {
                type = PIDTYPE_PID;
                pid = find_get_pid(upid);
        } else if (upid == 0) {
                type = PIDTYPE_PGID;
                pid = get_task_pid(current, PIDTYPE_PGID);
        } else {
                /* upid < -1: wait for the process group -upid */
                type = PIDTYPE_PGID;
                pid = find_get_pid(-upid);
        }

        wo.wo_type = type;
        wo.wo_pid = pid;
        wo.wo_flags = options | WEXITED;
        wo.wo_info = NULL;
        wo.wo_stat = 0;
        wo.wo_rusage = ru;

        ret = do_wait(&wo);
        put_pid(pid);

        /* Copy the status out only when a child was actually reported. */
        if (ret <= 0 || !stat_addr)
                return ret;
        if (put_user(wo.wo_stat, stat_addr))
                return -EFAULT;

        return ret;
}

/*
 * Wait for the exit of the task with pid number @pid.  *@stat is written
 * only when a child was reported and its wait status is nonzero.
 * Returns the result of do_wait().
 */
int kernel_wait(pid_t pid, int *stat)
{
        struct pid *target = find_get_pid(pid);
        struct wait_opts wo = {
                .wo_type        = PIDTYPE_PID,
                .wo_pid         = target,
                .wo_flags       = WEXITED,
        };
        int ret = do_wait(&wo);

        if (ret > 0 && wo.wo_stat)
                *stat = wo.wo_stat;

        put_pid(target);
        return ret;
}

/*
 * wait4(2) entry point: thin wrapper around kernel_wait4() that copies
 * the resource usage back to user space when a child was reported and
 * @ru is non-NULL.
 */
SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
                int, options, struct rusage __user *, ru)
{
        struct rusage r;
        struct rusage *kru = ru ? &r : NULL;
        long err = kernel_wait4(upid, stat_addr, options, kru);

        if (err <= 0)
                return err;

        if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
                return -EFAULT;

        return err;
}

#ifdef __ARCH_WANT_SYS_WAITPID

/*
 * sys_waitpid() is kept for compatibility only; libc is expected to
 * implement waitpid() on top of sys_wait4().  It is kernel_wait4() without
 * a resource-usage buffer.
 */
SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
{
        struct rusage *no_rusage = NULL;

        return kernel_wait4(pid, stat_addr, options, no_rusage);
}

#endif

#ifdef CONFIG_COMPAT
/*
 * Compat wait4(2): same as the native call, but the resource usage is
 * written back in the compat layout via put_compat_rusage().
 */
COMPAT_SYSCALL_DEFINE4(wait4,
        compat_pid_t, pid,
        compat_uint_t __user *, stat_addr,
        int, options,
        struct compat_rusage __user *, ru)
{
        struct rusage r;
        struct rusage *kru = ru ? &r : NULL;
        long err = kernel_wait4(pid, stat_addr, options, kru);

        if (err <= 0)
                return err;

        if (ru && put_compat_rusage(&r, ru))
                return -EFAULT;

        return err;
}

/*
 * Compat waitid(2).
 *
 * Mirrors the native syscall: a positive kernel_waitid() result becomes a
 * return of 0 with si_signo == SIGCHLD, and @infop (if non-NULL) is always
 * written.  The rusage is copied out either as a native struct (when
 * COMPAT_USE_64BIT_TIME is true) or converted with put_compat_rusage().
 */
COMPAT_SYSCALL_DEFINE5(waitid,
                int, which, compat_pid_t, pid,
                struct compat_siginfo __user *, infop, int, options,
                struct compat_rusage __user *, uru)
{
        struct rusage ru;
        /* Designated initialiser zeroes every field of info. */
        struct waitid_info info = {.status = 0};
        long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
        int signo = 0;
        if (err > 0) {
                signo = SIGCHLD;
                err = 0;
                if (uru) {
                        /* kernel_waitid() overwrites everything in ru */
                        if (COMPAT_USE_64BIT_TIME)
                                err = copy_to_user(uru, &ru, sizeof(ru));
                        else
                                err = put_compat_rusage(&ru, uru);
                        /* Any nonzero result from either copy is reported as -EFAULT. */
                        if (err)
                                return -EFAULT;
                }
        }

        if (!infop)
                return err;

        if (!user_write_access_begin(infop, sizeof(*infop)))
                return -EFAULT;

        /* Each unsafe_put_user() jumps to Efault on failure. */
        unsafe_put_user(signo, &infop->si_signo, Efault);
        unsafe_put_user(0, &infop->si_errno, Efault);
        unsafe_put_user(info.cause, &infop->si_code, Efault);
        unsafe_put_user(info.pid, &infop->si_pid, Efault);
        unsafe_put_user(info.uid, &infop->si_uid, Efault);
        unsafe_put_user(info.status, &infop->si_status, Efault);
        user_write_access_end();
        return err;
Efault:
        /* The user-access window must be closed on the error path too. */
        user_write_access_end();
        return -EFAULT;
}
#endif

/*
 * This needs to be __function_aligned as GCC implicitly makes any
 * implementation of abort() cold and drops alignment specified by
 * -falign-functions=N.
 *
 * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88345#c11
 */
__weak __function_aligned void abort(void)
{
        /* First attempt: raise a kernel BUG. */
        BUG();

        /* If BUG() did not stop us, fall back to panic(). */
        panic("Oops failed to kill thread");
}
EXPORT_SYMBOL(abort);









































































































  161 






















  246 


  246 











  189 

  166 






  246 









  166 




  190 
  190 


















  246 










    8 
    8 









    8 











    8 
































   24 





















   24 








   13 

   22 





   24 



   22 
















   13 
   13 


















   24 
   24 











































































  209 
  209 






























































































  208 
  209 




  209 






























































































































































































































































































































































































  208 











  209 

  209 





  209 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
// SPDX-License-Identifier: GPL-2.0-only
/*
 * jump label support
 *
 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
 * Copyright (C) 2011 Peter Zijlstra
 *
 */
#include <linux/memory.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/err.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/bug.h>
#include <linux/cpu.h>
#include <asm/sections.h>

/* mutex to protect coming/going of the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);

/* Take jump_label_mutex, which protects the jump_label table. */
void jump_label_lock(void)
{
        mutex_lock(&jump_label_mutex);
}

/* Release jump_label_mutex taken by jump_label_lock(). */
void jump_label_unlock(void)
{
        mutex_unlock(&jump_label_mutex);
}

/*
 * sort() comparator for jump entries: order by key first, then by code
 * address. Returns -1, 0 or 1.
 */
static int jump_label_cmp(const void *a, const void *b)
{
        const struct jump_entry *lhs = a;
        const struct jump_entry *rhs = b;

        /* Entries are sorted by key. */
        if (jump_entry_key(lhs) != jump_entry_key(rhs))
                return jump_entry_key(lhs) < jump_entry_key(rhs) ? -1 : 1;

        /*
         * In the batching mode, entries should also be sorted by the code
         * inside the already sorted list of entries, enabling a bsearch in
         * the vector.
         */
        if (jump_entry_code(lhs) != jump_entry_code(rhs))
                return jump_entry_code(lhs) < jump_entry_code(rhs) ? -1 : 1;

        return 0;
}

static void jump_label_swap(void *a, void *b, int size)
{
        long delta = (unsigned long)a - (unsigned long)b;
        struct jump_entry *jea = a;
        struct jump_entry *jeb = b;
        struct jump_entry tmp = *jea;

        jea->code        = jeb->code - delta;
        jea->target        = jeb->target - delta;
        jea->key        = jeb->key - delta;

        jeb->code        = tmp.code + delta;
        jeb->target        = tmp.target + delta;
        jeb->key        = tmp.key + delta;
}

/*
 * Sort the jump entries in [@start, @stop) with jump_label_cmp(). The
 * custom jump_label_swap() is only used when
 * CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE is enabled; otherwise sort() gets a
 * NULL swap callback.
 */
static void
jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
{
        void *swapfn = NULL;
        unsigned long nr_entries;

        nr_entries = ((unsigned long)stop - (unsigned long)start) /
                     sizeof(struct jump_entry);

        if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE))
                swapfn = jump_label_swap;

        sort(start, nr_entries, sizeof(struct jump_entry), jump_label_cmp,
             swapfn);
}

static void jump_label_update(struct static_key *key);

/*
 * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h.
 * The use of 'atomic_read()' requires atomic.h and it's problematic for some
 * kernel headers such as kernel.h and others. Since static_key_count() is not
 * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case it's ok
 * to have it be a function here. Similarly, for 'static_key_enable()' and
 * 'static_key_disable()', which require bug.h. This should allow jump_label.h
 * to be included from most/all places for CONFIG_JUMP_LABEL.
 */
int static_key_count(struct static_key *key)
{
        int enabled = atomic_read(&key->enabled);

        /*
         * -1 means the first static_key_slow_inc() is in progress.
         *  static_key_enabled() must return true, so report 1 for any
         *  negative value.
         */
        if (enabled < 0)
                return 1;

        return enabled;
}
EXPORT_SYMBOL_GPL(static_key_count);

/*
 * static_key_fast_inc_not_disabled - adds a user for a static key
 * @key: static key that must be already enabled
 *
 * The caller must make sure that the static key can't get disabled while
 * in this function. It doesn't patch jump labels, only adds a user to
 * an already enabled static key.
 *
 * Returns true if the increment was done. Unlike refcount_t the ref counter
 * is not saturated, but will fail to increment on overflow.
 */
bool static_key_fast_inc_not_disabled(struct static_key *key)
{
        int v;

        STATIC_KEY_CHECK_USE(key);
        /*
         * Negative key->enabled has a special meaning: it sends
         * static_key_slow_inc/dec() down the slow path, and it is non-zero
         * so it counts as "enabled" in jump_label_update().
         *
         * The INT_MAX overflow condition is either used by the networking
         * code to reset or detected in the slow path of
         * static_key_slow_inc_cpuslocked().
         */
        v = atomic_read(&key->enabled);
        /*
         * Lockless increment: give up when the key is disabled, in
         * transition (<= 0) or would overflow; otherwise try to install
         * v + 1 and re-run the checks whenever the cmpxchg fails.
         */
        do {
                if (v <= 0 || v == INT_MAX)
                        return false;
        } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));

        return true;
}
EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled);

/*
 * Add a user to @key, patching the jump labels when this is the first
 * user. The caller must hold the CPU hotplug read lock.
 *
 * Returns true if the reference was taken, false if the increment under
 * the mutex unexpectedly failed (a warning is emitted in that case).
 */
bool static_key_slow_inc_cpuslocked(struct static_key *key)
{
        lockdep_assert_cpus_held();

        /*
         * Careful if we get concurrent static_key_slow_inc/dec() calls;
         * later calls must wait for the first one to _finish_ the
         * jump_label_update() process.  At the same time, however,
         * the jump_label_update() call below wants to see
         * static_key_enabled(&key) for jumps to be updated properly.
         */
        if (static_key_fast_inc_not_disabled(key))
                return true;

        /* Slow path: serialized by jump_label_mutex until function exit. */
        guard(mutex)(&jump_label_mutex);
        /* Try to mark it as 'enabling in progress'. */
        if (!atomic_cmpxchg(&key->enabled, 0, -1)) {
                jump_label_update(key);
                /*
                 * Ensure that when static_key_fast_inc_not_disabled() or
                 * static_key_dec_not_one() observe the positive value,
                 * they must also observe all the text changes.
                 */
                atomic_set_release(&key->enabled, 1);
        } else {
                /*
                 * While holding the mutex this should never observe
                 * anything else than a value >= 1 and succeed
                 */
                if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key)))
                        return false;
        }
        return true;
}

/*
 * Wrapper around static_key_slow_inc_cpuslocked() that takes and drops the
 * CPU hotplug read lock around the call. Returns that helper's result.
 */
bool static_key_slow_inc(struct static_key *key)
{
        bool incremented;

        cpus_read_lock();
        incremented = static_key_slow_inc_cpuslocked(key);
        cpus_read_unlock();

        return incremented;
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);

/*
 * Switch @key to the enabled state (count of exactly 1) if it is currently
 * disabled. The caller must hold the CPU hotplug read lock.
 */
void static_key_enable_cpuslocked(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        lockdep_assert_cpus_held();

        /* Already enabled: nothing to do, but a count other than 1 is odd. */
        if (atomic_read(&key->enabled) > 0) {
                WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
                return;
        }

        jump_label_lock();
        /* Re-check under the mutex before patching. */
        if (atomic_read(&key->enabled) == 0) {
                /* -1 marks the enable as in progress while text is patched. */
                atomic_set(&key->enabled, -1);
                jump_label_update(key);
                /*
                 * See static_key_slow_inc().
                 */
                atomic_set_release(&key->enabled, 1);
        }
        jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked);

/*
 * Same as static_key_enable_cpuslocked(), but takes the CPU hotplug read
 * lock itself.
 */
void static_key_enable(struct static_key *key)
{
        cpus_read_lock();
        static_key_enable_cpuslocked(key);
        cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(static_key_enable);

/*
 * Switch @key from a count of exactly 1 to disabled (0) and patch the jump
 * labels accordingly. The caller must hold the CPU hotplug read lock.
 */
void static_key_disable_cpuslocked(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        lockdep_assert_cpus_held();

        /* Only a count of 1 is disabled here; anything but 0 or 1 warns. */
        if (atomic_read(&key->enabled) != 1) {
                WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
                return;
        }

        jump_label_lock();
        /* Patch only if this call performed the 1 -> 0 transition. */
        if (atomic_cmpxchg(&key->enabled, 1, 0) == 1)
                jump_label_update(key);
        jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked);

/*
 * Same as static_key_disable_cpuslocked(), but takes the CPU hotplug read
 * lock itself.
 */
void static_key_disable(struct static_key *key)
{
        cpus_read_lock();
        static_key_disable_cpuslocked(key);
        cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(static_key_disable);

/*
 * Lockless decrement of @key's user count as long as the count stays
 * above zero afterwards.
 *
 * Returns true when the caller has nothing left to do (the decrement was
 * done, or an underflow was detected and warned about), false when the
 * count is 1 or negative and the slow path must handle it.
 */
static bool static_key_dec_not_one(struct static_key *key)
{
        int v;

        /*
         * Go into the slow path if key::enabled is less than or equal to
         * one. One is valid to shut down the key, anything less than one
         * is an imbalance, which is handled at the call site.
         *
         * That includes the special case of '-1' which is set in
         * static_key_slow_inc_cpuslocked(), but that's harmless as it is
         * fully serialized in the slow path below. By the time this task
         * acquires the jump label lock the value is back to one and the
         * retry under the lock must succeed.
         */
        v = atomic_read(&key->enabled);
        do {
                /*
                 * Warn about the '-1' case though; since that means a
                 * decrement is concurrent with a first (0->1) increment. IOW
                 * people are trying to disable something that wasn't yet fully
                 * enabled. This suggests an ordering problem on the user side.
                 */
                WARN_ON_ONCE(v < 0);

                /*
                 * Warn about underflow, and lie about success in an attempt to
                 * not make things worse.
                 */
                if (WARN_ON_ONCE(v == 0))
                        return true;

                if (v <= 1)
                        return false;
        } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1)));

        return true;
}

/*
 * Drop one user of @key; when the last user goes away the jump labels are
 * patched back. The caller must hold the CPU hotplug read lock.
 */
static void __static_key_slow_dec_cpuslocked(struct static_key *key)
{
        int enabled;

        lockdep_assert_cpus_held();

        /* Fast path: the count stays above zero, no patching required. */
        if (static_key_dec_not_one(key))
                return;

        guard(mutex)(&jump_label_mutex);
        enabled = atomic_read(&key->enabled);

        /*
         * It should be impossible to observe -1 with jump_label_mutex held,
         * see static_key_slow_inc_cpuslocked().
         */
        if (WARN_ON_ONCE(enabled == -1))
                return;

        /*
         * Cannot already be 0, something went sideways.
         */
        if (WARN_ON_ONCE(enabled == 0))
                return;

        /* Only the decrement that reaches zero triggers the text update. */
        if (!atomic_dec_and_test(&key->enabled))
                return;

        jump_label_update(key);
}

/*
 * Same as __static_key_slow_dec_cpuslocked(), but takes the CPU hotplug
 * read lock itself.
 */
static void __static_key_slow_dec(struct static_key *key)
{
        cpus_read_lock();
        __static_key_slow_dec_cpuslocked(key);
        cpus_read_unlock();
}

/*
 * Delayed-work callback installed by jump_label_rate_limit(): performs the
 * slow-path decrement on the static key embedding @work.
 */
void jump_label_update_timeout(struct work_struct *work)
{
        struct static_key_deferred *deferred;

        deferred = container_of(work, struct static_key_deferred, work.work);
        __static_key_slow_dec(&deferred->key);
}
EXPORT_SYMBOL_GPL(jump_label_update_timeout);

/*
 * Drop one user of @key, taking the CPU hotplug read lock internally (via
 * __static_key_slow_dec()).
 */
void static_key_slow_dec(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        __static_key_slow_dec(key);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);

/*
 * Drop one user of @key; variant for callers that already hold the CPU
 * hotplug read lock (asserted in __static_key_slow_dec_cpuslocked()).
 */
void static_key_slow_dec_cpuslocked(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        __static_key_slow_dec_cpuslocked(key);
}

/*
 * Drop one user of @key. When the lockless decrement cannot be done
 * (static_key_dec_not_one() returns false), the slow-path decrement is
 * deferred by scheduling @work after @timeout.
 */
void __static_key_slow_dec_deferred(struct static_key *key,
                                    struct delayed_work *work,
                                    unsigned long timeout)
{
        STATIC_KEY_CHECK_USE(key);

        if (!static_key_dec_not_one(key))
                schedule_delayed_work(work, timeout);
}
EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred);

/* Flush the delayed work @work associated with the deferred key @key. */
void __static_key_deferred_flush(void *key, struct delayed_work *work)
{
        STATIC_KEY_CHECK_USE(key);
        flush_delayed_work(work);
}
EXPORT_SYMBOL_GPL(__static_key_deferred_flush);

/*
 * Set up @key for deferred decrements: record @rl as its timeout and
 * initialize its delayed work to run jump_label_update_timeout().
 */
void jump_label_rate_limit(struct static_key_deferred *key,
                unsigned long rl)
{
        STATIC_KEY_CHECK_USE(key);
        key->timeout = rl;
        INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
EXPORT_SYMBOL_GPL(jump_label_rate_limit);

/*
 * Check whether the patch site of @entry overlaps the range from @start to
 * @end. Returns 1 on overlap, 0 otherwise.
 */
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
        unsigned long code = jump_entry_code(entry);

        /* The patch site begins after the range: no overlap. */
        if (code > (unsigned long)end)
                return 0;

        /* The patch site ends at or before the start of the range. */
        if (code + jump_entry_size(entry) <= (unsigned long)start)
                return 0;

        return 1;
}

/*
 * Scan the jump entries in [@iter_start, @iter_stop) for one whose patch
 * site conflicts with the range from @start to @end. Entries flagged as
 * init are only considered when @init is true.
 *
 * Returns 1 if a conflicting entry exists, 0 otherwise.
 */
static int __jump_label_text_reserved(struct jump_entry *iter_start,
                struct jump_entry *iter_stop, void *start, void *end, bool init)
{
        struct jump_entry *iter;

        for (iter = iter_start; iter < iter_stop; iter++) {
                if (!init && jump_entry_is_init(iter))
                        continue;

                if (addr_conflict(iter, start, end))
                        return 1;
        }

        return 0;
}

#ifndef arch_jump_label_transform_static
/*
 * Default no-op used when the architecture does not provide its own
 * arch_jump_label_transform_static().
 */
static void arch_jump_label_transform_static(struct jump_entry *entry,
                                             enum jump_label_type type)
{
        /* nothing to do on most architectures */
}
#endif

/*
 * Return the jump entry table pointer stored in @key with the type bits
 * masked off. Warns if the key is in linked mode.
 */
static inline struct jump_entry *static_key_entries(struct static_key *key)
{
        unsigned long ptr_bits;

        WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED);

        ptr_bits = key->type & ~JUMP_TYPE_MASK;
        return (struct jump_entry *)ptr_bits;
}

/* Report whether the JUMP_TYPE_TRUE bit is set in @key's type word. */
static inline bool static_key_type(struct static_key *key)
{
        return (key->type & JUMP_TYPE_TRUE) != 0;
}

/* Report whether the JUMP_TYPE_LINKED bit is set in @key's type word. */
static inline bool static_key_linked(struct static_key *key)
{
        return (key->type & JUMP_TYPE_LINKED) != 0;
}

/* Clear the JUMP_TYPE_LINKED bit in @key's type word. */
static inline void static_key_clear_linked(struct static_key *key)
{
        key->type = key->type & ~JUMP_TYPE_LINKED;
}

/* Set the JUMP_TYPE_LINKED bit in @key's type word. */
static inline void static_key_set_linked(struct static_key *key)
{
        key->type = key->type | JUMP_TYPE_LINKED;
}

/***
 * A 'struct static_key' uses a union such that it either points directly
 * to a table of 'struct jump_entry' or to a linked list of modules which in
 * turn point to 'struct jump_entry' tables.
 *
 * The two lower bits of the pointer are used to keep track of which pointer
 * type is in use and to store the initial branch direction, we use an access
 * function which preserves these bits.
 */
static void static_key_set_entries(struct static_key *key,
                                   struct jump_entry *entries)
{
        unsigned long type;

        /* The low bits of the pointer must be free to hold the type bits. */
        WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK);
        /* Save the type bits, store the pointer, then put the bits back. */
        type = key->type & JUMP_TYPE_MASK;
        key->entries = entries;
        key->type |= type;
}

/*
 * Compute the jump label type @entry should currently have, from the
 * enabled state of its key and the entry's branch flag.
 */
static enum jump_label_type jump_label_type(struct jump_entry *entry)
{
        struct static_key *key = jump_entry_key(entry);
        bool key_on = static_key_enabled(key);
        bool is_branch = jump_entry_is_branch(entry);

        /*
         * See the comment in linux/jump_label.h; for two booleans
         * "differ" is the same as XOR.
         */
        return key_on != is_branch;
}

/*
 * Decide whether the patch site of @entry may be modified. @init tells
 * whether entries flagged as init may still be patched.
 */
static bool jump_label_can_update(struct jump_entry *entry, bool init)
{
        /*
         * Cannot update code that was in an init text area.
         */
        if (!init && jump_entry_is_init(entry))
                return false;

        if (kernel_text_address(jump_entry_code(entry)))
                return true;

        /*
         * This skips patching built-in __exit, which
         * is part of init_section_contains() but is
         * not part of kernel_text_address().
         *
         * Skipping built-in __exit is fine since it
         * will never be executed.
         */
        WARN_ONCE(!jump_entry_is_init(entry),
                  "can't patch jump_label at %pS",
                  (void *)jump_entry_code(entry));

        return false;
}

#ifndef HAVE_JUMP_LABEL_BATCH
/*
 * Patch every updatable jump entry of @key, starting at @entry and
 * stopping at @stop or at the first entry belonging to another key.
 * This variant transforms the entries one at a time.
 */
static void __jump_label_update(struct static_key *key,
                                struct jump_entry *entry,
                                struct jump_entry *stop,
                                bool init)
{
        for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
                if (jump_label_can_update(entry, init))
                        arch_jump_label_transform(entry, jump_label_type(entry));
        }
}
#else
/*
 * Patch every updatable jump entry of @key, starting at @entry and
 * stopping at @stop or at the first entry belonging to another key.
 * This variant queues the transformations and applies them in batches.
 */
static void __jump_label_update(struct static_key *key,
                                struct jump_entry *entry,
                                struct jump_entry *stop,
                                bool init)
{
        for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {

                if (!jump_label_can_update(entry, init))
                        continue;

                if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) {
                        /*
                         * Queue is full: Apply the current queue and try again.
                         */
                        arch_jump_label_transform_apply();
                        BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry)));
                }
        }
        /* Apply whatever is still queued. */
        arch_jump_label_transform_apply();
}
#endif

/*
 * One-time setup of the built-in jump table: sort it, record the init
 * flag of every entry and point each key at its first entry. Does nothing
 * once static_key_initialized is set.
 */
void __init jump_label_init(void)
{
        struct jump_entry *iter_start = __start___jump_table;
        struct jump_entry *iter_stop = __stop___jump_table;
        struct static_key *key = NULL;
        struct jump_entry *iter;

        /*
         * Since we are initializing the static_key.enabled field
         * with the 'raw' int values (to avoid pulling in atomic.h) in
         * jump_label.h, let's make sure that is safe. There are only two
         * cases to check since we initialize to 0 or 1.
         */
        BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0);
        BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1);

        if (static_key_initialized)
                return;

        cpus_read_lock();
        jump_label_lock();
        jump_label_sort_entries(iter_start, iter_stop);

        for (iter = iter_start; iter < iter_stop; iter++) {
                struct static_key *iterk;
                bool in_init;

                /* rewrite NOPs */
                if (jump_label_type(iter) == JUMP_LABEL_NOP)
                        arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);

                /* Remember whether the patch site lives in an init section. */
                in_init = init_section_contains((void *)jump_entry_code(iter), 1);
                jump_entry_set_init(iter, in_init);

                /*
                 * The table is sorted by key, so only the first entry of
                 * each run is recorded in the key.
                 */
                iterk = jump_entry_key(iter);
                if (iterk == key)
                        continue;

                key = iterk;
                static_key_set_entries(key, iter);
        }
        static_key_initialized = true;
        jump_label_unlock();
        cpus_read_unlock();
}

static inline bool static_key_sealed(struct static_key *key)
{
        return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK);
}

/*
 * Seal @key: keep only its JUMP_TYPE_TRUE bit, set JUMP_TYPE_LINKED and
 * drop the stored pointer bits.
 */
static inline void static_key_seal(struct static_key *key)
{
        key->type = JUMP_TYPE_LINKED | (key->type & JUMP_TYPE_TRUE);
}

/*
 * Seal every key of the built-in jump table that lives in ro-after-init
 * memory and is not sealed yet. Warns and returns early if
 * jump_label_init() has not run.
 */
void jump_label_init_ro(void)
{
        struct jump_entry *const first = __start___jump_table;
        struct jump_entry *const last = __stop___jump_table;
        struct jump_entry *iter;

        if (WARN_ON_ONCE(!static_key_initialized))
                return;

        cpus_read_lock();
        jump_label_lock();

        for (iter = first; iter < last; iter++) {
                struct static_key *key = jump_entry_key(iter);

                if (is_kernel_ro_after_init((unsigned long)key) &&
                    !static_key_sealed(key))
                        static_key_seal(key);
        }

        jump_label_unlock();
        cpus_read_unlock();
}

#ifdef CONFIG_MODULES

/*
 * Compute the jump label type @entry has in its initial state, from the
 * initial type bit of its key and the entry's branch flag.
 */
enum jump_label_type jump_label_init_type(struct jump_entry *entry)
{
        struct static_key *key = jump_entry_key(entry);
        bool initially_true = static_key_type(key);
        bool is_branch = jump_entry_is_branch(entry);

        /*
         * See the comment in linux/jump_label.h; for two booleans
         * "differ" is the same as XOR.
         */
        return initially_true != is_branch;
}

/*
 * Node of the per-key list used when a static key is in linked mode; each
 * node describes the jump entries that one module (or the core kernel) has
 * for that key.
 */
struct static_key_mod {
        /* next node in the key's list, NULL at the end */
        struct static_key_mod *next;
        /* first jump entry for the key; may be NULL if there are none */
        struct jump_entry *entries;
        /* module the entries belong to; NULL selects the built-in table */
        struct module *mod;
};

/*
 * Return the head of @key's static_key_mod list with the type bits masked
 * off. Warns if the key is not in linked mode.
 */
static inline struct static_key_mod *static_key_mod(struct static_key *key)
{
        unsigned long ptr_bits;

        WARN_ON_ONCE(!static_key_linked(key));

        ptr_bits = key->type & ~JUMP_TYPE_MASK;
        return (struct static_key_mod *)ptr_bits;
}

/***
 * key->type and key->next are the same via union.
 * This sets key->next and preserves the type bits.
 *
 * See additional comments above static_key_set_entries().
 */
static void static_key_set_mod(struct static_key *key,
                               struct static_key_mod *mod)
{
        unsigned long type;

        /* The low bits of the pointer must be free to hold the type bits. */
        WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK);
        /* Save the type bits, store the pointer, then put the bits back. */
        type = key->type & JUMP_TYPE_MASK;
        key->next = mod;
        key->type |= type;
}

/*
 * Module part of jump_label_text_reserved(): look up the module whose text
 * contains @start and check its jump entries against the range.
 *
 * Returns 1 on a conflict, 0 otherwise (including when no module could be
 * found or pinned).
 */
static int __jump_label_mod_text_reserved(void *start, void *end)
{
        struct module *mod;
        int ret;

        /* Look up the module under RCU and take a reference on it. */
        scoped_guard(rcu) {
                mod = __module_text_address((unsigned long)start);
                /* Both ends of the range are expected in the same module. */
                WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
                if (!try_module_get(mod))
                        mod = NULL;
        }
        if (!mod)
                return 0;

        /* Init entries are only considered while the module is coming up. */
        ret = __jump_label_text_reserved(mod->jump_entries,
                                mod->jump_entries + mod->num_jump_entries,
                                start, end, mod->state == MODULE_STATE_COMING);

        module_put(mod);

        return ret;
}

static void __jump_label_mod_update(struct static_key *key)
{
        struct static_key_mod *mod;

        for (mod = static_key_mod(key); mod; mod = mod->next) {
                struct jump_entry *stop;
                struct module *m;

                /*
                 * NULL if the static_key is defined in a module
                 * that does not use it
                 */
                if (!mod->entries)
                        continue;

                m = mod->mod;
                if (!m)
                        stop = __stop___jump_table;
                else
                        stop = m->jump_entries + m->num_jump_entries;
                __jump_label_update(key, mod->entries, stop,
                                    m && m->state == MODULE_STATE_COMING);
        }
}

/*
 * Register the jump entries of @mod: sort them, record each entry's init
 * flag, and attach the module's entries to every key they reference.
 *
 * Returns 0 on success or -ENOMEM if a list node cannot be allocated.
 */
static int jump_label_add_module(struct module *mod)
{
        struct jump_entry *iter_start = mod->jump_entries;
        struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
        struct jump_entry *iter;
        struct static_key *key = NULL;
        struct static_key_mod *jlm, *jlm2;

        /* if the module doesn't have jump label entries, just return */
        if (iter_start == iter_stop)
                return 0;

        jump_label_sort_entries(iter_start, iter_stop);

        for (iter = iter_start; iter < iter_stop; iter++) {
                struct static_key *iterk;
                bool in_init;

                in_init = within_module_init(jump_entry_code(iter), mod);
                jump_entry_set_init(iter, in_init);

                /* Entries are sorted by key: handle each key once. */
                iterk = jump_entry_key(iter);
                if (iterk == key)
                        continue;

                key = iterk;
                /* Key defined by this module: point it at its entries. */
                if (within_module((unsigned long)key, mod)) {
                        static_key_set_entries(key, iter);
                        continue;
                }

                /*
                 * If the key was sealed at init, then there's no need to keep a
                 * reference to its module entries - just patch them now and be
                 * done with it.
                 */
                if (static_key_sealed(key))
                        goto do_poke;

                jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL);
                if (!jlm)
                        return -ENOMEM;
                /*
                 * First foreign user of this key: move the key's current
                 * entries into a list node (jlm2) and switch the key to
                 * linked mode.
                 */
                if (!static_key_linked(key)) {
                        jlm2 = kzalloc(sizeof(struct static_key_mod),
                                       GFP_KERNEL);
                        if (!jlm2) {
                                kfree(jlm);
                                return -ENOMEM;
                        }
                        scoped_guard(rcu)
                                jlm2->mod = __module_address((unsigned long)key);

                        jlm2->entries = static_key_entries(key);
                        jlm2->next = NULL;
                        static_key_set_mod(key, jlm2);
                        static_key_set_linked(key);
                }
                /* Insert this module's node at the head of the key's list. */
                jlm->mod = mod;
                jlm->entries = iter;
                jlm->next = static_key_mod(key);
                static_key_set_mod(key, jlm);
                static_key_set_linked(key);

                /* Only update if we've changed from our initial state */
do_poke:
                if (jump_label_type(iter) != jump_label_init_type(iter))
                        __jump_label_update(key, iter, iter_stop, true);
        }

        return 0;
}

/*
 * Undo jump_label_add_module() for @mod: remove the module's node from the
 * list of every foreign key it references and free it.
 */
static void jump_label_del_module(struct module *mod)
{
        struct jump_entry *iter_start = mod->jump_entries;
        struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
        struct jump_entry *iter;
        struct static_key *key = NULL;
        struct static_key_mod *jlm, **prev;

        for (iter = iter_start; iter < iter_stop; iter++) {
                /* Handle each key once. */
                if (jump_entry_key(iter) == key)
                        continue;

                key = jump_entry_key(iter);

                /* Keys defined by the module itself have no list node. */
                if (within_module((unsigned long)key, mod))
                        continue;

                /* No @jlm allocated because key was sealed at init. */
                if (static_key_sealed(key))
                        continue;

                /* No memory during module load */
                if (WARN_ON(!static_key_linked(key)))
                        continue;

                /* Find this module's node, tracking the link that points to it. */
                prev = &key->next;
                jlm = static_key_mod(key);

                while (jlm && jlm->mod != mod) {
                        prev = &jlm->next;
                        jlm = jlm->next;
                }

                /* No memory during module load */
                if (WARN_ON(!jlm))
                        continue;

                /* Unlinking the head goes through the type-preserving setter. */
                if (prev == &key->next)
                        static_key_set_mod(key, jlm->next);
                else
                        *prev = jlm->next;

                kfree(jlm);

                jlm = static_key_mod(key);
                /* if only one entry is left, fold it back into the static_key */
                if (jlm->next == NULL) {
                        static_key_set_entries(key, jlm->entries);
                        static_key_clear_linked(key);
                        kfree(jlm);
                }
        }
}

/*
 * Module notifier callback: add the jump entries of a module that is
 * coming up and remove those of a module that is going away. Other module
 * states are ignored.
 *
 * Returns the notifier code derived from the add result (0 or -ENOMEM).
 */
static int
jump_label_module_notify(struct notifier_block *self, unsigned long val,
                         void *data)
{
        struct module *mod = data;
        int ret = 0;

        cpus_read_lock();
        jump_label_lock();

        switch (val) {
        case MODULE_STATE_COMING:
                ret = jump_label_add_module(mod);
                if (ret) {
                        WARN(1, "Failed to allocate memory: jump_label may not work properly.\n");
                        /* Roll back whatever was registered before the failure. */
                        jump_label_del_module(mod);
                }
                break;
        case MODULE_STATE_GOING:
                jump_label_del_module(mod);
                break;
        }

        jump_label_unlock();
        cpus_read_unlock();

        return notifier_from_errno(ret);
}

/*
 * Notifier block that routes module state changes to
 * jump_label_module_notify().
 */
static struct notifier_block jump_label_module_nb = {
        .notifier_call = jump_label_module_notify,
        .priority = 1, /* higher than tracepoints */
};

/*
 * Early initcall: hook jump_label_module_nb into the module notifier
 * chain and hand back whatever register_module_notifier() reports.
 */
static __init int jump_label_init_module(void)
{
        int err = register_module_notifier(&jump_label_module_nb);

        return err;
}
early_initcall(jump_label_init_module);

#endif /* CONFIG_MODULES */

/**
 * jump_label_text_reserved - check if addr range is reserved
 * @start: start text addr
 * @end: end text addr
 *
 * Checks whether the text range between @start and @end overlaps any
 * jump label patch address. The built-in jump table is consulted first;
 * with CONFIG_MODULES the module jump tables are checked only when the
 * built-in table reported no overlap. Code that wants to modify kernel
 * text should first verify that it does not overlap with any of the
 * jump label addresses.
 *
 * Caller must hold jump_label_mutex.
 *
 * Return: 1 if there is an overlap, 0 otherwise.
 */
int jump_label_text_reserved(void *start, void *end)
{
        const bool booting = system_state < SYSTEM_RUNNING;
        int overlap;

        overlap = __jump_label_text_reserved(__start___jump_table,
                                             __stop___jump_table,
                                             start, end, booting);
#ifdef CONFIG_MODULES
        if (!overlap)
                overlap = __jump_label_mod_text_reserved(start, end);
#endif
        return overlap;
}

/*
 * Re-patch every jump entry that belongs to @key.
 *
 * With CONFIG_MODULES a linked key is handed to __jump_label_mod_update()
 * and nothing else is done here. Otherwise the entry range defaults to the
 * built-in table end; if @key itself lives inside a module, the range end
 * and the "init" flag are taken from that module instead (looked up under
 * the RCU guard).
 */
static void jump_label_update(struct static_key *key)
{
        struct jump_entry *table_end = __stop___jump_table;
        bool in_init = system_state < SYSTEM_RUNNING;
        struct jump_entry *first;
#ifdef CONFIG_MODULES
        struct module *owner;

        if (static_key_linked(key)) {
                __jump_label_mod_update(key);
                return;
        }

        scoped_guard(rcu) {
                owner = __module_address((unsigned long)key);
                if (owner) {
                        table_end = owner->jump_entries + owner->num_jump_entries;
                        in_init = owner->state == MODULE_STATE_COMING;
                }
        }
#endif
        first = static_key_entries(key);
        /* A key without users has no entries; nothing to patch then. */
        if (!first)
                return;

        __jump_label_update(key, first, table_end, in_init);
}

#ifdef CONFIG_STATIC_KEYS_SELFTEST
/* One key of each default polarity, exercised by jump_label_test(). */
static DEFINE_STATIC_KEY_TRUE(sk_true);
static DEFINE_STATIC_KEY_FALSE(sk_false);

/*
 * Boot-time self-test for static keys.
 *
 * Each of the two iterations checks the initial state of both keys
 * through static_key_enabled() and through the likely/unlikely branch
 * helpers, flips both keys, repeats the checks with inverted
 * expectations, and finally flips the keys back so the next iteration
 * (and the end of the test) starts from the declared defaults.
 * Any mismatch triggers WARN_ON(). Always returns 0.
 */
static __init int jump_label_test(void)
{
        int i;

        for (i = 0; i < 2; i++) {
                /* Default state: sk_true enabled, sk_false disabled. */
                WARN_ON(static_key_enabled(&sk_true.key) != true);
                WARN_ON(static_key_enabled(&sk_false.key) != false);

                WARN_ON(!static_branch_likely(&sk_true));
                WARN_ON(!static_branch_unlikely(&sk_true));
                WARN_ON(static_branch_likely(&sk_false));
                WARN_ON(static_branch_unlikely(&sk_false));

                /* Invert both keys. */
                static_branch_disable(&sk_true);
                static_branch_enable(&sk_false);

                WARN_ON(static_key_enabled(&sk_true.key) == true);
                WARN_ON(static_key_enabled(&sk_false.key) == false);

                WARN_ON(static_branch_likely(&sk_true));
                WARN_ON(static_branch_unlikely(&sk_true));
                WARN_ON(!static_branch_likely(&sk_false));
                WARN_ON(!static_branch_unlikely(&sk_false));

                /* Restore the defaults. */
                static_branch_enable(&sk_true);
                static_branch_disable(&sk_false);
        }

        return 0;
}
early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */





































































































































































































    9 














































































































































    9 



























    9 



















































    9 









    9 
    9 
    9 

    9 
    9 




































































































































































































    9 



















    9 








    9 


    9 














    9 

    9 











































































































  208 







  209 







  208 

  209 







  208 


  209 






  208 



  209 










































  209 







  209 



















































  208 












































































































































  208 


  208 















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic helpers for smp ipi calls
 *
 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/hypervisor.h>
#include <linux/sched/clock.h>
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
#include <linux/string_choices.h>

#include <trace/events/ipi.h>
#define CREATE_TRACE_POINTS
#include <trace/events/csd.h>
#undef CREATE_TRACE_POINTS

#include "smpboot.h"
#include "sched/smp.h"

/* Extract the type bits (CSD_FLAG_TYPE_MASK) from a CSD's node.u_flags. */
#define CSD_TYPE(_csd)        ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK)

/*
 * Per-CPU bookkeeping for cross-CPU function calls. All three members are
 * allocated in smpcfd_prepare_cpu() and released in smpcfd_dead_cpu().
 */
struct call_function_data {
        /* Per-CPU array of call_single_data_t (from alloc_percpu()). */
        call_single_data_t        __percpu *csd;
        /* Zero-initialised CPU mask; users are outside this part of the file. */
        cpumask_var_t                cpumask;
        /* Second zero-initialised mask; NOTE(review): presumably the IPI target set - confirm. */
        cpumask_var_t                cpumask_ipi;
};

/* Per-CPU call_function_data, set up by smpcfd_prepare_cpu(). */
static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);

/* Per-CPU lockless list of pending call-function entries for that CPU. */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);

/*
 * Per-CPU backtrace gate: starts at 1, is consumed (1 -> 0) by a waiter
 * before it dumps the CPU's task, and is re-armed to 1 each time the CPU
 * flushes its call-function queue.
 */
static DEFINE_PER_CPU(atomic_t, trigger_backtrace) = ATOMIC_INIT(1);

/* Defined later in this file; needed by smpcfd_dying_cpu(). */
static void __flush_smp_call_function_queue(bool warn_cpu_offline);

/*
 * Allocate the call_function_data resources of @cpu: two zeroed cpumasks
 * on the CPU's memory node and a per-CPU array of call_single_data_t.
 *
 * Returns 0 on success or -ENOMEM if any allocation fails, in which case
 * everything allocated so far is released again.
 */
int smpcfd_prepare_cpu(unsigned int cpu)
{
        struct call_function_data *cfd = &per_cpu(cfd_data, cpu);

        if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                goto err;

        if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
                                     cpu_to_node(cpu)))
                goto err_free_cpumask;

        cfd->csd = alloc_percpu(call_single_data_t);
        if (!cfd->csd)
                goto err_free_cpumask_ipi;

        return 0;

err_free_cpumask_ipi:
        free_cpumask_var(cfd->cpumask_ipi);
err_free_cpumask:
        free_cpumask_var(cfd->cpumask);
err:
        return -ENOMEM;
}

/*
 * Release everything smpcfd_prepare_cpu() allocated for @cpu:
 * the per-CPU csd array and both cpumasks. Always returns 0.
 */
int smpcfd_dead_cpu(unsigned int cpu)
{
        struct call_function_data *dead = &per_cpu(cfd_data, cpu);

        free_percpu(dead->csd);
        free_cpumask_var(dead->cpumask_ipi);
        free_cpumask_var(dead->cpumask);

        return 0;
}

/*
 * Drain pending cross-CPU work on a CPU that is going down.
 *
 * IPIs for callbacks queued by other CPUs may still be in flight, either
 * because of hardware latency or because this CPU already had interrupts
 * disabled (inside stop-machine) when they were sent. Rather than waiting
 * for them, run the queued callbacks and pending irq_work right here so
 * the outgoing CPU does not go offline with work left behind.
 *
 * Always returns 0.
 */
int smpcfd_dying_cpu(unsigned int cpu)
{
        /* false: do not warn about queued callbacks on a CPU being offlined. */
        __flush_smp_call_function_queue(false);
        irq_work_run();

        return 0;
}

/*
 * Boot-time setup: initialise the call_single_queue list head of every
 * possible CPU, then allocate the call_function_data of the CPU running
 * this code. The result of that allocation is not checked.
 */
void __init call_function_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct llist_head *queue = &per_cpu(call_single_queue, cpu);

                init_llist_head(queue);
        }

        smpcfd_prepare_cpu(smp_processor_id());
}

/*
 * Send the call-function-single IPI to @cpu, but only if
 * call_function_single_prep_ipi() says an IPI is actually needed.
 * The send is traced with the caller's return address.
 */
static __always_inline void
send_call_function_single_ipi(int cpu)
{
        if (!call_function_single_prep_ipi(cpu))
                return;

        trace_ipi_send_cpu(cpu, _RET_IP_,
                           generic_smp_call_function_single_interrupt);
        arch_send_call_function_single_ipi(cpu);
}

/*
 * Trace and send the call-function IPI to every CPU in @mask.
 * Unlike send_call_function_single_ipi() no per-CPU "is an IPI needed"
 * check is made here.
 */
static __always_inline void
send_call_function_ipi_mask(struct cpumask *mask)
{
        trace_ipi_send_cpumask(mask, _RET_IP_,
                               generic_smp_call_function_single_interrupt);
        arch_send_call_function_ipi_mask(mask);
}

/*
 * Invoke @func(@info), bracketed by the csd function entry/exit
 * tracepoints. @csd is only passed to the tracepoints and may be NULL
 * (generic_exec_single() does so for the local-CPU case).
 */
static __always_inline void
csd_do_func(smp_call_func_t func, void *info, call_single_data_t *csd)
{
        trace_csd_function_entry(func, csd);
        func(info);
        trace_csd_function_exit(func, csd);
}

#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG

/*
 * Static key gating the CSD-lock debug paths; its initial state is chosen
 * by CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT and can be overridden with the
 * csdlock_debug= boot parameter.
 */
static DEFINE_STATIC_KEY_MAYBE(CONFIG_CSD_LOCK_WAIT_DEBUG_DEFAULT, csdlock_debug_enabled);

/*
 * Parse the csdlock_debug= kernel boot parameter.
 *
 * A non-zero integer enables CSD-lock debugging, zero disables it. If no
 * integer can be parsed from @str the static key is left untouched.
 * Always returns 1.
 *
 * If you need to restore the old "ext" value that once provided
 * additional debugging information, reapply the following commits:
 *
 * de7b09ef658d ("locking/csd_lock: Prepare more CSD lock debugging")
 * a5aabace5fb8 ("locking/csd_lock: Add more data to CSD lock debugging")
 */
static int __init csdlock_debug(char *str)
{
        /* get_option() stores through an int *, so this must be a plain int. */
        int val = 0;

        if (get_option(&str, &val)) {
                if (val)
                        static_branch_enable(&csdlock_debug_enabled);
                else
                        static_branch_disable(&csdlock_debug_enabled);
        }

        return 1;
}
__setup("csdlock_debug=", csdlock_debug);

/*
 * Per-CPU record of the CSD currently being handled on that CPU, plus a
 * snapshot of its function and argument. Written by __csd_lock_record(),
 * read by csd_lock_wait_toolong() when reporting a stuck lock.
 */
static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);

/* Setting csd_lock_timeout to 0 suppresses the stuck-lock reports. */
static ulong csd_lock_timeout = 5000;  /* CSD lock timeout in milliseconds. */
module_param(csd_lock_timeout, ulong, 0644);
/* Only values > 0 arm the BUG_ON() in csd_lock_wait_toolong(). */
static int panic_on_ipistall;  /* CSD panic timeout in milliseconds, 300000 for five minutes. */
module_param(panic_on_ipistall, int, 0644);

/* Source of the "#N" identifiers printed in stuck-CSD-lock messages. */
static atomic_t csd_bug_count = ATOMIC_INIT(0);

/*
 * Record current CSD work for current CPU, NULL to erase.
 *
 * For a non-NULL @csd the function and argument are stored first and the
 * csd pointer last, separated by a write barrier, so that a remote reader
 * which observes cur_csd (csd_lock_wait_toolong() uses an acquire load)
 * also sees the matching cur_csd_func/cur_csd_info.
 */
static void __csd_lock_record(call_single_data_t *csd)
{
        if (!csd) {
                smp_mb(); /* NULL cur_csd after unlock. */
                __this_cpu_write(cur_csd, NULL);
                return;
        }
        __this_cpu_write(cur_csd_func, csd->func);
        __this_cpu_write(cur_csd_info, csd->info);
        smp_wmb(); /* func and info before csd. */
        __this_cpu_write(cur_csd, csd);
        smp_mb(); /* Update cur_csd before function call. */
                  /* Or before unlock, as the case may be. */
}

/*
 * Record @csd (or NULL to clear) as this CPU's current CSD, but only when
 * CSD-lock debugging is enabled via the csdlock_debug_enabled static key.
 */
static __always_inline void csd_lock_record(call_single_data_t *csd)
{
        if (static_branch_unlikely(&csdlock_debug_enabled))
                __csd_lock_record(csd);
}

/*
 * Return the destination CPU recorded in @csd, or -1 when the CSD's type
 * is neither CSD_TYPE_ASYNC nor CSD_TYPE_SYNC.
 */
static int csd_lock_wait_getcpu(call_single_data_t *csd)
{
        switch (CSD_TYPE(csd)) {
        case CSD_TYPE_ASYNC:
        case CSD_TYPE_SYNC:
                return csd->node.dst;
        default:
                /* Other CSD_TYPE_ values might not have ->dst. */
                return -1;
        }
}

/* Number of CSD-lock waits that have been reported stuck and are not yet released. */
static atomic_t n_csd_lock_stuck;

/**
 * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long?
 *
 * Returns @true if at least one CSD-lock acquisition is currently stuck
 * and has been stuck long enough for a "non-responsive CSD lock" message
 * to have been printed.
 */
bool csd_lock_is_stuck(void)
{
        return atomic_read(&n_csd_lock_stuck) != 0;
}

/*
 * Complain if too much time spent waiting.  Note that only
 * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
 * so waiting on other types gets much less information.
 *
 * @csd:       the CSD whose CSD_FLAG_LOCK bit is being waited on.
 * @ts0:       timestamp (ns) taken when the wait started.
 * @ts1:       in/out timestamp (ns) of the last report or clock resync.
 * @bug_id:    in/out report identifier; 0 until the first report is made.
 * @nmessages: in/out count of reports printed for this wait.
 *
 * Returns true once the lock bit is clear (the wait is over), false if
 * the caller should keep spinning.
 */
static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id, unsigned long *nmessages)
{
        int cpu = -1;
        int cpux;
        bool firsttime;
        u64 ts2, ts_delta;
        call_single_data_t *cpu_cur_csd;
        unsigned int flags = READ_ONCE(csd->node.u_flags);
        /* Widen before multiplying so the conversion to ns cannot wrap in unsigned long. */
        unsigned long long csd_lock_timeout_ns = (unsigned long long)csd_lock_timeout * NSEC_PER_MSEC;

        if (!(flags & CSD_FLAG_LOCK)) {
                /* Lock released; only announce it if we complained earlier. */
                if (!unlikely(*bug_id))
                        return true;
                cpu = csd_lock_wait_getcpu(csd);
                pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
                         *bug_id, raw_smp_processor_id(), cpu);
                atomic_dec(&n_csd_lock_stuck);
                return true;
        }

        ts2 = ktime_get_mono_fast_ns();
        /* How long since we last checked for a stuck CSD lock.*/
        ts_delta = ts2 - *ts1;
        /*
         * The reporting interval grows with the number of messages already
         * printed and, after the first one, with log2 of the online CPU count.
         * A timeout of 0 disables reporting altogether.
         */
        if (likely(ts_delta <= csd_lock_timeout_ns * (*nmessages + 1) *
                               (!*nmessages ? 1 : (ilog2(num_online_cpus()) / 2 + 1)) ||
                   csd_lock_timeout_ns == 0))
                return false;

        if (ts0 > ts2) {
                /* Our own sched_clock went backward; don't blame another CPU. */
                ts_delta = ts0 - ts2;
                pr_alert("sched_clock on CPU %d went backward by %llu ns\n", raw_smp_processor_id(), ts_delta);
                *ts1 = ts2;
                return false;
        }

        firsttime = !*bug_id;
        if (firsttime)
                *bug_id = atomic_inc_return(&csd_bug_count);
        cpu = csd_lock_wait_getcpu(csd);
        /* cpux is a safe per_cpu() index even when the recorded CPU is bogus. */
        if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
                cpux = 0;
        else
                cpux = cpu;
        cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
        /* How long since this CSD lock was stuck. */
        ts_delta = ts2 - ts0;
        pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n",
                 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta,
                 cpu, csd->func, csd->info);
        (*nmessages)++;
        if (firsttime)
                atomic_inc(&n_csd_lock_stuck);
        /*
         * If the CSD lock is still stuck after 5 minutes, it is unlikely
         * to become unstuck. Use a signed comparison to avoid triggering
         * on underflows when the TSC is out of sync between sockets.
         */
        BUG_ON(panic_on_ipistall > 0 && (s64)ts_delta > ((s64)panic_on_ipistall * NSEC_PER_MSEC));
        if (cpu_cur_csd && csd != cpu_cur_csd) {
                pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n",
                         *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)),
                         READ_ONCE(per_cpu(cur_csd_info, cpux)));
        } else {
                pr_alert("\tcsd: CSD lock (#%d) %s.\n",
                         *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request");
        }
        /*
         * Only touch per-CPU state of, or send an IPI to, a CPU number that
         * passed the range check above; an out-of-range value would index
         * per_cpu() out of bounds.
         */
        if (cpu >= 0 && cpu < nr_cpu_ids) {
                if (atomic_cmpxchg_acquire(&per_cpu(trigger_backtrace, cpu), 1, 0))
                        dump_cpu_task(cpu);
                if (!cpu_cur_csd) {
                        pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu);
                        arch_send_call_function_single_ipi(cpu);
                }
        }
        if (firsttime)
                dump_stack();
        *ts1 = ts2;

        return false;
}

/*
 * csd_lock/csd_unlock are used to serialize access to per-cpu csd resources.
 *
 * For non-synchronous ipi calls the csd can still be in use by the
 * previous function call. For multi-cpu calls it's even more interesting
 * as we'll have to ensure no other cpu is observing our csd.
 *
 * __csd_lock_wait() is the debugging flavour of the wait: it spins until
 * csd_lock_wait_toolong() reports that @csd's lock bit is clear, letting
 * that helper print diagnostics along the way.
 */
static void __csd_lock_wait(call_single_data_t *csd)
{
        u64 wait_start = ktime_get_mono_fast_ns();
        u64 last_report = wait_start;
        unsigned long nreports = 0;
        int bug_id = 0;

        while (!csd_lock_wait_toolong(csd, wait_start, &last_report,
                                      &bug_id, &nreports))
                cpu_relax();

        smp_acquire__after_ctrl_dep();
}

/*
 * Wait until @csd's CSD_FLAG_LOCK bit is clear. With CSD-lock debugging
 * enabled the wait goes through __csd_lock_wait(), which can report a
 * stuck lock; otherwise it is a plain acquire-ordered spin on the flags.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
        if (static_branch_unlikely(&csdlock_debug_enabled)) {
                __csd_lock_wait(csd);
                return;
        }

        smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#else
/* !CONFIG_CSD_LOCK_WAIT_DEBUG: nothing is recorded. */
static void csd_lock_record(call_single_data_t *csd)
{
}

/*
 * !CONFIG_CSD_LOCK_WAIT_DEBUG: spin, with acquire ordering, until @csd's
 * CSD_FLAG_LOCK bit is clear.
 */
static __always_inline void csd_lock_wait(call_single_data_t *csd)
{
        smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK));
}
#endif

/*
 * Take @csd's lock: wait for any previous holder to release it, then set
 * CSD_FLAG_LOCK. The flag update is a plain read-modify-write, not an
 * atomic operation.
 */
static __always_inline void csd_lock(call_single_data_t *csd)
{
        csd_lock_wait(csd);
        csd->node.u_flags |= CSD_FLAG_LOCK;

        /*
         * prevent CPU from reordering the above assignment
         * to ->flags with any subsequent assignments to other
         * fields of the specified call_single_data_t structure:
         */
        smp_wmb();
}

/*
 * Release @csd's lock. Warns if the lock was not held. The release store
 * writes 0 to the whole flags word, so the type bits are cleared together
 * with CSD_FLAG_LOCK.
 */
static __always_inline void csd_unlock(call_single_data_t *csd)
{
        WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK));

        /*
         * ensure we're all done before releasing data:
         */
        smp_store_release(&csd->node.u_flags, 0);
}

/* Per-CPU CSD used by smp_call_function_single() for !wait calls. */
static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data);

/*
 * Queue @node on @cpu's call_single_queue and, if the queue was empty
 * before the addition, kick @cpu with a call-function-single IPI.
 */
void __smp_call_single_queue(int cpu, struct llist_node *node)
{
        struct llist_head *queue;

        /*
         * The CSD type has to be looked at before the entry is queued:
         * once it is on the list, flush_smp_call_function_queue() may clear
         * its flags even though the smp_call IPI has not been sent yet
         * (e.g. the stopper executes migration_cpu_stop() on the remote CPU).
         */
        if (trace_csd_queue_cpu_enabled()) {
                call_single_data_t *csd;
                smp_call_func_t func;

                csd = container_of(node, call_single_data_t, node.llist);
                if (CSD_TYPE(csd) == CSD_TYPE_TTWU)
                        func = sched_ttwu_pending;
                else
                        func = csd->func;

                trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);
        }

        /*
         * The target CPU must see the list addition when it pops the list
         * head in its IPI handler; the cache coherency guarantees implied
         * by the underlying llist operations provide that.
         *
         * On an architecture where IPIs can overtake the cache coherency
         * protocol, the arch code has to add enough synchronisation to make
         * IPIs appear coherent with respect to locking and barrier
         * primitives. Generic code isn't really equipped to do the right
         * thing...
         */
        queue = &per_cpu(call_single_queue, cpu);
        if (llist_add(node, queue))
                send_call_function_single_ipi(cpu);
}

/*
 * Insert a previously allocated call_single_data_t element
 * for execution on the given CPU. data must already have
 * ->func, ->info, and ->flags set.
 *
 * If @cpu is the calling CPU the function is run right here with
 * interrupts disabled instead of being queued. If @cpu is out of range
 * or offline, @csd is unlocked and nothing is run.
 *
 * Returns 0 on success, -ENXIO if @cpu is invalid or not online.
 */
static int generic_exec_single(int cpu, call_single_data_t *csd)
{
        if (cpu == smp_processor_id()) {
                /* Snapshot func/info: @csd may be reused once it is unlocked. */
                smp_call_func_t func = csd->func;
                void *info = csd->info;
                unsigned long flags;

                /*
                 * We can unlock early even for the synchronous on-stack case,
                 * since we're doing this from the same CPU..
                 */
                csd_lock_record(csd);
                csd_unlock(csd);
                local_irq_save(flags);
                /* NULL csd: only the tracepoints in csd_do_func() see this argument. */
                csd_do_func(func, info, NULL);
                csd_lock_record(NULL);
                local_irq_restore(flags);
                return 0;
        }

        /* The unsigned cast makes a negative @cpu fail the range check too. */
        if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) {
                csd_unlock(csd);
                return -ENXIO;
        }

        __smp_call_single_queue(cpu, &csd->node.llist);

        return 0;
}

/**
 * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
 *
 * Invoked by arch to handle an IPI for call function single.
 * Must be called with interrupts disabled.
 *
 * Flushes this CPU's call-function queue with the offline-CPU warning
 * enabled.
 */
void generic_smp_call_function_single_interrupt(void)
{
        __flush_smp_call_function_queue(true);
}

/**
 * __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *
 * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
 *                      offline CPU. Skip this check if set to 'false'.
 *
 * Flush any pending smp-call-function callbacks queued on this CPU. This is
 * invoked by the generic IPI handler, as well as by a CPU about to go offline,
 * to ensure that all pending IPI callbacks are run before it goes completely
 * offline.
 *
 * Loop through the call_single_queue and run all the queued callbacks.
 * Must be called with interrupts disabled.
 *
 * The queue is processed in three passes: CSD_TYPE_SYNC entries first,
 * then CSD_TYPE_ASYNC and CSD_TYPE_IRQ_WORK entries, and finally whatever
 * is left (CSD_TYPE_TTWU) is handed to sched_ttwu_pending() in one go.
 */
static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
        call_single_data_t *csd, *csd_next;
        struct llist_node *entry, *prev;
        struct llist_head *head;
        /* Function-wide (not per-CPU) latch: the offline warning fires once. */
        static bool warned;
        atomic_t *tbt;

        lockdep_assert_irqs_disabled();

        /* Allow waiters to send backtrace NMI from here onwards */
        tbt = this_cpu_ptr(&trigger_backtrace);
        atomic_set_release(tbt, 1);

        /* Detach the whole pending list in one step, then reverse it. */
        head = this_cpu_ptr(&call_single_queue);
        entry = llist_del_all(head);
        entry = llist_reverse_order(entry);

        /* There shouldn't be any pending callbacks on an offline CPU. */
        if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
                     !warned && entry != NULL)) {
                warned = true;
                WARN(1, "IPI on offline CPU %d\n", smp_processor_id());

                /*
                 * We don't have to use the _safe() variant here
                 * because we are not invoking the IPI handlers yet.
                 */
                llist_for_each_entry(csd, entry, node.llist) {
                        switch (CSD_TYPE(csd)) {
                        case CSD_TYPE_ASYNC:
                        case CSD_TYPE_SYNC:
                        case CSD_TYPE_IRQ_WORK:
                                pr_warn("IPI callback %pS sent to offline CPU\n",
                                        csd->func);
                                break;

                        case CSD_TYPE_TTWU:
                                pr_warn("IPI task-wakeup sent to offline CPU\n");
                                break;

                        default:
                                pr_warn("IPI callback, unknown type %d, sent to offline CPU\n",
                                        CSD_TYPE(csd));
                                break;
                        }
                }
        }

        /*
         * First; run all SYNC callbacks, people are waiting for us.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
                /* Do we wait until *after* callback? */
                if (CSD_TYPE(csd) == CSD_TYPE_SYNC) {
                        smp_call_func_t func = csd->func;
                        void *info = csd->info;

                        /* Unlink this entry from the local list before running it. */
                        if (prev) {
                                prev->next = &csd_next->node.llist;
                        } else {
                                entry = &csd_next->node.llist;
                        }

                        /* SYNC: the lock is released only after the callback ran. */
                        csd_lock_record(csd);
                        csd_do_func(func, info, csd);
                        csd_unlock(csd);
                        csd_lock_record(NULL);
                } else {
                        /* Not SYNC: keep it on the list for the later passes. */
                        prev = &csd->node.llist;
                }
        }

        if (!entry)
                return;

        /*
         * Second; run all !SYNC callbacks.
         */
        prev = NULL;
        llist_for_each_entry_safe(csd, csd_next, entry, node.llist) {
                int type = CSD_TYPE(csd);

                if (type != CSD_TYPE_TTWU) {
                        /* Unlink this entry from the local list before running it. */
                        if (prev) {
                                prev->next = &csd_next->node.llist;
                        } else {
                                entry = &csd_next->node.llist;
                        }

                        if (type == CSD_TYPE_ASYNC) {
                                smp_call_func_t func = csd->func;
                                void *info = csd->info;

                                /* ASYNC: unlock first, then run from the saved func/info. */
                                csd_lock_record(csd);
                                csd_unlock(csd);
                                csd_do_func(func, info, csd);
                                csd_lock_record(NULL);
                        } else if (type == CSD_TYPE_IRQ_WORK) {
                                irq_work_single(csd);
                        }

                } else {
                        /* TTWU entries stay on the list for the third pass. */
                        prev = &csd->node.llist;
                }
        }

        /*
         * Third; only CSD_TYPE_TTWU is left, issue those.
         */
        if (entry) {
                /* The remaining list head itself is the argument passed on. */
                csd = llist_entry(entry, typeof(*csd), node.llist);
                csd_do_func(sched_ttwu_pending, entry, csd);
        }
}


/**
 * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
 *                                   from task context (idle, migration thread)
 *
 * When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
 * set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
 * setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
 * handle queued SMP function calls before scheduling.
 *
 * The migration thread has to ensure that an eventually pending wakeup has
 * been handled before it migrates a task.
 *
 * Nothing is done when this CPU's queue is empty. Otherwise the queue is
 * flushed with interrupts disabled, and if soft interrupts are pending
 * afterwards, do_softirq_post_smp_call_flush() is called with the set that
 * was pending before the flush.
 */
void flush_smp_call_function_queue(void)
{
        unsigned int pending_before;
        unsigned long irqflags;

        if (llist_empty(this_cpu_ptr(&call_single_queue)))
                return;

        local_irq_save(irqflags);

        /* Get the already pending soft interrupts for RT enabled kernels */
        pending_before = local_softirq_pending();
        __flush_smp_call_function_queue(true);
        if (local_softirq_pending())
                do_softirq_post_smp_call_flush(pending_before);

        local_irq_restore(irqflags);
}

/**
 * smp_call_function_single - Run a function on a specific CPU
 * @cpu: The CPU to run @func on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed on the target CPU.
 *
 * Return: 0 on success, else a negative status code.
 */
int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
                             int wait)
{
        call_single_data_t *csd;
        /* On-stack CSD for the @wait case; it starts out locked and SYNC. */
        call_single_data_t csd_stack = {
                .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, },
        };
        int this_cpu;
        int err;

        /*
         * prevent preemption and reschedule on another processor,
         * as well as CPU removal
         */
        this_cpu = get_cpu();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
                     && !oops_in_progress);

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() on because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        csd = &csd_stack;
        if (!wait) {
                /* The stack CSD cannot outlive this call; use this CPU's csd_data. */
                csd = this_cpu_ptr(&csd_data);
                csd_lock(csd);
        }

        csd->func = func;
        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
        csd->node.src = smp_processor_id();
        csd->node.dst = cpu;
#endif

        err = generic_exec_single(cpu, csd);

        /* generic_exec_single() unlocks the CSD itself on its local and error paths. */
        if (wait)
                csd_lock_wait(csd);

        put_cpu();

        return err;
}
EXPORT_SYMBOL(smp_call_function_single);

/**
 * smp_call_function_single_async() - Queue an asynchronous function call on
 *                                    a specific CPU.
 * @cpu: The CPU to run on.
 * @csd: Caller-provided, already set up call data
 *
 * Asynchronous counterpart of smp_call_function_single(); since it does not
 * wait, it can be used from contexts with disabled interrupts.
 *
 * The @csd storage belongs to the caller (e.g. embedded in a larger object),
 * who is responsible for making sure that IPIs issued with the same @csd are
 * strictly serialized.
 *
 * If @csd is still marked in flight (CSD_FLAG_LOCK set) from an earlier
 * smp_call_function_single_async() call, nothing is queued and -EBUSY is
 * returned immediately.
 *
 * NOTE: There is currently no debugging facility that validates the
 * correctness of this serialization.
 *
 * Return: %0 on success or negative errno value on error
 */
int smp_call_function_single_async(int cpu, call_single_data_t *csd)
{
        int err;

        preempt_disable();

        if (csd->node.u_flags & CSD_FLAG_LOCK) {
                /* The previous request using this csd has not finished yet. */
                err = -EBUSY;
        } else {
                /* Claim the csd; the barrier orders this before queueing. */
                csd->node.u_flags = CSD_FLAG_LOCK;
                smp_wmb();

                err = generic_exec_single(cpu, csd);
        }

        preempt_enable();

        return err;
}
EXPORT_SYMBOL_GPL(smp_call_function_single_async);

/*
 * smp_call_function_any - Run a function on any of the given cpus
 * @mask: The mask of cpus it can run on.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait until function has completed.
 *
 * Returns 0 on success, else a negative status code (if no cpus were online).
 *
 * The target is chosen in this order of preference:
 *        1) the current cpu, if it is in @mask
 *        2) an online cpu of the current node that is in @mask
 *        3) any other online cpu in @mask
 */
int smp_call_function_any(const struct cpumask *mask,
                          smp_call_func_t func, void *info, int wait)
{
        const struct cpumask *node_cpus;
        unsigned int target;
        bool picked;
        int status;

        /* Cheapest option first: the CPU we are already running on. */
        target = get_cpu();
        picked = cpumask_test_cpu(target, mask);

        if (!picked) {
                /* Next best: an online CPU of @mask on the local node. */
                node_cpus = cpumask_of_node(cpu_to_node(target));
                target = cpumask_first_and(node_cpus, mask);
                while (target < nr_cpu_ids && !cpu_online(target))
                        target = cpumask_next_and(target, node_cpus, mask);
                picked = target < nr_cpu_ids;
        }

        /* Any online will do: smp_call_function_single handles nr_cpu_ids. */
        if (!picked)
                target = cpumask_any_and(mask, cpu_online_mask);

        status = smp_call_function_single(target, func, info, wait);
        put_cpu();
        return status;
}
EXPORT_SYMBOL_GPL(smp_call_function_any);

/*
 * Flags to be used as scf_flags argument of smp_call_function_many_cond().
 * Each flag is a separate bit, so they can be OR-ed together.
 *
 * %SCF_WAIT:                Wait until function execution is completed
 * %SCF_RUN_LOCAL:        Run also locally if local cpu is set in cpumask
 */
#define SCF_WAIT        (1U << 0)
#define SCF_RUN_LOCAL        (1U << 1)

/*
 * smp_call_function_many_cond - run @func on the online CPUs in @mask
 * @mask: candidate CPUs; only those also set in cpu_online_mask are used
 * @func: function to run
 * @info: argument passed to @func (and to @cond_func)
 * @scf_flags: combination of SCF_WAIT and SCF_RUN_LOCAL
 * @cond_func: optional filter; when non-NULL a CPU is only included if
 *             cond_func(cpu, info) returns true
 *
 * The calling CPU is never sent an IPI; it runs @func directly (with
 * interrupts disabled) when SCF_RUN_LOCAL is set, it is in @mask and
 * @cond_func accepts it. With SCF_WAIT the function returns only after the
 * csd of every selected remote CPU has been released.
 *
 * Must be called with preemption disabled (asserted below).
 */
static void smp_call_function_many_cond(const struct cpumask *mask,
                                        smp_call_func_t func, void *info,
                                        unsigned int scf_flags,
                                        smp_cond_func_t cond_func)
{
        int cpu, last_cpu, this_cpu = smp_processor_id();
        struct call_function_data *cfd;
        bool wait = scf_flags & SCF_WAIT;
        int nr_cpus = 0;
        bool run_remote = false;
        bool run_local = false;

        lockdep_assert_preemption_disabled();

        /*
         * Can deadlock when called with interrupts disabled.
         * We allow cpu's that are not yet online though, as no one else can
         * send smp call function interrupt to this cpu and as such deadlocks
         * can't happen.
         */
        if (cpu_online(this_cpu) && !oops_in_progress &&
            !early_boot_irqs_disabled)
                lockdep_assert_irqs_enabled();

        /*
         * When @wait we can deadlock when we interrupt between llist_add() and
         * arch_send_call_function_ipi*(); when !@wait we can deadlock due to
         * csd_lock() because the interrupt context uses the same csd
         * storage.
         */
        WARN_ON_ONCE(!in_task());

        /* Check if we need local execution. */
        if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) &&
            (!cond_func || cond_func(this_cpu, info)))
                run_local = true;

        /* Check if we need remote execution, i.e., any CPU excluding this one. */
        cpu = cpumask_first_and(mask, cpu_online_mask);
        if (cpu == this_cpu)
                cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
        if (cpu < nr_cpu_ids)
                run_remote = true;

        if (run_remote) {
                /* Build the set of remote targets in this CPU's cfd_data. */
                cfd = this_cpu_ptr(&cfd_data);
                cpumask_and(cfd->cpumask, mask, cpu_online_mask);
                __cpumask_clear_cpu(this_cpu, cfd->cpumask);

                cpumask_clear(cfd->cpumask_ipi);
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd = per_cpu_ptr(cfd->csd, cpu);

                        /* Drop CPUs rejected by the filter so we do not wait on them. */
                        if (cond_func && !cond_func(cpu, info)) {
                                __cpumask_clear_cpu(cpu, cfd->cpumask);
                                continue;
                        }

                        csd_lock(csd);
                        if (wait)
                                csd->node.u_flags |= CSD_TYPE_SYNC;
                        csd->func = func;
                        csd->info = info;
#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
                        csd->node.src = smp_processor_id();
                        csd->node.dst = cpu;
#endif
                        trace_csd_queue_cpu(cpu, _RET_IP_, func, csd);

                        /*
                         * Only CPUs for which llist_add() returns true are
                         * added to the IPI mask; last_cpu is only meaningful
                         * once at least one such CPU has been seen.
                         */
                        if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
                                __cpumask_set_cpu(cpu, cfd->cpumask_ipi);
                                nr_cpus++;
                                last_cpu = cpu;
                        }
                }

                /*
                 * Choose the most efficient way to send an IPI. Note that the
                 * number of CPUs might be zero due to concurrent changes to the
                 * provided mask.
                 */
                if (nr_cpus == 1)
                        send_call_function_single_ipi(last_cpu);
                else if (likely(nr_cpus > 1))
                        send_call_function_ipi_mask(cfd->cpumask_ipi);
        }

        /* Run locally after the IPIs went out, with interrupts disabled. */
        if (run_local) {
                unsigned long flags;

                local_irq_save(flags);
                csd_do_func(func, info, NULL);
                local_irq_restore(flags);
        }

        /* Wait for every remote CPU that was actually handed a csd. */
        if (run_remote && wait) {
                for_each_cpu(cpu, cfd->cpumask) {
                        call_single_data_t *csd;

                        csd = per_cpu_ptr(cfd->csd, cpu);
                        csd_lock_wait(csd);
                }
        }
}

/**
 * smp_call_function_many(): Run a function on a set of CPUs.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed on
 *        other CPUs.
 *
 * @wait is translated into the %SCF_WAIT flag of
 * smp_call_function_many_cond(); %SCF_RUN_LOCAL is never passed and no
 * condition function is used.
 *
 * If @wait is true, then returns once @func has returned.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler. Preemption
 * must be disabled when calling this function.
 */
void smp_call_function_many(const struct cpumask *mask,
                            smp_call_func_t func, void *info, bool wait)
{
        unsigned int scf_flags = wait ? SCF_WAIT : 0;

        smp_call_function_many_cond(mask, func, info, scf_flags, NULL);
}
EXPORT_SYMBOL(smp_call_function_many);

/**
 * smp_call_function(): Run a function on all other CPUs.
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * This function does not return a value. It disables preemption around a
 * call to smp_call_function_many() with cpu_online_mask.
 *
 * If @wait is true, then returns once @func has returned; otherwise
 * it returns just before the target cpu calls @func.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.
 */
void smp_call_function(smp_call_func_t func, void *info, int wait)
{
        preempt_disable();
        smp_call_function_many(cpu_online_mask, func, info, wait);
        preempt_enable();
}
EXPORT_SYMBOL(smp_call_function);

/*
 * Configured maximum number of CPUs to activate. Defaults to NR_CPUS; the
 * "nosmp" and "maxcpus=" early parameters below overwrite it.
 */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);


/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

/* Weak default that does nothing; being __weak, another definition may replace it. */
void __weak __init arch_disable_smp_support(void) { }

/*
 * Early-param handler for "nosmp": force setup_max_cpus to 0 and call
 * arch_disable_smp_support(). @str is not used. Always returns 0.
 */
static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);

/*
 * Early-param handler for "nr_cpus=": this is the hard limit. The value is
 * only applied when it parses, is positive and is lower than the current
 * nr_cpu_ids; anything else is silently ignored. Always returns 0.
 */
static int __init nrcpus(char *str)
{
        int limit;

        if (!get_option(&str, &limit))
                return 0;

        if (limit <= 0 || limit >= nr_cpu_ids)
                return 0;

        set_nr_cpu_ids(limit);

        return 0;
}

early_param("nr_cpus", nrcpus);

/*
 * Early-param handler for "maxcpus=": parse the value directly into
 * setup_max_cpus (the get_option() result is not checked). If
 * setup_max_cpus ends up 0, arch_disable_smp_support() is called as well.
 * Always returns 0.
 */
static int __init maxcpus(char *str)
{
        get_option(&str, &setup_max_cpus);
        if (setup_max_cpus == 0)
                arch_disable_smp_support();

        return 0;
}

early_param("maxcpus", maxcpus);

#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS)
/*
 * Number of possible processor ids. Only defined as a variable when more
 * than one CPU is configured and CONFIG_FORCE_NR_CPUS is not set; starts
 * out as NR_CPUS.
 */
unsigned int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);
#endif

/*
 * Set nr_cpu_ids to one more than the highest bit set in cpu_possible_mask.
 * An arch may set nr_cpu_ids earlier if needed, so this would be redundant.
 */
void __init setup_nr_cpu_ids(void)
{
        unsigned long last_possible;

        last_possible = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS);
        set_nr_cpu_ids(last_possible + 1);
}

/*
 * Called by boot processor to activate the rest: initialise the idle and
 * hotplug threads, bring up the non-boot CPUs (bounded by setup_max_cpus),
 * log how many nodes and CPUs are online, then let smp_cpus_done() finish.
 */
void __init smp_init(void)
{
        int nodes_up, cpus_up;

        idle_threads_init();
        cpuhp_threads_init();

        pr_info("Bringing up secondary CPUs ...\n");

        bringup_nonboot_cpus(setup_max_cpus);

        nodes_up = num_online_nodes();
        cpus_up = num_online_cpus();
        pr_info("Brought up %d node%s, %d CPU%s\n",
                nodes_up, str_plural(nodes_up),
                cpus_up, str_plural(cpus_up));

        /* Any cleanup work */
        smp_cpus_done(setup_max_cpus);
}

/*
 * on_each_cpu_cond_mask(): Call a function on each processor in @mask for
 * which the supplied function cond_func returns true, optionally waiting
 * for all the required CPUs to finish. This may include the local
 * processor.
 * @cond_func:        A callback function that is passed a cpu id and
 *                the info parameter. The function is called
 *                with preemption disabled. The function should
 *                return a boolean value indicating whether to IPI
 *                the specified CPU.
 * @func:        The function to run on all applicable CPUs.
 *                This must be fast and non-blocking.
 * @info:        An arbitrary pointer to pass to both functions.
 * @wait:        If true, wait (atomically) until function has
 *                completed on other CPUs.
 * @mask:        The set of CPUs to consider.
 *
 * Preemption is disabled to protect against CPUs going offline but not online.
 * CPUs going online during the call will not be seen or sent an IPI.
 *
 * You must not call this function with disabled interrupts or
 * from a hardware interrupt handler or from a bottom half handler.
 */
void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
                           void *info, bool wait, const struct cpumask *mask)
{
        /* Local execution is always requested; waiting only when asked for. */
        unsigned int scf_flags = wait ? (SCF_RUN_LOCAL | SCF_WAIT)
                                      : SCF_RUN_LOCAL;

        preempt_disable();
        smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
        preempt_enable();
}
EXPORT_SYMBOL(on_each_cpu_cond_mask);

/* Empty callback; kick_all_cpus_sync() passes it to smp_call_function(). */
static void do_nothing(void *unused)
{
}

/**
 * kick_all_cpus_sync - Force all cpus out of idle
 *
 * Used to synchronize the update of pm_idle function pointer. It's
 * called after the pointer is updated and returns after the dummy
 * callback function has been executed on all cpus. The execution of
 * the function can only happen on the remote cpus after they have
 * left the idle function which had been called via pm_idle function
 * pointer. So it's guaranteed that nothing uses the previous pointer
 * anymore.
 */
void kick_all_cpus_sync(void)
{
        /* Make sure the change is visible before we kick the cpus */
        smp_mb();
        /* wait == 1: do not return before do_nothing() ran on the other cpus. */
        smp_call_function(do_nothing, NULL, 1);
}
EXPORT_SYMBOL_GPL(kick_all_cpus_sync);

/**
 * wake_up_all_idle_cpus - break all cpus out of idle
 *
 * Walks every possible cpu and calls wake_up_if_idle() on each one that is
 * online and is not the cpu executing this function. Preemption is disabled
 * separately for each cpu that is examined.
 */
void wake_up_all_idle_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                preempt_disable();
                if (cpu != smp_processor_id()) {
                        if (cpu_online(cpu))
                                wake_up_if_idle(cpu);
                }
                preempt_enable();
        }
}
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);

/**
 * struct smp_call_on_cpu_struct - Call a function on a specific CPU
 * @work: &work_struct queued on the target CPU
 * @done: &completion signalled once @func has returned
 * @func: function to call
 * @data: function's data argument
 * @ret: return value from @func
 * @cpu: target CPU (%-1 for any CPU); only non-negative values are passed
 *       to hypervisor_pin_vcpu()
 *
 * Used to call a function on a specific cpu and wait for it to return.
 * Optionally make sure the call is done on a specified physical cpu via vcpu
 * pinning in order to support virtualized environments.
 */
struct smp_call_on_cpu_struct {
        struct work_struct        work;
        struct completion        done;
        int                        (*func)(void *);
        void                        *data;
        int                        ret;
        int                        cpu;
};

/*
 * Work handler for smp_call_on_cpu(): run the requested function, store its
 * return value and signal completion. When a non-negative cpu was recorded,
 * the call is bracketed by hypervisor_pin_vcpu(cpu) / hypervisor_pin_vcpu(-1).
 */
static void smp_call_on_cpu_callback(struct work_struct *work)
{
        struct smp_call_on_cpu_struct *sscs =
                container_of(work, struct smp_call_on_cpu_struct, work);
        const bool pin = sscs->cpu >= 0;

        if (pin)
                hypervisor_pin_vcpu(sscs->cpu);

        sscs->ret = sscs->func(sscs->data);

        if (pin)
                hypervisor_pin_vcpu(-1);

        complete(&sscs->done);
}

/**
 * smp_call_on_cpu - Call a function on a specific CPU and wait for it
 * @cpu: target CPU; must be below nr_cpu_ids and online
 * @func: function to call
 * @par: argument passed to @func
 * @phys: if true, record @cpu so that the work handler brackets the call
 *        with hypervisor_pin_vcpu(); otherwise -1 is recorded and no
 *        pinning is requested
 *
 * Queues an on-stack work item on @cpu via system_wq and blocks until the
 * work handler has signalled completion.
 *
 * Return: the value returned by @func, or -ENXIO if @cpu is out of range or
 * not online.
 */
int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
{
        struct smp_call_on_cpu_struct sscs = {
                .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done),
                .func = func,
                .data = par,
                .cpu  = phys ? cpu : -1,
        };

        /*
         * Validate @cpu before INIT_WORK_ONSTACK(): an initialised on-stack
         * work item has to be paired with destroy_work_on_stack(), so
         * bailing out after the initialisation would leave a stale
         * debug object behind for this stack frame.
         */
        if (cpu >= nr_cpu_ids || !cpu_online(cpu))
                return -ENXIO;

        INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback);

        queue_work_on(cpu, system_wq, &sscs.work);
        wait_for_completion(&sscs.done);
        destroy_work_on_stack(&sscs.work);

        return sscs.ret;
}
EXPORT_SYMBOL_GPL(smp_call_on_cpu);












 1518 





























































   21 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_PREEMPT_H
#define __ASM_PREEMPT_H

#include <linux/jump_label.h>
#include <linux/thread_info.h>

/*
 * NOTE(review): bit 32 presumably lines up with the need_resched member that
 * sits above the 32-bit count in the combined preempt_count word -- confirm
 * against the thread_info layout. PREEMPT_ENABLED is defined as exactly this
 * bit, i.e. a zero count with the bit set.
 */
#define PREEMPT_NEED_RESCHED        BIT(32)
#define PREEMPT_ENABLED        (PREEMPT_NEED_RESCHED)

/* Return the current task's preempt.count field (read with READ_ONCE()). */
static inline int preempt_count(void)
{
        return READ_ONCE(current_thread_info()->preempt.count);
}

/*
 * Set the current task's preempt count to @pc. Only the preempt.count
 * member is written, so the need_resched member is left alone.
 */
static inline void preempt_count_set(u64 pc)
{
        /* Preserve existing value of PREEMPT_NEED_RESCHED */
        WRITE_ONCE(current_thread_info()->preempt.count, pc);
}

/* Initialise the whole preempt_count word of task @p to FORK_PREEMPT_COUNT. */
#define init_task_preempt_count(p) do { \
        task_thread_info(p)->preempt_count = FORK_PREEMPT_COUNT; \
} while (0)

/*
 * Initialise the whole preempt_count word of idle task @p to
 * PREEMPT_DISABLED. The @cpu argument is not used by this implementation.
 */
#define init_idle_preempt_count(p, cpu) do { \
        task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)

/*
 * Mark the current task as needing a reschedule. The flag is stored
 * inverted: test_preempt_need_resched() reports true when the member is 0,
 * so "set" writes 0 here.
 */
static inline void set_preempt_need_resched(void)
{
        current_thread_info()->preempt.need_resched = 0;
}

/*
 * Clear the current task's need-resched indication. The flag is stored
 * inverted (0 means "reschedule needed"), so "clear" writes 1 here.
 */
static inline void clear_preempt_need_resched(void)
{
        current_thread_info()->preempt.need_resched = 1;
}

/*
 * Return true when the current task is marked as needing a reschedule,
 * i.e. when the (inverted) need_resched member is 0.
 */
static inline bool test_preempt_need_resched(void)
{
        return !current_thread_info()->preempt.need_resched;
}

/*
 * Add @val to the current task's preempt.count. The update is a plain
 * READ_ONCE()/WRITE_ONCE() pair on the count member, not an atomic RMW.
 */
static inline void __preempt_count_add(int val)
{
        u32 count = READ_ONCE(current_thread_info()->preempt.count);

        WRITE_ONCE(current_thread_info()->preempt.count, count + val);
}

/*
 * Subtract @val from the current task's preempt.count. The update is a
 * plain READ_ONCE()/WRITE_ONCE() pair on the count member, not an atomic RMW.
 */
static inline void __preempt_count_sub(int val)
{
        u32 count = READ_ONCE(current_thread_info()->preempt.count);

        WRITE_ONCE(current_thread_info()->preempt.count, count - val);
}

/*
 * Decrement the current task's preempt count by one and report whether the
 * combined preempt_count word is zero afterwards.
 *
 * The full word (count plus need_resched) is read, decremented, and only the
 * count member is written back. Returns true if the decremented word is
 * zero, or if a fresh read of the full word is zero.
 */
static inline bool __preempt_count_dec_and_test(void)
{
        struct thread_info *ti = current_thread_info();
        u64 pc = READ_ONCE(ti->preempt_count);

        /* Update only the count field, leaving need_resched unchanged */
        WRITE_ONCE(ti->preempt.count, --pc);

        /*
         * If we wrote back all zeroes, then we're preemptible and in
         * need of a reschedule. Otherwise, we need to reload the
         * preempt_count in case the need_resched flag was cleared by an
         * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE
         * pair.
         */
        return !pc || !READ_ONCE(ti->preempt_count);
}

/*
 * Return true when the full preempt_count word (count and need_resched
 * together) equals @preempt_offset. The comparison is done on the u64 value.
 */
static inline bool should_resched(int preempt_offset)
{
        u64 pc = READ_ONCE(current_thread_info()->preempt_count);
        return pc == preempt_offset;
}

#ifdef CONFIG_PREEMPTION

void preempt_schedule(void);
void preempt_schedule_notrace(void);

/*
 * __preempt_schedule() and __preempt_schedule_notrace() map to the dynamic_*
 * variants when CONFIG_PREEMPT_DYNAMIC is set and to the plain functions
 * declared above otherwise.
 */
#ifdef CONFIG_PREEMPT_DYNAMIC

DECLARE_STATIC_KEY_TRUE(sk_dynamic_irqentry_exit_cond_resched);
void dynamic_preempt_schedule(void);
#define __preempt_schedule()                dynamic_preempt_schedule()
void dynamic_preempt_schedule_notrace(void);
#define __preempt_schedule_notrace()        dynamic_preempt_schedule_notrace()

#else /* CONFIG_PREEMPT_DYNAMIC */

#define __preempt_schedule()                preempt_schedule()
#define __preempt_schedule_notrace()        preempt_schedule_notrace()

#endif /* CONFIG_PREEMPT_DYNAMIC */
#endif /* CONFIG_PREEMPTION */

#endif /* __ASM_PREEMPT_H */






















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_RTNETLINK_H
#define __LINUX_RTNETLINK_H


#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/wait.h>
#include <linux/refcount.h>
#include <uapi/linux/rtnetlink.h>

extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);

/*
 * NULL-tolerant wrapper around rtnetlink_send(): a NULL @skb yields 0
 * without sending anything, otherwise the result of rtnetlink_send() is
 * returned unchanged.
 */
static inline int rtnetlink_maybe_send(struct sk_buff *skb, struct net *net,
                                       u32 pid, u32 group, int echo)
{
        if (!skb)
                return 0;

        return rtnetlink_send(skb, net, pid, group, echo);
}

/*
 * rtnetlink message helpers, implemented outside this header.
 * NOTE(review): their exact semantics are not visible here; see the
 * definitions before relying on any particular behaviour.
 */
extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
                        u32 group, const struct nlmsghdr *nlh, gfp_t flags);
extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
                              u32 id, long expires, u32 error);

/* Link (ifinfo) notification helpers for a net_device. */
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags,
                  u32 portid, const struct nlmsghdr *nlh);
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
                         gfp_t flags, int *new_nsid, int new_ifindex);
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
                                       unsigned change, u32 event,
                                       gfp_t flags, int *new_nsid,
                                       int new_ifindex, u32 portid,
                                       const struct nlmsghdr *nlh);
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev,
                       gfp_t flags, u32 portid, const struct nlmsghdr *nlh);


/* RTNL is used as a global lock for all changes to network configuration. */
extern void rtnl_lock(void);
extern void rtnl_unlock(void);
extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern int rtnl_lock_interruptible(void);
extern int rtnl_lock_killable(void);
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);

/* Shared objects defined elsewhere; only declared here. */
extern wait_queue_head_t netdev_unregistering_wq;
extern atomic_t dev_unreg_count;
extern struct rw_semaphore pernet_ops_rwsem;
extern struct rw_semaphore net_rwsem;

/*
 * Warn (once) with the current file and line when rtnl_is_locked() reports
 * that RTNL is not locked.
 */
#define ASSERT_RTNL() \
        WARN_ONCE(!rtnl_is_locked(), \
                  "RTNL: assertion failed at %s (%d)\n", __FILE__,  __LINE__)

/*
 * With CONFIG_PROVE_LOCKING the real check is provided elsewhere; without
 * it the stub below unconditionally returns true, so checks built on it
 * always pass.
 */
#ifdef CONFIG_PROVE_LOCKING
extern bool lockdep_rtnl_is_held(void);
#else
static inline bool lockdep_rtnl_is_held(void)
{
        return true;
}
#endif /* #ifdef CONFIG_PROVE_LOCKING */

/**
 * rcu_dereference_rtnl - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 *
 * Do an rcu_dereference(p), but check that the caller either holds
 * rcu_read_lock() or RTNL. Note: please prefer rtnl_dereference() or
 * rcu_dereference().
 */
#define rcu_dereference_rtnl(p)                                        \
        rcu_dereference_check(p, lockdep_rtnl_is_held())

/**
 * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL
 * @p: The pointer to read, prior to dereferencing
 *
 * Return: the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE(), because caller holds RTNL.
 */
#define rtnl_dereference(p)                                        \
        rcu_dereference_protected(p, lockdep_rtnl_is_held())

/**
 * rcu_replace_pointer_rtnl - replace an RCU pointer under rtnl_lock, returning
 * its old value
 * @rp: RCU pointer, whose value is returned
 * @p: regular pointer
 *
 * Perform a replacement under rtnl_lock, where @rp is an RCU-annotated
 * pointer. The old value of @rp is returned, and @rp is set to @p.
 */
#define rcu_replace_pointer_rtnl(rp, p)                        \
        rcu_replace_pointer(rp, p, lockdep_rtnl_is_held())

#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
/*
 * Per-network-namespace RTNL interface used when
 * CONFIG_DEBUG_NET_SMALL_RTNL is enabled; the functions are implemented
 * outside this header.
 */
void __rtnl_net_lock(struct net *net);
void __rtnl_net_unlock(struct net *net);
void rtnl_net_lock(struct net *net);
void rtnl_net_unlock(struct net *net);
int rtnl_net_trylock(struct net *net);
int rtnl_net_lock_killable(struct net *net);
int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);

bool rtnl_net_is_locked(struct net *net);

/* Warn (once) with file and line when rtnl_net_is_locked(@net) is false. */
#define ASSERT_RTNL_NET(net)                                                \
        WARN_ONCE(!rtnl_net_is_locked(net),                                \
                  "RTNL_NET: assertion failed at %s (%d)\n",                \
                  __FILE__,  __LINE__)

bool lockdep_rtnl_net_is_held(struct net *net);

/* Per-netns counterparts of the RTNL dereference helpers. */
#define rcu_dereference_rtnl_net(net, p)                                \
        rcu_dereference_check(p, lockdep_rtnl_net_is_held(net))
#define rtnl_net_dereference(net, p)                                        \
        rcu_dereference_protected(p, lockdep_rtnl_net_is_held(net))
#define rcu_replace_pointer_rtnl_net(net, rp, p)                        \
        rcu_replace_pointer(rp, p, lockdep_rtnl_net_is_held(net))
#else
/*
 * Fallbacks for the per-netns RTNL interface: the __rtnl_net_*() variants
 * are empty, and every other helper ignores @net and forwards to the
 * corresponding global RTNL primitive.
 */
static inline void __rtnl_net_lock(struct net *net) {}
static inline void __rtnl_net_unlock(struct net *net) {}

static inline void rtnl_net_lock(struct net *net)
{
        rtnl_lock();
}

static inline void rtnl_net_unlock(struct net *net)
{
        rtnl_unlock();
}

static inline int rtnl_net_trylock(struct net *net)
{
        return rtnl_trylock();
}

static inline int rtnl_net_lock_killable(struct net *net)
{
        return rtnl_lock_killable();
}

static inline void ASSERT_RTNL_NET(struct net *net)
{
        ASSERT_RTNL();
}

/* The @net argument is dropped; these expand to the global RTNL helpers. */
#define rcu_dereference_rtnl_net(net, p)                \
        rcu_dereference_rtnl(p)
#define rtnl_net_dereference(net, p)                        \
        rtnl_dereference(p)
#define rcu_replace_pointer_rtnl_net(net, rp, p)        \
        rcu_replace_pointer_rtnl(rp, p)
#endif

/* Fetch @dev's ingress queue pointer via rtnl_dereference(). */
static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev)
{
        struct netdev_queue *queue = rtnl_dereference(dev->ingress_queue);

        return queue;
}

/* Fetch @dev's ingress queue pointer via rcu_dereference(). */
static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev)
{
        struct netdev_queue *queue = rcu_dereference(dev->ingress_queue);

        return queue;
}

/* Implemented elsewhere; presumably allocates @dev's ingress queue -- confirm at the definition. */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev);

/* Only declared when CONFIG_NET_INGRESS is enabled. */
#ifdef CONFIG_NET_INGRESS
void net_inc_ingress_queue(void);
void net_dec_ingress_queue(void);
#endif

/* Only declared when CONFIG_NET_EGRESS is enabled. */
#ifdef CONFIG_NET_EGRESS
void net_inc_egress_queue(void);
void net_dec_egress_queue(void);
void netdev_xmit_skip_txqueue(bool skip);
#endif

void rtnetlink_init(void);
void __rtnl_unlock(void);
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail);

/*
 * Shared by rtnl_fdb_dump() and various ndo_fdb_dump() helpers.
 * NOTE(review): the fields look like dump cursors (an interface index and
 * an FDB entry index) -- confirm against the users.
 */
struct ndo_fdb_dump_context {
        unsigned long ifindex;
        unsigned long fdb_idx;
};

/*
 * Default ("dflt") FDB and bridge helpers, implemented outside this header.
 * NOTE(review): presumably fallbacks for drivers without their own ndo
 * callbacks -- confirm at the definitions.
 */
extern int ndo_dflt_fdb_dump(struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct net_device *dev,
                             struct net_device *filter_dev,
                             int *idx);
extern int ndo_dflt_fdb_add(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid,
                            u16 flags);
extern int ndo_dflt_fdb_del(struct ndmsg *ndm,
                            struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr,
                            u16 vid);

/* @vlan_fill is a caller-supplied callback taking (skb, dev, filter_mask). */
extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                                   struct net_device *dev, u16 mode,
                                   u32 flags, u32 mask, int nlflags,
                                   u32 filter_mask,
                                   int (*vlan_fill)(struct sk_buff *skb,
                                                    struct net_device *dev,
                                                    u32 filter_mask));

extern void rtnl_offload_xstats_notify(struct net_device *dev);

/*
 * Report whether @net's rtnl socket has listeners on @group, as answered by
 * netlink_has_listeners().
 */
static inline int rtnl_has_listeners(const struct net *net, u32 group)
{
        return netlink_has_listeners(net->rtnl, group);
}

/**
 * rtnl_notify_needed - check if notification is needed
 * @net: Pointer to the net namespace
 * @nlflags: netlink ingress message flags
 * @group: rtnl group
 *
 * A notification is needed when the ingress message carries %NLM_F_ECHO or
 * when @group has listeners in @net.
 *
 * Return: true if a notification is needed, false otherwise.
 */
static inline bool
rtnl_notify_needed(const struct net *net, u16 nlflags, u32 group)
{
        /* An echo request alone is enough; no need to look for listeners. */
        if (nlflags & NLM_F_ECHO)
                return true;

        return rtnl_has_listeners(net, group);
}

/* Implemented elsewhere; presumably sets @dev's operational state to @newstate -- confirm at the definition. */
void netif_set_operstate(struct net_device *dev, int newstate);

#endif        /* __LINUX_RTNETLINK_H */
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 1257 












































  203 


























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * security/tomoyo/common.h
 *
 * Header file for TOMOYO.
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#ifndef _SECURITY_TOMOYO_COMMON_H
#define _SECURITY_TOMOYO_COMMON_H

/*
 * Expand a message format to itself, i.e. add no prefix to it.
 * NOTE(review): presumably consumed by the kernel's pr_*() printk helpers,
 * which would explain why it is defined ahead of the #include lines - confirm.
 */
#define pr_fmt(fmt) fmt

#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/kmod.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/list.h>
#include <linux/cred.h>
#include <linux/poll.h>
#include <linux/binfmts.h>
#include <linux/highmem.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/un.h>
#include <linux/lsm_hooks.h>
#include <net/sock.h>
#include <net/af_unix.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/udp.h>

/********** Constants definitions. **********/

/*
 * The hash is consulted only while a string is being appended to the string
 * table, and that happens rarely. A large (e.g. 64k) hash size is therefore
 * unnecessary; 256 entries are sufficient.
 */
#define TOMOYO_HASH_BITS 8
#define TOMOYO_MAX_HASH (1u << TOMOYO_HASH_BITS)

/*
 * Only SOCK_STREAM, SOCK_DGRAM, SOCK_RAW and SOCK_SEQPACKET are checked by
 * TOMOYO, so SOCK_MAX is not needed.
 */
#define TOMOYO_SOCK_MAX 6

/* Size in bytes of the temporary buffer "struct tomoyo_execve"->tmp. */
#define TOMOYO_EXEC_TMPSIZE 4096

/* Marks an element which the garbage collector is trying to kfree(). */
#define TOMOYO_GC_IN_PROGRESS -1

/* A profile number is an integer in the range 0..255. */
#define TOMOYO_MAX_PROFILES 256

/* A group number is an integer in the range 0..255. */
#define TOMOYO_MAX_ACL_GROUPS 256

/*
 * Index numbers for "struct tomoyo_condition".
 *
 * Everything below TOMOYO_MAX_CONDITION_KEYWORD is a condition keyword. The
 * entries after it are the operand kinds referred to by the comments of
 * "struct tomoyo_condition_element" (number/name unions, argv[]/envp[]).
 * Values are assigned by position, so the order must be kept.
 */
enum tomoyo_conditions_index {
        /* Identity of the current task. */
        TOMOYO_TASK_UID = 0,         /* current_uid()   */
        TOMOYO_TASK_EUID,            /* current_euid()  */
        TOMOYO_TASK_SUID,            /* current_suid()  */
        TOMOYO_TASK_FSUID,           /* current_fsuid() */
        TOMOYO_TASK_GID,             /* current_gid()   */
        TOMOYO_TASK_EGID,            /* current_egid()  */
        TOMOYO_TASK_SGID,            /* current_sgid()  */
        TOMOYO_TASK_FSGID,           /* current_fsgid() */
        TOMOYO_TASK_PID,             /* sys_getpid()   */
        TOMOYO_TASK_PPID,            /* sys_getppid()  */
        /* Argument/environment counts of an execve() request. */
        TOMOYO_EXEC_ARGC,            /* "struct linux_binprm *"->argc */
        TOMOYO_EXEC_ENVC,            /* "struct linux_binprm *"->envc */
        /* File type constants. */
        TOMOYO_TYPE_IS_SOCKET,       /* S_IFSOCK */
        TOMOYO_TYPE_IS_SYMLINK,      /* S_IFLNK */
        TOMOYO_TYPE_IS_FILE,         /* S_IFREG */
        TOMOYO_TYPE_IS_BLOCK_DEV,    /* S_IFBLK */
        TOMOYO_TYPE_IS_DIRECTORY,    /* S_IFDIR */
        TOMOYO_TYPE_IS_CHAR_DEV,     /* S_IFCHR */
        TOMOYO_TYPE_IS_FIFO,         /* S_IFIFO */
        /* File mode bit constants. */
        TOMOYO_MODE_SETUID,          /* S_ISUID */
        TOMOYO_MODE_SETGID,          /* S_ISGID */
        TOMOYO_MODE_STICKY,          /* S_ISVTX */
        TOMOYO_MODE_OWNER_READ,      /* S_IRUSR */
        TOMOYO_MODE_OWNER_WRITE,     /* S_IWUSR */
        TOMOYO_MODE_OWNER_EXECUTE,   /* S_IXUSR */
        TOMOYO_MODE_GROUP_READ,      /* S_IRGRP */
        TOMOYO_MODE_GROUP_WRITE,     /* S_IWGRP */
        TOMOYO_MODE_GROUP_EXECUTE,   /* S_IXGRP */
        TOMOYO_MODE_OTHERS_READ,     /* S_IROTH */
        TOMOYO_MODE_OTHERS_WRITE,    /* S_IWOTH */
        TOMOYO_MODE_OTHERS_EXECUTE,  /* S_IXOTH */
        /* Pathname-valued keywords. */
        TOMOYO_EXEC_REALPATH,
        TOMOYO_SYMLINK_TARGET,
        /* Attributes of the first pathname. */
        TOMOYO_PATH1_UID,
        TOMOYO_PATH1_GID,
        TOMOYO_PATH1_INO,
        TOMOYO_PATH1_MAJOR,
        TOMOYO_PATH1_MINOR,
        TOMOYO_PATH1_PERM,
        TOMOYO_PATH1_TYPE,
        TOMOYO_PATH1_DEV_MAJOR,
        TOMOYO_PATH1_DEV_MINOR,
        /* Attributes of the second pathname. */
        TOMOYO_PATH2_UID,
        TOMOYO_PATH2_GID,
        TOMOYO_PATH2_INO,
        TOMOYO_PATH2_MAJOR,
        TOMOYO_PATH2_MINOR,
        TOMOYO_PATH2_PERM,
        TOMOYO_PATH2_TYPE,
        TOMOYO_PATH2_DEV_MAJOR,
        TOMOYO_PATH2_DEV_MINOR,
        /* Attributes of the first pathname's parent directory. */
        TOMOYO_PATH1_PARENT_UID,
        TOMOYO_PATH1_PARENT_GID,
        TOMOYO_PATH1_PARENT_INO,
        TOMOYO_PATH1_PARENT_PERM,
        /* Attributes of the second pathname's parent directory. */
        TOMOYO_PATH2_PARENT_UID,
        TOMOYO_PATH2_PARENT_GID,
        TOMOYO_PATH2_PARENT_INO,
        TOMOYO_PATH2_PARENT_PERM,
        /* Number of keywords above. */
        TOMOYO_MAX_CONDITION_KEYWORD,
        /* Operand kinds whose data is attached after the element array. */
        TOMOYO_NUMBER_UNION,
        TOMOYO_NAME_UNION,
        TOMOYO_ARGV_ENTRY,
        TOMOYO_ENVP_ENTRY,
};


/*
 * Index numbers for stat().
 *
 * TOMOYO_MAX_PATH_STAT sizes the stat[] and stat_valid[] arrays of
 * "struct tomoyo_obj_info".
 */
enum tomoyo_path_stat_index {
        /* The order below is significant; do not rearrange. */
        TOMOYO_PATH1 = 0,
        TOMOYO_PATH1_PARENT,
        TOMOYO_PATH2,
        TOMOYO_PATH2_PARENT,
        TOMOYO_MAX_PATH_STAT
};

/*
 * Index numbers for operation mode.
 *
 * The first four values are the modes and TOMOYO_CONFIG_MAX_MODE counts them.
 * The last three carry explicit values outside that sequence.
 * NOTE(review): 64 and 128 look like flag bits stored next to a mode in a u8
 * config value, and 255 like a "use the default" marker - confirm against the
 * users of "struct tomoyo_profile".
 */
enum tomoyo_mode_index {
        TOMOYO_CONFIG_DISABLED = 0,
        TOMOYO_CONFIG_LEARNING,
        TOMOYO_CONFIG_PERMISSIVE,
        TOMOYO_CONFIG_ENFORCING,
        TOMOYO_CONFIG_MAX_MODE,
        TOMOYO_CONFIG_WANT_REJECT_LOG = 64,
        TOMOYO_CONFIG_WANT_GRANT_LOG = 128,
        TOMOYO_CONFIG_USE_DEFAULT = 255,
};

/*
 * Index numbers for entry type.
 *
 * TOMOYO_MAX_POLICY sizes policy_list[] in "struct tomoyo_policy_namespace".
 */
enum tomoyo_policy_id {
        TOMOYO_ID_GROUP = 0,
        TOMOYO_ID_ADDRESS_GROUP,
        TOMOYO_ID_PATH_GROUP,
        TOMOYO_ID_NUMBER_GROUP,
        TOMOYO_ID_TRANSITION_CONTROL,
        TOMOYO_ID_AGGREGATOR,
        TOMOYO_ID_MANAGER,
        TOMOYO_ID_CONDITION,
        TOMOYO_ID_NAME,
        TOMOYO_ID_ACL,
        TOMOYO_ID_DOMAIN,
        TOMOYO_MAX_POLICY
};

/*
 * Index numbers for domain's attributes.
 *
 * TOMOYO_MAX_DOMAIN_INFO_FLAGS sizes flags[] in "struct tomoyo_domain_info".
 */
enum tomoyo_domain_info_flags_index {
        /* Quota warning flag. */
        TOMOYO_DIF_QUOTA_WARNED = 0,
        /*
         * tomoyo_find_next_domain() could not create a new domain from this
         * domain, either because the name of the domain to be created was
         * too long or because memory could not be allocated.
         * More than one process continued execve() without domain transition.
         */
        TOMOYO_DIF_TRANSITION_FAILED,
        TOMOYO_MAX_DOMAIN_INFO_FLAGS
};

/*
 * Index numbers for audit type.
 *
 * Stored in the grant_log member of "struct tomoyo_condition".
 */
enum tomoyo_grant_log {
        TOMOYO_GRANTLOG_AUTO = 0, /* Follow profile's configuration. */
        TOMOYO_GRANTLOG_NO,       /* Do not generate grant log. */
        TOMOYO_GRANTLOG_YES,      /* Generate grant log. */
};

/*
 * Index numbers for group entries.
 *
 * TOMOYO_MAX_GROUP sizes group_list[] in "struct tomoyo_policy_namespace".
 */
enum tomoyo_group_id {
        TOMOYO_PATH_GROUP = 0,
        TOMOYO_NUMBER_GROUP,
        TOMOYO_ADDRESS_GROUP,
        TOMOYO_MAX_GROUP
};

/*
 * Index numbers for type of numeric values.
 *
 * Stored in value_type[] of "struct tomoyo_number_union".
 */
enum tomoyo_value_type {
        TOMOYO_VALUE_TYPE_INVALID = 0,
        TOMOYO_VALUE_TYPE_DECIMAL,
        TOMOYO_VALUE_TYPE_OCTAL,
        TOMOYO_VALUE_TYPE_HEXADECIMAL,
};

/*
 * Index numbers for domain transition control keywords.
 *
 * Stored in the type member of "struct tomoyo_transition_control".
 */
enum tomoyo_transition_type {
        /* The order below is significant; do not rearrange. */
        TOMOYO_TRANSITION_CONTROL_NO_RESET = 0,
        TOMOYO_TRANSITION_CONTROL_RESET,
        TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE,
        TOMOYO_TRANSITION_CONTROL_INITIALIZE,
        TOMOYO_TRANSITION_CONTROL_NO_KEEP,
        TOMOYO_TRANSITION_CONTROL_KEEP,
        TOMOYO_MAX_TRANSITION_TYPE
};

/*
 * Index numbers for Access Controls.
 *
 * Stored in the type member of "struct tomoyo_acl_info"; each value names the
 * ACL structure that embeds that header.
 */
enum tomoyo_acl_entry_type_index {
        TOMOYO_TYPE_PATH_ACL = 0,
        TOMOYO_TYPE_PATH2_ACL,
        TOMOYO_TYPE_PATH_NUMBER_ACL,
        TOMOYO_TYPE_MKDEV_ACL,
        TOMOYO_TYPE_MOUNT_ACL,
        TOMOYO_TYPE_INET_ACL,
        TOMOYO_TYPE_UNIX_ACL,
        TOMOYO_TYPE_ENV_ACL,
        TOMOYO_TYPE_MANUAL_TASK_ACL,
};

/*
 * Index numbers for access controls with one pathname.
 *
 * The perm member of "struct tomoyo_path_acl" is a bitmask of these values.
 */
enum tomoyo_path_acl_index {
        TOMOYO_TYPE_EXECUTE = 0,
        TOMOYO_TYPE_READ,
        TOMOYO_TYPE_WRITE,
        TOMOYO_TYPE_APPEND,
        TOMOYO_TYPE_UNLINK,
        TOMOYO_TYPE_GETATTR,
        TOMOYO_TYPE_RMDIR,
        TOMOYO_TYPE_TRUNCATE,
        TOMOYO_TYPE_SYMLINK,
        TOMOYO_TYPE_CHROOT,
        TOMOYO_TYPE_UMOUNT,
        TOMOYO_MAX_PATH_OPERATION
};

/*
 * Index numbers for /sys/kernel/security/tomoyo/stat interface
 * (memory counters).
 */
enum tomoyo_memory_stat_type {
        TOMOYO_MEMORY_POLICY = 0,
        TOMOYO_MEMORY_AUDIT,
        TOMOYO_MEMORY_QUERY,
        TOMOYO_MAX_MEMORY_STAT
};

/*
 * Index numbers for device node creation ("file mkblock" and "file mkchar").
 *
 * The perm member of "struct tomoyo_mkdev_acl" is a bitmask of these values.
 */
enum tomoyo_mkdev_acl_index {
        TOMOYO_TYPE_MKBLOCK = 0,
        TOMOYO_TYPE_MKCHAR,
        TOMOYO_MAX_MKDEV_OPERATION
};

/*
 * Index numbers for socket operations.
 *
 * The perm members of "struct tomoyo_inet_acl" and "struct tomoyo_unix_acl"
 * are bitmasks of these values.
 */
enum tomoyo_network_acl_index {
        TOMOYO_NETWORK_BIND = 0, /* bind() operation. */
        TOMOYO_NETWORK_LISTEN,   /* listen() operation. */
        TOMOYO_NETWORK_CONNECT,  /* connect() operation. */
        TOMOYO_NETWORK_SEND,     /* send() operation. */
        TOMOYO_MAX_NETWORK_OPERATION
};

/*
 * Index numbers for access controls with two pathnames.
 *
 * The perm member of "struct tomoyo_path2_acl" is a bitmask of these values.
 */
enum tomoyo_path2_acl_index {
        TOMOYO_TYPE_LINK = 0,
        TOMOYO_TYPE_RENAME,
        TOMOYO_TYPE_PIVOT_ROOT,
        TOMOYO_MAX_PATH2_OPERATION
};

/*
 * Index numbers for access controls with one pathname and one number.
 *
 * The perm member of "struct tomoyo_path_number_acl" is a bitmask of these
 * values.
 */
enum tomoyo_path_number_acl_index {
        TOMOYO_TYPE_CREATE = 0,
        TOMOYO_TYPE_MKDIR,
        TOMOYO_TYPE_MKFIFO,
        TOMOYO_TYPE_MKSOCK,
        TOMOYO_TYPE_IOCTL,
        TOMOYO_TYPE_CHMOD,
        TOMOYO_TYPE_CHOWN,
        TOMOYO_TYPE_CHGRP,
        TOMOYO_MAX_PATH_NUMBER_OPERATION
};

/*
 * Index numbers for /sys/kernel/security/tomoyo/ interfaces.
 *
 * Used as the type member of "struct tomoyo_io_buffer".
 */
enum tomoyo_securityfs_interface_index {
        TOMOYO_DOMAINPOLICY = 0,
        TOMOYO_EXCEPTIONPOLICY,
        TOMOYO_PROCESS_STATUS,
        TOMOYO_STAT,
        TOMOYO_AUDIT,
        TOMOYO_VERSION,
        TOMOYO_PROFILE,
        TOMOYO_QUERY,
        TOMOYO_MANAGER
};

/*
 * Index numbers for special mount operations; each entry shows the mount(8)
 * invocation it stands for.
 */
enum tomoyo_special_mount {
        TOMOYO_MOUNT_BIND = 0,        /* mount --bind /source /dest   */
        TOMOYO_MOUNT_MOVE,            /* mount --move /old /new       */
        TOMOYO_MOUNT_REMOUNT,         /* mount -o remount /dir        */
        TOMOYO_MOUNT_MAKE_UNBINDABLE, /* mount --make-unbindable /dir */
        TOMOYO_MOUNT_MAKE_PRIVATE,    /* mount --make-private /dir    */
        TOMOYO_MOUNT_MAKE_SLAVE,      /* mount --make-slave /dir      */
        TOMOYO_MOUNT_MAKE_SHARED,     /* mount --make-shared /dir     */
        TOMOYO_MAX_SPECIAL_MOUNT
};

/*
 * Index numbers for functionality.
 *
 * TOMOYO_MAX_MAC_INDEX contributes to the size of config[] in
 * "struct tomoyo_profile". Values are assigned by position.
 */
enum tomoyo_mac_index {
        /* File operations. */
        TOMOYO_MAC_FILE_EXECUTE = 0,
        TOMOYO_MAC_FILE_OPEN,
        TOMOYO_MAC_FILE_CREATE,
        TOMOYO_MAC_FILE_UNLINK,
        TOMOYO_MAC_FILE_GETATTR,
        TOMOYO_MAC_FILE_MKDIR,
        TOMOYO_MAC_FILE_RMDIR,
        TOMOYO_MAC_FILE_MKFIFO,
        TOMOYO_MAC_FILE_MKSOCK,
        TOMOYO_MAC_FILE_TRUNCATE,
        TOMOYO_MAC_FILE_SYMLINK,
        TOMOYO_MAC_FILE_MKBLOCK,
        TOMOYO_MAC_FILE_MKCHAR,
        TOMOYO_MAC_FILE_LINK,
        TOMOYO_MAC_FILE_RENAME,
        TOMOYO_MAC_FILE_CHMOD,
        TOMOYO_MAC_FILE_CHOWN,
        TOMOYO_MAC_FILE_CHGRP,
        TOMOYO_MAC_FILE_IOCTL,
        TOMOYO_MAC_FILE_CHROOT,
        TOMOYO_MAC_FILE_MOUNT,
        TOMOYO_MAC_FILE_UMOUNT,
        TOMOYO_MAC_FILE_PIVOT_ROOT,
        /* Internet domain socket operations. */
        TOMOYO_MAC_NETWORK_INET_STREAM_BIND,
        TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN,
        TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT,
        TOMOYO_MAC_NETWORK_INET_DGRAM_BIND,
        TOMOYO_MAC_NETWORK_INET_DGRAM_SEND,
        TOMOYO_MAC_NETWORK_INET_RAW_BIND,
        TOMOYO_MAC_NETWORK_INET_RAW_SEND,
        /* Unix domain socket operations. */
        TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN,
        TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT,
        TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND,
        TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN,
        TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT,
        /* Environment variables. */
        TOMOYO_MAC_ENVIRON,
        TOMOYO_MAX_MAC_INDEX
};

/*
 * Index numbers for category of functionality.
 *
 * TOMOYO_MAX_MAC_CATEGORY_INDEX contributes to the size of config[] in
 * "struct tomoyo_profile".
 */
enum tomoyo_mac_category_index {
        TOMOYO_MAC_CATEGORY_FILE = 0,
        TOMOYO_MAC_CATEGORY_NETWORK,
        TOMOYO_MAC_CATEGORY_MISC,
        TOMOYO_MAX_MAC_CATEGORY_INDEX
};

/*
 * Retry this request. Returned by tomoyo_supervisor() if a policy violation
 * has occurred in enforcing mode and the userspace daemon decided to retry.
 *
 * The value must be positive so that "retry" can be told apart from both
 * "granted" (which is 0) and "rejected" (which is a negative value).
 */
#define TOMOYO_RETRY_REQUEST 1

/*
 * Index numbers for /sys/kernel/security/tomoyo/stat interface
 * (policy counters).
 */
enum tomoyo_policy_stat_type {
        /*
         * The order is significant; do not rearrange. The three per-mode
         * entries share their numeric values with "enum tomoyo_mode_index".
         */
        TOMOYO_STAT_POLICY_UPDATES = 0,
        TOMOYO_STAT_POLICY_LEARNING,   /* == TOMOYO_CONFIG_LEARNING */
        TOMOYO_STAT_POLICY_PERMISSIVE, /* == TOMOYO_CONFIG_PERMISSIVE */
        TOMOYO_STAT_POLICY_ENFORCING,  /* == TOMOYO_CONFIG_ENFORCING */
        TOMOYO_MAX_POLICY_STAT
};

/*
 * Index numbers for profile's PREFERENCE values.
 *
 * TOMOYO_MAX_PREF sizes pref[] in "struct tomoyo_profile".
 */
enum tomoyo_pref_index {
        TOMOYO_PREF_MAX_AUDIT_LOG = 0,
        TOMOYO_PREF_MAX_LEARNING_ENTRY,
        TOMOYO_MAX_PREF
};

/********** Structure definitions. **********/

/*
 * Common header for holding ACL entries.
 *
 * Embedded as the leading "head" member of list-element structures such as
 * "struct tomoyo_path_group" and "struct tomoyo_manager".
 */
struct tomoyo_acl_head {
        struct list_head list; /* Linkage into the list holding this entry. */
        s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
} __packed;

/*
 * Common header for shared entries.
 *
 * Embedded as the leading "head" member of "struct tomoyo_name",
 * "struct tomoyo_group" and "struct tomoyo_condition".
 */
struct tomoyo_shared_acl_head {
        struct list_head list; /* Linkage into the list holding this entry. */
        /* NOTE(review): presumably the number of references - confirm. */
        atomic_t users;
} __packed;

/*
 * Forward declaration: structures below hold pointers to the namespace before
 * its full definition appears later in this header.
 */
struct tomoyo_policy_namespace;

/*
 * Structure for request info.
 *
 * Carries one access request: the object and execve() context it refers to,
 * the operation-specific parameters (one member of @param per kind of
 * request) and a set of small status/result fields.
 */
struct tomoyo_request_info {
        /*
         * For holding parameters specific to operations which deal with
         * files. NULL if not dealing with files.
         */
        struct tomoyo_obj_info *obj;
        /*
         * For holding parameters specific to execve() request.
         * NULL if not dealing with execve().
         */
        struct tomoyo_execve *ee;
        /* NOTE(review): presumably the domain being checked - confirm. */
        struct tomoyo_domain_info *domain;
        /*
         * For holding parameters.
         * NOTE(review): the valid member is presumably selected by
         * @param_type - confirm against the code filling this structure.
         */
        union {
                /* Request with one pathname. */
                struct {
                        const struct tomoyo_path_info *filename;
                        /* For using wildcards at tomoyo_find_next_domain(). */
                        const struct tomoyo_path_info *matched_path;
                        /* One of values in "enum tomoyo_path_acl_index". */
                        u8 operation;
                } path;
                /* Request with two pathnames. */
                struct {
                        const struct tomoyo_path_info *filename1;
                        const struct tomoyo_path_info *filename2;
                        /* One of values in "enum tomoyo_path2_acl_index". */
                        u8 operation;
                } path2;
                /* Request creating a device node. */
                struct {
                        const struct tomoyo_path_info *filename;
                        unsigned int mode;
                        unsigned int major;
                        unsigned int minor;
                        /* One of values in "enum tomoyo_mkdev_acl_index". */
                        u8 operation;
                } mkdev;
                /* Request with one pathname and one number. */
                struct {
                        const struct tomoyo_path_info *filename;
                        unsigned long number;
                        /*
                         * One of values in
                         * "enum tomoyo_path_number_acl_index".
                         */
                        u8 operation;
                } path_number;
                /* Request naming an environment variable. */
                struct {
                        const struct tomoyo_path_info *name;
                } environ;
                /* Request on an Internet domain socket. */
                struct {
                        const __be32 *address;
                        u16 port;
                        /* One of values smaller than TOMOYO_SOCK_MAX. */
                        u8 protocol;
                        /* One of values in "enum tomoyo_network_acl_index". */
                        u8 operation;
                        bool is_ipv6;
                } inet_network;
                /* Request on a Unix domain socket. */
                struct {
                        const struct tomoyo_path_info *address;
                        /* One of values smaller than TOMOYO_SOCK_MAX. */
                        u8 protocol;
                        /* One of values in "enum tomoyo_network_acl_index". */
                        u8 operation;
                } unix_network;
                /* Mount request. */
                struct {
                        const struct tomoyo_path_info *type;
                        const struct tomoyo_path_info *dir;
                        const struct tomoyo_path_info *dev;
                        unsigned long flags;
                        int need_dev;
                } mount;
                /* Request naming a domain. */
                struct {
                        const struct tomoyo_path_info *domainname;
                } task;
        } param;
        /* NOTE(review): presumably the ACL entry that matched - confirm. */
        struct tomoyo_acl_info *matched_acl;
        u8 param_type;
        bool granted;
        u8 retry;
        u8 profile;
        u8 mode; /* One of values in "enum tomoyo_mode_index". */
        u8 type;
};

/*
 * Structure for holding a token.
 *
 * Besides the string itself, the members cache values derived from it; the
 * expression each one equals is given next to the member.
 */
struct tomoyo_path_info {
        const char *name;
        u32 hash;          /* = full_name_hash(name, strlen(name)) */
        u16 const_len;     /* = tomoyo_const_part_length(name)     */
        bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
        bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
};

/* Structure for holding string data: a shared header plus the token. */
struct tomoyo_name {
        struct tomoyo_shared_acl_head head;
        struct tomoyo_path_info entry; /* The string and its cached values. */
};

/*
 * Structure for holding a word: either a single name or a reference to a
 * group.
 */
struct tomoyo_name_union {
        /* Either @filename or @group is NULL. */
        const struct tomoyo_path_info *filename;
        struct tomoyo_group *group;
};

/* Structure for holding a number. */
struct tomoyo_number_union {
        /*
         * NOTE(review): two values, presumably the lower and upper bound of a
         * range - confirm.
         */
        unsigned long values[2];
        struct tomoyo_group *group; /* Maybe NULL. */
        /* One of values in "enum tomoyo_value_type", one per @values entry. */
        u8 value_type[2];
};

/* Structure for holding an IP address. */
struct tomoyo_ipaddr_union {
        /*
         * Big endian.
         * NOTE(review): two addresses, presumably the bounds of a range -
         * confirm.
         */
        struct in6_addr ip[2];
        struct tomoyo_group *group; /* Pointer to address group. */
        bool is_ipv6; /* Valid only if @group == NULL. */
};

/* Structure for "path_group"/"number_group"/"address_group" directive. */
struct tomoyo_group {
        struct tomoyo_shared_acl_head head;
        const struct tomoyo_path_info *group_name; /* Name of this group. */
        /*
         * NOTE(review): presumably a list of "struct tomoyo_path_group",
         * "struct tomoyo_number_group" or "struct tomoyo_address_group"
         * entries - confirm.
         */
        struct list_head member_list;
};

/* Structure for "path_group" directive: one member name of a group. */
struct tomoyo_path_group {
        struct tomoyo_acl_head head;
        const struct tomoyo_path_info *member_name;
};

/* Structure for "number_group" directive: one number entry of a group. */
struct tomoyo_number_group {
        struct tomoyo_acl_head head;
        struct tomoyo_number_union number;
};

/* Structure for "address_group" directive: one address entry of a group. */
struct tomoyo_address_group {
        struct tomoyo_acl_head head;
        /* Structure for holding an IP address. */
        struct tomoyo_ipaddr_union address;
};

/*
 * Subset of "struct stat". Used by conditional ACL and audit logs.
 *
 * "struct tomoyo_obj_info" keeps one of these per
 * "enum tomoyo_path_stat_index" entry.
 */
struct tomoyo_mini_stat {
        kuid_t uid;
        kgid_t gid;
        ino_t ino;
        umode_t mode;
        dev_t dev;
        dev_t rdev;
};

/* Structure for dumping argv[] and envp[] of "struct linux_binprm". */
struct tomoyo_page_dump {
        struct page *page;    /* Previously dumped page. */
        char *data;           /* Contents of "page". Size is PAGE_SIZE. */
};

/*
 * Structure for attribute checks in addition to pathname checks.
 *
 * @stat[] and @stat_valid[] are both sized by TOMOYO_MAX_PATH_STAT, i.e. they
 * have one slot per "enum tomoyo_path_stat_index" value.
 */
struct tomoyo_obj_info {
        /*
         * True if tomoyo_get_attributes() was already called, false otherwise.
         */
        bool validate_done;
        /* True if the corresponding @stat[] entry is valid. */
        bool stat_valid[TOMOYO_MAX_PATH_STAT];
        /* First pathname. Initialized with { NULL, NULL } if no path. */
        struct path path1;
        /* Second pathname. Initialized with { NULL, NULL } if no path. */
        struct path path2;
        /*
         * Information on @path1, @path1's parent directory, @path2, @path2's
         * parent directory.
         */
        struct tomoyo_mini_stat stat[TOMOYO_MAX_PATH_STAT];
        /*
         * Content of symbolic link to be created. NULL for operations other
         * than symlink().
         */
        struct tomoyo_path_info *symlink_target;
};

/*
 * Structure for argv[].
 *
 * An array of these follows "struct tomoyo_condition" (see its layout
 * comment).
 */
struct tomoyo_argv {
        unsigned long index; /* NOTE(review): presumably the argv[] index. */
        const struct tomoyo_path_info *value;
        bool is_not; /* NOTE(review): presumably negates the match - confirm. */
};

/*
 * Structure for envp[].
 *
 * An array of these follows "struct tomoyo_condition" (see its layout
 * comment).
 */
struct tomoyo_envp {
        const struct tomoyo_path_info *name;
        const struct tomoyo_path_info *value;
        bool is_not; /* NOTE(review): presumably negates the match - confirm. */
};

/*
 * Structure for execve() operation.
 *
 * Bundles the request info, the object info and the scratch buffers used
 * while one execve() request is examined.
 */
struct tomoyo_execve {
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj;
        struct linux_binprm *bprm;
        const struct tomoyo_path_info *transition;
        /* For dumping argv[] and envp[]. */
        struct tomoyo_page_dump dump;
        /* For temporary use. */
        char *tmp; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
};

/*
 * Structure for entries which follow "struct tomoyo_condition".
 *
 * Each element describes one "left operator right" comparison. @left and
 * @right are u8 fields; the comments below name the
 * "enum tomoyo_conditions_index" values that imply attached operand data.
 */
struct tomoyo_condition_element {
        /*
         * Left hand operand. A "struct tomoyo_argv" for TOMOYO_ARGV_ENTRY, a
         * "struct tomoyo_envp" for TOMOYO_ENVP_ENTRY is attached to the tail
         * of the array of this struct.
         */
        u8 left;
        /*
         * Right hand operand. A "struct tomoyo_number_union" for
         * TOMOYO_NUMBER_UNION, a "struct tomoyo_name_union" for
         * TOMOYO_NAME_UNION is attached to the tail of the array of this
         * struct.
         */
        u8 right;
        /* Equation operator. True if equals or overlaps, false otherwise. */
        bool equals;
};

/*
 * Structure for optional arguments.
 *
 * This is a variable-sized object: the fixed part below is followed in memory
 * by the arrays listed in the trailing comment, whose lengths are given by
 * the count members.
 */
struct tomoyo_condition {
        struct tomoyo_shared_acl_head head;
        u32 size; /* Memory size allocated for this entry. */
        u16 condc; /* Number of conditions in this struct. */
        u16 numbers_count; /* Number of "struct tomoyo_number_union" values. */
        u16 names_count; /* Number of "struct tomoyo_name_union" names. */
        u16 argc; /* Number of "struct tomoyo_argv". */
        u16 envc; /* Number of "struct tomoyo_envp". */
        u8 grant_log; /* One of values in "enum tomoyo_grant_log". */
        const struct tomoyo_path_info *transit; /* Maybe NULL. */
        /*
         * Trailing data, in this order:
         *
         * struct tomoyo_condition_element condition[condc];
         * struct tomoyo_number_union values[numbers_count];
         * struct tomoyo_name_union names[names_count];
         * struct tomoyo_argv argv[argc];
         * struct tomoyo_envp envp[envc];
         */
};

/*
 * Common header for individual entries.
 *
 * Embedded as the leading "head" member of every tomoyo_*_acl structure; the
 * @type value tells which one.
 */
struct tomoyo_acl_info {
        struct list_head list; /* Linkage into the list holding this entry. */
        struct tomoyo_condition *cond; /* Maybe NULL. */
        s8 is_deleted; /* true or false or TOMOYO_GC_IN_PROGRESS */
        u8 type; /* One of values in "enum tomoyo_acl_entry_type_index". */
} __packed;

/* Structure for domain information. */
struct tomoyo_domain_info {
        struct list_head list; /* Linkage into a list of domains. */
        /*
         * NOTE(review): presumably the list of "struct tomoyo_acl_info"
         * entries of this domain - confirm.
         */
        struct list_head acl_info_list;
        /* Name of this domain. Never NULL.          */
        const struct tomoyo_path_info *domainname;
        /* Namespace for this domain. Never NULL. */
        struct tomoyo_policy_namespace *ns;
        /*
         * Group numbers to use.
         * NOTE(review): sized as TOMOYO_MAX_ACL_GROUPS bits, so presumably a
         * bitmap with one bit per group number - confirm.
         */
        unsigned long group[TOMOYO_MAX_ACL_GROUPS / BITS_PER_LONG];
        u8 profile;        /* Profile number to use. */
        bool is_deleted;   /* Delete flag.           */
        /* One flag per "enum tomoyo_domain_info_flags_index" value. */
        bool flags[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
        atomic_t users; /* Number of referring tasks. */
};

/*
 * Structure for "task manual_domain_transition" directive.
 */
struct tomoyo_task_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MANUAL_TASK_ACL */
        /* Pointer to domainname. */
        const struct tomoyo_path_info *domainname;
};

/*
 * Structure for "file execute", "file read", "file write", "file append",
 * "file unlink", "file getattr", "file rmdir", "file truncate",
 * "file symlink", "file chroot" and "file unmount" directive.
 */
struct tomoyo_path_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_ACL */
        /*
         * Bitmask of values in "enum tomoyo_path_acl_index". That enum has
         * more than eight operations, hence the 16 bit field.
         */
        u16 perm;
        struct tomoyo_name_union name; /* The pathname (or path group). */
};

/*
 * Structure for "file create", "file mkdir", "file mkfifo", "file mksock",
 * "file ioctl", "file chmod", "file chown" and "file chgrp" directive.
 */
struct tomoyo_path_number_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH_NUMBER_ACL */
        /* Bitmask of values in "enum tomoyo_path_number_acl_index". */
        u8 perm;
        struct tomoyo_name_union name;     /* The pathname (or path group). */
        struct tomoyo_number_union number; /* The number (or number group). */
};

/* Structure for "file mkblock" and "file mkchar" directive. */
struct tomoyo_mkdev_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MKDEV_ACL */
        u8 perm; /* Bitmask of values in "enum tomoyo_mkdev_acl_index". */
        struct tomoyo_name_union name;    /* The pathname (or path group). */
        struct tomoyo_number_union mode;  /* Number or number group. */
        struct tomoyo_number_union major; /* Number or number group. */
        struct tomoyo_number_union minor; /* Number or number group. */
};

/*
 * Structure for "file rename", "file link" and "file pivot_root" directive.
 */
struct tomoyo_path2_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_PATH2_ACL */
        u8 perm; /* Bitmask of values in "enum tomoyo_path2_acl_index". */
        struct tomoyo_name_union name1; /* First pathname (or path group). */
        struct tomoyo_name_union name2; /* Second pathname (or path group). */
};

/*
 * Structure for "file mount" directive.
 *
 * Unlike the other file ACL structures it has no perm bitmask.
 */
struct tomoyo_mount_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_MOUNT_ACL */
        struct tomoyo_name_union dev_name;
        struct tomoyo_name_union dir_name;
        struct tomoyo_name_union fs_type;
        struct tomoyo_number_union flags;
};

/* Structure for "misc env" directive in domain policy. */
struct tomoyo_env_acl {
        struct tomoyo_acl_info head;        /* type = TOMOYO_TYPE_ENV_ACL  */
        const struct tomoyo_path_info *env; /* environment variable */
};

/* Structure for "network inet" directive. */
struct tomoyo_inet_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_INET_ACL */
        /* NOTE(review): presumably a value smaller than TOMOYO_SOCK_MAX. */
        u8 protocol;
        u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
        struct tomoyo_ipaddr_union address; /* Address or address group. */
        struct tomoyo_number_union port;    /* Number or number group. */
};

/* Structure for "network unix" directive. */
struct tomoyo_unix_acl {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_UNIX_ACL */
        /* NOTE(review): presumably a value smaller than TOMOYO_SOCK_MAX. */
        u8 protocol;
        u8 perm; /* Bitmask of values in "enum tomoyo_network_acl_index" */
        struct tomoyo_name_union name; /* Name or path group. */
};

/* Structure for holding a line from /sys/kernel/security/tomoyo/ interface. */
struct tomoyo_acl_param {
        char *data; /* NOTE(review): presumably the text of the line. */
        struct list_head *list; /* NOTE(review): presumably the target list. */
        struct tomoyo_policy_namespace *ns;
        bool is_delete; /* NOTE(review): presumably a delete request flag. */
};

#define TOMOYO_MAX_IO_READ_QUEUE 64

/*
 * Structure for reading/writing policy via /sys/kernel/security/tomoyo
 * interfaces.
 *
 * Holds the handlers for one interface, the state of the read side (@r), the
 * state of the write side (@w) and the two data buffers.
 */
struct tomoyo_io_buffer {
        /* Handlers for this interface. */
        void (*read)(struct tomoyo_io_buffer *head);
        int (*write)(struct tomoyo_io_buffer *head);
        __poll_t (*poll)(struct file *file, poll_table *wait);
        /* Exclusive lock for this structure.   */
        struct mutex io_sem;
        /* Userspace destination buffer and the space available in it. */
        char __user *read_user_buf;
        size_t read_user_buf_avail;
        /*
         * Read-side state.
         * NOTE(review): the list pointers and index/step members presumably
         * record how far a previous read got - confirm against the readers.
         */
        struct {
                struct list_head *ns;
                struct list_head *domain;
                struct list_head *group;
                struct list_head *acl;
                size_t avail;
                unsigned int step;
                unsigned int query_index;
                u16 index;
                u16 cond_index;
                u8 acl_group_index;
                u8 cond_step;
                u8 bit;
                u8 w_pos;
                bool eof;
                bool print_this_domain_only;
                bool print_transition_related_only;
                bool print_cond_part;
                /* Up to TOMOYO_MAX_IO_READ_QUEUE string pointers. */
                const char *w[TOMOYO_MAX_IO_READ_QUEUE];
        } r;
        /* Write-side state. */
        struct {
                struct tomoyo_policy_namespace *ns;
                /* The position currently writing to.   */
                struct tomoyo_domain_info *domain;
                /* Bytes available for writing.         */
                size_t avail;
                bool is_delete;
        } w;
        /* Buffer for reading.                  */
        char *read_buf;
        /* Size of read buffer.                 */
        size_t readbuf_size;
        /* Buffer for writing.                  */
        char *write_buf;
        /* Size of write buffer.                */
        size_t writebuf_size;
        /* Type of this interface.              */
        enum tomoyo_securityfs_interface_index type;
        /* Users counter protected by tomoyo_io_buffer_list_lock. */
        u8 users;
        /* List for telling GC not to kfree() elements. */
        struct list_head list;
};

/*
 * Structure for "initialize_domain"/"no_initialize_domain"/"keep_domain"/
 * "no_keep_domain" keyword.
 */
struct tomoyo_transition_control {
        struct tomoyo_acl_head head;
        u8 type; /* One of values in "enum tomoyo_transition_type".  */
        /* True if the domainname is tomoyo_get_last_name(). */
        bool is_last_name;
        const struct tomoyo_path_info *domainname; /* Maybe NULL */
        const struct tomoyo_path_info *program;    /* Maybe NULL */
};

/*
 * Structure for "aggregator" keyword.
 *
 * Pairs two names; judging by the member names, original_name is the name
 * to be replaced and aggregated_name is its replacement - TODO confirm
 * against tomoyo_write_aggregator().
 */
struct tomoyo_aggregator {
        /* Embedded list/bookkeeping header shared with other policy entries. */
        struct tomoyo_acl_head head;
        const struct tomoyo_path_info *original_name;
        const struct tomoyo_path_info *aggregated_name;
};

/*
 * Structure for policy manager.
 *
 * One entry names a single manager, identified either by program path or by
 * domainname (see the comment on @manager below).
 */
struct tomoyo_manager {
        /* Embedded list/bookkeeping header shared with other policy entries. */
        struct tomoyo_acl_head head;
        /* A path to program or a domainname. */
        const struct tomoyo_path_info *manager;
};

/*
 * Preference settings used by "struct tomoyo_profile", which both embeds one
 * instance (preference) and keeps per-mode pointers (learning, permissive,
 * enforcing) to this type.
 */
struct tomoyo_preference {
        /* NOTE(review): presumably a cap on entries added in learning mode - confirm. */
        unsigned int learning_max_entry;
        /* Verbosity flags, one per mode (enforcing / learning / permissive). */
        bool enforcing_verbose;
        bool learning_verbose;
        bool permissive_verbose;
};

/*
 * Structure for /sys/kernel/security/tomoyo/profile interface.
 *
 * Instances are referenced from "struct tomoyo_policy_namespace"->profile_ptr
 * and looked up through tomoyo_profile().
 */
struct tomoyo_profile {
        const struct tomoyo_path_info *comment;
        /* Per-mode preference pointers. */
        struct tomoyo_preference *learning;
        struct tomoyo_preference *permissive;
        struct tomoyo_preference *enforcing;
        /* Preference storage embedded in the profile itself. */
        struct tomoyo_preference preference;
        u8 default_config;
        /*
         * Same dimension as tomoyo_mac_keywords[]: one slot per MAC index
         * followed by one slot per MAC category index.
         */
        u8 config[TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX];
        unsigned int pref[TOMOYO_MAX_PREF];
};

/*
 * Structure for representing YYYY/MM/DD hh/mm/ss.
 *
 * This is the output type of tomoyo_convert_time(), which takes a time64_t
 * and a pointer to this structure.
 */
struct tomoyo_time {
        u16 year;
        u8 month;
        u8 day;
        u8 hour;
        u8 min;
        u8 sec;
};

/*
 * Structure for policy namespace.
 *
 * tomoyo_kernel_namespace is one statically declared instance; every
 * namespace is chained through @namespace_list.
 */
struct tomoyo_policy_namespace {
        /* Profile table. Memory is allocated as needed. */
        struct tomoyo_profile *profile_ptr[TOMOYO_MAX_PROFILES];
        /* List of "struct tomoyo_group". */
        struct list_head group_list[TOMOYO_MAX_GROUP];
        /* List of policy. */
        struct list_head policy_list[TOMOYO_MAX_POLICY];
        /* The global ACL referred by "use_group" keyword. */
        struct list_head acl_group[TOMOYO_MAX_ACL_GROUPS];
        /* List for connecting to tomoyo_namespace_list list. */
        struct list_head namespace_list;
        /* Profile version. Currently only 20150505 is defined. */
        unsigned int profile_version;
        /* Name of this namespace (e.g. "<kernel>", "</usr/sbin/httpd>" ). */
        const char *name;
};

/* Structure for "struct task_struct"->security. */
struct tomoyo_task {
        struct tomoyo_domain_info *domain_info;
        struct tomoyo_domain_info *old_domain_info;
};

/********** Function prototypes. **********/

/*
 * Declarations only. The list is grouped (roughly) by return type; a few
 * entries (tomoyo_close_control() and the __poll_t handlers) sit inside the
 * int group.
 */

/* Functions returning bool. */
bool tomoyo_address_matches_group(const bool is_ipv6, const __be32 *address,
                                  const struct tomoyo_group *group);
bool tomoyo_compare_number_union(const unsigned long value,
                                 const struct tomoyo_number_union *ptr);
bool tomoyo_condition(struct tomoyo_request_info *r,
                      const struct tomoyo_condition *cond);
bool tomoyo_correct_domain(const unsigned char *domainname);
bool tomoyo_correct_path(const char *filename);
bool tomoyo_correct_word(const char *string);
bool tomoyo_domain_def(const unsigned char *buffer);
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r);
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump);
bool tomoyo_memory_ok(void *ptr);
bool tomoyo_number_matches_group(const unsigned long min,
                                 const unsigned long max,
                                 const struct tomoyo_group *group);
bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param,
                               struct tomoyo_ipaddr_union *ptr);
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr);
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
                               struct tomoyo_number_union *ptr);
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
                                 const struct tomoyo_path_info *pattern);
bool tomoyo_permstr(const char *string, const char *keyword);
bool tomoyo_str_starts(char **src, const char *find);
/* Functions returning char * or const char *. */
char *tomoyo_encode(const char *str);
char *tomoyo_encode2(const char *str, int str_len);
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
                      va_list args) __printf(3, 0);
char *tomoyo_read_token(struct tomoyo_acl_param *param);
char *tomoyo_realpath_from_path(const struct path *path);
char *tomoyo_realpath_nofollow(const char *pathname);
const char *tomoyo_get_exe(void);
/* Functions returning a const "struct tomoyo_path_info" pointer. */
const struct tomoyo_path_info *tomoyo_compare_name_union
(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr);
const struct tomoyo_path_info *tomoyo_get_domainname
(struct tomoyo_acl_param *param);
const struct tomoyo_path_info *tomoyo_get_name(const char *name);
const struct tomoyo_path_info *tomoyo_path_matches_group
(const struct tomoyo_path_info *pathname, const struct tomoyo_group *group);
/* Functions returning int, interleaved with a few void/__poll_t entries. */
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 const struct path *path, const int flag);
void tomoyo_close_control(struct tomoyo_io_buffer *head);
int tomoyo_env_perm(struct tomoyo_request_info *r, const char *env);
int tomoyo_execute_permission(struct tomoyo_request_info *r,
                              const struct tomoyo_path_info *filename);
int tomoyo_find_next_domain(struct linux_binprm *bprm);
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
                    const u8 index);
int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain,
                             const u8 index);
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
                      const unsigned int mode, unsigned int dev);
int tomoyo_mount_permission(const char *dev_name, const struct path *path,
                            const char *type, unsigned long flags,
                            void *data_page);
int tomoyo_open_control(const u8 type, struct file *file);
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
                      const struct path *path2);
int tomoyo_path_number_perm(const u8 operation, const struct path *path,
                            unsigned long number);
int tomoyo_path_perm(const u8 operation, const struct path *path,
                     const char *target);
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait);
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait);
int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr,
                                  int addr_len);
int tomoyo_socket_connect_permission(struct socket *sock,
                                     struct sockaddr *addr, int addr_len);
int tomoyo_socket_listen_permission(struct socket *sock);
int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg,
                                     int size);
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
        __printf(2, 3);
/*
 * The two update helpers take the new entry, its size, the parsed parameter
 * and caller-supplied callbacks for duplicate detection (and, for domain
 * ACLs, merging).
 */
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)
                         (const struct tomoyo_acl_info *,
                          const struct tomoyo_acl_info *),
                         bool (*merge_duplicate)
                         (struct tomoyo_acl_info *, struct tomoyo_acl_info *,
                          const bool));
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)
                         (const struct tomoyo_acl_head *,
                          const struct tomoyo_acl_head *));
int tomoyo_write_aggregator(struct tomoyo_acl_param *param);
int tomoyo_write_file(struct tomoyo_acl_param *param);
int tomoyo_write_group(struct tomoyo_acl_param *param, const u8 type);
int tomoyo_write_misc(struct tomoyo_acl_param *param);
int tomoyo_write_inet_network(struct tomoyo_acl_param *param);
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
                                    const u8 type);
int tomoyo_write_unix_network(struct tomoyo_acl_param *param);
/* Functions returning ssize_t; both take a __user buffer and its length. */
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
                            const int buffer_len);
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
                             const char __user *buffer, const int buffer_len);
/* Functions returning a pointer to a TOMOYO structure. */
struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param);
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit);
struct tomoyo_domain_info *tomoyo_domain(void);
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
struct tomoyo_group *tomoyo_get_group(struct tomoyo_acl_param *param,
                                      const u8 idx);
struct tomoyo_policy_namespace *tomoyo_assign_namespace
(const char *domainname);
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
                                      const u8 profile);
/* Other return types (u8, void *). */
u8 tomoyo_parse_ulong(unsigned long *result, char **str);
void *tomoyo_commit_ok(void *data, const unsigned int size);
/* Functions returning void; the first two are __init-only. */
void __init tomoyo_load_builtin_policy(void);
void __init tomoyo_mm_init(void);
void tomoyo_check_acl(struct tomoyo_request_info *r,
                      bool (*check_entry)(struct tomoyo_request_info *,
                                          const struct tomoyo_acl_info *));
void tomoyo_check_profile(void);
void tomoyo_convert_time(time64_t time, struct tomoyo_time *stamp);
void tomoyo_del_condition(struct list_head *element);
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr);
void tomoyo_get_attributes(struct tomoyo_obj_info *obj);
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns);
void tomoyo_load_policy(const char *filename);
void tomoyo_normalize_line(unsigned char *buffer);
void tomoyo_notify_gc(struct tomoyo_io_buffer *head, const bool is_register);
void tomoyo_print_ip(char *buf, const unsigned int size,
                     const struct tomoyo_ipaddr_union *ptr);
void tomoyo_print_ulong(char *buffer, const int buffer_len,
                        const unsigned long value, const u8 type);
void tomoyo_put_name_union(struct tomoyo_name_union *ptr);
void tomoyo_put_number_union(struct tomoyo_number_union *ptr);
void tomoyo_read_log(struct tomoyo_io_buffer *head);
void tomoyo_update_stat(const u8 index);
void tomoyo_warn_oom(const char *function);
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
        __printf(2, 3);
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
                       va_list args) __printf(3, 0);

/********** External variable definitions. **********/

/* Declarations only; the objects themselves are defined elsewhere. */

extern bool tomoyo_policy_loaded;
extern int tomoyo_enabled;
/* Read-only keyword tables, each sized by the matching TOMOYO_MAX_* constant. */
extern const char * const tomoyo_condition_keyword
[TOMOYO_MAX_CONDITION_KEYWORD];
extern const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS];
extern const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
                                              + TOMOYO_MAX_MAC_CATEGORY_INDEX];
extern const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE];
extern const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION];
extern const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX];
extern const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION];
/* Read-only u8 lookup tables indexed by MAC index / operation number. */
extern const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX];
extern const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION];
extern const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION];
extern const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION];
/* Global list heads. */
extern struct list_head tomoyo_condition_list;
extern struct list_head tomoyo_domain_list;
extern struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
extern struct list_head tomoyo_namespace_list;
extern struct mutex tomoyo_policy_lock;
/* SRCU domain used by tomoyo_read_lock()/tomoyo_read_unlock() below. */
extern struct srcu_struct tomoyo_ss;
extern struct tomoyo_domain_info tomoyo_kernel_domain;
extern struct tomoyo_policy_namespace tomoyo_kernel_namespace;
extern unsigned int tomoyo_memory_quota[TOMOYO_MAX_MEMORY_STAT];
extern unsigned int tomoyo_memory_used[TOMOYO_MAX_MEMORY_STAT];
/* Its lbs_task member is the offset tomoyo_task() applies to task->security. */
extern struct lsm_blob_sizes tomoyo_blob_sizes;

/********** Inlined functions. **********/

/**
 * tomoyo_read_lock - Take lock for protecting policy.
 *
 * Thin wrapper that calls srcu_read_lock() on tomoyo_ss. The value it
 * returns must be handed back to tomoyo_read_unlock().
 *
 * Returns index number for tomoyo_read_unlock().
 */
static inline int tomoyo_read_lock(void)
{
        return srcu_read_lock(&tomoyo_ss);
}

/**
 * tomoyo_read_unlock - Release lock for protecting policy.
 *
 * @idx: Index number returned by tomoyo_read_lock().
 *
 * Thin wrapper that calls srcu_read_unlock() on tomoyo_ss with @idx.
 *
 * Returns nothing.
 */
static inline void tomoyo_read_unlock(int idx)
{
        srcu_read_unlock(&tomoyo_ss, idx);
}

/**
 * tomoyo_sys_getppid - Copy of getppid().
 *
 * Looks up current->real_parent under rcu_read_lock() and converts it with
 * task_tgid_vnr().
 *
 * Returns parent process's PID.
 *
 * Alpha does not have getppid() defined. To be able to build this module on
 * Alpha, I have to copy getppid() from kernel/timer.c.
 */
static inline pid_t tomoyo_sys_getppid(void)
{
        struct task_struct *parent;
        pid_t ppid;

        /* real_parent is only dereferenced inside the RCU read section. */
        rcu_read_lock();
        parent = rcu_dereference(current->real_parent);
        ppid = task_tgid_vnr(parent);
        rcu_read_unlock();

        return ppid;
}

/**
 * tomoyo_sys_getpid - Copy of getpid().
 *
 * Returns task_tgid_vnr(current), i.e. the thread group ID of the calling
 * task (the value getpid() reports), not a per-thread ID.
 *
 * Alpha does not have getpid() defined. To be able to build this module on
 * Alpha, I have to copy getpid() from kernel/timer.c.
 */
static inline pid_t tomoyo_sys_getpid(void)
{
        return task_tgid_vnr(current);
}

/**
 * tomoyo_pathcmp - strcmp() for "struct tomoyo_path_info" structure.
 *
 * @a: Pointer to "struct tomoyo_path_info".
 * @b: Pointer to "struct tomoyo_path_info".
 *
 * Like strcmp(), the result is "truthy" on a mismatch: the hashes are
 * compared first and the names are only compared when the hashes agree.
 *
 * Returns true if @a != @b, false if they are equal.
 */
static inline bool tomoyo_pathcmp(const struct tomoyo_path_info *a,
                                  const struct tomoyo_path_info *b)
{
        /* Different hashes: no need to look at the strings. */
        if (a->hash != b->hash)
                return true;

        return strcmp(a->name, b->name) != 0;
}

/**
 * tomoyo_put_name - Drop reference on "struct tomoyo_name".
 *
 * @name: Pointer to "struct tomoyo_path_info". Maybe NULL.
 *
 * @name is the "entry" member of a "struct tomoyo_name"; the containing
 * structure's head.users counter is decremented. NULL is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_name(const struct tomoyo_path_info *name)
{
        struct tomoyo_name *ptr;

        if (!name)
                return;

        ptr = container_of(name, typeof(*ptr), entry);
        atomic_dec(&ptr->head.users);
}

/**
 * tomoyo_put_condition - Drop reference on "struct tomoyo_condition".
 *
 * @cond: Pointer to "struct tomoyo_condition". Maybe NULL.
 *
 * Decrements @cond->head.users; a NULL @cond is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_condition(struct tomoyo_condition *cond)
{
        if (!cond)
                return;

        atomic_dec(&cond->head.users);
}

/**
 * tomoyo_put_group - Drop reference on "struct tomoyo_group".
 *
 * @group: Pointer to "struct tomoyo_group". Maybe NULL.
 *
 * Decrements @group->head.users; a NULL @group is ignored.
 *
 * Returns nothing.
 */
static inline void tomoyo_put_group(struct tomoyo_group *group)
{
        if (!group)
                return;

        atomic_dec(&group->head.users);
}

/**
 * tomoyo_task - Get "struct tomoyo_task" for specified thread.
 *
 * @task: Pointer to "struct task_struct".
 *
 * The result is @task->security advanced by tomoyo_blob_sizes.lbs_task,
 * i.e. TOMOYO's slot inside the task's security blob.
 *
 * Returns pointer to "struct tomoyo_task" for specified thread.
 */
static inline struct tomoyo_task *tomoyo_task(struct task_struct *task)
{
        return task->security + tomoyo_blob_sizes.lbs_task;
}

/**
 * tomoyo_same_name_union - Check for duplicated "struct tomoyo_name_union" entry.
 *
 * @a: Pointer to "struct tomoyo_name_union".
 * @b: Pointer to "struct tomoyo_name_union".
 *
 * Two entries match when both their filename and group members compare
 * equal with ==.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_name_union
(const struct tomoyo_name_union *a, const struct tomoyo_name_union *b)
{
        if (a->filename != b->filename)
                return false;

        return a->group == b->group;
}

/**
 * tomoyo_same_number_union - Check for duplicated "struct tomoyo_number_union" entry.
 *
 * @a: Pointer to "struct tomoyo_number_union".
 * @b: Pointer to "struct tomoyo_number_union".
 *
 * Two entries match when both values[], the group member and both
 * value_type[] slots are equal.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_number_union
(const struct tomoyo_number_union *a, const struct tomoyo_number_union *b)
{
        if (a->values[0] != b->values[0] || a->values[1] != b->values[1])
                return false;
        if (a->group != b->group)
                return false;

        return a->value_type[0] == b->value_type[0] &&
                a->value_type[1] == b->value_type[1];
}

/**
 * tomoyo_same_ipaddr_union - Check for duplicated "struct tomoyo_ipaddr_union" entry.
 *
 * @a: Pointer to "struct tomoyo_ipaddr_union".
 * @b: Pointer to "struct tomoyo_ipaddr_union".
 *
 * Two entries match when the whole ip member is bytewise identical and the
 * group and is_ipv6 members are equal.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_ipaddr_union
(const struct tomoyo_ipaddr_union *a, const struct tomoyo_ipaddr_union *b)
{
        if (memcmp(a->ip, b->ip, sizeof(a->ip)))
                return false;

        return a->group == b->group && a->is_ipv6 == b->is_ipv6;
}

/**
 * tomoyo_current_namespace - Get "struct tomoyo_policy_namespace" for current thread.
 *
 * Reads the ns member of the domain returned by tomoyo_domain().
 *
 * Returns pointer to "struct tomoyo_policy_namespace" for current thread.
 */
static inline struct tomoyo_policy_namespace *tomoyo_current_namespace(void)
{
        return tomoyo_domain()->ns;
}

/**
 * list_for_each_cookie - iterate over a list with cookie.
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:       the head for your list.
 *
 * @pos doubles as a resume cookie: when it is NULL the walk starts at the
 * first element of @head, otherwise it continues from wherever @pos points.
 * @pos must be an lvalue and is evaluated more than once.
 *
 * Pointers are fetched with srcu_dereference() against tomoyo_ss, so callers
 * are expected to be inside a tomoyo_read_lock() section.
 *
 * The macro expands to a single "for" statement, so it can be used as the
 * body of an unbraced if/else without swallowing the "else".
 */
#define list_for_each_cookie(pos, head)                                        \
        for ((pos) = (pos) ? (pos) :                                        \
                     srcu_dereference((head)->next, &tomoyo_ss);        \
             (pos) != (head);                                                \
             (pos) = srcu_dereference((pos)->next, &tomoyo_ss))

#endif /* !defined(_SECURITY_TOMOYO_COMMON_H) */

















































































  238 












































  239 

  238 

  239 




































































































































































































   26 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  139 















  318 


  319 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
// SPDX-License-Identifier: GPL-2.0-or-later
/* Common capabilities, needed by capability.o.
 */

#include <linux/capability.h>
#include <linux/audit.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/lsm_hooks.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/ptrace.h>
#include <linux/xattr.h>
#include <linux/hugetlb.h>
#include <linux/mount.h>
#include <linux/sched.h>
#include <linux/prctl.h>
#include <linux/securebits.h>
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
#include <linux/personality.h>
#include <linux/mnt_idmapping.h>
#include <uapi/linux/lsm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/capability.h>

/*
 * When a non-root user executes a setuid-root binary in
 * !secure(SECURE_NOROOT) mode, capabilities are normally raised.
 * If fE is set as well, the intent is for only the file capabilities
 * to be applied; the setuid-root bit is left on either to change the
 * uid (plausible) or to get full privilege on a kernel without file
 * capabilities support.  So in that case capabilities are not raised.
 *
 * Emit a warning naming @fname when that happens, once per boot.
 */
static void warn_setuid_and_fcaps_mixed(const char *fname)
{
        static int warned;

        /* Already reported since boot: stay silent. */
        if (warned)
                return;

        printk(KERN_INFO "warning: `%s' has both setuid-root and"
                " effective capabilities. Therefore not raising all"
                " capabilities.\n", fname);
        warned = 1;
}

/**
 * cap_capable_helper - Determine whether a task has a particular effective
 * capability.
 * @cred: The credentials to use
 * @target_ns:  The user namespace of the resource being accessed
 * @cred_ns:  The user namespace of the credentials
 * @cap: The capability to check for
 *
 * Walks from @target_ns towards its ancestors until the question can be
 * answered. Returns 0 if @cred has @cap over @target_ns, -EPERM if it does
 * not.
 *
 * See cap_capable for more details.
 */
static inline int cap_capable_helper(const struct cred *cred,
                                     struct user_namespace *target_ns,
                                     const struct user_namespace *cred_ns,
                                     int cap)
{
        struct user_namespace *cur;

        /*
         * Examine the target user namespace and then each of its parents.
         * A capability held in a parent user ns also applies to all of its
         * children, hence the step to ->parent on every iteration. Every
         * path out of the loop is a return.
         */
        for (cur = target_ns; ; cur = cur->parent) {
                /* Reached the credentials' own namespace: consult the effective set. */
                if (likely(cur == cred_ns)) {
                        if (cap_raised(cred->cap_effective, cap))
                                return 0;
                        return -EPERM;
                }

                /*
                 * Already at or above the level of @cred_ns without meeting
                 * it: no ancestor can match, so stop searching.
                 */
                if (cur->level <= cred_ns->level)
                        return -EPERM;

                /*
                 * The owner of a user namespace, acting from that
                 * namespace's parent, has all caps over it.
                 */
                if (cur->parent == cred_ns && uid_eq(cur->owner, cred->euid))
                        return 0;
        }
}

/**
 * cap_capable - Determine whether a task has a particular effective capability
 * @cred: The credentials to use
 * @target_ns:  The user namespace of the resource being accessed
 * @cap: The capability to check for
 * @opts: Bitmask of options defined in include/linux/security.h (unused)
 *
 * Runs cap_capable_helper() with @cred's own user namespace, reports the
 * outcome through the cap_capable tracepoint and returns it: 0 if @cred has
 * @cap in its effective set, -ve if it does not.
 *
 * NOTE WELL: cap_capable() has reverse semantics to the capable() call
 * and friends. That is cap_capable() returns an int 0 when a task has
 * a capability, while the kernel's capable(), has_ns_capability(),
 * has_ns_capability_noaudit(), and has_capability_noaudit() return a
 * bool true (1) for this case.
 */
int cap_capable(const struct cred *cred, struct user_namespace *target_ns,
                int cap, unsigned int opts)
{
        const struct user_namespace *cred_ns = cred->user_ns;
        int rc;

        rc = cap_capable_helper(cred, target_ns, cred_ns, cap);
        trace_cap_capable(cred, target_ns, cred_ns, cap, rc);

        return rc;
}

/**
 * cap_settime - Determine whether the current process may set the system clock
 * @ts: The time to set
 * @tz: The timezone to set
 *
 * Only the caller's CAP_SYS_TIME capability is consulted; @ts and @tz are
 * not examined. Returns 0 if permission granted, -ve if denied.
 */
int cap_settime(const struct timespec64 *ts, const struct timezone *tz)
{
        return capable(CAP_SYS_TIME) ? 0 : -EPERM;
}

/**
 * cap_ptrace_access_check - Determine whether the current process may access
 *                           another
 * @child: The process to be accessed
 * @mode: The mode of attachment.
 *
 * Access is allowed when either
 *  - the caller and @child share a user_ns and the caller's capability set
 *    (effective for PTRACE_MODE_FSCREDS, permitted otherwise) covers all of
 *    @child's permitted capabilities, or
 *  - the caller has CAP_SYS_PTRACE towards @child's user_ns.
 * Otherwise it is denied.
 *
 * Returns 0 if permission granted, -ve if denied.
 */
int cap_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        const struct cred *cred, *child_cred;
        const kernel_cap_t *caller_caps;
        int ret = -EPERM;

        rcu_read_lock();
        cred = current_cred();
        child_cred = __task_cred(child);
        caller_caps = (mode & PTRACE_MODE_FSCREDS) ? &cred->cap_effective :
                                                     &cred->cap_permitted;

        /* ns_capable() is only consulted when the subset test does not pass. */
        if ((cred->user_ns == child_cred->user_ns &&
             cap_issubset(child_cred->cap_permitted, *caller_caps)) ||
            ns_capable(child_cred->user_ns, CAP_SYS_PTRACE))
                ret = 0;

        rcu_read_unlock();
        return ret;
}

/**
 * cap_ptrace_traceme - Decide whether another task may trace the current one
 * @parent: The task proposed to be the tracer
 *
 * Tracing is allowed when either of the following holds:
 *  - @parent and current share the same user_ns and current's permitted set
 *    is a subset of @parent's permitted set;
 *  - has_ns_capability() reports that @parent holds CAP_SYS_PTRACE towards
 *    current's user_ns.
 *
 * Return: 0 if permission is granted, -EPERM if it is denied.
 */
int cap_ptrace_traceme(struct task_struct *parent)
{
        const struct cred *tracer_cred, *self_cred;
        int rc = -EPERM;

        rcu_read_lock();
        tracer_cred = __task_cred(parent);
        self_cred = current_cred();

        if (tracer_cred->user_ns == self_cred->user_ns &&
            cap_issubset(self_cred->cap_permitted, tracer_cred->cap_permitted))
                rc = 0;
        else if (has_ns_capability(parent, self_cred->user_ns, CAP_SYS_PTRACE))
                rc = 0;

        rcu_read_unlock();
        return rc;
}

/**
 * cap_capget - Copy out a task's capability sets
 * @target: The task from which to retrieve the capability sets
 * @effective: The place to record the effective set
 * @inheritable: The place to record the inheritable set
 * @permitted: The place to record the permitted set
 *
 * The three sets are read from @target's credentials inside a single RCU
 * read-side section and stored through the supplied pointers.
 *
 * Return: always 0.
 */
int cap_capget(const struct task_struct *target, kernel_cap_t *effective,
               kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        const struct cred *tcred;

        /* Derived from kernel/capability.c:sys_capget. */
        rcu_read_lock();

        tcred = __task_cred(target);
        *effective = tcred->cap_effective;
        *inheritable = tcred->cap_inheritable;
        *permitted = tcred->cap_permitted;

        rcu_read_unlock();

        return 0;
}

/*
 * Determine whether the inheritable capabilities are limited to the old
 * permitted set.  Returns 1 if they are limited, 0 if they are not.
 */
static inline int cap_inh_is_capped(void)
{
        /* they are so limited unless the current task has the CAP_SETPCAP
         * capability
         */
        if (cap_capable(current_cred(), current_cred()->user_ns,
                        CAP_SETPCAP, CAP_OPT_NONE) == 0)
                return 0;
        return 1;
}

/**
 * cap_capset - Validate and apply proposed changes to current's capabilities
 * @new: The proposed new credentials; alterations should be made here
 * @old: The current task's current credentials
 * @effective: A pointer to the proposed new effective capabilities set
 * @inheritable: A pointer to the proposed new inheritable capabilities set
 * @permitted: A pointer to the proposed new permitted capabilities set
 *
 * The requested sets are checked against @old and, if every check passes,
 * written into @new.  The ambient set in @new is then trimmed to what is
 * still both permitted and inheritable.  Nothing is written to @new when a
 * check fails.
 *
 * Return: 0 on success, -EPERM if a requested set is not allowed, -EINVAL if
 * the resulting credentials fail cap_ambient_invariant_ok().
 */
int cap_capset(struct cred *new,
               const struct cred *old,
               const kernel_cap_t *effective,
               const kernel_cap_t *inheritable,
               const kernel_cap_t *permitted)
{
        kernel_cap_t ceiling;

        /* When capped, the new pI must stay within old pI | old pP. */
        if (cap_inh_is_capped()) {
                ceiling = cap_combine(old->cap_inheritable,
                                      old->cap_permitted);
                if (!cap_issubset(*inheritable, ceiling))
                        return -EPERM;
        }

        /* No new pI capabilities outside old pI | bounding set. */
        ceiling = cap_combine(old->cap_inheritable, old->cap_bset);
        if (!cap_issubset(*inheritable, ceiling))
                return -EPERM;

        /* The new permitted set may not exceed the old permitted set. */
        if (!cap_issubset(*permitted, old->cap_permitted))
                return -EPERM;

        /* The new effective set must fit inside the new permitted set. */
        if (!cap_issubset(*effective, *permitted))
                return -EPERM;

        new->cap_effective = *effective;
        new->cap_inheritable = *inheritable;
        new->cap_permitted = *permitted;

        /* Keep only ambient bits that remain both permitted and inheritable. */
        ceiling = cap_intersect(*permitted, *inheritable);
        new->cap_ambient = cap_intersect(new->cap_ambient, ceiling);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EINVAL;

        return 0;
}

/**
 * cap_inode_need_killpriv - Determine if inode change affects privileges
 * @dentry: The inode/dentry being changed, with the change marked
 *          ATTR_KILL_PRIV
 *
 * Probe the security.capability xattr of @dentry's backing inode with a
 * zero-length buffer, so that only its size is queried.
 *
 * Return: 1 if security.capability has a value, meaning inode_killpriv()
 * is required, 0 otherwise (including when the lookup fails).
 */
int cap_inode_need_killpriv(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        return __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0) > 0;
}

/**
 * cap_inode_killpriv - Erase the security markings on an inode
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry to alter
 *
 * Remove the security.capability xattr from @dentry.  A result of
 * -EOPNOTSUPP from the removal is treated as success.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. On non-idmapped mounts or if
 * permission checking is to be performed on the raw inode simply pass
 * @nop_mnt_idmap.
 *
 * Return: 0 if successful, -ve on error.
 */
int cap_inode_killpriv(struct mnt_idmap *idmap, struct dentry *dentry)
{
        int rc = __vfs_removexattr(idmap, dentry, XATTR_NAME_CAPS);

        return rc == -EOPNOTSUPP ? 0 : rc;
}

/*
 * Report whether @rootvfsuid translates to uid 0 in the current user
 * namespace or in any of its ancestors.  The walk follows ->parent links and
 * stops once init_user_ns has been examined.  An invalid @rootvfsuid never
 * matches.
 */
static bool rootid_owns_currentns(vfsuid_t rootvfsuid)
{
        struct user_namespace *ns;
        kuid_t kroot;

        if (!vfsuid_valid(rootvfsuid))
                return false;

        kroot = vfsuid_into_kuid(rootvfsuid);
        ns = current_user_ns();
        for (;;) {
                if (from_kuid(ns, kroot) == 0)
                        return true;
                if (ns == &init_user_ns)
                        return false;
                ns = ns->parent;
        }
}

/* Return @m with the VFS_CAP_FLAGS_EFFECTIVE bit cleared. */
static __u32 sansflags(__u32 m)
{
        m &= ~VFS_CAP_FLAGS_EFFECTIVE;
        return m;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_2 and the magic of @cap, ignoring
 * the effective flag, is VFS_CAP_REVISION_2.  @cap is not read when the size
 * does not match.
 */
static bool is_v2header(int size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_2 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2;
}

/*
 * True when @size is exactly XATTR_CAPS_SZ_3 and the magic of @cap, ignoring
 * the effective flag, is VFS_CAP_REVISION_3.  @cap is not read when the size
 * does not match.
 */
static bool is_v3header(int size, const struct vfs_cap_data *cap)
{
        return size == XATTR_CAPS_SZ_3 &&
               sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3;
}

/*
 * getsecurity: We are called for security.* before any attempt to read the
 * xattr from the inode itself.
 *
 * This gives us a chance to read the on-disk value and convert it.  If we
 * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler.
 *
 * Note we are not called by vfs_getxattr_alloc(), but that is only called
 * by the integrity subsystem, which really wants the unconverted values -
 * so that's good.
 *
 * @idmap:  idmap passed on to vfs_getxattr_alloc() and make_vfsuid()
 * @inode:  inode whose security.capability xattr is read
 * @name:   xattr name as handed to us; anything other than "capability"
 *          is refused with -EOPNOTSUPP
 * @buffer: when @alloc is true and the call succeeds, *buffer is set to a
 *          heap buffer holding the (possibly converted) value; it is not
 *          touched otherwise.  Presumably the caller frees it - confirm
 *          against the callers.
 * @alloc:  whether a value should be returned through @buffer, or only its
 *          size
 *
 * Returns the size of the value as it is presented to the caller
 * (sizeof(struct vfs_ns_cap_data) for the v3 form, sizeof(struct
 * vfs_cap_data) for the v2 form) or a negative errno: -EOPNOTSUPP for a
 * foreign name, -EINVAL for a missing dentry or an unrecognised header,
 * -ENOMEM if a conversion buffer cannot be allocated, -EOVERFLOW if the
 * stored root id neither maps into nor owns the current user namespace, or
 * whatever vfs_getxattr_alloc() reported.
 */
int cap_inode_getsecurity(struct mnt_idmap *idmap,
                          struct inode *inode, const char *name, void **buffer,
                          bool alloc)
{
        int size;
        kuid_t kroot;
        vfsuid_t vfsroot;
        u32 nsmagic, magic;
        uid_t root, mappedroot;
        char *tmpbuf = NULL;
        struct vfs_cap_data *cap;
        struct vfs_ns_cap_data *nscap = NULL;
        struct dentry *dentry;
        struct user_namespace *fs_ns;

        if (strcmp(name, "capability") != 0)
                return -EOPNOTSUPP;

        /* The dentry is only needed for the xattr read and is dropped right after. */
        dentry = d_find_any_alias(inode);
        if (!dentry)
                return -EINVAL;
        size = vfs_getxattr_alloc(idmap, dentry, XATTR_NAME_CAPS, &tmpbuf,
                                  sizeof(struct vfs_ns_cap_data), GFP_NOFS);
        dput(dentry);
        /* gcc11 complains if we don't check for !tmpbuf */
        if (size < 0 || !tmpbuf)
                goto out_free;

        fs_ns = inode->i_sb->s_user_ns;
        /* cap and (for v3) nscap are two views of the same tmpbuf contents. */
        cap = (struct vfs_cap_data *) tmpbuf;
        if (is_v2header(size, cap)) {
                /* v2 carries no root id; treat it as uid 0 of fs_ns. */
                root = 0;
        } else if (is_v3header(size, cap)) {
                nscap = (struct vfs_ns_cap_data *) tmpbuf;
                root = le32_to_cpu(nscap->rootid);
        } else {
                size = -EINVAL;
                goto out_free;
        }

        kroot = make_kuid(fs_ns, root);

        /* If this is an idmapped mount shift the kuid. */
        vfsroot = make_vfsuid(idmap, fs_ns, kroot);

        /* If the root kuid maps to a valid uid in current ns, then return
         * this as a nscap. */
        mappedroot = from_kuid(current_user_ns(), vfsuid_into_kuid(vfsroot));
        /* A mapped root of 0 is not handled here; it falls through to the v2 path. */
        if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) {
                size = sizeof(struct vfs_ns_cap_data);
                if (alloc) {
                        if (!nscap) {
                                /* v2 -> v3 conversion */
                                nscap = kzalloc(size, GFP_ATOMIC);
                                if (!nscap) {
                                        size = -ENOMEM;
                                        goto out_free;
                                }
                                nsmagic = VFS_CAP_REVISION_3;
                                magic = le32_to_cpu(cap->magic_etc);
                                if (magic & VFS_CAP_FLAGS_EFFECTIVE)
                                        nsmagic |= VFS_CAP_FLAGS_EFFECTIVE;
                                memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                                nscap->magic_etc = cpu_to_le32(nsmagic);
                        } else {
                                /* use allocated v3 buffer */
                                /*
                                 * nscap aliases tmpbuf, which is about to be
                                 * handed out via *buffer; clear tmpbuf so
                                 * out_free does not free it.
                                 */
                                tmpbuf = NULL;
                        }
                        /* Present the root id as seen from the current user ns. */
                        nscap->rootid = cpu_to_le32(mappedroot);
                        *buffer = nscap;
                }
                goto out_free;
        }

        if (!rootid_owns_currentns(vfsroot)) {
                size = -EOVERFLOW;
                goto out_free;
        }

        /* This comes from a parent namespace.  Return as a v2 capability */
        size = sizeof(struct vfs_cap_data);
        if (alloc) {
                if (nscap) {
                        /* v3 -> v2 conversion */
                        cap = kzalloc(size, GFP_ATOMIC);
                        if (!cap) {
                                size = -ENOMEM;
                                goto out_free;
                        }
                        magic = VFS_CAP_REVISION_2;
                        nsmagic = le32_to_cpu(nscap->magic_etc);
                        if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE)
                                magic |= VFS_CAP_FLAGS_EFFECTIVE;
                        memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
                        cap->magic_etc = cpu_to_le32(magic);
                } else {
                        /* use unconverted v2 */
                        /*
                         * cap aliases tmpbuf, which is about to be handed
                         * out via *buffer; clear tmpbuf so out_free does
                         * not free it.
                         */
                        tmpbuf = NULL;
                }
                *buffer = cap;
        }
out_free:
        /* Frees the raw xattr copy unless it was handed to the caller above. */
        kfree(tmpbuf);
        return size;
}

/**
 * rootid_from_xattr - translate root uid of vfs caps
 *
 * @value:        vfs caps value to read the root id from; it is not modified
 * @size:        size of @value
 * @task_ns:        user namespace of the caller
 *
 * Only a value of size XATTR_CAPS_SZ_3 carries a root id; for any other size
 * a root id of 0 is assumed.  The id is interpreted in @task_ns.
 *
 * Return: the resulting kuid wrapped as a vfsuid_t.
 */
static vfsuid_t rootid_from_xattr(const void *value, size_t size,
                                  struct user_namespace *task_ns)
{
        uid_t rootid = 0;

        if (size == XATTR_CAPS_SZ_3) {
                const struct vfs_ns_cap_data *nscap = value;

                rootid = le32_to_cpu(nscap->rootid);
        }

        return VFSUIDT_INIT(make_kuid(task_ns, rootid));
}

/* Accept @cap only if it is a well-formed v2 or v3 capability header. */
static bool validheader(size_t size, const struct vfs_cap_data *cap)
{
        if (is_v2header(size, cap))
                return true;

        return is_v3header(size, cap);
}

/**
 * cap_convert_nscap - check vfs caps
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        used to retrieve inode to check permissions on
 * @ivalue:        vfs caps value; on conversion *@ivalue is replaced with a
 *                newly allocated v3 value
 * @size:        size of @ivalue
 *
 * User requested a write of security.capability.  A v2 value written through
 * a non-idmapped mount by a caller holding CAP_SETFCAP over the filesystem's
 * user namespace is passed through untouched.  Every other valid value is
 * re-emitted in v3 form, with the root id translated from the caller's user
 * namespace into the filesystem's.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: On success, return the new size; on error, return < 0
 * (-EINVAL for a bad value or unmappable root id, -EPERM if CAP_SETFCAP is
 * missing, -ENOMEM if the v3 buffer cannot be allocated).
 */
int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size)
{
        const struct vfs_cap_data *in_cap = *ivalue;
        struct inode *inode = d_backing_inode(dentry);
        struct user_namespace *caller_ns = current_user_ns();
        struct user_namespace *fs_ns = inode->i_sb->s_user_ns;
        struct vfs_ns_cap_data *out_cap;
        vfsuid_t root_vfsuid;
        kuid_t root_kuid;
        uid_t fs_rootid;
        __u32 out_magic;

        if (!*ivalue)
                return -EINVAL;
        if (!validheader(size, in_cap))
                return -EINVAL;
        if (!capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP))
                return -EPERM;

        /* user is privileged, just write the v2 */
        if (size == XATTR_CAPS_SZ_2 && idmap == &nop_mnt_idmap &&
            ns_capable(fs_ns, CAP_SETFCAP))
                return size;

        /* Root id as the caller expressed it, read in the caller's ns. */
        root_vfsuid = rootid_from_xattr(*ivalue, size, caller_ns);
        if (!vfsuid_valid(root_vfsuid))
                return -EINVAL;

        /* Undo any idmapped-mount shift, then express the id in fs_ns. */
        root_kuid = from_vfsuid(idmap, fs_ns, root_vfsuid);
        if (!uid_valid(root_kuid))
                return -EINVAL;

        fs_rootid = from_kuid(fs_ns, root_kuid);
        if (fs_rootid == -1)
                return -EINVAL;

        out_cap = kmalloc(sizeof(*out_cap), GFP_ATOMIC);
        if (!out_cap)
                return -ENOMEM;

        /* Carry the effective flag over into the v3 magic. */
        out_magic = VFS_CAP_REVISION_3;
        if (le32_to_cpu(in_cap->magic_etc) & VFS_CAP_FLAGS_EFFECTIVE)
                out_magic |= VFS_CAP_FLAGS_EFFECTIVE;

        out_cap->magic_etc = cpu_to_le32(out_magic);
        memcpy(&out_cap->data, &in_cap->data, sizeof(__le32) * 2 * VFS_CAP_U32);
        out_cap->rootid = cpu_to_le32(fs_rootid);

        *ivalue = out_cap;
        return sizeof(*out_cap);
}

/*
 * Derive the permitted set of the credentials under construction from the
 * capability sets attached to a file, and record through @effective and
 * @has_fcap whether the file carries the effective flag and a revision.
 *
 * Returns -EPERM only when the effective flag is in force and some
 * file-permitted capability did not make it into the new permitted set;
 * 0 otherwise.
 */
static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps,
                                          struct linux_binprm *bprm,
                                          bool *effective,
                                          bool *has_fcap)
{
        struct cred *cred = bprm->cred;

        if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE)
                *effective = true;

        if (caps->magic_etc & VFS_CAP_REVISION_MASK)
                *has_fcap = true;

        /*
         * pP' = (X & fP) | (pI & fI)
         * The addition of pA' is handled later.
         */
        cred->cap_permitted.val =
                (cred->cap_bset.val & caps->permitted.val) |
                (cred->cap_inheritable.val & caps->inheritable.val);

        /*
         * Apps that do not set the effective flag are expected to cope with
         * a reduced permitted set on their own, so nothing is reported.
         */
        if (!*effective)
                return 0;

        /*
         * Legacy apps cannot tell that a "forced" (aka file-permitted)
         * capability is missing, so that is reported as an error.
         */
        return (caps->permitted.val & ~cred->cap_permitted.val) ? -EPERM : 0;
}

/**
 * get_vfs_caps_from_disk - retrieve vfs caps from disk
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        dentry from which @inode is retrieved
 * @cpu_caps:        vfs capabilities
 *
 * Extract the on-exec-apply capability sets for an executable file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps)
{
        struct inode *inode = d_backing_inode(dentry);
        __u32 magic_etc;
        int size;
        struct vfs_ns_cap_data data, *nscaps = &data;
        struct vfs_cap_data *caps = (struct vfs_cap_data *) &data;
        kuid_t rootkuid;
        vfsuid_t rootvfsuid;
        struct user_namespace *fs_ns;

        memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data));

        if (!inode)
                return -ENODATA;

        fs_ns = inode->i_sb->s_user_ns;
        size = __vfs_getxattr((struct dentry *)dentry, inode,
                              XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ);
        if (size == -ENODATA || size == -EOPNOTSUPP)
                /* no data, that's ok */
                return -ENODATA;

        if (size < 0)
                return size;

        if (size < sizeof(magic_etc))
                return -EINVAL;

        cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc);

        rootkuid = make_kuid(fs_ns, 0);
        switch (magic_etc & VFS_CAP_REVISION_MASK) {
        case VFS_CAP_REVISION_1:
                if (size != XATTR_CAPS_SZ_1)
                        return -EINVAL;
                break;
        case VFS_CAP_REVISION_2:
                if (size != XATTR_CAPS_SZ_2)
                        return -EINVAL;
                break;
        case VFS_CAP_REVISION_3:
                if (size != XATTR_CAPS_SZ_3)
                        return -EINVAL;
                rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid));
                break;

        default:
                return -EINVAL;
        }

        rootvfsuid = make_vfsuid(idmap, fs_ns, rootkuid);
        if (!vfsuid_valid(rootvfsuid))
                return -ENODATA;

        /* Limit the caps to the mounter of the filesystem
         * or the more limited uid specified in the xattr.
         */
        if (!rootid_owns_currentns(rootvfsuid))
                return -ENODATA;

        cpu_caps->permitted.val = le32_to_cpu(caps->data[0].permitted);
        cpu_caps->inheritable.val = le32_to_cpu(caps->data[0].inheritable);

        /*
         * Rev1 had just a single 32-bit word, later expanded
         * to a second one for the high bits
         */
        if ((magic_etc & VFS_CAP_REVISION_MASK) != VFS_CAP_REVISION_1) {
                cpu_caps->permitted.val += (u64)le32_to_cpu(caps->data[1].permitted) << 32;
                cpu_caps->inheritable.val += (u64)le32_to_cpu(caps->data[1].inheritable) << 32;
        }

        cpu_caps->permitted.val &= CAP_VALID_MASK;
        cpu_caps->inheritable.val &= CAP_VALID_MASK;

        cpu_caps->rootid = vfsuid_into_kuid(rootvfsuid);

        return 0;
}

/*
 * Look up the on-exec capability sets stored in the executable's xattrs and,
 * if there are any, fold them into the credentials being built for execve().
 *
 * The proposed permitted set is cleared up front and is left cleared whenever
 * a non-zero value is returned.  Returns 0 when file capabilities are not
 * applied or were applied successfully, -ve on error.
 */
static int get_file_caps(struct linux_binprm *bprm, const struct file *file,
                         bool *effective, bool *has_fcap)
{
        struct cpu_vfs_cap_data vcaps;
        int rc;

        cap_clear(bprm->cred->cap_permitted);

        /*
         * Nothing to apply when file caps are disabled or the mount may not
         * grant privileges.  The user namespace test is redundant with
         * mnt_may_suid() but is kept to make explicit that capability bits
         * are limited to s_user_ns and its descendants.
         */
        if (!file_caps_enabled ||
            !mnt_may_suid(file->f_path.mnt) ||
            !current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns))
                return 0;

        rc = get_vfs_caps_from_disk(file_mnt_idmap(file),
                                    file->f_path.dentry, &vcaps);

        /* A file without capability data is not an error. */
        if (rc == -ENODATA)
                return 0;

        if (rc == -EINVAL)
                printk(KERN_NOTICE "Invalid argument reading file caps for %s\n",
                                bprm->filename);

        if (rc >= 0)
                rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap);

        if (rc)
                cap_clear(bprm->cred->cap_permitted);

        return rc;
}

/* Root counts as privileged unless SECURE_NOROOT is set. */
static inline bool root_privileged(void)
{
        return !issecure(SECURE_NOROOT);
}

/* Does @uid match the real uid of @cred? */
static inline bool __is_real(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->uid, uid);
}

/* Does @uid match the effective uid of @cred? */
static inline bool __is_eff(kuid_t uid, struct cred *cred)
{
        return uid_eq(cred->euid, uid);
}

/* True when @uid is the effective uid of @cred but not its real uid. */
static inline bool __is_suid(kuid_t uid, struct cred *cred)
{
        if (__is_real(uid, cred))
                return false;

        return __is_eff(uid, cred);
}

/*
 * handle_privileged_root - Handle case of privileged root
 * @bprm: The execution parameters, including the proposed creds
 * @has_fcap: Are any file capabilities set?
 * @effective: Set to true when the new effective uid is root
 * @root_uid: This namespace' root UID WRT initial USER namespace
 *
 * Does nothing when SECURE_NOROOT has neutered root.  Otherwise:
 *  - a set-UID-root binary that also carries file capabilities gets a
 *    warning and no root-derived privileges;
 *  - when the new real or effective uid is root, cap_permitted is replaced
 *    by the old bounding set combined with the old inheritable set;
 *  - when the new effective uid is root, *@effective is set.
 */
static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap,
                                   bool *effective, kuid_t root_uid)
{
        const struct cred *cur = current_cred();
        struct cred *proposed = bprm->cred;
        bool euid_is_root, ruid_is_root;

        if (!root_privileged())
                return;

        euid_is_root = __is_eff(root_uid, proposed);
        ruid_is_root = __is_real(root_uid, proposed);

        /*
         * If the legacy file capability is set, then don't set privs
         * for a setuid root binary run by a non-root user.  Do set it
         * for a root user just to cause least surprise to an admin.
         */
        if (has_fcap && euid_is_root && !ruid_is_root) {
                warn_setuid_and_fcaps_mixed(bprm->filename);
                return;
        }

        /*
         * To support inheritance of root-permissions and suid-root
         * executables under compatibility mode, we override the
         * capability sets for the file:
         *   pP' = (cap_bset & ~0) | (pI & ~0)
         */
        if (euid_is_root || ruid_is_root)
                proposed->cap_permitted = cap_combine(cur->cap_bset,
                                                      cur->cap_inheritable);

        /* If only the real uid is 0, we do not set the effective bit. */
        if (euid_is_root)
                *effective = true;
}

/*
 * Capability-set comparisons on credential pointers.  The set name is pasted
 * onto a "cap_" prefix to select the member to compare.
 *
 * __cap_gained(field, target, source): @target's set is not contained in
 *                                      @source's set of the same name.
 * __cap_grew(target, source, cred):    within @cred, set @target is not
 *                                      contained in set @source.
 * __cap_full(field, cred):             @cred's set contains CAP_FULL_SET.
 */
#define __cap_gained(field, target, source) \
        (!cap_issubset((target)->cap_##field, (source)->cap_##field))
#define __cap_grew(target, source, cred) \
        (!cap_issubset((cred)->cap_##target, (cred)->cap_##source))
#define __cap_full(field, cred) \
        (cap_issubset(CAP_FULL_SET, (cred)->cap_##field))

/* True when the new effective uid differs from the old real uid. */
static inline bool __is_setuid(struct cred *new, const struct cred *old)
{
        return !uid_eq(new->euid, old->uid);
}

/* True when the new effective gid differs from the old real gid. */
static inline bool __is_setgid(struct cred *new, const struct cred *old)
{
        return !gid_eq(new->egid, old->gid);
}

/*
 * Decide whether this exec should be audited.  Any one of the following
 * makes it an audit candidate:
 *
 * 1) The new effective set holds something beyond the new ambient set.
 *    This is skipped when all three of these hold, since that is just a
 *    normal root execing a process:
 *      a) cap_effective has all caps
 *      b) we became root *OR* were already root
 *      c) root is privileged, i.e. SECURE_NOROOT is not set
 *    Condition a) might fail if you don't have a full bset, but that is
 *    considered interesting information to audit.
 * 2) Something prevented setuid root getting all caps.
 * 3) Non-setuid gets fcaps.
 * 4) Non-setuid gets ambient.
 */
static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old,
                                     kuid_t root, bool has_fcap)
{
        /* 1) pE' exceeds pA', and this is not plain privileged root. */
        if (__cap_grew(effective, ambient, new) &&
            !(__cap_full(effective, new) &&
              (__is_eff(root, new) || __is_real(root, new)) &&
              root_privileged()))
                return true;

        /* 2) setuid root that ended up without the full effective set. */
        if (root_privileged() &&
            __is_suid(root, new) &&
            !__cap_full(effective, new))
                return true;

        /* 3) and 4) no uid change, yet fcaps or ambient raised something. */
        if (!__is_setuid(new, old) &&
            ((has_fcap && __cap_gained(permitted, new, old)) ||
             __cap_gained(ambient, new, old)))
                return true;

        return false;
}

/**
 * cap_bprm_creds_from_file - Set up the proposed credentials for execve().
 * @bprm: The execution parameters, including the proposed creds
 * @file: The file to pull the credentials from
 *
 * Set up the proposed credentials for a new execution context being
 * constructed by execve().  The proposed creds in @bprm->cred is altered,
 * which won't take effect immediately.
 *
 * Besides the credentials, this may also add PER_CLEAR_ON_SETID to
 * @bprm->per_clear and set @bprm->secureexec.
 *
 * Return: 0 if successful, -ve on error.  -EPERM is returned when the old or
 * new credentials fail cap_ambient_invariant_ok(); negative results from
 * get_file_caps() and audit_log_bprm_fcaps() are passed through.
 */
int cap_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        /* Process setpcap binaries and capabilities for uid 0 */
        const struct cred *old = current_cred();
        struct cred *new = bprm->cred;
        bool effective = false, has_fcap = false, is_setid;
        int ret;
        kuid_t root_uid;

        if (WARN_ON(!cap_ambient_invariant_ok(old)))
                return -EPERM;

        /* Starts pP' from the file's capabilities; may set effective/has_fcap. */
        ret = get_file_caps(bprm, file, &effective, &has_fcap);
        if (ret < 0)
                return ret;

        /* uid 0 as seen from the user namespace of the new credentials. */
        root_uid = make_kuid(new->user_ns, 0);

        handle_privileged_root(bprm, has_fcap, &effective, root_uid);

        /* if we have fs caps, clear dangerous personality flags */
        if (__cap_gained(permitted, new, old))
                bprm->per_clear |= PER_CLEAR_ON_SETID;

        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit.
         *
         * In addition, if NO_NEW_PRIVS, then ensure we get no new privs.
         */
        is_setid = __is_setuid(new, old) || __is_setgid(new, old);

        if ((is_setid || __cap_gained(permitted, new, old)) &&
            ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) ||
             !ptracer_capable(current, new->user_ns))) {
                /* downgrade; they get no more than they had, and maybe less */
                if (!ns_capable(new->user_ns, CAP_SETUID) ||
                    (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) {
                        new->euid = new->uid;
                        new->egid = new->gid;
                }
                new->cap_permitted = cap_intersect(new->cap_permitted,
                                                   old->cap_permitted);
        }

        /* Saved and filesystem ids follow the (possibly downgraded) effective ids. */
        new->suid = new->fsuid = new->euid;
        new->sgid = new->fsgid = new->egid;

        /* File caps or setid cancels ambient. */
        if (has_fcap || is_setid)
                cap_clear(new->cap_ambient);

        /*
         * Now that we've computed pA', update pP' to give:
         *   pP' = (X & fP) | (pI & fI) | pA'
         */
        new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient);

        /*
         * Set pE' = (fE ? pP' : pA').  Because pA' is zero if fE is set,
         * this is the same as pE' = (fE ? pP' : 0) | pA'.
         */
        if (effective)
                new->cap_effective = new->cap_permitted;
        else
                new->cap_effective = new->cap_ambient;

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Log the exec when nonroot_raised_pE() flags it as worth auditing. */
        if (nonroot_raised_pE(new, old, root_uid, has_fcap)) {
                ret = audit_log_bprm_fcaps(bprm, new, old);
                if (ret < 0)
                        return ret;
        }

        /* The SECURE_KEEP_CAPS bit is always cleared in the new credentials. */
        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);

        if (WARN_ON(!cap_ambient_invariant_ok(new)))
                return -EPERM;

        /* Check for privilege-elevated exec. */
        if (is_setid ||
            (!__is_real(root_uid, new) &&
             (effective ||
              __cap_grew(permitted, ambient, new))))
                bprm->secureexec = 1;

        return 0;
}

/**
 * cap_inode_setxattr - Determine whether an xattr may be altered
 * @dentry: The inode/dentry being altered
 * @name: The name of the xattr to be changed
 * @value: The value that the xattr will be changed to
 * @size: The size of value
 * @flags: The replacement flag
 *
 * Only xattrs in the security namespace are of interest here; all others
 * are allowed.  security.capability is allowed as well, because it is
 * checked in cap_convert_nscap(), called by setxattr().  Every other
 * security xattr requires CAP_SYS_ADMIN over the user namespace of the
 * superblock.
 *
 * Return: 0 if permission is granted, -EPERM if denied.
 */
int cap_inode_setxattr(struct dentry *dentry, const char *name,
                       const void *value, size_t size, int flags)
{
        struct user_namespace *sb_ns = dentry->d_sb->s_user_ns;

        /* Ignore non-security xattrs */
        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
                return 0;

        /* XATTR_NAME_CAPS is vetted later by cap_convert_nscap(). */
        if (!strcmp(name, XATTR_NAME_CAPS))
                return 0;

        return ns_capable(sb_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;
}

/**
 * cap_inode_removexattr - Determine whether an xattr may be removed
 *
 * @idmap:        idmap of the mount the inode was found from
 * @dentry:        The inode/dentry being altered
 * @name:        The name of the xattr to be changed
 *
 * Only xattrs in the security namespace are of interest here; all others
 * may be removed.  Removing security.capability requires CAP_SETFCAP with
 * respect to the inode; removing any other security xattr requires
 * CAP_SYS_ADMIN over the user namespace of the superblock.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if permission is granted, -EPERM if denied, -EINVAL if @dentry
 * has no backing inode when security.capability is being removed.
 */
int cap_inode_removexattr(struct mnt_idmap *idmap,
                          struct dentry *dentry, const char *name)
{
        struct user_namespace *sb_ns = dentry->d_sb->s_user_ns;
        struct inode *inode;

        /* Ignore non-security xattrs */
        if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
                return 0;

        if (strcmp(name, XATTR_NAME_CAPS) != 0)
                return ns_capable(sb_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;

        /* security.capability gets namespaced */
        inode = d_backing_inode(dentry);
        if (!inode)
                return -EINVAL;

        return capable_wrt_inode_uidgid(idmap, inode, CAP_SETFCAP) ? 0 : -EPERM;
}

/*
 * cap_emulate_setxuid() fixes the effective / permitted capabilities of
 * a process after a call to setuid, setreuid, or setresuid.
 *
 *  1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of
 *  {r,e,s}uid != 0, the permitted and effective capabilities are
 *  cleared.
 *
 *  2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective
 *  capabilities of the process are cleared.
 *
 *  3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective
 *  capabilities are set to the permitted capabilities.
 *
 *  fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should
 *  never happen.
 *
 *  -astor
 *
 * cevans - New behaviour, Oct '99
 * A process may, via prctl(), elect to keep its capabilities when it
 * calls setuid() and switches away from uid==0. Both permitted and
 * effective sets will be retained.
 * Without this change, it was impossible for a daemon to drop only some
 * of its privilege. The call to setuid(!=0) would drop all privileges!
 * Keeping uid 0 is not an option because uid 0 owns too many vital
 * files..
 * Thanks to Olaf Kirch and Peter Benie for spotting this.
 */
static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old)
{
        kuid_t root_uid = make_kuid(old->user_ns, 0);
        int had_root_id, has_root_id;
        int was_root_euid, is_root_euid;

        /* Root as seen from the old credentials' user namespace. */
        was_root_euid = uid_eq(old->euid, root_uid);
        is_root_euid = uid_eq(new->euid, root_uid);

        had_root_id = uid_eq(old->uid, root_uid) || was_root_euid ||
                      uid_eq(old->suid, root_uid);
        has_root_id = uid_eq(new->uid, root_uid) || is_root_euid ||
                      uid_eq(new->suid, root_uid);

        /* Rule 1: at least one of {r,e,s}uid was root, none is any more. */
        if (had_root_id && !has_root_id) {
                /* SECURE_KEEP_CAPS preserves permitted and effective. */
                if (!issecure(SECURE_KEEP_CAPS)) {
                        cap_clear(new->cap_permitted);
                        cap_clear(new->cap_effective);
                }

                /*
                 * Pre-ambient programs expect setresuid to nonroot followed
                 * by exec to drop capabilities.  The ambient set is therefore
                 * cleared regardless of SECURE_KEEP_CAPS.
                 */
                cap_clear(new->cap_ambient);
        }

        if (was_root_euid && !is_root_euid) {
                /* Rule 2: euid left root, so drop the effective set. */
                cap_clear(new->cap_effective);
        } else if (!was_root_euid && is_root_euid) {
                /* Rule 3: euid became root, so effective := permitted. */
                new->cap_effective = new->cap_permitted;
        }
}

/**
 * cap_task_fix_setuid - Fix up the results of setuid() call
 * @new: The proposed credentials
 * @old: The current task's current credentials
 * @flags: Indications of what has changed
 *
 * Adjust the capability sets in @new to follow the uid or fsuid transition
 * from @old, before the credential changes are actually applied.  No
 * adjustment is made while SECURE_NO_SETUID_FIXUP is set.
 *
 * Return: 0 to grant the changes, -EINVAL if @flags is not one of the
 * LSM_SETID_* values handled here.
 */
int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags)
{
        kuid_t root_uid;
        int was_root_fsuid, is_root_fsuid;

        if (flags == LSM_SETID_RE || flags == LSM_SETID_ID ||
            flags == LSM_SETID_RES) {
                /* juggle the capabilities to follow [RES]UID changes unless
                 * otherwise suppressed */
                if (!issecure(SECURE_NO_SETUID_FIXUP))
                        cap_emulate_setxuid(new, old);
                return 0;
        }

        if (flags != LSM_SETID_FS)
                return -EINVAL;

        /* juggle the capabilities to follow FSUID changes, unless
         * otherwise suppressed
         *
         * FIXME - is fsuser used for all CAP_FS_MASK capabilities?
         *          if not, we might be a bit too harsh here.
         */
        if (issecure(SECURE_NO_SETUID_FIXUP))
                return 0;

        root_uid = make_kuid(old->user_ns, 0);
        was_root_fsuid = uid_eq(old->fsuid, root_uid);
        is_root_fsuid = uid_eq(new->fsuid, root_uid);

        /* fsuid left root: strip the fs capabilities from effective. */
        if (was_root_fsuid && !is_root_fsuid)
                new->cap_effective = cap_drop_fs_set(new->cap_effective);

        /* fsuid became root: raise the fs capabilities held in permitted. */
        if (!was_root_fsuid && is_root_fsuid)
                new->cap_effective = cap_raise_fs_set(new->cap_effective,
                                                      new->cap_permitted);

        return 0;
}

/*
 * Rationale: code calling task_setscheduler, task_setioprio, and
 * task_setnice, assumes that
 *   . if capable(cap_sys_nice), then those actions should be allowed
 *   . if not capable(cap_sys_nice), but acting on your own processes,
 *           then those actions should be allowed
 * This is insufficient now since you can call code without suid, but
 * yet with increased caps.
 * So we check for increased caps on the target process.
 */
static int cap_safe_nice(struct task_struct *p)
{
        int err = -EPERM;

        /* __task_cred() is only dereferenced inside the RCU read section. */
        rcu_read_lock();
        /*
         * Allowed when the target's permitted set is a subset of ours, or,
         * failing that, when we hold CAP_SYS_NICE in the target's user ns.
         */
        if (cap_issubset(__task_cred(p)->cap_permitted,
                         current_cred()->cap_permitted) ||
            ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
                err = 0;
        rcu_read_unlock();

        return err;
}

/**
 * cap_task_setscheduler - Determine if scheduler policy change is permitted
 * @p: The task to affect
 *
 * Apply the cap_safe_nice() check to decide whether the scheduler policy of
 * @p may be changed by the current task.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setscheduler(struct task_struct *p)
{
        int rc = cap_safe_nice(p);

        return rc;
}

/**
 * cap_task_setioprio - Determine if I/O priority change is permitted
 * @p: The task to affect
 * @ioprio: The I/O priority to set
 *
 * Apply the cap_safe_nice() check to decide whether the I/O priority of @p
 * may be changed by the current task.  @ioprio itself is not examined.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setioprio(struct task_struct *p, int ioprio)
{
        int rc = cap_safe_nice(p);

        return rc;
}

/**
 * cap_task_setnice - Determine if task priority change is permitted
 * @p: The task to affect
 * @nice: The nice value to set
 *
 * Apply the cap_safe_nice() check to decide whether the priority of @p may
 * be changed by the current task.  @nice itself is not examined.
 *
 * Return: 0 if permission is granted, -ve if denied.
 */
int cap_task_setnice(struct task_struct *p, int nice)
{
        int rc = cap_safe_nice(p);

        return rc;
}

/*
 * Implement PR_CAPBSET_DROP.  Remove capability @cap from the bounding set of
 * the current task by committing a modified copy of its credentials.
 *
 * Returns -EPERM without CAP_SETPCAP in the current user namespace, -EINVAL
 * for an invalid @cap, -ENOMEM if the credentials cannot be copied, and
 * otherwise the result of commit_creds().
 */
static int cap_prctl_drop(unsigned long cap)
{
        struct cred *cred;

        /* The privilege check comes first, ahead of argument validation. */
        if (!ns_capable(current_user_ns(), CAP_SETPCAP))
                return -EPERM;

        if (!cap_valid(cap))
                return -EINVAL;

        cred = prepare_creds();
        if (!cred)
                return -ENOMEM;

        cap_lower(cred->cap_bset, cap);

        return commit_creds(cred);
}

/**
 * cap_task_prctl - Implement process control functions for this security module
 * @option: The process control function requested
 * @arg2: The argument data for this function
 * @arg3: The argument data for this function
 * @arg4: The argument data for this function
 * @arg5: The argument data for this function
 *
 * Allow process control functions (sys_prctl()) to alter capabilities; may
 * also deny access to other functions not otherwise implemented here.
 *
 * Return: 0 or +ve on success, -ENOSYS if this function is not implemented
 * here, other -ve on error.  If -ENOSYS is returned, sys_prctl() and other LSM
 * modules will consider performing the function.
 */
int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                   unsigned long arg4, unsigned long arg5)
{
        /* The caller's current credentials; only read through this pointer. */
        const struct cred *old = current_cred();
        /* Copy from prepare_creds(), modified and then handed to commit_creds(). */
        struct cred *new;

        switch (option) {
        case PR_CAPBSET_READ:
                /* Report as 0/1 whether capability arg2 is in the bounding set. */
                if (!cap_valid(arg2))
                        return -EINVAL;
                return !!cap_raised(old->cap_bset, arg2);

        case PR_CAPBSET_DROP:
                /* Privilege and validity checks are done by cap_prctl_drop(). */
                return cap_prctl_drop(arg2);

        /*
         * The next four prctl's remain to assist with transitioning a
         * system from legacy UID=0 based privilege (when filesystem
         * capabilities are not in use) to a system using filesystem
         * capabilities only - as the POSIX.1e draft intended.
         *
         * Note:
         *
         *  PR_SET_SECUREBITS =
         *      issecure_mask(SECURE_KEEP_CAPS_LOCKED)
         *    | issecure_mask(SECURE_NOROOT)
         *    | issecure_mask(SECURE_NOROOT_LOCKED)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP)
         *    | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED)
         *
         * will ensure that the current process and all of its
         * children will be locked into a pure
         * capability-based-privilege environment.
         */
        case PR_SET_SECUREBITS:
                /*
                 * In [1] the currently held lock bits are shifted right by
                 * one so that they can be compared against the bits the
                 * request would change (old ^ arg2).
                 */
                if ((((old->securebits & SECURE_ALL_LOCKS) >> 1)
                     & (old->securebits ^ arg2))                        /*[1]*/
                    || ((old->securebits & SECURE_ALL_LOCKS & ~arg2))        /*[2]*/
                    || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS))        /*[3]*/
                        /*
                         * [1] no changing of bits that are locked
                         * [2] no unlocking of locks
                         * [3] no setting of unsupported bits
                         */
                    )
                        /* one of the rules [1]-[3] above was violated */
                        return -EPERM;

                /*
                 * Doing anything requires privilege (go read about the
                 * "sendmail capabilities bug"), except for unprivileged bits.
                 * Indeed, the SECURE_ALL_UNPRIVILEGED bits are not
                 * restrictions enforced by the kernel but by user space on
                 * itself.
                 */
                /* A non-zero cap_capable() result means CAP_SETPCAP is not held. */
                if (cap_capable(current_cred(), current_cred()->user_ns,
                                CAP_SETPCAP, CAP_OPT_NONE) != 0) {
                        /* The unprivileged bits together with the bits one position above them. */
                        const unsigned long unpriv_and_locks =
                                SECURE_ALL_UNPRIVILEGED |
                                SECURE_ALL_UNPRIVILEGED << 1;
                        /* Bits that differ between the current and requested value. */
                        const unsigned long changed = old->securebits ^ arg2;

                        /*
                         * For legacy reasons, a request that changes nothing
                         * is still denied when CAP_SETPCAP is not held.
                         */
                        if (!changed)
                                return -EPERM;

                        /* Deny any change outside the unprivileged bits and their locks. */
                        if (changed & ~unpriv_and_locks)
                                return -EPERM;
                }

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                new->securebits = arg2;
                return commit_creds(new);

        case PR_GET_SECUREBITS:
                /* The raw securebits value is the return value. */
                return old->securebits;

        case PR_GET_KEEPCAPS:
                /* Report as 0/1 whether SECURE_KEEP_CAPS is set. */
                return !!issecure(SECURE_KEEP_CAPS);

        case PR_SET_KEEPCAPS:
                if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */
                        return -EINVAL;
                /* The keep-caps bit cannot be altered once it is locked. */
                if (issecure(SECURE_KEEP_CAPS_LOCKED))
                        return -EPERM;

                new = prepare_creds();
                if (!new)
                        return -ENOMEM;
                /* arg2 == 1 sets SECURE_KEEP_CAPS, arg2 == 0 clears it. */
                if (arg2)
                        new->securebits |= issecure_mask(SECURE_KEEP_CAPS);
                else
                        new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS);
                return commit_creds(new);

        case PR_CAP_AMBIENT:
                /* arg2 selects the sub-operation; arg3 names the capability. */
                if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) {
                        /* CLEAR_ALL takes no further arguments: all must be zero. */
                        if (arg3 | arg4 | arg5)
                                return -EINVAL;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        cap_clear(new->cap_ambient);
                        return commit_creds(new);
                }

                /*
                 * Bitwise OR of three conditions: arg3 must be a valid
                 * capability and arg4/arg5 must both be zero.
                 */
                if (((!cap_valid(arg3)) | arg4 | arg5))
                        return -EINVAL;

                if (arg2 == PR_CAP_AMBIENT_IS_SET) {
                        return !!cap_raised(current_cred()->cap_ambient, arg3);
                } else if (arg2 != PR_CAP_AMBIENT_RAISE &&
                           arg2 != PR_CAP_AMBIENT_LOWER) {
                        return -EINVAL;
                } else {
                        /*
                         * Raising requires the capability to be in both the
                         * permitted and inheritable sets, and
                         * SECURE_NO_CAP_AMBIENT_RAISE to be clear.  Lowering
                         * has no such precondition.
                         */
                        if (arg2 == PR_CAP_AMBIENT_RAISE &&
                            (!cap_raised(current_cred()->cap_permitted, arg3) ||
                             !cap_raised(current_cred()->cap_inheritable,
                                         arg3) ||
                             issecure(SECURE_NO_CAP_AMBIENT_RAISE)))
                                return -EPERM;

                        new = prepare_creds();
                        if (!new)
                                return -ENOMEM;
                        if (arg2 == PR_CAP_AMBIENT_RAISE)
                                cap_raise(new->cap_ambient, arg3);
                        else
                                cap_lower(new->cap_ambient, arg3);
                        return commit_creds(new);
                }
                /* Every path above returns, so control never reaches default from here. */

        default:
                /* No functionality available - continue with default */
                return -ENOSYS;
        }
}

/**
 * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted
 * @mm: The VM space in which the new mapping is to be made
 * @pages: The size of the mapping
 *
 * The decision is the result of checking the current task's credentials for
 * CAP_SYS_ADMIN in init_user_ns, with CAP_OPT_NOAUDIT.  Neither @mm nor
 * @pages is consulted.
 *
 * Return: 0 if permission granted, negative error code if not.
 */
int cap_vm_enough_memory(struct mm_struct *mm, long pages)
{
        const struct cred *cred = current_cred();

        return cap_capable(cred, &init_user_ns, CAP_SYS_ADMIN,
                           CAP_OPT_NOAUDIT);
}

/**
 * cap_mmap_addr - check if able to map given addr
 * @addr: address attempting to be mapped
 *
 * Mapping memory below dac_mmap_min_addr requires CAP_SYS_RAWIO in
 * init_user_ns; addresses at or above that limit are always accepted.  When a
 * low mapping is allowed, PF_SUPERPRIV is set on the current task.
 *
 * Return: 0 if this mapping should be allowed, otherwise the non-zero result
 * of the capability check.
 */
int cap_mmap_addr(unsigned long addr)
{
        int ret;

        /* Addresses at or above the minimum need no privilege. */
        if (addr >= dac_mmap_min_addr)
                return 0;

        ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO,
                          CAP_OPT_NONE);
        if (ret != 0)
                return ret;

        /* set PF_SUPERPRIV if it turns out we allow the low mmap */
        current->flags |= PF_SUPERPRIV;
        return 0;
}

#ifdef CONFIG_SECURITY

/* LSM identity handed to security_add_hooks() by capability_init(). */
static const struct lsm_id capability_lsmid = {
        .name = "capability",
        .id = LSM_ID_CAPABILITY,
};

/*
 * Hook table registered by capability_init().  Each LSM_HOOK_INIT() entry
 * pairs an LSM hook name with the cap_*() function implementing it.
 */
static struct security_hook_list capability_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(capable, cap_capable),
        LSM_HOOK_INIT(settime, cap_settime),
        LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme),
        LSM_HOOK_INIT(capget, cap_capget),
        LSM_HOOK_INIT(capset, cap_capset),
        LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file),
        LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv),
        LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv),
        LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity),
        LSM_HOOK_INIT(mmap_addr, cap_mmap_addr),
        LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid),
        LSM_HOOK_INIT(task_prctl, cap_task_prctl),
        LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler),
        LSM_HOOK_INIT(task_setioprio, cap_task_setioprio),
        LSM_HOOK_INIT(task_setnice, cap_task_setnice),
        LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory),
};

/* Register every entry of capability_hooks under capability_lsmid. */
static int __init capability_init(void)
{
        const int nr_hooks = ARRAY_SIZE(capability_hooks);

        security_add_hooks(capability_hooks, nr_hooks, &capability_lsmid);

        return 0;
}

/*
 * LSM descriptor for the capability module: capability_init() is its init
 * callback and it requests LSM_ORDER_FIRST ordering.
 */
DEFINE_LSM(capability) = {
        .name = "capability",
        .order = LSM_ORDER_FIRST,
        .init = capability_init,
};

#endif /* CONFIG_SECURITY */



































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fs-verity: read-only file-based authenticity protection
 *
 * This header declares the interface between the fs/verity/ support layer and
 * filesystems that support fs-verity.
 *
 * Copyright 2019 Google LLC
 */

#ifndef _LINUX_FSVERITY_H
#define _LINUX_FSVERITY_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <crypto/hash_info.h>
#include <crypto/sha2.h>
#include <uapi/linux/fsverity.h>

/*
 * Largest digest size among all hash algorithms supported by fs-verity.
 * Currently assumed to be <= size of fsverity_descriptor::root_hash.
 */
#define FS_VERITY_MAX_DIGEST_SIZE        SHA512_DIGEST_SIZE

/* Arbitrary limit to bound the kmalloc() size.  Can be changed. */
#define FS_VERITY_MAX_DESCRIPTOR_SIZE        16384

/*
 * Verity operations for filesystems.
 *
 * Callback table through which the fs/verity/ support layer reaches a
 * filesystem that supports fs-verity.  Each member below documents its own
 * contract.
 */
struct fsverity_operations {

        /**
         * Begin enabling verity on the given file.
         *
         * @filp: a readonly file descriptor for the file
         *
         * The filesystem must do any needed filesystem-specific preparations
         * for enabling verity, e.g. evicting inline data.  It also must return
         * -EBUSY if verity is already being enabled on the given file.
         *
         * i_rwsem is held for write.
         *
         * Return: 0 on success, -errno on failure
         */
        int (*begin_enable_verity)(struct file *filp);

        /**
         * End enabling verity on the given file.
         *
         * @filp: a readonly file descriptor for the file
         * @desc: the verity descriptor to write, or NULL on failure
         * @desc_size: size of verity descriptor, or 0 on failure
         * @merkle_tree_size: total bytes the Merkle tree took up
         *
         * If desc == NULL, then enabling verity failed and the filesystem only
         * must do any necessary cleanups.  Else, it must also store the given
         * verity descriptor to a fs-specific location associated with the inode
         * and do any fs-specific actions needed to mark the inode as a verity
         * inode, e.g. setting a bit in the on-disk inode.  The filesystem is
         * also responsible for setting the S_VERITY flag in the VFS inode.
         *
         * i_rwsem is held for write, but it may have been dropped between
         * ->begin_enable_verity() and ->end_enable_verity().
         *
         * Return: 0 on success, -errno on failure
         */
        int (*end_enable_verity)(struct file *filp, const void *desc,
                                 size_t desc_size, u64 merkle_tree_size);

        /**
         * Get the verity descriptor of the given inode.
         *
         * @inode: an inode with the S_VERITY flag set
         * @buf: buffer in which to place the verity descriptor
         * @bufsize: size of @buf, or 0 to retrieve the size only
         *
         * If bufsize == 0, then the size of the verity descriptor is returned.
         * Otherwise the verity descriptor is written to 'buf' and its actual
         * size is returned; -ERANGE is returned if it's too large.  This may be
         * called by multiple processes concurrently on the same inode.
         *
         * Return: the size on success, -errno on failure
         */
        int (*get_verity_descriptor)(struct inode *inode, void *buf,
                                     size_t bufsize);

        /**
         * Read a Merkle tree page of the given inode.
         *
         * @inode: the inode
         * @index: 0-based index of the page within the Merkle tree
         * @num_ra_pages: The number of Merkle tree pages that should be
         *                  prefetched starting at @index if the page at @index
         *                  isn't already cached.  Implementations may ignore this
         *                  argument; it's only a performance optimization.
         *
         * This can be called at any time on an open verity file.  It may be
         * called by multiple processes concurrently, even with the same page.
         *
         * Note that this must retrieve a *page*, not necessarily a *block*.
         *
         * Return: the page on success, ERR_PTR() on failure
         */
        struct page *(*read_merkle_tree_page)(struct inode *inode,
                                              pgoff_t index,
                                              unsigned long num_ra_pages);

        /**
         * Write a Merkle tree block to the given inode.
         *
         * @inode: the inode for which the Merkle tree is being built
         * @buf: the Merkle tree block to write
         * @pos: the position of the block in the Merkle tree (in bytes)
         * @size: the Merkle tree block size (in bytes)
         *
         * This is only called between ->begin_enable_verity() and
         * ->end_enable_verity().
         *
         * Return: 0 on success, -errno on failure
         */
        int (*write_merkle_tree_block)(struct inode *inode, const void *buf,
                                       u64 pos, unsigned int size);
};

#ifdef CONFIG_FS_VERITY

/* Return the inode's ->i_verity_info pointer, read with ACQUIRE ordering. */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
        struct fsverity_info *vi;

        /*
         * Another task may publish ->i_verity_info concurrently through the
         * cmpxchg_release() in fsverity_set_info(), which is a RELEASE
         * barrier.  The matching ACQUIRE load here makes the memory that
         * task published safe to use.
         */
        vi = smp_load_acquire(&inode->i_verity_info);

        return vi;
}

/* enable.c */

int fsverity_ioctl_enable(struct file *filp, const void __user *arg);

/* measure.c */

int fsverity_ioctl_measure(struct file *filp, void __user *arg);
/* @raw_digest is declared with room for FS_VERITY_MAX_DIGEST_SIZE bytes. */
int fsverity_get_digest(struct inode *inode,
                        u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE],
                        u8 *alg, enum hash_algo *halg);

/* open.c */

/*
 * Out-of-line halves of the inline wrappers in this header:
 * fsverity_file_open() and fsverity_prepare_setattr() call the first two only
 * for IS_VERITY() inodes, and fsverity_cleanup_inode() calls the third only
 * when ->i_verity_info is set.
 */
int __fsverity_file_open(struct inode *inode, struct file *filp);
int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr);
void __fsverity_cleanup_inode(struct inode *inode);

/**
 * fsverity_cleanup_inode() - free the inode's verity info, if present
 * @inode: an inode being evicted
 *
 * Filesystems must call this on inode eviction to free ->i_verity_info.
 * Nothing is done for an inode whose ->i_verity_info is NULL.
 */
static inline void fsverity_cleanup_inode(struct inode *inode)
{
        /* Fast path: no verity info was ever attached to this inode. */
        if (!inode->i_verity_info)
                return;

        __fsverity_cleanup_inode(inode);
}

/* read_metadata.c */

int fsverity_ioctl_read_metadata(struct file *filp, const void __user *uarg);

/* verify.c */

/*
 * fsverity_verify_blocks() is also the backend of the fsverity_verify_folio()
 * and fsverity_verify_page() inline wrappers defined later in this header.
 */
bool fsverity_verify_blocks(struct folio *folio, size_t len, size_t offset);
void fsverity_verify_bio(struct bio *bio);
void fsverity_enqueue_verify_work(struct work_struct *work);

#else /* !CONFIG_FS_VERITY */

/*
 * !CONFIG_FS_VERITY stub: always returns NULL, so fsverity_active() is
 * always false in this configuration.
 */
static inline struct fsverity_info *fsverity_get_info(const struct inode *inode)
{
        return NULL;
}

/* enable.c */

/* !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP. */
static inline int fsverity_ioctl_enable(struct file *filp,
                                        const void __user *arg)
{
        return -EOPNOTSUPP;
}

/* measure.c */

/* !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP. */
static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_VERITY stub: returns 0 without writing to @raw_digest, @alg or
 * @halg.
 */
static inline int fsverity_get_digest(struct inode *inode,
                                      u8 raw_digest[FS_VERITY_MAX_DIGEST_SIZE],
                                      u8 *alg, enum hash_algo *halg)
{
        /*
         * fsverity is not enabled in the kernel configuration, so always report
         * that the file doesn't have fsverity enabled (digest size 0).
         */
        return 0;
}

/* open.c */

/*
 * !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP.  fsverity_file_open()
 * only calls this when IS_VERITY(inode) is true.
 */
static inline int __fsverity_file_open(struct inode *inode, struct file *filp)
{
        return -EOPNOTSUPP;
}

/*
 * !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP.
 * fsverity_prepare_setattr() only calls this for IS_VERITY() inodes.
 */
static inline int __fsverity_prepare_setattr(struct dentry *dentry,
                                             struct iattr *attr)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_FS_VERITY stub: deliberately empty. */
static inline void fsverity_cleanup_inode(struct inode *inode)
{
}

/* read_metadata.c */

/* !CONFIG_FS_VERITY stub: always fails with -EOPNOTSUPP. */
static inline int fsverity_ioctl_read_metadata(struct file *filp,
                                               const void __user *uarg)
{
        return -EOPNOTSUPP;
}

/* verify.c */

/*
 * !CONFIG_FS_VERITY stub: triggers WARN_ON_ONCE() and reports verification
 * failure.  NOTE(review): presumably unreachable because callers check
 * fsverity_active() first - confirm against callers.
 */
static inline bool fsverity_verify_blocks(struct folio *folio, size_t len,
                                          size_t offset)
{
        WARN_ON_ONCE(1);
        return false;
}

/* !CONFIG_FS_VERITY stub: triggers WARN_ON_ONCE() and does nothing else. */
static inline void fsverity_verify_bio(struct bio *bio)
{
        WARN_ON_ONCE(1);
}

/*
 * !CONFIG_FS_VERITY stub: triggers WARN_ON_ONCE(); @work is not queued
 * anywhere.
 */
static inline void fsverity_enqueue_verify_work(struct work_struct *work)
{
        WARN_ON_ONCE(1);
}

#endif        /* !CONFIG_FS_VERITY */

/* Verify an entire folio: all folio_size() bytes, starting at offset 0. */
static inline bool fsverity_verify_folio(struct folio *folio)
{
        const size_t len = folio_size(folio);

        return fsverity_verify_blocks(folio, len, 0);
}

/* Verify PAGE_SIZE bytes at offset 0 of the folio that contains @page. */
static inline bool fsverity_verify_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        return fsverity_verify_blocks(folio, PAGE_SIZE, 0);
}

/**
 * fsverity_active() - do reads from the inode need to go through fs-verity?
 * @inode: inode to check
 *
 * This checks whether ->i_verity_info has been set, using
 * fsverity_get_info().
 *
 * Filesystems call this from ->readahead() to check whether the pages need to
 * be verified or not.  Don't use IS_VERITY() for this purpose; it's subject to
 * a race condition where the file is being read concurrently with
 * FS_IOC_ENABLE_VERITY completing.  (S_VERITY is set before ->i_verity_info.)
 *
 * Return: true if reads need to go through fs-verity, otherwise false
 */
static inline bool fsverity_active(const struct inode *inode)
{
        const struct fsverity_info *vi = fsverity_get_info(inode);

        return vi != NULL;
}

/**
 * fsverity_file_open() - prepare to open a verity file
 * @inode: the inode being opened
 * @filp: the struct file being set up
 *
 * When opening a verity file, deny the open if it is for writing.  Otherwise,
 * set up the inode's ->i_verity_info if not already done.  That work is done
 * by __fsverity_file_open(); inodes that are not IS_VERITY() succeed
 * immediately.
 *
 * When combined with fscrypt, this must be called after fscrypt_file_open().
 * Otherwise, we won't have the key set up to decrypt the verity metadata.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fsverity_file_open(struct inode *inode, struct file *filp)
{
        /* Nothing to do for a non-verity inode. */
        if (!IS_VERITY(inode))
                return 0;

        return __fsverity_file_open(inode, filp);
}

/**
 * fsverity_prepare_setattr() - prepare to change a verity inode's attributes
 * @dentry: dentry through which the inode is being changed
 * @attr: attributes to change
 *
 * Verity files are immutable, so deny truncates.  This isn't covered by the
 * open-time check because sys_truncate() takes a path, not a file descriptor.
 * The check itself is done by __fsverity_prepare_setattr(); inodes that are
 * not IS_VERITY() succeed immediately.
 *
 * Return: 0 on success, -errno on failure
 */
static inline int fsverity_prepare_setattr(struct dentry *dentry,
                                           struct iattr *attr)
{
        /* Nothing to check for a non-verity inode. */
        if (!IS_VERITY(d_inode(dentry)))
                return 0;

        return __fsverity_prepare_setattr(dentry, attr);
}

#endif        /* _LINUX_FSVERITY_H */























































































































































    3 







    3 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/etherdevice.h>
#include <linux/if_macvlan.h>
#include <linux/if_tap.h>
#include <linux/if_vlan.h>
#include <linux/interrupt.h>
#include <linux/nsproxy.h>
#include <linux/compat.h>
#include <linux/if_tun.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/sched/signal.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/cdev.h>
#include <linux/idr.h>
#include <linux/fs.h>
#include <linux/uio.h>

#include <net/net_namespace.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <linux/virtio_net.h>
#include <linux/skb_array.h>

/*
 * Private data of a macvtap net_device, as obtained with netdev_priv() in
 * macvtap_newlink().  The tap callbacks get back to it from their
 * struct tap_dev pointer with container_of().
 * NOTE(review): @vlan is presumably first so that macvlan code can treat
 * netdev_priv() as a struct macvlan_dev - confirm.
 */
struct macvtap_dev {
        struct macvlan_dev vlan;
        struct tap_dev    tap;
};

/*
 * Variables for dealing with macvtap device numbers.
 */
static dev_t macvtap_major;

/*
 * ->namespace callback of macvtap_class: return the network namespace of the
 * net_device that is the parent of class device @d.
 */
static const void *macvtap_net_namespace(const struct device *d)
{
        return dev_net(to_net_dev(d->parent));
}

/*
 * Device class "macvtap".  Its devices are namespaced by network namespace
 * (net_ns_type_operations), resolved through macvtap_net_namespace().
 */
static struct class macvtap_class = {
        .name = "macvtap",
        .ns_type = &net_ns_type_operations,
        .namespace = macvtap_net_namespace,
};
/* NOTE(review): character device object; its setup is outside this chunk. */
static struct cdev macvtap_cdev;

/* Offload feature bits that macvtap_newlink() installs as tap_features. */
#define TUN_OFFLOADS (NETIF_F_HW_CSUM | NETIF_F_TSO_ECN | NETIF_F_TSO | \
                      NETIF_F_TSO6)

/*
 * tap callback: bump the per-CPU tx_dropped counter of the macvlan device
 * that embeds @tap.
 */
static void macvtap_count_tx_dropped(struct tap_dev *tap)
{
        struct macvlan_dev *vlan;

        vlan = &container_of(tap, struct macvtap_dev, tap)->vlan;
        this_cpu_inc(vlan->pcpu_stats->tx_dropped);
}

/*
 * tap callback: report a dropped receive to the macvlan accounting helper
 * for the device that embeds @tap.
 *
 * NOTE(review): the three zero arguments are believed to mean "length 0,
 * not successful, not multicast" -- confirm against macvlan_count_rx().
 */
static void macvtap_count_rx_dropped(struct tap_dev *tap)
{
        struct macvtap_dev *vlantap;

        vlantap = container_of(tap, struct macvtap_dev, tap);
        macvlan_count_rx(&vlantap->vlan, 0, 0, 0);
}

/*
 * tap callback: store @features as the macvlan's set_features and ask the
 * networking core to re-evaluate the features of the macvlan net_device.
 */
static void macvtap_update_features(struct tap_dev *tap,
                                    netdev_features_t features)
{
        struct macvlan_dev *vlan;

        vlan = &container_of(tap, struct macvtap_dev, tap)->vlan;
        vlan->set_features = features;
        netdev_update_features(vlan->dev);
}

static int macvtap_newlink(struct net_device *dev,
                           struct rtnl_newlink_params *params,
                           struct netlink_ext_ack *extack)
{
        struct macvtap_dev *vlantap = netdev_priv(dev);
        int err;

        INIT_LIST_HEAD(&vlantap->tap.queue_list);

        /* Since macvlan supports all offloads by default, make
         * tap support all offloads also.
         */
        vlantap->tap.tap_features = TUN_OFFLOADS;

        /* Register callbacks for rx/tx drops accounting and updating
         * net_device features
         */
        vlantap->tap.count_tx_dropped = macvtap_count_tx_dropped;
        vlantap->tap.count_rx_dropped = macvtap_count_rx_dropped;
        vlantap->tap.update_features  = macvtap_update_features;

        err = netdev_rx_handler_register(dev, tap_handle_frame, &vlantap->tap);
        if (err)
                return err;

        /* Don't put anything that may fail after macvlan_common_newlink
         * because we can't undo what it does.
         */
        err = macvlan_common_newlink(dev, params, extack);
        if (err) {
                netdev_rx_handler_unregister(dev);
                return err;
        }

        vlantap->tap.dev = vlantap->vlan.dev;

        return 0;
}

/*
 * rtnl_link_ops.dellink: remove the rx handler, delete the tap queues and
 * then hand the device to macvlan_dellink().
 */
static void macvtap_dellink(struct net_device *dev,
                            struct list_head *head)
{
        struct macvtap_dev *priv = netdev_priv(dev);
        struct tap_dev *tap = &priv->tap;

        netdev_rx_handler_unregister(dev);
        tap_del_queues(tap);
        macvlan_dellink(dev, head);
}

/*
 * rtnl_link_ops.setup: apply the common macvlan setup, then override the
 * transmit queue length with TUN_READQ_SIZE.
 */
static void macvtap_setup(struct net_device *dev)
{
        macvlan_common_setup(dev);
        dev->tx_queue_len = TUN_READQ_SIZE;
}

/*
 * rtnl_link_ops.get_link_net: the link namespace is the network namespace
 * of the device returned by macvlan_dev_real_dev().
 */
static struct net *macvtap_link_net(const struct net_device *dev)
{
        const struct net_device *real_dev = macvlan_dev_real_dev(dev);

        return dev_net(real_dev);
}

/*
 * rtnetlink operations for the "macvtap" link kind.  Registered through
 * macvlan_link_register() in macvtap_init(); macvtap_device_event() also
 * compares dev->rtnl_link_ops against this object to recognise macvtap
 * devices.
 */
static struct rtnl_link_ops macvtap_link_ops __read_mostly = {
        .kind                = "macvtap",
        .setup                = macvtap_setup,
        .newlink        = macvtap_newlink,
        .dellink        = macvtap_dellink,
        .get_link_net        = macvtap_link_net,
        .priv_size      = sizeof(struct macvtap_dev),
};

static int macvtap_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct macvtap_dev *vlantap;
        struct device *classdev;
        dev_t devt;
        int err;
        char tap_name[IFNAMSIZ];

        if (dev->rtnl_link_ops != &macvtap_link_ops)
                return NOTIFY_DONE;

        snprintf(tap_name, IFNAMSIZ, "tap%d", dev->ifindex);
        vlantap = netdev_priv(dev);

        switch (event) {
        case NETDEV_REGISTER:
                /* Create the device node here after the network device has
                 * been registered but before register_netdevice has
                 * finished running.
                 */
                err = tap_get_minor(macvtap_major, &vlantap->tap);
                if (err)
                        return notifier_from_errno(err);

                devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
                classdev = device_create(&macvtap_class, &dev->dev, devt,
                                         dev, "%s", tap_name);
                if (IS_ERR(classdev)) {
                        tap_free_minor(macvtap_major, &vlantap->tap);
                        return notifier_from_errno(PTR_ERR(classdev));
                }
                err = sysfs_create_link(&dev->dev.kobj, &classdev->kobj,
                                        tap_name);
                if (err)
                        return notifier_from_errno(err);
                break;
        case NETDEV_UNREGISTER:
                /* vlan->minor == 0 if NETDEV_REGISTER above failed */
                if (vlantap->tap.minor == 0)
                        break;
                sysfs_remove_link(&dev->dev.kobj, tap_name);
                devt = MKDEV(MAJOR(macvtap_major), vlantap->tap.minor);
                device_destroy(&macvtap_class, devt);
                tap_free_minor(macvtap_major, &vlantap->tap);
                break;
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                if (tap_queue_resize(&vlantap->tap))
                        return NOTIFY_BAD;
                break;
        }

        return NOTIFY_DONE;
}

/* Netdevice notifier, registered in macvtap_init() and removed in macvtap_exit(). */
static struct notifier_block macvtap_notifier_block __read_mostly = {
        .notifier_call        = macvtap_device_event,
};

/*
 * Module init: create the character device, register the device class,
 * the netdevice notifier and finally the rtnetlink link ops.  A failing
 * step unwinds the steps before it in reverse order and its error is
 * returned.
 */
static int __init macvtap_init(void)
{
        int err;

        err = tap_create_cdev(&macvtap_cdev, &macvtap_major, "macvtap",
                              THIS_MODULE);
        if (err)
                return err;

        err = class_register(&macvtap_class);
        if (err)
                goto err_destroy_cdev;

        err = register_netdevice_notifier(&macvtap_notifier_block);
        if (err)
                goto err_unregister_class;

        err = macvlan_link_register(&macvtap_link_ops);
        if (err)
                goto err_unregister_notifier;

        return 0;

err_unregister_notifier:
        unregister_netdevice_notifier(&macvtap_notifier_block);
err_unregister_class:
        class_unregister(&macvtap_class);
err_destroy_cdev:
        tap_destroy_cdev(macvtap_major, &macvtap_cdev);
        return err;
}
module_init(macvtap_init);

/*
 * Module exit: release everything macvtap_init() set up, in the reverse
 * order of its registration (link ops, notifier, class, character device).
 */
static void __exit macvtap_exit(void)
{
        rtnl_link_unregister(&macvtap_link_ops);
        unregister_netdevice_notifier(&macvtap_notifier_block);
        class_unregister(&macvtap_class);
        tap_destroy_cdev(macvtap_major, &macvtap_cdev);
}
module_exit(macvtap_exit);

MODULE_ALIAS_RTNL_LINK("macvtap");
MODULE_DESCRIPTION("MAC-VLAN based tap driver");
MODULE_AUTHOR("Arnd Bergmann <arnd@arndb.de>");
MODULE_LICENSE("GPL");



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 */
#ifndef _LINUX_HUGETLB_VMEMMAP_H
#define _LINUX_HUGETLB_VMEMMAP_H
#include <linux/hugetlb.h>
#include <linux/io.h>
#include <linux/memblock.h>

/*
 * Reserve one vmemmap page, all vmemmap addresses are mapped to it. See
 * Documentation/mm/vmemmap_dedup.rst.
 *
 * HUGETLB_VMEMMAP_RESERVE_PAGES is the number of struct page entries that
 * fit into the reserved HUGETLB_VMEMMAP_RESERVE_SIZE bytes.
 */
#define HUGETLB_VMEMMAP_RESERVE_SIZE        PAGE_SIZE
#define HUGETLB_VMEMMAP_RESERVE_PAGES        (HUGETLB_VMEMMAP_RESERVE_SIZE / sizeof(struct page))

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                                        struct list_head *folio_list,
                                        struct list_head *non_hvo_folios);
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
void hugetlb_vmemmap_init_early(int nid);
void hugetlb_vmemmap_init_late(int nid);
#endif


/*
 * Size in bytes of the struct page array describing one huge page of
 * hstate @h: one struct page per base page.
 */
static inline unsigned int hugetlb_vmemmap_size(const struct hstate *h)
{
        const unsigned int nr_pages = pages_per_huge_page(h);

        return nr_pages * sizeof(struct page);
}

/*
 * Number of vmemmap bytes of a HugeTLB page of hstate @h that can be
 * optimized away and freed to the buddy allocator: everything beyond the
 * reserved HUGETLB_VMEMMAP_RESERVE_SIZE.  Returns 0 when nothing is left
 * or when sizeof(struct page) is not a power of two.
 */
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
        int size = hugetlb_vmemmap_size(h) - HUGETLB_VMEMMAP_RESERVE_SIZE;

        if (!is_power_of_2(sizeof(struct page)))
                return 0;
        if (size <= 0)
                return 0;

        return size;
}
#else
/* !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: does nothing and always returns 0. */
static inline int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: every folio on @folio_list
 * is moved onto @non_hvo_folios (leaving @folio_list empty) and 0 is
 * returned.
 */
static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                                        struct list_head *folio_list,
                                        struct list_head *non_hvo_folios)
{
        list_splice_init(folio_list, non_hvo_folios);
        return 0;
}

/* !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: intentionally empty. */
static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
}

/* !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: intentionally empty, @folio_list is left untouched. */
static inline void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
}

/* !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: intentionally empty, @folio_list is left untouched. */
static inline void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h,
                                                struct list_head *folio_list)
{
}

/* !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: intentionally empty. */
static inline void hugetlb_vmemmap_init_early(int nid)
{
}

/* !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: intentionally empty. */
static inline void hugetlb_vmemmap_init_late(int nid)
{
}

/*
 * !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP stub: no vmemmap can be optimized,
 * so the optimizable size is always 0.
 */
static inline unsigned int hugetlb_vmemmap_optimizable_size(const struct hstate *h)
{
        return 0;
}
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

/* True when hstate @h has a non-zero amount of optimizable vmemmap. */
static inline bool hugetlb_vmemmap_optimizable(const struct hstate *h)
{
        const unsigned int optimizable = hugetlb_vmemmap_optimizable_size(h);

        return optimizable > 0;
}
#endif /* _LINUX_HUGETLB_VMEMMAP_H */














   40 


















   40 



   94 








   47 









    3 






   41 



   41 









   51 





























   51 








   51 




   50 





   42 





   34 
   42 



   42 
   42 






   51 





  104 
















    1 


    1 




    1 




















   99 



   94 

















   52 


   40 






   82 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <trace/events/kvm.h>

#include "trace.h"

/*
 * Store the low @len bytes' worth of @data into @buf as an integer of
 * that width (u8/u16/u32/u64) in the CPU's native byte order.
 *
 * @len is expected to be 1, 2, 4 or 8; for any other value the source
 * pointer handed to memcpy() stays NULL.
 */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data)
{
        const void *src = NULL;
        u8 byte;
        u16 hword;
        u32 word;
        u64 dword;

        switch (len) {
        case 1:
                byte = data;
                src = &byte;
                break;
        case 2:
                hword = data;
                src = &hword;
                break;
        case 4:
                word = data;
                src = &word;
                break;
        case 8:
                dword = data;
                src = &dword;
                break;
        }

        memcpy(buf, src, len);
}

unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len)
{
        unsigned long data = 0;
        union {
                u16        hword;
                u32        word;
                u64        dword;
        } tmp;

        switch (len) {
        case 1:
                data = *(u8 *)buf;
                break;
        case 2:
                memcpy(&tmp.hword, buf, len);
                data = tmp.hword;
                break;
        case 4:
                memcpy(&tmp.word, buf, len);
                data = tmp.word;
                break;
        case 8:
                memcpy(&tmp.dword, buf, len);
                data = tmp.dword;
                break;
        }

        return data;
}

/*
 * Tell whether @vcpu has a pending exception of one of the synchronous
 * kinds: UND/IABT/DABT when EL1 is 32-bit, EL1/EL2 SYNC otherwise.
 */
static bool kvm_pending_sync_exception(struct kvm_vcpu *vcpu)
{
        if (!vcpu_get_flag(vcpu, PENDING_EXCEPTION))
                return false;

        if (!vcpu_el1_is_32bit(vcpu)) {
                switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
                case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
                case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
                        return true;
                default:
                        return false;
                }
        }

        switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
        case unpack_vcpu_flag(EXCEPT_AA32_UND):
        case unpack_vcpu_flag(EXCEPT_AA32_IABT):
        case unpack_vcpu_flag(EXCEPT_AA32_DABT):
                return true;
        default:
                return false;
        }
}

/**
 * kvm_handle_mmio_return -- Complete an MMIO access once it has been
 *                             emulated by user space or by the kernel
 *
 * @vcpu: The VCPU pointer
 *
 * For a load, the value found in run->mmio.data is sign-extended and/or
 * truncated as the data abort syndrome requests and written to the
 * destination register.  For loads and stores alike the PC is then moved
 * past the emulated instruction.
 *
 * Return: always 1.
 */
int kvm_handle_mmio_return(struct kvm_vcpu *vcpu)
{
        struct kvm_run *run;
        unsigned long data;
        unsigned int len;
        int mask;

        /*
         * Nothing to do if the MMIO return was already handled, or if
         * userspace aborted the MMIO access.
         */
        if (unlikely(!vcpu->mmio_needed || kvm_pending_sync_exception(vcpu)))
                return 1;

        vcpu->mmio_needed = 0;

        /* A store has no data to hand back to the guest. */
        if (kvm_vcpu_dabt_iswrite(vcpu))
                goto skip_insn;

        run = vcpu->run;
        len = kvm_vcpu_dabt_get_as(vcpu);
        data = kvm_mmio_read_buf(run->mmio.data, len);

        /* Sign-extend from the top bit of an access narrower than a long. */
        if (kvm_vcpu_dabt_issext(vcpu) && len < sizeof(unsigned long)) {
                mask = 1U << ((len * 8) - 1);
                data = (data ^ mask) - mask;
        }

        /* Without the SF bit only the low 32 bits are kept. */
        if (!kvm_vcpu_dabt_issf(vcpu))
                data &= 0xffffffff;

        trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
                       &data);
        data = vcpu_data_host_to_guest(vcpu, data, len);
        vcpu_set_reg(vcpu, kvm_vcpu_dabt_get_rd(vcpu), data);

skip_insn:
        /*
         * The MMIO instruction has been emulated and must not be executed
         * again by the guest.
         */
        kvm_incr_pc(vcpu);

        return 1;
}

/*
 * io_mem_abort -- Emulate the MMIO access behind a guest data abort
 *
 * @vcpu:      The VCPU pointer
 * @fault_ipa: Address of the faulting access, used as the MMIO bus address
 *
 * Returns 1 when the access was handled in the kernel (or an abort was
 * injected for a protected VCPU), 0 when kvm_run has been filled in for
 * an exit (KVM_EXIT_MMIO or KVM_EXIT_ARM_NISV), and -ENOSYS when the
 * syndrome is not valid and the NISV-to-user flag is not set.
 */
int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
        struct kvm_run *run = vcpu->run;
        unsigned long data;
        unsigned long rt;
        int ret;
        bool is_write;
        int len;
        u8 data_buf[8];

        /*
         * No valid syndrome? Ask userspace for help if it has
         * volunteered to do so, and bail out otherwise.
         *
         * In the protected VM case, there isn't much userspace can do
         * though, so directly deliver an exception to the guest.
         */
        if (!kvm_vcpu_dabt_isvalid(vcpu)) {
                trace_kvm_mmio_nisv(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
                                    kvm_vcpu_get_hfar(vcpu), fault_ipa);

                if (vcpu_is_protected(vcpu)) {
                        kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                        return 1;
                }

                if (test_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
                             &vcpu->kvm->arch.flags)) {
                        run->exit_reason = KVM_EXIT_ARM_NISV;
                        run->arm_nisv.esr_iss = kvm_vcpu_dabt_iss_nisv_sanitized(vcpu);
                        run->arm_nisv.fault_ipa = fault_ipa;
                        return 0;
                }

                return -ENOSYS;
        }

        /*
         * Prepare MMIO operation. First decode the syndrome data we get
         * from the CPU. Then try if some in-kernel emulation feels
         * responsible, otherwise let user space do its magic.
         */
        is_write = kvm_vcpu_dabt_iswrite(vcpu);
        len = kvm_vcpu_dabt_get_as(vcpu);
        rt = kvm_vcpu_dabt_get_rd(vcpu);

        if (is_write) {
                data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt),
                                               len);

                trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data);
                kvm_mmio_write_buf(data_buf, len, data);

                ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len,
                                       data_buf);
        } else {
                trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len,
                               fault_ipa, NULL);

                ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len,
                                      data_buf);
        }

        /* Now prepare kvm_run for the potential return to userland. */
        run->mmio.is_write        = is_write;
        run->mmio.phys_addr        = fault_ipa;
        run->mmio.len                = len;
        vcpu->mmio_needed        = 1;

        if (!ret) {
                /* We handled the access successfully in the kernel. */
                /* A read result goes through run->mmio.data, where kvm_handle_mmio_return() picks it up. */
                if (!is_write)
                        memcpy(run->mmio.data, data_buf, len);
                vcpu->stat.mmio_exit_kernel++;
                kvm_handle_mmio_return(vcpu);
                return 1;
        }

        /* Not handled in the kernel: publish the write payload in kvm_run and exit. */
        if (is_write)
                memcpy(run->mmio.data, data_buf, len);
        vcpu->stat.mmio_exit_user++;
        run->exit_reason        = KVM_EXIT_MMIO;
        return 0;
}































































































































































































































  135 
































  135 























   50 










   12 















  221 















  221 







































  134 







  135 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_MMU_H__
#define __ARM64_KVM_MMU_H__

#include <asm/page.h>
#include <asm/memory.h>
#include <asm/mmu.h>
#include <asm/cpufeature.h>

/*
 * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express
 * "negative" addresses. This makes it impossible to directly share
 * mappings with the kernel.
 *
 * Instead, give the HYP mode its own VA region at a fixed offset from
 * the kernel by just masking the top bits (which are all ones for a
 * kernel address). We need to find out how many bits to mask.
 *
 * We want to build a set of page tables that cover both parts of the
 * idmap (the trampoline page used to initialize EL2), and our normal
 * runtime VA space, at the same time.
 *
 * Given that the kernel uses VA_BITS for its entire address space,
 * and that half of that space (VA_BITS - 1) is used for the linear
 * mapping, we can also limit the EL2 space to (VA_BITS - 1).
 *
 * The main question is "Within the VA_BITS space, does EL2 use the
 * top or the bottom half of that space to shadow the kernel's linear
 * mapping?". As we need to idmap the trampoline page, this is
 * determined by the range in which this page lives.
 *
 * If the page is in the bottom half, we have to use the top half. If
 * the page is in the top half, we have to use the bottom half:
 *
 * T = __pa_symbol(__hyp_idmap_text_start)
 * if (T & BIT(VA_BITS - 1))
 *        HYP_VA_MIN = 0  //idmap in upper half
 * else
 *        HYP_VA_MIN = 1 << (VA_BITS - 1)
 * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
 *
 * When using VHE, there are no separate hyp mappings and all KVM
 * functionality is already mapped as part of the main kernel
 * mappings, and none of this applies in that case.
 */

#ifdef __ASSEMBLY__

#include <asm/alternative.h>

/*
 * Convert a hypervisor VA to a PA by adding hyp_physvirt_offset
 * reg: hypervisor address to be converted in place
 * tmp: temporary register, overwritten with hyp_physvirt_offset
 */
.macro hyp_pa reg, tmp
        ldr_l        \tmp, hyp_physvirt_offset
        add        \reg, \reg, \tmp
.endm

/*
 * Convert a hypervisor VA to a kernel image address
 * reg: hypervisor address to be converted in place
 * tmp: temporary register, overwritten by the macro
 *
 * The conversion is done in two steps: hyp VA -> PA (hyp_pa), then
 * PA -> kernel image VA by adding the offset materialised in tmp.
 *
 * The actual code generation takes place in kvm_get_kimage_voffset, and
 * the instructions below are only there to reserve the space and
 * perform the register allocation (kvm_get_kimage_voffset uses the
 * specific registers encoded in the instructions).
 */
.macro hyp_kimg_va reg, tmp
        /* Convert hyp VA -> PA. */
        hyp_pa        \reg, \tmp

        /* Load kimage_voffset. */
alternative_cb ARM64_ALWAYS_SYSTEM, kvm_get_kimage_voffset
        movz        \tmp, #0
        movk        \tmp, #0, lsl #16
        movk        \tmp, #0, lsl #32
        movk        \tmp, #0, lsl #48
alternative_cb_end

        /* Convert PA -> kimg VA. */
        add        \reg, \reg, \tmp
.endm

#else

#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/cache.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>
#include <asm/kvm_nested.h>

void kvm_update_va_mask(struct alt_instr *alt,
                        __le32 *origptr, __le32 *updptr, int nr_inst);
void kvm_compute_layout(void);
void kvm_apply_hyp_relocations(void);

/* Convert a hypervisor VA to a PA: cast to phys_addr_t and add hyp_physvirt_offset. */
#define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset)

/*
 * Convert a kernel VA into a HYP VA.
 *
 * Can be called from hyp or non-hyp context.
 *
 * The actual code generation takes place in kvm_update_va_mask(), and
 * the instructions below are only there to reserve the space and
 * perform the register allocation (kvm_update_va_mask() uses the
 * specific registers encoded in the instructions).
 *
 * @v is transformed in place by the patched sequence and returned; when
 * built with __KVM_VHE_HYPERVISOR__ defined the sequence is compiled out
 * and @v is returned unchanged.
 */
static __always_inline unsigned long __kern_hyp_va(unsigned long v)
{
/*
 * This #ifndef is an optimisation for when this is called from VHE hyp
 * context.  When called from a VHE non-hyp context, kvm_update_va_mask() will
 * replace the instructions with `nop`s.
 */
#ifndef __KVM_VHE_HYPERVISOR__
        asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n"         /* mask with va_mask */
                                    "ror %0, %0, #1\n"         /* rotate to the first tag bit */
                                    "add %0, %0, #0\n"         /* insert the low 12 bits of the tag */
                                    "add %0, %0, #0, lsl 12\n" /* insert the top 12 bits of the tag */
                                    "ror %0, %0, #63\n",       /* rotate back */
                                    ARM64_ALWAYS_SYSTEM,
                                    kvm_update_va_mask)
                     : "+r" (v));
#endif
        return v;
}

/* Type-preserving wrapper around __kern_hyp_va(): the result is cast back to typeof(v). */
#define kern_hyp_va(v)         ((typeof(v))(__kern_hyp_va((unsigned long)(v))))

extern u32 __hyp_va_bits;

/*
 * We currently support using a VM-specified IPA size. For backward
 * compatibility, the default IPA size is fixed to 40bits.
 *
 * kvm_phys_shift() extracts the IPA size from mmu->vtcr via VTCR_EL2_IPA(),
 * kvm_phys_size() is 1 << that shift, and kvm_phys_mask() is size - 1.
 */
#define KVM_PHYS_SHIFT        (40)

#define kvm_phys_shift(mmu)                VTCR_EL2_IPA((mmu)->vtcr)
#define kvm_phys_size(mmu)                (_AC(1, ULL) << kvm_phys_shift(mmu))
#define kvm_phys_mask(mmu)                (kvm_phys_size(mmu) - _AC(1, ULL))

#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

int kvm_share_hyp(void *from, void *to);
void kvm_unshare_hyp(void *from, void *to);
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot);
int __create_hyp_mappings(unsigned long start, unsigned long size,
                          unsigned long phys, enum kvm_pgtable_prot prot);
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
                           void __iomem **kaddr,
                           void __iomem **haddr);
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
                             void **haddr);
int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr);
void __init free_hyp_pgds(void);

void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
                            u64 size, bool may_block);
void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end);

void stage2_unmap_vm(struct kvm *kvm);
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
void kvm_uninit_stage2_mmu(struct kvm *kvm);
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
                          phys_addr_t pa, unsigned long size, bool writable);

int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);

phys_addr_t kvm_mmu_get_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
int __init kvm_mmu_init(u32 *hyp_va_bits);

/*
 * Translate vector slot @slot into an address inside the vector area
 * starting at @base.  Slots are SZ_2K apart; HYP_VECTOR_DIRECT keeps its
 * own value as index while every other slot uses (slot - 1).
 */
static inline void *__kvm_vector_slot2addr(void *base,
                                           enum arm64_hyp_spectre_vector slot)
{
        int idx = slot;

        if (slot != HYP_VECTOR_DIRECT)
                idx--;

        return base + (idx * SZ_2K);
}

struct kvm;

/*
 * Run dcache_clean_inval_poc() over the @l bytes starting at address @a
 * (the helper takes a [start, end) pair of unsigned long addresses).
 */
#define kvm_flush_dcache_to_poc(a,l)        \
        dcache_clean_inval_poc((unsigned long)(a), (unsigned long)(a)+(l))

/*
 * True when both the M and C bits are set in the SCTLR of the vcpu's
 * current regime: SCTLR_EL2 if the vcpu is at EL2, SCTLR_EL1 otherwise.
 */
static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
{
        const u64 cache_bits = SCTLR_ELx_M | SCTLR_ELx_C;
        int reg = vcpu_is_el2(vcpu) ? SCTLR_EL2 : SCTLR_EL1;
        u64 sctlr = vcpu_read_sys_reg(vcpu, reg);

        return (sctlr & cache_bits) == cache_bits;
}

/*
 * Clean and invalidate the D-cache to the PoC for @size bytes at @va,
 * unless the CPUs have stage-2 FWB.
 */
static inline void __clean_dcache_guest_page(void *va, size_t size)
{
        /*
         * FWB guarantees that the guest always accesses memory with
         * cacheable attributes, so no clean to PoC is needed when faulting
         * in pages.  FWB also implies IDC, which makes cleaning to PoU
         * unnecessary as well.
         */
        if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
                kvm_flush_dcache_to_poc(va, size);
}

/*
 * Largest range, in bytes, that __invalidate_icache_guest_page() will
 * invalidate line by line: MAX_DVM_OPS lines of the minimum I-cache line
 * size.
 */
static inline size_t __invalidate_icache_max_range(void)
{
        u8 iminline;
        u64 ctr;

        /*
         * Placeholder movz/movk sequence: the kvm_compute_final_ctr_el0
         * callback patches in the value that ends up in ctr.
         * NOTE(review): presumably the system-wide CTR_EL0 value -- confirm.
         */
        asm volatile(ALTERNATIVE_CB("movz %0, #0\n"
                                    "movk %0, #0, lsl #16\n"
                                    "movk %0, #0, lsl #32\n"
                                    "movk %0, #0, lsl #48\n",
                                    ARM64_ALWAYS_SYSTEM,
                                    kvm_compute_final_ctr_el0)
                     : "=r" (ctr));

        /* CTR_EL0.IminLine is log2 of the line size in words; + 2 gives log2 bytes. */
        iminline = SYS_FIELD_GET(CTR_EL0, IminLine, ctr) + 2;
        return MAX_DVM_OPS << iminline;
}

/*
 * Invalidate the I-cache to the PoU for @size bytes at @va, falling back
 * to a full I-cache invalidation when that is the better option.
 */
static inline void __invalidate_icache_guest_page(void *va, size_t size)
{
        unsigned long start = (unsigned long)va;

        /*
         * Blow the whole I-cache if it is aliasing (i.e. VIPT) or if the
         * range exceeds our arbitrary limit on invalidations by cache
         * line.
         */
        if (icache_is_aliasing() || size > __invalidate_icache_max_range()) {
                icache_inval_all_pou();
                return;
        }

        icache_inval_pou(start, start + size);
}

void kvm_set_way_flush(struct kvm_vcpu *vcpu);
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled);

static inline unsigned int kvm_get_vmid_bits(void)
{
        int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);

        return get_vmid_bits(reg);
}

/*
 * Most callers are not inside the kvm->srcu critical section, so the SRCU
 * read lock is taken here.  The data is copied out of the user page, which
 * lets us drop the lock again right after kvm_read_guest() returns.
 *
 * Returns whatever kvm_read_guest() returned.
 */
static inline int kvm_read_guest_lock(struct kvm *kvm,
                                      gpa_t gpa, void *data, unsigned long len)
{
        int srcu_idx;
        int ret;

        srcu_idx = srcu_read_lock(&kvm->srcu);
        ret = kvm_read_guest(kvm, gpa, data, len);
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return ret;
}

/*
 * Call kvm_write_guest() with the kvm->srcu read lock held for the
 * duration of the call.  Returns whatever kvm_write_guest() returned.
 */
static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa,
                                       const void *data, unsigned long len)
{
        int srcu_idx;
        int ret;

        srcu_idx = srcu_read_lock(&kvm->srcu);
        ret = kvm_write_guest(kvm, gpa, data, len);
        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return ret;
}

/* Encode a physical address for the VTTBR base address field; plain alias of phys_to_ttbr(). */
#define kvm_phys_to_vttbr(addr)                phys_to_ttbr(addr)

/*
 * Build the VTTBR value for @mmu from its page table base (pgd_phys), its
 * current VMID and, when the system supports it, the CnP bit.
 *
 * When this is (directly or indirectly) used on the TLB invalidation
 * path, a previously issued DSB is relied upon so that page table updates
 * and VMID reads are correctly ordered.
 */
static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu)
{
        const u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;
        u64 baddr = mmu->pgd_phys;
        u64 vmid_field;

        vmid_field = atomic64_read(&mmu->vmid.id) << VTTBR_VMID_SHIFT;
        vmid_field &= VTTBR_VMID_MASK(kvm_arm_vmid_bits);

        return kvm_phys_to_vttbr(baddr) | vmid_field | cnp;
}

/*
 * Program the stage-2 translation registers for @mmu: VTCR_EL2 from
 * mmu->vtcr, then VTTBR_EL2 from kvm_get_vttbr().  @arch is not used by
 * the body.
 *
 * Must be called from hyp code running at EL2 with an updated VTTBR
 * and interrupts disabled.
 */
static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu,
                                          struct kvm_arch *arch)
{
        write_sysreg(mmu->vtcr, vtcr_el2);
        write_sysreg(kvm_get_vttbr(mmu), vttbr_el2);

        /*
         * ARM errata 1165522 and 1530923 require the actual execution of the
         * above before we can switch to the EL1/EL0 translation regime used by
         * the guest.
         */
        /* "nop" by default, patched to "isb" when ARM64_WORKAROUND_SPECULATIVE_AT applies. */
        asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT));
}

/* Return the struct kvm whose embedded arch member mmu->arch points to. */
static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu)
{
        struct kvm_arch *karch = mmu->arch;

        return container_of(karch, struct kvm, arch);
}

/* Extract the VMID field from the VTTBR value @vttbr. */
static inline u64 get_vmid(u64 vttbr)
{
        u64 vmid = vttbr & VTTBR_VMID_MASK(kvm_get_vmid_bits());

        return vmid >> VTTBR_VMID_SHIFT;
}

static inline bool kvm_s2_mmu_valid(struct kvm_s2_mmu *mmu)
{
        return !(mmu->tlb_vttbr & VTTBR_CNP_BIT);
}

/* Tell whether @mmu is something other than @kvm's own kvm->arch.mmu. */
static inline bool kvm_is_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
        /*
         * Be careful: mmu may not be fully initialised, so do not look
         * at *any* of its fields. Only the pointer value is compared.
         */
        return mmu != &kvm->arch.mmu;
}

/*
 * Take kvm->mmu_lock: as a writer when protected KVM is enabled,
 * as a reader otherwise. Pairs with kvm_fault_unlock().
 */
static inline void kvm_fault_lock(struct kvm *kvm)
{
        if (!is_protected_kvm_enabled()) {
                read_lock(&kvm->mmu_lock);
                return;
        }

        write_lock(&kvm->mmu_lock);
}

/*
 * Release kvm->mmu_lock in the mode kvm_fault_lock() took it: writer
 * when protected KVM is enabled, reader otherwise.
 */
static inline void kvm_fault_unlock(struct kvm *kvm)
{
        if (!is_protected_kvm_enabled()) {
                read_unlock(&kvm->mmu_lock);
                return;
        }

        write_unlock(&kvm->mmu_lock);
}

/*
 * With CONFIG_PTDUMP_STAGE2_DEBUGFS the function is provided elsewhere;
 * without it, callers get an empty inline stub.
 */
#ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
void kvm_s2_ptdump_create_debugfs(struct kvm *kvm);
#else
static inline void kvm_s2_ptdump_create_debugfs(struct kvm *kvm) {}
#endif /* CONFIG_PTDUMP_STAGE2_DEBUGFS */

#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */

















































  169 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  include/linux/eventpoll.h ( Efficient event polling implementation )
 *  Copyright (C) 2001,...,2006         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */
#ifndef _LINUX_EVENTPOLL_H
#define _LINUX_EVENTPOLL_H

#include <uapi/linux/eventpoll.h>
#include <uapi/linux/kcmp.h>


/* Forward declarations to avoid compiler errors */
struct file;


#ifdef CONFIG_EPOLL

#ifdef CONFIG_KCMP
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff);
#endif

/* Used to release the epoll bits inside the "struct file" */
void eventpoll_release_file(struct file *file);

/* Copy ready events to userspace */
int epoll_sendevents(struct file *file, struct epoll_event __user *events,
                     int maxevents);

/*
 * This is called from inside fs/file_table.c:__fput() to unlink files
 * from the eventpoll interface. We need to have this facility to cleanup
 * correctly files that are closed without being removed from the eventpoll
 * interface.
 */
static inline void eventpoll_release(struct file *file)
{
        /*
         * Fast check that avoids taking and releasing the semaphore.
         * Because it is done outside the semaphore lock it might return
         * false negatives, but we don't care: it avoids the semaphore
         * lock in 99.99% of cases. False positives simply cannot happen,
         * because the file is on the way to being removed and nobody
         * (but eventpoll) still has a reference to this file.
         *
         * If file->f_ep is set, the file is being closed while it is
         * still linked to an epoll descriptor, so it must be correctly
         * unlinked from its containers.
         */
        if (unlikely(READ_ONCE(file->f_ep)))
                eventpoll_release_file(file);
}

int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock);

/*
 * Tells if the epoll_ctl(2) operation needs an event copy from userspace:
 * every operation except EPOLL_CTL_DEL does.
 */
static inline int ep_op_has_event(int op)
{
        if (op == EPOLL_CTL_DEL)
                return 0;

        return 1;
}

#else

/* !CONFIG_EPOLL variant: empty stub so callers need no #ifdef. */
static inline void eventpoll_release(struct file *file) {}

#endif

#if defined(CONFIG_ARM) && defined(CONFIG_OABI_COMPAT)
/* ARM OABI has an incompatible struct layout and needs a special handler */
extern struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
                 struct epoll_event __user *uevent);
#else
/*
 * Store @revents and @data into the userspace event slot @uevent.
 *
 * Returns a pointer to the slot following @uevent, or NULL if either
 * __put_user() reports a failure.
 */
static inline struct epoll_event __user *
epoll_put_uevent(__poll_t revents, __u64 data,
                 struct epoll_event __user *uevent)
{
        if (__put_user(revents, &uevent->events))
                return NULL;

        if (__put_user(data, &uevent->data))
                return NULL;

        return uevent + 1;
}
#endif

#endif /* #ifndef _LINUX_EVENTPOLL_H */
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NETFILTER_NETDEV_H_
#define _NETFILTER_NETDEV_H_

#include <linux/netfilter.h>
#include <linux/netdevice.h>

#ifdef CONFIG_NETFILTER_INGRESS
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
#ifdef CONFIG_JUMP_LABEL
        if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
                return false;
#endif
        return rcu_access_pointer(skb->dev->nf_hooks_ingress);
}

/*
 * Run the netdev ingress hooks attached to @skb->dev on @skb.
 *
 * Caller must hold rcu_read_lock.
 *
 * Returns 0 if the hook list turned out to be empty, -1 if nf_hook_slow()
 * returned 0, and nf_hook_slow()'s return value otherwise.
 */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        struct nf_hook_entries *entries;
        struct nf_hook_state state;
        int verdict;

        entries = rcu_dereference(skb->dev->nf_hooks_ingress);

        /*
         * Must recheck the ingress hook head, in the event it became NULL
         * after the check in nf_hook_ingress_active evaluated to true.
         */
        if (unlikely(!entries))
                return 0;

        nf_hook_state_init(&state, NF_NETDEV_INGRESS,
                           NFPROTO_NETDEV, skb->dev, NULL, NULL,
                           dev_net(skb->dev), NULL);

        verdict = nf_hook_slow(skb, &state, entries, 0);

        return verdict == 0 ? -1 : verdict;
}

#else /* CONFIG_NETFILTER_INGRESS */
/*
 * !CONFIG_NETFILTER_INGRESS variant: ingress hooks are never active.
 *
 * The signature mirrors the CONFIG_NETFILTER_INGRESS variant (bool return,
 * const @skb) so that callers see the same prototype in both
 * configurations. @skb is not examined.
 */
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
        return false;
}

/*
 * !CONFIG_NETFILTER_INGRESS variant: no hooks exist, so always return 0
 * without looking at @skb.
 */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
        return 0;
}
#endif /* CONFIG_NETFILTER_INGRESS */

#ifdef CONFIG_NETFILTER_EGRESS
/*
 * Report whether netdev egress hooks may be active. With CONFIG_JUMP_LABEL
 * this follows the NFPROTO_NETDEV/NF_NETDEV_EGRESS static key; without it
 * the answer is always true.
 */
static inline bool nf_hook_egress_active(void)
{
#ifdef CONFIG_JUMP_LABEL
        if (static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS]))
                return true;

        return false;
#else
        return true;
#endif
}

/**
 * nf_hook_egress - classify packets before transmission
 * @skb: packet to be classified
 * @rc: result code which shall be returned by __dev_queue_xmit() on failure
 * @dev: netdev whose egress hooks shall be applied to @skb
 *
 * Caller must hold rcu_read_lock.
 *
 * On ingress, packets are classified first by tc, then by netfilter.
 * On egress, the order is reversed for symmetry.  Conceptually, tc and
 * netfilter can be thought of as layers, with netfilter layered above tc:
 * When tc redirects a packet to another interface, netfilter is not applied
 * because the packet is on the tc layer.
 *
 * The nf_skip_egress flag controls whether netfilter is applied on egress.
 * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the
 * packet passes through tc and netfilter.  Because __dev_queue_xmit() may be
 * called recursively by tunnel drivers such as vxlan, the flag is reverted to
 * false after sch_handle_egress().  This ensures that netfilter is applied
 * both on the overlay and underlying network.
 *
 * Returns: @skb on success or %NULL if the packet was consumed or filtered.
 */
static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
                                             struct net_device *dev)
{
        struct nf_hook_entries *entries;
        struct nf_hook_state state;
        int verdict;

#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        /* The packet has been flagged to bypass netfilter on egress. */
        if (skb->nf_skip_egress)
                return skb;
#endif

        entries = rcu_dereference_check(dev->nf_hooks_egress,
                                        rcu_read_lock_bh_held());
        if (!entries)
                return skb;

        nf_hook_state_init(&state, NF_NETDEV_EGRESS,
                           NFPROTO_NETDEV, NULL, dev, NULL,
                           dev_net(dev), NULL);

        /* nf assumes rcu_read_lock, not just read_lock_bh */
        rcu_read_lock();
        verdict = nf_hook_slow(skb, &state, entries, 0);
        rcu_read_unlock();

        /* A verdict of 1 hands the packet back to the caller. */
        if (verdict == 1)
                return skb;

        /*
         * Otherwise the skb is not returned: a negative verdict is
         * reported as a drop, anything else as success.
         */
        *rc = verdict < 0 ? NET_XMIT_DROP : NET_XMIT_SUCCESS;
        return NULL;
}
#else /* CONFIG_NETFILTER_EGRESS */
/* !CONFIG_NETFILTER_EGRESS variant: egress hooks are never active. */
static inline bool nf_hook_egress_active(void)
{
        return false;
}

/*
 * !CONFIG_NETFILTER_EGRESS variant: pass @skb through unchanged.
 * @rc and @dev are not touched.
 */
static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc,
                                             struct net_device *dev)
{
        return skb;
}
#endif /* CONFIG_NETFILTER_EGRESS */

/*
 * Set @skb's nf_skip_egress flag to @skip. The flag is the one tested at
 * the top of nf_hook_egress(). Without CONFIG_NETFILTER_SKIP_EGRESS this
 * function does nothing.
 */
static inline void nf_skip_egress(struct sk_buff *skb, bool skip)
{
#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        skb->nf_skip_egress = skip;
#endif
}

/*
 * Initialise @dev's netfilter hook pointers to NULL. Each pointer is only
 * touched when the matching CONFIG_NETFILTER_INGRESS/EGRESS option is
 * enabled.
 */
static inline void nf_hook_netdev_init(struct net_device *dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
        RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        RCU_INIT_POINTER(dev->nf_hooks_egress, NULL);
#endif
}

#endif /* _NETFILTER_NETDEV_H_ */






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_FLS64_H_
#define _ASM_GENERIC_BITOPS_FLS64_H_

#include <asm/types.h>

/**
 * fls64 - find last set bit in a 64-bit word
 * @x: the word to search
 *
 * This is defined in a similar way as the libc and compiler builtin
 * ffsll, but returns the position of the most significant set bit.
 *
 * fls64(value) returns 0 if value is 0 or the position of the last
 * set bit if value is nonzero. The last (most significant) bit is
 * at position 64.
 */
#if BITS_PER_LONG == 32
static __always_inline int fls64(__u64 x)
{
        __u32 hi = x >> 32;

        /*
         * If any bit is set in the upper 32 bits, the answer lies there
         * (offset by 32); otherwise fls() on the truncated low word
         * gives the result, including 0 for x == 0.
         */
        return hi ? fls(hi) + 32 : fls(x);
}
#elif BITS_PER_LONG == 64
static __always_inline int fls64(__u64 x)
{
        /*
         * Zero is reported as 0 without calling __fls(); for nonzero x
         * the __fls() result is shifted up by one so positions are
         * 1-based.
         */
        return x ? __fls(x) + 1 : 0;
}
#else
#error BITS_PER_LONG not 32 or 64
#endif

#endif /* _ASM_GENERIC_BITOPS_FLS64_H_ */























  125 
   99 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/* SPDX-License-Identifier: GPL-2.0-only */

#ifndef LINUX_RESUME_USER_MODE_H
#define LINUX_RESUME_USER_MODE_H

#include <linux/sched.h>
#include <linux/task_work.h>
#include <linux/memcontrol.h>
#include <linux/rseq.h>
#include <linux/blk-cgroup.h>

/**
 * set_notify_resume - cause resume_user_mode_work() to be called
 * @task:                task that will call resume_user_mode_work()
 *
 * Calling this arranges that @task will call resume_user_mode_work()
 * before returning to user mode.  If it's already running in user mode,
 * it will enter the kernel and call resume_user_mode_work() soon.
 * If it's blocked, it will not be woken.
 */
static inline void set_notify_resume(struct task_struct *task)
{
        /* Flag was already set: skip kick_process(). */
        if (test_and_set_tsk_thread_flag(task, TIF_NOTIFY_RESUME))
                return;

        kick_process(task);
}


/**
 * resume_user_mode_work - Perform work before returning to user mode
 * @regs:                user-mode registers of @current task
 *
 * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
 * about to return to user mode, and the user state in @regs can be
 * inspected or adjusted.  This function clears %TIF_NOTIFY_RESUME
 * itself before running the pending work.  If the flag gets set again
 * asynchronously, this will be called again before we return to
 * user mode.
 *
 * In order, it runs pending task work, drops the cached requested key
 * (with CONFIG_KEYS_REQUEST_CACHE), and calls the memcg over-high handler,
 * the blkcg throttle hook and the rseq notify-resume handler.
 *
 * Called without locks.
 */
static inline void resume_user_mode_work(struct pt_regs *regs)
{
        clear_thread_flag(TIF_NOTIFY_RESUME);
        /*
         * This barrier pairs with task_work_add()->set_notify_resume() after
         * hlist_add_head(task->task_works);
         */
        smp_mb__after_atomic();
        if (unlikely(task_work_pending(current)))
                task_work_run();

#ifdef CONFIG_KEYS_REQUEST_CACHE
        /* Release the cached requested key, if any, and forget it. */
        if (unlikely(current->cached_requested_key)) {
                key_put(current->cached_requested_key);
                current->cached_requested_key = NULL;
        }
#endif

        mem_cgroup_handle_over_high(GFP_KERNEL);
        blkcg_maybe_throttle_current();

        rseq_handle_notify_resume(NULL, regs);
}

#endif /* LINUX_RESUME_USER_MODE_H */





































    3 









    3 

















    3 













    3 







    3 






    2 














    3 







    3 



    1 





    3 






















    3 



    3 

    3 






















    3 











    3 


































    4 



































   26 



















   26 














  166 



  109 





   58 

   58 

   58 

   58 




   27 



















   27 
    1 












   27 





































    2 
















    2 




    2 
















    1 
    2 
    2 








    1 


    1 













    2 
















   35 



   92 
   83 
    1 










   83 








   83 









   83 
















   35 


    6 
    5 







    2 



    2 
    1 







   59 




   59 



    1 






















   58 





















































































































































   59 









   59 


   59 

   59 



   46 








   46 
   45 


   46 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/irqchip/arm-gic-v3.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/kstrtox.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/string_choices.h>
#include <kvm/arm_vgic.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_asm.h>

#include "vgic.h"

/*
 * Trap configuration flags. vcpu_set_ich_hcr() translates group0_trap,
 * group1_trap, common_trap and dir_trap into ICH_HCR_EL2_TALL0, _TALL1,
 * _TC and _TDIR respectively.
 */
static bool group0_trap;
static bool group1_trap;
static bool common_trap;
static bool dir_trap;
static bool gicv4_enable;

/* Set ICH_HCR_EL2_UIE in the vgic_hcr field of @vcpu's GICv3 CPU interface. */
void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
{
        vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_EL2_UIE;
}

/*
 * True when the list register value @lr_val has no state bits set, is not
 * flagged as a HW interrupt, and has the EOI bit set.
 */
static bool lr_signals_eoi_mi(u64 lr_val)
{
        if (lr_val & ICH_LR_STATE)
                return false;

        if (lr_val & ICH_LR_HW)
                return false;

        return !!(lr_val & ICH_LR_EOI);
}

/*
 * Fold the contents of the vcpu's in-use list registers (cpuif->vgic_lr[],
 * up to cpuif->used_lrs) back into the corresponding vgic_irq structures,
 * then reset used_lrs to zero.
 *
 * Expects interrupts to be disabled (see the DEBUG_SPINLOCK_BUG_ON below).
 */
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
        u32 model = vcpu->kvm->arch.vgic.vgic_model;
        int lr;

        DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());

        /* Drop the UIE bit (the one vgic_v3_set_underflow() sets). */
        cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE;

        for (lr = 0; lr < cpuif->used_lrs; lr++) {
                u64 val = cpuif->vgic_lr[lr];
                u32 intid, cpuid;
                struct vgic_irq *irq;
                bool is_v2_sgi = false;
                bool deactivated;

                /* Source CPU field; only consumed below for GICv2 SGIs. */
                cpuid = val & GICH_LR_PHYSID_CPUID;
                cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;

                /* The INTID mask depends on the emulated GIC model. */
                if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                        intid = val & ICH_LR_VIRTUAL_ID_MASK;
                } else {
                        intid = val & GICH_LR_VIRTUALID;
                        is_v2_sgi = vgic_irq_is_sgi(intid);
                }

                /* Notify fds when the guest EOI'ed a level-triggered IRQ */
                if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
                        kvm_notify_acked_irq(vcpu->kvm, 0,
                                             intid - VGIC_NR_PRIVATE_IRQS);

                irq = vgic_get_vcpu_irq(vcpu, intid);
                if (!irq)        /* An LPI could have been unmapped. */
                        continue;

                raw_spin_lock(&irq->irq_lock);

                /* Always preserve the active bit, note deactivation */
                deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
                irq->active = !!(val & ICH_LR_ACTIVE_BIT);

                /* Remember which CPU sourced a still-active GICv2 SGI. */
                if (irq->active && is_v2_sgi)
                        irq->active_source = cpuid;

                /* Edge is the only case where we preserve the pending bit */
                if (irq->config == VGIC_CONFIG_EDGE &&
                    (val & ICH_LR_PENDING_BIT)) {
                        irq->pending_latch = true;

                        if (is_v2_sgi)
                                irq->source |= (1 << cpuid);
                }

                /*
                 * Clear soft pending state when level irqs have been acked.
                 */
                if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
                        irq->pending_latch = false;

                /* Handle resampling for mapped interrupts if required */
                vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);

                raw_spin_unlock(&irq->irq_lock);
                /* Pairs with the vgic_get_vcpu_irq() above. */
                vgic_put_irq(vcpu->kvm, irq);
        }

        cpuif->used_lrs = 0;
}

/*
 * Build the list register value describing @irq and store it in slot @lr
 * of @vcpu's GICv3 CPU interface (vgic_v3.vgic_lr[]).
 *
 * Requires the irq to be locked already.
 */
void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
{
        u32 model = vcpu->kvm->arch.vgic.vgic_model;
        u64 val = irq->intid;
        bool is_v2_sgi = vgic_irq_is_sgi(irq->intid) &&
                         model == KVM_DEV_TYPE_ARM_VGIC_V2;
        bool allow_pending = true;

        if (irq->active) {
                val |= ICH_LR_ACTIVE_BIT;

                if (is_v2_sgi)
                        val |= irq->active_source << GICH_LR_PHYSID_CPUID_SHIFT;

                if (vgic_irq_is_multi_sgi(irq)) {
                        allow_pending = false;
                        val |= ICH_LR_EOI;
                }
        }

        if (irq->hw && !vgic_irq_needs_resampling(irq)) {
                val |= ICH_LR_HW;
                val |= ((u64)irq->hwintid) << ICH_LR_PHYS_ID_SHIFT;

                /*
                 * Never set pending+active on a HW interrupt, as the
                 * pending state is kept at the physical distributor
                 * level.
                 */
                if (irq->active)
                        allow_pending = false;
        } else if (irq->config == VGIC_CONFIG_LEVEL) {
                val |= ICH_LR_EOI;

                /*
                 * Software resampling doesn't work very well
                 * if we allow P+A, so let's not do that.
                 */
                if (irq->active)
                        allow_pending = false;
        }

        if (allow_pending && irq_is_pending(irq)) {
                val |= ICH_LR_PENDING_BIT;

                /* The latch of an edge interrupt is consumed here. */
                if (irq->config == VGIC_CONFIG_EDGE)
                        irq->pending_latch = false;

                if (is_v2_sgi) {
                        u32 src = ffs(irq->source);
                        u32 cpu;

                        if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
                                           irq->intid))
                                return;

                        /* ffs() is 1-based; convert to a CPU number. */
                        cpu = src - 1;

                        val |= cpu << GICH_LR_PHYSID_CPUID_SHIFT;
                        irq->source &= ~(1 << cpu);

                        /* Further sources remain: keep it latched. */
                        if (irq->source) {
                                irq->pending_latch = true;
                                val |= ICH_LR_EOI;
                        }
                }
        }

        /*
         * Level-triggered mapped IRQs are special because we only observe
         * rising edges as input to the VGIC.  We therefore lower the line
         * level here, so that we can take new virtual IRQs.  See
         * vgic_v3_fold_lr_state for more info.
         */
        if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
                irq->line_level = false;

        if (irq->group)
                val |= ICH_LR_GROUP;

        val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;

        vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
}

void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
{
        vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = 0;
}

/*
 * Pack the fields of @vmcrp into the ICH_VMCR layout and store the result
 * in @vcpu's vgic_v3.vgic_vmcr.
 */
void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
        u32 vmcr;

        if (vcpu->kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
                /*
                 * When emulating GICv3 on GICv3 with SRE=1, the
                 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
                 */
                vmcr = ICH_VMCR_FIQ_EN_MASK;
        } else {
                /* GICv2 guests get AckCtl and FIQEn from the caller. */
                vmcr = ((vmcrp->ackctl << ICH_VMCR_ACK_CTL_SHIFT) &
                        ICH_VMCR_ACK_CTL_MASK) |
                       ((vmcrp->fiqen << ICH_VMCR_FIQ_EN_SHIFT) &
                        ICH_VMCR_FIQ_EN_MASK);
        }

        /* The remaining fields are model independent. */
        vmcr |= (vmcrp->cbpr << ICH_VMCR_CBPR_SHIFT) & ICH_VMCR_CBPR_MASK;
        vmcr |= (vmcrp->eoim << ICH_VMCR_EOIM_SHIFT) & ICH_VMCR_EOIM_MASK;
        vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
        vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
        vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
        vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK;
        vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK;

        cpu_if->vgic_vmcr = vmcr;
}

/*
 * Unpack @vcpu's vgic_v3.vgic_vmcr into the individual fields of @vmcrp.
 * This is the inverse of vgic_v3_set_vmcr().
 */
void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
        u32 vmcr = cpu_if->vgic_vmcr;

        if (vcpu->kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
                /*
                 * When emulating GICv3 on GICv3 with SRE=1, the
                 * VFIQEn bit is RES1 and the VAckCtl bit is RES0.
                 */
                vmcrp->fiqen = 1;
                vmcrp->ackctl = 0;
        } else {
                vmcrp->ackctl = (vmcr & ICH_VMCR_ACK_CTL_MASK) >>
                        ICH_VMCR_ACK_CTL_SHIFT;
                vmcrp->fiqen = (vmcr & ICH_VMCR_FIQ_EN_MASK) >>
                        ICH_VMCR_FIQ_EN_SHIFT;
        }

        /* The remaining fields are model independent. */
        vmcrp->cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
        vmcrp->eoim = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
        vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
        vmcrp->bpr  = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
        vmcrp->pmr  = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
        vmcrp->grpen0 = (vmcr & ICH_VMCR_ENG0_MASK) >> ICH_VMCR_ENG0_SHIFT;
        vmcrp->grpen1 = (vmcr & ICH_VMCR_ENG1_MASK) >> ICH_VMCR_ENG1_SHIFT;
}

/*
 * Value vgic_v3_enable() stores in vgic_cpu.pendbaser for GICv3 guests:
 * inner cacheability RaWb, outer cacheability SameAsInner, and
 * InnerShareable shareability, all expressed as GICR_PENDBASER fields.
 */
#define INITIAL_PENDBASER_VALUE                                                  \
        (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb)                | \
        GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner)        | \
        GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))

void vgic_v3_enable(struct kvm_vcpu *vcpu)
{
        struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;

        /*
         * By forcing VMCR to zero, the GIC will restore the binary
         * points to their reset values. Anything else resets to zero
         * anyway.
         */
        vgic_v3->vgic_vmcr = 0;

        /*
         * If we are emulating a GICv3, we do it in an non-GICv2-compatible
         * way, so we force SRE to 1 to demonstrate this to the guest.
         * Also, we don't support any form of IRQ/FIQ bypass.
         * This goes with the spec allowing the value to be RAO/WI.
         */
        if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                vgic_v3->vgic_sre = (ICC_SRE_EL1_DIB |
                                     ICC_SRE_EL1_DFB |
                                     ICC_SRE_EL1_SRE);
                vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
        } else {
                vgic_v3->vgic_sre = 0;
        }

        vcpu->arch.vgic_cpu.num_id_bits = FIELD_GET(ICH_VTR_EL2_IDbits,
                                                    kvm_vgic_global_state.ich_vtr_el2);
        vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits,
                                                     kvm_vgic_global_state.ich_vtr_el2) + 1;

        /* Get the show on the road... */
        vgic_v3->vgic_hcr = ICH_HCR_EL2_En;
}

/*
 * OR the required trap bits into @vcpu's vgic_hcr: all of TALL0/TALL1/TC
 * when the VM has no GICv3, otherwise whichever of TALL0/TALL1/TC/TDIR
 * the file-level trap flags ask for.
 */
void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;

        if (kvm_has_gicv3(vcpu->kvm)) {
                if (group0_trap)
                        cpu_if->vgic_hcr |= ICH_HCR_EL2_TALL0;
                if (group1_trap)
                        cpu_if->vgic_hcr |= ICH_HCR_EL2_TALL1;
                if (common_trap)
                        cpu_if->vgic_hcr |= ICH_HCR_EL2_TC;
                if (dir_trap)
                        cpu_if->vgic_hcr |= ICH_HCR_EL2_TDIR;
        } else {
                /* Hide GICv3 sysreg if necessary */
                cpu_if->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
                                     ICH_HCR_EL2_TC);
        }
}

/*
 * Read the pending bit of @irq from the guest's pending table (located via
 * the target vcpu's pendbaser), copy it into irq->pending_latch and queue
 * the interrupt. If the bit was set, it is cleared again in guest memory.
 *
 * Returns 0 on success or when @irq has no target vcpu, otherwise the error
 * returned by the guest memory accessors.
 */
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
{
        struct kvm_vcpu *vcpu;
        unsigned long flags;
        gpa_t pendbase, ptr;
        int byte_offset, bit_nr;
        bool status;
        u8 val;
        int ret;

        for (;;) {
                vcpu = irq->target_vcpu;
                if (!vcpu)
                        return 0;

                pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);

                /* One bit per INTID: locate the byte and the bit in it. */
                byte_offset = irq->intid / BITS_PER_BYTE;
                bit_nr = irq->intid % BITS_PER_BYTE;
                ptr = pendbase + byte_offset;

                ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
                if (ret)
                        return ret;

                status = val & (1 << bit_nr);

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                if (irq->target_vcpu == vcpu)
                        break;

                /* The target changed since it was sampled: start over. */
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
        }

        /* irq_lock is held here and handed over to vgic_queue_irq_unlock(). */
        irq->pending_latch = status;
        vgic_queue_irq_unlock(vcpu->kvm, irq, flags);

        if (!status)
                return 0;

        /* clear consumed data */
        val &= ~(1 << bit_nr);
        return vgic_write_guest_lock(kvm, ptr, &val, 1);
}

/*
 * Free the interrupt of every vPE of @kvm's ITS VM, using the vcpu with
 * the same index as the dev_id cookie.
 *
 * The deactivation of the doorbell interrupt will trigger the
 * unmapping of the associated vPE.
 */
static void unmap_all_vpes(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        int idx = 0;

        while (idx < dist->its_vm.nr_vpes) {
                struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, idx);

                free_irq(dist->its_vm.vpes[idx]->irq, vcpu);
                idx++;
        }
}

/*
 * Counterpart of unmap_all_vpes(): request the interrupt of every vPE of
 * @kvm's ITS VM again for the vcpu with the same index, warning on failure.
 */
static void map_all_vpes(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        int idx = 0;

        while (idx < dist->its_vm.nr_vpes) {
                struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, idx);

                WARN_ON(vgic_v4_request_vpe_irq(vcpu,
                                                dist->its_vm.vpes[idx]->irq));
                idx++;
        }
}

/*
 * vgic_v3_save_pending_tables - Save the pending tables into guest RAM
 * kvm lock and all vcpu lock must be held
 *
 * Walks every LPI in dist->lpi_xa that has a target vCPU and makes the
 * corresponding bit in that vCPU's pending table (located through its
 * PENDBASER) match the current pending state. A byte is only written back
 * when the stored bit differs from the computed state.
 *
 * Returns 0 on success, -ENXIO if the vgic is not initialized, or the error
 * returned by the guest memory accessors.
 */
int vgic_v3_save_pending_tables(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *irq;
        /* Address of the byte currently cached in val; all-ones = none yet. */
        gpa_t last_ptr = ~(gpa_t)0;
        bool vlpi_avail = false;
        unsigned long index;
        int ret = 0;
        u8 val;

        if (unlikely(!vgic_initialized(kvm)))
                return -ENXIO;

        /*
         * A preparation for getting any VLPI states.
         * The above vgic initialized check also ensures that the allocation
         * and enabling of the doorbells have already been done.
         */
        if (kvm_vgic_global_state.has_gicv4_1) {
                unmap_all_vpes(kvm);
                vlpi_avail = true;
        }

        xa_for_each(&dist->lpi_xa, index, irq) {
                int byte_offset, bit_nr;
                struct kvm_vcpu *vcpu;
                gpa_t pendbase, ptr;
                bool is_pending;
                bool stored;

                /* LPIs without a target vCPU have no table to update. */
                vcpu = irq->target_vcpu;
                if (!vcpu)
                        continue;

                pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);

                /* One bit per INTID: locate the byte and the bit within it. */
                byte_offset = irq->intid / BITS_PER_BYTE;
                bit_nr = irq->intid % BITS_PER_BYTE;
                ptr = pendbase + byte_offset;

                /*
                 * Only re-read guest memory when this LPI lives in a
                 * different byte than the previous one; otherwise val still
                 * holds that byte, including any update written below.
                 */
                if (ptr != last_ptr) {
                        ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
                        if (ret)
                                goto out;
                        last_ptr = ptr;
                }

                stored = val & (1U << bit_nr);

                is_pending = irq->pending_latch;

                /* For HW-backed LPIs, let the GICv4 code supply the state. */
                if (irq->hw && vlpi_avail)
                        vgic_v4_get_vlpi_state(irq, &is_pending);

                /* Table already up to date for this LPI: nothing to write. */
                if (stored == is_pending)
                        continue;

                if (is_pending)
                        val |= 1 << bit_nr;
                else
                        val &= ~(1 << bit_nr);

                ret = vgic_write_guest_lock(kvm, ptr, &val, 1);
                if (ret)
                        goto out;
        }

out:
        /* Undo unmap_all_vpes() on both the success and the error path. */
        if (vlpi_avail)
                map_all_vpes(kvm);

        return ret;
}

/**
 * vgic_v3_rdist_overlap - check if a region overlaps with any
 * existing redistributor region
 *
 * @kvm: kvm handle
 * @base: base of the region
 * @size: size of region
 *
 * The candidate [@base, @base + @size) is compared against every region on
 * the VM's rd_regions list.
 *
 * Return: true if there is an overlap
 */
bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_redist_region *region;

        list_for_each_entry(region, &dist->rd_regions, list) {
                /* Candidate ends at or before the region starts: disjoint. */
                if (base + size <= region->base)
                        continue;

                /* Candidate starts at or after the region ends: disjoint. */
                if (base >= region->base + vgic_v3_rd_region_size(kvm, region))
                        continue;

                return true;
        }

        return false;
}

/*
 * Check for overlapping regions and for regions crossing the end of memory
 * for base addresses which have already been set.
 */
bool vgic_v3_check_base(struct kvm *kvm)
{
        struct vgic_dist *d = &kvm->arch.vgic;
        struct vgic_redist_region *rdreg;

        if (!IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
            d->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE < d->vgic_dist_base)
                return false;

        list_for_each_entry(rdreg, &d->rd_regions, list) {
                size_t sz = vgic_v3_rd_region_size(kvm, rdreg);

                if (vgic_check_iorange(kvm, VGIC_ADDR_UNDEF,
                                       rdreg->base, SZ_64K, sz))
                        return false;
        }

        if (IS_VGIC_ADDR_UNDEF(d->vgic_dist_base))
                return true;

        return !vgic_v3_rdist_overlap(kvm, d->vgic_dist_base,
                                      KVM_VGIC_V3_DIST_SIZE);
}

/**
 * vgic_v3_rdist_free_slot - Look up registered rdist regions and identify one
 * which has free space to put a new rdist region.
 *
 * @rd_regions: redistributor region list head
 *
 * A redistributor region maps n redistributors, n = region size / (2 x 64kB).
 * Stride between redistributors is 0 and regions are filled in the index order.
 *
 * Return: the first redist region on the list, if any, that is not full;
 * NULL when every region is full or the list is empty.
 */
struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rd_regions)
{
        struct vgic_redist_region *region;

        list_for_each_entry(region, rd_regions, list) {
                if (vgic_v3_redist_region_full(region))
                        continue;

                return region;
        }

        return NULL;
}

/*
 * vgic_v3_rdist_region_from_index - find a redistributor region by index
 *
 * Scans the VM's rd_regions list and returns the first region whose index
 * field equals @index, or NULL when there is none.
 */
struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
                                                           u32 index)
{
        struct vgic_redist_region *region;
        struct vgic_redist_region *match = NULL;

        list_for_each_entry(region, &kvm->arch.vgic.rd_regions, list) {
                if (region->index == index) {
                        match = region;
                        break;
                }
        }

        return match;
}


int vgic_v3_map_resources(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
        unsigned long c;

        kvm_for_each_vcpu(c, vcpu, kvm) {
                struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;

                if (IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr)) {
                        kvm_debug("vcpu %ld redistributor base not set\n", c);
                        return -ENXIO;
                }
        }

        if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base)) {
                kvm_debug("Need to set vgic distributor addresses first\n");
                return -ENXIO;
        }

        if (!vgic_v3_check_base(kvm)) {
                kvm_debug("VGIC redist and dist frames overlap\n");
                return -EINVAL;
        }

        /*
         * For a VGICv3 we require the userland to explicitly initialize
         * the VGIC before we need to use it.
         */
        if (!vgic_initialized(kvm)) {
                return -EBUSY;
        }

        if (kvm_vgic_global_state.has_gicv4_1)
                vgic_v4_configure_vsgis(kvm);

        return 0;
}

/*
 * Static key, initially false. vgic_v3_probe() enables it when at least one
 * of the GICv3 sysreg trap settings is active.
 */
DEFINE_STATIC_KEY_FALSE(vgic_v3_cpuif_trap);

/*
 * Early parameter "kvm-arm.vgic_v3_group0_trap": parse @buf as a boolean
 * into group0_trap. Returns the result of kstrtobool().
 */
static int __init early_group0_trap_cfg(char *buf)
{
        return kstrtobool(buf, &group0_trap);
}
early_param("kvm-arm.vgic_v3_group0_trap", early_group0_trap_cfg);

/*
 * Early parameter "kvm-arm.vgic_v3_group1_trap": parse @buf as a boolean
 * into group1_trap. Returns the result of kstrtobool().
 */
static int __init early_group1_trap_cfg(char *buf)
{
        return kstrtobool(buf, &group1_trap);
}
early_param("kvm-arm.vgic_v3_group1_trap", early_group1_trap_cfg);

/*
 * Early parameter "kvm-arm.vgic_v3_common_trap": parse @buf as a boolean
 * into common_trap. Returns the result of kstrtobool().
 */
static int __init early_common_trap_cfg(char *buf)
{
        return kstrtobool(buf, &common_trap);
}
early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg);

/*
 * Early parameter "kvm-arm.vgic_v4_enable": parse @buf as a boolean into
 * gicv4_enable, which vgic_v3_probe() consults when the host reports GICv4.
 * Returns the result of kstrtobool().
 */
static int __init early_gicv4_enable(char *buf)
{
        return kstrtobool(buf, &gicv4_enable);
}
early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);

/*
 * CPU models (all revisions) that vgic_v3_broken_seis() matches against.
 * The probe code reports these as having a broken locally generated SEI.
 * The list ends with an empty sentinel entry.
 */
static const struct midr_range broken_seis[] = {
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_PRO),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
        MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
        {},
};

static bool vgic_v3_broken_seis(void)
{
        return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) &&
                is_midr_in_range_list(broken_seis));
}

/**
 * vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
 * @info:        pointer to the GIC description
 *
 * Records the number of list registers and the ICH_VTR_EL2 value in
 * kvm_vgic_global_state, registers the GICv3 KVM device (plus the GICv2
 * device when GICv2 emulation is usable) and enables the
 * vgic_v3_cpuif_trap static key when any sysreg trap setting is active.
 *
 * Returns 0 if the VGICv3 has been probed successfully, returns an error code
 * otherwise
 */
int vgic_v3_probe(const struct gic_kvm_info *info)
{
        u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
        bool has_v2;
        int ret;

        /*
         * Bit 63 of the returned value is used as the has_v2 flag; only
         * the low 32 bits are kept as the ICH_VTR_EL2 value.
         */
        has_v2 = ich_vtr_el2 >> 63;
        ich_vtr_el2 = (u32)ich_vtr_el2;

        /*
         * The ListRegs field is 5 bits, but there is an architectural
         * maximum of 16 list registers. Just ignore bit 4...
         */
        kvm_vgic_global_state.nr_lr = (ich_vtr_el2 & 0xf) + 1;
        kvm_vgic_global_state.can_emulate_gicv2 = false;
        kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2;

        /* GICv4 support? */
        if (info->has_v4) {
                kvm_vgic_global_state.has_gicv4 = gicv4_enable;
                kvm_vgic_global_state.has_gicv4_1 = info->has_v4_1 && gicv4_enable;
                kvm_info("GICv4%s support %s\n",
                         kvm_vgic_global_state.has_gicv4_1 ? ".1" : "",
                         str_enabled_disabled(gicv4_enable));
        }

        kvm_vgic_global_state.vcpu_base = 0;

        /*
         * GICv2 emulation is only set up when a GICV resource exists, the
         * has_v2 flag is set, the address is page aligned and KVM is not
         * running in protected mode.
         */
        if (!info->vcpu.start) {
                kvm_info("GICv3: no GICV resource entry\n");
        } else if (!has_v2) {
                pr_warn(FW_BUG "CPU interface incapable of MMIO access\n");
        } else if (!PAGE_ALIGNED(info->vcpu.start)) {
                pr_warn("GICV physical address 0x%llx not page aligned\n",
                        (unsigned long long)info->vcpu.start);
        } else if (kvm_get_mode() != KVM_MODE_PROTECTED) {
                kvm_vgic_global_state.vcpu_base = info->vcpu.start;
                kvm_vgic_global_state.can_emulate_gicv2 = true;
                ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
                if (ret) {
                        kvm_err("Cannot register GICv2 KVM device.\n");
                        return ret;
                }
                kvm_info("vgic-v2@%llx\n", info->vcpu.start);
        }
        ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
        if (ret) {
                kvm_err("Cannot register GICv3 KVM device.\n");
                kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
                return ret;
        }

        if (kvm_vgic_global_state.vcpu_base == 0)
                kvm_info("disabling GICv2 emulation\n");

        if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
                group0_trap = true;
                group1_trap = true;
        }

        if (vgic_v3_broken_seis()) {
                kvm_info("GICv3 with broken locally generated SEI\n");

                /* Hide SEIS in the recorded value and force trapping. */
                kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS;
                group0_trap = true;
                group1_trap = true;
                if (ich_vtr_el2 & ICH_VTR_EL2_TDS)
                        dir_trap = true;
                else
                        common_trap = true;
        }

        /* Logical OR throughout: these are independent boolean settings. */
        if (group0_trap || group1_trap || common_trap || dir_trap) {
                kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n",
                         group0_trap ? "G0" : "",
                         group1_trap ? "G1" : "",
                         common_trap ? "C"  : "",
                         dir_trap    ? "D"  : "");
                static_branch_enable(&vgic_v3_cpuif_trap);
        }

        kvm_vgic_global_state.vctrl_base = NULL;
        kvm_vgic_global_state.type = VGIC_V3;
        kvm_vgic_global_state.max_gic_vcpus = VGIC_V3_MAX_CPUS;

        return 0;
}

/*
 * vgic_v3_load - load the vGICv3 CPU interface state of @vcpu
 *
 * A nested vgic state is handed entirely to vgic_v3_load_nested().
 * Otherwise the VMCR/APR state is restored through a hyp call (unless
 * protected KVM is enabled), traps are activated on VHE hosts, and the
 * GICv4 state is loaded last; a failure there only triggers a WARN_ON().
 */
void vgic_v3_load(struct kvm_vcpu *vcpu)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;

        /* If the vgic is nested, perform the full state loading */
        if (vgic_state_is_nested(vcpu)) {
                vgic_v3_load_nested(vcpu);
                return;
        }

        /* Not done from here when protected KVM is enabled. */
        if (likely(!is_protected_kvm_enabled()))
                kvm_call_hyp(__vgic_v3_restore_vmcr_aprs, cpu_if);

        if (has_vhe())
                __vgic_v3_activate_traps(cpu_if);

        WARN_ON(vgic_v4_load(vcpu));
}

/*
 * vgic_v3_put - counterpart of vgic_v3_load() for @vcpu
 *
 * A nested vgic state is handed entirely to vgic_v3_put_nested().
 * Otherwise the VMCR/APR state is saved through a hyp call (unless
 * protected KVM is enabled), the GICv4 state is put (a failure only
 * triggers a WARN_ON()), and traps are deactivated on VHE hosts.
 */
void vgic_v3_put(struct kvm_vcpu *vcpu)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;

        /* Nested state has its own put path. */
        if (vgic_state_is_nested(vcpu)) {
                vgic_v3_put_nested(vcpu);
                return;
        }

        /* Not done from here when protected KVM is enabled. */
        if (likely(!is_protected_kvm_enabled()))
                kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if);
        WARN_ON(vgic_v4_put(vcpu));

        if (has_vhe())
                __vgic_v3_deactivate_traps(cpu_if);
}









































    8 










    8 







    8 































































































































































































    9 




    3 












    7 

















    4 





    6 

































    1 


    4 


    1 



























































































































































































































































































































































































































































































































































   33 





   33 















  204 












    3 





  148 
   28 
   33 






  162 
   44 

















  201 



  202 







  202 

  199 



  201 
    2 

  202 


  199 
    3 
  202 







  202 






  159 






   48 

















































    2 


    2 





































  202 




    1 





  201 






    3 









    3 















































































































  218 
    5 

   48 
  182 




  222 































   48 

  182 









    1 



  217 





    5 




  217 

    5 




























  185 

    9 
   33 



















  219 





































































  222 





    2 




    6 





   22 


  200 








  222 











  222 




  222 






































































   50 








   50 


    1 
   49 





















   52 





    6 










   50 
































   39 
   39 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/open.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/string.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fsnotify.h>
#include <linux/module.h>
#include <linux/tty.h>
#include <linux/namei.h>
#include <linux/backing-dev.h>
#include <linux/capability.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/personality.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/rcupdate.h>
#include <linux/audit.h>
#include <linux/falloc.h>
#include <linux/fs_struct.h>
#include <linux/dnotify.h>
#include <linux/compat.h>
#include <linux/mnt_idmapping.h>
#include <linux/filelock.h>

#include "internal.h"

/*
 * do_truncate - set the size of @dentry's inode to @length
 *
 * Builds an iattr carrying ATTR_SIZE plus the caller's @time_attrs (and
 * ATTR_FILE with @filp when a file is supplied), adds whatever
 * dentry_needs_remove_privs() asks for, and applies it with
 * notify_change() under the inode lock.
 *
 * Returns -EINVAL for a negative @length, a negative error from
 * dentry_needs_remove_privs(), or the result of notify_change().
 */
int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
                loff_t length, unsigned int time_attrs, struct file *filp)
{
        struct iattr newattrs;
        int ret;

        /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */
        if (length < 0)
                return -EINVAL;

        newattrs.ia_valid = ATTR_SIZE | time_attrs;
        newattrs.ia_size = length;
        if (filp) {
                newattrs.ia_valid |= ATTR_FILE;
                newattrs.ia_file = filp;
        }

        /* Remove suid, sgid, and file capabilities on truncate too */
        ret = dentry_needs_remove_privs(idmap, dentry);
        if (ret < 0)
                return ret;
        if (ret > 0)
                newattrs.ia_valid |= ret | ATTR_FORCE;

        inode_lock(dentry->d_inode);
        /* Note any delegations or leases have already been broken: */
        ret = notify_change(idmap, dentry, &newattrs, NULL);
        inode_unlock(dentry->d_inode);

        return ret;
}

/*
 * vfs_truncate - truncate the regular file at @path to @length
 *
 * Performs the permission, fsnotify, mount-write, append-only, write-access,
 * lease and security checks in that order before calling do_truncate().
 * Every resource taken along the way is dropped again before returning.
 *
 * Returns 0 on success, -EISDIR for directories, -EINVAL for other
 * non-regular files, -EPERM for append-only inodes, or the error of the
 * first failing step.
 */
int vfs_truncate(const struct path *path, loff_t length)
{
        struct inode *inode = path->dentry->d_inode;
        struct mnt_idmap *idmap;
        int error;

        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
        if (S_ISDIR(inode->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        idmap = mnt_idmap(path->mnt);
        error = inode_permission(idmap, inode, MAY_WRITE);
        if (error)
                return error;

        error = fsnotify_truncate_perm(path, length);
        if (error)
                return error;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

        if (IS_APPEND(inode)) {
                error = -EPERM;
                goto mnt_drop_write_and_out;
        }

        error = get_write_access(inode);
        if (error)
                goto mnt_drop_write_and_out;

        /*
         * Make sure that there are no leases.  get_write_access() protects
         * against the truncate racing with a lease-granting setlease().
         */
        error = break_lease(inode, O_WRONLY);
        if (error)
                goto put_write_and_out;

        error = security_path_truncate(path);
        if (error)
                goto put_write_and_out;

        error = do_truncate(idmap, path->dentry, length, 0, NULL);

put_write_and_out:
        put_write_access(inode);
mnt_drop_write_and_out:
        mnt_drop_write(path->mnt);

        return error;
}
EXPORT_SYMBOL_GPL(vfs_truncate);

/*
 * do_sys_truncate - resolve a user-supplied path and truncate it
 *
 * Looks @pathname up relative to AT_FDCWD, following symlinks, and calls
 * vfs_truncate() on the result. When retry_estale() asks for it, the
 * lookup is repeated with LOOKUP_REVAL added.
 *
 * Returns -EINVAL for a negative @length, otherwise the lookup error or
 * the result of vfs_truncate().
 */
int do_sys_truncate(const char __user *pathname, loff_t length)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        struct path path;
        int error;

        if (length < 0)        /* sorry, but loff_t says... */
                return -EINVAL;

        for (;;) {
                error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
                if (!error) {
                        error = vfs_truncate(&path, length);
                        path_put(&path);
                }

                if (!retry_estale(error, lookup_flags))
                        return error;

                lookup_flags |= LOOKUP_REVAL;
        }
}

/* truncate syscall: forwards @path and @length to do_sys_truncate(). */
SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
{
        return do_sys_truncate(path, length);
}

#ifdef CONFIG_COMPAT
/* Compat truncate: same as the native entry, with a compat_off_t length. */
COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length)
{
        return do_sys_truncate(path, length);
}
#endif

/*
 * do_ftruncate - truncate an open file to @length
 *
 * @small requests the non-LFS size limit: lengths above MAX_NON_LFS are
 * rejected unless the file was opened with O_LARGEFILE. The file must be a
 * regular file opened for writing and must not be append-only.
 *
 * Returns -EINVAL or -EPERM for the checks above, an error from the
 * security or fsnotify hooks, or the result of do_truncate().
 */
int do_ftruncate(struct file *file, loff_t length, int small)
{
        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        int error;

        /* explicitly opened as large or we are on 64-bit box */
        if (file->f_flags & O_LARGEFILE)
                small = 0;

        if (!(S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)))
                return -EINVAL;

        /* Cannot ftruncate over 2^31 bytes without large file support */
        if (small && length > MAX_NON_LFS)
                return -EINVAL;

        /* Check IS_APPEND on real upper inode */
        if (IS_APPEND(file_inode(file)))
                return -EPERM;

        error = security_file_truncate(file);
        if (error)
                return error;

        error = fsnotify_truncate_perm(&file->f_path, length);
        if (error)
                return error;

        sb_start_write(inode->i_sb);
        error = do_truncate(file_mnt_idmap(file), dentry, length,
                            ATTR_MTIME | ATTR_CTIME, file);
        sb_end_write(inode->i_sb);

        return error;
}

/*
 * do_sys_ftruncate - truncate the file behind descriptor @fd to @length
 *
 * Returns -EINVAL for a negative @length, -EBADF when @fd does not resolve
 * to a file, otherwise the result of do_ftruncate(). @small is passed
 * through unchanged.
 */
int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
        if (length < 0)
                return -EINVAL;
        /*
         * NOTE(review): no explicit release of f appears below; presumably
         * CLASS(fd, ...) drops the reference at scope exit - confirm.
         */
        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return -EBADF;

        return do_ftruncate(fd_file(f), length, small);
}

/*
 * ftruncate syscall: calls do_sys_ftruncate() with small = 1, so
 * do_ftruncate() applies the MAX_NON_LFS limit unless the file was opened
 * with O_LARGEFILE.
 */
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
{
        return do_sys_ftruncate(fd, length, 1);
}

#ifdef CONFIG_COMPAT
/* Compat ftruncate: compat_off_t length, small = 1 like the native entry. */
COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_off_t, length)
{
        return do_sys_ftruncate(fd, length, 1);
}
#endif

/* LFS versions of truncate are only needed on 32 bit machines */
#if BITS_PER_LONG == 32
/* truncate64 syscall: takes a full loff_t length for do_sys_truncate(). */
SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length)
{
        return do_sys_truncate(path, length);
}

/*
 * ftruncate64 syscall: passes small = 0, so do_ftruncate() does not apply
 * the MAX_NON_LFS limit.
 */
SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length)
{
        return do_sys_ftruncate(fd, length, 0);
}
#endif /* BITS_PER_LONG == 32 */

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_TRUNCATE64)
/*
 * Compat truncate64: the 64-bit length arrives as a pair of arguments
 * (compat_arg_u64_dual) and is glued back together before the call.
 * NOTE(review): ksys_truncate() is not defined in this part of the file;
 * presumably a wrapper around do_sys_truncate() - confirm.
 */
COMPAT_SYSCALL_DEFINE3(truncate64, const char __user *, pathname,
                       compat_arg_u64_dual(length))
{
        return ksys_truncate(pathname, compat_arg_u64_glue(length));
}
#endif

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FTRUNCATE64)
/*
 * Compat ftruncate64: the 64-bit length arrives as a pair of arguments
 * (compat_arg_u64_dual) and is glued back together before the call.
 * NOTE(review): ksys_ftruncate() is not defined in this part of the file;
 * presumably a wrapper around do_sys_ftruncate() - confirm.
 */
COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
                       compat_arg_u64_dual(length))
{
        return ksys_ftruncate(fd, compat_arg_u64_glue(length));
}
#endif

/*
 * vfs_fallocate - validate and dispatch a fallocate request
 *
 * Checks the range and @mode, the open mode and inode flags of @file, the
 * security and fsnotify hooks, the file type and the resulting end offset
 * before calling the file's ->fallocate() method. A modify event is
 * generated when the method succeeds.
 *
 * Returns 0 on success or a negative errno: -EINVAL for a bad range,
 * -EOPNOTSUPP for an unsupported mode or a missing ->fallocate(), -EBADF
 * when the file is not open for writing, -EPERM for append-only (other than
 * plain preallocation) or immutable inodes, -ETXTBSY for swapfiles,
 * -ESPIPE / -EISDIR / -ENODEV by file type, -EFBIG when the range overflows
 * or exceeds s_maxbytes, or whatever the hooks or the method return.
 */
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
        struct inode *inode = file_inode(file);
        loff_t end;
        int error;

        if (offset < 0 || len <= 0)
                return -EINVAL;

        if (mode & ~(FALLOC_FL_MODE_MASK | FALLOC_FL_KEEP_SIZE))
                return -EOPNOTSUPP;

        /*
         * Modes are exclusive, even if that is not obvious from the encoding
         * as bit masks and the mix with the flag in the same namespace.
         *
         * To make things even more complicated, FALLOC_FL_ALLOCATE_RANGE is
         * encoded as no bit set.
         */
        switch (mode & FALLOC_FL_MODE_MASK) {
        case FALLOC_FL_PUNCH_HOLE:
                /* Punching a hole requires KEEP_SIZE. */
                if (!(mode & FALLOC_FL_KEEP_SIZE))
                        return -EOPNOTSUPP;
                break;
        case FALLOC_FL_COLLAPSE_RANGE:
        case FALLOC_FL_INSERT_RANGE:
                /* Collapse and insert cannot be combined with KEEP_SIZE. */
                if (mode & FALLOC_FL_KEEP_SIZE)
                        return -EOPNOTSUPP;
                break;
        case FALLOC_FL_ALLOCATE_RANGE:
        case FALLOC_FL_UNSHARE_RANGE:
        case FALLOC_FL_ZERO_RANGE:
                break;
        default:
                return -EOPNOTSUPP;
        }

        if (!(file->f_mode & FMODE_WRITE))
                return -EBADF;

        /*
         * On append-only files only space preallocation is supported;
         * immutable files accept nothing.
         */
        if (((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) ||
            IS_IMMUTABLE(inode))
                return -EPERM;

        /*
         * We cannot allow any fallocate operation on an active swapfile
         */
        if (IS_SWAPFILE(inode))
                return -ETXTBSY;

        /*
         * Revalidate the write permissions, in case security policy has
         * changed since the files were opened.
         */
        error = security_file_permission(file, MAY_WRITE);
        if (error)
                return error;

        error = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
        if (error)
                return error;

        if (S_ISFIFO(inode->i_mode))
                return -ESPIPE;

        if (S_ISDIR(inode->i_mode))
                return -EISDIR;

        if (!(S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)))
                return -ENODEV;

        /* Check for wraparound, then against the filesystem's size limit */
        if (check_add_overflow(offset, len, &end) ||
            end > inode->i_sb->s_maxbytes)
                return -EFBIG;

        if (!file->f_op->fallocate)
                return -EOPNOTSUPP;

        file_start_write(file);
        error = file->f_op->fallocate(file, mode, offset, len);

        /*
         * Create inotify and fanotify events.
         *
         * To keep the logic simple always create events if fallocate succeeds.
         * This implies that events are even created if the file size remains
         * unchanged, e.g. when using flag FALLOC_FL_KEEP_SIZE.
         */
        if (!error)
                fsnotify_modify(file);

        file_end_write(file);

        return error;
}
EXPORT_SYMBOL_GPL(vfs_fallocate);

/*
 * ksys_fallocate - fallocate on the file behind descriptor @fd
 *
 * Returns -EBADF when @fd does not resolve to a file, otherwise the result
 * of vfs_fallocate().
 */
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
        /*
         * NOTE(review): no explicit release of f appears below; presumably
         * CLASS(fd, ...) drops the reference at scope exit - confirm.
         */
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        return vfs_fallocate(fd_file(f), mode, offset, len);
}

/* fallocate syscall: forwards all four arguments to ksys_fallocate(). */
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
{
        return ksys_fallocate(fd, mode, offset, len);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_FALLOCATE)
/*
 * Compat fallocate(2) entry point.  @offset and @len are each declared with
 * compat_arg_u64_dual() (hence six syscall arguments in total) and are
 * recombined with compat_arg_u64_glue() before being passed on to
 * ksys_fallocate().
 */
COMPAT_SYSCALL_DEFINE6(fallocate, int, fd, int, mode, compat_arg_u64_dual(offset),
                       compat_arg_u64_dual(len))
{
        return ksys_fallocate(fd, mode, compat_arg_u64_glue(offset),
                              compat_arg_u64_glue(len));
}
#endif

/*
 * access() has to be evaluated against the real uid/gid rather than the
 * effective ones.  This is done by temporarily switching fsuid/fsgid to the
 * real ids and adjusting the FS-related capabilities.
 *
 * Building a new set of credentials is expensive, so this helper reports
 * whether the override would change anything at all; when it would not, the
 * caller can skip it.
 *
 * The checks here mirror what access_override_creds() modifies; keep the two
 * routines in sync.
 *
 * Return: true if overriding credentials are required, false otherwise.
 */
static bool access_need_override_creds(int flags)
{
        const struct cred *cred;
        kuid_t root_uid;

        /* With AT_EACCESS the current credentials are used as they are. */
        if (flags & AT_EACCESS)
                return false;

        cred = current_cred();
        if (!uid_eq(cred->fsuid, cred->uid))
                return true;
        if (!gid_eq(cred->fsgid, cred->gid))
                return true;

        /* In this mode the override leaves the capabilities alone. */
        if (issecure(SECURE_NO_SETUID_FIXUP))
                return false;

        root_uid = make_kuid(cred->user_ns, 0);
        if (uid_eq(cred->uid, root_uid)) {
                /* Root would end up with cap_effective == cap_permitted. */
                return !cap_isidentical(cred->cap_effective,
                                        cred->cap_permitted);
        }

        /* A non-root user would end up with an empty effective set. */
        return !cap_isclear(cred->cap_effective);
}

/*
 * Build and install the temporary credentials used for an access() check:
 * fsuid/fsgid are set to the real uid/gid and, unless SECURE_NO_SETUID_FIXUP
 * is in effect, the effective capability set is adjusted as well.
 *
 * Return: the value returned by override_creds() (the caller later hands it
 * to revert_creds()), or NULL if prepare_creds() failed.
 */
static const struct cred *access_override_creds(void)
{
        struct cred *new_cred = prepare_creds();

        if (!new_cred)
                return NULL;

        /*
         * XXX access_need_override_creds() inspects the very same fields in
         * the hope of skipping this work entirely.  Any change made here has
         * to be reflected there.
         */
        new_cred->fsuid = new_cred->uid;
        new_cred->fsgid = new_cred->gid;

        if (!issecure(SECURE_NO_SETUID_FIXUP)) {
                kuid_t root_uid = make_kuid(new_cred->user_ns, 0);

                if (uid_eq(new_cred->uid, root_uid)) {
                        new_cred->cap_effective = new_cred->cap_permitted;
                } else {
                        /* Switching to a non-root user: drop the capabilities */
                        cap_clear(new_cred->cap_effective);
                }
        }

        /*
         * These credentials may *only* be used in task-synchronous
         * circumstances and therefore do not need RCU freeing, unless
         * somebody then takes a separate reference to them.
         *
         * NOTE! That is _only_ true because the credential is used purely
         * for override_creds(), which installs it as the subjective cred.
         * Other threads access ->real_cred, not the subjective cred.
         *
         * Should somebody _does_ copy it (via 'get_current_cred()'), the
         * non_rcu field gets cleared, because that other user may now expect
         * RCU freeing.  Normal thread-synchronous cred accesses keep things
         * non-racy and avoid the RCU freeing.
         */
        new_cred->non_rcu = 1;

        return override_creds(new_cred);
}

/*
 * do_faccessat - common implementation behind access(), faccessat() and
 *                faccessat2()
 * @dfd:      directory file descriptor handed to user_path_at()
 * @filename: userspace pointer to the path to check
 * @mode:     requested access bits; any bit outside S_IRWXO is rejected
 * @flags:    any of AT_EACCESS, AT_SYMLINK_NOFOLLOW, AT_EMPTY_PATH
 *
 * When access_need_override_creds() says so, the check runs under the
 * temporary credentials installed by access_override_creds(); they are
 * reverted and released before returning.
 *
 * Return: 0 if the access is permitted, -EINVAL for unsupported @mode or
 * @flags bits, -ENOMEM if the override credentials could not be set up,
 * -EACCES for an exec check on a regular file when path_noexec() is true,
 * -EROFS for a write check on a read-only mount, or the error returned by
 * user_path_at() / inode_permission().
 */
static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
        struct path path;
        struct inode *inode;
        int res;
        unsigned int lookup_flags = LOOKUP_FOLLOW;
        const struct cred *old_cred = NULL;

        if (mode & ~S_IRWXO)        /* where's F_OK, X_OK, W_OK, R_OK? */
                return -EINVAL;

        if (flags & ~(AT_EACCESS | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (flags & AT_SYMLINK_NOFOLLOW)
                lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        /* old_cred stays NULL when no override is installed (see "out:"). */
        if (access_need_override_creds(flags)) {
                old_cred = access_override_creds();
                if (!old_cred)
                        return -ENOMEM;
        }

retry:
        res = user_path_at(dfd, filename, lookup_flags, &path);
        if (res)
                goto out;

        inode = d_backing_inode(path.dentry);

        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                /*
                 * MAY_EXEC on regular files is denied if the fs is mounted
                 * with the "noexec" flag.
                 */
                res = -EACCES;
                if (path_noexec(&path))
                        goto out_path_release;
        }

        res = inode_permission(mnt_idmap(path.mnt), inode, mode | MAY_ACCESS);
        /* SuS v2 requires we report a read only fs too */
        if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
                goto out_path_release;
        /*
         * This is a rare case where using __mnt_is_readonly()
         * is OK without a mnt_want/drop_write() pair.  Since
         * no actual write to the fs is performed here, we do
         * not need to telegraph that to anyone.
         *
         * By doing this, we accept that this access is
         * inherently racy and know that the fs may change
         * state before we even see this result.
         */
        if (__mnt_is_readonly(path.mnt))
                res = -EROFS;

out_path_release:
        path_put(&path);
        /* Redo the lookup with LOOKUP_REVAL when retry_estale() asks for it. */
        if (retry_estale(res, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        /* Restore the original credentials and drop the temporary ones. */
        if (old_cred)
                put_cred(revert_creds(old_cred));

        return res;
}

/* faccessat(2): do_faccessat() with no flags set. */
SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
{
        return do_faccessat(dfd, filename, mode, 0);
}

/*
 * faccessat2(2): like faccessat() but passes the caller-supplied @flags on to
 * do_faccessat(), which validates them.
 */
SYSCALL_DEFINE4(faccessat2, int, dfd, const char __user *, filename, int, mode,
                int, flags)
{
        return do_faccessat(dfd, filename, mode, flags);
}

/* access(2): do_faccessat() with AT_FDCWD as directory fd and no flags. */
SYSCALL_DEFINE2(access, const char __user *, filename, int, mode)
{
        return do_faccessat(AT_FDCWD, filename, mode, 0);
}

/*
 * chdir(2): look up @filename as a directory (following symlinks), check
 * MAY_EXEC | MAY_CHDIR permission on it and, if that succeeds, make it the
 * current working directory of current->fs.
 *
 * Returns 0 on success or the error from the lookup / permission check.
 */
SYSCALL_DEFINE1(chdir, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

retry:
        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (!error) {
                error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
                if (!error)
                        set_fs_pwd(current->fs, &path);

                path_put(&path);
                /* Repeat the lookup with LOOKUP_REVAL if retry_estale() says so. */
                if (retry_estale(error, lookup_flags)) {
                        lookup_flags |= LOOKUP_REVAL;
                        goto retry;
                }
        }
        return error;
}

/*
 * fchdir(2): make the directory referred to by @fd the current working
 * directory of current->fs.
 *
 * Returns -EBADF if no file was found for @fd, -ENOTDIR if d_can_lookup()
 * rejects the file's dentry, the error from the MAY_EXEC | MAY_CHDIR
 * permission check, or 0 on success.
 */
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
        CLASS(fd_raw, f)(fd);
        int error;

        if (fd_empty(f))
                return -EBADF;

        if (!d_can_lookup(fd_file(f)->f_path.dentry))
                return -ENOTDIR;

        error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
        if (error)
                return error;

        set_fs_pwd(current->fs, &fd_file(f)->f_path);
        return 0;
}

/*
 * chroot(2): look up @filename as a directory and make it the root of
 * current->fs.
 *
 * The caller needs MAY_EXEC | MAY_CHDIR permission on the directory and
 * CAP_SYS_CHROOT in its user namespace (-EPERM otherwise); the
 * security_path_chroot() hook may veto the change as well.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE1(chroot, const char __user *, filename)
{
        unsigned int lookup_flags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
        struct path path;
        int error;

retry:
        error = user_path_at(AT_FDCWD, filename, lookup_flags, &path);
        if (error)
                return error;

        error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto release;

        if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) {
                error = -EPERM;
                goto release;
        }

        error = security_path_chroot(&path);
        if (!error)
                set_fs_root(current->fs, &path);

release:
        path_put(&path);
        /* Repeat the lookup with LOOKUP_REVAL if retry_estale() says so. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/*
 * chmod_common - change the permission bits of the inode behind @path
 * @path: object whose mode is changed
 * @mode: new mode; only the S_IALLUGO bits are taken from it
 *
 * Takes write access to the mount for the duration of the call.  The change
 * is made under the inode lock via notify_change().  If notify_change()
 * reports a delegated inode, the delegation is broken with
 * break_deleg_wait() and the whole locked section is retried.
 *
 * Return: 0 on success or a negative errno.
 */
int chmod_common(const struct path *path, umode_t mode)
{
        struct inode *delegated_inode = NULL;
        struct inode *inode = path->dentry->d_inode;
        struct iattr newattrs;
        int error;

        error = mnt_want_write(path->mnt);
        if (error)
                return error;

retry_deleg:
        inode_lock(inode);
        error = security_path_chmod(path, mode);
        if (!error) {
                /* Replace the S_IALLUGO bits, keep the rest of i_mode. */
                newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
                newattrs.ia_mode = (inode->i_mode & ~S_IALLUGO) |
                                   (mode & S_IALLUGO);
                error = notify_change(mnt_idmap(path->mnt), path->dentry,
                                      &newattrs, &delegated_inode);
        }
        inode_unlock(inode);

        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        mnt_drop_write(path->mnt);
        return error;
}

/*
 * vfs_fchmod - change the mode of an open file
 * @file: open file whose path is passed to chmod_common()
 * @mode: new mode
 *
 * Calls audit_file() on @file, then applies @mode through chmod_common().
 *
 * Return: the result of chmod_common().
 */
int vfs_fchmod(struct file *file, umode_t mode)
{
        audit_file(file);
        return chmod_common(&file->f_path, mode);
}

/*
 * fchmod(2): resolve @fd and apply @mode through vfs_fchmod().
 *
 * Returns -EBADF if no file was found for @fd, otherwise the result of
 * vfs_fchmod().
 */
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
        CLASS(fd, f)(fd);

        if (!fd_empty(f))
                return vfs_fchmod(fd_file(f), mode);

        return -EBADF;
}

/*
 * Common implementation of chmod(), fchmodat() and fchmodat2().
 *
 * @flags may contain AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH; any other bit
 * yields -EINVAL.  The path is looked up relative to @dfd and @mode is
 * applied with chmod_common().
 *
 * Returns 0 on success or a negative errno.
 */
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
                       unsigned int flags)
{
        unsigned int lookup_flags = 0;
        struct path path;
        int error;

        if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)))
                return -EINVAL;

        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags = LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                return error;

        error = chmod_common(&path, mode);
        path_put(&path);

        /* Repeat the lookup with LOOKUP_REVAL if retry_estale() says so. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/*
 * fchmodat2(2): passes the caller-supplied @flags on to do_fchmodat(), which
 * validates them.
 */
SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename,
                umode_t, mode, unsigned int, flags)
{
        return do_fchmodat(dfd, filename, mode, flags);
}

/* fchmodat(2): do_fchmodat() with no flags set. */
SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
                umode_t, mode)
{
        return do_fchmodat(dfd, filename, mode, 0);
}

/* chmod(2): do_fchmodat() with AT_FDCWD as directory fd and no flags. */
SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode)
{
        return do_fchmodat(AT_FDCWD, filename, mode, 0);
}

/*
 * If @kuid is valid, record it in @attr: ATTR_UID is added to ia_valid and
 * ia_vfsuid is set to the vfsuid_t built from @kuid.  @attr is left untouched
 * when @kuid is invalid.
 *
 * Return: true if @kuid is valid, false if not.
 */
static inline bool setattr_vfsuid(struct iattr *attr, kuid_t kuid)
{
        bool valid = uid_valid(kuid);

        if (valid) {
                attr->ia_valid |= ATTR_UID;
                attr->ia_vfsuid = VFSUIDT_INIT(kuid);
        }
        return valid;
}

/*
 * If @kgid is valid, record it in @attr: ATTR_GID is added to ia_valid and
 * ia_vfsgid is set to the vfsgid_t built from @kgid.  @attr is left untouched
 * when @kgid is invalid.
 *
 * Return: true if @kgid is valid, false if not.
 */
static inline bool setattr_vfsgid(struct iattr *attr, kgid_t kgid)
{
        bool valid = gid_valid(kgid);

        if (valid) {
                attr->ia_valid |= ATTR_GID;
                attr->ia_vfsgid = VFSGIDT_INIT(kgid);
        }
        return valid;
}

/*
 * chown_common - change owner and/or group of the inode behind @path
 * @path:  object whose ownership is changed
 * @user:  new owner, interpreted in the caller's user namespace;
 *         (uid_t)-1 leaves the owner unchanged
 * @group: new group, interpreted in the caller's user namespace;
 *         (gid_t)-1 leaves the group unchanged
 *
 * This function does not take mount write access itself; the callers visible
 * in this file (do_fchownat(), vfs_fchown()) do so around the call.
 *
 * Return: 0 on success, -EINVAL if a requested id is not valid after mapping
 * through current_user_ns(), or the error from security_path_chown(),
 * notify_change() or break_deleg_wait().
 */
int chown_common(const struct path *path, uid_t user, gid_t group)
{
        struct mnt_idmap *idmap;
        struct user_namespace *fs_userns;
        struct inode *inode = path->dentry->d_inode;
        struct inode *delegated_inode = NULL;
        int error;
        struct iattr newattrs;
        kuid_t uid;
        kgid_t gid;

        uid = make_kuid(current_user_ns(), user);
        gid = make_kgid(current_user_ns(), group);

        idmap = mnt_idmap(path->mnt);
        fs_userns = i_user_ns(inode);

retry_deleg:
        /* newattrs is rebuilt from scratch on every pass, including retries. */
        newattrs.ia_vfsuid = INVALID_VFSUID;
        newattrs.ia_vfsgid = INVALID_VFSGID;
        newattrs.ia_valid =  ATTR_CTIME;
        if ((user != (uid_t)-1) && !setattr_vfsuid(&newattrs, uid))
                return -EINVAL;
        if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid))
                return -EINVAL;
        inode_lock(inode);
        /* For non-directories also request dropping setuid/privs (and setgid if needed). */
        if (!S_ISDIR(inode->i_mode))
                newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV |
                                     setattr_should_drop_sgid(idmap, inode);
        /* Continue to send actual fs values, not the mount values. */
        error = security_path_chown(
                path,
                from_vfsuid(idmap, fs_userns, newattrs.ia_vfsuid),
                from_vfsgid(idmap, fs_userns, newattrs.ia_vfsgid));
        if (!error)
                error = notify_change(idmap, path->dentry, &newattrs,
                                      &delegated_inode);
        inode_unlock(inode);
        /* A delegation was in the way: break it, then redo the attempt. */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        return error;
}

/*
 * Common implementation of chown(), lchown() and fchownat().
 *
 * @flag may contain AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH; any other bit
 * yields -EINVAL.  The path is looked up relative to @dfd, write access to
 * its mount is taken, and the ownership change is done by chown_common().
 *
 * Returns 0 on success or a negative errno.
 */
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
                int flag)
{
        struct path path;
        int lookup_flags = 0;
        int error;

        if (flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        if (!(flag & AT_SYMLINK_NOFOLLOW))
                lookup_flags = LOOKUP_FOLLOW;
        if (flag & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

retry:
        error = user_path_at(dfd, filename, lookup_flags, &path);
        if (error)
                return error;

        error = mnt_want_write(path.mnt);
        if (!error) {
                error = chown_common(&path, user, group);
                mnt_drop_write(path.mnt);
        }
        path_put(&path);

        /* Repeat the lookup with LOOKUP_REVAL if retry_estale() says so. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
        return error;
}

/* fchownat(2): forwards all arguments unchanged to do_fchownat(). */
SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
                gid_t, group, int, flag)
{
        return do_fchownat(dfd, filename, user, group, flag);
}

/* chown(2): do_fchownat() with AT_FDCWD as directory fd and no flags. */
SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group)
{
        return do_fchownat(AT_FDCWD, filename, user, group, 0);
}

/*
 * lchown(2): like chown() but passes AT_SYMLINK_NOFOLLOW, so do_fchownat()
 * performs the lookup without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group)
{
        return do_fchownat(AT_FDCWD, filename, user, group,
                           AT_SYMLINK_NOFOLLOW);
}

/*
 * vfs_fchown - change owner and/or group of an open file
 * @file:  open file whose path is passed to chown_common()
 * @user:  new owner id, handed to chown_common()
 * @group: new group id, handed to chown_common()
 *
 * Write access to the file's mount is held around the audit_file() and
 * chown_common() calls.
 *
 * Return: the error from mnt_want_write_file() if write access could not be
 * obtained, otherwise the result of chown_common().
 */
int vfs_fchown(struct file *file, uid_t user, gid_t group)
{
        int error = mnt_want_write_file(file);

        if (!error) {
                audit_file(file);
                error = chown_common(&file->f_path, user, group);
                mnt_drop_write_file(file);
        }
        return error;
}

/*
 * Descriptor-based front end for vfs_fchown(): resolve @fd and pass the file
 * together with @user and @group to vfs_fchown().
 *
 * Return: -EBADF if no file was found for @fd, otherwise the result of
 * vfs_fchown().
 */
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
        CLASS(fd, f)(fd);

        if (!fd_empty(f))
                return vfs_fchown(fd_file(f), user, group);

        return -EBADF;
}

/* fchown(2): forwards its arguments unchanged to ksys_fchown(). */
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
{
        return ksys_fchown(fd, user, group);
}

/*
 * Acquire everything needed to open @f for writing: write access on the
 * inode, write access on the mount of f->f_path and, for FMODE_BACKING
 * files, write access on the mount of backing_file_user_path(f) as well.
 *
 * On failure everything acquired so far is released again in reverse order.
 *
 * Return: 0 on success or the error from the failing acquisition.
 */
static inline int file_get_write_access(struct file *f)
{
        int error;

        error = get_write_access(f->f_inode);
        if (unlikely(error))
                return error;

        error = mnt_get_write_access(f->f_path.mnt);
        if (unlikely(error)) {
                put_write_access(f->f_inode);
                return error;
        }

        if (unlikely(f->f_mode & FMODE_BACKING)) {
                error = mnt_get_write_access(backing_file_user_path(f)->mnt);
                if (unlikely(error)) {
                        mnt_put_write_access(f->f_path.mnt);
                        put_write_access(f->f_inode);
                        return error;
                }
        }

        return 0;
}

/*
 * do_dentry_open - finish setting up @f for the path already stored in
 *                  f->f_path
 * @f:    file being opened; f->f_path, f->f_flags and f->f_mode are read here
 * @open: open callback to invoke, or NULL to use f->f_op->open (if any)
 *
 * Takes a reference on f->f_path, fills in the inode/mapping fields, acquires
 * read/write accounting as required by f->f_mode, runs the security,
 * fsnotify-permission and lease checks, calls the open callback and finally
 * derives the FMODE_* capability bits from the file operations.
 *
 * For O_PATH opens only the path reference and basic fields are set up and
 * the file gets an empty file_operations table.
 *
 * Return: 0 on success or a negative errno.  On the cleanup_all/cleanup_file
 * paths the path reference is dropped and f_path/f_inode are cleared.
 */
static int do_dentry_open(struct file *f,
                          int (*open)(struct inode *, struct file *))
{
        static const struct file_operations empty_fops = {};
        struct inode *inode = f->f_path.dentry->d_inode;
        int error;

        path_get(&f->f_path);
        f->f_inode = inode;
        f->f_mapping = inode->i_mapping;
        f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
        f->f_sb_err = file_sample_sb_err(f);

        /* O_PATH: no access accounting, no ->open(), no notifications. */
        if (unlikely(f->f_flags & O_PATH)) {
                f->f_mode = FMODE_PATH | FMODE_OPENED;
                file_set_fsnotify_mode(f, FMODE_NONOTIFY);
                f->f_op = &empty_fops;
                return 0;
        }

        /* Read-only opens bump the read count; writers take write access. */
        if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
                i_readcount_inc(inode);
        } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
                error = file_get_write_access(f);
                if (unlikely(error))
                        goto cleanup_file;
                f->f_mode |= FMODE_WRITER;
        }

        /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                f->f_mode |= FMODE_ATOMIC_POS;

        f->f_op = fops_get(inode->i_fop);
        if (WARN_ON(!f->f_op)) {
                error = -ENODEV;
                goto cleanup_all;
        }

        error = security_file_open(f);
        if (error)
                goto cleanup_all;

        /*
         * Set FMODE_NONOTIFY_* bits according to existing permission watches.
         * If FMODE_NONOTIFY mode was already set for an fanotify fd or for a
         * pseudo file, this call will not change the mode.
         */
        file_set_fsnotify_mode_from_watchers(f);
        error = fsnotify_open_perm(f);
        if (error)
                goto cleanup_all;

        error = break_lease(file_inode(f), f->f_flags);
        if (error)
                goto cleanup_all;

        /* normally all 3 are set; ->open() can clear them if needed */
        f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        if (!open)
                open = f->f_op->open;
        if (open) {
                error = open(inode, f);
                if (error)
                        goto cleanup_all;
        }
        f->f_mode |= FMODE_OPENED;
        /* Advertise only the capabilities the file operations can back. */
        if ((f->f_mode & FMODE_READ) &&
             likely(f->f_op->read || f->f_op->read_iter))
                f->f_mode |= FMODE_CAN_READ;
        if ((f->f_mode & FMODE_WRITE) &&
             likely(f->f_op->write || f->f_op->write_iter))
                f->f_mode |= FMODE_CAN_WRITE;
        if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
                f->f_mode &= ~FMODE_LSEEK;
        if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
                f->f_mode |= FMODE_CAN_ODIRECT;

        /* These flags only matter while opening; do not keep them. */
        f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
        f->f_iocb_flags = iocb_flags(f);

        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

        /*
         * NOTE(review): this returns with FMODE_OPENED set and without going
         * through cleanup_all; presumably the caller's fput() undoes the
         * open - confirm against the callers.
         */
        if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
                return -EINVAL;

        /*
         * XXX: Huge page cache doesn't support writing yet. Drop all page
         * cache for this file before processing writes.
         */
        if (f->f_mode & FMODE_WRITE) {
                /*
                 * Depends on full fence from get_write_access() to synchronize
                 * against collapse_file() regarding i_writecount and nr_thps
                 * updates. Ensures subsequent insertion of THPs into the page
                 * cache will fail.
                 */
                if (filemap_nr_thps(inode->i_mapping)) {
                        struct address_space *mapping = inode->i_mapping;

                        filemap_invalidate_lock(inode->i_mapping);
                        /*
                         * unmap_mapping_range() only needs to be called once
                         * here, because private pages do not need to be
                         * unmapped (e.g. the data segment of dynamic shared
                         * libraries).
                         */
                        unmap_mapping_range(mapping, 0, 0, 0);
                        truncate_inode_pages(mapping, 0);
                        filemap_invalidate_unlock(inode->i_mapping);
                }
        }

        return 0;

cleanup_all:
        /* An open callback must not return a positive value; normalize it. */
        if (WARN_ON_ONCE(error > 0))
                error = -EINVAL;
        fops_put(f->f_op);
        put_file_access(f);
cleanup_file:
        path_put(&f->f_path);
        f->f_path.mnt = NULL;
        f->f_path.dentry = NULL;
        f->f_inode = NULL;
        return error;
}

/**
 * finish_open - finish opening a file
 * @file: file pointer
 * @dentry: pointer to dentry
 * @open: open callback
 *
 * This can be used to finish opening a file passed to i_op->atomic_open().
 *
 * If the open callback is set to NULL, then the standard f_op->open()
 * filesystem callback is substituted.
 *
 * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
 * the return value of d_splice_alias(), then the caller needs to perform dput()
 * on it after finish_open().
 *
 * Calling this on a file that already has FMODE_OPENED set is a bug and
 * triggers BUG_ON().
 *
 * Returns zero on success or -errno if the open failed.
 */
int finish_open(struct file *file, struct dentry *dentry,
                int (*open)(struct inode *, struct file *))
{
        BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */

        /* Only the dentry is set here; the rest is done by do_dentry_open(). */
        file->f_path.dentry = dentry;
        return do_dentry_open(file, open);
}
EXPORT_SYMBOL(finish_open);

/**
 * finish_no_open - finish ->atomic_open() without opening the file
 *
 * @file: file pointer
 * @dentry: dentry or NULL (as returned from ->lookup())
 *
 * This can be used to set the result of a successful lookup in ->atomic_open().
 * The function itself only stores @dentry in @file->f_path.dentry.
 *
 * NB: unlike finish_open() this function does consume the dentry reference and
 * the caller need not dput() it.
 *
 * Returns "0" which must be the return value of ->atomic_open() after having
 * called this function.
 */
int finish_no_open(struct file *file, struct dentry *dentry)
{
        file->f_path.dentry = dentry;
        return 0;
}
EXPORT_SYMBOL(finish_no_open);

/*
 * file_path - build the path name of an open file
 * @filp:   file whose f_path is passed to d_path()
 * @buf:    buffer handed to d_path()
 * @buflen: size of @buf
 *
 * Thin wrapper around d_path(); returns whatever d_path() returns.
 */
char *file_path(struct file *filp, char *buf, int buflen)
{
        return d_path(&filp->f_path, buf, buflen);
}
EXPORT_SYMBOL(file_path);

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 *
 * Copies @path into @file and completes the open via do_dentry_open().
 *
 * Returns zero on success or the error from do_dentry_open().
 */
int vfs_open(const struct path *path, struct file *file)
{
        int ret;

        file->f_path = *path;

        ret = do_dentry_open(file, NULL);
        if (ret)
                return ret;

        /*
         * A file handed back with FMODE_OPENED will have fsnotify_close()
         * called on it by __fput(); issue the matching fsnotify_open() here
         * to keep the two symmetric.
         */
        fsnotify_open(file);
        return 0;
}

struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *cred)
{
        int error;
        struct file *f;

        /* We must always pass in a valid mount pointer. */
        BUG_ON(!path->mnt);

        f = alloc_empty_file(flags, cred);
        if (!IS_ERR(f)) {
                error = vfs_open(path, f);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}
EXPORT_SYMBOL(dentry_open);

struct file *dentry_open_nonotify(const struct path *path, int flags,
                                  const struct cred *cred)
{
        struct file *f = alloc_empty_file(flags, cred);
        if (!IS_ERR(f)) {
                int error;

                file_set_fsnotify_mode(f, FMODE_NONOTIFY);
                error = vfs_open(path, f);
                if (error) {
                        fput(f);
                        f = ERR_PTR(error);
                }
        }
        return f;
}

/**
 * dentry_create - Create and open a file
 * @path: path to create
 * @flags: O_ flags
 * @mode: mode bits for new file
 * @cred: credentials to use
 *
 * The caller must hold the lock of the parent directory and must have placed
 * a prepared negative dentry for the new file in @path->dentry.
 *
 * @path->mnt must be set by the caller to the vfsmount of the filesystem the
 * file is created on.  The parent directory and the negative dentry have to
 * live on the same filesystem instance.
 *
 * The file is created with vfs_create() in the dentry's parent directory and
 * then opened with vfs_open().
 *
 * On success, returns a "struct file *". Otherwise an ERR_PTR is returned.
 */
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred)
{
        struct file *f;
        int error;

        f = alloc_empty_file(flags, cred);
        if (IS_ERR(f))
                return f;

        error = vfs_create(mnt_idmap(path->mnt),
                           d_inode(path->dentry->d_parent),
                           path->dentry, mode, true);
        if (unlikely(error))
                goto fail;

        error = vfs_open(path, f);
        if (unlikely(error))
                goto fail;

        return f;

fail:
        fput(f);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(dentry_create);

/**
 * kernel_file_open - open a file for kernel internal use
 * @path:        path of the file to open
 * @flags:        open flags
 * @cred:        credentials for open
 *
 * Opens a file on behalf of an in-kernel consumer.  The file is allocated
 * with alloc_empty_file_noaccount(), so it is not accounted against nr_files,
 * and it must not be installed into the file descriptor table.
 *
 * Return: Opened file on success, an error pointer on failure.
 */
struct file *kernel_file_open(const struct path *path, int flags,
                                const struct cred *cred)
{
        struct file *f;
        int error;

        f = alloc_empty_file_noaccount(flags, cred);
        if (IS_ERR(f))
                return f;

        f->f_path = *path;

        error = do_dentry_open(f, NULL);
        if (!error) {
                fsnotify_open(f);
                return f;
        }

        fput(f);
        return ERR_PTR(error);
}
EXPORT_SYMBOL_GPL(kernel_file_open);

/*
 * WILL_CREATE - non-zero when @flags contain O_CREAT or __O_TMPFILE.
 * The argument is parenthesized so that a compound expression such as
 * "a | b" is masked as a whole instead of only its last operand.
 */
#define WILL_CREATE(flags)        ((flags) & (O_CREAT | __O_TMPFILE))
/* The open flags that are allowed to accompany O_PATH. */
#define O_PATH_FLAGS                (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)

/*
 * Build a struct open_how from legacy open()-style arguments.
 *
 * Bits outside VALID_OPEN_FLAGS are dropped from @flags and bits outside
 * S_IALLUGO from @mode; all other fields of the result are zero.
 */
inline struct open_how build_open_how(int flags, umode_t mode)
{
        struct open_how how = {
                .mode = mode & S_IALLUGO,
                .flags = flags & VALID_OPEN_FLAGS,
        };

        /* O_PATH beats everything else: keep only what may accompany it. */
        if (how.flags & O_PATH)
                how.flags &= O_PATH_FLAGS;

        /* A mode is only meaningful for create-like flags. */
        if (WILL_CREATE(how.flags))
                return how;

        how.mode = 0;
        return how;
}

/*
 * build_open_flags - validate a struct open_how and translate it into the
 *                    internal struct open_flags
 * @how: flags, mode and resolve bits as supplied by the caller
 * @op:  filled in with open_flag, mode, acc_mode, intent and lookup_flags
 *
 * Return: 0 on success; -EINVAL for unknown or inconsistent flags, mode or
 * resolve bits; -EAGAIN when RESOLVE_CACHED is combined with O_TRUNC,
 * O_CREAT or __O_TMPFILE.  Note that some fields of @op may already have
 * been written when an error is returned.
 */
inline int build_open_flags(const struct open_how *how, struct open_flags *op)
{
        u64 flags = how->flags;
        u64 strip = O_CLOEXEC;
        int lookup_flags = 0;
        int acc_mode = ACC_MODE(flags);

        BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
                         "struct open_flags doesn't yet handle flags > 32 bits");

        /*
         * Strip flags that aren't relevant in determining struct open_flags.
         */
        flags &= ~strip;

        /*
         * Older syscalls implicitly clear all of the invalid flags or argument
         * values before calling build_open_flags(), but openat2(2) checks all
         * of its arguments.
         */
        if (flags & ~VALID_OPEN_FLAGS)
                return -EINVAL;
        if (how->resolve & ~VALID_RESOLVE_FLAGS)
                return -EINVAL;

        /* Scoping flags are mutually exclusive. */
        if ((how->resolve & RESOLVE_BENEATH) && (how->resolve & RESOLVE_IN_ROOT))
                return -EINVAL;

        /* Deal with the mode. */
        if (WILL_CREATE(flags)) {
                if (how->mode & ~S_IALLUGO)
                        return -EINVAL;
                op->mode = how->mode | S_IFREG;
        } else {
                if (how->mode != 0)
                        return -EINVAL;
                op->mode = 0;
        }

        /*
         * Block bugs where O_DIRECTORY | O_CREAT created regular files.
         * Note, that blocking O_DIRECTORY | O_CREAT here also protects
         * O_TMPFILE below which requires O_DIRECTORY being raised.
         */
        if ((flags & (O_DIRECTORY | O_CREAT)) == (O_DIRECTORY | O_CREAT))
                return -EINVAL;

        /* Now handle the creative implementation of O_TMPFILE. */
        if (flags & __O_TMPFILE) {
                /*
                 * In order to ensure programs get explicit errors when trying
                 * to use O_TMPFILE on old kernels we enforce that O_DIRECTORY
                 * is raised alongside __O_TMPFILE.
                 */
                if (!(flags & O_DIRECTORY))
                        return -EINVAL;
                if (!(acc_mode & MAY_WRITE))
                        return -EINVAL;
        }
        if (flags & O_PATH) {
                /* O_PATH only permits certain other flags to be set. */
                if (flags & ~O_PATH_FLAGS)
                        return -EINVAL;
                acc_mode = 0;
        }

        /*
         * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
         * check for O_DSYNC if they need any syncing at all, we enforce that
         * it is always set instead of having to deal with possibly weird
         * behaviour for malicious applications setting only __O_SYNC.
         */
        if (flags & __O_SYNC)
                flags |= O_DSYNC;

        op->open_flag = flags;

        /* O_TRUNC implies we need access checks for write permissions */
        if (flags & O_TRUNC)
                acc_mode |= MAY_WRITE;

        /* Allow the LSM permission hook to distinguish append
           access from general write access. */
        if (flags & O_APPEND)
                acc_mode |= MAY_APPEND;

        op->acc_mode = acc_mode;

        op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;

        if (flags & O_CREAT) {
                op->intent |= LOOKUP_CREATE;
                if (flags & O_EXCL) {
                        op->intent |= LOOKUP_EXCL;
                        /*
                         * Only affects the lookup_flags computed below;
                         * op->open_flag has already been stored above.
                         */
                        flags |= O_NOFOLLOW;
                }
        }

        if (flags & O_DIRECTORY)
                lookup_flags |= LOOKUP_DIRECTORY;
        if (!(flags & O_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        /* Map each RESOLVE_* bit onto its LOOKUP_* counterpart. */
        if (how->resolve & RESOLVE_NO_XDEV)
                lookup_flags |= LOOKUP_NO_XDEV;
        if (how->resolve & RESOLVE_NO_MAGICLINKS)
                lookup_flags |= LOOKUP_NO_MAGICLINKS;
        if (how->resolve & RESOLVE_NO_SYMLINKS)
                lookup_flags |= LOOKUP_NO_SYMLINKS;
        if (how->resolve & RESOLVE_BENEATH)
                lookup_flags |= LOOKUP_BENEATH;
        if (how->resolve & RESOLVE_IN_ROOT)
                lookup_flags |= LOOKUP_IN_ROOT;
        if (how->resolve & RESOLVE_CACHED) {
                /* Don't bother even trying for create/truncate/tmpfile open */
                if (flags & (O_TRUNC | O_CREAT | __O_TMPFILE))
                        return -EAGAIN;
                lookup_flags |= LOOKUP_CACHED;
        }

        op->lookup_flags = lookup_flags;
        return 0;
}

/**
 * file_open_name - open file and return file pointer
 *
 * @name:        struct filename containing path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  But in general you should not do this, so please move
 * along, nothing to see here..
 *
 * The path is opened relative to AT_FDCWD.  Returns the opened file or an
 * ERR_PTR on failure.
 */
struct file *file_open_name(struct filename *name, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (!err)
                return do_filp_open(AT_FDCWD, name, &op);

        return ERR_PTR(err);
}

/**
 * filp_open - open file and return file pointer
 *
 * @filename:        path to open
 * @flags:        open flags as per the open(2) second argument
 * @mode:        mode for the new file if O_CREAT is set, else ignored
 *
 * This is the helper to open a file from kernelspace if you really
 * have to.  But in general you should not do this, so please move
 * along, nothing to see here..
 *
 * Wraps @filename with getname_kernel() and opens it via file_open_name().
 * Returns the opened file or an ERR_PTR on failure.
 */
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
        struct filename *name = getname_kernel(filename);
        struct file *file;

        if (IS_ERR(name))
                return ERR_CAST(name);

        file = file_open_name(name, flags, mode);
        putname(name);
        return file;
}
EXPORT_SYMBOL(filp_open);

/*
 * Open @filename relative to @root.
 *
 * @flags and @mode are converted with build_open_how() and
 * build_open_flags(); if that conversion fails the error is returned as
 * an ERR_PTR().  Otherwise the result of do_file_open_root() is returned.
 */
struct file *file_open_root(const struct path *root,
                            const char *filename, int flags, umode_t mode)
{
        struct open_how how = build_open_how(flags, mode);
        struct open_flags op;
        int err;

        err = build_open_flags(&how, &op);
        if (err)
                return ERR_PTR(err);

        return do_file_open_root(root, filename, &op);
}
EXPORT_SYMBOL(file_open_root);

/*
 * Common implementation behind the open family of syscalls.
 *
 * Validates @how via build_open_flags(), obtains a struct filename for
 * the user-supplied @filename, reserves a file descriptor and opens the
 * file relative to @dfd.  On success the file is installed into the
 * reserved descriptor, which is returned.  On failure the reserved
 * descriptor (if any) is released and a negative error is returned.
 */
static int do_sys_openat2(int dfd, const char __user *filename,
                          struct open_how *how)
{
        struct open_flags op;
        struct filename *name;
        struct file *file;
        int fd;

        fd = build_open_flags(how, &op);
        if (unlikely(fd))
                return fd;

        name = getname(filename);
        if (IS_ERR(name))
                return PTR_ERR(name);

        fd = get_unused_fd_flags(how->flags);
        if (unlikely(fd < 0))
                goto out_putname;

        file = do_filp_open(dfd, name, &op);
        if (IS_ERR(file)) {
                /* Give the reserved descriptor back and report the error. */
                put_unused_fd(fd);
                fd = PTR_ERR(file);
                goto out_putname;
        }
        fd_install(fd, file);

out_putname:
        putname(name);
        return fd;
}

/*
 * Open @filename relative to @dfd using classic open(2)-style @flags and
 * @mode.  The arguments are packed into a struct open_how and handed to
 * do_sys_openat2(), whose return value is passed through.
 */
int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
        struct open_how how;

        how = build_open_how(flags, mode);

        return do_sys_openat2(dfd, filename, &how);
}


/*
 * open(2): open @filename relative to the current working directory.
 * O_LARGEFILE is added to @flags when force_o_largefile() says so.
 */
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        const int largefile = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, filename, flags | largefile, mode);
}

/*
 * openat(2): open @filename relative to @dfd.
 * O_LARGEFILE is added to @flags when force_o_largefile() says so.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
                umode_t, mode)
{
        const int largefile = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(dfd, filename, flags | largefile, mode);
}

/*
 * openat2(2): open @filename relative to @dfd as described by the
 * user-supplied struct open_how of @usize bytes.
 *
 * Returns -EINVAL if @usize is smaller than OPEN_HOW_SIZE_VER0, -E2BIG
 * if it exceeds PAGE_SIZE, the error from copy_struct_from_user() if the
 * copy fails, and otherwise the result of do_sys_openat2().
 */
SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
                struct open_how __user *, how, size_t, usize)
{
        struct open_how khow;
        int ret;

        BUILD_BUG_ON(sizeof(struct open_how) < OPEN_HOW_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct open_how) != OPEN_HOW_SIZE_LATEST);

        /* Reject sizes outside [OPEN_HOW_SIZE_VER0, PAGE_SIZE]. */
        if (unlikely(usize < OPEN_HOW_SIZE_VER0))
                return -EINVAL;
        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;

        ret = copy_struct_from_user(&khow, sizeof(khow), how, usize);
        if (ret)
                return ret;

        audit_openat2_how(&khow);

        /* O_LARGEFILE is only allowed for non-O_PATH. */
        if (!(khow.flags & O_PATH) && force_o_largefile())
                khow.flags |= O_LARGEFILE;

        return do_sys_openat2(dfd, filename, &khow);
}

#ifdef CONFIG_COMPAT
/*
 * Compat entry point for open(2).
 *
 * Exactly like sys_open() (forwards to do_sys_open() relative to
 * AT_FDCWD), except that it doesn't set the O_LARGEFILE flag on the
 * caller's behalf.
 */
COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
        return do_sys_open(AT_FDCWD, filename, flags, mode);
}

/*
 * Compat entry point for openat(2).
 *
 * Exactly like sys_openat() (forwards to do_sys_open() relative to
 * @dfd), except that it doesn't set the O_LARGEFILE flag on the
 * caller's behalf.
 */
COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode)
{
        return do_sys_open(dfd, filename, flags, mode);
}
#endif

#ifndef __alpha__

/*
 * creat(2): equivalent to an open relative to AT_FDCWD with
 * O_CREAT | O_WRONLY | O_TRUNC, plus O_LARGEFILE when
 * force_o_largefile() says so.
 *
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 */
SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode)
{
        const int largefile = force_o_largefile() ? O_LARGEFILE : 0;

        return do_sys_open(AT_FDCWD, pathname,
                           O_CREAT | O_WRONLY | O_TRUNC | largefile, mode);
}
#endif

/*
 * Run the close-time flush work for @filp on behalf of owner @id.
 *
 * "id" is the POSIX thread ID. We use the files pointer for this.
 *
 * Returns 0 if the file count is found to be zero (reported through
 * CHECK_DATA_CORRUPTION()), otherwise the result of the ->flush()
 * method, or 0 when the file has no such method.  For files without
 * FMODE_PATH, dnotify and POSIX lock state belonging to @id is dropped
 * as well.
 */
static int filp_flush(struct file *filp, fl_owner_t id)
{
        int retval;

        if (CHECK_DATA_CORRUPTION(file_count(filp) == 0, filp,
                        "VFS: Close: file count is 0 (f_op=%ps)",
                        filp->f_op))
                return 0;

        retval = filp->f_op->flush ? filp->f_op->flush(filp, id) : 0;

        if (unlikely(filp->f_mode & FMODE_PATH))
                return retval;

        dnotify_flush(filp, id);
        locks_remove_posix(filp, id);

        return retval;
}

/*
 * Flush @filp on behalf of owner @id and then drop the reference with
 * fput_close().  Returns the result of the flush step.
 */
int filp_close(struct file *filp, fl_owner_t id)
{
        int retval = filp_flush(filp, id);

        fput_close(filp);
        return retval;
}
EXPORT_SYMBOL(filp_close);

/*
 * close(2): detach @fd from the file table, flush the file and drop it.
 *
 * Careful here! We test whether the file pointer is NULL before
 * releasing the fd. This ensures that one clone task can't release
 * an fd while another clone is opening it.
 *
 * Returns -EBADF when @fd has no file attached, otherwise the flush
 * result with the restart error codes translated to -EINTR.
 */
SYSCALL_DEFINE1(close, unsigned int, fd)
{
        struct file *file = file_close_fd(fd);
        int retval;

        if (!file)
                return -EBADF;

        retval = filp_flush(file, current->files);

        /*
         * We're returning to user space. Don't bother
         * with any delayed fput() cases.
         */
        fput_close_sync(file);

        if (likely(retval == 0))
                return 0;

        /* can't restart close syscall because file table entry was cleared */
        switch (retval) {
        case -ERESTARTSYS:
        case -ERESTARTNOINTR:
        case -ERESTARTNOHAND:
        case -ERESTART_RESTARTBLOCK:
                return -EINTR;
        default:
                return retval;
        }
}

/*
 * vhangup(2): simulate a hangup on the tty, to arrange that users
 * are given clean terminals at login time.
 *
 * Requires CAP_SYS_TTY_CONFIG; returns -EPERM without it, 0 otherwise.
 */
SYSCALL_DEFINE0(vhangup)
{
        if (!capable(CAP_SYS_TTY_CONFIG))
                return -EPERM;

        tty_vhangup_self();
        return 0;
}

/*
 * Called when an inode is about to be opened.
 * We use this to disallow opening large files on 32bit systems if
 * the caller didn't specify O_LARGEFILE.  On 64bit systems we force
 * on this flag in sys_open.
 *
 * Returns -EOVERFLOW when O_LARGEFILE is not set and the inode size
 * exceeds MAX_NON_LFS, 0 otherwise.
 */
int generic_file_open(struct inode * inode, struct file * filp)
{
        /* The size is only consulted when the caller did not opt in. */
        if (filp->f_flags & O_LARGEFILE)
                return 0;

        if (i_size_read(inode) > MAX_NON_LFS)
                return -EOVERFLOW;

        return 0;
}

EXPORT_SYMBOL(generic_file_open);

/*
 * nonseekable_open - ->open() helper for non-seekable file descriptors
 * @inode:        inode being opened (not used here)
 * @filp:        file whose f_mode is adjusted
 *
 * This is used by subsystems that don't want seekable
 * file descriptors. The function is not supposed to ever fail, the only
 * reason it returns an 'int' and not 'void' is so that it can be plugged
 * directly into file_operations structure.
 *
 * Clears FMODE_LSEEK, FMODE_PREAD and FMODE_PWRITE in @filp->f_mode and
 * always returns 0.
 */
int nonseekable_open(struct inode *inode, struct file *filp)
{
        filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
        return 0;
}

EXPORT_SYMBOL(nonseekable_open);

/*
 * stream_open - ->open() helper for stream-like file descriptors
 * @inode:        inode being opened (not used here)
 * @filp:        file whose f_mode is adjusted
 *
 * stream_open is used by subsystems that want stream-like file descriptors.
 * Such file descriptors are not seekable and don't have notion of position
 * (file.f_pos is always 0 and ppos passed to .read()/.write() is always NULL).
 * Contrary to file descriptors of other regular files, .read() and .write()
 * can run simultaneously.
 *
 * stream_open never fails and is marked to return int so that it could be
 * directly used as file_operations.open .
 */
int stream_open(struct inode *inode, struct file *filp)
{
        /* Drop the seek/positional-I/O modes, then flag the file as a stream. */
        filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE | FMODE_ATOMIC_POS);
        filp->f_mode |= FMODE_STREAM;
        return 0;
}

EXPORT_SYMBOL(stream_open);





































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _linux_POSIX_TIMERS_H
#define _linux_POSIX_TIMERS_H

#include <linux/alarmtimer.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/pid.h>
#include <linux/posix-timers_types.h>
#include <linux/rcuref.h>
#include <linux/spinlock.h>
#include <linux/timerqueue.h>

struct kernel_siginfo;
struct task_struct;
struct sigqueue;
struct k_itimer;

/*
 * Build a clock id from @pid and @clock: the bitwise complement of @pid
 * is shifted left by three bits and @clock is OR'ed into the result.
 */
static inline clockid_t make_process_cpuclock(const unsigned int pid,
                const clockid_t clock)
{
        const unsigned int inverted_pid = ~pid;

        return (inverted_pid << 3) | clock;
}
/*
 * Same encoding as make_process_cpuclock(), applied to @tid with
 * CPUCLOCK_PERTHREAD_MASK additionally set in @clock.
 */
static inline clockid_t make_thread_cpuclock(const unsigned int tid,
                const clockid_t clock)
{
        const clockid_t thread_clock = clock | CPUCLOCK_PERTHREAD_MASK;

        return make_process_cpuclock(tid, thread_clock);
}

/*
 * Encode file descriptor @fd as a clock id, using the
 * make_process_cpuclock() encoding with CLOCKFD as the clock part.
 */
static inline clockid_t fd_to_clockid(const int fd)
{
        const unsigned int ufd = (unsigned int) fd;

        return make_process_cpuclock(ufd, CLOCKFD);
}

/*
 * Recover the descriptor from a clock id: the low three bits are
 * shifted out and the remainder is bitwise complemented.
 */
static inline int clockid_to_fd(const clockid_t clk)
{
        const clockid_t shifted = clk >> 3;

        return ~shifted;
}

#ifdef CONFIG_POSIX_TIMERS

#include <linux/signal_types.h>

/**
 * struct cpu_timer - Posix CPU timer representation for k_itimer
 * @node:        timerqueue node to queue in the task/sig
 * @head:        timerqueue head on which this timer is queued
 *                (NULL while the timer is not queued)
 * @pid:        Pointer to target task PID
 * @elist:        List head for the expiry list
 * @firing:        Timer is currently firing
 * @nanosleep:        Timer is used for nanosleep and is not a regular posix-timer
 * @handling:        Pointer to the task which handles expiry
 */
struct cpu_timer {
        struct timerqueue_node                node;
        struct timerqueue_head                *head;
        struct pid                        *pid;
        struct list_head                elist;
        bool                                firing;
        bool                                nanosleep;
        struct task_struct __rcu        *handling;
};

/*
 * Record @head as the queue of @ctmr and add the timer's node to it.
 * Returns the result of timerqueue_add().
 */
static inline bool cpu_timer_enqueue(struct timerqueue_head *head,
                                     struct cpu_timer *ctmr)
{
        bool ret;

        ctmr->head = head;
        ret = timerqueue_add(head, &ctmr->node);

        return ret;
}

static inline bool cpu_timer_queued(struct cpu_timer *ctmr)
{
        return !!ctmr->head;
}

/*
 * Remove @ctmr from the queue it is on, if any, and clear its recorded
 * head.  Returns true when the timer was queued, false otherwise.
 */
static inline bool cpu_timer_dequeue(struct cpu_timer *ctmr)
{
        if (!cpu_timer_queued(ctmr))
                return false;

        timerqueue_del(ctmr->head, &ctmr->node);
        ctmr->head = NULL;

        return true;
}

/* Return the expiry value stored in the timer's timerqueue node. */
static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr)
{
        return ctmr->node.expires;
}

/*
 * Store @exp as the expiry value in the timer's timerqueue node.  Only
 * the field is written; the timer is not (re)queued here.
 */
static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp)
{
        ctmr->node.expires = exp;
}

/*
 * Zero @pct and set the next-event value of its first three bases to
 * U64_MAX.
 */
static inline void posix_cputimers_init(struct posix_cputimers *pct)
{
        int i;

        memset(pct, 0, sizeof(*pct));

        for (i = 0; i < 3; i++)
                pct->bases[i].nextevt = U64_MAX;
}

void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit);

/*
 * Set the next-event value of the CPUCLOCK_SCHED base of @pct to
 * @runtime.
 */
static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct,
                                               u64 runtime)
{
        pct->bases[CPUCLOCK_SCHED].nextevt = runtime;
}

void posixtimer_rearm_itimer(struct task_struct *p);
bool posixtimer_init_sigqueue(struct sigqueue *q);
void posixtimer_send_sigqueue(struct k_itimer *tmr);
bool posixtimer_deliver_signal(struct kernel_siginfo *info, struct sigqueue *timer_sigq);
void posixtimer_free_timer(struct k_itimer *timer);
long posixtimer_create_prctl(unsigned long ctrl);

/*
 * Init task static initializer
 *
 * INIT_CPU_TIMERBASE() initializes one base with .nextevt = U64_MAX; its
 * argument is not used in the expansion.
 */
#define INIT_CPU_TIMERBASE(b) {                                                \
        .nextevt        = U64_MAX,                                        \
}

/* Initializer for an array of three bases, each via INIT_CPU_TIMERBASE(). */
#define INIT_CPU_TIMERBASES(b) {                                        \
        INIT_CPU_TIMERBASE(b[0]),                                        \
        INIT_CPU_TIMERBASE(b[1]),                                        \
        INIT_CPU_TIMERBASE(b[2]),                                        \
}

/*
 * Designated initializer for the .posix_cputimers member of @s.  The
 * expansion ends with a trailing comma so it can be dropped into a larger
 * initializer list.
 */
#define INIT_CPU_TIMERS(s)                                                \
        .posix_cputimers = {                                                \
                .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases),        \
        },
#else
/*
 * !CONFIG_POSIX_TIMERS: empty type, empty initializer and no-op inline
 * stubs so that callers build without conditional code.
 */
struct cpu_timer { };
#define INIT_CPU_TIMERS(s)
static inline void posix_cputimers_init(struct posix_cputimers *pct) { }
static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
                                              u64 cpu_limit) { }
static inline void posixtimer_rearm_itimer(struct task_struct *p) { }
/* Stub: always returns false. */
static inline bool posixtimer_deliver_signal(struct kernel_siginfo *info,
                                             struct sigqueue *timer_sigq) { return false; }
static inline void posixtimer_free_timer(struct k_itimer *timer) { }
/* Stub: always fails with -EINVAL. */
static inline long posixtimer_create_prctl(unsigned long ctrl) { return -EINVAL; }
#endif

/*
 * Real implementations are only declared with
 * CONFIG_POSIX_CPU_TIMERS_TASK_WORK; otherwise both helpers are no-ops.
 */
#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
void clear_posix_cputimers_work(struct task_struct *p);
void posix_cputimers_init_work(void);
#else
static inline void clear_posix_cputimers_work(struct task_struct *p) { }
static inline void posix_cputimers_init_work(void) { }
#endif

/**
 * struct k_itimer - POSIX.1b interval timer structure.
 * @list:                List node for binding the timer to tsk::signal::posix_timers
 * @ignored_list:        List node for tracking ignored timers in tsk::signal::ignored_posix_timers
 * @t_hash:                Entry in the posix timer hash table
 * @it_lock:                Lock protecting the timer
 * @kclock:                Pointer to the k_clock struct handling this timer
 * @it_clock:                The posix timer clock id
 * @it_id:                The posix timer id for identifying the timer
 * @it_status:                The status of the timer
 * @it_sig_periodic:        The periodic status at signal delivery
 * @it_overrun:                The overrun counter for pending signals
 * @it_overrun_last:        The overrun at the time of the last delivered signal
 * @it_signal_seq:        Sequence count to control signal delivery
 * @it_sigqueue_seq:        The sequence count at the point where the signal was queued
 * @it_sigev_notify:        The notify word of sigevent struct for signal delivery
 * @it_pid_type:        The enum pid_type value stored for the timer
 *                        (presumably qualifies @it_pid; confirm against users)
 * @it_interval:        The interval for periodic timers
 * @it_signal:                Pointer to the creators signal struct
 * @it_pid:                The pid of the process/task targeted by the signal
 * @it_process:                The task to wakeup on clock_nanosleep (CPU timers)
 * @rcuref:                Reference count for life time management
 * @sigq:                Embedded sigqueue
 * @it:                        Union representing the various posix timer type
 *                        internals.
 * @rcu:                RCU head for freeing the timer.
 *
 * @it_pid and @it_process share storage (anonymous union).
 */
struct k_itimer {
        /* 1st cacheline contains read-mostly fields */
        struct hlist_node        t_hash;
        struct hlist_node        list;
        timer_t                        it_id;
        clockid_t                it_clock;
        int                        it_sigev_notify;
        enum pid_type                it_pid_type;
        struct signal_struct        *it_signal;
        const struct k_clock        *kclock;

        /* 2nd cacheline and above contain fields which are modified regularly */
        spinlock_t                it_lock;
        int                        it_status;
        bool                        it_sig_periodic;
        s64                        it_overrun;
        s64                        it_overrun_last;
        unsigned int                it_signal_seq;
        unsigned int                it_sigqueue_seq;
        ktime_t                        it_interval;
        struct hlist_node        ignored_list;
        union {
                struct pid                *it_pid;
                struct task_struct        *it_process;
        };
        struct sigqueue                sigq;
        rcuref_t                rcuref;
        union {
                struct {
                        struct hrtimer        timer;
                } real;
                struct cpu_timer        cpu;
                struct {
                        struct alarm        alarmtimer;
                } alarm;
        } it;
        struct rcu_head                rcu;
} ____cacheline_aligned_in_smp;

void run_posix_cpu_timers(void);
void posix_cpu_timers_exit(struct task_struct *task);
void posix_cpu_timers_exit_group(struct task_struct *task);
void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx,
                           u64 *newval, u64 *oldval);

int update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new);

#ifdef CONFIG_POSIX_TIMERS
/*
 * Drop one reference on @tmr; the timer is handed to
 * posixtimer_free_timer() when rcuref_put() returns true.
 */
static inline void posixtimer_putref(struct k_itimer *tmr)
{
        if (!rcuref_put(&tmr->rcuref))
                return;

        posixtimer_free_timer(tmr);
}

/*
 * Take a reference on the k_itimer that embeds @q.  Warns once if
 * rcuref_get() fails.
 */
static inline void posixtimer_sigqueue_getref(struct sigqueue *q)
{
        struct k_itimer *timer;

        timer = container_of(q, struct k_itimer, sigq);
        WARN_ON_ONCE(!rcuref_get(&timer->rcuref));
}

/* Drop a reference on the k_itimer that embeds @q. */
static inline void posixtimer_sigqueue_putref(struct sigqueue *q)
{
        posixtimer_putref(container_of(q, struct k_itimer, sigq));
}

/*
 * Returns true when bit 0 of the timer's it_signal pointer value is
 * clear, false when it is set.
 */
static inline bool posixtimer_valid(const struct k_itimer *timer)
{
        return ((unsigned long)timer->it_signal & 0x1UL) == 0;
}
#else  /* CONFIG_POSIX_TIMERS */
/* !CONFIG_POSIX_TIMERS: sigqueue reference helpers are no-ops. */
static inline void posixtimer_sigqueue_getref(struct sigqueue *q) { }
static inline void posixtimer_sigqueue_putref(struct sigqueue *q) { }
#endif /* !CONFIG_POSIX_TIMERS */

#endif


























































































   34 



















   34 















































   34 


























   34 













   34 










   34 











































   34 
   34 
   34 


   34 
   34 









































































































   34 
   34 

   34 
   34 

   34 

   34 












































































































































































































































































































































































































































































































































































































































































































































































































   34 



   34 






   34 
   34 






   34 










   34 









   34 







































   34 






   34 






   34 










   34 
   34 

   34 
   34 

   34 
   34 
   34 

   34 

   34 




   34 



   34 



   34 
   34 
   34 


   34 










   34 










   34 





   34 




















   34 






































   34 
   34 
















   34 










   34 












   34 












































   34 

   34 













   34 



















   34 






































   34 


   34 


   34 
   34 


   34 

   34 


   34 







   34 



















   34 



   34 










   34 





















   34 





































   34 





   34 








   34 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/kernel/signal.c
 *
 * Copyright (C) 1995-2009 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/cache.h>
#include <linux/compat.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/signal.h>
#include <linux/freezer.h>
#include <linux/stddef.h>
#include <linux/uaccess.h>
#include <linux/sizes.h>
#include <linux/string.h>
#include <linux/ratelimit.h>
#include <linux/rseq.h>
#include <linux/syscalls.h>
#include <linux/pkeys.h>

#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/elf.h>
#include <asm/exception.h>
#include <asm/cacheflush.h>
#include <asm/gcs.h>
#include <asm/ucontext.h>
#include <asm/unistd.h>
#include <asm/fpsimd.h>
#include <asm/ptrace.h>
#include <asm/syscall.h>
#include <asm/signal32.h>
#include <asm/traps.h>
#include <asm/vdso.h>

/*
 * Value of the signal cap for @addr: @addr converted to unsigned long and
 * masked with GCS_CAP_ADDR_MASK.  The argument is parenthesized so that
 * expressions such as pointer arithmetic are cast as a whole.
 */
#define GCS_SIGNAL_CAP(addr) (((unsigned long)(addr)) & GCS_CAP_ADDR_MASK)

/*
 * Signal frame: a siginfo followed by a ucontext.
 *
 * Do a signal return; undo the signal stack. These are aligned to 128-bit.
 */
struct rt_sigframe {
        struct siginfo info;
        struct ucontext uc;
};

/*
 * Bookkeeping for laying out a signal frame in user memory.
 *
 * init_user_layout() zeroes the whole structure and then sets @size and
 * @limit.  The *_offset members are filled in through __sigframe_alloc(),
 * which treats a zero extra_offset as "extra_context not yet allocated".
 * NOTE(review): the other *_offset members presumably follow the same
 * "0 means absent" convention - confirm against their users.
 */
struct rt_sigframe_user_layout {
        struct rt_sigframe __user *sigframe;
        struct frame_record __user *next_frame;

        unsigned long size;        /* size of allocated sigframe data */
        unsigned long limit;        /* largest allowed size */

        unsigned long fpsimd_offset;
        unsigned long esr_offset;
        unsigned long gcs_offset;
        unsigned long sve_offset;
        unsigned long tpidr2_offset;
        unsigned long za_offset;
        unsigned long zt_offset;
        unsigned long fpmr_offset;
        unsigned long poe_offset;
        unsigned long extra_offset;
        unsigned long end_offset;
};

/*
 * Holds any EL0-controlled state that influences unprivileged memory accesses.
 * This includes both accesses done in userspace and uaccess done in the kernel.
 *
 * This state needs to be carefully managed to ensure that it doesn't cause
 * uaccess to fail when setting up the signal frame, and the signal handler
 * itself also expects a well-defined state when entered.
 */
struct user_access_state {
        /* Value read from / written back to SYS_POR_EL0 (POE systems only). */
        u64 por_el0;
};

/*
 * Sizes of the terminator record (a bare struct _aarch64_ctx) and of the
 * extra_context record, each rounded up to a multiple of 16 bytes.
 */
#define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16)
#define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16)

/*
 * Save the user access state into ua_state and reset it to disable any
 * restrictions.
 *
 * Does nothing unless the system supports POE.  Otherwise POR_EL0 is
 * saved and then rewritten with POE_RWX for every pkey below
 * arch_max_pkey().
 */
static void save_reset_user_access_state(struct user_access_state *ua_state)
{
        u64 por_enable_all = 0;
        int pkey;

        if (!system_supports_poe())
                return;

        for (pkey = 0; pkey < arch_max_pkey(); pkey++)
                por_enable_all |= POR_ELx_PERM_PREP(pkey, POE_RWX);

        ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0);
        write_sysreg_s(por_enable_all, SYS_POR_EL0);
        /* Ensure that any subsequent uaccess observes the updated value */
        isb();
}

/*
 * Set the user access state for invoking the signal handler: on POE
 * systems POR_EL0 is written with POR_EL0_INIT.
 *
 * No uaccess should be done after that function is called.
 */
static void set_handler_user_access_state(void)
{
        if (!system_supports_poe())
                return;

        write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0);
}

/*
 * Restore the user access state to the values saved in ua_state: on POE
 * systems the saved POR_EL0 value is written back.
 *
 * No uaccess should be done after that function is called.
 */
static void restore_user_access_state(const struct user_access_state *ua_state)
{
        if (!system_supports_poe())
                return;

        write_sysreg_s(ua_state->por_el0, SYS_POR_EL0);
}

/*
 * Reset *user to an empty layout: everything zeroed, the allocation cursor
 * (size) at the start of __reserved[] and the limit at the end of
 * __reserved[] less the room held back for the extension record and the
 * terminator.
 */
static void init_user_layout(struct rt_sigframe_user_layout *user)
{
        const size_t reserved_start =
                offsetof(struct rt_sigframe, uc.uc_mcontext.__reserved);
        const size_t reserved_len =
                sizeof(user->sigframe->uc.uc_mcontext.__reserved);

        memset(user, 0, sizeof(*user));

        user->size = reserved_start;

        /* Keep space back for the extension record and the terminator */
        user->limit = reserved_start + reserved_len
                      - TERMINATOR_SIZE - EXTRA_CONTEXT_SIZE;
}

/*
 * Size of the signal frame described by *user: never less than
 * sizeof(struct rt_sigframe), rounded up to a multiple of 16 bytes.
 */
static size_t sigframe_size(struct rt_sigframe_user_layout const *user)
{
        size_t bytes = max(user->size, sizeof(struct rt_sigframe));

        return round_up(bytes, 16);
}

/*
 * Sanity limit on the approximate maximum size of signal frame we'll
 * try to generate.  Stack alignment padding and the frame record are
 * not taken into account.  This limit is not a guarantee and is
 * NOT ABI.
 *
 * It bounds layout growth in __sigframe_alloc() and is also used by
 * parse_user_sigframe() to reject unreasonably large extra data.
 */
#define SIGFRAME_MAXSZ SZ_256K

/*
 * Carve <size> bytes, rounded up to a multiple of 16, out of the signal
 * frame layout and store the offset of the new block in *offset.
 *
 * If the block does not fit below user->limit, no extra_context record has
 * been allocated yet and <extend> is true, an extra_context record is
 * allocated first and the limit is raised to SIGFRAME_MAXSZ less the
 * terminator so that the layout can keep growing.
 *
 * Returns 0 on success, or -ENOMEM if the block still does not fit.
 */
static int __sigframe_alloc(struct rt_sigframe_user_layout *user,
                            unsigned long *offset, size_t size, bool extend)
{
        size_t padded_size = round_up(size, 16);

        if (padded_size > user->limit - user->size &&
            !user->extra_offset &&
            extend) {
                int ret;

                /* Release the room held back for the extra_context record */
                user->limit += EXTRA_CONTEXT_SIZE;
                /* extend == false: the nested call cannot extend again */
                ret = __sigframe_alloc(user, &user->extra_offset,
                                       sizeof(struct extra_context), false);
                if (ret) {
                        /* Undo the limit adjustment made above */
                        user->limit -= EXTRA_CONTEXT_SIZE;
                        return ret;
                }

                /* Reserve space for the __reserved[] terminator */
                user->size += TERMINATOR_SIZE;

                /*
                 * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for
                 * the terminator:
                 */
                user->limit = SIGFRAME_MAXSZ - TERMINATOR_SIZE;
        }

        /* Still not enough space?  Bad luck! */
        if (padded_size > user->limit - user->size)
                return -ENOMEM;

        /* Hand out the block at the current cursor and advance past it */
        *offset = user->size;
        user->size += padded_size;

        return 0;
}

/*
 * Allocate space for an optional record of <size> bytes in the user
 * signal frame.  The offset from the signal frame base address to the
 * allocated block is assigned to *offset.
 *
 * The layout is allowed to extend via an extra_context record if the
 * record does not fit otherwise.  Returns 0 on success or the error from
 * __sigframe_alloc().
 */
static int sigframe_alloc(struct rt_sigframe_user_layout *user,
                          unsigned long *offset, size_t size)
{
        return __sigframe_alloc(user, offset, size, true);
}

/*
 * Allocate the terminating null record, then freeze the layout so that no
 * further records can be allocated.
 *
 * Returns 0 on success or the error from sigframe_alloc().
 */
static int sigframe_alloc_end(struct rt_sigframe_user_layout *user)
{
        int err;

        /* Give back the space that was held in reserve for the terminator */
        user->limit += TERMINATOR_SIZE;

        err = sigframe_alloc(user, &user->end_offset,
                             sizeof(struct _aarch64_ctx));
        if (!err) {
                /* limit == size leaves no room for another allocation */
                user->limit = user->size;
        }

        return err;
}

/*
 * Turn a byte offset relative to the start of the signal frame into a user
 * pointer.
 */
static void __user *apply_user_offset(
        struct rt_sigframe_user_layout const *user, unsigned long offset)
{
        return (char __user *)user->sigframe + offset;
}

/*
 * Records located in a user signal frame by parse_user_sigframe().
 *
 * Each pointer is NULL when the corresponding record is absent; the
 * matching *_size field holds the size read from the record header and is
 * only written when the record is present.
 */
struct user_ctxs {
        struct fpsimd_context __user *fpsimd;
        u32 fpsimd_size;
        struct sve_context __user *sve;
        u32 sve_size;
        struct tpidr2_context __user *tpidr2;
        u32 tpidr2_size;
        struct za_context __user *za;
        u32 za_size;
        struct zt_context __user *zt;
        u32 zt_size;
        struct fpmr_context __user *fpmr;
        u32 fpmr_size;
        struct poe_context __user *poe;
        u32 poe_size;
        struct gcs_context __user *gcs;
        u32 gcs_size;
};

/*
 * Write an fpsimd_context record at ctx from the FPSIMD state held in the
 * current task's thread struct.
 *
 * Returns 0 on success or -EFAULT if any user access failed.
 */
static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
{
        struct user_fpsimd_state const *state =
                &current->thread.uw.fpsimd_state;
        int err;

        /* Payload: vector registers, then status and control registers */
        err = __copy_to_user(ctx->vregs, state->vregs, sizeof(state->vregs));
        __put_user_error(state->fpsr, &ctx->fpsr, err);
        __put_user_error(state->fpcr, &ctx->fpcr, err);

        /* Record header: magic and size */
        __put_user_error(FPSIMD_MAGIC, &ctx->head.magic, err);
        __put_user_error(sizeof(struct fpsimd_context), &ctx->head.size, err);

        if (err)
                return -EFAULT;

        return 0;
}

static int restore_fpsimd_context(struct user_ctxs *user)
{
        struct user_fpsimd_state fpsimd;
        int err = 0;

        /* check the size information */
        if (user->fpsimd_size != sizeof(struct fpsimd_context))
                return -EINVAL;

        /* copy the FP and status/control registers */
        err = __copy_from_user(fpsimd.vregs, &(user->fpsimd->vregs),
                               sizeof(fpsimd.vregs));
        __get_user_error(fpsimd.fpsr, &(user->fpsimd->fpsr), err);
        __get_user_error(fpsimd.fpcr, &(user->fpsimd->fpcr), err);

        clear_thread_flag(TIF_SVE);
        current->thread.fp_type = FP_STATE_FPSIMD;

        /* load the hardware registers from the fpsimd_state structure */
        if (!err)
                fpsimd_update_current_state(&fpsimd);

        return err ? -EFAULT : 0;
}

/*
 * Write an fpmr_context record at ctx holding the current FPMR value.
 *
 * Returns 0 on success or the error set by __put_user_error().
 */
static int preserve_fpmr_context(struct fpmr_context __user *ctx)
{
        int err = 0;

        /* Refresh the thread struct copy from the live register first */
        current->thread.uw.fpmr = read_sysreg_s(SYS_FPMR);

        __put_user_error(FPMR_MAGIC, &ctx->head.magic, err);
        __put_user_error(sizeof(struct fpmr_context), &ctx->head.size, err);
        __put_user_error(current->thread.uw.fpmr, &ctx->fpmr, err);

        return err;
}

static int restore_fpmr_context(struct user_ctxs *user)
{
        u64 fpmr;
        int err = 0;

        if (user->fpmr_size != sizeof(*user->fpmr))
                return -EINVAL;

        __get_user_error(fpmr, &user->fpmr->fpmr, err);
        if (!err)
                write_sysreg_s(fpmr, SYS_FPMR);

        return err;
}

/*
 * Write a poe_context record at ctx carrying the POR_EL0 value held in
 * ua_state.
 *
 * Returns 0 on success or the error set by __put_user_error().
 */
static int preserve_poe_context(struct poe_context __user *ctx,
                                const struct user_access_state *ua_state)
{
        int err = 0;

        __put_user_error(POE_MAGIC, &ctx->head.magic, err);
        __put_user_error(sizeof(struct poe_context), &ctx->head.size, err);
        __put_user_error(ua_state->por_el0, &ctx->por_el0, err);

        return err;
}

static int restore_poe_context(struct user_ctxs *user,
                               struct user_access_state *ua_state)
{
        u64 por_el0;
        int err = 0;

        if (user->poe_size != sizeof(*user->poe))
                return -EINVAL;

        __get_user_error(por_el0, &(user->poe->por_el0), err);
        if (!err)
                ua_state->por_el0 = por_el0;

        return err;
}

#ifdef CONFIG_ARM64_SVE

/*
 * Write an sve_context record at ctx.
 *
 * The header is always written.  Register data follows only when the task
 * is in streaming mode (using the SME vector length and setting
 * SVE_SIG_FLAG_SM) or when its FP state type is FP_STATE_SVE; otherwise the
 * record is sized for a vq of zero.
 *
 * Returns 0 on success or -EFAULT if any user access failed.
 */
static int preserve_sve_context(struct sve_context __user *ctx)
{
        u16 zeroes[ARRAY_SIZE(ctx->__reserved)];
        unsigned int vl = task_get_sve_vl(current);
        unsigned int vq = 0;
        u16 flags = 0;
        int err = 0;

        if (thread_sm_enabled(&current->thread)) {
                /* Streaming mode: describe the state at the SME VL */
                flags |= SVE_SIG_FLAG_SM;
                vl = task_get_sme_vl(current);
                vq = sve_vq_from_vl(vl);
        } else if (current->thread.fp_type == FP_STATE_SVE) {
                vq = sve_vq_from_vl(vl);
        }

        BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(zeroes));
        memset(zeroes, 0, sizeof(zeroes));

        /* Header: magic, padded record size, VL, flags, zeroed reserved */
        __put_user_error(SVE_MAGIC, &ctx->head.magic, err);
        __put_user_error(round_up(SVE_SIG_CONTEXT_SIZE(vq), 16),
                         &ctx->head.size, err);
        __put_user_error(vl, &ctx->vl, err);
        __put_user_error(flags, &ctx->flags, err);
        err |= __copy_to_user(&ctx->__reserved, zeroes, sizeof(zeroes));

        if (vq) {
                /*
                 * The SVE state is expected to have been saved to the task
                 * struct already, by fpsimd_signal_preserve_current_state().
                 */
                err |= __copy_to_user((char __user *)ctx + SVE_SIG_REGS_OFFSET,
                                      current->thread.sve_state,
                                      SVE_SIG_REGS_SIZE(vq));
        }

        if (err)
                return -EFAULT;

        return 0;
}

static int restore_sve_fpsimd_context(struct user_ctxs *user)
{
        int err = 0;
        unsigned int vl, vq;
        struct user_fpsimd_state fpsimd;
        u16 user_vl, flags;

        if (user->sve_size < sizeof(*user->sve))
                return -EINVAL;

        __get_user_error(user_vl, &(user->sve->vl), err);
        __get_user_error(flags, &(user->sve->flags), err);
        if (err)
                return err;

        if (flags & SVE_SIG_FLAG_SM) {
                if (!system_supports_sme())
                        return -EINVAL;

                vl = task_get_sme_vl(current);
        } else {
                /*
                 * A SME only system use SVE for streaming mode so can
                 * have a SVE formatted context with a zero VL and no
                 * payload data.
                 */
                if (!system_supports_sve() && !system_supports_sme())
                        return -EINVAL;

                vl = task_get_sve_vl(current);
        }

        if (user_vl != vl)
                return -EINVAL;

        if (user->sve_size == sizeof(*user->sve)) {
                clear_thread_flag(TIF_SVE);
                current->thread.svcr &= ~SVCR_SM_MASK;
                current->thread.fp_type = FP_STATE_FPSIMD;
                goto fpsimd_only;
        }

        vq = sve_vq_from_vl(vl);

        if (user->sve_size < SVE_SIG_CONTEXT_SIZE(vq))
                return -EINVAL;

        /*
         * Careful: we are about __copy_from_user() directly into
         * thread.sve_state with preemption enabled, so protection is
         * needed to prevent a racing context switch from writing stale
         * registers back over the new data.
         */

        fpsimd_flush_task_state(current);
        /* From now, fpsimd_thread_switch() won't touch thread.sve_state */

        sve_alloc(current, true);
        if (!current->thread.sve_state) {
                clear_thread_flag(TIF_SVE);
                return -ENOMEM;
        }

        err = __copy_from_user(current->thread.sve_state,
                               (char __user const *)user->sve +
                                        SVE_SIG_REGS_OFFSET,
                               SVE_SIG_REGS_SIZE(vq));
        if (err)
                return -EFAULT;

        if (flags & SVE_SIG_FLAG_SM)
                current->thread.svcr |= SVCR_SM_MASK;
        else
                set_thread_flag(TIF_SVE);
        current->thread.fp_type = FP_STATE_SVE;

fpsimd_only:
        /* copy the FP and status/control registers */
        /* restore_sigframe() already checked that user->fpsimd != NULL. */
        err = __copy_from_user(fpsimd.vregs, user->fpsimd->vregs,
                               sizeof(fpsimd.vregs));
        __get_user_error(fpsimd.fpsr, &user->fpsimd->fpsr, err);
        __get_user_error(fpsimd.fpcr, &user->fpsimd->fpcr, err);

        /* load the hardware registers from the fpsimd_state structure */
        if (!err)
                fpsimd_update_current_state(&fpsimd);

        return err ? -EFAULT : 0;
}

#else /* ! CONFIG_ARM64_SVE */

/*
 * Stub used when CONFIG_ARM64_SVE is disabled: warns once and rejects the
 * frame with -EINVAL.
 */
static int restore_sve_fpsimd_context(struct user_ctxs *user)
{
        WARN_ON_ONCE(1);
        return -EINVAL;
}

/*
 * Declared but never defined when CONFIG_ARM64_SVE is disabled, so that any
 * call which is not optimised out turns into a link error:
 */
extern int preserve_sve_context(void __user *ctx);

#endif /* ! CONFIG_ARM64_SVE */

#ifdef CONFIG_ARM64_SME

/*
 * Write a tpidr2_context record at ctx holding the current TPIDR2_EL0
 * value.
 *
 * Returns 0 on success or the error set by __put_user_error().
 */
static int preserve_tpidr2_context(struct tpidr2_context __user *ctx)
{
        int err = 0;

        /* Refresh the thread struct copy from the live register first */
        current->thread.tpidr2_el0 = read_sysreg_s(SYS_TPIDR2_EL0);

        __put_user_error(TPIDR2_MAGIC, &ctx->head.magic, err);
        __put_user_error(sizeof(struct tpidr2_context), &ctx->head.size, err);
        __put_user_error(current->thread.tpidr2_el0, &ctx->tpidr2, err);

        return err;
}

static int restore_tpidr2_context(struct user_ctxs *user)
{
        u64 tpidr2_el0;
        int err = 0;

        if (user->tpidr2_size != sizeof(*user->tpidr2))
                return -EINVAL;

        __get_user_error(tpidr2_el0, &user->tpidr2->tpidr2, err);
        if (!err)
                write_sysreg_s(tpidr2_el0, SYS_TPIDR2_EL0);

        return err;
}

/*
 * Write a za_context record at ctx.
 *
 * The header is always written; the ZA register data follows only while ZA
 * is enabled for the task, otherwise the record is sized for a vq of zero.
 *
 * Returns 0 on success or -EFAULT if any user access failed.
 */
static int preserve_za_context(struct za_context __user *ctx)
{
        u16 zeroes[ARRAY_SIZE(ctx->__reserved)];
        unsigned int vl = task_get_sme_vl(current);
        unsigned int vq = 0;
        int err = 0;

        if (thread_za_enabled(&current->thread))
                vq = sve_vq_from_vl(vl);

        BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(zeroes));
        memset(zeroes, 0, sizeof(zeroes));

        /* Header: magic, padded record size, VL, zeroed reserved field */
        __put_user_error(ZA_MAGIC, &ctx->head.magic, err);
        __put_user_error(round_up(ZA_SIG_CONTEXT_SIZE(vq), 16),
                         &ctx->head.size, err);
        __put_user_error(vl, &ctx->vl, err);
        err |= __copy_to_user(&ctx->__reserved, zeroes, sizeof(zeroes));

        if (vq) {
                /*
                 * The ZA state is expected to have been saved to the task
                 * struct already, by fpsimd_signal_preserve_current_state().
                 */
                err |= __copy_to_user((char __user *)ctx + ZA_SIG_REGS_OFFSET,
                                      current->thread.sme_state,
                                      ZA_SIG_REGS_SIZE(vq));
        }

        if (err)
                return -EFAULT;

        return 0;
}

/*
 * Restore ZA state from the za_context record located by
 * parse_user_sigframe().
 *
 * A header-only record disables ZA (clears SVCR.ZA).  Otherwise the ZA
 * register data is copied into thread.sme_state, TIF_SME is set and ZA is
 * enabled.
 *
 * Returns 0 on success, -EINVAL for a malformed record or a vector length
 * that does not match the task's SME vector length, -ENOMEM if the SME
 * buffer cannot be allocated, or -EFAULT (or the __get_user_error() error)
 * if a user access failed.
 */
static int restore_za_context(struct user_ctxs *user)
{
        int err = 0;
        unsigned int vq;
        u16 user_vl;

        if (user->za_size < sizeof(*user->za))
                return -EINVAL;

        __get_user_error(user_vl, &(user->za->vl), err);
        if (err)
                return err;

        /* The frame may not change the task's SME vector length */
        if (user_vl != task_get_sme_vl(current))
                return -EINVAL;

        /* Header only: no ZA data, so ZA is disabled */
        if (user->za_size == sizeof(*user->za)) {
                current->thread.svcr &= ~SVCR_ZA_MASK;
                return 0;
        }

        vq = sve_vq_from_vl(user_vl);

        if (user->za_size < ZA_SIG_CONTEXT_SIZE(vq))
                return -EINVAL;

        /*
         * Careful: we are about to __copy_from_user() directly into
         * thread.sme_state with preemption enabled, so protection is
         * needed to prevent a racing context switch from writing stale
         * registers back over the new data.
         */

        fpsimd_flush_task_state(current);
        /* From now, fpsimd_thread_switch() won't touch thread.sme_state */

        sme_alloc(current, true);
        if (!current->thread.sme_state) {
                current->thread.svcr &= ~SVCR_ZA_MASK;
                clear_thread_flag(TIF_SME);
                return -ENOMEM;
        }

        err = __copy_from_user(current->thread.sme_state,
                               (char __user const *)user->za +
                                        ZA_SIG_REGS_OFFSET,
                               ZA_SIG_REGS_SIZE(vq));
        if (err)
                return -EFAULT;

        set_thread_flag(TIF_SME);
        current->thread.svcr |= SVCR_ZA_MASK;

        return 0;
}

/*
 * Write a zt_context record describing a single ZT register at ctx.
 *
 * Callers are expected to emit this record only while ZA is enabled;
 * otherwise a warning is raised and -EINVAL is returned.
 *
 * Returns 0 on success or -EFAULT if any user access failed.
 */
static int preserve_zt_context(struct zt_context __user *ctx)
{
        u16 zeroes[ARRAY_SIZE(ctx->__reserved)];
        int err = 0;

        if (WARN_ON(!thread_za_enabled(&current->thread)))
                return -EINVAL;

        BUILD_BUG_ON(sizeof(ctx->__reserved) != sizeof(zeroes));
        memset(zeroes, 0, sizeof(zeroes));

        /* Header: magic, padded record size, register count, reserved */
        __put_user_error(ZT_MAGIC, &ctx->head.magic, err);
        __put_user_error(round_up(ZT_SIG_CONTEXT_SIZE(1), 16),
                         &ctx->head.size, err);
        __put_user_error(1, &ctx->nregs, err);
        err |= __copy_to_user(&ctx->__reserved, zeroes, sizeof(zeroes));

        /*
         * The ZT state is expected to have been saved to the task struct
         * already, by fpsimd_signal_preserve_current_state().
         */
        err |= __copy_to_user((char __user *)ctx + ZT_SIG_REGS_OFFSET,
                              thread_zt_state(&current->thread),
                              ZT_SIG_REGS_SIZE(1));

        if (err)
                return -EFAULT;

        return 0;
}

/*
 * Restore the ZT register from the zt_context record located by
 * parse_user_sigframe().  Only a record describing exactly one register is
 * accepted, and ZA must already be enabled for the task.
 *
 * Returns 0 on success, -EINVAL if ZA is disabled or the record is
 * malformed, or -EFAULT if a user access failed.
 */
static int restore_zt_context(struct user_ctxs *user)
{
        int err;
        u16 nregs;

        /* ZA must be restored first for this check to be valid */
        if (!thread_za_enabled(&current->thread))
                return -EINVAL;

        if (user->zt_size != ZT_SIG_CONTEXT_SIZE(1))
                return -EINVAL;

        if (__copy_from_user(&nregs, &(user->zt->nregs), sizeof(nregs)))
                return -EFAULT;

        if (nregs != 1)
                return -EINVAL;

        /*
         * Careful: we are about to __copy_from_user() directly into
         * thread.zt_state with preemption enabled, so protection is
         * needed to prevent a racing context switch from writing stale
         * registers back over the new data.
         */

        fpsimd_flush_task_state(current);
        /* From now, fpsimd_thread_switch() won't touch ZT in thread state */

        err = __copy_from_user(thread_zt_state(&current->thread),
                               (char __user const *)user->zt +
                                        ZT_SIG_REGS_OFFSET,
                               ZT_SIG_REGS_SIZE(1));
        if (err)
                return -EFAULT;

        return 0;
}

#else /* ! CONFIG_ARM64_SME */

/*
 * Declared but never defined when CONFIG_ARM64_SME is disabled, so that any
 * call which is not optimised out turns into a link error:
 */
extern int preserve_tpidr2_context(void __user *ctx);
extern int restore_tpidr2_context(struct user_ctxs *user);
extern int preserve_za_context(void __user *ctx);
extern int restore_za_context(struct user_ctxs *user);
extern int preserve_zt_context(void __user *ctx);
extern int restore_zt_context(struct user_ctxs *user);

#endif /* ! CONFIG_ARM64_SME */

#ifdef CONFIG_ARM64_GCS

/*
 * Write a gcs_context record at ctx holding the GCS pointer and the
 * task's GCS mode.
 *
 * Returns 0 on success or the error set by __put_user_error().
 */
static int preserve_gcs_context(struct gcs_context __user *ctx)
{
        u64 reported_gcspr = read_sysreg_s(SYS_GCSPR_EL0);
        int err = 0;

        /*
         * With GCS enabled a cap token is added to the frame; the GCSPR_EL0
         * value reported here accounts for it so that stack switching via
         * sigreturn works.  Enabling GCS via sigreturn is not allowed, so
         * the token only matters for threads that already have GCS enabled.
         */
        if (task_gcs_el0_enabled(current))
                reported_gcspr -= 8;

        __put_user_error(GCS_MAGIC, &ctx->head.magic, err);
        __put_user_error(sizeof(struct gcs_context), &ctx->head.size, err);
        __put_user_error(reported_gcspr, &ctx->gcspr, err);
        __put_user_error(0, &ctx->reserved, err);
        __put_user_error(current->thread.gcs_el0_mode,
                         &ctx->features_enabled, err);

        return err;
}

/*
 * Restore the GCS mode and GCSPR_EL0 from the gcs_context record located
 * by parse_user_sigframe().
 *
 * The requested mode is rejected if it contains unknown bits, if
 * gcs_check_locked() refuses it, or if it would enable GCS for a task
 * that does not currently have it enabled.  GCSPR_EL0 itself is not
 * validated here.
 *
 * Returns 0 on success, -EINVAL for a malformed or disallowed record, or
 * the error from __get_user_error() / gcs_check_locked().
 */
static int restore_gcs_context(struct user_ctxs *user)
{
        u64 gcspr, enabled;
        int err = 0;

        if (user->gcs_size != sizeof(*user->gcs))
                return -EINVAL;

        __get_user_error(gcspr, &user->gcs->gcspr, err);
        __get_user_error(enabled, &user->gcs->features_enabled, err);
        if (err)
                return err;

        /* Don't allow unknown modes */
        if (enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
                return -EINVAL;

        err = gcs_check_locked(current, enabled);
        if (err != 0)
                return err;

        /* Don't allow enabling */
        if (!task_gcs_el0_enabled(current) &&
            (enabled & PR_SHADOW_STACK_ENABLE))
                return -EINVAL;

        /* If we are disabling disable everything */
        if (!(enabled & PR_SHADOW_STACK_ENABLE))
                enabled = 0;

        current->thread.gcs_el0_mode = enabled;

        /*
         * We let userspace set GCSPR_EL0 to anything here, we will
         * validate later in gcs_restore_signal().
         */
        write_sysreg_s(gcspr, SYS_GCSPR_EL0);

        return 0;
}

#else /* ! CONFIG_ARM64_GCS */

/*
 * Declared but never defined when CONFIG_ARM64_GCS is disabled, so that any
 * call which is not optimised out turns into a link error:
 */
extern int preserve_gcs_context(void __user *ctx);
extern int restore_gcs_context(struct user_ctxs *user);

#endif /* ! CONFIG_ARM64_GCS */

/*
 * Walk the records in the __reserved[] area of the signal frame at sf and,
 * if an extra_context record is found, in the extra data area it points
 * to.  For each recognised record, store a pointer to it and the size from
 * its header in *user.
 *
 * Pointers for records that are not present are left NULL; the size fields
 * are only written for records that are present.  Duplicate records,
 * records for features the system does not support, unknown magic values,
 * misaligned or out-of-bounds records all make the frame invalid.
 *
 * Returns 0 once a valid terminator record is reached, -EINVAL for an
 * invalid frame, or the error set by __get_user_error() if a header could
 * not be read.
 */
static int parse_user_sigframe(struct user_ctxs *user,
                               struct rt_sigframe __user *sf)
{
        struct sigcontext __user *const sc = &sf->uc.uc_mcontext;
        struct _aarch64_ctx __user *head;
        char __user *base = (char __user *)&sc->__reserved;
        size_t offset = 0;
        size_t limit = sizeof(sc->__reserved);
        bool have_extra_context = false;
        char const __user *const sfp = (char const __user *)sf;

        user->fpsimd = NULL;
        user->sve = NULL;
        user->tpidr2 = NULL;
        user->za = NULL;
        user->zt = NULL;
        user->fpmr = NULL;
        user->poe = NULL;
        user->gcs = NULL;

        if (!IS_ALIGNED((unsigned long)base, 16))
                goto invalid;

        /*
         * base/offset/limit describe the area currently being parsed:
         * __reserved[] to begin with, the extra data area once an
         * extra_context record has been accepted.
         */
        while (1) {
                int err = 0;
                u32 magic, size;
                char const __user *userp;
                struct extra_context const __user *extra;
                u64 extra_datap;
                u32 extra_size;
                struct _aarch64_ctx const __user *end;
                u32 end_magic, end_size;

                /* There must be room left for at least a record header */
                if (limit - offset < sizeof(*head))
                        goto invalid;

                if (!IS_ALIGNED(offset, 16))
                        goto invalid;

                head = (struct _aarch64_ctx __user *)(base + offset);
                __get_user_error(magic, &head->magic, err);
                __get_user_error(size, &head->size, err);
                if (err)
                        return err;

                /* The whole record must lie within the current area */
                if (limit - offset < size)
                        goto invalid;

                switch (magic) {
                case 0:
                        /* Terminator: only valid with a zero size */
                        if (size)
                                goto invalid;

                        goto done;

                case FPSIMD_MAGIC:
                        if (!system_supports_fpsimd())
                                goto invalid;
                        if (user->fpsimd)
                                goto invalid;

                        user->fpsimd = (struct fpsimd_context __user *)head;
                        user->fpsimd_size = size;
                        break;

                case ESR_MAGIC:
                        /* ignore */
                        break;

                case POE_MAGIC:
                        if (!system_supports_poe())
                                goto invalid;

                        if (user->poe)
                                goto invalid;

                        user->poe = (struct poe_context __user *)head;
                        user->poe_size = size;
                        break;

                case SVE_MAGIC:
                        if (!system_supports_sve() && !system_supports_sme())
                                goto invalid;

                        if (user->sve)
                                goto invalid;

                        user->sve = (struct sve_context __user *)head;
                        user->sve_size = size;
                        break;

                case TPIDR2_MAGIC:
                        if (!system_supports_tpidr2())
                                goto invalid;

                        if (user->tpidr2)
                                goto invalid;

                        user->tpidr2 = (struct tpidr2_context __user *)head;
                        user->tpidr2_size = size;
                        break;

                case ZA_MAGIC:
                        if (!system_supports_sme())
                                goto invalid;

                        if (user->za)
                                goto invalid;

                        user->za = (struct za_context __user *)head;
                        user->za_size = size;
                        break;

                case ZT_MAGIC:
                        if (!system_supports_sme2())
                                goto invalid;

                        if (user->zt)
                                goto invalid;

                        user->zt = (struct zt_context __user *)head;
                        user->zt_size = size;
                        break;

                case FPMR_MAGIC:
                        if (!system_supports_fpmr())
                                goto invalid;

                        if (user->fpmr)
                                goto invalid;

                        user->fpmr = (struct fpmr_context __user *)head;
                        user->fpmr_size = size;
                        break;

                case GCS_MAGIC:
                        if (!system_supports_gcs())
                                goto invalid;

                        if (user->gcs)
                                goto invalid;

                        user->gcs = (struct gcs_context __user *)head;
                        user->gcs_size = size;
                        break;

                case EXTRA_MAGIC:
                        if (have_extra_context)
                                goto invalid;

                        if (size < sizeof(*extra))
                                goto invalid;

                        userp = (char const __user *)head;

                        extra = (struct extra_context const __user *)userp;
                        userp += size;

                        __get_user_error(extra_datap, &extra->datap, err);
                        __get_user_error(extra_size, &extra->size, err);
                        if (err)
                                return err;

                        /* Check for the dummy terminator in __reserved[]: */

                        if (limit - offset - size < TERMINATOR_SIZE)
                                goto invalid;

                        end = (struct _aarch64_ctx const __user *)userp;
                        userp += TERMINATOR_SIZE;

                        __get_user_error(end_magic, &end->magic, err);
                        __get_user_error(end_size, &end->size, err);
                        if (err)
                                return err;

                        if (end_magic || end_size)
                                goto invalid;

                        /* Prevent looping/repeated parsing of extra_context */
                        have_extra_context = true;

                        base = (__force void __user *)extra_datap;
                        if (!IS_ALIGNED((unsigned long)base, 16))
                                goto invalid;

                        if (!IS_ALIGNED(extra_size, 16))
                                goto invalid;

                        /*
                         * The extra data must start right after the dummy
                         * terminator that follows the extra_context record.
                         */
                        if (base != userp)
                                goto invalid;

                        /* Reject "unreasonably large" frames: */
                        if (extra_size > sfp + SIGFRAME_MAXSZ - userp)
                                goto invalid;

                        /*
                         * Ignore trailing terminator in __reserved[]
                         * and start parsing extra data:
                         */
                        offset = 0;
                        limit = extra_size;

                        if (!access_ok(base, limit))
                                goto invalid;

                        continue;

                default:
                        goto invalid;
                }

                /* A record can never be smaller than its own header */
                if (size < sizeof(*head))
                        goto invalid;

                if (limit - offset < size)
                        goto invalid;

                offset += size;
        }

done:
        return 0;

invalid:
        return -EINVAL;
}

/*
 * Restore the task state saved in the signal frame at sf: the blocked
 * signal mask, the general purpose registers, sp, pc and pstate, followed
 * by each optional record located by parse_user_sigframe().
 *
 * The POE record, if present, is only read into ua_state; nothing is
 * written to POR_EL0 here.  If the frame has no POE record, ua_state is
 * not written at all.
 *
 * Returns 0 on success and a non-zero value on any failure.
 */
static int restore_sigframe(struct pt_regs *regs,
                            struct rt_sigframe __user *sf,
                            struct user_access_state *ua_state)
{
        sigset_t set;
        int i, err;
        struct user_ctxs user;

        err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
        if (err == 0)
                set_current_blocked(&set);

        for (i = 0; i < 31; i++)
                __get_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i],
                                 err);
        __get_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err);
        __get_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
        __get_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);

        /*
         * Avoid sys_rt_sigreturn() restarting.
         */
        forget_syscall(regs);

        /* Reject register values that userspace is not allowed to set */
        err |= !valid_user_regs(&regs->user_regs, current);
        if (err == 0)
                err = parse_user_sigframe(&user, sf);

        if (err == 0 && system_supports_fpsimd()) {
                /* An FPSIMD record is mandatory when FPSIMD is supported */
                if (!user.fpsimd)
                        return -EINVAL;

                if (user.sve)
                        err = restore_sve_fpsimd_context(&user);
                else
                        err = restore_fpsimd_context(&user);
        }

        if (err == 0 && system_supports_gcs() && user.gcs)
                err = restore_gcs_context(&user);

        if (err == 0 && system_supports_tpidr2() && user.tpidr2)
                err = restore_tpidr2_context(&user);

        if (err == 0 && system_supports_fpmr() && user.fpmr)
                err = restore_fpmr_context(&user);

        if (err == 0 && system_supports_sme() && user.za)
                err = restore_za_context(&user);

        /* Must follow ZA: restore_zt_context() requires ZA to be enabled */
        if (err == 0 && system_supports_sme2() && user.zt)
                err = restore_zt_context(&user);

        if (err == 0 && system_supports_poe() && user.poe)
                err = restore_poe_context(&user, ua_state);

        return err;
}

#ifdef CONFIG_ARM64_GCS
/*
 * Validate and consume the cap token that GCSPR_EL0 points at on signal
 * return.
 *
 * Nothing is done when the system does not support GCS or the task does
 * not have PR_SHADOW_STACK_ENABLE set in its GCS mode.  Otherwise the
 * value at GCSPR_EL0 must equal GCS_SIGNAL_CAP(GCSPR_EL0); it is then
 * overwritten with zero and GCSPR_EL0 is advanced by 8.
 *
 * Returns 0 on success, -EFAULT if the token cannot be read or written,
 * or -EINVAL if the token does not match.
 */
static int gcs_restore_signal(void)
{
        u64 gcspr_el0, cap;
        int ret;

        if (!system_supports_gcs())
                return 0;

        if (!(current->thread.gcs_el0_mode & PR_SHADOW_STACK_ENABLE))
                return 0;

        gcspr_el0 = read_sysreg_s(SYS_GCSPR_EL0);

        /*
         * Ensure that any changes to the GCS done via GCS operations
         * are visible to the normal reads we do to validate the
         * token.
         */
        gcsb_dsync();

        /*
         * GCSPR_EL0 should be pointing at a capped GCS, read the cap.
         * We don't enforce that this is in a GCS page, if it is not
         * then faults will be generated on GCS operations - the main
         * concern is to protect GCS pages.
         */
        ret = copy_from_user(&cap, (unsigned long __user *)gcspr_el0,
                             sizeof(cap));
        if (ret)
                return -EFAULT;

        /*
         * Check that the cap is the actual GCS before replacing it.
         */
        if (cap != GCS_SIGNAL_CAP(gcspr_el0))
                return -EINVAL;

        /* Invalidate the token to prevent reuse */
        put_user_gcs(0, (unsigned long __user *)gcspr_el0, &ret);
        if (ret != 0)
                return -EFAULT;

        /* Step GCSPR_EL0 past the consumed token */
        write_sysreg_s(gcspr_el0 + 8, SYS_GCSPR_EL0);

        return 0;
}

#else
/* Without CONFIG_ARM64_GCS there is nothing to do; always succeeds. */
static int gcs_restore_signal(void) { return 0; }
#endif

/*
 * rt_sigreturn: restore the context saved in the rt_sigframe located at the
 * current user stack pointer.  On any failure the task is sent a segfault
 * notification for that stack pointer and 0 is returned.
 */
SYSCALL_DEFINE0(rt_sigreturn)
{
        struct pt_regs *regs = current_pt_regs();
        struct user_access_state ua_state;
        struct rt_sigframe __user *frame;

        /* Always make any pending restarted system calls return -EINTR */
        current->restart_block.fn = do_no_restart_syscall;

        /*
         * The signal frame is placed on a 16-byte (128-bit) boundary, so a
         * valid 'sp' must have its low four bits clear.
         */
        if (regs->sp & 15)
                goto badframe;

        frame = (struct rt_sigframe __user *)regs->sp;

        /*
         * Each step runs only if the previous one succeeded: validate the
         * frame address, restore the saved register state, consume the GCS
         * signal cap, then restore the alternate signal stack settings.
         */
        if (!access_ok(frame, sizeof(*frame)) ||
            restore_sigframe(regs, frame, &ua_state) ||
            gcs_restore_signal() ||
            restore_altstack(&frame->uc.uc_stack))
                goto badframe;

        restore_user_access_state(&ua_state);

        /* Hand back whatever is now in x0 as the syscall return value. */
        return regs->regs[0];

badframe:
        arm64_notify_segfault(regs->sp);
        return 0;
}

/*
 * Determine the layout of optional records in the signal frame
 *
 * user: layout descriptor that receives the offset of each record
 *        that gets allocated.
 * add_all: if true, lays out the biggest possible signal frame for
 *        this task; otherwise, generates a layout for the current state
 *        of the task.
 *
 * Records are allocated in the order they appear below.  Returns the first
 * non-zero error from sigframe_alloc(), otherwise the result of
 * sigframe_alloc_end().
 */
static int setup_sigframe_layout(struct rt_sigframe_user_layout *user,
                                 bool add_all)
{
        int err;

        if (system_supports_fpsimd()) {
                err = sigframe_alloc(user, &user->fpsimd_offset,
                                     sizeof(struct fpsimd_context));
                if (err)
                        return err;
        }

        /* fault information, if valid */
        if (add_all || current->thread.fault_code) {
                err = sigframe_alloc(user, &user->esr_offset,
                                     sizeof(struct esr_context));
                if (err)
                        return err;
        }

#ifdef CONFIG_ARM64_GCS
        /* GCS record: only when the task has a non-zero GCSPR_EL0, or add_all */
        if (system_supports_gcs() && (add_all || current->thread.gcspr_el0)) {
                err = sigframe_alloc(user, &user->gcs_offset,
                                     sizeof(struct gcs_context));
                if (err)
                        return err;
        }
#endif

        if (system_supports_sve() || system_supports_sme()) {
                unsigned int vq = 0;

                /*
                 * vq stays 0 unless the saved FP state is SVE, streaming
                 * mode is enabled, or add_all is set.  With add_all the
                 * larger of the maximum SVE and SME vector lengths is
                 * used; otherwise the thread's current vector length.
                 */
                if (add_all || current->thread.fp_type == FP_STATE_SVE ||
                    thread_sm_enabled(&current->thread)) {
                        int vl = max(sve_max_vl(), sme_max_vl());

                        if (!add_all)
                                vl = thread_get_cur_vl(&current->thread);

                        vq = sve_vq_from_vl(vl);
                }

                err = sigframe_alloc(user, &user->sve_offset,
                                     SVE_SIG_CONTEXT_SIZE(vq));
                if (err)
                        return err;
        }

        if (system_supports_tpidr2()) {
                err = sigframe_alloc(user, &user->tpidr2_offset,
                                     sizeof(struct tpidr2_context));
                if (err)
                        return err;
        }

        if (system_supports_sme()) {
                unsigned int vl;
                unsigned int vq = 0;

                if (add_all)
                        vl = sme_max_vl();
                else
                        vl = task_get_sme_vl(current);

                /*
                 * NOTE(review): vq is only derived from vl when ZA is
                 * enabled for the current thread; unlike the SVE and ZT
                 * sections, add_all alone does not enlarge this record.
                 * Confirm this is intended for the maximal layout.
                 */
                if (thread_za_enabled(&current->thread))
                        vq = sve_vq_from_vl(vl);

                err = sigframe_alloc(user, &user->za_offset,
                                     ZA_SIG_CONTEXT_SIZE(vq));
                if (err)
                        return err;
        }

        /* ZT record: only when ZA is enabled for the thread, or add_all */
        if (system_supports_sme2()) {
                if (add_all || thread_za_enabled(&current->thread)) {
                        err = sigframe_alloc(user, &user->zt_offset,
                                             ZT_SIG_CONTEXT_SIZE(1));
                        if (err)
                                return err;
                }
        }

        if (system_supports_fpmr()) {
                err = sigframe_alloc(user, &user->fpmr_offset,
                                     sizeof(struct fpmr_context));
                if (err)
                        return err;
        }

        if (system_supports_poe()) {
                err = sigframe_alloc(user, &user->poe_offset,
                                     sizeof(struct poe_context));
                if (err)
                        return err;
        }

        return sigframe_alloc_end(user);
}

/*
 * Fill in the user-space signal frame described by @user.
 *
 * Writes the unwinder frame record, the general-purpose registers, sp, pc,
 * pstate, the fault address and the signal mask @set, followed by each
 * optional context record whose feature is supported and, where checked,
 * whose offset was allocated in the layout.  @ua_state is passed on to the
 * POE record writer.
 *
 * Every optional step is guarded by err == 0, so once an error has been
 * recorded the remaining records are skipped.  Returns 0 on success and a
 * non-zero value otherwise.
 */
static int setup_sigframe(struct rt_sigframe_user_layout *user,
                          struct pt_regs *regs, sigset_t *set,
                          const struct user_access_state *ua_state)
{
        int i, err = 0;
        struct rt_sigframe __user *sf = user->sigframe;

        /* set up the stack frame for unwinding */
        __put_user_error(regs->regs[29], &user->next_frame->fp, err);
        __put_user_error(regs->regs[30], &user->next_frame->lr, err);

        /* x0..x30, then sp, pc and pstate */
        for (i = 0; i < 31; i++)
                __put_user_error(regs->regs[i], &sf->uc.uc_mcontext.regs[i],
                                 err);
        __put_user_error(regs->sp, &sf->uc.uc_mcontext.sp, err);
        __put_user_error(regs->pc, &sf->uc.uc_mcontext.pc, err);
        __put_user_error(regs->pstate, &sf->uc.uc_mcontext.pstate, err);

        __put_user_error(current->thread.fault_address, &sf->uc.uc_mcontext.fault_address, err);

        err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));

        if (err == 0 && system_supports_fpsimd()) {
                struct fpsimd_context __user *fpsimd_ctx =
                        apply_user_offset(user, user->fpsimd_offset);
                err |= preserve_fpsimd_context(fpsimd_ctx);
        }

        /* fault information, if valid */
        if (err == 0 && user->esr_offset) {
                struct esr_context __user *esr_ctx =
                        apply_user_offset(user, user->esr_offset);

                __put_user_error(ESR_MAGIC, &esr_ctx->head.magic, err);
                __put_user_error(sizeof(*esr_ctx), &esr_ctx->head.size, err);
                __put_user_error(current->thread.fault_code, &esr_ctx->esr, err);
        }

        /* Guarded Control Stack state, if a record was laid out */
        if (system_supports_gcs() && err == 0 && user->gcs_offset) {
                struct gcs_context __user *gcs_ctx =
                        apply_user_offset(user, user->gcs_offset);
                err |= preserve_gcs_context(gcs_ctx);
        }

        /* Scalable Vector Extension state (including streaming), if present */
        if ((system_supports_sve() || system_supports_sme()) &&
            err == 0 && user->sve_offset) {
                struct sve_context __user *sve_ctx =
                        apply_user_offset(user, user->sve_offset);
                err |= preserve_sve_context(sve_ctx);
        }

        /* TPIDR2 if supported */
        if (system_supports_tpidr2() && err == 0) {
                struct tpidr2_context __user *tpidr2_ctx =
                        apply_user_offset(user, user->tpidr2_offset);
                err |= preserve_tpidr2_context(tpidr2_ctx);
        }

        /* FPMR if supported */
        if (system_supports_fpmr() && err == 0) {
                struct fpmr_context __user *fpmr_ctx =
                        apply_user_offset(user, user->fpmr_offset);
                err |= preserve_fpmr_context(fpmr_ctx);
        }

        /* POE if supported; the record is written from the saved ua_state */
        if (system_supports_poe() && err == 0) {
                struct poe_context __user *poe_ctx =
                        apply_user_offset(user, user->poe_offset);

                err |= preserve_poe_context(poe_ctx, ua_state);
        }

        /* ZA state if present */
        if (system_supports_sme() && err == 0 && user->za_offset) {
                struct za_context __user *za_ctx =
                        apply_user_offset(user, user->za_offset);
                err |= preserve_za_context(za_ctx);
        }

        /* ZT state if present */
        if (system_supports_sme2() && err == 0 && user->zt_offset) {
                struct zt_context __user *zt_ctx =
                        apply_user_offset(user, user->zt_offset);
                err |= preserve_zt_context(zt_ctx);
        }

        /*
         * Extra-context record, if one was laid out.  It is immediately
         * followed by a terminator record, and the extra data area starts
         * right after that terminator.
         */
        if (err == 0 && user->extra_offset) {
                char __user *sfp = (char __user *)user->sigframe;
                char __user *userp =
                        apply_user_offset(user, user->extra_offset);

                struct extra_context __user *extra;
                struct _aarch64_ctx __user *end;
                u64 extra_datap;
                u32 extra_size;

                extra = (struct extra_context __user *)userp;
                userp += EXTRA_CONTEXT_SIZE;

                end = (struct _aarch64_ctx __user *)userp;
                userp += TERMINATOR_SIZE;

                /*
                 * extra_datap is just written to the signal frame.
                 * The value gets cast back to a void __user *
                 * during sigreturn.
                 */
                extra_datap = (__force u64)userp;
                /* bytes from userp to the 16-byte rounded end of the frame */
                extra_size = sfp + round_up(user->size, 16) - userp;

                __put_user_error(EXTRA_MAGIC, &extra->head.magic, err);
                __put_user_error(EXTRA_CONTEXT_SIZE, &extra->head.size, err);
                __put_user_error(extra_datap, &extra->datap, err);
                __put_user_error(extra_size, &extra->size, err);

                /* Add the terminator */
                __put_user_error(0, &end->magic, err);
                __put_user_error(0, &end->size, err);
        }

        /* set the "end" magic */
        if (err == 0) {
                struct _aarch64_ctx __user *end =
                        apply_user_offset(user, user->end_offset);

                __put_user_error(0, &end->magic, err);
                __put_user_error(0, &end->size, err);
        }

        return err;
}

/*
 * Work out the frame layout for the current task state and place the frame
 * record and signal frame on the stack selected by sigsp().
 *
 * On success user->next_frame and user->sigframe hold the chosen user
 * addresses and 0 is returned.  Returns the layout error if the layout
 * cannot be built, or -EFAULT if the target range fails access_ok().
 */
static int get_sigframe(struct rt_sigframe_user_layout *user,
                         struct ksignal *ksig, struct pt_regs *regs)
{
        unsigned long stack_top, cursor;
        int ret;

        init_user_layout(user);

        ret = setup_sigframe_layout(user, false);
        if (ret)
                return ret;

        stack_top = sigsp(regs->sp, ksig);

        /* The frame record sits at the top, on a 16-byte boundary. */
        cursor = round_down(stack_top - sizeof(struct frame_record), 16);
        user->next_frame = (struct frame_record __user *)cursor;

        /* The signal frame proper goes directly below it. */
        cursor = round_down(cursor, 16) - sigframe_size(user);
        user->sigframe = (struct rt_sigframe __user *)cursor;

        /*
         * Make sure everything from the frame base up to the original
         * stack top may actually be written.
         */
        return access_ok(user->sigframe, stack_top - cursor) ? 0 : -EFAULT;
}

#ifdef CONFIG_ARM64_GCS

/*
 * Prepare the Guarded Control Stack for entry to a signal handler.
 *
 * When GCS is supported and enabled for the current task, two words are
 * stored below the current GCSPR_EL0: the trampoline address at -16 and a
 * signal cap at -8.  GCSPR_EL0 is then lowered by 16.
 *
 * Returns 0 on success or when there is nothing to do, otherwise the error
 * reported by the GCS stores (in which case GCSPR_EL0 is left unchanged).
 */
static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig)
{
        u64 gcs_top;
        int err = 0;

        if (!system_supports_gcs() || !task_gcs_el0_enabled(current))
                return 0;

        /*
         * We are entering a signal handler, so the live register holds the
         * current value.
         */
        gcs_top = read_sysreg_s(SYS_GCSPR_EL0);

        /* GCS entry for the trampoline, then the cap just above it. */
        put_user_gcs((unsigned long)sigtramp,
                     (unsigned long __user *)(gcs_top - 16), &err);
        put_user_gcs(GCS_SIGNAL_CAP(gcs_top - 8),
                     (unsigned long __user *)(gcs_top - 8), &err);
        if (err)
                return err;

        write_sysreg_s(gcs_top - 16, SYS_GCSPR_EL0);

        return 0;
}

#else

/* Built without CONFIG_ARM64_GCS: nothing to push. */
static int gcs_signal_entry(__sigrestore_t sigtramp, struct ksignal *ksig)
{
        return 0;
}

#endif

/*
 * Redirect the user register state so that returning to user space enters
 * the signal handler.
 *
 * The return trampoline is the handler's sa_restorer when SA_RESTORER is
 * set, otherwise the vDSO sigtramp symbol.  The only step that can fail is
 * gcs_signal_entry(), and it runs before any register is modified.
 *
 * Returns 0 on success or the error from gcs_signal_entry().
 */
static int setup_return(struct pt_regs *regs, struct ksignal *ksig,
                         struct rt_sigframe_user_layout *user, int usig)
{
        __sigrestore_t sigtramp;
        int err;

        if (ksig->ka.sa.sa_flags & SA_RESTORER)
                sigtramp = ksig->ka.sa.sa_restorer;
        else
                sigtramp = VDSO_SYMBOL(current->mm->context.vdso, sigtramp);

        err = gcs_signal_entry(sigtramp, ksig);
        if (err)
                return err;

        /*
         * We must not fail from this point onwards. We are going to update
         * registers, including SP, in order to invoke the signal handler. If
         * we failed and attempted to deliver a nested SIGSEGV to a handler
         * after that point, the subsequent sigreturn would end up restoring
         * the (partial) state for the original signal handler.
         */

        /* x0 = signal number; x1/x2 = siginfo and ucontext for SA_SIGINFO */
        regs->regs[0] = usig;
        if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
                regs->regs[1] = (unsigned long)&user->sigframe->info;
                regs->regs[2] = (unsigned long)&user->sigframe->uc;
        }
        /* sp -> signal frame, x29 -> frame record, x30 -> trampoline */
        regs->sp = (unsigned long)user->sigframe;
        regs->regs[29] = (unsigned long)&user->next_frame->fp;
        regs->regs[30] = (unsigned long)sigtramp;
        regs->pc = (unsigned long)ksig->ka.sa.sa_handler;

        /*
         * Signal delivery is a (wacky) indirect function call in
         * userspace, so simulate the same setting of BTYPE as a BLR
         * <register containing the signal handler entry point>.
         * Signal delivery to a location in a PROT_BTI guarded page
         * that is not a function entry point will now trigger a
         * SIGILL in userspace.
         *
         * If the signal handler entry point is not in a PROT_BTI
         * guarded page, this is harmless.
         */
        if (system_supports_bti()) {
                regs->pstate &= ~PSR_BTYPE_MASK;
                regs->pstate |= PSR_BTYPE_C;
        }

        /* TCO (Tag Check Override) always cleared for signal handlers */
        regs->pstate &= ~PSR_TCO_BIT;

        /* Signal handlers are invoked with ZA and streaming mode disabled */
        if (system_supports_sme()) {
                /*
                 * If we were in streaming mode the saved register
                 * state was SVE but we will exit SM and use the
                 * FPSIMD register state - flush the saved FPSIMD
                 * register state in case it gets loaded.
                 */
                if (current->thread.svcr & SVCR_SM_MASK) {
                        memset(&current->thread.uw.fpsimd_state, 0,
                               sizeof(current->thread.uw.fpsimd_state));
                        current->thread.fp_type = FP_STATE_FPSIMD;
                }

                /* Clear ZA and SM in the saved SVCR, then stop them. */
                current->thread.svcr &= ~(SVCR_ZA_MASK |
                                          SVCR_SM_MASK);
                sme_smstop();
        }

        return 0;
}

/*
 * Build the rt signal frame on the user stack and redirect the registers
 * to the handler.
 *
 * Returns 0 on success, 1 if no usable frame location could be obtained,
 * or the accumulated non-zero error from writing the frame / setting up
 * the return.  On failure after the user access state was saved, that
 * state is restored before returning.
 */
static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set,
                          struct pt_regs *regs)
{
        struct rt_sigframe_user_layout layout;
        struct user_access_state saved_ua;
        struct rt_sigframe __user *sf;
        int rc = 0;

        fpsimd_signal_preserve_current_state();

        if (get_sigframe(&layout, ksig, regs) != 0)
                return 1;

        save_reset_user_access_state(&saved_ua);
        sf = layout.sigframe;

        __put_user_error(0, &sf->uc.uc_flags, rc);
        __put_user_error(NULL, &sf->uc.uc_link, rc);

        rc |= __save_altstack(&sf->uc.uc_stack, regs->sp);
        rc |= setup_sigframe(&layout, regs, set, &saved_ua);
        if (ksig->ka.sa.sa_flags & SA_SIGINFO)
                rc |= copy_siginfo_to_user(&sf->info, &ksig->info);

        /* Only touch the registers once the whole frame has been written. */
        if (!rc)
                rc = setup_return(regs, ksig, &layout, usig);

        /*
         * Nothing below may fail once setup_return() has succeeded - see
         * the comment at the beginning of setup_return().
         */
        if (rc) {
                restore_user_access_state(&saved_ua);
                return rc;
        }

        set_handler_user_access_state();
        return 0;
}

/*
 * Arrange for restart_syscall to be invoked: native tasks get its syscall
 * number in x8, compat tasks are handed to the compat helper.
 */
static void setup_restart_syscall(struct pt_regs *regs)
{
        if (!is_compat_task()) {
                regs->regs[8] = __NR_restart_syscall;
                return;
        }

        compat_setup_restart_syscall(regs);
}

/*
 * Deliver one signal to its user-space handler: build the appropriate
 * stack frame (native or compat), sanity-check the resulting registers and
 * report the outcome to the generic signal code.
 */
static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
{
        sigset_t *oldset = sigmask_to_save();
        int usig = ksig->sig;
        int ret;

        rseq_signal_deliver(ksig, regs);

        /* Pick the frame builder matching the task's ABI. */
        if (!is_compat_task())
                ret = setup_rt_frame(usig, ksig, oldset, regs);
        else if (ksig->ka.sa.sa_flags & SA_SIGINFO)
                ret = compat_setup_rt_frame(usig, ksig, oldset, regs);
        else
                ret = compat_setup_frame(usig, ksig, oldset, regs);

        /* Invalid resulting user registers count as a failure too. */
        ret |= !valid_user_regs(&regs->user_regs, current);

        /* Step into the signal handler if we are stepping */
        signal_setup_done(ret, ksig, test_thread_flag(TIF_SINGLESTEP));
}

/*
 * Note that 'init' is a special process: it doesn't get signals it doesn't
 * want to handle. Thus you cannot kill init even with a SIGKILL even by
 * mistake.
 *
 * Note that we go through the signals twice: once to check the signals that
 * the kernel can handle, and then we build all the user-level signal handling
 * stack-frames in one go after that.
 *
 * If the task was in a system call, the restart decision is made up front
 * (PC rewound, x0 reloaded from orig_x0) and then reverted if a handler is
 * about to run and the return code / SA_RESTART setting says the call must
 * fail with -EINTR instead.  When no signal is delivered, the saved signal
 * mask is restored.
 */
void do_signal(struct pt_regs *regs)
{
        unsigned long continue_addr = 0, restart_addr = 0;
        int retval = 0;
        struct ksignal ksig;
        bool syscall = in_syscall(regs);

        /*
         * If we were from a system call, check for system call restarting...
         */
        if (syscall) {
                continue_addr = regs->pc;
                /* restart PC: 2 bytes back in compat Thumb mode, else 4 */
                restart_addr = continue_addr - (compat_thumb_mode(regs) ? 2 : 4);
                retval = regs->regs[0];

                /*
                 * Avoid additional syscall restarting via ret_to_user.
                 */
                forget_syscall(regs);

                /*
                 * Prepare for system call restart. We do this here so that a
                 * debugger will see the already changed PC.
                 */
                switch (retval) {
                case -ERESTARTNOHAND:
                case -ERESTARTSYS:
                case -ERESTARTNOINTR:
                case -ERESTART_RESTARTBLOCK:
                        regs->regs[0] = regs->orig_x0;
                        regs->pc = restart_addr;
                        break;
                }
        }

        /*
         * Get the signal to deliver. When running under ptrace, at this point
         * the debugger may change all of our registers.
         */
        if (get_signal(&ksig)) {
                /*
                 * Depending on the signal settings, we may need to revert the
                 * decision to restart the system call, but skip this if a
                 * debugger has chosen to restart at a different PC.
                 */
                if (regs->pc == restart_addr &&
                    (retval == -ERESTARTNOHAND ||
                     retval == -ERESTART_RESTARTBLOCK ||
                     (retval == -ERESTARTSYS &&
                      !(ksig.ka.sa.sa_flags & SA_RESTART)))) {
                        syscall_set_return_value(current, regs, -EINTR, 0);
                        regs->pc = continue_addr;
                }

                handle_signal(&ksig, regs);
                return;
        }

        /*
         * Handle restarting a different system call. As above, if a debugger
         * has chosen to restart at a different PC, ignore the restart.
         */
        if (syscall && regs->pc == restart_addr) {
                if (retval == -ERESTART_RESTARTBLOCK)
                        setup_restart_syscall(regs);
                user_rewind_single_step(current);
        }

        restore_saved_sigmask();
}

/* Stack space needed for signal delivery; set once by minsigstksz_setup(). */
unsigned long __ro_after_init signal_minsigstksz;

/*
 * Determine the stack space required for guaranteed signal delivery.
 * This function is used to populate AT_MINSIGSTKSZ at process startup.
 * cpufeatures setup is assumed to be complete.
 */
void __init minsigstksz_setup(void)
{
        struct rt_sigframe_user_layout layout;

        init_user_layout(&layout);

        /*
         * A failure here means SIGFRAME_MAXSZ needs to be enlarged.  In
         * that case signal_minsigstksz is left untouched: the existing
         * value won't be big enough, but it's our best guess.
         */
        if (WARN_ON(setup_sigframe_layout(&layout, true)))
                return;

        /*
         * Largest possible frame, plus the 16-byte rounded frame record,
         * plus 16 bytes of worst-case alignment padding.
         */
        signal_minsigstksz = 16 +
                round_up(sizeof(struct frame_record), 16) +
                sigframe_size(&layout);
}

/*
 * Compile-time assertions for siginfo_t offsets. Check NSIG* as well, as
 * changes likely come with new fields that should be added below.
 *
 * A failing assertion breaks the build.
 */
/* NSIG* constants */
static_assert(NSIGILL        == 11);
static_assert(NSIGFPE        == 15);
static_assert(NSIGSEGV        == 10);
static_assert(NSIGBUS        == 5);
static_assert(NSIGTRAP        == 6);
static_assert(NSIGCHLD        == 6);
static_assert(NSIGSYS        == 2);
/* Overall size and alignment of siginfo_t */
static_assert(sizeof(siginfo_t) == 128);
static_assert(__alignof__(siginfo_t) == 8);
/* Offsets of the individual fields */
static_assert(offsetof(siginfo_t, si_signo)        == 0x00);
static_assert(offsetof(siginfo_t, si_errno)        == 0x04);
static_assert(offsetof(siginfo_t, si_code)        == 0x08);
static_assert(offsetof(siginfo_t, si_pid)        == 0x10);
static_assert(offsetof(siginfo_t, si_uid)        == 0x14);
static_assert(offsetof(siginfo_t, si_tid)        == 0x10);
static_assert(offsetof(siginfo_t, si_overrun)        == 0x14);
static_assert(offsetof(siginfo_t, si_status)        == 0x18);
static_assert(offsetof(siginfo_t, si_utime)        == 0x20);
static_assert(offsetof(siginfo_t, si_stime)        == 0x28);
static_assert(offsetof(siginfo_t, si_value)        == 0x18);
static_assert(offsetof(siginfo_t, si_int)        == 0x18);
static_assert(offsetof(siginfo_t, si_ptr)        == 0x18);
static_assert(offsetof(siginfo_t, si_addr)        == 0x10);
static_assert(offsetof(siginfo_t, si_addr_lsb)        == 0x18);
static_assert(offsetof(siginfo_t, si_lower)        == 0x20);
static_assert(offsetof(siginfo_t, si_upper)        == 0x28);
static_assert(offsetof(siginfo_t, si_pkey)        == 0x20);
static_assert(offsetof(siginfo_t, si_perf_data)        == 0x18);
static_assert(offsetof(siginfo_t, si_perf_type)        == 0x20);
static_assert(offsetof(siginfo_t, si_perf_flags) == 0x24);
static_assert(offsetof(siginfo_t, si_band)        == 0x10);
static_assert(offsetof(siginfo_t, si_fd)        == 0x18);
static_assert(offsetof(siginfo_t, si_call_addr)        == 0x10);
static_assert(offsetof(siginfo_t, si_syscall)        == 0x18);
static_assert(offsetof(siginfo_t, si_arch)        == 0x1c);






























































































































































































































































































































































































































































































  265 














































































































































































































































  261 


  261 
  261 
  261 

  261 



































































































































































































































































































































































































































































































































































  261 










  261 



























    3 



    3 























  265 









  265 


























    3 

  111 

  261 







  223 
  235 
















  235 










  265 





  265 




































































































































































































































































































































































































































































































































































  235 






  235 





































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  fs/eventpoll.c (Efficient event retrieval implementation)
 *  Copyright (C) 2001,...,2009         Davide Libenzi
 *
 *  Davide Libenzi <davidel@xmailserver.org>
 */

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <linux/mutex.h>
#include <linux/anon_inodes.h>
#include <linux/device.h>
#include <linux/uaccess.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <linux/atomic.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/compat.h>
#include <linux/rculist.h>
#include <linux/capability.h>
#include <net/busy_poll.h>

/*
 * LOCKING:
 * There are three levels of locking required by epoll:
 *
 * 1) epnested_mutex (mutex)
 * 2) ep->mtx (mutex)
 * 3) ep->lock (rwlock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a rwlock (ep->lock) because we manipulate objects
 * from inside the poll callback, that might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * mutex (ep->mtx). It is acquired during the event transfer loop,
 * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
 * The epnested_mutex is acquired when inserting an epoll fd onto another
 * epoll fd. We do this so that we walk the epoll tree and ensure that this
 * insertion does not create a cycle of epoll file descriptors, which
 * could lead to deadlock. We need a global mutex to prevent two
 * simultaneous inserts (A into B and B into A) from racing and
 * constructing a cycle without either insert observing that it is
 * going to.
 * It is necessary to acquire multiple "ep->mtx"es at once in the
 * case when one epoll fd is added to another. In this case, we
 * always acquire the locks in the order of nesting (i.e. after
 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
 * before e2->mtx). Since we disallow cycles of epoll file
 * descriptors, this ensures that the mutexes are well-ordered. In
 * order to communicate this nesting to lockdep, when walking a tree
 * of epoll file descriptors, we use the current recursion depth as
 * the lockdep subkey.
 * It is possible to drop the "ep->mtx" and to use the global
 * mutex "epnested_mutex" (together with "ep->lock") to have it working,
 * but having "ep->mtx" will make the interface more scalable.
 * Events that require holding "epnested_mutex" are very rare, while for
 * normal operations the epoll private "ep->mtx" will guarantee
 * a better scalability.
 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)

/* The two plain readiness directions: readable and writable */
#define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT)

/*
 * NOTE(review): presumably the complete set of event bits a caller may
 * combine with EPOLLEXCLUSIVE - the user is outside this part of the file,
 * confirm against the epoll_ctl() path.
 */
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \
                                EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)

/* Maximum depth of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4

/*
 * Number of "struct epoll_event" entries that fit in INT_MAX bytes.
 * NOTE(review): presumably the upper bound for the maxevents argument of
 * epoll_wait() - confirm at the call site.
 */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/*
 * Non-NULL sentinel pointer value. ep->ovflist holds it while the overflow
 * list is not in use (see ep_events_available() and ep_done_scan()),
 * epi->next is reset to it in ep_done_scan(), and tfile_check_list starts
 * out with it.
 */
#define EP_UNACTIVE_PTR ((void *) -1L)

/*
 * Bytes taken by one epitem plus one eppoll_entry.
 * NOTE(review): presumably the per-watch memory cost used when sizing
 * max_user_watches - confirm in the init code.
 */
#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry))

/*
 * A (file pointer, file descriptor number) pair. It is filled in by
 * ep_set_ffd() and ordered by ep_cmp_ffd(), which makes it the key of the
 * eventpoll RB tree. Declared __packed so no padding is added to it.
 */
struct epoll_filefd {
        struct file *file;
        int fd;
} __packed;

/* Wait structure used by the poll hooks */
struct eppoll_entry {
        /*
         * Next entry in the singly linked chain that starts at the owning
         * "struct epitem"->pwqlist (NULL terminates the chain).
         */
        struct eppoll_entry *next;

        /* The "base" pointer is set to the container "struct epitem" */
        struct epitem *base;

        /*
         * Wait queue item that will be linked to the target file wait
         * queue head.
         */
        wait_queue_entry_t wait;

        /*
         * The wait queue head that linked the "wait" wait queue item.
         * ep_remove_wait_queue() reads it with smp_load_acquire() and
         * treats NULL as "nothing to detach from".
         */
        wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct: there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
        union {
                /* RB tree node links this structure to the eventpoll RB tree */
                struct rb_node rbn;
                /* Used to free the struct epitem */
                struct rcu_head rcu;
        };

        /* List header used to link this structure to the eventpoll ready list */
        struct list_head rdllink;

        /*
         * Works together with "struct eventpoll"->ovflist in keeping the
         * singly linked chain of items. ep_done_scan() resets it to
         * EP_UNACTIVE_PTR once the item has been taken off that chain.
         */
        struct epitem *next;

        /* The file descriptor information this item refers to */
        struct epoll_filefd ffd;

        /*
         * Protected by file->f_lock, true for to-be-released epitem already
         * removed from the "struct file" items list; together with
         * eventpoll->refcount orchestrates "struct eventpoll" disposal
         */
        bool dying;

        /* Head of the singly linked chain of poll wait queue entries */
        struct eppoll_entry *pwqlist;

        /* The "container" of this item */
        struct eventpoll *ep;

        /* List header used to link this item to the "struct file" items list */
        struct hlist_node fllink;

        /* wakeup_source used when EPOLLWAKEUP is set */
        struct wakeup_source __rcu *ws;

        /* The structure that describes the interested events and the source fd */
        struct epoll_event event;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
        /*
         * This mutex is used to ensure that files are not removed
         * while epoll is using them. This is held during the event
         * collection loop, the file cleanup path, the epoll file exit
         * code and the ctl operations.
         */
        struct mutex mtx;

        /* Wait queue used by sys_epoll_wait() */
        wait_queue_head_t wq;

        /* Wait queue used by file->poll() */
        wait_queue_head_t poll_wait;

        /* List of ready file descriptors */
        struct list_head rdllist;

        /* Lock which protects rdllist and ovflist */
        rwlock_t lock;

        /* RB tree root used to store monitored fd structs */
        struct rb_root_cached rbr;

        /*
         * This is a singly linked list that chains all the "struct epitem"
         * that became ready while transferring ready events to userspace
         * w/out holding ->lock. It holds EP_UNACTIVE_PTR while no transfer
         * is in progress; ep_start_scan() sets it to NULL and ep_done_scan()
         * drains it and restores EP_UNACTIVE_PTR.
         */
        struct epitem *ovflist;

        /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
        struct wakeup_source *ws;

        /* The user that created the eventpoll descriptor */
        struct user_struct *user;

        /* NOTE(review): presumably the epoll file backing this instance - confirm */
        struct file *file;

        /* used to optimize loop detection check */
        u64 gen;
        /* NOTE(review): users are outside this part of the file - confirm purpose */
        struct hlist_head refs;

        /*
         * usage count, used together with epitem->dying to
         * orchestrate the disposal of this struct
         */
        refcount_t refcount;

#ifdef CONFIG_NET_RX_BUSY_POLL
        /* used to track busy poll napi_id */
        unsigned int napi_id;
        /* busy poll timeout; 0 falls back to busy_loop_timeout() */
        u32 busy_poll_usecs;
        /* busy poll packet budget; 0 selects BUSY_POLL_BUDGET */
        u16 busy_poll_budget;
        /* passed to napi_busy_loop(); also gates NAPI irq suspend/resume */
        bool prefer_busy_poll;
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* tracks wakeup nests for lockdep validation */
        u8 nests;
#endif
};

/*
 * Wrapper struct used by poll queueing: bundles a poll_table with the
 * epitem it is being used for.
 */
struct ep_pqueue {
        poll_table pt;
        struct epitem *epi;
};

/*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
/* Maximum number of epoll watched descriptors, per user */
static long max_user_watches __read_mostly;

/*
 * Used for cycles detection: the global mutex listed first in the lock
 * ordering described in the LOCKING comment at the top of this file.
 */
static DEFINE_MUTEX(epnested_mutex);

/*
 * NOTE(review): presumably the global counterpart of "struct eventpoll"->gen
 * used by the loop check - its users are outside this part of the file.
 */
static u64 loop_check_gen = 0;

/* Used to check for epoll file descriptor inclusion loops */
static struct eventpoll *inserting_into;

/* Slab cache used to allocate "struct epitem" */
static struct kmem_cache *epi_cache __ro_after_init;

/* Slab cache used to allocate "struct eppoll_entry" */
static struct kmem_cache *pwq_cache __ro_after_init;

/*
 * List of files with newly added links, where we may need to limit the number
 * of emanating paths. Protected by the epnested_mutex.
 *
 * Each element wraps the hlist of epitems attached to one file; "next"
 * chains the elements together. list_file() treats a NULL "next" as "not
 * on the list", and the chain itself is terminated by EP_UNACTIVE_PTR.
 */
struct epitems_head {
        struct hlist_head epitems;
        struct epitems_head *next;
};
static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR;

/* Slab cache that "struct epitems_head" objects are returned to by free_ephead() */
static struct kmem_cache *ephead_cache __ro_after_init;

/* Give @head back to ephead_cache; a NULL @head is accepted and ignored. */
static inline void free_ephead(struct epitems_head *head)
{
        if (!head)
                return;

        kmem_cache_free(ephead_cache, head);
}

/*
 * Push the epitems_head embedding @file->f_ep onto the front of
 * tfile_check_list. A head whose ->next is already non-NULL is left alone.
 */
static void list_file(struct file *file)
{
        struct epitems_head *ehead =
                container_of(file->f_ep, struct epitems_head, epitems);

        if (ehead->next)
                return;

        ehead->next = tfile_check_list;
        tfile_check_list = ehead;
}

static void unlist_file(struct epitems_head *head)
{
        struct epitems_head *to_free = head;
        struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems));
        if (p) {
                struct epitem *epi= container_of(p, struct epitem, fllink);
                spin_lock(&epi->ffd.file->f_lock);
                if (!hlist_empty(&head->epitems))
                        to_free = NULL;
                head->next = NULL;
                spin_unlock(&epi->ffd.file->f_lock);
        }
        free_ephead(to_free);
}

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

static long long_zero;
static long long_max = LONG_MAX;

static const struct ctl_table epoll_table[] = {
        {
                .procname        = "max_user_watches",
                .data                = &max_user_watches,
                .maxlen                = sizeof(max_user_watches),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = &long_zero,
                .extra2                = &long_max,
        },
};

static void __init epoll_sysctls_init(void)
{
        register_sysctl("fs/epoll", epoll_table);
}
#else
#define epoll_sysctls_init() do { } while (0)
#endif /* CONFIG_SYSCTL */

/* Defined later in this file; declared here so is_file_epoll() can test it. */
static const struct file_operations eventpoll_fops;

/* Non-zero when @f is an epoll file, i.e. its f_op is eventpoll_fops. */
static inline int is_file_epoll(struct file *f)
{
        const struct file_operations *fops = f->f_op;

        return fops == &eventpoll_fops;
}

/* Setup the structure that is used as key for the RB tree */
static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
        ffd->file = file;
        ffd->fd = fd;
}

/*
 * Compare RB tree keys: order by file pointer first, then by descriptor
 * number. Returns a positive value, zero or a negative value when @p1
 * sorts after, equal to or before @p2.
 */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
        if (p1->file > p2->file)
                return +1;
        if (p1->file < p2->file)
                return -1;

        return p1->fd - p2->fd;
}

/* Non-zero when @epi's rdllink is currently on some list */
static inline int ep_is_linked(struct epitem *epi)
{
        struct list_head *link = &epi->rdllink;

        return !list_empty(link);
}

/* Get the "struct eppoll_entry" that embeds the wait queue entry @p */
static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p)
{
        struct eppoll_entry *pwq = container_of(p, struct eppoll_entry, wait);

        return pwq;
}

/* Get the "struct epitem" owning the eppoll_entry that embeds @p */
static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p)
{
        return ep_pwq_from_wait(p)->base;
}

/**
 * ep_events_available - Checks if ready events might be available.
 *
 * @ep: Pointer to the eventpoll context.
 *
 * The ready list is inspected first; only when it looks empty is the
 * overflow list checked against its EP_UNACTIVE_PTR idle marker.
 *
 * Return: a non-zero value if ready events might be available,
 *          or zero otherwise.
 */
static inline int ep_events_available(struct eventpoll *ep)
{
        if (!list_empty_careful(&ep->rdllist))
                return 1;

        return READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR;
}

#ifdef CONFIG_NET_RX_BUSY_POLL
/**
 * busy_loop_ep_timeout - check if busy poll has timed out.
 *
 * @start_time: The start time used to compute the remaining time until timeout.
 * @ep: Pointer to the eventpoll context.
 *
 * A non-zero per-instance ep->busy_poll_usecs takes precedence; when it is
 * zero the decision is delegated to the system-wide busy_loop_timeout().
 *
 * Return: true if the timeout has expired, false otherwise.
 */
static bool busy_loop_ep_timeout(unsigned long start_time,
                                 struct eventpoll *ep)
{
        unsigned long ep_usecs = READ_ONCE(ep->busy_poll_usecs);
        unsigned long deadline;
        unsigned long now;

        if (!ep_usecs)
                return busy_loop_timeout(start_time);

        deadline = start_time + ep_usecs;
        now = busy_loop_current_time();

        return time_after(now, deadline);
}

static bool ep_busy_loop_on(struct eventpoll *ep)
{
        return !!READ_ONCE(ep->busy_poll_usecs) ||
               READ_ONCE(ep->prefer_busy_poll) ||
               net_busy_loop_on();
}

static bool ep_busy_loop_end(void *p, unsigned long start_time)
{
        struct eventpoll *ep = p;

        return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
}

/*
 * Busy poll if globally on and supporting sockets found && no events,
 * busy loop will return if need_resched or ep_events_available.
 *
 * we must do our busy polling with irqs enabled
 *
 * Return: true if events might be available after busy polling, false if
 * busy polling was not attempted or ended without events.
 */
static bool ep_busy_loop(struct eventpoll *ep)
{
        unsigned int napi_id = READ_ONCE(ep->napi_id);
        u16 budget = READ_ONCE(ep->busy_poll_budget);
        bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);

        /* A zero per-instance budget selects the default one */
        if (!budget)
                budget = BUSY_POLL_BUDGET;

        if (!napi_id_valid(napi_id) || !ep_busy_loop_on(ep))
                return false;

        napi_busy_loop(napi_id, ep_busy_loop_end,
                       ep, prefer_busy_poll, budget);
        if (ep_events_available(ep))
                return true;

        /*
         * Busy poll timed out.  Drop NAPI ID for now, we can add
         * it back in when we have moved a socket with a valid NAPI
         * ID onto the ready list.
         */
        if (prefer_busy_poll)
                napi_resume_irqs(napi_id);

        /*
         * ep->napi_id is read locklessly with READ_ONCE() (above and in
         * ep_suspend_napi_irqs()/ep_resume_napi_irqs()), so the store is
         * marked as well.
         */
        WRITE_ONCE(ep->napi_id, 0);
        return false;
}

/*
 * Set epoll busy poll NAPI ID from sk.
 *
 * Does nothing unless busy polling is on for the owning eventpoll and
 * @epi refers to a socket file that has a sock attached.
 */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;
        unsigned int napi_id;
        struct socket *sock;
        struct sock *sk;

        if (!ep_busy_loop_on(ep))
                return;

        sock = sock_from_file(epi->ffd.file);
        if (!sock)
                return;

        sk = sock->sk;
        if (!sk)
                return;

        napi_id = READ_ONCE(sk->sk_napi_id);

        /* Non-NAPI IDs can be rejected
         *        or
         * Nothing to do if we already have this ID
         *
         * ep->napi_id is also accessed locklessly by ep_busy_loop(), hence
         * the marked accesses here.
         */
        if (!napi_id_valid(napi_id) || napi_id == READ_ONCE(ep->napi_id))
                return;

        /* record NAPI ID for use in next busy poll */
        WRITE_ONCE(ep->napi_id, napi_id);
}

static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        struct eventpoll *ep = file->private_data;
        void __user *uarg = (void __user *)arg;
        struct epoll_params epoll_params;

        switch (cmd) {
        case EPIOCSPARAMS:
                if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
                        return -EFAULT;

                /* pad byte must be zero */
                if (epoll_params.__pad)
                        return -EINVAL;

                if (epoll_params.busy_poll_usecs > S32_MAX)
                        return -EINVAL;

                if (epoll_params.prefer_busy_poll > 1)
                        return -EINVAL;

                if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
                    !capable(CAP_NET_ADMIN))
                        return -EPERM;

                WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
                WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
                WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
                return 0;
        case EPIOCGPARAMS:
                memset(&epoll_params, 0, sizeof(epoll_params));
                epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
                epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
                epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
                if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
                        return -EFAULT;
                return 0;
        default:
                return -ENOIOCTLCMD;
        }
}

/*
 * Call napi_suspend_irqs() for the NAPI ID recorded in @ep, but only when
 * that ID is valid and the eventpoll prefers busy polling.
 */
static void ep_suspend_napi_irqs(struct eventpoll *ep)
{
        unsigned int napi_id = READ_ONCE(ep->napi_id);

        if (!napi_id_valid(napi_id))
                return;

        if (!READ_ONCE(ep->prefer_busy_poll))
                return;

        napi_suspend_irqs(napi_id);
}

/*
 * Call napi_resume_irqs() for the NAPI ID recorded in @ep, but only when
 * that ID is valid and the eventpoll prefers busy polling.
 */
static void ep_resume_napi_irqs(struct eventpoll *ep)
{
        unsigned int napi_id = READ_ONCE(ep->napi_id);

        if (!napi_id_valid(napi_id))
                return;

        if (!READ_ONCE(ep->prefer_busy_poll))
                return;

        napi_resume_irqs(napi_id);
}

#else

/*
 * Stubs used when CONFIG_NET_RX_BUSY_POLL is not set: busy polling never
 * finds events, there is no NAPI ID to record, the busy poll ioctls are
 * reported as unsupported, and suspending/resuming NAPI irqs does nothing.
 */

/* Never busy polls, so never reports events. */
static inline bool ep_busy_loop(struct eventpoll *ep)
{
        return false;
}

/* Nothing to record without busy poll support. */
static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
{
}

/* Every command is rejected with -EOPNOTSUPP. */
static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        return -EOPNOTSUPP;
}

/* No-op without busy poll support. */
static void ep_suspend_napi_irqs(struct eventpoll *ep)
{
}

/* No-op without busy poll support. */
static void ep_resume_napi_irqs(struct eventpoll *ep)
{
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * As described in commit 0ccf831cb lockdep: annotate epoll
 * the use of wait queues used by epoll is done in a very controlled
 * manner. Wake ups can nest inside each other, but are never done
 * with the same locking. For example:
 *
 *   dfd = socket(...);
 *   efd1 = epoll_create();
 *   efd2 = epoll_create();
 *   epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
 *   epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
 *
 * When a packet arrives to the device underneath "dfd", the net code will
 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
 * callback wakeup entry on that queue, and the wake_up() performed by the
 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
 * (efd1) notices that it may have some event ready, so it needs to wake up
 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
 * that ends up in another wake_up(), after having checked about the
 * recursion constraints. That is, no more than EP_MAX_NESTS, to avoid
 * stack blasting.
 *
 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
 * this special case of epoll.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC

static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             unsigned pollflags)
{
        struct eventpoll *ep_src;
        unsigned long flags;
        u8 nests = 0;

        /*
         * To set the subclass or nesting level for spin_lock_irqsave_nested()
         * it might be natural to create a per-cpu nest count. However, since
         * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can
         * schedule() in the -rt kernel, the per-cpu variable are no longer
         * protected. Thus, we are introducing a per eventpoll nest field.
         * If we are not being call from ep_poll_callback(), epi is NULL and
         * we are at the first level of nesting, 0. Otherwise, we are being
         * called from ep_poll_callback() and if a previous wakeup source is
         * not an epoll file itself, we are at depth 1 since the wakeup source
         * is depth 0. If the wakeup source is a previous epoll file in the
         * wakeup chain then we use its nests value and record ours as
         * nests + 1. The previous epoll file nests value is stable since its
         * already holding its own poll_wait.lock.
         */
        if (epi) {
                if ((is_file_epoll(epi->ffd.file))) {
                        ep_src = epi->ffd.file->private_data;
                        nests = ep_src->nests;
                } else {
                        nests = 1;
                }
        }
        spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests);
        ep->nests = nests + 1;
        wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags);
        ep->nests = 0;
        spin_unlock_irqrestore(&ep->poll_wait.lock, flags);
}

#else

/*
 * Wake up the waiters on @ep->poll_wait with EPOLLIN plus @pollflags as
 * the wakeup key. @epi is unused in this variant; it only matters for the
 * CONFIG_DEBUG_LOCK_ALLOC one.
 */
static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi,
                             __poll_t pollflags)
{
        wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags);
}

#endif

/*
 * Detach @pwq->wait from the wait queue head recorded in @pwq->whead.
 * Nothing is done when ->whead reads as NULL. The whole operation runs
 * inside an RCU read-side critical section.
 */
static void ep_remove_wait_queue(struct eppoll_entry *pwq)
{
        wait_queue_head_t *whead;

        rcu_read_lock();
        /*
         * If it is cleared by POLLFREE, it should be rcu-safe.
         * If we read NULL we need a barrier paired with
         * smp_store_release() in ep_poll_callback(), otherwise
         * we rely on whead->lock.
         */
        whead = smp_load_acquire(&pwq->whead);
        if (whead)
                remove_wait_queue(whead, &pwq->wait);
        rcu_read_unlock();
}

/*
 * Tear down every poll hook of @epi: each eppoll_entry is popped off the
 * head of epi->pwqlist, detached from its wait queue and returned to
 * pwq_cache. Must be called with "mtx" held. @ep is not used here.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
        struct eppoll_entry *entry;

        for (;;) {
                entry = epi->pwqlist;
                if (!entry)
                        break;

                /* Unlink from the chain before the entry is released */
                epi->pwqlist = entry->next;
                ep_remove_wait_queue(entry);
                kmem_cache_free(pwq_cache, entry);
        }
}

/*
 * Fetch @epi's wakeup_source pointer (may be NULL).
 * Call only when ep->mtx is held; that is what the lockdep condition checks.
 */
static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi)
{
        struct wakeup_source *ws;

        ws = rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx));

        return ws;
}

/*
 * Call __pm_stay_awake() on @epi's wakeup_source, if it has one.
 * Call only when ep->mtx is held.
 */
static inline void ep_pm_stay_awake(struct epitem *epi)
{
        struct wakeup_source *ws = ep_wakeup_source(epi);

        if (!ws)
                return;

        __pm_stay_awake(ws);
}

static inline bool ep_has_wakeup_source(struct epitem *epi)
{
        return rcu_access_pointer(epi->ws) ? true : false;
}

/*
 * Same effect as ep_pm_stay_awake(), for callers that cannot hold ep->mtx
 * (ep_poll_callback): the wakeup_source pointer is read and used inside an
 * RCU read-side critical section instead.
 */
static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
{
        struct wakeup_source *ws;

        rcu_read_lock();
        ws = rcu_dereference(epi->ws);
        if (ws)
                __pm_stay_awake(ws);
        rcu_read_unlock();
}


/*
 * Begin a scan of the ready list: move everything from ep->rdllist onto
 * the caller's @txlist and switch the overflow list on.
 *
 * ep->mtx needs to be held because we could be hit by
 * eventpoll_release_file() and epoll_ctl().
 */
static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
{
        /*
         * Steal the ready list, and re-init the original one to the
         * empty list. Also, set ep->ovflist to NULL so that events
         * happening while looping w/out locks are not lost. We cannot
         * have the poll callback queue directly on ep->rdllist,
         * because we want the "sproc" callback to be able to do it
         * in a lockless way.
         */
        lockdep_assert_irqs_enabled();
        write_lock_irq(&ep->lock);
        list_splice_init(&ep->rdllist, txlist);
        WRITE_ONCE(ep->ovflist, NULL);
        write_unlock_irq(&ep->lock);
}

/*
 * Finish a scan started with ep_start_scan(): under ep->lock, drain the
 * overflow list into ep->rdllist, switch the overflow list off again,
 * put back whatever is left on @txlist, relax ep->ws and wake up ep->wq
 * if the ready list ended up non-empty.
 */
static void ep_done_scan(struct eventpoll *ep,
                         struct list_head *txlist)
{
        struct epitem *epi, *nepi;

        write_lock_irq(&ep->lock);
        /*
         * During the time we spent inside the "sproc" callback, some
         * other events might have been queued by the poll callback.
         * We re-insert them inside the main ready-list here.
         */
        for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL;
             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
                /*
                 * We need to check if the item is already in the list.
                 * During the "sproc" callback execution time, items are
                 * queued into ->ovflist but the "txlist" might already
                 * contain them, and the list_splice() below takes care of them.
                 */
                if (!ep_is_linked(epi)) {
                        /*
                         * ->ovflist is LIFO, so we have to reverse it in order
                         * to keep in FIFO.
                         */
                        list_add(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }
        /*
         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
         * releasing the lock, events will be queued in the normal way inside
         * ep->rdllist.
         */
        WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR);

        /*
         * Quickly re-inject items left on "txlist".
         */
        list_splice(txlist, &ep->rdllist);
        __pm_relax(ep->ws);

        /* Wake up waiters on ep->wq only if something is ready and someone waits */
        if (!list_empty(&ep->rdllist)) {
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
        }

        write_unlock_irq(&ep->lock);
}

/* Take one more reference on @ep (increments ep->refcount). */
static void ep_get(struct eventpoll *ep)
{
        refcount_inc(&ep->refcount);
}

/*
 * Drop one reference on @ep.
 *
 * Returns true if this was the last reference, i.e. the event poll can be
 * disposed. In that case a one-time warning is emitted should the RB tree
 * still hold items.
 */
static bool ep_refcount_dec_and_test(struct eventpoll *ep)
{
        bool last_ref = refcount_dec_and_test(&ep->refcount);

        if (last_ref)
                WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root));

        return last_ref;
}

/*
 * Final disposal of an eventpoll instance: releases what @ep still holds
 * and frees the structure itself. The callers visible in this file invoke
 * it only after a reference drop reported that the eventpoll can be
 * disposed. @ep must not be touched after this returns.
 */
static void ep_free(struct eventpoll *ep)
{
        ep_resume_napi_irqs(ep);
        mutex_destroy(&ep->mtx);
        /* Drops ep->user, which ep_alloc() obtained via get_current_user(). */
        free_uid(ep->user);
        wakeup_source_unregister(ep->ws);
        kfree(ep);
}

/*
 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
 * all the associated resources. Must be called with "mtx" held.
 * If the dying flag is set, do the removal only if force is true.
 * This prevents ep_clear_and_put() from dropping all the ep references
 * while running concurrently with eventpoll_release_file().
 *
 * @ep:    eventpoll instance owning @epi
 * @epi:   item to remove; it is queued for RCU-deferred freeing on success
 * @force: remove the item even if it has been marked as dying
 *
 * Returns true if the eventpoll can be disposed, i.e. the reference dropped
 * on behalf of @epi was the last one. Returns false both when references
 * remain and when the removal was skipped because of the dying flag.
 */
static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
{
        struct file *file = epi->ffd.file;
        struct epitems_head *to_free;
        struct hlist_head *head;

        /* The write_lock_irq()/write_unlock_irq() pair below needs this. */
        lockdep_assert_irqs_enabled();

        /*
         * Removes poll wait queue hooks.
         */
        ep_unregister_pollwait(ep, epi);

        /* Remove the current item from the list of epoll hooks */
        spin_lock(&file->f_lock);
        if (epi->dying && !force) {
                /* Leave the item to whoever set ->dying. */
                spin_unlock(&file->f_lock);
                return false;
        }

        to_free = NULL;
        head = file->f_ep;
        /* True when @epi is the only entry left on the file's hook list. */
        if (head->first == &epi->fllink && !epi->fllink.next) {
                /* See eventpoll_release() for details. */
                WRITE_ONCE(file->f_ep, NULL);
                if (!is_file_epoll(file)) {
                        struct epitems_head *v;
                        v = container_of(head, struct epitems_head, epitems);
                        /*
                         * The head is freed only when v->next is NULL.
                         * NOTE(review): presumably a non-NULL ->next means
                         * the head is still chained on tfile_check_list -
                         * confirm against the loop-check code.
                         */
                        if (!smp_load_acquire(&v->next))
                                to_free = v;
                }
        }
        hlist_del_rcu(&epi->fllink);
        spin_unlock(&file->f_lock);
        /* Called outside f_lock; to_free may be NULL here. */
        free_ephead(to_free);

        rb_erase_cached(&epi->rbn, &ep->rbr);

        /* Drop the item from the ready list if it is still queued there. */
        write_lock_irq(&ep->lock);
        if (ep_is_linked(epi))
                list_del_init(&epi->rdllink);
        write_unlock_irq(&ep->lock);

        wakeup_source_unregister(ep_wakeup_source(epi));
        /*
         * At this point it is safe to free the eventpoll item. Use the union
         * field epi->rcu, since we are trying to minimize the size of
         * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
         * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
         * use of the rbn field.
         */
        kfree_rcu(epi, rcu);

        percpu_counter_dec(&ep->user->epoll_watches);
        return ep_refcount_dec_and_test(ep);
}

/*
 * ep_remove variant for callers owning an additional reference to the ep.
 * Because of that extra reference the non-forced removal is not expected
 * to report the eventpoll as disposable; a one-shot warning fires if it does.
 */
static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi)
{
        WARN_ON_ONCE(__ep_remove(ep, epi, false));
}

/*
 * Tears down everything registered on @ep and drops the caller's
 * reference: sleepers on ep->poll_wait are woken, every item has its poll
 * hooks removed and is then taken out of the tree, and @ep itself is freed
 * if the reference dropped here turns out to be the last one.
 */
static void ep_clear_and_put(struct eventpoll *ep)
{
        struct rb_node *node, *following;
        bool last_ref;

        /* We need to release all tasks waiting for these file */
        if (waitqueue_active(&ep->poll_wait))
                ep_poll_safewake(ep, NULL, 0);

        mutex_lock(&ep->mtx);

        /*
         * First pass over the whole tree: unregister the poll callbacks of
         * every item.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                ep_unregister_pollwait(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = rb_next(node);
        }

        /*
         * Second pass: try to free each "struct epitem". The successor is
         * looked up before the removal because the current node leaves the
         * tree. Note that ep_remove_safe() will not remove the epitem in
         * case of a racing eventpoll_release_file(); the latter will do the
         * removal. At this point we are sure no poll callbacks will be
         * lingering around. Since we still own a reference to the eventpoll
         * struct, the loop can't dispose it.
         */
        node = rb_first_cached(&ep->rbr);
        while (node) {
                following = rb_next(node);
                ep_remove_safe(ep, rb_entry(node, struct epitem, rbn));
                cond_resched();
                node = following;
        }

        last_ref = ep_refcount_dec_and_test(ep);
        mutex_unlock(&ep->mtx);

        if (last_ref)
                ep_free(ep);
}

/*
 * ioctl entry point of the eventpoll file. Only EPIOCSPARAMS and
 * EPIOCGPARAMS are handled, both by ep_eventpoll_bp_ioctl(); any other
 * command, or a file that is not an eventpoll file, yields -EINVAL.
 */
static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        int rc = -EINVAL;

        if (!is_file_epoll(file))
                return -EINVAL;

        if (cmd == EPIOCSPARAMS || cmd == EPIOCGPARAMS)
                rc = ep_eventpoll_bp_ioctl(file, cmd, arg);

        return rc;
}

/*
 * ->release() handler of the eventpoll file: clears the eventpoll stored
 * in file->private_data, if there is one, and drops its reference.
 * Always returns 0.
 */
static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
        struct eventpoll *ep = file->private_data;

        if (!ep)
                return 0;

        ep_clear_and_put(ep);
        return 0;
}

static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth);

/*
 * Polls an eventpoll file itself.
 *
 * @file:  the eventpoll file
 * @wait:  caller's poll table; the caller is queued on ep->poll_wait
 * @depth: nesting level used for the ep->mtx lock annotation
 *
 * Returns EPOLLIN | EPOLLRDNORM as soon as one item on the ready list
 * reports a wanted event, 0 if none does.
 */
static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth)
{
        struct eventpoll *ep = file->private_data;
        struct epitem *item, *next_item;
        __poll_t mask = 0;
        poll_table pt;
        LIST_HEAD(scan_list);

        init_poll_funcptr(&pt, NULL);

        /* Insert inside our poll wait queue */
        poll_wait(file, &ep->poll_wait, wait);

        /*
         * Proceed to find out if wanted events are really available inside
         * the ready list.
         */
        mutex_lock_nested(&ep->mtx, depth);
        ep_start_scan(ep, &scan_list);
        list_for_each_entry_safe(item, next_item, &scan_list, rdllink) {
                if (ep_item_poll(item, &pt, depth + 1)) {
                        mask = EPOLLIN | EPOLLRDNORM;
                        break;
                }

                /*
                 * Item has been dropped into the ready list by the poll
                 * callback, but it's not actually ready, as far as
                 * caller requested events goes. We can remove it here.
                 */
                __pm_relax(ep_wakeup_source(item));
                list_del_init(&item->rdllink);
        }
        ep_done_scan(ep, &scan_list);
        mutex_unlock(&ep->mtx);

        return mask;
}

/*
 * Tries to take a reference on the file an epitem points at.
 *
 * The ffd.file pointer may be in the process of being torn down due to
 * being closed, but we may not have finished eventpoll_release() yet.
 *
 * Normally, even with a get-unless-zero style reference grab, the file
 * may have been free'd and then gotten re-allocated to something else
 * (since files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
 *
 * But for epoll, users hold the ep->mtx mutex, and as such any file in
 * the process of being free'd will block in eventpoll_release_file()
 * and thus the underlying file allocation will not be free'd, and the
 * file re-use cannot happen.
 *
 * For the same reason we can avoid a rcu_read_lock() around the
 * operation - 'ffd.file' cannot go away even if the refcount has
 * reached zero (but we must still not call out to ->poll() functions
 * etc).
 *
 * Returns the file with a reference held, or NULL if no reference could
 * be obtained.
 */
static struct file *epi_fget(const struct epitem *epi)
{
        struct file *target = epi->ffd.file;

        return file_ref_get(&target->f_ref) ? target : NULL;
}

/*
 * Polls the file behind @epi and returns the ready events restricted to
 * the ones the item is interested in.
 *
 * Differs from ep_eventpoll_poll() in that internal callers already have
 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
 * is correctly annotated.
 */
static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt,
                                 int depth)
{
        struct file *target = epi_fget(epi);
        __poll_t ready;

        /*
         * We could return EPOLLERR | EPOLLHUP or something, but let's
         * treat this more as "file doesn't exist, poll didn't happen".
         */
        if (!target)
                return 0;

        pt->_key = epi->event.events;

        /* A nested eventpoll file is polled directly, one level deeper. */
        ready = is_file_epoll(target) ? __ep_eventpoll_poll(target, pt, depth)
                                      : vfs_poll(target, pt);
        fput(target);

        return ready & epi->event.events;
}

/*
 * ->poll() handler of the eventpoll file: entry point for external callers,
 * which start at nesting depth 0.
 */
static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait)
{
        return __ep_eventpoll_poll(file, wait, 0);
}

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: with ep->mtx held, emits one line per item in
 * the eventpoll tree describing the watched descriptor, its event mask and
 * user data, and the position, inode number and device of the target file.
 * Stops early once the seq_file buffer has overflowed.
 */
static void ep_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventpoll *ep = f->private_data;
        struct rb_node *node;

        mutex_lock(&ep->mtx);
        node = rb_first_cached(&ep->rbr);
        while (node) {
                struct epitem *epi = rb_entry(node, struct epitem, rbn);
                struct file *tfile = epi->ffd.file;
                struct inode *inode = file_inode(tfile);

                seq_printf(m, "tfd: %8d events: %8x data: %16llx "
                           " pos:%lli ino:%lx sdev:%x\n",
                           epi->ffd.fd, epi->event.events,
                           (long long)epi->event.data,
                           (long long)tfile->f_pos,
                           inode->i_ino, inode->i_sb->s_dev);
                if (seq_has_overflowed(m))
                        break;

                node = rb_next(node);
        }
        mutex_unlock(&ep->mtx);
}
#endif

/*
 * File callbacks that implement the eventpoll file behaviour.
 * The fdinfo hook is only present when CONFIG_PROC_FS is enabled, matching
 * the conditional definition of ep_show_fdinfo().
 */
static const struct file_operations eventpoll_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = ep_show_fdinfo,
#endif
        .release        = ep_eventpoll_release,
        .poll                = ep_eventpoll_poll,
        .llseek                = noop_llseek,
        .unlocked_ioctl        = ep_eventpoll_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need to have this facility to cleanup correctly files that are
 * closed without being removed from the eventpoll interface.
 *
 * Items hooked on @file are removed one at a time: each pass picks the first
 * entry of file->f_ep under f_lock, removes it with ep->mtx held, and then
 * starts over until the hook list is empty or gone.
 */
void eventpoll_release_file(struct file *file)
{
        struct eventpoll *ep;
        struct epitem *epi;
        bool dispose;

        /*
         * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
         * touching the epitems list before eventpoll_release_file() can access
         * the ep->mtx.
         */
again:
        spin_lock(&file->f_lock);
        if (file->f_ep && file->f_ep->first) {
                epi = hlist_entry(file->f_ep->first, struct epitem, fllink);
                epi->dying = true;
                /* f_lock is dropped before ep->mtx is taken below. */
                spin_unlock(&file->f_lock);

                /*
                 * ep access is safe as we still own a reference to the ep
                 * struct
                 */
                ep = epi->ep;
                mutex_lock(&ep->mtx);
                /* force=true: the dying flag set above must not block us. */
                dispose = __ep_remove(ep, epi, true);
                mutex_unlock(&ep->mtx);

                /* The item's reference was the last one on the eventpoll. */
                if (dispose)
                        ep_free(ep);
                goto again;
        }
        spin_unlock(&file->f_lock);
}

/*
 * Allocates and initializes a new eventpoll instance.
 *
 * @pep: receives the new instance on success; left untouched on failure
 *
 * The structure is zero-allocated, so fields not set below start out as
 * zero/NULL. The instance is returned holding a single reference.
 *
 * Returns 0 on success or -ENOMEM if the allocation failed.
 */
static int ep_alloc(struct eventpoll **pep)
{
        struct eventpoll *ep;

        ep = kzalloc(sizeof(*ep), GFP_KERNEL);
        if (unlikely(!ep))
                return -ENOMEM;

        mutex_init(&ep->mtx);
        rwlock_init(&ep->lock);
        init_waitqueue_head(&ep->wq);
        init_waitqueue_head(&ep->poll_wait);
        INIT_LIST_HEAD(&ep->rdllist);
        ep->rbr = RB_ROOT_CACHED;
        /* EP_UNACTIVE_PTR: ready events go to ->rdllist, not ->ovflist. */
        ep->ovflist = EP_UNACTIVE_PTR;
        /* Released again with free_uid() in ep_free(). */
        ep->user = get_current_user();
        refcount_set(&ep->refcount, 1);

        *pep = ep;

        return 0;
}

/*
 * Search the file inside the eventpoll tree. The RB tree operations
 * are protected by the "mtx" mutex, and ep_find() must be called with
 * "mtx" held.
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
        int kcmp;
        struct rb_node *rbp;
        struct epitem *epi, *epir = NULL;
        struct epoll_filefd ffd;

        ep_set_ffd(&ffd, file, fd);
        for (rbp = ep->rbr.rb_root.rb_node; rbp; ) {
                epi = rb_entry(rbp, struct epitem, rbn);
                kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
                if (kcmp > 0)
                        rbp = rbp->rb_right;
                else if (kcmp < 0)
                        rbp = rbp->rb_left;
                else {
                        epir = epi;
                        break;
                }
        }

        return epir;
}

#ifdef CONFIG_KCMP
/*
 * Walks the eventpoll tree in order and returns the item whose descriptor
 * number equals @tfd, skipping the first @toff matches. Returns NULL when
 * fewer than @toff + 1 items match.
 */
static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff)
{
        struct rb_node *node = rb_first_cached(&ep->rbr);

        while (node) {
                struct epitem *candidate = rb_entry(node, struct epitem, rbn);

                if (candidate->ffd.fd == tfd) {
                        if (!toff)
                                return candidate;
                        toff--;
                }
                cond_resched();
                node = rb_next(node);
        }

        return NULL;
}

/*
 * Returns the raw target file pointer of the item selected by @tfd and
 * @toff inside the eventpoll @file. No reference is taken on the returned
 * file.
 *
 * Returns ERR_PTR(-EINVAL) if @file is not an eventpoll file and
 * ERR_PTR(-ENOENT) if no such item exists.
 */
struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
                                     unsigned long toff)
{
        struct eventpoll *ep;
        struct epitem *match;
        struct file *found;

        if (!is_file_epoll(file))
                return ERR_PTR(-EINVAL);

        ep = file->private_data;

        /* The lookup walks the tree, so it runs under ep->mtx. */
        mutex_lock(&ep->mtx);
        match = ep_find_tfd(ep, tfd, toff);
        found = match ? match->ffd.file : ERR_PTR(-ENOENT);
        mutex_unlock(&ep->mtx);

        return found;
}
#endif /* CONFIG_KCMP */

/*
 * Adds a new entry to the tail of the list in a lockless way, i.e.
 * multiple CPUs are allowed to call this function concurrently.
 *
 * Beware: it is necessary to prevent any other modifications of the
 *         existing list until all changes are completed, in other words
 *         concurrent list_add_tail_lockless() calls should be protected
 *         with a read lock, where write lock acts as a barrier which
 *         makes sure all list_add_tail_lockless() calls are fully
 *         completed.
 *
 *        Also an element can be locklessly added to the list only in one
 *        direction i.e. either to the tail or to the head, otherwise
 *        concurrent access will corrupt the list.
 *
 * @new:  element to add; it must currently be unlinked, i.e. its ->next
 *        points back at itself
 * @head: list to append to
 *
 * Return: %false if element has been already added to the list, %true
 * otherwise.
 */
static inline bool list_add_tail_lockless(struct list_head *new,
                                          struct list_head *head)
{
        struct list_head *prev;

        /*
         * This is simple 'new->next = head' operation, but cmpxchg()
         * is used in order to detect that same element has been just
         * added to the list from another CPU: the winner observes
         * new->next == new. Anyone else sees a different ->next, fails
         * the exchange and backs off.
         */
        if (!try_cmpxchg(&new->next, &new, head))
                return false;

        /*
         * Initially ->next of a new element must be updated with the head
         * (we are inserting to the tail) and only then pointers are atomically
         * exchanged.  XCHG guarantees memory ordering, thus ->next should be
         * updated before pointers are actually swapped and pointers are
         * swapped before prev->next is updated.
         */

        prev = xchg(&head->prev, new);

        /*
         * It is safe to modify prev->next and new->prev, because a new element
         * is added only to the tail and new->next is updated before XCHG.
         */

        prev->next = new;
        new->prev = prev;

        return true;
}

/*
 * Chains a new epi entry onto ep->ovflist in a lockless way, i.e. multiple
 * CPUs are allowed to call this function concurrently.
 *
 * The entry becomes the new value of ep->ovflist and links to the previous
 * value through ->next, so the most recently chained entry is always first
 * (ep_done_scan() treats the list as LIFO).
 *
 * An entry that is not chained has ->next == EP_UNACTIVE_PTR.
 *
 * Return: %false if epi element has been already chained, %true otherwise.
 */
static inline bool chain_epi_lockless(struct epitem *epi)
{
        struct eventpoll *ep = epi->ep;

        /* Fast preliminary check */
        if (epi->next != EP_UNACTIVE_PTR)
                return false;

        /*
         * Check that the same epi has not been just chained from another CPU:
         * only the caller that flips ->next from EP_UNACTIVE_PTR to NULL
         * goes on to chain the entry.
         */
        if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
                return false;

        /* Atomically install epi as the first entry, linking the old one */
        epi->next = xchg(&ep->ovflist, epi);

        return true;
}

/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * This callback takes a read lock in order not to contend with concurrent
 * events from another file descriptor, thus all modifications to ->rdllist
 * or ->ovflist are lockless.  Read lock is paired with the write lock from
 * ep_start/done_scan(), which stops all list modifications and guarantees
 * that lists state is seen correctly.
 *
 * Another thing worth to mention is that ep_poll_callback() can be called
 * concurrently for the same @epi from different CPUs if poll table was inited
 * with several wait queues entries.  Plural wakeup from different CPUs of a
 * single wait queue is serialized by wq.lock, but the case when multiple wait
 * queues are used should be detected accordingly.  This is detected using
 * cmpxchg() operation.
 *
 * @wait: wait queue entry embedded in the item's eppoll_entry
 * @mode: wakeup mode (not used here)
 * @sync: non-zero selects wake_up_sync() over wake_up() for ep->wq
 * @key:  poll event mask reported by the waker, possibly empty
 *
 * Returns 1 for items without EPOLLEXCLUSIVE. For EPOLLEXCLUSIVE items it
 * returns 1 only if ep->wq had waiters and the reported in/out bits match
 * the item's interest (or no in/out bit was reported), 0 otherwise.
 * NOTE(review): presumably the wait-queue core uses a non-zero return to
 * count this entry as a consumed exclusive wakeup - confirm against the
 * wait queue implementation.
 */
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        int pwake = 0;
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
        __poll_t pollflags = key_to_poll(key);
        unsigned long flags;
        int ewake = 0;

        read_lock_irqsave(&ep->lock, flags);

        ep_set_busy_poll_napi_id(epi);

        /*
         * If the event mask does not contain any poll(2) event, we consider the
         * descriptor to be disabled. This condition is likely the effect of the
         * EPOLLONESHOT bit that disables the descriptor when an event is received,
         * until the next EPOLL_CTL_MOD will be issued.
         */
        if (!(epi->event.events & ~EP_PRIVATE_BITS))
                goto out_unlock;

        /*
         * Check the events coming with the callback. At this stage, not
         * every device reports the events in the "key" parameter of the
         * callback. We need to be able to handle both cases here, hence the
         * test for a non-empty event mask before the event match test.
         */
        if (pollflags && !(pollflags & epi->event.events))
                goto out_unlock;

        /*
         * If we are transferring events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happen during that period of time are
         * chained in ep->ovflist and requeued later on.
         */
        if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
                if (chain_epi_lockless(epi))
                        ep_pm_stay_awake_rcu(epi);
        } else if (!ep_is_linked(epi)) {
                /* In the usual case, add event to ready list. */
                if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
                        ep_pm_stay_awake_rcu(epi);
        }

        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
         */
        if (waitqueue_active(&ep->wq)) {
                /*
                 * For an exclusive item, report the wakeup as taken only when
                 * the in/out bits carried by the event are ones the item asked
                 * for, or when the event carries neither of them. An event
                 * carrying both bits matches no case and leaves ewake at 0.
                 */
                if ((epi->event.events & EPOLLEXCLUSIVE) &&
                                        !(pollflags & POLLFREE)) {
                        switch (pollflags & EPOLLINOUT_BITS) {
                        case EPOLLIN:
                                if (epi->event.events & EPOLLIN)
                                        ewake = 1;
                                break;
                        case EPOLLOUT:
                                if (epi->event.events & EPOLLOUT)
                                        ewake = 1;
                                break;
                        case 0:
                                ewake = 1;
                                break;
                        }
                }
                if (sync)
                        wake_up_sync(&ep->wq);
                else
                        wake_up(&ep->wq);
        }
        /* Only note that ->poll() waiters exist; they are woken after unlock. */
        if (waitqueue_active(&ep->poll_wait))
                pwake++;

out_unlock:
        read_unlock_irqrestore(&ep->lock, flags);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE);

        /* Non-exclusive items always report the wakeup as handled. */
        if (!(epi->event.events & EPOLLEXCLUSIVE))
                ewake = 1;

        if (pollflags & POLLFREE) {
                /*
                 * If we race with ep_remove_wait_queue() it can miss
                 * ->whead = NULL and do another remove_wait_queue() after
                 * us, so we can't use __remove_wait_queue().
                 */
                list_del_init(&wait->entry);
                /*
                 * ->whead != NULL protects us from the race with
                 * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue()
                 * takes whead->lock held by the caller. Once we nullify it,
                 * nothing protects ep/epi or even wait.
                 */
                smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL);
        }

        return ewake;
}

/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * @file:  target file being polled (not used here)
 * @whead: wait queue head of the target to hook into
 * @pt:    poll table embedded in the caller's struct ep_pqueue
 *
 * A failed allocation is reported by clearing epq->epi; once that has
 * happened, further invocations for the same ep_pqueue do nothing.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                                 poll_table *pt)
{
        struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt);
        struct epitem *epi = epq->epi;
        struct eppoll_entry *pwq;

        if (unlikely(!epi))        // an earlier allocation has failed
                return;

        pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
        if (unlikely(!pwq)) {
                /* Tell ep_insert() that hooking the item failed. */
                epq->epi = NULL;
                return;
        }

        /* The entry is fully set up before it is queued on @whead. */
        init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
        pwq->whead = whead;
        pwq->base = epi;
        if (epi->event.events & EPOLLEXCLUSIVE)
                add_wait_queue_exclusive(whead, &pwq->wait);
        else
                add_wait_queue(whead, &pwq->wait);
        /* Push the new hook onto the item's singly linked list of hooks. */
        pwq->next = epi->pwqlist;
        epi->pwqlist = pwq;
}

/*
 * Links @epi into the eventpoll RB tree, ordered by ep_cmp_ffd() on the
 * items' file/descriptor keys. Keys that do not compare greater than an
 * existing node descend to the left. The cached leftmost node is updated
 * when the new item never had to turn right on its way down.
 */
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
        struct rb_node **link = &ep->rbr.rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        while (*link) {
                struct epitem *cur;

                parent = *link;
                cur = rb_entry(parent, struct epitem, rbn);

                if (ep_cmp_ffd(&epi->ffd, &cur->ffd) <= 0) {
                        link = &parent->rb_left;
                        continue;
                }

                link = &parent->rb_right;
                leftmost = false;
        }

        rb_link_node(&epi->rbn, parent, link);
        rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost);
}



#define PATH_ARR_SIZE 5
/*
 * These are the number paths of length 1 to 5, that we are allowing to emanate
 * from a single file of interest. For example, we allow 1000 paths of length
 * 1, to emanate from each file of interest. This essentially represents the
 * potential wakeup paths, which need to be limited in order to avoid massive
 * uncontrolled wakeup storms. The common use case should be a single ep which
 * is connected to n file sources. In this case each file source has 1 path
 * of length 1. Thus, the numbers below should be more than sufficient. These
 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
 * and delete can't add additional paths. Protected by the epnested_mutex.
 *
 * Both tables are indexed by nesting level, 0 .. PATH_ARR_SIZE - 1.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

/*
 * Accounts for one more wakeup path at nesting level @nests.
 *
 * Return: 0 if the path is allowed, -1 if the limit for that level has
 * been exceeded or if @nests lies outside the levels the tables cover.
 */
static int path_count_inc(int nests)
{
        /* Allow an arbitrary number of depth 1 paths */
        if (nests == 0)
                return 0;

        /*
         * The tables only have PATH_ARR_SIZE slots. Reject any other level
         * instead of indexing past them, so that a caller whose depth bound
         * ever disagrees with PATH_ARR_SIZE fails the check rather than
         * corrupting memory.
         */
        if (nests < 0 || nests >= PATH_ARR_SIZE)
                return -1;

        if (++path_count[nests] > path_limits[nests])
                return -1;
        return 0;
}

/* Resets the per-level path counters before a new check starts. */
static void path_count_init(void)
{
        int i;

        for (i = 0; i < PATH_ARR_SIZE; i++)
                path_count[i] = 0;
}

/*
 * Recursively counts the wakeup paths that lead upwards from the items
 * hooked on @refs. An item whose eventpoll is not itself watched ends a
 * path of length @depth; otherwise the walk continues one level up.
 *
 * Returns 0 if all paths stay within their limits, -1 if a limit is
 * exceeded or the nesting gets deeper than EP_MAX_NESTS.
 */
static int reverse_path_check_proc(struct hlist_head *refs, int depth)
{
        struct epitem *epi;
        int rc;

        if (depth > EP_MAX_NESTS) /* too deep nesting */
                return -1;

        /* CTL_DEL can remove links here, but that can't increase our count */
        hlist_for_each_entry_rcu(epi, refs, fllink) {
                struct hlist_head *parents = &epi->ep->refs;

                rc = hlist_empty(parents) ?
                        path_count_inc(depth) :
                        reverse_path_check_proc(parents, depth + 1);
                if (rc)
                        return rc;
        }

        return 0;
}

/**
 * reverse_path_check - The tfile_check_list is list of epitem_head, which have
 *                      links that are proposed to be newly added. We need to
 *                      make sure that those added links don't add too many
 *                      paths such that we will spend all our time waking up
 *                      eventpoll objects.
 *
 * The path counters are reset for every entry of the list, and each entry
 * is walked inside its own RCU read-side critical section.
 *
 * Return: %zero if the proposed links don't create too many paths,
 *            %-1 otherwise.
 */
static int reverse_path_check(void)
{
        struct epitems_head *cursor = tfile_check_list;
        int rc;

        while (cursor != EP_UNACTIVE_PTR) {
                path_count_init();

                rcu_read_lock();
                rc = reverse_path_check_proc(&cursor->epitems, 0);
                rcu_read_unlock();

                if (rc)
                        return rc;

                cursor = cursor->next;
        }

        return 0;
}

static int ep_create_wakeup_source(struct epitem *epi)
{
        struct name_snapshot n;
        struct wakeup_source *ws;

        if (!epi->ep->ws) {
                epi->ep->ws = wakeup_source_register(NULL, "eventpoll");
                if (!epi->ep->ws)
                        return -ENOMEM;
        }

        take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry);
        ws = wakeup_source_register(NULL, n.name.name);
        release_dentry_name_snapshot(&n);

        if (!ws)
                return -ENOMEM;
        rcu_assign_pointer(epi->ws, ws);

        return 0;
}

/*
 * Detaches and unregisters the wakeup source of @epi.
 *
 * rare code path, only used when EPOLL_CTL_MOD removes a wakeup source
 */
static noinline void ep_destroy_wakeup_source(struct epitem *epi)
{
        /* Remember the source before the item's pointer is cleared. */
        struct wakeup_source *ws = ep_wakeup_source(epi);

        RCU_INIT_POINTER(epi->ws, NULL);

        /*
         * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is
         * used internally by wakeup_source_remove, too (called by
         * wakeup_source_unregister), so we cannot use call_rcu
         */
        synchronize_rcu();
        wakeup_source_unregister(ws);
}

/*
 * Hooks @epi onto the list of epoll items watching @file (file->f_ep),
 * installing a list head first if the file has none.
 *
 * For an eventpoll file the head is the ep->refs member of its eventpoll;
 * for any other file a struct epitems_head is allocated on demand. The
 * allocation happens outside f_lock, so the state is re-checked once the
 * lock is held.
 *
 * Returns 0 on success, -ENOMEM if a list head could not be allocated.
 */
static int attach_epitem(struct file *file, struct epitem *epi)
{
        struct epitems_head *to_free = NULL;
        struct hlist_head *head = NULL;
        struct eventpoll *ep = NULL;

        if (is_file_epoll(file))
                ep = file->private_data;

        if (ep) {
                head = &ep->refs;
        } else if (!READ_ONCE(file->f_ep)) {
                /* Unlocked peek: no head seen, so prepare one up front. */
allocate:
                to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL);
                if (!to_free)
                        return -ENOMEM;
                head = &to_free->epitems;
        }
        spin_lock(&file->f_lock);
        if (!file->f_ep) {
                if (unlikely(!head)) {
                        /*
                         * A head was seen by the unlocked peek but is gone
                         * now; drop the lock and allocate one after all.
                         */
                        spin_unlock(&file->f_lock);
                        goto allocate;
                }
                /* See eventpoll_release() for details. */
                WRITE_ONCE(file->f_ep, head);
                /* The head is now owned by the file; do not free it below. */
                to_free = NULL;
        }
        hlist_add_head_rcu(&epi->fllink, file->f_ep);
        spin_unlock(&file->f_lock);
        /* Frees a head allocated above that turned out not to be needed. */
        free_ephead(to_free);
        return 0;
}

/*
 * Creates a new item for (@tfile, @fd) and registers it with @ep.
 * Must be called with "mtx" held.
 *
 * @ep:         eventpoll instance receiving the item
 * @event:      interest mask and user data, copied into the item
 * @tfile:      target file to watch
 * @fd:         descriptor number the target was registered under
 * @full_check: non-zero to also run the reverse path check
 *
 * Returns 0 on success, -ENOSPC if the user's watch limit is reached,
 * -ENOMEM on allocation failure, -EINVAL if the reverse path check fails,
 * or the error from ep_create_wakeup_source().
 */
static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
                     struct file *tfile, int fd, int full_check)
{
        int error, pwake = 0;
        __poll_t revents;
        struct epitem *epi;
        struct ep_pqueue epq;
        struct eventpoll *tep = NULL;

        /* tep is set only when the target is itself an eventpoll file. */
        if (is_file_epoll(tfile))
                tep = tfile->private_data;

        lockdep_assert_irqs_enabled();

        /* Charge the watch to the user up front; undone on early failure. */
        if (unlikely(percpu_counter_compare(&ep->user->epoll_watches,
                                            max_user_watches) >= 0))
                return -ENOSPC;
        percpu_counter_inc(&ep->user->epoll_watches);

        if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) {
                percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }

        /* Item initialization follow here ... */
        INIT_LIST_HEAD(&epi->rdllink);
        epi->ep = ep;
        ep_set_ffd(&epi->ffd, tfile, fd);
        epi->event = *event;
        epi->next = EP_UNACTIVE_PTR;

        if (tep)
                mutex_lock_nested(&tep->mtx, 1);
        /* Add the current item to the list of active epoll hook for this file */
        if (unlikely(attach_epitem(tfile, epi) < 0)) {
                if (tep)
                        mutex_unlock(&tep->mtx);
                kmem_cache_free(epi_cache, epi);
                percpu_counter_dec(&ep->user->epoll_watches);
                return -ENOMEM;
        }

        if (full_check && !tep)
                list_file(tfile);

        /*
         * Add the current item to the RB tree. All RB tree operations are
         * protected by "mtx", and ep_insert() is called with "mtx" held.
         */
        ep_rbtree_insert(ep, epi);
        if (tep)
                mutex_unlock(&tep->mtx);

        /*
         * ep_remove_safe() calls in the later error paths can't lead to
         * ep_free() as the ep file itself still holds an ep reference.
         */
        ep_get(ep);

        /*
         * From here on the item is fully linked, so every error path goes
         * through ep_remove_safe(), which also undoes the watch accounting.
         */

        /* now check if we've created too many backpaths */
        if (unlikely(full_check && reverse_path_check())) {
                ep_remove_safe(ep, epi);
                return -EINVAL;
        }

        if (epi->event.events & EPOLLWAKEUP) {
                error = ep_create_wakeup_source(epi);
                if (error) {
                        ep_remove_safe(ep, epi);
                        return error;
                }
        }

        /* Initialize the poll table using the queue callback */
        epq.epi = epi;
        init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

        /*
         * Attach the item to the poll hooks and get current event bits.
         * We can safely use the file* here because its usage count has
         * been increased by the caller of this function. Note that after
         * this operation completes, the poll callback can start hitting
         * the new item.
         */
        revents = ep_item_poll(epi, &epq.pt, 1);

        /*
         * We have to check if something went wrong during the poll wait queue
         * install process. Namely an allocation for a wait queue failed due
         * high memory pressure. ep_ptable_queue_proc() signals that by
         * clearing epq.epi.
         */
        if (unlikely(!epq.epi)) {
                ep_remove_safe(ep, epi);
                return -ENOMEM;
        }

        /* We have to drop the new item inside our item list to keep track of it */
        write_lock_irq(&ep->lock);

        /* record NAPI ID of new item if present */
        ep_set_busy_poll_napi_id(epi);

        /* If the file is already "ready" we drop it inside the ready list */
        if (revents && !ep_is_linked(epi)) {
                list_add_tail(&epi->rdllink, &ep->rdllist);
                ep_pm_stay_awake(epi);

                /* Notify waiting tasks that events are available */
                if (waitqueue_active(&ep->wq))
                        wake_up(&ep->wq);
                if (waitqueue_active(&ep->poll_wait))
                        pwake++;
        }

        write_unlock_irq(&ep->lock);

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 *
 * @ep:    eventpoll instance whose ready list and wait queues are used.
 * @epi:   item whose event mask and user data are replaced.
 * @event: new settings; ->events and ->data are copied into @epi->event.
 *
 * After the update the item is polled once; if it reports events and is not
 * already linked on the ready list it is appended to ep->rdllist and any
 * waiters on ep->wq / ep->poll_wait are woken.
 *
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
                     const struct epoll_event *event)
{
        int pwake = 0;
        poll_table pt;

        lockdep_assert_irqs_enabled();

        /* No queue callback is passed here, unlike the insert path. */
        init_poll_funcptr(&pt, NULL);

        /*
         * Set the new event interest mask before calling f_op->poll();
         * otherwise we might miss an event that happens between the
         * f_op->poll() call and the new event set registering.
         */
        epi->event.events = event->events; /* need barrier below */
        epi->event.data = event->data; /* protected by mtx */
        /* Keep the wakeup source in sync with the new EPOLLWAKEUP setting. */
        if (epi->event.events & EPOLLWAKEUP) {
                if (!ep_has_wakeup_source(epi))
                        ep_create_wakeup_source(epi);
        } else if (ep_has_wakeup_source(epi)) {
                ep_destroy_wakeup_source(epi);
        }

        /*
         * The following barrier has two effects:
         *
         * 1) Flush epi changes above to other CPUs.  This ensures
         *    we do not miss events from ep_poll_callback if an
         *    event occurs immediately after we call f_op->poll().
         *    We need this because we did not take ep->lock while
         *    changing epi above (but ep_poll_callback does take
         *    ep->lock).
         *
         * 2) We also need to ensure we do not miss _past_ events
         *    when calling f_op->poll().  This barrier also
         *    pairs with the barrier in wq_has_sleeper (see
         *    comments for wq_has_sleeper).
         *
         * This barrier will now guarantee ep_poll_callback or f_op->poll
         * (or both) will notice the readiness of an item.
         */
        smp_mb();

        /*
         * Get current event bits. We can safely use the file* here because
         * its usage count has been increased by the caller of this function.
         * If the item is "hot" and it is not registered inside the ready
         * list, push it inside.
         */
        if (ep_item_poll(epi, &pt, 1)) {
                write_lock_irq(&ep->lock);
                if (!ep_is_linked(epi)) {
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);

                        /* Notify waiting tasks that events are available */
                        if (waitqueue_active(&ep->wq))
                                wake_up(&ep->wq);
                        /* Only note it here; the wakeup is issued after unlock. */
                        if (waitqueue_active(&ep->poll_wait))
                                pwake++;
                }
                write_unlock_irq(&ep->lock);
        }

        /* We have to call this outside the lock */
        if (pwake)
                ep_poll_safewake(ep, NULL, 0);

        return 0;
}

/*
 * Copy ready events to the userspace buffer @events.
 *
 * Runs with ep->mtx held around an ep_start_scan()/ep_done_scan() pair and
 * works on the private list "txlist". Each item is unlinked, re-polled, and,
 * if it still reports events, written out with epoll_put_uevent(). At most
 * @maxevents events are written.
 *
 * Returns the number of events written, -EFAULT if the very first write to
 * userspace failed, or -EINTR if a fatal signal is pending on entry.
 */
static int ep_send_events(struct eventpoll *ep,
                          struct epoll_event __user *events, int maxevents)
{
        struct epitem *epi, *tmp;
        LIST_HEAD(txlist);
        poll_table pt;
        int res = 0;

        /*
         * Always short-circuit for fatal signals to allow threads to make a
         * timely exit without the chance of finding more events available and
         * fetching repeatedly.
         */
        if (fatal_signal_pending(current))
                return -EINTR;

        init_poll_funcptr(&pt, NULL);

        mutex_lock(&ep->mtx);
        ep_start_scan(ep, &txlist);

        /*
         * We can loop without lock because we are passed a task private list.
         * Items cannot vanish during the loop we are holding ep->mtx.
         */
        list_for_each_entry_safe(epi, tmp, &txlist, rdllink) {
                struct wakeup_source *ws;
                __poll_t revents;

                if (res >= maxevents)
                        break;

                /*
                 * Activate ep->ws before deactivating epi->ws to prevent
                 * triggering auto-suspend here (in case we reactivate epi->ws
                 * below).
                 *
                 * This could be rearranged to delay the deactivation of epi->ws
                 * instead, but then epi->ws would temporarily be out of sync
                 * with ep_is_linked().
                 */
                ws = ep_wakeup_source(epi);
                if (ws) {
                        if (ws->active)
                                __pm_stay_awake(ep->ws);
                        __pm_relax(ws);
                }

                list_del_init(&epi->rdllink);

                /*
                 * If the event mask intersect the caller-requested one,
                 * deliver the event to userspace. Again, we are holding ep->mtx,
                 * so no operations coming from userspace can change the item.
                 */
                revents = ep_item_poll(epi, &pt, 1);
                if (!revents)
                        continue;

                events = epoll_put_uevent(revents, epi->event.data, events);
                if (!events) {
                        /*
                         * The write to userspace failed: put the item back at
                         * the head of txlist so it is not lost, and report
                         * -EFAULT only if nothing was delivered before it.
                         */
                        list_add(&epi->rdllink, &txlist);
                        ep_pm_stay_awake(epi);
                        if (!res)
                                res = -EFAULT;
                        break;
                }
                res++;
                /* One-shot items keep only the EP_PRIVATE_BITS after delivery. */
                if (epi->event.events & EPOLLONESHOT)
                        epi->event.events &= EP_PRIVATE_BITS;
                else if (!(epi->event.events & EPOLLET)) {
                        /*
                         * If this file has been added with Level
                         * Trigger mode, we need to insert back inside
                         * the ready list, so that the next call to
                         * epoll_wait() will check again the events
                         * availability. At this point, no one can insert
                         * into ep->rdllist besides us. The epoll_ctl()
                         * callers are locked out by
                         * ep_send_events() holding "mtx" and the
                         * poll callback will queue them in ep->ovflist.
                         */
                        list_add_tail(&epi->rdllink, &ep->rdllist);
                        ep_pm_stay_awake(epi);
                }
        }
        ep_done_scan(ep, &txlist);
        mutex_unlock(&ep->mtx);

        return res;
}

static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms)
{
        struct timespec64 now;

        if (ms < 0)
                return NULL;

        if (!ms) {
                to->tv_sec = 0;
                to->tv_nsec = 0;
                return to;
        }

        to->tv_sec = ms / MSEC_PER_SEC;
        to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC);

        ktime_get_ts64(&now);
        *to = timespec64_add_safe(now, *to);
        return to;
}

/*
 * Wake callback installed on the wait entry used by ep_poll().
 *
 * Behaves like autoremove_wake_function, except that the entry is removed
 * even when the wakeup itself fails: default_wake_function/ttwu only fails
 * when the thread is already woken, and in that case the ep_poll loop drops
 * the entry anyway instead of reusing it.
 *
 * Returns whatever default_wake_function() returned.
 */
static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
                                       unsigned int mode, int sync, void *key)
{
        const int woken = default_wake_function(wq_entry, mode, sync, key);

        /*
         * Counterpart of the list_empty_careful() check in ep_poll; makes
         * sure later loop iterations observe what caused this wakeup.
         */
        list_del_init_careful(&wq_entry->entry);

        return woken;
}

/*
 * Attempt to hand ready events to user space.
 *
 * Returns the result of ep_send_events() unchanged. When at least one event
 * was delivered, ep_suspend_napi_irqs() is called on @ep first. A result of
 * 0 lets the caller decide whether to retry while timeout remains.
 */
static int ep_try_send_events(struct eventpoll *ep,
                              struct epoll_event __user *events, int maxevents)
{
        const int nr_sent = ep_send_events(ep, events, maxevents);

        if (nr_sent > 0)
                ep_suspend_napi_irqs(ep);

        return nr_sent;
}

/*
 * Decide whether going to sleep still makes sense for the deadline @to.
 *
 * Returns 1 when there is no deadline (@to is NULL); otherwise returns the
 * result of comparing *@to against the current ktime_get() value, i.e.
 * non-zero only while the deadline lies after "now".
 */
static int ep_schedule_timeout(ktime_t *to)
{
        if (!to)
                return 1;

        return ktime_after(*to, ktime_get());
}

/**
 * ep_poll - Retrieves ready events, and delivers them to the caller-supplied
 *           event buffer.
 *
 * @ep: Pointer to the eventpoll context.
 * @events: Pointer to the userspace buffer where the ready events should be
 *          stored.
 * @maxevents: Size (in terms of number of events) of the caller event buffer.
 * @timeout: Maximum timeout for the ready events fetch operation, in
 *           timespec. If the timeout is zero, the function will not block,
 *           while if the @timeout ptr is NULL, the function will block
 *           until at least one event has been retrieved (or an error
 *           occurred).
 *
 * A non-zero @timeout is treated as an absolute deadline. If that deadline
 * has already passed by the time the task would go to sleep, the wait is
 * reported as timed out rather than retried.
 *
 * Return: the number of ready events which have been fetched, or an
 *          error code, in case of error.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                   int maxevents, struct timespec64 *timeout)
{
        int res, eavail, timed_out = 0;
        u64 slack = 0;
        wait_queue_entry_t wait;
        ktime_t expires, *to = NULL;

        lockdep_assert_irqs_enabled();

        if (timeout && (timeout->tv_sec | timeout->tv_nsec)) {
                slack = select_estimate_accuracy(timeout);
                to = &expires;
                *to = timespec64_to_ktime(*timeout);
        } else if (timeout) {
                /*
                 * Avoid the unnecessary trip to the wait queue loop, if the
                 * caller specified a non blocking operation.
                 */
                timed_out = 1;
        }

        /*
         * This call is racy: We may or may not see events that are being added
         * to the ready list under the lock (e.g., in IRQ callbacks). For cases
         * with a non-zero timeout, this thread will check the ready list under
         * lock and will add to the wait queue.  For cases with a zero
         * timeout, the user by definition should not care and will have to
         * recheck again.
         */
        eavail = ep_events_available(ep);

        while (1) {
                if (eavail) {
                        res = ep_try_send_events(ep, events, maxevents);
                        if (res)
                                return res;
                }

                if (timed_out)
                        return 0;

                eavail = ep_busy_loop(ep);
                if (eavail)
                        continue;

                if (signal_pending(current))
                        return -EINTR;

                /*
                 * Internally init_wait() uses autoremove_wake_function(),
                 * thus wait entry is removed from the wait queue on each
                 * wakeup. Why it is important? In case of several waiters
                 * each new wakeup will hit the next waiter, giving it the
                 * chance to harvest new event. Otherwise wakeup can be
                 * lost. This is also good performance-wise, because on
                 * normal wakeup path no need to call __remove_wait_queue()
                 * explicitly, thus ep->lock is not taken, which halts the
                 * event delivery.
                 *
                 * In fact, we now use an even more aggressive function that
                 * unconditionally removes, because we don't reuse the wait
                 * entry between loop iterations. This lets us also avoid the
                 * performance issue if a process is killed, causing all of its
                 * threads to wake up without being removed normally.
                 */
                init_wait(&wait);
                wait.func = ep_autoremove_wake_function;

                write_lock_irq(&ep->lock);
                /*
                 * Barrierless variant, waitqueue_active() is called under
                 * the same lock on wakeup ep_poll_callback() side, so it
                 * is safe to avoid an explicit barrier.
                 */
                __set_current_state(TASK_INTERRUPTIBLE);

                /*
                 * Do the final check under the lock. ep_start/done_scan()
                 * plays with two lists (->rdllist and ->ovflist) and there
                 * is always a race when both lists are empty for short
                 * period of time although events are pending, so lock is
                 * important.
                 */
                eavail = ep_events_available(ep);
                if (!eavail)
                        __add_wait_queue_exclusive(&ep->wq, &wait);

                write_unlock_irq(&ep->lock);

                /*
                 * A deadline that is already in the past must count as a
                 * timeout. If the sleep were merely skipped, timed_out would
                 * stay 0, eavail would be forced to 1 below, and the loop
                 * would spin here until an event or a signal showed up.
                 */
                if (!eavail)
                        timed_out = !ep_schedule_timeout(to) ||
                                !schedule_hrtimeout_range(to, slack,
                                                          HRTIMER_MODE_ABS);
                __set_current_state(TASK_RUNNING);

                /*
                 * We were woken up, thus go and try to harvest some events.
                 * If timed out and still on the wait queue, recheck eavail
                 * carefully under lock, below.
                 */
                eavail = 1;

                if (!list_empty_careful(&wait.entry)) {
                        write_lock_irq(&ep->lock);
                        /*
                         * If the thread timed out and is not on the wait queue,
                         * it means that the thread was woken up after its
                         * timeout expired before it could reacquire the lock.
                         * Thus, when wait.entry is empty, it needs to harvest
                         * events.
                         */
                        if (timed_out)
                                eavail = list_empty(&wait.entry);
                        __remove_wait_queue(&ep->wq, &wait);
                        write_unlock_irq(&ep->lock);
                }
        }
}

/**
 * ep_loop_check_proc - verify that adding an epoll file inside another
 *                      epoll structure does not violate the constraints, in
 *                      terms of closed loops, or too deep chains (which can
 *                      result in excessive stack usage).
 *
 * @ep: the &struct eventpoll to be currently checked.
 * @depth: Current depth of the path being checked.
 *
 * Return: %zero if adding the epoll @file inside current epoll
 *          structure @ep does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check_proc(struct eventpoll *ep, int depth)
{
        int error = 0;
        struct rb_node *rbp;
        struct epitem *epi;

        mutex_lock_nested(&ep->mtx, depth + 1);
        ep->gen = loop_check_gen;
        for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) {
                epi = rb_entry(rbp, struct epitem, rbn);
                if (unlikely(is_file_epoll(epi->ffd.file))) {
                        struct eventpoll *ep_tovisit;
                        ep_tovisit = epi->ffd.file->private_data;
                        if (ep_tovisit->gen == loop_check_gen)
                                continue;
                        if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS)
                                error = -1;
                        else
                                error = ep_loop_check_proc(ep_tovisit, depth + 1);
                        if (error != 0)
                                break;
                } else {
                        /*
                         * If we've reached a file that is not associated with
                         * an ep, then we need to check if the newly added
                         * links are going to add too many wakeup paths. We do
                         * this by adding it to the tfile_check_list, if it's
                         * not already there, and calling reverse_path_check()
                         * during ep_insert().
                         */
                        list_file(epi->ffd.file);
                }
        }
        mutex_unlock(&ep->mtx);

        return error;
}

/**
 * ep_loop_check - Performs a check to verify that adding an epoll file (@to)
 *                 into another epoll file (represented by @ep) does not create
 *                 closed loops or too deep chains.
 *
 * @ep: Pointer to the epoll we are inserting into.
 * @to: Pointer to the epoll to be inserted.
 *
 * @ep is recorded in the file-level 'inserting_into' variable, which
 * ep_loop_check_proc() compares against while walking @to at depth 0.
 *
 * Return: %zero if adding the epoll @to inside the epoll @ep
 * does not violate the constraints, or %-1 otherwise.
 */
static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to)
{
        inserting_into = ep;
        return ep_loop_check_proc(to, 0);
}

/*
 * Empty the tfile_check_list: pop entries from the front, one at a time,
 * until the EP_UNACTIVE_PTR terminator is reached, passing each popped head
 * to unlist_file(). The whole walk runs inside an RCU read-side section.
 */
static void clear_tfile_check_list(void)
{
        struct epitems_head *head;

        rcu_read_lock();
        for (head = tfile_check_list; head != EP_UNACTIVE_PTR;
             head = tfile_check_list) {
                /* Advance the list before releasing the entry. */
                tfile_check_list = head->next;
                unlist_file(head);
        }
        rcu_read_unlock();
}

/*
 * Open an eventpoll file descriptor.
 *
 * @flags: only EPOLL_CLOEXEC is accepted; anything else yields -EINVAL.
 *
 * Returns the new file descriptor, or a negative error code from
 * ep_alloc(), get_unused_fd_flags() or anon_inode_getfile().
 */
static int do_epoll_create(int flags)
{
        struct eventpoll *ep = NULL;
        struct file *file;
        int oflags, error, fd;

        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

        if (flags & ~EPOLL_CLOEXEC)
                return -EINVAL;

        /* Set up the internal "struct eventpoll" first. */
        error = ep_alloc(&ep);
        if (error < 0)
                return error;

        /*
         * Then everything needed to expose it: a free file descriptor and
         * a file structure, both opened with the same flags.
         */
        oflags = O_RDWR | (flags & O_CLOEXEC);

        fd = get_unused_fd_flags(oflags);
        if (fd < 0) {
                error = fd;
                goto out_free_ep;
        }

        file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, oflags);
        if (IS_ERR(file)) {
                error = PTR_ERR(file);
                goto out_free_fd;
        }

        ep->file = file;
        fd_install(fd, file);
        return fd;

out_free_fd:
        put_unused_fd(fd);
out_free_ep:
        ep_clear_and_put(ep);
        return error;
}

/*
 * epoll_create1() system call: forwards @flags unchanged to
 * do_epoll_create() and returns its result.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
        return do_epoll_create(flags);
}

/*
 * epoll_create() system call. @size is only validated (it must be
 * positive, otherwise -EINVAL is returned) and is not used beyond that;
 * the instance is created with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
        return size > 0 ? do_epoll_create(0) : -EINVAL;
}

#ifdef CONFIG_PM_SLEEP
/*
 * Strip EPOLLWAKEUP from @epev unless the caller has CAP_BLOCK_SUSPEND.
 * The capability is only queried when the flag is actually requested.
 */
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
        if (!(epev->events & EPOLLWAKEUP))
                return;

        if (!capable(CAP_BLOCK_SUSPEND))
                epev->events &= ~EPOLLWAKEUP;
}
#else
/* Without CONFIG_PM_SLEEP the EPOLLWAKEUP flag is always cleared. */
static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev)
{
        epev->events &= ~EPOLLWAKEUP;
}
#endif

/*
 * Acquire @mutex, optionally without blocking.
 *
 * With @nonblock set only a trylock is attempted and -EAGAIN is returned
 * when it fails. Otherwise the mutex is taken with mutex_lock_nested()
 * using @depth as the nesting subclass. Returns 0 once the mutex is held.
 */
static inline int epoll_mutex_lock(struct mutex *mutex, int depth,
                                   bool nonblock)
{
        if (nonblock)
                return mutex_trylock(mutex) ? 0 : -EAGAIN;

        mutex_lock_nested(mutex, depth);
        return 0;
}

/*
 * Add, remove or modify the entry for @fd in the interest set of the epoll
 * instance referred to by @epfd.
 *
 * @epfd:     file descriptor of the epoll instance.
 * @op:       EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD.
 * @fd:       target file descriptor.
 * @epds:     event description; only accessed when ep_op_has_event(@op) is
 *            true or in the ADD/MOD cases. May be modified (EPOLLWAKEUP
 *            stripped, EPOLLERR | EPOLLHUP added).
 * @nonblock: when true, mutexes are only try-locked and -EAGAIN is returned
 *            if one of them is contended.
 *
 * Returns 0 on success or a negative error: -EBADF (bad @epfd or @fd),
 * -EPERM (target cannot be polled), -EINVAL (@epfd is not an epoll file,
 * @epfd and @fd are the same file, invalid EPOLLEXCLUSIVE usage, unknown
 * @op, or EPOLL_CTL_MOD on an EPOLLEXCLUSIVE item), -ELOOP (loop check
 * failed), -EAGAIN (see @nonblock), -EEXIST (ADD of an existing item),
 * -ENOENT (DEL/MOD of a missing item), or whatever ep_insert() /
 * ep_modify() return.
 */
int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
                 bool nonblock)
{
        int error;
        int full_check = 0;
        struct eventpoll *ep;
        struct epitem *epi;
        struct eventpoll *tep = NULL;

        CLASS(fd, f)(epfd);
        if (fd_empty(f))
                return -EBADF;

        /* Get the "struct file *" for the target file */
        CLASS(fd, tf)(fd);
        if (fd_empty(tf))
                return -EBADF;

        /* The target file descriptor must support poll */
        if (!file_can_poll(fd_file(tf)))
                return -EPERM;

        /* Check if EPOLLWAKEUP is allowed */
        if (ep_op_has_event(op))
                ep_take_care_of_epollwakeup(epds);

        /*
         * We have to check that the file structure underneath the file descriptor
         * the user passed to us _is_ an eventpoll file. And also we do not permit
         * adding an epoll file descriptor inside itself.
         */
        error = -EINVAL;
        if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f)))
                goto error_tgt_fput;

        /*
         * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
         * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
         * Also, we do not currently support nested exclusive wakeups.
         */
        if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) {
                if (op == EPOLL_CTL_MOD)
                        goto error_tgt_fput;
                if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) ||
                                (epds->events & ~EPOLLEXCLUSIVE_OK_BITS)))
                        goto error_tgt_fput;
        }

        /*
         * At this point it is safe to assume that the "private_data" contains
         * our own data structure.
         */
        ep = fd_file(f)->private_data;

        /*
         * When we insert an epoll file descriptor inside another epoll file
         * descriptor, there is the chance of creating closed loops, which are
         * better be handled here, than in more critical paths. While we are
         * checking for loops we also determine the list of files reachable
         * and hang them on the tfile_check_list, so we can check that we
         * haven't created too many possible wakeup paths.
         *
         * We do not need to take the global 'epnested_mutex' on EPOLL_CTL_ADD
         * when the epoll file descriptor is attaching directly to a wakeup
         * source, unless the epoll file descriptor is nested. The purpose of
         * taking the 'epnested_mutex' on add is to prevent complex topologies
         * such as loops and deep wakeup paths from forming in parallel through
         * multiple EPOLL_CTL_ADD operations.
         */
        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
        if (error)
                goto error_tgt_fput;
        if (op == EPOLL_CTL_ADD) {
                if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen ||
                    is_file_epoll(fd_file(tf))) {
                        /*
                         * Full check needed: drop ep->mtx, take the global
                         * mutex, then re-take ep->mtx below.
                         */
                        mutex_unlock(&ep->mtx);
                        error = epoll_mutex_lock(&epnested_mutex, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                        loop_check_gen++;
                        /* From here on the exit path must release epnested_mutex. */
                        full_check = 1;
                        if (is_file_epoll(fd_file(tf))) {
                                tep = fd_file(tf)->private_data;
                                error = -ELOOP;
                                if (ep_loop_check(ep, tep) != 0)
                                        goto error_tgt_fput;
                        }
                        error = epoll_mutex_lock(&ep->mtx, 0, nonblock);
                        if (error)
                                goto error_tgt_fput;
                }
        }

        /*
         * Try to lookup the file inside our RB tree. Since we grabbed "mtx"
         * above, we can be sure to be able to use the item looked up by
         * ep_find() till we release the mutex.
         */
        epi = ep_find(ep, fd_file(tf), fd);

        /* Default result for an unknown op, and for MOD on an exclusive item. */
        error = -EINVAL;
        switch (op) {
        case EPOLL_CTL_ADD:
                if (!epi) {
                        epds->events |= EPOLLERR | EPOLLHUP;
                        error = ep_insert(ep, epds, fd_file(tf), fd, full_check);
                } else
                        error = -EEXIST;
                break;
        case EPOLL_CTL_DEL:
                if (epi) {
                        /*
                         * The eventpoll itself is still alive: the refcount
                         * can't go to zero here.
                         */
                        ep_remove_safe(ep, epi);
                        error = 0;
                } else {
                        error = -ENOENT;
                }
                break;
        case EPOLL_CTL_MOD:
                if (epi) {
                        if (!(epi->event.events & EPOLLEXCLUSIVE)) {
                                epds->events |= EPOLLERR | EPOLLHUP;
                                error = ep_modify(ep, epi, epds);
                        }
                } else
                        error = -ENOENT;
                break;
        }
        mutex_unlock(&ep->mtx);

error_tgt_fput:
        /* Undo the full-check state set up on the EPOLL_CTL_ADD path. */
        if (full_check) {
                clear_tfile_check_list();
                loop_check_gen++;
                mutex_unlock(&epnested_mutex);
        }
        return error;
}

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set.
 *
 * The user-supplied event is copied in only when ep_op_has_event(op) is
 * true; a failed copy returns -EFAULT. For other ops 'epds' is handed to
 * do_epoll_ctl() uninitialized. NOTE(review): this relies on
 * ep_op_has_event() being true for every op that makes do_epoll_ctl()
 * read 'epds' (ADD and MOD) - confirm against its definition.
 * The call is always made in blocking mode (nonblock == false).
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
                struct epoll_event __user *, event)
{
        struct epoll_event epds;

        if (ep_op_has_event(op) &&
            copy_from_user(&epds, event, sizeof(struct epoll_event)))
                return -EFAULT;

        return do_epoll_ctl(epfd, op, fd, &epds, false);
}

/*
 * Validate the arguments shared by the event-retrieval entry points.
 *
 * The checks run in this order and the first failure decides the result:
 *   - @maxevents must be in the range 1..EP_MAX_EVENTS (-EINVAL);
 *   - the user area @evs must pass access_ok() for @maxevents entries
 *     (-EFAULT);
 *   - @file must be an eventpoll file (-EINVAL).
 *
 * Returns 0 when all checks pass.
 */
static int ep_check_params(struct file *file, struct epoll_event __user *evs,
                           int maxevents)
{
        int err = 0;

        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
                err = -EINVAL;
        else if (!access_ok(evs, maxevents * sizeof(struct epoll_event)))
                err = -EFAULT;
        else if (!is_file_epoll(file))
                err = -EINVAL;

        return err;
}

/*
 * Non-blocking event retrieval for an already-resolved eventpoll @file.
 *
 * Returns the error from ep_check_params() if validation fails, 0 when no
 * events appear to be available, and otherwise the result of
 * ep_try_send_events().
 */
int epoll_sendevents(struct file *file, struct epoll_event __user *events,
                     int maxevents)
{
        struct eventpoll *ep;
        int err = ep_check_params(file, events, maxevents);

        if (unlikely(err))
                return err;

        ep = file->private_data;

        /*
         * The availability test is racy, which is acceptable: the caller is
         * expected to retry based on poll readiness anyway.
         */
        if (!ep_events_available(ep))
                return 0;

        return ep_try_send_events(ep, events, maxevents);
}

/*
 * Kernel side of the user space epoll_wait(2): resolve @epfd, validate the
 * arguments and wait for events.
 *
 * Returns -EBADF if @epfd does not refer to an open file, the error from
 * ep_check_params() if validation fails, and otherwise the result of
 * ep_poll().
 */
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
                         int maxevents, struct timespec64 *to)
{
        int err;

        /* Look up the "struct file *" behind the epoll descriptor. */
        CLASS(fd, f)(epfd);
        if (fd_empty(f))
                return -EBADF;

        err = ep_check_params(fd_file(f), events, maxevents);
        if (unlikely(err))
                return err;

        /*
         * The checks above established that this is an eventpoll file, so
         * "private_data" holds our own data structure. Go fish for events.
         */
        return ep_poll(fd_file(f)->private_data, events, maxevents, to);
}

/*
 * epoll_wait() system call. @timeout is in milliseconds and is converted
 * by ep_timeout_to_timespec(): a negative value becomes a NULL timeout
 * (block until an event or error), zero means do not block.
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout)
{
        struct timespec64 to;

        return do_epoll_wait(epfd, events, maxevents,
                             ep_timeout_to_timespec(&to, timeout));
}

/*
 * Kernel side of the user space epoll_pwait(2): same as do_epoll_wait(),
 * but with the caller-supplied signal mask applied around the wait.
 *
 * Returns the error from set_user_sigmask() if the mask cannot be applied,
 * otherwise the result of do_epoll_wait().
 */
static int do_epoll_pwait(int epfd, struct epoll_event __user *events,
                          int maxevents, struct timespec64 *to,
                          const sigset_t __user *sigmask, size_t sigsetsize)
{
        /* Apply the signal mask requested for the duration of the wait. */
        int ret = set_user_sigmask(sigmask, sigsetsize);

        if (ret)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, to);

        /* The -EINTR case is flagged to the restore helper. */
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

/*
 * epoll_pwait() system call: like epoll_wait() (millisecond @timeout,
 * converted by ep_timeout_to_timespec()), with @sigmask / @sigsetsize
 * passed on to do_epoll_pwait().
 */
SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
                int, maxevents, int, timeout, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 to;

        return do_epoll_pwait(epfd, events, maxevents,
                              ep_timeout_to_timespec(&to, timeout),
                              sigmask, sigsetsize);
}

/*
 * epoll_pwait2() system call: like epoll_pwait(), but the timeout is a
 * user-space struct __kernel_timespec. A NULL @timeout is passed on as a
 * NULL timeout. Returns -EFAULT if the timespec cannot be read and -EINVAL
 * if poll_select_set_timeout() rejects it.
 */
SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events,
                int, maxevents, const struct __kernel_timespec __user *, timeout,
                const sigset_t __user *, sigmask, size_t, sigsetsize)
{
        struct timespec64 ts;

        if (!timeout)
                return do_epoll_pwait(epfd, events, maxevents, NULL,
                                      sigmask, sigsetsize);

        if (get_timespec64(&ts, timeout))
                return -EFAULT;

        /* Rewrites 'ts' in place from its own seconds/nanoseconds. */
        if (poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec))
                return -EINVAL;

        return do_epoll_pwait(epfd, events, maxevents, &ts,
                              sigmask, sigsetsize);
}

#ifdef CONFIG_COMPAT
/*
 * Compat counterpart of do_epoll_pwait(): identical flow, but the signal
 * mask is a compat_sigset_t applied with set_compat_user_sigmask().
 *
 * Returns the error from set_compat_user_sigmask() if the mask cannot be
 * applied, otherwise the result of do_epoll_wait().
 */
static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events,
                                 int maxevents, struct timespec64 *timeout,
                                 const compat_sigset_t __user *sigmask,
                                 compat_size_t sigsetsize)
{
        /* Apply the signal mask requested for the duration of the wait. */
        long ret = set_compat_user_sigmask(sigmask, sigsetsize);

        if (ret)
                return ret;

        ret = do_epoll_wait(epfd, events, maxevents, timeout);

        /* The -EINTR case is flagged to the restore helper. */
        restore_saved_sigmask_unless(ret == -EINTR);

        return ret;
}

/*
 * Compat epoll_pwait() system call: millisecond @timeout converted by
 * ep_timeout_to_timespec(), compat signal mask passed on to
 * do_compat_epoll_pwait().
 */
COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
                       struct epoll_event __user *, events,
                       int, maxevents, int, timeout,
                       const compat_sigset_t __user *, sigmask,
                       compat_size_t, sigsetsize)
{
        struct timespec64 to;

        return do_compat_epoll_pwait(epfd, events, maxevents,
                                     ep_timeout_to_timespec(&to, timeout),
                                     sigmask, sigsetsize);
}

/*
 * Compat epoll_pwait2() system call: the timeout is a user-space
 * struct __kernel_timespec, the signal mask a compat_sigset_t. A NULL
 * @timeout is passed on as a NULL timeout. Returns -EFAULT if the timespec
 * cannot be read and -EINVAL if poll_select_set_timeout() rejects it.
 */
COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd,
                       struct epoll_event __user *, events,
                       int, maxevents,
                       const struct __kernel_timespec __user *, timeout,
                       const compat_sigset_t __user *, sigmask,
                       compat_size_t, sigsetsize)
{
        struct timespec64 ts;

        if (!timeout)
                return do_compat_epoll_pwait(epfd, events, maxevents, NULL,
                                             sigmask, sigsetsize);

        if (get_timespec64(&ts, timeout))
                return -EFAULT;

        /* Rewrites 'ts' in place from its own seconds/nanoseconds. */
        if (poll_select_set_timeout(&ts, ts.tv_sec, ts.tv_nsec))
                return -EINVAL;

        return do_compat_epoll_pwait(epfd, events, maxevents, &ts,
                                     sigmask, sigsetsize);
}

#endif

/*
 * Initcall for epoll: computes max_user_watches from the memory figures
 * reported by si_meminfo(), creates the slab caches used for epitem,
 * eppoll_entry and epitems_head objects, and calls epoll_sysctls_init().
 * Always returns 0.
 */
static int __init eventpoll_init(void)
{
        struct sysinfo si;

        si_meminfo(&si);
        /*
         * Allows top 4% of lomem to be allocated for epoll watches (per user).
         * (totalram - totalhigh) / 25 is that 4%; the shift by PAGE_SHIFT
         * presumably converts a page count to bytes, and dividing by
         * EP_ITEM_COST turns bytes into a number of watches.
         */
        max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
                EP_ITEM_COST;
        /* NOTE(review): presumably catches the computation going negative. */
        BUG_ON(max_user_watches < 0);

        /*
         * We can have many thousands of epitems, so prevent this from
         * using an extra cache line on 64-bit (and smaller) CPUs
         */
        BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

        /* Allocates slab cache used to allocate "struct epitem" items */
        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        /* Allocates slab cache used to allocate "struct eppoll_entry" */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
                sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);
        epoll_sysctls_init();

        /* Allocates slab cache used to allocate "struct epitems_head" */
        ephead_cache = kmem_cache_create("ep_head",
                sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

        return 0;
}
fs_initcall(eventpoll_init);


























    8 























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Definitions for diskquota-operations. When diskquota is configured these
 * macros expand to the right source-code.
 *
 * Author:  Marco van Wieringen <mvw@planets.elm.net>
 */
#ifndef _LINUX_QUOTAOPS_
#define _LINUX_QUOTAOPS_

#include <linux/fs.h>

/*
 * Bits for the 'flags' argument of __dquot_alloc_space() and
 * __dquot_free_space().  In the !CONFIG_QUOTA stubs in this header only
 * DQUOT_SPACE_RESERVE has an effect: it suppresses the update of the inode's
 * byte count.
 */
#define DQUOT_SPACE_WARN        0x1
#define DQUOT_SPACE_RESERVE        0x2
#define DQUOT_SPACE_NOFAIL        0x4

/* Return a pointer to the quota_info embedded in @sb (its s_dquot member). */
static inline struct quota_info *sb_dqopt(struct super_block *sb)
{
        struct quota_info *dqopt = &sb->s_dquot;

        return dqopt;
}

/* i_mutex must being held */
static inline bool is_quota_modification(struct mnt_idmap *idmap,
                                         struct inode *inode, struct iattr *ia)
{
        return ((ia->ia_valid & ATTR_SIZE) ||
                i_uid_needs_update(idmap, ia, inode) ||
                i_gid_needs_update(idmap, ia, inode));
}

#if defined(CONFIG_QUOTA)

/*
 * Convenience wrapper around __quota_error() that passes the calling
 * function's name (__func__) automatically; @fmt and the optional @args are
 * a printf-style format and its arguments.
 */
#define quota_error(sb, fmt, args...) \
        __quota_error((sb), __func__, fmt , ## args)

extern __printf(3, 4)
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...);

/*
 * declaration of quota_function calls in kernel.
 */
int dquot_initialize(struct inode *inode);
bool dquot_initialize_needed(struct inode *inode);
void dquot_drop(struct inode *inode);
struct dquot *dqget(struct super_block *sb, struct kqid qid);
/*
 * Take one more reference on @dquot and return it.
 *
 * Warns (once each) if dq_count reads as zero or if DQ_ACTIVE_B is not set
 * in dq_flags; the reference count is incremented regardless.
 */
static inline struct dquot *dqgrab(struct dquot *dquot)
{
        /* Make sure someone else has active reference to dquot */
        WARN_ON_ONCE(!atomic_read(&dquot->dq_count));
        WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
        atomic_inc(&dquot->dq_count);
        return dquot;
}

/*
 * A dquot counts as busy when its DQ_MOD_B flag is set or when its reference
 * count is above zero.
 */
static inline bool dquot_is_busy(struct dquot *dquot)
{
        return test_bit(DQ_MOD_B, &dquot->dq_flags) ||
               atomic_read(&dquot->dq_count) > 0;
}

void dqput(struct dquot *dquot);
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv);
struct dquot *dquot_alloc(struct super_block *sb, int type);
void dquot_destroy(struct dquot *dquot);

int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags);
void __dquot_free_space(struct inode *inode, qsize_t number, int flags);

int dquot_alloc_inode(struct inode *inode);

void dquot_claim_space_nodirty(struct inode *inode, qsize_t number);
void dquot_free_inode(struct inode *inode);
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number);

int dquot_disable(struct super_block *sb, int type, unsigned int flags);
/*
 * Suspend quotas on remount RO: calls dquot_disable() with the
 * DQUOT_SUSPENDED flag and returns its result.
 */
static inline int dquot_suspend(struct super_block *sb, int type)
{
        int ret = dquot_disable(sb, type, DQUOT_SUSPENDED);

        return ret;
}
int dquot_resume(struct super_block *sb, int type);

int dquot_commit(struct dquot *dquot);
int dquot_acquire(struct dquot *dquot);
int dquot_release(struct dquot *dquot);
int dquot_commit_info(struct super_block *sb, int type);
int dquot_get_next_id(struct super_block *sb, struct kqid *qid);
int dquot_mark_dquot_dirty(struct dquot *dquot);

int dquot_file_open(struct inode *inode, struct file *file);

int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags);
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags);
int dquot_quota_on(struct super_block *sb, int type, int format_id,
        const struct path *path);
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
         int format_id, int type);
int dquot_quota_off(struct super_block *sb, int type);
int dquot_writeback_dquots(struct super_block *sb, int type);
int dquot_quota_sync(struct super_block *sb, int type);
int dquot_get_state(struct super_block *sb, struct qc_state *state);
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii);
int dquot_get_dqblk(struct super_block *sb, struct kqid id,
                struct qc_dqblk *di);
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id,
                struct qc_dqblk *di);
int dquot_set_dqblk(struct super_block *sb, struct kqid id,
                struct qc_dqblk *di);

int __dquot_transfer(struct inode *inode, struct dquot **transfer_to);
int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode,
                   struct iattr *iattr);

static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->info + type;
}

/*
 * Functions for checking status of quota
 */

static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_USAGE_ENABLED, type);
}

static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_LIMITS_ENABLED, type);
}

static inline bool sb_has_quota_suspended(struct super_block *sb, int type)
{
        return sb_dqopt(sb)->flags &
                                dquot_state_flag(DQUOT_SUSPENDED, type);
}

static inline unsigned sb_any_quota_suspended(struct super_block *sb)
{
        return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED);
}

/* Does kernel know about any quota information for given sb + type? */
static inline bool sb_has_quota_loaded(struct super_block *sb, int type)
{
        /* Currently if anything is on, then quota usage is on as well */
        bool usage_on = sb_has_quota_usage_enabled(sb, type);

        return usage_on;
}

static inline unsigned sb_any_quota_loaded(struct super_block *sb)
{
        return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED);
}

static inline bool sb_has_quota_active(struct super_block *sb, int type)
{
        return sb_has_quota_loaded(sb, type) &&
               !sb_has_quota_suspended(sb, type);
}

/*
 * Operations supported for diskquotas.
 */
extern const struct dquot_operations dquot_operations;
extern const struct quotactl_ops dquot_quotactl_sysfile_ops;

#else

/*
 * !CONFIG_QUOTA stubs for the quota state queries: each one ignores its
 * arguments and returns the constant 0.
 */
static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type)
{
        return 0;
}

static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type)
{
        return 0;
}

static inline int sb_has_quota_suspended(struct super_block *sb, int type)
{
        return 0;
}

static inline int sb_any_quota_suspended(struct super_block *sb)
{
        return 0;
}

/* Does kernel know about any quota information for given sb + type? */
static inline int sb_has_quota_loaded(struct super_block *sb, int type)
{
        return 0;
}

static inline int sb_any_quota_loaded(struct super_block *sb)
{
        return 0;
}

static inline int sb_has_quota_active(struct super_block *sb, int type)
{
        return 0;
}

/*
 * !CONFIG_QUOTA stubs for the per-inode quota operations: they do nothing,
 * and those with a return value report 0 / false.
 */
static inline int dquot_initialize(struct inode *inode)
{
        return 0;
}

static inline bool dquot_initialize_needed(struct inode *inode)
{
        return false;
}

static inline void dquot_drop(struct inode *inode)
{
}

static inline int dquot_alloc_inode(struct inode *inode)
{
        return 0;
}

static inline void dquot_free_inode(struct inode *inode)
{
}

static inline int dquot_transfer(struct mnt_idmap *idmap,
                                 struct inode *inode, struct iattr *iattr)
{
        return 0;
}

/*
 * !CONFIG_QUOTA stub: no quota is charged.  Unless the caller asked for a
 * reservation only (DQUOT_SPACE_RESERVE), @number is added to the inode's
 * byte count.  Always returns 0.
 */
static inline int __dquot_alloc_space(struct inode *inode, qsize_t number,
                int flags)
{
        if (flags & DQUOT_SPACE_RESERVE)
                return 0;

        inode_add_bytes(inode, number);
        return 0;
}

/*
 * !CONFIG_QUOTA stub: unless only a reservation is being released
 * (DQUOT_SPACE_RESERVE), @number is subtracted from the inode's byte count.
 */
static inline void __dquot_free_space(struct inode *inode, qsize_t number,
                int flags)
{
        if (flags & DQUOT_SPACE_RESERVE)
                return;

        inode_sub_bytes(inode, number);
}

/* !CONFIG_QUOTA stub: just adds @number to the inode's byte count. */
static inline void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        inode_add_bytes(inode, number);
}

/*
 * !CONFIG_QUOTA stub: just subtracts @number from the inode's byte count.
 *
 * Returns nothing, so the stub has the same signature as the CONFIG_QUOTA
 * prototype of this function and as the dquot_claim_space_nodirty() stub;
 * callers therefore see one interface regardless of the configuration.
 */
static inline void dquot_reclaim_space_nodirty(struct inode *inode,
                                               qsize_t number)
{
        inode_sub_bytes(inode, number);
}

/*
 * !CONFIG_QUOTA stubs for enabling/disabling and writing back quota: every
 * function ignores its arguments and returns 0.
 */
static inline int dquot_disable(struct super_block *sb, int type,
                unsigned int flags)
{
        return 0;
}

static inline int dquot_suspend(struct super_block *sb, int type)
{
        return 0;
}

static inline int dquot_resume(struct super_block *sb, int type)
{
        return 0;
}

/* Without quota support, dquot_file_open is an alias for generic_file_open. */
#define dquot_file_open                generic_file_open

static inline int dquot_writeback_dquots(struct super_block *sb, int type)
{
        return 0;
}

#endif /* CONFIG_QUOTA */

/*
 * Allocate @nr bytes for @inode via __dquot_alloc_space() with
 * DQUOT_SPACE_WARN; the inode is not marked dirty here.
 */
static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr)
{
        int ret = __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN);

        return ret;
}

/*
 * Allocate @nr bytes for @inode with DQUOT_SPACE_NOFAIL (plus
 * DQUOT_SPACE_WARN), ignore the result, and mark the inode dirty (sync).
 */
static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr)
{
        const int flags = DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL;

        __dquot_alloc_space(inode, nr, flags);
        mark_inode_dirty_sync(inode);
}

/*
 * Allocate @nr bytes for @inode and, on success, mark the inode dirty.
 * Returns the result of dquot_alloc_space_nodirty().
 */
static inline int dquot_alloc_space(struct inode *inode, qsize_t nr)
{
        int err = dquot_alloc_space_nodirty(inode, nr);

        if (err)
                return err;

        /*
         * Mark inode fully dirty. Since we are allocating blocks, inode
         * would become fully dirty soon anyway and it reportedly
         * reduces lock contention.
         */
        mark_inode_dirty(inode);
        return 0;
}

/* Block-unit form of dquot_alloc_space_nodirty(): @nr blocks of @inode. */
static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return dquot_alloc_space_nodirty(inode, bytes);
}

/* Block-unit form of dquot_alloc_space_nofail(): @nr blocks of @inode. */
static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_alloc_space_nofail(inode, bytes);
}

/* Block-unit form of dquot_alloc_space(): @nr blocks of @inode. */
static inline int dquot_alloc_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return dquot_alloc_space(inode, bytes);
}

/*
 * Preallocate @nr blocks for @inode: calls __dquot_alloc_space() with no
 * flags set and does not mark the inode dirty.
 */
static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return __dquot_alloc_space(inode, bytes, 0);
}

/*
 * Preallocate @nr blocks for @inode and, on success, mark the inode dirty
 * (sync).  Returns the result of dquot_prealloc_block_nodirty().
 */
static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr)
{
        int err = dquot_prealloc_block_nodirty(inode, nr);

        if (err)
                return err;

        mark_inode_dirty_sync(inode);
        return 0;
}

/*
 * Reserve @nr blocks for @inode: __dquot_alloc_space() with
 * DQUOT_SPACE_WARN and DQUOT_SPACE_RESERVE.
 */
static inline int dquot_reserve_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        return __dquot_alloc_space(inode, bytes,
                                   DQUOT_SPACE_WARN | DQUOT_SPACE_RESERVE);
}

/*
 * Claim @nr blocks for @inode via dquot_claim_space_nodirty(), then mark
 * the inode dirty (sync).
 */
static inline void dquot_claim_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_claim_space_nodirty(inode, bytes);
        mark_inode_dirty_sync(inode);
}

/*
 * Reclaim @nr blocks of @inode via dquot_reclaim_space_nodirty(), then mark
 * the inode dirty (sync).
 */
static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_reclaim_space_nodirty(inode, bytes);
        mark_inode_dirty_sync(inode);
}

/*
 * Free @nr bytes of @inode via __dquot_free_space() with no flags set; the
 * inode is not marked dirty here.
 */
static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr)
{
        const int no_flags = 0;

        __dquot_free_space(inode, nr, no_flags);
}

/* Free @nr bytes of @inode, then mark the inode dirty (sync). */
static inline void dquot_free_space(struct inode *inode, qsize_t nr)
{
        /* Same call as dquot_free_space_nodirty(): no flags set. */
        __dquot_free_space(inode, nr, 0);
        mark_inode_dirty_sync(inode);
}

/* Block-unit form of dquot_free_space_nodirty(): @nr blocks of @inode. */
static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_free_space_nodirty(inode, bytes);
}

/* Block-unit form of dquot_free_space(): @nr blocks of @inode. */
static inline void dquot_free_block(struct inode *inode, qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        dquot_free_space(inode, bytes);
}

/*
 * Release a reservation of @nr blocks on @inode: __dquot_free_space() with
 * DQUOT_SPACE_RESERVE.
 */
static inline void dquot_release_reservation_block(struct inode *inode,
                qsize_t nr)
{
        qsize_t bytes = nr << inode->i_blkbits;

        __dquot_free_space(inode, bytes, DQUOT_SPACE_RESERVE);
}

unsigned int qtype_enforce_flag(int type);

#endif /* _LINUX_QUOTAOPS_ */






























































    5 

















































  255 










  255 
















































  255 










  255 

















   43 

    5 





    5 




























































































































   26 




   26 






















































    8 
    8 


























    8 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  

  linux/include/linux/rbtree.h

  To use rbtrees you'll have to implement your own insert and search cores.
  This avoids the use of callbacks, which would dramatically hurt performance.
  I know it's not the cleanest way, but in C (not in C++) that is the price
  of getting both performance and genericity...

  See Documentation/core-api/rbtree.rst for documentation and samples.
*/

#ifndef        _LINUX_RBTREE_H
#define        _LINUX_RBTREE_H

#include <linux/container_of.h>
#include <linux/rbtree_types.h>

#include <linux/stddef.h>
#include <linux/rcupdate.h>

/*
 * Parent pointer of node @r: __rb_parent_color with its two low bits masked
 * off, cast back to a node pointer.
 */
#define rb_parent(r)   ((struct rb_node *)((r)->__rb_parent_color & ~3))

/* Map an rb_node pointer back to the structure of @type that embeds it. */
#define        rb_entry(ptr, type, member) container_of(ptr, type, member)

/* True when the tree has no root node (the root pointer is read once). */
#define RB_EMPTY_ROOT(root)  (READ_ONCE((root)->rb_node) == NULL)

/*
 * 'empty' nodes are nodes that are known not to be inserted in an rbtree.
 * The marker is a __rb_parent_color that holds the node's own address:
 * RB_CLEAR_NODE() stores it, RB_EMPTY_NODE() tests for it.
 */
#define RB_EMPTY_NODE(node)  \
        ((node)->__rb_parent_color == (unsigned long)(node))
#define RB_CLEAR_NODE(node)  \
        ((node)->__rb_parent_color = (unsigned long)(node))


extern void rb_insert_color(struct rb_node *, struct rb_root *);
extern void rb_erase(struct rb_node *, struct rb_root *);


/* Find logical next and previous nodes in a tree */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);

/* Postorder iteration - always visit the parent after its children */
extern struct rb_node *rb_first_postorder(const struct rb_root *);
extern struct rb_node *rb_next_postorder(const struct rb_node *);

/* Fast replacement of a single node without remove/rebalance/add/rebalance */
extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
                            struct rb_root *root);
extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new,
                                struct rb_root *root);

/*
 * Attach @node as a new leaf: record @parent in it, clear both child
 * pointers, then store the node into the slot @rb_link.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
                                struct rb_node **rb_link)
{
        node->__rb_parent_color = (unsigned long)parent;
        node->rb_right = NULL;
        node->rb_left = NULL;

        *rb_link = node;
}

/*
 * Same as rb_link_node(), except that the node is stored into the slot
 * @rb_link with rcu_assign_pointer() after its fields have been set.
 */
static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent,
                                    struct rb_node **rb_link)
{
        node->__rb_parent_color = (unsigned long)parent;
        node->rb_right = NULL;
        node->rb_left = NULL;

        rcu_assign_pointer(*rb_link, node);
}

/*
 * NULL-tolerant rb_entry(): @ptr is evaluated once; the result is NULL when
 * @ptr is NULL and the containing @type object otherwise.
 */
#define rb_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? rb_entry(____ptr, type, member) : NULL; \
        })

/**
 * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of
 * given type allowing the backing memory of @pos to be invalidated
 *
 * @pos:        the 'type *' to use as a loop cursor.
 * @n:                another 'type *' to use as temporary storage
 * @root:        'rb_root *' of the rbtree.
 * @field:        the name of the rb_node field within 'type'.
 *
 * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as
 * list_for_each_entry_safe() and allows the iteration to continue independent
 * of changes to @pos by the body of the loop.  This works because the
 * successor is stored in @n by the loop condition, before the body runs.
 *
 * Note, however, that it cannot handle other modifications that re-order the
 * rbtree it is iterating over. This includes calling rb_erase() on @pos, as
 * rb_erase() may rebalance the tree, causing us to miss some nodes.
 */
#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \
        for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \
             pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \
                        typeof(*pos), field); 1; }); \
             pos = n)

/*
 * Same as rb_first(), but O(1): simply reads the rb_leftmost pointer kept
 * in the rb_root_cached.
 */
#define rb_first_cached(root) (root)->rb_leftmost

/*
 * rb_insert_color() for a leftmost-cached tree.  When the caller reports
 * that @node is the new leftmost node, the cached pointer is updated before
 * the insertion is completed on the embedded rb_root.
 */
static inline void rb_insert_color_cached(struct rb_node *node,
                                          struct rb_root_cached *root,
                                          bool leftmost)
{
        struct rb_root *plain_root = &root->rb_root;

        if (leftmost)
                root->rb_leftmost = node;

        rb_insert_color(node, plain_root);
}


/*
 * rb_erase() for a leftmost-cached tree.
 *
 * If @node is the cached leftmost node, the cache is moved to rb_next(@node)
 * before the erase, and that new leftmost node (possibly NULL) is returned.
 * Otherwise the cache is left alone and NULL is returned.
 */
static inline struct rb_node *
rb_erase_cached(struct rb_node *node, struct rb_root_cached *root)
{
        struct rb_node *new_leftmost = NULL;

        if (root->rb_leftmost == node) {
                new_leftmost = rb_next(node);
                root->rb_leftmost = new_leftmost;
        }

        rb_erase(node, &root->rb_root);

        return new_leftmost;
}

/*
 * rb_replace_node() for a leftmost-cached tree: if @victim is the cached
 * leftmost node, the cache is pointed at @new before the replacement.
 */
static inline void rb_replace_node_cached(struct rb_node *victim,
                                          struct rb_node *new,
                                          struct rb_root_cached *root)
{
        struct rb_root *plain_root = &root->rb_root;

        if (root->rb_leftmost == victim)
                root->rb_leftmost = new;

        rb_replace_node(victim, new, plain_root);
}

/*
 * The below helper functions use 2 operators with 3 different
 * calling conventions. The operators are related like:
 *
 *        comp(a->key,b) < 0  := less(a,b)
 *        comp(a->key,b) > 0  := less(b,a)
 *        comp(a->key,b) == 0 := !less(a,b) && !less(b,a)
 *
 * If these operators define a partial order on the elements we make no
 * guarantee on which of the elements matching the key is found. See
 * rb_find().
 *
 * The reason for this is to allow the find() interface without requiring an
 * on-stack dummy object, which might not be feasible due to object size.
 */

/**
 * rb_add_cached() - insert @node into the leftmost cached tree @tree
 * @node: node to insert
 * @tree: leftmost cached tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Descends from the root, going left while @less(@node, current) holds and
 * right otherwise, then links @node into the empty slot that was reached.
 *
 * Returns @node when it is the new leftmost, or NULL.
 */
static __always_inline struct rb_node *
rb_add_cached(struct rb_node *node, struct rb_root_cached *tree,
              bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool went_right = false;

        while (*slot != NULL) {
                parent = *slot;
                if (!less(node, parent)) {
                        /* Any right turn means @node is not the leftmost. */
                        went_right = true;
                        slot = &parent->rb_right;
                        continue;
                }
                slot = &parent->rb_left;
        }

        rb_link_node(node, parent, slot);
        rb_insert_color_cached(node, tree, !went_right);

        return went_right ? NULL : node;
}

/**
 * rb_add() - insert @node into @tree
 * @node: node to insert
 * @tree: tree to insert @node into
 * @less: operator defining the (partial) node order
 *
 * Descends from the root, going left while @less(@node, current) holds and
 * right otherwise, then links @node into the empty slot that was reached.
 */
static __always_inline void
rb_add(struct rb_node *node, struct rb_root *tree,
       bool (*less)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *parent = NULL;

        while (*slot != NULL) {
                parent = *slot;
                slot = less(node, parent) ? &parent->rb_left
                                          : &parent->rb_right;
        }

        rb_link_node(node, parent, slot);
        rb_insert_color(node, tree);
}

/**
 * rb_find_add_cached() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * The leftmost cache of @tree is updated when @node is inserted without ever
 * descending to the right.
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add_cached(struct rb_node *node, struct rb_root_cached *tree,
            int (*cmp)(const struct rb_node *new, const struct rb_node *exist))
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool is_leftmost = true;

        while (*slot) {
                int order;

                parent = *slot;
                order = cmp(node, parent);

                /* An equivalent node already exists: hand it back. */
                if (order == 0)
                        return parent;

                if (order > 0) {
                        is_leftmost = false;
                        slot = &parent->rb_right;
                } else {
                        slot = &parent->rb_left;
                }
        }

        rb_link_node(node, parent, slot);
        rb_insert_color_cached(node, tree, is_leftmost);
        return NULL;
}

/**
 * rb_find_add() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add(struct rb_node *node, struct rb_root *tree,
            int (*cmp)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *parent = NULL;

        while (*slot) {
                int order;

                parent = *slot;
                order = cmp(node, parent);

                /* An equivalent node already exists: hand it back. */
                if (order == 0)
                        return parent;

                if (order > 0)
                        slot = &parent->rb_right;
                else
                        slot = &parent->rb_left;
        }

        rb_link_node(node, parent, slot);
        rb_insert_color(node, tree);
        return NULL;
}

/**
 * rb_find_add_rcu() - find equivalent @node in @tree, or add @node
 * @node: node to look-for / insert
 * @tree: tree to search / modify
 * @cmp: operator defining the node order
 *
 * Adds a Store-Release for link_node: the node is linked in with
 * rb_link_node_rcu() instead of rb_link_node().
 *
 * Returns the rb_node matching @node, or NULL when no match is found and @node
 * is inserted.
 */
static __always_inline struct rb_node *
rb_find_add_rcu(struct rb_node *node, struct rb_root *tree,
                int (*cmp)(struct rb_node *, const struct rb_node *))
{
        struct rb_node **slot = &tree->rb_node;
        struct rb_node *parent = NULL;

        while (*slot) {
                int order;

                parent = *slot;
                order = cmp(node, parent);

                /* An equivalent node already exists: hand it back. */
                if (order == 0)
                        return parent;

                if (order > 0)
                        slot = &parent->rb_right;
                else
                        slot = &parent->rb_left;
        }

        rb_link_node_rcu(node, parent, slot);
        rb_insert_color(node, tree);
        return NULL;
}

/**
 * rb_find() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * Descends left when @cmp(@key, node) is negative and right when it is
 * positive.
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find(const void *key, const struct rb_root *tree,
        int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur;

        for (cur = tree->rb_node; cur; ) {
                int order = cmp(key, cur);

                if (order == 0)
                        return cur;

                cur = (order < 0) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/**
 * rb_find_rcu() - find @key in tree @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining the node order
 *
 * Child pointers are loaded with rcu_dereference_raw() during the descent.
 *
 * Notably, tree descent vs concurrent tree rotations is unsound and can result
 * in false-negatives.
 *
 * Returns the rb_node matching @key or NULL.
 */
static __always_inline struct rb_node *
rb_find_rcu(const void *key, const struct rb_root *tree,
            int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur;

        for (cur = tree->rb_node; cur; ) {
                int order = cmp(key, cur);

                if (order == 0)
                        return cur;

                if (order < 0)
                        cur = rcu_dereference_raw(cur->rb_left);
                else
                        cur = rcu_dereference_raw(cur->rb_right);
        }

        return NULL;
}

/**
 * rb_find_first() - find the first @key in @tree
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * On a match the search remembers the node and keeps descending to the left,
 * so the last match recorded is the leftmost one.
 *
 * Returns the leftmost node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_find_first(const void *key, const struct rb_root *tree,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *cur = tree->rb_node;
        struct rb_node *leftmost_match = NULL;

        while (cur) {
                int order = cmp(key, cur);

                if (order > 0) {
                        cur = cur->rb_right;
                        continue;
                }

                if (order == 0)
                        leftmost_match = cur;
                cur = cur->rb_left;
        }

        return leftmost_match;
}

/**
 * rb_next_match() - find the next @key in the tree
 * @key: key to match
 * @node: node to continue from
 * @cmp: operator defining node order
 *
 * Looks at rb_next(@node) only: it is returned when @cmp(@key, next) is 0.
 *
 * Returns the next node matching @key, or NULL.
 */
static __always_inline struct rb_node *
rb_next_match(const void *key, struct rb_node *node,
              int (*cmp)(const void *key, const struct rb_node *))
{
        struct rb_node *succ = rb_next(node);

        if (!succ || cmp(key, succ) != 0)
                return NULL;

        return succ;
}

/**
 * rb_for_each() - iterates a subtree matching @key
 * @node: iterator
 * @key: key to match
 * @tree: tree to search
 * @cmp: operator defining node order
 *
 * Starts at rb_find_first() and advances with rb_next_match() until that
 * returns NULL.  @key and @cmp appear in both the init and the step
 * expression, so they are evaluated more than once.
 */
#define rb_for_each(node, key, tree, cmp) \
        for ((node) = rb_find_first((key), (tree), (cmp)); \
             (node); (node) = rb_next_match((key), (node), (cmp)))

#endif        /* _LINUX_RBTREE_H */















































































































































  246 




























































  247 























   26 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MM_PERCPU_INTERNAL_H
#define _MM_PERCPU_INTERNAL_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/memcontrol.h>

/*
 * pcpu_block_md is the metadata block struct.
 * Each chunk's bitmap is split into a number of full blocks.
 * All units are in terms of bits.
 *
 * The scan hint is the largest known contiguous area before the contig hint.
 * It is not necessarily the actual largest contig hint though.  There is an
 * invariant that the scan_hint_start > contig_hint_start iff
 * scan_hint == contig_hint.  This is necessary because when scanning forward,
 * we don't know if a new contig hint would be better than the current one.
 *
 * struct pcpu_chunk uses this type twice: embedded as chunk_md and through
 * the md_blocks pointer.
 */
struct pcpu_block_md {
        int                        scan_hint;        /* scan hint for block */
        int                        scan_hint_start; /* block relative starting
                                                    position of the scan hint */
        int                     contig_hint;    /* contig hint for block */
        int                     contig_hint_start; /* block relative starting
                                                      position of the contig hint */
        int                     left_free;      /* size of free space along
                                                   the left side of the block */
        int                     right_free;     /* size of free space along
                                                   the right side of the block */
        int                     first_free;     /* block position of first free */
        int                        nr_bits;        /* total bits this block is
                                                   responsible for */
};

/*
 * pcpuobj_ext is the per-object extension record referenced by
 * pcpu_chunk.obj_exts.  Each member only exists when the matching config
 * option is enabled, so the struct is empty when neither is set.
 */
struct pcpuobj_ext {
#ifdef CONFIG_MEMCG
        struct obj_cgroup        *cgroup;        /* only with CONFIG_MEMCG */
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        union codetag_ref        tag;                /* only with CONFIG_MEM_ALLOC_PROFILING */
#endif
};

/*
 * NEED_PCPUOBJ_EXT is defined when at least one of the options that add a
 * member to struct pcpuobj_ext is enabled; it guards pcpu_chunk.obj_exts.
 */
#if defined(CONFIG_MEMCG) || defined(CONFIG_MEM_ALLOC_PROFILING)
#define NEED_PCPUOBJ_EXT
#endif

/*
 * pcpu_chunk describes one percpu chunk: its free-space accounting, its
 * allocation/boundary bitmaps, the per-block metadata and the bitmap of
 * populated pages (trailing flexible array).
 */
struct pcpu_chunk {
#ifdef CONFIG_PERCPU_STATS
        int                        nr_alloc;        /* # of allocations */
        size_t                        max_alloc_size; /* largest allocation size */
#endif

        struct list_head        list;                /* linked to pcpu_slot lists */
        int                        free_bytes;        /* free bytes in the chunk */
        struct pcpu_block_md        chunk_md;        /* block metadata kept for the
                                                   chunk itself */
        unsigned long                *bound_map;        /* boundary map */

        /*
         * base_addr is the base address of this chunk.
         * To reduce false sharing, the current layout is optimized to make
         * sure base_addr is located in a different cacheline than
         * free_bytes and chunk_md.
         */
        void                        *base_addr ____cacheline_aligned_in_smp;

        unsigned long                *alloc_map;        /* allocation map */
        struct pcpu_block_md        *md_blocks;        /* metadata blocks */

        void                        *data;                /* chunk data */
        bool                        immutable;        /* no [de]population allowed */
        bool                        isolated;        /* isolated from active chunk
                                                   slots */
        int                        start_offset;        /* the overlap with the previous
                                                   region to have a page aligned
                                                   base_addr */
        int                        end_offset;        /* additional area required to
                                                   have the region end page
                                                   aligned */
#ifdef NEED_PCPUOBJ_EXT
        struct pcpuobj_ext        *obj_exts;        /* vector of per-object
                                                   extension records */
#endif

        int                        nr_pages;        /* # of pages served by this chunk */
        int                        nr_populated;        /* # of populated pages */
        int                     nr_empty_pop_pages; /* # of empty populated pages */
        unsigned long                populated[];        /* populated bitmap */
};

/*
 * need_pcpuobj_ext - tell whether per-object extension data is wanted
 *
 * Returns true when CONFIG_MEM_ALLOC_PROFILING is enabled or when
 * mem_cgroup_kmem_disabled() reports false, and false otherwise.
 */
static inline bool need_pcpuobj_ext(void)
{
        return IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING) ||
               !mem_cgroup_kmem_disabled();
}

/*
 * Allocator state that is declared here and defined elsewhere.
 */

/* lock the pcpu_stats_* helpers in this header take or expect to be held */
extern spinlock_t pcpu_lock;

/* NOTE(review): presumably the array of chunk slot lists and the slot
 * indices/counters describing it - confirm against the definitions.
 */
extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_sidelined_slot;
extern int pcpu_to_depopulate_slot;
extern int pcpu_nr_empty_pop_pages;

extern struct pcpu_chunk *pcpu_first_chunk;
extern struct pcpu_chunk *pcpu_reserved_chunk;

/**
 * pcpu_chunk_nr_blocks - number of md_blocks needed for a chunk
 * @chunk: chunk of interest
 *
 * Translates the count of physical pages served by @chunk into the count
 * of bitmap blocks that cover them.
 */
static inline int pcpu_chunk_nr_blocks(struct pcpu_chunk *chunk)
{
        const int nr_pages = chunk->nr_pages;

        return nr_pages * PAGE_SIZE / PCPU_BITMAP_BLOCK_SIZE;
}

/**
 * pcpu_nr_pages_to_map_bits - bitmap size for a number of pages
 * @pages: number of physical pages
 *
 * Returns how many bitmap bits are needed to describe @pages physical
 * pages, one bit per PCPU_MIN_ALLOC_SIZE bytes.
 */
static inline int pcpu_nr_pages_to_map_bits(int pages)
{
        int map_bits = pages * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;

        return map_bits;
}

/**
 * pcpu_chunk_map_bits - bitmap size of a chunk
 * @chunk: chunk of interest
 *
 * Convenience wrapper: feeds the number of physical pages served by
 * @chunk into pcpu_nr_pages_to_map_bits().
 */
static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
{
        const int nr_pages = chunk->nr_pages;

        return pcpu_nr_pages_to_map_bits(nr_pages);
}

/**
 * pcpu_obj_full_size - size to charge for one accounted object
 * @size: size of area to allocate in bytes
 *
 * The base cost is @size for every possible CPU.  With CONFIG_MEMCG and
 * kmem accounting not disabled, the space of one obj_cgroup pointer per
 * PCPU_MIN_ALLOC_SIZE unit of @size is charged on top.
 */
static inline size_t pcpu_obj_full_size(size_t size)
{
        size_t full_size = size * num_possible_cpus();

#ifdef CONFIG_MEMCG
        if (!mem_cgroup_kmem_disabled())
                full_size += size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
#endif

        return full_size;
}

#ifdef CONFIG_PERCPU_STATS

#include <linux/spinlock.h>

/*
 * percpu_stats holds the allocator-wide statistics that the pcpu_stats_*
 * helpers in this header update.
 */
struct percpu_stats {
        u64 nr_alloc;                /* lifetime # of allocations */
        u64 nr_dealloc;                /* lifetime # of deallocations */
        u64 nr_cur_alloc;        /* current # of allocations */
        u64 nr_max_alloc;        /* max # of live allocations */
        u32 nr_chunks;                /* current # of live chunks */
        u32 nr_max_chunks;        /* max # of live chunks */
        size_t min_alloc_size;        /* min allocation size */
        size_t max_alloc_size;        /* max allocation size */
};

/* the single statistics instance and the saved copy of the alloc info */
extern struct percpu_stats pcpu_stats;
extern struct pcpu_alloc_info pcpu_stats_ai;

/*
 * pcpu_stats_save_ai - keep a copy of the allocation info for debugging
 * @ai: allocation info to copy
 *
 * Only the fixed part of the struct is copied; the flexible array is
 * deliberately left out.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
        memcpy(&pcpu_stats_ai, ai, sizeof(pcpu_stats_ai));

        /* no allocation seen yet: start the minimum at unit_size */
        pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
}

/*
 * pcpu_stats_area_alloc - account one area allocation
 * @chunk: the chunk the area is allocated from
 * @size: size of area to allocate in bytes
 *
 * Bumps the global lifetime/live counters, refreshes the global high and
 * low water marks and updates the per-chunk counters.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
        lockdep_assert_held(&pcpu_lock);

        pcpu_stats.nr_alloc++;
        pcpu_stats.nr_cur_alloc++;
        if (pcpu_stats.nr_cur_alloc > pcpu_stats.nr_max_alloc)
                pcpu_stats.nr_max_alloc = pcpu_stats.nr_cur_alloc;

        /* track the smallest and largest request seen so far */
        if (size < pcpu_stats.min_alloc_size)
                pcpu_stats.min_alloc_size = size;
        if (size > pcpu_stats.max_alloc_size)
                pcpu_stats.max_alloc_size = size;

        chunk->nr_alloc++;
        if (size > chunk->max_alloc_size)
                chunk->max_alloc_size = size;
}

/*
 * pcpu_stats_area_dealloc - account one area deallocation
 * @chunk: the chunk the area is released from
 *
 * CONTEXT:
 * pcpu_lock.
 */
static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        /* one live allocation less, for this chunk and globally */
        chunk->nr_alloc--;
        pcpu_stats.nr_cur_alloc--;

        pcpu_stats.nr_dealloc++;
}

/*
 * pcpu_stats_chunk_alloc - account a newly created chunk
 *
 * Takes pcpu_lock itself (irqsave), so it must not be called with the
 * lock already held.
 */
static inline void pcpu_stats_chunk_alloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        pcpu_stats.nr_chunks++;
        if (pcpu_stats.nr_chunks > pcpu_stats.nr_max_chunks)
                pcpu_stats.nr_max_chunks = pcpu_stats.nr_chunks;

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

/*
 * pcpu_stats_chunk_dealloc - account a destroyed chunk
 *
 * Takes pcpu_lock itself (irqsave), so it must not be called with the
 * lock already held.
 */
static inline void pcpu_stats_chunk_dealloc(void)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pcpu_lock, irq_flags);
        --pcpu_stats.nr_chunks;
        spin_unlock_irqrestore(&pcpu_lock, irq_flags);
}

#else

/*
 * CONFIG_PERCPU_STATS is disabled: every statistics hook becomes an empty
 * inline with the same signature as its real counterpart, so callers do
 * not need their own #ifdefs.
 */
static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
{
}

static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
{
}

static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
{
}

static inline void pcpu_stats_chunk_alloc(void)
{
}

static inline void pcpu_stats_chunk_dealloc(void)
{
}

#endif /* !CONFIG_PERCPU_STATS */

#endif




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 












































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
 *
 * Copyright (c) 2002-2017 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/hrtimer.h>
#include <linux/list.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/bcm.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/net_namespace.h>

/*
 * To send multiple CAN frame content within TX_SETUP or to filter
 * CAN messages with multiplex index within RX_SETUP, the number of
 * different filters is limited to 256 due to the one byte index value.
 */
#define MAX_NFRAMES 256

/* limit timers to 400 days (expressed in seconds) for sending/timeouts */
#define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60)

/*
 * Private use of last_frames[index].flags: the upper four bits carry BCM
 * state, the lower four bits (BCM_CAN_FLAGS_MASK) are the CAN flags that
 * remain once the private bits have been cleared.
 */
#define RX_LOCAL   0x10 /* frame was created on the local host */
#define RX_OWN     0x20 /* frame was sent via the socket it was received on */
#define RX_RECV    0x40 /* received data for this element */
#define RX_THR     0x80 /* element has not been sent due to throttle feature */
#define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */

/*
 * get best masking value for can_rx_register() for a given single can_id
 *
 * An id with CAN_EFF_FLAG set gets the extended-frame id mask, any other
 * id the standard-frame id mask; both variants additionally match on the
 * EFF and RTR flag bits.  The argument is parenthesized so that compound
 * expressions (e.g. "a | b") are tested as a whole.
 */
#define REGMASK(id) (((id) & CAN_EFF_FLAG) ? \
                     (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
                     (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))

/* module metadata */
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
MODULE_ALIAS("can-proto-2");

/* NOTE(review): presumably the smallest sockaddr_can length that still
 * contains the can_ifindex member - confirm against CAN_REQUIRED_SIZE().
 */
#define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)

/*
 * Read 64 bits of can(fd)_frame payload starting @offset bytes into
 * cp->data.  cp->data is 64 bit aligned, so @offset has to be a multiple
 * of 8; the only callers, bcm_rx_cmp_to_index() and bcm_rx_handler(),
 * ensure that.
 */
static inline u64 get_u64(const struct canfd_frame *cp, int offset)
{
        const u64 *word = (const u64 *)(cp->data + offset);

        return *word;
}

/*
 * bcm_op describes one broadcast manager operation.  Ops are linked into
 * either the rx_ops or the tx_ops list of their bcm_sock.
 */
struct bcm_op {
        struct list_head list; /* entry in bcm_sock rx_ops or tx_ops */
        struct rcu_head rcu;
        int ifindex; /* interface index, 0 means no specific device */
        canid_t can_id;
        u32 flags; /* flags such as CAN_FD_FRAME, RX_CHECK_DLC, TX_COUNTEVT */
        /* frames_abs: frames sent (tx op) resp. shown as received total in
         * procfs (rx op); frames_filtered: RX_CHANGED updates generated
         */
        unsigned long frames_abs, frames_filtered;
        struct bcm_timeval ival1, ival2; /* intervals as reported to the user */
        struct hrtimer timer, thrtimer; /* timer drives bcm_tx_timeout_handler() */
        /* rx_stamp is restored into skb->tstamp for user messages;
         * kt_ival1/kt_ival2 are the intervals used to program the timer
         */
        ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
        int rx_ifindex; /* passed to the user as can_ifindex */
        int cfsiz; /* size in bytes of one element of frames */
        u32 count; /* remaining transmissions at interval kt_ival1 */
        u32 nframes; /* number of elements in frames */
        u32 currframe; /* index of the next frame to send */
        /* void pointers to arrays of struct can[fd]_frame */
        void *frames;
        void *last_frames;
        struct canfd_frame sframe;
        struct canfd_frame last_sframe;
        struct sock *sk; /* socket this op belongs to */
        struct net_device *rx_reg_dev;
};

/*
 * bcm_sock is the BCM socket.  struct sock has to stay the first member
 * because bcm_sk() converts a struct sock pointer by a plain cast.
 */
struct bcm_sock {
        struct sock sk;
        int bound;
        int ifindex; /* shown as "bound" interface in procfs, 0 prints "any" */
        struct list_head notifier;
        struct list_head rx_ops; /* list of rx bcm_op entries */
        struct list_head tx_ops; /* list of tx bcm_op entries */
        unsigned long dropped_usr_msgs; /* messages that could not be queued
                                         * to the user */
        struct proc_dir_entry *bcm_proc_read;
        char procname [32]; /* inode number in decimal with \0 */
};

/* NOTE(review): presumably the list of sockets linked through
 * bcm_sock.notifier, its lock and the socket currently being notified -
 * the users are outside this part of the file, confirm there.
 */
static LIST_HEAD(bcm_notifier_list);
static DEFINE_SPINLOCK(bcm_notifier_lock);
static struct bcm_sock *bcm_busy_notifier;

/* Return pointer to store the extra msg flags for bcm_recvmsg().
 * We use the space of one unsigned int beyond the 'struct sockaddr_can'
 * in skb->cb.
 */
static inline unsigned int *bcm_flags(struct sk_buff *skb)
{
        /* return pointer after struct sockaddr_can */
        return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
}

/* convert a generic socket pointer into the BCM socket that embeds it */
static inline struct bcm_sock *bcm_sk(const struct sock *sk)
{
        struct bcm_sock *bo = (struct bcm_sock *)sk;

        return bo;
}

/* convert a BCM timeval (seconds + microseconds) into a ktime_t */
static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
{
        ktime_t kt = ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);

        return kt;
}

/* check limitations for timeval provided by user */
static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head)
{
        if ((msg_head->ival1.tv_sec < 0) ||
            (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) ||
            (msg_head->ival1.tv_usec < 0) ||
            (msg_head->ival1.tv_usec >= USEC_PER_SEC) ||
            (msg_head->ival2.tv_sec < 0) ||
            (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) ||
            (msg_head->ival2.tv_usec < 0) ||
            (msg_head->ival2.tv_usec >= USEC_PER_SEC))
                return true;

        return false;
}

/*
 * CFSIZ: size of one frame for an op with the given flags - CANFD_MTU when
 * CAN_FD_FRAME is set, CAN_MTU otherwise.  The argument is parenthesized
 * so that compound expressions are tested as a whole.
 * OPSIZ / MHSIZ: sizes of struct bcm_op and struct bcm_msg_head.
 */
#define CFSIZ(flags) (((flags) & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU)
#define OPSIZ sizeof(struct bcm_op)
#define MHSIZ sizeof(struct bcm_msg_head)

/*
 * procfs functions
 */
#if IS_ENABLED(CONFIG_PROC_FS)
/*
 * bcm_proc_getifname - interface name for the procfs output
 * @net: network namespace to look the device up in
 * @result: caller provided buffer that receives the name
 * @ifindex: interface index to translate
 *
 * Returns the constant string "any" for index 0 (@result stays untouched).
 * Otherwise @result is filled with the device name, or with "???" when no
 * device has that index, and @result is returned.
 */
static char *bcm_proc_getifname(struct net *net, char *result, int ifindex)
{
        const char *name = "???";
        struct net_device *dev;

        if (ifindex == 0)
                return "any";

        /* the device pointer is only valid inside the RCU read section */
        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev)
                name = dev->name;
        strcpy(result, name);
        rcu_read_unlock();

        return result;
}

/*
 * bcm_proc_show - generate the procfs content of one BCM socket
 * @m: seq_file to print into; m->private holds the network namespace
 * @v: unused
 *
 * Prints a header line for the socket, one line for every rx op that has
 * seen at least one frame and one line for every tx op.  Always returns 0.
 *
 * Both op lists are walked inside an RCU read-side critical section so
 * that an op cannot be freed through its rcu_head while its content is
 * being printed.
 * NOTE(review): full protection also requires the paths that unlink ops
 * to use the RCU list primitives - confirm in the removal code.
 */
static int bcm_proc_show(struct seq_file *m, void *v)
{
        char ifname[IFNAMSIZ];
        struct net *net = m->private;
        struct sock *sk = (struct sock *)pde_data(m->file->f_inode);
        struct bcm_sock *bo = bcm_sk(sk);
        struct bcm_op *op;

        seq_printf(m, ">>> socket %pK", sk->sk_socket);
        seq_printf(m, " / sk %pK", sk);
        seq_printf(m, " / bo %pK", bo);
        seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs);
        seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex));
        seq_printf(m, " <<<\n");

        rcu_read_lock();

        list_for_each_entry_rcu(op, &bo->rx_ops, list) {

                unsigned long reduction;

                /* print only active entries & prevent division by zero */
                if (!op->frames_abs)
                        continue;

                seq_printf(m, "rx_op: %03X %-5s ", op->can_id,
                           bcm_proc_getifname(net, ifname, op->ifindex));

                if (op->flags & CAN_FD_FRAME)
                        seq_printf(m, "(%u)", op->nframes);
                else
                        seq_printf(m, "[%u]", op->nframes);

                seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' ');

                if (op->kt_ival1)
                        seq_printf(m, "timeo=%lld ",
                                   (long long)ktime_to_us(op->kt_ival1));

                if (op->kt_ival2)
                        seq_printf(m, "thr=%lld ",
                                   (long long)ktime_to_us(op->kt_ival2));

                /* the counters are unsigned long, print them as such */
                seq_printf(m, "# recv %lu (%lu) => reduction: ",
                           op->frames_filtered, op->frames_abs);

                /* percentage of received frames not passed on as changes */
                reduction = 100 - (op->frames_filtered * 100) / op->frames_abs;

                seq_printf(m, "%s%lu%%\n",
                           (reduction == 100) ? "near " : "", reduction);
        }

        list_for_each_entry_rcu(op, &bo->tx_ops, list) {

                seq_printf(m, "tx_op: %03X %s ", op->can_id,
                           bcm_proc_getifname(net, ifname, op->ifindex));

                if (op->flags & CAN_FD_FRAME)
                        seq_printf(m, "(%u) ", op->nframes);
                else
                        seq_printf(m, "[%u] ", op->nframes);

                if (op->kt_ival1)
                        seq_printf(m, "t1=%lld ",
                                   (long long)ktime_to_us(op->kt_ival1));

                if (op->kt_ival2)
                        seq_printf(m, "t2=%lld ",
                                   (long long)ktime_to_us(op->kt_ival2));

                seq_printf(m, "# sent %lu\n", op->frames_abs);
        }

        rcu_read_unlock();

        seq_putc(m, '\n');
        return 0;
}
#endif /* CONFIG_PROC_FS */

/*
 * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface
 *              of the given bcm tx op
 */
static void bcm_can_tx(struct bcm_op *op)
{
        struct sk_buff *skb;
        struct net_device *dev;
        struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
        int err;

        /* no target device? => exit */
        if (!op->ifindex)
                return;

        dev = dev_get_by_index(sock_net(op->sk), op->ifindex);
        if (!dev) {
                /* RFC: should this bcm_op remove itself here? */
                return;
        }

        skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any());
        if (!skb)
                goto out;

        can_skb_reserve(skb);
        can_skb_prv(skb)->ifindex = dev->ifindex;
        can_skb_prv(skb)->skbcnt = 0;

        skb_put_data(skb, cf, op->cfsiz);

        /* send with loopback */
        skb->dev = dev;
        can_skb_set_owner(skb, op->sk);
        err = can_send(skb, 1);
        if (!err)
                op->frames_abs++;

        op->currframe++;

        /* reached last frame? */
        if (op->currframe >= op->nframes)
                op->currframe = 0;
out:
        dev_put(dev);
}

/*
 * bcm_send_to_user - send a BCM message to the userspace
 *                    (consisting of bcm_msg_head + x CAN frames)
 */
static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
                             struct canfd_frame *frames, int has_timestamp)
{
        struct sk_buff *skb;
        struct canfd_frame *firstframe;
        struct sockaddr_can *addr;
        struct sock *sk = op->sk;
        unsigned int datalen = head->nframes * op->cfsiz;
        int err;
        unsigned int *pflags;

        skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
        if (!skb)
                return;

        skb_put_data(skb, head, sizeof(*head));

        /* ensure space for sockaddr_can and msg flags */
        sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
                               sizeof(unsigned int));

        /* initialize msg flags */
        pflags = bcm_flags(skb);
        *pflags = 0;

        if (head->nframes) {
                /* CAN frames starting here */
                firstframe = (struct canfd_frame *)skb_tail_pointer(skb);

                skb_put_data(skb, frames, datalen);

                /*
                 * the BCM uses the flags-element of the canfd_frame
                 * structure for internal purposes. This is only
                 * relevant for updates that are generated by the
                 * BCM, where nframes is 1
                 */
                if (head->nframes == 1) {
                        if (firstframe->flags & RX_LOCAL)
                                *pflags |= MSG_DONTROUTE;
                        if (firstframe->flags & RX_OWN)
                                *pflags |= MSG_CONFIRM;

                        firstframe->flags &= BCM_CAN_FLAGS_MASK;
                }
        }

        if (has_timestamp) {
                /* restore rx timestamp */
                skb->tstamp = op->rx_stamp;
        }

        /*
         *  Put the datagram to the queue so that bcm_recvmsg() can
         *  get it from there.  We need to pass the interface index to
         *  bcm_recvmsg().  We pass a whole struct sockaddr_can in skb->cb
         *  containing the interface index.
         */

        addr = (struct sockaddr_can *)skb->cb;
        memset(addr, 0, sizeof(*addr));
        addr->can_family  = AF_CAN;
        addr->can_ifindex = op->rx_ifindex;

        err = sock_queue_rcv_skb(sk, skb);
        if (err < 0) {
                struct bcm_sock *bo = bcm_sk(sk);

                kfree_skb(skb);
                /* don't care about overflows in this statistic */
                bo->dropped_usr_msgs++;
        }
}

/*
 * bcm_tx_set_expiry - set the next absolute expiry time of a tx timer
 *
 * kt_ival1 is used while it is set and op->count is non-zero, otherwise
 * kt_ival2 is used if it is set. The expiry is "now + interval".
 *
 * Returns true when an expiry was set, false (with @hrt left untouched)
 * when neither interval applies.
 */
static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
{
        const bool use_ival1 = op->kt_ival1 && op->count;

        /* nothing to schedule without an applicable interval */
        if (!use_ival1 && !op->kt_ival2)
                return false;

        hrtimer_set_expires(hrt,
                            ktime_add(ktime_get(),
                                      use_ival1 ? op->kt_ival1 : op->kt_ival2));
        return true;
}

/*
 * bcm_tx_start_timer - arm op->timer if the op has an interval to apply
 */
static void bcm_tx_start_timer(struct bcm_op *op)
{
        /* no applicable interval => leave the timer stopped */
        if (!bcm_tx_set_expiry(op, &op->timer))
                return;

        hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
}

/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
{
        struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
        struct bcm_msg_head msg_head;

        if (op->kt_ival1 && (op->count > 0)) {
                op->count--;
                if (!op->count && (op->flags & TX_COUNTEVT)) {

                        /* create notification to user */
                        memset(&msg_head, 0, sizeof(msg_head));
                        msg_head.opcode  = TX_EXPIRED;
                        msg_head.flags   = op->flags;
                        msg_head.count   = op->count;
                        msg_head.ival1   = op->ival1;
                        msg_head.ival2   = op->ival2;
                        msg_head.can_id  = op->can_id;
                        msg_head.nframes = 0;

                        bcm_send_to_user(op, &msg_head, NULL, 0);
                }
                bcm_can_tx(op);

        } else if (op->kt_ival2) {
                bcm_can_tx(op);
        }

        return bcm_tx_set_expiry(op, &op->timer) ?
                HRTIMER_RESTART : HRTIMER_NORESTART;
}

/*
 * bcm_rx_changed - create a RX_CHANGED notification due to changed content
 */
static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
{
        struct bcm_msg_head head;

        /* update statistics */
        op->frames_filtered++;

        /* prevent statistics overflow */
        if (op->frames_filtered > ULONG_MAX/100)
                op->frames_filtered = op->frames_abs = 0;

        /* this element is not throttled anymore */
        data->flags &= ~RX_THR;

        memset(&head, 0, sizeof(head));
        head.opcode  = RX_CHANGED;
        head.flags   = op->flags;
        head.count   = op->count;
        head.ival1   = op->ival1;
        head.ival2   = op->ival2;
        head.can_id  = op->can_id;
        head.nframes = 1;

        bcm_send_to_user(op, &head, data, 1);
}

/*
 * bcm_rx_update_and_send - process a detected relevant receive content change
 *                          1. store the received frame as last received data
 *                          2. notify the user now, or defer the notification
 *                             to the throttle timer
 *
 * Without a throttle interval (kt_ival2 == 0) the user is notified
 * immediately. With throttling, a notification is sent right away only for
 * the first reception or when the previous notification is at least
 * kt_ival2 old; otherwise the throttle timer is started (or left running)
 * and the stored frame stays marked with RX_THR.
 */
static void bcm_rx_update_and_send(struct bcm_op *op,
                                   struct canfd_frame *lastdata,
                                   const struct canfd_frame *rxdata,
                                   unsigned char traffic_flags)
{
        memcpy(lastdata, rxdata, op->cfsiz);

        /* mark as received + throttled and add own/local traffic flags */
        lastdata->flags |= RX_RECV | RX_THR | traffic_flags;

        if (!op->kt_ival2) {
                /* throttling disabled: notify the user immediately */
                bcm_rx_changed(op, lastdata);
                return;
        }

        /* a running throttle timer will deliver the data later */
        if (hrtimer_active(&op->thrtimer))
                return;

        /* a previous notification exists and is younger than kt_ival2 */
        if (op->kt_lastmsg &&
            ktime_us_delta(ktime_get(), op->kt_lastmsg) <
            ktime_to_us(op->kt_ival2)) {
                /* keep the saved data back - only start the throttle timer */
                hrtimer_start(&op->thrtimer,
                              ktime_add(op->kt_lastmsg, op->kt_ival2),
                              HRTIMER_MODE_ABS_SOFT);
                return;
        }

        /* first reception, or the gap was big enough: send and timestamp */
        bcm_rx_changed(op, lastdata);
        op->kt_lastmsg = ktime_get();
}

/*
 * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly
 *                       received data stored in op->last_frames[]
 */
static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
                                const struct canfd_frame *rxdata,
                                unsigned char traffic_flags)
{
        struct canfd_frame *cf = op->frames + op->cfsiz * index;
        struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
        int i;

        /*
         * no one uses the MSBs of flags for comparison,
         * so we use it here to detect the first time of reception
         */

        if (!(lcf->flags & RX_RECV)) {
                /* received data for the first time => send update to user */
                bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
                return;
        }

        /* do a real check in CAN frame data section */
        for (i = 0; i < rxdata->len; i += 8) {
                if ((get_u64(cf, i) & get_u64(rxdata, i)) !=
                    (get_u64(cf, i) & get_u64(lcf, i))) {
                        bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
                        return;
                }
        }

        if (op->flags & RX_CHECK_DLC) {
                /* do a real check in CAN frame length */
                if (rxdata->len != lcf->len) {
                        bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
                        return;
                }
        }
}

/*
 * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception
 *
 * Does nothing when RX_NO_AUTOTIMER is set or no timeout (kt_ival1) is
 * configured.
 */
static void bcm_rx_starttimer(struct bcm_op *op)
{
        if ((op->flags & RX_NO_AUTOTIMER) || !op->kt_ival1)
                return;

        hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
}

/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
{
        struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
        struct bcm_msg_head msg_head;

        /* if user wants to be informed, when cyclic CAN-Messages come back */
        if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
                /* clear received CAN frames to indicate 'nothing received' */
                memset(op->last_frames, 0, op->nframes * op->cfsiz);
        }

        /* create notification to user */
        memset(&msg_head, 0, sizeof(msg_head));
        msg_head.opcode  = RX_TIMEOUT;
        msg_head.flags   = op->flags;
        msg_head.count   = op->count;
        msg_head.ival1   = op->ival1;
        msg_head.ival2   = op->ival2;
        msg_head.can_id  = op->can_id;
        msg_head.nframes = 0;

        bcm_send_to_user(op, &msg_head, NULL, 0);

        return HRTIMER_NORESTART;
}

/*
 * bcm_rx_do_flush - helper for bcm_rx_thr_flush
 *
 * Sends the stored frame at @index to the user if it is still marked as
 * throttled (RX_THR). Returns 1 when a notification was sent, 0 otherwise.
 */
static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
{
        struct canfd_frame *held;

        /* ops without a last_frames buffer have nothing to flush */
        if (!op->last_frames)
                return 0;

        held = op->last_frames + op->cfsiz * index;
        if (!(held->flags & RX_THR))
                return 0;

        bcm_rx_changed(op, held);
        return 1;
}

/*
 * bcm_rx_thr_flush - Check for throttled data and send it to the userspace
 *
 * Returns the number of frames that were flushed.
 */
static int bcm_rx_thr_flush(struct bcm_op *op)
{
        unsigned int idx;
        int flushed = 0;

        /* RX_FILTER_ID and simple filter: only index 0 is in use */
        if (op->nframes <= 1)
                return bcm_rx_do_flush(op, 0);

        /* MUX filter: index 0 is skipped, data frames start at index 1 */
        for (idx = 1; idx < op->nframes; idx++)
                flushed += bcm_rx_do_flush(op, idx);

        return flushed;
}

/*
 * bcm_rx_thr_handler - the time for blocked content updates is over now:
 *                      Check for throttled data and send it to the userspace
 *
 * When something was flushed the timer is forwarded by kt_ival2 and
 * restarted; otherwise kt_lastmsg is reset and the timer stops.
 */
static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
{
        struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);

        if (!bcm_rx_thr_flush(op)) {
                /* nothing was pending: rearm throttle handling */
                op->kt_lastmsg = 0;
                return HRTIMER_NORESTART;
        }

        hrtimer_forward_now(hrtimer, op->kt_ival2);
        return HRTIMER_RESTART;
}

/*
 * bcm_rx_handler - handle a CAN frame reception
 */
static void bcm_rx_handler(struct sk_buff *skb, void *data)
{
        struct bcm_op *op = (struct bcm_op *)data;
        const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data;
        unsigned int i;
        unsigned char traffic_flags;

        if (op->can_id != rxframe->can_id)
                return;

        /* make sure to handle the correct frame type (CAN / CAN FD) */
        if (op->flags & CAN_FD_FRAME) {
                if (!can_is_canfd_skb(skb))
                        return;
        } else {
                if (!can_is_can_skb(skb))
                        return;
        }

        /* disable timeout */
        hrtimer_cancel(&op->timer);

        /* save rx timestamp */
        op->rx_stamp = skb->tstamp;
        /* save originator for recvfrom() */
        op->rx_ifindex = skb->dev->ifindex;
        /* update statistics */
        op->frames_abs++;

        if (op->flags & RX_RTR_FRAME) {
                /* send reply for RTR-request (placed in op->frames[0]) */
                bcm_can_tx(op);
                return;
        }

        /* compute flags to distinguish between own/local/remote CAN traffic */
        traffic_flags = 0;
        if (skb->sk) {
                traffic_flags |= RX_LOCAL;
                if (skb->sk == op->sk)
                        traffic_flags |= RX_OWN;
        }

        if (op->flags & RX_FILTER_ID) {
                /* the easiest case */
                bcm_rx_update_and_send(op, op->last_frames, rxframe,
                                       traffic_flags);
                goto rx_starttimer;
        }

        if (op->nframes == 1) {
                /* simple compare with index 0 */
                bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags);
                goto rx_starttimer;
        }

        if (op->nframes > 1) {
                /*
                 * multiplex compare
                 *
                 * find the first multiplex mask that fits.
                 * Remark: The MUX-mask is stored in index 0 - but only the
                 * first 64 bits of the frame data[] are relevant (CAN FD)
                 */

                for (i = 1; i < op->nframes; i++) {
                        if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) ==
                            (get_u64(op->frames, 0) &
                             get_u64(op->frames + op->cfsiz * i, 0))) {
                                bcm_rx_cmp_to_index(op, i, rxframe,
                                                    traffic_flags);
                                break;
                        }
                }
        }

rx_starttimer:
        bcm_rx_starttimer(op);
}

/*
 * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements
 */
/*
 * bcm_find_op - look up an op by can_id, ifindex and CAN_FD_FRAME flag
 *
 * Returns the first entry in @ops matching all three criteria, or NULL.
 */
static struct bcm_op *bcm_find_op(struct list_head *ops,
                                  struct bcm_msg_head *mh, int ifindex)
{
        struct bcm_op *cur;

        list_for_each_entry(cur, ops, list) {
                if (cur->can_id != mh->can_id)
                        continue;

                if (cur->ifindex != ifindex)
                        continue;

                /* the CAN_FD_FRAME bit must be equal on both sides */
                if ((cur->flags ^ mh->flags) & CAN_FD_FRAME)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * bcm_free_op_rcu - RCU callback releasing a bcm_op and its frame arrays
 *
 * The frame pointers may refer to the storage embedded in the op itself
 * (sframe / last_sframe); only separately allocated arrays are freed.
 */
static void bcm_free_op_rcu(struct rcu_head *rcu_head)
{
        struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu);

        /* kfree() ignores NULL, so only the embedded case is special */
        if (op->frames != &op->sframe)
                kfree(op->frames);

        if (op->last_frames != &op->last_sframe)
                kfree(op->last_frames);

        kfree(op);
}

/*
 * bcm_remove_op - stop the timers of an op and schedule it for release
 *
 * Both hrtimers are cancelled before the op is handed to call_rcu(); the
 * memory itself is released later in bcm_free_op_rcu(). Removing the op
 * from its list is not done here.
 */
static void bcm_remove_op(struct bcm_op *op)
{
        hrtimer_cancel(&op->timer);
        hrtimer_cancel(&op->thrtimer);

        /* defer the actual kfree() to the RCU callback */
        call_rcu(&op->rcu, bcm_free_op_rcu);
}

/*
 * bcm_rx_unreg - drop the rx subscription of an op on a given device
 *
 * Unregisters bcm_rx_handler for the op only when @dev is the device the
 * op is recorded as registered on (op->rx_reg_dev) and clears that record.
 * A mismatch is logged and nothing is unregistered.
 */
static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
{
        if (op->rx_reg_dev != dev) {
                printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device "
                       "mismatch %p %p\n", op->rx_reg_dev, dev);
                return;
        }

        can_rx_unregister(dev_net(dev), dev, op->can_id,
                          REGMASK(op->can_id), bcm_rx_handler, op);

        /* mark as removed subscription */
        op->rx_reg_dev = NULL;
}

/*
 * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops)
 *
 * The first op matching can_id, ifindex and the CAN_FD_FRAME flag has its
 * rx subscription dropped, is unlinked from @ops and released through
 * bcm_remove_op(). Returns 1 if an op was removed, 0 if none matched.
 */
static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
                            int ifindex)
{
        struct bcm_op *op, *next;

        list_for_each_entry_safe(op, next, ops, list) {
                if (op->can_id != mh->can_id || op->ifindex != ifindex)
                        continue;

                if ((op->flags & CAN_FD_FRAME) != (mh->flags & CAN_FD_FRAME))
                        continue;

                /* disable automatic timer on frame reception */
                op->flags |= RX_NO_AUTOTIMER;

                /*
                 * Don't care if we're bound or not (due to netdev
                 * problems) can_rx_unregister() is always a safe
                 * thing to do here.
                 */
                if (!op->ifindex) {
                        can_rx_unregister(sock_net(op->sk), NULL,
                                          op->can_id,
                                          REGMASK(op->can_id),
                                          bcm_rx_handler, op);
                } else if (op->rx_reg_dev) {
                        /*
                         * Only remove subscriptions that had not
                         * been removed due to NETDEV_UNREGISTER
                         * in bcm_notifier()
                         */
                        struct net_device *dev;

                        dev = dev_get_by_index(sock_net(op->sk),
                                               op->ifindex);
                        if (dev) {
                                bcm_rx_unreg(dev, op);
                                dev_put(dev);
                        }
                }

                list_del(&op->list);
                bcm_remove_op(op);
                return 1; /* done */
        }

        return 0; /* not found */
}

/*
 * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops)
 *
 * The first op matching can_id, ifindex and the CAN_FD_FRAME flag is
 * unlinked from @ops and released through bcm_remove_op(). Returns 1 if an
 * op was removed, 0 if none matched.
 */
static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
                            int ifindex)
{
        struct bcm_op *op, *next;

        list_for_each_entry_safe(op, next, ops, list) {
                if (op->can_id != mh->can_id || op->ifindex != ifindex)
                        continue;

                if ((op->flags & CAN_FD_FRAME) != (mh->flags & CAN_FD_FRAME))
                        continue;

                list_del(&op->list);
                bcm_remove_op(op);
                return 1; /* done */
        }

        return 0; /* not found */
}

/*
 * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg)
 */
static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head,
                       int ifindex)
{
        struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex);

        if (!op)
                return -EINVAL;

        /* put current values into msg_head */
        msg_head->flags   = op->flags;
        msg_head->count   = op->count;
        msg_head->ival1   = op->ival1;
        msg_head->ival2   = op->ival2;
        msg_head->nframes = op->nframes;

        bcm_send_to_user(op, msg_head, op->frames, 0);

        return MHSIZ;
}

/*
 * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg)
 */
static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
                        int ifindex, struct sock *sk)
{
        struct bcm_sock *bo = bcm_sk(sk);
        struct bcm_op *op;
        struct canfd_frame *cf;
        unsigned int i;
        int err;

        /* we need a real device to send frames */
        if (!ifindex)
                return -ENODEV;

        /* check nframes boundaries - we need at least one CAN frame */
        if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
                return -EINVAL;

        /* check timeval limitations */
        if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
                return -EINVAL;

        /* check the given can_id */
        op = bcm_find_op(&bo->tx_ops, msg_head, ifindex);
        if (op) {
                /* update existing BCM operation */

                /*
                 * Do we need more space for the CAN frames than currently
                 * allocated? -> This is a _really_ unusual use-case and
                 * therefore (complexity / locking) it is not supported.
                 */
                if (msg_head->nframes > op->nframes)
                        return -E2BIG;

                /* update CAN frames content */
                for (i = 0; i < msg_head->nframes; i++) {

                        cf = op->frames + op->cfsiz * i;
                        err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);

                        if (op->flags & CAN_FD_FRAME) {
                                if (cf->len > 64)
                                        err = -EINVAL;
                        } else {
                                if (cf->len > 8)
                                        err = -EINVAL;
                        }

                        if (err < 0)
                                return err;

                        if (msg_head->flags & TX_CP_CAN_ID) {
                                /* copy can_id into frame */
                                cf->can_id = msg_head->can_id;
                        }
                }
                op->flags = msg_head->flags;

        } else {
                /* insert new BCM operation for the given can_id */

                op = kzalloc(OPSIZ, GFP_KERNEL);
                if (!op)
                        return -ENOMEM;

                op->can_id = msg_head->can_id;
                op->cfsiz = CFSIZ(msg_head->flags);
                op->flags = msg_head->flags;

                /* create array for CAN frames and copy the data */
                if (msg_head->nframes > 1) {
                        op->frames = kmalloc_array(msg_head->nframes,
                                                   op->cfsiz,
                                                   GFP_KERNEL);
                        if (!op->frames) {
                                kfree(op);
                                return -ENOMEM;
                        }
                } else
                        op->frames = &op->sframe;

                for (i = 0; i < msg_head->nframes; i++) {

                        cf = op->frames + op->cfsiz * i;
                        err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
                        if (err < 0)
                                goto free_op;

                        if (op->flags & CAN_FD_FRAME) {
                                if (cf->len > 64)
                                        err = -EINVAL;
                        } else {
                                if (cf->len > 8)
                                        err = -EINVAL;
                        }

                        if (err < 0)
                                goto free_op;

                        if (msg_head->flags & TX_CP_CAN_ID) {
                                /* copy can_id into frame */
                                cf->can_id = msg_head->can_id;
                        }
                }

                /* tx_ops never compare with previous received messages */
                op->last_frames = NULL;

                /* bcm_can_tx / bcm_tx_timeout_handler needs this */
                op->sk = sk;
                op->ifindex = ifindex;

                /* initialize uninitialized (kzalloc) structure */
                hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC,
                              HRTIMER_MODE_REL_SOFT);

                /* currently unused in tx_ops */
                hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC,
                              HRTIMER_MODE_REL_SOFT);

                /* add this bcm_op to the list of the tx_ops */
                list_add(&op->list, &bo->tx_ops);

        } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */

        if (op->nframes != msg_head->nframes) {
                op->nframes   = msg_head->nframes;
                /* start multiple frame transmission with index 0 */
                op->currframe = 0;
        }

        /* check flags */

        if (op->flags & TX_RESET_MULTI_IDX) {
                /* start multiple frame transmission with index 0 */
                op->currframe = 0;
        }

        if (op->flags & SETTIMER) {
                /* set timer values */
                op->count = msg_head->count;
                op->ival1 = msg_head->ival1;
                op->ival2 = msg_head->ival2;
                op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
                op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);

                /* disable an active timer due to zero values? */
                if (!op->kt_ival1 && !op->kt_ival2)
                        hrtimer_cancel(&op->timer);
        }

        if (op->flags & STARTTIMER) {
                hrtimer_cancel(&op->timer);
                /* spec: send CAN frame when starting timer */
                op->flags |= TX_ANNOUNCE;
        }

        if (op->flags & TX_ANNOUNCE) {
                bcm_can_tx(op);
                if (op->count)
                        op->count--;
        }

        if (op->flags & STARTTIMER)
                bcm_tx_start_timer(op);

        return msg_head->nframes * op->cfsiz + MHSIZ;

free_op:
        if (op->frames != &op->sframe)
                kfree(op->frames);
        kfree(op);
        return err;
}

/*
 * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg)
 *
 * @msg_head: message head read from the user (opcode RX_SETUP); flags and
 *            nframes may be adjusted here (RX_FILTER_ID normalization)
 * @msg:      remaining user message holding msg_head->nframes CAN frames
 * @ifindex:  interface to listen on, 0 registers without a device
 * @sk:       socket owning the op
 *
 * An existing op for the same can_id/ifindex/frame type is updated in
 * place (the number of frames may not grow); otherwise a new op is
 * allocated, added to the socket's rx_ops list and registered with
 * can_rx_register(). Timer related flags are evaluated in both cases.
 *
 * Returns msg_head->nframes * op->cfsiz + MHSIZ on success or a negative
 * error code: -EINVAL (bad nframes, RTR setup or timer values), -E2BIG
 * (update with more frames than the op holds), -ENOMEM, or the error of
 * memcpy_from_msg() / can_rx_register().
 */
static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
                        int ifindex, struct sock *sk)
{
        struct bcm_sock *bo = bcm_sk(sk);
        struct bcm_op *op;
        int do_rx_register;
        int err = 0;

        /* RX_FILTER_ID and "no frames" imply each other from here on */
        if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) {
                /* be robust against wrong usage ... */
                msg_head->flags |= RX_FILTER_ID;
                /* ignore trailing garbage */
                msg_head->nframes = 0;
        }

        /* the first element contains the mux-mask => MAX_NFRAMES + 1 */
        if (msg_head->nframes > MAX_NFRAMES + 1)
                return -EINVAL;

        /* RTR mode needs exactly one frame and a can_id with CAN_RTR_FLAG */
        if ((msg_head->flags & RX_RTR_FRAME) &&
            ((msg_head->nframes != 1) ||
             (!(msg_head->can_id & CAN_RTR_FLAG))))
                return -EINVAL;

        /* check timeval limitations */
        if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
                return -EINVAL;

        /* check the given can_id */
        op = bcm_find_op(&bo->rx_ops, msg_head, ifindex);
        if (op) {
                /* update existing BCM operation */

                /*
                 * Do we need more space for the CAN frames than currently
                 * allocated? -> This is a _really_ unusual use-case and
                 * therefore (complexity / locking) it is not supported.
                 */
                if (msg_head->nframes > op->nframes)
                        return -E2BIG;

                if (msg_head->nframes) {
                        /* update CAN frames content */
                        err = memcpy_from_msg(op->frames, msg,
                                              msg_head->nframes * op->cfsiz);
                        if (err < 0)
                                return err;

                        /* clear last_frames to indicate 'nothing received' */
                        memset(op->last_frames, 0, msg_head->nframes * op->cfsiz);
                }

                op->nframes = msg_head->nframes;
                op->flags = msg_head->flags;

                /* Only an update -> do not call can_rx_register() */
                do_rx_register = 0;

        } else {
                /* insert new BCM operation for the given can_id */
                op = kzalloc(OPSIZ, GFP_KERNEL);
                if (!op)
                        return -ENOMEM;

                op->can_id = msg_head->can_id;
                op->nframes = msg_head->nframes;
                op->cfsiz = CFSIZ(msg_head->flags);
                op->flags = msg_head->flags;

                if (msg_head->nframes > 1) {
                        /* create array for CAN frames and copy the data */
                        op->frames = kmalloc_array(msg_head->nframes,
                                                   op->cfsiz,
                                                   GFP_KERNEL);
                        if (!op->frames) {
                                kfree(op);
                                return -ENOMEM;
                        }

                        /* create and init array for received CAN frames */
                        op->last_frames = kcalloc(msg_head->nframes,
                                                  op->cfsiz,
                                                  GFP_KERNEL);
                        if (!op->last_frames) {
                                kfree(op->frames);
                                kfree(op);
                                return -ENOMEM;
                        }

                } else {
                        /* zero or one frame: use the storage inside the op */
                        op->frames = &op->sframe;
                        op->last_frames = &op->last_sframe;
                }

                if (msg_head->nframes) {
                        err = memcpy_from_msg(op->frames, msg,
                                              msg_head->nframes * op->cfsiz);
                        if (err < 0) {
                                /* free only what was allocated separately */
                                if (op->frames != &op->sframe)
                                        kfree(op->frames);
                                if (op->last_frames != &op->last_sframe)
                                        kfree(op->last_frames);
                                kfree(op);
                                return err;
                        }
                }

                /* bcm_can_tx / bcm_tx_timeout_handler needs this */
                op->sk = sk;
                op->ifindex = ifindex;

                /* ifindex for timeout events w/o previous frame reception */
                op->rx_ifindex = ifindex;

                /* initialize uninitialized (kzalloc) structure */
                hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC,
                              HRTIMER_MODE_REL_SOFT);
                hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC,
                              HRTIMER_MODE_REL_SOFT);

                /* add this bcm_op to the list of the rx_ops */
                list_add(&op->list, &bo->rx_ops);

                /* call can_rx_register() */
                do_rx_register = 1;

        } /* if (op) - existing op found by bcm_find_op() */

        /* check flags */

        if (op->flags & RX_RTR_FRAME) {
                struct canfd_frame *frame0 = op->frames;

                /* no timers in RTR-mode */
                hrtimer_cancel(&op->thrtimer);
                hrtimer_cancel(&op->timer);

                /*
                 * funny feature in RX(!)_SETUP only for RTR-mode:
                 * copy can_id into frame BUT without RTR-flag to
                 * prevent a full-load-loopback-test ... ;-]
                 */
                if ((op->flags & TX_CP_CAN_ID) ||
                    (frame0->can_id == op->can_id))
                        frame0->can_id = op->can_id & ~CAN_RTR_FLAG;

        } else {
                if (op->flags & SETTIMER) {

                        /* set timer value */
                        op->ival1 = msg_head->ival1;
                        op->ival2 = msg_head->ival2;
                        op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
                        op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2);

                        /* disable an active timer due to zero value? */
                        if (!op->kt_ival1)
                                hrtimer_cancel(&op->timer);

                        /*
                         * In any case cancel the throttle timer, flush
                         * potentially blocked msgs and reset throttle handling
                         */
                        op->kt_lastmsg = 0;
                        hrtimer_cancel(&op->thrtimer);
                        bcm_rx_thr_flush(op);
                }

                /* start the reception timeout only if one is configured */
                if ((op->flags & STARTTIMER) && op->kt_ival1)
                        hrtimer_start(&op->timer, op->kt_ival1,
                                      HRTIMER_MODE_REL_SOFT);
        }

        /* now we can register for can_ids, if we added a new bcm_op */
        if (do_rx_register) {
                if (ifindex) {
                        struct net_device *dev;

                        /*
                         * If the device cannot be found, nothing is
                         * registered, err is not changed here and
                         * op->rx_reg_dev is not set.
                         */
                        dev = dev_get_by_index(sock_net(sk), ifindex);
                        if (dev) {
                                err = can_rx_register(sock_net(sk), dev,
                                                      op->can_id,
                                                      REGMASK(op->can_id),
                                                      bcm_rx_handler, op,
                                                      "bcm", sk);

                                /* remembered for bcm_rx_unreg() */
                                op->rx_reg_dev = dev;
                                dev_put(dev);
                        }

                } else
                        err = can_rx_register(sock_net(sk), NULL, op->can_id,
                                              REGMASK(op->can_id),
                                              bcm_rx_handler, op, "bcm", sk);
                if (err) {
                        /* this bcm rx op is broken -> remove it */
                        list_del(&op->list);
                        bcm_remove_op(op);
                        return err;
                }
        }

        return msg_head->nframes * op->cfsiz + MHSIZ;
}

/*
 * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg)
 *
 * @msg:     user message positioned at the CAN frame to send
 * @ifindex: interface to send on, must be non-zero
 * @sk:      sending socket, set as owner of the skb
 * @cfsiz:   size of the CAN frame to copy from @msg
 *
 * Returns cfsiz + MHSIZ on success or a negative error code: -ENODEV (no
 * ifindex or unknown device), -ENOMEM, or the error of memcpy_from_msg() /
 * can_send().
 */
static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
                       int cfsiz)
{
        struct net_device *dev;
        struct sk_buff *skb;
        int err;

        /* we need a real device to send frames */
        if (!ifindex)
                return -ENODEV;

        skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        can_skb_reserve(skb);

        err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz);
        if (err < 0)
                goto drop_skb;

        dev = dev_get_by_index(sock_net(sk), ifindex);
        if (!dev) {
                err = -ENODEV;
                goto drop_skb;
        }

        can_skb_prv(skb)->ifindex = dev->ifindex;
        can_skb_prv(skb)->skbcnt = 0;
        skb->dev = dev;
        can_skb_set_owner(skb, sk);

        /* send with loopback; the skb is not touched here afterwards */
        err = can_send(skb, 1);
        dev_put(dev);

        if (err)
                return err;

        return cfsiz + MHSIZ;

drop_skb:
        kfree_skb(skb);
        return err;
}

/*
 * bcm_sendmsg - parse and execute one BCM command (opcode) from userspace
 *
 * The message must start with a struct bcm_msg_head (MHSIZ bytes); the
 * remainder must be a whole number of CAN frames whose size is derived from
 * the head's flags via CFSIZ(). If the socket has no default interface
 * (ifindex 0) and a msg_name is supplied, the interface index is taken from
 * that sockaddr_can after checking that it names a CAN device.
 *
 * Returns the result of the opcode handler (read bytes) or a negative errno:
 * -ENOTCONN if the socket is not bound, -EINVAL for malformed messages or
 * unknown opcodes, -ENODEV for a missing or non-CAN interface.
 */
static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        struct bcm_sock *bo = bcm_sk(sk);
        int ifindex = bo->ifindex; /* socket default, may be overridden */
        struct bcm_msg_head head;
        int frame_sz;
        int rc;

        if (!bo->bound)
                return -ENOTCONN;

        /* the message has to hold at least a complete head */
        if (size < MHSIZ)
                return -EINVAL;

        rc = memcpy_from_msg((u8 *)&head, msg, MHSIZ);
        if (rc < 0)
                return rc;

        /* the payload must consist of complete frames only */
        frame_sz = CFSIZ(head.flags);
        if ((size - MHSIZ) % frame_sz != 0)
                return -EINVAL;

        /* no bound device as default => look at msg_name (sendto()) */
        if (ifindex == 0 && msg->msg_name != NULL) {
                DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);

                if (msg->msg_namelen < BCM_MIN_NAMELEN)
                        return -EINVAL;

                if (addr->can_family != AF_CAN)
                        return -EINVAL;

                ifindex = addr->can_ifindex;

                if (ifindex != 0) {
                        struct net_device *dev;
                        int is_can;

                        dev = dev_get_by_index(sock_net(sk), ifindex);
                        if (dev == NULL)
                                return -ENODEV;

                        /* only the device type is needed, drop the ref early */
                        is_can = (dev->type == ARPHRD_CAN);
                        dev_put(dev);

                        if (!is_can)
                                return -ENODEV;
                }
        }

        lock_sock(sk);

        switch (head.opcode) {

        case TX_SETUP:
                rc = bcm_tx_setup(&head, msg, ifindex, sk);
                break;

        case RX_SETUP:
                rc = bcm_rx_setup(&head, msg, ifindex, sk);
                break;

        case TX_DELETE:
                rc = -EINVAL;
                if (bcm_delete_tx_op(&bo->tx_ops, &head, ifindex))
                        rc = MHSIZ;
                break;

        case RX_DELETE:
                rc = -EINVAL;
                if (bcm_delete_rx_op(&bo->rx_ops, &head, ifindex))
                        rc = MHSIZ;
                break;

        case TX_READ:
                /* the head is recycled as the TX_STATUS reply */
                head.opcode  = TX_STATUS;
                rc = bcm_read_op(&bo->tx_ops, &head, ifindex);
                break;

        case RX_READ:
                /* the head is recycled as the RX_STATUS reply */
                head.opcode  = RX_STATUS;
                rc = bcm_read_op(&bo->rx_ops, &head, ifindex);
                break;

        case TX_SEND:
                /* exactly one CAN frame has to follow the msg head */
                if ((head.nframes == 1) && (size == frame_sz + MHSIZ))
                        rc = bcm_tx_send(msg, ifindex, sk, frame_sz);
                else
                        rc = -EINVAL;
                break;

        default:
                rc = -EINVAL;
                break;
        }

        release_sock(sk);

        return rc;
}

/*
 * notification handler for netdevice status changes
 */
static void bcm_notify(struct bcm_sock *bo, unsigned long msg,
                       struct net_device *dev)
{
        struct sock *sk = &bo->sk;
        struct bcm_op *op;
        int notify_enodev = 0;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                return;

        switch (msg) {

        case NETDEV_UNREGISTER:
                lock_sock(sk);

                /* remove device specific receive entries */
                list_for_each_entry(op, &bo->rx_ops, list)
                        if (op->rx_reg_dev == dev)
                                bcm_rx_unreg(dev, op);

                /* remove device reference, if this is our bound device */
                if (bo->bound && bo->ifindex == dev->ifindex) {
#if IS_ENABLED(CONFIG_PROC_FS)
                        if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) {
                                remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir);
                                bo->bcm_proc_read = NULL;
                        }
#endif
                        bo->bound   = 0;
                        bo->ifindex = 0;
                        notify_enodev = 1;
                }

                release_sock(sk);

                if (notify_enodev) {
                        sk->sk_err = ENODEV;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);
                }
                break;

        case NETDEV_DOWN:
                if (bo->bound && bo->ifindex == dev->ifindex) {
                        sk->sk_err = ENETDOWN;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);
                }
        }
}

/*
 * Netdevice notifier callback: forwards NETDEV_UNREGISTER and NETDEV_DOWN
 * events of CAN devices to every BCM socket linked on bcm_notifier_list by
 * calling bcm_notify() for each of them.
 *
 * Events of non-CAN devices and all other event types are ignored.
 * Always returns NOTIFY_DONE.
 */
static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
                        void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (dev->type != ARPHRD_CAN)
                return NOTIFY_DONE;
        if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
                return NOTIFY_DONE;
        if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */
                return NOTIFY_DONE;

        spin_lock(&bcm_notifier_lock);
        /*
         * The global bcm_busy_notifier is used as the list cursor itself, so
         * the socket currently being notified is visible to bcm_release(),
         * which waits while the cursor points at its own socket before it
         * unlinks that socket from the list.
         */
        list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) {
                /*
                 * The spinlock is released around bcm_notify() (which may
                 * take lock_sock()) and re-taken before advancing the cursor.
                 */
                spin_unlock(&bcm_notifier_lock);
                bcm_notify(bcm_busy_notifier, msg, dev);
                spin_lock(&bcm_notifier_lock);
        }
        /* iteration finished: no socket is being notified any more */
        bcm_busy_notifier = NULL;
        spin_unlock(&bcm_notifier_lock);
        return NOTIFY_DONE;
}

/*
 * bcm_init - initial per-socket settings, applied at socket creation time
 *
 * Puts the BCM socket into the unbound state with empty tx/rx op lists and
 * links it into bcm_notifier_list so it takes part in netdevice
 * notifications. Always returns 0.
 */
static int bcm_init(struct sock *sk)
{
        struct bcm_sock *bo = bcm_sk(sk);

        INIT_LIST_HEAD(&bo->tx_ops);
        INIT_LIST_HEAD(&bo->rx_ops);

        bo->bcm_proc_read    = NULL;
        bo->dropped_usr_msgs = 0;
        bo->ifindex          = 0;
        bo->bound            = 0;

        /* hook the socket into the notifier list */
        spin_lock(&bcm_notifier_lock);
        list_add_tail(&bo->notifier, &bcm_notifier_list);
        spin_unlock(&bcm_notifier_lock);

        return 0;
}

/*
 * standard socket functions
 */
/*
 * bcm_release - tear down a BCM socket
 *
 * Unlinks the socket from the notifier list, removes its procfs entry,
 * unregisters and frees all tx/rx ops, unbinds the socket and drops the
 * socket reference. Returns 0 (also when @sock has no sk attached).
 */
static int bcm_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net;
        struct bcm_sock *bo;
        struct bcm_op *op, *next;

        if (!sk)
                return 0;

        net = sock_net(sk);
        bo = bcm_sk(sk);

        /* remove bcm_ops, timer, rx_unregister(), etc. */

        spin_lock(&bcm_notifier_lock);
        /*
         * bcm_notifier() uses bcm_busy_notifier as its list cursor; wait
         * (sleeping with the spinlock dropped) until it no longer points at
         * this socket before unlinking the socket from the list.
         */
        while (bcm_busy_notifier == bo) {
                spin_unlock(&bcm_notifier_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&bcm_notifier_lock);
        }
        list_del(&bo->notifier);
        spin_unlock(&bcm_notifier_lock);

        lock_sock(sk);

#if IS_ENABLED(CONFIG_PROC_FS)
        /* remove procfs entry */
        if (net->can.bcmproc_dir && bo->bcm_proc_read)
                remove_proc_entry(bo->procname, net->can.bcmproc_dir);
#endif /* CONFIG_PROC_FS */

        list_for_each_entry_safe(op, next, &bo->tx_ops, list)
                bcm_remove_op(op);

        /* first pass over the rx ops: only unregister the receive filters */
        list_for_each_entry_safe(op, next, &bo->rx_ops, list) {
                /*
                 * Don't care if we're bound or not (due to netdev problems)
                 * can_rx_unregister() is always a safe thing to do here.
                 */
                if (op->ifindex) {
                        /*
                         * Only remove subscriptions that had not
                         * been removed due to NETDEV_UNREGISTER
                         * in bcm_notifier()
                         */
                        if (op->rx_reg_dev) {
                                struct net_device *dev;

                                dev = dev_get_by_index(net, op->ifindex);
                                if (dev) {
                                        bcm_rx_unreg(dev, op);
                                        dev_put(dev);
                                }
                        }
                } else
                        can_rx_unregister(net, NULL, op->can_id,
                                          REGMASK(op->can_id),
                                          bcm_rx_handler, op);

        }

        /*
         * Wait for an RCU grace period between unregistering the filters and
         * freeing the ops - presumably so that no bcm_rx_handler() call is
         * still using an op when it is removed (NOTE(review): confirm).
         */
        synchronize_rcu();

        /* second pass over the rx ops: now actually remove them */
        list_for_each_entry_safe(op, next, &bo->rx_ops, list)
                bcm_remove_op(op);

        /* remove device reference */
        if (bo->bound) {
                bo->bound   = 0;
                bo->ifindex = 0;
        }

        sock_orphan(sk);
        sock->sk = NULL;

        release_sock(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        sock_put(sk);

        return 0;
}

/*
 * bcm_connect - bind a BCM socket to a CAN interface (or to 'any' device)
 *
 * A non-zero can_ifindex in @uaddr must name an existing ARPHRD_CAN device,
 * otherwise -ENODEV is returned; can_ifindex 0 binds to 'any' CAN device.
 * With procfs support a per-socket entry named after the socket inode is
 * created. Returns 0 on success, -EINVAL if @len is too short, -EISCONN if
 * the socket is already bound, -ENOMEM if the procfs entry cannot be created.
 */
static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
                       int flags)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct bcm_sock *bo = bcm_sk(sk);
        struct sockaddr_can *can_addr = (struct sockaddr_can *)uaddr;
        int rc = 0;

        if (len < BCM_MIN_NAMELEN)
                return -EINVAL;

        lock_sock(sk);

        if (bo->bound) {
                rc = -EISCONN;
                goto unlock;
        }

        if (!can_addr->can_ifindex) {
                /* ifindex 0 means 'any' CAN device: nothing to look up */
                bo->ifindex = 0;
        } else {
                struct net_device *dev;

                /* bind a device to this socket */
                dev = dev_get_by_index(net, can_addr->can_ifindex);
                if (!dev) {
                        rc = -ENODEV;
                        goto unlock;
                }

                if (dev->type == ARPHRD_CAN)
                        bo->ifindex = dev->ifindex;
                else
                        rc = -ENODEV;

                /* only the index is kept, not a device reference */
                dev_put(dev);

                if (rc)
                        goto unlock;
        }

#if IS_ENABLED(CONFIG_PROC_FS)
        if (net->can.bcmproc_dir) {
                /* the socket inode number serves as unique file name */
                sprintf(bo->procname, "%lu", sock_i_ino(sk));
                bo->bcm_proc_read = proc_create_net_single(bo->procname, 0644,
                                                     net->can.bcmproc_dir,
                                                     bcm_proc_show, sk);
                if (!bo->bcm_proc_read) {
                        rc = -ENOMEM;
                        goto unlock;
                }
        }
#endif /* CONFIG_PROC_FS */

        bo->bound = 1;

unlock:
        release_sock(sk);

        return rc;
}

/*
 * bcm_recvmsg - hand one queued BCM message to userspace
 *
 * Dequeues a datagram, copies at most @size bytes of it into @msg and, when
 * msg_name is set, fills it with BCM_MIN_NAMELEN bytes from the skb control
 * buffer. Returns the number of bytes copied or a negative errno.
 */
static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                       int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int dequeue_err = 0;
        int ret;

        skb = skb_recv_datagram(sk, flags, &dequeue_err);
        if (skb == NULL)
                return dequeue_err;

        /* never copy more than the datagram actually holds */
        if (size > skb->len)
                size = skb->len;

        ret = memcpy_to_msg(msg, skb->data, size);
        if (ret < 0)
                goto free_skb;

        sock_recv_cmsgs(msg, sk, skb);

        if (msg->msg_name != NULL) {
                __sockaddr_check_size(BCM_MIN_NAMELEN);
                msg->msg_namelen = BCM_MIN_NAMELEN;
                memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
        }

        /* assign the flags that have been recorded in bcm_send_to_user() */
        msg->msg_flags |= *(bcm_flags(skb));

        ret = size;

free_skb:
        skb_free_datagram(sk, skb);

        return ret;
}

/*
 * ioctl handler of the BCM socket: implements no command itself and
 * unconditionally returns -ENOIOCTLCMD for every @cmd.
 */
static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
                                unsigned long arg)
{
        /* no ioctls for socket layer -> hand it down to NIC layer */
        return -ENOIOCTLCMD;
}

/*
 * Socket operations of a BCM socket. Only release, connect, ioctl, sendmsg
 * and recvmsg are BCM specific; everything else uses the generic
 * sock_no_*() / datagram helpers.
 */
static const struct proto_ops bcm_ops = {
        .family        = PF_CAN,
        .release       = bcm_release,
        .bind          = sock_no_bind,
        .connect       = bcm_connect,
        .socketpair    = sock_no_socketpair,
        .accept        = sock_no_accept,
        .getname       = sock_no_getname,
        .poll          = datagram_poll,
        .ioctl         = bcm_sock_no_ioctlcmd,
        .gettstamp     = sock_gettstamp,
        .listen        = sock_no_listen,
        .shutdown      = sock_no_shutdown,
        .sendmsg       = bcm_sendmsg,
        .recvmsg       = bcm_recvmsg,
        .mmap          = sock_no_mmap,
};

/*
 * Protocol descriptor: sockets are sized for struct bcm_sock and are
 * initialized by bcm_init().
 */
static struct proto bcm_proto __read_mostly = {
        .name       = "CAN_BCM",
        .owner      = THIS_MODULE,
        .obj_size   = sizeof(struct bcm_sock),
        .init       = bcm_init,
};

/*
 * CAN protocol registration record: ties the CAN_BCM protocol number
 * (SOCK_DGRAM) to bcm_ops and bcm_proto. Registered in bcm_module_init().
 */
static const struct can_proto bcm_can_proto = {
        .type       = SOCK_DGRAM,
        .protocol   = CAN_BCM,
        .ops        = &bcm_ops,
        .prot       = &bcm_proto,
};

/*
 * Per-namespace init: with procfs support, creates the "can-bcm" directory
 * below the namespace's proc_net and stores it in net->can.bcmproc_dir.
 * The result of the mkdir is not checked here; always returns 0.
 */
static int canbcm_pernet_init(struct net *net)
{
#if IS_ENABLED(CONFIG_PROC_FS)
        /* create /proc/net/can-bcm directory */
        net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net);
#endif /* CONFIG_PROC_FS */

        return 0;
}

/*
 * Per-namespace exit: with procfs support, removes the "can-bcm" directory
 * again if canbcm_pernet_init() managed to create it.
 */
static void canbcm_pernet_exit(struct net *net)
{
#if IS_ENABLED(CONFIG_PROC_FS)
        /* remove /proc/net/can-bcm directory */
        if (net->can.bcmproc_dir)
                remove_proc_entry("can-bcm", net->proc_net);
#endif /* CONFIG_PROC_FS */
}

/* per-network-namespace setup/teardown hooks of the BCM protocol */
static struct pernet_operations canbcm_pernet_ops __read_mostly = {
        .init = canbcm_pernet_init,
        .exit = canbcm_pernet_exit,
};

/* netdevice notifier block; events are dispatched by bcm_notifier() */
static struct notifier_block canbcm_notifier = {
        .notifier_call = bcm_notifier
};

/*
 * Module init: registers, in this order, the per-namespace operations, the
 * netdevice notifier and the CAN_BCM protocol. If a step fails, the steps
 * already done are undone in reverse order and the error is returned.
 */
static int __init bcm_module_init(void)
{
        int rc;

        pr_info("can: broadcast manager protocol\n");

        rc = register_pernet_subsys(&canbcm_pernet_ops);
        if (rc)
                return rc;

        rc = register_netdevice_notifier(&canbcm_notifier);
        if (rc)
                goto undo_pernet;

        rc = can_proto_register(&bcm_can_proto);
        if (rc < 0) {
                printk(KERN_ERR "can: registration of bcm protocol failed\n");
                goto undo_notifier;
        }

        return 0;

undo_notifier:
        unregister_netdevice_notifier(&canbcm_notifier);
undo_pernet:
        unregister_pernet_subsys(&canbcm_pernet_ops);
        return rc;
}

/*
 * Module exit: unregisters everything bcm_module_init() registered, in the
 * reverse order of registration.
 */
static void __exit bcm_module_exit(void)
{
        can_proto_unregister(&bcm_can_proto);
        unregister_netdevice_notifier(&canbcm_notifier);
        unregister_pernet_subsys(&canbcm_pernet_ops);
}

module_init(bcm_module_init);
module_exit(bcm_module_exit);











 1493 





 1494 




  220 






  220 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
// SPDX-License-Identifier: GPL-2.0
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/mmdebug.h>
#include <linux/mm.h>

#include <asm/memory.h>

/*
 * Checked variant of the virtual-to-physical translation: emits a WARN when
 * the tag-reset address @x is not accepted by __is_lm_address(), then
 * returns the result of __virt_to_phys_nodebug(x) in either case.
 */
phys_addr_t __virt_to_phys(unsigned long x)
{
        /* the address is printed twice: raw (%p) and symbolized (%pS) */
        WARN(!__is_lm_address(__tag_reset(x)),
             "virt_to_phys used for non-linear address: %p (%pS)\n",
              (void *)x,
              (void *)x);

        return __virt_to_phys_nodebug(x);
}
EXPORT_SYMBOL(__virt_to_phys);

/*
 * Checked variant of the symbol-address-to-physical translation: asserts
 * (VIRTUAL_BUG_ON) that @x lies within [KERNEL_START, KERNEL_END] and
 * returns __pa_symbol_nodebug(x).
 */
phys_addr_t __phys_addr_symbol(unsigned long x)
{
        /*
         * This is bounds checking against the kernel image only.
         * __pa_symbol should only be used on kernel symbol addresses.
         */
        VIRTUAL_BUG_ON(x < (unsigned long) KERNEL_START ||
                       x > (unsigned long) KERNEL_END);
        return __pa_symbol_nodebug(x);
}
EXPORT_SYMBOL(__phys_addr_symbol);




























   22 




   21 

   22 


















    3 







    3 

    3 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Lock-less NULL terminated single linked list
 *
 * The basic atomic operation of this list is cmpxchg on long.  On
 * architectures that don't have NMI-safe cmpxchg implementation, the
 * list can NOT be used in NMI handlers.  So code that uses the list in
 * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG.
 *
 * Copyright 2010,2011 Intel Corp.
 *   Author: Huang Ying <ying.huang@intel.com>
 */
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/llist.h>


/**
 * llist_add_batch - add several linked entries in batch
 * @new_first:        first entry in batch to be added
 * @new_last:        last entry in batch to be added
 * @head:        the head for your lock-less list
 *
 * Return whether list is empty before adding.
 */
bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last,
                     struct llist_head *head)
{
        struct llist_node *first = READ_ONCE(head->first);

        /*
         * Link the tail of the batch to the head value we last observed and
         * try to publish the batch. The link is re-written on every retry;
         * NOTE(review): this relies on try_cmpxchg() refreshing @first with
         * the current head value when it fails - confirm against its docs.
         */
        do {
                new_last->next = first;
        } while (!try_cmpxchg(&head->first, &first, new_first));

        /* @first is the head that was replaced: NULL means list was empty */
        return !first;
}
EXPORT_SYMBOL_GPL(llist_add_batch);

/**
 * llist_del_first - delete the first entry of lock-less list
 * @head:        the head for your lock-less list
 *
 * If list is empty, return NULL, otherwise, return the first entry
 * deleted, this is the newest added one.
 *
 * Only one llist_del_first user can be used simultaneously with
 * multiple llist_add users without lock.  Because otherwise
 * llist_del_first, llist_add, llist_add (or llist_del_all, llist_add,
 * llist_add) sequence in another user may change @head->first->next,
 * but keep @head->first.  If multiple consumers are needed, please
 * use llist_del_all or use lock between consumers.
 */
struct llist_node *llist_del_first(struct llist_head *head)
{
        struct llist_node *entry, *next;

        entry = smp_load_acquire(&head->first);
        /*
         * Retry until head->first was swung from @entry to its successor.
         * @next is re-read on every iteration; NOTE(review): this relies on
         * try_cmpxchg() updating @entry to the current head on failure.
         */
        do {
                /* list is (or has become) empty */
                if (entry == NULL)
                        return NULL;
                next = READ_ONCE(entry->next);
        } while (!try_cmpxchg(&head->first, &entry, next));

        return entry;
}
EXPORT_SYMBOL_GPL(llist_del_first);

/**
 * llist_del_first_this - delete given entry of lock-less list if it is first
 * @head:        the head for your lock-less list
 * @this:        a list entry.
 *
 * If head of the list is given entry, delete and return %true else
 * return %false.
 *
 * Multiple callers can safely call this concurrently with multiple
 * llist_add() callers, providing all the callers offer a different @this.
 */
bool llist_del_first_this(struct llist_head *head,
                          struct llist_node *this)
{
        struct llist_node *entry, *next;

        /* acquire ensures ordering wrt try_cmpxchg() in llist_del_first() */
        entry = smp_load_acquire(&head->first);
        do {
                /* give up as soon as @this is not (or no longer) the head */
                if (entry != this)
                        return false;
                next = READ_ONCE(entry->next);
        } while (!try_cmpxchg(&head->first, &entry, next));

        return true;
}
EXPORT_SYMBOL_GPL(llist_del_first_this);

/**
 * llist_reverse_order - reverse order of a llist chain
 * @head:        first item of the list to be reversed
 *
 * Walks the NULL terminated chain starting at @head once and relinks every
 * entry to point at its former predecessor. Returns the new first entry,
 * i.e. the former last one, or NULL if @head is NULL.
 */
struct llist_node *llist_reverse_order(struct llist_node *head)
{
        struct llist_node *reversed = NULL;
        struct llist_node *node, *following;

        for (node = head; node; node = following) {
                /* remember the successor before the link is overwritten */
                following = node->next;
                node->next = reversed;
                reversed = node;
        }

        return reversed;
}
EXPORT_SYMBOL_GPL(llist_reverse_order);






























































    3 






    3 


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        "TEE" target extension for Xtables
 *        Copyright © Sebastian Claßen, 2007
 *        Jan Engelhardt, 2007-2010
 *
 *        based on ipt_ROUTE.c from Cédric de Launois
 *        <delaunois@info.ucl.be>
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/route.h>
#include <linux/netfilter/x_tables.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/route.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#include <net/netfilter/ipv6/nf_dup_ipv6.h>
#include <linux/netfilter/xt_TEE.h>

/*
 * Per-rule private state, allocated in tee_tg_check() only for rules that
 * name an output interface.
 */
struct xt_tee_priv {
        /* link in tee_net.priv_list */
        struct list_head        list;
        /* back pointer to the rule's target info (source of the oif name) */
        struct xt_tee_tginfo        *tginfo;
        /* ifindex of the named interface, or -1 while no such device exists */
        int                        oif;
};

/* net_generic() slot id of struct tee_net, assigned through tee_net_ops.id */
static unsigned int tee_net_id __read_mostly;
/* all-zero address used by tee_tg_check() to reject an unset gateway */
static const union nf_inet_addr tee_zero_address;

/* per-network-namespace state of the TEE target */
struct tee_net {
        /* list of struct xt_tee_priv of all rules with an output interface */
        struct list_head priv_list;
        /* lock protects the priv_list */
        struct mutex lock;
};

/*
 * IPv4 target handler: passes the packet to nf_dup_ipv4() together with the
 * configured gateway and the rule's output interface index (0 when the rule
 * has no private state). Always returns XT_CONTINUE.
 */
static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
        const struct xt_tee_tginfo *info = par->targinfo;
        int oif = 0;

        if (info->priv)
                oif = info->priv->oif;

        nf_dup_ipv4(xt_net(par), skb, xt_hooknum(par), &info->gw.in, oif);

        return XT_CONTINUE;
}

#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
/*
 * IPv6 target handler: passes the packet to nf_dup_ipv6() together with the
 * configured gateway and the rule's output interface index (0 when the rule
 * has no private state). Always returns XT_CONTINUE.
 */
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
        const struct xt_tee_tginfo *info = par->targinfo;
        int oif = 0;

        if (info->priv)
                oif = info->priv->oif;

        nf_dup_ipv6(xt_net(par), skb, xt_hooknum(par), &info->gw.in6, oif);

        return XT_CONTINUE;
}
#endif

/*
 * Netdevice notifier: keeps the cached output interface index of every rule
 * in the device's namespace in sync with device registration, removal and
 * renaming. An index of -1 marks "no matching device". Always returns
 * NOTIFY_DONE.
 */
static int tee_netdev_event(struct notifier_block *this, unsigned long event,
                            void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct tee_net *tn = net_generic(dev_net(dev), tee_net_id);
        struct xt_tee_priv *entry;

        mutex_lock(&tn->lock);
        list_for_each_entry(entry, &tn->priv_list, list) {
                if (event == NETDEV_REGISTER) {
                        /* a device with the configured name appeared */
                        if (strcmp(dev->name, entry->tginfo->oif) == 0)
                                entry->oif = dev->ifindex;
                } else if (event == NETDEV_UNREGISTER) {
                        /* the device we resolved to is going away */
                        if (entry->oif == dev->ifindex)
                                entry->oif = -1;
                } else if (event == NETDEV_CHANGENAME) {
                        /* renamed to our name: adopt; renamed away: drop */
                        if (strcmp(dev->name, entry->tginfo->oif) == 0)
                                entry->oif = dev->ifindex;
                        else if (entry->oif == dev->ifindex)
                                entry->oif = -1;
                }
        }
        mutex_unlock(&tn->lock);

        return NOTIFY_DONE;
}

/*
 * Rule validation/setup. Rejects an all-zero gateway and an unterminated
 * interface name with -EINVAL. For rules naming an output interface, a
 * struct xt_tee_priv is allocated (-ENOMEM on failure), resolved to the
 * current ifindex of that interface (-1 if it does not exist yet) and put
 * on the per-namespace list so tee_netdev_event() can keep it up to date.
 * On success the xt_tee_enabled static key is incremented and 0 returned.
 */
static int tee_tg_check(const struct xt_tgchk_param *par)
{
        struct tee_net *tn = net_generic(par->net, tee_net_id);
        struct xt_tee_tginfo *info = par->targinfo;
        struct xt_tee_priv *priv;

        /* 0.0.0.0 and :: not allowed */
        if (!memcmp(&info->gw, &tee_zero_address,
                    sizeof(tee_zero_address)))
                return -EINVAL;

        if (info->oif[0] == '\0') {
                /* no output interface requested: no private state needed */
                info->priv = NULL;
        } else {
                struct net_device *dev;

                /* the interface name has to be NUL terminated */
                if (info->oif[sizeof(info->oif) - 1] != '\0')
                        return -EINVAL;

                priv = kzalloc(sizeof(*priv), GFP_KERNEL);
                if (!priv)
                        return -ENOMEM;

                priv->tginfo  = info;
                priv->oif     = -1;
                info->priv    = priv;

                dev = dev_get_by_name(par->net, info->oif);
                if (dev != NULL) {
                        priv->oif = dev->ifindex;
                        dev_put(dev);
                }

                mutex_lock(&tn->lock);
                list_add(&priv->list, &tn->priv_list);
                mutex_unlock(&tn->lock);
        }

        static_key_slow_inc(&xt_tee_enabled);
        return 0;
}

static void tee_tg_destroy(const struct xt_tgdtor_param *par)
{
        struct tee_net *tn = net_generic(par->net, tee_net_id);
        struct xt_tee_tginfo *info = par->targinfo;

        if (info->priv) {
                mutex_lock(&tn->lock);
                list_del(&info->priv->list);
                mutex_unlock(&tn->lock);
                kfree(info->priv);
        }
        static_key_slow_dec(&xt_tee_enabled);
}

/*
 * Target registrations for "TEE" revision 1: one entry for IPv4 and, when
 * ip6tables support is enabled, one for IPv6. Both share checkentry and
 * destroy; usersize ends at the priv member of struct xt_tee_tginfo.
 */
static struct xt_target tee_tg_reg[] __read_mostly = {
        {
                .name       = "TEE",
                .revision   = 1,
                .family     = NFPROTO_IPV4,
                .target     = tee_tg4,
                .targetsize = sizeof(struct xt_tee_tginfo),
                .usersize   = offsetof(struct xt_tee_tginfo, priv),
                .checkentry = tee_tg_check,
                .destroy    = tee_tg_destroy,
                .me         = THIS_MODULE,
        },
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
        {
                .name       = "TEE",
                .revision   = 1,
                .family     = NFPROTO_IPV6,
                .target     = tee_tg6,
                .targetsize = sizeof(struct xt_tee_tginfo),
                .usersize   = offsetof(struct xt_tee_tginfo, priv),
                .checkentry = tee_tg_check,
                .destroy    = tee_tg_destroy,
                .me         = THIS_MODULE,
        },
#endif
};

/*
 * Per-namespace init: sets up the empty rule list and its mutex in the
 * namespace's struct tee_net. Always returns 0.
 */
static int __net_init tee_net_init(struct net *net)
{
        struct tee_net *tn = net_generic(net, tee_net_id);

        INIT_LIST_HEAD(&tn->priv_list);
        mutex_init(&tn->lock);
        return 0;
}

/*
 * Per-namespace operations: request sizeof(struct tee_net) bytes of
 * per-namespace storage, addressed through tee_net_id.
 */
static struct pernet_operations tee_net_ops = {
        .init = tee_net_init,
        .id   = &tee_net_id,
        .size = sizeof(struct tee_net),
};

/* netdevice notifier block; events are handled by tee_netdev_event() */
static struct notifier_block tee_netdev_notifier = {
        .notifier_call = tee_netdev_event,
};

/*
 * Module init: registers, in this order, the per-namespace operations, the
 * xtables targets and the netdevice notifier. If a step fails, the steps
 * already done are undone in reverse order and the error is returned.
 */
static int __init tee_tg_init(void)
{
        int rc;

        rc = register_pernet_subsys(&tee_net_ops);
        if (rc < 0)
                return rc;

        rc = xt_register_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
        if (rc < 0)
                goto undo_pernet;

        rc = register_netdevice_notifier(&tee_netdev_notifier);
        if (rc < 0)
                goto undo_targets;

        return 0;

undo_targets:
        xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
undo_pernet:
        unregister_pernet_subsys(&tee_net_ops);
        return rc;
}

/*
 * Module exit: unregisters everything tee_tg_init() registered, in the
 * reverse order of registration.
 */
static void __exit tee_tg_exit(void)
{
        unregister_netdevice_notifier(&tee_netdev_notifier);
        xt_unregister_targets(tee_tg_reg, ARRAY_SIZE(tee_tg_reg));
        unregister_pernet_subsys(&tee_net_ops);
}

module_init(tee_tg_init);
module_exit(tee_tg_exit);
MODULE_AUTHOR("Sebastian Claßen <sebastian.classen@freenet.ag>");
MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
MODULE_DESCRIPTION("Xtables: Reroute packet copy");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_TEE");
MODULE_ALIAS("ip6t_TEE");























  935 



  936 



























  960 


  960 

  960 




  960 
  935 










  948 



  947 


  946 
  945 















  947 


   89 
















  925 




  928 















  926 












  926 



  927 


   76 

















  961 


  962 


























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
// SPDX-License-Identifier: GPL-2.0
/*
 * Lockless hierarchical page accounting & limiting
 *
 * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
 */

#include <linux/page_counter.h>
#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/sched.h>
#include <linux/bug.h>
#include <asm/page.h>

/* Returns the counter's protection_support flag. */
static bool track_protection(struct page_counter *c)
{
        return c->protection_support;
}

static void propagate_protected_usage(struct page_counter *c,
                                      unsigned long usage)
{
        unsigned long protected, old_protected;
        long delta;

        if (!c->parent)
                return;

        protected = min(usage, READ_ONCE(c->min));
        old_protected = atomic_long_read(&c->min_usage);
        if (protected != old_protected) {
                old_protected = atomic_long_xchg(&c->min_usage, protected);
                delta = protected - old_protected;
                if (delta)
                        atomic_long_add(delta, &c->parent->children_min_usage);
        }

        protected = min(usage, READ_ONCE(c->low));
        old_protected = atomic_long_read(&c->low_usage);
        if (protected != old_protected) {
                old_protected = atomic_long_xchg(&c->low_usage, protected);
                delta = protected - old_protected;
                if (delta)
                        atomic_long_add(delta, &c->parent->children_low_usage);
        }
}

/**
 * page_counter_cancel - take pages out of the local counter
 * @counter: counter
 * @nr_pages: number of pages to cancel
 *
 * Only @counter itself is adjusted. If the subtraction drives the usage
 * below zero, a one-time warning is emitted and the usage is reset to 0.
 * When the counter tracks protection, the protected usage is updated with
 * the resulting value.
 */
void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
{
        long remaining = atomic_long_sub_return(nr_pages, &counter->usage);

        /* More uncharges than charges? */
        if (WARN_ONCE(remaining < 0,
                      "page_counter underflow: %ld nr_pages=%lu\n",
                      remaining, nr_pages)) {
                remaining = 0;
                atomic_long_set(&counter->usage, remaining);
        }

        if (!track_protection(counter))
                return;

        propagate_protected_usage(counter, remaining);
}

/**
 * page_counter_charge - hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 *
 * Adds @nr_pages to @counter and to each of its ancestors, updating the
 * usage watermarks along the way.
 *
 * NOTE: This does not consider any configured counter limits.
 */
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
        /* Sampled once from @counter and applied to every level walked. */
        bool protection = track_protection(counter);
        struct page_counter *pc;

        for (pc = counter; pc; pc = pc->parent) {
                long usage = atomic_long_add_return(nr_pages, &pc->usage);

                if (protection)
                        propagate_protected_usage(pc, usage);

                /*
                 * The watermark update is racy; some inaccuracy in the
                 * watermark is acceptable.
                 *
                 * Two watermarks are kept: one globally visible peak and
                 * one that can be reset at a smaller scope.
                 *
                 * Both are reset when the global reset occurs, which
                 * guarantees watermark >= local_watermark.  The global
                 * watermark therefore only needs checking when the local
                 * one was exceeded.
                 *
                 * On systems with branch predictors, the inner condition
                 * should be almost free.
                 */
                if (usage > READ_ONCE(pc->local_watermark)) {
                        WRITE_ONCE(pc->local_watermark, usage);
                        if (usage > READ_ONCE(pc->watermark))
                                WRITE_ONCE(pc->watermark, usage);
                }
        }
}

/**
 * page_counter_try_charge - try to hierarchically charge pages
 * @counter: counter
 * @nr_pages: number of pages to charge
 * @fail: set to the first counter that hit its limit, if any
 *
 * Returns %true on success, or %false and @fail if the counter or one
 * of its ancestors has hit its configured limit.  On failure, the pages
 * already charged to the levels below @fail are cancelled again, so no
 * charge remains.  @fail is only written on failure.
 */
bool page_counter_try_charge(struct page_counter *counter,
                             unsigned long nr_pages,
                             struct page_counter **fail)
{
        struct page_counter *c;
        /* Both flags are sampled once from @counter and used for every level. */
        bool protection = track_protection(counter);
        bool track_failcnt = counter->track_failcnt;

        for (c = counter; c; c = c->parent) {
                long new;
                /*
                 * Charge speculatively to avoid an expensive CAS.  If
                 * a bigger charge fails, it might falsely lock out a
                 * racing smaller charge and send it into reclaim
                 * early, but the error is limited to the difference
                 * between the two sizes, which is less than 2M/4M in
                 * case of a THP locking out a regular page charge.
                 *
                 * The atomic_long_add_return() implies a full memory
                 * barrier between incrementing the count and reading
                 * the limit.  When racing with page_counter_set_max(),
                 * we either see the new limit or the setter sees the
                 * counter has changed and retries.
                 */
                new = atomic_long_add_return(nr_pages, &c->usage);
                if (new > c->max) {
                        /* Take back the speculative charge on this level. */
                        atomic_long_sub(nr_pages, &c->usage);
                        /*
                         * This is racy, but we can live with some
                         * inaccuracy in the failcnt which is only used
                         * to report stats.
                         */
                        if (track_failcnt)
                                data_race(c->failcnt++);
                        *fail = c;
                        goto failed;
                }
                if (protection)
                        propagate_protected_usage(c, new);

                /* see comment on page_counter_charge */
                if (new > READ_ONCE(c->local_watermark)) {
                        WRITE_ONCE(c->local_watermark, new);
                        if (new > READ_ONCE(c->watermark))
                                WRITE_ONCE(c->watermark, new);
                }
        }
        return true;

failed:
        /*
         * Unwind the levels below *fail that were charged successfully.
         * *fail itself was already undone before jumping here.
         */
        for (c = counter; c != *fail; c = c->parent)
                page_counter_cancel(c, nr_pages);

        return false;
}

/**
 * page_counter_uncharge - hierarchically uncharge pages
 * @counter: counter
 * @nr_pages: number of pages to uncharge
 *
 * Cancels @nr_pages on @counter and on each of its ancestors in turn.
 */
void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *pc = counter;

        while (pc) {
                page_counter_cancel(pc, nr_pages);
                pc = pc->parent;
        }
}

/**
 * page_counter_set_max - set the maximum number of pages allowed
 * @counter: counter
 * @nr_pages: limit to set
 *
 * Returns 0 on success, -EBUSY if the current number of pages on the
 * counter already exceeds the specified limit.
 *
 * The function loops until the new limit is installed without the usage
 * having grown in the meantime, or until the usage is found to exceed
 * @nr_pages.
 *
 * The caller must serialize invocations on the same counter.
 */
int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
{
        for (;;) {
                unsigned long old;
                long usage;

                /*
                 * Update the limit while making sure that it's not
                 * below the concurrently-changing counter value.
                 *
                 * The xchg implies two full memory barriers before
                 * and after, so the read-swap-read is ordered and
                 * ensures coherency with page_counter_try_charge():
                 * that function modifies the count before checking
                 * the limit, so if it sees the old limit, we see the
                 * modified counter and retry.
                 */
                usage = page_counter_read(counter);

                /* Usage is already above the requested limit. */
                if (usage > nr_pages)
                        return -EBUSY;

                old = xchg(&counter->max, nr_pages);

                /*
                 * Done if usage has not grown since it was sampled, or if
                 * the limit was not lowered (nr_pages >= old).
                 */
                if (page_counter_read(counter) <= usage || nr_pages >= old)
                        return 0;

                /* Usage grew while lowering: restore the old limit and retry. */
                counter->max = old;
                cond_resched();
        }
}

/**
 * page_counter_set_min - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new min setting and then re-propagates the protected usage
 * of @counter and each of its ancestors.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *pc = counter;

        WRITE_ONCE(counter->min, nr_pages);

        while (pc) {
                propagate_protected_usage(pc, atomic_long_read(&pc->usage));
                pc = pc->parent;
        }
}

/**
 * page_counter_set_low - set the amount of protected memory
 * @counter: counter
 * @nr_pages: value to set
 *
 * Stores the new low setting and then re-propagates the protected usage
 * of @counter and each of its ancestors.
 *
 * The caller must serialize invocations on the same counter.
 */
void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
{
        struct page_counter *pc = counter;

        WRITE_ONCE(counter->low, nr_pages);

        while (pc) {
                propagate_protected_usage(pc, atomic_long_read(&pc->usage));
                pc = pc->parent;
        }
}

/**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
 * @max: string meaning maximum possible value
 * @nr_pages: returns the result in number of pages
 *
 * A @buf equal to @max yields %PAGE_COUNTER_MAX.  Otherwise @buf is parsed
 * with memparse() as a byte count and converted to whole pages, capped at
 * %PAGE_COUNTER_MAX.
 *
 * Returns -EINVAL if anything is left over after the parsed number, or 0
 * and @nr_pages on success.
 */
int page_counter_memparse(const char *buf, const char *max,
                          unsigned long *nr_pages)
{
        u64 nr_bytes;
        char *tail;

        if (strcmp(buf, max) == 0) {
                *nr_pages = PAGE_COUNTER_MAX;
                return 0;
        }

        nr_bytes = memparse(buf, &tail);
        /* Trailing characters after the number make the input invalid. */
        if (*tail != '\0')
                return -EINVAL;

        *nr_pages = min(nr_bytes / PAGE_SIZE, (u64)PAGE_COUNTER_MAX);
        return 0;
}


#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
/*
 * This function calculates an individual page counter's effective
 * protection which is derived from its own memory.min/low, its
 * parent's and siblings' settings, as well as the actual memory
 * distribution in the tree.
 *
 * The following rules apply to the effective protection values:
 *
 * 1. At the first level of reclaim, effective protection is equal to
 *    the declared protection in memory.min and memory.low.
 *
 * 2. To enable safe delegation of the protection configuration, at
 *    subsequent levels the effective protection is capped to the
 *    parent's effective protection.
 *
 * 3. To make complex and dynamic subtrees easier to configure, the
 *    user is allowed to overcommit the declared protection at a given
 *    level. If that is the case, the parent's effective protection is
 *    distributed to the children in proportion to how much protection
 *    they have declared and how much of it they are utilizing.
 *
 *    This makes distribution proportional, but also work-conserving:
 *    if one counter claims much more protection than it uses memory,
 *    the unused remainder is available to its siblings.
 *
 * 4. Conversely, when the declared protection is undercommitted at a
 *    given level, the distribution of the larger parental protection
 *    budget is NOT proportional. A counter's protection from a sibling
 *    is capped to its own memory.min/low setting.
 *
 * 5. However, to allow protecting recursive subtrees from each other
 *    without having to declare each individual counter's fixed share
 *    of the ancestor's claim to protection, any unutilized -
 *    "floating" - protection from up the tree is distributed in
 *    proportion to each counter's *usage*. This makes the protection
 *    neutral wrt sibling cgroups and lets them compete freely over
 *    the shared parental protection budget, but it protects the
 *    subtree as a whole from neighboring subtrees.
 *
 * Note that 4. and 5. are not in conflict: 4. is about protecting
 * against immediate siblings whereas 5. is about protecting against
 * neighboring subtrees.
 */
static unsigned long effective_protection(unsigned long usage,
                                          unsigned long parent_usage,
                                          unsigned long setting,
                                          unsigned long parent_effective,
                                          unsigned long siblings_protected,
                                          bool recursive_protection)
{
        /* Protection this counter both claims and actually uses. */
        unsigned long protected = min(usage, setting);
        unsigned long floating, weight, pool;

        /*
         * Overcommit: the children at this level together claim and use
         * more protection than the parent affords them.  Hand out shares
         * in proportion to utilization.
         *
         * Utilization, not the static claim, is the weight so that the
         * scheme stays work-conserving: protection that is claimed but
         * unused remains available to siblings that would otherwise get
         * less than they claimed.
         *
         * NOTE(review): the product is computed in unsigned long and is
         * not checked for wrap-around - confirm the operand ranges.
         */
        if (siblings_protected > parent_effective)
                return protected * parent_effective / siblings_protected;

        /*
         * The utilized protection of all children fits within what the
         * parent affords, so whatever this child claims and utilizes is
         * effectively protected.
         *
         * Usage beyond this value is unprotected, and reclaim applies
         * pressure in proportion to it.
         *
         * With unutilized protection the cgroup is fully shielded from
         * reclaim, yet the value returned is smaller than what the group
         * could enjoy in theory.  That is fine: because of the overcommit
         * distribution above, effective protection always depends on how
         * memory is actually consumed among the siblings anyway.
         */
        if (!recursive_protection)
                return protected;

        /*
         * When the children do not claim (all of) the protection the
         * parent affords them, the remainder is distributed in proportion
         * to each cgroup's unprotected memory.  Cgroups that are not
         * explicitly prioritized against each other then compete freely
         * over the allowance while being collectively protected from
         * neighboring trees.
         *
         * Unprotected memory is the weight so that bytes covered by an
         * explicit claim are not protected twice.
         *
         * usage and parent_usage are each checked against their protected
         * counterpart.  One should imply the other, but they are not read
         * atomically - the checks keep the division below sane.
         */
        if (parent_effective <= siblings_protected ||
            parent_usage <= siblings_protected ||
            usage <= protected)
                return protected;

        floating = parent_effective - siblings_protected;
        weight = usage - protected;
        pool = parent_usage - siblings_protected;

        /* NOTE(review): floating * weight is likewise unchecked for overflow. */
        return protected + floating * weight / pool;
}


/**
 * page_counter_calculate_protection - compute a counter's effective min/low
 * @root: the top ancestor of the sub-tree being checked
 * @counter: the page_counter to update
 * @recursive_protection: Whether to use memory_recursiveprot behavior.
 *
 * Calculates elow/emin thresholds for given page_counter.  Nothing is
 * written when @counter is @root or when @counter has no usage.
 *
 * WARNING: This function is not stateless! It can only be used as part
 *          of a top-down tree iteration, not for isolated queries.
 */
void page_counter_calculate_protection(struct page_counter *root,
                                       struct page_counter *counter,
                                       bool recursive_protection)
{
        unsigned long usage, parent_usage;
        struct page_counter *parent = counter->parent;

        /*
         * Effective values of the reclaim targets are ignored so they
         * can be stale. Have a look at mem_cgroup_protection for more
         * details.
         * TODO: calculation should be more robust so that we do not need
         * that special casing.
         */
        if (root == counter)
                return;

        usage = page_counter_read(counter);
        if (!usage)
                return;

        /*
         * Directly below the root the declared settings apply unchanged.
         * Store them with WRITE_ONCE() like the computed values below:
         * emin/elow are read with READ_ONCE() when a child's protection
         * is derived from its parent's.
         */
        if (parent == root) {
                WRITE_ONCE(counter->emin, READ_ONCE(counter->min));
                WRITE_ONCE(counter->elow, READ_ONCE(counter->low));
                return;
        }

        parent_usage = page_counter_read(parent);

        WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
                        READ_ONCE(counter->min),
                        READ_ONCE(parent->emin),
                        atomic_long_read(&parent->children_min_usage),
                        recursive_protection));

        WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
                        READ_ONCE(counter->low),
                        READ_ONCE(parent->elow),
                        atomic_long_read(&parent->children_low_usage),
                        recursive_protection));
}
#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */

























































































































  780 




















  969 







































































































































































  116 




















  961 






































































































































































  262 































































































































































  209 



















  145 



















 1396 









































































































































































































































































  251 




















 1259 



















































































  681 




















  585 




















  476 





























































































































  275 































































































































    5 




























































































   95 















































  142 























   22 




















  186 




























































 1306 








































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-long.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_LONG_H
#define _LINUX_ATOMIC_LONG_H

#include <linux/compiler.h>
#include <asm/types.h>

/*
 * atomic_long_t is atomic64_t when CONFIG_64BIT is defined and atomic_t
 * otherwise; the initializer and cond_read macros follow the same choice.
 */
#ifdef CONFIG_64BIT
typedef atomic64_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC64_INIT(i)
#define atomic_long_cond_read_acquire        atomic64_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic64_cond_read_relaxed
#else
typedef atomic_t atomic_long_t;
#define ATOMIC_LONG_INIT(i)                ATOMIC_INIT(i)
#define atomic_long_cond_read_acquire        atomic_cond_read_acquire
#define atomic_long_cond_read_relaxed        atomic_cond_read_relaxed
#endif

/**
 * raw_atomic_long_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Forwards to raw_atomic64_read() when CONFIG_64BIT is defined and to
 * raw_atomic_read() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read(const atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_read(v);
#else
        return raw_atomic_read(v);
#endif
}

/**
 * raw_atomic_long_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Forwards to raw_atomic64_read_acquire() when CONFIG_64BIT is defined and
 * to raw_atomic_read_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
raw_atomic_long_read_acquire(const atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_read_acquire(v);
#else
        return raw_atomic_read_acquire(v);
#endif
}

/**
 * raw_atomic_long_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * Forwards to raw_atomic64_set() when CONFIG_64BIT is defined and to
 * raw_atomic_set() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set(atomic_long_t *v, long i)
{
#ifdef CONFIG_64BIT
        raw_atomic64_set(v, i);
#else
        raw_atomic_set(v, i);
#endif
}

/**
 * raw_atomic_long_set_release() - atomic set with release ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Forwards to raw_atomic64_set_release() when CONFIG_64BIT is defined and
 * to raw_atomic_set_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_set_release(atomic_long_t *v, long i)
{
#ifdef CONFIG_64BIT
        raw_atomic64_set_release(v, i);
#else
        raw_atomic_set_release(v, i);
#endif
}

/**
 * raw_atomic_long_add() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Forwards to raw_atomic64_add() when CONFIG_64BIT is defined and to
 * raw_atomic_add() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_add(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        raw_atomic64_add(i, v);
#else
        raw_atomic_add(i, v);
#endif
}

/**
 * raw_atomic_long_add_return() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Forwards to raw_atomic64_add_return() when CONFIG_64BIT is defined and
 * to raw_atomic_add_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_add_return(i, v);
#else
        return raw_atomic_add_return(i, v);
#endif
}

/**
 * raw_atomic_long_add_return_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Forwards to raw_atomic64_add_return_acquire() when CONFIG_64BIT is
 * defined and to raw_atomic_add_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_add_return_acquire(i, v);
#else
        return raw_atomic_add_return_acquire(i, v);
#endif
}

/**
 * raw_atomic_long_add_return_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Forwards to raw_atomic64_add_return_release() when CONFIG_64BIT is
 * defined and to raw_atomic_add_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return_release(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_add_return_release(i, v);
#else
        return raw_atomic_add_return_release(i, v);
#endif
}

/**
 * raw_atomic_long_add_return_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Forwards to raw_atomic64_add_return_relaxed() when CONFIG_64BIT is
 * defined and to raw_atomic_add_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
raw_atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_add_return_relaxed(i, v);
#else
        return raw_atomic_add_return_relaxed(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add() - atomic add with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Forwards to raw_atomic64_fetch_add() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_add() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_fetch_add(i, v);
#else
        return raw_atomic_fetch_add(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add_acquire() - atomic add with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Forwards to raw_atomic64_fetch_add_acquire() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_add_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
#ifdef CONFIG_64BIT
        return raw_atomic64_fetch_add_acquire(i, v);
#else
        return raw_atomic_fetch_add_acquire(i, v);
#endif
}

/**
 * raw_atomic_long_fetch_add_release() - atomic add with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with release
 * ordering. Forwards to raw_atomic64_fetch_add_release() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_add_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_release() elsewhere.
 *
 * Return: The value of @v before the addition.
 */
static __always_inline long
raw_atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_add_release(i, v);
#else
        prev = raw_atomic_fetch_add_release(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically adds @i to @v, so that @v becomes (@v + @i), with relaxed
 * ordering. Forwards to raw_atomic64_fetch_add_relaxed() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_add_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_relaxed() elsewhere.
 *
 * Return: The value of @v before the addition.
 */
static __always_inline long
raw_atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_add_relaxed(i, v);
#else
        prev = raw_atomic_fetch_add_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_sub() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * relaxed ordering. Forwards to raw_atomic64_sub() when CONFIG_64BIT is
 * defined and to raw_atomic_sub() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_sub(long i, atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_sub(i, v);
#else
        raw_atomic_sub(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_sub_return() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with full
 * ordering. Forwards to raw_atomic64_sub_return() when CONFIG_64BIT is
 * defined and to raw_atomic_sub_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return() elsewhere.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline long
raw_atomic_long_sub_return(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return(i, v);
#else
        updated = raw_atomic_sub_return(i, v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * acquire ordering. Forwards to raw_atomic64_sub_return_acquire() when
 * CONFIG_64BIT is defined and to raw_atomic_sub_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return_acquire() elsewhere.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline long
raw_atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return_acquire(i, v);
#else
        updated = raw_atomic_sub_return_acquire(i, v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_sub_return_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * release ordering. Forwards to raw_atomic64_sub_return_release() when
 * CONFIG_64BIT is defined and to raw_atomic_sub_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return_release() elsewhere.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline long
raw_atomic_long_sub_return_release(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return_release(i, v);
#else
        updated = raw_atomic_sub_return_release(i, v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * relaxed ordering. Forwards to raw_atomic64_sub_return_relaxed() when
 * CONFIG_64BIT is defined and to raw_atomic_sub_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_return_relaxed() elsewhere.
 *
 * Return: The value of @v after the subtraction.
 */
static __always_inline long
raw_atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_sub_return_relaxed(i, v);
#else
        updated = raw_atomic_sub_return_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_fetch_sub() - atomic subtract with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with full
 * ordering. Forwards to raw_atomic64_fetch_sub() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_sub() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub() elsewhere.
 *
 * Return: The value of @v before the subtraction.
 */
static __always_inline long
raw_atomic_long_fetch_sub(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_sub(i, v);
#else
        prev = raw_atomic_fetch_sub(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * acquire ordering. Forwards to raw_atomic64_fetch_sub_acquire() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_sub_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub_acquire() elsewhere.
 *
 * Return: The value of @v before the subtraction.
 */
static __always_inline long
raw_atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_sub_acquire(i, v);
#else
        prev = raw_atomic_fetch_sub_acquire(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_sub_release() - atomic subtract with release ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * release ordering. Forwards to raw_atomic64_fetch_sub_release() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_sub_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub_release() elsewhere.
 *
 * Return: The value of @v before the subtraction.
 */
static __always_inline long
raw_atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_sub_release(i, v);
#else
        prev = raw_atomic_fetch_sub_release(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically subtracts @i from @v, so that @v becomes (@v - @i), with
 * relaxed ordering. Forwards to raw_atomic64_fetch_sub_relaxed() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_sub_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_sub_relaxed() elsewhere.
 *
 * Return: The value of @v before the subtraction.
 */
static __always_inline long
raw_atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_sub_relaxed(i, v);
#else
        prev = raw_atomic_fetch_sub_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * relaxed ordering. Forwards to raw_atomic64_inc() when CONFIG_64BIT is
 * defined and to raw_atomic_inc() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_inc(atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_inc(v);
#else
        raw_atomic_inc(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with full
 * ordering. Forwards to raw_atomic64_inc_return() when CONFIG_64BIT is
 * defined and to raw_atomic_inc_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return() elsewhere.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline long
raw_atomic_long_inc_return(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return(v);
#else
        updated = raw_atomic_inc_return(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * acquire ordering. Forwards to raw_atomic64_inc_return_acquire() when
 * CONFIG_64BIT is defined and to raw_atomic_inc_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return_acquire() elsewhere.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline long
raw_atomic_long_inc_return_acquire(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return_acquire(v);
#else
        updated = raw_atomic_inc_return_acquire(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * release ordering. Forwards to raw_atomic64_inc_return_release() when
 * CONFIG_64BIT is defined and to raw_atomic_inc_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return_release() elsewhere.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline long
raw_atomic_long_inc_return_release(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return_release(v);
#else
        updated = raw_atomic_inc_return_release(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * relaxed ordering. Forwards to raw_atomic64_inc_return_relaxed() when
 * CONFIG_64BIT is defined and to raw_atomic_inc_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_return_relaxed() elsewhere.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline long
raw_atomic_long_inc_return_relaxed(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_inc_return_relaxed(v);
#else
        updated = raw_atomic_inc_return_relaxed(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with full
 * ordering. Forwards to raw_atomic64_fetch_inc() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_inc() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc() elsewhere.
 *
 * Return: The value of @v before the increment.
 */
static __always_inline long
raw_atomic_long_fetch_inc(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_inc(v);
#else
        prev = raw_atomic_fetch_inc(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * acquire ordering. Forwards to raw_atomic64_fetch_inc_acquire() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_inc_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc_acquire() elsewhere.
 *
 * Return: The value of @v before the increment.
 */
static __always_inline long
raw_atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_inc_acquire(v);
#else
        prev = raw_atomic_fetch_inc_acquire(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * release ordering. Forwards to raw_atomic64_fetch_inc_release() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_inc_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc_release() elsewhere.
 *
 * Return: The value of @v before the increment.
 */
static __always_inline long
raw_atomic_long_fetch_inc_release(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_inc_release(v);
#else
        prev = raw_atomic_fetch_inc_release(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically increments @v by one, so that @v becomes (@v + 1), with
 * relaxed ordering. Forwards to raw_atomic64_fetch_inc_relaxed() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_inc_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_inc_relaxed() elsewhere.
 *
 * Return: The value of @v before the increment.
 */
static __always_inline long
raw_atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_inc_relaxed(v);
#else
        prev = raw_atomic_fetch_inc_relaxed(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * relaxed ordering. Forwards to raw_atomic64_dec() when CONFIG_64BIT is
 * defined and to raw_atomic_dec() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_dec(atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_dec(v);
#else
        raw_atomic_dec(v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with full
 * ordering. Forwards to raw_atomic64_dec_return() when CONFIG_64BIT is
 * defined and to raw_atomic_dec_return() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline long
raw_atomic_long_dec_return(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return(v);
#else
        updated = raw_atomic_dec_return(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * acquire ordering. Forwards to raw_atomic64_dec_return_acquire() when
 * CONFIG_64BIT is defined and to raw_atomic_dec_return_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return_acquire() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline long
raw_atomic_long_dec_return_acquire(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return_acquire(v);
#else
        updated = raw_atomic_dec_return_acquire(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * release ordering. Forwards to raw_atomic64_dec_return_release() when
 * CONFIG_64BIT is defined and to raw_atomic_dec_return_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return_release() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline long
raw_atomic_long_dec_return_release(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return_release(v);
#else
        updated = raw_atomic_dec_return_release(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * relaxed ordering. Forwards to raw_atomic64_dec_return_relaxed() when
 * CONFIG_64BIT is defined and to raw_atomic_dec_return_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_return_relaxed() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline long
raw_atomic_long_dec_return_relaxed(atomic_long_t *v)
{
        long updated;

#if defined(CONFIG_64BIT)
        updated = raw_atomic64_dec_return_relaxed(v);
#else
        updated = raw_atomic_dec_return_relaxed(v);
#endif /* CONFIG_64BIT */
        return updated;
}

/**
 * raw_atomic_long_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with full
 * ordering. Forwards to raw_atomic64_fetch_dec() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_dec() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_dec() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_dec(v);
#else
        prev = raw_atomic_fetch_dec(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * acquire ordering. Forwards to raw_atomic64_fetch_dec_acquire() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_dec_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_dec_acquire() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_dec_acquire(v);
#else
        prev = raw_atomic_fetch_dec_acquire(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * release ordering. Forwards to raw_atomic64_fetch_dec_release() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_dec_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_dec_release() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec_release(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_dec_release(v);
#else
        prev = raw_atomic_fetch_dec_release(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically decrements @v by one, so that @v becomes (@v - 1), with
 * relaxed ordering. Forwards to raw_atomic64_fetch_dec_relaxed() when
 * CONFIG_64BIT is defined and to raw_atomic_fetch_dec_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_dec_relaxed() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline long
raw_atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_dec_relaxed(v);
#else
        prev = raw_atomic_fetch_dec_relaxed(v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_and() - atomic bitwise AND with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ANDs @i into @v, so that @v becomes (@v & @i), with relaxed
 * ordering. Forwards to raw_atomic64_and() when CONFIG_64BIT is defined and
 * to raw_atomic_and() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_and(long i, atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_and(i, v);
#else
        raw_atomic_and(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_and() - atomic bitwise AND with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ANDs @i into @v, so that @v becomes (@v & @i), with full
 * ordering. Forwards to raw_atomic64_fetch_and() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_and() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_and() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_and(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_and(i, v);
#else
        prev = raw_atomic_fetch_and(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ANDs @i into @v, so that @v becomes (@v & @i), with acquire
 * ordering. Forwards to raw_atomic64_fetch_and_acquire() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_and_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_and_acquire() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_and_acquire(i, v);
#else
        prev = raw_atomic_fetch_and_acquire(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_and_release() - atomic bitwise AND with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ANDs @i into @v, so that @v becomes (@v & @i), with release
 * ordering. Forwards to raw_atomic64_fetch_and_release() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_and_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_and_release() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_and_release(i, v);
#else
        prev = raw_atomic_fetch_and_release(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ANDs @i into @v, so that @v becomes (@v & @i), with relaxed
 * ordering. Forwards to raw_atomic64_fetch_and_relaxed() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_and_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_and_relaxed() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_and_relaxed(i, v);
#else
        prev = raw_atomic_fetch_and_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically clears in @v the bits that are set in @i, so that @v becomes
 * (@v & ~@i), with relaxed ordering. Forwards to raw_atomic64_andnot() when
 * CONFIG_64BIT is defined and to raw_atomic_andnot() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_andnot() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_andnot(long i, atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_andnot(i, v);
#else
        raw_atomic_andnot(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically clears in @v the bits that are set in @i, so that @v becomes
 * (@v & ~@i), with full ordering. Forwards to raw_atomic64_fetch_andnot()
 * when CONFIG_64BIT is defined and to raw_atomic_fetch_andnot() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_andnot() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_andnot(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically clears in @v the bits that are set in @i, so that @v becomes
 * (@v & ~@i), with acquire ordering. Forwards to
 * raw_atomic64_fetch_andnot_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_andnot_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_acquire() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_andnot_acquire(i, v);
#else
        prev = raw_atomic_fetch_andnot_acquire(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically clears in @v the bits that are set in @i, so that @v becomes
 * (@v & ~@i), with release ordering. Forwards to
 * raw_atomic64_fetch_andnot_release() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_andnot_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_release() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_andnot_release(i, v);
#else
        prev = raw_atomic_fetch_andnot_release(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically clears in @v the bits that are set in @i, so that @v becomes
 * (@v & ~@i), with relaxed ordering. Forwards to
 * raw_atomic64_fetch_andnot_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_fetch_andnot_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_andnot_relaxed() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_andnot_relaxed(i, v);
#else
        prev = raw_atomic_fetch_andnot_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_or() - atomic bitwise OR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ORs @i into @v, so that @v becomes (@v | @i), with relaxed
 * ordering. Forwards to raw_atomic64_or() when CONFIG_64BIT is defined and
 * to raw_atomic_or() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_or(long i, atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_or(i, v);
#else
        raw_atomic_or(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_or() - atomic bitwise OR with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ORs @i into @v, so that @v becomes (@v | @i), with full
 * ordering. Forwards to raw_atomic64_fetch_or() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_or() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_or(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_or(i, v);
#else
        prev = raw_atomic_fetch_or(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ORs @i into @v, so that @v becomes (@v | @i), with acquire
 * ordering. Forwards to raw_atomic64_fetch_or_acquire() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_or_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or_acquire() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_or_acquire(i, v);
#else
        prev = raw_atomic_fetch_or_acquire(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ORs @i into @v, so that @v becomes (@v | @i), with release
 * ordering. Forwards to raw_atomic64_fetch_or_release() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_or_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or_release() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_or_release(i, v);
#else
        prev = raw_atomic_fetch_or_release(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically ORs @i into @v, so that @v becomes (@v | @i), with relaxed
 * ordering. Forwards to raw_atomic64_fetch_or_relaxed() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_or_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_or_relaxed() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_or_relaxed(i, v);
#else
        prev = raw_atomic_fetch_or_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_xor() - atomic bitwise XOR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with relaxed
 * ordering. Forwards to raw_atomic64_xor() when CONFIG_64BIT is defined and
 * to raw_atomic_xor() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_long_xor(long i, atomic_long_t *v)
{
        /* No result to propagate: just dispatch on the configured word size. */
#if defined(CONFIG_64BIT)
        raw_atomic64_xor(i, v);
#else
        raw_atomic_xor(i, v);
#endif /* CONFIG_64BIT */
}

/**
 * raw_atomic_long_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with full
 * ordering. Forwards to raw_atomic64_fetch_xor() when CONFIG_64BIT is
 * defined and to raw_atomic_fetch_xor() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_xor(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_xor(i, v);
#else
        prev = raw_atomic_fetch_xor(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with acquire
 * ordering. Forwards to raw_atomic64_fetch_xor_acquire() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_xor_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor_acquire() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_xor_acquire(i, v);
#else
        prev = raw_atomic_fetch_xor_acquire(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with release
 * ordering. Forwards to raw_atomic64_fetch_xor_release() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_xor_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor_release() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_xor_release(i, v);
#else
        prev = raw_atomic_fetch_xor_release(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically XORs @i into @v, so that @v becomes (@v ^ @i), with relaxed
 * ordering. Forwards to raw_atomic64_fetch_xor_relaxed() when CONFIG_64BIT
 * is defined and to raw_atomic_fetch_xor_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_xor_relaxed() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline long
raw_atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_fetch_xor_relaxed(i, v);
#else
        prev = raw_atomic_fetch_xor_relaxed(i, v);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically stores @new in @v with full ordering. Forwards to
 * raw_atomic64_xchg() when CONFIG_64BIT is defined and to raw_atomic_xchg()
 * otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg() elsewhere.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg(atomic_long_t *v, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_xchg(v, new);
#else
        prev = raw_atomic_xchg(v, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically stores @new in @v with acquire ordering. Forwards to
 * raw_atomic64_xchg_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg_acquire() elsewhere.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_xchg_acquire(v, new);
#else
        prev = raw_atomic_xchg_acquire(v, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically stores @new in @v with release ordering. Forwards to
 * raw_atomic64_xchg_release() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg_release() elsewhere.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg_release(atomic_long_t *v, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_xchg_release(v, new);
#else
        prev = raw_atomic_xchg_release(v, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically stores @new in @v with relaxed ordering. Forwards to
 * raw_atomic64_xchg_relaxed() when CONFIG_64BIT is defined and to
 * raw_atomic_xchg_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_xchg_relaxed() elsewhere.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline long
raw_atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_xchg_relaxed(v, new);
#else
        prev = raw_atomic_xchg_relaxed(v, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * Atomically compares @v with @old and, when they are equal, sets @v to
 * @new with full ordering. When they differ, @v is left unmodified and only
 * relaxed ordering is provided. Forwards to raw_atomic64_cmpxchg() when
 * CONFIG_64BIT is defined and to raw_atomic_cmpxchg() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg() elsewhere.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_cmpxchg(v, old, new);
#else
        prev = raw_atomic_cmpxchg(v, old, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * Atomically compares @v with @old and, when they are equal, sets @v to
 * @new with acquire ordering. When they differ, @v is left unmodified and
 * only relaxed ordering is provided. Forwards to
 * raw_atomic64_cmpxchg_acquire() when CONFIG_64BIT is defined and to
 * raw_atomic_cmpxchg_acquire() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_acquire() elsewhere.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_cmpxchg_acquire(v, old, new);
#else
        prev = raw_atomic_cmpxchg_acquire(v, old, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * Atomically compares @v with @old and, when they are equal, sets @v to
 * @new with release ordering. When they differ, @v is left unmodified and
 * only relaxed ordering is provided. Forwards to
 * raw_atomic64_cmpxchg_release() when CONFIG_64BIT is defined and to
 * raw_atomic_cmpxchg_release() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_release() elsewhere.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_cmpxchg_release(v, old, new);
#else
        prev = raw_atomic_cmpxchg_release(v, old, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * Atomically compares @v with @old and, when they are equal, sets @v to
 * @new with relaxed ordering. When they differ, @v is left unmodified and
 * relaxed ordering is provided. Forwards to raw_atomic64_cmpxchg_relaxed()
 * when CONFIG_64BIT is defined and to raw_atomic_cmpxchg_relaxed() otherwise.
 *
 * Safe to use in noinstr code; prefer atomic_long_cmpxchg_relaxed() elsewhere.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline long
raw_atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
        long prev;

#if defined(CONFIG_64BIT)
        prev = raw_atomic64_cmpxchg_relaxed(v, old, new);
#else
        prev = raw_atomic_cmpxchg_relaxed(v, old, new);
#endif /* CONFIG_64BIT */
        return prev;
}

/**
 * raw_atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * Atomically compares @v with the value @old points to and, when they are
 * equal, sets @v to @new with full ordering. When they differ, @v is left
 * unmodified, the value at @old is updated to the current value of @v, and
 * only relaxed ordering is provided.
 *
 * Forwards to raw_atomic64_try_cmpxchg() when CONFIG_64BIT is defined and to
 * raw_atomic_try_cmpxchg() otherwise, casting @old to the pointer type the
 * selected primitive takes.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
        bool swapped;

#if defined(CONFIG_64BIT)
        swapped = raw_atomic64_try_cmpxchg(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg(v, (int *)old, new);
#endif /* CONFIG_64BIT */
        return swapped;
}

/**
 * raw_atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occured, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* @old is cast to the pointer type the selected variant expects. */
#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg_acquire(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg_acquire(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* @old is cast to the pointer type the selected variant expects. */
#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg_release(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg_release(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* @old is cast to the pointer type the selected variant expects. */
#ifdef CONFIG_64BIT
        swapped = raw_atomic64_try_cmpxchg_relaxed(v, (s64 *)old, new);
#else
        swapped = raw_atomic_try_cmpxchg_relaxed(v, (int *)old, new);
#endif
        return swapped;
}

/**
 * raw_atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_sub_and_test(long i, atomic_long_t *v)
{
        bool is_zero;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_zero = raw_atomic64_sub_and_test(i, v);
#else
        is_zero = raw_atomic_sub_and_test(i, v);
#endif
        return is_zero;
}

/**
 * raw_atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_and_test(atomic_long_t *v)
{
        bool is_zero;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_zero = raw_atomic64_dec_and_test(v);
#else
        is_zero = raw_atomic_dec_and_test(v);
#endif
        return is_zero;
}

/**
 * raw_atomic_long_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_and_test(atomic_long_t *v)
{
        bool is_zero;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_zero = raw_atomic64_inc_and_test(v);
#else
        is_zero = raw_atomic_inc_and_test(v);
#endif
        return is_zero;
}

/**
 * raw_atomic_long_add_negative() - atomic add and test if negative with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative(long i, atomic_long_t *v)
{
        bool is_negative;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative(i, v);
#else
        is_negative = raw_atomic_add_negative(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
        bool is_negative;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative_acquire(i, v);
#else
        is_negative = raw_atomic_add_negative_acquire(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_add_negative_release() - atomic add and test if negative with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_release(long i, atomic_long_t *v)
{
        bool is_negative;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative_release(i, v);
#else
        is_negative = raw_atomic_add_negative_release(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
        bool is_negative;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        is_negative = raw_atomic64_add_negative_relaxed(i, v);
#else
        is_negative = raw_atomic_add_negative_relaxed(i, v);
#endif
        return is_negative;
}

/**
 * raw_atomic_long_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline long
raw_atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
        long prev;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        prev = raw_atomic64_fetch_add_unless(v, a, u);
#else
        prev = raw_atomic_fetch_add_unless(v, a, u);
#endif
        return prev;
}

/**
 * raw_atomic_long_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
        bool updated;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        updated = raw_atomic64_add_unless(v, a, u);
#else
        updated = raw_atomic_add_unless(v, a, u);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_not_zero(atomic_long_t *v)
{
        bool updated;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        updated = raw_atomic64_inc_not_zero(v);
#else
        updated = raw_atomic_inc_not_zero(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_inc_unless_negative(atomic_long_t *v)
{
        bool updated;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        updated = raw_atomic64_inc_unless_negative(v);
#else
        updated = raw_atomic_inc_unless_negative(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_long_dec_unless_positive(atomic_long_t *v)
{
        bool updated;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        updated = raw_atomic64_dec_unless_positive(v);
#else
        updated = raw_atomic_dec_unless_positive(v);
#endif
        return updated;
}

/**
 * raw_atomic_long_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_long_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline long
raw_atomic_long_dec_if_positive(atomic_long_t *v)
{
        long ret;

        /* Forward to the atomic64 or atomic variant chosen by CONFIG_64BIT. */
#ifdef CONFIG_64BIT
        ret = raw_atomic64_dec_if_positive(v);
#else
        ret = raw_atomic_dec_if_positive(v);
#endif
        return ret;
}

#endif /* _LINUX_ATOMIC_LONG_H */
// eadf183c3600b8b92b91839dd3be6bcc560c752d











































































































 1278 










 1278 









































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
// SPDX-License-Identifier: GPL-2.0
/*
 * Generic sched_clock() support, to extend low level hardware time
 * counters to full 64-bit ns values.
 */
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/ktime.h>
#include <linux/kernel.h>
#include <linux/math.h>
#include <linux/moduleparam.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/hrtimer.h>
#include <linux/sched_clock.h>
#include <linux/seqlock.h>
#include <linux/bitops.h>

#include "timekeeping.h"

/**
 * struct clock_data - all data needed for sched_clock() (including
 *                     registration of a new clock source)
 *
 * @seq:        Latch sequence counter protecting updates. The lowest bit
 *              is the index into @read_data that readers should use.
 * @read_data:  Two copies of the data required to read from sched_clock;
 *              index 0 is the normal (even) copy, index 1 the backup (odd)
 *              copy used while an update is in progress.
 * @wrap_kt:    Duration for which the clock can run before wrapping; also
 *              used as the period of the epoch refresh timer.
 * @rate:       Tick rate of the registered clock, as passed to
 *              sched_clock_register().
 * @actual_read_sched_clock: Registered hardware level clock read function.
 *              Kept separately because @read_data's read function is
 *              swapped for a stand-in while suspended.
 *
 * The ordering of this structure has been chosen to optimize cache
 * performance. In particular 'seq' and 'read_data[0]' (combined) should fit
 * into a single 64-byte cache line.
 */
struct clock_data {
        seqcount_latch_t        seq;
        struct clock_read_data        read_data[2];
        ktime_t                        wrap_kt;
        unsigned long                rate;

        u64 (*actual_read_sched_clock)(void);
};

/*
 * Timer running sched_clock_poll(); armed with cd.wrap_kt so the epoch is
 * refreshed periodically, and cancelled across suspend.
 */
static struct hrtimer sched_clock_timer;
/*
 * IRQ time accounting policy, consulted in sched_clock_register(): a
 * positive value always enables it, -1 (the default) enables it only for
 * clocks ticking at 1 MHz or faster, any other value leaves it untouched.
 */
static int irqtime = -1;

core_param(irqtime, irqtime, int, 0400);

static u64 notrace jiffy_sched_clock_read(void)
{
        /*
         * Plain 'jiffies' is enough even on 32-bit arches (no need for
         * get_jiffies_64()) because this clock is registered as being
         * BITS_PER_LONG wide.
         */
        unsigned long ticks = jiffies - INITIAL_JIFFIES;

        return (u64)ticks;
}

/*
 * Initial state: a jiffies-driven clock, in effect until
 * sched_clock_register() installs another one.
 */
static struct clock_data cd ____cacheline_aligned = {
        .read_data = {
                [0] = {
                        .mult = NSEC_PER_SEC / HZ,
                        .read_sched_clock = jiffy_sched_clock_read,
                },
        },
        .actual_read_sched_clock = jiffy_sched_clock_read,
};

/* Scale a cycle count to nanoseconds: multiply by @mult, shift right by @shift. */
static __always_inline u64 cyc_to_ns(u64 cyc, u32 mult, u32 shift)
{
        u64 scaled = cyc * mult;

        return scaled >> shift;
}

/*
 * Open a read-side section: store the latch sequence in @seq and return the
 * read_data copy selected by its lowest bit.
 */
notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
{
        unsigned int snapshot = read_seqcount_latch(&cd.seq);

        *seq = snapshot;

        return &cd.read_data[snapshot & 1];
}

/*
 * Close a read-side section opened with sched_clock_read_begin().
 * @seq is the sequence value obtained there; the result of
 * read_seqcount_latch_retry() is passed through unchanged, so a non-zero
 * return means the caller should redo its read.
 */
notrace int sched_clock_read_retry(unsigned int seq)
{
        return read_seqcount_latch_retry(&cd.seq, seq);
}

/*
 * Lockless read of the current sched_clock value in nanoseconds.
 *
 * Each pass snapshots the latch sequence, picks the read_data copy selected
 * by its lowest bit and converts the cycles elapsed since that copy's epoch
 * to nanoseconds. The pass is repeated while the retry check reports that
 * the sequence changed underneath us.
 */
static __always_inline unsigned long long __sched_clock(void)
{
        struct clock_read_data *rd;
        unsigned int seq;
        u64 cyc, res;

        do {
                seq = raw_read_seqcount_latch(&cd.seq);
                rd = cd.read_data + (seq & 1);

                /*
                 * The mask keeps only the counter's valid bits (it is built
                 * from the registered bit width), so the delta stays correct
                 * when a narrow counter wraps.
                 */
                cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
                      rd->sched_clock_mask;
                res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
        } while (raw_read_seqcount_latch_retry(&cd.seq, seq));

        return res;
}

/*
 * noinstr entry point: returns __sched_clock() directly, without the
 * preemption and KCSAN bracketing that sched_clock() adds.
 */
unsigned long long noinstr sched_clock_noinstr(void)
{
        return __sched_clock();
}

/*
 * Return the current sched_clock value in nanoseconds, with preemption
 * disabled for the duration of the read.
 */
unsigned long long notrace sched_clock(void)
{
        unsigned long long now;

        preempt_disable_notrace();

        /*
         * __sched_clock() is, in its entirety, a seqcount_latch read-side
         * critical section, yet it is built from the raw, uninstrumented
         * helpers. Tell KCSAN to treat every access made inside it as
         * atomic.
         */
        kcsan_nestable_atomic_begin();
        now = __sched_clock();
        kcsan_nestable_atomic_end();

        preempt_enable_notrace();

        return now;
}

/*
 * Publish a new set of data required to read the clock.
 *
 * sched_clock() will never observe mismatched data even if called from
 * an NMI. We do this by maintaining an odd/even copy of the data and
 * steering sched_clock() to one or the other using a sequence counter.
 * In order to preserve the data cache profile of sched_clock() as much
 * as possible the system reverts back to the even copy when the update
 * completes; the odd copy is used *only* during an update.
 *
 * @rd is copied by value into both copies; the caller keeps ownership.
 */
static void update_clock_read_data(struct clock_read_data *rd)
{
        /* steer readers towards the odd copy */
        write_seqcount_latch_begin(&cd.seq);

        /* now it's safe for us to update the normal (even) copy */
        cd.read_data[0] = *rd;

        /* switch readers back to the even copy */
        write_seqcount_latch(&cd.seq);

        /* update the backup (odd) copy with the new data */
        cd.read_data[1] = *rd;

        /* NOTE(review): presumably pairs with the _begin() call above - confirm in seqlock.h */
        write_seqcount_latch_end(&cd.seq);
}

/*
 * Atomically update the sched_clock() epoch.
 */
static void update_sched_clock(void)
{
        u64 cyc;
        u64 ns;
        struct clock_read_data rd;

        rd = cd.read_data[0];

        cyc = cd.actual_read_sched_clock();
        ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);

        rd.epoch_ns = ns;
        rd.epoch_cyc = cyc;

        update_clock_read_data(&rd);
}

/*
 * hrtimer callback: refresh the epoch, push the expiry forward by the wrap
 * interval and ask for the timer to be restarted.
 */
static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
{
        update_sched_clock();
        hrtimer_forward_now(hrt, cd.wrap_kt);

        return HRTIMER_RESTART;
}

/**
 * sched_clock_register() - install a counter as the sched_clock() source
 * @read: function returning the current raw counter value
 * @bits: width of the counter in bits; used to build the wrap mask
 * @rate: tick rate of the counter in Hz
 *
 * The request is ignored when the clock already registered has a higher
 * rate. Otherwise the cycle-to-ns conversion factors and the wrap interval
 * are recomputed for the new counter, and the nanoseconds accumulated on the
 * old counter become the new epoch so that time carries on from there.
 */
void __init
sched_clock_register(u64 (*read)(void), int bits, unsigned long rate)
{
        u64 res, wrap, new_mask, new_epoch, cyc, ns;
        u32 new_mult, new_shift;
        unsigned long r, flags;
        char r_unit;
        struct clock_read_data rd;

        /* Keep the current clock if it is faster than the candidate. */
        if (cd.rate > rate)
                return;

        /* Cannot register a sched_clock with interrupts on */
        local_irq_save(flags);

        /* Calculate the mult/shift to convert counter ticks to ns. */
        clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);

        new_mask = CLOCKSOURCE_MASK(bits);
        cd.rate = rate;

        /* Calculate how many nanosecs until we risk wrapping */
        wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL);
        cd.wrap_kt = ns_to_ktime(wrap);

        rd = cd.read_data[0];

        /* Update epoch for new counter and update 'epoch_ns' from old counter */
        new_epoch = read();
        cyc = cd.actual_read_sched_clock();
        ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift);
        cd.actual_read_sched_clock = read;

        rd.read_sched_clock        = read;
        rd.sched_clock_mask        = new_mask;
        rd.mult                        = new_mult;
        rd.shift                = new_shift;
        rd.epoch_cyc                = new_epoch;
        rd.epoch_ns                = ns;

        update_clock_read_data(&rd);

        /*
         * A non-NULL callback presumably means generic_sched_clock_init()
         * has already set the timer up - confirm against hrtimer_setup().
         */
        if (sched_clock_timer.function != NULL) {
                /* update timeout for clock wrap */
                hrtimer_start(&sched_clock_timer, cd.wrap_kt,
                              HRTIMER_MODE_REL_HARD);
        }

        /* Scale the rate to MHz/kHz for the log message below. */
        r = rate;
        if (r >= 4000000) {
                r = DIV_ROUND_CLOSEST(r, 1000000);
                r_unit = 'M';
        } else if (r >= 4000) {
                r = DIV_ROUND_CLOSEST(r, 1000);
                r_unit = 'k';
        } else {
                r_unit = ' ';
        }

        /* Calculate the ns resolution of this counter */
        res = cyc_to_ns(1ULL, new_mult, new_shift);

        /* NOTE(review): 'bits' is an int but is printed with %u. */
        pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
                bits, r, r_unit, res, wrap);

        /* Enable IRQ time accounting if we have a fast enough sched_clock() */
        if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
                enable_sched_clock_irqtime();

        local_irq_restore(flags);

        pr_debug("Registered %pS as sched_clock source\n", read);
}

/*
 * Finalize the sched_clock setup: fall back to the jiffies clock if nothing
 * else was registered, refresh the epoch and start the refresh timer.
 */
void __init generic_sched_clock_init(void)
{
        bool still_on_jiffies = cd.actual_read_sched_clock == jiffy_sched_clock_read;

        /*
         * No sched_clock() source has been provided up to this point, so
         * register the jiffies one and make it the final choice.
         */
        if (still_on_jiffies)
                sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);

        update_sched_clock();

        /*
         * Set up and arm the timer that keeps sched_clock() properly
         * updated, firing once per wrap interval.
         */
        hrtimer_setup(&sched_clock_timer, sched_clock_poll, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
        hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
}

/*
 * Stand-in clock read function used while the clock is suspended.
 *
 * It returns the cycle epoch of the currently selected copy, which makes
 * it appear to sched_clock() as if the counter stopped at its last update.
 *
 * This must only be called from the read-side critical section in
 * sched_clock(): the latch retry check that ends that section is what
 * ensures the 'epoch_cyc' observed here came from the correct copy.
 */
static u64 notrace suspended_sched_clock_read(void)
{
        unsigned int idx = read_seqcount_latch(&cd.seq) & 1;

        return cd.read_data[idx].epoch_cyc;
}

int sched_clock_suspend(void)
{
        struct clock_read_data *rd = &cd.read_data[0];

        update_sched_clock();
        hrtimer_cancel(&sched_clock_timer);
        rd->read_sched_clock = suspended_sched_clock_read;

        return 0;
}

void sched_clock_resume(void)
{
        struct clock_read_data *rd = &cd.read_data[0];

        rd->epoch_cyc = cd.actual_read_sched_clock();
        hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL_HARD);
        rd->read_sched_clock = cd.actual_read_sched_clock;
}

/* Suspend/resume hooks handed to register_syscore_ops() below. */
static struct syscore_ops sched_clock_ops = {
        .suspend        = sched_clock_suspend,
        .resume                = sched_clock_resume,
};

/*
 * Register the sched_clock suspend/resume hooks. Run as a device_initcall;
 * always returns 0.
 */
static int __init sched_clock_syscore_init(void)
{
        register_syscore_ops(&sched_clock_ops);

        return 0;
}
device_initcall(sched_clock_syscore_init);





















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Events for filesystem locks
 *
 * Copyright 2013 Jeff Layton <jlayton@poochiereds.net>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filelock

#if !defined(_TRACE_FILELOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILELOCK_H

#include <linux/tracepoint.h>
#include <linux/fs.h>
#include <linux/device.h>
#include <linux/kdev_t.h>

#define show_fl_flags(val)                                                \
        __print_flags(val, "|",                                         \
                { FL_POSIX,                "FL_POSIX" },                        \
                { FL_FLOCK,                "FL_FLOCK" },                        \
                { FL_DELEG,                "FL_DELEG" },                        \
                { FL_ACCESS,                "FL_ACCESS" },                        \
                { FL_EXISTS,                "FL_EXISTS" },                        \
                { FL_LEASE,                "FL_LEASE" },                        \
                { FL_CLOSE,                "FL_CLOSE" },                        \
                { FL_SLEEP,                "FL_SLEEP" },                        \
                { FL_DOWNGRADE_PENDING,        "FL_DOWNGRADE_PENDING" },        \
                { FL_UNLOCK_PENDING,        "FL_UNLOCK_PENDING" },                \
                { FL_OFDLCK,                "FL_OFDLCK" })

/* Format a lock type value as its symbolic name (F_RDLCK, F_WRLCK, F_UNLCK). */
#define show_fl_type(val)                                \
        __print_symbolic(val,                                \
                        { F_RDLCK, "F_RDLCK" },                \
                        { F_WRLCK, "F_WRLCK" },                \
                        { F_UNLCK, "F_UNLCK" })

/*
 * locks_get_lock_context: records the inode's device and inode number, the
 * lock type passed in and the file_lock_context pointer.
 */
TRACE_EVENT(locks_get_lock_context,
        TP_PROTO(struct inode *inode, int type, struct file_lock_context *ctx),

        TP_ARGS(inode, type, ctx),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(unsigned char, type)
                __field(struct file_lock_context *, ctx)
        ),

        TP_fast_assign(
                /* Filled in the order the fields are declared above. */
                __entry->i_ino = inode->i_ino;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->type = type;
                __entry->ctx = ctx;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx type=%s ctx=%p",
                  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                  __entry->i_ino, show_fl_type(__entry->type), __entry->ctx)
);

/*
 * filelock_lock: event class for operations on a struct file_lock. Records
 * the inode identity, the lock's fields and the operation's return value.
 * @fl may be NULL, in which case every lock-derived field is recorded as
 * zero/NULL.
 */
DECLARE_EVENT_CLASS(filelock_lock,
        TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),

        TP_ARGS(inode, fl, ret),

        TP_STRUCT__entry(
                __field(struct file_lock *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock_core *, blocker)
                __field(fl_owner_t, owner)
                __field(unsigned int, pid)
                __field(unsigned int, flags)
                __field(unsigned char, type)
                __field(loff_t, fl_start)
                __field(loff_t, fl_end)
                __field(int, ret)
        ),

        TP_fast_assign(
                /* Values that do not depend on the lock. */
                __entry->i_ino = inode->i_ino;
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->ret = ret;

                /* Lock-derived values, guarded against a NULL lock. */
                __entry->fl = fl;
                __entry->blocker = fl ? fl->c.flc_blocker : NULL;
                __entry->owner = fl ? fl->c.flc_owner : NULL;
                __entry->pid = fl ? fl->c.flc_pid : 0;
                __entry->flags = fl ? fl->c.flc_flags : 0;
                __entry->type = fl ? fl->c.flc_type : 0;
                __entry->fl_start = fl ? fl->fl_start : 0;
                __entry->fl_end = fl ? fl->fl_end : 0;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_pid=%u fl_flags=%s fl_type=%s fl_start=%lld fl_end=%lld ret=%d",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->blocker, __entry->owner,
                __entry->pid, show_fl_flags(__entry->flags),
                show_fl_type(__entry->type),
                __entry->fl_start, __entry->fl_end, __entry->ret)
);

/*
 * Events using the filelock_lock class; each takes the inode, the file_lock
 * (which the class allows to be NULL) and a return value.
 */
DEFINE_EVENT(filelock_lock, posix_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, fcntl_setlk,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, locks_remove_posix,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

DEFINE_EVENT(filelock_lock, flock_lock_inode,
                TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
                TP_ARGS(inode, fl, ret));

/*
 * filelock_lease: event class for operations on a struct file_lease.
 * Records the inode identity and the lease's fields. @fl may be NULL, in
 * which case every lease-derived field is recorded as zero/NULL.
 */
DECLARE_EVENT_CLASS(filelock_lease,
        TP_PROTO(struct inode *inode, struct file_lease *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(struct file_lease *, fl)
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(struct file_lock_core *, blocker)
                __field(fl_owner_t, owner)
                __field(unsigned int, flags)
                __field(unsigned char, type)
                __field(unsigned long, break_time)
                __field(unsigned long, downgrade_time)
        ),

        TP_fast_assign(
                /* Values that do not depend on the lease. */
                __entry->i_ino = inode->i_ino;
                __entry->s_dev = inode->i_sb->s_dev;

                /* Lease-derived values, guarded against a NULL lease. */
                __entry->fl = fl;
                __entry->blocker = fl ? fl->c.flc_blocker : NULL;
                __entry->owner = fl ? fl->c.flc_owner : NULL;
                __entry->flags = fl ? fl->c.flc_flags : 0;
                __entry->type = fl ? fl->c.flc_type : 0;
                __entry->break_time = fl ? fl->fl_break_time : 0;
                __entry->downgrade_time = fl ? fl->fl_downgrade_time : 0;
        ),

        TP_printk("fl=%p dev=0x%x:0x%x ino=0x%lx fl_blocker=%p fl_owner=%p fl_flags=%s fl_type=%s fl_break_time=%lu fl_downgrade_time=%lu",
                __entry->fl, MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->blocker, __entry->owner,
                show_fl_flags(__entry->flags),
                show_fl_type(__entry->type),
                __entry->break_time, __entry->downgrade_time)
);

/*
 * Events using the filelock_lease class; each takes the inode and the
 * file_lease (which the class allows to be NULL).
 */
DEFINE_EVENT(filelock_lease, break_lease_noblock,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_block,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, break_lease_unblock,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, generic_delete_lease,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

DEFINE_EVENT(filelock_lease, time_out_leases,
                TP_PROTO(struct inode *inode, struct file_lease *fl),
                TP_ARGS(inode, fl));

/*
 * generic_add_lease: records the inode identity, its write/read/reference
 * counts and the lease's owner, flags and type.
 *
 * NOTE(review): unlike the filelock_lease class, @fl is dereferenced here
 * without a NULL check, so callers must pass a valid lease.
 */
TRACE_EVENT(generic_add_lease,
        TP_PROTO(struct inode *inode, struct file_lease *fl),

        TP_ARGS(inode, fl),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(int, wcount)
                __field(int, rcount)
                __field(int, icount)
                __field(dev_t, s_dev)
                __field(fl_owner_t, owner)
                __field(unsigned int, flags)
                __field(unsigned char, type)
        ),

        TP_fast_assign(
                __entry->s_dev = inode->i_sb->s_dev;
                __entry->i_ino = inode->i_ino;
                /* Snapshot of the inode's counters at trace time. */
                __entry->wcount = atomic_read(&inode->i_writecount);
                __entry->rcount = atomic_read(&inode->i_readcount);
                __entry->icount = atomic_read(&inode->i_count);
                __entry->owner = fl->c.flc_owner;
                __entry->flags = fl->c.flc_flags;
                __entry->type = fl->c.flc_type;
        ),

        TP_printk("dev=0x%x:0x%x ino=0x%lx wcount=%d rcount=%d icount=%d fl_owner=%p fl_flags=%s fl_type=%s",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino, __entry->wcount, __entry->rcount,
                __entry->icount, __entry->owner,
                show_fl_flags(__entry->flags),
                show_fl_type(__entry->type))
);

/*
 * leases_conflict: records the conflict result passed in, together with the
 * pointer, flags and type of both the existing lease and the breaker.
 *
 * NOTE(review): both @lease and @breaker are dereferenced without a NULL
 * check, so callers must pass valid pointers.
 */
TRACE_EVENT(leases_conflict,
        TP_PROTO(bool conflict, struct file_lease *lease, struct file_lease *breaker),

        TP_ARGS(conflict, lease, breaker),

        TP_STRUCT__entry(
                __field(void *, lease)
                __field(void *, breaker)
                __field(unsigned int, l_fl_flags)
                __field(unsigned int, b_fl_flags)
                __field(unsigned char, l_fl_type)
                __field(unsigned char, b_fl_type)
                __field(bool, conflict)
        ),

        TP_fast_assign(
                /* l_* fields come from @lease, b_* fields from @breaker. */
                __entry->lease = lease;
                __entry->l_fl_flags = lease->c.flc_flags;
                __entry->l_fl_type = lease->c.flc_type;
                __entry->breaker = breaker;
                __entry->b_fl_flags = breaker->c.flc_flags;
                __entry->b_fl_type = breaker->c.flc_type;
                __entry->conflict = conflict;
        ),

        TP_printk("conflict %d: lease=%p fl_flags=%s fl_type=%s; breaker=%p fl_flags=%s fl_type=%s",
                __entry->conflict,
                __entry->lease,
                show_fl_flags(__entry->l_fl_flags),
                show_fl_type(__entry->l_fl_type),
                __entry->breaker,
                show_fl_flags(__entry->b_fl_flags),
                show_fl_type(__entry->b_fl_type))
);

#endif /* _TRACE_FILELOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



























 1484 



  353 




























   22 

  957 













  755 
  708 
















   34 

   34 
  820 

































  251 
  681 

  799 



  168 



























 1259 
  657 
  275 
 1506 














 1496 



















 1506 






























































  904 
  222 


  177 




  142 


  611 
  991 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/atomic.h
 *
 * Copyright (C) 1996 Russell King.
 * Copyright (C) 2002 Deep Blue Solutions Ltd.
 * Copyright (C) 2012 ARM Ltd.
 */

#ifndef __ASM_ATOMIC_LSE_H
#define __ASM_ATOMIC_LSE_H

/*
 * Generate __lse_atomic_<op>(int i, atomic_t *v): a void read-modify-write
 * of v->counter performed by a single <asm_op> instruction.  @i is passed
 * through its 32-bit (%w) register view.  Nothing is returned and the asm
 * statement declares no clobbers; v->counter itself is an in/out ("+Q")
 * memory operand.
 */
#define ATOMIC_OP(op, asm_op)                                                \
static __always_inline void                                                \
__lse_atomic_##op(int i, atomic_t *v)                                        \
{                                                                        \
        asm volatile(                                                        \
        __LSE_PREAMBLE                                                        \
        "        " #asm_op "        %w[i], %[v]\n"                                \
        : [v] "+Q" (v->counter)                                                \
        : [i] "r" (i));                                                        \
}

/* Instantiate the void ops: andnot, or, xor and add. */
ATOMIC_OP(andnot, stclr)
ATOMIC_OP(or, stset)
ATOMIC_OP(xor, steor)
ATOMIC_OP(add, stadd)

/* Subtract @i from @v by atomically adding the negation of @i. */
static __always_inline void __lse_atomic_sub(int i, atomic_t *v)
{
        const int delta = -i;

        __lse_atomic_add(delta, v);
}

#undef ATOMIC_OP

/*
 * Generate __lse_atomic_fetch_<op><name>(int i, atomic_t *v).
 *
 * Issues a single <asm_op><mb> instruction on v->counter with operand @i and
 * returns whatever the instruction placed in its destination register
 * (`old`).  @mb is pasted directly onto the mnemonic, and @cl (possibly
 * empty) becomes the asm clobber list.
 */
#define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)                        \
static __always_inline int                                                \
__lse_atomic_fetch_##op##name(int i, atomic_t *v)                        \
{                                                                        \
        int old;                                                        \
                                                                        \
        asm volatile(                                                        \
        __LSE_PREAMBLE                                                        \
        "        " #asm_op #mb "        %w[i], %w[old], %[v]"                        \
        : [v] "+Q" (v->counter),                                        \
          [old] "=r" (old)                                                \
        : [i] "r" (i)                                                        \
        : cl);                                                                \
                                                                        \
        return old;                                                        \
}

/*
 * Instantiate the four flavours of one fetch op:
 *   _relaxed   - no mnemonic suffix, no clobber
 *   _acquire   - "a" suffix,  "memory" clobber
 *   _release   - "l" suffix,  "memory" clobber
 *   (no name)  - "al" suffix, "memory" clobber
 */
#define ATOMIC_FETCH_OPS(op, asm_op)                                        \
        ATOMIC_FETCH_OP(_relaxed,   , op, asm_op)                        \
        ATOMIC_FETCH_OP(_acquire,  a, op, asm_op, "memory")                \
        ATOMIC_FETCH_OP(_release,  l, op, asm_op, "memory")                \
        ATOMIC_FETCH_OP(        , al, op, asm_op, "memory")

/* fetch_andnot, fetch_or, fetch_xor and fetch_add in all four flavours. */
ATOMIC_FETCH_OPS(andnot, ldclr)
ATOMIC_FETCH_OPS(or, ldset)
ATOMIC_FETCH_OPS(xor, ldeor)
ATOMIC_FETCH_OPS(add, ldadd)

#undef ATOMIC_FETCH_OP
#undef ATOMIC_FETCH_OPS

/*
 * Generate __lse_atomic_fetch_sub<name>() on top of the fetch_add variant
 * carrying the same ordering suffix: subtracting @i is adding -@i.
 */
#define ATOMIC_FETCH_OP_SUB(name)                                        \
static __always_inline int                                                \
__lse_atomic_fetch_sub##name(int i, atomic_t *v)                        \
{                                                                        \
        const int delta = -i;                                                \
                                                                        \
        return __lse_atomic_fetch_add##name(delta, v);                        \
}

ATOMIC_FETCH_OP_SUB(_relaxed)
ATOMIC_FETCH_OP_SUB(_acquire)
ATOMIC_FETCH_OP_SUB(_release)
ATOMIC_FETCH_OP_SUB(        )

#undef ATOMIC_FETCH_OP_SUB

/*
 * Generate __lse_atomic_add_return<name>() and __lse_atomic_sub_return<name>().
 *
 * Each is layered on the fetch variant with the same ordering suffix.  The
 * fetch op hands back the counter value from before the update, so applying
 * @i once more in C gives the value after the update.
 *
 * The ##name paste on __lse_atomic_fetch_sub is required: without it every
 * sub_return flavour (_relaxed, _acquire, _release) would expand to the
 * fully ordered __lse_atomic_fetch_sub(), which is stronger than the name
 * promises and inconsistent with ATOMIC64_OP_ADD_SUB_RETURN.
 */
#define ATOMIC_OP_ADD_SUB_RETURN(name)                                        \
static __always_inline int                                                \
__lse_atomic_add_return##name(int i, atomic_t *v)                        \
{                                                                        \
        return __lse_atomic_fetch_add##name(i, v) + i;                        \
}                                                                        \
                                                                        \
static __always_inline int                                                \
__lse_atomic_sub_return##name(int i, atomic_t *v)                        \
{                                                                        \
        return __lse_atomic_fetch_sub##name(i, v) - i;                        \
}

ATOMIC_OP_ADD_SUB_RETURN(_relaxed)
ATOMIC_OP_ADD_SUB_RETURN(_acquire)
ATOMIC_OP_ADD_SUB_RETURN(_release)
ATOMIC_OP_ADD_SUB_RETURN(        )

#undef ATOMIC_OP_ADD_SUB_RETURN

/*
 * AND @i into @v, expressed as andnot with the complemented mask
 * (v & i == v & ~(~i)).
 *
 * The helper returns void, so it is simply called; no `return <expr>;` is
 * used in this void function.
 */
static __always_inline void __lse_atomic_and(int i, atomic_t *v)
{
        __lse_atomic_andnot(~i, v);
}

/*
 * Generate __lse_atomic_fetch_and<name>() in terms of fetch_andnot with the
 * same ordering suffix, passing the complement of @i as the mask.  The @mb
 * and @cl macro arguments are accepted but not used by the expansion.
 */
#define ATOMIC_FETCH_OP_AND(name, mb, cl...)                                \
static __always_inline int                                                \
__lse_atomic_fetch_and##name(int i, atomic_t *v)                        \
{                                                                        \
        const int clear_mask = ~i;                                        \
                                                                        \
        return __lse_atomic_fetch_andnot##name(clear_mask, v);                \
}

ATOMIC_FETCH_OP_AND(_relaxed,   )
ATOMIC_FETCH_OP_AND(_acquire,  a, "memory")
ATOMIC_FETCH_OP_AND(_release,  l, "memory")
ATOMIC_FETCH_OP_AND(        , al, "memory")

#undef ATOMIC_FETCH_OP_AND

/*
 * Generate __lse_atomic64_<op>(s64 i, atomic64_t *v): the 64-bit counterpart
 * of ATOMIC_OP.  A single <asm_op> instruction updates v->counter using @i
 * in its default (unmodified) register view.  Nothing is returned and no
 * clobbers are declared.
 */
#define ATOMIC64_OP(op, asm_op)                                                \
static __always_inline void                                                \
__lse_atomic64_##op(s64 i, atomic64_t *v)                                \
{                                                                        \
        asm volatile(                                                        \
        __LSE_PREAMBLE                                                        \
        "        " #asm_op "        %[i], %[v]\n"                                \
        : [v] "+Q" (v->counter)                                                \
        : [i] "r" (i));                                                        \
}

/* Instantiate the void 64-bit ops: andnot, or, xor and add. */
ATOMIC64_OP(andnot, stclr)
ATOMIC64_OP(or, stset)
ATOMIC64_OP(xor, steor)
ATOMIC64_OP(add, stadd)

/* Subtract @i from @v by atomically adding the negation of @i. */
static __always_inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
{
        const s64 delta = -i;

        __lse_atomic64_add(delta, v);
}

#undef ATOMIC64_OP

/*
 * Generate __lse_atomic64_fetch_<op><name>(s64 i, atomic64_t *v).
 *
 * Issues a single <asm_op><mb> instruction on v->counter with operand @i and
 * returns whatever the instruction placed in its destination register
 * (`old`).  @mb is pasted onto the mnemonic and @cl (possibly empty) becomes
 * the asm clobber list.
 *
 * Note the generated function is declared to return long while the local
 * result variable is s64.
 */
#define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)                        \
static __always_inline long                                                \
__lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)                        \
{                                                                        \
        s64 old;                                                        \
                                                                        \
        asm volatile(                                                        \
        __LSE_PREAMBLE                                                        \
        "        " #asm_op #mb "        %[i], %[old], %[v]"                        \
        : [v] "+Q" (v->counter),                                        \
          [old] "=r" (old)                                                \
        : [i] "r" (i)                                                         \
        : cl);                                                                \
                                                                        \
        return old;                                                        \
}

/*
 * Instantiate the four flavours of one 64-bit fetch op:
 *   _relaxed   - no mnemonic suffix, no clobber
 *   _acquire   - "a" suffix,  "memory" clobber
 *   _release   - "l" suffix,  "memory" clobber
 *   (no name)  - "al" suffix, "memory" clobber
 */
#define ATOMIC64_FETCH_OPS(op, asm_op)                                        \
        ATOMIC64_FETCH_OP(_relaxed,   , op, asm_op)                        \
        ATOMIC64_FETCH_OP(_acquire,  a, op, asm_op, "memory")                \
        ATOMIC64_FETCH_OP(_release,  l, op, asm_op, "memory")                \
        ATOMIC64_FETCH_OP(        , al, op, asm_op, "memory")

/* fetch_andnot, fetch_or, fetch_xor and fetch_add in all four flavours. */
ATOMIC64_FETCH_OPS(andnot, ldclr)
ATOMIC64_FETCH_OPS(or, ldset)
ATOMIC64_FETCH_OPS(xor, ldeor)
ATOMIC64_FETCH_OPS(add, ldadd)

#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_FETCH_OPS

/*
 * Generate __lse_atomic64_fetch_sub<name>() on top of the fetch_add variant
 * carrying the same ordering suffix: subtracting @i is adding -@i.
 */
#define ATOMIC64_FETCH_OP_SUB(name)                                        \
static __always_inline long                                                \
__lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)                        \
{                                                                        \
        const s64 delta = -i;                                                \
                                                                        \
        return __lse_atomic64_fetch_add##name(delta, v);                \
}

ATOMIC64_FETCH_OP_SUB(_relaxed)
ATOMIC64_FETCH_OP_SUB(_acquire)
ATOMIC64_FETCH_OP_SUB(_release)
ATOMIC64_FETCH_OP_SUB(        )

#undef ATOMIC64_FETCH_OP_SUB

/*
 * Generate __lse_atomic64_add_return<name>() and
 * __lse_atomic64_sub_return<name>().  Each calls the fetch variant with the
 * same ordering suffix and then applies @i to the fetched value in C to
 * produce the post-update value.
 */
#define ATOMIC64_OP_ADD_SUB_RETURN(name)                                \
static __always_inline long                                                \
__lse_atomic64_add_return##name(s64 i, atomic64_t *v)                        \
{                                                                        \
        long fetched = __lse_atomic64_fetch_add##name(i, v);                \
                                                                        \
        return fetched + i;                                                \
}                                                                        \
                                                                        \
static __always_inline long                                                \
__lse_atomic64_sub_return##name(s64 i, atomic64_t *v)                        \
{                                                                        \
        long fetched = __lse_atomic64_fetch_sub##name(i, v);                \
                                                                        \
        return fetched - i;                                                \
}

ATOMIC64_OP_ADD_SUB_RETURN(_relaxed)
ATOMIC64_OP_ADD_SUB_RETURN(_acquire)
ATOMIC64_OP_ADD_SUB_RETURN(_release)
ATOMIC64_OP_ADD_SUB_RETURN(        )

#undef ATOMIC64_OP_ADD_SUB_RETURN

/*
 * AND @i into @v, expressed as andnot with the complemented mask
 * (v & i == v & ~(~i)).
 *
 * The helper returns void, so it is simply called; no `return <expr>;` is
 * used in this void function.
 */
static __always_inline void __lse_atomic64_and(s64 i, atomic64_t *v)
{
        __lse_atomic64_andnot(~i, v);
}

/*
 * Generate __lse_atomic64_fetch_and<name>() in terms of fetch_andnot with
 * the same ordering suffix, passing the complement of @i as the mask.  The
 * @mb and @cl macro arguments are accepted but not used by the expansion.
 */
#define ATOMIC64_FETCH_OP_AND(name, mb, cl...)                                \
static __always_inline long                                                \
__lse_atomic64_fetch_and##name(s64 i, atomic64_t *v)                        \
{                                                                        \
        const s64 clear_mask = ~i;                                        \
                                                                        \
        return __lse_atomic64_fetch_andnot##name(clear_mask, v);        \
}

ATOMIC64_FETCH_OP_AND(_relaxed,   )
ATOMIC64_FETCH_OP_AND(_acquire,  a, "memory")
ATOMIC64_FETCH_OP_AND(_release,  l, "memory")
ATOMIC64_FETCH_OP_AND(        , al, "memory")

#undef ATOMIC64_FETCH_OP_AND

/*
 * Decrement v->counter unless the result would be negative.
 *
 * Returns the current counter value minus one.  When that result is
 * negative the b.lt skips the store, so the counter is left unmodified.
 *
 * The register that initially carries the @v pointer is reused as the
 * [ret] in/out operand: the asm overwrites it with the decremented value,
 * which is why the function ends with a cast of `v` rather than of a
 * separate result variable.  Memory is addressed through the [v] "+Q"
 * operand instead.
 */
static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
{
        unsigned long tmp;

        asm volatile(
        __LSE_PREAMBLE
        /* tmp = current counter value */
        "1:        ldr        %x[tmp], %[v]\n"
        /* ret = tmp - 1, setting flags; bail out if the result is < 0 */
        "        subs        %[ret], %x[tmp], #1\n"
        "        b.lt        2f\n"
        /* compare-and-swap: expected value in tmp, new value in ret */
        "        casal        %x[tmp], %[ret], %[v]\n"
        /*
         * tmp = tmp - 1 - ret.  This is zero only if the value left in tmp
         * by casal still equals ret + 1, i.e. the value loaded above;
         * anything else means the counter changed underneath us: retry.
         */
        "        sub        %x[tmp], %x[tmp], #1\n"
        "        sub        %x[tmp], %x[tmp], %[ret]\n"
        "        cbnz        %x[tmp], 1b\n"
        "2:"
        : [ret] "+&r" (v), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)
        :
        : "cc", "memory");

        return (long)v;
}

/*
 * Generate __lse__cmpxchg_case_<name><sz>(ptr, old, new).
 *
 *   w    - register-view modifier for the old/new operands ("w" or "x")
 *   sfx  - size suffix pasted onto the mnemonic ("b", "h" or empty)
 *   name - ordering prefix used in the function name (empty, acq_, rel_, mb_)
 *   sz   - operand width in bits; selects the u<sz> parameter/return type
 *   mb   - ordering suffix pasted onto the mnemonic (empty, a, l, al)
 *   cl   - optional asm clobber list
 *
 * A single cas<mb><sfx> instruction operates on *ptr.  @old is an in/out
 * ("+r") operand and its value after the instruction is what the function
 * returns; @new may be a register or the zero register ("rZ").
 */
#define __CMPXCHG_CASE(w, sfx, name, sz, mb, cl...)                        \
static __always_inline u##sz                                                \
__lse__cmpxchg_case_##name##sz(volatile void *ptr,                        \
                                              u##sz old,                \
                                              u##sz new)                \
{                                                                        \
        asm volatile(                                                        \
        __LSE_PREAMBLE                                                        \
        "        cas" #mb #sfx "        %" #w "[old], %" #w "[new], %[v]\n"        \
        : [v] "+Q" (*(u##sz *)ptr),                                        \
          [old] "+r" (old)                                                \
        : [new] "rZ" (new)                                                \
        : cl);                                                                \
                                                                        \
        return old;                                                        \
}

/* 8/16/32/64-bit variants for each of: no suffix, acq_, rel_ and mb_. */
__CMPXCHG_CASE(w, b,     ,  8,   )
__CMPXCHG_CASE(w, h,     , 16,   )
__CMPXCHG_CASE(w,  ,     , 32,   )
__CMPXCHG_CASE(x,  ,     , 64,   )
__CMPXCHG_CASE(w, b, acq_,  8,  a, "memory")
__CMPXCHG_CASE(w, h, acq_, 16,  a, "memory")
__CMPXCHG_CASE(w,  , acq_, 32,  a, "memory")
__CMPXCHG_CASE(x,  , acq_, 64,  a, "memory")
__CMPXCHG_CASE(w, b, rel_,  8,  l, "memory")
__CMPXCHG_CASE(w, h, rel_, 16,  l, "memory")
__CMPXCHG_CASE(w,  , rel_, 32,  l, "memory")
__CMPXCHG_CASE(x,  , rel_, 64,  l, "memory")
__CMPXCHG_CASE(w, b,  mb_,  8, al, "memory")
__CMPXCHG_CASE(w, h,  mb_, 16, al, "memory")
__CMPXCHG_CASE(w,  ,  mb_, 32, al, "memory")
__CMPXCHG_CASE(x,  ,  mb_, 64, al, "memory")

#undef __CMPXCHG_CASE

/*
 * Generate __lse__cmpxchg128<name>(ptr, old, new): a 128-bit compare and
 * swap built on a single casp<mb> instruction.
 *
 * @old and @new are split into 64-bit halves and pinned to x0/x1 and x2/x3
 * respectively via explicit register variables.  x0/x1 are in/out operands;
 * their values after the instruction are reassembled into the returned u128.
 *
 * The [ptr], [oldval1] and [oldval2] input operands are declared but never
 * referenced in the asm template.
 * NOTE(review): presumably they exist to influence register allocation -
 * confirm before removing.
 */
#define __CMPXCHG128(name, mb, cl...)                                        \
static __always_inline u128                                                \
__lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)                \
{                                                                        \
        union __u128_halves r, o = { .full = (old) },                        \
                               n = { .full = (new) };                        \
        register unsigned long x0 asm ("x0") = o.low;                        \
        register unsigned long x1 asm ("x1") = o.high;                        \
        register unsigned long x2 asm ("x2") = n.low;                        \
        register unsigned long x3 asm ("x3") = n.high;                        \
        register unsigned long x4 asm ("x4") = (unsigned long)ptr;        \
                                                                        \
        asm volatile(                                                        \
        __LSE_PREAMBLE                                                        \
        "        casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
        : [old1] "+&r" (x0), [old2] "+&r" (x1),                                \
          [v] "+Q" (*(u128 *)ptr)                                        \
        : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4),                \
          [oldval1] "r" (o.low), [oldval2] "r" (o.high)                        \
        : cl);                                                                \
                                                                        \
        r.low = x0; r.high = x1;                                        \
                                                                        \
        return r.full;                                                        \
}

/* Plain variant (no suffix, no clobber) and _mb variant ("al", "memory"). */
__CMPXCHG128(   ,   )
__CMPXCHG128(_mb, al, "memory")

#undef __CMPXCHG128

#endif        /* __ASM_ATOMIC_LSE_H */



































































  318 














  342 






  256 













  291 







































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * NUMA memory policies for Linux.
 * Copyright 2003,2004 Andi Kleen SuSE Labs
 */
#ifndef _LINUX_MEMPOLICY_H
#define _LINUX_MEMPOLICY_H 1

#include <linux/sched.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <uapi/linux/mempolicy.h>

struct mm_struct;

#define NO_INTERLEAVE_INDEX (-1UL)        /* use task il_prev for interleaving */

#ifdef CONFIG_NUMA

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interleave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_lock.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        atomic_t refcnt;        /* reference count; incremented by mpol_get() */
        unsigned short mode;         /* MPOL_* policy mode (the MPOL_* values are not defined in this header) */
        unsigned short flags;        /* MPOL_F_* flags, e.g. MPOL_F_SHARED; see set_mempolicy() */
        nodemask_t nodes;        /* interleave/bind/preferred/etc */
        int home_node;                /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */

        /* Two alternative nodemasks sharing the same storage. */
        union {
                nodemask_t cpuset_mems_allowed;        /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

/*
 * Support for managing mempolicy data objects (clone, copy, destroy)
 * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
 */

extern void __mpol_put(struct mempolicy *pol);
/* Drop a reference on @pol via __mpol_put(); a NULL @pol is a no-op. */
static inline void mpol_put(struct mempolicy *pol)
{
        if (!pol)
                return;

        __mpol_put(pol);
}

/*
 * Report whether @pol must be explicitly unreferenced after use.
 * That is the case only for a non-NULL policy with MPOL_F_SHARED set.
 * Returns 1 if so, 0 otherwise.
 */
static inline int mpol_needs_cond_ref(struct mempolicy *pol)
{
        if (!pol)
                return 0;

        return (pol->flags & MPOL_F_SHARED) != 0;
}

/* Drop a reference on @pol only when mpol_needs_cond_ref() says one is held. */
static inline void mpol_cond_put(struct mempolicy *pol)
{
        if (!mpol_needs_cond_ref(pol))
                return;

        __mpol_put(pol);
}

extern struct mempolicy *__mpol_dup(struct mempolicy *pol);
static inline struct mempolicy *mpol_dup(struct mempolicy *pol)
{
        if (pol)
                pol = __mpol_dup(pol);
        return pol;
}

/* Take a reference on @pol by bumping its refcnt; a NULL @pol is a no-op. */
static inline void mpol_get(struct mempolicy *pol)
{
        if (!pol)
                return;

        atomic_inc(&pol->refcnt);
}

extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b);
/*
 * Compare two policies.  Identical pointers (including two NULLs) are equal
 * without further inspection; everything else is decided by __mpol_equal().
 */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return a == b || __mpol_equal(a, b);
}

/*
 * Tree of shared policies for a shared memory region.
 *
 * Holds an rbtree root together with an rwlock.
 * NOTE(review): the lock presumably guards the tree - the locking rules are
 * not visible in this header.
 */
struct shared_policy {
        struct rb_root root;
        rwlock_t lock;
};
/*
 * One rbtree node associating a page-offset range with a policy.
 * NOTE(review): presumably linked into shared_policy.root, and whether @end
 * is inclusive or exclusive is not visible here - confirm against the users.
 */
struct sp_node {
        struct rb_node nd;                /* rbtree linkage */
        pgoff_t start, end;                /* page-offset range of this node */
        struct mempolicy *policy;        /* policy attached to the range */
};

int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst);
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
int mpol_set_shared_policy(struct shared_policy *sp,
                           struct vm_area_struct *vma, struct mempolicy *mpol);
void mpol_free_shared_policy(struct shared_policy *sp);
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                            pgoff_t idx);

struct mempolicy *get_task_policy(struct task_struct *p);
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, pgoff_t *ilx);
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                unsigned long addr, int order, pgoff_t *ilx);
bool vma_policy_mof(struct vm_area_struct *vma);

extern void numa_default_policy(void);
extern void numa_policy_init(void);
extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new);
extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);

extern int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask);
extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                const nodemask_t *mask);
extern unsigned int mempolicy_slab_node(void);

extern enum zone_type policy_zone;

/*
 * Raise policy_zone to @k when @k is a higher zone than the one currently
 * recorded.  ZONE_MOVABLE never becomes the policy zone.
 */
static inline void check_highest_zone(enum zone_type k)
{
        if (k == ZONE_MOVABLE || k <= policy_zone)
                return;

        policy_zone = k;
}

int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags);


#ifdef CONFIG_TMPFS
extern int mpol_parse_str(char *str, struct mempolicy **mpol);
#endif

extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);

/* Check if a vma is migratable */
extern bool vma_migratable(struct vm_area_struct *vma);

int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                                        unsigned long addr);
extern void mpol_put_task_policy(struct task_struct *);

/* True when @pol's mode is MPOL_PREFERRED_MANY.  @pol must not be NULL. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        const unsigned short mode = pol->mode;

        return mode == MPOL_PREFERRED_MANY;
}

extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);

#else

/*
 * !CONFIG_NUMA stubs: empty types and no-op inline replacements so callers
 * compile unchanged without NUMA support.
 */
struct mempolicy {};

/* Without NUMA there is never a task policy. */
static inline struct mempolicy *get_task_policy(struct task_struct *p)
{
        return NULL;
}

/* Any two policies compare equal. */
static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        return true;
}

/* Reference counting helpers are no-ops. */
static inline void mpol_put(struct mempolicy *pol)
{
}

static inline void mpol_cond_put(struct mempolicy *pol)
{
}

static inline void mpol_get(struct mempolicy *pol)
{
}

struct shared_policy {};

static inline void mpol_shared_policy_init(struct shared_policy *sp,
                                                struct mempolicy *mpol)
{
}

static inline void mpol_free_shared_policy(struct shared_policy *sp)
{
}

/* Lookup never finds a policy. */
static inline struct mempolicy *
mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
{
        return NULL;
}

/* Always reports no policy and writes 0 through @ilx (which must be valid). */
static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                unsigned long addr, int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}

/* Nothing to duplicate; always returns 0. */
static inline int
vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        return 0;
}

static inline void numa_policy_init(void)
{
}

static inline void numa_default_policy(void)
{
}

static inline void mpol_rebind_task(struct task_struct *tsk,
                                const nodemask_t *new)
{
}

static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
}

/* Clears both output pointers (which must be valid) and returns 0. */
static inline int huge_node(struct vm_area_struct *vma,
                                unsigned long addr, gfp_t gfp_flags,
                                struct mempolicy **mpol, nodemask_t **nodemask)
{
        *mpol = NULL;
        *nodemask = NULL;
        return 0;
}

/* Always false; @m is left untouched. */
static inline bool init_nodemask_of_mempolicy(nodemask_t *m)
{
        return false;
}

/* Nothing to migrate; always returns 0. */
static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                                   const nodemask_t *to, int flags)
{
        return 0;
}

/* Takes a plain int here, unlike the CONFIG_NUMA version (enum zone_type). */
static inline void check_highest_zone(int k)
{
}

#ifdef CONFIG_TMPFS
/* Parsing always fails; @mpol is not written. */
static inline int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        return 1;        /* error */
}
#endif

static inline int mpol_misplaced(struct folio *folio,
                                 struct vm_fault *vmf,
                                 unsigned long address)
{
        return -1; /* no node preference */
}

static inline void mpol_put_task_policy(struct task_struct *task)
{
}

/* Always false; @pol is not dereferenced. */
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
{
        return  false;
}

#endif /* CONFIG_NUMA */
#endif




































  255 

  255 

  255 













  255 
  255 

  255 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Generic Timer-queue
 *
 *  Manages a simple queue of timers, ordered by expiration time.
 *  Uses rbtrees for quick list adds and expiration.
 *
 *  NOTE: All of the following functions need to be serialized
 *  to avoid races. No locking is done by this library code.
 */

#include <linux/bug.h>
#include <linux/timerqueue.h>
#include <linux/rbtree.h>
#include <linux/export.h>

#define __node_2_tq(_n) \
        rb_entry((_n), struct timerqueue_node, node)

static inline bool __timerqueue_less(struct rb_node *a, const struct rb_node *b)
{
        return __node_2_tq(a)->expires < __node_2_tq(b)->expires;
}

/**
 * timerqueue_add - Insert a timer into a timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be added
 *
 * Inserts @node into the queue ordered by its expires value.  A one-time
 * warning is emitted if @node does not look unqueued (its rb_node is not
 * marked empty).
 *
 * Returns true if the newly added timer is the first expiring timer in
 * the queue.
 */
bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node)
{
        bool is_first;

        /* A node that is already queued must not be inserted again. */
        WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node));

        is_first = rb_add_cached(&node->node, &head->rb_root, __timerqueue_less);

        return is_first;
}
EXPORT_SYMBOL_GPL(timerqueue_add);

/**
 * timerqueue_del - Remove a timer from a timerqueue.
 *
 * @head: head of timerqueue
 * @node: timer node to be removed
 *
 * Erases @node from the queue and marks its rb_node as empty again.  A
 * one-time warning is emitted if @node was not queued.
 *
 * Returns true if the queue is not empty after the removal.
 */
bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node)
{
        bool now_empty;

        /* Removing a node that was never queued indicates a caller bug. */
        WARN_ON_ONCE(RB_EMPTY_NODE(&node->node));

        rb_erase_cached(&node->node, &head->rb_root);
        RB_CLEAR_NODE(&node->node);

        now_empty = RB_EMPTY_ROOT(&head->rb_root.rb_root);

        return !now_empty;
}
EXPORT_SYMBOL_GPL(timerqueue_del);

/**
 * timerqueue_iterate_next - Return the timer following the given one.
 *
 * @node: Pointer to a timer.
 *
 * Walks to the in-order successor of @node so that callers can iterate over
 * the queued timers without modifying the queue.
 *
 * Returns the next timer, or NULL when @node is NULL or has no successor.
 */
struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node)
{
        struct rb_node *successor;

        if (!node)
                return NULL;

        successor = rb_next(&node->node);

        return successor ? __node_2_tq(successor) : NULL;
}
EXPORT_SYMBOL_GPL(timerqueue_iterate_next);


























































































































































































































































































































































































































































































































































































































































































































    1 





  156 

  156 
   18 
























   11 








  156 


















    1 





































  145 
  156 





  552 




















  552 


  552 






  328 

















  331 




  331 




  302 
  302 






    4 

    4 

    4 























































  285 



































  301 






   72 


























   54 





















































































  188 





   43 





  241 









































































   32 

























   32 

   13 












  231 
  382 

   32 








































  276 




  271 



















































































































   46 
















    5 




















   42 





























































































































  410 




  229 


  229 

























    6 
  271 
   47 


















   47 












































  156 




  158 






















































































   16 




























































































































































































































   16 









   16 

































































































































































































  471 




















   29 



























































  257 






































































































  580 









  229 
  261 





































































































































































































































































































































































  230 






















  179 








   55 























  153 
  285 




  164 
  223 



























































































































   53 























  158 




   82 





















  471 
  471 





  470 





  467 
   36 















   53 




































    7 


































  391 



















   53 























  169 


  168 
  169 






    7 
    7 
    7 



    7 
    7 
    7 




   53 




























































   40 




   40 




   40 





































































































































































































































































































  203 


























































































































  148 








































































    3 








































































































  307 
































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_H
#define _LINUX_MM_H

#include <linux/errno.h>
#include <linux/mmdebug.h>
#include <linux/gfp.h>
#include <linux/pgalloc_tag.h>
#include <linux/bug.h>
#include <linux/list.h>
#include <linux/mmzone.h>
#include <linux/rbtree.h>
#include <linux/atomic.h>
#include <linux/debug_locks.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/range.h>
#include <linux/pfn.h>
#include <linux/percpu-refcount.h>
#include <linux/bit_spinlock.h>
#include <linux/shrinker.h>
#include <linux/resource.h>
#include <linux/page_ext.h>
#include <linux/err.h>
#include <linux/page-flags.h>
#include <linux/page_ref.h>
#include <linux/overflow.h>
#include <linux/sizes.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/kasan.h>
#include <linux/memremap.h>
#include <linux/slab.h>
#include <linux/cacheinfo.h>
#include <linux/rcuwait.h>

struct mempolicy;
struct anon_vma;
struct anon_vma_chain;
struct user_struct;
struct pt_regs;
struct folio_batch;

void arch_mm_preinit(void);
void mm_core_init(void);
void init_mm_internals(void);

extern atomic_long_t _totalram_pages;
/*
 * Read the global _totalram_pages counter and hand it back as an
 * unsigned long.
 */
static inline unsigned long totalram_pages(void)
{
        long nr = atomic_long_read(&_totalram_pages);

        return (unsigned long)nr;
}

/* Increment the global _totalram_pages counter by one via atomic_long_inc(). */
static inline void totalram_pages_inc(void)
{
        atomic_long_inc(&_totalram_pages);
}

/* Decrement the global _totalram_pages counter by one via atomic_long_dec(). */
static inline void totalram_pages_dec(void)
{
        atomic_long_dec(&_totalram_pages);
}

/*
 * Add @count to the global _totalram_pages counter via atomic_long_add().
 * @count is a signed long, so callers may pass a negative delta.
 */
static inline void totalram_pages_add(long count)
{
        atomic_long_add(count, &_totalram_pages);
}

extern void * high_memory;

/*
 * With CONFIG_SYSCTL, sysctl_legacy_va_layout is a real int variable defined
 * elsewhere.  Without it, the name expands to the constant 0, so code that
 * tests it still compiles and the test is always false.
 */
#ifdef CONFIG_SYSCTL
extern int sysctl_legacy_va_layout;
#else
#define sysctl_legacy_va_layout 0
#endif

/*
 * mmap randomization tunables; each group is declared only when the matching
 * CONFIG_HAVE_ARCH_MMAP_RND_* option is set.
 * NOTE(review): presumably *_min/*_max bound the corresponding *_bits value;
 * confirm against the definitions.
 *
 * Note the asymmetry: the native maximum is a writable int marked
 * __ro_after_init, while the compat maximum is const.
 */
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
extern const int mmap_rnd_bits_min;
extern int mmap_rnd_bits_max __ro_after_init;
extern int mmap_rnd_bits __read_mostly;
#endif
#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
extern const int mmap_rnd_compat_bits_min;
extern const int mmap_rnd_compat_bits_max;
extern int mmap_rnd_compat_bits __read_mostly;
#endif

/*
 * Default for DIRECT_MAP_PHYSMEM_END, used only if nothing defined it earlier.
 *
 * - If MAX_PHYSMEM_BITS is known: the largest value representable in that
 *   many bits, i.e. (1 << MAX_PHYSMEM_BITS) - 1.
 * - Otherwise: an all-ones phys_addr_t with bit 63 masked off.
 */
#ifndef DIRECT_MAP_PHYSMEM_END
# ifdef MAX_PHYSMEM_BITS
# define DIRECT_MAP_PHYSMEM_END        ((1ULL << MAX_PHYSMEM_BITS) - 1)
# else
# define DIRECT_MAP_PHYSMEM_END        (((phys_addr_t)-1)&~(1ULL<<63))
# endif
#endif

#include <asm/page.h>
#include <asm/processor.h>

/*
 * Generic fallbacks for address-conversion helpers.  Each one is defined here
 * only if the architecture headers included above did not already provide it.
 */

/* Symbol address -> physical address: cast to unsigned long, pass through RELOC_HIDE(), then __pa(). */
#ifndef __pa_symbol
#define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
#endif

/* struct page -> virtual address, chained as page_to_pfn() -> PFN_PHYS() -> __va(). */
#ifndef page_to_virt
#define page_to_virt(x)        __va(PFN_PHYS(page_to_pfn(x)))
#endif

/* Symbol address -> __va() of its physical address (as computed by __pa_symbol()). */
#ifndef lm_alias
#define lm_alias(x)        __va(__pa_symbol(x))
#endif

/*
 * Architectures define this macro to prevent common memory management code
 * from establishing a zero page mapping on a read fault.
 * The override should be defined within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)        (0)
#endif

/*
 * On some architectures it is expensive to call memset() for small sizes.
 * If an architecture decides to implement their own version of
 * mm_zero_struct_page they should wrap the defines below in a #ifndef and
 * define their own version of this macro in <asm/pgtable.h>
 */
#if BITS_PER_LONG == 64
/* This function must be updated when the size of struct page grows above 96
 * or shrinks below 56. The idea is that the compiler optimizes out the
 * switch() statement and only leaves move/store instructions. Also, the
 * compiler can combine write statements if they are both assignments and can
 * be reordered; this can result in several of the writes here being dropped.
 */
#define        mm_zero_struct_page(pp) __mm_zero_struct_page(pp)
/*
 * Zero a struct page using individual unsigned long stores rather than
 * memset().  This variant is built only when BITS_PER_LONG == 64, so each
 * store clears 8 bytes.
 *
 * sizeof(struct page) is a compile-time constant, so the switch picks exactly
 * one entry label; execution then falls through every smaller case, clearing
 * words from the highest index for that size down to word 0.
 */
static inline void __mm_zero_struct_page(struct page *page)
{
        /* View the page as an array of machine words. */
        unsigned long *_pp = (void *)page;

         /* Check that struct page is either 56, 64, 72, 80, 88 or 96 bytes */
        BUILD_BUG_ON(sizeof(struct page) & 7);
        BUILD_BUG_ON(sizeof(struct page) < 56);
        BUILD_BUG_ON(sizeof(struct page) > 96);

        /*
         * No default label: the BUILD_BUG_ON()s above reject any size that
         * does not match one of the cases below.
         */
        switch (sizeof(struct page)) {
        case 96:
                _pp[11] = 0;
                fallthrough;
        case 88:
                _pp[10] = 0;
                fallthrough;
        case 80:
                _pp[9] = 0;
                fallthrough;
        case 72:
                _pp[8] = 0;
                fallthrough;
        case 64:
                _pp[7] = 0;
                fallthrough;
        case 56:
                /* The first seven words exist for every permitted size. */
                _pp[6] = 0;
                _pp[5] = 0;
                _pp[4] = 0;
                _pp[3] = 0;
                _pp[2] = 0;
                _pp[1] = 0;
                _pp[0] = 0;
        }
}
#else
#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
#endif

/*
 * Default maximum number of active map areas; this limits the number of vmas
 * per mm struct. Users can override this number via sysctl, but there is a
 * problem.
 *
 * When a program's coredump is generated in ELF format, one section is created
 * per vma. In ELF, the number of sections is represented as an unsigned short.
 * This means the number of sections should be smaller than 65535 at coredump.
 * Because the kernel adds some informative sections to the image of the
 * program when generating the coredump, we need some margin. The number of
 * extra sections is 1-3 now and depends on arch. We use "5" as a safe margin
 * here.
 *
 * ELF extended numbering allows more than 65535 sections, so the 16-bit bound
 * is not a hard limit any more, although some userspace tools can be surprised
 * by that.
 */
#define MAPCOUNT_ELF_CORE_MARGIN        (5)
#define DEFAULT_MAX_MAP_COUNT        (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)

extern int sysctl_max_map_count;

extern unsigned long sysctl_user_reserve_kbytes;
extern unsigned long sysctl_admin_reserve_kbytes;

/*
 * nth_page(page, n):        the struct page @n entries after @page.
 * folio_page_idx(folio, p): index of page @p within @folio.
 *
 * With SPARSEMEM but without SPARSEMEM_VMEMMAP both go through PFN
 * conversions; in every other configuration they are plain pointer arithmetic
 * on struct page.
 * NOTE(review): presumably the PFN round trip is needed because struct pages
 * are not guaranteed to be contiguous in that configuration; confirm.
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
#define folio_page_idx(folio, p)        (page_to_pfn(p) - folio_pfn(folio))
#else
#define nth_page(page,n) ((page) + (n))
#define folio_page_idx(folio, p)        ((p) - &(folio)->page)
#endif

/* to align the pointer to the (next) page boundary */
#define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE)

/* to align the pointer to the (prev) page boundary */
#define PAGE_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PAGE_SIZE)

/* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */
#define PAGE_ALIGNED(addr)        IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)

/*
 * Map a list head to a folio: take the entry that precedes @head
 * (head->prev) and return the struct folio that embeds it as its
 * 'lru' member.
 */
static inline struct folio *lru_to_folio(struct list_head *head)
{
        struct list_head *tail = head->prev;

        return list_entry(tail, struct folio, lru);
}

void setup_initial_init_mm(void *start_code, void *end_code,
                           void *end_data, void *brk);

/*
 * Linux kernel virtual memory manager primitives.
 * The idea being to have a "virtual" mm in the same way
 * we have a virtual fs - giving a cleaner interface to the
 * mm details, and allowing different kinds of memory mappings
 * (from shared memory to executable loading to arbitrary
 * mmap() functions).
 */

struct vm_area_struct *vm_area_alloc(struct mm_struct *);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
void vm_area_free(struct vm_area_struct *);

#ifndef CONFIG_MMU
extern struct rb_root nommu_region_tree;
extern struct rw_semaphore nommu_region_sem;

extern unsigned int kobjsize(const void *objp);
#endif

/*
 * vm_flags in vm_area_struct, see mm_types.h.
 * When changing, update also include/trace/events/mmflags.h
 */
#define VM_NONE                0x00000000

/*
 * Access flags occupy the low nibble (0x1-0x8).  Each VM_MAY* flag below is
 * the matching access flag shifted left by four bits (0x10-0x80).
 */
#define VM_READ                0x00000001        /* currently active flags */
#define VM_WRITE        0x00000002
#define VM_EXEC                0x00000004
#define VM_SHARED        0x00000008

/* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */
#define VM_MAYREAD        0x00000010        /* limits for mprotect() etc */
#define VM_MAYWRITE        0x00000020
#define VM_MAYEXEC        0x00000040
#define VM_MAYSHARE        0x00000080

#define VM_GROWSDOWN        0x00000100        /* general info on the segment */
#ifdef CONFIG_MMU
#define VM_UFFD_MISSING        0x00000200        /* missing pages tracking */
#else /* CONFIG_MMU */
#define VM_MAYOVERLAY        0x00000200        /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */
#define VM_UFFD_MISSING        0
#endif /* CONFIG_MMU */
#define VM_PFNMAP        0x00000400        /* Page-ranges managed without "struct page", just pure PFN */
#define VM_UFFD_WP        0x00001000        /* wrprotect pages tracking */

#define VM_LOCKED        0x00002000
#define VM_IO           0x00004000        /* Memory mapped I/O or similar */

                                        /* Used by sys_madvise() */
#define VM_SEQ_READ        0x00008000        /* App will access data sequentially */
#define VM_RAND_READ        0x00010000        /* App will not benefit from clustered reads */

#define VM_DONTCOPY        0x00020000      /* Do not copy this vma on fork */
#define VM_DONTEXPAND        0x00040000        /* Cannot expand with mremap() */
#define VM_LOCKONFAULT        0x00080000        /* Lock the pages covered when they are faulted in */
#define VM_ACCOUNT        0x00100000        /* Is a VM accounted object */
#define VM_NORESERVE        0x00200000        /* should the VM suppress accounting */
#define VM_HUGETLB        0x00400000        /* Huge TLB Page VM */
#define VM_SYNC                0x00800000        /* Synchronous page faults */
#define VM_ARCH_1        0x01000000        /* Architecture-specific flag */
#define VM_WIPEONFORK        0x02000000        /* Wipe VMA contents in child. */
#define VM_DONTDUMP        0x04000000        /* Do not include in the core dump */

/*
 * Without CONFIG_MEM_SOFT_DIRTY the flag is defined as 0, so setting or
 * testing it is a no-op and callers need no #ifdef of their own.
 */
#ifdef CONFIG_MEM_SOFT_DIRTY
# define VM_SOFTDIRTY        0x08000000        /* Not soft dirty clean area */
#else
# define VM_SOFTDIRTY        0
#endif

#define VM_MIXEDMAP        0x10000000        /* Can contain "struct page" and pure PFN pages */
#define VM_HUGEPAGE        0x20000000        /* MADV_HUGEPAGE marked this vma */
#define VM_NOHUGEPAGE        0x40000000        /* MADV_NOHUGEPAGE marked this vma */
#define VM_MERGEABLE        0x80000000        /* KSM may merge identical pages */

/*
 * Architecture-specific flags above bit 31. Each VM_HIGH_ARCH_BIT_n is a
 * bit number (32..38) and the matching VM_HIGH_ARCH_n is the mask built
 * from it with BIT(). None of these exist unless
 * CONFIG_ARCH_USES_HIGH_VMA_FLAGS is set.
 */
#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
#define VM_HIGH_ARCH_BIT_0        32        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_1        33        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_2        34        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3        35        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4        36        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_5        37        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_6        38        /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0        BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1        BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2        BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3        BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4        BIT(VM_HIGH_ARCH_BIT_4)
#define VM_HIGH_ARCH_5        BIT(VM_HIGH_ARCH_BIT_5)
#define VM_HIGH_ARCH_6        BIT(VM_HIGH_ARCH_BIT_6)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

/*
 * Protection-key bits, carved out of the high arch flags starting at
 * VM_HIGH_ARCH_BIT_0. Bits 0-2 are always backed by a real flag; bit 3 and
 * bit 4 are only real when CONFIG_ARCH_PKEY_BITS is greater than 3 and 4
 * respectively, and are 0 otherwise.
 */
#ifdef CONFIG_ARCH_HAS_PKEYS
# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
# define VM_PKEY_BIT0  VM_HIGH_ARCH_0
# define VM_PKEY_BIT1  VM_HIGH_ARCH_1
# define VM_PKEY_BIT2  VM_HIGH_ARCH_2
#if CONFIG_ARCH_PKEY_BITS > 3
# define VM_PKEY_BIT3  VM_HIGH_ARCH_3
#else
# define VM_PKEY_BIT3  0
#endif
#if CONFIG_ARCH_PKEY_BITS > 4
# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
#else
# define VM_PKEY_BIT4  0
#endif
#endif /* CONFIG_ARCH_HAS_PKEYS */

#ifdef CONFIG_X86_USER_SHADOW_STACK
/*
 * VM_SHADOW_STACK should not be set with VM_SHARED because of the lack of
 * support in core mm.
 *
 * These VMAs will get a single end guard page. This helps userspace protect
 * itself from attacks. A single page is enough for current shadow stack archs
 * (x86). See the comments near alloc_shstk() in arch/x86/kernel/shstk.c
 * for more details on the guard size.
 */
# define VM_SHADOW_STACK        VM_HIGH_ARCH_5
#endif

#if defined(CONFIG_ARM64_GCS)
/*
 * arm64's Guarded Control Stack implements similar functionality and
 * has similar constraints to shadow stacks.
 */
# define VM_SHADOW_STACK        VM_HIGH_ARCH_6
#endif

/* Neither of the configurations above is enabled: the flag is empty. */
#ifndef VM_SHADOW_STACK
# define VM_SHADOW_STACK        VM_NONE
#endif

/*
 * Architecture-specific names for VM_ARCH_1. At most one branch of this
 * chain is taken, so the bit has a single meaning per build. The sparc64
 * and arm64 branches additionally define VM_ARCH_CLEAR as their flag.
 */
#if defined(CONFIG_X86)
# define VM_PAT                VM_ARCH_1        /* PAT reserves whole VMA at once (x86) */
#elif defined(CONFIG_PPC64)
# define VM_SAO                VM_ARCH_1        /* Strong Access Ordering (powerpc) */
#elif defined(CONFIG_PARISC)
# define VM_GROWSUP        VM_ARCH_1
#elif defined(CONFIG_SPARC64)
# define VM_SPARC_ADI        VM_ARCH_1        /* Uses ADI tag for access control */
# define VM_ARCH_CLEAR        VM_SPARC_ADI
#elif defined(CONFIG_ARM64)
# define VM_ARM64_BTI        VM_ARCH_1        /* BTI guarded page, a.k.a. GP bit */
# define VM_ARCH_CLEAR        VM_ARM64_BTI
#elif !defined(CONFIG_MMU)
# define VM_MAPPED_COPY        VM_ARCH_1        /* T if mapped copy of data (nommu mmap) */
#endif

/*
 * arm64 Memory Tagging Extension flags. Both collapse to VM_NONE when
 * CONFIG_ARM64_MTE is not set.
 */
#if defined(CONFIG_ARM64_MTE)
# define VM_MTE                VM_HIGH_ARCH_4        /* Use Tagged memory for access control */
# define VM_MTE_ALLOWED        VM_HIGH_ARCH_5        /* Tagged memory permitted */
#else
# define VM_MTE                VM_NONE
# define VM_MTE_ALLOWED        VM_NONE
#endif

/*
 * Fallback for builds where no architecture branch defined VM_GROWSUP:
 * the flag is VM_NONE.
 */
#ifndef VM_GROWSUP
# define VM_GROWSUP        VM_NONE
#endif

/*
 * Userfaultfd minor-fault tracking.
 *
 * The bit must not overlap any other VMA flag. Bit 38 is VM_HIGH_ARCH_BIT_6,
 * which backs VM_SHADOW_STACK when CONFIG_ARM64_GCS is enabled, and bits 39
 * and 40 are taken by VM_ALLOW_ANY_UNCACHED and VM_DROPPABLE, so bit 41 is
 * used here.
 */
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
# define VM_UFFD_MINOR_BIT        41
# define VM_UFFD_MINOR                BIT(VM_UFFD_MINOR_BIT)        /* UFFD minor faults */
#else /* !CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
# define VM_UFFD_MINOR                VM_NONE
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */

/*
 * This flag is used to connect VFIO to arch specific KVM code. It
 * indicates that the memory under this VMA is safe for use with any
 * non-cacheable memory type inside KVM. Some VFIO devices, on some
 * platforms, are thought to be unsafe and can cause machine crashes
 * if KVM does not lock down the memory type.
 *
 * The flag occupies bit 39 and therefore only exists on CONFIG_64BIT
 * builds; elsewhere it is VM_NONE.
 */
#ifdef CONFIG_64BIT
#define VM_ALLOW_ANY_UNCACHED_BIT        39
#define VM_ALLOW_ANY_UNCACHED                BIT(VM_ALLOW_ANY_UNCACHED_BIT)
#else
#define VM_ALLOW_ANY_UNCACHED                VM_NONE
#endif

/*
 * VM_DROPPABLE: bit 40 on CONFIG_64BIT builds, an alias of VM_ARCH_1 on
 * CONFIG_PPC32, and VM_NONE everywhere else.
 */
#ifdef CONFIG_64BIT
#define VM_DROPPABLE_BIT        40
#define VM_DROPPABLE                BIT(VM_DROPPABLE_BIT)
#elif defined(CONFIG_PPC32)
#define VM_DROPPABLE                VM_ARCH_1
#else
#define VM_DROPPABLE                VM_NONE
#endif

#ifdef CONFIG_64BIT
/*
 * VM is sealed, in vm_flags. Uses bit 63, so it is only defined here for
 * CONFIG_64BIT builds; this section provides no fallback definition.
 */
#define VM_SEALED        _BITUL(63)
#endif

/*
 * Bits set in the VMA until the stack is in its final location.
 * VM_STACK_EARLY is defined further down; that is fine because the macro is
 * only expanded at its point of use.
 */
#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

/* VM_EXEC if the current task's personality has READ_IMPLIES_EXEC, else 0 */
#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0)

/*
 * Common data flag combinations. All three grant read/write plus every
 * VM_MAY* permission; they differ only in VM_EXEC: decided per task
 * (TSK_EXEC), never set (NON_EXEC), or always set (EXEC).
 */
#define VM_DATA_FLAGS_TSK_EXEC        (VM_READ | VM_WRITE | TASK_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_NON_EXEC        (VM_READ | VM_WRITE | VM_MAYREAD | \
                                 VM_MAYWRITE | VM_MAYEXEC)
#define VM_DATA_FLAGS_EXEC        (VM_READ | VM_WRITE | VM_EXEC | \
                                 VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)

#ifndef VM_DATA_DEFAULT_FLAGS                /* arch can override this */
#define VM_DATA_DEFAULT_FLAGS  VM_DATA_FLAGS_EXEC
#endif

/* Stack mappings default to the data defaults unless the arch overrides. */
#ifndef VM_STACK_DEFAULT_FLAGS                /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
#endif

#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK)

/*
 * VM_STACK is the growth direction of the stack for this build.
 * VM_STACK_EARLY is VM_GROWSDOWN on CONFIG_STACK_GROWSUP builds and 0
 * otherwise; it feeds VM_STACK_INCOMPLETE_SETUP.
 */
#ifdef CONFIG_STACK_GROWSUP
#define VM_STACK        VM_GROWSUP
#define VM_STACK_EARLY        VM_GROWSDOWN
#else
#define VM_STACK        VM_GROWSDOWN
#define VM_STACK_EARLY        0
#endif

/* Growth direction + default stack permissions + accounting */
#define VM_STACK_FLAGS        (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT)

/* VMA basic access permission flags */
#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC)


/*
 * Special vmas that are non-mergeable, non-mlock()able.
 */
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)

/* This mask prevents a VMA from being scanned by khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)

/* This mask defines which mm->def_flags a process can inherit from its parent */
#define VM_INIT_DEF_MASK        VM_NOHUGEPAGE

/* This mask represents all the VMA flag bits used by mlock */
#define VM_LOCKED_MASK        (VM_LOCKED | VM_LOCKONFAULT)

/* Arch-specific flags to clear when updating VM flags on protection change */
#ifndef VM_ARCH_CLEAR
# define VM_ARCH_CLEAR        VM_NONE
#endif
#define VM_FLAGS_CLEAR        (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR)

/*
 * Mapping from the currently active vm_flags protection bits (the
 * low four bits) to a page protection mask.
 */

/*
 * The default fault flags that should be used by most of the
 * arch-specific page fault handlers: the fault may be retried, and the
 * wait may be cut short by a fatal signal (KILLABLE) or by any signal
 * (INTERRUPTIBLE).
 * NOTE(review): the per-flag meanings above are inferred from the flag
 * names; confirm against the enum fault_flag documentation.
 */
#define FAULT_FLAG_DEFAULT  (FAULT_FLAG_ALLOW_RETRY | \
                             FAULT_FLAG_KILLABLE | \
                             FAULT_FLAG_INTERRUPTIBLE)

/**
 * fault_flag_allow_retry_first - is this the first, retry-capable attempt?
 * @flags: Fault flags.
 *
 * Callers use this where waiting on some other condition could mean holding
 * the mmap_lock for a long time: on the first attempt they can drop the
 * mmap_lock and retry, which avoids starving other processes that also
 * want the lock.
 *
 * Return: true when FAULT_FLAG_ALLOW_RETRY is set and FAULT_FLAG_TRIED is
 * not; false in every other case.
 */
static inline bool fault_flag_allow_retry_first(enum fault_flag flags)
{
        /* Retrying is not permitted at all. */
        if (!(flags & FAULT_FLAG_ALLOW_RETRY))
                return false;

        /* Permitted; it is the first attempt only if nothing was tried yet. */
        return !(flags & FAULT_FLAG_TRIED);
}

/*
 * Comma-separated list of { flag, "NAME" } initialisers, one per fault
 * flag, with no trailing comma. The expansion is only meaningful inside an
 * initialiser or argument list supplied by the user of the macro.
 * NOTE(review): presumably consumed by the tracing flag-printing helpers;
 * confirm at the use site.
 */
#define FAULT_FLAG_TRACE \
        { FAULT_FLAG_WRITE,                "WRITE" }, \
        { FAULT_FLAG_MKWRITE,                "MKWRITE" }, \
        { FAULT_FLAG_ALLOW_RETRY,        "ALLOW_RETRY" }, \
        { FAULT_FLAG_RETRY_NOWAIT,        "RETRY_NOWAIT" }, \
        { FAULT_FLAG_KILLABLE,                "KILLABLE" }, \
        { FAULT_FLAG_TRIED,                "TRIED" }, \
        { FAULT_FLAG_USER,                "USER" }, \
        { FAULT_FLAG_REMOTE,                "REMOTE" }, \
        { FAULT_FLAG_INSTRUCTION,        "INSTRUCTION" }, \
        { FAULT_FLAG_INTERRUPTIBLE,        "INTERRUPTIBLE" }, \
        { FAULT_FLAG_VMA_LOCK,                "VMA_LOCK" }

/*
 * vm_fault is filled by the pagefault handler and passed to the vma's
 * ->fault function. The vma's ->fault is responsible for returning a bitmask
 * of VM_FAULT_xxx flags that give details about how the fault was handled.
 *
 * MM layer fills up gfp_mask for page allocations but fault handler might
 * alter it if its implementation requires a different allocation context.
 *
 * pgoff should be used in favour of address, if possible.
 *
 * The members of the leading anonymous struct are declared const, so they
 * cannot be assigned through a struct vm_fault once it has been set up.
 */
struct vm_fault {
        const struct {
                struct vm_area_struct *vma;        /* Target VMA */
                gfp_t gfp_mask;                        /* gfp mask to be used for allocations */
                pgoff_t pgoff;                        /* Logical page offset based on vma */
                unsigned long address;                /* Faulting virtual address - masked */
                unsigned long real_address;        /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;                /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
        pmd_t *pmd;                        /* Pointer to pmd entry matching
                                         * the 'address' */
        pud_t *pud;                        /* Pointer to pud entry matching
                                         * the 'address'
                                         */
        /* Only one of the two saved entries is meaningful for a given fault. */
        union {
                pte_t orig_pte;                /* Value of PTE at the time of fault */
                pmd_t orig_pmd;                /* Value of PMD at the time of fault,
                                         * used by PMD fault only.
                                         */
        };

        struct page *cow_page;                /* Page handler may use for COW fault */
        struct page *page;                /* ->fault handlers should return a
                                         * page here, unless VM_FAULT_NOPAGE
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
        /* These three entries are valid only while holding ptl lock */
        pte_t *pte;                        /* Pointer to pte entry matching
                                         * the 'address'. NULL if the page
                                         * table hasn't been allocated.
                                         */
        spinlock_t *ptl;                /* Page table lock.
                                         * Protects pte page table if 'pte'
                                         * is not NULL, otherwise pmd.
                                         */
        pgtable_t prealloc_pte;                /* Pre-allocated pte page table.
                                         * vm_ops->map_pages() sets up a page
                                         * table from atomic context.
                                         * do_fault_around() pre-allocates
                                         * page table to avoid allocation from
                                         * atomic context.
                                         */
};

/*
 * These are the virtual MM functions - opening of an area, closing and
 * unmapping it (needed to keep files on disk up-to-date etc), pointer
 * to the functions called when a no-page or a wp-page exception occurs.
 *
 * The structure is a table of callbacks attached to a VMA via vma->vm_ops.
 */
struct vm_operations_struct {
        void (*open)(struct vm_area_struct * area);
        /**
         * @close: Called when the VMA is being removed from the MM.
         * Context: User context.  May sleep.  Caller holds mmap_lock.
         */
        void (*close)(struct vm_area_struct * area);
        /* Called any time before splitting to check if it's allowed */
        int (*may_split)(struct vm_area_struct *area, unsigned long addr);
        int (*mremap)(struct vm_area_struct *area);
        /*
         * Called by mprotect() to make driver-specific permission
         * checks before mprotect() is finalised.   The VMA must not
         * be modified.  Returns 0 if mprotect() can proceed.
         */
        int (*mprotect)(struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, unsigned long newflags);
        /*
         * Fault handlers. Each receives the struct vm_fault describing the
         * fault and returns a vm_fault_t result; huge_fault additionally
         * receives an order and map_pages a page-offset range.
         */
        vm_fault_t (*fault)(struct vm_fault *vmf);
        vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order);
        vm_fault_t (*map_pages)(struct vm_fault *vmf,
                        pgoff_t start_pgoff, pgoff_t end_pgoff);
        unsigned long (*pagesize)(struct vm_area_struct * area);

        /* notification that a previously read-only page is about to become
         * writable, if an error is returned it will cause a SIGBUS */
        vm_fault_t (*page_mkwrite)(struct vm_fault *vmf);

        /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
        vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf);

        /* called by access_process_vm when get_user_pages() fails, typically
         * for use by special VMAs. See also generic_access_phys() for a generic
         * implementation useful for any iomem mapping.
         */
        int (*access)(struct vm_area_struct *vma, unsigned long addr,
                      void *buf, int len, int write);

        /* Called by the /proc/PID/maps code to ask the vma whether it
         * has a special name.  Returning non-NULL will also cause this
         * vma to be dumped unconditionally. */
        const char *(*name)(struct vm_area_struct *vma);

#ifdef CONFIG_NUMA
        /*
         * set_policy() op must add a reference to any non-NULL @new mempolicy
         * to hold the policy upon return.  Caller should pass NULL @new to
         * remove a policy and fall back to surrounding context--i.e. do not
         * install a MPOL_DEFAULT policy, nor the task or system default
         * mempolicy.
         */
        int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

        /*
         * get_policy() op must add reference [mpol_get()] to any policy at
         * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
         * in mm/mempolicy.c will do this automatically.
         * get_policy() must NOT add a ref if the policy at (vma,addr) is not
         * marked as MPOL_SHARED. vma policies are protected by the mmap_lock.
         * If no [shared/vma] mempolicy exists at the addr, get_policy() op
         * must return NULL--i.e., do not "fallback" to task or system default
         * policy.
         */
        struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                        unsigned long addr, pgoff_t *ilx);
#endif
        /*
         * Called by vm_normal_page() for special PTEs to find the
         * page for @addr.  This is useful if the default behavior
         * (using pte_page()) would not find the correct page.
         */
        struct page *(*find_special_page)(struct vm_area_struct *vma,
                                          unsigned long addr);
};

#ifdef CONFIG_NUMA_BALANCING
/* Start a VMA without any NUMA-balancing state attached. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
        vma->numab_state = NULL;
}

/* Release whatever vma->numab_state points to via kfree(). */
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
        kfree(vma->numab_state);
}
#else
/* NUMA balancing is compiled out: both helpers do nothing. */
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
}

static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
}
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_PER_VMA_LOCK
/*
 * Initialise the per-VMA lock state of @vma.
 *
 * With CONFIG_DEBUG_LOCK_ALLOC the lockdep map is registered under the name
 * "vm_lock". vm_refcnt is only zeroed when @reset_refcnt is true, and
 * vm_lock_seq is always set to UINT_MAX.
 */
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* Key handed to lockdep for vma->vmlock_dep_map. */
        static struct lock_class_key vm_lock_key;

        lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &vm_lock_key, 0);
#endif
        if (reset_refcnt) {
                refcount_set(&vma->vm_refcnt, 0);
        }
        vma->vm_lock_seq = UINT_MAX;
}

/*
 * Tell whether @refcnt describes "a writer and no readers".
 *
 * In that state the count is VMA_LOCK_OFFSET for a detached vma and
 * VMA_LOCK_OFFSET + 1 for an attached one. Waiting on a detached vma only
 * happens in vma_mark_detached() and is rare, so in the common case this
 * test does not trigger unnecessary wakeups.
 */
static inline bool is_vma_writer_only(int refcnt)
{
        /* No writer bit set: cannot be writer-only. */
        if (!(refcnt & VMA_LOCK_OFFSET))
                return false;

        return refcnt <= VMA_LOCK_OFFSET + 1;
}

/*
 * Drop one reference on vma->vm_refcnt.
 *
 * If the count did not reach zero and the remaining count is writer-only
 * (see is_vma_writer_only()), wake up whoever waits on mm->vma_writer_wait.
 */
static inline void vma_refcount_put(struct vm_area_struct *vma)
{
        /*
         * Cache vm_mm up front: once vm_refcnt is dropped the vma may be
         * freed, so it must not be dereferenced afterwards.
         */
        struct mm_struct *mm = vma->vm_mm;
        int oldcnt;

        rwsem_release(&vma->vmlock_dep_map, _RET_IP_);

        /* Count hit zero: nobody is left to notify. */
        if (__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt))
                return;

        /* oldcnt - 1 is taken as the count after our decrement. */
        if (is_vma_writer_only(oldcnt - 1))
                rcuwait_wake_up(&mm->vma_writer_wait);
}

/*
 * Try to read-lock a vma. The function is allowed to occasionally yield false
 * locked result to avoid performance overhead, in which case we fall back to
 * using mmap_lock. The function should never yield false unlocked result.
 * False locked result is possible if mm_lock_seq overflows or if vma gets
 * reused and attached to a different mm before we lock it.
 * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN)
 * if the vma got detached (the reference count was observed as zero).
 */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        int oldcnt;

        /*
         * Check before locking. A race might cause false locked result.
         * We can use READ_ONCE() for the mm_lock_seq here, and don't need
         * ACQUIRE semantics, because this is just a lockless check whose result
         * we don't rely on for anything - the mm_lock_seq read against which we
         * need ordering is below.
         */
        if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence))
                return NULL;

        /*
         * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire()
         * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET.
         * Acquire fence is required here to avoid reordering against later
         * vm_lock_seq check and checks inside lock_vma_under_rcu().
         */
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT))) {
                /* return ERR_PTR(-EAGAIN) if vma got detached from under us */
                return oldcnt ? NULL : ERR_PTR(-EAGAIN);
        }

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
        /*
         * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
         * False unlocked result is impossible because we modify and check
         * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
         * modification invalidates all existing locks.
         *
         * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
         * racing with vma_end_write_all(), we only start reading from the VMA
         * after it has been unlocked.
         * This pairs with RELEASE semantics in vma_end_write_all().
         */
        if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
                /* Write-locked after all: undo the reference we just took. */
                vma_refcount_put(vma);
                return NULL;
        }

        return vma;
}

/*
 * Use only while holding mmap read lock which guarantees that locking will not
 * fail (nobody can concurrently write-lock the vma). vma_start_read() should
 * not be used in such cases because it might fail due to mm_lock_seq overflow.
 * This functionality is used to obtain vma read lock and drop the mmap read lock.
 *
 * Returns true once the reference has been taken, false if
 * __refcount_inc_not_zero_limited_acquire() refused the increment.
 *
 * NOTE(review): @subclass is currently not used; rwsem_acquire_read() is
 * always called with 0 in the second argument. Confirm whether the lockdep
 * subclass is meant to be passed through.
 */
static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
{
        int oldcnt;

        mmap_assert_locked(vma->vm_mm);
        if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
                                                              VMA_REF_LIMIT)))
                return false;

        rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_);
        return true;
}

/*
 * Read-lock @vma while the mmap read lock is already held.
 *
 * Holding the mmap read lock means nobody can write-lock the vma at the same
 * time. Prefer this over vma_start_read() in that situation, since the
 * latter may fail on mm_lock_seq overflow. Typical use: take the vma read
 * lock, then drop the mmap read lock.
 *
 * Equivalent to vma_start_read_locked_nested() with subclass 0.
 */
static inline bool vma_start_read_locked(struct vm_area_struct *vma)
{
        const int subclass = 0;

        return vma_start_read_locked_nested(vma, subclass);
}

/* Release a vma read lock by dropping the reference taken when locking. */
static inline void vma_end_read(struct vm_area_struct *vma)
{
        vma_refcount_put(vma);
}

/*
 * Report whether @vma is write-locked, i.e. whether its vm_lock_seq equals
 * the current sequence number of its mm. The sequence number that was
 * compared against is stored in @mm_lock_seq for the caller.
 *
 * WARNING! Can only be used if mmap_lock is expected to be write-locked
 */
static inline bool __is_vma_write_locked(struct vm_area_struct *vma,
                                         unsigned int *mm_lock_seq)
{
        mmap_assert_write_locked(vma->vm_mm);

        /*
         * current task is holding mmap_write_lock, both vma->vm_lock_seq and
         * mm->mm_lock_seq can't be concurrently modified.
         */
        *mm_lock_seq = vma->vm_mm->mm_lock_seq.sequence;
        return (vma->vm_lock_seq == *mm_lock_seq);
}

void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq);

/*
 * Begin writing to a VMA.
 *
 * Concurrent readers using the per-VMA lock are excluded until the
 * currently write-locked mmap_lock is dropped or downgraded. If the vma is
 * already write-locked, nothing further is done.
 */
static inline void vma_start_write(struct vm_area_struct *vma)
{
        unsigned int seq;

        if (!__is_vma_write_locked(vma, &seq))
                __vma_start_write(vma, seq);
}

/*
 * Assert that @vma is write-locked. The check is kept inside
 * VM_BUG_ON_VMA(), so it is only evaluated when that macro evaluates its
 * condition.
 */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        unsigned int seq;

        VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &seq), vma);
}

/*
 * Assert that @vma is locked in some way: either its reference count is
 * above 1 or it is write-locked. The write-lock test is only reached when
 * the reference count is 1 or less.
 */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        unsigned int seq;

        VM_BUG_ON_VMA(!(refcount_read(&vma->vm_refcnt) > 1 ||
                        __is_vma_write_locked(vma, &seq)), vma);
}

/*
 * WARNING: the attached/detached assertions race with vma_mark_attached()
 * and vma_mark_detached(). Only make them under mmap_write_lock, or after
 * the object has been isolated under mmap_write_lock, so that no competing
 * writer exists.
 *
 * Warn (once) if @vma has a zero reference count, i.e. is not attached.
 */
static inline void vma_assert_attached(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(refcount_read(&vma->vm_refcnt) == 0);
}

/* Warn (once) if @vma has a non-zero reference count, i.e. is not detached. */
static inline void vma_assert_detached(struct vm_area_struct *vma)
{
        WARN_ON_ONCE(refcount_read(&vma->vm_refcnt) != 0);
}

/*
 * Mark @vma as attached by publishing a reference count of 1 with release
 * semantics. The vma must be write-locked and currently detached.
 */
static inline void vma_mark_attached(struct vm_area_struct *vma)
{
        /* Preconditions first ... */
        vma_assert_write_locked(vma);
        vma_assert_detached(vma);

        /* ... then publish the attached state. */
        refcount_set_release(&vma->vm_refcnt, 1);
}

void vma_mark_detached(struct vm_area_struct *vma);

/*
 * Drop the lock a fault is running under: the per-VMA read lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, the mmap read lock otherwise.
 */
static inline void release_fault_lock(struct vm_fault *vmf)
{
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
                mmap_read_unlock(vmf->vma->vm_mm);
                return;
        }

        vma_end_read(vmf->vma);
}

/*
 * Assert that the lock matching the fault's mode is held: the VMA lock when
 * FAULT_FLAG_VMA_LOCK is set in vmf->flags, the mmap lock otherwise.
 */
static inline void assert_fault_locked(struct vm_fault *vmf)
{
        if (!(vmf->flags & FAULT_FLAG_VMA_LOCK)) {
                mmap_assert_locked(vmf->vma->vm_mm);
                return;
        }

        vma_assert_locked(vmf->vma);
}

struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                                          unsigned long address);

#else /* CONFIG_PER_VMA_LOCK */

/*
 * Per-VMA locking is compiled out. The helpers below either do nothing or
 * fall back to the corresponding mmap_lock operation or assertion.
 */

static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
}

/* Never takes a lock here; always reports failure with NULL. */
static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
                                                    struct vm_area_struct *vma)
{
        return NULL;
}

static inline void vma_end_read(struct vm_area_struct *vma)
{
}

static inline void vma_start_write(struct vm_area_struct *vma)
{
}

/* Write-locking a vma degenerates to holding the mmap write lock. */
static inline void vma_assert_write_locked(struct vm_area_struct *vma)
{
        mmap_assert_write_locked(vma->vm_mm);
}

static inline void vma_assert_attached(struct vm_area_struct *vma)
{
}

static inline void vma_assert_detached(struct vm_area_struct *vma)
{
}

static inline void vma_mark_attached(struct vm_area_struct *vma)
{
}

static inline void vma_mark_detached(struct vm_area_struct *vma)
{
}

/* No lockless lookup without per-VMA locks; always returns NULL. */
static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
                unsigned long address)
{
        return NULL;
}

/* Any lock on a vma is the mmap lock. */
static inline void vma_assert_locked(struct vm_area_struct *vma)
{
        mmap_assert_locked(vma->vm_mm);
}

/* Faults always run under the mmap read lock; release it. */
static inline void release_fault_lock(struct vm_fault *vmf)
{
        mmap_read_unlock(vmf->vma->vm_mm);
}

/* Faults always run under the mmap lock; assert it. */
static inline void assert_fault_locked(struct vm_fault *vmf)
{
        mmap_assert_locked(vmf->vma->vm_mm);
}

#endif /* CONFIG_PER_VMA_LOCK */

extern const struct vm_operations_struct vma_dummy_vm_ops;

/*
 * Bring @vma into a clean initial state owned by @mm.
 *
 * The whole structure is zeroed first; afterwards the owning mm, the dummy
 * vm_ops table, the anon_vma_chain list head and the per-VMA lock are set
 * up. The lock is initialised without resetting the reference count
 * (presumably because the memset already left it at zero).
 */
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
        /* Must come first: everything below writes into the zeroed struct. */
        memset(vma, 0, sizeof(*vma));

        vma->vm_mm = mm;
        vma->vm_ops = &vma_dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);

        vma_lock_init(vma, false);
}

/*
 * Overwrite the flags of @vma with @flags.
 *
 * No locking or assertion is performed: use only when the VMA is not part
 * of the VMA tree and therefore needs no locking.
 */
static inline void vm_flags_init(struct vm_area_struct *vma,
                                 vm_flags_t flags)
{
        vm_flags_t *flagsp = &ACCESS_PRIVATE(vma, __vm_flags);

        *flagsp = flags;
}

/*
 * Overwrite the flags of a VMA that is part of the VMA tree, where
 * modifications need coordination.
 *
 * Note: neither vm_flags_reset() nor vm_flags_reset_once() locks the vma;
 * the caller must have write-locked it beforehand, which is asserted here.
 */
static inline void vm_flags_reset(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vma_assert_write_locked(vma);
        ACCESS_PRIVATE(vma, __vm_flags) = flags;
}

/*
 * Overwrite the flags of @vma with a single WRITE_ONCE() store. The vma is
 * not locked here; the caller must already hold the write lock, which is
 * asserted.
 */
static inline void vm_flags_reset_once(struct vm_area_struct *vma,
                                       vm_flags_t flags)
{
        vma_assert_write_locked(vma);

        WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags);
}

/* Write-lock @vma, then set every bit of @flags in its flags word. */
static inline void vm_flags_set(struct vm_area_struct *vma,
                                vm_flags_t flags)
{
        vm_flags_t *flagsp;

        vma_start_write(vma);

        flagsp = &ACCESS_PRIVATE(vma, __vm_flags);
        *flagsp |= flags;
}

/* Write-lock @vma, then clear every bit of @flags in its flags word. */
static inline void vm_flags_clear(struct vm_area_struct *vma,
                                  vm_flags_t flags)
{
        vm_flags_t *flagsp;

        vma_start_write(vma);

        flagsp = &ACCESS_PRIVATE(vma, __vm_flags);
        *flagsp &= ~flags;
}

/*
 * Set the bits in @set and then clear the bits in @clear, without taking
 * any lock. A bit present in both ends up cleared.
 *
 * Use only if the VMA is not part of the VMA tree or has no other users and
 * therefore needs no locking.
 */
static inline void __vm_flags_mod(struct vm_area_struct *vma,
                                  vm_flags_t set, vm_flags_t clear)
{
        vm_flags_t updated = vma->vm_flags;

        updated |= set;
        updated &= ~clear;
        vm_flags_init(vma, updated);
}

/*
 * Write-lock @vma and apply @set and @clear in one step.
 *
 * Use only when the relative order of the set and clear operations does not
 * matter; otherwise call vm_flags_set() and vm_flags_clear() explicitly.
 */
static inline void vm_flags_mod(struct vm_area_struct *vma,
                                vm_flags_t set, vm_flags_t clear)
{
        /* Lock first, then reuse the lockless helper. */
        vma_start_write(vma);

        __vm_flags_mod(vma, set, clear);
}

/* Mark @vma as anonymous by removing its vm_ops table. */
static inline void vma_set_anonymous(struct vm_area_struct *vma)
{
        vma->vm_ops = NULL;
}

/* A VMA counts as anonymous exactly when it has no vm_ops table. */
static inline bool vma_is_anonymous(struct vm_area_struct *vma)
{
        if (vma->vm_ops)
                return false;

        return true;
}

/*
 * Indicate if the VMA is a heap for the given task; for
 * /proc/PID/maps that is the heap of the main task.
 */
static inline bool vma_is_initial_heap(const struct vm_area_struct *vma)
{
        return vma->vm_start < vma->vm_mm->brk &&
                vma->vm_end > vma->vm_mm->start_brk;
}

/*
 * Indicate if the VMA is a stack for the given task; for
 * /proc/PID/maps that is the stack of the main task.
 */
static inline bool vma_is_initial_stack(const struct vm_area_struct *vma)
{
        /*
         * We make no effort to guess what a given thread considers to be
         * its "stack".  It's not even well-defined for programs written
         * languages like Go.
         */
        return vma->vm_start <= vma->vm_mm->start_stack &&
                vma->vm_end >= vma->vm_mm->start_stack;
}

/*
 * Tell whether @vma is a stack that is still being set up: it must carry a
 * growth flag (VM_GROWSDOWN or VM_GROWSUP) and have every bit of
 * VM_STACK_INCOMPLETE_SETUP set.
 */
static inline bool vma_is_temporary_stack(struct vm_area_struct *vma)
{
        /* Not growable in either direction: not a stack at all. */
        if (!(vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP)))
                return false;

        return (vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
                                                VM_STACK_INCOMPLETE_SETUP;
}

/*
 * A VMA is foreign when the current task has no mm, or when its mm is not
 * the one that owns @vma.
 */
static inline bool vma_is_foreign(struct vm_area_struct *vma)
{
        return !current->mm || current->mm != vma->vm_mm;
}

/* True if @vma has at least one of the VM_ACCESS_FLAGS bits set. */
static inline bool vma_is_accessible(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_ACCESS_FLAGS) != 0;
}

/* True only if @vm_flags has both VM_SHARED and VM_MAYWRITE set. */
static inline bool is_shared_maywrite(vm_flags_t vm_flags)
{
        const vm_flags_t required = VM_SHARED | VM_MAYWRITE;

        return (vm_flags & required) == required;
}

static inline bool vma_is_shared_maywrite(struct vm_area_struct *vma)
{
        return is_shared_maywrite(vma->vm_flags);
}

/*
 * Look up the next VMA through the iterator with mas_find(). @max is an
 * exclusive end address; mas_find() is handed max - 1.
 */
static inline
struct vm_area_struct *vma_find(struct vma_iterator *vmi, unsigned long max)
{
        unsigned long last = max - 1;

        return mas_find(&vmi->mas, last);
}

/* Return the next VMA from the iterator, searching up to ULONG_MAX. */
static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
{
        /*
         * mas_find() is used on purpose: it returns the first VMA when the
         * iterator has just been started, whereas mas_next() could skip
         * that first entry.
         */
        return mas_find(&vmi->mas, ULONG_MAX);
}

/* Advance the iterator with mas_next_range(), limited by ULONG_MAX. */
static inline
struct vm_area_struct *vma_iter_next_range(struct vma_iterator *vmi)
{
        const unsigned long limit = ULONG_MAX;

        return mas_next_range(&vmi->mas, limit);
}


/* Step the iterator backwards with mas_prev(), using 0 as the limit. */
static inline struct vm_area_struct *vma_prev(struct vma_iterator *vmi)
{
        const unsigned long limit = 0;

        return mas_prev(&vmi->mas, limit);
}

/*
 * Store NULL over [@start, @end) in the tree behind @vmi, allocating with
 * @gfp. @end is exclusive; the range handed to the maple state ends at
 * end - 1.
 *
 * Returns 0 on success, or -ENOMEM if the maple state reports an error
 * after the store.
 */
static inline int vma_iter_clear_gfp(struct vma_iterator *vmi,
                        unsigned long start, unsigned long end, gfp_t gfp)
{
        __mas_set_range(&vmi->mas, start, end - 1);
        mas_store_gfp(&vmi->mas, NULL, gfp);

        return unlikely(mas_is_err(&vmi->mas)) ? -ENOMEM : 0;
}

/*
 * Release any preallocations the iterator still holds but did not use, by
 * destroying its maple state.
 */
static inline void vma_iter_free(struct vma_iterator *vmi)
{
        mas_destroy(&vmi->mas);
}

/*
 * Store @vma into the tree behind @vmi over [vm_start, vm_end - 1] and, on
 * success, mark it attached.
 *
 * Returns 0 on success, or -ENOMEM if the maple state reports an error
 * after the store; in that case the vma is not marked attached.
 */
static inline int vma_iter_bulk_store(struct vma_iterator *vmi,
                                      struct vm_area_struct *vma)
{
        int err = 0;

        vmi->mas.index = vma->vm_start;
        vmi->mas.last = vma->vm_end - 1;
        mas_store(&vmi->mas, vma);

        if (unlikely(mas_is_err(&vmi->mas)))
                err = -ENOMEM;
        else
                vma_mark_attached(vma);

        return err;
}

/* Invalidate the iterator by pausing its maple state with mas_pause(). */
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
        mas_pause(&vmi->mas);
}

/* Reposition the iterator at @addr via mas_set(). */
static inline void vma_iter_set(struct vma_iterator *vmi, unsigned long addr)
{
        mas_set(&vmi->mas, addr);
}

/*
 * Iterate over the VMAs returned by vma_next() on @__vmi, assigning each to
 * @__vma; the loop ends when vma_next() returns NULL. @__vmi is the
 * iterator object itself (its address is taken by the macro).
 */
#define for_each_vma(__vmi, __vma)                                        \
        while (((__vma) = vma_next(&(__vmi))) != NULL)

/*
 * Same as for_each_vma() but bounded through vma_find().
 * The MM code likes to work with exclusive end addresses, so @__end is the
 * first address past the range.
 */
#define for_each_vma_range(__vmi, __vma, __end)                                \
        while (((__vma) = vma_find(&(__vmi), (__end))) != NULL)

#ifdef CONFIG_SHMEM
/*
 * vma_is_shmem() is deliberately not inline: it is used only by slow
 * paths in userfault.
 */
bool vma_is_shmem(struct vm_area_struct *vma);
bool vma_is_anon_shmem(struct vm_area_struct *vma);
#else
/* Without CONFIG_SHMEM no VMA is a shmem mapping. */
static inline bool vma_is_shmem(struct vm_area_struct *vma)
{
        return false;
}

static inline bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
        return false;
}
#endif

/* Prototype only; the body is defined out of line. */
int vma_is_stack_for_current(struct vm_area_struct *vma);

/* flush_tlb_range() takes a vma, not a mm, and can care about flags */
/* Expands to a designated initializer that sets only .vm_mm and .vm_flags. */
#define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) }

/* Forward declarations: these types are only named here, not defined. */
struct mmu_gather;
struct inode;

/* Prototype only; the body is defined out of line. */
extern void prep_compound_page(struct page *page, unsigned int order);

/* The order of a large folio is read from the low byte of _flags_1. */
static inline unsigned int folio_large_order(const struct folio *folio)
{
        const unsigned long order_mask = 0xff;

        return folio->_flags_1 & order_mask;
}

#ifndef NR_PAGES_IN_LARGE_FOLIO
/* No stored page count: derive it from the folio's order. */
static inline long folio_large_nr_pages(const struct folio *folio)
{
        const unsigned int order = folio_large_order(folio);

        return 1L << order;
}
#else
/* The page count is read straight from the folio's _nr_pages field. */
static inline long folio_large_nr_pages(const struct folio *folio)
{
        return folio->_nr_pages;
}
#endif

/*
 * compound_order() can be called without holding a reference, which means
 * that niceties like page_folio() don't work.  These callers should be
 * prepared to handle wild return values.  For example, PG_head may be
 * set before the order is initialised, or this may be a tail page.
 * See compaction.c for some good examples.
 */
static inline unsigned int compound_order(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        /* Without PG_head set the order is reported as 0. */
        return test_bit(PG_head, &folio->flags) ? folio_large_order(folio) : 0;
}

/**
 * folio_order - The allocation order of a folio.
 * @folio: The folio.
 *
 * A folio is composed of 2^order pages.  See get_order() for the definition
 * of order.
 *
 * Return: The order of the folio.
 */
static inline unsigned int folio_order(const struct folio *folio)
{
        /* Small folios are order 0; only large folios carry a stored order. */
        return folio_test_large(folio) ? folio_large_order(folio) : 0;
}

/**
 * folio_reset_order - Reset the folio order and derived _nr_pages
 * @folio: The folio.
 *
 * Reset the order and derived _nr_pages to 0. Must only be used in the
 * process of splitting large folios.
 */
static inline void folio_reset_order(struct folio *folio)
{
        /* Only meaningful for large folios; warn once and bail otherwise. */
        if (WARN_ON_ONCE(!folio_test_large(folio)))
                return;

        /* The order lives in the low byte of _flags_1; clear just that byte. */
        folio->_flags_1 = folio->_flags_1 & ~0xffUL;
#ifdef NR_PAGES_IN_LARGE_FOLIO
        folio->_nr_pages = 0;
#endif
}

#include <linux/huge_mm.h>

/*
 * Methods to modify the page usage count.
 *
 * What counts for a page usage:
 * - cache mapping   (page->mapping)
 * - private data    (page->private)
 * - page mapped in a task's page tables, each mapping
 *   is counted separately
 *
 * Also, many kernel routines increase the page count before a critical
 * routine so they can be sure the page doesn't go away from under them.
 */

/*
 * Drop a ref, return true if the refcount fell to zero (the page has no users)
 */
static inline int put_page_testzero(struct page *page)
{
        int hit_zero;

        /* Dropping a reference from a count of zero is a bug. */
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
        hit_zero = page_ref_dec_and_test(page);

        return hit_zero;
}

/* Folio flavour of put_page_testzero(), applied to the folio's embedded page. */
static inline int folio_put_testzero(struct folio *folio)
{
        struct page *head = &folio->page;

        return put_page_testzero(head);
}

/*
 * Try to grab a ref unless the page has a refcount of zero, return false if
 * that is the case.
 * This can be called when MMU is off so it must not access
 * any of the virtual mappings.
 *
 * Implemented as page_ref_add_unless(page, 1, 0): add one reference unless
 * the current count equals 0.
 */
static inline bool get_page_unless_zero(struct page *page)
{
        return page_ref_add_unless(page, 1, 0);
}

/*
 * Take a reference on @page unless its refcount is zero.  On success the
 * same pointer is returned cast to a folio; on failure NULL is returned.
 */
static inline struct folio *folio_get_nontail_page(struct page *page)
{
        return unlikely(!get_page_unless_zero(page)) ?
                NULL : (struct folio *)page;
}

/* Prototype only; the body is defined out of line. */
extern int page_is_ram(unsigned long pfn);

/*
 * Enumerators take the values 0, 1 and 2 in declaration order.
 * NOTE(review): presumably the possible results of region_intersects()
 * below; confirm at its definition.
 */
enum {
        REGION_INTERSECTS,
        REGION_DISJOINT,
        REGION_MIXED,
};

int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
                      unsigned long desc);

/* Support for virtually mapped pages */
struct page *vmalloc_to_page(const void *addr);
unsigned long vmalloc_to_pfn(const void *addr);

/*
 * Determine if an address is within the vmalloc range
 *
 * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
 * is no special casing required.
 */
#ifndef CONFIG_MMU
/* nommu: both queries answer "no" for every address. */
static inline bool is_vmalloc_addr(const void *x)
{
        return false;
}

static inline int is_vmalloc_or_module_addr(const void *x)
{
        return 0;
}
#else
extern bool is_vmalloc_addr(const void *x);
extern int is_vmalloc_or_module_addr(const void *x);
#endif

/*
 * How many times the entire folio is mapped as a single unit (eg by a
 * PMD or PUD entry).  This is probably not what you want, except for
 * debugging purposes or implementation of other core folio_*() primitives.
 */
static inline int folio_entire_mapcount(const struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        /*
         * On non-64-bit builds an order-1 folio always reports 0.
         * NOTE(review): presumably _entire_mapcount is not usable for such
         * folios there; confirm against the struct folio layout.
         */
        if (!IS_ENABLED(CONFIG_64BIT)) {
                if (unlikely(folio_large_order(folio) == 1))
                        return 0;
        }

        /* The stored counter is offset by -1, hence the + 1. */
        return atomic_read(&folio->_entire_mapcount) + 1;
}

/* Read a large folio's _large_mapcount; the stored value is offset by -1. */
static inline int folio_large_mapcount(const struct folio *folio)
{
        int raw;

        VM_WARN_ON_FOLIO(!folio_test_large(folio), folio);
        raw = atomic_read(&folio->_large_mapcount);

        return raw + 1;
}

/**
 * folio_mapcount() - Number of mappings of this folio.
 * @folio: The folio.
 *
 * The folio mapcount corresponds to the number of present user page table
 * entries that reference any part of a folio. Each such present user page
 * table entry must be paired with exactly one folio reference.
 *
 * For ordinary folios, each user page table entry (PTE/PMD/PUD/...) counts
 * exactly once.
 *
 * For hugetlb folios, each abstracted "hugetlb" user page table entry that
 * references the entire folio counts exactly once, even when such special
 * page table entries are comprised of multiple ordinary page table entries.
 *
 * Will report 0 for pages which cannot be mapped into userspace, such as
 * slab, page tables and similar.
 *
 * Return: The number of times this folio is mapped.
 */
static inline int folio_mapcount(const struct folio *folio)
{
        if (likely(!folio_test_large(folio))) {
                int small = atomic_read(&folio->_mapcount) + 1;

                /* A value that encodes a page type is not a mapcount. */
                return page_mapcount_is_type(small) ? 0 : small;
        }

        return folio_large_mapcount(folio);
}

/**
 * folio_mapped - Is this folio mapped into userspace?
 * @folio: The folio.
 *
 * Return: True if any page in this folio is referenced by user page tables.
 */
static inline bool folio_mapped(const struct folio *folio)
{
        int nr_mappings = folio_mapcount(folio);

        return nr_mappings > 0;
}

/*
 * Return true if this page is mapped into pagetables.
 * For compound page it returns true if any sub-page of compound page is mapped,
 * even if this particular sub-page is not itself mapped by any PTE or PMD.
 */
static inline bool page_mapped(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_mapped(folio);
}

/* Translate @x with virt_to_page() and return that page's compound head. */
static inline struct page *virt_to_head_page(const void *x)
{
        return compound_head(virt_to_page(x));
}

/* Translate @x with virt_to_page() and return the folio holding that page. */
static inline struct folio *virt_to_folio(const void *x)
{
        return page_folio(virt_to_page(x));
}

/*
 * Prototypes only; the bodies are defined out of line.
 * __folio_put() is what folio_put() and folio_put_refs() call once the
 * reference count has dropped to zero.
 */
void __folio_put(struct folio *folio);

void split_page(struct page *page, unsigned int order);
void folio_copy(struct folio *dst, struct folio *src);
int folio_mc_copy(struct folio *dst, struct folio *src);

unsigned long nr_free_buffer_pages(void);

/* Returns the number of bytes in this potentially compound page. */
static inline unsigned long page_size(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SIZE << order;
}

/* Returns the number of bits needed for the number of bytes in a page */
static inline unsigned int page_shift(struct page *page)
{
        unsigned int order = compound_order(page);

        return PAGE_SHIFT + order;
}

/**
 * thp_order - Order of a transparent huge page.
 * @page: Head page of a transparent huge page.
 */
static inline unsigned int thp_order(struct page *page)
{
        unsigned int order;

        /* Callers must pass a head page, never a tail page. */
        VM_BUG_ON_PGFLAGS(PageTail(page), page);
        order = compound_order(page);

        return order;
}

/**
 * thp_size - Size of a transparent huge page.
 * @page: Head page of a transparent huge page.
 *
 * Return: Number of bytes in this page.
 */
static inline unsigned long thp_size(struct page *page)
{
        unsigned int order = thp_order(page);

        return PAGE_SIZE << order;
}

#ifdef CONFIG_MMU
/*
 * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
 * servicing faults for write access.  In the normal case, we do always want
 * pte_mkwrite.  But get_user_pages can cause write faults for mappings
 * that do not have writing enabled, when used by access_process_vm.
 */
static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        return likely(vma->vm_flags & VM_WRITE) ? pte_mkwrite(pte, vma) : pte;
}

/* Fault-path prototypes (each takes a struct vm_fault); bodies are out of line. */
vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                struct page *page, unsigned int nr, unsigned long addr);

vm_fault_t finish_fault(struct vm_fault *vmf);
#endif

/*
 * Multiple processes may "see" the same page. E.g. for untouched
 * mappings of /dev/null, all processes see the same page full of
 * zeroes, and text pages of executables and shared libraries have
 * only one copy in memory, at most, normally.
 *
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
 *
 * A pagecache page contains an opaque `private' member, which belongs to the
 * page's address_space. Usually, this is the address of a circular list of
 * the page's disk buffers. PG_private must be set to tell the VM to call
 * into the filesystem to release these pages.
 *
 * A page may belong to an inode's memory mapping. In this case, page->mapping
 * is the pointer to the inode, and page->index is the file offset of the page,
 * in units of PAGE_SIZE.
 *
 * If pagecache pages are not associated with an inode, they are said to be
 * anonymous pages. These may become associated with the swapcache, and in that
 * case PG_swapcache is set, and page->private is an offset into the swapcache.
 *
 * In either case (swapcache or inode backed), the pagecache itself holds one
 * reference to the page. Setting PG_private should also increment the
 * refcount. Each user mapping also has a reference to the page.
 *
 * The pagecache pages are stored in a per-mapping radix tree, which is
 * rooted at mapping->i_pages, and indexed by offset.
 * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space
 * lists, we instead now tag pages as dirty/writeback in the radix tree.
 *
 * All pagecache pages may be subject to I/O:
 * - inode pages may need to be read from disk,
 * - inode pages which have been modified and are MAP_SHARED may need
 *   to be written back to the inode on disk,
 * - anonymous pages (including MAP_PRIVATE file mappings) which have been
 *   modified may need to be swapped out to swap space and (later) to be read
 *   back into memory.
 */

/* 127: arbitrary random number, small enough to assemble well */
/*
 * True when folio_ref_count() lies in [-127, 0]: after the cast to unsigned
 * int, only those values wrap to something <= 127 once 127 is added.
 */
#define folio_ref_zero_or_close_to_overflow(folio) \
        ((unsigned int) folio_ref_count(folio) + 127u <= 127u)

/**
 * folio_get - Increment the reference count on a folio.
 * @folio: The folio.
 *
 * Context: May be called in any context, as long as you know that
 * you have a refcount on the folio.  If you do not already have one,
 * folio_try_get() may be the right interface for you to use.
 */
static inline void folio_get(struct folio *folio)
{
        /* Catch a reference taken on a zero or nearly overflowed count. */
        VM_BUG_ON_FOLIO(folio_ref_zero_or_close_to_overflow(folio), folio);
        folio_ref_inc(folio);
}

/*
 * Take a reference on the folio containing @page.  Slab folios trigger a
 * one-time warning and are left untouched.
 */
static inline void get_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (!WARN_ON_ONCE(folio_test_slab(folio)))
                folio_get(folio);
}

/*
 * Take a reference on the compound head of @page.  Returns false (after a
 * one-time warning) when the head's refcount is not positive, else true.
 */
static inline __must_check bool try_get_page(struct page *page)
{
        page = compound_head(page);

        if (!WARN_ON_ONCE(page_ref_count(page) <= 0)) {
                page_ref_inc(page);
                return true;
        }

        return false;
}

/**
 * folio_put - Decrement the reference count on a folio.
 * @folio: The folio.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put() unless you can be sure that it wasn't the
 * last reference.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put(struct folio *folio)
{
        /* Nothing more to do unless this dropped the last reference. */
        if (!folio_put_testzero(folio))
                return;

        __folio_put(folio);
}

/**
 * folio_put_refs - Reduce the reference count on a folio.
 * @folio: The folio.
 * @refs: The amount to subtract from the folio's reference count.
 *
 * If the folio's reference count reaches zero, the memory will be
 * released back to the page allocator and may be used by another
 * allocation immediately.  Do not access the memory or the struct folio
 * after calling folio_put_refs() unless you can be sure that these weren't
 * the last references.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folio_put_refs(struct folio *folio, int refs)
{
        /* Nothing more to do unless the count reached zero. */
        if (!folio_ref_sub_and_test(folio, refs))
                return;

        __folio_put(folio);
}

/* Out-of-line batch release; folios_put() calls this with @refs == NULL. */
void folios_put_refs(struct folio_batch *folios, unsigned int *refs);

/*
 * union release_pages_arg - an array of pages or folios
 *
 * release_pages() releases a simple array of multiple pages, and
 * accepts various different forms of said page array: either
 * a regular old boring array of pages, an array of folios, or
 * an array of encoded page pointers.
 *
 * The transparent union syntax for this kind of "any of these
 * argument types" is all kinds of ugly, so look away.
 */
typedef union {
        struct page **pages;
        struct folio **folios;
        struct encoded_page **encoded_pages;
} release_pages_arg __attribute__ ((__transparent_union__));

/* Thanks to the transparent union, any of the three pointer types is accepted. */
void release_pages(release_pages_arg, int nr);

/**
 * folios_put - Decrement the reference count on an array of folios.
 * @folios: The folios.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need to
 * reinitialise it.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
static inline void folios_put(struct folio_batch *folios)
{
        /*
         * No per-folio counts are supplied.
         * NOTE(review): presumably NULL means one reference per folio;
         * confirm in folios_put_refs().
         */
        folios_put_refs(folios, NULL);
}

/*
 * Drop a reference on the folio containing @page.  Slab folios are skipped:
 * no reference is dropped for them.
 */
static inline void put_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        if (!folio_test_slab(folio))
                folio_put(folio);
}

/*
 * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
 * the page's refcount so that two separate items are tracked: the original page
 * reference count, and also a new count of how many pin_user_pages() calls were
 * made against the page. ("gup-pinned" is another term for the latter).
 *
 * With this scheme, pin_user_pages() becomes special: such pages are marked as
 * distinct from normal pages. As such, the unpin_user_page() call (and its
 * variants) must be used in order to release gup-pinned pages.
 *
 * Choice of value:
 *
 * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
 * counts with respect to pin_user_pages() and unpin_user_page() becomes
 * simpler, due to the fact that adding an even power of two to the page
 * refcount has the effect of using only the upper N bits, for the code that
 * counts up using the bias value. This means that the lower bits are left for
 * the exclusive use of the original code that increments and decrements by one
 * (or at least, by much smaller values than the bias value).
 *
 * Of course, once the lower bits overflow into the upper bits (and this is
 * OK, because subtraction recovers the original values), then visual inspection
 * no longer suffices to directly view the separate counts. However, for normal
 * applications that don't have huge page reference counts, this won't be an
 * issue.
 *
 * Locking: the lockless algorithm described in folio_try_get_rcu()
 * provides safe operation for get_user_pages(), folio_mkclean() and
 * other calls that race to set up page table entries.
 */
/* 2^10 == 1024 */
#define GUP_PIN_COUNTING_BIAS (1U << 10)

/*
 * Prototypes for unpin_user_page() and its variants, which must be used to
 * release gup-pinned pages.  The bodies are defined out of line.
 */
void unpin_user_page(struct page *page);
void unpin_folio(struct folio *folio);
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty);
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty);
void unpin_user_pages(struct page **pages, unsigned long npages);
void unpin_user_folio(struct folio *folio, unsigned long npages);
void unpin_folios(struct folio **folios, unsigned long nfolios);

/* True when @flags has VM_MAYWRITE set and VM_SHARED clear. */
static inline bool is_cow_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = VM_SHARED | VM_MAYWRITE;

        return (flags & relevant) == VM_MAYWRITE;
}

#ifndef CONFIG_MMU
static inline bool is_nommu_shared_mapping(vm_flags_t flags)
{
        /*
         * NOMMU shared mappings are ordinary MAP_SHARED mappings and selected
         * R/O MAP_PRIVATE file mappings that are an effective R/O overlay of
         * a file mapping. R/O MAP_PRIVATE mappings might still modify
         * underlying memory if ptrace is active, so this is only possible if
         * ptrace does not apply. Note that there is no mprotect() to upgrade
         * write permissions later.
         */
        return (flags & (VM_MAYSHARE | VM_MAYOVERLAY)) != 0;
}
#endif

/*
 * Set for SPARSEMEM without VMEMMAP.  It gates set_page_section() and
 * page_to_section(), which keep the section number in page->flags.
 */
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif

/*
 * The identification function is mainly used by the buddy allocator for
 * determining if two pages could be buddies. We are not really identifying
 * the zone since we could be using the section number id if we do not have
 * node id available in page flags.
 * We only guarantee that it will return the same value for two combinable
 * pages in a zone.
 */
static inline int page_zone_id(struct page *page)
{
        unsigned long shifted = page->flags >> ZONEID_PGSHIFT;

        return shifted & ZONEID_MASK;
}

#ifdef NODE_NOT_IN_PAGE_FLAGS
int page_to_nid(const struct page *page);
#else
static inline int page_to_nid(const struct page *page)
{
        return (PF_POISONED_CHECK(page)->flags >> NODES_PGSHIFT) & NODES_MASK;
}
#endif

/* Node id of a folio, taken from its embedded page. */
static inline int folio_nid(const struct folio *folio)
{
        const struct page *head = &folio->page;

        return page_to_nid(head);
}

#ifdef CONFIG_NUMA_BALANCING
/* page access time bits needs to hold at least 4 seconds */
#define PAGE_ACCESS_TIME_MIN_BITS        12
/*
 * PAGE_ACCESS_TIME_BUCKETS is the shortfall between the bits required and
 * the LAST_CPUPID_SHIFT bits available (0 when there is room).
 * folio_xchg_access_time() shifts times right by this amount before storing.
 */
#if LAST_CPUPID_SHIFT < PAGE_ACCESS_TIME_MIN_BITS
#define PAGE_ACCESS_TIME_BUCKETS                                \
        (PAGE_ACCESS_TIME_MIN_BITS - LAST_CPUPID_SHIFT)
#else
#define PAGE_ACCESS_TIME_BUCKETS        0
#endif

/* The cpupid mask shifted left by the bucket count. */
#define PAGE_ACCESS_TIME_MASK                                \
        (LAST_CPUPID_MASK << PAGE_ACCESS_TIME_BUCKETS)

/* Pack @cpu (masked, shifted above the pid bits) and @pid (masked) together. */
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
        int cpu_bits = (cpu & LAST__CPU_MASK) << LAST__PID_SHIFT;
        int pid_bits = pid & LAST__PID_MASK;

        return cpu_bits | pid_bits;
}

/* Extract the pid field by masking with LAST__PID_MASK (no shift is applied). */
static inline int cpupid_to_pid(int cpupid)
{
        return cpupid & LAST__PID_MASK;
}

/* Extract the cpu field: shift past the pid bits, then mask. */
static inline int cpupid_to_cpu(int cpupid)
{
        int shifted = cpupid >> LAST__PID_SHIFT;

        return shifted & LAST__CPU_MASK;
}

/* Map the cpu encoded in @cpupid to its node via cpu_to_node(). */
static inline int cpupid_to_nid(int cpupid)
{
        int cpu = cpupid_to_cpu(cpupid);

        return cpu_to_node(cpu);
}

/* True when the pid field holds the all-ones pattern (-1 masked to its width). */
static inline bool cpupid_pid_unset(int cpupid)
{
        return (-1 & LAST__PID_MASK) == cpupid_to_pid(cpupid);
}

/* True when the cpu field holds the all-ones pattern (-1 masked to its width). */
static inline bool cpupid_cpu_unset(int cpupid)
{
        return (-1 & LAST__CPU_MASK) == cpupid_to_cpu(cpupid);
}

/* Compare @task_pid, masked to the pid field width, with the pid in @cpupid. */
static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid)
{
        return cpupid_to_pid(cpupid) == (task_pid & LAST__PID_MASK);
}

/*
 * True when the pid stored in @cpupid matches @task's pid.  @task is
 * parenthesised so that expression arguments (e.g. "p + 1") dereference
 * the whole expression rather than only its last operand.
 */
#define cpupid_match_pid(task, cpupid) __cpupid_match_pid((task)->pid, cpupid)
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
/* Swap in @cpupid (masked to the field width) and return the previous value. */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        int masked = cpupid & LAST_CPUPID_MASK;

        return xchg(&folio->_last_cpupid, masked);
}

/* In this configuration the cpupid has its own field; read it directly. */
static inline int folio_last_cpupid(struct folio *folio)
{
        return folio->_last_cpupid;
}
/*
 * Reset the stored cpupid to the all-ones pattern (-1 masked to the field
 * width), which cpupid_pid_unset()/cpupid_cpu_unset() test for.
 */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->_last_cpupid = -1 & LAST_CPUPID_MASK;
}
#else
/* In this configuration the cpupid is a bit-field of folio->flags. */
static inline int folio_last_cpupid(struct folio *folio)
{
        unsigned long field = folio->flags >> LAST_CPUPID_PGSHIFT;

        return field & LAST_CPUPID_MASK;
}

/* Out of line in this configuration, where the cpupid lives in folio->flags. */
int folio_xchg_last_cpupid(struct folio *folio, int cpupid);

/* Reset by setting every bit of the cpupid field inside page->flags. */
static inline void page_cpupid_reset_last(struct page *page)
{
        page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
}
#endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */

/*
 * Store @time in the cpupid slot, dropping its low PAGE_ACCESS_TIME_BUCKETS
 * bits, and return the previously stored value scaled back up by the same
 * shift.
 */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        int prev = folio_xchg_last_cpupid(folio,
                                          time >> PAGE_ACCESS_TIME_BUCKETS);

        return prev << PAGE_ACCESS_TIME_BUCKETS;
}

/*
 * Set the bit for the current task, hashed from current->pid, in
 * vma->numab_state->pids_active[1].  Does nothing when the VMA has no
 * numab_state or the bit is already set.
 */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
        unsigned int pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));

        if (!vma->numab_state)
                return;

        /* Skip the write when the bit is already set. */
        if (test_bit(pid_bit, &vma->numab_state->pids_active[1]))
                return;

        __set_bit(pid_bit, &vma->numab_state->pids_active[1]);
}

/* Prototype only; the body is defined out of line. */
bool folio_use_access_time(struct folio *folio);
#else /* !CONFIG_NUMA_BALANCING */
/*
 * Stubs for builds without CONFIG_NUMA_BALANCING.  Nothing is stored:
 * the folio accessors return the node id, the cpupid decoders return -1
 * and the setters are no-ops.
 */

/* Stores nothing; returns the folio's node id. */
static inline int folio_xchg_last_cpupid(struct folio *folio, int cpupid)
{
        return folio_nid(folio); /* XXX */
}

/* Access time is not tracked; always returns 0. */
static inline int folio_xchg_access_time(struct folio *folio, int time)
{
        return 0;
}

/* Returns the folio's node id in place of a cpupid. */
static inline int folio_last_cpupid(struct folio *folio)
{
        return folio_nid(folio); /* XXX */
}

/* The three decoders below ignore their argument and return -1. */
static inline int cpupid_to_nid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_pid(int cpupid)
{
        return -1;
}

static inline int cpupid_to_cpu(int cpupid)
{
        return -1;
}

/* Always -1.  Note the first parameter is named nid here, not cpu. */
static inline int cpu_pid_to_cpupid(int nid, int pid)
{
        return -1;
}

/* The pid is always reported as unset. */
static inline bool cpupid_pid_unset(int cpupid)
{
        return true;
}

/* Nothing to reset. */
static inline void page_cpupid_reset_last(struct page *page)
{
}

/* Never matches. */
static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
{
        return false;
}

/* No access bits are recorded. */
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
}
/* Access time is never used. */
static inline bool folio_use_access_time(struct folio *folio)
{
        return false;
}
#endif /* CONFIG_NUMA_BALANCING */

#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)

/*
 * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid
 * setting tags for all pages to native kernel tag value 0xff, as the default
 * value 0x00 maps to 0xff.
 */

/*
 * Return the KASAN tag stored for @page, or KASAN_TAG_KERNEL when KASAN is
 * not enabled.  The stored field is xor'ed with 0xff on the way out.
 */
static inline u8 page_kasan_tag(const struct page *page)
{
        u8 stored;

        if (!kasan_enabled())
                return KASAN_TAG_KERNEL;

        stored = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK;
        return stored ^ 0xff;
}

/*
 * Store @tag in the KASAN tag field of page->flags.  Does nothing when
 * KASAN is not enabled.  The field is replaced with a compare-and-exchange
 * retry loop on page->flags.
 */
static inline void page_kasan_tag_set(struct page *page, u8 tag)
{
        unsigned long old_flags, flags;

        if (!kasan_enabled())
                return;

        /* Tags are kept xor'ed with 0xff (page_kasan_tag() undoes this). */
        tag ^= 0xff;
        old_flags = READ_ONCE(page->flags);
        do {
                /* Rebuild the flags word with only the tag field replaced. */
                flags = old_flags;
                flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT);
                flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT;
                /*
                 * NOTE(review): presumably try_cmpxchg() refreshes old_flags
                 * on failure, so the next pass starts from the current value;
                 * confirm.
                 */
        } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags)));
}

/* Set the page's tag back to KASAN_TAG_KERNEL; no-op when KASAN is off. */
static inline void page_kasan_tag_reset(struct page *page)
{
        if (!kasan_enabled())
                return;

        page_kasan_tag_set(page, KASAN_TAG_KERNEL);
}

#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/* Without tag-based KASAN every page reports the tag 0xff. */
static inline u8 page_kasan_tag(const struct page *page)
{
        return 0xff;
}

/* Setting and resetting tags are no-ops in this configuration. */
static inline void page_kasan_tag_set(struct page *page, u8 tag) { }
static inline void page_kasan_tag_reset(struct page *page) { }

#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/* Look up the page's zone: index its node's node_zones[] by zone number. */
static inline struct zone *page_zone(const struct page *page)
{
        int nid = page_to_nid(page);

        return &NODE_DATA(nid)->node_zones[page_zonenum(page)];
}

/* Return the NODE_DATA() of the node this page belongs to. */
static inline pg_data_t *page_pgdat(const struct page *page)
{
        int nid = page_to_nid(page);

        return NODE_DATA(nid);
}

/* Zone of a folio, taken from its embedded page. */
static inline struct zone *folio_zone(const struct folio *folio)
{
        const struct page *head = &folio->page;

        return page_zone(head);
}

/* Node data of a folio, taken from its embedded page. */
static inline pg_data_t *folio_pgdat(const struct folio *folio)
{
        const struct page *head = &folio->page;

        return page_pgdat(head);
}

#ifdef SECTION_IN_PAGE_FLAGS
/* Replace the section field of page->flags with @section (masked). */
static inline void set_page_section(struct page *page, unsigned long section)
{
        const unsigned long section_bits =
                (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;

        page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT);
        page->flags |= section_bits;
}

/* Extract the section field from page->flags. */
static inline unsigned long page_to_section(const struct page *page)
{
        unsigned long shifted = page->flags >> SECTIONS_PGSHIFT;

        return shifted & SECTIONS_MASK;
}
#endif

/**
 * folio_pfn - Return the Page Frame Number of a folio.
 * @folio: The folio.
 *
 * A folio may contain multiple pages.  The pages have consecutive
 * Page Frame Numbers.
 *
 * Return: The Page Frame Number of the first page in the folio.
 */
static inline unsigned long folio_pfn(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return page_to_pfn(first);
}

/* Return the folio containing the page at @pfn. */
static inline struct folio *pfn_folio(unsigned long pfn)
{
        struct page *page = pfn_to_page(pfn);

        return page_folio(page);
}

/*
 * Does this folio have a _pincount?  On 64-bit every large folio does;
 * otherwise only folios of order greater than 1.
 */
static inline bool folio_has_pincount(const struct folio *folio)
{
        return IS_ENABLED(CONFIG_64BIT) ? folio_test_large(folio)
                                        : folio_order(folio) > 1;
}

/**
 * folio_maybe_dma_pinned - Report if a folio may be pinned for DMA.
 * @folio: The folio.
 *
 * This function checks if a folio has been pinned via a call to
 * a function in the pin_user_pages() family.
 *
 * For small folios, the return value is partially fuzzy: false is not fuzzy,
 * because it means "definitely not pinned for DMA", but true means "probably
 * pinned for DMA, but possibly a false positive due to having at least
 * GUP_PIN_COUNTING_BIAS worth of normal folio references".
 *
 * False positives are OK, because: a) it's unlikely for a folio to
 * get that many refcounts, and b) all the callers of this routine are
 * expected to be able to deal gracefully with a false positive.
 *
 * For most large folios, the result will be exactly correct. That's because
 * we have more tracking data available: the _pincount field is used
 * instead of the GUP_PIN_COUNTING_BIAS scheme.
 *
 * For more information, please see Documentation/core-api/pin_user_pages.rst.
 *
 * Return: True, if it is likely that the folio has been "dma-pinned".
 * False, if the folio is definitely not dma-pinned.
 */
static inline bool folio_maybe_dma_pinned(struct folio *folio)
{
        unsigned int refs;

        /* Exact answer when a dedicated pin counter exists. */
        if (folio_has_pincount(folio))
                return atomic_read(&folio->_pincount) > 0;

        /*
         * folio_ref_count() is signed. If that refcount overflows, then
         * folio_ref_count() returns a negative value, and callers will avoid
         * further incrementing the refcount.
         *
         * Here, for that overflow case, use the sign bit to count a little
         * bit higher via unsigned math, and thus still get an accurate result.
         */
        refs = (unsigned int)folio_ref_count(folio);

        return refs >= GUP_PIN_COUNTING_BIAS;
}

/*
 * This should most likely only be called during fork() to see whether we
 * should break the cow immediately for an anon page on the src mm.
 *
 * The caller has to hold the PT lock and the vma->vm_mm->write_protect_seq.
 */
static inline bool folio_needs_cow_for_dma(struct vm_area_struct *vma,
                                          struct folio *folio)
{
        VM_BUG_ON(!(raw_read_seqcount(&vma->vm_mm->write_protect_seq) & 1));

        /* Without MMF_HAS_PINNED on the mm, answer false without checking the folio. */
        return test_bit(MMF_HAS_PINNED, &vma->vm_mm->flags) &&
               folio_maybe_dma_pinned(folio);
}

/**
 * is_zero_page - Query if a page is a zero page
 * @page: The page to query
 *
 * This returns true if @page is one of the permanent zero pages.
 */
static inline bool is_zero_page(const struct page *page)
{
        unsigned long pfn = page_to_pfn(page);

        return is_zero_pfn(pfn);
}

/**
 * is_zero_folio - Query if a folio is a zero page
 * @folio: The folio to query
 *
 * This returns true if @folio is one of the permanent zero pages.
 */
static inline bool is_zero_folio(const struct folio *folio)
{
        const struct page *first = &folio->page;

        return is_zero_page(first);
}

/* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */
#ifndef CONFIG_MIGRATION
/* Without CONFIG_MIGRATION every folio is reported as pinnable. */
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
        return true;
}
#else
static inline bool folio_is_longterm_pinnable(struct folio *folio)
{
#ifdef CONFIG_CMA
        const int migratetype = folio_migratetype(folio);

        if (migratetype == MIGRATE_CMA || migratetype == MIGRATE_ISOLATE)
                return false;
#endif
        /* The zero page can be "pinned" but gets special handling. */
        if (is_zero_folio(folio))
                return true;

        /*
         * Coherent device memory must always allow eviction, and
         * filesystems can only tolerate transient delays to truncate and
         * hole-punch operations, so neither may be pinned long term.
         */
        if (folio_is_device_coherent(folio) || folio_is_fsdax(folio))
                return false;

        /* Otherwise, non-movable zone folios can be pinned. */
        return !folio_is_zone_movable(folio);
}
#endif

/* Replace the zone field of page->flags with @zone (masked). */
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
        const unsigned long zone_bits = (zone & ZONES_MASK) << ZONES_PGSHIFT;

        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
        page->flags |= zone_bits;
}

/* Replace the node field of page->flags with @node (masked). */
static inline void set_page_node(struct page *page, unsigned long node)
{
        const unsigned long node_bits = (node & NODES_MASK) << NODES_PGSHIFT;

        page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
        page->flags |= node_bits;
}

/*
 * Record a page's zone and node in page->flags and, when
 * SECTION_IN_PAGE_FLAGS is defined, the section number derived from @pfn.
 */
static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
{
        set_page_zone(page, zone);
        set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
        /* @pfn is only consulted in this configuration. */
        set_page_section(page, pfn_to_section_nr(pfn));
#endif
}

/**
 * folio_nr_pages - The number of pages in the folio.
 * @folio: The folio.
 *
 * Return: A positive power of two.
 */
static inline long folio_nr_pages(const struct folio *folio)
{
        /* A small folio is exactly one page. */
        return folio_test_large(folio) ? folio_large_nr_pages(folio) : 1;
}

/* Only hugetlbfs can allocate folios larger than MAX_ORDER */
/*
 * With gigantic page support the limit is 2^PUD_ORDER pages; otherwise it
 * is MAX_ORDER_NR_PAGES.
 */
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#define MAX_FOLIO_NR_PAGES        (1UL << PUD_ORDER)
#else
#define MAX_FOLIO_NR_PAGES        MAX_ORDER_NR_PAGES
#endif

/*
 * compound_nr() reports how many pages make up this potentially compound
 * page.  Only a page with PG_head set yields the large-folio page count;
 * anything else, including a tail page, is defined to yield 1.
 */
static inline long compound_nr(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        if (test_bit(PG_head, &folio->flags))
                return folio_large_nr_pages(folio);
        return 1;
}

/**
 * thp_nr_pages - The number of regular pages in this huge page.
 * @page: The head page of a huge page.
 *
 * The page is reinterpreted as a folio and folio_nr_pages() is applied.
 */
static inline long thp_nr_pages(struct page *page)
{
        const struct folio *folio = (struct folio *)page;

        return folio_nr_pages(folio);
}

/**
 * folio_next - Move to the next physical folio.
 * @folio: The folio we're currently operating on.
 *
 * Use this to step from one folio to the next when walking physically
 * contiguous memory that may span several folios (eg a &struct bio_vec).
 * It must not be used on memory that is only virtually contiguous, since
 * those folios are almost certainly not adjacent to each other.  This is
 * the folio equivalent of writing ``page++``.
 *
 * Context: We assume that the folios are refcounted and/or locked at a
 * higher level and do not adjust the reference counts.
 * Return: The next struct folio.
 */
static inline struct folio *folio_next(struct folio *folio)
{
        long nr_pages = folio_nr_pages(folio);

        /* The page one past this folio's last page starts the next folio. */
        return (struct folio *)folio_page(folio, nr_pages);
}

/**
 * folio_shift - The size of the memory described by this folio.
 * @folio: The folio.
 *
 * The number of bytes a folio represents is always a power of two; this
 * function reports which one, as the folio order on top of PAGE_SHIFT.
 * See also folio_size() and folio_order().
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The base-2 logarithm of the size of this folio.
 */
static inline unsigned int folio_shift(const struct folio *folio)
{
        return folio_order(folio) + PAGE_SHIFT;
}

/**
 * folio_size - The number of bytes in a folio.
 * @folio: The folio.
 *
 * Computed as PAGE_SIZE shifted left by the folio order.
 *
 * Context: The caller should have a reference on the folio to prevent
 * it from being split.  It is not necessary for the folio to be locked.
 * Return: The number of bytes in this folio.
 */
static inline size_t folio_size(const struct folio *folio)
{
        const unsigned int order = folio_order(folio);

        return PAGE_SIZE << order;
}

/**
 * folio_maybe_mapped_shared - Whether the folio is mapped into the page
 *                               tables of more than one MM
 * @folio: The folio.
 *
 * This function checks whether the folio may currently be mapped into more
 * than one MM ("maybe mapped shared"), or whether it is certainly mapped into
 * a single MM ("mapped exclusively").
 *
 * For KSM folios, this function also returns "mapped shared" when a folio is
 * mapped multiple times into the same MM, because the individual page mappings
 * are independent.
 *
 * For small anonymous folios and anonymous hugetlb folios, the return
 * value will be exactly correct: non-KSM folios can only be mapped at most once
 * into an MM, and they cannot be partially mapped. KSM folios are
 * considered shared even if mapped multiple times into the same MM.
 *
 * For other folios, the result can be fuzzy:
 *    #. For partially-mappable large folios (THP), the return value can wrongly
 *       indicate "mapped shared" (false positive) if a folio was mapped by
 *       more than two MMs at one point in time.
 *    #. For pagecache folios (including hugetlb), the return value can wrongly
 *       indicate "mapped shared" (false positive) when two VMAs in the same MM
 *       cover the same file range.
 *
 * Further, this function only considers current page table mappings that
 * are tracked using the folio mapcount(s).
 *
 * This function does not consider:
 *    #. If the folio might get mapped in the (near) future (e.g., swapcache,
 *       pagecache, temporary unmapping for migration).
 *    #. If the folio is mapped differently (VM_PFNMAP).
 *    #. If hugetlb page table sharing applies. Callers might want to check
 *       hugetlb_pmd_shared().
 *
 * Return: Whether the folio is estimated to be mapped into more than one MM.
 */
static inline bool folio_maybe_mapped_shared(struct folio *folio)
{
        const int nr_mappings = folio_mapcount(folio);

        /*
         * Small folios and hugetlb folios need no extra care: the
         * mapcount alone decides.
         */
        if (!folio_test_large(folio) || unlikely(folio_test_hugetlb(folio)))
                return nr_mappings > 1;

        /*
         * Without CONFIG_MM_ID (vm_insert_page() without
         * CONFIG_TRANSPARENT_HUGEPAGE ...) simply assume "mapped shared";
         * nobody should really care about this for arbitrary kernel
         * allocations.
         */
        if (!IS_ENABLED(CONFIG_MM_ID))
                return true;

        /*
         * At most one mapping implies "mapped exclusively", even if the
         * folio flag says something different: handling that case here is
         * easier than on the RMAP hot path.  With more than one mapping,
         * the folio flag decides.
         */
        return nr_mappings > 1 &&
               folio_test_large_maybe_mapped_shared(folio);
}

/*
 * Default used when HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE is not defined:
 * does nothing with @folio and returns 0.
 */
#ifndef HAVE_ARCH_MAKE_FOLIO_ACCESSIBLE
static inline int arch_make_folio_accessible(struct folio *folio)
{
        return 0;
}
#endif

/*
 * Some inline functions in vmstat.h depend on page_zone()
 */
#include <linux/vmstat.h>

/*
 * HASHED_PAGE_VIRTUAL is defined when CONFIG_HIGHMEM is enabled and
 * WANT_PAGE_VIRTUAL is not.
 */
#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
#define HASHED_PAGE_VIRTUAL
#endif

/*
 * WANT_PAGE_VIRTUAL: the address is kept in page->virtual, so
 * page_address() reads that field, set_page_address() writes it, and
 * page_address_init() expands to an empty statement.
 */
#if defined(WANT_PAGE_VIRTUAL)
static inline void *page_address(const struct page *page)
{
        return page->virtual;
}
static inline void set_page_address(struct page *page, void *address)
{
        page->virtual = address;
}
#define page_address_init()  do { } while(0)
#endif

/*
 * HASHED_PAGE_VIRTUAL: page_address(), set_page_address() and
 * page_address_init() are only declared here; their definitions are not
 * part of this header section.
 */
#if defined(HASHED_PAGE_VIRTUAL)
void *page_address(const struct page *page);
void set_page_address(struct page *page, void *virtual);
void page_address_init(void);
#endif

/* Address of @page as computed by page_to_virt(). */
static __always_inline void *lowmem_page_address(const struct page *page)
{
        return page_to_virt(page);
}

/*
 * Neither HASHED_PAGE_VIRTUAL nor WANT_PAGE_VIRTUAL: page_address() maps
 * straight to lowmem_page_address(), and set_page_address() and
 * page_address_init() expand to empty statements.
 */
#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
#define page_address(page) lowmem_page_address(page)
#define set_page_address(page, address)  do { } while(0)
#define page_address_init()  do { } while(0)
#endif

/* page_address() of the page embedded in @folio (&folio->page). */
static inline void *folio_address(const struct folio *folio)
{
        return page_address(&folio->page);
}

/*
 * Return true only if the page has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool page_is_pfmemalloc(const struct page *page)
{
        /*
         * lru.next has bit 1 set if the page is allocated from the
         * pfmemalloc reserves.  Callers may simply overwrite it if
         * they do not need to preserve that information.
         */
        return (uintptr_t)page->lru.next & BIT(1);
}

/*
 * Return true only if the folio has been allocated with
 * ALLOC_NO_WATERMARKS and the low watermark was not
 * met implying that the system is under some pressure.
 */
static inline bool folio_is_pfmemalloc(const struct folio *folio)
{
        /*
         * lru.next has bit 1 set if the page is allocated from the
         * pfmemalloc reserves.  Callers may simply overwrite it if
         * they do not need to preserve that information.
         */
        return (uintptr_t)folio->lru.next & BIT(1);
}

/*
 * Only to be called by the page allocator on a freshly allocated
 * page.
 *
 * Overwrites page->lru.next with a value that has only BIT(1) set, which
 * is the bit page_is_pfmemalloc() tests.
 */
static inline void set_page_pfmemalloc(struct page *page)
{
        page->lru.next = (void *)BIT(1);
}

/*
 * Reset page->lru.next to NULL, which clears the bit that
 * page_is_pfmemalloc() tests.
 */
static inline void clear_page_pfmemalloc(struct page *page)
{
        page->lru.next = NULL;
}

/*
 * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
 */
extern void pagefault_out_of_memory(void);

/*
 * Offset of address @p within its page, huge page or folio.  Each macro
 * casts @p to unsigned long and masks it: with ~PAGE_MASK for a page, and
 * with the thp_size()/folio_size() value minus one for the other two.
 */
#define offset_in_page(p)        ((unsigned long)(p) & ~PAGE_MASK)
#define offset_in_thp(page, p)        ((unsigned long)(p) & (thp_size(page) - 1))
#define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1))

/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 * zap_page_range_single() takes a pointer to one of these; a NULL pointer
 * is what zap_vma_pages() passes.
 */
struct zap_details {
        struct folio *single_folio;        /* Locked folio to be unmapped */
        bool even_cows;                        /* Zap COWed private pages too? */
        bool reclaim_pt;                /* Need reclaim page tables? */
        zap_flags_t zap_flags;                /* Extra flags for zapping */
};

/*
 * Whether to drop the pte markers, for example, the uffd-wp information for
 * file-backed memory.  This should only be specified when we will completely
 * drop the page in the mm, either by truncation or unmapping of the vma.  By
 * default, the flag is not set.
 */
#define  ZAP_FLAG_DROP_MARKER        ((__force zap_flags_t) BIT(0))
/* Set in unmap_vmas() to indicate a final unmap call.  Only used by hugetlb */
#define  ZAP_FLAG_UNMAP              ((__force zap_flags_t) BIT(1))

/*
 * With CONFIG_SCHED_MM_CID the sched_mm_cid_*() hooks are declared here
 * and task_mm_cid() returns t->mm_cid.  Without it the hooks are empty
 * inline stubs and task_mm_cid() falls back to the processor id.
 */
#ifdef CONFIG_SCHED_MM_CID
void sched_mm_cid_before_execve(struct task_struct *t);
void sched_mm_cid_after_execve(struct task_struct *t);
void sched_mm_cid_fork(struct task_struct *t);
void sched_mm_cid_exit_signals(struct task_struct *t);
static inline int task_mm_cid(struct task_struct *t)
{
        return t->mm_cid;
}
#else
static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
static inline void sched_mm_cid_fork(struct task_struct *t) { }
static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
static inline int task_mm_cid(struct task_struct *t)
{
        /*
         * Use the processor id as a fall-back when the mm cid feature is
         * disabled. This provides functional per-cpu data structure accesses
         * in user-space, although it won't provide the memory usage benefits.
         */
        return raw_smp_processor_id();
}
#endif

/*
 * can_do_mlock() is declared for CONFIG_MMU builds; without CONFIG_MMU it
 * is an inline stub that always returns false.
 */
#ifdef CONFIG_MMU
extern bool can_do_mlock(void);
#else
static inline bool can_do_mlock(void) { return false; }
#endif
extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct ucounts *);

struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
struct folio *vm_normal_folio_pmd(struct vm_area_struct *vma,
                                  unsigned long addr, pmd_t pmd);
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
                                pmd_t pmd);

void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
                  unsigned long size);
void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
                           unsigned long size, struct zap_details *details);
/*
 * Zap the whole range covered by @vma, i.e. [vm_start, vm_end), by calling
 * zap_page_range_single() without a zap_details block.
 */
static inline void zap_vma_pages(struct vm_area_struct *vma)
{
        unsigned long len = vma->vm_end - vma->vm_start;

        zap_page_range_single(vma, vma->vm_start, len, NULL);
}
void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas,
                struct vm_area_struct *start_vma, unsigned long start,
                unsigned long end, unsigned long tree_end, bool mm_wr_locked);

struct mmu_notifier_range;

void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                unsigned long end, unsigned long floor, unsigned long ceiling);
int
copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
                        void *buf, int len, int write);

/*
 * Argument block taken by follow_pfnmap_start() and follow_pfnmap_end().
 * It is split into caller-supplied inputs, internal state the caller must
 * not touch, and outputs.
 */
struct follow_pfnmap_args {
        /**
         * Inputs:
         * @vma: Pointer to @vm_area_struct struct
         * @address: the virtual address to walk
         */
        struct vm_area_struct *vma;
        unsigned long address;
        /**
         * Internals:
         *
         * The caller shouldn't touch any of these.
         */
        spinlock_t *lock;
        pte_t *ptep;
        /**
         * Outputs:
         *
         * @pfn: the PFN of the address
         * @addr_mask: address mask covering pfn
         * @pgprot: the pgprot_t of the mapping
         * @writable: whether the mapping is writable
         * @special: whether the mapping is a special mapping (real PFN maps)
         */
        unsigned long pfn;
        unsigned long addr_mask;
        pgprot_t pgprot;
        bool writable;
        bool special;
};
int follow_pfnmap_start(struct follow_pfnmap_args *args);
void follow_pfnmap_end(struct follow_pfnmap_args *args);

extern void truncate_pagecache(struct inode *inode, loff_t new);
extern void truncate_setsize(struct inode *inode, loff_t newsize);
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to);
void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end);
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio);

struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
                unsigned long address, struct pt_regs *regs);

/*
 * With CONFIG_MMU these four functions are only declared here.  Without
 * it they are inline stubs: handle_mm_fault() and fixup_user_fault() hit
 * BUG() (the return statements after it only satisfy the return type),
 * and the two unmap helpers do nothing.
 */
#ifdef CONFIG_MMU
extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                  unsigned long address, unsigned int flags,
                                  struct pt_regs *regs);
extern int fixup_user_fault(struct mm_struct *mm,
                            unsigned long address, unsigned int fault_flags,
                            bool *unlocked);
void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows);
void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows);
#else
static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma,
                                         unsigned long address, unsigned int flags,
                                         struct pt_regs *regs)
{
        /* should never happen if there's no MMU */
        BUG();
        return VM_FAULT_SIGBUS;
}
static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address,
                unsigned int fault_flags, bool *unlocked)
{
        /* should never happen if there's no MMU */
        BUG();
        return -EFAULT;
}
static inline void unmap_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t nr, bool even_cows) { }
static inline void unmap_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

/*
 * Wrapper around unmap_mapping_range() that forwards @mapping, @holebegin
 * and @holelen and fixes the even_cows argument to 0.
 */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
                loff_t const holebegin, loff_t const holelen)
{
        unmap_mapping_range(mapping, holebegin, holelen, 0);
}

static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm,
                                                unsigned long addr);

extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);
extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
                void *buf, int len, unsigned int gup_flags);

#ifdef CONFIG_BPF_SYSCALL
extern int copy_remote_vm_str(struct task_struct *tsk, unsigned long addr,
                              void *buf, int len, unsigned int gup_flags);
#endif

long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked);

/*
 * Retrieves a single page alongside its VMA. Does not support FOLL_NOWAIT.
 *
 * On success the page is returned and *@vmap is set to the VMA found for
 * @addr.  On failure an ERR_PTR() is returned and *@vmap is left untouched:
 * -EINVAL if FOLL_NOWAIT was passed or no VMA is found (the page reference
 * is dropped again in the latter case), or the negative value reported by
 * get_user_pages_remote().
 */
static inline struct page *get_user_page_vma_remote(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    int gup_flags,
                                                    struct vm_area_struct **vmap)
{
        struct vm_area_struct *found;
        struct page *page;
        int ret;

        if (WARN_ON_ONCE(unlikely(gup_flags & FOLL_NOWAIT)))
                return ERR_PTR(-EINVAL);

        ret = get_user_pages_remote(mm, addr, 1, gup_flags, &page, NULL);
        if (ret < 0)
                return ERR_PTR(ret);

        found = vma_lookup(mm, addr);
        if (!WARN_ON_ONCE(!found)) {
                *vmap = found;
                return page;
        }

        /* No VMA for a page we just got: give the reference back. */
        put_page(page);
        return ERR_PTR(-EINVAL);
}

long get_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                    struct page **pages, unsigned int gup_flags);
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
                      struct folio **folios, unsigned int max_folios,
                      pgoff_t *offset);
int folio_add_pins(struct folio *folio, unsigned int pins);

int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages);
void folio_add_pin(struct folio *folio);

int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc);
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim);

struct kvec;
struct page *get_dump_page(unsigned long addr, int *locked);

bool folio_mark_dirty(struct folio *folio);
bool folio_mark_dirty_lock(struct folio *folio);
bool set_page_dirty(struct page *page);
int set_page_dirty_lock(struct page *page);

int get_cmdline(struct task_struct *task, char *buffer, int buflen);

/*
 * Flags used by change_protection().  For now we make it a bitmap so
 * that we can pass in multiple flags just like parameters.  However,
 * for now all the callers only use one of the flags at a time.
 */
/*
 * Whether we should manually check if we can map individual PTEs writable,
 * because something (e.g., COW, uffd-wp) blocks that from happening for all
 * PTEs automatically in a writable mapping.
 */
#define  MM_CP_TRY_CHANGE_WRITABLE           (1UL << 0)
/* Whether this protection change is for NUMA hints */
#define  MM_CP_PROT_NUMA                   (1UL << 1)
/* Whether this change is for write protecting */
#define  MM_CP_UFFD_WP                     (1UL << 2) /* do wp */
#define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
#define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)

bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
extern long change_protection(struct mmu_gather *tlb,
                              struct vm_area_struct *vma, unsigned long start,
                              unsigned long end, unsigned long cp_flags);
extern int mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
          struct vm_area_struct *vma, struct vm_area_struct **pprev,
          unsigned long start, unsigned long end, unsigned long newflags);

/*
 * doesn't attempt to fault and will return short.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages);

/*
 * Single-page form of get_user_pages_fast_only(): true only if exactly
 * one page was returned through @pagep.
 */
static inline bool get_user_page_fast_only(unsigned long addr,
                        unsigned int gup_flags, struct page **pagep)
{
        int nr_got = get_user_pages_fast_only(addr, 1, gup_flags, pagep);

        return nr_got == 1;
}
/*
 * per-process(per-mm_struct) statistics.
 *
 * get_mm_counter() reads the mm->rss_stat entry selected by @member using
 * percpu_counter_read_positive().
 */
static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
{
        return percpu_counter_read_positive(&mm->rss_stat[member]);
}

void mm_trace_rss_stat(struct mm_struct *mm, int member);

/*
 * Add @value to the mm->rss_stat entry selected by @member, then call
 * mm_trace_rss_stat() for that member.
 */
static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
{
        percpu_counter_add(&mm->rss_stat[member], value);

        mm_trace_rss_stat(mm, member);
}

/*
 * Increment the mm->rss_stat entry selected by @member, then call
 * mm_trace_rss_stat() for that member.
 */
static inline void inc_mm_counter(struct mm_struct *mm, int member)
{
        percpu_counter_inc(&mm->rss_stat[member]);

        mm_trace_rss_stat(mm, member);
}

/*
 * Decrement the mm->rss_stat entry selected by @member, then call
 * mm_trace_rss_stat() for that member.
 */
static inline void dec_mm_counter(struct mm_struct *mm, int member)
{
        percpu_counter_dec(&mm->rss_stat[member]);

        mm_trace_rss_stat(mm, member);
}

/*
 * Optimized variant when folio is already known not to be anon:
 * swap-backed folios count as MM_SHMEMPAGES, all others as MM_FILEPAGES.
 */
static inline int mm_counter_file(struct folio *folio)
{
        return folio_test_swapbacked(folio) ? MM_SHMEMPAGES : MM_FILEPAGES;
}

/*
 * Select the counter index for @folio: MM_ANONPAGES for anon folios,
 * otherwise whatever mm_counter_file() picks.
 */
static inline int mm_counter(struct folio *folio)
{
        return folio_test_anon(folio) ? MM_ANONPAGES : mm_counter_file(folio);
}

/* Sum of the MM_FILEPAGES, MM_ANONPAGES and MM_SHMEMPAGES counters of @mm. */
static inline unsigned long get_mm_rss(struct mm_struct *mm)
{
        unsigned long rss;

        rss = get_mm_counter(mm, MM_FILEPAGES);
        rss += get_mm_counter(mm, MM_ANONPAGES);
        rss += get_mm_counter(mm, MM_SHMEMPAGES);

        return rss;
}

/* The larger of the recorded mm->hiwater_rss and the current get_mm_rss(). */
static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
{
        return max(mm->hiwater_rss, get_mm_rss(mm));
}

/* The larger of the recorded mm->hiwater_vm and the current mm->total_vm. */
static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
{
        return max(mm->hiwater_vm, mm->total_vm);
}

/* Raise mm->hiwater_rss to the current RSS if the current RSS is larger. */
static inline void update_hiwater_rss(struct mm_struct *mm)
{
        unsigned long rss_now = get_mm_rss(mm);

        if (mm->hiwater_rss < rss_now)
                mm->hiwater_rss = rss_now;
}

/* Raise mm->hiwater_vm to mm->total_vm if total_vm is larger. */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
        if (mm->hiwater_vm < mm->total_vm)
                mm->hiwater_vm = mm->total_vm;
}

/* Overwrite mm->hiwater_rss with the current get_mm_rss() value. */
static inline void reset_mm_hiwater_rss(struct mm_struct *mm)
{
        mm->hiwater_rss = get_mm_rss(mm);
}

/*
 * Raise *@maxrss to the RSS high-water mark of @mm (as reported by
 * get_mm_hiwater_rss()) if that mark is larger; otherwise leave it alone.
 */
static inline void setmax_mm_hiwater_rss(unsigned long *maxrss,
                                         struct mm_struct *mm)
{
        unsigned long peak = get_mm_hiwater_rss(mm);

        if (peak > *maxrss)
                *maxrss = peak;
}

/*
 * Fallbacks when CONFIG_ARCH_HAS_PTE_SPECIAL is not set: pte_special()
 * always returns 0 and pte_mkspecial() returns its argument unchanged.
 */
#ifndef CONFIG_ARCH_HAS_PTE_SPECIAL
static inline int pte_special(pte_t pte)
{
        return 0;
}

static inline pte_t pte_mkspecial(pte_t pte)
{
        return pte;
}
#endif

/*
 * Fallbacks when CONFIG_ARCH_SUPPORTS_PMD_PFNMAP is not set: pmd_special()
 * always returns false and pmd_mkspecial() returns its argument unchanged.
 */
#ifndef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
static inline bool pmd_special(pmd_t pmd)
{
        return false;
}

static inline pmd_t pmd_mkspecial(pmd_t pmd)
{
        return pmd;
}
#endif        /* CONFIG_ARCH_SUPPORTS_PMD_PFNMAP */

/*
 * Fallbacks when CONFIG_ARCH_SUPPORTS_PUD_PFNMAP is not set: pud_special()
 * always returns false and pud_mkspecial() returns its argument unchanged.
 */
#ifndef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
static inline bool pud_special(pud_t pud)
{
        return false;
}

static inline pud_t pud_mkspecial(pud_t pud)
{
        return pud;
}
#endif        /* CONFIG_ARCH_SUPPORTS_PUD_PFNMAP */

/*
 * Fallback when CONFIG_ARCH_HAS_PTE_DEVMAP is not set: pte_devmap()
 * always returns 0.
 */
#ifndef CONFIG_ARCH_HAS_PTE_DEVMAP
static inline int pte_devmap(pte_t pte)
{
        return 0;
}
#endif

extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
                               spinlock_t **ptl);
/*
 * Inline wrapper that returns whatever __get_locked_pte() returns.  The
 * call is wrapped in __cond_lock() naming *@ptl.
 * NOTE(review): __cond_lock() is presumably a lock-context annotation for
 * static checkers; its definition is not visible here - confirm.
 */
static inline pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
                                    spinlock_t **ptl)
{
        pte_t *ptep;
        __cond_lock(*ptl, ptep = __get_locked_pte(mm, addr, ptl));
        return ptep;
}

/*
 * With __PAGETABLE_P4D_FOLDED, __p4d_alloc() is an inline stub that does
 * nothing and returns 0; otherwise it is only declared here.
 */
#ifdef __PAGETABLE_P4D_FOLDED
static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                                                unsigned long address)
{
        return 0;
}
#else
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
#endif

#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
/*
 * Stubs for the branch taken when __PAGETABLE_PUD_FOLDED is defined or
 * CONFIG_MMU is not: __pud_alloc() returns 0 without doing anything and
 * the PUD accounting helpers are empty.
 */
static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                                                unsigned long address)
{
        return 0;
}
static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
static inline void mm_dec_nr_puds(struct mm_struct *mm) {}

#else
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);

/*
 * Add the size of one PUD table (PTRS_PER_PUD entries of pud_t) to
 * mm->pgtables_bytes, unless mm_pud_folded() reports true for @mm.
 */
static inline void mm_inc_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_add(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}

/*
 * Subtract the size of one PUD table (PTRS_PER_PUD entries of pud_t) from
 * mm->pgtables_bytes, unless mm_pud_folded() reports true for @mm.
 */
static inline void mm_dec_nr_puds(struct mm_struct *mm)
{
        if (!mm_pud_folded(mm))
                atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t),
                                &mm->pgtables_bytes);
}
#endif

#if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
/*
 * Stubs for the branch taken when __PAGETABLE_PMD_FOLDED is defined or
 * CONFIG_MMU is not: __pmd_alloc() returns 0 without doing anything and
 * the PMD accounting helpers are empty.
 */
static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
                                                unsigned long address)
{
        return 0;
}

static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}

#else
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);

/*
 * Add the size of one PMD table (PTRS_PER_PMD entries of pmd_t) to
 * mm->pgtables_bytes, unless mm_pmd_folded() reports true for @mm.
 */
static inline void mm_inc_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}

/*
 * Subtract the size of one PMD table (PTRS_PER_PMD entries of pmd_t) from
 * mm->pgtables_bytes, unless mm_pmd_folded() reports true for @mm.
 */
static inline void mm_dec_nr_pmds(struct mm_struct *mm)
{
        if (!mm_pmd_folded(mm))
                atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t),
                                &mm->pgtables_bytes);
}
#endif

/*
 * Accounting of page-table memory in mm->pgtables_bytes.
 *
 * With CONFIG_MMU the counter is an atomic long: it is initialised to 0,
 * read with atomic_long_read(), and adjusted by the size of one PTE table
 * (PTRS_PER_PTE entries of pte_t) per mm_inc_nr_ptes()/mm_dec_nr_ptes()
 * call.  Without CONFIG_MMU all helpers are no-ops and the byte count
 * reads as 0.
 */
#ifdef CONFIG_MMU
static inline void mm_pgtables_bytes_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->pgtables_bytes, 0);
}

static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return atomic_long_read(&mm->pgtables_bytes);
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm)
{
        atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}

static inline void mm_dec_nr_ptes(struct mm_struct *mm)
{
        atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes);
}
#else

static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {}
static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm)
{
        return 0;
}

static inline void mm_inc_nr_ptes(struct mm_struct *mm) {}
static inline void mm_dec_nr_ptes(struct mm_struct *mm) {}
#endif

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd);
int __pte_alloc_kernel(pmd_t *pmd);

#if defined(CONFIG_MMU)

/*
 * Return the P4D entry for @address below @pgd.  If *@pgd is none,
 * __p4d_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
                unsigned long address)
{
        if (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address))
                return NULL;

        return p4d_offset(pgd, address);
}

/*
 * Return the PUD entry for @address below @p4d.  If *@p4d is none,
 * __pud_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                unsigned long address)
{
        if (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address))
                return NULL;

        return pud_offset(p4d, address);
}

/*
 * Return the PMD entry for @address below @pud.  If *@pud is none,
 * __pmd_alloc() is called first; a non-zero result from it makes this
 * function return NULL.
 */
static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
        if (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))
                return NULL;

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/* Page table descriptor for address @x: virt_to_page() then page_ptdesc(). */
static inline struct ptdesc *virt_to_ptdesc(const void *x)
{
        return page_ptdesc(virt_to_page(x));
}

/* Inverse direction: ptdesc_page() of @pt, converted with page_to_virt(). */
static inline void *ptdesc_to_virt(const struct ptdesc *pt)
{
        return page_to_virt(ptdesc_page(pt));
}

/* folio_address() of the folio that ptdesc_folio() yields for @pt. */
static inline void *ptdesc_address(const struct ptdesc *pt)
{
        return folio_address(ptdesc_folio(pt));
}

/* Result of folio_test_reserved() on the folio behind @pt. */
static inline bool pagetable_is_reserved(struct ptdesc *pt)
{
        return folio_test_reserved(ptdesc_folio(pt));
}

/**
 * pagetable_alloc - Allocate pagetables
 * @gfp:    GFP flags
 * @order:  desired pagetable order
 *
 * pagetable_alloc allocates memory for page tables as well as a page table
 * descriptor to describe that memory.  The pages are requested with
 * __GFP_COMP added to @gfp.
 *
 * Return: The ptdesc describing the allocated page tables.
 */
static inline struct ptdesc *pagetable_alloc_noprof(gfp_t gfp, unsigned int order)
{
        return page_ptdesc(alloc_pages_noprof(gfp | __GFP_COMP, order));
}
#define pagetable_alloc(...)        alloc_hooks(pagetable_alloc_noprof(__VA_ARGS__))

/**
 * pagetable_free - Free pagetables
 * @pt:        The page table descriptor
 *
 * pagetable_free frees the memory of all page tables described by a page
 * table descriptor and the memory for the descriptor itself.  The order
 * passed to __free_pages() is the compound order of the underlying page.
 */
static inline void pagetable_free(struct ptdesc *pt)
{
        struct page *pt_page = ptdesc_page(pt);

        __free_pages(pt_page, compound_order(pt_page));
}

#if defined(CONFIG_SPLIT_PTE_PTLOCKS)
/*
 * With ALLOC_SPLIT_PTLOCKS, ptdesc->ptl is itself the spinlock pointer that
 * ptlock_ptr() returns, and ptlock_cache_init(), ptlock_alloc() and
 * ptlock_free() are only declared here.  Otherwise ptlock_ptr() returns the
 * address of ptdesc->ptl, ptlock_alloc() always succeeds, and the cache
 * init and free helpers are empty.
 */
#if ALLOC_SPLIT_PTLOCKS
void __init ptlock_cache_init(void);
bool ptlock_alloc(struct ptdesc *ptdesc);
void ptlock_free(struct ptdesc *ptdesc);

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return ptdesc->ptl;
}
#else /* ALLOC_SPLIT_PTLOCKS */
static inline void ptlock_cache_init(void)
{
}

static inline bool ptlock_alloc(struct ptdesc *ptdesc)
{
        return true;
}

static inline void ptlock_free(struct ptdesc *ptdesc)
{
}

static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc)
{
        return &ptdesc->ptl;
}
#endif /* ALLOC_SPLIT_PTLOCKS */

/*
 * Lock pointer of the ptdesc belonging to pmd_page(*@pmd).  @mm is not
 * used in this variant.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return ptlock_ptr(page_ptdesc(pmd_page(*pmd)));
}

/*
 * Lock pointer of the ptdesc that virt_to_ptdesc() resolves for @pte.
 * The build fails if CONFIG_HIGHPTE is enabled or if MAX_PTRS_PER_PTE
 * entries of pte_t would not fit in PAGE_SIZE.  @mm is not used in this
 * variant.
 */
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
        BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE));
        BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE);
        return ptlock_ptr(virt_to_ptdesc(pte));
}

/*
 * Set up the split page table lock of @ptdesc.
 *
 * Return: false if ptlock_alloc() fails, true once the lock has been
 * initialised.
 */
static inline bool ptlock_init(struct ptdesc *ptdesc)
{
        /*
         * prep_new_page() initializes page->private (and therefore
         * page->ptl) with 0.  Make sure nobody started using it in the
         * meantime.
         *
         * That can happen if an arch tries to use slab for page table
         * allocation: slab code uses page->slab_cache, which shares storage
         * with page->ptl.
         */
        VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc));

        if (ptlock_alloc(ptdesc)) {
                spin_lock_init(ptlock_ptr(ptdesc));
                return true;
        }

        return false;
}

#else        /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */
/*
 * We use mm->page_table_lock to guard all pagetable pages of the mm.
 *
 * Both lock lookups therefore return &mm->page_table_lock and ignore their
 * second argument; ptlock_cache_init() and ptlock_free() are empty and
 * ptlock_init() always reports success.
 */
static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        return &mm->page_table_lock;
}
static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}
static inline void ptlock_cache_init(void) {}
static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; }
static inline void ptlock_free(struct ptdesc *ptdesc) {}
#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */

/*
 * Mark the folio behind @ptdesc as a page table (__folio_set_pgtable())
 * and add it to the NR_PAGETABLE lruvec statistic.
 */
static inline void __pagetable_ctor(struct ptdesc *ptdesc)
{
        struct folio *folio = ptdesc_folio(ptdesc);

        __folio_set_pgtable(folio);
        lruvec_stat_add_folio(folio, NR_PAGETABLE);
}

/*
 * Tear down what the constructors set up: free the page table lock, clear
 * the page-table marking on the folio behind @ptdesc, and subtract it from
 * the NR_PAGETABLE lruvec statistic.  The memory itself is not freed here.
 */
static inline void pagetable_dtor(struct ptdesc *ptdesc)
{
        struct folio *folio = ptdesc_folio(ptdesc);

        ptlock_free(ptdesc);
        __folio_clear_pgtable(folio);
        lruvec_stat_sub_folio(folio, NR_PAGETABLE);
}

/* Run pagetable_dtor() on @ptdesc and then release it with pagetable_free(). */
static inline void pagetable_dtor_free(struct ptdesc *ptdesc)
{
        pagetable_dtor(ptdesc);
        pagetable_free(ptdesc);
}

/*
 * Constructor for a PTE-level page table: initialise its lock, then run
 * the common __pagetable_ctor().
 *
 * Return: false if ptlock_init() fails (the common constructor is then
 * not run), true otherwise.
 */
static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc)
{
        if (ptlock_init(ptdesc)) {
                __pagetable_ctor(ptdesc);
                return true;
        }

        return false;
}

pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp);
/*
 * Inline wrapper returning the result of ___pte_offset_map().  The call is
 * wrapped in __cond_lock() with RCU as the named context.
 * NOTE(review): presumably a static-checker annotation that RCU is held
 * when a non-NULL pointer is returned - confirm against __cond_lock().
 */
static inline pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr,
                        pmd_t *pmdvalp)
{
        pte_t *pte;

        __cond_lock(RCU, pte = ___pte_offset_map(pmd, addr, pmdvalp));
        return pte;
}
/* __pte_offset_map() with a NULL @pmdvalp. */
static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr)
{
        return __pte_offset_map(pmd, addr, NULL);
}

pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp);
/*
 * Inline wrapper returning the result of __pte_offset_map_lock().  The
 * call is wrapped in nested __cond_lock() invocations naming RCU and
 * *@ptlp.
 * NOTE(review): presumably static-checker annotations for the two contexts
 * held on success - confirm against __cond_lock().
 */
static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                        unsigned long addr, spinlock_t **ptlp)
{
        pte_t *pte;

        __cond_lock(RCU, __cond_lock(*ptlp,
                        pte = __pte_offset_map_lock(mm, pmd, addr, ptlp)));
        return pte;
}

pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, spinlock_t **ptlp);
pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, pmd_t *pmdvalp,
                                spinlock_t **ptlp);

/*
 * Release @ptl with spin_unlock() and then pte_unmap() @pte, in that order.
 * Expands to a single statement (do { } while (0)).
 */
#define pte_unmap_unlock(pte, ptl)        do {                \
        spin_unlock(ptl);                                \
        pte_unmap(pte);                                        \
} while (0)

/*
 * pte_alloc() - call __pte_alloc() only when *@pmd is none.
 * Evaluates to non-zero only if *@pmd was none and __pte_alloc() returned
 * non-zero; evaluates to zero otherwise (including when *@pmd was already
 * populated, in which case __pte_alloc() is not called).
 */
#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

/*
 * pte_alloc_map() - pte_alloc() followed by pte_offset_map().
 * Evaluates to NULL when pte_alloc() is non-zero, otherwise to the result
 * of pte_offset_map(@pmd, @address).
 */
#define pte_alloc_map(mm, pmd, address)                        \
        (pte_alloc(mm, pmd) ? NULL : pte_offset_map(pmd, address))

/*
 * pte_alloc_map_lock() - pte_alloc() followed by pte_offset_map_lock().
 * Evaluates to NULL when pte_alloc() is non-zero, otherwise to the result
 * of pte_offset_map_lock(@mm, @pmd, @address, @ptlp).
 */
#define pte_alloc_map_lock(mm, pmd, address, ptlp)        \
        (pte_alloc(mm, pmd) ?                        \
                 NULL : pte_offset_map_lock(mm, pmd, address, ptlp))

/*
 * pte_alloc_kernel() - same pattern using __pte_alloc_kernel() and
 * pte_offset_kernel(). Evaluates to NULL when *@pmd was none and
 * __pte_alloc_kernel() returned non-zero.
 */
#define pte_alloc_kernel(pmd, address)                        \
        ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd))? \
                NULL: pte_offset_kernel(pmd, address))

#if defined(CONFIG_SPLIT_PMD_PTLOCKS)

/*
 * Return the struct page of the PMD table that contains the entry @pmd.
 *
 * The entry's address is rounded down to a multiple of the table size
 * (PTRS_PER_PMD entries of sizeof(pmd_t) bytes each) and the result is
 * passed to virt_to_page().
 * NOTE(review): this relies on PMD tables being aligned to their own size.
 */
static inline struct page *pmd_pgtable_page(pmd_t *pmd)
{
        unsigned long mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
        unsigned long table_base = (unsigned long) pmd & mask;

        return virt_to_page((void *)table_base);
}

/*
 * Return the page-table descriptor of the PMD table containing @pmd, i.e.
 * page_ptdesc() applied to pmd_pgtable_page(@pmd).
 */
static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd)
{
        struct page *table_page = pmd_pgtable_page(pmd);

        return page_ptdesc(table_page);
}

/*
 * With CONFIG_SPLIT_PMD_PTLOCKS the lock for @pmd is the ptlock of the
 * descriptor of the PMD table that contains it; @mm is not consulted.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *table_desc = pmd_ptdesc(pmd);

        return ptlock_ptr(table_desc);
}

/*
 * Prepare the lock state of a PMD-table descriptor.
 *
 * With CONFIG_TRANSPARENT_HUGEPAGE the descriptor's pmd_huge_pte field is
 * reset to NULL first. The return value is that of ptlock_init().
 */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        ptdesc->pmd_huge_pte = NULL;
#endif
        return ptlock_init(ptdesc);
}

/*
 * With split PMD locks, pmd_huge_pte lives in the descriptor of the PMD
 * table containing @pmd; the @mm argument is not used by this expansion.
 */
#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte)

#else

/*
 * Without CONFIG_SPLIT_PMD_PTLOCKS every PMD entry of the mm is covered by
 * mm->page_table_lock; @pmd is not consulted.
 */
static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *ptl = &mm->page_table_lock;

        return ptl;
}

/* Nothing to initialise per PMD table in this configuration: always succeeds. */
static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; }

/* Here pmd_huge_pte is a field of the mm itself; @pmd is not used. */
#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte)

#endif

/*
 * Take the lock that covers @pmd (as chosen by pmd_lockptr()) with
 * spin_lock() and return it, so the caller can release it later.
 */
static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd)
{
        spinlock_t *lock = pmd_lockptr(mm, pmd);

        spin_lock(lock);

        return lock;
}

/*
 * Constructor for a PMD-level page table.
 *
 * Runs pmd_ptlock_init(); only if that succeeds does it go on to call
 * ptdesc_pmd_pts_init() and the common __pagetable_ctor() step.
 *
 * Return: true on success, false if pmd_ptlock_init() failed.
 */
static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc)
{
        if (pmd_ptlock_init(ptdesc)) {
                ptdesc_pmd_pts_init(ptdesc);
                __pagetable_ctor(ptdesc);
                return true;
        }

        return false;
}

/*
 * There is no scalability reason to split PUD locks yet, but the helpers
 * mirror the PMD ones so that a later switch is easy.  The VM should not be
 * considered ready for split PUD locks: some users of page_table_lock may
 * still need converting.
 *
 * For now the lock covering any @pud is mm->page_table_lock.
 */
static inline spinlock_t *pud_lockptr(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *ptl = &mm->page_table_lock;

        return ptl;
}

/*
 * Take the lock that covers @pud (as chosen by pud_lockptr()) with
 * spin_lock() and return it, so the caller can release it later.
 */
static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud)
{
        spinlock_t *lock = pud_lockptr(mm, pud);

        spin_lock(lock);

        return lock;
}

/*
 * Constructor for a PUD-level page table: only the common
 * __pagetable_ctor() step is needed, so this cannot fail.
 */
static inline void pagetable_pud_ctor(struct ptdesc *ptdesc)
{
        __pagetable_ctor(ptdesc);
}

/*
 * Constructor for a P4D-level page table: only the common
 * __pagetable_ctor() step is needed, so this cannot fail.
 */
static inline void pagetable_p4d_ctor(struct ptdesc *ptdesc)
{
        __pagetable_ctor(ptdesc);
}

/*
 * Constructor for a PGD-level page table: only the common
 * __pagetable_ctor() step is needed, so this cannot fail.
 */
static inline void pagetable_pgd_ctor(struct ptdesc *ptdesc)
{
        __pagetable_ctor(ptdesc);
}

extern void __init pagecache_init(void);
extern void free_initmem(void);

/*
 * Free reserved pages within range [PAGE_ALIGN(start), end & PAGE_MASK)
 * into the buddy system. The freed pages will be poisoned with pattern
 * "poison" if it's within range [0, UCHAR_MAX].
 * Return pages freed into the buddy system.
 */
extern unsigned long free_reserved_area(void *start, void *end,
                                        int poison, const char *s);

extern void adjust_managed_page_count(struct page *page, long count);

extern void reserve_bootmem_region(phys_addr_t start,
                                   phys_addr_t end, int nid);

/* Free the reserved page into the buddy system, so it gets managed. */
void free_reserved_page(struct page *page);

/*
 * Mark @page reserved (SetPageReserved()) and lower the managed page count
 * associated with it by one via adjust_managed_page_count().
 */
static inline void mark_page_reserved(struct page *page)
{
        SetPageReserved(page);
        adjust_managed_page_count(page, -1);
}

/*
 * Page-table-descriptor flavour of free_reserved_page(): converts @pt to
 * its struct page with ptdesc_page() and frees that page.
 */
static inline void free_reserved_ptdesc(struct ptdesc *pt)
{
        struct page *page = ptdesc_page(pt);

        free_reserved_page(page);
}

/*
 * Default way of releasing all __init memory into the buddy system: the
 * range [__init_begin, __init_end) is handed to free_reserved_area().
 * Freed pages are poisoned with @poison if it lies within [0, UCHAR_MAX].
 *
 * Return: the number of pages freed into the buddy system.
 */
static inline unsigned long free_initmem_default(int poison)
{
        extern char __init_begin[], __init_end[];
        unsigned long freed;

        freed = free_reserved_area(&__init_begin, &__init_end, poison,
                                   "unused kernel image (initmem)");
        return freed;
}

static inline unsigned long get_num_physpages(void)
{
        int nid;
        unsigned long phys_pages = 0;

        for_each_online_node(nid)
                phys_pages += node_present_pages(nid);

        return phys_pages;
}

/*
 * Using memblock node mappings, an architecture may initialise its
 * zones, allocate the backing mem_map and account for memory holes in an
 * architecture independent manner.
 *
 * An architecture is expected to register the ranges of page frames backed
 * by physical memory with memblock_add[_node]() before calling
 * free_area_init(), passing in the PFN at which each zone ends. In its most
 * basic usage, an architecture is expected to do something like
 *
 * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn,
 *                                                          max_highmem_pfn};
 * for_each_valid_physical_page_range()
 *        memblock_add_node(base, size, nid, MEMBLOCK_NONE)
 * free_area_init(max_zone_pfns);
 */
void free_area_init(unsigned long *max_zone_pfn);
unsigned long node_map_pfn_alignment(void);
extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
extern void get_pfn_range_for_nid(unsigned int nid,
                        unsigned long *start_pfn, unsigned long *end_pfn);

#ifndef CONFIG_NUMA
/* Without CONFIG_NUMA every pfn is reported as belonging to node 0. */
static inline int early_pfn_to_nid(unsigned long pfn)
{
        return 0;
}
#else
/* please see mm/page_alloc.c */
extern int __meminit early_pfn_to_nid(unsigned long pfn);
#endif

extern void mem_init(void);
extern void __init mmap_init(void);

extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx);
/*
 * Shorthand for __show_mem() with no flags, no nodemask and the highest
 * zone index (MAX_NR_ZONES - 1) as the upper bound.
 */
static inline void show_mem(void)
{
        const int last_zone_idx = MAX_NR_ZONES - 1;

        __show_mem(0, NULL, last_zone_idx);
}
extern long si_mem_available(void);
extern void si_meminfo(struct sysinfo * val);
extern void si_meminfo_node(struct sysinfo *val, int nid);

extern __printf(3, 4)
void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...);

extern void setup_per_cpu_pageset(void);

/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);

/* interval_tree.c */
void vma_interval_tree_insert(struct vm_area_struct *node,
                              struct rb_root_cached *root);
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root);
void vma_interval_tree_remove(struct vm_area_struct *node,
                              struct rb_root_cached *root);
struct vm_area_struct *vma_interval_tree_iter_first(struct rb_root_cached *root,
                                unsigned long start, unsigned long last);
struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
                                unsigned long start, unsigned long last);

/*
 * Loop header: assigns to @vma each result of
 * vma_interval_tree_iter_first()/_iter_next() for @root and the range
 * [@start, @last], stopping when the iterator returns NULL.
 * @vma, @start and @last are evaluated on every iteration.
 */
#define vma_interval_tree_foreach(vma, root, start, last)                \
        for (vma = vma_interval_tree_iter_first(root, start, last);        \
             vma; vma = vma_interval_tree_iter_next(vma, start, last))

void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root);
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long start, unsigned long last);
struct anon_vma_chain *anon_vma_interval_tree_iter_next(
        struct anon_vma_chain *node, unsigned long start, unsigned long last);
#ifdef CONFIG_DEBUG_VM_RB
void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
#endif

/*
 * Loop header: assigns to @avc each result of
 * anon_vma_interval_tree_iter_first()/_iter_next() for @root and the range
 * [@start, @last], stopping when the iterator returns NULL.
 * @avc, @start and @last are evaluated on every iteration.
 */
#define anon_vma_interval_tree_foreach(avc, root, start, last)                 \
        for (avc = anon_vma_interval_tree_iter_first(root, start, last); \
             avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))

/* mmap.c */
extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void exit_mmap(struct mm_struct *);
int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift);
bool mmap_read_lock_maybe_expand(struct mm_struct *mm, struct vm_area_struct *vma,
                                 unsigned long addr, bool write);

/*
 * Check a prospective data-segment size against a resource limit.
 *
 * The size checked is (@new - @start) + (@end_data - @start_data). A limit
 * of RLIM_INFINITY (or above) disables the check.
 *
 * Return: 0 if the size is within @rlim or the limit is infinite,
 * -ENOSPC if the size exceeds @rlim.
 */
static inline int check_data_rlimit(unsigned long rlim,
                                    unsigned long new,
                                    unsigned long start,
                                    unsigned long end_data,
                                    unsigned long start_data)
{
        unsigned long first_span;
        unsigned long data_span;

        if (rlim >= RLIM_INFINITY)
                return 0;

        first_span = new - start;
        data_span = end_data - start_data;

        return (first_span + data_span > rlim) ? -ENOSPC : 0;
}

extern int mm_take_all_locks(struct mm_struct *mm);
extern void mm_drop_all_locks(struct mm_struct *mm);

extern int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
extern struct file *get_mm_exe_file(struct mm_struct *mm);
extern struct file *get_task_exe_file(struct task_struct *task);

extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages);
extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages);

extern bool vma_is_special_mapping(const struct vm_area_struct *vma,
                                   const struct vm_special_mapping *sm);
extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm,
                                   unsigned long addr, unsigned long len,
                                   unsigned long flags,
                                   const struct vm_special_mapping *spec);

unsigned long randomize_stack_top(unsigned long stack_top);
unsigned long randomize_page(unsigned long start, unsigned long range);

unsigned long
__get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                    unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags);

/*
 * Shorthand for __get_unmapped_area() with a vm_flags argument of 0; all
 * other arguments are forwarded unchanged and the callee's result is
 * returned as is.
 */
static inline unsigned long
get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
                  unsigned long pgoff, unsigned long flags)
{
        unsigned long area;

        area = __get_unmapped_area(file, addr, len, pgoff, flags, 0);

        return area;
}

extern unsigned long do_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
        struct list_head *uf);
extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                         unsigned long start, size_t len, struct list_head *uf,
                         bool unlock);
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                    struct mm_struct *mm, unsigned long start,
                    unsigned long end, struct list_head *uf, bool unlock);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
                     struct list_head *uf);
extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior);

#ifdef CONFIG_MMU
extern int __mm_populate(unsigned long addr, unsigned long len,
                         int ignore_errors);
/*
 * Call __mm_populate() for [@addr, @addr + @len) with ignore_errors set
 * and deliberately discard its return value.
 */
static inline void mm_populate(unsigned long addr, unsigned long len)
{
        /* Ignore errors */
        (void) __mm_populate(addr, len, 1);
}
#else
/* Without CONFIG_MMU mm_populate() does nothing. */
static inline void mm_populate(unsigned long addr, unsigned long len) {}
#endif

/* This takes the mm semaphore itself */
extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long);
extern int vm_munmap(unsigned long, size_t);
extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

/*
 * Flag value defined alongside struct vm_unmapped_area_info.
 * NOTE(review): presumably a bit for the @flags member requesting a
 * top-down search - confirm against vm_unmapped_area().
 */
#define VM_UNMAPPED_AREA_TOPDOWN 1

/*
 * Parameter block passed by pointer to vm_unmapped_area(). All members are
 * unsigned long.
 */
struct vm_unmapped_area_info {
        unsigned long flags;
        unsigned long length;
        unsigned long low_limit;
        unsigned long high_limit;
        unsigned long align_mask;
        unsigned long align_offset;
        unsigned long start_gap;
};

extern unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info);

/* truncate.c */
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
                                       loff_t lstart, loff_t lend);
extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern vm_fault_t filemap_fault(struct vm_fault *vmf);
extern vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                pgoff_t start_pgoff, pgoff_t end_pgoff);
extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf);

extern unsigned long stack_guard_gap;
/* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
int expand_stack_locked(struct vm_area_struct *vma, unsigned long address);
struct vm_area_struct *expand_stack(struct mm_struct * mm, unsigned long addr);

/* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr);
extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr,
                                             struct vm_area_struct **pprev);

/*
 * Look up the first VMA which intersects the interval [start_addr, end_addr)
 * NULL if none.  Assume start_addr < end_addr.
 */
struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
                        unsigned long start_addr, unsigned long end_addr);

/**
 * vma_lookup() - Find a VMA at a specific address
 * @mm: The process address space.
 * @addr: The user address.
 *
 * The lookup is a single mtree_load() on @mm->mm_mt.
 *
 * Return: The vm_area_struct at the given address, %NULL otherwise.
 */
static inline
struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr)
{
        struct vm_area_struct *vma = mtree_load(&mm->mm_mt, addr);

        return vma;
}

/*
 * Size of the guard gap kept below the start of @vma:
 *   - stack_guard_gap for VM_GROWSDOWN mappings,
 *   - PAGE_SIZE for VM_SHADOW_STACK mappings (see the reasoning around the
 *     VM_SHADOW_STACK definition),
 *   - 0 for everything else.
 * VM_GROWSDOWN takes precedence when both flags are set.
 */
static inline unsigned long stack_guard_start_gap(struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_GROWSDOWN)
                return stack_guard_gap;

        return (vma->vm_flags & VM_SHADOW_STACK) ? PAGE_SIZE : 0;
}

/*
 * Start address of @vma with its leading guard gap included, i.e.
 * vm_start minus stack_guard_start_gap(). If the subtraction wraps below
 * zero the result is clamped to 0.
 */
static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
{
        unsigned long start = vma->vm_start;
        unsigned long gapped = start - stack_guard_start_gap(vma);

        /* An unsigned result above the original means we wrapped. */
        return (gapped > start) ? 0 : gapped;
}

/*
 * End address of @vma with its trailing guard gap included. Only
 * VM_GROWSUP mappings get a gap (stack_guard_gap); for them, if the
 * addition wraps past the top of the address space the result is clamped
 * to -PAGE_SIZE. All other mappings simply return vm_end.
 */
static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
{
        unsigned long end = vma->vm_end;
        unsigned long gapped;

        if (!(vma->vm_flags & VM_GROWSUP))
                return end;

        gapped = end + stack_guard_gap;
        /* An unsigned result below the original means we wrapped. */
        if (gapped < end)
                gapped = -PAGE_SIZE;

        return gapped;
}

/*
 * Number of pages spanned by @vma: its byte length (vm_end - vm_start)
 * shifted right by PAGE_SHIFT.
 */
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
        unsigned long span_bytes = vma->vm_end - vma->vm_start;

        return span_bytes >> PAGE_SHIFT;
}

/*
 * Look up the VMA that exactly matches the interval vm_start ... vm_end.
 *
 * The VMA containing @vm_start is fetched with vma_lookup(); it is
 * returned only if both of its bounds equal the requested ones.
 *
 * Return: the matching VMA, or NULL if there is none.
 */
static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm,
                                unsigned long vm_start, unsigned long vm_end)
{
        struct vm_area_struct *candidate = vma_lookup(mm, vm_start);

        if (!candidate)
                return NULL;

        if (candidate->vm_start == vm_start && candidate->vm_end == vm_end)
                return candidate;

        return NULL;
}

/*
 * Return: true if @vma is non-NULL and [@start, @end) lies entirely
 * within [vma->vm_start, vma->vm_end]; false otherwise.
 */
static inline bool range_in_vma(struct vm_area_struct *vma,
                                unsigned long start, unsigned long end)
{
        if (!vma)
                return false;

        return start >= vma->vm_start && end <= vma->vm_end;
}

#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
void vma_set_page_prot(struct vm_area_struct *vma);
#else
/* Without CONFIG_MMU every set of vm_flags maps to __pgprot(0). */
static inline pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        return __pgprot(0);
}
/* Store vm_get_page_prot(vma->vm_flags) into vma->vm_page_prot. */
static inline void vma_set_page_prot(struct vm_area_struct *vma)
{
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
}
#endif

void vma_set_file(struct vm_area_struct *vma, struct file *file);

#ifdef CONFIG_NUMA_BALANCING
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
#endif

struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
                unsigned long addr);
int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
                        unsigned long pfn, unsigned long size, pgprot_t);
int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
                unsigned long pfn, unsigned long size, pgprot_t prot);
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
                        struct page **pages, unsigned long *num);
int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
                                unsigned long num);
vm_fault_t vmf_insert_page_mkwrite(struct vm_fault *vmf, struct page *page,
                        bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
                        pfn_t pfn);
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
                unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);

/*
 * vm_insert_page() with its errno result translated to a vm_fault_t:
 *   success (>= 0) or -EBUSY -> VM_FAULT_NOPAGE
 *   -ENOMEM                  -> VM_FAULT_OOM
 *   any other negative errno -> VM_FAULT_SIGBUS
 */
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
                                unsigned long addr, struct page *page)
{
        const int err = vm_insert_page(vma, addr, page);

        if (err >= 0 || err == -EBUSY)
                return VM_FAULT_NOPAGE;

        return (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
}

#ifndef io_remap_pfn_range
/*
 * Default io_remap_pfn_range(), used unless a macro of the same name is
 * already defined: remap_pfn_range() with @prot first passed through
 * pgprot_decrypted(). Returns remap_pfn_range()'s result.
 */
static inline int io_remap_pfn_range(struct vm_area_struct *vma,
                                     unsigned long addr, unsigned long pfn,
                                     unsigned long size, pgprot_t prot)
{
        pgprot_t io_prot = pgprot_decrypted(prot);

        return remap_pfn_range(vma, addr, pfn, size, io_prot);
}
#endif

/*
 * Translate an errno into a vm_fault_t:
 *   -ENOMEM    -> VM_FAULT_OOM
 *   -EHWPOISON -> VM_FAULT_HWPOISON
 *   anything else (including 0) -> VM_FAULT_SIGBUS
 */
static inline vm_fault_t vmf_error(int err)
{
        switch (err) {
        case -ENOMEM:
                return VM_FAULT_OOM;
        case -EHWPOISON:
                return VM_FAULT_HWPOISON;
        default:
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Convert errno to return value for ->page_mkwrite() calls:
 *   0                  -> VM_FAULT_LOCKED
 *   -EFAULT, -EAGAIN   -> VM_FAULT_NOPAGE
 *   -ENOMEM            -> VM_FAULT_OOM
 *   anything else      -> VM_FAULT_SIGBUS
 *
 * This should eventually be merged with vmf_error() above, but will need a
 * careful audit of all vmf_error() callers.
 */
static inline vm_fault_t vmf_fs_error(int err)
{
        switch (err) {
        case 0:
                return VM_FAULT_LOCKED;
        case -EFAULT:
        case -EAGAIN:
                return VM_FAULT_NOPAGE;
        case -ENOMEM:
                return VM_FAULT_OOM;
        default:
                /* -ENOSPC, -EDQUOT, -EIO ... */
                return VM_FAULT_SIGBUS;
        }
}

/*
 * Translate a vm_fault_t into an errno. Bits are tested in priority order:
 *   VM_FAULT_OOM                        -> -ENOMEM
 *   VM_FAULT_HWPOISON[_LARGE]           -> -EHWPOISON if @foll_flags has
 *                                          FOLL_HWPOISON, else -EFAULT
 *   VM_FAULT_SIGBUS / VM_FAULT_SIGSEGV  -> -EFAULT
 *   none of the above                   -> 0
 */
static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
{
        if (vm_fault & VM_FAULT_OOM)
                return -ENOMEM;

        if (vm_fault & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
                if (foll_flags & FOLL_HWPOISON)
                        return -EHWPOISON;
                return -EFAULT;
        }

        if (vm_fault & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
                return -EFAULT;

        return 0;
}

/*
 * Indicates whether GUP can follow a PROT_NONE mapped page, or whether
 * a (NUMA hinting) fault is required.
 *
 * Return: true if the page can be followed directly, false if a fault is
 * required first.
 */
static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
                                           unsigned int flags)
{
        const bool honor_numa_fault = flags & FOLL_HONOR_NUMA_FAULT;

        /*
         * NUMA hinting faults don't apply in inaccessible (PROT_NONE) VMAs.
         *
         * Requiring a fault here even for inaccessible VMAs would mean that
         * FOLL_FORCE cannot make any progress, because handle_mm_fault()
         * refuses to process NUMA hinting faults in inaccessible VMAs.
         */
        if (honor_numa_fault)
                return !vma_is_accessible(vma);

        /*
         * If callers don't want to honor NUMA hinting faults, no need to
         * determine if we would actually have to trigger a NUMA hinting fault.
         */
        return true;
}

typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data);
extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
                               unsigned long size, pte_fn_t fn, void *data);
extern int apply_to_existing_page_range(struct mm_struct *mm,
                                   unsigned long address, unsigned long size,
                                   pte_fn_t fn, void *data);

#ifdef CONFIG_PAGE_POISONING
extern void __kernel_poison_pages(struct page *page, int numpages);
extern void __kernel_unpoison_pages(struct page *page, int numpages);
extern bool _page_poisoning_enabled_early;
DECLARE_STATIC_KEY_FALSE(_page_poisoning_enabled);
/*
 * Report the plain boolean _page_poisoning_enabled_early (as opposed to
 * the static key consulted by page_poisoning_enabled_static()).
 */
static inline bool page_poisoning_enabled(void)
{
        return _page_poisoning_enabled_early;
}
/*
 * For use in fast paths after init_mem_debugging() has run, or when a
 * false negative result is not harmful when called too early.
 *
 * Tests the _page_poisoning_enabled static key with
 * static_branch_unlikely().
 */
static inline bool page_poisoning_enabled_static(void)
{
        return static_branch_unlikely(&_page_poisoning_enabled);
}
/*
 * Forward to __kernel_poison_pages() when page_poisoning_enabled_static()
 * reports true; otherwise do nothing.
 */
static inline void kernel_poison_pages(struct page *page, int numpages)
{
        if (!page_poisoning_enabled_static())
                return;

        __kernel_poison_pages(page, numpages);
}
/*
 * Forward to __kernel_unpoison_pages() when
 * page_poisoning_enabled_static() reports true; otherwise do nothing.
 */
static inline void kernel_unpoison_pages(struct page *page, int numpages)
{
        if (!page_poisoning_enabled_static())
                return;

        __kernel_unpoison_pages(page, numpages);
}
#else
/*
 * CONFIG_PAGE_POISONING is disabled: the query helpers report false and
 * the poison/unpoison helpers do nothing.
 */
static inline bool page_poisoning_enabled(void) { return false; }
static inline bool page_poisoning_enabled_static(void) { return false; }
static inline void __kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_poison_pages(struct page *page, int numpages) { }
static inline void kernel_unpoison_pages(struct page *page, int numpages) { }
#endif

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
/*
 * Return: true if the init_on_alloc static key is enabled or if @flags
 * contains __GFP_ZERO; false otherwise. The key is tested first, so
 * @flags is only examined when the key is off.
 */
static inline bool want_init_on_alloc(gfp_t flags)
{
        return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                   &init_on_alloc) ||
               (flags & __GFP_ZERO);
}

DECLARE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
/*
 * Return the state of the init_on_free static key, tested with
 * static_branch_maybe() keyed on CONFIG_INIT_ON_FREE_DEFAULT_ON.
 */
static inline bool want_init_on_free(void)
{
        return static_branch_maybe(CONFIG_INIT_ON_FREE_DEFAULT_ON,
                                   &init_on_free);
}

extern bool _debug_pagealloc_enabled_early;
DECLARE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);

/*
 * Return: false when CONFIG_DEBUG_PAGEALLOC is not built in; otherwise the
 * value of the plain boolean _debug_pagealloc_enabled_early.
 */
static inline bool debug_pagealloc_enabled(void)
{
        if (!IS_ENABLED(CONFIG_DEBUG_PAGEALLOC))
                return false;

        return _debug_pagealloc_enabled_early;
}

/*
 * For use in fast paths after mem_debugging_and_hardening_init() has run,
 * or when a false negative result is not harmful when called too early.
 *
 * Return: false when CONFIG_DEBUG_PAGEALLOC is not built in; otherwise the
 * state of the _debug_pagealloc_enabled static key.
 */
static inline bool debug_pagealloc_enabled_static(void)
{
        return IS_ENABLED(CONFIG_DEBUG_PAGEALLOC) &&
                static_branch_unlikely(&_debug_pagealloc_enabled);
}

/*
 * To support DEBUG_PAGEALLOC, an architecture must ensure that
 * __kernel_map_pages() never fails.
 */
extern void __kernel_map_pages(struct page *page, int numpages, int enable);
#ifdef CONFIG_DEBUG_PAGEALLOC
/*
 * When debug_pagealloc_enabled_static() is true, call
 * __kernel_map_pages() with enable = 1 for @numpages pages starting at
 * @page; otherwise do nothing.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages)
{
        if (!debug_pagealloc_enabled_static())
                return;

        __kernel_map_pages(page, numpages, 1);
}

/*
 * When debug_pagealloc_enabled_static() is true, call
 * __kernel_map_pages() with enable = 0 for @numpages pages starting at
 * @page; otherwise do nothing.
 */
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages)
{
        if (!debug_pagealloc_enabled_static())
                return;

        __kernel_map_pages(page, numpages, 0);
}

extern unsigned int _debug_guardpage_minorder;
DECLARE_STATIC_KEY_FALSE(_debug_guardpage_enabled);

/* Accessor for the _debug_guardpage_minorder variable. */
static inline unsigned int debug_guardpage_minorder(void)
{
        return _debug_guardpage_minorder;
}

/*
 * Return the state of the _debug_guardpage_enabled static key, tested
 * with static_branch_unlikely().
 */
static inline bool debug_guardpage_enabled(void)
{
        return static_branch_unlikely(&_debug_guardpage_enabled);
}

/*
 * Return: true if guard pages are enabled and PageGuard(@page) is set;
 * false otherwise. PageGuard() is not evaluated while guard pages are
 * disabled.
 */
static inline bool page_is_guard(struct page *page)
{
        return debug_guardpage_enabled() && PageGuard(page);
}

bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
/*
 * Forward to __set_page_guard() when guard pages are enabled.
 *
 * Return: the result of __set_page_guard(), or false without calling it
 * when debug_guardpage_enabled() is false.
 */
static inline bool set_page_guard(struct zone *zone, struct page *page,
                                  unsigned int order)
{
        if (debug_guardpage_enabled())
                return __set_page_guard(zone, page, order);

        return false;
}

void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
/*
 * Forward to __clear_page_guard() when guard pages are enabled; do nothing
 * when debug_guardpage_enabled() is false.
 */
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                    unsigned int order)
{
        if (debug_guardpage_enabled())
                __clear_page_guard(zone, page, order);
}

#else        /* CONFIG_DEBUG_PAGEALLOC */
/*
 * !CONFIG_DEBUG_PAGEALLOC stand-ins: the map/unmap and clear helpers do
 * nothing, the minimum guard order is 0, and every guard-page query or
 * request reports false.
 */
static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {}
static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {}
static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
                        unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
                                unsigned int order) {}
#endif        /* CONFIG_DEBUG_PAGEALLOC */

#ifdef __HAVE_ARCH_GATE_AREA
extern struct vm_area_struct *get_gate_vma(struct mm_struct *mm);
extern int in_gate_area_no_mm(unsigned long addr);
extern int in_gate_area(struct mm_struct *mm, unsigned long addr);
#else
/*
 * Defaults used when __HAVE_ARCH_GATE_AREA is not defined: there is no
 * gate VMA (NULL) and no address is reported as being in a gate area (0).
 */
static inline struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
{
        return NULL;
}
static inline int in_gate_area_no_mm(unsigned long addr) { return 0; }
static inline int in_gate_area(struct mm_struct *mm, unsigned long addr)
{
        return 0;
}
#endif        /* __HAVE_ARCH_GATE_AREA */

extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm);

void drop_slab(void);

/*
 * Without CONFIG_MMU randomize_va_space is the compile-time constant 0;
 * with it, randomize_va_space is an int variable defined elsewhere.
 */
#ifndef CONFIG_MMU
#define randomize_va_space 0
#else
extern int randomize_va_space;
#endif

const char * arch_vma_name(struct vm_area_struct *vma);
#ifdef CONFIG_MMU
void print_vma_addr(char *prefix, unsigned long rip);
#else
/* Without CONFIG_MMU print_vma_addr() does nothing. */
static inline void print_vma_addr(char *prefix, unsigned long rip)
{
}
#endif

void *sparse_buffer_alloc(unsigned long size);
unsigned long section_map_size(void);
struct page * __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
                struct dev_pagemap *pgmap);
pgd_t *vmemmap_pgd_populate(unsigned long addr, int node);
p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node);
pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node);
pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node);
pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                            struct vmem_altmap *altmap, unsigned long ptpfn,
                            unsigned long flags);
void *vmemmap_alloc_block(unsigned long size, int node);
struct vmem_altmap;
void *vmemmap_alloc_block_buf(unsigned long size, int node,
                              struct vmem_altmap *altmap);
void vmemmap_verify(pte_t *, int, unsigned long, unsigned long);
void vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
                     unsigned long addr, unsigned long next);
int vmemmap_check_pmd(pmd_t *pmd, int node,
                      unsigned long addr, unsigned long next);
int vmemmap_populate_basepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate_hugepages(unsigned long start, unsigned long end,
                               int node, struct vmem_altmap *altmap);
int vmemmap_populate(unsigned long start, unsigned long end, int node,
                struct vmem_altmap *altmap);
int vmemmap_populate_hvo(unsigned long start, unsigned long end, int node,
                         unsigned long headsize);
int vmemmap_undo_hvo(unsigned long start, unsigned long end, int node,
                     unsigned long headsize);
void vmemmap_wrprotect_hvo(unsigned long start, unsigned long end, int node,
                          unsigned long headsize);
void vmemmap_populate_print_last(void);
#ifdef CONFIG_MEMORY_HOTPLUG
void vmemmap_free(unsigned long start, unsigned long end,
                struct vmem_altmap *altmap);
#endif

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Number of pfns from the base at which pfn_to_page() becomes valid:
 * @altmap->reserve + @altmap->free, or 0 when @altmap is NULL.
 */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        return altmap ? altmap->reserve + altmap->free : 0;
}

/*
 * Give @nr_pfns back to @altmap by subtracting them from its alloc
 * counter. @altmap is dereferenced unconditionally, so it must not be NULL.
 */
static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
        altmap->alloc -= nr_pfns;
}
#else
/*
 * !CONFIG_SPARSEMEM_VMEMMAP stand-ins: the altmap offset is always 0 and
 * freeing into an altmap does nothing.
 */
static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
{
        return 0;
}

static inline void vmem_altmap_free(struct vmem_altmap *altmap,
                                    unsigned long nr_pfns)
{
}
#endif

/*
 * Threshold used by __vmemmap_can_optimize(): the optimization is only
 * considered when more than this many vmemmap pages are needed.
 */
#define VMEMMAP_RESERVE_NR        2
#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
/*
 * Generic rule for whether the vmemmap of a device pagemap can be
 * optimized.
 *
 * Return: true only if all of the following hold:
 *   - @pgmap is non-NULL,
 *   - sizeof(struct page) is a power of two,
 *   - @altmap is NULL,
 *   - the memmap for pgmap_vmemmap_nr(@pgmap) pages occupies more than
 *     VMEMMAP_RESERVE_NR pages.
 */
static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
                                          struct dev_pagemap *pgmap)
{
        unsigned long memmap_bytes;

        if (!pgmap)
                return false;
        if (!is_power_of_2(sizeof(struct page)))
                return false;

        memmap_bytes = pgmap_vmemmap_nr(pgmap) * sizeof(struct page);
        if (altmap)
                return false;

        /*
         * For vmemmap optimization with DAX we need minimum 2 vmemmap
         * pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
         */
        return (memmap_bytes >> PAGE_SHIFT) > VMEMMAP_RESERVE_NR;
}
/*
 * If we don't have an architecture override, use the generic rule:
 * unless vmemmap_can_optimize is already defined as a macro, it becomes an
 * alias for __vmemmap_can_optimize().
 */
#ifndef vmemmap_can_optimize
#define vmemmap_can_optimize __vmemmap_can_optimize
#endif

#else
/* !CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP stub: optimization is never possible. */
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
                                           struct dev_pagemap *pgmap)
{
        return false;
}
#endif

/*
 * Bit flags: every enumerator occupies its own bit, so several values can be
 * combined with bitwise OR into a single int.
 */
enum mf_flags {
        MF_COUNT_INCREASED = 1 << 0,
        MF_ACTION_REQUIRED = 1 << 1,
        MF_MUST_KILL = 1 << 2,
        MF_SOFT_OFFLINE = 1 << 3,
        MF_UNPOISON = 1 << 4,
        MF_SW_SIMULATED = 1 << 5,
        MF_NO_RETRY = 1 << 6,
        MF_MEM_PRE_REMOVE = 1 << 7,
};
int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
                      unsigned long count, int mf_flags);
extern int memory_failure(unsigned long pfn, int flags);
extern void memory_failure_queue_kick(int cpu);
extern int unpoison_memory(unsigned long pfn);
extern atomic_long_t num_poisoned_pages __read_mostly;
extern int soft_offline_page(unsigned long pfn, int flags);
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Sysfs entries for memory failure handling statistics.
 */
extern const struct attribute_group memory_failure_attr_group;
extern void memory_failure_queue(unsigned long pfn, int flags);
extern int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared);
void num_poisoned_pages_inc(unsigned long pfn);
void num_poisoned_pages_sub(unsigned long pfn, long i);
#else
/* !CONFIG_MEMORY_FAILURE stub: the request is silently dropped. */
static inline void memory_failure_queue(unsigned long pfn, int flags)
{
}

/*
 * !CONFIG_MEMORY_FAILURE stub: always returns 0 and leaves
 * *@migratable_cleared untouched.
 */
static inline int __get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

/* !CONFIG_MEMORY_FAILURE stub: no accounting is performed. */
static inline void num_poisoned_pages_inc(unsigned long pfn)
{
}

/* !CONFIG_MEMORY_FAILURE stub: no accounting is performed. */
static inline void num_poisoned_pages_sub(unsigned long pfn, long i)
{
}
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
extern void memblk_nr_poison_inc(unsigned long pfn);
extern void memblk_nr_poison_sub(unsigned long pfn, long i);
#else
/* No-op unless both CONFIG_MEMORY_FAILURE and CONFIG_MEMORY_HOTPLUG are set. */
static inline void memblk_nr_poison_inc(unsigned long pfn)
{
}

/* No-op unless both CONFIG_MEMORY_FAILURE and CONFIG_MEMORY_HOTPLUG are set. */
static inline void memblk_nr_poison_sub(unsigned long pfn, long i)
{
}
#endif

#ifndef arch_memory_failure
/*
 * Fallback used when the architecture does not provide its own
 * arch_memory_failure: always fails with -ENXIO.
 */
static inline int arch_memory_failure(unsigned long pfn, int flags)
{
        return -ENXIO;
}
#endif

#ifndef arch_is_platform_page
/*
 * Fallback used when the architecture does not provide its own
 * arch_is_platform_page: no address is treated as a platform page.
 */
static inline bool arch_is_platform_page(u64 paddr)
{
        return false;
}
#endif

/*
 * Error handlers for various types of pages.
 *
 * enum mf_result lists the possible outcomes of such handling; see the
 * comment on each value.
 */
enum mf_result {
        MF_IGNORED,        /* Error: cannot be handled */
        MF_FAILED,        /* Error: handling failed */
        MF_DELAYED,        /* Will be handled later */
        MF_RECOVERED,        /* Successfully recovered */
};

/*
 * MF_MSG_* identifiers. Values are assigned implicitly in declaration order
 * starting at 0, so inserting or reordering entries changes their numeric
 * values.
 * NOTE(review): presumably used to index a message table elsewhere - confirm
 * before reordering.
 */
enum mf_action_page_type {
        MF_MSG_KERNEL,
        MF_MSG_KERNEL_HIGH_ORDER,
        MF_MSG_DIFFERENT_COMPOUND,
        MF_MSG_HUGE,
        MF_MSG_FREE_HUGE,
        MF_MSG_GET_HWPOISON,
        MF_MSG_UNMAP_FAILED,
        MF_MSG_DIRTY_SWAPCACHE,
        MF_MSG_CLEAN_SWAPCACHE,
        MF_MSG_DIRTY_MLOCKED_LRU,
        MF_MSG_CLEAN_MLOCKED_LRU,
        MF_MSG_DIRTY_UNEVICTABLE_LRU,
        MF_MSG_CLEAN_UNEVICTABLE_LRU,
        MF_MSG_DIRTY_LRU,
        MF_MSG_CLEAN_LRU,
        MF_MSG_TRUNCATED_LRU,
        MF_MSG_BUDDY,
        MF_MSG_DAX,
        MF_MSG_UNSPLIT_THP,
        MF_MSG_ALREADY_POISONED,
        MF_MSG_UNKNOWN,
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
void folio_zero_user(struct folio *folio, unsigned long addr_hint);
int copy_user_large_folio(struct folio *dst, struct folio *src,
                          unsigned long addr_hint,
                          struct vm_area_struct *vma);
long copy_folio_from_user(struct folio *dst_folio,
                           const void __user *usr_src,
                           bool allow_pagefault);

/**
 * vma_is_special_huge - Are transhuge page-table entries considered special?
 * @vma: Pointer to the struct vm_area_struct to consider
 *
 * Transhuge page-table entries count as "special", in the sense used by
 * vm_normal_page(), when the VMA is a DAX mapping, or when it is file-backed
 * and has VM_PFNMAP or VM_MIXEDMAP set.
 *
 * Return: true if transhuge page-table entries should be considered special,
 * false otherwise.
 */
static inline bool vma_is_special_huge(const struct vm_area_struct *vma)
{
        if (vma_is_dax(vma))
                return true;

        if (!vma->vm_file)
                return false;

        return (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) != 0;
}

#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */

#if MAX_NUMNODES > 1
void __init setup_nr_node_ids(void);
#else
/* MAX_NUMNODES <= 1: nothing to set up. */
static inline void setup_nr_node_ids(void) {}
#endif

extern int memcmp_pages(struct page *page1, struct page *page2);

/* Returns 1 when memcmp_pages() reports no difference between the pages, else 0. */
static inline int pages_identical(struct page *page1, struct page *page2)
{
        return memcmp_pages(page1, page2) == 0;
}

#ifdef CONFIG_MAPPING_DIRTY_HELPERS
unsigned long clean_record_shared_mapping_range(struct address_space *mapping,
                                                pgoff_t first_index, pgoff_t nr,
                                                pgoff_t bitmap_pgoff,
                                                unsigned long *bitmap,
                                                pgoff_t *start,
                                                pgoff_t *end);

unsigned long wp_shared_mapping_range(struct address_space *mapping,
                                      pgoff_t first_index, pgoff_t nr);
#endif

#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                          unsigned long len_in,
                          struct anon_vma_name *anon_name);
#else
/* !CONFIG_ANON_VMA_NAME stub: does nothing and reports success (0). */
static inline int
madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                      unsigned long len_in, struct anon_vma_name *anon_name) {
        return 0;
}
#endif

#ifdef CONFIG_UNACCEPTED_MEMORY

bool range_contains_unaccepted_memory(phys_addr_t start, unsigned long size);
void accept_memory(phys_addr_t start, unsigned long size);

#else

/* !CONFIG_UNACCEPTED_MEMORY stub: no range ever contains unaccepted memory. */
static inline bool range_contains_unaccepted_memory(phys_addr_t start,
                                                    unsigned long size)
{
        return false;
}

/* !CONFIG_UNACCEPTED_MEMORY stub: there is nothing to accept. */
static inline void accept_memory(phys_addr_t start, unsigned long size)
{
}

#endif

static inline bool pfn_is_unaccepted_memory(unsigned long pfn)
{
        return range_contains_unaccepted_memory(pfn << PAGE_SHIFT, PAGE_SIZE);
}

void vma_pgtable_walk_begin(struct vm_area_struct *vma);
void vma_pgtable_walk_end(struct vm_area_struct *vma);

int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size);
int reserve_mem_release_by_name(const char *name);

#ifdef CONFIG_64BIT
int do_mseal(unsigned long start, size_t len_in, unsigned long flags);
#else
/* !CONFIG_64BIT stub: sealing is not performed; success (0) is reported. */
static inline int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
        /* noop on 32 bit */
        return 0;
}
#endif

/*
 * user_alloc_needs_zeroing checks if a user folio from page allocator needs to
 * be zeroed or not.
 */
static inline bool user_alloc_needs_zeroing(void)
{
        /*
         * for user folios, arch with cache aliasing requires cache flush and
         * arc changes folio->flags to make icache coherent with dcache, so
         * always return true on such architectures to make the caller use
         * clear_user_page()/clear_user_highpage().
         *
         * Otherwise zeroing is needed only when the init_on_alloc static
         * branch is disabled.
         */
        return cpu_dcache_is_aliasing() || cpu_icache_is_aliasing() ||
               !static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
                                   &init_on_alloc);
}

int arch_get_shadow_stack_status(struct task_struct *t, unsigned long __user *status);
int arch_set_shadow_stack_status(struct task_struct *t, unsigned long status);
int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status);


/*
 * mseal of userspace process's system mappings.
 */
#ifdef CONFIG_MSEAL_SYSTEM_MAPPINGS
#define VM_SEALED_SYSMAP        VM_SEALED
#else
#define VM_SEALED_SYSMAP        VM_NONE
#endif

#endif /* _LINUX_MM_H */










































































































































































































































































































































































    3 

    3 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2010-2011 EIA Electronics,
//                         Pieter Beyens <pieter.beyens@eia.be>
// Copyright (c) 2010-2011 EIA Electronics,
//                         Kurt Van Dijck <kurt.van.dijck@eia.be>
// Copyright (c) 2018 Protonic,
//                         Robin van der Gracht <robin@protonic.nl>
// Copyright (c) 2017-2019 Pengutronix,
//                         Marc Kleine-Budde <kernel@pengutronix.de>
// Copyright (c) 2017-2019 Pengutronix,
//                         Oleksij Rempel <kernel@pengutronix.de>

/* Core of can-j1939 that links j1939 to CAN. */

#include <linux/can/can-ml.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/if_arp.h>
#include <linux/module.h>

#include "j1939-priv.h"

MODULE_DESCRIPTION("PF_CAN SAE J1939");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)");
MODULE_ALIAS("can-proto-" __stringify(CAN_J1939));

/* LOWLEVEL CAN interface */

/* CAN_HDR: #bytes before can_frame data part */
#define J1939_CAN_HDR (offsetof(struct can_frame, data))

/* lowest layer */
/* CAN receive callback registered by j1939_can_rx_register().
 *
 * @iskb: incoming skb; it is only read and cloned here, never modified.
 * @data: the struct j1939_priv that was passed to can_rx_register().
 *
 * Works on a clone of @iskb: strips the CAN header, fills the j1939 control
 * block from the CAN identifier, and hands the clone to the address-claim,
 * transport, simple and socket receive paths. The clone is always released
 * with kfree_skb() before returning. A reference on priv is held for the
 * duration of the processing.
 */
static void j1939_can_recv(struct sk_buff *iskb, void *data)
{
        struct j1939_priv *priv = data;
        struct sk_buff *skb;
        struct j1939_sk_buff_cb *skcb, *iskcb;
        struct can_frame *cf;

        /* make sure we only get Classical CAN frames */
        if (!can_is_can_skb(iskb))
                return;

        /* create a copy of the skb
         * j1939 only delivers the real data bytes,
         * the header goes into sockaddr.
         * j1939 may not touch the incoming skb in such way
         */
        skb = skb_clone(iskb, GFP_ATOMIC);
        if (!skb)
                return;

        /* paired with the j1939_priv_put() at the done label */
        j1939_priv_get(priv);
        can_skb_set_owner(skb, iskb->sk);

        /* get a pointer to the header of the skb
         * the skb payload (pointer) is moved, so that the next skb_data
         * returns the actual payload
         */
        cf = (void *)skb->data;
        skb_pull(skb, J1939_CAN_HDR);

        /* fix length, set to dlc, with 8 maximum */
        skb_trim(skb, min_t(uint8_t, cf->len, 8));

        /* set addr */
        skcb = j1939_skb_to_cb(skb);
        memset(skcb, 0, sizeof(*skcb));

        iskcb = j1939_skb_to_cb(iskb);
        skcb->tskey = iskcb->tskey;
        /* priority is the 3-bit field starting at bit 26 of the CAN id */
        skcb->priority = (cf->can_id >> 26) & 0x7;
        /* NOTE(review): relies on addr.sa being 8 bits wide so that only the
         * low byte of can_id is kept - confirm against j1939-priv.h
         */
        skcb->addr.sa = cf->can_id;
        skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX;
        /* set default message type */
        skcb->addr.type = J1939_TP;

        if (!j1939_address_is_valid(skcb->addr.sa)) {
                netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n",
                                __func__);
                goto done;
        }

        if (j1939_pgn_is_pdu1(skcb->addr.pgn)) {
                /* Type 1: with destination address */
                skcb->addr.da = skcb->addr.pgn;
                /* normalize pgn: strip dst address */
                skcb->addr.pgn &= 0x3ff00;
        } else {
                /* set broadcast address */
                skcb->addr.da = J1939_NO_ADDR;
        }

        /* update localflags */
        read_lock_bh(&priv->lock);
        if (j1939_address_is_unicast(skcb->addr.sa) &&
            priv->ents[skcb->addr.sa].nusers)
                skcb->flags |= J1939_ECU_LOCAL_SRC;
        if (j1939_address_is_unicast(skcb->addr.da) &&
            priv->ents[skcb->addr.da].nusers)
                skcb->flags |= J1939_ECU_LOCAL_DST;
        read_unlock_bh(&priv->lock);

        /* deliver into the j1939 stack ... */
        j1939_ac_recv(priv, skb);

        if (j1939_tp_recv(priv, skb))
                /* this means the transport layer processed the message */
                goto done;

        j1939_simple_recv(priv, skb);
        j1939_sk_recv(priv, skb);
 done:
        /* common exit: drop the priv reference and release our clone */
        j1939_priv_put(priv);
        kfree_skb(skb);
}

/* NETDEV MANAGEMENT */

/* values for can_rx_(un)register */
#define J1939_CAN_ID CAN_EFF_FLAG
#define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG)

static DEFINE_MUTEX(j1939_netdev_lock);

/* Allocate and minimally initialise the j1939 state for @ndev.
 *
 * Returns the new priv with its lock, ecus list, kref and rx_kref
 * initialised and a reference held on @ndev, or NULL if the allocation
 * failed.
 */
static struct j1939_priv *j1939_priv_create(struct net_device *ndev)
{
        struct j1939_priv *new_priv = kzalloc(sizeof(*new_priv), GFP_KERNEL);

        if (!new_priv)
                return NULL;

        new_priv->ndev = ndev;
        rwlock_init(&new_priv->lock);
        INIT_LIST_HEAD(&new_priv->ecus);
        kref_init(&new_priv->kref);
        kref_init(&new_priv->rx_kref);
        dev_hold(ndev);

        netdev_dbg(new_priv->ndev, "%s : 0x%p\n", __func__, new_priv);

        return new_priv;
}

static inline void j1939_priv_set(struct net_device *ndev,
                                  struct j1939_priv *priv)
{
        struct can_ml_priv *can_ml = can_get_ml_priv(ndev);

        can_ml->j1939_priv = priv;
}

/* kref release callback for priv->kref.
 *
 * Warns once if any of the session, ecu or socket lists is still populated,
 * then drops the netdev reference and frees priv.
 */
static void __j1939_priv_release(struct kref *kref)
{
        struct j1939_priv *priv;
        struct net_device *ndev;

        priv = container_of(kref, struct j1939_priv, kref);
        ndev = priv->ndev;

        netdev_dbg(ndev, "%s: 0x%p\n", __func__, priv);

        WARN_ON_ONCE(!list_empty(&priv->active_session_list));
        WARN_ON_ONCE(!list_empty(&priv->ecus));
        WARN_ON_ONCE(!list_empty(&priv->j1939_socks));

        dev_put(ndev);
        kfree(priv);
}

/* Drop one reference on priv->kref; the last put runs __j1939_priv_release(). */
void j1939_priv_put(struct j1939_priv *priv)
{
        kref_put(&priv->kref, __j1939_priv_release);
}

/* Take one reference on priv->kref; balance with j1939_priv_put(). */
void j1939_priv_get(struct j1939_priv *priv)
{
        kref_get(&priv->kref);
}

/* Register j1939_can_recv() as the CAN receiver for priv->ndev.
 *
 * A priv reference is taken for the registration and is given back again if
 * can_rx_register() fails. Returns 0 on success or the negative error from
 * can_rx_register().
 */
static int j1939_can_rx_register(struct j1939_priv *priv)
{
        struct net_device *ndev = priv->ndev;
        int err;

        j1939_priv_get(priv);
        err = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
                              j1939_can_recv, priv, "j1939", NULL);
        if (err >= 0)
                return 0;

        j1939_priv_put(priv);
        return err;
}

/* Undo j1939_can_rx_register(): remove the CAN receiver and drop the priv
 * reference that was taken for the registration.
 */
static void j1939_can_rx_unregister(struct j1939_priv *priv)
{
        struct net_device *ndev = priv->ndev;

        can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
                          j1939_can_recv, priv);

        /* The last reference of priv is dropped by the RCU deferred
         * j1939_sk_sock_destruct() of the last socket, so we can
         * safely drop this reference here.
         */
        j1939_priv_put(priv);
}

/* kref release callback for priv->rx_kref, invoked through kref_put_mutex()
 * in j1939_netdev_stop().
 *
 * Unregisters the CAN receiver, unmaps all ECUs and clears the netdev's
 * pointer to priv. It also unlocks j1939_netdev_lock itself, as the
 * __releases() annotation states.
 */
static void __j1939_rx_release(struct kref *kref)
        __releases(&j1939_netdev_lock)
{
        struct j1939_priv *priv = container_of(kref, struct j1939_priv,
                                               rx_kref);

        j1939_can_rx_unregister(priv);
        j1939_ecu_unmap_all(priv);
        j1939_priv_set(priv->ndev, NULL);
        mutex_unlock(&j1939_netdev_lock);
}

/* get pointer to priv without increasing ref counter */
static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
{
        struct can_ml_priv *can_ml = can_get_ml_priv(ndev);

        return can_ml->j1939_priv;
}

/* Look up the priv of @ndev and take a reference on it.
 *
 * Caller must hold j1939_netdev_lock. Returns NULL when no priv is set.
 */
static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
{
        struct j1939_priv *found;

        lockdep_assert_held(&j1939_netdev_lock);

        found = j1939_ndev_to_priv(ndev);
        if (!found)
                return NULL;

        j1939_priv_get(found);
        return found;
}

/* Locking wrapper around j1939_priv_get_by_ndev_locked(): takes
 * j1939_netdev_lock for the lookup. Returns a referenced priv or NULL.
 */
static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev)
{
        struct j1939_priv *priv;

        mutex_lock(&j1939_netdev_lock);
        priv = j1939_priv_get_by_ndev_locked(ndev);
        mutex_unlock(&j1939_netdev_lock);

        return priv;
}

/* Get the j1939 state of @ndev, creating and registering it on first use.
 *
 * If a priv already exists, a reference is taken on both its kref and its
 * rx_kref and it is returned. Otherwise a new priv is created, published on
 * the netdev and registered as a CAN receiver.
 *
 * Returns the priv on success, ERR_PTR(-ENOMEM) if allocation fails, or
 * ERR_PTR() of the j1939_can_rx_register() error. A successful call is
 * balanced by j1939_netdev_stop().
 */
struct j1939_priv *j1939_netdev_start(struct net_device *ndev)
{
        struct j1939_priv *priv, *priv_new;
        int ret;

        mutex_lock(&j1939_netdev_lock);
        priv = j1939_priv_get_by_ndev_locked(ndev);
        if (priv) {
                kref_get(&priv->rx_kref);
                mutex_unlock(&j1939_netdev_lock);
                return priv;
        }
        mutex_unlock(&j1939_netdev_lock);

        /* The allocation runs with the mutex dropped, which is why the
         * lookup has to be repeated once the mutex is taken again below.
         */
        priv = j1939_priv_create(ndev);
        if (!priv)
                return ERR_PTR(-ENOMEM);

        j1939_tp_init(priv);
        rwlock_init(&priv->j1939_socks_lock);
        INIT_LIST_HEAD(&priv->j1939_socks);

        mutex_lock(&j1939_netdev_lock);
        priv_new = j1939_priv_get_by_ndev_locked(ndev);
        if (priv_new) {
                /* Someone was faster than us, use their priv and roll
                 * back our's.
                 */
                kref_get(&priv_new->rx_kref);
                mutex_unlock(&j1939_netdev_lock);
                /* undo the dev_hold() done by j1939_priv_create() */
                dev_put(ndev);
                kfree(priv);
                return priv_new;
        }
        j1939_priv_set(ndev, priv);

        ret = j1939_can_rx_register(priv);
        if (ret < 0)
                goto out_priv_put;

        mutex_unlock(&j1939_netdev_lock);
        return priv;

 out_priv_put:
        /* unpublish before freeing so that no later lookup can find priv */
        j1939_priv_set(ndev, NULL);
        mutex_unlock(&j1939_netdev_lock);

        dev_put(ndev);
        kfree(priv);

        return ERR_PTR(ret);
}

/* Counterpart of j1939_netdev_start(): drops one rx_kref reference (the last
 * one runs __j1939_rx_release() with j1939_netdev_lock taken by
 * kref_put_mutex()) and then one kref reference on @priv.
 */
void j1939_netdev_stop(struct j1939_priv *priv)
{
        kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock);
        j1939_priv_put(priv);
}

/* Wrap the payload in @skb into a CAN frame and send it.
 *
 * On entry @skb holds only the payload bytes and its control block holds the
 * j1939 addressing. The pgn and priority are sanitised, j1939_ac_fixup() is
 * applied, the CAN header is pushed back in front of the payload, and the
 * frame is passed to can_send().
 *
 * Returns the result of can_send(). If j1939_ac_fixup() fails, @skb is freed
 * and that error is returned instead.
 */
int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb)
{
        int ret, dlc;
        canid_t canid;
        struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
        struct can_frame *cf;

        /* apply sanity checks */
        if (j1939_pgn_is_pdu1(skcb->addr.pgn))
                skcb->addr.pgn &= J1939_PGN_PDU1_MAX;
        else
                skcb->addr.pgn &= J1939_PGN_MAX;

        /* out-of-range priorities are replaced with 6 */
        if (skcb->priority > 7)
                skcb->priority = 6;

        ret = j1939_ac_fixup(priv, skb);
        if (unlikely(ret))
                goto failed;
        /* payload length, sampled before the header is pushed back */
        dlc = skb->len;

        /* re-claim the CAN_HDR from the SKB */
        cf = skb_push(skb, J1939_CAN_HDR);

        /* initialize header structure */
        memset(cf, 0, J1939_CAN_HDR);

        /* make it a full can frame again */
        /* NOTE(review): assumes dlc <= 8; nothing in this function enforces
         * it - confirm that all callers guarantee this
         */
        skb_put_zero(skb, 8 - dlc);

        canid = CAN_EFF_FLAG |
                (skcb->priority << 26) |
                (skcb->addr.pgn << 8) |
                skcb->addr.sa;
        if (j1939_pgn_is_pdu1(skcb->addr.pgn))
                canid |= skcb->addr.da << 8;

        cf->can_id = canid;
        cf->len = dlc;

        return can_send(skb, 1);

 failed:
        kfree_skb(skb);
        return ret;
}

/* Netdevice notifier callback.
 *
 * Devices without a can_ml_priv, or without a j1939 priv, are ignored. On
 * NETDEV_DOWN the active sessions are cancelled, the sockets are told about
 * the event and all ECUs are unmapped. Always returns NOTIFY_DONE.
 */
static int j1939_netdev_notify(struct notifier_block *nb,
                               unsigned long msg, void *data)
{
        struct net_device *ndev = netdev_notifier_info_to_dev(data);
        struct j1939_priv *priv;

        if (!can_get_ml_priv(ndev))
                return NOTIFY_DONE;

        priv = j1939_priv_get_by_ndev(ndev);
        if (!priv)
                return NOTIFY_DONE;

        if (msg == NETDEV_DOWN) {
                j1939_cancel_active_session(priv, NULL);
                j1939_sk_netdev_event_netdown(priv);
                j1939_ecu_unmap_all(priv);
        }

        /* balance the reference taken by j1939_priv_get_by_ndev() */
        j1939_priv_put(priv);

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to j1939_netdev_notify(). */
static struct notifier_block j1939_netdev_notifier = {
        .notifier_call = j1939_netdev_notify,
};

/* MODULE interface */
/* Module entry point: register the netdevice notifier first, then the j1939
 * CAN protocol. If the protocol registration fails, the notifier is
 * unregistered again. Returns 0 on success or the error of the failing step.
 */
static __init int j1939_module_init(void)
{
        int err;

        pr_info("can: SAE J1939\n");

        err = register_netdevice_notifier(&j1939_netdev_notifier);
        if (err)
                return err;

        err = can_proto_register(&j1939_can_proto);
        if (err < 0) {
                pr_err("can: registration of j1939 protocol failed\n");
                unregister_netdevice_notifier(&j1939_netdev_notifier);
                return err;
        }

        return 0;
}

/* Module exit: unregister the protocol first, then the netdevice notifier. */
static __exit void j1939_module_exit(void)
{
        can_proto_unregister(&j1939_can_proto);

        unregister_netdevice_notifier(&j1939_netdev_notifier);
}

module_init(j1939_module_init);
module_exit(j1939_module_exit);


















































































  640 





























  162 
















   58 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This header provides generic wrappers for memory access instrumentation that
 * the compiler cannot emit for: KASAN, KCSAN, KMSAN.
 */
#ifndef _LINUX_INSTRUMENTED_H
#define _LINUX_INSTRUMENTED_H

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>
#include <linux/kmsan-checks.h>
#include <linux/types.h>

/**
 * instrument_read - instrument regular read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read access. The instrumentation should be inserted
 * before the actual read happens.
 *
 * Issues the KASAN read check followed by the KCSAN read check.
 */
static __always_inline void instrument_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_read(v, size);
}

/**
 * instrument_write - instrument regular write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular write access. The instrumentation should be inserted
 * before the actual write happens.
 *
 * Issues the KASAN write check followed by the KCSAN write check.
 */
static __always_inline void instrument_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_write(v, size);
}

/**
 * instrument_read_write - instrument regular read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument a regular read-write access. The instrumentation should be
 * inserted before the actual write happens.
 *
 * KASAN is given a write check; KCSAN is given a combined read-write check.
 */
static __always_inline void instrument_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_read_write(v, size);
}

/**
 * instrument_atomic_read - instrument atomic read access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read access. The instrumentation should be inserted
 * before the actual read happens.
 *
 * Issues the KASAN read check followed by the KCSAN atomic read check.
 */
static __always_inline void instrument_atomic_read(const volatile void *v, size_t size)
{
        kasan_check_read(v, size);
        kcsan_check_atomic_read(v, size);
}

/**
 * instrument_atomic_write - instrument atomic write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic write access. The instrumentation should be inserted
 * before the actual write happens.
 *
 * Issues the KASAN write check followed by the KCSAN atomic write check.
 */
static __always_inline void instrument_atomic_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_write(v, size);
}

/**
 * instrument_atomic_read_write - instrument atomic read-write access
 * @v: address of access
 * @size: size of access
 *
 * Instrument an atomic read-write access. The instrumentation should be
 * inserted before the actual write happens.
 *
 * KASAN is given a write check; KCSAN is given a combined atomic read-write
 * check.
 */
static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size)
{
        kasan_check_write(v, size);
        kcsan_check_atomic_read_write(v, size);
}

/**
 * instrument_copy_to_user - instrument reads of copy_to_user
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument reads from kernel memory, that are due to copy_to_user (and
 * variants). The instrumentation must be inserted before the accesses.
 *
 * The kernel source buffer is checked as a read by KASAN and KCSAN, then the
 * copy is reported to KMSAN.
 */
static __always_inline void
instrument_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        kasan_check_read(from, n);
        kcsan_check_read(from, n);
        /* NOTE(review): the final 0 is presumably the "bytes left" count - confirm */
        kmsan_copy_to_user(to, from, n, 0);
}

/**
 * instrument_copy_from_user_before - add instrumentation before copy_from_user
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted before the accesses.
 *
 * Only the kernel destination @to is checked (KASAN and KCSAN write checks);
 * @from is not used by the current implementation.
 */
static __always_inline void
instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n)
{
        kasan_check_write(to, n);
        kcsan_check_write(to, n);
}

/**
 * instrument_copy_from_user_after - add instrumentation after copy_from_user
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 * @left: number of bytes not copied (as returned by copy_from_user)
 *
 * Instrument writes to kernel memory, that are due to copy_from_user (and
 * variants). The instrumentation should be inserted after the accesses.
 *
 * Only the first @n - @left bytes of @to are unpoisoned for KMSAN; @from is
 * not used by the current implementation.
 */
static __always_inline void
instrument_copy_from_user_after(const void *to, const void __user *from,
                                unsigned long n, unsigned long left)
{
        kmsan_unpoison_memory(to, n - left);
}

/**
 * instrument_memcpy_before - add instrumentation before non-instrumented memcpy
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 *
 * Instrument memory accesses that happen in custom memcpy implementations. The
 * instrumentation should be inserted before the memcpy call.
 *
 * @to is checked as a write and @from as a read, first by KASAN and then by
 * KCSAN.
 */
static __always_inline void instrument_memcpy_before(void *to, const void *from,
                                                     unsigned long n)
{
        kasan_check_write(to, n);
        kasan_check_read(from, n);
        kcsan_check_write(to, n);
        kcsan_check_read(from, n);
}

/**
 * instrument_memcpy_after - add instrumentation after non-instrumented memcpy
 * @to: destination address
 * @from: source address
 * @n: number of bytes to copy
 * @left: number of bytes not copied (if known)
 *
 * Instrument memory accesses that happen in custom memcpy implementations. The
 * instrumentation should be inserted after the memcpy call.
 *
 * Reports the copy to KMSAN via kmsan_memmove() for the @n - @left bytes
 * that were copied.
 */
static __always_inline void instrument_memcpy_after(void *to, const void *from,
                                                    unsigned long n,
                                                    unsigned long left)
{
        kmsan_memmove(to, from, n - left);
}

/**
 * instrument_get_user() - add instrumentation to get_user()-like macros
 * @to: destination variable, may not be address-taken
 *
 * get_user() and friends are fragile, so it may depend on the implementation
 * whether the instrumentation happens before or after the data is copied from
 * the userspace.
 *
 * The value of @to is copied into a u64 temporary, the temporary is
 * unpoisoned for KMSAN, and the result is assigned back to @to. @to is
 * therefore evaluated twice and must be an assignable lvalue.
 */
#define instrument_get_user(to)                                \
({                                                        \
        u64 __tmp = (u64)(to);                                \
        kmsan_unpoison_memory(&__tmp, sizeof(__tmp));        \
        to = __tmp;                                        \
})


/**
 * instrument_put_user() - add instrumentation to put_user()-like macros
 * @from: source variable (its address is taken, so it must be an lvalue)
 * @ptr: userspace pointer to copy to
 * @size: number of bytes to copy
 *
 * put_user() and friends are fragile, so it may depend on the implementation
 * whether the instrumentation happens before or after the data is copied to
 * the userspace.
 *
 * Note that the current expansion passes sizeof(@from) to KMSAN; the @size
 * argument is not used.
 */
#define instrument_put_user(from, ptr, size)                        \
({                                                                \
        kmsan_copy_to_user(ptr, &from, sizeof(from), 0);        \
})

#endif /* _LINUX_INSTRUMENTED_H */

































 1261 
















 1260 



    8 


 1256 

   53 



















 1256 
 1261 





















   32 


  508 
    3 




























   40 



















   41 

   40 




   41 














   41 




































   42 























 1255 


















  543 
  999 






  536 

    8 



  535 




  509 
   33 









 1256 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/realpath.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/magic.h>
#include <linux/proc_fs.h>

/**
 * tomoyo_encode2 - Encode binary string to ascii string.
 *
 * @str:     String in binary format.
 * @str_len: Size of @str in byte.
 *
 * A backslash becomes two backslashes, bytes greater than ' ' and less than
 * 127 are copied unchanged, and every other byte becomes a backslash followed
 * by three octal digits.
 *
 * Returns pointer to @str in ascii format on success, NULL otherwise.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_encode2(const char *str, int str_len)
{
        const unsigned char *src = (const unsigned char *)str;
        char *out;
        char *dst;
        int need = 0;
        int idx;

        if (!str)
                return NULL;

        /* First pass: work out how many output bytes are required. */
        for (idx = 0; idx < str_len; idx++) {
                const unsigned char ch = src[idx];

                if (ch == '\\')
                        need += 2;
                else if (ch <= ' ' || ch >= 127)
                        need += 4;
                else
                        need += 1;
        }
        /* One more byte for the terminating NUL. */
        need += 1;

        /* Reserve space for appending "/". */
        out = kzalloc(need + 10, GFP_NOFS);
        if (!out)
                return NULL;

        /* Second pass: emit the encoded bytes; kzalloc() supplies the NUL. */
        dst = out;
        for (idx = 0; idx < str_len; idx++) {
                const unsigned char ch = src[idx];

                if (ch == '\\') {
                        *dst++ = '\\';
                        *dst++ = '\\';
                        continue;
                }
                if (ch > ' ' && ch < 127) {
                        *dst++ = ch;
                        continue;
                }
                *dst++ = '\\';
                *dst++ = (ch >> 6) + '0';
                *dst++ = ((ch >> 3) & 7) + '0';
                *dst++ = (ch & 7) + '0';
        }

        return out;
}

/**
 * tomoyo_encode - Encode binary string to ascii string.
 *
 * @str: String in binary format. May be NULL.
 *
 * Returns pointer to @str in ascii format on success, NULL otherwise
 * (including when @str is NULL).
 *
 * This is tomoyo_encode2() applied to the whole NUL-terminated @str.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
char *tomoyo_encode(const char *str)
{
        if (!str)
                return NULL;
        return tomoyo_encode2(str, strlen(str));
}

/**
 * tomoyo_get_absolute_path - Get the path of a dentry but ignores chroot'ed root.
 *
 * @path:   Pointer to "struct path".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns the buffer on success, an error code otherwise.
 * ERR_PTR(-ENOMEM) is returned without touching @buffer when @buflen is
 * smaller than 256.
 *
 * If dentry is a directory, trailing '/' is appended.
 */
static char *tomoyo_get_absolute_path(const struct path *path, char * const buffer,
                                      const int buflen)
{
        struct inode *inode;
        char *pos;

        if (buflen < 256)
                return ERR_PTR(-ENOMEM);
        /*
         * go to whatever namespace root we are under
         *
         * Only buflen - 1 bytes are offered to d_absolute_path() so that
         * the last byte of @buffer stays free for the '/' handling below.
         */
        pos = d_absolute_path(path, buffer, buflen - 1);
        /* Errors and the bare "/" path are returned unmodified. */
        if (IS_ERR(pos) || *pos != '/' || !pos[1])
                return pos;
        inode = d_backing_inode(path->dentry);
        if (inode && S_ISDIR(inode->i_mode)) {
                buffer[buflen - 2] = '/';
                buffer[buflen - 1] = '\0';
        }
        return pos;
}

/**
 * tomoyo_get_dentry_path - Get the path of a dentry.
 *
 * @dentry: Pointer to "struct dentry".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns the buffer on success, an error code otherwise.
 * ERR_PTR(-ENOMEM) is returned without touching @buffer when @buflen is
 * smaller than 256.
 *
 * If dentry is a directory, trailing '/' is appended.
 */
static char *tomoyo_get_dentry_path(struct dentry *dentry, char * const buffer,
                                    const int buflen)
{
        struct inode *inode;
        char *pos;

        if (buflen < 256)
                return ERR_PTR(-ENOMEM);
        /*
         * Only buflen - 1 bytes are offered to dentry_path_raw() so that
         * the last byte of @buffer stays free for the '/' handling below.
         */
        pos = dentry_path_raw(dentry, buffer, buflen - 1);
        /* Errors and the bare "/" path are returned unmodified. */
        if (IS_ERR(pos) || *pos != '/' || !pos[1])
                return pos;
        inode = d_backing_inode(dentry);
        if (inode && S_ISDIR(inode->i_mode)) {
                buffer[buflen - 2] = '/';
                buffer[buflen - 1] = '\0';
        }
        return pos;
}

/**
 * tomoyo_get_local_path - Get the path of a dentry.
 *
 * @dentry: Pointer to "struct dentry".
 * @buffer: Pointer to buffer to return value in.
 * @buflen: Sizeof @buffer.
 *
 * Returns the buffer on success, an error code otherwise.
 *
 * The path obtained from tomoyo_get_dentry_path() is prefixed with either
 * "<filesystem name>:" or "dev(<major>,<minor>):". A prefix is written by
 * moving the start pointer backwards inside @buffer; if that would place
 * it before @buffer, ERR_PTR(-ENOMEM) is returned.
 */
static char *tomoyo_get_local_path(struct dentry *dentry, char * const buffer,
                                   const int buflen)
{
        struct super_block *sb = dentry->d_sb;
        char *pos = tomoyo_get_dentry_path(dentry, buffer, buflen);

        if (IS_ERR(pos))
                return pos;
        /*
         * Convert from $PID to self if $PID is the thread group ID of the
         * current task as seen from this procfs instance's pid namespace.
         */
        if (sb->s_magic == PROC_SUPER_MAGIC && *pos == '/') {
                char *ep;
                const pid_t pid = (pid_t) simple_strtoul(pos + 1, &ep, 10);
                struct pid_namespace *proc_pidns = proc_pid_ns(sb);

                if (*ep == '/' && pid && pid ==
                    task_tgid_nr_ns(current, proc_pidns)) {
                        /*
                         * Place "/self" in the 5 bytes that end right
                         * before the '/' following $PID.
                         */
                        pos = ep - 5;
                        if (pos < buffer)
                                goto out;
                        memmove(pos, "/self", 5);
                }
                goto prepend_filesystem_name;
        }
        /* Use filesystem name for unnamed devices. */
        if (!MAJOR(sb->s_dev))
                goto prepend_filesystem_name;
        {
                struct inode *inode = d_backing_inode(sb->s_root);

                /*
                 * Use filesystem name if filesystem does not support rename()
                 * operation.
                 */
                if (!inode->i_op->rename)
                        goto prepend_filesystem_name;
        }
        /* Prepend device name. */
        {
                char name[64];
                int name_len;
                const dev_t dev = sb->s_dev;

                name[sizeof(name) - 1] = '\0';
                snprintf(name, sizeof(name) - 1, "dev(%u,%u):", MAJOR(dev),
                         MINOR(dev));
                name_len = strlen(name);
                /* Step back far enough to fit the prefix before the path. */
                pos -= name_len;
                if (pos < buffer)
                        goto out;
                memmove(pos, name, name_len);
                return pos;
        }
        /* Prepend filesystem name. */
prepend_filesystem_name:
        {
                const char *name = sb->s_type->name;
                const int name_len = strlen(name);

                /* One extra byte for the ':' separator. */
                pos -= name_len + 1;
                if (pos < buffer)
                        goto out;
                memmove(pos, name, name_len);
                pos[name_len] = ':';
        }
        return pos;
out:
        /* The prefix did not fit in front of the path. */
        return ERR_PTR(-ENOMEM);
}

/**
 * tomoyo_realpath_from_path - Returns realpath(3) of the given pathname but ignores chroot'ed root.
 *
 * @path: Pointer to "struct path".
 *
 * Returns the realpath of the given @path on success, NULL otherwise.
 *
 * If dentry is a directory, trailing '/' is appended.
 * Characters out of 0x20 < c < 0x7F range are converted to
 * \ooo style octal string.
 * Character \ is converted to \\ string.
 *
 * The pathname is first built in a temporary buffer whose size is doubled
 * on every retry, then encoded with tomoyo_encode(). tomoyo_warn_oom() is
 * called when NULL is about to be returned.
 *
 * These functions use kzalloc(), so the caller must call kfree()
 * if these functions didn't return NULL.
 */
char *tomoyo_realpath_from_path(const struct path *path)
{
        char *buf = NULL;
        char *name = NULL;
        /* Doubled at the top of the loop, so the first buffer is PAGE_SIZE. */
        unsigned int buf_len = PAGE_SIZE / 2;
        struct dentry *dentry = path->dentry;
        struct super_block *sb = dentry->d_sb;

        while (1) {
                char *pos;
                struct inode *inode;

                /* Drop the previous attempt's buffer and try a bigger one. */
                buf_len <<= 1;
                kfree(buf);
                buf = kmalloc(buf_len, GFP_NOFS);
                if (!buf)
                        break;
                /* To make sure that pos is '\0' terminated. */
                buf[buf_len - 1] = '\0';
                /* For "pipe:[\$]" and "socket:[\$]". */
                if (dentry->d_op && dentry->d_op->d_dname) {
                        pos = dentry->d_op->d_dname(dentry, buf, buf_len - 1);
                        goto encode;
                }
                inode = d_backing_inode(sb->s_root);
                /*
                 * Get local name for filesystems without rename() operation
                 * that also do not have FS_REQUIRES_DEV set.
                 */
                if ((!inode->i_op->rename &&
                     !(sb->s_type->fs_flags & FS_REQUIRES_DEV)))
                        pos = tomoyo_get_local_path(path->dentry, buf,
                                                    buf_len - 1);
                /* Get absolute name for the rest. */
                else {
                        pos = tomoyo_get_absolute_path(path, buf, buf_len - 1);
                        /*
                         * Fall back to local name if absolute name is not
                         * available.
                         */
                        if (pos == ERR_PTR(-EINVAL))
                                pos = tomoyo_get_local_path(path->dentry, buf,
                                                            buf_len - 1);
                }
encode:
                /* Any error is retried with a buffer twice as large. */
                if (IS_ERR(pos))
                        continue;
                name = tomoyo_encode(pos);
                break;
        }
        kfree(buf);
        /* Reached on kmalloc() failure or when tomoyo_encode() returned NULL. */
        if (!name)
                tomoyo_warn_oom(__func__);
        return name;
}

/**
 * tomoyo_realpath_nofollow - Get realpath of a pathname.
 *
 * @pathname: The pathname to solve. May be NULL.
 *
 * Returns the realpath of @pathname on success, NULL otherwise.
 *
 * @pathname is looked up with kern_path() using no lookup flags. The
 * returned string comes from tomoyo_realpath_from_path(), so the caller
 * must kfree() it.
 */
char *tomoyo_realpath_nofollow(const char *pathname)
{
        struct path path;
        char *realpath;

        if (!pathname || kern_path(pathname, 0, &path) != 0)
                return NULL;

        realpath = tomoyo_realpath_from_path(&path);
        /* Release the reference obtained by kern_path(). */
        path_put(&path);
        return realpath;
}























































































































































































































































































































































































































































































































































































































































































    3 





























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
 *
 * Author(s):
 *        2011-2014 Arvid Brodin, arvid.brodin@alten.se
 * This file contains device methods for creating, using and destroying
 * virtual HSR or PRP devices.
 */

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_sched.h>
#include "hsr_device.h"
#include "hsr_slave.h"
#include "hsr_framereg.h"
#include "hsr_main.h"
#include "hsr_forward.h"

/* Return true if @dev is non-NULL and has IFF_UP set in its flags. */
static bool is_admin_up(struct net_device *dev)
{
        if (!dev)
                return false;

        return (dev->flags & IFF_UP) != 0;
}

/* Return true if @dev is non-NULL, administratively up and operationally
 * up according to netif_oper_up().
 */
static bool is_slave_up(struct net_device *dev)
{
        if (!dev || !is_admin_up(dev))
                return false;

        return netif_oper_up(dev);
}

/* Set the operational state of the master device:
 * IF_OPER_DOWN when it is not administratively up, otherwise IF_OPER_UP
 * or IF_OPER_LOWERLAYERDOWN depending on @has_carrier.
 */
static void hsr_set_operstate(struct hsr_port *master, bool has_carrier)
{
        struct net_device *dev = master->dev;
        int state;

        if (!is_admin_up(dev))
                state = IF_OPER_DOWN;
        else if (has_carrier)
                state = IF_OPER_UP;
        else
                state = IF_OPER_LOWERLAYERDOWN;

        netif_set_operstate(dev, state);
}

/* Update the carrier state of the master device from its other ports.
 *
 * The carrier is switched on as soon as one non-master port is found whose
 * device is up (see is_slave_up()), otherwise it is switched off.
 * Must be called with RTNL held. Returns the resulting carrier state.
 */
static bool hsr_check_carrier(struct hsr_port *master)
{
        struct hsr_port *port;

        ASSERT_RTNL();

        hsr_for_each_port(master->hsr, port) {
                if (port->type == HSR_PT_MASTER)
                        continue;
                if (!is_slave_up(port->dev))
                        continue;

                /* One working port is enough. */
                netif_carrier_on(master->dev);
                return true;
        }

        netif_carrier_off(master->dev);

        return false;
}

/* Start or stop the supervision frame timers to match the state of
 * @hsr_dev: they run only while the device is running and operationally up.
 */
static void hsr_check_announce(struct net_device *hsr_dev)
{
        struct hsr_priv *hsr = netdev_priv(hsr_dev);

        if (!netif_running(hsr_dev) || !netif_oper_up(hsr_dev)) {
                /* Deactivate the announce timer  */
                timer_delete(&hsr->announce_timer);
                if (hsr->redbox)
                        timer_delete(&hsr->announce_proxy_timer);
                return;
        }

        /* Enable announce timer and start sending supervisory frames */
        if (!timer_pending(&hsr->announce_timer)) {
                hsr->announce_count = 0;
                mod_timer(&hsr->announce_timer, jiffies +
                          msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL));
        }

        /* The RedBox proxy timer starts after half an announce interval. */
        if (hsr->redbox && !timer_pending(&hsr->announce_proxy_timer))
                mod_timer(&hsr->announce_proxy_timer, jiffies +
                          msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL) / 2);
}

/* Re-evaluate carrier, operational state and announce timers of the
 * master device of @hsr.
 */
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr)
{
        struct hsr_port *master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);

        /* netif_stacked_transfer_operstate() cannot be used here since
         * it doesn't set IF_OPER_LOWERLAYERDOWN (?)
         */
        hsr_set_operstate(master, hsr_check_carrier(master));
        hsr_check_announce(master->dev);
}

int hsr_get_max_mtu(struct hsr_priv *hsr)
{
        unsigned int mtu_max;
        struct hsr_port *port;

        mtu_max = ETH_DATA_LEN;
        hsr_for_each_port(hsr, port)
                if (port->type != HSR_PT_MASTER)
                        mtu_max = min(port->dev->mtu, mtu_max);

        if (mtu_max < HSR_HLEN)
                return 0;
        return mtu_max - HSR_HLEN;
}

/* ndo_change_mtu handler: accept @new_mtu only if it does not exceed
 * hsr_get_max_mtu(). Returns 0 on success, -EINVAL otherwise.
 */
static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu)
{
        struct hsr_priv *hsr = netdev_priv(dev);

        if (new_mtu <= hsr_get_max_mtu(hsr)) {
                WRITE_ONCE(dev->mtu, new_mtu);
                return 0;
        }

        netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
                    HSR_HLEN);

        return -EINVAL;
}

/* ndo_open handler: warn about every non-master port whose device is not
 * up, and warn if there is no non-master port at all. Always returns 0.
 */
static int hsr_dev_open(struct net_device *dev)
{
        struct hsr_priv *hsr = netdev_priv(dev);
        /* Stays NULL only if no non-master port was seen. */
        const char *label = NULL;
        struct hsr_port *port;

        hsr_for_each_port(hsr, port) {
                if (port->type == HSR_PT_MASTER)
                        continue;

                if (port->type == HSR_PT_SLAVE_A)
                        label = "Slave A";
                else if (port->type == HSR_PT_SLAVE_B)
                        label = "Slave B";
                else if (port->type == HSR_PT_INTERLINK)
                        label = "Interlink";
                else
                        label = "Unknown";

                if (is_slave_up(port->dev))
                        continue;

                netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n",
                            label, port->dev->name);
        }

        if (!label)
                netdev_warn(dev, "No slave devices configured\n");

        return 0;
}

/* ndo_stop handler: remove the master's unicast and multicast addresses
 * from the Slave A and Slave B devices. Always returns 0.
 */
static int hsr_dev_close(struct net_device *dev)
{
        struct hsr_priv *hsr = netdev_priv(dev);
        struct hsr_port *port;

        hsr_for_each_port(hsr, port) {
                if (port->type != HSR_PT_SLAVE_A &&
                    port->type != HSR_PT_SLAVE_B)
                        continue;

                dev_uc_unsync(port->dev, dev);
                dev_mc_unsync(port->dev, dev);
        }

        return 0;
}

/* Combine the requested @features with the features of every port device
 * of @hsr and return the result.
 */
static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
                                                netdev_features_t features)
{
        /* The originally requested set limits what may get enabled. */
        const netdev_features_t mask = features;
        struct hsr_port *port;

        /* Mask out all features that, if supported by one device, should be
         * enabled for all devices (see NETIF_F_ONE_FOR_ALL).
         *
         * Anything that's off in mask will not be enabled - so only things
         * that were in features originally, and also is in NETIF_F_ONE_FOR_ALL,
         * may become enabled.
         */
        features &= ~NETIF_F_ONE_FOR_ALL;

        hsr_for_each_port(hsr, port) {
                features = netdev_increment_features(features,
                                                     port->dev->features,
                                                     mask);
        }

        return features;
}

/* ndo_fix_features handler: delegate to hsr_features_recompute(). */
static netdev_features_t hsr_fix_features(struct net_device *dev,
                                          netdev_features_t features)
{
        return hsr_features_recompute(netdev_priv(dev), features);
}

/* ndo_start_xmit handler: hand @skb to hsr_forward_skb() on behalf of the
 * master port, holding seqnr_lock for the duration. If there is no master
 * port the frame is counted as dropped and freed.
 * Always returns NETDEV_TX_OK.
 */
static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct hsr_priv *hsr = netdev_priv(dev);
        struct hsr_port *master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);

        if (!master) {
                dev_core_stats_tx_dropped_inc(dev);
                dev_kfree_skb_any(skb);
                return NETDEV_TX_OK;
        }

        skb->dev = master->dev;
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        spin_lock_bh(&hsr->seqnr_lock);
        hsr_forward_skb(skb, master);
        spin_unlock_bh(&hsr->seqnr_lock);

        return NETDEV_TX_OK;
}

/* Link-layer header operations of the HSR/PRP master device (installed by
 * hsr_dev_setup()); both callbacks are the generic Ethernet helpers.
 */
static const struct header_ops hsr_header_ops = {
        .create         = eth_header,
        .parse         = eth_header_parse,
};

static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra)
{
        struct hsr_priv *hsr = master->hsr;
        struct sk_buff *skb;
        int hlen, tlen;
        int len;

        hlen = LL_RESERVED_SPACE(master->dev);
        tlen = master->dev->needed_tailroom;
        len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload);
        /* skb size is same for PRP/HSR frames, only difference
         * being, for PRP it is a trailer and for HSR it is a
         * header.
         * RedBox might use @extra more bytes.
         */
        skb = dev_alloc_skb(len + extra + hlen + tlen);

        if (!skb)
                return skb;

        skb_reserve(skb, hlen);
        skb->dev = master->dev;
        skb->priority = TC_PRIO_CONTROL;

        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
                            hsr->sup_multicast_addr,
                            skb->dev->dev_addr, skb->len) <= 0)
                goto out;

        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return skb;
out:
        kfree_skb(skb);

        return NULL;
}

/* Build one HSR supervision frame and pass it to hsr_forward_skb().
 *
 * @port:     port the frame is forwarded on behalf of.
 * @interval: out parameter; set to the delay in jiffies until the next
 *            supervision frame is due.
 * @addr:     MAC address placed in the MacAddressA payload.
 *
 * With protocol version 0 the first three frames are announce frames sent
 * at HSR_ANNOUNCE_INTERVAL; every other frame is a life check sent at
 * HSR_LIFE_CHECK_INTERVAL. On a RedBox, when @addr is found in the proxy
 * node table, a second TLV carrying the RedBox MAC address is appended.
 */
static void send_hsr_supervision_frame(struct hsr_port *port,
                                       unsigned long *interval,
                                       const unsigned char *addr)
{
        struct hsr_priv *hsr = port->hsr;
        struct hsr_sup_payload *payload;
        struct hsr_sup_tlv *redbox_tlv;
        struct hsr_sup_tag *sup_tag;
        struct sk_buff *skb;
        __u8 tlv_type;
        int extra = 0;

        if (hsr->announce_count < 3 && hsr->prot_version == 0) {
                tlv_type = HSR_TLV_ANNOUNCE;
                *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
                hsr->announce_count++;
        } else {
                tlv_type = HSR_TLV_LIFE_CHECK;
                *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
        }

        /* A RedBox may append one more TLV plus payload. */
        if (hsr->redbox)
                extra = sizeof(struct hsr_sup_tlv) +
                        sizeof(struct hsr_sup_payload);

        skb = hsr_init_skb(port, extra);
        if (!skb) {
                netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n");
                return;
        }

        sup_tag = skb_put(skb, sizeof(struct hsr_sup_tag));
        set_hsr_stag_path(sup_tag, (hsr->prot_version ? 0x0 : 0xf));
        set_hsr_stag_HSR_ver(sup_tag, hsr->prot_version);

        spin_lock_bh(&hsr->seqnr_lock);

        /* From HSRv1 on we have separate supervision sequence numbers. */
        if (hsr->prot_version > 0) {
                sup_tag->sequence_nr = htons(hsr->sup_sequence_nr);
                hsr->sup_sequence_nr++;
        } else {
                sup_tag->sequence_nr = htons(hsr->sequence_nr);
                hsr->sequence_nr++;
        }

        sup_tag->tlv.HSR_TLV_type = tlv_type;
        /* TODO: Why 12 in HSRv0? */
        sup_tag->tlv.HSR_TLV_length = hsr->prot_version ?
                                sizeof(struct hsr_sup_payload) : 12;

        /* Payload: MacAddressA / SAN MAC from ProxyNodeTable */
        payload = skb_put(skb, sizeof(struct hsr_sup_payload));
        ether_addr_copy(payload->macaddress_A, addr);

        if (hsr->redbox &&
            hsr_is_node_in_db(&hsr->proxy_node_db, addr)) {
                redbox_tlv = skb_put(skb, sizeof(struct hsr_sup_tlv));
                redbox_tlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC;
                redbox_tlv->HSR_TLV_length = sizeof(struct hsr_sup_payload);

                /* Payload: MacAddressRedBox */
                payload = skb_put(skb, sizeof(struct hsr_sup_payload));
                ether_addr_copy(payload->macaddress_A, hsr->macaddress_redbox);
        }

        /* On padding failure nothing is forwarded; the skb is not touched
         * again here (presumably skb_put_padto() released it - confirm).
         */
        if (skb_put_padto(skb, ETH_ZLEN))
                goto unlock;

        hsr_forward_skb(skb, port);
unlock:
        spin_unlock_bh(&hsr->seqnr_lock);
}

/* Build one PRP supervision frame and pass it to hsr_forward_skb().
 *
 * @master:   master port; its device address is used as MacAddressA.
 * @interval: out parameter; always set to HSR_LIFE_CHECK_INTERVAL (in
 *            jiffies), even when no frame could be sent.
 * @addr:     unused here; present to match the send_sv_frame signature.
 */
static void send_prp_supervision_frame(struct hsr_port *master,
                                       unsigned long *interval,
                                       const unsigned char *addr)
{
        struct hsr_priv *hsr = master->hsr;
        struct hsr_sup_payload *hsr_sp;
        struct hsr_sup_tag *hsr_stag;
        struct sk_buff *skb;

        /* Set the interval before anything can fail: hsr_announce() uses
         * it to re-arm its timer regardless of whether a frame was sent,
         * exactly as send_hsr_supervision_frame() does.
         */
        *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);

        skb = hsr_init_skb(master, 0);
        if (!skb) {
                netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n");
                return;
        }

        hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag));
        set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf));
        set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0));

        /* From HSRv1 on we have separate supervision sequence numbers. */
        spin_lock_bh(&hsr->seqnr_lock);
        hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr);
        hsr->sup_sequence_nr++;
        hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD;
        hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload);

        /* Payload: MacAddressA */
        hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
        ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr);

        /* On padding failure nothing is forwarded. */
        if (skb_put_padto(skb, ETH_ZLEN)) {
                spin_unlock_bh(&hsr->seqnr_lock);
                return;
        }

        hsr_forward_skb(skb, master);
        spin_unlock_bh(&hsr->seqnr_lock);
}

/* Announce (supervision frame) timer function
 */
static void hsr_announce(struct timer_list *t)
{
        struct hsr_priv *hsr;
        struct hsr_port *master;
        unsigned long interval;

        hsr = from_timer(hsr, t, announce_timer);

        rcu_read_lock();
        master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
        hsr->proto_ops->send_sv_frame(master, &interval, master->dev->dev_addr);

        if (is_admin_up(master->dev))
                mod_timer(&hsr->announce_timer, jiffies + interval);

        rcu_read_unlock();
}

/* Announce (supervision frame) timer function for RedBox
 *
 * Sends one supervision frame per proxy node table entry (skipping entries
 * that hsr_addr_is_redbox() matches) and re-arms announce_proxy_timer while
 * the interlink device is administratively up. Does nothing if there is no
 * interlink port.
 */
static void hsr_proxy_announce(struct timer_list *t)
{
        struct hsr_priv *hsr = from_timer(hsr, t, announce_proxy_timer);
        /* Stays 0 unless at least one frame was sent. */
        unsigned long interval = 0;
        struct hsr_port *interlink;
        struct hsr_node *node;

        rcu_read_lock();
        /* RedBOX sends supervisory frames to HSR network with MAC addresses
         * of SAN nodes stored in ProxyNodeTable.
         */
        interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
        if (interlink) {
                list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) {
                        if (!hsr_addr_is_redbox(hsr, node->macaddress_A))
                                hsr->proto_ops->send_sv_frame(interlink,
                                                              &interval,
                                                              node->macaddress_A);
                }

                if (is_admin_up(interlink->dev)) {
                        /* Nothing was sent: fall back to announce pace. */
                        if (interval == 0)
                                interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);

                        mod_timer(&hsr->announce_proxy_timer,
                                  jiffies + interval);
                }
        }
        rcu_read_unlock();
}

/* Delete every port of @hsr that exists, in the fixed order Slave A,
 * Slave B, Interlink, Master. Port types that are not present are skipped.
 */
void hsr_del_ports(struct hsr_priv *hsr)
{
        struct hsr_port *port;

        port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
        if (port)
                hsr_del_port(port);

        port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
        if (port)
                hsr_del_port(port);

        port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
        if (port)
                hsr_del_port(port);

        /* The master port is deleted last. */
        port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
        if (port)
                hsr_del_port(port);
}

/* ndo_set_rx_mode handler: sync the master's multicast and unicast address
 * lists to the Slave A and Slave B devices.
 */
static void hsr_set_rx_mode(struct net_device *dev)
{
        struct hsr_priv *hsr = netdev_priv(dev);
        struct hsr_port *port;

        hsr_for_each_port(hsr, port) {
                if (port->type != HSR_PT_SLAVE_A &&
                    port->type != HSR_PT_SLAVE_B)
                        continue;

                dev_mc_sync_multiple(port->dev, dev);
                dev_uc_sync_multiple(port->dev, dev);
        }
}

/* ndo_change_rx_flags handler: when IFF_ALLMULTI is among the changed
 * flags, adjust the allmulti count of the Slave A and Slave B devices by
 * +1 if the master now has IFF_ALLMULTI set and by -1 otherwise.
 */
static void hsr_change_rx_flags(struct net_device *dev, int change)
{
        struct hsr_priv *hsr = netdev_priv(dev);
        struct hsr_port *port;

        hsr_for_each_port(hsr, port) {
                if (port->type != HSR_PT_SLAVE_A &&
                    port->type != HSR_PT_SLAVE_B)
                        continue;
                if (!(change & IFF_ALLMULTI))
                        continue;

                dev_set_allmulti(port->dev,
                                 (dev->flags & IFF_ALLMULTI) ? 1 : -1);
        }
}

/* ndo_vlan_rx_add_vid handler: add VLAN @vid (protocol @proto) on the
 * slave devices.
 *
 * If adding fails on one slave after it already succeeded on the other,
 * the VID is removed again from the slave where it succeeded, so both
 * slaves end up without it.
 *
 * Returns 0 on success or the error returned by vlan_vid_add().
 */
static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev,
                                   __be16 proto, u16 vid)
{
        /* Slave device on which the VID has already been added, if any. */
        struct net_device *added_dev = NULL;
        struct hsr_port *port;
        struct hsr_priv *hsr;
        int ret;

        hsr = netdev_priv(dev);

        hsr_for_each_port(hsr, port) {
                if (port->type == HSR_PT_MASTER ||
                    port->type == HSR_PT_INTERLINK)
                        continue;

                ret = vlan_vid_add(port->dev, proto, vid);
                switch (port->type) {
                case HSR_PT_SLAVE_A:
                case HSR_PT_SLAVE_B:
                        if (ret) {
                                netdev_err(dev, "add vid failed for %s\n",
                                           port->type == HSR_PT_SLAVE_A ?
                                           "Slave-A" : "Slave-B");
                                /* Roll back on the OTHER slave, not on
                                 * port->dev where the add just failed.
                                 */
                                if (added_dev)
                                        vlan_vid_del(added_dev, proto, vid);
                                return ret;
                        }

                        added_dev = port->dev;
                        break;
                default:
                        break;
                }
        }

        return 0;
}

/* ndo_vlan_rx_kill_vid handler: remove VLAN @vid (protocol @proto) from
 * the Slave A and Slave B devices. Always returns 0.
 */
static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev,
                                    __be16 proto, u16 vid)
{
        struct hsr_priv *hsr = netdev_priv(dev);
        struct hsr_port *port;

        hsr_for_each_port(hsr, port) {
                if (port->type == HSR_PT_SLAVE_A ||
                    port->type == HSR_PT_SLAVE_B)
                        vlan_vid_del(port->dev, proto, vid);
        }

        return 0;
}

/* net_device_ops installed on HSR devices by hsr_dev_setup() */
static const struct net_device_ops hsr_device_ops = {
        /* bring-up, tear-down and transmit */
        .ndo_open = hsr_dev_open,
        .ndo_stop = hsr_dev_close,
        .ndo_start_xmit = hsr_dev_xmit,

        /* device configuration */
        .ndo_change_mtu = hsr_dev_change_mtu,
        .ndo_fix_features = hsr_fix_features,

        /* receive flags, address lists and VLAN ids */
        .ndo_change_rx_flags = hsr_change_rx_flags,
        .ndo_set_rx_mode = hsr_set_rx_mode,
        .ndo_vlan_rx_add_vid = hsr_ndo_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid = hsr_ndo_vlan_rx_kill_vid,
};

/* Device type named "hsr"; attached to HSR devices through
 * SET_NETDEV_DEVTYPE() in hsr_dev_setup().
 */
static const struct device_type hsr_type = {
        .name = "hsr",
};

/* Protocol callbacks that hsr_dev_finalize() selects for every protocol
 * version other than PRP_V1.
 */
static struct hsr_proto_ops hsr_ops = {
        /* frame handling */
        .fill_frame_info = hsr_fill_frame_info,
        .create_tagged_frame = hsr_create_tagged_frame,
        .get_untagged_frame = hsr_get_untagged_frame,
        .drop_frame = hsr_drop_frame,
        .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame,
        .register_frame_out = hsr_register_frame_out,

        /* supervision frames */
        .send_sv_frame = send_hsr_supervision_frame,
};

/* Protocol callbacks that hsr_dev_finalize() selects when the protocol
 * version is PRP_V1.
 */
static struct hsr_proto_ops prp_ops = {
        /* frame handling */
        .fill_frame_info = prp_fill_frame_info,
        .create_tagged_frame = prp_create_tagged_frame,
        .get_untagged_frame = prp_get_untagged_frame,
        .drop_frame = prp_drop_frame,
        .register_frame_out = prp_register_frame_out,

        /* SAN handling */
        .handle_san_frame = prp_handle_san_frame,
        .update_san_info = prp_update_san_info,

        /* supervision frames */
        .send_sv_frame = send_prp_supervision_frame,
};

/* Initialise @dev as an HSR device: give it a random MAC address, run
 * ether_setup() and then install the HSR specific callbacks, flags and
 * feature bits on top.
 */
void hsr_dev_setup(struct net_device *dev)
{
        eth_hw_addr_random(dev);
        ether_setup(dev);

        /* HSR callbacks and device type */
        dev->netdev_ops = &hsr_device_ops;
        dev->header_ops = &hsr_header_ops;
        SET_NETDEV_DEVTYPE(dev, &hsr_type);

        dev->min_mtu = 0;
        dev->needs_free_netdev = true;
        dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL;

        /* Prevent recursive tx locking */
        dev->lltx = true;

        /* Taken from the bridge code without certainty that it is needed;
         * netdevice.h says it means "Does not change network namespaces".
         */
        dev->netns_immutable = true;

        /* Advertise the same set as both changeable and enabled features */
        dev->hw_features = NETIF_F_HW_VLAN_CTAG_FILTER |
                           NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_CSUM | NETIF_F_GSO_MASK |
                           NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | NETIF_F_SG;
        dev->features = dev->hw_features;
}

/* Return true if dev is a HSR master; return false otherwise.
 */
bool is_hsr_master(struct net_device *dev)
{
        return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
}
EXPORT_SYMBOL(is_hsr_master);

struct net_device *hsr_get_port_ndev(struct net_device *ndev,
                                     enum hsr_port_type pt)
{
        struct hsr_priv *hsr = netdev_priv(ndev);
        struct hsr_port *port;

        hsr_for_each_port(hsr, port)
                if (port->type == pt)
                        return port->dev;
        return NULL;
}
EXPORT_SYMBOL(hsr_get_port_ndev);

/* Default multicast address for HSR Supervision frames.
 * hsr_dev_finalize() copies it into hsr->sup_multicast_addr and then
 * overwrites the last byte with the caller supplied multicast_spec.
 */
static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
        0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
};

/* Finish the creation of an HSR/PRP device.
 *
 * @hsr_dev:          the HSR net_device being created
 * @slave:            the two lower devices, added as Slave-A and Slave-B
 * @interlink:        optional lower device added as the interlink port;
 *                    when non-NULL the device is flagged as a RedBox
 * @multicast_spec:   last byte of the supervision multicast address
 * @protocol_version: PRP_V1 selects prp_ops, anything else selects hsr_ops
 * @extack:           passed through to hsr_add_port()
 *
 * Initialises the private state, creates the self node, registers the
 * netdevice and attaches the ports. Returns 0 on success, otherwise the
 * error of the step that failed after undoing the steps already done.
 */
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
                     struct net_device *interlink, unsigned char multicast_spec,
                     u8 protocol_version, struct netlink_ext_ack *extack)
{
        /* Set once register_netdevice() succeeded; tells the error path
         * whether unregister_netdevice() is required.
         */
        bool unregister = false;
        struct hsr_priv *hsr;
        int res;

        hsr = netdev_priv(hsr_dev);
        INIT_LIST_HEAD(&hsr->ports);
        INIT_LIST_HEAD(&hsr->node_db);
        INIT_LIST_HEAD(&hsr->proxy_node_db);
        spin_lock_init(&hsr->list_lock);

        /* The HSR device takes over the MAC address of the first slave */
        eth_hw_addr_set(hsr_dev, slave[0]->dev_addr);

        /* initialize protocol specific functions */
        if (protocol_version == PRP_V1) {
                /* For PRP, lan_id has most significant 3 bits holding
                 * the net_id of PRP_LAN_ID
                 */
                hsr->net_id = PRP_LAN_ID << 1;
                hsr->proto_ops = &prp_ops;
        } else {
                hsr->proto_ops = &hsr_ops;
        }

        /* Make sure we recognize frames from ourselves in hsr_rcv() */
        res = hsr_create_self_node(hsr, hsr_dev->dev_addr,
                                   slave[1]->dev_addr);
        if (res < 0)
                return res;

        spin_lock_init(&hsr->seqnr_lock);
        /* Overflow soon to find bugs easier: */
        hsr->sequence_nr = HSR_SEQNR_START;
        hsr->sup_sequence_nr = HSR_SUP_SEQNR_START;

        /* Timers are only set up here; the prune timers are armed further
         * down once all ports have been added.
         */
        timer_setup(&hsr->announce_timer, hsr_announce, 0);
        timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0);
        timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0);
        timer_setup(&hsr->announce_proxy_timer, hsr_proxy_announce, 0);

        ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr);
        hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;

        hsr->prot_version = protocol_version;

        /* Make sure the 1st call to netif_carrier_on() gets through */
        netif_carrier_off(hsr_dev);

        res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack);
        if (res)
                goto err_add_master;

        /* HSR forwarding offload supported in lower device? */
        if ((slave[0]->features & NETIF_F_HW_HSR_FWD) &&
            (slave[1]->features & NETIF_F_HW_HSR_FWD))
                hsr->fwd_offloaded = true;

        /* Set the VLAN filter feature when both slaves provide it */
        if ((slave[0]->features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
            (slave[1]->features & NETIF_F_HW_VLAN_CTAG_FILTER))
                hsr_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;

        /* On failure 'unregister' is still false, so the error path only
         * removes the ports and the self node.
         */
        res = register_netdevice(hsr_dev);
        if (res)
                goto err_unregister;

        unregister = true;

        res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack);
        if (res)
                goto err_unregister;

        res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack);
        if (res)
                goto err_unregister;

        if (interlink) {
                res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack);
                if (res)
                        goto err_unregister;

                hsr->redbox = true;
                ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr);
                mod_timer(&hsr->prune_proxy_timer,
                          jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
        }

        hsr_debugfs_init(hsr, hsr_dev);
        mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));

        return 0;

        /* Error unwinding: drop all ports, then the self node, and finally
         * unregister the netdevice if it had been registered.
         */
err_unregister:
        hsr_del_ports(hsr);
err_add_master:
        hsr_del_self_node(hsr);

        if (unregister)
                unregister_netdevice(hsr_dev);
        return res;
}






























  132 



























  132 






  136 





  136 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/tlb.h
 *
 * Copyright (C) 2002 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_TLB_H
#define __ASM_TLB_H

#include <linux/pagemap.h>


/*
 * Declare tlb_flush() (defined further down in this header) and define a
 * macro of the same name before <asm-generic/tlb.h> is included.
 * NOTE(review): presumably the generic header tests for this macro to detect
 * the architecture override - confirm against asm-generic/tlb.h.
 */
#define tlb_flush tlb_flush
static void tlb_flush(struct mmu_gather *tlb);

#include <asm-generic/tlb.h>

/*
 * Work out the TLBI level hint for this gather on arm64.
 *
 * A level (3 for ptes, 2 for pmds, 1 for puds, 0 for p4ds) is returned only
 * when exactly one of the cleared_* flags is set and no page tables were
 * freed. In every other case TLBI_TTL_UNKNOWN is returned, which elides the
 * level hinting to the hardware.
 */
static inline int tlb_get_level(struct mmu_gather *tlb)
{
        /* The TTL field is only valid for the leaf entry. */
        if (tlb->freed_tables)
                return TLBI_TTL_UNKNOWN;

        if (tlb->cleared_ptes) {
                if (tlb->cleared_pmds || tlb->cleared_puds ||
                    tlb->cleared_p4ds)
                        return TLBI_TTL_UNKNOWN;
                return 3;
        }

        /* No ptes were cleared beyond this point */
        if (tlb->cleared_pmds) {
                if (tlb->cleared_puds || tlb->cleared_p4ds)
                        return TLBI_TTL_UNKNOWN;
                return 2;
        }

        /* Neither ptes nor pmds were cleared beyond this point */
        if (tlb->cleared_puds)
                return tlb->cleared_p4ds ? TLBI_TTL_UNKNOWN : 1;

        if (tlb->cleared_p4ds)
                return 0;

        /* Nothing was cleared at all */
        return TLBI_TTL_UNKNOWN;
}

/*
 * Flush the TLB for the range collected in @tlb. A partial teardown flushes
 * tlb->start..tlb->end with the stride and level hint derived from the
 * gather; a full address space teardown only flushes when page tables were
 * freed.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
        bool leaf_only = !tlb->freed_tables;
        unsigned long stride = tlb_get_unmap_size(tlb);
        int level = tlb_get_level(tlb);

        if (!tlb->fullmm) {
                __flush_tlb_range(&vma, tlb->start, tlb->end, stride,
                                  leaf_only, level);
                return;
        }

        /*
         * If we're tearing down the address space then we only care about
         * invalidating the walk-cache, since the ASID allocator won't
         * reallocate our ASID without invalidating the entire TLB.
         */
        if (!leaf_only)
                flush_tlb_mm(tlb->mm);
}

/*
 * Hand the PTE table page @pte to the gather @tlb via tlb_remove_ptdesc().
 * @addr is unused.
 */
static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
                                  unsigned long addr)
{
        tlb_remove_ptdesc(tlb, page_ptdesc(pte));
}

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * Hand the PMD table at @pmdp to the gather @tlb via tlb_remove_ptdesc().
 * @addr is unused.
 */
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
                                  unsigned long addr)
{
        tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmdp));
}
#endif

#if CONFIG_PGTABLE_LEVELS > 3
/*
 * Hand the PUD table at @pudp to the gather @tlb via tlb_remove_ptdesc(),
 * but only when pgtable_l4_enabled() reports true; otherwise nothing is
 * queued. @addr is unused.
 */
static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
                                  unsigned long addr)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(pudp);

        if (pgtable_l4_enabled())
                tlb_remove_ptdesc(tlb, ptdesc);
}
#endif

#if CONFIG_PGTABLE_LEVELS > 4
/*
 * Hand the P4D table at @p4dp to the gather @tlb via tlb_remove_ptdesc(),
 * but only when pgtable_l5_enabled() reports true; otherwise nothing is
 * queued. @addr is unused.
 */
static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4dp,
                                  unsigned long addr)
{
        struct ptdesc *ptdesc = virt_to_ptdesc(p4dp);

        if (pgtable_l5_enabled())
                tlb_remove_ptdesc(tlb, ptdesc);
}
#endif

#endif





















  147 

























































































































































































































  188 
  189 

  189 











  189 


























































































































































































































































































































































































































  202 













  202 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core.
 *
 * Thin wrapper around fsnotify_clear_marks_by_inode(); exported (GPL only)
 * for use by modules.
 */
void __fsnotify_inode_delete(struct inode *inode)
{
        fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);

/*
 * Mount counterpart of __fsnotify_inode_delete(): calls
 * fsnotify_clear_marks_by_mount() for @mnt.
 */
void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        fsnotify_clear_marks_by_mount(mnt);
}

/*
 * Mount namespace counterpart of __fsnotify_inode_delete(): calls
 * fsnotify_clear_marks_by_mntns() for @mntns.
 */
void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
{
        fsnotify_clear_marks_by_mntns(mntns);
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
 * @sb: superblock being unmounted.
 *
 * Called during unmount with no locks held, so needs to be safe against
 * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
 *
 * For every inode on @sb that holds at least one reference and is not being
 * set up or torn down, FS_UNMOUNT is sent and fsnotify_inode_delete() is
 * called.
 */
static void fsnotify_unmount_inodes(struct super_block *sb)
{
        /* iput_inode: inode handled in the previous iteration, still pinned */
        struct inode *inode, *iput_inode = NULL;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 * We cannot __iget() an inode in state I_FREEING,
                 * I_WILL_FREE, or I_NEW which is fine because by that point
                 * the inode cannot have any associated watches.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /*
                 * If i_count is zero, the inode cannot have any watches and
                 * doing an __iget/iput with SB_ACTIVE clear would actually
                 * evict all inodes with zero i_count from icache which is
                 * unnecessarily violent and may in fact be illegal to do.
                 * However, we should have been called /after/ evict_inodes
                 * removed all zero refcount inodes, in any case.  Test to
                 * be sure.
                 */
                if (!atomic_read(&inode->i_count)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                /* Pin the inode, then drop both locks for the work below */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

                /*
                 * Release the inode pinned by the previous iteration only
                 * now that the list lock is no longer held.
                 */
                iput(iput_inode);

                /* for each watch, send FS_UNMOUNT and then remove it */
                fsnotify_inode(inode, FS_UNMOUNT);

                fsnotify_inode_delete(inode);

                /* Keep this inode pinned until the next iteration or exit */
                iput_inode = inode;

                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Release the last inode that was processed, if any */
        iput(iput_inode);
}

/*
 * Tear down fsnotify state for superblock @sb: notify and clean up watched
 * inodes, clear the marks attached to the sb itself and wait until the count
 * of watched objects reaches zero. Does nothing if @sb has no fsnotify info.
 */
void fsnotify_sb_delete(struct super_block *sb)
{
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);

        /* Were any marks ever added to any object on this sb? */
        if (!sbinfo)
                return;

        fsnotify_unmount_inodes(sb);
        fsnotify_clear_marks_by_sb(sb);
        /* Wait for outstanding object references from connectors */
        wait_var_event(fsnotify_sb_watched_objects(sb),
                       !atomic_long_read(fsnotify_sb_watched_objects(sb)));
        /* Warn if content or pre-content priority watchers are still counted */
        WARN_ON(fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_CONTENT));
        WARN_ON(fsnotify_sb_has_priority_watchers(sb,
                                                  FSNOTIFY_PRIO_PRE_CONTENT));
}

/* Free the fsnotify info of @sb (sb->s_fsnotify_info) with kfree(). */
void fsnotify_sb_free(struct super_block *sb)
{
        kfree(sb->s_fsnotify_info);
}

/*
 * Flag all children of directory @inode as having a watching parent.
 *
 * Inotify and dnotify both report child events to the parent.  When the
 * parent cares about any event on a child, every positive child dentry gets
 * DCACHE_FSNOTIFY_PARENT_WATCHED set, so that an event on a child can quickly
 * tell whether the parent has to be looked up and notified.  Nothing is done
 * for non-directories.
 */
void fsnotify_set_children_dentry_flags(struct inode *inode)
{
        struct dentry *alias;

        if (!S_ISDIR(inode->i_mode))
                return;

        spin_lock(&inode->i_lock);
        /*
         * Walk the dentries of this inode; as it is a directory there is
         * expected to be only a single one on the list.
         */
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                struct dentry *child;

                /*
                 * Walk the children of the directory dentry and record the
                 * parental interest in the d_flags of each positive child.
                 */
                spin_lock(&alias->d_lock);
                hlist_for_each_entry(child, &alias->d_children, d_sib) {
                        if (child->d_inode) {
                                spin_lock_nested(&child->d_lock,
                                                 DENTRY_D_LOCK_NESTED);
                                child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
                                spin_unlock(&child->d_lock);
                        }
                }
                spin_unlock(&alias->d_lock);
        }
        spin_unlock(&inode->i_lock);
}

/*
 * Lazily clear false positive PARENT_WATCHED flag for child whose parent had
 * stopped watching children.
 *
 * @pinode: inode of the parent directory
 * @dentry: child dentry whose DCACHE_FSNOTIFY_PARENT_WATCHED flag may be stale
 *
 * The flag is only cleared if, with dentry->d_lock held, @pinode is still
 * found not to be watching its children.
 */
static void fsnotify_clear_child_dentry_flag(struct inode *pinode,
                                             struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        /*
         * d_lock is a sufficient barrier to prevent observing a non-watched
         * parent state from before the fsnotify_set_children_dentry_flags()
         * or fsnotify_update_flags() call that had set PARENT_WATCHED.
         */
        if (!fsnotify_inode_watches_children(pinode))
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
        spin_unlock(&dentry->d_lock);
}

/*
 * Are inode/sb/mount interested in parent and name info with this event?
 *
 * Returns true when @mask intersects the events for which the inode, its
 * superblock or the mount (@mnt_mask) asked for parent/name info.
 */
static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
                                        __u32 mask)
{
        __u32 inode_mask, sb_mask, wanted;

        /* We only send parent/name to inode/sb/mount for events on non-dir */
        if (mask & FS_ISDIR)
                return false;

        /*
         * All events that are possible on a child may also be reported with
         * parent/name info to inode/sb/mount.  Otherwise, a watching parent
         * could result in events reported with unexpected name info to sb/mount.
         */
        BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT);

        /* Did either inode/sb/mount subscribe for events with parent/name? */
        inode_mask = READ_ONCE(inode->i_fsnotify_mask);
        sb_mask = READ_ONCE(inode->i_sb->s_fsnotify_mask);
        wanted = fsnotify_parent_needed_mask(inode_mask) |
                 fsnotify_parent_needed_mask(sb_mask) |
                 fsnotify_parent_needed_mask(mnt_mask);

        /* Did they subscribe for this event with parent/name info? */
        return mask & wanted;
}

/*
 * Are there any inode/mount/sb objects that watch for these events?
 *
 * Returns true when @mask shares an event bit (within ALL_FSNOTIFY_EVENTS)
 * with the union of the inode, mount and superblock masks.
 */
static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
                                           __u32 mask)
{
        __u32 inode_mask = READ_ONCE(inode->i_fsnotify_mask);
        __u32 sb_mask = READ_ONCE(inode->i_sb->s_fsnotify_mask);

        return mask & (inode_mask | mnt_mask | sb_mask) & ALL_FSNOTIFY_EVENTS;
}

/*
 * Report an FS_PRE_ACCESS event on @path.
 *
 * With a known position (@ppos non-NULL) the event carries the page aligned
 * range covering @count bytes from *@ppos; without one it is reported on the
 * path alone. Returns the result of the notification call.
 */
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
                         size_t count)
{
        struct file_range range;

        if (ppos) {
                /* Round the start down and the end up to page boundaries */
                range.path = path;
                range.pos = PAGE_ALIGN_DOWN(*ppos);
                range.count = PAGE_ALIGN(*ppos + count) - range.pos;

                return fsnotify_parent(path->dentry, FS_PRE_ACCESS, &range,
                                       FSNOTIFY_EVENT_FILE_RANGE);
        }

        /* Position unknown: no range info can be reported */
        return fsnotify_path(path, FS_PRE_ACCESS);
}

/*
 * Notify this dentry's parent about a child's events with child name info
 * if parent is watching or if inode/sb/mount are interested in events with
 * parent and name info.
 *
 * Notify only the child without name info if parent is not watching and
 * inode/sb/mount are not interested in events with parent and name info.
 *
 * @dentry:    dentry the event happened on
 * @mask:      event type and flags
 * @data:      object the event refers to, interpreted according to @data_type
 * @data_type: type of @data
 *
 * Returns 0 when nobody is watching, otherwise the result of fsnotify().
 */
int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                      int data_type)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        /* The mount mask is only available when the event data has a path */
        __u32 mnt_mask = path ?
                READ_ONCE(real_mount(path->mnt)->mnt_fsnotify_mask) : 0;
        struct inode *inode = d_inode(dentry);
        struct dentry *parent;
        bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
        bool parent_needed, parent_interested;
        __u32 p_mask;
        struct inode *p_inode = NULL;
        struct name_snapshot name;
        struct qstr *file_name = NULL;
        int ret = 0;

        /* Optimize the likely case of nobody watching this path */
        if (likely(!parent_watched &&
                   !fsnotify_object_watched(inode, mnt_mask, mask)))
                return 0;

        parent = NULL;
        parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask);
        /* No parent involved at all: report on the child only */
        if (!parent_watched && !parent_needed)
                goto notify;

        /* Does parent inode care about events on children? */
        parent = dget_parent(dentry);
        p_inode = parent->d_inode;
        p_mask = fsnotify_inode_watches_children(p_inode);
        /* The child flag was stale: the parent no longer watches children */
        if (unlikely(parent_watched && !p_mask))
                fsnotify_clear_child_dentry_flag(p_inode, dentry);

        /*
         * Include parent/name in notification either if some notification
         * groups require parent info or the parent is interested in this event.
         */
        parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS;
        if (parent_needed || parent_interested) {
                /* When notifying parent, child should be passed as data */
                WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));

                /* Notify both parent and child with child name info */
                take_dentry_name_snapshot(&name, dentry);
                file_name = &name.name;
                if (parent_interested)
                        mask |= FS_EVENT_ON_CHILD;
        }

notify:
        ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0);

        /* file_name is only set when a name snapshot was taken above */
        if (file_name)
                release_dentry_name_snapshot(&name);
        /*
         * NOTE(review): parent is still NULL when we jumped straight to
         * 'notify', so this relies on dput() accepting NULL - confirm.
         */
        dput(parent);

        return ret;
}
EXPORT_SYMBOL_GPL(__fsnotify_parent);

/*
 * Deliver an event to a single inode mark through the group's
 * ->handle_inode_event() callback.
 *
 * Returns 0 without calling the backend when the callback is missing, when
 * neither an inode nor a dir is available, when the mark excludes unlinked
 * objects and the event path is unlinked, or when the mark is not interested
 * in @mask. Otherwise returns what the callback returned.
 */
static int fsnotify_handle_inode_event(struct fsnotify_group *group,
                                       struct fsnotify_mark *inode_mark,
                                       u32 mask, const void *data, int data_type,
                                       struct inode *dir, const struct qstr *name,
                                       u32 cookie)
{
        const struct fsnotify_ops *ops = group->ops;
        const struct path *path = fsnotify_data_path(data, data_type);
        struct inode *inode = fsnotify_data_inode(data, data_type);

        if (WARN_ON_ONCE(!ops->handle_inode_event))
                return 0;

        if (WARN_ON_ONCE(!inode && !dir))
                return 0;

        /* Marks with EXCL_UNLINK do not want events on unlinked objects */
        if (inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) {
                if (path && d_unlinked(path->dentry))
                        return 0;
        }

        /* Check interest of this mark in case event was sent with two marks */
        if ((mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS) == 0)
                return 0;

        return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie);
}

/*
 * Event delivery for groups that do not provide ->handle_event(): the event
 * is passed to the parent mark (if any) and then to the inode mark (if any)
 * through fsnotify_handle_inode_event().
 *
 * Returns 0 when nothing is delivered, the parent's non-zero result if the
 * parent delivery fails, otherwise the result of the inode mark delivery.
 */
static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
                                 const void *data, int data_type,
                                 struct inode *dir, const struct qstr *name,
                                 u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info);
        struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info);
        bool dirent_event;
        int err;

        /* sb and mount marks are not expected on this path */
        if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)))
                return 0;
        if (WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
                return 0;

        /*
         * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
         * The only ->handle_inode_event() backend that supports FS_RENAME is
         * dnotify, where it means file was renamed within same parent.
         */
        if (mask & FS_RENAME) {
                struct dentry *moved = fsnotify_data_dentry(data, data_type);

                if (moved->d_parent->d_inode != dir)
                        return 0;
        }

        /* The parent mark is served first; its cookie is always 0 */
        if (parent_mark) {
                err = fsnotify_handle_inode_event(group, parent_mark, mask,
                                                  data, data_type, dir, name, 0);
                if (err)
                        return err;
        }

        if (!inode_mark)
                return 0;

        /*
         * Some events can be sent on both parent dir and child marks (e.g.
         * FS_ATTRIB).  If both parent dir and child are watching, report the
         * event once to parent dir with name (if interested) and once to child
         * without name (if interested).
         *
         * Whether or not the parent is watching, the child watcher expects an
         * event without the FS_EVENT_ON_CHILD flag.  The file name is expected
         * if and only if this is a directory event.
         */
        mask &= ~FS_EVENT_ON_CHILD;
        dirent_event = mask & ALL_FSNOTIFY_DIRENT_EVENTS;

        return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type,
                                           dirent_event ? dir : NULL,
                                           dirent_event ? name : NULL, cookie);
}

/*
 * Deliver one event to the group whose marks are currently selected for
 * reporting in @iter_info.
 *
 * Returns 0 when no mark type is selected for reporting or when the combined
 * mark masks, after applying the ignore masks, do not match the event.
 * Otherwise returns the result of the group's ->handle_event() callback, or
 * of fsnotify_handle_event() for groups without that callback.
 */
static int send_to_group(__u32 mask, const void *data, int data_type,
                         struct inode *dir, const struct qstr *file_name,
                         u32 cookie, struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *group = NULL;
        /* Only real event bits take part in the interest test below */
        __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
        __u32 marks_mask = 0;
        __u32 marks_ignore_mask = 0;
        bool is_dir = mask & FS_ISDIR;
        struct fsnotify_mark *mark;
        int type;

        if (!iter_info->report_mask)
                return 0;

        /* clear ignored on inode modification */
        if (mask & FS_MODIFY) {
                fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                        if (!(mark->flags &
                              FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                                mark->ignore_mask = 0;
                }
        }

        /* Are any of the group marks interested in this event? */
        fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
                group = mark->group;
                marks_mask |= mark->mask;
                marks_ignore_mask |=
                        fsnotify_effective_ignore_mask(mark, is_dir, type);
        }

        pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
                 __func__, group, mask, marks_mask, marks_ignore_mask,
                 data, data_type, dir, cookie);

        if (!(test_mask & marks_mask & ~marks_ignore_mask))
                return 0;

        /*
         * NOTE(review): group is dereferenced unconditionally here; this
         * presumably relies on a non-zero report_mask implying that the loop
         * above saw at least one mark - confirm.
         */
        if (group->ops->handle_event) {
                return group->ops->handle_event(group, mask, data, data_type, dir,
                                                file_name, cookie, iter_info);
        }

        return fsnotify_handle_event(group, mask, data, data_type, dir,
                                     file_name, cookie, iter_info);
}

/*
 * Return the first mark on the connector that *@connp points to, or NULL
 * when there is no connector or its mark list is empty. Both pointers are
 * read with srcu_dereference() under fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp)
{
        struct fsnotify_mark_connector *conn;
        struct hlist_node *first;

        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (!conn)
                return NULL;

        first = srcu_dereference(conn->list.first, &fsnotify_mark_srcu);

        return hlist_entry_safe(first, struct fsnotify_mark, obj_list);
}

/*
 * Return the mark that follows @mark on its object list, or NULL when @mark
 * is NULL or is the last entry. The next pointer is read with
 * srcu_dereference() under fsnotify_mark_srcu.
 */
static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
{
        struct hlist_node *next;

        if (!mark)
                return NULL;

        next = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu);

        return hlist_entry_safe(next, struct fsnotify_mark, obj_list);
}

/*
 * iter_info is a multi head priority queue of marks.
 *
 * Look at the mark at the head of every queue, pick the group that
 * fsnotify_compare_groups() ranks first among them, store it as the current
 * group and set report_mask to the mark types whose head belongs to that
 * group (a parent mark is only included if it watches children).
 *
 * Returns false if there are no more groups to iterate.
 */
static bool fsnotify_iter_select_report_types(
                struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_group *max_prio_group = NULL;
        struct fsnotify_mark *mark;
        int type;

        /* Choose max prio group among groups of all queue heads */
        fsnotify_foreach_iter_type(type) {
                mark = iter_info->marks[type];
                if (!mark)
                        continue;

                if (fsnotify_compare_groups(max_prio_group, mark->group) > 0)
                        max_prio_group = mark->group;
        }

        if (!max_prio_group)
                return false;

        /* Set the report mask for marks from same group as max prio group */
        iter_info->current_group = max_prio_group;
        iter_info->report_mask = 0;
        fsnotify_foreach_iter_type(type) {
                mark = iter_info->marks[type];
                if (!mark || mark->group != iter_info->current_group)
                        continue;

                /*
                 * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
                 * is watching children and interested in this event,
                 * which is an event possible on child.
                 * But is *this mark* watching children?
                 */
                if (type == FSNOTIFY_ITER_TYPE_PARENT &&
                    !(mark->mask & FS_EVENT_ON_CHILD) &&
                    !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
                        continue;

                fsnotify_iter_set_report_type(iter_info, type);
        }

        return true;
}

/*
 * Advance the multi head queue in iter_info past the group that was just
 * handled: every head whose mark belongs to current_group is replaced by
 * the mark that follows it.
 */
static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
{
        struct fsnotify_mark *head;
        int type;

        /*
         * Walk all types rather than using
         * fsnotify_foreach_iter_mark_type(): a mark of some type may belong
         * to current_group without having been selected for reporting, and
         * it still has to be advanced.
         */
        fsnotify_foreach_iter_type(type) {
                head = iter_info->marks[type];
                if (!head || head->group != iter_info->current_group)
                        continue;

                iter_info->marks[type] = fsnotify_next_mark(head);
        }
}

/*
 * fsnotify - This is the main call to fsnotify.
 *
 * The VFS calls into hook specific functions in linux/fsnotify.h.
 * Those functions then in turn call here.  Here will call out to all of the
 * registered fsnotify_group.  Those groups can then use the notification event
 * in whatever means they feel necessary.
 *
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @inode:        optional inode associated with event -
 *                If @dir and @inode are both non-NULL, event may be
 *                reported to both.
 * @cookie:        inotify rename cookie
 *
 * Returns 0, except when @mask contains a bit of ALL_FSNOTIFY_PERM_EVENTS:
 * in that case the first non-zero value returned by send_to_group() ends the
 * walk and is passed back to the caller.
 */
int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
             const struct qstr *file_name, struct inode *inode, u32 cookie)
{
        const struct path *path = fsnotify_data_path(data, data_type);
        struct super_block *sb = fsnotify_data_sb(data, data_type);
        const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);
        struct fsnotify_sb_info *sbinfo = sb ? fsnotify_sb_info(sb) : NULL;
        struct fsnotify_iter_info iter_info = {};
        struct mount *mnt = NULL;
        struct inode *inode2 = NULL;
        struct dentry *moved;
        /* Only assigned together with inode2; only read under "if (inode2)" */
        int inode2_type;
        int ret = 0;
        __u32 test_mask, marks_mask = 0;

        /* A mount is only considered when the event data carries a path */
        if (path)
                mnt = real_mount(path->mnt);

        if (!inode) {
                /* Dirent event - report on TYPE_INODE to dir */
                inode = dir;
                /* For FS_RENAME, inode is old_dir and inode2 is new_dir */
                if (mask & FS_RENAME) {
                        moved = fsnotify_data_dentry(data, data_type);
                        inode2 = moved->d_parent->d_inode;
                        inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
                }
        } else if (mask & FS_EVENT_ON_CHILD) {
                /*
                 * Event on child - report on TYPE_PARENT to dir if it is
                 * watching children and on TYPE_INODE to child.
                 */
                inode2 = dir;
                inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
        }

        /*
         * Optimization: srcu_read_lock() has a memory barrier which can
         * be expensive.  It protects walking the *_fsnotify_marks lists.
         * However, if we do not walk the lists, we do not have to do
         * SRCU because we have no references to any objects and do not
         * need SRCU to keep them "alive".
         */
        if ((!sbinfo || !sbinfo->sb_marks) &&
            (!mnt || !mnt->mnt_fsnotify_marks) &&
            (!inode || !inode->i_fsnotify_marks) &&
            (!inode2 || !inode2->i_fsnotify_marks) &&
            (!mnt_data || !mnt_data->ns->n_fsnotify_marks))
                return 0;

        /* Collect the union of the cached masks of every object involved */
        if (sb)
                marks_mask |= READ_ONCE(sb->s_fsnotify_mask);
        if (mnt)
                marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask);
        if (inode)
                marks_mask |= READ_ONCE(inode->i_fsnotify_mask);
        if (inode2)
                marks_mask |= READ_ONCE(inode2->i_fsnotify_mask);
        if (mnt_data)
                marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask);

        /*
         * If this is a modify event we may need to clear some ignore masks.
         * In that case, the object with ignore masks will have the FS_MODIFY
         * event in its mask.
         * Otherwise, return if none of the marks care about this type of event.
         */
        test_mask = (mask & ALL_FSNOTIFY_EVENTS);
        if (!(test_mask & marks_mask))
                return 0;

        /* From here on every exit must go through srcu_read_unlock() at "out" */
        iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

        /* Seed one queue head per object type with the first mark of its list */
        if (sbinfo) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
                        fsnotify_first_mark(&sbinfo->sb_marks);
        }
        if (mnt) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
                        fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
        }
        if (inode) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
                        fsnotify_first_mark(&inode->i_fsnotify_marks);
        }
        if (inode2) {
                iter_info.marks[inode2_type] =
                        fsnotify_first_mark(&inode2->i_fsnotify_marks);
        }
        if (mnt_data) {
                iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] =
                        fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks);
        }

        /*
         * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
         * ignore masks are properly reflected for mount/sb mark notifications.
         * That's why this traversal is so complicated...
         */
        while (fsnotify_iter_select_report_types(&iter_info)) {
                ret = send_to_group(mask, data, data_type, dir, file_name,
                                    cookie, &iter_info);

                /* Only permission events let a group's result end the walk */
                if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
                        goto out;

                fsnotify_iter_next(&iter_info);
        }
        /* Walk completed: a leftover non-zero result is not reported */
        ret = 0;
out:
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

        return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/*
 * Decide at open time which fsnotify events @file may generate.
 *
 * fsnotify_sb_has_priority_watchers() is checked here and the
 * FMODE_NONOTIFY_ mode bits are set accordingly.
 * Later, fsnotify permission hooks do not check if there are permission event
 * watches, but that there were permission event watches at open time.
 */
void file_set_fsnotify_mode_from_watchers(struct file *file)
{
        struct dentry *dentry = file->f_path.dentry;
        struct super_block *sb = dentry->d_sb;
        struct dentry *parent;
        __u32 mnt_mask, parent_mask;

        /* Is it a file opened by fanotify?  Then leave its mode alone. */
        if (FMODE_FSNOTIFY_NONE(file->f_mode))
                return;

        /*
         * Permission events are a super set of pre-content events.  With no
         * permission event watchers there are no pre-content event watchers
         * either, and the single FMODE_NONOTIFY_PERM bit implies both.
         */
        if (likely(!fsnotify_sb_has_priority_watchers(sb,
                                                FSNOTIFY_PRIO_CONTENT))) {
                file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
                return;
        }

        /*
         * Permission event watchers exist.  Pre-content events are ruled
         * out for anything that is neither a directory nor a regular file,
         * and when the sb has no pre-content event watchers.
         */
        if (!d_is_dir(dentry) && !d_is_reg(dentry))
                goto no_pre_content;
        if (likely(!fsnotify_sb_has_priority_watchers(sb,
                                                FSNOTIFY_PRIO_PRE_CONTENT)))
                goto no_pre_content;

        /*
         * There are some pre-content watchers.  Is anybody watching for
         * pre-content events on *this* file?
         */
        mnt_mask = READ_ONCE(real_mount(file->f_path.mnt)->mnt_fsnotify_mask);
        if (unlikely(fsnotify_object_watched(d_inode(dentry), mnt_mask,
                                     FSNOTIFY_PRE_CONTENT_EVENTS)))
                goto pre_content;

        /* Is the parent watching for pre-content events on this file? */
        if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
                goto no_pre_content;

        parent = dget_parent(dentry);
        parent_mask = fsnotify_inode_watches_children(d_inode(parent));
        dput(parent);
        if (parent_mask & FSNOTIFY_PRE_CONTENT_EVENTS)
                goto pre_content;

no_pre_content:
        /*
         * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM indicates permission event
         * watchers but nobody watching for pre-content events from this file.
         */
        file_set_fsnotify_mode(file, FMODE_NONOTIFY | FMODE_NONOTIFY_PERM);
        return;

pre_content:
        /* Enable pre-content events */
        file_set_fsnotify_mode(file, 0);
}
#endif

/*
 * Hand a mount event to fsnotify() as FSNOTIFY_EVENT_MNT data made of @ns
 * and the unique id of @mnt.  A NULL @ns triggers a one-time warning and
 * nothing is reported.
 */
void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
{
        struct fsnotify_mnt event = {
                .mnt_id = real_mount(mnt)->mnt_id_unique,
                .ns = ns,
        };

        if (WARN_ON_ONCE(!ns))
                return;

        /*
         * Checking for marks on the namespace is an optimization and also
         * makes sure fsnotify_init() has been called.
         */
        if (ns->n_fsnotify_marks)
                fsnotify(mask, &event, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0);
}

/*
 * One-time setup of the fsnotify core: initialise fsnotify_mark_srcu and
 * create the slab cache for mark connectors.  Always returns 0; a failure
 * to initialise the SRCU structure panics instead of being reported.
 */
static __init int fsnotify_init(void)
{
        /* The build breaks unless ALL_FSNOTIFY_BITS has exactly 26 bits set */
        BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26);

        if (init_srcu_struct(&fsnotify_mark_srcu))
                panic("initializing fsnotify_mark_srcu");

        fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
                                                    SLAB_PANIC);

        return 0;
}
core_initcall(fsnotify_init);























































































































































































































































































































































































































































































































































































































































































































































































































































































  507 



































    7 








    6 















    7 




    2 


    7 



















































































































































































































































































































































































































































































































































































  506 




  506 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   65 
   65 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Contains CPU feature definitions
 *
 * Copyright (C) 2015 ARM Ltd.
 *
 * A note for the weary kernel hacker: the code here is confusing and hard to
 * follow! That's partly because it's solving a nasty problem, but also because
 * there's a little bit of over-abstraction that tends to obscure what's going
 * on behind a maze of helper functions and macros.
 *
 * The basic problem is that hardware folks have started gluing together CPUs
 * with distinct architectural features; in some cases even creating SoCs where
 * user-visible instructions are available only on a subset of the available
 * cores. We try to address this by snapshotting the feature registers of the
 * boot CPU and comparing these with the feature registers of each secondary
 * CPU when bringing them up. If there is a mismatch, then we update the
 * snapshot state to indicate the lowest-common denominator of the feature,
 * known as the "safe" value. This snapshot state can be queried to view the
 * "sanitised" value of a feature register.
 *
 * The sanitised register values are used to decide which capabilities we
 * have in the system. These may be in the form of traditional "hwcaps"
 * advertised to userspace or internal "cpucaps" which are used to configure
 * things like alternative patching and static keys. While a feature mismatch
 * may result in a TAINT_CPU_OUT_OF_SPEC kernel taint, a capability mismatch
 * may prevent a CPU from being onlined at all.
 *
 * Some implementation details worth remembering:
 *
 * - Mismatched features are *always* sanitised to a "safe" value, which
 *   usually indicates that the feature is not supported.
 *
 * - A mismatched feature marked with FTR_STRICT will cause a "SANITY CHECK"
 *   warning when onlining an offending CPU and the kernel will be tainted
 *   with TAINT_CPU_OUT_OF_SPEC.
 *
 * - Features marked as FTR_VISIBLE have their sanitised value visible to
 *   userspace. FTR_VISIBLE features in registers that are only visible
 *   to EL0 by trapping *must* have a corresponding HWCAP so that late
 *   onlining of CPUs cannot lead to features disappearing at runtime.
 *
 * - A "feature" is typically a 4-bit register field. A "capability" is the
 *   high-level description derived from the sanitised field value.
 *
 * - Read the Arm ARM (DDI 0487F.a) section D13.1.3 ("Principles of the ID
 *   scheme for fields in ID registers") to understand when feature fields
 *   may be signed or unsigned (FTR_SIGNED and FTR_UNSIGNED accordingly).
 *
 * - KVM exposes its own view of the feature registers to guest operating
 *   systems regardless of FTR_VISIBLE. This is typically driven from the
 *   sanitised register values to allow virtual CPUs to be migrated between
 *   arbitrary physical CPUs, but some features not present on the host are
 *   also advertised and emulated. Look at sys_reg_descs[] for the gory
 *   details.
 *
 * - If the arm64_ftr_bits[] for a register has a missing field, then this
 *   field is treated as STRICT RES0, including for read_sanitised_ftr_reg().
 *   This is stronger than FTR_HIDDEN and can be used to hide features from
 *   KVM guests.
 */

#define pr_fmt(fmt) "CPU features: " fmt

#include <linux/bsearch.h>
#include <linux/cpumask.h>
#include <linux/crash_dump.h>
#include <linux/kstrtox.h>
#include <linux/sort.h>
#include <linux/stop_machine.h>
#include <linux/sysfs.h>
#include <linux/types.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/kasan.h>
#include <linux/percpu.h>
#include <linux/sched/isolation.h>

#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/cpu_ops.h>
#include <asm/fpsimd.h>
#include <asm/hwcap.h>
#include <asm/insn.h>
#include <asm/kvm_host.h>
#include <asm/mmu_context.h>
#include <asm/mte.h>
#include <asm/hypervisor.h>
#include <asm/processor.h>
#include <asm/smp.h>
#include <asm/sysreg.h>
#include <asm/traps.h>
#include <asm/vectors.h>
#include <asm/virt.h>

/*
 * Kernel representation of AT_HWCAP and AT_HWCAP2.
 *
 * A bitmap of MAX_CPU_FEATURES bits, private to this file (static).
 */
static DECLARE_BITMAP(elf_hwcap, MAX_CPU_FEATURES) __read_mostly;

/*
 * Compat hwcap words, only built when CONFIG_COMPAT is enabled.
 *
 * compat_elf_hwcap starts out as COMPAT_ELF_HWCAP_DEFAULT, the OR of the
 * COMPAT_HWCAP_* bits listed below. compat_elf_hwcap2 and compat_elf_hwcap3
 * have no initialiser and therefore start out as zero.
 */
#ifdef CONFIG_COMPAT
#define COMPAT_ELF_HWCAP_DEFAULT        \
                                (COMPAT_HWCAP_HALF|COMPAT_HWCAP_THUMB|\
                                 COMPAT_HWCAP_FAST_MULT|COMPAT_HWCAP_EDSP|\
                                 COMPAT_HWCAP_TLS|COMPAT_HWCAP_IDIV|\
                                 COMPAT_HWCAP_LPAE)
unsigned int compat_elf_hwcap __read_mostly = COMPAT_ELF_HWCAP_DEFAULT;
unsigned int compat_elf_hwcap2 __read_mostly;
unsigned int compat_elf_hwcap3 __read_mostly;
#endif

/*
 * System-wide cpucap bitmap, ARM64_NCAPS bits wide and exported to modules.
 * This is the bitmap printed by dump_cpu_features().
 */
DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS);
EXPORT_SYMBOL(system_cpucaps);
/*
 * One capability-descriptor pointer per cpucap slot.
 * NOTE(review): presumably indexed by capability number; the code that
 * fills and reads this array is outside this chunk - confirm.
 */
static struct arm64_cpu_capabilities const __ro_after_init *cpucap_ptrs[ARM64_NCAPS];

/*
 * Second cpucap bitmap of the same width as system_cpucaps.
 * NOTE(review): presumably the capabilities established from the boot CPU;
 * its users are outside this chunk - confirm.
 */
DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS);

/*
 * Exported flag, false by default.
 * NOTE(review): the name suggests it selects non-global (nG) mappings;
 * where it is set is not visible here - confirm.
 */
bool arm64_use_ng_mappings = false;
EXPORT_SYMBOL(arm64_use_ng_mappings);

/* Per-CPU vector pointer; every CPU's copy is initialised to 'vectors'. */
DEFINE_PER_CPU_READ_MOSTLY(const char *, this_cpu_vector) = vectors;

/*
 * Permit PER_LINUX32 and execve() of 32-bit binaries even if not all CPUs
 * support it?
 *
 * Has static storage and no initialiser, so it starts out false.
 */
static bool __read_mostly allow_mismatched_32bit_el0;

/*
 * Static branch enabled only if allow_mismatched_32bit_el0 is set and we have
 * seen at least one CPU capable of 32-bit EL0. Defined in the "false" state.
 */
DEFINE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);

/*
 * Mask of CPUs supporting 32-bit EL0.
 * Only valid if arm64_mismatched_32bit_el0 is enabled.
 */
static cpumask_var_t cpu_32bit_el0_mask __cpumask_var_read_mostly;

/*
 * Print the system_cpucaps bitmap at emergency log level.
 *
 * The "%*pb" conversion formats a bitmap; ARM64_NCAPS is passed as the
 * field width, i.e. the number of bits to print.
 */
void dump_cpu_features(void)
{
        /* file-wide pr_fmt adds "CPU features: " prefix */
        pr_emerg("0x%*pb\n", ARM64_NCAPS, &system_cpucaps);
}

/*
 * Largest non-negative value of field 'field' in register 'reg':
 *  - signed field   (<reg>_<field>_SIGNED non-zero): BIT(WIDTH - 1) - 1
 *  - unsigned field:                                  BIT(WIDTH) - 1
 * e.g. for a 4-bit field this gives 7 when signed and 15 when unsigned.
 */
#define __ARM64_MAX_POSITIVE(reg, field)                                \
                ((reg##_##field##_SIGNED ?                                \
                  BIT(reg##_##field##_WIDTH - 1) :                        \
                  BIT(reg##_##field##_WIDTH)) - 1)

/*
 * Field encoding with only the top bit of the field set, i.e. the most
 * negative value when the field is read as a WIDTH-bit signed quantity.
 */
#define __ARM64_MIN_NEGATIVE(reg, field)  BIT(reg##_##field##_WIDTH - 1)

/*
 * Expand to the designated initialisers that describe one ID register field
 * match: the register (SYS_<reg>), the field's position, width and signedness
 * (taken from the <reg>_<field>_SHIFT/_WIDTH/_SIGNED constants) and the
 * inclusive [min_value, max_value] range supplied by the caller.
 *
 * The expansion ends with a trailing comma, so it is meant to be dropped
 * into the middle of a structure initialiser.
 */
#define __ARM64_CPUID_FIELDS(reg, field, min_value, max_value)                \
                .sys_reg = SYS_##reg,                                        \
                .field_pos = reg##_##field##_SHIFT,                        \
                .field_width = reg##_##field##_WIDTH,                        \
                .sign = reg##_##field##_SIGNED,                                \
                .min_field_value = min_value,                                \
                .max_field_value = max_value,

/*
 * ARM64_CPUID_FIELDS() encodes a field with a range from min_value to
 * an implicit maximum that depends on the signedness of the field.
 *
 * An unsigned field will be capped at all ones, while a signed field
 * will be limited to the positive half only.
 *
 * min_value is passed through SYS_FIELD_VALUE(reg, field, ...) and the
 * maximum comes from __ARM64_MAX_POSITIVE(reg, field).
 */
#define ARM64_CPUID_FIELDS(reg, field, min_value)                        \
        __ARM64_CPUID_FIELDS(reg, field,                                \
                             SYS_FIELD_VALUE(reg, field, min_value),        \
                             __ARM64_MAX_POSITIVE(reg, field))

/*
 * ARM64_CPUID_FIELDS_NEG() encodes a field with a range from an
 * implicit minimal value to max_value. This should be used when
 * matching a non-implemented property.
 *
 * The implicit minimum is __ARM64_MIN_NEGATIVE(reg, field); max_value is
 * passed through SYS_FIELD_VALUE(reg, field, ...).
 */
#define ARM64_CPUID_FIELDS_NEG(reg, field, max_value)                        \
        __ARM64_CPUID_FIELDS(reg, field,                                \
                             __ARM64_MIN_NEGATIVE(reg, field),                \
                             SYS_FIELD_VALUE(reg, field, max_value))

/*
 * Build one brace-enclosed feature-field initialiser from its seven
 * properties, in this order:
 *   SIGNED   - stored in .sign     (FTR_SIGNED / FTR_UNSIGNED)
 *   VISIBLE  - stored in .visible  (e.g. FTR_VISIBLE / FTR_HIDDEN)
 *   STRICT   - stored in .strict   (FTR_STRICT / FTR_NONSTRICT)
 *   TYPE     - stored in .type     (e.g. FTR_LOWER_SAFE, FTR_EXACT)
 *   SHIFT    - stored in .shift    (bit position of the field)
 *   WIDTH    - stored in .width    (field width in bits)
 *   SAFE_VAL - stored in .safe_val
 */
#define __ARM64_FTR_BITS(SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
        {                                                \
                .sign = SIGNED,                                \
                .visible = VISIBLE,                        \
                .strict = STRICT,                        \
                .type = TYPE,                                \
                .shift = SHIFT,                                \
                .width = WIDTH,                                \
                .safe_val = SAFE_VAL,                        \
        }

/*
 * Define a feature with unsigned values.
 * Arguments: (VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL); the sign is
 * fixed to FTR_UNSIGNED.
 */
#define ARM64_FTR_BITS(VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
        __ARM64_FTR_BITS(FTR_UNSIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL)

/*
 * Define a feature with a signed value.
 * Same arguments as ARM64_FTR_BITS(); the sign is fixed to FTR_SIGNED.
 */
#define S_ARM64_FTR_BITS(VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL) \
        __ARM64_FTR_BITS(FTR_SIGNED, VISIBLE, STRICT, TYPE, SHIFT, WIDTH, SAFE_VAL)

/*
 * Final entry of every arm64_ftr_bits[] table in this file: an element whose
 * width is zero.
 * NOTE(review): presumably the table walkers stop at the first zero-width
 * entry; those walkers are outside this chunk - confirm.
 */
#define ARM64_FTR_END                                        \
        {                                                \
                .width = 0,                                \
        }

/*
 * Forward declarations of file-local (static) helpers so they can be
 * referenced before their definitions later in this translation unit.
 */
static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap);

static bool __system_matches_cap(unsigned int n);

/*
 * NOTE: Any changes to the visibility of features should be kept in
 * sync with the documentation of the CPU feature register ABI.
 */
/*
 * Field table for ID_AA64ISAR0_EL1. Each entry reads as
 * (visibility, strictness, safe-value policy, shift, width, safe value).
 *
 * Every field here is unsigned, 4 bits wide, FTR_STRICT and FTR_LOWER_SAFE
 * with a safe value of 0; TLB is the only FTR_HIDDEN field.
 *
 * NOTE(review): entries appear to be kept in descending shift order; keep
 * that ordering when adding fields - confirm against the table consumers.
 */
static const struct arm64_ftr_bits ftr_id_aa64isar0[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RNDR_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TLB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_TS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_FHM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_DP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM4_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SM3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_RDM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_ATOMIC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_CRC32_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_SHA1_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_EL1_AES_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64ISAR1_EL1.
 *
 * All fields are unsigned, 4 bits wide and FTR_STRICT with a safe value of 0.
 *  - XS and SPECRES are FTR_HIDDEN.
 *  - GPI, GPA, API and APA are visible only if CONFIG_ARM64_PTR_AUTH is
 *    enabled.
 *  - API and APA use FTR_EXACT; every other field is FTR_LOWER_SAFE.
 */
static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_XS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_I8MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DGH_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_BF16_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SPECRES_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_SB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FRINTTS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPI_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_GPA_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_LRCPC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_FCMA_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_JSCVT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_API_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_EL1_APA_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR1_EL1_DPB_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64ISAR2_EL1.
 *
 * All fields are unsigned and 4 bits wide with a safe value of 0.
 *  - LUT, CSSC, RPRFM, RPRES and WFxT are FTR_NONSTRICT; the rest are
 *    FTR_STRICT.
 *  - CLRBHB is FTR_HIDDEN.
 *  - APA3 and GPA3 are visible only if CONFIG_ARM64_PTR_AUTH is enabled.
 *  - APA3 uses FTR_EXACT; every other field is FTR_LOWER_SAFE.
 */
static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_LUT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CSSC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRFM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_CLRBHB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_BC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_MOPS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_EXACT, ID_AA64ISAR2_EL1_APA3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_PTR_AUTH),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_GPA3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_RPRES_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_EL1_WFxT_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64ISAR3_EL1: FPRCVT and FAMINMAX, both unsigned
 * 4-bit fields that are FTR_VISIBLE, FTR_NONSTRICT and FTR_LOWER_SAFE with
 * a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_aa64isar3[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FPRCVT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR3_EL1_FAMINMAX_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64PFR0_EL1. All fields are 4 bits wide and
 * FTR_LOWER_SAFE.
 *
 *  - AdvSIMD and FP are the only signed fields (S_ARM64_FTR_BITS); their
 *    safe values are the respective *_NI constants.
 *  - EL1 and EL0 use the *_IMP constants as safe value; all other unsigned
 *    fields use 0.
 *  - DIT, AdvSIMD and FP are FTR_VISIBLE; SVE is visible only if
 *    CONFIG_ARM64_SVE is enabled; everything else is FTR_HIDDEN.
 *  - CSV3, CSV2, AMU, SEL2 and EL3..EL0 are FTR_NONSTRICT.
 */
static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_CSV2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_DIT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AMU_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_MPAM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SEL2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                                   FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SVE_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_RAS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_GIC_SHIFT, 4, 0),
        S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_AdvSIMD_SHIFT, 4, ID_AA64PFR0_EL1_AdvSIMD_NI),
        S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_FP_SHIFT, 4, ID_AA64PFR0_EL1_FP_NI),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL1_SHIFT, 4, ID_AA64PFR0_EL1_EL1_IMP),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_EL0_SHIFT, 4, ID_AA64PFR0_EL1_EL0_IMP),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64PFR1_EL1. All fields are 4 bits wide and
 * FTR_LOWER_SAFE.
 *
 *  - MTE_frac is the only signed field.
 *  - GCS, SME, MTE and BT are visible only if the matching CONFIG_ARM64_*
 *    option (GCS, SME, MTE, BTI) is enabled; SSBS is always FTR_VISIBLE;
 *    MTE_frac, MPAM_frac and RAS_frac are FTR_HIDDEN.
 *  - SSBS is the only FTR_NONSTRICT field.
 *  - MTE and SSBS use their *_NI constants as safe value; the rest use 0.
 */
static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = {
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_GCS),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_GCS_SHIFT, 4, 0),
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SME_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MPAM_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_RAS_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_MTE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_MTE_SHIFT, 4, ID_AA64PFR1_EL1_MTE_NI),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_SSBS_SHIFT, 4, ID_AA64PFR1_EL1_SSBS_NI),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_BTI),
                                    FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR1_EL1_BT_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64PFR2_EL1: only FPMR is described, as an unsigned
 * 4-bit field that is FTR_VISIBLE, FTR_STRICT and FTR_LOWER_SAFE with a
 * safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_aa64pfr2[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR2_EL1_FPMR_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64ZFR0_EL1.
 *
 * Every field is unsigned, 4 bits wide, FTR_STRICT and FTR_LOWER_SAFE with a
 * safe value of 0, and is visible only if CONFIG_ARM64_SVE is enabled.
 */
static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F64MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F32MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_F16MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_I8MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SM4_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SHA3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_B16B16_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BF16_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_BitPerm_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_EltPerm_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_AES_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
                       FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_EL1_SVEver_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64SMFR0_EL1.
 *
 * Every field is unsigned, FTR_STRICT and FTR_EXACT with a safe value of 0,
 * and is visible only if CONFIG_ARM64_SME is enabled. Most fields are
 * single-bit flags; SMEver, I16I64, I16I32 and I8I32 are 4 bits wide.
 */
static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMEver_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I64_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F64F64_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I16I32_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16B16_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F16_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F16_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F8F32_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_I8I32_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F16F32_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_B16F32_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_BI32I32_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_F32F32_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8FMA_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP4_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SF8DP2_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SBitPerm_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_AES_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SFEXPA_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_STMOP_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
                       FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_SMOP4_SHIFT, 1, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64FPFR0_EL1.
 *
 * Every field is an unsigned single-bit flag that is FTR_VISIBLE, FTR_STRICT
 * and FTR_EXACT with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8CVT_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8FMA_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP4_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64MMFR0_EL1. All fields are 4 bits wide.
 *
 *  - ECV is the only FTR_VISIBLE field; everything else is FTR_HIDDEN.
 *  - The Stage-2 granule fields (TGRAN*_2) are FTR_NONSTRICT and FTR_EXACT
 *    with a safe value of 1.
 *  - TGRAN4 and TGRAN64 are signed fields; TGRAN16 is unsigned. All three
 *    use their *_NI constants as safe value.
 *  - The remaining fields are FTR_LOWER_SAFE with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ECV_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_FGT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_EXS_SHIFT, 4, 0),
        /*
         * Page size not being supported at Stage-2 is not fatal. You
         * just give up KVM if PAGE_SIZE isn't supported there. Go fix
         * your favourite nesting hypervisor.
         *
         * There is a small corner case where the hypervisor explicitly
         * advertises a given granule size at Stage-2 (value 2) on some
         * vCPUs, and uses the fallback to Stage-1 (value 0) for other
         * vCPUs. Although this is not forbidden by the architecture, it
         * indicates that the hypervisor is being silly (or buggy).
         *
         * We make no effort to cope with this and pretend that if these
         * fields are inconsistent across vCPUs, then it isn't worth
         * trying to bring KVM up.
         */
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN4_2_SHIFT, 4, 1),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN64_2_SHIFT, 4, 1),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64MMFR0_EL1_TGRAN16_2_SHIFT, 4, 1),
        /*
         * We already refuse to boot CPUs that don't support our configured
         * page size, so we can only detect mismatches for a page size other
         * than the one we're currently using. Unfortunately, SoCs like this
         * exist in the wild so, even though we don't like it, we'll have to go
         * along with it and treat them as non-strict.
         */
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN4_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN4_NI),
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN64_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN64_NI),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_TGRAN16_SHIFT, 4, ID_AA64MMFR0_EL1_TGRAN16_NI),

        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT, 4, 0),
        /* Linux shouldn't care about secure memory */
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_SNSMEM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_BIGEND_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_ASIDBITS_SHIFT, 4, 0),
        /*
         * Differing PARange is fine as long as all peripherals and memory are mapped
         * within the minimum PARange of all CPUs
         */
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EL1_PARANGE_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64MMFR1_EL1.
 *
 * All fields are unsigned and 4 bits wide with a safe value of 0.
 *  - AFP is the only FTR_VISIBLE field.
 *  - TIDCP1 is the only FTR_NONSTRICT field.
 *  - SpecSEI is FTR_HIGHER_SAFE; every other field is FTR_LOWER_SAFE.
 */
static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ECBHB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TIDCP1_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_AFP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HCX_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_ETS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_TWED_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_XNX_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_EL1_SpecSEI_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_PAN_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_LO_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HPDS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VH_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_VMIDBits_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_EL1_HAFDBS_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field table for ID_AA64MMFR2_EL1.
 *
 * All fields are unsigned, 4 bits wide and FTR_LOWER_SAFE with a safe value
 * of 0.
 *  - AT is the only FTR_VISIBLE field.
 *  - E0PD and IESB are FTR_NONSTRICT; the rest are FTR_STRICT.
 */
static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_E0PD_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_EVT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_BBM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_TTL_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_FWB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IDS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_AT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_ST_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_NV_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CCIDX_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_VARange_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_IESB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_LSM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_UAO_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EL1_CnP_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_AA64MMFR3_EL1 (bound to SYS_ID_AA64MMFR3_EL1 in
 * arm64_ftr_regs[]). All three fields are FTR_NONSTRICT, FTR_LOWER_SAFE,
 * 4 bits wide with a safe value of 0. S1POE takes its visibility from
 * FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE); the others are hidden.
 */
static const struct arm64_ftr_bits ftr_id_aa64mmfr3[] = {
        ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_POE),
                       FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1POE_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_S1PIE_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR3_EL1_TCRX_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_AA64MMFR4_EL1 (bound to SYS_ID_AA64MMFR4_EL1 in
 * arm64_ftr_regs[]). Both fields are hidden, strict, FTR_LOWER_SAFE, 4 bits
 * wide with a safe value of 0. E2H0 is declared with S_ARM64_FTR_BITS
 * (presumably the signed-field variant of the macro; confirm against its
 * definition), NV_frac with the plain ARM64_FTR_BITS.
 */
static const struct arm64_ftr_bits ftr_id_aa64mmfr4[] = {
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_E2H0_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR4_EL1_NV_frac_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for CTR_EL0, used by arm64_ftr_reg_ctrel0. Every field
 * is FTR_VISIBLE. Bit 31 is described as a 1-bit FTR_EXACT field with a safe
 * value of 1; L1Ip is the only FTR_NONSTRICT field. Entries are listed in
 * descending shift order, which sort_ftr_regs() checks at boot.
 */
static const struct arm64_ftr_bits ftr_ctr[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 31, 1, 1), /* RES1 */
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DIC_SHIFT, 1, 1),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IDC_SHIFT, 1, 1),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_CWG_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_HIGHER_OR_ZERO_SAFE, CTR_EL0_ERG_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_DminLine_SHIFT, 4, 1),
        /*
         * Linux can handle differing I-cache policies. Userspace JITs will
         * make use of *minLine.
         * If we have differing I-cache policies, report it as the weakest - VIPT.
         */
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_EL0_L1Ip_SHIFT, 2, CTR_EL0_L1Ip_VIPT),        /* L1Ip */
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_EL0_IminLine_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Zero-initialised override shared by every register that has no dedicated
 * override object. With both val and mask at zero, the override checks in
 * init_cpu_ftr_reg() never match a field.
 */
static struct arm64_ftr_override __ro_after_init no_override = { };

/*
 * Feature register object for CTR_EL0. Unlike the other registers, which are
 * created as anonymous compound literals by ARM64_FTR_REG(), this one is a
 * named, non-static object that arm64_ftr_regs[] points at directly.
 */
struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = {
        .name                = "SYS_CTR_EL0",
        .ftr_bits        = ftr_ctr,
        .override        = &no_override,
};

/*
 * Field descriptors for ID_MMFR0_EL1 (bound to SYS_ID_MMFR0_EL1 in
 * arm64_ftr_regs[]). All fields are hidden, 4 bits wide and FTR_LOWER_SAFE.
 * InnerShr and OuterShr are declared with S_ARM64_FTR_BITS and a safe value
 * of 0xf; the remaining fields use a safe value of 0. AuxReg is the only
 * FTR_NONSTRICT field.
 */
static const struct arm64_ftr_bits ftr_id_mmfr0[] = {
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_InnerShr_SHIFT, 4, 0xf),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_FCSE_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_AuxReg_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_TCM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_ShareLvl_SHIFT, 4, 0),
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_OuterShr_SHIFT, 4, 0xf),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_PMSA_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_EL1_VMSA_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_AA64DFR0_EL1 (bound to SYS_ID_AA64DFR0_EL1 in
 * arm64_ftr_regs[]). All fields are hidden and 4 bits wide. PMSVer and
 * PMUVer are FTR_NONSTRICT. PMUVer and DebugVer are FTR_EXACT, so a mismatch
 * resolves to their safe values (0 and 0x6 respectively); every other field
 * is FTR_LOWER_SAFE with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_DoubleLock_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_PMSVer_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_CTX_CMPs_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_WRPs_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_EL1_BRPs_SHIFT, 4, 0),
        /*
         * We can instantiate multiple PMU instances with different levels
         * of support.
         */
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_EL1_PMUVer_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_EL1_DebugVer_SHIFT, 4, 0x6),
        ARM64_FTR_END,
};

/*
 * Field descriptors for MVFR0_EL1 (bound to SYS_MVFR0_EL1 in
 * arm64_ftr_regs[]). All fields are strict, FTR_LOWER_SAFE, 4 bits wide
 * with a safe value of 0. FPDP is the only FTR_VISIBLE field.
 */
static const struct arm64_ftr_bits ftr_mvfr0[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPRound_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPShVec_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSqrt_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDivide_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPTrap_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPDP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_FPSP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR0_EL1_SIMDReg_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for MVFR1_EL1 (bound to SYS_MVFR1_EL1 in
 * arm64_ftr_regs[]). All fields are strict, FTR_LOWER_SAFE, 4 bits wide
 * with a safe value of 0. The SIMD and FPHP fields are FTR_VISIBLE; FPDNaN
 * and FPFtZ are hidden.
 */
static const struct arm64_ftr_bits ftr_mvfr1[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDFMAC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPHP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDHP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDSP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDInt_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_SIMDLS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPDNaN_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR1_EL1_FPFtZ_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for MVFR2_EL1 (bound to SYS_MVFR2_EL1 in
 * arm64_ftr_regs[]): two hidden, strict, FTR_LOWER_SAFE 4-bit fields with a
 * safe value of 0.
 */
static const struct arm64_ftr_bits ftr_mvfr2[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_FPMisc_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_EL1_SIMDMisc_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for DCZID_EL0 (bound to SYS_DCZID_EL0 in
 * arm64_ftr_regs[]). Both fields are visible and strict: DZP is a 1-bit
 * FTR_EXACT field with a safe value of 1, BS is a 4-bit FTR_LOWER_SAFE field
 * with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_dczid[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_EL0_DZP_SHIFT, 1, 1),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_EL0_BS_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for GMID_EL1 (bound to SYS_GMID_EL1 in
 * arm64_ftr_regs[]): a single hidden, strict, FTR_LOWER_SAFE 4-bit BS field
 * with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_gmid[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, GMID_EL1_BS_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_ISAR0_EL1 (bound to SYS_ID_ISAR0_EL1 in
 * arm64_ftr_regs[]). Every field is hidden, strict, FTR_LOWER_SAFE, 4 bits
 * wide with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_isar0[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Divide_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Debug_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Coproc_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_CmpBranch_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitField_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_BitCount_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR0_EL1_Swap_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_ISAR5_EL1 (bound to SYS_ID_ISAR5_EL1 in
 * arm64_ftr_regs[]). All fields are strict, FTR_LOWER_SAFE, 4 bits wide
 * with a safe value of 0. CRC32, SHA2, SHA1 and AES are FTR_VISIBLE; RDM
 * and SEVL are hidden.
 */
static const struct arm64_ftr_bits ftr_id_isar5[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_RDM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_CRC32_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SHA1_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_AES_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR5_EL1_SEVL_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_MMFR4_EL1 (bound to SYS_ID_MMFR4_EL1 in
 * arm64_ftr_regs[]). All fields are hidden, strict and 4 bits wide with a
 * safe value of 0. SpecSEI is the only FTR_HIGHER_SAFE field (see the note
 * on that entry); the rest are FTR_LOWER_SAFE.
 */
static const struct arm64_ftr_bits ftr_id_mmfr4[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_EVT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CCIDX_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_LSM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_HPDS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_CnP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_XNX_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_EL1_AC2_SHIFT, 4, 0),

        /*
         * SpecSEI = 1 indicates that the PE might generate an SError on an
         * external abort on speculative read. It is safer to assume that an
         * SError might be generated than that it will not be. Hence it has
         * been classified as FTR_HIGHER_SAFE.
         */
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_MMFR4_EL1_SpecSEI_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_ISAR4_EL1 (bound to SYS_ID_ISAR4_EL1 in
 * arm64_ftr_regs[]). Every field is hidden, strict, FTR_LOWER_SAFE, 4 bits
 * wide with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_isar4[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SWP_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_PSR_M_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SynchPrim_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Barrier_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_SMC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Writeback_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_WithShifts_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR4_EL1_Unpriv_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_MMFR5_EL1 (bound to SYS_ID_MMFR5_EL1 in
 * arm64_ftr_regs[]): a single hidden, strict, FTR_LOWER_SAFE 4-bit ETS field
 * with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_mmfr5[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR5_EL1_ETS_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_ISAR6_EL1 (bound to SYS_ID_ISAR6_EL1 in
 * arm64_ftr_regs[]). All fields are strict, FTR_LOWER_SAFE, 4 bits wide
 * with a safe value of 0. I8MM, BF16, SB, FHM and DP are FTR_VISIBLE;
 * SPECRES and JSCVT are hidden.
 */
static const struct arm64_ftr_bits ftr_id_isar6[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_I8MM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_BF16_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SPECRES_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_SB_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_FHM_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_DP_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_ISAR6_EL1_JSCVT_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_PFR0_EL1 (bound to SYS_ID_PFR0_EL1 in
 * arm64_ftr_regs[]). All fields are hidden, FTR_LOWER_SAFE, 4 bits wide
 * with a safe value of 0. CSV2 is the only FTR_NONSTRICT field.
 */
static const struct arm64_ftr_bits ftr_id_pfr0[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_DIT_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_CSV2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State3_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State2_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State1_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_EL1_State0_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_PFR1_EL1 (bound to SYS_ID_PFR1_EL1 in
 * arm64_ftr_regs[]). Every field is hidden, strict, FTR_LOWER_SAFE, 4 bits
 * wide with a safe value of 0.
 */
static const struct arm64_ftr_bits ftr_id_pfr1[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GIC_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virt_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Sec_frac_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_GenTimer_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Virtualization_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_MProgMod_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_Security_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR1_EL1_ProgMod_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_PFR2_EL1 (bound to SYS_ID_PFR2_EL1 in
 * arm64_ftr_regs[]). Both fields are FTR_NONSTRICT, FTR_LOWER_SAFE, 4 bits
 * wide with a safe value of 0; SSBS is visible, CSV3 is hidden.
 */
static const struct arm64_ftr_bits ftr_id_pfr2[] = {
        ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_SSBS_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR2_EL1_CSV3_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_DFR0_EL1 (bound to SYS_ID_DFR0_EL1 in
 * arm64_ftr_regs[]). All fields are hidden and 4 bits wide with a safe value
 * of 0. PerfMon is declared with S_ARM64_FTR_BITS as FTR_NONSTRICT and
 * FTR_EXACT; the rest are strict and FTR_LOWER_SAFE. Bits [31:28]
 * (TraceFilt) have no descriptor here.
 */
static const struct arm64_ftr_bits ftr_id_dfr0[] = {
        /* [31:28] TraceFilt */
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_DFR0_EL1_PerfMon_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MProfDbg_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapTrc_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopTrc_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_MMapDbg_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopSDbg_SHIFT, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_EL1_CopDbg_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for ID_DFR1_EL1 (bound to SYS_ID_DFR1_EL1 in
 * arm64_ftr_regs[]): a single hidden, strict, FTR_LOWER_SAFE 4-bit MTPMU
 * field with a safe value of 0, declared with S_ARM64_FTR_BITS.
 */
static const struct arm64_ftr_bits ftr_id_dfr1[] = {
        S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR1_EL1_MTPMU_SHIFT, 4, 0),
        ARM64_FTR_END,
};

/*
 * Field descriptors for MPAMIDR_EL1 (bound to SYS_MPAMIDR_EL1 in
 * arm64_ftr_regs[]). All fields are hidden and FTR_LOWER_SAFE with a safe
 * value of 0. The *_MAX fields take their widths from the matching *_WIDTH
 * macros and are FTR_NONSTRICT; HAS_HCR is a 1-bit field and the only
 * FTR_STRICT one.
 */
static const struct arm64_ftr_bits ftr_mpamidr[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PMG_MAX_SHIFT, MPAMIDR_EL1_PMG_MAX_WIDTH, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_VPMR_MAX_SHIFT, MPAMIDR_EL1_VPMR_MAX_WIDTH, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_HAS_HCR_SHIFT, 1, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, MPAMIDR_EL1_PARTID_MAX_SHIFT, MPAMIDR_EL1_PARTID_MAX_WIDTH, 0),
        ARM64_FTR_END,
};

/*
 * Common ftr bits for a 32-bit register in which every field is hidden and
 * strict, 4 bits wide, FTR_LOWER_SAFE, with a default safe value of 0.
 * The eight entries cover bits [31:0] in descending shift order.
 * Used in arm64_ftr_regs[] for the following 32-bit registers:
 * id_isar[1-3], id_mmfr[1-3]
 */
static const struct arm64_ftr_bits ftr_generic_32bits[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),
        ARM64_FTR_END,
};

/*
 * Table for a single 32-bit feature value: one hidden, strict, FTR_EXACT
 * field spanning bits [31:0] with a safe value of 0. Used in
 * arm64_ftr_regs[] for SYS_CNTFRQ_EL0.
 */
static const struct arm64_ftr_bits ftr_single32[] = {
        ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, 0, 32, 0),
        ARM64_FTR_END,
};

/*
 * Empty field table: only the terminator. A register bound to this table
 * (SYS_ID_AA64DFR1_EL1 in arm64_ftr_regs[]) has no described fields, so
 * init_cpu_ftr_reg() leaves its system-wide value at zero.
 */
static const struct arm64_ftr_bits ftr_raz[] = {
        ARM64_FTR_END,
};

/*
 * Build one initializer for an arm64_ftr_regs[] entry: the sys_id plus a
 * pointer to an anonymous struct arm64_ftr_reg (a compound literal) whose
 * name, override and ftr_bits members are filled in from the arguments.
 *
 * id_str - name string stored in the register object
 * id     - system register encoding, stored as sys_id
 * table  - arm64_ftr_bits array describing the register's fields
 * ovr    - pointer to the struct arm64_ftr_override to attach
 */
#define __ARM64_FTR_REG_OVERRIDE(id_str, id, table, ovr) {        \
                .sys_id = id,                                        \
                .reg =         &(struct arm64_ftr_reg){                \
                        .name = id_str,                                \
                        .override = (ovr),                        \
                        .ftr_bits = &((table)[0]),                \
        }}

/* As above, using the stringified register identifier as the name. */
#define ARM64_FTR_REG_OVERRIDE(id, table, ovr)        \
        __ARM64_FTR_REG_OVERRIDE(#id, id, table, ovr)

/* Register without a dedicated override: attaches the shared no_override. */
#define ARM64_FTR_REG(id, table)                \
        __ARM64_FTR_REG_OVERRIDE(#id, id, table, &no_override)

/*
 * Per-register override objects. Each one is attached to the matching ID
 * register entry in arm64_ftr_regs[] through ARM64_FTR_REG_OVERRIDE(), and
 * its val/mask pair is consulted field by field in init_cpu_ftr_reg().
 * They have external linkage; who fills them in is not visible from this
 * part of the file.
 */
struct arm64_ftr_override id_aa64mmfr0_override;
struct arm64_ftr_override id_aa64mmfr1_override;
struct arm64_ftr_override id_aa64mmfr2_override;
struct arm64_ftr_override id_aa64pfr0_override;
struct arm64_ftr_override id_aa64pfr1_override;
struct arm64_ftr_override id_aa64zfr0_override;
struct arm64_ftr_override id_aa64smfr0_override;
struct arm64_ftr_override id_aa64isar1_override;
struct arm64_ftr_override id_aa64isar2_override;

/*
 * Override object that is not attached to any entry of arm64_ftr_regs[]
 * (presumably for software-defined features; confirm against its users).
 */
struct arm64_ftr_override arm64_sw_feature_override;

/*
 * Lookup table mapping a system register encoding (sys_id) to its feature
 * register object.
 *
 * The entries MUST be kept in strictly ascending sys_id order:
 * get_arm64_ftr_reg_nowarn() binary-searches this array, and
 * sort_ftr_regs() BUG()s at boot if the ordering is violated. The
 * "Op1/CRn/CRm" comments below group the entries by encoding to make the
 * ordering easier to maintain.
 */
static const struct __ftr_reg_entry {
        u32                        sys_id;
        struct arm64_ftr_reg         *reg;
} arm64_ftr_regs[] = {

        /* Op1 = 0, CRn = 0, CRm = 1 */
        ARM64_FTR_REG(SYS_ID_PFR0_EL1, ftr_id_pfr0),
        ARM64_FTR_REG(SYS_ID_PFR1_EL1, ftr_id_pfr1),
        ARM64_FTR_REG(SYS_ID_DFR0_EL1, ftr_id_dfr0),
        ARM64_FTR_REG(SYS_ID_MMFR0_EL1, ftr_id_mmfr0),
        ARM64_FTR_REG(SYS_ID_MMFR1_EL1, ftr_generic_32bits),
        ARM64_FTR_REG(SYS_ID_MMFR2_EL1, ftr_generic_32bits),
        ARM64_FTR_REG(SYS_ID_MMFR3_EL1, ftr_generic_32bits),

        /* Op1 = 0, CRn = 0, CRm = 2 */
        ARM64_FTR_REG(SYS_ID_ISAR0_EL1, ftr_id_isar0),
        ARM64_FTR_REG(SYS_ID_ISAR1_EL1, ftr_generic_32bits),
        ARM64_FTR_REG(SYS_ID_ISAR2_EL1, ftr_generic_32bits),
        ARM64_FTR_REG(SYS_ID_ISAR3_EL1, ftr_generic_32bits),
        ARM64_FTR_REG(SYS_ID_ISAR4_EL1, ftr_id_isar4),
        ARM64_FTR_REG(SYS_ID_ISAR5_EL1, ftr_id_isar5),
        ARM64_FTR_REG(SYS_ID_MMFR4_EL1, ftr_id_mmfr4),
        ARM64_FTR_REG(SYS_ID_ISAR6_EL1, ftr_id_isar6),

        /* Op1 = 0, CRn = 0, CRm = 3 */
        ARM64_FTR_REG(SYS_MVFR0_EL1, ftr_mvfr0),
        ARM64_FTR_REG(SYS_MVFR1_EL1, ftr_mvfr1),
        ARM64_FTR_REG(SYS_MVFR2_EL1, ftr_mvfr2),
        ARM64_FTR_REG(SYS_ID_PFR2_EL1, ftr_id_pfr2),
        ARM64_FTR_REG(SYS_ID_DFR1_EL1, ftr_id_dfr1),
        ARM64_FTR_REG(SYS_ID_MMFR5_EL1, ftr_id_mmfr5),

        /* Op1 = 0, CRn = 0, CRm = 4 */
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0,
                               &id_aa64pfr0_override),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1,
                               &id_aa64pfr1_override),
        ARM64_FTR_REG(SYS_ID_AA64PFR2_EL1, ftr_id_aa64pfr2),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0,
                               &id_aa64zfr0_override),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64SMFR0_EL1, ftr_id_aa64smfr0,
                               &id_aa64smfr0_override),
        ARM64_FTR_REG(SYS_ID_AA64FPFR0_EL1, ftr_id_aa64fpfr0),

        /* Op1 = 0, CRn = 0, CRm = 5 */
        ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0),
        ARM64_FTR_REG(SYS_ID_AA64DFR1_EL1, ftr_raz),

        /* Op1 = 0, CRn = 0, CRm = 6 */
        ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1,
                               &id_aa64isar1_override),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2,
                               &id_aa64isar2_override),
        ARM64_FTR_REG(SYS_ID_AA64ISAR3_EL1, ftr_id_aa64isar3),

        /* Op1 = 0, CRn = 0, CRm = 7 */
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0,
                               &id_aa64mmfr0_override),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR1_EL1, ftr_id_aa64mmfr1,
                               &id_aa64mmfr1_override),
        ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64MMFR2_EL1, ftr_id_aa64mmfr2,
                               &id_aa64mmfr2_override),
        ARM64_FTR_REG(SYS_ID_AA64MMFR3_EL1, ftr_id_aa64mmfr3),
        ARM64_FTR_REG(SYS_ID_AA64MMFR4_EL1, ftr_id_aa64mmfr4),

        /* Op1 = 0, CRn = 10, CRm = 4 */
        ARM64_FTR_REG(SYS_MPAMIDR_EL1, ftr_mpamidr),

        /* Op1 = 1, CRn = 0, CRm = 0 */
        ARM64_FTR_REG(SYS_GMID_EL1, ftr_gmid),

        /* Op1 = 3, CRn = 0, CRm = 0 */
        /* CTR_EL0 uses the named arm64_ftr_reg_ctrel0 object, not a literal. */
        { SYS_CTR_EL0, &arm64_ftr_reg_ctrel0 },
        ARM64_FTR_REG(SYS_DCZID_EL0, ftr_dczid),

        /* Op1 = 3, CRn = 14, CRm = 0 */
        ARM64_FTR_REG(SYS_CNTFRQ_EL0, ftr_single32),
};

/*
 * bsearch() comparator for arm64_ftr_regs[].
 *
 * @id:   the search key; not a real pointer but the sys_id value itself,
 *        smuggled through the pointer argument by the caller.
 * @regp: pointer to the struct __ftr_reg_entry being compared against.
 *
 * Returns the difference key - entry sys_id, i.e. negative, zero or
 * positive as the key sorts before, at or after the entry.
 */
static int search_cmp_ftr_reg(const void *id, const void *regp)
{
        const struct __ftr_reg_entry *entry = regp;
        int key = (int)(unsigned long)id;
        int entry_id = (int)entry->sys_id;

        return key - entry_id;
}

/*
 * get_arm64_ftr_reg_nowarn - Find the feature register object for a
 * sys_reg() encoding.
 *
 * arm64_ftr_regs[] is kept sorted in ascending sys_id order, so the lookup
 * is a binary search. The key is passed to bsearch() by value, cast into
 * the pointer argument, which is what search_cmp_ftr_reg() expects.
 *
 * returns - the matching ftr_reg on success.
 *         - NULL if no entry matches. It is up to the caller to decide
 *           what a failed lookup means.
 */
static struct arm64_ftr_reg *get_arm64_ftr_reg_nowarn(u32 sys_id)
{
        const void *key = (const void *)(unsigned long)sys_id;
        const struct __ftr_reg_entry *entry;

        entry = bsearch(key, arm64_ftr_regs, ARRAY_SIZE(arm64_ftr_regs),
                        sizeof(arm64_ftr_regs[0]), search_cmp_ftr_reg);

        return entry ? entry->reg : NULL;
}

/*
 * get_arm64_ftr_reg - Find the feature register object for a sys_reg()
 * encoding, warning if it does not exist.
 *
 * Thin wrapper around get_arm64_ftr_reg_nowarn(): asking for a register
 * that is not in the table is treated as an error, so a failed lookup
 * triggers a WARN_ON() before NULL is handed back for the caller to deal
 * with.
 *
 * returns - the matching ftr_reg on success.
 *         - NULL on failure, after a WARN_ON().
 */
struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id)
{
        struct arm64_ftr_reg *found = get_arm64_ftr_reg_nowarn(sys_id);

        WARN_ON(!found);

        return found;
}

/*
 * Return @reg with the field described by @ftrp replaced by @ftr_val.
 *
 * The field's bits are cleared in @reg and the new value, shifted into
 * position and truncated to the field mask, is merged in. Bits outside the
 * field are passed through untouched.
 */
static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg,
                               s64 ftr_val)
{
        const u64 field_mask = arm64_ftr_mask(ftrp);
        const u64 field_bits = (ftr_val << ftrp->shift) & field_mask;

        return (reg & ~field_mask) | field_bits;
}

/*
 * Pick the value that is safe to use for a field when two candidate values
 * (@new and @cur) have to be reconciled, according to the field's type:
 *
 *   FTR_EXACT               - the field's predefined safe_val
 *   FTR_LOWER_SAFE          - the smaller of the two
 *   FTR_HIGHER_SAFE         - the larger of the two
 *   FTR_HIGHER_OR_ZERO_SAFE - 0 if either value is 0, otherwise the larger
 *
 * Any other type is a programming error and hits BUG().
 */
s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
                                s64 cur)
{
        switch (ftrp->type) {
        case FTR_EXACT:
                return ftrp->safe_val;
        case FTR_LOWER_SAFE:
                return min(new, cur);
        case FTR_HIGHER_OR_ZERO_SAFE:
                /* Zero wins over everything else for this type. */
                if (!cur || !new)
                        return 0;
                fallthrough;
        case FTR_HIGHER_SAFE:
                return max(new, cur);
        default:
                BUG();
        }

        return 0;
}

/*
 * Boot-time sanity check of the feature tables. Despite the name, nothing
 * is reordered here: the function only verifies the ordering that the rest
 * of the code relies on.
 *
 *  - Within each register, field descriptors must fit in 64 bits, be listed
 *    in descending shift order and not overlap; violations WARN.
 *  - arm64_ftr_regs[] must be in strictly ascending sys_id order, otherwise
 *    the binary search in get_arm64_ftr_reg() cannot work; a violation is
 *    fatal (BUG).
 */
static void __init sort_ftr_regs(void)
{
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(arm64_ftr_regs); idx++) {
                const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[idx].reg;
                const struct arm64_ftr_bits *prev = NULL;
                const struct arm64_ftr_bits *field;

                for (field = ftr_reg->ftr_bits; field->width != 0; field++) {
                        unsigned int width = field->width;
                        unsigned int shift = field->shift;

                        WARN((shift  + width) > 64,
                                "%s has invalid feature at shift %d\n",
                                ftr_reg->name, shift);

                        /*
                         * The first field has no predecessor to overlap
                         * with; every later one must end at or below the
                         * start of the field listed before it.
                         */
                        if (prev) {
                                unsigned int prev_shift = prev->shift;

                                WARN((shift + width) > prev_shift,
                                        "%s has feature overlap at shift %d\n",
                                        ftr_reg->name, shift);
                        }

                        prev = field;
                }

                /*
                 * Registers must be sorted in ascending order of sys_id for
                 * the binary search in get_arm64_ftr_reg() to work. The
                 * first register has nothing before it to compare against.
                 */
                if (idx > 0)
                        BUG_ON(arm64_ftr_regs[idx].sys_id <= arm64_ftr_regs[idx - 1].sys_id);
        }
}

/*
 * Initialise the CPU feature register from Boot CPU values.
 * Also initialises the strict_mask and user_mask for the register.
 * Any bits that are not covered by an arm64_ftr_bits entry are considered
 * RES0 for the system-wide value, and must strictly match.
 *
 * @sys_reg: sys_reg() encoding of the register to initialise; looked up
 *           with get_arm64_ftr_reg(), and silently skipped (after that
 *           function's warning) if it is not in the table.
 * @new:     raw register value to seed the system-wide value from.
 *
 * For every described field, a matching command-line style override in
 * reg->override is validated against the hardware value and either applied
 * or dropped, with a message logged either way.
 */
static void init_cpu_ftr_reg(u32 sys_reg, u64 new)
{
        u64 val = 0;
        /* Start fully strict; non-strict fields are carved out below. */
        u64 strict_mask = ~0x0ULL;
        u64 user_mask = 0;
        u64 valid_mask = 0;

        const struct arm64_ftr_bits *ftrp;
        struct arm64_ftr_reg *reg = get_arm64_ftr_reg(sys_reg);

        if (!reg)
                return;

        for (ftrp = reg->ftr_bits; ftrp->width; ftrp++) {
                u64 ftr_mask = arm64_ftr_mask(ftrp);
                s64 ftr_new = arm64_ftr_value(ftrp, new);
                s64 ftr_ovr = arm64_ftr_value(ftrp, reg->override->val);

                /* The override mask covers the whole field: one was requested. */
                if ((ftr_mask & reg->override->mask) == ftr_mask) {
                        s64 tmp = arm64_ftr_safe_value(ftrp, ftr_ovr, ftr_new);
                        char *str = NULL;

                        /*
                         * The override is only honoured when it is itself the
                         * safe choice between the override and the hardware
                         * value.
                         */
                        if (ftr_ovr != tmp) {
                                /* Unsafe, remove the override */
                                reg->override->mask &= ~ftr_mask;
                                reg->override->val &= ~ftr_mask;
                                /* Report the rejected value in the message. */
                                tmp = ftr_ovr;
                                str = "ignoring override";
                        } else if (ftr_new != tmp) {
                                /* Override was valid */
                                ftr_new = tmp;
                                str = "forced";
                        } else {
                                /* Override was the safe value */
                                str = "already set";
                        }

                        pr_warn("%s[%d:%d]: %s to %llx\n",
                                reg->name,
                                ftrp->shift + ftrp->width - 1,
                                ftrp->shift, str,
                                tmp & (BIT(ftrp->width) - 1));
                } else if ((ftr_mask & reg->override->val) == ftr_mask) {
                        /*
                         * The mask does not cover the field, yet every bit of
                         * the field is set in the override value: treated as
                         * an impossible override, so the value bits are
                         * dropped.
                         */
                        reg->override->val &= ~ftr_mask;
                        pr_warn("%s[%d:%d]: impossible override, ignored\n",
                                reg->name,
                                ftrp->shift + ftrp->width - 1,
                                ftrp->shift);
                }

                val = arm64_ftr_set_value(ftrp, val, ftr_new);

                valid_mask |= ftr_mask;
                if (!ftrp->strict)
                        strict_mask &= ~ftr_mask;
                /*
                 * Visible fields are collected in user_mask; hidden fields
                 * instead get their safe value recorded in user_val.
                 */
                if (ftrp->visible)
                        user_mask |= ftr_mask;
                else
                        reg->user_val = arm64_ftr_set_value(ftrp,
                                                            reg->user_val,
                                                            ftrp->safe_val);
        }

        /* Keep only bits that belong to a described field. */
        val &= valid_mask;

        reg->sys_val = val;
        reg->strict_mask = strict_mask;
        reg->user_mask = user_mask;
}

/*
 * Capability tables walked by init_cpucap_indirect_list(). arm64_errata[]
 * is defined in another translation unit; arm64_features[] is only declared
 * here and, being static, is defined elsewhere in this file.
 */
extern const struct arm64_cpu_capabilities arm64_errata[];
static const struct arm64_cpu_capabilities arm64_features[];

/*
 * Register every entry of @caps in cpucap_ptrs[], indexed by capability
 * number. The array is terminated by an entry whose ->matches is NULL.
 *
 * Entries with an out-of-range capability number, or whose slot is already
 * taken, are reported with a WARN and skipped, so the first registration of
 * a capability wins.
 */
static void __init
init_cpucap_indirect_list_from_array(const struct arm64_cpu_capabilities *caps)
{
        const struct arm64_cpu_capabilities *entry;

        for (entry = caps; entry->matches; entry++) {
                /* Range check first: the index is used right below. */
                if (WARN(entry->capability >= ARM64_NCAPS,
                        "Invalid capability %d\n", entry->capability))
                        continue;

                if (!WARN(cpucap_ptrs[entry->capability],
                        "Duplicate entry for capability %d\n",
                        entry->capability))
                        cpucap_ptrs[entry->capability] = entry;
        }
}

/*
 * Populate cpucap_ptrs[] from both capability tables. Features are
 * registered before errata; since a duplicate capability number is skipped
 * by init_cpucap_indirect_list_from_array(), an arm64_features[] entry
 * would take precedence over an arm64_errata[] entry with the same number.
 */
static void __init init_cpucap_indirect_list(void)
{
        init_cpucap_indirect_list_from_array(arm64_features);
        init_cpucap_indirect_list_from_array(arm64_errata);
}

/*
 * Forward declaration; being static, the definition lives elsewhere in
 * this translation unit.
 */
static void __init setup_boot_cpu_capabilities(void);

static void init_32bit_cpu_features(struct cpuinfo_32bit *info)
{
        init_cpu_ftr_reg(SYS_ID_DFR0_EL1, info->reg_id_dfr0);
        init_cpu_ftr_reg(SYS_ID_DFR1_EL1, info->reg_id_dfr1);
        init_cpu_ftr_reg(SYS_ID_ISAR0_EL1, info->reg_id_isar0);
        init_cpu_ftr_reg(SYS_ID_ISAR1_EL1, info->reg_id_isar1);
        init_cpu_ftr_reg(SYS_ID_ISAR2_EL1, info->reg_id_isar2);
        init_cpu_ftr_reg(SYS_ID_ISAR3_EL1, info->reg_id_isar3);
        init_cpu_ftr_reg(SYS_ID_ISAR4_EL1, info->reg_id_isar4);
        init_cpu_ftr_reg(SYS_ID_ISAR5_EL1, info->reg_id_isar5);
        init_cpu_ftr_reg(SYS_ID_ISAR6_EL1, info->reg_id_isar6);
        init_cpu_ftr_reg(SYS_ID_MMFR0_EL1, info->reg_id_mmfr0);
        init_cpu_ftr_reg(SYS_ID_MMFR1_EL1, info->reg_id_mmfr1);
        init_cpu_ftr_reg(SYS_ID_MMFR2_EL1, info->reg_id_mmfr2);
        init_cpu_ftr_reg(SYS_ID_MMFR3_EL1, info->reg_id_mmfr3);
        init_cpu_ftr_reg(SYS_ID_MMFR4_EL1, info->reg_id_mmfr4);
        init_cpu_ftr_reg(SYS_ID_MMFR5_EL1, info->reg_id_mmfr5);
        init_cpu_ftr_reg(SYS_ID_PFR0_EL1, info->reg_id_pfr0);
        init_cpu_ftr_reg(SYS_ID_PFR1_EL1, info->reg_id_pfr1);
        init_cpu_ftr_reg(SYS_ID_PFR2_EL1, info->reg_id_pfr2);
        init_cpu_ftr_reg(SYS_MVFR0_EL1, info->reg_mvfr0);
        init_cpu_ftr_reg(SYS_MVFR1_EL1, info->reg_mvfr1);
        init_cpu_ftr_reg(SYS_MVFR2_EL1, info->reg_mvfr2);
}

#ifdef CONFIG_ARM64_PSEUDO_NMI
/* Set from the "irqchip.gicv3_pseudo_nmi" early parameter. */
static bool enable_pseudo_nmi;

/* Early-param handler: parse the boolean argument into enable_pseudo_nmi. */
static int __init early_enable_pseudo_nmi(char *p)
{
        int err = kstrtobool(p, &enable_pseudo_nmi);

        return err;
}
early_param("irqchip.gicv3_pseudo_nmi", early_enable_pseudo_nmi);

/*
 * Veto a pseudo-NMI request on systems known not to cope with it. Does
 * nothing unless pseudo-NMI was asked for on the command line.
 */
static __init void detect_system_supports_pseudo_nmi(void)
{
        struct device_node *gic_node;
        bool broken_fw;

        if (!enable_pseudo_nmi)
                return;

        /*
         * Detect broken MediaTek firmware that doesn't properly save and
         * restore GIC priorities.
         */
        gic_node = of_find_compatible_node(NULL, NULL, "arm,gic-v3");
        broken_fw = gic_node &&
                    of_property_read_bool(gic_node, "mediatek,broken-save-restore-fw");
        if (broken_fw) {
                pr_info("Pseudo-NMI disabled due to MediaTek Chromebook GICR save problem\n");
                enable_pseudo_nmi = false;
        }

        /* Drop the reference taken by the lookup. */
        of_node_put(gic_node);
}
#else /* CONFIG_ARM64_PSEUDO_NMI */
static inline void detect_system_supports_pseudo_nmi(void) { }
#endif

/*
 * Seed the system-wide feature register state from the register values
 * recorded in @info (presumably the boot CPU's cpuinfo -- confirm against
 * the caller).
 *
 * Every tracked AArch64 ID register is passed to init_cpu_ftr_reg(). The
 * AArch32 registers, MPAMIDR_EL1 and GMID_EL1 are only initialised when the
 * corresponding feature is advertised in @info. When SVE/SME support is
 * built in and advertised by the sanitised registers, the vector length map
 * is initialised with the relevant CPACR access temporarily enabled.
 */
void __init init_cpu_features(struct cpuinfo_arm64 *info)
{
        /* Before we start using the tables, make sure they are sorted */
        sort_ftr_regs();

        init_cpu_ftr_reg(SYS_CTR_EL0, info->reg_ctr);
        init_cpu_ftr_reg(SYS_DCZID_EL0, info->reg_dczid);
        init_cpu_ftr_reg(SYS_CNTFRQ_EL0, info->reg_cntfrq);
        init_cpu_ftr_reg(SYS_ID_AA64DFR0_EL1, info->reg_id_aa64dfr0);
        init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1);
        init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0);
        init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1);
        init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2);
        init_cpu_ftr_reg(SYS_ID_AA64ISAR3_EL1, info->reg_id_aa64isar3);
        init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0);
        init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
        init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
        init_cpu_ftr_reg(SYS_ID_AA64MMFR3_EL1, info->reg_id_aa64mmfr3);
        init_cpu_ftr_reg(SYS_ID_AA64MMFR4_EL1, info->reg_id_aa64mmfr4);
        init_cpu_ftr_reg(SYS_ID_AA64PFR0_EL1, info->reg_id_aa64pfr0);
        init_cpu_ftr_reg(SYS_ID_AA64PFR1_EL1, info->reg_id_aa64pfr1);
        init_cpu_ftr_reg(SYS_ID_AA64PFR2_EL1, info->reg_id_aa64pfr2);
        init_cpu_ftr_reg(SYS_ID_AA64ZFR0_EL1, info->reg_id_aa64zfr0);
        init_cpu_ftr_reg(SYS_ID_AA64SMFR0_EL1, info->reg_id_aa64smfr0);
        init_cpu_ftr_reg(SYS_ID_AA64FPFR0_EL1, info->reg_id_aa64fpfr0);

        /* The AArch32 ID registers are only tracked if EL0 can run AArch32 */
        if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0))
                init_32bit_cpu_features(&info->aarch32);

        /*
         * The SVE/SME checks below read the sanitised registers that were
         * initialised above, so they must stay after the init_cpu_ftr_reg()
         * calls.
         */
        if (IS_ENABLED(CONFIG_ARM64_SVE) &&
            id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
                unsigned long cpacr = cpacr_save_enable_kernel_sve();

                vec_init_vq_map(ARM64_VEC_SVE);

                cpacr_restore(cpacr);
        }

        if (IS_ENABLED(CONFIG_ARM64_SME) &&
            id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
                unsigned long cpacr = cpacr_save_enable_kernel_sme();

                vec_init_vq_map(ARM64_VEC_SME);

                cpacr_restore(cpacr);
        }

        if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0))
                init_cpu_ftr_reg(SYS_MPAMIDR_EL1, info->reg_mpamidr);

        if (id_aa64pfr1_mte(info->reg_id_aa64pfr1))
                init_cpu_ftr_reg(SYS_GMID_EL1, info->reg_gmid);
}

/*
 * Merge a new CPU's value of a feature register into the system-wide value.
 *
 * @reg: tracked register whose sys_val is updated in place
 * @new: raw register value observed on the new CPU
 *
 * Walks the field table of @reg (terminated by a zero-width entry). Any
 * field whose value differs between the current system value and @new is
 * replaced by whatever arm64_ftr_safe_value() picks for that pair.
 */
static void update_cpu_ftr_reg(struct arm64_ftr_reg *reg, u64 new)
{
        const struct arm64_ftr_bits *field = reg->ftr_bits;

        while (field->width) {
                s64 sys_field = arm64_ftr_value(field, reg->sys_val);
                s64 cpu_field = arm64_ftr_value(field, new);

                if (sys_field != cpu_field) {
                        /* Find a safe value */
                        s64 safe = arm64_ftr_safe_value(field, cpu_field,
                                                        sys_field);

                        reg->sys_val = arm64_ftr_set_value(field, reg->sys_val,
                                                           safe);
                }

                field++;
        }
}

/*
 * Fold one CPU's register value into the system-wide view and sanity check
 * it against the boot CPU.
 *
 * @sys_id: encoding of the register
 * @cpu:    CPU number, only used in the warning message
 * @val:    value read on @cpu
 * @boot:   value read on the boot CPU
 *
 * Returns 0 if the register is not tracked or if @val and @boot agree on
 * every bit covered by the register's strict mask; otherwise warns and
 * returns 1.
 */
static int check_update_ftr_reg(u32 sys_id, int cpu, u64 val, u64 boot)
{
        struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id);
        u64 strict_diff;

        if (!regp)
                return 0;

        update_cpu_ftr_reg(regp, val);

        /* Bits that differ between the two CPUs and are marked strict */
        strict_diff = (boot ^ val) & regp->strict_mask;
        if (!strict_diff)
                return 0;

        pr_warn("SANITY CHECK: Unexpected variation in %s. Boot CPU: %#016llx, CPU%d: %#016llx\n",
                        regp->name, boot, cpu, val);
        return 1;
}

/*
 * Stop treating one field of a feature register as strict.
 *
 * @sys_id: encoding of the register
 * @field:  shift (bit position) of the field to relax
 *
 * Clears the field's bits from the register's strict_mask. Does nothing if
 * the register is not tracked; warns if no field with that shift exists.
 */
static void relax_cpu_ftr_reg(u32 sys_id, int field)
{
        struct arm64_ftr_reg *regp = get_arm64_ftr_reg(sys_id);
        const struct arm64_ftr_bits *ftrp;

        if (!regp)
                return;

        /* Stop at the matching field or at the zero-width terminator */
        ftrp = regp->ftr_bits;
        while (ftrp->width && ftrp->shift != field)
                ftrp++;

        /* Bogus field? */
        if (WARN_ON(!ftrp->width))
                return;

        regp->strict_mask &= ~arm64_ftr_mask(ftrp);
}

/*
 * One-shot initialisation of the AArch32 feature registers from a late CPU.
 *
 * When mismatched 32-bit EL0 support is allowed and the boot CPU itself has
 * no 32-bit EL0, the boot CPU's AArch32 register copy is overwritten with
 * that of @info and used to initialise the 32-bit feature registers. This
 * happens at most once.
 */
static void lazy_init_32bit_cpu_features(struct cpuinfo_arm64 *info,
                                         struct cpuinfo_arm64 *boot)
{
        static bool overridden;

        if (overridden || !allow_mismatched_32bit_el0)
                return;

        /* The boot CPU already provided usable AArch32 registers */
        if (id_aa64pfr0_32bit_el0(boot->reg_id_aa64pfr0))
                return;

        boot->aarch32 = info->aarch32;
        init_32bit_cpu_features(&boot->aarch32);

        overridden = true;
}

/*
 * Fold a CPU's AArch32 ID registers into the system-wide view and compare
 * them against the boot CPU's copy.
 *
 * @cpu:  CPU number, passed through for the sanity check message
 * @info: AArch32 register values of @cpu
 * @boot: AArch32 register values of the boot CPU
 *
 * Returns non-zero if check_update_ftr_reg() reported a mismatch for any of
 * the registers, 0 otherwise.
 */
static int update_32bit_cpu_features(int cpu, struct cpuinfo_32bit *info,
                                     struct cpuinfo_32bit *boot)
{
        int taint = 0;
        u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);

        /*
         * If we don't have AArch32 at EL1, then relax the strictness of
         * EL1-dependent register fields to avoid spurious sanity check fails.
         */
        if (!id_aa64pfr0_32bit_el1(pfr0)) {
                relax_cpu_ftr_reg(SYS_ID_ISAR4_EL1, ID_ISAR4_EL1_SMC_SHIFT);
                relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virt_frac_SHIFT);
                relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Sec_frac_SHIFT);
                relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Virtualization_SHIFT);
                relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_Security_SHIFT);
                relax_cpu_ftr_reg(SYS_ID_PFR1_EL1, ID_PFR1_EL1_ProgMod_SHIFT);
        }

        taint |= check_update_ftr_reg(SYS_ID_DFR0_EL1, cpu,
                                      info->reg_id_dfr0, boot->reg_id_dfr0);
        taint |= check_update_ftr_reg(SYS_ID_DFR1_EL1, cpu,
                                      info->reg_id_dfr1, boot->reg_id_dfr1);
        taint |= check_update_ftr_reg(SYS_ID_ISAR0_EL1, cpu,
                                      info->reg_id_isar0, boot->reg_id_isar0);
        taint |= check_update_ftr_reg(SYS_ID_ISAR1_EL1, cpu,
                                      info->reg_id_isar1, boot->reg_id_isar1);
        taint |= check_update_ftr_reg(SYS_ID_ISAR2_EL1, cpu,
                                      info->reg_id_isar2, boot->reg_id_isar2);
        taint |= check_update_ftr_reg(SYS_ID_ISAR3_EL1, cpu,
                                      info->reg_id_isar3, boot->reg_id_isar3);
        taint |= check_update_ftr_reg(SYS_ID_ISAR4_EL1, cpu,
                                      info->reg_id_isar4, boot->reg_id_isar4);
        taint |= check_update_ftr_reg(SYS_ID_ISAR5_EL1, cpu,
                                      info->reg_id_isar5, boot->reg_id_isar5);
        taint |= check_update_ftr_reg(SYS_ID_ISAR6_EL1, cpu,
                                      info->reg_id_isar6, boot->reg_id_isar6);

        /*
         * Regardless of the value of the AuxReg field, the AIFSR, ADFSR, and
         * ACTLR formats could differ across CPUs and therefore would have to
         * be trapped for virtualization anyway.
         */
        taint |= check_update_ftr_reg(SYS_ID_MMFR0_EL1, cpu,
                                      info->reg_id_mmfr0, boot->reg_id_mmfr0);
        taint |= check_update_ftr_reg(SYS_ID_MMFR1_EL1, cpu,
                                      info->reg_id_mmfr1, boot->reg_id_mmfr1);
        taint |= check_update_ftr_reg(SYS_ID_MMFR2_EL1, cpu,
                                      info->reg_id_mmfr2, boot->reg_id_mmfr2);
        taint |= check_update_ftr_reg(SYS_ID_MMFR3_EL1, cpu,
                                      info->reg_id_mmfr3, boot->reg_id_mmfr3);
        taint |= check_update_ftr_reg(SYS_ID_MMFR4_EL1, cpu,
                                      info->reg_id_mmfr4, boot->reg_id_mmfr4);
        taint |= check_update_ftr_reg(SYS_ID_MMFR5_EL1, cpu,
                                      info->reg_id_mmfr5, boot->reg_id_mmfr5);
        taint |= check_update_ftr_reg(SYS_ID_PFR0_EL1, cpu,
                                      info->reg_id_pfr0, boot->reg_id_pfr0);
        taint |= check_update_ftr_reg(SYS_ID_PFR1_EL1, cpu,
                                      info->reg_id_pfr1, boot->reg_id_pfr1);
        taint |= check_update_ftr_reg(SYS_ID_PFR2_EL1, cpu,
                                      info->reg_id_pfr2, boot->reg_id_pfr2);
        taint |= check_update_ftr_reg(SYS_MVFR0_EL1, cpu,
                                      info->reg_mvfr0, boot->reg_mvfr0);
        taint |= check_update_ftr_reg(SYS_MVFR1_EL1, cpu,
                                      info->reg_mvfr1, boot->reg_mvfr1);
        taint |= check_update_ftr_reg(SYS_MVFR2_EL1, cpu,
                                      info->reg_mvfr2, boot->reg_mvfr2);

        return taint;
}

/*
 * Update system wide CPU feature registers with the values from a
 * non-boot CPU. Also performs SANITY checks to make sure that there
 * aren't any insane variations from that of the boot CPU.
 *
 * @cpu:  number of the CPU being brought in, used in warning messages
 * @info: register values read on @cpu
 * @boot: register values read on the boot CPU
 *
 * Every register seeded by init_cpu_features() is checked here, so the two
 * lists must be kept in sync. If any check reports a mismatch in a strict
 * field, the kernel is tainted with TAINT_CPU_OUT_OF_SPEC.
 */
void update_cpu_features(int cpu,
                         struct cpuinfo_arm64 *info,
                         struct cpuinfo_arm64 *boot)
{
        int taint = 0;

        /*
         * The kernel can handle differing I-cache policies, but otherwise
         * caches should look identical. Userspace JITs will make use of
         * *minLine.
         */
        taint |= check_update_ftr_reg(SYS_CTR_EL0, cpu,
                                      info->reg_ctr, boot->reg_ctr);

        /*
         * Userspace may perform DC ZVA instructions. Mismatched block sizes
         * could result in too much or too little memory being zeroed if a
         * process is preempted and migrated between CPUs.
         */
        taint |= check_update_ftr_reg(SYS_DCZID_EL0, cpu,
                                      info->reg_dczid, boot->reg_dczid);

        /* If different, timekeeping will be broken (especially with KVM) */
        taint |= check_update_ftr_reg(SYS_CNTFRQ_EL0, cpu,
                                      info->reg_cntfrq, boot->reg_cntfrq);

        /*
         * The kernel uses self-hosted debug features and expects CPUs to
         * support identical debug features. We presently need CTX_CMPs, WRPs,
         * and BRPs to be identical.
         * ID_AA64DFR1 is currently RES0.
         */
        taint |= check_update_ftr_reg(SYS_ID_AA64DFR0_EL1, cpu,
                                      info->reg_id_aa64dfr0, boot->reg_id_aa64dfr0);
        taint |= check_update_ftr_reg(SYS_ID_AA64DFR1_EL1, cpu,
                                      info->reg_id_aa64dfr1, boot->reg_id_aa64dfr1);
        /*
         * Even in big.LITTLE, processors should be identical instruction-set
         * wise.
         */
        taint |= check_update_ftr_reg(SYS_ID_AA64ISAR0_EL1, cpu,
                                      info->reg_id_aa64isar0, boot->reg_id_aa64isar0);
        taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu,
                                      info->reg_id_aa64isar1, boot->reg_id_aa64isar1);
        taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu,
                                      info->reg_id_aa64isar2, boot->reg_id_aa64isar2);
        taint |= check_update_ftr_reg(SYS_ID_AA64ISAR3_EL1, cpu,
                                      info->reg_id_aa64isar3, boot->reg_id_aa64isar3);

        /*
         * Differing PARange support is fine as long as all peripherals and
         * memory are mapped within the minimum PARange of all CPUs.
         * Linux should not care about secure memory.
         */
        taint |= check_update_ftr_reg(SYS_ID_AA64MMFR0_EL1, cpu,
                                      info->reg_id_aa64mmfr0, boot->reg_id_aa64mmfr0);
        taint |= check_update_ftr_reg(SYS_ID_AA64MMFR1_EL1, cpu,
                                      info->reg_id_aa64mmfr1, boot->reg_id_aa64mmfr1);
        taint |= check_update_ftr_reg(SYS_ID_AA64MMFR2_EL1, cpu,
                                      info->reg_id_aa64mmfr2, boot->reg_id_aa64mmfr2);
        taint |= check_update_ftr_reg(SYS_ID_AA64MMFR3_EL1, cpu,
                                      info->reg_id_aa64mmfr3, boot->reg_id_aa64mmfr3);
        /*
         * ID_AA64MMFR4_EL1 is seeded by init_cpu_features(), so it must be
         * sanitised against late CPUs as well.
         */
        taint |= check_update_ftr_reg(SYS_ID_AA64MMFR4_EL1, cpu,
                                      info->reg_id_aa64mmfr4, boot->reg_id_aa64mmfr4);

        taint |= check_update_ftr_reg(SYS_ID_AA64PFR0_EL1, cpu,
                                      info->reg_id_aa64pfr0, boot->reg_id_aa64pfr0);
        taint |= check_update_ftr_reg(SYS_ID_AA64PFR1_EL1, cpu,
                                      info->reg_id_aa64pfr1, boot->reg_id_aa64pfr1);
        taint |= check_update_ftr_reg(SYS_ID_AA64PFR2_EL1, cpu,
                                      info->reg_id_aa64pfr2, boot->reg_id_aa64pfr2);

        taint |= check_update_ftr_reg(SYS_ID_AA64ZFR0_EL1, cpu,
                                      info->reg_id_aa64zfr0, boot->reg_id_aa64zfr0);

        taint |= check_update_ftr_reg(SYS_ID_AA64SMFR0_EL1, cpu,
                                      info->reg_id_aa64smfr0, boot->reg_id_aa64smfr0);

        taint |= check_update_ftr_reg(SYS_ID_AA64FPFR0_EL1, cpu,
                                      info->reg_id_aa64fpfr0, boot->reg_id_aa64fpfr0);

        /* Probe vector lengths */
        if (IS_ENABLED(CONFIG_ARM64_SVE) &&
            id_aa64pfr0_sve(read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1))) {
                if (!system_capabilities_finalized()) {
                        unsigned long cpacr = cpacr_save_enable_kernel_sve();

                        vec_update_vq_map(ARM64_VEC_SVE);

                        cpacr_restore(cpacr);
                }
        }

        if (IS_ENABLED(CONFIG_ARM64_SME) &&
            id_aa64pfr1_sme(read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1))) {
                unsigned long cpacr = cpacr_save_enable_kernel_sme();

                /* Probe vector lengths */
                if (!system_capabilities_finalized())
                        vec_update_vq_map(ARM64_VEC_SME);

                cpacr_restore(cpacr);
        }

        if (id_aa64pfr0_mpam(info->reg_id_aa64pfr0)) {
                taint |= check_update_ftr_reg(SYS_MPAMIDR_EL1, cpu,
                                        info->reg_mpamidr, boot->reg_mpamidr);
        }

        /*
         * The kernel uses the LDGM/STGM instructions and the number of tags
         * they read/write depends on the GMID_EL1.BS field. Check that the
         * value is the same on all CPUs.
         */
        if (IS_ENABLED(CONFIG_ARM64_MTE) &&
            id_aa64pfr1_mte(info->reg_id_aa64pfr1)) {
                taint |= check_update_ftr_reg(SYS_GMID_EL1, cpu,
                                              info->reg_gmid, boot->reg_gmid);
        }

        /*
         * If we don't have AArch32 at all then skip the checks entirely
         * as the register values may be UNKNOWN and we're not going to be
         * using them for anything.
         *
         * This relies on a sanitised view of the AArch64 ID registers
         * (e.g. SYS_ID_AA64PFR0_EL1), so we call it last.
         */
        if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
                lazy_init_32bit_cpu_features(info, boot);
                taint |= update_32bit_cpu_features(cpu, &info->aarch32,
                                                   &boot->aarch32);
        }

        /*
         * Mismatched CPU features are a recipe for disaster. Don't even
         * pretend to support them.
         */
        if (taint) {
                pr_warn_once("Unsupported CPU feature variation detected.\n");
                add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
        }
}

/*
 * Return the system-wide (sanitised) value of the feature register with
 * encoding @id, or 0 if that register is not tracked.
 */
u64 read_sanitised_ftr_reg(u32 id)
{
        const struct arm64_ftr_reg *regp = get_arm64_ftr_reg(id);

        return regp ? regp->sys_val : 0;
}
EXPORT_SYMBOL_GPL(read_sanitised_ftr_reg);

/*
 * Expands to a complete switch case for register encoding @r: read the
 * register into the local variable 'val' and leave the switch. Only usable
 * inside a switch statement with a 'val' variable in scope.
 */
#define read_sysreg_case(r)        \
        case r:                val = read_sysreg_s(r); break;

/*
 * __read_sysreg_by_encoding() - Used by a STARTING cpu before cpuinfo is populated.
 * Read the system register on the current CPU
 *
 * @sys_id: encoding of the register to read; it must be one of the registers
 *          listed in the switch below, any other value hits BUG().
 *
 * If the register is tracked by the feature register tables, the bits
 * selected by its override mask are replaced with the override value before
 * the result is returned.
 */
u64 __read_sysreg_by_encoding(u32 sys_id)
{
        struct arm64_ftr_reg *regp;
        u64 val;

        switch (sys_id) {
        /* AArch32 ID registers */
        read_sysreg_case(SYS_ID_PFR0_EL1);
        read_sysreg_case(SYS_ID_PFR1_EL1);
        read_sysreg_case(SYS_ID_PFR2_EL1);
        read_sysreg_case(SYS_ID_DFR0_EL1);
        read_sysreg_case(SYS_ID_DFR1_EL1);
        read_sysreg_case(SYS_ID_MMFR0_EL1);
        read_sysreg_case(SYS_ID_MMFR1_EL1);
        read_sysreg_case(SYS_ID_MMFR2_EL1);
        read_sysreg_case(SYS_ID_MMFR3_EL1);
        read_sysreg_case(SYS_ID_MMFR4_EL1);
        read_sysreg_case(SYS_ID_MMFR5_EL1);
        read_sysreg_case(SYS_ID_ISAR0_EL1);
        read_sysreg_case(SYS_ID_ISAR1_EL1);
        read_sysreg_case(SYS_ID_ISAR2_EL1);
        read_sysreg_case(SYS_ID_ISAR3_EL1);
        read_sysreg_case(SYS_ID_ISAR4_EL1);
        read_sysreg_case(SYS_ID_ISAR5_EL1);
        read_sysreg_case(SYS_ID_ISAR6_EL1);
        read_sysreg_case(SYS_MVFR0_EL1);
        read_sysreg_case(SYS_MVFR1_EL1);
        read_sysreg_case(SYS_MVFR2_EL1);

        /* AArch64 ID registers */
        read_sysreg_case(SYS_ID_AA64PFR0_EL1);
        read_sysreg_case(SYS_ID_AA64PFR1_EL1);
        read_sysreg_case(SYS_ID_AA64PFR2_EL1);
        read_sysreg_case(SYS_ID_AA64ZFR0_EL1);
        read_sysreg_case(SYS_ID_AA64SMFR0_EL1);
        read_sysreg_case(SYS_ID_AA64FPFR0_EL1);
        read_sysreg_case(SYS_ID_AA64DFR0_EL1);
        read_sysreg_case(SYS_ID_AA64DFR1_EL1);
        read_sysreg_case(SYS_ID_AA64MMFR0_EL1);
        read_sysreg_case(SYS_ID_AA64MMFR1_EL1);
        read_sysreg_case(SYS_ID_AA64MMFR2_EL1);
        read_sysreg_case(SYS_ID_AA64MMFR3_EL1);
        read_sysreg_case(SYS_ID_AA64MMFR4_EL1);
        read_sysreg_case(SYS_ID_AA64ISAR0_EL1);
        read_sysreg_case(SYS_ID_AA64ISAR1_EL1);
        read_sysreg_case(SYS_ID_AA64ISAR2_EL1);
        read_sysreg_case(SYS_ID_AA64ISAR3_EL1);

        /* EL0-accessible registers */
        read_sysreg_case(SYS_CNTFRQ_EL0);
        read_sysreg_case(SYS_CTR_EL0);
        read_sysreg_case(SYS_DCZID_EL0);

        default:
                BUG();
                return 0;
        }

        /* Apply the override (if any) on top of the value just read */
        regp  = get_arm64_ftr_reg(sys_id);
        if (regp) {
                val &= ~regp->override->mask;
                val |= (regp->override->val & regp->override->mask);
        }

        return val;
}

#include <linux/irqchip/arm-gic-v3.h>

/*
 * Capability match callback that matches unconditionally; both arguments
 * are ignored.
 */
static bool has_always(const struct arm64_cpu_capabilities *entry, int scope)
{
        return true;
}

/*
 * Check whether the field described by @entry, extracted from the register
 * value @reg, lies within [min_field_value, max_field_value].
 *
 * The bounds are shifted into the field position and extracted with the same
 * helper (and the same width and signedness) as the register field, so that
 * all three values are compared in the same representation.
 */
static bool
feature_matches(u64 reg, const struct arm64_cpu_capabilities *entry)
{
        u64 lo_reg, hi_reg;
        int val, lo, hi;

        lo_reg = entry->min_field_value;
        lo_reg <<= entry->field_pos;
        hi_reg = entry->max_field_value;
        hi_reg <<= entry->field_pos;

        val = cpuid_feature_extract_field_width(reg, entry->field_pos,
                                                entry->field_width,
                                                entry->sign);
        lo = cpuid_feature_extract_field_width(lo_reg, entry->field_pos,
                                               entry->field_width,
                                               entry->sign);
        hi = cpuid_feature_extract_field_width(hi_reg, entry->field_pos,
                                               entry->field_width,
                                               entry->sign);

        if (val < lo)
                return false;

        return val <= hi;
}

/*
 * Read the register named by @entry for the requested @scope: the sanitised
 * system-wide value for SCOPE_SYSTEM, otherwise the value on the current
 * CPU. Warns if a local-CPU read is attempted while preemptible.
 */
static u64
read_scoped_sysreg(const struct arm64_cpu_capabilities *entry, int scope)
{
        WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible());

        return (scope == SCOPE_SYSTEM) ?
                read_sanitised_ftr_reg(entry->sys_reg) :
                __read_sysreg_by_encoding(entry->sys_reg);
}

static bool
has_user_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope)
{
        int mask;
        struct arm64_ftr_reg *regp;
        u64 val = read_scoped_sysreg(entry, scope);

        regp = get_arm64_ftr_reg(entry->sys_reg);
        if (!regp)
                return false;

        mask = cpuid_feature_extract_unsigned_field_width(regp->user_mask,
                                                          entry->field_pos,
                                                          entry->field_width);
        if (!mask)
                return false;

        return feature_matches(val, entry);
}

/*
 * Generic match callback: read the register for @scope and test the field
 * described by @entry against its allowed range.
 */
static bool
has_cpuid_feature(const struct arm64_cpu_capabilities *entry, int scope)
{
        return feature_matches(read_scoped_sysreg(entry, scope), entry);
}

/*
 * Return the mask of CPUs on which 32-bit EL0 can be used: no CPUs when the
 * system does not support 32-bit EL0, cpu_32bit_el0_mask when the
 * mismatched-32bit static key is enabled, and all possible CPUs otherwise.
 */
const struct cpumask *system_32bit_el0_cpumask(void)
{
        const struct cpumask *mask = cpu_possible_mask;

        if (!system_supports_32bit_el0())
                mask = cpu_none_mask;
        else if (static_branch_unlikely(&arm64_mismatched_32bit_el0))
                mask = cpu_32bit_el0_mask;

        return mask;
}

/*
 * Fallback CPU mask for task @p: whatever __task_cpu_possible_mask() yields
 * when handed the HK_TYPE_TICK housekeeping mask.
 */
const struct cpumask *task_cpu_fallback_mask(struct task_struct *p)
{
        const struct cpumask *hk_mask = housekeeping_cpumask(HK_TYPE_TICK);

        return __task_cpu_possible_mask(p, hk_mask);
}

/*
 * Early handler for the "allow_mismatched_32bit_el0" command line option.
 * The mere presence of the option enables the behaviour; its argument @str
 * is ignored. Always returns 0.
 */
static int __init parse_32bit_el0_param(char *str)
{
        allow_mismatched_32bit_el0 = true;
        return 0;
}
early_param("allow_mismatched_32bit_el0", parse_32bit_el0_param);

/*
 * sysfs show callback for the "aarch32_el0" attribute: prints the mask
 * returned by system_32bit_el0_cpumask() as a CPU list followed by a
 * newline.
 */
static ssize_t aarch32_el0_show(struct device *dev,
                                struct device_attribute *attr, char *buf)
{
        const struct cpumask *el0_32bit_cpus;

        el0_32bit_cpus = system_32bit_el0_cpumask();

        return sysfs_emit(buf, "%*pbl\n", cpumask_pr_args(el0_32bit_cpus));
}
static const DEVICE_ATTR_RO(aarch32_el0);

/*
 * Expose the "aarch32_el0" attribute on the CPU subsystem's root device,
 * but only when mismatched 32-bit EL0 support was enabled.
 *
 * Returns 0 when nothing had to be done (option not set, or no root
 * device), otherwise the result of device_create_file().
 */
static int __init aarch32_el0_sysfs_init(void)
{
        struct device *dev_root;
        int ret;

        if (!allow_mismatched_32bit_el0)
                return 0;

        dev_root = bus_get_dev_root(&cpu_subsys);
        if (!dev_root)
                return 0;

        ret = device_create_file(dev_root, &dev_attr_aarch32_el0);
        put_device(dev_root);

        return ret;
}
device_initcall(aarch32_el0_sysfs_init);

/*
 * Match callback for 32-bit EL0 support. When the ID register field does
 * not advertise it, the capability is still reported if mismatched 32-bit
 * EL0 support was allowed on the command line.
 */
static bool has_32bit_el0(const struct arm64_cpu_capabilities *entry, int scope)
{
        if (has_cpuid_feature(entry, scope)) {
                if (scope == SCOPE_SYSTEM)
                        pr_info("detected: 32-bit EL0 Support\n");

                return true;
        }

        return allow_mismatched_32bit_el0;
}

static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry, int scope)
{
        bool has_sre;

        if (!has_cpuid_feature(entry, scope))
                return false;

        has_sre = gic_enable_sre();
        if (!has_sre)
                pr_warn_once("%s present but disabled by higher exception level\n",
                             entry->desc);

        return has_sre;
}

/*
 * Match callback testing CTR_EL0.IDC. For SCOPE_SYSTEM the sanitised
 * CTR_EL0 value is used, otherwise the effective cache type of the current
 * CPU.
 */
static bool has_cache_idc(const struct arm64_cpu_capabilities *entry,
                          int scope)
{
        u64 ctr = (scope == SCOPE_SYSTEM) ?
                        arm64_ftr_reg_ctrel0.sys_val :
                        read_cpuid_effective_cachetype();

        return ctr & BIT(CTR_EL0_IDC_SHIFT);
}

/*
 * Enable callback: clear SCTLR_EL1.UCT on CPUs whose raw CTR_EL0 does not
 * have IDC set.
 */
static void cpu_emulate_effective_ctr(const struct arm64_cpu_capabilities *__unused)
{
        /* Raw CTR_EL0 already reports IDC: nothing to emulate */
        if (read_cpuid_cachetype() & BIT(CTR_EL0_IDC_SHIFT))
                return;

        /*
         * If the CPU exposes raw CTR_EL0.IDC = 0, while effectively
         * CTR_EL0.IDC = 1 (from CLIDR values), we need to trap accesses
         * to the CTR_EL0 on this CPU and emulate it with the real/safe
         * value.
         */
        sysreg_clear_set(sctlr_el1, SCTLR_EL1_UCT, 0);
}

/*
 * Match callback testing CTR_EL0.DIC. For SCOPE_SYSTEM the sanitised
 * CTR_EL0 value is used, otherwise the raw cache type of the current CPU.
 */
static bool has_cache_dic(const struct arm64_cpu_capabilities *entry,
                          int scope)
{
        u64 ctr = (scope == SCOPE_SYSTEM) ?
                        arm64_ftr_reg_ctrel0.sys_val :
                        read_cpuid_cachetype();

        return ctr & BIT(CTR_EL0_DIC_SHIFT);
}

/*
 * Match callback for CNP. The feature is refused in a kdump kernel and when
 * the NVIDIA Carmel CNP workaround capability is set; otherwise the ID
 * register decides.
 */
static bool __maybe_unused
has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope)
{
        /*
         * Kdump isn't guaranteed to power-off all secondary CPUs, CNP
         * may share TLB entries with a CPU stuck in the crashed
         * kernel.
         */
        if (is_kdump_kernel() ||
            cpus_have_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP))
                return false;

        return has_cpuid_feature(entry, scope);
}

/* Cleared by unmap_kernel_at_el0() as soon as one CPU is not meltdown-safe */
static bool __meltdown_safe = true;
static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */

/*
 * Match callback deciding whether kernel page table isolation (KPTI) is
 * needed.
 *
 * A CPU counts as meltdown-safe if its MIDR is in kpti_safe_list or if the
 * ID register field described by @entry says so. The result can then be
 * overridden through __kpti_forced, which is set (in this order of
 * precedence) by the Cavium 27456 erratum, by the "kpti" command line
 * option or KASLR, and by mitigations=off.
 *
 * Side effects: may clear __meltdown_safe and may set __kpti_forced.
 *
 * Returns false when CONFIG_UNMAP_KERNEL_AT_EL0 is disabled, the forced
 * setting when there is one, and otherwise whether this CPU is not
 * meltdown-safe.
 */
static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry,
                                int scope)
{
        /* List of CPUs that are not vulnerable and don't need KPTI */
        static const struct midr_range kpti_safe_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
                MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
                MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A73),
                MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
                MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_GOLD),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
                { /* sentinel */ }
        };
        /* Reason reported in the "forced" message; default: command line */
        char const *str = "kpti command line option";
        bool meltdown_safe;

        meltdown_safe = is_midr_in_range_list(kpti_safe_list);

        /* Defer to CPU feature registers */
        if (has_cpuid_feature(entry, scope))
                meltdown_safe = true;

        if (!meltdown_safe)
                __meltdown_safe = false;

        /*
         * For reasons that aren't entirely clear, enabling KPTI on Cavium
         * ThunderX leads to apparent I-cache corruption of kernel text, which
         * ends as well as you might imagine. Don't even try. We cannot rely
         * on the cpus_have_*cap() helpers here to detect the CPU erratum
         * because cpucap detection order may change. However, since we know
         * affected CPUs are always in a homogeneous configuration, it is
         * safe to rely on this_cpu_has_cap() here.
         */
        if (this_cpu_has_cap(ARM64_WORKAROUND_CAVIUM_27456)) {
                str = "ARM64_WORKAROUND_CAVIUM_27456";
                __kpti_forced = -1;
        }

        /* Useful for KASLR robustness */
        if (kaslr_enabled() && kaslr_requires_kpti()) {
                /* Only if nothing above (or the command line) forced it */
                if (!__kpti_forced) {
                        str = "KASLR";
                        __kpti_forced = 1;
                }
        }

        if (cpu_mitigations_off() && !__kpti_forced) {
                str = "mitigations=off";
                __kpti_forced = -1;
        }

        if (!IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) {
                pr_info_once("kernel page table isolation disabled by kernel configuration\n");
                return false;
        }

        /* Forced? */
        if (__kpti_forced) {
                pr_info_once("kernel page table isolation forced %s by %s\n",
                             __kpti_forced > 0 ? "ON" : "OFF", str);
                return __kpti_forced > 0;
        }

        return !meltdown_safe;
}

static bool has_nv1(const struct arm64_cpu_capabilities *entry, int scope)
{
        /*
         * Although the Apple M2 family appears to support NV1, the
         * PTW barfs on the nVHE EL2 S1 page table format. Pretend
         * that it doesn't support NV1 at all.
         */
        static const struct midr_range nv1_ni_list[] = {
                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD),
                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE),
                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_PRO),
                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_PRO),
                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD_MAX),
                MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE_MAX),
                {}
        };

        return (__system_matches_cap(ARM64_HAS_NESTED_VIRT) &&
                !(has_cpuid_feature(entry, scope) ||
                  is_midr_in_range_list(nv1_ni_list)));
}

#if defined(ID_AA64MMFR0_EL1_TGRAN_LPA2) && defined(ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2)
/* True if the stage 1 TGRAN field of @mmfr0 reports the LPA2 value. */
static bool has_lpa2_at_stage1(u64 mmfr0)
{
        unsigned int s1_tgran =
                cpuid_feature_extract_unsigned_field(mmfr0,
                                        ID_AA64MMFR0_EL1_TGRAN_SHIFT);

        return s1_tgran == ID_AA64MMFR0_EL1_TGRAN_LPA2;
}

/* True if the stage 2 TGRAN field of @mmfr0 reports the LPA2 value. */
static bool has_lpa2_at_stage2(u64 mmfr0)
{
        unsigned int s2_tgran =
                cpuid_feature_extract_unsigned_field(mmfr0,
                                        ID_AA64MMFR0_EL1_TGRAN_2_SHIFT);

        return s2_tgran == ID_AA64MMFR0_EL1_TGRAN_2_SUPPORTED_LPA2;
}

/*
 * Match callback for LPA2: the sanitised ID_AA64MMFR0_EL1 must report LPA2
 * at both stage 1 and stage 2. @entry and @scope are not used.
 */
static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope)
{
        const u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);

        if (!has_lpa2_at_stage1(mmfr0))
                return false;

        return has_lpa2_at_stage2(mmfr0);
}
#else
/* The LPA2 TGRAN encodings are not defined for this build: never matches. */
static bool has_lpa2(const struct arm64_cpu_capabilities *entry, int scope)
{
        return false;
}
#endif

#ifdef CONFIG_HW_PERF_EVENTS
/*
 * Match callback for PMUv3, based on the PMUVer field of the sanitised
 * ID_AA64DFR0_EL1. @entry and @scope are not used.
 */
static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope)
{
        unsigned int pmuver;

        /*
         * PMUVer follows the standard ID scheme for an unsigned field with the
         * exception of 0xF (IMP_DEF) which is treated specially and implies
         * FEAT_PMUv3 is not implemented.
         *
         * See DDI0487L.a D24.1.3.2 for more details.
         */
        pmuver = cpuid_feature_extract_unsigned_field(
                        read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1),
                        ID_AA64DFR0_EL1_PMUVer_SHIFT);

        return pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF &&
               pmuver >= ID_AA64DFR0_EL1_PMUVer_IMP;
}
#endif

#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
/*
 * Virtual address at which the temporary KPTI page tables are mapped: the
 * last (1UL << PMD_SHIFT)-sized, PMD-aligned region of the address space.
 */
#define KPTI_NG_TEMP_VA                (-(1UL << PMD_SHIFT))

/* Defined outside this file; builds the temporary page table hierarchy. */
extern
void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
                             phys_addr_t size, pgprot_t prot,
                             phys_addr_t (*pgtable_alloc)(int), int flags);

/* Allocation cursor for kpti_ng_pgd_alloc(); moves downwards page by page. */
static phys_addr_t __initdata kpti_ng_temp_alloc;

/*
 * Page table allocator for the temporary KPTI hierarchy: moves the
 * kpti_ng_temp_alloc cursor down by one page and returns the new position.
 * @shift is ignored.
 */
static phys_addr_t __init kpti_ng_pgd_alloc(int shift)
{
        phys_addr_t page = kpti_ng_temp_alloc - PAGE_SIZE;

        kpti_ng_temp_alloc = page;

        return page;
}

/*
 * stop_machine() callback that rewrites the kernel page tables to use
 * non-global (nG) mappings.
 *
 * CPU 0 allocates a zeroed block of pages, builds a minimal temporary page
 * table hierarchy in it, and frees it again once the remapping is done; it
 * also sets arm64_use_ng_mappings. Every CPU running this callback installs
 * the idmap, calls idmap_kpti_install_ng_mappings() through its physical
 * address, and uninstalls the idmap again.
 *
 * @__unused: ignored.
 *
 * Always returns 0.
 *
 * NOTE(review): the result of __get_free_pages() is used without a check
 * for allocation failure -- confirm that this is acceptable here.
 */
static int __init __kpti_install_ng_mappings(void *__unused)
{
        typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
        extern kpti_remap_fn idmap_kpti_install_ng_mappings;
        kpti_remap_fn *remap_fn;

        int cpu = smp_processor_id();
        int levels = CONFIG_PGTABLE_LEVELS;
        /* Sized from the configured level count, before the adjustment below */
        int order = order_base_2(levels);
        u64 kpti_ng_temp_pgd_pa = 0;
        pgd_t *kpti_ng_temp_pgd;
        u64 alloc = 0;

        /* Drop the top level if it is configured but not enabled at runtime */
        if (levels == 5 && !pgtable_l5_enabled())
                levels = 4;
        else if (levels == 4 && !pgtable_l4_enabled())
                levels = 3;

        /* The remap routine is invoked via its physical address (idmap) */
        remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);

        if (!cpu) {
                alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
                /* The PGD lives in the last of the 'levels' pages */
                kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE);
                kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd);

                //
                // Create a minimal page table hierarchy that permits us to map
                // the swapper page tables temporarily as we traverse them.
                //
                // The physical pages are laid out as follows:
                //
                // +--------+-/-------+-/------ +-/------ +-\\\--------+
                // :  PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[]  :
                // +--------+-\-------+-\------ +-\------ +-///--------+
                //      ^
                // The first page is mapped into this hierarchy at a PMD_SHIFT
                // aligned virtual address, so that we can manipulate the PTE
                // level entries while the mapping is active. The first entry
                // covers the PTE[] page itself, the remaining entries are free
                // to be used as an ad-hoc fixmap.
                //
                create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc),
                                        KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
                                        kpti_ng_pgd_alloc, 0);
        }

        /* On CPUs other than 0, kpti_ng_temp_pgd_pa is passed as 0 */
        cpu_install_idmap();
        remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA);
        cpu_uninstall_idmap();

        if (!cpu) {
                free_pages(alloc, order);
                arm64_use_ng_mappings = true;
        }

        return 0;
}

/*
 * Kick off the rewrite of the kernel page tables to non-global (nG)
 * mappings by handing __kpti_install_ng_mappings to stop_machine() for
 * the CPUs in cpu_online_mask.
 */
static void __init kpti_install_ng_mappings(void)
{
        /*
         * Nothing to do unless KPTI is going to be used. Even then the
         * rewrite is skipped when arm64_use_ng_mappings is already set:
         * either we've done it already, or KASLR is enabled and no global
         * mappings were ever created.
         */
        if (arm64_kernel_unmapped_at_el0() && !arm64_use_ng_mappings)
                stop_machine(__kpti_install_ng_mappings, NULL,
                             cpu_online_mask);
}

#else
/*
 * Stub used when CONFIG_UNMAP_KERNEL_AT_EL0 is not set: there are no
 * page tables to rewrite, so this does nothing.
 */
static inline void kpti_install_ng_mappings(void)
{
}
#endif        /* CONFIG_UNMAP_KERNEL_AT_EL0 */

/*
 * cpu_enable callback for the KPTI capability.
 *
 * If this CPU's exception vector is still the default 'vectors', replace
 * it with the one returned for EL1_VECTOR_KPTI. Any other value is left
 * untouched. @cap is not used.
 */
static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap)
{
        const char *kpti_vec;

        /* Only the default vector gets replaced */
        if (__this_cpu_read(this_cpu_vector) != vectors)
                return;

        kpti_vec = arm64_get_bp_hardening_vector(EL1_VECTOR_KPTI);
        __this_cpu_write(this_cpu_vector, kpti_vec);
}

/*
 * Handle the "kpti" early parameter.
 *
 * @str is parsed as a boolean with kstrtobool(). On success __kpti_forced
 * is set to 1 (true) or -1 (false) and 0 is returned; on a parse failure
 * the kstrtobool() error is returned and __kpti_forced is left alone.
 */
static int __init parse_kpti(char *str)
{
        bool on;
        int err;

        err = kstrtobool(str, &on);
        if (err)
                return err;

        if (on)
                __kpti_forced = 1;
        else
                __kpti_forced = -1;

        return 0;
}
early_param("kpti", parse_kpti);

#ifdef CONFIG_ARM64_HW_AFDBM
/* CPUs on which cpu_enable_hw_dbm() has turned on hardware DBM. */
static struct cpumask dbm_cpus __read_mostly;

/*
 * Set TCR_HD in this CPU's TCR_EL1, then issue an ISB and flush the
 * local TLB.
 */
static inline void __cpu_enable_hw_dbm(void)
{
        u64 tcr;

        tcr = read_sysreg(tcr_el1);
        tcr |= TCR_HD;
        write_sysreg(tcr, tcr_el1);

        isb();
        local_flush_tlb_all();
}

/*
 * Report whether is_midr_in_range_list() finds a match in the list of
 * parts whose hardware DBM support is broken.
 */
static bool cpu_has_broken_dbm(void)
{
        /* Affected MIDR ranges, each gated by its erratum config option. */
        static const struct midr_range broken_dbm_midrs[] = {
#ifdef CONFIG_ARM64_ERRATUM_1024718
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
                /* Kryo4xx Silver (rdpe => r1p0) */
                MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe),
#endif
#ifdef CONFIG_ARM64_ERRATUM_2051678
                MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2),
#endif
                {},
        };

        return is_midr_in_range_list(broken_dbm_midrs);
}

/*
 * DBM is usable on this CPU when @cap matches at SCOPE_LOCAL_CPU and the
 * CPU is not on the broken-DBM list. The list is only consulted when the
 * feature check succeeds.
 */
static bool cpu_can_use_dbm(const struct arm64_cpu_capabilities *cap)
{
        if (!has_cpuid_feature(cap, SCOPE_LOCAL_CPU))
                return false;

        return !cpu_has_broken_dbm();
}

/*
 * cpu_enable callback for ARM64_HW_DBM: when the calling CPU can use DBM,
 * turn it on and record the CPU in dbm_cpus. Otherwise do nothing.
 */
static void cpu_enable_hw_dbm(struct arm64_cpu_capabilities const *cap)
{
        if (!cpu_can_use_dbm(cap))
                return;

        __cpu_enable_hw_dbm();
        cpumask_set_cpu(smp_processor_id(), &dbm_cpus);
}

/*
 * Match callback for ARM64_HW_DBM. Always returns true; neither @cap nor
 * the scope argument is looked at.
 */
static bool has_hw_dbm(const struct arm64_cpu_capabilities *cap,
                       int __unused)
{
        /*
         * DBM is a non-conflicting feature. i.e, the kernel can safely
         * run a mix of CPUs with and without the feature. So, we
         * unconditionally enable the capability to allow any late CPU
         * to use the feature. We only enable the control bits on the
         * CPU, if it is supported.
         */

        return true;
}

#endif

#ifdef CONFIG_ARM64_AMU_EXTN

/*
 * The "amu_cpus" cpumask only signals that the CPU implementation for the
 * flagged CPUs supports the Activity Monitors Unit (AMU) but does not provide
 * information regarding all the events that it supports. When a CPU bit is
 * set in the cpumask, the user of this feature can only rely on the presence
 * of the 4 fixed counters for that CPU. But this does not guarantee that the
 * counters are enabled or access to these counters is enabled by code
 * executed at higher exception levels (firmware).
 */
static struct cpumask amu_cpus __read_mostly;

/* Return true if @cpu is set in the amu_cpus mask. */
bool cpu_has_amu_feat(int cpu)
{
        return cpumask_test_cpu(cpu, &amu_cpus);
}

/*
 * Return a CPU picked from amu_cpus by cpumask_any().
 *
 * NOTE(review): presumably yields a value >= nr_cpu_ids when the mask is
 * empty, in line with the !CONFIG_ARM64_AMU_EXTN stub - confirm against
 * cpumask_any().
 */
int get_cpu_with_amu_feat(void)
{
        return cpumask_any(&amu_cpus);
}

/*
 * cpu_enable callback for ARM64_HAS_AMU_EXTN.
 *
 * When @cap matches on the calling CPU, mark that CPU in amu_cpus and,
 * unless the CPU has ARM64_WORKAROUND_2457168, refresh the frequency
 * counter reference values.
 */
static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap)
{
        if (!has_cpuid_feature(cap, SCOPE_LOCAL_CPU))
                return;

        cpumask_set_cpu(smp_processor_id(), &amu_cpus);

        /* 0 reference values signal broken/disabled counters */
        if (this_cpu_has_cap(ARM64_WORKAROUND_2457168))
                return;

        update_freq_counters_refs();
}

/*
 * Match callback for ARM64_HAS_AMU_EXTN. Always returns true; neither
 * @cap nor the scope argument is looked at.
 */
static bool has_amu(const struct arm64_cpu_capabilities *cap,
                    int __unused)
{
        /*
         * The AMU extension is a non-conflicting feature: the kernel can
         * safely run a mix of CPUs with and without support for the
         * activity monitors extension. Therefore, unconditionally enable
         * the capability to allow any late CPU to use the feature.
         *
         * With this feature unconditionally enabled, the cpu_enable
         * function will be called for all CPUs that match the criteria,
         * including secondary and hotplugged, marking this feature as
         * present on that respective CPU.
         */

        return true;
}
#else
/*
 * Variant built without CONFIG_ARM64_AMU_EXTN: there is no amu_cpus mask
 * to pick from, so always return nr_cpu_ids.
 */
int get_cpu_with_amu_feat(void)
{
        return nr_cpu_ids;
}
#endif

/*
 * Match callback for ARM64_HAS_VIRT_HOST_EXTN: returns whatever
 * is_kernel_in_hyp_mode() reports. @entry and the scope are ignored.
 */
static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused)
{
        return is_kernel_in_hyp_mode();
}

/*
 * cpu_enable callback for ARM64_HAS_VIRT_HOST_EXTN: copy register values
 * that aren't redirected by hardware.
 */
static void cpu_copy_el2regs(const struct arm64_cpu_capabilities *__unused)
{
        /*
         * Once the alternatives for this capability have been applied,
         * freshly-onlined CPUs set tpidr_el2 themselves and there is
         * nothing left to do here.
         */
        if (alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN))
                return;

        /*
         * Before code patching only tpidr_el1 has been set, so every CPU
         * has to mirror that value into tpidr_el2 before the code is
         * patched.
         */
        write_sysreg(read_sysreg(tpidr_el1), tpidr_el2);
}

/*
 * Match callback for ARM64_HAS_NESTED_VIRT.
 *
 * Returns false straight away unless the KVM mode is KVM_MODE_NV. In NV
 * mode the result is that of cpucap_multi_entry_cap_matches() for @cap
 * and @scope; a warning naming the capability is printed when that check
 * fails.
 */
static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap,
                                    int scope)
{
        if (kvm_get_mode() != KVM_MODE_NV)
                return false;

        if (cpucap_multi_entry_cap_matches(cap, scope))
                return true;

        pr_warn("unavailable: %s\n", cap->desc);
        return false;
}

/*
 * Match callback: returns the result of testing the
 * ARM64_SW_FEATURE_OVERRIDE_HVHE software feature override. @entry and
 * the scope are ignored.
 */
static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
                          int __unused)
{
        return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
}

#ifdef CONFIG_ARM64_PAN
/*
 * cpu_enable callback for ARM64_HAS_PAN. Warns once if called from
 * interrupt context, then updates SCTLR_EL1 and PSTATE.PAN on the
 * calling CPU. The capability argument is not used.
 */
static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused)
{
        /*
         * We modify PSTATE. This won't work from irq context as the PSTATE
         * is discarded once we return from the exception.
         */
        WARN_ON_ONCE(in_interrupt());

        /* SCTLR_EL1_SPAN is passed as the clear mask; nothing is set. */
        sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0);
        set_pstate_pan(1);
}
#endif /* CONFIG_ARM64_PAN */

#ifdef CONFIG_ARM64_RAS_EXTN
/*
 * cpu_enable callback for ARM64_HAS_RAS_EXTN: write 0 to DISR_EL1 on the
 * calling CPU. The capability argument is not used.
 */
static void cpu_clear_disr(const struct arm64_cpu_capabilities *__unused)
{
        /* Firmware may have left a deferred SError in this register. */
        write_sysreg_s(0, SYS_DISR_EL1);
}
#endif /* CONFIG_ARM64_RAS_EXTN */

#ifdef CONFIG_ARM64_PTR_AUTH
/*
 * Match callback for the individual address authentication capabilities.
 *
 * With SCOPE_BOOT_CPU set in @scope, the sanitised field value alone is
 * compared against @entry->min_field_value. Otherwise the field read
 * directly from the calling CPU must both reach the minimum and be equal
 * to the sanitised (boot CPU) value.
 */
static bool has_address_auth_cpucap(const struct arm64_cpu_capabilities *entry, int scope)
{
        int sanitised_lvl, local_lvl;

        /* We don't expect to be called with SCOPE_SYSTEM */
        WARN_ON(scope == SCOPE_SYSTEM);

        /*
         * ptr-auth feature levels are not intercompatible with lower
         * levels, so every secondary CPU has to run at exactly the level
         * of the boot CPU. The boot CPU's level comes from the sanitised
         * register; secondaries are checked with a direct register read.
         *
         * The sanitised state is guaranteed to match the boot CPU because
         * a mismatched secondary CPU is parked before it gets a chance to
         * update the state, with the capability.
         */
        sanitised_lvl = cpuid_feature_extract_field(read_sanitised_ftr_reg(entry->sys_reg),
                                                    entry->field_pos, entry->sign);
        if (scope & SCOPE_BOOT_CPU)
                return sanitised_lvl >= entry->min_field_value;

        /* Secondary CPUs, checked with SCOPE_LOCAL_CPU scope */
        local_lvl = cpuid_feature_extract_field(__read_sysreg_by_encoding(entry->sys_reg),
                                                entry->field_pos, entry->sign);
        if (local_lvl < entry->min_field_value)
                return false;

        return local_lvl == sanitised_lvl;
}

/*
 * Match callback for ARM64_HAS_ADDRESS_AUTH: true when any of the IMP DEF,
 * QARMA5 or QARMA3 address authentication capabilities matches at @scope.
 * All three are always evaluated, in that order. @entry is not used.
 */
static bool has_address_auth_metacap(const struct arm64_cpu_capabilities *entry,
                                     int scope)
{
        bool any;

        any = has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_IMP_DEF], scope);
        any |= has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5], scope);
        any |= has_address_auth_cpucap(cpucap_ptrs[ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3], scope);

        return any;
}

/*
 * Match callback for ARM64_HAS_GENERIC_AUTH: true when the system matches
 * any of the IMP DEF, QARMA5 or QARMA3 generic authentication
 * capabilities. All three are always evaluated, in that order. Both
 * arguments are ignored.
 */
static bool has_generic_auth(const struct arm64_cpu_capabilities *entry,
                             int __unused)
{
        bool any;

        any = __system_matches_cap(ARM64_HAS_GENERIC_AUTH_IMP_DEF);
        any |= __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5);
        any |= __system_matches_cap(ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3);

        return any;
}
#endif /* CONFIG_ARM64_PTR_AUTH */

#ifdef CONFIG_ARM64_E0PD
/*
 * cpu_enable callback for ARM64_HAS_E0PD: set TCR_E0PD1 in TCR_EL1, but
 * only on CPUs that themselves have the capability. @cap is not used.
 */
static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap)
{
        if (!this_cpu_has_cap(ARM64_HAS_E0PD))
                return;

        sysreg_clear_set(tcr_el1, 0, TCR_E0PD1);
}
#endif /* CONFIG_ARM64_E0PD */

#ifdef CONFIG_ARM64_PSEUDO_NMI
/*
 * Match callback for ARM64_HAS_GIC_PRIO_MASKING: priority masking can be
 * used when ARM64_HAS_GIC_CPUIF_SYSREGS has been detected and
 * enable_pseudo_nmi is set. Both arguments are ignored.
 */
static bool can_use_gic_priorities(const struct arm64_cpu_capabilities *entry,
                                   int scope)
{
        /*
         * ARM64_HAS_GIC_CPUIF_SYSREGS is a boot CPU feature with a lower
         * index, so it has already been detected by the time we run.
         */
        BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_MASKING <= ARM64_HAS_GIC_CPUIF_SYSREGS);

        return cpus_have_cap(ARM64_HAS_GIC_CPUIF_SYSREGS) && enable_pseudo_nmi;
}

/*
 * Match callback for ARM64_HAS_GIC_PRIO_RELAXED_SYNC: true when priority
 * masking is in use and the PMHE bit read from ICC_CTLR_EL1 is clear.
 * Both arguments are ignored.
 */
static bool has_gic_prio_relaxed_sync(const struct arm64_cpu_capabilities *entry,
                                      int scope)
{
        /*
         * ARM64_HAS_GIC_PRIO_MASKING is a boot CPU feature with a lower
         * index, so it has already been detected by the time we run.
         */
        BUILD_BUG_ON(ARM64_HAS_GIC_PRIO_RELAXED_SYNC <= ARM64_HAS_GIC_PRIO_MASKING);

        /*
         * Without priority masking we never poke PMR_EL1, so there is no
         * synchronization of writes to relax. ICC_CTLR_EL1 might also not
         * be accessible in that case, so bail out before reading it.
         */
        if (!cpus_have_cap(ARM64_HAS_GIC_PRIO_MASKING))
                return false;

        /*
         * When Priority Mask Hint Enable (PMHE) == 0b0, PMR is not used as
         * a hint for interrupt distribution, a DSB is not necessary when
         * unmasking IRQs via PMR, and we can relax the barrier to a NOP.
         *
         * Linux itself doesn't use 1:N distribution, so has no need to
         * set PMHE. The only reason to have it set is if EL3 requires it
         * (and we can't change it).
         */
        return !(gic_read_ctlr() & ICC_CTLR_EL1_PMHE_MASK);
}
#endif

#ifdef CONFIG_ARM64_BTI
/*
 * cpu_enable callback for ARM64_BTI: set SCTLR_EL1_BT0 and SCTLR_EL1_BT1
 * in SCTLR_EL1 on the calling CPU, followed by an ISB. The capability
 * argument is not used.
 */
static void bti_enable(const struct arm64_cpu_capabilities *__unused)
{
        /*
         * Use of X16/X17 for tail-calls and trampolines that jump to
         * function entry points using BR is a requirement for
         * marking binaries with GNU_PROPERTY_AARCH64_FEATURE_1_BTI.
         * So, be strict and forbid other BRs using other registers to
         * jump onto a PACIxSP instruction:
         */
        sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_BT0 | SCTLR_EL1_BT1);
        isb();
}
#endif /* CONFIG_ARM64_BTI */

#ifdef CONFIG_ARM64_MTE
/*
 * cpu_enable callback for ARM64_MTE. On the calling CPU this sets
 * SCTLR_ELx_ATA and SCTLR_EL1_ATA0 in SCTLR_EL1, runs mte_cpu_setup(),
 * makes sure the zero page has its tags cleared, and finally calls
 * kasan_init_hw_tags_cpu(). @cap is not used.
 */
static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
{
        sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0);

        mte_cpu_setup();

        /*
         * Clear the tags in the zero page. This needs to be done via the
         * linear map which has the Tagged attribute.
         *
         * NOTE(review): try_page_mte_tagging() presumably succeeds for one
         * caller only, so the tags are cleared once rather than on every
         * CPU - confirm.
         */
        if (try_page_mte_tagging(ZERO_PAGE(0))) {
                mte_clear_page_tags(lm_alias(empty_zero_page));
                set_page_mte_tagged(ZERO_PAGE(0));
        }

        kasan_init_hw_tags_cpu();
}
#endif /* CONFIG_ARM64_MTE */

/*
 * Remove ID register fields from the user-visible masks when the matching
 * workaround capability is set:
 *   - ARM64_WORKAROUND_2658417 drops BF16 from ID_AA64ISAR1_EL1;
 *   - ARM64_WORKAROUND_SPECULATIVE_SSBS drops SSBS from ID_AA64PFR1_EL1.
 * A register that cannot be looked up is skipped.
 */
static void user_feature_fixup(void)
{
        struct arm64_ftr_reg *ftr;

        if (cpus_have_cap(ARM64_WORKAROUND_2658417)) {
                ftr = get_arm64_ftr_reg(SYS_ID_AA64ISAR1_EL1);
                if (ftr)
                        ftr->user_mask &= ~ID_AA64ISAR1_EL1_BF16_MASK;
        }

        if (cpus_have_cap(ARM64_WORKAROUND_SPECULATIVE_SSBS)) {
                ftr = get_arm64_ftr_reg(SYS_ID_AA64PFR1_EL1);
                if (ftr)
                        ftr->user_mask &= ~ID_AA64PFR1_EL1_SSBS_MASK;
        }
}

/*
 * With CONFIG_COMPAT, clear COMPAT_HWCAP2_AES from compat_elf_hwcap2 when
 * ARM64_WORKAROUND_1742098 is set. Without CONFIG_COMPAT this is a no-op.
 */
static void elf_hwcap_fixup(void)
{
#ifdef CONFIG_COMPAT
        if (cpus_have_cap(ARM64_WORKAROUND_1742098))
                compat_elf_hwcap2 &= ~COMPAT_HWCAP2_AES;
#endif /* CONFIG_COMPAT */
}

#ifdef CONFIG_KVM
/*
 * Match callback for ARM64_KVM_PROTECTED_MODE: true when the KVM mode is
 * KVM_MODE_PROTECTED. @entry and the scope are ignored.
 */
static bool is_kvm_protected_mode(const struct arm64_cpu_capabilities *entry, int __unused)
{
        return kvm_get_mode() == KVM_MODE_PROTECTED;
}
#endif /* CONFIG_KVM */

/*
 * cpu_enable callback for ARM64_HAS_TIDCP1 ("Trap EL0 IMPLEMENTATION
 * DEFINED functionality"): set SCTLR_EL1_TIDCP in SCTLR_EL1 on the
 * calling CPU. The capability argument is not used.
 */
static void cpu_trap_el0_impdef(const struct arm64_cpu_capabilities *__unused)
{
        sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_TIDCP);
}

/*
 * cpu_enable callback for ARM64_HAS_DIT ("Data independent timing
 * control"): calls set_pstate_dit(1) on the calling CPU. The capability
 * argument is not used.
 */
static void cpu_enable_dit(const struct arm64_cpu_capabilities *__unused)
{
        set_pstate_dit(1);
}

/*
 * Set SCTLR_EL1_MSCEn in SCTLR_EL1 on the calling CPU. The capability
 * argument is not used.
 *
 * NOTE(review): presumably this is what lets EL0 use the memory copy/set
 * (MOPS) instructions - confirm against the Arm ARM.
 */
static void cpu_enable_mops(const struct arm64_cpu_capabilities *__unused)
{
        sysreg_clear_set(sctlr_el1, 0, SCTLR_EL1_MSCEn);
}

#ifdef CONFIG_ARM64_POE
/*
 * Set TCR2_EL1_E0POE in TCR2_EL1 and CPACR_EL1_E0POE in CPACR_EL1 on the
 * calling CPU. The capability argument is not used.
 *
 * NOTE(review): presumably these bits enable permission overlays for EL0
 * and EL0 access to the overlay register - confirm against the Arm ARM.
 */
static void cpu_enable_poe(const struct arm64_cpu_capabilities *__unused)
{
        sysreg_clear_set(REG_TCR2_EL1, 0, TCR2_EL1_E0POE);
        sysreg_clear_set(CPACR_EL1, 0, CPACR_EL1_E0POE);
}
#endif

#ifdef CONFIG_ARM64_GCS
/*
 * Write GCSCRE0_EL1_nTR to GCSCRE0_EL1 on the calling CPU. This is a
 * plain write, so no other bit is left set in the register. The
 * capability argument is not used.
 */
static void cpu_enable_gcs(const struct arm64_cpu_capabilities *__unused)
{
        /* GCSPR_EL0 is always readable */
        write_sysreg_s(GCSCRE0_EL1_nTR, SYS_GCSCRE0_EL1);
}
#endif

/* Internal helper functions to match cpu capability type */
/* True when @cap's type carries ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU. */
static bool
cpucap_late_cpu_optional(const struct arm64_cpu_capabilities *cap)
{
        return (cap->type & ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU) != 0;
}

/* True when @cap's type carries ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU. */
static bool
cpucap_late_cpu_permitted(const struct arm64_cpu_capabilities *cap)
{
        return (cap->type & ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU) != 0;
}

/* True when @cap's type carries ARM64_CPUCAP_PANIC_ON_CONFLICT. */
static bool
cpucap_panic_on_conflict(const struct arm64_cpu_capabilities *cap)
{
        return (cap->type & ARM64_CPUCAP_PANIC_ON_CONFLICT) != 0;
}

/*
 * Match callback: MPAM counts as present when the ID register check for
 * @entry passes at @scope and MPAM1_EL1 has MPAMEN set. MPAM1_EL1 is only
 * read once the ID register check has succeeded.
 */
static bool
test_has_mpam(const struct arm64_cpu_capabilities *entry, int scope)
{
        /* The MPAMEN test checks firmware actually enabled MPAM on this cpu. */
        return has_cpuid_feature(entry, scope) &&
               (read_sysreg_s(SYS_MPAM1_EL1) & MPAM1_EL1_MPAMEN);
}

/*
 * cpu_enable callback: write 0 to MPAM1_EL1 on the calling CPU. @entry
 * is not used.
 */
static void
cpu_enable_mpam(const struct arm64_cpu_capabilities *entry)
{
        /*
         * Access by the kernel (at EL1) should use the reserved PARTID
         * which is configured unrestricted. This avoids priority-inversion
         * where latency sensitive tasks have to wait for a task that has
         * been throttled to release the lock.
         */
        write_sysreg_s(0, SYS_MPAM1_EL1);
}

/*
 * Match callback: true when the sanitised MPAMIDR_EL1 value has
 * MPAMIDR_EL1_HAS_HCR set. Both arguments are ignored.
 */
static bool
test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope)
{
        return (read_sanitised_ftr_reg(SYS_MPAMIDR_EL1) &
                MPAMIDR_EL1_HAS_HCR) != 0;
}

static const struct arm64_cpu_capabilities arm64_features[] = {
        {
                .capability = ARM64_ALWAYS_BOOT,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_always,
        },
        {
                .capability = ARM64_ALWAYS_SYSTEM,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_always,
        },
        {
                .desc = "GIC system register CPU interface",
                .capability = ARM64_HAS_GIC_CPUIF_SYSREGS,
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
                .matches = has_useable_gicv3_cpuif,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, GIC, IMP)
        },
        {
                .desc = "Enhanced Counter Virtualization",
                .capability = ARM64_HAS_ECV,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, IMP)
        },
        {
                .desc = "Enhanced Counter Virtualization (CNTPOFF)",
                .capability = ARM64_HAS_ECV_CNTPOFF,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, ECV, CNTPOFF)
        },
#ifdef CONFIG_ARM64_PAN
        {
                .desc = "Privileged Access Never",
                .capability = ARM64_HAS_PAN,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_pan,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, IMP)
        },
#endif /* CONFIG_ARM64_PAN */
#ifdef CONFIG_ARM64_EPAN
        {
                .desc = "Enhanced Privileged Access Never",
                .capability = ARM64_HAS_EPAN,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, PAN, PAN3)
        },
#endif /* CONFIG_ARM64_EPAN */
#ifdef CONFIG_ARM64_LSE_ATOMICS
        {
                .desc = "LSE atomic instructions",
                .capability = ARM64_HAS_LSE_ATOMICS,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP)
        },
#endif /* CONFIG_ARM64_LSE_ATOMICS */
        {
                .desc = "Virtualization Host Extensions",
                .capability = ARM64_HAS_VIRT_HOST_EXTN,
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
                .matches = runs_at_el2,
                .cpu_enable = cpu_copy_el2regs,
        },
        {
                .desc = "Nested Virtualization Support",
                .capability = ARM64_HAS_NESTED_VIRT,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_nested_virt_support,
                .match_list = (const struct arm64_cpu_capabilities []){
                        {
                                .matches = has_cpuid_feature,
                                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, NV, NV2)
                        },
                        {
                                .matches = has_cpuid_feature,
                                ARM64_CPUID_FIELDS(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY)
                        },
                        { /* Sentinel */ }
                },
        },
        {
                .capability = ARM64_HAS_32BIT_EL0_DO_NOT_USE,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_32bit_el0,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL0, AARCH32)
        },
#ifdef CONFIG_KVM
        {
                .desc = "32-bit EL1 Support",
                .capability = ARM64_HAS_32BIT_EL1,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, EL1, AARCH32)
        },
        {
                .desc = "Protected KVM",
                .capability = ARM64_KVM_PROTECTED_MODE,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = is_kvm_protected_mode,
        },
        {
                .desc = "HCRX_EL2 register",
                .capability = ARM64_HAS_HCX,
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HCX, IMP)
        },
#endif
        {
                .desc = "Kernel page table isolation (KPTI)",
                .capability = ARM64_UNMAP_KERNEL_AT_EL0,
                .type = ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE,
                .cpu_enable = cpu_enable_kpti,
                .matches = unmap_kernel_at_el0,
                /*
                 * The ID feature fields below are used to indicate that
                 * the CPU doesn't need KPTI. See unmap_kernel_at_el0 for
                 * more details.
                 */
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, CSV3, IMP)
        },
        {
                .capability = ARM64_HAS_FPSIMD,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_fpsimd,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, FP, IMP)
        },
#ifdef CONFIG_ARM64_PMEM
        {
                .desc = "Data cache clean to Point of Persistence",
                .capability = ARM64_HAS_DCPOP,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, IMP)
        },
        {
                .desc = "Data cache clean to Point of Deep Persistence",
                .capability = ARM64_HAS_DCPODP,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, DPB, DPB2)
        },
#endif
#ifdef CONFIG_ARM64_SVE
        {
                .desc = "Scalable Vector Extension",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_SVE,
                .cpu_enable = cpu_enable_sve,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, SVE, IMP)
        },
#endif /* CONFIG_ARM64_SVE */
#ifdef CONFIG_ARM64_RAS_EXTN
        {
                .desc = "RAS Extension Support",
                .capability = ARM64_HAS_RAS_EXTN,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_clear_disr,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, RAS, IMP)
        },
#endif /* CONFIG_ARM64_RAS_EXTN */
#ifdef CONFIG_ARM64_AMU_EXTN
        {
                .desc = "Activity Monitors Unit (AMU)",
                .capability = ARM64_HAS_AMU_EXTN,
                .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
                .matches = has_amu,
                .cpu_enable = cpu_amu_enable,
                .cpus = &amu_cpus,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, AMU, IMP)
        },
#endif /* CONFIG_ARM64_AMU_EXTN */
        {
                .desc = "Data cache clean to the PoU not required for I/D coherence",
                .capability = ARM64_HAS_CACHE_IDC,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cache_idc,
                .cpu_enable = cpu_emulate_effective_ctr,
        },
        {
                .desc = "Instruction cache invalidation not required for I/D coherence",
                .capability = ARM64_HAS_CACHE_DIC,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cache_dic,
        },
        {
                .desc = "Stage-2 Force Write-Back",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_HAS_STAGE2_FWB,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, FWB, IMP)
        },
        {
                .desc = "ARMv8.4 Translation Table Level",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_HAS_ARMv8_4_TTL,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, TTL, IMP)
        },
        {
                .desc = "TLB range maintenance instructions",
                .capability = ARM64_HAS_TLB_RANGE,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, TLB, RANGE)
        },
#ifdef CONFIG_ARM64_HW_AFDBM
        {
                .desc = "Hardware dirty bit management",
                .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE,
                .capability = ARM64_HW_DBM,
                .matches = has_hw_dbm,
                .cpu_enable = cpu_enable_hw_dbm,
                .cpus = &dbm_cpus,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, DBM)
        },
#endif
#ifdef CONFIG_ARM64_HAFT
        {
                .desc = "Hardware managed Access Flag for Table Descriptors",
                /*
                 * Contrary to the page/block access flag, the table access flag
                 * cannot be emulated in software (no access fault will occur).
                 * Therefore this should be used only if it's supported system
                 * wide.
                 */
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_HAFT,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, HAFDBS, HAFT)
        },
#endif
        {
                .desc = "CRC32 instructions",
                .capability = ARM64_HAS_CRC32,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, CRC32, IMP)
        },
        {
                .desc = "Speculative Store Bypassing Safe (SSBS)",
                .capability = ARM64_SSBS,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SSBS, IMP)
        },
#ifdef CONFIG_ARM64_CNP
        {
                .desc = "Common not Private translations",
                .capability = ARM64_HAS_CNP,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_useable_cnp,
                .cpu_enable = cpu_enable_cnp,
                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, CnP, IMP)
        },
#endif
        {
                .desc = "Speculation barrier (SB)",
                .capability = ARM64_HAS_SB,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, SB, IMP)
        },
#ifdef CONFIG_ARM64_PTR_AUTH
        {
                .desc = "Address authentication (architected QARMA5 algorithm)",
                .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA5,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_address_auth_cpucap,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, APA, PAuth)
        },
        {
                .desc = "Address authentication (architected QARMA3 algorithm)",
                .capability = ARM64_HAS_ADDRESS_AUTH_ARCH_QARMA3,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_address_auth_cpucap,
                ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, APA3, PAuth)
        },
        {
                .desc = "Address authentication (IMP DEF algorithm)",
                .capability = ARM64_HAS_ADDRESS_AUTH_IMP_DEF,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_address_auth_cpucap,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, API, PAuth)
        },
        {
                .capability = ARM64_HAS_ADDRESS_AUTH,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_address_auth_metacap,
        },
        {
                .desc = "Generic authentication (architected QARMA5 algorithm)",
                .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA5,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPA, IMP)
        },
        {
                .desc = "Generic authentication (architected QARMA3 algorithm)",
                .capability = ARM64_HAS_GENERIC_AUTH_ARCH_QARMA3,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, GPA3, IMP)
        },
        {
                .desc = "Generic authentication (IMP DEF algorithm)",
                .capability = ARM64_HAS_GENERIC_AUTH_IMP_DEF,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, GPI, IMP)
        },
        {
                .capability = ARM64_HAS_GENERIC_AUTH,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_generic_auth,
        },
#endif /* CONFIG_ARM64_PTR_AUTH */
#ifdef CONFIG_ARM64_PSEUDO_NMI
        {
                /*
                 * Depends on having GICv3
                 */
                .desc = "IRQ priority masking",
                .capability = ARM64_HAS_GIC_PRIO_MASKING,
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
                .matches = can_use_gic_priorities,
        },
        {
                /*
                 * Depends on ARM64_HAS_GIC_PRIO_MASKING
                 */
                .capability = ARM64_HAS_GIC_PRIO_RELAXED_SYNC,
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
                .matches = has_gic_prio_relaxed_sync,
        },
#endif
#ifdef CONFIG_ARM64_E0PD
        {
                .desc = "E0PD",
                .capability = ARM64_HAS_E0PD,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .cpu_enable = cpu_enable_e0pd,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, E0PD, IMP)
        },
#endif
        {
                .desc = "Random Number Generator",
                .capability = ARM64_HAS_RNG,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, RNDR, IMP)
        },
#ifdef CONFIG_ARM64_BTI
        {
                .desc = "Branch Target Identification",
                .capability = ARM64_BTI,
#ifdef CONFIG_ARM64_BTI_KERNEL
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
#else
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
#endif
                .matches = has_cpuid_feature,
                .cpu_enable = bti_enable,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, BT, IMP)
        },
#endif
#ifdef CONFIG_ARM64_MTE
        {
                .desc = "Memory Tagging Extension",
                .capability = ARM64_MTE,
                .type = ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_mte,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE2)
        },
        {
                .desc = "Asymmetric MTE Tag Check Fault",
                .capability = ARM64_MTE_ASYMM,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, MTE, MTE3)
        },
#endif /* CONFIG_ARM64_MTE */
        {
                .desc = "RCpc load-acquire (LDAPR)",
                .capability = ARM64_HAS_LDAPR,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR1_EL1, LRCPC, IMP)
        },
        {
                .desc = "Fine Grained Traps",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_HAS_FGT,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, IMP)
        },
        {
                .desc = "Fine Grained Traps 2",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_HAS_FGT2,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, FGT, FGT2)
        },
#ifdef CONFIG_ARM64_SME
        {
                .desc = "Scalable Matrix Extension",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_SME,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_sme,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, IMP)
        },
        /* FA64 should be sorted after the base SME capability */
        {
                .desc = "FA64",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_SME_FA64,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_fa64,
                ARM64_CPUID_FIELDS(ID_AA64SMFR0_EL1, FA64, IMP)
        },
        {
                .desc = "SME2",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_SME2,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_sme2,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, SME, SME2)
        },
#endif /* CONFIG_ARM64_SME */
        {
                .desc = "WFx with timeout",
                .capability = ARM64_HAS_WFXT,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, WFxT, IMP)
        },
        {
                .desc = "Trap EL0 IMPLEMENTATION DEFINED functionality",
                .capability = ARM64_HAS_TIDCP1,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_trap_el0_impdef,
                ARM64_CPUID_FIELDS(ID_AA64MMFR1_EL1, TIDCP1, IMP)
        },
        {
                .desc = "Data independent timing control (DIT)",
                .capability = ARM64_HAS_DIT,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_dit,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP)
        },
        {
                .desc = "Memory Copy and Memory Set instructions",
                .capability = ARM64_HAS_MOPS,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_mops,
                ARM64_CPUID_FIELDS(ID_AA64ISAR2_EL1, MOPS, IMP)
        },
        {
                .capability = ARM64_HAS_TCR2,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, TCRX, IMP)
        },
        {
                .desc = "Stage-1 Permission Indirection Extension (S1PIE)",
                .capability = ARM64_HAS_S1PIE,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1PIE, IMP)
        },
        {
                .desc = "VHE for hypervisor only",
                .capability = ARM64_KVM_HVHE,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = hvhe_possible,
        },
        {
                .desc = "Enhanced Virtualization Traps",
                .capability = ARM64_HAS_EVT,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, EVT, IMP)
        },
        {
                .desc = "52-bit Virtual Addressing for KVM (LPA2)",
                .capability = ARM64_HAS_LPA2,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_lpa2,
        },
        {
                .desc = "FPMR",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_HAS_FPMR,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_fpmr,
                ARM64_CPUID_FIELDS(ID_AA64PFR2_EL1, FPMR, IMP)
        },
#ifdef CONFIG_ARM64_VA_BITS_52
        {
                .capability = ARM64_HAS_VA52,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_cpuid_feature,
#ifdef CONFIG_ARM64_64K_PAGES
                .desc = "52-bit Virtual Addressing (LVA)",
                ARM64_CPUID_FIELDS(ID_AA64MMFR2_EL1, VARange, 52)
#else
                .desc = "52-bit Virtual Addressing (LPA2)",
#ifdef CONFIG_ARM64_4K_PAGES
                ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN4, 52_BIT)
#else
                ARM64_CPUID_FIELDS(ID_AA64MMFR0_EL1, TGRAN16, 52_BIT)
#endif
#endif
        },
#endif
        {
                .desc = "Memory Partitioning And Monitoring",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_MPAM,
                .matches = test_has_mpam,
                .cpu_enable = cpu_enable_mpam,
                ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, MPAM, 1)
        },
        {
                .desc = "Memory Partitioning And Monitoring Virtualisation",
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .capability = ARM64_MPAM_HCR,
                .matches = test_has_mpam_hcr,
        },
        {
                .desc = "NV1",
                .capability = ARM64_HAS_HCR_NV1,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_nv1,
                ARM64_CPUID_FIELDS_NEG(ID_AA64MMFR4_EL1, E2H0, NI_NV1)
        },
#ifdef CONFIG_ARM64_POE
        {
                .desc = "Stage-1 Permission Overlay Extension (S1POE)",
                .capability = ARM64_HAS_S1POE,
                .type = ARM64_CPUCAP_BOOT_CPU_FEATURE,
                .matches = has_cpuid_feature,
                .cpu_enable = cpu_enable_poe,
                ARM64_CPUID_FIELDS(ID_AA64MMFR3_EL1, S1POE, IMP)
        },
#endif
#ifdef CONFIG_ARM64_GCS
        {
                .desc = "Guarded Control Stack (GCS)",
                .capability = ARM64_HAS_GCS,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .cpu_enable = cpu_enable_gcs,
                .matches = has_cpuid_feature,
                ARM64_CPUID_FIELDS(ID_AA64PFR1_EL1, GCS, IMP)
        },
#endif
#ifdef CONFIG_HW_PERF_EVENTS
        {
                .desc = "PMUv3",
                .capability = ARM64_HAS_PMUV3,
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,
                .matches = has_pmuv3,
        },
#endif
        {},
};

/*
 * Initializer fragment for a struct arm64_cpu_capabilities entry: selects
 * has_user_cpuid_feature as the match callback and expands
 * ARM64_CPUID_FIELDS(reg, field, min_value) for the ID register field to test.
 *
 * No trailing comma is added after ARM64_CPUID_FIELDS() here; users in this
 * file place further designated initializers directly after this macro, so
 * ARM64_CPUID_FIELDS() presumably ends with its own separator (confirm
 * against its definition).
 */
#define HWCAP_CPUID_MATCH(reg, field, min_value)                        \
                .matches = has_user_cpuid_feature,                        \
                ARM64_CPUID_FIELDS(reg, field, min_value)

/*
 * Common initializer fragment shared by all hwcap entry macros:
 *   name     - string stored in .desc
 *   cap_type - value stored in .hwcap_type (e.g. CAP_HWCAP, CAP_COMPAT_HWCAP)
 *   cap      - value stored in .hwcap
 * Every hwcap entry is given the type ARM64_CPUCAP_SYSTEM_FEATURE. The
 * fragment ends with a comma so that more initializers can follow it.
 */
#define __HWCAP_CAP(name, cap_type, cap)                                        \
                .desc = name,                                                        \
                .type = ARM64_CPUCAP_SYSTEM_FEATURE,                                \
                .hwcap_type = cap_type,                                                \
                .hwcap = cap,                                                        \

/*
 * Complete table entry for a hwcap keyed on a single ID register field:
 * the hwcap is matched via has_user_cpuid_feature against (reg, field,
 * min_value). The stringified 'cap' argument is used as the description.
 */
#define HWCAP_CAP(reg, field, min_value, cap_type, cap)                \
        {                                                                        \
                __HWCAP_CAP(#cap, cap_type, cap)                                \
                HWCAP_CPUID_MATCH(reg, field, min_value)                 \
        }

/*
 * Complete table entry for a hwcap that is decided by a list of alternative
 * sub-entries: 'list' is stored in .match_list and evaluated through
 * cpucap_multi_entry_cap_matches. The stringified 'cap' argument is used as
 * the description.
 */
#define HWCAP_MULTI_CAP(list, cap_type, cap)                                        \
        {                                                                        \
                __HWCAP_CAP(#cap, cap_type, cap)                                \
                .matches = cpucap_multi_entry_cap_matches,                        \
                .match_list = list,                                                \
        }

/*
 * Complete table entry for a hwcap decided purely by a custom callback:
 * 'match' is installed as .matches and no ID register field is recorded.
 * The stringified 'cap' argument is used as the description.
 */
#define HWCAP_CAP_MATCH(match, cap_type, cap)                                        \
        {                                                                        \
                __HWCAP_CAP(#cap, cap_type, cap)                                \
                .matches = match,                                                \
        }

/*
 * Complete table entry combining an ID register field description with a
 * custom match callback.
 *
 * HWCAP_CPUID_MATCH() already assigns .matches; the '.matches = match'
 * initializer that follows designates the same member again and, being
 * last, is the one that takes effect. The order of these two lines is
 * therefore significant and must not be swapped.
 */
#define HWCAP_CAP_MATCH_ID(match, reg, field, min_value, cap_type, cap)                \
        {                                                                        \
                __HWCAP_CAP(#cap, cap_type, cap)                                \
                HWCAP_CPUID_MATCH(reg, field, min_value)                         \
                .matches = match,                                                \
        }

#ifdef CONFIG_ARM64_PTR_AUTH
/*
 * Alternative ID register checks for pointer authentication of addresses:
 * ID_AA64ISAR1_EL1.APA, ID_AA64ISAR2_EL1.APA3 and ID_AA64ISAR1_EL1.API,
 * each tested against the PAuth level. The list is terminated by an empty
 * entry and is referenced from arm64_elf_hwcaps via HWCAP_MULTI_CAP() for
 * KERNEL_HWCAP_PACA.
 * NOTE(review): presumably one matching entry is sufficient - confirm
 * against cpucap_multi_entry_cap_matches().
 */
static const struct arm64_cpu_capabilities ptr_auth_hwcap_addr_matches[] = {
        {
                HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, APA, PAuth)
        },
        {
                HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, APA3, PAuth)
        },
        {
                HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, API, PAuth)
        },
        {},
};

/*
 * Alternative ID register checks for generic pointer authentication:
 * ID_AA64ISAR1_EL1.GPA, ID_AA64ISAR2_EL1.GPA3 and ID_AA64ISAR1_EL1.GPI,
 * each tested against the IMP level. The list is terminated by an empty
 * entry and is referenced from arm64_elf_hwcaps via HWCAP_MULTI_CAP() for
 * KERNEL_HWCAP_PACG.
 * NOTE(review): presumably one matching entry is sufficient - confirm
 * against cpucap_multi_entry_cap_matches().
 */
static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = {
        {
                HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPA, IMP)
        },
        {
                HWCAP_CPUID_MATCH(ID_AA64ISAR2_EL1, GPA3, IMP)
        },
        {
                HWCAP_CPUID_MATCH(ID_AA64ISAR1_EL1, GPI, IMP)
        },
        {},
};
#endif

#ifdef CONFIG_ARM64_SVE
static bool has_sve_feature(const struct arm64_cpu_capabilities *cap, int scope)
{
        return system_supports_sve() && has_user_cpuid_feature(cap, scope);
}
#endif

/*
 * Table of native (CAP_HWCAP) ELF hwcaps. Each entry ties a KERNEL_HWCAP_*
 * value to the rule that decides whether it is present: an ID register
 * field with a minimum value (HWCAP_CAP), the same with a custom callback
 * (HWCAP_CAP_MATCH_ID), or a list of alternatives (HWCAP_MULTI_CAP).
 *
 * The table ends with an empty entry; walkers in this file stop at the
 * first entry whose .matches is NULL.
 */
static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
        HWCAP_CAP(ID_AA64ISAR0_EL1, AES, PMULL, CAP_HWCAP, KERNEL_HWCAP_PMULL),
        HWCAP_CAP(ID_AA64ISAR0_EL1, AES, AES, CAP_HWCAP, KERNEL_HWCAP_AES),
        HWCAP_CAP(ID_AA64ISAR0_EL1, SHA1, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA1),
        HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA256, CAP_HWCAP, KERNEL_HWCAP_SHA2),
        HWCAP_CAP(ID_AA64ISAR0_EL1, SHA2, SHA512, CAP_HWCAP, KERNEL_HWCAP_SHA512),
        HWCAP_CAP(ID_AA64ISAR0_EL1, CRC32, IMP, CAP_HWCAP, KERNEL_HWCAP_CRC32),
        HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, IMP, CAP_HWCAP, KERNEL_HWCAP_ATOMICS),
        HWCAP_CAP(ID_AA64ISAR0_EL1, ATOMIC, FEAT_LSE128, CAP_HWCAP, KERNEL_HWCAP_LSE128),
        HWCAP_CAP(ID_AA64ISAR0_EL1, RDM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM),
        HWCAP_CAP(ID_AA64ISAR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SHA3),
        HWCAP_CAP(ID_AA64ISAR0_EL1, SM3, IMP, CAP_HWCAP, KERNEL_HWCAP_SM3),
        HWCAP_CAP(ID_AA64ISAR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SM4),
        HWCAP_CAP(ID_AA64ISAR0_EL1, DP, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP),
        HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM),
        HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM),
        HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2),
        HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG),
        HWCAP_CAP(ID_AA64ISAR3_EL1, FPRCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_FPRCVT),
        HWCAP_CAP(ID_AA64PFR0_EL1, FP, IMP, CAP_HWCAP, KERNEL_HWCAP_FP),
        HWCAP_CAP(ID_AA64PFR0_EL1, FP, FP16, CAP_HWCAP, KERNEL_HWCAP_FPHP),
        HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMD),
        HWCAP_CAP(ID_AA64PFR0_EL1, AdvSIMD, FP16, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP),
        HWCAP_CAP(ID_AA64PFR0_EL1, DIT, IMP, CAP_HWCAP, KERNEL_HWCAP_DIT),
        HWCAP_CAP(ID_AA64PFR2_EL1, FPMR, IMP, CAP_HWCAP, KERNEL_HWCAP_FPMR),
        HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, IMP, CAP_HWCAP, KERNEL_HWCAP_DCPOP),
        HWCAP_CAP(ID_AA64ISAR1_EL1, DPB, DPB2, CAP_HWCAP, KERNEL_HWCAP_DCPODP),
        HWCAP_CAP(ID_AA64ISAR1_EL1, JSCVT, IMP, CAP_HWCAP, KERNEL_HWCAP_JSCVT),
        HWCAP_CAP(ID_AA64ISAR1_EL1, FCMA, IMP, CAP_HWCAP, KERNEL_HWCAP_FCMA),
        HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, IMP, CAP_HWCAP, KERNEL_HWCAP_LRCPC),
        HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC),
        HWCAP_CAP(ID_AA64ISAR1_EL1, LRCPC, LRCPC3, CAP_HWCAP, KERNEL_HWCAP_LRCPC3),
        HWCAP_CAP(ID_AA64ISAR1_EL1, FRINTTS, IMP, CAP_HWCAP, KERNEL_HWCAP_FRINT),
        HWCAP_CAP(ID_AA64ISAR1_EL1, SB, IMP, CAP_HWCAP, KERNEL_HWCAP_SB),
        HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_BF16),
        HWCAP_CAP(ID_AA64ISAR1_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_EBF16),
        HWCAP_CAP(ID_AA64ISAR1_EL1, DGH, IMP, CAP_HWCAP, KERNEL_HWCAP_DGH),
        HWCAP_CAP(ID_AA64ISAR1_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_I8MM),
        HWCAP_CAP(ID_AA64ISAR2_EL1, LUT, IMP, CAP_HWCAP, KERNEL_HWCAP_LUT),
        HWCAP_CAP(ID_AA64ISAR3_EL1, FAMINMAX, IMP, CAP_HWCAP, KERNEL_HWCAP_FAMINMAX),
        HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT),
#ifdef CONFIG_ARM64_SVE
        /*
         * Hwcaps derived from ID_AA64ZFR0_EL1 go through has_sve_feature(),
         * which also requires system_supports_sve() to hold.
         */
        HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEAES),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, PMULL128, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, AES, AES2, CAP_HWCAP, KERNEL_HWCAP_SVE_AES2),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESM4),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, I8MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEI8MM),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F32MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF32MM),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F64MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEF64MM),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, F16MM, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_F16MM),
        HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, EltPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_ELTPERM),
#endif
#ifdef CONFIG_ARM64_GCS
        HWCAP_CAP(ID_AA64PFR1_EL1, GCS, IMP, CAP_HWCAP, KERNEL_HWCAP_GCS),
#endif
        HWCAP_CAP(ID_AA64PFR1_EL1, SSBS, SSBS2, CAP_HWCAP, KERNEL_HWCAP_SSBS),
#ifdef CONFIG_ARM64_BTI
        HWCAP_CAP(ID_AA64PFR1_EL1, BT, IMP, CAP_HWCAP, KERNEL_HWCAP_BTI),
#endif
#ifdef CONFIG_ARM64_PTR_AUTH
        /* Pointer authentication is decided from lists of alternative fields. */
        HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA),
        HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG),
#endif
#ifdef CONFIG_ARM64_MTE
        HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE2, CAP_HWCAP, KERNEL_HWCAP_MTE),
        HWCAP_CAP(ID_AA64PFR1_EL1, MTE, MTE3, CAP_HWCAP, KERNEL_HWCAP_MTE3),
#endif /* CONFIG_ARM64_MTE */
        HWCAP_CAP(ID_AA64MMFR0_EL1, ECV, IMP, CAP_HWCAP, KERNEL_HWCAP_ECV),
        HWCAP_CAP(ID_AA64MMFR1_EL1, AFP, IMP, CAP_HWCAP, KERNEL_HWCAP_AFP),
        HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, IMP, CAP_HWCAP, KERNEL_HWCAP_CSSC),
        HWCAP_CAP(ID_AA64ISAR2_EL1, CSSC, CMPBR, CAP_HWCAP, KERNEL_HWCAP_CMPBR),
        HWCAP_CAP(ID_AA64ISAR2_EL1, RPRFM, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRFM),
        HWCAP_CAP(ID_AA64ISAR2_EL1, RPRES, IMP, CAP_HWCAP, KERNEL_HWCAP_RPRES),
        HWCAP_CAP(ID_AA64ISAR2_EL1, WFxT, IMP, CAP_HWCAP, KERNEL_HWCAP_WFXT),
        HWCAP_CAP(ID_AA64ISAR2_EL1, MOPS, IMP, CAP_HWCAP, KERNEL_HWCAP_MOPS),
        HWCAP_CAP(ID_AA64ISAR2_EL1, BC, IMP, CAP_HWCAP, KERNEL_HWCAP_HBC),
#ifdef CONFIG_ARM64_SME
        HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME),
        HWCAP_CAP(ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
        HWCAP_CAP(ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2),
        HWCAP_CAP(ID_AA64SMFR0_EL1, I16I64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I64),
        HWCAP_CAP(ID_AA64SMFR0_EL1, F64F64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F64F64),
        HWCAP_CAP(ID_AA64SMFR0_EL1, I16I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I16I32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16B16),
        HWCAP_CAP(ID_AA64SMFR0_EL1, F16F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F16),
        HWCAP_CAP(ID_AA64SMFR0_EL1, F8F16, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F16),
        HWCAP_CAP(ID_AA64SMFR0_EL1, F8F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F8F32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, I8I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_I8I32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, F16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F16F32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, B16F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_B16F32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, BI32I32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_BI32I32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, F32F32, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_F32F32),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SF8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8FMA),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP4),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SF8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SF8DP2),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SBitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SBITPERM),
        HWCAP_CAP(ID_AA64SMFR0_EL1, AES, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_AES),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SFEXPA, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SFEXPA),
        HWCAP_CAP(ID_AA64SMFR0_EL1, STMOP, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_STMOP),
        HWCAP_CAP(ID_AA64SMFR0_EL1, SMOP4, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_SMOP4),
#endif /* CONFIG_ARM64_SME */
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8CVT, IMP, CAP_HWCAP, KERNEL_HWCAP_F8CVT),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8FMA, IMP, CAP_HWCAP, KERNEL_HWCAP_F8FMA),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP4),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3),
        HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2),
#ifdef CONFIG_ARM64_POE
        HWCAP_CAP(ID_AA64MMFR3_EL1, S1POE, IMP, CAP_HWCAP, KERNEL_HWCAP_POE),
#endif
        /* Terminator: .matches == NULL ends iteration. */
        {},
};

#ifdef CONFIG_COMPAT
static bool compat_has_neon(const struct arm64_cpu_capabilities *cap, int scope)
{
        /*
         * Check that all of MVFR1_EL1.{SIMDSP, SIMDInt, SIMDLS} are available,
         * in line with that of arm32 as in vfp_init(). We make sure that the
         * check is future proof, by making sure value is non-zero.
         */
        u32 mvfr1;

        WARN_ON(scope == SCOPE_LOCAL_CPU && preemptible());
        if (scope == SCOPE_SYSTEM)
                mvfr1 = read_sanitised_ftr_reg(SYS_MVFR1_EL1);
        else
                mvfr1 = read_sysreg_s(SYS_MVFR1_EL1);

        return cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDSP_SHIFT) &&
                cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDInt_SHIFT) &&
                cpuid_feature_extract_unsigned_field(mvfr1, MVFR1_EL1_SIMDLS_SHIFT);
}
#endif

/*
 * Table of compat ELF hwcaps (CAP_COMPAT_HWCAP / CAP_COMPAT_HWCAP2 entries).
 * Without CONFIG_COMPAT the table holds only the terminating empty entry,
 * so walkers that stop at .matches == NULL see no entries at all.
 */
static const struct arm64_cpu_capabilities compat_elf_hwcaps[] = {
#ifdef CONFIG_COMPAT
        /* NEON needs several MVFR1_EL1 fields at once, hence a custom matcher. */
        HWCAP_CAP_MATCH(compat_has_neon, CAP_COMPAT_HWCAP, COMPAT_HWCAP_NEON),
        HWCAP_CAP(MVFR1_EL1, SIMDFMAC, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv4),
        /* Arm v8 mandates MVFR0.FPDP == {0, 2}. So, piggy back on this for the presence of VFP support */
        HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFP),
        HWCAP_CAP(MVFR0_EL1, FPDP, VFPv3, CAP_COMPAT_HWCAP, COMPAT_HWCAP_VFPv3),
        HWCAP_CAP(MVFR1_EL1, FPHP, FP16, CAP_COMPAT_HWCAP, COMPAT_HWCAP_FPHP),
        HWCAP_CAP(MVFR1_EL1, SIMDHP, SIMDHP_FLOAT, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDHP),
        HWCAP_CAP(ID_ISAR5_EL1, AES, VMULL, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_PMULL),
        HWCAP_CAP(ID_ISAR5_EL1, AES, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_AES),
        HWCAP_CAP(ID_ISAR5_EL1, SHA1, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA1),
        HWCAP_CAP(ID_ISAR5_EL1, SHA2, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SHA2),
        HWCAP_CAP(ID_ISAR5_EL1, CRC32, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_CRC32),
        HWCAP_CAP(ID_ISAR6_EL1, DP, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDDP),
        HWCAP_CAP(ID_ISAR6_EL1, FHM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDFHM),
        HWCAP_CAP(ID_ISAR6_EL1, SB, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SB),
        HWCAP_CAP(ID_ISAR6_EL1, BF16, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_ASIMDBF16),
        HWCAP_CAP(ID_ISAR6_EL1, I8MM, IMP, CAP_COMPAT_HWCAP, COMPAT_HWCAP_I8MM),
        HWCAP_CAP(ID_PFR2_EL1, SSBS, IMP, CAP_COMPAT_HWCAP2, COMPAT_HWCAP2_SSBS),
#endif
        /* Terminator: .matches == NULL ends iteration. */
        {},
};

/*
 * Record the hwcap described by @cap as present.
 *
 * Native hwcaps are set through cpu_set_feature(); compat hwcaps are OR-ed
 * into compat_elf_hwcap or compat_elf_hwcap2 (CONFIG_COMPAT only). Any other
 * hwcap_type triggers a warning and is otherwise ignored.
 */
static void cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap)
{
        if (cap->hwcap_type == CAP_HWCAP) {
                cpu_set_feature(cap->hwcap);
                return;
        }

#ifdef CONFIG_COMPAT
        if (cap->hwcap_type == CAP_COMPAT_HWCAP) {
                compat_elf_hwcap |= (u32)cap->hwcap;
                return;
        }

        if (cap->hwcap_type == CAP_COMPAT_HWCAP2) {
                compat_elf_hwcap2 |= (u32)cap->hwcap;
                return;
        }
#endif

        /* Unknown hwcap type. */
        WARN_ON(1);
}

/*
 * Report whether the hwcap described by @cap is currently set.
 *
 * Returns false (after warning) for a hwcap_type that is not handled.
 */
static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap)
{
        switch (cap->hwcap_type) {
        case CAP_HWCAP:
                return cpu_have_feature(cap->hwcap);
#ifdef CONFIG_COMPAT
        case CAP_COMPAT_HWCAP:
                return (compat_elf_hwcap & (u32)cap->hwcap) != 0;
        case CAP_COMPAT_HWCAP2:
                return (compat_elf_hwcap2 & (u32)cap->hwcap) != 0;
#endif
        default:
                WARN_ON(1);
                return false;
        }
}

/*
 * Walk a hwcap table (terminated by an entry with a NULL .matches) and set
 * every hwcap whose match callback succeeds at the entry's default scope.
 */
static void setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps)
{
        const struct arm64_cpu_capabilities *entry;

        /* We support emulation of accesses to CPU ID feature registers */
        cpu_set_named_feature(CPUID);

        for (entry = hwcaps; entry->matches; entry++) {
                if (!entry->matches(entry, cpucap_default_scope(entry)))
                        continue;

                cap_set_elf_hwcap(entry);
        }
}

/*
 * Detect capabilities whose type intersects @scope_mask and that are not
 * already set, marking each newly matched one in system_cpucaps (and in
 * boot_cpucaps when both the mask and the capability include
 * SCOPE_BOOT_CPU).
 */
static void update_cpu_capabilities(u16 scope_mask)
{
        const struct arm64_cpu_capabilities *caps;
        int i;

        scope_mask &= ARM64_CPUCAP_SCOPE_MASK;

        for (i = 0; i < ARM64_NCAPS; i++) {
                caps = cpucap_ptrs[i];

                /* Skip empty slots and capabilities outside the requested scope. */
                if (!caps)
                        continue;
                if (!(caps->type & scope_mask))
                        continue;

                /* Already detected: nothing more to do. */
                if (cpus_have_cap(caps->capability))
                        continue;

                if (!caps->matches(caps, cpucap_default_scope(caps)))
                        continue;

                if (caps->desc && !caps->cpus)
                        pr_info("detected: %s\n", caps->desc);

                __set_bit(caps->capability, system_cpucaps);

                if (!(scope_mask & SCOPE_BOOT_CPU))
                        continue;

                if (caps->type & SCOPE_BOOT_CPU)
                        set_bit(caps->capability, boot_cpucaps);
        }
}

/*
 * Invoke cpu_enable() on this CPU for every available capability that has
 * a scope other than SCOPE_BOOT_CPU. Capabilities that only have BOOT_CPU
 * scope are dealt with elsewhere and are skipped.
 *
 * The signature matches a stop_machine() callback; the argument is unused
 * and the return value is always 0.
 */
static int cpu_enable_non_boot_scope_capabilities(void *__unused)
{
        const u16 non_boot_scope = SCOPE_ALL & ~SCOPE_BOOT_CPU;
        int i;

        for_each_available_cap(i) {
                const struct arm64_cpu_capabilities *cap = cpucap_ptrs[i];

                /* An available capability without a descriptor is unexpected. */
                if (WARN_ON(!cap))
                        continue;

                if ((cap->type & non_boot_scope) && cap->cpu_enable)
                        cap->cpu_enable(cap);
        }

        return 0;
}

/*
 * Enable the detected capabilities selected by @scope_mask.
 *
 * When the mask includes SCOPE_BOOT_CPU, cpu_enable() is called directly
 * here for each detected capability in scope. Otherwise the work is handed
 * to stop_machine() over the online CPUs.
 */
static void __init enable_cpu_capabilities(u16 scope_mask)
{
        const struct arm64_cpu_capabilities *caps;
        bool boot_scope;
        int i;

        scope_mask &= ARM64_CPUCAP_SCOPE_MASK;
        boot_scope = (scope_mask & SCOPE_BOOT_CPU) != 0;

        for (i = 0; i < ARM64_NCAPS; i++) {
                caps = cpucap_ptrs[i];

                if (!caps)
                        continue;
                if (!(caps->type & scope_mask))
                        continue;
                if (!cpus_have_cap(caps->capability))
                        continue;

                /*
                 * Capabilities with SCOPE_BOOT_CPU scope are finalised
                 * before any secondary CPU boots, so every secondary
                 * enables them itself via check_local_cpu_capabilities().
                 * Only the boot CPU is left, and it is handled right here,
                 * which avoids a costly stop_machine() call for this case.
                 */
                if (!boot_scope || !caps->cpu_enable)
                        continue;

                caps->cpu_enable(caps);
        }

        if (boot_scope)
                return;

        /*
         * All non-boot scope capabilities go through stop_machine(): it
         * schedules the work, which allows PSTATE to be modified, whereas
         * on_each_cpu() uses an IPI and would give us a PSTATE that
         * disappears on return.
         */
        stop_machine(cpu_enable_non_boot_scope_capabilities,
                     NULL, cpu_online_mask);
}

/*
 * Compare this CPU against the system-wide state of every capability whose
 * type intersects @scope_mask.
 *
 * For a capability the system already has, cpu_enable() is invoked on this
 * CPU. The first conflict found is reported, after which the kernel either
 * panics or this CPU is killed early, depending on
 * cpucap_panic_on_conflict().
 */
static void verify_local_cpu_caps(u16 scope_mask)
{
        const struct arm64_cpu_capabilities *caps;
        bool cpu_has_cap, system_has_cap;
        int i;

        scope_mask &= ARM64_CPUCAP_SCOPE_MASK;

        for (i = 0; i < ARM64_NCAPS; i++) {
                caps = cpucap_ptrs[i];
                if (!caps || !(caps->type & scope_mask))
                        continue;

                cpu_has_cap = caps->matches(caps, SCOPE_LOCAL_CPU);
                system_has_cap = cpus_have_cap(caps->capability);

                if (!system_has_cap) {
                        /*
                         * The system lacks the capability: this CPU may
                         * only have it if that is explicitly permitted.
                         */
                        if (cpu_has_cap && !cpucap_late_cpu_permitted(caps))
                                goto conflict;
                        continue;
                }

                /*
                 * The system advertises the capability: this CPU may only
                 * miss it if the capability is optional for late CPUs.
                 */
                if (!cpu_has_cap && !cpucap_late_cpu_optional(caps))
                        goto conflict;

                /*
                 * cpu_enable() is issued whether or not this CPU has the
                 * capability, because it is enabled system wide. It is up
                 * to the callback to take appropriate action on this CPU.
                 */
                if (caps->cpu_enable)
                        caps->cpu_enable(caps);
        }

        return;

conflict:
        pr_crit("CPU%d: Detected conflict for capability %d (%s), System: %d, CPU: %d\n",
                smp_processor_id(), caps->capability,
                caps->desc, system_has_cap, cpu_has_cap);

        if (cpucap_panic_on_conflict(caps))
                cpu_panic_kernel();
        else
                cpu_die_early();
}

/*
 * Check for CPU features that are used in early boot
 * based on the Boot CPU value.
 *
 * Runs the ASID width check first, then verifies this CPU against all
 * capabilities that have SCOPE_BOOT_CPU scope.
 */
static void check_early_cpu_features(void)
{
        verify_cpu_asid_bits();

        /* Only boot-CPU-scope capabilities are verified at this point. */
        verify_local_cpu_caps(SCOPE_BOOT_CPU);
}

/*
 * Walk a hwcap table (terminated by an entry with a NULL .matches) and make
 * sure that every hwcap which is already set is also matched by this CPU.
 * A missing hwcap is reported and cpu_die_early() is called.
 */
static void
__verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps)
{
        for (; caps->matches; caps++) {
                /* Hwcaps that are not set impose no requirement. */
                if (!cpus_have_elf_hwcap(caps))
                        continue;

                if (caps->matches(caps, SCOPE_LOCAL_CPU))
                        continue;

                pr_crit("CPU%d: missing HWCAP: %s\n",
                                smp_processor_id(), caps->desc);
                cpu_die_early();
        }
}

static void verify_local_elf_hwcaps(void)
{
        __verify_local_elf_hwcaps(arm64_elf_hwcaps);

        if (id_aa64pfr0_32bit_el0(read_cpuid(ID_AA64PFR0_EL1)))
                __verify_local_elf_hwcaps(compat_elf_hwcaps);
}

/*
 * Check this CPU's SVE vector length support with vec_verify_vq_map().
 * The CPACR value returned by cpacr_save_enable_kernel_sve() is restored
 * before returning. A mismatch is reported and cpu_die_early() is called.
 */
static void verify_sve_features(void)
{
        unsigned long cpacr;

        cpacr = cpacr_save_enable_kernel_sve();

        if (vec_verify_vq_map(ARM64_VEC_SVE) != 0) {
                pr_crit("CPU%d: SVE: vector length support mismatch\n",
                        smp_processor_id());
                cpu_die_early();
        }

        cpacr_restore(cpacr);
}

/*
 * Check this CPU's SME vector length support with vec_verify_vq_map().
 * The CPACR value returned by cpacr_save_enable_kernel_sme() is restored
 * before returning. A mismatch is reported and cpu_die_early() is called.
 */
static void verify_sme_features(void)
{
        unsigned long cpacr;

        cpacr = cpacr_save_enable_kernel_sme();

        if (vec_verify_vq_map(ARM64_VEC_SME) != 0) {
                pr_crit("CPU%d: SME: vector length support mismatch\n",
                        smp_processor_id());
                cpu_die_early();
        }

        cpacr_restore(cpacr);
}

static void verify_hyp_capabilities(void)
{
        u64 safe_mmfr1, mmfr0, mmfr1;
        int parange, ipa_max;
        unsigned int safe_vmid_bits, vmid_bits;

        if (!IS_ENABLED(CONFIG_KVM))
                return;

        safe_mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);

        /* Verify VMID bits */
        safe_vmid_bits = get_vmid_bits(safe_mmfr1);
        vmid_bits = get_vmid_bits(mmfr1);
        if (vmid_bits < safe_vmid_bits) {
                pr_crit("CPU%d: VMID width mismatch\n", smp_processor_id());
                cpu_die_early();
        }

        /* Verify IPA range */
        parange = cpuid_feature_extract_unsigned_field(mmfr0,
                                ID_AA64MMFR0_EL1_PARANGE_SHIFT);
        ipa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
        if (ipa_max < get_kvm_ipa_limit()) {
                pr_crit("CPU%d: IPA range mismatch\n", smp_processor_id());
                cpu_die_early();
        }
}

/*
 * Compare this CPU's MPAM ID values with the sanitised system-wide ones and
 * call cpu_die_early() on a mismatch: the MPAM field of ID_AA64PFR0_EL1 and
 * the HAS_HCR field of MPAMIDR_EL1 must be equal, and this CPU's PARTID_MAX
 * and PMG_MAX must not be smaller than the system values.
 */
static void verify_mpam_capabilities(void)
{
        u16 local_partid_max, local_pmg_max, safe_partid_max, safe_pmg_max;
        u64 local_reg, safe_reg;

        local_reg = read_cpuid(ID_AA64PFR0_EL1);
        safe_reg = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
        if (FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, local_reg) !=
            FIELD_GET(ID_AA64PFR0_EL1_MPAM_MASK, safe_reg)) {
                pr_crit("CPU%d: MPAM version mismatch\n", smp_processor_id());
                cpu_die_early();
        }

        /* From here on both variables hold MPAMIDR_EL1 values. */
        local_reg = read_cpuid(MPAMIDR_EL1);
        safe_reg = read_sanitised_ftr_reg(SYS_MPAMIDR_EL1);
        if (FIELD_GET(MPAMIDR_EL1_HAS_HCR, local_reg) !=
            FIELD_GET(MPAMIDR_EL1_HAS_HCR, safe_reg)) {
                pr_crit("CPU%d: Missing MPAM HCR\n", smp_processor_id());
                cpu_die_early();
        }

        local_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, local_reg);
        local_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, local_reg);
        safe_partid_max = FIELD_GET(MPAMIDR_EL1_PARTID_MAX, safe_reg);
        safe_pmg_max = FIELD_GET(MPAMIDR_EL1_PMG_MAX, safe_reg);
        if (safe_partid_max > local_partid_max ||
            safe_pmg_max > local_pmg_max) {
                pr_crit("CPU%d: MPAM PARTID/PMG max values are mismatched\n", smp_processor_id());
                cpu_die_early();
        }
}

/*
 * Verify that this CPU is compatible with the capabilities the system has
 * already settled on. The capabilities were decided based on the CPUs
 * available at boot time, so any CPU brought up later must match the
 * system-wide status of each capability. If the new CPU doesn't have a
 * capability which the system now has enabled, we cannot do anything to fix
 * it up and it could cause unexpected failures. So we park the CPU.
 */
static void verify_local_cpu_capabilities(void)
{
        /*
         * The capabilities with SCOPE_BOOT_CPU are checked from
         * check_early_cpu_features(), as they need to be verified
         * on all secondary CPUs.
         */
        verify_local_cpu_caps(SCOPE_ALL & ~SCOPE_BOOT_CPU);
        verify_local_elf_hwcaps();

        /* Each check below only runs when the system uses that feature. */
        if (system_supports_sve())
                verify_sve_features();

        if (system_supports_sme())
                verify_sme_features();

        if (is_hyp_mode_available())
                verify_hyp_capabilities();

        if (system_supports_mpam())
                verify_mpam_capabilities();
}

/*
 * Check the calling CPU against the early CPU features, then either update
 * or verify its capabilities depending on whether the system capabilities
 * have been finalised.
 */
void check_local_cpu_capabilities(void)
{
        /*
         * The early CPU features in use by the kernel are based on the boot
         * CPU; all secondary CPUs should conform to them.
         */
        check_early_cpu_features();

        /*
         * Once the system capabilities are finalised, this CPU can only be
         * verified against what the system advertises. Before that point it
         * still gets a chance to update the errata work arounds and local
         * features.
         */
        if (system_capabilities_finalized())
                verify_local_cpu_capabilities();
        else
                update_cpu_capabilities(SCOPE_LOCAL_CPU);
}

/*
 * Evaluate capability @n with SCOPE_LOCAL_CPU via its matches() callback.
 *
 * Returns false (after a WARN_ON) when called from preemptible context, and
 * false when @n is out of range or has no entry in cpucap_ptrs[].
 */
bool this_cpu_has_cap(unsigned int n)
{
        const struct arm64_cpu_capabilities *cap;

        if (WARN_ON(preemptible()))
                return false;

        if (n >= ARM64_NCAPS)
                return false;

        cap = cpucap_ptrs[n];
        if (!cap)
                return false;

        return cap->matches(cap, SCOPE_LOCAL_CPU);
}
EXPORT_SYMBOL_GPL(this_cpu_has_cap);

/*
 * Helper for the narrow window in which
 * - the system wide safe registers are set with all the SMP CPUs, and
 * - the SYSTEM_FEATURE system_cpucaps may not have been set.
 *
 * Evaluates capability @n with SCOPE_SYSTEM; returns false when @n is out of
 * range or has no entry in cpucap_ptrs[].
 */
static bool __maybe_unused __system_matches_cap(unsigned int n)
{
        const struct arm64_cpu_capabilities *cap;

        if (n >= ARM64_NCAPS)
                return false;

        cap = cpucap_ptrs[n];
        if (!cap)
                return false;

        return cap->matches(cap, SCOPE_SYSTEM);
}

/* Set bit @num in the elf_hwcap bitmap. */
void cpu_set_feature(unsigned int num)
{
        set_bit(num, elf_hwcap);
}

/* Report whether bit @num is set in the elf_hwcap bitmap. */
bool cpu_have_feature(unsigned int num)
{
        bool present = test_bit(num, elf_hwcap);

        return present;
}
EXPORT_SYMBOL_GPL(cpu_have_feature);

/* Return word 0 of the elf_hwcap bitmap. */
unsigned long cpu_get_elf_hwcap(void)
{
        unsigned long hwcap;

        /*
         * We currently only populate the first 32 bits of AT_HWCAP. Please
         * note that for userspace compatibility we guarantee that bits 62
         * and 63 will always be returned as 0.
         */
        hwcap = elf_hwcap[0];

        return hwcap;
}

/* Return word 1 of the elf_hwcap bitmap. */
unsigned long cpu_get_elf_hwcap2(void)
{
        unsigned long hwcap2 = elf_hwcap[1];

        return hwcap2;
}

/* Return word 2 of the elf_hwcap bitmap. */
unsigned long cpu_get_elf_hwcap3(void)
{
        unsigned long hwcap3 = elf_hwcap[2];

        return hwcap3;
}

/*
 * Boot-CPU capability setup, called from setup_boot_cpu_features(): detect
 * the boot-CPU and local-CPU scoped capabilities, enable the boot-CPU scoped
 * ones and apply the boot alternatives.
 */
static void __init setup_boot_cpu_capabilities(void)
{
        kvm_arm_target_impl_cpu_init();
        /*
         * The boot CPU's feature register values have been recorded. Detect
         * boot cpucaps and local cpucaps for the boot CPU, then enable and
         * patch alternatives for the available boot cpucaps.
         */
        update_cpu_capabilities(SCOPE_BOOT_CPU | SCOPE_LOCAL_CPU);
        enable_cpu_capabilities(SCOPE_BOOT_CPU);
        apply_boot_alternatives();
}

/*
 * Boot-time entry point for the boot CPU's features: set up the indirect
 * capability pointer array, run pseudo-NMI detection, then set up the boot
 * CPU capabilities. The order of the three steps matters (see below).
 */
void __init setup_boot_cpu_features(void)
{
        /*
         * Initialize the indirect array of CPU capabilities pointers before we
         * handle the boot CPU.
         */
        init_cpucap_indirect_list();

        /*
         * Detect broken pseudo-NMI. Must be called _before_ the call to
         * setup_boot_cpu_capabilities() since it interacts with
         * can_use_gic_priorities().
         */
        detect_system_supports_pseudo_nmi();

        setup_boot_cpu_capabilities();
}

/*
 * Detect and enable the system-scoped capabilities, patch all alternatives,
 * and log the capabilities that update_cpu_capabilities() does not log.
 */
static void __init setup_system_capabilities(void)
{
        int idx;

        /*
         * The system-wide safe feature register values have been finalized.
         * Detect, enable, and patch alternatives for the available system
         * cpucaps.
         */
        update_cpu_capabilities(SCOPE_SYSTEM);
        enable_cpu_capabilities(SCOPE_ALL & ~SCOPE_BOOT_CPU);
        apply_alternatives_all();

        /*
         * Log any cpucaps with a cpumask as these aren't logged by
         * update_cpu_capabilities().
         */
        for (idx = 0; idx < ARM64_NCAPS; idx++) {
                const struct arm64_cpu_capabilities *cap = cpucap_ptrs[idx];

                /* Skip entries with no cpumask or no description. */
                if (!cap || !cap->cpus || !cap->desc)
                        continue;

                /* Skip entries for which cpumask_any() finds no CPU. */
                if (cpumask_any(cap->cpus) >= nr_cpu_ids)
                        continue;

                pr_info("detected: %s on CPU%*pbl\n",
                        cap->desc, cpumask_pr_args(cap->cpus));
        }

        /*
         * TTBR0 PAN doesn't have its own cpucap, so log it manually.
         */
        if (system_uses_ttbr0_pan())
                pr_info("emulated: Privileged Access Never (PAN) using TTBR0_EL1 switching\n");
}

/*
 * Boot-time entry point for system-wide features: set up the system
 * capabilities, then call the KPTI, SVE and SME setup routines, and finally
 * warn if cache_type_cwg() reports no value.
 */
void __init setup_system_features(void)
{
        setup_system_capabilities();

        kpti_install_ng_mappings();

        sve_setup();
        sme_setup();

        /*
         * Check for sane CTR_EL0.CWG value. When none is reported, only a
         * warning is printed, naming ARCH_DMA_MINALIGN as the assumed value.
         */
        if (!cache_type_cwg())
                pr_warn("No Cache Writeback Granule information, assuming %d\n",
                        ARCH_DMA_MINALIGN);
}

/*
 * Boot-time setup of the user-visible features: run the user feature fixup,
 * populate the native ELF hwcaps, populate (and fix up) the compat ELF hwcaps
 * when the system supports 32-bit EL0, then call minsigstksz_setup().
 */
void __init setup_user_features(void)
{
        user_feature_fixup();

        setup_elf_hwcaps(arm64_elf_hwcaps);

        /* Compat hwcaps are only set up when the system supports 32-bit EL0. */
        if (system_supports_32bit_el0()) {
                setup_elf_hwcaps(compat_elf_hwcaps);
                elf_hwcap_fixup();
        }

        minsigstksz_setup();
}

/*
 * CPU hotplug "online" callback registered by init_32bit_el0_mask().
 *
 * Records @cpu in cpu_32bit_el0_mask when it is treated as 32-bit capable
 * and, the first time a mismatch against CPU 0 is seen, picks one
 * 32-bit-capable CPU whose offlining is disabled and sets up the compat ELF
 * hwcaps. Always returns 0.
 */
static int enable_mismatched_32bit_el0(unsigned int cpu)
{
        /*
         * The first 32-bit-capable CPU we detected and so can no longer
         * be offlined by userspace. -1 indicates we haven't yet onlined
         * a 32-bit-capable CPU.
         */
        static int lucky_winner = -1;

        struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu);
        bool cpu_32bit = false;

        /*
         * A CPU whose recorded ID_AA64PFR0 reports 32-bit EL0 is still
         * treated as 64-bit only when it is not a HK_TYPE_TICK housekeeping
         * CPU.
         */
        if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
                if (!housekeeping_cpu(cpu, HK_TYPE_TICK))
                        pr_info("Treating adaptive-ticks CPU %u as 64-bit only\n", cpu);
                else
                        cpu_32bit = true;
        }

        if (cpu_32bit) {
                cpumask_set_cpu(cpu, cpu_32bit_el0_mask);
                static_branch_enable_cpuslocked(&arm64_mismatched_32bit_el0);
        }

        /* This CPU agrees with CPU 0's entry in the mask: no mismatch. */
        if (cpumask_test_cpu(0, cpu_32bit_el0_mask) == cpu_32bit)
                return 0;

        /* A winner has already been chosen on an earlier call. */
        if (lucky_winner >= 0)
                return 0;

        /*
         * We've detected a mismatch. We need to keep one of our CPUs with
         * 32-bit EL0 online so that is_cpu_allowed() doesn't end up rejecting
         * every CPU in the system for a 32-bit task.
         */
        lucky_winner = cpu_32bit ? cpu : cpumask_any_and(cpu_32bit_el0_mask,
                                                         cpu_active_mask);
        get_cpu_device(lucky_winner)->offline_disabled = true;
        setup_elf_hwcaps(compat_elf_hwcaps);
        elf_hwcap_fixup();
        pr_info("Asymmetric 32-bit EL0 support detected on CPU %u; CPU hot-unplug disabled on CPU %u\n",
                cpu, lucky_winner);
        return 0;
}

/*
 * Initcall: when mismatched 32-bit EL0 support is allowed, allocate the
 * zeroed cpu_32bit_el0_mask and register enable_mismatched_32bit_el0() as a
 * dynamic CPU hotplug online callback.
 *
 * Returns 0 when the feature is not allowed, -ENOMEM when the mask cannot be
 * allocated, otherwise the result of cpuhp_setup_state().
 */
static int __init init_32bit_el0_mask(void)
{
        if (allow_mismatched_32bit_el0) {
                if (!zalloc_cpumask_var(&cpu_32bit_el0_mask, GFP_KERNEL))
                        return -ENOMEM;

                return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
                                         "arm64/mismatched_32bit_el0:online",
                                         enable_mismatched_32bit_el0, NULL);
        }

        return 0;
}
subsys_initcall_sync(init_32bit_el0_mask);

/* Thin wrapper around cpu_enable_swapper_cnp(); @cap is not used. */
static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap)
{
        cpu_enable_swapper_cnp();
}

/*
 * Only the following system register space is emulated:
 * Op0 = 0x3, CRn = 0x0, Op1 = 0x0, CRm = [0, 2 - 7]
 * See Table C5-6 System instruction encodings for System register accesses,
 * ARMv8 ARM(ARM DDI 0487A.f) for more details.
 */
static inline bool __attribute_const__ is_emulated(u32 id)
{
        const u32 crm = sys_reg_CRm(id);

        if (sys_reg_Op0(id) != 0x3)
                return false;
        if (sys_reg_CRn(id) != 0x0)
                return false;
        if (sys_reg_Op1(id) != 0x0)
                return false;

        /* CRm must be 0 or lie within 2..7. */
        return crm == 0 || (crm >= 2 && crm <= 7);
}

/*
 * With CRm == 0, reg should be one of :
 * MIDR_EL1, MPIDR_EL1 or REVIDR_EL1.
 *
 * Stores the emulated value in @valp and returns 0, or returns -EINVAL
 * (leaving @valp untouched) for any other register.
 */
static inline int emulate_id_reg(u32 id, u64 *valp)
{
        u64 val;

        if (id == SYS_MIDR_EL1) {
                val = read_cpuid_id();
        } else if (id == SYS_MPIDR_EL1) {
                val = SYS_MPIDR_SAFE_VAL;
        } else if (id == SYS_REVIDR_EL1) {
                /* IMPLEMENTATION DEFINED values are emulated with 0 */
                val = 0;
        } else {
                return -EINVAL;
        }

        *valp = val;
        return 0;
}

/*
 * Produce the emulated value of system register @id in @valp.
 *
 * Returns -EINVAL when @id is outside the emulated space (or is an
 * unsupported CRm == 0 register), 0 otherwise.
 */
static int emulate_sys_reg(u32 id, u64 *valp)
{
        struct arm64_ftr_reg *ftr;

        if (!is_emulated(id))
                return -EINVAL;

        if (sys_reg_CRm(id) == 0)
                return emulate_id_reg(id, valp);

        /*
         * The untracked registers are either IMPLEMENTATION DEFINED
         * (e.g, ID_AFR0_EL1) or reserved RAZ, so they are emulated as 0.
         */
        ftr = get_arm64_ftr_reg_nowarn(id);
        *valp = ftr ? arm64_ftr_reg_user_value(ftr) : 0;

        return 0;
}

/*
 * Emulate a read of @sys_reg into register @rt of @regs.
 *
 * On success the value is written to the register and the faulting
 * instruction is skipped; returns 0. Otherwise returns the error from
 * emulate_sys_reg() and leaves @regs alone.
 */
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt)
{
        u64 val;
        int rc = emulate_sys_reg(sys_reg, &val);

        if (rc)
                return rc;

        pt_regs_write_reg(regs, rt, val);
        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);

        return 0;
}

/*
 * Try to emulate @insn as an MRS instruction for @regs.
 *
 * Returns false for compat user mode, for a non-MRS instruction, or when
 * do_emulate_mrs() fails; true when the emulation succeeded.
 */
bool try_emulate_mrs(struct pt_regs *regs, u32 insn)
{
        u32 sys_reg, rt;

        if (compat_user_mode(regs))
                return false;

        if (!aarch64_insn_is_mrs(insn))
                return false;

        /*
         * sys_reg values are defined as used in mrs/msr instruction.
         * shift the imm value to get the encoding.
         */
        sys_reg = (u32)aarch64_insn_decode_immediate(AARCH64_INSN_IMM_16, insn) << 5;
        rt = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RT, insn);

        return !do_emulate_mrs(regs, sys_reg, rt);
}

/*
 * Report the Meltdown state: SPECTRE_UNAFFECTED when __meltdown_safe is set,
 * otherwise SPECTRE_MITIGATED when the kernel is unmapped at EL0, otherwise
 * SPECTRE_VULNERABLE.
 */
enum mitigation_state arm64_get_meltdown_state(void)
{
        if (__meltdown_safe)
                return SPECTRE_UNAFFECTED;

        return arm64_kernel_unmapped_at_el0() ? SPECTRE_MITIGATED
                                              : SPECTRE_VULNERABLE;
}

/*
 * Write a one-line description of the Meltdown state into @buf and return
 * the value returned by sprintf(). @dev and @attr are not used.
 */
ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        int state = arm64_get_meltdown_state();

        if (state == SPECTRE_UNAFFECTED)
                return sprintf(buf, "Not affected\n");

        if (state == SPECTRE_MITIGATED)
                return sprintf(buf, "Mitigation: PTI\n");

        return sprintf(buf, "Vulnerable\n");
}





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 








































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
// SPDX-License-Identifier: GPL-2.0
/* XDP sockets
 *
 * AF_XDP sockets provide a channel between XDP programs and userspace
 * applications.
 * Copyright(c) 2018 Intel Corporation.
 *
 * Author(s): Björn Töpel <bjorn.topel@intel.com>
 *              Magnus Karlsson <magnus.karlsson@intel.com>
 */

#define pr_fmt(fmt) "AF_XDP: %s: " fmt, __func__

#include <linux/if_xdp.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/socket.h>
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
#include <linux/vmalloc.h>
#include <net/xdp_sock_drv.h>
#include <net/busy_poll.h>
#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/xdp.h>

#include "xsk_queue.h"
#include "xdp_umem.h"
#include "xsk.h"

/*
 * NOTE(review): the users of these two constants are outside this view; the
 * names suggest a Tx batch size and a per-socket budget -- confirm at the
 * call sites. The per-socket budget is defined to equal the batch size.
 */
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)

/*
 * Set XDP_RING_NEED_WAKEUP in the flags of the pool's fill ring, unless the
 * pool's cached state already has XDP_WAKEUP_RX set, and record it in the
 * cache.
 */
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
        if (!(pool->cached_need_wakeup & XDP_WAKEUP_RX)) {
                pool->fq->ring->flags |= XDP_RING_NEED_WAKEUP;
                pool->cached_need_wakeup |= XDP_WAKEUP_RX;
        }
}
EXPORT_SYMBOL(xsk_set_rx_need_wakeup);

void xsk_set_tx_need_wakeup(struct xsk_buff_pool *pool)
{
        struct xdp_sock *xs;

        if (pool->cached_need_wakeup & XDP_WAKEUP_TX)
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
                xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
        }
        rcu_read_unlock();

        pool->cached_need_wakeup |= XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_set_tx_need_wakeup);

/*
 * Clear XDP_RING_NEED_WAKEUP in the flags of the pool's fill ring, if the
 * pool's cached state has XDP_WAKEUP_RX set, and clear it in the cache.
 */
void xsk_clear_rx_need_wakeup(struct xsk_buff_pool *pool)
{
        if (pool->cached_need_wakeup & XDP_WAKEUP_RX) {
                pool->fq->ring->flags &= ~XDP_RING_NEED_WAKEUP;
                pool->cached_need_wakeup &= ~XDP_WAKEUP_RX;
        }
}
EXPORT_SYMBOL(xsk_clear_rx_need_wakeup);

void xsk_clear_tx_need_wakeup(struct xsk_buff_pool *pool)
{
        struct xdp_sock *xs;

        if (!(pool->cached_need_wakeup & XDP_WAKEUP_TX))
                return;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
                xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP;
        }
        rcu_read_unlock();

        pool->cached_need_wakeup &= ~XDP_WAKEUP_TX;
}
EXPORT_SYMBOL(xsk_clear_tx_need_wakeup);

/* Return the pool's uses_need_wakeup setting. */
bool xsk_uses_need_wakeup(struct xsk_buff_pool *pool)
{
        return pool->uses_need_wakeup;
}
EXPORT_SYMBOL(xsk_uses_need_wakeup);

/*
 * Look up the buffer pool registered for @queue_id on @dev.
 *
 * The Rx queue entry is used when @queue_id is below real_num_rx_queues,
 * otherwise the Tx queue entry when it is below real_num_tx_queues.
 * Returns NULL when @queue_id is beyond both.
 */
struct xsk_buff_pool *xsk_get_pool_from_qid(struct net_device *dev,
                                            u16 queue_id)
{
        struct xsk_buff_pool *pool = NULL;

        if (queue_id < dev->real_num_rx_queues)
                pool = dev->_rx[queue_id].pool;
        else if (queue_id < dev->real_num_tx_queues)
                pool = dev->_tx[queue_id].pool;

        return pool;
}
EXPORT_SYMBOL(xsk_get_pool_from_qid);

/*
 * Reset the pool pointer stored for @queue_id on @dev, on the Rx side when
 * @queue_id is below num_rx_queues and on the Tx side when it is below
 * num_tx_queues.
 */
void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id)
{
        const int within_rx = queue_id < dev->num_rx_queues;
        const int within_tx = queue_id < dev->num_tx_queues;

        if (within_rx)
                dev->_rx[queue_id].pool = NULL;

        if (within_tx)
                dev->_tx[queue_id].pool = NULL;
}

/* The buffer pool is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 *
 * Returns -EINVAL when @queue_id is not below either of the device's real
 * queue counts, 0 otherwise.
 */
int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool,
                        u16 queue_id)
{
        const int within_rx = queue_id < dev->real_num_rx_queues;
        const int within_tx = queue_id < dev->real_num_tx_queues;

        /* The id is beyond the larger of the two queue counts. */
        if (!within_rx && !within_tx)
                return -EINVAL;

        if (within_rx)
                dev->_rx[queue_id].pool = pool;

        if (within_tx)
                dev->_tx[queue_id].pool = pool;

        return 0;
}

/*
 * Reserve an Rx descriptor on @xs for buffer @xskb with the given @len and
 * @flags.
 *
 * On success the buffer is passed to xp_release() and 0 is returned. On
 * failure rx_queue_full is incremented and the error from
 * xskq_prod_reserve_desc() is returned; the buffer is left alone.
 */
static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len,
                        u32 flags)
{
        u64 handle = xp_get_handle(xskb, xskb->pool);
        int ret = xskq_prod_reserve_desc(xs->rx, handle, len, flags);

        if (!ret) {
                xp_release(xskb);
                return 0;
        }

        xs->rx_queue_full++;
        return ret;
}

/*
 * Post the zero-copy buffer behind @xdp on the Rx ring of @xs and, when the
 * buffer has frags, also every entry queued on the pool's xskb_list.
 *
 * Returns 0 on success. On any failure from __xsk_rcv_zc() the buffer is
 * released with xsk_buff_free(xdp) and that error is returned.
 */
static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
        u32 frags = xdp_buff_has_frags(xdp);
        struct xdp_buff_xsk *pos, *tmp;
        struct list_head *xskb_list;
        u32 contd = 0;
        int err;

        /* With frags, the first descriptor is flagged as continued. */
        if (frags)
                contd = XDP_PKT_CONTD;

        err = __xsk_rcv_zc(xs, xskb, len, contd);
        if (err)
                goto err;
        if (likely(!frags))
                return 0;

        xskb_list = &xskb->pool->xskb_list;
        list_for_each_entry_safe(pos, tmp, xskb_list, list_node) {
                /* Only one entry left on the list: drop the continuation flag. */
                if (list_is_singular(xskb_list))
                        contd = 0;
                len = pos->xdp.data_end - pos->xdp.data;
                err = __xsk_rcv_zc(xs, pos, len, contd);
                if (err)
                        goto err;
                /* Entries are unlinked only once they have been posted. */
                list_del(&pos->list_node);
        }

        return 0;
err:
        xsk_buff_free(xdp);
        return err;
}

/*
 * Return the address copying should start from: data_meta, or data when
 * xdp_data_meta_unsupported() holds for @from.
 */
static void *xsk_copy_xdp_start(struct xdp_buff *from)
{
        void *start = from->data_meta;

        if (unlikely(xdp_data_meta_unsupported(from)))
                start = from->data;

        return start;
}

/*
 * Copy up to @to_len bytes into @to from a source described by the cursor
 * (*@from, *@from_len), moving on to the next entry of *@frag whenever the
 * current source chunk has been consumed.
 *
 * The cursor (*@from, *@from_len, *@frag) is updated in place so that a
 * later call continues where this one stopped. Copying stops as soon as
 * @rem bytes have been copied in total or the destination is full.
 *
 * Returns the number of bytes copied by this call.
 */
static u32 xsk_copy_xdp(void *to, void **from, u32 to_len,
                        u32 *from_len, skb_frag_t **frag, u32 rem)
{
        u32 copied = 0;

        while (1) {
                /* Bounded by both the current source chunk and the room left. */
                u32 copy_len = min_t(u32, *from_len, to_len);

                memcpy(to, *from, copy_len);
                copied += copy_len;
                /* Everything that remained to be copied is done. */
                if (rem == copied)
                        return copied;

                if (*from_len == copy_len) {
                        /* Source chunk consumed: switch to the next frag. */
                        *from = skb_frag_address(*frag);
                        *from_len = skb_frag_size((*frag)++);
                } else {
                        /* Source chunk only partly consumed: advance within it. */
                        *from += copy_len;
                        *from_len -= copy_len;
                }
                /* Destination is full; the cursor was advanced above. */
                if (to_len == copy_len)
                        return copied;

                to_len -= copy_len;
                to += copy_len;
        }
}

/*
 * Copy the contents of @xdp (@len bytes of data plus any metadata in front
 * of it) into one or more buffers allocated from xs->pool and post them on
 * the Rx ring of @xs.
 *
 * Returns 0 on success, -ENOMEM when buffers cannot be allocated, -ENOBUFS
 * when the Rx ring lacks room for all descriptors, or the error from
 * __xsk_rcv_zc() on the single-buffer path.
 */
static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool);
        void *copy_from = xsk_copy_xdp_start(xdp), *copy_to;
        u32 from_len, meta_len, rem, num_desc;
        struct xdp_buff_xsk *xskb;
        struct xdp_buff *xsk_xdp;
        skb_frag_t *frag;

        /* Lengths are measured from the copy start returned above. */
        from_len = xdp->data_end - copy_from;
        meta_len = xdp->data - copy_from;
        rem = len + meta_len;

        /* Single buffer path: no frags and the data fits in one frame. */
        if (len <= frame_size && !xdp_buff_has_frags(xdp)) {
                int err;

                xsk_xdp = xsk_buff_alloc(xs->pool);
                if (!xsk_xdp) {
                        xs->rx_dropped++;
                        return -ENOMEM;
                }
                memcpy(xsk_xdp->data - meta_len, copy_from, rem);
                xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
                err = __xsk_rcv_zc(xs, xskb, len, 0);
                if (err) {
                        xsk_buff_free(xsk_xdp);
                        return err;
                }

                return 0;
        }

        /* Number of frames needed for @len bytes, rounded up. */
        num_desc = (len - 1) / frame_size + 1;

        /*
         * Buffer availability and Rx ring space are both checked up front.
         * NOTE(review): presumably this is why the results of
         * xsk_buff_alloc() and __xsk_rcv_zc() are not checked in the loop
         * below -- confirm.
         */
        if (!xsk_buff_can_alloc(xs->pool, num_desc)) {
                xs->rx_dropped++;
                return -ENOMEM;
        }
        if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) {
                xs->rx_queue_full++;
                return -ENOBUFS;
        }

        /* frag is only initialised when the buffer actually has frags. */
        if (xdp_buff_has_frags(xdp)) {
                struct skb_shared_info *sinfo;

                sinfo = xdp_get_shared_info_from_buff(xdp);
                frag =  &sinfo->frags[0];
        }

        do {
                u32 to_len = frame_size + meta_len;
                u32 copied;

                xsk_xdp = xsk_buff_alloc(xs->pool);
                copy_to = xsk_xdp->data - meta_len;

                copied = xsk_copy_xdp(copy_to, &copy_from, to_len, &from_len, &frag, rem);
                rem -= copied;

                xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp);
                /* Every descriptor except the last carries XDP_PKT_CONTD. */
                __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? XDP_PKT_CONTD : 0);
                /* Metadata is only placed in front of the first buffer. */
                meta_len = 0;
        } while (rem);

        return 0;
}

/*
 * Report the socket as Tx-writeable unless xskq_cons_present_entries()
 * counts more than half of the Tx ring's entries.
 */
static bool xsk_tx_writeable(struct xdp_sock *xs)
{
        return xskq_cons_present_entries(xs->tx) <= xs->tx->nentries / 2;
}

/*
 * Report whether the socket state is XSK_BOUND. When it is, a read barrier
 * is issued before returning true.
 */
static bool xsk_is_bound(struct xdp_sock *xs)
{
        if (READ_ONCE(xs->state) != XSK_BOUND)
                return false;

        /* Matches smp_wmb() in bind(). */
        smp_rmb();

        return true;
}

/*
 * Check whether @xdp with total length @len may be received on @xs.
 *
 * Returns -ENXIO when the socket is not bound, -EINVAL when the buffer's
 * device or queue index differs from the socket's, -ENOSPC (and counts a
 * drop) when @len exceeds the pool's Rx frame size while xs->sg is not set,
 * and 0 otherwise.
 */
static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
        if (!xsk_is_bound(xs))
                return -ENXIO;

        if (xs->dev != xdp->rxq->dev)
                return -EINVAL;

        if (xs->queue_id != xdp->rxq->queue_index)
                return -EINVAL;

        if (len > xsk_pool_get_rx_frame_size(xs->pool)) {
                if (!xs->sg) {
                        xs->rx_dropped++;
                        return -ENOSPC;
                }
        }

        return 0;
}

/* Publish pending Rx work for @xs: submit the Rx ring producer state,
 * release the consumer side of the pool's fill queue, then signal the
 * socket as readable.
 */
static void xsk_flush(struct xdp_sock *xs)
{
        xskq_prod_submit(xs->rx);
        __xskq_cons_release(xs->pool->fq);
        sock_def_readable(&xs->sk);
}

/* Rx entry point that serializes on xs->rx_lock (BH-disabling spinlock).
 *
 * Validates the buffer, copies it into the socket via __xsk_rcv() and
 * flushes the socket. Returns 0 or the negative errno from the check or
 * the copy.
 */
int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len = xdp_get_buff_len(xdp);
        int ret;

        spin_lock_bh(&xs->rx_lock);

        ret = xsk_rcv_check(xs, xdp, len);
        if (ret)
                goto unlock;

        ret = __xsk_rcv(xs, xdp, len);
        /* The flush runs whether or not the copy succeeded. */
        xsk_flush(xs);

unlock:
        spin_unlock_bh(&xs->rx_lock);
        return ret;
}

/* Deliver @xdp to @xs.
 *
 * Buffers whose rxq memory type is MEM_TYPE_XSK_BUFF_POOL are handed to
 * xsk_rcv_zc() with the length of the head buffer only. Everything else
 * is copied with __xsk_rcv(), and the source buffer is returned once the
 * copy succeeded.
 */
static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        u32 len = xdp_get_buff_len(xdp);
        int ret;

        ret = xsk_rcv_check(xs, xdp, len);
        if (ret)
                return ret;

        if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL) {
                ret = __xsk_rcv(xs, xdp, len);
                if (ret)
                        return ret;

                xdp_return_buff(xdp);
                return 0;
        }

        len = xdp->data_end - xdp->data;
        return xsk_rcv_zc(xs, xdp, len);
}

/* Redirect @xdp into @xs and make sure the socket sits on the xskmap
 * flush list so that __xsk_map_flush() will flush it later.
 *
 * Returns 0 on success or the negative errno from xsk_rcv().
 */
int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp)
{
        int ret = xsk_rcv(xs, xdp);

        if (ret)
                return ret;

        /* A non-NULL ->prev is treated as "already on a flush list". */
        if (xs->flush_node.prev)
                return 0;

        list_add(&xs->flush_node, bpf_net_ctx_get_xskmap_flush_list());
        return 0;
}

/* Flush every socket queued on @flush_list and unlink it from the list.
 * The _safe iterator is used because each node is removed during the walk.
 */
void __xsk_map_flush(struct list_head *flush_list)
{
        struct xdp_sock *xs, *tmp;

        list_for_each_entry_safe(xs, tmp, flush_list, flush_node) {
                xsk_flush(xs);
                /* Presumably clears flush_node.prev, which is what
                 * __xsk_map_redirect() tests before re-adding the socket.
                 */
                __list_del_clearprev(&xs->flush_node);
        }
}

/* Submit @nb_entries entries on @pool's completion queue. No locking is
 * taken here.
 */
void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries)
{
        xskq_prod_submit_n(pool->cq, nb_entries);
}
EXPORT_SYMBOL(xsk_tx_completed);

/* For every socket on @pool's Tx list: release the consumer side of its
 * Tx ring and, if the ring now counts as writeable, invoke the socket's
 * write-space callback. The list is walked under the RCU read lock.
 */
void xsk_tx_release(struct xsk_buff_pool *pool)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
                __xskq_cons_release(xs->tx);
                if (xsk_tx_writeable(xs))
                        xs->sk.sk_write_space(&xs->sk);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(xsk_tx_release);

/* Fetch one Tx descriptor for @pool into @desc.
 *
 * Walks the sockets on pool->xsk_tx_list under the RCU read lock. A socket
 * whose tx_budget_spent has reached MAX_PER_SOCKET_BUDGET is skipped. If a
 * full walk yields nothing and at least one socket was skipped for that
 * reason, all budgets are reset and the walk is restarted.
 *
 * Returns true when @desc was filled and a completion queue slot was
 * reserved for its address; false when no descriptor could be taken or the
 * completion queue reservation failed.
 */
bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc)
{
        bool budget_exhausted = false;
        struct xdp_sock *xs;

        rcu_read_lock();
again:
        list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) {
                if (xs->tx_budget_spent >= MAX_PER_SOCKET_BUDGET) {
                        budget_exhausted = true;
                        continue;
                }

                if (!xskq_cons_peek_desc(xs->tx, desc, pool)) {
                        /* Peek failed although the ring reports
                         * descriptors: release the consumer before moving
                         * on to the next socket.
                         */
                        if (xskq_has_descs(xs->tx))
                                xskq_cons_release(xs->tx);
                        continue;
                }

                /* Charged even if the completion queue reservation below
                 * fails.
                 */
                xs->tx_budget_spent++;

                /* This is the backpressure mechanism for the Tx path.
                 * Reserve space in the completion queue and only proceed
                 * if there is space in it. This avoids having to implement
                 * any buffering in the Tx path.
                 */
                if (xskq_prod_reserve_addr(pool->cq, desc->addr))
                        goto out;

                xskq_cons_release(xs->tx);
                rcu_read_unlock();
                return true;
        }

        if (budget_exhausted) {
                /* Every remaining candidate ran out of budget: start a new
                 * round for all sockets and retry.
                 */
                list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list)
                        xs->tx_budget_spent = 0;

                budget_exhausted = false;
                goto again;
        }

out:
        rcu_read_unlock();
        return false;
}
EXPORT_SYMBOL(xsk_tx_peek_desc);

/* Non-batched fallback: pull descriptors one by one into pool->tx_descs
 * until @max_entries were taken or xsk_tx_peek_desc() reports nothing
 * more, then release the Tx rings. Returns the number of descriptors
 * stored.
 */
static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entries)
{
        struct xdp_desc *descs = pool->tx_descs;
        u32 count;

        for (count = 0; count < max_entries; count++) {
                if (!xsk_tx_peek_desc(pool, &descs[count]))
                        break;
        }

        xsk_tx_release(pool);
        return count;
}

/* Batched Tx descriptor fetch for @pool.
 *
 * When exactly one socket is on the pool's Tx list, up to @nb_pkts
 * descriptors are read from its Tx ring into pool->tx_descs and their
 * addresses are written to the completion queue. Otherwise the one-by-one
 * fallback is used.
 *
 * Returns the number of descriptors made available in pool->tx_descs.
 */
u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts)
{
        struct xdp_sock *xs;

        rcu_read_lock();
        if (!list_is_singular(&pool->xsk_tx_list)) {
                /* Fallback to the non-batched version */
                rcu_read_unlock();
                return xsk_tx_peek_release_fallback(pool, nb_pkts);
        }

        xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list);
        if (!xs) {
                nb_pkts = 0;
                goto out;
        }

        /* Clamp the request to what the Tx ring currently holds. */
        nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts);

        /* This is the backpressure mechanism for the Tx path. Try to
         * reserve space in the completion queue for all packets, but
         * if there are fewer slots available, just process that many
         * packets. This avoids having to implement any buffering in
         * the Tx path.
         */
        nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts);
        if (!nb_pkts)
                goto out;

        nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts);
        if (!nb_pkts) {
                xs->tx->queue_empty_descs++;
                goto out;
        }

        __xskq_cons_release(xs->tx);
        xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts);
        /* NOTE(review): unlike xsk_tx_release(), the write-space callback
         * is invoked here without checking xsk_tx_writeable() - confirm
         * this is intentional.
         */
        xs->sk.sk_write_space(&xs->sk);

out:
        rcu_read_unlock();
        return nb_pkts;
}
EXPORT_SYMBOL(xsk_tx_peek_release_desc_batch);

/* Forward a wakeup request with @flags to the driver's ndo_xsk_wakeup()
 * callback for the socket's device and queue. Returns the callback's
 * result.
 */
static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
{
        struct net_device *netdev = xs->dev;
        const struct net_device_ops *ops = netdev->netdev_ops;

        return ops->ndo_xsk_wakeup(netdev, xs->queue_id, flags);
}

/* Reserve a completion queue entry for @addr while holding pool->cq_lock
 * with interrupts saved and disabled. Returns the result of
 * xskq_prod_reserve_addr().
 */
static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
{
        unsigned long irq_flags;
        int rc;

        spin_lock_irqsave(&pool->cq_lock, irq_flags);
        rc = xskq_prod_reserve_addr(pool->cq, addr);
        spin_unlock_irqrestore(&pool->cq_lock, irq_flags);

        return rc;
}

/* Submit @n completion queue entries under pool->cq_lock, with interrupts
 * saved and disabled for the duration.
 */
static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pool->cq_lock, irq_flags);
        xskq_prod_submit_n(pool->cq, n);
        spin_unlock_irqrestore(&pool->cq_lock, irq_flags);
}

/* Cancel @n previously reserved completion queue entries under
 * pool->cq_lock, with interrupts saved and disabled for the duration.
 */
static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&pool->cq_lock, irq_flags);
        xskq_prod_cancel_n(pool->cq, n);
        spin_unlock_irqrestore(&pool->cq_lock, irq_flags);
}

/* The descriptor count of an skb is stashed in its shared info's
 * destructor_arg pointer. A NULL @skb counts as zero descriptors.
 */
static u32 xsk_get_num_desc(struct sk_buff *skb)
{
        if (!skb)
                return 0;

        return (long)skb_shinfo(skb)->destructor_arg;
}

/* skb destructor installed by xsk_build_skb(): optionally records a
 * completion timestamp, submits the skb's descriptor count to the pool's
 * completion queue, then hands the skb to sock_wfree().
 */
static void xsk_destruct_skb(struct sk_buff *skb)
{
        struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;

        if (compl->tx_timestamp) {
                /* sw completion timestamp, not a real one */
                *compl->tx_timestamp = ktime_get_tai_fast_ns();
        }

        xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
        sock_wfree(skb);
}

static void xsk_set_destructor_arg(struct sk_buff *skb)
{
        long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;

        skb_shinfo(skb)->destructor_arg = (void *)num;
}

/* Free @skb without completing its descriptors: the destructor is reset
 * to sock_wfree so xsk_destruct_skb() does not submit to the completion
 * queue, the reserved completion entries are cancelled instead, and the
 * socket's in-progress skb pointer is cleared.
 */
static void xsk_consume_skb(struct sk_buff *skb)
{
        struct xdp_sock *xs = xdp_sk(skb->sk);

        skb->destructor = sock_wfree;
        xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
        /* Free skb without triggering the perf drop trace */
        consume_skb(skb);
        xs->skb = NULL;
}

static void xsk_drop_skb(struct sk_buff *skb)
{
        xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb);
        xsk_consume_skb(skb);
}

/* Attach the umem pages backing @desc to the socket's skb as page
 * fragments, without copying the payload.
 *
 * If the socket has no in-progress skb (xs->skb is NULL), a new one is
 * allocated with headroom only. Fragments are appended after any that the
 * skb already carries.
 *
 * Returns the skb on success, or an ERR_PTR(): the allocation error from
 * sock_alloc_send_skb(), or -EOVERFLOW when MAX_SKB_FRAGS would be
 * exceeded.
 */
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
                                              struct xdp_desc *desc)
{
        struct xsk_buff_pool *pool = xs->pool;
        u32 hr, len, ts, offset, copy, copied;
        struct sk_buff *skb = xs->skb;
        struct page *page;
        void *buffer;
        int err, i;
        u64 addr;

        if (!skb) {
                hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));

                skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
                if (unlikely(!skb))
                        return ERR_PTR(err);

                skb_reserve(skb, hr);
        }

        addr = desc->addr;
        len = desc->len;
        /* Truesize charged for this descriptor: the payload length for
         * unaligned pools, a full chunk otherwise.
         */
        ts = pool->unaligned ? len : pool->chunk_size;

        buffer = xsk_buff_raw_get_data(pool, addr);
        offset = offset_in_page(buffer);
        /* Re-derive addr as the byte offset of the data from pool->addrs so
         * it can index the umem page array below.
         */
        addr = buffer - pool->addrs;

        for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) {
                if (unlikely(i >= MAX_SKB_FRAGS))
                        return ERR_PTR(-EOVERFLOW);

                page = pool->umem->pgs[addr >> PAGE_SHIFT];
                get_page(page);

                /* Only the first fragment may start mid-page. */
                copy = min_t(u32, PAGE_SIZE - offset, len - copied);
                skb_fill_page_desc(skb, i, page, offset, copy);

                copied += copy;
                addr += copy;
                offset = 0;
        }

        skb->len += len;
        skb->data_len += len;
        skb->truesize += ts;

        refcount_add(ts, &xs->sk.sk_wmem_alloc);

        return skb;
}

/* Build, or extend, the skb used to transmit @desc in the generic path.
 *
 * Devices with IFF_TX_SKB_NO_LINEAR get the umem pages attached via
 * xsk_build_skb_zerocopy(). Otherwise the payload is copied: the first
 * descriptor of a packet goes into the linear area of a freshly allocated
 * skb, each further descriptor into a newly allocated page added as a
 * fragment. Tx metadata is only handled on the first descriptor of the
 * copy path.
 *
 * On success the skb is returned with xsk_destruct_skb() as destructor and
 * its descriptor count updated.
 *
 * On failure an ERR_PTR() is returned. For -EOVERFLOW the in-progress
 * packet (xs->skb) is dropped and the Tx ring consumer is released; for
 * every other error one completion queue reservation is cancelled so the
 * application can retry.
 */
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
                                     struct xdp_desc *desc)
{
        struct xsk_tx_metadata *meta = NULL;
        struct net_device *dev = xs->dev;
        struct sk_buff *skb = xs->skb;
        bool first_frag = false;
        int err;

        if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
                skb = xsk_build_skb_zerocopy(xs, desc);
                if (IS_ERR(skb)) {
                        err = PTR_ERR(skb);
                        goto free_err;
                }
        } else {
                u32 hr, tr, len;
                void *buffer;

                buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
                len = desc->len;

                if (!skb) {
                        /* First descriptor of a packet: copy into the
                         * linear part of a new skb.
                         */
                        first_frag = true;

                        hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
                        tr = dev->needed_tailroom;
                        skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
                        if (unlikely(!skb))
                                goto free_err;

                        skb_reserve(skb, hr);
                        skb_put(skb, len);

                        err = skb_store_bits(skb, 0, buffer, len);
                        if (unlikely(err))
                                goto free_err;
                } else {
                        /* Continuation descriptor: copy into a new page and
                         * append it as a fragment.
                         */
                        int nr_frags = skb_shinfo(skb)->nr_frags;
                        struct page *page;
                        u8 *vaddr;

                        /* Taking the last fragment slot while more
                         * descriptors are announced cannot succeed.
                         */
                        if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
                                err = -EOVERFLOW;
                                goto free_err;
                        }

                        page = alloc_page(xs->sk.sk_allocation);
                        if (unlikely(!page)) {
                                err = -EAGAIN;
                                goto free_err;
                        }

                        vaddr = kmap_local_page(page);
                        memcpy(vaddr, buffer, len);
                        kunmap_local(vaddr);

                        skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
                        refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
                }

                /* hr is only assigned on the first_frag path, which is the
                 * only path that reaches the metadata handling below.
                 */
                if (first_frag && desc->options & XDP_TX_METADATA) {
                        if (unlikely(xs->pool->tx_metadata_len == 0)) {
                                err = -EINVAL;
                                goto free_err;
                        }

                        /* Metadata sits directly in front of the payload. */
                        meta = buffer - xs->pool->tx_metadata_len;
                        if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
                                err = -EINVAL;
                                goto free_err;
                        }

                        if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
                                /* The checksum field must lie inside the
                                 * copied payload.
                                 */
                                if (unlikely(meta->request.csum_start +
                                             meta->request.csum_offset +
                                             sizeof(__sum16) > len)) {
                                        err = -EINVAL;
                                        goto free_err;
                                }

                                skb->csum_start = hr + meta->request.csum_start;
                                skb->csum_offset = meta->request.csum_offset;
                                skb->ip_summed = CHECKSUM_PARTIAL;

                                if (unlikely(xs->pool->tx_sw_csum)) {
                                        err = skb_checksum_help(skb);
                                        if (err)
                                                goto free_err;
                                }
                        }

                        if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
                                skb->skb_mstamp_ns = meta->request.launch_time;
                }
        }

        skb->dev = dev;
        skb->priority = READ_ONCE(xs->sk.sk_priority);
        skb->mark = READ_ONCE(xs->sk.sk_mark);
        skb->destructor = xsk_destruct_skb;
        xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
        xsk_set_destructor_arg(skb);

        return skb;

free_err:
        /* Only an skb allocated by this call is freed here; an in-progress
         * xs->skb is handled below.
         */
        if (first_frag && skb)
                kfree_skb(skb);

        if (err == -EOVERFLOW) {
                /* Drop the packet */
                /* NOTE(review): xs->skb is dereferenced unconditionally
                 * here; this assumes -EOVERFLOW can only occur while a
                 * packet is in progress - confirm.
                 */
                xsk_set_destructor_arg(xs->skb);
                xsk_drop_skb(xs->skb);
                xskq_cons_release(xs->tx);
        } else {
                /* Let application retry */
                xsk_cq_cancel_locked(xs->pool, 1);
        }

        return ERR_PTR(err);
}

/* Generic (skb based) Tx path: drain up to TX_BATCH_SIZE descriptors from
 * the socket's Tx ring, build skbs from them and transmit them directly on
 * the bound queue. Runs under xs->mutex.
 *
 * Returns 0 on success or a negative errno:
 *   -ENXIO  - the socket is not bound
 *   -EAGAIN - batch limit reached, no completion queue space, or the
 *             device reported NETDEV_TX_BUSY
 *   -EBUSY  - the device reported NET_XMIT_DROP
 *   other   - error from xsk_build_skb()
 */
static int __xsk_generic_xmit(struct sock *sk)
{
        struct xdp_sock *xs = xdp_sk(sk);
        u32 max_batch = TX_BATCH_SIZE;
        bool sent_frame = false;
        struct xdp_desc desc;
        struct sk_buff *skb;
        int err = 0;

        mutex_lock(&xs->mutex);

        /* Since we dropped the RCU read lock, the socket state might have changed. */
        if (unlikely(!xsk_is_bound(xs))) {
                err = -ENXIO;
                goto out;
        }

        /* NOTE(review): an out-of-range queue id leaves err at 0, so the
         * caller sees success although nothing was sent - confirm this is
         * intended.
         */
        if (xs->queue_id >= xs->dev->real_num_tx_queues)
                goto out;

        while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
                if (max_batch-- == 0) {
                        err = -EAGAIN;
                        goto out;
                }

                /* This is the backpressure mechanism for the Tx path.
                 * Reserve space in the completion queue and only proceed
                 * if there is space in it. This avoids having to implement
                 * any buffering in the Tx path.
                 */
                err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
                if (err) {
                        err = -EAGAIN;
                        goto out;
                }

                skb = xsk_build_skb(xs, &desc);
                if (IS_ERR(skb)) {
                        err = PTR_ERR(skb);
                        /* -EOVERFLOW means the packet was dropped inside
                         * xsk_build_skb(); carry on with the next one.
                         */
                        if (err != -EOVERFLOW)
                                goto out;
                        err = 0;
                        continue;
                }

                xskq_cons_release(xs->tx);

                /* More descriptors belong to this packet: keep the skb on
                 * the socket and fetch the next descriptor.
                 */
                if (xp_mb_desc(&desc)) {
                        xs->skb = skb;
                        continue;
                }

                err = __dev_direct_xmit(skb, xs->queue_id);
                if  (err == NETDEV_TX_BUSY) {
                        /* Tell user-space to retry the send */
                        xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb));
                        xsk_consume_skb(skb);
                        err = -EAGAIN;
                        goto out;
                }

                /* Ignore NET_XMIT_CN as packet might have been sent */
                if (err == NET_XMIT_DROP) {
                        /* SKB completed but not sent */
                        err = -EBUSY;
                        xs->skb = NULL;
                        goto out;
                }

                sent_frame = true;
                xs->skb = NULL;
        }

        /* The peek loop stopped although the ring still reports
         * descriptors: drop any partially built packet and release the
         * consumer.
         */
        if (xskq_has_descs(xs->tx)) {
                if (xs->skb)
                        xsk_drop_skb(xs->skb);
                xskq_cons_release(xs->tx);
        }

out:
        if (sent_frame)
                if (xsk_tx_writeable(xs))
                        sk->sk_write_space(sk);

        mutex_unlock(&xs->mutex);
        return err;
}

/* Run the generic Tx path outside of the caller's RCU read-side section.
 * The RCU read lock is dropped on entry and re-taken before returning.
 * Returns the result of __xsk_generic_xmit().
 */
static int xsk_generic_xmit(struct sock *sk)
{
        int err;

        /* The SKB path might sleep, so leave the RCU read-side section. */
        rcu_read_unlock();
        err = __xsk_generic_xmit(sk);
        /* Reacquire the RCU lock before going back into common code. */
        rcu_read_lock();

        return err;
}

/* Decide whether the driver wakeup can be skipped for @sk.
 *
 * With CONFIG_NET_RX_BUSY_POLL this is true when the socket prefers busy
 * polling, has a non-zero sk_ll_usec and carries a valid NAPI id. Without
 * that config option it is always false.
 */
static bool xsk_no_wakeup(struct sock *sk)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
        /* Prefer busy-polling, skip the wakeup. */
        return READ_ONCE(sk->sk_prefer_busy_poll) && READ_ONCE(sk->sk_ll_usec) &&
                napi_id_valid(READ_ONCE(sk->sk_napi_id));
#else
        return false;
#endif
}

/* Preconditions shared by sendmsg, recvmsg and poll.
 *
 * Returns -ENXIO if the socket is not bound, -ENETDOWN if the bound device
 * is not IFF_UP, and 0 otherwise.
 */
static int xsk_check_common(struct xdp_sock *xs)
{
        int err = 0;

        if (unlikely(!xsk_is_bound(xs)))
                err = -ENXIO;
        else if (unlikely(!(xs->dev->flags & IFF_UP)))
                err = -ENETDOWN;

        return err;
}

/* sendmsg() body, called with the RCU read lock held.
 *
 * No payload is taken from @m; the call only kicks Tx processing. Blocking
 * calls are rejected with -EOPNOTSUPP and a socket without a Tx ring with
 * -ENOBUFS. When the pool asks for a Tx wakeup, zero-copy sockets wake the
 * driver and copy-mode sockets run the generic xmit path.
 */
static int __xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        bool blocking = !(m->msg_flags & MSG_DONTWAIT);
        int ret;

        ret = xsk_check_common(xs);
        if (ret)
                return ret;
        if (unlikely(blocking))
                return -EOPNOTSUPP;
        if (unlikely(!xs->tx))
                return -ENOBUFS;

        if (sk_can_busy_loop(sk))
                sk_busy_loop(sk, 1); /* only support non-blocking sockets */

        if (xs->zc && xsk_no_wakeup(sk))
                return 0;

        /* Nothing to do unless the pool requested a Tx wakeup. */
        if (!(xs->pool->cached_need_wakeup & XDP_WAKEUP_TX))
                return 0;

        if (xs->zc)
                return xsk_wakeup(xs, XDP_WAKEUP_TX);

        return xsk_generic_xmit(sk);
}

/* sendmsg() entry point: runs __xsk_sendmsg() inside an RCU read-side
 * critical section and returns its result.
 */
static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        int err;

        rcu_read_lock();
        err = __xsk_sendmsg(sock, m, total_len);
        rcu_read_unlock();

        return err;
}

/* recvmsg() body, called with the RCU read lock held.
 *
 * No data is copied into @m; the call only kicks Rx processing. A socket
 * without an Rx ring gets -ENOBUFS and blocking calls get -EOPNOTSUPP. The
 * driver is woken only for zero-copy sockets whose pool requested an Rx
 * wakeup.
 */
static int __xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        bool blocking = !(flags & MSG_DONTWAIT);
        int ret;

        ret = xsk_check_common(xs);
        if (ret)
                return ret;
        if (unlikely(!xs->rx))
                return -ENOBUFS;
        if (unlikely(blocking))
                return -EOPNOTSUPP;

        if (sk_can_busy_loop(sk))
                sk_busy_loop(sk, 1); /* only support non-blocking sockets */

        if (xsk_no_wakeup(sk))
                return 0;

        if (!(xs->pool->cached_need_wakeup & XDP_WAKEUP_RX) || !xs->zc)
                return 0;

        return xsk_wakeup(xs, XDP_WAKEUP_RX);
}

/* recvmsg() entry point: runs __xsk_recvmsg() inside an RCU read-side
 * critical section and returns its result.
 */
static int xsk_recvmsg(struct socket *sock, struct msghdr *m, size_t len, int flags)
{
        int err;

        rcu_read_lock();
        err = __xsk_recvmsg(sock, m, len, flags);
        rcu_read_unlock();

        return err;
}

/* poll() handler.
 *
 * Registers on the socket wait queue, then - if the socket passes
 * xsk_check_common() - kicks the driver (zero-copy) or the generic Tx path
 * (copy mode with a Tx ring) when the pool has a wakeup pending.
 *
 * Returns EPOLLIN | EPOLLRDNORM when the Rx ring is not empty and
 * EPOLLOUT | EPOLLWRNORM when the Tx ring counts as writeable; 0 if the
 * common checks fail.
 */
static __poll_t xsk_poll(struct file *file, struct socket *sock,
                             struct poll_table_struct *wait)
{
        __poll_t mask = 0;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct xsk_buff_pool *pool;

        sock_poll_wait(file, sock, wait);

        rcu_read_lock();
        if (xsk_check_common(xs))
                goto out;

        pool = xs->pool;

        /* Return values of the wakeup / xmit calls are deliberately not
         * propagated; poll only reports the resulting ring state.
         */
        if (pool->cached_need_wakeup) {
                if (xs->zc)
                        xsk_wakeup(xs, pool->cached_need_wakeup);
                else if (xs->tx)
                        /* Poll needs to drive Tx also in copy mode */
                        xsk_generic_xmit(sk);
        }

        if (xs->rx && !xskq_prod_is_empty(xs->rx))
                mask |= EPOLLIN | EPOLLRDNORM;
        if (xs->tx && xsk_tx_writeable(xs))
                mask |= EPOLLOUT | EPOLLWRNORM;
out:
        rcu_read_unlock();
        return mask;
}

/* Create a ring with @entries slots and publish it through @queue.
 *
 * Returns -EINVAL if @queue already holds a ring or @entries is zero or
 * not a power of two, -ENOMEM if the ring cannot be created, 0 on success.
 */
static int xsk_init_queue(u32 entries, struct xsk_queue **queue,
                          bool umem_queue)
{
        struct xsk_queue *ring;

        if (*queue)
                return -EINVAL;
        if (entries == 0 || !is_power_of_2(entries))
                return -EINVAL;

        ring = xskq_create(entries, umem_queue);
        if (!ring)
                return -ENOMEM;

        /* Make sure queue is ready before it can be seen by others */
        smp_wmb();
        WRITE_ONCE(*queue, ring);

        return 0;
}

/* Detach a bound socket from its device: flip the state to XSK_UNBOUND,
 * remove the socket from its pool, wait with synchronize_net() and drop
 * the device reference. Does nothing if the socket is not XSK_BOUND.
 */
static void xsk_unbind_dev(struct xdp_sock *xs)
{
        struct net_device *dev = xs->dev;

        if (xs->state != XSK_BOUND)
                return;
        WRITE_ONCE(xs->state, XSK_UNBOUND);

        /* Wait for driver to stop using the xdp socket. */
        xp_del_xsk(xs->pool, xs);
        synchronize_net();
        dev_put(dev);
}

/* Look up the first map the socket is a member of.
 *
 * Under xs->map_list_lock, takes the first node of xs->map_list (if any),
 * grabs a reference on its map and stores the node's map entry pointer in
 * @map_entry.
 *
 * Returns the referenced map, or NULL with *@map_entry set to NULL when
 * the list is empty. The caller must drop the map reference.
 */
static struct xsk_map *xsk_get_map_list_entry(struct xdp_sock *xs,
                                              struct xdp_sock __rcu ***map_entry)
{
        struct xsk_map *map = NULL;
        struct xsk_map_node *node;

        *map_entry = NULL;

        spin_lock_bh(&xs->map_list_lock);
        node = list_first_entry_or_null(&xs->map_list, struct xsk_map_node,
                                        node);
        if (node) {
                bpf_map_inc(&node->map->map);
                map = node->map;
                *map_entry = node->map_entry;
        }
        spin_unlock_bh(&xs->map_list_lock);
        return map;
}

/* Remove the XDP socket from every map it currently resides in.
 *
 * Two locks are involved: each map has a lock synchronizing updates to
 * its entries, and each socket has a lock synchronizing access to its
 * list of maps (map_list). To avoid deadlock they must be taken in the
 * order "map lock" -> "socket map list lock". Therefore the map list is
 * consulted first and a reference to the map is taken, which guarantees
 * the map exists between xsk_get_map_list_entry() and
 * xsk_map_try_sock_delete(). The map is then asked to remove the socket.
 * The map may still be updated between those two calls.
 */
static void xsk_delete_from_maps(struct xdp_sock *xs)
{
        struct xdp_sock __rcu **map_entry = NULL;
        struct xsk_map *map;

        for (;;) {
                map = xsk_get_map_list_entry(xs, &map_entry);
                if (!map)
                        break;

                xsk_map_try_sock_delete(map, xs, map_entry);
                bpf_map_put(&map->map);
        }
}

/* release() handler: tears down an AF_XDP socket.
 *
 * Drops any partially built Tx skb, unlinks the socket from the per-net
 * list and from all maps, unbinds it from its device, destroys its rings
 * and finally orphans the sock and drops a reference on it. Always
 * returns 0.
 */
static int xsk_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        /* NOTE(review): xs is derived from sk before the NULL check below;
         * presumably xdp_sk() is a plain cast that does not dereference
         * sk - confirm.
         */
        struct xdp_sock *xs = xdp_sk(sk);
        struct net *net;

        if (!sk)
                return 0;

        net = sock_net(sk);

        if (xs->skb)
                xsk_drop_skb(xs->skb);

        mutex_lock(&net->xdp.lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&net->xdp.lock);

        sock_prot_inuse_add(net, sk->sk_prot, -1);

        xsk_delete_from_maps(xs);
        mutex_lock(&xs->mutex);
        xsk_unbind_dev(xs);
        mutex_unlock(&xs->mutex);

        /* fq_tmp/cq_tmp are set to NULL by xsk_bind() once the pool owns
         * them; xskq_destroy() is presumably NULL-tolerant - confirm.
         */
        xskq_destroy(xs->rx);
        xskq_destroy(xs->tx);
        xskq_destroy(xs->fq_tmp);
        xskq_destroy(xs->cq_tmp);

        sock_orphan(sk);
        sock->sk = NULL;

        sock_put(sk);

        return 0;
}

/* Resolve @fd to an AF_XDP socket.
 *
 * Returns the socket with a reference held (release with sockfd_put()),
 * ERR_PTR(-ENOTSOCK) when the lookup fails, or ERR_PTR(-ENOPROTOOPT) when
 * the fd refers to a socket of another family.
 *
 * NOTE(review): the error code reported by sockfd_lookup() is discarded
 * and every lookup failure is reported as -ENOTSOCK.
 */
static struct socket *xsk_lookup_xsk_from_fd(int fd)
{
        struct socket *found;
        int lookup_err;

        found = sockfd_lookup(fd, &lookup_err);
        if (!found)
                return ERR_PTR(-ENOTSOCK);

        if (found->sk->sk_family == PF_XDP)
                return found;

        sockfd_put(found);
        return ERR_PTR(-ENOPROTOOPT);
}

static bool xsk_validate_queues(struct xdp_sock *xs)
{
        return xs->fq_tmp && xs->cq_tmp;
}

/* bind() handler for AF_XDP sockets.
 *
 * Validates the sockaddr_xdp, takes a reference on the target device and
 * attaches the socket to a buffer pool, in one of three ways:
 *   - XDP_SHARED_UMEM on a different device/queue than the socket behind
 *     sxdp_shared_umem_fd: a new pool is created on the shared umem;
 *   - XDP_SHARED_UMEM on the same device and queue: the other socket's
 *     pool is shared;
 *   - otherwise: a new pool is created on the socket's own umem.
 * On success the socket is added to the pool and its state becomes
 * XSK_BOUND.
 *
 * Runs under rtnl_lock(), xs->mutex and the device ops lock.
 *
 * Returns 0 on success or a negative errno; on failure the device
 * reference is dropped again.
 */
static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
{
        struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr;
        struct sock *sk = sock->sk;
        struct xdp_sock *xs = xdp_sk(sk);
        struct net_device *dev;
        int bound_dev_if;
        u32 flags, qid;
        int err = 0;

        if (addr_len < sizeof(struct sockaddr_xdp))
                return -EINVAL;
        if (sxdp->sxdp_family != AF_XDP)
                return -EINVAL;

        flags = sxdp->sxdp_flags;
        if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY |
                      XDP_USE_NEED_WAKEUP | XDP_USE_SG))
                return -EINVAL;

        /* A socket already tied to a device may only bind to that one. */
        bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
        if (bound_dev_if && bound_dev_if != sxdp->sxdp_ifindex)
                return -EINVAL;

        rtnl_lock();
        mutex_lock(&xs->mutex);
        if (xs->state != XSK_READY) {
                err = -EBUSY;
                goto out_release;
        }

        dev = dev_get_by_index(sock_net(sk), sxdp->sxdp_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out_release;
        }

        netdev_lock_ops(dev);

        /* At least one of the Rx and Tx rings must have been created. */
        if (!xs->rx && !xs->tx) {
                err = -EINVAL;
                goto out_unlock;
        }

        qid = sxdp->sxdp_queue_id;

        if (flags & XDP_SHARED_UMEM) {
                struct xdp_sock *umem_xs;
                /* Shadows the function parameter; refers to the socket
                 * that owns the umem being shared.
                 */
                struct socket *sock;

                if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) ||
                    (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) {
                        /* Cannot specify flags for shared sockets. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                if (xs->umem) {
                        /* We have already our own. */
                        err = -EINVAL;
                        goto out_unlock;
                }

                sock = xsk_lookup_xsk_from_fd(sxdp->sxdp_shared_umem_fd);
                if (IS_ERR(sock)) {
                        err = PTR_ERR(sock);
                        goto out_unlock;
                }

                umem_xs = xdp_sk(sock->sk);
                if (!xsk_is_bound(umem_xs)) {
                        err = -EBADF;
                        sockfd_put(sock);
                        goto out_unlock;
                }

                if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
                        /* Share the umem with another socket on another qid
                         * and/or device.
                         */
                        xs->pool = xp_create_and_assign_umem(xs,
                                                             umem_xs->umem);
                        if (!xs->pool) {
                                err = -ENOMEM;
                                sockfd_put(sock);
                                goto out_unlock;
                        }

                        err = xp_assign_dev_shared(xs->pool, umem_xs, dev,
                                                   qid);
                        if (err) {
                                xp_destroy(xs->pool);
                                xs->pool = NULL;
                                sockfd_put(sock);
                                goto out_unlock;
                        }
                } else {
                        /* Share the buffer pool with the other socket. */
                        if (xs->fq_tmp || xs->cq_tmp) {
                                /* Do not allow setting your own fq or cq. */
                                err = -EINVAL;
                                sockfd_put(sock);
                                goto out_unlock;
                        }

                        xp_get_pool(umem_xs->pool);
                        xs->pool = umem_xs->pool;

                        /* If underlying shared umem was created without Tx
                         * ring, allocate Tx descs array that Tx batching API
                         * utilizes
                         */
                        if (xs->tx && !xs->pool->tx_descs) {
                                err = xp_alloc_tx_descs(xs->pool, xs);
                                if (err) {
                                        xp_put_pool(xs->pool);
                                        xs->pool = NULL;
                                        sockfd_put(sock);
                                        goto out_unlock;
                                }
                        }
                }

                xdp_get_umem(umem_xs->umem);
                WRITE_ONCE(xs->umem, umem_xs->umem);
                sockfd_put(sock);
        } else if (!xs->umem || !xsk_validate_queues(xs)) {
                /* A non-shared bind needs a registered umem plus both the
                 * fill and completion rings.
                 */
                err = -EINVAL;
                goto out_unlock;
        } else {
                /* This xsk has its own umem. */
                xs->pool = xp_create_and_assign_umem(xs, xs->umem);
                if (!xs->pool) {
                        err = -ENOMEM;
                        goto out_unlock;
                }

                err = xp_assign_dev(xs->pool, dev, qid, flags);
                if (err) {
                        xp_destroy(xs->pool);
                        xs->pool = NULL;
                        goto out_unlock;
                }
        }

        /* FQ and CQ are now owned by the buffer pool and cleaned up with it. */
        xs->fq_tmp = NULL;
        xs->cq_tmp = NULL;

        xs->dev = dev;
        xs->zc = xs->umem->zc;
        xs->sg = !!(xs->umem->flags & XDP_UMEM_SG_FLAG);
        xs->queue_id = qid;
        xp_add_xsk(xs->pool, xs);

        /* For zero-copy sockets, record the NAPI id of the bound Rx queue
         * on the sock when the queue has a NAPI instance.
         */
        if (xs->zc && qid < dev->real_num_rx_queues) {
                struct netdev_rx_queue *rxq;

                rxq = __netif_get_rx_queue(dev, qid);
                if (rxq->napi)
                        __sk_mark_napi_id_once(sk, rxq->napi->napi_id);
        }

out_unlock:
        if (err) {
                dev_put(dev);
        } else {
                /* Matches smp_rmb() in bind() for shared umem
                 * sockets, and xsk_is_bound().
                 */
                smp_wmb();
                WRITE_ONCE(xs->state, XSK_BOUND);
        }
        netdev_unlock_ops(dev);
out_release:
        mutex_unlock(&xs->mutex);
        rtnl_unlock();
        return err;
}

/* Shorter layout of the XDP_UMEM_REG argument. The setsockopt handler
 * falls back to this size when the supplied optlen is smaller than
 * struct xdp_umem_reg.
 */
struct xdp_umem_reg_v1 {
        __u64 addr; /* Start of packet data area */
        __u64 len; /* Length of packet data area */
        __u32 chunk_size;
        __u32 headroom;
};

/*
 * xsk_setsockopt - configure an AF_XDP socket before it is bound.
 * @sock:    socket being configured
 * @level:   must be SOL_XDP, otherwise -ENOPROTOOPT is returned
 * @optname: XDP_RX_RING, XDP_TX_RING, XDP_UMEM_REG, XDP_UMEM_FILL_RING or
 *           XDP_UMEM_COMPLETION_RING; anything else yields -ENOPROTOOPT
 * @optval:  option payload (an int entry count for the ring options, a
 *           struct xdp_umem_reg or its v1 prefix for XDP_UMEM_REG)
 * @optlen:  size of the payload supplied by the caller
 *
 * Every option is applied under xs->mutex and only while the socket is in
 * the XSK_READY state; otherwise -EBUSY is returned. XDP_UMEM_REG also
 * returns -EBUSY when a umem is already attached.
 *
 * Return: 0 on success, -EINVAL for a too-short payload, -EFAULT when the
 * payload cannot be copied in, or the error reported by xsk_init_queue() /
 * xdp_umem_create().
 */
static int xsk_setsockopt(struct socket *sock, int level, int optname,
                          sockptr_t optval, unsigned int optlen)
{
        struct xdp_sock *xs = xdp_sk(sock->sk);
        int err;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        switch (optname) {
        case XDP_RX_RING:
        case XDP_TX_RING:
        {
                const bool is_tx = (optname == XDP_TX_RING);
                struct xsk_queue **ring_slot = is_tx ? &xs->tx : &xs->rx;
                int nentries;

                if (optlen < sizeof(nentries))
                        return -EINVAL;
                if (copy_from_sockptr(&nentries, optval, sizeof(nentries)))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state == XSK_READY) {
                        err = xsk_init_queue(nentries, ring_slot, false);
                        /* Tx needs to be explicitly woken up the first time */
                        if (is_tx && !err)
                                xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP;
                } else {
                        err = -EBUSY;
                }
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_REG:
        {
                struct xdp_umem_reg mr = {};
                struct xdp_umem *umem;
                size_t copy_len;

                BUILD_BUG_ON(sizeof(struct xdp_umem_reg_v1) >= sizeof(struct xdp_umem_reg));

                /* The last field of struct xdp_umem_reg must end exactly at
                 * the end of the structure: no implicit trailing padding.
                 * Any padding has to be explicit and zeroed by user space so
                 * that the structure can be extended in the future.
                 */
                BUILD_BUG_ON(offsetof(struct xdp_umem_reg, tx_metadata_len) +
                             sizeof_field(struct xdp_umem_reg, tx_metadata_len) !=
                             sizeof(struct xdp_umem_reg));

                if (optlen < sizeof(struct xdp_umem_reg_v1))
                        return -EINVAL;

                /* A payload shorter than the full structure is read as v1. */
                copy_len = (optlen < sizeof(mr)) ?
                           sizeof(struct xdp_umem_reg_v1) : sizeof(mr);

                if (copy_from_sockptr(&mr, optval, copy_len))
                        return -EFAULT;

                mutex_lock(&xs->mutex);
                if (xs->state != XSK_READY || xs->umem) {
                        err = -EBUSY;
                } else {
                        umem = xdp_umem_create(&mr);
                        if (IS_ERR(umem)) {
                                err = PTR_ERR(umem);
                        } else {
                                /* Make sure umem is ready before it can be
                                 * seen by others
                                 */
                                smp_wmb();
                                WRITE_ONCE(xs->umem, umem);
                                err = 0;
                        }
                }
                mutex_unlock(&xs->mutex);
                return err;
        }
        case XDP_UMEM_FILL_RING:
        case XDP_UMEM_COMPLETION_RING:
        {
                struct xsk_queue **ring_slot;
                int nentries;

                if (optlen < sizeof(nentries))
                        return -EINVAL;
                if (copy_from_sockptr(&nentries, optval, sizeof(nentries)))
                        return -EFAULT;

                if (optname == XDP_UMEM_FILL_RING)
                        ring_slot = &xs->fq_tmp;
                else
                        ring_slot = &xs->cq_tmp;

                mutex_lock(&xs->mutex);
                if (xs->state == XSK_READY)
                        err = xsk_init_queue(nentries, ring_slot, true);
                else
                        err = -EBUSY;
                mutex_unlock(&xs->mutex);
                return err;
        }
        default:
                break;
        }

        return -ENOPROTOOPT;
}

static void xsk_enter_rxtx_offsets(struct xdp_ring_offset_v1 *ring)
{
        ring->producer = offsetof(struct xdp_rxtx_ring, ptrs.producer);
        ring->consumer = offsetof(struct xdp_rxtx_ring, ptrs.consumer);
        ring->desc = offsetof(struct xdp_rxtx_ring, desc);
}

static void xsk_enter_umem_offsets(struct xdp_ring_offset_v1 *ring)
{
        ring->producer = offsetof(struct xdp_umem_ring, ptrs.producer);
        ring->consumer = offsetof(struct xdp_umem_ring, ptrs.consumer);
        ring->desc = offsetof(struct xdp_umem_ring, desc);
}

/*
 * Short (v1) layout of the XDP_STATISTICS result.
 *
 * xsk_getsockopt() rejects a buffer smaller than this structure. When the
 * buffer is smaller than struct xdp_statistics it copies out only
 * sizeof(struct xdp_statistics_v1) bytes, and in that case rx_dropped also
 * includes the socket's rx_queue_full count.
 */
struct xdp_statistics_v1 {
        __u64 rx_dropped;
        __u64 rx_invalid_descs;
        __u64 tx_invalid_descs;
};

/*
 * xsk_getsockopt - report AF_XDP socket information to user space.
 * @sock:    socket being queried
 * @level:   must be SOL_XDP, otherwise -ENOPROTOOPT is returned
 * @optname: XDP_STATISTICS, XDP_MMAP_OFFSETS or XDP_OPTIONS; anything else
 *           yields -EOPNOTSUPP
 * @optval:  user buffer receiving the result
 * @optlen:  in: size of @optval; out: number of bytes written
 *
 * XDP_STATISTICS and XDP_MMAP_OFFSETS each accept two buffer sizes: the
 * current structure, or the shorter v1 structure for callers that pass a
 * smaller buffer.
 *
 * Return: 0 on success, -EINVAL for a negative or too-small length,
 * -EFAULT when user memory cannot be accessed.
 */
static int xsk_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct xdp_sock *xs = xdp_sk(sock->sk);
        int len;

        if (level != SOL_XDP)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case XDP_STATISTICS:
        {
                struct xdp_statistics stats = {};
                size_t stats_size = sizeof(stats);
                bool extra_stats = true;

                if (len < sizeof(struct xdp_statistics_v1))
                        return -EINVAL;
                if (len < sizeof(stats)) {
                        /* Short buffer: answer with the v1 layout only. */
                        stats_size = sizeof(struct xdp_statistics_v1);
                        extra_stats = false;
                }

                mutex_lock(&xs->mutex);
                stats.rx_dropped = xs->rx_dropped;
                if (!extra_stats) {
                        /* v1 has no rx_ring_full field; count it as drops. */
                        stats.rx_dropped += xs->rx_queue_full;
                } else {
                        stats.rx_ring_full = xs->rx_queue_full;
                        /* Stays at its zero initializer without a pool. */
                        if (xs->pool)
                                stats.rx_fill_ring_empty_descs =
                                        xskq_nb_queue_empty_descs(xs->pool->fq);
                        stats.tx_ring_empty_descs = xskq_nb_queue_empty_descs(xs->tx);
                }
                stats.rx_invalid_descs = xskq_nb_invalid_descs(xs->rx);
                stats.tx_invalid_descs = xskq_nb_invalid_descs(xs->tx);
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &stats, stats_size) ||
                    put_user(stats_size, optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_MMAP_OFFSETS:
        {
                struct xdp_mmap_offsets_v1 off_v1;
                struct xdp_mmap_offsets off;
                void *to_copy;

                if (len < sizeof(off_v1))
                        return -EINVAL;

                if (len < sizeof(off)) {
                        /* Caller only has room for the layout without flags. */
                        xsk_enter_rxtx_offsets(&off_v1.rx);
                        xsk_enter_rxtx_offsets(&off_v1.tx);
                        xsk_enter_umem_offsets(&off_v1.fr);
                        xsk_enter_umem_offsets(&off_v1.cr);

                        to_copy = &off_v1;
                        len = sizeof(off_v1);
                } else {
                        /* xdp_ring_offset is identical to xdp_ring_offset_v1
                         * except for the flags field added to the end.
                         */
                        xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
                                               &off.rx);
                        off.rx.flags = offsetof(struct xdp_rxtx_ring,
                                                ptrs.flags);

                        xsk_enter_rxtx_offsets((struct xdp_ring_offset_v1 *)
                                               &off.tx);
                        off.tx.flags = offsetof(struct xdp_rxtx_ring,
                                                ptrs.flags);

                        xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
                                               &off.fr);
                        off.fr.flags = offsetof(struct xdp_umem_ring,
                                                ptrs.flags);

                        xsk_enter_umem_offsets((struct xdp_ring_offset_v1 *)
                                               &off.cr);
                        off.cr.flags = offsetof(struct xdp_umem_ring,
                                                ptrs.flags);

                        to_copy = &off;
                        len = sizeof(off);
                }

                if (copy_to_user(optval, to_copy, len) ||
                    put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        case XDP_OPTIONS:
        {
                struct xdp_options opts = {};

                if (len < sizeof(opts))
                        return -EINVAL;
                len = sizeof(opts);

                mutex_lock(&xs->mutex);
                if (xs->zc)
                        opts.flags |= XDP_OPTIONS_ZEROCOPY;
                mutex_unlock(&xs->mutex);

                if (copy_to_user(optval, &opts, len) ||
                    put_user(len, optlen))
                        return -EFAULT;

                return 0;
        }
        default:
                break;
        }

        return -EOPNOTSUPP;
}

/*
 * xsk_mmap - map one of the socket's rings into user space.
 * @file: unused by this function
 * @sock: AF_XDP socket whose ring is requested
 * @vma:  mapping request; vm_pgoff (converted to a byte offset) selects the
 *        ring and vm_end - vm_start is the requested size
 *
 * The function takes no lock; it reads the socket state and the ring
 * pointers with READ_ONCE() and relies on the read barriers below.
 *
 * Return: -EBUSY when the socket is neither XSK_READY nor XSK_BOUND,
 * -EINVAL when the offset names no ring, the selected ring has not been
 * created, or the requested size exceeds the ring's ring_vmalloc_size;
 * otherwise the result of remap_vmalloc_range().
 */
static int xsk_mmap(struct file *file, struct socket *sock,
                    struct vm_area_struct *vma)
{
        loff_t offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
        unsigned long size = vma->vm_end - vma->vm_start;
        struct xdp_sock *xs = xdp_sk(sock->sk);
        int state = READ_ONCE(xs->state);
        struct xsk_queue *q = NULL;

        if (state != XSK_READY && state != XSK_BOUND)
                return -EBUSY;

        if (offset == XDP_PGOFF_RX_RING) {
                q = READ_ONCE(xs->rx);
        } else if (offset == XDP_PGOFF_TX_RING) {
                q = READ_ONCE(xs->tx);
        } else {
                /* Matches the smp_wmb() in XDP_UMEM_REG */
                smp_rmb();
                /* While the socket is still XSK_READY the fill and
                 * completion rings are read from fq_tmp/cq_tmp; once it is
                 * XSK_BOUND they are read from the buffer pool.
                 */
                if (offset == XDP_UMEM_PGOFF_FILL_RING)
                        q = state == XSK_READY ? READ_ONCE(xs->fq_tmp) :
                                                 READ_ONCE(xs->pool->fq);
                else if (offset == XDP_UMEM_PGOFF_COMPLETION_RING)
                        q = state == XSK_READY ? READ_ONCE(xs->cq_tmp) :
                                                 READ_ONCE(xs->pool->cq);
        }

        /* Unknown offset, or the selected ring does not exist yet. */
        if (!q)
                return -EINVAL;

        /* Matches the smp_wmb() in xsk_init_queue */
        smp_rmb();
        if (size > q->ring_vmalloc_size)
                return -EINVAL;

        return remap_vmalloc_range(vma, q->ring, 0);
}

/*
 * xsk_notifier - netdevice notifier callback for AF_XDP sockets.
 * @this: notifier block (unused)
 * @msg:  netdevice event; only NETDEV_UNREGISTER is acted upon
 * @ptr:  notifier info from which the net_device is obtained
 *
 * On NETDEV_UNREGISTER, walk the XDP socket list of the device's network
 * namespace under net->xdp.lock. For every socket bound to the device
 * (checked under the socket's own mutex): set sk_err to ENETDOWN, report
 * the error unless the socket is already dead, then call xsk_unbind_dev()
 * and xp_clear_dev() on its pool.
 *
 * Return: always NOTIFY_DONE.
 */
static int xsk_notifier(struct notifier_block *this,
                        unsigned long msg, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct sock *sk;

        if (msg != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        mutex_lock(&net->xdp.lock);
        sk_for_each(sk, &net->xdp.list) {
                struct xdp_sock *xs = xdp_sk(sk);

                mutex_lock(&xs->mutex);
                if (xs->dev == dev) {
                        sk->sk_err = ENETDOWN;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);

                        xsk_unbind_dev(xs);

                        /* Clear device references. */
                        xp_clear_dev(xs->pool);
                }
                mutex_unlock(&xs->mutex);
        }
        mutex_unlock(&net->xdp.lock);

        return NOTIFY_DONE;
}

/*
 * Protocol descriptor for AF_XDP sockets: registered by xsk_init() and
 * handed to sk_alloc() in xsk_create(), so each socket is allocated with
 * room for a struct xdp_sock.
 */
static struct proto xsk_proto = {
        .owner    = THIS_MODULE,
        .name     = "XDP",
        .obj_size = sizeof(struct xdp_sock),
};

/*
 * Socket-layer operations for PF_XDP sockets, installed on each new socket
 * by xsk_create().
 */
static const struct proto_ops xsk_proto_ops = {
        .family     = PF_XDP,
        .owner      = THIS_MODULE,

        /* Operations implemented by this file. */
        .release    = xsk_release,
        .bind       = xsk_bind,
        .poll       = xsk_poll,
        .setsockopt = xsk_setsockopt,
        .getsockopt = xsk_getsockopt,
        .sendmsg    = xsk_sendmsg,
        .recvmsg    = xsk_recvmsg,
        .mmap       = xsk_mmap,

        /* Operations wired to the generic sock_no_*() stubs. */
        .connect    = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept     = sock_no_accept,
        .getname    = sock_no_getname,
        .ioctl      = sock_no_ioctl,
        .listen     = sock_no_listen,
        .shutdown   = sock_no_shutdown,
};

/*
 * xsk_destruct - sk_destruct callback installed by xsk_create().
 *
 * Does nothing unless the socket is flagged SOCK_DEAD. Otherwise it drops
 * the socket's buffer-pool reference via xp_put_pool(); when that call
 * returns false, the umem reference is dropped directly with
 * xdp_put_umem(), passing whether the socket has no pool.
 */
static void xsk_destruct(struct sock *sk)
{
        struct xdp_sock *xs;

        if (!sock_flag(sk, SOCK_DEAD))
                return;

        xs = xdp_sk(sk);
        if (xp_put_pool(xs->pool))
                return;

        xdp_put_umem(xs->umem, !xs->pool);
}

/*
 * xsk_create - create handler of the PF_XDP protocol family.
 * @net:      network namespace the socket is created in
 * @sock:     socket structure to initialise
 * @protocol: must be 0
 * @kern:     passed through to sk_alloc()
 *
 * Allocates the sock, installs xsk_proto_ops and xsk_destruct, puts the
 * xdp_sock into the XSK_READY state and adds it to the namespace's XDP
 * socket list.
 *
 * Return: 0 on success, -EPERM without CAP_NET_RAW in the namespace's user
 * namespace, -ESOCKTNOSUPPORT for a type other than SOCK_RAW,
 * -EPROTONOSUPPORT for a non-zero protocol, -ENOBUFS when sk_alloc() fails.
 */
static int xsk_create(struct net *net, struct socket *sock, int protocol,
                      int kern)
{
        struct xdp_sock *xs;
        struct sock *sk;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_RAW)
                return -ESOCKTNOSUPPORT;
        if (protocol != 0)
                return -EPROTONOSUPPORT;

        sock->state = SS_UNCONNECTED;

        sk = sk_alloc(net, PF_XDP, GFP_KERNEL, &xsk_proto, kern);
        if (!sk)
                return -ENOBUFS;

        /* Generic socket setup. */
        sock->ops = &xsk_proto_ops;
        sock_init_data(sock, sk);
        sk->sk_family = PF_XDP;
        sk->sk_destruct = xsk_destruct;
        sock_set_flag(sk, SOCK_RCU_FREE);

        /* AF_XDP private state; done before the socket is put on the list. */
        xs = xdp_sk(sk);
        mutex_init(&xs->mutex);
        spin_lock_init(&xs->rx_lock);
        spin_lock_init(&xs->map_list_lock);
        INIT_LIST_HEAD(&xs->map_list);
        xs->state = XSK_READY;

        /* Add the socket to the list walked by xsk_notifier(). */
        mutex_lock(&net->xdp.lock);
        sk_add_node_rcu(sk, &net->xdp.list);
        mutex_unlock(&net->xdp.lock);

        sock_prot_inuse_add(net, &xsk_proto, 1);

        return 0;
}

/* PF_XDP address-family descriptor; registered by xsk_init(). */
static const struct net_proto_family xsk_family_ops = {
        .owner  = THIS_MODULE,
        .family = PF_XDP,
        .create = xsk_create,
};

/* Netdevice notifier that routes events to xsk_notifier(); registered by xsk_init(). */
static struct notifier_block xsk_netdev_notifier = {
        .notifier_call        = xsk_notifier,
};

/*
 * Per-network-namespace init: start with an empty XDP socket list and
 * initialise the mutex that protects it. Always returns 0.
 */
static int __net_init xsk_net_init(struct net *net)
{
        INIT_HLIST_HEAD(&net->xdp.list);
        mutex_init(&net->xdp.lock);

        return 0;
}

/*
 * Per-network-namespace exit: warn (once) if the namespace's XDP socket
 * list is not empty at this point.
 */
static void __net_exit xsk_net_exit(struct net *net)
{
        WARN_ON_ONCE(!hlist_empty(&net->xdp.list));
}

/* Per-network-namespace init/exit hooks; registered by xsk_init(). */
static struct pernet_operations xsk_net_ops = {
        .init = xsk_net_init,
        .exit = xsk_net_exit,
};

/*
 * xsk_init - register the AF_XDP socket family.
 *
 * Registers, in order: the protocol, the socket family, the per-netns
 * operations and the netdevice notifier. If a step fails, the steps that
 * already succeeded are undone in reverse order.
 *
 * Return: 0 on success or the error of the step that failed.
 */
static int __init xsk_init(void)
{
        int err = proto_register(&xsk_proto, 0 /* no slab */);

        if (err)
                return err;

        err = sock_register(&xsk_family_ops);
        if (err)
                goto undo_proto;

        err = register_pernet_subsys(&xsk_net_ops);
        if (err)
                goto undo_sock;

        err = register_netdevice_notifier(&xsk_netdev_notifier);
        if (!err)
                return 0;

        /* Notifier registration failed: unwind everything done so far. */
        unregister_pernet_subsys(&xsk_net_ops);
undo_sock:
        sock_unregister(PF_XDP);
undo_proto:
        proto_unregister(&xsk_proto);
        return err;
}

fs_initcall(xsk_init);










































































































































































































  171 



  170 












  249 
















  249 





























































































































































































































  320 



  160 




  153 


   65 



  153 
   65 





































































































































  196 





































































































































    2 



    2 



































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Integrity Measurement Architecture
 *
 * Copyright (C) 2005,2006,2007,2008 IBM Corporation
 *
 * Authors:
 * Reiner Sailer <sailer@watson.ibm.com>
 * Serge Hallyn <serue@us.ibm.com>
 * Kylene Hall <kylene@us.ibm.com>
 * Mimi Zohar <zohar@us.ibm.com>
 *
 * File: ima_main.c
 *        implements the IMA hooks: ima_bprm_check, ima_file_mmap,
 *        and ima_file_check.
 */

#include <linux/module.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/kernel_read_file.h>
#include <linux/mount.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include <linux/ima.h>
#include <linux/fs.h>
#include <linux/iversion.h>
#include <linux/evm.h>

#include "ima.h"

/*
 * Appraisal mode: starts as IMA_APPRAISE_ENFORCE when CONFIG_IMA_APPRAISE is
 * enabled, otherwise it is zero-initialized.
 */
#ifdef CONFIG_IMA_APPRAISE
int ima_appraise = IMA_APPRAISE_ENFORCE;
#else
int ima_appraise;
#endif

/*
 * Hash algorithm currently in use; defaults to SHA1 and can be overridden
 * by the "ima_hash=" boot parameter (see hash_setup()).
 */
int __ro_after_init ima_hash_algo = HASH_ALGO_SHA1;
/* Set once hash_setup() has accepted a value; later calls are then ignored. */
static int hash_setup_done;

/*
 * Notifier block whose callback is ima_lsm_policy_change(). Where it is
 * registered is not visible in this part of the file.
 */
static struct notifier_block ima_lsm_policy_notifier = {
        .notifier_call = ima_lsm_policy_change,
};

/*
 * hash_setup - parse the "ima_hash=" boot parameter.
 * @str: value given on the command line
 *
 * Only the first accepted value takes effect; once hash_setup_done is set,
 * later invocations are ignored. With the IMA_TEMPLATE_IMA_NAME template
 * only values starting with "sha1" or "md5" are accepted; with any other
 * template the value is looked up in hash_algo_name[]. An invalid value is
 * logged and leaves ima_hash_algo unchanged.
 *
 * Return: always 1.
 */
static int __init hash_setup(char *str)
{
        struct ima_template_desc *template_desc = ima_template_desc_current();
        int algo;

        if (hash_setup_done)
                return 1;

        if (strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) {
                /* Any other template: accept every known algorithm name. */
                algo = match_string(hash_algo_name, HASH_ALGO__LAST, str);
                if (algo < 0) {
                        pr_err("invalid hash algorithm \"%s\"", str);
                        return 1;
                }
        } else if (strncmp(str, "sha1", 4) == 0) {
                algo = HASH_ALGO_SHA1;
        } else if (strncmp(str, "md5", 3) == 0) {
                algo = HASH_ALGO_MD5;
        } else {
                pr_err("invalid hash algorithm \"%s\" for template \"%s\"",
                        str, IMA_TEMPLATE_IMA_NAME);
                return 1;
        }

        ima_hash_algo = algo;
        hash_setup_done = 1;
        return 1;
}
__setup("ima_hash=", hash_setup);

/* Return the current value of ima_hash_algo as an enum hash_algo. */
enum hash_algo ima_get_current_hash_algo(void)
{
        return ima_hash_algo;
}

/* Prevent mmap'ing a file execute that is already mmap'ed write */
static int mmap_violation_check(enum ima_hooks func, struct file *file,
                                char **pathbuf, const char **pathname,
                                char *filename)
{
        struct inode *inode;
        int rc = 0;

        if ((func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) &&
            mapping_writably_mapped(file->f_mapping)) {
                rc = -ETXTBSY;
                inode = file_inode(file);

                if (!*pathbuf)        /* ima_rdwr_violation possibly pre-fetched */
                        *pathname = ima_d_path(&file->f_path, pathbuf,
                                               filename);
                integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, *pathname,
                                    "mmap_file", "mmapped_writers", rc, 0);
        }
        return rc;
}

/*
 * ima_rdwr_violation_check
 *
 * Only invalidate the PCR for measured files:
 *        - Opening a file for write when already open for read,
 *          results in a time of measure, time of use (ToMToU) error.
 *        - Opening a file for read when already open for write,
 *          could result in a file measurement error.
 *
 */
static void ima_rdwr_violation_check(struct file *file,
                                     struct ima_iint_cache *iint,
                                     int must_measure,
                                     char **pathbuf,
                                     const char **pathname,
                                     char *filename)
{
        struct inode *inode = file_inode(file);
        fmode_t mode = file->f_mode;
        bool send_tomtou = false, send_writers = false;

        if (mode & FMODE_WRITE) {
                if (atomic_read(&inode->i_readcount) && IS_IMA(inode)) {
                        if (!iint)
                                iint = ima_iint_find(inode);

                        /* IMA_MEASURE is set from reader side */
                        if (iint && test_and_clear_bit(IMA_MAY_EMIT_TOMTOU,
                                                       &iint->atomic_flags))
                                send_tomtou = true;
                }
        } else {
                if (must_measure)
                        set_bit(IMA_MAY_EMIT_TOMTOU, &iint->atomic_flags);

                /* Limit number of open_writers violations */
                if (inode_is_open_for_write(inode) && must_measure) {
                        if (!test_and_set_bit(IMA_EMITTED_OPENWRITERS,
                                              &iint->atomic_flags))
                                send_writers = true;
                }
        }

        if (!send_tomtou && !send_writers)
                return;

        *pathname = ima_d_path(&file->f_path, pathbuf, filename);

        if (send_tomtou)
                ima_add_violation(file, *pathname, iint,
                                  "invalid_pcr", "ToMToU");
        if (send_writers)
                ima_add_violation(file, *pathname, iint,
                                  "invalid_pcr", "open_writers");
}

/*
 * ima_check_last_writer - reset cached state when a writer closes a file.
 * @iint:  integrity cache entry of @inode
 * @inode: inode of the file being released
 * @file:  file being released
 *
 * Does nothing for files not opened for write. Otherwise, under
 * iint->mutex and only when i_writecount is exactly 1, the open-writers
 * and update-xattr bits are cleared. If the file is new, its change cookie
 * cannot be obtained, or the cookie differs from the cached version, the
 * cached "done" flags and measured PCRs are reset and, when an xattr
 * update was pending, ima_update_xattr() is called.
 */
static void ima_check_last_writer(struct ima_iint_cache *iint,
                                  struct inode *inode, struct file *file)
{
        struct kstat stat;
        bool update, stale;

        if (!(file->f_mode & FMODE_WRITE))
                return;

        mutex_lock(&iint->mutex);
        if (atomic_read(&inode->i_writecount) != 1)
                goto unlock;

        clear_bit(IMA_EMITTED_OPENWRITERS, &iint->atomic_flags);
        update = test_and_clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);

        /* Evaluated left to right; stat is only read after a successful
         * vfs_getattr_nosec().
         */
        stale = (iint->flags & IMA_NEW_FILE) ||
                vfs_getattr_nosec(&file->f_path, &stat,
                                  STATX_CHANGE_COOKIE,
                                  AT_STATX_SYNC_AS_STAT) ||
                !(stat.result_mask & STATX_CHANGE_COOKIE) ||
                stat.change_cookie != iint->real_inode.version;
        if (stale) {
                iint->flags &= ~(IMA_DONE_MASK | IMA_NEW_FILE);
                iint->measured_pcrs = 0;
                if (update)
                        ima_update_xattr(iint, file);
        }

unlock:
        mutex_unlock(&iint->mutex);
}

/**
 * ima_file_free - called on __fput()
 * @file: pointer to file structure being freed
 *
 * Flag files that changed, based on i_version. Nothing is done when no IMA
 * policy is active, when the file is not a regular file, or when the inode
 * has no integrity cache entry.
 */
static void ima_file_free(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct ima_iint_cache *iint;

        if (ima_policy_flag && S_ISREG(inode->i_mode)) {
                iint = ima_iint_find(inode);
                if (iint)
                        ima_check_last_writer(iint, inode, file);
        }
}

/*
 * process_measurement - measure, appraise and/or audit a file per policy
 * @file: file being accessed
 * @cred: credentials the policy lookup is made against
 * @prop: LSM properties the policy lookup is made against
 * @buf: in-memory file contents, or NULL to have the file read
 * @size: size of @buf
 * @mask: requested access (MAY_READ, MAY_WRITE, MAY_EXEC, MAY_APPEND)
 * @func: IMA hook on whose behalf the call is made
 *
 * Only regular files are processed, and only while an IMA policy is active.
 *
 * Return: 0, or -EACCES when an appraise rule matched, processing ended with
 * an error and IMA-appraisal is in enforcing mode.
 */
static int process_measurement(struct file *file, const struct cred *cred,
                               struct lsm_prop *prop, char *buf, loff_t size,
                               int mask, enum ima_hooks func)
{
        struct inode *real_inode, *inode = file_inode(file);
        struct ima_iint_cache *iint = NULL;
        struct ima_template_desc *template_desc = NULL;
        struct inode *metadata_inode;
        char *pathbuf = NULL;
        char filename[NAME_MAX];
        const char *pathname = NULL;
        int rc = 0, action, must_appraise = 0;
        int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
        struct evm_ima_xattr_data *xattr_value = NULL;
        struct modsig *modsig = NULL;
        int xattr_len = 0;
        bool violation_check;
        enum hash_algo hash_algo;
        unsigned int allowed_algos = 0;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return 0;

        /* Return an IMA_MEASURE, IMA_APPRAISE, IMA_AUDIT action
         * bitmask based on the appraise/audit/measurement policy.
         * Included is the appraise submask.
         */
        action = ima_get_action(file_mnt_idmap(file), inode, cred, prop,
                                mask, func, &pcr, &template_desc, NULL,
                                &allowed_algos);
        violation_check = ((func == FILE_CHECK || func == MMAP_CHECK ||
                            func == MMAP_CHECK_REQPROT) &&
                           (ima_policy_flag & IMA_MEASURE) &&
                           ((action & IMA_MEASURE) ||
                            (file->f_mode & FMODE_WRITE)));
        if (!action && !violation_check)
                return 0;

        must_appraise = action & IMA_APPRAISE;

        /*  Is the appraise rule hook specific?  */
        if (action & IMA_FILE_APPRAISE)
                func = FILE_CHECK;

        inode_lock(inode);

        if (action) {
                iint = ima_inode_get(inode);
                if (!iint)
                        rc = -ENOMEM;
        }

        if (!rc && violation_check)
                ima_rdwr_violation_check(file, iint, action & IMA_MEASURE,
                                         &pathbuf, &pathname, filename);

        inode_unlock(inode);

        if (rc)
                goto out;
        if (!action)
                goto out;

        mutex_lock(&iint->mutex);

        if (test_and_clear_bit(IMA_CHANGE_ATTR, &iint->atomic_flags))
                /*
                 * Reset appraisal flags (action and non-action rule-specific)
                 * if ima_inode_post_setattr was called.
                 */
                iint->flags &= ~(IMA_APPRAISE | IMA_APPRAISED |
                                 IMA_APPRAISE_SUBMASK | IMA_APPRAISED_SUBMASK |
                                 IMA_NONACTION_RULE_FLAGS);

        /*
         * Re-evaluate the file if either the xattr has changed or the
         * kernel has no way of detecting file change on the filesystem.
         * (Limited to privileged mounted filesystems.)
         */
        if (test_and_clear_bit(IMA_CHANGE_XATTR, &iint->atomic_flags) ||
            ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) &&
             !(inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) &&
             !(action & IMA_FAIL_UNVERIFIABLE_SIGS))) {
                iint->flags &= ~IMA_DONE_MASK;
                iint->measured_pcrs = 0;
        }

        /*
         * On stacked filesystems, detect and re-evaluate file data and
         * metadata changes.
         */
        real_inode = d_real_inode(file_dentry(file));
        if (real_inode != inode &&
            (action & IMA_DO_MASK) && (iint->flags & IMA_DONE_MASK)) {
                if (!IS_I_VERSION(real_inode) ||
                    integrity_inode_attrs_changed(&iint->real_inode,
                                                  real_inode)) {
                        iint->flags &= ~IMA_DONE_MASK;
                        iint->measured_pcrs = 0;
                }

                /*
                 * Reset the EVM status when metadata changed.
                 */
                metadata_inode = d_inode(d_real(file_dentry(file),
                                         D_REAL_METADATA));
                if (evm_metadata_changed(inode, metadata_inode))
                        iint->flags &= ~(IMA_APPRAISED |
                                         IMA_APPRAISED_SUBMASK);
        }

        /* Determine if already appraised/measured based on bitmask
         * (IMA_MEASURE, IMA_MEASURED, IMA_XXXX_APPRAISE, IMA_XXXX_APPRAISED,
         *  IMA_AUDIT, IMA_AUDITED)
         */
        iint->flags |= action;
        action &= IMA_DO_MASK;
        action &= ~((iint->flags & (IMA_DONE_MASK ^ IMA_MEASURED)) >> 1);

        /* If target pcr is already measured, unset IMA_MEASURE action */
        if ((action & IMA_MEASURE) && (iint->measured_pcrs & (0x1 << pcr)))
                action ^= IMA_MEASURE;

        /* HASH sets the digital signature and update flags, nothing else */
        if ((action & IMA_HASH) &&
            !(test_bit(IMA_DIGSIG, &iint->atomic_flags))) {
                xattr_len = ima_read_xattr(file_dentry(file),
                                           &xattr_value, xattr_len);
                if ((xattr_value && xattr_len > 2) &&
                    (xattr_value->type == EVM_IMA_XATTR_DIGSIG))
                        set_bit(IMA_DIGSIG, &iint->atomic_flags);
                iint->flags |= IMA_HASHED;
                action ^= IMA_HASH;
                set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        }

        /* Nothing to do, just return existing appraised status */
        if (!action) {
                if (must_appraise) {
                        rc = mmap_violation_check(func, file, &pathbuf,
                                                  &pathname, filename);
                        if (!rc)
                                rc = ima_get_cache_status(iint, func);
                }
                goto out_locked;
        }

        if ((action & IMA_APPRAISE_SUBMASK) ||
            strcmp(template_desc->name, IMA_TEMPLATE_IMA_NAME) != 0) {
                /* read 'security.ima' */
                xattr_len = ima_read_xattr(file_dentry(file),
                                           &xattr_value, xattr_len);

                /*
                 * Read the appended modsig if allowed by the policy, and allow
                 * an additional measurement list entry, if needed, based on the
                 * template format and whether the file was already measured.
                 */
                if (iint->flags & IMA_MODSIG_ALLOWED) {
                        rc = ima_read_modsig(func, buf, size, &modsig);

                        if (!rc && ima_template_has_modsig(template_desc) &&
                            iint->flags & IMA_MEASURED)
                                action |= IMA_MEASURE;
                }
        }

        hash_algo = ima_get_hash_algo(xattr_value, xattr_len);

        /* -EBADF and -EINVAL from the collection step are tolerated here. */
        rc = ima_collect_measurement(iint, file, buf, size, hash_algo, modsig);
        if (rc != 0 && rc != -EBADF && rc != -EINVAL)
                goto out_locked;

        if (!pathbuf)        /* ima_rdwr_violation possibly pre-fetched */
                pathname = ima_d_path(&file->f_path, &pathbuf, filename);

        if (action & IMA_MEASURE)
                ima_store_measurement(iint, file, pathname,
                                      xattr_value, xattr_len, modsig, pcr,
                                      template_desc);
        if (rc == 0 && (action & IMA_APPRAISE_SUBMASK)) {
                rc = ima_check_blacklist(iint, modsig, pcr);
                if (rc != -EPERM) {
                        inode_lock(inode);
                        rc = ima_appraise_measurement(func, iint, file,
                                                      pathname, xattr_value,
                                                      xattr_len, modsig);
                        inode_unlock(inode);
                }
                if (!rc)
                        rc = mmap_violation_check(func, file, &pathbuf,
                                                  &pathname, filename);
        }
        if (action & IMA_AUDIT)
                ima_audit_measurement(iint, pathname);

        if ((file->f_flags & O_DIRECT) && (iint->flags & IMA_PERMIT_DIRECTIO))
                rc = 0;

        /* Ensure the digest was generated using an allowed algorithm */
        if (rc == 0 && must_appraise && allowed_algos != 0 &&
            (allowed_algos & (1U << hash_algo)) == 0) {
                rc = -EACCES;

                integrity_audit_msg(AUDIT_INTEGRITY_DATA, file_inode(file),
                                    pathname, "collect_data",
                                    "denied-hash-algorithm", rc, 0);
        }
out_locked:
        if ((mask & MAY_WRITE) && test_bit(IMA_DIGSIG, &iint->atomic_flags) &&
             !(iint->flags & IMA_NEW_FILE))
                rc = -EACCES;
        mutex_unlock(&iint->mutex);
        kfree(xattr_value);
        ima_free_modsig(modsig);
out:
        if (pathbuf)
                __putname(pathbuf);
        if (must_appraise) {
                if (rc && (ima_appraise & IMA_APPRAISE_ENFORCE))
                        return -EACCES;
                /*
                 * iint is NULL when ima_inode_get() failed above; outside of
                 * enforcing mode that failure reaches this point, so the
                 * cache entry must be checked before it is dereferenced.
                 */
                if (iint && (file->f_mode & FMODE_WRITE))
                        set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        }
        return 0;
}

/**
 * ima_file_mmap - based on policy, collect/store measurement.
 * @file: pointer to the file to be measured (May be NULL)
 * @reqprot: protection requested by the application
 * @prot: protection that will be applied by the kernel
 * @flags: operational flags
 *
 * When the requested protection includes PROT_EXEC, the file is processed
 * under the MMAP_CHECK_REQPROT hook; when the applied protection includes
 * PROT_EXEC it is processed under the MMAP_CHECK hook.  The second step is
 * skipped if the first one fails.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_file_mmap(struct file *file, unsigned long reqprot,
                         unsigned long prot, unsigned long flags)
{
        struct lsm_prop prop;
        int ret = 0;

        /* Mappings without a backing file have nothing to measure. */
        if (!file)
                return 0;

        security_current_getlsmprop_subj(&prop);

        if (reqprot & PROT_EXEC)
                ret = process_measurement(file, current_cred(), &prop, NULL,
                                          0, MAY_EXEC, MMAP_CHECK_REQPROT);

        if (!ret && (prot & PROT_EXEC))
                ret = process_measurement(file, current_cred(), &prop, NULL,
                                          0, MAY_EXEC, MMAP_CHECK);

        return ret;
}

/**
 * ima_file_mprotect - based on policy, limit mprotect change
 * @vma: vm_area_struct protection is set to
 * @reqprot: protection requested by the application
 * @prot: protection that will be applied by the kernel
 *
 * Files can be mmap'ed read/write and later changed to execute to circumvent
 * IMA's mmap appraisal policy rules.  Due to locking issues (mmap semaphore
 * would be taken before i_mutex), files can not be measured or appraised at
 * this point.  Eliminate this integrity gap by denying the mprotect
 * PROT_EXECUTE change, if an mmap appraise policy rule exists.
 *
 * A file that is in the measurement or appraisal policy always gets an
 * audit record; the change is only denied when an appraise rule matches.
 *
 * On mprotect change success, return 0.  On failure, return -EPERM.
 */
static int ima_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                             unsigned long prot)
{
        struct ima_template_desc *template = NULL;
        struct file *file = vma->vm_file;
        char filename[NAME_MAX];
        char *pathbuf = NULL;
        const char *pathname;
        struct inode *inode;
        struct lsm_prop prop;
        int action, result;
        int pcr;

        /* Is mprotect making an mmap'ed file executable? */
        if (!(ima_policy_flag & IMA_APPRAISE) || !file)
                return 0;
        if (!(prot & PROT_EXEC) || (vma->vm_flags & VM_EXEC))
                return 0;

        security_current_getlsmprop_subj(&prop);
        inode = file_inode(file);

        /* Combine the policy decisions of both mmap hooks, in this order. */
        action = ima_get_action(file_mnt_idmap(file), inode,
                                current_cred(), &prop, MAY_EXEC, MMAP_CHECK,
                                &pcr, &template, NULL, NULL);
        action |= ima_get_action(file_mnt_idmap(file), inode,
                                 current_cred(), &prop, MAY_EXEC,
                                 MMAP_CHECK_REQPROT, &pcr, &template, NULL,
                                 NULL);

        /* Is the mmap'ed file in policy? */
        if (!(action & (IMA_MEASURE | IMA_APPRAISE_SUBMASK)))
                return 0;

        result = (action & IMA_APPRAISE_SUBMASK) ? -EPERM : 0;

        pathname = ima_d_path(&file->f_path, &pathbuf, filename);
        integrity_audit_msg(AUDIT_INTEGRITY_DATA, inode, pathname,
                            "collect_data", "failed-mprotect", result, 0);
        if (pathbuf)
                __putname(pathbuf);

        return result;
}

/**
 * ima_bprm_check - based on policy, collect/store measurement.
 * @bprm: contains the linux_binprm structure
 *
 * The OS protects against an executable file, already open for write,
 * from being executed in deny_write_access() and an executable file,
 * already open for execute, from being modified in get_write_access().
 * So we can be certain that what we verify and measure here is actually
 * what is being executed.
 *
 * The file is processed twice: under BPRM_CHECK with the current task's
 * credentials, then under CREDS_CHECK with the credentials in @bprm.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_bprm_check(struct linux_binprm *bprm)
{
        struct lsm_prop prop;
        int rc;

        /* First pass: the credentials of the calling task. */
        security_current_getlsmprop_subj(&prop);
        rc = process_measurement(bprm->file, current_cred(), &prop, NULL, 0,
                                 MAY_EXEC, BPRM_CHECK);
        if (rc != 0)
                return rc;

        /* Second pass: the credentials carried in bprm->cred. */
        security_cred_getlsmprop(bprm->cred, &prop);
        rc = process_measurement(bprm->file, bprm->cred, &prop, NULL, 0,
                                 MAY_EXEC, CREDS_CHECK);
        return rc;
}

/**
 * ima_bprm_creds_for_exec - collect/store/appraise measurement.
 * @bprm: contains the linux_binprm structure
 *
 * Based on the IMA policy and the execveat(2) AT_EXECVE_CHECK flag, measure
 * and appraise the integrity of a file to be executed by script interpreters.
 * Unlike any of the other LSM hooks where the kernel enforces file integrity,
 * enforcing file integrity is left up to the discretion of the script
 * interpreter (userspace).
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        /*
         * As security_bprm_check() is called multiple times, both
         * the script and the shebang interpreter are measured, appraised,
         * and audited. Limit usage of this LSM hook to just measuring,
         * appraising, and auditing the indirect script execution
         * (e.g. ./sh example.sh).
         */
        return bprm->is_check ? ima_bprm_check(bprm) : 0;
}

/**
 * ima_file_check - based on policy, collect/store measurement.
 * @file: pointer to the file to be measured
 * @mask: contains MAY_READ, MAY_WRITE, MAY_EXEC or MAY_APPEND
 *
 * Measure files based on the ima_must_measure() policy decision.  Only the
 * four access bits listed above are forwarded from @mask.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_file_check(struct file *file, int mask)
{
        const int access = mask & (MAY_READ | MAY_WRITE | MAY_EXEC |
                                   MAY_APPEND);
        struct lsm_prop prop;

        security_current_getlsmprop_subj(&prop);

        return process_measurement(file, current_cred(), &prop, NULL, 0,
                                   access, FILE_CHECK);
}

/*
 * __ima_inode_hash - common helper for ima_file_hash() and ima_inode_hash()
 * @inode: inode whose hash is requested
 * @file: open file to hash when no collected hash is cached, or NULL
 * @buf: destination for the digest, may be NULL
 * @buf_size: size of @buf
 *
 * Uses the cached hash when the inode has a cache entry with a collected
 * measurement.  Otherwise, if @file is given, the hash is computed into a
 * temporary on-stack cache entry that is released before returning.
 *
 * Return: the hash algorithm on success, -EOPNOTSUPP if no hash is available.
 */
static int __ima_inode_hash(struct inode *inode, struct file *file, char *buf,
                            size_t buf_size)
{
        struct ima_iint_cache *iint = NULL, tmp_iint;
        int err, algo;

        if (ima_policy_flag) {
                iint = ima_iint_find(inode);
                if (iint)
                        mutex_lock(&iint->mutex);
        }

        /* No usable cached hash: compute one from the file, if we have it. */
        if (file && (!iint || !(iint->flags & IMA_COLLECTED))) {
                if (iint)
                        mutex_unlock(&iint->mutex);

                memset(&tmp_iint, 0, sizeof(tmp_iint));
                mutex_init(&tmp_iint.mutex);

                err = ima_collect_measurement(&tmp_iint, file, NULL, 0,
                                              ima_hash_algo, NULL);
                if (err < 0) {
                        /* ima_hash could be allocated in case of failure. */
                        if (err != -ENOMEM)
                                kfree(tmp_iint.ima_hash);

                        return -EOPNOTSUPP;
                }

                iint = &tmp_iint;
                mutex_lock(&iint->mutex);
        }

        if (!iint)
                return -EOPNOTSUPP;

        /*
         * ima_file_hash can be called when ima_collect_measurement has still
         * not been called, we might not always have a hash.
         */
        if (!iint->ima_hash || !(iint->flags & IMA_COLLECTED)) {
                mutex_unlock(&iint->mutex);
                return -EOPNOTSUPP;
        }

        /* Copy at most buf_size bytes of the digest. */
        if (buf)
                memcpy(buf, iint->ima_hash->digest,
                       min_t(size_t, iint->ima_hash->length, buf_size));

        algo = iint->ima_hash->algo;
        mutex_unlock(&iint->mutex);

        /* The temporary entry owns its hash; the cached one does not. */
        if (iint == &tmp_iint)
                kfree(tmp_iint.ima_hash);

        return algo;
}

/**
 * ima_file_hash - return a measurement of the file
 * @file: pointer to the file
 * @buf: buffer in which to store the hash
 * @buf_size: length of the buffer
 *
 * On success, return the hash algorithm (as defined in the enum hash_algo).
 * If buf is not NULL, this function also outputs the hash into buf.
 * If the hash is larger than buf_size, then only buf_size bytes will be copied.
 * It generally just makes sense to pass a buffer capable of holding the largest
 * possible hash: IMA_MAX_DIGEST_SIZE.
 * The file hash returned is based on the entire file, including the appended
 * signature.
 *
 * If the measurement cannot be performed, return -EOPNOTSUPP.
 * If the parameters are incorrect, return -EINVAL.
 */
int ima_file_hash(struct file *file, char *buf, size_t buf_size)
{
        /* A NULL file is the only parameter error detected here. */
        return file ? __ima_inode_hash(file_inode(file), file, buf, buf_size)
                    : -EINVAL;
}
EXPORT_SYMBOL_GPL(ima_file_hash);

/**
 * ima_inode_hash - return the stored measurement if the inode has been hashed
 * and is in the iint cache.
 * @inode: pointer to the inode
 * @buf: buffer in which to store the hash
 * @buf_size: length of the buffer
 *
 * On success, return the hash algorithm (as defined in the enum hash_algo).
 * If buf is not NULL, this function also outputs the hash into buf.
 * If the hash is larger than buf_size, then only buf_size bytes will be copied.
 * It generally just makes sense to pass a buffer capable of holding the largest
 * possible hash: IMA_MAX_DIGEST_SIZE.
 * The hash returned is based on the entire contents, including the appended
 * signature.
 *
 * If IMA is disabled or if no measurement is available, return -EOPNOTSUPP.
 * If the parameters are incorrect, return -EINVAL.
 */
int ima_inode_hash(struct inode *inode, char *buf, size_t buf_size)
{
        /* No file is passed down, so only a cached hash can be returned. */
        return inode ? __ima_inode_hash(inode, NULL, buf, buf_size) : -EINVAL;
}
EXPORT_SYMBOL_GPL(ima_inode_hash);

/**
 * ima_post_create_tmpfile - mark newly created tmpfile as new
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode of the newly created tmpfile
 *
 * No measuring, appraising or auditing of newly created tmpfiles is needed.
 * Skip calling process_measurement(), but indicate which newly created
 * tmpfiles are in policy.
 */
static void ima_post_create_tmpfile(struct mnt_idmap *idmap,
                                    struct inode *inode)
{
        struct ima_iint_cache *iint;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return;

        /* Only tmpfiles covered by an appraise rule are tracked. */
        if (!ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK))
                return;

        /* Nothing to do if we can't allocate memory */
        iint = ima_inode_get(inode);
        if (!iint)
                return;

        /* needed for writing the security xattrs */
        set_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
        iint->ima_file_status = INTEGRITY_PASS;
}

/**
 * ima_post_path_mknod - mark as a new inode
 * @idmap: idmap of the mount the inode was found from
 * @dentry: newly created dentry
 *
 * Mark files created via the mknodat syscall as new, so that the
 * file data can be written later.
 */
static void ima_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;
        struct ima_iint_cache *iint;

        if (!ima_policy_flag || !S_ISREG(inode->i_mode))
                return;

        /* Only files covered by an appraise rule are tracked. */
        if (!ima_must_appraise(idmap, inode, MAY_ACCESS, FILE_CHECK))
                return;

        /* Nothing to do if we can't allocate memory */
        iint = ima_inode_get(inode);
        if (iint)
                /* needed for re-opening empty files */
                iint->flags |= IMA_NEW_FILE;
}

/**
 * ima_read_file - pre-measure/appraise hook decision based on policy
 * @file: pointer to the file to be measured/appraised/audit
 * @read_id: caller identifier
 * @contents: whether a subsequent call will be made to ima_post_read_file()
 *
 * Permit reading a file based on policy. The policy rules are written
 * in terms of the policy identifier.  Appraising the integrity of
 * a file requires a file descriptor.
 *
 * For permission return 0, otherwise return -EACCES.
 */
static int ima_read_file(struct file *file, enum kernel_read_file_id read_id,
                         bool contents)
{
        struct lsm_prop prop;
        enum ima_hooks func;

        /*
         * Do devices using pre-allocated memory run the risk of the
         * firmware being accessible to the device prior to the completion
         * of IMA's signature verification any more than when using two
         * buffers? It may be desirable to include the buffer address
         * in this API and walk all the dma_map_single() mappings to check.
         */

        /*
         * There will be a call made to ima_post_read_file() with
         * a filled buffer, so we don't need to perform an extra
         * read early here.
         */
        if (contents)
                return 0;

        /*
         * Read entire file for all partial reads.  Read ids without an
         * entry in read_idmap are handled as FILE_CHECK.
         */
        func = read_idmap[read_id];
        if (!func)
                func = FILE_CHECK;

        security_current_getlsmprop_subj(&prop);
        return process_measurement(file, current_cred(), &prop, NULL, 0,
                                   MAY_READ, func);
}

/*
 * Map kernel_read_file ids to IMA hook ids.  Entries not listed here are
 * zero-initialized; users of the table substitute FILE_CHECK for a zero
 * entry.
 */
const int read_idmap[READING_MAX_ID] = {
        [READING_FIRMWARE] = FIRMWARE_CHECK,
        [READING_MODULE] = MODULE_CHECK,
        [READING_KEXEC_IMAGE] = KEXEC_KERNEL_CHECK,
        [READING_KEXEC_INITRAMFS] = KEXEC_INITRAMFS_CHECK,
        [READING_POLICY] = POLICY_CHECK
};

/**
 * ima_post_read_file - in memory collect/appraise/audit measurement
 * @file: pointer to the file to be measured/appraised/audit
 * @buf: pointer to in memory file contents
 * @size: size of in memory file contents
 * @read_id: caller identifier
 *
 * Measure/appraise/audit in memory file based on policy.  Policy rules
 * are written in terms of a policy identifier.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_post_read_file(struct file *file, char *buf, loff_t size,
                              enum kernel_read_file_id read_id)
{
        struct lsm_prop prop;
        enum ima_hooks func;

        /* permit signed certs */
        if (!file && read_id == READING_X509_CERTIFICATE)
                return 0;

        /* should never happen: refuse in enforcing mode, allow otherwise */
        if (!file || !buf || size == 0)
                return (ima_appraise & IMA_APPRAISE_ENFORCE) ? -EACCES : 0;

        /* Read ids without an entry in read_idmap are handled as FILE_CHECK. */
        func = read_idmap[read_id];
        if (!func)
                func = FILE_CHECK;

        security_current_getlsmprop_subj(&prop);
        return process_measurement(file, current_cred(), &prop, buf, size,
                                   MAY_READ, func);
}

/**
 * ima_load_data - appraise decision based on policy
 * @id: kernel load data caller identifier
 * @contents: whether the full contents will be available in a later
 *              call to ima_post_load_data().
 *
 * Callers of this LSM hook can not measure, appraise, or audit the
 * data provided by userspace.  Enforce policy rules requiring a file
 * signature (eg. kexec'ed kernel image).
 *
 * For permission return 0, otherwise return -EACCES.
 */
static int ima_load_data(enum kernel_load_data_id id, bool contents)
{
        const bool ima_enforce =
                (ima_appraise & IMA_APPRAISE_ENFORCE) == IMA_APPRAISE_ENFORCE;

        if (id == LOADING_KEXEC_IMAGE) {
                /*
                 * Denied either because kexec signatures are configured and
                 * secure boot is reported, or because kexec appraisal is
                 * being enforced (INTEGRITY_UNKNOWN).
                 */
                if ((IS_ENABLED(CONFIG_KEXEC_SIG) &&
                     arch_ima_get_secureboot()) ||
                    (ima_enforce && (ima_appraise & IMA_APPRAISE_KEXEC))) {
                        pr_err("impossible to appraise a kernel image without a file descriptor; try using kexec_file_load syscall.\n");
                        return -EACCES;
                }
        } else if (id == LOADING_FIRMWARE) {
                if (ima_enforce && (ima_appraise & IMA_APPRAISE_FIRMWARE) &&
                    !contents) {
                        pr_err("Prevent firmware sysfs fallback loading.\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        } else if (id == LOADING_MODULE) {
                bool sig_enforce = is_module_sig_enforced();

                if (ima_enforce && !sig_enforce &&
                    (ima_appraise & IMA_APPRAISE_MODULES)) {
                        pr_err("impossible to appraise a module without a file descriptor. sig_enforce kernel parameter might help\n");
                        return -EACCES;        /* INTEGRITY_UNKNOWN */
                }
        }

        /* Every other load id is permitted. */
        return 0;
}

/**
 * ima_post_load_data - appraise decision based on policy
 * @buf: pointer to in memory file contents
 * @size: size of in memory file contents
 * @load_id: kernel load data caller identifier
 * @description: @load_id-specific description of contents
 *
 * Firmware buffers are refused when firmware appraisal is being enforced.
 * Module buffers are measured as critical data.  All other load ids are
 * permitted without further processing.
 *
 * On success return 0.  On integrity appraisal error, assuming the file
 * is in policy and IMA-appraisal is in enforcing mode, return -EACCES.
 */
static int ima_post_load_data(char *buf, loff_t size,
                              enum kernel_load_data_id load_id,
                              char *description)
{
        if (load_id == LOADING_FIRMWARE) {
                const bool deny = (ima_appraise & IMA_APPRAISE_FIRMWARE) &&
                                  (ima_appraise & IMA_APPRAISE_ENFORCE);

                if (!deny)
                        return 0;

                pr_err("Prevent firmware loading_store.\n");
                return -EACCES; /* INTEGRITY_UNKNOWN */
        }

        /*
         * Measure the init_module syscall buffer containing the ELF image.
         * The result of the measurement is not propagated to the caller.
         */
        if (load_id == LOADING_MODULE)
                ima_measure_critical_data("modules", "init_module",
                                          buf, size, true, NULL, 0);

        return 0;
}

/**
 * process_buffer_measurement - Measure the buffer or the buffer data hash
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode associated with the object being measured (NULL for KEY_CHECK)
 * @buf: pointer to the buffer that needs to be added to the log.
 * @size: size of buffer(in bytes).
 * @eventname: event name to be used for the buffer entry.
 * @func: IMA hook; when zero, no policy lookup is made
 * @pcr: pcr to extend the measurement; zero selects CONFIG_IMA_MEASURE_PCR_IDX
 * @func_data: func specific data, may be NULL
 * @buf_hash: measure buffer data hash
 * @digest: buffer digest will be written to, may be NULL
 * @digest_len: size of @digest; must be at least the digest size of
 *              ima_hash_algo when @digest is given
 *
 * Based on policy, either the buffer data or buffer data hash is measured
 *
 * Return: 0 if the buffer has been successfully measured, 1 if the digest
 * has been written to the passed location but not added to a measurement entry,
 * a negative value otherwise (-EINVAL if @digest is too small or no buffer
 * template is available, -ENOENT if there is nothing to measure and no
 * @digest was requested).
 */
int process_buffer_measurement(struct mnt_idmap *idmap,
                               struct inode *inode, const void *buf, int size,
                               const char *eventname, enum ima_hooks func,
                               int pcr, const char *func_data,
                               bool buf_hash, u8 *digest, size_t digest_len)
{
        int ret = 0;
        const char *audit_cause = "ENOMEM";
        struct ima_template_entry *entry = NULL;
        struct ima_iint_cache iint = {};
        struct ima_event_data event_data = {.iint = &iint,
                                            .filename = eventname,
                                            .buf = buf,
                                            .buf_len = size};
        struct ima_template_desc *template;
        struct ima_max_digest_data hash;
        struct ima_digest_data *hash_hdr = container_of(&hash.hdr,
                                                struct ima_digest_data, hdr);
        char digest_hash[IMA_MAX_DIGEST_SIZE];
        int digest_hash_len = hash_digest_size[ima_hash_algo];
        int violation = 0;
        int action = 0;
        struct lsm_prop prop;

        /* The caller's digest buffer must be able to hold a full digest. */
        if (digest && digest_len < digest_hash_len)
                return -EINVAL;

        /* Without a policy, the only useful work is returning the digest. */
        if (!ima_policy_flag && !digest)
                return -ENOENT;

        template = ima_template_desc_buf();
        if (!template) {
                ret = -EINVAL;
                audit_cause = "ima_template_desc_buf";
                goto out;
        }

        /*
         * Both LSM hooks and auxiliary based buffer measurements are
         * based on policy. To avoid code duplication, differentiate
         * between the LSM hooks and auxiliary buffer measurements,
         * retrieving the policy rule information only for the LSM hook
         * buffer measurements.
         */
        if (func) {
                security_current_getlsmprop_subj(&prop);
                action = ima_get_action(idmap, inode, current_cred(),
                                        &prop, 0, func, &pcr, &template,
                                        func_data, NULL);
                if (!(action & IMA_MEASURE) && !digest)
                        return -ENOENT;
        }

        if (!pcr)
                pcr = CONFIG_IMA_MEASURE_PCR_IDX;

        /* Hash into the on-stack digest storage using the default algorithm. */
        iint.ima_hash = hash_hdr;
        iint.ima_hash->algo = ima_hash_algo;
        iint.ima_hash->length = hash_digest_size[ima_hash_algo];

        ret = ima_calc_buffer_hash(buf, size, iint.ima_hash);
        if (ret < 0) {
                audit_cause = "hashing_error";
                goto out;
        }

        if (buf_hash) {
                /*
                 * Measure the hash of the buffer instead of the buffer: the
                 * digest just computed becomes the event data and is hashed
                 * again.
                 */
                memcpy(digest_hash, hash_hdr->digest, digest_hash_len);

                ret = ima_calc_buffer_hash(digest_hash, digest_hash_len,
                                           iint.ima_hash);
                if (ret < 0) {
                        audit_cause = "hashing_error";
                        goto out;
                }

                event_data.buf = digest_hash;
                event_data.buf_len = digest_hash_len;
        }

        if (digest)
                memcpy(digest, iint.ima_hash->digest, digest_hash_len);

        /* Digest-only request: nothing is added to the measurement list. */
        if (!ima_policy_flag || (func && !(action & IMA_MEASURE)))
                return 1;

        ret = ima_alloc_init_template(&event_data, &entry, template);
        if (ret < 0) {
                audit_cause = "alloc_entry";
                goto out;
        }

        ret = ima_store_template(entry, violation, NULL, event_data.buf, pcr);
        if (ret < 0) {
                audit_cause = "store_entry";
                ima_free_template_entry(entry);
        }

out:
        /* Failures reaching this label are reported through the audit log. */
        if (ret < 0)
                integrity_audit_message(AUDIT_INTEGRITY_PCR, NULL, eventname,
                                        func_measure_str(func),
                                        audit_cause, ret, 0, ret);

        return ret;
}

/**
 * ima_kexec_cmdline - measure kexec cmdline boot args
 * @kernel_fd: file descriptor of the kexec kernel being loaded
 * @buf: pointer to buffer
 * @size: size of buffer
 *
 * Buffers can only be measured, not appraised.
 */
void ima_kexec_cmdline(int kernel_fd, const void *buf, int size)
{
        struct file *kernel_file;

        /* Without a non-empty buffer there is nothing to measure. */
        if (buf == NULL)
                return;
        if (size == 0)
                return;

        /* Scoped fd reference; bail out if kernel_fd does not resolve. */
        CLASS(fd, f)(kernel_fd);
        if (fd_empty(f))
                return;

        kernel_file = fd_file(f);
        process_buffer_measurement(file_mnt_idmap(kernel_file),
                                   file_inode(kernel_file),
                                   buf, size, "kexec-cmdline", KEXEC_CMDLINE, 0,
                                   NULL, false, NULL, 0);
}

/**
 * ima_measure_critical_data - measure kernel integrity critical data
 * @event_label: unique event label for grouping and limiting critical data
 * @event_name: event name for the record in the IMA measurement list
 * @buf: pointer to buffer data
 * @buf_len: length of buffer data (in bytes)
 * @hash: measure buffer data hash
 * @digest: buffer digest will be written to
 * @digest_len: buffer length
 *
 * Measure data critical to the integrity of the kernel into the IMA log
 * and extend the pcr.  Examples of critical data could be various data
 * structures, policies, and states stored in kernel memory that can
 * impact the integrity of the system.
 *
 * Return: 0 if the buffer has been successfully measured, 1 if the digest
 * has been written to the passed location but not added to a measurement entry,
 * a negative value otherwise.
 */
int ima_measure_critical_data(const char *event_label,
                              const char *event_name,
                              const void *buf, size_t buf_len,
                              bool hash, u8 *digest, size_t digest_len)
{
        /* A name, a label and a non-empty payload are all mandatory. */
        const bool have_names = event_name && event_label;
        const bool have_data = buf && buf_len;

        if (!have_names || !have_data)
                return -ENOPARAM;

        /* No inode is associated with critical data, hence the NULL. */
        return process_buffer_measurement(&nop_mnt_idmap, NULL,
                                          buf, buf_len,
                                          event_name, CRITICAL_DATA, 0,
                                          event_label,
                                          hash, digest, digest_len);
}
EXPORT_SYMBOL_GPL(ima_measure_critical_data);

#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS

/**
 * ima_kernel_module_request - Prevent crypto-pkcs1(rsa,*) requests
 * @kmod_name: kernel module name
 *
 * Avoid a verification loop where verifying the signature of the modprobe
 * binary requires executing modprobe itself. Since the modprobe iint->mutex
 * is already held when the signature verification is performed, a deadlock
 * occurs as soon as modprobe is executed within the critical region, since
 * the same lock cannot be taken again.
 *
 * This happens when public_key_verify_signature(), in case of RSA algorithm,
 * use alg_name to store internal information in order to construct an
 * algorithm on the fly, but crypto_larval_lookup() will try to use alg_name
 * in order to load a kernel module with same name.
 *
 * Since we don't have any real "crypto-pkcs1(rsa,*)" kernel modules,
 * we are safe to fail such module request from crypto_larval_lookup(), and
 * avoid the verification loop.
 *
 * Return: Zero if it is safe to load the kernel module, -EINVAL otherwise.
 */
static int ima_kernel_module_request(char *kmod_name)
{
        /* Requests whose name begins with this prefix are refused. */
        static const char blocked_prefix[] = "crypto-pkcs1(rsa,";

        /* sizeof() counts the trailing NUL, which is not part of the prefix. */
        if (strncmp(kmod_name, blocked_prefix, sizeof(blocked_prefix) - 1) != 0)
                return 0;

        return -EINVAL;
}

#endif /* CONFIG_INTEGRITY_ASYMMETRIC_KEYS */

/*
 * Late-initcall entry point: parse appraisal options, set up templates and
 * the hash algorithm, run ima_init(), then register the LSM policy notifier.
 * Returns 0 on success or the first non-zero error encountered.
 */
static int __init init_ima(void)
{
        int rc;

        ima_appraise_parse_cmdline();
        ima_init_template_list();
        hash_setup(CONFIG_IMA_DEFAULT_HASH);

        rc = ima_init();
        if (rc != 0) {
                const char *chosen = hash_algo_name[ima_hash_algo];

                /*
                 * If the failing attempt used something other than the
                 * built-in default, retry once with the default.
                 */
                if (strcmp(chosen, CONFIG_IMA_DEFAULT_HASH) != 0) {
                        pr_info("Allocating %s failed, going to use default hash algorithm %s\n",
                                chosen, CONFIG_IMA_DEFAULT_HASH);
                        /* Clear the flag so hash_setup() can be called again. */
                        hash_setup_done = 0;
                        hash_setup(CONFIG_IMA_DEFAULT_HASH);
                        rc = ima_init();
                }
        }

        if (rc != 0)
                return rc;

        rc = register_blocking_lsm_notifier(&ima_lsm_policy_notifier);
        if (rc != 0) {
                pr_warn("Couldn't register LSM notifier, error %d\n", rc);
                return rc;
        }

        ima_update_policy_flags();
        return 0;
}

/*
 * LSM hook callbacks provided by IMA.  The table is handed to
 * security_add_hooks() from init_ima_lsm().  Entries inside #ifdef are only
 * present when the named config option is enabled.
 */
static struct security_hook_list ima_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(bprm_check_security, ima_bprm_check),
        LSM_HOOK_INIT(bprm_creds_for_exec, ima_bprm_creds_for_exec),
        LSM_HOOK_INIT(file_post_open, ima_file_check),
        LSM_HOOK_INIT(inode_post_create_tmpfile, ima_post_create_tmpfile),
        LSM_HOOK_INIT(file_release, ima_file_free),
        LSM_HOOK_INIT(mmap_file, ima_file_mmap),
        LSM_HOOK_INIT(file_mprotect, ima_file_mprotect),
        LSM_HOOK_INIT(kernel_load_data, ima_load_data),
        LSM_HOOK_INIT(kernel_post_load_data, ima_post_load_data),
        LSM_HOOK_INIT(kernel_read_file, ima_read_file),
        LSM_HOOK_INIT(kernel_post_read_file, ima_post_read_file),
        LSM_HOOK_INIT(path_post_mknod, ima_post_path_mknod),
#ifdef CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS
        LSM_HOOK_INIT(key_post_create_or_update, ima_post_key_create_or_update),
#endif
#ifdef CONFIG_INTEGRITY_ASYMMETRIC_KEYS
        /* Refuses "crypto-pkcs1(rsa," module requests; see the handler. */
        LSM_HOOK_INIT(kernel_module_request, ima_kernel_module_request),
#endif
        LSM_HOOK_INIT(inode_free_security_rcu, ima_inode_free_rcu),
};

/*
 * LSM identity of IMA; passed to security_add_hooks() and
 * init_ima_appraise_lsm() from init_ima_lsm().
 */
static const struct lsm_id ima_lsmid = {
        .name = "ima",
        .id = LSM_ID_IMA,
};

/*
 * LSM init callback (referenced by the .init member of DEFINE_LSM(ima)).
 * Calls ima_iintcache_init(), registers ima_hooks under ima_lsmid, then
 * passes the same LSM id to init_ima_appraise_lsm().  Always returns 0.
 */
static int __init init_ima_lsm(void)
{
        ima_iintcache_init();
        security_add_hooks(ima_hooks, ARRAY_SIZE(ima_hooks), &ima_lsmid);
        init_ima_appraise_lsm(&ima_lsmid);
        return 0;
}

/*
 * Blob size request for IMA: lbs_inode is sized as a single pointer to
 * struct ima_iint_cache (the cache object itself is not embedded).
 */
struct lsm_blob_sizes ima_blob_sizes __ro_after_init = {
        .lbs_inode = sizeof(struct ima_iint_cache *),
};

/*
 * LSM descriptor for IMA: init_ima_lsm() is the init callback, the order is
 * LSM_ORDER_LAST, and ima_blob_sizes describes the blob space requested.
 */
DEFINE_LSM(ima) = {
        .name = "ima",
        .init = init_ima_lsm,
        .order = LSM_ORDER_LAST,
        .blobs = &ima_blob_sizes,
};

/* init_ima() itself runs separately, as a late initcall. */
late_initcall(init_ima);        /* Start IMA after the TPM is available */











  248 










































   26 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM percpu

#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PERCPU_H

#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * percpu_alloc_percpu trace event.
 *
 * Records the call site, the reserved/is_atomic flags, the requested size
 * and alignment, the base address and offset, the resulting percpu pointer,
 * bytes_alloc and the GFP flags of an allocation.
 */
TRACE_EVENT(percpu_alloc_percpu,

        TP_PROTO(unsigned long call_site,
                 bool reserved, bool is_atomic, size_t size,
                 size_t align, void *base_addr, int off,
                 void __percpu *ptr, size_t bytes_alloc, gfp_t gfp_flags),

        TP_ARGS(call_site, reserved, is_atomic, size, align, base_addr, off,
                ptr, bytes_alloc, gfp_flags),

        TP_STRUCT__entry(
                __field(        unsigned long,                call_site        )
                __field(        bool,                        reserved        )
                __field(        bool,                        is_atomic        )
                __field(        size_t,                        size                )
                __field(        size_t,                        align                )
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
                __field(        size_t,                        bytes_alloc        )
                __field(        unsigned long,                gfp_flags        )
        ),
        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
                __entry->bytes_alloc        = bytes_alloc;
                /* gfp_t is stored as a plain unsigned long in the record. */
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
        ),

        /* call_site is printed symbolically (%pS); gfp_flags via show_gfp_flags(). */
        TP_printk("call_site=%pS reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p bytes_alloc=%zu gfp_flags=%s",
                  (void *)__entry->call_site,
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align,
                  __entry->base_addr, __entry->off, __entry->ptr,
                  __entry->bytes_alloc, show_gfp_flags(__entry->gfp_flags))
);

/*
 * percpu_free_percpu trace event.
 *
 * Records the base address, offset and percpu pointer passed by the caller.
 */
TRACE_EVENT(percpu_free_percpu,

        TP_PROTO(void *base_addr, int off, void __percpu *ptr),

        TP_ARGS(base_addr, off, ptr),

        TP_STRUCT__entry(
                __field(        void *,                        base_addr        )
                __field(        int,                        off                )
                __field(        void __percpu *,        ptr                )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
                __entry->off                = off;
                __entry->ptr                = ptr;
        ),

        TP_printk("base_addr=%p off=%d ptr=%p",
                __entry->base_addr, __entry->off, __entry->ptr)
);

/*
 * percpu_alloc_percpu_fail trace event.
 *
 * Records only the request parameters (reserved, is_atomic, size, align);
 * no address or pointer fields are part of this record.
 */
TRACE_EVENT(percpu_alloc_percpu_fail,

        TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),

        TP_ARGS(reserved, is_atomic, size, align),

        TP_STRUCT__entry(
                __field(        bool,        reserved        )
                __field(        bool,        is_atomic        )
                __field(        size_t,        size                )
                __field(        size_t, align                )
        ),

        TP_fast_assign(
                __entry->reserved        = reserved;
                __entry->is_atomic        = is_atomic;
                __entry->size                = size;
                __entry->align                = align;
        ),

        TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
                  __entry->reserved, __entry->is_atomic,
                  __entry->size, __entry->align)
);

/*
 * percpu_create_chunk trace event.
 *
 * Records a single field: the base address passed by the caller.
 */
TRACE_EVENT(percpu_create_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *, base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

/*
 * percpu_destroy_chunk trace event.
 *
 * Same record layout as percpu_create_chunk: only the base address.
 */
TRACE_EVENT(percpu_destroy_chunk,

        TP_PROTO(void *base_addr),

        TP_ARGS(base_addr),

        TP_STRUCT__entry(
                __field(        void *,        base_addr        )
        ),

        TP_fast_assign(
                __entry->base_addr        = base_addr;
        ),

        TP_printk("base_addr=%p", __entry->base_addr)
);

#endif /* _TRACE_PERCPU_H */

#include <trace/define_trace.h>




































































































































































































































 1233 



   34 

 1206 























































































































































































































































































































































































































 1261 





   53 










   53 


















 1256 



 1261 
 1261 































































































































































































































































  204 






  203 




    1 










































  202 











 1254 














 1259 



  202 
 1234 




 1254 













 1256 








 1257 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/util.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/slab.h>
#include <linux/rculist.h>

#include "common.h"

/* Lock for protecting policy. */
DEFINE_MUTEX(tomoyo_policy_lock);

/* Has /sbin/init started? */
bool tomoyo_policy_loaded;

/*
 * Mapping table from "enum tomoyo_mac_index" to
 * "enum tomoyo_mac_category_index".
 *
 * Indexed by MAC index: every TOMOYO_MAC_FILE_* entry maps to the FILE
 * category, every TOMOYO_MAC_NETWORK_* entry to the NETWORK category, and
 * TOMOYO_MAC_ENVIRON to the MISC category.
 */
const u8 tomoyo_index2category[TOMOYO_MAX_MAC_INDEX] = {
        /* CONFIG::file group */
        [TOMOYO_MAC_FILE_EXECUTE]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_OPEN]       = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CREATE]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_UNLINK]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_GETATTR]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKDIR]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_RMDIR]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKFIFO]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKSOCK]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_TRUNCATE]   = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_SYMLINK]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKBLOCK]    = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MKCHAR]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_LINK]       = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_RENAME]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHMOD]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHOWN]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHGRP]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_IOCTL]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_CHROOT]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_MOUNT]      = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_UMOUNT]     = TOMOYO_MAC_CATEGORY_FILE,
        [TOMOYO_MAC_FILE_PIVOT_ROOT] = TOMOYO_MAC_CATEGORY_FILE,
        /* CONFIG::network group */
        [TOMOYO_MAC_NETWORK_INET_STREAM_BIND]       =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN]     =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_RAW_BIND]          =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_INET_RAW_SEND]          =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND]       =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN]     =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND]        =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND]    =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN]  =
        TOMOYO_MAC_CATEGORY_NETWORK,
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] =
        TOMOYO_MAC_CATEGORY_NETWORK,
        /* CONFIG::misc group */
        [TOMOYO_MAC_ENVIRON]         = TOMOYO_MAC_CATEGORY_MISC,
};

/**
 * tomoyo_convert_time - Convert time_t to YYYY/MM/DD hh/mm/ss.
 *
 * @time64: Seconds since 1970/01/01 00:00:00.
 * @stamp:  Pointer to "struct tomoyo_time".
 *
 * Returns nothing.
 */
void tomoyo_convert_time(time64_t time64, struct tomoyo_time *stamp)
{
        struct tm broken_down;

        /* Second argument is the offset applied to time64; none is used. */
        time64_to_tm(time64, 0, &broken_down);

        /* Date part: tm_year and tm_mon are shifted by 1900 and 1. */
        stamp->year = broken_down.tm_year + 1900;
        stamp->month = broken_down.tm_mon + 1;
        stamp->day = broken_down.tm_mday;

        /* Time-of-day part is copied unchanged. */
        stamp->hour = broken_down.tm_hour;
        stamp->min = broken_down.tm_min;
        stamp->sec = broken_down.tm_sec;
}

/**
 * tomoyo_permstr - Find permission keywords.
 *
 * @string: String representation for permissions in foo/bar/buz format.
 * @keyword: Keyword to find in @string.
 *
 * Returns true if @keyword was found in @string, false otherwise.
 *
 * This function assumes that strncmp(w1, w2, strlen(w1)) != 0 if w1 != w2.
 */
bool tomoyo_permstr(const char *string, const char *keyword)
{
        const char *hit = strstr(string, keyword);

        /* Only the first occurrence of @keyword is examined. */
        if (!hit)
                return false;
        /* A match at the very start needs no preceding separator. */
        if (hit == string)
                return true;
        /* Otherwise the match must directly follow a '/' separator. */
        return hit[-1] == '/';
}

/**
 * tomoyo_read_token - Read a word from a line.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns a word on success, "" otherwise.
 *
 * To allow the caller to skip the NULL check, this function returns "" rather
 * than NULL if there are no more words to read.
 */
char *tomoyo_read_token(struct tomoyo_acl_param *param)
{
        char *word = param->data;
        char *space = strchr(word, ' ');

        if (!space) {
                /* Last word: leave param->data on the terminating NUL. */
                param->data = word + strlen(word);
                return word;
        }

        /* Terminate the word at the space and resume right after it. */
        *space = '\0';
        param->data = space + 1;
        return word;
}

static bool tomoyo_correct_path2(const char *filename, const size_t len);

/**
 * tomoyo_get_domainname - Read a domainname from a line.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns a domainname on success, NULL otherwise.
 */
const struct tomoyo_path_info *
tomoyo_get_domainname(struct tomoyo_acl_param *param)
{
        char *const head = param->data;
        char *cur = head;

        /*
         * Walk the line.  After each space, check whether the following
         * space-delimited word passes tomoyo_correct_path2(); the first
         * word that does not marks the end of the domainname.
         */
        while (*cur) {
                const char c = *cur++;

                if (c != ' ')
                        continue;
                if (tomoyo_correct_path2(cur, strchrnul(cur, ' ') - cur))
                        continue;
                /* Overwrite the separating space to terminate the name. */
                cur[-1] = '\0';
                break;
        }
        param->data = cur;

        if (!tomoyo_correct_domain(head))
                return NULL;
        return tomoyo_get_name(head);
}

/**
 * tomoyo_parse_ulong - Parse an "unsigned long" value.
 *
 * @result: Pointer to "unsigned long".
 * @str:    Pointer to string to parse.
 *
 * Returns one of values in "enum tomoyo_value_type".
 *
 * On success, *@str is updated to point to the first character after the
 * parsed value.
 */
u8 tomoyo_parse_ulong(unsigned long *result, char **str)
{
        const char *digits = *str;
        char *end;
        int radix = 10;
        u8 kind = TOMOYO_VALUE_TYPE_DECIMAL;

        /* A leading '0' selects hexadecimal ("0x"/"0X") or octal ("0" + 0-7). */
        if (digits[0] == '0') {
                const char next = digits[1];

                if (next == 'x' || next == 'X') {
                        radix = 16;
                        kind = TOMOYO_VALUE_TYPE_HEXADECIMAL;
                        digits += 2;
                } else if (next >= '0' && next <= '7') {
                        radix = 8;
                        kind = TOMOYO_VALUE_TYPE_OCTAL;
                        digits += 1;
                }
        }

        *result = simple_strtoul(digits, &end, radix);
        /* Nothing consumed: not a number; *str is left untouched. */
        if (end == digits)
                return TOMOYO_VALUE_TYPE_INVALID;

        *str = end;
        return kind;
}

/**
 * tomoyo_print_ulong - Print an "unsigned long" value.
 *
 * @buffer:     Pointer to buffer.
 * @buffer_len: Size of @buffer.
 * @value:      An "unsigned long" value.
 * @type:       Type of @value.
 *
 * Returns nothing.
 */
void tomoyo_print_ulong(char *buffer, const int buffer_len,
                        const unsigned long value, const u8 type)
{
        /* Pick the radix (and its prefix) that matches the recorded type. */
        switch (type) {
        case TOMOYO_VALUE_TYPE_DECIMAL:
                snprintf(buffer, buffer_len, "%lu", value);
                break;
        case TOMOYO_VALUE_TYPE_OCTAL:
                snprintf(buffer, buffer_len, "0%lo", value);
                break;
        case TOMOYO_VALUE_TYPE_HEXADECIMAL:
                snprintf(buffer, buffer_len, "0x%lX", value);
                break;
        default:
                /* Unknown type: print the type code instead of the value. */
                snprintf(buffer, buffer_len, "type(%u)", type);
                break;
        }
}

/**
 * tomoyo_parse_name_union - Parse a tomoyo_name_union.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_name_union".
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_name_union(struct tomoyo_acl_param *param,
                             struct tomoyo_name_union *ptr)
{
        char *word;

        if (param->data[0] != '@') {
                /* Plain pathname: validate the word, then look up its name. */
                word = tomoyo_read_token(param);
                if (!tomoyo_correct_word(word))
                        return false;
                ptr->filename = tomoyo_get_name(word);
                return ptr->filename != NULL;
        }

        /* "@name": skip the marker and resolve a path group instead. */
        param->data++;
        ptr->group = tomoyo_get_group(param, TOMOYO_PATH_GROUP);
        return ptr->group != NULL;
}

/**
 * tomoyo_parse_number_union - Parse a tomoyo_number_union.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_number_union".
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_parse_number_union(struct tomoyo_acl_param *param,
                               struct tomoyo_number_union *ptr)
{
        char *cursor;
        u8 kind;
        unsigned long number;

        memset(ptr, 0, sizeof(*ptr));

        /* "@name" refers to a number group rather than a literal value. */
        if (param->data[0] == '@') {
                param->data++;
                ptr->group = tomoyo_get_group(param, TOMOYO_NUMBER_GROUP);
                return ptr->group != NULL;
        }

        /* Lower bound (or the sole value). */
        cursor = tomoyo_read_token(param);
        kind = tomoyo_parse_ulong(&number, &cursor);
        if (kind == TOMOYO_VALUE_TYPE_INVALID)
                return false;
        ptr->values[0] = number;
        ptr->value_type[0] = kind;

        if (*cursor) {
                /* "min-max" form: a '-' and a second number must follow. */
                if (*cursor++ != '-')
                        return false;
                kind = tomoyo_parse_ulong(&number, &cursor);
                if (kind == TOMOYO_VALUE_TYPE_INVALID)
                        return false;
                /* No trailing garbage, and the range must not be inverted. */
                if (*cursor || ptr->values[0] > number)
                        return false;
        }

        /* For a single value, number/kind still hold the lower bound. */
        ptr->values[1] = number;
        ptr->value_type[1] = kind;
        return true;
}

/**
 * tomoyo_byte_range - Check whether the string is a \ooo style octal value.
 *
 * @str: Pointer to the string.
 *
 * Returns true if @str is a \ooo style octal value, false otherwise.
 *
 * TOMOYO uses \ooo style representation for 0x01 - 0x20 and 0x7F - 0xFF.
 * This function verifies that \ooo is in valid range.
 */
static inline bool tomoyo_byte_range(const char *str)
{
        /* First digit 0-3, then two digits 0-7; stops at the first mismatch. */
        if (str[0] < '0' || str[0] > '3')
                return false;
        if (str[1] < '0' || str[1] > '7')
                return false;
        return str[2] >= '0' && str[2] <= '7';
}

/**
 * tomoyo_alphabet_char - Check whether the character is an alphabet.
 *
 * @c: The character to check.
 *
 * Returns true if @c is an alphabet character, false otherwise.
 */
static inline bool tomoyo_alphabet_char(const char c)
{
        /* From 'a' upwards only the lower-case range can match. */
        if (c >= 'a')
                return c <= 'z';
        /* Below 'a' only the upper-case range can match. */
        return c >= 'A' && c <= 'Z';
}

/**
 * tomoyo_make_byte - Make byte value from three octal characters.
 *
 * @c1: The first character.
 * @c2: The second character.
 * @c3: The third character.
 *
 * Returns byte value.
 */
static inline u8 tomoyo_make_byte(const u8 c1, const u8 c2, const u8 c3)
{
        /* Each octal digit contributes three bits: weights 64, 8 and 1. */
        const int high = (c1 - '0') * 64;
        const int mid = (c2 - '0') * 8;
        const int low = c3 - '0';

        return high + mid + low;
}

/**
 * tomoyo_valid - Check whether the character is a valid char.
 *
 * @c: The character to check.
 *
 * Returns true if @c is a valid character, false otherwise.
 */
static inline bool tomoyo_valid(const unsigned char c)
{
        /* Valid means strictly between ' ' and 127. */
        return !(c <= ' ' || c >= 127);
}

/**
 * tomoyo_invalid - Check whether the character is an invalid char.
 *
 * @c: The character to check.
 *
 * Returns true if @c is an invalid character, false otherwise.
 */
static inline bool tomoyo_invalid(const unsigned char c)
{
        /* NUL is the terminator, so it is never reported as invalid. */
        if (!c)
                return false;
        return c <= ' ' || c >= 127;
}

/**
 * tomoyo_str_starts - Check whether the given string starts with the given keyword.
 *
 * @src:  Pointer to pointer to the string.
 * @find: Pointer to the keyword.
 *
 * Returns true if @src starts with @find, false otherwise.
 *
 * If @src starts with @find, @src is updated to point to the first character
 * after @find.
 */
bool tomoyo_str_starts(char **src, const char *find)
{
        const int len = strlen(find);

        /* No match: *src is left untouched. */
        if (strncmp(*src, find, len) != 0)
                return false;

        /* Match: step past the keyword. */
        *src += len;
        return true;
}

/**
 * tomoyo_normalize_line - Format string.
 *
 * @buffer: The line to normalize.
 *
 * Leading and trailing whitespaces are removed.
 * Multiple whitespaces are packed into single space.
 *
 * Returns nothing.
 */
void tomoyo_normalize_line(unsigned char *buffer)
{
        unsigned char *rd = buffer;     /* next byte to read */
        unsigned char *wr = buffer;     /* next byte to write; never passes rd */
        bool need_space = false;

        /* Drop leading separators. */
        while (tomoyo_invalid(*rd))
                rd++;

        while (*rd) {
                /* One space between words, none before the first. */
                if (need_space)
                        *wr++ = ' ';
                need_space = true;

                /* Copy one word ... */
                while (tomoyo_valid(*rd))
                        *wr++ = *rd++;
                /* ... and skip the separators that follow it. */
                while (tomoyo_invalid(*rd))
                        rd++;
        }
        *wr = '\0';
}

/**
 * tomoyo_correct_word2 - Validate a string.
 *
 * @string: The string to check. Maybe non-'\0'-terminated.
 * @len:    Length of @string.
 *
 * Check whether the given string follows the naming rules.
 * Returns true if @string follows the naming rules, false otherwise.
 */
static bool tomoyo_correct_word2(const char *string, size_t len)
{
        /*
         * Budget for escapes that reach the check below the first switch;
         * the check fails once 20 of them have been seen.
         */
        u8 recursion = 20;
        const char *const start = string;
        /* True while between a "\{" and its closing "\}". */
        bool in_repetition = false;

        /* An empty word is never valid. */
        if (!len)
                goto out;
        while (len--) {
                unsigned char c = *string++;

                if (c == '\\') {
                        /* A backslash must be followed by at least one byte. */
                        if (!len--)
                                goto out;
                        c = *string++;
                        if (c >= '0' && c <= '3') {
                                unsigned char d;
                                unsigned char e;

                                /* "\ooo": two more bytes must be available. */
                                if (!len-- || !len--)
                                        goto out;
                                d = *string++;
                                e = *string++;
                                if (d < '0' || d > '7' || e < '0' || e > '7')
                                        goto out;
                                c = tomoyo_make_byte(c, d, e);
                                /*
                                 * Octal form is accepted only for byte values
                                 * <= ' ' or >= 127; any other value written
                                 * as "\ooo" is rejected.
                                 */
                                if (c <= ' ' || c >= 127)
                                        continue;
                                goto out;
                        }
                        /* Escapes accepted without touching the budget. */
                        switch (c) {
                        case '\\':  /* "\\" */
                        case '+':   /* "\+" */
                        case '?':   /* "\?" */
                        case 'x':   /* "\x" */
                        case 'a':   /* "\a" */
                        case '-':   /* "\-" */
                                continue;
                        }
                        /* Every other escape consumes one unit of the budget. */
                        if (!recursion--)
                                goto out;
                        switch (c) {
                        case '*':   /* "\*" */
                        case '@':   /* "\@" */
                        case '$':   /* "\$" */
                        case 'X':   /* "\X" */
                        case 'A':   /* "\A" */
                                continue;
                        case '{':   /* "/\{" */
                                /*
                                 * string now points past '{', so string - 3
                                 * is the byte before the backslash; it must
                                 * exist and be '/'.
                                 */
                                if (string - 3 < start || *(string - 3) != '/')
                                        goto out;
                                in_repetition = true;
                                continue;
                        case '}':   /* "\}/" */
                                /*
                                 * NOTE(review): *string is read without
                                 * checking len; this appears to rely on the
                                 * caller guaranteeing a readable byte (NUL or
                                 * delimiter) right after the word - confirm.
                                 */
                                if (*string != '/')
                                        goto out;
                                if (!in_repetition)
                                        goto out;
                                in_repetition = false;
                                continue;
                        }
                        /* Unknown escape character. */
                        goto out;
                } else if (in_repetition && c == '/') {
                        /* No '/' is allowed between "\{" and "\}". */
                        goto out;
                } else if (c <= ' ' || c >= 127) {
                        /* Such bytes must be escaped, never literal. */
                        goto out;
                }
        }
        /* A "\{" without its closing "\}" is invalid. */
        if (in_repetition)
                goto out;
        return true;
 out:
        return false;
}

/**
 * tomoyo_correct_word - Check a NUL-terminated word against the naming rules.
 *
 * @string: The NUL-terminated string to check.
 *
 * Measures @string with strlen() and passes it, together with that length,
 * to tomoyo_correct_word2().
 *
 * Returns true if @string follows the naming rules, false otherwise.
 */
bool tomoyo_correct_word(const char *string)
{
        const size_t length = strlen(string);

        return tomoyo_correct_word2(string, length);
}

/**
 * tomoyo_correct_path2 - Check a length-bounded pathname against the naming rules.
 *
 * @filename: The pathname to check.
 * @len:      Number of bytes of @filename to examine.
 *
 * The first @len bytes must contain a '/', and if they also contain a '.',
 * the first '/' must come before the first '.'. On top of that the bytes
 * must pass tomoyo_correct_word2().
 *
 * Returns true if @filename follows the naming rules, false otherwise.
 */
static bool tomoyo_correct_path2(const char *filename, const size_t len)
{
        const char *slash = memchr(filename, '/', len);
        const char *dot;

        /* No '/' at all: cannot be a pathname. */
        if (!slash)
                return false;
        /* A '.' is tolerated only after the first '/'. */
        dot = memchr(filename, '.', len);
        if (dot && !(slash < dot))
                return false;
        return tomoyo_correct_word2(filename, len);
}

/**
 * tomoyo_correct_path - Check a NUL-terminated pathname against the naming rules.
 *
 * @filename: The NUL-terminated pathname to check.
 *
 * Measures @filename with strlen() and defers to tomoyo_correct_path2().
 *
 * Returns true if @filename follows the naming rules, false otherwise.
 */
bool tomoyo_correct_path(const char *filename)
{
        const size_t length = strlen(filename);

        return tomoyo_correct_path2(filename, length);
}

/**
 * tomoyo_correct_domain - Check whether a domainname follows the naming rules.
 *
 * @domainname: The domainname to check. May be NULL.
 *
 * The leading token has to satisfy tomoyo_domain_def(). Every further token,
 * separated by a single ' ', has to be a valid pathname: the inner tokens are
 * checked with tomoyo_correct_path2() and the last one with
 * tomoyo_correct_path().
 *
 * Returns true if @domainname follows the naming rules, false otherwise
 * (including when @domainname is NULL).
 */
bool tomoyo_correct_domain(const unsigned char *domainname)
{
        const unsigned char *token;
        const unsigned char *space;

        if (!domainname)
                return false;
        if (!tomoyo_domain_def(domainname))
                return false;
        /* A domainname made of the leading token only is fine. */
        token = strchr(domainname, ' ');
        if (!token)
                return true;
        token++;
        /* Validate every token that is followed by another ' '. */
        for (space = strchr(token, ' '); space; space = strchr(token, ' ')) {
                if (!tomoyo_correct_path2(token, space - token))
                        return false;
                token = space + 1;
        }
        /* The final token runs up to the terminating NUL. */
        return tomoyo_correct_path(token);
}

/**
 * tomoyo_domain_def - Check whether the given token can be a domainname.
 *
 * @buffer: The token to check.
 *
 * Only the part of @buffer up to the first ' ' (or up to the terminating NUL
 * if there is no ' ') is examined. That part must start with '<', end with
 * '>', and the bytes in between must pass tomoyo_correct_word2().
 *
 * Returns true if @buffer can possibly be a domainname, false otherwise.
 */
bool tomoyo_domain_def(const unsigned char *buffer)
{
        const unsigned char *space;
        int len;

        if (*buffer != '<')
                return false;
        space = strchr(buffer, ' ');
        if (space)
                len = space - buffer;
        else
                len = strlen(buffer);
        /* len >= 1 here because the first byte is '<'. */
        return buffer[len - 1] == '>' &&
               tomoyo_correct_word2(buffer + 1, len - 2);
}

/**
 * tomoyo_find_domain - Find a domain by the given name.
 *
 * @domainname: The domainname to find.
 *
 * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname)
{
        struct tomoyo_domain_info *domain;
        struct tomoyo_path_info name;

        name.name = domainname;
        tomoyo_fill_path_info(&name);
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (!domain->is_deleted &&
                    !tomoyo_pathcmp(&name, domain->domainname))
                        return domain;
        }
        return NULL;
}

/**
 * tomoyo_const_part_length - Length of the leading part of a token that holds no pattern.
 *
 * @filename: The string to evaluate. May be NULL.
 *
 * Ordinary bytes count as 1, the escape "\\" counts as 2 and an octal escape
 * "\ooo" (first digit 0-3, the other two 0-7) counts as 4. Counting stops at
 * the terminating NUL or at the first backslash sequence that is none of the
 * above.
 *
 * Returns the initial length without a pattern in @filename, 0 if @filename
 * is NULL.
 */
static int tomoyo_const_part_length(const char *filename)
{
        const char *cur = filename;
        int const_len = 0;

        if (!cur)
                return 0;
        for (;;) {
                char ch = *cur++;

                if (ch == '\0')
                        return const_len;
                if (ch != '\\') {
                        const_len++;
                        continue;
                }
                /* A backslash: inspect what it escapes. */
                ch = *cur++;
                if (ch == '\\') {       /* "\\" */
                        const_len += 2;
                        continue;
                }
                /* Anything but "\ooo" ends the constant part. */
                if (ch < '0' || ch > '3')
                        return const_len;
                ch = *cur++;
                if (ch < '0' || ch > '7')
                        return const_len;
                ch = *cur++;
                if (ch < '0' || ch > '7')
                        return const_len;
                const_len += 4;
        }
}

/**
 * tomoyo_fill_path_info - Fill in the derived members of "struct tomoyo_path_info".
 *
 * @ptr: Pointer to "struct tomoyo_path_info" to fill in.
 *
 * The caller sets "struct tomoyo_path_info"->name beforehand. From it this
 * function derives ->const_len, ->is_dir, ->is_patterned and ->hash.
 */
void tomoyo_fill_path_info(struct tomoyo_path_info *ptr)
{
        const char *path = ptr->name;
        const int total = strlen(path);

        ptr->hash = full_name_hash(NULL, path, total);
        /* A name counts as a directory when its last byte is '/'. */
        ptr->is_dir = (total != 0) && (path[total - 1] == '/');
        ptr->const_len = tomoyo_const_part_length(path);
        /* Patterned when the constant prefix does not cover the whole name. */
        ptr->is_patterned = (ptr->const_len < total);
}

/**
 * tomoyo_file_matches_pattern2 - Pattern matching without '/' character and "\-" pattern.
 *
 * @filename:     The start of string to check.
 * @filename_end: The end of string to check.
 * @pattern:      The start of pattern to compare.
 * @pattern_end:  The end of pattern to compare.
 *
 * Walks @filename and @pattern in lockstep. Bytes of @pattern that are not
 * preceded by a backslash must equal the corresponding byte of @filename.
 * A backslash in @pattern introduces an escape that is handled by the switch
 * below. The variable-length escapes ("\*", "\@", "\$", "\X", "\A") are
 * resolved by calling this function recursively for every candidate length.
 *
 * Returns true if @filename matches @pattern, false otherwise.
 */
static bool tomoyo_file_matches_pattern2(const char *filename,
                                         const char *filename_end,
                                         const char *pattern,
                                         const char *pattern_end)
{
        while (filename < filename_end && pattern < pattern_end) {
                char c;
                int i;
                int j;

                /* Plain byte in the pattern: must match literally. */
                if (*pattern != '\\') {
                        if (*filename++ != *pattern++)
                                return false;
                        continue;
                }
                /* c is the current filename byte; *pattern becomes the escape. */
                c = *filename;
                pattern++;
                switch (*pattern) {
                case '?':
                        /*
                         * Any single character except '/'. A backslash in
                         * @filename must start "\\" or "\ooo", which is then
                         * consumed as one unit.
                         */
                        if (c == '/') {
                                return false;
                        } else if (c == '\\') {
                                if (filename[1] == '\\')
                                        filename++;
                                else if (tomoyo_byte_range(filename + 1))
                                        filename += 3;
                                else
                                        return false;
                        }
                        break;
                case '\\':
                        /* "\\" in the pattern needs "\\" in the filename. */
                        if (c != '\\')
                                return false;
                        if (*++filename != '\\')
                                return false;
                        break;
                case '+':
                        /* Exactly one decimal digit. */
                        if (!isdigit(c))
                                return false;
                        break;
                case 'x':
                        /* Exactly one hexadecimal digit. */
                        if (!isxdigit(c))
                                return false;
                        break;
                case 'a':
                        /* Exactly one character accepted by tomoyo_alphabet_char(). */
                        if (!tomoyo_alphabet_char(c))
                                return false;
                        break;
                case '0':
                case '1':
                case '2':
                case '3':
                        /*
                         * "\ooo" in the pattern: the filename must carry the
                         * same three digits after its own backslash.
                         */
                        if (c == '\\' && tomoyo_byte_range(filename + 1)
                            && strncmp(filename + 1, pattern, 3) == 0) {
                                filename += 3;
                                pattern += 2;
                                break;
                        }
                        return false; /* Not matched. */
                case '*':
                case '@':
                        /*
                         * Try the rest of the pattern at every offset i, from
                         * consuming nothing up to consuming all that is left.
                         * "\@" may not step over a '.'. Escapes in @filename
                         * are stepped over as a whole.
                         */
                        for (i = 0; i <= filename_end - filename; i++) {
                                if (tomoyo_file_matches_pattern2(
                                                    filename + i, filename_end,
                                                    pattern + 1, pattern_end))
                                        return true;
                                c = filename[i];
                                if (c == '.' && *pattern == '@')
                                        break;
                                if (c != '\\')
                                        continue;
                                if (filename[i + 1] == '\\')
                                        i++;
                                else if (tomoyo_byte_range(filename + i + 1))
                                        i += 3;
                                else
                                        break; /* Bad pattern. */
                        }
                        return false; /* Not matched. */
                default:
                        /*
                         * "\$", "\X" and "\A": j is the length of the leading
                         * run of digits / hex digits / alphabet characters.
                         * Any other escape leaves j at 0, so nothing matches.
                         */
                        j = 0;
                        c = *pattern;
                        if (c == '$') {
                                while (isdigit(filename[j]))
                                        j++;
                        } else if (c == 'X') {
                                while (isxdigit(filename[j]))
                                        j++;
                        } else if (c == 'A') {
                                while (tomoyo_alphabet_char(filename[j]))
                                        j++;
                        }
                        /* At least one character must be consumed (i starts at 1). */
                        for (i = 1; i <= j; i++) {
                                if (tomoyo_file_matches_pattern2(
                                                    filename + i, filename_end,
                                                    pattern + 1, pattern_end))
                                        return true;
                        }
                        return false; /* Not matched or bad pattern. */
                }
                /* Single-unit escapes: step past the unit and the escape char. */
                filename++;
                pattern++;
        }
        /*
         * Skip trailing "\*" and "\@", which may match nothing.
         * NOTE(review): this loop reads *pattern without comparing against
         * pattern_end; it presumably relies on the byte at pattern_end never
         * starting "\*" or "\@" - confirm against the callers.
         */
        while (*pattern == '\\' &&
               (*(pattern + 1) == '*' || *(pattern + 1) == '@'))
                pattern += 2;
        /* Matched only if both the filename and the pattern are used up. */
        return filename == filename_end && pattern == pattern_end;
}

/**
 * tomoyo_file_matches_pattern - Pattern matching without '/' character.
 *
 * @filename:     The start of string to check.
 * @filename_end: The end of string to check.
 * @pattern:      The start of pattern to compare.
 * @pattern_end:  The end of pattern to compare.
 *
 * Splits @pattern at every "\-" and evaluates each piece with
 * tomoyo_file_matches_pattern2(). The piece before the first "\-" has to
 * match, and none of the pieces after a "\-" may match.
 *
 * Returns true if @filename matches @pattern, false otherwise.
 */
static bool tomoyo_file_matches_pattern(const char *filename,
                                        const char *filename_end,
                                        const char *pattern,
                                        const char *pattern_end)
{
        const char *segment = pattern;
        bool seen_minus = false;
        bool matched;

        while (pattern < pattern_end - 1) {
                /* Advance until a "\-" separator has just been consumed. */
                if (*pattern++ != '\\')
                        continue;
                if (*pattern++ != '-')
                        continue;
                /* Evaluate the piece that ends right before the "\-". */
                matched = tomoyo_file_matches_pattern2(filename, filename_end,
                                                       segment, pattern - 2);
                /* First piece must match; subtracted pieces must not. */
                if (seen_minus ? matched : !matched)
                        return false;
                seen_minus = true;
                segment = pattern;
        }
        /* The last (or only) piece. */
        matched = tomoyo_file_matches_pattern2(filename, filename_end,
                                               segment, pattern_end);
        return seen_minus ? !matched : matched;
}

/**
 * tomoyo_path_matches_pattern2 - Do pathname pattern matching.
 *
 * @f: The start of string to check.
 * @p: The start of pattern to compare.
 *
 * Compares @f and @p one '/'-separated component at a time using
 * tomoyo_file_matches_pattern(). A pattern component that starts with "\{"
 * switches to the "recursive" part, which lets that component match one or
 * more consecutive components of @f.
 *
 * Returns true if @f matches @p, false otherwise.
 */
static bool tomoyo_path_matches_pattern2(const char *f, const char *p)
{
        const char *f_delimiter;
        const char *p_delimiter;

        while (*f && *p) {
                /* Find the end of the current component on both sides. */
                f_delimiter = strchr(f, '/');
                if (!f_delimiter)
                        f_delimiter = f + strlen(f);
                p_delimiter = strchr(p, '/');
                if (!p_delimiter)
                        p_delimiter = p + strlen(p);
                /* "\{" opens a repeated component; handled separately. */
                if (*p == '\\' && *(p + 1) == '{')
                        goto recursive;
                if (!tomoyo_file_matches_pattern(f, f_delimiter, p,
                                                 p_delimiter))
                        return false;
                /* Step over the component and its '/' (if any). */
                f = f_delimiter;
                if (*f)
                        f++;
                p = p_delimiter;
                if (*p)
                        p++;
        }
        /* Ignore trailing "\*" and "\@" in @pattern. */
        while (*p == '\\' &&
               (*(p + 1) == '*' || *(p + 1) == '@'))
                p += 2;
        /* Matched only if both strings are fully consumed. */
        return !*f && !*p;
 recursive:
        /*
         * The "\{" pattern is permitted only after '/' character.
         * This guarantees that below "*(p - 1)" is safe.
         * Also, the "\}" pattern is permitted only before '/' character
         * so that "\{" + "\}" pair will not break the "\-" operator.
         */
        if (*(p - 1) != '/' || p_delimiter <= p + 3 || *p_delimiter != '/' ||
            *(p_delimiter - 1) != '}' || *(p_delimiter - 2) != '\\')
                return false; /* Bad pattern. */
        /*
         * p + 2 .. p_delimiter - 2 is the text between "\{" and "\}".
         * Consume components of @f while they match it, and after each one
         * try to match the remainder of @f against the rest of the pattern.
         */
        do {
                /* Compare current component with pattern. */
                if (!tomoyo_file_matches_pattern(f, f_delimiter, p + 2,
                                                 p_delimiter - 2))
                        break;
                /* Proceed to next component. */
                f = f_delimiter;
                if (!*f)
                        break;
                f++;
                /* Continue comparison. */
                if (tomoyo_path_matches_pattern2(f, p_delimiter + 1))
                        return true;
                f_delimiter = strchr(f, '/');
        } while (f_delimiter);
        return false; /* Not matched. */
}

/**
 * tomoyo_path_matches_pattern - Check whether the given filename matches the given pattern.
 *
 * @filename: The filename to check.
 * @pattern:  The pattern to compare.
 *
 * Returns true if matches, false otherwise.
 *
 * The following patterns are available.
 *   \\     \ itself.
 *   \ooo   Octal representation of a byte.
 *   \*     Zero or more repetitions of characters other than '/'.
 *   \@     Zero or more repetitions of characters other than '/' or '.'.
 *   \?     1 byte character other than '/'.
 *   \$     One or more repetitions of decimal digits.
 *   \+     1 decimal digit.
 *   \X     One or more repetitions of hexadecimal digits.
 *   \x     1 hexadecimal digit.
 *   \A     One or more repetitions of alphabet characters.
 *   \a     1 alphabet character.
 *
 *   \-     Subtraction operator.
 *
 *   /\{dir\}/   '/' + 'One or more repetitions of dir/' (e.g. /dir/ /dir/dir/
 *               /dir/dir/dir/ ).
 */
bool tomoyo_path_matches_pattern(const struct tomoyo_path_info *filename,
                                 const struct tomoyo_path_info *pattern)
{
        const int prefix = pattern->const_len;
        const char *name = filename->name;
        const char *pat = pattern->name;

        /* A pattern without any wildcard is a plain comparison. */
        if (!pattern->is_patterned)
                return !tomoyo_pathcmp(filename, pattern);
        /* A directory never matches a non-directory and vice versa. */
        if (filename->is_dir != pattern->is_dir)
                return false;
        /* The constant prefix of the pattern must match byte for byte. */
        if (strncmp(name, pat, prefix) != 0)
                return false;
        /* Run the real matcher on what follows the constant prefix. */
        return tomoyo_path_matches_pattern2(name + prefix, pat + prefix);
}

/**
 * tomoyo_get_exe - Get tomoyo_realpath() of current process.
 *
 * Takes the executable file of current->mm, resolves its path with
 * tomoyo_realpath_from_path() and drops the file reference again.
 *
 * Returns the tomoyo_realpath() of current process on success, NULL otherwise
 * (also when the current task has no mm or no executable file).
 *
 * This function uses kzalloc(), so the caller must call kfree()
 * if this function didn't return NULL.
 */
const char *tomoyo_get_exe(void)
{
        struct mm_struct *mm = current->mm;
        const char *realpath = NULL;
        struct file *exe;

        if (mm) {
                exe = get_mm_exe_file(mm);
                if (exe) {
                        realpath = tomoyo_realpath_from_path(&exe->f_path);
                        fput(exe);
                }
        }
        return realpath;
}

/**
 * tomoyo_get_mode - Get MAC mode.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number.
 * @index:   Index number of functionality.
 *
 * Looks up the mode for @index in the profile. If that entry says "use
 * default", the entry of the category @index belongs to is consulted, and if
 * that says "use default" as well, the profile's default_config is used.
 *
 * Returns TOMOYO_CONFIG_DISABLED while no policy is loaded, otherwise the
 * lowest two bits of the selected mode.
 */
int tomoyo_get_mode(const struct tomoyo_policy_namespace *ns, const u8 profile,
                    const u8 index)
{
        struct tomoyo_profile *prof;
        u8 mode;

        if (!tomoyo_policy_loaded)
                return TOMOYO_CONFIG_DISABLED;
        prof = tomoyo_profile(ns, profile);
        mode = prof->config[index];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT) {
                /* Fall back to the per-category setting ... */
                mode = prof->config[tomoyo_index2category[index]
                                    + TOMOYO_MAX_MAC_INDEX];
                /* ... and then to the profile-wide default. */
                if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                        mode = prof->default_config;
        }
        return mode & 3;
}

/**
 * tomoyo_init_request_info - Initialize "struct tomoyo_request_info" members.
 *
 * @r:      Pointer to "struct tomoyo_request_info" to initialize.
 * @domain: Pointer to "struct tomoyo_domain_info". NULL for tomoyo_domain().
 * @index:  Index number of functionality.
 *
 * Zeroes @r and then records the domain, its profile number, @index and the
 * mode obtained from tomoyo_get_mode().
 *
 * Returns mode.
 */
int tomoyo_init_request_info(struct tomoyo_request_info *r,
                             struct tomoyo_domain_info *domain, const u8 index)
{
        u8 profile;

        memset(r, 0, sizeof(*r));
        if (domain == NULL)
                domain = tomoyo_domain();
        profile = domain->profile;
        r->type = index;
        r->profile = profile;
        r->domain = domain;
        r->mode = tomoyo_get_mode(domain->ns, profile, index);
        return r->mode;
}

/**
 * tomoyo_domain_quota_is_ok - Check for domain's quota.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Counts the permission bits of all non-deleted ACL entries of @r->domain and
 * compares the total with the profile's TOMOYO_PREF_MAX_LEARNING_ENTRY
 * preference. When the limit is reached, the TOMOYO_DIF_QUOTA_WARNED flag is
 * set on the domain, the event is written to the log and (unless
 * CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is defined) a warning is
 * printed.
 *
 * Returns true if the domain is not exceeded quota, false otherwise.
 * Also returns false when @r->mode is not TOMOYO_CONFIG_LEARNING, and true
 * when @r->domain is NULL.
 *
 * Caller holds tomoyo_read_lock().
 */
bool tomoyo_domain_quota_is_ok(struct tomoyo_request_info *r)
{
        unsigned int count = 0;
        struct tomoyo_domain_info *domain = r->domain;
        struct tomoyo_acl_info *ptr;

        /* The quota is checked in learning mode only. */
        if (r->mode != TOMOYO_CONFIG_LEARNING)
                return false;
        if (!domain)
                return true;
        /* Already over quota and warned about it: nothing more to do. */
        if (READ_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED]))
                return false;
        list_for_each_entry_rcu(ptr, &domain->acl_info_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                u16 perm;

                if (ptr->is_deleted)
                        continue;
                /*
                 * Reading perm bitmap might race with tomoyo_merge_*() because
                 * caller does not hold tomoyo_policy_lock mutex. But exceeding
                 * max_learning_entry parameter by a few entries does not harm.
                 */
                switch (ptr->type) {
                case TOMOYO_TYPE_PATH_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_path_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_PATH2_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_path2_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_PATH_NUMBER_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_path_number_acl, head)
                                  ->perm);
                        break;
                case TOMOYO_TYPE_MKDEV_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_mkdev_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_INET_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_inet_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_UNIX_ACL:
                        perm = data_race(container_of(ptr, struct tomoyo_unix_acl, head)->perm);
                        break;
                case TOMOYO_TYPE_MANUAL_TASK_ACL:
                        /* This entry type adds nothing to the count. */
                        perm = 0;
                        break;
                default:
                        /* Every other entry type counts as exactly one. */
                        perm = 1;
                }
                /* Each set permission bit counts as one learned entry. */
                count += hweight16(perm);
        }
        if (count < tomoyo_profile(domain->ns, domain->profile)->
            pref[TOMOYO_PREF_MAX_LEARNING_ENTRY])
                return true;
        /* Limit reached: remember that, so the warning is issued only once. */
        WRITE_ONCE(domain->flags[TOMOYO_DIF_QUOTA_WARNED], true);
        /* r->granted = false; */
        tomoyo_write_log(r, "%s", tomoyo_dif[TOMOYO_DIF_QUOTA_WARNED]);
#ifndef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING
        pr_warn("WARNING: Domain '%s' has too many ACLs to hold. Stopped learning mode.\n",
                domain->domainname->name);
#endif
        return false;
}






















    1 
























    1 
























   16 





























   28 




































    9 


















    9 















   15 




    2 


    1 





    1 




    1 




    3 

    9 


























   26 







   26 
    9 


   29 
   16 







    2 

















   30 







   29 






    2 






    8 

    1 



























































    1 



    2 



    1 





    1 













   13 



















  246 










   24 




    8 




    5 


    5 
    5 













    1 


    3 
















    2 









    2 














   10 
    1 





    2 




    4 





    1 


    1 


    1 





   10 


































    4 




    4 











   28 






    1 


   13 






    9 

    1 

    1 






    4 

    4 







    2 


    2 





    1 




    1 







    1 
























    8 








    1 









   16 


    1 

   15 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2019 Arm Ltd.

#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>

#include <asm/kvm_emulate.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_psci.h>

/*
 * Bit masks built with GENMASK() from bit 0 up to (bit count - 1), one per
 * KVM_REG_ARM_*_BMAP_BIT_COUNT constant.
 * NOTE(review): presumably these are the sets of feature bits valid in the
 * matching pseudo-firmware bitmap registers - confirm against the users.
 */
#define KVM_ARM_SMCCC_STD_FEATURES                                \
        GENMASK(KVM_REG_ARM_STD_BMAP_BIT_COUNT - 1, 0)
#define KVM_ARM_SMCCC_STD_HYP_FEATURES                                \
        GENMASK(KVM_REG_ARM_STD_HYP_BMAP_BIT_COUNT - 1, 0)
#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES                        \
        GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_BIT_COUNT - 1, 0)
#define KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2                        \
        GENMASK(KVM_REG_ARM_VENDOR_HYP_BMAP_2_BIT_COUNT - 1, 0)

/*
 * Fill @val[0..3] with the upper/lower 32 bits of the real time and of the
 * counter value selected by the first SMCCC argument, both derived from a
 * single time snapshot.
 *
 * @val is left untouched when the current clocksource is not the architected
 * counter or when the requested counter identifier is not recognised.
 */
static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val)
{
        struct system_time_snapshot snap;
        u64 offset;
        u64 cycles;
        u32 feature;

        /*
         * System time and counter value must be captured at the same
         * time to keep consistency and precision.
         */
        ktime_get_snapshot(&snap);

        /*
         * This is only valid if the current clocksource is the
         * architected counter, as this is the only one the guest
         * can see.
         */
        if (snap.cs_id != CSID_ARM_ARCH_COUNTER)
                return;

        /*
         * The guest selects one of the two reference counters
         * (virtual or physical) with the first argument of the SMCCC
         * call. In case the identifier is not supported, error out.
         */
        feature = smccc_get_arg1(vcpu);
        if (feature == KVM_PTP_VIRT_COUNTER)
                offset = vcpu->kvm->arch.timer_data.voffset;
        else if (feature == KVM_PTP_PHYS_COUNTER)
                offset = vcpu->kvm->arch.timer_data.poffset;
        else
                return;

        cycles = snap.cycles - offset;

        /*
         * This relies on the top bit of val[0] never being set for
         * valid values of system time, because that is *really* far
         * in the future (about 292 years from 1970, and at that stage
         * nobody will give a damn about it).
         */
        val[0] = upper_32_bits(snap.real);
        val[1] = lower_32_bits(snap.real);
        val[2] = upper_32_bits(cycles);
        val[3] = lower_32_bits(cycles);
}

/*
 * Tell whether @func_id is one of the function-ids that are not gated with
 * the bitmapped feature firmware registers, and are to be allowed for
 * servicing the call by default.
 */
static bool kvm_smccc_default_allowed(u32 func_id)
{
        if (func_id == ARM_SMCCC_VERSION_FUNC_ID ||
            func_id == ARM_SMCCC_ARCH_FEATURES_FUNC_ID)
                return true;

        /* PSCI 0.2 and up is in the 0:0x1f range */
        if (ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD &&
            ARM_SMCCC_FUNC_NUM(func_id) <= 0x1f)
                return true;

        /*
         * KVM's PSCI 0.1 doesn't comply with SMCCC, and has
         * its own function-id base and range
         */
        return func_id >= KVM_PSCI_FN(0) && func_id <= KVM_PSCI_FN(3);
}

/*
 * Tell whether the feature bit that gates @func_id is set in the VM's
 * SMCCC feature bitmaps. Function-ids that are not listed here yield false.
 */
static bool kvm_smccc_test_fw_bmap(struct kvm_vcpu *vcpu, u32 func_id)
{
        struct kvm_smccc_features *feat = &vcpu->kvm->arch.smccc_feat;
        const unsigned long *bmap;
        unsigned int bit;

        /* Pick the bitmap and the bit that gate this function-id. */
        switch (func_id) {
        case ARM_SMCCC_TRNG_VERSION:
        case ARM_SMCCC_TRNG_FEATURES:
        case ARM_SMCCC_TRNG_GET_UUID:
        case ARM_SMCCC_TRNG_RND32:
        case ARM_SMCCC_TRNG_RND64:
                bmap = &feat->std_bmap;
                bit = KVM_REG_ARM_STD_BIT_TRNG_V1_0;
                break;
        case ARM_SMCCC_HV_PV_TIME_FEATURES:
        case ARM_SMCCC_HV_PV_TIME_ST:
                bmap = &feat->std_hyp_bmap;
                bit = KVM_REG_ARM_STD_HYP_BIT_PV_TIME;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
        case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
                bmap = &feat->vendor_hyp_bmap;
                bit = KVM_REG_ARM_VENDOR_HYP_BIT_FUNC_FEAT;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
                bmap = &feat->vendor_hyp_bmap;
                bit = KVM_REG_ARM_VENDOR_HYP_BIT_PTP;
                break;
        default:
                return false;
        }

        return test_bit(bit, bmap);
}

/*
 * First and last function-id of the ranges that
 * kvm_smccc_filter_insert_reserved() inserts into the SMCCC filter, for the
 * SMC32 and SMC64 calling conventions respectively. The *_END values use
 * ARM_SMCCC_FUNC_MASK as the function number.
 * NOTE(review): the literal 0 passed as the third argument is presumably the
 * "architecture" owner number - confirm against ARM_SMCCC_CALL_VAL().
 */
#define SMC32_ARCH_RANGE_BEGIN        ARM_SMCCC_VERSION_FUNC_ID
#define SMC32_ARCH_RANGE_END        ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                \
                                                   ARM_SMCCC_SMC_32,                \
                                                   0, ARM_SMCCC_FUNC_MASK)

#define SMC64_ARCH_RANGE_BEGIN        ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                \
                                                   ARM_SMCCC_SMC_64,                \
                                                   0, 0)
#define SMC64_ARCH_RANGE_END        ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,                \
                                                   ARM_SMCCC_SMC_64,                \
                                                   0, ARM_SMCCC_FUNC_MASK)

/*
 * Insert the SMC32 and SMC64 architecture ranges into the VM's SMCCC filter
 * with the KVM_SMCCC_FILTER_HANDLE action.
 *
 * Returns 0 on success. If either insertion fails, the filter tree is
 * destroyed and the error from mtree_insert_range() is returned.
 */
static int kvm_smccc_filter_insert_reserved(struct kvm *kvm)
{
        int ret;

        /*
         * Prevent userspace from handling any SMCCC calls in the architecture
         * range, avoiding the risk of misrepresenting Spectre mitigation status
         * to the guest.
         */
        ret = mtree_insert_range(&kvm->arch.smccc_filter,
                                 SMC32_ARCH_RANGE_BEGIN, SMC32_ARCH_RANGE_END,
                                 xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
                                 GFP_KERNEL_ACCOUNT);
        if (!ret)
                ret = mtree_insert_range(&kvm->arch.smccc_filter,
                                         SMC64_ARCH_RANGE_BEGIN,
                                         SMC64_ARCH_RANGE_END,
                                         xa_mk_value(KVM_SMCCC_FILTER_HANDLE),
                                         GFP_KERNEL_ACCOUNT);

        /* Do not leave a partially populated filter behind. */
        if (ret)
                mtree_destroy(&kvm->arch.smccc_filter);

        return ret;
}

/* The SMCCC filter counts as configured once its tree holds any entry. */
static bool kvm_smccc_filter_configured(struct kvm *kvm)
{
        if (mtree_empty(&kvm->arch.smccc_filter))
                return false;

        return true;
}

/*
 * Add one userspace-supplied range to the VM's SMCCC filter.
 *
 * Returns 0 on success, -EFAULT if @uaddr cannot be read, -EINVAL if the
 * padding is not zero, the range wraps or the action is out of range, -EBUSY
 * if the VM has already run, or the error reported while inserting into the
 * filter tree.
 */
static int kvm_smccc_set_filter(struct kvm *kvm, struct kvm_smccc_filter __user *uaddr)
{
        const void *zeroes = page_to_virt(ZERO_PAGE(0));
        struct kvm_smccc_filter req;
        u32 first, last;
        int ret;

        if (copy_from_user(&req, uaddr, sizeof(req)))
                return -EFAULT;

        /* The padding has to be all zeroes. */
        if (memcmp(req.pad, zeroes, sizeof(req.pad)) != 0)
                return -EINVAL;

        first = req.base;
        last = first + req.nr_functions - 1;

        /* Reject a range whose end lies below its start. */
        if (last < first)
                return -EINVAL;
        if (req.action >= NR_SMCCC_FILTER_ACTIONS)
                return -EINVAL;

        mutex_lock(&kvm->arch.config_lock);

        /* The filter can only be changed before the VM has run. */
        ret = -EBUSY;
        if (kvm_vm_has_ran_once(kvm))
                goto out_unlock;

        /* First range ever: put the reserved ranges in place beforehand. */
        if (!kvm_smccc_filter_configured(kvm)) {
                ret = kvm_smccc_filter_insert_reserved(kvm);
                if (WARN_ON_ONCE(ret))
                        goto out_unlock;
        }

        ret = mtree_insert_range(&kvm->arch.smccc_filter, first, last,
                                 xa_mk_value(req.action), GFP_KERNEL_ACCOUNT);
out_unlock:
        mutex_unlock(&kvm->arch.config_lock);
        return ret;
}

/*
 * Look up the filter action recorded for @func_id. Without a configured
 * filter every call is handled in the kernel.
 */
static u8 kvm_smccc_filter_get_action(struct kvm *kvm, u32 func_id)
{
        unsigned long index = func_id;

        if (!kvm_smccc_filter_configured(kvm))
                return KVM_SMCCC_FILTER_HANDLE;

        /*
         * No explicit miss handling is needed: mt_find() yields NULL when
         * nothing covers the index, and that decodes to the same value as
         * KVM_SMCCC_FILTER_HANDLE.
         */
        return xa_to_value(mt_find(&kvm->arch.smccc_filter, &index, index));
}

/*
 * Decide what to do with a guest call to @func_id.
 *
 * A non-HANDLE verdict from the SMCCC filter wins outright. Otherwise the
 * call is handled only if the pseudo-firmware register bitmaps enable it or
 * it is allowed by default; everything else is denied.
 */
static u8 kvm_smccc_get_action(struct kvm_vcpu *vcpu, u32 func_id)
{
        u8 verdict = kvm_smccc_filter_get_action(vcpu->kvm, func_id);

        if (verdict == KVM_SMCCC_FILTER_HANDLE &&
            !kvm_smccc_test_fw_bmap(vcpu, func_id) &&
            !kvm_smccc_default_allowed(func_id))
                verdict = KVM_SMCCC_FILTER_DENY;

        return verdict;
}

/*
 * Fill vcpu->run for a KVM_EXIT_HYPERCALL exit carrying @func_id. The flags
 * record whether the trap was an SMC and whether the trapping instruction
 * was not a 32-bit one.
 */
static void kvm_prepare_hypercall_exit(struct kvm_vcpu *vcpu, u32 func_id)
{
        struct kvm_run *exit_info = vcpu->run;
        u8 trap_class = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu));
        u64 exit_flags = 0;

        switch (trap_class) {
        case ESR_ELx_EC_SMC32:
        case ESR_ELx_EC_SMC64:
                exit_flags |= KVM_HYPERCALL_EXIT_SMC;
                break;
        default:
                break;
        }

        if (!kvm_vcpu_trap_il_is32bit(vcpu))
                exit_flags |= KVM_HYPERCALL_EXIT_16BIT;

        exit_info->exit_reason = KVM_EXIT_HYPERCALL;
        /* The compound literal also clears every member not named here. */
        exit_info->hypercall = (typeof(exit_info->hypercall)) {
                .nr        = func_id,
                .flags        = exit_flags,
        };
}

/*
 * Top-level dispatcher for a guest SMCCC call.
 *
 * The function ID is first run through kvm_smccc_get_action(): a denied (or
 * unrecognised) action completes the call with SMCCC_RET_NOT_SUPPORTED, a
 * forward-to-user action prepares a KVM_EXIT_HYPERCALL exit and returns 0,
 * and only the HANDLE action reaches the in-kernel implementations below.
 *
 * Returns 1 after writing the result registers with smccc_set_retval(), 0
 * when the call is forwarded to userspace, or whatever kvm_trng_call() /
 * kvm_psci_call() return for the function IDs delegated to them.
 */
int kvm_smccc_call_handler(struct kvm_vcpu *vcpu)
{
        struct kvm_smccc_features *smccc_feat = &vcpu->kvm->arch.smccc_feat;
        u32 func_id = smccc_get_function(vcpu);
        /* val[0] starts as NOT_SUPPORTED, val[1..3] start as zero. */
        u64 val[4] = {SMCCC_RET_NOT_SUPPORTED};
        u32 feature;
        u8 action;
        gpa_t gpa;

        action = kvm_smccc_get_action(vcpu, func_id);
        switch (action) {
        case KVM_SMCCC_FILTER_HANDLE:
                break;
        case KVM_SMCCC_FILTER_DENY:
                goto out;
        case KVM_SMCCC_FILTER_FWD_TO_USER:
                kvm_prepare_hypercall_exit(vcpu, func_id);
                return 0;
        default:
                /* Unknown action: complain and treat it like a denial. */
                WARN_RATELIMIT(1, "Unhandled SMCCC filter action: %d\n", action);
                goto out;
        }

        switch (func_id) {
        case ARM_SMCCC_VERSION_FUNC_ID:
                val[0] = ARM_SMCCC_VERSION_1_1;
                break;
        case ARM_SMCCC_ARCH_FEATURES_FUNC_ID:
                /* The queried feature is passed as the first argument. */
                feature = smccc_get_arg1(vcpu);
                switch (feature) {
                case ARM_SMCCC_ARCH_WORKAROUND_1:
                        switch (arm64_get_spectre_v2_state()) {
                        case SPECTRE_VULNERABLE:
                                /* Keep the NOT_SUPPORTED default. */
                                break;
                        case SPECTRE_MITIGATED:
                                val[0] = SMCCC_RET_SUCCESS;
                                break;
                        case SPECTRE_UNAFFECTED:
                                val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
                                break;
                        }
                        break;
                case ARM_SMCCC_ARCH_WORKAROUND_2:
                        switch (arm64_get_spectre_v4_state()) {
                        case SPECTRE_VULNERABLE:
                                /* Keep the NOT_SUPPORTED default. */
                                break;
                        case SPECTRE_MITIGATED:
                                /*
                                 * SSBS everywhere: Indicate no firmware
                                 * support, as the SSBS support will be
                                 * indicated to the guest and the default is
                                 * safe.
                                 *
                                 * Otherwise, expose a permanent mitigation
                                 * to the guest, and hide SSBS so that the
                                 * guest stays protected.
                                 */
                                if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP))
                                        break;
                                fallthrough;
                        case SPECTRE_UNAFFECTED:
                                val[0] = SMCCC_RET_NOT_REQUIRED;
                                break;
                        }
                        break;
                case ARM_SMCCC_ARCH_WORKAROUND_3:
                        switch (arm64_get_spectre_bhb_state()) {
                        case SPECTRE_VULNERABLE:
                                /* Keep the NOT_SUPPORTED default. */
                                break;
                        case SPECTRE_MITIGATED:
                                val[0] = SMCCC_RET_SUCCESS;
                                break;
                        case SPECTRE_UNAFFECTED:
                                val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
                                break;
                        }
                        break;
                case ARM_SMCCC_HV_PV_TIME_FEATURES:
                        /* Advertised only if enabled in the std-hyp bitmap. */
                        if (test_bit(KVM_REG_ARM_STD_HYP_BIT_PV_TIME,
                                     &smccc_feat->std_hyp_bmap))
                                val[0] = SMCCC_RET_SUCCESS;
                        break;
                }
                break;
        case ARM_SMCCC_HV_PV_TIME_FEATURES:
                val[0] = kvm_hypercall_pv_features(vcpu);
                break;
        case ARM_SMCCC_HV_PV_TIME_ST:
                gpa = kvm_init_stolen_time(vcpu);
                /* INVALID_GPA leaves the NOT_SUPPORTED default in place. */
                if (gpa != INVALID_GPA)
                        val[0] = gpa;
                break;
        case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID:
                val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0;
                val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1;
                val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2;
                val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID:
                val[0] = smccc_feat->vendor_hyp_bmap;
                /* Function numbers 2-63 are reserved for pKVM for now */
                val[2] = smccc_feat->vendor_hyp_bmap_2;
                break;
        case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID:
                kvm_ptp_get_time(vcpu, val);
                break;
        case ARM_SMCCC_TRNG_VERSION:
        case ARM_SMCCC_TRNG_FEATURES:
        case ARM_SMCCC_TRNG_GET_UUID:
        case ARM_SMCCC_TRNG_RND32:
        case ARM_SMCCC_TRNG_RND64:
                /* The TRNG handler's return value is passed straight back. */
                return kvm_trng_call(vcpu);
        default:
                /* Everything not recognised above is offered to PSCI. */
                return kvm_psci_call(vcpu);
        }

out:
        smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]);
        return 1;
}

/*
 * IDs of the pseudo-firmware registers. This table is what
 * kvm_arm_get_fw_num_regs() counts and kvm_arm_copy_fw_reg_indices() copies
 * out to userspace.
 */
static const u64 kvm_arm_fw_reg_ids[] = {
        KVM_REG_ARM_PSCI_VERSION,
        KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1,
        KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2,
        KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3,
        KVM_REG_ARM_STD_BMAP,
        KVM_REG_ARM_STD_HYP_BMAP,
        KVM_REG_ARM_VENDOR_HYP_BMAP,
        KVM_REG_ARM_VENDOR_HYP_BMAP_2,
};

/*
 * Set up the per-VM hypercall state: an empty SMCCC filter tree and the
 * default feature bitmaps for the standard, standard-hypervisor and vendor
 * hypervisor services.
 */
void kvm_arm_init_hypercalls(struct kvm *kvm)
{
        mt_init(&kvm->arch.smccc_filter);

        kvm->arch.smccc_feat.std_bmap = KVM_ARM_SMCCC_STD_FEATURES;
        kvm->arch.smccc_feat.std_hyp_bmap = KVM_ARM_SMCCC_STD_HYP_FEATURES;
        kvm->arch.smccc_feat.vendor_hyp_bmap = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
}

/*
 * Destroy the VM's SMCCC filter maple tree; counterpart of the mt_init()
 * done in kvm_arm_init_hypercalls().
 */
void kvm_arm_teardown_hypercalls(struct kvm *kvm)
{
        mtree_destroy(&kvm->arch.smccc_filter);
}

/*
 * Number of pseudo-firmware registers, i.e. the number of entries in
 * kvm_arm_fw_reg_ids[]. @vcpu is not used.
 */
int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu)
{
        return ARRAY_SIZE(kvm_arm_fw_reg_ids);
}

/*
 * Copy every pseudo-firmware register ID to the userspace array @uindices,
 * in table order. Returns 0, or -EFAULT as soon as one store fails.
 */
int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
        unsigned int idx = 0;

        while (idx < ARRAY_SIZE(kvm_arm_fw_reg_ids)) {
                if (put_user(kvm_arm_fw_reg_ids[idx], &uindices[idx]))
                        return -EFAULT;
                idx++;
        }

        return 0;
}

/* Bits [3:0] of a workaround register value carry the feature level. */
#define KVM_REG_FEATURE_LEVEL_MASK        GENMASK(3, 0)

/*
 * Express the host's mitigation state for the workaround register @regid as
 * a level number. Larger numbers stand for better protection, which lets
 * callers compare levels directly.
 *
 * Returns -EINVAL when @regid is not one of the three workaround registers
 * (or, for WORKAROUND_2, when the reported state is none of the known ones).
 */
static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid)
{
        switch (regid) {
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
                switch (arm64_get_spectre_v2_state()) {
                case SPECTRE_MITIGATED:
                        return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL;
                case SPECTRE_UNAFFECTED:
                        return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED;
                default:
                        /* Vulnerable, or any other state. */
                        break;
                }
                return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL;
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
                switch (arm64_get_spectre_v4_state()) {
                case SPECTRE_VULNERABLE:
                        return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
                case SPECTRE_MITIGATED:
                        /*
                         * Same policy as the hypercall discovery path: when
                         * SSBS is always there, claim that no firmware
                         * mitigation exists.
                         */
                        if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP))
                                return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
                        fallthrough;
                case SPECTRE_UNAFFECTED:
                        return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
                default:
                        break;
                }
                break;
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
                switch (arm64_get_spectre_bhb_state()) {
                case SPECTRE_MITIGATED:
                        return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_AVAIL;
                case SPECTRE_UNAFFECTED:
                        return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_REQUIRED;
                default:
                        /* Vulnerable, or any other state. */
                        break;
                }
                return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3_NOT_AVAIL;
        default:
                break;
        }

        return -EINVAL;
}

/*
 * Read one pseudo-firmware register and copy its value to the userspace
 * address held in @reg->addr.
 *
 * Returns 0 on success, -ENOENT for an ID this file does not implement and
 * -EFAULT if the copy to userspace fails.
 */
int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        struct kvm_smccc_features *feat = &vcpu->kvm->arch.smccc_feat;
        u64 regval;

        switch (reg->id) {
        case KVM_REG_ARM_PSCI_VERSION:
                regval = kvm_psci_version(vcpu);
                break;
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
                /* Only the level field is reported. */
                regval = get_kernel_wa_level(vcpu, reg->id);
                regval &= KVM_REG_FEATURE_LEVEL_MASK;
                break;
        case KVM_REG_ARM_STD_BMAP:
                regval = READ_ONCE(feat->std_bmap);
                break;
        case KVM_REG_ARM_STD_HYP_BMAP:
                regval = READ_ONCE(feat->std_hyp_bmap);
                break;
        case KVM_REG_ARM_VENDOR_HYP_BMAP:
                regval = READ_ONCE(feat->vendor_hyp_bmap);
                break;
        case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
                regval = READ_ONCE(feat->vendor_hyp_bmap_2);
                break;
        default:
                return -ENOENT;
        }

        return copy_to_user((void __user *)(long)reg->addr, &regval,
                            KVM_REG_SIZE(reg->id)) ? -EFAULT : 0;
}

/*
 * Update one of the feature bitmap registers with @val.
 *
 * Returns -ENOENT for a register that is not a bitmap register, -EINVAL when
 * @val has a bit outside the supported feature set, and -EBUSY when the VM
 * has already run and @val differs from the current bitmap. Writing back the
 * current value is always accepted.
 */
static int kvm_arm_set_fw_reg_bmap(struct kvm_vcpu *vcpu, u64 reg_id, u64 val)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvm_smccc_features *feat = &kvm->arch.smccc_feat;
        unsigned long *bmap, supported;
        int ret = 0;

        switch (reg_id) {
        case KVM_REG_ARM_STD_BMAP:
                supported = KVM_ARM_SMCCC_STD_FEATURES;
                bmap = &feat->std_bmap;
                break;
        case KVM_REG_ARM_STD_HYP_BMAP:
                supported = KVM_ARM_SMCCC_STD_HYP_FEATURES;
                bmap = &feat->std_hyp_bmap;
                break;
        case KVM_REG_ARM_VENDOR_HYP_BMAP:
                supported = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES;
                bmap = &feat->vendor_hyp_bmap;
                break;
        case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
                supported = KVM_ARM_SMCCC_VENDOR_HYP_FEATURES_2;
                bmap = &feat->vendor_hyp_bmap_2;
                break;
        default:
                return -ENOENT;
        }

        /* Refuse any bit outside the supported feature set. */
        if (val & ~supported)
                return -EINVAL;

        mutex_lock(&kvm->arch.config_lock);

        /* Once the VM has run, only an unchanged value may be written. */
        if (kvm_vm_has_ran_once(kvm) && val != *bmap)
                ret = -EBUSY;
        else
                WRITE_ONCE(*bmap, val);

        mutex_unlock(&kvm->arch.config_lock);
        return ret;
}

/*
 * Write one pseudo-firmware register from userspace.
 *
 * Only registers whose encoded size matches a u64 are accepted (-ENOENT
 * otherwise, and also for an unknown ID); the new value is fetched with
 * copy_from_user() (-EFAULT on failure).
 *
 *  - PSCI_VERSION: the version is stored in kvm->arch.psci_version when it
 *    agrees with the vCPU's KVM_ARM_VCPU_PSCI_0_2 feature (0.1 only without
 *    it, the listed later versions only with it); anything else is -EINVAL.
 *  - WORKAROUND_1 / WORKAROUND_3: nothing is stored. The value is accepted
 *    if it has no bits outside the level mask and does not exceed the level
 *    reported by get_kernel_wa_level().
 *  - WORKAROUND_2: nothing is stored. The value is validated and then
 *    reduced to NOT_AVAIL or NOT_REQUIRED before the same level comparison.
 *  - Bitmap registers are handed to kvm_arm_set_fw_reg_bmap().
 */
int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
{
        void __user *uaddr = (void __user *)(long)reg->addr;
        u64 val;
        int wa_level;

        if (KVM_REG_SIZE(reg->id) != sizeof(val))
                return -ENOENT;
        if (copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)))
                return -EFAULT;

        switch (reg->id) {
        case KVM_REG_ARM_PSCI_VERSION:
        {
                bool wants_02;

                wants_02 = vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2);

                switch (val) {
                case KVM_ARM_PSCI_0_1:
                        if (wants_02)
                                return -EINVAL;
                        vcpu->kvm->arch.psci_version = val;
                        return 0;
                case KVM_ARM_PSCI_0_2:
                case KVM_ARM_PSCI_1_0:
                case KVM_ARM_PSCI_1_1:
                case KVM_ARM_PSCI_1_2:
                case KVM_ARM_PSCI_1_3:
                        if (!wants_02)
                                return -EINVAL;
                        vcpu->kvm->arch.psci_version = val;
                        return 0;
                }
                /* Unlisted version: ends up at the final -EINVAL. */
                break;
        }

        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1:
        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3:
                if (val & ~KVM_REG_FEATURE_LEVEL_MASK)
                        return -EINVAL;

                /* Userspace may not ask for more than the host provides. */
                if (get_kernel_wa_level(vcpu, reg->id) < val)
                        return -EINVAL;

                return 0;

        case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2:
                if (val & ~(KVM_REG_FEATURE_LEVEL_MASK |
                            KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED))
                        return -EINVAL;

                /* The enabled bit must not be set unless the level is AVAIL. */
                if ((val & KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED) &&
                    (val & KVM_REG_FEATURE_LEVEL_MASK) != KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL)
                        return -EINVAL;

                /*
                 * Map all the possible incoming states to the only two we
                 * really want to deal with.
                 */
                switch (val & KVM_REG_FEATURE_LEVEL_MASK) {
                case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL:
                case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN:
                        wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL;
                        break;
                case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL:
                case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED:
                        wa_level = KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED;
                        break;
                default:
                        return -EINVAL;
                }

                /*
                 * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the
                 * other way around.
                 */
                if (get_kernel_wa_level(vcpu, reg->id) < wa_level)
                        return -EINVAL;

                return 0;
        case KVM_REG_ARM_STD_BMAP:
        case KVM_REG_ARM_STD_HYP_BMAP:
        case KVM_REG_ARM_VENDOR_HYP_BMAP:
        case KVM_REG_ARM_VENDOR_HYP_BMAP_2:
                return kvm_arm_set_fw_reg_bmap(vcpu, reg->id, val);
        default:
                return -ENOENT;
        }

        /* Only reached from the PSCI_VERSION case with an unlisted value. */
        return -EINVAL;
}

/*
 * Report whether the SMCCC attribute named by @attr->attr exists: 0 for
 * KVM_ARM_VM_SMCCC_FILTER, -ENXIO for anything else. @kvm is not used.
 */
int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
        if (attr->attr == KVM_ARM_VM_SMCCC_FILTER)
                return 0;

        return -ENXIO;
}

/*
 * Set an SMCCC attribute. KVM_ARM_VM_SMCCC_FILTER installs a filter from the
 * userspace structure at @attr->addr; any other attribute is -ENXIO.
 */
int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
        if (attr->attr != KVM_ARM_VM_SMCCC_FILTER)
                return -ENXIO;

        return kvm_smccc_set_filter(kvm, (void __user *)attr->addr);
}



































































































































  198 












































































  411 















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_JUMP_LABEL_H
#define _LINUX_JUMP_LABEL_H

/*
 * Jump label support
 *
 * Copyright (C) 2009-2012 Jason Baron <jbaron@redhat.com>
 * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra
 *
 * DEPRECATED API:
 *
 * The use of 'struct static_key' directly, is now DEPRECATED. In addition
 * static_key_{true,false}() is also DEPRECATED. IE DO NOT use the following:
 *
 * struct static_key false = STATIC_KEY_INIT_FALSE;
 * struct static_key true = STATIC_KEY_INIT_TRUE;
 * static_key_true()
 * static_key_false()
 *
 * The updated API replacements are:
 *
 * DEFINE_STATIC_KEY_TRUE(key);
 * DEFINE_STATIC_KEY_FALSE(key);
 * DEFINE_STATIC_KEY_ARRAY_TRUE(keys, count);
 * DEFINE_STATIC_KEY_ARRAY_FALSE(keys, count);
 * static_branch_likely()
 * static_branch_unlikely()
 *
 * Jump labels provide an interface to generate dynamic branches using
 * self-modifying code. Assuming toolchain and architecture support, if we
 * define a "key" that is initially false via "DEFINE_STATIC_KEY_FALSE(key)",
 * an "if (static_branch_unlikely(&key))" statement is an unconditional branch
 * (which defaults to false - and the true block is placed out of line).
 * Similarly, we can define an initially true key via
 * "DEFINE_STATIC_KEY_TRUE(key)", and use it in the same
 * "if (static_branch_unlikely(&key))", in which case we will generate an
 * unconditional branch to the out-of-line true branch. Keys that are
 * initially true or false can be used in both static_branch_unlikely()
 * and static_branch_likely() statements.
 *
 * At runtime we can change the branch target by setting the key
 * to true via a call to static_branch_enable(), or false using
 * static_branch_disable(). If the direction of the branch is switched by
 * these calls then we run-time modify the branch target via a
 * no-op -> jump or jump -> no-op conversion. For example, for an
 * initially false key that is used in an "if (static_branch_unlikely(&key))"
 * statement, setting the key to true requires us to patch in a jump
 * to the out-of-line true branch.
 *
 * In addition to static_branch_{enable,disable}, we can also reference count
 * the key or branch direction via static_branch_{inc,dec}. Thus,
 * static_branch_inc() can be thought of as a 'make more true' and
 * static_branch_dec() as a 'make more false'.
 *
 * Since this relies on modifying code, the branch modifying functions
 * must be considered absolute slow paths (machine wide synchronization etc.).
 * OTOH, since the affected branches are unconditional, their runtime overhead
 * will be absolutely minimal, esp. in the default (off) case where the total
 * effect is a single NOP of appropriate size. The on case will patch in a jump
 * to the out-of-line block.
 *
 * When the control is directly exposed to userspace, it is prudent to delay the
 * decrement to avoid high frequency code modifications which can (and do)
 * cause significant performance degradation. Struct static_key_deferred and
 * static_key_slow_dec_deferred() provide for this.
 *
 * Lacking toolchain and/or architecture support, static keys fall back to a
 * simple conditional branch.
 *
 * Additional babbling in: Documentation/staging/static-keys.rst
 */

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/cleanup.h>

extern bool static_key_initialized;

/*
 * Emit a WARN() naming the calling function and the key when a static key is
 * used while static_key_initialized is still false.
 */
#define STATIC_KEY_CHECK_USE(key) WARN(!static_key_initialized,                      \
                                    "%s(): static key '%pS' used before call to jump_label_init()", \
                                    __func__, (key))

/*
 * The underlying static key object. 'enabled' is the counter that the
 * enable/disable/inc/dec helpers operate on; with CONFIG_JUMP_LABEL the key
 * additionally carries one tagged word, described below.
 */
struct static_key {
        atomic_t enabled;
#ifdef CONFIG_JUMP_LABEL
/*
 * Note:
 *   To make anonymous unions work with old compilers, the static
 *   initialization of them requires brackets. This creates a dependency
 *   on the order of the struct with the initializers. If any fields
 *   are added, STATIC_KEY_INIT_TRUE and STATIC_KEY_INIT_FALSE may need
 *   to be modified.
 *
 * bit 0 => 1 if key is initially true
 *            0 if initially false
 * bit 1 => 1 if points to struct static_key_mod
 *            0 if points to struct jump_entry
 */
        union {
                unsigned long type;
                struct jump_entry *entries;
                struct static_key_mod *next;
        };
#endif        /* CONFIG_JUMP_LABEL */
};

#endif /* __ASSEMBLY__ */

#ifdef CONFIG_JUMP_LABEL
#include <asm/jump_label.h>

#ifndef __ASSEMBLY__
#ifdef CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE

/*
 * Jump table entry for CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE. Each member is
 * an offset relative to its own address (see the accessors that follow); the
 * two low bits of 'key' are used as flags and are masked off when the key
 * address is computed.
 */
struct jump_entry {
        s32 code;
        s32 target;
        long key;        // key may be far away from the core kernel under KASLR
};

/* Address of the patch site: the 'code' offset applied to the field's own address. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->code;

        return field_addr + entry->code;
}

/* Address of the jump target: the 'target' offset applied to the field's own address. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->target;

        return field_addr + entry->target;
}

/*
 * Static key this entry belongs to: the 'key' offset, with its two low flag
 * bits cleared, applied to the field's own address.
 */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long field_addr = (unsigned long)&entry->key;

        return (struct static_key *)(field_addr + (entry->key & ~3L));
}

#else

/* Non-relative layout: 'code' is returned as stored. */
static inline unsigned long jump_entry_code(const struct jump_entry *entry)
{
        return entry->code;
}

/* Non-relative layout: 'target' is returned as stored. */
static inline unsigned long jump_entry_target(const struct jump_entry *entry)
{
        return entry->target;
}

/* Non-relative layout: 'key' with its two low flag bits cleared is the key pointer. */
static inline struct static_key *jump_entry_key(const struct jump_entry *entry)
{
        unsigned long raw = (unsigned long)entry->key;

        return (struct static_key *)(raw & ~3UL);
}

#endif

/* Bit 0 of 'key' is the "branch" flag. */
static inline bool jump_entry_is_branch(const struct jump_entry *entry)
{
        return ((unsigned long)entry->key & 1UL) != 0;
}

/* Bit 1 of 'key' is the "init" flag. */
static inline bool jump_entry_is_init(const struct jump_entry *entry)
{
        return ((unsigned long)entry->key & 2UL) != 0;
}

/* Set or clear the "init" flag (bit 1 of 'key') according to @set. */
static inline void jump_entry_set_init(struct jump_entry *entry, bool set)
{
        if (!set) {
                entry->key &= ~2;
                return;
        }

        entry->key |= 2;
}

/*
 * Size of the instruction at a jump entry: the constant JUMP_LABEL_NOP_SIZE
 * when that macro is defined, otherwise whatever arch_jump_entry_size()
 * reports for @entry.
 */
static inline int jump_entry_size(struct jump_entry *entry)
{
#ifdef JUMP_LABEL_NOP_SIZE
        return JUMP_LABEL_NOP_SIZE;
#else
        return arch_jump_entry_size(entry);
#endif
}

#endif
#endif

#ifndef __ASSEMBLY__

/* The two forms a patch site can be transformed into: a NOP or a jump. */
enum jump_label_type {
        JUMP_LABEL_NOP = 0,
        JUMP_LABEL_JMP,
};

struct module;

#ifdef CONFIG_JUMP_LABEL

/*
 * Values for the low two bits of the tagged word in struct static_key:
 * bit 0 gives the initial state, bit 1 marks the "linked" form.
 */
#define JUMP_TYPE_FALSE                0UL
#define JUMP_TYPE_TRUE                1UL
#define JUMP_TYPE_LINKED        2UL
#define JUMP_TYPE_MASK                3UL

/* Deprecated API (see the file header): arch branch test with 'false' as the second argument. */
static __always_inline bool static_key_false(struct static_key *key)
{
        return arch_static_branch(key, false);
}

/*
 * Deprecated API (see the file header): arch branch test with 'true' as the
 * second argument; note that the result is negated.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return !arch_static_branch(key, true);
}

/* Bounds of the jump table. */
extern struct jump_entry __start___jump_table[];
extern struct jump_entry __stop___jump_table[];

/*
 * Interfaces that are only declared here when CONFIG_JUMP_LABEL is enabled;
 * the !CONFIG_JUMP_LABEL branch further down provides inline or macro
 * fallbacks for most of them.
 */
extern void jump_label_init(void);
extern void jump_label_init_ro(void);
extern void jump_label_lock(void);
extern void jump_label_unlock(void);
extern void arch_jump_label_transform(struct jump_entry *entry,
                                      enum jump_label_type type);
extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                            enum jump_label_type type);
extern void arch_jump_label_transform_apply(void);
extern int jump_label_text_reserved(void *start, void *end);
extern bool static_key_slow_inc(struct static_key *key);
extern bool static_key_fast_inc_not_disabled(struct static_key *key);
extern void static_key_slow_dec(struct static_key *key);
extern bool static_key_slow_inc_cpuslocked(struct static_key *key);
extern void static_key_slow_dec_cpuslocked(struct static_key *key);
extern int static_key_count(struct static_key *key);
extern void static_key_enable(struct static_key *key);
extern void static_key_disable(struct static_key *key);
extern void static_key_enable_cpuslocked(struct static_key *key);
extern void static_key_disable_cpuslocked(struct static_key *key);
extern enum jump_label_type jump_label_init_type(struct jump_entry *entry);

/*
 * We should be using ATOMIC_INIT() for initializing .enabled, but
 * the inclusion of atomic.h is problematic for inclusion of jump_label.h
 * in 'low-level' headers. Thus, we are initializing .enabled with a
 * raw value, but have added a BUILD_BUG_ON() to catch any issues in
 * jump_label_init() see: kernel/jump_label.c.
 *
 * The second, undesignated brace initializes the anonymous union of
 * struct static_key, so these initializers depend on the member order.
 */
#define STATIC_KEY_INIT_TRUE                                        \
        { .enabled = { 1 },                                        \
          { .type = JUMP_TYPE_TRUE } }
#define STATIC_KEY_INIT_FALSE                                        \
        { .enabled = { 0 },                                        \
          { .type = JUMP_TYPE_FALSE } }

#else  /* !CONFIG_JUMP_LABEL */

#include <linux/atomic.h>
#include <linux/bug.h>

/* Fallback without jump labels: the count is simply the 'enabled' counter. */
static __always_inline int static_key_count(struct static_key *key)
{
        return raw_atomic_read(&key->enabled);
}

/*
 * Fallback without jump labels: there is nothing to patch, so only mark
 * static keys as initialized (this silences STATIC_KEY_CHECK_USE()).
 */
static __always_inline void jump_label_init(void)
{
        static_key_initialized = true;
}

/* Fallback without jump labels: intentionally empty. */
static __always_inline void jump_label_init_ro(void) { }

/*
 * Fallback without jump labels: an ordinary test of the key's count, hinted
 * as unlikely to be positive.
 */
static __always_inline bool static_key_false(struct static_key *key)
{
        return unlikely_notrace(static_key_count(key) > 0) ? true : false;
}

/*
 * Fallback without jump labels: an ordinary test of the key's count, hinted
 * as likely to be positive.
 */
static __always_inline bool static_key_true(struct static_key *key)
{
        return likely_notrace(static_key_count(key) > 0) ? true : false;
}

/*
 * Fallback without jump labels: increment the key's counter by one.
 *
 * Returns false, leaving the counter untouched, when the counter is negative
 * or when the increment would make it negative; returns true once the
 * compare-and-exchange has installed the incremented value.
 */
static inline bool static_key_fast_inc_not_disabled(struct static_key *key)
{
        int v;

        STATIC_KEY_CHECK_USE(key);
        /*
         * Prevent key->enabled getting negative to follow the same semantics
         * as for CONFIG_JUMP_LABEL=y, see kernel/jump_label.c comment.
         */
        v = atomic_read(&key->enabled);
        do {
                /* v is re-checked on every retry of the exchange below. */
                if (v < 0 || (v + 1) < 0)
                        return false;
        } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1)));
        return true;
}
/* Without jump labels the "slow" increment is the same operation. */
#define static_key_slow_inc(key)        static_key_fast_inc_not_disabled(key)

/* Fallback without jump labels: decrement the key's counter by one. */
static inline void static_key_slow_dec(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);
        atomic_dec(&key->enabled);
}

/* Without jump labels the _cpuslocked variants are plain aliases. */
#define static_key_slow_inc_cpuslocked(key) static_key_slow_inc(key)
#define static_key_slow_dec_cpuslocked(key) static_key_slow_dec(key)

/* Fallback without jump labels: always reports 0 for any range. */
static inline int jump_label_text_reserved(void *start, void *end)
{
        return 0;
}

/* Fallback without jump labels: locking is a no-op. */
static inline void jump_label_lock(void) {}
static inline void jump_label_unlock(void) {}

/*
 * Fallback without jump labels: move the key from 0 to 1. A key that is
 * already non-zero is left alone, with a one-time warning if its count is
 * anything other than 1.
 */
static inline void static_key_enable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 0) {
                atomic_set(&key->enabled, 1);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
}

/*
 * Fallback without jump labels: move the key from 1 to 0. A key whose count
 * is not 1 is left alone, with a one-time warning if the count is anything
 * other than 0.
 */
static inline void static_key_disable(struct static_key *key)
{
        STATIC_KEY_CHECK_USE(key);

        if (atomic_read(&key->enabled) == 1) {
                atomic_set(&key->enabled, 0);
                return;
        }

        WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
}

/* Without jump labels the _cpuslocked variants are plain aliases. */
#define static_key_enable_cpuslocked(k)                static_key_enable((k))
#define static_key_disable_cpuslocked(k)        static_key_disable((k))

/* Without jump labels a key is only its counter, so ATOMIC_INIT() is enough. */
#define STATIC_KEY_INIT_TRUE        { .enabled = ATOMIC_INIT(1) }
#define STATIC_KEY_INIT_FALSE        { .enabled = ATOMIC_INIT(0) }

#endif        /* CONFIG_JUMP_LABEL */

/* Lock guard named jump_label_lock, pairing jump_label_lock() with jump_label_unlock(). */
DEFINE_LOCK_GUARD_0(jump_label_lock, jump_label_lock(), jump_label_unlock())

/* Aliases: the default initializer is the false one. */
#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled

/* -------------------------------------------------------------------------- */

/*
 * Two type wrappers around static_key, such that we can use compile time
 * type differentiation to emit the right code.
 *
 * All the below code is macros in order to play type games.
 */

/* Wrapper type for a key that is initially true. */
struct static_key_true {
        struct static_key key;
};

/* Wrapper type for a key that is initially false. */
struct static_key_false {
        struct static_key key;
};

/* Compound-literal initializers for the two wrapper types. */
#define STATIC_KEY_TRUE_INIT  (struct static_key_true) { .key = STATIC_KEY_INIT_TRUE,  }
#define STATIC_KEY_FALSE_INIT (struct static_key_false){ .key = STATIC_KEY_INIT_FALSE, }

/*
 * Define or declare a single key. The _RO variants additionally place the
 * key in __ro_after_init; the DECLARE_ variants emit only an extern
 * declaration.
 */
#define DEFINE_STATIC_KEY_TRUE(name)        \
        struct static_key_true name = STATIC_KEY_TRUE_INIT

#define DEFINE_STATIC_KEY_TRUE_RO(name)        \
        struct static_key_true name __ro_after_init = STATIC_KEY_TRUE_INIT

#define DECLARE_STATIC_KEY_TRUE(name)        \
        extern struct static_key_true name

#define DEFINE_STATIC_KEY_FALSE(name)        \
        struct static_key_false name = STATIC_KEY_FALSE_INIT

#define DEFINE_STATIC_KEY_FALSE_RO(name)        \
        struct static_key_false name __ro_after_init = STATIC_KEY_FALSE_INIT

#define DECLARE_STATIC_KEY_FALSE(name)        \
        extern struct static_key_false name

/*
 * Define @name as an array of @count keys, every element initialised to
 * true via a [0 ... count - 1] range designator.
 */
#define DEFINE_STATIC_KEY_ARRAY_TRUE(name, count)                \
        struct static_key_true name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_TRUE_INIT,        \
        }

/* Same as above, with every element initialised to false. */
#define DEFINE_STATIC_KEY_ARRAY_FALSE(name, count)                \
        struct static_key_false name[count] = {                        \
                [0 ... (count) - 1] = STATIC_KEY_FALSE_INIT,        \
        }

/*
 * The *_MAYBE() macros paste IS_ENABLED(cfg) onto a helper prefix, so the
 * _1 helper (TRUE flavour) or the _0 helper (FALSE flavour) is selected.
 * This relies on IS_ENABLED(cfg) expanding to a literal 1 or 0.
 */
#define _DEFINE_STATIC_KEY_1(name)        DEFINE_STATIC_KEY_TRUE(name)
#define _DEFINE_STATIC_KEY_0(name)        DEFINE_STATIC_KEY_FALSE(name)
#define DEFINE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_, IS_ENABLED(cfg))(name)

/* As DEFINE_STATIC_KEY_MAYBE(), using the __ro_after_init definitions. */
#define _DEFINE_STATIC_KEY_RO_1(name)        DEFINE_STATIC_KEY_TRUE_RO(name)
#define _DEFINE_STATIC_KEY_RO_0(name)        DEFINE_STATIC_KEY_FALSE_RO(name)
#define DEFINE_STATIC_KEY_MAYBE_RO(cfg, name)                        \
        __PASTE(_DEFINE_STATIC_KEY_RO_, IS_ENABLED(cfg))(name)

/* Extern declaration with the same cfg-dependent type selection. */
#define _DECLARE_STATIC_KEY_1(name)        DECLARE_STATIC_KEY_TRUE(name)
#define _DECLARE_STATIC_KEY_0(name)        DECLARE_STATIC_KEY_FALSE(name)
#define DECLARE_STATIC_KEY_MAYBE(cfg, name)                        \
        __PASTE(_DECLARE_STATIC_KEY_, IS_ENABLED(cfg))(name)

/*
 * Referenced only from the "wrong type" branches of the type-checking macros
 * in this header.
 * NOTE(review): presumably left undefined on purpose so that misuse fails at
 * link time -- confirm.
 */
extern bool ____wrong_branch_error(void);

/*
 * static_key_enabled(x) - true when the key's count is greater than zero.
 * @x: pointer to a struct static_key, struct static_key_true or
 *     struct static_key_false.
 *
 * Any other pointee type selects the ____wrong_branch_error() call.  @x is
 * fully parenthesized in the expansion so that expression arguments (e.g.
 * "p + 1" or "c ? a : b") are dereferenced and cast as a whole.
 */
#define static_key_enabled(x)                                                        \
({                                                                                \
        if (!__builtin_types_compatible_p(typeof(*(x)), struct static_key) &&        \
            !__builtin_types_compatible_p(typeof(*(x)), struct static_key_true) &&\
            !__builtin_types_compatible_p(typeof(*(x)), struct static_key_false))\
                ____wrong_branch_error();                                        \
        static_key_count((struct static_key *)(x)) > 0;                                \
})

#ifdef CONFIG_JUMP_LABEL

/*
 * Combine the right initial value (type) with the right branch order
 * to generate the desired result.
 *
 *
 * type\branch|        likely (1)              |        unlikely (0)
 * -----------+-----------------------+------------------
 *            |                       |
 *  true (1)  |           ...                      |           ...
 *            |    NOP                      |           JMP L
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *            |                       |
 *  false (0) |           ...                      |           ...
 *            |    JMP L              |           NOP
 *            |    <br-stmts>              |        1: ...
 *            |        L: ...                      |
 *            |                              |
 *            |                              |        L: <br-stmts>
 *            |                              |           jmp 1b
 *            |                       |
 * -----------+-----------------------+------------------
 *
 * The initial value is encoded in the LSB of static_key::entries,
 * type: 0 = false, 1 = true.
 *
 * The branch type is encoded in the LSB of jump_entry::key,
 * branch: 0 = unlikely, 1 = likely.
 *
 * This gives the following logic table:
 *
 *        enabled        type        branch          instruction
 * -----------------------------+-----------
 *        0        0        0        | NOP
 *        0        0        1        | JMP
 *        0        1        0        | NOP
 *        0        1        1        | JMP
 *
 *        1        0        0        | JMP
 *        1        0        1        | NOP
 *        1        1        0        | JMP
 *        1        1        1        | NOP
 *
 * Which gives the following functions:
 *
 *   dynamic: instruction = enabled ^ branch
 *   static:  instruction = type ^ branch
 *
 * See jump_label_type() / jump_label_init_type().
 */

/*
 * static_branch_likely(x) - test a key, hinting that the result is true.
 * @x: pointer to a struct static_key_true or struct static_key_false.
 *
 * A static_key_true uses !arch_static_branch(key, true); a static_key_false
 * uses !arch_static_branch_jump(key, true).  Any other pointee type selects
 * the ____wrong_branch_error() call.  @x is parenthesized inside typeof() so
 * that expression arguments are dereferenced as a whole.
 */
#define static_branch_likely(x)                                                        \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_true))        \
                branch = !arch_static_branch(&(x)->key, true);                        \
        else if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_false)) \
                branch = !arch_static_branch_jump(&(x)->key, true);                \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        likely_notrace(branch);                                                                \
})

/*
 * static_branch_unlikely(x) - test a key, hinting that the result is false.
 * @x: pointer to a struct static_key_true or struct static_key_false.
 *
 * A static_key_true uses arch_static_branch_jump(key, false); a
 * static_key_false uses arch_static_branch(key, false).  Any other pointee
 * type selects the ____wrong_branch_error() call.  @x is parenthesized inside
 * typeof() so that expression arguments are dereferenced as a whole.
 */
#define static_branch_unlikely(x)                                                \
({                                                                                \
        bool branch;                                                                \
        if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_true))        \
                branch = arch_static_branch_jump(&(x)->key, false);                \
        else if (__builtin_types_compatible_p(typeof(*(x)), struct static_key_false)) \
                branch = arch_static_branch(&(x)->key, false);                        \
        else                                                                        \
                branch = ____wrong_branch_error();                                \
        unlikely_notrace(branch);                                                        \
})

#else /* !CONFIG_JUMP_LABEL */

/*
 * Without CONFIG_JUMP_LABEL both forms are an ordinary static_key_enabled()
 * test of the embedded key, wrapped in the likely/unlikely hint.
 */
#define static_branch_likely(x)                likely_notrace(static_key_enabled(&(x)->key))
#define static_branch_unlikely(x)        unlikely_notrace(static_key_enabled(&(x)->key))

#endif /* CONFIG_JUMP_LABEL */

/*
 * Use static_branch_likely() when @config is enabled, and
 * static_branch_unlikely() otherwise.
 */
#define static_branch_maybe(config, x)                                        \
        (IS_ENABLED(config) ? static_branch_likely(x)                        \
                            : static_branch_unlikely(x))

/*
 * Advanced usage; refcount, branch is enabled when: count != 0
 */

/* Wrapper-type front ends: forward the embedded key to the static_key_slow_*() helpers. */
#define static_branch_inc(x)                static_key_slow_inc(&(x)->key)
#define static_branch_dec(x)                static_key_slow_dec(&(x)->key)
#define static_branch_inc_cpuslocked(x)        static_key_slow_inc_cpuslocked(&(x)->key)
#define static_branch_dec_cpuslocked(x)        static_key_slow_dec_cpuslocked(&(x)->key)

/*
 * Normal usage; boolean enable/disable.
 */

/* Wrapper-type front ends: forward the embedded key to static_key_enable()/_disable(). */
#define static_branch_enable(x)                        static_key_enable(&(x)->key)
#define static_branch_disable(x)                static_key_disable(&(x)->key)
#define static_branch_enable_cpuslocked(x)        static_key_enable_cpuslocked(&(x)->key)
#define static_branch_disable_cpuslocked(x)        static_key_disable_cpuslocked(&(x)->key)

#endif /* __ASSEMBLY__ */

#endif        /* _LINUX_JUMP_LABEL_H */





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MM_TYPES_H
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>

#include <linux/auxvec.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include <linux/maple_tree.h>
#include <linux/rwsem.h>
#include <linux/completion.h>
#include <linux/cpumask.h>
#include <linux/uprobes.h>
#include <linux/rcupdate.h>
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
#include <linux/percpu_counter.h>
#include <linux/types.h>

#include <asm/mmu.h>

/* AT_VECTOR_SIZE_ARCH defaults to 0 unless it has already been defined. */
#ifndef AT_VECTOR_SIZE_ARCH
#define AT_VECTOR_SIZE_ARCH 0
#endif
/*
 * Twice (arch entries + base entries + 1).
 * NOTE(review): presumably two words per entry plus a terminating entry --
 * confirm against the users of AT_VECTOR_SIZE.
 */
#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))

#define INIT_PASID        0

struct address_space;
struct mem_cgroup;

/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * If you allocate the page using alloc_pages(), you can use some of the
 * space in struct page for your own purposes.  The five words in the main
 * union are available, except for bit 0 of the first word which must be
 * kept clear.  Many users use this word to store a pointer to an object
 * which is guaranteed to be aligned.  If you use the same storage as
 * page->mapping, you must restore it to NULL before freeing the page.
 *
 * The mapcount field must not be used for own purposes.
 *
 * If you want to use the refcount field, it must be used in such a way
 * that other CPUs temporarily incrementing and then decrementing the
 * refcount does not cause problems.  On receiving the page from
 * alloc_pages(), the refcount will be positive.
 *
 * If you allocate pages of order > 0, you can use some of the fields
 * in each subpage, but you may need to restore some of their values
 * afterwards.
 *
 * SLUB uses cmpxchg_double() to atomically update its freelist and counters.
 * That requires that freelist & counters in struct slab be adjacent and
 * double-word aligned. Because struct slab currently just reinterprets the
 * bits of struct page, we align all struct pages to double-word boundaries,
 * and ensure that 'freelist' is aligned within struct slab.
 */
/*
 * Alignment attribute applied to struct page: two words with
 * CONFIG_HAVE_ALIGNED_STRUCT_PAGE, one word otherwise.
 */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
#define _struct_page_alignment        __aligned(2 * sizeof(unsigned long))
#else
#define _struct_page_alignment        __aligned(sizeof(unsigned long))
#endif

struct page {
        unsigned long flags;                /* Atomic flags, some possibly
                                         * updated asynchronously */
        /*
         * Five words (20/40 bytes) are available in this union.
         * WARNING: bit 0 of the first word is used for PageTail(). That
         * means the other users of this union MUST NOT use the bit to
         * avoid collision and false-positive PageTail().
         */
        union {
                struct {        /* Page cache and anonymous pages */
                        /**
                         * @lru: Pageout list, eg. active_list protected by
                         * lruvec->lru_lock.  Sometimes used as a generic list
                         * by the page owner.
                         */
                        union {
                                struct list_head lru;

                                /* Or, for the Unevictable "LRU list" slot */
                                struct {
                                        /* Always even, to negate PageTail */
                                        void *__filler;
                                        /* Count page's or folio's mlocks */
                                        unsigned int mlock_count;
                                };

                                /* Or, free page */
                                struct list_head buddy_list;
                                struct list_head pcp_list;
                                struct {
                                        struct llist_node pcp_llist;
                                        unsigned int order;
                                };
                        };
                        /* See page-flags.h for PAGE_MAPPING_FLAGS */
                        struct address_space *mapping;
                        union {
                                pgoff_t index;                /* Our offset within mapping. */
                                unsigned long share;        /* share count for fsdax */
                        };
                        /**
                         * @private: Mapping-private opaque data.
                         * Usually used for buffer_heads if PagePrivate.
                         * Used for swp_entry_t if swapcache flag set.
                         * Indicates order in the buddy system if PageBuddy.
                         */
                        unsigned long private;
                };
                struct {        /* page_pool used by netstack */
                        /**
                         * @pp_magic: magic value to avoid recycling non
                         * page_pool allocated pages.
                         */
                        unsigned long pp_magic;
                        struct page_pool *pp;
                        /*
                         * Padding word; third word of this layout, i.e. the
                         * slot 'mapping' occupies in the page cache layout
                         * above (assuming a two-word struct list_head).
                         */
                        unsigned long _pp_mapping_pad;
                        unsigned long dma_addr;
                        atomic_long_t pp_ref_count;
                };
                struct {        /* Tail pages of compound page */
                        unsigned long compound_head;        /* Bit zero is set */
                };
                struct {        /* ZONE_DEVICE pages */
                        /*
                         * The first word is used for compound_head or folio
                         * pgmap
                         */
                        void *_unused_pgmap_compound_head;
                        void *zone_device_data;
                        /*
                         * ZONE_DEVICE private pages are counted as being
                         * mapped so the next 3 words hold the mapping, index,
                         * and private fields from the source anonymous or
                         * page cache page while the page is migrated to device
                         * private memory.
                         * ZONE_DEVICE MEMORY_DEVICE_FS_DAX pages also
                         * use the mapping, index, and private fields when
                         * pmem backed DAX files are mapped.
                         */
                };

                /** @rcu_head: You can use this to free a page by RCU. */
                struct rcu_head rcu_head;
        };

        union {                /* This union is 4 bytes in size. */
                /*
                 * For head pages of typed folios, the value stored here
                 * allows for determining what this page is used for. The
                 * tail pages of typed folios will not store a type
                 * (page_type == _mapcount == -1).
                 *
                 * See page-flags.h for a list of page types which are currently
                 * stored here.
                 *
                 * Owners of typed folios may reuse the lower 16 bit of the
                 * head page page_type field after setting the page type,
                 * but must reset these 16 bit to -1 before clearing the
                 * page type.
                 */
                unsigned int page_type;

                /*
                 * For pages that are part of non-typed folios for which mappings
                 * are tracked via the RMAP, encodes the number of times this page
                 * is directly referenced by a page table.
                 *
                 * Note that the mapcount is always initialized to -1, so that
                 * transitions both from it and to it can be tracked, using
                 * atomic_inc_and_test() and atomic_add_negative(-1).
                 */
                atomic_t _mapcount;
        };

        /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */
        atomic_t _refcount;

        /*
         * One optional word after the refcount: memcg_data with CONFIG_MEMCG,
         * otherwise an unused placeholder when only CONFIG_SLAB_OBJ_EXT is set.
         */
#ifdef CONFIG_MEMCG
        unsigned long memcg_data;
#elif defined(CONFIG_SLAB_OBJ_EXT)
        unsigned long _unused_slab_obj_exts;
#endif

        /*
         * On machines where all RAM is mapped into kernel address space,
         * we can simply calculate the virtual address. On machines with
         * highmem some memory is mapped into kernel virtual memory
         * dynamically, so we need a place to store that address.
         * Note that this field could be 16 bits on x86 ... ;)
         *
         * Architectures with slow multiplication can define
         * WANT_PAGE_VIRTUAL in asm/page.h
         */
#if defined(WANT_PAGE_VIRTUAL)
        void *virtual;                        /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */

        /* Only present when LAST_CPUPID_NOT_IN_PAGE_FLAGS is defined. */
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        int _last_cpupid;
#endif

#ifdef CONFIG_KMSAN
        /*
         * KMSAN metadata for this page:
         *  - shadow page: every bit indicates whether the corresponding
         *    bit of the original page is initialized (0) or not (1);
         *  - origin page: every 4 bytes contain an id of the stack trace
         *    where the uninitialized value was created.
         */
        struct page *kmsan_shadow;
        struct page *kmsan_origin;
#endif
} _struct_page_alignment;

/*
 * struct encoded_page - a nonexistent type marking this pointer
 *
 * An 'encoded_page' pointer is a pointer to a regular 'struct page', but
 * with the low bits of the pointer indicating extra context-dependent
 * information. Only used in mmu_gather handling, and this acts as a type
 * system check on that use.
 *
 * We only really have two guaranteed bits in general, although you could
 * play with 'struct page' alignment (see CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
 * for more.
 *
 * Use the supplied helper functions to encode/decode the pointer and bits.
 */
struct encoded_page;

/*
 * Mask of the low pointer bits that carry flags; covers both
 * ENCODED_PAGE_BIT_* values below (1ul | 2ul).
 */
#define ENCODED_PAGE_BITS                        3ul

/* Perform rmap removal after we have flushed the TLB. */
#define ENCODED_PAGE_BIT_DELAY_RMAP                1ul

/*
 * The next item in an encoded_page array is the "nr_pages" argument, specifying
 * the number of consecutive pages starting from this page, that all belong to
 * the same folio. For example, "nr_pages" corresponds to the number of folio
 * references that must be dropped. If this bit is not set, "nr_pages" is
 * implicitly 1.
 */
#define ENCODED_PAGE_BIT_NR_PAGES_NEXT                2ul

/*
 * Build an encoded pointer by OR-ing @flags into the address of @page.
 * A @flags value above ENCODED_PAGE_BITS is rejected by BUILD_BUG_ON().
 */
static __always_inline struct encoded_page *encode_page(struct page *page, unsigned long flags)
{
        unsigned long encoded;

        BUILD_BUG_ON(flags > ENCODED_PAGE_BITS);
        encoded = (unsigned long)page | flags;
        return (struct encoded_page *)encoded;
}

/* Extract the flag bits (those covered by ENCODED_PAGE_BITS) from @page. */
static inline unsigned long encoded_page_flags(struct encoded_page *page)
{
        unsigned long raw = (unsigned long)page;

        return raw & ENCODED_PAGE_BITS;
}

/* Recover the struct page pointer by clearing the ENCODED_PAGE_BITS flag bits. */
static inline struct page *encoded_page_ptr(struct encoded_page *page)
{
        unsigned long raw = (unsigned long)page;

        return (struct page *)(raw & ~ENCODED_PAGE_BITS);
}

/*
 * Store @nr in an encoded_page slot, shifted left by two bits.
 * VM_WARN_ON_ONCE() checks that the shift does not drop any high bits of @nr.
 */
static __always_inline struct encoded_page *encode_nr_pages(unsigned long nr)
{
        unsigned long shifted = nr << 2;

        VM_WARN_ON_ONCE(shifted >> 2 != nr);
        return (struct encoded_page *)shifted;
}

/* Inverse of encode_nr_pages(): shift the stored value right by two bits. */
static __always_inline unsigned long encoded_nr_pages(struct encoded_page *page)
{
        unsigned long raw = (unsigned long)page;

        return raw >> 2;
}

/*
 * A swap entry has to fit into a "unsigned long", as the entry is hidden
 * in the "index" field of the swapper address space.
 */
/* Single-member struct wrapper, so a swap entry is a distinct type. */
typedef struct {
        unsigned long val;        /* the entry, held in one unsigned long */
} swp_entry_t;

/*
 * Defined under the same two config options that give struct page an extra
 * word (memcg_data / _unused_slab_obj_exts) after _refcount.
 */
#if defined(CONFIG_MEMCG) || defined(CONFIG_SLAB_OBJ_EXT)
/* We have some extra room after the refcount in tail pages. */
#define NR_PAGES_IN_LARGE_FOLIO
#endif

/*
 * On 32bit, we can cut the required metadata in half, because:
 * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have,
 *     so we can limit MM IDs to 15 bit (32767).
 * (b) We don't expect folios where even a single complete PTE mapping by
 *     one MM would exceed 15 bits (order-15).
 */
#ifdef CONFIG_64BIT
/* 64-bit: int-sized mapcount and unsigned int MM id. */
typedef int mm_id_mapcount_t;
#define MM_ID_MAPCOUNT_MAX                INT_MAX
typedef unsigned int mm_id_t;
#else /* !CONFIG_64BIT */
/* 32-bit: short-sized mapcount and unsigned short MM id. */
typedef short mm_id_mapcount_t;
#define MM_ID_MAPCOUNT_MAX                SHRT_MAX
typedef unsigned short mm_id_t;
#endif /* CONFIG_64BIT */

/* We implicitly use the dummy ID for init-mm etc. where we never rmap pages. */
#define MM_ID_DUMMY                        0
/* Smallest ID other than the dummy one. */
#define MM_ID_MIN                        (MM_ID_DUMMY + 1)

/*
 * We leave the highest bit of each MM id unused, so we can store a flag
 * in the highest bit of each folio->_mm_id[].
 *
 * MM_ID_BITS is therefore one less than the width of mm_id_t, and
 * MM_ID_MASK / MM_ID_MAX have exactly those low bits set.
 */
#define MM_ID_BITS                        ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1)
#define MM_ID_MASK                        ((1U << MM_ID_BITS) - 1)
#define MM_ID_MAX                        MM_ID_MASK

/*
 * In order to use bit_spin_lock(), which requires an unsigned long, we
 * operate on folio->_mm_ids when working on flags.
 *
 * Bit MM_ID_BITS is the top bit of the lowest mm_id_t-sized slice of that
 * unsigned long; bit 2 * MM_ID_BITS + 1 is the top bit of the next slice.
 */
#define FOLIO_MM_IDS_LOCK_BITNUM        MM_ID_BITS
#define FOLIO_MM_IDS_LOCK_BIT                BIT(FOLIO_MM_IDS_LOCK_BITNUM)
#define FOLIO_MM_IDS_SHARED_BITNUM        (2 * MM_ID_BITS + 1)
#define FOLIO_MM_IDS_SHARED_BIT                BIT(FOLIO_MM_IDS_SHARED_BITNUM)

/**
 * struct folio - Represents a contiguous set of bytes.
 * @flags: Identical to the page flags.
 * @lru: Least Recently Used list; tracks how recently this folio was used.
 * @mlock_count: Number of times this folio has been pinned by mlock().
 * @mapping: The file this page belongs to, or refers to the anon_vma for
 *    anonymous memory.
 * @index: Offset within the file, in units of pages.  For anonymous memory,
 *    this is the index from the beginning of the mmap.
 * @share: number of DAX mappings that reference this folio. See
 *    dax_associate_entry.
 * @private: Filesystem per-folio data (see folio_attach_private()).
 * @swap: Used for swp_entry_t if folio_test_swapcache().
 * @_mapcount: Do not access this member directly.  Use folio_mapcount() to
 *    find out how many times this folio is mapped by userspace.
 * @_refcount: Do not access this member directly.  Use folio_ref_count()
 *    to find how many references there are to this folio.
 * @memcg_data: Memory Control Group data.
 * @pgmap: Metadata for ZONE_DEVICE mappings
 * @virtual: Virtual address in the kernel direct map.
 * @_last_cpupid: IDs of last CPU and last process that accessed the folio.
 * @_entire_mapcount: Do not use directly, call folio_entire_mapcount().
 * @_large_mapcount: Do not use directly, call folio_mapcount().
 * @_nr_pages_mapped: Do not use outside of rmap and debug code.
 * @_pincount: Do not use directly, call folio_maybe_dma_pinned().
 * @_nr_pages: Do not use directly, call folio_nr_pages().
 * @_mm_id: Do not use outside of rmap code.
 * @_mm_ids: Do not use outside of rmap code.
 * @_mm_id_mapcount: Do not use outside of rmap code.
 * @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
 * @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
 * @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
 * @_hugetlb_hwpoison: Do not use directly, call raw_hwp_list_head().
 * @_deferred_list: Folios to be split under memory pressure.
 * @_unused_slab_obj_exts: Placeholder to match obj_exts in struct slab.
 *
 * A folio is a physically, virtually and logically contiguous set
 * of bytes.  It is a power-of-two in size, and it is aligned to that
 * same power-of-two.  It is at least as large as %PAGE_SIZE.  If it is
 * in the page cache, it is at a file offset which is a multiple of that
 * power-of-two.  It may be mapped into userspace at an address which is
 * at an arbitrary page offset, but its kernel virtual address is aligned
 * to its size.
 */
struct folio {
        /*
         * Each of the four anonymous unions below overlays one struct page
         * (page, __page_1, __page_2 and __page_3 respectively).  The
         * FOLIO_MATCH() static asserts that follow this struct check the
         * offsets of the members that have to line up with struct page.
         */
        /* private: don't document the anon union */
        union {
                struct {
        /* public: */
                        unsigned long flags;
                        union {
                                struct list_head lru;
        /* private: avoid cluttering the output */
                                struct {
                                        void *__filler;
        /* public: */
                                        unsigned int mlock_count;
        /* private: */
                                };
        /* public: */
                                struct dev_pagemap *pgmap;
                        };
                        struct address_space *mapping;
                        union {
                                pgoff_t index;
                                unsigned long share;
                        };
                        union {
                                void *private;
                                swp_entry_t swap;
                        };
                        atomic_t _mapcount;
                        atomic_t _refcount;
#ifdef CONFIG_MEMCG
                        unsigned long memcg_data;
#elif defined(CONFIG_SLAB_OBJ_EXT)
                        unsigned long _unused_slab_obj_exts;
#endif
#if defined(WANT_PAGE_VIRTUAL)
                        void *virtual;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
                        int _last_cpupid;
#endif
        /* private: the union with struct page is transitional */
                };
                struct page page;
        };
        /* Overlay of the second struct page (__page_1). */
        union {
                struct {
                        unsigned long _flags_1;
                        unsigned long _head_1;
                        union {
                                struct {
        /* public: */
                                        atomic_t _large_mapcount;
                                        atomic_t _nr_pages_mapped;
#ifdef CONFIG_64BIT
                                        /* On !CONFIG_64BIT these two live in the third page overlay. */
                                        atomic_t _entire_mapcount;
                                        atomic_t _pincount;
#endif /* CONFIG_64BIT */
                                        mm_id_mapcount_t _mm_id_mapcount[2];
                                        union {
                                                mm_id_t _mm_id[2];
                                                unsigned long _mm_ids;
                                        };
        /* private: the union with struct page is transitional */
                                };
                                unsigned long _usable_1[4];
                        };
                        atomic_t _mapcount_1;
                        atomic_t _refcount_1;
        /* public: */
#ifdef NR_PAGES_IN_LARGE_FOLIO
                        unsigned int _nr_pages;
#endif /* NR_PAGES_IN_LARGE_FOLIO */
        /* private: the union with struct page is transitional */
                };
                struct page __page_1;
        };
        /* Overlay of the third struct page (__page_2). */
        union {
                struct {
                        unsigned long _flags_2;
                        unsigned long _head_2;
        /* public: */
                        struct list_head _deferred_list;
#ifndef CONFIG_64BIT
                        /* On CONFIG_64BIT these two live in the second page overlay. */
                        atomic_t _entire_mapcount;
                        atomic_t _pincount;
#endif /* !CONFIG_64BIT */
        /* private: the union with struct page is transitional */
                };
                struct page __page_2;
        };
        /* Overlay of the fourth struct page (__page_3); holds the hugetlb members. */
        union {
                struct {
                        unsigned long _flags_3;
                        unsigned long _head_3;
        /* public: */
                        void *_hugetlb_subpool;
                        void *_hugetlb_cgroup;
                        void *_hugetlb_cgroup_rsvd;
                        void *_hugetlb_hwpoison;
        /* private: the union with struct page is transitional */
                };
                struct page __page_3;
        };
};

/*
 * Compile-time layout checks for struct folio.
 *
 * FOLIO_MATCH(nr, pg, fl) asserts that folio member @fl sits at the offset of
 * struct page member @pg within the @nr'th struct page overlaid by the folio
 * (0 being the first page).  The macro is private to this list of checks and
 * is undefined again at the end.
 */
#define FOLIO_MATCH(nr, pg, fl)                                                \
        static_assert(offsetof(struct folio, fl) ==                        \
                        offsetof(struct page, pg) + (nr) * sizeof(struct page))

/* Members aliasing the first struct page. */
FOLIO_MATCH(0, flags, flags);
FOLIO_MATCH(0, lru, lru);
FOLIO_MATCH(0, mapping, mapping);
FOLIO_MATCH(0, compound_head, lru);
FOLIO_MATCH(0, index, index);
FOLIO_MATCH(0, private, private);
FOLIO_MATCH(0, _mapcount, _mapcount);
FOLIO_MATCH(0, _refcount, _refcount);
#ifdef CONFIG_MEMCG
FOLIO_MATCH(0, memcg_data, memcg_data);
#endif
#if defined(WANT_PAGE_VIRTUAL)
FOLIO_MATCH(0, virtual, virtual);
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
FOLIO_MATCH(0, _last_cpupid, _last_cpupid);
#endif

/* Members aliasing the second struct page. */
FOLIO_MATCH(1, flags, _flags_1);
FOLIO_MATCH(1, compound_head, _head_1);
FOLIO_MATCH(1, _mapcount, _mapcount_1);
FOLIO_MATCH(1, _refcount, _refcount_1);

/* Members aliasing the third struct page. */
FOLIO_MATCH(2, flags, _flags_2);
FOLIO_MATCH(2, compound_head, _head_2);

/* Members aliasing the fourth struct page. */
FOLIO_MATCH(3, flags, _flags_3);
FOLIO_MATCH(3, compound_head, _head_3);
#undef FOLIO_MATCH

/**
 * struct ptdesc -    Memory descriptor for page tables.
 * @__page_flags:     Same as page flags. Powerpc only.
 * @pt_rcu_head:      For freeing page table pages.
 * @pt_list:          List of used page tables. Used for s390 gmap shadow pages
 *                    (which are not linked into the user page tables) and x86
 *                    pgds.
 * @_pt_pad_1:        Padding that aliases with page's compound head.
 * @pmd_huge_pte:     Protected by ptdesc->ptl, used for THPs.
 * @__page_mapping:   Aliases with page->mapping. Unused for page tables.
 * @pt_index:         Used for s390 gmap.
 * @pt_mm:            Used for x86 pgds.
 * @pt_frag_refcount: For fragmented page table tracking. Powerpc only.
 * @pt_share_count:   Used for HugeTLB PMD page table share count.
 * @_pt_pad_2:        Padding to ensure proper alignment.
 * @ptl:              Lock for the page table.
 * @__page_type:      Same as page->page_type. Unused for page tables.
 * @__page_refcount:  Same as page refcount.
 * @pt_memcg_data:    Memcg data. Tracked for page tables here.
 *
 * This struct overlays struct page for now. Do not modify without a good
 * understanding of the issues.
 */
struct ptdesc {
        unsigned long __page_flags;

        /*
         * Starts at the offset of page->compound_head and page->rcu_head
         * (enforced by the TABLE_MATCH() asserts below this struct).
         */
        union {
                struct rcu_head pt_rcu_head;
                struct list_head pt_list;
                struct {
                        unsigned long _pt_pad_1;
                        pgtable_t pmd_huge_pte;
                };
        };
        unsigned long __page_mapping;

        /* Starts at the offset of page->index (see TABLE_MATCH() below). */
        union {
                pgoff_t pt_index;
                struct mm_struct *pt_mm;
                atomic_t pt_frag_refcount;
#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
                atomic_t pt_share_count;
#endif
        };

        /*
         * The lock is a pointer when ALLOC_SPLIT_PTLOCKS is non-zero and is
         * embedded directly otherwise; _pt_pad_2 shares the same storage.
         */
        union {
                unsigned long _pt_pad_2;
#if ALLOC_SPLIT_PTLOCKS
                spinlock_t *ptl;
#else
                spinlock_t ptl;
#endif
        };
        unsigned int __page_type;
        atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
        unsigned long pt_memcg_data;
#endif
};

/*
 * Compile-time layout checks for struct ptdesc: each listed ptdesc member
 * must sit at the offset of the struct page member it aliases, and the
 * descriptor as a whole must fit inside a struct page.
 */
#define TABLE_MATCH(page_member, ptdesc_member)                                \
        static_assert(offsetof(struct page, page_member) ==                \
                        offsetof(struct ptdesc, ptdesc_member))

/* Leading flags word. */
TABLE_MATCH(flags, __page_flags);

/* First union: list linkage / RCU head / padding. */
TABLE_MATCH(compound_head, pt_list);
TABLE_MATCH(compound_head, _pt_pad_1);
TABLE_MATCH(rcu_head, pt_rcu_head);

/* Mapping and index slots. */
TABLE_MATCH(mapping, __page_mapping);
TABLE_MATCH(index, pt_index);

/* Type, refcount and (optional) memcg data. */
TABLE_MATCH(page_type, __page_type);
TABLE_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
TABLE_MATCH(memcg_data, pt_memcg_data);
#endif
#undef TABLE_MATCH

static_assert(sizeof(struct ptdesc) <= sizeof(struct page));

/*
 * ptdesc_page - view a page table descriptor as its struct page.
 *
 * The const qualifier of the argument is carried over to the result.  There
 * is no default association, so any other argument type fails to compile.
 */
#define ptdesc_page(__pt)                (_Generic((__pt),                \
        struct ptdesc *:                (struct page *)(__pt),                \
        const struct ptdesc *:                (const struct page *)(__pt)))

/*
 * ptdesc_folio - view a page table descriptor as a struct folio.
 *
 * The const qualifier of the argument is carried over to the result.  There
 * is no default association, so any other argument type fails to compile.
 */
#define ptdesc_folio(__pt)                (_Generic((__pt),                \
        struct ptdesc *:                (struct folio *)(__pt),                \
        const struct ptdesc *:                (const struct folio *)(__pt)))

/*
 * page_ptdesc - view a struct page as a page table descriptor.
 *
 * The const qualifier of the argument is carried over to the result.  There
 * is no default association, so any other argument type fails to compile.
 */
#define page_ptdesc(__pg)                (_Generic((__pg),                \
        struct page *:                        (struct ptdesc *)(__pg),        \
        const struct page *:                (const struct ptdesc *)(__pg)))

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
/* Set the PMD page table share count of @ptdesc to zero. */
static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
{
        atomic_t *share_count = &ptdesc->pt_share_count;

        atomic_set(share_count, 0);
}

/* Atomically add one to the PMD page table share count of @ptdesc. */
static inline void ptdesc_pmd_pts_inc(struct ptdesc *ptdesc)
{
        atomic_t *share_count = &ptdesc->pt_share_count;

        atomic_inc(share_count);
}

/* Atomically subtract one from the PMD page table share count of @ptdesc. */
static inline void ptdesc_pmd_pts_dec(struct ptdesc *ptdesc)
{
        atomic_t *share_count = &ptdesc->pt_share_count;

        atomic_dec(share_count);
}

/* Return the current PMD page table share count of @ptdesc. */
static inline int ptdesc_pmd_pts_count(struct ptdesc *ptdesc)
{
        atomic_t *share_count = &ptdesc->pt_share_count;

        return atomic_read(share_count);
}
#else
/*
 * Without CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING struct ptdesc has no
 * pt_share_count member, so initialisation is a no-op.
 */
static inline void ptdesc_pmd_pts_init(struct ptdesc *ptdesc)
{
}
#endif

/*
 * Used for sizing the vmemmap region on some architectures.
 *
 * Evaluates to order_base_2() of the size of struct page, i.e. a shift count
 * rather than a size in bytes.
 */
#define STRUCT_PAGE_MAX_SHIFT        (order_base_2(sizeof(struct page)))

/*
 * page_private can be used on tail pages.  However, PagePrivate is only
 * checked by the VM on the head page.  So page_private on the tail pages
 * should be used for data that's ancillary to the head page (e.g. attaching
 * buffer heads to tail pages after attaching buffer heads to the head page).
 *
 * The macro expands to the ->private member itself, so the result is an
 * lvalue.
 */
#define page_private(page)                ((page)->private)

/*
 * set_page_private - store @private in the ->private member of @page.
 * @page: page to update.
 * @private: value to store.
 *
 * Plain store; no flag is touched and no ordering is implied.
 */
static inline void set_page_private(struct page *page, unsigned long private)
{
        page->private = private;
}

/*
 * folio_get_private - read the ->private pointer of @folio.
 * @folio: folio to query.
 *
 * Return: the value currently stored in folio->private.
 */
static inline void *folio_get_private(struct folio *folio)
{
        void *priv = folio->private;

        return priv;
}

/* Type of the vm_flags word carried by struct vm_region and struct vm_area_struct. */
typedef unsigned long vm_flags_t;

/*
 * freeptr_t represents a SLUB freelist pointer, which might be encoded
 * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
 * The single member @v holds the (possibly encoded) value.
 */
typedef struct { unsigned long v; } freeptr_t;

/*
 * A region containing a mapping of a non-memory backed file under NOMMU
 * conditions.  These are held in a global tree and are pinned by the VMAs that
 * map parts of them.
 */
struct vm_region {
        struct rb_node        vm_rb;                /* link in global region tree */
        vm_flags_t        vm_flags;        /* VMA vm_flags */
        unsigned long        vm_start;        /* start address of region */
        unsigned long        vm_end;                /* region initialised to here */
        unsigned long        vm_top;                /* region allocated to here */
        unsigned long        vm_pgoff;        /* offset in vm_file corresponding to vm_start */
        struct file        *vm_file;        /* the backing file or NULL */

        int                vm_usage;        /* region usage count (access under nommu_region_sem) */
        bool                vm_icache_flushed : 1; /* single-bit flag: true if the icache has
                                                * been flushed for this region */
};

/*
 * Per-VMA userfaultfd state.  With CONFIG_USERFAULTFD the struct wraps a
 * pointer to the userfaultfd context; without it the struct is empty.
 * NULL_VM_UFFD_CTX is the matching "no context" initialiser for either case.
 */
#ifdef CONFIG_USERFAULTFD
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) { NULL, })
struct vm_userfaultfd_ctx {
        struct userfaultfd_ctx *ctx;
};
#else /* CONFIG_USERFAULTFD */
#define NULL_VM_UFFD_CTX ((struct vm_userfaultfd_ctx) {})
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */

/* Reference-counted name that can be attached to a VMA (see vm_area_struct.anon_name). */
struct anon_vma_name {
        /* Reference count; anon_vma_name_free() takes this kref as its argument. */
        struct kref kref;
        /* The name needs to be at the end because it is dynamically sized. */
        char name[];
};

#ifdef CONFIG_ANON_VMA_NAME
/*
 * mmap_lock should be read-locked when calling anon_vma_name(). Caller should
 * either keep holding the lock while using the returned pointer or it should
 * raise anon_vma_name refcount before releasing the lock.
 */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma);
struct anon_vma_name *anon_vma_name_alloc(const char *name);
void anon_vma_name_free(struct kref *kref);
#else /* CONFIG_ANON_VMA_NAME */
/* Stub for !CONFIG_ANON_VMA_NAME: VMAs carry no name, so always NULL. */
static inline struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
        return NULL;
}

/* Stub for !CONFIG_ANON_VMA_NAME: nothing is allocated, always NULL. */
static inline struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
        return NULL;
}
#endif

/*
 * VMA_LOCK_OFFSET is the bit in vm_area_struct.vm_refcnt referred to by the
 * vm_lock_seq locking rules; VMA_REF_LIMIT is the largest value below it.
 */
#define VMA_LOCK_OFFSET        0x40000000
#define VMA_REF_LIMIT        (VMA_LOCK_OFFSET - 1)

/* Per-VMA NUMA balancing state (see vm_area_struct.numab_state). */
struct vma_numab_state {
        /*
         * Initialised as time in 'jiffies' after which VMA
         * should be scanned.  Delays first scan of new VMA by at
         * least sysctl_numa_balancing_scan_delay:
         */
        unsigned long next_scan;

        /*
         * Time in jiffies when pids_active[] is reset to
         * detect phase change behaviour:
         */
        unsigned long pids_active_reset;

        /*
         * Approximate tracking of PIDs that trapped a NUMA hinting
         * fault. May produce false positives due to hash collisions.
         *
         *   [0] Previous PID tracking
         *   [1] Current PID tracking
         *
         * Window moves after pids_active_reset has expired, approximately
         * every VMA_PID_RESET_PERIOD jiffies:
         */
        unsigned long pids_active[2];

        /* MM scan sequence ID when scan first started after VMA creation */
        int start_scan_seq;

        /*
         * MM scan sequence ID when the VMA was last completely scanned.
         * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq
         */
        int prev_scan_seq;
};

/*
 * This struct describes a virtual memory area. There is one of these
 * per VM-area/task. A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 *
 * Only explicitly marked struct members may be accessed by RCU readers before
 * getting a stable reference.
 *
 * WARNING: when adding new members, please update vm_area_init_from() to copy
 * them during vm_area_struct content duplication.
 */
struct vm_area_struct {
        /* The first cache line has the info for VMA tree walking. */

        union {
                struct {
                        /* VMA covers [vm_start; vm_end) addresses within mm */
                        unsigned long vm_start;
                        unsigned long vm_end;
                };
                freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
        };

        /*
         * The address space we belong to.
         * Unstable RCU readers are allowed to read this.
         */
        struct mm_struct *vm_mm;
        pgprot_t vm_page_prot;          /* Access permissions of this VMA. */

        /*
         * Flags, see mm.h.
         * To modify use vm_flags_{init|reset|set|clear|mod} functions.
         * vm_flags is the read-only view; __vm_flags is the same storage
         * without the const qualifier.
         */
        union {
                const vm_flags_t vm_flags;
                vm_flags_t __private __vm_flags;
        };

#ifdef CONFIG_PER_VMA_LOCK
        /*
         * Can only be written (using WRITE_ONCE()) while holding both:
         *  - mmap_lock (in write mode)
         *  - vm_refcnt bit at VMA_LOCK_OFFSET is set
         * Can be read reliably while holding one of:
         *  - mmap_lock (in read or write mode)
         *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1
         * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
         * while holding nothing (except RCU to keep the VMA struct allocated).
         *
         * This sequence counter is explicitly allowed to overflow; sequence
         * counter reuse can only lead to occasional unnecessary use of the
         * slowpath.
         */
        unsigned int vm_lock_seq;
#endif
        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
         * list, after a COW of one of the file pages.  A MAP_SHARED vma
         * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
         * or brk vma (with NULL file) can only be in an anon_vma list.
         */
        struct list_head anon_vma_chain; /* Serialized by mmap_lock &
                                          * page_table_lock */
        struct anon_vma *anon_vma;        /* Serialized by page_table_lock */

        /* Function pointers to deal with this struct. */
        const struct vm_operations_struct *vm_ops;

        /* Information about our backing store: */
        unsigned long vm_pgoff;                /* Offset (within vm_file) in PAGE_SIZE
                                           units */
        struct file * vm_file;                /* File we map to (can be NULL). */
        void * vm_private_data;                /* was vm_pte (shared mem) */

#ifdef CONFIG_SWAP
        atomic_long_t swap_readahead_info;
#endif
#ifndef CONFIG_MMU
        struct vm_region *vm_region;        /* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
        struct mempolicy *vm_policy;        /* NUMA policy for the VMA */
#endif
#ifdef CONFIG_NUMA_BALANCING
        struct vma_numab_state *numab_state;        /* NUMA Balancing state */
#endif
#ifdef CONFIG_PER_VMA_LOCK
        /* Unstable RCU readers are allowed to read this. */
        refcount_t vm_refcnt ____cacheline_aligned_in_smp;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map vmlock_dep_map;
#endif
#endif
        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
         */
        struct {
                struct rb_node rb;
                unsigned long rb_subtree_last;
        } shared;
#ifdef CONFIG_ANON_VMA_NAME
        /*
         * For private and shared anonymous mappings, a pointer to a null
         * terminated string containing the name given to the vma, or NULL if
         * unnamed. Serialized by mmap_lock. Use anon_vma_name to access.
         */
        struct anon_vma_name *anon_name;
#endif
        /* Userfaultfd state; an empty struct when CONFIG_USERFAULTFD is off. */
        struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;

/*
 * vma_policy - NUMA memory policy of @vma.
 *
 * Expands to the vm_policy member when CONFIG_NUMA is enabled (so the result
 * is an lvalue there) and to NULL otherwise, where the member does not exist.
 */
#ifdef CONFIG_NUMA
#define vma_policy(vma) ((vma)->vm_policy)
#else
#define vma_policy(vma) NULL
#endif

#ifdef CONFIG_SCHED_MM_CID
/*
 * Per-CPU concurrency ID state of an mm; mm_struct.pcpu_cid points to the
 * per-CPU instances.
 * NOTE(review): the exact meaning of @time and @recent_cid is defined by the
 * scheduler code that uses them - confirm there before relying on it.
 */
struct mm_cid {
        u64 time;
        int cid;
        int recent_cid;
};
#endif

struct kioctx_table;
struct iommu_mm_data;
/*
 * Per-address-space state.  All regular members live in an anonymous inner
 * struct marked __randomize_layout; the dynamically sized cpu_bitmap[] must
 * stay the last member of the outer struct.
 */
struct mm_struct {
        struct {
                /*
                 * Fields which are often written to are placed in a separate
                 * cache line.
                 */
                struct {
                        /**
                         * @mm_count: The number of references to &struct
                         * mm_struct (@mm_users count as 1).
                         *
                         * Use mmgrab()/mmdrop() to modify. When this drops to
                         * 0, the &struct mm_struct is freed.
                         */
                        atomic_t mm_count;
                } ____cacheline_aligned_in_smp;

                struct maple_tree mm_mt;

                unsigned long mmap_base;        /* base of mmap area */
                unsigned long mmap_legacy_base;        /* base of mmap area in bottom-up allocations */
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
                /* Base addresses for compatible mmap() */
                unsigned long mmap_compat_base;
                unsigned long mmap_compat_legacy_base;
#endif
                unsigned long task_size;        /* size of task vm space */
                pgd_t * pgd;

#ifdef CONFIG_MEMBARRIER
                /**
                 * @membarrier_state: Flags controlling membarrier behavior.
                 *
                 * This field is close to @pgd to hopefully fit in the same
                 * cache-line, which needs to be touched by switch_mm().
                 */
                atomic_t membarrier_state;
#endif

                /**
                 * @mm_users: The number of users including userspace.
                 *
                 * Use mmget()/mmget_not_zero()/mmput() to modify. When this
                 * drops to 0 (i.e. when the task exits and there are no other
                 * temporary reference holders), we also release a reference on
                 * @mm_count (which may then free the &struct mm_struct if
                 * @mm_count also drops to 0).
                 */
                atomic_t mm_users;

#ifdef CONFIG_SCHED_MM_CID
                /**
                 * @pcpu_cid: Per-cpu current cid.
                 *
                 * Keep track of the currently allocated mm_cid for each cpu.
                 * The per-cpu mm_cid values are serialized by their respective
                 * runqueue locks.
                 */
                struct mm_cid __percpu *pcpu_cid;
                /**
                 * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
                 *
                 * When the next mm_cid scan is due (in jiffies).
                 */
                unsigned long mm_cid_next_scan;
                /**
                 * @nr_cpus_allowed: Number of CPUs allowed for mm.
                 *
                 * Number of CPUs allowed in the union of all mm's
                 * threads allowed CPUs.
                 */
                unsigned int nr_cpus_allowed;
                /**
                 * @max_nr_cid: Maximum number of allowed concurrency
                 *              IDs allocated.
                 *
                 * Track the highest number of allowed concurrency IDs
                 * allocated for the mm.
                 */
                atomic_t max_nr_cid;
                /**
                 * @cpus_allowed_lock: Lock protecting mm cpus_allowed.
                 *
                 * Provide mutual exclusion for mm cpus_allowed and
                 * mm nr_cpus_allowed updates.
                 */
                raw_spinlock_t cpus_allowed_lock;
#endif
#ifdef CONFIG_MMU
                atomic_long_t pgtables_bytes;        /* size of all page tables */
#endif
                int map_count;                        /* number of VMAs */

                spinlock_t page_table_lock; /* Protects page tables and some
                                             * counters
                                             */
                /*
                 * With some kernel configs, the current mmap_lock's offset
                 * inside 'mm_struct' is at 0x120, which is optimal, as
                 * its two hot fields 'count' and 'owner' sit in 2 different
                 * cachelines, and when mmap_lock is highly contended, both
                 * of the 2 fields will be accessed frequently; the current
                 * layout helps to reduce cache bouncing.
                 *
                 * So please be careful with adding new fields before
                 * mmap_lock, which can easily push the 2 fields into one
                 * cacheline.
                 */
                struct rw_semaphore mmap_lock;

                struct list_head mmlist; /* List of maybe swapped mm's.  These
                                          * are globally strung together off
                                          * init_mm.mmlist, and are protected
                                          * by mmlist_lock
                                          */
#ifdef CONFIG_PER_VMA_LOCK
                struct rcuwait vma_writer_wait;
                /*
                 * This field has lock-like semantics, meaning it is sometimes
                 * accessed with ACQUIRE/RELEASE semantics.
                 * Roughly speaking, incrementing the sequence number is
                 * equivalent to releasing locks on VMAs; reading the sequence
                 * number can be part of taking a read lock on a VMA.
                 * Incremented every time mmap_lock is write-locked/unlocked.
                 * Initialized to 0, therefore odd values indicate mmap_lock
                 * is write-locked and even values that it's released.
                 *
                 * Can be modified under write mmap_lock using RELEASE
                 * semantics.
                 * Can be read with no other protection when holding write
                 * mmap_lock.
                 * Can be read with ACQUIRE semantics if not holding write
                 * mmap_lock.
                 */
                seqcount_t mm_lock_seq;
#endif


                unsigned long hiwater_rss; /* High-watermark of RSS usage */
                unsigned long hiwater_vm;  /* High-water virtual memory usage */

                unsigned long total_vm;           /* Total pages mapped */
                unsigned long locked_vm;   /* Pages that have PG_mlocked set */
                atomic64_t    pinned_vm;   /* Refcount permanently increased */
                unsigned long data_vm;           /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
                unsigned long exec_vm;           /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
                unsigned long stack_vm;           /* VM_STACK */
                unsigned long def_flags;

                /**
                 * @write_protect_seq: Locked when any thread is write
                 * protecting pages mapped by this mm to enforce a later COW,
                 * for instance during page table copying for fork().
                 */
                seqcount_t write_protect_seq;

                spinlock_t arg_lock; /* protect the below fields */

                unsigned long start_code, end_code, start_data, end_data;
                unsigned long start_brk, brk, start_stack;
                unsigned long arg_start, arg_end, env_start, env_end;

                unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

                struct percpu_counter rss_stat[NR_MM_COUNTERS];

                struct linux_binfmt *binfmt;

                /* Architecture-specific MM context */
                mm_context_t context;

                unsigned long flags; /* Must use atomic bitops to access */

#ifdef CONFIG_AIO
                spinlock_t                        ioctx_lock;
                struct kioctx_table __rcu        *ioctx_table;
#endif
#ifdef CONFIG_MEMCG
                /*
                 * "owner" points to a task that is regarded as the canonical
                 * user/owner of this mm. All of the following must be true in
                 * order for it to be changed:
                 *
                 * current == mm->owner
                 * current->mm != mm
                 * new_owner->mm == mm
                 * new_owner->alloc_lock is held
                 */
                struct task_struct __rcu *owner;
#endif
                struct user_namespace *user_ns;

                /* store ref to file /proc/<pid>/exe symlink points to */
                struct file __rcu *exe_file;
#ifdef CONFIG_MMU_NOTIFIER
                struct mmu_notifier_subscriptions *notifier_subscriptions;
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
                pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
#ifdef CONFIG_NUMA_BALANCING
                /*
                 * numa_next_scan is the next time that PTEs will be remapped
                 * PROT_NONE to trigger NUMA hinting faults; such faults gather
                 * statistics and migrate pages to new nodes if necessary.
                 */
                unsigned long numa_next_scan;

                /* Restart point for scanning and remapping PTEs. */
                unsigned long numa_scan_offset;

                /* numa_scan_seq prevents two threads remapping PTEs. */
                int numa_scan_seq;
#endif
                /*
                 * An operation with batched TLB flushing is going on. Anything
                 * that can move process memory needs to flush the TLB when
                 * moving a PROT_NONE mapped page.
                 */
                atomic_t tlb_flush_pending;
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
                /* See flush_tlb_batched_pending() */
                atomic_t tlb_flush_batched;
#endif
                struct uprobes_state uprobes_state;
#ifdef CONFIG_PREEMPT_RT
                struct rcu_head delayed_drop;
#endif
#ifdef CONFIG_HUGETLB_PAGE
                atomic_long_t hugetlb_usage;
#endif
                struct work_struct async_put_work;

#ifdef CONFIG_IOMMU_MM_DATA
                struct iommu_mm_data *iommu_mm;
#endif
#ifdef CONFIG_KSM
                /*
                 * Represent how many pages of this process are involved in KSM
                 * merging (not including ksm_zero_pages).
                 */
                unsigned long ksm_merging_pages;
                /*
                 * Represent how many pages are checked for ksm merging
                 * including merged and not merged.
                 */
                unsigned long ksm_rmap_items;
                /*
                 * Represent how many empty pages are merged with kernel zero
                 * pages when enabling KSM use_zero_pages.
                 */
                atomic_long_t ksm_zero_pages;
#endif /* CONFIG_KSM */
#ifdef CONFIG_LRU_GEN_WALKS_MMU
                struct {
                        /* this mm_struct is on lru_gen_mm_list */
                        struct list_head list;
                        /*
                         * Set when switching to this mm_struct, as a hint of
                         * whether it has been used since the last time per-node
                         * page table walkers cleared the corresponding bits.
                         */
                        unsigned long bitmap;
#ifdef CONFIG_MEMCG
                        /* points to the memcg of "owner" above */
                        struct mem_cgroup *memcg;
#endif
                } lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
#ifdef CONFIG_MM_ID
                mm_id_t mm_id;
#endif /* CONFIG_MM_ID */
        } __randomize_layout;

        /*
         * The mm_cpumask needs to be at the end of mm_struct, because it
         * is dynamically sized based on nr_cpu_ids.
         */
        unsigned long cpu_bitmap[];
};

/* Maple tree flag combination for mm_struct.mm_mt. */
#define MM_MT_FLAGS        (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
                         MT_FLAGS_USE_RCU)
/* Defined elsewhere; also serves as the head of the mmlist (see mm_struct.mmlist). */
extern struct mm_struct init_mm;

/*
 * mm_init_cpumask - clear the cpumask stored in the trailing cpu_bitmap[].
 * @mm: mm whose cpumask is cleared.
 *
 * The address is computed with integer arithmetic instead of naming the
 * flexible array member, because the dynamic array size confuses some
 * compilers.
 */
static inline void mm_init_cpumask(struct mm_struct *mm)
{
        unsigned long mask_addr;

        mask_addr = (unsigned long)mm + offsetof(struct mm_struct, cpu_bitmap);
        cpumask_clear((struct cpumask *)mask_addr);
}

/*
 * mm_cpumask - accessor for the cpumask kept in mm_struct's cpu_bitmap[].
 * @mm: mm to query.
 *
 * Return: the address of @mm's trailing cpu_bitmap[], typed as a cpumask.
 * Callers should go through this helper rather than touching the array.
 */
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
{
        struct cpumask *mask = (struct cpumask *)&mm->cpu_bitmap;

        return mask;
}

#ifdef CONFIG_LRU_GEN

/* List head plus lock for the mm_structs visited by page table walkers. */
struct lru_gen_mm_list {
        /* mm_struct list for page table walkers */
        struct list_head fifo;
        /* protects the list above */
        spinlock_t lock;
};

#endif /* CONFIG_LRU_GEN */

#ifdef CONFIG_LRU_GEN_WALKS_MMU

/*
 * Implemented out of line; only declared with CONFIG_LRU_GEN_WALKS_MMU.
 * Empty inline stubs with the same names are provided otherwise.
 */
void lru_gen_add_mm(struct mm_struct *mm);
void lru_gen_del_mm(struct mm_struct *mm);
void lru_gen_migrate_mm(struct mm_struct *mm);

/*
 * lru_gen_init_mm - reset the lru_gen state embedded in @mm.
 *
 * Leaves the list node empty, the usage bitmap cleared and, with
 * CONFIG_MEMCG, the memcg pointer NULL.
 */
static inline void lru_gen_init_mm(struct mm_struct *mm)
{
#ifdef CONFIG_MEMCG
        mm->lru_gen.memcg = NULL;
#endif
        mm->lru_gen.bitmap = 0;
        INIT_LIST_HEAD(&mm->lru_gen.list);
}

/* Mark @mm as used by storing -1 (all bits set) into its lru_gen bitmap. */
static inline void lru_gen_use_mm(struct mm_struct *mm)
{
        /*
         * When the bitmap is set, page reclaim knows this mm_struct has been
         * used since the last time it cleared the bitmap. So it might be worth
         * walking the page tables of this mm_struct to clear the accessed bit.
         */
        WRITE_ONCE(mm->lru_gen.bitmap, -1);
}

#else /* !CONFIG_LRU_GEN_WALKS_MMU */

/*
 * Empty stand-ins for the lru_gen mm hooks, compiled when
 * CONFIG_LRU_GEN_WALKS_MMU is not set. Each one takes the same argument as
 * its real counterpart and does nothing.
 */
static inline void lru_gen_add_mm(struct mm_struct *mm)
{
}

static inline void lru_gen_del_mm(struct mm_struct *mm)
{
}

static inline void lru_gen_migrate_mm(struct mm_struct *mm)
{
}

static inline void lru_gen_init_mm(struct mm_struct *mm)
{
}

static inline void lru_gen_use_mm(struct mm_struct *mm)
{
}

#endif /* CONFIG_LRU_GEN_WALKS_MMU */

/* VMA iterator: a thin wrapper around a single maple tree state. */
struct vma_iterator {
        struct ma_state mas;
};

/*
 * Declare a struct vma_iterator called @name whose maple state refers to
 * (__mm)->mm_mt, starts at index @__addr, has no node yet and is in the
 * ma_start status.
 */
#define VMA_ITERATOR(name, __mm, __addr)                                \
        struct vma_iterator name = {                                        \
                .mas = {                                                \
                        .tree = &(__mm)->mm_mt,                                \
                        .index = __addr,                                \
                        .node = NULL,                                        \
                        .status = ma_start,                                \
                },                                                        \
        }

/*
 * Runtime counterpart of VMA_ITERATOR(): (re)initialize @vmi with mas_init()
 * so that it refers to @mm's mm_mt tree at address @addr.
 */
static inline void vma_iter_init(struct vma_iterator *vmi,
                struct mm_struct *mm, unsigned long addr)
{
        mas_init(&vmi->mas, &mm->mm_mt, addr);
}

#ifdef CONFIG_SCHED_MM_CID

/* Special values and flag bits carried in a concurrency ID (cid) value. */
enum mm_cid_state {
        MM_CID_UNSET = -1U,                /* Unset state has lazy_put flag set. */
        MM_CID_LAZY_PUT = (1U << 31),        /* Flag kept in bit 31 of a cid. */
};

/* Return true when @cid is exactly the MM_CID_UNSET marker. */
static inline bool mm_cid_is_unset(int cid)
{
        return cid == MM_CID_UNSET;
}

/*
 * Return true when @cid carries the MM_CID_LAZY_PUT flag and is not the
 * MM_CID_UNSET marker (which also has that bit set).
 */
static inline bool mm_cid_is_lazy_put(int cid)
{
        if (mm_cid_is_unset(cid))
                return false;

        return (cid & MM_CID_LAZY_PUT) != 0;
}

/*
 * Return true when the MM_CID_LAZY_PUT bit is clear in @cid. MM_CID_UNSET has
 * that bit set, so an unset cid is reported as not valid too.
 */
static inline bool mm_cid_is_valid(int cid)
{
        return (cid & MM_CID_LAZY_PUT) == 0;
}

/* Return @cid with the MM_CID_LAZY_PUT bit set. */
static inline int mm_cid_set_lazy_put(int cid)
{
        return cid | MM_CID_LAZY_PUT;
}

/* Return @cid with the MM_CID_LAZY_PUT bit cleared. */
static inline int mm_cid_clear_lazy_put(int cid)
{
        return cid & ~MM_CID_LAZY_PUT;
}

/*
 * mm_cpus_allowed: Union of all mm's threads allowed CPUs.
 *
 * The mask is located one cpumask_size() past the start of cpu_bitmap[],
 * i.e. directly behind the mask returned by mm_cpumask().
 */
static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm)
{
        unsigned long addr = (unsigned long)mm +
                             offsetof(struct mm_struct, cpu_bitmap);

        /* Step over cpu_bitmap itself. */
        return (struct cpumask *)(addr + cpumask_size());
}

/*
 * Accessor for struct mm_struct's cidmask, which is located one
 * cpumask_size() behind the mask returned by mm_cpus_allowed().
 */
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
{
        unsigned long allowed_addr = (unsigned long)mm_cpus_allowed(mm);

        /* Step over mm_cpus_allowed. */
        return (struct cpumask *)(allowed_addr + cpumask_size());
}

/*
 * Bring the concurrency-ID state of @mm into its initial state: every
 * possible CPU's slot is marked unset with a zero timestamp, the allowed-CPU
 * bookkeeping is seeded from task @p, and the cid mask is emptied.
 */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct mm_cid *slot = per_cpu_ptr(mm->pcpu_cid, cpu);

                slot->cid = MM_CID_UNSET;
                slot->recent_cid = MM_CID_UNSET;
                slot->time = 0;
        }

        mm->nr_cpus_allowed = p->nr_cpus_allowed;
        atomic_set(&mm->max_nr_cid, 0);
        raw_spin_lock_init(&mm->cpus_allowed_lock);

        /* Start from the task's own mask, with no cid handed out yet. */
        cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
        cpumask_clear(mm_cidmask(mm));
}

/*
 * Allocate the per-CPU struct mm_cid storage for @mm and initialize it from
 * task @p via mm_init_cid().
 *
 * Returns 0 on success, or -ENOMEM if the per-CPU allocation fails.
 */
static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
{
        mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid);
        if (!mm->pcpu_cid)
                return -ENOMEM;
        mm_init_cid(mm, p);
        return 0;
}
/* mm_alloc_cid() is mm_alloc_cid_noprof() passed through alloc_hooks(). */
#define mm_alloc_cid(...)        alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))

/* Free the per-CPU cid storage of @mm and reset the pointer to NULL. */
static inline void mm_destroy_cid(struct mm_struct *mm)
{
        free_percpu(mm->pcpu_cid);
        mm->pcpu_cid = NULL;
}

/*
 * Size in bytes of the two cpumasks that follow cpu_bitmap[] when
 * CONFIG_SCHED_MM_CID is enabled.
 */
static inline unsigned int mm_cid_size(void)
{
        return 2 * cpumask_size();        /* mm_cpus_allowed(), mm_cidmask(). */
}

/*
 * Widen the allowed-CPU mask of @mm by OR-ing @cpumask into it and refresh
 * mm->nr_cpus_allowed with the resulting weight.
 *
 * A NULL @mm is tolerated and makes the call a no-op. The mask pointer is
 * derived only after that check so that no address is ever computed from a
 * NULL mm.
 */
static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
{
        struct cpumask *mm_allowed;

        if (!mm)
                return;

        mm_allowed = mm_cpus_allowed(mm);
        /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */
        raw_spin_lock(&mm->cpus_allowed_lock);
        cpumask_or(mm_allowed, mm_allowed, cpumask);
        WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
        raw_spin_unlock(&mm->cpus_allowed_lock);
}
#else /* CONFIG_SCHED_MM_CID */
/*
 * Stand-ins used when CONFIG_SCHED_MM_CID is not set: the init/destroy/set
 * helpers do nothing, allocation always reports success (0) and no extra
 * space is requested (mm_cid_size() returns 0).
 */
static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
static inline void mm_destroy_cid(struct mm_struct *mm) { }

static inline unsigned int mm_cid_size(void)
{
        return 0;
}
static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
#endif /* CONFIG_SCHED_MM_CID */

struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);

struct vm_fault;

/**
 * typedef vm_fault_t - Return type for page fault handlers.
 *
 * Page fault handlers return a bitmask of %VM_FAULT values.
 */
typedef __bitwise unsigned int vm_fault_t;

/**
 * enum vm_fault_reason - Page fault handlers return a bitmask of
 * these values to tell the core VM what happened when handling the
 * fault. Used to decide whether a process gets delivered SIGBUS or
 * just gets major/minor fault counters bumped up.
 *
 * @VM_FAULT_OOM:                Out Of Memory
 * @VM_FAULT_SIGBUS:                Bad access
 * @VM_FAULT_MAJOR:                Page read from storage
 * @VM_FAULT_HWPOISON:                Hit poisoned small page
 * @VM_FAULT_HWPOISON_LARGE:        Hit poisoned large page. Index encoded
 *                                in upper bits
 * @VM_FAULT_SIGSEGV:                segmentation fault
 * @VM_FAULT_NOPAGE:                ->fault installed the pte, not return page
 * @VM_FAULT_LOCKED:                ->fault locked the returned page
 * @VM_FAULT_RETRY:                ->fault blocked, must retry
 * @VM_FAULT_FALLBACK:                huge page fault failed, fall back to small
 * @VM_FAULT_DONE_COW:                ->fault has fully handled COW
 * @VM_FAULT_NEEDDSYNC:                ->fault did not modify page tables and needs
 *                                fsync() to complete (for synchronous page faults
 *                                in DAX)
 * @VM_FAULT_COMPLETED:                ->fault completed, meanwhile mmap lock released
 * @VM_FAULT_HINDEX_MASK:        mask HINDEX value
 *
 */
enum vm_fault_reason {
        /* Every value below except VM_FAULT_HINDEX_MASK is a single bit. */
        VM_FAULT_OOM            = (__force vm_fault_t)0x000001,
        VM_FAULT_SIGBUS         = (__force vm_fault_t)0x000002,
        VM_FAULT_MAJOR          = (__force vm_fault_t)0x000004,
        VM_FAULT_HWPOISON       = (__force vm_fault_t)0x000010,
        VM_FAULT_HWPOISON_LARGE = (__force vm_fault_t)0x000020,
        VM_FAULT_SIGSEGV        = (__force vm_fault_t)0x000040,
        VM_FAULT_NOPAGE         = (__force vm_fault_t)0x000100,
        VM_FAULT_LOCKED         = (__force vm_fault_t)0x000200,
        VM_FAULT_RETRY          = (__force vm_fault_t)0x000400,
        VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
        VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
        VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
        VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
        /* Four-bit field in bits 16-19, not a single flag. */
        VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
};

/*
 * Encode hstate index for a hwpoisoned large page.
 *
 * VM_FAULT_SET_HINDEX() shifts the index up by 16 bits into a vm_fault_t;
 * VM_FAULT_GET_HINDEX() shifts a vm_fault_t down by 16 bits and keeps the
 * low four bits.
 */
#define VM_FAULT_SET_HINDEX(x) ((__force vm_fault_t)((x) << 16))
#define VM_FAULT_GET_HINDEX(x) (((__force unsigned int)(x) >> 16) & 0xf)

/*
 * Mask combining the OOM, SIGBUS, SIGSEGV, HWPOISON, HWPOISON_LARGE and
 * FALLBACK fault reasons.
 */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS |        \
                        VM_FAULT_SIGSEGV | VM_FAULT_HWPOISON |        \
                        VM_FAULT_HWPOISON_LARGE | VM_FAULT_FALLBACK)

/*
 * Comma-separated { flag, "name" } initializers, one per single-bit
 * VM_FAULT_* reason (VM_FAULT_HINDEX_MASK is not listed).
 * NOTE(review): presumably consumed by tracing code to print flags
 * symbolically -- confirm against the users of this macro.
 */
#define VM_FAULT_RESULT_TRACE \
        { VM_FAULT_OOM,                 "OOM" },        \
        { VM_FAULT_SIGBUS,              "SIGBUS" },        \
        { VM_FAULT_MAJOR,               "MAJOR" },        \
        { VM_FAULT_HWPOISON,            "HWPOISON" },        \
        { VM_FAULT_HWPOISON_LARGE,      "HWPOISON_LARGE" },        \
        { VM_FAULT_SIGSEGV,             "SIGSEGV" },        \
        { VM_FAULT_NOPAGE,              "NOPAGE" },        \
        { VM_FAULT_LOCKED,              "LOCKED" },        \
        { VM_FAULT_RETRY,               "RETRY" },        \
        { VM_FAULT_FALLBACK,            "FALLBACK" },        \
        { VM_FAULT_DONE_COW,            "DONE_COW" },        \
        { VM_FAULT_NEEDDSYNC,           "NEEDDSYNC" },        \
        { VM_FAULT_COMPLETED,           "COMPLETED" }

/*
 * Description of a special mapping: its name, either a page array or a
 * fault callback to populate it, and optional mremap/close callbacks.
 */
struct vm_special_mapping {
        const char *name;        /* The name, e.g. "[vdso]". */

        /*
         * If .fault is not provided, this points to a
         * NULL-terminated array of pages that back the special mapping.
         *
         * This must not be NULL unless .fault is provided.
         */
        struct page **pages;

        /*
         * If non-NULL, then this is called to resolve page faults
         * on the special mapping.  If used, .pages is not checked.
         */
        vm_fault_t (*fault)(const struct vm_special_mapping *sm,
                                struct vm_area_struct *vma,
                                struct vm_fault *vmf);

        /*
         * NOTE(review): presumably called when the mapping is moved, with
         * the new VMA -- confirm against the callers.
         */
        int (*mremap)(const struct vm_special_mapping *sm,
                     struct vm_area_struct *new_vma);

        /*
         * NOTE(review): presumably called when the mapping's VMA is closed
         * -- confirm against the callers.
         */
        void (*close)(const struct vm_special_mapping *sm,
                      struct vm_area_struct *vma);
};

/*
 * Reasons for a TLB flush, numbered from 0. NR_TLB_FLUSH_REASONS comes last
 * and therefore equals the number of reasons listed before it.
 */
enum tlb_flush_reason {
        TLB_FLUSH_ON_TASK_SWITCH,
        TLB_REMOTE_SHOOTDOWN,
        TLB_LOCAL_SHOOTDOWN,
        TLB_LOCAL_MM_SHOOTDOWN,
        TLB_REMOTE_SEND_IPI,
        TLB_REMOTE_WRONG_CPU,
        NR_TLB_FLUSH_REASONS,
};

/**
 * enum fault_flag - Fault flag definitions.
 * @FAULT_FLAG_WRITE: Fault was a write fault.
 * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
 * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
 * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
 * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
 * @FAULT_FLAG_TRIED: The fault has been tried once.
 * @FAULT_FLAG_USER: The fault originated in userspace.
 * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
 * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
 * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
 * @FAULT_FLAG_UNSHARE: The fault is an unsharing request to break COW in a
 *                      COW mapping, making sure that an exclusive anon page is
 *                      mapped after the fault.
 * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached.
 *                        We should only access orig_pte if this flag set.
 * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock.
 *
 * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
 * whether we would allow page faults to retry by specifying these two
 * fault flags correctly.  Currently there can be three legal combinations:
 *
 * (a) ALLOW_RETRY and !TRIED:  this means the page fault allows retry, and
 *                              this is the first try
 *
 * (b) ALLOW_RETRY and TRIED:   this means the page fault allows retry, and
 *                              we've already tried at least once
 *
 * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
 *
 * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
 * be used.  Note that page faults can be allowed to retry for multiple times,
 * in which case we'll have an initial fault with flags (a) then later on
 * continuous faults with flags (b).  We should always try to detect pending
 * signals before a retry to make sure the continuous page faults can still be
 * interrupted if necessary.
 *
 * The combination FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE is illegal.
 * FAULT_FLAG_UNSHARE is ignored and treated like an ordinary read fault when
 * applied to mappings that are not COW mappings.
 */
enum fault_flag {
        /* One bit per flag; bits 0 through 12 are assigned. */
        FAULT_FLAG_WRITE =                1 << 0,
        FAULT_FLAG_MKWRITE =                1 << 1,
        FAULT_FLAG_ALLOW_RETRY =        1 << 2,
        FAULT_FLAG_RETRY_NOWAIT =         1 << 3,
        FAULT_FLAG_KILLABLE =                1 << 4,
        FAULT_FLAG_TRIED =                 1 << 5,
        FAULT_FLAG_USER =                1 << 6,
        FAULT_FLAG_REMOTE =                1 << 7,
        FAULT_FLAG_INSTRUCTION =        1 << 8,
        FAULT_FLAG_INTERRUPTIBLE =        1 << 9,
        FAULT_FLAG_UNSHARE =                1 << 10,
        FAULT_FLAG_ORIG_PTE_VALID =        1 << 11,
        FAULT_FLAG_VMA_LOCK =                1 << 12,
};

typedef unsigned int __bitwise zap_flags_t;

/* Flags for clear_young_dirty_ptes(). */
typedef int __bitwise cydp_t;

/* Clear the access bit */
#define CYDP_CLEAR_YOUNG                ((__force cydp_t)BIT(0))

/* Clear the dirty bit */
#define CYDP_CLEAR_DIRTY                ((__force cydp_t)BIT(1))

/*
 * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
 * other. Here is what they mean, and how to use them:
 *
 *
 * FIXME: For pages which are part of a filesystem, mappings are subject to the
 * lifetime enforced by the filesystem and we need guarantees that longterm
 * users like RDMA and V4L2 only establish mappings which coordinate usage with
 * the filesystem.  Ideas for this coordination include revoking the longterm
 * pin, delaying writeback, bounce buffer page writeback, etc.  As FS DAX was
 * added after the problem with filesystems was found FS DAX VMAs are
 * specifically failed.  Filesystem pages are still subject to bugs and use of
 * FOLL_LONGTERM should be avoided on those pages.
 *
 * In the CMA case: long term pins in a CMA region would unnecessarily fragment
 * that region.  And so, CMA attempts to migrate the page before pinning, when
 * FOLL_LONGTERM is specified.
 *
 * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
 * but an additional pin counting system) will be invoked. This is intended for
 * anything that gets a page reference and then touches page data (for example,
 * Direct IO). This lets the filesystem know that some non-file-system entity is
 * potentially changing the pages' data. In contrast to FOLL_GET (whose pages
 * are released via put_page()), FOLL_PIN pages must be released, ultimately, by
 * a call to unpin_user_page().
 *
 * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different
 * and separate refcounting mechanisms, however, and that means that each has
 * its own acquire and release mechanisms:
 *
 *     FOLL_GET: get_user_pages*() to acquire, and put_page() to release.
 *
 *     FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release.
 *
 * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call.
 * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based
 * calls applied to them, and that's perfectly OK. This is a constraint on the
 * callers, not on the pages.)
 *
 * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never
 * directly by the caller. That's in order to help avoid mismatches when
 * releasing pages: get_user_pages*() pages must be released via put_page(),
 * while pin_user_pages*() pages must be released via unpin_user_page().
 *
 * Please see Documentation/core-api/pin_user_pages.rst for more information.
 */

/* FOLL_* flag bits; each flag occupies its own bit (0 through 12 here). */
enum {
        /* check pte is writable */
        FOLL_WRITE = 1 << 0,
        /* do get_page on page */
        FOLL_GET = 1 << 1,
        /* give error on hole if it would be zero */
        FOLL_DUMP = 1 << 2,
        /* get_user_pages read/write w/o permission */
        FOLL_FORCE = 1 << 3,
        /*
         * if a disk transfer is needed, start the IO and return without waiting
         * upon it
         */
        FOLL_NOWAIT = 1 << 4,
        /* do not fault in pages */
        FOLL_NOFAULT = 1 << 5,
        /* check page is hwpoisoned */
        FOLL_HWPOISON = 1 << 6,
        /* don't do file mappings */
        FOLL_ANON = 1 << 7,
        /*
         * FOLL_LONGTERM indicates that the page will be held for an indefinite
         * time period _often_ under userspace control.  This is in contrast to
         * iov_iter_get_pages(), whose usages are transient.
         */
        FOLL_LONGTERM = 1 << 8,
        /* split huge pmd before returning */
        FOLL_SPLIT_PMD = 1 << 9,
        /* allow returning PCI P2PDMA pages */
        FOLL_PCI_P2PDMA = 1 << 10,
        /* allow interrupts from generic signals */
        FOLL_INTERRUPTIBLE = 1 << 11,
        /*
         * Always honor (trigger) NUMA hinting faults.
         *
         * FOLL_WRITE implicitly honors NUMA hinting faults because a
         * PROT_NONE-mapped page is not writable (exceptions with FOLL_FORCE
         * apply). get_user_pages_fast_only() always implicitly honors NUMA
         * hinting faults.
         */
        FOLL_HONOR_NUMA_FAULT = 1 << 12,

        /* See also internal only FOLL flags in mm/internal.h */
};

/* mm flags */

/*
 * The first two bits represent core dump modes for set-user-ID,
 * the modes are SUID_DUMP_* defined in linux/sched/coredump.h
 */
#define MMF_DUMPABLE_BITS 2
#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE        2
#define MMF_DUMP_ANON_SHARED        3
#define MMF_DUMP_MAPPED_PRIVATE        4
#define MMF_DUMP_MAPPED_SHARED        5
#define MMF_DUMP_ELF_HEADERS        6
#define MMF_DUMP_HUGETLB_PRIVATE 7
#define MMF_DUMP_HUGETLB_SHARED  8
#define MMF_DUMP_DAX_PRIVATE        9
#define MMF_DUMP_DAX_SHARED        10

#define MMF_DUMP_FILTER_SHIFT        MMF_DUMPABLE_BITS
#define MMF_DUMP_FILTER_BITS        9
#define MMF_DUMP_FILTER_MASK \
        (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
        ((1 << MMF_DUMP_ANON_PRIVATE) |        (1 << MMF_DUMP_ANON_SHARED) |\
         (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)

#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
# define MMF_DUMP_MASK_DEFAULT_ELF        (1 << MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF        0
#endif
                                        /* leave room for more dump flags */
#define MMF_VM_MERGEABLE        16        /* KSM may merge identical pages */
#define MMF_VM_HUGEPAGE                17        /* set when mm is available for khugepaged */

/*
 * This one-shot flag is dropped due to necessity of changing exe once again
 * on NFS restore
 */
//#define MMF_EXE_FILE_CHANGED        18        /* see prctl_set_mm_exe_file() */

#define MMF_HAS_UPROBES                19        /* has uprobes */
#define MMF_RECALC_UPROBES        20        /* MMF_HAS_UPROBES can be wrong */
#define MMF_OOM_SKIP                21        /* mm is of no interest for the OOM killer */
#define MMF_UNSTABLE                22        /* mm is unstable for copy_from_user */
#define MMF_HUGE_ZERO_PAGE        23      /* mm has ever used the global huge zero page */
#define MMF_DISABLE_THP                24        /* disable THP for all VMAs */
#define MMF_DISABLE_THP_MASK        (1 << MMF_DISABLE_THP)
#define MMF_OOM_REAP_QUEUED        25        /* mm was queued for oom_reaper */
#define MMF_MULTIPROCESS        26        /* mm is shared between processes */
/*
 * MMF_HAS_PINNED: Whether this mm has pinned any pages.  This can be either
 * replaced in the future by mm.pinned_vm when it becomes stable, or grow into
 * a counter on its own. We're aggressive on this bit for now: even if the
 * pinned pages were unpinned later on, we'll still keep this bit set for the
 * lifecycle of this mm, just for simplicity.
 */
#define MMF_HAS_PINNED                27        /* FOLL_PIN has run, never cleared */

#define MMF_HAS_MDWE                28
#define MMF_HAS_MDWE_MASK        (1 << MMF_HAS_MDWE)


#define MMF_HAS_MDWE_NO_INHERIT        29

#define MMF_VM_MERGE_ANY        30
#define MMF_VM_MERGE_ANY_MASK        (1 << MMF_VM_MERGE_ANY)

#define MMF_TOPDOWN                31        /* mm searches top down by default */
/*
 * Built from an unsigned long constant: shifting a plain int 1 by 31 would
 * land in the sign bit, and the resulting negative int would sign-extend
 * when combined with unsigned long flags, dragging bits 32-63 into the mask.
 */
#define MMF_TOPDOWN_MASK        (1UL << MMF_TOPDOWN)

#define MMF_INIT_MASK                (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
                                 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
                                 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)

/*
 * Reduce @flags to the bits selected by MMF_INIT_MASK. If the
 * MMF_HAS_MDWE_NO_INHERIT bit is set, both MDWE bits are dropped first.
 */
static inline unsigned long mmf_init_flags(unsigned long flags)
{
        const unsigned long no_inherit = 1UL << MMF_HAS_MDWE_NO_INHERIT;
        const unsigned long mdwe_bits = (1UL << MMF_HAS_MDWE) | no_inherit;

        if (flags & no_inherit)
                flags &= ~mdwe_bits;

        return flags & MMF_INIT_MASK;
}

#endif /* _LINUX_MM_TYPES_H */





















































    1 
    1 
















    1 


    1 
    1 


    1 

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/*
 * llc_core.c - Minimum needed routines for sap handling and module init/exit
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *                  2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */

#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <net/net_namespace.h>
#include <net/llc.h>

/* List of all open SAPs; entries are added and removed with the _rcu list ops. */
LIST_HEAD(llc_sap_list);
/* Taken (with BHs disabled) around every modification of llc_sap_list. */
static DEFINE_SPINLOCK(llc_sap_list_lock);

/**
 *        llc_sap_alloc - allocates and initializes sap.
 *
 *        Allocates a zeroed sap with GFP_ATOMIC, marks it active, sets up its
 *        socket lock and local-address hash heads and gives it an initial
 *        reference count of one.
 *        Returns the new sap or %NULL if the allocation failed.
 */
static struct llc_sap *llc_sap_alloc(void)
{
        struct llc_sap *sap;
        int slot;

        sap = kzalloc(sizeof(*sap), GFP_ATOMIC);
        if (!sap)
                return NULL;

        /* sap->laddr.mac - leave as a null, it's filled by bind */
        sap->state = LLC_SAP_STATE_ACTIVE;
        spin_lock_init(&sap->sk_lock);
        for (slot = 0; slot < LLC_SK_LADDR_HASH_ENTRIES; slot++)
                INIT_HLIST_NULLS_HEAD(&sap->sk_laddr_hash[slot], slot);
        refcount_set(&sap->refcnt, 1);

        return sap;
}

static struct llc_sap *__llc_sap_find(unsigned char sap_value)
{
        struct llc_sap *sap;

        list_for_each_entry(sap, &llc_sap_list, node)
                if (sap->laddr.lsap == sap_value)
                        goto out;
        sap = NULL;
out:
        return sap;
}

/**
 *        llc_sap_find - searches a SAP in station
 *        @sap_value: sap to be found
 *
 *        Looks the sap up by its ID in the station's sap list while inside an
 *        RCU-bh read-side section. A sap that is found is only handed out if
 *        a reference could be taken on it; the caller then has to drop that
 *        reference with llc_sap_put after use.
 *        Returns the sap or %NULL if not found.
 */
struct llc_sap *llc_sap_find(unsigned char sap_value)
{
        struct llc_sap *found;

        rcu_read_lock_bh();
        found = __llc_sap_find(sap_value);
        /* Drop the result if no reference could be obtained. */
        if (found && !llc_sap_hold_safe(found))
                found = NULL;
        rcu_read_unlock_bh();

        return found;
}

/**
 *        llc_sap_open - open interface to the upper layers.
 *        @lsap: SAP number.
 *        @func: rcv func for datalink protos
 *
 *        Interface function to upper layer. Each one who wants to get a SAP
 *        (for example NetBEUI) should call this function. The lookup, the
 *        allocation and the list insertion all happen under
 *        llc_sap_list_lock. Returns the opened SAP for success, NULL if the
 *        SAP number is already in use or the allocation failed.
 */
struct llc_sap *llc_sap_open(unsigned char lsap,
                             int (*func)(struct sk_buff *skb,
                                         struct net_device *dev,
                                         struct packet_type *pt,
                                         struct net_device *orig_dev))
{
        struct llc_sap *sap = NULL;

        spin_lock_bh(&llc_sap_list_lock);
        /* Only create a SAP if this number is not registered yet. */
        if (!__llc_sap_find(lsap)) {
                sap = llc_sap_alloc();
                if (sap) {
                        sap->laddr.lsap = lsap;
                        sap->rcv_func = func;
                        list_add_tail_rcu(&sap->node, &llc_sap_list);
                }
        }
        spin_unlock_bh(&llc_sap_list_lock);

        return sap;
}

/**
 *        llc_sap_close - close interface for upper layers.
 *        @sap: SAP to be closed.
 *
 *        Close interface function to upper layer. Each one who wants to
 *        close an open SAP (for example NetBEUI) should call this function.
 *        Warns if the sap still has a non-zero sk_count, unlinks the sap
 *        from the station's sap list under llc_sap_list_lock and then
 *        frees its memory through kfree_rcu().
 */
void llc_sap_close(struct llc_sap *sap)
{
        WARN_ON(sap->sk_count);

        spin_lock_bh(&llc_sap_list_lock);
        list_del_rcu(&sap->node);
        spin_unlock_bh(&llc_sap_list_lock);

        /* Freed via RCU rather than with an immediate kfree(). */
        kfree_rcu(sap, rcu);
}

/* Packet handler entry: frames of type ETH_P_802_2 are passed to llc_rcv(). */
static struct packet_type llc_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_802_2),
        .func = llc_rcv,
};

/* Module init: register llc_packet_type with the network core; always returns 0. */
static int __init llc_init(void)
{
        dev_add_pack(&llc_packet_type);
        return 0;
}

/* Module exit: unregister the packet handler installed by llc_init(). */
static void __exit llc_exit(void)
{
        dev_remove_pack(&llc_packet_type);
}

module_init(llc_init);
module_exit(llc_exit);

EXPORT_SYMBOL(llc_sap_list);
EXPORT_SYMBOL(llc_sap_find);
EXPORT_SYMBOL(llc_sap_open);
EXPORT_SYMBOL(llc_sap_close);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003");
MODULE_DESCRIPTION("LLC IEEE 802.2 core support");






















































































































































































































































































   96 





































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_NODEMASK_H
#define __LINUX_NODEMASK_H

/*
 * Nodemasks provide a bitmap suitable for representing the
 * set of Node's in a system, one bit position per Node number.
 *
 * See detailed comments in the file linux/bitmap.h describing the
 * data type on which these nodemasks are based.
 *
 * For details of nodemask_parse_user(), see bitmap_parse_user() in
 * lib/bitmap.c.  For details of nodelist_parse(), see bitmap_parselist(),
 * also in bitmap.c.  For details of node_remap(), see bitmap_bitremap in
 * lib/bitmap.c.  For details of nodes_remap(), see bitmap_remap in
 * lib/bitmap.c.  For details of nodes_onto(), see bitmap_onto in
 * lib/bitmap.c.  For details of nodes_fold(), see bitmap_fold in
 * lib/bitmap.c.
 *
 * The available nodemask operations are:
 *
 * void node_set(node, mask)                turn on bit 'node' in mask
 * void node_clear(node, mask)                turn off bit 'node' in mask
 * void nodes_setall(mask)                set all bits
 * void nodes_clear(mask)                clear all bits
 * int node_isset(node, mask)                true iff bit 'node' set in mask
 * int node_test_and_set(node, mask)        test and set bit 'node' in mask
 *
 * void nodes_and(dst, src1, src2)        dst = src1 & src2  [intersection]
 * void nodes_or(dst, src1, src2)        dst = src1 | src2  [union]
 * void nodes_xor(dst, src1, src2)        dst = src1 ^ src2
 * void nodes_andnot(dst, src1, src2)        dst = src1 & ~src2
 * void nodes_complement(dst, src)        dst = ~src
 *
 * int nodes_equal(mask1, mask2)        Does mask1 == mask2?
 * int nodes_intersects(mask1, mask2)        Do mask1 and mask2 intersect?
 * int nodes_subset(mask1, mask2)        Is mask1 a subset of mask2?
 * int nodes_empty(mask)                Is mask empty (no bits set)?
 * int nodes_full(mask)                        Is mask full (all bits set)?
 * int nodes_weight(mask)                Hamming weight - number of set bits
 *
 * void nodes_shift_right(dst, src, n)        Shift right
 * void nodes_shift_left(dst, src, n)        Shift left
 *
 * unsigned int first_node(mask)        Number lowest set bit, or MAX_NUMNODES
 * unsigned int next_node(node, mask)        Next node past 'node', or MAX_NUMNODES
 * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first,
 *                                        or MAX_NUMNODES
 * unsigned int first_unset_node(mask)        First node not set in mask, or
 *                                        MAX_NUMNODES
 *
 * nodemask_t nodemask_of_node(node)        Return nodemask with bit 'node' set
 * NODE_MASK_ALL                        Initializer - all bits set
 * NODE_MASK_NONE                        Initializer - no bits set
 * unsigned long *nodes_addr(mask)        Array of unsigned long's in mask
 *
 * int nodemask_parse_user(ubuf, ulen, mask)        Parse ascii string as nodemask
 * int nodelist_parse(buf, map)                Parse ascii string as nodelist
 * int node_remap(oldbit, old, new)        newbit = map(old, new)(oldbit)
 * void nodes_remap(dst, src, old, new)        *dst = map(old, new)(src)
 * void nodes_onto(dst, orig, relmap)        *dst = orig relative to relmap
 * void nodes_fold(dst, orig, sz)        dst bits = orig bits mod sz
 *
 * for_each_node_mask(node, mask)        for-loop node over mask
 *
 * int num_online_nodes()                Number of online Nodes
 * int num_possible_nodes()                Number of all possible Nodes
 *
 * int node_random(mask)                Random node with set bit in mask
 *
 * int node_online(node)                Is some node online?
 * int node_possible(node)                Is some node possible?
 *
 * node_set_online(node)                set bit 'node' in node_online_map
 * node_set_offline(node)                clear bit 'node' in node_online_map
 *
 * for_each_node(node)                        for-loop node over node_possible_map
 * for_each_online_node(node)                for-loop node over node_online_map
 *
 * Subtlety:
 * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway)
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
 *
 * NODEMASK_SCRATCH
 * When doing above logical AND, OR, XOR, Remap operations the callers tend to
 * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
 * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
 * for such situations. See below and CPUMASK_ALLOC also.
 */

#include <linux/threads.h>
#include <linux/bitmap.h>
#include <linux/minmax.h>
#include <linux/nodemask_types.h>
#include <linux/random.h>

/*
 * Never used as data here: nodemask_of_node() only applies typeof() to it
 * in order to declare its temporary mask.
 */
extern nodemask_t _unused_nodemask_arg_;

/**
 * nodemask_pr_args - printf args to output a nodemask
 * @maskp: nodemask to be printed (may be NULL)
 *
 * Expands to two comma-separated arguments, a bit count followed by a
 * pointer to the mask's word array, for use with '%*pb[l]'.  A NULL
 * @maskp produces a count of 0 and a NULL pointer.
 */
#define nodemask_pr_args(maskp)        __nodemask_pr_numnodes(maskp), \
                                __nodemask_pr_bits(maskp)
static __always_inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m)
{
        if (!m)
                return 0;
        return MAX_NUMNODES;
}
static __always_inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m)
{
        if (!m)
                return NULL;
        return m->bits;
}

/*
 * The inline keyword gives the compiler room to decide to inline, or
 * not inline a function as it sees best.  However, as these functions
 * are called in both __init and non-__init functions, if they are not
 * inlined we will end up with a section mismatch error (of the type of
 * freeable items not being freed).  So we must use __always_inline here
 * to fix the problem.  If other functions in the future also end up in
 * this situation they will also need to be annotated as __always_inline
 */
/* node_set(node, dst): set bit @node in mask @dst using set_bit(). */
#define node_set(node, dst) __node_set((node), &(dst))
static __always_inline void __node_set(int nid, volatile nodemask_t *mask)
{
        set_bit(nid, mask->bits);
}

/* node_clear(node, dst): clear bit @node in mask @dst using clear_bit(). */
#define node_clear(node, dst) __node_clear((node), &(dst))
static __always_inline void __node_clear(int nid, volatile nodemask_t *mask)
{
        clear_bit(nid, mask->bits);
}

/* nodes_setall(dst): hand the MAX_NUMNODES bits of @dst to bitmap_fill(). */
#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static __always_inline void __nodes_setall(nodemask_t *mask, unsigned int nr_bits)
{
        bitmap_fill(mask->bits, nr_bits);
}

/* nodes_clear(dst): hand the MAX_NUMNODES bits of @dst to bitmap_zero(). */
#define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES)
static __always_inline void __nodes_clear(nodemask_t *mask, unsigned int nr_bits)
{
        bitmap_zero(mask->bits, nr_bits);
}

/*
 * node_isset(node, nodemask): test bit @node of @nodemask via test_bit().
 * No static inline type checking - see Subtlety (1) above.
 */
#define node_isset(node, nodemask) test_bit((node), (nodemask).bits)

/*
 * node_test_and_set(node, nodemask): forward to test_and_set_bit() on the
 * mask's word array and return its result as a bool.
 */
#define node_test_and_set(node, nodemask) \
                        __node_test_and_set((node), &(nodemask))
static __always_inline bool __node_test_and_set(int nid, nodemask_t *mask)
{
        return test_and_set_bit(nid, mask->bits);
}

/* nodes_and(dst, src1, src2): forwards the three word arrays to bitmap_and(). */
#define nodes_and(dst, src1, src2) \
                        __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_and(nodemask_t *out, const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        bitmap_and(out->bits, lhs->bits, rhs->bits, nr_bits);
}

/* nodes_or(dst, src1, src2): forwards the three word arrays to bitmap_or(). */
#define nodes_or(dst, src1, src2) \
                        __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_or(nodemask_t *out, const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        bitmap_or(out->bits, lhs->bits, rhs->bits, nr_bits);
}

/* nodes_xor(dst, src1, src2): forwards the three word arrays to bitmap_xor(). */
#define nodes_xor(dst, src1, src2) \
                        __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_xor(nodemask_t *out, const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        bitmap_xor(out->bits, lhs->bits, rhs->bits, nr_bits);
}

/* nodes_andnot(dst, src1, src2): forwards the three word arrays to bitmap_andnot(). */
#define nodes_andnot(dst, src1, src2) \
                        __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES)
static __always_inline void __nodes_andnot(nodemask_t *out, const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        bitmap_andnot(out->bits, lhs->bits, rhs->bits, nr_bits);
}

/* nodes_copy(dst, src): forwards both word arrays to bitmap_copy(). */
#define nodes_copy(dst, src) __nodes_copy(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_copy(nodemask_t *out,
                                        const nodemask_t *in, unsigned int nr_bits)
{
        bitmap_copy(out->bits, in->bits, nr_bits);
}

/* nodes_complement(dst, src): forwards both word arrays to bitmap_complement(). */
#define nodes_complement(dst, src) \
                        __nodes_complement(&(dst), &(src), MAX_NUMNODES)
static __always_inline void __nodes_complement(nodemask_t *out,
                                        const nodemask_t *in, unsigned int nr_bits)
{
        bitmap_complement(out->bits, in->bits, nr_bits);
}

/* nodes_equal(src1, src2): returns the result of bitmap_equal() on the two masks. */
#define nodes_equal(src1, src2) \
                        __nodes_equal(&(src1), &(src2), MAX_NUMNODES)
static __always_inline bool __nodes_equal(const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        return bitmap_equal(lhs->bits, rhs->bits, nr_bits);
}

/* nodes_intersects(src1, src2): returns the result of bitmap_intersects(). */
#define nodes_intersects(src1, src2) \
                        __nodes_intersects(&(src1), &(src2), MAX_NUMNODES)
static __always_inline bool __nodes_intersects(const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        return bitmap_intersects(lhs->bits, rhs->bits, nr_bits);
}

/*
 * nodes_subset(src1, src2): returns the result of bitmap_subset(), called
 * with @src1 as its first and @src2 as its second bitmap argument.
 */
#define nodes_subset(src1, src2) \
                        __nodes_subset(&(src1), &(src2), MAX_NUMNODES)
static __always_inline bool __nodes_subset(const nodemask_t *lhs,
                                        const nodemask_t *rhs, unsigned int nr_bits)
{
        return bitmap_subset(lhs->bits, rhs->bits, nr_bits);
}

/* nodes_empty(src): returns the result of bitmap_empty() on @src. */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static __always_inline bool __nodes_empty(const nodemask_t *mask, unsigned int nr_bits)
{
        return bitmap_empty(mask->bits, nr_bits);
}

/* nodes_full(nodemask): returns the result of bitmap_full() on @nodemask. */
#define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES)
static __always_inline bool __nodes_full(const nodemask_t *mask, unsigned int nr_bits)
{
        return bitmap_full(mask->bits, nr_bits);
}

/* nodes_weight(nodemask): returns bitmap_weight() of @nodemask as an int. */
#define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES)
static __always_inline int __nodes_weight(const nodemask_t *mask, unsigned int nr_bits)
{
        return bitmap_weight(mask->bits, nr_bits);
}

/* nodes_shift_right(dst, src, n): forwards to bitmap_shift_right() with shift @n. */
#define nodes_shift_right(dst, src, n) \
                        __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES)
static __always_inline void __nodes_shift_right(nodemask_t *out,
                                        const nodemask_t *in, int shift, int nr_bits)
{
        bitmap_shift_right(out->bits, in->bits, shift, nr_bits);
}

/* nodes_shift_left(dst, src, n): forwards to bitmap_shift_left() with shift @n. */
#define nodes_shift_left(dst, src, n) \
                        __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES)
static __always_inline void __nodes_shift_left(nodemask_t *out,
                                        const nodemask_t *in, int shift, int nr_bits)
{
        bitmap_shift_left(out->bits, in->bits, shift, nr_bits);
}

/* FIXME: better would be to fix all architectures to never return
          > MAX_NUMNODES, then the silly min_ts could be dropped. */

#define first_node(src) __first_node(&(src))
static __always_inline unsigned int __first_node(const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

#define next_node(n, src) __next_node((n), &(src))
static __always_inline unsigned int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

/*
 * next_node_in(n, src): like next_node(), but when the search past @n
 * yields MAX_NUMNODES it restarts from the beginning via __first_node().
 * The result is therefore MAX_NUMNODES only if __first_node() also
 * returns MAX_NUMNODES (i.e. src is empty).
 */
#define next_node_in(n, src) __next_node_in((n), &(src))
static __always_inline unsigned int __next_node_in(int node, const nodemask_t *srcp)
{
        unsigned int nid = __next_node(node, srcp);

        return nid == MAX_NUMNODES ? __first_node(srcp) : nid;
}

/*
 * Initialise @mask so that @node is its only set bit: zero all
 * MAX_NUMNODES bits, then set bit @node.
 */
static __always_inline void init_nodemask_of_node(nodemask_t *mask, int node)
{
        __nodes_clear(mask, MAX_NUMNODES);
        __node_set(node, mask);
}

/*
 * nodemask_of_node(node): statement expression evaluating to a nodemask_t
 * with bit @node set.  When the mask is exactly one unsigned long wide the
 * single word is assigned directly; otherwise init_nodemask_of_node() does
 * the clear-and-set.  The temporary's type is taken from
 * _unused_nodemask_arg_ via typeof().
 */
#define nodemask_of_node(node)                                                \
({                                                                        \
        typeof(_unused_nodemask_arg_) m;                                \
        if (sizeof(m) == sizeof(unsigned long)) {                        \
                m.bits[0] = 1UL << (node);                                \
        } else {                                                        \
                init_nodemask_of_node(&m, (node));                        \
        }                                                                \
        m;                                                                \
})

#define first_unset_node(mask) __first_unset_node(&(mask))
static __always_inline unsigned int __first_unset_node(const nodemask_t *maskp)
{
        return min_t(unsigned int, MAX_NUMNODES,
                        find_first_zero_bit(maskp->bits, MAX_NUMNODES));
}

/* Value stored in the last word of NODE_MASK_ALL. */
#define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES)

#if MAX_NUMNODES > BITS_PER_LONG

/* More nodes than bits in a long: all words but the last are all-ones. */
#define NODE_MASK_ALL \
((nodemask_t) { { \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \
} })

#else

/* The mask fits in one long: only the last-word value is needed. */
#define NODE_MASK_ALL \
((nodemask_t) { { \
        [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \
} })

#endif

/* Initializer with every word zero. */
#define NODE_MASK_NONE \
((nodemask_t) { { \
        [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] =  0UL \
} })

/* nodes_addr(src): the underlying word array (the 'bits' member) of @src. */
#define nodes_addr(src) ((src).bits)

/*
 * nodemask_parse_user(ubuf, ulen, dst): parse the user-space buffer into
 * @dst with bitmap_parse_user() and return that function's result.
 */
#define nodemask_parse_user(ubuf, ulen, dst) \
                __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES)
static __always_inline int __nodemask_parse_user(const char __user *ubuf, int ulen,
                                        nodemask_t *out, int nr_bits)
{
        return bitmap_parse_user(ubuf, ulen, out->bits, nr_bits);
}

/*
 * nodelist_parse(buf, dst): parse the string @buf into @dst with
 * bitmap_parselist() and return that function's result.
 */
#define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES)
static __always_inline int __nodelist_parse(const char *str, nodemask_t *out, int nr_bits)
{
        return bitmap_parselist(str, out->bits, nr_bits);
}

/*
 * node_remap(oldbit, old, new): returns the result of bitmap_bitremap()
 * for @oldbit using the @old and @new masks.
 */
#define node_remap(oldbit, old, new) \
                __node_remap((oldbit), &(old), &(new), MAX_NUMNODES)
static __always_inline int __node_remap(int bit,
                const nodemask_t *from, const nodemask_t *to, int nr_bits)
{
        return bitmap_bitremap(bit, from->bits, to->bits, nr_bits);
}

/*
 * nodes_remap(dst, src, old, new): forwards the four word arrays to
 * bitmap_remap(), which writes its result into @dst.
 */
#define nodes_remap(dst, src, old, new) \
                __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES)
static __always_inline void __nodes_remap(nodemask_t *out, const nodemask_t *in,
                const nodemask_t *from, const nodemask_t *to, int nr_bits)
{
        bitmap_remap(out->bits, in->bits, from->bits, to->bits, nr_bits);
}

/*
 * nodes_onto(dst, orig, relmap): forwards the three word arrays to
 * bitmap_onto(), which writes its result into @dst.
 */
#define nodes_onto(dst, orig, relmap) \
                __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES)
static __always_inline void __nodes_onto(nodemask_t *out, const nodemask_t *in,
                const nodemask_t *rel, int nr_bits)
{
        bitmap_onto(out->bits, in->bits, rel->bits, nr_bits);
}

/*
 * nodes_fold(dst, orig, sz): forwards to bitmap_fold() with fold size @sz;
 * the result is written into @dst.
 */
#define nodes_fold(dst, orig, sz) \
                __nodes_fold(&(dst), &(orig), (sz), MAX_NUMNODES)
static __always_inline void __nodes_fold(nodemask_t *out, const nodemask_t *in,
                int fold_sz, int nr_bits)
{
        bitmap_fold(out->bits, in->bits, fold_sz, nr_bits);
}

/*
 * for_each_node_mask(node, mask): loop header that assigns @node each
 * value produced by first_node()/next_node() on @mask until the value
 * reaches MAX_NUMNODES.  With a single possible node the loop body runs
 * at most once, with @node == 0, and only if @mask is not empty.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                                    \
        for ((node) = first_node(mask);                                    \
             (node) < MAX_NUMNODES;                                    \
             (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask)                                  \
        for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

/*
 * Bitmasks that are kept for all the nodes.
 *
 * Each enumerator indexes one nodemask in the node_states[] array declared
 * below.  Without CONFIG_HIGHMEM, N_HIGH_MEMORY is an alias for
 * N_NORMAL_MEMORY rather than a separate state.  NR_NODE_STATES is the
 * array size, not a state.
 */
enum node_states {
        N_POSSIBLE,                /* The node could become online at some point */
        N_ONLINE,                /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,                /* The node has regular or high memory */
#else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
        N_MEMORY,                /* The node has memory(regular, high, movable) */
        N_CPU,                /* The node has one or more cpus */
        N_GENERIC_INITIATOR,        /* The node has one or more Generic Initiators */
        NR_NODE_STATES
};

/*
 * The following particular system nodemasks and operations
 * on them manage all possible and online nodes.
 */

/* One nodemask per enum node_states value; defined outside this header. */
extern nodemask_t node_states[NR_NODE_STATES];

#if MAX_NUMNODES > 1
/* Test whether @nid is set in the mask kept for @state. */
static __always_inline int node_state(int nid, enum node_states state)
{
        return node_isset(nid, node_states[state]);
}

/* Set bit @nid in the mask kept for @state. */
static __always_inline void node_set_state(int nid, enum node_states state)
{
        __node_set(nid, &node_states[state]);
}

/* Clear bit @nid in the mask kept for @state. */
static __always_inline void node_clear_state(int nid, enum node_states state)
{
        __node_clear(nid, &node_states[state]);
}

/* Weight (as computed by nodes_weight()) of the mask kept for @state. */
static __always_inline int num_node_state(enum node_states state)
{
        return nodes_weight(node_states[state]);
}

/* Loop over the mask kept for @__state using for_each_node_mask(). */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

#define first_online_node        first_node(node_states[N_ONLINE])
#define first_memory_node        first_node(node_states[N_MEMORY])

/* next_node() applied to the N_ONLINE mask, searching after @nid. */
static __always_inline unsigned int next_online_node(int nid)
{
        return next_node(nid, node_states[N_ONLINE]);
}

/* next_node() applied to the N_MEMORY mask, searching after @nid. */
static __always_inline unsigned int next_memory_node(int nid)
{
        return next_node(nid, node_states[N_MEMORY]);
}

extern unsigned int nr_node_ids;
extern unsigned int nr_online_nodes;

/* Set @nid in the N_ONLINE mask, then recompute nr_online_nodes from it. */
static __always_inline void node_set_online(int nid)
{
        node_set_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

/* Clear @nid in the N_ONLINE mask, then recompute nr_online_nodes from it. */
static __always_inline void node_set_offline(int nid)
{
        node_clear_state(nid, N_ONLINE);
        nr_online_nodes = num_node_state(N_ONLINE);
}

#else

/*
 * Single-node variants: node 0 is reported as being in every state, state
 * changes are no-ops and every count is the constant 1.
 */
static __always_inline int node_state(int node, enum node_states state)
{
        return !node;
}

/* No-op in the single-node configuration. */
static __always_inline void node_set_state(int node, enum node_states state)
{
}

/* No-op in the single-node configuration. */
static __always_inline void node_clear_state(int node, enum node_states state)
{
}

/* Always 1 in the single-node configuration, whatever @state is. */
static __always_inline int num_node_state(enum node_states state)
{
        return 1;
}

/* Runs the loop body exactly once with node == 0; leaves node == 1. */
#define for_each_node_state(node, __state) \
        for ((node) = 0; (node) == 0; (node) = 1)

#define first_online_node        0
#define first_memory_node        0
#define next_online_node(nid)        (MAX_NUMNODES)
#define next_memory_node(nid)        (MAX_NUMNODES)
#define nr_node_ids                1U
#define nr_online_nodes                1U

#define node_set_online(node)           node_set_state((node), N_ONLINE)
#define node_set_offline(node)           node_clear_state((node), N_ONLINE)

#endif

/*
 * Pick a node from @maskp.
 *
 * With CONFIG_NUMA and more than one possible node: returns NUMA_NO_NODE
 * when the mask has weight 0, first_node() when it has weight 1, and
 * otherwise the bit selected by find_nth_bit() using an index drawn from
 * get_random_u32_below(weight).  In every other configuration it
 * returns 0 without looking at @maskp.
 */
static __always_inline int node_random(const nodemask_t *maskp)
{
#if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1)
        int weight = nodes_weight(*maskp);

        if (weight == 0)
                return NUMA_NO_NODE;
        if (weight == 1)
                return first_node(*maskp);

        return find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(weight));
#else
        return 0;
#endif
}

/* Named aliases for the N_ONLINE and N_POSSIBLE entries of node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map         node_states[N_POSSIBLE]

/* Shorthands for the generic node-state helpers with a fixed state. */
#define num_online_nodes()        num_node_state(N_ONLINE)
#define num_possible_nodes()        num_node_state(N_POSSIBLE)
#define node_online(node)        node_state((node), N_ONLINE)
#define node_possible(node)        node_state((node), N_POSSIBLE)

/* Loop headers over the N_POSSIBLE and N_ONLINE states respectively. */
#define for_each_node(node)           for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

/*
 * For nodemask scratch area.
 * NODEMASK_ALLOC(type, name, gfp_flags) declares a pointer 'name' to an
 * object of the specified type.
 *
 * When NODES_SHIFT > 8 the object is obtained with kmalloc(gfp_flags) and
 * NODEMASK_FREE() releases it with kfree(); callers presumably have to
 * check 'name' for NULL in that case - confirm against kmalloc()'s
 * contract.  Otherwise the object is a local variable '_name', gfp_flags
 * is ignored and NODEMASK_FREE() does nothing.
 */
#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */
#define NODEMASK_ALLOC(type, name, gfp_flags)        \
                        type *name = kmalloc(sizeof(*name), gfp_flags)
#define NODEMASK_FREE(m)                        kfree(m)
#else
#define NODEMASK_ALLOC(type, name, gfp_flags)        type _##name, *name = &_##name
#define NODEMASK_FREE(m)                        do {} while (0)
#endif

/*
 * Example structure for using NODEMASK_ALLOC, used in mempolicy.
 * Holds two independent temporary nodemasks.
 */
struct nodemask_scratch {
        nodemask_t        mask1;
        nodemask_t        mask2;
};

/*
 * NODEMASK_SCRATCH(x) declares 'x' as a pointer to a struct
 * nodemask_scratch through NODEMASK_ALLOC, passing
 * GFP_KERNEL | __GFP_NORETRY as the gfp flags; NODEMASK_SCRATCH_FREE(x)
 * is the matching NODEMASK_FREE().
 */
#define NODEMASK_SCRATCH(x)                                                \
                        NODEMASK_ALLOC(struct nodemask_scratch, x,        \
                                        GFP_KERNEL | __GFP_NORETRY)
#define NODEMASK_SCRATCH_FREE(x)        NODEMASK_FREE(x)


#endif /* __LINUX_NODEMASK_H */



































































































































































































































































































































































































































































































































    3 



    3 
    3 




































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/act_mirred.c        packet mirroring and redirect actions
 *
 * Authors:        Jamal Hadi Salim (2002-4)
 *
 * TODO: Add ingress support (and socket redirect support)
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/gfp.h>
#include <linux/if_arp.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
#include <net/tc_wrapper.h>

/*
 * List of mirred actions: entries are added in tcf_mirred_init() when an
 * action is created and removed in tcf_mirred_release(), both under
 * mirred_list_lock.
 */
static LIST_HEAD(mirred_list);
static DEFINE_SPINLOCK(mirred_list_lock);

/*
 * Per-CPU nesting counter and its limit.
 * NOTE(review): presumably used by tcf_mirred_act() to bound recursive
 * mirred invocations - the users are outside this part of the file.
 */
#define MIRRED_NEST_LIMIT    4
static DEFINE_PER_CPU(unsigned int, mirred_nest_level);

static bool tcf_mirred_is_act_redirect(int action)
{
        return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR;
}

/*
 * Tell whether @action targets the ingress side: true for the INGRESS
 * redirect/mirror actions, false for the EGRESS ones.  Any other value
 * hits BUG().
 */
static bool tcf_mirred_act_wants_ingress(int action)
{
        if (action == TCA_INGRESS_REDIR || action == TCA_INGRESS_MIRROR)
                return true;
        if (action == TCA_EGRESS_REDIR || action == TCA_EGRESS_MIRROR)
                return false;

        BUG();
}

/*
 * True when @action is one of TC_ACT_SHOT, TC_ACT_STOLEN, TC_ACT_QUEUED or
 * TC_ACT_TRAP; tcf_mirred_to_dev() uses this to decide whether it may
 * send the original skb instead of a clone.
 */
static bool tcf_mirred_can_reinsert(int action)
{
        return action == TC_ACT_SHOT ||
               action == TC_ACT_STOLEN ||
               action == TC_ACT_QUEUED ||
               action == TC_ACT_TRAP;
}

/*
 * Fetch m->tcfm_dev through rcu_dereference_protected(); the lockdep
 * condition states that m->tcf_lock is held by the caller.
 */
static struct net_device *tcf_mirred_dev_dereference(struct tcf_mirred *m)
{
        struct net_device *dev;

        dev = rcu_dereference_protected(m->tcfm_dev,
                                        lockdep_is_held(&m->tcf_lock));
        return dev;
}

/*
 * Release hook for a mirred action: unlink it from mirred_list under
 * mirred_list_lock, then drop the tracked reference on its target device.
 */
static void tcf_mirred_release(struct tc_action *a)
{
        struct tcf_mirred *m = to_mirred(a);

        spin_lock(&mirred_list_lock);
        list_del(&m->tcfm_list);
        spin_unlock(&mirred_list_lock);

        /* last reference to action, no need to lock */
        netdev_put(rcu_dereference_protected(m->tcfm_dev, 1),
                   &m->tcfm_dev_tracker);
}

/*
 * Netlink attribute policy used by tcf_mirred_init(): PARMS is sized for a
 * struct tc_mirred, BLOCKID is a u32 with a minimum value of 1.
 */
static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
        [TCA_MIRRED_PARMS]        = { .len = sizeof(struct tc_mirred) },
        [TCA_MIRRED_BLOCKID]        = NLA_POLICY_MIN(NLA_U32, 1),
};

/* Tentative definition; tcf_mirred_init() needs act_mirred_ops.net_id. */
static struct tc_action_ops act_mirred_ops;

/*
 * Install @ndev as the action's target device and drop the tracked
 * reference on the device it replaces.  The lockdep condition states
 * that m->tcf_lock is held by the caller.
 */
static void tcf_mirred_replace_dev(struct tcf_mirred *m,
                                   struct net_device *ndev)
{
        struct net_device *prev;

        prev = rcu_replace_pointer(m->tcfm_dev, ndev,
                                   lockdep_is_held(&m->tcf_lock));
        netdev_put(prev, &m->tcfm_dev_tracker);
}

/*
 * Create a mirred action, or update an existing one, from the netlink
 * attributes nested in @nla.
 *
 * The target is either a device (parm->ifindex) or a block
 * (TCA_MIRRED_BLOCKID); supplying both is rejected, and a new action
 * must supply one of them.
 *
 * Returns ACT_P_CREATED when a new action was created, ACT_P_BOUND when
 * binding to an already existing action, the non-negative result of the
 * attribute parse when an existing action was replaced, or a negative
 * errno (-EINVAL, -EEXIST, -ENODEV, or an error propagated from a helper).
 */
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
                           struct nlattr *est, struct tc_action **a,
                           struct tcf_proto *tp,
                           u32 flags, struct netlink_ext_ack *extack)
{
        struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id);
        bool bind = flags & TCA_ACT_FLAGS_BIND;
        struct nlattr *tb[TCA_MIRRED_MAX + 1];
        struct tcf_chain *goto_ch = NULL;
        bool mac_header_xmit = false;
        struct tc_mirred *parm;
        struct tcf_mirred *m;
        bool exists = false;
        int ret, err;
        u32 index;

        if (!nla) {
                NL_SET_ERR_MSG_MOD(extack, "Mirred requires attributes to be passed");
                return -EINVAL;
        }
        ret = nla_parse_nested_deprecated(tb, TCA_MIRRED_MAX, nla,
                                          mirred_policy, extack);
        if (ret < 0)
                return ret;
        if (!tb[TCA_MIRRED_PARMS]) {
                NL_SET_ERR_MSG_MOD(extack, "Missing required mirred parameters");
                return -EINVAL;
        }
        parm = nla_data(tb[TCA_MIRRED_PARMS]);
        index = parm->index;
        /* A positive return means an action with this index already exists. */
        err = tcf_idr_check_alloc(tn, &index, a, bind);
        if (err < 0)
                return err;
        exists = err;
        if (exists && bind)
                return ACT_P_BOUND;

        /*
         * From here on every failure path has to undo the idr step above:
         * tcf_idr_release() for an existing action, tcf_idr_cleanup() for
         * an index that has no action yet.
         */
        if (tb[TCA_MIRRED_BLOCKID] && parm->ifindex) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Cannot specify Block ID and dev simultaneously");
                if (exists)
                        tcf_idr_release(*a, bind);
                else
                        tcf_idr_cleanup(tn, index);

                return -EINVAL;
        }

        /* Only the four known mirror/redirect actions are accepted. */
        switch (parm->eaction) {
        case TCA_EGRESS_MIRROR:
        case TCA_EGRESS_REDIR:
        case TCA_INGRESS_REDIR:
        case TCA_INGRESS_MIRROR:
                break;
        default:
                if (exists)
                        tcf_idr_release(*a, bind);
                else
                        tcf_idr_cleanup(tn, index);
                NL_SET_ERR_MSG_MOD(extack, "Unknown mirred option");
                return -EINVAL;
        }

        if (!exists) {
                if (!parm->ifindex && !tb[TCA_MIRRED_BLOCKID]) {
                        tcf_idr_cleanup(tn, index);
                        NL_SET_ERR_MSG_MOD(extack,
                                           "Must specify device or block");
                        return -EINVAL;
                }
                ret = tcf_idr_create_from_flags(tn, index, est, a,
                                                &act_mirred_ops, bind, flags);
                if (ret) {
                        tcf_idr_cleanup(tn, index);
                        return ret;
                }
                ret = ACT_P_CREATED;
        } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
                /* Existing action, but the caller did not ask to replace it. */
                tcf_idr_release(*a, bind);
                return -EEXIST;
        }

        m = to_mirred(*a);
        if (ret == ACT_P_CREATED)
                INIT_LIST_HEAD(&m->tcfm_list);

        err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
        if (err < 0)
                goto release_idr;

        /* Target and control action are updated together under tcf_lock. */
        spin_lock_bh(&m->tcf_lock);

        if (parm->ifindex) {
                struct net_device *ndev;

                ndev = dev_get_by_index(net, parm->ifindex);
                if (!ndev) {
                        spin_unlock_bh(&m->tcf_lock);
                        err = -ENODEV;
                        goto put_chain;
                }
                mac_header_xmit = dev_is_mac_header_xmit(ndev);
                /* Drops the reference on the old device before tracking the new one. */
                tcf_mirred_replace_dev(m, ndev);
                netdev_tracker_alloc(ndev, &m->tcfm_dev_tracker, GFP_ATOMIC);
                m->tcfm_mac_header_xmit = mac_header_xmit;
                m->tcfm_blockid = 0;
        } else if (tb[TCA_MIRRED_BLOCKID]) {
                /* Block target: no device is kept on the action. */
                tcf_mirred_replace_dev(m, NULL);
                m->tcfm_mac_header_xmit = false;
                m->tcfm_blockid = nla_get_u32(tb[TCA_MIRRED_BLOCKID]);
        }
        /* goto_ch now holds whatever tcf_action_set_ctrlact() handed back. */
        goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
        m->tcfm_eaction = parm->eaction;
        spin_unlock_bh(&m->tcf_lock);
        if (goto_ch)
                tcf_chain_put_by_act(goto_ch);

        /* Only newly created actions are linked into mirred_list. */
        if (ret == ACT_P_CREATED) {
                spin_lock(&mirred_list_lock);
                list_add(&m->tcfm_list, &mirred_list);
                spin_unlock(&mirred_list_lock);
        }

        return ret;
put_chain:
        if (goto_ch)
                tcf_chain_put_by_act(goto_ch);
release_idr:
        tcf_idr_release(*a, bind);
        return err;
}

/*
 * Hand @skb to the stack in the requested direction and return the
 * callee's result:
 *  - egress wanted:                 tcf_dev_queue_xmit() with dev_queue_xmit
 *  - ingress wanted, not at ingress: netif_rx()
 *  - ingress wanted, at ingress:     netif_receive_skb()
 */
static int
tcf_mirred_forward(bool at_ingress, bool want_ingress, struct sk_buff *skb)
{
        if (!want_ingress)
                return tcf_dev_queue_xmit(skb, dev_queue_xmit);

        if (!at_ingress)
                return netif_rx(skb);

        return netif_receive_skb(skb);
}

/*
 * Mirror or redirect @skb to @dev.
 *
 * @m_mac_header_xmit describes the target device, @m_eaction selects
 * mirror/redirect and ingress/egress, and @retval is the verdict to return
 * by default.
 *
 * Returns @retval, except that:
 *  - a redirect that sends the original skb (no clone) returns
 *    TC_ACT_CONSUMED;
 *  - a redirect that fails before forwarding (device down or no carrier,
 *    clone failure) returns TC_ACT_SHOT.
 * The action's overlimit counter is bumped on those early failures and
 * whenever tcf_mirred_forward() reports an error.
 */
static int tcf_mirred_to_dev(struct sk_buff *skb, struct tcf_mirred *m,
                             struct net_device *dev,
                             const bool m_mac_header_xmit, int m_eaction,
                             int retval)
{
        const bool is_redirect = tcf_mirred_is_act_redirect(m_eaction);
        struct sk_buff *skb_to_send = skb;
        bool want_ingress;
        bool expects_nh;
        bool at_ingress;
        bool at_nh;
        int err;

        if (unlikely(!(dev->flags & IFF_UP)) || !netif_carrier_ok(dev)) {
                net_notice_ratelimited("tc mirred to Houston: device %s is down\n",
                                       dev->name);
                goto err_cant_do;
        }

        /* we could easily avoid the clone only if called by ingress and clsact;
         * since we can't easily detect the clsact caller, skip clone only for
         * ingress - that covers the TC S/W datapath.
         */
        at_ingress = skb_at_tc_ingress(skb);
        if (!(at_ingress && is_redirect && tcf_mirred_can_reinsert(retval))) {
                skb_to_send = skb_clone(skb, GFP_ATOMIC);
                if (!skb_to_send)
                        goto err_cant_do;
        }

        want_ingress = tcf_mirred_act_wants_ingress(m_eaction);

        /* All mirred/redirected skbs should clear previous ct info */
        nf_reset_ct(skb_to_send);
        if (want_ingress && !at_ingress) /* drop dst for egress -> ingress */
                skb_dst_drop(skb_to_send);

        /* Move the data pointer if the target expects a different header. */
        expects_nh = want_ingress || !m_mac_header_xmit;
        at_nh = skb->data == skb_network_header(skb);
        if (at_nh != expects_nh) {
                int mac_len = at_ingress ? skb->mac_len :
                              skb_network_offset(skb);

                if (expects_nh)
                        /* target device/action expect data at nh */
                        skb_pull_rcsum(skb_to_send, mac_len);
                else
                        /* target device/action expect data at mac */
                        skb_push_rcsum(skb_to_send, mac_len);
        }

        skb_to_send->skb_iif = skb->dev->ifindex;
        skb_to_send->dev = dev;

        if (is_redirect) {
                /* The original skb itself is being sent: caller must let go. */
                if (skb == skb_to_send)
                        retval = TC_ACT_CONSUMED;

                skb_set_redirected(skb_to_send, skb_to_send->tc_at_ingress);
        }

        err = tcf_mirred_forward(at_ingress, want_ingress, skb_to_send);
        if (err)
                tcf_action_inc_overlimit_qstats(&m->common);

        return retval;

err_cant_do:
        if (is_redirect)
                retval = TC_ACT_SHOT;
        tcf_action_inc_overlimit_qstats(&m->common);
        return retval;
}

/*
 * Redirect @skb to every port of @block except @exception_ifindex.
 *
 * All ports but the last one visited receive the packet through the
 * matching MIRROR action; the last one is handled with the real redirect
 * action @m_eaction, and its result becomes the return value.  To do so
 * the loop always sends to the previously seen port and keeps the current
 * one pending.
 *
 * The mac-header property passed to tcf_mirred_to_dev() must describe the
 * device the packet is actually sent to, i.e. the pending port dev_prev,
 * not the port the iteration has just reached.
 *
 * Returns @retval unchanged when no eligible port exists.
 */
static int tcf_blockcast_redir(struct sk_buff *skb, struct tcf_mirred *m,
                               struct tcf_block *block, int m_eaction,
                               const u32 exception_ifindex, int retval)
{
        struct net_device *dev_prev = NULL;
        struct net_device *dev = NULL;
        unsigned long index;
        int mirred_eaction;

        mirred_eaction = tcf_mirred_act_wants_ingress(m_eaction) ?
                TCA_INGRESS_MIRROR : TCA_EGRESS_MIRROR;

        xa_for_each(&block->ports, index, dev) {
                if (index == exception_ifindex)
                        continue;

                /* Flush the pending port as a mirror before replacing it. */
                if (dev_prev)
                        tcf_mirred_to_dev(skb, m, dev_prev,
                                          dev_is_mac_header_xmit(dev_prev),
                                          mirred_eaction, retval);

                dev_prev = dev;
        }

        /* The last eligible port gets the actual redirect. */
        if (dev_prev)
                return tcf_mirred_to_dev(skb, m, dev_prev,
                                         dev_is_mac_header_xmit(dev_prev),
                                         m_eaction, retval);

        return retval;
}

/*
 * Mirror @skb to each port of @block other than @exception_ifindex, using
 * @m_eaction as-is. The per-port results are ignored and @retval is handed
 * back untouched.
 */
static int tcf_blockcast_mirror(struct sk_buff *skb, struct tcf_mirred *m,
                                struct tcf_block *block, int m_eaction,
                                const u32 exception_ifindex, int retval)
{
        struct net_device *port = NULL;
        unsigned long ifindex;

        xa_for_each(&block->ports, ifindex, port) {
                /* Every port except the excluded ifindex gets a copy */
                if (ifindex != exception_ifindex)
                        tcf_mirred_to_dev(skb, m, port,
                                          dev_is_mac_header_xmit(port),
                                          m_eaction, retval);
        }

        return retval;
}

/*
 * Deliver @skb to the ports of the tc block identified by @blockid, leaving
 * out the device the packet is currently attached to. A missing block or a
 * block without ports is counted as an overlimit and @retval is returned.
 */
static int tcf_blockcast(struct sk_buff *skb, struct tcf_mirred *m,
                         const u32 blockid, struct tcf_result *res,
                         int retval)
{
        const u32 exception_ifindex = skb->dev->ifindex;
        const int m_eaction = READ_ONCE(m->tcfm_eaction);
        struct tcf_block *block;

        /* The caller already holds rcu protection, so look the block up
         * directly.
         */
        block = tcf_block_lookup(dev_net(skb->dev), blockid);
        if (!block || xa_empty(&block->ports)) {
                tcf_action_inc_overlimit_qstats(&m->common);
                return retval;
        }

        /* Anything that is not a redirect is a mirror */
        if (!tcf_mirred_is_act_redirect(m_eaction))
                return tcf_blockcast_mirror(skb, m, block, m_eaction,
                                            exception_ifindex, retval);

        return tcf_blockcast_redir(skb, m, block, m_eaction,
                                   exception_ifindex, retval);
}

/*
 * Action entry point: mirror or redirect @skb either to a whole tc block
 * (when a block id is configured) or to the single configured device.
 *
 * A per-CPU nesting counter bounds recursion; exceeding MIRRED_NEST_LIMIT
 * drops the packet with TC_ACT_SHOT. If the target device has gone away the
 * configured action verdict is returned and an overlimit is accounted.
 */
TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb,
                                     const struct tc_action *a,
                                     struct tcf_result *res)
{
        struct tcf_mirred *m = to_mirred(a);
        int retval = READ_ONCE(m->tcf_action);
        unsigned int nest_level;
        u32 blockid;

        nest_level = __this_cpu_inc_return(mirred_nest_level);
        if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
                net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
                                     netdev_name(skb->dev));
                retval = TC_ACT_SHOT;
                goto dec_nest_level;
        }

        tcf_lastuse_update(&m->tcf_tm);
        tcf_action_update_bstats(&m->common, skb);

        blockid = READ_ONCE(m->tcfm_blockid);
        if (blockid) {
                /* Block mode: fan out to the block's ports */
                retval = tcf_blockcast(skb, m, blockid, res, retval);
        } else {
                struct net_device *dev = rcu_dereference_bh(m->tcfm_dev);

                if (likely(dev)) {
                        bool m_mac_header_xmit;
                        int m_eaction;

                        m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
                        m_eaction = READ_ONCE(m->tcfm_eaction);

                        retval = tcf_mirred_to_dev(skb, m, dev,
                                                   m_mac_header_xmit,
                                                   m_eaction, retval);
                } else {
                        pr_notice_once("tc mirred: target device is gone\n");
                        tcf_action_inc_overlimit_qstats(&m->common);
                }
        }

dec_nest_level:
        __this_cpu_dec(mirred_nest_level);

        return retval;
}

/*
 * Fold externally reported counters into the action's statistics and move
 * the last-use timestamp forward if @lastuse is newer than the stored one.
 */
static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
                             u64 drops, u64 lastuse, bool hw)
{
        struct tcf_mirred *m = to_mirred(a);

        tcf_action_update_stats(a, bytes, packets, drops, hw);

        /* Never let the timestamp go backwards */
        m->tcf_tm.lastuse = max_t(u64, m->tcf_tm.lastuse, lastuse);
}

static int tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind,
                           int ref)
{
        unsigned char *b = skb_tail_pointer(skb);
        struct tcf_mirred *m = to_mirred(a);
        struct tc_mirred opt = {
                .index   = m->tcf_index,
                .refcnt  = refcount_read(&m->tcf_refcnt) - ref,
                .bindcnt = atomic_read(&m->tcf_bindcnt) - bind,
        };
        struct net_device *dev;
        struct tcf_t t;
        u32 blockid;

        spin_lock_bh(&m->tcf_lock);
        opt.action = m->tcf_action;
        opt.eaction = m->tcfm_eaction;
        dev = tcf_mirred_dev_dereference(m);
        if (dev)
                opt.ifindex = dev->ifindex;

        if (nla_put(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;

        blockid = m->tcfm_blockid;
        if (blockid && nla_put_u32(skb, TCA_MIRRED_BLOCKID, blockid))
                goto nla_put_failure;

        tcf_tm_dump(&t, &m->tcf_tm);
        if (nla_put_64bit(skb, TCA_MIRRED_TM, sizeof(t), &t, TCA_MIRRED_PAD))
                goto nla_put_failure;
        spin_unlock_bh(&m->tcf_lock);

        return skb->len;

nla_put_failure:
        spin_unlock_bh(&m->tcf_lock);
        nlmsg_trim(skb, b);
        return -1;
}

/*
 * Netdevice notifier: when a device unregisters, walk all mirred actions
 * and detach the ones targeting it, dropping the reference each of them
 * holds. All other events are ignored. Always returns NOTIFY_DONE.
 */
static int mirred_device_event(struct notifier_block *unused,
                               unsigned long event, void *ptr)
{
        struct net_device *gone = netdev_notifier_info_to_dev(ptr);
        struct tcf_mirred *act;

        ASSERT_RTNL();
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        spin_lock(&mirred_list_lock);
        list_for_each_entry(act, &mirred_list, tcfm_list) {
                spin_lock_bh(&act->tcf_lock);
                if (tcf_mirred_dev_dereference(act) == gone) {
                        netdev_put(gone, &act->tcfm_dev_tracker);
                        /* Note : no rcu grace period necessary, as
                         * net_device are already rcu protected.
                         */
                        RCU_INIT_POINTER(act->tcfm_dev, NULL);
                }
                spin_unlock_bh(&act->tcf_lock);
        }
        spin_unlock(&mirred_list_lock);

        return NOTIFY_DONE;
}

/* Netdevice notifier hooking mirred_device_event() into device events. */
static struct notifier_block mirred_device_notifier = {
        .notifier_call = mirred_device_event,
};

/*
 * Destructor handed out by tcf_mirred_get_dev(): @priv is the net_device
 * whose reference is released here.
 */
static void tcf_mirred_dev_put(void *priv)
{
        dev_put((struct net_device *)priv);
}

/*
 * Return the action's target device with a reference held, or NULL when no
 * device is attached. When a device is returned, *@destructor is set to the
 * function that releases that reference; otherwise it is left untouched.
 */
static struct net_device *
tcf_mirred_get_dev(const struct tc_action *a,
                   tc_action_priv_destructor *destructor)
{
        struct tcf_mirred *m = to_mirred(a);
        struct net_device *target;

        rcu_read_lock();
        target = rcu_dereference(m->tcfm_dev);
        if (!target)
                goto out;

        dev_hold(target);
        *destructor = tcf_mirred_dev_put;
out:
        rcu_read_unlock();

        return target;
}

/*
 * Report the netlink space needed for one attribute carrying a
 * struct tc_mirred.
 *
 * NOTE(review): tcf_mirred_dump() can additionally emit TCA_MIRRED_BLOCKID
 * and TCA_MIRRED_TM; confirm whether those are accounted for elsewhere.
 */
static size_t tcf_mirred_get_fill_size(const struct tc_action *act)
{
        const size_t parms_len = sizeof(struct tc_mirred);

        return nla_total_size(parms_len);
}

/*
 * Fill @entry->dev through the action's get_dev() callback, which also
 * stores the destructor. When a device was obtained it doubles as the
 * destructor's private argument.
 */
static void tcf_offload_mirred_get_dev(struct flow_action_entry *entry,
                                       const struct tc_action *act)
{
        entry->dev = act->ops->get_dev(act, &entry->destructor);
        if (entry->dev)
                entry->destructor_priv = entry->dev;
}

static int tcf_mirred_offload_act_setup(struct tc_action *act, void *entry_data,
                                        u32 *index_inc, bool bind,
                                        struct netlink_ext_ack *extack)
{
        if (bind) {
                struct flow_action_entry *entry = entry_data;

                if (is_tcf_mirred_egress_redirect(act)) {
                        entry->id = FLOW_ACTION_REDIRECT;
                        tcf_offload_mirred_get_dev(entry, act);
                } else if (is_tcf_mirred_egress_mirror(act)) {
                        entry->id = FLOW_ACTION_MIRRED;
                        tcf_offload_mirred_get_dev(entry, act);
                } else if (is_tcf_mirred_ingress_redirect(act)) {
                        entry->id = FLOW_ACTION_REDIRECT_INGRESS;
                        tcf_offload_mirred_get_dev(entry, act);
                } else if (is_tcf_mirred_ingress_mirror(act)) {
                        entry->id = FLOW_ACTION_MIRRED_INGRESS;
                        tcf_offload_mirred_get_dev(entry, act);
                } else {
                        NL_SET_ERR_MSG_MOD(extack, "Unsupported mirred offload");
                        return -EOPNOTSUPP;
                }
                *index_inc = 1;
        } else {
                struct flow_offload_action *fl_action = entry_data;

                if (is_tcf_mirred_egress_redirect(act))
                        fl_action->id = FLOW_ACTION_REDIRECT;
                else if (is_tcf_mirred_egress_mirror(act))
                        fl_action->id = FLOW_ACTION_MIRRED;
                else if (is_tcf_mirred_ingress_redirect(act))
                        fl_action->id = FLOW_ACTION_REDIRECT_INGRESS;
                else if (is_tcf_mirred_ingress_mirror(act))
                        fl_action->id = FLOW_ACTION_MIRRED_INGRESS;
                else
                        return -EOPNOTSUPP;
        }

        return 0;
}

/*
 * Operations table for the "mirred" action, wiring the handlers defined in
 * this file into the tc action framework.
 */
static struct tc_action_ops act_mirred_ops = {
        .kind                =        "mirred",
        .id                =        TCA_ID_MIRRED,
        .owner                =        THIS_MODULE,
        .act                =        tcf_mirred_act,
        .stats_update        =        tcf_stats_update,
        .dump                =        tcf_mirred_dump,
        .cleanup        =        tcf_mirred_release,
        .init                =        tcf_mirred_init,
        .get_fill_size        =        tcf_mirred_get_fill_size,
        .offload_act_setup =        tcf_mirred_offload_act_setup,
        .size                =        sizeof(struct tcf_mirred),
        .get_dev        =        tcf_mirred_get_dev,
};
MODULE_ALIAS_NET_ACT("mirred");

/* Per-netns setup: initialise this namespace's mirred action state. */
static __net_init int mirred_init_net(struct net *net)
{
        return tc_action_net_init(net,
                                  net_generic(net, act_mirred_ops.net_id),
                                  &act_mirred_ops);
}

/* Batched per-netns teardown for every namespace on @net_list. */
static void __net_exit mirred_exit_net(struct list_head *net_list)
{
        tc_action_net_exit(net_list, act_mirred_ops.net_id);
}

/*
 * Per-network-namespace hooks for the mirred action; each namespace gets a
 * struct tc_action_net, looked up through act_mirred_ops.net_id.
 */
static struct pernet_operations mirred_net_ops = {
        .init = mirred_init_net,
        .exit_batch = mirred_exit_net,
        .id   = &act_mirred_ops.net_id,
        .size = sizeof(struct tc_action_net),
};

MODULE_AUTHOR("Jamal Hadi Salim(2002)");
MODULE_DESCRIPTION("Device Mirror/redirect actions");
MODULE_LICENSE("GPL");

/*
 * Module init: register the netdevice notifier first, then the action.
 * If registering the action fails the notifier is removed again.
 */
static int __init mirred_init_module(void)
{
        int err;

        err = register_netdevice_notifier(&mirred_device_notifier);
        if (err)
                return err;

        pr_info("Mirror/redirect action on\n");

        err = tcf_register_action(&act_mirred_ops, &mirred_net_ops);
        if (!err)
                return 0;

        /* Undo the notifier registration on failure */
        unregister_netdevice_notifier(&mirred_device_notifier);
        return err;
}

/* Module exit: tear down in the reverse order of mirred_init_module(). */
static void __exit mirred_cleanup_module(void)
{
        tcf_unregister_action(&act_mirred_ops, &mirred_net_ops);
        unregister_netdevice_notifier(&mirred_device_notifier);
}

module_init(mirred_init_module);
module_exit(mirred_cleanup_module);































































































  157 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_ENTRYKVM_H
#define __LINUX_ENTRYKVM_H

#include <linux/static_call_types.h>
#include <linux/resume_user_mode.h>
#include <linux/syscalls.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
#include <linux/tick.h>

/* Transfer to guest mode work */
#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK

#ifndef ARCH_XFER_TO_GUEST_MODE_WORK
# define ARCH_XFER_TO_GUEST_MODE_WORK        (0)
#endif

#define XFER_TO_GUEST_MODE_WORK                                                \
        (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
         _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME |                        \
         ARCH_XFER_TO_GUEST_MODE_WORK)

struct kvm_vcpu;

/**
 * arch_xfer_to_guest_mode_handle_work - Architecture specific xfer to guest
 *                                         mode work handling function.
 * @vcpu:        Pointer to current's VCPU data
 * @ti_work:        Cached TIF flags gathered in xfer_to_guest_mode_handle_work()
 *
 * Invoked from xfer_to_guest_mode_handle_work(). Defaults to NOOP. Can be
 * replaced by architecture specific code.
 */
static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
                                                      unsigned long ti_work);

/*
 * Default implementation: nothing to do, report success. It is compiled
 * out when arch_xfer_to_guest_mode_work is defined.
 * NOTE(review): the guard macro name lacks the "_handle" part of the
 * function it protects - confirm this is the intended override hook.
 */
#ifndef arch_xfer_to_guest_mode_work
static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
                                                      unsigned long ti_work)
{
        return 0;
}
#endif

/**
 * xfer_to_guest_mode_handle_work - Check and handle pending work which needs
 *                                    to be handled before going to guest mode
 * @vcpu:        Pointer to current's VCPU data
 *
 * Returns: 0 or an error code
 */
int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);

/**
 * xfer_to_guest_mode_prepare - Perform last minute preparation work that
 *                                needs to be handled while IRQs are disabled
 *                                upon entering the guest.
 *
 * Has to be invoked with interrupts disabled before the last call
 * to xfer_to_guest_mode_work_pending().
 */
static inline void xfer_to_guest_mode_prepare(void)
{
        /* Flag callers that did not disable interrupts beforehand */
        lockdep_assert_irqs_disabled();
        tick_nohz_user_enter_prepare();
}

/**
 * __xfer_to_guest_mode_work_pending - Check if work is pending
 *
 * Returns: True if work pending, False otherwise.
 *
 * Bare variant of xfer_to_guest_mode_work_pending(). Can be called from
 * interrupt enabled code for racy quick checks with care.
 */
static inline bool __xfer_to_guest_mode_work_pending(void)
{
        /* Any bit of the work mask set in the thread flags means work */
        return (read_thread_flags() & XFER_TO_GUEST_MODE_WORK) != 0;
}

/**
 * xfer_to_guest_mode_work_pending - Check if work is pending which needs to be
 *                                     handled before returning to guest mode
 *
 * Returns: True if work pending, False otherwise.
 *
 * Has to be invoked with interrupts disabled before the transition to
 * guest mode.
 */
static inline bool xfer_to_guest_mode_work_pending(void)
{
        /* Same check as the bare variant, plus the IRQs-off assertion */
        lockdep_assert_irqs_disabled();
        return __xfer_to_guest_mode_work_pending();
}
#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */

#endif






















  220 



  220 

  218 




  220 





























  220 


  220 




  219 




  220 



  220 
    3 

  220 





  220 




   14 













   14 

















    7 













    7 

















   97 















   97 



























  161 






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <linux/irqflags.h>

#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/tlbflush.h>

/*
 * State captured by enter_vmid_context() so that exit_vmid_context() can
 * put things back the way they were.
 */
struct tlb_inv_context {
        /* Stage-2 MMU to reload on exit, or NULL if no reload is needed */
        struct kvm_s2_mmu        *mmu;
        /* Interrupt state saved by local_irq_save() */
        unsigned long                flags;
        /* EL1 TCR and SCTLR, saved only under the speculative AT workaround */
        u64                        tcr;
        u64                        sctlr;
};

/*
 * Disable interrupts and switch this CPU to @mmu's stage-2 context with
 * HCR_EL2.TGE cleared, recording in @cxt everything exit_vmid_context()
 * has to restore. Must be paired with exit_vmid_context().
 */
static void enter_vmid_context(struct kvm_s2_mmu *mmu,
                               struct tlb_inv_context *cxt)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
        u64 val;

        local_irq_save(cxt->flags);

        /*
         * Only remember the running vCPU's stage-2 MMU when it differs from
         * the one being switched to; otherwise nothing needs reloading.
         */
        if (vcpu && mmu != vcpu->arch.hw_mmu)
                cxt->mmu = vcpu->arch.hw_mmu;
        else
                cxt->mmu = NULL;

        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                /*
                 * For CPUs that are affected by ARM errata 1165522 or 1530923,
                 * we cannot trust stage-1 to be in a correct state at that
                 * point. Since we do not want to force a full load of the
                 * vcpu state, we prevent the EL1 page-table walker to
                 * allocate new TLBs. This is done by setting the EPD bits
                 * in the TCR_EL1 register. We also need to prevent it to
                 * allocate IPA->PA walks, so we enable the S1 MMU...
                 */
                val = cxt->tcr = read_sysreg_el1(SYS_TCR);
                val |= TCR_EPD1_MASK | TCR_EPD0_MASK;
                write_sysreg_el1(val, SYS_TCR);
                val = cxt->sctlr = read_sysreg_el1(SYS_SCTLR);
                val |= SCTLR_ELx_M;
                write_sysreg_el1(val, SYS_SCTLR);
        }

        /*
         * With VHE enabled, we have HCR_EL2.{E2H,TGE} = {1,1}, and
         * most TLB operations target EL2/EL0. In order to affect the
         * guest TLBs (EL1/EL0), we need to change one of these two
         * bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
         * let's flip TGE before executing the TLB operation.
         *
         * ARM erratum 1165522 requires some special handling (again),
         * as we need to make sure both stages of translation are in
         * place before clearing TGE. __load_stage2() already
         * has an ISB in order to deal with this.
         */
        __load_stage2(mmu, mmu->arch);
        val = read_sysreg(hcr_el2);
        val &= ~HCR_TGE;
        write_sysreg_hcr(val);
        isb();
}

/*
 * Undo enter_vmid_context(): restore the host HCR_EL2 value, reload the
 * stage-2 MMU recorded in @cxt (if any), restore the EL1 registers touched
 * by the speculative AT workaround, and finally restore interrupt state.
 */
static void exit_vmid_context(struct tlb_inv_context *cxt)
{
        /*
         * We're done with the TLB operation, let's restore the host's
         * view of HCR_EL2.
         */
        write_sysreg_hcr(HCR_HOST_VHE_FLAGS);
        isb();

        /* ... and the stage-2 MMU context that we switched away from */
        if (cxt->mmu)
                __load_stage2(cxt->mmu, cxt->mmu->arch);

        if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                /* Restore the registers to what they were */
                write_sysreg_el1(cxt->tcr, SYS_TCR);
                write_sysreg_el1(cxt->sctlr, SYS_SCTLR);
        }

        local_irq_restore(cxt->flags);
}

/*
 * Invalidate the stage-2 TLB entry for @ipa (with @level passed as a hint
 * to __tlbi_level()) and then all of stage-1, in @mmu's VMID context,
 * using the "is" forms of the TLBI operations and dsb(ish) barriers.
 */
void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu,
                              phys_addr_t ipa, int level)
{
        struct tlb_inv_context cxt;

        dsb(ishst);

        /* Switch to requested VMID */
        enter_vmid_context(mmu, &cxt);

        /*
         * We could do so much better if we had the VA as well.
         * Instead, we invalidate Stage-2 for this IPA, and the
         * whole of Stage-1. Weep...
         */
        /* The TLBI operand is the IPA shifted right by 12 bits */
        ipa >>= 12;
        __tlbi_level(ipas2e1is, ipa, level);

        /*
         * We have to ensure completion of the invalidation at Stage-2,
         * since a table walk on another CPU could refill a TLB with a
         * complete (S1 + S2) walk based on the old Stage-2 mapping if
         * the Stage-1 invalidation happened first.
         */
        dsb(ish);
        __tlbi(vmalle1is);
        dsb(ish);
        isb();

        exit_vmid_context(&cxt);
}

/*
 * Same sequence as __kvm_tlb_flush_vmid_ipa(), but built from the "nsh"
 * barriers and the ipas2e1 / vmalle1 TLBI operations instead of their
 * "is" counterparts.
 */
void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu,
                                  phys_addr_t ipa, int level)
{
        struct tlb_inv_context cxt;

        dsb(nshst);

        /* Switch to requested VMID */
        enter_vmid_context(mmu, &cxt);

        /*
         * We could do so much better if we had the VA as well.
         * Instead, we invalidate Stage-2 for this IPA, and the
         * whole of Stage-1. Weep...
         */
        /* The TLBI operand is the IPA shifted right by 12 bits */
        ipa >>= 12;
        __tlbi_level(ipas2e1, ipa, level);

        /*
         * We have to ensure completion of the invalidation at Stage-2,
         * since a table walk on another CPU could refill a TLB with a
         * complete (S1 + S2) walk based on the old Stage-2 mapping if
         * the Stage-1 invalidation happened first.
         */
        dsb(nsh);
        __tlbi(vmalle1);
        dsb(nsh);
        isb();

        exit_vmid_context(&cxt);
}

/*
 * Invalidate stage-2 entries for @pages pages starting at @start (rounded
 * down to a PAGE_SIZE boundary), followed by all of stage-1, in @mmu's
 * VMID context.
 */
void __kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
                                phys_addr_t start, unsigned long pages)
{
        struct tlb_inv_context cxt;
        unsigned long stride;

        /*
         * Since the range of addresses may not be mapped at
         * the same level, assume the worst case as PAGE_SIZE
         */
        stride = PAGE_SIZE;
        start = round_down(start, stride);

        dsb(ishst);

        /* Switch to requested VMID */
        enter_vmid_context(mmu, &cxt);

        /* The mapping level is not known here, hence TLBI_TTL_UNKNOWN */
        __flush_s2_tlb_range_op(ipas2e1is, start, pages, stride,
                                TLBI_TTL_UNKNOWN);

        dsb(ish);
        __tlbi(vmalle1is);
        dsb(ish);
        isb();

        exit_vmid_context(&cxt);
}

/*
 * Issue a single vmalls12e1is TLBI in @mmu's VMID context, bracketed by
 * the usual barriers.
 */
void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu)
{
        struct tlb_inv_context cxt;

        dsb(ishst);

        /* Switch to requested VMID */
        enter_vmid_context(mmu, &cxt);

        __tlbi(vmalls12e1is);
        dsb(ish);
        isb();

        exit_vmid_context(&cxt);
}

/*
 * In @mmu's VMID context, issue a vmalle1 TLBI followed by "ic iallu",
 * completed with dsb(nsh) and isb().
 */
void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu)
{
        struct tlb_inv_context cxt;

        /* Switch to requested VMID */
        enter_vmid_context(mmu, &cxt);

        __tlbi(vmalle1);
        asm volatile("ic iallu");
        dsb(nsh);
        isb();

        exit_vmid_context(&cxt);
}

/*
 * Issue an alle1is TLBI between dsb(ishst) and dsb(ish). No VMID context
 * switch is performed, unlike the per-MMU helpers in this file.
 */
void __kvm_flush_vm_context(void)
{
        dsb(ishst);
        __tlbi(alle1is);
        dsb(ish);
}

/*
 * TLB invalidation emulation for NV. For any given instruction, we
 * perform the following transformations:
 *
 * - a TLBI targeting EL2 S1 is remapped to EL1 S1
 * - a non-shareable TLBI is upgraded to being inner-shareable
 * - an outer-shareable TLBI is also mapped to inner-shareable
 * - an nXS TLBI is upgraded to XS
 */
int __kvm_tlbi_s1e2(struct kvm_s2_mmu *mmu, u64 va, u64 sys_encoding)
{
        struct tlb_inv_context cxt;
        int ret = 0;

        /*
         * The guest will have provided its own DSB ISHST before trapping.
         * If it hasn't, that's its own problem, and we won't paper over it
         * (plus, there is plenty of extra synchronisation before we even
         * get here...).
         */

        /* @mmu may be NULL, in which case no VMID switch takes place */
        if (mmu)
                enter_vmid_context(mmu, &cxt);

        /*
         * Each group of encodings below is served by one "is" TLBI
         * operation, with @va passed through as the operand where the
         * operation takes one.
         */
        switch (sys_encoding) {
        case OP_TLBI_ALLE2:
        case OP_TLBI_ALLE2IS:
        case OP_TLBI_ALLE2OS:
        case OP_TLBI_VMALLE1:
        case OP_TLBI_VMALLE1IS:
        case OP_TLBI_VMALLE1OS:
        case OP_TLBI_ALLE2NXS:
        case OP_TLBI_ALLE2ISNXS:
        case OP_TLBI_ALLE2OSNXS:
        case OP_TLBI_VMALLE1NXS:
        case OP_TLBI_VMALLE1ISNXS:
        case OP_TLBI_VMALLE1OSNXS:
                __tlbi(vmalle1is);
                break;
        case OP_TLBI_VAE2:
        case OP_TLBI_VAE2IS:
        case OP_TLBI_VAE2OS:
        case OP_TLBI_VAE1:
        case OP_TLBI_VAE1IS:
        case OP_TLBI_VAE1OS:
        case OP_TLBI_VAE2NXS:
        case OP_TLBI_VAE2ISNXS:
        case OP_TLBI_VAE2OSNXS:
        case OP_TLBI_VAE1NXS:
        case OP_TLBI_VAE1ISNXS:
        case OP_TLBI_VAE1OSNXS:
                __tlbi(vae1is, va);
                break;
        case OP_TLBI_VALE2:
        case OP_TLBI_VALE2IS:
        case OP_TLBI_VALE2OS:
        case OP_TLBI_VALE1:
        case OP_TLBI_VALE1IS:
        case OP_TLBI_VALE1OS:
        case OP_TLBI_VALE2NXS:
        case OP_TLBI_VALE2ISNXS:
        case OP_TLBI_VALE2OSNXS:
        case OP_TLBI_VALE1NXS:
        case OP_TLBI_VALE1ISNXS:
        case OP_TLBI_VALE1OSNXS:
                __tlbi(vale1is, va);
                break;
        case OP_TLBI_ASIDE1:
        case OP_TLBI_ASIDE1IS:
        case OP_TLBI_ASIDE1OS:
        case OP_TLBI_ASIDE1NXS:
        case OP_TLBI_ASIDE1ISNXS:
        case OP_TLBI_ASIDE1OSNXS:
                __tlbi(aside1is, va);
                break;
        case OP_TLBI_VAAE1:
        case OP_TLBI_VAAE1IS:
        case OP_TLBI_VAAE1OS:
        case OP_TLBI_VAAE1NXS:
        case OP_TLBI_VAAE1ISNXS:
        case OP_TLBI_VAAE1OSNXS:
                __tlbi(vaae1is, va);
                break;
        case OP_TLBI_VAALE1:
        case OP_TLBI_VAALE1IS:
        case OP_TLBI_VAALE1OS:
        case OP_TLBI_VAALE1NXS:
        case OP_TLBI_VAALE1ISNXS:
        case OP_TLBI_VAALE1OSNXS:
                __tlbi(vaale1is, va);
                break;
        case OP_TLBI_RVAE2:
        case OP_TLBI_RVAE2IS:
        case OP_TLBI_RVAE2OS:
        case OP_TLBI_RVAE1:
        case OP_TLBI_RVAE1IS:
        case OP_TLBI_RVAE1OS:
        case OP_TLBI_RVAE2NXS:
        case OP_TLBI_RVAE2ISNXS:
        case OP_TLBI_RVAE2OSNXS:
        case OP_TLBI_RVAE1NXS:
        case OP_TLBI_RVAE1ISNXS:
        case OP_TLBI_RVAE1OSNXS:
                __tlbi(rvae1is, va);
                break;
        case OP_TLBI_RVALE2:
        case OP_TLBI_RVALE2IS:
        case OP_TLBI_RVALE2OS:
        case OP_TLBI_RVALE1:
        case OP_TLBI_RVALE1IS:
        case OP_TLBI_RVALE1OS:
        case OP_TLBI_RVALE2NXS:
        case OP_TLBI_RVALE2ISNXS:
        case OP_TLBI_RVALE2OSNXS:
        case OP_TLBI_RVALE1NXS:
        case OP_TLBI_RVALE1ISNXS:
        case OP_TLBI_RVALE1OSNXS:
                __tlbi(rvale1is, va);
                break;
        case OP_TLBI_RVAAE1:
        case OP_TLBI_RVAAE1IS:
        case OP_TLBI_RVAAE1OS:
        case OP_TLBI_RVAAE1NXS:
        case OP_TLBI_RVAAE1ISNXS:
        case OP_TLBI_RVAAE1OSNXS:
                __tlbi(rvaae1is, va);
                break;
        case OP_TLBI_RVAALE1:
        case OP_TLBI_RVAALE1IS:
        case OP_TLBI_RVAALE1OS:
        case OP_TLBI_RVAALE1NXS:
        case OP_TLBI_RVAALE1ISNXS:
        case OP_TLBI_RVAALE1OSNXS:
                __tlbi(rvaale1is, va);
                break;
        default:
                /* Unhandled encoding: report it, nothing was invalidated */
                ret = -EINVAL;
        }
        /* The barriers run on every path, including the -EINVAL one */
        dsb(ish);
        isb();

        if (mmu)
                exit_vmid_context(&cxt);

        return ret;
}












































































































































































































































































































































































































































































  156 

  156 











































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ASM_ESR_H
#define __ASM_ESR_H

#include <asm/memory.h>
#include <asm/sysreg.h>

/*
 * Exception class (EC) values, i.e. the possible results of ESR_ELx_EC().
 * Gaps in the numbering are called out as unallocated.
 */
#define ESR_ELx_EC_UNKNOWN        UL(0x00)
#define ESR_ELx_EC_WFx                UL(0x01)
/* Unallocated EC: 0x02 */
#define ESR_ELx_EC_CP15_32        UL(0x03)
#define ESR_ELx_EC_CP15_64        UL(0x04)
#define ESR_ELx_EC_CP14_MR        UL(0x05)
#define ESR_ELx_EC_CP14_LS        UL(0x06)
#define ESR_ELx_EC_FP_ASIMD        UL(0x07)
#define ESR_ELx_EC_CP10_ID        UL(0x08)        /* EL2 only */
#define ESR_ELx_EC_PAC                UL(0x09)        /* EL2 and above */
#define ESR_ELx_EC_OTHER        UL(0x0A)
/* Unallocated EC: 0x0B */
#define ESR_ELx_EC_CP14_64        UL(0x0C)
#define ESR_ELx_EC_BTI                UL(0x0D)
#define ESR_ELx_EC_ILL                UL(0x0E)
/* Unallocated EC: 0x0F - 0x10 */
#define ESR_ELx_EC_SVC32        UL(0x11)
#define ESR_ELx_EC_HVC32        UL(0x12)        /* EL2 only */
#define ESR_ELx_EC_SMC32        UL(0x13)        /* EL2 and above */
/* Unallocated EC: 0x14 */
#define ESR_ELx_EC_SVC64        UL(0x15)
#define ESR_ELx_EC_HVC64        UL(0x16)        /* EL2 and above */
#define ESR_ELx_EC_SMC64        UL(0x17)        /* EL2 and above */
#define ESR_ELx_EC_SYS64        UL(0x18)
#define ESR_ELx_EC_SVE                UL(0x19)
#define ESR_ELx_EC_ERET                UL(0x1a)        /* EL2 only */
/* Unallocated EC: 0x1B */
#define ESR_ELx_EC_FPAC                UL(0x1C)        /* EL1 and above */
#define ESR_ELx_EC_SME                UL(0x1D)
/* Unallocated EC: 0x1E */
#define ESR_ELx_EC_IMP_DEF        UL(0x1f)        /* EL3 only */
#define ESR_ELx_EC_IABT_LOW        UL(0x20)
#define ESR_ELx_EC_IABT_CUR        UL(0x21)
#define ESR_ELx_EC_PC_ALIGN        UL(0x22)
/* Unallocated EC: 0x23 */
#define ESR_ELx_EC_DABT_LOW        UL(0x24)
#define ESR_ELx_EC_DABT_CUR        UL(0x25)
#define ESR_ELx_EC_SP_ALIGN        UL(0x26)
#define ESR_ELx_EC_MOPS                UL(0x27)
#define ESR_ELx_EC_FP_EXC32        UL(0x28)
/* Unallocated EC: 0x29 - 0x2B */
#define ESR_ELx_EC_FP_EXC64        UL(0x2C)
#define ESR_ELx_EC_GCS                UL(0x2D)
/* Unallocated EC: 0x2E */
#define ESR_ELx_EC_SERROR        UL(0x2F)
#define ESR_ELx_EC_BREAKPT_LOW        UL(0x30)
#define ESR_ELx_EC_BREAKPT_CUR        UL(0x31)
#define ESR_ELx_EC_SOFTSTP_LOW        UL(0x32)
#define ESR_ELx_EC_SOFTSTP_CUR        UL(0x33)
#define ESR_ELx_EC_WATCHPT_LOW        UL(0x34)
#define ESR_ELx_EC_WATCHPT_CUR        UL(0x35)
/* Unallocated EC: 0x36 - 0x37 */
#define ESR_ELx_EC_BKPT32        UL(0x38)
/* Unallocated EC: 0x39 */
#define ESR_ELx_EC_VECTOR32        UL(0x3A)        /* EL2 only */
/* Unallocated EC: 0x3B */
#define ESR_ELx_EC_BRK64        UL(0x3C)
/* Unallocated EC: 0x3D - 0x3F */
#define ESR_ELx_EC_MAX                UL(0x3F)

/*
 * Top-level layout of a syndrome value:
 *   EC   - exception class, 6 bits at [31:26]; ESR_ELx_EC() returns it
 *          shifted down so it can be compared with the ESR_ELx_EC_* values.
 *   IL   - single bit at position 25.
 *   ISS  - bits [24:0]; ESR_ELx_ISS() masks but does not shift.
 *   ISS2 - bits [55:32]; ESR_ELx_ISS2() returns it shifted down to bit 0.
 */
#define ESR_ELx_EC_SHIFT        (26)
#define ESR_ELx_EC_WIDTH        (6)
#define ESR_ELx_EC_MASK                (UL(0x3F) << ESR_ELx_EC_SHIFT)
#define ESR_ELx_EC(esr)                (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT)

#define ESR_ELx_IL_SHIFT        (25)
#define ESR_ELx_IL                (UL(1) << ESR_ELx_IL_SHIFT)
#define ESR_ELx_ISS_MASK        (GENMASK(24, 0))
#define ESR_ELx_ISS(esr)        ((esr) & ESR_ELx_ISS_MASK)
#define ESR_ELx_ISS2_SHIFT        (32)
#define ESR_ELx_ISS2_MASK        (GENMASK_ULL(55, 32))
#define ESR_ELx_ISS2(esr)        (((esr) & ESR_ELx_ISS2_MASK) >> ESR_ELx_ISS2_SHIFT)

/* ISS field definitions shared by different classes */
#define ESR_ELx_WNR_SHIFT        (6)
#define ESR_ELx_WNR                (UL(1) << ESR_ELx_WNR_SHIFT)

/*
 * Asynchronous Error Type
 *
 * IDS is a single bit at position 24.  AET is a 3-bit field at [12:10];
 * the ESR_ELx_AET_* values below are already shifted into place, so they
 * compare directly against (esr & ESR_ELx_AET).
 */
#define ESR_ELx_IDS_SHIFT        (24)
#define ESR_ELx_IDS                (UL(1) << ESR_ELx_IDS_SHIFT)
#define ESR_ELx_AET_SHIFT        (10)
#define ESR_ELx_AET                (UL(0x7) << ESR_ELx_AET_SHIFT)

#define ESR_ELx_AET_UC                (UL(0) << ESR_ELx_AET_SHIFT)
#define ESR_ELx_AET_UEU                (UL(1) << ESR_ELx_AET_SHIFT)
#define ESR_ELx_AET_UEO                (UL(2) << ESR_ELx_AET_SHIFT)
#define ESR_ELx_AET_UER                (UL(3) << ESR_ELx_AET_SHIFT)
#define ESR_ELx_AET_CE                (UL(6) << ESR_ELx_AET_SHIFT)

/*
 * Shared ISS field definitions for Data/Instruction aborts
 *
 * All are single-bit flags except SET, which is a 2-bit field at [12:11].
 */
#define ESR_ELx_VNCR_SHIFT        (13)
#define ESR_ELx_VNCR                (UL(1) << ESR_ELx_VNCR_SHIFT)
#define ESR_ELx_SET_SHIFT        (11)
#define ESR_ELx_SET_MASK        (UL(3) << ESR_ELx_SET_SHIFT)
#define ESR_ELx_FnV_SHIFT        (10)
#define ESR_ELx_FnV                (UL(1) << ESR_ELx_FnV_SHIFT)
#define ESR_ELx_EA_SHIFT        (9)
#define ESR_ELx_EA                (UL(1) << ESR_ELx_EA_SHIFT)
#define ESR_ELx_S1PTW_SHIFT        (7)
#define ESR_ELx_S1PTW                (UL(1) << ESR_ELx_S1PTW_SHIFT)

/*
 * Shared ISS fault status code(IFSC/DFSC) for Data/Instruction aborts
 *
 * ESR_ELx_FSC masks the whole 6-bit status code.  ESR_ELx_FSC_TYPE and
 * ESR_ELx_FSC_LEVEL split it into its upper four and lower two bits.
 * ESR_ELx_FSC_MTE and ESR_ELx_FSC_SERROR are both defined as 0x11.
 * NOTE(review): presumably they apply to different exception classes, so
 * the shared value is not ambiguous in practice - confirm.
 *
 * ESR_ELx_FSC_SEA_TTW(n) and ESR_ELx_FSC_SECC_TTW(n) add the level n to
 * their base code; the helpers further down this file also pass n == -1.
 */
#define ESR_ELx_FSC                (0x3F)
#define ESR_ELx_FSC_TYPE        (0x3C)
#define ESR_ELx_FSC_LEVEL        (0x03)
#define ESR_ELx_FSC_EXTABT        (0x10)
#define ESR_ELx_FSC_MTE                (0x11)
#define ESR_ELx_FSC_SERROR        (0x11)
#define ESR_ELx_FSC_ACCESS        (0x08)
#define ESR_ELx_FSC_FAULT        (0x04)
#define ESR_ELx_FSC_PERM        (0x0C)
#define ESR_ELx_FSC_SEA_TTW(n)        (0x14 + (n))
#define ESR_ELx_FSC_SECC        (0x18)
#define ESR_ELx_FSC_SECC_TTW(n)        (0x1c + (n))
#define ESR_ELx_FSC_ADDRSZ        (0x00)

/*
 * Annoyingly, the negative levels for Address size faults aren't laid out
 * contiguously (or in the desired order): level -1 is encoded as 0b101001
 * (0x29) and level -2 as 0b101100 (0x2C).  0x25 must not be used for
 * level -1: that encoding is a Granule Protection Fault on a level 1 walk.
 *
 * ESR_ELx_FSC_ADDRSZ_nL(n) maps a negative level to its status code; any
 * n other than -1 is treated as level -2.  ESR_ELx_FSC_ADDRSZ_L(n) accepts
 * any level: negative ones go through ESR_ELx_FSC_ADDRSZ_nL(), the rest
 * are ESR_ELx_FSC_ADDRSZ + n.
 */
#define ESR_ELx_FSC_ADDRSZ_nL(n)        ((n) == -1 ? 0x29 : 0x2C)
#define ESR_ELx_FSC_ADDRSZ_L(n)                ((n) < 0 ? ESR_ELx_FSC_ADDRSZ_nL(n) : \
                                                   (ESR_ELx_FSC_ADDRSZ + (n)))

/*
 * Status codes for individual page table levels
 *
 * Access flag and permission faults are simply base + level.  Translation
 * faults use ESR_ELx_FSC_FAULT as the base for levels >= 0 and
 * ESR_ELx_FSC_FAULT_nL for negative levels; in both cases the (signed)
 * level is then added, so level -1 yields 0x2B and level -2 yields 0x2A.
 */
#define ESR_ELx_FSC_ACCESS_L(n)        (ESR_ELx_FSC_ACCESS + (n))
#define ESR_ELx_FSC_PERM_L(n)        (ESR_ELx_FSC_PERM + (n))

#define ESR_ELx_FSC_FAULT_nL        (0x2C)
#define ESR_ELx_FSC_FAULT_L(n)        (((n) < 0 ? ESR_ELx_FSC_FAULT_nL : \
                                            ESR_ELx_FSC_FAULT) + (n))

/*
 * ISS field definitions for Data Aborts
 *
 * Single-bit flags, except SAS (2 bits at [23:22]) and SRT (5 bits at
 * [20:16]).
 */
#define ESR_ELx_ISV_SHIFT        (24)
#define ESR_ELx_ISV                (UL(1) << ESR_ELx_ISV_SHIFT)
#define ESR_ELx_SAS_SHIFT        (22)
#define ESR_ELx_SAS                (UL(3) << ESR_ELx_SAS_SHIFT)
#define ESR_ELx_SSE_SHIFT        (21)
#define ESR_ELx_SSE                (UL(1) << ESR_ELx_SSE_SHIFT)
#define ESR_ELx_SRT_SHIFT        (16)
#define ESR_ELx_SRT_MASK        (UL(0x1F) << ESR_ELx_SRT_SHIFT)
#define ESR_ELx_SF_SHIFT        (15)
#define ESR_ELx_SF                 (UL(1) << ESR_ELx_SF_SHIFT)
#define ESR_ELx_AR_SHIFT        (14)
#define ESR_ELx_AR                 (UL(1) << ESR_ELx_AR_SHIFT)
#define ESR_ELx_CM_SHIFT        (8)
#define ESR_ELx_CM                 (UL(1) << ESR_ELx_CM_SHIFT)

/*
 * ISS2 field definitions for Data Aborts
 *
 * NOTE(review): these shifts look like they are relative to the ISS2
 * field, i.e. meant to be applied to ESR_ELx_ISS2(esr) rather than to the
 * raw syndrome value - confirm against the users.
 */
#define ESR_ELx_TnD_SHIFT        (10)
#define ESR_ELx_TnD                 (UL(1) << ESR_ELx_TnD_SHIFT)
#define ESR_ELx_TagAccess_SHIFT        (9)
#define ESR_ELx_TagAccess        (UL(1) << ESR_ELx_TagAccess_SHIFT)
#define ESR_ELx_GCS_SHIFT        (8)
#define ESR_ELx_GCS                 (UL(1) << ESR_ELx_GCS_SHIFT)
#define ESR_ELx_Overlay_SHIFT        (6)
#define ESR_ELx_Overlay                (UL(1) << ESR_ELx_Overlay_SHIFT)
#define ESR_ELx_DirtyBit_SHIFT        (5)
#define ESR_ELx_DirtyBit        (UL(1) << ESR_ELx_DirtyBit_SHIFT)
#define ESR_ELx_Xs_SHIFT        (0)
#define ESR_ELx_Xs_MASK                (GENMASK_ULL(4, 0))

/*
 * ISS field definitions for exceptions taken in to Hyp
 *
 * For WFx traps, TI is the 2-bit field at [1:0]: bit 0 distinguishes WFI
 * (0) from WFE (1) and bit 1 (ESR_ELx_WFx_ISS_WFxT) is a separate flag
 * within the same field.
 */
#define ESR_ELx_CV                (UL(1) << 24)
#define ESR_ELx_COND_SHIFT        (20)
#define ESR_ELx_COND_MASK        (UL(0xF) << ESR_ELx_COND_SHIFT)
#define ESR_ELx_WFx_ISS_RN        (UL(0x1F) << 5)
#define ESR_ELx_WFx_ISS_RV        (UL(1) << 2)
#define ESR_ELx_WFx_ISS_TI        (UL(3) << 0)
#define ESR_ELx_WFx_ISS_WFxT        (UL(2) << 0)
#define ESR_ELx_WFx_ISS_WFI        (UL(0) << 0)
#define ESR_ELx_WFx_ISS_WFE        (UL(1) << 0)
/* Low 16 bits of the ISS */
#define ESR_ELx_xVC_IMM_MASK        ((UL(1) << 16) - 1)

/* ISS definitions for LD64B/ST64B/{T,P}SBCSYNC instructions */
#define ESR_ELx_ISS_OTHER_ST64BV        (0)
#define ESR_ELx_ISS_OTHER_ST64BV0        (1)
#define ESR_ELx_ISS_OTHER_LDST64B        (2)
#define ESR_ELx_ISS_OTHER_TSBCSYNC        (3)
#define ESR_ELx_ISS_OTHER_PSBCSYNC        (4)

#define DISR_EL1_IDS                (UL(1) << 24)
/*
 * DISR_EL1 and ESR_ELx share the bottom 13 bits, but the RES0 bits may mean
 * different things in the future...
 */
#define DISR_EL1_ESR_MASK        (ESR_ELx_AET | ESR_ELx_EA | ESR_ELx_FSC)

/*
 * ESR value templates for specific events
 *
 * ESR_ELx_WFx_MASK keeps the exception class plus bit 0 of TI and drops
 * the WFxT bit, so (esr & ESR_ELx_WFx_MASK) == ESR_ELx_WFx_WFI_VAL holds
 * for a WFx-class syndrome with TI bit 0 clear, whatever WFxT is.
 */
#define ESR_ELx_WFx_MASK        (ESR_ELx_EC_MASK |                        \
                                 (ESR_ELx_WFx_ISS_TI & ~ESR_ELx_WFx_ISS_WFxT))
#define ESR_ELx_WFx_WFI_VAL        ((ESR_ELx_EC_WFx << ESR_ELx_EC_SHIFT) |        \
                                 ESR_ELx_WFx_ISS_WFI)

/* BRK instruction trap from AArch64 state: the comment is the low 16 bits */
#define ESR_ELx_BRK64_ISS_COMMENT_MASK        0xffff

/*
 * ISS field definitions for System instruction traps
 *
 * Layout as defined below: DIR at bit 0 (1 = read, 0 = write), CRm at
 * [4:1], Rt at [9:5], CRn at [13:10], Op1 at [16:14], Op2 at [19:17],
 * Op0 at [21:20] and a RES0 field at [24:22].
 */
#define ESR_ELx_SYS64_ISS_RES0_SHIFT        22
#define ESR_ELx_SYS64_ISS_RES0_MASK        (UL(0x7) << ESR_ELx_SYS64_ISS_RES0_SHIFT)
#define ESR_ELx_SYS64_ISS_DIR_MASK        0x1
#define ESR_ELx_SYS64_ISS_DIR_READ        0x1
#define ESR_ELx_SYS64_ISS_DIR_WRITE        0x0

#define ESR_ELx_SYS64_ISS_RT_SHIFT        5
#define ESR_ELx_SYS64_ISS_RT_MASK        (UL(0x1f) << ESR_ELx_SYS64_ISS_RT_SHIFT)
#define ESR_ELx_SYS64_ISS_CRM_SHIFT        1
#define ESR_ELx_SYS64_ISS_CRM_MASK        (UL(0xf) << ESR_ELx_SYS64_ISS_CRM_SHIFT)
#define ESR_ELx_SYS64_ISS_CRN_SHIFT        10
#define ESR_ELx_SYS64_ISS_CRN_MASK        (UL(0xf) << ESR_ELx_SYS64_ISS_CRN_SHIFT)
#define ESR_ELx_SYS64_ISS_OP1_SHIFT        14
#define ESR_ELx_SYS64_ISS_OP1_MASK        (UL(0x7) << ESR_ELx_SYS64_ISS_OP1_SHIFT)
#define ESR_ELx_SYS64_ISS_OP2_SHIFT        17
#define ESR_ELx_SYS64_ISS_OP2_MASK        (UL(0x7) << ESR_ELx_SYS64_ISS_OP2_SHIFT)
#define ESR_ELx_SYS64_ISS_OP0_SHIFT        20
#define ESR_ELx_SYS64_ISS_OP0_MASK        (UL(0x3) << ESR_ELx_SYS64_ISS_OP0_SHIFT)
/* All five register-encoding fields; excludes Rt and the direction bit */
#define ESR_ELx_SYS64_ISS_SYS_MASK        (ESR_ELx_SYS64_ISS_OP0_MASK | \
                                         ESR_ELx_SYS64_ISS_OP1_MASK | \
                                         ESR_ELx_SYS64_ISS_OP2_MASK | \
                                         ESR_ELx_SYS64_ISS_CRN_MASK | \
                                         ESR_ELx_SYS64_ISS_CRM_MASK)
/*
 * Build the ISS bits for a register encoding.  Note the argument order
 * (op0, op1, op2, crn, crm); the arguments are shifted but not masked.
 */
#define ESR_ELx_SYS64_ISS_SYS_VAL(op0, op1, op2, crn, crm) \
                                        (((op0) << ESR_ELx_SYS64_ISS_OP0_SHIFT) | \
                                         ((op1) << ESR_ELx_SYS64_ISS_OP1_SHIFT) | \
                                         ((op2) << ESR_ELx_SYS64_ISS_OP2_SHIFT) | \
                                         ((crn) << ESR_ELx_SYS64_ISS_CRN_SHIFT) | \
                                         ((crm) << ESR_ELx_SYS64_ISS_CRM_SHIFT))

/* Register encoding plus the direction bit */
#define ESR_ELx_SYS64_ISS_SYS_OP_MASK        (ESR_ELx_SYS64_ISS_SYS_MASK | \
                                         ESR_ELx_SYS64_ISS_DIR_MASK)
/* Extract the Rt field, shifted down to bit 0 */
#define ESR_ELx_SYS64_ISS_RT(esr) \
        (((esr) & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT)
/*
 * User space cache operations have the following sysreg encoding
 * in System instructions.
 * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 13, 14 }, WRITE (L=0)
 *
 * The CRm values are listed individually below.  The mask deliberately
 * leaves CRm out, so ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL (built with
 * crm == 0) matches any CRm; the specific operation is then told apart
 * by its CRm value.
 */
#define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC        14
#define ESR_ELx_SYS64_ISS_CRM_DC_CVADP        13
#define ESR_ELx_SYS64_ISS_CRM_DC_CVAP        12
#define ESR_ELx_SYS64_ISS_CRM_DC_CVAU        11
#define ESR_ELx_SYS64_ISS_CRM_DC_CVAC        10
#define ESR_ELx_SYS64_ISS_CRM_IC_IVAU        5

#define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK        (ESR_ELx_SYS64_ISS_OP0_MASK | \
                                                 ESR_ELx_SYS64_ISS_OP1_MASK | \
                                                 ESR_ELx_SYS64_ISS_OP2_MASK | \
                                                 ESR_ELx_SYS64_ISS_CRN_MASK | \
                                                 ESR_ELx_SYS64_ISS_DIR_MASK)
#define ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL \
                                (ESR_ELx_SYS64_ISS_SYS_VAL(1, 3, 1, 7, 0) | \
                                 ESR_ELx_SYS64_ISS_DIR_WRITE)
/*
 * User space MRS operations which are supported for emulation
 * have the following sysreg encoding in System instructions.
 * op0 = 3, op1= 0, crn = 0, {crm = 0, 4-7}, READ (L = 1)
 *
 * The mask leaves out both Op2 and CRm, so the template below matches
 * any value of those two fields.
 */
#define ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK        (ESR_ELx_SYS64_ISS_OP0_MASK | \
                                                 ESR_ELx_SYS64_ISS_OP1_MASK | \
                                                 ESR_ELx_SYS64_ISS_CRN_MASK | \
                                                 ESR_ELx_SYS64_ISS_DIR_MASK)
#define ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL \
                                (ESR_ELx_SYS64_ISS_SYS_VAL(3, 0, 0, 0, 0) | \
                                 ESR_ELx_SYS64_ISS_DIR_READ)

/*
 * Complete ISS templates for individual registers.  ESR_ELx_SYS64_ISS_SYS_CTR
 * is the bare encoding; the others already include the READ direction bit.
 */
#define ESR_ELx_SYS64_ISS_SYS_CTR        ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 1, 0, 0)
#define ESR_ELx_SYS64_ISS_SYS_CTR_READ        (ESR_ELx_SYS64_ISS_SYS_CTR | \
                                         ESR_ELx_SYS64_ISS_DIR_READ)

#define ESR_ELx_SYS64_ISS_SYS_CNTVCT        (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \
                                         ESR_ELx_SYS64_ISS_DIR_READ)

#define ESR_ELx_SYS64_ISS_SYS_CNTVCTSS        (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 6, 14, 0) | \
                                         ESR_ELx_SYS64_ISS_DIR_READ)

#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ        (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \
                                         ESR_ELx_SYS64_ISS_DIR_READ)

/*
 * Turn the register-encoding fields of a trapped system instruction's
 * syndrome @e into a sys_reg() value.  The fields are passed to sys_reg()
 * in the order Op0, Op1, CRn, CRm, Op2.  @e is expanded several times, so
 * it must not have side effects.
 */
#define esr_sys64_to_sysreg(e)                                        \
        sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >>                \
                 ESR_ELx_SYS64_ISS_OP0_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >>                \
                 ESR_ELx_SYS64_ISS_OP1_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >>                \
                 ESR_ELx_SYS64_ISS_CRN_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >>                \
                 ESR_ELx_SYS64_ISS_CRM_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >>                \
                 ESR_ELx_SYS64_ISS_OP2_SHIFT))

/*
 * Same as esr_sys64_to_sysreg(), except that Op0 is not read from @e but
 * fixed to 3.  The remaining fields are extracted with the SYS64 masks.
 * @e is expanded several times, so it must not have side effects.
 */
#define esr_cp15_to_sysreg(e)                                        \
        sys_reg(3,                                                \
                (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >>                \
                 ESR_ELx_SYS64_ISS_OP1_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >>                \
                 ESR_ELx_SYS64_ISS_CRN_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >>                \
                 ESR_ELx_SYS64_ISS_CRM_SHIFT),                        \
                (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >>                \
                 ESR_ELx_SYS64_ISS_OP2_SHIFT))

/*
 * ISS field definitions for ERET/ERETAA/ERETAB trapping
 *
 * Per the helpers at the end of this file: ESR_ELx_ERET_ISS_ERET (bit 1)
 * set means the trapped instruction was an ERETAx, and
 * ESR_ELx_ERET_ISS_ERETA (bit 0) then selects the key (clear: A-Key,
 * set: B-Key).
 */
#define ESR_ELx_ERET_ISS_ERET                0x2
#define ESR_ELx_ERET_ISS_ERETA                0x1

/*
 * ISS field definitions for floating-point exception traps
 * (FP_EXC_32/FP_EXC_64).
 *
 * (The FPEXC_* constants are used instead for common bits.)
 */

#define ESR_ELx_FP_EXC_TFV        (UL(1) << 23)

/*
 * ISS field definitions for CP15 accesses
 *
 * 32-bit accesses: DIR at bit 0 (1 = read, 0 = write), CRm at [4:1],
 * Rt at [9:5], CRn at [13:10], Op1 at [16:14], Op2 at [19:17].
 */
#define ESR_ELx_CP15_32_ISS_DIR_MASK        0x1
#define ESR_ELx_CP15_32_ISS_DIR_READ        0x1
#define ESR_ELx_CP15_32_ISS_DIR_WRITE        0x0

#define ESR_ELx_CP15_32_ISS_RT_SHIFT        5
#define ESR_ELx_CP15_32_ISS_RT_MASK        (UL(0x1f) << ESR_ELx_CP15_32_ISS_RT_SHIFT)
#define ESR_ELx_CP15_32_ISS_CRM_SHIFT        1
#define ESR_ELx_CP15_32_ISS_CRM_MASK        (UL(0xf) << ESR_ELx_CP15_32_ISS_CRM_SHIFT)
#define ESR_ELx_CP15_32_ISS_CRN_SHIFT        10
#define ESR_ELx_CP15_32_ISS_CRN_MASK        (UL(0xf) << ESR_ELx_CP15_32_ISS_CRN_SHIFT)
#define ESR_ELx_CP15_32_ISS_OP1_SHIFT        14
#define ESR_ELx_CP15_32_ISS_OP1_MASK        (UL(0x7) << ESR_ELx_CP15_32_ISS_OP1_SHIFT)
#define ESR_ELx_CP15_32_ISS_OP2_SHIFT        17
#define ESR_ELx_CP15_32_ISS_OP2_MASK        (UL(0x7) << ESR_ELx_CP15_32_ISS_OP2_SHIFT)

/* Unlike ESR_ELx_SYS64_ISS_SYS_MASK, this mask includes the direction bit */
#define ESR_ELx_CP15_32_ISS_SYS_MASK        (ESR_ELx_CP15_32_ISS_OP1_MASK | \
                                         ESR_ELx_CP15_32_ISS_OP2_MASK | \
                                         ESR_ELx_CP15_32_ISS_CRN_MASK | \
                                         ESR_ELx_CP15_32_ISS_CRM_MASK | \
                                         ESR_ELx_CP15_32_ISS_DIR_MASK)
/* Argument order is (op1, op2, crn, crm); arguments are shifted, not masked */
#define ESR_ELx_CP15_32_ISS_SYS_VAL(op1, op2, crn, crm) \
                                        (((op1) << ESR_ELx_CP15_32_ISS_OP1_SHIFT) | \
                                         ((op2) << ESR_ELx_CP15_32_ISS_OP2_SHIFT) | \
                                         ((crn) << ESR_ELx_CP15_32_ISS_CRN_SHIFT) | \
                                         ((crm) << ESR_ELx_CP15_32_ISS_CRM_SHIFT))

/*
 * 64-bit accesses: DIR at bit 0, CRm at [4:1], Rt at [9:5], Rt2 at
 * [14:10] and a 4-bit Op1 at [19:16].
 */
#define ESR_ELx_CP15_64_ISS_DIR_MASK        0x1
#define ESR_ELx_CP15_64_ISS_DIR_READ        0x1
#define ESR_ELx_CP15_64_ISS_DIR_WRITE        0x0

#define ESR_ELx_CP15_64_ISS_RT_SHIFT        5
#define ESR_ELx_CP15_64_ISS_RT_MASK        (UL(0x1f) << ESR_ELx_CP15_64_ISS_RT_SHIFT)

#define ESR_ELx_CP15_64_ISS_RT2_SHIFT        10
#define ESR_ELx_CP15_64_ISS_RT2_MASK        (UL(0x1f) << ESR_ELx_CP15_64_ISS_RT2_SHIFT)

#define ESR_ELx_CP15_64_ISS_OP1_SHIFT        16
#define ESR_ELx_CP15_64_ISS_OP1_MASK        (UL(0xf) << ESR_ELx_CP15_64_ISS_OP1_SHIFT)
#define ESR_ELx_CP15_64_ISS_CRM_SHIFT        1
#define ESR_ELx_CP15_64_ISS_CRM_MASK        (UL(0xf) << ESR_ELx_CP15_64_ISS_CRM_SHIFT)

#define ESR_ELx_CP15_64_ISS_SYS_VAL(op1, crm) \
                                        (((op1) << ESR_ELx_CP15_64_ISS_OP1_SHIFT) | \
                                         ((crm) << ESR_ELx_CP15_64_ISS_CRM_SHIFT))

#define ESR_ELx_CP15_64_ISS_SYS_MASK        (ESR_ELx_CP15_64_ISS_OP1_MASK |        \
                                         ESR_ELx_CP15_64_ISS_CRM_MASK | \
                                         ESR_ELx_CP15_64_ISS_DIR_MASK)

/* Complete ISS templates (encoding plus READ direction) for counter registers */
#define ESR_ELx_CP15_64_ISS_SYS_CNTVCT        (ESR_ELx_CP15_64_ISS_SYS_VAL(1, 14) | \
                                         ESR_ELx_CP15_64_ISS_DIR_READ)

#define ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS (ESR_ELx_CP15_64_ISS_SYS_VAL(9, 14) | \
                                         ESR_ELx_CP15_64_ISS_DIR_READ)

#define ESR_ELx_CP15_32_ISS_SYS_CNTFRQ        (ESR_ELx_CP15_32_ISS_SYS_VAL(0, 0, 14, 0) |\
                                         ESR_ELx_CP15_32_ISS_DIR_READ)

/*
 * ISS values for SME traps
 *
 * These are plain enumerated values, not bit masks.
 */

#define ESR_ELx_SME_ISS_SME_DISABLED        0
#define ESR_ELx_SME_ISS_ILL                1
#define ESR_ELx_SME_ISS_SM_DISABLED        2
#define ESR_ELx_SME_ISS_ZA_DISABLED        3
#define ESR_ELx_SME_ISS_ZT_DISABLED        4

/*
 * ISS field definitions for MOPS exceptions
 *
 * The first four are single-bit flags.  The three accessor macros each
 * return a 5-bit register number, shifted down to bit 0, from bits
 * [14:10], [9:5] and [4:0] respectively.
 */
#define ESR_ELx_MOPS_ISS_MEM_INST        (UL(1) << 24)
#define ESR_ELx_MOPS_ISS_FROM_EPILOGUE        (UL(1) << 18)
#define ESR_ELx_MOPS_ISS_WRONG_OPTION        (UL(1) << 17)
#define ESR_ELx_MOPS_ISS_OPTION_A        (UL(1) << 16)
#define ESR_ELx_MOPS_ISS_DESTREG(esr)        (((esr) & (UL(0x1f) << 10)) >> 10)
#define ESR_ELx_MOPS_ISS_SRCREG(esr)        (((esr) & (UL(0x1f) << 5)) >> 5)
#define ESR_ELx_MOPS_ISS_SIZEREG(esr)        (((esr) & (UL(0x1f) << 0)) >> 0)

/*
 * ISS field definitions for GCS
 *
 * Each *_MASK is defined in place (not shifted down); pair it with the
 * matching *_SHIFT to extract a value.  Rn and Rvalue are defined over the
 * same bits [9:5].  NOTE(review): presumably which of the two applies
 * depends on the exception type - confirm.
 */
#define ESR_ELx_ExType_SHIFT        (20)
#define ESR_ELx_ExType_MASK                GENMASK(23, 20)
#define ESR_ELx_Raddr_SHIFT                (10)
#define ESR_ELx_Raddr_MASK                GENMASK(14, 10)
#define ESR_ELx_Rn_SHIFT                (5)
#define ESR_ELx_Rn_MASK                        GENMASK(9, 5)
#define ESR_ELx_Rvalue_SHIFT                5
#define ESR_ELx_Rvalue_MASK                GENMASK(9, 5)
#define ESR_ELx_IT_SHIFT                (0)
#define ESR_ELx_IT_MASK                        GENMASK(4, 0)

/* Values of the ExType field (after shifting down) */
#define ESR_ELx_ExType_DATA_CHECK        0
#define ESR_ELx_ExType_EXLOCK                1
#define ESR_ELx_ExType_STR                2

/* Values of the IT field */
#define ESR_ELx_IT_RET                        0
#define ESR_ELx_IT_GCSPOPM                1
#define ESR_ELx_IT_RET_KEYA                2
#define ESR_ELx_IT_RET_KEYB                3
#define ESR_ELx_IT_GCSSS1                4
#define ESR_ELx_IT_GCSSS2                5
#define ESR_ELx_IT_GCSPOPCX                6
#define ESR_ELx_IT_GCSPOPX                7

#ifndef __ASSEMBLY__
#include <asm/types.h>

/*
 * Return the comment field of a BRK syndrome: the bits of @esr selected by
 * ESR_ELx_BRK64_ISS_COMMENT_MASK.  The exception class is not checked.
 */
static inline unsigned long esr_brk_comment(unsigned long esr)
{
        const unsigned long comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;

        return comment;
}

/*
 * Return true when the exception class of @esr is one of the two data
 * abort classes, ESR_ELx_EC_DABT_LOW or ESR_ELx_EC_DABT_CUR.
 */
static inline bool esr_is_data_abort(unsigned long esr)
{
        switch (ESR_ELx_EC(esr)) {
        case ESR_ELx_EC_DABT_LOW:
        case ESR_ELx_EC_DABT_CUR:
                return true;
        default:
                return false;
        }
}

/*
 * Return true when @esr describes a BRK64 exception whose comment field,
 * with the CFI_BRK_IMM_MASK bits cleared, equals CFI_BRK_IMM_BASE.
 */
static inline bool esr_is_cfi_brk(unsigned long esr)
{
        /* Anything other than a BRK64 trap cannot be a CFI break. */
        if (ESR_ELx_EC(esr) != ESR_ELx_EC_BRK64)
                return false;

        return (esr_brk_comment(esr) & ~CFI_BRK_IMM_MASK) == CFI_BRK_IMM_BASE;
}

/*
 * Return true when the BRK comment field of @esr, with the UBSAN_BRK_MASK
 * bits cleared, equals UBSAN_BRK_IMM.  Only the comment field is looked
 * at; unlike esr_is_cfi_brk() the exception class is not checked here.
 */
static inline bool esr_is_ubsan_brk(unsigned long esr)
{
        const unsigned long imm = esr_brk_comment(esr) & ~UBSAN_BRK_MASK;

        return imm == UBSAN_BRK_IMM;
}

/*
 * Return true when the fault status code of @esr is a translation fault
 * at any of the levels -1 to 3.  Only the ESR_ELx_FSC bits are examined.
 */
static inline bool esr_fsc_is_translation_fault(unsigned long esr)
{
        switch (esr & ESR_ELx_FSC) {
        case ESR_ELx_FSC_FAULT_L(3):
        case ESR_ELx_FSC_FAULT_L(2):
        case ESR_ELx_FSC_FAULT_L(1):
        case ESR_ELx_FSC_FAULT_L(0):
        case ESR_ELx_FSC_FAULT_L(-1):
                return true;
        default:
                return false;
        }
}

/*
 * Return true when the fault status code of @esr is a permission fault
 * at any of the levels 0 to 3.  Only the ESR_ELx_FSC bits are examined.
 */
static inline bool esr_fsc_is_permission_fault(unsigned long esr)
{
        const unsigned long fsc = esr & ESR_ELx_FSC;
        int level;

        for (level = 3; level >= 0; level--) {
                if (fsc == ESR_ELx_FSC_PERM_L(level))
                        return true;
        }

        return false;
}

/*
 * Return true when the fault status code of @esr is an access flag fault
 * at any of the levels 0 to 3.  Only the ESR_ELx_FSC bits are examined.
 */
static inline bool esr_fsc_is_access_flag_fault(unsigned long esr)
{
        const unsigned long fsc = esr & ESR_ELx_FSC;
        int level;

        for (level = 3; level >= 0; level--) {
                if (fsc == ESR_ELx_FSC_ACCESS_L(level))
                        return true;
        }

        return false;
}

/*
 * Return true when the fault status code of @esr is an address size fault
 * at any of the levels -1 to 3, as encoded by ESR_ELx_FSC_ADDRSZ_L().
 * Only the ESR_ELx_FSC bits are examined.
 */
static inline bool esr_fsc_is_addr_sz_fault(unsigned long esr)
{
        switch (esr & ESR_ELx_FSC) {
        case ESR_ELx_FSC_ADDRSZ_L(3):
        case ESR_ELx_FSC_ADDRSZ_L(2):
        case ESR_ELx_FSC_ADDRSZ_L(1):
        case ESR_ELx_FSC_ADDRSZ_L(0):
        case ESR_ELx_FSC_ADDRSZ_L(-1):
                return true;
        default:
                return false;
        }
}

/*
 * Return true when the fault status code of @esr equals
 * ESR_ELx_FSC_SEA_TTW(n) for some level n in -1..3.  Only the ESR_ELx_FSC
 * bits are examined.
 */
static inline bool esr_fsc_is_sea_ttw(unsigned long esr)
{
        switch (esr & ESR_ELx_FSC) {
        case ESR_ELx_FSC_SEA_TTW(3):
        case ESR_ELx_FSC_SEA_TTW(2):
        case ESR_ELx_FSC_SEA_TTW(1):
        case ESR_ELx_FSC_SEA_TTW(0):
        case ESR_ELx_FSC_SEA_TTW(-1):
                return true;
        default:
                return false;
        }
}

/*
 * Return true when the fault status code of @esr equals
 * ESR_ELx_FSC_SECC_TTW(n) for some level n in -1..3.  Only the
 * ESR_ELx_FSC bits are examined.
 */
static inline bool esr_fsc_is_secc_ttw(unsigned long esr)
{
        switch (esr & ESR_ELx_FSC) {
        case ESR_ELx_FSC_SECC_TTW(3):
        case ESR_ELx_FSC_SECC_TTW(2):
        case ESR_ELx_FSC_SECC_TTW(1):
        case ESR_ELx_FSC_SECC_TTW(0):
        case ESR_ELx_FSC_SECC_TTW(-1):
                return true;
        default:
                return false;
        }
}

/*
 * For a syndrome with ESR.EC==0x1A, tell whether the trapped instruction
 * was an ERETAx: true when the ESR_ELx_ERET_ISS_ERET bit is set in @esr.
 * The exception class itself is not checked here.
 */
static inline bool esr_iss_is_eretax(unsigned long esr)
{
        return (esr & ESR_ELx_ERET_ISS_ERET) != 0;
}

/*
 * Tell which key an ERETAx used: true (B-Key) when the
 * ESR_ELx_ERET_ISS_ERETA bit is set in @esr, false (A-Key) otherwise.
 */
static inline bool esr_iss_is_eretab(unsigned long esr)
{
        return (esr & ESR_ELx_ERET_ISS_ERETA) != 0;
}

/*
 * Defined outside this header.  NOTE(review): presumably returns a
 * printable name for the exception class of @esr - confirm at the
 * definition.
 */
const char *esr_get_class_string(unsigned long esr);
#endif /* __ASSEMBLY__ */

#endif /* __ASM_ESR_H */

















































































































































































































































   95 


   95 





   95 















   95 































































   95 


   95 



   95 













   95 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Percpu refcounts:
 * (C) 2012 Google, Inc.
 * Author: Kent Overstreet <koverstreet@google.com>
 *
 * This implements a refcount with similar semantics to atomic_t - atomic_inc(),
 * atomic_dec_and_test() - but percpu.
 *
 * There's one important difference between percpu refs and normal atomic_t
 * refcounts; you have to keep track of your initial refcount, and then when you
 * start shutting down you call percpu_ref_kill() _before_ dropping the initial
 * refcount.
 *
 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
 * than an atomic_t - this is because of the way shutdown works, see
 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
 *
 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
 * puts the ref back in single atomic_t mode, collecting the per cpu refs and
 * issuing the appropriate barriers, and then marks the ref as shutting down so
 * that percpu_ref_put() will check for the ref hitting 0.  After it returns,
 * it's safe to drop the initial ref.
 *
 * USAGE:
 *
 * See fs/aio.c for some example usage; it's used there for struct kioctx, which
 * is created when userspace calls io_setup(), and destroyed when userspace
 * calls io_destroy() or the process exits.
 *
 * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
 * removes the kioctx from the process's table of kioctxs and kills percpu_ref.
 * After that, there can't be any new users of the kioctx (from lookup_ioctx())
 * and it's then safe to drop the initial ref with percpu_ref_put().
 *
 * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
 * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
 * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
 * with RCU protection, it must be done explicitly.
 *
 * Code that does a two stage shutdown like this often needs some kind of
 * explicit synchronization to ensure the initial refcount can only be dropped
 * once.  percpu_ref_kill() returns void and must be called precisely once
 * before shutdown, so callers need their own mechanism to synchronize
 * teardown.
 */

#ifndef _LINUX_PERCPU_REFCOUNT_H
#define _LINUX_PERCPU_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/types.h>
#include <linux/gfp.h>

struct percpu_ref;
/*
 * Callback type that receives the percpu_ref it was registered for.  Used
 * in this header for the release callback and for the confirm_switch /
 * confirm_kill callbacks.
 */
typedef void (percpu_ref_func_t)(struct percpu_ref *);

/*
 * Flags set in the lower bits of percpu_ref->percpu_count_ptr.
 * __PERCPU_REF_FLAG_BITS is the number of low bits taken up by them.
 */
enum {
        __PERCPU_REF_ATOMIC        = 1LU << 0,        /* operating in atomic mode */
        __PERCPU_REF_DEAD        = 1LU << 1,        /* (being) killed */
        /* both flags at once; __ref_is_percpu() tests them together */
        __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,

        __PERCPU_REF_FLAG_BITS        = 2,
};

/*
 * @flags for percpu_ref_init().  Each flag occupies its own bit of the
 * unsigned int @flags argument.
 */
enum {
        /*
         * Start w/ ref == 1 in atomic mode.  Can be switched to percpu
         * operation using percpu_ref_switch_to_percpu().  If initialized
         * with this flag, the ref will stay in atomic mode until
         * percpu_ref_switch_to_percpu() is invoked on it.
         * Implies ALLOW_REINIT.
         */
        PERCPU_REF_INIT_ATOMIC        = 1 << 0,

        /*
         * Start dead w/ ref == 0 in atomic mode.  Must be revived with
         * percpu_ref_reinit() before used.  Implies INIT_ATOMIC and
         * ALLOW_REINIT.
         */
        PERCPU_REF_INIT_DEAD        = 1 << 1,

        /*
         * Allow switching from atomic mode to percpu mode.
         */
        PERCPU_REF_ALLOW_REINIT        = 1 << 2,
};

/*
 * State of a percpu_ref that is not needed on the percpu fast path; it is
 * reached through percpu_ref->data.
 */
struct percpu_ref_data {
        /* counter that get/put operate on when the ref is not in percpu mode */
        atomic_long_t                count;
        /* callbacks of type percpu_ref_func_t, see percpu_ref_init() and
         * percpu_ref_switch_to_atomic() for where they are supplied */
        percpu_ref_func_t        *release;
        percpu_ref_func_t        *confirm_switch;
        /* NOTE(review): presumably mirror PERCPU_REF_INIT_ATOMIC and
         * PERCPU_REF_ALLOW_REINIT from percpu_ref_init() - confirm */
        bool                        force_atomic:1;
        bool                        allow_reinit:1;
        struct rcu_head                rcu;
        /* the percpu_ref this data belongs to */
        struct percpu_ref        *ref;
};

struct percpu_ref {
        /*
         * Pointer to the percpu counter, with the __PERCPU_REF_ATOMIC and
         * __PERCPU_REF_DEAD flags stored in its low bits.  While both
         * flags are clear the ref is in percpu mode and get/put operate on
         * the percpu counter; if either is set, get/put manipulate
         * data->count (an atomic_long_t) instead.
         */
        unsigned long                percpu_count_ptr;

        /*
         * 'percpu_ref' is often embedded into user structure, and only
         * 'percpu_count_ptr' is required in fast path, move other fields
         * into 'percpu_ref_data', so we can reduce memory footprint in
         * fast path.
         */
        struct percpu_ref_data  *data;
};

int __must_check percpu_ref_init(struct percpu_ref *ref,
                                 percpu_ref_func_t *release, unsigned int flags,
                                 gfp_t gfp);
void percpu_ref_exit(struct percpu_ref *ref);
void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_switch);
void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref);
void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
                                 percpu_ref_func_t *confirm_kill);
void percpu_ref_resurrect(struct percpu_ref *ref);
void percpu_ref_reinit(struct percpu_ref *ref);
bool percpu_ref_is_zero(struct percpu_ref *ref);

/**
 * percpu_ref_kill - drop the initial ref
 * @ref: percpu_ref to kill
 *
 * Must be used to drop the initial ref on a percpu refcount; must be called
 * precisely once before shutdown.
 *
 * Switches @ref into atomic mode before gathering up the percpu counters
 * and dropping the initial ref.
 *
 * There are no implied RCU grace periods between kill and release.
 */
static inline void percpu_ref_kill(struct percpu_ref *ref)
{
        percpu_ref_kill_and_confirm(ref, NULL);
}

/*
 * Internal helper.  Don't use outside percpu-refcount proper.
 *
 * Returns true and stores the percpu counter pointer in *@percpu_countp
 * when @ref is in percpu mode, i.e. neither __PERCPU_REF_ATOMIC nor
 * __PERCPU_REF_DEAD is set.  Returns false otherwise, in which case
 * *@percpu_countp is left untouched.
 *
 * The function doesn't return the pointer and let the caller test it for
 * NULL because doing so forces the compiler to generate two conditional
 * branches as it can't assume that @ref->percpu_count is not NULL.
 */
static inline bool __ref_is_percpu(struct percpu_ref *ref,
                                          unsigned long __percpu **percpu_countp)
{
        unsigned long percpu_ptr;

        /*
         * The value of @ref->percpu_count_ptr is tested for
         * !__PERCPU_REF_ATOMIC, which may be set asynchronously, and then
         * used as a pointer.  If the compiler generates a separate fetch
         * when using it as a pointer, __PERCPU_REF_ATOMIC may be set in
         * between contaminating the pointer value, meaning that
         * READ_ONCE() is required when fetching it.
         *
         * The dependency ordering from the READ_ONCE() pairs
         * with smp_store_release() in __percpu_ref_switch_to_percpu().
         */
        percpu_ptr = READ_ONCE(ref->percpu_count_ptr);

        /*
         * Theoretically, the following could test just ATOMIC; however,
         * then we'd have to mask off DEAD separately as DEAD may be
         * visible without ATOMIC if we race with percpu_ref_kill().  DEAD
         * implies ATOMIC anyway.  Test them together.
         */
        if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC_DEAD))
                return false;

        /* No flag bits are set here, so the value is the bare pointer. */
        *percpu_countp = (unsigned long __percpu *)percpu_ptr;
        return true;
}

/**
 * percpu_ref_get_many - increment a percpu refcount
 * @ref: percpu_ref to get
 * @nr: number of references to get
 *
 * Analogous to atomic_long_add().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_add(*percpu_count, nr);
        else
                atomic_long_add(nr, &ref->data->count);

        rcu_read_unlock();
}

/**
 * percpu_ref_get - increment a percpu refcount
 * @ref: percpu_ref to get
 *
 * Analogous to atomic_long_inc(); takes a single reference by way of
 * percpu_ref_get_many().
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_get(struct percpu_ref *ref)
{
        const unsigned long one_ref = 1;

        percpu_ref_get_many(ref, one_ref);
}

/**
 * percpu_ref_tryget_many - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 * @nr: number of references to get
 *
 * Increment a percpu refcount  by @nr unless its count already reached zero.
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_many(struct percpu_ref *ref,
                                          unsigned long nr)
{
        unsigned long __percpu *percpu_count;
        bool ret;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count)) {
                this_cpu_add(*percpu_count, nr);
                ret = true;
        } else {
                ret = atomic_long_add_unless(&ref->data->count, nr, 0);
        }

        rcu_read_unlock();

        return ret;
}

/**
 * percpu_ref_tryget - try to increment a percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless its count already reached zero.
 * Implemented as percpu_ref_tryget_many() with a count of 1.
 *
 * Returns %true on success; %false on failure.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget(struct percpu_ref *ref)
{
        return percpu_ref_tryget_many(ref, 1);
}

/**
 * percpu_ref_tryget_live_rcu - same as percpu_ref_tryget_live() but the
 * caller is responsible for taking RCU.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live_rcu(struct percpu_ref *ref)
{
        unsigned long __percpu *percpu_count;
        bool ret = false;

        WARN_ON_ONCE(!rcu_read_lock_held());

        if (likely(__ref_is_percpu(ref, &percpu_count))) {
                this_cpu_inc(*percpu_count);
                ret = true;
        } else if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) {
                ret = atomic_long_inc_not_zero(&ref->data->count);
        }
        return ret;
}

/**
 * percpu_ref_tryget_live - try to increment a live percpu refcount
 * @ref: percpu_ref to try-get
 *
 * Increment a percpu refcount unless it has already been killed.  Returns
 * %true on success; %false on failure.
 *
 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
 * function will fail.  For such guarantee, percpu_ref_kill_and_confirm()
 * should be used.  After the confirm_kill callback is invoked, it's
 * guaranteed that no new reference will be given out by
 * percpu_ref_tryget_live().
 *
 * This is percpu_ref_tryget_live_rcu() wrapped in an RCU read-side
 * critical section.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
{
        bool live;

        rcu_read_lock();
        live = percpu_ref_tryget_live_rcu(ref);
        rcu_read_unlock();

        return live;
}

/**
 * percpu_ref_put_many - decrement a percpu refcount
 * @ref: percpu_ref to put
 * @nr: number of references to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init())
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put_many(struct percpu_ref *ref, unsigned long nr)
{
        unsigned long __percpu *percpu_count;

        rcu_read_lock();

        if (__ref_is_percpu(ref, &percpu_count))
                this_cpu_sub(*percpu_count, nr);
        else if (unlikely(atomic_long_sub_and_test(nr, &ref->data->count)))
                ref->data->release(ref);

        rcu_read_unlock();
}

/**
 * percpu_ref_put - decrement a percpu refcount
 * @ref: percpu_ref to put
 *
 * Decrement the refcount, and if 0, call the release function (which was passed
 * to percpu_ref_init()).  Implemented as percpu_ref_put_many() with a count
 * of 1.
 *
 * This function is safe to call as long as @ref is between init and exit.
 */
static inline void percpu_ref_put(struct percpu_ref *ref)
{
        percpu_ref_put_many(ref, 1);
}

/**
 * percpu_ref_is_dying - test whether a percpu refcount is dying or dead
 * @ref: percpu_ref to test
 *
 * Returns %true if @ref is dying or dead, i.e. if the DEAD flag is set in
 * @ref->percpu_count_ptr.
 *
 * This function is safe to call as long as @ref is between init and exit
 * and the caller is responsible for synchronizing against state changes.
 */
static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
{
        unsigned long flags = ref->percpu_count_ptr & __PERCPU_REF_DEAD;

        return flags != 0;
}

#endif















































































































  162 


  162 




































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/uaccess.h>

#include <kvm/arm_vgic.h>

#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "vgic.h"

/*
 * Indexed accessors for banked ICH_*_EL2 registers: each macro yields the
 * identifier of register @n of its bank, computed as an offset from the
 * bank's register 0.
 */
#define ICH_LRN(n)        (ICH_LR0_EL2 + (n))
#define ICH_AP0RN(n)        (ICH_AP0R0_EL2 + (n))
#define ICH_AP1RN(n)        (ICH_AP1R0_EL2 + (n))

/*
 * Maintenance-interrupt related status derived from the guest's in-memory
 * list registers by vgic_compute_mi_state().
 */
struct mi_state {
        /* bit n set: LRn satisfies lr_triggers_eoi() */
        u16        eisr;
        /* bit n set: LRn has no state bits (ICH_LR_STATE) set */
        u16        elrsr;
        /* true if at least one LR has ICH_LR_PENDING_BIT set */
        bool        pend;
};

/*
 * The shadow registers loaded to the hardware when running a L2 guest
 * with the virtual IMO/FMO bits set.
 *
 * @cpuif:  shadow copy of the CPU interface state built by
 *          vgic_v3_create_shadow_state().
 * @lr_map: bitmap of the guest LR indices that were given a shadow LR by
 *          vgic_v3_create_shadow_lr(); shadow LRs are stored packed, in
 *          ascending order of the bits set here.
 */
struct shadow_if {
        struct vgic_v3_cpu_if        cpuif;
        unsigned long                lr_map;
};

/* One shadow interface per physical CPU, accessed via get_shadow_if(). */
static DEFINE_PER_CPU(struct shadow_if, shadow_if);

/*
 * Nesting GICv3 support
 *
 * On a non-nesting VM (only running at EL0/EL1), the host hypervisor
 * completely controls the interrupts injected via the list registers.
 * Consequently, most of the state that is modified by the guest (by ACK-ing
 * and EOI-ing interrupts) is synced by KVM on each entry/exit, so that we
 * keep a semi-consistent view of the interrupts.
 *
 * This still applies for a NV guest, but only while "InHost" (either
 * running at EL2, or at EL0 with HCR_EL2.{E2H,TGE}=={1,1}).
 *
 * When running a L2 guest ("not InHost"), things are radically different,
 * as the L1 guest is in charge of provisioning the interrupts via its own
 * view of the ICH_LR*_EL2 registers, which conveniently live in the VNCR
 * page.  This means that the flow described above does work (there is no
 * state to rebuild in the L0 hypervisor), and that most things happen on L2
 * load/put:
 *
 * - on L2 load: move the in-memory L1 vGIC configuration into a shadow,
 *   per-CPU data structure that is used to populate the actual LRs. This is
 *   an extra copy that we could avoid, but life is short. In the process,
 *   we remap any interrupt that has the HW bit set to the mapped interrupt
 *   on the host, should the host consider it a HW one. This allows the HW
 *   deactivation to take its course, such as for the timer.
 *
 * - on L2 put: perform the inverse transformation, so that the result of L2
 *   running becomes visible to L1 in the VNCR-accessible registers.
 *
 * - there is nothing to do on L2 entry, as everything will have happened
 *   on load. However, this is the point where we detect an interrupt
 *   targeting L1 and prepare the grand switcheroo.
 *
 * - on L2 exit: emulate the HW bit, and deactivate the corresponding L1
 *   interrupt. The L0 active state will be cleared by the HW if the L1
 *   interrupt was itself backed by a HW interrupt.
 *
 * Maintenance Interrupt (MI) management:
 *
 * Since the L2 guest runs the vgic in its full glory, MIs get delivered and
 * used as a handover point between L2 and L1.
 *
 * - on delivery of a MI to L0 while L2 is running: make the L1 MI pending,
 *   and let it rip. This will initiate a vcpu_put() on L2, and allow L1 to
 *   run and process the MI.
 *
 * - L1 MI is a fully virtual interrupt, not linked to the host's MI. Its
 *   state must be computed at each entry/exit of the guest, much like we do
 *   it for the PMU interrupt.
 *
 * - because most of the ICH_*_EL2 registers live in the VNCR page, the
 *   quality of emulation is poor: L1 can setup the vgic so that an MI would
 *   immediately fire, and not observe anything until the next exit. Trying
 *   to read ICH_MISR_EL2 would do the trick, for example.
 *
 * System register emulation:
 *
 * We get two classes of registers:
 *
 * - those backed by memory (LRs, APRs, HCR, VMCR): L1 can freely access
 *   them, and L0 doesn't see a thing.
 *
 * - those that always trap (ELRSR, EISR, MISR): these are status registers
 *   that are built on the fly based on the in-memory state.
 *
 * Only L1 can access the ICH_*_EL2 registers. A non-NV L2 obviously cannot,
 * and a NV L2 would either access the VNCR page provided by L1 (memory
 * based registers), or see the access redirected to L1 (registers that
 * trap) thanks to NV being set by L1.
 */

/*
 * Tell whether @vcpu must currently run on the nested (shadow) vgic state:
 * the vcpu has NV, is not in hyp context, and its HCR_EL2 has IMO/FMO set.
 * Warns once if only one of IMO/FMO is set, as that split is not supported.
 */
bool vgic_state_is_nested(struct kvm_vcpu *vcpu)
{
        u64 xmo;

        if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
                return false;

        xmo = __vcpu_sys_reg(vcpu, HCR_EL2) & (HCR_IMO | HCR_FMO);
        WARN_ONCE(xmo && xmo != (HCR_IMO | HCR_FMO),
                  "Separate virtual IRQ/FIQ settings not supported\n");

        return xmo != 0;
}

/* Return the shadow interface belonging to the CPU we are running on. */
static struct shadow_if *get_shadow_if(void)
{
        return this_cpu_ptr(&shadow_if);
}

/*
 * An LR contributes to EISR when it carries neither state bits nor the HW
 * bit, and has its EOI bit set.
 */
static bool lr_triggers_eoi(u64 lr)
{
        if (lr & (ICH_LR_STATE | ICH_LR_HW))
                return false;

        return !!(lr & ICH_LR_EOI);
}

static void vgic_compute_mi_state(struct kvm_vcpu *vcpu, struct mi_state *mi_state)
{
        u16 eisr = 0, elrsr = 0;
        bool pend = false;

        for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
                u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));

                if (lr_triggers_eoi(lr))
                        eisr |= BIT(i);
                if (!(lr & ICH_LR_STATE))
                        elrsr |= BIT(i);
                pend |= (lr & ICH_LR_PENDING_BIT);
        }

        mi_state->eisr        = eisr;
        mi_state->elrsr        = elrsr;
        mi_state->pend        = pend;
}

u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu)
{
        struct mi_state mi_state;

        vgic_compute_mi_state(vcpu, &mi_state);
        return mi_state.eisr;
}

u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu)
{
        struct mi_state mi_state;

        vgic_compute_mi_state(vcpu, &mi_state);
        return mi_state.elrsr;
}

u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu)
{
        struct mi_state mi_state;
        u64 reg = 0, hcr, vmcr;

        hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
        vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);

        vgic_compute_mi_state(vcpu, &mi_state);

        if (mi_state.eisr)
                reg |= ICH_MISR_EL2_EOI;

        if (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_UIE) {
                int used_lrs = kvm_vgic_global_state.nr_lr;

                used_lrs -= hweight16(mi_state.elrsr);
                reg |= (used_lrs <= 1) ? ICH_MISR_EL2_U : 0;
        }

        if ((hcr & ICH_HCR_EL2_LRENPIE) && FIELD_GET(ICH_HCR_EL2_EOIcount_MASK, hcr))
                reg |= ICH_MISR_EL2_LRENP;

        if ((hcr & ICH_HCR_EL2_NPIE) && !mi_state.pend)
                reg |= ICH_MISR_EL2_NP;

        if ((hcr & ICH_HCR_EL2_VGrp0EIE) && (vmcr & ICH_VMCR_ENG0_MASK))
                reg |= ICH_MISR_EL2_VGrp0E;

        if ((hcr & ICH_HCR_EL2_VGrp0DIE) && !(vmcr & ICH_VMCR_ENG0_MASK))
                reg |= ICH_MISR_EL2_VGrp0D;

        if ((hcr & ICH_HCR_EL2_VGrp1EIE) && (vmcr & ICH_VMCR_ENG1_MASK))
                reg |= ICH_MISR_EL2_VGrp1E;

        if ((hcr & ICH_HCR_EL2_VGrp1DIE) && !(vmcr & ICH_VMCR_ENG1_MASK))
                reg |= ICH_MISR_EL2_VGrp1D;

        return reg;
}

/*
 * For LRs which have HW bit set such as timer interrupts, we modify them to
 * have the host hardware interrupt number instead of the virtual one programmed
 * by the guest hypervisor.
 */
static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
                                     struct vgic_v3_cpu_if *s_cpu_if)
{
        unsigned long lr_map = 0;
        int index = 0;

        for (int i = 0; i < kvm_vgic_global_state.nr_lr; i++) {
                u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
                struct vgic_irq *irq;

                if (!(lr & ICH_LR_STATE))
                        lr = 0;

                if (!(lr & ICH_LR_HW))
                        goto next;

                /* We have the HW bit set, check for validity of pINTID */
                irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
                if (!irq || !irq->hw || irq->intid > VGIC_MAX_SPI ) {
                        /* There was no real mapping, so nuke the HW bit */
                        lr &= ~ICH_LR_HW;
                        if (irq)
                                vgic_put_irq(vcpu->kvm, irq);
                        goto next;
                }

                /* Translate the virtual mapping to the real one */
                lr &= ~ICH_LR_PHYS_ID_MASK;
                lr |= FIELD_PREP(ICH_LR_PHYS_ID_MASK, (u64)irq->hwintid);

                vgic_put_irq(vcpu->kvm, irq);

next:
                s_cpu_if->vgic_lr[index] = lr;
                if (lr) {
                        lr_map |= BIT(i);
                        index++;
                }
        }

        container_of(s_cpu_if, struct shadow_if, cpuif)->lr_map = lr_map;
        s_cpu_if->used_lrs = index;
}

/*
 * For every guest LR that was given a shadow LR and has both the HW bit and
 * some state set, read back the matching hardware LR and, if it no longer
 * holds any state, mark the corresponding vgic_irq as inactive.
 */
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
        struct shadow_if *shadow_if = get_shadow_if();
        int i, next_slot = 0;

        for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
                u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
                /* Shadow LRs are packed: one slot per bit set in lr_map. */
                int slot = next_slot++;
                struct vgic_irq *irq;

                if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
                        continue;

                /*
                 * If we had a HW lr programmed by the guest hypervisor, we
                 * need to emulate the HW effect between the guest hypervisor
                 * and the nested guest.
                 */
                irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
                if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
                        continue;

                lr = __gic_v3_get_lr(slot);
                if (!(lr & ICH_LR_STATE))
                        irq->active = false;

                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Populate @s_cpu_if from the guest's in-memory ICH_*_EL2 registers: HCR,
 * VMCR, the AP0R/AP1R banks and finally the list registers.  SRE is taken
 * from the vcpu's own vgic_v3 state.
 */
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
                                        struct vgic_v3_cpu_if *s_cpu_if)
{
        struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
        u64 extra_traps = 0;

        /*
         * If we're on a system with a broken vgic that requires
         * trapping, propagate the trapping requirements.
         *
         * Ah, the smell of rotten fruits...
         */
        if (static_branch_unlikely(&vgic_v3_cpuif_trap)) {
                extra_traps = host_if->vgic_hcr &
                              (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
                               ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
        }

        s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | extra_traps;
        s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
        s_cpu_if->vgic_sre = host_if->vgic_sre;

        for (int n = 0; n < 4; n++) {
                s_cpu_if->vgic_ap0r[n] = __vcpu_sys_reg(vcpu, ICH_AP0RN(n));
                s_cpu_if->vgic_ap1r[n] = __vcpu_sys_reg(vcpu, ICH_AP1RN(n));
        }

        vgic_v3_create_shadow_lr(vcpu, s_cpu_if);
}

void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
{
        struct shadow_if *shadow_if = get_shadow_if();
        struct vgic_v3_cpu_if *cpu_if = &shadow_if->cpuif;

        BUG_ON(!vgic_state_is_nested(vcpu));

        vgic_v3_create_shadow_state(vcpu, cpu_if);

        __vgic_v3_restore_vmcr_aprs(cpu_if);
        __vgic_v3_activate_traps(cpu_if);

        __vgic_v3_restore_state(cpu_if);

        /*
         * Propagate the number of used LRs for the benefit of the HYP
         * GICv3 emulation code. Yes, this is a pretty sorry hack.
         */
        vcpu->arch.vgic_cpu.vgic_v3.used_lrs = cpu_if->used_lrs;
}

/*
 * Save this CPU's shadow interface and fold the result of running L2 back
 * into @vcpu's in-memory ICH_*_EL2 registers: the EOIcount field of HCR,
 * VMCR, the AP0R/AP1R banks and the state bits of every LR that had been
 * given a shadow LR.  The shadow LRs and lr_map are cleared afterwards.
 */
void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
        struct shadow_if *shadow_if = get_shadow_if();
        struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
        int i, shadow_idx = 0;
        u64 val;

        __vgic_v3_save_vmcr_aprs(s_cpu_if);
        __vgic_v3_deactivate_traps(s_cpu_if);
        __vgic_v3_save_state(s_cpu_if);

        /*
         * Translate the shadow state HW fields back to the virtual ones
         * before copying the shadow struct back to the nested one.
         */
        val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
        val &= ~ICH_HCR_EL2_EOIcount_MASK;
        val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
        __vcpu_sys_reg(vcpu, ICH_HCR_EL2) = val;
        __vcpu_sys_reg(vcpu, ICH_VMCR_EL2) = s_cpu_if->vgic_vmcr;

        for (i = 0; i < 4; i++) {
                __vcpu_sys_reg(vcpu, ICH_AP0RN(i)) = s_cpu_if->vgic_ap0r[i];
                __vcpu_sys_reg(vcpu, ICH_AP1RN(i)) = s_cpu_if->vgic_ap1r[i];
        }

        /*
         * The shadow LRs are packed: vgic_v3_create_shadow_lr() stores the
         * k-th guest LR that has a bit set in lr_map into shadow slot k.
         * The guest LR index (i) and the shadow slot (shadow_idx) therefore
         * differ as soon as the guest leaves a gap in its LRs, and the
         * shadow array must be indexed with the latter.
         */
        for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
                val = __vcpu_sys_reg(vcpu, ICH_LRN(i));

                val &= ~ICH_LR_STATE;
                val |= s_cpu_if->vgic_lr[shadow_idx] & ICH_LR_STATE;

                __vcpu_sys_reg(vcpu, ICH_LRN(i)) = val;
                s_cpu_if->vgic_lr[shadow_idx] = 0;
                shadow_idx++;
        }

        shadow_if->lr_map = 0;
        vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}

/*
 * If we exit a L2 VM with a pending maintenance interrupt from the GIC,
 * then we need to forward this to L1 so that it can re-sync the appropriate
 * LRs and sample level triggered interrupts again.
 *
 * The line level of the L1 MI is set to whether the hardware ICH_MISR_EL2
 * is non-zero; the En bit of the hardware ICH_HCR_EL2 is then cleared.
 */
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu)
{
        bool mi_asserted = !!read_sysreg_s(SYS_ICH_MISR_EL2);

        /* This will force a switch back to L1 if the level is high */
        kvm_vgic_inject_irq(vcpu->kvm, vcpu,
                            vcpu->kvm->arch.vgic.mi_intid, mi_asserted, vcpu);

        sysreg_clear_set_s(SYS_ICH_HCR_EL2, ICH_HCR_EL2_En, 0);
}

/*
 * Recompute the level of L1's virtual maintenance interrupt and inject it.
 *
 * The line is high when the guest's ICH_HCR_EL2.En is set and the
 * synthesized ICH_MISR_EL2 has *any* bit set.  The MISR value must be
 * reduced to a truth value with a logical test: and-ing it into a bool
 * would only keep bit 0 and silently drop every other maintenance
 * condition.
 */
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu)
{
        bool level;

        level = (__vcpu_sys_reg(vcpu, ICH_HCR_EL2) & ICH_HCR_EL2_En) &&
                vgic_v3_get_misr(vcpu);

        kvm_vgic_inject_irq(vcpu->kvm, vcpu,
                            vcpu->kvm->arch.vgic.mi_intid, level, vcpu);
}
























































  163 
























































































































































































































































































































  163 




















































































































































































































































































































































































































































































  163 
  163 


  163 





  163 












  163 
  163 





  163 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
// SPDX-License-Identifier: GPL-2.0
/*
 * device_cgroup.c - device cgroup subsystem
 *
 * Copyright 2007 IBM Corp
 */

#include <linux/bpf-cgroup.h>
#include <linux/device_cgroup.h>
#include <linux/cgroup.h>
#include <linux/ctype.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>

#ifdef CONFIG_CGROUP_DEVICE

static DEFINE_MUTEX(devcgroup_mutex);

/* Default policy of a device cgroup; exceptions are interpreted against it. */
enum devcg_behavior {
        /* not online: set at allocation and again on offline */
        DEVCG_DEFAULT_NONE,
        /* default allow: what a cgroup without a parent gets when onlined */
        DEVCG_DEFAULT_ALLOW,
        /* default deny: the exception list is shown as the allowed devices */
        DEVCG_DEFAULT_DENY,
};

/*
 * exception list locking rules:
 * hold devcgroup_mutex for update/read.
 * hold rcu_read_lock() for read.
 */

/* One entry of a device cgroup's exception list. */
struct dev_exception_item {
        /* device numbers; ~0 is printed as "*" and stands for "match all" */
        u32 major, minor;
        /* device type, one of the DEVCG_DEV_* values */
        short type;
        /* mask of DEVCG_ACC_* access bits */
        short access;
        /* linkage into dev_cgroup->exceptions */
        struct list_head list;
        /* used to defer freeing via kfree_rcu() */
        struct rcu_head rcu;
};

/* Per-cgroup state of the device controller. */
struct dev_cgroup {
        /* embedded css; css_to_devcgroup() maps it back to this struct */
        struct cgroup_subsys_state css;
        /* list of struct dev_exception_item */
        struct list_head exceptions;
        /* default policy; DEVCG_DEFAULT_NONE while not online */
        enum devcg_behavior behavior;
};

/* Map a css to its enclosing dev_cgroup; a NULL css yields NULL. */
static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
{
        if (!s)
                return NULL;

        return container_of(s, struct dev_cgroup, css);
}

/* Return the device cgroup that @task belongs to, via its devices css. */
static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
{
        return css_to_devcgroup(task_css(task, devices_cgrp_id));
}

/*
 * Append a duplicate of every exception in @orig to @dest.
 *
 * Returns 0 on success.  If an allocation fails, every entry on @dest is
 * unlinked and freed and -ENOMEM is returned.
 *
 * called under devcgroup_mutex
 */
static int dev_exceptions_copy(struct list_head *dest, struct list_head *orig)
{
        struct dev_exception_item *src, *dup, *next;

        lockdep_assert_held(&devcgroup_mutex);

        list_for_each_entry(src, orig, list) {
                dup = kmemdup(src, sizeof(*src), GFP_KERNEL);
                if (dup) {
                        list_add_tail(&dup->list, dest);
                        continue;
                }

                /* Out of memory: tear down everything on @dest. */
                list_for_each_entry_safe(dup, next, dest, list) {
                        list_del(&dup->list);
                        kfree(dup);
                }
                return -ENOMEM;
        }

        return 0;
}

static void dev_exceptions_move(struct list_head *dest, struct list_head *orig)
{
        struct dev_exception_item *ex, *tmp;

        lockdep_assert_held(&devcgroup_mutex);

        list_for_each_entry_safe(ex, tmp, orig, list) {
                list_move_tail(&ex->list, dest);
        }
}

/*
 * Add exception @ex to @dev_cgroup.  If an entry with the same type, major
 * and minor already exists, @ex's access bits are merged into it; otherwise
 * a copy of @ex is appended to the list.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated.
 *
 * called under devcgroup_mutex
 */
static int dev_exception_add(struct dev_cgroup *dev_cgroup,
                             struct dev_exception_item *ex)
{
        struct dev_exception_item *fresh, *cur;

        lockdep_assert_held(&devcgroup_mutex);

        fresh = kmemdup(ex, sizeof(*ex), GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        list_for_each_entry(cur, &dev_cgroup->exceptions, list) {
                if (cur->type == ex->type &&
                    cur->major == ex->major &&
                    cur->minor == ex->minor) {
                        /* Merged into an existing entry: the copy is unused. */
                        cur->access |= ex->access;
                        kfree(fresh);
                        fresh = NULL;
                }
        }

        if (fresh)
                list_add_tail_rcu(&fresh->list, &dev_cgroup->exceptions);

        return 0;
}

/*
 * Remove @ex's access bits from every entry of @dev_cgroup that has the
 * same type, major and minor.  An entry left without any access bit is
 * unlinked and freed after an RCU grace period.
 *
 * called under devcgroup_mutex
 */
static void dev_exception_rm(struct dev_cgroup *dev_cgroup,
                             struct dev_exception_item *ex)
{
        struct dev_exception_item *cur, *next;

        lockdep_assert_held(&devcgroup_mutex);

        list_for_each_entry_safe(cur, next, &dev_cgroup->exceptions, list) {
                if (cur->type != ex->type ||
                    cur->major != ex->major ||
                    cur->minor != ex->minor)
                        continue;

                cur->access &= ~ex->access;
                if (cur->access)
                        continue;

                list_del_rcu(&cur->list);
                kfree_rcu(cur, rcu);
        }
}

static void __dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
        struct dev_exception_item *ex, *tmp;

        list_for_each_entry_safe(ex, tmp, &dev_cgroup->exceptions, list) {
                list_del_rcu(&ex->list);
                kfree_rcu(ex, rcu);
        }
}

/**
 * dev_exception_clean - frees all entries of the exception list
 * @dev_cgroup: dev_cgroup with the exception list to be cleaned
 *
 * Asserts that devcgroup_mutex is held, then defers to
 * __dev_exception_clean().
 *
 * called under devcgroup_mutex
 */
static void dev_exception_clean(struct dev_cgroup *dev_cgroup)
{
        lockdep_assert_held(&devcgroup_mutex);

        __dev_exception_clean(dev_cgroup);
}

/*
 * A device cgroup counts as online once its behavior is anything other than
 * DEVCG_DEFAULT_NONE (the value set at allocation and on offline).
 */
static inline bool is_devcg_online(const struct dev_cgroup *devcg)
{
        return (devcg->behavior != DEVCG_DEFAULT_NONE);
}

/**
 * devcgroup_online - initializes devcgroup's behavior and exceptions based on
 *                       parent's
 * @css: css getting online
 *
 * A cgroup without a parent defaults to DEVCG_DEFAULT_ALLOW.  Otherwise the
 * parent's exceptions are copied and, if that succeeds, its behavior is
 * inherited.
 *
 * returns 0 in case of success, error code otherwise
 */
static int devcgroup_online(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *self = css_to_devcgroup(css);
        struct dev_cgroup *parent = css_to_devcgroup(css->parent);
        int ret = 0;

        mutex_lock(&devcgroup_mutex);

        if (parent) {
                ret = dev_exceptions_copy(&self->exceptions,
                                          &parent->exceptions);
                if (ret == 0)
                        self->behavior = parent->behavior;
        } else {
                self->behavior = DEVCG_DEFAULT_ALLOW;
        }

        mutex_unlock(&devcgroup_mutex);

        return ret;
}

/*
 * Mark the device cgroup behind @css as no longer online by resetting its
 * behavior to DEVCG_DEFAULT_NONE, under devcgroup_mutex.
 */
static void devcgroup_offline(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *devcg = css_to_devcgroup(css);

        mutex_lock(&devcgroup_mutex);
        devcg->behavior = DEVCG_DEFAULT_NONE;
        mutex_unlock(&devcgroup_mutex);
}

/*
 * Allocate a zeroed dev_cgroup with an empty exception list and behavior
 * DEVCG_DEFAULT_NONE.  Returns its css, or ERR_PTR(-ENOMEM) on allocation
 * failure.  @parent_css is not used here.
 *
 * called from kernel/cgroup/cgroup.c with cgroup_lock() held.
 */
static struct cgroup_subsys_state *
devcgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct dev_cgroup *devcg = kzalloc(sizeof(*devcg), GFP_KERNEL);

        if (!devcg)
                return ERR_PTR(-ENOMEM);

        devcg->behavior = DEVCG_DEFAULT_NONE;
        INIT_LIST_HEAD(&devcg->exceptions);

        return &devcg->css;
}

/* Release the dev_cgroup behind @css along with all of its exceptions. */
static void devcgroup_css_free(struct cgroup_subsys_state *css)
{
        struct dev_cgroup *devcg = css_to_devcgroup(css);

        __dev_exception_clean(devcg);
        kfree(devcg);
}

#define DEVCG_ALLOW 1
#define DEVCG_DENY 2
#define DEVCG_LIST 3

#define MAJMINLEN 13
#define ACCLEN 4

/*
 * Render the DEVCG_ACC_* bits of @access into @acc as a string made of
 * 'r', 'w' and 'm', in that order.  @acc must be at least ACCLEN bytes; it
 * is zeroed first, which also provides the terminating NUL.
 */
static void set_access(char *acc, short access)
{
        char *out = acc;

        memset(acc, 0, ACCLEN);

        if (access & DEVCG_ACC_READ)
                *out++ = 'r';
        if (access & DEVCG_ACC_WRITE)
                *out++ = 'w';
        if (access & DEVCG_ACC_MKNOD)
                *out++ = 'm';
}

/*
 * Map a DEVCG_DEV_* device type to its one-letter representation:
 * 'a' (all), 'c' (char) or 'b' (block).  Any other value yields 'X'.
 */
static char type_to_char(short type)
{
        switch (type) {
        case DEVCG_DEV_ALL:
                return 'a';
        case DEVCG_DEV_CHAR:
                return 'c';
        case DEVCG_DEV_BLOCK:
                return 'b';
        default:
                return 'X';
        }
}

/*
 * Format device number @m into @str: the wildcard value ~0 becomes "*",
 * anything else is printed as an unsigned decimal.  @str must be large
 * enough for the result and its terminating NUL.
 */
static void set_majmin(char *str, unsigned m)
{
        if (m != ~0) {
                sprintf(str, "%u", m);
                return;
        }

        strcpy(str, "*");
}

/*
 * seq_show handler for the "list" control file.
 *
 * For compatibility the file always reads as a whitelist of devices:
 *  - with a default-allow policy a single "a *:* rwm" line is printed;
 *  - otherwise every entry of the exception list is printed, one per line.
 *
 * The exception list is walked under rcu_read_lock(). Always returns 0.
 */
static int devcgroup_seq_show(struct seq_file *m, void *v)
{
        struct dev_cgroup *cg = css_to_devcgroup(seq_css(m));
        struct dev_exception_item *item;
        char major_str[MAJMINLEN], minor_str[MAJMINLEN], perms[ACCLEN];

        rcu_read_lock();
        if (cg->behavior != DEVCG_DEFAULT_ALLOW) {
                /* default deny: the exceptions are the allowed devices */
                list_for_each_entry_rcu(item, &cg->exceptions, list) {
                        set_access(perms, item->access);
                        set_majmin(major_str, item->major);
                        set_majmin(minor_str, item->minor);
                        seq_printf(m, "%c %s:%s %s\n",
                                   type_to_char(item->type),
                                   major_str, minor_str, perms);
                }
        } else {
                /* default allow: report "all devices, all access" */
                set_access(perms, DEVCG_ACC_MASK);
                set_majmin(major_str, ~0);
                set_majmin(minor_str, ~0);
                seq_printf(m, "%c %s:%s %s\n", type_to_char(DEVCG_DEV_ALL),
                           major_str, minor_str, perms);
        }
        rcu_read_unlock();

        return 0;
}

/**
 * match_exception - look for an exception that fully covers a request
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * An exception is a complete match when it has the requested type bit, its
 * major and minor are either the wildcard or equal to the requested ones,
 * and every requested access bit is present in the exception.
 *
 * Return: true if some exception in the list matches completely
 */
static bool match_exception(struct list_head *exceptions, short type,
                            u32 major, u32 minor, short access)
{
        struct dev_exception_item *ex;

        list_for_each_entry_rcu(ex, exceptions, list) {
                bool type_ok = (!(type & DEVCG_DEV_BLOCK) ||
                                (ex->type & DEVCG_DEV_BLOCK)) &&
                               (!(type & DEVCG_DEV_CHAR) ||
                                (ex->type & DEVCG_DEV_CHAR));
                bool major_ok = ex->major == ~0 || ex->major == major;
                bool minor_ok = ex->minor == ~0 || ex->minor == minor;
                /* provided access cannot have more than the exception rule */
                bool access_ok = !(access & (~ex->access));

                if (type_ok && major_ok && minor_ok && access_ok)
                        return true;
        }

        return false;
}

/**
 * match_exception_partial - look for an exception that overlaps a request
 * @exceptions: list of exceptions
 * @type: device type (DEVCG_DEV_BLOCK or DEVCG_DEV_CHAR)
 * @major: device file major number, ~0 to match all
 * @minor: device file minor number, ~0 to match all
 * @access: permission mask (DEVCG_ACC_READ, DEVCG_ACC_WRITE, DEVCG_ACC_MKNOD)
 *
 * An exception is a partial match when its range contains *any* of the
 * devices described by the parameters and shares at least one access bit
 * with @access. This is used to make sure no extra access is being granted
 * that is forbidden by any entry of the exception list.
 *
 * Return: true if the provided range overlaps, even partially, with an
 * exception in the list
 */
static bool match_exception_partial(struct list_head *exceptions, short type,
                                    u32 major, u32 minor, short access)
{
        struct dev_exception_item *ex;

        list_for_each_entry_rcu(ex, exceptions, list,
                                lockdep_is_held(&devcgroup_mutex)) {
                bool type_ok = (!(type & DEVCG_DEV_BLOCK) ||
                                (ex->type & DEVCG_DEV_BLOCK)) &&
                               (!(type & DEVCG_DEV_CHAR) ||
                                (ex->type & DEVCG_DEV_CHAR));
                /*
                 * A wildcard on either side overlaps with anything; two
                 * concrete numbers overlap only when they are equal.
                 */
                bool major_overlaps = ex->major == ~0 || major == ~0 ||
                                      ex->major == major;
                bool minor_overlaps = ex->minor == ~0 || minor == ~0 ||
                                      ex->minor == minor;
                /* a single shared access bit is enough to overlap */
                bool access_overlaps = (access & ex->access) != 0;

                if (type_ok && major_overlaps && minor_overlaps &&
                    access_overlaps)
                        return true;
        }

        return false;
}

/**
 * verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions
 * @dev_cgroup: dev cgroup to be tested against
 * @refex: new exception
 * @behavior: behavior of the exception's dev_cgroup
 *
 * This is used to make sure a child cgroup won't have more privileges
 * than its parent. Must be called under rcu_read_lock() or with
 * devcgroup_mutex held.
 *
 * Return: true if @refex is acceptable, false otherwise
 */
static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
                          struct dev_exception_item *refex,
                          enum devcg_behavior behavior)
{
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
                         !lockdep_is_held(&devcgroup_mutex),
                         "device_cgroup:verify_new_ex called without proper synchronization");

        /*
         * @dev_cgroup is not default-allow. Only behavior ==
         * DEVCG_DEFAULT_DENY is allowed here, so the new exception grants
         * access to more devices and must be completely contained in one of
         * @dev_cgroup's exceptions.
         */
        if (dev_cgroup->behavior != DEVCG_DEFAULT_ALLOW)
                return match_exception(&dev_cgroup->exceptions, refex->type,
                                       refex->major, refex->minor,
                                       refex->access);

        /*
         * Both sides are default-allow: a new exception in the child only
         * adds extra restrictions, so it never matters.
         */
        if (behavior == DEVCG_DEFAULT_ALLOW)
                return true;

        /*
         * Default-allow @dev_cgroup, default-deny child: the new exception
         * adds accessible devices, so it can't overlap any of
         * @dev_cgroup's exceptions, even slightly.
         */
        return !match_exception_partial(&dev_cgroup->exceptions,
                                        refex->type,
                                        refex->major,
                                        refex->minor,
                                        refex->access);
}

/*
 * parent_has_perm:
 * when adding a new allow rule to a device exception list, the rule
 * must be allowed in the parent device.
 *
 * Returns non-zero when @ex is acceptable. A cgroup without a parent is
 * never restricted.
 */
static int parent_has_perm(struct dev_cgroup *childcg,
                                  struct dev_exception_item *ex)
{
        struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);

        if (parent)
                return verify_new_ex(parent, ex, childcg->behavior);

        return 1;
}

/**
 * parent_allows_removal - verify if it's ok to remove an exception
 * @childcg: child cgroup from where the exception will be removed
 * @ex: exception being removed
 *
 * When removing an exception in cgroups with default ALLOW policy, it must
 * be checked if removing it will give the child cgroup more access than the
 * parent.
 *
 * Return: true if it's ok to remove exception, false otherwise
 */
static bool parent_allows_removal(struct dev_cgroup *childcg,
                                  struct dev_exception_item *ex)
{
        struct dev_cgroup *parent = css_to_devcgroup(childcg->css.parent);

        if (!parent)
                return true;

        /* It's always allowed to remove access to devices */
        if (childcg->behavior == DEVCG_DEFAULT_DENY)
                return true;

        /*
         * Make sure you're not removing part or a whole exception existing in
         * the parent cgroup
         */
        return !match_exception_partial(&parent->exceptions, ex->type,
                                        ex->major, ex->minor, ex->access);
}

/**
 * may_allow_all - checks if it's possible to change the behavior to
 *                   allow based on parent's rules.
 * @parent: device cgroup's parent, NULL when there is none
 *
 * returns: != 0 in case it's allowed (no parent, or the parent itself is
 * default-allow), 0 otherwise
 */
static inline int may_allow_all(struct dev_cgroup *parent)
{
        return !parent || parent->behavior == DEVCG_DEFAULT_ALLOW;
}

/**
 * revalidate_active_exceptions - re-check a cgroup's exceptions against its
 *                                   parent and drop the ones that are no
 *                                   longer permitted.
 *                                   Called with devcgroup_mutex held.
 * @devcg: cgroup which exceptions will be checked
 *
 * This is one of the three key functions for hierarchy implementation.
 * It re-evaluates all of the cgroup's active exceptions after a change in
 * a parent's exceptions.
 * Refer to Documentation/admin-guide/cgroup-v1/devices.rst for more details.
 */
static void revalidate_active_exceptions(struct dev_cgroup *devcg)
{
        struct list_head *pos, *next;

        /* the _safe iterator is needed because entries are removed en route */
        list_for_each_safe(pos, next, &devcg->exceptions) {
                struct dev_exception_item *ex =
                        container_of(pos, struct dev_exception_item, list);

                if (parent_has_perm(devcg, ex))
                        continue;

                dev_exception_rm(devcg, ex);
        }
}

/**
 * propagate_exception - propagates a new exception to the children
 * @devcg_root: device cgroup that added a new exception
 * @ex: new exception to be propagated
 *
 * Walks the descendants of @devcg_root, skipping @devcg_root itself and any
 * descendant that is_devcg_online() reports as not online. For each remaining
 * descendant the exception is either added (both @devcg_root and the
 * descendant are default-allow) or removed (every other combination), and
 * the descendant's exception list is then revalidated against its parent.
 *
 * The comment in the loop relies on devcgroup_mutex being held by the caller.
 *
 * returns: 0 in case of success, != 0 in case of error
 */
static int propagate_exception(struct dev_cgroup *devcg_root,
                               struct dev_exception_item *ex)
{
        struct cgroup_subsys_state *pos;
        int rc = 0;

        /* the descendant iterator itself runs under the RCU read lock */
        rcu_read_lock();

        css_for_each_descendant_pre(pos, &devcg_root->css) {
                struct dev_cgroup *devcg = css_to_devcgroup(pos);

                /*
                 * Because devcgroup_mutex is held, no devcg will become
                 * online or offline during the tree walk (see on/offline
                 * methods), and online ones are safe to access outside RCU
                 * read lock without bumping refcnt.
                 */
                if (pos == &devcg_root->css || !is_devcg_online(devcg))
                        continue;

                /*
                 * Leave the RCU read-side section while the exception list
                 * is modified; it is re-entered at the bottom of the loop
                 * before the iterator advances.
                 * NOTE(review): presumably because the add path may sleep --
                 * confirm against dev_exception_add().
                 */
                rcu_read_unlock();

                /*
                 * in case both root's behavior and devcg is allow, a new
                 * restriction means adding to the exception list
                 */
                if (devcg_root->behavior == DEVCG_DEFAULT_ALLOW &&
                    devcg->behavior == DEVCG_DEFAULT_ALLOW) {
                        rc = dev_exception_add(devcg, ex);
                        if (rc)
                                /* RCU read lock is not held here, so a plain return is fine */
                                return rc;
                } else {
                        /*
                         * in the other possible cases:
                         * root's behavior: allow, devcg's: deny
                         * root's behavior: deny, devcg's: deny
                         * the exception will be removed
                         */
                        dev_exception_rm(devcg, ex);
                }
                /* drop whatever the (possibly changed) parent no longer permits */
                revalidate_active_exceptions(devcg);

                rcu_read_lock();
        }

        rcu_read_unlock();
        return rc;
}

/*
 * Modify the exception list using allow/deny rules.
 * CAP_SYS_ADMIN is needed for this.  It's at least separate from CAP_MKNOD
 * so we can give a container CAP_MKNOD to let it create devices but not
 * modify the exception list.
 * It seems likely we'll want to add a CAP_CONTAINER capability to allow
 * us to also grant CAP_SYS_ADMIN to containers without giving away the
 * device exception list controls, but for now we'll stick with CAP_SYS_ADMIN
 *
 * Taking rules away is always allowed (given CAP_SYS_ADMIN).  Granting
 * new access is only allowed if you're in the top-level cgroup, or your
 * parent cgroup has the access you're asking for.
 *
 * @devcgroup: cgroup whose policy is being changed
 * @filetype:  DEVCG_ALLOW or DEVCG_DENY, i.e. which control file was written
 * @buffer:    NUL-terminated rule text
 *
 * Accepted rule text:
 *   "a..."                        - switch the whole default policy; only the
 *                                   first character is examined
 *   "<b|c> <major>:<minor> <acc>" - a single exception, where <major> and
 *                                   <minor> are decimal numbers or '*', and
 *                                   <acc> is up to three letters out of
 *                                   'r', 'w', 'm'
 * Exactly one whitespace character is consumed after the type letter and
 * after the minor number.
 *
 * The only visible caller, devcgroup_access_write(), holds devcgroup_mutex
 * around this call.
 *
 * Returns 0 on success, -EPERM when the caller lacks CAP_SYS_ADMIN or the
 * parent does not permit the change, -EINVAL for malformed input, an unknown
 * @filetype, or a policy switch attempted while online children exist, or
 * the error reported by the exception list helpers.
 */
static int devcgroup_update_access(struct dev_cgroup *devcgroup,
                                   int filetype, char *buffer)
{
        const char *b;
        char temp[12];                /* 11 + 1 characters needed for a u32 */
        int count, rc = 0;
        struct dev_exception_item ex;
        struct dev_cgroup *parent = css_to_devcgroup(devcgroup->css.parent);
        struct dev_cgroup tmp_devcgrp;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        memset(&ex, 0, sizeof(ex));
        memset(&tmp_devcgrp, 0, sizeof(tmp_devcgrp));
        b = buffer;

        /* first character selects the device type, or 'a' for "all" */
        switch (*b) {
        case 'a':
                switch (filetype) {
                case DEVCG_ALLOW:
                        /* the default policy can't change under online children */
                        if (css_has_online_children(&devcgroup->css))
                                return -EINVAL;

                        if (!may_allow_all(parent))
                                return -EPERM;
                        /* no parent: simply allow everything and forget the exceptions */
                        if (!parent) {
                                devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
                                dev_exception_clean(devcgroup);
                                break;
                        }

                        /*
                         * With a parent, the new exception list becomes a copy
                         * of the parent's. The current list is first copied
                         * into tmp_devcgrp so that it can be moved back if
                         * copying the parent's list fails.
                         */
                        INIT_LIST_HEAD(&tmp_devcgrp.exceptions);
                        rc = dev_exceptions_copy(&tmp_devcgrp.exceptions,
                                                 &devcgroup->exceptions);
                        if (rc)
                                return rc;
                        dev_exception_clean(devcgroup);
                        rc = dev_exceptions_copy(&devcgroup->exceptions,
                                                 &parent->exceptions);
                        if (rc) {
                                /* roll back to the list saved above */
                                dev_exceptions_move(&devcgroup->exceptions,
                                                    &tmp_devcgrp.exceptions);
                                return rc;
                        }
                        devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
                        /* the saved copy is no longer needed */
                        dev_exception_clean(&tmp_devcgrp);
                        break;
                case DEVCG_DENY:
                        if (css_has_online_children(&devcgroup->css))
                                return -EINVAL;

                        dev_exception_clean(devcgroup);
                        devcgroup->behavior = DEVCG_DEFAULT_DENY;
                        break;
                default:
                        return -EINVAL;
                }
                /* 'a' rules are complete here; nothing else is parsed */
                return 0;
        case 'b':
                ex.type = DEVCG_DEV_BLOCK;
                break;
        case 'c':
                ex.type = DEVCG_DEV_CHAR;
                break;
        default:
                return -EINVAL;
        }
        b++;
        if (!isspace(*b))
                return -EINVAL;
        b++;
        /* read major: '*' is the wildcard, otherwise a run of decimal digits */
        if (*b == '*') {
                ex.major = ~0;
                b++;
        } else if (isdigit(*b)) {
                memset(temp, 0, sizeof(temp));
                /*
                 * Copy at most sizeof(temp) - 1 digits so temp stays
                 * NUL-terminated; b ends up on the first character that was
                 * not copied.
                 */
                for (count = 0; count < sizeof(temp) - 1; count++) {
                        temp[count] = *b;
                        b++;
                        if (!isdigit(*b))
                                break;
                }
                /* any conversion failure is reported as -EINVAL */
                rc = kstrtou32(temp, 10, &ex.major);
                if (rc)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (*b != ':')
                return -EINVAL;
        b++;

        /* read minor */
        if (*b == '*') {
                ex.minor = ~0;
                b++;
        } else if (isdigit(*b)) {
                memset(temp, 0, sizeof(temp));
                for (count = 0; count < sizeof(temp) - 1; count++) {
                        temp[count] = *b;
                        b++;
                        if (!isdigit(*b))
                                break;
                }
                rc = kstrtou32(temp, 10, &ex.minor);
                if (rc)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (!isspace(*b))
                return -EINVAL;
        /*
         * read access: at most three characters are examined; a newline or
         * NUL ends the list early. Nothing past the third character is
         * looked at.
         */
        for (b++, count = 0; count < 3; count++, b++) {
                switch (*b) {
                case 'r':
                        ex.access |= DEVCG_ACC_READ;
                        break;
                case 'w':
                        ex.access |= DEVCG_ACC_WRITE;
                        break;
                case 'm':
                        ex.access |= DEVCG_ACC_MKNOD;
                        break;
                case '\n':
                case '\0':
                        /* force the loop condition to fail */
                        count = 3;
                        break;
                default:
                        return -EINVAL;
                }
        }

        switch (filetype) {
        case DEVCG_ALLOW:
                /*
                 * If the default policy is to allow by default, try to remove
                 * a matching exception instead. And be silent about it: we
                 * don't want to break compatibility
                 */
                if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
                        /* Check if the parent allows removing it first */
                        if (!parent_allows_removal(devcgroup, &ex))
                                return -EPERM;
                        dev_exception_rm(devcgroup, &ex);
                        break;
                }

                /* default deny: the rule grants access, so the parent must permit it */
                if (!parent_has_perm(devcgroup, &ex))
                        return -EPERM;
                rc = dev_exception_add(devcgroup, &ex);
                break;
        case DEVCG_DENY:
                /*
                 * If the default policy is to deny by default, try to remove
                 * a matching exception instead. And be silent about it: we
                 * don't want to break compatibility
                 */
                if (devcgroup->behavior == DEVCG_DEFAULT_DENY)
                        dev_exception_rm(devcgroup, &ex);
                else
                        rc = dev_exception_add(devcgroup, &ex);

                if (rc)
                        break;
                /* we only propagate new restrictions */
                rc = propagate_exception(devcgroup, &ex);
                break;
        default:
                rc = -EINVAL;
        }
        return rc;
}

/*
 * Write handler shared by the "allow" and "deny" control files.
 *
 * The written text is stripped of surrounding whitespace and handed to
 * devcgroup_update_access() together with the file's private value
 * (DEVCG_ALLOW or DEVCG_DENY), all under devcgroup_mutex.
 *
 * Returns @nbytes on success, or the error from devcgroup_update_access().
 */
static ssize_t devcgroup_access_write(struct kernfs_open_file *of,
                                      char *buf, size_t nbytes, loff_t off)
{
        int err;

        mutex_lock(&devcgroup_mutex);
        err = devcgroup_update_access(css_to_devcgroup(of_css(of)),
                                      of_cft(of)->private, strstrip(buf));
        mutex_unlock(&devcgroup_mutex);

        if (err)
                return err;

        return nbytes;
}

/*
 * Control files of the device cgroup.
 *
 * "allow" and "deny" share one write handler; the .private value tells
 * devcgroup_update_access() which of the two files was written. "list" is
 * read-only here (it has a seq_show handler and no write handler).
 */
static struct cftype dev_cgroup_files[] = {
        {
                .name = "allow",
                .write = devcgroup_access_write,
                .private = DEVCG_ALLOW,
        },
        {
                .name = "deny",
                .write = devcgroup_access_write,
                .private = DEVCG_DENY,
        },
        {
                .name = "list",
                .seq_show = devcgroup_seq_show,
                .private = DEVCG_LIST,
        },
        { }        /* terminate */
};

/*
 * Subsystem descriptor wiring the css lifecycle callbacks and the control
 * files defined above into the cgroup core.
 * NOTE(review): the files are registered via .legacy_cftypes only, which
 * presumably limits them to the cgroup v1 hierarchy -- confirm.
 */
struct cgroup_subsys devices_cgrp_subsys = {
        .css_alloc = devcgroup_css_alloc,
        .css_free = devcgroup_css_free,
        .css_online = devcgroup_online,
        .css_offline = devcgroup_offline,
        .legacy_cftypes = dev_cgroup_files,
};

/**
 * devcgroup_legacy_check_permission - checks if an inode operation is permitted
 * @type: device type
 * @major: device major number
 * @minor: device minor number
 * @access: combination of DEVCG_ACC_WRITE, DEVCG_ACC_READ and DEVCG_ACC_MKNOD
 *
 * The check is made against the device cgroup of the current task, under
 * rcu_read_lock().
 *
 * returns 0 on success, -EPERM in case the operation is not permitted
 */
static int devcgroup_legacy_check_permission(short type, u32 major, u32 minor,
                                        short access)
{
        struct dev_cgroup *devcg;
        bool allowed;

        rcu_read_lock();
        devcg = task_devcgroup(current);
        if (devcg->behavior != DEVCG_DEFAULT_ALLOW) {
                /* Need to match completely one exception to be allowed */
                allowed = match_exception(&devcg->exceptions, type, major,
                                          minor, access);
        } else {
                /* Can't match any of the exceptions, even partially */
                allowed = !match_exception_partial(&devcg->exceptions,
                                                   type, major, minor, access);
        }
        rcu_read_unlock();

        return allowed ? 0 : -EPERM;
}

#endif /* CONFIG_CGROUP_DEVICE */

#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)

/*
 * Entry point for device access checks.
 *
 * The result of BPF_CGROUP_RUN_PROG_DEVICE_CGROUP() is consulted first; a
 * non-zero result is returned as is. Otherwise the legacy device cgroup
 * check decides when CONFIG_CGROUP_DEVICE is enabled, and the access is
 * allowed (0) when it is not.
 */
int devcgroup_check_permission(short type, u32 major, u32 minor, short access)
{
        int bpf_rc = BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access);

        if (bpf_rc != 0)
                return bpf_rc;

#ifdef CONFIG_CGROUP_DEVICE
        return devcgroup_legacy_check_permission(type, major, minor, access);
#else /* CONFIG_CGROUP_DEVICE */
        return 0;
#endif /* CONFIG_CGROUP_DEVICE */
}
EXPORT_SYMBOL(devcgroup_check_permission);
#endif /* defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) */

















   13 
























   13 










   13 



















  156 















  156 

   13 

   13 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_HYP_FAULT_H__
#define __ARM64_KVM_HYP_FAULT_H__

#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

/*
 * Decide from the syndrome whether the faulting address may be translated
 * again by __translate_far_to_hpfar().
 *
 * Returns false for faults that esr_fsc_is_sea_ttw() or
 * esr_fsc_is_secc_ttw() recognise, and for an ESR_ELx_FSC_EXTABT fault
 * reported with ESR_ELx_FnV set; true for everything else.
 * NOTE(review): FnV presumably means the FAR value is not valid -- confirm
 * against the architecture manual.
 */
static inline bool __fault_safe_to_translate(u64 esr)
{
        if (esr_fsc_is_sea_ttw(esr) || esr_fsc_is_secc_ttw(esr))
                return false;

        if ((esr & ESR_ELx_FSC) != ESR_ELx_FSC_EXTABT)
                return true;

        return !(esr & ESR_ELx_FnV);
}

/*
 * Translate the faulting guest VA with an AT instruction and convert the
 * result to HPFAR format.
 *
 * @far:   faulting virtual address
 * @hpfar: output, written only on success
 *
 * Returns true and stores PAR_TO_HPFAR() of the translation result in
 * *@hpfar on success. Returns false, leaving *@hpfar untouched, when the AT
 * operation reports an error or the resulting PAR has SYS_PAR_EL1_F set.
 * PAR_EL1 is restored to its previous value in every case.
 */
static inline bool __translate_far_to_hpfar(u64 far, u64 *hpfar)
{
        int ret;
        u64 par, tmp;

        /*
         * Resolve the IPA the hard way using the guest VA.
         *
         * Stage-1 translation already validated the memory access
         * rights. As such, we can use the EL1 translation regime, and
         * don't have to distinguish between EL0 and EL1 access.
         *
         * We do need to save/restore PAR_EL1 though, as we haven't
         * saved the guest context yet, and we may return early...
         */
        par = read_sysreg_par();
        /* the AT operation used depends on system_supports_poe() */
        ret = system_supports_poe() ? __kvm_at(OP_AT_S1E1A, far) :
                                      __kvm_at(OP_AT_S1E1R, far);
        if (!ret)
                tmp = read_sysreg_par();
        else
                tmp = SYS_PAR_EL1_F; /* back to the guest */
        /* restore the saved PAR_EL1 before any return */
        write_sysreg(par, par_el1);

        if (unlikely(tmp & SYS_PAR_EL1_F))
                return false; /* Translation failed, back to guest */

        /* Convert PAR to HPFAR format */
        *hpfar = PAR_TO_HPFAR(tmp);
        return true;
}

/*
 * Checks for the conditions when HPFAR_EL2 is written, per ARM ARM R_FKLWR.
 *
 * Returns true when the syndrome describes a translation fault, an access
 * flag fault, a permission fault on a stage-1 table walk (ESR_ELx_S1PTW),
 * or an address size fault -- except that translation faults are never
 * trusted on CPUs with the ARM64_WORKAROUND_834220 capability.
 */
static inline bool __hpfar_valid(u64 esr)
{
        const bool erratum_834220 = cpus_have_final_cap(ARM64_WORKAROUND_834220);
        const bool xlat_fault = esr_fsc_is_translation_fault(esr);

        /*
         * CPUs affected by ARM erratum #834220 may incorrectly report a
         * stage-2 translation fault when a stage-1 permission fault occurs.
         *
         * Reporting HPFAR as not valid makes the caller re-walk the page
         * tables to determine if a stage-1 fault actually occurred.
         */
        if (xlat_fault)
                return !erratum_834220;

        return esr_fsc_is_access_flag_fault(esr) ||
               ((esr & ESR_ELx_S1PTW) && esr_fsc_is_permission_fault(esr)) ||
               esr_fsc_is_addr_sz_fault(esr);
}

/*
 * Capture the fault address registers for a guest fault.
 *
 * @esr:   syndrome of the fault
 * @fault: filled with FAR_EL2, and with HPFAR_EL2 when one can be obtained
 *
 * HPFAR is read from the register when __hpfar_valid() says it was written,
 * otherwise it is derived by translating the faulting VA when that is safe.
 *
 * Returns false only when that translation fails. Returns true otherwise;
 * fault->hpfar_el2 is left at 0 when the fault is not safe to translate,
 * and carries HPFAR_EL2_NS when a valid value was obtained.
 */
static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault)
{
        u64 ipa_reg;

        fault->far_el2 = read_sysreg_el2(SYS_FAR);
        fault->hpfar_el2 = 0;

        if (__hpfar_valid(esr)) {
                ipa_reg = read_sysreg(hpfar_el2);
        } else {
                if (unlikely(!__fault_safe_to_translate(esr)))
                        return true;
                if (!__translate_far_to_hpfar(fault->far_el2, &ipa_reg))
                        return false;
        }

        /*
         * Hijack HPFAR_EL2.NS (RES0 in Non-secure) to indicate a valid
         * HPFAR value.
         */
        fault->hpfar_el2 = ipa_reg | HPFAR_EL2_NS;
        return true;
}

#endif





































  211 



    4 















  216 














    4 









































  221 









  221 












    1 










  220 
  220 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/kasan-checks.h>
#include <linux/thread_info.h>
#include <linux/uaccess.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>

#include <asm/byteorder.h>
#include <asm/word-at-a-time.h>

/*
 * IS_UNALIGNED(src, dst) - non-zero when either pointer is not aligned to
 * sizeof(long).
 *
 * With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the check is compiled out and
 * the macro is the constant 0 (the arguments are not evaluated).
 *
 * The arguments are parenthesized so that the casts apply to the whole
 * argument expression rather than to its first operand only.
 */
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
#define IS_UNALIGNED(src, dst)        0
#else
#define IS_UNALIGNED(src, dst)        \
        (((long)(dst) | (long)(src)) & (sizeof(long) - 1))
#endif

/*
 * Copy a NUL-terminated string from user space and return its length, not
 * counting the terminating '\0'.
 *
 * 'count' is the limit the caller asked for: if it is reached, 'count' is
 * returned and that is not an error. 'max' is how many bytes may be read
 * before the end of the user address space: running out of 'max' while the
 * caller still wanted more yields -EFAULT.
 */
static __always_inline long do_strncpy_from_user(char *dst, const char __user *src,
                                        unsigned long count, unsigned long max)
{
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
        unsigned long res = 0;

        if (IS_UNALIGNED(src, dst))
                goto byte_at_a_time;

        for (; max >= sizeof(unsigned long);
             res += sizeof(unsigned long), max -= sizeof(unsigned long)) {
                unsigned long word, zbits, keep;

                /* A faulting word read is retried one byte at a time */
                unsafe_get_user(word, (unsigned long __user *)(src+res), byte_at_a_time);

                /* No NUL in this word: store it whole and move on */
                if (!has_zero(word, &zbits, &constants)) {
                        *(unsigned long *)(dst+res) = word;
                        continue;
                }

                /*
                 * The word contains the terminator. Clear every byte that
                 * follows the NUL before storing: string-oblivious code may
                 * read past the NUL, and it must not see whatever bytes
                 * happened to follow it in `src`.
                 *
                 * BPF map keys are one such consumer. They are treated as
                 * opaque bytes, so without this mask maps keyed by strings
                 * from strncpy_from_user() could hold several entries for
                 * semantically identical strings.
                 */
                zbits = prep_zero_mask(word, zbits, &constants);
                zbits = create_zero_mask(zbits);
                keep = zero_bytemask(zbits);
                *(unsigned long *)(dst+res) = word & keep;
                return res + find_zero(zbits);
        }

byte_at_a_time:
        for (; max; res++, max--) {
                char ch;

                unsafe_get_user(ch,src+res, efault);
                dst[res] = ch;
                if (!ch)
                        return res;
        }

        /*
         * 'max' is exhausted. If the caller still wanted more characters
         * than we delivered, the limit we hit was the address space one,
         * and that is an EFAULT. Otherwise we copied as much as requested.
         */
        if (res < count)
                goto efault;
        return res;

efault:
        return -EFAULT;
}

/**
 * strncpy_from_user - Copy a NUL terminated string from userspace.
 * @dst:   Kernel-space destination buffer; it must be able to hold at
 *         least @count bytes.
 * @src:   User-space address of the string to copy.
 * @count: Upper bound on the number of bytes copied, trailing NUL included.
 *
 * Copies a NUL-terminated string from userspace into @dst.
 *
 * Return: the length of the string, excluding the trailing NUL, on success.
 * If the string is longer than @count, @count bytes are copied and @count
 * is returned. If userspace cannot be accessed, -EFAULT is returned (part
 * of the data may already have been copied). A @count of zero or less
 * returns 0 without copying anything.
 */
long strncpy_from_user(char *dst, const char __user *src, long count)
{
        unsigned long limit, start, max;
        long retval;

        might_fault();
        if (should_fail_usercopy())
                return -EFAULT;
        if (unlikely(count <= 0))
                return 0;

        kasan_check_write(dst, count);
        check_object_size(dst, count, false);

        /* Masked access path: only the caller's limit is passed down */
        if (can_do_masked_user_access()) {
                src = masked_user_access_begin(src);
                retval = do_strncpy_from_user(dst, src, count, count);
                user_read_access_end();
                return retval;
        }

        limit = TASK_SIZE_MAX;
        start = (unsigned long)untagged_addr(src);
        if (unlikely(start >= limit))
                return -EFAULT;

        /*
         * Clamp the distance to the address limit to the caller's limit,
         * so the copy loop has a single bound to test.
         */
        max = limit - start;
        if (max > count)
                max = count;

        if (!user_read_access_begin(src, max))
                return -EFAULT;

        retval = do_strncpy_from_user(dst, src, count, max);
        user_read_access_end();
        return retval;
}
EXPORT_SYMBOL(strncpy_from_user);





























































































    3 






    4 







    7 



    7 
    7 
    7 
    7 

    7 




    7 



    7 
    7 
    7 
    7 

    7 









  154 





    7 












  154 





    7 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_HYP_DEBUG_SR_H__
#define __ARM64_KVM_HYP_DEBUG_SR_H__

#include <linux/compiler.h>
#include <linux/kvm_host.h>

#include <asm/debug-monitors.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

/*
 * Accessors for an indexed EL1 debug register. The register base name 'r'
 * and the literal index 'n' are token-pasted into r<n>_el1, which is then
 * handed to read_sysreg()/write_sysreg(). 'n' must therefore be a literal
 * token, not a run-time value.
 */
#define read_debug(r,n)                read_sysreg(r##n##_el1)
#define write_debug(v,r,n)        write_sysreg(v, r##n##_el1)

/*
 * save_debug() - read debug registers reg<nr>_el1 down to reg0_el1 into
 * ptr[nr] .. ptr[0].
 *
 * The switch is entered at 'case nr' and falls through every lower case,
 * so exactly the registers with index <= nr are read. Any 'nr' outside
 * 1..15 lands on the default label and only register 0 is saved. The
 * cases are spelled out one by one because read_debug() needs a literal
 * index to paste into the register name.
 */
#define save_debug(ptr,reg,nr)                                                \
        switch (nr) {                                                        \
        case 15:        ptr[15] = read_debug(reg, 15);                        \
                        fallthrough;                                        \
        case 14:        ptr[14] = read_debug(reg, 14);                        \
                        fallthrough;                                        \
        case 13:        ptr[13] = read_debug(reg, 13);                        \
                        fallthrough;                                        \
        case 12:        ptr[12] = read_debug(reg, 12);                        \
                        fallthrough;                                        \
        case 11:        ptr[11] = read_debug(reg, 11);                        \
                        fallthrough;                                        \
        case 10:        ptr[10] = read_debug(reg, 10);                        \
                        fallthrough;                                        \
        case 9:                ptr[9] = read_debug(reg, 9);                        \
                        fallthrough;                                        \
        case 8:                ptr[8] = read_debug(reg, 8);                        \
                        fallthrough;                                        \
        case 7:                ptr[7] = read_debug(reg, 7);                        \
                        fallthrough;                                        \
        case 6:                ptr[6] = read_debug(reg, 6);                        \
                        fallthrough;                                        \
        case 5:                ptr[5] = read_debug(reg, 5);                        \
                        fallthrough;                                        \
        case 4:                ptr[4] = read_debug(reg, 4);                        \
                        fallthrough;                                        \
        case 3:                ptr[3] = read_debug(reg, 3);                        \
                        fallthrough;                                        \
        case 2:                ptr[2] = read_debug(reg, 2);                        \
                        fallthrough;                                        \
        case 1:                ptr[1] = read_debug(reg, 1);                        \
                        fallthrough;                                        \
        default:        ptr[0] = read_debug(reg, 0);                        \
        }

/*
 * restore_debug() - write ptr[nr] .. ptr[0] back into debug registers
 * reg<nr>_el1 down to reg0_el1.
 *
 * Mirror image of save_debug(): the switch is entered at 'case nr' and
 * falls through every lower case, so exactly the registers with index
 * <= nr are written. Any 'nr' outside 1..15 lands on the default label and
 * only register 0 is restored.
 */
#define restore_debug(ptr,reg,nr)                                        \
        switch (nr) {                                                        \
        case 15:        write_debug(ptr[15], reg, 15);                        \
                        fallthrough;                                        \
        case 14:        write_debug(ptr[14], reg, 14);                        \
                        fallthrough;                                        \
        case 13:        write_debug(ptr[13], reg, 13);                        \
                        fallthrough;                                        \
        case 12:        write_debug(ptr[12], reg, 12);                        \
                        fallthrough;                                        \
        case 11:        write_debug(ptr[11], reg, 11);                        \
                        fallthrough;                                        \
        case 10:        write_debug(ptr[10], reg, 10);                        \
                        fallthrough;                                        \
        case 9:                write_debug(ptr[9], reg, 9);                        \
                        fallthrough;                                        \
        case 8:                write_debug(ptr[8], reg, 8);                        \
                        fallthrough;                                        \
        case 7:                write_debug(ptr[7], reg, 7);                        \
                        fallthrough;                                        \
        case 6:                write_debug(ptr[6], reg, 6);                        \
                        fallthrough;                                        \
        case 5:                write_debug(ptr[5], reg, 5);                        \
                        fallthrough;                                        \
        case 4:                write_debug(ptr[4], reg, 4);                        \
                        fallthrough;                                        \
        case 3:                write_debug(ptr[3], reg, 3);                        \
                        fallthrough;                                        \
        case 2:                write_debug(ptr[2], reg, 2);                        \
                        fallthrough;                                        \
        case 1:                write_debug(ptr[1], reg, 1);                        \
                        fallthrough;                                        \
        default:        write_debug(ptr[0], reg, 0);                        \
        }

/*
 * Pick the debug register file that matches the vCPU's current debug owner:
 * external_debug_state when the host owns the registers, vcpu_debug_state
 * when the guest does. VCPU_DEBUG_FREE is not expected here; it warns once
 * and is then treated like the guest-owned case. NULL is returned only if
 * debug_owner holds none of the handled values.
 */
static struct kvm_guest_debug_arch *__vcpu_debug_regs(struct kvm_vcpu *vcpu)
{
        struct kvm_guest_debug_arch *regs = NULL;

        switch (vcpu->arch.debug_owner) {
        case VCPU_DEBUG_HOST_OWNED:
                regs = &vcpu->arch.external_debug_state;
                break;
        case VCPU_DEBUG_FREE:
                WARN_ON_ONCE(1);
                fallthrough;
        case VCPU_DEBUG_GUEST_OWNED:
                regs = &vcpu->arch.vcpu_debug_state;
                break;
        }

        return regs;
}

/*
 * Read the hardware breakpoint/watchpoint registers into @dbg and
 * MDCCINT_EL1 into @ctxt. The per-CPU debug_brps/debug_wrps values select
 * the highest register index that save_debug() touches.
 */
static void __debug_save_state(struct kvm_guest_debug_arch *dbg,
                               struct kvm_cpu_context *ctxt)
{
        const int nr_brps = *host_data_ptr(debug_brps);
        const int nr_wrps = *host_data_ptr(debug_wrps);

        /* Breakpoint control and value registers */
        save_debug(dbg->dbg_bcr, dbgbcr, nr_brps);
        save_debug(dbg->dbg_bvr, dbgbvr, nr_brps);

        /* Watchpoint control and value registers */
        save_debug(dbg->dbg_wcr, dbgwcr, nr_wrps);
        save_debug(dbg->dbg_wvr, dbgwvr, nr_wrps);

        ctxt_sys_reg(ctxt, MDCCINT_EL1) = read_sysreg(mdccint_el1);
}

/*
 * Write the breakpoint/watchpoint values held in @dbg back to the hardware
 * registers and reload MDCCINT_EL1 from @ctxt. The per-CPU
 * debug_brps/debug_wrps values select the highest register index that
 * restore_debug() touches.
 */
static void __debug_restore_state(struct kvm_guest_debug_arch *dbg,
                                  struct kvm_cpu_context *ctxt)
{
        const int nr_brps = *host_data_ptr(debug_brps);
        const int nr_wrps = *host_data_ptr(debug_wrps);

        /* Breakpoint control and value registers */
        restore_debug(dbg->dbg_bcr, dbgbcr, nr_brps);
        restore_debug(dbg->dbg_bvr, dbgbvr, nr_brps);

        /* Watchpoint control and value registers */
        restore_debug(dbg->dbg_wcr, dbgwcr, nr_wrps);
        restore_debug(dbg->dbg_wvr, dbgwvr, nr_wrps);

        write_sysreg(ctxt_sys_reg(ctxt, MDCCINT_EL1), mdccint_el1);
}

static inline void __debug_switch_to_guest_common(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
        struct kvm_guest_debug_arch *host_dbg;
        struct kvm_guest_debug_arch *guest_dbg;

        if (!kvm_debug_regs_in_use(vcpu))
                return;

        host_ctxt = host_data_ptr(host_ctxt);
        guest_ctxt = &vcpu->arch.ctxt;
        host_dbg = host_data_ptr(host_debug_state.regs);
        guest_dbg = __vcpu_debug_regs(vcpu);

        __debug_save_state(host_dbg, host_ctxt);
        __debug_restore_state(guest_dbg, guest_ctxt);
}

static inline void __debug_switch_to_host_common(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *host_ctxt;
        struct kvm_cpu_context *guest_ctxt;
        struct kvm_guest_debug_arch *host_dbg;
        struct kvm_guest_debug_arch *guest_dbg;

        if (!kvm_debug_regs_in_use(vcpu))
                return;

        host_ctxt = host_data_ptr(host_ctxt);
        guest_ctxt = &vcpu->arch.ctxt;
        host_dbg = host_data_ptr(host_debug_state.regs);
        guest_dbg = __vcpu_debug_regs(vcpu);

        __debug_save_state(guest_dbg, guest_ctxt);
        __debug_restore_state(host_dbg, host_ctxt);
}

#endif /* __ARM64_KVM_HYP_DEBUG_SR_H__ */





















































































































    1 





































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-3-Clause)
/* Copyright (C) 2016-2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * SipHash: a fast short-input PRF
 * https://131002.net/siphash/
 *
 * This implementation is specifically for SipHash2-4 for a secure PRF
 * and HalfSipHash1-3/SipHash1-3 for an insecure PRF only suitable for
 * hashtables.
 */

#include <linux/siphash.h>
#include <linux/unaligned.h>

#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
#include <linux/dcache.h>
#include <asm/word-at-a-time.h>
#endif

/* One SipHash permutation round over the local state words v0..v3. */
#define SIPROUND SIPHASH_PERMUTATION(v0, v1, v2, v3)

/*
 * Opens a hash function body: declares the state words v0..v3, seeded from
 * the SIPHASH_CONST_* values and XORed with the two words of 'key', and the
 * final block 'b' with the message length placed in its top byte.
 * Expects a 'key' pointer to be in scope. Because it declares locals, any
 * other local of the calling function has to be declared before it.
 */
#define PREAMBLE(len) \
        u64 v0 = SIPHASH_CONST_0; \
        u64 v1 = SIPHASH_CONST_1; \
        u64 v2 = SIPHASH_CONST_2; \
        u64 v3 = SIPHASH_CONST_3; \
        u64 b = ((u64)(len)) << 56; \
        v3 ^= key->key[1]; \
        v2 ^= key->key[0]; \
        v1 ^= key->key[1]; \
        v0 ^= key->key[0];

/*
 * Closes a hash function body: absorbs the final block 'b' with two rounds,
 * XORs 0xff into v2, runs four finalization rounds and returns the XOR of
 * the four state words. Contains the function's return statement.
 */
#define POSTAMBLE \
        v3 ^= b; \
        SIPROUND; \
        SIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        SIPROUND; \
        SIPROUND; \
        SIPROUND; \
        SIPROUND; \
        return (v0 ^ v1) ^ (v2 ^ v3);

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * SipHash of an arbitrary-length buffer, fetching each full 64-bit word with
 * le64_to_cpup(). Only built when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * not set; __siphash_unaligned() is the variant using get_unaligned_le64().
 */
u64 __siphash_aligned(const void *data, size_t len, const siphash_key_t *key)
{
        const u8 rem = len & (sizeof(u64) - 1);
        const u8 *tail = data + (len - rem);
        u64 word;
        PREAMBLE(len)
        /* Absorb every complete 8-byte word */
        while (data != tail) {
                word = le64_to_cpup(data);
                v3 ^= word;
                SIPROUND;
                SIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
        /* Fold the 0..7 trailing bytes into the final block */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (rem)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(rem)));
#else
        switch (rem) {
        case 7:
                b |= ((u64)tail[6]) << 48;
                fallthrough;
        case 6:
                b |= ((u64)tail[5]) << 40;
                fallthrough;
        case 5:
                b |= ((u64)tail[4]) << 32;
                fallthrough;
        case 4:
                b |= le32_to_cpup(data);
                break;
        case 3:
                b |= ((u64)tail[2]) << 16;
                fallthrough;
        case 2:
                b |= le16_to_cpup(data);
                break;
        case 1:
                b |= tail[0];
        }
#endif
        POSTAMBLE
}
EXPORT_SYMBOL(__siphash_aligned);
#endif

/*
 * SipHash of an arbitrary-length buffer, fetching each full 64-bit word
 * and the trailing bytes through the get_unaligned_le*() helpers.
 */
u64 __siphash_unaligned(const void *data, size_t len, const siphash_key_t *key)
{
        const u8 rem = len & (sizeof(u64) - 1);
        const u8 *tail = data + (len - rem);
        u64 word;
        PREAMBLE(len)
        /* Absorb every complete 8-byte word */
        while (data != tail) {
                word = get_unaligned_le64(data);
                v3 ^= word;
                SIPROUND;
                SIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
        /* Fold the 0..7 trailing bytes into the final block */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (rem)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(rem)));
#else
        switch (rem) {
        case 7:
                b |= ((u64)tail[6]) << 48;
                fallthrough;
        case 6:
                b |= ((u64)tail[5]) << 40;
                fallthrough;
        case 5:
                b |= ((u64)tail[4]) << 32;
                fallthrough;
        case 4:
                b |= get_unaligned_le32(tail);
                break;
        case 3:
                b |= ((u64)tail[2]) << 16;
                fallthrough;
        case 2:
                b |= get_unaligned_le16(tail);
                break;
        case 1:
                b |= tail[0];
        }
#endif
        POSTAMBLE
}
EXPORT_SYMBOL(__siphash_unaligned);

/**
 * siphash_1u64 - compute 64-bit siphash PRF value of a u64
 * @first: first u64
 * @key: the siphash key
 *
 * Return: the siphash value of the one-word message @first under @key.
 */
u64 siphash_1u64(const u64 first, const siphash_key_t *key)
{
        PREAMBLE(sizeof(first))
        /* Absorb the single message word */
        v3 = v3 ^ first;
        SIPROUND;
        SIPROUND;
        v0 = v0 ^ first;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_1u64);

/**
 * siphash_2u64 - compute 64-bit siphash PRF value of 2 u64
 * @first: first u64
 * @second: second u64
 * @key: the siphash key
 */
u64 siphash_2u64(const u64 first, const u64 second, const siphash_key_t *key)
{
        PREAMBLE(16)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_2u64);

/**
 * siphash_3u64 - compute 64-bit siphash PRF value of 3 u64
 * @first: first u64
 * @second: second u64
 * @third: third u64
 * @key: the siphash key
 */
u64 siphash_3u64(const u64 first, const u64 second, const u64 third,
                 const siphash_key_t *key)
{
        PREAMBLE(24)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        v3 ^= third;
        SIPROUND;
        SIPROUND;
        v0 ^= third;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_3u64);

/**
 * siphash_4u64 - compute 64-bit siphash PRF value of 4 u64
 * @first: first u64
 * @second: second u64
 * @third: third u64
 * @forth: forth u64
 * @key: the siphash key
 */
u64 siphash_4u64(const u64 first, const u64 second, const u64 third,
                 const u64 forth, const siphash_key_t *key)
{
        PREAMBLE(32)
        v3 ^= first;
        SIPROUND;
        SIPROUND;
        v0 ^= first;
        v3 ^= second;
        SIPROUND;
        SIPROUND;
        v0 ^= second;
        v3 ^= third;
        SIPROUND;
        SIPROUND;
        v0 ^= third;
        v3 ^= forth;
        SIPROUND;
        SIPROUND;
        v0 ^= forth;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_4u64);

/**
 * siphash_1u32 - compute 64-bit siphash PRF value of a u32
 * @first: first u32
 * @key: the siphash key
 *
 * The message is shorter than one 64-bit word, so @first goes straight
 * into the final block next to the length byte.
 *
 * Return: the siphash value of the 4-byte message @first under @key.
 */
u64 siphash_1u32(const u32 first, const siphash_key_t *key)
{
        PREAMBLE(sizeof(first))
        b = b | first;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_1u32);

u64 siphash_3u32(const u32 first, const u32 second, const u32 third,
                 const siphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        PREAMBLE(12)
        v3 ^= combined;
        SIPROUND;
        SIPROUND;
        v0 ^= combined;
        b |= third;
        POSTAMBLE
}
EXPORT_SYMBOL(siphash_3u32);

#if BITS_PER_LONG == 64
/* Note that on 64-bit, we make HalfSipHash1-3 actually be SipHash1-3, for
 * performance reasons. On 32-bit, below, we actually implement HalfSipHash1-3.
 *
 * HSIPROUND and HPREAMBLE are therefore plain aliases of SIPROUND and
 * PREAMBLE here. HPOSTAMBLE differs from POSTAMBLE only in its round counts:
 * one round to absorb the final block 'b' and three finalization rounds.
 * It contains the function's return statement.
 */

#define HSIPROUND SIPROUND
#define HPREAMBLE(len) PREAMBLE(len)
#define HPOSTAMBLE \
        v3 ^= b; \
        HSIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        HSIPROUND; \
        HSIPROUND; \
        HSIPROUND; \
        return (v0 ^ v1) ^ (v2 ^ v3);

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * 64-bit build of hsiphash over an arbitrary-length buffer: 64-bit words
 * are fetched with le64_to_cpup() and absorbed with one round each. Only
 * built when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is not set.
 */
u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
{
        const u8 rem = len & (sizeof(u64) - 1);
        const u8 *tail = data + (len - rem);
        u64 word;
        HPREAMBLE(len)
        /* Absorb every complete 8-byte word */
        while (data != tail) {
                word = le64_to_cpup(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
        /* Fold the 0..7 trailing bytes into the final block */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (rem)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(rem)));
#else
        switch (rem) {
        case 7:
                b |= ((u64)tail[6]) << 48;
                fallthrough;
        case 6:
                b |= ((u64)tail[5]) << 40;
                fallthrough;
        case 5:
                b |= ((u64)tail[4]) << 32;
                fallthrough;
        case 4:
                b |= le32_to_cpup(data);
                break;
        case 3:
                b |= ((u64)tail[2]) << 16;
                fallthrough;
        case 2:
                b |= le16_to_cpup(data);
                break;
        case 1:
                b |= tail[0];
        }
#endif
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_aligned);
#endif

/*
 * 64-bit build of hsiphash over an arbitrary-length buffer: 64-bit words
 * and the trailing bytes are fetched through the get_unaligned_le*()
 * helpers, and each word is absorbed with one round.
 */
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key)
{
        const u8 rem = len & (sizeof(u64) - 1);
        const u8 *tail = data + (len - rem);
        u64 word;
        HPREAMBLE(len)
        /* Absorb every complete 8-byte word */
        while (data != tail) {
                word = get_unaligned_le64(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u64);
        }
        /* Fold the 0..7 trailing bytes into the final block */
#if defined(CONFIG_DCACHE_WORD_ACCESS) && BITS_PER_LONG == 64
        if (rem)
                b |= le64_to_cpu((__force __le64)(load_unaligned_zeropad(data) &
                                                  bytemask_from_count(rem)));
#else
        switch (rem) {
        case 7:
                b |= ((u64)tail[6]) << 48;
                fallthrough;
        case 6:
                b |= ((u64)tail[5]) << 40;
                fallthrough;
        case 5:
                b |= ((u64)tail[4]) << 32;
                fallthrough;
        case 4:
                b |= get_unaligned_le32(tail);
                break;
        case 3:
                b |= ((u64)tail[2]) << 16;
                fallthrough;
        case 2:
                b |= get_unaligned_le16(tail);
                break;
        case 1:
                b |= tail[0];
        }
#endif
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_unaligned);

/**
 * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
 * @first: first u32
 * @key: the hsiphash key
 *
 * The message is shorter than one 64-bit word, so @first goes straight
 * into the final block next to the length byte.
 *
 * Return: the hsiphash value of the 4-byte message @first under @key.
 */
u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
{
        HPREAMBLE(sizeof(first))
        b = b | first;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_1u32);

/**
 * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
 * @first: first u32
 * @second: second u32
 * @key: the hsiphash key
 */
u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        HPREAMBLE(8)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_2u32);

/**
 * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the hsiphash key
 */
u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
                  const hsiphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        HPREAMBLE(12)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        b |= third;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_3u32);

/**
 * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @forth: forth u32
 * @key: the hsiphash key
 */
u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
                  const u32 forth, const hsiphash_key_t *key)
{
        u64 combined = (u64)second << 32 | first;
        HPREAMBLE(16)
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        combined = (u64)forth << 32 | third;
        v3 ^= combined;
        HSIPROUND;
        v0 ^= combined;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_4u32);
#else
/*
 * 32-bit build: HalfSipHash proper, working on 32-bit state words.
 *
 * HSIPROUND is one HSIPHASH_PERMUTATION round over the local state words
 * v0..v3.
 */
#define HSIPROUND HSIPHASH_PERMUTATION(v0, v1, v2, v3)

/*
 * Opens a hash function body: declares the 32-bit state words v0..v3,
 * seeded from the HSIPHASH_CONST_* values and XORed with the two words of
 * 'key', and the final block 'b' with the message length placed in its top
 * byte. Expects a 'key' pointer to be in scope.
 */
#define HPREAMBLE(len) \
        u32 v0 = HSIPHASH_CONST_0; \
        u32 v1 = HSIPHASH_CONST_1; \
        u32 v2 = HSIPHASH_CONST_2; \
        u32 v3 = HSIPHASH_CONST_3; \
        u32 b = ((u32)(len)) << 24; \
        v3 ^= key->key[1]; \
        v2 ^= key->key[0]; \
        v1 ^= key->key[1]; \
        v0 ^= key->key[0];

/*
 * Closes a hash function body: absorbs the final block 'b' with one round,
 * XORs 0xff into v2, runs three finalization rounds and returns v1 ^ v3.
 * Contains the function's return statement.
 */
#define HPOSTAMBLE \
        v3 ^= b; \
        HSIPROUND; \
        v0 ^= b; \
        v2 ^= 0xff; \
        HSIPROUND; \
        HSIPROUND; \
        HSIPROUND; \
        return v1 ^ v3;

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/*
 * 32-bit build of hsiphash over an arbitrary-length buffer: 32-bit words
 * are fetched with le32_to_cpup() and absorbed with one round each. Only
 * built when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is not set.
 */
u32 __hsiphash_aligned(const void *data, size_t len, const hsiphash_key_t *key)
{
        const u8 rem = len & (sizeof(u32) - 1);
        const u8 *tail = data + (len - rem);
        u32 word;
        HPREAMBLE(len)
        /* Absorb every complete 4-byte word */
        while (data != tail) {
                word = le32_to_cpup(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u32);
        }
        /* Fold the 0..3 trailing bytes into the final block */
        switch (rem) {
        case 3:
                b |= ((u32)tail[2]) << 16;
                fallthrough;
        case 2:
                b |= le16_to_cpup(data);
                break;
        case 1:
                b |= tail[0];
        }
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_aligned);
#endif

/*
 * 32-bit build of hsiphash over an arbitrary-length buffer: 32-bit words
 * and the trailing bytes are fetched through the get_unaligned_le*()
 * helpers, and each word is absorbed with one round.
 */
u32 __hsiphash_unaligned(const void *data, size_t len,
                         const hsiphash_key_t *key)
{
        const u8 rem = len & (sizeof(u32) - 1);
        const u8 *tail = data + (len - rem);
        u32 word;
        HPREAMBLE(len)
        /* Absorb every complete 4-byte word */
        while (data != tail) {
                word = get_unaligned_le32(data);
                v3 ^= word;
                HSIPROUND;
                v0 ^= word;
                data += sizeof(u32);
        }
        /* Fold the 0..3 trailing bytes into the final block */
        switch (rem) {
        case 3:
                b |= ((u32)tail[2]) << 16;
                fallthrough;
        case 2:
                b |= get_unaligned_le16(tail);
                break;
        case 1:
                b |= tail[0];
        }
        HPOSTAMBLE
}
EXPORT_SYMBOL(__hsiphash_unaligned);

/**
 * hsiphash_1u32 - compute 32-bit hsiphash PRF value of a u32
 * @first: first u32
 * @key: the hsiphash key
 *
 * Return: the hsiphash value of the one-word message @first under @key.
 */
u32 hsiphash_1u32(const u32 first, const hsiphash_key_t *key)
{
        HPREAMBLE(sizeof(first))
        /* Absorb the single message word */
        v3 = v3 ^ first;
        HSIPROUND;
        v0 = v0 ^ first;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_1u32);

/**
 * hsiphash_2u32 - compute 32-bit hsiphash PRF value of 2 u32
 * @first: first u32
 * @second: second u32
 * @key: the hsiphash key
 */
u32 hsiphash_2u32(const u32 first, const u32 second, const hsiphash_key_t *key)
{
        HPREAMBLE(8)
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_2u32);

/**
 * hsiphash_3u32 - compute 32-bit hsiphash PRF value of 3 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @key: the hsiphash key
 */
u32 hsiphash_3u32(const u32 first, const u32 second, const u32 third,
                  const hsiphash_key_t *key)
{
        HPREAMBLE(12)
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        v3 ^= third;
        HSIPROUND;
        v0 ^= third;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_3u32);

/**
 * hsiphash_4u32 - compute 32-bit hsiphash PRF value of 4 u32
 * @first: first u32
 * @second: second u32
 * @third: third u32
 * @forth: forth u32
 * @key: the hsiphash key
 */
u32 hsiphash_4u32(const u32 first, const u32 second, const u32 third,
                  const u32 forth, const hsiphash_key_t *key)
{
        HPREAMBLE(16)
        v3 ^= first;
        HSIPROUND;
        v0 ^= first;
        v3 ^= second;
        HSIPROUND;
        v0 ^= second;
        v3 ^= third;
        HSIPROUND;
        v0 ^= third;
        v3 ^= forth;
        HSIPROUND;
        v0 ^= forth;
        HPOSTAMBLE
}
EXPORT_SYMBOL(hsiphash_4u32);
#endif











































































































































































































































   34 







   34 
















   34 




   34 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/kernel/traps.c
 *
 * Copyright (C) 1995-2009 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/bug.h>
#include <linux/context_tracking.h>
#include <linux/signal.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
#include <linux/kdebug.h>
#include <linux/module.h>
#include <linux/kexec.h>
#include <linux/delay.h>
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/sizes.h>
#include <linux/syscalls.h>
#include <linux/mm_types.h>
#include <linux/kasan.h>
#include <linux/ubsan.h>
#include <linux/cfi.h>

#include <asm/atomic.h>
#include <asm/bug.h>
#include <asm/cpufeature.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/efi.h>
#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/extable.h>
#include <asm/insn.h>
#include <asm/kprobes.h>
#include <asm/text-patching.h>
#include <asm/traps.h>
#include <asm/smp.h>
#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>
#include <asm/system_misc.h>
#include <asm/sysreg.h>

/* "eq": passes when the Z flag is set in @pstate. */
static bool __kprobes __check_eq(unsigned long pstate)
{
        return !!(pstate & PSR_Z_BIT);
}

/* "ne": passes when the Z flag is clear in @pstate. */
static bool __kprobes __check_ne(unsigned long pstate)
{
        return !(pstate & PSR_Z_BIT);
}

/* "cs": passes when the C flag is set in @pstate. */
static bool __kprobes __check_cs(unsigned long pstate)
{
        return !!(pstate & PSR_C_BIT);
}

/* "cc": passes when the C flag is clear in @pstate. */
static bool __kprobes __check_cc(unsigned long pstate)
{
        return !(pstate & PSR_C_BIT);
}

/* "mi": passes when the N flag is set in @pstate. */
static bool __kprobes __check_mi(unsigned long pstate)
{
        return !!(pstate & PSR_N_BIT);
}

/* "pl": passes when the N flag is clear in @pstate. */
static bool __kprobes __check_pl(unsigned long pstate)
{
        return !(pstate & PSR_N_BIT);
}

/* "vs": passes when the V flag is set in @pstate. */
static bool __kprobes __check_vs(unsigned long pstate)
{
        return !!(pstate & PSR_V_BIT);
}

/* "vc": passes when the V flag is clear in @pstate. */
static bool __kprobes __check_vc(unsigned long pstate)
{
        return !(pstate & PSR_V_BIT);
}

/* "hi": passes when C is set and Z is clear. */
static bool __kprobes __check_hi(unsigned long pstate)
{
        /* Shifting right by one lines Z up with C, giving C & ~Z at C. */
        unsigned long c_and_not_z = pstate & ~(pstate >> 1);

        return !!(c_and_not_z & PSR_C_BIT);
}

/* "ls": the complement of "hi" -- passes unless C is set with Z clear. */
static bool __kprobes __check_ls(unsigned long pstate)
{
        /* Shifting right by one lines Z up with C, giving C & ~Z at C. */
        unsigned long c_and_not_z = pstate & ~(pstate >> 1);

        return !(c_and_not_z & PSR_C_BIT);
}

/* "ge": passes when N equals V. */
static bool __kprobes __check_ge(unsigned long pstate)
{
        /* Shifting left by three lines V up with N, giving N ^ V at N. */
        unsigned long n_xor_v = pstate ^ (pstate << 3);

        return !(n_xor_v & PSR_N_BIT);
}

/* "lt": passes when N differs from V. */
static bool __kprobes __check_lt(unsigned long pstate)
{
        /* Shifting left by three lines V up with N, giving N ^ V at N. */
        unsigned long n_xor_v = pstate ^ (pstate << 3);

        return !!(n_xor_v & PSR_N_BIT);
}

/* "gt": passes when Z is clear and N equals V. */
static bool __kprobes __check_gt(unsigned long pstate)
{
        unsigned long n_xor_v = pstate ^ (pstate << 3);        /* N ^ V at N */
        unsigned long z_on_n = pstate << 1;                /* Z moved onto N */

        return !((n_xor_v | z_on_n) & PSR_N_BIT);
}

/* "le": passes when Z is set or N differs from V. */
static bool __kprobes __check_le(unsigned long pstate)
{
        unsigned long n_xor_v = pstate ^ (pstate << 3);        /* N ^ V at N */
        unsigned long z_on_n = pstate << 1;                /* Z moved onto N */

        return !!((n_xor_v | z_on_n) & PSR_N_BIT);
}

/* "al": always passes; @pstate is not consulted. */
static bool __kprobes __check_al(unsigned long pstate)
{
        return true;
}

/*
 * Condition-check dispatch table, indexed by the 4-bit AArch32 condition
 * code.
 *
 * Note that the ARMv8 ARM calls condition code 0b1111 "nv", but states that
 * it behaves identically to 0b1110 ("al").
 */
pstate_check_t * const aarch32_opcode_cond_checks[16] = {
        [0x0] = __check_eq,
        [0x1] = __check_ne,
        [0x2] = __check_cs,
        [0x3] = __check_cc,
        [0x4] = __check_mi,
        [0x5] = __check_pl,
        [0x6] = __check_vs,
        [0x7] = __check_vc,
        [0x8] = __check_hi,
        [0x9] = __check_ls,
        [0xa] = __check_ge,
        [0xb] = __check_lt,
        [0xc] = __check_gt,
        [0xd] = __check_le,
        [0xe] = __check_al,
        [0xf] = __check_al,        /* "nv" */
};

/* When zero (the default), arm64_show_signal() logs nothing. */
int show_unhandled_signals;

static void dump_kernel_instr(const char *lvl, struct pt_regs *regs)
{
        unsigned long addr = instruction_pointer(regs);
        char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
        int i;

        if (user_mode(regs))
                return;

        for (i = -4; i < 1; i++) {
                unsigned int val, bad;

                bad = aarch64_insn_read(&((u32 *)addr)[i], &val);

                if (!bad)
                        p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val);
                else
                        p += sprintf(p, i == 0 ? "(????????) " : "???????? ");
        }

        printk("%sCode: %s\n", lvl, str);
}

#define S_SMP " SMP"

/*
 * Emit the "Internal error" banner, run the DIE_OOPS notifier chain and,
 * unless a notifier returned NOTIFY_STOP, dump modules, registers and the
 * code around the PC. Returns the notifier chain's result.
 */
static int __die(const char *str, long err, struct pt_regs *regs)
{
        static int die_counter;
        int verdict;

        pr_emerg("Internal error: %s: %016lx [#%d] " S_SMP "\n",
                 str, err, ++die_counter);

        /* trap and error numbers are mostly meaningless on ARM */
        verdict = notify_die(DIE_OOPS, str, regs, err, 0, SIGSEGV);
        if (verdict != NOTIFY_STOP) {
                print_modules();
                show_regs(regs);
                dump_kernel_instr(KERN_EMERG, regs);
        }

        return verdict;
}

/* Serialises die() so that concurrent oopses do not interleave. */
static DEFINE_RAW_SPINLOCK(die_lock);

/*
 * Report a fatal kernel error described by @str with register state @regs
 * and error code @err.
 *
 * This function is protected against re-entrancy: the whole report runs
 * under die_lock with interrupts disabled. After reporting, it panics if
 * called in interrupt context or if panic_on_oops is set; otherwise the
 * current task is killed unless a die notifier returned NOTIFY_STOP.
 */
void die(const char *str, struct pt_regs *regs, long err)
{
        int ret;
        unsigned long flags;

        raw_spin_lock_irqsave(&die_lock, flags);

        oops_enter();

        console_verbose();
        bust_spinlocks(1);
        ret = __die(str, err, regs);

        /* Hand over to the crash kernel if one should take this oops. */
        if (regs && kexec_should_crash(current))
                crash_kexec(regs);

        bust_spinlocks(0);
        add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
        oops_exit();

        if (in_interrupt())
                panic("%s: Fatal exception in interrupt", str);
        if (panic_on_oops)
                panic("%s: Fatal exception", str);

        raw_spin_unlock_irqrestore(&die_lock, flags);

        /* A notifier that returned NOTIFY_STOP has claimed the event. */
        if (ret != NOTIFY_STOP)
                make_task_dead(SIGSEGV);
}

static void arm64_show_signal(int signo, const char *str)
{
        static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
        struct task_struct *tsk = current;
        unsigned long esr = tsk->thread.fault_code;
        struct pt_regs *regs = task_pt_regs(tsk);

        /* Leave if the signal won't be shown */
        if (!show_unhandled_signals ||
            !unhandled_signal(tsk, signo) ||
            !__ratelimit(&rs))
                return;

        pr_info("%s[%d]: unhandled exception: ", tsk->comm, task_pid_nr(tsk));
        if (esr)
                pr_cont("%s, ESR 0x%016lx, ", esr_get_class_string(esr), esr);

        pr_cont("%s", str);
        print_vma_addr(KERN_CONT " in ", regs->pc);
        pr_cont("\n");
        __show_regs(regs);
}

/*
 * Log (if enabled) and force signal @signo with si_code @code and fault
 * address @far on the current task. SIGKILL is forced directly, without
 * fault information.
 */
void arm64_force_sig_fault(int signo, int code, unsigned long far,
                           const char *str)
{
        arm64_show_signal(signo, str);

        if (signo != SIGKILL) {
                force_sig_fault(signo, code, (void __user *)far);
                return;
        }

        force_sig(SIGKILL);
}

void arm64_force_sig_fault_pkey(unsigned long far, const char *str, int pkey)
{
        arm64_show_signal(SIGSEGV, str);
        force_sig_pkuerr((void __user *)far, pkey);
}

void arm64_force_sig_mceerr(int code, unsigned long far, short lsb,
                            const char *str)
{
        arm64_show_signal(SIGBUS, str);
        force_sig_mceerr(code, (void __user *)far, lsb);
}

void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far,
                                       const char *str)
{
        arm64_show_signal(SIGTRAP, str);
        force_sig_ptrace_errno_trap(errno, (void __user *)far);
}

/*
 * Report a fault according to where it came from: a kernel-mode fault goes
 * to die(); a user-mode fault records @err as the thread's fault code
 * (clearing the fault address) and delivers signal @signo / @sicode at @far.
 */
void arm64_notify_die(const char *str, struct pt_regs *regs,
                      int signo, int sicode, unsigned long far,
                      unsigned long err)
{
        if (!user_mode(regs)) {
                die(str, regs, err);
                return;
        }

        WARN_ON(regs != current_pt_regs());
        current->thread.fault_address = 0;
        current->thread.fault_code = err;

        arm64_force_sig_fault(signo, sicode, far, str);
}

#ifdef CONFIG_COMPAT
#define PSTATE_IT_1_0_SHIFT        25
#define PSTATE_IT_1_0_MASK        (0x3 << PSTATE_IT_1_0_SHIFT)
#define PSTATE_IT_7_2_SHIFT        10
#define PSTATE_IT_7_2_MASK        (0x3f << PSTATE_IT_7_2_SHIFT)

/*
 * Reassemble the 8-bit IT state from its two PSTATE fields: IT[1:0] and
 * IT[7:2].
 */
static u32 compat_get_it_state(struct pt_regs *regs)
{
        u32 pstate = regs->pstate;
        u32 it_1_0 = (pstate & PSTATE_IT_1_0_MASK) >> PSTATE_IT_1_0_SHIFT;
        u32 it_7_2 = (pstate & PSTATE_IT_7_2_MASK) >> PSTATE_IT_7_2_SHIFT;

        return it_1_0 | (it_7_2 << 2);
}

/*
 * Scatter the 8-bit IT state @it back into the two PSTATE fields, replacing
 * whatever IT bits were there before.
 */
static void compat_set_it_state(struct pt_regs *regs, u32 it)
{
        u32 it_1_0 = (it << PSTATE_IT_1_0_SHIFT) & PSTATE_IT_1_0_MASK;
        u32 it_7_2 = ((it >> 2) << PSTATE_IT_7_2_SHIFT) & PSTATE_IT_7_2_MASK;
        u32 pstate_it = it_1_0 | it_7_2;

        regs->pstate &= ~PSR_AA32_IT_MASK;
        regs->pstate |= pstate_it;
}

/*
 * Step the IT state past one instruction. Nothing is done when the T bit
 * is clear (ARM mode) or when no IT bits are set.
 */
static void advance_itstate(struct pt_regs *regs)
{
        u32 it;

        /* ARM mode */
        if (!(regs->pstate & PSR_AA32_T_BIT))
                return;

        /* Not inside an IT block */
        if (!(regs->pstate & PSR_AA32_IT_MASK))
                return;

        it = compat_get_it_state(regs);

        /*
         * If this is the last instruction of the block, wipe the IT
         * state. Otherwise advance it.
         */
        it = (it & 7) ? ((it & 0xe0) | ((it << 1) & 0x1f)) : 0;

        compat_set_it_state(regs, it);
}
#else
/* !CONFIG_COMPAT variant: deliberately a no-op. */
static void advance_itstate(struct pt_regs *regs)
{
}
#endif

/*
 * Move the PC in @regs forward by @size bytes and fix up the state that
 * depends on it: single-step for user tasks, then either the IT state
 * (compat tasks) or the BTYPE bits (everything else).
 */
void arm64_skip_faulting_instruction(struct pt_regs *regs, unsigned long size)
{
        regs->pc += size;

        /*
         * If we were single stepping, we want to get the step exception after
         * we return from the trap.
         */
        if (user_mode(regs))
                user_fastforward_single_step(current);

        if (!compat_user_mode(regs)) {
                regs->pstate &= ~PSR_BTYPE_MASK;
                return;
        }

        advance_itstate(regs);
}

/*
 * Fetch the user instruction at the PC in @regs into *@insnp.
 *
 * In compat Thumb mode one 16-bit halfword is read, plus a second one when
 * the first marks a wide instruction (first halfword ends up in the upper
 * 16 bits). Otherwise a single 32-bit word is read. All reads are
 * little-endian.
 *
 * Returns 0 on success or -EFAULT if user memory could not be read, in
 * which case *@insnp is left untouched.
 */
static int user_insn_read(struct pt_regs *regs, u32 *insnp)
{
        unsigned long pc = instruction_pointer(regs);
        u32 insn;

        if (!compat_thumb_mode(regs)) {
                /* 32-bit ARM instruction */
                __le32 word_le;

                if (get_user(word_le, (__le32 __user *)pc))
                        return -EFAULT;
                insn = le32_to_cpu(word_le);
        } else {
                /* 16-bit Thumb instruction */
                __le16 half_le;

                if (get_user(half_le, (__le16 __user *)pc))
                        return -EFAULT;
                insn = le16_to_cpu(half_le);

                if (aarch32_insn_is_wide(insn)) {
                        u32 second_half;

                        if (get_user(half_le, (__le16 __user *)(pc + 2)))
                                return -EFAULT;
                        second_half = le16_to_cpu(half_le);
                        insn = (insn << 16) | second_half;
                }
        }

        *insnp = insn;
        return 0;
}

/*
 * Inject @signal with si_code @code at @address into the current user
 * task, recording @err as the fault code. Warns and returns if the current
 * register state is not user mode. A signal/code pair that does not use
 * the SIL_FAULT siginfo layout triggers a warning and is replaced by
 * SIGKILL.
 */
void force_signal_inject(int signal, int code, unsigned long address, unsigned long err)
{
        struct pt_regs *regs = current_pt_regs();
        const char *desc;

        if (WARN_ON(!user_mode(regs)))
                return;

        /* The description reflects the signal originally requested. */
        if (signal == SIGILL)
                desc = "undefined instruction";
        else if (signal == SIGSEGV)
                desc = "illegal memory access";
        else
                desc = "unknown or unrecoverable error";

        /* Force signals we don't understand to SIGKILL */
        if (WARN_ON(signal != SIGKILL &&
                    siginfo_layout(signal, code) != SIL_FAULT))
                signal = SIGKILL;

        arm64_notify_die(desc, regs, signal, code, address, err);
}

/*
 * Raise SIGSEGV for an access error at @addr. The si_code is SEGV_MAPERR
 * when find_vma() returns nothing for the untagged address, and SEGV_ACCERR
 * otherwise. The lookup is done under the mmap read lock.
 */
void arm64_notify_segfault(unsigned long addr)
{
        int code = SEGV_ACCERR;

        mmap_read_lock(current->mm);
        if (!find_vma(current->mm, untagged_addr(addr)))
                code = SEGV_MAPERR;
        mmap_read_unlock(current->mm);

        force_signal_inject(SIGSEGV, code, addr, 0);
}

/*
 * Handle an undefined-instruction exception taken from EL0. Tries, in
 * order, the AArch32 breakpoint handler, MRS emulation and deprecated
 * instruction emulation; if none of them claims the instruction (or it
 * cannot be read), the task receives SIGILL/ILL_ILLOPC.
 */
void do_el0_undef(struct pt_regs *regs, unsigned long esr)
{
        u32 insn;

        /* check for AArch32 breakpoint instructions */
        if (!aarch32_break_handler(regs))
                return;

        if (!user_insn_read(regs, &insn)) {
                if (try_emulate_mrs(regs, insn))
                        return;

                if (try_emulate_armv8_deprecated(regs, insn))
                        return;
        }

        force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
}

void do_el1_undef(struct pt_regs *regs, unsigned long esr)
{
        u32 insn;

        if (aarch64_insn_read((void *)regs->pc, &insn))
                goto out_err;

        if (try_emulate_el1_ssbs(regs, insn))
                return;

out_err:
        die("Oops - Undefined instruction", regs, esr);
}

/* BTI exception from EL0: deliver SIGILL/ILL_ILLOPC at the faulting PC. */
void do_el0_bti(struct pt_regs *regs)
{
        unsigned long fault_pc = regs->pc;

        force_signal_inject(SIGILL, ILL_ILLOPC, fault_pc, 0);
}

/*
 * BTI exception from EL1. If the EFI runtime fixup claims the exception,
 * clear the BTYPE bits and carry on; otherwise oops.
 */
void do_el1_bti(struct pt_regs *regs, unsigned long esr)
{
        if (!efi_runtime_fixup_exception(regs, "BTI violation")) {
                die("Oops - BTI", regs, esr);
                return;
        }

        regs->pstate &= ~PSR_BTYPE_MASK;
}

/* GCS exception from EL0: deliver SIGSEGV/SEGV_CPERR at the faulting PC. */
void do_el0_gcs(struct pt_regs *regs, unsigned long esr)
{
        unsigned long fault_pc = regs->pc;

        force_signal_inject(SIGSEGV, SEGV_CPERR, fault_pc, 0);
}

/* GCS exception from EL1: always a kernel oops. */
void do_el1_gcs(struct pt_regs *regs, unsigned long esr)
{
        static const char msg[] = "Oops - GCS";

        die(msg, regs, esr);
}

/*
 * FPAC exception from EL0: deliver SIGILL/ILL_ILLOPN at the faulting PC,
 * passing @esr through as the recorded fault code.
 */
void do_el0_fpac(struct pt_regs *regs, unsigned long esr)
{
        unsigned long fault_pc = regs->pc;

        force_signal_inject(SIGILL, ILL_ILLOPN, fault_pc, esr);
}

/* FPAC exception from EL1. */
void do_el1_fpac(struct pt_regs *regs, unsigned long esr)
{
        static const char msg[] = "Oops - FPAC";

        /*
         * Unexpected FPAC exception in the kernel: kill the task before it
         * does any more harm.
         */
        die(msg, regs, esr);
}

/*
 * MOPS exception from EL0: reset the memory-operation registers according
 * to @esr, then fast-forward any pending single step.
 */
void do_el0_mops(struct pt_regs *regs, unsigned long esr)
{
        struct user_pt_regs *uregs = &regs->user_regs;

        arm64_mops_reset_regs(uregs, esr);

        /*
         * If single stepping then finish the step before executing the
         * prologue instruction.
         */
        user_fastforward_single_step(current);
}

/*
 * MOPS exception from EL1: reset the memory-operation registers according
 * to @esr, then fast-forward any pending kernel single step.
 */
void do_el1_mops(struct pt_regs *regs, unsigned long esr)
{
        struct user_pt_regs *uregs = &regs->user_regs;

        arm64_mops_reset_regs(uregs, esr);
        kernel_fastforward_single_step(regs);
}

/*
 * Execute the cache maintenance instruction @insn on user address @address
 * on behalf of EL0.
 *
 * @res is set to -EFAULT without issuing the instruction if @address is at
 * or above TASK_SIZE_MAX. Otherwise the instruction runs with user access
 * enabled; @res is set to 0 on success, or to the error supplied by the
 * exception table fixup if the instruction faults.
 *
 * Wrapped in do { } while (0) so that the expansion is a single statement
 * and cannot capture a following "else" at the call site.
 */
#define __user_cache_maint(insn, address, res)                        \
        do {                                                        \
                if ((address) >= TASK_SIZE_MAX) {                \
                        (res) = -EFAULT;                        \
                } else {                                        \
                        uaccess_ttbr0_enable();                        \
                        asm volatile (                                \
                                "1:        " insn ", %1\n"                \
                                "        mov        %w0, #0\n"        \
                                "2:\n"                                \
                                _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)        \
                                : "=r" (res)                        \
                                : "r" (address));                \
                        uaccess_ttbr0_disable();                \
                }                                                \
        } while (0)

/*
 * Emulate a trapped EL0 cache maintenance instruction. The target address
 * is taken from register Rt and untagged before use; the operation is
 * selected by the CRm field of @esr. An unknown CRm yields SIGILL, a
 * faulting address yields a segfault on the original (tagged) address, and
 * success skips the trapped instruction.
 */
static void user_cache_maint_handler(unsigned long esr, struct pt_regs *regs)
{
        int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT;
        int rt = ESR_ELx_SYS64_ISS_RT(esr);
        unsigned long tagged_address = pt_regs_read_reg(regs, rt);
        unsigned long address = untagged_addr(tagged_address);
        int ret = 0;

        switch (crm) {
        case ESR_ELx_SYS64_ISS_CRM_DC_CVAU:        /* DC CVAU, gets promoted */
        case ESR_ELx_SYS64_ISS_CRM_DC_CVAC:        /* DC CVAC, gets promoted */
        case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC:        /* DC CIVAC */
                __user_cache_maint("dc civac", address, ret);
                break;
        case ESR_ELx_SYS64_ISS_CRM_DC_CVADP:        /* DC CVADP */
                __user_cache_maint("sys 3, c7, c13, 1", address, ret);
                break;
        case ESR_ELx_SYS64_ISS_CRM_DC_CVAP:        /* DC CVAP */
                __user_cache_maint("sys 3, c7, c12, 1", address, ret);
                break;
        case ESR_ELx_SYS64_ISS_CRM_IC_IVAU:        /* IC IVAU */
                __user_cache_maint("ic ivau", address, ret);
                break;
        default:
                force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
                return;
        }

        if (!ret) {
                arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
                return;
        }

        arm64_notify_segfault(tagged_address);
}

/*
 * Emulate a trapped EL0 read of CTR_EL0: write the user-visible value of
 * the register into Rt and skip the instruction. With the
 * ARM64_WORKAROUND_1542419 capability the DIC bit is hidden and IminLine
 * is replaced by PAGE_SHIFT - 2.
 */
static void ctr_read_handler(unsigned long esr, struct pt_regs *regs)
{
        unsigned long ctr = arm64_ftr_reg_user_value(&arm64_ftr_reg_ctrel0);
        int rt = ESR_ELx_SYS64_ISS_RT(esr);

        if (cpus_have_final_cap(ARM64_WORKAROUND_1542419)) {
                /* Hide DIC so that we can trap the unnecessary maintenance...*/
                ctr = ctr & ~BIT(CTR_EL0_DIC_SHIFT);

                /* ... and fake IminLine to reduce the number of traps. */
                ctr = ctr & ~CTR_EL0_IminLine_MASK;
                ctr = ctr | ((PAGE_SHIFT - 2) & CTR_EL0_IminLine_MASK);
        }

        pt_regs_write_reg(regs, rt, ctr);

        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
}

/*
 * Emulate a trapped EL0 counter read: write the current timer counter into
 * Rt and skip the instruction. Threads flagged TIF_TSC_SIGSEGV get SIGSEGV
 * instead.
 */
static void cntvct_read_handler(unsigned long esr, struct pt_regs *regs)
{
        int rt;

        if (test_thread_flag(TIF_TSC_SIGSEGV)) {
                force_sig(SIGSEGV);
                return;
        }

        rt = ESR_ELx_SYS64_ISS_RT(esr);
        pt_regs_write_reg(regs, rt, arch_timer_read_counter());
        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
}

/*
 * Emulate a trapped EL0 read of CNTFRQ_EL0: write the timer rate into Rt
 * and skip the instruction. Threads flagged TIF_TSC_SIGSEGV get SIGSEGV
 * instead.
 */
static void cntfrq_read_handler(unsigned long esr, struct pt_regs *regs)
{
        int rt;

        if (test_thread_flag(TIF_TSC_SIGSEGV)) {
                force_sig(SIGSEGV);
                return;
        }

        rt = ESR_ELx_SYS64_ISS_RT(esr);
        pt_regs_write_reg(regs, rt, arch_timer_get_rate());
        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
}

/*
 * Emulate a trapped EL0 MRS: decode the system register and Rt from @esr
 * and hand them to do_emulate_mrs(); a non-zero result turns into
 * SIGILL/ILL_ILLOPC.
 */
static void mrs_handler(unsigned long esr, struct pt_regs *regs)
{
        u32 rt = ESR_ELx_SYS64_ISS_RT(esr);
        u32 sysreg = esr_sys64_to_sysreg(esr);

        if (!do_emulate_mrs(regs, sysreg, rt))
                return;

        force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
}

/* Trapped EL0 WFI: do not wait, just step over the instruction. */
static void wfi_handler(unsigned long esr, struct pt_regs *regs)
{
        const unsigned long insn_len = AARCH64_INSN_SIZE;

        arm64_skip_faulting_instruction(regs, insn_len);
}

/*
 * One entry of an ESR-matched dispatch table. An entry matches when
 * (esr & esr_mask) == esr_val, in which case handler is called with the
 * ESR and the trapped register state. Tables are terminated by an entry
 * whose handler is NULL.
 */
struct sys64_hook {
        unsigned long esr_mask;
        unsigned long esr_val;
        void (*handler)(unsigned long esr, struct pt_regs *regs);
};

/*
 * Dispatch table for trapped EL0 system instructions, scanned in order by
 * do_el0_sys(); the first matching entry wins. The empty last entry
 * terminates the scan.
 */
static const struct sys64_hook sys64_hooks[] = {
        {        /* EL0 cache maintenance operations */
                .handler  = user_cache_maint_handler,
                .esr_mask = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_MASK,
                .esr_val  = ESR_ELx_SYS64_ISS_EL0_CACHE_OP_VAL,
        },
        {        /* Trap read access to CTR_EL0 */
                .handler  = ctr_read_handler,
                .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
                .esr_val  = ESR_ELx_SYS64_ISS_SYS_CTR_READ,
        },
        {        /* Trap read access to CNTVCT_EL0 */
                .handler  = cntvct_read_handler,
                .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
                .esr_val  = ESR_ELx_SYS64_ISS_SYS_CNTVCT,
        },
        {        /* Trap read access to CNTVCTSS_EL0 */
                .handler  = cntvct_read_handler,
                .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
                .esr_val  = ESR_ELx_SYS64_ISS_SYS_CNTVCTSS,
        },
        {        /* Trap read access to CNTFRQ_EL0 */
                .handler  = cntfrq_read_handler,
                .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK,
                .esr_val  = ESR_ELx_SYS64_ISS_SYS_CNTFRQ,
        },
        {        /* Trap read access to CPUID registers */
                .handler  = mrs_handler,
                .esr_mask = ESR_ELx_SYS64_ISS_SYS_MRS_OP_MASK,
                .esr_val  = ESR_ELx_SYS64_ISS_SYS_MRS_OP_VAL,
        },
        {        /* Trap WFI instructions executed in userspace */
                .handler  = wfi_handler,
                .esr_mask = ESR_ELx_WFx_MASK,
                .esr_val  = ESR_ELx_WFx_WFI_VAL,
        },
        { },        /* terminator */
};

#ifdef CONFIG_COMPAT
/*
 * Decide whether the condition attached to a trapped CP15 access passes.
 * The condition code comes from the ESR when its CV bit is set, otherwise
 * from the top four bits of the IT state; an empty IT state means the
 * instruction is unconditional.
 */
static bool cp15_cond_valid(unsigned long esr, struct pt_regs *regs)
{
        int cond;

        if (esr & ESR_ELx_CV) {
                cond = (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
        } else {
                /* Only a T32 instruction can trap without CV being set */
                u32 it = compat_get_it_state(regs);

                if (it == 0)
                        return true;

                cond = it >> 4;
        }

        return aarch32_opcode_cond_checks[cond](regs->pstate);
}

/*
 * Emulate a trapped compat CNTFRQ read: write the timer rate into the
 * register named by the Rt field and skip the 4-byte instruction.
 */
static void compat_cntfrq_read_handler(unsigned long esr, struct pt_regs *regs)
{
        unsigned long rt_field = esr & ESR_ELx_CP15_32_ISS_RT_MASK;
        int reg = rt_field >> ESR_ELx_CP15_32_ISS_RT_SHIFT;

        pt_regs_write_reg(regs, reg, arch_timer_get_rate());
        arm64_skip_faulting_instruction(regs, 4);
}

/* Dispatch table for trapped 32-bit CP15 accesses; see do_el0_cp15(). */
static const struct sys64_hook cp15_32_hooks[] = {
        {        /* CNTFRQ read */
                .handler  = compat_cntfrq_read_handler,
                .esr_mask = ESR_ELx_CP15_32_ISS_SYS_MASK,
                .esr_val  = ESR_ELx_CP15_32_ISS_SYS_CNTFRQ,
        },
        { },        /* terminator */
};

/*
 * Emulate a trapped compat 64-bit counter read: the low 32 bits of the
 * timer counter go to Rt, the high 32 bits to Rt2, and the 4-byte
 * instruction is skipped.
 */
static void compat_cntvct_read_handler(unsigned long esr, struct pt_regs *regs)
{
        u64 counter = arch_timer_read_counter();
        int lo_reg = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT;
        int hi_reg = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT;

        pt_regs_write_reg(regs, lo_reg, lower_32_bits(counter));
        pt_regs_write_reg(regs, hi_reg, upper_32_bits(counter));
        arm64_skip_faulting_instruction(regs, 4);
}

/* Dispatch table for trapped 64-bit CP15 accesses; see do_el0_cp15(). */
static const struct sys64_hook cp15_64_hooks[] = {
        {        /* CNTVCT read */
                .handler  = compat_cntvct_read_handler,
                .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK,
                .esr_val  = ESR_ELx_CP15_64_ISS_SYS_CNTVCT,
        },
        {        /* CNTVCTSS read */
                .handler  = compat_cntvct_read_handler,
                .esr_mask = ESR_ELx_CP15_64_ISS_SYS_MASK,
                .esr_val  = ESR_ELx_CP15_64_ISS_SYS_CNTVCTSS,
        },
        { },        /* terminator */
};

void do_el0_cp15(unsigned long esr, struct pt_regs *regs)
{
        const struct sys64_hook *hook, *hook_base;

        if (!cp15_cond_valid(esr, regs)) {
                /*
                 * There is no T16 variant of a CP access, so we
                 * always advance PC by 4 bytes.
                 */
                arm64_skip_faulting_instruction(regs, 4);
                return;
        }

        switch (ESR_ELx_EC(esr)) {
        case ESR_ELx_EC_CP15_32:
                hook_base = cp15_32_hooks;
                break;
        case ESR_ELx_EC_CP15_64:
                hook_base = cp15_64_hooks;
                break;
        default:
                do_el0_undef(regs, esr);
                return;
        }

        for (hook = hook_base; hook->handler; hook++)
                if ((hook->esr_mask & esr) == hook->esr_val) {
                        hook->handler(esr, regs);
                        return;
                }

        /*
         * New cp15 instructions may previously have been undefined at
         * EL0. Fall back to our usual undefined instruction handler
         * so that we handle these consistently.
         */
        do_el0_undef(regs, esr);
}
#endif

/*
 * Handle a trapped system instruction from EL0: run the first sys64_hooks
 * entry whose masked ESR matches, or fall back to the undefined
 * instruction handler when none does.
 */
void do_el0_sys(unsigned long esr, struct pt_regs *regs)
{
        const struct sys64_hook *entry = sys64_hooks;

        while (entry->handler) {
                if ((esr & entry->esr_mask) == entry->esr_val) {
                        entry->handler(esr, regs);
                        return;
                }
                entry++;
        }

        /*
         * New SYS instructions may previously have been undefined at EL0. Fall
         * back to our usual undefined instruction handler so that we handle
         * these consistently.
         */
        do_el0_undef(regs, esr);
}

/*
 * Human-readable names for ESR exception classes, indexed by EC value.
 * Every slot first defaults to "UNRECOGNIZED EC"; known classes then
 * override their own slot. The table is never modified at run time, so
 * both the strings and the pointers to them are const.
 */
static const char * const esr_class_str[] = {
        [0 ... ESR_ELx_EC_MAX]                = "UNRECOGNIZED EC",
        [ESR_ELx_EC_UNKNOWN]                = "Unknown/Uncategorized",
        [ESR_ELx_EC_WFx]                = "WFI/WFE",
        [ESR_ELx_EC_CP15_32]                = "CP15 MCR/MRC",
        [ESR_ELx_EC_CP15_64]                = "CP15 MCRR/MRRC",
        [ESR_ELx_EC_CP14_MR]                = "CP14 MCR/MRC",
        [ESR_ELx_EC_CP14_LS]                = "CP14 LDC/STC",
        [ESR_ELx_EC_FP_ASIMD]                = "ASIMD",
        [ESR_ELx_EC_CP10_ID]                = "CP10 MRC/VMRS",
        [ESR_ELx_EC_PAC]                = "PAC",
        [ESR_ELx_EC_CP14_64]                = "CP14 MCRR/MRRC",
        [ESR_ELx_EC_BTI]                = "BTI",
        [ESR_ELx_EC_ILL]                = "PSTATE.IL",
        [ESR_ELx_EC_SVC32]                = "SVC (AArch32)",
        [ESR_ELx_EC_HVC32]                = "HVC (AArch32)",
        [ESR_ELx_EC_SMC32]                = "SMC (AArch32)",
        [ESR_ELx_EC_SVC64]                = "SVC (AArch64)",
        [ESR_ELx_EC_HVC64]                = "HVC (AArch64)",
        [ESR_ELx_EC_SMC64]                = "SMC (AArch64)",
        [ESR_ELx_EC_SYS64]                = "MSR/MRS (AArch64)",
        [ESR_ELx_EC_SVE]                = "SVE",
        [ESR_ELx_EC_ERET]                = "ERET/ERETAA/ERETAB",
        [ESR_ELx_EC_FPAC]                = "FPAC",
        [ESR_ELx_EC_SME]                = "SME",
        [ESR_ELx_EC_IMP_DEF]                = "EL3 IMP DEF",
        [ESR_ELx_EC_IABT_LOW]                = "IABT (lower EL)",
        [ESR_ELx_EC_IABT_CUR]                = "IABT (current EL)",
        [ESR_ELx_EC_PC_ALIGN]                = "PC Alignment",
        [ESR_ELx_EC_DABT_LOW]                = "DABT (lower EL)",
        [ESR_ELx_EC_DABT_CUR]                = "DABT (current EL)",
        [ESR_ELx_EC_SP_ALIGN]                = "SP Alignment",
        [ESR_ELx_EC_MOPS]                = "MOPS",
        [ESR_ELx_EC_FP_EXC32]                = "FP (AArch32)",
        [ESR_ELx_EC_FP_EXC64]                = "FP (AArch64)",
        [ESR_ELx_EC_GCS]                = "Guarded Control Stack",
        [ESR_ELx_EC_SERROR]                = "SError",
        [ESR_ELx_EC_BREAKPT_LOW]        = "Breakpoint (lower EL)",
        [ESR_ELx_EC_BREAKPT_CUR]        = "Breakpoint (current EL)",
        [ESR_ELx_EC_SOFTSTP_LOW]        = "Software Step (lower EL)",
        [ESR_ELx_EC_SOFTSTP_CUR]        = "Software Step (current EL)",
        [ESR_ELx_EC_WATCHPT_LOW]        = "Watchpoint (lower EL)",
        [ESR_ELx_EC_WATCHPT_CUR]        = "Watchpoint (current EL)",
        [ESR_ELx_EC_BKPT32]                = "BKPT (AArch32)",
        [ESR_ELx_EC_VECTOR32]                = "Vector catch (AArch32)",
        [ESR_ELx_EC_BRK64]                = "BRK (AArch64)",
};

/*
 * esr_get_class_string() - look up the printable name of an exception class.
 * @esr: syndrome value; its exception class field is extracted with
 *       ESR_ELx_EC() and used as the index into esr_class_str[].
 *
 * Return: the esr_class_str[] entry for that exception class.
 */
const char *esr_get_class_string(unsigned long esr)
{
        unsigned long ec = ESR_ELx_EC(esr);

        return esr_class_str[ec];
}

/*
 * bad_el0_sync() - deal with an unexpected synchronous exception from EL0.
 * @regs:   saved register state; supplies the faulting PC.
 * @reason: not used by this function.
 * @esr:    syndrome value, recorded as the thread's fault code.
 *
 * Such exceptions are unexpected but potentially recoverable, so instead of
 * panicking, SIGILL/ILL_ILLOPC is raised at the faulting PC through
 * arm64_force_sig_fault().
 */
void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr)
{
        /* Record the fault details on the current thread first. */
        current->thread.fault_code = esr;
        current->thread.fault_address = 0;

        arm64_force_sig_fault(SIGILL, ILL_ILLOPC, instruction_pointer(regs),
                              "Bad EL0 synchronous exception");
}

#ifdef CONFIG_VMAP_STACK

/*
 * Per-CPU array of OVERFLOW_STACK_SIZE bytes (held as unsigned longs),
 * aligned to 16 bytes.  panic_bad_stack() reports its bounds as the
 * "Overflow stack".
 */
DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack)
        __aligned(16);

/*
 * panic_bad_stack() - report a kernel stack overflow and panic.
 * @regs: register state, dumped with __show_regs().
 * @esr:  syndrome value; printed together with its exception class string.
 * @far:  fault address value; printed as "FAR".
 *
 * Prints the bounds of the current task's stack, this CPU's IRQ stack and
 * this CPU's overflow stack, dumps the registers and then panics.  Does not
 * return.
 */
void __noreturn panic_bad_stack(struct pt_regs *regs, unsigned long esr, unsigned long far)
{
        unsigned long tsk_stk = (unsigned long)current->stack;
        unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
        unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);

        console_verbose();
        /* Terminate the record explicitly, like every other message below. */
        pr_emerg("Insufficient stack space to handle exception!\n");

        pr_emerg("ESR: 0x%016lx -- %s\n", esr, esr_get_class_string(esr));
        pr_emerg("FAR: 0x%016lx\n", far);

        pr_emerg("Task stack:     [0x%016lx..0x%016lx]\n",
                 tsk_stk, tsk_stk + THREAD_SIZE);
        pr_emerg("IRQ stack:      [0x%016lx..0x%016lx]\n",
                 irq_stk, irq_stk + IRQ_STACK_SIZE);
        pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n",
                 ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);

        __show_regs(regs);

        /*
         * We use nmi_panic to limit the potential for recursive overflows, and
         * to get a better stack trace.
         */
        nmi_panic(NULL, "kernel stack overflow");
        cpu_park_loop();
}
#endif

/*
 * arm64_serror_panic() - report an SError interrupt and panic.
 * @regs: register state to dump; may be NULL, in which case no registers
 *        are printed.
 * @esr:  syndrome value; printed together with its exception class string.
 *
 * Does not return.
 */
void __noreturn arm64_serror_panic(struct pt_regs *regs, unsigned long esr)
{
        console_verbose();

        pr_crit("SError Interrupt on CPU%d, code 0x%016lx -- %s\n",
                smp_processor_id(), esr, esr_get_class_string(esr));

        /* Callers are allowed to pass no register state. */
        if (regs != NULL)
                __show_regs(regs);

        nmi_panic(regs, "Asynchronous SError Interrupt");

        cpu_park_loop();
}

/*
 * arm64_is_fatal_ras_serror() - classify a RAS SError by its severity.
 * @regs: register state, forwarded to arm64_serror_panic() if needed.
 * @esr:  syndrome value the severity is decoded from.
 *
 * Return: false for corrected (CE) and restartable (UEO) errors, true for
 * unrecoverable (UEU) and recoverable (UER) errors.  For uncontainable (UC)
 * or any other severity value this function panics and does not return.
 */
bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr)
{
        unsigned long severity = arm64_ras_serror_get_severity(esr);

        /*
         * Corrected error, or restartable and not yet consumed: the CPU
         * can make progress.  We may take UEO again as a more severe error.
         */
        if (severity == ESR_ELx_AET_CE || severity == ESR_ELx_AET_UEO)
                return false;

        /*
         * Uncorrected Unrecoverable / Uncorrected Recoverable: the CPU
         * can't make progress, and the exception may have been imprecise.
         *
         * Neoverse-N1 #1349291 means a non-KVM SError reported as
         * Unrecoverable should be treated as Uncontainable. We
         * call arm64_serror_panic() in both cases.
         */
        if (severity == ESR_ELx_AET_UEU || severity == ESR_ELx_AET_UER)
                return true;

        /*
         * ESR_ELx_AET_UC (Uncontainable or Uncategorized error) and every
         * unlisted value: the error has been silently propagated.
         * arm64_serror_panic() is __noreturn, so nothing follows it.
         */
        arm64_serror_panic(regs, esr);
}

/*
 * do_serror() - handle an SError interrupt.
 * @regs: register state at the time of the SError.
 * @esr:  syndrome value describing the SError.
 *
 * Returns normally only for a RAS SError that is not classified as fatal;
 * everything else ends in arm64_serror_panic().
 */
void do_serror(struct pt_regs *regs, unsigned long esr)
{
        /* Only a non-fatal RAS error can be survived. */
        if (arm64_is_ras_serror(esr) && !arm64_is_fatal_ras_serror(regs, esr))
                return;

        /* non-RAS errors are not containable */
        arm64_serror_panic(regs, esr);
}

/* GENERIC_BUG traps */
#ifdef CONFIG_GENERIC_BUG
/*
 * is_valid_bugaddr() - GENERIC_BUG hook: could @addr hold a BUG trap?
 * @addr: candidate address (not inspected).
 *
 * Return: always 1.
 */
int is_valid_bugaddr(unsigned long addr)
{
        /*
         * No inspection is needed: bug_handler() is only invoked for
         * BRK #BUG_BRK_IMM.  A spurious instance without a bug table
         * entry is rejected by report_bug() and handed back to the
         * debug-monitors code, which treats it as a fatal unexpected
         * debug exception.
         */
        return 1;
}
#endif

/*
 * bug_handler() - break hook callback for BUG()/WARN() traps.
 * @regs: register state at the BRK; regs->pc is passed to report_bug().
 * @esr:  syndrome value, forwarded to die() for a BUG.
 *
 * Return: DBG_HOOK_HANDLED after stepping over the trapping instruction,
 * or DBG_HOOK_ERROR when report_bug() does not classify the trap as a
 * BUG or a WARN.
 */
static int bug_handler(struct pt_regs *regs, unsigned long esr)
{
        switch (report_bug(regs->pc, regs)) {
        case BUG_TRAP_TYPE_WARN:
                /* Nothing more to do for a warning. */
                break;

        case BUG_TRAP_TYPE_BUG:
                die("Oops - BUG", regs, esr);
                break;

        default:
                /* Trap type is unknown or unrecognised. */
                return DBG_HOOK_ERROR;
        }

        /* The thread survived: resume after the BUG instruction. */
        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
        return DBG_HOOK_HANDLED;
}

/*
 * Break hook pairing bug_handler() with the BUG_BRK_IMM immediate;
 * registered by trap_init().
 */
static struct break_hook bug_break_hook = {
        .fn = bug_handler,
        .imm = BUG_BRK_IMM,
};

#ifdef CONFIG_CFI_CLANG
static int cfi_handler(struct pt_regs *regs, unsigned long esr)
{
        unsigned long target;
        u32 type;

        target = pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TARGET, esr));
        type = (u32)pt_regs_read_reg(regs, FIELD_GET(CFI_BRK_IMM_TYPE, esr));

        switch (report_cfi_failure(regs, regs->pc, &target, type)) {
        case BUG_TRAP_TYPE_BUG:
                die("Oops - CFI", regs, esr);
                break;

        case BUG_TRAP_TYPE_WARN:
                break;

        default:
                return DBG_HOOK_ERROR;
        }

        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
        return DBG_HOOK_HANDLED;
}

/*
 * Break hook pairing cfi_handler() with the CFI BRK immediate range
 * (base CFI_BRK_IMM_BASE, mask CFI_BRK_IMM_MASK); registered by trap_init().
 * NOTE(review): .mask presumably marks immediate bits ignored when
 * matching -- confirm against the break_hook matching code.
 */
static struct break_hook cfi_break_hook = {
        .fn = cfi_handler,
        .imm = CFI_BRK_IMM_BASE,
        .mask = CFI_BRK_IMM_MASK,
};
#endif /* CONFIG_CFI_CLANG */

/*
 * reserved_fault_handler() - break hook callback for FAULT_BRK_IMM.
 * @regs: register state at the BRK; supplies the address that is logged.
 * @esr:  syndrome value (unused).
 *
 * Logs the location of the invalid instruction and reports the trap as
 * unhandled.
 *
 * Return: always DBG_HOOK_ERROR.
 */
static int reserved_fault_handler(struct pt_regs *regs, unsigned long esr)
{
        void *where = (void *)instruction_pointer(regs);

        pr_err("%s generated an invalid instruction at %pS!\n",
                "Kernel text patching", where);

        /* There is no way to recover from this. */
        return DBG_HOOK_ERROR;
}

/*
 * Break hook pairing reserved_fault_handler() with the FAULT_BRK_IMM
 * immediate; registered by trap_init().
 */
static struct break_hook fault_break_hook = {
        .fn = reserved_fault_handler,
        .imm = FAULT_BRK_IMM,
};

#ifdef CONFIG_KASAN_SW_TAGS

/*
 * Fields that kasan_handler() decodes from the esr value of a KASAN BRK:
 * a "recover" flag, a "write" flag and, in the low four bits, the log2 of
 * the access size.
 */
#define KASAN_ESR_RECOVER        0x20
#define KASAN_ESR_WRITE        0x10
#define KASAN_ESR_SIZE_MASK        0x0f
/* Access size in bytes: 1 << (low four bits of esr). */
#define KASAN_ESR_SIZE(esr)        (1 << ((esr) & KASAN_ESR_SIZE_MASK))

/*
 * kasan_handler() - break hook callback for software tag-based KASAN traps.
 * @regs: register state at the BRK; register 0 holds the accessed address
 *        and regs->pc the reporting location.
 * @esr:  syndrome value carrying the recover flag, the write flag and the
 *        encoded access size (see the KASAN_ESR_* macros).
 *
 * Return: DBG_HOOK_HANDLED after stepping over the BRK instruction.
 */
static int kasan_handler(struct pt_regs *regs, unsigned long esr)
{
        void *fault_addr = (void *)regs->regs[0];
        size_t access_size = KASAN_ESR_SIZE(esr);
        bool is_write = esr & KASAN_ESR_WRITE;
        bool can_recover = esr & KASAN_ESR_RECOVER;
        u64 ip = regs->pc;

        kasan_report(fault_addr, access_size, is_write, ip);

        /*
         * Whether execution may continue after a detected crash is chosen
         * by the instrumentation, through the compiler's -recover flag.
         * Building without recovery produces more compact code.
         *
         * Unfortunately, disabling recovery does not work for the kernel
         * at present.  KASAN reporting is switched off in some contexts
         * (for instance while the allocator touches slab object metadata;
         * current->kasan_depth controls this).  The tool still detects all
         * of those accesses even though no report is printed for them.
         *
         * This may be fixed at some point in the future.
         */
        if (!can_recover)
                die("Oops - KASAN", regs, esr);

        /* The thread survived: resume after the brk instruction. */
        arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
        return DBG_HOOK_HANDLED;
}

/*
 * Break hook pairing kasan_handler() with the KASAN BRK immediates;
 * registered by trap_init().  early_brk64() recognises the same traps by
 * comparing the BRK comment, with the KASAN_BRK_MASK bits cleared, against
 * KASAN_BRK_IMM.
 */
static struct break_hook kasan_break_hook = {
        .fn        = kasan_handler,
        .imm        = KASAN_BRK_IMM,
        .mask        = KASAN_BRK_MASK,
};
#endif

#ifdef CONFIG_UBSAN_TRAP
/*
 * ubsan_handler() - break hook callback for UBSAN trap-mode failures.
 * @regs: register state at the BRK, forwarded to die().
 * @esr:  syndrome value; the bits selected by UBSAN_BRK_MASK are passed to
 *        report_ubsan_failure() to obtain the message handed to die().
 *
 * Return: DBG_HOOK_HANDLED.
 */
static int ubsan_handler(struct pt_regs *regs, unsigned long esr)
{
        const char *reason = report_ubsan_failure(esr & UBSAN_BRK_MASK);

        die(reason, regs, esr);
        return DBG_HOOK_HANDLED;
}

/*
 * Break hook pairing ubsan_handler() with the UBSAN BRK immediates
 * (base UBSAN_BRK_IMM, mask UBSAN_BRK_MASK); registered by trap_init().
 */
static struct break_hook ubsan_break_hook = {
        .fn        = ubsan_handler,
        .imm        = UBSAN_BRK_IMM,
        .mask        = UBSAN_BRK_MASK,
};
#endif

/*
 * early_brk64() - initial handler for AArch64 BRK exceptions.
 * @addr: unused.
 * @esr:  syndrome value used to pick the handler.
 * @regs: register state at the BRK.
 *
 * This handler is only used until debug_traps_init().  It dispatches
 * directly to the CFI, KASAN or UBSAN handler when the corresponding option
 * is enabled and @esr matches, and otherwise to bug_handler().
 *
 * Return: 0 if the chosen handler returned DBG_HOOK_HANDLED, 1 otherwise.
 */
int __init early_brk64(unsigned long addr, unsigned long esr,
                struct pt_regs *regs)
{
        int outcome;

#ifdef CONFIG_CFI_CLANG
        if (esr_is_cfi_brk(esr)) {
                outcome = cfi_handler(regs, esr);
                return outcome != DBG_HOOK_HANDLED;
        }
#endif
#ifdef CONFIG_KASAN_SW_TAGS
        if ((esr_brk_comment(esr) & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) {
                outcome = kasan_handler(regs, esr);
                return outcome != DBG_HOOK_HANDLED;
        }
#endif
#ifdef CONFIG_UBSAN_TRAP
        if (esr_is_ubsan_brk(esr)) {
                outcome = ubsan_handler(regs, esr);
                return outcome != DBG_HOOK_HANDLED;
        }
#endif
        /* Anything else is treated as a BUG()/WARN() trap. */
        outcome = bug_handler(regs, esr);
        return outcome != DBG_HOOK_HANDLED;
}

/*
 * trap_init() - register the kernel BRK hooks defined in this file.
 *
 * The BUG and reserved-fault hooks are always registered; the CFI, KASAN
 * and UBSAN hooks only when the corresponding config option is enabled.
 * debug_traps_init() is called once all hooks are registered.
 */
void __init trap_init(void)
{
        register_kernel_break_hook(&bug_break_hook);
#ifdef CONFIG_CFI_CLANG
        register_kernel_break_hook(&cfi_break_hook);
#endif
        register_kernel_break_hook(&fault_break_hook);
#ifdef CONFIG_KASAN_SW_TAGS
        register_kernel_break_hook(&kasan_break_hook);
#endif
#ifdef CONFIG_UBSAN_TRAP
        register_kernel_break_hook(&ubsan_break_hook);
#endif
        debug_traps_init();
}





































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
// SPDX-License-Identifier: GPL-2.0
/*
 * lib/smp_processor_id.c
 *
 * DEBUG_PREEMPT variant of smp_processor_id().
 */
#include <linux/export.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

/*
 * check_preemption_disabled() - shared backend of debug_smp_processor_id()
 * and __this_cpu_preempt_check().
 * @what1: first part of the offending function name, as printed.
 * @what2: second part of the offending function name, as printed.
 *
 * Reads raw_smp_processor_id() and, unless one of the checks below allows
 * the call, prints a rate-limited "BUG: using ...() in preemptible" report
 * with the caller and a stack dump.
 *
 * Return: the CPU number read on entry, whether or not a report was made.
 */
noinstr static
unsigned int check_preemption_disabled(const char *what1, const char *what2)
{
        int this_cpu = raw_smp_processor_id();

        /* Non-zero preempt count: nothing to report. */
        if (likely(preempt_count()))
                goto out;

        /* Interrupts disabled: nothing to report. */
        if (irqs_disabled())
                goto out;

        /* Per-CPU thread: nothing to report. */
        if (is_percpu_thread())
                goto out;

#ifdef CONFIG_SMP
        /* Task has migration disabled: nothing to report. */
        if (current->migration_disabled)
                goto out;
#endif

        /*
         * It is valid to assume CPU-locality during early bootup:
         */
        if (system_state < SYSTEM_SCHEDULING)
                goto out;

        /*
         * Avoid recursion:
         */
        preempt_disable_notrace();

        instrumentation_begin();
        if (!printk_ratelimit())
                goto out_enable;

        /*
         * NOTE(review): the "- 1" presumably compensates for the
         * preempt_disable_notrace() above -- confirm.
         */
        printk(KERN_ERR "BUG: using %s%s() in preemptible [%08x] code: %s/%d\n",
                what1, what2, preempt_count() - 1, current->comm, current->pid);

        printk("caller is %pS\n", __builtin_return_address(0));
        dump_stack();

out_enable:
        /* Pairs with instrumentation_begin() and preempt_disable_notrace(). */
        instrumentation_end();
        preempt_enable_no_resched_notrace();
out:
        return this_cpu;
}

/*
 * debug_smp_processor_id() - DEBUG_PREEMPT variant of smp_processor_id().
 *
 * Return: the CPU number from check_preemption_disabled(), which also
 * reports the call as "smp_processor_id()" if it finds it unsafe.
 */
noinstr unsigned int debug_smp_processor_id(void)
{
        return check_preemption_disabled("smp_processor_id", "");
}
EXPORT_SYMBOL(debug_smp_processor_id);

/*
 * __this_cpu_preempt_check() - run the preemption check for a __this_cpu op.
 * @op: operation name; reported as "__this_cpu_<op>()" if the check fails.
 *
 * The CPU number returned by check_preemption_disabled() is discarded.
 */
noinstr void __this_cpu_preempt_check(const char *op)
{
        check_preemption_disabled("__this_cpu_", op);
}
EXPORT_SYMBOL(__this_cpu_preempt_check);





























































    6 

















































































































  230 















   27 












































































































































































































































































































































































































































































































































































































   69 







































































































































































































































































































































































































































  725 






  231 








  664 








   96 






   36 
   36 







   17 






   38 





   66 





  230 






































































































































































   91 













    1 


















































































  275 
















































































  156 





   66 


  121 
  121 








































































    3 



















  160 



  160 




   87 



   87 




   86 



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_XARRAY_H
#define _LINUX_XARRAY_H
/*
 * eXtensible Arrays
 * Copyright (c) 2017 Microsoft Corporation
 * Author: Matthew Wilcox <willy@infradead.org>
 *
 * See Documentation/core-api/xarray.rst for how to use the XArray.
 */

#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kconfig.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct list_lru;

/*
 * The bottom two bits of the entry determine how the XArray interprets
 * the contents:
 *
 * 00: Pointer entry
 * 10: Internal entry
 * x1: Value entry or tagged pointer
 *
 * Attempting to store internal entries in the XArray is a bug.
 *
 * Most internal entries are pointers to the next node in the tree.
 * The following internal entries have a special meaning:
 *
 * 0-62: Sibling entries
 * 256: Retry entry
 * 257: Zero entry
 *
 * Errors are also represented as internal entries, but use the negative
 * space (-4094 to -2).  They're never stored in the slots array; only
 * returned by the normal API.
 */

#define BITS_PER_XA_VALUE        (BITS_PER_LONG - 1)

/**
 * xa_mk_value() - Encode an integer as an XArray value entry.
 * @v: Integer to encode.  A warning is raised if its top bit is set,
 *     because the left shift would discard that bit.
 *
 * Context: Any context.
 * Return: An entry, with its bottom bit set, suitable for storing in the
 * XArray.
 */
static inline void *xa_mk_value(unsigned long v)
{
        unsigned long encoded = (v << 1) | 1;

        WARN_ON((long)v < 0);
        return (void *)encoded;
}

/**
 * xa_to_value() - Decode the integer held in an XArray value entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The entry's bits shifted right by one, i.e. the stored value.
 */
static inline unsigned long xa_to_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 1;
}

/**
 * xa_is_value() - Test whether an entry is a value entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: True if the bottom bit of the entry is set (a value), false
 * otherwise (a pointer).
 */
static inline bool xa_is_value(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return (raw & 1UL) != 0;
}

/**
 * xa_tag_pointer() - Build an XArray entry from a pointer and a tag.
 * @p: Plain pointer.
 * @tag: Tag value (0, 1 or 3).
 *
 * Users of the XArray may tag their pointers rather than storing value
 * entries.  Three tags are available (0, 1 and 3).  Unlike an xa_mark_t,
 * a tag is not replicated up through the array and cannot be searched for.
 *
 * Context: Any context.
 * Return: An XArray entry: the pointer bits ORed with @tag.
 */
static inline void *xa_tag_pointer(void *p, unsigned long tag)
{
        unsigned long bits = (unsigned long)p;

        bits |= tag;
        return (void *)bits;
}

/**
 * xa_untag_pointer() - Strip the tag from a tagged-pointer entry.
 * @entry: XArray entry.
 *
 * Call this on an entry that was stored as a tagged pointer to recover the
 * untagged pointer.
 *
 * Context: Any context.
 * Return: The entry with its bottom two bits cleared.
 */
static inline void *xa_untag_pointer(void *entry)
{
        unsigned long bits = (unsigned long)entry;

        bits &= ~3UL;
        return (void *)bits;
}

/**
 * xa_pointer_tag() - Read the tag of a tagged-pointer entry.
 * @entry: XArray entry.
 *
 * Call this on an entry that was stored as a tagged pointer to recover the
 * tag that was attached to it.
 *
 * Context: Any context.
 * Return: The bottom two bits of the entry.
 */
static inline unsigned int xa_pointer_tag(void *entry)
{
        unsigned long bits = (unsigned long)entry;

        return bits & 3UL;
}

/*
 * xa_mk_internal() - Build an internal entry from a value.
 * @v: Value to encode.
 *
 * Internal entries serve several purposes.  Values 0-255 are reserved for
 * sibling entries (the current code only uses 0-62), 256 is the retry
 * entry and 257 is the reserved / zero entry.  Negative values represent
 * errnos.  In some situations node pointers are tagged as internal entries
 * too.
 *
 * Context: Any context.
 * Return: The XArray internal entry for @v: @v shifted left by two with the
 * bottom two bits set to 10.
 */
static inline void *xa_mk_internal(unsigned long v)
{
        unsigned long encoded = v << 2;

        encoded |= 2;
        return (void *)encoded;
}

/*
 * xa_to_internal() - Decode the value held in an internal entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: The entry's bits shifted right by two (an unsigned shift, so no
 * sign extension takes place).
 */
static inline unsigned long xa_to_internal(const void *entry)
{
        unsigned long raw = (unsigned long)entry;

        return raw >> 2;
}

/*
 * xa_is_internal() - Test whether an entry is an internal entry.
 * @entry: XArray entry.
 *
 * Context: Any context.
 * Return: %true if the bottom two bits of the entry are 10.
 */
static inline bool xa_is_internal(const void *entry)
{
        unsigned long low_bits = (unsigned long)entry & 3;

        return low_bits == 2;
}

#define XA_ZERO_ENTRY                xa_mk_internal(257)

/**
 * xa_is_zero() - Test whether an entry is the zero entry.
 * @entry: Entry retrieved from the XArray
 *
 * A slot holding a zero entry reads back as NULL through the normal API;
 * zero entries are only visible through the advanced API.
 *
 * Return: %true if the entry equals XA_ZERO_ENTRY.
 */
static inline bool xa_is_zero(const void *entry)
{
        return unlikely(XA_ZERO_ENTRY == entry);
}

/**
 * xa_is_err() - Test whether an XArray operation returned an error
 * @entry: Result from calling an XArray function
 *
 * An XArray operation that cannot complete returns a special value
 * indicating an error.  This function tells you whether that happened;
 * use xa_err() to find out which error it was.
 *
 * Context: Any context.
 * Return: %true if the entry is an internal entry at or above the encoding
 * of -MAX_ERRNO, i.e. it indicates an error.
 */
static inline bool xa_is_err(const void *entry)
{
        const void *lowest_err = xa_mk_internal(-MAX_ERRNO);

        return unlikely(xa_is_internal(entry) && entry >= lowest_err);
}

/**
 * xa_err() - Convert an XArray result into an errno.
 * @entry: Result from calling an XArray function.
 *
 * An XArray operation that cannot complete returns a special pointer value
 * which encodes an errno.  This function extracts that errno, or returns 0
 * when the pointer does not represent an errno.
 *
 * Context: Any context.
 * Return: A negative errno or 0.
 */
static inline int xa_err(void *entry)
{
        /*
         * Shift the value as a signed long: xa_to_internal() would not do
         * sign extension.
         */
        return xa_is_err(entry) ? (long)entry >> 2 : 0;
}

/**
 * struct xa_limit - Represents a range of IDs.
 * @min: The lowest ID to allocate (inclusive).
 * @max: The maximum ID to allocate (inclusive).
 *
 * This structure is used either directly or via the XA_LIMIT() macro
 * to communicate the range of IDs that are valid for allocation.
 * Three common ranges are predefined for you:
 * * xa_limit_32b        - [0 - UINT_MAX]
 * * xa_limit_31b        - [0 - INT_MAX]
 * * xa_limit_16b        - [0 - USHRT_MAX]
 */
struct xa_limit {
        /*
         * Note that @max is declared before @min.  XA_LIMIT() fills the
         * structure with designated initialisers, so the field order does
         * not affect users of that macro.
         */
        u32 max;
        u32 min;
};

#define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max }

#define xa_limit_32b        XA_LIMIT(0, UINT_MAX)
#define xa_limit_31b        XA_LIMIT(0, INT_MAX)
#define xa_limit_16b        XA_LIMIT(0, USHRT_MAX)

typedef unsigned __bitwise xa_mark_t;
#define XA_MARK_0                ((__force xa_mark_t)0U)
#define XA_MARK_1                ((__force xa_mark_t)1U)
#define XA_MARK_2                ((__force xa_mark_t)2U)
#define XA_PRESENT                ((__force xa_mark_t)8U)
#define XA_MARK_MAX                XA_MARK_2
#define XA_FREE_MARK                XA_MARK_0

/*
 * Lock type selectors.  The numeric values are reused as xa_flags bits by
 * XA_FLAGS_LOCK_IRQ and XA_FLAGS_LOCK_BH.
 */
enum xa_lock_type {
        XA_LOCK_IRQ = 1,
        XA_LOCK_BH = 2,
};

/*
 * Values for xa_flags.  The radix tree stores its GFP flags in the xa_flags,
 * and we remain compatible with that.
 */
#define XA_FLAGS_LOCK_IRQ        ((__force gfp_t)XA_LOCK_IRQ)
#define XA_FLAGS_LOCK_BH        ((__force gfp_t)XA_LOCK_BH)
#define XA_FLAGS_TRACK_FREE        ((__force gfp_t)4U)
#define XA_FLAGS_ZERO_BUSY        ((__force gfp_t)8U)
#define XA_FLAGS_ALLOC_WRAPPED        ((__force gfp_t)16U)
#define XA_FLAGS_ACCOUNT        ((__force gfp_t)32U)
#define XA_FLAGS_MARK(mark)        ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \
                                                (__force unsigned)(mark)))

/* ALLOC is for a normal 0-based alloc.  ALLOC1 is for a 1-based alloc */
#define XA_FLAGS_ALLOC        (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK))
#define XA_FLAGS_ALLOC1        (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY)

/**
 * struct xarray - The anchor of the XArray.
 * @xa_lock: Lock that protects the contents of the XArray.
 *
 * To use the xarray, define it statically or embed it in your data structure.
 * It is a very small data structure, so it does not usually make sense to
 * allocate it separately and keep a pointer to it in your data structure.
 *
 * You may use the xa_lock to protect your own data structures as well.
 */
/*
 * If all of the entries in the array are NULL, @xa_head is a NULL pointer.
 * If the only non-NULL entry in the array is at index 0, @xa_head is that
 * entry.  If any other entry in the array is non-NULL, @xa_head points
 * to an @xa_node.
 */
struct xarray {
        spinlock_t        xa_lock;
/* private: The rest of the data structure is not to be used directly. */
        /* XA_FLAGS_* values; set by XARRAY_INIT() or xa_init_flags(). */
        gfp_t                xa_flags;
        /* NULL, the sole entry at index 0, or a pointer to an xa_node. */
        void __rcu *        xa_head;
};

#define XARRAY_INIT(name, flags) {                                \
        .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock),                \
        .xa_flags = flags,                                        \
        .xa_head = NULL,                                        \
}

/**
 * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags.
 * @name: A string that names your XArray.
 * @flags: XA_FLAG values.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name and flags.  It is
 * equivalent to calling xa_init_flags() on the array, but it does the
 * initialisation at compiletime instead of runtime.
 */
#define DEFINE_XARRAY_FLAGS(name, flags)                                \
        struct xarray name = XARRAY_INIT(name, flags)

/**
 * DEFINE_XARRAY() - Define an XArray.
 * @name: Identifier to use as the name of your XArray variable.
 *
 * This is intended for file scope definitions of XArrays.  It declares
 * and initialises an empty XArray with the chosen name.  It is equivalent
 * to calling xa_init() on the array, but it does the initialisation at
 * compiletime instead of runtime.
 *
 * Expands to DEFINE_XARRAY_FLAGS() with a flags value of 0.
 */
#define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0)

/**
 * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0.
 * @name: Identifier to use as the name of your XArray variable.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with XA_FLAGS_ALLOC.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC)

/**
 * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1.
 * @name: Identifier to use as the name of your XArray variable.
 *
 * This is intended for file scope definitions of allocating XArrays.
 * It expands to DEFINE_XARRAY_FLAGS() with XA_FLAGS_ALLOC1.
 * See also DEFINE_XARRAY().
 */
#define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1)

/*
 * Normal API entry points that are implemented out of line; only their
 * prototypes appear here.  xa_find() and xa_find_after() are annotated
 * nonnull(2), i.e. the @index pointer must not be NULL.  They are the
 * functions the xa_for_each*() iteration macros expand to.
 */
void *xa_load(struct xarray *, unsigned long index);
void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *xa_erase(struct xarray *, unsigned long index);
void *xa_store_range(struct xarray *, unsigned long first, unsigned long last,
                        void *entry, gfp_t);
bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);
void *xa_find(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
void *xa_find_after(struct xarray *xa, unsigned long *index,
                unsigned long max, xa_mark_t) __attribute__((nonnull(2)));
unsigned int xa_extract(struct xarray *, void **dst, unsigned long start,
                unsigned long max, unsigned int n, xa_mark_t);
void xa_destroy(struct xarray *);

/**
 * xa_init_flags() - Initialise an empty XArray with flags.
 * @xa: XArray.
 * @flags: XA_FLAG values.
 *
 * If you need to initialise an XArray with special flags (eg you need
 * to take the lock from interrupt context), use this function instead
 * of xa_init().
 *
 * Context: Any context.
 */
static inline void xa_init_flags(struct xarray *xa, gfp_t flags)
{
        spin_lock_init(&xa->xa_lock);

        /* An empty array has no head entry; remember the caller's flags. */
        xa->xa_head = NULL;
        xa->xa_flags = flags;
}

/**
 * xa_init() - Initialise an empty XArray.
 * @xa: XArray.
 *
 * An empty XArray is full of NULL entries.
 *
 * Context: Any context.
 */
static inline void xa_init(struct xarray *xa)
{
        /* Same as xa_init_flags() with no flags set. */
        xa_init_flags(xa, 0);
}

/**
 * xa_empty() - Determine if an array has any present entries.
 * @xa: XArray.
 *
 * Context: Any context.
 * Return: %true if the array contains only NULL pointers.
 */
static inline bool xa_empty(const struct xarray *xa)
{
        /* A NULL head is how an array with only NULL entries is represented. */
        return !xa->xa_head;
}

/**
 * xa_marked() - Inquire whether any entry in this array has a mark set
 * @xa: Array
 * @mark: Mark value
 *
 * Context: Any context.
 * Return: %true if any entry has this mark set.
 */
static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark)
{
        /* The per-array summary bit for @mark is kept in xa_flags. */
        return !!(xa->xa_flags & XA_FLAGS_MARK(mark));
}

/**
 * xa_for_each_range() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 * @last: Last index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL and @index will have a value less than or equal to @last.
 *
 * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_range() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_range().
 *
 * This is a macro built on xa_find() and xa_find_after(): @index and @entry
 * must be lvalues, and @xa and @last are expanded again for every step of
 * the loop, so they should be free of side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_range(xa, index, entry, start, last)                \
        for (index = start,                                                \
             entry = xa_find(xa, &index, last, XA_PRESENT);                \
             entry;                                                        \
             entry = xa_find_after(xa, &index, last, XA_PRESENT))

/**
 * xa_for_each_start() - Iterate over a portion of an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @start: First index to retrieve from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you
 * want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set
 * to NULL.  The upper bound of the walk is %ULONG_MAX.
 *
 * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_start() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each() iterator instead.
 * The xas_for_each() iterator will expand into more inline code than
 * xa_for_each_start().
 *
 * This is xa_for_each_range() with @last fixed to %ULONG_MAX.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_start(xa, index, entry, start) \
        xa_for_each_range(xa, index, entry, start, ULONG_MAX)

/**
 * xa_for_each() - Iterate over present entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  You may modify @index during the iteration if you want
 * to skip or reprocess indices.  It is safe to modify the array during the
 * iteration.  At the end of the iteration, @entry will be set to NULL.
 * The walk covers indices 0 to %ULONG_MAX.
 *
 * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n).  You have
 * to handle your own locking with xas_for_each(), and if you have to unlock
 * after each iteration, it will also end up being O(n.log(n)).  xa_for_each()
 * will spin if it hits a retry entry; if you intend to see retry entries,
 * you should use the xas_for_each() iterator instead.  The xas_for_each()
 * iterator will expand into more inline code than xa_for_each().
 *
 * This is xa_for_each_start() with @start fixed to 0.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each(xa, index, entry) \
        xa_for_each_start(xa, index, entry, 0)

/**
 * xa_for_each_marked() - Iterate over marked entries in an XArray.
 * @xa: XArray.
 * @index: Index of @entry.
 * @entry: Entry retrieved from array.
 * @filter: Selection criterion.
 *
 * During the iteration, @entry will have the value of the entry stored
 * in @xa at @index.  The iteration will skip all entries in the array
 * which do not match @filter.  You may modify @index during the iteration
 * if you want to skip or reprocess indices.  It is safe to modify the array
 * during the iteration.  At the end of the iteration, @entry will be set to
 * NULL.  The walk covers indices 0 to %ULONG_MAX.
 *
 * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n).
 * You have to handle your own locking with xas_for_each_marked(), and if you
 * have to unlock after each iteration, it will also end up being O(n.log(n)).
 * xa_for_each_marked() will spin if it hits a retry entry; if you intend to
 * see retry entries, you should use the xas_for_each_marked() iterator
 * instead.  The xas_for_each_marked() iterator will expand into more inline
 * code than xa_for_each_marked().
 *
 * This is a macro built on xa_find() and xa_find_after(): @index and @entry
 * must be lvalues, and @xa and @filter are expanded again for every step of
 * the loop, so they should be free of side effects.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 */
#define xa_for_each_marked(xa, index, entry, filter) \
        for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \
             entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter))

/*
 * Locking helpers.  Each macro applies the spin_lock primitive of the same
 * suffix (plain, _bh, _irq, _irqsave and the _nested variants) to the
 * xa_lock embedded in @xa.  @xa is a pointer to the struct xarray.
 */
#define xa_trylock(xa)                spin_trylock(&(xa)->xa_lock)
#define xa_lock(xa)                spin_lock(&(xa)->xa_lock)
#define xa_unlock(xa)                spin_unlock(&(xa)->xa_lock)
#define xa_lock_bh(xa)                spin_lock_bh(&(xa)->xa_lock)
#define xa_unlock_bh(xa)        spin_unlock_bh(&(xa)->xa_lock)
#define xa_lock_irq(xa)                spin_lock_irq(&(xa)->xa_lock)
#define xa_unlock_irq(xa)        spin_unlock_irq(&(xa)->xa_lock)
#define xa_lock_irqsave(xa, flags) \
                                spin_lock_irqsave(&(xa)->xa_lock, flags)
#define xa_unlock_irqrestore(xa, flags) \
                                spin_unlock_irqrestore(&(xa)->xa_lock, flags)
#define xa_lock_nested(xa, subclass) \
                                spin_lock_nested(&(xa)->xa_lock, subclass)
#define xa_lock_bh_nested(xa, subclass) \
                                spin_lock_bh_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irq_nested(xa, subclass) \
                                spin_lock_irq_nested(&(xa)->xa_lock, subclass)
#define xa_lock_irqsave_nested(xa, flags, subclass) \
                spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass)

/*
 * Versions of the normal API which require the caller to hold the
 * xa_lock.  If the GFP flags allow it, they will drop the lock to
 * allocate memory, then reacquire it afterwards.  These functions
 * may also re-enable interrupts if the XArray flags indicate the
 * locking should be interrupt safe.
 *
 * The inline xa_*(), xa_*_bh() and xa_*_irq() wrappers in this header
 * take the lock in the matching flavour and then call these functions.
 * The ones returning int are marked __must_check.
 */
void *__xa_erase(struct xarray *, unsigned long index);
void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t);
void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old,
                void *entry, gfp_t);
int __must_check __xa_insert(struct xarray *, unsigned long index,
                void *entry, gfp_t);
int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry,
                struct xa_limit, gfp_t);
int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry,
                struct xa_limit, u32 *next, gfp_t);
void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t);
void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t);

/**
 * xa_store_bh() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_store() except it disables softirqs
 * while holding the array lock.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_bh(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        might_alloc(gfp);

        /* The store runs with xa_lock held via the _bh lock flavour. */
        xa_lock_bh(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        /* Pass on whatever __xa_store() handed back. */
        return old;
}

/**
 * xa_store_irq() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_store() except it disables interrupts
 * while holding the array lock.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
static inline void *xa_store_irq(struct xarray *xa, unsigned long index,
                void *entry, gfp_t gfp)
{
        void *old;

        might_alloc(gfp);

        /* The store runs with xa_lock held via the _irq lock flavour. */
        xa_lock_irq(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        /* Pass on whatever __xa_store() handed back. */
        return old;
}

/**
 * xa_erase_bh() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_bh(struct xarray *xa, unsigned long index)
{
        void *removed;

        /* Erase under xa_lock, taken with the _bh lock flavour. */
        xa_lock_bh(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_bh(xa);

        return removed;
}

/**
 * xa_erase_irq() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After this function returns, loading from @index will return %NULL.
 * If the index is part of a multi-index entry, all indices will be erased
 * and none of the entries will be part of a multi-index entry.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: The entry which used to be at this index.
 */
static inline void *xa_erase_irq(struct xarray *xa, unsigned long index)
{
        void *removed;

        /* Erase under xa_lock, taken with the _irq lock flavour. */
        xa_lock_irq(xa);
        removed = __xa_erase(xa, index);
        xa_unlock_irq(xa);

        return removed;
}

/**
 * xa_cmpxchg() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * If the entry at @index is the same as @old, replace it with @entry.
 * If the return value is equal to @old, then the exchange was successful.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *seen;

        might_alloc(gfp);

        /* Locked compare-and-exchange; callers compare the result to @old. */
        xa_lock(xa);
        seen = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock(xa);

        return seen;
}

/**
 * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_cmpxchg() except it disables softirqs
 * while holding the array lock.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *seen;

        might_alloc(gfp);

        /* Same exchange as xa_cmpxchg(), using the _bh lock flavour. */
        xa_lock_bh(xa);
        seen = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_bh(xa);

        return seen;
}

/**
 * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New value to place in array.
 * @gfp: Memory allocation flags.
 *
 * This function is like calling xa_cmpxchg() except it disables interrupts
 * while holding the array lock.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: The old value at this index or xa_err() if an error happened.
 */
static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *seen;

        might_alloc(gfp);

        /* Same exchange as xa_cmpxchg(), using the _irq lock flavour. */
        xa_lock_irq(xa);
        seen = __xa_cmpxchg(xa, index, old, entry, gfp);
        xa_unlock_irq(xa);

        return seen;
}

/**
 * xa_insert() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        /* Insert with xa_lock held; the status comes from __xa_insert(). */
        xa_lock(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock(xa);

        return ret;
}

/**
 * xa_insert_bh() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_bh(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        /* Same insertion as xa_insert(), using the _bh lock flavour. */
        xa_lock_bh(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_bh(xa);

        return ret;
}

/**
 * xa_insert_irq() - Store this entry in the XArray unless another entry is
 *                        already present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
static inline int __must_check xa_insert_irq(struct xarray *xa,
                unsigned long index, void *entry, gfp_t gfp)
{
        int ret;

        might_alloc(gfp);

        /* Same insertion as xa_insert(), using the _irq lock flavour. */
        xa_lock_irq(xa);
        ret = __xa_insert(xa, index, entry, gfp);
        xa_unlock_irq(xa);

        return ret;
}

/**
 * xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline __must_check int xa_alloc(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int rc;

        might_alloc(gfp);

        /* ID selection and the store both happen inside __xa_alloc(). */
        xa_lock(xa);
        rc = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock(xa);

        return rc;
}

/**
 * xa_alloc_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int rc;

        might_alloc(gfp);

        /* Same allocation as xa_alloc(), using the _bh lock flavour. */
        xa_lock_bh(xa);
        rc = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_bh(xa);

        return rc;
}

/**
 * xa_alloc_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id,
                void *entry, struct xa_limit limit, gfp_t gfp)
{
        int rc;

        might_alloc(gfp);

        /* Same allocation as xa_alloc(), using the _irq lock flavour. */
        xa_lock_irq(xa);
        rc = __xa_alloc(xa, id, entry, limit, gfp);
        xa_unlock_irq(xa);

        return rc;
}

/**
 * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock.  May sleep if
 * the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int rc;

        might_alloc(gfp);

        /* The cyclic search starting at @next is done by __xa_alloc_cyclic(). */
        xa_lock(xa);
        rc = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock(xa);

        return rc;
}

/**
 * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int rc;

        might_alloc(gfp);

        /* Same allocation as xa_alloc_cyclic(), using the _bh lock flavour. */
        xa_lock_bh(xa);
        rc = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_bh(xa);

        return rc;
}

/**
 * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.  May sleep if the @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        int rc;

        might_alloc(gfp);

        /* Same allocation as xa_alloc_cyclic(), using the _irq lock flavour. */
        xa_lock_irq(xa);
        rc = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp);
        xa_unlock_irq(xa);

        return rc;
}

/**
 * xa_reserve() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * Ensures there is somewhere to store an entry at @index in the array.
 * If there is already something stored at @index, this function does
 * nothing.  If there was nothing there, the entry is marked as reserved.
 * Loading from a reserved entry returns a %NULL pointer.
 *
 * If you do not use the entry that you have reserved, call xa_release()
 * or xa_erase() to free any unnecessary memory.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap a NULL slot for XA_ZERO_ENTRY; only the error part is kept. */
        void *prev = xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(prev);
}

/**
 * xa_reserve_bh() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * A softirq-disabling version of xa_reserve().
 *
 * Context: Any context.  Takes and releases the xa_lock while
 * disabling softirqs.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap a NULL slot for XA_ZERO_ENTRY; only the error part is kept. */
        void *prev = xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(prev);
}

/**
 * xa_reserve_irq() - Reserve this index in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @gfp: Memory allocation flags.
 *
 * An interrupt-disabling version of xa_reserve().
 *
 * Context: Process context.  Takes and releases the xa_lock while
 * disabling interrupts.
 * Return: 0 if the reservation succeeded or -ENOMEM if it failed.
 */
static inline __must_check
int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp)
{
        /* Swap a NULL slot for XA_ZERO_ENTRY; only the error part is kept. */
        void *prev = xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp);

        return xa_err(prev);
}

/**
 * xa_release() - Release a reserved entry.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * After calling xa_reserve(), you can call this function to release the
 * reservation.  If the entry at @index has been stored to, this function
 * will do nothing.
 */
static inline void xa_release(struct xarray *xa, unsigned long index)
{
        /*
         * Put NULL back only if the slot still holds XA_ZERO_ENTRY.  The
         * result is deliberately ignored and a gfp value of 0 is passed.
         */
        (void)xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0);
}

/* Everything below here is the Advanced API.  Proceed with caution. */

/*
 * The xarray is constructed out of a set of 'chunks' of pointers.  Choosing
 * the best chunk size requires some tradeoffs.  A power of two recommends
 * itself so that we can walk the tree based purely on shifts and masks.
 * Generally, the larger the better; as the number of slots per level of the
 * tree increases, the less tall the tree needs to be.  But that needs to be
 * balanced against the memory consumption of each node.  On a 64-bit system,
 * xa_node is currently 576 bytes, and we get 7 of them per 4kB page.  If we
 * doubled the number of slots per node, we'd get only 3 nodes per 4kB page.
 */
/* May be predefined; otherwise 4 with CONFIG_BASE_SMALL enabled, else 6. */
#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT                (IS_ENABLED(CONFIG_BASE_SMALL) ? 4 : 6)
#endif
/* Number of entries in the slots[] array of each xa_node. */
#define XA_CHUNK_SIZE                (1UL << XA_CHUNK_SHIFT)
/* Low XA_CHUNK_SHIFT bits set: XA_CHUNK_SIZE - 1. */
#define XA_CHUNK_MASK                (XA_CHUNK_SIZE - 1)
/* First dimension of the marks[][] array in each xa_node. */
#define XA_MAX_MARKS                3
/* Second dimension of marks[][]; presumably one bit per slot - confirm. */
#define XA_MARK_LONGS                BITS_TO_LONGS(XA_CHUNK_SIZE)

/*
 * @count is the count of every non-NULL element in the ->slots array
 * whether that is a value entry, a retry entry, a user pointer,
 * a sibling entry or a pointer to the next level of the tree.
 * @nr_values is the count of every element in ->slots which is
 * either a value entry or a sibling of a value entry.
 */
struct xa_node {
        unsigned char        shift;                /* Bits remaining in each slot */
        unsigned char        offset;                /* Slot offset in parent */
        unsigned char        count;                /* Total entry count */
        unsigned char        nr_values;        /* Value entry count */
        struct xa_node __rcu *parent;        /* NULL at top of tree */
        struct xarray        *array;                /* The array we belong to */
        /* The two list/RCU heads share storage; only one is valid at a time. */
        union {
                struct list_head private_list;        /* For tree user */
                struct rcu_head        rcu_head;        /* Used when freeing node */
        };
        /* XA_CHUNK_SIZE entries; read through xa_entry()/xa_entry_locked(). */
        void __rcu        *slots[XA_CHUNK_SIZE];
        /* 'tags' and 'marks' are two names for the same storage. */
        union {
                unsigned long        tags[XA_MAX_MARKS][XA_MARK_LONGS];
                unsigned long        marks[XA_MAX_MARKS][XA_MARK_LONGS];
        };
};

/*
 * Dump helpers, implemented out of line.  XA_BUG_ON() and XA_NODE_BUG_ON()
 * call them before BUG() when XA_DEBUG is defined.
 */
void xa_dump(const struct xarray *);
void xa_dump_node(const struct xa_node *);

/*
 * Debug assertions.  With XA_DEBUG defined, a true condition @x dumps the
 * array (or the node, when @node is non-NULL) and then calls BUG().
 * Without XA_DEBUG both macros expand to an empty statement, so @x is
 * not evaluated at all and must not carry required side effects.
 */
#ifdef XA_DEBUG
#define XA_BUG_ON(xa, x) do {                                        \
                if (x) {                                        \
                        xa_dump(xa);                                \
                        BUG();                                        \
                }                                                \
        } while (0)
#define XA_NODE_BUG_ON(node, x) do {                                \
                if (x) {                                        \
                        if (node) xa_dump_node(node);                \
                        BUG();                                        \
                }                                                \
        } while (0)
#else
#define XA_BUG_ON(xa, x)        do { } while (0)
#define XA_NODE_BUG_ON(node, x)        do { } while (0)
#endif

/* Private */
static inline void *xa_head(const struct xarray *xa)
{
        void *head;

        /* Checked RCU dereference; the lockdep condition is "xa_lock held". */
        head = rcu_dereference_check(xa->xa_head,
                                     lockdep_is_held(&xa->xa_lock));
        return head;
}

/* Private */
static inline void *xa_head_locked(const struct xarray *xa)
{
        void *head;

        /* Protected dereference; the lockdep condition is "xa_lock held". */
        head = rcu_dereference_protected(xa->xa_head,
                                         lockdep_is_held(&xa->xa_lock));
        return head;
}

/* Private */
static inline void *xa_entry(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        /* Debug builds refuse an offset outside the node's slots[] array. */
        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);

        slot = rcu_dereference_check(node->slots[offset],
                                     lockdep_is_held(&xa->xa_lock));
        return slot;
}

/*
 * Private: fetch slot @offset of @node through
 * rcu_dereference_protected(), asserting via lockdep that xa_lock is held.
 * Under XA_DEBUG an @offset outside the chunk triggers XA_NODE_BUG_ON().
 */
static inline void *xa_entry_locked(const struct xarray *xa,
                                const struct xa_node *node, unsigned int offset)
{
        void *slot;

        XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
        slot = rcu_dereference_protected(node->slots[offset],
                                         lockdep_is_held(&xa->xa_lock));
        return slot;
}

/*
 * Private: read @node's parent pointer through rcu_dereference_check(),
 * using "xa_lock is held" as the lockdep condition.
 */
static inline struct xa_node *xa_parent(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *parent;

        parent = rcu_dereference_check(node->parent,
                                       lockdep_is_held(&xa->xa_lock));
        return parent;
}

/*
 * Private: read @node's parent pointer through
 * rcu_dereference_protected(), asserting via lockdep that xa_lock is held.
 */
static inline struct xa_node *xa_parent_locked(const struct xarray *xa,
                                        const struct xa_node *node)
{
        struct xa_node *parent;

        parent = rcu_dereference_protected(node->parent,
                                           lockdep_is_held(&xa->xa_lock));
        return parent;
}

/* Private: turn a node pointer into an entry by setting bit 1 of its address. */
static inline void *xa_mk_node(const struct xa_node *node)
{
        unsigned long tagged = (unsigned long)node | 2;

        return (void *)tagged;
}

/* Private: recover the node pointer from a node entry by subtracting 2. */
static inline struct xa_node *xa_to_node(const void *entry)
{
        unsigned long addr = (unsigned long)entry;

        addr -= 2;
        return (struct xa_node *)addr;
}

/*
 * Private: an entry is a node pointer when it is an internal entry whose
 * numeric value is greater than 4096.
 */
static inline bool xa_is_node(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        return (unsigned long)entry > 4096;
}

/* Private: build a sibling entry; it is the internal entry encoding @offset. */
static inline void *xa_mk_sibling(unsigned int offset)
{
        void *sibling = xa_mk_internal(offset);

        return sibling;
}

/* Private: decode the offset stored in a sibling entry. */
static inline unsigned long xa_to_sibling(const void *entry)
{
        unsigned long offset = xa_to_internal(entry);

        return offset;
}

/**
 * xa_is_sibling() - Is the entry a sibling entry?
 * @entry: Entry retrieved from the XArray
 *
 * Always %false when CONFIG_XARRAY_MULTI is disabled.  Otherwise the
 * entry must be internal and compare below the sibling entry built for
 * offset %XA_CHUNK_SIZE - 1.
 *
 * Return: %true if the entry is a sibling entry.
 */
static inline bool xa_is_sibling(const void *entry)
{
        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!xa_is_internal(entry))
                return false;

        return entry < xa_mk_sibling(XA_CHUNK_SIZE - 1);
}

#define XA_RETRY_ENTRY                xa_mk_internal(256)

/**
 * xa_is_retry() - Is the entry a retry entry?
 * @entry: Entry retrieved from the XArray
 *
 * Compares @entry against %XA_RETRY_ENTRY; the match is hinted as the
 * unlikely outcome.
 *
 * Return: %true if the entry is a retry entry.
 */
static inline bool xa_is_retry(const void *entry)
{
        bool is_retry = (entry == XA_RETRY_ENTRY);

        return unlikely(is_retry);
}

/**
 * xa_is_advanced() - Is the entry only permitted for the advanced API?
 * @entry: Entry to be stored in the XArray.
 *
 * An entry is "advanced" when it is internal and compares no higher than
 * %XA_RETRY_ENTRY.
 *
 * Return: %true if the entry cannot be stored by the normal API.
 */
static inline bool xa_is_advanced(const void *entry)
{
        if (!xa_is_internal(entry))
                return false;

        return entry <= XA_RETRY_ENTRY;
}

/**
 * typedef xa_update_node_t - A callback function from the XArray.
 * @node: The node which is being processed
 *
 * This function is called every time the XArray updates the count of
 * present and value entries in a node.  It allows advanced users to
 * maintain the private_list in the node.
 *
 * The callback is installed per operation with xas_set_update().
 *
 * Context: The xa_lock is held and interrupts may be disabled.
 *            Implementations should not drop the xa_lock, nor re-enable
 *            interrupts.
 */
typedef void (*xa_update_node_t)(struct xa_node *node);

/*
 * Defined outside this header.  NOTE(review): presumably removes the given
 * node and invokes the supplied callback -- confirm against the definition.
 */
void xa_delete_node(struct xa_node *, xa_update_node_t);

/*
 * The xa_state is opaque to its users.  It contains various different pieces
 * of state involved in the current operation on the XArray.  It should be
 * declared on the stack and passed between the various internal routines.
 * The various elements in it should not be accessed directly, but only
 * through the provided accessor functions.  The below documentation is for
 * the benefit of those working on the code, not for users of the XArray.
 *
 * @xa_node usually points to the xa_node containing the slot we're operating
 * on (and @xa_offset is the offset in the slots array).  If there is a
 * single entry in the array at index 0, there are no allocated xa_nodes to
 * point to, and so we store %NULL in @xa_node.  @xa_node is set to
 * the value %XAS_RESTART if the xa_state is not walked to the correct
 * position in the tree of nodes for this operation.  If an error occurs
 * during an operation, it is set to an %XAS_ERROR value.  If we run off the
 * end of the allocated nodes, it is set to %XAS_BOUNDS.
 */
struct xa_state {
        struct xarray *xa;                /* Array this operation acts on */
        unsigned long xa_index;                /* Index of interest */
        unsigned char xa_shift;                /* Order rounded down to a multiple of XA_CHUNK_SHIFT */
        unsigned char xa_sibs;                /* (1 << (order % XA_CHUNK_SHIFT)) - 1 */
        unsigned char xa_offset;        /* Offset within xa_node's slots array */
        unsigned char xa_pad;                /* Helps gcc generate better code */
        struct xa_node *xa_node;        /* Current node, NULL, or an XAS_* / XA_ERROR() sentinel */
        struct xa_node *xa_alloc;        /* Starts NULL; NOTE(review): presumably a preallocated node -- confirm */
        xa_update_node_t xa_update;        /* Node-update callback, see xas_set_update() */
        struct list_lru *xa_lru;        /* Set by xas_set_lru() */
};

/*
 * We encode errnos in the xas->xa_node.  If an error has happened, we need to
 * drop the lock to fix it, and once we've done so the xa_state is invalid.
 *
 * XA_ERROR() shifts the errno left by two bits and sets bit 1, so the
 * result always has one of its two low bits set.  The argument is
 * parenthesised so that a compound expression is cast and shifted as a
 * whole.  XAS_BOUNDS (1) and XAS_RESTART (3) are the other two
 * non-pointer values stored in xa_node.
 */
#define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)(errno) << 2) | 2UL))
#define XAS_BOUNDS        ((struct xa_node *)1UL)
#define XAS_RESTART        ((struct xa_node *)3UL)

/*
 * Initialiser for a struct xa_state.  The caller supplies the array,
 * index, shift and sibling count; the offset and pad start at 0, the
 * alloc/update/lru pointers start as NULL, and xa_node starts as
 * XAS_RESTART.
 */
#define __XA_STATE(array, index, shift, sibs)  {        \
        .xa = array,                                        \
        .xa_index = index,                                \
        .xa_shift = shift,                                \
        .xa_sibs = sibs,                                \
        .xa_offset = 0,                                        \
        .xa_pad = 0,                                        \
        .xa_node = XAS_RESTART,                                \
        .xa_alloc = NULL,                                \
        .xa_update = NULL,                                \
        .xa_lru = NULL,                                        \
}

/**
 * XA_STATE() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 *
 * Declare and initialise an xa_state on the stack.  The state is built
 * with __XA_STATE() using a shift and sibling count of 0, so it starts
 * in the %XAS_RESTART state.
 */
#define XA_STATE(name, array, index)                                \
        struct xa_state name = __XA_STATE(array, index, 0, 0)

/**
 * XA_STATE_ORDER() - Declare an XArray operation state.
 * @name: Name of this operation state (usually xas).
 * @array: Array to operate on.
 * @index: Initial index of interest.
 * @order: Order of entry.
 *
 * Declare and initialise an xa_state on the stack.  This variant of
 * XA_STATE() allows you to specify the 'order' of the element you
 * want to operate on.
 *
 * The index is rounded down to a multiple of 2^@order, the shift is
 * @order rounded down to a multiple of %XA_CHUNK_SHIFT, and the sibling
 * count covers the remainder.  @index and @order are parenthesised in
 * the expansion so that compound expressions are handled as a whole;
 * both are evaluated more than once.
 */
#define XA_STATE_ORDER(name, array, index, order)                \
        struct xa_state name = __XA_STATE(array,                \
                        ((index) >> (order)) << (order),        \
                        (order) - ((order) % XA_CHUNK_SHIFT),        \
                        (1U << ((order) % XA_CHUNK_SHIFT)) - 1)

/*
 * Convenience wrappers: each forwards to the xa_*() operation of the same
 * name, applied to the array referenced by the operation state
 * ((xas)->xa).
 */
#define xas_marked(xas, mark)        xa_marked((xas)->xa, (mark))
#define xas_trylock(xas)        xa_trylock((xas)->xa)
#define xas_lock(xas)                xa_lock((xas)->xa)
#define xas_unlock(xas)                xa_unlock((xas)->xa)
#define xas_lock_bh(xas)        xa_lock_bh((xas)->xa)
#define xas_unlock_bh(xas)        xa_unlock_bh((xas)->xa)
#define xas_lock_irq(xas)        xa_lock_irq((xas)->xa)
#define xas_unlock_irq(xas)        xa_unlock_irq((xas)->xa)
#define xas_lock_irqsave(xas, flags) \
                                xa_lock_irqsave((xas)->xa, flags)
#define xas_unlock_irqrestore(xas, flags) \
                                xa_unlock_irqrestore((xas)->xa, flags)

/**
 * xas_error() - Return an errno stored in the xa_state.
 * @xas: XArray operation state.
 *
 * The value is decoded from @xas->xa_node by xa_err().
 *
 * Return: 0 if no error has been noted.  A negative errno if one has.
 */
static inline int xas_error(const struct xa_state *xas)
{
        int err = xa_err(xas->xa_node);

        return err;
}

/**
 * xas_set_err() - Note an error in the xa_state.
 * @xas: XArray operation state.
 * @err: Negative error number.
 *
 * Stores XA_ERROR(@err) in @xas->xa_node.  Only call this function with
 * a negative @err; zero or positive errors will probably not behave the
 * way you think they should.  If you want to clear the error from an
 * xa_state, use xas_reset().
 */
static inline void xas_set_err(struct xa_state *xas, long err)
{
        struct xa_node *encoded = XA_ERROR(err);

        xas->xa_node = encoded;
}

/**
 * xas_invalid() - Is the xas in a retry or error state?
 * @xas: XArray operation state.
 *
 * True when either of the two low bits of @xas->xa_node is set.
 *
 * Return: %true if the xas cannot be used for operations.
 */
static inline bool xas_invalid(const struct xa_state *xas)
{
        unsigned long low_bits = (unsigned long)xas->xa_node & 3;

        return low_bits != 0;
}

/**
 * xas_valid() - Is the xas a valid cursor into the array?
 * @xas: XArray operation state.
 *
 * This is the logical negation of xas_invalid().
 *
 * Return: %true if the xas can be used for operations.
 */
static inline bool xas_valid(const struct xa_state *xas)
{
        if (xas_invalid(xas))
                return false;

        return true;
}

/**
 * xas_is_node() - Does the xas point to a node?
 * @xas: XArray operation state.
 *
 * The state must be valid (see xas_valid()) and its xa_node must be
 * non-NULL.
 *
 * Return: %true if the xas currently references a node.
 */
static inline bool xas_is_node(const struct xa_state *xas)
{
        if (!xas_valid(xas))
                return false;

        return xas->xa_node ? true : false;
}

/*
 * True if the pointer is something other than a node: either NULL or a
 * value with one of its two low bits set.
 */
static inline bool xas_not_node(struct xa_node *node)
{
        unsigned long low_bits = (unsigned long)node & 3;

        if (low_bits)
                return true;

        return !node;
}

/* True if the node represents RESTART or an error, i.e. bit 1 is set. */
static inline bool xas_frozen(struct xa_node *node)
{
        unsigned long bit1 = (unsigned long)node & 2;

        return bit1 != 0;
}

/*
 * True if the node represents head-of-tree, RESTART or BOUNDS, i.e. the
 * pointer compares no higher than XAS_RESTART.
 */
static inline bool xas_top(struct xa_node *node)
{
        bool at_top = node <= XAS_RESTART;

        return at_top;
}

/**
 * xas_reset() - Reset an XArray operation state.
 * @xas: XArray operation state.
 *
 * Puts @xas back into the %XAS_RESTART state, discarding any error or
 * walk position, so future walks of the array will start from the root.
 * Use this if you have dropped the xarray lock and want to reuse the
 * xa_state.
 *
 * Context: Any context.
 */
static inline void xas_reset(struct xa_state *xas)
{
        struct xa_node *restart = XAS_RESTART;

        xas->xa_node = restart;
}

/**
 * xas_retry() - Retry the operation if appropriate.
 * @xas: XArray operation state.
 * @entry: Entry from xarray.
 *
 * The advanced functions may sometimes return an internal entry, such as
 * a retry entry or a zero entry.  A zero entry is reported as "retry"
 * without touching @xas; a retry entry additionally resets @xas so that
 * the walk restarts from the head of the array.
 *
 * Context: Any context.
 * Return: true if the operation needs to be retried.
 */
static inline bool xas_retry(struct xa_state *xas, const void *entry)
{
        if (xa_is_zero(entry))
                return true;

        if (xa_is_retry(entry)) {
                xas_reset(xas);
                return true;
        }

        return false;
}

void *xas_load(struct xa_state *);
void *xas_store(struct xa_state *, void *entry);
void *xas_find(struct xa_state *, unsigned long max);
void *xas_find_conflict(struct xa_state *);

bool xas_get_mark(const struct xa_state *, xa_mark_t);
void xas_set_mark(const struct xa_state *, xa_mark_t);
void xas_clear_mark(const struct xa_state *, xa_mark_t);
void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t);
void xas_init_marks(const struct xa_state *);

bool xas_nomem(struct xa_state *, gfp_t);
void xas_destroy(struct xa_state *);
void xas_pause(struct xa_state *);

void xas_create_range(struct xa_state *);

/*
 * Multi-index support.  With CONFIG_XARRAY_MULTI the functions below are
 * only declared here and defined outside this header; without it they are
 * replaced by the inline stubs in the #else branch.
 */
#ifdef CONFIG_XARRAY_MULTI
int xa_get_order(struct xarray *, unsigned long index);
int xas_get_order(struct xa_state *xas);
void xas_split(struct xa_state *, void *entry, unsigned int order);
void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t);
void xas_try_split(struct xa_state *xas, void *entry, unsigned int order);
unsigned int xas_try_split_min_order(unsigned int order);
#else
/* Stub: always reports order 0. */
static inline int xa_get_order(struct xarray *xa, unsigned long index)
{
        return 0;
}

/* Stub: always reports order 0. */
static inline int xas_get_order(struct xa_state *xas)
{
        return 0;
}

/* Stub: ignores @order and simply stores @entry at the current position. */
static inline void xas_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
        xas_store(xas, entry);
}

/* Stub: does nothing. */
static inline void xas_split_alloc(struct xa_state *xas, void *entry,
                unsigned int order, gfp_t gfp)
{
}

/* Stub: does nothing. */
static inline void xas_try_split(struct xa_state *xas, void *entry,
                unsigned int order)
{
}

/* Stub: always returns 0. */
static inline unsigned int xas_try_split_min_order(unsigned int order)
{
        return 0;
}

#endif

/**
 * xas_reload() - Refetch an entry from the xarray.
 * @xas: XArray operation state.
 *
 * Use this function to check that a previously loaded entry still has
 * the same value.  This is useful for the lockless pagecache lookup where
 * we walk the array with only the RCU lock to protect us, lock the page,
 * then check that the page hasn't moved since we looked it up.
 *
 * With no current node the head of the array is returned.  Without
 * CONFIG_XARRAY_MULTI the slot at the cached offset is read directly.
 * Otherwise the offset is recomputed from the index, and a sibling entry
 * found there is followed to the slot it refers to.
 *
 * The caller guarantees that @xas is still valid.  If it may be in an
 * error or restart state, call xas_load() instead.
 *
 * Return: The entry at this location in the xarray.
 */
static inline void *xas_reload(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        void *entry;
        char offset;

        if (!node)
                return xa_head(xas->xa);

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI)) {
                offset = xas->xa_offset;
                return xa_entry(xas->xa, node, offset);
        }

        offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK;
        entry = xa_entry(xas->xa, node, offset);
        if (xa_is_sibling(entry)) {
                /* Follow the sibling to the slot holding the real entry. */
                offset = xa_to_sibling(entry);
                entry = xa_entry(xas->xa, node, offset);
        }

        return entry;
}

/**
 * xas_set() - Set up XArray operation state for a different index.
 * @xas: XArray operation state.
 * @index: New index into the XArray.
 *
 * Move the operation state to refer to a different index.  The state is
 * put into %XAS_RESTART, which has the effect of starting a walk from
 * the top; see xas_next() to move to an adjacent index.
 */
static inline void xas_set(struct xa_state *xas, unsigned long index)
{
        xas->xa_node = XAS_RESTART;
        xas->xa_index = index;
}

/**
 * xas_advance() - Skip over sibling entries.
 * @xas: XArray operation state.
 * @index: Index of last sibling entry.
 *
 * Move the operation state to refer to the last sibling entry.
 * This is useful for loops that normally want to see sibling
 * entries but sometimes want to skip them.  Use xas_set() if you
 * want to move to an index which is not part of this entry.
 *
 * The offset is recomputed from @index using the current node's shift,
 * or a shift of 0 when @xas does not reference a node.
 */
static inline void xas_advance(struct xa_state *xas, unsigned long index)
{
        unsigned char shift = 0;

        if (xas_is_node(xas))
                shift = xas->xa_node->shift;

        xas->xa_index = index;
        xas->xa_offset = (index >> shift) & XA_CHUNK_MASK;
}

/**
 * xas_set_order() - Set up XArray operation state for a multislot entry.
 * @xas: XArray operation state.
 * @index: Target of the operation.
 * @order: Entry occupies 2^@order indices.
 *
 * With CONFIG_XARRAY_MULTI the index is rounded down to a multiple of
 * 2^@order (or set to 0 when @order is not below %BITS_PER_LONG), the
 * shift and sibling count are derived from @order, and the state is put
 * into %XAS_RESTART.  Without it, a non-zero @order is a BUG and the call
 * is otherwise the same as xas_set().
 */
static inline void xas_set_order(struct xa_state *xas, unsigned long index,
                                        unsigned int order)
{
#ifdef CONFIG_XARRAY_MULTI
        unsigned int rem = order % XA_CHUNK_SHIFT;

        if (order < BITS_PER_LONG)
                xas->xa_index = (index >> order) << order;
        else
                xas->xa_index = 0;
        xas->xa_shift = order - rem;
        xas->xa_sibs = (1 << rem) - 1;
        xas->xa_node = XAS_RESTART;
#else
        BUG_ON(order > 0);
        xas_set(xas, index);
#endif
}

/**
 * xas_set_update() - Set up XArray operation state for a callback.
 * @xas: XArray operation state.
 * @update: Function to call when updating a node.
 *
 * Records @update in the operation state.  The XArray can notify a
 * caller after it has updated an xa_node.  This is advanced
 * functionality and is only needed by the page cache and swap cache.
 */
static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update)
{
        xa_update_node_t callback = update;

        xas->xa_update = callback;
}

/* Record @lru in the operation state's xa_lru field. */
static inline void xas_set_lru(struct xa_state *xas, struct list_lru *lru)
{
        struct list_lru *target = lru;

        xas->xa_lru = target;
}

/**
 * xas_next_entry() - Advance iterator to next present entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * xas_next_entry() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find(), and will call xas_find()
 * for all the hard cases: when @xas is not positioned on a node with a
 * shift of 0, when the cached offset disagrees with the index, when @max
 * or the end of the node has been reached, or when the next slot holds an
 * internal entry.
 *
 * Return: The next present entry after the one currently referred to by @xas.
 */
static inline void *xas_next_entry(struct xa_state *xas, unsigned long max)
{
        struct xa_node *node = xas->xa_node;
        void *entry;

        if (unlikely(xas_not_node(node) || node->shift ||
                        xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)))
                return xas_find(xas, max);

        for (;;) {
                if (unlikely(xas->xa_index >= max))
                        return xas_find(xas, max);
                if (unlikely(xas->xa_offset == XA_CHUNK_MASK))
                        return xas_find(xas, max);

                /* Peek at the next slot before committing to the step. */
                entry = xa_entry(xas->xa, node, xas->xa_offset + 1);
                if (unlikely(xa_is_internal(entry)))
                        return xas_find(xas, max);

                xas->xa_offset++;
                xas->xa_index++;
                if (entry)
                        return entry;
        }
}

/*
 * Private: find the offset of the next bit set in the current node's
 * bitmap for @mark, starting at the current offset (or one past it when
 * @advance is true).  Returns XA_CHUNK_SIZE when no bit is found.
 */
static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance,
                xa_mark_t mark)
{
        unsigned long *bitmap = xas->xa_node->marks[(__force unsigned)mark];
        unsigned int start = xas->xa_offset;

        if (advance)
                start++;

        if (XA_CHUNK_SIZE != BITS_PER_LONG)
                return find_next_bit(bitmap, XA_CHUNK_SIZE, start);

        /* The whole chunk fits in one word: mask and scan it directly. */
        if (start < XA_CHUNK_SIZE) {
                unsigned long remaining = *bitmap & (~0UL << start);

                if (remaining)
                        return __ffs(remaining);
        }

        return XA_CHUNK_SIZE;
}

/**
 * xas_next_marked() - Advance iterator to next marked entry.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark to search for.
 *
 * xas_next_marked() is an inline function to optimise xarray traversal for
 * speed.  It is equivalent to calling xas_find_marked(), and will call
 * xas_find_marked() for all the hard cases: when @xas is not positioned
 * on a node with a shift of 0, when no further mark is set in the
 * current node, or when the marked slot is empty.
 *
 * Return: The next marked entry after the one currently referred to by @xas.
 */
static inline void *xas_next_marked(struct xa_state *xas, unsigned long max,
                                                                xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;
        unsigned int slot;
        void *entry;

        if (unlikely(xas_not_node(node) || node->shift))
                return xas_find_marked(xas, max, mark);

        slot = xas_find_chunk(xas, true, mark);
        xas->xa_offset = slot;
        xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + slot;
        if (xas->xa_index > max)
                return NULL;

        if (slot != XA_CHUNK_SIZE) {
                entry = xa_entry(xas->xa, node, slot);
                if (entry)
                        return entry;
        }

        return xas_find_marked(xas, max, mark);
}

/*
 * If iterating while holding a lock, drop the lock and reschedule
 * every %XA_CHECK_SCHED loops.
 *
 * This is only the constant; nothing in this header enforces the
 * interval.
 */
enum {
        XA_CHECK_SCHED = 4096,
};

/**
 * xas_for_each() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 *
 * The loop body will be executed for each entry present in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry comes from xas_find() and each later one from
 * xas_next_entry(); the loop ends when either returns %NULL.  @xas and
 * @max are evaluated on every iteration.
 */
#define xas_for_each(xas, entry, max) \
        for (entry = xas_find(xas, max); entry; \
             entry = xas_next_entry(xas, max))

/**
 * xas_for_each_marked() - Iterate over marked entries in a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 * @max: Maximum index to retrieve from array.
 * @mark: Mark to search for.
 *
 * The loop body will be executed for each marked entry in the xarray
 * between the current xas position and @max.  @entry will be set to
 * the entry retrieved from the xarray.  It is safe to delete entries
 * from the array in the loop body.  You should hold either the RCU lock
 * or the xa_lock while iterating.  If you need to drop the lock, call
 * xas_pause() first.
 *
 * The first entry comes from xas_find_marked() and each later one from
 * xas_next_marked(); the loop ends when either returns %NULL.  @xas,
 * @max and @mark are evaluated on every iteration.
 */
#define xas_for_each_marked(xas, entry, max, mark) \
        for (entry = xas_find_marked(xas, max, mark); entry; \
             entry = xas_next_marked(xas, max, mark))

/**
 * xas_for_each_conflict() - Iterate over a range of an XArray.
 * @xas: XArray operation state.
 * @entry: Entry retrieved from the array.
 *
 * The loop body will be executed for each entry in the XArray that
 * lies within the range specified by @xas.  If the loop terminates
 * normally, @entry will be %NULL.  The user may break out of the loop,
 * which will leave @entry set to the conflicting entry.  The caller
 * may also call xas_set_err() to exit the loop while setting an error
 * to record the reason.
 *
 * Each iteration assigns the result of xas_find_conflict() to @entry and
 * continues while that result is non-NULL.
 */
#define xas_for_each_conflict(xas, entry) \
        while ((entry = xas_find_conflict(xas)))

void *__xas_next(struct xa_state *);
void *__xas_prev(struct xa_state *);

/**
 * xas_prev() - Move iterator to previous index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * subtracted from the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * The inline fast path only handles a step within a node whose shift is
 * 0; every other case is handed to __xas_prev().
 *
 * If the iterator was referencing index 0, this function wraps
 * around to %ULONG_MAX.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_prev(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        bool slow_path = xas_not_node(node) || node->shift ||
                         xas->xa_offset == 0;

        if (unlikely(slow_path))
                return __xas_prev(xas);

        xas->xa_offset--;
        xas->xa_index--;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

/**
 * xas_next() - Move state to next index.
 * @xas: XArray operation state.
 *
 * If the @xas was in an error state, it will remain in an error state
 * and this function will return %NULL.  If the @xas has never been walked,
 * it will have the effect of calling xas_load().  Otherwise one will be
 * added to the index and the state will be walked to the correct
 * location in the array for the next operation.
 *
 * The inline fast path only handles a step within a node whose shift is
 * 0; every other case is handed to __xas_next().
 *
 * If the iterator was referencing index %ULONG_MAX, this function wraps
 * around to 0.
 *
 * Return: The entry at the new index.  This may be %NULL or an internal
 * entry.
 */
static inline void *xas_next(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        bool slow_path = xas_not_node(node) || node->shift ||
                         xas->xa_offset == XA_CHUNK_MASK;

        if (unlikely(slow_path))
                return __xas_next(xas);

        xas->xa_offset++;
        xas->xa_index++;
        return xa_entry(xas->xa, node, xas->xa_offset);
}

#endif /* _LINUX_XARRAY_H */































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/*
 * include/net/tipc.h: Include file for TIPC message header routines
 *
 * Copyright (c) 2017 Ericsson AB
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _TIPC_HDR_H
#define _TIPC_HDR_H

#include <linux/random.h>

/* Bits tested in host-order header word 0 to recognise a keepalive message. */
#define KEEPALIVE_MSG_MASK 0x0e080000  /* LINK_PROTOCOL + MSG_IS_KEEPALIVE */

/*
 * First four 32-bit words of a TIPC message header, in network byte
 * order.  tipc_hdr_rps_key() reads word 0 for the keepalive test and
 * returns word 3 as the key.
 */
struct tipc_basic_hdr {
        __be32 w[4];
};

/*
 * Compute the key for a TIPC header.  For ordinary messages this is
 * header word 3 (the source node identity).  For keepalive messages,
 * i.e. those with every KEEPALIVE_MSG_MASK bit set in word 0, a random
 * key is returned instead.
 */
static inline __be32 tipc_hdr_rps_key(struct tipc_basic_hdr *hdr)
{
        __be32 key;

        /* Return source node identity as key */
        if (likely((ntohl(hdr->w[0]) & KEEPALIVE_MSG_MASK) !=
                   KEEPALIVE_MSG_MASK))
                return hdr->w[3];

        /* Spread PROBE/PROBE_REPLY messages across the cores */
        get_random_bytes(&key, sizeof(key));
        return key;
}

#endif



























  701 













  701 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// SPDX-License-Identifier: GPL-2.0
#include <linux/fault-inject.h>
#include <linux/debugfs.h>
#include <linux/error-injection.h>
#include <linux/mm.h>

/*
 * State for page-allocation fault injection.  By default, allocations
 * with __GFP_HIGHMEM or __GFP_DIRECT_RECLAIM set are never failed, and
 * neither are allocations of an order below 1.
 */
static struct {
        struct fault_attr attr;

        /* Never fail allocations that have __GFP_HIGHMEM set */
        bool ignore_gfp_highmem;
        /* Never fail allocations that have __GFP_DIRECT_RECLAIM set */
        bool ignore_gfp_reclaim;
        /* Allocations of a lower order are never failed */
        u32 min_order;
} fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_gfp_highmem = true,
        .ignore_gfp_reclaim = true,
        .min_order = 1,
};

/*
 * Handler for the "fail_page_alloc=" boot parameter: hands the option
 * string to setup_fault_attr() for the fail_page_alloc attributes and
 * returns its result.
 */
static int __init setup_fail_page_alloc(char *str)
{
        int ret = setup_fault_attr(&fail_page_alloc.attr, str);

        return ret;
}
__setup("fail_page_alloc=", setup_fail_page_alloc);

/*
 * Decide whether a page allocation should be failed artificially.
 *
 * Allocations are never failed when @order is below the configured
 * minimum, when __GFP_NOFAIL is set, or when the highmem / direct-reclaim
 * filters are enabled and the matching GFP flag is set.  Otherwise the
 * decision is made by should_fail_ex() with a size of 1 << @order;
 * FAULT_NOWARN is passed along when the caller set __GFP_NOWARN.
 */
bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
        int flags;

        if (order < fail_page_alloc.min_order ||
            (gfp_mask & __GFP_NOFAIL) ||
            (fail_page_alloc.ignore_gfp_highmem &&
             (gfp_mask & __GFP_HIGHMEM)) ||
            (fail_page_alloc.ignore_gfp_reclaim &&
             (gfp_mask & __GFP_DIRECT_RECLAIM)))
                return false;

        /* See comment in __should_failslab() */
        flags = (gfp_mask & __GFP_NOWARN) ? FAULT_NOWARN : 0;

        return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags);
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Create the "fail_page_alloc" debugfs directory and expose the three
 * filter knobs in it.  Note that the "ignore-gfp-wait" file is backed by
 * the ignore_gfp_reclaim field.  Always returns 0.
 */
static int __init fail_page_alloc_debugfs(void)
{
        const umode_t mode = S_IFREG | 0600;
        struct dentry *dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
                                                       &fail_page_alloc.attr);

        debugfs_create_bool("ignore-gfp-wait", mode, dir,
                            &fail_page_alloc.ignore_gfp_reclaim);
        debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                            &fail_page_alloc.ignore_gfp_highmem);
        debugfs_create_u32("min-order", mode, dir,
                           &fail_page_alloc.min_order);

        return 0;
}

late_initcall(fail_page_alloc_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

































































































































































































































































































































































































































































































 1401 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PREEMPT_H
#define __LINUX_PREEMPT_H

/*
 * include/linux/preempt.h - macros for accessing and manipulating
 * preempt_count (used for kernel preemption, interrupt count, etc.)
 */

#include <linux/linkage.h>
#include <linux/cleanup.h>
#include <linux/types.h>

/*
 * We put the hardirq and softirq counter into the preemption
 * counter. The bitmask has the following meaning:
 *
 * - bits 0-7 are the preemption count (max preemption depth: 256)
 * - bits 8-15 are the softirq count (max # of softirqs: 256)
 *
 * The hardirq count could in theory be the same as the number of
 * interrupts in the system, but we run all interrupt handlers with
 * interrupts disabled, so we cannot have nesting interrupts. Though
 * there are a few palaeontologic drivers which reenable interrupts in
 * the handler, so we need more than one bit here.
 *
 *         PREEMPT_MASK:        0x000000ff
 *         SOFTIRQ_MASK:        0x0000ff00
 *         HARDIRQ_MASK:        0x000f0000
 *             NMI_MASK:        0x00f00000
 * PREEMPT_NEED_RESCHED:        0x80000000
 */
/* Width, in bits, of each counter packed into preempt_count. */
#define PREEMPT_BITS        8
#define SOFTIRQ_BITS        8
#define HARDIRQ_BITS        4
#define NMI_BITS        4

/* Bit position of each counter; every field starts where the previous one ends. */
#define PREEMPT_SHIFT        0
#define SOFTIRQ_SHIFT        (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT        (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT        (HARDIRQ_SHIFT + HARDIRQ_BITS)

/* Mask with the low @x bits set. */
#define __IRQ_MASK(x)        ((1UL << (x))-1)

/* Mask selecting each counter within preempt_count. */
#define PREEMPT_MASK        (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK        (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK        (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK        (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)

/* Value that increments each counter by one. */
#define PREEMPT_OFFSET        (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET        (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET        (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET        (1UL << NMI_SHIFT)

/* Twice the softirq unit. */
#define SOFTIRQ_DISABLE_OFFSET        (2 * SOFTIRQ_OFFSET)

/* PREEMPT_DISABLE_OFFSET and PREEMPT_ENABLED are defined outside this section. */
#define PREEMPT_DISABLED        (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/*
 * Disable preemption until the scheduler is running -- use an unconditional
 * value so that it also works on !PREEMPT_COUNT kernels.
 *
 * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 */
#define INIT_PREEMPT_COUNT        PREEMPT_OFFSET

/*
 * Initial preempt_count value; reflects the preempt_count schedule invariant
 * which states that during context switches:
 *
 *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 *
 * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 * Note: See finish_task_switch().
 */
#define FORK_PREEMPT_COUNT        (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)

/* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */
#include <asm/preempt.h>

/**
 * interrupt_context_level - return interrupt context level
 *
 * Returns the current interrupt context level.
 *  0 - normal context
 *  1 - softirq context
 *  2 - hardirq context
 *  3 - NMI context
 */
static __always_inline unsigned char interrupt_context_level(void)
{
        /* Sample preempt_count() once and test the snapshot three times. */
        const unsigned long count = preempt_count();
        const unsigned long hard_bits = NMI_MASK | HARDIRQ_MASK;
        unsigned char depth;

        /*
         * Each test contributes 0 or 1.  NMI bits satisfy all three tests,
         * hardirq bits the last two, and the SOFTIRQ_OFFSET bit only the
         * last one, which yields the 3/2/1/0 encoding.
         */
        depth  = (count & NMI_MASK) != 0;
        depth += (count & hard_bits) != 0;
        depth += (count & (hard_bits | SOFTIRQ_OFFSET)) != 0;

        return depth;
}

/*
 * These macro definitions avoid redundant invocations of preempt_count()
 * because such invocations would result in redundant loads given that
 * preempt_count() is commonly implemented with READ_ONCE().
 */

/*
 * Each *_count() macro yields the corresponding field of the counter still
 * in place (masked, not shifted down), so the result is non-zero exactly
 * when that field is non-zero.
 */
#define nmi_count()        (preempt_count() & NMI_MASK)
#define hardirq_count()        (preempt_count() & HARDIRQ_MASK)
#ifdef CONFIG_PREEMPT_RT
/*
 * On PREEMPT_RT the softirq field is read from current->softirq_disable_cnt
 * instead of preempt_count(); irq_count() therefore combines the NMI and
 * hardirq bits of preempt_count() with that separate softirq value.
 */
# define softirq_count()        (current->softirq_disable_cnt & SOFTIRQ_MASK)
# define irq_count()                ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count())
#else
/* Without PREEMPT_RT all three fields come from one preempt_count() read. */
# define softirq_count()        (preempt_count() & SOFTIRQ_MASK)
# define irq_count()                (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
#endif

/*
 * Macros to retrieve the current execution context:
 *
 * in_nmi()                - We're in NMI context
 * in_hardirq()                - We're in hard IRQ context
 * in_serving_softirq()        - We're in softirq context
 * in_task()                - We're in task context
 */
#define in_nmi()                (nmi_count())
#define in_hardirq()                (hardirq_count())
#define in_serving_softirq()        (softirq_count() & SOFTIRQ_OFFSET)
#ifdef CONFIG_PREEMPT_RT
# define in_task()                (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq()))
#else
# define in_task()                (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
#endif

/*
 * The following macros are deprecated and should not be used in new code:
 * in_irq()       - Obsolete version of in_hardirq()
 * in_softirq()   - We have BH disabled, or are processing softirqs
 * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
 */
#define in_irq()                (hardirq_count())
#define in_softirq()                (softirq_count())
#define in_interrupt()                (irq_count())

/*
 * The preempt_count offset after preempt_disable();
 */
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET        PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET        0
#endif

/*
 * The preempt_count offset after spin_lock()
 */
#if !defined(CONFIG_PREEMPT_RT)
#define PREEMPT_LOCK_OFFSET                PREEMPT_DISABLE_OFFSET
#else
/* Locks on RT do not disable preemption */
#define PREEMPT_LOCK_OFFSET                0
#endif

/*
 * The preempt_count offset needed for things like:
 *
 *  spin_lock_bh()
 *
 * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and
 * softirqs, such that unlock sequences of:
 *
 *  spin_unlock();
 *  local_bh_enable();
 *
 * Work as expected.
 */
#define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET)

/*
 * Are we running in atomic context?  WARNING: this macro cannot
 * always detect atomic context; in particular, it cannot know about
 * held spinlocks in non-preemptible kernels.  Thus it should not be
 * used in the general case to determine whether sleeping is possible.
 * Do not use in_atomic() in driver code.
 */
#define in_atomic()        (preempt_count() != 0)

/*
 * Check whether we were atomic before we did preempt_disable():
 * (used by the scheduler)
 */
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)

/*
 * preempt_count_add()/preempt_count_sub(): adjust the preempt counter by @val.
 *
 * With CONFIG_DEBUG_PREEMPT or CONFIG_TRACE_PREEMPT_TOGGLE these are
 * out-of-line functions and preempt_count_dec_and_test() is built from
 * preempt_count_sub(1) followed by should_resched(0).  Otherwise all three
 * map directly onto the __preempt_count_*() primitives, which are not
 * defined in this file (presumably provided by <asm/preempt.h>).
 */
#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE)
extern void preempt_count_add(int val);
extern void preempt_count_sub(int val);
#define preempt_count_dec_and_test() \
        ({ preempt_count_sub(1); should_resched(0); })
#else
#define preempt_count_add(val)        __preempt_count_add(val)
#define preempt_count_sub(val)        __preempt_count_sub(val)
#define preempt_count_dec_and_test() __preempt_count_dec_and_test()
#endif

/* Adjust by one, always using the raw __preempt_count_*() primitives. */
#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)

/* Adjust by one through preempt_count_add()/preempt_count_sub() above. */
#define preempt_count_inc() preempt_count_add(1)
#define preempt_count_dec() preempt_count_sub(1)

#ifdef CONFIG_PREEMPT_COUNT

/*
 * Bump the preempt counter, then issue barrier() so that code from the
 * protected region is not moved in front of the increment.
 */
#define preempt_disable() \
do { \
        preempt_count_inc(); \
        barrier(); \
} while (0)

/*
 * Mirror image of preempt_disable(): barrier() first, then drop the
 * counter.  No reschedule check is made here.
 */
#define sched_preempt_enable_no_resched() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

/* Alias of sched_preempt_enable_no_resched(). */
#define preempt_enable_no_resched() sched_preempt_enable_no_resched()

/* True only when preempt_count() is zero and interrupts are not disabled. */
#define preemptible()        (preempt_count() == 0 && !irqs_disabled())

#ifdef CONFIG_PREEMPTION
/*
 * Drop the preempt counter and call __preempt_schedule() when
 * preempt_count_dec_and_test() reports true.
 */
#define preempt_enable() \
do { \
        barrier(); \
        if (unlikely(preempt_count_dec_and_test())) \
                __preempt_schedule(); \
} while (0)

/*
 * Same shape as preempt_enable(), but built on the raw
 * __preempt_count_dec_and_test() and __preempt_schedule_notrace().
 */
#define preempt_enable_notrace() \
do { \
        barrier(); \
        if (unlikely(__preempt_count_dec_and_test())) \
                __preempt_schedule_notrace(); \
} while (0)

/* Call __preempt_schedule() if should_resched(0) says so; count untouched. */
#define preempt_check_resched() \
do { \
        if (should_resched(0)) \
                __preempt_schedule(); \
} while (0)

#else /* !CONFIG_PREEMPTION */
/* Without CONFIG_PREEMPTION enabling only drops the counter. */
#define preempt_enable() \
do { \
        barrier(); \
        preempt_count_dec(); \
} while (0)

#define preempt_enable_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)

/* Nothing to check without CONFIG_PREEMPTION. */
#define preempt_check_resched() do { } while (0)
#endif /* CONFIG_PREEMPTION */

/*
 * The _notrace variants use __preempt_count_inc()/__preempt_count_dec()
 * directly, so they never go through the out-of-line preempt_count_add()/
 * preempt_count_sub() selected by CONFIG_DEBUG_PREEMPT or
 * CONFIG_TRACE_PREEMPT_TOGGLE.
 */
#define preempt_disable_notrace() \
do { \
        __preempt_count_inc(); \
        barrier(); \
} while (0)

/* Drop the counter without any reschedule check. */
#define preempt_enable_no_resched_notrace() \
do { \
        barrier(); \
        __preempt_count_dec(); \
} while (0)

#else /* !CONFIG_PREEMPT_COUNT */

/*
 * Even if we don't have any preemption, we need preempt disable/enable
 * to be barriers, so that we don't have things like get_user/put_user
 * that can cause faults and scheduling migrate into our preempt-protected
 * region.
 */
#define preempt_disable()                        barrier()
#define sched_preempt_enable_no_resched()        barrier()
#define preempt_enable_no_resched()                barrier()
#define preempt_enable()                        barrier()
#define preempt_check_resched()                        do { } while (0)

#define preempt_disable_notrace()                barrier()
#define preempt_enable_no_resched_notrace()        barrier()
#define preempt_enable_notrace()                barrier()
#define preemptible()                                0

#endif /* CONFIG_PREEMPT_COUNT */

#ifdef MODULE
/*
 * Modules have no business playing preemption tricks.
 */
#undef sched_preempt_enable_no_resched
#undef preempt_enable_no_resched
#undef preempt_enable_no_resched_notrace
#undef preempt_check_resched
#endif

/*
 * Statement wrapper around set_preempt_need_resched(), which is not defined
 * in this file (presumably provided by <asm/preempt.h>).
 */
#define preempt_set_need_resched() \
do { \
        set_preempt_need_resched(); \
} while (0)
/* Call set_preempt_need_resched() only when tif_need_resched() is true. */
#define preempt_fold_need_resched() \
do { \
        if (tif_need_resched()) \
                set_preempt_need_resched(); \
} while (0)

#ifdef CONFIG_PREEMPT_NOTIFIERS

struct preempt_notifier;
struct task_struct;

/**
 * struct preempt_ops - notifiers called when a task is preempted and rescheduled
 * @sched_in: we're about to be rescheduled:
 *    notifier: struct preempt_notifier for the task being scheduled
 *    cpu:  cpu we're scheduled on
 * @sched_out: we've just been preempted
 *    notifier: struct preempt_notifier for the task being preempted
 *    next: the task that's kicking us out
 *
 * Please note that sched_in and sched_out are called under different
 * contexts.  sched_out is called with rq lock held and irq disabled
 * while sched_in is called without rq lock and irq enabled.  This
 * difference is intentional and depended upon by its users.
 */
struct preempt_ops {
        void (*sched_in)(struct preempt_notifier *notifier, int cpu);
        void (*sched_out)(struct preempt_notifier *notifier,
                          struct task_struct *next);
};

/**
 * struct preempt_notifier - key for installing preemption notifiers
 * @link: internal use
 * @ops: defines the notifier functions to be called
 *
 * Usually used in conjunction with container_of().
 */
struct preempt_notifier {
        struct hlist_node link;
        struct preempt_ops *ops;
};

void preempt_notifier_inc(void);
void preempt_notifier_dec(void);
void preempt_notifier_register(struct preempt_notifier *notifier);
void preempt_notifier_unregister(struct preempt_notifier *notifier);

static inline void preempt_notifier_init(struct preempt_notifier *notifier,
                                     struct preempt_ops *ops)
{
        /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */
        notifier->link.next = NULL;
        notifier->link.pprev = NULL;
        notifier->ops = ops;
}

#endif

#ifdef CONFIG_SMP

/*
 * Migrate-Disable and why it is undesired.
 *
 * When a preempted task becomes eligible to run under the ideal model (IOW it
 * becomes one of the M highest priority tasks), it might still have to wait
 * for the preemptee's migrate_disable() section to complete, thereby suffering
 * a reduction in bandwidth for the exact duration of the migrate_disable()
 * section.
 *
 * Per this argument, the change from preempt_disable() to migrate_disable()
 * gets us:
 *
 * - a higher priority task gains reduced wake-up latency; with preempt_disable()
 *   it would have had to wait for the lower priority task.
 *
 * - a lower priority task, which under preempt_disable() could've instantly
 *   migrated away when another CPU becomes available, is now constrained
 *   by the ability to push the higher priority task away, which might itself be
 *   in a migrate_disable() section, reducing its available bandwidth.
 *
 * IOW it trades latency / moves the interference term, but it stays in the
 * system, and as long as it remains unbounded, the system is not fully
 * deterministic.
 *
 *
 * The reason we have it anyway.
 *
 * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
 * number of primitives into becoming preemptible, they would also allow
 * migration. This turns out to break a bunch of per-cpu usage. To this end,
 * all these primitives employ migrate_disable() to restore this implicit
 * assumption.
 *
 * This is a 'temporary' work-around at best. The correct solution is getting
 * rid of the above assumptions and reworking the code to employ explicit
 * per-cpu locking or short preempt-disable regions.
 *
 * The end goal must be to get rid of migrate_disable(); alternatively we need
 * a schedulability theory that does not depend on arbitrary migration.
 *
 *
 * Notes on the implementation.
 *
 * The implementation is particularly tricky since existing code patterns
 * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
 * This means that it cannot use cpus_read_lock() to serialize against hotplug,
 * nor can it easily migrate itself into a pending affinity mask change on
 * migrate_enable().
 *
 *
 * Note: even non-work-conserving schedulers like semi-partitioned depend on
 *       migration, so migrate_disable() is not only a problem for
 *       work-conserving schedulers.
 *
 */
extern void migrate_disable(void);
extern void migrate_enable(void);

#else

/* !CONFIG_SMP: both calls compile to nothing. */
static inline void migrate_disable(void) { }
static inline void migrate_enable(void) { }

#endif /* CONFIG_SMP */

/**
 * preempt_disable_nested - Disable preemption inside a normally preempt disabled section
 *
 * Use for code which requires preemption protection inside a critical
 * section which has preemption disabled implicitly on non-PREEMPT_RT
 * enabled kernels, by e.g.:
 *  - holding a spinlock/rwlock
 *  - soft interrupt context
 *  - regular interrupt handlers
 *
 * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft
 * interrupt context and regular interrupt handlers are preemptible and
 * only prevent migration. preempt_disable_nested() ensures that preemption
 * is disabled for cases which require CPU local serialization even on
 * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP.
 *
 * The use cases are code sequences which are not serialized by a
 * particular lock instance, e.g.:
 *  - seqcount write side critical sections where the seqcount is not
 *    associated to a particular lock and therefore the automatic
 *    protection mechanism does not work. This prevents a live lock
 *    against a preempting high priority reader.
 *  - RMW per CPU variable updates like vmstat.
 */
/* Macro to avoid header recursion hell vs. lockdep */
#define preempt_disable_nested()                                \
do {                                                                \
        if (IS_ENABLED(CONFIG_PREEMPT_RT))                        \
                preempt_disable();                                \
        else                                                        \
                lockdep_assert_preemption_disabled();                \
} while (0)

/**
 * preempt_enable_nested - Undo the effect of preempt_disable_nested()
 */
static __always_inline void preempt_enable_nested(void)
{
        /*
         * preempt_disable_nested() only calls preempt_disable() on
         * PREEMPT_RT kernels, so only those need the matching enable.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                return;

        preempt_enable();
}

/*
 * Guard definitions pairing each disable operation with its matching enable.
 * DEFINE_LOCK_GUARD_0() is not defined in this file -- presumably it comes
 * from <linux/cleanup.h>; see that header for how the guards are used.
 */
DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

#ifdef CONFIG_PREEMPT_DYNAMIC

extern bool preempt_model_none(void);
extern bool preempt_model_voluntary(void);
extern bool preempt_model_full(void);
extern bool preempt_model_lazy(void);

#else

/*
 * Without CONFIG_PREEMPT_DYNAMIC each query simply reports whether the
 * corresponding Kconfig option was selected at build time.
 */
static inline bool preempt_model_none(void)
{
        const bool selected = IS_ENABLED(CONFIG_PREEMPT_NONE);

        return selected;
}

static inline bool preempt_model_voluntary(void)
{
        const bool selected = IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);

        return selected;
}

static inline bool preempt_model_full(void)
{
        const bool selected = IS_ENABLED(CONFIG_PREEMPT);

        return selected;
}

static inline bool preempt_model_lazy(void)
{
        const bool selected = IS_ENABLED(CONFIG_PREEMPT_LAZY);

        return selected;
}

#endif

/* Report whether the kernel was built with CONFIG_PREEMPT_RT. */
static inline bool preempt_model_rt(void)
{
        const bool selected = IS_ENABLED(CONFIG_PREEMPT_RT);

        return selected;
}

extern const char *preempt_model_str(void);

/*
 * Does the preemption model allow non-cooperative preemption?
 *
 * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
 * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
 * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
 * PREEMPT_NONE model.
 */
static inline bool preempt_model_preemptible(void)
{
        return preempt_model_full() || preempt_model_lazy() || preempt_model_rt();
}

#endif /* __LINUX_PREEMPT_H */





























































































   66 






















    9 
   66 


















































    6 



























































   29 
































































































































































































































































































































































   16 
   20 




















































    3 





  204 
   81 




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_H
#define _LINUX_RCULIST_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list.h>
#include <linux/rcupdate.h>

/*
 * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers
 * @list: list to be initialized
 *
 * You should instead use INIT_LIST_HEAD() for normal initialization and
 * cleanup tasks, when readers have no access to the list being initialized.
 * However, if the list being initialized is visible to readers, you
 * need to keep the compiler from being too mischievous.
 */
static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
{
        /*
         * Point both links back at the head itself (the empty-list state).
         * WRITE_ONCE() is used for each store because readers may be looking
         * at this head while it is being reinitialized.
         */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

/*
 * Return the ->next pointer of a list_head in an RCU-safe way.  Don't
 * access it directly.
 */
#define list_next_rcu(list)        (*((struct list_head __rcu **)(&(list)->next)))
/*
 * Return the ->prev pointer of a list_head in an rcu safe way. Don't
 * access it directly.
 *
 * Any list traversed with list_bidir_prev_rcu() must never use
 * list_del_rcu().  Doing so will poison the ->prev pointer that
 * list_bidir_prev_rcu() relies on, which will result in segfaults.
 * To prevent these segfaults, use list_bidir_del_rcu() instead
 * of list_del_rcu().
 */
#define list_bidir_prev_rcu(list) (*((struct list_head __rcu **)(&(list)->prev)))

/**
 * list_tail_rcu - returns the prev pointer of the head of the list
 * @head: the head of the list
 *
 * Note: This should only be used with the list header, and even then
 * only if list_del() and similar primitives are not also used on the
 * list header.
 */
#define list_tail_rcu(head)        (*((struct list_head __rcu **)(&(head)->prev)))

/*
 * Check during list traversal that we are within an RCU reader
 */

/*
 * Expands to nothing.  It exists only so that handing more than one
 * "extra" argument to __list_check_rcu() fails at preprocessing time,
 * since this macro accepts exactly one parameter.
 */
#define check_arg_count_one(dummy)

#ifdef CONFIG_PROVE_RCU_LIST
/*
 * Warn when @cond is false and rcu_read_lock_any_held() is also false.
 * The first parameter is not used by the expansion.
 */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({                                                                \
        check_arg_count_one(extra);                                        \
        RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(),                \
                         "RCU-list traversed in non-reader section!");        \
        })

/* Warn when @cond is false; no RCU read-side check is made here. */
#define __list_check_srcu(cond)                                         \
        ({                                                                 \
        RCU_LOCKDEP_WARN(!(cond),                                         \
                "RCU-list traversed without holding the required lock!");\
        })
#else
/* Without CONFIG_PROVE_RCU_LIST only the argument-count check remains. */
#define __list_check_rcu(dummy, cond, extra...)                                \
        ({ check_arg_count_one(extra); })

#define __list_check_srcu(cond) ({ })
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_add_rcu(struct list_head *new,
                struct list_head *prev, struct list_head *next)
{
        /* Leave the list untouched if the sanity check rejects the insert. */
        if (!__list_add_valid(new, prev, next))
                return;

        /* Fill in the new entry's own links before anyone can reach it... */
        new->next = next;
        new->prev = prev;
        /* ...then make it reachable through prev->next. */
        rcu_assign_pointer(list_next_rcu(prev), new);
        /* The backward link is fixed up last, with a plain store. */
        next->prev = new;
}

/**
 * list_add_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_rcu(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        /* Link @new between @head and @head's current successor. */
        __list_add_rcu(new, head, first);
}

/**
 * list_add_tail_rcu - add a new entry to rcu-protected list
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_add_tail_rcu()
 * or list_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 */
static inline void list_add_tail_rcu(struct list_head *new,
                                     struct list_head *head)
{
        struct list_head *tail = head->prev;

        /* Link @new between @head's current predecessor and @head. */
        __list_add_rcu(new, tail, head);
}

/**
 * list_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * Note: list_empty() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as list_del_rcu()
 * or list_add_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_del_rcu(struct list_head *entry)
{
        __list_del_entry(entry);
        /* Only ->prev is poisoned here; ->next is not written. */
        entry->prev = LIST_POISON2;
}

/**
 * list_bidir_del_rcu - deletes entry from list without re-initialization
 * @entry: the element to delete from the list.
 *
 * In contrast to list_del_rcu() doesn't poison the prev pointer thus
 * allowing backwards traversal via list_bidir_prev_rcu().
 *
 * Note: list_empty() on entry does not return true after this because
 * the entry is in a special undefined state that permits RCU-based
 * lockfree reverse traversal. In particular this means that we can not
 * poison the forward and backwards pointers that may still be used for
 * walking the list.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another list-mutation
 * primitive, such as list_bidir_del_rcu() or list_add_rcu(), running on
 * this same list. However, it is perfectly legal to run concurrently
 * with the _rcu list-traversal primitives, such as
 * list_for_each_entry_rcu().
 *
 * Note that list_del_rcu() and list_bidir_del_rcu() must not be used on
 * the same list.
 *
 * Note that the caller is not permitted to immediately free
 * the newly deleted entry.  Instead, either synchronize_rcu()
 * or call_rcu() must be used to defer freeing until an RCU
 * grace period has elapsed.
 */
static inline void list_bidir_del_rcu(struct list_head *entry)
{
        /*
         * Unlink only: unlike list_del_rcu(), entry->prev is not
         * overwritten with LIST_POISON2 afterwards.
         */
        __list_del_entry(entry);
}

/**
 * hlist_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on the node returns true after this. It is
 * useful for RCU based read lockfree traversal if the writer side
 * must know if the list entry is still hashed or already unhashed.
 *
 * In particular, it means that we can not poison the forward pointers
 * that may still be used for walking the hash list and we can only
 * zero the pprev pointer so hlist_unhashed() will return true after
 * this.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_add_head_rcu() or
 * hlist_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_for_each_entry_rcu().
 */
static inline void hlist_del_init_rcu(struct hlist_node *n)
{
        /* Nothing to do for a node that is already unhashed. */
        if (!hlist_unhashed(n)) {
                __hlist_del(n);
                /* Clear ->pprev only; ->next is not written here. */
                WRITE_ONCE(n->pprev, NULL);
        }
}

/**
 * list_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically from
 * the perspective of concurrent readers.  It is the caller's responsibility
 * to synchronize with concurrent updaters, if any.
 *
 * Note: @old should not be empty.
 */
static inline void list_replace_rcu(struct list_head *old,
                                struct list_head *new)
{
        /* Give @new the same neighbours as @old before it becomes reachable. */
        new->next = old->next;
        new->prev = old->prev;
        /* Switch the predecessor's ->next from @old to @new. */
        rcu_assign_pointer(list_next_rcu(new->prev), new);
        /* Fix up the successor's backward link with a plain store. */
        new->next->prev = new;
        /* Only @old's ->prev is poisoned; old->next is not written. */
        old->prev = LIST_POISON2;
}

/**
 * __list_splice_init_rcu - join an RCU-protected list into an existing list.
 * @list:        the RCU-protected list to splice
 * @prev:        points to the last element of the existing list
 * @next:        points to the first element of the existing list
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * The list pointed to by @prev and @next can be RCU-read traversed
 * concurrently with this function.
 *
 * Note that this function blocks.
 *
 * Important note: the caller must take whatever action is necessary to prevent
 * any other updates to the existing list.  In principle, it is possible to
 * modify the list as soon as sync() begins execution. If this sort of thing
 * becomes necessary, an alternative version based on call_rcu() could be
 * created.  But only if -really- needed -- there is no shortage of RCU API
 * members.
 */
static inline void __list_splice_init_rcu(struct list_head *list,
                                          struct list_head *prev,
                                          struct list_head *next,
                                          void (*sync)(void))
{
        /* Remember both ends of the source list's body before resetting it. */
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        /*
         * "first" and "last" now track the list body, so the head itself
         * can be reinitialized.  RCU readers have access to this list, so
         * we must use INIT_LIST_HEAD_RCU() instead of INIT_LIST_HEAD().
         */

        INIT_LIST_HEAD_RCU(list);

        /*
         * At this point, the list body still points to the source list.
         * Wait for any readers to finish using the list before splicing
         * the list body into the new list.  Any new readers will see
         * an empty list.
         */

        sync();
        ASSERT_EXCLUSIVE_ACCESS(*first);
        ASSERT_EXCLUSIVE_ACCESS(*last);

        /*
         * Readers are finished with the source list, so perform splice.
         * The order is important if the new list is global and accessible
         * to concurrent RCU readers.  Note that RCU readers are not
         * permitted to traverse the prev pointers without excluding
         * this function.
         */

        /* Attach the tail of the body first, then link in its head. */
        last->next = next;
        rcu_assign_pointer(list_next_rcu(prev), first);
        /* Backward links are fixed up afterwards with plain stores. */
        first->prev = prev;
        next->prev = last;
}

/**
 * list_splice_init_rcu - move an RCU-protected list to the front of an
 *                        existing list (stack-style insertion).
 * @list:        the RCU-protected list to splice; it is reinitialized to empty
 * @head:        the entries of @list are inserted directly after this node
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * An empty @list is left untouched and @sync is not invoked.
 */
static inline void list_splice_init_rcu(struct list_head *list,
                                        struct list_head *head,
                                        void (*sync)(void))
{
        /* Nothing to move, so skip the grace-period wait entirely. */
        if (list_empty(list))
                return;

        __list_splice_init_rcu(list, head, head->next, sync);
}

/**
 * list_splice_tail_init_rcu - move an RCU-protected list to the end of an
 *                             existing list (queue-style insertion).
 * @list:        the RCU-protected list to splice; it is reinitialized to empty
 * @head:        the entries of @list are inserted directly before this node
 * @sync:        synchronize_rcu, synchronize_rcu_expedited, ...
 *
 * An empty @list is left untouched and @sync is not invoked.
 */
static inline void list_splice_tail_init_rcu(struct list_head *list,
                                             struct list_head *head,
                                             void (*sync)(void))
{
        /* Nothing to move, so skip the grace-period wait entirely. */
        if (list_empty(list))
                return;

        __list_splice_init_rcu(list, head->prev, head, sync);
}

/**
 * list_entry_rcu - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * @ptr is loaded exactly once with READ_ONCE() and the loaded value is
 * converted to a pointer to the enclosing @type with container_of().
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_entry_rcu(ptr, type, member) \
        container_of(READ_ONCE(ptr), type, member)

/*
 * Where are list_empty_rcu() and list_first_entry_rcu()?
 *
 * They do not exist because they would lead to subtle race conditions:
 *
 * if (!list_empty_rcu(mylist)) {
 *        struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member);
 *        do_something(bar);
 * }
 *
 * The list might be non-empty when list_empty_rcu() checks it, but it
 * might have become empty by the time that list_first_entry_rcu() rereads
 * the ->next pointer, which would result in a SEGV.
 *
 * When not using RCU, it is OK for list_first_entry() to re-read that
 * pointer because both functions should be protected by some lock that
 * blocks writers.
 *
 * When using RCU, list_empty() uses READ_ONCE() to fetch the
 * RCU-protected ->next pointer and then compares it to the address of the
 * list head.  However, it neither dereferences this pointer nor provides
 * this pointer to its caller.  Thus, READ_ONCE() suffices (that is,
 * rcu_dereference() is not needed), which means that list_empty() can be
 * used anywhere you would want to use list_empty_rcu().  Just don't
 * expect anything useful to happen if you do a subsequent lockless
 * call to list_first_entry_rcu()!!!
 *
 * See list_first_or_null_rcu for an alternative.
 */

/**
 * list_first_or_null_rcu - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated once and its ->next pointer is loaded once; the same
 * loaded value is used both for the emptiness check and for the returned
 * entry, so the two cannot disagree.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_first_or_null_rcu(ptr, type, member) \
({ \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \
})

/**
 * list_next_or_null_rcu - get the next element from a list
 * @head:        the head for the list.
 * @ptr:        the list head to take the next element from.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * Note that if the ptr is at the end of the list, NULL is returned.
 *
 * @head and @ptr are each evaluated once, and @ptr's ->next pointer is
 * loaded once; that single loaded value is compared against @head and,
 * if different, converted to the returned entry.
 *
 * This primitive may safely run concurrently with the _rcu list-mutation
 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
 */
#define list_next_or_null_rcu(head, ptr, type, member) \
({ \
        struct list_head *__head = (head); \
        struct list_head *__ptr = (ptr); \
        struct list_head *__next = READ_ONCE(__ptr->next); \
        likely(__next != __head) ? list_entry_rcu(__next, type, \
                                                  member) : NULL; \
})

/**
 * list_for_each_entry_rcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * @cond, when given, is forwarded to __list_check_rcu(), which is evaluated
 * once before the first iteration.  @pos and @head are expanded several
 * times, so they must not have side effects.
 */
#define list_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_srcu        -        iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as list_add_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike list_for_each_entry_rcu(), @cond is mandatory here; it is passed
 * to __list_check_srcu(), which is evaluated once before the first
 * iteration.  @pos and @head are expanded several times, so they must not
 * have side effects.
 */
#define list_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = list_entry_rcu((head)->next, typeof(*pos), member);        \
                &pos->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_entry_lockless - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:       the type of the struct this is embedded in.
 * @member:     the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * @ptr is loaded once with READ_ONCE(); the loaded value is cast back to
 * the type of @ptr before container_of() is applied.
 */
#define list_entry_lockless(ptr, type, member) \
        container_of((typeof(ptr))READ_ONCE(ptr), type, member)

/**
 * list_for_each_entry_lockless - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * This primitive may safely run concurrently with the _rcu
 * list-mutation primitives such as list_add_rcu(), but requires some
 * implicit RCU read-side guarding.  One example is running within a special
 * exception-time environment where preemption is disabled and where lockdep
 * cannot be invoked.  Another example is when items are added to the list,
 * but never deleted.
 *
 * Each step fetches the next pointer through list_entry_lockless().
 * @pos and @head are expanded several times, so they must not have side
 * effects.
 */
#define list_for_each_entry_lockless(pos, head, member) \
        for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
             &pos->member != (head); \
             pos = list_entry_lockless(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_continue_rcu - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position which must have been in the list when the RCU read
 * lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_from_rcu() except
 * this starts after the given position and that one starts at the given
 * position.
 *
 * @pos must point to a valid entry on entry to the loop, because its
 * ->member.next is read before the first iteration.
 */
#define list_for_each_entry_continue_rcu(pos, head, member)                 \
        for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \
             &pos->member != (head);        \
             pos = list_entry_rcu(pos->member.next, typeof(*pos), member))

/**
 * list_for_each_entry_from_rcu - iterate over a list from current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over the tail of a list starting from a given position,
 * which must have been in the list when the RCU read lock was taken.
 * This would typically require either that you obtained the node from a
 * previous walk of the list in the same RCU read-side critical section, or
 * that you held some sort of non-RCU reference (such as a reference count)
 * to keep the node alive *and* in the list.
 *
 * This iterator is similar to list_for_each_entry_continue_rcu() except
 * this starts from the given position and that one starts from the position
 * after the given position.
 *
 * The loop has no initialization clause: the first iteration visits the
 * entry @pos already points to, unless that entry is the list head.
 */
#define list_for_each_entry_from_rcu(pos, head, member)                        \
        for (; &(pos)->member != (head);                                        \
                pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member))

/**
 * hlist_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu().
 */
static inline void hlist_del_rcu(struct hlist_node *n)
{
        __hlist_del(n);
        /* Only the back-pointer is poisoned; n->next is left intact. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_replace_rcu - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * The @old entry will be replaced with the @new entry atomically from
 * the perspective of concurrent readers.  It is the caller's responsibility
 * to synchronize with concurrent updaters, if any.
 *
 * @old->next is left untouched; only @old->pprev is poisoned.
 */
static inline void hlist_replace_rcu(struct hlist_node *old,
                                        struct hlist_node *new)
{
        struct hlist_node *next = old->next;

        /* Fully initialize @new before it becomes reachable. */
        new->next = next;
        WRITE_ONCE(new->pprev, old->pprev);
        /* Publish @new through the slot that used to point at @old. */
        rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
        /* The successor, if any, must now point back at @new. */
        if (next)
                WRITE_ONCE(new->next->pprev, &new->next);
        WRITE_ONCE(old->pprev, LIST_POISON2);
}

/**
 * hlists_swap_heads_rcu - swap the lists the hlist heads point to
 * @left:  The hlist head on the left
 * @right: The hlist head on the right
 *
 * The lists start out as [@left  ][node1 ... ] and
 *                        [@right ][node2 ... ]
 * The lists end up as    [@left  ][node2 ... ]
 *                        [@right ][node1 ... ]
 *
 * Either list may be empty.  An empty list has no first node whose
 * ->pprev back-pointer would need redirecting, so that step is skipped
 * for it and the other head simply ends up empty.
 */
static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right)
{
        struct hlist_node *node1 = left->first;
        struct hlist_node *node2 = right->first;

        /* Publish the exchanged first nodes to concurrent readers. */
        rcu_assign_pointer(left->first, node2);
        rcu_assign_pointer(right->first, node1);
        /*
         * Redirect the back-pointers of the former first nodes to their
         * new heads.  A NULL node means that list was empty.
         */
        if (node2)
                WRITE_ONCE(node2->pprev, &left->first);
        if (node1)
                WRITE_ONCE(node1->pprev, &right->first);
}

/*
 * Accessors for the RCU-protected pointers of an hlist.  Each one expands
 * to an lvalue of type "struct hlist_node __rcu *":
 *
 *   hlist_first_rcu(head)  - the ->first pointer of @head
 *   hlist_next_rcu(node)   - the ->next pointer of @node
 *   hlist_pprev_rcu(node)  - the pointer slot that @node->pprev points to
 */
#define hlist_first_rcu(head)        (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)        (*((struct hlist_node __rcu **)(&(node)->next)))
#define hlist_pprev_rcu(node)        (*((struct hlist_node __rcu **)((node)->pprev)))

/**
 * hlist_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the specified hlist,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_head_rcu(struct hlist_node *n,
                                        struct hlist_head *h)
{
        struct hlist_node *first = h->first;

        /* Fully initialize @n before it becomes reachable from @h. */
        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        /* Publish @n as the new first element. */
        rcu_assign_pointer(hlist_first_rcu(h), n);
        /* The previous first element, if any, now follows @n. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Appends the specified element to the end of the specified hlist,
 * while permitting racing traversals.  The current last element is
 * found by walking the whole list from @h->first.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_add_tail_rcu(struct hlist_node *n,
                                      struct hlist_head *h)
{
        struct hlist_node *cur = h->first;
        struct hlist_node *tail = NULL;

        /* Note: write side code, so rcu accessors are not needed. */
        while (cur) {
                tail = cur;
                cur = cur->next;
        }

        /* Empty list: appending is the same as adding at the head. */
        if (!tail) {
                hlist_add_head_rcu(n, h);
                return;
        }

        /* Initialize @n completely, then publish it behind the old tail. */
        n->next = tail->next;
        WRITE_ONCE(n->pprev, &tail->next);
        rcu_assign_pointer(hlist_next_rcu(tail), n);
}

/**
 * hlist_add_before_rcu
 * @n: the new element to add to the hash list.
 * @next: the existing element to add the new element before.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * before the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_before_rcu(struct hlist_node *n,
                                        struct hlist_node *next)
{
        /* @n takes over the slot that currently points at @next. */
        WRITE_ONCE(n->pprev, next->pprev);
        n->next = next;
        /* Publish @n by storing it through that slot. */
        rcu_assign_pointer(hlist_pprev_rcu(n), n);
        /* @next is now reached through @n->next. */
        WRITE_ONCE(next->pprev, &n->next);
}

/**
 * hlist_add_behind_rcu
 * @n: the new element to add to the hash list.
 * @prev: the existing element to add the new element after.
 *
 * Description:
 * Adds the specified element to the specified hlist
 * after the specified node while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_add_head_rcu()
 * or hlist_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.
 */
static inline void hlist_add_behind_rcu(struct hlist_node *n,
                                        struct hlist_node *prev)
{
        /* Fully initialize @n before it becomes reachable from @prev. */
        n->next = prev->next;
        WRITE_ONCE(n->pprev, &prev->next);
        /* Publish @n as the successor of @prev. */
        rcu_assign_pointer(hlist_next_rcu(prev), n);
        /* The old successor of @prev, if any, now follows @n. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/*
 * __hlist_for_each_rcu - iterate over the nodes of an RCU-protected hlist
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Visits the raw hlist nodes (not the containing entries), fetching the
 * first and each following node with rcu_dereference().  The loop ends
 * when a NULL node is reached.
 */
#define __hlist_for_each_rcu(pos, head)                                \
        for (pos = rcu_dereference(hlist_first_rcu(head));        \
             pos;                                                \
             pos = rcu_dereference(hlist_next_rcu(pos)))

/**
 * hlist_for_each_entry_rcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        optional lockdep expression if called from non-RCU protection.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * @cond, when given, is forwarded to __list_check_rcu(), which is evaluated
 * once before the first iteration.  The node pointers themselves are
 * fetched with rcu_dereference_raw().  The loop ends when @pos becomes
 * NULL.
 */
#define hlist_for_each_entry_rcu(pos, head, member, cond...)                \
        for (__list_check_rcu(dummy, ## cond, 0),                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_srcu - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 * @cond:        lockdep expression for the lock required to traverse the list.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by srcu_read_lock().
 * The lockdep expression srcu_read_lock_held() can be passed as the
 * cond argument from read side.
 *
 * Unlike hlist_for_each_entry_rcu(), @cond is mandatory here; it is passed
 * to __list_check_srcu(), which is evaluated once before the first
 * iteration.  The loop ends when @pos becomes NULL.
 */
#define hlist_for_each_entry_srcu(pos, head, member, cond)                \
        for (__list_check_srcu(cond),                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing)
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock().
 *
 * This is the same as hlist_for_each_entry_rcu() except that it does
 * not do any RCU debugging or tracing.
 *
 * Concretely, there is no __list_check_rcu() call and no @cond argument,
 * and the node pointers are fetched with rcu_dereference_raw_check()
 * instead of rcu_dereference_raw().
 */
#define hlist_for_each_entry_rcu_notrace(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * This list-traversal primitive may safely run concurrently with
 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
 * as long as the traversal is guarded by rcu_read_lock_bh(); the node
 * pointers are fetched with rcu_dereference_bh().
 */
#define hlist_for_each_entry_rcu_bh(pos, head, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\
                        typeof(*(pos)), member);                        \
                pos;                                                        \
                pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Iteration starts at the entry following @pos, so @pos must point to a
 * valid entry on entry to the loop.  The loop ends when @pos becomes NULL.
 * Node pointers are fetched with rcu_dereference_raw().
 */
#define hlist_for_each_entry_continue_rcu(pos, member)                        \
        for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * Iteration starts at the entry following @pos, so @pos must point to a
 * valid entry on entry to the loop.  The loop ends when @pos becomes NULL.
 * Node pointers are fetched with rcu_dereference_bh().
 */
#define hlist_for_each_entry_continue_rcu_bh(pos, member)                \
        for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(  \
                        &(pos)->member)), typeof(*(pos)), member);        \
             pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The loop has no initialization clause: the first iteration visits the
 * entry @pos already points to, and the body is not executed at all if
 * @pos is NULL.  Following nodes are fetched with rcu_dereference_raw().
 */
#define hlist_for_each_entry_from_rcu(pos, member)                        \
        for (; pos;                                                        \
             pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(        \
                        &(pos)->member)), typeof(*(pos)), member))

#endif        /* __KERNEL__ */
#endif














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   27 































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips
 * Copyright (c) 2008-2009 Marvell Semiconductor
 */

#ifndef __LINUX_NET_DSA_H
#define __LINUX_NET_DSA_H

#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/of.h>
#include <linux/ethtool.h>
#include <linux/net_tstamp.h>
#include <linux/phy.h>
#include <linux/platform_data/dsa.h>
#include <linux/phylink.h>
#include <net/devlink.h>
#include <net/switchdev.h>

struct dsa_8021q_context;
struct tc_action;

/*
 * Numeric identifiers of the DSA tagging protocols.  Each value is unique
 * and initializes the enum dsa_tag_protocol member of the same name (minus
 * the _VALUE suffix) below.  The values run from 0 to 28 without gaps.
 */
#define DSA_TAG_PROTO_NONE_VALUE                0
#define DSA_TAG_PROTO_BRCM_VALUE                1
#define DSA_TAG_PROTO_BRCM_PREPEND_VALUE        2
#define DSA_TAG_PROTO_DSA_VALUE                        3
#define DSA_TAG_PROTO_EDSA_VALUE                4
#define DSA_TAG_PROTO_GSWIP_VALUE                5
#define DSA_TAG_PROTO_KSZ9477_VALUE                6
#define DSA_TAG_PROTO_KSZ9893_VALUE                7
#define DSA_TAG_PROTO_LAN9303_VALUE                8
#define DSA_TAG_PROTO_MTK_VALUE                        9
#define DSA_TAG_PROTO_QCA_VALUE                        10
#define DSA_TAG_PROTO_TRAILER_VALUE                11
#define DSA_TAG_PROTO_8021Q_VALUE                12
#define DSA_TAG_PROTO_SJA1105_VALUE                13
#define DSA_TAG_PROTO_KSZ8795_VALUE                14
#define DSA_TAG_PROTO_OCELOT_VALUE                15
#define DSA_TAG_PROTO_AR9331_VALUE                16
#define DSA_TAG_PROTO_RTL4_A_VALUE                17
#define DSA_TAG_PROTO_HELLCREEK_VALUE                18
#define DSA_TAG_PROTO_XRS700X_VALUE                19
#define DSA_TAG_PROTO_OCELOT_8021Q_VALUE        20
#define DSA_TAG_PROTO_SEVILLE_VALUE                21
#define DSA_TAG_PROTO_BRCM_LEGACY_VALUE                22
#define DSA_TAG_PROTO_SJA1110_VALUE                23
#define DSA_TAG_PROTO_RTL8_4_VALUE                24
#define DSA_TAG_PROTO_RTL8_4T_VALUE                25
#define DSA_TAG_PROTO_RZN1_A5PSW_VALUE                26
#define DSA_TAG_PROTO_LAN937X_VALUE                27
#define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE        28

/* Tagging protocol identifiers. Every enumerator takes its value from the
 * matching DSA_TAG_PROTO_*_VALUE macro, so the order of the enumerators
 * below is not the numeric order (e.g. BRCM_LEGACY is listed third but has
 * value 22).
 */
enum dsa_tag_protocol {
        DSA_TAG_PROTO_NONE                = DSA_TAG_PROTO_NONE_VALUE,
        DSA_TAG_PROTO_BRCM                = DSA_TAG_PROTO_BRCM_VALUE,
        DSA_TAG_PROTO_BRCM_LEGACY        = DSA_TAG_PROTO_BRCM_LEGACY_VALUE,
        DSA_TAG_PROTO_BRCM_PREPEND        = DSA_TAG_PROTO_BRCM_PREPEND_VALUE,
        DSA_TAG_PROTO_DSA                = DSA_TAG_PROTO_DSA_VALUE,
        DSA_TAG_PROTO_EDSA                = DSA_TAG_PROTO_EDSA_VALUE,
        DSA_TAG_PROTO_GSWIP                = DSA_TAG_PROTO_GSWIP_VALUE,
        DSA_TAG_PROTO_KSZ9477                = DSA_TAG_PROTO_KSZ9477_VALUE,
        DSA_TAG_PROTO_KSZ9893                = DSA_TAG_PROTO_KSZ9893_VALUE,
        DSA_TAG_PROTO_LAN9303                = DSA_TAG_PROTO_LAN9303_VALUE,
        DSA_TAG_PROTO_MTK                = DSA_TAG_PROTO_MTK_VALUE,
        DSA_TAG_PROTO_QCA                = DSA_TAG_PROTO_QCA_VALUE,
        DSA_TAG_PROTO_TRAILER                = DSA_TAG_PROTO_TRAILER_VALUE,
        DSA_TAG_PROTO_8021Q                = DSA_TAG_PROTO_8021Q_VALUE,
        DSA_TAG_PROTO_SJA1105                = DSA_TAG_PROTO_SJA1105_VALUE,
        DSA_TAG_PROTO_KSZ8795                = DSA_TAG_PROTO_KSZ8795_VALUE,
        DSA_TAG_PROTO_OCELOT                = DSA_TAG_PROTO_OCELOT_VALUE,
        DSA_TAG_PROTO_AR9331                = DSA_TAG_PROTO_AR9331_VALUE,
        DSA_TAG_PROTO_RTL4_A                = DSA_TAG_PROTO_RTL4_A_VALUE,
        DSA_TAG_PROTO_HELLCREEK                = DSA_TAG_PROTO_HELLCREEK_VALUE,
        DSA_TAG_PROTO_XRS700X                = DSA_TAG_PROTO_XRS700X_VALUE,
        DSA_TAG_PROTO_OCELOT_8021Q        = DSA_TAG_PROTO_OCELOT_8021Q_VALUE,
        DSA_TAG_PROTO_SEVILLE                = DSA_TAG_PROTO_SEVILLE_VALUE,
        DSA_TAG_PROTO_SJA1110                = DSA_TAG_PROTO_SJA1110_VALUE,
        DSA_TAG_PROTO_RTL8_4                = DSA_TAG_PROTO_RTL8_4_VALUE,
        DSA_TAG_PROTO_RTL8_4T                = DSA_TAG_PROTO_RTL8_4T_VALUE,
        DSA_TAG_PROTO_RZN1_A5PSW        = DSA_TAG_PROTO_RZN1_A5PSW_VALUE,
        DSA_TAG_PROTO_LAN937X                = DSA_TAG_PROTO_LAN937X_VALUE,
        DSA_TAG_PROTO_VSC73XX_8021Q        = DSA_TAG_PROTO_VSC73XX_8021Q_VALUE,
};

struct dsa_switch;

/* Operations and properties of one tagging protocol. Both the switch tree
 * and the ports hold a pointer to one of these in their tag_ops member.
 */
struct dsa_device_ops {
        /* Per-packet hooks; each takes an skb plus a net_device and returns
         * an skb. NOTE(review): the meaning of a NULL return is not visible
         * in this header - confirm against the callers.
         */
        struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
        struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
        /* Reports a protocol and an offset through the output pointers */
        void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
                             int *offset);
        /* Per-switch hooks; presumably run when a switch starts/stops using
         * this tagger - TODO confirm against the DSA core.
         */
        int (*connect)(struct dsa_switch *ds);
        void (*disconnect)(struct dsa_switch *ds);
        /* Presumably the extra skb head/tail room the tag needs - confirm */
        unsigned int needed_headroom;
        unsigned int needed_tailroom;
        const char *name;
        /* Identifier of the tagging protocol described by this structure */
        enum dsa_tag_protocol proto;
        /* Some tagging protocols either mangle or shift the destination MAC
         * address, in which case the DSA conduit would drop packets on ingress
         * if what it understands out of the destination MAC address is not in
         * its RX filter.
         */
        bool promisc_on_conduit;
};

/* State kept for one LAG: entries of this type are stored in the lags array
 * of struct dsa_switch_tree and referenced from struct dsa_port.
 */
struct dsa_lag {
        /* LAG net_device; dsa_lag_id() matches on this pointer */
        struct net_device *dev;
        /* ID reported by dsa_lag_id(); LAG IDs are one-based */
        unsigned int id;
        /* Presumably serializes access to @fdbs - TODO confirm */
        struct mutex fdb_lock;
        struct list_head fdbs;
        refcount_t refcount;
};

/* One DSA switch tree: holds the list of ports, the routing table of DSA
 * links and the LAG table shared by the switches that point to it through
 * their dst member.
 */
struct dsa_switch_tree {
        /* List linkage for this tree (presumably into a list of all trees;
         * confirm against the DSA core)
         */
        struct list_head        list;

        /* List of switch ports */
        struct list_head ports;

        /* Notifier chain for switch-wide events */
        struct raw_notifier_head        nh;

        /* Tree identifier */
        unsigned int index;

        /* Number of switches attached to this tree */
        struct kref refcount;

        /* Maps offloaded LAG netdevs to a zero-based linear ID for
         * drivers that need it.
         */
        struct dsa_lag **lags;

        /* Tagging protocol operations */
        const struct dsa_device_ops *tag_ops;

        /* Default tagging protocol preferred by the switches in this
         * tree.
         */
        enum dsa_tag_protocol default_proto;

        /* Has this tree been applied to the hardware? */
        bool setup;

        /*
         * Configuration data for the platform device that owns
         * this dsa switch tree instance.
         */
        struct dsa_platform_data        *pd;

        /* List of DSA links composing the routing table */
        struct list_head rtable;

        /* Length of "lags" array */
        unsigned int lags_len;

        /* Track the largest switch index within a tree */
        unsigned int last_switch;
};

/* LAG IDs are one-based, the dst->lags array is zero-based.
 *
 * Iterate @_id over 1..(_dst)->lags_len and run the loop body only for IDs
 * whose slot in (_dst)->lags is non-NULL. The expansion ends in an unbraced
 * "if", so an "else" written right after the loop body would bind to it.
 */
#define dsa_lags_foreach_id(_id, _dst)                                \
        for ((_id) = 1; (_id) <= (_dst)->lags_len; (_id)++)        \
                if ((_dst)->lags[(_id) - 1])

/* Iterate @_dp over all ports of tree @_dst, running the loop body only for
 * ports for which dsa_port_offloads_lag(@_dp, @_lag) is true. Ends in an
 * unbraced "if": do not follow the loop body with an "else".
 */
#define dsa_lag_foreach_port(_dp, _dst, _lag)                        \
        list_for_each_entry((_dp), &(_dst)->ports, list)        \
                if (dsa_port_offloads_lag((_dp), (_lag)))

/* Iterate @_dp over the ports of the tree that @_ds belongs to, running the
 * loop body only for ports that belong to @_ds and whose hsr_dev equals
 * @_hsr. Ends in an unbraced "if": do not follow the loop body with "else".
 */
#define dsa_hsr_foreach_port(_dp, _ds, _hsr)                        \
        list_for_each_entry((_dp), &(_ds)->dst->ports, list)        \
                if ((_dp)->ds == (_ds) && (_dp)->hsr_dev == (_hsr))

/* Look up the LAG stored under the one-based LAG ID @id in tree @dst.
 *
 * DSA LAG IDs are one-based while dst->lags is zero-based, so the entry for
 * @id lives at index @id - 1.
 *
 * Returns the array entry for @id (which may itself be NULL for an unused
 * slot), or NULL if @id is 0 or larger than dst->lags_len.
 */
static inline struct dsa_lag *dsa_lag_by_id(struct dsa_switch_tree *dst,
                                            unsigned int id)
{
        /* @id is unsigned: 0 would wrap around to a huge index below, and
         * anything above lags_len lies past the end of the array.
         */
        if (!id || id > dst->lags_len)
                return NULL;

        return dst->lags[id - 1];
}

static inline int dsa_lag_id(struct dsa_switch_tree *dst,
                             struct net_device *lag_dev)
{
        unsigned int id;

        dsa_lags_foreach_id(id, dst) {
                struct dsa_lag *lag = dsa_lag_by_id(dst, id);

                if (lag->dev == lag_dev)
                        return lag->id;
        }

        return -ENODEV;
}

/* TC matchall action types. Used as the type member of
 * struct dsa_mall_tc_entry, which carries a mirror or a policer payload.
 */
enum dsa_port_mall_action_type {
        DSA_PORT_MALL_MIRROR,
        DSA_PORT_MALL_POLICER,
};

/* TC mirroring entry */
struct dsa_mall_mirror_tc_entry {
        /* Presumably the index of the local port receiving the mirrored
         * traffic - TODO confirm against the users of this structure.
         */
        u8 to_local_port;
        /* Presumably true for ingress mirroring, false for egress - confirm */
        bool ingress;
};

/* TC port policer entry */
struct dsa_mall_policer_tc_entry {
        /* NOTE(review): unit of the burst size is not visible here */
        u32 burst;
        /* Rate limit, in bytes per second as the name states */
        u64 rate_bytes_per_sec;
};

/* TC matchall entry: a list node carrying either a mirror or a policer
 * description.
 */
struct dsa_mall_tc_entry {
        struct list_head list;
        /* Presumably the identifier used to find this entry again - confirm */
        unsigned long cookie;
        /* Presumably selects which member of the union below is in use
         * (MIRROR -> mirror, POLICER -> policer) - TODO confirm.
         */
        enum dsa_port_mall_action_type type;
        union {
                struct dsa_mall_mirror_tc_entry mirror;
                struct dsa_mall_policer_tc_entry policer;
        };
};

/* State kept for one bridge that ports are attached to; ports reference it
 * through the bridge member of struct dsa_port.
 */
struct dsa_bridge {
        /* Bridge net_device, as returned by dsa_port_bridge_dev_get() */
        struct net_device *dev;
        /* Bridge number, as returned by dsa_port_bridge_num_get(); that
         * helper returns 0 for ports without a bridge.
         */
        unsigned int num;
        bool tx_fwd_offload;
        refcount_t refcount;
};

/* One port of a DSA switch. All ports of a tree are linked into the ports
 * list of their struct dsa_switch_tree through the list member.
 */
struct dsa_port {
        /* A CPU port is physically connected to a conduit device. A user port
         * exposes a network device to user-space, called 'user' here.
         */
        union {
                struct net_device *conduit;
                struct net_device *user;
        };

        /* Copy of the tagging protocol operations, for quicker access
         * in the data path. Valid only for the CPU ports.
         */
        const struct dsa_device_ops *tag_ops;

        /* Copies for faster access in conduit receive hot path */
        struct dsa_switch_tree *dst;
        struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);

        /* Switch this port belongs to */
        struct dsa_switch        *ds;

        /* Port number within @ds; dsa_to_port() looks ports up by it */
        unsigned int                index;

        /* Port role; tested by the dsa_port_is_*() helpers */
        enum {
                DSA_PORT_TYPE_UNUSED = 0,
                DSA_PORT_TYPE_CPU,
                DSA_PORT_TYPE_DSA,
                DSA_PORT_TYPE_USER,
        } type;

        const char                *name;
        /* CPU port associated with this port; dsa_upstream_port() handles a
         * NULL value by returning the port's own index.
         */
        struct dsa_port                *cpu_dp;
        u8                        mac[ETH_ALEN];

        u8                        stp_state;

        /* Warning: the following bit fields are not atomic, and updating them
         * can only be done from code paths where concurrency is not possible
         * (probe time or under rtnl_lock).
         */
        u8                        vlan_filtering:1;

        /* Managed by DSA on user ports and by drivers on CPU and DSA ports */
        u8                        learning:1;

        u8                        lag_tx_enabled:1;

        /* conduit state bits, valid only on CPU ports */
        u8                        conduit_admin_up:1;
        u8                        conduit_oper_up:1;

        /* Valid only on user ports */
        u8                        cpu_port_in_lag:1;

        u8                        setup:1;

        struct device_node        *dn;
        unsigned int                ageing_time;

        /* NULL while the port is not attached to a bridge */
        struct dsa_bridge        *bridge;
        struct devlink_port        devlink_port;
        struct phylink                *pl;
        /* dsa_phylink_to_port() maps a pointer to this member back to the
         * enclosing dsa_port.
         */
        struct phylink_config        pl_config;
        /* NULL while the port is not part of a LAG */
        struct dsa_lag                *lag;
        struct net_device        *hsr_dev;

        /* Linkage into the ports list of the switch tree */
        struct list_head list;

        /*
         * Original copy of the conduit netdev ethtool_ops
         */
        const struct ethtool_ops *orig_ethtool_ops;

        /* List of MAC addresses that must be forwarded on this port.
         * These are only valid on CPU ports and DSA links.
         */
        struct mutex                addr_lists_lock;
        struct list_head        fdbs;
        struct list_head        mdbs;

        struct mutex                vlans_lock;
        union {
                /* List of VLANs that CPU and DSA ports are members of.
                 * Access to this is serialized by the sleepable @vlans_lock.
                 */
                struct list_head        vlans;
                /* List of VLANs that user ports are members of.
                 * Access to this is serialized by netif_addr_lock_bh().
                 */
                struct list_head        user_vlans;
        };
};

/* Map a phylink_config back to the dsa_port that embeds it as its
 * pl_config member. @config must point at that member of a dsa_port.
 */
static inline struct dsa_port *
dsa_phylink_to_port(struct phylink_config *config)
{
        struct dsa_port *dp;

        dp = container_of(config, struct dsa_port, pl_config);

        return dp;
}

/* One entry of the routing table (rtable) of a switch tree.
 *
 * TODO: ideally DSA ports would have a single dp->link_dp member,
 * and no dst->rtable nor this struct dsa_link would be needed,
 * but this would require some more complex tree walking,
 * so keep it stupid at the moment and list them all.
 */
struct dsa_link {
        /* Local end of the link: dsa_routing_port() returns dp->index */
        struct dsa_port *dp;
        /* Port whose switch is reached through @dp: dsa_routing_port()
         * matches the requested device against link_dp->ds->index.
         */
        struct dsa_port *link_dp;
        /* Linkage into the rtable list of the switch tree */
        struct list_head list;
};

/* Kind of database described by a struct dsa_db; the three values parallel
 * the three members of the union in that structure.
 */
enum dsa_db_type {
        DSA_DB_PORT,
        DSA_DB_LAG,
        DSA_DB_BRIDGE,
};

/* Identifies a database by the port, LAG or bridge it is associated with.
 * Note that @lag and @bridge are embedded by value, while @dp is a pointer.
 */
struct dsa_db {
        /* Presumably selects the valid union member (DSA_DB_PORT -> dp,
         * DSA_DB_LAG -> lag, DSA_DB_BRIDGE -> bridge) - TODO confirm.
         */
        enum dsa_db_type type;

        union {
                const struct dsa_port *dp;
                struct dsa_lag lag;
                struct dsa_bridge bridge;
        };
};

/* Reference-counted list node describing one MAC address / VID pair and the
 * database it belongs to. Presumably the element type of the fdbs/mdbs
 * lists of struct dsa_port and struct dsa_lag - TODO confirm.
 */
struct dsa_mac_addr {
        unsigned char addr[ETH_ALEN];
        u16 vid;
        refcount_t refcount;
        struct list_head list;
        struct dsa_db db;
};

/* Reference-counted list node describing one VLAN ID. Presumably the
 * element type of the vlans/user_vlans lists of struct dsa_port - confirm.
 */
struct dsa_vlan {
        u16 vid;
        refcount_t refcount;
        struct list_head list;
};

/* One switch chip of a DSA switch tree. Its ports are not stored here: they
 * live on the ports list of the tree (@dst) and refer back to the switch
 * through their ds member.
 */
struct dsa_switch {
        struct device *dev;

        /*
         * Parent switch tree, and switch index.
         */
        struct dsa_switch_tree        *dst;
        unsigned int                index;

        /* Warning: the following bit fields are not atomic, and updating them
         * can only be done from code paths where concurrency is not possible
         * (probe time or under rtnl_lock).
         */
        u32                        setup:1;

        /* Disallow bridge core from requesting different VLAN awareness
         * settings on ports if not hardware-supported
         */
        u32                        vlan_filtering_is_global:1;

        /* Keep VLAN filtering enabled on ports not offloading any upper */
        u32                        needs_standalone_vlan_filtering:1;

        /* Pass .port_vlan_add and .port_vlan_del to drivers even for bridges
         * that have vlan_filtering=0. All drivers should ideally set this (and
         * then the option would get removed), but it is unknown whether this
         * would break things or not.
         */
        u32                        configure_vlan_while_not_filtering:1;

        /* Pop the default_pvid of VLAN-unaware bridge ports from tagged frames.
         * DEPRECATED: Do NOT set this field in new drivers. Instead look at
         * the dsa_software_vlan_untag() comments.
         */
        u32                        untag_bridge_pvid:1;
        /* Pop the default_pvid of VLAN-aware bridge ports from tagged frames.
         * Useful if the switch cannot preserve the VLAN tag as seen on the
         * wire for user port ingress, and chooses to send all frames as
         * VLAN-tagged to the CPU, including those which were originally
         * untagged.
         */
        u32                        untag_vlan_aware_bridge_pvid:1;

        /* Let DSA manage the FDB entries towards the
         * CPU, based on the software bridge database.
         */
        u32                        assisted_learning_on_cpu_port:1;

        /* In case vlan_filtering_is_global is set, the VLAN awareness state
         * should be retrieved from here and not from the per-port settings.
         */
        u32                        vlan_filtering:1;

        /* For switches that only have the MRU configurable. To ensure the
         * configured MTU is not exceeded, normalization of MRU on all bridged
         * interfaces is needed.
         */
        u32                        mtu_enforcement_ingress:1;

        /* Drivers that isolate the FDBs of multiple bridges must set this
         * to true to receive the bridge as an argument in .port_fdb_{add,del}
         * and .port_mdb_{add,del}. Otherwise, the bridge.num will always be
         * passed as zero.
         */
        u32                        fdb_isolation:1;

        /* Drivers that have global DSCP mapping settings must set this to
         * true to automatically apply the settings to all ports.
         */
        u32                        dscp_prio_mapping_is_global:1;

        /* Listener for switch fabric events */
        struct notifier_block        nb;

        /*
         * Give the switch driver somewhere to hang its private data
         * structure.
         */
        void *priv;

        /* Opaque pointer; presumably private data of the tagging protocol
         * driver in use - TODO confirm.
         */
        void *tagger_data;

        /*
         * Configuration data for this switch.
         */
        struct dsa_chip_data        *cd;

        /*
         * The switch operations.
         */
        const struct dsa_switch_ops        *ops;

        /*
         * Allow a DSA switch driver to override the phylink MAC ops
         */
        const struct phylink_mac_ops        *phylink_mac_ops;

        /*
         * User mii_bus and devices for the individual ports.
         */
        u32                        phys_mii_mask;
        struct mii_bus                *user_mii_bus;

        /* Ageing Time limits in msecs */
        unsigned int ageing_time_min;
        unsigned int ageing_time_max;

        /* Storage for drivers using tag_8021q */
        struct dsa_8021q_context *tag_8021q_ctx;

        /* devlink used to represent this switch device */
        struct devlink                *devlink;

        /* Number of switch port queues */
        unsigned int                num_tx_queues;

        /* Drivers that benefit from having an ID associated with each
         * offloaded LAG should set this to the maximum number of
         * supported IDs. DSA will then maintain a mapping of _at
         * least_ these many IDs, accessible to drivers via
         * dsa_lag_id().
         */
        unsigned int                num_lag_ids;

        /* Drivers that support bridge forwarding offload or FDB isolation
         * should set this to the maximum number of bridges spanning the same
         * switch tree (or all trees, in the case of cross-tree bridging
         * support) that can be offloaded.
         */
        unsigned int                max_num_bridges;

        /* Number of ports. dsa_routing_port() and dsa_switch_upstream_port()
         * also return this value when they find no matching port.
         */
        unsigned int                num_ports;
};

static inline struct dsa_port *dsa_to_port(struct dsa_switch *ds, int p)
{
        struct dsa_switch_tree *dst = ds->dst;
        struct dsa_port *dp;

        list_for_each_entry(dp, &dst->ports, list)
                if (dp->ds == ds && dp->index == p)
                        return dp;

        return NULL;
}

static inline bool dsa_port_is_dsa(struct dsa_port *port)
{
        return port->type == DSA_PORT_TYPE_DSA;
}

static inline bool dsa_port_is_cpu(struct dsa_port *port)
{
        return port->type == DSA_PORT_TYPE_CPU;
}

static inline bool dsa_port_is_user(struct dsa_port *dp)
{
        return dp->type == DSA_PORT_TYPE_USER;
}

static inline bool dsa_port_is_unused(struct dsa_port *dp)
{
        return dp->type == DSA_PORT_TYPE_UNUSED;
}

/* Return true if @dp is a CPU port whose conduit is both administratively
 * and operationally up. The conduit state bits are only valid on CPU ports,
 * hence the type check comes first.
 */
static inline bool dsa_port_conduit_is_operational(struct dsa_port *dp)
{
        if (!dsa_port_is_cpu(dp))
                return false;

        if (!dp->conduit_admin_up)
                return false;

        return dp->conduit_oper_up;
}

/* Return true if port @p of switch @ds is unused. Port @p must exist on
 * @ds: the result of dsa_to_port() is used without a NULL check.
 */
static inline bool dsa_is_unused_port(struct dsa_switch *ds, int p)
{
        struct dsa_port *dp = dsa_to_port(ds, p);

        return dsa_port_is_unused(dp);
}

/* Return true if port @p of switch @ds is a CPU port. Port @p must exist on
 * @ds: the result of dsa_to_port() is used without a NULL check.
 */
static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
{
        struct dsa_port *dp = dsa_to_port(ds, p);

        return dsa_port_is_cpu(dp);
}

/* Return true if port @p of switch @ds is a DSA port. Port @p must exist on
 * @ds: the result of dsa_to_port() is used without a NULL check.
 */
static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p)
{
        struct dsa_port *dp = dsa_to_port(ds, p);

        return dsa_port_is_dsa(dp);
}

/* Return true if port @p of switch @ds is a user port. Port @p must exist
 * on @ds: the result of dsa_to_port() is used without a NULL check.
 */
static inline bool dsa_is_user_port(struct dsa_switch *ds, int p)
{
        struct dsa_port *dp = dsa_to_port(ds, p);

        return dsa_port_is_user(dp);
}

/* Tree-wide port iterators. Each one walks the ports list of tree @_dst
 * with @_dp as cursor and runs the loop body only for ports of the given
 * type. They all end in an unbraced "if", so an "else" written right after
 * the loop body would bind to the iterator's own condition.
 */

/* Visit every user port of the tree */
#define dsa_tree_for_each_user_port(_dp, _dst) \
        list_for_each_entry((_dp), &(_dst)->ports, list) \
                if (dsa_port_is_user((_dp)))

/* Like dsa_tree_for_each_user_port(), but built on
 * list_for_each_entry_continue_reverse(): @_dp must already point at a port
 * of the list and the walk proceeds backwards from it.
 */
#define dsa_tree_for_each_user_port_continue_reverse(_dp, _dst) \
        list_for_each_entry_continue_reverse((_dp), &(_dst)->ports, list) \
                if (dsa_port_is_user((_dp)))

/* Visit every CPU port of the tree */
#define dsa_tree_for_each_cpu_port(_dp, _dst) \
        list_for_each_entry((_dp), &(_dst)->ports, list) \
                if (dsa_port_is_cpu((_dp)))

/* Per-switch port iterators. Ports of all switches share the per-tree list,
 * so these walk the whole list of @_ds->dst and run the loop body only for
 * ports whose ds member is @_ds. They end in an unbraced "if": do not
 * follow the loop body with an "else".
 */

/* Visit every port of switch @_ds */
#define dsa_switch_for_each_port(_dp, _ds) \
        list_for_each_entry((_dp), &(_ds)->dst->ports, list) \
                if ((_dp)->ds == (_ds))

/* Same walk built on list_for_each_entry_safe(), with @_next as the
 * look-ahead cursor.
 */
#define dsa_switch_for_each_port_safe(_dp, _next, _ds) \
        list_for_each_entry_safe((_dp), (_next), &(_ds)->dst->ports, list) \
                if ((_dp)->ds == (_ds))

/* Same filter built on list_for_each_entry_continue_reverse(): @_dp must
 * already point at a port of the list and the walk proceeds backwards.
 */
#define dsa_switch_for_each_port_continue_reverse(_dp, _ds) \
        list_for_each_entry_continue_reverse((_dp), &(_ds)->dst->ports, list) \
                if ((_dp)->ds == (_ds))

/* Type-filtered per-switch iterators. Each one stacks one more unbraced
 * "if" on top of dsa_switch_for_each_port() or its _continue_reverse
 * variant, so the loop body runs only for ports of @_ds that also pass the
 * type test. Do not follow the loop body with an "else".
 */

/* Visit every port of @_ds that is not unused */
#define dsa_switch_for_each_available_port(_dp, _ds) \
        dsa_switch_for_each_port((_dp), (_ds)) \
                if (!dsa_port_is_unused((_dp)))

/* Visit every user port of @_ds */
#define dsa_switch_for_each_user_port(_dp, _ds) \
        dsa_switch_for_each_port((_dp), (_ds)) \
                if (dsa_port_is_user((_dp)))

/* Visit the user ports of @_ds backwards, continuing from the current @_dp */
#define dsa_switch_for_each_user_port_continue_reverse(_dp, _ds) \
        dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \
                if (dsa_port_is_user((_dp)))

/* Visit every CPU port of @_ds */
#define dsa_switch_for_each_cpu_port(_dp, _ds) \
        dsa_switch_for_each_port((_dp), (_ds)) \
                if (dsa_port_is_cpu((_dp)))

/* Visit the CPU ports of @_ds backwards, continuing from the current @_dp */
#define dsa_switch_for_each_cpu_port_continue_reverse(_dp, _ds) \
        dsa_switch_for_each_port_continue_reverse((_dp), (_ds)) \
                if (dsa_port_is_cpu((_dp)))

static inline u32 dsa_user_ports(struct dsa_switch *ds)
{
        struct dsa_port *dp;
        u32 mask = 0;

        dsa_switch_for_each_user_port(dp, ds)
                mask |= BIT(dp->index);

        return mask;
}

static inline u32 dsa_cpu_ports(struct dsa_switch *ds)
{
        struct dsa_port *cpu_dp;
        u32 mask = 0;

        dsa_switch_for_each_cpu_port(cpu_dp, ds)
                mask |= BIT(cpu_dp->index);

        return mask;
}

/* Return the local port used to reach an arbitrary switch device */
static inline unsigned int dsa_routing_port(struct dsa_switch *ds, int device)
{
        struct dsa_switch_tree *dst = ds->dst;
        struct dsa_link *dl;

        list_for_each_entry(dl, &dst->rtable, list)
                if (dl->dp->ds == ds && dl->link_dp->ds->index == device)
                        return dl->dp->index;

        return ds->num_ports;
}

/* Return the port of @ds through which port @port of switch @device is
 * reached: @port itself when @device is this very switch, otherwise the
 * routing port towards @device.
 */
static inline unsigned int dsa_towards_port(struct dsa_switch *ds, int device,
                                            int port)
{
        return device == ds->index ? port : dsa_routing_port(ds, device);
}

/* Return the local port used to reach the dedicated CPU port */
static inline unsigned int dsa_upstream_port(struct dsa_switch *ds, int port)
{
        const struct dsa_port *dp = dsa_to_port(ds, port);
        const struct dsa_port *cpu_dp = dp->cpu_dp;

        if (!cpu_dp)
                return port;

        return dsa_towards_port(ds, cpu_dp->ds->index, cpu_dp->index);
}

/* Return true if @port is a used port of @ds and is its own upstream port,
 * i.e. dsa_upstream_port() maps it to itself. Unused ports never qualify.
 */
static inline bool dsa_is_upstream_port(struct dsa_switch *ds, int port)
{
        return !dsa_is_unused_port(ds, port) &&
               port == dsa_upstream_port(ds, port);
}

/* Return true if @port of @ds is a DSA port that is not the upstream port,
 * i.e. a DSA port leading away from the CPU.
 */
static inline bool dsa_is_downstream_port(struct dsa_switch *ds, int port)
{
        if (!dsa_is_dsa_port(ds, port))
                return false;

        return !dsa_is_upstream_port(ds, port);
}

/* Return the upstream port of switch @ds, computed from the first port of
 * @ds that is not unused. If the switch has no such port, ds->num_ports is
 * returned instead.
 */
static inline unsigned int dsa_switch_upstream_port(struct dsa_switch *ds)
{
        struct dsa_port *dp;

        dsa_switch_for_each_port(dp, ds) {
                if (dsa_port_is_unused(dp))
                        continue;

                /* The first available port decides the answer */
                return dsa_upstream_port(ds, dp->index);
        }

        return ds->num_ports;
}

/* Return true if @upstream_ds is an upstream switch of @downstream_ds, meaning
 * that the routing port from @downstream_ds to @upstream_ds is also the port
 * which @downstream_ds uses to reach its dedicated CPU.
 *
 * A switch is considered upstream of itself. If the routing table holds no
 * link from @downstream_ds towards @upstream_ds, false is returned.
 */
static inline bool dsa_switch_is_upstream_of(struct dsa_switch *upstream_ds,
                                             struct dsa_switch *downstream_ds)
{
        unsigned int routing_port;

        if (upstream_ds == downstream_ds)
                return true;

        routing_port = dsa_routing_port(downstream_ds, upstream_ds->index);

        /* dsa_routing_port() returns num_ports when it finds no link. That
         * value is not expected to be a valid port index, so passing it on
         * would make dsa_to_port() return NULL and the port type checks
         * dereference it.
         */
        if (routing_port == downstream_ds->num_ports)
                return false;

        return dsa_is_upstream_port(downstream_ds, routing_port);
}

/* Return the VLAN filtering state that applies to @dp: the switch-wide
 * setting when the switch declares VLAN filtering to be global, otherwise
 * the per-port setting.
 */
static inline bool dsa_port_is_vlan_filtering(const struct dsa_port *dp)
{
        const struct dsa_switch *sw = dp->ds;

        return sw->vlan_filtering_is_global ? sw->vlan_filtering :
                                              dp->vlan_filtering;
}

static inline unsigned int dsa_port_lag_id_get(struct dsa_port *dp)
{
        return dp->lag ? dp->lag->id : 0;
}

static inline struct net_device *dsa_port_lag_dev_get(struct dsa_port *dp)
{
        return dp->lag ? dp->lag->dev : NULL;
}

/* Return true if the LAG device of @dp (NULL when @dp has no LAG) is the
 * net_device of @lag.
 */
static inline bool dsa_port_offloads_lag(struct dsa_port *dp,
                                         const struct dsa_lag *lag)
{
        struct net_device *port_lag_dev = dsa_port_lag_dev_get(dp);

        return port_lag_dev == lag->dev;
}

static inline struct net_device *dsa_port_to_conduit(const struct dsa_port *dp)
{
        if (dp->cpu_port_in_lag)
                return dsa_port_lag_dev_get(dp->cpu_dp);

        return dp->cpu_dp->conduit;
}

/* Return the net_device that represents @dp as a bridge port, or NULL if
 * @dp is not attached to a bridge. The LAG device takes precedence over the
 * HSR device, which in turn takes precedence over the port's own user
 * net_device.
 */
static inline
struct net_device *dsa_port_to_bridge_port(const struct dsa_port *dp)
{
        if (!dp->bridge)
                return NULL;

        if (dp->lag)
                return dp->lag->dev;

        if (dp->hsr_dev)
                return dp->hsr_dev;

        return dp->user;
}

/* Return the net_device of the bridge that @dp is attached to, or NULL if
 * @dp has no bridge.
 */
static inline struct net_device *
dsa_port_bridge_dev_get(const struct dsa_port *dp)
{
        if (!dp->bridge)
                return NULL;

        return dp->bridge->dev;
}

static inline unsigned int dsa_port_bridge_num_get(struct dsa_port *dp)
{
        return dp->bridge ? dp->bridge->num : 0;
}

/* Return true if ports @a and @b are attached to the same bridge device.
 * Two ports without a bridge are never considered to share one.
 */
static inline bool dsa_port_bridge_same(const struct dsa_port *a,
                                        const struct dsa_port *b)
{
        struct net_device *bridge_a = dsa_port_bridge_dev_get(a);
        struct net_device *bridge_b = dsa_port_bridge_dev_get(b);

        /* Standalone ports are not in the same bridge with one another */
        if (!bridge_a || !bridge_b)
                return false;

        return bridge_a == bridge_b;
}

/* Return true if @dev is the net_device that represents @dp as a bridge
 * port, as determined by dsa_port_to_bridge_port().
 */
static inline bool dsa_port_offloads_bridge_port(struct dsa_port *dp,
                                                 const struct net_device *dev)
{
        const struct net_device *brport_dev = dsa_port_to_bridge_port(dp);

        return brport_dev == dev;
}

/* Return true if @bridge_dev is the bridge device that @dp is attached to */
static inline bool
dsa_port_offloads_bridge_dev(struct dsa_port *dp,
                             const struct net_device *bridge_dev)
{
        const struct net_device *port_bridge_dev;

        /* DSA ports connected to a bridge, and event was emitted
         * for the bridge.
         */
        port_bridge_dev = dsa_port_bridge_dev_get(dp);

        return port_bridge_dev == bridge_dev;
}

/* Return true if the bridge device that @dp is attached to (NULL when it
 * has none) is the net_device of @bridge.
 */
static inline bool dsa_port_offloads_bridge(struct dsa_port *dp,
                                            const struct dsa_bridge *bridge)
{
        const struct net_device *port_bridge_dev = dsa_port_bridge_dev_get(dp);

        return port_bridge_dev == bridge->dev;
}

/* Returns true if any port of this tree offloads the given net_device */
static inline bool dsa_tree_offloads_bridge_port(struct dsa_switch_tree *dst,
                                                 const struct net_device *dev)
{
        struct dsa_port *dp;

        list_for_each_entry(dp, &dst->ports, list)
                if (dsa_port_offloads_bridge_port(dp, dev))
                        return true;

        return false;
}

/* Returns true if any port of this tree offloads the given bridge */
static inline bool
dsa_tree_offloads_bridge_dev(struct dsa_switch_tree *dst,
                             const struct net_device *bridge_dev)
{
        struct dsa_port *dp;

        list_for_each_entry(dp, &dst->ports, list)
                if (dsa_port_offloads_bridge_dev(dp, bridge_dev))
                        return true;

        return false;
}

/* Return true if the switches of ports @a and @b belong to the same switch
 * tree.
 */
static inline bool dsa_port_tree_same(const struct dsa_port *a,
                                      const struct dsa_port *b)
{
        const struct dsa_switch_tree *tree_a = a->ds->dst;
        const struct dsa_switch_tree *tree_b = b->ds->dst;

        return tree_a == tree_b;
}

/* Function type of an FDB dump callback. It receives a MAC address, a VLAN
 * ID, a flag telling whether the entry is static, and an opaque @data
 * pointer. NOTE(review): the meaning of the int return value is not visible
 * here - presumably non-zero aborts the dump; confirm against the callers.
 */
typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid,
                              bool is_static, void *data);
/*
 * struct dsa_switch_ops - table of function pointers implemented by a switch
 * driver.
 *
 * Every method takes the struct dsa_switch it operates on as its first
 * argument; per-port methods additionally take an "int port" argument.
 * The members are grouped by feature area, each group introduced by its own
 * comment. As stated in those comments, @get_tag_protocol is mandatory and
 * @port_setup/@port_teardown are mandatory for drivers that register devlink
 * port regions.
 */
struct dsa_switch_ops {
        /*
         * Tagging protocol helpers called for the CPU ports and DSA links.
         * @get_tag_protocol retrieves the initial tagging protocol and is
         * mandatory. Switches which can operate using multiple tagging
         * protocols should implement @change_tag_protocol and report in
         * @get_tag_protocol the tagger in current use.
         */
        enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds,
                                                  int port,
                                                  enum dsa_tag_protocol mprot);
        int        (*change_tag_protocol)(struct dsa_switch *ds,
                                       enum dsa_tag_protocol proto);
        /*
         * Method for switch drivers to connect to the tagging protocol driver
         * in current use. The switch driver can provide handlers for certain
         * types of packets for switch management.
         */
        int        (*connect_tag_protocol)(struct dsa_switch *ds,
                                        enum dsa_tag_protocol proto);

        int        (*port_change_conduit)(struct dsa_switch *ds, int port,
                                       struct net_device *conduit,
                                       struct netlink_ext_ack *extack);

        /* Optional switch-wide initialization and destruction methods */
        int        (*setup)(struct dsa_switch *ds);
        void        (*teardown)(struct dsa_switch *ds);

        /* Per-port initialization and destruction methods. Mandatory if the
         * driver registers devlink port regions, optional otherwise.
         */
        int        (*port_setup)(struct dsa_switch *ds, int port);
        void        (*port_teardown)(struct dsa_switch *ds, int port);

        u32        (*get_phy_flags)(struct dsa_switch *ds, int port);

        /*
         * Access to the switch's PHY registers.
         */
        int        (*phy_read)(struct dsa_switch *ds, int port, int regnum);
        int        (*phy_write)(struct dsa_switch *ds, int port,
                             int regnum, u16 val);

        /*
         * PHYLINK integration
         */
        void        (*phylink_get_caps)(struct dsa_switch *ds, int port,
                                    struct phylink_config *config);
        void        (*phylink_fixed_state)(struct dsa_switch *ds, int port,
                                       struct phylink_link_state *state);
        /*
         * Port statistics counters.
         */
        void        (*get_strings)(struct dsa_switch *ds, int port,
                               u32 stringset, uint8_t *data);
        void        (*get_ethtool_stats)(struct dsa_switch *ds,
                                     int port, uint64_t *data);
        int        (*get_sset_count)(struct dsa_switch *ds, int port, int sset);
        void        (*get_ethtool_phy_stats)(struct dsa_switch *ds,
                                         int port, uint64_t *data);
        void        (*get_eth_phy_stats)(struct dsa_switch *ds, int port,
                                     struct ethtool_eth_phy_stats *phy_stats);
        void        (*get_eth_mac_stats)(struct dsa_switch *ds, int port,
                                     struct ethtool_eth_mac_stats *mac_stats);
        void        (*get_eth_ctrl_stats)(struct dsa_switch *ds, int port,
                                      struct ethtool_eth_ctrl_stats *ctrl_stats);
        void        (*get_rmon_stats)(struct dsa_switch *ds, int port,
                                  struct ethtool_rmon_stats *rmon_stats,
                                  const struct ethtool_rmon_hist_range **ranges);
        void        (*get_ts_stats)(struct dsa_switch *ds, int port,
                                struct ethtool_ts_stats *ts_stats);
        void        (*get_stats64)(struct dsa_switch *ds, int port,
                                   struct rtnl_link_stats64 *s);
        void        (*get_pause_stats)(struct dsa_switch *ds, int port,
                                   struct ethtool_pause_stats *pause_stats);
        void        (*self_test)(struct dsa_switch *ds, int port,
                             struct ethtool_test *etest, u64 *data);

        /*
         * ethtool Wake-on-LAN
         */
        void        (*get_wol)(struct dsa_switch *ds, int port,
                           struct ethtool_wolinfo *w);
        int        (*set_wol)(struct dsa_switch *ds, int port,
                           struct ethtool_wolinfo *w);

        /*
         * ethtool timestamp info
         */
        int        (*get_ts_info)(struct dsa_switch *ds, int port,
                               struct kernel_ethtool_ts_info *ts);

        /*
         * ethtool MAC merge layer
         */
        int        (*get_mm)(struct dsa_switch *ds, int port,
                          struct ethtool_mm_state *state);
        int        (*set_mm)(struct dsa_switch *ds, int port,
                          struct ethtool_mm_cfg *cfg,
                          struct netlink_ext_ack *extack);
        void        (*get_mm_stats)(struct dsa_switch *ds, int port,
                                struct ethtool_mm_stats *stats);

        /*
         * DCB ops
         */
        int        (*port_get_default_prio)(struct dsa_switch *ds, int port);
        int        (*port_set_default_prio)(struct dsa_switch *ds, int port,
                                         u8 prio);
        int        (*port_get_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp);
        int        (*port_add_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
                                      u8 prio);
        int        (*port_del_dscp_prio)(struct dsa_switch *ds, int port, u8 dscp,
                                      u8 prio);
        int        (*port_set_apptrust)(struct dsa_switch *ds, int port,
                                     const u8 *sel, int nsel);
        int        (*port_get_apptrust)(struct dsa_switch *ds, int port, u8 *sel,
                                     int *nsel);

        /*
         * Suspend and resume
         */
        int        (*suspend)(struct dsa_switch *ds);
        int        (*resume)(struct dsa_switch *ds);

        /*
         * Port enable/disable
         */
        int        (*port_enable)(struct dsa_switch *ds, int port,
                               struct phy_device *phy);
        void        (*port_disable)(struct dsa_switch *ds, int port);


        /*
         * Notification for MAC address changes on user ports. Drivers can
         * currently only veto operations. They should not use the method to
         * program the hardware, since the operation is not rolled back in case
         * of other errors.
         */
        int        (*port_set_mac_address)(struct dsa_switch *ds, int port,
                                        const unsigned char *addr);

        /*
         * Compatibility between device trees defining multiple CPU ports and
         * drivers which are not OK to use by default the numerically smallest
         * CPU port of a switch for its local ports. This can return NULL,
         * meaning "don't know/don't care".
         */
        struct dsa_port *(*preferred_default_local_cpu_port)(struct dsa_switch *ds);

        /*
         * Port's MAC EEE settings
         */
        bool        (*support_eee)(struct dsa_switch *ds, int port);
        int        (*set_mac_eee)(struct dsa_switch *ds, int port,
                               struct ethtool_keee *e);

        /* EEPROM access */
        int        (*get_eeprom_len)(struct dsa_switch *ds);
        int        (*get_eeprom)(struct dsa_switch *ds,
                              struct ethtool_eeprom *eeprom, u8 *data);
        int        (*set_eeprom)(struct dsa_switch *ds,
                              struct ethtool_eeprom *eeprom, u8 *data);

        /*
         * Register access.
         */
        int        (*get_regs_len)(struct dsa_switch *ds, int port);
        void        (*get_regs)(struct dsa_switch *ds, int port,
                            struct ethtool_regs *regs, void *p);

        /*
         * Upper device tracking.
         */
        int        (*port_prechangeupper)(struct dsa_switch *ds, int port,
                                       struct netdev_notifier_changeupper_info *info);

        /*
         * Bridge integration
         */
        int        (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs);
        int        (*port_bridge_join)(struct dsa_switch *ds, int port,
                                    struct dsa_bridge bridge,
                                    bool *tx_fwd_offload,
                                    struct netlink_ext_ack *extack);
        void        (*port_bridge_leave)(struct dsa_switch *ds, int port,
                                     struct dsa_bridge bridge);
        void        (*port_stp_state_set)(struct dsa_switch *ds, int port,
                                      u8 state);
        int        (*port_mst_state_set)(struct dsa_switch *ds, int port,
                                      const struct switchdev_mst_state *state);
        void        (*port_fast_age)(struct dsa_switch *ds, int port);
        int        (*port_vlan_fast_age)(struct dsa_switch *ds, int port, u16 vid);
        int        (*port_pre_bridge_flags)(struct dsa_switch *ds, int port,
                                         struct switchdev_brport_flags flags,
                                         struct netlink_ext_ack *extack);
        int        (*port_bridge_flags)(struct dsa_switch *ds, int port,
                                     struct switchdev_brport_flags flags,
                                     struct netlink_ext_ack *extack);
        void        (*port_set_host_flood)(struct dsa_switch *ds, int port,
                                       bool uc, bool mc);

        /*
         * VLAN support
         */
        int        (*port_vlan_filtering)(struct dsa_switch *ds, int port,
                                       bool vlan_filtering,
                                       struct netlink_ext_ack *extack);
        int        (*port_vlan_add)(struct dsa_switch *ds, int port,
                                 const struct switchdev_obj_port_vlan *vlan,
                                 struct netlink_ext_ack *extack);
        int        (*port_vlan_del)(struct dsa_switch *ds, int port,
                                 const struct switchdev_obj_port_vlan *vlan);
        int        (*vlan_msti_set)(struct dsa_switch *ds, struct dsa_bridge bridge,
                                 const struct switchdev_vlan_msti *msti);

        /*
         * Forwarding database
         */
        int        (*port_fdb_add)(struct dsa_switch *ds, int port,
                                const unsigned char *addr, u16 vid,
                                struct dsa_db db);
        int        (*port_fdb_del)(struct dsa_switch *ds, int port,
                                const unsigned char *addr, u16 vid,
                                struct dsa_db db);
        int        (*port_fdb_dump)(struct dsa_switch *ds, int port,
                                 dsa_fdb_dump_cb_t *cb, void *data);
        int        (*lag_fdb_add)(struct dsa_switch *ds, struct dsa_lag lag,
                               const unsigned char *addr, u16 vid,
                               struct dsa_db db);
        int        (*lag_fdb_del)(struct dsa_switch *ds, struct dsa_lag lag,
                               const unsigned char *addr, u16 vid,
                               struct dsa_db db);

        /*
         * Multicast database
         */
        int        (*port_mdb_add)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_port_mdb *mdb,
                                struct dsa_db db);
        int        (*port_mdb_del)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_port_mdb *mdb,
                                struct dsa_db db);
        /*
         * RXNFC
         */
        int        (*get_rxnfc)(struct dsa_switch *ds, int port,
                             struct ethtool_rxnfc *nfc, u32 *rule_locs);
        int        (*set_rxnfc)(struct dsa_switch *ds, int port,
                             struct ethtool_rxnfc *nfc);

        /*
         * TC integration
         */
        int        (*cls_flower_add)(struct dsa_switch *ds, int port,
                                  struct flow_cls_offload *cls, bool ingress);
        int        (*cls_flower_del)(struct dsa_switch *ds, int port,
                                  struct flow_cls_offload *cls, bool ingress);
        int        (*cls_flower_stats)(struct dsa_switch *ds, int port,
                                    struct flow_cls_offload *cls, bool ingress);
        int        (*port_mirror_add)(struct dsa_switch *ds, int port,
                                   struct dsa_mall_mirror_tc_entry *mirror,
                                   bool ingress, struct netlink_ext_ack *extack);
        void        (*port_mirror_del)(struct dsa_switch *ds, int port,
                                   struct dsa_mall_mirror_tc_entry *mirror);
        int        (*port_policer_add)(struct dsa_switch *ds, int port,
                                    struct dsa_mall_policer_tc_entry *policer);
        void        (*port_policer_del)(struct dsa_switch *ds, int port);
        int        (*port_setup_tc)(struct dsa_switch *ds, int port,
                                 enum tc_setup_type type, void *type_data);

        /*
         * Cross-chip operations
         */
        int        (*crosschip_bridge_join)(struct dsa_switch *ds, int tree_index,
                                         int sw_index, int port,
                                         struct dsa_bridge bridge,
                                         struct netlink_ext_ack *extack);
        void        (*crosschip_bridge_leave)(struct dsa_switch *ds, int tree_index,
                                          int sw_index, int port,
                                          struct dsa_bridge bridge);
        int        (*crosschip_lag_change)(struct dsa_switch *ds, int sw_index,
                                        int port);
        int        (*crosschip_lag_join)(struct dsa_switch *ds, int sw_index,
                                      int port, struct dsa_lag lag,
                                      struct netdev_lag_upper_info *info,
                                      struct netlink_ext_ack *extack);
        int        (*crosschip_lag_leave)(struct dsa_switch *ds, int sw_index,
                                       int port, struct dsa_lag lag);

        /*
         * PTP functionality
         */
        int        (*port_hwtstamp_get)(struct dsa_switch *ds, int port,
                                     struct ifreq *ifr);
        int        (*port_hwtstamp_set)(struct dsa_switch *ds, int port,
                                     struct ifreq *ifr);
        void        (*port_txtstamp)(struct dsa_switch *ds, int port,
                                 struct sk_buff *skb);
        bool        (*port_rxtstamp)(struct dsa_switch *ds, int port,
                                 struct sk_buff *skb, unsigned int type);

        /* Devlink parameters, etc */
        int        (*devlink_param_get)(struct dsa_switch *ds, u32 id,
                                     struct devlink_param_gset_ctx *ctx);
        int        (*devlink_param_set)(struct dsa_switch *ds, u32 id,
                                     struct devlink_param_gset_ctx *ctx);
        int        (*devlink_info_get)(struct dsa_switch *ds,
                                    struct devlink_info_req *req,
                                    struct netlink_ext_ack *extack);
        int        (*devlink_sb_pool_get)(struct dsa_switch *ds,
                                       unsigned int sb_index, u16 pool_index,
                                       struct devlink_sb_pool_info *pool_info);
        int        (*devlink_sb_pool_set)(struct dsa_switch *ds, unsigned int sb_index,
                                       u16 pool_index, u32 size,
                                       enum devlink_sb_threshold_type threshold_type,
                                       struct netlink_ext_ack *extack);
        int        (*devlink_sb_port_pool_get)(struct dsa_switch *ds, int port,
                                            unsigned int sb_index, u16 pool_index,
                                            u32 *p_threshold);
        int        (*devlink_sb_port_pool_set)(struct dsa_switch *ds, int port,
                                            unsigned int sb_index, u16 pool_index,
                                            u32 threshold,
                                            struct netlink_ext_ack *extack);
        int        (*devlink_sb_tc_pool_bind_get)(struct dsa_switch *ds, int port,
                                               unsigned int sb_index, u16 tc_index,
                                               enum devlink_sb_pool_type pool_type,
                                               u16 *p_pool_index, u32 *p_threshold);
        int        (*devlink_sb_tc_pool_bind_set)(struct dsa_switch *ds, int port,
                                               unsigned int sb_index, u16 tc_index,
                                               enum devlink_sb_pool_type pool_type,
                                               u16 pool_index, u32 threshold,
                                               struct netlink_ext_ack *extack);
        int        (*devlink_sb_occ_snapshot)(struct dsa_switch *ds,
                                           unsigned int sb_index);
        int        (*devlink_sb_occ_max_clear)(struct dsa_switch *ds,
                                            unsigned int sb_index);
        int        (*devlink_sb_occ_port_pool_get)(struct dsa_switch *ds, int port,
                                                unsigned int sb_index, u16 pool_index,
                                                u32 *p_cur, u32 *p_max);
        int        (*devlink_sb_occ_tc_port_bind_get)(struct dsa_switch *ds, int port,
                                                   unsigned int sb_index, u16 tc_index,
                                                   enum devlink_sb_pool_type pool_type,
                                                   u32 *p_cur, u32 *p_max);

        /*
         * MTU change functionality. Switches can also adjust their MRU through
         * this method. By MTU, one understands the SDU (L2 payload) length.
         * If the switch needs to account for the DSA tag on the CPU port, this
         * method needs to do so privately.
         */
        int        (*port_change_mtu)(struct dsa_switch *ds, int port,
                                   int new_mtu);
        int        (*port_max_mtu)(struct dsa_switch *ds, int port);

        /*
         * LAG integration
         */
        int        (*port_lag_change)(struct dsa_switch *ds, int port);
        int        (*port_lag_join)(struct dsa_switch *ds, int port,
                                 struct dsa_lag lag,
                                 struct netdev_lag_upper_info *info,
                                 struct netlink_ext_ack *extack);
        int        (*port_lag_leave)(struct dsa_switch *ds, int port,
                                  struct dsa_lag lag);

        /*
         * HSR integration
         */
        int        (*port_hsr_join)(struct dsa_switch *ds, int port,
                                 struct net_device *hsr,
                                 struct netlink_ext_ack *extack);
        int        (*port_hsr_leave)(struct dsa_switch *ds, int port,
                                  struct net_device *hsr);

        /*
         * MRP integration
         */
        int        (*port_mrp_add)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_mrp *mrp);
        int        (*port_mrp_del)(struct dsa_switch *ds, int port,
                                const struct switchdev_obj_mrp *mrp);
        int        (*port_mrp_add_ring_role)(struct dsa_switch *ds, int port,
                                          const struct switchdev_obj_ring_role_mrp *mrp);
        int        (*port_mrp_del_ring_role)(struct dsa_switch *ds, int port,
                                          const struct switchdev_obj_ring_role_mrp *mrp);

        /*
         * tag_8021q operations
         */
        int        (*tag_8021q_vlan_add)(struct dsa_switch *ds, int port, u16 vid,
                                      u16 flags);
        int        (*tag_8021q_vlan_del)(struct dsa_switch *ds, int port, u16 vid);

        /*
         * DSA conduit tracking operations
         */
        void        (*conduit_state_change)(struct dsa_switch *ds,
                                        const struct net_device *conduit,
                                        bool operational);
};

/* Shorthand for DEVLINK_PARAM_DRIVER() that plugs in dsa_devlink_param_get()
 * and dsa_devlink_param_set() as the get/set handlers and passes NULL as the
 * final argument.
 */
#define DSA_DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes)                \
        DEVLINK_PARAM_DRIVER(_id, _name, _type, _cmodes,                \
                             dsa_devlink_param_get, dsa_devlink_param_set, NULL)

/* Devlink parameter handlers; these are the functions that
 * DSA_DEVLINK_PARAM_DRIVER() installs as get/set callbacks.
 */
int dsa_devlink_param_get(struct devlink *dl, u32 id,
                          struct devlink_param_gset_ctx *ctx);
int dsa_devlink_param_set(struct devlink *dl, u32 id,
                          struct devlink_param_gset_ctx *ctx,
                          struct netlink_ext_ack *extack);
/* Register/unregister an array of @params_count devlink parameters for @ds. */
int dsa_devlink_params_register(struct dsa_switch *ds,
                                const struct devlink_param *params,
                                size_t params_count);
void dsa_devlink_params_unregister(struct dsa_switch *ds,
                                   const struct devlink_param *params,
                                   size_t params_count);
/* Devlink resource registration for @ds. */
int dsa_devlink_resource_register(struct dsa_switch *ds,
                                  const char *resource_name,
                                  u64 resource_size,
                                  u64 resource_id,
                                  u64 parent_resource_id,
                                  const struct devlink_resource_size_params *size_params);

void dsa_devlink_resources_unregister(struct dsa_switch *ds);

/* Attach/detach an occupancy getter (@occ_get, called with @occ_get_priv)
 * for the resource identified by @resource_id.
 */
void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
                                           u64 resource_id,
                                           devlink_resource_occ_get_t *occ_get,
                                           void *occ_get_priv);
void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
                                             u64 resource_id);
/* Devlink region creation, switch-wide and per-port, and destruction. */
struct devlink_region *
dsa_devlink_region_create(struct dsa_switch *ds,
                          const struct devlink_region_ops *ops,
                          u32 region_max_snapshots, u64 region_size);
struct devlink_region *
dsa_devlink_port_region_create(struct dsa_switch *ds,
                               int port,
                               const struct devlink_port_region_ops *ops,
                               u32 region_max_snapshots, u64 region_size);
void dsa_devlink_region_destroy(struct devlink_region *region);

/* Look up the dsa_port associated with @netdev (NOTE(review): failure
 * convention is not visible in this header -- check the definition).
 */
struct dsa_port *dsa_port_from_netdev(struct net_device *netdev);

/* Layout of the devlink private area (what devlink_priv() returns) for DSA:
 * a single back-pointer to the owning switch, read by dsa_devlink_to_ds()
 * and dsa_devlink_port_to_ds().
 */
struct dsa_devlink_priv {
        struct dsa_switch *ds;
};

/* Fetch the switch back-pointer stored in the private area of devlink
 * instance @dl.
 */
static inline struct dsa_switch *dsa_devlink_to_ds(struct devlink *dl)
{
        struct dsa_devlink_priv *priv;

        priv = devlink_priv(dl);
        return priv->ds;
}

/* Resolve a devlink port to its switch: follow port->devlink and read the
 * switch back-pointer from that devlink instance's private area.
 */
static inline
struct dsa_switch *dsa_devlink_port_to_ds(struct devlink_port *port)
{
        return dsa_devlink_to_ds(port->devlink);
}

/* The DSA port number of a devlink port is simply its ->index field. */
static inline int dsa_devlink_port_to_port(struct devlink_port *port)
{
        int index = port->index;

        return index;
}

/* Pairs a dsa_switch_ops table with a list node.
 * NOTE(review): presumably an entry in a list of registered switch drivers;
 * the list head is not visible here -- confirm against the users.
 */
struct dsa_switch_driver {
        struct list_head        list;
        const struct dsa_switch_ops *ops;
};

/* Query helpers taking the same (ds, port, address-or-mdb, db) arguments as
 * the FDB/MDB switch operations. Going by the names, they report whether the
 * entry also exists in a database other than @db -- confirm against the
 * definitions, which are not part of this header.
 */
bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
                                 const unsigned char *addr, u16 vid,
                                 struct dsa_db db);
bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
                                 const struct switchdev_obj_port_mdb *mdb,
                                 struct dsa_db db);

/* Hot-path helper, deliberately kept inline. With CONFIG_NET_DSA enabled it
 * is true only when @dev has a dsa_ptr whose ->rcv hook is set; with DSA
 * disabled it is always false.
 */
static inline bool netdev_uses_dsa(const struct net_device *dev)
{
#if IS_ENABLED(CONFIG_NET_DSA)
        if (!dev->dsa_ptr)
                return false;

        return !!dev->dsa_ptr->rcv;
#else
        return false;
#endif
}

/* Generic flow-dissector helper for every DSA tag that shifts the EtherType
 * to the right (that is all of them except tail tags, which leave dissection
 * intact); such tags can all be handled identically.
 *
 * Outputs:
 *  - @offset: the distance (B - A) between
 *    A. where the real EtherType sits, and
 *    B. the current skb->data (ETH_HLEN bytes into the frame, i.e. 2 bytes
 *       past where the EtherType would normally have been).
 *    In bytes this equals the tagger overhead (needed_headroom); counted in
 *    __be16 units it is half of that.
 *
 *  - @proto: the real EtherType, read from the __be16 slot just before
 *    skb->data + overhead.
 *
 * Does nothing when CONFIG_NET_DSA is disabled.
 */
static inline void dsa_tag_generic_flow_dissect(const struct sk_buff *skb,
                                                __be16 *proto, int *offset)
{
#if IS_ENABLED(CONFIG_NET_DSA)
        int overhead = skb->dev->dsa_ptr->tag_ops->needed_headroom;
        const __be16 *shorts;

        *offset = overhead;
        shorts = (const __be16 *)skb->data;
        *proto = shorts[(overhead / 2) - 1];
#endif
}

/* Switch registration, shutdown and lookup entry points; the definitions
 * live outside this header.
 */
void dsa_unregister_switch(struct dsa_switch *ds);
int dsa_register_switch(struct dsa_switch *ds);
void dsa_switch_shutdown(struct dsa_switch *ds);
struct dsa_switch *dsa_switch_find(int tree_index, int sw_index);
void dsa_flush_workqueue(void);
#ifdef CONFIG_PM_SLEEP
int dsa_switch_suspend(struct dsa_switch *ds);
int dsa_switch_resume(struct dsa_switch *ds);
#else
/* Without CONFIG_PM_SLEEP, suspend and resume are stubs that return 0. */
static inline int dsa_switch_suspend(struct dsa_switch *ds)
{
        return 0;
}
static inline int dsa_switch_resume(struct dsa_switch *ds)
{
        return 0;
}
#endif /* CONFIG_PM_SLEEP */

/* dsa_user_dev_check() has a real implementation only when CONFIG_NET_DSA is
 * enabled; otherwise the inline stub below always answers false.
 */
#if IS_ENABLED(CONFIG_NET_DSA)
bool dsa_user_dev_check(const struct net_device *dev);
#else
static inline bool dsa_user_dev_check(const struct net_device *dev)
{
        return false;
}
#endif

/* Remaining helpers declared here and defined elsewhere. */
netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev);
void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up);
bool dsa_supports_eee(struct dsa_switch *ds, int port);

#endif





























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct skb_array' datastructure.
 *
 *        Author:
 *                Michael S. Tsirkin <mst@redhat.com>
 *
 *        Copyright (C) 2016 Red Hat, Inc.
 *
 *        Limited-size FIFO of skbs. Can be used more or less whenever
 *        sk_buff_head can be used, except you need to know the queue size in
 *        advance.
 *        Implemented as a type-safe wrapper around ptr_ring.
 */

#ifndef _LINUX_SKB_ARRAY_H
#define _LINUX_SKB_ARRAY_H 1

#ifdef __KERNEL__
#include <linux/ptr_ring.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#endif

/* A ptr_ring wrapped in its own type, so the helpers in this header can take
 * and return struct sk_buff pointers instead of void pointers.
 * @ring must remain at offset 0: skb_array_resize_multiple_bh_noprof() casts
 * an array of skb_array pointers to ptr_ring pointers and build-asserts that
 * offsetof(struct skb_array, ring) is zero.
 */
struct skb_array {
        struct ptr_ring ring;
};

/* Variant of skb_array_full() that forwards to __ptr_ring_full(). It may be
 * marginally faster, but a caller polling it in a loop has to insert a
 * compiler barrier such as cpu_relax().
 */
static inline bool __skb_array_full(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return __ptr_ring_full(ring);
}

/* Report whether the underlying ring is full, as told by ptr_ring_full(). */
static inline bool skb_array_full(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_full(ring);
}

/* Queue @skb on the array via ptr_ring_produce(); its return value is passed
 * through unchanged.
 */
static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_produce(ring, skb);
}

/* Queue @skb on the array via ptr_ring_produce_irq(); its return value is
 * passed through unchanged.
 */
static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_produce_irq(ring, skb);
}

/* Queue @skb on the array via ptr_ring_produce_bh(); its return value is
 * passed through unchanged.
 */
static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_produce_bh(ring, skb);
}

/* Queue @skb on the array via ptr_ring_produce_any(); its return value is
 * passed through unchanged.
 */
static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_produce_any(ring, skb);
}

/* Variant of skb_array_empty() that forwards to __ptr_ring_empty(). It may be
 * marginally faster, but is only safe when the array is never resized, and a
 * caller polling it in a loop has to insert a compiler barrier such as
 * cpu_relax().
 */
static inline bool __skb_array_empty(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return __ptr_ring_empty(ring);
}

/* Return what __ptr_ring_peek() reports for the underlying ring, typed as an
 * skb pointer.
 */
static inline struct sk_buff *__skb_array_peek(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return __ptr_ring_peek(ring);
}

/* Report whether the underlying ring is empty, as told by ptr_ring_empty(). */
static inline bool skb_array_empty(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_empty(ring);
}

/* Report whether the underlying ring is empty, as told by
 * ptr_ring_empty_bh().
 */
static inline bool skb_array_empty_bh(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_empty_bh(ring);
}

/* Report whether the underlying ring is empty, as told by
 * ptr_ring_empty_irq().
 */
static inline bool skb_array_empty_irq(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_empty_irq(ring);
}

/* Report whether the underlying ring is empty, as told by
 * ptr_ring_empty_any().
 */
static inline bool skb_array_empty_any(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_empty_any(ring);
}

/* Take one entry off the ring with __ptr_ring_consume() and hand it back as
 * an skb pointer.
 */
static inline struct sk_buff *__skb_array_consume(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return __ptr_ring_consume(ring);
}

/* Take one entry off the ring with ptr_ring_consume() and hand it back as an
 * skb pointer.
 */
static inline struct sk_buff *skb_array_consume(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_consume(ring);
}

/* Batched consume: passes @array (viewed as void pointers) and the count @n
 * to ptr_ring_consume_batched() and returns its result.
 */
static inline int skb_array_consume_batched(struct skb_array *a,
                                            struct sk_buff **array, int n)
{
        struct ptr_ring *ring = &a->ring;
        void **slots = (void **)array;

        return ptr_ring_consume_batched(ring, slots, n);
}

/* Take one entry off the ring with ptr_ring_consume_irq() and hand it back as
 * an skb pointer.
 */
static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_consume_irq(ring);
}

/* Batched consume: passes @array (viewed as void pointers) and the count @n
 * to ptr_ring_consume_batched_irq() and returns its result.
 */
static inline int skb_array_consume_batched_irq(struct skb_array *a,
                                                struct sk_buff **array, int n)
{
        struct ptr_ring *ring = &a->ring;
        void **slots = (void **)array;

        return ptr_ring_consume_batched_irq(ring, slots, n);
}

/* Take one entry off the ring with ptr_ring_consume_any() and hand it back as
 * an skb pointer.
 */
static inline struct sk_buff *skb_array_consume_any(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_consume_any(ring);
}

/* Batched consume: passes @array (viewed as void pointers) and the count @n
 * to ptr_ring_consume_batched_any() and returns its result.
 */
static inline int skb_array_consume_batched_any(struct skb_array *a,
                                                struct sk_buff **array, int n)
{
        struct ptr_ring *ring = &a->ring;
        void **slots = (void **)array;

        return ptr_ring_consume_batched_any(ring, slots, n);
}


/* Take one entry off the ring with ptr_ring_consume_bh() and hand it back as
 * an skb pointer.
 */
static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        return ptr_ring_consume_bh(ring);
}

/* Batched consume: passes @array (viewed as void pointers) and the count @n
 * to ptr_ring_consume_batched_bh() and returns its result.
 */
static inline int skb_array_consume_batched_bh(struct skb_array *a,
                                               struct sk_buff **array, int n)
{
        struct ptr_ring *ring = &a->ring;
        void **slots = (void **)array;

        return ptr_ring_consume_batched_bh(ring, slots, n);
}

/* Length of @skb including a VLAN header when a VLAN tag is present:
 * skb->len, plus VLAN_HLEN if skb_vlan_tag_present(). A NULL @skb yields 0.
 */
static inline int __skb_array_len_with_tag(struct sk_buff *skb)
{
        int total;

        if (unlikely(!skb))
                return 0;

        total = skb->len;
        if (skb_vlan_tag_present(skb))
                total += VLAN_HLEN;

        return total;
}

/*
 * Apply __skb_array_len_with_tag() through PTR_RING_PEEK_CALL() on the
 * ring of @a and return the resulting length.
 */
static inline int skb_array_peek_len(struct skb_array *a)
{
        int len = PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag);

        return len;
}

/*
 * Apply __skb_array_len_with_tag() through PTR_RING_PEEK_CALL_IRQ() on the
 * ring of @a and return the resulting length.
 */
static inline int skb_array_peek_len_irq(struct skb_array *a)
{
        int len = PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag);

        return len;
}

/*
 * Apply __skb_array_len_with_tag() through PTR_RING_PEEK_CALL_BH() on the
 * ring of @a and return the resulting length.
 */
static inline int skb_array_peek_len_bh(struct skb_array *a)
{
        int len = PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag);

        return len;
}

/*
 * Apply __skb_array_len_with_tag() through PTR_RING_PEEK_CALL_ANY() on the
 * ring of @a and return the resulting length.
 */
static inline int skb_array_peek_len_any(struct skb_array *a)
{
        int len = PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag);

        return len;
}

/*
 * Initialise the ring of @a with @size and @gfp by calling
 * ptr_ring_init_noprof(); its return value is passed through unchanged.
 * Callers normally go through the skb_array_init() wrapper below, which
 * routes the call through alloc_hooks().
 */
static inline int skb_array_init_noprof(struct skb_array *a, int size, gfp_t gfp)
{
        int err = ptr_ring_init_noprof(&a->ring, size, gfp);

        return err;
}
#define skb_array_init(...)        alloc_hooks(skb_array_init_noprof(__VA_ARGS__))

/*
 * Destructor callback with a void * signature, handed to the ptr_ring
 * helpers in this file; it releases @ptr with kfree_skb().
 */
static void __skb_array_destroy_skb(void *ptr)
{
        struct sk_buff *skb = ptr;

        kfree_skb(skb);
}

/*
 * Hand @n entries from @skbs to ptr_ring_unconsume() on the ring of @a,
 * with __skb_array_destroy_skb() supplied as the destroy callback.
 */
static inline void skb_array_unconsume(struct skb_array *a,
                                       struct sk_buff **skbs, int n)
{
        void **batch = (void **)skbs;

        ptr_ring_unconsume(&a->ring, batch, n, __skb_array_destroy_skb);
}

/*
 * Resize the ring of @a to @size using ptr_ring_resize(), with
 * __skb_array_destroy_skb() supplied as the destroy callback.  The
 * ptr_ring_resize() return value is passed through unchanged.
 */
static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp)
{
        int err = ptr_ring_resize(&a->ring, size, gfp,
                                  __skb_array_destroy_skb);

        return err;
}

/*
 * Resize @nrings arrays at once by treating @rings as an array of
 * struct ptr_ring pointers and calling ptr_ring_resize_multiple_bh_noprof().
 * Callers normally go through the skb_array_resize_multiple_bh() wrapper
 * below, which routes the call through alloc_hooks().
 */
static inline int skb_array_resize_multiple_bh_noprof(struct skb_array **rings,
                                                      int nrings,
                                                      unsigned int size,
                                                      gfp_t gfp)
{
        struct ptr_ring **ptr_rings = (struct ptr_ring **)rings;

        /* The pointer cast above relies on 'ring' sitting at offset 0. */
        BUILD_BUG_ON(offsetof(struct skb_array, ring));

        return ptr_ring_resize_multiple_bh_noprof(ptr_rings, nrings, size,
                                                  gfp,
                                                  __skb_array_destroy_skb);
}
#define skb_array_resize_multiple_bh(...)        \
                alloc_hooks(skb_array_resize_multiple_bh_noprof(__VA_ARGS__))

/*
 * Tear down the ring of @a via ptr_ring_cleanup(), with
 * __skb_array_destroy_skb() supplied as the destroy callback.
 */
static inline void skb_array_cleanup(struct skb_array *a)
{
        struct ptr_ring *ring = &a->ring;

        ptr_ring_cleanup(ring, __skb_array_destroy_skb);
}

#endif /* _LINUX_SKB_ARRAY_H  */




































  290 




  265 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NF_CONNTRACK_COMMON_H
#define _NF_CONNTRACK_COMMON_H

#include <linux/refcount.h>
#include <uapi/linux/netfilter/nf_conntrack_common.h>

/*
 * Plain collection of unsigned int counters.  The field names suggest
 * per-event connection-tracking statistics; where and how they are updated
 * is not visible from this header.
 */
struct ip_conntrack_stat {
        unsigned int found;
        unsigned int invalid;
        unsigned int insert;
        unsigned int insert_failed;
        unsigned int clash_resolve;
        unsigned int drop;
        unsigned int early_drop;
        unsigned int error;
        unsigned int expect_new;
        unsigned int expect_create;
        unsigned int expect_delete;
        unsigned int search_restart;
        unsigned int chaintoolong;
};

/*
 * NFCT_INFOMASK selects the three least significant bits (value 7);
 * NFCT_PTRMASK is its bitwise complement.
 * NOTE(review): the names suggest a pointer and an "info" value packed into
 * one word - confirm against the users of these masks.
 */
#define NFCT_INFOMASK        7UL
#define NFCT_PTRMASK        ~(NFCT_INFOMASK)

/*
 * Reference-counted base object: @use is the reference count manipulated
 * by nf_conntrack_get() and nf_conntrack_put().
 */
struct nf_conntrack {
        refcount_t use;
};

/*
 * Defined outside this header; nf_conntrack_put() calls it once the use
 * count of @nfct has dropped to zero.
 */
void nf_conntrack_destroy(struct nf_conntrack *nfct);

/*
 * like nf_ct_put, but without module dependency on nf_conntrack
 *
 * Drops one reference on @nfct (NULL is accepted and ignored) and calls
 * nf_conntrack_destroy() when that was the last one.
 */
static inline void nf_conntrack_put(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        if (refcount_dec_and_test(&nfct->use))
                nf_conntrack_destroy(nfct);
}
/* Take one reference on @nfct; a NULL @nfct is accepted and ignored. */
static inline void nf_conntrack_get(struct nf_conntrack *nfct)
{
        if (!nfct)
                return;

        refcount_inc(&nfct->use);
}

#endif /* _NF_CONNTRACK_COMMON_H */




















   99 
    2 


    3 





   96 















    3 




    1 


    1 







    1 






























   50 








   50 
   48 







    1 

























    3 























   35 


    2 








    2 

















    2 
    2 

   35 








   37 
    2 






   58 


    1 

   46 






   40 








    2 











    1 



   35 






    1 

   35 














    1 



    2 















    7 


    1 

    4 




    2 















   44 




    7 



































































































































































    9 







   21 





   17 


    8 


    6 














   30 







   30 






    7 









   23 

   14 

   13 






    2 





    1 






   14 


    5 


    3 







    3 










    1 




















   16 

   12 







   68 
   16 







   58 





   21 
   15 







    7 





   15 
   11 

    2 















    1 





    1 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VGIC: KVM DEVICE API
 *
 * Copyright (C) 2015 ARM Ltd.
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
#include <linux/uaccess.h>
#include <asm/kvm_mmu.h>
#include <asm/cputype.h>
#include "vgic.h"

/* common helpers */

/*
 * Validate a proposed I/O range [@addr, @addr + @size) for a VGIC region.
 *
 * @kvm:       VM whose physical address limits are checked
 * @ioaddr:    currently stored base address of the region
 * @addr:      proposed base address
 * @alignment: required alignment for both @addr and @size
 * @size:      size of the region
 *
 * Returns 0 if the range is acceptable, -EEXIST if @ioaddr is already
 * defined, -EINVAL on misalignment or address wrap-around, and -E2BIG if
 * the range does not fit in the VM's physical address space.
 */
int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
                       phys_addr_t addr, phys_addr_t alignment,
                       phys_addr_t size)
{
        const phys_addr_t end = addr + size;

        /* A base that is no longer "undefined" has been set before. */
        if (!IS_VGIC_ADDR_UNDEF(ioaddr))
                return -EEXIST;

        if (!(IS_ALIGNED(addr, alignment) && IS_ALIGNED(size, alignment)))
                return -EINVAL;

        /* addr + size wrapped around. */
        if (end < addr)
                return -EINVAL;

        if ((addr & ~kvm_phys_mask(&kvm->arch.mmu)) ||
            end > kvm_phys_size(&kvm->arch.mmu))
                return -E2BIG;

        return 0;
}

/*
 * Returns 0 when the VM's vgic_model equals @type_needed, -ENODEV
 * otherwise.
 */
static int vgic_check_type(struct kvm *kvm, int type_needed)
{
        return (kvm->arch.vgic.vgic_model == type_needed) ? 0 : -ENODEV;
}

/*
 * Set the GICv2 distributor or CPU interface base address from a
 * struct kvm_arm_device_addr.  The region is selected by the device type
 * field of @dev_addr->id; the new base is stored only if the VM uses a
 * GICv2 model and vgic_check_iorange() accepts the range.  Runs with
 * kvm->arch.config_lock held.
 *
 * Returns 0 on success, -ENODEV for an unknown region type (or a model
 * mismatch), or the error reported by vgic_check_iorange().
 */
int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr)
{
        struct vgic_dist *vgic = &kvm->arch.vgic;
        int r;

        mutex_lock(&kvm->arch.config_lock);

        switch (FIELD_GET(KVM_ARM_DEVICE_TYPE_MASK, dev_addr->id)) {
        case KVM_VGIC_V2_ADDR_TYPE_DIST:
                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
                if (r)
                        break;

                r = vgic_check_iorange(kvm, vgic->vgic_dist_base,
                                       dev_addr->addr, SZ_4K,
                                       KVM_VGIC_V2_DIST_SIZE);
                if (r)
                        break;

                vgic->vgic_dist_base = dev_addr->addr;
                break;
        case KVM_VGIC_V2_ADDR_TYPE_CPU:
                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
                if (r)
                        break;

                r = vgic_check_iorange(kvm, vgic->vgic_cpu_base,
                                       dev_addr->addr, SZ_4K,
                                       KVM_VGIC_V2_CPU_SIZE);
                if (r)
                        break;

                vgic->vgic_cpu_base = dev_addr->addr;
                break;
        default:
                r = -ENODEV;
                break;
        }

        mutex_unlock(&kvm->arch.config_lock);

        return r;
}

/**
 * kvm_vgic_addr - set or get vgic VM base addresses
 * @kvm:   pointer to the vm struct
 * @attr:  pointer to the attribute being retrieved/updated
 * @write: if true set the address in the VM address space, if false read the
 *          address
 *
 * Set or get the vgic base addresses for the distributor and the virtual CPU
 * interface in the VM physical address space.  These addresses are properties
 * of the emulated core/SoC and therefore user space initially knows this
 * information.
 * Check them for sanity (alignment, double assignment). We can't check for
 * overlapping regions in case of a virtual GICv3 here, since we don't know
 * the number of VCPUs yet, so we defer this check to map_resources().
 *
 * Return: 0 on success; -EFAULT if the initial get_user() fails; -ENODEV for
 * an unknown attribute or a vgic model mismatch; -ENOENT when reading a
 * redistributor region whose index does not exist; -EINVAL when writing a
 * redistributor region with a zero count or non-zero flags; otherwise the
 * error returned by vgic_check_iorange(), vgic_v3_set_redist_base() or the
 * final put_user().
 */
static int kvm_vgic_addr(struct kvm *kvm, struct kvm_device_attr *attr, bool write)
{
        u64 __user *uaddr = (u64 __user *)attr->addr;
        struct vgic_dist *vgic = &kvm->arch.vgic;
        phys_addr_t *addr_ptr, alignment, size;
        u64 undef_value = VGIC_ADDR_UNDEF;
        u64 addr;
        int r;

        /* Reading a redistributor region addr implies getting the index */
        if (write || attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION)
                if (get_user(addr, uaddr))
                        return -EFAULT;

        /*
         * Since we can't hold config_lock while registering the redistributor
         * iodevs, take the slots_lock immediately.
         */
        mutex_lock(&kvm->slots_lock);
        /*
         * The simple cases only select the stored base (addr_ptr) plus the
         * alignment/size used for validation and fall through to the common
         * code after the switch.  The redistributor cases handle writes (and
         * the region read) themselves and jump straight to "out".
         */
        switch (attr->attr) {
        case KVM_VGIC_V2_ADDR_TYPE_DIST:
                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
                addr_ptr = &vgic->vgic_dist_base;
                alignment = SZ_4K;
                size = KVM_VGIC_V2_DIST_SIZE;
                break;
        case KVM_VGIC_V2_ADDR_TYPE_CPU:
                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
                addr_ptr = &vgic->vgic_cpu_base;
                alignment = SZ_4K;
                size = KVM_VGIC_V2_CPU_SIZE;
                break;
        case KVM_VGIC_V3_ADDR_TYPE_DIST:
                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
                addr_ptr = &vgic->vgic_dist_base;
                alignment = SZ_64K;
                size = KVM_VGIC_V3_DIST_SIZE;
                break;
        case KVM_VGIC_V3_ADDR_TYPE_REDIST: {
                struct vgic_redist_region *rdreg;

                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
                if (r)
                        break;
                if (write) {
                        r = vgic_v3_set_redist_base(kvm, 0, addr, 0);
                        goto out;
                }
                /*
                 * Read: report the base of the first region in rd_regions,
                 * or VGIC_ADDR_UNDEF when the list is empty.  alignment and
                 * size stay unset here; they are only used on writes.
                 */
                rdreg = list_first_entry_or_null(&vgic->rd_regions,
                                                 struct vgic_redist_region, list);
                if (!rdreg)
                        addr_ptr = &undef_value;
                else
                        addr_ptr = &rdreg->base;
                break;
        }
        case KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION:
        {
                struct vgic_redist_region *rdreg;
                u8 index;

                r = vgic_check_type(kvm, KVM_DEV_TYPE_ARM_VGIC_V3);
                if (r)
                        break;

                /* The user-supplied value packs index, base, count and flags. */
                index = addr & KVM_VGIC_V3_RDIST_INDEX_MASK;

                if (write) {
                        gpa_t base = addr & KVM_VGIC_V3_RDIST_BASE_MASK;
                        u32 count = FIELD_GET(KVM_VGIC_V3_RDIST_COUNT_MASK, addr);
                        u8 flags = FIELD_GET(KVM_VGIC_V3_RDIST_FLAGS_MASK, addr);

                        if (!count || flags)
                                r = -EINVAL;
                        else
                                r = vgic_v3_set_redist_base(kvm, index,
                                                            base, count);
                        goto out;
                }

                rdreg = vgic_v3_rdist_region_from_index(kvm, index);
                if (!rdreg) {
                        r = -ENOENT;
                        goto out;
                }

                /* Re-pack index, base and count into the value returned to userspace. */
                addr = index;
                addr |= rdreg->base;
                addr |= (u64)rdreg->count << KVM_VGIC_V3_RDIST_COUNT_SHIFT;
                goto out;
        }
        default:
                r = -ENODEV;
        }

        if (r)
                goto out;

        /* Common path: validate-and-store on write, plain load on read. */
        mutex_lock(&kvm->arch.config_lock);
        if (write) {
                r = vgic_check_iorange(kvm, *addr_ptr, addr, alignment, size);
                if (!r)
                        *addr_ptr = addr;
        } else {
                addr = *addr_ptr;
        }
        mutex_unlock(&kvm->arch.config_lock);

out:
        mutex_unlock(&kvm->slots_lock);

        /* Copy the result back to userspace only after all locks are dropped. */
        if (!r && !write)
                r =  put_user(addr, uaddr);

        return r;
}

/*
 * Handle the "set" side of the attribute groups dispatched here by both
 * vgic_v2_set_attr() and vgic_v3_set_attr(): ADDR, NR_IRQS and CTRL.
 *
 * Returns 0 on success, -ENXIO for a group/attribute that is not handled
 * here (including -ENODEV from kvm_vgic_addr(), which is translated),
 * -EFAULT if the user value cannot be read, -EINVAL for an out-of-range
 * NR_IRQS value, -EBUSY if nr_spis is already set or the vCPUs cannot all
 * be locked, or the error returned by the called helper.
 */
static int vgic_set_common_attr(struct kvm_device *dev,
                                struct kvm_device_attr *attr)
{
        int r;

        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ADDR:
                r = kvm_vgic_addr(dev->kvm, attr, true);
                return (r == -ENODEV) ? -ENXIO : r;
        case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
                u32 __user *uaddr = (u32 __user *)(long)attr->addr;
                u32 val;
                int ret = 0;

                if (get_user(val, uaddr))
                        return -EFAULT;

                /*
                 * We require:
                 * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
                 * - at most 1024 interrupts
                 * - a multiple of 32 interrupts
                 */
                if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
                    val > VGIC_MAX_RESERVED ||
                    (val & 31))
                        return -EINVAL;

                mutex_lock(&dev->kvm->arch.config_lock);

                /*
                 * Either userspace has already configured NR_IRQS or
                 * the vgic has already been initialized and vgic_init()
                 * supplied a default amount of SPIs.
                 */
                if (dev->kvm->arch.vgic.nr_spis)
                        ret = -EBUSY;
                else
                        dev->kvm->arch.vgic.nr_spis =
                                val - VGIC_NR_PRIVATE_IRQS;

                mutex_unlock(&dev->kvm->arch.config_lock);

                return ret;
        }
        case KVM_DEV_ARM_VGIC_GRP_CTRL: {
                switch (attr->attr) {
                case KVM_DEV_ARM_VGIC_CTRL_INIT:
                        mutex_lock(&dev->kvm->arch.config_lock);
                        r = vgic_init(dev->kvm);
                        mutex_unlock(&dev->kvm->arch.config_lock);
                        return r;
                case KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES:
                        /*
                         * OK, this one isn't common at all, but we
                         * want to handle all control group attributes
                         * in a single place.
                         */
                        if (vgic_check_type(dev->kvm, KVM_DEV_TYPE_ARM_VGIC_V3))
                                return -ENXIO;
                        /*
                         * Lock order used here: kvm->lock, then all vCPUs,
                         * then config_lock; released in reverse.
                         */
                        mutex_lock(&dev->kvm->lock);

                        if (!lock_all_vcpus(dev->kvm)) {
                                mutex_unlock(&dev->kvm->lock);
                                return -EBUSY;
                        }

                        mutex_lock(&dev->kvm->arch.config_lock);
                        r = vgic_v3_save_pending_tables(dev->kvm);
                        mutex_unlock(&dev->kvm->arch.config_lock);
                        unlock_all_vcpus(dev->kvm);
                        mutex_unlock(&dev->kvm->lock);
                        return r;
                }
                /* Unknown control attribute: fall out to -ENXIO below. */
                break;
        }
        }

        return -ENXIO;
}

static int vgic_get_common_attr(struct kvm_device *dev,
                                struct kvm_device_attr *attr)
{
        int r = -ENXIO;

        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ADDR:
                r = kvm_vgic_addr(dev->kvm, attr, false);
                return (r == -ENODEV) ? -ENXIO : r;
        case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
                u32 __user *uaddr = (u32 __user *)(long)attr->addr;

                r = put_user(dev->kvm->arch.vgic.nr_spis +
                             VGIC_NR_PRIVATE_IRQS, uaddr);
                break;
        }
        case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ: {
                u32 __user *uaddr = (u32 __user *)(long)attr->addr;

                r = put_user(dev->kvm->arch.vgic.mi_intid, uaddr);
                break;
        }
        }

        return r;
}

/* .create callback: delegates to kvm_vgic_create() for the device's VM. */
static int vgic_create(struct kvm_device *dev, u32 type)
{
        struct kvm *kvm = dev->kvm;

        return kvm_vgic_create(kvm, type);
}

/* .destroy callback: frees the kvm_device structure itself. */
static void vgic_destroy(struct kvm_device *dev)
{
        kfree(dev);
}

/*
 * Register the KVM device ops matching @type.  For the v3 type the ITS
 * device is registered as well, once the v3 ops registration succeeded.
 *
 * Returns -ENODEV for an unknown @type, otherwise the result of the
 * registration call(s).
 */
int kvm_register_vgic_device(unsigned long type)
{
        int ret;

        switch (type) {
        case KVM_DEV_TYPE_ARM_VGIC_V2:
                return kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
                                               KVM_DEV_TYPE_ARM_VGIC_V2);
        case KVM_DEV_TYPE_ARM_VGIC_V3:
                ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
                                              KVM_DEV_TYPE_ARM_VGIC_V3);
                if (ret)
                        return ret;

                return kvm_vgic_register_its_device();
        default:
                return -ENODEV;
        }
}

/*
 * Decode a GICv2 register attribute into @reg_attr: the register offset
 * comes from the OFFSET field of attr->attr and the vCPU is looked up by
 * the id stored in the CPUID field.
 *
 * Returns 0 on success or -EINVAL when no vCPU has that id (in which case
 * reg_attr->vcpu is NULL).
 */
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
                       struct vgic_reg_attr *reg_attr)
{
        int cpuid = FIELD_GET(KVM_DEV_ARM_VGIC_CPUID_MASK, attr->attr);
        struct kvm_vcpu *vcpu;

        reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;

        vcpu = kvm_get_vcpu_by_id(dev->kvm, cpuid);
        reg_attr->vcpu = vcpu;

        return vcpu ? 0 : -EINVAL;
}

/**
 * vgic_v2_attr_regs_access - allows user space to access VGIC v2 state
 *
 * @dev:      kvm device handle
 * @attr:     kvm device attribute
 * @is_write: true if userspace is writing a register
 *
 * Return: 0 on success; the error from vgic_v2_parse_attr() or vgic_init();
 * -EFAULT if the value cannot be read from userspace; -EBUSY if not all
 * vCPUs could be locked; -EINVAL for a group other than CPU_REGS/DIST_REGS;
 * otherwise the result of the uaccess helper or the final put_user().
 */
static int vgic_v2_attr_regs_access(struct kvm_device *dev,
                                    struct kvm_device_attr *attr,
                                    bool is_write)
{
        u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr;
        struct vgic_reg_attr reg_attr;
        gpa_t addr;
        struct kvm_vcpu *vcpu;
        int ret;
        u32 val;

        ret = vgic_v2_parse_attr(dev, attr, &reg_attr);
        if (ret)
                return ret;

        vcpu = reg_attr.vcpu;
        addr = reg_attr.addr;

        /* Fetch the user value before any lock is taken. */
        if (is_write)
                if (get_user(val, uaddr))
                        return -EFAULT;

        /* Lock order: kvm->lock, all vCPUs, config_lock; undone at "out". */
        mutex_lock(&dev->kvm->lock);

        if (!lock_all_vcpus(dev->kvm)) {
                mutex_unlock(&dev->kvm->lock);
                return -EBUSY;
        }

        mutex_lock(&dev->kvm->arch.config_lock);

        ret = vgic_init(dev->kvm);
        if (ret)
                goto out;

        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
                ret = vgic_v2_cpuif_uaccess(vcpu, is_write, addr, &val);
                break;
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
                ret = vgic_v2_dist_uaccess(vcpu, is_write, addr, &val);
                break;
        default:
                ret = -EINVAL;
                break;
        }

out:
        mutex_unlock(&dev->kvm->arch.config_lock);
        unlock_all_vcpus(dev->kvm);
        mutex_unlock(&dev->kvm->lock);

        /* On a successful read, copy the value out after dropping the locks. */
        if (!ret && !is_write)
                ret = put_user(val, uaddr);

        return ret;
}

/*
 * .set_attr callback for the v2 device: register groups go to
 * vgic_v2_attr_regs_access(), everything else to vgic_set_common_attr().
 */
static int vgic_v2_set_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
{
        if (attr->group == KVM_DEV_ARM_VGIC_GRP_DIST_REGS ||
            attr->group == KVM_DEV_ARM_VGIC_GRP_CPU_REGS)
                return vgic_v2_attr_regs_access(dev, attr, true);

        return vgic_set_common_attr(dev, attr);
}

/*
 * .get_attr callback for the v2 device: register groups go to
 * vgic_v2_attr_regs_access(), everything else to vgic_get_common_attr().
 */
static int vgic_v2_get_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
{
        if (attr->group == KVM_DEV_ARM_VGIC_GRP_DIST_REGS ||
            attr->group == KVM_DEV_ARM_VGIC_GRP_CPU_REGS)
                return vgic_v2_attr_regs_access(dev, attr, false);

        return vgic_get_common_attr(dev, attr);
}

/*
 * .has_attr callback for the v2 device.  Returns 0 when the group/attribute
 * pair is supported, -ENXIO otherwise; register groups are decided by
 * vgic_v2_has_attr_regs().
 */
static int vgic_v2_has_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ADDR:
                if (attr->attr == KVM_VGIC_V2_ADDR_TYPE_DIST ||
                    attr->attr == KVM_VGIC_V2_ADDR_TYPE_CPU)
                        return 0;
                return -ENXIO;
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
                return vgic_v2_has_attr_regs(dev, attr);
        case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
                return 0;
        case KVM_DEV_ARM_VGIC_GRP_CTRL:
                if (attr->attr == KVM_DEV_ARM_VGIC_CTRL_INIT)
                        return 0;
                return -ENXIO;
        default:
                return -ENXIO;
        }
}

/* KVM device ops table for the v2 VGIC device. */
struct kvm_device_ops kvm_arm_vgic_v2_ops = {
        .name = "kvm-arm-vgic-v2",
        .create = vgic_create,
        .destroy = vgic_destroy,
        .has_attr = vgic_v2_has_attr,
        .get_attr = vgic_v2_get_attr,
        .set_attr = vgic_v2_set_attr,
};

/*
 * Decode a GICv3 register attribute into @reg_attr.
 *
 * For every group except KVM_DEV_ARM_VGIC_GRP_DIST_REGS the vCPU is looked
 * up from the MPIDR field of attr->attr; for DIST_REGS vCPU 0 is used.
 * The register offset is only filled in once a vCPU has been found.
 *
 * Returns 0 on success or -EINVAL when no matching vCPU exists.
 */
int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
                       struct vgic_reg_attr *reg_attr)
{
        struct kvm_vcpu *vcpu;

        if (attr->group == KVM_DEV_ARM_VGIC_GRP_DIST_REGS) {
                /*
                 * For KVM_DEV_ARM_VGIC_GRP_DIST_REGS group,
                 * attr might not hold MPIDR. Hence assume vcpu0.
                 */
                vcpu = kvm_get_vcpu(dev->kvm, 0);
        } else {
                unsigned long vgic_mpidr, mpidr_reg;

                vgic_mpidr = (attr->attr & KVM_DEV_ARM_VGIC_V3_MPIDR_MASK) >>
                              KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT;
                mpidr_reg = VGIC_TO_MPIDR(vgic_mpidr);
                vcpu = kvm_mpidr_to_vcpu(dev->kvm, mpidr_reg);
        }

        reg_attr->vcpu = vcpu;
        if (!vcpu)
                return -EINVAL;

        reg_attr->addr = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;

        return 0;
}

/*
 * vgic_v3_attr_regs_access - allows user space to access VGIC v3 state
 *
 * @dev:      kvm device handle
 * @attr:     kvm device attribute
 * @is_write: true if userspace is writing a register
 *
 * Returns 0 on success; the error from vgic_v3_parse_attr(); -EFAULT if the
 * value cannot be read from userspace; -EBUSY if not all vCPUs could be
 * locked or the vgic initialization state does not match what the group
 * requires; -EINVAL for an unsupported group, level-info type or
 * maintenance IRQ number; otherwise the result of the uaccess helper or the
 * final put_user().
 */
static int vgic_v3_attr_regs_access(struct kvm_device *dev,
                                    struct kvm_device_attr *attr,
                                    bool is_write)
{
        struct vgic_reg_attr reg_attr;
        gpa_t addr;
        struct kvm_vcpu *vcpu;
        bool uaccess, post_init = true;
        u32 val;
        int ret;

        ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
        if (ret)
                return ret;

        vcpu = reg_attr.vcpu;
        addr = reg_attr.addr;

        /*
         * uaccess:   whether this function copies a u32 to/from userspace.
         * post_init: whether the group requires an initialized vgic (true)
         *            or a not-yet-initialized one (false, MAINT_IRQ only).
         */
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
                /* Sysregs uaccess is performed by the sysreg handling code */
                uaccess = false;
                break;
        case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
                post_init = false;
                fallthrough;
        default:
                uaccess = true;
        }

        if (uaccess && is_write) {
                u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr;
                if (get_user(val, uaddr))
                        return -EFAULT;
        }

        /* Lock order: kvm->lock, all vCPUs, config_lock; undone at "out". */
        mutex_lock(&dev->kvm->lock);

        if (!lock_all_vcpus(dev->kvm)) {
                mutex_unlock(&dev->kvm->lock);
                return -EBUSY;
        }

        mutex_lock(&dev->kvm->arch.config_lock);

        /* Reject the access when the init state is not the one required. */
        if (post_init != vgic_initialized(dev->kvm)) {
                ret = -EBUSY;
                goto out;
        }

        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
                ret = vgic_v3_dist_uaccess(vcpu, is_write, addr, &val);
                break;
        case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
                ret = vgic_v3_redist_uaccess(vcpu, is_write, addr, &val);
                break;
        case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
                ret = vgic_v3_cpu_sysregs_uaccess(vcpu, attr, is_write);
                break;
        case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO: {
                unsigned int info, intid;

                info = (attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
                        KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT;
                if (info == VGIC_LEVEL_INFO_LINE_LEVEL) {
                        intid = attr->attr &
                                KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK;
                        ret = vgic_v3_line_level_info_uaccess(vcpu, is_write,
                                                              intid, &val);
                } else {
                        ret = -EINVAL;
                }
                break;
        }
        case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
                if (!is_write) {
                        val = dev->kvm->arch.vgic.mi_intid;
                        ret = 0;
                        break;
                }

                /* Only INTIDs in [VGIC_NR_SGIS, VGIC_NR_PRIVATE_IRQS) are accepted. */
                ret = -EINVAL;
                if ((val < VGIC_NR_PRIVATE_IRQS) && (val >= VGIC_NR_SGIS)) {
                        dev->kvm->arch.vgic.mi_intid = val;
                        ret = 0;
                }
                break;
        default:
                ret = -EINVAL;
                break;
        }

out:
        mutex_unlock(&dev->kvm->arch.config_lock);
        unlock_all_vcpus(dev->kvm);
        mutex_unlock(&dev->kvm->lock);

        /* On a successful read, copy the value out after dropping the locks. */
        if (!ret && uaccess && !is_write) {
                u32 __user *uaddr = (u32 __user *)(unsigned long)attr->addr;
                ret = put_user(val, uaddr);
        }

        return ret;
}

/*
 * .set_attr callback for the v3 device: the groups listed below go to
 * vgic_v3_attr_regs_access(), everything else to vgic_set_common_attr().
 */
static int vgic_v3_set_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
        case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
        case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
                break;
        default:
                return vgic_set_common_attr(dev, attr);
        }

        return vgic_v3_attr_regs_access(dev, attr, true);
}

/*
 * .get_attr callback for the v3 device: the groups listed below go to
 * vgic_v3_attr_regs_access(), everything else to vgic_get_common_attr().
 */
static int vgic_v3_get_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
        case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
        case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
                break;
        default:
                return vgic_get_common_attr(dev, attr);
        }

        return vgic_v3_attr_regs_access(dev, attr, false);
}

/*
 * .has_attr callback for the v3 device.  Returns 0 when the group/attribute
 * pair is supported, -ENXIO otherwise; the DIST/REDIST/CPU_SYSREGS groups
 * are decided by vgic_v3_has_attr_regs().
 */
static int vgic_v3_has_attr(struct kvm_device *dev,
                            struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ADDR:
                if (attr->attr == KVM_VGIC_V3_ADDR_TYPE_DIST ||
                    attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST ||
                    attr->attr == KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION)
                        return 0;
                return -ENXIO;
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
        case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
                return vgic_v3_has_attr_regs(dev, attr);
        case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
        case KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ:
                return 0;
        case KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO:
                /* Only the "line level" info type is supported. */
                if (((attr->attr & KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK) >>
                      KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) !=
                      VGIC_LEVEL_INFO_LINE_LEVEL)
                        return -ENXIO;
                return 0;
        case KVM_DEV_ARM_VGIC_GRP_CTRL:
                if (attr->attr == KVM_DEV_ARM_VGIC_CTRL_INIT ||
                    attr->attr == KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES)
                        return 0;
                return -ENXIO;
        default:
                return -ENXIO;
        }
}

/* KVM device ops table for the v3 VGIC device. */
struct kvm_device_ops kvm_arm_vgic_v3_ops = {
        .name = "kvm-arm-vgic-v3",
        .create = vgic_create,
        .destroy = vgic_destroy,
        .has_attr = vgic_v3_has_attr,
        .get_attr = vgic_v3_get_attr,
        .set_attr = vgic_v3_set_attr,
};










































  443 




    6 




  162 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM timestamp

#if !defined(_TRACE_TIMESTAMP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TIMESTAMP_H

#include <linux/tracepoint.h>
#include <linux/fs.h>

/*
 * Flag table passed to __print_flags() by the ctime_ns_xchg event: a set
 * I_CTIME_QUERIED bit is rendered as "Q".
 */
#define CTIME_QUERIED_FLAGS \
        { I_CTIME_QUERIED, "Q" }

/*
 * Event class shared by the ctime tracepoints.  Each record holds the
 * inode's device, inode number and generation plus the seconds and
 * nanoseconds of the timestamp passed in @ctime.
 *
 * @inode: inode the timestamp belongs to
 * @ctime: timestamp being reported
 */
DECLARE_EVENT_CLASS(ctime,
        TP_PROTO(struct inode *inode,
                 struct timespec64 *ctime),

        TP_ARGS(inode, ctime),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(time64_t,        ctime_s)
                __field(u32,                ctime_ns)
                __field(u32,                gen)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->gen                = inode->i_generation;
                __entry->ctime_s        = ctime->tv_sec;
                __entry->ctime_ns        = ctime->tv_nsec;
        ),

        /* ino_t is an unsigned long, hence %lu rather than %ld. */
        TP_printk("ino=%d:%d:%lu:%u ctime=%lld.%u",
                MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen,
                __entry->ctime_s, __entry->ctime_ns
        )
);

/* inode_set_ctime_to_ts: instance of the "ctime" event class. */
DEFINE_EVENT(ctime, inode_set_ctime_to_ts,
        TP_PROTO(struct inode *inode, struct timespec64 *ctime),
        TP_ARGS(inode, ctime)
);

/* ctime_xchg_skip: instance of the "ctime" event class. */
DEFINE_EVENT(ctime, ctime_xchg_skip,
        TP_PROTO(struct inode *inode, struct timespec64 *ctime),
        TP_ARGS(inode, ctime)
);

/*
 * ctime_ns_xchg: records three 32-bit values (@old, @new, @cur) for an
 * inode, identified by device, inode number and generation.
 *
 * When printed, the I_CTIME_QUERIED bit is masked out of the numeric
 * "old" and "cur" values and shown separately as a "Q" flag; "new" is
 * printed as-is.
 *
 * @inode: inode being updated
 * @old:   first value to report (printed with its flag split out)
 * @new:   second value to report
 * @cur:   third value to report (printed with its flag split out)
 */
TRACE_EVENT(ctime_ns_xchg,
        TP_PROTO(struct inode *inode,
                 u32 old,
                 u32 new,
                 u32 cur),

        TP_ARGS(inode, old, new, cur),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(u32,                gen)
                __field(u32,                old)
                __field(u32,                new)
                __field(u32,                cur)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->gen                = inode->i_generation;
                __entry->old                = old;
                __entry->new                = new;
                __entry->cur                = cur;
        ),

        /* ino_t is an unsigned long, hence %lu rather than %ld. */
        TP_printk("ino=%d:%d:%lu:%u old=%u:%s new=%u cur=%u:%s",
                MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen,
                __entry->old & ~I_CTIME_QUERIED,
                __print_flags(__entry->old & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS),
                __entry->new,
                __entry->cur & ~I_CTIME_QUERIED,
                __print_flags(__entry->cur & I_CTIME_QUERIED, "|", CTIME_QUERIED_FLAGS)
        )
);

/*
 * fill_mg_cmtime: records an inode's identity (device, inode number,
 * generation) together with the seconds/nanoseconds of both the ctime
 * and the mtime passed in.
 *
 * @inode: inode the timestamps belong to
 * @ctime: change time being reported
 * @mtime: modification time being reported
 */
TRACE_EVENT(fill_mg_cmtime,
        TP_PROTO(struct inode *inode,
                 struct timespec64 *ctime,
                 struct timespec64 *mtime),

        TP_ARGS(inode, ctime, mtime),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(time64_t,        ctime_s)
                __field(time64_t,        mtime_s)
                __field(u32,                ctime_ns)
                __field(u32,                mtime_ns)
                __field(u32,                gen)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->gen                = inode->i_generation;
                __entry->ctime_s        = ctime->tv_sec;
                __entry->mtime_s        = mtime->tv_sec;
                __entry->ctime_ns        = ctime->tv_nsec;
                __entry->mtime_ns        = mtime->tv_nsec;
        ),

        /* ino_t is an unsigned long, hence %lu rather than %ld. */
        TP_printk("ino=%d:%d:%lu:%u ctime=%lld.%u mtime=%lld.%u",
                MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->gen,
                __entry->ctime_s, __entry->ctime_ns,
                __entry->mtime_s, __entry->mtime_ns
        )
);
#endif /* _TRACE_TIMESTAMP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>













   71 
































  627 

































  351 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM maple_tree

#if !defined(_TRACE_MM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MM_H


#include <linux/tracepoint.h>

/* Forward declaration so the event prototypes below can name the type. */
struct ma_state;

/*
 * ma_op: snapshot of a maple state on behalf of @fn.  Copies the min,
 * max, index, last and node fields out of @mas and stores the @fn
 * pointer itself (not a copy of the string).
 */
TRACE_EVENT(ma_op,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                __field(const char *, fn)
                __field(unsigned long, min)
                __field(unsigned long, max)
                __field(unsigned long, index)
                __field(unsigned long, last)
                __field(void *, node)
        ),

        TP_fast_assign(
                __entry->fn    = fn;
                __entry->min   = mas->min;
                __entry->max   = mas->max;
                __entry->index = mas->index;
                __entry->last  = mas->last;
                __entry->node  = mas->node;
        ),

        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                __entry->fn, (void *) __entry->node,
                (unsigned long) __entry->min, (unsigned long) __entry->max,
                (unsigned long) __entry->index, (unsigned long) __entry->last
        )
)
/*
 * ma_read: snapshot of a maple state on behalf of @fn.  Copies the min,
 * max, index, last and node fields out of @mas and stores the @fn
 * pointer itself (not a copy of the string).
 */
TRACE_EVENT(ma_read,

        TP_PROTO(const char *fn, struct ma_state *mas),

        TP_ARGS(fn, mas),

        TP_STRUCT__entry(
                __field(const char *, fn)
                __field(unsigned long, min)
                __field(unsigned long, max)
                __field(unsigned long, index)
                __field(unsigned long, last)
                __field(void *, node)
        ),

        TP_fast_assign(
                __entry->fn    = fn;
                __entry->min   = mas->min;
                __entry->max   = mas->max;
                __entry->index = mas->index;
                __entry->last  = mas->last;
                __entry->node  = mas->node;
        ),

        TP_printk("%s\tNode: %p (%lu %lu) range: %lu-%lu",
                __entry->fn, (void *) __entry->node,
                (unsigned long) __entry->min, (unsigned long) __entry->max,
                (unsigned long) __entry->index, (unsigned long) __entry->last
        )
)

/*
 * ma_write: like the read/op snapshots (min, max, index, last and node
 * copied from @mas, @fn stored by pointer), with the caller-supplied
 * pivot @piv and value pointer @val recorded as well.
 */
TRACE_EVENT(ma_write,

        TP_PROTO(const char *fn, struct ma_state *mas, unsigned long piv,
                 void *val),

        TP_ARGS(fn, mas, piv, val),

        TP_STRUCT__entry(
                __field(const char *, fn)
                __field(unsigned long, min)
                __field(unsigned long, max)
                __field(unsigned long, index)
                __field(unsigned long, last)
                __field(unsigned long, piv)
                __field(void *, val)
                __field(void *, node)
        ),

        TP_fast_assign(
                __entry->fn    = fn;
                __entry->min   = mas->min;
                __entry->max   = mas->max;
                __entry->index = mas->index;
                __entry->last  = mas->last;
                __entry->piv   = piv;
                __entry->val   = val;
                __entry->node  = mas->node;
        ),

        TP_printk("%s\tNode %p (%lu %lu) range:%lu-%lu piv (%lu) val %p",
                __entry->fn, (void *) __entry->node,
                (unsigned long) __entry->min, (unsigned long) __entry->max,
                (unsigned long) __entry->index, (unsigned long) __entry->last,
                (unsigned long) __entry->piv, (void *) __entry->val
        )
)
#endif /* _TRACE_MM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



























































































































































  155 
  157 




  156 
  157 
  157 




  157 
  157 




  156 
  157 
  157 















  165 





  166 









  164 








  166 
  166 

  166 


  165 







  166 
  166 


  166 


  166 














  126 





  127 


  127 

  127 
  126 


  127 

  127 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012-2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <hyp/sysreg-sr.h>

#include <linux/compiler.h>
#include <linux/kvm_host.h>

#include <asm/kprobes.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_nested.h>

/*
 * Save the virtual-EL2 state of @vcpu: read the values currently held in
 * the hardware registers (mostly through the EL1 accessors) and store
 * them in the vcpu's in-memory *_EL2 sys_reg slots.
 */
static void __sysreg_save_vel2_state(struct kvm_vcpu *vcpu)
{
        /* These registers are common with EL1 */
        __vcpu_sys_reg(vcpu, PAR_EL1)        = read_sysreg(par_el1);
        __vcpu_sys_reg(vcpu, TPIDR_EL1)        = read_sysreg(tpidr_el1);

        /* Saved unconditionally, whatever the guest's E2H setting */
        __vcpu_sys_reg(vcpu, ESR_EL2)        = read_sysreg_el1(SYS_ESR);
        __vcpu_sys_reg(vcpu, AFSR0_EL2)        = read_sysreg_el1(SYS_AFSR0);
        __vcpu_sys_reg(vcpu, AFSR1_EL2)        = read_sysreg_el1(SYS_AFSR1);
        __vcpu_sys_reg(vcpu, FAR_EL2)        = read_sysreg_el1(SYS_FAR);
        __vcpu_sys_reg(vcpu, MAIR_EL2)        = read_sysreg_el1(SYS_MAIR);
        __vcpu_sys_reg(vcpu, VBAR_EL2)        = read_sysreg_el1(SYS_VBAR);
        __vcpu_sys_reg(vcpu, CONTEXTIDR_EL2) = read_sysreg_el1(SYS_CONTEXTIDR);
        __vcpu_sys_reg(vcpu, AMAIR_EL2)        = read_sysreg_el1(SYS_AMAIR);

        /*
         * In VHE mode those registers are compatible between EL1 and EL2,
         * and the guest uses the _EL1 versions on the CPU naturally.
         * So we save them into their _EL2 versions here.
         * For nVHE mode we trap accesses to those registers, so our
         * _EL2 copy in sys_regs[] is always up-to-date and we don't need
         * to save anything here.
         */
        if (vcpu_el2_e2h_is_set(vcpu)) {
                u64 val;

                /*
                 * We don't save CPTR_EL2, as accesses to CPACR_EL1
                 * are always trapped, ensuring that the in-memory
                 * copy is always up-to-date. A small blessing...
                 */
                __vcpu_sys_reg(vcpu, SCTLR_EL2)        = read_sysreg_el1(SYS_SCTLR);
                __vcpu_sys_reg(vcpu, TTBR0_EL2)        = read_sysreg_el1(SYS_TTBR0);
                __vcpu_sys_reg(vcpu, TTBR1_EL2)        = read_sysreg_el1(SYS_TTBR1);
                __vcpu_sys_reg(vcpu, TCR_EL2)        = read_sysreg_el1(SYS_TCR);

                /* TCR2, and the PIE/POE registers nested under it, are optional */
                if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
                        __vcpu_sys_reg(vcpu, TCR2_EL2) = read_sysreg_el1(SYS_TCR2);

                        if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
                                __vcpu_sys_reg(vcpu, PIRE0_EL2) = read_sysreg_el1(SYS_PIRE0);
                                __vcpu_sys_reg(vcpu, PIR_EL2) = read_sysreg_el1(SYS_PIR);
                        }

                        if (ctxt_has_s1poe(&vcpu->arch.ctxt))
                                __vcpu_sys_reg(vcpu, POR_EL2) = read_sysreg_el1(SYS_POR);
                }

                /*
                 * The EL1 view of CNTKCTL_EL1 has a bunch of RES0 bits where
                 * the interesting CNTHCTL_EL2 bits live. So preserve these
                 * bits when reading back the guest-visible value.
                 */
                val = read_sysreg_el1(SYS_CNTKCTL);
                val &= CNTKCTL_VALID_BITS;
                /* Replace only the CNTKCTL_VALID_BITS portion of the saved value */
                __vcpu_sys_reg(vcpu, CNTHCTL_EL2) &= ~CNTKCTL_VALID_BITS;
                __vcpu_sys_reg(vcpu, CNTHCTL_EL2) |= val;
        }

        /* Stack pointer and exception return state, saved in both modes */
        __vcpu_sys_reg(vcpu, SP_EL2)        = read_sysreg(sp_el1);
        __vcpu_sys_reg(vcpu, ELR_EL2)        = read_sysreg_el1(SYS_ELR);
        __vcpu_sys_reg(vcpu, SPSR_EL2)        = read_sysreg_el1(SYS_SPSR);
}

/*
 * Restore the virtual-EL2 state of @vcpu: write the vcpu's in-memory
 * *_EL2 sys_reg values into the hardware registers (mostly through the
 * EL1 accessors).  When the guest's E2H is clear, the values are first
 * run through the translate_*_el2_to_*_el1() helpers.
 */
static void __sysreg_restore_vel2_state(struct kvm_vcpu *vcpu)
{
        u64 val;

        /* These registers are common with EL1 */
        write_sysreg(__vcpu_sys_reg(vcpu, PAR_EL1),        par_el1);
        write_sysreg(__vcpu_sys_reg(vcpu, TPIDR_EL1),        tpidr_el1);

        /* ID values presented to the guest, then the registers restored as-is */
        write_sysreg(ctxt_midr_el1(&vcpu->arch.ctxt),                        vpidr_el2);
        write_sysreg(__vcpu_sys_reg(vcpu, MPIDR_EL1),                        vmpidr_el2);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, MAIR_EL2),                SYS_MAIR);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, VBAR_EL2),                SYS_VBAR);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, CONTEXTIDR_EL2),                SYS_CONTEXTIDR);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, AMAIR_EL2),                SYS_AMAIR);

        if (vcpu_el2_e2h_is_set(vcpu)) {
                /*
                 * In VHE mode those registers are compatible between
                 * EL1 and EL2.
                 */
                write_sysreg_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2),   SYS_SCTLR);
                write_sysreg_el1(__vcpu_sys_reg(vcpu, CPTR_EL2),    SYS_CPACR);
                write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2),   SYS_TTBR0);
                write_sysreg_el1(__vcpu_sys_reg(vcpu, TTBR1_EL2),   SYS_TTBR1);
                write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR_EL2),            SYS_TCR);
                write_sysreg_el1(__vcpu_sys_reg(vcpu, CNTHCTL_EL2), SYS_CNTKCTL);
        } else {
                /*
                 * CNTHCTL_EL2 only affects EL1 when running nVHE, so
                 * no need to restore it.
                 */
                val = translate_sctlr_el2_to_sctlr_el1(__vcpu_sys_reg(vcpu, SCTLR_EL2));
                write_sysreg_el1(val, SYS_SCTLR);
                val = translate_cptr_el2_to_cpacr_el1(__vcpu_sys_reg(vcpu, CPTR_EL2));
                write_sysreg_el1(val, SYS_CPACR);
                val = translate_ttbr0_el2_to_ttbr0_el1(__vcpu_sys_reg(vcpu, TTBR0_EL2));
                write_sysreg_el1(val, SYS_TTBR0);
                val = translate_tcr_el2_to_tcr_el1(__vcpu_sys_reg(vcpu, TCR_EL2));
                write_sysreg_el1(val, SYS_TCR);
        }

        /* TCR2, and the PIE/POE registers nested under it, are optional */
        if (ctxt_has_tcrx(&vcpu->arch.ctxt)) {
                write_sysreg_el1(__vcpu_sys_reg(vcpu, TCR2_EL2), SYS_TCR2);

                if (ctxt_has_s1pie(&vcpu->arch.ctxt)) {
                        write_sysreg_el1(__vcpu_sys_reg(vcpu, PIR_EL2), SYS_PIR);
                        write_sysreg_el1(__vcpu_sys_reg(vcpu, PIRE0_EL2), SYS_PIRE0);
                }

                if (ctxt_has_s1poe(&vcpu->arch.ctxt))
                        write_sysreg_el1(__vcpu_sys_reg(vcpu, POR_EL2), SYS_POR);
        }

        /* Fault syndrome/address, stack pointer and exception return state */
        write_sysreg_el1(__vcpu_sys_reg(vcpu, ESR_EL2),                SYS_ESR);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR0_EL2),        SYS_AFSR0);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, AFSR1_EL2),        SYS_AFSR1);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, FAR_EL2),                SYS_FAR);
        write_sysreg(__vcpu_sys_reg(vcpu, SP_EL2),                sp_el1);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, ELR_EL2),                SYS_ELR);
        write_sysreg_el1(__vcpu_sys_reg(vcpu, SPSR_EL2),        SYS_SPSR);
}

/*
 * VHE: Host and guest must save mdscr_el1 and sp_el0 (and the PC and
 * pstate, which are handled as part of the el2 return state) on every
 * switch (sp_el0 is being dealt with in the assembly code).
 * tpidr_el0 and tpidrro_el0 only need to be switched when going
 * to host userspace or a different VCPU.  EL1 registers only need to be
 * switched when potentially going to run a different VCPU.  The latter two
 * classes are handled as part of kvm_arch_vcpu_load and kvm_arch_vcpu_put.
 */

/*
 * Save the host's per-switch state into @ctxt.  On VHE this is only the
 * "common" state handled by __sysreg_save_common_state().
 */
void sysreg_save_host_state_vhe(struct kvm_cpu_context *ctxt)
{
        __sysreg_save_common_state(ctxt);
}
NOKPROBE_SYMBOL(sysreg_save_host_state_vhe);

/*
 * Save the guest's per-switch state into @ctxt: the "common" state plus
 * the EL2 return state.
 */
void sysreg_save_guest_state_vhe(struct kvm_cpu_context *ctxt)
{
        __sysreg_save_common_state(ctxt);
        __sysreg_save_el2_return_state(ctxt);
}
NOKPROBE_SYMBOL(sysreg_save_guest_state_vhe);

/*
 * Restore the host's per-switch state from @ctxt.  On VHE this is only
 * the "common" state handled by __sysreg_restore_common_state().
 */
void sysreg_restore_host_state_vhe(struct kvm_cpu_context *ctxt)
{
        __sysreg_restore_common_state(ctxt);
}
NOKPROBE_SYMBOL(sysreg_restore_host_state_vhe);

/*
 * Restore the guest's per-switch state from @ctxt: the "common" state
 * followed by the EL2 return state.
 */
void sysreg_restore_guest_state_vhe(struct kvm_cpu_context *ctxt)
{
        __sysreg_restore_common_state(ctxt);
        __sysreg_restore_el2_return_state(ctxt);
}
NOKPROBE_SYMBOL(sysreg_restore_guest_state_vhe);

/**
 * __vcpu_load_switch_sysregs - Load guest system registers to the physical CPU
 *
 * @vcpu: The VCPU pointer
 *
 * Load system registers that do not affect the host's execution, for
 * example EL1 system registers on a VHE system where the host kernel
 * runs at EL2.  This function is called from KVM's vcpu_load() function
 * and loading system register state early avoids having to load them on
 * every entry to the VM.
 *
 * The host's user state is saved first; SYSREGS_ON_CPU is set on the vcpu
 * once the guest state has been loaded.
 */
void __vcpu_load_switch_sysregs(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
        struct kvm_cpu_context *host_ctxt;
        u64 midr, mpidr;

        /* Stash the host's user state before it gets overwritten below */
        host_ctxt = host_data_ptr(host_ctxt);
        __sysreg_save_user_state(host_ctxt);

        /*
         * When running a normal EL1 guest, we only load a new vcpu
         * after a context switch, which involves a DSB, so all
         * speculative EL1&0 walks will have already completed.
         * If running NV, the vcpu may transition between vEL1 and
         * vEL2 without a context switch, so make sure we complete
         * those walks before loading a new context.
         */
        if (vcpu_has_nv(vcpu))
                dsb(nsh);

        /*
         * Load guest EL1 and user state
         *
         * We must restore the 32-bit state before the sysregs, thanks
         * to erratum #852523 (Cortex-A57) or #853709 (Cortex-A72).
         */
        __sysreg32_restore_state(vcpu);
        __sysreg_restore_user_state(guest_ctxt);

        if (unlikely(is_hyp_ctxt(vcpu))) {
                /* The vcpu is in hyp context: load its vEL2 view instead */
                __sysreg_restore_vel2_state(vcpu);
        } else {
                if (vcpu_has_nv(vcpu)) {
                        /*
                         * As we're restoring a nested guest, set the value
                         * provided by the guest hypervisor.
                         */
                        midr = ctxt_sys_reg(guest_ctxt, VPIDR_EL2);
                        mpidr = ctxt_sys_reg(guest_ctxt, VMPIDR_EL2);
                } else {
                        midr = ctxt_midr_el1(guest_ctxt);
                        mpidr = ctxt_sys_reg(guest_ctxt, MPIDR_EL1);
                }

                __sysreg_restore_el1_state(guest_ctxt, midr, mpidr);
        }

        /* Mark the guest's system registers as loaded on this CPU */
        vcpu_set_flag(vcpu, SYSREGS_ON_CPU);
}

/**
 * __vcpu_put_switch_sysregs - Save guest system registers and restore the
 *                             host's user state on the physical CPU
 *
 * @vcpu: The VCPU pointer
 *
 * Save guest system registers that do not affect the host's execution, for
 * example EL1 system registers on a VHE system where the host kernel
 * runs at EL2.  This function is called from KVM's vcpu_put() function
 * and deferring saving system register state until we're no longer running the
 * VCPU avoids having to save them on every exit from the VM.
 *
 * SYSREGS_ON_CPU is cleared on the vcpu once the host user state is back.
 */
void __vcpu_put_switch_sysregs(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *guest_ctxt = &vcpu->arch.ctxt;
        struct kvm_cpu_context *host_ctxt;

        host_ctxt = host_data_ptr(host_ctxt);

        /* Save either the vEL2 view or the plain EL1 state of the guest */
        if (unlikely(is_hyp_ctxt(vcpu)))
                __sysreg_save_vel2_state(vcpu);
        else
                __sysreg_save_el1_state(guest_ctxt);

        __sysreg_save_user_state(guest_ctxt);
        __sysreg32_save_state(vcpu);

        /* Restore host user state */
        __sysreg_restore_user_state(host_ctxt);

        /* The guest's system registers no longer live on this CPU */
        vcpu_clear_flag(vcpu, SYSREGS_ON_CPU);
}


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLY__

/*
 * Busy-wait hint: executes a single "yield" instruction.  The "memory"
 * clobber also makes the statement a compiler barrier, so memory is
 * re-read on the next iteration of a spin loop.
 */
static inline void cpu_relax(void)
{
        asm volatile("yield" ::: "memory");
}

#endif /* __ASSEMBLY__ */

#endif /* __ASM_VDSO_PROCESSOR_H */





































































































































































    5 
    5 

    5 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// SPDX-License-Identifier: GPL-2.0-only
/* Page fragment allocator
 *
 * Page Fragment:
 *  An arbitrary-length arbitrary-offset area of memory which resides within a
 *  0 or higher order page.  Multiple fragments within that page are
 *  individually refcounted, in the page's reference counter.
 *
 * The page_frag functions provide a simple allocation framework for page
 * fragments.  This is used by the network stack and network device drivers to
 * provide a backing region of memory for use as either an sk_buff->head, or to
 * be used in the "frags" portion of skb_shared_info.
 */

#include <linux/build_bug.h>
#include <linux/export.h>
#include <linux/gfp_types.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/page_frag_cache.h>
#include "internal.h"

/*
 * Pack a page's kernel virtual address, its allocation order and its
 * pfmemalloc status into a single unsigned long.
 *
 * The build-time checks ensure the maximum order fits under the order
 * mask and that the pfmemalloc bit is below PAGE_SIZE.
 */
static unsigned long encoded_page_create(struct page *page, unsigned int order,
                                         bool pfmemalloc)
{
        unsigned long encoded;

        BUILD_BUG_ON(PAGE_FRAG_CACHE_MAX_ORDER > PAGE_FRAG_CACHE_ORDER_MASK);
        BUILD_BUG_ON(PAGE_FRAG_CACHE_PFMEMALLOC_BIT >= PAGE_SIZE);

        encoded = (unsigned long)page_address(page);
        encoded |= order & PAGE_FRAG_CACHE_ORDER_MASK;
        if (pfmemalloc)
                encoded |= PAGE_FRAG_CACHE_PFMEMALLOC_BIT;

        return encoded;
}

/* Extract the allocation order stored in the low bits of @encoded_page. */
static unsigned long encoded_page_decode_order(unsigned long encoded_page)
{
        unsigned long order = encoded_page & PAGE_FRAG_CACHE_ORDER_MASK;

        return order;
}

/*
 * Recover the page-aligned virtual address from @encoded_page by masking
 * off everything below PAGE_MASK.
 */
static void *encoded_page_decode_virt(unsigned long encoded_page)
{
        unsigned long base = encoded_page & PAGE_MASK;

        return (void *)base;
}

/*
 * Look up the struct page behind @encoded_page.  The encoded value is
 * handed to virt_to_page() without masking off its low bits.
 */
static struct page *encoded_page_decode_page(unsigned long encoded_page)
{
        void *vaddr = (void *)encoded_page;

        return virt_to_page(vaddr);
}

/*
 * Allocate a fresh backing page for @nc and record it in nc->encoded_page.
 *
 * When PAGE_SIZE is smaller than PAGE_FRAG_CACHE_MAX_SIZE, a
 * PAGE_FRAG_CACHE_MAX_ORDER allocation is tried first with direct reclaim
 * disabled and __GFP_COMP, __GFP_NOWARN, __GFP_NORETRY and
 * __GFP_NOMEMALLOC added.  If that is not attempted or fails, an order-0
 * page is requested with the caller's original @gfp_mask.
 *
 * Returns the page, or NULL (with nc->encoded_page set to 0) on failure.
 */
static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
                                             gfp_t gfp_mask)
{
        const gfp_t caller_gfp = gfp_mask;
        unsigned long order = PAGE_FRAG_CACHE_MAX_ORDER;
        struct page *page = NULL;

#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
        gfp_mask &= ~__GFP_DIRECT_RECLAIM;
        gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
                    __GFP_NOMEMALLOC;
        page = __alloc_pages(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER,
                             numa_mem_id(), NULL);
#endif
        if (unlikely(!page)) {
                /* Fall back to a single page using the caller's flags */
                order = 0;
                page = __alloc_pages(caller_gfp, 0, numa_mem_id(), NULL);
        }

        if (!page) {
                nc->encoded_page = 0;
                return NULL;
        }

        nc->encoded_page = encoded_page_create(page, order,
                                               page_is_pfmemalloc(page));
        return page;
}

/*
 * Release the page currently cached in @nc, if any: drop the
 * nc->pagecnt_bias references held on it and mark the cache empty.
 */
void page_frag_cache_drain(struct page_frag_cache *nc)
{
        if (nc->encoded_page) {
                struct page *page = encoded_page_decode_page(nc->encoded_page);

                __page_frag_cache_drain(page, nc->pagecnt_bias);
                nc->encoded_page = 0;
        }
}
EXPORT_SYMBOL(page_frag_cache_drain);

/*
 * Drop @count references from @page and free it (at its compound order)
 * if that was the last reference.  The page must not already have a zero
 * reference count.
 */
void __page_frag_cache_drain(struct page *page, unsigned int count)
{
        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);

        if (!page_ref_sub_and_test(page, count))
                return;

        free_frozen_pages(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);

/*
 * Carve a fragment of @fragsz bytes out of the page cached in @nc.
 *
 * @nc:         the page fragment cache to allocate from
 * @fragsz:     size of the requested fragment in bytes
 * @gfp_mask:   allocation flags used if the cache has to be refilled
 * @align_mask: mask applied (inverted) to align the fragment's offset
 *
 * Returns the fragment's virtual address, or NULL if a refill fails or if
 * the fragment does not fit in the cached page and @fragsz is larger than
 * PAGE_SIZE.
 */
void *__page_frag_alloc_align(struct page_frag_cache *nc,
                              unsigned int fragsz, gfp_t gfp_mask,
                              unsigned int align_mask)
{
        unsigned long encoded_page = nc->encoded_page;
        unsigned int size, offset;
        struct page *page;

        if (unlikely(!encoded_page)) {
                /* Also entered via "goto refill" when the cached page can't be reused */
refill:
                page = __page_frag_cache_refill(nc, gfp_mask);
                if (!page)
                        return NULL;

                encoded_page = nc->encoded_page;

                /* Even if we own the page, we do not use atomic_set().
                 * This would break get_page_unless_zero() users.
                 */
                page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);

                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
                nc->offset = 0;
        }

        /* Size of the cached page, and the aligned offset of the new fragment */
        size = PAGE_SIZE << encoded_page_decode_order(encoded_page);
        offset = __ALIGN_KERNEL_MASK(nc->offset, ~align_mask);
        if (unlikely(offset + fragsz > size)) {
                if (unlikely(fragsz > PAGE_SIZE)) {
                        /*
                         * The caller is trying to allocate a fragment
                         * with fragsz > PAGE_SIZE but the cache isn't big
                         * enough to satisfy the request, this may
                         * happen in low memory conditions.
                         * We don't release the cache page because
                         * it could make memory pressure worse
                         * so we simply return NULL here.
                         */
                        return NULL;
                }

                page = encoded_page_decode_page(encoded_page);

                /* References remain after dropping our bias: get a new page */
                if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
                        goto refill;

                /* A pfmemalloc page is freed rather than recycled */
                if (unlikely(encoded_page_decode_pfmemalloc(encoded_page))) {
                        free_frozen_pages(page,
                                        encoded_page_decode_order(encoded_page));
                        goto refill;
                }

                /* OK, page count is 0, we can safely set it */
                set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);

                /* reset page count bias and offset to start of new frag */
                nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
                offset = 0;
        }

        /* Account for the fragment handed out and advance past it */
        nc->pagecnt_bias--;
        nc->offset = offset + fragsz;

        return encoded_page_decode_virt(encoded_page) + offset;
}
EXPORT_SYMBOL(__page_frag_alloc_align);

/*
 * Release one reference on the page fragment at @addr.  The fragment may
 * live in either a compound or an order 0 page; the head page is freed
 * once its last reference is gone.
 */
void page_frag_free(void *addr)
{
        struct page *head = virt_to_head_page(addr);

        if (likely(!put_page_testzero(head)))
                return;

        free_frozen_pages(head, compound_order(head));
}
EXPORT_SYMBOL(page_frag_free);



































































  862 



  786 
  258 

  868 
























































































































































   13 































































































































































































































































































































































































































































































































































































































    3 
  602 














































































































































































    5 






















































































































  595 


























   23 
    5 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SEQLOCK_H
#define __LINUX_SEQLOCK_H

/*
 * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
 * lockless readers (read-only retry loops), and no writer starvation.
 *
 * See Documentation/locking/seqlock.rst
 *
 * Copyrights:
 * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
 * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH
 */

#include <linux/compiler.h>
#include <linux/kcsan-checks.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/preempt.h>
#include <linux/seqlock_types.h>
#include <linux/spinlock.h>

#include <asm/processor.h>

/*
 * The seqlock seqcount_t interface does not prescribe a precise sequence of
 * read begin/retry/end. For readers, typically there is a call to
 * read_seqcount_begin() and read_seqcount_retry(), however, there are more
 * esoteric cases which do not follow this pattern.
 *
 * As a consequence, we take the following best-effort approach for raw usage
 * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
 * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
 * atomics; if there is a matching read_seqcount_retry() call, no following
 * memory operations are considered atomic. Usage of the seqlock_t interface
 * is not affected.
 *
 * The constant is the argument handed to kcsan_atomic_next() by the
 * read-begin helpers in this file; the retry helpers call
 * kcsan_atomic_next(0) instead.
 */
#define KCSAN_SEQLOCK_REGION_MAX 1000

/*
 * __seqcount_init() - set up a seqcount_t at runtime
 * @s:    Pointer to the seqcount_t instance
 * @name: Name passed to lockdep_init_map() for the dep_map of @s
 * @key:  Lock class key passed to lockdep_init_map()
 *
 * Registers the dep_map of @s with lockdep and resets the sequence counter
 * to zero. Callers normally go through the seqcount_init() wrapper.
 */
static inline void __seqcount_init(seqcount_t *s, const char *name,
                                          struct lock_class_key *key)
{
        /*
         * Make sure we are not reinitializing a held lock:
         */
        lockdep_init_map(&s->dep_map, name, key, 0);
        s->sequence = 0;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Designated initializer fragment for the lockdep map of a statically
 * initialized seqcount_t; the map name is @lockname stringified.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)                                \
                .dep_map = { .name = #lockname }

/**
 * seqcount_init() - runtime initializer for seqcount_t
 * @s: Pointer to the seqcount_t instance
 *
 * Every expansion declares its own static lock_class_key, so each call
 * site passes a distinct key to __seqcount_init(). The stringified @s
 * expression is used as the name.
 */
# define seqcount_init(s)                                                \
        do {                                                                \
                static struct lock_class_key __key;                        \
                __seqcount_init((s), #s, &__key);                        \
        } while (0)

/*
 * seqcount_lockdep_reader_access() - report a read-side access to lockdep
 * @s: Pointer to the seqcount_t that is about to be read
 *
 * Issues a read acquire immediately followed by a release on the dep_map
 * of @s, with local interrupts disabled around the pair. The sequence
 * counter itself is neither read nor written here.
 */
static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
{
        /* const is dropped only so the dep_map can be handed to lockdep */
        seqcount_t *l = (seqcount_t *)s;
        unsigned long flags;

        local_irq_save(flags);
        seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_);
        seqcount_release(&l->dep_map, _RET_IP_);
        local_irq_restore(flags);
}

#else
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC: no dep_map initializer, no per-call-site
 * key or name (NULL is passed for both), and the reader annotation expands
 * to nothing.
 */
# define SEQCOUNT_DEP_MAP_INIT(lockname)
# define seqcount_init(s) __seqcount_init(s, NULL, NULL)
# define seqcount_lockdep_reader_access(x)
#endif

/**
 * SEQCNT_ZERO() - static initializer for seqcount_t
 * @name: Name of the seqcount_t instance
 *
 * The counter starts at 0. @name is only forwarded to
 * SEQCOUNT_DEP_MAP_INIT(), which expands to nothing unless
 * CONFIG_DEBUG_LOCK_ALLOC is set.
 */
#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }

/*
 * Sequence counters with associated locks (seqcount_LOCKNAME_t)
 *
 * A sequence counter which associates the lock used for writer
 * serialization at initialization time. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * For associated locks which do not implicitly disable preemption,
 * preemption protection is enforced in the write side function.
 *
 * Lockdep is never used in any form for the raw write variants.
 *
 * See Documentation/locking/seqlock.rst
 */

/*
 * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated
 * @seqcount:        The real sequence counter
 * @lock:        Pointer to the associated lock
 *
 * A plain sequence counter with external writer synchronization by
 * LOCKNAME @lock. The lock is associated to the sequence counter in the
 * static initializer or init function. This enables lockdep to validate
 * that the write side critical section is properly serialized.
 *
 * LOCKNAME:        raw_spinlock, spinlock, rwlock or mutex
 */

/*
 * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t
 * @s:                Pointer to the seqcount_LOCKNAME_t instance
 * @_lock:        Pointer to the associated lock
 * @lockname:        "LOCKNAME" part of the seqcount_LOCKNAME_t type of @s
 *
 * @s is evaluated once, into a typed local, so passing a pointer of the
 * wrong seqcount_LOCKNAME_t type is diagnosed at the assignment. The
 * embedded seqcount_t is then initialized with seqcount_init().
 *
 * The lock pointer is stored through __SEQ_LOCK(), which is not defined in
 * this header. NOTE(review): presumably it drops its argument when the
 * lock pointer is not kept in the structure - confirm against
 * <linux/seqlock_types.h>.
 */

#define seqcount_LOCKNAME_init(s, _lock, lockname)                        \
        do {                                                                \
                seqcount_##lockname##_t *____s = (s);                        \
                seqcount_init(&____s->seqcount);                        \
                __SEQ_LOCK(____s->lock = (_lock));                        \
        } while (0)

/*
 * Per-lock-type runtime initializers: each forwards to
 * seqcount_LOCKNAME_init() with the matching LOCKNAME.
 */
#define seqcount_raw_spinlock_init(s, lock)        seqcount_LOCKNAME_init(s, lock, raw_spinlock)
#define seqcount_spinlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, spinlock)
#define seqcount_rwlock_init(s, lock)                seqcount_LOCKNAME_init(s, lock, rwlock)
#define seqcount_mutex_init(s, lock)                seqcount_LOCKNAME_init(s, lock, mutex)

/*
 * SEQCOUNT_LOCKNAME()        - Instantiate the helpers for seqcount_LOCKNAME_t
 * seqprop_LOCKNAME_*()        - Property accessors for seqcount_LOCKNAME_t
 *
 * @lockname:                "LOCKNAME" part of seqcount_LOCKNAME_t
 * @locktype:                LOCKNAME canonical C data type (not referenced by
 *                        the macro body)
 * @preemptible:        preemptibility of above locktype
 * @lockbase:                prefix for associated lock/unlock
 *
 * The seqcount_LOCKNAME_t type itself is not defined by this macro; only the
 * following helpers are generated:
 *
 * __seqprop_LOCKNAME_ptr()          - pointer to the embedded seqcount_t
 * __seqprop_LOCKNAME_const_ptr()    - same, for a const instance
 * __seqprop_LOCKNAME_sequence()     - acquire-load of the counter. Only with
 *                                     CONFIG_PREEMPT_RT, and only if
 *                                     @preemptible and the value is odd: take
 *                                     and drop the associated lock, then load
 *                                     the counter again.
 * __seqprop_LOCKNAME_preemptible()  - @preemptible without CONFIG_PREEMPT_RT,
 *                                     false with it
 * __seqprop_LOCKNAME_assert()       - lockdep_assert_held() on the associated
 *                                     lock, wrapped in __SEQ_LOCK()
 */
#define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase)        \
static __always_inline seqcount_t *                                        \
__seqprop_##lockname##_ptr(seqcount_##lockname##_t *s)                        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline const seqcount_t *                                \
__seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s)        \
{                                                                        \
        return &s->seqcount;                                                \
}                                                                        \
                                                                        \
static __always_inline unsigned                                                \
__seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s)        \
{                                                                        \
        unsigned seq = smp_load_acquire(&s->seqcount.sequence);                \
                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return seq;                                                \
                                                                        \
        if (preemptible && unlikely(seq & 1)) {                                \
                __SEQ_LOCK(lockbase##_lock(s->lock));                        \
                __SEQ_LOCK(lockbase##_unlock(s->lock));                        \
                                                                        \
                /*                                                        \
                 * Re-read the sequence counter since the (possibly        \
                 * preempted) writer made progress.                        \
                 */                                                        \
                seq = smp_load_acquire(&s->seqcount.sequence);                \
        }                                                                \
                                                                        \
        return seq;                                                        \
}                                                                        \
                                                                        \
static __always_inline bool                                                \
__seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s)        \
{                                                                        \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))                                \
                return preemptible;                                        \
                                                                        \
        /* PREEMPT_RT relies on the above LOCK+UNLOCK */                \
        return false;                                                        \
}                                                                        \
                                                                        \
static __always_inline void                                                \
__seqprop_##lockname##_assert(const seqcount_##lockname##_t *s)                \
{                                                                        \
        __SEQ_LOCK(lockdep_assert_held(s->lock));                        \
}

/*
 * __seqprop() for seqcount_t
 *
 * Property accessors for a plain seqcount_t, which carries no associated
 * lock. They mirror the __seqprop_LOCKNAME_*() helpers so that __seqprop()
 * can dispatch on either kind of counter.
 */

/* A plain seqcount_t is its own underlying counter. */
static inline seqcount_t *__seqprop_ptr(seqcount_t *s)
{
        return s;
}

/* Const variant of __seqprop_ptr(). */
static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s)
{
        return s;
}

/* Acquire-load of the current counter value. */
static inline unsigned __seqprop_sequence(const seqcount_t *s)
{
        return smp_load_acquire(&s->sequence);
}

/*
 * Always false: the write-side macros never disable preemption on behalf
 * of a plain seqcount_t.
 */
static inline bool __seqprop_preemptible(const seqcount_t *s)
{
        return false;
}

/* No lock to check here; assert that preemption is disabled instead. */
static inline void __seqprop_assert(const seqcount_t *s)
{
        lockdep_assert_preemption_disabled();
}

/*
 * "preemptible" argument for lock types whose preemptibility follows
 * CONFIG_PREEMPT_RT.
 */
#define __SEQ_RT        IS_ENABLED(CONFIG_PREEMPT_RT)

/*
 * Generate the __seqprop_LOCKNAME_*() helpers for each supported lock type.
 * raw_spinlock is never treated as preemptible, mutex always is, and
 * spinlock/rwlock follow __SEQ_RT. For rwlock the lock/unlock pair used by
 * the _sequence() helper is read_lock()/read_unlock(). The generator macro
 * is undefined afterwards so it cannot be instantiated elsewhere.
 */
SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t,  false,    raw_spin)
SEQCOUNT_LOCKNAME(spinlock,     spinlock_t,      __SEQ_RT, spin)
SEQCOUNT_LOCKNAME(rwlock,       rwlock_t,        __SEQ_RT, read)
SEQCOUNT_LOCKNAME(mutex,        struct mutex,    true,     mutex)
#undef SEQCOUNT_LOCKNAME

/*
 * SEQCOUNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t
 * @seq_name:        Name of the seqcount_LOCKNAME_t instance
 * @assoc_lock:        Pointer to the associated LOCKNAME
 *
 * The embedded counter is initialized via SEQCNT_ZERO(); the lock pointer
 * initializer is wrapped in __SEQ_LOCK().
 */

#define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) {                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
        __SEQ_LOCK(.lock        = (assoc_lock))                                \
}

/*
 * Per-lock-type static initializers. All of them expand to the same
 * SEQCOUNT_LOCKNAME_ZERO() initializer; the distinct names only document
 * which seqcount_LOCKNAME_t is being initialized.
 *
 * NOTE(review): SEQCNT_WW_MUTEX_ZERO has no matching ww_mutex entry among
 * the SEQCOUNT_LOCKNAME() instantiations or the __seqprop() type list in
 * this header - confirm whether it still has users.
 */
#define SEQCNT_RAW_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_SPINLOCK_ZERO(name, lock)        SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_RWLOCK_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_MUTEX_ZERO(name, lock)                SEQCOUNT_LOCKNAME_ZERO(name, lock)
#define SEQCNT_WW_MUTEX_ZERO(name, lock)         SEQCOUNT_LOCKNAME_ZERO(name, lock)

/*
 * __seqprop_case() - one _Generic association: seqcount_LOCKNAME_t maps to
 * its __seqprop_LOCKNAME_<prop>() helper. The @s argument is not used.
 */
#define __seqprop_case(s, lockname, prop)                                \
        seqcount_##lockname##_t: __seqprop_##lockname##_##prop

/*
 * __seqprop() - pick the @prop accessor matching the type of *@s
 *
 * Expands to the name of the accessor function only; the caller appends
 * the argument list. There is no default association, so a pointer to any
 * type other than seqcount_t or the four listed seqcount_LOCKNAME_t
 * variants is a compile error.
 */
#define __seqprop(s, prop) _Generic(*(s),                                \
        seqcount_t:                __seqprop_##prop,                        \
        __seqprop_case((s),        raw_spinlock,        prop),                        \
        __seqprop_case((s),        spinlock,        prop),                        \
        __seqprop_case((s),        rwlock,                prop),                        \
        __seqprop_case((s),        mutex,                prop))

/*
 * Type-generic property accessors: select the helper with __seqprop() and
 * call it on @s. @s appears twice in each expansion, but the _Generic
 * controlling expression is not evaluated, so @s is evaluated only once.
 */
#define seqprop_ptr(s)                        __seqprop(s, ptr)(s)
#define seqprop_const_ptr(s)                __seqprop(s, const_ptr)(s)
#define seqprop_sequence(s)                __seqprop(s, sequence)(s)
#define seqprop_preemptible(s)                __seqprop(s, preemptible)(s)
#define seqprop_assert(s)                __seqprop(s, assert)(s)

/**
 * __read_seqcount_begin() - begin a seqcount_t read section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Samples the counter until an even value is seen, calling cpu_relax()
 * after every odd sample, then opens the KCSAN atomic window for the
 * read section.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define __read_seqcount_begin(s)                                        \
({                                                                        \
        unsigned __seq;                                                        \
                                                                        \
        for (;;) {                                                        \
                __seq = seqprop_sequence(s);                                \
                if (!unlikely(__seq & 1))                                \
                        break;                                                \
                cpu_relax();                                                \
        }                                                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Plain alias for __read_seqcount_begin().
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount_begin(s) __read_seqcount_begin(s)

/**
 * read_seqcount_begin() - begin a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Reports the read access to lockdep through
 * seqcount_lockdep_reader_access() and then behaves like
 * raw_read_seqcount_begin().
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define read_seqcount_begin(s)                                                \
({                                                                        \
        seqcount_lockdep_reader_access(seqprop_const_ptr(s));                \
        raw_read_seqcount_begin(s);                                        \
})

/**
 * raw_read_seqcount() - read the raw seqcount_t counter value
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * raw_read_seqcount opens a read critical section of the given
 * seqcount_t, without any lockdep checking, and without checking or
 * masking the sequence counter LSB. Calling code is responsible for
 * handling that.
 *
 * The counter is sampled exactly once via seqprop_sequence(); the KCSAN
 * atomic window is opened afterwards.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_read_seqcount(s)                                                \
({                                                                        \
        unsigned __seq = seqprop_sequence(s);                                \
                                                                        \
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);                        \
        __seq;                                                                \
})

/**
 * raw_seqcount_try_begin() - begin a seqcount_t read critical section
 *                            w/o lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: lvalue receiving the count to be passed to read_seqcount_retry()
 *
 * Similar to raw_seqcount_begin(), except it enables eliding the critical
 * section entirely if odd, instead of doing the speculation knowing it will
 * fail.
 *
 * Useful when counter stabilization is more or less equivalent to taking
 * the lock and there is a slowpath that does that.
 *
 * @start is assigned the value read in every case, but it is only usable
 * as a read section start count (i.e. even) when true is returned. The
 * @start expression is evaluated exactly once.
 *
 * Return: true when a read critical section is started.
 */
#define raw_seqcount_try_begin(s, start)                                \
({                                                                        \
        /* Sample once so @start is expanded a single time */                \
        unsigned __try_seq = raw_read_seqcount(s);                        \
                                                                        \
        (start) = __try_seq;                                                \
        !(__try_seq & 1);                                                \
})

/**
 * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
 *                        lockdep and w/o counter stabilization
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Opens a read critical section without waiting for the counter to become
 * even, which is what read_seqcount_begin() would do. When a writer is
 * active at this point, the section is not stabilized up front; instead
 * the read_seqcount_retry() that closes the section fails.
 *
 * Intended only for special kernel hot paths where the read section is
 * small and is very likely to succeed thanks to other external means. It
 * saves a single branching instruction.
 *
 * Return: count to be passed to read_seqcount_retry()
 */
#define raw_seqcount_begin(s)                                                \
({                                                                        \
        unsigned __raw_seq = raw_read_seqcount(s);                        \
                                                                        \
        /*                                                                \
         * Clear the LSB: an odd (writer active) sample is thereby        \
         * decremented, which makes read_seqcount_retry() fail.                \
         */                                                                \
        __raw_seq & ~1;                                                        \
})

/**
 * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
 * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
 * provided before actually loading any of the variables that are to be
 * protected in this critical section.
 *
 * Use carefully, only in critical code, and comment how the barrier is
 * provided.
 *
 * Return: true if a read section retry is required, else false
 */
#define __read_seqcount_retry(s, start)                                        \
        do___read_seqcount_retry(seqprop_const_ptr(s), start)

/*
 * Helper behind __read_seqcount_retry(), operating on the underlying
 * seqcount_t. Calls kcsan_atomic_next(0), then compares a fresh READ_ONCE()
 * of the counter against @start. Returns non-zero when they differ. No
 * read barrier is issued here.
 */
static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        kcsan_atomic_next(0);
        return unlikely(READ_ONCE(s->sequence) != start);
}

/**
 * read_seqcount_retry() - end a seqcount_t read critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @start: count, from read_seqcount_begin()
 *
 * read_seqcount_retry closes the read critical section of given
 * seqcount_t.  If the critical section was invalid, it must be ignored
 * (and typically retried).
 *
 * Return: true if a read section retry is required, else false
 */
#define read_seqcount_retry(s, start)                                        \
        do_read_seqcount_retry(seqprop_const_ptr(s), start)

/*
 * Helper behind read_seqcount_retry(): issues smp_rmb() before the counter
 * is re-read, then defers to do___read_seqcount_retry() for the comparison
 * against @start.
 */
static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start)
{
        smp_rmb();
        return do___read_seqcount_retry(s, start);
}

/**
 * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Disables preemption first when seqprop_preemptible() reports true for
 * @s; raw_write_seqcount_end() performs the matching preempt_enable().
 *
 * Context: check write_seqcount_begin()
 */
#define raw_write_seqcount_begin(s)                                        \
do {                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_raw_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/*
 * Helper behind raw_write_seqcount_begin(): opens a KCSAN nestable atomic
 * region, increments the counter, then issues smp_wmb() so the increment
 * is ordered before the stores of the write section that follows.
 */
static inline void do_raw_write_seqcount_begin(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
}

/**
 * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Re-enables preemption afterwards when seqprop_preemptible() reports true
 * for @s, mirroring raw_write_seqcount_begin().
 *
 * Context: check write_seqcount_end()
 */
#define raw_write_seqcount_end(s)                                        \
do {                                                                        \
        do_raw_write_seqcount_end(seqprop_ptr(s));                        \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Helper behind raw_write_seqcount_end(): issues smp_wmb() so the stores of
 * the write section are ordered before the final counter increment, then
 * closes the KCSAN nestable atomic region.
 */
static inline void do_raw_write_seqcount_end(seqcount_t *s)
{
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_begin_nested() - start a seqcount_t write section with
 *                                 custom lockdep nesting level
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 * @subclass: lockdep nesting level
 *
 * Runs seqprop_assert() on @s, disables preemption when
 * seqprop_preemptible() reports true, then starts the write section on the
 * underlying seqcount_t.
 *
 * See Documentation/locking/lockdep-design.rst
 * Context: check write_seqcount_begin()
 */
#define write_seqcount_begin_nested(s, subclass)                        \
do {                                                                        \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin_nested(seqprop_ptr(s), subclass);        \
} while (0)

/*
 * Helper behind write_seqcount_begin_nested(): records the acquisition of
 * the dep_map with lockdep at the given @subclass, then performs the raw
 * write-begin sequence.
 */
static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass)
{
        seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
        do_raw_write_seqcount_begin(s);
}

/**
 * write_seqcount_begin() - start a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Same sequence as write_seqcount_begin_nested() with a lockdep subclass
 * of 0.
 *
 * Context: sequence counter write side sections must be serialized and
 * non-preemptible. Preemption will be automatically disabled if and
 * only if the seqcount write serialization lock is associated, and
 * preemptible.  If readers can be invoked from hardirq or softirq
 * context, interrupts or bottom halves must be respectively disabled.
 */
#define write_seqcount_begin(s)                                                \
do {                                                                        \
        seqprop_assert(s);                                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_disable();                                        \
                                                                        \
        do_write_seqcount_begin(seqprop_ptr(s));                        \
} while (0)

/* Helper behind write_seqcount_begin(): write-begin with subclass 0. */
static inline void do_write_seqcount_begin(seqcount_t *s)
{
        do_write_seqcount_begin_nested(s, 0);
}

/**
 * write_seqcount_end() - end a seqcount_t write side critical section
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * Context: Preemption will be automatically re-enabled if and only if
 * the seqcount write serialization lock is associated, and preemptible.
 */
#define write_seqcount_end(s)                                                \
do {                                                                        \
        do_write_seqcount_end(seqprop_ptr(s));                                \
                                                                        \
        if (seqprop_preemptible(s))                                        \
                preempt_enable();                                        \
} while (0)

/*
 * Helper behind write_seqcount_end(): releases the dep_map with lockdep,
 * then performs the raw write-end sequence.
 */
static inline void do_write_seqcount_end(seqcount_t *s)
{
        seqcount_release(&s->dep_map, _RET_IP_);
        do_raw_write_seqcount_end(s);
}

/**
 * raw_write_seqcount_barrier() - do a seqcount_t write barrier
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * This can be used to provide an ordering guarantee instead of the usual
 * consistency guarantee. It is one wmb cheaper, because it can collapse
 * the two back-to-back wmb()s.
 *
 * Note that writes surrounding the barrier should be declared atomic (e.g.
 * via WRITE_ONCE): a) to ensure the writes become visible to other threads
 * atomically, avoiding compiler optimizations; b) to document which writes are
 * meant to propagate to the reader critical section. This is necessary because
 * neither writes before nor after the barrier are enclosed in a seq-writer
 * critical section that would ensure readers are aware of ongoing writes::
 *
 *        seqcount_t seq;
 *        bool X = true, Y = false;
 *
 *        void read(void)
 *        {
 *                bool x, y;
 *
 *                do {
 *                        int s = read_seqcount_begin(&seq);
 *
 *                        x = X; y = Y;
 *
 *                } while (read_seqcount_retry(&seq, s));
 *
 *                BUG_ON(!x && !y);
 *        }
 *
 *        void write(void)
 *        {
 *                WRITE_ONCE(Y, true);
 *
 *                raw_write_seqcount_barrier(&seq);
 *
 *                WRITE_ONCE(X, false);
 *        }
 */
#define raw_write_seqcount_barrier(s)                                        \
        do_raw_write_seqcount_barrier(seqprop_ptr(s))

/*
 * Helper behind raw_write_seqcount_barrier(): inside a KCSAN nestable
 * atomic region, increments the counter twice with a single smp_wmb()
 * between the two increments.
 */
static inline void do_raw_write_seqcount_barrier(seqcount_t *s)
{
        kcsan_nestable_atomic_begin();
        s->sequence++;
        smp_wmb();
        s->sequence++;
        kcsan_nestable_atomic_end();
}

/**
 * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
 *                               side operations
 * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants
 *
 * After write_seqcount_invalidate, no seqcount_t read side operations
 * will complete successfully and see data older than this.
 */
#define write_seqcount_invalidate(s)                                        \
        do_write_seqcount_invalidate(seqprop_ptr(s))

/*
 * Helper behind write_seqcount_invalidate(): issues smp_wmb(), then
 * advances the counter by 2 inside a KCSAN nestable atomic region. Adding
 * 2 leaves the counter's LSB unchanged while guaranteeing it differs from
 * any start count sampled before the call.
 */
static inline void do_write_seqcount_invalidate(seqcount_t *s)
{
        smp_wmb();
        kcsan_nestable_atomic_begin();
        s->sequence+=2;
        kcsan_nestable_atomic_end();
}

/*
 * Latch sequence counters (seqcount_latch_t)
 *
 * A sequence counter variant where the counter even/odd value is used to
 * switch between two copies of protected data. This allows the read path,
 * typically NMIs, to safely interrupt the write side critical section.
 *
 * As the write sections are fully preemptible, no special handling for
 * PREEMPT_RT is needed.
 *
 * The type is a thin wrapper: its only member is the underlying seqcount_t.
 */
typedef struct {
        seqcount_t seqcount;
} seqcount_latch_t;

/**
 * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t
 * @seq_name: Name of the seqcount_latch_t instance
 *
 * Initializes the embedded counter with SEQCNT_ZERO(), i.e. to 0.
 */
#define SEQCNT_LATCH_ZERO(seq_name) {                                        \
        .seqcount                = SEQCNT_ZERO(seq_name.seqcount),        \
}

/**
 * seqcount_latch_init() - runtime initializer for seqcount_latch_t
 * @s: Pointer to the seqcount_latch_t instance
 *
 * Forwards to seqcount_init() on the embedded seqcount_t.
 */
#define seqcount_latch_init(s) seqcount_init(&(s)->seqcount)

/**
 * raw_read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See raw_write_seqcount_latch() for details and a full reader/writer
 * usage example.
 *
 * This variant performs a single READ_ONCE() of the counter and does not
 * touch the KCSAN atomic window; read_seqcount_latch() adds that.
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with raw_read_seqcount_latch_retry().
 */
static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s)
{
        /*
         * Pairs with the first smp_wmb() in raw_write_seqcount_latch().
         * Due to the dependent load, a full smp_rmb() is not needed.
         */
        return READ_ONCE(s->seqcount.sequence);
}

/**
 * read_seqcount_latch() - pick even/odd latch data copy
 * @s: Pointer to seqcount_latch_t
 *
 * See write_seqcount_latch() for details and a full reader/writer usage
 * example.
 *
 * Opens the KCSAN atomic window (KCSAN_SEQLOCK_REGION_MAX accesses) and
 * then reads the counter via raw_read_seqcount_latch().
 *
 * Return: sequence counter raw value. Use the lowest bit as an index for
 * picking which data copy to read. The full counter must then be checked
 * with read_seqcount_latch_retry().
 */
static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s)
{
        kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
        return raw_read_seqcount_latch(s);
}

/**
 * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from raw_read_seqcount_latch()
 *
 * Issues smp_rmb() and then compares a fresh READ_ONCE() of the counter
 * against @start.
 *
 * Return: true if a read section retry is required, else false
 */
static __always_inline int
raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        smp_rmb();
        return unlikely(READ_ONCE(s->seqcount.sequence) != start);
}

/**
 * read_seqcount_latch_retry() - end a seqcount_latch_t read section
 * @s:                Pointer to seqcount_latch_t
 * @start:        count, from read_seqcount_latch()
 *
 * Calls kcsan_atomic_next(0), matching the window opened by
 * read_seqcount_latch(), then defers to raw_read_seqcount_latch_retry().
 *
 * Return: true if a read section retry is required, else false
 */
static __always_inline int
read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start)
{
        kcsan_atomic_next(0);
        return raw_read_seqcount_latch_retry(s, start);
}

/**
 * raw_write_seqcount_latch() - redirect latch readers to even/odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * Increments the sequence counter by one, bracketed by two write barriers.
 * Since readers use the lowest counter bit as the data copy index, each call
 * switches readers to the other copy.
 *
 * The increment is a plain (non-atomic) "++"; the usage example in the
 * kernel-doc of write_seqcount_latch_begin() assumes writers are externally
 * serialized.
 */
static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s)
{
        smp_wmb();        /* prior stores before incrementing "sequence" */
        s->seqcount.sequence++;
        smp_wmb();      /* increment "sequence" before following stores */
}

/**
 * write_seqcount_latch_begin() - redirect latch readers to odd copy
 * @s: Pointer to seqcount_latch_t
 *
 * The latch technique is a multiversion concurrency control method that allows
 * queries during non-atomic modifications. If you can guarantee queries never
 * interrupt the modification -- e.g. the concurrency is strictly between CPUs
 * -- you most likely do not need this.
 *
 * Where the traditional RCU/lockless data structures rely on atomic
 * modifications to ensure queries observe either the old or the new state,
 * the latch allows the same for non-atomic updates. The trade-off is
 * doubling the cost of storage; we have to maintain two copies of the
 * entire data structure.
 *
 * Very simply put: we first modify one copy and then the other. This ensures
 * there is always one copy in a stable state, ready to give us an answer.
 *
 * The basic form is a data structure like::
 *
 *        struct latch_struct {
 *                seqcount_latch_t        seq;
 *                struct data_struct        data[2];
 *        };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following::
 *
 *        void latch_modify(struct latch_struct *latch, ...)
 *        {
 *                write_seqcount_latch_begin(&latch->seq);
 *                modify(latch->data[0], ...);
 *                write_seqcount_latch(&latch->seq);
 *                modify(latch->data[1], ...);
 *                write_seqcount_latch_end(&latch->seq);
 *        }
 *
 * The query will have a form like::
 *
 *        struct entry *latch_query(struct latch_struct *latch, ...)
 *        {
 *                struct entry *entry;
 *                unsigned seq, idx;
 *
 *                do {
 *                        seq = read_seqcount_latch(&latch->seq);
 *
 *                        idx = seq & 0x01;
 *                        entry = data_query(latch->data[idx], ...);
 *
 *                // This includes needed smp_rmb()
 *                } while (read_seqcount_latch_retry(&latch->seq, seq));
 *
 *                return entry;
 *        }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].
 *
 * NOTE:
 *
 *        The non-requirement for atomic modifications does _NOT_ include
 *        the publishing of new entries in the case where data is a dynamic
 *        data structure.
 *
 *        An iteration might start in data[0] and get suspended long enough
 *        to miss an entire modification sequence, once it resumes it might
 *        observe the new entry.
 *
 * NOTE2:
 *
 *        When data is a dynamic data structure; one should use regular RCU
 *        patterns to manage the lifetimes of the objects within.
 */
static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s)
{
        /*
         * Open the KCSAN nestable-atomic region first; it is closed by
         * kcsan_nestable_atomic_end() in write_seqcount_latch_end().
         */
        kcsan_nestable_atomic_begin();
        /* First counter increment of the update: readers switch copies. */
        raw_write_seqcount_latch(s);
}

/**
 * write_seqcount_latch() - redirect latch readers to even copy
 * @s: Pointer to seqcount_latch_t
 *
 * Middle step of a latch update, called between
 * write_seqcount_latch_begin() and write_seqcount_latch_end(); see the usage
 * example in the kernel-doc of write_seqcount_latch_begin(). It only performs
 * the counter increment and adds no KCSAN annotation of its own.
 */
static __always_inline void write_seqcount_latch(seqcount_latch_t *s)
{
        raw_write_seqcount_latch(s);
}

/**
 * write_seqcount_latch_end() - end a seqcount_latch_t write section
 * @s:                Pointer to seqcount_latch_t
 *
 * Marks the end of a seqcount_latch_t writer section, after all copies of the
 * latch-protected data have been updated.
 *
 * The sequence counter is not modified here and @s is not dereferenced; the
 * only effect is closing the KCSAN region opened by
 * write_seqcount_latch_begin().
 */
static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s)
{
        kcsan_nestable_atomic_end();
}

/*
 * Static initializer for a seqlock_t variable named @lockname: the embedded
 * seqcount is initialized with SEQCNT_SPINLOCK_ZERO() and associated with the
 * variable's own ->lock, and ->lock itself with __SPIN_LOCK_UNLOCKED().
 * @lockname must be the name of the variable being initialized, since the
 * initializer takes the address of (lockname).lock.
 */
#define __SEQLOCK_UNLOCKED(lockname)                                        \
        {                                                                \
                .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \
                .lock =        __SPIN_LOCK_UNLOCKED(lockname)                        \
        }

/**
 * seqlock_init() - dynamic initializer for seqlock_t
 * @sl: Pointer to the seqlock_t instance
 *
 * Initializes the embedded spinlock first, then the embedded seqcount, which
 * is associated with that same spinlock. @sl is evaluated more than once, so
 * it should be free of side effects.
 */
#define seqlock_init(sl)                                                \
        do {                                                                \
                spin_lock_init(&(sl)->lock);                                \
                seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock);        \
        } while (0)

/**
 * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t
 * @sl: Name of the seqlock_t instance
 *
 * Expands to a variable definition initialized with __SEQLOCK_UNLOCKED(). The
 * expansion carries neither a storage-class specifier nor a trailing
 * semicolon; the user supplies both, e.g. "static DEFINE_SEQLOCK(foo);".
 */
#define DEFINE_SEQLOCK(sl) \
                seqlock_t sl = __SEQLOCK_UNLOCKED(sl)

/**
 * read_seqbegin() - start a seqlock_t read side critical section
 * @sl: Pointer to seqlock_t
 *
 * Forwards to read_seqcount_begin() on the embedded seqcount; the embedded
 * spinlock is not taken.
 *
 * Return: count, to be passed to read_seqretry()
 */
static inline unsigned read_seqbegin(const seqlock_t *sl)
{
        return read_seqcount_begin(&sl->seqcount);
}

/**
 * read_seqretry() - end a seqlock_t read side section
 * @sl: Pointer to seqlock_t
 * @start: count, from read_seqbegin()
 *
 * read_seqretry closes the read side critical section of given seqlock_t.
 * If the critical section was invalid, it must be ignored (and typically
 * retried).
 *
 * Forwards to read_seqcount_retry() on the embedded seqcount; the embedded
 * spinlock is not touched.
 *
 * Return: true if a read section retry is required, else false
 */
static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
{
        return read_seqcount_retry(&sl->seqcount, start);
}

/*
 * For all seqlock_t write side functions, use the internal
 * do_write_seqcount_begin() instead of generic write_seqcount_begin().
 * This way, no redundant lockdep_assert_held() checks are added.
 */

/**
 * write_seqlock() - start a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_seqlock opens a write side critical section for the given
 * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
 * that sequential lock. All seqlock_t write side sections are thus
 * automatically serialized and non-preemptible.
 *
 * The section must be closed with write_sequnlock().
 *
 * Context: if the seqlock_t read section, or other write side critical
 * sections, can be invoked from hardirq or softirq contexts, use the
 * _irqsave or _bh variants of this function instead.
 */
static inline void write_seqlock(seqlock_t *sl)
{
        /* Take the lock first, then open the seqcount write section. */
        spin_lock(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock() - end a seqlock_t write side critical section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock closes the (serialized and non-preemptible) write side
 * critical section of given seqlock_t, previously opened with
 * write_seqlock().
 */
static inline void write_sequnlock(seqlock_t *sl)
{
        /* Reverse of write_seqlock(): close the seqcount section, then unlock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock(&sl->lock);
}

/**
 * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of write_seqlock(). Use only if the read side section, or
 * other write side sections, can be invoked from softirq contexts.
 *
 * The section must be closed with write_sequnlock_bh().
 */
static inline void write_seqlock_bh(seqlock_t *sl)
{
        /* Same sequence as write_seqlock(), using spin_lock_bh(). */
        spin_lock_bh(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_bh closes the serialized, non-preemptible, and
 * softirqs-disabled, seqlock_t write side critical section opened with
 * write_seqlock_bh().
 */
static inline void write_sequnlock_bh(seqlock_t *sl)
{
        /* Close the seqcount write section before dropping the lock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_bh(&sl->lock);
}

/**
 * write_seqlock_irq() - start a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of write_seqlock(). Use only if the read side section, or
 * other write sections, can be invoked from hardirq contexts.
 *
 * The section must be closed with write_sequnlock_irq().
 */
static inline void write_seqlock_irq(seqlock_t *sl)
{
        /* Same sequence as write_seqlock(), using spin_lock_irq(). */
        spin_lock_irq(&sl->lock);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
}

/**
 * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
 * @sl: Pointer to seqlock_t
 *
 * write_sequnlock_irq closes the serialized and non-interruptible
 * seqlock_t write side section opened with write_seqlock_irq().
 */
static inline void write_sequnlock_irq(seqlock_t *sl)
{
        /* Close the seqcount write section before dropping the lock. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the write_seqlock_irqsave() macro: takes the embedded
 * spinlock with spin_lock_irqsave(), opens the seqcount write section, and
 * returns the saved interrupt state so that the macro can store it into the
 * caller's flags variable.
 */
static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        do_write_seqcount_begin(&sl->seqcount.seqcount);
        return flags;
}

/**
 * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
 *                           section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to write_sequnlock_irqrestore(). This is
 *         the variable itself (an lvalue the macro assigns to), not a
 *         pointer to it.
 *
 * _irqsave variant of write_seqlock(). Use it only if the read side
 * section, or other write sections, can be invoked from hardirq context.
 */
#define write_seqlock_irqsave(lock, flags)                                \
        do { flags = __write_seqlock_irqsave(lock); } while (0)

/**
 * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
 *                                section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
 *
 * write_sequnlock_irqrestore closes the serialized and non-interruptible
 * seqlock_t write section previously opened with write_seqlock_irqsave().
 */
static inline void
write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
{
        /* Close the seqcount write section, then unlock and restore @flags. */
        do_write_seqcount_end(&sl->seqcount.seqcount);
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqlock_excl() - begin a seqlock_t locking reader section
 * @sl:        Pointer to seqlock_t
 *
 * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
 * locking reader exclusively locks out *both* other writers *and* other
 * locking readers, but it does not update the embedded sequence number.
 *
 * Locking readers act like a normal spin_lock()/spin_unlock().
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * The opened read section must be closed with read_sequnlock_excl().
 */
static inline void read_seqlock_excl(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken; the seqcount is left alone. */
        spin_lock(&sl->lock);
}

/**
 * read_sequnlock_excl() - end a seqlock_t locking reader critical section
 * @sl: Pointer to seqlock_t
 *
 * Releases the embedded spinlock taken by read_seqlock_excl().
 */
static inline void read_sequnlock_excl(seqlock_t *sl)
{
        spin_unlock(&sl->lock);
}

/**
 * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
 *                            softirqs disabled
 * @sl: Pointer to seqlock_t
 *
 * _bh variant of read_seqlock_excl(). Use this variant only if the
 * seqlock_t write side section, *or other read sections*, can be invoked
 * from softirq contexts.
 *
 * The opened read section must be closed with read_sequnlock_excl_bh().
 */
static inline void read_seqlock_excl_bh(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken; the seqcount is left alone. */
        spin_lock_bh(&sl->lock);
}

/**
 * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
 *                              reader section
 * @sl: Pointer to seqlock_t
 *
 * Releases the embedded spinlock taken by read_seqlock_excl_bh().
 */
static inline void read_sequnlock_excl_bh(seqlock_t *sl)
{
        spin_unlock_bh(&sl->lock);
}

/**
 * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
 *                             reader section
 * @sl: Pointer to seqlock_t
 *
 * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 *
 * The opened read section must be closed with read_sequnlock_excl_irq().
 */
static inline void read_seqlock_excl_irq(seqlock_t *sl)
{
        /* Only the embedded spinlock is taken; the seqcount is left alone. */
        spin_lock_irq(&sl->lock);
}

/**
 * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
 *                             locking reader section
 * @sl: Pointer to seqlock_t
 *
 * Releases the embedded spinlock taken by read_seqlock_excl_irq().
 */
static inline void read_sequnlock_excl_irq(seqlock_t *sl)
{
        spin_unlock_irq(&sl->lock);
}

/*
 * Helper behind the read_seqlock_excl_irqsave() macro: takes the embedded
 * spinlock with spin_lock_irqsave() and returns the saved interrupt state so
 * that the macro can store it into the caller's flags variable. The seqcount
 * is not touched.
 */
static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
{
        unsigned long flags;

        spin_lock_irqsave(&sl->lock, flags);
        return flags;
}

/**
 * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
 *                                 locking reader section
 * @lock:  Pointer to seqlock_t
 * @flags: Stack-allocated storage for saving caller's local interrupt
 *         state, to be passed to read_sequnlock_excl_irqrestore(). This
 *         is the variable itself (an lvalue the macro assigns to), not a
 *         pointer to it.
 *
 * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
 * write side section, *or other read sections*, can be invoked from a
 * hardirq context.
 */
#define read_seqlock_excl_irqsave(lock, flags)                                \
        do { flags = __read_seqlock_excl_irqsave(lock); } while (0)

/**
 * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
 *                                      locking reader section
 * @sl:    Pointer to seqlock_t
 * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
 *
 * Releases the embedded spinlock and restores the interrupt state saved in
 * @flags.
 */
static inline void
read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
{
        spin_unlock_irqrestore(&sl->lock, flags);
}

/**
 * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
 * @lock: Pointer to seqlock_t
 * @seq : Marker and return parameter. An even value on entry selects a
 * *lockless* read section, as in read_seqbegin(); an odd value selects a
 * *locking* read section, as in read_seqlock_excl(). On the first call the
 * caller *must* pass an even value, so that the optimistic lockless read
 * is tried first.
 *
 * The purpose of this API is to try a normal lockless seqlock_t read
 * section first and, if that attempt found an odd counter and therefore
 * failed, to let the next iteration run as a full seqlock_t locking
 * reader.
 *
 * It is typically used to keep lockless seqlock_t readers from starving
 * (too many retry loops) when write side activity spikes.
 *
 * Context: if the seqlock_t write section, *or other read sections*, can
 * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
 * variant of this function instead.
 *
 * Check Documentation/locking/seqlock.rst for template example code.
 *
 * Return: nothing directly. In lockless mode the encountered sequence
 * counter value is stored through @seq; in locking mode @seq is left
 * unchanged. The value must be checked with need_seqretry(), and, if the
 * read section needs to be retried, passed again as @seq to the next
 * read_seqbegin_or_lock() iteration.
 */
static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
{
        if (*seq & 1) {
                /* Odd marker: become a locking reader. */
                read_seqlock_excl(lock);
                return;
        }

        /* Even marker: optimistic lockless read, remember the counter. */
        *seq = read_seqbegin(lock);
}

/**
 * need_seqretry() - validate seqlock_t "locking or lockless" read section
 * @lock: Pointer to seqlock_t
 * @seq: sequence count, from read_seqbegin_or_lock()
 *
 * An odd @seq denotes a locking reader, which never has to retry; only an
 * even @seq (lockless reader) is validated with read_seqretry().
 *
 * Return: true if a read section retry is required, false otherwise
 */
static inline int need_seqretry(seqlock_t *lock, int seq)
{
        /* Locking reader: the section ran under the lock, nothing to check. */
        if (seq & 1)
                return 0;

        return read_seqretry(lock, seq) != 0;
}

/**
 * done_seqretry() - end seqlock_t "locking or lockless" reader section
 * @lock: Pointer to seqlock_t
 * @seq: count, from read_seqbegin_or_lock()
 *
 * Finishes the seqlock_t read side critical section that was started with
 * read_seqbegin_or_lock() and validated by need_seqretry(). Only a locking
 * reader (odd @seq) has anything to release.
 */
static inline void done_seqretry(seqlock_t *lock, int seq)
{
        /* Lockless reader: no lock was taken. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl(lock);
}

/**
 * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
 *                                   a non-interruptible locking reader
 * @lock: Pointer to seqlock_t
 * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
 *
 * _irqsave variant of read_seqbegin_or_lock(). Use it only if the
 * seqlock_t write section, *or other read sections*, can be invoked from
 * hardirq context.
 *
 * Note: Interrupts will be disabled only for "locking reader" mode.
 *
 * Return:
 *
 *   1. The saved local interrupts state in case of a locking reader, to
 *      be passed to done_seqretry_irqrestore(); 0 for a lockless reader.
 *
 *   2. The encountered sequence counter value, returned through @seq
 *      overloaded as a return parameter. Check read_seqbegin_or_lock().
 */
static inline unsigned long
read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
{
        unsigned long saved = 0;

        if (*seq & 1) {
                /* Odd marker: locking reader with interrupts disabled. */
                read_seqlock_excl_irqsave(lock, saved);
        } else {
                /* Even marker: optimistic lockless read. */
                *seq = read_seqbegin(lock);
        }

        return saved;
}

/**
 * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
 *                                non-interruptible locking reader section
 * @lock:  Pointer to seqlock_t
 * @seq:   Count, from read_seqbegin_or_lock_irqsave()
 * @flags: Caller's saved local interrupt state in case of a locking
 *           reader, also from read_seqbegin_or_lock_irqsave()
 *
 * _irqrestore variant of done_seqretry(). The read section must have been
 * opened with read_seqbegin_or_lock_irqsave() and validated by
 * need_seqretry(). @flags is only used for a locking reader (odd @seq).
 */
static inline void
done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
{
        /* Lockless reader: no lock was taken, no interrupt state to restore. */
        if (!(seq & 1))
                return;

        read_sequnlock_excl_irqrestore(lock, flags);
}
#endif /* __LINUX_SEQLOCK_H */

























































    1 
















    1 
    1 









    8 







    8 










   38 
   46 




    9 



    9 




   11 




   11 




















































    2 
    3 

    2 




































































    9 




    1 

    9 

    1 



    5 

    1 
    2 




    1 


    3 
    4 










































































   52 

    4 

   48 
   48 












   16 


    5 



   49 





    8 









    1 

























    7 






    8 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

/* Allocator for the per-context ids stored in struct eventfd_ctx::id. */
static DEFINE_IDA(eventfd_ida);

/* Per-eventfd state, stored in the file's private_data. */
struct eventfd_ctx {
        /* Reference count; released through eventfd_ctx_put(). */
        struct kref kref;
        /* Wait queue for readers, writers and pollers; wqh.lock guards "count". */
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". If EFD_SEMAPHORE flag was not
         * specified, a read(2) will return the "count" value to userspace,
         * and will reset "count" to zero. The kernel side eventfd_signal()
         * also adds to the "count" counter and issues a wakeup.
         */
        __u64 count;
        /* Creation flags; EFD_SEMAPHORE is tested when the counter is read. */
        unsigned int flags;
        /*
         * Id obtained from eventfd_ida and shown in fdinfo. May be negative,
         * in which case eventfd_free_ctx() skips ida_free().
         */
        int id;
};

/**
 * eventfd_signal_mask - Increment the event counter
 * @ctx: [in] Pointer to the eventfd context.
 * @mask: [in] poll mask
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as overflow condition by returning an EPOLLERR
 * to poll(2).
 *
 * The counter is incremented by one (saturating at ULLONG_MAX) and waiters
 * are woken with EPOLLIN | @mask. If the current task is already inside an
 * eventfd wakeup (current->in_eventfd is set), a one-time warning is emitted
 * and nothing else is done.
 */
void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
        unsigned long flags;

        /*
         * Deadlock or stack overflow issues can happen if we recurse here
         * through waitqueue wakeup handlers. If the caller uses potentially
         * nested waitqueues with custom wakeup handlers, then it should
         * check eventfd_signal_allowed() before calling this function. If
         * it returns false, the eventfd_signal() call should be deferred to a
         * safe context.
         */
        if (WARN_ON_ONCE(current->in_eventfd))
                return;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        /* Mark the task as being inside an eventfd wakeup for the check above. */
        current->in_eventfd = 1;
        /* Saturate instead of wrapping around. */
        if (ctx->count < ULLONG_MAX)
                ctx->count++;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
        current->in_eventfd = 0;
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);
}
EXPORT_SYMBOL_GPL(eventfd_signal_mask);

/*
 * Release everything owned by @ctx: return its id to eventfd_ida (only when
 * the id is non-negative, i.e. one was actually obtained) and free the
 * context memory itself.
 */
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        if (ctx->id >= 0)
                ida_free(&eventfd_ida, ctx->id);
        kfree(ctx);
}

/*
 * kref release callback passed to kref_put() by eventfd_ctx_put(): maps the
 * embedded kref back to its eventfd_ctx and frees that context.
 */
static void eventfd_free(struct kref *kref)
{
        eventfd_free_ctx(container_of(kref, struct eventfd_ctx, kref));
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 *
 * eventfd_free() is passed as the kref release callback, so the context is
 * freed once the reference count drops to zero.
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

/*
 * ->release() handler: wake everybody waiting on the eventfd with EPOLLHUP
 * and drop the reference the file held on the context. Always returns 0.
 */
static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, EPOLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

/*
 * ->poll() handler. Registers the caller on the eventfd wait queue and
 * reports readiness from a single snapshot of the counter:
 *
 *   EPOLLIN   counter is non-zero
 *   EPOLLERR  counter is ULLONG_MAX (overflow)
 *   EPOLLOUT  counter is below ULLONG_MAX - 1
 */
static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        __poll_t mask = 0;
        u64 snapshot;

        poll_wait(file, &ctx->wqh, wait);

        /*
         * ctx->count is only ever written with ctx->wqh.lock held.  Reading
         * it here without the lock is fine because poll_wait takes that lock
         * (through add_wait_queue) whenever our caller is going to sleep.
         *
         * The read may thus drift into add_wait_queue's critical section,
         * but it can never be hoisted above it: the spin_lock in
         * add_wait_queue is an acquire barrier, which keeps the read ordered
         * against the writes.  This interleaving CAN occur and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * whereas this one, which would lose a wakeup, cannot occur:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         */
        snapshot = READ_ONCE(ctx->count);

        if (snapshot)
                mask |= EPOLLIN;
        if (snapshot == ULLONG_MAX)
                mask |= EPOLLERR;
        if (snapshot < ULLONG_MAX - 1)
                mask |= EPOLLOUT;

        return mask;
}

/**
 * eventfd_ctx_do_read - Consume the counter of an eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 * @cnt: [out] Pointer to the 64-bit value that was consumed.
 *
 * Must be called with ctx->wqh.lock held. For an EFD_SEMAPHORE eventfd with
 * a non-zero counter, 1 is stored in @cnt; otherwise the whole counter value
 * (possibly zero) is stored. The stored amount is subtracted from the
 * counter.
 */
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        lockdep_assert_held(&ctx->wqh.lock);

        if ((ctx->flags & EFD_SEMAPHORE) && ctx->count)
                *cnt = 1;
        else
                *cnt = ctx->count;

        ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);

/**
 * eventfd_ctx_remove_wait_queue - Read the current counter and remove a wait queue entry.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue entry to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Under ctx->wqh.lock, consumes the counter as eventfd_ctx_do_read() does and
 * unlinks @wait from the eventfd wait queue head, so both happen atomically
 * with respect to other users of that lock. @wait is removed whether or not
 * the counter was zero. If a non-zero value was consumed, remaining waiters
 * are woken with EPOLLOUT.
 *
 * Returns %0 if successful, or the following error codes:
 *
 * -EAGAIN      : The operation would have blocked (the counter was zero).
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt)
{
        unsigned long irqflags;

        spin_lock_irqsave(&ctx->wqh.lock, irqflags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        /* Counter went down: there may be room for blocked writers now. */
        if (*cnt && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, irqflags);

        if (!*cnt)
                return -EAGAIN;

        return 0;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

/*
 * ->read_iter() handler. Consumes the counter (see eventfd_ctx_do_read())
 * and copies the consumed 64-bit value to the destination iterator.
 *
 * Returns sizeof(__u64) on success, or:
 *   -EINVAL       destination is smaller than 8 bytes
 *   -EAGAIN       counter is zero and the file is O_NONBLOCK or the request
 *                 is IOCB_NOWAIT
 *   -ERESTARTSYS  the interruptible wait for a non-zero counter returned
 *                 non-zero
 *   -EFAULT       the value could not be copied out completely
 */
static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct eventfd_ctx *ctx = file->private_data;
        __u64 ucnt = 0;

        if (iov_iter_count(to) < sizeof(ucnt))
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        if (!ctx->count) {
                /* Nothing to read: fail fast for non-blocking callers. */
                if ((file->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        spin_unlock_irq(&ctx->wqh.lock);
                        return -EAGAIN;
                }

                /* Sleep, with wqh.lock handled by the helper, until count != 0. */
                if (wait_event_interruptible_locked_irq(ctx->wqh, ctx->count)) {
                        spin_unlock_irq(&ctx->wqh.lock);
                        return -ERESTARTSYS;
                }
        }
        eventfd_ctx_do_read(ctx, &ucnt);
        /*
         * The counter just went down, so wake EPOLLOUT waiters. in_eventfd
         * is set around the wakeup; eventfd_signal_mask() warns and bails
         * out when it sees it set.
         */
        current->in_eventfd = 1;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        current->in_eventfd = 0;
        spin_unlock_irq(&ctx->wqh.lock);
        /*
         * Copy out after dropping the lock. Note that on -EFAULT the counter
         * has already been consumed above.
         */
        if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
                return -EFAULT;

        return sizeof(ucnt);
}

/*
 * ->write() handler. Adds the 64-bit value supplied by userspace to the
 * counter and wakes EPOLLIN waiters.
 *
 * Returns sizeof(__u64) on success, or:
 *   -EINVAL  @count is not exactly 8 bytes, or the value is ULLONG_MAX
 *   -EFAULT  the value could not be copied from @buf
 *   -EAGAIN  the addition would not keep the counter below ULLONG_MAX and
 *            the file is O_NONBLOCK
 *   otherwise the non-zero result of the interruptible wait for room.
 */
static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;

        if (count != sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        /* Default outcome: no room and not allowed to block. */
        res = -EAGAIN;
        /* Room check written so that it cannot overflow: count + ucnt < ULLONG_MAX. */
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                /* Blocking writer: sleep until a reader makes enough room. */
                res = wait_event_interruptible_locked_irq(ctx->wqh,
                                ULLONG_MAX - ctx->count > ucnt);
                if (!res)
                        res = sizeof(ucnt);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                /* in_eventfd is set around the wakeup, as in eventfd_read(). */
                current->in_eventfd = 1;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
                current->in_eventfd = 0;
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}

#ifdef CONFIG_PROC_FS
/*
 * ->show_fdinfo() handler: prints the counter (hex), the context id and
 * whether EFD_SEMAPHORE is set. Only built with CONFIG_PROC_FS.
 */
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;
        __u64 cnt;

        /* Snapshot the counter under the lock that guards its writers. */
        spin_lock_irq(&ctx->wqh.lock);
        cnt = ctx->count;
        spin_unlock_irq(&ctx->wqh.lock);

        seq_printf(m,
                   "eventfd-count: %16llx\n"
                   "eventfd-id: %d\n"
                   "eventfd-semaphore: %d\n",
                   cnt,
                   ctx->id,
                   !!(ctx->flags & EFD_SEMAPHORE));
}
#endif

/*
 * File operations of eventfd files. The address of this table is also what
 * eventfd_fget() and eventfd_ctx_fileget() compare file->f_op against to
 * recognize an eventfd file.
 */
static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo        = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll                = eventfd_poll,
        .read_iter        = eventfd_read,
        .write                = eventfd_write,
        .llseek                = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Looks up @fd with fget() and verifies that the file uses the eventfd file
 * operations. On the -EINVAL path the file reference is dropped again.
 *
 * Returns a pointer to the eventfd file structure in case of success, or the
 * following error pointer:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file = fget(fd);

        if (!file)
                return ERR_PTR(-EBADF);

        if (file->f_op == &eventfd_fops)
                return file;

        /* Not an eventfd: give the reference back before failing. */
        fput(file);
        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, or one of the following
 * error pointers:
 *
 * -EBADF    : Invalid @fd file descriptor.
 * -EINVAL   : The @fd file descriptor is not an eventfd file (as reported
 *             by eventfd_ctx_fileget()).
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        /*
         * NOTE(review): CLASS(fd, f) presumably drops the file reference
         * automatically when f goes out of scope -- confirm against the
         * definition of the fd class.
         */
        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return ERR_PTR(-EBADF);
        return eventfd_ctx_fileget(fd_file(f));
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);

/**
 * eventfd_ctx_fileget - Take a reference on the eventfd context behind a file.
 * @file: [in] Eventfd file pointer.
 *
 * Returns the eventfd context stored in @file with its kref incremented,
 * or the error pointer:
 *
 * -EINVAL   : @file is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        struct eventfd_ctx *ctx = ERR_PTR(-EINVAL);

        /* Only files using eventfd_fops carry an eventfd_ctx. */
        if (file->f_op == &eventfd_fops) {
                ctx = file->private_data;
                kref_get(&ctx->kref);
        }

        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

/*
 * Common implementation behind the eventfd() and eventfd2() syscalls.
 *
 * @count: initial value stored in the context counter.
 * @flags: EFD_* flags; any bit outside EFD_FLAGS_SET is rejected.
 *
 * Returns the new file descriptor on success, or a negative errno:
 * -EINVAL for unknown flag bits, -ENOMEM when the context cannot be
 * allocated, the negative value from get_unused_fd_flags(), or the
 * PTR_ERR() of a failed anon_inode_getfile_fmode().
 */
static int do_eventfd(unsigned int count, int flags)
{
        struct eventfd_ctx *ctx;
        struct file *file;
        int fd;

        /* Check the EFD_* constants for consistency.  */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
        BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));

        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        /* The context keeps the full flag set, before it is narrowed below. */
        ctx->flags = flags;
        /*
         * NOTE(review): the ida_alloc() result is stored without a check;
         * presumably eventfd_free_ctx() copes with a negative id -- confirm.
         */
        ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);

        /* Only the flags shared with fcntl, plus O_RDWR, go to the fd/file. */
        flags &= EFD_SHARED_FCNTL_FLAGS;
        flags |= O_RDWR;
        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                goto err;

        file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
                                        ctx, flags, FMODE_NOWAIT);
        if (IS_ERR(file)) {
                /* Release the reserved descriptor and report the file error. */
                put_unused_fd(fd);
                fd = PTR_ERR(file);
                goto err;
        }
        fd_install(fd, file);
        return fd;
err:
        /* fd holds the negative error code at this point. */
        eventfd_free_ctx(ctx);
        return fd;
}

/* eventfd2 syscall entry: forwards @count and @flags to do_eventfd(). */
SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}

/* eventfd syscall entry: same as eventfd2 with no flags set. */
SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}











































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CPUSET_H
#define _LINUX_CPUSET_H
/*
 *  cpuset interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/sched/topology.h>
#include <linux/sched/task.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>
#include <linux/mm.h>
#include <linux/mmu_context.h>
#include <linux/jump_label.h>

#ifdef CONFIG_CPUSETS

/*
 * Static branch rewrites can happen in an arbitrary order for a given
 * key. In code paths where we need to loop with read_mems_allowed_begin() and
 * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
 * to ensure that begin() always gets rewritten before retry() in the
 * disabled -> enabled transition. If not, then if local irqs are disabled
 * around the loop, we can deadlock since retry() would always be
 * comparing the latest value of the mems_allowed seqcount against 0 as
 * begin() still would see cpusets_enabled() as false. The enabled -> disabled
 * transition should happen in reverse order for the same reasons (want to stop
 * looking at real value of mems_allowed.sequence in retry() first).
 */
extern struct static_key_false cpusets_pre_enable_key;
extern struct static_key_false cpusets_enabled_key;
extern struct static_key_false cpusets_insane_config_key;

/* Reports whether the cpusets_enabled_key static branch is currently on. */
static inline bool cpusets_enabled(void)
{
        return static_branch_unlikely(&cpusets_enabled_key);
}

/*
 * Increment both cpuset static keys.  The pre-enable key (tested by
 * read_mems_allowed_begin()) is bumped before the enabled key (tested by
 * read_mems_allowed_retry()); do not reorder these two calls.
 */
static inline void cpuset_inc(void)
{
        static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
        static_branch_inc_cpuslocked(&cpusets_enabled_key);
}

/*
 * Decrement both cpuset static keys in the reverse order of cpuset_inc():
 * the enabled key first, then the pre-enable key.
 */
static inline void cpuset_dec(void)
{
        static_branch_dec_cpuslocked(&cpusets_enabled_key);
        static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
}

/*
 * This will get enabled whenever a cpuset configuration is considered
 * unsupportable in general. E.g. movable only node which cannot satisfy
 * any non movable allocations (see update_nodemask). Page allocator
 * needs to make additional checks for those configurations and this
 * check is meant to guard those checks without any overhead for sane
 * configurations.
 */
static inline bool cpusets_insane_config(void)
{
        /* Static-key test on cpusets_insane_config_key. */
        return static_branch_unlikely(&cpusets_insane_config_key);
}

extern int cpuset_init(void);
extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void inc_dl_tasks_cs(struct task_struct *task);
extern void dec_dl_tasks_cs(struct task_struct *task);
extern void cpuset_lock(void);
extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern bool cpuset_cpu_is_isolated(int cpu);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);

extern bool cpuset_node_allowed(int node, gfp_t gfp_mask);

/*
 * Unconditional zone check: asks cpuset_node_allowed() about the node
 * that zone @z belongs to, using allocation mask @gfp_mask.
 */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        int nid = zone_to_nid(z);

        return cpuset_node_allowed(nid, gfp_mask);
}

/*
 * Zone check that short-circuits to "allowed" while cpusets are not
 * enabled, and otherwise defers to __cpuset_zone_allowed().
 */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        if (!cpusets_enabled())
                return true;

        return __cpuset_zone_allowed(z, gfp_mask);
}

extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                          const struct task_struct *tsk2);

#ifdef CONFIG_CPUSETS_V1
/*
 * Calls __cpuset_memory_pressure_bump() only while the
 * cpuset_memory_pressure_enabled flag is non-zero.
 */
#define cpuset_memory_pressure_bump()                                 \
        do {                                                        \
                if (cpuset_memory_pressure_enabled)                \
                        __cpuset_memory_pressure_bump();        \
        } while (0)
extern int cpuset_memory_pressure_enabled;
extern void __cpuset_memory_pressure_bump(void);
#else
/* Without CONFIG_CPUSETS_V1 the bump is a no-op. */
static inline void cpuset_memory_pressure_bump(void) { }
#endif

extern void cpuset_task_status_allowed(struct seq_file *m,
                                        struct task_struct *task);
extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                            struct pid *pid, struct task_struct *tsk);

extern int cpuset_mem_spread_node(void);

/* Returns the result of task_spread_page() for the current task. */
static inline int cpuset_do_page_mem_spread(void)
{
        int spread = task_spread_page(current);

        return spread;
}

extern bool current_cpuset_is_being_rebound(void);

extern void dl_rebuild_rd_accounting(void);
extern void rebuild_sched_domains(void);

extern void cpuset_print_current_mems_allowed(void);
extern void cpuset_reset_sched_domains(void);

/*
 * read_mems_allowed_begin is required when making decisions involving
 * mems_allowed such as during page allocation. mems_allowed can be updated in
 * parallel and depending on the new value an operation can fail potentially
 * causing process failure. A retry loop with read_mems_allowed_begin and
 * read_mems_allowed_retry prevents these artificial failures.
 */
static inline unsigned int read_mems_allowed_begin(void)
{
        /* Gated on the pre-enable key; 0 is returned as the cookie while it is off. */
        if (!static_branch_unlikely(&cpusets_pre_enable_key))
                return 0;

        /* Sequence cookie of current->mems_allowed_seq, to pass to read_mems_allowed_retry(). */
        return read_seqcount_begin(&current->mems_allowed_seq);
}

/*
 * If this returns true, the operation that took place after
 * read_mems_allowed_begin may have failed artificially due to a concurrent
 * update of mems_allowed. It is up to the caller to retry the operation if
 * appropriate.
 */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        /* Gated on the enabled key (not the pre-enable key): never retry while it is off. */
        if (!static_branch_unlikely(&cpusets_enabled_key))
                return false;

        /* @seq is the cookie previously obtained from read_mems_allowed_begin(). */
        return read_seqcount_retry(&current->mems_allowed_seq, seq);
}

/*
 * Replace current->mems_allowed with @nodemask.  The store is done with
 * the task lock held and local interrupts disabled, inside a
 * write_seqcount_begin()/write_seqcount_end() section on
 * current->mems_allowed_seq so that read_mems_allowed_begin()/retry()
 * readers can detect the update.
 */
static inline void set_mems_allowed(nodemask_t nodemask)
{
        unsigned long flags;

        task_lock(current);
        local_irq_save(flags);
        write_seqcount_begin(&current->mems_allowed_seq);
        current->mems_allowed = nodemask;
        write_seqcount_end(&current->mems_allowed_seq);
        local_irq_restore(flags);
        task_unlock(current);
}

#else /* !CONFIG_CPUSETS */

/* !CONFIG_CPUSETS: cpusets are never enabled. */
static inline bool cpusets_enabled(void)
{
        return false;
}

/* !CONFIG_CPUSETS: there is no cpuset configuration that could be insane. */
static inline bool cpusets_insane_config(void)
{
        return false;
}

/* !CONFIG_CPUSETS: initialisation is a no-op that reports success (0). */
static inline int cpuset_init(void) { return 0; }
/* !CONFIG_CPUSETS: nothing to set up. */
static inline void cpuset_init_smp(void) {}

/* !CONFIG_CPUSETS: nothing to rebuild. */
static inline void cpuset_force_rebuild(void) { }

/* !CONFIG_CPUSETS: directly calls partition_sched_domains(1, NULL, NULL). */
static inline void cpuset_update_active_cpus(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* !CONFIG_CPUSETS: the following four helpers are empty stubs. */
static inline void inc_dl_tasks_cs(struct task_struct *task) { }
static inline void dec_dl_tasks_cs(struct task_struct *task) { }
static inline void cpuset_lock(void) { }
static inline void cpuset_unlock(void) { }

/* !CONFIG_CPUSETS: @mask is filled with a copy of task_cpu_possible_mask(@p). */
static inline void cpuset_cpus_allowed(struct task_struct *p,
                                       struct cpumask *mask)
{
        cpumask_copy(mask, task_cpu_possible_mask(p));
}

/* !CONFIG_CPUSETS: always returns false; @p is not examined. */
static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p)
{
        return false;
}

/* !CONFIG_CPUSETS: always returns false for every @cpu. */
static inline bool cpuset_cpu_is_isolated(int cpu)
{
        return false;
}

/* !CONFIG_CPUSETS: returns node_possible_map regardless of @p. */
static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
{
        return node_possible_map;
}

/* !CONFIG_CPUSETS: expands to node_states[N_MEMORY] instead of a per-task mask. */
#define cpuset_current_mems_allowed (node_states[N_MEMORY])
/* !CONFIG_CPUSETS: nothing to initialise. */
static inline void cpuset_init_current_mems_allowed(void) {}

/* !CONFIG_CPUSETS: always returns 1; @nodemask is not examined. */
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
        return 1;
}

/* !CONFIG_CPUSETS: every zone is allowed. */
static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* !CONFIG_CPUSETS: every zone is allowed. */
static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
{
        return true;
}

/* !CONFIG_CPUSETS: always returns 1; neither task is examined. */
static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
                                                 const struct task_struct *tsk2)
{
        return 1;
}

/* !CONFIG_CPUSETS: empty stub. */
static inline void cpuset_memory_pressure_bump(void) {}

/* !CONFIG_CPUSETS: writes nothing to @m. */
static inline void cpuset_task_status_allowed(struct seq_file *m,
                                                struct task_struct *task)
{
}

/* !CONFIG_CPUSETS: always returns 0. */
static inline int cpuset_mem_spread_node(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: always returns 0. */
static inline int cpuset_do_page_mem_spread(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: always returns false. */
static inline bool current_cpuset_is_being_rebound(void)
{
        return false;
}

/* !CONFIG_CPUSETS: empty stub. */
static inline void dl_rebuild_rd_accounting(void)
{
}

/* !CONFIG_CPUSETS: directly calls partition_sched_domains(1, NULL, NULL). */
static inline void rebuild_sched_domains(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* !CONFIG_CPUSETS: same call as rebuild_sched_domains() above. */
static inline void cpuset_reset_sched_domains(void)
{
        partition_sched_domains(1, NULL, NULL);
}

/* !CONFIG_CPUSETS: prints nothing. */
static inline void cpuset_print_current_mems_allowed(void)
{
}

/* !CONFIG_CPUSETS: @nodemask is ignored. */
static inline void set_mems_allowed(nodemask_t nodemask)
{
}

/* !CONFIG_CPUSETS: the sequence cookie is always 0. */
static inline unsigned int read_mems_allowed_begin(void)
{
        return 0;
}

/* !CONFIG_CPUSETS: never asks the caller to retry. */
static inline bool read_mems_allowed_retry(unsigned int seq)
{
        return false;
}

#endif /* !CONFIG_CPUSETS */

#endif /* _LINUX_CPUSET_H */
























    1 
    2 




































    2 























    2 




    1 


    2 































































































    7 



    7 




























    2 


    2 


    2 











    2 







    2 

    1 



    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
// SPDX-License-Identifier: GPL-2.0
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/list_sort.h>
#include <linux/list.h>

/*
 * Merge the two null-terminated, singly-linked sorted lists @a and @b.
 *
 * The result is in the intermediate format used between merge() calls:
 * null-terminated, no reserved or sentinel head node, and "prev" links
 * are not maintained.
 */
__attribute__((nonnull(2,3,4)))
static struct list_head *merge(void *priv, list_cmp_func_t cmp,
                                struct list_head *a, struct list_head *b)
{
        struct list_head *result;
        struct list_head **link = &result;

        for (;;) {
                if (cmp(priv, a, b) > 0) {
                        /* b sorts strictly before a: emit b */
                        *link = b;
                        link = &b->next;
                        b = *link;
                        if (b)
                                continue;
                        /* b ran out: the rest of a is the tail */
                        *link = a;
                        break;
                }

                /* a <= b: emit a; ties go to a to keep the sort stable */
                *link = a;
                link = &a->next;
                a = *link;
                if (a)
                        continue;
                /* a ran out: the rest of b is the tail */
                *link = b;
                break;
        }

        return result;
}

/*
 * Combine final list merge with restoration of standard doubly-linked
 * list structure.  This approach duplicates code from merge(), but
 * runs faster than the tidier alternatives of either a separate final
 * prev-link restoration pass, or maintaining the prev links
 * throughout.
 */
__attribute__((nonnull(2,3,4,5)))
static void merge_final(void *priv, list_cmp_func_t cmp, struct list_head *head,
                        struct list_head *a, struct list_head *b)
{
        /* tail starts at @head so the first node gets head as its prev */
        struct list_head *tail = head;
        /* 8-bit counter: wraps to zero every 256 increments (see loop below) */
        u8 count = 0;

        for (;;) {
                /* if equal, take 'a' -- important for sort stability */
                if (cmp(priv, a, b) <= 0) {
                        tail->next = a;
                        a->prev = tail;
                        tail = a;
                        a = a->next;
                        if (!a)
                                break;
                } else {
                        tail->next = b;
                        b->prev = tail;
                        tail = b;
                        b = b->next;
                        if (!b) {
                                /* 'b' always names the leftover list after the loop */
                                b = a;
                                break;
                        }
                }
        }

        /* Finish linking remainder of list b on to tail */
        tail->next = b;
        do {
                /*
                 * If the merge is highly unbalanced (e.g. the input is
                 * already sorted), this loop may run many iterations.
                 * Continue callbacks to the client even though no
                 * element comparison is needed, so the client's cmp()
                 * routine can invoke cond_resched() periodically.
                 */
                /* fires each time the u8 counter wraps; the cmp() result is discarded */
                if (unlikely(!++count))
                        cmp(priv, b, b);
                b->prev = tail;
                tail = b;
                b = b->next;
        } while (b);

        /* And the final links to make a circular doubly-linked list */
        tail->next = head;
        head->prev = tail;
}

/**
 * list_sort - sort a list
 * @priv: private data, opaque to list_sort(), passed to @cmp
 * @head: the list to sort
 * @cmp: the elements comparison function
 *
 * The comparison function @cmp must return > 0 if @a should sort after
 * @b ("@a > @b" if you want an ascending sort), and <= 0 if @a should
 * sort before @b *or* their original order should be preserved.  It is
 * always called with the element that came first in the input in @a,
 * and list_sort is a stable sort, so it is not necessary to distinguish
 * the @a < @b and @a == @b cases.
 *
 * The comparison function must adhere to specific mathematical properties
 * to ensure correct and stable sorting:
 * - Antisymmetry: cmp(@a, @b) must return the opposite sign of
 * cmp(@b, @a).
 * - Transitivity: if cmp(@a, @b) <= 0 and cmp(@b, @c) <= 0, then
 * cmp(@a, @c) <= 0.
 *
 * This is compatible with two styles of @cmp function:
 * - The traditional style which returns <0 / =0 / >0, or
 * - Returning a boolean 0/1.
 * The latter offers a chance to save a few cycles in the comparison
 * (which is used by e.g. plug_ctx_cmp() in block/blk-mq.c).
 *
 * A good way to write a multi-word comparison is::
 *
 *        if (a->high != b->high)
 *                return a->high > b->high;
 *        if (a->middle != b->middle)
 *                return a->middle > b->middle;
 *        return a->low > b->low;
 *
 *
 * This mergesort is as eager as possible while always performing at least
 * 2:1 balanced merges.  Given two pending sublists of size 2^k, they are
 * merged to a size-2^(k+1) list as soon as we have 2^k following elements.
 *
 * Thus, it will avoid cache thrashing as long as 3*2^k elements can
 * fit into the cache.  Not quite as good as a fully-eager bottom-up
 * mergesort, but it does use 0.2*n fewer comparisons, so is faster in
 * the common case that everything fits into L1.
 *
 *
 * The merging is controlled by "count", the number of elements in the
 * pending lists.  This is beautifully simple code, but rather subtle.
 *
 * Each time we increment "count", we set one bit (bit k) and clear
 * bits k-1 .. 0.  Each time this happens (except the very first time
 * for each bit, when count increments to 2^k), we merge two lists of
 * size 2^k into one list of size 2^(k+1).
 *
 * This merge happens exactly when the count reaches an odd multiple of
 * 2^k, which is when we have 2^k elements pending in smaller lists,
 * so it's safe to merge away two lists of size 2^k.
 *
 * After this happens twice, we have created two lists of size 2^(k+1),
 * which will be merged into a list of size 2^(k+2) before we create
 * a third list of size 2^(k+1), so there are never more than two pending.
 *
 * The number of pending lists of size 2^k is determined by the
 * state of bit k of "count" plus two extra pieces of information:
 *
 * - The state of bit k-1 (when k == 0, consider bit -1 always set), and
 * - Whether the higher-order bits are zero or non-zero (i.e.
 *   is count >= 2^(k+1)).
 *
 * There are six states we distinguish.  "x" represents some arbitrary
 * bits, and "y" represents some arbitrary non-zero bits:
 * 0:  00x: 0 pending of size 2^k;           x pending of sizes < 2^k
 * 1:  01x: 0 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
 * 2: x10x: 0 pending of size 2^k; 2^k     + x pending of sizes < 2^k
 * 3: x11x: 1 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
 * 4: y00x: 1 pending of size 2^k; 2^k     + x pending of sizes < 2^k
 * 5: y01x: 2 pending of size 2^k; 2^(k-1) + x pending of sizes < 2^k
 * (merge and loop back to state 2)
 *
 * We gain lists of size 2^k in the 2->3 and 4->5 transitions (because
 * bit k-1 is set while the more significant bits are non-zero) and
 * merge them away in the 5->2 transition.  Note in particular that just
 * before the 5->2 transition, all lower-order bits are 11 (state 3),
 * so there is one list of each smaller size.
 *
 * When we reach the end of the input, we merge all the pending
 * lists, from smallest to largest.  If you work through cases 2 to
 * 5 above, you can see that the number of elements we merge with a list
 * of size 2^k varies from 2^(k-1) (cases 3 and 5 when x == 0) to
 * 2^(k+1) - 1 (second merge of case 5 when x == 2^(k-1) - 1).
 */
__attribute__((nonnull(2,3)))
void list_sort(void *priv, struct list_head *head, list_cmp_func_t cmp)
{
        /* list: unsorted input still to consume; pending: sorted sublists chained via ->prev */
        struct list_head *list = head->next, *pending = NULL;
        size_t count = 0;        /* Count of pending */

        if (list == head->prev)        /* Zero or one elements */
                return;

        /* Convert to a null-terminated singly-linked list. */
        head->prev->next = NULL;

        /*
         * Data structure invariants:
         * - All lists are singly linked and null-terminated; prev
         *   pointers are not maintained.
         * - pending is a prev-linked "list of lists" of sorted
         *   sublists awaiting further merging.
         * - Each of the sorted sublists is power-of-two in size.
         * - Sublists are sorted by size and age, smallest & newest at front.
         * - There are zero to two sublists of each size.
         * - A pair of pending sublists are merged as soon as the number
         *   of following pending elements equals their size (i.e.
         *   each time count reaches an odd multiple of that size).
         *   That ensures each later final merge will be at worst 2:1.
         * - Each round consists of:
         *   - Merging the two sublists selected by the highest bit
         *     which flips when count is incremented, and
         *   - Adding an element from the input as a size-1 sublist.
         */
        do {
                size_t bits;
                struct list_head **tail = &pending;

                /* Find the least-significant clear bit in count */
                /* (tail advances one pending sublist per trailing set bit) */
                for (bits = count; bits & 1; bits >>= 1)
                        tail = &(*tail)->prev;
                /* Do the indicated merge */
                /* bits == 0 means count was all ones: nothing to merge this round */
                if (likely(bits)) {
                        struct list_head *a = *tail, *b = a->prev;

                        /* b is passed first so ties favour it (it comes first in pending order) */
                        a = merge(priv, cmp, b, a);
                        /* Install the merged result in place of the inputs */
                        a->prev = b->prev;
                        *tail = a;
                }

                /* Move one element from input list to pending */
                list->prev = pending;
                pending = list;
                list = list->next;
                pending->next = NULL;
                count++;
        } while (list);

        /* End of input; merge together all the pending lists. */
        list = pending;
        pending = pending->prev;
        for (;;) {
                struct list_head *next = pending->prev;

                /* Stop with the last pending sublist unmerged; merge_final() handles it */
                if (!next)
                        break;
                list = merge(priv, cmp, pending, list);
                pending = next;
        }
        /* The final merge, rebuilding prev links */
        merge_final(priv, cmp, head, pending, list);
}
EXPORT_SYMBOL(list_sort);





























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#ifndef LLC_PDU_H
#define LLC_PDU_H
/*
 * Copyright (c) 1997 by Procom Technology,Inc.
 *                  2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */

#include <linux/if_ether.h>

/* Lengths of frame formats */
#define LLC_PDU_LEN_I                4       /* header and 2 control bytes */
#define LLC_PDU_LEN_S                4
#define LLC_PDU_LEN_U                3       /* header and 1 control byte */
/* header and 1 control byte and XID info */
#define LLC_PDU_LEN_U_XID        (LLC_PDU_LEN_U + sizeof(struct llc_xid_info))
/* Known SAP addresses */
#define LLC_GLOBAL_SAP        0xFF
#define LLC_NULL_SAP        0x00        /* not network-layer visible */
#define LLC_MGMT_INDIV        0x02        /* station LLC mgmt indiv addr */
#define LLC_MGMT_GRP        0x03        /* station LLC mgmt group addr */
#define LLC_RDE_SAP        0xA6        /* route ... */

/* SAP field bit masks */
#define LLC_ISO_RESERVED_SAP        0x02
#define LLC_SAP_GROUP_DSAP        0x01
#define LLC_SAP_RESP_SSAP        0x01

/* Group/individual DSAP indicator in DSAP field */
#define LLC_PDU_GROUP_DSAP_MASK    0x01
/*
 * @pdu is a pointer to a PDU header with a 'dsap' member; it is fully
 * parenthesized so any pointer expression may be passed.
 *
 * NOTE(review): as written, LLC_PDU_IS_GROUP_DSAP() yields 1 when the mask
 * bit is CLEAR and LLC_PDU_IS_INDIV_DSAP() yields 1 when it is SET, which
 * looks inverted relative to the macro names.  The results are deliberately
 * left unchanged here; confirm against callers before flipping them.
 */
#define LLC_PDU_IS_GROUP_DSAP(pdu)      \
        (((pdu)->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1)
#define LLC_PDU_IS_INDIV_DSAP(pdu)      \
        (!((pdu)->dsap & LLC_PDU_GROUP_DSAP_MASK) ? 0 : 1)

/* Command/response PDU indicator in SSAP field */
#define LLC_PDU_CMD_RSP_MASK        0x01
#define LLC_PDU_CMD                0
#define LLC_PDU_RSP                1
/*
 * @pdu is a pointer to a PDU header with an 'ssap' member.  Each macro
 * evaluates to 1 or 0 depending on the low-order SSAP bit; @pdu is
 * parenthesized so any pointer expression may be passed.
 */
#define LLC_PDU_IS_CMD(pdu)    (((pdu)->ssap & LLC_PDU_RSP) ? 0 : 1)
#define LLC_PDU_IS_RSP(pdu)    (((pdu)->ssap & LLC_PDU_RSP) ? 1 : 0)

/* Get PDU type from 2 lowest-order bits of control field first byte */
#define LLC_PDU_TYPE_I_MASK    0x01        /* 16-bit control field */
#define LLC_PDU_TYPE_S_MASK    0x03
#define LLC_PDU_TYPE_U_MASK    0x03        /* 8-bit control field */
#define LLC_PDU_TYPE_MASK      0x03

#define LLC_PDU_TYPE_I                0        /* first bit */
#define LLC_PDU_TYPE_S                1        /* first two bits */
#define LLC_PDU_TYPE_U                3        /* first two bits */
#define LLC_PDU_TYPE_U_XID        4        /* private type for detecting XID commands */

/*
 * Type predicates: @pdu is a pointer to a PDU header with a 'ctrl_1'
 * member.  Each evaluates to 1 when the masked low bits of ctrl_1 match
 * the type, else 0.  @pdu is parenthesized so any pointer expression may
 * be passed.
 */
#define LLC_PDU_TYPE_IS_I(pdu) \
        ((!((pdu)->ctrl_1 & LLC_PDU_TYPE_I_MASK)) ? 1 : 0)

#define LLC_PDU_TYPE_IS_U(pdu) \
        ((((pdu)->ctrl_1 & LLC_PDU_TYPE_U_MASK) == LLC_PDU_TYPE_U) ? 1 : 0)

#define LLC_PDU_TYPE_IS_S(pdu) \
        ((((pdu)->ctrl_1 & LLC_PDU_TYPE_S_MASK) == LLC_PDU_TYPE_S) ? 1 : 0)

/*
 * U-format PDU control field masks.  @pdu is a pointer to a PDU header
 * with a 'ctrl_1' member; it is parenthesized so any pointer expression
 * may be passed.
 */
#define LLC_U_PF_BIT_MASK      0x10        /* P/F bit mask */
#define LLC_U_PF_IS_1(pdu)     (((pdu)->ctrl_1 & LLC_U_PF_BIT_MASK) ? 1 : 0)
#define LLC_U_PF_IS_0(pdu)     ((!((pdu)->ctrl_1 & LLC_U_PF_BIT_MASK)) ? 1 : 0)

#define LLC_U_PDU_CMD_MASK     0xEC        /* cmd/rsp mask */
/* Both extract the same masked bits of ctrl_1 */
#define LLC_U_PDU_CMD(pdu)     ((pdu)->ctrl_1 & LLC_U_PDU_CMD_MASK)
#define LLC_U_PDU_RSP(pdu)     ((pdu)->ctrl_1 & LLC_U_PDU_CMD_MASK)

#define LLC_1_PDU_CMD_UI       0x00        /* Type 1 cmds/rsps */
#define LLC_1_PDU_CMD_XID      0xAC
#define LLC_1_PDU_CMD_TEST     0xE0

#define LLC_2_PDU_CMD_SABME    0x6C        /* Type 2 cmds/rsps */
#define LLC_2_PDU_CMD_DISC     0x40
#define LLC_2_PDU_RSP_UA       0x60
#define LLC_2_PDU_RSP_DM       0x0C
#define LLC_2_PDU_RSP_FRMR     0x84

/* Type 1 operations */

/* XID information field bit masks */

/* LLC format identifier (byte 1) */
#define LLC_XID_FMT_ID                0x81        /* first byte must be this */

/* LLC types/classes identifier (byte 2) */
#define LLC_XID_CLASS_ZEROS_MASK        0xE0        /* these must be zeros */
#define LLC_XID_CLASS_MASK                0x1F        /* AND with byte to get below */

#define LLC_XID_NULL_CLASS_1        0x01        /* if NULL LSAP...use these */
#define LLC_XID_NULL_CLASS_2        0x03
#define LLC_XID_NULL_CLASS_3        0x05
#define LLC_XID_NULL_CLASS_4        0x07

#define LLC_XID_NNULL_TYPE_1        0x01        /* if non-NULL LSAP...use these */
#define LLC_XID_NNULL_TYPE_2        0x02
#define LLC_XID_NNULL_TYPE_3        0x04
#define LLC_XID_NNULL_TYPE_1_2        0x03
#define LLC_XID_NNULL_TYPE_1_3        0x05
#define LLC_XID_NNULL_TYPE_2_3        0x06
#define LLC_XID_NNULL_ALL                0x07

/* Sender Receive Window (byte 3) */
#define LLC_XID_RW_MASK        0xFE        /* AND with value to get below */

#define LLC_XID_MIN_RW        0x02        /* lowest-order bit always zero */

/* Type 2 operations */

#define LLC_2_SEQ_NBR_MODULO   ((u8) 128)

/*
 * I-PDU masks.  @pdu is a pointer to a PDU header with 'ctrl_1' and
 * 'ctrl_2' members.  N(S)/N(R) are the upper seven bits of ctrl_1/ctrl_2
 * shifted down by one; the P/F bit is the low bit of ctrl_2.  Arguments
 * and whole expansions are parenthesized so the macros compose safely.
 */
#define LLC_I_GET_NS(pdu)     ((u8)(((pdu)->ctrl_1 & 0xFE) >> 1))
#define LLC_I_GET_NR(pdu)     ((u8)(((pdu)->ctrl_2 & 0xFE) >> 1))

#define LLC_I_PF_BIT_MASK      0x01

#define LLC_I_PF_IS_0(pdu)     ((!((pdu)->ctrl_2 & LLC_I_PF_BIT_MASK)) ? 1 : 0)
#define LLC_I_PF_IS_1(pdu)     (((pdu)->ctrl_2 & LLC_I_PF_BIT_MASK) ? 1 : 0)

/*
 * S-PDU supervisory commands and responses.  @pdu is a pointer to a PDU
 * header with a 'ctrl_1' member; both macros extract the same masked bits.
 * @pdu is parenthesized so any pointer expression may be passed.
 */

#define LLC_S_PDU_CMD_MASK     0x0C
#define LLC_S_PDU_CMD(pdu)     ((pdu)->ctrl_1 & LLC_S_PDU_CMD_MASK)
#define LLC_S_PDU_RSP(pdu)     ((pdu)->ctrl_1 & LLC_S_PDU_CMD_MASK)

#define LLC_2_PDU_CMD_RR       0x00        /* rx ready cmd */
#define LLC_2_PDU_RSP_RR       0x00        /* rx ready rsp */
#define LLC_2_PDU_CMD_REJ      0x08        /* reject PDU cmd */
#define LLC_2_PDU_RSP_REJ      0x08        /* reject PDU rsp */
#define LLC_2_PDU_CMD_RNR      0x04        /* rx not ready cmd */
#define LLC_2_PDU_RSP_RNR      0x04        /* rx not ready rsp */

#define LLC_S_PF_BIT_MASK      0x01
#define LLC_S_PF_IS_0(pdu)     ((!(pdu->ctrl_2 & LLC_S_PF_BIT_MASK)) ? 1 : 0)
#define LLC_S_PF_IS_1(pdu)     ((pdu->ctrl_2 & LLC_S_PF_BIT_MASK) ? 1 : 0)

#define PDU_SUPV_GET_Nr(pdu)   ((pdu->ctrl_2 & 0xFE) >> 1)
#define PDU_GET_NEXT_Vr(sn)    (((sn) + 1) & ~LLC_2_SEQ_NBR_MODULO)

/* FRMR information field macros */

#define FRMR_INFO_LENGTH       5        /* 5 bytes of information */

/*
 * info is pointer to FRMR info field structure; 'rej_ctrl' is byte pointer
 * (if U-PDU) or word pointer to rejected PDU control field.
 *
 * The first control byte is tested against LLC_PDU_TYPE_U: for a U-PDU only
 * that single byte is stored (zero-extended to 16 bits); otherwise the
 * 16-bit control word is read and stored as is.
 *
 * Both arguments are parenthesized so that pointer expressions such as
 * 'buf + 1' are cast as a whole, and the expansion is wrapped so it behaves
 * as a single expression.
 */
#define FRMR_INFO_SET_REJ_CNTRL(info,rej_ctrl) \
        ((info)->rej_pdu_ctrl = ((*((u8 *) (rej_ctrl)) & \
                                LLC_PDU_TYPE_U) != LLC_PDU_TYPE_U ? \
                                (u16)*((u16 *) (rej_ctrl)) : \
                                (((u16) *((u8 *) (rej_ctrl))) & 0x00FF)))

/*
 * Info is pointer to FRMR info field structure; 'vs' (resp. 'vr') is a byte
 * containing the send (resp. receive) state variable value in its low-order
 * 7 bits. The value is stored shifted left by one, which leaves the
 * lowest-order bit of the destination byte zero.
 *
 * Arguments are parenthesized so that pointer and arithmetic expressions
 * expand as a unit.
 */
#define FRMR_INFO_SET_Vs(info,vs) ((info)->curr_ssv = (((u8) (vs)) << 1))
#define FRMR_INFO_SET_Vr(info,vr) ((info)->curr_rsv = (((u8) (vr)) << 1))

/*
 * Info is pointer to FRMR info field structure; 'cr' is a byte containing
 * the C/R bit value in the low-order bit. The bit is OR-ed into the
 * lowest-order bit of curr_rsv; the other bits of curr_rsv are left alone
 * and a zero 'cr' does not clear a bit that is already set.
 *
 * Arguments are parenthesized so that expressions expand as a unit.
 */
#define FRMR_INFO_SET_C_R_BIT(info, cr)  ((info)->curr_rsv |= (((u8) (cr)) & 0x01))

/*
 * In the remaining five macros, 'info' is pointer to FRMR info field
 * structure and 'ind' is a byte holding the indicator bits.
 *
 * Each macro replaces exactly one bit of info->ind_bits with the bit at the
 * SAME position in 'ind' (0x01, 0x02, 0x04, 0x08 and 0x10 respectively), so
 * 'ind' must carry the flag at that bit position, not in its lowest-order
 * bit. The same indicator byte can therefore be passed to all five macros.
 *
 * Arguments are parenthesized so that expressions expand as a unit.
 */
#define FRMR_INFO_SET_INVALID_PDU_CTRL_IND(info, ind) \
       ((info)->ind_bits = (((info)->ind_bits & 0xFE) | (((u8) (ind)) & 0x01)))

#define FRMR_INFO_SET_INVALID_PDU_INFO_IND(info, ind) \
       ((info)->ind_bits = (((info)->ind_bits & 0xFD) | (((u8) (ind)) & 0x02)))

#define FRMR_INFO_SET_PDU_INFO_2LONG_IND(info, ind) \
       ((info)->ind_bits = (((info)->ind_bits & 0xFB) | (((u8) (ind)) & 0x04)))

#define FRMR_INFO_SET_PDU_INVALID_Nr_IND(info, ind) \
       ((info)->ind_bits = (((info)->ind_bits & 0xF7) | (((u8) (ind)) & 0x08)))

#define FRMR_INFO_SET_PDU_INVALID_Ns_IND(info, ind) \
       ((info)->ind_bits = (((info)->ind_bits & 0xEF) | (((u8) (ind)) & 0x10)))

/* Sequence-numbered PDU format (4 bytes in length) */
struct llc_pdu_sn {
        u8 dsap;        /* destination SAP */
        u8 ssap;        /* source SAP; low-order bit is the command/response bit */
        u8 ctrl_1;        /* first control byte (N(S) in the upper 7 bits of an I-PDU) */
        u8 ctrl_2;        /* second control byte (N(R) in the upper 7 bits, P/F in bit 0) */
} __packed;

/* View the skb's network header as a sequence-numbered (4 byte) LLC PDU. */
static inline struct llc_pdu_sn *llc_pdu_sn_hdr(struct sk_buff *skb)
{
        void *hdr = skb_network_header(skb);

        return (struct llc_pdu_sn *)hdr;
}

/* Un-numbered PDU format (3 bytes in length) */
struct llc_pdu_un {
        u8 dsap;        /* destination SAP */
        u8 ssap;        /* source SAP; low-order bit is the command/response bit */
        u8 ctrl_1;        /* single control byte */
} __packed;

/* View the skb's network header as an un-numbered (3 byte) LLC PDU. */
static inline struct llc_pdu_un *llc_pdu_un_hdr(struct sk_buff *skb)
{
        void *hdr = skb_network_header(skb);

        return (struct llc_pdu_un *)hdr;
}

/**
 *        llc_pdu_header_init - reserve and start filling the LLC header
 *        @skb: skb the header is pushed onto.
 *        @type: type of PDU (U, I or S); selects the header length.
 *        @ssap: source sap.
 *        @dsap: destination sap.
 *        @cr: command/response bit (0 or 1), OR-ed into the SSAP byte.
 *
 *        Pushes 3 bytes for LLC_PDU_TYPE_U, 6 bytes for LLC_PDU_TYPE_U_XID and
 *        4 bytes for every other type, makes the pushed area the network
 *        header, and stores DSAP and SSAP (with the C/R bit) in it. The
 *        control field is left for the llc_pdu_init_as_*() helpers.
 */
static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type,
                                       u8 ssap, u8 dsap, u8 cr)
{
        struct llc_pdu_un *hdr;
        int len;

        if (type == LLC_PDU_TYPE_U)
                len = 3;
        else if (type == LLC_PDU_TYPE_U_XID)
                len = 6;
        else
                len = 4;        /* I and S types */

        skb_push(skb, len);
        skb_reset_network_header(skb);

        hdr = llc_pdu_un_hdr(skb);
        hdr->dsap = dsap;
        hdr->ssap = ssap | cr;
}

/**
 *        llc_pdu_decode_sa - copy out the source MAC address of a frame
 *        @skb: input skb whose Ethernet header is read.
 *        @sa: destination buffer, at least ETH_ALEN bytes.
 *
 *        Copies ETH_ALEN bytes of h_source from the frame's Ethernet header
 *        into @sa.
 */
static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
{
        const struct ethhdr *eth = eth_hdr(skb);

        memcpy(sa, eth->h_source, ETH_ALEN);
}

/**
 *        llc_pdu_decode_da - copy out the destination MAC address of a frame
 *        @skb: input skb whose Ethernet header is read.
 *        @da: destination buffer, at least ETH_ALEN bytes.
 *
 *        Copies ETH_ALEN bytes of h_dest from the frame's Ethernet header
 *        into @da.
 */
static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da)
{
        const struct ethhdr *eth = eth_hdr(skb);

        memcpy(da, eth->h_dest, ETH_ALEN);
}

/**
 *        llc_pdu_decode_ssap - extract the source SAP of a frame
 *        @skb: input skb whose LLC header is read.
 *        @ssap: source SAP (output argument).
 *
 *        The lowest-order bit of the SSAP byte is the command/response bit;
 *        it is masked off, so @ssap receives the upper seven bits only.
 */
static inline void llc_pdu_decode_ssap(struct sk_buff *skb, u8 *ssap)
{
        const struct llc_pdu_un *hdr = llc_pdu_un_hdr(skb);

        *ssap = hdr->ssap & 0xFE;
}

/**
 *        llc_pdu_decode_dsap - extract the destination SAP of a frame
 *        @skb: input skb whose LLC header is read.
 *        @dsap: destination SAP (output argument).
 *
 *        The lowest-order bit of the DSAP byte designates individual/group
 *        SAP; it is masked off, so @dsap receives the upper seven bits only.
 */
static inline void llc_pdu_decode_dsap(struct sk_buff *skb, u8 *dsap)
{
        const struct llc_pdu_un *hdr = llc_pdu_un_hdr(skb);

        *dsap = hdr->dsap & 0xFE;
}

/**
 *        llc_pdu_init_as_ui_cmd - mark the LLC header as a UI PDU
 *        @skb: skb whose LLC header is updated.
 *
 *        Writes the control byte (third header byte) as an un-numbered PDU
 *        carrying the UI command code.
 */
static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb)
{
        struct llc_pdu_un *hdr = llc_pdu_un_hdr(skb);

        hdr->ctrl_1 = LLC_PDU_TYPE_U | LLC_1_PDU_CMD_UI;
}

/**
 *        llc_pdu_init_as_test_cmd - mark the LLC header as a TEST PDU
 *        @skb: Address of the skb to build
 *
 *        Writes the control byte as an un-numbered TEST PDU with the P/F bit
 *        set.
 */
static inline void llc_pdu_init_as_test_cmd(struct sk_buff *skb)
{
        struct llc_pdu_un *hdr = llc_pdu_un_hdr(skb);

        hdr->ctrl_1 = LLC_PDU_TYPE_U | LLC_1_PDU_CMD_TEST | LLC_U_PF_BIT_MASK;
}

/**
 *        llc_pdu_init_as_test_rsp - build TEST response PDU
 *        @skb: Address of the skb to build
 *        @ev_skb: The received TEST command PDU frame
 *
 *        Builds a pdu frame as a TEST response.
 */
static inline void llc_pdu_init_as_test_rsp(struct sk_buff *skb,
                                            struct sk_buff *ev_skb)
{
        struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);

        pdu->ctrl_1  = LLC_PDU_TYPE_U;
        pdu->ctrl_1 |= LLC_1_PDU_CMD_TEST;
        pdu->ctrl_1 |= LLC_U_PF_BIT_MASK;
        if (ev_skb->protocol == htons(ETH_P_802_2)) {
                struct llc_pdu_un *ev_pdu = llc_pdu_un_hdr(ev_skb);
                int dsize;

                dsize = ntohs(eth_hdr(ev_skb)->h_proto) - 3;
                memcpy(((u8 *)pdu) + 3, ((u8 *)ev_pdu) + 3, dsize);
                skb_put(skb, dsize);
        }
}

/*
 * LLC Type 1 XID command/response information fields format.
 * The XID init helpers place this structure directly after the control
 * byte of an un-numbered PDU.
 */
struct llc_xid_info {
        u8 fmt_id;        /* always 0x81 for LLC */
        u8 type;        /* different if NULL/non-NULL LSAP */
        u8 rw;                /* sender receive window */
} __packed;

/**
 *        llc_pdu_init_as_xid_cmd - fill in control byte and XID information
 *        @skb: input skb that header must be set into it.
 *        @svcs_supported: The class of the LLC (I or II)
 *        @rx_window: The size of the receive window of the LLC
 *
 *        Writes the control byte as an un-numbered XID PDU with the P/F bit
 *        set, then fills the three XID information bytes that follow it:
 *        format identifier, @svcs_supported, and @rx_window shifted left by
 *        one. The skb length is not changed here.
 */
static inline void llc_pdu_init_as_xid_cmd(struct sk_buff *skb,
                                           u8 svcs_supported, u8 rx_window)
{
        struct llc_pdu_un *hdr = llc_pdu_un_hdr(skb);
        struct llc_xid_info *xid;

        hdr->ctrl_1 = LLC_PDU_TYPE_U | LLC_1_PDU_CMD_XID | LLC_U_PF_BIT_MASK;

        /* The information field starts on the byte right after ctrl_1. */
        xid = (struct llc_xid_info *)((u8 *)&hdr->ctrl_1 + 1);
        xid->fmt_id = LLC_XID_FMT_ID;        /* 0x81 */
        xid->type = svcs_supported;
        xid->rw = rx_window << 1;        /* size of receive window */

        /* no need to push/put since llc_pdu_header_init() has already
         * pushed 3 + 3 bytes
         */
}

/**
 *        llc_pdu_init_as_xid_rsp - builds XID response PDU
 *        @skb: Address of the skb to build
 *        @svcs_supported: The class of the LLC (I or II)
 *        @rx_window: The size of the receive window of the LLC
 *
 *        Writes the control byte as an un-numbered XID PDU with the P/F bit
 *        set, fills the three XID information bytes that follow it, and
 *        extends @skb by the size of that information field.
 */
static inline void llc_pdu_init_as_xid_rsp(struct sk_buff *skb,
                                           u8 svcs_supported, u8 rx_window)
{
        struct llc_pdu_un *hdr = llc_pdu_un_hdr(skb);
        struct llc_xid_info *xid;

        hdr->ctrl_1 = LLC_PDU_TYPE_U | LLC_1_PDU_CMD_XID | LLC_U_PF_BIT_MASK;

        /* The information field starts on the byte right after ctrl_1. */
        xid = (struct llc_xid_info *)((u8 *)&hdr->ctrl_1 + 1);
        xid->fmt_id = LLC_XID_FMT_ID;
        xid->type = svcs_supported;
        xid->rw = rx_window << 1;

        skb_put(skb, sizeof(*xid));
}

/*
 * LLC Type 2 FRMR response information field format.
 * The members are filled with the FRMR_INFO_SET_*() macros; the structure
 * is FRMR_INFO_LENGTH (5) bytes long.
 */
struct llc_frmr_info {
        u16 rej_pdu_ctrl;        /* bits 1-8 if U-PDU */
        u8  curr_ssv;                /* current send state variable val */
        u8  curr_rsv;                /* current receive state variable */
        u8  ind_bits;                /* indicator bits set with macro */
} __packed;

/*
 * PDU helpers that are only declared in this header; their definitions are
 * not part of it. The p_bit/f_bit, ns and nr parameter names follow the
 * poll/final bit and N(S)/N(R) fields used by the macros in this header.
 */
void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 type);
void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value);
void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit);
void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit);
void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr);
void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr);
void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr);
void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr);
void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit);
void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit);
void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu,
                              u8 f_bit, u8 vs, u8 vr, u8 vzyxw);
void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr);
void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr);
void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr);
void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit);
#endif /* LLC_PDU_H */




























   78 











    2 




   29 


















   19 



   15 

   16 




















    7 




    8 



    8 


    3 
    8 




















    8 



   10 




   10 





    9 
    3 


   10 











    4 




    5 



    5 



    5 











    5 












    2 
    2 









    1 
    1 
    1 
    1 








   18 

















    8 

















   72 













   12 


   12 
   12 
   12 






   14 

























   14 



    5 


    5 


















    6 




    3 
    2 




   12 
    3 



   14 


    1 


   14 


   13 



    6 

   12 


   14 








   13 
   14 

   14 





    6 
   11 






   13 










   14 




    6 







  102 

  103 












   80 




   75 
    6 










   13 
   14 










   14 
   14 






















    1 


    1 







































    8 
    5 























































    1 









    4 







    4 
    3 


    4 

    4 
    2 

    1 




    3 

    3 





   13 

   14 

   14 


   14 


   13 




    8 








    8 

























    8 










    8 







    8 

    8 
    7 


    6 





    6 







    8 







    8 








    8 








    7 


    8 






    8 


    8 










    8 














    1 




    1 






















































   11 












































































   13 


   14 
   14 
   14 





   19 
    2 















    3 

   14 








    1 







    1 


    1 





















    3 



    3 
    3 



    1 


    2 












    2 













   14 



   12 



   12 














   12 


   12 















   11 
   11 




   11 










    2 


    1 

    1 

    1 




    1 

























   24 


   25 

    1 


    1 


    1 




    1 


    5 



    1 


    2 


    1 


    1 














   11 



   11 


    1 
    1 






    1 










    7 






    4 

    4 







    2 


    2 











    4 






    3 
    1 




    1 


    1 






    1 







    3 
    1 





    2 







   13 



















   13 







   14 
   15 


   14 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 Linaro Ltd.
 * Author: Shannon Zhao <shannon.zhao@linaro.org>
 */

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/perf_event.h>
#include <linux/perf/arm_pmu.h>
#include <linux/uaccess.h>
#include <asm/kvm_emulate.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_vgic.h>

/*
 * NOTE(review): not referenced in the part of the file visible here; going
 * by the name it presumably selects a 64-bit counter through the perf
 * attribute's config1 field - confirm at the point of use.
 */
#define PERF_ATTR_CFG1_COUNTER_64BIT        BIT(0)

/*
 * List checked by kvm_supports_guest_pmuv3(), which holds arm_pmus_lock
 * while testing whether it is empty.
 */
static LIST_HEAD(arm_pmus);
static DEFINE_MUTEX(arm_pmus_lock);

/* Forward declarations for helpers that are used before they are defined. */
static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc);
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc);
static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc);

/*
 * Report whether the arm_pmus list is non-empty. The list is examined with
 * arm_pmus_lock held; the guard releases the lock on return.
 */
bool kvm_supports_guest_pmuv3(void)
{
        bool have_pmu;

        guard(mutex)(&arm_pmus_lock);
        have_pmu = !list_empty(&arm_pmus);

        return have_pmu;
}

/*
 * Recover the vcpu that embeds @pmc.
 *
 * @pmc is treated as element pmc->idx of vcpu->arch.pmu.pmc[], so
 * container_of() is applied with that indexed member. This relies on
 * pmc->idx matching the array slot, which kvm_pmu_vcpu_init() sets up.
 */
static struct kvm_vcpu *kvm_pmc_to_vcpu(const struct kvm_pmc *pmc)
{
        return container_of(pmc, struct kvm_vcpu, arch.pmu.pmc[pmc->idx]);
}

static struct kvm_pmc *kvm_vcpu_idx_to_pmc(struct kvm_vcpu *vcpu, int cnt_idx)
{
        return &vcpu->arch.pmu.pmc[cnt_idx];
}

/*
 * Map a PMU version (ID_AA64DFR0_EL1.PMUVer encoding) to the mask of valid
 * event number bits: bits [9:0] for the base version, bits [15:0] for
 * v3.1, v3.4, v3.5 and v3.7. Any other value warns once and yields 0.
 */
static u32 __kvm_pmu_event_mask(unsigned int pmuver)
{
        if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP)
                return GENMASK(9, 0);

        if (pmuver == ID_AA64DFR0_EL1_PMUVer_V3P1 ||
            pmuver == ID_AA64DFR0_EL1_PMUVer_V3P4 ||
            pmuver == ID_AA64DFR0_EL1_PMUVer_V3P5 ||
            pmuver == ID_AA64DFR0_EL1_PMUVer_V3P7)
                return GENMASK(15, 0);

        /* Shouldn't be here, just for sanity */
        WARN_ONCE(1, "Unknown PMU version %d\n", pmuver);
        return 0;
}

static u32 kvm_pmu_event_mask(struct kvm *kvm)
{
        u64 dfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64DFR0_EL1);
        u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, dfr0);

        return __kvm_pmu_event_mask(pmuver);
}

/*
 * Build the mask of event type register bits for @kvm: the event number
 * bits plus the EL0/EL1 exclude bits, the EL2 include bit when the VM has
 * EL2, and the non-secure/EL3 exclude bits when the VM has EL3.
 */
u64 kvm_pmu_evtyper_mask(struct kvm *kvm)
{
        u64 bits = kvm_pmu_event_mask(kvm);

        bits |= ARMV8_PMU_EXCLUDE_EL1 | ARMV8_PMU_EXCLUDE_EL0;

        if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL2, IMP))
                bits |= ARMV8_PMU_INCLUDE_EL2;

        if (kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP)) {
                bits |= ARMV8_PMU_EXCLUDE_NS_EL0;
                bits |= ARMV8_PMU_EXCLUDE_NS_EL1;
                bits |= ARMV8_PMU_EXCLUDE_EL3;
        }

        return bits;
}

/**
 * kvm_pmc_is_64bit - determine if counter is 64bit
 * @pmc: counter context
 */
static bool kvm_pmc_is_64bit(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);

        return (pmc->idx == ARMV8_PMU_CYCLE_IDX ||
                kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5));
}

static bool kvm_pmc_has_64bit_overflow(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        u64 val = kvm_vcpu_read_pmcr(vcpu);

        if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx))
                return __vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HLP;

        return (pmc->idx < ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LP)) ||
               (pmc->idx == ARMV8_PMU_CYCLE_IDX && (val & ARMV8_PMU_PMCR_LC));
}

static bool kvm_pmu_counter_can_chain(struct kvm_pmc *pmc)
{
        return (!(pmc->idx & 1) && (pmc->idx + 1) < ARMV8_PMU_CYCLE_IDX &&
                !kvm_pmc_has_64bit_overflow(pmc));
}

/*
 * Translate a counter index into its counter register: PMCCNTR_EL0 for the
 * cycle counter, PMEVCNTR0_EL0 offset by the index otherwise.
 */
static u32 counter_index_to_reg(u64 idx)
{
        if (idx == ARMV8_PMU_CYCLE_IDX)
                return PMCCNTR_EL0;

        return PMEVCNTR0_EL0 + idx;
}

/*
 * Translate a counter index into its event type register: PMCCFILTR_EL0
 * for the cycle counter, PMEVTYPER0_EL0 offset by the index otherwise.
 */
static u32 counter_index_to_evtreg(u64 idx)
{
        if (idx == ARMV8_PMU_CYCLE_IDX)
                return PMCCFILTR_EL0;

        return PMEVTYPER0_EL0 + idx;
}

static u64 kvm_pmc_read_evtreg(const struct kvm_pmc *pmc)
{
        return __vcpu_sys_reg(kvm_pmc_to_vcpu(pmc), counter_index_to_evtreg(pmc->idx));
}

/*
 * Current value of @pmc: the saved counter register plus, when a perf event
 * is attached, whatever that event has counted. The sum is truncated to
 * 32 bits unless the counter is 64bit.
 */
static u64 kvm_pmu_get_pmc_value(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        u64 reg = counter_index_to_reg(pmc->idx);
        u64 total = __vcpu_sys_reg(vcpu, reg);
        u64 enabled, running;

        if (pmc->perf_event)
                total += perf_event_read_value(pmc->perf_event, &enabled,
                                               &running);

        if (kvm_pmc_is_64bit(pmc))
                return total;

        return lower_32_bits(total);
}

/**
 * kvm_pmu_get_counter_value - get PMU counter value
 * @vcpu: The vcpu pointer
 * @select_idx: The counter index
 *
 * Return: the current value of counter @select_idx of @vcpu.
 */
u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
{
        struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);

        return kvm_pmu_get_pmc_value(pmc);
}

/*
 * Write @val to the counter register behind @pmc.
 *
 * The attached perf event (if any) is released first and a new one is
 * created afterwards so that it reflects the updated counter value.
 *
 * @force: store @val as is, even for an AArch32 guest (used to reset the
 *         counter through PMCR.P).
 */
static void kvm_pmu_set_pmc_value(struct kvm_pmc *pmc, u64 val, bool force)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        u64 reg;

        kvm_pmu_release_perf_event(pmc);

        reg = counter_index_to_reg(pmc->idx);

        if (vcpu_mode_is_32bit(vcpu) && pmc->idx != ARMV8_PMU_CYCLE_IDX &&
            !force) {
                /*
                 * Even with PMUv3p5, AArch32 cannot write to the top
                 * 32bit of the counters. The only possible course of
                 * action is to use PMCR.P, which will reset them to
                 * 0 (the only use of the 'force' parameter).
                 *
                 * Keep the current top half and take the bottom half
                 * from the value being written. Both halves must be
                 * combined in one expression: assigning the old top half
                 * to 'val' first would throw away the written low bits.
                 */
                val = (__vcpu_sys_reg(vcpu, reg) & GENMASK(63, 32)) |
                      lower_32_bits(val);
        }

        __vcpu_sys_reg(vcpu, reg) = val;

        /* Recreate the perf event to reflect the updated sample_period */
        kvm_pmu_create_perf_event(pmc);
}

/**
 * kvm_pmu_set_counter_value - set PMU counter value
 * @vcpu: The vcpu pointer
 * @select_idx: The counter index
 * @val: The counter value
 *
 * Non-forced write: the AArch32 restriction applied by
 * kvm_pmu_set_pmc_value() stays in effect.
 */
void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
        struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);

        kvm_pmu_set_pmc_value(pmc, val, false);
}

/**
 * kvm_pmu_set_counter_value_user - set PMU counter value from user
 * @vcpu: The vcpu pointer
 * @select_idx: The counter index
 * @val: The counter value
 *
 * Drops the counter's perf event, stores @val in the counter register
 * unmodified and requests a PMU reload for the vcpu instead of recreating
 * the event right away.
 */
void kvm_pmu_set_counter_value_user(struct kvm_vcpu *vcpu, u64 select_idx, u64 val)
{
        struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
        u32 reg = counter_index_to_reg(select_idx);

        kvm_pmu_release_perf_event(pmc);
        __vcpu_sys_reg(vcpu, reg) = val;
        kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);
}

/**
 * kvm_pmu_release_perf_event - remove the perf event
 * @pmc: The PMU counter pointer
 *
 * Does nothing when no event is attached. Otherwise the event is disabled,
 * released, and the counter's pointer to it is cleared.
 */
static void kvm_pmu_release_perf_event(struct kvm_pmc *pmc)
{
        if (!pmc->perf_event)
                return;

        perf_event_disable(pmc->perf_event);
        perf_event_release_kernel(pmc->perf_event);
        pmc->perf_event = NULL;
}

/**
 * kvm_pmu_stop_counter - stop PMU counter
 * @pmc: The PMU counter pointer
 *
 * If a perf event is attached, fold its count into the saved counter
 * register and release the event. Without an event there is nothing to do.
 */
static void kvm_pmu_stop_counter(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        u64 snapshot;

        if (!pmc->perf_event)
                return;

        /* Sample the value while the event still exists. */
        snapshot = kvm_pmu_get_pmc_value(pmc);
        __vcpu_sys_reg(vcpu, counter_index_to_reg(pmc->idx)) = snapshot;

        kvm_pmu_release_perf_event(pmc);
}

/**
 * kvm_pmu_vcpu_init - assign pmu counter idx for cpu
 * @vcpu: The vcpu pointer
 *
 * Stores each counter's own array position in its idx field, for all
 * KVM_ARMV8_PMU_MAX_COUNTERS slots.
 */
void kvm_pmu_vcpu_init(struct kvm_vcpu *vcpu)
{
        int slot;

        for (slot = 0; slot < KVM_ARMV8_PMU_MAX_COUNTERS; slot++)
                vcpu->arch.pmu.pmc[slot].idx = slot;
}

/**
 * kvm_pmu_vcpu_destroy - free perf event of PMU for cpu
 * @vcpu: The vcpu pointer
 *
 * Releases the perf event of every counter slot, then waits for the
 * overflow irq_work to finish.
 */
void kvm_pmu_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        int slot = 0;

        while (slot < KVM_ARMV8_PMU_MAX_COUNTERS) {
                kvm_pmu_release_perf_event(kvm_vcpu_idx_to_pmc(vcpu, slot));
                slot++;
        }

        irq_work_sync(&vcpu->arch.pmu.overflow_work);
}

/*
 * Mask of the counters reserved for EL2: indices from MDCR_EL2.HPMN up to
 * the VM's last counter. Empty when the vcpu has no nested virt support.
 */
static u64 kvm_pmu_hyp_counter_mask(struct kvm_vcpu *vcpu)
{
        unsigned int first_hyp, nr_counters;
        u64 mdcr;

        if (!vcpu_has_nv(vcpu))
                return 0;

        mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);
        first_hyp = SYS_FIELD_GET(MDCR_EL2, HPMN, mdcr);
        nr_counters = vcpu->kvm->arch.nr_pmu_counters;

        /*
         * Programming HPMN to a value greater than PMCR_EL0.N is
         * CONSTRAINED UNPREDICTABLE. Make the implementation choice that an
         * UNKNOWN number of counters (in our case, zero) are reserved for EL2.
         */
        if (first_hyp >= nr_counters)
                return 0;

        /*
         * Programming HPMN=0 is CONSTRAINED UNPREDICTABLE if FEAT_HPMN0 isn't
         * implemented. Since KVM's ability to emulate HPMN=0 does not directly
         * depend on hardware (all PMU registers are trapped), make the
         * implementation choice that all counters are included in the second
         * range reserved for EL2/EL3.
         */
        return GENMASK(nr_counters - 1, first_hyp);
}

bool kvm_pmu_counter_is_hyp(struct kvm_vcpu *vcpu, unsigned int idx)
{
        return kvm_pmu_hyp_counter_mask(vcpu) & BIT(idx);
}

/*
 * Counters the vcpu may access in its current context: every implemented
 * counter, minus the EL2-reserved ones when a nested-virt vcpu is not
 * running at EL2.
 */
u64 kvm_pmu_accessible_counter_mask(struct kvm_vcpu *vcpu)
{
        u64 counters = kvm_pmu_implemented_counter_mask(vcpu);

        if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu))
                counters &= ~kvm_pmu_hyp_counter_mask(vcpu);

        return counters;
}

/*
 * Mask of implemented counters: the cycle counter is always present, and
 * the low PMCR.N bits stand for the event counters.
 */
u64 kvm_pmu_implemented_counter_mask(struct kvm_vcpu *vcpu)
{
        u64 nr_events = FIELD_GET(ARMV8_PMU_PMCR_N, kvm_vcpu_read_pmcr(vcpu));
        u64 mask = BIT(ARMV8_PMU_CYCLE_IDX);

        /* GENMASK(-1, 0) must be avoided when there are no event counters. */
        if (nr_events != 0)
                mask |= GENMASK(nr_events - 1, 0);

        return mask;
}

static void kvm_pmc_enable_perf_event(struct kvm_pmc *pmc)
{
        if (!pmc->perf_event) {
                kvm_pmu_create_perf_event(pmc);
                return;
        }

        perf_event_enable(pmc->perf_event);
        if (pmc->perf_event->state != PERF_EVENT_STATE_ACTIVE)
                kvm_debug("fail to enable perf event\n");
}

/* Disable the perf event backing @pmc, if there is one. */
static void kvm_pmc_disable_perf_event(struct kvm_pmc *pmc)
{
        if (!pmc->perf_event)
                return;

        perf_event_disable(pmc->perf_event);
}

/*
 * For every counter selected in @val, enable or disable its perf event
 * according to whether the counter is currently enabled, then restore the
 * guest PMU state. An empty mask is a no-op.
 */
void kvm_pmu_reprogram_counter_mask(struct kvm_vcpu *vcpu, u64 val)
{
        int idx;

        if (val == 0)
                return;

        for (idx = 0; idx < KVM_ARMV8_PMU_MAX_COUNTERS; idx++) {
                struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, idx);

                if (val & BIT(idx)) {
                        if (kvm_pmu_counter_is_enabled(pmc))
                                kvm_pmc_enable_perf_event(pmc);
                        else
                                kvm_pmc_disable_perf_event(pmc);
                }
        }

        kvm_vcpu_pmu_restore_guest(vcpu);
}

/*
 * Compute the PMU overflow state: true if some counter has both its
 * PMOVSSET_EL0 and PMINTENSET_EL1 bits set while the global enable control
 * that governs it is set.
 */
static bool kvm_pmu_overflow_status(struct kvm_vcpu *vcpu)
{
        u64 pending;

        pending  = __vcpu_sys_reg(vcpu, PMOVSSET_EL0);
        pending &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1);

        /*
         * PMCR_EL0.E is the global enable control for event counters available
         * to EL0 and EL1; with it clear only the EL2 range can contribute.
         */
        if ((kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E) == 0)
                pending &= kvm_pmu_hyp_counter_mask(vcpu);

        /*
         * MDCR_EL2.HPME is the global enable control for event counters
         * reserved for EL2; with it clear that range is dropped.
         */
        if ((vcpu_read_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPME) == 0)
                pending &= ~kvm_pmu_hyp_counter_mask(vcpu);

        return pending != 0;
}

/*
 * Recompute the overflow state and, if it changed, record it as the PMU
 * interrupt level. With an in-kernel irqchip the new level is also injected
 * through the vgic.
 */
static void kvm_pmu_update_state(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu *pmu = &vcpu->arch.pmu;
        bool level = kvm_pmu_overflow_status(vcpu);
        int err;

        if (level == pmu->irq_level)
                return;

        pmu->irq_level = level;

        if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
                return;

        err = kvm_vgic_inject_irq(vcpu->kvm, vcpu, pmu->irq_num, level, pmu);
        WARN_ON(err);
}

/*
 * Without an in-kernel irqchip, report whether the PMU interrupt level
 * differs from the PMU bit last published in the kvm_run structure. Always
 * false when the irqchip is in the kernel.
 */
bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu)
{
        struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
        bool published = sregs->device_irq_level & KVM_ARM_DEV_PMU;

        if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
                return vcpu->arch.pmu.irq_level != published;

        return false;
}

/*
 * Reflect the PMU overflow interrupt output level into the kvm_run structure
 */
void kvm_pmu_update_run(struct kvm_vcpu *vcpu)
{
        struct kvm_sync_regs *regs = &vcpu->run->s.regs;

        /* Publish the PMU bit of the device interrupt bitmap to user space */
        if (vcpu->arch.pmu.irq_level)
                regs->device_irq_level |= KVM_ARM_DEV_PMU;
        else
                regs->device_irq_level &= ~KVM_ARM_DEV_PMU;
}

/**
 * kvm_pmu_flush_hwstate - flush pmu state to cpu
 * @vcpu: The vcpu pointer
 *
 * Re-evaluates the overflow state so that an overflow raised while running
 * in the host results in an interrupt being injected.
 */
void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu)
{
        /* Overflow may have been raised while we were in the host. */
        kvm_pmu_update_state(vcpu);
}

/**
 * kvm_pmu_sync_hwstate - sync pmu state from cpu
 * @vcpu: The vcpu pointer
 *
 * Re-evaluates the overflow state so that an overflow raised while running
 * in the guest results in an interrupt being injected.
 */
void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu)
{
        /* Overflow may have been raised while we were in the guest. */
        kvm_pmu_update_state(vcpu);
}

/*
 * irq_work callback used to kick a vcpu after a perf overflow.
 *
 * When the perf interrupt is delivered as an NMI the vcpu cannot be notified
 * safely from the overflow handler itself, so the kick is deferred to this
 * callback, which runs outside of NMI context.
 */
static void kvm_pmu_perf_overflow_notify_vcpu(struct irq_work *work)
{
        struct kvm_vcpu *vcpu = container_of(work, struct kvm_vcpu,
                                             arch.pmu.overflow_work);

        kvm_vcpu_kick(vcpu);
}

/*
 * Bump by one every event counter selected by @mask that is enabled and
 * programmed with @event. A counter that wraps gets its overflow bit set in
 * PMOVSSET_EL0 and, when it can chain, the increment is propagated to the
 * next counter as a CHAIN event.
 *
 * Nothing happens while PMCR_EL0.E is clear.
 */
static void kvm_pmu_counter_increment(struct kvm_vcpu *vcpu,
                                      unsigned long mask, u32 event)
{
        int idx;

        if (!(kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E))
                return;

        /* Only counters enabled in PMCNTENSET_EL0 are candidates */
        mask &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);

        for_each_set_bit(idx, &mask, ARMV8_PMU_CYCLE_IDX) {
                struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, idx);
                u64 evtype, count;
                bool wrapped;

                /* Skip counters programmed with a different event */
                evtype = __vcpu_sys_reg(vcpu, counter_index_to_evtreg(idx)) &
                         kvm_pmu_event_mask(vcpu->kvm);
                if (evtype != event)
                        continue;

                /* Add one, truncating for counters that are not 64bit */
                count = __vcpu_sys_reg(vcpu, counter_index_to_reg(idx)) + 1;
                if (!kvm_pmc_is_64bit(pmc))
                        count = lower_32_bits(count);
                __vcpu_sys_reg(vcpu, counter_index_to_reg(idx)) = count;

                /* The counter wrapped iff the relevant width reads as zero */
                if (kvm_pmc_has_64bit_overflow(pmc))
                        wrapped = !count;
                else
                        wrapped = !lower_32_bits(count);

                if (!wrapped)
                        continue;

                /* Record the overflow */
                __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);

                if (kvm_pmu_counter_can_chain(pmc))
                        kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
                                                  ARMV8_PMUV3_PERFCTR_CHAIN);
        }
}

/*
 * Return the sample period for a counter currently holding @counter: the
 * negated counter value, truncated to 64 bits when the counter is 64bit and
 * overflows at 64 bits, and to 32 bits otherwise.
 */
static u64 compute_period(struct kvm_pmc *pmc, u64 counter)
{
        u64 width_mask = GENMASK(31, 0);

        if (kvm_pmc_is_64bit(pmc) && kvm_pmc_has_64bit_overflow(pmc))
                width_mask = GENMASK(63, 0);

        return (-counter) & width_mask;
}

/*
 * When the perf event overflows, set the overflow status and inform the vcpu.
 *
 * This is the overflow handler registered with the perf events created by
 * kvm_pmu_create_perf_event(); the handler context is the struct kvm_pmc
 * backing the event. @data and @regs are not used.
 */
static void kvm_pmu_perf_overflow(struct perf_event *perf_event,
                                  struct perf_sample_data *data,
                                  struct pt_regs *regs)
{
        struct kvm_pmc *pmc = perf_event->overflow_handler_context;
        struct arm_pmu *cpu_pmu = to_arm_pmu(perf_event->pmu);
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        int idx = pmc->idx;
        u64 period;

        /* Stop the event (with PERF_EF_UPDATE) while it is reconfigured */
        cpu_pmu->pmu.stop(perf_event, PERF_EF_UPDATE);

        /*
         * Reset the sample period to the architectural limit,
         * i.e. the point where the counter overflows.
         */
        period = compute_period(pmc, local64_read(&perf_event->count));

        local64_set(&perf_event->hw.period_left, 0);
        perf_event->attr.sample_period = period;
        perf_event->hw.sample_period = period;

        /* Latch the overflow for this counter in the guest-visible state */
        __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(idx);

        /* Propagate the overflow to the next counter as a CHAIN event */
        if (kvm_pmu_counter_can_chain(pmc))
                kvm_pmu_counter_increment(vcpu, BIT(idx + 1),
                                          ARMV8_PMUV3_PERFCTR_CHAIN);

        if (kvm_pmu_overflow_status(vcpu)) {
                kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);

                /*
                 * Kick directly when possible; from NMI context defer the
                 * kick to the overflow_work irq_work instead.
                 */
                if (!in_nmi())
                        kvm_vcpu_kick(vcpu);
                else
                        irq_work_queue(&vcpu->arch.pmu.overflow_work);
        }

        /* Restart the event (with PERF_EF_RELOAD) now that the period is set */
        cpu_pmu->pmu.start(perf_event, PERF_EF_RELOAD);
}

/**
 * kvm_pmu_software_increment - do software increment
 * @vcpu: The vcpu pointer
 * @val: the value guest writes to PMSWINC register
 *
 * @val is used as the mask of counters to consider; only those enabled and
 * programmed with the SW_INCR event are incremented, see
 * kvm_pmu_counter_increment().
 */
void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
{
        kvm_pmu_counter_increment(vcpu, val, ARMV8_PMUV3_PERFCTR_SW_INCR);
}

/**
 * kvm_pmu_handle_pmcr - handle PMCR register
 * @vcpu: The vcpu pointer
 * @val: the value guest writes to PMCR register
 *
 * Stores the sanitised value in PMCR_EL0, requests a PMU reload when the
 * E bit changes, and performs the cycle counter (C) and event counter (P)
 * resets requested by the write.
 */
void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val)
{
        const u64 reset_bits = ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_P;
        unsigned long to_reset;
        int idx;

        /* LP is dropped unless the guest's PMU version is at least v3p5 */
        if (!kvm_has_feat(vcpu->kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
                val &= ~ARMV8_PMU_PMCR_LP;

        /* A change of the E bit requires the counters to be reloaded */
        if ((val ^ __vcpu_sys_reg(vcpu, PMCR_EL0)) & ARMV8_PMU_PMCR_E)
                kvm_make_request(KVM_REQ_RELOAD_PMU, vcpu);

        /* C and P only trigger resets; they are never saved as state */
        __vcpu_sys_reg(vcpu, PMCR_EL0) = val & ~reset_bits;

        if (val & ARMV8_PMU_PMCR_C)
                kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0);

        if (!(val & ARMV8_PMU_PMCR_P))
                return;

        /* P resets the implemented event counters, not the cycle counter */
        to_reset = kvm_pmu_implemented_counter_mask(vcpu) &
                   ~BIT(ARMV8_PMU_CYCLE_IDX);

        /* Outside of EL2, leave the hyp counters alone */
        if (!vcpu_is_el2(vcpu))
                to_reset &= ~kvm_pmu_hyp_counter_mask(vcpu);

        for_each_set_bit(idx, &to_reset, 32)
                kvm_pmu_set_pmc_value(kvm_vcpu_idx_to_pmc(vcpu, idx), 0, true);
}

static bool kvm_pmu_counter_is_enabled(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        unsigned int mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2);

        if (!(__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & BIT(pmc->idx)))
                return false;

        if (kvm_pmu_counter_is_hyp(vcpu, pmc->idx))
                return mdcr & MDCR_EL2_HPME;

        return kvm_vcpu_read_pmcr(vcpu) & ARMV8_PMU_PMCR_E;
}

/*
 * The counter counts at EL0 when the EXCLUDE_EL0 and EXCLUDE_NS_EL0 filter
 * bits of its event type register hold the same value.
 */
static bool kvm_pmc_counts_at_el0(struct kvm_pmc *pmc)
{
        u64 evtyper = kvm_pmc_read_evtreg(pmc);
        bool excl_el0 = !!(evtyper & ARMV8_PMU_EXCLUDE_EL0);
        bool excl_ns_el0 = !!(evtyper & ARMV8_PMU_EXCLUDE_NS_EL0);

        return !(excl_el0 ^ excl_ns_el0);
}

/*
 * The counter counts at EL1 when the EXCLUDE_EL1 and EXCLUDE_NS_EL1 filter
 * bits of its event type register hold the same value.
 */
static bool kvm_pmc_counts_at_el1(struct kvm_pmc *pmc)
{
        u64 evtyper = kvm_pmc_read_evtreg(pmc);
        bool excl_el1 = !!(evtyper & ARMV8_PMU_EXCLUDE_EL1);
        bool excl_ns_el1 = !!(evtyper & ARMV8_PMU_EXCLUDE_NS_EL1);

        return !(excl_el1 ^ excl_ns_el1);
}

/*
 * The counter counts at EL2 when its event type register has INCLUDE_EL2
 * set, unless it is a non-hyp counter and MDCR_EL2.HPMD is set.
 */
static bool kvm_pmc_counts_at_el2(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        bool hpmd = !!(__vcpu_sys_reg(vcpu, MDCR_EL2) & MDCR_EL2_HPMD);

        if (!kvm_pmu_counter_is_hyp(vcpu, pmc->idx) && hpmd)
                return false;

        return !!(kvm_pmc_read_evtreg(pmc) & ARMV8_PMU_INCLUDE_EL2);
}

/*
 * Translate the guest's PMUv3 event ID @eventsel into the event number to
 * hand to the host PMU driver. The result of the driver's translation hook
 * is returned as-is (the caller treats a negative value as "no mapping").
 */
static int kvm_map_pmu_event(struct kvm *kvm, unsigned int eventsel)
{
        struct arm_pmu *host_pmu = kvm->arch.arm_pmu;

        /* Common case: no translation hook, the event ID is used unchanged */
        if (likely(!host_pmu->map_pmuv3_event))
                return eventsel;

        /*
         * The CPU PMU likely isn't PMUv3; let the driver provide a mapping
         * for the guest's PMUv3 event ID.
         */
        return host_pmu->map_pmuv3_event(eventsel);
}

/**
 * kvm_pmu_create_perf_event - create a perf event for a counter
 * @pmc: Counter context
 *
 * Stops the counter, then (re)creates the host perf event backing it from the
 * counter's current event type register and value. No perf event is created
 * for SW_INCR/CHAIN events, for events rejected by the VM's event filter, or
 * when the host PMU driver cannot map the event. On success the new event is
 * stored in @pmc->perf_event; a creation failure is logged once and otherwise
 * ignored.
 */
static void kvm_pmu_create_perf_event(struct kvm_pmc *pmc)
{
        struct kvm_vcpu *vcpu = kvm_pmc_to_vcpu(pmc);
        struct arm_pmu *arm_pmu = vcpu->kvm->arch.arm_pmu;
        struct perf_event *event;
        struct perf_event_attr attr;
        int eventsel;
        u64 evtreg;

        evtreg = kvm_pmc_read_evtreg(pmc);

        kvm_pmu_stop_counter(pmc);
        /* The cycle counter always counts CPU cycles, whatever evtreg says */
        if (pmc->idx == ARMV8_PMU_CYCLE_IDX)
                eventsel = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
        else
                eventsel = evtreg & kvm_pmu_event_mask(vcpu->kvm);

        /*
         * Neither SW increment nor chained events need to be backed
         * by a perf event.
         */
        if (eventsel == ARMV8_PMUV3_PERFCTR_SW_INCR ||
            eventsel == ARMV8_PMUV3_PERFCTR_CHAIN)
                return;

        /*
         * If we have a filter in place and that the event isn't allowed, do
         * not install a perf event either.
         */
        if (vcpu->kvm->arch.pmu_filter &&
            !test_bit(eventsel, vcpu->kvm->arch.pmu_filter))
                return;

        /*
         * Don't create an event if we're running on hardware that requires
         * PMUv3 event translation and we couldn't find a valid mapping.
         */
        eventsel = kvm_map_pmu_event(vcpu->kvm, eventsel);
        if (eventsel < 0)
                return;

        memset(&attr, 0, sizeof(struct perf_event_attr));
        attr.type = arm_pmu->pmu.type;
        attr.size = sizeof(attr);
        attr.pinned = 1;
        attr.disabled = !kvm_pmu_counter_is_enabled(pmc);
        attr.exclude_user = !kvm_pmc_counts_at_el0(pmc);
        attr.exclude_hv = 1; /* Don't count EL2 events */
        attr.exclude_host = 1; /* Don't count host events */
        attr.config = eventsel;

        /*
         * Filter events at EL1 (i.e. vEL2) when in a hyp context based on the
         * guest's EL2 filter.
         */
        if (unlikely(is_hyp_ctxt(vcpu)))
                attr.exclude_kernel = !kvm_pmc_counts_at_el2(pmc);
        else
                attr.exclude_kernel = !kvm_pmc_counts_at_el1(pmc);

        /*
         * If counting with a 64bit counter, advertise it to the perf
         * code, carefully dealing with the initial sample period
         * which also depends on the overflow.
         */
        if (kvm_pmc_is_64bit(pmc))
                attr.config1 |= PERF_ATTR_CFG1_COUNTER_64BIT;

        /* First overflow is due when the current counter value wraps */
        attr.sample_period = compute_period(pmc, kvm_pmu_get_pmc_value(pmc));

        /*
         * NOTE(review): the -1/current arguments presumably mean "any CPU,
         * bound to the current task" - confirm against the perf API.
         */
        event = perf_event_create_kernel_counter(&attr, -1, current,
                                                 kvm_pmu_perf_overflow, pmc);

        if (IS_ERR(event)) {
                pr_err_once("kvm: pmu event creation failed %ld\n",
                            PTR_ERR(event));
                return;
        }

        pmc->perf_event = event;
}

/**
 * kvm_pmu_set_counter_event_type - set selected counter to monitor some event
 * @vcpu: The vcpu pointer
 * @data: The data guest writes to PMXEVTYPER_EL0
 * @select_idx: The number of selected counter
 *
 * A guest write to PMXEVTYPER_EL0 asks for a counter to count a given event.
 * Record the (masked) event type in the counter's event type register, then
 * rebuild the kernel perf event that emulates the counter.
 */
void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data,
                                    u64 select_idx)
{
        struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, select_idx);
        u64 evtyper = data & kvm_pmu_evtyper_mask(vcpu->kvm);

        __vcpu_sys_reg(vcpu, counter_index_to_evtreg(pmc->idx)) = evtyper;

        kvm_pmu_create_perf_event(pmc);
}

/*
 * Record a host PMU instance on the arm_pmus list, under arm_pmus_lock.
 * Does nothing when the system-wide PMU version limit is not PMUv3, and
 * silently gives up when the list entry cannot be allocated.
 */
void kvm_host_pmu_init(struct arm_pmu *pmu)
{
        struct arm_pmu_entry *node;

        /*
         * KVM does not support PMUv3 being present on only a subset of the
         * CPUs, hence the check against the sanitised system-wide version.
         */
        if (!pmuv3_implemented(kvm_arm_pmu_get_pmuver_limit()))
                return;

        guard(mutex)(&arm_pmus_lock);

        node = kmalloc(sizeof(*node), GFP_KERNEL);
        if (node) {
                node->arm_pmu = pmu;
                list_add_tail(&node->entry, &arm_pmus);
        }
}

static struct arm_pmu *kvm_pmu_probe_armpmu(void)
{
        struct arm_pmu_entry *entry;
        struct arm_pmu *pmu;
        int cpu;

        guard(mutex)(&arm_pmus_lock);

        /*
         * It is safe to use a stale cpu to iterate the list of PMUs so long as
         * the same value is used for the entirety of the loop. Given this, and
         * the fact that no percpu data is used for the lookup there is no need
         * to disable preemption.
         *
         * It is still necessary to get a valid cpu, though, to probe for the
         * default PMU instance as userspace is not required to specify a PMU
         * type. In order to uphold the preexisting behavior KVM selects the
         * PMU instance for the core during vcpu init. A dependent use
         * case would be a user with disdain of all things big.LITTLE that
         * affines the VMM to a particular cluster of cores.
         *
         * In any case, userspace should just do the sane thing and use the UAPI
         * to select a PMU type directly. But, be wary of the baggage being
         * carried here.
         */
        cpu = raw_smp_processor_id();
        list_for_each_entry(entry, &arm_pmus, entry) {
                pmu = entry->arm_pmu;

                if (cpumask_test_cpu(cpu, &pmu->supported_cpus))
                        return pmu;
        }

        return NULL;
}

/*
 * Build a 64bit PMCEID value from the host PMU's event bitmaps: the low 32
 * bits come from pmceid_bitmap and the high 32 bits from pmceid_ext_bitmap.
 * @pmceid1 selects the second 32bit word of each bitmap instead of the first.
 */
static u64 __compute_pmceid(struct arm_pmu *pmu, bool pmceid1)
{
        u32 common[2], extended[2];
        unsigned int word = pmceid1 ? 1 : 0;

        bitmap_to_arr32(common, pmu->pmceid_bitmap,
                        ARMV8_PMUV3_MAX_COMMON_EVENTS);
        bitmap_to_arr32(extended, pmu->pmceid_ext_bitmap,
                        ARMV8_PMUV3_MAX_COMMON_EVENTS);

        return common[word] | ((u64)extended[word] << 32);
}

/*
 * PMCEID0 as seen by the guest: the host's view, with SW_INCR and CHAIN
 * always advertised.
 */
static u64 compute_pmceid0(struct arm_pmu *pmu)
{
        return __compute_pmceid(pmu, 0) |
               BIT(ARMV8_PMUV3_PERFCTR_SW_INCR) |
               BIT(ARMV8_PMUV3_PERFCTR_CHAIN);
}

/*
 * PMCEID1 as seen by the guest: the host's view, minus the STALL_SLOT*
 * events, which are not advertised because PMMIR_EL0 is handled as RAZ.
 */
static u64 compute_pmceid1(struct arm_pmu *pmu)
{
        const u64 stall_slot_events =
                BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT - 32) |
                BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND - 32) |
                BIT_ULL(ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND - 32);

        return __compute_pmceid(pmu, 1) & ~stall_slot_events;
}

/*
 * Return the PMCEID0 (@pmceid1 false) or PMCEID1 (@pmceid1 true) value for
 * the guest, restricted to the events allowed by the VM's event filter when
 * one is installed.
 */
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *filter = kvm->arch.pmu_filter;
        int base = pmceid1 ? 32 : 0;
        u64 val, allowed = 0;
        int nr_events, shift;

        val = pmceid1 ? compute_pmceid1(kvm->arch.arm_pmu) :
                        compute_pmceid0(kvm->arch.arm_pmu);

        /* Without a filter every advertised event is allowed */
        if (!filter)
                return val;

        nr_events = kvm_pmu_event_mask(kvm) + 1;

        /* Assemble the allow mask from the filter bitmap, a byte at a time */
        for (shift = 0; shift < 32; shift += 8) {
                u64 byte = bitmap_get_value8(filter, base + shift);

                allowed |= byte << shift;

                /* The events from 0x4000 only exist with a large event space */
                if (nr_events < (0x4000 + base + 32))
                        continue;

                byte = bitmap_get_value8(filter, 0x4000 + base + shift);
                allowed |= byte << (32 + shift);
        }

        return val & allowed;
}

/*
 * Restrict the overflow, interrupt-enable and counter-enable state to the
 * implemented counters, then reprogram those counters.
 */
void kvm_vcpu_reload_pmu(struct kvm_vcpu *vcpu)
{
        u64 implemented = kvm_pmu_implemented_counter_mask(vcpu);

        __vcpu_sys_reg(vcpu, PMOVSSET_EL0) =
                __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & implemented;
        __vcpu_sys_reg(vcpu, PMINTENSET_EL1) =
                __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & implemented;
        __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) =
                __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & implemented;

        kvm_pmu_reprogram_counter_mask(vcpu, implemented);
}

/*
 * Validate the PMU interrupt configuration of @vcpu.
 *
 * A valid configuration is either an in-kernel irqchip together with a
 * properly configured interrupt number, or no in-kernel GIC and no IRQ set.
 *
 * Returns 0 when the configuration is valid, -EINVAL when the PMU has not
 * been created or the configuration is invalid.
 */
int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu)
{
        int irq;

        if (!vcpu->arch.pmu.created)
                return -EINVAL;

        /* Userspace irqchip: an IRQ number must not have been configured */
        if (!irqchip_in_kernel(vcpu->kvm))
                return kvm_arm_pmu_irq_initialized(vcpu) ? -EINVAL : 0;

        /*
         * With an in-kernel vgic, the vgic is known to be initialized at this
         * point, so the PMU irq number can be checked against its dimensions.
         */
        irq = vcpu->arch.pmu.irq_num;
        if (!irq_is_ppi(irq) && !vgic_valid_spi(vcpu->kvm, irq))
                return -EINVAL;

        return 0;
}

static int kvm_arm_pmu_v3_init(struct kvm_vcpu *vcpu)
{
        if (irqchip_in_kernel(vcpu->kvm)) {
                int ret;

                /*
                 * If using the PMU with an in-kernel virtual GIC
                 * implementation, we require the GIC to be already
                 * initialized when initializing the PMU.
                 */
                if (!vgic_initialized(vcpu->kvm))
                        return -ENODEV;

                if (!kvm_arm_pmu_irq_initialized(vcpu))
                        return -ENXIO;

                ret = kvm_vgic_set_owner(vcpu, vcpu->arch.pmu.irq_num,
                                         &vcpu->arch.pmu);
                if (ret)
                        return ret;
        }

        init_irq_work(&vcpu->arch.pmu.overflow_work,
                      kvm_pmu_perf_overflow_notify_vcpu);

        vcpu->arch.pmu.created = true;
        return 0;
}

/*
 * Check @irq against the PMU interrupts already configured in the VM.
 *
 * Within one VM the interrupt type must be the same for each vcpu. A PPI
 * must use the same number on all vcpus, whereas an SPI must use a distinct
 * number per vcpu. Vcpus without a configured PMU interrupt are ignored.
 */
static bool pmu_irq_is_valid(struct kvm *kvm, int irq)
{
        bool ppi = irq_is_ppi(irq);
        struct kvm_vcpu *vcpu;
        unsigned long idx;

        kvm_for_each_vcpu(idx, vcpu, kvm) {
                bool same_irq;

                if (!kvm_arm_pmu_irq_initialized(vcpu))
                        continue;

                same_irq = (vcpu->arch.pmu.irq_num == irq);

                /* PPIs must match the existing number, SPIs must not */
                if (ppi != same_irq)
                        return false;
        }

        return true;
}

/**
 * kvm_arm_pmu_get_max_counters - Return the max number of PMU counters.
 * @kvm: The kvm pointer
 *
 * Returns 1 on systems with the ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS
 * capability, otherwise the number of general-purpose counters of the VM's
 * host PMU.
 */
u8 kvm_arm_pmu_get_max_counters(struct kvm *kvm)
{
        struct arm_pmu *host_pmu = kvm->arch.arm_pmu;

        /*
         * PMUv3 guarantees that any event counter can count any event; that
         * may not hold for non-PMUv3 hardware.
         */
        if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
                return 1;

        /*
         * cntr_mask also covers the fixed counter(s); only weigh the
         * general-purpose ones.
         */
        return bitmap_weight(host_pmu->cntr_mask,
                             ARMV8_PMU_MAX_GENERAL_COUNTERS);
}

/*
 * Record the number of PMU counters for the VM and, for VMs with the EL2
 * vcpu feature, rewrite MDCR_EL2.HPMN of every vcpu to match.
 */
static void kvm_arm_set_nr_counters(struct kvm *kvm, unsigned int nr)
{
        struct kvm_vcpu *vcpu;
        unsigned long idx;

        kvm->arch.nr_pmu_counters = nr;

        if (!test_bit(KVM_ARM_VCPU_HAS_EL2, kvm->arch.vcpu_features))
                return;

        /* Reset MDCR_EL2.HPMN behind the vcpus' back... */
        kvm_for_each_vcpu(idx, vcpu, kvm) {
                u64 mdcr = __vcpu_sys_reg(vcpu, MDCR_EL2) & ~MDCR_EL2_HPMN;

                mdcr |= FIELD_PREP(MDCR_EL2_HPMN, kvm->arch.nr_pmu_counters);
                __vcpu_sys_reg(vcpu, MDCR_EL2) = mdcr;
        }
}

/*
 * Bind @arm_pmu to the VM and reset the VM's counter count to the maximum
 * that PMU offers. Must be called with kvm->arch.config_lock held.
 */
static void kvm_arm_set_pmu(struct kvm *kvm, struct arm_pmu *arm_pmu)
{
        lockdep_assert_held(&kvm->arch.config_lock);

        /*
         * Order matters: kvm_arm_pmu_get_max_counters() reads
         * kvm->arch.arm_pmu, so the PMU must be installed first.
         */
        kvm->arch.arm_pmu = arm_pmu;
        kvm_arm_set_nr_counters(kvm, kvm_arm_pmu_get_max_counters(kvm));
}

/**
 * kvm_arm_set_default_pmu - No PMU set, get the default one.
 * @kvm: The kvm pointer
 *
 * Note that the VM's supported_cpus mask is deliberately not updated for the
 * default PMU, even though the selected instance may well support only a
 * subset of the cores in the system. This upholds the preexisting behavior
 * on heterogeneous systems, where vCPUs can be scheduled on any core but the
 * guest counters could stop working.
 *
 * Return: 0 on success, -ENODEV if no PMU matches the current CPU.
 */
int kvm_arm_set_default_pmu(struct kvm *kvm)
{
        struct arm_pmu *probed = kvm_pmu_probe_armpmu();

        if (probed)
                kvm_arm_set_pmu(kvm, probed);

        return probed ? 0 : -ENODEV;
}

static int kvm_arm_pmu_v3_set_pmu(struct kvm_vcpu *vcpu, int pmu_id)
{
        struct kvm *kvm = vcpu->kvm;
        struct arm_pmu_entry *entry;
        struct arm_pmu *arm_pmu;
        int ret = -ENXIO;

        lockdep_assert_held(&kvm->arch.config_lock);
        mutex_lock(&arm_pmus_lock);

        list_for_each_entry(entry, &arm_pmus, entry) {
                arm_pmu = entry->arm_pmu;
                if (arm_pmu->pmu.type == pmu_id) {
                        if (kvm_vm_has_ran_once(kvm) ||
                            (kvm->arch.pmu_filter && kvm->arch.arm_pmu != arm_pmu)) {
                                ret = -EBUSY;
                                break;
                        }

                        kvm_arm_set_pmu(kvm, arm_pmu);
                        cpumask_copy(kvm->arch.supported_cpus, &arm_pmu->supported_cpus);
                        ret = 0;
                        break;
                }
        }

        mutex_unlock(&arm_pmus_lock);
        return ret;
}

/*
 * Set the number of PMU counters exposed to the VM to @n.
 *
 * Returns -EINVAL when the VM has no PMU or when @n exceeds what that PMU
 * can provide, 0 otherwise.
 */
static int kvm_arm_pmu_v3_set_nr_counters(struct kvm_vcpu *vcpu, unsigned int n)
{
        struct kvm *kvm = vcpu->kvm;

        if (!kvm->arch.arm_pmu || n > kvm_arm_pmu_get_max_counters(kvm))
                return -EINVAL;

        kvm_arm_set_nr_counters(kvm, n);

        return 0;
}

/**
 * kvm_arm_pmu_v3_set_attr - set a PMUv3 attribute on a vcpu
 * @vcpu: The vcpu pointer
 * @attr: The device attribute; @attr->attr selects the operation and
 *        @attr->addr is the userspace address of its argument
 *
 * Must be called with kvm->arch.config_lock held.
 *
 * Return: 0 on success; -ENODEV if the vcpu has no PMU; -EBUSY if the PMU has
 * already been created (or, per attribute, if the setting can no longer be
 * changed); -ENXIO for an unknown attribute; otherwise the attribute specific
 * error (-EINVAL, -EFAULT, -ENOMEM, ...).
 */
int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        struct kvm *kvm = vcpu->kvm;

        lockdep_assert_held(&kvm->arch.config_lock);

        if (!kvm_vcpu_has_pmu(vcpu))
                return -ENODEV;

        /* Once created (see KVM_ARM_VCPU_PMU_V3_INIT), nothing can change */
        if (vcpu->arch.pmu.created)
                return -EBUSY;

        switch (attr->attr) {
        /* Configure the PMU overflow interrupt number (in-kernel irqchip only) */
        case KVM_ARM_VCPU_PMU_V3_IRQ: {
                int __user *uaddr = (int __user *)(long)attr->addr;
                int irq;

                if (!irqchip_in_kernel(kvm))
                        return -EINVAL;

                if (get_user(irq, uaddr))
                        return -EFAULT;

                /* The PMU overflow interrupt can be a PPI or a valid SPI. */
                if (!(irq_is_ppi(irq) || irq_is_spi(irq)))
                        return -EINVAL;

                /* Must be consistent with the other vcpus of the VM */
                if (!pmu_irq_is_valid(kvm, irq))
                        return -EINVAL;

                /* The interrupt number can only be set once per vcpu */
                if (kvm_arm_pmu_irq_initialized(vcpu))
                        return -EBUSY;

                kvm_debug("Set kvm ARM PMU irq: %d\n", irq);
                vcpu->arch.pmu.irq_num = irq;
                return 0;
        }
        /* Allow or deny a range of events in the VM-wide event filter */
        case KVM_ARM_VCPU_PMU_V3_FILTER: {
                u8 pmuver = kvm_arm_pmu_get_pmuver_limit();
                struct kvm_pmu_event_filter __user *uaddr;
                struct kvm_pmu_event_filter filter;
                int nr_events;

                /*
                 * Allow userspace to specify an event filter for the entire
                 * event range supported by PMUVer of the hardware, rather
                 * than the guest's PMUVer for KVM backward compatibility.
                 */
                nr_events = __kvm_pmu_event_mask(pmuver) + 1;

                uaddr = (struct kvm_pmu_event_filter __user *)(long)attr->addr;

                if (copy_from_user(&filter, uaddr, sizeof(filter)))
                        return -EFAULT;

                /* Reject ranges beyond the event space and unknown actions */
                if (((u32)filter.base_event + filter.nevents) > nr_events ||
                    (filter.action != KVM_PMU_EVENT_ALLOW &&
                     filter.action != KVM_PMU_EVENT_DENY))
                        return -EINVAL;

                /* The filter is frozen once the VM has run */
                if (kvm_vm_has_ran_once(kvm))
                        return -EBUSY;

                /* First filter for this VM: allocate the bitmap */
                if (!kvm->arch.pmu_filter) {
                        kvm->arch.pmu_filter = bitmap_alloc(nr_events, GFP_KERNEL_ACCOUNT);
                        if (!kvm->arch.pmu_filter)
                                return -ENOMEM;

                        /*
                         * The default depends on the first applied filter.
                         * If it allows events, the default is to deny.
                         * Conversely, if the first filter denies a set of
                         * events, the default is to allow.
                         */
                        if (filter.action == KVM_PMU_EVENT_ALLOW)
                                bitmap_zero(kvm->arch.pmu_filter, nr_events);
                        else
                                bitmap_fill(kvm->arch.pmu_filter, nr_events);
                }

                /* A set bit in the filter bitmap means the event is allowed */
                if (filter.action == KVM_PMU_EVENT_ALLOW)
                        bitmap_set(kvm->arch.pmu_filter, filter.base_event, filter.nevents);
                else
                        bitmap_clear(kvm->arch.pmu_filter, filter.base_event, filter.nevents);

                return 0;
        }
        /* Select the host PMU backing the VM, by perf PMU type */
        case KVM_ARM_VCPU_PMU_V3_SET_PMU: {
                int __user *uaddr = (int __user *)(long)attr->addr;
                int pmu_id;

                if (get_user(pmu_id, uaddr))
                        return -EFAULT;

                return kvm_arm_pmu_v3_set_pmu(vcpu, pmu_id);
        }
        /* Set the number of counters exposed to the VM */
        case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS: {
                unsigned int __user *uaddr = (unsigned int __user *)(long)attr->addr;
                unsigned int n;

                if (get_user(n, uaddr))
                        return -EFAULT;

                return kvm_arm_pmu_v3_set_nr_counters(vcpu, n);
        }
        /* Finalise the PMU; further attribute changes then fail with -EBUSY */
        case KVM_ARM_VCPU_PMU_V3_INIT:
                return kvm_arm_pmu_v3_init(vcpu);
        }

        return -ENXIO;
}

/*
 * Read back a PMUv3 vcpu attribute. Only KVM_ARM_VCPU_PMU_V3_IRQ is
 * readable: the configured PMU interrupt number is copied to the userspace
 * address in @attr->addr.
 *
 * Returns the result of put_user() on success; -EINVAL without an in-kernel
 * irqchip; -ENODEV if the vcpu has no PMU; -ENXIO if no interrupt has been
 * configured or the attribute is not supported.
 */
int kvm_arm_pmu_v3_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        int __user *uaddr;
        int irq;

        if (attr->attr != KVM_ARM_VCPU_PMU_V3_IRQ)
                return -ENXIO;

        if (!irqchip_in_kernel(vcpu->kvm))
                return -EINVAL;

        if (!kvm_vcpu_has_pmu(vcpu))
                return -ENODEV;

        if (!kvm_arm_pmu_irq_initialized(vcpu))
                return -ENXIO;

        uaddr = (int __user *)(long)attr->addr;
        irq = vcpu->arch.pmu.irq_num;

        return put_user(irq, uaddr);
}

/*
 * Report whether a PMUv3 vcpu attribute is supported: 0 for a known
 * attribute on a vcpu that has a PMU, -ENXIO otherwise.
 */
int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        switch (attr->attr) {
        case KVM_ARM_VCPU_PMU_V3_IRQ:
        case KVM_ARM_VCPU_PMU_V3_INIT:
        case KVM_ARM_VCPU_PMU_V3_FILTER:
        case KVM_ARM_VCPU_PMU_V3_SET_PMU:
        case KVM_ARM_VCPU_PMU_V3_SET_NR_COUNTERS:
                return kvm_vcpu_has_pmu(vcpu) ? 0 : -ENXIO;
        default:
                return -ENXIO;
        }
}

u8 kvm_arm_pmu_get_pmuver_limit(void)
{
        unsigned int pmuver;

        pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer,
                               read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1));

        /*
         * Spoof a barebones PMUv3 implementation if the system supports IMPDEF
         * traps of the PMUv3 sysregs
         */
        if (cpus_have_final_cap(ARM64_WORKAROUND_PMUV3_IMPDEF_TRAPS))
                return ID_AA64DFR0_EL1_PMUVer_IMP;

        /*
         * Otherwise, treat IMPLEMENTATION DEFINED functionality as
         * unimplemented
         */
        if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
                return 0;

        return min(pmuver, ID_AA64DFR0_EL1_PMUVer_V3P5);
}

/**
 * kvm_vcpu_read_pmcr - Read PMCR_EL0 register for the vCPU
 * @vcpu: The vcpu pointer
 *
 * The N field of the returned value is replaced on the fly: it holds
 * MDCR_EL2.HPMN for a nested-virt vcpu that is not at EL2, and the VM's
 * number of PMU counters otherwise.
 */
u64 kvm_vcpu_read_pmcr(struct kvm_vcpu *vcpu)
{
        u64 stored = __vcpu_sys_reg(vcpu, PMCR_EL0);
        u64 nr_counters;

        if (vcpu_has_nv(vcpu) && !vcpu_is_el2(vcpu))
                nr_counters = FIELD_GET(MDCR_EL2_HPMN,
                                        __vcpu_sys_reg(vcpu, MDCR_EL2));
        else
                nr_counters = vcpu->kvm->arch.nr_pmu_counters;

        return u64_replace_bits(stored, nr_counters, ARMV8_PMU_PMCR_N);
}

/*
 * Recreate the perf events of the enabled counters whose EL1 and EL2 filters
 * disagree, then restore the guest PMU state if anything was recreated.
 */
void kvm_pmu_nested_transition(struct kvm_vcpu *vcpu)
{
        unsigned long enabled = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
        bool dirty = false;
        int idx;

        for_each_set_bit(idx, &enabled, 32) {
                struct kvm_pmc *pmc = kvm_vcpu_idx_to_pmc(vcpu, idx);

                /*
                 * The true EL1 event filter bit is multiplexed for nested,
                 * so only events filtered differently at EL1 and EL2 need to
                 * be reconfigured.
                 */
                if (kvm_pmc_counts_at_el1(pmc) != kvm_pmc_counts_at_el2(pmc)) {
                        kvm_pmu_create_perf_event(pmc);
                        dirty = true;
                }
        }

        if (dirty)
                kvm_vcpu_pmu_restore_guest(vcpu);
}
















































   15 








   15 















    2 










































    1 
























    1 




    1 






    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// SPDX-License-Identifier: GPL-2.0
/*
 * Hyp portion of the (not much of an) Emulation layer for 32bit guests.
 *
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * based on arch/arm/kvm/emulate.c
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>

/*
 * Condition code lookup table, taken from arch/arm/kernel/opcodes.c.
 *
 * The table is indexed by the condition test code (EQ, NE, ... LT, GT, AL,
 * NV). Within each entry, the bit position is the NZCV flags value: the bit
 * is set when the condition passes for that combination of flags.
 */
static const unsigned short cc_map[16] = {
        [0x0] = 0xF0F0,        /* EQ == Z set            */
        [0x1] = 0x0F0F,        /* NE                     */
        [0x2] = 0xCCCC,        /* CS == C set            */
        [0x3] = 0x3333,        /* CC                     */
        [0x4] = 0xFF00,        /* MI == N set            */
        [0x5] = 0x00FF,        /* PL                     */
        [0x6] = 0xAAAA,        /* VS == V set            */
        [0x7] = 0x5555,        /* VC                     */
        [0x8] = 0x0C0C,        /* HI == C set && Z clear */
        [0x9] = 0xF3F3,        /* LS == C clear || Z set */
        [0xA] = 0xAA55,        /* GE == (N==V)           */
        [0xB] = 0x55AA,        /* LT == (N!=V)           */
        [0xC] = 0x0A05,        /* GT == (!Z && (N==V))   */
        [0xD] = 0xF5FA,        /* LE == (Z || (N!=V))    */
        [0xE] = 0xFFFF,        /* AL always              */
        [0xF] = 0x0000,        /* NV                     */
};

/*
 * Decide whether a trapped AArch32 instruction passes its condition check,
 * i.e. whether it should have been executed at all.
 *
 * Returns true for exception classes that cannot be raised by a conditional
 * instruction, for the "always" condition, for an unconditional Thumb
 * instruction (empty IT state), and whenever the current flags satisfy the
 * condition; false otherwise.
 */
bool kvm_condition_valid32(const struct kvm_vcpu *vcpu)
{
        unsigned long cpsr;
        u32 flags;
        int cond;

        /*
         * Only these exception classes can be raised by a conditional
         * instruction; anything else is valid by definition.
         */
        switch (kvm_vcpu_trap_get_class(vcpu)) {
        case ESR_ELx_EC_CP15_32:
        case ESR_ELx_EC_CP15_64:
        case ESR_ELx_EC_CP14_MR:
        case ESR_ELx_EC_CP14_LS:
        case ESR_ELx_EC_FP_ASIMD:
        case ESR_ELx_EC_CP10_ID:
        case ESR_ELx_EC_CP14_64:
        case ESR_ELx_EC_SVC32:
                break;
        default:
                return true;
        }

        /* 0xE is the "always" condition */
        cond = kvm_vcpu_get_condition(vcpu);
        if (cond == 0xE)
                return true;

        cpsr = *vcpu_cpsr(vcpu);

        if (cond < 0) {
                /*
                 * No valid condition field, which can happen in Thumb mode:
                 * rebuild the IT state from the CPSR instead.
                 */
                unsigned long itstate = ((cpsr >> 8) & 0xFC) |
                                        ((cpsr >> 25) & 0x3);

                /* An empty IT state means the instruction is unconditional */
                if (!itstate)
                        return true;

                /* The condition for this insn is the top 4 bits of IT */
                cond = itstate >> 4;
        }

        /* Look the current flags up in the condition's bitmap */
        flags = cpsr >> 28;

        return (cc_map[cond] >> flags) & 1;
}

/**
 * kvm_adjust_itstate - advance ITSTATE after emulating an insn in an IT block
 * @vcpu:        The VCPU pointer
 *
 * An exception taken from inside a Thumb IF-THEN block leaves the ITSTATE
 * field of the CPSR un-advanced, so the update has to be done by hand here.
 * The field is split across the CPSR as follows:
 *
 * IT[7:0] -> CPSR[26:25],CPSR[15:10]
 */
static void kvm_adjust_itstate(struct kvm_vcpu *vcpu)
{
        unsigned long pstate = *vcpu_cpsr(vcpu);
        unsigned long base_cond, mask_bits;

        /* Nothing to do in ARM state ... */
        if (!(pstate & PSR_AA32_T_BIT))
                return;

        /* ... nor when no IT bits are set. */
        if (!(pstate & PSR_AA32_IT_MASK))
                return;

        /* Gather the base condition and the five low IT bits. */
        base_cond = (pstate & 0xe000) >> 13;
        mask_bits = ((pstate & 0x1c00) >> (10 - 2)) |
                    ((pstate & (0x3 << 25)) >> 25);

        /* Perform ITAdvance (see page A2-52 in ARM DDI 0406C) */
        if (mask_bits & 0x7) {
                mask_bits = (mask_bits << 1) & 0x1f;
        } else {
                mask_bits = 0;
                base_cond = 0;
        }

        /* Scatter the advanced state back into the CPSR layout. */
        pstate &= ~PSR_AA32_IT_MASK;
        pstate |= base_cond << 13;
        pstate |= (mask_bits & 0x1c) << (10 - 2);
        pstate |= (mask_bits & 0x3) << 25;

        *vcpu_cpsr(vcpu) = pstate;
}

/**
 * kvm_skip_instr32 - step the guest PC past a trapped instruction
 * @vcpu: The vcpu pointer
 *
 * The PC moves by 2 bytes when the T bit is set and the trap does not report
 * a 32-bit instruction length, and by 4 bytes otherwise. The IT state is then
 * advanced via kvm_adjust_itstate().
 */
void kvm_skip_instr32(struct kvm_vcpu *vcpu)
{
        u32 next_pc = *vcpu_pc(vcpu);
        bool narrow_insn = (*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT) &&
                           !kvm_vcpu_trap_il_is32bit(vcpu);

        next_pc += narrow_insn ? 2 : 4;
        *vcpu_pc(vcpu) = next_pc;

        kvm_adjust_itstate(vcpu);
}


































































































  161 














  162 




















  126 




  165 
















  166 
  159 








  162 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
// SPDX-License-Identifier: GPL-2.0
/*
 * VMID allocator.
 *
 * Based on Arm64 ASID allocator algorithm.
 * Please refer arch/arm64/mm/context.c for detailed
 * comments on algorithm.
 *
 * Copyright (C) 2002-2003 Deep Blue Solutions Ltd, all rights reserved.
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/bitfield.h>
#include <linux/bitops.h>

#include <asm/kvm_asm.h>
#include <asm/kvm_mmu.h>

/* Number of VMID bits in use; set once by kvm_arm_vmid_alloc_init(). */
unsigned int __ro_after_init kvm_arm_vmid_bits;
/* Taken by kvm_arm_vmid_update() around the slow allocation path. */
static DEFINE_RAW_SPINLOCK(cpu_vmid_lock);

/* Current generation; advanced in steps of VMID_FIRST_VERSION on rollover. */
static atomic64_t vmid_generation;
/* Bitmap of NUM_USER_VMIDS bits, one per VMID index. */
static unsigned long *vmid_map;

/* Per-CPU VMID last installed by kvm_arm_vmid_update(), or a marker value. */
static DEFINE_PER_CPU(atomic64_t, active_vmids);
/* Per-CPU VMID carried over by flush_context() across a rollover. */
static DEFINE_PER_CPU(u64, reserved_vmids);

/* Every bit above the low kvm_arm_vmid_bits bits, i.e. the generation part. */
#define VMID_MASK                (~GENMASK(kvm_arm_vmid_bits - 1, 0))
/* Initial generation value, and the increment applied on each rollover. */
#define VMID_FIRST_VERSION        (1UL << kvm_arm_vmid_bits)

/* Size of the VMID index space (and of vmid_map, in bits). */
#define NUM_USER_VMIDS                VMID_FIRST_VERSION
/* Strip the generation, leaving the bitmap index. */
#define vmid2idx(vmid)                ((vmid) & ~VMID_MASK)
#define idx2vmid(idx)                vmid2idx(idx)

/*
 * VMID #0 is always reserved, so a value whose index part is zero is never
 * handed out by the allocator and can be treated as invalid. This value is
 * what active_vmids is set to on vCPU schedule out.
 */
#define VMID_ACTIVE_INVALID                VMID_FIRST_VERSION

/* True when the generation part of @vmid equals the current generation. */
#define vmid_gen_match(vmid) \
        (!(((vmid) ^ atomic64_read(&vmid_generation)) >> kvm_arm_vmid_bits))

/*
 * Rebuild the VMID bitmap for a new generation. Its only caller in this file,
 * new_vmid(), invokes it right after advancing vmid_generation.
 */
static void flush_context(void)
{
        int cpu;
        u64 vmid;

        /* Start from an empty map; only the VMIDs seen below are re-marked. */
        bitmap_zero(vmid_map, NUM_USER_VMIDS);

        for_each_possible_cpu(cpu) {
                /* Atomically fetch this CPU's active VMID and replace it with 0. */
                vmid = atomic64_xchg_relaxed(&per_cpu(active_vmids, cpu), 0);

                /* Preserve reserved VMID */
                if (vmid == 0)
                        vmid = per_cpu(reserved_vmids, cpu);
                /* Keep the index taken and record it as this CPU's reserved VMID. */
                __set_bit(vmid2idx(vmid), vmid_map);
                per_cpu(reserved_vmids, cpu) = vmid;
        }

        /*
         * Unlike ASID allocator, we expect less frequent rollover in
         * case of VMIDs. Hence, instead of marking the CPU as
         * flush_pending and issuing a local context invalidation on
         * the next context-switch, we broadcast TLB flush + I-cache
         * invalidation over the inner shareable domain on rollover.
         */
        kvm_call_hyp(__kvm_flush_vm_context);
}

/*
 * Replace every per-CPU reserved VMID equal to @vmid with @newvmid (the same
 * VMID in the current generation).
 *
 * All possible CPUs are visited even after a match, since several of them may
 * hold the same reserved value. Returns true if at least one entry matched.
 */
static bool check_update_reserved_vmid(u64 vmid, u64 newvmid)
{
        bool found = false;
        int i;

        for_each_possible_cpu(i) {
                u64 *slot = &per_cpu(reserved_vmids, i);

                if (*slot != vmid)
                        continue;

                *slot = newvmid;
                found = true;
        }

        return found;
}

/*
 * Give @kvm_vmid a VMID tagged with the current generation, store it in
 * kvm_vmid->id and return it. The only caller in this file,
 * kvm_arm_vmid_update(), calls this with cpu_vmid_lock held.
 */
static u64 new_vmid(struct kvm_vmid *kvm_vmid)
{
        /* Bitmap index at which the next free-slot search starts. */
        static u32 cur_idx = 1;
        u64 vmid = atomic64_read(&kvm_vmid->id);
        u64 generation = atomic64_read(&vmid_generation);

        /* A non-zero id was assigned before: try to keep the same number. */
        if (vmid != 0) {
                /* Same VMID index, re-tagged with the current generation. */
                u64 newvmid = generation | (vmid & ~VMID_MASK);

                /* Held as a reserved VMID by some CPU: carry it over. */
                if (check_update_reserved_vmid(vmid, newvmid)) {
                        atomic64_set(&kvm_vmid->id, newvmid);
                        return newvmid;
                }

                /* Otherwise reuse it only if its bitmap slot was still free. */
                if (!__test_and_set_bit(vmid2idx(vmid), vmid_map)) {
                        atomic64_set(&kvm_vmid->id, newvmid);
                        return newvmid;
                }
        }

        /* Look for a free index from cur_idx onwards. */
        vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, cur_idx);
        if (vmid != NUM_USER_VMIDS)
                goto set_vmid;

        /* We're out of VMIDs, so increment the global generation count */
        generation = atomic64_add_return_relaxed(VMID_FIRST_VERSION,
                                                 &vmid_generation);
        flush_context();

        /* We have more VMIDs than CPUs, so this will always succeed */
        vmid = find_next_zero_bit(vmid_map, NUM_USER_VMIDS, 1);

set_vmid:
        /* Claim the index, remember it for the next search, tag and publish. */
        __set_bit(vmid, vmid_map);
        cur_idx = vmid;
        vmid = idx2vmid(vmid) | generation;
        atomic64_set(&kvm_vmid->id, vmid);
        return vmid;
}

/* Called from vCPU sched out with preemption disabled */
void kvm_arm_vmid_clear_active(void)
{
        atomic64_set(this_cpu_ptr(&active_vmids), VMID_ACTIVE_INVALID);
}

/*
 * Make sure @kvm_vmid holds a VMID of the current generation and record it as
 * the active VMID of this CPU. A lockless fast path is tried first; otherwise
 * the work is done under cpu_vmid_lock with interrupts disabled.
 */
void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid)
{
        unsigned long flags;
        u64 vmid, old_active_vmid;

        vmid = atomic64_read(&kvm_vmid->id);

        /*
         * Please refer comments in check_and_switch_context() in
         * arch/arm64/mm/context.c.
         *
         * Unlike ASID allocator, we set the active_vmids to
         * VMID_ACTIVE_INVALID on vCPU schedule out to avoid
         * reserving the VMID space needlessly on rollover.
         * Hence explicitly check here for a "!= 0" to
         * handle the sync with a concurrent rollover.
         */
        old_active_vmid = atomic64_read(this_cpu_ptr(&active_vmids));
        /*
         * Fast path: the active slot is non-zero, the generation matches, and
         * the cmpxchg that installs vmid returns a non-zero previous value.
         */
        if (old_active_vmid != 0 && vmid_gen_match(vmid) &&
            0 != atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_vmids),
                                          old_active_vmid, vmid))
                return;

        raw_spin_lock_irqsave(&cpu_vmid_lock, flags);

        /* Check that our VMID belongs to the current generation. */
        vmid = atomic64_read(&kvm_vmid->id);
        if (!vmid_gen_match(vmid))
                vmid = new_vmid(kvm_vmid);

        /* Publish the (possibly new) VMID as active on this CPU. */
        atomic64_set(this_cpu_ptr(&active_vmids), vmid);
        raw_spin_unlock_irqrestore(&cpu_vmid_lock, flags);
}

/*
 * Initialize the VMID allocator: read the VMID width, seed the generation
 * counter and allocate the zeroed VMID bitmap.
 *
 * Returns 0 on success or -ENOMEM if the bitmap cannot be allocated.
 */
int __init kvm_arm_vmid_alloc_init(void)
{
        kvm_arm_vmid_bits = kvm_get_vmid_bits();

        /*
         * VMID #0 is always reserved, so unless there is at least one more
         * VMID than CPUs, allocation after rollover is expected to fail.
         */
        WARN_ON(NUM_USER_VMIDS - 1 <= num_possible_cpus());

        atomic64_set(&vmid_generation, VMID_FIRST_VERSION);

        vmid_map = bitmap_zalloc(NUM_USER_VMIDS, GFP_KERNEL);

        return vmid_map ? 0 : -ENOMEM;
}

/* Release the VMID bitmap allocated by kvm_arm_vmid_alloc_init(). */
void __init kvm_arm_vmid_alloc_free(void)
{
        bitmap_free(vmid_map);
}
































































  158 









  158 
  158 

  158 
  158 




  110 


   94 


  158 









  440 









  441 
  442 

  441 
  443 




  254 


  340 


  439 







  724 



  729 

  730 
  731 





  728 



  184 
  463 



  157 





































  442 





  654 


  307 


  442 

  442 
  442 

  440 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (c) 2021, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 */
#include <linux/kstrtox.h>
#include <linux/mm.h>
#include <linux/page_table_check.h>
#include <linux/swap.h>
#include <linux/swapops.h>

#undef pr_fmt
#define pr_fmt(fmt)        "page_table_check: " fmt

/*
 * Per-page counters stored in the page_ext area (see page_table_check_ops,
 * whose .size is sizeof this structure).
 */
struct page_table_check {
        /* Adjusted when a page for which PageAnon() is true is (un)mapped. */
        atomic_t anon_map_count;
        /* Adjusted when any other page is (un)mapped. */
        atomic_t file_map_count;
};

/*
 * Boot-time switch. Defaults to CONFIG_PAGE_TABLE_CHECK_ENFORCED and can be
 * overridden with the "page_table_check" early parameter.
 */
static bool __page_table_check_enabled __initdata =
                                IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED);

/*
 * Static key that starts out true; init_page_table_check() disables it when
 * checking has been enabled.
 */
DEFINE_STATIC_KEY_TRUE(page_table_check_disabled);
EXPORT_SYMBOL(page_table_check_disabled);

/*
 * Handler for the "page_table_check" early parameter: parse @buf as a boolean
 * into __page_table_check_enabled and return the result of kstrtobool().
 */
static int __init early_page_table_check_param(char *buf)
{
        return kstrtobool(buf, &__page_table_check_enabled);
}

early_param("page_table_check", early_page_table_check_param);

/* page_ext ".need" callback: report whether page table checking was enabled. */
static bool __init need_page_table_check(void)
{
        return __page_table_check_enabled;
}

/*
 * page_ext ".init" callback: when checking was enabled, turn off the
 * page_table_check_disabled static key.
 */
static void __init init_page_table_check(void)
{
        if (__page_table_check_enabled)
                static_branch_disable(&page_table_check_disabled);
}

/*
 * page_ext registration: requests room for one struct page_table_check per
 * page and supplies the need/init callbacks defined above.
 */
struct page_ext_operations page_table_check_ops = {
        .size = sizeof(struct page_table_check),
        .need = need_page_table_check,
        .init = init_page_table_check,
        .need_shared_flags = false,
};

/*
 * Return the page_table_check data attached to @page_ext.
 * A NULL @page_ext triggers BUG_ON().
 */
static struct page_table_check *get_page_table_check(struct page_ext *page_ext)
{
        BUG_ON(!page_ext);
        return page_ext_data(page_ext, &page_table_check_ops);
}

/*
 * An entry is removed from the page table, decrement the counters for that page
 * verify that it is of correct type and counters do not become negative.
 */
static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt)
{
        struct page_ext_iter iter;
        struct page_ext *page_ext;
        struct page *page;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        BUG_ON(PageSlab(page));
        anon = PageAnon(page);

        rcu_read_lock();
        for_each_page_ext(page, pgcnt, page_ext, iter) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                if (anon) {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0);
                } else {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0);
                }
        }
        rcu_read_unlock();
}

/*
 * A new entry covering @pgcnt pages starting at @pfn is added to the page
 * table. For every page, increment the counter that matches the page type and
 * BUG if the page is already accounted with the other type. An anonymous page
 * additionally must not end up with more than one mapping when the new entry
 * is writable (@rw). Invalid pfns are ignored.
 */
static void page_table_check_set(unsigned long pfn, unsigned long pgcnt,
                                 bool rw)
{
        struct page_ext_iter ext_iter;
        struct page_ext *ext;
        struct page *page;
        bool anon;

        if (!pfn_valid(pfn))
                return;

        page = pfn_to_page(pfn);
        BUG_ON(PageSlab(page));
        anon = PageAnon(page);

        rcu_read_lock();
        for_each_page_ext(page, pgcnt, ext, ext_iter) {
                struct page_table_check *ptc = get_page_table_check(ext);
                int mapped;

                if (anon) {
                        BUG_ON(atomic_read(&ptc->file_map_count));
                        mapped = atomic_inc_return(&ptc->anon_map_count);
                        BUG_ON(mapped > 1 && rw);
                } else {
                        BUG_ON(atomic_read(&ptc->anon_map_count));
                        mapped = atomic_inc_return(&ptc->file_map_count);
                        BUG_ON(mapped < 0);
                }
        }
        rcu_read_unlock();
}

/*
 * page is on free list, or is being allocated, verify that counters are zeroes
 * crash if they are not.
 */
void __page_table_check_zero(struct page *page, unsigned int order)
{
        struct page_ext_iter iter;
        struct page_ext *page_ext;

        BUG_ON(PageSlab(page));

        rcu_read_lock();
        for_each_page_ext(page, 1 << order, page_ext, iter) {
                struct page_table_check *ptc = get_page_table_check(page_ext);

                BUG_ON(atomic_read(&ptc->anon_map_count));
                BUG_ON(atomic_read(&ptc->file_map_count));
        }
        rcu_read_unlock();
}

/*
 * Account for @pte being removed from @mm. Entries in init_mm and ptes that
 * are not user-accessible pages are ignored; otherwise one page is cleared.
 */
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
        if (mm == &init_mm)
                return;

        if (!pte_user_accessible_page(pte))
                return;

        page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pte_clear);

/*
 * Account for @pmd being removed from @mm. Entries in init_mm and pmds that
 * are not user-accessible pages are ignored; otherwise PMD_SIZE worth of
 * pages is cleared.
 */
void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
        if (mm == &init_mm)
                return;

        if (!pmd_user_accessible_page(pmd))
                return;

        page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pmd_clear);

/*
 * Account for @pud being removed from @mm. Entries in init_mm and puds that
 * are not user-accessible pages are ignored; otherwise PUD_SIZE worth of
 * pages is cleared.
 */
void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
        if (mm == &init_mm)
                return;

        if (!pud_user_accessible_page(pud))
                return;

        page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT);
}
EXPORT_SYMBOL(__page_table_check_pud_clear);

/*
 * Whether the swap entry cached writable information, i.e. it is a writable
 * device-private entry or a writable migration entry.
 */
static inline bool swap_cached_writable(swp_entry_t entry)
{
        if (is_writable_device_private_entry(entry))
                return true;

        return is_writable_migration_entry(entry);
}

/*
 * Warn once when a pte marked uffd-wp is nevertheless writable: for a present
 * pte via pte_write(), for a swap pte via the writable state cached in its
 * swap entry.
 */
static inline void page_table_check_pte_flags(pte_t pte)
{
        if (pte_present(pte) && pte_uffd_wp(pte)) {
                WARN_ON_ONCE(pte_write(pte));
                return;
        }

        if (is_swap_pte(pte) && pte_swp_uffd_wp(pte))
                WARN_ON_ONCE(swap_cached_writable(pte_to_swp_entry(pte)));
}

/*
 * Account for @nr consecutive ptes starting at @ptep being set from @pte.
 * The entries currently in place are cleared first; the new mapping is then
 * counted if @pte is a user-accessible page. init_mm is ignored.
 */
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
                unsigned int nr)
{
        pte_t *slot = ptep;
        unsigned int left;

        if (mm == &init_mm)
                return;

        page_table_check_pte_flags(pte);

        /* Drop the accounting of whatever each slot holds right now. */
        for (left = nr; left; left--, slot++)
                __page_table_check_pte_clear(mm, ptep_get(slot));

        if (!pte_user_accessible_page(pte))
                return;

        page_table_check_set(pte_pfn(pte), nr, pte_write(pte));
}
EXPORT_SYMBOL(__page_table_check_ptes_set);

/*
 * Warn once when a pmd marked uffd-wp is nevertheless writable: for a present
 * pmd via pmd_write(), for a swap pmd via the writable state cached in its
 * swap entry.
 */
static inline void page_table_check_pmd_flags(pmd_t pmd)
{
        if (pmd_present(pmd) && pmd_uffd_wp(pmd)) {
                WARN_ON_ONCE(pmd_write(pmd));
                return;
        }

        if (is_swap_pmd(pmd) && pmd_swp_uffd_wp(pmd))
                WARN_ON_ONCE(swap_cached_writable(pmd_to_swp_entry(pmd)));
}

/*
 * Account for *@pmdp being replaced by @pmd in @mm: the old entry is cleared,
 * then the new one is counted if it is a user-accessible page. init_mm is
 * ignored.
 */
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd)
{
        if (mm == &init_mm)
                return;

        page_table_check_pmd_flags(pmd);

        __page_table_check_pmd_clear(mm, *pmdp);

        if (!pmd_user_accessible_page(pmd))
                return;

        page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT,
                             pmd_write(pmd));
}
EXPORT_SYMBOL(__page_table_check_pmd_set);

/*
 * Account for *@pudp being replaced by @pud in @mm: the old entry is cleared,
 * then the new one is counted if it is a user-accessible page. init_mm is
 * ignored.
 */
void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud)
{
        if (mm == &init_mm)
                return;

        __page_table_check_pud_clear(mm, *pudp);

        if (!pud_user_accessible_page(pud))
                return;

        page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT,
                             pud_write(pud));
}
EXPORT_SYMBOL(__page_table_check_pud_set);

/*
 * Clear the accounting of all PTRS_PER_PTE entries of the pte table that @pmd
 * points to. Nothing is done for init_mm, for a bad pmd or for a leaf pmd.
 * If the pte table cannot be mapped, a warning is raised and the function
 * returns.
 */
void __page_table_check_pte_clear_range(struct mm_struct *mm,
                                        unsigned long addr,
                                        pmd_t pmd)
{
        pte_t *table;
        unsigned long idx;

        if (mm == &init_mm)
                return;

        if (pmd_bad(pmd) || pmd_leaf(pmd))
                return;

        table = pte_offset_map(&pmd, addr);
        if (WARN_ON(!table))
                return;

        for (idx = 0; idx < PTRS_PER_PTE; idx++)
                __page_table_check_pte_clear(mm, ptep_get(table + idx));

        pte_unmap(table);
}










































































































    1 

    1 






    7 
  164 
    1 




































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* fs/ internal definitions
 *
 * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

/*
 * Forward declarations, so that declarations in this header can name these
 * types by pointer without needing their full definitions.
 */
struct super_block;
struct file_system_type;
struct iomap;
struct iomap_ops;
struct linux_binprm;
struct path;
struct mount;
struct shrink_control;
struct fs_context;
struct pipe_inode_info;
struct iov_iter;
struct mnt_idmap;
struct ns_common;

/*
 * block/bdev.c
 */
#ifdef CONFIG_BLOCK
void __init bdev_cache_init(void);
#else
/* Without CONFIG_BLOCK this is an empty stub. */
static inline void bdev_cache_init(void) { }
#endif /* CONFIG_BLOCK */

/*
 * buffer.c
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
                            get_block_t *get_block, const struct iomap *iomap);

/*
 * char_dev.c
 */
void __init chrdev_init(void);

/*
 * fs_context.c
 */
extern const struct fs_context_operations legacy_fs_context_ops;
int parse_monolithic_mount_data(struct fs_context *, void *);
void vfs_clean_context(struct fs_context *fc);
int finish_clean_context(struct fs_context *fc);

/*
 * namei.c
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root);
int do_rmdir(int dfd, struct filename *name);
int do_unlinkat(int dfd, struct filename *name);
int may_linkat(struct mnt_idmap *idmap, const struct path *link);
int do_renameat2(int olddfd, struct filename *oldname,
                 int newdfd, struct filename *newname, unsigned int flags);
int do_mkdirat(int dfd, struct filename *name, umode_t mode);
int do_symlinkat(struct filename *from, int newdfd, struct filename *to);
int do_linkat(int olddfd, struct filename *old,
              int newdfd, struct filename *new, int flags);
int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath,
                struct file *file, umode_t mode);

/*
 * namespace.c
 */
struct vfsmount *lookup_mnt(const struct path *);
int finish_automount(struct vfsmount *, const struct path *);

int sb_prepare_remount_readonly(struct super_block *);

void __init mnt_init(void);

int mnt_get_write_access_file(struct file *file);
void mnt_put_write_access_file(struct file *file);

void dissolve_on_fput(struct vfsmount *);
bool may_mount(void);

int path_mount(const char *dev_name, struct path *path,
               const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);

int show_path(struct seq_file *m, struct dentry *root);

/*
 * fs_struct.c
 */
void chroot_fs_refs(const struct path *, const struct path *);

/*
 * file_table.c
 */
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);

/*
 * Release the write access held for @file: on its inode, on the mount of its
 * path and, for files flagged FMODE_BACKING, also on the mount of the path
 * returned by backing_file_user_path().
 */
static inline void file_put_write_access(struct file *file)
{
        put_write_access(file->f_inode);
        mnt_put_write_access(file->f_path.mnt);
        /* Backing files are expected to be the uncommon case. */
        if (unlikely(file->f_mode & FMODE_BACKING))
                mnt_put_write_access(backing_file_user_path(file)->mnt);
}

/*
 * Drop the access accounting held for @file. A file opened for reading only
 * (FMODE_READ set, FMODE_WRITE clear) decrements the inode's read count;
 * otherwise a file flagged FMODE_WRITER releases its write access.
 */
static inline void put_file_access(struct file *file)
{
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
                i_readcount_dec(file->f_inode);
                return;
        }

        if (file->f_mode & FMODE_WRITER)
                file_put_write_access(file);
}

void fput_close_sync(struct file *);
void fput_close(struct file *);

/*
 * super.c
 */
int reconfigure_super(struct fs_context *);
bool super_trylock_shared(struct super_block *sb);
struct super_block *user_get_super(dev_t, bool excl);
void put_super(struct super_block *sb);
bool mount_capable(struct fs_context *);
int sb_init_dio_done_wq(struct super_block *sb);

/*
 * Prepare superblock for changing its read-only state (i.e., either remount
 * read-write superblock read-only or vice versa). After this function returns
 * mnt_is_readonly() will return true for any mount of the superblock if its
 * caller is able to observe any changes done by the remount. This holds until
 * sb_end_ro_state_change() is called.
 */
static inline void sb_start_ro_state_change(struct super_block *sb)
{
        /* Raise the flag first; the write barrier below orders it before later stores. */
        WRITE_ONCE(sb->s_readonly_remount, 1);
        /*
         * For RO->RW transition, the barrier pairs with the barrier in
         * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY
         * cleared, it will see s_readonly_remount set.
         * For RW->RO transition, the barrier pairs with the barrier in
         * mnt_get_write_access() before the mnt_is_readonly() check.
         * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD
         * already cleared, it will see s_readonly_remount set.
         */
        smp_wmb();
}

/*
 * Ends section changing read-only state of the superblock. After this function
 * returns if mnt_is_readonly() returns false, the caller will be able to
 * observe all the changes remount did to the superblock.
 */
static inline void sb_end_ro_state_change(struct super_block *sb)
{
        /*
         * This barrier provides release semantics that pairs with
         * the smp_rmb() acquire semantics in mnt_is_readonly().
         * This barrier pair ensure that when mnt_is_readonly() sees
         * 0 for sb->s_readonly_remount, it will also see all the
         * preceding flag changes that were made during the RO state
         * change.
         */
        smp_wmb();
        /* Lower the flag only after the barrier, mirroring sb_start_ro_state_change(). */
        WRITE_ONCE(sb->s_readonly_remount, 0);
}

/*
 * open.c
 */
/*
 * Open parameters in the form taken by do_filp_open() and
 * do_file_open_root(); filled in by build_open_flags() from a struct open_how.
 * NOTE(review): the per-field meanings are not visible in this header;
 * confirm against build_open_flags().
 */
struct open_flags {
        int open_flag;
        umode_t mode;
        int acc_mode;
        int intent;
        int lookup_flags;
};
struct file *do_filp_open(int dfd, struct filename *pathname,
                          const struct open_flags *op);
struct file *do_file_open_root(const struct path *,
                               const char *, const struct open_flags *);
struct open_how build_open_how(int flags, umode_t mode);
int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);

int do_ftruncate(struct file *file, loff_t length, int small);
int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename,
                uid_t user, gid_t group, int flag);
int chown_common(const struct path *path, uid_t user, gid_t group);
int vfs_open(const struct path *, struct file *);

/*
 * inode.c
 */
long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode,
                         vfsgid_t vfsgid);

/*
 * fs-writeback.c
 */
long get_nr_dirty_inodes(void);

/*
 * dcache.c
 */
int d_set_mounted(struct dentry *dentry);
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
struct dentry *d_alloc_cursor(struct dentry *);
struct dentry *d_alloc_pseudo(struct super_block *, const struct qstr *);
char *simple_dname(struct dentry *, char *, int);
void dput_to_list(struct dentry *, struct list_head *);
void shrink_dentry_list(struct list_head *);
void shrink_dcache_for_umount(struct super_block *);
struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
struct dentry *__d_lookup_rcu(const struct dentry *parent,
                              const struct qstr *name, unsigned *seq);
void d_genocide(struct dentry *);

/*
 * pipe.c
 */
extern const struct file_operations pipefifo_fops;

/*
 * fs_pin.c
 */
void group_pin_kill(struct hlist_head *p);
void mnt_pin_kill(struct mount *m);

/*
 * fs/nsfs.c
 */
extern const struct dentry_operations ns_dentry_operations;
int open_namespace(struct ns_common *ns);

/*
 * fs/stat.c:
 */

/* statx helpers; @buffer is a userspace pointer (__user). */
int do_statx(int dfd, struct filename *filename, unsigned int flags,
             unsigned int mask, struct statx __user *buffer);
int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
                struct statx __user *buffer);

/*
 * fs/splice.c:
 */
/* NOTE(review): presumably splices from file @in into pipe @opipe; confirm in fs/splice.c. */
ssize_t splice_file_to_pipe(struct file *in,
                            struct pipe_inode_info *opipe,
                            loff_t *offset,
                            size_t len, unsigned int flags);

/*
 * fs/xattr.c:
 */
/*
 * Fixed-size buffer for an attribute name: XATTR_NAME_MAX bytes plus one
 * (presumably for the terminating NUL; confirm against import_xattr_name()).
 */
struct xattr_name {
        char name[XATTR_NAME_MAX + 1];
};

/* Arguments shared by the xattr get/set helpers declared below. */
struct kernel_xattr_ctx {
        /* Value of attribute */
        union {
                /* The same userspace pointer, as a const and a non-const view. */
                const void __user *cvalue;
                void __user *value;
        };
        /* Kernel-space pointer; NOTE(review): presumably the in-kernel copy of the value. */
        void *kvalue;
        size_t size;
        /* Attribute name */
        struct xattr_name *kname;
        unsigned int flags;
};

/* Get/set variants taking either an open file or a (dfd, filename) pair. */
ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx);
ssize_t filename_getxattr(int dfd, struct filename *filename,
                          unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx);
int filename_setxattr(int dfd, struct filename *filename,
                      unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
/* Both take the attribute name as a userspace pointer. */
int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx);
int import_xattr_name(struct xattr_name *kname, const char __user *name);

int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);

/*
 * ACL helpers. With CONFIG_FS_POSIX_ACL they are implemented elsewhere;
 * without it the inline stubs below fail with -EOPNOTSUPP.
 */
#ifdef CONFIG_FS_POSIX_ACL
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
               const char *acl_name, const void *kvalue, size_t size);
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name, void *kvalue, size_t size);
#else
/* Stub: POSIX ACL support is compiled out. */
static inline int do_set_acl(struct mnt_idmap *idmap,
                             struct dentry *dentry, const char *acl_name,
                             const void *kvalue, size_t size)
{
        return -EOPNOTSUPP;
}
/* Stub: POSIX ACL support is compiled out. */
static inline ssize_t do_get_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name,
                                 void *kvalue, size_t size)
{
        return -EOPNOTSUPP;
}
#endif

ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos);

/*
 * fs/attr.c
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
void mnt_idmap_put(struct mnt_idmap *idmap);
/*
 * Callback table for stashed dentries.
 * NOTE(review): presumably consumed by path_from_stashed(); this header only
 * shows the declarations, so confirm against the implementation.
 */
struct stashed_operations {
        /* Takes the opaque data pointer; returns nothing. */
        void (*put_data)(void *data);
        /* Takes an inode plus the opaque data pointer; returns an int status. */
        int (*init_inode)(struct inode *inode, void *data);
};
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
                      struct path *path);
void stashed_dentry_prune(struct dentry *dentry);
struct dentry *stashed_dentry_get(struct dentry **stashed);
/**
 * path_mounted - test whether a path sits at the root of its mount
 * @path: path to examine
 *
 * Compares the dentry of @path with the root dentry of the mount that
 * @path belongs to.
 *
 * Return: true when both dentries are the same, false otherwise.
 */
static inline bool path_mounted(const struct path *path)
{
        const struct dentry *mount_root = path->mnt->mnt_root;

        return path->dentry == mount_root;
}
void file_f_owner_release(struct file *file);
bool file_seek_cur_needs_f_lock(struct file *file);
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map);



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *  Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com>
 */
#ifndef __IF_TUN_H
#define __IF_TUN_H

#include <uapi/linux/if_tun.h>
#include <uapi/linux/virtio_net.h>

#define TUN_XDP_FLAG 0x1UL

/* NOTE(review): presumably the values carried in tun_msg_ctl.type - confirm */
#define TUN_MSG_UBUF 1
#define TUN_MSG_PTR  2
/*
 * Control descriptor made of a type tag, a 16-bit number and an untyped
 * pointer payload.
 * NOTE(review): num looks like an element count for ptr - confirm with users.
 */
struct tun_msg_ctl {
        unsigned short type;
        unsigned short num;
        void *ptr;
};

/* Header pairing a buffer length with a virtio-net header (member "gso"). */
struct tun_xdp_hdr {
        int buflen;
        struct virtio_net_hdr gso;
};

#if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
struct socket *tun_get_socket(struct file *);
struct ptr_ring *tun_get_tx_ring(struct file *file);

/* Report whether the TUN_XDP_FLAG tag bit is set in the value of @ptr. */
static inline bool tun_is_xdp_frame(void *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        return (raw & TUN_XDP_FLAG) != 0;
}

/* Turn an xdp_frame pointer into a generic pointer tagged with TUN_XDP_FLAG. */
static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
{
        unsigned long tagged = (unsigned long)xdp;

        tagged |= TUN_XDP_FLAG;
        return (void *)tagged;
}

/* Strip the TUN_XDP_FLAG tag bit from @ptr and hand back the xdp_frame pointer. */
static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
{
        unsigned long untagged = (unsigned long)ptr;

        untagged &= ~TUN_XDP_FLAG;
        return (void *)untagged;
}

void tun_ptr_free(void *ptr);
#else
#include <linux/err.h>
#include <linux/errno.h>
struct file;
struct socket;

/*
 * Stubs used when neither CONFIG_TUN nor CONFIG_TUN_MODULE is defined.
 * Each one ignores its argument and returns a fixed "nothing here" value.
 */

/* Stub: always fails with ERR_PTR(-EINVAL). */
static inline struct socket *tun_get_socket(struct file *f)
{
        return ERR_PTR(-EINVAL);
}

/* Stub: always fails with ERR_PTR(-EINVAL). */
static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
{
        return ERR_PTR(-EINVAL);
}

/* Stub: no pointer is ever reported as an XDP frame. */
static inline bool tun_is_xdp_frame(void *ptr)
{
        return false;
}

/* Stub: always returns NULL. */
static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp)
{
        return NULL;
}

/* Stub: always returns NULL. */
static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr)
{
        return NULL;
}

/* Stub: does nothing. */
static inline void tun_ptr_free(void *ptr)
{
}
#endif /* CONFIG_TUN */
#endif /* __IF_TUN_H */
































  157 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 *
 * Copyright IBM Corporation, 2008
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *           Paul E. McKenney <paulmck@linux.ibm.com> Hierarchical algorithm
 *
 * Based on the original work by Paul McKenney <paulmck@linux.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *        Documentation/RCU
 */

#ifndef __LINUX_RCUTREE_H
#define __LINUX_RCUTREE_H

void rcu_softirq_qs(void);
void rcu_note_context_switch(bool preempt);
int rcu_needs_cpu(void);
void rcu_cpu_stall_reset(void);
void rcu_request_urgent_qs_task(struct task_struct *t);

/*
 * Note a virtualization-based context switch.  This is simply a
 * wrapper around rcu_note_context_switch(), called with preempt == false,
 * which allows TINY_RCU to save a few bytes.  The caller must have
 * disabled interrupts.
 */
static inline void rcu_virt_note_context_switch(void)
{
        rcu_note_context_switch(false);
}

void synchronize_rcu_expedited(void);

void rcu_barrier(void);
void rcu_momentary_eqs(void);

struct rcu_gp_oldstate {
        unsigned long rgos_norm;
        unsigned long rgos_exp;
};

// Maximum number of rcu_gp_oldstate values corresponding to
// not-yet-completed RCU grace periods.
#define NUM_ACTIVE_RCU_POLL_FULL_OLDSTATE 4

/**
 * same_state_synchronize_rcu_full - Compare two old-state values for identity
 * @rgosp1: First old-state value.
 * @rgosp2: Second old-state value.
 *
 * Both values are expected to come from get_state_synchronize_rcu_full(),
 * start_poll_synchronize_rcu_full(), or get_completed_synchronize_rcu_full().
 * Returns @true when the two values are identical and @false otherwise.
 * Structures whose lifetimes are tracked by old-state values can therefore
 * push those values to a list header and stay slightly smaller.
 *
 * The comparison is field-by-field and exact: an @rcu_gp_oldstate with an
 * already-completed state in one field does not compare equal to one with an
 * already-completed state in the other field.  The structure is opaque, so
 * such a pair is not expected to arise in the first place.
 */
static inline bool same_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp1,
                                                   struct rcu_gp_oldstate *rgosp2)
{
        if (rgosp1->rgos_norm != rgosp2->rgos_norm)
                return false;

        return rgosp1->rgos_exp == rgosp2->rgos_exp;
}

unsigned long start_poll_synchronize_rcu_expedited(void);
void start_poll_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
void cond_synchronize_rcu_expedited(unsigned long oldstate);
void cond_synchronize_rcu_expedited_full(struct rcu_gp_oldstate *rgosp);
unsigned long get_state_synchronize_rcu(void);
void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
unsigned long start_poll_synchronize_rcu(void);
void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
bool poll_state_synchronize_rcu(unsigned long oldstate);
bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);
void cond_synchronize_rcu(unsigned long oldstate);
void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);

/*
 * rcu_irq_exit_check_preempt() is a real function only under
 * CONFIG_PROVE_RCU; otherwise it is an empty inline that does nothing.
 */
#ifdef CONFIG_PROVE_RCU
void rcu_irq_exit_check_preempt(void);
#else
static inline void rcu_irq_exit_check_preempt(void) { }
#endif

struct task_struct;
void rcu_preempt_deferred_qs(struct task_struct *t);

void exit_rcu(void);

void rcu_scheduler_starting(void);
extern int rcu_scheduler_active;
void rcu_end_inkernel_boot(void);
bool rcu_inkernel_boot_has_ended(void);
bool rcu_is_watching(void);
#ifndef CONFIG_PREEMPT_RCU
void rcu_all_qs(void);
#endif

/* RCUtree hotplug events */
int rcutree_prepare_cpu(unsigned int cpu);
int rcutree_online_cpu(unsigned int cpu);
void rcutree_report_cpu_starting(unsigned int cpu);

/*
 * CPU-removal callbacks exist only with CONFIG_HOTPLUG_CPU.  Without it the
 * names expand to NULL.
 * NOTE(review): presumably so callback tables can name them unconditionally -
 * confirm at the use sites.
 */
#ifdef CONFIG_HOTPLUG_CPU
int rcutree_dead_cpu(unsigned int cpu);
int rcutree_dying_cpu(unsigned int cpu);
int rcutree_offline_cpu(unsigned int cpu);
#else
#define rcutree_dead_cpu NULL
#define rcutree_dying_cpu NULL
#define rcutree_offline_cpu NULL
#endif

void rcutree_migrate_callbacks(int cpu);

/* Called from hotplug and also arm64 early secondary boot failure */
void rcutree_report_cpu_dead(void);

#endif /* __LINUX_RCUTREE_H */
























    1 












    1 



























    7 


    1 


    1 




    1 









    1 




    1 








    1 


    1 


















































   24 
   24 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef TUN_VNET_H
#define TUN_VNET_H

/* High bits in flags field are unused. */
#define TUN_VNET_LE     0x80000000
#define TUN_VNET_BE     0x40000000

/*
 * Legacy (no TUN_VNET_LE) endianness decision: big-endian when
 * TUN_VNET_BE is set and CONFIG_TUN_VNET_CROSS_LE is enabled, otherwise
 * whatever virtio_legacy_is_little_endian() reports.
 */
static inline bool tun_vnet_legacy_is_little_endian(unsigned int flags)
{
        if (IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE) && (flags & TUN_VNET_BE))
                return false;

        return virtio_legacy_is_little_endian();
}

/*
 * Copy the state of the TUN_VNET_BE flag (0 or 1) to user space.
 *
 * Returns 0 on success, -EINVAL when CONFIG_TUN_VNET_CROSS_LE is not
 * enabled, or -EFAULT when the value cannot be written to @argp.
 */
static inline long tun_get_vnet_be(unsigned int flags, int __user *argp)
{
        int be;

        if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE))
                return -EINVAL;

        be = (flags & TUN_VNET_BE) ? 1 : 0;

        return put_user(be, argp) ? -EFAULT : 0;
}

/*
 * Set or clear TUN_VNET_BE in *@flags from an int read from user space:
 * non-zero sets the flag, zero clears it.
 *
 * Returns 0 on success, -EINVAL when CONFIG_TUN_VNET_CROSS_LE is not
 * enabled, or -EFAULT when @argp cannot be read.
 */
static inline long tun_set_vnet_be(unsigned int *flags, int __user *argp)
{
        int want_be;

        if (!IS_ENABLED(CONFIG_TUN_VNET_CROSS_LE))
                return -EINVAL;

        if (get_user(want_be, argp))
                return -EFAULT;

        *flags = want_be ? (*flags | TUN_VNET_BE) : (*flags & ~TUN_VNET_BE);

        return 0;
}

/*
 * Little-endian when TUN_VNET_LE is set in @flags; otherwise defer to the
 * legacy decision in tun_vnet_legacy_is_little_endian().
 */
static inline bool tun_vnet_is_little_endian(unsigned int flags)
{
        if (flags & TUN_VNET_LE)
                return true;

        return tun_vnet_legacy_is_little_endian(flags);
}

/* Convert a __virtio16 to CPU order using the endianness selected by @flags. */
static inline u16 tun_vnet16_to_cpu(unsigned int flags, __virtio16 val)
{
        bool little_endian = tun_vnet_is_little_endian(flags);

        return __virtio16_to_cpu(little_endian, val);
}

/* Convert a CPU-order u16 to __virtio16 using the endianness selected by @flags. */
static inline __virtio16 cpu_to_tun_vnet16(unsigned int flags, u16 val)
{
        bool little_endian = tun_vnet_is_little_endian(flags);

        return __cpu_to_virtio16(little_endian, val);
}

/*
 * Handle the vnet-header related ioctls.
 *
 * @vnet_hdr_sz: current header size, read by TUNGETVNETHDRSZ and written by
 *               TUNSETVNETHDRSZ
 * @flags:       flag word holding TUN_VNET_LE / TUN_VNET_BE
 * @cmd:         ioctl command
 * @sp:          user pointer to the int argument
 *
 * Returns 0 on success, -EFAULT when the user int cannot be read or written,
 * and -EINVAL for an unknown @cmd or a header size smaller than
 * struct virtio_net_hdr.  The BE commands return whatever
 * tun_get_vnet_be()/tun_set_vnet_be() return.
 */
static inline long tun_vnet_ioctl(int *vnet_hdr_sz, unsigned int *flags,
                                  unsigned int cmd, int __user *sp)
{
        int val;

        switch (cmd) {
        case TUNGETVNETHDRSZ:
                val = *vnet_hdr_sz;
                break;

        case TUNGETVNETLE:
                val = (*flags & TUN_VNET_LE) ? 1 : 0;
                break;

        case TUNSETVNETHDRSZ:
                if (get_user(val, sp))
                        return -EFAULT;
                if (val < (int)sizeof(struct virtio_net_hdr))
                        return -EINVAL;
                *vnet_hdr_sz = val;
                return 0;

        case TUNSETVNETLE:
                if (get_user(val, sp))
                        return -EFAULT;
                if (val)
                        *flags |= TUN_VNET_LE;
                else
                        *flags &= ~TUN_VNET_LE;
                return 0;

        case TUNGETVNETBE:
                return tun_get_vnet_be(*flags, sp);

        case TUNSETVNETBE:
                return tun_set_vnet_be(flags, sp);

        default:
                return -EINVAL;
        }

        /* Only the two simple "get" commands fall out of the switch. */
        return put_user(val, sp) ? -EFAULT : 0;
}

/*
 * Read a virtio-net header of configured size @sz from @from into @hdr.
 *
 * Only sizeof(*hdr) bytes are copied; the remaining @sz - sizeof(*hdr) bytes
 * are skipped.  When VIRTIO_NET_HDR_F_NEEDS_CSUM is set, hdr->hdr_len is
 * raised to at least csum_start + csum_offset + 2 and written back.
 *
 * Returns the resulting header length, -EINVAL when @from holds fewer than
 * @sz bytes or fewer than the header length after the copy, or -EFAULT when
 * the copy fails.
 */
static inline int tun_vnet_hdr_get(int sz, unsigned int flags,
                                   struct iov_iter *from,
                                   struct virtio_net_hdr *hdr)
{
        u16 hdr_len;

        if (iov_iter_count(from) < sz)
                return -EINVAL;

        if (!copy_from_iter_full(hdr, sizeof(*hdr), from))
                return -EFAULT;

        hdr_len = tun_vnet16_to_cpu(flags, hdr->hdr_len);

        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                int csum_end = tun_vnet16_to_cpu(flags, hdr->csum_start) +
                               tun_vnet16_to_cpu(flags, hdr->csum_offset) + 2;

                hdr_len = max(csum_end, hdr_len);
                hdr->hdr_len = cpu_to_tun_vnet16(flags, hdr_len);
        }

        if (hdr_len > iov_iter_count(from))
                return -EINVAL;

        /* Skip whatever the configured size carries beyond the basic header. */
        iov_iter_advance(from, sz - sizeof(*hdr));

        return hdr_len;
}

/*
 * Write @hdr to @iter, then zero-fill up to the configured header size @sz.
 *
 * Returns 0 on success, -EINVAL when @iter has fewer than @sz bytes of room,
 * or -EFAULT when either the copy or the zero-fill comes up short.
 */
static inline int tun_vnet_hdr_put(int sz, struct iov_iter *iter,
                                   const struct virtio_net_hdr *hdr)
{
        size_t padding = sz - sizeof(*hdr);

        if (unlikely(iov_iter_count(iter) < sz))
                return -EINVAL;

        if (unlikely(copy_to_iter(hdr, sizeof(*hdr), iter) != sizeof(*hdr)))
                return -EFAULT;

        if (iov_iter_zero(padding, iter) != padding)
                return -EFAULT;

        return 0;
}

/*
 * Apply @hdr to @skb via virtio_net_hdr_to_skb(), with the endianness taken
 * from @flags.  Returns that helper's result unchanged.
 */
static inline int tun_vnet_hdr_to_skb(unsigned int flags, struct sk_buff *skb,
                                      const struct virtio_net_hdr *hdr)
{
        bool little_endian = tun_vnet_is_little_endian(flags);

        return virtio_net_hdr_to_skb(skb, hdr, little_endian);
}

static inline int tun_vnet_hdr_from_skb(unsigned int flags,
                                        const struct net_device *dev,
                                        const struct sk_buff *skb,
                                        struct virtio_net_hdr *hdr)
{
        int vlan_hlen = skb_vlan_tag_present(skb) ? VLAN_HLEN : 0;

        if (virtio_net_hdr_from_skb(skb, hdr,
                                    tun_vnet_is_little_endian(flags), true,
                                    vlan_hlen)) {
                struct skb_shared_info *sinfo = skb_shinfo(skb);

                if (net_ratelimit()) {
                        netdev_err(dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
                                   sinfo->gso_type, tun_vnet16_to_cpu(flags, hdr->gso_size),
                                   tun_vnet16_to_cpu(flags, hdr->hdr_len));
                        print_hex_dump(KERN_ERR, "tun: ",
                                       DUMP_PREFIX_NONE,
                                       16, 1, skb->head,
                                       min(tun_vnet16_to_cpu(flags, hdr->hdr_len), 64), true);
                }
                WARN_ON_ONCE(1);
                return -EINVAL;
        }

        return 0;
}

#endif /* TUN_VNET_H */





































































   18 

















    8 

  440 


  445 































  440 
  439 











































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Access vector cache interface for object managers.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/audit.h>
#include <linux/lsm_audit.h>
#include <linux/in6.h>
#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/*
 * An entry in the AVC.
 */
struct avc_entry;

struct task_struct;
struct inode;
struct sock;
struct sk_buff;

/*
 * AVC statistics: one unsigned counter per event kind.  Declared per-CPU
 * further down when CONFIG_SECURITY_SELINUX_AVC_STATS is set.
 */
struct avc_cache_stats {
        unsigned int lookups;
        unsigned int misses;
        unsigned int allocations;
        unsigned int reclaims;
        unsigned int frees;
};

/*
 * We only need this data after we have decided to send an audit message.
 * The fields carry the same names and types as the leading parameters of
 * slow_avc_audit(); the layout is randomized (__randomize_layout).
 */
struct selinux_audit_data {
        u32 ssid;
        u32 tsid;
        u16 tclass;
        u32 requested;
        u32 audited;
        u32 denied;
        int result;
} __randomize_layout;

/*
 * AVC operations
 */

void __init avc_init(void);

/*
 * Work out which of the @requested permissions must be audited.
 *
 * @requested: requested permissions
 * @avd:       access vector decision (allowed / auditallow / auditdeny masks)
 * @result:    result of the permission check
 * @auditdeny: extra mask consulted only when something was denied (see below)
 * @deniedp:   out: the permissions treated as denied
 *
 * Returns the mask of permissions to audit; zero means nothing to audit.
 */
static inline u32 avc_audit_required(u32 requested, struct av_decision *avd,
                                     int result, u32 auditdeny, u32 *deniedp)
{
        u32 denied = requested & ~avd->allowed;
        u32 audited;

        if (unlikely(denied)) {
                /*
                 * auditdeny is TRICKY!  When the caller passes a non-zero
                 * auditdeny mask and none of its bits are set in
                 * avd->auditdeny (a cleared bit means the policy has an
                 * explicit dontaudit rule for that permission), NO denial
                 * is audited at all.  Note that this is unrelated to which
                 * permissions were actually denied.  For example, with:
                 *
                 * denied == READ
                 * avd.auditdeny & ACCESS == 0 (not set means explicit rule)
                 * auditdeny & ACCESS == 1
                 *
                 * the denial is NOT audited even though the denied
                 * permission was READ and the auditdeny check was for
                 * ACCESS.
                 */
                if (auditdeny && !(auditdeny & avd->auditdeny))
                        audited = 0;
                else
                        audited = denied & avd->auditdeny;
        } else if (result) {
                /* Nothing masked out, yet the check failed: audit it all. */
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
        }

        *deniedp = denied;
        return audited;
}

int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested, u32 audited,
                   u32 denied, int result, struct common_audit_data *a);

/**
 * avc_audit - Audit the granting or denial of permissions.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions
 * @avd: access vector decisions
 * @result: result from avc_has_perm_noaudit
 * @a:  auxiliary audit data
 *
 * Audits the granting or denial of permissions as dictated by the
 * policy.  Typically invoked by avc_has_perm() once a permission check
 * is done, but callers of avc_has_perm_noaudit() may also call it
 * directly to keep the permission check and the auditing apart.  That
 * split is handy when the check has to run under a lock: the lock can
 * be dropped before the auditing code runs.
 *
 * Returns 0 when nothing needs auditing, otherwise the return value of
 * slow_avc_audit().
 */
static inline int avc_audit(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                            struct av_decision *avd, int result,
                            struct common_audit_data *a)
{
        u32 denied;
        u32 audited = avc_audit_required(requested, avd, result, 0, &denied);

        if (unlikely(audited))
                return slow_avc_audit(ssid, tsid, tclass, requested, audited,
                                      denied, result, a);

        return 0;
}

#define AVC_STRICT           1 /* Ignore permissive mode. */
#define AVC_EXTENDED_PERMS 2 /* update extended permissions */
int avc_has_perm_noaudit(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                         unsigned int flags, struct av_decision *avd);

int avc_has_perm(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                 struct common_audit_data *auditdata);

#define AVC_EXT_IOCTL        (1 << 0) /* Cache entry for an ioctl extended permission */
#define AVC_EXT_NLMSG        (1 << 1) /* Cache entry for an nlmsg extended permission */
int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 base_perm, u8 perm,
                           struct common_audit_data *ad);

u32 avc_policy_seqno(void);

#define AVC_CALLBACK_GRANT                1
#define AVC_CALLBACK_TRY_REVOKE                2
#define AVC_CALLBACK_REVOKE                4
#define AVC_CALLBACK_RESET                8
#define AVC_CALLBACK_AUDITALLOW_ENABLE        16
#define AVC_CALLBACK_AUDITALLOW_DISABLE 32
#define AVC_CALLBACK_AUDITDENY_ENABLE        64
#define AVC_CALLBACK_AUDITDENY_DISABLE        128
#define AVC_CALLBACK_ADD_XPERMS                256

int avc_add_callback(int (*callback)(u32 event), u32 events);

/* Exported to selinuxfs */
int avc_get_hash_stats(char *page);
unsigned int avc_get_cache_threshold(void);
void avc_set_cache_threshold(unsigned int cache_threshold);

#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats);
#endif

#endif /* _SELINUX_AVC_H_ */
















































































 1027 




 1026 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/kernel/irq.c
 *
 * Copyright (C) 1992 Linus Torvalds
 * Modifications for ARM processor Copyright (C) 1995-2000 Russell King.
 * Support for Dynamic Tick Timer Copyright (C) 2004-2005 Nokia Corporation.
 * Dynamic Tick Timer written by Tony Lindgren <tony@atomide.com> and
 * Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>.
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/irq.h>
#include <linux/irqchip.h>
#include <linux/kprobes.h>
#include <linux/memory.h>
#include <linux/scs.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/vmalloc.h>
#include <asm/daifflags.h>
#include <asm/exception.h>
#include <asm/numa.h>
#include <asm/softirq_stack.h>
#include <asm/stacktrace.h>
#include <asm/vmap_stack.h>

/* Only access this in an NMI enter/exit */
DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts);

/* Per-CPU IRQ stack pointer; filled in for every possible CPU by init_irq_stacks(). */
DEFINE_PER_CPU(unsigned long *, irq_stack_ptr);


/*
 * The shadow call stack pointer is declared unconditionally, because
 * init_irq_scs() references it in all configurations, but storage is only
 * defined when CONFIG_SHADOW_CALL_STACK is set.
 */
DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr);

#ifdef CONFIG_SHADOW_CALL_STACK
DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr);
#endif

static void init_irq_scs(void)
{
        int cpu;

        if (!scs_is_enabled())
                return;

        for_each_possible_cpu(cpu)
                per_cpu(irq_shadow_call_stack_ptr, cpu) =
                        scs_alloc(early_cpu_to_node(cpu));
}

#ifdef CONFIG_VMAP_STACK
/*
 * CONFIG_VMAP_STACK variant: give each possible CPU an IRQ stack of
 * IRQ_STACK_SIZE obtained from arch_alloc_vmap_stack() on the CPU's node,
 * and publish it through irq_stack_ptr.
 */
static void __init init_irq_stacks(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                int node = early_cpu_to_node(cpu);
                unsigned long *stack = arch_alloc_vmap_stack(IRQ_STACK_SIZE, node);

                per_cpu(irq_stack_ptr, cpu) = stack;
        }
}
#else
/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */
DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack);

static void init_irq_stacks(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu);
}
#endif

#ifndef CONFIG_PREEMPT_RT
/*
 * Adapter giving __do_softirq() the (struct pt_regs *) signature that
 * call_on_irq_stack() is handed below; @regs is ignored.
 */
static void ____do_softirq(struct pt_regs *regs)
{
        __do_softirq();
}

/* Run softirq processing through call_on_irq_stack(), passing NULL regs. */
void do_softirq_own_stack(void)
{
        call_on_irq_stack(NULL, ____do_softirq);
}
#endif

/* Placeholder root IRQ handler: panics, since no real handler was registered. */
static void default_handle_irq(struct pt_regs *regs)
{
        panic("IRQ taken without a root IRQ handler\n");
}

/* Placeholder root FIQ handler: panics, since no real handler was registered. */
static void default_handle_fiq(struct pt_regs *regs)
{
        panic("FIQ taken without a root FIQ handler\n");
}

/*
 * Root handler pointers.  They start out at the panicking defaults above and
 * are replaced by set_handle_irq() / set_handle_fiq(); __ro_after_init makes
 * them read-only once init is over.
 */
void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq;
void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq;

/*
 * Install @handle_irq as the root IRQ handler.
 *
 * Succeeds only while the default (panicking) handler is still in place.
 * Returns 0 on success or -EBUSY when a handler was already installed.
 */
int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
{
        if (handle_arch_irq == default_handle_irq) {
                handle_arch_irq = handle_irq;
                pr_info("Root IRQ handler: %ps\n", handle_irq);
                return 0;
        }

        return -EBUSY;
}

/*
 * Install @handle_fiq as the root FIQ handler.
 *
 * Succeeds only while the default (panicking) handler is still in place.
 * Returns 0 on success or -EBUSY when a handler was already installed.
 */
int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *))
{
        if (handle_arch_fiq == default_handle_fiq) {
                handle_arch_fiq = handle_fiq;
                pr_info("Root FIQ handler: %ps\n", handle_fiq);
                return 0;
        }

        return -EBUSY;
}

/*
 * Boot-time IRQ setup: create the IRQ stacks, then the IRQ shadow call
 * stacks, then initialise the irqchip layer.  On systems that use IRQ
 * priority masking, finish by putting the DAIF state into
 * DAIF_PROCCTX_NOIRQ.
 */
void __init init_IRQ(void)
{
        init_irq_stacks();
        init_irq_scs();
        irqchip_init();

        if (!system_uses_irq_prio_masking())
                return;

        /*
         * Now that we have a stack for our IRQ handler, set
         * the PMR/PSR pair to a consistent state.
         */
        WARN_ON(read_sysreg(daif) & PSR_A_BIT);
        local_daif_restore(DAIF_PROCCTX_NOIRQ);
}


































  974 




















































  347 


























  347 































  246 
  246 
























    9 


    9 


























































  112 

































  112 
  112 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/anon_inodes.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 *  Thanks to Arnd Bergmann for code review and suggestions.
 *  More changes for Thomas Gleixner suggestions.
 *
 */

#include <linux/cred.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/magic.h>
#include <linux/anon_inodes.h>
#include <linux/pseudo_fs.h>

#include <linux/uaccess.h>

/* Mount used for every anon-inode file; its superblock backs newly allocated inodes. */
static struct vfsmount *anon_inode_mnt __ro_after_init;
/* The shared inode handed out when a caller does not ask for a fresh one. */
static struct inode *anon_inode_inode __ro_after_init;

/*
 * anon_inodefs_dname() is called from d_path().  It formats the dentry's
 * name into @buffer as "anon_inode:<name>" via dynamic_dname().
 */
static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        const unsigned char *class_name = dentry->d_name.name;

        return dynamic_dname(buffer, buflen, "anon_inode:%s", class_name);
}

/* Dentry operations for anon_inodefs: only the name formatter is provided. */
static const struct dentry_operations anon_inodefs_dentry_operations = {
        .d_dname        = anon_inodefs_dname,
};

/*
 * Set up @fc as a pseudo filesystem context with ANON_INODE_FS_MAGIC and
 * attach the anon_inodefs dentry operations.
 *
 * Returns 0 on success or -ENOMEM when init_pseudo() yields no context.
 */
static int anon_inodefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx;

        ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC);
        if (ctx) {
                ctx->dops = &anon_inodefs_dentry_operations;
                return 0;
        }

        return -ENOMEM;
}

/* Filesystem type descriptor for "anon_inodefs". */
static struct file_system_type anon_inode_fs_type = {
        .name                = "anon_inodefs",
        .init_fs_context = anon_inodefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Allocate a fresh anonymous inode on the anon_inode mount's superblock,
 * clear S_PRIVATE on it and run the security_inode_init_security_anon()
 * hook with @name and @context_inode.
 *
 * Returns the new inode, the error pointer from alloc_anon_inode(), or
 * ERR_PTR() of the hook's error (the inode is released in that case).
 */
static struct inode *anon_inode_make_secure_inode(
        const char *name,
        const struct inode *context_inode)
{
        struct inode *inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        int error;

        if (IS_ERR(inode))
                return inode;

        inode->i_flags &= ~S_PRIVATE;

        error = security_inode_init_security_anon(inode, &QSTR(name),
                                                  context_inode);
        if (!error)
                return inode;

        iput(inode);
        return ERR_PTR(error);
}

/*
 * Common worker behind the anon_inode_*getfile() entry points.
 *
 * Takes a reference on fops->owner (when set), picks the inode - a fresh
 * secure inode when @make_inode is true, the shared anon inode otherwise -
 * and wraps it in a pseudo file carrying @priv as private_data.  Only the
 * O_ACCMODE and O_NONBLOCK bits of @flags are passed on.
 *
 * Returns the new file or an error pointer: -ENOENT when the module
 * reference cannot be taken, -ENODEV when the shared inode is an error
 * pointer, or the error from inode/file allocation.  Every failure path
 * past the module_get drops the module reference again.
 */
static struct file *__anon_inode_getfile(const char *name,
                                         const struct file_operations *fops,
                                         void *priv, int flags,
                                         const struct inode *context_inode,
                                         bool make_inode)
{
        struct inode *inode;
        struct file *file;

        if (fops->owner && !try_module_get(fops->owner))
                return ERR_PTR(-ENOENT);

        if (!make_inode) {
                inode = anon_inode_inode;
                if (IS_ERR(inode)) {
                        file = ERR_PTR(-ENODEV);
                        goto out_module_put;
                }
                /*
                 * We know the anon_inode inode count is always
                 * greater than zero, so ihold() is safe.
                 */
                ihold(inode);
        } else {
                inode = anon_inode_make_secure_inode(name, context_inode);
                if (IS_ERR(inode)) {
                        file = ERR_CAST(inode);
                        goto out_module_put;
                }
        }

        file = alloc_file_pseudo(inode, anon_inode_mnt, name,
                                 flags & (O_ACCMODE | O_NONBLOCK), fops);
        if (!IS_ERR(file)) {
                file->f_mapping = inode->i_mapping;
                file->private_data = priv;
                return file;
        }

        /* File allocation failed: give back the inode reference first. */
        iput(inode);
out_module_put:
        module_put(fops->owner);
        return file;
}

/**
 * anon_inode_getfile - creates a new file instance by hooking it up to an
 *                      anonymous inode, and a dentry that describe the "class"
 *                      of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Creates a new file by hooking it on a single inode. This is useful for files
 * that do not need to have a full-fledged inode in order to operate correctly.
 * All the files created with anon_inode_getfile() will share a single inode,
 * hence saving memory and avoiding code duplication for the file/inode/dentry
 * setup.  Implemented as __anon_inode_getfile() with no context inode and
 * without creating a new inode.  Returns the newly created file* or an error
 * pointer.
 */
struct file *anon_inode_getfile(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags)
{
        return __anon_inode_getfile(name, fops, priv, flags, NULL, false);
}
EXPORT_SYMBOL_GPL(anon_inode_getfile);

/**
 * anon_inode_getfile_fmode - creates a new file instance by hooking it up to an
 *                      anonymous inode, and a dentry that describe the "class"
 *                      of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @f_mode:  [in]    fmode
 *
 * Same as anon_inode_getfile(): the new file is hooked onto the single shared
 * anonymous inode, which suits files that do not need a full-fledged inode
 * and saves memory and duplicated file/inode/dentry setup code.  In addition,
 * the bits in @f_mode are OR-ed into the new file's f_mode on success.
 * Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_getfile_fmode(const char *name,
                                const struct file_operations *fops,
                                void *priv, int flags, fmode_t f_mode)
{
        struct file *file = __anon_inode_getfile(name, fops, priv, flags,
                                                 NULL, false);

        if (IS_ERR(file))
                return file;

        file->f_mode |= f_mode;
        return file;
}
EXPORT_SYMBOL_GPL(anon_inode_getfile_fmode);

/**
 * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
 *                             !S_PRIVATE anon inode rather than reuse the
 *                             singleton anon inode and calls the
 *                             inode_init_security_anon() LSM hook.
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @context_inode:
 *           [in]    the logical relationship with the new inode (optional)
 *
 * Creates a new anonymous inode together with a file that refers to it.
 * There are two reasons a caller may want this:
 *
 * - the inode gets its own security context, so that LSMs can enforce
 *   policy on the inode's creation;
 *
 * - the caller needs a unique inode, for example in order to customize
 *   the size returned by fstat()
 *
 * The LSM may use @context_inode in inode_init_security_anon(), but a
 * reference to it is not held.
 *
 * Returns the newly created file* or an error pointer.
 */
struct file *anon_inode_create_getfile(const char *name,
                                       const struct file_operations *fops,
                                       void *priv, int flags,
                                       const struct inode *context_inode)
{
        struct file *file;

        /* make_inode is true: request a fresh inode instead of the singleton. */
        file = __anon_inode_getfile(name, fops, priv, flags,
                                    context_inode, true);

        return file;
}
EXPORT_SYMBOL_GPL(anon_inode_create_getfile);

/*
 * Common helper behind anon_inode_getfd() and anon_inode_create_getfd():
 * reserve a file descriptor, build the file via __anon_inode_getfile() and
 * install it.  Returns the new descriptor, or a negative error code; on
 * failure to create the file the reserved descriptor is released again.
 */
static int __anon_inode_getfd(const char *name,
                              const struct file_operations *fops,
                              void *priv, int flags,
                              const struct inode *context_inode,
                              bool make_inode)
{
        struct file *file;
        int fd;

        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                return fd;

        file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
                                    make_inode);
        if (IS_ERR(file)) {
                /* Give the reserved descriptor back before reporting the error. */
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        fd_install(fd, file);
        return fd;
}

/**
 * anon_inode_getfd - create a file descriptor for a new file backed by the
 *                    shared anonymous inode
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 *
 * Hooks a new file instance up to an anonymous inode and a dentry that
 * describes the "class" of the file.  Useful for files that do not need a
 * full-fledged inode in order to operate correctly.  All the files created
 * with anon_inode_getfd() use the same singleton inode, reducing memory use
 * and avoiding code duplication for the file/inode/dentry setup.
 *
 * Returns a newly created file descriptor or an error code.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
                     void *priv, int flags)
{
        int fd;

        /* Singleton-inode variant: NULL context inode, make_inode is false. */
        fd = __anon_inode_getfd(name, fops, priv, flags, NULL, false);

        return fd;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);

/**
 * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
 * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
 * the inode_init_security_anon() LSM hook.
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops:    [in]    file operations for the new file
 * @priv:    [in]    private data for the new file (will be file's private_data)
 * @flags:   [in]    flags
 * @context_inode:
 *           [in]    the logical relationship with the new inode (optional)
 *
 * Creates a new anonymous inode together with a file that refers to it.
 * There are two reasons a caller may want this:
 *
 * - the inode gets its own security context, so that LSMs can enforce
 *   policy on the inode's creation;
 *
 * - the caller needs a unique inode, for example in order to customize
 *   the size returned by fstat()
 *
 * The LSM may use @context_inode in inode_init_security_anon(), but a
 * reference to it is not held.
 *
 * Returns a newly created file descriptor or an error code.
 */
int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
                            void *priv, int flags,
                            const struct inode *context_inode)
{
        int fd;

        /* make_inode is true: request a fresh inode instead of the singleton. */
        fd = __anon_inode_getfd(name, fops, priv, flags, context_inode, true);

        return fd;
}


/*
 * Boot-time setup: mount the anon inode filesystem and allocate the shared
 * anonymous inode.  Either failure panics, so this only ever returns 0.
 */
static int __init anon_inode_init(void)
{
        anon_inode_mnt = kern_mount(&anon_inode_fs_type);
        if (IS_ERR(anon_inode_mnt)) {
                panic("anon_inode_init() kernel mount failed (%ld)\n",
                      PTR_ERR(anon_inode_mnt));
        }

        /* The singleton inode lives on the superblock of the mount above. */
        anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb);
        if (IS_ERR(anon_inode_inode)) {
                panic("anon_inode_init() inode allocation failed (%ld)\n",
                      PTR_ERR(anon_inode_inode));
        }

        return 0;
}

fs_initcall(anon_inode_init);



















 1256 
 1254 











 1254 
 1258 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BH_H
#define _LINUX_BH_H

#include <linux/instruction_pointer.h>
#include <linux/preempt.h>

/*
 * __local_bh_disable_ip() is provided out of line when CONFIG_PREEMPT_RT or
 * CONFIG_TRACE_IRQFLAGS is set.  Otherwise it is inlined here: @cnt is added
 * to the preempt count and @ip is unused.
 */
#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS)
extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt);
#else
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
        preempt_count_add(cnt);
        /* NOTE(review): presumably keeps later code from being moved above the count update - confirm. */
        barrier();
}
#endif

/*
 * Calls __local_bh_disable_ip() with the current instruction pointer and
 * SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_disable(void)
{
        const unsigned long here = _THIS_IP_;

        __local_bh_disable_ip(here, SOFTIRQ_DISABLE_OFFSET);
}

/* Out-of-line enable primitives; defined outside this header. */
extern void _local_bh_enable(void);
extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt);

/*
 * Calls __local_bh_enable_ip() with the caller-supplied @ip and
 * SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_enable_ip(unsigned long ip)
{
        const unsigned int cnt = SOFTIRQ_DISABLE_OFFSET;

        __local_bh_enable_ip(ip, cnt);
}

/*
 * Calls __local_bh_enable_ip() with the current instruction pointer and
 * SOFTIRQ_DISABLE_OFFSET as the count.
 */
static inline void local_bh_enable(void)
{
        const unsigned long here = _THIS_IP_;

        __local_bh_enable_ip(here, SOFTIRQ_DISABLE_OFFSET);
}

/*
 * local_bh_blocked() is a real out-of-line function only when
 * CONFIG_PREEMPT_RT is set; otherwise it is a stub that always returns false.
 */
#ifdef CONFIG_PREEMPT_RT
extern bool local_bh_blocked(void);
#else
static inline bool local_bh_blocked(void) { return false; }
#endif

#endif /* _LINUX_BH_H */


























































































































































  776 
































































































  776 


  776 








  776 































































































































































































































































  774 
  777 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate.h>
#include <linux/pgalloc_tag.h>

/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, we must modify struct page itself to store extra data per page.
 * This requires rebuilding the kernel, which is a really time-consuming process.
 * And, sometimes, rebuild is impossible due to third party module dependency.
 * At last, enlarging struct page could cause un-wanted system behaviour change.
 *
 * This feature is intended to overcome above mentioned problems. This feature
 * allocates memory for extended data per page in certain place rather than
 * the struct page itself. This memory can be accessed by the accessor
 * functions provided by this code. During the boot process, it checks whether
 * allocation of huge chunk of memory is needed or not. If not, it avoids
 * allocating memory at all. With this advantage, we can include this feature
 * into the kernel in default and can avoid rebuild and solve related problems.
 *
 * To help these things to work well, there are two callbacks for clients. One
 * is the need callback which is mandatory if user wants to avoid useless
 * memory allocation at boot-time. The other is optional, init callback, which
 * is used to do proper initialization after memory is allocated.
 *
 * The need callback is used to decide whether extended memory allocation is
 * needed or not. Sometimes users want to deactivate some features in this
 * boot and extra memory would be unnecessary. In this case, to avoid
 * allocating a huge chunk of memory, each client represents its need of
 * extra memory through the need callback. If one of the need callbacks
 * returns true, it means that someone needs extra memory so that
 * page extension core should allocate memory for page extension. If
 * none of need callbacks return true, memory isn't needed at all in this boot
 * and page extension core can skip to allocate memory. As result,
 * none of memory is wasted.
 *
 * When need callback returns true, page_ext checks if there is a request for
 * extra memory through size in struct page_ext_operations. If it is non-zero,
 * extra space is allocated for each page_ext entry and offset is returned to
 * user through offset in struct page_ext_operations.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely initialized. In sparse memory system, extra memory is
 * allocated some time later than memmap is allocated. In other words, lifetime
 * of memory for page extension isn't same with memmap for struct page.
 * Therefore, clients can't store extra data until page extension is
 * initialized, even if pages are allocated and used freely. This could
 * cause inadequate state of extra data per page, so, to prevent it, client
 * can utilize this callback to initialize the state of it correctly.
 */

/*
 * Tag added to a section's page_ext pointer by __invalidate_page_ext().
 * page_ext_invalid() tests for it so that lookups treat the section's
 * page_ext as unusable.
 */
#ifdef CONFIG_SPARSEMEM
#define PAGE_EXT_INVALID       (0x1)
#endif

/*
 * page_ext client for the page idle flag; only built when
 * CONFIG_PAGE_IDLE_FLAG is set on a non-64-bit kernel.
 */
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
/* The need callback: this client unconditionally asks for page_ext. */
static bool need_page_idle(void)
{
        return true;
}
/* Sets need_shared_flags and requests no extra per-page space (no .size). */
static struct page_ext_operations page_idle_ops __initdata = {
        .need = need_page_idle,
        .need_shared_flags = true,
};
#endif

/*
 * Table of page_ext clients compiled into this kernel.  It is walked by
 * invoke_need_callbacks() and invoke_init_callbacks(); each entry is only
 * present when its config option is enabled.
 */
static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
        &page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
        &page_idle_ops,
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        &page_alloc_tagging_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
        &page_table_check_ops,
#endif
};

/*
 * Size in bytes of one per-page entry.  Accumulated by
 * invoke_need_callbacks() and used as the stride in get_entry().
 */
unsigned long page_ext_size;

/* Running total of bytes allocated for page_ext tables; printed at init. */
static unsigned long total_usage;

#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
/*
 * To ensure correct allocation tagging for pages, page_ext should be available
 * before the first page allocation. Otherwise early task stacks will be
 * allocated before page_ext initialization and missing tags will be flagged.
 */
bool early_page_ext __meminitdata = true;
#else
/* No initializer here; the "early_page_ext" boot parameter can set it to true. */
bool early_page_ext __meminitdata;
#endif
/*
 * Handler for the "early_page_ext" boot parameter: sets early_page_ext to
 * true.  The parameter's value string is ignored.  Always returns 0.
 */
static int __init setup_early_page_ext(char *str)
{
        early_page_ext = true;
        return 0;
}
early_param("early_page_ext", setup_early_page_ext);

/*
 * Ask every registered client whether it needs page_ext and lay out the
 * per-page entry accordingly.
 *
 * Pass 1 reserves room for struct page_ext itself as soon as one needed
 * client has need_shared_flags set.  Pass 2 gives each needed client its
 * offset inside the entry and grows page_ext_size by the client's size.
 *
 * Returns true if at least one client's need callback returned true.
 */
static bool __init invoke_need_callbacks(void)
{
        const int nr_ops = ARRAY_SIZE(page_ext_ops);
        bool any_needed = false;
        int idx;

        for (idx = 0; idx < nr_ops; idx++) {
                struct page_ext_operations *ops = page_ext_ops[idx];

                if (!ops->need() || !ops->need_shared_flags)
                        continue;

                page_ext_size = sizeof(struct page_ext);
                break;
        }

        for (idx = 0; idx < nr_ops; idx++) {
                struct page_ext_operations *ops = page_ext_ops[idx];

                if (!ops->need())
                        continue;

                ops->offset = page_ext_size;
                page_ext_size += ops->size;
                any_needed = true;
        }

        return any_needed;
}

/* Run the init callback of every registered client that provides one. */
static void __init invoke_init_callbacks(void)
{
        const int nr_ops = ARRAY_SIZE(page_ext_ops);
        int idx;

        for (idx = 0; idx < nr_ops; idx++) {
                struct page_ext_operations *ops = page_ext_ops[idx];

                /* The init callback may be absent. */
                if (!ops->init)
                        continue;

                ops->init();
        }
}

/*
 * Return the entry at position @index in the table starting at @base.
 * Entries are page_ext_size bytes apart.
 */
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
        unsigned long byte_offset = page_ext_size * index;

        return base + byte_offset;
}

#ifndef CONFIG_SPARSEMEM
/*
 * Late init step for the !CONFIG_SPARSEMEM build: runs the clients' init
 * callbacks.
 */
void __init page_ext_init_flatmem_late(void)
{
        invoke_init_callbacks();
}

/*
 * Reset the node's page_ext table pointer.  lookup_page_ext() returns NULL
 * for this node while the pointer is NULL.
 */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
        pgdat->node_page_ext = NULL;
}

/*
 * Find the page_ext entry for @page in its node's table (!CONFIG_SPARSEMEM).
 * Warns once if the RCU read lock is not held.  Returns NULL when the node
 * has no table yet.
 */
static struct page_ext *lookup_page_ext(const struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        int nid = page_to_nid(page);
        unsigned long first_pfn;
        struct page_ext *table;

        WARN_ON_ONCE(!rcu_read_lock_held());
        table = NODE_DATA(nid)->node_page_ext;
        /*
         * The page allocator's sanity checks on free can get here before
         * the page_ext arrays exist, when a range of pages is handed to
         * the allocator for the first time during bootup or memory hotplug.
         */
        if (unlikely(!table))
                return NULL;

        /* The table starts at the node start rounded down to MAX_ORDER_NR_PAGES. */
        first_pfn = round_down(node_start_pfn(nid), MAX_ORDER_NR_PAGES);
        return get_entry(table, pfn - first_pfn);
}

/*
 * Allocate the page_ext table for node @nid from memblock and publish it in
 * the node's pglist_data.  Returns 0 on success or when the node spans no
 * pages, and -ENOMEM if the allocation fails.
 */
static int __init alloc_node_page_ext(int nid)
{
        unsigned long span = NODE_DATA(nid)->node_spanned_pages;
        unsigned long bytes;
        struct page_ext *table;
        bool aligned;

        if (span == 0)
                return 0;

        /*
         * If either end of the node is not aligned to MAX_ORDER_NR_PAGES,
         * reserve extra entries: the buddy algorithm may look at a buddy
         * that lies outside the exact node range.
         */
        aligned = IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) &&
                  IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES);
        if (!aligned)
                span += MAX_ORDER_NR_PAGES;

        bytes = page_ext_size * span;

        table = memblock_alloc_try_nid(bytes, PAGE_SIZE,
                                       __pa(MAX_DMA_ADDRESS),
                                       MEMBLOCK_ALLOC_ACCESSIBLE, nid);
        if (!table)
                return -ENOMEM;

        NODE_DATA(nid)->node_page_ext = table;
        total_usage += bytes;
        memmap_boot_pages_add(DIV_ROUND_UP(bytes, PAGE_SIZE));
        return 0;
}

/*
 * Boot-time page_ext setup for the !CONFIG_SPARSEMEM build.  Does nothing if
 * no client needs page_ext; otherwise allocates a table for every online
 * node and panics if any allocation fails.
 */
void __init page_ext_init_flatmem(void)
{
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_online_node(nid) {
                if (alloc_node_page_ext(nid)) {
                        pr_crit("allocation of page_ext failed.\n");
                        panic("Out of memory");
                }
        }

        pr_info("allocated %ld bytes of page_ext\n", total_usage);
}

#else /* CONFIG_SPARSEMEM */
static bool page_ext_invalid(struct page_ext *page_ext)
{
        return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
}

/*
 * Find the page_ext entry for @page through its memory section
 * (CONFIG_SPARSEMEM).  Warns once if the RCU read lock is not held.
 * Returns NULL when the section's page_ext pointer is NULL or marked invalid.
 */
static struct page_ext *lookup_page_ext(const struct page *page)
{
        unsigned long pfn = page_to_pfn(page);
        struct mem_section *ms = __pfn_to_section(pfn);
        struct page_ext *table = READ_ONCE(ms->page_ext);

        WARN_ON_ONCE(!rcu_read_lock_held());

        /*
         * The page allocator's sanity checks on free can get here before
         * the page_ext arrays exist, when a range of pages is handed to
         * the allocator for the first time during bootup or memory hotplug.
         */
        if (!page_ext_invalid(table))
                return get_entry(table, pfn);

        return NULL;
}

/*
 * Allocate @size zeroed bytes for a page_ext table on node @nid.  Tries
 * alloc_pages_exact_nid() first and falls back to vzalloc_node().  On
 * success the memmap page accounting is increased.  Returns NULL if both
 * allocators fail.
 */
static void *__meminit alloc_page_ext(size_t size, int nid)
{
        gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
        void *table;

        table = alloc_pages_exact_nid(nid, size, flags);
        if (table) {
                /* Only the page-allocator path is registered with kmemleak here. */
                kmemleak_alloc(table, size, 1, flags);
        } else {
                table = vzalloc_node(size, nid);
                if (!table)
                        return NULL;
        }

        memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
        return table;
}

/*
 * Make sure the memory section containing @pfn has a page_ext table,
 * allocating one on node @nid if needed.  Returns 0 if the section already
 * has a table or one was set up, and -ENOMEM if the allocation failed.
 */
static int __meminit init_section_page_ext(unsigned long pfn, int nid)
{
        struct mem_section *ms = __pfn_to_section(pfn);
        const unsigned long bytes = page_ext_size * PAGES_PER_SECTION;
        struct page_ext *table;

        /* Nothing to do when the section is already populated. */
        if (ms->page_ext)
                return 0;

        table = alloc_page_ext(bytes, nid);

        /*
         * section->page_ext stores (table - pfn), which does not point into
         * the block allocated above, so kmemleak would report a false
         * positive without this.
         */
        kmemleak_not_leak(table);

        if (!table) {
                pr_err("page ext allocation failure\n");
                return -ENOMEM;
        }

        /*
         * The caller's "pfn" need not be section-aligned; mask it down
         * before biasing the stored pointer.
         */
        pfn &= PAGE_SECTION_MASK;
        ms->page_ext = (void *)table - page_ext_size * pfn;
        total_usage += bytes;
        return 0;
}

/*
 * Release one section's page_ext table at @addr and decrease the memmap
 * page accounting.  The table is freed with vfree() if it is a vmalloc
 * address, otherwise with free_pages_exact().
 */
static void free_page_ext(void *addr)
{
        const size_t bytes = page_ext_size * PAGES_PER_SECTION;

        memmap_pages_add(-1L * (DIV_ROUND_UP(bytes, PAGE_SIZE)));

        if (is_vmalloc_addr(addr)) {
                vfree(addr);
                return;
        }

        /* A reserved page here is treated as a fatal bug. */
        BUG_ON(PageReserved(virt_to_page(addr)));
        kmemleak_free(addr);
        free_pages_exact(addr, bytes);
}

/*
 * Detach and free the page_ext table of the section containing @pfn.
 * Does nothing if the section lookup yields NULL or the section has no
 * page_ext pointer.
 */
static void __free_page_ext(unsigned long pfn)
{
        struct mem_section *ms;
        struct page_ext *base;

        ms = __pfn_to_section(pfn);
        if (!ms || !ms->page_ext)
                return;

        base = READ_ONCE(ms->page_ext);
        /*
         * page_ext here can be valid while doing the roll back
         * operation in online_page_ext().
         */
        /* Strip the PAGE_EXT_INVALID tag, if present, to recover the stored pointer. */
        if (page_ext_invalid(base))
                base = (void *)base - PAGE_EXT_INVALID;
        WRITE_ONCE(ms->page_ext, NULL);

        /*
         * The stored pointer is biased by the section's first pfn (see
         * init_section_page_ext()); indexing by pfn yields the address to free.
         */
        base = get_entry(base, pfn);
        free_page_ext(base);
}

/*
 * Mark the page_ext pointer of the section containing @pfn as invalid by
 * adding PAGE_EXT_INVALID to it.  Does nothing if the section lookup yields
 * NULL or the section has no page_ext pointer.
 */
static void __invalidate_page_ext(unsigned long pfn)
{
        struct mem_section *section = __pfn_to_section(pfn);
        void *tagged;

        if (!section || !section->page_ext)
                return;

        tagged = (void *)section->page_ext + PAGE_EXT_INVALID;
        WRITE_ONCE(section->page_ext, tagged);
}

/*
 * Set up page_ext for every section overlapping
 * [start_pfn, start_pfn + nr_pages).  If a section fails, the sections
 * before it in the range are freed again.  Returns 0 on success, -ENOMEM
 * otherwise.
 */
static int __meminit online_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages,
                                int nid)
{
        const unsigned long first = SECTION_ALIGN_DOWN(start_pfn);
        const unsigned long last = SECTION_ALIGN_UP(start_pfn + nr_pages);
        unsigned long pfn, failed_pfn;

        if (nid == NUMA_NO_NODE) {
                /*
                 * Here the node already exists and contains valid memory.
                 * "start_pfn" is the pfn passed as an argument to
                 * online_pages(), so it should exist.
                 */
                nid = pfn_to_nid(start_pfn);
                VM_BUG_ON(!node_online(nid));
        }

        for (pfn = first; pfn < last; pfn += PAGES_PER_SECTION) {
                if (init_section_page_ext(pfn, nid))
                        goto rollback;
        }
        return 0;

rollback:
        /* Free every section before the one that failed. */
        failed_pfn = pfn;
        for (pfn = first; pfn < failed_pfn; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);

        return -ENOMEM;
}

/*
 * Tear down page_ext for every section overlapping
 * [start_pfn, start_pfn + nr_pages).  The range is widened to section
 * boundaries first.
 */
static void __meminit offline_page_ext(unsigned long start_pfn,
                                unsigned long nr_pages)
{
        unsigned long start, end, pfn;

        start = SECTION_ALIGN_DOWN(start_pfn);
        end = SECTION_ALIGN_UP(start_pfn + nr_pages);

        /*
         * Freeing of page_ext is done in 3 steps to avoid
         * use-after-free of it:
         * 1) Traverse all the sections and mark their page_ext
         *    as invalid.
         * 2) Wait for all the existing users of page_ext who
         *    started before invalidation to finish.
         * 3) Free the page_ext.
         */
        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __invalidate_page_ext(pfn);

        /* Step 2: lookups run under rcu_read_lock() (see page_ext_get()). */
        synchronize_rcu();

        for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
                __free_page_ext(pfn);
}

/*
 * Memory hotplug notifier.  Allocates page_ext when memory is about to go
 * online, and frees it when memory has gone offline or an online attempt
 * was cancelled.  All other actions are ignored.  The result is converted
 * with notifier_from_errno().
 */
static int __meminit page_ext_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *notify = arg;
        int err = 0;

        switch (action) {
        case MEM_GOING_ONLINE:
                err = online_page_ext(notify->start_pfn, notify->nr_pages,
                                      notify->status_change_nid);
                break;
        case MEM_OFFLINE:
        case MEM_CANCEL_ONLINE:
                offline_page_ext(notify->start_pfn, notify->nr_pages);
                break;
        case MEM_GOING_OFFLINE:
        case MEM_ONLINE:
        case MEM_CANCEL_OFFLINE:
        default:
                /* Nothing to do for these events. */
                break;
        }

        return notifier_from_errno(err);
}

/*
 * Boot-time page_ext setup for CONFIG_SPARSEMEM.  Does nothing if no client
 * needs page_ext.  Otherwise it allocates a table for each section of every
 * node with memory, registers the hotplug notifier and runs the clients'
 * init callbacks.  Panics if an allocation fails.
 */
void __init page_ext_init(void)
{
        int nid;

        if (!invoke_need_callbacks())
                return;

        for_each_node_state(nid, N_MEMORY) {
                const unsigned long node_start = node_start_pfn(nid);
                const unsigned long node_end = node_end_pfn(nid);
                unsigned long pfn = node_start;

                /*
                 * The node boundaries may not be section-aligned, and
                 * page->flags of pages outside the node are not
                 * initialized.  So start at the node's first pfn and then
                 * step to each following section boundary below node_end.
                 */
                while (pfn < node_end) {
                        /*
                         * Node pfn ranges can interleave; some archs lay
                         * nodes out like
                         * -------------pfn-------------->
                         * N0 | N1 | N2 | N0 | N1 | N2|....
                         * so skip pfns that belong to another node.
                         */
                        if (pfn_valid(pfn) && pfn_to_nid(pfn) == nid) {
                                if (init_section_page_ext(pfn, nid))
                                        panic("Out of memory");
                                cond_resched();
                        }
                        pfn = ALIGN(pfn + 1, PAGES_PER_SECTION);
                }
        }

        hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
        pr_info("allocated %ld bytes of page_ext\n", total_usage);
        invoke_init_callbacks();
}

/* Intentionally empty in the CONFIG_SPARSEMEM build: no per-node state is set here. */
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
{
}

#endif

/**
 * page_ext_lookup() - Find the page extension that belongs to a PFN.
 * @pfn: PFN of the page we're interested in.
 *
 * The caller must hold the RCU read lock and @pfn must be valid.
 *
 * Return: NULL if no page_ext exists for this page.
 */
struct page_ext *page_ext_lookup(unsigned long pfn)
{
        const struct page *page = pfn_to_page(pfn);

        return lookup_page_ext(page);
}

/**
 * page_ext_get() - Get the extended information for a page.
 * @page: The page we're interested in.
 *
 * Takes the RCU read lock and keeps it held when a page_ext is found, so
 * the page_ext remains valid until page_ext_put() is called.  If there is
 * no page_ext, the lock is dropped again before returning.
 *
 * Return: NULL if no page_ext exists for this page.
 * Context: Any context.  Caller may not sleep until they have called
 * page_ext_put().
 */
struct page_ext *page_ext_get(const struct page *page)
{
        struct page_ext *ext;

        rcu_read_lock();
        ext = lookup_page_ext(page);
        if (ext)
                return ext;

        /* Nothing found: do not leave the read-side section open. */
        rcu_read_unlock();
        return NULL;
}

/**
 * page_ext_put() - Working with page extended information is done.
 * @page_ext: Page extended information received from page_ext_get().
 *
 * Drops the RCU read lock taken by page_ext_get().  A NULL @page_ext is
 * ignored, matching the case where page_ext_get() returned NULL and already
 * dropped the lock.  The page extended information of the page may not be
 * valid after this function is called.
 *
 * Return: None.
 * Context: Any context with corresponding page_ext_get() is called.
 */
void page_ext_put(struct page_ext *page_ext)
{
        if (likely(page_ext))
                rcu_read_unlock();
}
































































































































































































































































































































































    7 




    7 





































    8 







  412 

















  413 
































   33 

   32 






































   37 









   39 























  305 
   73 
  254 


























  306 









































































    7 
    7 
    4 





    7 











   32 


















   33 
   33 
   33 






   33 












   32 














   31 






   33 





   33 

   33 






   33 









   33 
















   33 






   33 




































































   32 





















    1 

    1 



















   33 










   32 


























   33 










































































    7 


    7 





   33 



   32 


















  414 
















  415 

    7 







  415 


































  413 
    8 


  415 

  413 






















    7 





    6 
    1 




























  705 






  704 
  299 

    7 





  704 

  705 
   25 






















  415 






  417 







  247 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the kernel access vector cache (AVC).
 *
 * Authors:  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *             James Morris <jmorris@redhat.com>
 *
 * Update:   KaiGai, Kohei <kaigai@ak.jp.nec.com>
 *        Replaced the avc_lock spinlock by RCU.
 *
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/percpu.h>
#include <linux/list.h>
#include <net/sock.h>
#include <linux/un.h>
#include <net/af_unix.h>
#include <linux/ip.h>
#include <linux/audit.h>
#include <linux/ipv6.h>
#include <net/ipv6.h>
#include "avc.h"
#include "avc_ss.h"
#include "classmap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/avc.h>

/*
 * Number of hash buckets in the cache.  avc_hash() and avc_reclaim_node()
 * reduce an index with "& (AVC_CACHE_SLOTS - 1)", so this has to stay a
 * power of two.
 */
#define AVC_CACHE_SLOTS                        512
/* Value selinux_avc_init() stores in selinux_avc.avc_cache_threshold. */
#define AVC_DEF_CACHE_THRESHOLD                512
/* avc_reclaim_node() stops once it has deleted this many nodes. */
#define AVC_CACHE_RECLAIM                16

/*
 * Increment one field of the per-CPU avc_cache_stats; expands to an empty
 * statement when CONFIG_SECURITY_SELINUX_AVC_STATS is not defined.
 */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
#define avc_cache_stats_incr(field)        this_cpu_inc(avc_cache_stats.field)
#else
#define avc_cache_stats_incr(field)        do {} while (0)
#endif

/*
 * Payload of one cache entry: the (ssid, tsid, tclass) key that
 * avc_search_node() compares, plus the decision cached for that key.
 */
struct avc_entry {
        u32                        ssid;	/* source security identifier */
        u32                        tsid;	/* target security identifier */
        u16                        tclass;	/* target security class */
        struct av_decision        avd;	/* access vector decision copied in by avc_node_populate() */
        struct avc_xperms_node        *xp_node;	/* extended permissions, or NULL; freed by avc_xperms_free() */
};

/*
 * A cache entry as linked into a hash bucket.  Nodes are allocated from
 * avc_node_cachep and, once published, released through call_rcu() with
 * avc_node_free() as the callback.
 */
struct avc_node {
        struct avc_entry        ae;
        struct hlist_node        list; /* anchored in avc_cache->slots[i] */
        struct rcu_head                rhead; /* passed to call_rcu() when the node is retired */
};

/*
 * One extended permissions decision, linked into the xpd_head list of an
 * avc_xperms_node.  Allocated by avc_xperms_decision_alloc() and released by
 * avc_xperms_decision_free().
 */
struct avc_xperms_decision_node {
        struct extended_perms_decision xpd;
        struct list_head xpd_list; /* list of extended_perms_decision */
};

/*
 * Extended permissions attached to an avc_node (avc_entry.xp_node): the
 * summary in @xp plus the list of per-driver decisions hanging off @xpd_head.
 */
struct avc_xperms_node {
        struct extended_perms xp;
        struct list_head xpd_head; /* list head of extended_perms_decision */
};

/*
 * The cache proper: a fixed array of hash buckets, each with its own
 * spinlock taken by the code paths that modify that bucket.
 */
struct avc_cache {
        struct hlist_head        slots[AVC_CACHE_SLOTS]; /* head for avc_node->list */
        spinlock_t                slots_lock[AVC_CACHE_SLOTS]; /* lock for writes */
        atomic_t                lru_hint;        /* LRU hint for reclaim scan */
        atomic_t                active_nodes;	/* incremented by avc_alloc_node(), decremented on delete/kill/replace */
        u32                        latest_notif;        /* latest revocation notification */
};

/*
 * Entry in the singly linked list of registered callbacks (see
 * avc_add_callback()); avc_ss_reset() walks the list and invokes the entries
 * whose @events mask contains AVC_CALLBACK_RESET.
 */
struct avc_callback_node {
        int (*callback) (u32 event);	/* function to invoke, passed the event */
        u32 events;			/* mask of events this callback is registered for */
        struct avc_callback_node *next;	/* next registered callback, or NULL */
};

/*
 * Per-CPU statistics counters, present only when
 * CONFIG_SECURITY_SELINUX_AVC_STATS is set; updated through
 * avc_cache_stats_incr().
 */
#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DEFINE_PER_CPU(struct avc_cache_stats, avc_cache_stats) = { 0 };
#endif

/*
 * Global AVC state: the cache and the node-count threshold above which
 * avc_alloc_node() calls avc_reclaim_node().
 */
struct selinux_avc {
        unsigned int avc_cache_threshold;
        struct avc_cache avc_cache;
};

/* The single AVC instance used by every function in this file. */
static struct selinux_avc selinux_avc;

/*
 * selinux_avc_init - put the global AVC state into its empty starting form.
 *
 * Zeroes the reclaim hint and the active node count, initialises every
 * bucket head together with its lock, and sets the cache threshold to
 * AVC_DEF_CACHE_THRESHOLD.
 */
void selinux_avc_init(void)
{
        int i;

        atomic_set(&selinux_avc.avc_cache.lru_hint, 0);
        atomic_set(&selinux_avc.avc_cache.active_nodes, 0);

        for (i = 0; i < AVC_CACHE_SLOTS; ++i) {
                /*
                 * The lock expression is spelled out in full on purpose:
                 * spin_lock_init() is a macro and may use its argument text.
                 */
                spin_lock_init(&selinux_avc.avc_cache.slots_lock[i]);
                INIT_HLIST_HEAD(&selinux_avc.avc_cache.slots[i]);
        }

        selinux_avc.avc_cache_threshold = AVC_DEF_CACHE_THRESHOLD;
}

unsigned int avc_get_cache_threshold(void)
{
        return selinux_avc.avc_cache_threshold;
}

void avc_set_cache_threshold(unsigned int cache_threshold)
{
        selinux_avc.avc_cache_threshold = cache_threshold;
}

/* Head of the callback list; avc_add_callback() pushes new entries here. */
static struct avc_callback_node *avc_callbacks __ro_after_init;
/* Slab caches, all created in avc_init(). */
static struct kmem_cache *avc_node_cachep __ro_after_init;		/* struct avc_node */
static struct kmem_cache *avc_xperms_data_cachep __ro_after_init;	/* struct extended_perms_data */
static struct kmem_cache *avc_xperms_decision_cachep __ro_after_init;	/* struct avc_xperms_decision_node */
static struct kmem_cache *avc_xperms_cachep __ro_after_init;		/* struct avc_xperms_node */

/*
 * Map a (ssid, tsid, tclass) triple to a bucket index in the range
 * [0, AVC_CACHE_SLOTS).  The three inputs are XORed together, with tsid
 * shifted left by 2 and tclass by 4, and the result is masked to the table
 * size.
 */
static inline u32 avc_hash(u32 ssid, u32 tsid, u16 tclass)
{
        const u32 slot_mask = AVC_CACHE_SLOTS - 1;
        u32 mixed = ssid;

        mixed ^= tsid << 2;
        mixed ^= tclass << 4;

        return mixed & slot_mask;
}

/**
 * avc_init - Initialize the AVC.
 *
 * Create the four slab caches the access vector cache allocates from:
 * one each for cache nodes, extended permission nodes, extended permission
 * decision nodes and extended permission data.  Every cache is created with
 * SLAB_PANIC.
 */
void __init avc_init(void)
{
        /* cache nodes */
        avc_node_cachep = KMEM_CACHE(avc_node, SLAB_PANIC);

        /* extended permissions: data first, then decisions, then the node */
        avc_xperms_data_cachep = KMEM_CACHE(extended_perms_data, SLAB_PANIC);
        avc_xperms_decision_cachep = KMEM_CACHE(avc_xperms_decision_node, SLAB_PANIC);
        avc_xperms_cachep = KMEM_CACHE(avc_xperms_node, SLAB_PANIC);
}

int avc_get_hash_stats(char *page)
{
        int i, chain_len, max_chain_len, slots_used;
        struct avc_node *node;
        struct hlist_head *head;

        rcu_read_lock();

        slots_used = 0;
        max_chain_len = 0;
        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                head = &selinux_avc.avc_cache.slots[i];
                if (!hlist_empty(head)) {
                        slots_used++;
                        chain_len = 0;
                        hlist_for_each_entry_rcu(node, head, list)
                                chain_len++;
                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                }
        }

        rcu_read_unlock();

        return scnprintf(page, PAGE_SIZE, "entries: %d\nbuckets used: %d/%d\n"
                         "longest chain: %d\n",
                         atomic_read(&selinux_avc.avc_cache.active_nodes),
                         slots_used, AVC_CACHE_SLOTS, max_chain_len);
}

/*
 * Find the extended_perms_decision on @xp_node whose driver and base_perm
 * both match, or return NULL when there is none.
 *
 * A plain linked-list walk is used because the list is always small,
 * i.e. fewer than 5 entries and typically 1.
 */
static struct extended_perms_decision *
avc_xperms_decision_lookup(u8 driver, u8 base_perm,
                           struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *cur;

        list_for_each_entry(cur, &xp_node->xpd_head, xpd_list) {
                if (cur->xpd.driver != driver)
                        continue;
                if (cur->xpd.base_perm != base_perm)
                        continue;
                return &cur->xpd;
        }

        return NULL;
}

/*
 * Test whether bit @perm is set in one of the bitmaps of @xpd.
 *
 * @which selects the bitmap (XPERMS_ALLOWED, XPERMS_AUDITALLOW or
 * XPERMS_DONTAUDIT).  The bitmap is consulted only when the matching flag is
 * also set in xpd->used; in every other case the result is 0.
 */
static inline unsigned int
avc_xperms_has_perm(struct extended_perms_decision *xpd,
                                        u8 perm, u8 which)
{
        if (which == XPERMS_ALLOWED && (xpd->used & XPERMS_ALLOWED))
                return security_xperm_test(xpd->allowed->p, perm);

        if (which == XPERMS_AUDITALLOW && (xpd->used & XPERMS_AUDITALLOW))
                return security_xperm_test(xpd->auditallow->p, perm);

        if (which == XPERMS_DONTAUDIT && (xpd->used & XPERMS_DONTAUDIT))
                return security_xperm_test(xpd->dontaudit->p, perm);

        return 0;
}

/*
 * Record extended permission @perm as allowed on @xp_node.
 *
 * The driver bit and the base permission are always merged into the node's
 * summary.  The permission bit itself is set only when a decision for
 * (@driver, @base_perm) exists and has an "allowed" bitmap.
 */
static void avc_xperms_allow_perm(struct avc_xperms_node *xp_node,
                                  u8 driver, u8 base_perm, u8 perm)
{
        struct extended_perms *summary = &xp_node->xp;
        struct extended_perms_decision *match;

        security_xperm_set(summary->drivers.p, driver);
        summary->base_perms |= base_perm;

        match = avc_xperms_decision_lookup(driver, base_perm, xp_node);
        if (!match || !match->allowed)
                return;

        security_xperm_set(match->allowed->p, perm);
}

/*
 * Release a decision node: each of its allowed/auditallow/dontaudit bitmaps
 * that is present goes back to avc_xperms_data_cachep, then the node itself
 * goes back to avc_xperms_decision_cachep.
 */
static void avc_xperms_decision_free(struct avc_xperms_decision_node *xpd_node)
{
        void *bitmaps[] = {
                xpd_node->xpd.allowed,
                xpd_node->xpd.auditallow,
                xpd_node->xpd.dontaudit,
        };
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(bitmaps); idx++) {
                if (bitmaps[idx])
                        kmem_cache_free(avc_xperms_data_cachep, bitmaps[idx]);
        }

        kmem_cache_free(avc_xperms_decision_cachep, xpd_node);
}

static void avc_xperms_free(struct avc_xperms_node *xp_node)
{
        struct avc_xperms_decision_node *xpd_node, *tmp;

        if (!xp_node)
                return;

        list_for_each_entry_safe(xpd_node, tmp, &xp_node->xpd_head, xpd_list) {
                list_del(&xpd_node->xpd_list);
                avc_xperms_decision_free(xpd_node);
        }
        kmem_cache_free(avc_xperms_cachep, xp_node);
}

/*
 * Copy a complete decision from @src to @dest: the scalar fields, plus every
 * bitmap whose flag is set in the copied "used" mask.  The destination
 * bitmaps for those flags are written through, so they must already exist.
 */
static void avc_copy_xperms_decision(struct extended_perms_decision *dest,
                                        struct extended_perms_decision *src)
{
        dest->used = src->used;
        dest->driver = src->driver;
        dest->base_perm = src->base_perm;

        if (dest->used & XPERMS_ALLOWED) {
                memcpy(dest->allowed->p, src->allowed->p,
                       sizeof(dest->allowed->p));
        }
        if (dest->used & XPERMS_AUDITALLOW) {
                memcpy(dest->auditallow->p, src->auditallow->p,
                       sizeof(dest->auditallow->p));
        }
        if (dest->used & XPERMS_DONTAUDIT) {
                memcpy(dest->dontaudit->p, src->dontaudit->p,
                       sizeof(dest->dontaudit->p));
        }
}

/*
 * Reduced form of avc_copy_xperms_decision(): copies base_perm and used,
 * and from each bitmap flagged in "used" only the single 32-bit word that
 * holds @perm.  The driver field is not copied here.
 */
static inline void avc_quick_copy_xperms_decision(u8 perm,
                        struct extended_perms_decision *dest,
                        struct extended_perms_decision *src)
{
        /*
         * The 256 permission bits are stored as 8 u32 words; perm / 32
         * picks the word containing this permission.
         */
        u8 word = perm >> 5;

        dest->used = src->used;
        dest->base_perm = src->base_perm;

        if (dest->used & XPERMS_ALLOWED)
                dest->allowed->p[word] = src->allowed->p[word];

        if (dest->used & XPERMS_AUDITALLOW)
                dest->auditallow->p[word] = src->auditallow->p[word];

        if (dest->used & XPERMS_DONTAUDIT)
                dest->dontaudit->p[word] = src->dontaudit->p[word];
}

/*
 * Allocate a zeroed decision node together with one zeroed bitmap for each
 * of XPERMS_ALLOWED, XPERMS_AUDITALLOW and XPERMS_DONTAUDIT set in @which.
 *
 * All allocations use GFP_NOWAIT | __GFP_NOWARN.  Returns the new node, or
 * NULL if any allocation failed (whatever was allocated is freed again).
 */
static struct avc_xperms_decision_node
                *avc_xperms_decision_alloc(u8 which)
{
        const gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;
        struct avc_xperms_decision_node *fresh;

        fresh = kmem_cache_zalloc(avc_xperms_decision_cachep, gfp);
        if (!fresh)
                return NULL;

        if (which & XPERMS_ALLOWED) {
                fresh->xpd.allowed =
                        kmem_cache_zalloc(avc_xperms_data_cachep, gfp);
                if (!fresh->xpd.allowed)
                        goto fail;
        }

        if (which & XPERMS_AUDITALLOW) {
                fresh->xpd.auditallow =
                        kmem_cache_zalloc(avc_xperms_data_cachep, gfp);
                if (!fresh->xpd.auditallow)
                        goto fail;
        }

        if (which & XPERMS_DONTAUDIT) {
                fresh->xpd.dontaudit =
                        kmem_cache_zalloc(avc_xperms_data_cachep, gfp);
                if (!fresh->xpd.dontaudit)
                        goto fail;
        }

        return fresh;

fail:
        /* Bitmaps not yet allocated are still NULL and are skipped. */
        avc_xperms_decision_free(fresh);
        return NULL;
}

static int avc_add_xperms_decision(struct avc_node *node,
                        struct extended_perms_decision *src)
{
        struct avc_xperms_decision_node *dest_xpd;

        dest_xpd = avc_xperms_decision_alloc(src->used);
        if (!dest_xpd)
                return -ENOMEM;
        avc_copy_xperms_decision(&dest_xpd->xpd, src);
        list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head);
        node->ae.xp_node->xp.len++;
        return 0;
}

/*
 * Allocate a zeroed extended permissions node with an empty decision list.
 * Uses GFP_NOWAIT | __GFP_NOWARN; returns NULL on allocation failure.
 */
static struct avc_xperms_node *avc_xperms_alloc(void)
{
        struct avc_xperms_node *fresh;

        fresh = kmem_cache_zalloc(avc_xperms_cachep, GFP_NOWAIT | __GFP_NOWARN);
        if (fresh)
                INIT_LIST_HEAD(&fresh->xpd_head);

        return fresh;
}

/*
 * Give @node its own deep copy of the extended permissions in @src.
 *
 * When @src holds no decisions (xp.len == 0) nothing is allocated and
 * node->ae.xp_node is left untouched.  Returns 0 on success and -ENOMEM if
 * any allocation fails, in which case the partial copy is freed.
 */
static int avc_xperms_populate(struct avc_node *node,
                                struct avc_xperms_node *src)
{
        struct avc_xperms_decision_node *from, *to;
        struct avc_xperms_node *copy;

        if (!src->xp.len)
                return 0;

        copy = avc_xperms_alloc();
        if (!copy)
                return -ENOMEM;

        copy->xp.base_perms = src->xp.base_perms;
        copy->xp.len = src->xp.len;
        memcpy(copy->xp.drivers.p, src->xp.drivers.p,
               sizeof(copy->xp.drivers.p));

        /* duplicate every decision on the source list */
        list_for_each_entry(from, &src->xpd_head, xpd_list) {
                to = avc_xperms_decision_alloc(from->xpd.used);
                if (!to) {
                        avc_xperms_free(copy);
                        return -ENOMEM;
                }
                avc_copy_xperms_decision(&to->xpd, &from->xpd);
                list_add(&to->xpd_list, &copy->xpd_head);
        }

        node->ae.xp_node = copy;
        return 0;
}

/*
 * Decide which of the @requested permissions need an audit record.
 *
 * - Some permission denied by @avd: audit the denied bits selected by
 *   avd->auditdeny, unless @xpd marks @perm as dontaudit.
 * - Nothing denied but @result is non-zero: treat the whole request as
 *   denied and audited.
 * - Otherwise: audit the bits selected by avd->auditallow, unless @xpd is
 *   given and does not mark @perm as auditallow.
 *
 * The denied mask is stored through @deniedp; the audited mask is returned.
 */
static inline u32 avc_xperms_audit_required(u32 requested,
                                        struct av_decision *avd,
                                        struct extended_perms_decision *xpd,
                                        u8 perm,
                                        int result,
                                        u32 *deniedp)
{
        u32 denied = requested & ~avd->allowed;
        u32 audited;

        if (unlikely(denied)) {
                audited = denied & avd->auditdeny;
                if (audited && xpd &&
                    avc_xperms_has_perm(xpd, perm, XPERMS_DONTAUDIT))
                        audited = 0;
        } else if (result) {
                denied = requested;
                audited = requested;
        } else {
                audited = requested & avd->auditallow;
                if (audited && xpd &&
                    !avc_xperms_has_perm(xpd, perm, XPERMS_AUDITALLOW))
                        audited = 0;
        }

        *deniedp = denied;
        return audited;
}

/*
 * Audit an extended permission check if needed.
 *
 * Asks avc_xperms_audit_required() what has to be audited.  Returns 0
 * without doing anything when nothing does; otherwise hands over to
 * slow_avc_audit() and returns its result.
 */
static inline int avc_xperms_audit(u32 ssid, u32 tsid, u16 tclass,
                                   u32 requested, struct av_decision *avd,
                                   struct extended_perms_decision *xpd,
                                   u8 perm, int result,
                                   struct common_audit_data *ad)
{
        u32 denied;
        u32 audited = avc_xperms_audit_required(requested, avd, xpd, perm,
                                                result, &denied);

        if (unlikely(audited))
                return slow_avc_audit(ssid, tsid, tclass, requested,
                                      audited, denied, result, ad);

        return 0;
}

/*
 * RCU callback passed to call_rcu() by avc_node_delete() and
 * avc_node_replace(): frees the node embedding @rhead together with its
 * extended permissions and counts the free in the statistics.
 */
static void avc_node_free(struct rcu_head *rhead)
{
        struct avc_node *dead;

        dead = container_of(rhead, struct avc_node, rhead);

        avc_xperms_free(dead->ae.xp_node);
        kmem_cache_free(avc_node_cachep, dead);
        avc_cache_stats_incr(frees);
}

/*
 * Unlink @node from its hash bucket and schedule it to be freed by
 * avc_node_free() via call_rcu(); the active node count is decremented
 * immediately.  The callers in this file (avc_reclaim_node(), avc_flush())
 * hold the bucket's slots_lock around the call.
 */
static void avc_node_delete(struct avc_node *node)
{
        hlist_del_rcu(&node->list);
        call_rcu(&node->rhead, avc_node_free);
        atomic_dec(&selinux_avc.avc_cache.active_nodes);
}

/*
 * Free @node right away, without going through call_rcu(), and undo the
 * active node count increment made by avc_alloc_node().  avc_insert() and
 * avc_update_node() use this on error paths for a freshly allocated node
 * that has not been linked into a bucket.
 */
static void avc_node_kill(struct avc_node *node)
{
        avc_xperms_free(node->ae.xp_node);
        kmem_cache_free(avc_node_cachep, node);
        avc_cache_stats_incr(frees);
        atomic_dec(&selinux_avc.avc_cache.active_nodes);
}

/*
 * Put @new in the place @old occupies in its hash bucket and schedule @old
 * to be freed by avc_node_free() via call_rcu().  One node leaves the cache,
 * so the active node count is decremented once.  The callers in this file
 * (avc_insert(), avc_update_node()) hold the bucket's slots_lock.
 */
static void avc_node_replace(struct avc_node *new, struct avc_node *old)
{
        hlist_replace_rcu(&old->list, &new->list);
        call_rcu(&old->rhead, avc_node_free);
        atomic_dec(&selinux_avc.avc_cache.active_nodes);
}

/*
 * Evict cache nodes to make room.
 *
 * Visits at most AVC_CACHE_SLOTS buckets, choosing each one by advancing the
 * shared lru_hint counter, and deletes the nodes found there.  Scanning
 * stops as soon as AVC_CACHE_RECLAIM nodes have been deleted.  Buckets whose
 * lock cannot be taken immediately are skipped.
 *
 * Returns the number of nodes deleted.
 */
static inline int avc_reclaim_node(void)
{
        struct avc_node *node;
        int hvalue, try, ecx;
        unsigned long flags;
        struct hlist_head *head;
        spinlock_t *lock;

        /* try counts buckets visited, ecx counts nodes deleted so far */
        for (try = 0, ecx = 0; try < AVC_CACHE_SLOTS; try++) {
                /* next bucket after the hint, wrapped to the table size */
                hvalue = atomic_inc_return(&selinux_avc.avc_cache.lru_hint) &
                        (AVC_CACHE_SLOTS - 1);
                head = &selinux_avc.avc_cache.slots[hvalue];
                lock = &selinux_avc.avc_cache.slots_lock[hvalue];

                /* bucket is busy: move on instead of waiting for the lock */
                if (!spin_trylock_irqsave(lock, flags))
                        continue;

                rcu_read_lock();
                /*
                 * NOTE(review): nodes are deleted while the list is being
                 * walked with the non-_safe iterator; this presumably relies
                 * on hlist_del_rcu() leaving the removed node's next pointer
                 * usable - confirm against the rculist documentation.
                 */
                hlist_for_each_entry(node, head, list) {
                        avc_node_delete(node);
                        avc_cache_stats_incr(reclaims);
                        ecx++;
                        if (ecx >= AVC_CACHE_RECLAIM) {
                                /* quota reached: drop both locks and stop */
                                rcu_read_unlock();
                                spin_unlock_irqrestore(lock, flags);
                                goto out;
                        }
                }
                rcu_read_unlock();
                spin_unlock_irqrestore(lock, flags);
        }
out:
        return ecx;
}

static struct avc_node *avc_alloc_node(void)
{
        struct avc_node *node;

        node = kmem_cache_zalloc(avc_node_cachep, GFP_NOWAIT | __GFP_NOWARN);
        if (!node)
                goto out;

        INIT_HLIST_NODE(&node->list);
        avc_cache_stats_incr(allocations);

        if (atomic_inc_return(&selinux_avc.avc_cache.active_nodes) >
            selinux_avc.avc_cache_threshold)
                avc_reclaim_node();

out:
        return node;
}

/*
 * Fill in the key (@ssid, @tsid, @tclass) of @node and copy the decision
 * @avd into it.  The extended permissions pointer is not touched.
 */
static void avc_node_populate(struct avc_node *node, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd)
{
        struct avc_entry *entry = &node->ae;

        memcpy(&entry->avd, avd, sizeof(entry->avd));
        entry->tclass = tclass;
        entry->tsid = tsid;
        entry->ssid = ssid;
}

/*
 * Walk the bucket selected by avc_hash() with the RCU list iterator and
 * return the first node whose ssid, tsid and tclass all match, or NULL if
 * there is no such node.
 */
static inline struct avc_node *avc_search_node(u32 ssid, u32 tsid, u16 tclass)
{
        struct hlist_head *bucket;
        struct avc_node *cur;

        bucket = &selinux_avc.avc_cache.slots[avc_hash(ssid, tsid, tclass)];

        hlist_for_each_entry_rcu(cur, bucket, list) {
                if (cur->ae.ssid != ssid)
                        continue;
                if (cur->ae.tclass != tclass)
                        continue;
                if (cur->ae.tsid != tsid)
                        continue;
                return cur;
        }

        return NULL;
}

/**
 * avc_lookup - Look up an AVC entry.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 *
 * Search the cache for an entry keyed by (@ssid, @tsid, @tclass).
 * Every call is counted as a lookup in the cache statistics, and a
 * call that finds nothing is additionally counted as a miss.
 *
 * Returns the matching avc_node, or NULL if there is none.
 */
static struct avc_node *avc_lookup(u32 ssid, u32 tsid, u16 tclass)
{
        struct avc_node *hit;

        avc_cache_stats_incr(lookups);

        hit = avc_search_node(ssid, tsid, tclass);
        if (!hit)
                avc_cache_stats_incr(misses);

        return hit;
}

/*
 * avc_latest_notif_update - check or advance the latest notification seqno.
 * @seqno: sequence number to check or record
 * @is_insert: non-zero to validate @seqno for an insert, zero to record it
 *
 * With @is_insert set, @seqno is compared against the recorded latest
 * notification; if it is older a warning is logged and -EAGAIN is returned.
 * With @is_insert clear, the recorded value is raised to @seqno if @seqno is
 * newer.  Both paths run under a function-local spinlock with interrupts
 * saved.
 *
 * Returns 0, or -EAGAIN for a stale insert.
 */
static int avc_latest_notif_update(u32 seqno, int is_insert)
{
        int ret = 0;
        static DEFINE_SPINLOCK(notif_lock);
        unsigned long flag;

        spin_lock_irqsave(&notif_lock, flag);
        if (is_insert) {
                if (seqno < selinux_avc.avc_cache.latest_notif) {
                        /* both values are u32: print them unsigned */
                        pr_warn("SELinux: avc:  seqno %u < latest_notif %u\n",
                               seqno, selinux_avc.avc_cache.latest_notif);
                        ret = -EAGAIN;
                }
        } else {
                if (seqno > selinux_avc.avc_cache.latest_notif)
                        selinux_avc.avc_cache.latest_notif = seqno;
        }
        spin_unlock_irqrestore(&notif_lock, flag);

        return ret;
}

/**
 * avc_insert - Insert an AVC entry.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @avd: resulting av decision
 * @xp_node: resulting extended permissions
 *
 * Cache the decision @avd (and a copy of @xp_node) for the SID pair
 * (@ssid, @tsid) and class @tclass.  The decision and its sequence
 * number normally come from a security_compute_av() call.
 *
 * Nothing is cached when @avd->seqno is older than the latest
 * revocation notification, or when memory for the new entry cannot
 * be allocated.  If an entry with the same key is already present
 * it is replaced; otherwise the new entry is added at the head of
 * its bucket.
 */
static void avc_insert(u32 ssid, u32 tsid, u16 tclass,
                       struct av_decision *avd, struct avc_xperms_node *xp_node)
{
        struct avc_node *fresh, *cur;
        struct hlist_head *bucket;
        spinlock_t *bucket_lock;
        unsigned long irqflags;
        bool replaced = false;
        u32 slot;

        if (avc_latest_notif_update(avd->seqno, 1))
                return;

        fresh = avc_alloc_node();
        if (!fresh)
                return;

        avc_node_populate(fresh, ssid, tsid, tclass, avd);
        if (avc_xperms_populate(fresh, xp_node)) {
                avc_node_kill(fresh);
                return;
        }

        slot = avc_hash(ssid, tsid, tclass);
        bucket = &selinux_avc.avc_cache.slots[slot];
        bucket_lock = &selinux_avc.avc_cache.slots_lock[slot];

        spin_lock_irqsave(bucket_lock, irqflags);
        hlist_for_each_entry(cur, bucket, list) {
                if (cur->ae.ssid != ssid ||
                    cur->ae.tsid != tsid ||
                    cur->ae.tclass != tclass)
                        continue;

                /* same key already cached: swap the old entry out */
                avc_node_replace(fresh, cur);
                replaced = true;
                break;
        }
        if (!replaced)
                hlist_add_head_rcu(&fresh->list, bucket);
        spin_unlock_irqrestore(bucket_lock, irqflags);
}

/**
 * avc_audit_pre_callback - emit the leading, SELinux specific part
 * of an audit record; called by the generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Writes "avc:  denied" or "avc:  granted" followed by the audited
 * permissions in braces.  Bits with a name in the class's permission
 * table are printed by name; any bits left over are printed as one
 * hexadecimal value.  An empty audited mask is reported as " null".
 */
static void avc_audit_pre_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        const char *const *names;
        u32 remaining = sad->audited;
        u32 bit, mask;

        audit_log_format(ab, "avc:  %s ", sad->denied ? "denied" : "granted");

        if (!remaining) {
                audit_log_format(ab, " null");
                return;
        }

        names = secclass_map[sad->tclass-1].perms;

        audit_log_format(ab, " {");
        for (bit = 0, mask = 1; bit < sizeof(remaining) * 8;
             bit++, mask <<= 1) {
                if (!(remaining & mask) || !names[bit])
                        continue;
                audit_log_format(ab, " %s", names[bit]);
                remaining &= ~mask;
        }

        /* whatever is still set had no name in the table */
        if (remaining)
                audit_log_format(ab, " 0x%x", remaining);

        audit_log_format(ab, " } for ");
}

/**
 * avc_audit_post_callback - SELinux specific information
 * will be called by generic audit code
 * @ab: the audit buffer
 * @a: audit_data
 *
 * Appends, in this order: the source context (or the raw ssid if it
 * cannot be converted), the target context (or the raw tsid), the
 * class name, "permissive=" for denials, and finally the raw source
 * and target context strings when security_sid_to_context_inval()
 * returns one.
 */
static void avc_audit_post_callback(struct audit_buffer *ab, void *a)
{
        struct common_audit_data *ad = a;
        struct selinux_audit_data *sad = ad->selinux_audit_data;
        char *scontext = NULL;
        char *tcontext = NULL;
        const char *tclass = NULL;
        u32 scontext_len;
        u32 tcontext_len;
        int rc;

        /* NOTE(review): ssid/tsid are u32 but printed with %d below - confirm intended */
        rc = security_sid_to_context(sad->ssid, &scontext,
                                     &scontext_len);
        if (rc)
                audit_log_format(ab, " ssid=%d", sad->ssid);
        else
                audit_log_format(ab, " scontext=%s", scontext);

        rc = security_sid_to_context(sad->tsid, &tcontext,
                                     &tcontext_len);
        if (rc)
                audit_log_format(ab, " tsid=%d", sad->tsid);
        else
                audit_log_format(ab, " tcontext=%s", tcontext);

        tclass = secclass_map[sad->tclass-1].name;
        audit_log_format(ab, " tclass=%s", tclass);

        /* permissive=1 when the access was denied by policy yet result is 0 */
        if (sad->denied)
                audit_log_format(ab, " permissive=%u", sad->result ? 0 : 1);

        /* the tracepoint needs both strings, so free them only afterwards */
        trace_selinux_audited(sad, scontext, tcontext, tclass);
        kfree(tcontext);
        kfree(scontext);

        /* in case of invalid context report also the actual context string */
        rc = security_sid_to_context_inval(sad->ssid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                /* do not log a trailing NUL as part of the string */
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " srawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }

        /* scontext/scontext_len are reused here for the target context */
        rc = security_sid_to_context_inval(sad->tsid, &scontext,
                                           &scontext_len);
        if (!rc && scontext) {
                if (scontext_len && scontext[scontext_len - 1] == '\0')
                        scontext_len--;
                audit_log_format(ab, " trawcon=");
                audit_log_n_untrustedstring(ab, scontext, scontext_len);
                kfree(scontext);
        }
}

/*
 * Slow half of AVC auditing, kept out of line because of its large stack
 * footprint.  It is non-blocking and may be called from under
 * rcu_read_lock().
 *
 * Rejects an out-of-range @tclass with a warning and -EINVAL.  Otherwise
 * it packs the arguments into a selinux_audit_data, attaches that to @a
 * (or to a local LSM_AUDIT_DATA_NONE record when @a is NULL), emits the
 * record through common_lsm_audit() and returns 0.
 */
noinline int slow_avc_audit(u32 ssid, u32 tsid, u16 tclass,
                            u32 requested, u32 audited, u32 denied, int result,
                            struct common_audit_data *a)
{
        struct selinux_audit_data sad;
        struct common_audit_data fallback;

        if (WARN_ON(!tclass || tclass >= ARRAY_SIZE(secclass_map)))
                return -EINVAL;

        if (a == NULL) {
                fallback.type = LSM_AUDIT_DATA_NONE;
                a = &fallback;
        }

        /* who and what */
        sad.ssid = ssid;
        sad.tsid = tsid;
        sad.tclass = tclass;

        /* outcome */
        sad.requested = requested;
        sad.audited = audited;
        sad.denied = denied;
        sad.result = result;

        a->selinux_audit_data = &sad;

        common_lsm_audit(a, avc_audit_pre_callback, avc_audit_post_callback);
        return 0;
}

/**
 * avc_add_callback - Register a callback for security events.
 * @callback: callback function
 * @events: security events
 *
 * Add @callback to the front of the callback list for the events
 * in the set @events.
 *
 * Returns %0 on success or -%ENOMEM if the list entry cannot be
 * allocated.
 */
int __init avc_add_callback(int (*callback)(u32 event), u32 events)
{
        struct avc_callback_node *entry;

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->events = events;
        entry->callback = callback;

        /* push onto the head of the list */
        entry->next = avc_callbacks;
        avc_callbacks = entry;

        return 0;
}

/**
 * avc_update_node - Update an AVC entry
 * @event : Updating event
 * @perms : Permission mask bits
 * @driver: xperm driver information
 * @base_perm: the base permission associated with the extended permission
 * @xperm: xperm permissions
 * @ssid: AVC entry source sid
 * @tsid: AVC entry target sid
 * @tclass : AVC entry target object class
 * @seqno : sequence number when decision was made
 * @xpd: extended_perms_decision to be added to the node
 * @flags: the AVC_* flags, e.g. AVC_EXTENDED_PERMS, or 0.
 *
 * Entries are never modified in place: a new node is allocated, filled
 * with a copy of the existing entry, changed according to @event, and
 * then swapped in for the original, which is freed later through RCU.
 *
 * Returns -ENOENT if no entry matching @ssid, @tsid, @tclass and @seqno
 * exists, -ENOMEM if the new node or its extended permissions cannot be
 * allocated, and 0 once the entry has been replaced.
 */
static int avc_update_node(u32 event, u32 perms, u8 driver, u8 base_perm,
                           u8 xperm, u32 ssid, u32 tsid, u16 tclass, u32 seqno,
                           struct extended_perms_decision *xpd, u32 flags)
{
        u32 hvalue;
        int rc = 0;
        unsigned long flag;
        struct avc_node *pos, *node, *orig = NULL;
        struct hlist_head *head;
        spinlock_t *lock;

        /* allocate the replacement before taking the bucket lock */
        node = avc_alloc_node();
        if (!node) {
                rc = -ENOMEM;
                goto out;
        }

        /* Lock the target slot */
        hvalue = avc_hash(ssid, tsid, tclass);

        head = &selinux_avc.avc_cache.slots[hvalue];
        lock = &selinux_avc.avc_cache.slots_lock[hvalue];

        spin_lock_irqsave(lock, flag);

        /* the entry must match the key and the sequence number */
        hlist_for_each_entry(pos, head, list) {
                if (ssid == pos->ae.ssid &&
                    tsid == pos->ae.tsid &&
                    tclass == pos->ae.tclass &&
                    seqno == pos->ae.avd.seqno){
                        orig = pos;
                        break;
                }
        }

        if (!orig) {
                rc = -ENOENT;
                /* the new node was never linked in, so free it directly */
                avc_node_kill(node);
                goto out_unlock;
        }

        /*
         * Copy and replace original node.
         */

        avc_node_populate(node, ssid, tsid, tclass, &orig->ae.avd);

        if (orig->ae.xp_node) {
                rc = avc_xperms_populate(node, orig->ae.xp_node);
                if (rc) {
                        avc_node_kill(node);
                        goto out_unlock;
                }
        }

        /* apply the requested change to the copy only; unknown events change nothing */
        switch (event) {
        case AVC_CALLBACK_GRANT:
                node->ae.avd.allowed |= perms;
                if (node->ae.xp_node && (flags & AVC_EXTENDED_PERMS))
                        avc_xperms_allow_perm(node->ae.xp_node, driver, base_perm, xperm);
                break;
        case AVC_CALLBACK_TRY_REVOKE:
        case AVC_CALLBACK_REVOKE:
                node->ae.avd.allowed &= ~perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_ENABLE:
                node->ae.avd.auditallow |= perms;
                break;
        case AVC_CALLBACK_AUDITALLOW_DISABLE:
                node->ae.avd.auditallow &= ~perms;
                break;
        case AVC_CALLBACK_AUDITDENY_ENABLE:
                node->ae.avd.auditdeny |= perms;
                break;
        case AVC_CALLBACK_AUDITDENY_DISABLE:
                node->ae.avd.auditdeny &= ~perms;
                break;
        case AVC_CALLBACK_ADD_XPERMS:
                /* NOTE(review): dereferences node->ae.xp_node; callers presumably guarantee it is set - confirm */
                rc = avc_add_xperms_decision(node, xpd);
                if (rc) {
                        avc_node_kill(node);
                        goto out_unlock;
                }
                break;
        }
        /* publish the copy; orig is released via call_rcu() */
        avc_node_replace(node, orig);
out_unlock:
        spin_unlock_irqrestore(lock, flag);
out:
        return rc;
}

/**
 * avc_flush - Flush the cache
 */
static void avc_flush(void)
{
        struct hlist_head *head;
        struct avc_node *node;
        spinlock_t *lock;
        unsigned long flag;
        int i;

        for (i = 0; i < AVC_CACHE_SLOTS; i++) {
                head = &selinux_avc.avc_cache.slots[i];
                lock = &selinux_avc.avc_cache.slots_lock[i];

                spin_lock_irqsave(lock, flag);
                /*
                 * With preemptible RCU, the outer spinlock does not
                 * prevent RCU grace periods from ending.
                 */
                rcu_read_lock();
                hlist_for_each_entry(node, head, list)
                        avc_node_delete(node);
                rcu_read_unlock();
                spin_unlock_irqrestore(lock, flag);
        }
}

/**
 * avc_ss_reset - Flush the cache and revalidate migrated permissions.
 * @seqno: policy sequence number
 *
 * Flushes the cache, invokes every callback registered for
 * AVC_CALLBACK_RESET, and then records @seqno as the latest
 * notification.  A failing callback does not stop the remaining
 * callbacks from running.
 *
 * Returns the first non-zero value returned by a callback, or 0.
 */
int avc_ss_reset(u32 seqno)
{
        struct avc_callback_node *cb;
        int first_err = 0;
        int err;

        avc_flush();

        for (cb = avc_callbacks; cb; cb = cb->next) {
                if (!(cb->events & AVC_CALLBACK_RESET))
                        continue;

                err = cb->callback(AVC_CALLBACK_RESET);
                /* remember only the first failure, keep going regardless */
                if (first_err == 0)
                        first_err = err;
        }

        avc_latest_notif_update(seqno, 0);
        return first_err;
}

/**
 * avc_compute_av - Add an entry to the AVC based on the security policy
 * @ssid: subject
 * @tsid: object/target
 * @tclass: object class
 * @avd: access vector decision
 * @xp_node: AVC extended permissions node
 *
 * Slow-path helper function for avc_has_perm_noaudit, when the avc_node lookup
 * fails.  Don't inline this, since it's the slow-path and just results in a
 * bigger stack frame.
 */
static noinline void avc_compute_av(u32 ssid, u32 tsid, u16 tclass,
                                    struct av_decision *avd,
                                    struct avc_xperms_node *xp_node)
{
        /* Start the caller-supplied xperms node with an empty xpd_head list. */
        INIT_LIST_HEAD(&xp_node->xpd_head);
        /* @avd and xp_node->xp are handed to the security server as outputs. */
        security_compute_av(ssid, tsid, tclass, avd, &xp_node->xp);
        /* Pass the freshly computed decision on to avc_insert(). */
        avc_insert(ssid, tsid, tclass, avd, xp_node);
}

/*
 * Decide what to do with a denial: return -EACCES, or (when neither the
 * strict flag nor enforcing mode forces the denial) update the cached node
 * with an AVC_CALLBACK_GRANT event and return 0.
 */
static noinline int avc_denied(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                               u8 driver, u8 base_perm, u8 xperm,
                               unsigned int flags, struct av_decision *avd)
{
        /* AVC_STRICT callers always get the denial back. */
        if (flags & AVC_STRICT)
                return -EACCES;

        /* Enforcing mode denies unless the decision is flagged permissive. */
        if (enforcing_enabled()) {
                if (!(avd->flags & AVD_FLAGS_PERMISSIVE))
                        return -EACCES;
        }

        avc_update_node(AVC_CALLBACK_GRANT, requested, driver, base_perm,
                        xperm, ssid, tsid, tclass, avd->seqno, NULL, flags);
        return 0;
}

/*
 * The avc extended permissions logic adds an additional 256 bits of
 * permissions to an avc node when extended permissions for that node are
 * specified in the avtab. If the additional 256 permissions is not adequate,
 * as-is the case with ioctls, then multiple may be chained together and the
 * driver field is used to specify which set contains the permission.
 *
 * avc_has_extended_perms() checks @requested for (@ssid, @tsid, @tclass) and,
 * when the node carries extended permissions, additionally checks @xperm
 * within the set selected by @driver and @base_perm.  The result is audited
 * through avc_xperms_audit() using @ad.
 *
 * Returns -EACCES immediately when @requested is zero.  Otherwise returns
 * the non-zero result of avc_xperms_audit() if there is one, else the
 * result of the permission check (0, or whatever avc_denied() returned).
 */
int avc_has_extended_perms(u32 ssid, u32 tsid, u16 tclass, u32 requested,
                           u8 driver, u8 base_perm, u8 xperm,
                           struct common_audit_data *ad)
{
        struct avc_node *node;
        struct av_decision avd;
        u32 denied;
        struct extended_perms_decision local_xpd;
        struct extended_perms_decision *xpd = NULL;
        struct extended_perms_data allowed;
        struct extended_perms_data auditallow;
        struct extended_perms_data dontaudit;
        struct avc_xperms_node local_xp_node;
        struct avc_xperms_node *xp_node;
        int rc = 0, rc2;

        /* Default to the on-stack node; a cache hit below replaces it. */
        xp_node = &local_xp_node;
        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();

        node = avc_lookup(ssid, tsid, tclass);
        if (unlikely(!node)) {
                /* Miss: compute the decision into avd / local_xp_node. */
                avc_compute_av(ssid, tsid, tclass, &avd, xp_node);
        } else {
                /* Hit: work on a private copy of the cached decision. */
                memcpy(&avd, &node->ae.avd, sizeof(avd));
                xp_node = node->ae.xp_node;
        }
        /* if extended permissions are not defined, only consider av_decision */
        if (!xp_node || !xp_node->xp.len)
                goto decision;

        /* Back local_xpd's three tables with on-stack storage. */
        local_xpd.allowed = &allowed;
        local_xpd.auditallow = &auditallow;
        local_xpd.dontaudit = &dontaudit;

        xpd = avc_xperms_decision_lookup(driver, base_perm, xp_node);
        if (unlikely(!xpd)) {
                /*
                 * Compute the extended_perms_decision only if the driver
                 * is flagged and the base permission is known.
                 */
                if (!security_xperm_test(xp_node->xp.drivers.p, driver) ||
                    !(xp_node->xp.base_perms & base_perm)) {
                        avd.allowed &= ~requested;
                        goto decision;
                }
                /*
                 * The RCU read-side section is left around the call to
                 * security_compute_xperms_decision() and re-entered before
                 * the node is updated with the new decision.
                 */
                rcu_read_unlock();
                security_compute_xperms_decision(ssid, tsid, tclass, driver,
                                                 base_perm, &local_xpd);
                rcu_read_lock();
                avc_update_node(AVC_CALLBACK_ADD_XPERMS, requested, driver,
                                base_perm, xperm, ssid, tsid, tclass, avd.seqno,
                                &local_xpd, 0);
        } else {
                /* Found a cached decision: copy what is needed for @xperm. */
                avc_quick_copy_xperms_decision(xperm, &local_xpd, xpd);
        }
        /* From here on only the local copy is consulted. */
        xpd = &local_xpd;

        if (!avc_xperms_has_perm(xpd, xperm, XPERMS_ALLOWED))
                avd.allowed &= ~requested;

decision:
        denied = requested & ~(avd.allowed);
        if (unlikely(denied))
                rc = avc_denied(ssid, tsid, tclass, requested, driver,
                                base_perm, xperm, AVC_EXTENDED_PERMS, &avd);

        rcu_read_unlock();

        /* xpd is still NULL here when the extended-permission path was skipped. */
        rc2 = avc_xperms_audit(ssid, tsid, tclass, requested,
                        &avd, xpd, xperm, rc, ad);
        /* A failure reported by the audit call overrides the check result. */
        if (rc2)
                return rc2;
        return rc;
}

/**
 * avc_perm_nonode - Add an entry to the AVC
 * @ssid: subject
 * @tsid: object/target
 * @tclass: object class
 * @requested: requested permissions
 * @flags: AVC flags
 * @avd: access vector decision
 *
 * This is the "we have no node" part of avc_has_perm_noaudit(), which is
 * unlikely and needs extra stack space for the new node that we generate, so
 * don't inline it.
 */
static noinline int avc_perm_nonode(u32 ssid, u32 tsid, u16 tclass,
                                    u32 requested, unsigned int flags,
                                    struct av_decision *avd)
{
        /* Scratch xperms node required by avc_compute_av(). */
        struct avc_xperms_node scratch_xp;

        avc_compute_av(ssid, tsid, tclass, avd, &scratch_xp);

        /* Any requested bit missing from avd->allowed is a denial. */
        if (unlikely(requested & ~(avd->allowed)))
                return avc_denied(ssid, tsid, tclass, requested, 0, 0, 0,
                                  flags, avd);

        return 0;
}

/**
 * avc_has_perm_noaudit - Check permissions but perform no auditing.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @flags:  AVC_STRICT or 0
 * @avd: access vector decisions
 *
 * Check the AVC to determine whether the @requested permissions are granted
 * for the SID pair (@ssid, @tsid), interpreting the permissions
 * based on @tclass, and call the security server on a cache miss to obtain
 * a new decision and add it to the cache.  Return a copy of the decisions
 * in @avd.  Return %0 if all @requested permissions are granted,
 * -%EACCES if any permissions are denied, or another -errno upon
 * other errors.  This function is typically called by avc_has_perm(),
 * but may also be called directly to separate permission checking from
 * auditing, e.g. in cases where a lock must be held for the check but
 * should be released for the auditing.
 */
inline int avc_has_perm_noaudit(u32 ssid, u32 tsid,
                                u16 tclass, u32 requested,
                                unsigned int flags,
                                struct av_decision *avd)
{
        struct avc_node *cached;
        u32 missing;

        if (WARN_ON(!requested))
                return -EACCES;

        rcu_read_lock();
        cached = avc_lookup(ssid, tsid, tclass);
        if (unlikely(!cached)) {
                /* No node: leave the RCU section and take the slow path. */
                rcu_read_unlock();
                return avc_perm_nonode(ssid, tsid, tclass, requested,
                                       flags, avd);
        }

        /* Read the cached decision while still inside the RCU section. */
        missing = requested & ~cached->ae.avd.allowed;
        memcpy(avd, &cached->ae.avd, sizeof(*avd));
        rcu_read_unlock();

        return unlikely(missing) ?
                avc_denied(ssid, tsid, tclass, requested, 0, 0, 0,
                           flags, avd) : 0;
}

/**
 * avc_has_perm - Check permissions and perform any appropriate auditing.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions, interpreted based on @tclass
 * @auditdata: auxiliary audit data
 *
 * Check the AVC to determine whether the @requested permissions are granted
 * for the SID pair (@ssid, @tsid), interpreting the permissions
 * based on @tclass, and call the security server on a cache miss to obtain
 * a new decision and add it to the cache.  Audit the granting or denial of
 * permissions in accordance with the policy.  Return %0 if all @requested
 * permissions are granted, -%EACCES if any permissions are denied, or
 * another -errno upon other errors.
 */
int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
                 u32 requested, struct common_audit_data *auditdata)
{
        struct av_decision decision;
        int check_rc, audit_rc;

        /* Flags are 0 here: the check is not made with AVC_STRICT. */
        check_rc = avc_has_perm_noaudit(ssid, tsid, tclass, requested, 0,
                                        &decision);

        audit_rc = avc_audit(ssid, tsid, tclass, requested, &decision,
                             check_rc, auditdata);

        /* A failure from the audit call overrides the check result. */
        return audit_rc ? audit_rc : check_rc;
}

/* Return the latest_notif value currently stored in the global AVC cache. */
u32 avc_policy_seqno(void)
{
        return selinux_avc.avc_cache.latest_notif;
}























































































































































































   46 





















































































  219 




   73 



























































































































































































































    4 























































































































































































































































































































































































































   50 












   96 














   84 














   50 


























































































































































  229 

































   90 

  108 


































































































   18 
   31 
















































































































































































    4 





























    4 
    4 


    4 






































































    4 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGEMAP_H
#define _LINUX_PAGEMAP_H

/*
 * Copyright 1995 Linus Torvalds
 */
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/gfp.h>
#include <linux/bitops.h>
#include <linux/hardirq.h> /* for in_interrupt() */
#include <linux/hugetlb_inline.h>

struct folio_batch;

/* Declared ahead of invalidate_remote_inode() below, which calls it. */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                        pgoff_t start, pgoff_t end);

/*
 * Invalidate the whole page-index range (0 .. -1) of @inode's mapping, but
 * only when the inode is a regular file, a directory or a symlink.
 */
static inline void invalidate_remote_inode(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode) &&
            !S_ISLNK(inode->i_mode))
                return;

        invalidate_mapping_pages(inode->i_mapping, 0, -1);
}
/* Invalidation entry points: declarations only, no bodies in this header. */
int invalidate_inode_pages2(struct address_space *mapping);
int invalidate_inode_pages2_range(struct address_space *mapping,
                pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
                             loff_t pos, loff_t end, bool nowait);

/*
 * Declarations only.  filemap_fdatawait_range() is what the inline
 * filemap_fdatawait() wrapper below calls.
 */
int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *);
int filemap_flush(struct address_space *);
int filemap_fdatawait_keep_errors(struct address_space *mapping);
int filemap_fdatawait_range(struct address_space *, loff_t lstart, loff_t lend);
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte);
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end);

/* Whole-file form of filemap_fdatawait_range(): byte range 0 .. LLONG_MAX. */
static inline int filemap_fdatawait(struct address_space *mapping)
{
        return filemap_fdatawait_range(mapping, 0, LLONG_MAX);
}

/*
 * Declarations only.  Of these, filemap_write_and_wait_range() and
 * __filemap_set_wb_err() are called by inline helpers later in this header.
 */
bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend);
int filemap_write_and_wait_range(struct address_space *mapping,
                loff_t lstart, loff_t lend);
int __filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end, int sync_mode);
int filemap_fdatawrite_range(struct address_space *mapping,
                loff_t start, loff_t end);
int filemap_check_errors(struct address_space *mapping);
void __filemap_set_wb_err(struct address_space *mapping, int err);
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc);
int kiocb_write_and_wait(struct kiocb *iocb, size_t count);

/* Whole-file form of filemap_write_and_wait_range(): bytes 0 .. LLONG_MAX. */
static inline int filemap_write_and_wait(struct address_space *mapping)
{
        return filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
}

/**
 * filemap_set_wb_err - set a writeback error on an address_space
 * @mapping: mapping in which to set writeback error
 * @err: error to be set in mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * filemap_set_wb_err to record the error in the mapping so that it will be
 * automatically reported whenever fsync is called on the file.
 */
static inline void filemap_set_wb_err(struct address_space *mapping, int err)
{
        /* Common case: no error, so there is nothing to record. */
        if (likely(!err))
                return;

        __filemap_set_wb_err(mapping, err);
}

/**
 * filemap_check_wb_err - has an error occurred since the mark was sampled?
 * @mapping: mapping to check for writeback errors
 * @since: previously-sampled errseq_t
 *
 * Grab the errseq_t value from the mapping, and see if it has changed "since"
 * the given value was sampled.
 *
 * If it has then report the latest error set, otherwise return 0.
 */
static inline int filemap_check_wb_err(struct address_space *mapping,
                                        errseq_t since)
{
        /* Check the mapping's wb_err against the caller's earlier sample. */
        return errseq_check(&mapping->wb_err, since);
}

/**
 * filemap_sample_wb_err - sample the current errseq_t to test for later errors
 * @mapping: mapping to be sampled
 *
 * Writeback errors are always reported relative to a particular sample point
 * in the past. This function provides those sample points.
 */
static inline errseq_t filemap_sample_wb_err(struct address_space *mapping)
{
        /* The sample is taken from the per-mapping wb_err field. */
        return errseq_sample(&mapping->wb_err);
}

/**
 * file_sample_sb_err - sample the current errseq_t to test for later errors
 * @file: file pointer to be sampled
 *
 * Grab the most current superblock-level errseq_t value for the given
 * struct file.
 */
static inline errseq_t file_sample_sb_err(struct file *file)
{
        /* Reach the superblock through the file's dentry (f_path.dentry->d_sb). */
        return errseq_sample(&file->f_path.dentry->d_sb->s_wb_err);
}

/*
 * Flush file data before changing attributes.  Caller must hold any locks
 * required to prevent further writes to this file until we're done setting
 * flags.
 */
static inline int inode_drain_writes(struct inode *inode)
{
        /* First inode_dio_wait(), then write back and wait on the page cache. */
        inode_dio_wait(inode);
        /* The return value is that of filemap_write_and_wait(). */
        return filemap_write_and_wait(inode->i_mapping);
}

/* True when the mapping's i_pages xarray reports itself empty. */
static inline bool mapping_empty(struct address_space *mapping)
{
        return xa_empty(&mapping->i_pages);
}

/*
 * mapping_shrinkable - test if page cache state allows inode reclaim
 * @mapping: the page cache mapping
 *
 * This checks the mapping's cache state for the purpose of inode
 * reclaim and LRU management.
 *
 * The caller is expected to hold the i_lock, but is not required to
 * hold the i_pages lock, which usually protects cache state. That's
 * because the i_lock and the list_lru lock that protect the inode and
 * its LRU state don't nest inside the irq-safe i_pages lock.
 *
 * Cache deletions are performed under the i_lock, which ensures that
 * when an inode goes empty, it will reliably get queued on the LRU.
 *
 * Cache additions do not acquire the i_lock and may race with this
 * check, in which case we'll report the inode as shrinkable when it
 * has cache pages. This is okay: the shrinker also checks the
 * refcount and the referenced bit, which will be elevated or set in
 * the process of adding new cache pages to an inode.
 */
static inline bool mapping_shrinkable(struct address_space *mapping)
{
        void *root;

        /*
         * With highmem, inodes can put pressure on lowmem before the page
         * cache puts pressure on highmem, so the cache state is not
         * consulted at all: the inode is always shrinkable.
         */
        if (IS_ENABLED(CONFIG_HIGHMEM))
                return true;

        root = rcu_access_pointer(mapping->i_pages.xa_head);

        /* Nothing cached at all: shrink away. */
        if (!root)
                return true;

        /*
         * A lone offset-0 entry is stored directly in the xarray head
         * pointer.  If that entry is a value (non-resident) entry it has no
         * xarray node for the shadow shrinker to find, so the inode
         * shrinker has to pick it up under memory pressure.
         */
        return !xa_is_node(root) && xa_is_value(root);
}

/*
 * Bits in mapping->flags.
 *
 * AS_EIO .. AS_INACCESSIBLE are bit numbers, used with set_bit(),
 * clear_bit() and test_bit() on mapping->flags.
 *
 * AS_FOLIO_ORDER_BITS is the width of one folio-order field.
 * AS_FOLIO_ORDER_MIN and AS_FOLIO_ORDER_MAX are the shifts of the two
 * adjacent fields that hold the minimum and the maximum folio order.
 */
enum mapping_flags {
        AS_EIO = 0,                     /* IO error on async write */
        AS_ENOSPC = 1,                  /* ENOSPC on async write */
        AS_MM_ALL_LOCKS = 2,            /* under mm_take_all_locks() */
        AS_UNEVICTABLE = 3,             /* e.g., ramdisk, SHM_LOCK */
        AS_EXITING = 4,                 /* final truncate in progress */
        /* writeback related tags are not used */
        AS_NO_WRITEBACK_TAGS = 5,
        AS_RELEASE_ALWAYS = 6,          /* Call ->release_folio(), even if no private data */
        AS_STABLE_WRITES = 7,           /* must wait for writeback before modifying
                                           folio contents */
        AS_INACCESSIBLE = 8,            /* Do not attempt direct R/W access to the mapping */
        /* Bits 16-25 are used for FOLIO_ORDER */
        AS_FOLIO_ORDER_BITS = 5,
        AS_FOLIO_ORDER_MIN = 16,
        AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS,
};

/*
 * Field masks for the folio-order bits.
 *
 * They are built from 1UL rather than 1u so that each mask is as wide as
 * the unsigned long flags word.  With an unsigned int mask,
 * "flags & ~AS_FOLIO_ORDER_MASK" zero-extends the inverted mask on 64-bit
 * builds and silently clears bits 32-63 of the flags word.
 */
#define AS_FOLIO_ORDER_BITS_MASK ((1UL << AS_FOLIO_ORDER_BITS) - 1)
#define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN)
#define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX)
#define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK)

/**
 * mapping_set_error - record a writeback error in the address_space
 * @mapping: the mapping in which an error should be set
 * @error: the error to set in the mapping
 *
 * When writeback fails in some way, we must record that error so that
 * userspace can be informed when fsync and the like are called.  We endeavor
 * to report errors on any file that was open at the time of the error.  Some
 * internal callers also need to know when writeback errors have occurred.
 *
 * When a writeback error occurs, most filesystems will want to call
 * mapping_set_error to record the error in the mapping so that it can be
 * reported when the application calls fsync(2).
 */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (likely(!error))
                return;

        /* Record in wb_err for checkers using errseq_t based tracking */
        __filemap_set_wb_err(mapping, error);

        /* Record it in superblock */
        if (mapping->host)
                errseq_set(&mapping->host->i_sb->s_wb_err, error);

        /* Record it in flags for now, for legacy callers */
        if (error == -ENOSPC)
                set_bit(AS_ENOSPC, &mapping->flags);
        else
                set_bit(AS_EIO, &mapping->flags);
}

/* Set the AS_UNEVICTABLE bit in mapping->flags. */
static inline void mapping_set_unevictable(struct address_space *mapping)
{
        set_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Clear the AS_UNEVICTABLE bit in mapping->flags. */
static inline void mapping_clear_unevictable(struct address_space *mapping)
{
        clear_bit(AS_UNEVICTABLE, &mapping->flags);
}

static inline bool mapping_unevictable(struct address_space *mapping)
{
        return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags);
}

/* Set the AS_EXITING bit in mapping->flags. */
static inline void mapping_set_exiting(struct address_space *mapping)
{
        set_bit(AS_EXITING, &mapping->flags);
}

/* Non-zero when the AS_EXITING bit is set in mapping->flags. */
static inline int mapping_exiting(struct address_space *mapping)
{
        return test_bit(AS_EXITING, &mapping->flags);
}

/* Set the AS_NO_WRITEBACK_TAGS bit in mapping->flags. */
static inline void mapping_set_no_writeback_tags(struct address_space *mapping)
{
        set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/* Inverse of the flag: non-zero when AS_NO_WRITEBACK_TAGS is NOT set. */
static inline int mapping_use_writeback_tags(struct address_space *mapping)
{
        return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
}

/* True when the AS_RELEASE_ALWAYS bit is set in mapping->flags. */
static inline bool mapping_release_always(const struct address_space *mapping)
{
        return test_bit(AS_RELEASE_ALWAYS, &mapping->flags);
}

/* Set the AS_RELEASE_ALWAYS bit in mapping->flags. */
static inline void mapping_set_release_always(struct address_space *mapping)
{
        set_bit(AS_RELEASE_ALWAYS, &mapping->flags);
}

/* Clear the AS_RELEASE_ALWAYS bit in mapping->flags. */
static inline void mapping_clear_release_always(struct address_space *mapping)
{
        clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
}

/* True when the AS_STABLE_WRITES bit is set in mapping->flags. */
static inline bool mapping_stable_writes(const struct address_space *mapping)
{
        return test_bit(AS_STABLE_WRITES, &mapping->flags);
}

/* Set the AS_STABLE_WRITES bit in mapping->flags. */
static inline void mapping_set_stable_writes(struct address_space *mapping)
{
        set_bit(AS_STABLE_WRITES, &mapping->flags);
}

/* Clear the AS_STABLE_WRITES bit in mapping->flags. */
static inline void mapping_clear_stable_writes(struct address_space *mapping)
{
        clear_bit(AS_STABLE_WRITES, &mapping->flags);
}

/* Mark the mapping inaccessible; this also marks it unevictable. */
static inline void mapping_set_inaccessible(struct address_space *mapping)
{
        /*
         * Inaccessible mappings are expected to be unevictable as well, so
         * both bits are set here.  The compaction migrate scanner
         * (isolate_migratepages_block()) relies on this to reduce page
         * locking.
         */
        set_bit(AS_UNEVICTABLE, &mapping->flags);
        set_bit(AS_INACCESSIBLE, &mapping->flags);
}

/* True when the AS_INACCESSIBLE bit is set in mapping->flags. */
static inline bool mapping_inaccessible(struct address_space *mapping)
{
        return test_bit(AS_INACCESSIBLE, &mapping->flags);
}

/* Return the gfp mask stored in the mapping. */
static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
{
        return mapping->gfp_mask;
}

/*
 * Restrict @gfp_mask to what the mapping allows: the result keeps only the
 * bits present in both @gfp_mask and the mapping's own gfp mask.
 */
static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                gfp_t gfp_mask)
{
        gfp_t allowed = mapping_gfp_mask(mapping);

        return allowed & gfp_mask;
}

/*
 * This is non-atomic.  Only to be used before the mapping is activated.
 * Probably needs a barrier...
 */
static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
{
        /* Plain store: no atomic operation or barrier is used here. */
        m->gfp_mask = mask;
}

/*
 * There are some parts of the kernel which assume that PMD entries
 * are exactly HPAGE_PMD_ORDER.  Those should be fixed, but until then,
 * limit the maximum allocation order to PMD size.  I'm not aware of any
 * assumptions about maximum order if THP are disabled, but 8 seems like
 * a good order (that's 1MB if you're using 4kB pages)
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* With THP: cap the preferred order at PMD size. */
#define PREFERRED_MAX_PAGECACHE_ORDER        HPAGE_PMD_ORDER
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
/* Without THP: a fixed order of 8. */
#define PREFERRED_MAX_PAGECACHE_ORDER        8
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * xas_split_alloc() does not support arbitrary orders. This implies no
 * 512MB THP on ARM64 with 64KB base page size.
 */
/* Order limit derived from the xarray chunk shift. */
#define MAX_XAS_ORDER                (XA_CHUNK_SHIFT * 2 - 1)
/* Effective cap: the smaller of the xarray limit and the preferred order. */
#define MAX_PAGECACHE_ORDER        min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)

/*
 * mapping_max_folio_size_supported() - Check the max folio size supported
 *
 * The filesystem should call this function at mount time if there is a
 * requirement on the folio mapping size in the page cache.
 */
static inline size_t mapping_max_folio_size_supported(void)
{
        /* Without THP the largest supported folio is a single page. */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return PAGE_SIZE;

        /* Size in bytes of a folio of order MAX_PAGECACHE_ORDER. */
        return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER);
}

/*
 * mapping_set_folio_order_range() - Set the orders supported by a file.
 * @mapping: The address space of the file.
 * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive).
 * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive).
 *
 * The filesystem should call this function in its inode constructor to
 * indicate which base size (min) and maximum size (max) of folio the VFS
 * can use to cache the contents of the file.  This should only be used
 * if the filesystem needs special handling of folio sizes (ie there is
 * something the core cannot know).
 * Do not tune it based on, eg, i_size.
 *
 * Context: This should not be called while the inode is active as it
 * is non-atomic.
 */
static inline void mapping_set_folio_order_range(struct address_space *mapping,
                                                 unsigned int min,
                                                 unsigned int max)
{
        unsigned int lo, hi;

        /* Without THP the order fields are left untouched. */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return;

        /* Clamp both bounds to MAX_PAGECACHE_ORDER ... */
        lo = (min > MAX_PAGECACHE_ORDER) ? MAX_PAGECACHE_ORDER : min;
        hi = (max > MAX_PAGECACHE_ORDER) ? MAX_PAGECACHE_ORDER : max;

        /* ... and never let the maximum fall below the minimum. */
        if (hi < lo)
                hi = lo;

        /* Replace both order fields in one store, keeping all other bits. */
        mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) |
                         (lo << AS_FOLIO_ORDER_MIN) |
                         (hi << AS_FOLIO_ORDER_MAX);
}

/*
 * Set the minimum folio order for @mapping; the maximum order is set to
 * MAX_PAGECACHE_ORDER.
 */
static inline void mapping_set_folio_min_order(struct address_space *mapping,
                                               unsigned int min)
{
        const unsigned int order_cap = MAX_PAGECACHE_ORDER;

        mapping_set_folio_order_range(mapping, min, order_cap);
}

/**
 * mapping_set_large_folios() - Indicate the file supports large folios.
 * @mapping: The address space of the file.
 *
 * The filesystem should call this function in its inode constructor to
 * indicate that the VFS can use large folios to cache the contents of
 * the file.
 *
 * Context: This should not be called while the inode is active as it
 * is non-atomic.
 */
static inline void mapping_set_large_folios(struct address_space *mapping)
{
        const unsigned int min_order = 0;

        /* Permit every order from 0 up to the page cache limit. */
        mapping_set_folio_order_range(mapping, min_order, MAX_PAGECACHE_ORDER);
}

/*
 * Return the maximum folio order stored in @mapping's flags, or 0 when
 * CONFIG_TRANSPARENT_HUGEPAGE is not enabled.
 */
static inline unsigned int
mapping_max_folio_order(const struct address_space *mapping)
{
        unsigned int order = 0;

        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                order = (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >>
                        AS_FOLIO_ORDER_MAX;

        return order;
}

/*
 * Return the minimum folio order stored in @mapping's flags, or 0 when
 * CONFIG_TRANSPARENT_HUGEPAGE is not enabled.
 */
static inline unsigned int
mapping_min_folio_order(const struct address_space *mapping)
{
        unsigned int order = 0;

        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                order = mapping->flags & AS_FOLIO_ORDER_MIN_MASK;
                order >>= AS_FOLIO_ORDER_MIN;
        }

        return order;
}

/* Return the number of pages in a folio of the mapping's minimum order. */
static inline unsigned long
mapping_min_folio_nrpages(struct address_space *mapping)
{
        const unsigned int order = mapping_min_folio_order(mapping);

        return 1UL << order;
}

/**
 * mapping_align_index() - Align index for this mapping.
 * @mapping: The address_space.
 * @index: The page index.
 *
 * The index of a folio must be naturally aligned.  If you are adding a
 * new folio to the page cache and need to know what index to give it,
 * call this function.
 */
static inline pgoff_t mapping_align_index(struct address_space *mapping,
                                          pgoff_t index)
{
        /* Alignment unit: page count of a minimum-order folio. */
        const unsigned long nr = mapping_min_folio_nrpages(mapping);

        return round_down(index, nr);
}

/*
 * Large folio support currently depends on THP.  These dependencies are
 * being worked on but are not yet fixed.
 */
static inline bool mapping_large_folio_support(struct address_space *mapping)
{
        /* AS_FOLIO_ORDER is only reasonable for pagecache folios */
        VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON,
                        "Anonymous mapping always supports large folio");

        /* Any non-zero maximum order means large folios are allowed. */
        return mapping_max_folio_order(mapping) != 0;
}

/* Return the maximum folio size for this pagecache mapping, in bytes. */
static inline size_t mapping_max_folio_size(const struct address_space *mapping)
{
        const unsigned int order = mapping_max_folio_order(mapping);

        /* One page scaled up by the maximum order. */
        return PAGE_SIZE << order;
}

/*
 * Read the mapping's nr_thps counter.  Always 0 when
 * CONFIG_READ_ONLY_THP_FOR_FS is not defined.
 */
static inline int filemap_nr_thps(struct address_space *mapping)
{
        int nr = 0;

#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        nr = atomic_read(&mapping->nr_thps);
#endif
        return nr;
}

static inline void filemap_nr_thps_inc(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (!mapping_large_folio_support(mapping))
                atomic_inc(&mapping->nr_thps);
#else
        WARN_ON_ONCE(mapping_large_folio_support(mapping) == 0);
#endif
}

/*
 * Decrement the mapping's nr_thps counter, but only for mappings without
 * large folio support.  Without CONFIG_READ_ONLY_THP_FOR_FS there is no
 * counter, and a warning is issued once if the mapping lacks large folio
 * support.
 */
static inline void filemap_nr_thps_dec(struct address_space *mapping)
{
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        if (mapping_large_folio_support(mapping))
                return;
        atomic_dec(&mapping->nr_thps);
#else
        WARN_ON_ONCE(!mapping_large_folio_support(mapping));
#endif
}

/*
 * Declarations only: both take a folio and return a pointer to a
 * struct address_space.
 */
struct address_space *folio_mapping(struct folio *);
struct address_space *swapcache_mapping(struct folio *);

/**
 * folio_flush_mapping - Find the file mapping this folio belongs to.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the mapping that this
 * page belongs to.  Anonymous folios return NULL, even if they're in
 * the swap cache.  Other kinds of folio also return NULL.
 *
 * This is ONLY used by architecture cache flushing code.  If you aren't
 * writing cache flushing code, you want either folio_mapping() or
 * folio_file_mapping().
 */
static inline struct address_space *folio_flush_mapping(struct folio *folio)
{
        /* Swap cache folios report no mapping here. */
        return unlikely(folio_test_swapcache(folio)) ?
                NULL : folio_mapping(folio);
}

/**
 * folio_inode - Get the host inode for this folio.
 * @folio: The folio.
 *
 * For folios which are in the page cache, return the inode that this folio
 * belongs to.
 *
 * Do not call this for folios which aren't in the page cache.
 */
static inline struct inode *folio_inode(struct folio *folio)
{
        return folio->mapping->host;
}

/**
 * folio_attach_private - Attach private data to a folio.
 * @folio: Folio to attach data to.
 * @data: Data to attach to folio.
 *
 * Attaching private data to a folio increments the page's reference count.
 * The data must be detached before the folio will be freed.
 */
static inline void folio_attach_private(struct folio *folio, void *data)
{
        /* Take the reference that folio_detach_private() later drops. */
        folio_get(folio);
        /* Store the pointer first, then set the private flag on the folio. */
        folio->private = data;
        folio_set_private(folio);
}

/**
 * folio_change_private - Change private data on a folio.
 * @folio: Folio to change the data on.
 * @data: Data to set on the folio.
 *
 * Change the private data attached to a folio and return the old
 * data.  The page must previously have had data attached and the data
 * must be detached before the folio will be freed.
 *
 * Return: Data that was previously attached to the folio.
 */
static inline void *folio_change_private(struct folio *folio, void *data)
{
        /* Fetch the current pointer before it is overwritten. */
        void *prev = folio_get_private(folio);

        /* Neither the refcount nor the private flag is touched here. */
        folio->private = data;

        return prev;
}

/**
 * folio_detach_private - Detach private data from a folio.
 * @folio: Folio to detach data from.
 *
 * Removes the data that was previously attached to the folio and decrements
 * the refcount on the page.
 *
 * Return: Data that was attached to the folio.
 */
static inline void *folio_detach_private(struct folio *folio)
{
        void *priv = folio_get_private(folio);

        if (folio_test_private(folio)) {
                /* Undo folio_attach_private(): flag, pointer, reference. */
                folio_clear_private(folio);
                folio->private = NULL;
                folio_put(folio);
                return priv;
        }

        /* Nothing was attached. */
        return NULL;
}

/* Page-based wrapper: attach @data to the folio that contains @page. */
static inline void attach_page_private(struct page *page, void *data)
{
        struct folio *folio = page_folio(page);

        folio_attach_private(folio, data);
}

/*
 * Page-based wrapper: detach and return the private data of the folio that
 * contains @page.
 */
static inline void *detach_page_private(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_detach_private(folio);
}

/*
 * With CONFIG_NUMA only the prototype is provided here; otherwise the
 * function is an inline wrapper that forwards to folio_alloc_noprof().
 */
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order);
#else
static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        return folio_alloc_noprof(gfp, order);
}
#endif

/* Passes its arguments to filemap_alloc_folio_noprof() inside alloc_hooks(). */
#define filemap_alloc_folio(...)                                \
        alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__))

/* Allocate an order-0 folio with @gfp and return the address of its page member. */
static inline struct page *__page_cache_alloc(gfp_t gfp)
{
        struct folio *folio = filemap_alloc_folio(gfp, 0);

        return &folio->page;
}

/* The mapping's gfp mask with __GFP_NORETRY and __GFP_NOWARN added. */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t gfp = mapping_gfp_mask(x);

        gfp |= __GFP_NORETRY | __GFP_NOWARN;
        return gfp;
}

/*
 * Callback type taking a file and a folio and returning an int.  It is the
 * type of the @filler argument of read_cache_folio() and read_cache_page().
 */
typedef int filler_t(struct file *, struct folio *);

/*
 * Declarations only.  Both take a mapping, a starting page index and a
 * max_scan count, and return a page index.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan);

/**
 * typedef fgf_t - Flags for getting folios from the page cache.
 *
 * Most users of the page cache will not need to use these flags;
 * there are convenience functions such as filemap_get_folio() and
 * filemap_lock_folio().  For users which need more control over exactly
 * what is done with the folios, these flags to __filemap_get_folio()
 * are available.
 *
 * * %FGP_ACCESSED - The folio will be marked accessed.
 * * %FGP_LOCK - The folio is returned locked.
 * * %FGP_CREAT - If no folio is present then a new folio is allocated,
 *   added to the page cache and the VM's LRU list.  The folio is
 *   returned locked.
 * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
 *   folio is already in cache.  If the folio was allocated, unlock it
 *   before returning so the caller can do the same dance.
 * * %FGP_WRITE - The folio will be written to by the caller.
 * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
 * * %FGP_NOWAIT - Don't block on the folio lock.
 * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
 * * %FGP_DONTCACHE - Uncached buffered IO
 * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
 *   implementation.
 */
typedef unsigned int __bitwise fgf_t;

/* Individual flag bits; each FGP_* value below occupies a distinct bit. */
#define FGP_ACCESSED                ((__force fgf_t)0x00000001)
#define FGP_LOCK                ((__force fgf_t)0x00000002)
#define FGP_CREAT                ((__force fgf_t)0x00000004)
#define FGP_WRITE                ((__force fgf_t)0x00000008)
#define FGP_NOFS                ((__force fgf_t)0x00000010)
#define FGP_NOWAIT                ((__force fgf_t)0x00000020)
#define FGP_FOR_MMAP                ((__force fgf_t)0x00000040)
#define FGP_STABLE                ((__force fgf_t)0x00000080)
#define FGP_DONTCACHE                ((__force fgf_t)0x00000100)
/* Recovers the order that fgf_set_order() stores by shifting it left by 26. */
#define FGF_GET_ORDER(fgf)        (((__force unsigned)fgf) >> 26)        /* top 6 bits */

/* Combination of lock, write, create and stable flags. */
#define FGP_WRITEBEGIN                (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)

/*
 * Convert a byte size to a folio order: ilog2(@size) minus PAGE_SHIFT, or 0
 * when ilog2(@size) does not exceed PAGE_SHIFT.
 */
static inline unsigned int filemap_get_order(size_t size)
{
        const unsigned int shift = ilog2(size);

        return shift > PAGE_SHIFT ? shift - PAGE_SHIFT : 0;
}

/**
 * fgf_set_order - Encode a length in the fgf_t flags.
 * @size: The suggested size of the folio to create.
 *
 * The caller of __filemap_get_folio() can use this to suggest a preferred
 * size for the folio that is created.  If there is already a folio at
 * the index, it will be returned, no matter what its size.  If a folio
 * is freshly created, it may be of a different size than requested
 * due to alignment constraints, memory pressure, or the presence of
 * other folios at nearby indices.
 */
static inline fgf_t fgf_set_order(size_t size)
{
        const unsigned int order = filemap_get_order(size);

        /* Order 0 encodes as "no flags". */
        if (order == 0)
                return 0;

        /* The order lives in the bits that FGF_GET_ORDER() reads back. */
        return (__force fgf_t)(order << 26);
}

/*
 * Declarations only.  __filemap_get_folio() and pagecache_get_page() take
 * the same arguments (mapping, index, FGP flags, gfp) and differ in
 * returning a folio versus a page; the inline lookup helpers in this header
 * are wrappers around them.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp);

/**
 * filemap_get_folio - Find and get a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned with an increased refcount.
 *
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_get_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Plain lookup: no FGP flags and a zero gfp mask. */
        const fgf_t no_flags = 0;

        return __filemap_get_folio(mapping, index, no_flags, 0);
}

/**
 * filemap_lock_folio - Find and lock a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 *
 * Looks up the page cache entry at @mapping & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * Context: May sleep.
 * Return: A folio or ERR_PTR(-ENOENT) if there is no folio in the cache for
 * this index.  Will not return a shadow, swap or DAX entry.
 */
static inline struct folio *filemap_lock_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Lookup only, with the folio returned locked. */
        const fgf_t fgp = FGP_LOCK;

        return __filemap_get_folio(mapping, index, fgp, 0);
}

/**
 * filemap_grab_folio - grab a folio from the page cache
 * @mapping: The address space to search
 * @index: The page index
 *
 * Looks up the page cache entry at @mapping & @index. If no folio is found,
 * a new folio is created. The folio is locked, marked as accessed, and
 * returned.
 *
 * Return: A found or created folio. ERR_PTR(-ENOMEM) if no folio is found
 * and failed to create a folio.
 */
static inline struct folio *filemap_grab_folio(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Lock, mark accessed, and create the folio if it is missing. */
        const fgf_t fgp = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
        const gfp_t gfp = mapping_gfp_mask(mapping);

        return __filemap_get_folio(mapping, index, fgp, gfp);
}

/**
 * find_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * Otherwise, %NULL is returned.
 */
static inline struct page *find_get_page(struct address_space *mapping,
                                        pgoff_t offset)
{
        /* Plain lookup: no FGP flags and a zero gfp mask. */
        const fgf_t no_flags = 0;

        return pagecache_get_page(mapping, offset, no_flags, 0);
}

/* Like find_get_page(), but with caller-supplied FGP flags; the gfp mask is 0. */
static inline struct page *find_get_page_flags(struct address_space *mapping,
                                        pgoff_t offset, fgf_t fgp_flags)
{
        const gfp_t no_gfp = 0;

        return pagecache_get_page(mapping, offset, fgp_flags, no_gfp);
}

/**
 * find_lock_page - locate, pin and lock a pagecache page
 * @mapping: the address_space to search
 * @index: the page index
 *
 * Looks up the page cache entry at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * Context: May sleep.
 * Return: A struct page or %NULL if there is no page in the cache for this
 * index.
 */
static inline struct page *find_lock_page(struct address_space *mapping,
                                        pgoff_t index)
{
        /* Lookup only, with the page returned locked. */
        const fgf_t fgp = FGP_LOCK;

        return pagecache_get_page(mapping, index, fgp, 0);
}

/**
 * find_or_create_page - locate or add a pagecache page
 * @mapping: the page's address_space
 * @index: the page's index into the mapping
 * @gfp_mask: page allocation mode
 *
 * Looks up the page cache slot at @mapping & @index.  If there is a
 * page cache page, it is returned locked and with an increased
 * refcount.
 *
 * If the page is not present, a new page is allocated using @gfp_mask
 * and added to the page cache and the VM's LRU list.  The page is
 * returned locked and with an increased refcount.
 *
 * On memory exhaustion, %NULL is returned.
 *
 * find_or_create_page() may sleep, even if @gfp_mask specifies an
 * atomic allocation!
 */
static inline struct page *find_or_create_page(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask)
{
        /* Lock, mark accessed, and create the page if it is missing. */
        const fgf_t fgp = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;

        return pagecache_get_page(mapping, index, fgp, gfp_mask);
}

/**
 * grab_cache_page_nowait - returns locked page at given index in given cache
 * @mapping: target address_space
 * @index: the page index
 *
 * Same as grab_cache_page(), but do not wait if the page is unavailable.
 * This is intended for speculative data generators, where the data can
 * be regenerated if the page couldn't be grabbed.  This routine should
 * be safe to call while holding the lock for another page.
 *
 * Clear __GFP_FS when allocating the page to avoid recursion into the fs
 * and deadlock against the caller's locked page.
 */
static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
                                pgoff_t index)
{
        /* Create if missing, but neither recurse into the fs nor block. */
        const fgf_t fgp = FGP_LOCK | FGP_CREAT | FGP_NOFS | FGP_NOWAIT;
        const gfp_t gfp = mapping_gfp_mask(mapping);

        return pagecache_get_page(mapping, index, fgp, gfp);
}

/* Declaration only; folio_index() calls this for folios that test as swapcache. */
extern pgoff_t __folio_swap_cache_index(struct folio *folio);

/**
 * folio_index - File index of a folio.
 * @folio: The folio.
 *
 * For a folio which is either in the page cache or the swap cache,
 * return its index within the address_space it belongs to.  If you know
 * the page is definitely in the page cache, you can look at the folio's
 * index directly.
 *
 * Return: The index (offset in units of pages) of a folio in its file.
 */
static inline pgoff_t folio_index(struct folio *folio)
{
        /* Common case: not in the swap cache, so the stored index is used. */
        if (likely(!folio_test_swapcache(folio)))
                return folio->index;

        return __folio_swap_cache_index(folio);
}

/**
 * folio_next_index - Get the index of the next folio.
 * @folio: The current folio.
 *
 * Return: The index of the folio which follows this folio in the file.
 */
static inline pgoff_t folio_next_index(struct folio *folio)
{
        const pgoff_t first = folio->index;

        /* First index past the pages covered by this folio. */
        return first + folio_nr_pages(folio);
}

/**
 * folio_file_page - The page for a particular index.
 * @folio: The folio which contains this index.
 * @index: The index we want to look up.
 *
 * Sometimes after looking up a folio in the page cache, we need to
 * obtain the specific page for an index (eg a page fault).
 *
 * Return: The page containing the file data for this index.
 */
static inline struct page *folio_file_page(struct folio *folio, pgoff_t index)
{
        /* Mask that reduces @index to an offset within the folio. */
        const unsigned long mask = folio_nr_pages(folio) - 1;

        return folio_page(folio, index & mask);
}

/**
 * folio_contains - Does this folio contain this index?
 * @folio: The folio.
 * @index: The page index within the file.
 *
 * Context: The caller should have the page locked in order to prevent
 * (eg) shmem from moving the page between the page cache and swap cache
 * and changing its index in the middle of the operation.
 * Return: true or false.
 */
static inline bool folio_contains(struct folio *folio, pgoff_t index)
{
        /*
         * Unsigned distance from the folio's first index; an @index below
         * the folio wraps to a large value and fails the comparison.
         */
        const pgoff_t delta = index - folio_index(folio);

        return delta < folio_nr_pages(folio);
}

/*
 * Given the page we found in the page cache, return the page corresponding
 * to this index in the file
 */
static inline struct page *find_subpage(struct page *head, pgoff_t index)
{
        /* HugeTLBfs wants the head page regardless */
        return PageHuge(head) ?
                head : head + (index & (thp_nr_pages(head) - 1));
}

/*
 * Declarations only.  Each takes a mapping, a pointer to a start index, an
 * end index and a folio_batch, and returns an unsigned value; the _tag
 * variant additionally takes an xa_mark_t.
 * NOTE(review): the return value is presumably the number of folios placed
 * in @fbatch - confirm against the definitions.
 */
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch);
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch);

/*
 * Returns locked page at given index in given cache, creating it if needed.
 */
static inline struct page *grab_cache_page(struct address_space *mapping,
                                                                pgoff_t index)
{
        /* Allocate, if needed, with the mapping's own gfp mask. */
        const gfp_t gfp = mapping_gfp_mask(mapping);

        return find_or_create_page(mapping, index, gfp);
}

/*
 * Declarations only.  The folio and page variants take the same arguments;
 * the *_gfp variants take a gfp mask in place of the filler callback and
 * file pointer.
 */
struct folio *read_cache_folio(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
struct folio *mapping_read_folio_gfp(struct address_space *, pgoff_t index,
                gfp_t flags);
struct page *read_cache_page(struct address_space *, pgoff_t index,
                filler_t *filler, struct file *file);
extern struct page * read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);

/* Call read_cache_page() for @index with a NULL filler callback. */
static inline struct page *read_mapping_page(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        filler_t *no_filler = NULL;

        return read_cache_page(mapping, index, no_filler, file);
}

/* Call read_cache_folio() for @index with a NULL filler callback. */
static inline struct folio *read_mapping_folio(struct address_space *mapping,
                                pgoff_t index, struct file *file)
{
        filler_t *no_filler = NULL;

        return read_cache_folio(mapping, index, no_filler, file);
}

/**
 * page_pgoff - Calculate the logical page offset of this page.
 * @folio: The folio containing this page.
 * @page: The page which we need the offset of.
 *
 * For file pages, this is the offset from the beginning of the file
 * in units of PAGE_SIZE.  For anonymous pages, this is the offset from
 * the beginning of the anon_vma in units of PAGE_SIZE.  This will
 * return nonsense for KSM pages.
 *
 * Context: Caller must have a reference on the folio or otherwise
 * prevent it from being split or freed.
 *
 * Return: The offset in units of PAGE_SIZE.
 */
static inline pgoff_t page_pgoff(const struct folio *folio,
                const struct page *page)
{
        /* Position of @page inside @folio, in pages. */
        const pgoff_t within = folio_page_idx(folio, page);

        return folio->index + within;
}

/**
 * folio_pos - Returns the byte position of this folio in its file.
 * @folio: The folio.
 */
static inline loff_t folio_pos(const struct folio *folio)
{
        return ((loff_t)folio->index) * PAGE_SIZE;
}

/*
 * Return byte-offset into filesystem object for page.
 */
static inline loff_t page_offset(struct page *page)
{
        struct folio *folio = page_folio(page);
        loff_t pos = folio_pos(folio);

        /* Add the byte offset of @page within its folio. */
        pos += folio_page_idx(folio, page) * PAGE_SIZE;

        return pos;
}

/*
 * Get the offset in PAGE_SIZE (even for hugetlb folios).
 */
static inline pgoff_t folio_pgoff(struct folio *folio)
{
        /* The stored index is returned unchanged. */
        const pgoff_t pgoff = folio->index;

        return pgoff;
}

/*
 * Translate @address within @vma into a page offset: the page distance from
 * vma->vm_start plus vma->vm_pgoff.
 */
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
                                        unsigned long address)
{
        const unsigned long offset = address - vma->vm_start;

        return vma->vm_pgoff + (offset >> PAGE_SHIFT);
}

/*
 * Key that wake_page_match() compares a waiter against.
 */
struct wait_page_key {
        /* Folio and bit number compared against the waiter's by wake_page_match(). */
        struct folio *folio;
        int bit_nr;
        /* Set to 1 by wake_page_match() when the waiter's folio equals @folio. */
        int page_match;
};

/*
 * A waiter: the folio and bit number checked by wake_page_match(), plus an
 * embedded wait queue entry.
 */
struct wait_page_queue {
        struct folio *folio;
        int bit_nr;
        wait_queue_entry_t wait;
};

/*
 * Check whether a waiter matches a wake-up key.  Returns true only when both
 * the folio and the bit number match; key->page_match is set to 1 whenever
 * the folio matches, even if the bit number does not.
 */
static inline bool wake_page_match(struct wait_page_queue *wait_page,
                                  struct wait_page_key *key)
{
        if (wait_page->folio != key->folio)
                return false;

        /* Record the folio match before looking at the bit number. */
        key->page_match = 1;

        return wait_page->bit_nr == key->bit_nr;
}

/*
 * Declarations only.  The __folio_lock*() functions are what the inline
 * folio_lock(), folio_lock_killable() and folio_lock_or_retry() wrappers
 * call when folio_trylock() fails.
 */
void __folio_lock(struct folio *folio);
int __folio_lock_killable(struct folio *folio);
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf);
void unlock_page(struct page *page);
void folio_unlock(struct folio *folio);

/**
 * folio_trylock() - Attempt to lock a folio.
 * @folio: The folio to attempt to lock.
 *
 * Sometimes it is undesirable to wait for a folio to be unlocked (eg
 * when the locks are being taken in the wrong order, or if making
 * progress through a batch of folios is more important than processing
 * them in order).  Usually folio_lock() is the correct function to call.
 *
 * Context: Any context.
 * Return: Whether the lock was successfully acquired.
 */
static inline bool folio_trylock(struct folio *folio)
{
        return likely(!test_and_set_bit_lock(PG_locked, folio_flags(folio, 0)));
}

/*
 * Return true if the page was successfully locked
 */
static inline bool trylock_page(struct page *page)
{
        /* The lock belongs to the folio that contains the page. */
        struct folio *folio = page_folio(page);

        return folio_trylock(folio);
}

/**
 * folio_lock() - Lock this folio.
 * @folio: The folio to lock.
 *
 * The folio lock protects against many things, probably more than it
 * should.  It is primarily held while a folio is being brought uptodate,
 * either from its backing file or from swap.  It is also held while a
 * folio is being truncated from its address_space, so holding the lock
 * is sufficient to keep folio->mapping stable.
 *
 * The folio lock is also held while write() is modifying the page to
 * provide POSIX atomicity guarantees (as long as the write does not
 * cross a page boundary).  Other modifications to the data in the folio
 * do not hold the folio lock and can race with writes, eg DMA and stores
 * to mapped pages.
 *
 * Context: May sleep.  If you need to acquire the locks of two or
 * more folios, they must be in order of ascending index, if they are
 * in the same address_space.  If they are in different address_spaces,
 * acquire the lock of the folio which belongs to the address_space which
 * has the lowest address in memory first.
 */
static inline void folio_lock(struct folio *folio)
{
        might_sleep();

        /* Fast path: the lock was free. */
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * lock_page() - Lock the folio containing this page.
 * @page: The page to lock.
 *
 * See folio_lock() for a description of what the lock protects.
 * This is a legacy function and new code should probably use folio_lock()
 * instead.
 *
 * Context: May sleep.  Pages in the same folio share a lock, so do not
 * attempt to lock two pages which share a folio.
 */
static inline void lock_page(struct page *page)
{
        struct folio *folio;

        might_sleep();

        folio = page_folio(page);

        /* Fast path: the lock was free. */
        if (folio_trylock(folio))
                return;

        __folio_lock(folio);
}

/**
 * folio_lock_killable() - Lock this folio, interruptible by a fatal signal.
 * @folio: The folio to lock.
 *
 * Attempts to lock the folio, like folio_lock(), except that the sleep
 * to acquire the lock is interruptible by a fatal signal.
 *
 * Context: May sleep; see folio_lock().
 * Return: 0 if the lock was acquired; -EINTR if a fatal signal was received.
 */
static inline int folio_lock_killable(struct folio *folio)
{
        might_sleep();

        /* Fast path: the lock was free. */
        if (folio_trylock(folio))
                return 0;

        return __folio_lock_killable(folio);
}

/*
 * folio_lock_or_retry - Lock the folio, unless this would block and the
 * caller indicated that it can handle a retry.
 *
 * Return value and mmap_lock implications depend on flags; see
 * __folio_lock_or_retry().
 */
static inline vm_fault_t folio_lock_or_retry(struct folio *folio,
                                             struct vm_fault *vmf)
{
        might_sleep();

        /* Fast path: the lock was free. */
        if (folio_trylock(folio))
                return 0;

        return __folio_lock_or_retry(folio, vmf);
}

/*
 * This is exported only for folio_wait_locked/folio_wait_writeback, etc.,
 * and should not be used directly.
 */
/*
 * Both take a folio and a bit number; only the killable variant returns a
 * value (an int).
 */
void folio_wait_bit(struct folio *folio, int bit_nr);
int folio_wait_bit_killable(struct folio *folio, int bit_nr);

/* 
 * Wait for a folio to be unlocked.
 *
 * This must be called with the caller "holding" the folio,
 * ie with increased folio reference count so that the folio won't
 * go away during the wait.
 */
static inline void folio_wait_locked(struct folio *folio)
{
        /* Nothing to wait for if the folio is not locked. */
        if (!folio_test_locked(folio))
                return;

        folio_wait_bit(folio, PG_locked);
}

/*
 * Killable counterpart of folio_wait_locked().  Returns 0 immediately if the
 * folio is not locked, otherwise the result of folio_wait_bit_killable().
 */
static inline int folio_wait_locked_killable(struct folio *folio)
{
        if (folio_test_locked(folio))
                return folio_wait_bit_killable(folio, PG_locked);

        return 0;
}

/*
 * Declarations only.  __folio_cancel_dirty() is the out-of-line part of the
 * inline folio_cancel_dirty() that follows.
 */
void folio_end_read(struct folio *folio, bool success);
void wait_on_page_writeback(struct page *page);
void folio_wait_writeback(struct folio *folio);
int folio_wait_writeback_killable(struct folio *folio);
void end_page_writeback(struct page *page);
void folio_end_writeback(struct folio *folio);
void folio_wait_stable(struct folio *folio);
void __folio_mark_dirty(struct folio *folio, struct address_space *, int warn);
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb);
void __folio_cancel_dirty(struct folio *folio);
/* Call __folio_cancel_dirty() only if the folio currently tests as dirty. */
static inline void folio_cancel_dirty(struct folio *folio)
{
        /* Avoid atomic ops, locking, etc. when not actually needed. */
        if (!folio_test_dirty(folio))
                return;

        __folio_cancel_dirty(folio);
}
/*
 * Declarations only.  clear_page_dirty_for_io() is the page-based
 * counterpart of folio_clear_dirty_for_io(); both return a bool.
 */
bool folio_clear_dirty_for_io(struct folio *folio);
bool clear_page_dirty_for_io(struct page *page);
void folio_invalidate(struct folio *folio, size_t offset, size_t length);
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio);

/*
 * Without CONFIG_MIGRATION the name expands to NULL, so it can still be
 * used where a pointer is expected.
 * NOTE(review): presumably assigned to a migration callback slot - confirm
 * against the users.
 */
#ifdef CONFIG_MIGRATION
int filemap_migrate_folio(struct address_space *mapping, struct folio *dst,
                struct folio *src, enum migrate_mode mode);
#else
#define filemap_migrate_folio NULL
#endif
/* Declarations only; of the three, only the killable wait returns a value (an int). */
void folio_end_private_2(struct folio *folio);
void folio_wait_private_2(struct folio *folio);
int folio_wait_private_2_killable(struct folio *folio);

/*
 * Fault in userspace address range.
 *
 * Each function takes a user-space address and a size in bytes and returns
 * a size_t.
 * NOTE(review): the return value is presumably the number of bytes that
 * could not be faulted in - confirm against the definitions.
 */
size_t fault_in_writeable(char __user *uaddr, size_t size);
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size);
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
size_t fault_in_readable(const char __user *uaddr, size_t size);

/*
 * Declarations only: functions for adding folios/pages to, and removing or
 * replacing them in, a mapping, plus mapping_seek_hole_data().
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                pgoff_t index, gfp_t gfp);
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp);
void filemap_remove_folio(struct folio *folio);
void __filemap_remove_folio(struct folio *folio, void *shadow);
void replace_page_cache_folio(struct folio *old, struct folio *new);
void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch);
bool filemap_release_folio(struct folio *folio, gfp_t gfp);
loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end,
                int whence);

/* Must be non-static for BPF error injection */
int __filemap_add_folio(struct address_space *mapping, struct folio *folio,
                pgoff_t index, gfp_t gfp, void **shadowp);

/* Called by filemap_range_needs_writeback() once its cheap checks pass. */
bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte);

/**
 * filemap_range_needs_writeback - check if range potentially needs writeback
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback. Used by O_DIRECT
 * read/write with IOCB_NOWAIT, to see if the caller needs to do
 * filemap_write_and_wait_range() before proceeding.
 *
 * Return: %true if the caller should do filemap_write_and_wait_range() before
 * doing O_DIRECT to a page in this range, %false otherwise.
 */
static inline bool filemap_range_needs_writeback(struct address_space *mapping,
                                                 loff_t start_byte,
                                                 loff_t end_byte)
{
        /* An empty mapping has nothing to write back. */
        if (mapping->nrpages == 0)
                return false;

        /* Only scan the range if something is tagged dirty or under writeback. */
        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
            mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                return filemap_range_has_writeback(mapping, start_byte,
                                                   end_byte);

        return false;
}

/**
 * struct readahead_control - Describes a readahead request.
 *
 * A readahead request is for consecutive pages.  Filesystems which
 * implement the ->readahead method should call readahead_page() or
 * readahead_page_batch() in a loop and attempt to start I/O against
 * each page in the request.
 *
 * Most of the fields in this struct are private and should be accessed
 * by the functions below.
 *
 * @file: The file, used primarily by network filesystems for authentication.
 *          May be NULL if invoked internally by the filesystem.
 * @mapping: Readahead this filesystem object.
 * @ra: File readahead state.  May be NULL.
 */
struct readahead_control {
        struct file *file;
        struct address_space *mapping;
        struct file_ra_state *ra;
/* private: use the readahead_* accessors instead */
        /* Initialised from the index argument of DEFINE_READAHEAD(). */
        pgoff_t _index;
        unsigned int _nr_pages;
        unsigned int _batch_count;
        /* NOTE(review): unlike its neighbours, this field has no leading underscore. */
        bool dropbehind;
        bool _workingset;
        unsigned long _pflags;
};

/*
 * Declare a struct readahead_control named @ractl.  Note the argument
 * order: @f is the file, @r the file_ra_state, @m the mapping and @i the
 * starting index.  Members not listed in the initializer are zeroed.
 */
#define DEFINE_READAHEAD(ractl, f, r, m, i)                                \
        struct readahead_control ractl = {                                \
                .file = f,                                                \
                .mapping = m,                                                \
                .ra = r,                                                \
                ._index = i,                                                \
        }

/* SZ_128K expressed as a number of PAGE_SIZE pages. */
#define VM_READAHEAD_PAGES        (SZ_128K / PAGE_SIZE)

/*
 * Declarations only.  page_cache_sync_ra() and page_cache_async_ra() are
 * called by the inline page_cache_sync_readahead() and
 * page_cache_async_readahead() wrappers in this header.
 */
void page_cache_ra_unbounded(struct readahead_control *,
                unsigned long nr_to_read, unsigned long lookahead_count);
void page_cache_sync_ra(struct readahead_control *, unsigned long req_count);
void page_cache_async_ra(struct readahead_control *, struct folio *,
                unsigned long req_count);
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len);

/**
 * page_cache_sync_readahead - generic file readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @index: Index of first page to be read.
 * @req_count: Total number of pages being read by the caller.
 *
 * page_cache_sync_readahead() should be called when a cache miss happened:
 * it will submit the read.  The readahead logic may decide to piggyback more
 * pages onto the read request if access patterns suggest it will improve
 * performance.
 */
static inline
void page_cache_sync_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file, pgoff_t index,
                unsigned long req_count)
{
        /* Build an on-stack request that starts at @index. */
        DEFINE_READAHEAD(rac, file, ra, mapping, index);

        page_cache_sync_ra(&rac, req_count);
}

/**
 * page_cache_async_readahead - file readahead for marked pages
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @file: Used by the filesystem for authentication.
 * @folio: The folio which triggered the readahead call.
 * @req_count: Total number of pages being read by the caller.
 *
 * page_cache_async_readahead() should be called when a page is used which
 * is marked as PageReadahead; this is a marker to suggest that the application
 * has used up enough of the readahead window that we should start pulling in
 * more pages.
 */
static inline
void page_cache_async_readahead(struct address_space *mapping,
                struct file_ra_state *ra, struct file *file,
                struct folio *folio, unsigned long req_count)
{
        DEFINE_READAHEAD(ractl, file, ra, mapping, folio->index);
        page_cache_async_ra(&ractl, folio, req_count);
}

/*
 * Advance the request past the batch handed out by the previous call and
 * look up the folio at the new start index.  Returns NULL once no pages
 * remain in the request.
 */
static inline struct folio *__readahead_folio(struct readahead_control *ractl)
{
        struct folio *next;

        /* Retire the previous batch; it can never exceed what is left. */
        BUG_ON(ractl->_batch_count > ractl->_nr_pages);
        ractl->_index += ractl->_batch_count;
        ractl->_nr_pages -= ractl->_batch_count;

        if (ractl->_nr_pages == 0) {
                ractl->_batch_count = 0;
                return NULL;
        }

        next = xa_load(&ractl->mapping->i_pages, ractl->_index);
        VM_BUG_ON_FOLIO(!folio_test_locked(next), next);

        /* Remember how many pages this folio covers for the next call. */
        ractl->_batch_count = folio_nr_pages(next);

        return next;
}

/**
 * readahead_page - Get the next page to read.
 * @ractl: The current readahead request.
 *
 * Context: The page is locked and has an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: A pointer to the next page, or %NULL if we are done.
 */
static inline struct page *readahead_page(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        return &folio->page;
}

/**
 * readahead_folio - Get the next folio to read.
 * @ractl: The current readahead request.
 *
 * Context: The folio is locked.  The caller should unlock the folio once
 * all I/O to that folio has completed.
 * Return: A pointer to the next folio, or %NULL if we are done.
 */
static inline struct folio *readahead_folio(struct readahead_control *ractl)
{
        struct folio *folio = __readahead_folio(ractl);

        if (folio)
                folio_put(folio);
        return folio;
}

static inline unsigned int __readahead_batch(struct readahead_control *rac,
                struct page **array, unsigned int array_sz)
{
        unsigned int i = 0;
        XA_STATE(xas, &rac->mapping->i_pages, 0);
        struct page *page;

        BUG_ON(rac->_batch_count > rac->_nr_pages);
        rac->_nr_pages -= rac->_batch_count;
        rac->_index += rac->_batch_count;
        rac->_batch_count = 0;

        xas_set(&xas, rac->_index);
        rcu_read_lock();
        xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) {
                if (xas_retry(&xas, page))
                        continue;
                VM_BUG_ON_PAGE(!PageLocked(page), page);
                VM_BUG_ON_PAGE(PageTail(page), page);
                array[i++] = page;
                rac->_batch_count += thp_nr_pages(page);
                if (i == array_sz)
                        break;
        }
        rcu_read_unlock();

        return i;
}

/**
 * readahead_page_batch - Get a batch of pages to read.
 * @rac: The current readahead request.
 * @array: An array of pointers to struct page.  This must be a real array
 *         (not a pointer), because its capacity is taken with ARRAY_SIZE().
 *
 * Context: The pages are locked and have an elevated refcount.  The caller
 * should decrease the refcount once the page has been submitted for I/O
 * and unlock the page once all I/O to that page has completed.
 * Return: The number of pages placed in the array.  0 indicates the request
 * is complete.
 */
#define readahead_page_batch(rac, array)                                \
        __readahead_batch(rac, array, ARRAY_SIZE(array))

/**
 * readahead_pos - The byte offset into the file of this readahead request.
 * @rac: The readahead request.
 */
static inline loff_t readahead_pos(struct readahead_control *rac)
{
        return (loff_t)rac->_index * PAGE_SIZE;
}

/**
 * readahead_length - The number of bytes in this readahead request.
 * @rac: The readahead request.
 */
static inline size_t readahead_length(struct readahead_control *rac)
{
        unsigned int remaining = rac->_nr_pages;

        /* Remaining page count scaled to bytes. */
        return remaining * PAGE_SIZE;
}

/**
 * readahead_index - The index of the first page in this readahead request.
 * @rac: The readahead request.
 */
static inline pgoff_t readahead_index(struct readahead_control *rac)
{
        pgoff_t first = rac->_index;

        return first;
}

/**
 * readahead_count - The number of pages in this readahead request.
 * @rac: The readahead request.
 */
static inline unsigned int readahead_count(struct readahead_control *rac)
{
        unsigned int remaining = rac->_nr_pages;

        return remaining;
}

/**
 * readahead_batch_length - The number of bytes in the current batch.
 * @rac: The readahead request.
 */
static inline size_t readahead_batch_length(struct readahead_control *rac)
{
        unsigned int batch = rac->_batch_count;

        /* Pages in the current batch scaled to bytes. */
        return batch * PAGE_SIZE;
}

/* Number of pages needed to hold inode->i_size bytes, rounding up. */
static inline unsigned long dir_pages(struct inode *inode)
{
        unsigned long rounded = (unsigned long)(inode->i_size + PAGE_SIZE - 1);

        return rounded >> PAGE_SHIFT;
}

/**
 * folio_mkwrite_check_truncate - check if folio was truncated
 * @folio: the folio to check
 * @inode: the inode to check the folio against
 *
 * Return: the number of bytes in the folio up to EOF,
 * or -EFAULT if the folio was truncated.
 */
static inline ssize_t folio_mkwrite_check_truncate(struct folio *folio,
                                              struct inode *inode)
{
        loff_t isize = i_size_read(inode);
        pgoff_t eof_index = isize >> PAGE_SHIFT;
        size_t eof_offset = offset_in_folio(folio, isize);

        /* A folio with no mapping is reported as truncated. */
        if (!folio->mapping)
                return -EFAULT;

        /* Last page of the folio lies before the EOF page: all of it is valid. */
        if (folio_next_index(folio) - 1 < eof_index)
                return folio_size(folio);

        /* Folio starts after the EOF page, or EOF falls exactly on its start. */
        if (folio->index > eof_index || !eof_offset)
                return -EFAULT;

        /* EOF is inside the folio: only the bytes up to it are valid. */
        return eof_offset;
}

/**
 * i_blocks_per_folio - How many blocks fit in this folio.
 * @inode: The inode which contains the blocks.
 * @folio: The folio.
 *
 * If the block size is larger than the size of this folio, return zero.
 *
 * Context: The caller should hold a refcount on the folio to prevent it
 * from being split.
 * Return: The number of filesystem blocks covered by this folio.
 */
static inline
unsigned int i_blocks_per_folio(struct inode *inode, struct folio *folio)
{
        return folio_size(folio) >> inode->i_blkbits;
}
#endif /* _LINUX_PAGEMAP_H */


























































































































































































































































































































































































    3 





















































































































  441 
  441 









  443 





































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  This file contains the interface functions for the various time related
 *  system calls: time, stime, gettimeofday, settimeofday, adjtime
 *
 * Modification history:
 *
 * 1993-09-02    Philip Gladstone
 *      Created file with time related functions from sched/core.c and adjtimex()
 * 1993-10-08    Torsten Duwe
 *      adjtime interface update and CMOS clock write code
 * 1995-08-13    Torsten Duwe
 *      kernel PLL updated to 1994-12-13 specs (rfc-1589)
 * 1999-01-16    Ulrich Windl
 *        Introduced error checking for many cases in adjtimex().
 *        Updated NTP code according to technical memorandum Jan '96
 *        "A Kernel Model for Precision Timekeeping" by Dave Mills
 *        Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10)
 *        (Even though the technical memorandum forbids it)
 * 2004-07-14         Christoph Lameter
 *        Added getnstimeofday to allow the posix timer functions to return
 *        with nanosecond accuracy
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/timekeeper_internal.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
#include <linux/math64.h>
#include <linux/ptrace.h>

#include <linux/uaccess.h>
#include <linux/compat.h>
#include <asm/unistd.h>

#include <generated/timeconst.h>
#include "timekeeping.h"

/*
 * The timezone where the local system is located.  Used as a default by some
 * programs that obtain this value by using gettimeofday.
 */
struct timezone sys_tz;

EXPORT_SYMBOL(sys_tz);

#ifdef __ARCH_WANT_SYS_TIME

/*
 * sys_time() can be implemented in user-level using
 * sys_gettimeofday().  Is this for backwards compatibility?  If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 */
SYSCALL_DEFINE1(time, __kernel_old_time_t __user *, tloc)
{
        __kernel_old_time_t now = (__kernel_old_time_t)ktime_get_real_seconds();

        /* The user pointer is optional; only store through it when given. */
        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

/*
 * sys_stime() can be implemented in user-level using
 * sys_settimeofday().  Is this for backwards compatibility?  If so,
 * why not move it into the appropriate arch directory (for those
 * architectures that need it).
 */

SYSCALL_DEFINE1(stime, __kernel_old_time_t __user *, tptr)
{
        struct timespec64 new_time;
        int ret;

        /* Only whole seconds come from user space. */
        if (get_user(new_time.tv_sec, tptr))
                return -EFAULT;
        new_time.tv_nsec = 0;

        ret = security_settime64(&new_time, NULL);
        if (ret)
                return ret;

        /*
         * NOTE(review): the result of do_settimeofday64() is discarded
         * here, so this call reports success regardless of it.
         */
        do_settimeofday64(&new_time);
        return 0;
}

#endif /* __ARCH_WANT_SYS_TIME */

#ifdef CONFIG_COMPAT_32BIT_TIME
#ifdef __ARCH_WANT_SYS_TIME32

/* old_time32_t is a 32 bit "long" and needs to get converted. */
SYSCALL_DEFINE1(time32, old_time32_t __user *, tloc)
{
        /* The 64-bit seconds value is narrowed to the 32-bit ABI type. */
        old_time32_t now = (old_time32_t)ktime_get_real_seconds();

        if (tloc && put_user(now, tloc))
                return -EFAULT;

        force_successful_syscall_return();
        return now;
}

SYSCALL_DEFINE1(stime32, old_time32_t __user *, tptr)
{
        struct timespec64 new_time;
        int ret;

        /* Only whole seconds come from user space. */
        if (get_user(new_time.tv_sec, tptr))
                return -EFAULT;
        new_time.tv_nsec = 0;

        ret = security_settime64(&new_time, NULL);
        if (ret)
                return ret;

        /*
         * NOTE(review): the result of do_settimeofday64() is discarded
         * here, so this call reports success regardless of it.
         */
        do_settimeofday64(&new_time);
        return 0;
}

#endif /* __ARCH_WANT_SYS_TIME32 */
#endif

SYSCALL_DEFINE2(gettimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        /* Both output pointers are optional and handled independently. */
        if (likely(tv != NULL)) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);

                /* Store field by field, converting nanoseconds to microseconds. */
                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (unlikely(tz != NULL) &&
            copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

/*
 * In case for some reason the CMOS clock has not already been running
 * in UTC, but in some local time: The first time we set the timezone,
 * we will warp the clock so that it is ticking UTC time instead of
 * local time. Presumably, if someone is setting the timezone then we
 * are running in an environment where the programs understand about
 * timezones. This should be done at boot time in the /etc/rc script,
 * as soon as possible, so that the clock can be set right. Otherwise,
 * various programs will get confused when the clock gets warped.
 */

int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz)
{
        /* Set until the first successful timezone update has been seen. */
        static int firsttime = 1;
        int ret;

        /* Validate the new time before anything is modified. */
        if (tv && !timespec64_valid_settod(tv))
                return -EINVAL;

        ret = security_settime64(tv, tz);
        if (ret)
                return ret;

        if (tz) {
                /* Reject offsets outside +-15 hours. */
                if (tz->tz_minuteswest < -15*60 || tz->tz_minuteswest > 15*60)
                        return -EINVAL;

                sys_tz = *tz;
                update_vsyscall_tz();

                /* Warp the clock only once, and only if no time was supplied. */
                if (firsttime) {
                        firsttime = 0;
                        if (!tv)
                                timekeeping_warp_clock();
                }
        }

        return tv ? do_settimeofday64(tv) : 0;
}

/*
 * settimeofday - set the system time and/or timezone from user space.
 * @tv: optional user pointer to the new time (seconds + microseconds)
 * @tz: optional user pointer to the new timezone
 *
 * Copies the arguments in, converts the microsecond field to nanoseconds
 * and hands the result to do_sys_settimeofday64().
 *
 * Return: 0 on success, -EFAULT if a user copy fails, -EINVAL if tv_usec is
 * outside [0, USEC_PER_SEC), otherwise whatever do_sys_settimeofday64()
 * returns.
 */
SYSCALL_DEFINE2(settimeofday, struct __kernel_old_timeval __user *, tv,
                struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        if (tv) {
                /* tv_nsec temporarily holds the microsecond value. */
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * A valid microsecond count is strictly below USEC_PER_SEC.
                 * Exactly USEC_PER_SEC used to pass this check and was only
                 * rejected later by timespec64_valid_settod().
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE2(gettimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        /* Both output pointers are optional and handled independently. */
        if (tv) {
                struct timespec64 now;

                ktime_get_real_ts64(&now);

                /* Store field by field, converting nanoseconds to microseconds. */
                if (put_user(now.tv_sec, &tv->tv_sec))
                        return -EFAULT;
                if (put_user(now.tv_nsec / 1000, &tv->tv_usec))
                        return -EFAULT;
        }

        if (tz && copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
                return -EFAULT;

        return 0;
}

/*
 * Compat settimeofday - set the system time and/or timezone from a 32-bit
 * user-space timeval.
 * @tv: optional user pointer to the new time (seconds + microseconds)
 * @tz: optional user pointer to the new timezone
 *
 * Return: 0 on success, -EFAULT if a user copy fails, -EINVAL if tv_usec is
 * outside [0, USEC_PER_SEC), otherwise whatever do_sys_settimeofday64()
 * returns.
 */
COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv,
                       struct timezone __user *, tz)
{
        struct timespec64 new_ts;
        struct timezone new_tz;

        if (tv) {
                /* tv_nsec temporarily holds the microsecond value. */
                if (get_user(new_ts.tv_sec, &tv->tv_sec) ||
                    get_user(new_ts.tv_nsec, &tv->tv_usec))
                        return -EFAULT;

                /*
                 * A valid microsecond count is strictly below USEC_PER_SEC.
                 * Exactly USEC_PER_SEC used to pass this check and was only
                 * rejected later by timespec64_valid_settod().
                 */
                if (new_ts.tv_nsec >= USEC_PER_SEC || new_ts.tv_nsec < 0)
                        return -EINVAL;

                new_ts.tv_nsec *= NSEC_PER_USEC;
        }
        if (tz) {
                if (copy_from_user(&new_tz, tz, sizeof(*tz)))
                        return -EFAULT;
        }

        return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL);
}
#endif

#ifdef CONFIG_64BIT
SYSCALL_DEFINE1(adjtimex, struct __kernel_timex __user *, txc_p)
{
        struct __kernel_timex txc;        /* kernel-side copy of the argument */
        int ret;

        /* Work on a private copy rather than on user memory. */
        if (copy_from_user(&txc, txc_p, sizeof(txc)))
                return -EFAULT;

        ret = do_adjtimex(&txc);

        /* The structure is always written back, whatever do_adjtimex() returned. */
        if (copy_to_user(txc_p, &txc, sizeof(txc)))
                return -EFAULT;

        return ret;
}
#endif

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * Read a 32-bit timex from user space at @utp and widen it into @txc.
 * @txc is zeroed first, so fields not copied below stay 0.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int get_old_timex32(struct __kernel_timex *txc, const struct old_timex32 __user *utp)
{
        struct old_timex32 src;

        memset(txc, 0, sizeof(*txc));
        if (copy_from_user(&src, utp, sizeof(src)))
                return -EFAULT;

        /* Plain field-by-field widening copy. */
        txc->modes = src.modes;
        txc->offset = src.offset;
        txc->freq = src.freq;
        txc->maxerror = src.maxerror;
        txc->esterror = src.esterror;
        txc->status = src.status;
        txc->constant = src.constant;
        txc->precision = src.precision;
        txc->tolerance = src.tolerance;
        txc->time.tv_sec = src.time.tv_sec;
        txc->time.tv_usec = src.time.tv_usec;
        txc->tick = src.tick;
        txc->ppsfreq = src.ppsfreq;
        txc->jitter = src.jitter;
        txc->shift = src.shift;
        txc->stabil = src.stabil;
        txc->jitcnt = src.jitcnt;
        txc->calcnt = src.calcnt;
        txc->errcnt = src.errcnt;
        txc->stbcnt = src.stbcnt;

        return 0;
}

/*
 * Narrow @txc into a 32-bit timex and write it to user space at @utp.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
int put_old_timex32(struct old_timex32 __user *utp, const struct __kernel_timex *txc)
{
        struct old_timex32 dst;

        /* Zero the whole object first so no stack bytes reach user space. */
        memset(&dst, 0, sizeof(dst));

        dst.modes = txc->modes;
        dst.offset = txc->offset;
        dst.freq = txc->freq;
        dst.maxerror = txc->maxerror;
        dst.esterror = txc->esterror;
        dst.status = txc->status;
        dst.constant = txc->constant;
        dst.precision = txc->precision;
        dst.tolerance = txc->tolerance;
        dst.time.tv_sec = txc->time.tv_sec;
        dst.time.tv_usec = txc->time.tv_usec;
        dst.tick = txc->tick;
        dst.ppsfreq = txc->ppsfreq;
        dst.jitter = txc->jitter;
        dst.shift = txc->shift;
        dst.stabil = txc->stabil;
        dst.jitcnt = txc->jitcnt;
        dst.calcnt = txc->calcnt;
        dst.errcnt = txc->errcnt;
        dst.stbcnt = txc->stbcnt;
        dst.tai = txc->tai;

        return copy_to_user(utp, &dst, sizeof(dst)) ? -EFAULT : 0;
}

SYSCALL_DEFINE1(adjtimex_time32, struct old_timex32 __user *, utp)
{
        struct __kernel_timex txc;
        int status, ret;

        /* Widen the 32-bit user structure into the native layout. */
        status = get_old_timex32(&txc, utp);
        if (status)
                return status;

        ret = do_adjtimex(&txc);

        /* The structure is written back whatever do_adjtimex() returned. */
        status = put_old_timex32(utp, &txc);

        return status ? status : ret;
}
#endif

/**
 * jiffies_to_msecs - Convert jiffies to milliseconds
 * @j: jiffies value
 *
 * Avoid unnecessary multiplications/divisions in the
 * two most common HZ cases.
 *
 * Return: milliseconds value
 */
unsigned int jiffies_to_msecs(const unsigned long j)
{
        /* Exactly one of the branches below is compiled in, selected by HZ. */
#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
        /* HZ divides MSEC_PER_SEC: a jiffy is a whole number of ms. */
        return (MSEC_PER_SEC / HZ) * j;
#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
        /* HZ is a whole multiple of MSEC_PER_SEC: divide, rounding up. */
        return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
#else
# if BITS_PER_LONG == 32
        /*
         * Scaled multiply followed by a right shift; adding
         * (1 << shift) - 1 first makes the shift round up.
         * NOTE(review): HZ_TO_MSEC_* presumably come from
         * <generated/timeconst.h> - confirm.
         */
        return (HZ_TO_MSEC_MUL32 * j + (1ULL << HZ_TO_MSEC_SHR32) - 1) >>
               HZ_TO_MSEC_SHR32;
# else
        /* Multiply by the NUM/DEN ratio, rounding the division up. */
        return DIV_ROUND_UP(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_msecs);

/**
 * jiffies_to_usecs - Convert jiffies to microseconds
 * @j: jiffies value
 *
 * Return: microseconds value
 */
unsigned int jiffies_to_usecs(const unsigned long j)
{
        /*
         * Hz usually doesn't go much further than MSEC_PER_SEC.
         * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
         */
        BUILD_BUG_ON(HZ > USEC_PER_SEC);

#if !(USEC_PER_SEC % HZ)
        /* HZ divides USEC_PER_SEC: a jiffy is a whole number of us. */
        return (USEC_PER_SEC / HZ) * j;
#else
# if BITS_PER_LONG == 32
        /* Scaled multiply and right shift; no rounding term, so it truncates. */
        return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
# else
        /* Multiply by the NUM/DEN ratio; the integer division truncates. */
        return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN;
# endif
#endif
}
EXPORT_SYMBOL(jiffies_to_usecs);

/**
 * mktime64 - Converts date to seconds.
 * @year0: year to convert
 * @mon0: month to convert
 * @day: day to convert
 * @hour: hour to convert
 * @min: minute to convert
 * @sec: second to convert
 *
 * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
 *
 * [For the Julian calendar (which was used in Russia before 1917,
 * Britain & colonies before 1752, anywhere else before 1582,
 * and is still in use by some communities) leave out the
 * -year/100+year/400 terms, and add 10.]
 *
 * This algorithm was first published by Gauss (I think).
 *
 * A leap second can be indicated by calling this function with sec as
 * 60 (allowable under ISO 8601).  The leap second is treated the same
 * as the following second since they don't exist in UNIX time.
 *
 * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight
 * tomorrow - (allowable under ISO 8601) is supported.
 *
 * Return: seconds since the epoch time for the given input date
 */
time64_t mktime64(const unsigned int year0, const unsigned int mon0,
                const unsigned int day, const unsigned int hour,
                const unsigned int min, const unsigned int sec)
{
        unsigned int mon = mon0, year = year0;
        time64_t days, hours, minutes;

        /*
         * Renumber months 1..12 as 11,12,1..10 so that February comes
         * last; January and February then count towards the previous year.
         */
        mon -= 2;
        if ((int)mon <= 0) {
                mon += 12;
                year -= 1;
        }

        /* Leap-day corrections plus month/day offset, then whole years. */
        days = (time64_t)(year/4 - year/100 + year/400 + 367*mon/12 + day) +
               year*365 - 719499;

        /* hour == 24 (midnight tomorrow) simply carries into the day count. */
        hours = days*24 + hour;
        minutes = hours*60 + min;

        return minutes*60 + sec;
}
EXPORT_SYMBOL(mktime64);

/* Convert a nanosecond count to a seconds/microseconds timeval. */
struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec)
{
        struct timespec64 split = ns_to_timespec64(nsec);
        struct __kernel_old_timeval result;

        /* The sub-second part is truncated from nanoseconds to microseconds. */
        result.tv_usec = (suseconds_t)split.tv_nsec / 1000;
        result.tv_sec = split.tv_sec;

        return result;
}
EXPORT_SYMBOL(ns_to_kernel_old_timeval);

/**
 * set_normalized_timespec64 - set timespec sec and nsec parts and normalize
 *
 * @ts:                pointer to timespec variable to be set
 * @sec:        seconds to set
 * @nsec:        nanoseconds to set
 *
 * Set seconds and nanoseconds field of a timespec variable and
 * normalize to the timespec storage format
 *
 * Note: The tv_nsec part is always in the range of 0 <= tv_nsec < NSEC_PER_SEC.
 * For negative values only the tv_sec field is negative !
 */
void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec)
{
        /* Carry whole seconds out of an over-large nanosecond value. */
        while (nsec >= NSEC_PER_SEC) {
                /*
                 * The following asm() prevents the compiler from
                 * optimising this loop into a modulo operation. See
                 * also __iter_div_u64_rem() in include/linux/time.h
                 */
                asm("" : "+rm"(nsec));
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        /* Borrow whole seconds until the nanosecond value is non-negative. */
        while (nsec < 0) {
                /* Same empty asm() on nsec as in the loop above. */
                asm("" : "+rm"(nsec));
                nsec += NSEC_PER_SEC;
                --sec;
        }
        /* Here 0 <= nsec < NSEC_PER_SEC; only sec may be negative. */
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}
EXPORT_SYMBOL(set_normalized_timespec64);

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:       the nanoseconds value to be converted
 *
 * Return: the timespec64 representation of the nsec parameter.
 */
struct timespec64 ns_to_timespec64(s64 nsec)
{
        struct timespec64 result = { 0, 0 };
        s32 rem;

        /* Common case: a positive value splits with one division. */
        if (likely(nsec > 0)) {
                result.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
                result.tv_nsec = rem;
                return result;
        }

        /* Zero maps to the zero-initialised result. */
        if (nsec == 0)
                return result;

        /*
         * Negative input: tv_sec names the earlier second and tv_nsec
         * counts forward from it, so tv_nsec is never negative.
         */
        result.tv_sec = -div_u64_rem(-nsec - 1, NSEC_PER_SEC, &rem) - 1;
        result.tv_nsec = NSEC_PER_SEC - rem - 1;

        return result;
}
EXPORT_SYMBOL(ns_to_timespec64);

/**
 * __msecs_to_jiffies: - convert milliseconds to jiffies
 * @m:        time in milliseconds
 *
 * conversion is done as follows:
 *
 * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
 *
 * - 'too large' values [that would result in larger than
 *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
 *
 * - all other values are converted to jiffies by either multiplying
 *   the input value by a factor or dividing it with a factor and
 *   handling any 32-bit overflows.
 *   for the details see _msecs_to_jiffies()
 *
 * msecs_to_jiffies() checks for the passed in value being a constant
 * via __builtin_constant_p() allowing gcc to eliminate most of the
 * code, __msecs_to_jiffies() is called if the value passed does not
 * allow constant folding and the actual conversion must be done at
 * runtime.
 * The _msecs_to_jiffies helpers are the HZ dependent conversion
 * routines found in include/linux/jiffies.h
 *
 * Return: jiffies value
 */
unsigned long __msecs_to_jiffies(const unsigned int m)
{
        /* A value that is negative when read as int means "infinite timeout". */
        const int infinite = (int)m < 0;

        if (!infinite)
                return _msecs_to_jiffies(m);

        return MAX_JIFFY_OFFSET;
}
EXPORT_SYMBOL(__msecs_to_jiffies);

/**
 * __usecs_to_jiffies: - convert microseconds to jiffies
 * @u:        time in microseconds
 *
 * Return: jiffies value
 */
unsigned long __usecs_to_jiffies(const unsigned int u)
{
        /* Largest microsecond value that still maps below MAX_JIFFY_OFFSET. */
        const unsigned int limit = jiffies_to_usecs(MAX_JIFFY_OFFSET);

        if (u <= limit)
                return _usecs_to_jiffies(u);

        /* Anything larger is clamped. */
        return MAX_JIFFY_OFFSET;
}
EXPORT_SYMBOL(__usecs_to_jiffies);

/**
 * timespec64_to_jiffies - convert a timespec64 value to jiffies
 * @value: pointer to &struct timespec64
 *
 * The nanosecond part is rounded up to the next tick resolution by
 * adding TICK_NSEC - 1 before it is scaled.  A remainder subtract
 * (nsec -= nsec % TICK_NSEC) would NOT be a correct resolution rounding,
 * because the resolution values don't fall on second boundaries; the
 * excess bits are simply shifted off the right instead.
 *
 * Due to the small error in the multiplier, this rounding is incorrect
 * for sufficiently large values of tv_nsec, but well formed timespecs
 * should have tv_nsec < NSEC_PER_SEC, so that is not a concern.
 *
 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
 * value to a scaled second value, so that both parts can be summed
 * before the final >> SEC_JIFFIE_SC.
 *
 * A seconds value of MAX_SEC_IN_JIFFIES or more is clamped to
 * MAX_SEC_IN_JIFFIES and the nanosecond part is then ignored.
 *
 * Return: jiffies value
 */
unsigned long
timespec64_to_jiffies(const struct timespec64 *value)
{
        u64 sec = value->tv_sec;
        long nsec = value->tv_nsec + TICK_NSEC - 1;
        u64 scaled_sec;
        u64 scaled_nsec;

        /* Saturate instead of overflowing the jiffies range. */
        if (sec >= MAX_SEC_IN_JIFFIES) {
                sec = MAX_SEC_IN_JIFFIES;
                nsec = 0;
        }

        scaled_sec = sec * SEC_CONVERSION;
        /* Bring the scaled nanoseconds down to the seconds scale. */
        scaled_nsec = ((u64)nsec * NSEC_CONVERSION) >>
                      (NSEC_JIFFIE_SC - SEC_JIFFIE_SC);

        return (scaled_sec + scaled_nsec) >> SEC_JIFFIE_SC;
}
EXPORT_SYMBOL(timespec64_to_jiffies);

/**
 * jiffies_to_timespec64 - convert jiffies value to &struct timespec64
 * @jiffies: jiffies value
 * @value: pointer to &struct timespec64
 */
void
jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
{
        /*
         * Convert jiffies to nanoseconds and separate with
         * one divide.
         */
        u32 rem;
        value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC,
                                    NSEC_PER_SEC, &rem);
        value->tv_nsec = rem;
}
EXPORT_SYMBOL(jiffies_to_timespec64);

/*
 * Convert jiffies/jiffies_64 to clock_t and back.
 */

/**
 * jiffies_to_clock_t - Convert jiffies to clock_t
 * @x: jiffies value
 *
 * The conversion is selected at compile time from HZ, USER_HZ and
 * TICK_NSEC.
 *
 * Return: jiffies converted to clock_t (CLOCKS_PER_SEC)
 */
clock_t jiffies_to_clock_t(unsigned long x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) != 0
        /* A tick is not a whole number of clock_t units: go via nanoseconds. */
        return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ);
#elif HZ < USER_HZ
        return x * (USER_HZ / HZ);
#else
        return x / (HZ / USER_HZ);
#endif
}
EXPORT_SYMBOL(jiffies_to_clock_t);

/**
 * clock_t_to_jiffies - Convert clock_t to jiffies
 * @x: clock_t value
 *
 * Inputs at or above the overflow threshold are reported as ~0UL
 * instead of being converted.
 *
 * Return: clock_t value converted to jiffies
 */
unsigned long clock_t_to_jiffies(unsigned long x)
{
#if (HZ % USER_HZ) != 0
        /* Loss of precision in the threshold itself is not a concern .. */
        if (x < ~0UL / HZ * USER_HZ) {
                /* .. but the conversion does try to contain it */
                return div_u64((u64)x * HZ, USER_HZ);
        }

        return ~0UL;
#else
        /* HZ is a whole multiple of USER_HZ: a plain multiply suffices. */
        if (x < ~0UL / (HZ / USER_HZ))
                return x * (HZ / USER_HZ);

        return ~0UL;
#endif
}
EXPORT_SYMBOL(clock_t_to_jiffies);

/**
 * jiffies_64_to_clock_t - Convert jiffies_64 to clock_t
 * @x: jiffies_64 value
 *
 * The conversion is selected at compile time from HZ, USER_HZ and
 * TICK_NSEC; when HZ equals USER_HZ the value is returned unchanged.
 *
 * Return: jiffies_64 value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 jiffies_64_to_clock_t(u64 x)
{
#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) != 0
        /*
         * There are better ways that don't overflow early,
         * but even this doesn't overflow in hundreds of years
         * in 64 bits, so..
         */
        return div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
#elif HZ < USER_HZ
        return div_u64(x * USER_HZ, HZ);
#elif HZ > USER_HZ
        return div_u64(x, HZ / USER_HZ);
#else
        /* HZ == USER_HZ: nothing to do */
        return x;
#endif
}
EXPORT_SYMBOL(jiffies_64_to_clock_t);

/**
 * nsec_to_clock_t - Convert nsec value to clock_t
 * @x: nsec value
 *
 * The divisor is chosen at compile time depending on how USER_HZ
 * divides NSEC_PER_SEC.
 *
 * Return: nsec value converted to 64-bit "clock_t" (CLOCKS_PER_SEC)
 */
u64 nsec_to_clock_t(u64 x)
{
        u64 ticks;

#if (NSEC_PER_SEC % USER_HZ) == 0
        /* USER_HZ divides NSEC_PER_SEC evenly: one exact division. */
        ticks = div_u64(x, NSEC_PER_SEC / USER_HZ);
#elif (USER_HZ % 512) == 0
        ticks = div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024,
         * overflow after 64.99 years.
         * exact for USER_HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ...
         */
        ticks = div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ);
#endif
        return ticks;
}

/**
 * jiffies64_to_nsecs - Convert jiffies64 to nanoseconds
 * @j: jiffies64 value
 *
 * Uses a plain multiply when HZ divides NSEC_PER_SEC evenly and the
 * HZ_TO_NSEC_NUM / HZ_TO_NSEC_DEN ratio otherwise.
 *
 * Return: nanoseconds value
 */
u64 jiffies64_to_nsecs(u64 j)
{
#if (NSEC_PER_SEC % HZ) != 0
        return div_u64(j * HZ_TO_NSEC_NUM, HZ_TO_NSEC_DEN);
#else
        return j * (NSEC_PER_SEC / HZ);
#endif
}
EXPORT_SYMBOL(jiffies64_to_nsecs);

/**
 * jiffies64_to_msecs - Convert jiffies64 to milliseconds
 * @j: jiffies64 value
 *
 * Uses a plain multiply when HZ is no larger than MSEC_PER_SEC and
 * divides it evenly, and the HZ_TO_MSEC_NUM / HZ_TO_MSEC_DEN ratio
 * otherwise.
 *
 * Return: milliseconds value
 */
u64 jiffies64_to_msecs(const u64 j)
{
#if HZ > MSEC_PER_SEC || (MSEC_PER_SEC % HZ) != 0
        return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
#else
        return j * (MSEC_PER_SEC / HZ);
#endif
}
EXPORT_SYMBOL(jiffies64_to_msecs);

/**
 * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
 *
 * @n:        nsecs in u64
 *
 * Unlike {m,u}secs_to_jiffies, the input type is u64 rather than
 * unsigned int.  This function never returns MAX_JIFFY_OFFSET, because
 * it is designed for the scheduler and not for device drivers that
 * calculate timeout values.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies64 value
 */
u64 nsecs_to_jiffies64(u64 n)
{
        u64 jiffies64;

#if (NSEC_PER_SEC % HZ) == 0
        /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
        jiffies64 = div_u64(n, NSEC_PER_SEC / HZ);
#elif (HZ % 512) == 0
        /* overflow after 292 years if HZ = 1024 */
        jiffies64 = div_u64(n * HZ / 512, NSEC_PER_SEC / 512);
#else
        /*
         * Generic case - optimized for cases where HZ is a multiple of 3.
         * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc.
         */
        jiffies64 = div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ);
#endif
        return jiffies64;
}
EXPORT_SYMBOL(nsecs_to_jiffies64);

/**
 * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
 *
 * @n:        nsecs in u64
 *
 * Thin wrapper that casts the result of nsecs_to_jiffies64() to
 * unsigned long.
 *
 * Unlike {m,u}secs_to_jiffies, the input type is u64 rather than
 * unsigned int.  This function never returns MAX_JIFFY_OFFSET, because
 * it is designed for the scheduler and not for device drivers that
 * calculate timeout values.
 *
 * note:
 *   NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
 *   ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
 *
 * Return: nsecs converted to jiffies value
 */
unsigned long nsecs_to_jiffies(u64 n)
{
        const u64 jiffies64 = nsecs_to_jiffies64(n);

        return (unsigned long)jiffies64;
}
EXPORT_SYMBOL_GPL(nsecs_to_jiffies);

/**
 * timespec64_add_safe - Add two timespec64 values and do a safety check
 * for overflow.
 * @lhs: first (left) timespec64 to add
 * @rhs: second (right) timespec64 to add
 *
 * It's assumed that both values are valid (>= 0).
 * And, each timespec64 is in normalized form.
 *
 * Return: sum of @lhs + @rhs
 */
struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                const struct timespec64 rhs)
{
        struct timespec64 res;

        set_normalized_timespec64(&res, (timeu64_t) lhs.tv_sec + rhs.tv_sec,
                        lhs.tv_nsec + rhs.tv_nsec);

        if (unlikely(res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)) {
                res.tv_sec = TIME64_MAX;
                res.tv_nsec = 0;
        }

        return res;
}

/**
 * get_timespec64 - get user's time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's time value as &struct __kernel_timespec
 *
 * Handles compat or 32-bit modes: for compat syscalls only the low
 * 32 bits of the user supplied tv_nsec are kept.
 *
 * Return: 0 on success or negative errno on error
 */
int get_timespec64(struct timespec64 *ts,
                   const struct __kernel_timespec __user *uts)
{
        struct __kernel_timespec tmp;

        if (copy_from_user(&tmp, uts, sizeof(tmp)))
                return -EFAULT;

        /* Zero out the padding in compat mode */
        if (in_compat_syscall())
                tmp.tv_nsec &= 0xFFFFFFFFUL;

        ts->tv_sec = tmp.tv_sec;
        /* In 32-bit mode, this drops the padding */
        ts->tv_nsec = tmp.tv_nsec;

        return 0;
}
EXPORT_SYMBOL_GPL(get_timespec64);

/**
 * put_timespec64 - convert timespec64 value to __kernel_timespec format and
 *                     copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct __kernel_timespec
 *
 * Return: 0 on success or negative errno on error
 */
int put_timespec64(const struct timespec64 *ts,
                   struct __kernel_timespec __user *uts)
{
        /* Stage the value in the layout that userspace expects. */
        struct __kernel_timespec out = {
                .tv_sec = ts->tv_sec,
                .tv_nsec = ts->tv_nsec
        };

        if (copy_to_user(uts, &out, sizeof(out)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_timespec64);

/*
 * Copy a &struct old_timespec32 from userspace and widen it into @ts64.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
static int __get_old_timespec32(struct timespec64 *ts64,
                                   const struct old_timespec32 __user *cts)
{
        struct old_timespec32 tmp;

        if (copy_from_user(&tmp, cts, sizeof(tmp)))
                return -EFAULT;

        ts64->tv_sec = tmp.tv_sec;
        ts64->tv_nsec = tmp.tv_nsec;

        return 0;
}

/*
 * Convert @ts64 into a &struct old_timespec32 and copy it to userspace.
 * Returns 0 on success or -EFAULT if the user copy fails.
 */
static int __put_old_timespec32(const struct timespec64 *ts64,
                                   struct old_timespec32 __user *cts)
{
        struct old_timespec32 out = {
                .tv_sec = ts64->tv_sec,
                .tv_nsec = ts64->tv_nsec
        };

        if (copy_to_user(cts, &out, sizeof(out)))
                return -EFAULT;

        return 0;
}

/**
 * get_old_timespec32 - get user's old-format time value into kernel space
 * @ts: destination &struct timespec64
 * @uts: user's old-format time value (&struct old_timespec32)
 *
 * Handles X86_X32_ABI compatibility conversion: when
 * COMPAT_USE_64BIT_TIME is set, sizeof(*ts) bytes are copied from
 * userspace straight into @ts; otherwise the value is converted from
 * &struct old_timespec32.
 *
 * Return: 0 on success or negative errno on error
 */
int get_old_timespec32(struct timespec64 *ts, const void __user *uts)
{
        if (!COMPAT_USE_64BIT_TIME)
                return __get_old_timespec32(ts, uts);

        if (copy_from_user(ts, uts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_timespec32);

/**
 * put_old_timespec32 - convert timespec64 value to &struct old_timespec32 and
 *                         copy the latter to userspace
 * @ts: input &struct timespec64
 * @uts: user's &struct old_timespec32
 *
 * Handles X86_X32_ABI compatibility conversion: when
 * COMPAT_USE_64BIT_TIME is set, sizeof(*ts) bytes of @ts are copied to
 * userspace unchanged; otherwise the value is converted to
 * &struct old_timespec32 first.
 *
 * Return: 0 on success or negative errno on error
 */
int put_old_timespec32(const struct timespec64 *ts, void __user *uts)
{
        if (!COMPAT_USE_64BIT_TIME)
                return __put_old_timespec32(ts, uts);

        if (copy_to_user(uts, ts, sizeof(*ts)))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_timespec32);

/**
 * get_itimerspec64 - get user's &struct __kernel_itimerspec into kernel space
 * @it: destination &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Fetches it_interval first and it_value second; it_value is not read
 * if fetching it_interval fails.
 *
 * Return: 0 on success or negative errno on error
 */
int get_itimerspec64(struct itimerspec64 *it,
                        const struct __kernel_itimerspec __user *uit)
{
        int err = get_timespec64(&it->it_interval, &uit->it_interval);

        if (err)
                return err;

        return get_timespec64(&it->it_value, &uit->it_value);
}
EXPORT_SYMBOL_GPL(get_itimerspec64);

/**
 * put_itimerspec64 - convert &struct itimerspec64 to __kernel_itimerspec format
 *                       and copy the latter to userspace
 * @it: input &struct itimerspec64
 * @uit: user's &struct __kernel_itimerspec
 *
 * Writes it_interval first and it_value second; it_value is not written
 * if writing it_interval fails.
 *
 * Return: 0 on success or negative errno on error
 */
int put_itimerspec64(const struct itimerspec64 *it,
                        struct __kernel_itimerspec __user *uit)
{
        int err = put_timespec64(&it->it_interval, &uit->it_interval);

        if (err)
                return err;

        return put_timespec64(&it->it_value, &uit->it_value);
}
EXPORT_SYMBOL_GPL(put_itimerspec64);

/**
 * get_old_itimerspec32 - get user's &struct old_itimerspec32 into kernel space
 * @its: destination &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Fetches it_interval first and it_value second; it_value is not read
 * if fetching it_interval fails.
 *
 * Return: 0 on success or negative errno on error
 */
int get_old_itimerspec32(struct itimerspec64 *its,
                        const struct old_itimerspec32 __user *uits)
{
        if (__get_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__get_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(get_old_itimerspec32);

/**
 * put_old_itimerspec32 - convert &struct itimerspec64 to &struct
 *                          old_itimerspec32 and copy the latter to userspace
 * @its: input &struct itimerspec64
 * @uits: user's &struct old_itimerspec32
 *
 * Writes it_interval first and it_value second; it_value is not written
 * if writing it_interval fails.
 *
 * Return: 0 on success or negative errno on error
 */
int put_old_itimerspec32(const struct itimerspec64 *its,
                        struct old_itimerspec32 __user *uits)
{
        if (__put_old_timespec32(&its->it_interval, &uits->it_interval))
                return -EFAULT;

        if (__put_old_timespec32(&its->it_value, &uits->it_value))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(put_old_itimerspec32);














































































   27 






    1 




























   27 








   27 




















    1 




   27 
    1 












































    3 






    1 












   31 
   12 
















   13 



   27 
   17 



























   33 
    8 

















   27 





   27 


    1 




   27 




   27 













    5 







    5 




















   28 















    4 













    5 



    4 

























   34 


























































    1 











    2 


    1 

    1 



































    2 






    1 


















    1 


    2 



    2 


    2 


    2 
















































   20 

























    3 



    2 


    4 























    1 




    2 

















    4 














    5 





    5 
    2 

















    5 






    3 




    2 


    2 















    2 
    5 


    2 
    4 










   28 























    2 

































    1 













    1 





   42 






   42 












   43 








   39 




   32 









    1 




   40 











    2 

















    2 









































   32 
















    1 



    5 


    1 

    5 

















   27 









   28 










   10 



    1 



   28 


    2 




   27 


    4 
   27 











   27 







   27 



    2 




   27 







   13 







   13 
    4 

   13 

   13 








   15 





   21 


   21 
    3 


















   35 

















    1 





   28 







    5 






    1 

   27 




















    3 
    3 




    1 


   28 








   28 






    5 


















    1 





























    2 















   24 





   24 




   23 



   24 




















    7 



   24 
























































    2 












    1 

   28 


   29 


    1 


   27 


    2 


    2 


    1 





    2 


    2 


   26 


















































   31 






   34 

    8 


   34 

















   34 




    1 




   33 
   31 









   31 
















   32 









    1 



   31 









   31 









































   34 


   34 





   33 













   38 





    8 


    1 

   35 




   35 


















   35 




    1 


    1 









   34 















   35 







    1 




   33 
    8 
    1 
































    2 


































   34 

   34 




    5 
    5 




































    7 












    6 

    2 












































    3 







    3 

    3 
















    1 




    1 









   19 





















    2 




    1 






    1 







    1 










    2 


   10 

    2 














    3 












    2 


    2 



































   12 













   12 
    6 



    8 



    4 


   11 















    4 





    4 












    6 


















    4 


    1 



    1 




























    2 




























    4 


    4 














































    3 






    3 















   11 
















   10 





    1 


    8 




    4 
    4 






























    9 




    2 







    4 



    3 




















    3 
    3 








    3 






















    2 






    3 


    9 








    2 
    9 































    7 









    5 



    3 



    1 


    4 




    1 














    6 




    5 














    5 
















   12 






    4 
    7 
    2 



















   12 


   12 



   10 








   20 


   19 



   14 






























    2 





    7 
    1 

    2 





    2 











    2 









    1 




    2 







    1 

    2 


   12 


   19 





















    2 







   59 



    1 








    9 




    2 






    5 



   34 




   17 










    7 
    1 






    1 


    1 









    2 

    2 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
// SPDX-License-Identifier: GPL-2.0-only
/*
 * GICv3 ITS emulation
 *
 * Copyright (C) 2015,2016 ARM Ltd.
 * Author: Andre Przywara <andre.przywara@arm.com>
 */

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/list_sort.h>

#include <linux/irqchip/arm-gic-v3.h>

#include <asm/kvm_emulate.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>

#include "vgic.h"
#include "vgic-mmio.h"

/* File-local kvm_device_ops instance, declared up front without an initializer. */
static struct kvm_device_ops kvm_arm_vgic_its_ops;

/* Forward declarations of file-local helpers. */
static int vgic_its_save_tables_v0(struct vgic_its *its);
static int vgic_its_restore_tables_v0(struct vgic_its *its);
static int vgic_its_commit_v0(struct vgic_its *its);
static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
                             struct kvm_vcpu *filter_vcpu, bool needs_inv);

/*
 * Read one table entry of kind @t from guest address @g into *@valp.
 *
 * The expected entry size is the <t>_esz field of the ABI returned by
 * vgic_its_get_abi() for ITS @i.  With a single ABI (NR_ITS_ABIS == 1)
 * the size of *@valp is checked against ABI_0_ESZ at build time.  With
 * more than one ABI, a mismatch between that size and sizeof(*@valp)
 * trips KVM_BUG_ON() and the macro evaluates to -EINVAL.  Otherwise it
 * evaluates to the return value of kvm_read_guest_lock().
 */
#define vgic_its_read_entry_lock(i, g, valp, t)                                \
        ({                                                                \
                int __sz = vgic_its_get_abi(i)->t##_esz;                \
                struct kvm *__k = (i)->dev->kvm;                        \
                int __ret;                                                \
                                                                        \
                BUILD_BUG_ON(NR_ITS_ABIS == 1 &&                        \
                             sizeof(*(valp)) != ABI_0_ESZ);                \
                if (NR_ITS_ABIS > 1 &&                                        \
                    KVM_BUG_ON(__sz != sizeof(*(valp)), __k))                \
                        __ret = -EINVAL;                                \
                else                                                        \
                        __ret = kvm_read_guest_lock(__k, (g),                \
                                                    valp, __sz);        \
                __ret;                                                        \
        })

/*
 * Write one ITS table entry to guest memory.
 *
 * @i:   the struct vgic_its the table belongs to
 * @g:   guest physical address of the entry
 * @val: value to store; it is evaluated once and copied into a local, so
 *       its type determines the number of bytes considered
 * @t:   entry kind prefix (cte, dte or ite), pasted onto "_esz" to select
 *       the matching size field of the current ABI
 *
 * The size of @val must equal the ABI's entry size. With a single ABI this
 * is enforced at build time against ABI_0_ESZ; with several ABIs it is
 * checked at run time via KVM_BUG_ON() and -EINVAL is produced on a
 * mismatch. Otherwise the macro evaluates to the return value of
 * vgic_write_guest_lock().
 */
#define vgic_its_write_entry_lock(i, g, val, t)                                \
        ({                                                                \
                int __sz = vgic_its_get_abi(i)->t##_esz;                \
                struct kvm *__k = (i)->dev->kvm;                        \
                typeof(val) __v = (val);                                \
                int __ret;                                                \
                                                                        \
                BUILD_BUG_ON(NR_ITS_ABIS == 1 &&                        \
                             sizeof(__v) != ABI_0_ESZ);                        \
                if (NR_ITS_ABIS > 1 &&                                        \
                    KVM_BUG_ON(__sz != sizeof(__v), __k))                \
                        __ret = -EINVAL;                                \
                else                                                        \
                        __ret = vgic_write_guest_lock(__k, (g),                \
                                                      &__v, __sz);        \
                __ret;                                                        \
        })

/*
 * Creates a new (reference to a) struct vgic_irq for a given LPI.
 * If this LPI is already mapped on another ITS, we increase its refcount
 * and return a pointer to the existing structure.
 * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
 * This function returns a pointer to the _unlocked_ structure.
 *
 * @kvm:   the VM the LPI belongs to
 * @intid: the LPI number
 * @vcpu:  initial target VCPU recorded in a newly created vgic_irq
 *
 * Returns the vgic_irq (with a reference held for the caller) on success,
 * or an ERR_PTR() if allocation, the xarray insertion, or the initial
 * read of the configuration/pending state fails.
 */
static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
                                     struct kvm_vcpu *vcpu)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
        unsigned long flags;
        int ret;

        /* In this case there is no put, since we keep the reference. */
        if (irq)
                return irq;

        irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
        if (!irq)
                return ERR_PTR(-ENOMEM);

        /*
         * Reserve the slot up front so that the store below, which runs
         * under the xarray lock, is passed a gfp mask of 0.
         */
        ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
        if (ret) {
                kfree(irq);
                return ERR_PTR(ret);
        }

        INIT_LIST_HEAD(&irq->ap_list);
        raw_spin_lock_init(&irq->irq_lock);

        irq->config = VGIC_CONFIG_EDGE;
        kref_init(&irq->refcount);
        irq->intid = intid;
        irq->target_vcpu = vcpu;
        irq->group = 1;

        xa_lock_irqsave(&dist->lpi_xa, flags);

        /*
         * There could be a race with another vgic_add_lpi(), so we need to
         * check that we don't add a second list entry with the same LPI.
         */
        oldirq = xa_load(&dist->lpi_xa, intid);
        if (vgic_try_get_irq_kref(oldirq)) {
                /* Someone was faster with adding this LPI, lets use that. */
                kfree(irq);
                irq = oldirq;

                /* ret is still 0 from the successful reservation. */
                goto out_unlock;
        }

        ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
        if (ret) {
                /*
                 * Drop the reservation using the caller-locked primitive.
                 * xa_release() is xa_cmpxchg(), which acquires xa_lock
                 * itself and would self-deadlock since we already hold it.
                 */
                __xa_cmpxchg(&dist->lpi_xa, intid, XA_ZERO_ENTRY, NULL, 0);
                kfree(irq);
        }

out_unlock:
        xa_unlock_irqrestore(&dist->lpi_xa, flags);

        if (ret)
                return ERR_PTR(ret);

        /*
         * We "cache" the configuration table entries in our struct vgic_irq's.
         * However we only have those structs for mapped IRQs, so we read in
         * the respective config data from memory here upon mapping the LPI.
         *
         * Should any of these fail, behave as if we couldn't create the LPI
         * by dropping the refcount and returning the error.
         */
        ret = update_lpi_config(kvm, irq, NULL, false);
        if (ret) {
                vgic_put_irq(kvm, irq);
                return ERR_PTR(ret);
        }

        ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
        if (ret) {
                vgic_put_irq(kvm, irq);
                return ERR_PTR(ret);
        }

        return irq;
}

/**
 * struct vgic_its_abi - ITS abi ops and settings
 * @cte_esz: collection table entry size, in bytes
 * @dte_esz: device table entry size, in bytes
 * @ite_esz: interrupt translation table entry size, in bytes
 * @save_tables: save the ITS tables into guest RAM
 * @restore_tables: restore the ITS internal structs from tables
 *  stored in guest RAM
 * @commit: initialize the registers which expose the ABI settings,
 *  especially the entry sizes
 *
 * One instance exists per supported ABI revision; the instance in use is
 * selected by the abi_rev field of struct vgic_its.
 */
struct vgic_its_abi {
        int cte_esz;
        int dte_esz;
        int ite_esz;
        int (*save_tables)(struct vgic_its *its);
        int (*restore_tables)(struct vgic_its *its);
        int (*commit)(struct vgic_its *its);
};

/* Size in bytes of every table entry kind under ABI revision 0. */
#define ABI_0_ESZ        8
/* Largest entry size across all supported ABI revisions. */
#define ESZ_MAX                ABI_0_ESZ

/*
 * Table of supported ITS table ABIs, indexed by ABI revision (the value
 * stored in the abi_rev field of struct vgic_its).
 */
static const struct vgic_its_abi its_table_abi_versions[] = {
        [0] = {
         .cte_esz = ABI_0_ESZ,
         .dte_esz = ABI_0_ESZ,
         .ite_esz = ABI_0_ESZ,
         .save_tables = vgic_its_save_tables_v0,
         .restore_tables = vgic_its_restore_tables_v0,
         .commit = vgic_its_commit_v0,
        },
};

/* Number of supported ABI revisions; valid revisions are 0..NR_ITS_ABIS-1. */
#define NR_ITS_ABIS        ARRAY_SIZE(its_table_abi_versions)

/*
 * Return the ABI descriptor selected by the ITS' current abi_rev.
 * abi_rev is used as an index without a range check here.
 */
inline const struct vgic_its_abi *vgic_its_get_abi(struct vgic_its *its)
{
        const struct vgic_its_abi *abi = its_table_abi_versions + its->abi_rev;

        return abi;
}

/*
 * Switch @its to ABI revision @rev and invoke that revision's commit hook.
 * @rev is not validated here. Returns the commit hook's result.
 */
static int vgic_its_set_abi(struct vgic_its *its, u32 rev)
{
        its->abi_rev = rev;

        return vgic_its_get_abi(its)->commit(its);
}

/*
 * Finds and returns the device with the given device ID on an ITS, or
 * NULL if no such device is on the ITS' device list.
 * Must be called with the its_lock mutex held.
 */
static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
{
        struct its_device *dev;

        list_for_each_entry(dev, &its->device_list, dev_list) {
                if (dev->device_id != device_id)
                        continue;

                return dev;
        }

        return NULL;
}

/*
 * Finds and returns the interrupt translation table entry (ITTE) for a
 * given Device ID/Event ID pair on an ITS, or NULL if either the device
 * or the event is unknown.
 * Must be called with the its_lock mutex held.
 */
static struct its_ite *find_ite(struct vgic_its *its, u32 device_id,
                                  u32 event_id)
{
        struct its_device *dev = find_its_device(its, device_id);
        struct its_ite *entry;

        if (!dev)
                return NULL;

        list_for_each_entry(entry, &dev->itt_head, ite_list) {
                if (entry->event_id != event_id)
                        continue;

                return entry;
        }

        return NULL;
}

/*
 * Iterate over every ITE of every device on an ITS: @dev walks the ITS'
 * device list and, for each device, @ite walks that device's ITT list.
 *
 * To be used as an iterator this macro misses the enclosing parentheses.
 * Since it expands to two nested loops, a "break" or "continue" in the
 * body only affects the inner (per-ITE) loop.
 */
#define for_each_lpi_its(dev, ite, its) \
        list_for_each_entry(dev, &(its)->device_list, dev_list) \
                list_for_each_entry(ite, &(dev)->itt_head, ite_list)

/*
 * First LPI INTID: IDs below this are rejected when mapping an LPI, and
 * it is subtracted from an INTID to index the LPI property table.
 */
#define GIC_LPI_OFFSET 8192

/* Number of event ID bits advertised in GITS_TYPER, and the resulting limit. */
#define VITS_TYPER_IDBITS                16
#define VITS_MAX_EVENTID                (BIT(VITS_TYPER_IDBITS) - 1)
/* Number of device ID bits advertised in GITS_TYPER, and the resulting limit. */
#define VITS_TYPER_DEVBITS                16
#define VITS_MAX_DEVID                        (BIT(VITS_TYPER_DEVBITS) - 1)
/*
 * NOTE(review): presumably the largest "next entry" offsets encodable in
 * a saved device table entry / ITE; confirm against the save/restore code.
 */
#define VITS_DTE_MAX_DEVID_OFFSET        (BIT(14) - 1)
#define VITS_ITE_MAX_EVENTID_OFFSET        (BIT(16) - 1)

/*
 * Finds and returns the collection with the given ID on an ITS, or NULL
 * if the ITS' collection list holds no such collection.
 * Must be called with the its_lock mutex held.
 */
static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
{
        struct its_collection *col;

        list_for_each_entry(col, &its->collection_list, coll_list) {
                if (col->collection_id != coll_id)
                        continue;

                return col;
        }

        return NULL;
}

/* Non-zero if the enable flag is set in LPI property byte @p. */
#define LPI_PROP_ENABLE_BIT(p)        ((p) & LPI_PROP_ENABLED)
/* Priority field of LPI property byte @p: bits [7:2], left in place. */
#define LPI_PROP_PRIORITY(p)        ((p) & 0xfc)

/*
 * Reads the configuration data for a given LPI from guest memory and
 * updates the fields in struct vgic_irq.
 * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
 * VCPU. Unconditionally applies if filter_vcpu is NULL.
 *
 * @needs_inv is only passed on to its_prop_update_vlpi() for LPIs that
 * are marked as hardware-backed (irq->hw).
 *
 * Returns 0 on success, or the error from reading guest memory or from
 * its_prop_update_vlpi().
 */
static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
                             struct kvm_vcpu *filter_vcpu, bool needs_inv)
{
        u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
        u8 prop;
        int ret;
        unsigned long flags;

        /* One property byte per LPI, the first one belonging to INTID 8192. */
        ret = kvm_read_guest_lock(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
                                  &prop, 1);

        if (ret)
                return ret;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);

        if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
                irq->priority = LPI_PROP_PRIORITY(prop);
                irq->enabled = LPI_PROP_ENABLE_BIT(prop);

                if (!irq->hw) {
                        /*
                         * irq_lock is handed over to vgic_queue_irq_unlock(),
                         * so it must not be released again on this path.
                         */
                        vgic_queue_irq_unlock(kvm, irq, flags);
                        return 0;
                }
        }

        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        /* Hardware-backed LPI: forward the property byte to the host side. */
        if (irq->hw)
                return its_prop_update_vlpi(irq->host_irq, prop, needs_inv);

        return 0;
}

/*
 * Retarget @irq to @vcpu. For a hardware-backed LPI (irq->hw) the VLPI
 * mapping is moved to the new VCPU's VPE as well, keeping the per-VPE
 * vlpi_count in step.
 * Returns 0 on success or the error from its_get_vlpi()/its_map_vlpi().
 */
static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu)
{
        struct its_vlpi_map map;
        unsigned long flags;
        int ret;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        irq->target_vcpu = vcpu;
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        /* Purely emulated LPI: nothing else to update. */
        if (!irq->hw)
                return 0;

        ret = its_get_vlpi(irq->host_irq, &map);
        if (ret)
                return ret;

        /* Move the accounting from the old VPE (if any) to the new one. */
        if (map.vpe)
                atomic_dec(&map.vpe->vlpi_count);
        map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
        atomic_inc(&map.vpe->vlpi_count);

        return its_map_vlpi(irq->host_irq, &map);
}

/*
 * Look up the VCPU a collection targets, using the collection's
 * target_addr as the VCPU id. Returns whatever kvm_get_vcpu_by_id() yields.
 */
static struct kvm_vcpu *collection_to_vcpu(struct kvm *kvm,
                                           struct its_collection *col)
{
        struct kvm_vcpu *target = kvm_get_vcpu_by_id(kvm, col->target_addr);

        return target;
}

/*
 * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
 * is targeting) to the VGIC's view, which deals with target VCPUs.
 * Needs to be called whenever either the collection for an LPI has
 * changed or the collection itself got retargeted.
 * Does nothing while the ITTE's collection is not mapped.
 */
static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite)
{
        if (its_is_collection_mapped(ite->collection))
                update_affinity(ite->irq,
                                collection_to_vcpu(kvm, ite->collection));
}

/*
 * Updates the target VCPU for every LPI on @its that is routed through
 * collection @coll.
 * Must be called with the its_lock mutex held.
 */
static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
                                       struct its_collection *coll)
{
        struct its_device *dev;
        struct its_ite *entry;

        for_each_lpi_its(dev, entry, its) {
                if (entry->collection == coll)
                        update_affinity_ite(kvm, entry);
        }
}

/*
 * Upper bound (exclusive) on interrupt IDs implied by @propbaser: two to
 * the power of the ID bits field (bits [4:0]) plus one, with the exponent
 * capped at INTERRUPT_ID_BITS_ITS.
 */
static u32 max_lpis_propbaser(u64 propbaser)
{
        int nr_idbits = (propbaser & 0x1f) + 1;
        int shift = min(nr_idbits, INTERRUPT_ID_BITS_ITS);

        return 1U << shift;
}

/*
 * Sync the pending table pending bit of LPIs targeting @vcpu
 * with our own data structures. This relies on the LPI being
 * mapped before.
 *
 * Walks every LPI in the distributor's lpi_xa; the pending table holds
 * one bit per INTID, so LPI n lives in byte n / 8, bit n % 8.
 *
 * Returns 0 on success or the error from reading guest memory, in which
 * case the walk stops at the first failing read.
 */
static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
{
        gpa_t pendbase = GICR_PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        unsigned long intid, flags;
        struct vgic_irq *irq;
        /* -1 never matches a real offset, forcing a read on the first pass. */
        int last_byte_offset = -1;
        int ret = 0;
        u8 pendmask;

        xa_for_each(&dist->lpi_xa, intid, irq) {
                int byte_offset, bit_nr;

                byte_offset = intid / BITS_PER_BYTE;
                bit_nr = intid % BITS_PER_BYTE;

                /*
                 * For contiguously allocated LPIs chances are we just read
                 * this very same byte in the last iteration. Reuse that.
                 */
                if (byte_offset != last_byte_offset) {
                        ret = kvm_read_guest_lock(vcpu->kvm,
                                                  pendbase + byte_offset,
                                                  &pendmask, 1);
                        if (ret)
                                return ret;

                        last_byte_offset = byte_offset;
                }

                /*
                 * Look the LPI up again by INTID rather than using the
                 * iterator's pointer; presumably this takes a reference,
                 * which is dropped by the vgic_put_irq() below.
                 */
                irq = vgic_get_irq(vcpu->kvm, intid);
                if (!irq)
                        continue;

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                /* Only LPIs targeting @vcpu take their state from its table. */
                if (irq->target_vcpu == vcpu)
                        irq->pending_latch = pendmask & (1U << bit_nr);
                /* Called for every LPI; it is also what releases irq_lock. */
                vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }

        return ret;
}

/*
 * MMIO read handler for GITS_TYPER: builds the 64-bit register value and
 * returns the @len bytes starting at offset (@addr & 7) within it.
 */
static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
                                              struct vgic_its *its,
                                              gpa_t addr, unsigned int len)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 typer;

        /*
         * We use linear CPU numbers for redistributor addressing,
         * so GITS_TYPER.PTA is 0.
         * Also we force all PROPBASER registers to be the same, so
         * CommonLPIAff is 0 as well.
         * To avoid memory waste in the guest, we keep the number of IDBits and
         * DevBits low - at least for the time being.
         */
        typer  = GITS_TYPER_PLPIS;
        typer |= GIC_ENCODE_SZ(VITS_TYPER_DEVBITS, 5) << GITS_TYPER_DEVBITS_SHIFT;
        typer |= GIC_ENCODE_SZ(VITS_TYPER_IDBITS, 5) << GITS_TYPER_IDBITS_SHIFT;
        typer |= GIC_ENCODE_SZ(abi->ite_esz, 4) << GITS_TYPER_ITT_ENTRY_SIZE_SHIFT;

        return extract_bytes(typer, addr & 7, len);
}

/*
 * MMIO read handler for GITS_IIDR: combines the current ABI revision with
 * the KVM product ID and the ARM implementer code.
 */
static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
                                             struct vgic_its *its,
                                             gpa_t addr, unsigned int len)
{
        u32 iidr = (its->abi_rev << GITS_IIDR_REV_SHIFT) & GITS_IIDR_REV_MASK;

        iidr |= PRODUCT_ID_KVM << GITS_IIDR_PRODUCTID_SHIFT;
        iidr |= IMPLEMENTER_ARM;

        return iidr;
}

/*
 * Userspace write handler for GITS_IIDR: selects the ABI revision encoded
 * in @val. Returns -EINVAL for a revision this implementation does not
 * provide, otherwise the result of vgic_its_set_abi().
 */
static int vgic_mmio_uaccess_write_its_iidr(struct kvm *kvm,
                                            struct vgic_its *its,
                                            gpa_t addr, unsigned int len,
                                            unsigned long val)
{
        u32 rev = GITS_IIDR_REV(val);

        if (rev < NR_ITS_ABIS)
                return vgic_its_set_abi(its, rev);

        return -EINVAL;
}

/*
 * MMIO read handler for the ITS identification registers. The register is
 * selected by the low 16 bits of @addr; any offset not listed reads as 0.
 */
static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
                                               struct vgic_its *its,
                                               gpa_t addr, unsigned int len)
{
        unsigned long val = 0;

        switch (addr & 0xffff) {
        case GITS_PIDR0:
                val = 0x92;        /* part number, bits[7:0] */
                break;
        case GITS_PIDR1:
                val = 0xb4;        /* part number, bits[11:8] */
                break;
        case GITS_PIDR2:
                val = GIC_PIDR2_ARCH_GICv3 | 0x0b;
                break;
        case GITS_PIDR4:
                val = 0x40;        /* This is a 64K software visible page */
                break;
        /* The following are the ID registers for (any) GIC. */
        case GITS_CIDR0:
                val = 0x0d;
                break;
        case GITS_CIDR1:
                val = 0xf0;
                break;
        case GITS_CIDR2:
                val = 0x05;
                break;
        case GITS_CIDR3:
                val = 0xb1;
                break;
        }

        return val;
}

/*
 * Translate a doorbell guest physical address into the ITS it belongs to.
 * Returns ERR_PTR(-EINVAL) if no MMIO device is registered at @db, or if
 * the device found there is not a vgic ITS I/O device.
 */
static struct vgic_its *__vgic_doorbell_to_its(struct kvm *kvm, gpa_t db)
{
        struct kvm_io_device *io_dev;
        struct vgic_io_device *vgic_iodev;

        io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, db);
        if (!io_dev || io_dev->ops != &kvm_io_gic_ops)
                return ERR_PTR(-EINVAL);

        vgic_iodev = container_of(io_dev, struct vgic_io_device, dev);
        if (vgic_iodev->iodev_type == IODEV_ITS)
                return vgic_iodev->its;

        return ERR_PTR(-EINVAL);
}

/*
 * Build the translation cache index for a Device ID/Event ID pair: the
 * device ID is placed above the VITS_TYPER_IDBITS event ID bits.
 */
static unsigned long vgic_its_cache_key(u32 devid, u32 eventid)
{
        unsigned long key = devid;

        key <<= VITS_TYPER_IDBITS;
        key |= eventid;

        return key;
}

/*
 * Look up a cached translation for the given doorbell/devid/eventid.
 * Returns the vgic_irq with a reference taken on behalf of the caller, or
 * NULL if the IDs are out of range, @db is not an ITS doorbell, nothing
 * is cached, or a reference could not be obtained.
 */
static struct vgic_irq *vgic_its_check_cache(struct kvm *kvm, phys_addr_t db,
                                             u32 devid, u32 eventid)
{
        struct vgic_irq *cached;
        struct vgic_its *its;
        unsigned long key;

        if (devid > VITS_MAX_DEVID || eventid > VITS_MAX_EVENTID)
                return NULL;

        its = __vgic_doorbell_to_its(kvm, db);
        if (IS_ERR(its))
                return NULL;

        key = vgic_its_cache_key(devid, eventid);

        rcu_read_lock();
        cached = xa_load(&its->translation_cache, key);
        if (!vgic_try_get_irq_kref(cached))
                cached = NULL;
        rcu_read_unlock();

        return cached;
}

/*
 * Remember the devid/eventid -> @irq translation in the ITS' translation
 * cache. The cache holds its own reference on each stored vgic_irq.
 * Must be called with the its_lock mutex held.
 */
static void vgic_its_cache_translation(struct kvm *kvm, struct vgic_its *its,
                                       u32 devid, u32 eventid,
                                       struct vgic_irq *irq)
{
        unsigned long key = vgic_its_cache_key(devid, eventid);
        struct vgic_irq *prev;

        /* Do not cache a directly injected interrupt */
        if (irq->hw)
                return;

        /*
         * The irq refcount is guaranteed to be nonzero while holding the
         * its_lock, as the ITE (and the reference it holds) cannot be freed.
         */
        lockdep_assert_held(&its->its_lock);
        vgic_get_irq_kref(irq);

        prev = xa_store(&its->translation_cache, key, irq, GFP_KERNEL_ACCOUNT);

        if (xa_is_err(prev)) {
                /*
                 * The store failed: give back the reference just taken on
                 * @irq. The error is intentionally not propagated, as the
                 * translation cache is best effort.
                 */
                vgic_put_irq(kvm, irq);
        } else if (prev) {
                /*
                 * We could have raced with another CPU caching the same
                 * translation behind our back; drop the reference owned
                 * by the entry we just replaced so that it is not leaked.
                 */
                vgic_put_irq(kvm, prev);
        }
}

/*
 * Empty the translation cache of @its, dropping the reference held for
 * each cached vgic_irq.
 */
static void vgic_its_invalidate_cache(struct vgic_its *its)
{
        struct kvm *kvm = its->dev->kvm;
        struct vgic_irq *cached;
        unsigned long key;

        xa_for_each(&its->translation_cache, key, cached) {
                xa_erase(&its->translation_cache, key);
                vgic_put_irq(kvm, cached);
        }
}

/*
 * Invalidate the translation cache of every ITS device of @kvm. The VM's
 * device list is walked under the RCU read lock; devices that are not an
 * ITS are skipped.
 */
void vgic_its_invalidate_all_caches(struct kvm *kvm)
{
        struct kvm_device *dev;

        rcu_read_lock();

        list_for_each_entry_rcu(dev, &kvm->devices, vm_node) {
                struct vgic_its *its;

                if (dev->ops != &kvm_arm_vgic_its_ops)
                        continue;

                its = dev->private;
                vgic_its_invalidate_cache(its);
        }

        rcu_read_unlock();
}

/*
 * Resolve a Device ID/Event ID pair on @its to its vgic_irq, stored in
 * *@irq, and record the translation in the translation cache.
 *
 * Returns 0 on success, -EBUSY if the ITS or the target VCPU's LPIs are
 * not enabled, and E_ITS_INT_UNMAPPED_INTERRUPT if there is no ITE, its
 * collection is unmapped, or the collection has no matching VCPU.
 * *@irq is only written on success.
 */
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
                         u32 devid, u32 eventid, struct vgic_irq **irq)
{
        struct kvm_vcpu *target;
        struct its_ite *ite;

        if (!its->enabled)
                return -EBUSY;

        ite = find_ite(its, devid, eventid);
        if (!ite)
                return E_ITS_INT_UNMAPPED_INTERRUPT;
        if (!its_is_collection_mapped(ite->collection))
                return E_ITS_INT_UNMAPPED_INTERRUPT;

        target = collection_to_vcpu(kvm, ite->collection);
        if (!target)
                return E_ITS_INT_UNMAPPED_INTERRUPT;

        if (!vgic_lpis_enabled(target))
                return -EBUSY;

        vgic_its_cache_translation(kvm, its, devid, eventid, ite->irq);
        *irq = ite->irq;

        return 0;
}

/*
 * Find the ITS addressed by an MSI's doorbell address.
 * Returns ERR_PTR(-ENODEV) if the VM has no ITS, ERR_PTR(-EINVAL) if the
 * MSI carries no valid device ID, and otherwise the result of the
 * doorbell lookup (which may itself be an ERR_PTR).
 */
struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi)
{
        u64 doorbell;

        if (!vgic_has_its(kvm))
                return ERR_PTR(-ENODEV);

        if (!(msi->flags & KVM_MSI_VALID_DEVID))
                return ERR_PTR(-EINVAL);

        doorbell = msi->address_hi;
        doorbell <<= 32;
        doorbell |= msi->address_lo;

        return __vgic_doorbell_to_its(kvm, doorbell);
}

/*
 * Find the target VCPU and the LPI number for a given devid/eventid pair
 * and make this IRQ pending, possibly injecting it.
 * Must be called with the its_lock mutex held.
 * Returns 0 on success, a positive error value for any ITS mapping
 * related errors and negative error values for generic errors.
 */
static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
                                u32 devid, u32 eventid)
{
        struct vgic_irq *lpi = NULL;
        unsigned long flags;
        int ret;

        ret = vgic_its_resolve_lpi(kvm, its, devid, eventid, &lpi);
        if (ret)
                return ret;

        /* Hardware-backed LPI: set the pending state on the host interrupt. */
        if (lpi->hw)
                return irq_set_irqchip_state(lpi->host_irq,
                                             IRQCHIP_STATE_PENDING, true);

        raw_spin_lock_irqsave(&lpi->irq_lock, flags);
        lpi->pending_latch = true;
        vgic_queue_irq_unlock(kvm, lpi, flags);

        return 0;
}

int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi)
{
        struct vgic_irq *irq;
        unsigned long flags;
        phys_addr_t db;

        db = (u64)msi->address_hi << 32 | msi->address_lo;
        irq = vgic_its_check_cache(kvm, db, msi->devid, msi->data);
        if (!irq)
                return -EWOULDBLOCK;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        irq->pending_latch = true;
        vgic_queue_irq_unlock(kvm, irq, flags);
        vgic_put_irq(kvm, irq);

        return 0;
}

/*
 * Inject an MSI into the guest. The translation cache is tried first; on
 * a miss we query the KVM IO bus framework to get the ITS pointer from the
 * given doorbell address and call vgic_its_trigger_msi() with the decoded
 * data under the its_lock.
 * According to the KVM_SIGNAL_MSI API description returns 1 on success,
 * 0 if the MSI was blocked by the guest's ITS configuration, and a
 * negative error value for generic errors.
 */
int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
{
        struct vgic_its *its;
        int err;

        /* Fast path: a cached translation needs no its_lock. */
        if (vgic_its_inject_cached_translation(kvm, msi) == 0)
                return 1;

        its = vgic_msi_to_its(kvm, msi);
        if (IS_ERR(its))
                return PTR_ERR(its);

        mutex_lock(&its->its_lock);
        err = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data);
        mutex_unlock(&its->its_lock);

        if (err < 0)
                return err;

        /*
         * KVM_SIGNAL_MSI demands a return value > 0 for success and 0
         * if the guest has blocked the MSI. So we map any LPI mapping
         * related error to that.
         */
        return err ? 0 : 1;
}

/* Requires the its_lock to be held. */
static void its_free_ite(struct kvm *kvm, struct its_ite *ite)
{
        list_del(&ite->ite_list);

        /* This put matches the get in vgic_add_lpi. */
        if (ite->irq) {
                if (ite->irq->hw)
                        WARN_ON(its_unmap_vlpi(ite->irq->host_irq));

                vgic_put_irq(kvm, ite->irq);
        }

        kfree(ite);
}

/*
 * Extract a @size bit wide field starting at bit @shift from 64-bit word
 * number @word of an ITS command, which is stored little-endian.
 */
static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
{
        u64 value = le64_to_cpu(its_cmd[word]);
        u64 mask = BIT_ULL(size) - 1;

        return (value >> shift) & mask;
}

/*
 * Field accessors for an ITS command buffer; the arguments to
 * its_cmd_mask_field() are (cmd, 64-bit word index, bit shift, bit width).
 * its_cmd_get_size() adds one to the raw 5-bit field, and
 * its_cmd_get_ittaddr() shifts the extracted field back up by 8 bits.
 */
#define its_cmd_get_command(cmd)        its_cmd_mask_field(cmd, 0,  0,  8)
#define its_cmd_get_deviceid(cmd)        its_cmd_mask_field(cmd, 0, 32, 32)
#define its_cmd_get_size(cmd)                (its_cmd_mask_field(cmd, 1,  0,  5) + 1)
#define its_cmd_get_id(cmd)                its_cmd_mask_field(cmd, 1,  0, 32)
#define its_cmd_get_physical_id(cmd)        its_cmd_mask_field(cmd, 1, 32, 32)
#define its_cmd_get_collection(cmd)        its_cmd_mask_field(cmd, 2,  0, 16)
#define its_cmd_get_ittaddr(cmd)        (its_cmd_mask_field(cmd, 2,  8, 44) << 8)
#define its_cmd_get_target_addr(cmd)        its_cmd_mask_field(cmd, 2, 16, 32)
#define its_cmd_get_validbit(cmd)        its_cmd_mask_field(cmd, 2, 63,  1)

/*
 * The DISCARD command frees an Interrupt Translation Table Entry (ITTE)
 * and zeroes its slot in the guest's ITT.
 * Must be called with the its_lock mutex held.
 * Returns E_ITS_DISCARD_UNMAPPED_INTERRUPT if there is no such ITTE or its
 * collection is unmapped, otherwise the result of the guest memory write.
 */
static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
                                       u64 *its_cmd)
{
        u32 device_id = its_cmd_get_deviceid(its_cmd);
        u32 event_id = its_cmd_get_id(its_cmd);
        struct its_device *device;
        struct its_ite *ite;
        int ite_esz;
        gpa_t gpa;

        ite = find_ite(its, device_id, event_id);
        if (!ite || !its_is_collection_mapped(ite->collection))
                return E_ITS_DISCARD_UNMAPPED_INTERRUPT;

        /* Work out the guest address of the entry before the ITE is freed. */
        device = find_its_device(its, device_id);
        ite_esz = vgic_its_get_abi(its)->ite_esz;
        gpa = device->itt_addr + ite->event_id * ite_esz;

        /*
         * Though the spec talks about removing the pending state, we
         * don't bother here since we clear the ITTE anyway and the
         * pending state is a property of the ITTE struct.
         */
        vgic_its_invalidate_cache(its);

        its_free_ite(kvm, ite);

        /* The last argument only names the entry kind (selects ite_esz). */
        return vgic_its_write_entry_lock(its, gpa, 0ULL, ite);
}

/*
 * The MOVI command moves an ITTE to a different collection.
 * Must be called with the its_lock mutex held.
 * Returns an E_ITS_MOVI_* error if the ITTE does not exist or either the
 * current or the new collection is unmapped, otherwise the result of
 * update_affinity().
 */
static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
                                    u64 *its_cmd)
{
        u32 device_id = its_cmd_get_deviceid(its_cmd);
        u32 event_id = its_cmd_get_id(its_cmd);
        u32 coll_id = its_cmd_get_collection(its_cmd);
        struct its_collection *new_coll;
        struct kvm_vcpu *target;
        struct its_ite *ite;

        ite = find_ite(its, device_id, event_id);
        if (!ite)
                return E_ITS_MOVI_UNMAPPED_INTERRUPT;

        if (!its_is_collection_mapped(ite->collection))
                return E_ITS_MOVI_UNMAPPED_COLLECTION;

        new_coll = find_collection(its, coll_id);
        if (!its_is_collection_mapped(new_coll))
                return E_ITS_MOVI_UNMAPPED_COLLECTION;

        ite->collection = new_coll;
        target = collection_to_vcpu(kvm, new_coll);

        /* Cached translations may no longer be accurate. */
        vgic_its_invalidate_cache(its);

        return update_affinity(ite->irq, target);
}

/*
 * Report whether the guest frame containing @gpa is visible to the guest,
 * as determined by kvm_is_visible_gfn(). The check is made inside an SRCU
 * read-side section on the VM's srcu.
 */
static bool __is_visible_gfn_locked(struct vgic_its *its, gpa_t gpa)
{
        struct kvm *kvm = its->dev->kvm;
        bool visible;
        int idx;

        idx = srcu_read_lock(&kvm->srcu);
        visible = kvm_is_visible_gfn(kvm, gpa >> PAGE_SHIFT);
        srcu_read_unlock(&kvm->srcu, idx);

        return visible;
}

/*
 * Check whether an ID can be stored into the corresponding guest table.
 * For a direct table this is pretty easy, but gets a bit nasty for
 * indirect tables. We check whether the resulting guest physical address
 * is actually valid (covered by a memslot and guest accessible).
 * For this we have to read the respective first level entry.
 *
 * @its:   the ITS owning the table
 * @baser: the GITS_BASER value describing the table (device or collection)
 * @id:    the device or collection ID to check
 * @eaddr: if not NULL, receives the guest physical address of the entry
 *         for @id once it has been computed; this happens before the
 *         final visibility check, so it may be written even when false
 *         is returned
 *
 * Returns true if @id is in range for the table type and size and its
 * entry lies in guest-visible memory, false otherwise (including for any
 * table type other than device or collection).
 */
static bool vgic_its_check_id(struct vgic_its *its, u64 baser, u32 id,
                              gpa_t *eaddr)
{
        int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
        u64 indirect_ptr, type = GITS_BASER_TYPE(baser);
        phys_addr_t base = GITS_BASER_ADDR_48_to_52(baser);
        int esz = GITS_BASER_ENTRY_SIZE(baser);
        int index;

        /* First bound the ID by what the ITS advertises for this table type. */
        switch (type) {
        case GITS_BASER_TYPE_DEVICE:
                if (id > VITS_MAX_DEVID)
                        return false;
                break;
        case GITS_BASER_TYPE_COLLECTION:
                /* as GITS_TYPER.CIL == 0, ITS supports 16-bit collection ID */
                if (id >= BIT_ULL(16))
                        return false;
                break;
        default:
                return false;
        }

        /* Direct (flat) table: the entry sits at base + id * esz. */
        if (!(baser & GITS_BASER_INDIRECT)) {
                phys_addr_t addr;

                if (id >= (l1_tbl_size / esz))
                        return false;

                addr = base + id * esz;

                if (eaddr)
                        *eaddr = addr;

                return __is_visible_gfn_locked(its, addr);
        }

        /* calculate and check the index into the 1st level */
        index = id / (SZ_64K / esz);
        if (index >= (l1_tbl_size / sizeof(u64)))
                return false;

        /* Each 1st level entry is represented by a 64-bit value. */
        if (kvm_read_guest_lock(its->dev->kvm,
                           base + index * sizeof(indirect_ptr),
                           &indirect_ptr, sizeof(indirect_ptr)))
                return false;

        indirect_ptr = le64_to_cpu(indirect_ptr);

        /* check the valid bit of the first level entry */
        if (!(indirect_ptr & BIT_ULL(63)))
                return false;

        /* Mask the guest physical address and calculate the frame number. */
        indirect_ptr &= GENMASK_ULL(51, 16);

        /* Find the address of the actual entry */
        index = id % (SZ_64K / esz);
        indirect_ptr += index * esz;

        if (eaddr)
                *eaddr = indirect_ptr;

        return __is_visible_gfn_locked(its, indirect_ptr);
}

/*
 * Check whether an event ID can be stored in the corresponding Interrupt
 * Translation Table, which starts at device->itt_addr: the ID must fit in
 * the device's number of event ID bits and its entry must lie in
 * guest-visible memory.
 */
static bool vgic_its_check_event_id(struct vgic_its *its, struct its_device *device,
                u32 event_id)
{
        int ite_esz = vgic_its_get_abi(its)->ite_esz;

        /* max table size is: BIT_ULL(device->num_eventid_bits) * ite_esz */
        if (event_id >= BIT_ULL(device->num_eventid_bits))
                return false;

        return __is_visible_gfn_locked(its,
                                       device->itt_addr + event_id * ite_esz);
}

/*
 * Add a new, not yet mapped collection with ID @coll_id to the ITS'
 * collection list and hand it back through @colp.
 * Returns 0 on success, and -ENOMEM if the allocation fails (in which
 * case *@colp is left untouched).
 */
static int vgic_its_alloc_collection(struct vgic_its *its,
                                     struct its_collection **colp,
                                     u32 coll_id)
{
        struct its_collection *col = kzalloc(sizeof(*col), GFP_KERNEL_ACCOUNT);

        if (!col)
                return -ENOMEM;

        col->collection_id = coll_id;
        col->target_addr = COLLECTION_NOT_MAPPED;
        list_add_tail(&col->coll_list, &its->collection_list);

        *colp = col;

        return 0;
}

/*
 * Remove the collection with ID @coll_id from the ITS and free it. Every
 * ITE still routed through that collection has its collection pointer
 * cleared first. Does nothing if no such collection exists.
 */
static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
{
        struct its_collection *col;
        struct its_device *dev;
        struct its_ite *entry;

        /*
         * Clearing the mapping for that collection ID removes the
         * entry from the list. If there wasn't any before, we can
         * go home early.
         */
        col = find_collection(its, coll_id);
        if (!col)
                return;

        for_each_lpi_its(dev, entry, its) {
                if (!entry->collection)
                        continue;

                if (entry->collection->collection_id == coll_id)
                        entry->collection = NULL;
        }

        list_del(&col->coll_list);
        kfree(col);
}

/*
 * Allocate an ITE for @event_id routed through @collection and append it
 * to @device's ITT list. The ITE's irq is left unset (zeroed).
 * Returns the new ITE, or ERR_PTR(-ENOMEM) if the allocation fails.
 * Must be called with its_lock mutex held.
 */
static struct its_ite *vgic_its_alloc_ite(struct its_device *device,
                                          struct its_collection *collection,
                                          u32 event_id)
{
        struct its_ite *entry = kzalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT);

        if (!entry)
                return ERR_PTR(-ENOMEM);

        entry->collection = collection;
        entry->event_id = event_id;
        list_add_tail(&entry->ite_list, &device->itt_head);

        return entry;
}

/*
 * The MAPTI and MAPI commands map LPIs to ITTEs.
 * Must be called with its_lock mutex held.
 *
 * Returns 0 on success (and also when the DevID/EventID pair already has a
 * mapping), one of the E_ITS_* codes used below when the command is
 * rejected, or a negative errno propagated from the allocation helpers.
 */
static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
                                    u64 *its_cmd)
{
        u32 device_id = its_cmd_get_deviceid(its_cmd);
        u32 event_id = its_cmd_get_id(its_cmd);
        u32 coll_id = its_cmd_get_collection(its_cmd);
        struct its_ite *ite;
        struct kvm_vcpu *vcpu = NULL;
        struct its_device *device;
        struct its_collection *collection, *new_coll = NULL;
        struct vgic_irq *irq;
        int lpi_nr;

        device = find_its_device(its, device_id);
        if (!device)
                return E_ITS_MAPTI_UNMAPPED_DEVICE;

        if (!vgic_its_check_event_id(its, device, event_id))
                return E_ITS_MAPTI_ID_OOR;

        /* MAPTI carries an explicit LPI number; MAPI reuses the event ID. */
        if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
                lpi_nr = its_cmd_get_physical_id(its_cmd);
        else
                lpi_nr = event_id;
        if (lpi_nr < GIC_LPI_OFFSET ||
            lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
                return E_ITS_MAPTI_PHYSICALID_OOR;

        /* If there is an existing mapping, behavior is UNPREDICTABLE. */
        if (find_ite(its, device_id, event_id))
                return 0;

        collection = find_collection(its, coll_id);
        if (!collection) {
                int ret;

                if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
                        return E_ITS_MAPC_COLLECTION_OOR;

                ret = vgic_its_alloc_collection(its, &collection, coll_id);
                if (ret)
                        return ret;
                /* Remember that we created it, so error paths can undo it. */
                new_coll = collection;
        }

        ite = vgic_its_alloc_ite(device, collection, event_id);
        if (IS_ERR(ite)) {
                if (new_coll)
                        vgic_its_free_collection(its, coll_id);
                return PTR_ERR(ite);
        }

        /* vcpu stays NULL when the collection has no target yet. */
        if (its_is_collection_mapped(collection))
                vcpu = collection_to_vcpu(kvm, collection);

        irq = vgic_add_lpi(kvm, lpi_nr, vcpu);
        if (IS_ERR(irq)) {
                /* Undo both the collection we created (if any) and the ITE. */
                if (new_coll)
                        vgic_its_free_collection(its, coll_id);
                its_free_ite(kvm, ite);
                return PTR_ERR(irq);
        }
        ite->irq = irq;

        return 0;
}

/* Requires the its_lock to be held. */
static void vgic_its_free_device(struct kvm *kvm, struct vgic_its *its,
                                 struct its_device *device)
{
        struct its_ite *ite, *temp;

        /*
         * The spec says that unmapping a device with still valid
         * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
         * since we cannot leave the memory unreferenced.
         */
        list_for_each_entry_safe(ite, temp, &device->itt_head, ite_list)
                its_free_ite(kvm, ite);

        vgic_its_invalidate_cache(its);

        list_del(&device->dev_list);
        kfree(device);
}

/* its lock must be held */
static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its)
{
        struct its_device *cur, *temp;

        list_for_each_entry_safe(cur, temp, &its->device_list, dev_list)
                vgic_its_free_device(kvm, its, cur);
}

/* its lock must be held */
static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its)
{
        struct its_collection *cur, *temp;

        list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list)
                vgic_its_free_collection(its, cur->collection_id);
}

/*
 * Allocate a device entry, record its ID, ITT address and event ID width,
 * and append it to @its' device list. Returns the device or
 * ERR_PTR(-ENOMEM). Must be called with its_lock mutex held.
 */
static struct its_device *vgic_its_alloc_device(struct vgic_its *its,
                                                u32 device_id, gpa_t itt_addr,
                                                u8 num_eventid_bits)
{
        struct its_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&dev->itt_head);
        dev->num_eventid_bits = num_eventid_bits;
        dev->itt_addr = itt_addr;
        dev->device_id = device_id;

        list_add_tail(&dev->dev_list, &its->device_list);

        return dev;
}

/*
 * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
 * Must be called with the its_lock mutex held.
 *
 * Returns 0 or a negative errno from the helpers, or an E_ITS_MAPD_* code
 * when the device ID or the ITT size is out of range.
 */
static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
                                    u64 *its_cmd)
{
        u32 devid = its_cmd_get_deviceid(its_cmd);
        bool valid = its_cmd_get_validbit(its_cmd);
        u8 nr_bits = its_cmd_get_size(its_cmd);
        gpa_t itt = its_cmd_get_ittaddr(its_cmd);
        struct its_device *old, *fresh;
        gpa_t gpa;

        if (!vgic_its_check_id(its, its->baser_device_table, devid, &gpa))
                return E_ITS_MAPD_DEVICE_OOR;

        if (valid && nr_bits > VITS_TYPER_IDBITS)
                return E_ITS_MAPD_ITTSIZE_OOR;

        /*
         * Per the spec, MAPD on an already mapped device invalidates all
         * cached data for it. This is done by dropping the existing
         * mapping first and, for a valid command, creating it anew.
         */
        old = find_its_device(its, devid);
        if (old)
                vgic_its_free_device(kvm, its, old);

        if (valid) {
                fresh = vgic_its_alloc_device(its, devid, itt, nr_bits);
                return PTR_ERR_OR_ZERO(fresh);
        }

        /*
         * The spec does not say whether unmapping a not-mapped device is
         * an error, so clear the table entry regardless.
         */
        return vgic_its_write_entry_lock(its, gpa, 0ULL, dte);
}

/*
 * The MAPC command maps collection IDs to redistributors.
 * Must be called with the its_lock mutex held.
 *
 * With the valid bit clear the collection is removed. Otherwise the
 * collection is created if needed and pointed at the target vcpu.
 */
static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
                                    u64 *its_cmd)
{
        struct its_collection *collection;
        struct kvm_vcpu *vcpu;
        bool valid;
        u16 coll_id;
        int ret;

        valid = its_cmd_get_validbit(its_cmd);
        coll_id = its_cmd_get_collection(its_cmd);

        /* Unmap request: drop the collection and flush the cache. */
        if (!valid) {
                vgic_its_free_collection(its, coll_id);
                vgic_its_invalidate_cache(its);
                return 0;
        }

        vcpu = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
        if (!vcpu)
                return E_ITS_MAPC_PROCNUM_OOR;

        collection = find_collection(its, coll_id);
        if (collection) {
                /* Existing collection: retarget it and update its LPIs. */
                collection->target_addr = vcpu->vcpu_id;
                update_affinity_collection(kvm, its, collection);
                return 0;
        }

        if (!vgic_its_check_id(its, its->baser_coll_table, coll_id, NULL))
                return E_ITS_MAPC_COLLECTION_OOR;

        ret = vgic_its_alloc_collection(its, &collection, coll_id);
        if (ret)
                return ret;

        collection->target_addr = vcpu->vcpu_id;

        return 0;
}

/*
 * The CLEAR command removes the pending state for a particular LPI.
 * Must be called with the its_lock mutex held.
 */
static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
                                     u64 *its_cmd)
{
        u32 devid = its_cmd_get_deviceid(its_cmd);
        u32 evid = its_cmd_get_id(its_cmd);
        struct its_ite *ite = find_ite(its, devid, evid);
        struct vgic_irq *irq;

        if (!ite)
                return E_ITS_CLEAR_UNMAPPED_INTERRUPT;

        irq = ite->irq;
        irq->pending_latch = false;

        /* Nothing more to do unless the interrupt is marked as hw. */
        if (!irq->hw)
                return 0;

        return irq_set_irqchip_state(irq->host_irq,
                                     IRQCHIP_STATE_PENDING, false);
}

/*
 * Refresh the configuration of a single LPI by delegating to
 * update_lpi_config() with no vcpu argument and the last flag set.
 */
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq)
{
        int ret;

        ret = update_lpi_config(kvm, irq, NULL, true);

        return ret;
}

/*
 * The INV command syncs the configuration bits from the memory table.
 * Must be called with the its_lock mutex held.
 */
static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
                                   u64 *its_cmd)
{
        u32 devid = its_cmd_get_deviceid(its_cmd);
        u32 evid = its_cmd_get_id(its_cmd);
        struct its_ite *ite = find_ite(its, devid, evid);

        if (ite)
                return vgic_its_inv_lpi(kvm, ite->irq);

        return E_ITS_INV_UNMAPPED_INTERRUPT;
}

/**
 * vgic_its_invall - invalidate all LPIs targeting a given vcpu
 * @vcpu: the vcpu for which the RD is targeted by an invalidation
 *
 * Unlike the INVALL command this works on a RD rather than on a
 * collection. No ITS is involved, so the its_lock is not required.
 *
 * Return: always 0.
 */
int vgic_its_invall(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *irq;
        unsigned long intid;

        xa_for_each(&dist->lpi_xa, intid, irq) {
                /* Take a reference; skip entries that cannot be looked up. */
                irq = vgic_get_irq(kvm, intid);
                if (irq) {
                        update_lpi_config(kvm, irq, vcpu, false);
                        vgic_put_irq(kvm, irq);
                }
        }

        /* Also invalidate the vPE when this vcpu has an its_vm attached. */
        if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm)
                its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe);

        return 0;
}

/*
 * The INVALL command requests flushing of all IRQ data in this collection.
 * Look up the VCPU the collection is mapped to and let vgic_its_invall()
 * refresh the configuration of the LPIs targeting it from the in-memory
 * configuration table.
 * Must be called with the its_lock mutex held.
 */
static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
                                      u64 *its_cmd)
{
        struct its_collection *coll;

        coll = find_collection(its, its_cmd_get_collection(its_cmd));
        if (!its_is_collection_mapped(coll))
                return E_ITS_INVALL_UNMAPPED_COLLECTION;

        vgic_its_invall(collection_to_vcpu(kvm, coll));

        return 0;
}

/*
 * The MOVALL command moves the pending state of all IRQs targeting one
 * redistributor to another. Pending state lives in the IRQs rather than in
 * the VCPUs, so there is little to do here. The spec still requires that no
 * IRQ targets the old redistributor afterwards, so the affinity of the LPIs
 * is updated to the second vcpu.
 * This command affects all LPIs in the system that target that redistributor.
 */
static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
                                      u64 *its_cmd)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *from, *to;
        struct vgic_irq *irq;
        unsigned long intid;

        /* We advertise GITS_TYPER.PTA==0, making the address the vcpu ID */
        from = kvm_get_vcpu_by_id(kvm, its_cmd_get_target_addr(its_cmd));
        to = kvm_get_vcpu_by_id(kvm, its_cmd_mask_field(its_cmd, 3, 16, 32));

        if (!(from && to))
                return E_ITS_MOVALL_PROCNUM_OOR;

        /* Moving onto the same vcpu requires no work at all. */
        if (from != to) {
                xa_for_each(&dist->lpi_xa, intid, irq) {
                        irq = vgic_get_irq(kvm, intid);
                        if (irq) {
                                update_affinity(irq, to);
                                vgic_put_irq(kvm, irq);
                        }
                }

                vgic_its_invalidate_cache(its);
        }

        return 0;
}

/*
 * The INT command injects the LPI associated with that DevID/EvID pair.
 * Must be called with the its_lock mutex held.
 */
static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
                                   u64 *its_cmd)
{
        u32 evid = its_cmd_get_id(its_cmd);
        u64 devid = its_cmd_get_deviceid(its_cmd);
        int ret;

        ret = vgic_its_trigger_msi(kvm, its, devid, evid);

        return ret;
}

/*
 * Dispatch one ITS command to its handler.
 *
 * This function is called with the its_cmd lock held, but the ITS data
 * structure lock dropped. The its_lock is taken here around the handler.
 * Commands that are not recognised yield -ENODEV.
 */
static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
                                   u64 *its_cmd)
{
        int ret;

        mutex_lock(&its->its_lock);
        switch (its_cmd_get_command(its_cmd)) {
        case GITS_CMD_MAPD:
                ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
                break;
        case GITS_CMD_MAPC:
                ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
                break;
        case GITS_CMD_MAPI:
        case GITS_CMD_MAPTI:
                /* Both mapping flavours share one handler. */
                ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
                break;
        case GITS_CMD_MOVI:
                ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
                break;
        case GITS_CMD_DISCARD:
                ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
                break;
        case GITS_CMD_CLEAR:
                ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
                break;
        case GITS_CMD_MOVALL:
                ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
                break;
        case GITS_CMD_INT:
                ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
                break;
        case GITS_CMD_INV:
                ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
                break;
        case GITS_CMD_INVALL:
                ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
                break;
        case GITS_CMD_SYNC:
                /* we ignore this command: we are in sync all of the time */
                ret = 0;
                break;
        default:
                ret = -ENODEV;
                break;
        }
        mutex_unlock(&its->its_lock);

        return ret;
}

/*
 * Sanitise the shareability and cacheability fields of a GITS_BASER value
 * and force the page size field to 64K.
 */
static u64 vgic_sanitise_its_baser(u64 reg)
{
        u64 val = reg;

        val = vgic_sanitise_field(val, GITS_BASER_SHAREABILITY_MASK,
                                  GITS_BASER_SHAREABILITY_SHIFT,
                                  vgic_sanitise_shareability);
        val = vgic_sanitise_field(val, GITS_BASER_INNER_CACHEABILITY_MASK,
                                  GITS_BASER_INNER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_inner_cacheability);
        val = vgic_sanitise_field(val, GITS_BASER_OUTER_CACHEABILITY_MASK,
                                  GITS_BASER_OUTER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_outer_cacheability);

        /* We support only one (ITS) page size: 64K */
        val &= ~GITS_BASER_PAGE_SIZE_MASK;
        val |= GITS_BASER_PAGE_SIZE_64K;

        return val;
}

/*
 * Sanitise the shareability and cacheability fields of a GITS_CBASER value
 * and clear bits [15:12] of it.
 */
static u64 vgic_sanitise_its_cbaser(u64 reg)
{
        u64 val = reg;

        val = vgic_sanitise_field(val, GITS_CBASER_SHAREABILITY_MASK,
                                  GITS_CBASER_SHAREABILITY_SHIFT,
                                  vgic_sanitise_shareability);
        val = vgic_sanitise_field(val, GITS_CBASER_INNER_CACHEABILITY_MASK,
                                  GITS_CBASER_INNER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_inner_cacheability);
        val = vgic_sanitise_field(val, GITS_CBASER_OUTER_CACHEABILITY_MASK,
                                  GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_outer_cacheability);

        /* Sanitise the physical address to be 64k aligned. */
        return val & ~GENMASK_ULL(15, 12);
}

/* Return the @len bytes of CBASER selected by the low three bits of @addr. */
static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
                                               struct vgic_its *its,
                                               gpa_t addr, unsigned int len)
{
        unsigned long val;

        val = extract_bytes(its->cbaser, addr & 7, len);

        return val;
}

/*
 * Guest write to CBASER: merge the written bytes into the register,
 * sanitise it and reset both ring pointers. Ignored while the ITS is
 * enabled.
 */
static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
                                       gpa_t addr, unsigned int len,
                                       unsigned long val)
{
        u64 reg;

        /* When GITS_CTLR.Enable is 1, this register is RO. */
        if (its->enabled)
                return;

        mutex_lock(&its->cmd_lock);

        reg = update_64bit_reg(its->cbaser, addr & 7, len, val);
        its->cbaser = vgic_sanitise_its_cbaser(reg);

        /*
         * CWRITER is architecturally UNKNOWN on reset, but it must be reset
         * to CREADR so that we start with an empty command buffer.
         */
        its->creadr = 0;
        its->cwriter = its->creadr;

        mutex_unlock(&its->cmd_lock);
}

/*
 * Size in bytes of the command ring described by a CBASER value: the low
 * eight bits plus one, shifted left by 12 (i.e. counted in 4K units).
 */
#define ITS_CMD_BUFFER_SIZE(baser)        ((((baser) & 0xff) + 1) << 12)
/* Number of bytes read from the ring and consumed per command. */
#define ITS_CMD_SIZE                        32
/* Keep only bits [19:5] of a CREADR/CWRITER value as the ring offset. */
#define ITS_CMD_OFFSET(reg)                ((reg) & GENMASK(19, 5))

/*
 * Consume the commands between CREADR and CWRITER, one ITS_CMD_SIZE slot
 * at a time, wrapping CREADR at the end of the ring.
 * Must be called with the cmd_lock held.
 */
static void vgic_its_process_commands(struct kvm *kvm, struct vgic_its *its)
{
        gpa_t ring_base;
        u64 cmd[4];

        /* Commands are only processed when the ITS is enabled. */
        if (!its->enabled)
                return;

        ring_base = GITS_CBASER_ADDRESS(its->cbaser);

        while (its->creadr != its->cwriter) {
                int err;

                /*
                 * A failed guest read may stem from a bogus CBASER or from
                 * some other problem we cannot easily recover from.
                 * Section 6.3.2 of the GICv3 spec allows us to simply
                 * ignore the command in that case.
                 */
                err = kvm_read_guest_lock(kvm, ring_base + its->creadr,
                                          cmd, ITS_CMD_SIZE);
                if (err == 0)
                        vgic_its_handle_command(kvm, its, cmd);

                /* Advance to the next slot, wrapping at the ring's end. */
                its->creadr += ITS_CMD_SIZE;
                if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
                        its->creadr = 0;
        }
}

/*
 * By writing to CWRITER the guest announces new commands to be processed.
 * The its_cmd lock protects the ring buffer variables, so only one user
 * per ITS handles commands at any given time and races are avoided.
 * A write that lands outside the command buffer is dropped.
 */
static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
                                        gpa_t addr, unsigned int len,
                                        unsigned long val)
{
        u64 offset;

        if (!its)
                return;

        mutex_lock(&its->cmd_lock);

        offset = update_64bit_reg(its->cwriter, addr & 7, len, val);
        offset = ITS_CMD_OFFSET(offset);

        if (offset < ITS_CMD_BUFFER_SIZE(its->cbaser)) {
                its->cwriter = offset;
                vgic_its_process_commands(kvm, its);
        }

        mutex_unlock(&its->cmd_lock);
}

/* Return the @len bytes of CWRITER selected by the low three bits of @addr. */
static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
                                                struct vgic_its *its,
                                                gpa_t addr, unsigned int len)
{
        unsigned long val;

        val = extract_bytes(its->cwriter, addr & 0x7, len);

        return val;
}

/* Return the @len bytes of CREADR selected by the low three bits of @addr. */
static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
                                               struct vgic_its *its,
                                               gpa_t addr, unsigned int len)
{
        unsigned long val;

        val = extract_bytes(its->creadr, addr & 0x7, len);

        return val;
}

/*
 * Userspace write to CREADR. Fails with -EBUSY while the ITS is enabled and
 * with -EINVAL when the offset lies outside the command buffer; otherwise
 * the new read offset is stored and 0 is returned.
 */
static int vgic_mmio_uaccess_write_its_creadr(struct kvm *kvm,
                                              struct vgic_its *its,
                                              gpa_t addr, unsigned int len,
                                              unsigned long val)
{
        int ret;

        mutex_lock(&its->cmd_lock);

        if (its->enabled) {
                ret = -EBUSY;
        } else {
                u32 cmd_offset = ITS_CMD_OFFSET(val);

                if (cmd_offset >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
                        ret = -EINVAL;
                } else {
                        its->creadr = cmd_offset;
                        ret = 0;
                }
        }

        mutex_unlock(&its->cmd_lock);

        return ret;
}

#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
/*
 * Read one of the GITS_BASER registers. Index 0 is the device table and
 * index 1 the collection table; every other index reads as zero.
 */
static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
                                              struct vgic_its *its,
                                              gpa_t addr, unsigned int len)
{
        unsigned int idx = BASER_INDEX(addr);
        u64 reg = 0;

        if (idx == 0)
                reg = its->baser_device_table;
        else if (idx == 1)
                reg = its->baser_coll_table;

        return extract_bytes(reg, addr & 7, len);
}

/* Bits of GITS_BASER that a write is never allowed to change. */
#define GITS_BASER_RO_MASK        (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
/*
 * Guest write to one of the GITS_BASER registers. Only index 0 (device
 * table) and index 1 (collection table) are backed; writes to any other
 * index, or while the ITS is enabled, are dropped. The entry size and table
 * type fields are always re-imposed from the ABI, and clearing the valid
 * bit frees the corresponding device or collection list.
 */
static void vgic_mmio_write_its_baser(struct kvm *kvm,
                                      struct vgic_its *its,
                                      gpa_t addr, unsigned int len,
                                      unsigned long val)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 entry_size, table_type;
        u64 reg, *regptr, clearbits = 0;

        /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
        if (its->enabled)
                return;

        switch (BASER_INDEX(addr)) {
        case 0:
                regptr = &its->baser_device_table;
                entry_size = abi->dte_esz;
                table_type = GITS_BASER_TYPE_DEVICE;
                break;
        case 1:
                regptr = &its->baser_coll_table;
                entry_size = abi->cte_esz;
                table_type = GITS_BASER_TYPE_COLLECTION;
                /* The indirect bit is always cleared for the collection table. */
                clearbits = GITS_BASER_INDIRECT;
                break;
        default:
                return;
        }

        /* Merge the written bytes, then strip what the guest may not set. */
        reg = update_64bit_reg(*regptr, addr & 7, len, val);
        reg &= ~GITS_BASER_RO_MASK;
        reg &= ~clearbits;

        /* Re-impose the fixed entry size and table type fields. */
        reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
        reg |= table_type << GITS_BASER_TYPE_SHIFT;
        reg = vgic_sanitise_its_baser(reg);

        *regptr = reg;

        /* An invalid table means its in-kernel entries must go away. */
        if (!(reg & GITS_BASER_VALID)) {
                /* Take the its_lock to prevent a race with a save/restore */
                mutex_lock(&its->its_lock);
                switch (table_type) {
                case GITS_BASER_TYPE_DEVICE:
                        vgic_its_free_device_list(kvm, its);
                        break;
                case GITS_BASER_TYPE_COLLECTION:
                        vgic_its_free_collection_list(kvm, its);
                        break;
                }
                mutex_unlock(&its->its_lock);
        }
}

/*
 * Build the GITS_CTLR value: Enable mirrors its->enabled, and Quiescent is
 * set when the command ring is empty (CREADR equals CWRITER).
 */
static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
                                             struct vgic_its *its,
                                             gpa_t addr, unsigned int len)
{
        u32 ctlr = 0;

        mutex_lock(&its->cmd_lock);

        if (its->enabled)
                ctlr |= GITS_CTLR_ENABLE;
        if (its->cwriter == its->creadr)
                ctlr |= GITS_CTLR_QUIESCENT;

        mutex_unlock(&its->cmd_lock);

        return ctlr;
}

/*
 * Guest write to GITS_CTLR: update the enable state, flush the translation
 * cache on disable and kick command processing.
 */
static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
                                     gpa_t addr, unsigned int len,
                                     unsigned long val)
{
        bool enable = !!(val & GITS_CTLR_ENABLE);

        mutex_lock(&its->cmd_lock);

        /*
         * It is UNPREDICTABLE to enable the ITS if any of the CBASER or
         * device/collection BASER are invalid, so such a request is
         * ignored.
         */
        if (!its->enabled && enable) {
                bool ready = (its->baser_device_table & GITS_BASER_VALID) &&
                             (its->baser_coll_table & GITS_BASER_VALID) &&
                             (its->cbaser & GITS_CBASER_VALID);

                if (!ready)
                        goto out;
        }

        its->enabled = enable;
        if (!its->enabled)
                vgic_its_invalidate_cache(its);

        /*
         * Try to process any pending commands. This function bails out early
         * if the ITS is disabled or no commands have been queued.
         */
        vgic_its_process_commands(kvm, its);

out:
        mutex_unlock(&its->cmd_lock);
}

/*
 * Initialiser for one its_registers[] entry: register offset, length,
 * allowed access widths and the ITS read/write handlers.
 */
#define REGISTER_ITS_DESC(off, rd, wr, length, acc)                \
{                                                                \
        .reg_offset = off,                                        \
        .len = length,                                                \
        .access_flags = acc,                                        \
        .its_read = rd,                                                \
        .its_write = wr,                                        \
}

/*
 * Same as REGISTER_ITS_DESC, plus a separate write handler (uwr) stored in
 * .uaccess_its_write.
 */
#define REGISTER_ITS_DESC_UACCESS(off, rd, wr, uwr, length, acc)\
{                                                                \
        .reg_offset = off,                                        \
        .len = length,                                                \
        .access_flags = acc,                                        \
        .its_read = rd,                                                \
        .its_write = wr,                                        \
        .uaccess_its_write = uwr,                                \
}

/*
 * Write handler that discards the written value. It is used in
 * its_registers[] as the .its_write handler of registers whose writes have
 * no effect through that path.
 */
static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
                              gpa_t addr, unsigned int len, unsigned long val)
{
        /* Ignore */
}

/*
 * Register map of the emulated ITS frame: one entry per register (or
 * register range) giving its offset, length, permitted access widths and
 * handlers. GITS_IIDR and GITS_CREADR ignore writes through .its_write and
 * provide a separate .uaccess_its_write handler instead.
 */
static struct vgic_register_region its_registers[] = {
        REGISTER_ITS_DESC(GITS_CTLR,
                vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
                VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC_UACCESS(GITS_IIDR,
                vgic_mmio_read_its_iidr, its_mmio_write_wi,
                vgic_mmio_uaccess_write_its_iidr, 4,
                VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC(GITS_TYPER,
                vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC(GITS_CBASER,
                vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC(GITS_CWRITER,
                vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC_UACCESS(GITS_CREADR,
                vgic_mmio_read_its_creadr, its_mmio_write_wi,
                vgic_mmio_uaccess_write_its_creadr, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC(GITS_BASER,
                vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_ITS_DESC(GITS_IDREGS_BASE,
                vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
                VGIC_ACCESS_32bit),
};

/*
 * This is called on setting the LPI enable bit in the redistributor.
 * The pending table is synced unless PENDBASER has the PTZ bit set.
 */
void vgic_enable_lpis(struct kvm_vcpu *vcpu)
{
        if (vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ)
                return;

        its_sync_lpi_pending_table(vcpu);
}

/*
 * Record @addr as the ITS base and register the ITS register frame on the
 * MMIO bus. Returns -EBUSY if a base address has already been set,
 * otherwise the result of kvm_io_bus_register_dev().
 */
static int vgic_register_its_iodev(struct kvm *kvm, struct vgic_its *its,
                                   u64 addr)
{
        struct vgic_io_device *iodev = &its->iodev;
        int ret = -EBUSY;

        mutex_lock(&kvm->slots_lock);

        if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
                its->vgic_its_base = addr;

                iodev->regions = its_registers;
                iodev->nr_regions = ARRAY_SIZE(its_registers);
                kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);

                iodev->base_addr = its->vgic_its_base;
                iodev->iodev_type = IODEV_ITS;
                iodev->its = its;

                ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
                                              iodev->base_addr,
                                              KVM_VGIC_V3_ITS_SIZE,
                                              &iodev->dev);
        }

        mutex_unlock(&kvm->slots_lock);

        return ret;
}

/*
 * Attribute bits used for a freshly created GITS_BASER: inner RaWb
 * cacheability, outer same-as-inner, inner shareable and 64K pages.
 */
#define INITIAL_BASER_VALUE                                                  \
        (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb)                | \
         GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner)                | \
         GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)                | \
         GITS_BASER_PAGE_SIZE_64K)

/* Matching cacheability/shareability attribute bits for GICR_PROPBASER. */
#define INITIAL_PROPBASER_VALUE                                                  \
        (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb)                | \
         GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner)        | \
         GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))

/*
 * Create callback for the ITS device: allocate and initialise a struct
 * vgic_its and attach it to @dev->private.
 *
 * Returns -ENODEV for any other device type, -ENOMEM on allocation failure,
 * a negative value from vgic_v4_init(), or the result of vgic_its_set_abi().
 */
static int vgic_its_create(struct kvm_device *dev, u32 type)
{
        int ret;
        struct vgic_its *its;

        if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
                return -ENODEV;

        its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL_ACCOUNT);
        if (!its)
                return -ENOMEM;

        mutex_lock(&dev->kvm->arch.config_lock);

        /* Only call vgic_v4_init() when the vgic is already initialised. */
        if (vgic_initialized(dev->kvm)) {
                ret = vgic_v4_init(dev->kvm);
                if (ret < 0) {
                        mutex_unlock(&dev->kvm->arch.config_lock);
                        kfree(its);
                        return ret;
                }
        }

        mutex_init(&its->its_lock);
        mutex_init(&its->cmd_lock);

        /* Yep, even more trickery for lock ordering... */
#ifdef CONFIG_LOCKDEP
        /* Take cmd_lock, then its_lock, once so lockdep sees that order. */
        mutex_lock(&its->cmd_lock);
        mutex_lock(&its->its_lock);
        mutex_unlock(&its->its_lock);
        mutex_unlock(&its->cmd_lock);
#endif

        its->vgic_its_base = VGIC_ADDR_UNDEF;

        INIT_LIST_HEAD(&its->device_list);
        INIT_LIST_HEAD(&its->collection_list);
        xa_init(&its->translation_cache);

        dev->kvm->arch.vgic.msis_require_devid = true;
        dev->kvm->arch.vgic.has_its = true;
        its->enabled = false;
        its->dev = dev;

        /* Both BASERs start from the common attributes plus their table type. */
        its->baser_device_table = INITIAL_BASER_VALUE                        |
                ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
        its->baser_coll_table = INITIAL_BASER_VALUE |
                ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
        dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;

        dev->private = its;

        /* Select the ABI with the highest index. */
        ret = vgic_its_set_abi(its, NR_ITS_ABIS - 1);

        mutex_unlock(&dev->kvm->arch.config_lock);

        return ret;
}

/*
 * Destroy callback for the ITS device: under its_lock, free all devices and
 * collections and tear down the translation cache, then free the ITS state
 * and the kvm_device itself.
 */
static void vgic_its_destroy(struct kvm_device *kvm_dev)
{
        struct kvm *kvm = kvm_dev->kvm;
        struct vgic_its *its = kvm_dev->private;

        mutex_lock(&its->its_lock);

        vgic_its_debug_destroy(kvm_dev);

        vgic_its_free_device_list(kvm, its);
        vgic_its_free_collection_list(kvm, its);
        /* Invalidate the cache before destroying the xarray that backs it. */
        vgic_its_invalidate_cache(its);
        xa_destroy(&its->translation_cache);

        mutex_unlock(&its->its_lock);
        kfree(its);
        kfree(kvm_dev);/* alloc by kvm_ioctl_create_device, free by .destroy */
}

/*
 * Check whether @attr names an accessible ITS register.
 *
 * Offsets below GITS_TYPER or at/above GITS_PIDR4 must be 4-byte aligned,
 * all others 8-byte aligned. Returns -EINVAL on misalignment, -ENXIO when
 * no register region covers the offset, and 0 otherwise.
 */
static int vgic_its_has_attr_regs(struct kvm_device *dev,
                                  struct kvm_device_attr *attr)
{
        gpa_t offset = attr->attr;
        int align = 0x7;

        if (offset < GITS_TYPER || offset >= GITS_PIDR4)
                align = 0x3;

        if (offset & align)
                return -EINVAL;

        if (!vgic_find_mmio_region(its_registers,
                                   ARRAY_SIZE(its_registers),
                                   offset))
                return -ENXIO;

        return 0;
}

/*
 * vgic_its_attr_regs_access - userspace read or write of an ITS register
 *
 * @dev: ITS KVM device
 * @attr: device attribute, attr->attr holds the register offset
 * @reg: value to write, or location receiving the value read
 * @is_write: true for a write, false for a read
 *
 * The access is performed with kvm->lock, all vcpus and the config_lock
 * held.
 *
 * Return: 0 on success, -EINVAL on a misaligned offset, -EBUSY if the vcpus
 * could not be locked, -ENXIO if the ITS base is undefined or no register
 * region matches, or the value returned by the region's uaccess write
 * handler.
 */
static int vgic_its_attr_regs_access(struct kvm_device *dev,
                                     struct kvm_device_attr *attr,
                                     u64 *reg, bool is_write)
{
        struct vgic_its *its = dev->private;
        struct kvm *kvm = dev->kvm;
        const struct vgic_register_region *region;
        gpa_t offset = attr->attr;
        unsigned int len;
        gpa_t addr;
        int align;
        int ret = 0;

        /*
         * Although the spec supports upper/lower 32-bit accesses to
         * 64-bit ITS registers, the userspace ABI requires 64-bit
         * accesses to all 64-bit wide registers. We therefore only
         * support 32-bit accesses to GITS_CTLR, GITS_IIDR and GITS ID
         * registers
         */
        align = (offset >= GITS_TYPER && offset < GITS_PIDR4) ? 0x7 : 0x3;
        if (offset & align)
                return -EINVAL;

        mutex_lock(&kvm->lock);

        if (!lock_all_vcpus(kvm)) {
                mutex_unlock(&kvm->lock);
                return -EBUSY;
        }

        mutex_lock(&kvm->arch.config_lock);

        if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base)) {
                ret = -ENXIO;
                goto out;
        }

        region = vgic_find_mmio_region(its_registers,
                                       ARRAY_SIZE(its_registers),
                                       offset);
        if (!region) {
                ret = -ENXIO;
                goto out;
        }

        addr = its->vgic_its_base + offset;
        len = (region->access_flags & VGIC_ACCESS_64bit) ? 8 : 4;

        if (!is_write)
                *reg = region->its_read(kvm, its, addr, len);
        else if (region->uaccess_its_write)
                ret = region->uaccess_its_write(kvm, its, addr, len, *reg);
        else
                region->its_write(kvm, its, addr, len, *reg);

out:
        mutex_unlock(&kvm->arch.config_lock);
        unlock_all_vcpus(kvm);
        mutex_unlock(&kvm->lock);
        return ret;
}

/*
 * compute_next_devid_offset - device ID distance to the next device in a list
 *
 * @h: head of the device list
 * @dev: current device
 *
 * Return: 0 if @dev is the last entry of @h, otherwise the difference
 * between the next device's ID and @dev's ID, capped at
 * VITS_DTE_MAX_DEVID_OFFSET.
 */
static u32 compute_next_devid_offset(struct list_head *h,
                                     struct its_device *dev)
{
        u32 delta = 0;

        if (!list_is_last(&dev->dev_list, h)) {
                struct its_device *succ = list_next_entry(dev, dev_list);

                delta = min_t(u32, succ->device_id - dev->device_id,
                              VITS_DTE_MAX_DEVID_OFFSET);
        }

        return delta;
}

/*
 * compute_next_eventid_offset - event ID distance to the next ITE in a list
 *
 * @h: head of the ITE list
 * @ite: current translation entry
 *
 * Return: 0 if @ite is the last entry of @h, otherwise the difference
 * between the next entry's event ID and @ite's event ID, capped at
 * VITS_ITE_MAX_EVENTID_OFFSET.
 */
static u32 compute_next_eventid_offset(struct list_head *h, struct its_ite *ite)
{
        u32 delta = 0;

        if (!list_is_last(&ite->ite_list, h)) {
                struct its_ite *succ = list_next_entry(ite, ite_list);

                delta = min_t(u32, succ->event_id - ite->event_id,
                              VITS_ITE_MAX_EVENTID_OFFSET);
        }

        return delta;
}

/**
 * typedef entry_fn_t - Callback called on a table entry restore path
 * @its: its handle
 * @id: id of the entry
 * @entry: pointer to the entry
 * @opaque: pointer to an opaque data
 *
 * Invoked by scan_its_table() once per entry it reads from guest memory;
 * a positive return value is the number of entries to skip forward before
 * the next invocation.
 *
 * Return: < 0 on error, 0 if last element was identified, id offset to next
 * element otherwise
 */
typedef int (*entry_fn_t)(struct vgic_its *its, u32 id, void *entry,
                          void *opaque);

/**
 * scan_its_table - Walk a contiguous table in guest RAM, calling a function
 * on each visited entry
 *
 * @its: its handle
 * @base: base gpa of the table
 * @size: size of the table in bytes
 * @esz: entry size in bytes
 * @start_id: the ID of the first entry in the table
 * (non zero for 2nd level tables)
 * @fn: function to apply on each entry
 * @opaque: pointer to opaque data
 *
 * Each entry is read from guest memory and handed to @fn. A positive return
 * value from @fn is the number of entries to advance by; the walk stops once
 * that step would reach or pass the end of the table.
 *
 * Return: < 0 on error, 0 if last element was identified, 1 otherwise
 * (the last element may not be found on second level tables)
 */
static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz,
                          int start_id, entry_fn_t fn, void *opaque)
{
        struct kvm *kvm = its->dev->kvm;
        unsigned long remaining = size;
        char buf[ESZ_MAX];
        gpa_t cur_gpa = base;
        int cur_id = start_id;

        memset(buf, 0, esz);

        for (;;) {
                size_t stride;
                int step;
                int err;

                err = kvm_read_guest_lock(kvm, cur_gpa, buf, esz);
                if (err)
                        return err;

                /* <= 0 means error or last element: hand it to the caller */
                step = fn(its, cur_id, buf, opaque);
                if (step <= 0)
                        return step;

                stride = step * esz;
                if (stride >= remaining)
                        break;

                cur_id += step;
                cur_gpa += stride;
                remaining -= stride;
        }

        return 1;
}

/*
 * vgic_its_save_ite - Save an interrupt translation entry at @gpa
 *
 * The entry packs the event ID offset to the next ITE, the interrupt ID
 * and the collection ID, and is written in little-endian format.
 *
 * Return: the result of the guest memory write.
 */
static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev,
                              struct its_ite *ite, gpa_t gpa)
{
        u64 entry;

        entry = ite->collection->collection_id;
        entry |= (u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT;
        entry |= (u64)compute_next_eventid_offset(&dev->itt_head, ite) <<
                 KVM_ITS_ITE_NEXT_SHIFT;
        entry = cpu_to_le64(entry);

        return vgic_its_write_entry_lock(its, gpa, entry, ite);
}

/**
 * vgic_its_restore_ite - restore an interrupt translation entry
 *
 * @its: its handle
 * @event_id: id used for indexing
 * @ptr: pointer to the ITE entry
 * @opaque: pointer to the its_device
 *
 * Return: < 0 on error, 1 if the entry is unused (interrupt ID of zero),
 * otherwise the event ID offset to the next entry as stored in the ITE
 * (0 when this is the last one).
 */
static int vgic_its_restore_ite(struct vgic_its *its, u32 event_id,
                                void *ptr, void *opaque)
{
        struct its_device *dev = opaque;
        struct kvm *kvm = its->dev->kvm;
        struct kvm_vcpu *target = NULL;
        struct its_collection *coll;
        struct its_ite *new_ite;
        struct vgic_irq *lpi;
        u32 icid, pintid, next;
        u64 raw;

        raw = le64_to_cpu(*(u64 *)ptr);
        icid = raw & KVM_ITS_ITE_ICID_MASK;
        pintid = (raw & KVM_ITS_ITE_PINTID_MASK) >> KVM_ITS_ITE_PINTID_SHIFT;

        /* invalid entry, no choice but to scan next entry */
        if (!pintid)
                return 1;

        if (pintid < VGIC_MIN_LPI)
                return -EINVAL;

        /* The next entry must still fit in the device's event ID space */
        next = raw >> KVM_ITS_ITE_NEXT_SHIFT;
        if (event_id + next >= BIT_ULL(dev->num_eventid_bits))
                return -EINVAL;

        coll = find_collection(its, icid);
        if (!coll)
                return -EINVAL;

        if (!vgic_its_check_event_id(its, dev, event_id))
                return -EINVAL;

        new_ite = vgic_its_alloc_ite(dev, coll, event_id);
        if (IS_ERR(new_ite))
                return PTR_ERR(new_ite);

        /* Only a mapped collection provides a target vcpu */
        if (its_is_collection_mapped(coll))
                target = kvm_get_vcpu_by_id(kvm, coll->target_addr);

        lpi = vgic_add_lpi(kvm, pintid, target);
        if (IS_ERR(lpi)) {
                its_free_ite(kvm, new_ite);
                return PTR_ERR(lpi);
        }
        new_ite->irq = lpi;

        return next;
}

/*
 * vgic_its_ite_cmp - list_sort() comparator ordering ITEs by event ID
 *
 * Return: -1 if @a's event ID is lower than @b's, 1 otherwise (0 is never
 * returned).
 */
static int vgic_its_ite_cmp(void *priv, const struct list_head *a,
                            const struct list_head *b)
{
        struct its_ite *lhs = container_of(a, struct its_ite, ite_list);
        struct its_ite *rhs = container_of(b, struct its_ite, ite_list);

        return (lhs->event_id < rhs->event_id) ? -1 : 1;
}

static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        gpa_t base = device->itt_addr;
        struct its_ite *ite;
        int ret;
        int ite_esz = abi->ite_esz;

        list_sort(NULL, &device->itt_head, vgic_its_ite_cmp);

        list_for_each_entry(ite, &device->itt_head, ite_list) {
                gpa_t gpa = base + ite->event_id * ite_esz;

                /*
                 * If an LPI carries the HW bit, this means that this
                 * interrupt is controlled by GICv4, and we do not
                 * have direct access to that state without GICv4.1.
                 * Let's simply fail the save operation...
                 */
                if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1)
                        return -EACCES;

                ret = vgic_its_save_ite(its, device, ite, gpa);
                if (ret)
                        return ret;
        }
        return 0;
}

/**
 * vgic_its_restore_itt - restore the ITT of a device
 *
 * @its: its handle
 * @dev: device handle
 *
 * Scans 2^num_eventid_bits entries worth of guest memory starting at the
 * device's ITT address.
 *
 * Return 0 on success, < 0 on error
 */
static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        int esz = abi->ite_esz;
        size_t table_bytes = BIT_ULL(dev->num_eventid_bits) * esz;
        int rc;

        rc = scan_its_table(its, dev->itt_addr, table_bytes, esz, 0,
                            vgic_its_restore_ite, dev);

        /* scan_its_table returns +1 if all ITEs are invalid */
        return rc > 0 ? 0 : rc;
}

/**
 * vgic_its_save_dte - Save a device table entry at a given GPA
 *
 * @its: ITS handle
 * @dev: ITS device
 * @ptr: GPA
 *
 * The entry packs the valid bit, the device ID offset to the next device,
 * the ITT address shifted right by 8 and the number of event ID bits
 * minus one, and is written in little-endian format.
 *
 * Return: the result of the guest memory write.
 */
static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev,
                             gpa_t ptr)
{
        u64 itt_field = dev->itt_addr >> 8;
        u32 next = compute_next_devid_offset(&its->device_list, dev);
        u64 entry;

        entry = 1ULL << KVM_ITS_DTE_VALID_SHIFT;
        entry |= (u64)next << KVM_ITS_DTE_NEXT_SHIFT;
        entry |= itt_field << KVM_ITS_DTE_ITTADDR_SHIFT;
        entry |= dev->num_eventid_bits - 1;
        entry = cpu_to_le64(entry);

        return vgic_its_write_entry_lock(its, ptr, entry, dte);
}

/**
 * vgic_its_restore_dte - restore a device table entry
 *
 * @its: its handle
 * @id: device id the DTE corresponds to
 * @ptr: kernel VA where the 8 byte DTE is located
 * @opaque: unused
 *
 * A valid entry leads to the allocation of the device and the restoration
 * of its ITT; the device is freed again if the ITT cannot be restored.
 *
 * Return: < 0 on error, 0 if the dte is the last one, id offset to the
 * next dte otherwise
 */
static int vgic_its_restore_dte(struct vgic_its *its, u32 id,
                                void *ptr, void *opaque)
{
        u64 raw = le64_to_cpu(*(u64 *)ptr);
        struct its_device *new_dev;
        u8 nr_eventid_bits;
        gpa_t itt_gpa;
        bool is_valid;
        u32 next;
        int err;

        /* An invalid entry only tells us to move on to the next one */
        is_valid = raw >> KVM_ITS_DTE_VALID_SHIFT;
        if (!is_valid)
                return 1;

        /* dte entry is valid: decode its fields */
        nr_eventid_bits = (raw & KVM_ITS_DTE_SIZE_MASK) + 1;
        itt_gpa = ((raw & KVM_ITS_DTE_ITTADDR_MASK)
                        >> KVM_ITS_DTE_ITTADDR_SHIFT) << 8;
        next = (raw & KVM_ITS_DTE_NEXT_MASK) >> KVM_ITS_DTE_NEXT_SHIFT;

        if (!vgic_its_check_id(its, its->baser_device_table, id, NULL))
                return -EINVAL;

        new_dev = vgic_its_alloc_device(its, id, itt_gpa, nr_eventid_bits);
        if (IS_ERR(new_dev))
                return PTR_ERR(new_dev);

        err = vgic_its_restore_itt(its, new_dev);
        if (err) {
                vgic_its_free_device(its->dev->kvm, its, new_dev);
                return err;
        }

        return next;
}

/*
 * vgic_its_device_cmp - list_sort() comparator ordering devices by device ID
 *
 * Return: -1 if @a's device ID is lower than @b's, 1 otherwise (0 is never
 * returned).
 */
static int vgic_its_device_cmp(void *priv, const struct list_head *a,
                               const struct list_head *b)
{
        struct its_device *lhs = container_of(a, struct its_device, dev_list);
        struct its_device *rhs = container_of(b, struct its_device, dev_list);

        return (lhs->device_id < rhs->device_id) ? -1 : 1;
}

/*
 * vgic_its_save_device_tables - Save the device table and all ITT
 * into guest RAM
 *
 * L1/L2 handling is hidden by vgic_its_check_id() helper which directly
 * returns the GPA of the device entry
 *
 * Nothing is saved when the device table BASER is not valid. Devices are
 * sorted by device ID; for each one the ITT is saved before its DTE.
 *
 * Return: 0 on success, -EINVAL if a device ID fails the table check, or
 * the error from saving an ITT or a DTE.
 */
static int vgic_its_save_device_tables(struct vgic_its *its)
{
        u64 baser = its->baser_device_table;
        struct its_device *cur;

        if (!(baser & GITS_BASER_VALID))
                return 0;

        list_sort(NULL, &its->device_list, vgic_its_device_cmp);

        list_for_each_entry(cur, &its->device_list, dev_list) {
                gpa_t dte_gpa;
                int err;

                if (!vgic_its_check_id(its, baser, cur->device_id, &dte_gpa))
                        return -EINVAL;

                err = vgic_its_save_itt(its, cur);
                if (!err)
                        err = vgic_its_save_dte(its, cur, dte_gpa);
                if (err)
                        return err;
        }

        return 0;
}

/**
 * handle_l1_dte - callback used for L1 device table entries (2 stage case)
 *
 * @its: its handle
 * @id: index of the entry in the L1 table
 * @addr: kernel VA
 * @opaque: unused
 *
 * L1 table entries are scanned by steps of 1 entry. A valid entry points to
 * a 64K L2 table, which is scanned in turn with vgic_its_restore_dte().
 *
 * Return < 0 if error, 0 if last dte was found when scanning the L2
 * table, +1 otherwise (meaning next L1 entry must be scanned)
 */
static int handle_l1_dte(struct vgic_its *its, u32 id, void *addr,
                         void *opaque)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 l1e = le64_to_cpu(*(u64 *)addr);
        int first_l2_id = id * (SZ_64K / abi->dte_esz);
        int esz = abi->dte_esz;
        gpa_t l2_gpa;

        if (!(l1e & KVM_ITS_L1E_VALID_MASK))
                return 1;

        l2_gpa = l1e & KVM_ITS_L1E_ADDR_MASK;

        return scan_its_table(its, l2_gpa, SZ_64K, esz,
                              first_l2_id, vgic_its_restore_dte, NULL);
}

/*
 * vgic_its_restore_device_tables - Restore the device table and all ITT
 * from guest RAM to internal data structs
 */
static int vgic_its_restore_device_tables(struct vgic_its *its)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 baser = its->baser_device_table;
        int l1_esz, ret;
        int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
        gpa_t l1_gpa;

        if (!(baser & GITS_BASER_VALID))
                return 0;

        l1_gpa = GITS_BASER_ADDR_48_to_52(baser);

        if (baser & GITS_BASER_INDIRECT) {
                l1_esz = GITS_LVL1_ENTRY_SIZE;
                ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
                                     handle_l1_dte, NULL);
        } else {
                l1_esz = abi->dte_esz;
                ret = scan_its_table(its, l1_gpa, l1_tbl_size, l1_esz, 0,
                                     vgic_its_restore_dte, NULL);
        }

        /* scan_its_table returns +1 if all entries are invalid */
        if (ret > 0)
                ret = 0;

        if (ret < 0)
                vgic_its_free_device_list(its->dev->kvm, its);

        return ret;
}

/*
 * vgic_its_save_cte - Save a collection table entry at @gpa
 *
 * The entry packs the valid bit, the collection's target address and its
 * collection ID, and is written in little-endian format.
 *
 * Return: the result of the guest memory write.
 */
static int vgic_its_save_cte(struct vgic_its *its,
                             struct its_collection *collection,
                             gpa_t gpa)
{
        u64 entry;

        entry = 1ULL << KVM_ITS_CTE_VALID_SHIFT;
        entry |= (u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT;
        entry |= collection->collection_id;
        entry = cpu_to_le64(entry);

        return vgic_its_write_entry_lock(its, gpa, entry, cte);
}

/*
 * Restore a collection entry into the ITS collection table.
 * Return +1 on success, 0 if the entry was invalid (which should be
 * interpreted as end-of-table), and a negative error value for generic errors.
 *
 * Errors: the guest memory read may fail; -EINVAL if the target address is
 * neither COLLECTION_NOT_MAPPED nor a known vcpu id, or if the collection
 * ID fails the table check; -EEXIST if the collection already exists; the
 * error from the collection allocation otherwise.
 */
static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa)
{
        struct kvm *kvm = its->dev->kvm;
        struct its_collection *coll;
        u32 rdbase, icid;
        u64 val;
        int err;

        err = vgic_its_read_entry_lock(its, gpa, &val, cte);
        if (err)
                return err;

        val = le64_to_cpu(val);
        if (!(val & KVM_ITS_CTE_VALID_MASK))
                return 0;

        rdbase = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
        icid = val & KVM_ITS_CTE_ICID_MASK;

        /* A mapped collection must target an existing vcpu */
        if (rdbase != COLLECTION_NOT_MAPPED &&
            !kvm_get_vcpu_by_id(kvm, rdbase))
                return -EINVAL;

        if (find_collection(its, icid))
                return -EEXIST;

        if (!vgic_its_check_id(its, its->baser_coll_table, icid, NULL))
                return -EINVAL;

        err = vgic_its_alloc_collection(its, &coll, icid);
        if (err)
                return err;

        coll->target_addr = rdbase;

        return 1;
}

/*
 * vgic_its_save_collection_table - Save the collection table into
 * guest RAM
 *
 * Nothing is saved when the collection table BASER is not valid. The
 * collections are written back to back from the table base; unless the
 * table ends up completely filled, an entry with the valid bit unset is
 * appended.
 *
 * Return: 0 on success, or the error from a guest memory write.
 */
static int vgic_its_save_collection_table(struct vgic_its *its)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 baser = its->baser_coll_table;
        gpa_t slot = GITS_BASER_ADDR_48_to_52(baser);
        int esz = abi->cte_esz;
        struct its_collection *coll;
        size_t capacity, used = 0;

        if (!(baser & GITS_BASER_VALID))
                return 0;

        capacity = GITS_BASER_NR_PAGES(baser) * SZ_64K;

        list_for_each_entry(coll, &its->collection_list, coll_list) {
                int err = vgic_its_save_cte(its, coll, slot);

                if (err)
                        return err;

                slot += esz;
                used += esz;
        }

        if (used == capacity)
                return 0;

        /*
         * table is not fully filled, add a last dummy element
         * with valid bit unset
         */
        return vgic_its_write_entry_lock(its, slot, 0ULL, cte);
}

/*
 * vgic_its_restore_collection_table - reads the collection table
 * in guest memory and restores the ITS internal state. Requires the
 * BASER registers to be restored before.
 *
 * Entries are restored one after the other until an invalid entry, an
 * error, or the end of the table is met.
 *
 * Return: 0 on success, < 0 on error, in which case the collection list is
 * freed.
 */
static int vgic_its_restore_collection_table(struct vgic_its *its)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        u64 baser = its->baser_coll_table;
        int esz = abi->cte_esz;
        size_t capacity, done;
        gpa_t slot;
        int ret;

        if (!(baser & GITS_BASER_VALID))
                return 0;

        slot = GITS_BASER_ADDR_48_to_52(baser);
        capacity = GITS_BASER_NR_PAGES(baser) * SZ_64K;

        for (done = 0; done < capacity; done += esz, slot += esz) {
                ret = vgic_its_restore_cte(its, slot);
                if (ret <= 0)
                        break;
        }

        /* The whole table was consumed without meeting an invalid entry */
        if (ret > 0)
                return 0;

        if (ret < 0)
                vgic_its_free_collection_list(its->dev->kvm, its);

        return ret;
}

/*
 * vgic_its_save_tables_v0 - Save the ITS tables into guest RAM
 * according to v0 ABI
 *
 * The device tables are saved first, then the collection table.
 *
 * Return: 0 on success, or the first error met.
 */
static int vgic_its_save_tables_v0(struct vgic_its *its)
{
        int err = vgic_its_save_device_tables(its);

        if (!err)
                err = vgic_its_save_collection_table(its);

        return err;
}

/*
 * vgic_its_restore_tables_v0 - Restore the ITS tables from guest RAM
 * to internal data structs according to V0 ABI
 *
 * The collection table is restored first, then the device tables. If the
 * latter fails, the collections restored so far are freed.
 *
 * Return: 0 on success, or the first error met.
 */
static int vgic_its_restore_tables_v0(struct vgic_its *its)
{
        int err = vgic_its_restore_collection_table(its);

        if (err)
                return err;

        err = vgic_its_restore_device_tables(its);
        if (!err)
                return 0;

        vgic_its_free_collection_list(its->dev->kvm, its);

        return err;
}

/*
 * vgic_its_commit_v0 - Reflect the ABI entry sizes into the BASER registers
 *
 * Replaces the entry size field of both the collection and device table
 * BASER values with the encoded cte_esz/dte_esz of the current ABI.
 *
 * Return: always 0.
 */
static int vgic_its_commit_v0(struct vgic_its *its)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);

        /* Collection table */
        its->baser_coll_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
        its->baser_coll_table |= (GIC_ENCODE_SZ(abi->cte_esz, 5)
                                        << GITS_BASER_ENTRY_SIZE_SHIFT);

        /* Device table */
        its->baser_device_table &= ~GITS_BASER_ENTRY_SIZE_MASK;
        its->baser_device_table |= (GIC_ENCODE_SZ(abi->dte_esz, 5)
                                        << GITS_BASER_ENTRY_SIZE_SHIFT);

        return 0;
}

/*
 * vgic_its_reset - Bring the ITS back to its reset state
 *
 * Clears the valid bit of both table BASER values, zeroes the command queue
 * registers, disables the ITS and frees the device and collection lists.
 */
static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its)
{
        its->enabled = 0;

        /* Command queue registers */
        its->cbaser = 0;
        its->creadr = 0;
        its->cwriter = 0;

        /* We need to keep the ABI specific field values */
        its->baser_device_table &= ~GITS_BASER_VALID;
        its->baser_coll_table &= ~GITS_BASER_VALID;

        vgic_its_free_device_list(kvm, its);
        vgic_its_free_collection_list(kvm, its);
}

/*
 * vgic_its_has_attr - .has_attr callback of the ITS KVM device
 *
 * Return: 0 if the (group, attr) pair is supported, -ENXIO otherwise; for
 * the ITS_REGS group the result of vgic_its_has_attr_regs().
 */
static int vgic_its_has_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ITS_REGS:
                return vgic_its_has_attr_regs(dev, attr);
        case KVM_DEV_ARM_VGIC_GRP_ADDR:
                if (attr->attr == KVM_VGIC_ITS_ADDR_TYPE)
                        return 0;
                break;
        case KVM_DEV_ARM_VGIC_GRP_CTRL:
                switch (attr->attr) {
                case KVM_DEV_ARM_VGIC_CTRL_INIT:
                case KVM_DEV_ARM_ITS_CTRL_RESET:
                case KVM_DEV_ARM_ITS_SAVE_TABLES:
                case KVM_DEV_ARM_ITS_RESTORE_TABLES:
                        return 0;
                }
                break;
        }

        return -ENXIO;
}

/*
 * vgic_its_ctrl - handle a KVM_DEV_ARM_VGIC_GRP_CTRL request on an ITS
 *
 * @kvm: VM the ITS belongs to
 * @its: ITS handle
 * @attr: control attribute: KVM_DEV_ARM_VGIC_CTRL_INIT,
 *        KVM_DEV_ARM_ITS_CTRL_RESET, KVM_DEV_ARM_ITS_SAVE_TABLES or
 *        KVM_DEV_ARM_ITS_RESTORE_TABLES
 *
 * INIT is a no-op. Every other attribute is handled with kvm->lock, all
 * vcpus, the config_lock and the its_lock held.
 *
 * Return: 0 on success, -EBUSY if the vcpus could not be locked, -ENXIO if
 * @attr is not a known control attribute (matching what
 * vgic_its_has_attr() reports), or the error returned by the ABI
 * save/restore callback.
 */
static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr)
{
        const struct vgic_its_abi *abi = vgic_its_get_abi(its);
        int ret = 0;

        if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */
                return 0;

        mutex_lock(&kvm->lock);

        if (!lock_all_vcpus(kvm)) {
                mutex_unlock(&kvm->lock);
                return -EBUSY;
        }

        mutex_lock(&kvm->arch.config_lock);
        mutex_lock(&its->its_lock);

        switch (attr) {
        case KVM_DEV_ARM_ITS_CTRL_RESET:
                vgic_its_reset(kvm, its);
                break;
        case KVM_DEV_ARM_ITS_SAVE_TABLES:
                ret = abi->save_tables(its);
                break;
        case KVM_DEV_ARM_ITS_RESTORE_TABLES:
                ret = abi->restore_tables(its);
                break;
        default:
                /* Do not report success for an attribute we did not handle */
                ret = -ENXIO;
                break;
        }

        mutex_unlock(&its->its_lock);
        mutex_unlock(&kvm->arch.config_lock);
        unlock_all_vcpus(kvm);
        mutex_unlock(&kvm->lock);
        return ret;
}

/*
 * kvm_arch_allow_write_without_running_vcpu - allow writing guest memory
 * without the running VCPU when dirty ring is enabled.
 *
 * The running VCPU is required to track dirty guest pages when dirty ring
 * is enabled. Otherwise, the backup bitmap should be used to track the
 * dirty guest pages. When vgic/its tables are being saved, the backup
 * bitmap is used to track the dirty guest pages due to the missed running
 * VCPU in the period.
 *
 * Return: the distributor's table_write_in_progress flag.
 */
bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
{
        return kvm->arch.vgic.table_write_in_progress;
}

static int vgic_its_set_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
{
        struct vgic_its *its = dev->private;
        int ret;

        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ADDR: {
                u64 __user *uaddr = (u64 __user *)(long)attr->addr;
                unsigned long type = (unsigned long)attr->attr;
                u64 addr;

                if (type != KVM_VGIC_ITS_ADDR_TYPE)
                        return -ENODEV;

                if (copy_from_user(&addr, uaddr, sizeof(addr)))
                        return -EFAULT;

                ret = vgic_check_iorange(dev->kvm, its->vgic_its_base,
                                         addr, SZ_64K, KVM_VGIC_V3_ITS_SIZE);
                if (ret)
                        return ret;

                ret = vgic_register_its_iodev(dev->kvm, its, addr);
                if (ret)
                        return ret;

                return vgic_its_debug_init(dev);

        }
        case KVM_DEV_ARM_VGIC_GRP_CTRL:
                return vgic_its_ctrl(dev->kvm, its, attr->attr);
        case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
                u64 __user *uaddr = (u64 __user *)(long)attr->addr;
                u64 reg;

                if (get_user(reg, uaddr))
                        return -EFAULT;

                return vgic_its_attr_regs_access(dev, attr, &reg, true);
        }
        }
        return -ENXIO;
}

static int vgic_its_get_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
{
        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_ADDR: {
                struct vgic_its *its = dev->private;
                u64 addr = its->vgic_its_base;
                u64 __user *uaddr = (u64 __user *)(long)attr->addr;
                unsigned long type = (unsigned long)attr->attr;

                if (type != KVM_VGIC_ITS_ADDR_TYPE)
                        return -ENODEV;

                if (copy_to_user(uaddr, &addr, sizeof(addr)))
                        return -EFAULT;
                break;
        }
        case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: {
                u64 __user *uaddr = (u64 __user *)(long)attr->addr;
                u64 reg;
                int ret;

                ret = vgic_its_attr_regs_access(dev, attr, &reg, false);
                if (ret)
                        return ret;
                return put_user(reg, uaddr);
        }
        default:
                return -ENXIO;
        }

        return 0;
}

/* KVM device callbacks implementing the "kvm-arm-vgic-its" device */
static struct kvm_device_ops kvm_arm_vgic_its_ops = {
        .name           = "kvm-arm-vgic-its",
        .create         = vgic_its_create,
        .destroy        = vgic_its_destroy,
        .has_attr       = vgic_its_has_attr,
        .get_attr       = vgic_its_get_attr,
        .set_attr       = vgic_its_set_attr,
};

/*
 * kvm_vgic_register_its_device - register the ITS device ops with KVM
 *
 * Return: the result of kvm_register_device_ops() for the
 * KVM_DEV_TYPE_ARM_VGIC_ITS device type.
 */
int kvm_vgic_register_its_device(void)
{
        int ret;

        ret = kvm_register_device_ops(&kvm_arm_vgic_its_ops,
                                      KVM_DEV_TYPE_ARM_VGIC_ITS);

        return ret;
}
































































































   87 



































   64 























  222 






























   94 




   93 








    8 
   87 











   94 







   94 

   94 






















  220 









  205 
  220 






















  255 



  256 











  172 
  171 








  172 




   98 
  172 
  256 


















































































  321 







  320 


  219 







   64 



  185 


  205 
  220 







  321 






  320 


   64 
   64 
   64 

   64 





    1 


































































































































































































































































































  179 




  179 



































































  150 






  151 


  151 




   60 


   96 

































































































































































































































































































































































































































  242 




  240 










































































































  253 










































  389 


  253 
  314 


  391 













































































































































  251 



  253 
  254 





    3 
  250 
  253 





  254 





























  254 



  254 








  242 

  242 




  241 















  242 
  242 



















































  132 




  132 













































































  132 









  132 















  132 
  132 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   64 


   64 
   64 







































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
/*
 * mm/rmap.c - physical to virtual reverse mappings
 *
 * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
 * Released under the General Public License (GPL).
 *
 * Simple, low overhead reverse mapping scheme.
 * Please try to keep this thing as modular as possible.
 *
 * Provides methods for unmapping each kind of mapped page:
 * the anon methods track anonymous pages, and
 * the file methods track pages belonging to an inode.
 *
 * Original design by Rik van Riel <riel@conectiva.com.br> 2001
 * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
 * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
 * Contributions by Hugh Dickins 2003, 2004
 */

/*
 * Lock ordering in mm:
 *
 * inode->i_rwsem        (while writing or truncating, not reading or faulting)
 *   mm->mmap_lock
 *     mapping->invalidate_lock (in filemap_fault)
 *       folio_lock
 *         hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below)
 *           vma_start_write
 *             mapping->i_mmap_rwsem
 *               anon_vma->rwsem
 *                 mm->page_table_lock or pte_lock
 *                   swap_lock (in swap_duplicate, swap_info_get)
 *                     mmlist_lock (in mmput, drain_mmlist and others)
 *                     mapping->private_lock (in block_dirty_folio)
 *                         i_pages lock (widely used)
 *                           lruvec->lru_lock (in folio_lruvec_lock_irq)
 *                     inode->i_lock (in set_page_dirty's __mark_inode_dirty)
 *                     bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
 *                       sb_lock (within inode_lock in fs/fs-writeback.c)
 *                       i_pages lock (widely used, in set_page_dirty,
 *                                 in arch-dependent flush_dcache_mmap_lock,
 *                                 within bdi.wb->list_lock in __sync_single_inode)
 *
 * anon_vma->rwsem,mapping->i_mmap_rwsem   (memory_failure, collect_procs_anon)
 *   ->tasklist_lock
 *     pte map lock
 *
 * hugetlbfs PageHuge() take locks in this order:
 *   hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
 *     vma_lock (hugetlb specific lock for pmd_sharing)
 *       mapping->i_mmap_rwsem (also used for hugetlb pmd sharing)
 *         folio_lock
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>
#include <linux/oom.h>

#include <asm/tlbflush.h>

#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include <trace/events/migrate.h>

#include "internal.h"

static struct kmem_cache *anon_vma_cachep;
static struct kmem_cache *anon_vma_chain_cachep;

static inline struct anon_vma *anon_vma_alloc(void)
{
        struct anon_vma *anon_vma;

        anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
        if (anon_vma) {
                atomic_set(&anon_vma->refcount, 1);
                anon_vma->num_children = 0;
                anon_vma->num_active_vmas = 0;
                anon_vma->parent = anon_vma;
                /*
                 * Initialise the anon_vma root to point to itself. If called
                 * from fork, the root will be reset to the parents anon_vma.
                 */
                anon_vma->root = anon_vma;
        }

        return anon_vma;
}

/*
 * Return an anon_vma to its slab cache.  The refcount must already be zero
 * (checked by VM_BUG_ON).
 *
 * If the root rwsem is observed to be held, the write lock is taken and
 * released once before the object is freed, so that a concurrent lock holder
 * has finished with it.  May sleep.
 */
static inline void anon_vma_free(struct anon_vma *anon_vma)
{
        VM_BUG_ON(atomic_read(&anon_vma->refcount));

        /*
         * Synchronize against folio_lock_anon_vma_read() such that
         * we can safely hold the lock without the anon_vma getting
         * freed.
         *
         * Relies on the full mb implied by the atomic_dec_and_test() from
         * put_anon_vma() against the acquire barrier implied by
         * down_read_trylock() from folio_lock_anon_vma_read(). This orders:
         *
         * folio_lock_anon_vma_read()        VS        put_anon_vma()
         *   down_read_trylock()                  atomic_dec_and_test()
         *   LOCK                                  MB
         *   atomic_read()                          rwsem_is_locked()
         *
         * LOCK should suffice since the actual taking of the lock must
         * happen _before_ what follows.
         */
        might_sleep();
        if (rwsem_is_locked(&anon_vma->root->rwsem)) {
                /* Lock/unlock pair only waits out the current holder. */
                anon_vma_lock_write(anon_vma);
                anon_vma_unlock_write(anon_vma);
        }

        kmem_cache_free(anon_vma_cachep, anon_vma);
}

/*
 * Allocate one anon_vma_chain from its slab cache using the caller's @gfp
 * mask.  Returns the new entry, or NULL if the allocation failed.  The
 * entry's fields are not initialised here.
 */
static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
{
        struct anon_vma_chain *avc;

        avc = kmem_cache_alloc(anon_vma_chain_cachep, gfp);

        return avc;
}

/*
 * Give an anon_vma_chain back to its slab cache.  The caller is responsible
 * for having unlinked it from any list or interval tree beforehand; nothing
 * is unlinked here.
 */
static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
{
        kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
}

/*
 * Tie @vma and @anon_vma together through @avc: record both ends in the
 * chain entry, add the entry to the VMA's anon_vma_chain list and insert it
 * into the anon_vma's interval tree.
 *
 * No locking is done here; callers in this file take the anon_vma write lock
 * around the call.
 */
static void anon_vma_chain_link(struct vm_area_struct *vma,
                                struct anon_vma_chain *avc,
                                struct anon_vma *anon_vma)
{
        /* Fill in both back-pointers before the entry is published. */
        avc->anon_vma = anon_vma;
        avc->vma = vma;

        /* Per-VMA list of anon_vmas ... */
        list_add(&avc->same_vma, &vma->anon_vma_chain);
        /* ... and per-anon_vma interval tree of VMAs. */
        anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
}

/**
 * __anon_vma_prepare - attach an anon_vma to a memory region
 * @vma: the memory region in question
 *
 * Guarantees that the mapping described by @vma has an anon_vma attached,
 * so that anonymous pages mapped into it can be associated with that
 * anon_vma.
 *
 * The usual case, where the vma already has one, is dealt with inline by
 * anon_vma_prepare().  This is the remaining case: either an adjacent
 * mapping's anon_vma can be re-used (very common when the vma was only
 * split because of mprotect()), or a new one has to be allocated.
 *
 * anon_vma allocation is subtle.  folio_lock_anon_vma_read() may have
 * optimistically looked up an anon_vma and may touch its rwsem even for a
 * newly allocated vma (RCU is what keeps the anon_vma from actually being
 * destroyed).  Proper anon_vma locking is therefore needed even for a fresh
 * allocation, while the common already-has-an-anon_vma case must stay
 * lock-free.
 *
 * The caller must hold the mmap lock.  May sleep.
 *
 * Return: 0 on success (also when an anon_vma turned out to be attached
 * already by the time the locks were taken), -ENOMEM on allocation failure.
 */
int __anon_vma_prepare(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;
        struct anon_vma *anon_vma;
        struct anon_vma *allocated = NULL;
        struct anon_vma_chain *avc;

        mmap_assert_locked(mm);
        might_sleep();

        /* Get the chain entry up front, before any lock is taken. */
        avc = anon_vma_chain_alloc(GFP_KERNEL);
        if (!avc)
                return -ENOMEM;

        /* Prefer sharing a neighbouring mapping's anon_vma. */
        anon_vma = find_mergeable_anon_vma(vma);
        if (!anon_vma) {
                anon_vma = anon_vma_alloc();
                if (unlikely(!anon_vma)) {
                        anon_vma_chain_free(avc);
                        return -ENOMEM;
                }
                anon_vma->num_children++; /* self-parent link for new root */
                allocated = anon_vma;
        }

        anon_vma_lock_write(anon_vma);
        /* page_table_lock to protect against threads */
        spin_lock(&mm->page_table_lock);
        if (likely(!vma->anon_vma)) {
                vma->anon_vma = anon_vma;
                anon_vma_chain_link(vma, avc, anon_vma);
                anon_vma->num_active_vmas++;
                /* Both objects now belong to the vma; nothing to undo. */
                allocated = NULL;
                avc = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        anon_vma_unlock_write(anon_vma);

        /*
         * The vma already had an anon_vma once we held the locks: release
         * whatever was prepared here but never installed.
         */
        if (unlikely(allocated))
                put_anon_vma(allocated);
        if (unlikely(avc))
                anon_vma_chain_free(avc);

        return 0;
}

/*
 * Helper for walking vma->anon_vma_chain while keeping the anon_vma root
 * write-locked.  @root is the root the caller currently holds locked (NULL
 * if none yet); @anon_vma is the next anon_vma about to be touched.
 *
 * All anon_vmas chained to one vma are expected to share a root, so a whole
 * traversal should normally need just a single down_write().  Moving from
 * one non-NULL root to a different one is unexpected: it warns once, and the
 * old root is unlocked before the new one is locked.
 *
 * Returns the root that is write-locked on return.
 */
static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
{
        struct anon_vma *new_root = anon_vma->root;

        /* Already holding the right lock. */
        if (new_root == root)
                return root;

        if (WARN_ON_ONCE(root))
                up_write(&root->rwsem);

        down_write(&new_root->rwsem);
        return new_root;
}

/*
 * Counterpart of lock_anon_vma_root(): release the write lock on @root.
 * A NULL @root means no root was locked and is silently ignored.
 */
static inline void unlock_anon_vma_root(struct anon_vma *root)
{
        if (!root)
                return;

        up_write(&root->rwsem);
}

/*
 * Attach the anon_vmas from src to dst.
 * Returns 0 on success, -ENOMEM on failure.
 *
 * @dst: the VMA that receives a chain entry for every anon_vma of @src
 * @src: the VMA whose anon_vma_chain is walked
 *
 * anon_vma_clone() is called by vma_expand(), vma_merge(), __split_vma(),
 * copy_vma() and anon_vma_fork(). The first four want an exact copy of src,
 * while the last one, anon_vma_fork(), may try to reuse an existing anon_vma to
 * prevent endless growth of anon_vma. Since dst->anon_vma is set to NULL before
 * the call, we can identify this case by checking (!dst->anon_vma &&
 * src->anon_vma).
 *
 * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
 * and reuse an existing anon_vma which has no vmas and only one child anon_vma.
 * This prevents degradation of the anon_vma hierarchy into an endless linear
 * chain in the case of a constantly forking task. On the other hand, an
 * anon_vma with more than one child isn't reused even if it has no live vma,
 * so the rmap walker has a good chance of avoiding a scan of the whole
 * hierarchy when it searches for where a page is mapped.
 *
 * On failure, every chain entry already added to dst is removed again via
 * unlink_anon_vmas() and dst->anon_vma is left NULL.
 */
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
        struct anon_vma_chain *avc, *pavc;
        struct anon_vma *root = NULL;

        /*
         * Walk src's chain from tail to head.  NOTE(review): presumably so
         * that dst's chain ends up in the same order as src's, given that
         * anon_vma_chain_link() inserts with list_add() - confirm.
         */
        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                struct anon_vma *anon_vma;

                /*
                 * The root lock may be held from a previous iteration, so
                 * first try an allocation that does not wait.  If that fails,
                 * drop the lock and retry with GFP_KERNEL; the lock is taken
                 * again by lock_anon_vma_root() below.
                 */
                avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!avc)) {
                        unlock_anon_vma_root(root);
                        root = NULL;
                        avc = anon_vma_chain_alloc(GFP_KERNEL);
                        if (!avc)
                                goto enomem_failure;
                }
                anon_vma = pavc->anon_vma;
                root = lock_anon_vma_root(root, anon_vma);
                anon_vma_chain_link(dst, avc, anon_vma);

                /*
                 * Reuse existing anon_vma if it has no vma and only one
                 * anon_vma child.
                 *
                 * Root anon_vma is never reused:
                 * it has self-parent reference and at least one child.
                 */
                if (!dst->anon_vma && src->anon_vma &&
                    anon_vma->num_children < 2 &&
                    anon_vma->num_active_vmas == 0)
                        dst->anon_vma = anon_vma;
        }
        /* Account dst as an active user of whichever anon_vma it ended up with. */
        if (dst->anon_vma)
                dst->anon_vma->num_active_vmas++;
        unlock_anon_vma_root(root);
        return 0;

 enomem_failure:
        /*
         * dst->anon_vma is dropped here otherwise its num_active_vmas can
         * be incorrectly decremented in unlink_anon_vmas().
         * We can safely do this because callers of anon_vma_clone() don't care
         * about dst->anon_vma if anon_vma_clone() failed.
         */
        dst->anon_vma = NULL;
        unlink_anon_vmas(dst);
        return -ENOMEM;
}

/*
 * Fork-time setup of the child's @vma from the parent's @pvma: link @vma to
 * every anon_vma the parent VMA is linked to, and give it an anon_vma of its
 * own (either a reused one picked by anon_vma_clone() or a freshly allocated
 * one) for the pages it will COW.
 *
 * Returns 0 on success (including when the parent has no anon_vma and there
 * is nothing to do), non-zero on failure.
 */
int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
{
        struct anon_vma_chain *link;
        struct anon_vma *child;
        int ret;

        /* Nothing to inherit when the parent VMA never got an anon_vma. */
        if (!pvma->anon_vma)
                return 0;

        /*
         * Forget the pointer inherited from the parent; it is either
         * replaced by a reused anon_vma or by a new one below.
         */
        vma->anon_vma = NULL;

        /*
         * Step one: hook the new VMA into all of the parent VMA's anon_vmas
         * so that rmap can still find pages the child has not COWed.
         */
        ret = anon_vma_clone(vma, pvma);
        if (ret)
                return ret;

        /* anon_vma_clone() found an anon_vma to reuse: finished. */
        if (vma->anon_vma)
                return 0;

        /* Step two: the child needs an anon_vma of its own. */
        child = anon_vma_alloc();
        if (!child)
                goto out_error;
        child->num_active_vmas++;

        link = anon_vma_chain_alloc(GFP_KERNEL);
        if (!link)
                goto out_error_free_anon_vma;

        /*
         * Locking any anon_vma of a tree really means taking the rwsem of
         * the tree's root, so the new anon_vma adopts the parent's root.
         */
        child->parent = pvma->anon_vma;
        child->root = pvma->anon_vma->root;

        /*
         * Refcounting lets an anon_vma outlive its process.  Because the
         * lock lives in the root, hold a reference on the root for as long
         * as this anon_vma exists.
         */
        get_anon_vma(child->root);

        /* From now on the child's new (COWed) pages belong to this one. */
        vma->anon_vma = child;

        anon_vma_lock_write(child);
        anon_vma_chain_link(vma, link, child);
        child->parent->num_children++;
        anon_vma_unlock_write(child);

        return 0;

 out_error_free_anon_vma:
        put_anon_vma(child);
 out_error:
        /* Undo the links that anon_vma_clone() created. */
        unlink_anon_vmas(vma);
        return -ENOMEM;
}

/*
 * Detach @vma from every anon_vma on its anon_vma_chain and release the
 * chain entries.  Runs in two passes: the first, under the root write lock,
 * removes the interval-tree links; the second, with the lock dropped, puts
 * the anon_vmas whose trees became empty.
 *
 * vma->anon_vma is NULL on return.
 */
void unlink_anon_vmas(struct vm_area_struct *vma)
{
        struct anon_vma_chain *link, *tmp;
        struct anon_vma *locked_root = NULL;

        /*
         * Pass one.  The chain runs from newest to oldest, which makes sure
         * the root anon_vma is the last one to be freed.
         */
        list_for_each_entry_safe(link, tmp, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *av = link->anon_vma;

                locked_root = lock_anon_vma_root(locked_root, av);
                anon_vma_interval_tree_remove(link, &av->rb_root);

                if (!RB_EMPTY_ROOT(&av->rb_root.rb_root)) {
                        /* Other VMAs still use it: only our link goes. */
                        list_del(&link->same_vma);
                        anon_vma_chain_free(link);
                } else {
                        /*
                         * Now empty.  Keep the entry on the list so that
                         * pass two can free the anon_vma outside the lock.
                         */
                        av->parent->num_children--;
                }
        }

        if (vma->anon_vma) {
                vma->anon_vma->num_active_vmas--;

                /*
                 * The vma itself may live on after the unlink; a later
                 * fault will prepare an anon_vma for it again.
                 */
                vma->anon_vma = NULL;
        }
        unlock_anon_vma_root(locked_root);

        /*
         * Pass two.  Only empty, already unlinked anon_vmas are left on the
         * list.  They could not be destroyed earlier because
         * __put_anon_vma() needs to write-acquire anon_vma->root->rwsem.
         */
        list_for_each_entry_safe(link, tmp, &vma->anon_vma_chain, same_vma) {
                struct anon_vma *empty = link->anon_vma;

                VM_WARN_ON(empty->num_children);
                VM_WARN_ON(empty->num_active_vmas);
                put_anon_vma(empty);

                list_del(&link->same_vma);
                anon_vma_chain_free(link);
        }
}

/*
 * Slab constructor for the anon_vma cache (see anon_vma_init()).  Sets up
 * the parts of an anon_vma that anon_vma_alloc() does not initialise itself:
 * the rwsem and the empty interval tree, plus a zero refcount.
 */
static void anon_vma_ctor(void *data)
{
        struct anon_vma *av = data;

        atomic_set(&av->refcount, 0);
        av->rb_root = RB_ROOT_CACHED;
        init_rwsem(&av->rwsem);
}

/*
 * Boot-time creation of the two slab caches used by this file.
 *
 * Both caches pass SLAB_PANIC, so the returned pointers are not checked.
 * The anon_vma cache is additionally SLAB_TYPESAFE_BY_RCU, which the
 * lockless lookups in folio_get_anon_vma() and folio_lock_anon_vma_read()
 * depend on, and uses anon_vma_ctor() as its constructor.
 */
void __init anon_vma_init(void)
{
        anon_vma_cachep = kmem_cache_create("anon_vma",
                                            sizeof(struct anon_vma), 0,
                                            SLAB_TYPESAFE_BY_RCU | SLAB_PANIC |
                                            SLAB_ACCOUNT,
                                            anon_vma_ctor);

        anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
                                           SLAB_PANIC | SLAB_ACCOUNT);
}

/*
 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
 *
 * Since there is no serialization whatsoever against folio_remove_rmap_*()
 * the best this function can do is return a refcount increased anon_vma
 * that might have been relevant to this page.
 *
 * The page might have been remapped to a different anon_vma or the anon_vma
 * returned may already be freed (and even reused).
 *
 * In case it was remapped to a different anon_vma, the new anon_vma will be a
 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
 * ensure that any anon_vma obtained from the page will still be valid for as
 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
 *
 * All users of this function must be very careful when walking the anon_vma
 * chain and verify that the page in question is indeed mapped in it
 * [ something equivalent to page_mapped_in_vma() ].
 *
 * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
 * folio_remove_rmap_*() that the anon_vma pointer from page->mapping is valid
 * if there is a mapcount, we can dereference the anon_vma after observing
 * those.
 *
 * NOTE: the caller should normally hold folio lock when calling this.  If
 * not, the caller needs to double check the anon_vma didn't change after
 * taking the anon_vma lock for either read or write (UFFDIO_MOVE can modify it
 * concurrently without folio lock protection). See folio_lock_anon_vma_read()
 * which has already covered that, and comment above remap_pages().
 *
 * Returns the anon_vma with its refcount raised (to be dropped with
 * put_anon_vma()), or NULL if the folio's mapping is not tagged as anonymous,
 * the folio is not mapped, or the refcount had already reached zero.
 */
struct anon_vma *folio_get_anon_vma(const struct folio *folio)
{
        struct anon_vma *anon_vma = NULL;
        unsigned long anon_mapping;

        rcu_read_lock();
        /* Sample the mapping word once; it may change under us. */
        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        /* Only proceed if the flag bits are exactly PAGE_MAPPING_ANON. */
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!folio_mapped(folio))
                goto out;

        /* Strip the tag to recover the anon_vma pointer. */
        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        /* Take a reference unless the refcount has already dropped to zero. */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        /*
         * If this folio is still mapped, then its anon_vma cannot have been
         * freed.  But if it has been unmapped, we have no security against the
         * anon_vma structure being freed and reused (for another anon_vma:
         * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
         * above cannot corrupt).
         */
        if (!folio_mapped(folio)) {
                /*
                 * Leave the RCU read-side section before dropping the
                 * reference.  NOTE(review): presumably because a final put
                 * can sleep (anon_vma_free() calls might_sleep()) - confirm.
                 */
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }
out:
        rcu_read_unlock();

        return anon_vma;
}

/*
 * Similar to folio_get_anon_vma() except it locks the anon_vma.
 *
 * It's a little more complex as it tries to keep the fast path to a single
 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
 * reference like with folio_get_anon_vma() and then, in the !rwc->try_lock
 * case, block on the rwsem.
 */
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
                                          struct rmap_walk_control *rwc)
{
        struct anon_vma *anon_vma = NULL;
        struct anon_vma *root_anon_vma;
        unsigned long anon_mapping;

retry:
        rcu_read_lock();
        anon_mapping = (unsigned long)READ_ONCE(folio->mapping);
        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!folio_mapped(folio))
                goto out;

        anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
        root_anon_vma = READ_ONCE(anon_vma->root);
        if (down_read_trylock(&root_anon_vma->rwsem)) {
                /*
                 * folio_move_anon_rmap() might have changed the anon_vma as we
                 * might not hold the folio lock here.
                 */
                if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
                             anon_mapping)) {
                        up_read(&root_anon_vma->rwsem);
                        rcu_read_unlock();
                        goto retry;
                }

                /*
                 * If the folio is still mapped, then this anon_vma is still
                 * its anon_vma, and holding the mutex ensures that it will
                 * not go away, see anon_vma_free().
                 */
                if (!folio_mapped(folio)) {
                        up_read(&root_anon_vma->rwsem);
                        anon_vma = NULL;
                }
                goto out;
        }

        if (rwc && rwc->try_lock) {
                anon_vma = NULL;
                rwc->contended = true;
                goto out;
        }

        /* trylock failed, we got to sleep */
        if (!atomic_inc_not_zero(&anon_vma->refcount)) {
                anon_vma = NULL;
                goto out;
        }

        if (!folio_mapped(folio)) {
                rcu_read_unlock();
                put_anon_vma(anon_vma);
                return NULL;
        }

        /* we pinned the anon_vma, its safe to sleep */
        rcu_read_unlock();
        anon_vma_lock_read(anon_vma);

        /*
         * folio_move_anon_rmap() might have changed the anon_vma as we might
         * not hold the folio lock here.
         */
        if (unlikely((unsigned long)READ_ONCE(folio->mapping) !=
                     anon_mapping)) {
                anon_vma_unlock_read(anon_vma);
                put_anon_vma(anon_vma);
                anon_vma = NULL;
                goto retry;
        }

        if (atomic_dec_and_test(&anon_vma->refcount)) {
                /*
                 * Oops, we held the last refcount, release the lock
                 * and bail -- can't simply use put_anon_vma() because
                 * we'll deadlock on the anon_vma_lock_write() recursion.
                 */
                anon_vma_unlock_read(anon_vma);
                __put_anon_vma(anon_vma);
                anon_vma = NULL;
        }

        return anon_vma;

out:
        rcu_read_unlock();
        return anon_vma;
}

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
/*
 * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
 * was dirty when it was unmapped, it must be flushed before any IO is
 * initiated on the page, otherwise writes can be lost. Likewise it must be
 * flushed before the page is freed, to prevent data leakage.
 *
 * Does nothing when the current task's batch has no flush pending.
 */
void try_to_unmap_flush(void)
{
        struct tlbflush_unmap_batch *batch = &current->tlb_ubc;

        if (batch->flush_required) {
                arch_tlbbatch_flush(&batch->arch);
                /* The batch is drained: reset both tracking flags. */
                batch->flush_required = false;
                batch->writable = false;
        }
}

/*
 * Flush only if the current task's batch may hold writable TLB entries,
 * since those are the ones that can race with IO.
 */
void try_to_unmap_flush_dirty(void)
{
        if (!current->tlb_ubc.writable)
                return;

        try_to_unmap_flush();
}

/*
 * mm->tlb_flush_batched packs two generation counters into one atomic value:
 * bits 0-14 record pending generations and bits 16-30 record flushed
 * generations.
 *
 * TLB_FLUSH_BATCH_PENDING_LARGE is half of the pending mask; once `pending'
 * exceeds it, set_tlb_ubc_flush_pending() resets the counters so that
 * `pending' cannot wrap around and catch up with `flushed'.
 */
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT        16
#define TLB_FLUSH_BATCH_PENDING_MASK                        \
        ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE                        \
        (TLB_FLUSH_BATCH_PENDING_MASK / 2)

/*
 * Record a deferred TLB flush for [@start, @end) of @mm in the current task's
 * unmap batch, and bump the pending generation in mm->tlb_flush_batched.
 *
 * @pteval is the PTE value that was just unmapped: nothing is recorded if it
 * was not accessible, and the batch is marked writable if it was dirty.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
                unsigned long start, unsigned long end)
{
        struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
        int batch;
        bool writable = pte_dirty(pteval);

        if (!pte_accessible(mm, pteval))
                return;

        arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, start, end);
        tlb_ubc->flush_required = true;

        /*
         * Ensure compiler does not re-order the setting of tlb_flush_batched
         * before the PTE is cleared.
         */
        barrier();
        batch = atomic_read(&mm->tlb_flush_batched);
retry:
        if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
                /*
                 * Prevent `pending' from catching up with `flushed' because of
                 * overflow.  Reset `pending' and `flushed' to be 1 and 0 if
                 * `pending' becomes large.
                 */
                /* On failure @batch holds the fresh value, so re-evaluate it. */
                if (!atomic_try_cmpxchg(&mm->tlb_flush_batched, &batch, 1))
                        goto retry;
        } else {
                atomic_inc(&mm->tlb_flush_batched);
        }

        /*
         * If the PTE was dirty then it's best to assume it's writable. The
         * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
         * before the page is queued for IO.
         */
        if (writable)
                tlb_ubc->writable = true;
}

/*
 * Decide whether the TLB flush can be postponed to the end of a batch of
 * unmap operations, which reduces IPIs. Deferral requires the caller to have
 * passed TTU_BATCH_FLUSH and the architecture to agree for this @mm.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return (flags & TTU_BATCH_FLUSH) && arch_tlbbatch_should_defer(mm);
}

/*
 * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 * releasing the PTL if TLB flushes are batched. A parallel operation such as
 * mprotect or munmap can therefore race between reclaim unmapping the page
 * and flushing the page, which potentially allows access to data via a stale
 * TLB entry. Tracking all mm's that have TLB batching in flight would be
 * expensive during reclaim, so instead we track whether TLB batching occurred
 * in the past and, if so, do a flush here if required. This costs one
 * additional flush per reclaim cycle, paid by the first operation at risk
 * such as mprotect and munmap.
 *
 * This must be called under the PTL so that an access to tlb_flush_batched
 * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 * via the PTL.
 */
void flush_tlb_batched_pending(struct mm_struct *mm)
{
        int snapshot = atomic_read(&mm->tlb_flush_batched);
        int pending = snapshot & TLB_FLUSH_BATCH_PENDING_MASK;
        int flushed = snapshot >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
        int caught_up;

        /* Generations already match: no batched flush is outstanding. */
        if (pending == flushed)
                return;

        arch_flush_tlb_batched_pending(mm);

        /*
         * Mark `flushed' as having reached `pending'. If new TLB flushing
         * became pending while we were flushing, the cmpxchg fails and
         * mm->tlb_flush_batched is left as is, to avoid losing that flush.
         */
        caught_up = pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT);
        atomic_cmpxchg(&mm->tlb_flush_batched, snapshot, caught_up);
}
#else
/*
 * Stub used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not set: there
 * is no batch to record into, so this does nothing.
 */
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval,
                unsigned long start, unsigned long end)
{
}

/*
 * Stub used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is not set: TLB
 * flushes are never deferred.
 */
static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
{
        return false;
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

/**
 * page_address_in_vma - The virtual address of a page in this VMA.
 * @folio: The folio containing the page.
 * @page: The page within the folio.
 * @vma: The VMA we need to know the address in.
 *
 * Calculates the user virtual address of this page in the specified VMA.
 * It is the caller's responsibility to check the page is actually
 * within the VMA.  There may not currently be a PTE pointing at this
 * page, but if a page fault occurs at this address, this is the page
 * which will be accessed.
 *
 * Context: Caller should hold a reference to the folio.  Caller should
 * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the
 * VMA from being altered.
 *
 * Return: The virtual address corresponding to this page in the VMA, or
 * -EFAULT if the folio and the VMA do not share the same anon_vma root
 * (anonymous folios) or the same file mapping (file folios).
 */
unsigned long page_address_in_vma(const struct folio *folio,
                const struct page *page, const struct vm_area_struct *vma)
{
        if (folio_test_anon(folio)) {
                struct anon_vma *folio_av = folio_anon_vma(folio);

                /*
                 * Note: swapoff's unuse_vma() is more efficient with this
                 * check, and needs it to match anon_vma when KSM is active.
                 */
                if (!vma->anon_vma || !folio_av)
                        return -EFAULT;
                if (vma->anon_vma->root != folio_av->root)
                        return -EFAULT;
        } else {
                /* File folio: the VMA must map the very same address_space. */
                if (!vma->vm_file)
                        return -EFAULT;
                if (vma->vm_file->f_mapping != folio->mapping)
                        return -EFAULT;
        }

        /* KSM folios don't reach here because of the NULL anon_vma check */
        return vma_address(vma, page_pgoff(folio, page), 1);
}

/*
 * Walk the page tables of @mm down to the PMD level for @address.
 *
 * Returns the pmd_t* where 'address' is expected to be mapped from, or NULL
 * if any of the upper levels (pgd, p4d, pud) is not present.  No guarantees
 * or checks are made about what the returned pmd_t* represents.
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd = pgd_offset(mm, address);
        p4d_t *p4d;
        pud_t *pud;

        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                return NULL;

        return pmd_offset(pud, address);
}

/* State shared between folio_referenced() and its rmap walk callbacks. */
struct folio_referenced_arg {
        /* Mappings still to visit; seeded from folio_mapcount(). */
        int mapcount;
        /*
         * Count of VMAs found to reference the folio.  folio_referenced_one()
         * sets it to -1 when it skips a folio of an exiting/OOM-reaped mm.
         */
        int referenced;
        /* Accumulated vm_flags of the referencing VMAs (plus VM_LOCKED). */
        unsigned long vm_flags;
        /* If non-NULL, only count references from mms in this cgroup. */
        struct mem_cgroup *memcg;
};

/*
 * rmap_one callback of folio_referenced(): test and clear the young bits of
 * every mapping of @folio in @vma.
 *
 * arg: folio_referenced_arg will be passed
 *
 * Returns false to stop the rmap walk, true to continue with the next VMA.
 */
static bool folio_referenced_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *arg)
{
        struct folio_referenced_arg *pra = arg;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        int referenced = 0;
        /* ptes counts the PTEs of a large folio seen in a VM_LOCKED vma. */
        unsigned long start = address, ptes = 0;

        while (page_vma_mapped_walk(&pvmw)) {
                address = pvmw.address;

                if (vma->vm_flags & VM_LOCKED) {
                        if (!folio_test_large(folio) || !pvmw.pte) {
                                /* Restore the mlock which got missed */
                                mlock_vma_folio(folio, vma);
                                page_vma_mapped_walk_done(&pvmw);
                                pra->vm_flags |= VM_LOCKED;
                                return false; /* To break the loop */
                        }
                        /*
                         * For large folio fully mapped to VMA, will
                         * be handled after the pvmw loop.
                         *
                         * For large folio cross VMA boundaries, it's
                         * expected to be picked  by page reclaim. But
                         * should skip reference of pages which are in
                         * the range of VM_LOCKED vma. As page reclaim
                         * should just count the reference of pages out
                         * the range of VM_LOCKED vma.
                         */
                        ptes++;
                        pra->mapcount--;
                        continue;
                }

                /*
                 * Skip the non-shared swapbacked folio mapped solely by
                 * the exiting or OOM-reaped process. This avoids redundant
                 * swap-out followed by an immediate unmap.
                 */
                if ((!atomic_read(&vma->vm_mm->mm_users) ||
                    check_stable_address_space(vma->vm_mm)) &&
                    folio_test_anon(folio) && folio_test_swapbacked(folio) &&
                    !folio_maybe_mapped_shared(folio)) {
                        pra->referenced = -1;
                        page_vma_mapped_walk_done(&pvmw);
                        return false;
                }

                /* Pick the young-bit test matching how the folio is mapped. */
                if (lru_gen_enabled() && pvmw.pte) {
                        if (lru_gen_look_around(&pvmw))
                                referenced++;
                } else if (pvmw.pte) {
                        if (ptep_clear_flush_young_notify(vma, address,
                                                pvmw.pte))
                                referenced++;
                } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
                        if (pmdp_clear_flush_young_notify(vma, address,
                                                pvmw.pmd))
                                referenced++;
                } else {
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
                }

                pra->mapcount--;
        }

        if ((vma->vm_flags & VM_LOCKED) &&
                        folio_test_large(folio) &&
                        folio_within_vma(folio, vma)) {
                unsigned long s_align, e_align;

                s_align = ALIGN_DOWN(start, PMD_SIZE);
                e_align = ALIGN_DOWN(start + folio_size(folio) - 1, PMD_SIZE);

                /* folio doesn't cross page table boundary and fully mapped */
                if ((s_align == e_align) && (ptes == folio_nr_pages(folio))) {
                        /* Restore the mlock which got missed */
                        mlock_vma_folio(folio, vma);
                        pra->vm_flags |= VM_LOCKED;
                        return false; /* To break the loop */
                }
        }

        if (referenced)
                folio_clear_idle(folio);
        if (folio_test_clear_young(folio))
                referenced++;

        /* Count this VMA at most once, however many of its PTEs were young. */
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags & ~VM_LOCKED;
        }

        if (!pra->mapcount)
                return false; /* To break the loop */

        return true;
}

/*
 * invalid_vma callback of folio_referenced(): returns true for VMAs whose
 * references must not be counted.
 */
static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
{
        struct folio_referenced_arg *pra = arg;
        struct mem_cgroup *target = pra->memcg;

        /*
         * Two reasons to skip a VMA:
         *
         * - The mapping has no recency, so its references are ignored. If the
         *   folio has been used in another mapping, we will catch it; if this
         *   other mapping is already gone, the unmap path will have set the
         *   referenced flag or activated the folio in zap_pte_range().
         *
         * - We are reclaiming on behalf of a cgroup and the VMA's mm belongs
         *   to a different cgroup.
         */
        return !vma_has_recency(vma) ||
               (target && !mm_match_cgroup(vma->vm_mm, target));
}

/**
 * folio_referenced() - Test if the folio was referenced.
 * @folio: The folio to test.
 * @is_locked: Caller holds lock on the folio.
 * @memcg: target memory cgroup
 * @vm_flags: A combination of all the vma->vm_flags which referenced the folio.
 *
 * Quick test_and_clear_referenced for all mappings of a folio.
 *
 * If the caller does not hold the folio lock and the folio is a file or KSM
 * folio, the lock is try-locked for the duration of the walk; when that
 * fails the folio is conservatively reported as referenced once.
 *
 * Return: The number of mappings which referenced the folio. Return -1 if
 * the function bailed out due to rmap lock contention, or if the walk
 * callback flagged the folio with -1 (see folio_referenced_one()).
 */
int folio_referenced(struct folio *folio, int is_locked,
                     struct mem_cgroup *memcg, unsigned long *vm_flags)
{
        struct folio_referenced_arg pra = {
                .mapcount = folio_mapcount(folio),
                .memcg = memcg,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = folio_referenced_one,
                .arg = (void *)&pra,
                .anon_lock = folio_lock_anon_vma_read,
                .try_lock = true,
                .invalid_vma = invalid_folio_referenced_vma,
        };
        bool took_lock = false;

        *vm_flags = 0;

        /* Nothing maps the folio, or it has no mapping to walk. */
        if (!pra.mapcount || !folio_raw_mapping(folio))
                return 0;

        if (!is_locked && (!folio_test_anon(folio) || folio_test_ksm(folio))) {
                if (!folio_trylock(folio))
                        return 1;
                took_lock = true;
        }

        rmap_walk(folio, &rwc);
        *vm_flags = pra.vm_flags;

        if (took_lock)
                folio_unlock(folio);

        if (rwc.contended)
                return -1;

        return pra.referenced;
}

/*
 * Write-protect and clean every PTE/PMD found by the walk described in @pvmw
 * that is currently dirty or writable.  The whole walk is bracketed by an
 * MMU notifier invalidation of the range.
 *
 * Returns the number of entries (PTEs or PMDs) that were changed.
 */
static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
        int cleaned = 0;
        struct vm_area_struct *vma = pvmw->vma;
        struct mmu_notifier_range range;
        unsigned long address = pvmw->address;

        /*
         * We have to assume the worse case ie pmd for invalidation. Note that
         * the folio can not be freed from this function.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0,
                                vma->vm_mm, address, vma_address_end(pvmw));
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(pvmw)) {
                /* Set to 1 when this iteration actually modified an entry. */
                int ret = 0;

                address = pvmw->address;
                if (pvmw->pte) {
                        pte_t *pte = pvmw->pte;
                        pte_t entry = ptep_get(pte);

                        /*
                         * PFN swap PTEs, such as device-exclusive ones, that
                         * actually map pages are clean and not writable from a
                         * CPU perspective. The MMU notifier takes care of any
                         * device aspects.
                         */
                        if (!pte_present(entry))
                                continue;
                        /* Already clean and read-only: nothing to do. */
                        if (!pte_dirty(entry) && !pte_write(entry))
                                continue;

                        flush_cache_page(vma, address, pte_pfn(entry));
                        entry = ptep_clear_flush(vma, address, pte);
                        entry = pte_wrprotect(entry);
                        entry = pte_mkclean(entry);
                        set_pte_at(vma->vm_mm, address, pte, entry);
                        ret = 1;
                } else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        pmd_t *pmd = pvmw->pmd;
                        pmd_t entry;

                        /* Already clean and read-only: nothing to do. */
                        if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
                                continue;

                        flush_cache_range(vma, address,
                                          address + HPAGE_PMD_SIZE);
                        entry = pmdp_invalidate(vma, address, pmd);
                        entry = pmd_wrprotect(entry);
                        entry = pmd_mkclean(entry);
                        set_pmd_at(vma->vm_mm, address, pmd, entry);
                        ret = 1;
#else
                        /* unexpected pmd-mapped folio? */
                        WARN_ON_ONCE(1);
#endif
                }

                if (ret)
                        cleaned++;
        }

        mmu_notifier_invalidate_range_end(&range);

        return cleaned;
}

/*
 * rmap_one callback of folio_mkclean(): clean the mappings of @folio in @vma
 * and add the number of cleaned entries to the int that @arg points to.
 * Always returns true so the rmap walk continues.
 */
static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, void *arg)
{
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
        int *total = arg;
        int newly_cleaned = page_vma_mkclean_one(&pvmw);

        *total += newly_cleaned;

        return true;
}

/*
 * invalid_vma callback for the mkclean walkers: only VM_SHARED mappings are
 * processed, every other VMA is reported as invalid. @arg is unused.
 */
static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
{
        return !(vma->vm_flags & VM_SHARED);
}

/*
 * Write-protect and clean all shared mappings of a locked @folio.
 *
 * Returns the number of page table entries that were cleaned; 0 if the folio
 * is not mapped or has no mapping.
 */
int folio_mkclean(struct folio *folio)
{
        int cleaned = 0;
        struct rmap_walk_control rwc = {
                .rmap_one = page_mkclean_one,
                .invalid_vma = invalid_mkclean_vma,
                .arg = (void *)&cleaned,
        };

        BUG_ON(!folio_test_locked(folio));

        if (!folio_mapped(folio) || !folio_mapping(folio))
                return 0;

        rmap_walk(folio, &rwc);

        return cleaned;
}
EXPORT_SYMBOL_GPL(folio_mkclean);

/* Walk state handed to mapping_wrprotect_range_one() via rwc->arg. */
struct wrprotect_file_state {
        /* Running total of entries changed by page_vma_mkclean_one(). */
        int cleaned;
        /* Page offset, first PFN and page count describing the range. */
        pgoff_t pgoff;
        unsigned long pfn;
        unsigned long nr_pages;
};

static bool mapping_wrprotect_range_one(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address, void *arg)
{
        struct wrprotect_file_state *state = (struct wrprotect_file_state *)arg;
        struct page_vma_mapped_walk pvmw = {
                .pfn                = state->pfn,
                .nr_pages        = state->nr_pages,
                .pgoff                = state->pgoff,
                .vma                = vma,
                .address        = address,
                .flags                = PVMW_SYNC,
        };

        state->cleaned += page_vma_mkclean_one(&pvmw);

        return true;
}

/*
 * Forward declaration: mapping_wrprotect_range() below needs to call this
 * before its definition appears.
 */
static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
                             pgoff_t pgoff_start, unsigned long nr_pages,
                             struct rmap_walk_control *rwc, bool locked);

/**
 * mapping_wrprotect_range() - Write-protect all mappings in a specified range.
 *
 * @mapping:        The mapping whose reverse mapping should be traversed.
 * @pgoff:        The page offset at which @pfn is mapped within @mapping.
 * @pfn:        The PFN of the page mapped in @mapping at @pgoff.
 * @nr_pages:        The number of physically contiguous base pages spanned.
 *
 * Traverses the reverse mapping, finding all VMAs which contain a shared
 * mapping of the pages in the specified range in @mapping, and write-protects
 * them (that is, updates the page tables to mark the mappings read-only such
 * that a write protection fault arises when the mappings are written to).
 *
 * The @pfn value need not refer to a folio, but rather can reference a kernel
 * allocation which is mapped into userland. We therefore do not require that
 * the page maps to a folio with a valid mapping or index field, rather the
 * caller specifies these in @mapping and @pgoff.
 *
 * Return: the number of write-protected PTEs; 0 if @mapping is NULL.
 */
int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
                unsigned long pfn, unsigned long nr_pages)
{
        struct wrprotect_file_state state = {
                .pfn = pfn,
                .pgoff = pgoff,
                .nr_pages = nr_pages,
                .cleaned = 0,
        };
        struct rmap_walk_control rwc = {
                .rmap_one = mapping_wrprotect_range_one,
                .invalid_vma = invalid_mkclean_vma,
                .arg = (void *)&state,
        };

        if (mapping) {
                /* No folio is involved, and the walk takes the lock itself. */
                __rmap_walk_file(/* folio = */NULL, mapping, pgoff, nr_pages,
                                 &rwc, /* locked = */false);
        }

        return state.cleaned;
}
EXPORT_SYMBOL_GPL(mapping_wrprotect_range);

/**
 * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
 *                     [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
 *                     within the @vma of shared mappings. And since clean PTEs
 *                     should also be readonly, write protects them too.
 * @pfn: start pfn.
 * @nr_pages: number of physically contiguous pages starting with @pfn.
 * @pgoff: page offset that the @pfn mapped with.
 * @vma: vma that @pfn mapped within.
 *
 * Returns the number of cleaned PTEs (including PMDs); 0 if @vma is not a
 * shared mapping.
 */
int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma)
{
        struct page_vma_mapped_walk walk = {
                .vma = vma,
                .pfn = pfn,
                .pgoff = pgoff,
                .nr_pages = nr_pages,
                .flags = PVMW_SYNC,
        };

        /* Same VMA filter as the rmap-walk based mkclean paths. */
        if (invalid_mkclean_vma(vma, NULL))
                return 0;

        walk.address = vma_address(vma, pgoff, nr_pages);
        VM_BUG_ON_VMA(walk.address == -EFAULT, vma);

        return page_vma_mkclean_one(&walk);
}

/*
 * Account @nr_pages new mappings of @folio, starting at @page, at the given
 * page table @level, by updating the folio's mapcount fields.
 *
 * Returns the page count that the caller passes on to __folio_mod_stat() as
 * the change in mapped pages; for a first PMD-level mapping, *@nr_pmdmapped
 * is additionally set to the folio's page count.
 */
static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level, int *nr_pmdmapped)
{
        atomic_t *mapped = &folio->_nr_pages_mapped;
        /* nr_pages is consumed by the loop below; keep the original count. */
        const int orig_nr_pages = nr_pages;
        int first = 0, nr = 0;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                /* Small folio: a single mapcount, nr is 1 on the first map. */
                if (!folio_test_large(folio)) {
                        nr = atomic_inc_and_test(&folio->_mapcount);
                        break;
                }

                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
                        nr = folio_add_return_large_mapcount(folio, orig_nr_pages, vma);
                        if (nr == orig_nr_pages)
                                /* Was completely unmapped. */
                                nr = folio_large_nr_pages(folio);
                        else
                                nr = 0;
                        break;
                }

                /* Count how many of the pages went from unmapped to mapped. */
                do {
                        first += atomic_inc_and_test(&page->_mapcount);
                } while (page++, --nr_pages > 0);

                if (first &&
                    atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
                        nr = first;

                folio_add_large_mapcount(folio, orig_nr_pages, vma);
                break;
        case RMAP_LEVEL_PMD:
        case RMAP_LEVEL_PUD:
                first = atomic_inc_and_test(&folio->_entire_mapcount);
                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
                        if (level == RMAP_LEVEL_PMD && first)
                                *nr_pmdmapped = folio_large_nr_pages(folio);
                        nr = folio_inc_return_large_mapcount(folio, vma);
                        if (nr == 1)
                                /* Was completely unmapped. */
                                nr = folio_large_nr_pages(folio);
                        else
                                nr = 0;
                        break;
                }

                if (first) {
                        nr = atomic_add_return_relaxed(ENTIRELY_MAPPED, mapped);
                        if (likely(nr < ENTIRELY_MAPPED + ENTIRELY_MAPPED)) {
                                nr_pages = folio_large_nr_pages(folio);
                                /*
                                 * We only track PMD mappings of PMD-sized
                                 * folios separately.
                                 */
                                if (level == RMAP_LEVEL_PMD)
                                        *nr_pmdmapped = nr_pages;
                                /* Exclude the pages already mapped by PTEs. */
                                nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of a remove and another add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* Raced ahead of a remove of ENTIRELY_MAPPED */
                                nr = 0;
                        }
                }
                folio_inc_large_mapcount(folio, vma);
                break;
        }
        return nr;
}

/**
 * folio_move_anon_rmap - move a folio to our anon_vma
 * @folio:        The folio to move to our anon_vma
 * @vma:        The vma the folio belongs to
 *
 * When a folio belongs exclusively to one process after a COW event,
 * that folio can be moved into the anon_vma that belongs to just that
 * process, so the rmap code will not search the parent or sibling processes.
 *
 * The folio must be locked and @vma must already have an anon_vma.
 */
void folio_move_anon_rmap(struct folio *folio, struct vm_area_struct *vma)
{
        struct anon_vma *target = vma->anon_vma;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_VMA(!target, vma);

        /*
         * Publish the anon_vma pointer and the PAGE_MAPPING_ANON bit with a
         * single store, so a concurrent reader (eg folio_referenced()'s
         * folio_test_anon()) will not see one without the other.
         */
        WRITE_ONCE(folio->mapping, (void *)target + PAGE_MAPPING_ANON);
}

/**
 * __folio_set_anon - set up a new anonymous rmap for a folio
 * @folio:        The folio to set up the new anonymous rmap for.
 * @vma:        VM area to add the folio to.
 * @address:        User virtual address of the mapping
 * @exclusive:        Whether the folio is exclusive to the process.
 *
 * Sets folio->mapping to the tagged anon_vma pointer and folio->index to the
 * linear page index of @address in @vma. @vma must have an anon_vma.
 */
static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma,
                             unsigned long address, bool exclusive)
{
        struct anon_vma *owner = vma->anon_vma;
        void *tagged;

        BUG_ON(!owner);

        /*
         * A folio that isn't exclusive to this vma must use the _oldest_
         * possible anon_vma for the folio mapping!
         */
        if (!exclusive)
                owner = owner->root;

        /*
         * page_idle does a lockless/optimistic rmap scan on folio->mapping.
         * Make sure the compiler doesn't split the stores of anon_vma and
         * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
         * could mistake the mapping for a struct address_space and crash.
         */
        tagged = (void *)owner + PAGE_MAPPING_ANON;
        WRITE_ONCE(folio->mapping, (struct address_space *)tagged);
        folio->index = linear_page_index(vma, address);
}

/**
 * __page_check_anon_rmap - sanity check anonymous rmap addition
 * @folio:        The folio containing @page.
 * @page:        the page to check the mapping of
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 *
 * Asserts (via VM_BUG_ON_*) that the folio's anon_vma shares its root with
 * @vma's anon_vma and that the page offset of @page matches the linear page
 * index of @address in @vma.
 */
static void __page_check_anon_rmap(const struct folio *folio,
                const struct page *page, struct vm_area_struct *vma,
                unsigned long address)
{
        /*
         * The page's anon-rmap details (mapping and index) are guaranteed to
         * be set up correctly at this point.
         *
         * We have exclusion against folio_add_anon_rmap_*() because the caller
         * always holds the page locked.
         *
         * We have exclusion against folio_add_new_anon_rmap because those pages
         * are initially only visible via the pagetables, and the pte is locked
         * over the call to folio_add_new_anon_rmap.
         */
        VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root,
                        folio);
        VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address),
                       page);
}

/*
 * Apply mapped-page statistics deltas for @folio.
 *
 * @nr is applied to NR_ANON_MAPPED for anon folios and to NR_FILE_MAPPED
 * otherwise. @nr_pmdmapped is applied to NR_ANON_THPS for anon folios;
 * for other folios it goes to the node-level NR_SHMEM_PMDMAPPED
 * (swap-backed) or NR_FILE_PMDMAPPED counter. A zero delta is skipped, and
 * either delta may be negative (removal paths pass negated counts).
 */
static void __folio_mod_stat(struct folio *folio, int nr, int nr_pmdmapped)
{
        int item;

        if (nr)
                __lruvec_stat_mod_folio(folio,
                                        folio_test_anon(folio) ?
                                        NR_ANON_MAPPED : NR_FILE_MAPPED,
                                        nr);

        if (!nr_pmdmapped)
                return;

        if (folio_test_anon(folio)) {
                __lruvec_stat_mod_folio(folio, NR_ANON_THPS, nr_pmdmapped);
                return;
        }

        /* NR_*_PMDMAPPED are not maintained per-memcg */
        if (folio_test_swapbacked(folio))
                item = NR_SHMEM_PMDMAPPED;
        else
                item = NR_FILE_PMDMAPPED;

        __mod_node_page_state(folio_pgdat(folio), item, nr_pmdmapped);
}

/*
 * Common implementation behind folio_add_anon_rmap_ptes() and
 * folio_add_anon_rmap_pmd(): add mappings of @nr_pages pages of the anon
 * @folio, starting at @page, at rmap @level.
 *
 * In order, this: adds the rmap via __folio_add_rmap(); sanity-checks the
 * folio's recorded mapping/index against @vma and @address (skipped for KSM
 * folios); applies the resulting deltas to the mapped-page statistics; marks
 * the mapped page(s) PageAnonExclusive when @flags contains RMAP_EXCLUSIVE;
 * runs debug checks that exclusive pages are not multiply mapped; and calls
 * mlock_vma_folio() for folios that are not large.
 */
static __always_inline void __folio_add_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags, enum rmap_level level)
{
        int i, nr, nr_pmdmapped = 0;

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        /* nr and nr_pmdmapped are the deltas fed to __folio_mod_stat() below. */
        nr = __folio_add_rmap(folio, page, nr_pages, vma, level, &nr_pmdmapped);

        /* The mapping/index consistency check is not applied to KSM folios. */
        if (likely(!folio_test_ksm(folio)))
                __page_check_anon_rmap(folio, page, vma, address);

        __folio_mod_stat(folio, nr, nr_pmdmapped);

        /*
         * Exclusive marking: at PTE level every page of the range is marked,
         * at PMD level only @page is.
         */
        if (flags & RMAP_EXCLUSIVE) {
                switch (level) {
                case RMAP_LEVEL_PTE:
                        for (i = 0; i < nr_pages; i++)
                                SetPageAnonExclusive(page + i);
                        break;
                case RMAP_LEVEL_PMD:
                        SetPageAnonExclusive(page);
                        break;
                case RMAP_LEVEL_PUD:
                        /*
                         * Keep the compiler happy, we don't support anonymous
                         * PUD mappings.
                         */
                        WARN_ON_ONCE(1);
                        break;
                }
        }

        /*
         * Debug checks. For a small folio, _mapcount starts at -1, so a value
         * above 0 means more than one mapping; that must not coincide with
         * the page being marked exclusive.
         */
        VM_WARN_ON_FOLIO(!folio_test_large(folio) && PageAnonExclusive(page) &&
                         atomic_read(&folio->_mapcount) > 0, folio);
        for (i = 0; i < nr_pages; i++) {
                struct page *cur_page = page + i;

                /* A large folio with several entire mappings can't be exclusive. */
                VM_WARN_ON_FOLIO(folio_test_large(folio) &&
                                 folio_entire_mapcount(folio) > 1 &&
                                 PageAnonExclusive(cur_page), folio);
                /*
                 * The per-page check below reads page->_mapcount, so it is
                 * skipped with CONFIG_NO_PAGE_MAPCOUNT (presumably because
                 * per-page mapcounts are not maintained then -- confirm).
                 */
                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
                        continue;

                /*
                 * While PTE-mapping a THP we have a PMD and a PTE
                 * mapping.
                 */
                VM_WARN_ON_FOLIO(atomic_read(&cur_page->_mapcount) > 0 &&
                                 PageAnonExclusive(cur_page), folio);
        }

        /*
         * For large folio, only mlock it if it's fully mapped to VMA. It's
         * not easy to check whether the large folio is fully mapped to VMA
         * here. Only mlock normal 4K folio and leave page reclaim to handle
         * large folio.
         */
        if (!folio_test_large(folio))
                mlock_vma_folio(folio, vma);
}

/**
 * folio_add_anon_rmap_ptes - add PTE mappings for a run of pages of an anon
 *                            folio
 * @folio:        The folio gaining the mappings
 * @page:        First page of the run being mapped
 * @nr_pages:        Number of pages in the run
 * @vma:        The vm area the mappings are added to
 * @address:        User virtual address at which the first page is mapped
 * @flags:        The rmap flags
 *
 * The pages affected are [@page, @page + @nr_pages).
 *
 * Locking: the caller must hold the page table lock. In the anon_vma case the
 * page must also be locked, both to serialize the mapping,index check that
 * follows their setup and to make sure the anon folio is not concurrently
 * being upgraded to a KSM folio (KSM folios are never downgraded).
 */
void folio_add_anon_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma, unsigned long address,
                rmap_t flags)
{
        __folio_add_anon_rmap(folio, page, nr_pages, vma, address,
                              flags, RMAP_LEVEL_PTE);
}

/**
 * folio_add_anon_rmap_pmd - add one PMD mapping covering pages of an anon
 *                           folio
 * @folio:        The folio gaining the mapping
 * @page:        First page covered by the mapping
 * @vma:        The vm area the mapping is added to
 * @address:        User virtual address at which the first page is mapped
 * @flags:        The rmap flags
 *
 * The pages affected are [@page, @page + HPAGE_PMD_NR).
 *
 * Locking: the caller must hold the page table lock. In the anon_vma case the
 * page must also be locked, to serialize the mapping,index check that follows
 * their setup.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this only emits a one-time warning.
 */
void folio_add_anon_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        WARN_ON_ONCE(true);
#else
        __folio_add_anon_rmap(folio, page, HPAGE_PMD_NR, vma, address,
                              flags, RMAP_LEVEL_PMD);
#endif
}

/**
 * folio_add_new_anon_rmap - Add mapping to a new anonymous folio.
 * @folio:        The folio to add the mapping to.
 * @vma:        the vm area in which the mapping is added
 * @address:        the user virtual address mapped
 * @flags:        The rmap flags
 *
 * Like folio_add_anon_rmap_*() but must only be called on *new* folios.
 * This means the inc-and-test can be bypassed.
 * The folio doesn't necessarily need to be locked while it's exclusive
 * unless two threads map it concurrently. However, the folio must be
 * locked if it's shared.
 *
 * If the folio is pmd-mappable, it is accounted as a THP.
 *
 * The whole folio is treated as mapped: the mapcount fields are set directly
 * rather than incremented, and the mapped-page statistics are charged for
 * every page of the folio.
 */
void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags)
{
        const bool exclusive = flags & RMAP_EXCLUSIVE;
        /* nr: pages charged as mapped; nr_pmdmapped: pages charged as PMD-mapped */
        int nr = 1, nr_pmdmapped = 0;

        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
        /* A shared (non-exclusive) folio must be locked by the caller. */
        VM_WARN_ON_FOLIO(!exclusive && !folio_test_locked(folio), folio);

        /*
         * VM_DROPPABLE mappings don't swap; instead they're just dropped when
         * under memory pressure.
         */
        if (!folio_test_swapbacked(folio) && !(vma->vm_flags & VM_DROPPABLE))
                __folio_set_swapbacked(folio);
        /* Record the anon mapping and index before any mapcount is set. */
        __folio_set_anon(folio, vma, address, exclusive);

        if (likely(!folio_test_large(folio))) {
                /* Small folio: one page, one mapping. */
                /* increment count (starts at -1) */
                atomic_set(&folio->_mapcount, 0);
                if (exclusive)
                        SetPageAnonExclusive(&folio->page);
        } else if (!folio_test_pmd_mappable(folio)) {
                /*
                 * Large but not PMD-mappable: every page of the folio gets
                 * its own mapping, so nr becomes the folio's page count.
                 */
                int i;

                nr = folio_large_nr_pages(folio);
                for (i = 0; i < nr; i++) {
                        struct page *page = folio_page(folio, i);

                        /* Per-page mapcounts are only set with CONFIG_PAGE_MAPCOUNT. */
                        if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                                /* increment count (starts at -1) */
                                atomic_set(&page->_mapcount, 0);
                        if (exclusive)
                                SetPageAnonExclusive(page);
                }

                folio_set_large_mapcount(folio, nr, vma);
                if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                        atomic_set(&folio->_nr_pages_mapped, nr);
        } else {
                /*
                 * PMD-mappable: a single entire mapping. All pages are charged
                 * as mapped and as PMD-mapped; only the head page carries the
                 * exclusive marker.
                 */
                nr = folio_large_nr_pages(folio);
                /* increment count (starts at -1) */
                atomic_set(&folio->_entire_mapcount, 0);
                folio_set_large_mapcount(folio, 1, vma);
                if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                        atomic_set(&folio->_nr_pages_mapped, ENTIRELY_MAPPED);
                if (exclusive)
                        SetPageAnonExclusive(&folio->page);
                nr_pmdmapped = nr;
        }

        /* The nr pages starting at @address must lie entirely inside @vma. */
        VM_WARN_ON_ONCE(address < vma->vm_start ||
                        address + (nr << PAGE_SHIFT) > vma->vm_end);

        __folio_mod_stat(folio, nr, nr_pmdmapped);
        /* One more anon folio of this order. */
        mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}

/*
 * Common implementation behind folio_add_file_rmap_ptes(), _pmd() and _pud():
 * add mappings of @nr_pages pages of the non-anon @folio, starting at @page,
 * at rmap @level, then apply the resulting deltas to the mapped-page
 * statistics.
 */
static __always_inline void __folio_add_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level)
{
        int nr_pmdmapped = 0;
        int nr_mapped;

        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        nr_mapped = __folio_add_rmap(folio, page, nr_pages, vma, level,
                                     &nr_pmdmapped);
        __folio_mod_stat(folio, nr_mapped, nr_pmdmapped);

        /*
         * A large folio should only be mlocked when it is fully mapped to
         * the VMA, which is not easy to determine here; leave large folios
         * to page reclaim and only mlock small ones.
         */
        if (folio_test_large(folio))
                return;

        mlock_vma_folio(folio, vma);
}

/**
 * folio_add_file_rmap_ptes - add PTE mappings for a run of pages of a folio
 * @folio:        The folio gaining the mappings
 * @page:        First page of the run being mapped
 * @nr_pages:        Number of pages that will be mapped with PTEs
 * @vma:        The vm area the mappings are added to
 *
 * The pages affected are [@page, @page + @nr_pages).
 *
 * Locking: the caller must hold the page table lock.
 */
void folio_add_file_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma)
{
        __folio_add_file_rmap(folio, page, nr_pages, vma,
                              RMAP_LEVEL_PTE);
}

/**
 * folio_add_file_rmap_pmd - add one PMD mapping covering pages of a folio
 * @folio:        The folio gaining the mapping
 * @page:        First page covered by the mapping
 * @vma:        The vm area the mapping is added to
 *
 * The pages affected are [@page, @page + HPAGE_PMD_NR).
 *
 * Locking: the caller must hold the page table lock.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this only emits a one-time warning.
 */
void folio_add_file_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        WARN_ON_ONCE(true);
#else
        __folio_add_file_rmap(folio, page, HPAGE_PMD_NR, vma,
                              RMAP_LEVEL_PMD);
#endif
}

/**
 * folio_add_file_rmap_pud - add one PUD mapping covering pages of a folio
 * @folio:        The folio gaining the mapping
 * @page:        First page covered by the mapping
 * @vma:        The vm area the mapping is added to
 *
 * The pages affected are [@page, @page + HPAGE_PUD_NR).
 *
 * Locking: the caller must hold the page table lock.
 *
 * Unless both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are set, this only emits a
 * one-time warning.
 */
void folio_add_file_rmap_pud(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        WARN_ON_ONCE(true);
#else
        __folio_add_file_rmap(folio, page, HPAGE_PUD_NR, vma,
                              RMAP_LEVEL_PUD);
#endif
}

/*
 * Common implementation behind folio_remove_rmap_ptes(), _pmd() and _pud():
 * drop mappings of @nr_pages pages of @folio, starting at @page, at rmap
 * @level.
 *
 * The switch computes three values consumed at the end of the function:
 *   nr               - subtracted from the NR_ANON_MAPPED/NR_FILE_MAPPED stat
 *   nr_pmdmapped     - subtracted from the PMD-mapped stat
 *   partially_mapped - whether an anon large folio should be queued for
 *                      deferred split
 * After that the statistics are updated and munlock_vma_folio() is called.
 */
static __always_inline void __folio_remove_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *vma,
                enum rmap_level level)
{
        atomic_t *mapped = &folio->_nr_pages_mapped;
        int last = 0, nr = 0, nr_pmdmapped = 0;
        bool partially_mapped = false;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        switch (level) {
        case RMAP_LEVEL_PTE:
                if (!folio_test_large(folio)) {
                        /*
                         * Small folio: nr is 1 only if this dropped the last
                         * mapping (_mapcount went negative), else 0.
                         */
                        nr = atomic_add_negative(-1, &folio->_mapcount);
                        break;
                }

                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
                        /*
                         * Only the folio-wide mapcount is consulted here:
                         * all pages are accounted as unmapped once it reaches
                         * zero, and none before that.
                         */
                        nr = folio_sub_return_large_mapcount(folio, nr_pages, vma);
                        if (!nr) {
                                /* Now completely unmapped. */
                                nr = folio_nr_pages(folio);
                        } else {
                                /*
                                 * Fewer remaining mappings than pages and no
                                 * entire mapping: treated as partially mapped.
                                 */
                                partially_mapped = nr < folio_large_nr_pages(folio) &&
                                                   !folio_entire_mapcount(folio);
                                nr = 0;
                        }
                        break;
                }

                folio_sub_large_mapcount(folio, nr_pages, vma);
                /* last counts the pages whose own _mapcount went negative. */
                do {
                        last += atomic_add_negative(-1, &page->_mapcount);
                } while (page++, --nr_pages > 0);

                /*
                 * Those pages are only accounted as unmapped when the
                 * resulting _nr_pages_mapped is below ENTIRELY_MAPPED.
                 */
                if (last &&
                    atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
                        nr = last;

                /* Some pages were unmapped while the counter is still non-zero. */
                partially_mapped = nr && atomic_read(mapped);
                break;
        case RMAP_LEVEL_PMD:
        case RMAP_LEVEL_PUD:
                if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) {
                        /* last: this dropped the final entire mapping. */
                        last = atomic_add_negative(-1, &folio->_entire_mapcount);
                        if (level == RMAP_LEVEL_PMD && last)
                                nr_pmdmapped = folio_large_nr_pages(folio);
                        nr = folio_dec_return_large_mapcount(folio, vma);
                        if (!nr) {
                                /* Now completely unmapped. */
                                nr = folio_large_nr_pages(folio);
                        } else {
                                /*
                                 * The final entire mapping is gone but fewer
                                 * mappings than pages remain.
                                 */
                                partially_mapped = last &&
                                                   nr < folio_large_nr_pages(folio);
                                nr = 0;
                        }
                        break;
                }

                folio_dec_large_mapcount(folio, vma);
                last = atomic_add_negative(-1, &folio->_entire_mapcount);
                if (last) {
                        /* Final entire mapping gone: take ENTIRELY_MAPPED back out. */
                        nr = atomic_sub_return_relaxed(ENTIRELY_MAPPED, mapped);
                        if (likely(nr < ENTIRELY_MAPPED)) {
                                /* nr_pages is reused as the folio's page count. */
                                nr_pages = folio_large_nr_pages(folio);
                                if (level == RMAP_LEVEL_PMD)
                                        nr_pmdmapped = nr_pages;
                                /*
                                 * Pages still counted in the FOLIO_PAGES_MAPPED
                                 * bits stay accounted as mapped; only the rest
                                 * become unmapped now.
                                 */
                                nr = nr_pages - (nr & FOLIO_PAGES_MAPPED);
                                /* Raced ahead of another remove and an add? */
                                if (unlikely(nr < 0))
                                        nr = 0;
                        } else {
                                /* An add of ENTIRELY_MAPPED raced ahead */
                                nr = 0;
                        }
                }

                /*
                 * Some, but not all, of the PMD-mapped pages became unmapped.
                 * nr_pmdmapped stays 0 at PUD level, so this is false there.
                 */
                partially_mapped = nr && nr < nr_pmdmapped;
                break;
        }

        /*
         * Queue anon large folio for deferred split if at least one page of
         * the folio is unmapped and at least one page is still mapped.
         *
         * Check partially_mapped first to ensure it is a large folio.
         */
        if (partially_mapped && folio_test_anon(folio) &&
            !folio_test_partially_mapped(folio))
                deferred_split_folio(folio, true);

        /* Removal: both deltas are applied negated. */
        __folio_mod_stat(folio, -nr, -nr_pmdmapped);

        /*
         * It would be tidy to reset folio_test_anon mapping when fully
         * unmapped, but that might overwrite a racing folio_add_anon_rmap_*()
         * which increments mapcount after us but sets mapping before us:
         * so leave the reset to free_pages_prepare, and remember that
         * it's only reliable while mapped.
         */

        munlock_vma_folio(folio, vma);
}

/**
 * folio_remove_rmap_ptes - drop PTE mappings of a run of pages of a folio
 * @folio:        The folio losing the mappings
 * @page:        First page of the run being unmapped
 * @nr_pages:        Number of pages whose mapping is removed
 * @vma:        The vm area the mappings are removed from
 *
 * The pages affected are [@page, @page + @nr_pages).
 *
 * Locking: the caller must hold the page table lock.
 */
void folio_remove_rmap_ptes(struct folio *folio, struct page *page,
                int nr_pages, struct vm_area_struct *vma)
{
        __folio_remove_rmap(folio, page, nr_pages, vma,
                            RMAP_LEVEL_PTE);
}

/**
 * folio_remove_rmap_pmd - drop one PMD mapping covering pages of a folio
 * @folio:        The folio losing the mapping
 * @page:        First page covered by the mapping
 * @vma:        The vm area the mapping is removed from
 *
 * The pages affected are [@page, @page + HPAGE_PMD_NR).
 *
 * Locking: the caller must hold the page table lock.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE this only emits a one-time warning.
 */
void folio_remove_rmap_pmd(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
        WARN_ON_ONCE(true);
#else
        __folio_remove_rmap(folio, page, HPAGE_PMD_NR, vma,
                            RMAP_LEVEL_PMD);
#endif
}

/**
 * folio_remove_rmap_pud - drop one PUD mapping covering pages of a folio
 * @folio:        The folio losing the mapping
 * @page:        First page covered by the mapping
 * @vma:        The vm area the mapping is removed from
 *
 * The pages affected are [@page, @page + HPAGE_PUD_NR).
 *
 * Locking: the caller must hold the page table lock.
 *
 * Unless both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are set, this only emits a
 * one-time warning.
 */
void folio_remove_rmap_pud(struct folio *folio, struct page *page,
                struct vm_area_struct *vma)
{
#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        WARN_ON_ONCE(true);
#else
        __folio_remove_rmap(folio, page, HPAGE_PUD_NR, vma,
                            RMAP_LEVEL_PUD);
#endif
}

/*
 * Batch unmapping of PTEs is supported for lazyfree large folios.
 *
 * Returns true when @folio is anonymous and not swap-backed, the PTE at
 * @ptep is not marked unused and maps the folio's first pfn, and
 * folio_pte_batch() (ignoring dirty and soft-dirty differences) reports a
 * batch starting at @ptep that spans every page of the folio.
 */
static inline bool can_batch_unmap_folio_ptes(unsigned long addr,
                        struct folio *folio, pte_t *ptep)
{
        int nr_total = folio_nr_pages(folio);
        pte_t first_pte = ptep_get(ptep);

        /* Anything other than an anon, non-swap-backed folio is excluded. */
        if (!(folio_test_anon(folio) && !folio_test_swapbacked(folio)))
                return false;

        /* The batch has to begin at the folio's first page. */
        if (pte_unused(first_pte) || pte_pfn(first_pte) != folio_pfn(folio))
                return false;

        return folio_pte_batch(folio, addr, ptep, first_pte, nr_total,
                               FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY,
                               NULL, NULL, NULL) == nr_total;
}

/*
 * @arg: enum ttu_flags will be passed to this argument
 */
static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        bool anon_exclusive, ret = true;
        pte_t pteval;
        struct page *subpage;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long nr_pages = 1, end_addr;
        unsigned long pfn;
        unsigned long hsz = 0;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
         * try_to_unmap() may return before page_mapped() has become false,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        /*
         * For THP, we have to assume the worse case ie pmd for invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the folio can not be freed in this function as call of
         * try_to_unmap() must hold a reference on the folio.
         */
        range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, range.end);
        if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);

                /* We need the huge page size for set_huge_pte_at() */
                hsz = huge_page_size(hstate_vma(vma));
        }
        mmu_notifier_invalidate_range_start(&range);

        while (page_vma_mapped_walk(&pvmw)) {
                /*
                 * If the folio is in an mlock()d vma, we must not swap it out.
                 */
                if (!(flags & TTU_IGNORE_MLOCK) &&
                    (vma->vm_flags & VM_LOCKED)) {
                        /* Restore the mlock which got missed */
                        if (!folio_test_large(folio))
                                mlock_vma_folio(folio, vma);
                        goto walk_abort;
                }

                if (!pvmw.pte) {
                        if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
                                if (unmap_huge_pmd_locked(vma, pvmw.address, pvmw.pmd, folio))
                                        goto walk_done;
                                /*
                                 * unmap_huge_pmd_locked has either already marked
                                 * the folio as swap-backed or decided to retain it
                                 * due to GUP or speculative references.
                                 */
                                goto walk_abort;
                        }

                        if (flags & TTU_SPLIT_HUGE_PMD) {
                                /*
                                 * We temporarily have to drop the PTL and
                                 * restart so we can process the PTE-mapped THP.
                                 */
                                split_huge_pmd_locked(vma, pvmw.address,
                                                      pvmw.pmd, false, folio);
                                flags &= ~TTU_SPLIT_HUGE_PMD;
                                page_vma_mapped_walk_restart(&pvmw);
                                continue;
                        }
                }

                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                /*
                 * Handle PFN swap PTEs, such as device-exclusive ones, that
                 * actually map pages.
                 */
                pteval = ptep_get(pvmw.pte);
                if (likely(pte_present(pteval))) {
                        pfn = pte_pfn(pteval);
                } else {
                        pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
                        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
                }

                subpage = folio_page(folio, pfn - folio_pfn(folio));
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

                if (folio_test_hugetlb(folio)) {
                        bool anon = folio_test_anon(folio);

                        /*
                         * The try_to_unmap() is only passed a hugetlb page
                         * in the case where the hugetlb page is poisoned.
                         */
                        VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
                        /*
                         * huge_pmd_unshare may unmap an entire PMD page.
                         * There is no way of knowing exactly which PMDs may
                         * be cached for this mm, so we must flush them all.
                         * start/end were already adjusted above to cover this
                         * range.
                         */
                        flush_cache_range(vma, range.start, range.end);

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         *
                         * We also must hold hugetlb vma_lock in write mode.
                         * Lock order dictates acquiring vma_lock BEFORE
                         * i_mmap_rwsem.  We can only try lock here and fail
                         * if unsuccessful.
                         */
                        if (!anon) {
                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                                if (!hugetlb_vma_trylock_write(vma))
                                        goto walk_abort;
                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
                                        hugetlb_vma_unlock_write(vma);
                                        flush_tlb_range(vma,
                                                range.start, range.end);
                                        /*
                                         * The ref count of the PMD page was
                                         * dropped which is part of the way map
                                         * counting is done for shared PMDs.
                                         * Return 'true' here.  When there is
                                         * no other sharing, huge_pmd_unshare
                                         * returns false and we will unmap the
                                         * actual page and drop map count
                                         * to zero.
                                         */
                                        goto walk_done;
                                }
                                hugetlb_vma_unlock_write(vma);
                        }
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                        if (pte_dirty(pteval))
                                folio_mark_dirty(folio);
                } else if (likely(pte_present(pteval))) {
                        if (folio_test_large(folio) && !(flags & TTU_HWPOISON) &&
                            can_batch_unmap_folio_ptes(address, folio, pvmw.pte))
                                nr_pages = folio_nr_pages(folio);
                        end_addr = address + nr_pages * PAGE_SIZE;
                        flush_cache_range(vma, address, end_addr);

                        /* Nuke the page table entry. */
                        pteval = get_and_clear_full_ptes(mm, address, pvmw.pte, nr_pages, 0);
                        /*
                         * We clear the PTE but do not flush so potentially
                         * a remote CPU could still be writing to the folio.
                         * If the entry was previously clean then the
                         * architecture must guarantee that a clear->dirty
                         * transition on a cached TLB entry is written through
                         * and traps if the PTE is unmapped.
                         */
                        if (should_defer_flush(mm, flags))
                                set_tlb_ubc_flush_pending(mm, pteval, address, end_addr);
                        else
                                flush_tlb_range(vma, address, end_addr);
                        if (pte_dirty(pteval))
                                folio_mark_dirty(folio);
                } else {
                        pte_clear(mm, address, pvmw.pte);
                }

                /*
                 * Now the pte is cleared. If this pte was uffd-wp armed,
                 * we may want to replace a none pte with a marker pte if
                 * it's file-backed, so we don't lose the tracking info.
                 */
                pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (PageHWPoison(subpage) && (flags & TTU_HWPOISON)) {
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
                                                hsz);
                        } else {
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
                } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
                           !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(folio));
                } else if (folio_test_anon(folio)) {
                        swp_entry_t entry = page_swap_entry(subpage);
                        pte_t swp_pte;
                        /*
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
                        if (unlikely(folio_test_swapbacked(folio) !=
                                        folio_test_swapcache(folio))) {
                                WARN_ON_ONCE(1);
                                goto walk_abort;
                        }

                        /* MADV_FREE page check */
                        if (!folio_test_swapbacked(folio)) {
                                int ref_count, map_count;

                                /*
                                 * Synchronize with gup_pte_range():
                                 * - clear PTE; barrier; read refcount
                                 * - inc refcount; barrier; read PTE
                                 */
                                smp_mb();

                                ref_count = folio_ref_count(folio);
                                map_count = folio_mapcount(folio);

                                /*
                                 * Order reads for page refcount and dirty flag
                                 * (see comments in __remove_mapping()).
                                 */
                                smp_rmb();

                                if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
                                        /*
                                         * redirtied either using the page table or a previously
                                         * obtained GUP reference.
                                         */
                                        set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
                                        folio_set_swapbacked(folio);
                                        goto walk_abort;
                                } else if (ref_count != 1 + map_count) {
                                        /*
                                         * Additional reference. Could be a GUP reference or any
                                         * speculative reference. GUP users must mark the folio
                                         * dirty if there was a modification. This folio cannot be
                                         * reclaimed right now either way, so act just like nothing
                                         * happened.
                                         * We'll come back here later and detect if the folio was
                                         * dirtied when the additional reference is gone.
                                         */
                                        set_ptes(mm, address, pvmw.pte, pteval, nr_pages);
                                        goto walk_abort;
                                }
                                add_mm_counter(mm, MM_ANONPAGES, -nr_pages);
                                goto discard;
                        }

                        if (swap_duplicate(entry) < 0) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                goto walk_abort;
                        }

                        /*
                         * arch_unmap_one() is expected to be a NOP on
                         * architectures where we could have PFN swap PTEs,
                         * so we'll not check/care.
                         */
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                goto walk_abort;
                        }

                        /* See folio_try_share_anon_rmap(): clear PTE first. */
                        if (anon_exclusive &&
                            folio_try_share_anon_rmap_pte(folio, subpage)) {
                                swap_free(entry);
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                goto walk_abort;
                        }
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
                                        list_add(&mm->mmlist, &init_mm.mmlist);
                                spin_unlock(&mmlist_lock);
                        }
                        dec_mm_counter(mm, MM_ANONPAGES);
                        inc_mm_counter(mm, MM_SWAPENTS);
                        swp_pte = swp_entry_to_pte(entry);
                        if (anon_exclusive)
                                swp_pte = pte_swp_mkexclusive(swp_pte);
                        if (likely(pte_present(pteval))) {
                                if (pte_soft_dirty(pteval))
                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                                if (pte_uffd_wp(pteval))
                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        } else {
                                if (pte_swp_soft_dirty(pteval))
                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                                if (pte_swp_uffd_wp(pteval))
                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        }
                        set_pte_at(mm, address, pvmw.pte, swp_pte);
                } else {
                        /*
                         * This is a locked file-backed folio,
                         * so it cannot be removed from the page
                         * cache and replaced by a new folio before
                         * mmu_notifier_invalidate_range_end, so no
                         * concurrent thread might update its page table
                         * to point at a new folio while a device is
                         * still using this folio.
                         *
                         * See Documentation/mm/mmu_notifier.rst
                         */
                        dec_mm_counter(mm, mm_counter_file(folio));
                }
discard:
                if (unlikely(folio_test_hugetlb(folio))) {
                        hugetlb_remove_rmap(folio);
                } else {
                        folio_remove_rmap_ptes(folio, subpage, nr_pages, vma);
                        folio_ref_sub(folio, nr_pages - 1);
                }
                if (vma->vm_flags & VM_LOCKED)
                        mlock_drain_local();
                folio_put(folio);
                /* We have already batched the entire folio */
                if (nr_pages > 1)
                        goto walk_done;
                continue;
walk_abort:
                ret = false;
walk_done:
                page_vma_mapped_walk_done(&pvmw);
                break;
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/*
 * invalid_vma callback installed by try_to_migrate() for anonymous, non-KSM
 * folios: tells the rmap walk to skip VMAs that vma_is_temporary_stack()
 * reports as temporary stacks. @arg is not used.
 */
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
        bool skip_vma = vma_is_temporary_stack(vma);

        return skip_vma;
}

/*
 * "done" callback for the unmap/migrate rmap walks: returns 1 once
 * folio_mapped() reports that @folio is no longer mapped, 0 otherwise.
 */
static int folio_not_mapped(struct folio *folio)
{
        if (folio_mapped(folio))
                return 0;

        return 1;
}

/**
 * try_to_unmap - Try to remove all page table mappings to a folio.
 * @folio: The folio to unmap.
 * @flags: action and flags
 *
 * Walks the reverse mapping of @folio and runs try_to_unmap_one() on each
 * VMA found, stopping early once the folio is no longer mapped.  Nothing is
 * returned: it is the caller's responsibility to check if the folio is
 * still mapped if needed (use TTU_SYNC to prevent accounting races).
 *
 * When TTU_RMAP_LOCKED is set in @flags the walk is done through
 * rmap_walk_locked(), otherwise through rmap_walk().
 *
 * Context: Caller must hold the folio lock.
 */
void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .arg = (void *)flags,
                .rmap_one = try_to_unmap_one,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
        };

        if (!(flags & TTU_RMAP_LOCKED)) {
                rmap_walk(folio, &rwc);
                return;
        }

        rmap_walk_locked(folio, &rwc);
}

/*
 * try_to_migrate_one - rmap_one callback installed by try_to_migrate().
 * @folio: the folio whose mappings are being replaced
 * @vma: the VMA handed in by the rmap walk
 * @address: the address in @vma handed in by the rmap walk
 * @arg: enum ttu_flags will be passed to this argument.
 *
 * Walks every page table entry in @vma that maps @folio.  Each entry is
 * cleared and then replaced by a migration entry, by a hwpoison entry if the
 * mapped subpage is hwpoisoned, or left cleared if the PTE was present but
 * marked unused and userfaultfd is not armed on @vma.  Afterwards the rmap of
 * the entry is removed and one folio reference is dropped.
 *
 * If TTU_SPLIT_HUGE_PMD is specified any PMD mappings will be split into PTEs
 * containing migration entries.
 *
 * Return: true if the walk of this VMA completed, false if it had to be
 * aborted (which makes the rmap walkers in this file stop walking).
 */
static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
                     unsigned long address, void *arg)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
        bool anon_exclusive, writable, ret = true;
        pte_t pteval;
        struct page *subpage;
        struct mmu_notifier_range range;
        enum ttu_flags flags = (enum ttu_flags)(long)arg;
        unsigned long pfn;
        unsigned long hsz = 0;

        /*
         * When racing against e.g. zap_pte_range() on another cpu,
         * in between its ptep_get_and_clear_full() and folio_remove_rmap_*(),
         * try_to_migrate() may return before page_mapped() has become false,
         * if page table locking is skipped: use TTU_SYNC to wait for that.
         */
        if (flags & TTU_SYNC)
                pvmw.flags = PVMW_SYNC;

        /*
         * unmap_page() in mm/huge_memory.c is the only user of migration with
         * TTU_SPLIT_HUGE_PMD and it wants to freeze.
         */
        if (flags & TTU_SPLIT_HUGE_PMD)
                split_huge_pmd_address(vma, address, true, folio);

        /*
         * For THP, we have to assume the worst case, i.e. pmd, for
         * invalidation.
         * For hugetlb, it could be much worse if we need to do pud
         * invalidation in the case of pmd sharing.
         *
         * Note that the page can not be freed in this function as the caller
         * of try_to_unmap() must hold a reference on the page.
         */
        range.end = vma_address_end(&pvmw);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                address, range.end);
        if (folio_test_hugetlb(folio)) {
                /*
                 * If sharing is possible, start and end will be adjusted
                 * accordingly.
                 */
                adjust_range_if_pmd_sharing_possible(vma, &range.start,
                                                     &range.end);

                /* We need the huge page size for set_huge_pte_at() */
                hsz = huge_page_size(hstate_vma(vma));
        }
        mmu_notifier_invalidate_range_start(&range);

        /* One iteration per page table entry in @vma that maps @folio. */
        while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
                /* PMD-mapped THP migration entry */
                if (!pvmw.pte) {
                        subpage = folio_page(folio,
                                pmd_pfn(*pvmw.pmd) - folio_pfn(folio));
                        VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
                                        !folio_test_pmd_mappable(folio), folio);

                        /* A non-zero return aborts the walk with failure. */
                        if (set_pmd_migration_entry(&pvmw, subpage)) {
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }
                        continue;
                }
#endif

                /* Unexpected PMD-mapped THP? */
                VM_BUG_ON_FOLIO(!pvmw.pte, folio);

                /*
                 * Handle PFN swap PTEs, such as device-exclusive ones, that
                 * actually map pages.
                 */
                pteval = ptep_get(pvmw.pte);
                if (likely(pte_present(pteval))) {
                        pfn = pte_pfn(pteval);
                } else {
                        pfn = swp_offset_pfn(pte_to_swp_entry(pteval));
                        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
                }

                /* Page within the folio that this entry refers to. */
                subpage = folio_page(folio, pfn - folio_pfn(folio));
                address = pvmw.address;
                anon_exclusive = folio_test_anon(folio) &&
                                 PageAnonExclusive(subpage);

                if (folio_test_hugetlb(folio)) {
                        bool anon = folio_test_anon(folio);

                        /*
                         * huge_pmd_unshare may unmap an entire PMD page.
                         * There is no way of knowing exactly which PMDs may
                         * be cached for this mm, so we must flush them all.
                         * start/end were already adjusted above to cover this
                         * range.
                         */
                        flush_cache_range(vma, range.start, range.end);

                        /*
                         * To call huge_pmd_unshare, i_mmap_rwsem must be
                         * held in write mode.  Caller needs to explicitly
                         * do this outside rmap routines.
                         *
                         * We also must hold hugetlb vma_lock in write mode.
                         * Lock order dictates acquiring vma_lock BEFORE
                         * i_mmap_rwsem.  We can only try lock here and
                         * fail if unsuccessful.
                         */
                        if (!anon) {
                                VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
                                if (!hugetlb_vma_trylock_write(vma)) {
                                        page_vma_mapped_walk_done(&pvmw);
                                        ret = false;
                                        break;
                                }
                                if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
                                        hugetlb_vma_unlock_write(vma);
                                        flush_tlb_range(vma,
                                                range.start, range.end);

                                        /*
                                         * The ref count of the PMD page was
                                         * dropped which is part of the way map
                                         * counting is done for shared PMDs.
                                         * Return 'true' here.  When there is
                                         * no other sharing, huge_pmd_unshare
                                         * returns false and we will unmap the
                                         * actual page and drop map count
                                         * to zero.
                                         */
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                                hugetlb_vma_unlock_write(vma);
                        }
                        /* Nuke the hugetlb page table entry */
                        pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
                        if (pte_dirty(pteval))
                                folio_mark_dirty(folio);
                        writable = pte_write(pteval);
                } else if (likely(pte_present(pteval))) {
                        flush_cache_page(vma, address, pfn);
                        /* Nuke the page table entry. */
                        if (should_defer_flush(mm, flags)) {
                                /*
                                 * We clear the PTE but do not flush so potentially
                                 * a remote CPU could still be writing to the folio.
                                 * If the entry was previously clean then the
                                 * architecture must guarantee that a clear->dirty
                                 * transition on a cached TLB entry is written through
                                 * and traps if the PTE is unmapped.
                                 */
                                pteval = ptep_get_and_clear(mm, address, pvmw.pte);

                                set_tlb_ubc_flush_pending(mm, pteval, address, address + PAGE_SIZE);
                        } else {
                                pteval = ptep_clear_flush(vma, address, pvmw.pte);
                        }
                        if (pte_dirty(pteval))
                                folio_mark_dirty(folio);
                        writable = pte_write(pteval);
                } else {
                        /*
                         * Non-present PFN swap PTE: clear it; writability is
                         * derived from the swap entry type.
                         */
                        pte_clear(mm, address, pvmw.pte);
                        writable = is_writable_device_private_entry(pte_to_swp_entry(pteval));
                }

                VM_WARN_ON_FOLIO(writable && folio_test_anon(folio) &&
                                !anon_exclusive, folio);

                /* Update high watermark before we lower rss */
                update_hiwater_rss(mm);

                if (PageHWPoison(subpage)) {
                        VM_WARN_ON_FOLIO(folio_is_device_private(folio), folio);

                        /* Install a hwpoison entry instead of a migration entry. */
                        pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
                        if (folio_test_hugetlb(folio)) {
                                hugetlb_count_sub(folio_nr_pages(folio), mm);
                                set_huge_pte_at(mm, address, pvmw.pte, pteval,
                                                hsz);
                        } else {
                                dec_mm_counter(mm, mm_counter(folio));
                                set_pte_at(mm, address, pvmw.pte, pteval);
                        }
                } else if (likely(pte_present(pteval)) && pte_unused(pteval) &&
                           !userfaultfd_armed(vma)) {
                        /*
                         * The guest indicated that the page content is of no
                         * interest anymore. Simply discard the pte, vmscan
                         * will take care of the rest.
                         * A future reference will then fault in a new zero
                         * page. When userfaultfd is active, we must not drop
                         * this page though, as its main user (postcopy
                         * migration) will not expect userfaults on already
                         * copied pages.
                         */
                        dec_mm_counter(mm, mm_counter(folio));
                } else {
                        swp_entry_t entry;
                        pte_t swp_pte;

                        /*
                         * arch_unmap_one() is expected to be a NOP on
                         * architectures where we could have PFN swap PTEs,
                         * so we'll not check/care.
                         */
                        if (arch_unmap_one(mm, vma, address, pteval) < 0) {
                                /* Put the original entry back and abort. */
                                if (folio_test_hugetlb(folio))
                                        set_huge_pte_at(mm, address, pvmw.pte,
                                                        pteval, hsz);
                                else
                                        set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * See folio_try_share_anon_rmap_pte(): clear PTE first.
                         * On failure the original entry is restored and the
                         * walk is aborted.
                         */
                        if (folio_test_hugetlb(folio)) {
                                if (anon_exclusive &&
                                    hugetlb_try_share_anon_rmap(folio)) {
                                        set_huge_pte_at(mm, address, pvmw.pte,
                                                        pteval, hsz);
                                        ret = false;
                                        page_vma_mapped_walk_done(&pvmw);
                                        break;
                                }
                        } else if (anon_exclusive &&
                                   folio_try_share_anon_rmap_pte(folio, subpage)) {
                                set_pte_at(mm, address, pvmw.pte, pteval);
                                ret = false;
                                page_vma_mapped_walk_done(&pvmw);
                                break;
                        }

                        /*
                         * Store the pfn of the page in a special migration
                         * pte. do_swap_page() will wait until the migration
                         * pte is removed and then restart fault handling.
                         */
                        if (writable)
                                entry = make_writable_migration_entry(
                                                        page_to_pfn(subpage));
                        else if (anon_exclusive)
                                entry = make_readable_exclusive_migration_entry(
                                                        page_to_pfn(subpage));
                        else
                                entry = make_readable_migration_entry(
                                                        page_to_pfn(subpage));
                        /*
                         * Carry the young/dirty/soft-dirty/uffd-wp state of
                         * the old entry over into the migration entry.
                         */
                        if (likely(pte_present(pteval))) {
                                if (pte_young(pteval))
                                        entry = make_migration_entry_young(entry);
                                if (pte_dirty(pteval))
                                        entry = make_migration_entry_dirty(entry);
                                swp_pte = swp_entry_to_pte(entry);
                                if (pte_soft_dirty(pteval))
                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                                if (pte_uffd_wp(pteval))
                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        } else {
                                swp_pte = swp_entry_to_pte(entry);
                                if (pte_swp_soft_dirty(pteval))
                                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                                if (pte_swp_uffd_wp(pteval))
                                        swp_pte = pte_swp_mkuffd_wp(swp_pte);
                        }
                        if (folio_test_hugetlb(folio))
                                set_huge_pte_at(mm, address, pvmw.pte, swp_pte,
                                                hsz);
                        else
                                set_pte_at(mm, address, pvmw.pte, swp_pte);
                        trace_set_migration_pte(address, pte_val(swp_pte),
                                                folio_order(folio));
                        /*
                         * No need to invalidate here; it will synchronize
                         * against the special swap migration pte.
                         */
                }

                /*
                 * The entry no longer maps the folio: remove its rmap and
                 * drop one folio reference.
                 */
                if (unlikely(folio_test_hugetlb(folio)))
                        hugetlb_remove_rmap(folio);
                else
                        folio_remove_rmap_pte(folio, subpage, vma);
                if (vma->vm_flags & VM_LOCKED)
                        mlock_drain_local();
                folio_put(folio);
        }

        mmu_notifier_invalidate_range_end(&range);

        return ret;
}

/**
 * try_to_migrate - try to replace all page table mappings with swap entries
 * @folio: the folio to replace page table entries for
 * @flags: action and flags
 *
 * Walks the reverse mapping of @folio and runs try_to_migrate_one() on each
 * VMA, replacing the page table entries which map this folio with special
 * swap entries.  The walk stops early once the folio is no longer mapped.
 *
 * Nothing is done (apart from a one-time warning) if @flags contains a flag
 * other than TTU_RMAP_LOCKED, TTU_SPLIT_HUGE_PMD, TTU_SYNC or
 * TTU_BATCH_FLUSH, and nothing is done for zone-device folios that are
 * neither device-private nor device-coherent.
 *
 * Context: Caller must hold the folio lock.
 */
void try_to_migrate(struct folio *folio, enum ttu_flags flags)
{
        struct rmap_walk_control rwc = {
                .arg = (void *)flags,
                .rmap_one = try_to_migrate_one,
                .done = folio_not_mapped,
                .anon_lock = folio_lock_anon_vma_read,
        };

        /*
         * Migration always ignores mlock; the flags listed below are the
         * only ones it supports.
         */
        if (WARN_ON_ONCE(flags & ~(TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
                                        TTU_SYNC | TTU_BATCH_FLUSH)))
                return;

        /* Of the zone-device folios, only private/coherent ones are handled. */
        if (folio_is_zone_device(folio) &&
            !(folio_is_device_private(folio) || folio_is_device_coherent(folio)))
                return;

        /*
         * exec() sets up a temporary VMA and moves it later.  The move is
         * done under the anon_vma lock, but the page tables are not covered,
         * so migration could fail to find its migration ptes.  Instead of
         * making exec() take more locks, temporary VMAs are skipped until
         * exec() has completed.
         */
        if (!folio_test_ksm(folio)) {
                if (folio_test_anon(folio))
                        rwc.invalid_vma = invalid_migration_vma;
        }

        if (!(flags & TTU_RMAP_LOCKED)) {
                rmap_walk(folio, &rwc);
                return;
        }

        rmap_walk_locked(folio, &rwc);
}

#ifdef CONFIG_DEVICE_PRIVATE
/**
 * make_device_exclusive() - Mark a page for exclusive use by a device
 * @mm: mm_struct of associated target process
 * @addr: the virtual address to mark for exclusive device access; it is
 *        rounded down to a page boundary before use
 * @owner: passed to MMU_NOTIFY_EXCLUSIVE range notifier to allow filtering
 * @foliop: folio pointer will be stored here on success; it is not written
 *          on failure.
 *
 * This function looks up the page mapped at the given address, grabs a
 * folio reference, locks the folio and replaces the PTE with special
 * device-exclusive PFN swap entry, preventing access through the process
 * page tables. The function will return with the folio locked and referenced.
 *
 * On fault, the device-exclusive entries are replaced with the original PTE
 * under folio lock, after calling MMU notifiers.
 *
 * Only anonymous non-hugetlb folios are supported and the VMA must have
 * write permissions such that we can fault in the anonymous page writable
 * in order to mark it exclusive. The caller must hold the mmap_lock in read
 * mode.
 *
 * A driver using this to program access from a device must use a mmu notifier
 * critical section to hold a device specific lock during programming. Once
 * programming is complete it should drop the folio lock and reference after
 * which point CPU access to the page will revoke the exclusive access.
 *
 * Notes:
 *   #. This function always operates on individual PTEs mapping individual
 *      pages. PMD-sized THPs are first remapped to be mapped by PTEs before
 *      the conversion happens on a single PTE corresponding to @addr.
 *   #. While concurrent access through the process page tables is prevented,
 *      concurrent access through other page references (e.g., earlier GUP
 *      invocation) is not handled and not supported.
 *   #. device-exclusive entries are considered "clean" and "old" by core-mm.
 *      Device drivers must update the folio state when informed by MMU
 *      notifiers.
 *
 * Returns: pointer to mapped page on success, otherwise an ERR_PTR()-encoded
 * negative error: the error from the GUP lookup, -EOPNOTSUPP for folios that
 * are not anonymous or are hugetlb, or the error from folio_lock_killable().
 */
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
                void *owner, struct folio **foliop)
{
        struct mmu_notifier_range range;
        struct folio *folio, *fw_folio;
        struct vm_area_struct *vma;
        struct folio_walk fw;
        struct page *page;
        swp_entry_t entry;
        pte_t swp_pte;
        int ret;

        mmap_assert_locked(mm);
        addr = PAGE_ALIGN_DOWN(addr);

        /*
         * Fault in the page writable and try to lock it; note that if the
         * address would already be marked for exclusive use by a device,
         * the GUP call would undo that first by triggering a fault.
         *
         * If any other device would already map this page exclusively, the
         * fault will trigger a conversion to an ordinary
         * (non-device-exclusive) PTE and issue a MMU_NOTIFY_EXCLUSIVE.
         */
retry:
        page = get_user_page_vma_remote(mm, addr,
                                        FOLL_GET | FOLL_WRITE | FOLL_SPLIT_PMD,
                                        &vma);
        /* Hand the GUP error back to the caller unchanged. */
        if (IS_ERR(page))
                return page;
        folio = page_folio(page);

        /* Unsupported folio type: drop the reference again and bail out. */
        if (!folio_test_anon(folio) || folio_test_hugetlb(folio)) {
                folio_put(folio);
                return ERR_PTR(-EOPNOTSUPP);
        }

        /* Locking failed: drop the reference and report the error. */
        ret = folio_lock_killable(folio);
        if (ret) {
                folio_put(folio);
                return ERR_PTR(ret);
        }

        /*
         * Inform secondary MMUs that we are going to convert this PTE to
         * device-exclusive, such that they unmap it now. Note that the
         * caller must filter this event out to prevent livelocks.
         */
        mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
                                      mm, addr, addr + PAGE_SIZE, owner);
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Let's do a second walk and make sure we still find the same page
         * mapped writable. Note that any page of an anonymous folio can
         * only be mapped writable using exactly one PTE ("exclusive"), so
         * there cannot be other mappings.
         */
        fw_folio = folio_walk_start(&fw, vma, addr, 0);
        if (fw_folio != folio || fw.page != page ||
            fw.level != FW_LEVEL_PTE || !pte_write(fw.pte)) {
                /*
                 * Something changed under us: end the walk (if one was
                 * started), end the notifier section, unlock and drop the
                 * folio, then start over from the lookup.
                 */
                if (fw_folio)
                        folio_walk_end(&fw, vma);
                mmu_notifier_invalidate_range_end(&range);
                folio_unlock(folio);
                folio_put(folio);
                goto retry;
        }

        /* Nuke the page table entry so we get the uptodate dirty bit. */
        flush_cache_page(vma, addr, page_to_pfn(page));
        fw.pte = ptep_clear_flush(vma, addr, fw.ptep);

        /* Set the dirty flag on the folio now the PTE is gone. */
        if (pte_dirty(fw.pte))
                folio_mark_dirty(folio);

        /*
         * Store the pfn of the page in a special device-exclusive PFN swap PTE.
         * do_swap_page() will trigger the conversion back while holding the
         * folio lock.
         */
        entry = make_device_exclusive_entry(page_to_pfn(page));
        swp_pte = swp_entry_to_pte(entry);
        /* Preserve the soft-dirty state of the PTE that was just cleared. */
        if (pte_soft_dirty(fw.pte))
                swp_pte = pte_swp_mksoft_dirty(swp_pte);
        /* The pte is writable, uffd-wp does not apply. */
        set_pte_at(mm, addr, fw.ptep, swp_pte);

        folio_walk_end(&fw, vma);
        mmu_notifier_invalidate_range_end(&range);
        /* Success: the folio is returned still locked and referenced. */
        *foliop = folio;
        return page;
}
EXPORT_SYMBOL_GPL(make_device_exclusive);
#endif

/*
 * Free @anon_vma and, unless it is its own root, drop one reference on the
 * root anon_vma, freeing the root as well when that was its last reference.
 *
 * NOTE(review): presumably only called once the last reference to @anon_vma
 * itself is gone -- confirm against the callers.
 */
void __put_anon_vma(struct anon_vma *anon_vma)
{
        /* Must be read before @anon_vma is freed below. */
        struct anon_vma *root_av = anon_vma->root;

        anon_vma_free(anon_vma);

        /* A root anon_vma has no separate root reference to drop. */
        if (root_av == anon_vma)
                return;

        if (atomic_dec_and_test(&root_av->refcount))
                anon_vma_free(root_av);
}

/*
 * Look up and read-lock the anon_vma of @folio for an rmap walk.
 *
 * If @rwc provides its own anon_lock callback, that callback decides the
 * result.  Otherwise the folio's anon_vma is read-locked: first with a
 * trylock, and if that fails either by blocking or, when @rwc->try_lock is
 * set, by giving up and flagging @rwc->contended.
 *
 * Returns the anon_vma to walk, or NULL if there is none or the lock was
 * contended in try_lock mode.
 */
static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio,
                                            struct rmap_walk_control *rwc)
{
        struct anon_vma *av;

        if (rwc->anon_lock)
                return rwc->anon_lock(folio, rwc);

        /*
         * Note: folio_lock_anon_vma_read() is not usable for
         * remove_migration_ptes() since it depends on page_mapped(), and
         * not every caller of it holds mmap_lock.  Users without mmap_lock
         * have to take a reference count so that the anon_vma cannot
         * disappear.
         */
        av = folio_anon_vma(folio);
        if (!av)
                return NULL;

        /* Fast path: lock was free. */
        if (anon_vma_trylock_read(av))
                return av;

        /* Caller asked us not to block: report contention instead. */
        if (rwc->try_lock) {
                rwc->contended = true;
                return NULL;
        }

        anon_vma_lock_read(av);
        return av;
}

/*
 * rmap_walk_anon - walk all mappings of an anonymous folio using the
 * object-based rmap method
 * @folio: the folio to be handled
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Every VMA on the folio's anon_vma interval tree that overlaps the folio's
 * page offsets is visited: VMAs rejected by @rwc->invalid_vma are skipped,
 * @rwc->rmap_one is called on the others.  The walk ends early when rmap_one
 * returns false or when @rwc->done reports completion.  If @locked is false
 * the anon_vma lock is taken here and released before returning.
 */
static void rmap_walk_anon(struct folio *folio,
                struct rmap_walk_control *rwc, bool locked)
{
        struct anon_vma *anon_vma;
        struct anon_vma_chain *chain;
        pgoff_t first_pgoff, last_pgoff;

        if (!locked) {
                anon_vma = rmap_walk_anon_lock(folio, rwc);
        } else {
                anon_vma = folio_anon_vma(folio);
                /* anon_vma disappear under us? */
                VM_BUG_ON_FOLIO(!anon_vma, folio);
        }
        if (!anon_vma)
                return;

        first_pgoff = folio_pgoff(folio);
        last_pgoff = first_pgoff + folio_nr_pages(folio) - 1;
        anon_vma_interval_tree_foreach(chain, &anon_vma->rb_root,
                        first_pgoff, last_pgoff) {
                struct vm_area_struct *vma = chain->vma;
                unsigned long address = vma_address(vma, first_pgoff,
                                folio_nr_pages(folio));

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on callback failure or once the walk is complete. */
                if (!rwc->rmap_one(folio, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(folio)))
                        break;
        }

        if (!locked)
                anon_vma_unlock_read(anon_vma);
}

/**
 * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping
 * of a page mapped within a specified page cache object at a specified offset.
 *
 * @folio:       Either the folio whose mappings to traverse, or if NULL,
 *               the callbacks specified in @rwc will be configured such
 *               as to be able to look up mappings correctly.
 * @mapping:     The page cache object whose mapping VMAs we intend to
 *               traverse. If @folio is non-NULL, this should be equal to
 *               folio_mapping(folio).
 * @pgoff_start: The offset within @mapping of the page which we are
 *               looking up. If @folio is non-NULL, this should be equal
 *               to folio_pgoff(folio).
 * @nr_pages:    The number of pages mapped by the mapping. If @folio is
 *               non-NULL, this should be equal to folio_nr_pages(folio).
 * @rwc:         The reverse mapping walk control object describing how
 *               the traversal should proceed.
 * @locked:      Is the @mapping already locked? If not, we acquire the
 *               lock for reading here and release it before returning. With
 *               @rwc->try_lock set, a contended lock makes us return
 *               without walking after setting @rwc->contended.
 */
static void __rmap_walk_file(struct folio *folio, struct address_space *mapping,
                             pgoff_t pgoff_start, unsigned long nr_pages,
                             struct rmap_walk_control *rwc, bool locked)
{
        pgoff_t pgoff_end = pgoff_start + nr_pages - 1;
        struct vm_area_struct *vma;

        /* With a folio given, the explicit arguments must describe it. */
        VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio);
        VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio);
        VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio);

        /* Try the lock first; only block if the caller allows it. */
        if (!locked && !i_mmap_trylock_read(mapping)) {
                if (rwc->try_lock) {
                        rwc->contended = true;
                        return;
                }

                i_mmap_lock_read(mapping);
        }

        vma_interval_tree_foreach(vma, &mapping->i_mmap,
                        pgoff_start, pgoff_end) {
                unsigned long address = vma_address(vma, pgoff_start, nr_pages);

                VM_BUG_ON_VMA(address == -EFAULT, vma);
                cond_resched();

                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;

                /* Stop on callback failure or once the walk is complete. */
                if (!rwc->rmap_one(folio, vma, address, rwc->arg) ||
                    (rwc->done && rwc->done(folio)))
                        break;
        }

        if (!locked)
                i_mmap_unlock_read(mapping);
}

/*
 * rmap_walk_file - walk the file-backed mappings of a locked folio
 * @folio: the folio to be handled; the caller must hold the folio lock
 * @rwc: control variable according to each walk type
 * @locked: caller holds relevant rmap lock
 *
 * Returns without doing anything when the folio has no mapping. Otherwise the
 * folio's mapping, index and page count are handed to __rmap_walk_file(),
 * which visits the VMAs found through the address_space.
 */
static void rmap_walk_file(struct folio *folio,
                struct rmap_walk_control *rwc, bool locked)
{
        /*
         * Holding the folio lock keeps truncation from suddenly clearing
         * folio->mapping and keeps the structure it points to from being
         * freed and reused, so mapping->i_mmap_rwsem can be taken safely.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (folio->mapping)
                __rmap_walk_file(folio, folio->mapping, folio->index,
                                 folio_nr_pages(folio), rwc, locked);
}

/*
 * Run a reverse-map walk on @folio, picking the KSM, anonymous or file walker
 * according to the folio type. The walkers are told no rmap lock is held.
 */
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
        if (unlikely(folio_test_ksm(folio))) {
                rmap_walk_ksm(folio, rwc);
                return;
        }

        if (folio_test_anon(folio)) {
                rmap_walk_anon(folio, rwc, false);
                return;
        }

        rmap_walk_file(folio, rwc, false);
}

/*
 * Like rmap_walk(), but for callers that already hold the relevant rmap lock.
 * KSM folios are not handled on this path.
 */
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
        /* no ksm support for now */
        VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);

        if (!folio_test_anon(folio)) {
                rmap_walk_file(folio, rwc, true);
                return;
        }

        rmap_walk_anon(folio, rwc, true);
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * The following two functions are for anonymous (private mapped) hugepages.
 * Unlike common anonymous pages, anonymous hugepages have no accounting code
 * and no lru code, because we handle hugepages differently from common pages.
 */
/*
 * Record one more mapping of an anonymous hugetlb folio: bump both the entire
 * and the large mapcount, and mark the folio's first page anon-exclusive when
 * @flags contains RMAP_EXCLUSIVE. @vma and @address are not used here.
 */
void hugetlb_add_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
                unsigned long address, rmap_t flags)
{
        struct page *first = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
        if (flags & RMAP_EXCLUSIVE)
                SetPageAnonExclusive(first);

        /* Warn if an exclusive page has an entire mapcount above one. */
        VM_WARN_ON_FOLIO(folio_entire_mapcount(folio) > 1 &&
                         PageAnonExclusive(first), folio);
}

/*
 * Set up the first mapping of a new anonymous hugetlb folio at @address in
 * @vma: reset both mapcounts to 0, clear the restore-reserve flag, attach the
 * folio to the VMA's anon mapping and mark its first page anon-exclusive.
 * @address must lie within [vma->vm_start, vma->vm_end).
 */
void hugetlb_add_new_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma, unsigned long address)
{
        struct page *first = &folio->page;

        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        BUG_ON(!(address >= vma->vm_start && address < vma->vm_end));

        /* increment count (starts at -1) */
        atomic_set(&folio->_entire_mapcount, 0);
        atomic_set(&folio->_large_mapcount, 0);

        folio_clear_hugetlb_restore_reserve(folio);
        __folio_set_anon(folio, vma, address, true);
        SetPageAnonExclusive(first);
}
#endif /* CONFIG_HUGETLB_PAGE */






















































































































































































































































































































































































































































































































































































































































































































































    3 








    3 









































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 */

#include <linux/ethtool.h>
#include <net/netdev_lock.h>

#include "ipvlan.h"

/* Switch @port to mode @nval; the caller must hold RTNL.
 *
 * Every slave gets IFF_NOARP set for the L3/L3S modes and cleared otherwise.
 * Entering L3S calls ipvlan_l3s_register(); leaving it calls
 * ipvlan_l3s_unregister(). On failure the flag changes applied so far are
 * undone according to the (still unchanged) port->mode and the error is
 * returned. Returns 0 on success or when the port already is in @nval.
 */
static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval,
                                struct netlink_ext_ack *extack)
{
        struct ipvl_dev *ipvlan;
        unsigned int flags;
        bool noarp;
        int err;

        ASSERT_RTNL();
        if (port->mode == nval)
                return 0;

        noarp = nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S;
        list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
                flags = ipvlan->dev->flags;
                if (noarp)
                        flags |= IFF_NOARP;
                else
                        flags &= ~IFF_NOARP;

                err = dev_change_flags(ipvlan->dev, flags, extack);
                if (unlikely(err))
                        goto fail;
        }

        if (nval == IPVLAN_MODE_L3S) {
                /* New mode is L3S */
                err = ipvlan_l3s_register(port);
                if (err)
                        goto fail;
        } else if (port->mode == IPVLAN_MODE_L3S) {
                /* Old mode was L3S */
                ipvlan_l3s_unregister(port);
        }

        port->mode = nval;
        return 0;

fail:
        /* Undo the flags changes that have been done so far. */
        noarp = port->mode == IPVLAN_MODE_L3 ||
                port->mode == IPVLAN_MODE_L3S;
        list_for_each_entry_continue_reverse(ipvlan, &port->ipvlans, pnode) {
                flags = ipvlan->dev->flags;
                if (noarp)
                        flags |= IFF_NOARP;
                else
                        flags &= ~IFF_NOARP;

                dev_change_flags(ipvlan->dev, flags, NULL);
        }

        return err;
}

/* Allocate and initialise the ipvl_port for @dev and register
 * ipvlan_handle_frame as the device's rx handler. The port starts in L3 mode
 * with dev_id_start set to 1, and takes a tracked reference on @dev.
 *
 * Returns 0, -ENOMEM if the allocation fails, or the error reported by
 * netdev_rx_handler_register() (the port is freed in that case).
 */
static int ipvlan_port_create(struct net_device *dev)
{
        struct ipvl_port *port;
        int bucket, ret;

        port = kzalloc(sizeof(*port), GFP_KERNEL);
        if (!port)
                return -ENOMEM;

        write_pnet(&port->pnet, dev_net(dev));
        port->dev = dev;
        port->mode = IPVLAN_MODE_L3;
        port->dev_id_start = 1;

        INIT_LIST_HEAD(&port->ipvlans);
        for (bucket = 0; bucket < IPVLAN_HASH_SIZE; bucket++)
                INIT_HLIST_HEAD(&port->hlhead[bucket]);

        skb_queue_head_init(&port->backlog);
        INIT_WORK(&port->wq, ipvlan_process_multicast);
        ida_init(&port->ida);

        ret = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
        if (ret) {
                kfree(port);
                return ret;
        }

        netdev_hold(dev, &port->dev_tracker, GFP_KERNEL);
        return 0;
}

/* Tear down the port attached to @dev: release the tracked device reference,
 * unregister l3s (only in L3S mode) and the rx handler, cancel the multicast
 * work, drain the backlog queue and finally free the port.
 */
static void ipvlan_port_destroy(struct net_device *dev)
{
        struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
        struct sk_buff *skb;

        netdev_put(dev, &port->dev_tracker);
        if (port->mode == IPVLAN_MODE_L3S)
                ipvlan_l3s_unregister(port);
        netdev_rx_handler_unregister(dev);
        cancel_work_sync(&port->wq);

        /* dev_put() the device of every queued skb before freeing the skb. */
        for (skb = __skb_dequeue(&port->backlog); skb;
             skb = __skb_dequeue(&port->backlog)) {
                dev_put(skb->dev);
                kfree_skb(skb);
        }

        ida_destroy(&port->ida);
        kfree(port);
}

/* Offload bits that ipvlan_init() ORs into a slave's vlan_features. */
#define IPVLAN_ALWAYS_ON_OFLOADS \
        (NETIF_F_SG | NETIF_F_HW_CSUM | \
         NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL)

/* Bits that ipvlan_init() and ipvlan_fix_features() always set on a slave. */
#define IPVLAN_ALWAYS_ON \
        (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_VLAN_CHALLENGED)

/* Mask applied to the lower device's features before a slave takes them. */
#define IPVLAN_FEATURES \
        (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
         NETIF_F_GSO | NETIF_F_ALL_TSO | NETIF_F_GSO_ROBUST | \
         NETIF_F_GRO | NETIF_F_RXCSUM | \
         NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)

        /* NETIF_F_GSO_ENCAP_ALL NETIF_F_GSO_SOFTWARE Newly added */

/* Link-state bits that ipvlan_init() copies from the lower device. */
#define IPVLAN_STATE_MASK \
        ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))

/* ndo_init handler. Copies the masked link state, feature sets, TSO limits
 * and hard_header_len from the lower device, allocates the per-cpu stats and
 * creates the port on the lower device when it does not have one yet.
 * port->count is incremented on success.
 *
 * Returns 0, -ENOMEM, or the error from ipvlan_port_create().
 */
static int ipvlan_init(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct net_device *lower = ipvlan->phy_dev;
        struct ipvl_port *port;
        int err;

        dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
                     (lower->state & IPVLAN_STATE_MASK);
        dev->features = (lower->features & IPVLAN_FEATURES) |
                        IPVLAN_ALWAYS_ON;
        dev->vlan_features = (lower->vlan_features & IPVLAN_FEATURES) |
                             IPVLAN_ALWAYS_ON_OFLOADS;
        dev->hw_enc_features |= dev->features;
        dev->lltx = true;
        netif_inherit_tso_max(dev, lower);
        dev->hard_header_len = lower->hard_header_len;

        netdev_lockdep_set_classes(dev);

        ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats);
        if (!ipvlan->pcpu_stats)
                return -ENOMEM;

        /* The first slave on a lower device has to create the port. */
        if (!netif_is_ipvlan_port(lower)) {
                err = ipvlan_port_create(lower);
                if (err < 0) {
                        free_percpu(ipvlan->pcpu_stats);
                        return err;
                }
        }

        port = ipvlan_port_get_rtnl(lower);
        port->count++;
        return 0;
}

/* ndo_uninit handler. Frees the per-cpu stats, drops this slave from the
 * port's count and destroys the port when the count reaches zero.
 */
static void ipvlan_uninit(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port;

        free_percpu(ipvlan->pcpu_stats);

        port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
        if (--port->count == 0)
                ipvlan_port_destroy(port->dev);
}

/* ndo_open handler. Sets IFF_NOARP for the L3/L3S port modes and clears it
 * for any other mode, then adds each address on the slave's list to the
 * address hash table. Always returns 0.
 */
static int ipvlan_open(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_addr *entry;

        switch (ipvlan->port->mode) {
        case IPVLAN_MODE_L3:
        case IPVLAN_MODE_L3S:
                dev->flags |= IFF_NOARP;
                break;
        default:
                dev->flags &= ~IFF_NOARP;
                break;
        }

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &ipvlan->addrs, anode)
                ipvlan_ht_addr_add(ipvlan, entry);
        rcu_read_unlock();

        return 0;
}

/* ndo_stop handler. Unsyncs the slave's unicast and multicast address lists
 * from the lower device and removes each of the slave's addresses from the
 * address hash table. Always returns 0.
 */
static int ipvlan_stop(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_addr *entry;

        dev_uc_unsync(ipvlan->phy_dev, dev);
        dev_mc_unsync(ipvlan->phy_dev, dev);

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &ipvlan->addrs, anode)
                ipvlan_ht_addr_del(entry);
        rcu_read_unlock();

        return 0;
}

/* ndo_start_xmit handler. Passes @skb to ipvlan_queue_xmit() and accounts the
 * result in the per-cpu stats: tx packets/bytes for NET_XMIT_SUCCESS and
 * NET_XMIT_CN, a tx drop for anything else. Returns the queueing result.
 */
static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_pcpu_stats *pcptr;
        /* Length is captured before the skb is handed to ipvlan_queue_xmit(). */
        int skblen = skb->len;
        int ret;

        ret = ipvlan_queue_xmit(skb, dev);
        if (unlikely(ret != NET_XMIT_SUCCESS && ret != NET_XMIT_CN)) {
                this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
                return ret;
        }

        pcptr = this_cpu_ptr(ipvlan->pcpu_stats);

        u64_stats_update_begin(&pcptr->syncp);
        u64_stats_inc(&pcptr->tx_pkts);
        u64_stats_add(&pcptr->tx_bytes, skblen);
        u64_stats_update_end(&pcptr->syncp);

        return ret;
}

/* ndo_fix_features handler. Limits the requested @features to what the slave
 * is allowed (ipvlan->sfeatures within IPVLAN_FEATURES), combines them with
 * the lower device's features, forces IPVLAN_ALWAYS_ON and masks the result
 * to IPVLAN_FEATURES | IPVLAN_ALWAYS_ON.
 */
static netdev_features_t ipvlan_fix_features(struct net_device *dev,
                                             netdev_features_t features)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        netdev_features_t lower = ipvlan->phy_dev->features;
        netdev_features_t want;

        want = (features | NETIF_F_ALL_FOR_ALL) &
               (ipvlan->sfeatures | ~IPVLAN_FEATURES);
        want = netdev_increment_features(lower, want, want);
        want |= IPVLAN_ALWAYS_ON;

        return want & (IPVLAN_FEATURES | IPVLAN_ALWAYS_ON);
}

/* ndo_change_rx_flags handler. When IFF_ALLMULTI is among the changed bits,
 * adjusts the lower device's allmulti count by +1 if the slave now has the
 * flag set and by -1 otherwise.
 */
static void ipvlan_change_rx_flags(struct net_device *dev, int change)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        int delta;

        if (!(change & IFF_ALLMULTI))
                return;

        delta = (dev->flags & IFF_ALLMULTI) ? 1 : -1;
        dev_set_allmulti(ipvlan->phy_dev, delta);
}

/* ndo_set_rx_mode handler. Rebuilds the slave's multicast MAC hash filter
 * (all bits set in promiscuous/allmulti mode, otherwise one bit per multicast
 * address plus the broadcast address) and syncs the slave's unicast and
 * multicast address lists to the lower device.
 */
static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);

        if (!(dev->flags & (IFF_PROMISC | IFF_ALLMULTI))) {
                DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE);
                struct netdev_hw_addr *ha;

                bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE);
                netdev_for_each_mc_addr(ha, dev)
                        __set_bit(ipvlan_mac_hash(ha->addr), mc_filters);

                /* Turn-on broadcast bit irrespective of address family,
                 * since broadcast is deferred to a work-queue, hence no
                 * impact on fast-path processing.
                 */
                __set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters);

                bitmap_copy(ipvlan->mac_filters, mc_filters,
                            IPVLAN_MAC_FILTER_SIZE);
        } else {
                bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE);
        }

        dev_uc_sync(ipvlan->phy_dev, dev);
        dev_mc_sync(ipvlan->phy_dev, dev);
}

/* ndo_get_stats64 handler. Adds up the per-cpu counters of every possible CPU
 * into @s. The 64-bit counters are read under the u64_stats sequence; the
 * 32-bit error/drop counters are read with READ_ONCE(). rx_errors and
 * rx_dropped are both set from the rx_errs total. tx_errors comes from the
 * device-level stats and is filled in even without per-cpu stats.
 */
static void ipvlan_get_stats64(struct net_device *dev,
                               struct rtnl_link_stats64 *s)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);

        if (ipvlan->pcpu_stats) {
                u64 in_pkts, in_bytes, in_mcast, out_pkts, out_bytes;
                u32 err_total = 0, drop_total = 0;
                struct ipvl_pcpu_stats *pcptr;
                u32 seq;
                int cpu;

                for_each_possible_cpu(cpu) {
                        pcptr = per_cpu_ptr(ipvlan->pcpu_stats, cpu);

                        /* Retry until a consistent snapshot is read. */
                        do {
                                seq = u64_stats_fetch_begin(&pcptr->syncp);
                                in_pkts = u64_stats_read(&pcptr->rx_pkts);
                                in_bytes = u64_stats_read(&pcptr->rx_bytes);
                                in_mcast = u64_stats_read(&pcptr->rx_mcast);
                                out_pkts = u64_stats_read(&pcptr->tx_pkts);
                                out_bytes = u64_stats_read(&pcptr->tx_bytes);
                        } while (u64_stats_fetch_retry(&pcptr->syncp, seq));

                        s->rx_packets += in_pkts;
                        s->rx_bytes += in_bytes;
                        s->multicast += in_mcast;
                        s->tx_packets += out_pkts;
                        s->tx_bytes += out_bytes;

                        /* u32 values are updated without syncp protection. */
                        err_total += READ_ONCE(pcptr->rx_errs);
                        drop_total += READ_ONCE(pcptr->tx_drps);
                }

                s->rx_errors = err_total;
                s->rx_dropped = err_total;
                s->tx_dropped = drop_total;
        }

        s->tx_errors = DEV_STATS_READ(dev, tx_errors);
}

/* ndo_vlan_rx_add_vid handler: add the VLAN id on the lower device and return
 * the result of vlan_vid_add().
 */
static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
{
        struct ipvl_dev *priv = netdev_priv(dev);

        return vlan_vid_add(priv->phy_dev, proto, vid);
}

/* ndo_vlan_rx_kill_vid handler: remove the VLAN id from the lower device.
 * Always returns 0.
 */
static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
                                   u16 vid)
{
        struct ipvl_dev *priv = netdev_priv(dev);

        vlan_vid_del(priv->phy_dev, proto, vid);

        return 0;
}

static int ipvlan_get_iflink(const struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);

        return READ_ONCE(ipvlan->phy_dev->ifindex);
}

/* net_device_ops installed on every slave by ipvlan_link_setup(). The address
 * of this table is also what netif_is_ipvlan() compares against to recognise
 * a slave device.
 */
static const struct net_device_ops ipvlan_netdev_ops = {
        .ndo_init                = ipvlan_init,
        .ndo_uninit                = ipvlan_uninit,
        .ndo_open                = ipvlan_open,
        .ndo_stop                = ipvlan_stop,
        .ndo_start_xmit                = ipvlan_start_xmit,
        .ndo_fix_features        = ipvlan_fix_features,
        .ndo_change_rx_flags        = ipvlan_change_rx_flags,
        .ndo_set_rx_mode        = ipvlan_set_multicast_mac_filter,
        .ndo_get_stats64        = ipvlan_get_stats64,
        .ndo_vlan_rx_add_vid        = ipvlan_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid        = ipvlan_vlan_rx_kill_vid,
        .ndo_get_iflink                = ipvlan_get_iflink,
};

/* header_ops->create handler: build the link-layer header through the lower
 * device. When no source address is given, the lower device's dev_addr is
 * used. Returns the result of dev_hard_header().
 */
static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
                              unsigned short type, const void *daddr,
                              const void *saddr, unsigned len)
{
        const struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct net_device *lower = ipvlan->phy_dev;

        /* TODO Probably use a different field than dev_addr so that the
         * mac-address on the virtual device is portable and can be carried
         * while the packets use the mac-addr on the physical device.
         */
        if (!saddr)
                saddr = lower->dev_addr;

        return dev_hard_header(skb, lower, type, daddr, saddr, len);
}

/* header_ops installed on every slave by ipvlan_link_setup(): header creation
 * goes through ipvlan_hard_header(), everything else uses the generic
 * Ethernet helpers.
 */
static const struct header_ops ipvlan_header_ops = {
        .create          = ipvlan_hard_header,
        .parse                = eth_header_parse,
        .cache                = eth_header_cache,
        .cache_update        = eth_header_cache_update,
        .parse_protocol        = eth_header_parse_protocol,
};

/* Copy the MTU of @dev onto the slave device owned by @ipvlan. */
static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
{
        struct net_device *slave = ipvlan->dev;

        slave->mtu = dev->mtu;
}

/* Tell whether @dev is an ipvlan slave, judged by its netdev_ops pointer. */
static bool netif_is_ipvlan(const struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        /* both ipvlan and ipvtap devices use the same netdev_ops */
        return ops == &ipvlan_netdev_ops;
}

/* ethtool get_link_ksettings handler: report the lower device's settings. */
static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev,
                                             struct ethtool_link_ksettings *cmd)
{
        const struct ipvl_dev *priv = netdev_priv(dev);
        struct net_device *lower = priv->phy_dev;

        return __ethtool_get_link_ksettings(lower, cmd);
}

/* ethtool get_drvinfo handler: fill in the driver name and version strings. */
static void ipvlan_ethtool_get_drvinfo(struct net_device *dev,
                                       struct ethtool_drvinfo *drvinfo)
{
        /* The two fields are independent, so the copy order does not matter. */
        strscpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version));
        strscpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver));
}

/* ethtool get_msglevel handler: return the slave's msg_enable value. */
static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev)
{
        const struct ipvl_dev *priv = netdev_priv(dev);
        u32 level = priv->msg_enable;

        return level;
}

/* ethtool set_msglevel handler: store @value as the slave's msg_enable. */
static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value)
{
        struct ipvl_dev *priv = netdev_priv(dev);

        priv->msg_enable = value;
}

/* ethtool_ops installed on every slave by ipvlan_link_setup(). Link settings
 * are taken from the lower device; driver info and message level are handled
 * locally.
 */
static const struct ethtool_ops ipvlan_ethtool_ops = {
        .get_link        = ethtool_op_get_link,
        .get_link_ksettings        = ipvlan_ethtool_get_link_ksettings,
        .get_drvinfo        = ipvlan_ethtool_get_drvinfo,
        .get_msglevel        = ipvlan_ethtool_get_msglevel,
        .set_msglevel        = ipvlan_ethtool_set_msglevel,
};

/* rtnl changelink handler. Applies IFLA_IPVLAN_MODE (through
 * ipvlan_set_port_mode()) and then IFLA_IPVLAN_FLAGS to the port; both are
 * per-port settings. The flags are only applied if the mode change, when
 * requested, succeeded.
 *
 * Returns 0 when there is nothing to do or on success, -EPERM without
 * CAP_NET_ADMIN in the lower device's user namespace, or the error from
 * ipvlan_set_port_mode().
 */
static int ipvlan_nl_changelink(struct net_device *dev,
                                struct nlattr *tb[], struct nlattr *data[],
                                struct netlink_ext_ack *extack)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
        u16 flags;
        int err;

        if (!data)
                return 0;
        if (!ns_capable(dev_net(ipvlan->phy_dev)->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (data[IFLA_IPVLAN_MODE]) {
                u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

                err = ipvlan_set_port_mode(port, nmode, extack);
                if (err)
                        return err;
        }

        if (!data[IFLA_IPVLAN_FLAGS])
                return 0;

        flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

        if (flags & IPVLAN_F_PRIVATE)
                ipvlan_mark_private(port);
        else
                ipvlan_clear_private(port);

        if (flags & IPVLAN_F_VEPA)
                ipvlan_mark_vepa(port);
        else
                ipvlan_clear_vepa(port);

        return 0;
}

/* Netlink space needed for the attributes written by ipvlan_nl_fillinfo(). */
static size_t ipvlan_nl_getsize(const struct net_device *dev)
{
        size_t size = 0;

        size += nla_total_size(2); /* IFLA_IPVLAN_MODE */
        size += nla_total_size(2); /* IFLA_IPVLAN_FLAGS */

        return size;
}

/* rtnl validate handler. Rejects with -EINVAL a mode at or above
 * IPVLAN_MODE_MAX, any flag bit other than PRIVATE/VEPA, and PRIVATE and VEPA
 * set together. Returns 0 otherwise, including when @data is NULL.
 */
static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[],
                              struct netlink_ext_ack *extack)
{
        const u16 known = IPVLAN_F_PRIVATE | IPVLAN_F_VEPA;

        if (!data)
                return 0;

        if (data[IFLA_IPVLAN_MODE] &&
            nla_get_u16(data[IFLA_IPVLAN_MODE]) >= IPVLAN_MODE_MAX)
                return -EINVAL;

        if (data[IFLA_IPVLAN_FLAGS]) {
                u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

                /* Only two bits are used at this moment. */
                if (flags & ~known)
                        return -EINVAL;
                /* Also both flags can't be active at the same time. */
                if ((flags & known) == known)
                        return -EINVAL;
        }

        return 0;
}

/* rtnl fill_info handler: emit the port's mode and flags as u16 attributes.
 * Returns 0 on success, -EINVAL when the lower device has no port, or
 * -EMSGSIZE when an attribute does not fit into @skb.
 */
static int ipvlan_nl_fillinfo(struct sk_buff *skb,
                              const struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);

        if (!port)
                return -EINVAL;

        /* The flags attribute is only attempted once the mode went in. */
        if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode) ||
            nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags))
                return -EMSGSIZE;

        return 0;
}

/* rtnl newlink handler: create the ipvlan slave @dev on top of the lower
 * device named by IFLA_LINK (looked up in the link netns from @params).
 *
 * The slave is registered first (ipvlan_init() creates the port if needed),
 * then it gets a dev_id from the port's IDA, is linked as an upper device of
 * the lower device, has the requested flags/mode applied to the port, and is
 * finally added to the port's slave list. Each failure after registration
 * unwinds the steps already taken, in reverse order.
 *
 * Returns 0 on success; -EINVAL if IFLA_LINK is missing or the lower device
 * is not Ethernet or is loopback; -ENODEV if the lower device is not found;
 * -EPERM without CAP_NET_ADMIN when stacking on another slave; -EBUSY if the
 * lower device already has an rx handler; or the error from
 * register_netdevice(), ida_alloc_range(), netdev_upper_dev_link() or
 * ipvlan_set_port_mode().
 */
int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params,
                    struct netlink_ext_ack *extack)
{
        struct net *link_net = rtnl_newlink_link_net(params);
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct nlattr **data = params->data;
        struct nlattr **tb = params->tb;
        struct ipvl_port *port;
        struct net_device *phy_dev;
        int err;
        u16 mode = IPVLAN_MODE_L3;

        if (!tb[IFLA_LINK])
                return -EINVAL;

        phy_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK]));
        if (!phy_dev)
                return -ENODEV;

        if (netif_is_ipvlan(phy_dev)) {
                /* IFLA_LINK names another slave: use that slave's lower
                 * device instead, which requires CAP_NET_ADMIN in its netns.
                 */
                struct ipvl_dev *tmp = netdev_priv(phy_dev);

                phy_dev = tmp->phy_dev;
                if (!ns_capable(dev_net(phy_dev)->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
        } else if (!netif_is_ipvlan_port(phy_dev)) {
                /* Exit early if the underlying link is invalid or busy */
                if (phy_dev->type != ARPHRD_ETHER ||
                    phy_dev->flags & IFF_LOOPBACK) {
                        netdev_err(phy_dev,
                                   "Master is either lo or non-ether device\n");
                        return -EINVAL;
                }

                if (netdev_is_rx_handler_busy(phy_dev)) {
                        netdev_err(phy_dev, "Device is already in use.\n");
                        return -EBUSY;
                }
        }

        ipvlan->phy_dev = phy_dev;
        ipvlan->dev = dev;
        ipvlan->sfeatures = IPVLAN_FEATURES;
        /* Inherit the lower device's MTU unless the user supplied one. */
        if (!tb[IFLA_MTU])
                ipvlan_adjust_mtu(ipvlan, phy_dev);
        INIT_LIST_HEAD(&ipvlan->addrs);
        spin_lock_init(&ipvlan->addrs_lock);

        /* TODO Probably put random address here to be presented to the
         * world but keep using the physical-dev address for the outgoing
         * packets.
         */
        eth_hw_addr_set(dev, phy_dev->dev_addr);

        dev->priv_flags |= IFF_NO_RX_HANDLER;

        err = register_netdevice(dev);
        if (err < 0)
                return err;

        /* ipvlan_init() would have created the port, if required */
        port = ipvlan_port_get_rtnl(phy_dev);
        ipvlan->port = port;

        /* If the port-id base is at the MAX value, then wrap it around and
         * begin from 0x1 again. This may be due to a busy system where lots
         * of slaves are getting created and deleted.
         */
        if (port->dev_id_start == 0xFFFE)
                port->dev_id_start = 0x1;

        /* Since L2 address is shared among all IPvlan slaves including
         * master, use unique 16 bit dev-ids to differentiate among them.
         * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
         * slave link [see addrconf_ifid_eui48()].
         */
        err = ida_alloc_range(&port->ida, port->dev_id_start, 0xFFFD,
                              GFP_KERNEL);
        /* Nothing free at or above the base: retry in the range below it. */
        if (err < 0)
                err = ida_alloc_range(&port->ida, 0x1, port->dev_id_start - 1,
                                      GFP_KERNEL);
        if (err < 0)
                goto unregister_netdev;
        dev->dev_id = err;

        /* Increment id-base to the next slot for the future assignment */
        port->dev_id_start = err + 1;

        err = netdev_upper_dev_link(phy_dev, dev, extack);
        if (err)
                goto remove_ida;

        /* Flags are per port and latest update overrides. User has
         * to be consistent in setting it just like the mode attribute.
         */
        if (data && data[IFLA_IPVLAN_FLAGS])
                port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

        /* Without an explicit mode attribute the default L3 mode is applied. */
        if (data && data[IFLA_IPVLAN_MODE])
                mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

        err = ipvlan_set_port_mode(port, mode, extack);
        if (err)
                goto unlink_netdev;

        list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
        netif_stacked_transfer_operstate(phy_dev, dev);
        return 0;

        /* Error unwind: undo the completed steps in reverse order. */
unlink_netdev:
        netdev_upper_dev_unlink(phy_dev, dev);
remove_ida:
        ida_free(&port->ida, dev->dev_id);
unregister_netdev:
        unregister_netdevice(dev);
        return err;
}
EXPORT_SYMBOL_GPL(ipvlan_link_new);

/* Tear down one ipvlan slave: drop all of its addresses, return its dev_id
 * to the port's IDA, unlink it from the port's slave list and queue the
 * netdev for unregistration on @head.
 */
void ipvlan_link_delete(struct net_device *dev, struct list_head *head)
{
        struct ipvl_dev *slave = netdev_priv(dev);
        struct ipvl_addr *cur, *tmp;

        /* Unhash and unlink every address under the address lock; the
         * entries themselves are released through kfree_rcu().
         */
        spin_lock_bh(&slave->addrs_lock);
        list_for_each_entry_safe(cur, tmp, &slave->addrs, anode) {
                ipvlan_ht_addr_del(cur);
                list_del_rcu(&cur->anode);
                kfree_rcu(cur, rcu);
        }
        spin_unlock_bh(&slave->addrs_lock);

        ida_free(&slave->port->ida, dev->dev_id);
        list_del_rcu(&slave->pnode);
        unregister_netdevice_queue(dev, head);
        netdev_upper_dev_unlink(slave->phy_dev, dev);
}
EXPORT_SYMBOL_GPL(ipvlan_link_delete);

/* Link setup callback (installed as .setup in ipvlan_link_ops): start from
 * the generic Ethernet defaults, then apply the ipvlan-specific flags and
 * operation tables.
 */
void ipvlan_link_setup(struct net_device *dev)
{
        ether_setup(dev);

        /* Operation tables. */
        dev->netdev_ops = &ipvlan_netdev_ops;
        dev->header_ops = &ipvlan_header_ops;
        dev->ethtool_ops = &ipvlan_ethtool_ops;

        /* Private flag adjustments applied after ether_setup(). */
        dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
        dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;

        dev->max_mtu = ETH_MAX_MTU;
        dev->needs_free_netdev = true;
}
EXPORT_SYMBOL_GPL(ipvlan_link_setup);

/* Netlink attribute policy for ipvlan links: both the mode and the flags
 * attributes are declared as NLA_U16. Installed as ops->policy by
 * ipvlan_link_register().
 */
static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] =
{
        [IFLA_IPVLAN_MODE] = { .type = NLA_U16 },
        [IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 },
};

static struct net *ipvlan_get_link_net(const struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);

        return dev_net(ipvlan->phy_dev);
}

/* rtnetlink link operations for the "ipvlan" kind. Only the callbacks that
 * are set statically appear here; the remaining netlink callbacks are filled
 * in by ipvlan_link_register() before registration.
 */
static struct rtnl_link_ops ipvlan_link_ops = {
        .kind                = "ipvlan",
        .priv_size        = sizeof(struct ipvl_dev),

        .setup                = ipvlan_link_setup,
        .newlink        = ipvlan_link_new,
        .dellink        = ipvlan_link_delete,
        .get_link_net   = ipvlan_get_link_net,
};

/* Install the ipvlan netlink callbacks and attribute policy on @ops, then
 * register @ops with rtnetlink. Returns whatever rtnl_link_register()
 * returns.
 */
int ipvlan_link_register(struct rtnl_link_ops *ops)
{
        /* Attribute description. */
        ops->maxtype = IFLA_IPVLAN_MAX;
        ops->policy = ipvlan_nl_policy;

        /* Netlink callbacks. */
        ops->validate = ipvlan_nl_validate;
        ops->changelink = ipvlan_nl_changelink;
        ops->get_size = ipvlan_nl_getsize;
        ops->fill_info = ipvlan_nl_fillinfo;

        return rtnl_link_register(ops);
}
EXPORT_SYMBOL_GPL(ipvlan_link_register);

/* Netdevice notifier callback (see ipvlan_notifier_block).
 *
 * Only events on devices that are ipvlan ports are handled; for anything
 * else NOTIFY_DONE is returned immediately. For a port, the event is
 * propagated to, or acted upon for, every slave on port->ipvlans.
 *
 * Returns NOTIFY_DONE in the normal case, NOTIFY_BAD for
 * NETDEV_PRE_TYPE_CHANGE, and notifier_from_errno() of the first failure
 * for NETDEV_PRE_CHANGEADDR.
 */
static int ipvlan_device_event(struct notifier_block *unused,
                               unsigned long event, void *ptr)
{
        struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
        struct netdev_notifier_pre_changeaddr_info *prechaddr_info;
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct ipvl_dev *ipvlan, *next;
        struct ipvl_port *port;
        LIST_HEAD(lst_kill);
        int err;

        if (!netif_is_ipvlan_port(dev))
                return NOTIFY_DONE;

        port = ipvlan_port_get_rtnl(dev);

        switch (event) {
        case NETDEV_UP:
        case NETDEV_DOWN:
        case NETDEV_CHANGE:
                /* Mirror the lower device's operstate onto every slave. */
                list_for_each_entry(ipvlan, &port->ipvlans, pnode)
                        netif_stacked_transfer_operstate(ipvlan->phy_dev,
                                                         ipvlan->dev);
                break;

        case NETDEV_REGISTER: {
                struct net *oldnet, *newnet = dev_net(dev);

                /* Nothing to do unless the port's recorded netns differs
                 * from the device's current one.
                 */
                oldnet = read_pnet(&port->pnet);
                if (net_eq(newnet, oldnet))
                        break;

                write_pnet(&port->pnet, newnet);

                if (port->mode == IPVLAN_MODE_L3S)
                        ipvlan_migrate_l3s_hook(oldnet, newnet);
                break;
        }
        case NETDEV_UNREGISTER:
                /* Act only while the device is actually unregistering. */
                if (dev->reg_state != NETREG_UNREGISTERING)
                        break;

                /* Queue every slave on lst_kill through its own dellink
                 * handler, then unregister them all in one batch.
                 */
                list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode)
                        ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev,
                                                            &lst_kill);
                unregister_netdevice_many(&lst_kill);
                break;

        case NETDEV_FEAT_CHANGE:
                list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
                        netif_inherit_tso_max(ipvlan->dev, dev);
                        netdev_update_features(ipvlan->dev);
                }
                break;

        case NETDEV_CHANGEMTU:
                list_for_each_entry(ipvlan, &port->ipvlans, pnode)
                        ipvlan_adjust_mtu(ipvlan, dev);
                break;

        case NETDEV_PRE_CHANGEADDR:
                /* Give each slave the chance to veto the address change;
                 * the first error aborts the walk and is reported back.
                 */
                prechaddr_info = ptr;
                list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
                        err = dev_pre_changeaddr_notify(ipvlan->dev,
                                                    prechaddr_info->dev_addr,
                                                    extack);
                        if (err)
                                return notifier_from_errno(err);
                }
                break;

        case NETDEV_CHANGEADDR:
                /* Slaves take over the lower device's new hardware address
                 * and announce the change themselves.
                 */
                list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
                        eth_hw_addr_set(ipvlan->dev, dev->dev_addr);
                        call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev);
                }
                break;

        case NETDEV_PRE_TYPE_CHANGE:
                /* Forbid underlying device to change its type. */
                return NOTIFY_BAD;

        case NETDEV_NOTIFY_PEERS:
        case NETDEV_BONDING_FAILOVER:
        case NETDEV_RESEND_IGMP:
                /* Re-raise the same event on every slave. Last case of the
                 * switch, so no break is needed before the common return.
                 */
                list_for_each_entry(ipvlan, &port->ipvlans, pnode)
                        call_netdevice_notifiers(event, ipvlan->dev);
        }
        return NOTIFY_DONE;
}

/* Record a new IPv4 or IPv6 address on @ipvlan.
 *
 * The caller must hold the addrs lock, which is why the allocation uses
 * GFP_ATOMIC. Returns 0 on success or -ENOMEM when the entry cannot be
 * allocated.
 */
static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
{
        struct ipvl_addr *entry = kzalloc(sizeof(*entry), GFP_ATOMIC);

        if (!entry)
                return -ENOMEM;

        entry->master = ipvlan;
        if (!is_v6) {
                entry->atype = IPVL_IPV4;
                memcpy(&entry->ip4addr, iaddr, sizeof(struct in_addr));
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                entry->atype = IPVL_IPV6;
                memcpy(&entry->ip6addr, iaddr, sizeof(struct in6_addr));
#endif
        }

        list_add_tail_rcu(&entry->anode, &ipvlan->addrs);

        /* When the interface is not running, hashing is left to
         * ipvlan_open.
         */
        if (netif_running(ipvlan->dev))
                ipvlan_ht_addr_add(ipvlan, entry);

        return 0;
}

/* Remove the address @iaddr from @ipvlan, if present. The entry is unhashed
 * and unlinked under the address lock and freed via kfree_rcu() after the
 * lock has been dropped.
 */
static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
{
        struct ipvl_addr *victim;

        spin_lock_bh(&ipvlan->addrs_lock);
        victim = ipvlan_find_addr(ipvlan, iaddr, is_v6);
        if (victim) {
                ipvlan_ht_addr_del(victim);
                list_del_rcu(&victim->anode);
        }
        spin_unlock_bh(&ipvlan->addrs_lock);

        /* Nothing was found: nothing to free. */
        if (!victim)
                return;

        kfree_rcu(victim, rcu);
}

static bool ipvlan_is_valid_dev(const struct net_device *dev)
{
        struct ipvl_dev *ipvlan = netdev_priv(dev);

        if (!netif_is_ipvlan(dev))
                return false;

        if (!ipvlan || !ipvlan->port)
                return false;

        return true;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Add an IPv6 address to @ipvlan unless the port already has it in use.
 * Returns ipvlan_add_addr()'s result, or -EINVAL (after logging) when the
 * address is busy. The busy check and the insertion happen under the
 * address lock.
 */
static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
{
        int ret;

        spin_lock_bh(&ipvlan->addrs_lock);
        if (!ipvlan_addr_busy(ipvlan->port, ip6_addr, true)) {
                ret = ipvlan_add_addr(ipvlan, ip6_addr, true);
        } else {
                netif_err(ipvlan, ifup, ipvlan->dev,
                          "Failed to add IPv6=%pI6c addr for %s intf\n",
                          ip6_addr, ipvlan->dev->name);
                ret = -EINVAL;
        }
        spin_unlock_bh(&ipvlan->addrs_lock);

        return ret;
}

/* IPv6 flavour of ipvlan_del_addr(). */
static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
{
        ipvlan_del_addr(ipvlan, ip6_addr, true);
}

/* inet6addr notifier: track IPv6 addresses coming up or going down on ipvlan
 * devices. Returns NOTIFY_DONE for devices that are not valid ipvlan
 * devices, NOTIFY_BAD when an address cannot be added, NOTIFY_OK otherwise.
 */
static int ipvlan_addr6_event(struct notifier_block *unused,
                              unsigned long event, void *ptr)
{
        struct inet6_ifaddr *if6 = ptr;
        struct net_device *dev = (struct net_device *)if6->idev->dev;
        struct ipvl_dev *ipvlan;

        if (!ipvlan_is_valid_dev(dev))
                return NOTIFY_DONE;

        ipvlan = netdev_priv(dev);
        if (event == NETDEV_UP) {
                if (ipvlan_add_addr6(ipvlan, &if6->addr))
                        return NOTIFY_BAD;
        } else if (event == NETDEV_DOWN) {
                ipvlan_del_addr6(ipvlan, &if6->addr);
        }

        return NOTIFY_OK;
}

/* inet6addr validator notifier: on NETDEV_UP, reject an IPv6 address that is
 * already in use on the same ipvlan port, setting an extack message and
 * returning -EADDRINUSE through notifier_from_errno().
 */
static int ipvlan_addr6_validator_event(struct notifier_block *unused,
                                        unsigned long event, void *ptr)
{
        struct in6_validator_info *i6vi = ptr;
        struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev;
        struct ipvl_dev *ipvlan;

        if (!ipvlan_is_valid_dev(dev))
                return NOTIFY_DONE;

        /* Only address bring-up is validated. */
        if (event != NETDEV_UP)
                return NOTIFY_OK;

        ipvlan = netdev_priv(dev);
        if (!ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true))
                return NOTIFY_OK;

        NL_SET_ERR_MSG(i6vi->extack,
                       "Address already assigned to an ipvlan device");
        return notifier_from_errno(-EADDRINUSE);
}
#endif

/* Add an IPv4 address to @ipvlan unless the port already has it in use.
 * Returns ipvlan_add_addr()'s result, or -EINVAL (after logging) when the
 * address is busy. The busy check and the insertion happen under the
 * address lock.
 */
static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
{
        int ret;

        spin_lock_bh(&ipvlan->addrs_lock);
        if (!ipvlan_addr_busy(ipvlan->port, ip4_addr, false)) {
                ret = ipvlan_add_addr(ipvlan, ip4_addr, false);
        } else {
                netif_err(ipvlan, ifup, ipvlan->dev,
                          "Failed to add IPv4=%pI4 on %s intf.\n",
                          ip4_addr, ipvlan->dev->name);
                ret = -EINVAL;
        }
        spin_unlock_bh(&ipvlan->addrs_lock);

        return ret;
}

/* IPv4 flavour of ipvlan_del_addr(). */
static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
{
        ipvlan_del_addr(ipvlan, ip4_addr, false);
}

static int ipvlan_addr4_event(struct notifier_block *unused,
                              unsigned long event, void *ptr)
{
        struct in_ifaddr *if4 = (struct in_ifaddr *)ptr;
        struct net_device *dev = (struct net_device *)if4->ifa_dev->dev;
        struct ipvl_dev *ipvlan = netdev_priv(dev);
        struct in_addr ip4_addr;

        if (!ipvlan_is_valid_dev(dev))
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_UP:
                ip4_addr.s_addr = if4->ifa_address;
                if (ipvlan_add_addr4(ipvlan, &ip4_addr))
                        return NOTIFY_BAD;
                break;

        case NETDEV_DOWN:
                ip4_addr.s_addr = if4->ifa_address;
                ipvlan_del_addr4(ipvlan, &ip4_addr);
                break;
        }

        return NOTIFY_OK;
}

/* inetaddr validator notifier: on NETDEV_UP, reject an IPv4 address that is
 * already in use on the same ipvlan port, setting an extack message and
 * returning -EADDRINUSE through notifier_from_errno().
 */
static int ipvlan_addr4_validator_event(struct notifier_block *unused,
                                        unsigned long event, void *ptr)
{
        struct in_validator_info *ivi = ptr;
        struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev;
        struct ipvl_dev *ipvlan;

        if (!ipvlan_is_valid_dev(dev))
                return NOTIFY_DONE;

        /* Only address bring-up is validated. */
        if (event != NETDEV_UP)
                return NOTIFY_OK;

        ipvlan = netdev_priv(dev);
        if (!ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false))
                return NOTIFY_OK;

        NL_SET_ERR_MSG(ivi->extack,
                       "Address already assigned to an ipvlan device");
        return notifier_from_errno(-EADDRINUSE);
}

/* IPv4 address add/remove notifications, handled by ipvlan_addr4_event(). */
static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = {
        .notifier_call = ipvlan_addr4_event,
};

/* IPv4 address validation, handled by ipvlan_addr4_validator_event(). */
static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = {
        .notifier_call = ipvlan_addr4_validator_event,
};

/* Netdevice events, handled by ipvlan_device_event(). */
static struct notifier_block ipvlan_notifier_block __read_mostly = {
        .notifier_call = ipvlan_device_event,
};

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 address add/remove notifications, handled by ipvlan_addr6_event(). */
static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = {
        .notifier_call = ipvlan_addr6_event,
};

/* IPv6 address validation, handled by ipvlan_addr6_validator_event(). */
static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = {
        .notifier_call = ipvlan_addr6_validator_event,
};
#endif

/* Module init: register the netdevice and address notifiers, initialise the
 * L3S support and finally register the "ipvlan" link ops.
 *
 * Returns 0 on success or the negative error from ipvlan_l3s_init() /
 * ipvlan_link_register(); on failure all notifiers are unregistered again.
 *
 * NOTE(review): the return values of the register_*_notifier() calls are
 * not checked here.
 */
static int __init ipvlan_init_module(void)
{
        int err;

        ipvlan_init_secret();
        register_netdevice_notifier(&ipvlan_notifier_block);
#if IS_ENABLED(CONFIG_IPV6)
        register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        register_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block);
#endif
        register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);

        err = ipvlan_l3s_init();
        if (err < 0)
                goto error;

        err = ipvlan_link_register(&ipvlan_link_ops);
        if (err < 0) {
                /* L3S was set up successfully above, so undo it first. */
                ipvlan_l3s_cleanup();
                goto error;
        }

        return 0;
error:
        /* Common unwind: drop every notifier registered above. */
        unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        unregister_inetaddr_validator_notifier(
            &ipvlan_addr4_vtor_notifier_block);
#if IS_ENABLED(CONFIG_IPV6)
        unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        unregister_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block);
#endif
        unregister_netdevice_notifier(&ipvlan_notifier_block);
        return err;
}

/* Module exit: undo ipvlan_init_module(). The link ops are unregistered
 * first, then the L3S support is cleaned up, and finally every notifier
 * is removed.
 */
static void __exit ipvlan_cleanup_module(void)
{
        rtnl_link_unregister(&ipvlan_link_ops);
        ipvlan_l3s_cleanup();
        unregister_netdevice_notifier(&ipvlan_notifier_block);
        unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
        unregister_inetaddr_validator_notifier(
            &ipvlan_addr4_vtor_notifier_block);
#if IS_ENABLED(CONFIG_IPV6)
        unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
        unregister_inet6addr_validator_notifier(
            &ipvlan_addr6_vtor_notifier_block);
#endif
}

/* Module entry/exit hooks. */
module_init(ipvlan_init_module);
module_exit(ipvlan_cleanup_module);

/* Module metadata; the rtnl-link alias names the "ipvlan" link kind. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>");
MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs");
MODULE_ALIAS_RTNL_LINK("ipvlan");



























  449 








  449 















  323 








  306 




  145 














   58 


  189 











  189 











    8 




  242 










  170 


  242 











    6 





   76 




   78 































  321 



































  201 


















































































































  145 












  306 

  306 



































   23 






























   23 











  244 

  244 












   23 















   55 












  203 










  171 















    8 














    8 






    8 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_NOTIFY_H
#define _LINUX_FS_NOTIFY_H

/*
 * include/linux/fsnotify.h - generic hooks for filesystem notification, to
 * reduce in-source duplication from both dnotify and inotify.
 *
 * We don't compile any of this away in some complicated menagerie of ifdefs.
 * Instead, we rely on the code inside to optimize away as needed.
 *
 * (C) Copyright 2005 Robert Love
 */

#include <linux/fsnotify_backend.h>
#include <linux/audit.h>
#include <linux/slab.h>
#include <linux/bug.h>

/*
 * Are there any inode/mount/sb objects watched with priority prio or above?
 *
 * A superblock without fsnotify info never had a mark added to any of its
 * objects, so it cannot have watchers at all.
 */
static inline bool fsnotify_sb_has_priority_watchers(struct super_block *sb,
                                                     int prio)
{
        struct fsnotify_sb_info *info = fsnotify_sb_info(sb);

        return info && atomic_long_read(&info->watched_objects[prio]);
}

/*
 * Are there any inode/mount/sb objects that are being watched at all?
 * Implemented as a priority-watchers query at priority 0.
 */
static inline bool fsnotify_sb_has_watchers(struct super_block *sb)
{
        return fsnotify_sb_has_priority_watchers(sb, 0);
}

/*
 * Notify this @dir inode about a change in a child directory entry.
 * The directory entry may have turned positive or negative or its inode may
 * have changed (i.e. renamed over).
 *
 * Unlike fsnotify_parent(), the event will be reported regardless of the
 * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
 * the child is interested and not the parent.
 *
 * Returns 0 without calling fsnotify() when nothing on @dir's superblock is
 * watched; otherwise returns the result of fsnotify().
 */
static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
                                struct inode *dir, const struct qstr *name,
                                u32 cookie)
{
        if (fsnotify_sb_has_watchers(dir->i_sb))
                return fsnotify(mask, data, data_type, dir, name, NULL,
                                cookie);

        return 0;
}

/*
 * Report @mask on directory @dir for the entry @dentry, using the dentry as
 * event data and its d_name as the entry name. No cookie is attached.
 */
static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
                                   __u32 mask)
{
        const struct qstr *entry_name = &dentry->d_name;

        fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, entry_name, 0);
}

/*
 * Report @mask on @inode itself (no parent directory, no name). FS_ISDIR is
 * added to the mask when the inode is a directory. Nothing is done when no
 * object on the inode's superblock is watched.
 */
static inline void fsnotify_inode(struct inode *inode, __u32 mask)
{
        if (fsnotify_sb_has_watchers(inode->i_sb)) {
                if (S_ISDIR(inode->i_mode))
                        mask |= FS_ISDIR;

                fsnotify(mask, inode, FSNOTIFY_EVENT_INODE, NULL, NULL,
                         inode, 0);
        }
}

/*
 * Notify this dentry's parent about a child's events.
 *
 * Falls back to notifying only the child inode when the dentry is a
 * directory whose parent is not flagged as watching, or when the dentry is
 * a root (disconnected) dentry. Returns 0 when nothing on the superblock is
 * watched.
 */
static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        struct inode *inode = d_inode(dentry);
        bool child_only = false;

        if (!fsnotify_sb_has_watchers(inode->i_sb))
                return 0;

        if (S_ISDIR(inode->i_mode)) {
                mask |= FS_ISDIR;

                /* sb/mount marks are not interested in name of directory */
                child_only = !(dentry->d_flags &
                               DCACHE_FSNOTIFY_PARENT_WATCHED);
        }

        /* disconnected dentry cannot notify parent */
        if (child_only || IS_ROOT(dentry))
                return fsnotify(mask, data, data_type, NULL, NULL, inode, 0);

        return __fsnotify_parent(dentry, mask, data, data_type);
}

/*
 * Simple wrappers to consolidate calls to fsnotify_parent() when an event
 * is on a file/dentry.
 */

/*
 * fsnotify_dentry - report @mask for @dentry, with the dentry itself as the
 * event data. The result of fsnotify_parent() is discarded.
 */
static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
{
        fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
}

/*
 * fsnotify_path - report @mask for the dentry of @path, with the path as the
 * event data. Returns the result of fsnotify_parent().
 */
static inline int fsnotify_path(const struct path *path, __u32 mask)
{
        return fsnotify_parent(path->dentry, mask, path, FSNOTIFY_EVENT_PATH);
}

/*
 * fsnotify_file - report @mask for an open file, unless notifications are
 * disabled for it. Returns 0 when suppressed, otherwise the result of
 * fsnotify_path().
 */
static inline int fsnotify_file(struct file *file, __u32 mask)
{
        /*
         * FMODE_NONOTIFY are fds generated by fanotify itself which should not
         * generate new events. We also don't want to generate events for
         * FMODE_PATH fds (involves open & close events) as they are just
         * handle creation / destruction events and not "real" file events.
         */
        if (!FMODE_FSNOTIFY_NONE(file->f_mode))
                return fsnotify_path(&file->f_path, mask);

        return 0;
}

#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS

void file_set_fsnotify_mode_from_watchers(struct file *file);

/*
 * fsnotify_file_area_perm - permission hook before access to file range
 *
 * Returns 0 when access may proceed, or the non-zero result of the
 * pre-content / FS_ACCESS_PERM notification.
 */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        const int access_bits = MAY_READ | MAY_WRITE | MAY_ACCESS;

        /*
         * filesystem may be modified in the context of permission events
         * (e.g. by HSM filling a file on access), so sb freeze protection
         * must not be held.
         */
        lockdep_assert_once(file_write_not_started(file));

        /* Not an access this hook cares about. */
        if ((perm_mask & access_bits) == 0)
                return 0;

        if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
                return 0;

        /*
         * read()/write() and other types of access generate pre-content events.
         */
        if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
                int err = fsnotify_pre_content(&file->f_path, ppos, count);

                if (err)
                        return err;
        }

        /*
         * read() also generates the legacy FS_ACCESS_PERM event, so content
         * scanners can inspect the content filled by pre-content event.
         */
        return (perm_mask & MAY_READ) ?
                fsnotify_path(&file->f_path, FS_ACCESS_PERM) : 0;
}

/*
 * fsnotify_mmap_perm - permission hook before mmap of file range
 *
 * @prot is not consulted here. Returns 0 when there is no file or the file
 * is not in HSM notification mode, otherwise the result of
 * fsnotify_pre_content().
 */
static inline int fsnotify_mmap_perm(struct file *file, int prot,
                                     const loff_t off, size_t len)
{
        if (!file)
                return 0;

        /*
         * mmap() generates only pre-content events.
         */
        if (likely(!FMODE_FSNOTIFY_HSM(file->f_mode)))
                return 0;

        return fsnotify_pre_content(&file->f_path, &off, len);
}

/*
 * fsnotify_truncate_perm - permission hook before file truncate
 *
 * A pre-content event is generated only when the superblock allows HSM and
 * has watchers at pre-content priority; otherwise 0 is returned.
 */
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
        struct super_block *sb = d_inode(path->dentry)->i_sb;

        if (!(sb->s_iflags & SB_I_ALLOW_HSM))
                return 0;

        if (!fsnotify_sb_has_priority_watchers(sb, FSNOTIFY_PRIO_PRE_CONTENT))
                return 0;

        return fsnotify_pre_content(path, &length, 0);
}

/*
 * fsnotify_file_perm - permission hook before file access (unknown range)
 *
 * Same as fsnotify_file_area_perm() with no position (NULL) and a zero count.
 */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return fsnotify_file_area_perm(file, perm_mask, NULL, 0);
}

/*
 * fsnotify_open_perm - permission hook before file open
 *
 * For files opened for execution, FS_OPEN_EXEC_PERM is reported first and a
 * non-zero result is returned immediately; FS_OPEN_PERM is reported
 * afterwards. Returns 0 when permission events are not enabled for the file.
 */
static inline int fsnotify_open_perm(struct file *file)
{
        if (likely(!FMODE_FSNOTIFY_PERM(file->f_mode)))
                return 0;

        if (file->f_flags & __FMODE_EXEC) {
                int err = fsnotify_path(&file->f_path, FS_OPEN_EXEC_PERM);

                if (err)
                        return err;
        }

        return fsnotify_path(&file->f_path, FS_OPEN_PERM);
}

#else
/*
 * Stubs used when CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set: the mode
 * setter does nothing and every permission hook returns 0.
 */
static inline void file_set_fsnotify_mode_from_watchers(struct file *file)
{
}

/* Stub: always returns 0. */
static inline int fsnotify_file_area_perm(struct file *file, int perm_mask,
                                          const loff_t *ppos, size_t count)
{
        return 0;
}

/* Stub: always returns 0. */
static inline int fsnotify_mmap_perm(struct file *file, int prot,
                                     const loff_t off, size_t len)
{
        return 0;
}

/* Stub: always returns 0. */
static inline int fsnotify_truncate_perm(const struct path *path, loff_t length)
{
        return 0;
}

/* Stub: always returns 0. */
static inline int fsnotify_file_perm(struct file *file, int perm_mask)
{
        return 0;
}

/* Stub: always returns 0. */
static inline int fsnotify_open_perm(struct file *file)
{
        return 0;
}
#endif

/*
 * fsnotify_link_count - inode's link count changed
 *
 * Reported as an FS_ATTRIB event on the inode itself.
 */
static inline void fsnotify_link_count(struct inode *inode)
{
        fsnotify_inode(inode, FS_ATTRIB);
}

/*
 * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
 *
 * The new name is taken from @moved->d_name. FS_MOVED_FROM and FS_MOVED_TO
 * share one freshly allocated cookie; FS_RENAME carries no cookie. When
 * @target is set, its link count change is reported as well.
 */
static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
                                 const struct qstr *old_name,
                                 int isdir, struct inode *target,
                                 struct dentry *moved)
{
        struct inode *source = moved->d_inode;
        u32 fs_cookie = fsnotify_get_cookie();
        const struct qstr *new_name = &moved->d_name;
        /* Added to every name event when the moved object is a directory. */
        const __u32 dir_flag = isdir ? FS_ISDIR : 0;

        /* Event with information about both old and new parent+name */
        fsnotify_name(FS_RENAME | dir_flag, moved, FSNOTIFY_EVENT_DENTRY,
                      old_dir, old_name, 0);

        fsnotify_name(FS_MOVED_FROM | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      old_dir, old_name, fs_cookie);
        fsnotify_name(FS_MOVED_TO | dir_flag, source, FSNOTIFY_EVENT_INODE,
                      new_dir, new_name, fs_cookie);

        if (target)
                fsnotify_link_count(target);
        fsnotify_inode(source, FS_MOVE_SELF);
        audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
}

/*
 * fsnotify_inode_delete - an inode is being evicted from cache, clean up is
 * needed
 */
static inline void fsnotify_inode_delete(struct inode *inode)
{
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_vfsmount_delete - a vfsmount is being destroyed, clean up is needed
 *
 * Thin wrapper around __fsnotify_vfsmount_delete().
 */
static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
        __fsnotify_vfsmount_delete(mnt);
}

/*
 * fsnotify_mntns_delete - thin wrapper around __fsnotify_mntns_delete() for
 * the mount namespace @mntns.
 */
static inline void fsnotify_mntns_delete(struct mnt_namespace *mntns)
{
        __fsnotify_mntns_delete(mntns);
}

/*
 * fsnotify_inoderemove - an inode is going away
 *
 * Reports FS_DELETE_SELF on the inode first, then runs the fsnotify inode
 * cleanup.
 */
static inline void fsnotify_inoderemove(struct inode *inode)
{
        fsnotify_inode(inode, FS_DELETE_SELF);
        __fsnotify_inode_delete(inode);
}

/*
 * fsnotify_create - 'name' was linked in
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 *
 * Records the creation with audit and reports FS_CREATE on @dir.
 */
static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE);
}

/*
 * fsnotify_link - a new hardlink to @inode was created in directory @dir
 *
 * Caller must make sure that new_dentry->d_name is stable.
 * Note: We have to pass also the linked inode ptr as some filesystems leave
 *   new_dentry->d_inode NULL and instantiate inode pointer later
 *
 * Reports the link count change on @inode, records the creation with audit,
 * and reports FS_CREATE on @dir with @inode as the event data.
 */
static inline void fsnotify_link(struct inode *dir, struct inode *inode,
                                 struct dentry *new_dentry)
{
        fsnotify_link_count(inode);
        audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
                      dir, &new_dentry->d_name, 0);
}

/*
 * fsnotify_delete - @dentry was unlinked and unhashed
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * Note: unlike fsnotify_unlink(), we have to pass also the unlinked inode
 * as this may be called after d_delete() and old_dentry may be negative.
 *
 * Reports FS_DELETE on @dir, with FS_ISDIR added when @inode is a directory.
 */
static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
                                   struct dentry *dentry)
{
        const __u32 mask = S_ISDIR(inode->i_mode) ? (FS_DELETE | FS_ISDIR) :
                                                    FS_DELETE;

        fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir,
                      &dentry->d_name, 0);
}

/**
 * d_delete_notify - delete a dentry and call fsnotify_delete()
 * @dir: The directory inode passed on to fsnotify_delete()
 * @dentry: The dentry to delete
 *
 * This helper is used to guarantee that the unlinked inode cannot be found
 * by lookup of this name after fsnotify_delete() event has been delivered.
 */
static inline void d_delete_notify(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        /*
         * Hold an inode reference across d_delete() so the inode can still
         * be handed to fsnotify_delete() afterwards; dropped right after.
         */
        ihold(inode);
        d_delete(dentry);
        fsnotify_delete(dir, inode, dentry);
        iput(inode);
}

/*
 * fsnotify_unlink - 'name' was unlinked
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * A negative @dentry triggers a one-time warning and no event is sent.
 */
static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
{
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_mkdir - directory 'name' was created
 *
 * Caller must make sure that dentry->d_name is stable.
 * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
 * ->d_inode later
 *
 * Records the creation with audit and reports FS_CREATE | FS_ISDIR on @dir.
 */
static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
{
        audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);

        fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
}

/*
 * fsnotify_rmdir - directory 'name' was removed
 *
 * Caller must make sure that dentry->d_name is stable.
 *
 * A negative @dentry triggers a one-time warning and no event is sent.
 */
static inline void fsnotify_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (!WARN_ON_ONCE(d_is_negative(dentry)))
                fsnotify_delete(dir, d_inode(dentry), dentry);
}

/*
 * fsnotify_access - file was read
 *
 * Reports FS_ACCESS for @file; the result of fsnotify_file() is discarded.
 */
static inline void fsnotify_access(struct file *file)
{
        fsnotify_file(file, FS_ACCESS);
}

/*
 * fsnotify_modify - file was modified
 *
 * Reports FS_MODIFY for @file; the result of fsnotify_file() is discarded.
 */
static inline void fsnotify_modify(struct file *file)
{
        fsnotify_file(file, FS_MODIFY);
}

/*
 * fsnotify_open - file was opened
 */
static inline void fsnotify_open(struct file *file)
{
        __u32 mask = FS_OPEN;

        if (file->f_flags & __FMODE_EXEC)
                mask |= FS_OPEN_EXEC;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_close - file was closed
 *
 * Reports FS_CLOSE_WRITE when the file was open for writing, otherwise
 * FS_CLOSE_NOWRITE.
 */
static inline void fsnotify_close(struct file *file)
{
        __u32 mask;

        if (file->f_mode & FMODE_WRITE)
                mask = FS_CLOSE_WRITE;
        else
                mask = FS_CLOSE_NOWRITE;

        fsnotify_file(file, mask);
}

/*
 * fsnotify_xattr - extended attributes were changed
 *
 * Reported as an FS_ATTRIB event on @dentry.
 */
static inline void fsnotify_xattr(struct dentry *dentry)
{
        fsnotify_dentry(dentry, FS_ATTRIB);
}

/*
 * fsnotify_change - notify_change event.  file was modified and/or metadata
 * was changed.
 *
 * Translates the ATTR_* bits in @ia_valid into an event mask and reports it
 * on @dentry; nothing is reported when no relevant bit is set.
 */
static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
{
        const unsigned int times = ia_valid & (ATTR_ATIME | ATTR_MTIME);
        __u32 mask = 0;

        /* Owner, group and mode changes are all attribute changes. */
        if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
                mask |= FS_ATTRIB;
        if (ia_valid & ATTR_SIZE)
                mask |= FS_MODIFY;

        /* both times implies a utime(s) call */
        if (times == (ATTR_ATIME | ATTR_MTIME))
                mask |= FS_ATTRIB;
        else if (times == ATTR_ATIME)
                mask |= FS_ACCESS;
        else if (times == ATTR_MTIME)
                mask |= FS_MODIFY;

        if (mask)
                fsnotify_dentry(dentry, mask);
}

/*
 * fsnotify_sb_error - send an FS_ERROR event describing @error on @sb
 *
 * @sb, @inode and @error are packed into a struct fs_error_report that is
 * handed to fsnotify() as FSNOTIFY_EVENT_ERROR data.  Returns the value
 * returned by fsnotify().
 */
static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
                                    int error)
{
        struct fs_error_report report = {
                .sb = sb,
                .inode = inode,
                .error = error,
        };
        int ret;

        ret = fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
                       NULL, NULL, NULL, 0);

        return ret;
}

/*
 * fsnotify_mnt_attach - forward an FS_MNT_ATTACH event for @mnt in @ns
 * to fsnotify_mnt()
 */
static inline void fsnotify_mnt_attach(struct mnt_namespace *ns, struct vfsmount *mnt)
{
        fsnotify_mnt(FS_MNT_ATTACH, ns, mnt);
}

/*
 * fsnotify_mnt_detach - forward an FS_MNT_DETACH event for @mnt in @ns
 * to fsnotify_mnt()
 */
static inline void fsnotify_mnt_detach(struct mnt_namespace *ns, struct vfsmount *mnt)
{
        fsnotify_mnt(FS_MNT_DETACH, ns, mnt);
}

/*
 * fsnotify_mnt_move - forward an FS_MNT_MOVE event for @mnt in @ns
 * to fsnotify_mnt()
 */
static inline void fsnotify_mnt_move(struct mnt_namespace *ns, struct vfsmount *mnt)
{
        fsnotify_mnt(FS_MNT_MOVE, ns, mnt);
}

#endif        /* _LINUX_FS_NOTIFY_H */






























































  376 



  517 





























  208 
























































































  509 
































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_FIND_H_
#define __LINUX_FIND_H_

#ifndef __LINUX_BITMAP_H
#error only <linux/bitmap.h> can be included directly
#endif

#include <linux/bitops.h>

/*
 * Out-of-line search helpers.  The inline wrappers defined later in this
 * header call them whenever small_const_nbits() does not accept the bitmap
 * size; otherwise the wrappers work on a single word themselves.
 */
unsigned long _find_next_bit(const unsigned long *addr1, unsigned long nbits,
                                unsigned long start);
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_or_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long nbits, unsigned long start);
unsigned long _find_next_zero_bit(const unsigned long *addr, unsigned long nbits,
                                         unsigned long start);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
unsigned long __find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n);
unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n);
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        unsigned long size, unsigned long n);
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                        const unsigned long *addr3, unsigned long size,
                                        unsigned long n);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
                                         const unsigned long *addr2, unsigned long size);
unsigned long _find_first_and_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                      const unsigned long *addr3, unsigned long size);
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_last_bit(const unsigned long *addr, unsigned long size);

/*
 * The *_le helpers are only declared for big-endian builds; the
 * little-endian build forwards the *_le entry points to the native ones.
 */
#ifdef __BIG_ENDIAN
unsigned long _find_first_zero_bit_le(const unsigned long *addr, unsigned long size);
unsigned long _find_next_zero_bit_le(const  unsigned long *addr, unsigned
                                        long size, unsigned long offset);
unsigned long _find_next_bit_le(const unsigned long *addr, unsigned
                                long size, unsigned long offset);
#endif

#ifndef find_next_bit
/**
 * find_next_bit - locate the next set bit in a memory region
 * @addr: start of the bitmap to search
 * @size: number of bits in the bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next set bit.
 * If no bits are set, returns @size.
 */
static __always_inline
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
                            unsigned long offset)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* single-word case: only *addr is examined */
        word = *addr & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_and_bit
/**
 * find_next_and_bit - locate the next bit set in both memory regions
 * @addr1: start of the first bitmap
 * @addr2: start of the second bitmap
 * @size: number of bits in each bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next bit set in both bitmaps.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_next_and_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_and_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        word = *addr1 & *addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_andnot_bit
/**
 * find_next_andnot_bit - locate the next bit set in *addr1 and clear in
 *                        *addr2
 * @addr1: start of the bitmap whose set bits are wanted
 * @addr2: start of the bitmap whose set bits are excluded
 * @size: number of bits in each bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next matching bit.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_next_andnot_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_andnot_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        word = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_or_bit
/**
 * find_next_or_bit - locate the next bit set in either memory region
 * @addr1: start of the first bitmap
 * @addr2: start of the second bitmap
 * @size: number of bits in each bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next bit set in at least one bitmap.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_next_or_bit(const unsigned long *addr1,
                const unsigned long *addr2, unsigned long size,
                unsigned long offset)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_or_bit(addr1, addr2, size, offset);

        if (unlikely(offset >= size))
                return size;

        word = (*addr1 | *addr2) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#ifndef find_next_zero_bit
/**
 * find_next_zero_bit - locate the next cleared bit in a memory region
 * @addr: start of the bitmap to search
 * @size: number of bits in the bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next zero bit.
 * If no bits are zero, returns @size.
 */
static __always_inline
unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
                                 unsigned long offset)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_zero_bit(addr, size, offset);

        if (unlikely(offset >= size))
                return size;

        /* bits not covered by the GENMASK() range are forced to 1 */
        word = *addr | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_bit
/**
 * find_first_bit - locate the first set bit in a memory region
 * @addr: start of the bitmap to search
 * @size: maximum number of bits to search
 *
 * Returns the bit number of the first set bit.
 * If no bits are set, returns @size.
 */
static __always_inline
unsigned long find_first_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_first_bit(addr, size);

        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_nth_bit - locate the N'th set bit in a memory region
 * @addr: start of the bitmap to search
 * @size: maximum number of bits to search
 * @n: index of the wanted set bit, counting from 0
 *
 * The following two calls are semantically equivalent:
 *         idx = find_nth_bit(addr, size, 0);
 *         idx = find_first_bit(addr, size);
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size.
 */
static __always_inline
unsigned long find_nth_bit(const unsigned long *addr, unsigned long size, unsigned long n)
{
        unsigned long word;

        /* a bitmap of @size bits cannot hold a set bit with index >= @size */
        if (n >= size)
                return size;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return __find_nth_bit(addr, size, n);

        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_and_bit - find N'th bit that is set in both memory regions
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size: the single-word path returns the result of
 * fns() as is, without clamping it to @size, so callers must test the
 * result with ">= @size" rather than "== @size".
 */
static __always_inline
unsigned long find_nth_and_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        unsigned long word;

        /* a bitmap of @size bits cannot hold a set bit with index >= @size */
        if (n >= size)
                return size;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return __find_nth_and_bit(addr1, addr2, size, n);

        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_andnot_bit - find N'th set bit in 2 memory regions,
 *                         flipping bits in 2nd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size: the single-word path returns the result of
 * fns() as is, without clamping it to @size, so callers must test the
 * result with ">= @size" rather than "== @size".
 */
static __always_inline
unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
                                unsigned long size, unsigned long n)
{
        unsigned long word;

        /* a bitmap of @size bits cannot hold a set bit with index >= @size */
        if (n >= size)
                return size;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return __find_nth_andnot_bit(addr1, addr2, size, n);

        word = *addr1 & (~*addr2) & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

/**
 * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
 *                             excluding those set in 3rd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @addr3: The 3rd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Returns the bit number of the N'th set bit.
 * If no such, returns >= @size: the single-word path returns the result of
 * fns() as is, without clamping it to @size, so callers must test the
 * result with ">= @size" rather than "== @size".
 */
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        const unsigned long *addr3,
                                        unsigned long size, unsigned long n)
{
        unsigned long word;

        /* a bitmap of @size bits cannot hold a set bit with index >= @size */
        if (n >= size)
                return size;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);

        word = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return fns(word, n);
}

#ifndef find_first_and_bit
/**
 * find_first_and_bit - locate the first bit set in both memory regions
 * @addr1: start of the first bitmap
 * @addr2: start of the second bitmap
 * @size: number of bits in each bitmap
 *
 * Returns the bit number of the first bit set in both bitmaps.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_first_and_bit(const unsigned long *addr1,
                                 const unsigned long *addr2,
                                 unsigned long size)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_first_and_bit(addr1, addr2, size);

        word = *addr1 & *addr2 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

/**
 * find_first_and_and_bit - locate the first bit set in all 3 memory regions
 * @addr1: start of the first bitmap
 * @addr2: start of the second bitmap
 * @addr3: start of the third bitmap
 * @size: number of bits in each bitmap
 *
 * Returns the bit number of the first bit set in all three bitmaps.
 * If there is no such bit, returns @size.
 */
static __always_inline
unsigned long find_first_and_and_bit(const unsigned long *addr1,
                                     const unsigned long *addr2,
                                     const unsigned long *addr3,
                                     unsigned long size)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_first_and_and_bit(addr1, addr2, addr3, size);

        word = *addr1 & *addr2 & *addr3 & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __ffs(word);
}

#ifndef find_first_zero_bit
/**
 * find_first_zero_bit - locate the first cleared bit in a memory region
 * @addr: start of the bitmap to search
 * @size: maximum number of bits to search
 *
 * Returns the bit number of the first cleared bit.
 * If no bits are zero, returns @size.
 */
static __always_inline
unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_first_zero_bit(addr, size);

        /* bits not covered by the GENMASK() range are forced to 1 */
        word = *addr | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_last_bit
/**
 * find_last_bit - locate the last set bit in a memory region
 * @addr: start of the bitmap to search
 * @size: number of bits to search
 *
 * Returns the bit number of the last set bit, or @size if no bit is set.
 */
static __always_inline
unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_last_bit(addr, size);

        word = *addr & GENMASK(size - 1, 0);
        if (!word)
                return size;

        return __fls(word);
}
#endif

/**
 * find_next_and_bit_wrap - locate the next bit set in both memory regions,
 *                          wrapping around to the start of the bitmaps
 * @addr1: start of the first bitmap
 * @addr2: start of the second bitmap
 * @size: number of bits in each bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next bit set in both bitmaps or, failing
 * that, of the first such bit below @offset.
 * If there is no such bit at all, returns @size.
 */
static __always_inline
unsigned long find_next_and_bit_wrap(const unsigned long *addr1,
                                        const unsigned long *addr2,
                                        unsigned long size, unsigned long offset)
{
        unsigned long pos = find_next_and_bit(addr1, addr2, size, offset);

        /* nothing found from @offset on: retry on the part below @offset */
        if (pos >= size && offset != 0) {
                pos = find_first_and_bit(addr1, addr2, offset);
                if (pos >= offset)
                        pos = size;
        }

        return pos;
}

/**
 * find_next_bit_wrap - locate the next set bit in a memory region, wrapping
 *                      around to the start of the bitmap
 * @addr: start of the bitmap to search
 * @size: number of bits in the bitmap
 * @offset: bit number at which the search begins
 *
 * Returns the bit number of the next set bit or, failing that, of the first
 * set bit below @offset.
 * If no bits are set, returns @size.
 */
static __always_inline
unsigned long find_next_bit_wrap(const unsigned long *addr,
                                        unsigned long size, unsigned long offset)
{
        unsigned long pos = find_next_bit(addr, size, offset);

        /* nothing found from @offset on: retry on the part below @offset */
        if (pos >= size && offset != 0) {
                pos = find_first_bit(addr, offset);
                if (pos >= offset)
                        pos = size;
        }

        return pos;
}

/*
 * Helper for for_each_set_bit_wrap(). Make sure you're doing right thing
 * before using it alone.
 *
 * @n is the position to continue from and @start the bit the traversal
 * began at.  Returns the next set bit, or @size once the part below @start
 * has been exhausted too.
 */
static __always_inline
unsigned long __for_each_wrap(const unsigned long *bitmap, unsigned long size,
                                 unsigned long start, unsigned long n)
{
        unsigned long pos;

        if (n > start) {
                /* Not wrapped yet: look in the tail of the bitmap first. */
                pos = find_next_bit(bitmap, size, n);
                if (pos < size)
                        return pos;

                /* Tail exhausted: continue from the beginning. */
                n = 0;
        }

        /* Only bits below @start are still to be visited. */
        pos = find_next_bit(bitmap, start, n);
        if (pos >= start)
                pos = size;

        return pos;
}

/**
 * find_next_clump8 - find next 8-bit clump with set bits in a memory region
 * @clump: location to store copy of found clump
 * @addr: address to base the search on
 * @size: bitmap size in number of bits
 * @offset: bit offset at which to start searching
 *
 * Returns the bit offset for the next set clump; the found clump value is
 * copied to the location pointed by @clump. If no bits are set, returns @size.
 */
extern unsigned long find_next_clump8(unsigned long *clump,
                                      const unsigned long *addr,
                                      unsigned long size, unsigned long offset);

/*
 * find_first_clump8 - same as find_next_clump8() with the search starting
 * at bit offset 0
 */
#define find_first_clump8(clump, bits, size) \
        find_next_clump8((clump), (bits), (size), 0)

#if defined(__LITTLE_ENDIAN)

/*
 * __LITTLE_ENDIAN build: the _le search is forwarded unchanged to
 * find_next_zero_bit().
 */
static __always_inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned long size, unsigned long offset)
{
        const unsigned long *bitmap = addr;

        return find_next_zero_bit(bitmap, size, offset);
}

/*
 * __LITTLE_ENDIAN build: the _le search is forwarded unchanged to
 * find_next_bit().
 */
static __always_inline
unsigned long find_next_bit_le(const void *addr, unsigned long size, unsigned long offset)
{
        const unsigned long *bitmap = addr;

        return find_next_bit(bitmap, size, offset);
}

/*
 * __LITTLE_ENDIAN build: the _le search is forwarded unchanged to
 * find_first_zero_bit().
 */
static __always_inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        const unsigned long *bitmap = addr;

        return find_first_zero_bit(bitmap, size);
}

#elif defined(__BIG_ENDIAN)

#ifndef find_next_zero_bit_le
/*
 * __BIG_ENDIAN build: find the next cleared bit of a bitmap at @addr,
 * starting at @offset.  In the single-word case the word is passed through
 * swab() before it is searched.  Returns @size if no cleared bit is found.
 */
static __always_inline
unsigned long find_next_zero_bit_le(const void *addr, unsigned long size,
                                    unsigned long offset)
{
        unsigned long raw, word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_zero_bit_le(addr, size, offset);

        raw = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        /* bits not covered by the GENMASK() range are forced to 1 */
        word = swab(raw) | ~GENMASK(size - 1, offset);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_first_zero_bit_le
/*
 * __BIG_ENDIAN build: find the first cleared bit of a bitmap at @addr.
 * In the single-word case the word is passed through swab() before it is
 * searched.  Returns @size if no cleared bit is found.
 */
static __always_inline
unsigned long find_first_zero_bit_le(const void *addr, unsigned long size)
{
        unsigned long word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_first_zero_bit_le(addr, size);

        /* bits not covered by the GENMASK() range are forced to 1 */
        word = swab(*(const unsigned long *)addr) | ~GENMASK(size - 1, 0);
        if (word == ~0UL)
                return size;

        return ffz(word);
}
#endif

#ifndef find_next_bit_le
/*
 * __BIG_ENDIAN build: find the next set bit of a bitmap at @addr, starting
 * at @offset.  In the single-word case the word is passed through swab()
 * before it is searched.  Returns @size if no set bit is found.
 */
static __always_inline
unsigned long find_next_bit_le(const void *addr, unsigned long size,
                               unsigned long offset)
{
        unsigned long raw, word;

        /* sizes rejected by small_const_nbits() go to the out-of-line helper */
        if (!small_const_nbits(size))
                return _find_next_bit_le(addr, size, offset);

        raw = *(const unsigned long *)addr;

        if (unlikely(offset >= size))
                return size;

        word = swab(raw) & GENMASK(size - 1, offset);
        if (!word)
                return size;

        return __ffs(word);
}
#endif

#else
#error "Please fix <asm/byteorder.h>"
#endif

/**
 * for_each_set_bit - iterate over all set bits of a bitmap
 * @bit: iterator; starts at 0 and receives each set bit number in turn
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @bit is assigned in the loop condition, so it must be a modifiable lvalue.
 */
#define for_each_set_bit(bit, addr, size) \
        for ((bit) = 0; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_and_bit - iterate over all bits set in both bitmaps
 * @bit: iterator; starts at 0 and receives each matching bit number in turn
 * @addr1: address of the first bitmap
 * @addr2: address of the second bitmap
 * @size: bitmap size in number of bits
 */
#define for_each_and_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/**
 * for_each_andnot_bit - iterate over all bits found by find_next_andnot_bit()
 * @bit: iterator; starts at 0 and receives each matching bit number in turn
 * @addr1: address of the first bitmap
 * @addr2: address of the second bitmap
 * @size: bitmap size in number of bits
 */
#define for_each_andnot_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/**
 * for_each_or_bit - iterate over all bits set in either bitmap
 * @bit: iterator; starts at 0 and receives each matching bit number in turn
 * @addr1: address of the first bitmap
 * @addr2: address of the second bitmap
 * @size: bitmap size in number of bits
 */
#define for_each_or_bit(bit, addr1, addr2, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_or_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
             (bit)++)

/*
 * Same as for_each_set_bit(), but the search starts at the current value of
 * @bit instead of 0, so @bit must be initialized by the caller.
 */
#define for_each_set_bit_from(bit, addr, size) \
        for (; (bit) = find_next_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_clear_bit - iterate over all cleared bits of a bitmap
 * @bit: iterator; starts at 0 and receives each cleared bit number in turn
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 */
#define for_each_clear_bit(bit, addr, size) \
        for ((bit) = 0;                                                                        \
             (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size);                \
             (bit)++)

/*
 * Same as for_each_clear_bit(), but the search starts at the current value
 * of @bit instead of 0, so @bit must be initialized by the caller.
 */
#define for_each_clear_bit_from(bit, addr, size) \
        for (; (bit) = find_next_zero_bit((addr), (size), (bit)), (bit) < (size); (bit)++)

/**
 * for_each_set_bitrange - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit)
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @b and @e are assigned by the macro and must be modifiable lvalues.
 * After each range the search resumes at @e + 1.
 */
#define for_each_set_bitrange(b, e, addr, size)                        \
        for ((b) = 0;                                                \
             (b) = find_next_bit((addr), (size), (b)),                \
             (e) = find_next_zero_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_set_bitrange_from - iterate over all set bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first set bit); must be initialized
 * @e: bit offset of end of current bitrange (first unset bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * The first search starts at the caller-supplied value of @b; after each
 * range the search resumes at @e + 1.
 */
#define for_each_set_bitrange_from(b, e, addr, size)                \
        for (;                                                        \
             (b) = find_next_bit((addr), (size), (b)),                \
             (e) = find_next_zero_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit)
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @b and @e are assigned by the macro and must be modifiable lvalues.
 * After each range the search resumes at @e + 1.
 */
#define for_each_clear_bitrange(b, e, addr, size)                \
        for ((b) = 0;                                                \
             (b) = find_next_zero_bit((addr), (size), (b)),        \
             (e) = find_next_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_clear_bitrange_from - iterate over all unset bit ranges [b; e)
 * @b: bit offset of start of current bitrange (first unset bit); must be initialized
 * @e: bit offset of end of current bitrange (first set bit)
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * The first search starts at the caller-supplied value of @b; after each
 * range the search resumes at @e + 1.
 */
#define for_each_clear_bitrange_from(b, e, addr, size)                \
        for (;                                                        \
             (b) = find_next_zero_bit((addr), (size), (b)),        \
             (e) = find_next_bit((addr), (size), (b) + 1),        \
             (b) < (size);                                        \
             (b) = (e) + 1)

/**
 * for_each_set_bit_wrap - iterate over all set bits starting from @start, and
 * wrapping around the end of bitmap.
 * @bit: offset for current iteration
 * @addr: bitmap address to base the search on
 * @size: bitmap size in number of bits
 * @start: Starting bit for bitmap traversing, wrapping around the bitmap end
 *
 * The first bit comes from find_next_bit_wrap(); every following one from
 * __for_each_wrap(), which is asked to continue at @bit + 1.
 */
#define for_each_set_bit_wrap(bit, addr, size, start) \
        for ((bit) = find_next_bit_wrap((addr), (size), (start));                \
             (bit) < (size);                                                        \
             (bit) = __for_each_wrap((addr), (size), (start), (bit) + 1))

/**
 * for_each_set_clump8 - iterate over bitmap for each 8-bit clump with set bits
 * @start: bit offset to start search and to store the current iteration offset
 * @clump: location to store copy of current 8-bit clump
 * @bits: bitmap address to base the search on
 * @size: bitmap size in number of bits
 *
 * @clump is passed by name and its address is taken by the macro; each
 * following search begins 8 bits after the previous @start.
 */
#define for_each_set_clump8(start, clump, bits, size) \
        for ((start) = find_first_clump8(&(clump), (bits), (size)); \
             (start) < (size); \
             (start) = find_next_clump8(&(clump), (bits), (size), (start) + 8))

#endif /*__LINUX_FIND_H_ */


























































































































































































































































































  777 

  246 







































































































































































































































































    2 
   42 


  182 

  221 
   44 
  285 


  225 














  279 

  250 























    4 




















  128 









   57 

  219 


  220 

   35 
   16 





















































































  314 




  254 






  479 
















































  275 























    4 
   28 























  106 















































































































































































































































  168 

















  777 




























  253 




















   27 




















  254 
  254 
  252 














































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Macros for manipulating and testing page->flags
 */

#ifndef PAGE_FLAGS_H
#define PAGE_FLAGS_H

#include <linux/types.h>
#include <linux/bug.h>
#include <linux/mmdebug.h>
#ifndef __GENERATING_BOUNDS_H
#include <linux/mm_types.h>
#include <generated/bounds.h>
#endif /* !__GENERATING_BOUNDS_H */

/*
 * Various page->flags bits:
 *
 * PG_reserved is set for special pages. The "struct page" of such a page
 * should in general not be touched (e.g. set dirty) except by its owner.
 * Pages marked as PG_reserved include:
 * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS,
 *   initrd, HW tables)
 * - Pages reserved or allocated early during boot (before the page allocator
 *   was initialized). This includes (depending on the architecture) the
 *   initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much
 *   much more. Once (if ever) freed, PG_reserved is cleared and they will
 *   be given to the page allocator.
 * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying
 *   to read/write these pages might end badly. Don't touch!
 * - The zero page(s)
 * - Pages allocated in the context of kexec/kdump (loaded kernel image,
 *   control pages, vmcoreinfo)
 * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are
 *   not marked PG_reserved (as they might be in use by somebody else who does
 *   not respect the caching strategy).
 * - MCA pages on ia64
 * - Pages holding CPU notes for POWER Firmware Assisted Dump
 * - Device memory (e.g. PMEM, DAX, HMM)
 * Some PG_reserved pages will be excluded from the hibernation image.
 * PG_reserved does in general not hinder anybody from dumping or swapping
 * and is no longer required for remap_pfn_range(). ioremap might require it.
 * Consequently, PG_reserved for a page mapped into user space can indicate
 * the zero page, the vDSO, MMIO pages or device memory.
 *
 * The PG_private bitflag is set on pagecache pages if they contain filesystem
 * specific data (which is normally at page->private). It can be used by
 * private allocations for its own usage.
 *
 * During initiation of disk I/O, PG_locked is set. This bit is set before I/O
 * and cleared when writeback _starts_ or when read _completes_. PG_writeback
 * is set before writeback starts and cleared when it finishes.
 *
 * PG_locked also pins a page in pagecache, and blocks truncation of the file
 * while it is held.
 *
 * page_waitqueue(page) is a wait queue of all tasks waiting for the page
 * to become unlocked.
 *
 * PG_swapbacked is set when a page uses swap as a backing storage.  These are
 * usually PageAnon or shmem pages but please note that even anonymous pages
 * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as
 * a result of MADV_FREE).
 *
 * PG_referenced, PG_reclaim are used for page reclaim for anonymous and
 * file-backed pagecache (see mm/vmscan.c).
 *
 * PG_arch_1 is an architecture specific page state bit.  The generic code
 * guarantees that this bit is cleared for a page when it first is entered into
 * the page cache.
 *
 * PG_hwpoison indicates that a page got corrupted in hardware and contains
 * data with incorrect ECC bits that triggered a machine check. Accessing is
 * not safe since it may cause another machine check. Don't touch!
 */

/*
 * Don't use the pageflags directly.  Use the PageFoo macros.
 *
 * The page flags field is split into two parts, the main flags area
 * which extends from the low bits upwards, and the fields area which
 * extends from the high bits downwards.
 *
 *  | FIELD | ... | FLAGS |
 *  N-1           ^       0
 *               (NR_PAGEFLAGS)
 *
 * The fields area is reserved for fields mapping zone, node (for NUMA) and
 * SPARSEMEM section (for variants of SPARSEMEM that require section ids like
 * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP).
 */
enum pageflags {
        /*
         * Real flag bits.  Each enumerator up to __NR_PAGEFLAGS names one bit
         * position in the flags area of page->flags; the order of the
         * enumerators is the bit layout.
         */
        PG_locked,                /* Page is locked. Don't touch. */
        PG_writeback,                /* Page is under writeback */
        PG_referenced,                /* Used by page reclaim (see mm/vmscan.c) */
        PG_uptodate,
        PG_dirty,
        PG_lru,
        PG_head,                /* Must be in bit 6 */
        PG_waiters,                /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
        PG_active,
        PG_workingset,
        PG_owner_priv_1,        /* Owner use. If pagecache, fs may use */
        PG_owner_2,                /* Owner use. If pagecache, fs may use */
        PG_arch_1,                /* Architecture specific page state bit */
        PG_reserved,                /* Special page; see the PG_reserved notes at the top of this file */
        PG_private,                /* If pagecache, has fs-private data */
        PG_private_2,                /* If pagecache, has fs aux data */
        PG_reclaim,                /* To be reclaimed asap */
        PG_swapbacked,                /* Page is backed by RAM/swap */
        PG_unevictable,                /* Page is "unevictable"  */
        PG_dropbehind,                /* drop pages on IO completion */
#ifdef CONFIG_MMU
        PG_mlocked,                /* Page is vma mlocked */
#endif
#ifdef CONFIG_MEMORY_FAILURE
        PG_hwpoison,                /* hardware poisoned page. Don't touch */
#endif
        /* PG_young/PG_idle only exist as page flags on 64-bit kernels */
#if defined(CONFIG_PAGE_IDLE_FLAG) && defined(CONFIG_64BIT)
        PG_young,
        PG_idle,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
        PG_arch_2,
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
        PG_arch_3,
#endif
        /* Number of real flag bits defined above for this configuration */
        __NR_PAGEFLAGS,

        /*
         * Everything below is an alias: each name is assigned the value of
         * one of the real bits above and therefore shares that bit with it
         * (and with any other alias of the same bit).
         */
        PG_readahead = PG_reclaim,

        /* Anonymous memory (and shmem) */
        PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */
        /* Some filesystems */
        PG_checked = PG_owner_priv_1,

        /*
         * Depending on the way an anonymous folio can be mapped into a page
         * table (e.g., single PMD/PUD/CONT of the head page vs. PTE-mapped
         * THP), PG_anon_exclusive may be set only for the head page or for
         * tail pages of an anonymous folio. For now, we only expect it to be
         * set on tail pages for PTE-mapped THP.
         */
        PG_anon_exclusive = PG_owner_2,

        /*
         * Set if all buffer heads in the folio are mapped.
         * Filesystems which do not use BHs can use it for their own purpose.
         */
        PG_mappedtodisk = PG_owner_2,

        /* Two page bits are conscripted by FS-Cache to maintain local caching
         * state.  These bits are set on pages belonging to the netfs's inodes
         * when those inodes are being locally cached.
         */
        PG_fscache = PG_private_2,        /* page backed by cache */

        /* XEN */
        /* Pinned in Xen as a read-only pagetable page. */
        PG_pinned = PG_owner_priv_1,
        /* Pinned as part of domain save (see xen_mm_pin_all()). */
        PG_savepinned = PG_dirty,
        /* Has a grant mapping of another (foreign) domain's page. */
        PG_foreign = PG_owner_priv_1,
        /* Remapped by swiotlb-xen. */
        PG_xen_remapped = PG_owner_priv_1,

        /* non-lru isolated movable page */
        PG_isolated = PG_reclaim,

        /* Only valid for buddy pages. Used to track pages that are reported */
        PG_reported = PG_uptodate,

#ifdef CONFIG_MEMORY_HOTPLUG
        /* For self-hosted memmap pages */
        PG_vmemmap_self_hosted = PG_owner_priv_1,
#endif

        /*
         * Flags only valid for compound pages.  Stored in first tail page's
         * flags word.  Cannot use the first 8 flags or any flag marked as
         * PF_ANY.
         */

        /* At least one page in this folio has the hwpoison flag set */
        PG_has_hwpoisoned = PG_active,
        PG_large_rmappable = PG_workingset, /* anon or file-backed */
        PG_partially_mapped = PG_reclaim, /* was identified to be partially mapped */
};

/*
 * Mask with the low NR_PAGEFLAGS bits set, i.e. covering the flags area of
 * page->flags.  NOTE(review): NR_PAGEFLAGS is not defined in this file;
 * presumably it is produced from __NR_PAGEFLAGS via <generated/bounds.h> --
 * confirm.
 */
#define PAGEFLAGS_MASK                ((1UL << NR_PAGEFLAGS) - 1)

#ifndef __GENERATING_BOUNDS_H

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
DECLARE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);

/*
 * Return the real head page struct iff the @page is a fake head page, otherwise
 * return the @page itself. See Documentation/mm/vmemmap_dedup.rst.
 */
static __always_inline const struct page *page_fixed_fake_head(const struct page *page)
{
        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
                return page;

        /*
         * Only addresses aligned with PAGE_SIZE of struct page may be fake head
         * struct page. The alignment check aims to avoid access the fields (
         * e.g. compound_head) of the @page[1]. It can avoid touch a (possibly)
         * cold cacheline in some cases.
         */
        if (IS_ALIGNED((unsigned long)page, PAGE_SIZE) &&
            test_bit(PG_head, &page->flags)) {
                /*
                 * We can safely access the field of the @page[1] with PG_head
                 * because the @page is a compound page composed with at least
                 * two contiguous pages.
                 */
                unsigned long head = READ_ONCE(page[1].compound_head);

                if (likely(head & 1))
                        return (const struct page *)(head - 1);
        }
        return page;
}

/*
 * Decide whether a speculative reference-count update may write to the
 * struct page of @page.
 *
 * @page: the page whose _refcount the caller intends to modify.
 * @u: refcount value at which the caller must not modify the count
 *     (presumably the "unless" value handed to atomic_add_unless() --
 *     confirm against callers).
 *
 * Always returns true while hugetlb_optimize_vmemmap_key is disabled.
 * Otherwise returns false when the refcount currently equals @u or when
 * @page is a fake head, and true in every other case.
 */
static __always_inline bool page_count_writable(const struct page *page, int u)
{
        if (!static_branch_unlikely(&hugetlb_optimize_vmemmap_key))
                return true;

        /*
         * The refcount check is ordered before the fake-head check to prevent
         * the following race:
         *   CPU 1 (HVO)                     CPU 2 (speculative PFN walker)
         *
         *   page_ref_freeze()
         *   synchronize_rcu()
         *                                   rcu_read_lock()
         *                                   page_is_fake_head() is false
         *   vmemmap_remap_pte()
         *   XXX: struct page[] becomes r/o
         *
         *   page_ref_unfreeze()
         *                                   page_ref_count() is not zero
         *
         *                                   atomic_add_unless(&page->_refcount)
         *                                   XXX: try to modify r/o struct page[]
         *
         * The refcount check also prevents modification attempts to other (r/o)
         * tail pages that are not fake heads.
         */
        if (atomic_read_acquire(&page->_refcount) == u)
                return false;

        /* Writable only if @page is not a fake head. */
        return page_fixed_fake_head(page) == page;
}
#else
/*
 * !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP variant: no fake-head lookup is
 * performed, @page is always returned unchanged.
 */
static inline const struct page *page_fixed_fake_head(const struct page *page)
{
        return page;
}

/*
 * !CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP variant: unconditionally reports the
 * refcount of @page as writable; @u is ignored.
 */
static inline bool page_count_writable(const struct page *page, int u)
{
        return true;
}
#endif

/*
 * Non-zero when @page is a fake head, i.e. when page_fixed_fake_head()
 * resolves it to a different struct page.
 */
static __always_inline int page_is_fake_head(const struct page *page)
{
        const struct page *fixed = page_fixed_fake_head(page);

        return fixed != page;
}

/*
 * Return the address of the head page of @page as an unsigned long.
 *
 * When bit 0 of page->compound_head is set, the remaining bits are the
 * address of the head page.  When it is clear, @page is passed through
 * page_fixed_fake_head() and that result is returned.
 */
static __always_inline unsigned long _compound_head(const struct page *page)
{
        const unsigned long word = READ_ONCE(page->compound_head);

        if (likely(!(word & 1)))
                return (unsigned long)page_fixed_fake_head(page);

        return word - 1;
}

/* Typed wrapper: the result has the same pointer type as the @page argument. */
#define compound_head(page)        ((typeof(page))_compound_head(page))

/**
 * page_folio - Converts from page to folio.
 * @p: The page.
 *
 * Every page is part of a folio.  This function cannot be called on a
 * NULL pointer.
 *
 * The constness of @p is preserved: a const struct page pointer yields a
 * const struct folio pointer.
 *
 * Context: No reference, nor lock is required on @p.  If the caller
 * does not hold a reference, this call may race with a folio split, so
 * it should re-check the folio still contains this page after gaining
 * a reference on the folio.
 * Return: The folio which contains this page.
 */
#define page_folio(p)                (_Generic((p),                                \
        const struct page *:        (const struct folio *)_compound_head(p), \
        struct page *:                (struct folio *)_compound_head(p)))

/**
 * folio_page - Return a page from a folio.
 * @folio: The folio.
 * @n: The page number to return.
 *
 * @n is relative to the start of the folio.  This function does not
 * check that the page number lies within @folio; the caller is presumed
 * to have a reference to the page.
 *
 * Return: The result of nth_page() applied to the first page of @folio
 * and @n.
 */
#define folio_page(folio, n)        nth_page(&(folio)->page, n)

/*
 * Non-zero when @page is a tail page: either bit 0 of its compound_head word
 * is set, or the page is a fake head.
 */
static __always_inline int PageTail(const struct page *page)
{
        if (READ_ONCE(page->compound_head) & 1)
                return 1;

        return page_is_fake_head(page) != 0;
}

/*
 * Non-zero when @page belongs to a compound page: it either has PG_head set
 * or has bit 0 of its compound_head word set.
 */
static __always_inline int PageCompound(const struct page *page)
{
        if (test_bit(PG_head, &page->flags))
                return 1;

        return (READ_ONCE(page->compound_head) & 1) != 0;
}

#define        PAGE_POISON_PATTERN        -1l
/* Non-zero when the whole flags word of @page equals PAGE_POISON_PATTERN. */
static inline int PagePoisoned(const struct page *page)
{
        const unsigned long flags = READ_ONCE(page->flags);

        return flags == PAGE_POISON_PATTERN;
}

/*
 * page_init_poison() is only implemented for CONFIG_DEBUG_VM kernels (the
 * definition lives outside this header); otherwise it is an empty inline
 * stub.  NOTE(review): presumably the debug version fills @size bytes at
 * @page with PAGE_POISON_PATTERN so PagePoisoned() can detect uninitialized
 * struct pages -- confirm against the implementation.
 */
#ifdef CONFIG_DEBUG_VM
void page_init_poison(struct page *page, size_t size);
#else
static inline void page_init_poison(struct page *page, size_t size)
{
}
#endif

/*
 * Read-only access to a flags word of @folio.
 *
 * @folio: the folio; its embedded page must not have bit 0 of
 *         compound_head set.
 * @n: index of the page within the folio whose flags word is wanted; a
 *     non-zero @n requires PG_head to be set on the first page.
 *
 * Both requirements are checked with VM_BUG_ON_PGFLAGS().
 * Returns a pointer to the flags word of page @n.
 */
static const unsigned long *const_folio_flags(const struct folio *folio,
                unsigned n)
{
        const struct page *page = &folio->page;

        VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);

        return &(page + n)->flags;
}

/*
 * Writable counterpart of const_folio_flags(): returns a pointer to the
 * flags word of page @n of @folio.
 *
 * The same sanity checks apply: the folio's embedded page must not have
 * bit 0 of compound_head set, and a non-zero @n requires PG_head on the
 * first page.
 */
static unsigned long *folio_flags(struct folio *folio, unsigned n)
{
        struct page *page = &folio->page;

        VM_BUG_ON_PGFLAGS(page->compound_head & 1, page);
        VM_BUG_ON_PGFLAGS(n > 0 && !test_bit(PG_head, &page->flags), page);

        return &(page + n)->flags;
}

/*
 * Page flags policies wrt compound pages
 *
 * Each policy macro takes the page and an @enforce argument and evaluates to
 * the struct page whose flags word is to be accessed.  The accessor
 * generators in this file pass enforce=0 for plain tests and enforce=1 for
 * every operation that modifies the flag; PF_NO_TAIL and PF_NO_COMPOUND only
 * apply their VM_BUG_ON_PGFLAGS() restriction when @enforce is non-zero.
 *
 * PF_POISONED_CHECK
 *     check if this struct page poisoned/uninitialized
 *
 * PF_ANY:
 *     the page flag is relevant for small, head and tail pages.
 *
 * PF_HEAD:
 *     for compound page all operations related to the page flag applied to
 *     head page.
 *
 * PF_NO_TAIL:
 *     modifications of the page flag must be done on small or head pages,
 *     checks can be done on tail pages too.
 *
 * PF_NO_COMPOUND:
 *     the page flag is not relevant for compound pages.
 *
 * PF_SECOND:
 *     the page flag is stored in the first tail page.
 */
/* Asserts that @page is not poisoned, then evaluates to @page itself */
#define PF_POISONED_CHECK(page) ({                                        \
                VM_BUG_ON_PGFLAGS(PagePoisoned(page), page);                \
                page; })
/* Use @page as given */
#define PF_ANY(page, enforce)        PF_POISONED_CHECK(page)
/* Redirect to the head page */
#define PF_HEAD(page, enforce)        PF_POISONED_CHECK(compound_head(page))
/* Redirect to the head page; with @enforce set, a tail page is a bug */
#define PF_NO_TAIL(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page);        \
                PF_POISONED_CHECK(compound_head(page)); })
/* Use @page as given; with @enforce set, any compound page is a bug */
#define PF_NO_COMPOUND(page, enforce) ({                                \
                VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page);        \
                PF_POISONED_CHECK(page); })
/* @page must be a head page (regardless of @enforce); use page[1] */
#define PF_SECOND(page, enforce) ({                                        \
                VM_BUG_ON_PGFLAGS(!PageHead(page), page);                \
                PF_POISONED_CHECK(&page[1]); })

/*
 * Which page is the flag stored in
 *
 * One constant per PF_* policy, giving the index of the page within the
 * folio that holds the flag.  The *PAGEFLAG() generators paste "FOLIO_" in
 * front of the policy name to pick the value handed to the folio accessors,
 * which pass it on to folio_flags()/const_folio_flags().
 */
#define FOLIO_PF_ANY                0
#define FOLIO_PF_HEAD                0
#define FOLIO_PF_NO_TAIL        0
#define FOLIO_PF_NO_COMPOUND        0
#define FOLIO_PF_SECOND                1

/* Readable names for the same two indices, for direct FOLIO_*_FLAG() use */
#define FOLIO_HEAD_PAGE                0
#define FOLIO_SECOND_PAGE        1

/*
 * Macros to create function definitions for page flags
 *
 * Each FOLIO_*_FLAG(name, page) macro below defines one inline accessor for
 * the PG_<name> bit.  @page is the index of the page within the folio whose
 * flags word holds the bit (FOLIO_HEAD_PAGE or FOLIO_SECOND_PAGE); it is
 * passed to folio_flags()/const_folio_flags().
 */
/* folio_test_<name>(): test_bit() on a const folio */
#define FOLIO_TEST_FLAG(name, page)                                        \
static __always_inline bool folio_test_##name(const struct folio *folio) \
{ return test_bit(PG_##name, const_folio_flags(folio, page)); }

/* folio_set_<name>(): set_bit() */
#define FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void folio_set_##name(struct folio *folio)        \
{ set_bit(PG_##name, folio_flags(folio, page)); }

/* folio_clear_<name>(): clear_bit() */
#define FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void folio_clear_##name(struct folio *folio)        \
{ clear_bit(PG_##name, folio_flags(folio, page)); }

/* __folio_set_<name>(): __set_bit() variant */
#define __FOLIO_SET_FLAG(name, page)                                        \
static __always_inline void __folio_set_##name(struct folio *folio)        \
{ __set_bit(PG_##name, folio_flags(folio, page)); }

/* __folio_clear_<name>(): __clear_bit() variant */
#define __FOLIO_CLEAR_FLAG(name, page)                                        \
static __always_inline void __folio_clear_##name(struct folio *folio)        \
{ __clear_bit(PG_##name, folio_flags(folio, page)); }

/* folio_test_set_<name>(): test_and_set_bit(), returns the old bit value */
#define FOLIO_TEST_SET_FLAG(name, page)                                        \
static __always_inline bool folio_test_set_##name(struct folio *folio)        \
{ return test_and_set_bit(PG_##name, folio_flags(folio, page)); }

/* folio_test_clear_<name>(): test_and_clear_bit(), returns the old bit value */
#define FOLIO_TEST_CLEAR_FLAG(name, page)                                \
static __always_inline bool folio_test_clear_##name(struct folio *folio) \
{ return test_and_clear_bit(PG_##name, folio_flags(folio, page)); }

/* Convenience bundle: folio_test_<name>(), folio_set_<name>(), folio_clear_<name>() */
#define FOLIO_FLAG(name, page)                                                \
FOLIO_TEST_FLAG(name, page)                                                \
FOLIO_SET_FLAG(name, page)                                                \
FOLIO_CLEAR_FLAG(name, page)

/*
 * Generators that define BOTH the folio accessor (via the matching
 * FOLIO_*_FLAG() macro, using FOLIO_<policy> as the page index) and the
 * legacy struct page accessor.
 *
 * @uname:  CamelCase flag name used in the page accessor, e.g. Page<uname>().
 * @lname:  lower-case flag name used for PG_<lname> and folio_*_<lname>().
 * @policy: one of the PF_* policy macros; it selects the struct page whose
 *          flags word is accessed.  Tests call the policy with enforce=0,
 *          all modifying operations with enforce=1.
 */
/* Page<uname>() + folio_test_<lname>() */
#define TESTPAGEFLAG(uname, lname, policy)                                \
FOLIO_TEST_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline int Page##uname(const struct page *page)                \
{ return test_bit(PG_##lname, &policy(page, 0)->flags); }

/* SetPage<uname>() + folio_set_<lname>() */
#define SETPAGEFLAG(uname, lname, policy)                                \
FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void SetPage##uname(struct page *page)                \
{ set_bit(PG_##lname, &policy(page, 1)->flags); }

/* ClearPage<uname>() + folio_clear_<lname>() */
#define CLEARPAGEFLAG(uname, lname, policy)                                \
FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void ClearPage##uname(struct page *page)                \
{ clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* __SetPage<uname>() + __folio_set_<lname>(), using __set_bit() */
#define __SETPAGEFLAG(uname, lname, policy)                                \
__FOLIO_SET_FLAG(lname, FOLIO_##policy)                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{ __set_bit(PG_##lname, &policy(page, 1)->flags); }

/* __ClearPage<uname>() + __folio_clear_<lname>(), using __clear_bit() */
#define __CLEARPAGEFLAG(uname, lname, policy)                                \
__FOLIO_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline void __ClearPage##uname(struct page *page)        \
{ __clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestSetPage<uname>() + folio_test_set_<lname>() */
#define TESTSETFLAG(uname, lname, policy)                                \
FOLIO_TEST_SET_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestSetPage##uname(struct page *page)        \
{ return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); }

/* TestClearPage<uname>() + folio_test_clear_<lname>() */
#define TESTCLEARFLAG(uname, lname, policy)                                \
FOLIO_TEST_CLEAR_FLAG(lname, FOLIO_##policy)                                \
static __always_inline int TestClearPage##uname(struct page *page)        \
{ return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); }

/* Bundle: test, set and clear accessors */
#define PAGEFLAG(uname, lname, policy)                                        \
        TESTPAGEFLAG(uname, lname, policy)                                \
        SETPAGEFLAG(uname, lname, policy)                                \
        CLEARPAGEFLAG(uname, lname, policy)

/* Bundle: test plus the __set/__clear variants */
#define __PAGEFLAG(uname, lname, policy)                                \
        TESTPAGEFLAG(uname, lname, policy)                                \
        __SETPAGEFLAG(uname, lname, policy)                                \
        __CLEARPAGEFLAG(uname, lname, policy)

/* Bundle: test-and-set plus test-and-clear */
#define TESTSCFLAG(uname, lname, policy)                                \
        TESTSETFLAG(uname, lname, policy)                                \
        TESTCLEARFLAG(uname, lname, policy)

/*
 * Stub generators, used in this file for flags that do not exist in the
 * current configuration.  They define the same accessor names as the real
 * generators above, but tests always report false/0 and set/clear
 * operations do nothing.
 */
#define FOLIO_TEST_FLAG_FALSE(name)                                        \
static inline bool folio_test_##name(const struct folio *folio)                \
{ return false; }
#define FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void folio_set_##name(struct folio *folio) { }
#define FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void folio_clear_##name(struct folio *folio) { }
#define __FOLIO_SET_FLAG_NOOP(name)                                        \
static inline void __folio_set_##name(struct folio *folio) { }
#define __FOLIO_CLEAR_FLAG_NOOP(name)                                        \
static inline void __folio_clear_##name(struct folio *folio) { }
#define FOLIO_TEST_SET_FLAG_FALSE(name)                                        \
static inline bool folio_test_set_##name(struct folio *folio)                \
{ return false; }
#define FOLIO_TEST_CLEAR_FLAG_FALSE(name)                                \
static inline bool folio_test_clear_##name(struct folio *folio)                \
{ return false; }

/* Stub counterpart of FOLIO_FLAG(): test, set and clear */
#define FOLIO_FLAG_FALSE(name)                                                \
FOLIO_TEST_FLAG_FALSE(name)                                                \
FOLIO_SET_FLAG_NOOP(name)                                                \
FOLIO_CLEAR_FLAG_NOOP(name)

/* The macros below stub out both the folio and the struct page accessor */
#define TESTPAGEFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_FLAG_FALSE(lname)                                                \
static inline int Page##uname(const struct page *page) { return 0; }

#define SETPAGEFLAG_NOOP(uname, lname)                                        \
FOLIO_SET_FLAG_NOOP(lname)                                                \
static inline void SetPage##uname(struct page *page) {  }

#define CLEARPAGEFLAG_NOOP(uname, lname)                                \
FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void ClearPage##uname(struct page *page) {  }

#define __CLEARPAGEFLAG_NOOP(uname, lname)                                \
__FOLIO_CLEAR_FLAG_NOOP(lname)                                                \
static inline void __ClearPage##uname(struct page *page) {  }

#define TESTSETFLAG_FALSE(uname, lname)                                        \
FOLIO_TEST_SET_FLAG_FALSE(lname)                                        \
static inline int TestSetPage##uname(struct page *page) { return 0; }

#define TESTCLEARFLAG_FALSE(uname, lname)                                \
FOLIO_TEST_CLEAR_FLAG_FALSE(lname)                                        \
static inline int TestClearPage##uname(struct page *page) { return 0; }

/* Stub counterpart of PAGEFLAG() */
#define PAGEFLAG_FALSE(uname, lname) TESTPAGEFLAG_FALSE(uname, lname)        \
        SETPAGEFLAG_NOOP(uname, lname) CLEARPAGEFLAG_NOOP(uname, lname)

/* Stub counterpart of TESTSCFLAG() */
#define TESTSCFLAG_FALSE(uname, lname)                                        \
        TESTSETFLAG_FALSE(uname, lname) TESTCLEARFLAG_FALSE(uname, lname)

/*
 * Accessor instantiations.  Each line expands to one or more inline
 * functions; indented lines add further accessors for the flag named on the
 * preceding unindented line.
 */
/* PG_locked: test plus __set/__clear only (no set_bit()/clear_bit() forms) */
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
/* PG_waiters, PG_referenced: folio accessors only, on the head page */
FOLIO_FLAG(waiters, FOLIO_HEAD_PAGE)
FOLIO_FLAG(referenced, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(referenced, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(referenced, FOLIO_HEAD_PAGE)
/* PG_dirty, PG_lru: page and folio accessors, always applied to the head page */
PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD)
        __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD)
PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD)
        TESTCLEARFLAG(LRU, lru, PF_HEAD)
/* PG_active: folio accessors only */
FOLIO_FLAG(active, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(active, FOLIO_HEAD_PAGE)
PAGEFLAG(Workingset, workingset, PF_HEAD)
        TESTCLEARFLAG(Workingset, workingset, PF_HEAD)
PAGEFLAG(Checked, checked, PF_NO_COMPOUND)           /* Used by some filesystems */

/*
 * Xen
 *
 * All four flags alias other bits (see enum pageflags) and use the
 * PF_NO_COMPOUND policy.  Each PAGEFLAG() line defines the test/set/clear
 * accessors for both struct page and folio; the indented lines add the
 * test-and-set/test-and-clear forms.
 *
 * The generator macros expand to complete function definitions, so the
 * invocations must not be followed by a ';' -- that would leave an empty
 * declaration at file scope.
 */
PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND)
        TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND)
PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND)
PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND)
PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)
        TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND)

/* PG_reserved: page and folio accessors plus the __set/__clear variants */
PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
        __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND)
/* PG_swapbacked: folio accessors only, on the head page */
FOLIO_FLAG(swapbacked, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(swapbacked, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(swapbacked, FOLIO_HEAD_PAGE)

/*
 * Private page markings that may be used by the filesystem that owns the page
 * for its own purposes.
 * - PG_private and PG_private_2 cause release_folio() and co to be invoked
 */
PAGEFLAG(Private, private, PF_ANY)
FOLIO_FLAG(private_2, FOLIO_HEAD_PAGE)

/* owner_2 can be set on tail pages for anon memory */
FOLIO_FLAG(owner_2, FOLIO_HEAD_PAGE)

/*
 * Only the test, test-and-set and test-and-clear operations exist for
 * PG_writeback.  The unconditional set/clear operators are risky: they
 * bypass page accounting.
 */
TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL)
        TESTSCFLAG(Writeback, writeback, PF_NO_TAIL)
FOLIO_FLAG(mappedtodisk, FOLIO_HEAD_PAGE)

/* PG_readahead is only used for reads; PG_reclaim is only for writes */
PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL)
        TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL)
FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE)

/* PG_dropbehind: folio accessors only, on the head page */
FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE)
        __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE)

/*
 * "HighMem" is not a bit in page->flags: with CONFIG_HIGHMEM it is derived
 * from the page's/folio's zone number, without it the stub accessors below
 * always report false.
 */
#ifdef CONFIG_HIGHMEM
/*
 * Must use a macro here due to header dependency issues. page_zone() is not
 * available at this point.
 */
#define PageHighMem(__p) is_highmem_idx(page_zonenum(__p))
#define folio_test_highmem(__f)        is_highmem_idx(folio_zonenum(__f))
#else
PAGEFLAG_FALSE(HighMem, highmem)
#endif

#ifdef CONFIG_SWAP
/*
 * PG_swapcache aliases PG_owner_priv_1, which has other users as well, so
 * the bit only counts as "in swap cache" on a folio that is also
 * swap-backed.  The bit is read from the head page's flags word.
 */
static __always_inline bool folio_test_swapcache(const struct folio *folio)
{
        if (!folio_test_swapbacked(folio))
                return false;

        return test_bit(PG_swapcache, const_folio_flags(folio, FOLIO_HEAD_PAGE));
}

/* Setting and clearing touch only the PG_swapcache bit itself. */
FOLIO_SET_FLAG(swapcache, FOLIO_HEAD_PAGE)
FOLIO_CLEAR_FLAG(swapcache, FOLIO_HEAD_PAGE)
#else
/* Without CONFIG_SWAP: test is always false, set/clear do nothing. */
FOLIO_FLAG_FALSE(swapcache)
#endif

/* PG_unevictable: folio accessors only, on the head page */
FOLIO_FLAG(unevictable, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(unevictable, FOLIO_HEAD_PAGE)

/*
 * PG_mlocked only exists with CONFIG_MMU (see enum pageflags); without it
 * the same set of folio accessors is provided as stubs.
 */
#ifdef CONFIG_MMU
FOLIO_FLAG(mlocked, FOLIO_HEAD_PAGE)
        __FOLIO_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
        FOLIO_TEST_CLEAR_FLAG(mlocked, FOLIO_HEAD_PAGE)
        FOLIO_TEST_SET_FLAG(mlocked, FOLIO_HEAD_PAGE)
#else
FOLIO_FLAG_FALSE(mlocked)
        __FOLIO_CLEAR_FLAG_NOOP(mlocked)
        FOLIO_TEST_CLEAR_FLAG_FALSE(mlocked)
        FOLIO_TEST_SET_FLAG_FALSE(mlocked)
#endif

/*
 * PG_hwpoison only exists with CONFIG_MEMORY_FAILURE.  __PG_HWPOISON is the
 * bit mask for the flag, or 0 when the flag does not exist, so it can be
 * used in mask expressions in either configuration.
 */
#ifdef CONFIG_MEMORY_FAILURE
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
#define __PG_HWPOISON (1UL << PG_hwpoison)
#else
/* Only the test/set/clear stubs are provided here, no test-and-set/clear */
PAGEFLAG_FALSE(HWPoison, hwpoison)
#define __PG_HWPOISON 0
#endif

/*
 * PG_young/PG_idle accessors.  Three cases:
 *  - CONFIG_PAGE_IDLE_FLAG && CONFIG_64BIT: real folio accessors below.
 *  - CONFIG_PAGE_IDLE_FLAG && !CONFIG_64BIT: nothing is defined here.
 *  - !CONFIG_PAGE_IDLE_FLAG: stub accessors.
 * Note that "young" has no clear accessor, only test, set and
 * test-and-clear.
 */
#ifdef CONFIG_PAGE_IDLE_FLAG
#ifdef CONFIG_64BIT
FOLIO_TEST_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_SET_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_TEST_CLEAR_FLAG(young, FOLIO_HEAD_PAGE)
FOLIO_FLAG(idle, FOLIO_HEAD_PAGE)
#endif
/* See page_idle.h for !64BIT workaround */
#else /* !CONFIG_PAGE_IDLE_FLAG */
FOLIO_FLAG_FALSE(young)
FOLIO_TEST_CLEAR_FLAG_FALSE(young)
FOLIO_FLAG_FALSE(idle)
#endif

/*
 * PageReported() is used to track reported free pages within the Buddy
 * allocator. We can use the non-atomic versions of the set and clear
 * operations (__SetPageReported()/__ClearPageReported(), as generated by
 * __PAGEFLAG()) as both should be shielded with the zone lock to prevent
 * any possible races on the setting or clearing of the bit.
 */
__PAGEFLAG(Reported, reported, PF_NO_COMPOUND)

/*
 * PG_vmemmap_self_hosted (an alias of PG_owner_priv_1) only exists with
 * CONFIG_MEMORY_HOTPLUG; otherwise the accessors are stubs.
 */
#ifdef CONFIG_MEMORY_HOTPLUG
PAGEFLAG(VmemmapSelfHosted, vmemmap_self_hosted, PF_ANY)
#else
PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted)
#endif

/*
 * On an anonymous folio mapped into a user virtual memory area,
 * folio->mapping points to its anon_vma, not to a struct address_space;
 * with the PAGE_MAPPING_ANON bit set to distinguish it.  See rmap.h.
 *
 * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
 * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON
 * bit; and then folio->mapping points, not to an anon_vma, but to a private
 * structure which KSM associates with that merged page.  See ksm.h.
 *
 * PAGE_MAPPING_MOVABLE without PAGE_MAPPING_ANON is used for non-LRU movable
 * pages, and then folio->mapping points to a struct movable_operations.
 *
 * Please note that, confusingly, "folio_mapping" refers to the inode
 * address_space which maps the folio from disk; whereas "folio_mapped"
 * refers to user virtual address space into which the folio is mapped.
 *
 * For slab pages, since slab reuses the bits in struct page to store its
 * internal states, the folio->mapping does not exist as such, nor do
 * these flags below.  So in order to avoid testing non-existent bits,
 * please make sure that folio_test_slab(folio) actually evaluates to
 * false before calling the following functions (e.g., folio_test_anon).
 * See mm/slab.h.
 */
#define PAGE_MAPPING_ANON        0x1
#define PAGE_MAPPING_MOVABLE        0x2
#define PAGE_MAPPING_KSM        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)
#define PAGE_MAPPING_FLAGS        (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE)

/* Report whether any PAGE_MAPPING_FLAGS bit is set in folio->mapping. */
static __always_inline bool folio_mapping_flags(const struct folio *folio)
{
        unsigned long mapping_word = (unsigned long)folio->mapping;

        return (mapping_word & PAGE_MAPPING_FLAGS) != 0;
}

/* Report whether any PAGE_MAPPING_FLAGS bit is set in page->mapping. */
static __always_inline bool PageMappingFlags(const struct page *page)
{
        unsigned long mapping_word = (unsigned long)page->mapping;

        return (mapping_word & PAGE_MAPPING_FLAGS) != 0;
}

/* True when the PAGE_MAPPING_ANON bit is set in folio->mapping. */
static __always_inline bool folio_test_anon(const struct folio *folio)
{
        unsigned long mapping_word = (unsigned long)folio->mapping;

        return (mapping_word & PAGE_MAPPING_ANON) != 0;
}

/*
 * True when the mapping word of page_folio(page) has PAGE_MAPPING_ANON set
 * and PAGE_MAPPING_MOVABLE clear, i.e. it is not the PAGE_MAPPING_KSM
 * encoding.
 */
static __always_inline bool PageAnonNotKsm(const struct page *page)
{
        const struct folio *folio = page_folio(page);
        unsigned long low_bits;

        low_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;
        return low_bits == PAGE_MAPPING_ANON;
}

/* Page-based wrapper: applies folio_test_anon() to page_folio(page). */
static __always_inline bool PageAnon(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_anon(folio);
}

/*
 * True when, of the PAGE_MAPPING_FLAGS bits in folio->mapping, exactly
 * PAGE_MAPPING_MOVABLE is set (PAGE_MAPPING_ANON clear).
 */
static __always_inline bool __folio_test_movable(const struct folio *folio)
{
        unsigned long low_bits;

        low_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;
        return low_bits == PAGE_MAPPING_MOVABLE;
}

/*
 * True when, of the PAGE_MAPPING_FLAGS bits in page->mapping, exactly
 * PAGE_MAPPING_MOVABLE is set (PAGE_MAPPING_ANON clear).
 */
static __always_inline bool __PageMovable(const struct page *page)
{
        unsigned long low_bits;

        low_bits = (unsigned long)page->mapping & PAGE_MAPPING_FLAGS;
        return low_bits == PAGE_MAPPING_MOVABLE;
}

#ifdef CONFIG_KSM
/*
 * A KSM page is one of those write-protected "shared pages" or "merged pages"
 * which KSM maps into multiple mms, wherever identical anonymous page content
 * is found in VM_MERGEABLE vmas.  It's a PageAnon page, pointing not to any
 * anon_vma, but to that page's node of the stable tree.
 */
static __always_inline bool folio_test_ksm(const struct folio *folio)
{
        unsigned long low_bits;

        /* KSM is encoded as both mapping flag bits being set at once. */
        low_bits = (unsigned long)folio->mapping & PAGE_MAPPING_FLAGS;
        return low_bits == PAGE_MAPPING_KSM;
}
#else
FOLIO_TEST_FLAG_FALSE(ksm)
#endif

u64 stable_page_flags(const struct page *page);

/**
 * folio_xor_flags_has_waiters - Change some folio flags.
 * @folio: The folio.
 * @mask: Bits set in this word will be changed.
 *
 * This must only be used for flags which are changed with the folio
 * lock held.  For example, it is unsafe to use for PG_dirty as that
 * can be set without the folio lock held.  It can also only be used
 * on flags which are in the range 0-6 as some of the implementations
 * only affect those bits.
 *
 * Return: Whether there are tasks waiting on the folio.
 */
static inline bool folio_xor_flags_has_waiters(struct folio *folio,
                unsigned long mask)
{
        /* Operate on the flags word of the folio's first page (index 0). */
        unsigned long *flags = folio_flags(folio, 0);

        return xor_unlock_is_negative_byte(mask, flags);
}

/**
 * folio_test_uptodate - Is this folio up to date?
 * @folio: The folio.
 *
 * The uptodate flag is set on a folio when every byte in the folio is
 * at least as new as the corresponding bytes on storage.  Anonymous
 * and CoW folios are always uptodate.  If the folio is not uptodate,
 * some of the bytes in it may be; see the is_partially_uptodate()
 * address_space operation.
 */
static inline bool folio_test_uptodate(const struct folio *folio)
{
        bool ret = test_bit(PG_uptodate, const_folio_flags(folio, 0));
        /*
         * Must ensure that the data we read out of the folio is loaded
         * _after_ we've loaded folio->flags to check the uptodate bit.
         * We can skip the barrier if the folio is not uptodate, because
         * we wouldn't be reading anything from it.
         *
         * See folio_mark_uptodate() for the other side of the story.
         */
        if (ret)
                smp_rmb();

        return ret;
}

/* Page-based wrapper: applies folio_test_uptodate() to page_folio(page). */
static inline bool PageUptodate(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_test_uptodate(folio);
}

/*
 * Same sequence as folio_mark_uptodate(), but PG_uptodate is set with
 * __set_bit() instead of set_bit().
 */
static __always_inline void __folio_mark_uptodate(struct folio *folio)
{
        /* Order earlier stores to the folio before the flag store below. */
        smp_wmb();
        __set_bit(PG_uptodate, folio_flags(folio, 0));
}

/*
 * Set PG_uptodate on the folio's first flags word, after a write barrier.
 * The read side of this ordering is the smp_rmb() in folio_test_uptodate().
 */
static __always_inline void folio_mark_uptodate(struct folio *folio)
{
        /*
         * Memory barrier must be issued before setting the PG_uptodate bit,
         * so that all previous stores issued in order to bring the folio
         * uptodate are actually visible before folio_test_uptodate becomes true.
         */
        smp_wmb();
        set_bit(PG_uptodate, folio_flags(folio, 0));
}

/* Page-based wrapper around __folio_mark_uptodate(). */
static __always_inline void __SetPageUptodate(struct page *page)
{
        /* The page pointer is reinterpreted directly as a folio pointer. */
        struct folio *folio = (struct folio *)page;

        __folio_mark_uptodate(folio);
}

/* Page-based wrapper around folio_mark_uptodate(). */
static __always_inline void SetPageUptodate(struct page *page)
{
        /* The page pointer is reinterpreted directly as a folio pointer. */
        struct folio *folio = (struct folio *)page;

        folio_mark_uptodate(folio);
}

CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL)

void __folio_start_writeback(struct folio *folio, bool keep_write);
void set_page_writeback(struct page *page);

#define folio_start_writeback(folio)                        \
        __folio_start_writeback(folio, false)
#define folio_start_writeback_keepwrite(folio)        \
        __folio_start_writeback(folio, true)

/* Test PG_head in the flags word selected by FOLIO_PF_ANY. */
static __always_inline bool folio_test_head(const struct folio *folio)
{
        const unsigned long *flags = const_folio_flags(folio, FOLIO_PF_ANY);

        return test_bit(PG_head, flags);
}

/*
 * Returns 1 when PG_head is set in page->flags and page_is_fake_head()
 * does not flag the page, 0 otherwise.
 */
static __always_inline int PageHead(const struct page *page)
{
        PF_POISONED_CHECK(page);

        if (!test_bit(PG_head, &page->flags))
                return 0;

        return !page_is_fake_head(page);
}

__SETPAGEFLAG(Head, head, PF_ANY)
__CLEARPAGEFLAG(Head, head, PF_ANY)
CLEARPAGEFLAG(Head, head, PF_ANY)

/**
 * folio_test_large() - Does this folio contain more than one page?
 * @folio: The folio to test.
 *
 * Return: True if the folio is larger than one page.
 */
static inline bool folio_test_large(const struct folio *folio)
{
        /* "Large" is defined here as exactly what folio_test_head() reports. */
        bool has_head_flag = folio_test_head(folio);

        return has_head_flag;
}

/* Record @head in @page->compound_head, stored as the head address plus one. */
static __always_inline void set_compound_head(struct page *page, struct page *head)
{
        unsigned long encoded = (unsigned long)head + 1;

        WRITE_ONCE(page->compound_head, encoded);
}

/* Reset @page->compound_head to 0, undoing set_compound_head(). */
static __always_inline void clear_compound_head(struct page *page)
{
        WRITE_ONCE(page->compound_head, 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Clear the head flag of @page.  Must only be called on a page for which
 * PageHead() is true; anything else triggers BUG_ON().
 */
static inline void ClearPageCompound(struct page *page)
{
        BUG_ON(!PageHead(page));
        ClearPageHead(page);
}
FOLIO_FLAG(large_rmappable, FOLIO_SECOND_PAGE)
FOLIO_FLAG(partially_mapped, FOLIO_SECOND_PAGE)
#else
FOLIO_FLAG_FALSE(large_rmappable)
FOLIO_FLAG_FALSE(partially_mapped)
#endif

#define PG_head_mask ((1UL << PG_head))

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * PageHuge() only returns true for hugetlbfs pages, but not for
 * normal or transparent huge pages.
 *
 * PageTransHuge() returns true for both transparent huge and
 * hugetlbfs pages, but not normal pages. PageTransHuge() can only be
 * called in the core VM paths where hugetlbfs pages can't exist.
 */
static inline int PageTransHuge(const struct page *page)
{
        /* Passing a tail page is a caller bug, caught by VM_BUG_ON_PAGE(). */
        VM_BUG_ON_PAGE(PageTail(page), page);
        /* The result is simply whether @page is a head page. */
        return PageHead(page);
}

/*
 * PageTransCompound returns true for both transparent huge pages
 * and hugetlbfs pages, so it should only be called when it's known
 * that hugetlbfs pages aren't involved.
 */
static inline int PageTransCompound(const struct page *page)
{
        /* No THP-specific test is done: this is exactly PageCompound(). */
        return PageCompound(page);
}
#else
TESTPAGEFLAG_FALSE(TransHuge, transhuge)
TESTPAGEFLAG_FALSE(TransCompound, transcompound)
#endif

#if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * PageHasHWPoisoned indicates that at least one subpage is hwpoisoned in the
 * compound page.
 *
 * This flag is set by hwpoison handler.  Cleared by THP split or free page.
 */
FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE)
#else
FOLIO_FLAG_FALSE(has_hwpoisoned)
#endif

/*
 * For pages that do not use mapcount, page_type may be used.
 * The low 24 bits of pagetype may be used for your own purposes, as long
 * as you are careful to not affect the top 8 bits.  The low bits of
 * pagetype will be overwritten when you clear the page_type from the page.
 */
enum pagetype {
        /* 0x00-0x7f are positive numbers, ie mapcount */
        /* Reserve 0x80-0xef for mapcount overflow. */
        PGTY_buddy                = 0xf0,
        PGTY_offline                = 0xf1,
        PGTY_table                = 0xf2,
        PGTY_guard                = 0xf3,
        PGTY_hugetlb                = 0xf4,
        PGTY_slab                = 0xf5,
        PGTY_zsmalloc                = 0xf6,
        PGTY_unaccepted                = 0xf7,
        PGTY_large_kmalloc        = 0xf8,

        PGTY_mapcount_underflow = 0xff
};

/**
 * page_type_has_type - Test whether a page_type word holds a page type.
 * @page_type: The page_type value, interpreted as a signed int.
 *
 * The type occupies the top 8 bits of the word.  The comparison is signed,
 * so the result is true exactly for values whose top byte lies in
 * 0x80-0xfe, and false for non-negative values and for a top byte of
 * PGTY_mapcount_underflow (0xff).
 *
 * Return: True if @page_type is below PGTY_mapcount_underflow << 24.
 */
static inline bool page_type_has_type(int page_type)
{
        /*
         * Do the shift in unsigned arithmetic: 0xff << 24 does not fit in a
         * signed int, and overflowing a signed left shift is undefined
         * behaviour in ISO C.  The conversion back to int yields the same
         * negative limit the signed shift produced in practice.
         */
        return page_type < (int)((unsigned int)PGTY_mapcount_underflow << 24);
}

/* This takes a mapcount which is one more than page->_mapcount */
static inline bool page_mapcount_is_type(unsigned int mapcount)
{
        /* Undo the "+ 1" bias to recover the raw page->_mapcount word. */
        unsigned int raw = mapcount - 1;

        return page_type_has_type(raw);
}

/*
 * Apply page_mapcount_is_type() to the page's page_type word, which is
 * read inside data_race().
 */
static inline bool page_has_type(const struct page *page)
{
        unsigned int type_word = data_race(page->page_type);

        return page_mapcount_is_type(type_word);
}

/*
 * FOLIO_TYPE_OPS - generate the folio accessors for one page type.
 * @lname: suffix of the PGTY_* enumerator to use.
 * @fname: suffix of the generated function names.
 *
 * Expands to three functions:
 *  - folio_test_<fname>(): true when the top 8 bits of
 *    folio->page.page_type equal PGTY_<lname>.
 *  - __folio_set_<fname>(): does nothing if the type is already set;
 *    otherwise stores PGTY_<lname> << 24 into page_type, after a
 *    VM_BUG_ON_FOLIO() check that page_type currently holds UINT_MAX.
 *  - __folio_clear_<fname>(): does nothing if page_type is already
 *    UINT_MAX; otherwise resets page_type to UINT_MAX, after a
 *    VM_BUG_ON_FOLIO() check that the folio currently has this type.
 */
#define FOLIO_TYPE_OPS(lname, fname)                                        \
static __always_inline bool folio_test_##fname(const struct folio *folio) \
{                                                                        \
        return data_race(folio->page.page_type >> 24) == PGTY_##lname;        \
}                                                                        \
static __always_inline void __folio_set_##fname(struct folio *folio)        \
{                                                                        \
        if (folio_test_##fname(folio))                                        \
                return;                                                        \
        VM_BUG_ON_FOLIO(data_race(folio->page.page_type) != UINT_MAX,        \
                        folio);                                                \
        folio->page.page_type = (unsigned int)PGTY_##lname << 24;        \
}                                                                        \
static __always_inline void __folio_clear_##fname(struct folio *folio)        \
{                                                                        \
        if (folio->page.page_type == UINT_MAX)                                \
                return;                                                        \
        VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio);                \
        folio->page.page_type = UINT_MAX;                                \
}

/*
 * PAGE_TYPE_OPS - generate both folio and page accessors for one page type.
 * @uname: suffix of the generated Page*() function names.
 * @lname: suffix of the PGTY_* enumerator to use.
 * @fname: suffix passed through to FOLIO_TYPE_OPS() for the folio variants.
 *
 * In addition to everything FOLIO_TYPE_OPS(lname, fname) emits, expands to:
 *  - Page<uname>(): nonzero when the top 8 bits of page->page_type equal
 *    PGTY_<lname>.
 *  - __SetPage<uname>(): does nothing if the type is already set;
 *    otherwise stores PGTY_<lname> << 24, after a VM_BUG_ON_PAGE() check
 *    that page_type currently holds UINT_MAX.
 *  - __ClearPage<uname>(): does nothing if page_type is already UINT_MAX;
 *    otherwise resets it to UINT_MAX, after a VM_BUG_ON_PAGE() check that
 *    the page currently has this type.
 */
#define PAGE_TYPE_OPS(uname, lname, fname)                                \
FOLIO_TYPE_OPS(lname, fname)                                                \
static __always_inline int Page##uname(const struct page *page)                \
{                                                                        \
        return data_race(page->page_type >> 24) == PGTY_##lname;        \
}                                                                        \
static __always_inline void __SetPage##uname(struct page *page)                \
{                                                                        \
        if (Page##uname(page))                                                \
                return;                                                        \
        VM_BUG_ON_PAGE(data_race(page->page_type) != UINT_MAX, page);        \
        page->page_type = (unsigned int)PGTY_##lname << 24;                \
}                                                                        \
static __always_inline void __ClearPage##uname(struct page *page)        \
{                                                                        \
        if (page->page_type == UINT_MAX)                                \
                return;                                                        \
        VM_BUG_ON_PAGE(!Page##uname(page), page);                        \
        page->page_type = UINT_MAX;                                        \
}

/*
 * PageBuddy() indicates that the page is free and in the buddy system
 * (see mm/page_alloc.c).
 */
PAGE_TYPE_OPS(Buddy, buddy, buddy)

/*
 * PageOffline() indicates that the page is logically offline although the
 * containing section is online. (e.g. inflated in a balloon driver or
 * not onlined when onlining the section).
 * The content of these pages is effectively stale. Such pages should not
 * be touched (read/write/dump/save) except by their owner.
 *
 * When a memory block gets onlined, all pages are initialized with a
 * refcount of 1 and PageOffline(). generic_online_page() will
 * take care of clearing PageOffline().
 *
 * If a driver wants to allow to offline unmovable PageOffline() pages without
 * putting them back to the buddy, it can do so via the memory notifier by
 * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the
 * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline()
 * pages (now with a reference count of zero) are treated like free (unmanaged)
 * pages, allowing the containing memory block to get offlined. A driver that
 * relies on this feature is aware that re-onlining the memory block will
 * require not giving them to the buddy via generic_online_page().
 *
 * Memory offlining code will not adjust the managed page count for any
 * PageOffline() pages, treating them like they were never exposed to the
 * buddy using generic_online_page().
 *
 * There are drivers that mark a page PageOffline() and expect there won't be
 * any further access to page content. PFN walkers that read content of random
 * pages should check PageOffline() and synchronize with such drivers using
 * page_offline_freeze()/page_offline_thaw().
 */
PAGE_TYPE_OPS(Offline, offline, offline)

extern void page_offline_freeze(void);
extern void page_offline_thaw(void);
extern void page_offline_begin(void);
extern void page_offline_end(void);

/*
 * Marks pages in use as page tables.
 */
PAGE_TYPE_OPS(Table, table, pgtable)

/*
 * Marks guardpages used with debug_pagealloc.
 */
PAGE_TYPE_OPS(Guard, guard, guard)

FOLIO_TYPE_OPS(slab, slab)

/**
 * PageSlab - Determine if the page belongs to the slab allocator
 * @page: The page to test.
 *
 * Context: Any context.
 * Return: True for slab pages, false for any other kind of page.
 */
static inline bool PageSlab(const struct page *page)
{
        /* The slab type is tested on page_folio(page), not on @page itself. */
        const struct folio *folio = page_folio(page);

        return folio_test_slab(folio);
}

#ifdef CONFIG_HUGETLB_PAGE
FOLIO_TYPE_OPS(hugetlb, hugetlb)
#else
FOLIO_TEST_FLAG_FALSE(hugetlb)
#endif

PAGE_TYPE_OPS(Zsmalloc, zsmalloc, zsmalloc)

/*
 * Mark pages that have to be accepted before being touched for the first time.
 *
 * Serialized with zone lock.
 */
PAGE_TYPE_OPS(Unaccepted, unaccepted, unaccepted)
FOLIO_TYPE_OPS(large_kmalloc, large_kmalloc)

/**
 * PageHuge - Determine if the page belongs to hugetlbfs
 * @page: The page to test.
 *
 * Context: Any context.
 * Return: True for hugetlbfs pages, false for anon pages or pages
 * belonging to other filesystems.
 */
static inline bool PageHuge(const struct page *page)
{
        /* The hugetlb type is tested on page_folio(page), not on @page itself. */
        const struct folio *folio = page_folio(page);

        return folio_test_hugetlb(folio);
}

/*
 * Check if a page is currently marked HWPoisoned. Note that this check is
 * best effort only and inherently racy: there is no way to synchronize with
 * failing hardware.
 */
static inline bool is_page_hwpoison(const struct page *page)
{
        const struct folio *folio;

        if (PageHWPoison(page))
                return true;
        folio = page_folio(page);
        return folio_test_hugetlb(folio) && PageHWPoison(&folio->page);
}

static inline bool folio_contain_hwpoisoned_page(struct folio *folio)
{
        return folio_test_hwpoison(folio) ||
            (folio_test_large(folio) && folio_test_has_hwpoisoned(folio));
}

bool is_free_buddy_page(const struct page *page);

PAGEFLAG(Isolated, isolated, PF_ANY);

/*
 * Test PG_anon_exclusive for @page.  The caller must pass an anonymous
 * page (checked with VM_BUG_ON_PGFLAGS()).
 */
static __always_inline int PageAnonExclusive(const struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        /*
         * HugeTLB stores this information on the head page; THP keeps it per
         * page
         */
        if (PageHuge(page))
                page = compound_head(page);
        return test_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Set PG_anon_exclusive on @page with set_bit().
 *
 * Debug checks: the page must satisfy PageAnonNotKsm(), and a hugetlb page
 * must be passed as its head page.
 */
static __always_inline void SetPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Clear PG_anon_exclusive on @page with clear_bit().
 *
 * Debug checks: the page must satisfy PageAnonNotKsm(), and a hugetlb page
 * must be passed as its head page.
 */
static __always_inline void ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

/*
 * Clear PG_anon_exclusive on @page with __clear_bit().
 *
 * Debug checks: the page must satisfy PageAnon() (note: not
 * PageAnonNotKsm() as in ClearPageAnonExclusive()), and a hugetlb page
 * must be passed as its head page.
 */
static __always_inline void __ClearPageAnonExclusive(struct page *page)
{
        VM_BUG_ON_PGFLAGS(!PageAnon(page), page);
        VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page);
        __clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags);
}

#ifdef CONFIG_MMU
#define __PG_MLOCKED                (1UL << PG_mlocked)
#else
#define __PG_MLOCKED                0
#endif

/*
 * Flags checked when a page is freed.  Pages being freed should not have
 * these flags set.  If they are, there is a problem.
 */
#define PAGE_FLAGS_CHECK_AT_FREE                                \
        (1UL << PG_lru                | 1UL << PG_locked        |        \
         1UL << PG_private        | 1UL << PG_private_2        |        \
         1UL << PG_writeback        | 1UL << PG_reserved        |        \
         1UL << PG_active         |                                \
         1UL << PG_unevictable        | __PG_MLOCKED | LRU_GEN_MASK)

/*
 * Flags checked when a page is prepped for return by the page allocator.
 * Pages being prepped should not have these flags set.  If they are set,
 * there has been a kernel bug or struct page corruption.
 *
 * __PG_HWPOISON is exceptional because it needs to be kept beyond page's
 * alloc-free cycle to prevent from reusing the page.
 */
#define PAGE_FLAGS_CHECK_AT_PREP        \
        ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)

/*
 * Flags stored in the second page of a compound page.  They may overlap
 * the CHECK_AT_FREE flags above, so need to be cleared.
 */
#define PAGE_FLAGS_SECOND                                                \
        (0xffUL /* order */                | 1UL << PG_has_hwpoisoned |        \
         1UL << PG_large_rmappable        | 1UL << PG_partially_mapped)

#define PAGE_FLAGS_PRIVATE                                \
        (1UL << PG_private | 1UL << PG_private_2)
/**
 * folio_has_private - Determine if folio has private stuff
 * @folio: The folio to be checked
 *
 * Determine if a folio has private stuff, indicating that release routines
 * should be invoked upon it.
 */
static inline int folio_has_private(const struct folio *folio)
{
        /* Either of the PAGE_FLAGS_PRIVATE bits being set counts. */
        unsigned long private_bits = folio->flags & PAGE_FLAGS_PRIVATE;

        return private_bits != 0;
}

/*
 * Test bit FOLIO_MM_IDS_SHARED_BITNUM in folio->_mm_ids.
 * NOTE(review): the name suggests this is only meaningful for large
 * folios; nothing here checks that -- confirm against callers.
 */
static inline bool folio_test_large_maybe_mapped_shared(const struct folio *folio)
{
        return test_bit(FOLIO_MM_IDS_SHARED_BITNUM, &folio->_mm_ids);
}
#undef PF_ANY
#undef PF_HEAD
#undef PF_NO_TAIL
#undef PF_NO_COMPOUND
#undef PF_SECOND
#endif /* !__GENERATING_BOUNDS_H */

#endif        /* PAGE_FLAGS_H */









































































































































































































































































































































































  323 





















  224 











































  223 

  323 
  224 







































































































































































































































   65 







  140 
































































































































































  224 



















































































  325 

  325 




  324 





  325 
  325 

  325 
  325 































  470 

  469 



















  469 
  470 

  470 
  469 




  470 
  471 











































































































































































































































































































































































































































































































































   57 













  224 





   57 
   57 
   21 












  256 
  224 



























































































































































































































































































  156 





  155 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* memcontrol.h - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 */

#ifndef _LINUX_MEMCONTROL_H
#define _LINUX_MEMCONTROL_H
#include <linux/cgroup.h>
#include <linux/vm_event_item.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/page_counter.h>
#include <linux/vmpressure.h>
#include <linux/eventfd.h>
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/writeback.h>
#include <linux/page-flags.h>
#include <linux/shrinker.h>

struct mem_cgroup;
struct obj_cgroup;
struct page;
struct mm_struct;
struct kmem_cache;

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
        /* Numbering starts at NR_VM_NODE_STAT_ITEMS, after the node stat items. */
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
        MEMCG_SOCK,
        MEMCG_PERCPU_B,
        MEMCG_VMALLOC,
        MEMCG_KMEM,
        MEMCG_ZSWAP_B,
        MEMCG_ZSWAPPED,
        /* Last enumerator; presumably used as the item count -- confirm against users. */
        MEMCG_NR_STAT,
};

/* Memory event identifiers for a memory cgroup, numbered from 0. */
enum memcg_memory_event {
        MEMCG_LOW,
        MEMCG_HIGH,
        MEMCG_MAX,
        MEMCG_OOM,
        MEMCG_OOM_KILL,
        MEMCG_OOM_GROUP_KILL,
        MEMCG_SWAP_HIGH,
        MEMCG_SWAP_MAX,
        MEMCG_SWAP_FAIL,
        /* Last enumerator; equals the number of events listed above. */
        MEMCG_NR_MEMORY_EVENTS,
};

/*
 * Cookie type taken as the third argument of mem_cgroup_iter().
 * NOTE(review): field semantics are defined by the iterator implementation,
 * which is not part of this header.
 */
struct mem_cgroup_reclaim_cookie {
        pg_data_t *pgdat;
        int generation;
};

#ifdef CONFIG_MEMCG

#define MEM_CGROUP_ID_SHIFT        16

/* Private memcg identifier; embedded in struct mem_cgroup as ->id. */
struct mem_cgroup_id {
        int id;                 /* value returned by mem_cgroup_id() */
        refcount_t ref;         /* NOTE(review): presumably pins the ID - confirm */
};

struct memcg_vmstats_percpu;
struct memcg1_events_percpu;
struct memcg_vmstats;
struct lruvec_stats_percpu;
struct lruvec_stats;

/* Iteration state embedded in struct mem_cgroup_per_node as ->iter. */
struct mem_cgroup_reclaim_iter {
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        atomic_t generation;
};

/*
 * per-node information in memory controller.
 */
struct mem_cgroup_per_node {
        /* Keep the read-only fields at the start */
        struct mem_cgroup        *memcg;                /* Back pointer, we cannot */
                                                /* use container_of           */

        struct lruvec_stats_percpu __percpu        *lruvec_stats_percpu;
        struct lruvec_stats                        *lruvec_stats;
        struct shrinker_info __rcu        *shrinker_info;

#ifdef CONFIG_MEMCG_V1
        /*
         * Memcg-v1 only stuff in middle as buffer between read mostly fields
         * and update often fields to avoid false sharing. If v1 stuff is
         * not present, an explicit padding is needed.
         */

        struct rb_node                tree_node;        /* RB tree node */
        unsigned long                usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        bool                        on_tree;
#else
        CACHELINE_PADDING(_pad1_);
#endif

        /* Fields which get updated often at the end. */
        /*
         * mem_cgroup_lruvec() hands out a pointer to this member and
         * lruvec_memcg() maps it back to the owning node info via
         * container_of().
         */
        struct lruvec                lruvec;
        CACHELINE_PADDING(_pad2_);
        /* Indexed [zone][lru]; read by mem_cgroup_get_zone_lru_size() */
        unsigned long                lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
        struct mem_cgroup_reclaim_iter        iter;
};

/* One entry of a threshold array: an eventfd paired with a threshold value. */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
        /* An array index points to threshold just below or equal to usage. */
        int current_threshold;
        /* Size of entries[] */
        unsigned int size;
        /* Array of thresholds; flexible array whose bound is ->size */
        struct mem_cgroup_threshold entries[] __counted_by(size);
};

/*
 * A pair of threshold arrays. struct mem_cgroup (CONFIG_MEMCG_V1 only)
 * holds one pair for memory usage and one for mem+swap usage.
 */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};

/*
 * Remember four most recent foreign writebacks with dirty pages in this
 * cgroup.  Inode sharing is expected to be uncommon and, even if we miss
 * one in a given round, we're likely to catch it later if it keeps
 * foreign-dirtying, so a fairly low count should be enough.
 *
 * See mem_cgroup_track_foreign_dirty_slowpath() for details.
 */
#define MEMCG_CGWB_FRN_CNT        4

/*
 * One remembered foreign writeback. struct mem_cgroup keeps
 * MEMCG_CGWB_FRN_CNT of these when CONFIG_CGROUP_WRITEBACK is set.
 */
struct memcg_cgwb_frn {
        u64 bdi_id;                        /* bdi->id of the foreign inode */
        int memcg_id;                        /* memcg->css.id of foreign inode */
        u64 at;                                /* jiffies_64 at the time of dirtying */
        struct wb_completion done;        /* tracks in-flight foreign writebacks */
};

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
 * is destroyed, without having to round up the individual references
 * of all live memory objects in the wild.
 */
struct obj_cgroup {
        /* Taken/dropped through the obj_cgroup_get*()/obj_cgroup_put() helpers */
        struct percpu_ref refcnt;
        /* Read locklessly with READ_ONCE() by obj_cgroup_memcg() */
        struct mem_cgroup *memcg;
        atomic_t nr_charged_bytes;
        /* The two members share storage; only one is meaningful at a time */
        union {
                struct list_head list; /* protected by objcg_lock */
                struct rcu_head rcu;
        };
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 */
struct mem_cgroup {
        /* mem_cgroup_from_css() recovers the memcg from this via container_of() */
        struct cgroup_subsys_state css;

        /* Private memcg ID. Used to ID objects that outlive the cgroup */
        struct mem_cgroup_id id;

        /* Accounted resources */
        struct page_counter memory;                /* Both v1 & v2 */

        union {
                struct page_counter swap;        /* v2 only */
                struct page_counter memsw;        /* v1 only */
        };

        /* registered local peak watchers */
        struct list_head memory_peaks;
        struct list_head swap_peaks;
        spinlock_t         peaks_lock;

        /* Range enforcement for interrupt charges */
        struct work_struct high_work;

#ifdef CONFIG_ZSWAP
        unsigned long zswap_max;

        /*
         * Prevent pages from this memcg from being written back from zswap to
         * swap, and from being swapped out on zswap store failures.
         */
        bool zswap_writeback;
#endif

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /*
         * Should the OOM killer kill all belonging tasks, had it kill one?
         */
        bool oom_group;

        int swappiness;

        /* memory.events and memory.events.local */
        struct cgroup_file events_file;
        struct cgroup_file events_local_file;

        /* handle for "memory.swap.events" */
        struct cgroup_file swap_events_file;

        /* memory.stat */
        struct memcg_vmstats        *vmstats;

        /* memory.events */
        atomic_long_t                memory_events[MEMCG_NR_MEMORY_EVENTS];
        atomic_long_t                memory_events_local[MEMCG_NR_MEMORY_EVENTS];

        /*
         * Hint of reclaim pressure for socket memory management. Note
         * that this indicator should NOT be used in legacy cgroup mode
         * where socket memory is accounted/charged separately.
         */
        unsigned long                socket_pressure;

        int kmemcg_id;
        /*
         * memcg->objcg is wiped out as a part of the objcg reparenting
         * process. memcg->orig_objcg preserves a pointer (and a reference)
         * to the original objcg until the end of life of memcg.
         */
        struct obj_cgroup __rcu        *objcg;
        struct obj_cgroup        *orig_objcg;
        /* list of inherited objcgs, protected by objcg_lock */
        struct list_head objcg_list;

        struct memcg_vmstats_percpu __percpu *vmstats_percpu;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head cgwb_list;
        struct wb_domain cgwb_domain;
        struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct deferred_split deferred_split_queue;
#endif

#ifdef CONFIG_LRU_GEN_WALKS_MMU
        /* per-memcg mm_struct list */
        struct lru_gen_mm_list mm_list;
#endif

#ifdef CONFIG_MEMCG_V1
        /* Legacy consumer-oriented counters */
        struct page_counter kmem;                /* v1 only */
        struct page_counter tcpmem;                /* v1 only */

        struct memcg1_events_percpu __percpu *events_percpu;

        unsigned long soft_limit;

        /* protected by memcg_oom_lock */
        bool oom_lock;
        int under_oom;

        /* OOM-Killer disable */
        int oom_kill_disable;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /* Legacy tcp memory accounting */
        bool tcpmem_active;
        int tcpmem_pressure;

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
#endif /* CONFIG_MEMCG_V1 */

        /* Per-node info, indexed by node id (see mem_cgroup_lruvec()) */
        struct mem_cgroup_per_node *nodeinfo[];
};

/*
 * size of first charge trial.
 * TODO: maybe necessary to use big numbers in big irons or dynamic based of the
 * workload.
 */
#define MEMCG_CHARGE_BATCH 64U

extern struct mem_cgroup *root_mem_cgroup;

/*
 * Flag bits kept in the low bits of the memcg_data word. The accessors in
 * this header test these bits and mask them off to recover the pointer.
 */
enum page_memcg_data_flags {
        /* page->memcg_data is a pointer to an slabobj_ext vector */
        MEMCG_DATA_OBJEXTS = (1UL << 0),
        /* page has been accounted as a non-slab kernel page */
        MEMCG_DATA_KMEM = (1UL << 1),
        /* the next bit after the last actual flag */
        __NR_MEMCG_DATA_FLAGS  = (1UL << 2),
};

#define __FIRST_OBJEXT_FLAG        __NR_MEMCG_DATA_FLAGS

#else /* CONFIG_MEMCG */

#define __FIRST_OBJEXT_FLAG        (1UL << 0)

#endif /* CONFIG_MEMCG */

/*
 * Flags that start at __FIRST_OBJEXT_FLAG, i.e. right after the memcg data
 * flags when CONFIG_MEMCG is set and at bit 0 otherwise.
 */
enum objext_flags {
        /* slabobj_ext vector failed to allocate */
        OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG,
        /* the next bit after the last actual flag */
        __NR_OBJEXTS_FLAGS  = (__FIRST_OBJEXT_FLAG << 1),
};

#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1)

#ifdef CONFIG_MEMCG

static inline bool folio_memcg_kmem(struct folio *folio);

/*
 * After the initialization objcg->memcg is always pointing at
 * a valid memcg, but can be atomically swapped to the parent memcg.
 *
 * The caller must ensure that the returned memcg won't be released.
 */
static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *owner;

        /* Callers must be in an RCU read section or hold cgroup_mutex. */
        lockdep_assert_once(rcu_read_lock_held() || lockdep_is_held(&cgroup_mutex));

        /* Single lockless load: the pointer may be swapped concurrently. */
        owner = READ_ONCE(objcg->memcg);
        return owner;
}

/*
 * __folio_memcg - Get the memory cgroup associated with a non-kmem folio
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios or
 * kmem folios.
 */
static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
{
        unsigned long memcg_data = folio->memcg_data;
        struct mem_cgroup *memcg;

        /* Slab, objexts-vector and kmem folios must never get here. */
        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);

        /* Strip the low flag bits; what remains is the memcg pointer. */
        memcg = (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
        return memcg;
}

/*
 * __folio_objcg - get the object cgroup associated with a kmem folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the object cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper object cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios or
 * LRU folios.
 */
static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
{
        unsigned long memcg_data = folio->memcg_data;
        struct obj_cgroup *objcg;

        /* Only kmem folios (non-slab, no objexts vector) are valid here. */
        VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
        VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
        VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);

        /* Strip the low flag bits; what remains is the objcg pointer. */
        objcg = (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK);
        return objcg;
}

/*
 * folio_memcg - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function assumes that the folio is known to have a
 * proper memory cgroup pointer. It's not safe to call this function
 * against some type of folios, e.g. slab folios or ex-slab folios.
 *
 * For a non-kmem folio any of the following ensures folio and memcg binding
 * stability:
 *
 * - the folio lock
 * - LRU isolation
 * - exclusive reference
 *
 * For a kmem folio a caller should hold an rcu read lock to protect memcg
 * associated with a kmem folio from being released.
 */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        /* Non-kmem folios carry the memcg pointer directly. */
        if (!folio_memcg_kmem(folio))
                return __folio_memcg(folio);

        /* kmem folios carry an objcg; resolve the memcg through it. */
        return obj_cgroup_memcg(__folio_objcg(folio));
}

/*
 * folio_memcg_charged - If a folio is charged to a memory cgroup.
 * @folio: Pointer to the folio.
 *
 * Returns true if folio is charged to a memory cgroup, otherwise returns false.
 */
static inline bool folio_memcg_charged(struct folio *folio)
{
        return folio->memcg_data != 0;
}

/*
 * folio_memcg_check - Get the memory cgroup associated with a folio.
 * @folio: Pointer to the folio.
 *
 * Returns a pointer to the memory cgroup associated with the folio,
 * or NULL. This function unlike folio_memcg() can take any folio
 * as an argument. It has to be used in cases when it's not known if a folio
 * has an associated memory cgroup pointer or an object cgroups vector or
 * an object cgroup.
 *
 * For a non-kmem folio any of the following ensures folio and memcg binding
 * stability:
 *
 * - the folio lock
 * - LRU isolation
 * - exclusive reference
 *
 * For a kmem folio a caller should hold an rcu read lock to protect memcg
 * associated with a kmem folio from being released.
 */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        /*
         * folio->memcg_data can change asynchronously for slabs, so take
         * one READ_ONCE() snapshot and decode only that value.
         */
        unsigned long memcg_data = READ_ONCE(folio->memcg_data);
        unsigned long ptr_bits;

        /* An objexts vector is not a memcg association. */
        if (memcg_data & MEMCG_DATA_OBJEXTS)
                return NULL;

        ptr_bits = memcg_data & ~OBJEXTS_FLAGS_MASK;

        /* Plain folio: the remaining bits are the memcg pointer itself. */
        if (!(memcg_data & MEMCG_DATA_KMEM))
                return (struct mem_cgroup *)ptr_bits;

        /* kmem folio: the remaining bits are an objcg pointer. */
        return obj_cgroup_memcg((struct obj_cgroup *)ptr_bits);
}

/*
 * Page flavour of folio_memcg_check(): tail pages yield NULL, any other
 * page is reinterpreted as a folio and handed to folio_memcg_check().
 */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return PageTail(page) ? NULL : folio_memcg_check((struct folio *)page);
}

/*
 * Return the memcg currently behind @objcg with a css reference taken via
 * css_tryget(). The lookup runs inside an RCU read section and is repeated
 * until the tryget succeeds.
 */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();
        do {
                /* Re-read on every attempt: objcg->memcg may have changed. */
                memcg = obj_cgroup_memcg(objcg);
        } while (unlikely(!css_tryget(&memcg->css)));
        rcu_read_unlock();

        return memcg;
}

/*
 * folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
 * @folio: Pointer to the folio.
 *
 * Checks if the folio has MemcgKmem flag set. The caller must ensure
 * that the folio has an associated memory cgroup. It's not safe to call
 * this function against some types of folios, e.g. slab folios.
 */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        /* Debug-only sanity checks: no tail page, no objexts vector. */
        VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
        VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);

        return (folio->memcg_data & MEMCG_DATA_KMEM) != 0;
}

/* Page flavour of folio_memcg_kmem(): checks the folio that @page maps to. */
static inline bool PageMemcgKmem(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_memcg_kmem(folio);
}

/* True iff @memcg is the global root_mem_cgroup. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return root_mem_cgroup == memcg;
}

static inline bool mem_cgroup_disabled(void)
{
        return !cgroup_subsys_enabled(memory_cgrp_subsys);
}

/*
 * Report the effective min/low protection of @memcg for a reclaim rooted
 * at @root. Both outputs are zeroed first and stay zero when the memory
 * controller is disabled or when @memcg is itself the reclaim root.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;

        /*
         * There is no reclaim protection applied to a targeted reclaim.
         * We are special casing this specific case here because
         * mem_cgroup_calculate_protection is not robust enough to keep
         * the protection invariant for calculated effective values for
         * parallel reclaimers with different reclaim target. This is
         * especially a problem for tail memcgs (as they have pages on LRU)
         * which would want to have effective values 0 for targeted reclaim
         * but a different value for external reclaim.
         *
         * Example
         * Let's have global and A's reclaim in parallel:
         *  |
         *  A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G)
         *  |\
         *  | C (low = 1G, usage = 2.5G)
         *  B (low = 1G, usage = 0.5G)
         *
         * For the global reclaim
         * A.elow = A.low
         * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow
         * C.elow = min(C.usage, C.low)
         *
         * With the effective values resetting we have A reclaim
         * A.elow = 0
         * B.elow = B.low
         * C.elow = C.low
         *
         * If the global reclaim races with A's reclaim then
         * B.elow = C.elow = 0 (because children_low_usage > A.elow)
         * is possible and reclaiming B would be violating the protection.
         */
        if (mem_cgroup_disabled() || root == memcg)
                return;

        *min = READ_ONCE(memcg->memory.emin);
        *low = READ_ONCE(memcg->memory.elow);
}

void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                     struct mem_cgroup *memcg);

static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        /*
         * The root memcg doesn't account charges, and doesn't support
         * protection. The target memcg's protection is ignored, see
         * mem_cgroup_calculate_protection() and mem_cgroup_protection()
         */
        return mem_cgroup_disabled() || mem_cgroup_is_root(memcg) ||
                memcg == target;
}

/*
 * True when @memcg is protected for this reclaim and its memory usage does
 * not exceed its effective low protection (memory.elow).
 */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        unsigned long protection, usage;

        if (mem_cgroup_unprotected(target, memcg))
                return false;

        protection = READ_ONCE(memcg->memory.elow);
        usage = page_counter_read(&memcg->memory);

        return protection >= usage;
}

/*
 * True when @memcg is protected for this reclaim and its memory usage does
 * not exceed its effective min protection (memory.emin).
 */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        unsigned long protection, usage;

        if (mem_cgroup_unprotected(target, memcg))
                return false;

        protection = READ_ONCE(memcg->memory.emin);
        usage = page_counter_read(&memcg->memory);

        return protection >= usage;
}

/*
 * Out-of-line worker behind mem_cgroup_charge(), which calls it only when
 * the memory controller is enabled and returns its result unchanged.
 */
int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp);

/**
 * mem_cgroup_charge - Charge a newly allocated folio to a cgroup.
 * @folio: Folio to charge.
 * @mm: mm context of the allocating task.
 * @gfp: Reclaim mode.
 *
 * Try to charge @folio to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp if necessary.  If @mm is NULL, try to
 * charge to the active memcg.
 *
 * Do not use this for folios allocated for swapin.
 *
 * Return: 0 on success. Otherwise, an error code is returned.
 */
static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm,
                                    gfp_t gfp)
{
        /* With the controller disabled, charging trivially succeeds. */
        return mem_cgroup_disabled() ? 0 : __mem_cgroup_charge(folio, mm, gfp);
}

/*
 * Declaration only; the implementation is not part of this header.
 * NOTE(review): presumably the hugetlb counterpart of mem_cgroup_charge() -
 * confirm against the definition.
 */
int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp);

int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
                                  gfp_t gfp, swp_entry_t entry);

/*
 * Out-of-line worker behind mem_cgroup_uncharge(), which calls it only
 * when the memory controller is enabled.
 */
void __mem_cgroup_uncharge(struct folio *folio);

/**
 * mem_cgroup_uncharge - Uncharge a folio.
 * @folio: Folio to uncharge.
 *
 * Uncharge a folio previously charged with mem_cgroup_charge().
 */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
        /* Nothing to undo when the controller is disabled. */
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge(folio);
}

/*
 * Out-of-line worker behind mem_cgroup_uncharge_folios(), which calls it
 * only when the memory controller is enabled.
 */
void __mem_cgroup_uncharge_folios(struct folio_batch *folios);
/*
 * Batch wrapper: forwards @folios to __mem_cgroup_uncharge_folios() unless
 * the memory controller is disabled, in which case it does nothing.
 */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_folios(folios);
}

void mem_cgroup_replace_folio(struct folio *old, struct folio *new);
void mem_cgroup_migrate(struct folio *old, struct folio *new);

/**
 * mem_cgroup_lruvec - get the lru list vector for a memcg & node
 * @memcg: memcg of the wanted lruvec
 * @pgdat: pglist_data
 *
 * Returns the lru list vector holding pages for a given @memcg &
 * @pgdat combination. This can be the node lruvec, if the memory
 * controller is disabled.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        struct lruvec *lruvec;

        if (mem_cgroup_disabled()) {
                /* No per-memcg lruvecs: use the node's own. */
                lruvec = &pgdat->__lruvec;
        } else {
                /* A NULL memcg stands for the root memcg. */
                struct mem_cgroup *owner = memcg ? memcg : root_mem_cgroup;
                struct mem_cgroup_per_node *pn = owner->nodeinfo[pgdat->node_id];

                lruvec = &pn->lruvec;
        }

        /*
         * Since a node can be onlined after the mem_cgroup was created,
         * we have to be prepared to initialize lruvec->pgdat here;
         * and if offlined then reonlined, we need to reinitialize it.
         */
        if (unlikely(lruvec->pgdat != pgdat))
                lruvec->pgdat = pgdat;

        return lruvec;
}

/**
 * folio_lruvec - return lruvec for isolating/putting an LRU folio
 * @folio: Pointer to the folio.
 *
 * This function relies on folio->mem_cgroup being stable.
 */
static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat;

        /* With the controller enabled, an LRU folio is expected to have a memcg. */
        VM_WARN_ON_ONCE_FOLIO(!memcg && !mem_cgroup_disabled(), folio);

        pgdat = folio_pgdat(folio);
        return mem_cgroup_lruvec(memcg, pgdat);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);

struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);

struct mem_cgroup *get_mem_cgroup_from_current(void);

struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);

struct lruvec *folio_lruvec_lock(struct folio *folio);
struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                                                unsigned long *flags);

#ifdef CONFIG_DEBUG_VM
/* Real check, implemented out of line; only built with CONFIG_DEBUG_VM. */
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio);
#else
/* Without CONFIG_DEBUG_VM the check is an empty inline and costs nothing. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}
#endif

/*
 * Map a css back to the mem_cgroup that embeds it. A NULL @css maps to a
 * NULL memcg.
 */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct mem_cgroup, css);
}

/* Try to take one reference on @objcg; returns percpu_ref_tryget()'s result. */
static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg)
{
        struct percpu_ref *ref = &objcg->refcnt;

        return percpu_ref_tryget(ref);
}

/* Take one reference on @objcg, which must not be NULL (it is dereferenced). */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
        struct percpu_ref *ref = &objcg->refcnt;

        percpu_ref_get(ref);
}

/* Take @nr references on @objcg, which must not be NULL (it is dereferenced). */
static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
                                       unsigned long nr)
{
        struct percpu_ref *ref = &objcg->refcnt;

        percpu_ref_get_many(ref, nr);
}

/* Drop one reference on @objcg; a NULL @objcg is silently ignored. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
        if (!objcg)
                return;

        percpu_ref_put(&objcg->refcnt);
}

static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return !memcg || css_tryget(&memcg->css);
}

static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return !memcg || css_tryget_online(&memcg->css);
}

/* Drop a css reference on @memcg; a NULL @memcg is silently ignored. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
        if (!memcg)
                return;

        css_put(&memcg->css);
}

/*
 * Given a pointer to the page_counter embedded in a mem_cgroup as @member
 * (e.g. memory, swap), return the enclosing struct mem_cgroup.
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)

struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
                                   struct mem_cgroup_reclaim_cookie *);
void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                           int (*)(struct task_struct *, void *), void *arg);

/*
 * Return the private ID of @memcg, or 0 when the memory controller is
 * disabled (in which case @memcg is not dereferenced).
 */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() ? 0 : memcg->id.id;
}
struct mem_cgroup *mem_cgroup_from_id(unsigned short id);

#ifdef CONFIG_SHRINKER_DEBUG
/* Inode number of the cgroup behind @memcg, or 0 for a NULL @memcg. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        if (!memcg)
                return 0;

        return cgroup_ino(memcg->css.cgroup);
}

struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino);
#endif

/* Resolve the memcg behind seq_file @m through its css (seq_css()). */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        struct cgroup_subsys_state *css = seq_css(m);

        return mem_cgroup_from_css(css);
}

/*
 * Return the memcg owning @lruvec, or NULL when the memory controller is
 * disabled. Otherwise @lruvec is assumed to be the one embedded in a
 * struct mem_cgroup_per_node.
 */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        if (mem_cgroup_disabled())
                return NULL;

        /* Step back to the enclosing per-node info and follow its back pointer. */
        return container_of(lruvec, struct mem_cgroup_per_node, lruvec)->memcg;
}

/**
 * parent_mem_cgroup - find the accounting parent of a memcg
 * @memcg: memcg whose parent to find
 *
 * Returns the parent memcg, or NULL if this is the root.
 */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        struct cgroup_subsys_state *parent_css = memcg->css.parent;

        /* mem_cgroup_from_css() turns a NULL parent css into a NULL memcg. */
        return mem_cgroup_from_css(parent_css);
}

/*
 * True when @memcg is @root itself or its cgroup is a descendant of
 * @root's cgroup.
 */
static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg,
                              struct mem_cgroup *root)
{
        return root == memcg ||
               cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
}

/*
 * True when the memcg of @mm's owner task is @memcg or one of its
 * descendants. False when no memcg is found for the owner.
 */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                                   struct mem_cgroup *memcg)
{
        struct mem_cgroup *owner_memcg;
        bool match;

        /* The owner lookup and the hierarchy check both run under RCU. */
        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        match = owner_memcg && mem_cgroup_is_descendant(owner_memcg, memcg);
        rcu_read_unlock();

        return match;
}

struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio);
ino_t page_cgroup_ino(struct page *page);

/*
 * True when the memory controller is disabled (in which case @memcg is not
 * dereferenced) or when @memcg's css has CSS_ONLINE set.
 */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return mem_cgroup_disabled() || (memcg->css.flags & CSS_ONLINE);
}

void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
                int zid, int nr_pages);

/*
 * Read the lru_zone_size[@zone_idx][@lru] counter of the per-node info
 * that embeds @lruvec. The value is loaded once with READ_ONCE().
 */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        struct mem_cgroup_per_node *pn =
                container_of(lruvec, struct mem_cgroup_per_node, lruvec);

        return READ_ONCE(pn->lru_zone_size[zone_idx][lru]);
}

void mem_cgroup_handle_over_high(gfp_t gfp_mask);

unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);

unsigned long mem_cgroup_size(struct mem_cgroup *memcg);

void mem_cgroup_print_oom_context(struct mem_cgroup *memcg,
                                struct task_struct *p);

void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg);

struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
                                            struct mem_cgroup *oom_domain);
void mem_cgroup_print_oom_group(struct mem_cgroup *memcg);

/*
 * Out-of-line worker behind mod_memcg_state(), which calls it with local
 * interrupts disabled.
 */
void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
                       int val);

/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx, int val)
{
        unsigned long irq_flags;

        /* Run the update with local interrupts off, then restore them. */
        local_irq_save(irq_flags);
        __mod_memcg_state(memcg, idx, val);
        local_irq_restore(irq_flags);
}

/*
 * Apply @val to stat item @idx of the memcg that @page's folio belongs to.
 * Does nothing when the controller is disabled or the folio has no memcg.
 */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
        struct mem_cgroup *owner;

        if (mem_cgroup_disabled())
                return;

        /* The lookup and the update both happen under the RCU read lock. */
        rcu_read_lock();
        owner = folio_memcg(page_folio(page));
        if (owner)
                mod_memcg_state(owner, idx, val);
        rcu_read_unlock();
}

unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx);
unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx);
unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                      enum node_stat_item idx);

void mem_cgroup_flush_stats(struct mem_cgroup *memcg);
void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg);

/*
 * Out-of-line worker behind mod_lruvec_kmem_state(), which calls it with
 * local interrupts disabled.
 */
void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val);

/* IRQ-safe wrapper: runs __mod_lruvec_kmem_state() with local IRQs disabled. */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __mod_lruvec_kmem_state(p, idx, val);
        local_irq_restore(irq_flags);
}

/*
 * Out-of-line worker behind count_memcg_events(), which calls it with
 * local interrupts disabled.
 */
void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
                          unsigned long count);

/* IRQ-safe wrapper: runs __count_memcg_events() with local IRQs disabled. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __count_memcg_events(memcg, idx, count);
        local_irq_restore(irq_flags);
}

/*
 * Count @nr occurrences of event @idx against @folio's memcg. Does nothing
 * when folio_memcg() returns NULL.
 */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
        struct mem_cgroup *owner = folio_memcg(folio);

        if (!owner)
                return;

        count_memcg_events(owner, idx, nr);
}

/*
 * Count @count occurrences of event @idx against the memcg of @mm's owner
 * task. Does nothing when the controller is disabled or no memcg is found.
 */
static inline void count_memcg_events_mm(struct mm_struct *mm,
                                        enum vm_event_item idx, unsigned long count)
{
        struct mem_cgroup *owner_memcg;

        if (mem_cgroup_disabled())
                return;

        /* Owner lookup and counting both happen under the RCU read lock. */
        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(owner_memcg))
                count_memcg_events(owner_memcg, idx, count);
        rcu_read_unlock();
}

/* Single-event shorthand for count_memcg_events_mm() with a count of 1. */
static inline void count_memcg_event_mm(struct mm_struct *mm,
                                        enum vm_event_item idx)
{
        count_memcg_events_mm(mm, idx, 1);
}

/*
 * Record memory event @event for @memcg and notify the matching cgroup
 * files.
 *
 * The local counter is bumped on @memcg only. The hierarchical counter is
 * bumped on @memcg and then on each ancestor, stopping before the root
 * memcg. The walk does not go past @memcg itself when the memory
 * controller is not on the default hierarchy or when the default root has
 * CGRP_ROOT_MEMORY_LOCAL_EVENTS set.
 */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
        /* Swap events are reported through swap_events_file instead. */
        bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX ||
                          event == MEMCG_SWAP_FAIL;

        /* Local accounting; only non-swap events notify events_local_file. */
        atomic_long_inc(&memcg->memory_events_local[event]);
        if (!swap_event)
                cgroup_file_notify(&memcg->events_local_file);

        do {
                atomic_long_inc(&memcg->memory_events[event]);
                if (swap_event)
                        cgroup_file_notify(&memcg->swap_events_file);
                else
                        cgroup_file_notify(&memcg->events_file);

                /* No upward propagation outside the default hierarchy... */
                if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                        break;
                /* ...nor when local-only events were requested on the root. */
                if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
                        break;
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
}

/*
 * Record memory event @event against the memcg of @mm's owner task. Does
 * nothing when the controller is disabled or no memcg is found.
 */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
        struct mem_cgroup *owner_memcg;

        if (mem_cgroup_disabled())
                return;

        /* Owner lookup and event delivery both happen under the RCU read lock. */
        rcu_read_lock();
        owner_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(owner_memcg))
                memcg_memory_event(owner_memcg, event);
        rcu_read_unlock();
}

void split_page_memcg(struct page *first, unsigned order);
void folio_split_memcg_refs(struct folio *folio, unsigned old_order,
                unsigned new_order);

/*
 * Return the cgroup ID of the memcg that @mm's owner task belongs to,
 * falling back to the root memcg when none is found. Returns 0 when the
 * memory controller is disabled.
 */
static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
{
        struct mem_cgroup *memcg;
        u64 id = 0;

        if (!mem_cgroup_disabled()) {
                /* The memcg is only dereferenced inside the RCU read section. */
                rcu_read_lock();
                memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
                id = cgroup_id(memcg ? memcg->css.cgroup
                                     : root_mem_cgroup->css.cgroup);
                rcu_read_unlock();
        }

        return id;
}

#else /* CONFIG_MEMCG */

#define MEM_CGROUP_ID_SHIFT        0

/* !CONFIG_MEMCG stub: no folio has a memcg, always returns NULL. */
static inline struct mem_cgroup *folio_memcg(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool folio_memcg_charged(struct folio *folio)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *folio_memcg_check(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *page_memcg_check(struct page *page)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns NULL; @objcg is ignored. */
static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool folio_memcg_kmem(struct folio *folio)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool PageMemcgKmem(struct page *page)
{
        return false;
}

/* !CONFIG_MEMCG stub: every memcg is reported as the root, always true. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: memcg is compiled out, so this is always true. */
static inline bool mem_cgroup_disabled(void)
{
        return true;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void memcg_memory_event_mm(struct mm_struct *mm,
                                         enum memcg_memory_event event)
{
}

/*
 * !CONFIG_MEMCG stub: reports zero protection.
 *
 * @root and @memcg are ignored; both *@min and *@low are set to 0.
 */
static inline void mem_cgroup_protection(struct mem_cgroup *root,
                                         struct mem_cgroup *memcg,
                                         unsigned long *min,
                                         unsigned long *low)
{
        *low = 0;
        *min = 0;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                                                   struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: always returns true. */
static inline bool mem_cgroup_unprotected(struct mem_cgroup *target,
                                          struct mem_cgroup *memcg)
{
        return true;
}
/* !CONFIG_MEMCG stub: always returns false. */
static inline bool mem_cgroup_below_low(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool mem_cgroup_below_min(struct mem_cgroup *target,
                                        struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: nothing is charged, always returns 0. */
static inline int mem_cgroup_charge(struct folio *folio,
                struct mm_struct *mm, gfp_t gfp)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing is charged, always returns 0. */
static inline int mem_cgroup_charge_hugetlb(struct folio *folio, gfp_t gfp)
{
        return 0;
}

/* !CONFIG_MEMCG stub: nothing is charged, always returns 0. */
static inline int mem_cgroup_swapin_charge_folio(struct folio *folio,
                        struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_uncharge(struct folio *folio)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_uncharge_folios(struct folio_batch *folios)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_replace_folio(struct folio *old,
                struct folio *new)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
}

/*
 * !CONFIG_MEMCG stub: @memcg is ignored and the lruvec embedded in @pgdat
 * (pgdat->__lruvec) is returned.
 */
static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg,
                                               struct pglist_data *pgdat)
{
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        return &pgdat->__lruvec;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
}

/* !CONFIG_MEMCG stub: there is no hierarchy, always returns NULL. */
static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns true. */
static inline bool mm_match_cgroup(struct mm_struct *mm,
                struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns NULL; @css is ignored. */
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void obj_cgroup_get(struct obj_cgroup *objcg)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void obj_cgroup_put(struct obj_cgroup *objcg)
{
}

/* !CONFIG_MEMCG stub: always reports success (true). */
static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always reports success (true). */
static inline bool mem_cgroup_tryget_online(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}

static inline struct lruvec *folio_lruvec_lock(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irq(&pgdat->__lruvec.lru_lock);
        return &pgdat->__lruvec;
}

static inline struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
                unsigned long *flagsp)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        spin_lock_irqsave(&pgdat->__lruvec.lru_lock, *flagsp);
        return &pgdat->__lruvec;
}

/* !CONFIG_MEMCG stub: there is nothing to iterate, always returns NULL. */
static inline struct mem_cgroup *
mem_cgroup_iter(struct mem_cgroup *root,
                struct mem_cgroup *prev,
                struct mem_cgroup_reclaim_cookie *reclaim)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_iter_break(struct mem_cgroup *root,
                                         struct mem_cgroup *prev)
{
}

/* !CONFIG_MEMCG stub: no-op; @fn is never called. */
static inline void mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
                int (*fn)(struct task_struct *, void *), void *arg)
{
}

/* !CONFIG_MEMCG stub: always returns id 0. */
static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
{
        return 0;
}

/*
 * !CONFIG_MEMCG stub: warns once if @id is non-zero, then returns NULL
 * regardless of @id.
 */
static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(id);
        /* XXX: This should always return root_mem_cgroup */
        return NULL;
}

#ifdef CONFIG_SHRINKER_DEBUG
/* !CONFIG_MEMCG stub (CONFIG_SHRINKER_DEBUG only): always returns 0. */
static inline unsigned long mem_cgroup_ino(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub (CONFIG_SHRINKER_DEBUG only): always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
{
        return NULL;
}
#endif

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: an lruvec has no memcg, always returns NULL. */
static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns true. */
static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
{
        return true;
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline
unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec,
                enum lru_list lru, int zone_idx)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no-op, nothing is printed. */
static inline void
mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
}

/* !CONFIG_MEMCG stub: no-op, nothing is printed. */
static inline void
mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_handle_over_high(gfp_t gfp_mask)
{
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_get_oom_group(
        struct task_struct *victim, struct mem_cgroup *oom_domain)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no-op, nothing is printed. */
static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void __mod_memcg_state(struct mem_cgroup *memcg,
                                     enum memcg_stat_item idx,
                                     int nr)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
                                   enum memcg_stat_item idx,
                                   int nr)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mod_memcg_page_state(struct page *page,
                                        enum memcg_stat_item idx, int val)
{
}

/* !CONFIG_MEMCG stub: always returns 0. */
static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
{
        return 0;
}

/*
 * !CONFIG_MEMCG variant: forward to node_page_state() on the pglist_data
 * that lruvec_pgdat() reports for @lruvec.
 */
static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
                                              enum node_stat_item idx)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        return node_page_state(pgdat, idx);
}

/*
 * !CONFIG_MEMCG variant: same as lruvec_page_state() here, i.e. forward to
 * node_page_state() on the pglist_data that lruvec_pgdat() reports.
 */
static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
                                                    enum node_stat_item idx)
{
        struct pglist_data *pgdat = lruvec_pgdat(lruvec);

        return node_page_state(pgdat, idx);
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_flush_stats(struct mem_cgroup *memcg)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg)
{
}

/*
 * !CONFIG_MEMCG variant: resolve @p to its head page and apply @val to the
 * node counter @idx via __mod_node_page_state().
 */
static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                           int val)
{
        __mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/*
 * !CONFIG_MEMCG variant: resolve @p to its head page and apply @val to the
 * node counter @idx via mod_node_page_state().
 */
static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx,
                                         int val)
{
        mod_node_page_state(page_pgdat(virt_to_head_page(p)), idx, val);
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void count_memcg_events(struct mem_cgroup *memcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void __count_memcg_events(struct mem_cgroup *memcg,
                                        enum vm_event_item idx,
                                        unsigned long count)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void count_memcg_folio_events(struct folio *folio,
                enum vm_event_item idx, unsigned long nr)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void count_memcg_events_mm(struct mm_struct *mm,
                                        enum vm_event_item idx, unsigned long count)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline
void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void split_page_memcg(struct page *first, unsigned order)
{
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void folio_split_memcg_refs(struct folio *folio,
                unsigned old_order, unsigned new_order)
{
}

/* !CONFIG_MEMCG stub: always returns id 0. */
static inline u64 cgroup_id_from_mm(struct mm_struct *mm)
{
        return 0;
}
#endif /* CONFIG_MEMCG */

/*
 * Extended information for slab objects stored as an array in page->memcg_data
 * if MEMCG_DATA_OBJEXTS is set.
 *
 * Each member exists only when its config option is enabled, so the struct
 * may have one member, both, or none. The type is declared 8-byte aligned.
 */
struct slabobj_ext {
#ifdef CONFIG_MEMCG
        /* Present only with CONFIG_MEMCG. */
        struct obj_cgroup *objcg;
#endif
#ifdef CONFIG_MEM_ALLOC_PROFILING
        /* Present only with CONFIG_MEM_ALLOC_PROFILING. */
        union codetag_ref ref;
#endif
} __aligned(8);

/* Increment counter @idx by one for the object at @p. */
static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, 1);
}

/* Decrement counter @idx by one for the object at @p. */
static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx)
{
        __mod_lruvec_kmem_state(p, idx, -1);
}

/*
 * Return the lruvec of the parent memcg on the same node as @lruvec.
 *
 * Returns NULL when @lruvec has no memcg or that memcg has no parent.
 */
static inline struct lruvec *parent_lruvec(struct lruvec *lruvec)
{
        struct mem_cgroup *child = lruvec_memcg(lruvec);
        struct mem_cgroup *parent = child ? parent_mem_cgroup(child) : NULL;

        return parent ? mem_cgroup_lruvec(parent, lruvec_pgdat(lruvec)) : NULL;
}

/* Release @lruvec->lru_lock with spin_unlock(). */
static inline void unlock_page_lruvec(struct lruvec *lruvec)
{
        spin_unlock(&lruvec->lru_lock);
}

/* Release @lruvec->lru_lock with spin_unlock_irq(). */
static inline void unlock_page_lruvec_irq(struct lruvec *lruvec)
{
        spin_unlock_irq(&lruvec->lru_lock);
}

/* Release @lruvec->lru_lock with spin_unlock_irqrestore(), passing @flags. */
static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec,
                unsigned long flags)
{
        spin_unlock_irqrestore(&lruvec->lru_lock, flags);
}

/* Test requires a stable folio->memcg binding, see folio_memcg() */
static inline bool folio_matches_lruvec(struct folio *folio,
                struct lruvec *lruvec)
{
        return lruvec_pgdat(lruvec) == folio_pgdat(folio) &&
               lruvec_memcg(lruvec) == folio_memcg(folio);
}

/*
 * Return @folio's lruvec, locked with the _irq variant.
 *
 * If @locked_lruvec already matches @folio it is returned as is; otherwise
 * it is unlocked (when non-NULL) and @folio's lruvec is locked instead.
 */
static inline struct lruvec *folio_lruvec_relock_irq(struct folio *folio,
                struct lruvec *locked_lruvec)
{
        if (!locked_lruvec)
                return folio_lruvec_lock_irq(folio);

        if (folio_matches_lruvec(folio, locked_lruvec))
                return locked_lruvec;

        unlock_page_lruvec_irq(locked_lruvec);
        return folio_lruvec_lock_irq(folio);
}

/*
 * Make *@lruvecp point at @folio's lruvec, locked with the _irqsave variant.
 *
 * If *@lruvecp already matches @folio nothing changes; otherwise it is
 * unlocked with *@flags (when non-NULL) and @folio's lruvec is locked, with
 * the new saved state written to *@flags.
 */
static inline void folio_lruvec_relock_irqsave(struct folio *folio,
                struct lruvec **lruvecp, unsigned long *flags)
{
        struct lruvec *held = *lruvecp;

        if (held) {
                if (folio_matches_lruvec(folio, held))
                        return;

                unlock_page_lruvec_irqrestore(held, *flags);
        }

        *lruvecp = folio_lruvec_lock_irqsave(folio, flags);
}

#ifdef CONFIG_CGROUP_WRITEBACK

struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
                         unsigned long *pheadroom, unsigned long *pdirty,
                         unsigned long *pwriteback);

void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
                                             struct bdi_writeback *wb);

/*
 * Fast-path check before mem_cgroup_track_foreign_dirty_slowpath().
 *
 * The slow path is taken only when memcg is enabled, @folio has a memcg, and
 * that memcg's css differs from @wb->memcg_css.
 */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
        struct mem_cgroup *folio_cg;

        if (mem_cgroup_disabled())
                return;

        folio_cg = folio_memcg(folio);
        if (likely(!folio_cg || &folio_cg->css == wb->memcg_css))
                return;

        mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
}

void mem_cgroup_flush_foreign(struct bdi_writeback *wb);

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: always returns NULL. */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
{
        return NULL;
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op; none of the outputs are written. */
static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
                                       unsigned long *pfilepages,
                                       unsigned long *pheadroom,
                                       unsigned long *pdirty,
                                       unsigned long *pwriteback)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op. */
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
                                                  struct bdi_writeback *wb)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: no-op. */
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

struct sock;
bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
                             gfp_t gfp_mask);
void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages);
#ifdef CONFIG_MEMCG
extern struct static_key_false memcg_sockets_enabled_key;
#define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key)
void mem_cgroup_sk_alloc(struct sock *sk);
void mem_cgroup_sk_free(struct sock *sk);
/*
 * Report whether @memcg is under socket memory pressure.
 *
 * With CONFIG_MEMCG_V1, when cgroup_subsys_on_dfl() is false the answer is
 * @memcg->tcpmem_pressure. Otherwise @memcg and each of its ancestors are
 * checked in turn: true is returned as soon as one has a socket_pressure
 * timestamp that jiffies has not yet reached.
 */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
#ifdef CONFIG_MEMCG_V1
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return !!memcg->tcpmem_pressure;
#endif /* CONFIG_MEMCG_V1 */
        for (;;) {
                if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
                        return true;

                memcg = parent_mem_cgroup(memcg);
                if (!memcg)
                        return false;
        }
}

int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
#else
#define mem_cgroup_sockets_enabled 0
/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_sk_alloc(struct sock *sk) { }
/* !CONFIG_MEMCG stub: no-op. */
static inline void mem_cgroup_sk_free(struct sock *sk) { }
/* !CONFIG_MEMCG stub: always returns false. */
static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        return false;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void set_shrinker_bit(struct mem_cgroup *memcg,
                                    int nid, int shrinker_id)
{
}
#endif

#ifdef CONFIG_MEMCG
bool mem_cgroup_kmem_disabled(void);
int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order);
void __memcg_kmem_uncharge_page(struct page *page, int order);

/*
 * The returned objcg pointer is safe to use without additional
 * protection within a scope. The scope is defined either by
 * the current task (similar to the "current" global variable)
 * or by a set_active_memcg() pair.
 * Please use obj_cgroup_get() to get a reference if the pointer
 * needs to be used outside of the local scope.
 */
struct obj_cgroup *current_obj_cgroup(void);
struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio);

static inline struct obj_cgroup *get_obj_cgroup_from_current(void)
{
        struct obj_cgroup *objcg = current_obj_cgroup();

        if (objcg)
                obj_cgroup_get(objcg);

        return objcg;
}

int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size);
void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size);

extern struct static_key_false memcg_bpf_enabled_key;
/* Test the memcg_bpf_enabled_key static branch (hinted as likely). */
static inline bool memcg_bpf_enabled(void)
{
        return static_branch_likely(&memcg_bpf_enabled_key);
}

extern struct static_key_false memcg_kmem_online_key;

/* Test the memcg_kmem_online_key static branch (hinted as likely). */
static inline bool memcg_kmem_online(void)
{
        return static_branch_likely(&memcg_kmem_online_key);
}

/*
 * Charge @page via __memcg_kmem_charge_page() when memcg_kmem_online() is
 * true and return its result; otherwise charge nothing and return 0.
 */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        if (!memcg_kmem_online())
                return 0;

        return __memcg_kmem_charge_page(page, gfp, order);
}

/*
 * Uncharge @page via __memcg_kmem_uncharge_page() when memcg_kmem_online()
 * is true; otherwise do nothing.
 */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
        if (!memcg_kmem_online())
                return;

        __memcg_kmem_uncharge_page(page, order);
}

/*
 * A helper for accessing memcg's kmem_id, used for getting
 * corresponding LRU lists. Returns -1 when @memcg is NULL.
 */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        if (!memcg)
                return -1;

        return memcg->kmemcg_id;
}

struct mem_cgroup *mem_cgroup_from_slab_obj(void *p);

/*
 * Add @count to event @idx on the memcg that @objcg maps to.
 *
 * Does nothing unless memcg_kmem_online() is true. The objcg-to-memcg lookup
 * and the count update both happen under rcu_read_lock().
 */
static inline void count_objcg_events(struct obj_cgroup *objcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
        if (memcg_kmem_online()) {
                rcu_read_lock();
                count_memcg_events(obj_cgroup_memcg(objcg), idx, count);
                rcu_read_unlock();
        }
}

#else
/* !CONFIG_MEMCG stub: always returns true. */
static inline bool mem_cgroup_kmem_disabled(void)
{
        return true;
}

/* !CONFIG_MEMCG stub: nothing is charged, always returns 0. */
static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                         int order)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* !CONFIG_MEMCG stub: nothing is charged, always returns 0. */
static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp,
                                           int order)
{
        return 0;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void __memcg_kmem_uncharge_page(struct page *page, int order)
{
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool memcg_bpf_enabled(void)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns false. */
static inline bool memcg_kmem_online(void)
{
        return false;
}

/* !CONFIG_MEMCG stub: always returns -1. */
static inline int memcg_kmem_id(struct mem_cgroup *memcg)
{
        return -1;
}

/* !CONFIG_MEMCG stub: always returns NULL. */
static inline struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
{
        return NULL;
}

/* !CONFIG_MEMCG stub: no-op. */
static inline void count_objcg_events(struct obj_cgroup *objcg,
                                      enum vm_event_item idx,
                                      unsigned long count)
{
}

#endif /* CONFIG_MEMCG */

#if defined(CONFIG_MEMCG) && defined(CONFIG_ZSWAP)
bool obj_cgroup_may_zswap(struct obj_cgroup *objcg);
void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size);
void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg, size_t size);
bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg);
#else
/* Stub used unless both CONFIG_MEMCG and CONFIG_ZSWAP are set: always true. */
static inline bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
{
        return true;
}
/* Stub used unless both CONFIG_MEMCG and CONFIG_ZSWAP are set: no-op. */
static inline void obj_cgroup_charge_zswap(struct obj_cgroup *objcg,
                                           size_t size)
{
}
/* Stub used unless both CONFIG_MEMCG and CONFIG_ZSWAP are set: no-op. */
static inline void obj_cgroup_uncharge_zswap(struct obj_cgroup *objcg,
                                             size_t size)
{
}
/* Stub used unless both CONFIG_MEMCG and CONFIG_ZSWAP are set: always true. */
static inline bool mem_cgroup_zswap_writeback_enabled(struct mem_cgroup *memcg)
{
        /* if zswap is disabled, do not block pages going to the swapping device */
        return true;
}
#endif


/* Cgroup v1-related declarations */

#ifdef CONFIG_MEMCG_V1
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                        gfp_t gfp_mask,
                                        unsigned long *total_scanned);

bool mem_cgroup_oom_synchronize(bool wait);

/* Return @p's memcg_in_oom field as a boolean. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return p->memcg_in_oom;
}

/*
 * Set current->in_user_fault. Warns if the flag was already set, i.e. on an
 * unbalanced enter.
 */
static inline void mem_cgroup_enter_user_fault(void)
{
        WARN_ON(current->in_user_fault);
        current->in_user_fault = 1;
}

/*
 * Clear current->in_user_fault. Warns if the flag was not set, i.e. on an
 * exit without a matching enter.
 */
static inline void mem_cgroup_exit_user_fault(void)
{
        WARN_ON(!current->in_user_fault);
        current->in_user_fault = 0;
}

void memcg1_swapout(struct folio *folio, swp_entry_t entry);
void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages);

#else /* CONFIG_MEMCG_V1 */
/* !CONFIG_MEMCG_V1 stub: always returns 0; *@total_scanned is untouched. */
static inline
unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                        gfp_t gfp_mask,
                                        unsigned long *total_scanned)
{
        return 0;
}

/* !CONFIG_MEMCG_V1 stub: always returns false. */
static inline bool task_in_memcg_oom(struct task_struct *p)
{
        return false;
}

/* !CONFIG_MEMCG_V1 stub: always returns false. */
static inline bool mem_cgroup_oom_synchronize(bool wait)
{
        return false;
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void mem_cgroup_enter_user_fault(void)
{
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void mem_cgroup_exit_user_fault(void)
{
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void memcg1_swapout(struct folio *folio, swp_entry_t entry)
{
}

/* !CONFIG_MEMCG_V1 stub: no-op. */
static inline void memcg1_swapin(swp_entry_t entry, unsigned int nr_pages)
{
}

#endif /* CONFIG_MEMCG_V1 */

#endif /* _LINUX_MEMCONTROL_H */






































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct ptr_ring' datastructure.
 *
 *        Author:
 *                Michael S. Tsirkin <mst@redhat.com>
 *
 *        Copyright (C) 2016 Red Hat, Inc.
 *
 *        This is a limited-size FIFO maintaining pointers in FIFO order, with
 *        one CPU producing entries and another consuming entries from a FIFO.
 *
 *        This implementation tries to minimize cache-contention when there is a
 *        single producer and a single consumer CPU.
 */

#ifndef _LINUX_PTR_RING_H
#define _LINUX_PTR_RING_H 1

#ifdef __KERNEL__
#include <linux/spinlock.h>
#include <linux/cache.h>
#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/errno.h>
#endif

/*
 * Ring of pointers. A NULL slot in @queue is free and a non-NULL slot holds a
 * queued entry; the full/empty helpers test slots rather than compare indices.
 * Producer state, consumer state and the shared fields each start on their
 * own cache line on SMP builds.
 */
struct ptr_ring {
        int producer ____cacheline_aligned_in_smp;
        spinlock_t producer_lock;
        int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */
        int consumer_tail; /* next entry to invalidate */
        spinlock_t consumer_lock;
        /* Shared consumer/producer data */
        /* Read-only by both the producer and the consumer */
        int size ____cacheline_aligned_in_smp; /* max entries in queue */
        int batch; /* number of entries to consume in a batch */
        void **queue;
};

/* Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax().
 *
 * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock:
 * see e.g. ptr_ring_full.
 */
static inline bool __ptr_ring_full(struct ptr_ring *r)
{
        return r->queue[r->producer];
}

/* Locked form of __ptr_ring_full(): takes producer_lock with spin_lock(). */
static inline bool ptr_ring_full(struct ptr_ring *r)
{
        bool full;

        spin_lock(&r->producer_lock);
        full = __ptr_ring_full(r);
        spin_unlock(&r->producer_lock);

        return full;
}

/* Locked form of __ptr_ring_full(): takes producer_lock with spin_lock_irq(). */
static inline bool ptr_ring_full_irq(struct ptr_ring *r)
{
        bool full;

        spin_lock_irq(&r->producer_lock);
        full = __ptr_ring_full(r);
        spin_unlock_irq(&r->producer_lock);

        return full;
}

/*
 * Locked form of __ptr_ring_full(): takes producer_lock with
 * spin_lock_irqsave() and restores the saved state afterwards.
 */
static inline bool ptr_ring_full_any(struct ptr_ring *r)
{
        unsigned long irq_flags;
        bool full;

        spin_lock_irqsave(&r->producer_lock, irq_flags);
        full = __ptr_ring_full(r);
        spin_unlock_irqrestore(&r->producer_lock, irq_flags);

        return full;
}

/* Locked form of __ptr_ring_full(): takes producer_lock with spin_lock_bh(). */
static inline bool ptr_ring_full_bh(struct ptr_ring *r)
{
        bool full;

        spin_lock_bh(&r->producer_lock);
        full = __ptr_ring_full(r);
        spin_unlock_bh(&r->producer_lock);

        return full;
}

/* Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax(). Callers must hold producer_lock.
 * Callers are responsible for making sure the pointer being queued
 * points to valid data.
 *
 * Returns 0 on success, or -ENOSPC when the ring has zero size or the
 * producer slot is still occupied.
 */
static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
        /* A non-NULL producer slot means the consumer has not freed it yet. */
        if (unlikely(!r->size) || r->queue[r->producer])
                return -ENOSPC;

        /* Make sure the pointer we are storing points to valid data. */
        /* Pairs with the dependency ordering in __ptr_ring_consume. */
        smp_wmb();

        WRITE_ONCE(r->queue[r->producer++], ptr);
        /* Wrap the producer index at the end of the ring. */
        if (unlikely(r->producer >= r->size))
                r->producer = 0;
        return 0;
}

/*
 * Locked form of __ptr_ring_produce(): takes producer_lock with spin_lock().
 *
 * Note: resize (below) nests producer lock within consumer lock, so if you
 * consume in interrupt or BH context, you must disable interrupts/BH when
 * calling this.
 */
static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock(&r->producer_lock);

        return err;
}

/* Locked form of __ptr_ring_produce(): takes producer_lock with spin_lock_irq(). */
static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock_irq(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_irq(&r->producer_lock);

        return err;
}

/*
 * Locked form of __ptr_ring_produce(): takes producer_lock with
 * spin_lock_irqsave() and restores the saved state afterwards.
 */
static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr)
{
        unsigned long irq_flags;
        int err;

        spin_lock_irqsave(&r->producer_lock, irq_flags);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_irqrestore(&r->producer_lock, irq_flags);

        return err;
}

/* Locked form of __ptr_ring_produce(): takes producer_lock with spin_lock_bh(). */
static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr)
{
        int err;

        spin_lock_bh(&r->producer_lock);
        err = __ptr_ring_produce(r, ptr);
        spin_unlock_bh(&r->producer_lock);

        return err;
}

/*
 * Return the entry at consumer_head without removing it.
 * Returns NULL when that slot is empty or the ring has zero size.
 */
static inline void *__ptr_ring_peek(struct ptr_ring *r)
{
        if (unlikely(!r->size))
                return NULL;

        return READ_ONCE(r->queue[r->consumer_head]);
}

/*
 * Test ring empty status without taking any locks.
 *
 * NB: This is only safe to call if ring is never resized.
 *
 * However, if some other CPU consumes ring entries at the same time, the value
 * returned is not guaranteed to be correct.
 *
 * In this case - to avoid incorrectly detecting the ring
 * as empty - the CPU consuming the ring entries is responsible
 * for either consuming all ring entries until the ring is empty,
 * or synchronizing with some other CPU and causing it to
 * re-test __ptr_ring_empty and/or consume the ring entries
 * after the synchronization point.
 *
 * Note: callers invoking this in a loop must use a compiler barrier,
 * for example cpu_relax().
 */
static inline bool __ptr_ring_empty(struct ptr_ring *r)
{
        /* A zero-size ring never holds anything. */
        if (unlikely(!r->size))
                return true;

        return !r->queue[READ_ONCE(r->consumer_head)];
}

/* Locked form of __ptr_ring_empty(): takes consumer_lock with spin_lock(). */
static inline bool ptr_ring_empty(struct ptr_ring *r)
{
        bool empty;

        spin_lock(&r->consumer_lock);
        empty = __ptr_ring_empty(r);
        spin_unlock(&r->consumer_lock);

        return empty;
}

/* Locked form of __ptr_ring_empty(): takes consumer_lock with spin_lock_irq(). */
static inline bool ptr_ring_empty_irq(struct ptr_ring *r)
{
        bool empty;

        spin_lock_irq(&r->consumer_lock);
        empty = __ptr_ring_empty(r);
        spin_unlock_irq(&r->consumer_lock);

        return empty;
}

/*
 * Locked form of __ptr_ring_empty(): takes consumer_lock with
 * spin_lock_irqsave() and restores the saved state afterwards.
 */
static inline bool ptr_ring_empty_any(struct ptr_ring *r)
{
        unsigned long irq_flags;
        bool empty;

        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        empty = __ptr_ring_empty(r);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);

        return empty;
}

/* Locked form of __ptr_ring_empty(): takes consumer_lock with spin_lock_bh(). */
static inline bool ptr_ring_empty_bh(struct ptr_ring *r)
{
        bool empty;

        spin_lock_bh(&r->consumer_lock);
        empty = __ptr_ring_empty(r);
        spin_unlock_bh(&r->consumer_lock);

        return empty;
}

/*
 * Advance the consumer past the entry at consumer_head.
 *
 * Must only be called after __ptr_ring_peek returned !NULL.
 *
 * The consumed slot is not necessarily cleared right away: slots in
 * [consumer_tail, consumer_head) are set to NULL in one go once r->batch
 * entries have accumulated or the end of the ring is reached.
 */
static inline void __ptr_ring_discard_one(struct ptr_ring *r)
{
        /* Fundamentally, what we want to do is update consumer
         * index and zero out the entry so producer can reuse it.
         * Doing it naively at each consume would be as simple as:
         *       consumer = r->consumer;
         *       r->queue[consumer++] = NULL;
         *       if (unlikely(consumer >= r->size))
         *               consumer = 0;
         *       r->consumer = consumer;
         * but that is suboptimal when the ring is full as producer is writing
         * out new entries in the same cache line.  Defer these updates until a
         * batch of entries has been consumed.
         */
        /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty
         * to work correctly.
         */
        /* head: index of the slot being discarded; consumer_head: the slot after it. */
        int consumer_head = r->consumer_head;
        int head = consumer_head++;

        /* Once we have processed enough entries invalidate them in
         * the ring all at once so producer can reuse their space in the ring.
         * We also do this when we reach end of the ring - not mandatory
         * but helps keep the implementation simple.
         */
        if (unlikely(consumer_head - r->consumer_tail >= r->batch ||
                     consumer_head >= r->size)) {
                /* Zero out entries in the reverse order: this way we touch the
                 * cache line that producer might currently be reading the last;
                 * producer won't make progress and touch other cache lines
                 * besides the first one until we write out all entries.
                 */
                while (likely(head >= r->consumer_tail))
                        r->queue[head--] = NULL;
                r->consumer_tail = consumer_head;
        }
        /* Wrap both consumer indices back to the start of the ring. */
        if (unlikely(consumer_head >= r->size)) {
                consumer_head = 0;
                r->consumer_tail = 0;
        }
        /* matching READ_ONCE in __ptr_ring_empty for lockless tests */
        WRITE_ONCE(r->consumer_head, consumer_head);
}

static inline void *__ptr_ring_consume(struct ptr_ring *r)
{
        /* The READ_ONCE inside __ptr_ring_peek ensures that whoever
         * dereferences the returned pointer sees up-to-date data. It pairs
         * with the smp_wmb in __ptr_ring_produce.
         */
        void *entry = __ptr_ring_peek(r);

        /* Nothing queued: leave the ring untouched. */
        if (!entry)
                return entry;

        __ptr_ring_discard_one(r);
        return entry;
}

static inline int __ptr_ring_consume_batched(struct ptr_ring *r,
                                             void **array, int n)
{
        int count = 0;

        /* Pull up to n entries into array[], stopping early once the ring
         * has nothing more to hand out.
         */
        while (count < n) {
                void *entry = __ptr_ring_consume(r);

                if (!entry)
                        break;
                array[count++] = entry;
        }

        /* Number of entries actually stored in array[]. */
        return count;
}

/*
 * Note: resize (below) nests producer lock within consumer lock, so if you
 * call this in interrupt or BH context, you must disable interrupts/BH when
 * producing.
 */
static inline void *ptr_ring_consume(struct ptr_ring *r)
{
        void *entry;

        /* Consume one entry while holding the consumer lock. */
        spin_lock(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock(&r->consumer_lock);

        return entry;
}

static inline void *ptr_ring_consume_irq(struct ptr_ring *r)
{
        void *entry;

        /* Same as ptr_ring_consume(), using the _irq lock variants. */
        spin_lock_irq(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock_irq(&r->consumer_lock);

        return entry;
}

static inline void *ptr_ring_consume_any(struct ptr_ring *r)
{
        unsigned long irq_flags;
        void *entry;

        /* Same as ptr_ring_consume(), using the irqsave/irqrestore lock variants. */
        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        entry = __ptr_ring_consume(r);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);

        return entry;
}

static inline void *ptr_ring_consume_bh(struct ptr_ring *r)
{
        void *entry;

        /* Same as ptr_ring_consume(), using the _bh lock variants. */
        spin_lock_bh(&r->consumer_lock);
        entry = __ptr_ring_consume(r);
        spin_unlock_bh(&r->consumer_lock);

        return entry;
}

static inline int ptr_ring_consume_batched(struct ptr_ring *r,
                                           void **array, int n)
{
        int consumed;

        /* Batched consume under the consumer lock; returns the entry count. */
        spin_lock(&r->consumer_lock);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock(&r->consumer_lock);

        return consumed;
}

static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r,
                                               void **array, int n)
{
        int consumed;

        /* Same as ptr_ring_consume_batched(), using the _irq lock variants. */
        spin_lock_irq(&r->consumer_lock);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_irq(&r->consumer_lock);

        return consumed;
}

static inline int ptr_ring_consume_batched_any(struct ptr_ring *r,
                                               void **array, int n)
{
        unsigned long irq_flags;
        int consumed;

        /* Same as ptr_ring_consume_batched(), using the irqsave/irqrestore
         * lock variants.
         */
        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);

        return consumed;
}

static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r,
                                              void **array, int n)
{
        int consumed;

        /* Same as ptr_ring_consume_batched(), using the _bh lock variants. */
        spin_lock_bh(&r->consumer_lock);
        consumed = __ptr_ring_consume_batched(r, array, n);
        spin_unlock_bh(&r->consumer_lock);

        return consumed;
}

/* Cast to structure type and call a function without discarding from FIFO.
 * Function must return a value.
 * Callers must take consumer_lock.
 *
 * f is handed whatever __ptr_ring_peek(r) returns, which is NULL when the
 * ring has size 0 or the head slot holds no entry, so f must cope with NULL.
 */
#define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r)))

/* Locked form of __PTR_RING_PEEK_CALL: runs f on the peeked entry between
 * spin_lock()/spin_unlock() of consumer_lock and evaluates to f's result.
 * The result variable takes its type from typeof((f)(NULL)).
 */
#define PTR_RING_PEEK_CALL(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Locked form of __PTR_RING_PEEK_CALL: runs f on the peeked entry between
 * spin_lock_irq()/spin_unlock_irq() of consumer_lock and evaluates to f's
 * result.
 */
#define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock_irq(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_irq(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Locked form of __PTR_RING_PEEK_CALL: runs f on the peeked entry between
 * spin_lock_bh()/spin_unlock_bh() of consumer_lock and evaluates to f's
 * result.
 */
#define PTR_RING_PEEK_CALL_BH(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        \
        spin_lock_bh(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_bh(&(r)->consumer_lock); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Locked form of __PTR_RING_PEEK_CALL: runs f on the peeked entry between
 * spin_lock_irqsave()/spin_unlock_irqrestore() of consumer_lock and
 * evaluates to f's result. __PTR_RING_PEEK_CALL_f holds the saved flags.
 */
#define PTR_RING_PEEK_CALL_ANY(r, f) ({ \
        typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \
        unsigned long __PTR_RING_PEEK_CALL_f;\
        \
        spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \
        __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \
        spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \
        __PTR_RING_PEEK_CALL_v; \
})

/* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See
 * documentation for vmalloc for which of them are legal.
 */
static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp)
{
        void **queue = NULL;

        /* Refuse sizes whose pointer array would exceed KMALLOC_MAX_SIZE bytes;
         * otherwise allocate a zeroed array of size pointers.
         */
        if (size <= KMALLOC_MAX_SIZE / sizeof(void *))
                queue = kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO);

        return queue;
}

static inline void __ptr_ring_set_size(struct ptr_ring *r, int size)
{
        r->size = size;

        /* Default batch: the number of queue slots in two cache lines. */
        r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue));

        /* __ptr_ring_discard_one needs a batch of at least 1 to work.
         * A batch that is large relative to the ring would make consumption
         * very bursty, so for small rings batching is disabled (batch of 1)
         * until this is tuned.
         */
        if (!r->batch || r->batch > r->size / 2)
                r->batch = 1;
}

static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp)
{
        /* r->queue is assigned even when the allocation fails (it is then NULL). */
        r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp);
        if (r->queue) {
                __ptr_ring_set_size(r, size);

                /* Start with all indices at the beginning of the ring. */
                r->consumer_tail = 0;
                r->consumer_head = 0;
                r->producer = 0;

                spin_lock_init(&r->producer_lock);
                spin_lock_init(&r->consumer_lock);

                return 0;
        }

        return -ENOMEM;
}
#define ptr_ring_init(...)        alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__))

/*
 * Return entries into ring. Destroy entries that don't fit.
 *
 * @r:       ring to put the entries back into
 * @batch:   array holding @n entries to return
 * @n:       number of entries in @batch
 * @destroy: called on every entry of @batch that was not put back; it is
 *           invoked without a NULL check, so it must be a valid function
 *           whenever entries can be left over
 *
 * Entries are put back starting from the end of @batch, each one slot
 * before the current consumer head, so the leftover (destroyed) entries are
 * the ones at the lowest indices of @batch.
 *
 * Note: this is expected to be a rare slow path operation.
 *
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 */
static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n,
                                      void (*destroy)(void *))
{
        unsigned long flags;
        int head;

        spin_lock_irqsave(&r->consumer_lock, flags);
        spin_lock(&r->producer_lock);

        /* A zero-sized ring cannot take anything back: destroy the whole batch. */
        if (!r->size)
                goto done;

        /*
         * Clean out buffered entries (for simplicity). This way following code
         * can test entries for NULL and if not assume they are valid.
         */
        head = r->consumer_head - 1;
        while (likely(head >= r->consumer_tail))
                r->queue[head--] = NULL;
        r->consumer_tail = r->consumer_head;

        /*
         * Go over entries in batch, start moving head back and copy entries.
         * Stop when we run into previously unconsumed entries.
         */
        while (n) {
                /* Slot just before the consumer head, wrapping to the last slot. */
                head = r->consumer_head - 1;
                if (head < 0)
                        head = r->size - 1;
                if (r->queue[head]) {
                        /* This batch entry will have to be destroyed. */
                        goto done;
                }
                r->queue[head] = batch[--n];
                r->consumer_tail = head;
                /* matching READ_ONCE in __ptr_ring_empty for lockless tests */
                WRITE_ONCE(r->consumer_head, head);
        }

done:
        /* Destroy all entries left in the batch. */
        while (n)
                destroy(batch[--n]);
        spin_unlock(&r->producer_lock);
        spin_unlock_irqrestore(&r->consumer_lock, flags);
}

static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue,
                                           int size, gfp_t gfp,
                                           void (*destroy)(void *))
{
        void **previous;
        void *entry;
        int kept = 0;

        /* Drain the current ring: move entries into the new queue while
         * there is room, and hand the overflow to destroy (when one was
         * supplied). The gfp argument is not used here.
         */
        for (entry = __ptr_ring_consume(r); entry; entry = __ptr_ring_consume(r)) {
                if (kept < size)
                        queue[kept++] = entry;
                else if (destroy)
                        destroy(entry);
        }

        /* A new queue filled to capacity wraps the producer index to 0. */
        if (kept >= size)
                kept = 0;

        __ptr_ring_set_size(r, size);
        r->producer = kept;
        r->consumer_head = 0;
        r->consumer_tail = 0;

        /* Install the new queue and give the old one back to the caller. */
        previous = r->queue;
        r->queue = queue;

        return previous;
}

/*
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in interrupt or BH context, you must
 * disable interrupts/BH when doing so.
 */
static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp,
                                  void (*destroy)(void *))
{
        /* The replacement queue is allocated before any lock is taken. */
        void **new_queue = __ptr_ring_init_queue_alloc_noprof(size, gfp);
        void **old_queue;
        unsigned long irq_flags;

        if (!new_queue)
                return -ENOMEM;

        /* Swap queues with both locks held, producer nested inside consumer. */
        spin_lock_irqsave(&r->consumer_lock, irq_flags);
        spin_lock(&r->producer_lock);

        old_queue = __ptr_ring_swap_queue(r, new_queue, size, gfp, destroy);

        spin_unlock(&r->producer_lock);
        spin_unlock_irqrestore(&r->consumer_lock, irq_flags);

        /* The old queue is freed only after the locks are dropped. */
        kvfree(old_queue);

        return 0;
}
#define ptr_ring_resize(...)        alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__))

/*
 * Note: producer lock is nested within consumer lock, so if you
 * resize you must make sure all uses nest correctly.
 * In particular if you consume ring in BH context, you must
 * disable BH when doing so.
 */
static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings,
                                                     unsigned int nrings,
                                                     int size, gfp_t gfp,
                                                     void (*destroy)(void *))
{
        void ***queues;
        int i;

        queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp);
        if (!queues)
                return -ENOMEM;

        /* Allocate every replacement queue up front, so that an allocation
         * failure leaves all rings untouched.
         */
        for (i = 0; i < nrings; ++i) {
                queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp);
                if (!queues[i])
                        goto nomem;
        }

        /* Swap ring by ring; afterwards queues[i] holds ring i's old queue. */
        for (i = 0; i < nrings; ++i) {
                struct ptr_ring *ring = rings[i];

                spin_lock_bh(&ring->consumer_lock);
                spin_lock(&ring->producer_lock);
                queues[i] = __ptr_ring_swap_queue(ring, queues[i],
                                                  size, gfp, destroy);
                spin_unlock(&ring->producer_lock);
                spin_unlock_bh(&ring->consumer_lock);
        }

        /* Release the old queues and the bookkeeping array. */
        for (i = 0; i < nrings; ++i)
                kvfree(queues[i]);

        kfree(queues);

        return 0;

nomem:
        /* Free the queues allocated before the failing one, newest first. */
        while (--i >= 0)
                kvfree(queues[i]);

        kfree(queues);

        return -ENOMEM;
}
#define ptr_ring_resize_multiple_bh(...) \
                alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__))

static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *))
{
        /* The ring is drained only when a destroy callback was supplied. */
        if (destroy) {
                void *entry;

                while ((entry = ptr_ring_consume(r)))
                        destroy(entry);
        }

        /* The queue array itself is always released. */
        kvfree(r->queue);
}

#endif /* _LINUX_PTR_RING_H  */









































































































































































    3 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A security context is a set of security attributes
 * associated with each subject and object controlled
 * by the security policy.  Security contexts are
 * externally represented as variable-length strings
 * that can be interpreted by a user or application
 * with an understanding of the security policy.
 * Internally, the security server uses a simple
 * structure.  This structure is private to the
 * security server and can be changed without affecting
 * clients of the security server.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SS_CONTEXT_H_
#define _SS_CONTEXT_H_

#include "ebitmap.h"
#include "mls_types.h"
#include "security.h"

/*
 * A security context consists of an authenticated user
 * identity, a role, a type and a MLS range.
 */
struct context {
        u32 user; /* user identity component */
        u32 role; /* role component */
        u32 type; /* type component */
        u32 len; /* length of string in bytes */
        struct mls_range range; /* MLS range component */
        char *str; /* string representation if context cannot be mapped. */
};

static inline void mls_context_init(struct context *c)
{
        /* Zero only the MLS range; the other fields of *c are left alone. */
        memset(&c->range, 0, sizeof c->range);
}

static inline int mls_context_cpy(struct context *dst,
                                  const struct context *src)
{
        int rc;

        /* Low level: sensitivity, then category bitmap. */
        dst->range.level[0].sens = src->range.level[0].sens;
        rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
        if (rc)
                return rc;

        /* High level; if its bitmap copy fails, drop the low-level copy
         * made above before reporting the error.
         */
        dst->range.level[1].sens = src->range.level[1].sens;
        rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
        if (rc)
                ebitmap_destroy(&dst->range.level[0].cat);

        return rc;
}

/*
 * Sets both levels in the MLS range of 'dst' to the low level of 'src'.
 */
static inline int mls_context_cpy_low(struct context *dst,
                                      const struct context *src)
{
        int rc;

        /* dst low level <- src low level. */
        dst->range.level[0].sens = src->range.level[0].sens;
        rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat);
        if (rc)
                return rc;

        /* dst high level <- src low level as well; on failure, release the
         * bitmap copied for dst's low level.
         */
        dst->range.level[1].sens = src->range.level[0].sens;
        rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat);
        if (rc)
                ebitmap_destroy(&dst->range.level[0].cat);

        return rc;
}

/*
 * Sets both levels in the MLS range of 'dst' to the high level of 'src'.
 */
static inline int mls_context_cpy_high(struct context *dst,
                                       const struct context *src)
{
        int rc;

        /* dst low level <- src high level. */
        dst->range.level[0].sens = src->range.level[1].sens;
        rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat);
        if (rc)
                return rc;

        /* dst high level <- src high level; on failure, release the bitmap
         * copied for dst's low level.
         */
        dst->range.level[1].sens = src->range.level[1].sens;
        rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat);
        if (rc)
                ebitmap_destroy(&dst->range.level[0].cat);

        return rc;
}

static inline int mls_context_glblub(struct context *dst,
                                     const struct context *c1,
                                     const struct context *c2)
{
        const struct mls_range *r1 = &c1->range;
        const struct mls_range *r2 = &c2->range;
        struct mls_range *dr = &dst->range;
        int rc;

        /* Ranges whose sensitivities do not overlap cannot be combined. */
        if (r1->level[1].sens < r2->level[0].sens ||
            r2->level[1].sens < r1->level[0].sens)
                return -EINVAL;

        /* Low sensitivity: the greater of the two lows. */
        dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens);

        /* High sensitivity: the lesser of the two highs. */
        dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens);

        /* Categories at each level are the AND of both inputs' categories.
         * NOTE(review): when the second ebitmap_and fails, the low-level
         * bitmap built by the first call is not released here - presumably
         * the caller destroys dst on error; confirm.
         */
        rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat,
                         &r2->level[0].cat);
        if (rc)
                return rc;

        return ebitmap_and(&dr->level[1].cat, &r1->level[1].cat,
                           &r2->level[1].cat);
}

static inline bool mls_context_equal(const struct context *c1,
                                     const struct context *c2)
{
        /* Low level: sensitivity first, then the category bitmap. */
        if (c1->range.level[0].sens != c2->range.level[0].sens)
                return false;
        if (!ebitmap_equal(&c1->range.level[0].cat, &c2->range.level[0].cat))
                return false;

        /* High level, compared in the same order. */
        if (c1->range.level[1].sens != c2->range.level[1].sens)
                return false;

        return ebitmap_equal(&c1->range.level[1].cat, &c2->range.level[1].cat);
}

static inline void mls_context_destroy(struct context *c)
{
        /* Release the category bitmaps of the low and the high level... */
        ebitmap_destroy(&c->range.level[0].cat);
        ebitmap_destroy(&c->range.level[1].cat);

        /* ...then reset the whole MLS range to zero. */
        mls_context_init(c);
}

static inline void context_init(struct context *c)
{
        /* Zero every field of the context. */
        memset(c, 0, sizeof *c);
}

static inline int context_cpy(struct context *dst, const struct context *src)
{
        int rc;

        dst->user = src->user;
        dst->role = src->role;
        dst->type = src->type;

        /* Duplicate the string form when src has one; otherwise dst gets none. */
        if (!src->str) {
                dst->str = NULL;
                dst->len = 0;
        } else {
                dst->str = kstrdup(src->str, GFP_ATOMIC);
                if (!dst->str)
                        return -ENOMEM;
                dst->len = src->len;
        }

        rc = mls_context_cpy(dst, src);
        if (!rc)
                return 0;

        /* MLS copy failed: drop the string duplicated above. */
        kfree(dst->str);
        dst->str = NULL;
        dst->len = 0;
        return rc;
}

static inline void context_destroy(struct context *c)
{
        /* Clear the identity fields. */
        c->type = 0;
        c->role = 0;
        c->user = 0;

        /* Free and forget the string form. */
        kfree(c->str);
        c->str = NULL;
        c->len = 0;

        /* Release and reset the MLS range. */
        mls_context_destroy(c);
}

static inline bool context_equal(const struct context *c1,
                                 const struct context *c2)
{
        /* Both have a string form: equal lengths and equal strings decide. */
        if (c1->len && c2->len)
                return c1->len == c2->len && !strcmp(c1->str, c2->str);

        /* Only one has a string form: never equal. */
        if (c1->len || c2->len)
                return false;

        /* Neither has one: compare field by field, MLS range last. */
        if (c1->user != c2->user)
                return false;
        if (c1->role != c2->role)
                return false;
        if (c1->type != c2->type)
                return false;

        return mls_context_equal(c1, c2);
}

/*
 * Declared here; the definition is not in this header.
 * NOTE(review): presumably returns a hash computed over the context's
 * contents - confirm against the definition.
 */
u32 context_compute_hash(const struct context *c);

#endif /* _SS_CONTEXT_H_ */
























































































































































































































































































































































  144 

















    3 
  144 



















































  141 
  141 











































   10 














   10 



   10 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/capability.c
 *
 * Copyright (C) 1997  Andrew Main <zefram@fysh.org>
 *
 * Integrated into 2.1.97+,  Andrew G. Morgan <morgan@kernel.org>
 * 30 May 2002:        Cleanup, Robert M. Love <rml@tech9.net>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/uaccess.h>

/*
 * Starts out as 1 and is cleared by file_caps_disable() below.
 * NOTE(review): presumably consulted elsewhere to decide whether file
 * capabilities are honoured - confirm against the users of this flag.
 */
int file_caps_enabled = 1;

/*
 * Handler registered through __setup() under the name "no_file_caps":
 * clears file_caps_enabled. The @str argument is not used. Always returns 1.
 */
static int __init file_caps_disable(char *str)
{
        file_caps_enabled = 0;
        return 1;
}
__setup("no_file_caps", file_caps_disable);

#ifdef CONFIG_MULTIUSER
/*
 * More recent versions of libcap are available from:
 *
 *   http://www.kernel.org/pub/linux/libs/security/linux-privs/
 */

static void warn_legacy_capability_use(void)
{
        /* Name of the calling task, reported in the one-time message. */
        const char *comm = current->comm;

        pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
                     comm);
}

/*
 * Version 2 capabilities worked fine, but the linux/capability.h file
 * that accompanied their introduction encouraged their use without
 * the necessary user-space source code changes. As such, we have
 * created a version 3 with equivalent functionality to version 2, but
 * with a header change to protect legacy source code from using
 * version 2 when it wanted to use version 1. If your system has code
 * that trips the following warning, it is using version 2 specific
 * capabilities and may be doing so insecurely.
 *
 * The remedy is to either upgrade your version of libcap (to 2.10+,
 * if the application is linked against it), or recompile your
 * application with modern kernel headers and this warning will go
 * away.
 */

static void warn_deprecated_v2(void)
{
        /* Name of the calling task, reported in the one-time message. */
        const char *comm = current->comm;

        pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
                     comm);
}

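/*
 * Version check. On success, return 0 and store the number of u32s in
 * each capability flag array in *tocopy. Return a negative value on error;
 * for an unrecognised version, the kernel's version is written back to
 * the header before returning -EINVAL.
 */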
static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
{
        __u32 version;

        if (get_user(version, &header->version))
                return -EFAULT;

        /* v1: legacy 32-bit layout. */
        if (version == _LINUX_CAPABILITY_VERSION_1) {
                warn_legacy_capability_use();
                *tocopy = _LINUX_CAPABILITY_U32S_1;
                return 0;
        }

        /* v2 and v3 share a layout; v2 additionally triggers a warning. */
        if (version == _LINUX_CAPABILITY_VERSION_2 ||
            version == _LINUX_CAPABILITY_VERSION_3) {
                if (version == _LINUX_CAPABILITY_VERSION_2)
                        warn_deprecated_v2();
                *tocopy = _LINUX_CAPABILITY_U32S_3;
                return 0;
        }

        /* Unknown version: write the kernel's version back to the caller. */
        if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
                return -EFAULT;

        return -EINVAL;
}

/*
 * The only thing that can change the capabilities of the current
 * process is the current process. As such, we can't be in this code
 * at the same time as we are in the process of setting capabilities
 * in this process. The net result is that we can limit our use of
 * locks to when we are reading the caps of another process.
 */
static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
                                     kernel_cap_t *pIp, kernel_cap_t *pPp)
{
        const struct task_struct *target;
        int ret;

        /* pid 0 or our own pid: query the current task, no lookup needed. */
        if (!pid || pid == task_pid_vnr(current))
                return security_capget(current, pEp, pIp, pPp);

        /* Another task: look it up and query it inside an RCU read section. */
        rcu_read_lock();
        target = find_task_by_vpid(pid);
        ret = target ? security_capget(target, pEp, pIp, pPp) : -ESRCH;
        rcu_read_unlock();

        return ret;
}

/**
 * sys_capget - get the capabilities of a given process.
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @dataptr: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities that are returned
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
{
        struct __user_cap_data_struct kdata[2];
        kernel_cap_t pE, pI, pP;
        unsigned tocopy;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);

        /*
         * With a NULL dataptr there is nothing to fill in: -EINVAL from the
         * version check (which has already written the kernel's version back
         * to the header) is turned into 0; any other result is returned as is.
         */
        if (dataptr == NULL)
                return (ret == -EINVAL) ? 0 : ret;
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        if (pid < 0)
                return -EINVAL;

        ret = cap_get_target_pid(pid, &pE, &pI, &pP);
        if (ret)
                return ret;

        /*
         * The legacy format exposes the 64-bit capability values as two
         * sets of 32-bit fields: element 0 carries the low half, element 1
         * the high half.
         */
        kdata[0].effective = pE.val;
        kdata[1].effective = pE.val >> 32;
        kdata[0].permitted = pP.val;
        kdata[1].permitted = pP.val >> 32;
        kdata[0].inheritable = pI.val;
        kdata[1].inheritable = pI.val >> 32;

        /*
         * When tocopy < _KERNEL_CAPABILITY_U32S the upper capabilities are
         * silently dropped here. Older libcap implementations therefore
         * implicitly drop the upper capability bits when they perform a
         * capget/modify/capset sequence.
         *
         * This is considered fail-safe behavior; upgrading the application
         * to a newer libcap gives it access to the newer capabilities.
         *
         * Returning an error (-ERANGE) instead would make legacy
         * applications fail unexpectedly: the capget/modify/capset sequence
         * would abort before any modification is attempted.
         */
        if (copy_to_user(dataptr, kdata, tocopy * sizeof(kdata[0])))
                return -EFAULT;

        return 0;
}

static kernel_cap_t mk_kernel_cap(u32 low, u32 high)
{
        /* Join the two 32-bit halves, then keep only the bits in CAP_VALID_MASK. */
        u64 bits = ((u64)high << 32) | low;

        return (kernel_cap_t) { bits & CAP_VALID_MASK };
}

/**
 * sys_capset - set capabilities for a process or (*) a group of processes
 * @header: pointer to struct that contains capability version and
 *        target pid data
 * @data: pointer to struct that contains the effective, permitted,
 *        and inheritable capabilities
 *
 * Set capabilities for the current process only.  The ability to affect any
 * other process(es) has been deprecated and removed.
 *
 * The restrictions on setting capabilities are specified as:
 *
 * I: any raised capabilities must be a subset of the old permitted
 * P: any raised capabilities must be a subset of the old permitted
 * E: must be set to a subset of new permitted
 *
 * Returns 0 on success and < 0 on error.
 */
SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
{
        /* Zero-initialised so halves the caller does not supply read as 0. */
        struct __user_cap_data_struct kdata[2] = { { 0, }, };
        kernel_cap_t inheritable, permitted, effective;
        unsigned tocopy, copybytes;
        struct cred *new;
        pid_t pid;
        int ret;

        ret = cap_validate_magic(header, &tocopy);
        if (ret != 0)
                return ret;

        if (get_user(pid, &header->pid))
                return -EFAULT;

        /* Only the calling process (pid 0 or its own pid) may be targeted. */
        if (pid != 0 && pid != task_pid_vnr(current))
                return -EPERM;

        /* Never copy more than kdata can hold. */
        copybytes = tocopy * sizeof(struct __user_cap_data_struct);
        if (copybytes > sizeof(kdata))
                return -EFAULT;

        if (copy_from_user(&kdata, data, copybytes))
                return -EFAULT;

        /* Reassemble each set from its low (element 0) and high (element 1) halves. */
        effective   = mk_kernel_cap(kdata[0].effective,   kdata[1].effective);
        permitted   = mk_kernel_cap(kdata[0].permitted,   kdata[1].permitted);
        inheritable = mk_kernel_cap(kdata[0].inheritable, kdata[1].inheritable);

        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        ret = security_capset(new, current_cred(),
                              &effective, &inheritable, &permitted);
        if (ret < 0) {
                /* Rejected: discard the prepared credentials. */
                abort_creds(new);
                return ret;
        }

        audit_log_capset(new, current_cred());

        return commit_creds(new);
}

/**
 * has_ns_capability - Does a task have a capability in a specific user ns
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true when @t currently has the superior capability @cap in effect
 * with respect to @ns, and false otherwise.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability(struct task_struct *t,
                       struct user_namespace *ns, int cap)
{
        int err;

        /* The task's credentials are looked up inside an RCU read section */
        rcu_read_lock();
        err = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE);
        rcu_read_unlock();

        /* security_capable() reports "granted" as zero */
        return !err;
}

/**
 * has_ns_capability_noaudit - Does a task have a capability (unaudited)
 * in a specific user ns.
 * @t: The task in question
 * @ns: target user namespace
 * @cap: The capability to be tested for
 *
 * Return true when @t currently has the superior capability @cap in effect
 * with respect to @ns, and false otherwise.  The check is made with
 * CAP_OPT_NOAUDIT, i.e. no audit message is written for it.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_ns_capability_noaudit(struct task_struct *t,
                               struct user_namespace *ns, int cap)
{
        int err;

        /* The task's credentials are looked up inside an RCU read section */
        rcu_read_lock();
        err = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT);
        rcu_read_unlock();

        /* security_capable() reports "granted" as zero */
        return !err;
}

/**
 * has_capability_noaudit - Does a task have a capability (unaudited) in the
 * initial user ns
 * @t: The task in question
 * @cap: The capability to be tested for
 *
 * Convenience wrapper around has_ns_capability_noaudit() that targets
 * init_user_ns.  Return true if @t has the superior capability @cap
 * currently in effect there, false if not.  No audit message is written
 * for the check.
 *
 * Note that this does not set PF_SUPERPRIV on the task.
 */
bool has_capability_noaudit(struct task_struct *t, int cap)
{
        struct user_namespace *target_ns = &init_user_ns;

        return has_ns_capability_noaudit(t, target_ns, cap);
}
EXPORT_SYMBOL(has_capability_noaudit);

/*
 * Shared implementation of the ns_capable*() helpers: check whether the
 * current task has @cap with respect to @ns, passing @opts through to
 * security_capable().  PF_SUPERPRIV is set on the current task when the
 * capability is granted.
 */
static bool ns_capable_common(struct user_namespace *ns,
                              int cap,
                              unsigned int opts)
{
        /* An invalid capability number is treated as a fatal caller bug */
        if (unlikely(!cap_valid(cap))) {
                pr_crit("capable() called with invalid cap=%u\n", cap);
                BUG();
        }

        /* Any non-zero result means the capability is not available */
        if (security_capable(current_cred(), ns, cap, opts) != 0)
                return false;

        current->flags |= PF_SUPERPRIV;
        return true;
}

/**
 * ns_capable - Determine if the current task has a superior capability in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the superior capability @cap
 * currently available for use in @ns, false if not.
 *
 * PF_SUPERPRIV is set on the task when the capability is available, on the
 * assumption that it is about to be used.
 */
bool ns_capable(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NONE;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable);

/**
 * ns_capable_noaudit - Determine if the current task has a superior capability
 * (unaudited) in effect
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the superior capability @cap
 * currently available for use in @ns, false if not.  The check is made
 * with CAP_OPT_NOAUDIT.
 *
 * PF_SUPERPRIV is set on the task when the capability is available, on the
 * assumption that it is about to be used.
 */
bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_NOAUDIT;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_noaudit);

/**
 * ns_capable_setid - Determine if the current task has a superior capability
 * in effect, while signalling that this check is being done from within a
 * setid or setgroups syscall.
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * Return true if the current task has the superior capability @cap
 * currently available for use in @ns, false if not.  The check is made
 * with CAP_OPT_INSETID.
 *
 * PF_SUPERPRIV is set on the task when the capability is available, on the
 * assumption that it is about to be used.
 */
bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        const unsigned int opts = CAP_OPT_INSETID;

        return ns_capable_common(ns, cap, opts);
}
EXPORT_SYMBOL(ns_capable_setid);

/**
 * capable - Determine if the current task has a superior capability in effect
 * @cap: The capability to be tested for
 *
 * Convenience wrapper around ns_capable() that targets init_user_ns.
 * Return true if the current task has the superior capability @cap
 * currently available for use, false if not.
 *
 * PF_SUPERPRIV is set on the task when the capability is available, on the
 * assumption that it is about to be used.
 */
bool capable(int cap)
{
        struct user_namespace *target_ns = &init_user_ns;

        return ns_capable(target_ns, cap);
}
EXPORT_SYMBOL(capable);
#endif /* CONFIG_MULTIUSER */

/**
 * file_ns_capable - Determine if the file's opener had a capability in effect
 * @file:  The file we want to check
 * @ns:  The usernamespace we want the capability in
 * @cap: The capability to be tested for
 *
 * The check is made against the credentials stored in @file->f_cred.
 * Return true if they grant @cap in @ns, false otherwise (including when
 * @cap is not a valid capability number, which also triggers a one-time
 * warning).
 *
 * This does not set PF_SUPERPRIV because the caller may not
 * actually be privileged.
 */
bool file_ns_capable(const struct file *file, struct user_namespace *ns,
                     int cap)
{
        if (WARN_ON_ONCE(!cap_valid(cap)))
                return false;

        /* security_capable() reports "granted" as zero */
        return security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0;
}
EXPORT_SYMBOL(file_ns_capable);

/**
 * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode?
 * @ns: The user namespace in question
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 *
 * Return true if the inode uid and gid are within the namespace.
 */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode)
{
        return vfsuid_has_mapping(ns, i_uid_into_vfsuid(idmap, inode)) &&
               vfsgid_has_mapping(ns, i_gid_into_vfsgid(idmap, inode));
}

/**
 * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped
 * @idmap: idmap of the mount @inode was found from
 * @inode: The inode in question
 * @cap: The capability in question
 *
 * Return true if the current task has the given capability targeted at
 * its own user namespace and that the given inode's uid and gid are
 * mapped into the current user namespace.
 */
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap)
{
        struct user_namespace *ns = current_user_ns();

        return ns_capable(ns, cap) &&
               privileged_wrt_inode_uidgid(ns, idmap, inode);
}
EXPORT_SYMBOL(capable_wrt_inode_uidgid);

/**
 * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace
 * @tsk: The task that may be ptraced
 * @ns: The user namespace to search for CAP_SYS_PTRACE in
 *
 * Return true if the credentials recorded in @tsk->ptracer_cred grant
 * CAP_SYS_PTRACE in @ns.  When no tracer credentials are recorded the
 * result is also true.  The check is unaudited.
 */
bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns)
{
        const struct cred *tracer_cred;
        int denied = 0;  /* An absent tracer adds no restrictions */

        rcu_read_lock();
        tracer_cred = rcu_dereference(tsk->ptracer_cred);
        if (tracer_cred)
                denied = security_capable(tracer_cred, ns, CAP_SYS_PTRACE,
                                          CAP_OPT_NOAUDIT);
        rcu_read_unlock();

        return !denied;
}













































  369 










  370 












  370 
















 1118 











 1115 


  370 











  369 




  276 










  370 


























































































































































































































































































































































 1006 



 1005 





 1006 






































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
// SPDX-License-Identifier: GPL-2.0-only
#include "cgroup-internal.h"

#include <linux/sched/cputime.h>

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/btf_ids.h>

#include <trace/events/cgroup.h>

/*
 * Global rstat lock, taken and released (with IRQs toggled) through the
 * __cgroup_rstat_lock() / __cgroup_rstat_unlock() helpers in this file.
 */
static DEFINE_SPINLOCK(cgroup_rstat_lock);
/*
 * One raw spinlock per CPU, held while that CPU's updated_children /
 * updated_next links are walked or modified.
 */
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);

/* Forward declaration: defined further down, called from cgroup_rstat_flush() */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);

/* Return @cgrp's per-CPU rstat bookkeeping structure for @cpu */
static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *pcpu = per_cpu_ptr(cgrp->rstat_cpu, cpu);

        return pcpu;
}

/*
 * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock).
 *
 * This makes it easier to diagnose locking issues and contention in
 * production environments. The parameter @fast_path determines the
 * tracepoints being added, allowing us to diagnose "flush" related
 * operations without handling high-frequency fast-path "update" events.
 *
 * _cgroup_rstat_cpu_lock() acquires @cpu_lock with interrupts disabled and
 * returns the saved interrupt flags, which the caller must hand back to
 * _cgroup_rstat_cpu_unlock().  @cpu and @cgrp are only passed on to the
 * tracepoints.
 */
static __always_inline
unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu,
                                     struct cgroup *cgrp, const bool fast_path)
{
        unsigned long flags;
        bool contended;

        /*
         * The _irqsave() is needed because cgroup_rstat_lock is
         * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
         * this lock with the _irq() suffix only disables interrupts on
         * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
         * interrupts on both configurations. The _irqsave() ensures
         * that interrupts are always disabled and later restored.
         */
        /* Try without spinning first so that contention can be traced */
        contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
        if (contended) {
                if (fast_path)
                        trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
                else
                        trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);

                /* The trylock failed: fall back to a blocking acquisition */
                raw_spin_lock_irqsave(cpu_lock, flags);
        }

        /* Lock is held here; report whether it had been contended */
        if (fast_path)
                trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
        else
                trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);

        return flags;
}

/*
 * Counterpart of _cgroup_rstat_cpu_lock(): emit the unlock tracepoint that
 * matches @fast_path, then release @cpu_lock and restore the interrupt
 * state saved in @flags.
 */
static __always_inline
void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu,
                              struct cgroup *cgrp, unsigned long flags,
                              const bool fast_path)
{
        if (!fast_path)
                trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
        else
                trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);

        raw_spin_unlock_irqrestore(cpu_lock, flags);
}

/**
 * cgroup_rstat_updated - keep track of updated rstat_cpu
 * @cgrp: target cgroup
 * @cpu: cpu on which rstat_cpu was updated
 *
 * @cgrp's rstat_cpu on @cpu was updated.  Put it on the parent's matching
 * rstat_cpu->updated_children list.  See the comment on top of
 * cgroup_rstat_cpu definition for details.
 *
 * The per-CPU lock for @cpu is taken only when the lockless check below
 * does not already find @cgrp on a list.
 */
__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        unsigned long flags;

        /*
         * Speculative already-on-list test. This may race leading to
         * temporary inaccuracies, which is fine.
         *
         * Because @parent's updated_children is terminated with @parent
         * instead of NULL, we can tell whether @cgrp is on the list by
         * testing the next pointer for NULL.
         */
        if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
                return;

        flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true);

        /* put @cgrp and all ancestors on the corresponding updated lists */
        while (true) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
                struct cgroup *parent = cgroup_parent(cgrp);
                struct cgroup_rstat_cpu *prstatc;

                /*
                 * Both additions and removals are bottom-up.  If a cgroup
                 * is already in the tree, all ancestors are.
                 */
                if (rstatc->updated_next)
                        break;

                /* Root has no parent to link it to, but mark it busy */
                if (!parent) {
                        rstatc->updated_next = cgrp;
                        break;
                }

                /* Insert @cgrp at the head of the parent's updated_children list */
                prstatc = cgroup_rstat_cpu(parent, cpu);
                rstatc->updated_next = prstatc->updated_children;
                prstatc->updated_children = cgrp;

                /* Continue one level up */
                cgrp = parent;
        }

        /*
         * NOTE: the loop above may have advanced @cgrp to an ancestor, so
         * the unlock tracepoint can report a different cgroup than the one
         * passed to the lock tracepoint.
         */
        _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true);
}

/**
 * cgroup_rstat_push_children - push children cgroups into the given list
 * @head: current head of the list (= subtree root)
 * @child: first child of the root
 * @cpu: target cpu
 * Return: A new singly linked list of cgroups to be flushed
 *
 * Iteratively traverse down the cgroup_rstat_cpu updated tree level by
 * level and push all the parents first before their next level children
 * into a singly linked list built from the tail backward like "pushing"
 * cgroups into a stack. The root is pushed by the caller.
 *
 * Every cgroup pushed here has its updated_next cleared and, if it had
 * updated children, its updated_children reset to point at itself, i.e.
 * it is taken off the updated tree for @cpu.
 */
static struct cgroup *cgroup_rstat_push_children(struct cgroup *head,
                                                 struct cgroup *child, int cpu)
{
        struct cgroup *chead = child;        /* Head of child cgroup level */
        struct cgroup *ghead = NULL;        /* Head of grandchild cgroup level */
        struct cgroup *parent, *grandchild;
        struct cgroup_rstat_cpu *crstatc;

        /* rstat_flush_next doubles as the link of the per-level work lists */
        child->rstat_flush_next = NULL;

next_level:
        /* Each chead entry is the first cgroup of one sibling (updated_next) chain */
        while (chead) {
                child = chead;
                chead = child->rstat_flush_next;
                parent = cgroup_parent(child);

                /* updated_next is parent cgroup terminated */
                while (child != parent) {
                        /* Push this sibling onto the front of the result list */
                        child->rstat_flush_next = head;
                        head = child;
                        crstatc = cgroup_rstat_cpu(child, cpu);
                        grandchild = crstatc->updated_children;
                        if (grandchild != child) {
                                /* Push the grand child to the next level */
                                crstatc->updated_children = child;
                                grandchild->rstat_flush_next = ghead;
                                ghead = grandchild;
                        }
                        /* Advance to the next sibling and unlink this one */
                        child = crstatc->updated_next;
                        crstatc->updated_next = NULL;
                }
        }

        /* Current level exhausted: descend to the collected grandchildren */
        if (ghead) {
                chead = ghead;
                ghead = NULL;
                goto next_level;
        }
        return head;
}

/**
 * cgroup_rstat_updated_list - return a list of updated cgroups to be flushed
 * @root: root of the cgroup subtree to traverse
 * @cpu: target cpu
 * Return: A singly linked list of cgroups to be flushed
 *
 * Walks the updated rstat_cpu tree on @cpu from @root.  During traversal,
 * each returned cgroup is unlinked from the updated tree.
 *
 * The only ordering guarantee is that, for a parent and a child pair
 * covered by a given traversal, the child is before its parent in
 * the list.
 *
 * Note that updated_children is self terminated and points to a list of
 * child cgroups if not empty. Whereas updated_next is like a sibling link
 * within the children list and terminated by the parent cgroup. An exception
 * here is the cgroup root whose updated_next can be self terminated.
 *
 * The returned list is linked through ->rstat_flush_next; NULL is returned
 * when @root is not on the updated tree for @cpu.  The per-CPU lock for
 * @cpu is held for the duration of the walk.
 */
static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu)
{
        raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(root, cpu);
        struct cgroup *head = NULL, *parent, *child;
        unsigned long flags;

        flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false);

        /* Return NULL if this subtree is not on-list */
        if (!rstatc->updated_next)
                goto unlock_ret;

        /*
         * Unlink @root from its parent. As the updated_children list is
         * singly linked, we have to walk it to find the removal point.
         */
        parent = cgroup_parent(root);
        if (parent) {
                struct cgroup_rstat_cpu *prstatc;
                struct cgroup **nextp;

                prstatc = cgroup_rstat_cpu(parent, cpu);
                nextp = &prstatc->updated_children;
                while (*nextp != root) {
                        struct cgroup_rstat_cpu *nrstatc;

                        nrstatc = cgroup_rstat_cpu(*nextp, cpu);
                        /* Reaching the terminator means @root was not on the list */
                        WARN_ON_ONCE(*nextp == parent);
                        nextp = &nrstatc->updated_next;
                }
                /* Splice @root out by pointing its predecessor at its successor */
                *nextp = rstatc->updated_next;
        }

        rstatc->updated_next = NULL;

        /* Push @root to the list first before pushing the children */
        head = root;
        root->rstat_flush_next = NULL;
        child = rstatc->updated_children;
        /* Reset @root's children list to its empty (self-terminated) state */
        rstatc->updated_children = root;
        if (child != root)
                head = cgroup_rstat_push_children(head, child, cpu);
unlock_ret:
        _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false);
        return head;
}

/*
 * A hook for bpf stat collectors to attach to and flush their stats.
 * Together with providing bpf kfuncs for cgroup_rstat_updated() and
 * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
 * collect cgroup stats can integrate with rstat for efficient flushing.
 *
 * A static noinline declaration here could cause the compiler to optimize away
 * the function. A global noinline declaration will keep the definition, but may
 * optimize away the callsite. Therefore, __weak is needed to ensure that the
 * call is still emitted, by telling the compiler that we don't know what the
 * function might eventually be.
 *
 * cgroup_rstat_flush() calls this once per flushed cgroup and CPU, passing
 * the cgroup, its cgroup_parent() and the CPU number.  The default body is
 * intentionally empty.
 */

__bpf_hook_start();

__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
                                     struct cgroup *parent, int cpu)
{
}

__bpf_hook_end();

/*
 * Helper functions for locking cgroup_rstat_lock.
 *
 * These make it easier to diagnose locking issues and contention in
 * production environments.  The parameter @cpu_in_loop indicates that the
 * lock was released and re-taken while collecting data from the CPUs.  The
 * value -1 is used when obtaining the main lock; otherwise this is the CPU
 * number processed last.
 */
static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop)
        __acquires(&cgroup_rstat_lock)
{
        /* Try without spinning first so that contention can be traced */
        bool contended = !spin_trylock_irq(&cgroup_rstat_lock);

        if (contended) {
                trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended);
                spin_lock_irq(&cgroup_rstat_lock);
        }

        trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended);
}

/*
 * Counterpart of __cgroup_rstat_lock(): emit the unlock tracepoint for
 * @cgrp / @cpu_in_loop, then drop cgroup_rstat_lock and re-enable IRQs.
 */
static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop)
        __releases(&cgroup_rstat_lock)
{
        /* Traced while the lock is still held */
        trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false);
        spin_unlock_irq(&cgroup_rstat_lock);
}

/**
 * cgroup_rstat_flush - flush stats in @cgrp's subtree
 * @cgrp: target cgroup
 *
 * Collect all per-cpu stats in @cgrp's subtree into the global counters
 * and propagate them upwards.  After this function returns, all cgroups in
 * the subtree have up-to-date ->stat.
 *
 * This also gets all cgroups in the subtree including @cgrp off the
 * ->updated_children lists.
 *
 * This function may block.
 */
__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp)
{
        int cpu;

        might_sleep();
        for_each_possible_cpu(cpu) {
                struct cgroup_subsys_state *css;
                struct cgroup *cur;

                /* Reacquire for each CPU to avoid disabling IRQs too long */
                __cgroup_rstat_lock(cgrp, cpu);

                /* Detach this CPU's updated cgroups and flush them one by one */
                cur = cgroup_rstat_updated_list(cgrp, cpu);
                while (cur) {
                        cgroup_base_stat_flush(cur, cpu);
                        bpf_rstat_flush(cur, cgroup_parent(cur), cpu);

                        /* Let every css registered on this cgroup flush too */
                        rcu_read_lock();
                        list_for_each_entry_rcu(css, &cur->rstat_css_list,
                                                rstat_css_node)
                                css->ss->css_rstat_flush(css, cpu);
                        rcu_read_unlock();

                        cur = cur->rstat_flush_next;
                }

                __cgroup_rstat_unlock(cgrp, cpu);

                /* Between CPUs: reschedule if needed, otherwise just relax */
                if (!cond_resched())
                        cpu_relax();
        }
}

/*
 * Set up @cgrp's per-CPU rstat state: allocate it unless it already exists
 * and initialise every possible CPU's entry.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU allocation fails.
 */
int cgroup_rstat_init(struct cgroup *cgrp)
{
        int cpu;

        /* the root cgrp has rstat_cpu preallocated */
        if (!cgrp->rstat_cpu) {
                cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu);
                if (!cgrp->rstat_cpu)
                        return -ENOMEM;
        }

        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *pcpu = cgroup_rstat_cpu(cgrp, cpu);

                /* ->updated_children list is self terminated */
                pcpu->updated_children = cgrp;
                u64_stats_init(&pcpu->bsync);
        }

        return 0;
}

/*
 * Tear down @cgrp's per-CPU rstat state.  The cgroup is flushed first; if
 * any CPU still shows it linked into an updated list afterwards, a one-time
 * warning is issued and the per-CPU memory is deliberately left allocated.
 */
void cgroup_rstat_exit(struct cgroup *cgrp)
{
        int cpu;

        cgroup_rstat_flush(cgrp);

        /* sanity check */
        for_each_possible_cpu(cpu) {
                struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);

                /* After the flush the children list must be empty again ... */
                if (WARN_ON_ONCE(rstatc->updated_children != cgrp))
                        return;
                /* ... and @cgrp itself must be off its parent's list */
                if (WARN_ON_ONCE(rstatc->updated_next))
                        return;
        }

        free_percpu(cgrp->rstat_cpu);
        cgrp->rstat_cpu = NULL;
}

/*
 * Boot-time setup: initialise the per-CPU cgroup_rstat_cpu_lock for every
 * possible CPU.
 */
void __init cgroup_rstat_boot(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
}

/*
 * Functions for cgroup basic resource statistics implemented on top of
 * rstat.
 */
static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime += src_bstat->cputime.utime;
        dst_bstat->cputime.stime += src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
#endif
        dst_bstat->ntime += src_bstat->ntime;
}

static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
                                 struct cgroup_base_stat *src_bstat)
{
        dst_bstat->cputime.utime -= src_bstat->cputime.utime;
        dst_bstat->cputime.stime -= src_bstat->cputime.stime;
        dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
#ifdef CONFIG_SCHED_CORE
        dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
#endif
        dst_bstat->ntime -= src_bstat->ntime;
}

/*
 * Fold @cgrp's base statistics collected on @cpu into @cgrp's own totals
 * and, unless the parent is the root cgroup, pass the not-yet-propagated
 * part on to the parent.  Each step works on a delta against a "last_*"
 * snapshot, which is then advanced by that same delta.
 *
 * Does nothing for the root cgroup.
 */
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
        struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
        struct cgroup *parent = cgroup_parent(cgrp);
        struct cgroup_rstat_cpu *prstatc;
        struct cgroup_base_stat delta;
        unsigned seq;

        /* Root-level stats are sourced from system-wide CPU stats */
        if (!parent)
                return;

        /* fetch the current per-cpu values */
        do {
                seq = __u64_stats_fetch_begin(&rstatc->bsync);
                delta = rstatc->bstat;
        } while (__u64_stats_fetch_retry(&rstatc->bsync, seq));

        /* propagate per-cpu delta to cgroup and per-cpu global statistics */
        cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
        cgroup_base_stat_add(&cgrp->bstat, &delta);
        cgroup_base_stat_add(&rstatc->last_bstat, &delta);
        cgroup_base_stat_add(&rstatc->subtree_bstat, &delta);

        /* propagate cgroup and per-cpu global delta to parent (unless that's root) */
        if (cgroup_parent(parent)) {
                /* cgroup-wide totals: what has accumulated since the last push */
                delta = cgrp->bstat;
                cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
                cgroup_base_stat_add(&parent->bstat, &delta);
                cgroup_base_stat_add(&cgrp->last_bstat, &delta);

                /* same scheme for the per-cpu subtree totals */
                delta = rstatc->subtree_bstat;
                prstatc = cgroup_rstat_cpu(parent, cpu);
                cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat);
                cgroup_base_stat_add(&prstatc->subtree_bstat, &delta);
                cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta);
        }
}

/*
 * Open an update section on the current CPU's rstat entry of @cgrp.
 * Obtains the entry with get_cpu_ptr(), starts a u64_stats update with the
 * previous interrupt state stored in *@flags, and returns the entry.  Must
 * be paired with cgroup_base_stat_cputime_account_end().
 */
static struct cgroup_rstat_cpu *
cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
        struct cgroup_rstat_cpu *pcpu = get_cpu_ptr(cgrp->rstat_cpu);

        *flags = u64_stats_update_begin_irqsave(&pcpu->bsync);

        return pcpu;
}

/*
 * Close the update section opened by
 * cgroup_base_stat_cputime_account_begin(): end the u64_stats update and
 * restore the interrupt state from @flags, record @cgrp as updated on the
 * current CPU, then release the per-CPU pointer.
 */
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
                                                 struct cgroup_rstat_cpu *rstatc,
                                                 unsigned long flags)
{
        u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
        /* Called before put_cpu_ptr(), i.e. still inside the get_cpu_ptr() section */
        cgroup_rstat_updated(cgrp, smp_processor_id());
        put_cpu_ptr(rstatc);
}

/*
 * Add @delta_exec to the sum_exec_runtime counter of @cgrp's rstat entry
 * for the current CPU.
 */
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
        unsigned long irq_flags;
        struct cgroup_rstat_cpu *pcpu;

        pcpu = cgroup_base_stat_cputime_account_begin(cgrp, &irq_flags);
        pcpu->bstat.cputime.sum_exec_runtime += delta_exec;
        cgroup_base_stat_cputime_account_end(cgrp, pcpu, irq_flags);
}

/*
 * Charge @delta_exec to the counter(s) of @cgrp's current-CPU rstat entry
 * selected by @index:
 *   CPUTIME_NICE                 -> ntime and utime
 *   CPUTIME_USER                 -> utime
 *   CPUTIME_SYSTEM/IRQ/SOFTIRQ   -> stime
 *   CPUTIME_FORCEIDLE            -> forceidle_sum (CONFIG_SCHED_CORE only)
 * Any other index leaves the counters untouched.
 */
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec)
{
        unsigned long irq_flags;
        struct cgroup_rstat_cpu *pcpu;

        pcpu = cgroup_base_stat_cputime_account_begin(cgrp, &irq_flags);

        switch (index) {
        case CPUTIME_NICE:
                /* Nice time is tracked separately and also counts as user time */
                pcpu->bstat.ntime += delta_exec;
                pcpu->bstat.cputime.utime += delta_exec;
                break;
        case CPUTIME_USER:
                pcpu->bstat.cputime.utime += delta_exec;
                break;
        case CPUTIME_SYSTEM:
        case CPUTIME_IRQ:
        case CPUTIME_SOFTIRQ:
                pcpu->bstat.cputime.stime += delta_exec;
                break;
#ifdef CONFIG_SCHED_CORE
        case CPUTIME_FORCEIDLE:
                pcpu->bstat.forceidle_sum += delta_exec;
                break;
#endif
        default:
                break;
        }

        cgroup_base_stat_cputime_account_end(cgrp, pcpu, irq_flags);
}

/*
 * Compute the cputime for the root cgroup from the per-CPU kernel cpustat
 * data, categorizing the fields in a manner consistent with how
 * __cgroup_account_cputime_field() attributes each bit of cpu time to a
 * cgroup.  @bstat is zeroed first and then filled with the sums over all
 * possible CPUs.
 */
static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
        struct task_cputime *total = &bstat->cputime;
        int cpu;

        memset(bstat, 0, sizeof(*bstat));
        for_each_possible_cpu(cpu) {
                struct kernel_cpustat snap;
                u64 *stat = snap.cpustat;
                u64 user, sys;

                kcpustat_cpu_fetch(&snap, cpu);

                /* user time includes nice time; system time includes irq time */
                user = stat[CPUTIME_USER] + stat[CPUTIME_NICE];
                sys = stat[CPUTIME_SYSTEM] + stat[CPUTIME_IRQ] +
                      stat[CPUTIME_SOFTIRQ];

                total->utime += user;
                total->stime += sys;
                total->sum_exec_runtime += user + sys;

#ifdef CONFIG_SCHED_CORE
                bstat->forceidle_sum += stat[CPUTIME_FORCEIDLE];
#endif
                bstat->ntime += stat[CPUTIME_NICE];
        }
}


/*
 * Print the force-idle time from @bstat to @seq, converted from
 * nanoseconds to microseconds.  Emits nothing unless CONFIG_SCHED_CORE is
 * enabled.
 */
static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
{
#ifdef CONFIG_SCHED_CORE
        u64 usec = bstat->forceidle_sum;

        /* do_div() divides its first argument in place */
        do_div(usec, NSEC_PER_USEC);
        seq_printf(seq, "core_sched.force_idle_usec %llu\n", usec);
#endif
}

void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
        struct cgroup *cgrp = seq_css(seq)->cgroup;
        struct cgroup_base_stat bstat;

        if (cgroup_parent(cgrp)) {
                cgroup_rstat_flush(cgrp);
                __cgroup_rstat_lock(cgrp, -1);
                bstat = cgrp->bstat;
                cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                               &bstat.cputime.utime, &bstat.cputime.stime);
                __cgroup_rstat_unlock(cgrp, -1);
        } else {
                root_cgroup_cputime(&bstat);
        }

        do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC);
        do_div(bstat.cputime.utime, NSEC_PER_USEC);
        do_div(bstat.cputime.stime, NSEC_PER_USEC);
        do_div(bstat.ntime, NSEC_PER_USEC);

        seq_printf(seq, "usage_usec %llu\n"
                        "user_usec %llu\n"
                        "system_usec %llu\n"
                        "nice_usec %llu\n",
                        bstat.cputime.sum_exec_runtime,
                        bstat.cputime.utime,
                        bstat.cputime.stime,
                        bstat.ntime);

        cgroup_force_idle_show(seq, &bstat);
}

/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
BTF_KFUNCS_START(bpf_rstat_kfunc_ids)
BTF_ID_FLAGS(func, cgroup_rstat_updated)
BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
BTF_KFUNCS_END(bpf_rstat_kfunc_ids)

/* kfunc ID set passed to register_btf_kfunc_id_set() by bpf_rstat_kfunc_init(). */
static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
        .set   = &bpf_rstat_kfunc_ids,
        .owner = THIS_MODULE,
};

/*
 * Register bpf_rstat_kfunc_set for BPF_PROG_TYPE_TRACING programs.
 *
 * Hooked up through late_initcall(); the result of
 * register_btf_kfunc_id_set() is returned unchanged.
 */
static int __init bpf_rstat_kfunc_init(void)
{
        int ret;

        ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
                                        &bpf_rstat_kfunc_set);
        return ret;
}
late_initcall(bpf_rstat_kfunc_init);













































































































































































































































































































































































































































































































































































































































































































    7 
















































  672 





















  499 
































































































































































































































































































































































































  151 
  152 











































































  138 










  139 

  139 
















































































































































































































































































































































































































































































  499 
  499 



  499 





























  147 
  147 


  146 
















































































   35 














   35 



   35 


   35 













   35 

   35 









































   34 


   34 

















































































































































































































   34 


   34 












    2 


    2 
































































































































































































  445 


  445 

















    8 


    8 













    8 


    8 


















































































































































































































































































   26 
   26 












































































































































































































   76 
   78 












  671 
  672 



  672 












  171 
  171 









  188 


  189 



  189 


















 1208 
 1216 



























  260 

  101 





































  321 
  323 












  317 
  319 




















































































































  202 
  202 














  196 
  196 














    8 
    8 





















































































































































































































































































































































































  376 

  376 









































































































































































































































































































































































































































































































































































  562 
  563 

  442 




























































































































































































































































































































































































































































































































































































































































































































  265 
  265 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    8 


    7 




    8 













































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Security plug functions
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001-2002 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2016 Mellanox Technologies
 * Copyright (C) 2023 Microsoft Corporation <paul@paul-moore.com>
 */

#define pr_fmt(fmt) "LSM: " fmt

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/dcache.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/lsm_hooks.h>
#include <linux/mman.h>
#include <linux/mount.h>
#include <linux/personality.h>
#include <linux/backing-dev.h>
#include <linux/string.h>
#include <linux/xattr.h>
#include <linux/msg.h>
#include <linux/overflow.h>
#include <linux/perf_event.h>
#include <linux/fs.h>
#include <net/flow.h>
#include <net/sock.h>

/*
 * Identifier for the static key that guards one LSM static call slot.
 * HOOK is an LSM hook as defined in linux/lsm_hook_defs.h
 * IDX is the index of the slot. 0 <= IDX < MAX_LSM_COUNT
 */
#define SECURITY_HOOK_ACTIVE_KEY(HOOK, IDX) security_hook_active_##HOOK##_##IDX

/*
 * Identifier for the LSM static calls.
 * HOOK is an LSM hook as defined in linux/lsm_hook_defs.h
 * IDX is the index of the static call. 0 <= IDX < MAX_LSM_COUNT
 */
#define LSM_STATIC_CALL(HOOK, IDX) lsm_static_call_##HOOK##_##IDX

/*
 * Call the macro M for each LSM hook MAX_LSM_COUNT times.
 * The expansion is wrapped in do { } while (0) so that it can be used
 * wherever a single statement is expected.
 */
#define LSM_LOOP_UNROLL(M, ...)                 \
do {                                                \
        UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)        \
} while (0)

/*
 * Same expansion as LSM_LOOP_UNROLL() but without the do { } while (0)
 * wrapper; this file uses it at file scope and inside initializers.
 */
#define LSM_DEFINE_UNROLL(M, ...) UNROLL(MAX_LSM_COUNT, M, __VA_ARGS__)

/*
 * These are descriptions of the reasons that can be passed to the
 * security_locked_down() LSM hook. Placing this array here allows
 * all security modules to use the same descriptions for auditing
 * purposes.
 *
 * The table is indexed by the LOCKDOWN_* reason constants and sized so
 * that LOCKDOWN_CONFIDENTIALITY_MAX is the last valid index.
 */
const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX + 1] = {
        [LOCKDOWN_NONE] = "none",
        [LOCKDOWN_MODULE_SIGNATURE] = "unsigned module loading",
        [LOCKDOWN_DEV_MEM] = "/dev/mem,kmem,port",
        [LOCKDOWN_EFI_TEST] = "/dev/efi_test access",
        [LOCKDOWN_KEXEC] = "kexec of unsigned images",
        [LOCKDOWN_HIBERNATION] = "hibernation",
        [LOCKDOWN_PCI_ACCESS] = "direct PCI access",
        [LOCKDOWN_IOPORT] = "raw io port access",
        [LOCKDOWN_MSR] = "raw MSR access",
        [LOCKDOWN_ACPI_TABLES] = "modifying ACPI tables",
        [LOCKDOWN_DEVICE_TREE] = "modifying device tree contents",
        [LOCKDOWN_PCMCIA_CIS] = "direct PCMCIA CIS storage",
        [LOCKDOWN_TIOCSSERIAL] = "reconfiguration of serial port IO",
        [LOCKDOWN_MODULE_PARAMETERS] = "unsafe module parameters",
        [LOCKDOWN_MMIOTRACE] = "unsafe mmio",
        [LOCKDOWN_DEBUGFS] = "debugfs access",
        [LOCKDOWN_XMON_WR] = "xmon write access",
        [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM",
        [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM",
        [LOCKDOWN_RTAS_ERROR_INJECTION] = "RTAS error injection",
        [LOCKDOWN_INTEGRITY_MAX] = "integrity",
        [LOCKDOWN_KCORE] = "/proc/kcore access",
        [LOCKDOWN_KPROBES] = "use of kprobes",
        [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM",
        [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM",
        [LOCKDOWN_PERF] = "unsafe use of perf",
        [LOCKDOWN_TRACEFS] = "use of tracefs",
        [LOCKDOWN_XMON_RW] = "xmon read and write access",
        [LOCKDOWN_XFRM_SECRET] = "xfrm SA secret",
        [LOCKDOWN_CONFIDENTIALITY_MAX] = "confidentiality",
};

/* Notifier chain driven by the *_blocking_lsm_notifier() helpers below. */
static BLOCKING_NOTIFIER_HEAD(blocking_lsm_notifier_chain);

/*
 * Slab caches for the composite file and inode blobs. Each is created in
 * ordered_lsm_init() only when the matching blob size is non-zero, and
 * stays NULL otherwise.
 */
static struct kmem_cache *lsm_file_cache;
static struct kmem_cache *lsm_inode_cache;

/* Comma-separated list of LSM names, built up by lsm_append(). */
char *lsm_names;
/* Running totals of the per-object blob sizes; see lsm_set_blob_sizes(). */
static struct lsm_blob_sizes blob_sizes __ro_after_init;

/* Boot-time LSM user choice */
static __initdata const char *chosen_lsm_order;        /* "lsm=" value */
static __initdata const char *chosen_major_lsm;        /* "security=" value */

/* Build-time default order, used when "lsm=" is not given. */
static __initconst const char *const builtin_lsm_order = CONFIG_LSM;

/*
 * Ordered list of LSMs to initialize. Walkers stop at the first NULL
 * entry; the extra slot keeps a terminator even when the list is full.
 */
static __initdata struct lsm_info *ordered_lsms[MAX_LSM_COUNT + 1];
/* First enabled LSM_FLAG_EXCLUSIVE LSM, recorded by prepare_lsm(). */
static __initdata struct lsm_info *exclusive;

/*
 * Address of the static call trampoline for slot NUM of hook NAME.
 * Without CONFIG_HAVE_STATIC_CALL this evaluates to NULL.
 */
#ifdef CONFIG_HAVE_STATIC_CALL
#define LSM_HOOK_TRAMP(NAME, NUM) \
        &STATIC_CALL_TRAMP(LSM_STATIC_CALL(NAME, NUM))
#else
#define LSM_HOOK_TRAMP(NAME, NUM) NULL
#endif

/*
 * Define static calls and static keys for each LSM hook.
 *
 * linux/lsm_hook_defs.h is included with LSM_HOOK() temporarily defined so
 * that every hook entry expands, MAX_LSM_COUNT times, into one NULL static
 * call plus one static key that starts out false. Both helper macros are
 * undefined again once the header has been expanded.
 */
#define DEFINE_LSM_STATIC_CALL(NUM, NAME, RET, ...)                        \
        DEFINE_STATIC_CALL_NULL(LSM_STATIC_CALL(NAME, NUM),                \
                                *((RET(*)(__VA_ARGS__))NULL));                \
        DEFINE_STATIC_KEY_FALSE(SECURITY_HOOK_ACTIVE_KEY(NAME, NUM));

#define LSM_HOOK(RET, DEFAULT, NAME, ...)                                \
        LSM_DEFINE_UNROLL(DEFINE_LSM_STATIC_CALL, NAME, RET, __VA_ARGS__)
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
#undef DEFINE_LSM_STATIC_CALL

/*
 * Initialise a table of static calls for each LSM hook.
 * DEFINE_STATIC_CALL_NULL invocation above generates a key (STATIC_CALL_KEY)
 * and a trampoline (STATIC_CALL_TRAMP) which are used to call
 * __static_call_update when updating the static call.
 *
 * The static calls table is used by early LSMs, some architectures can fault on
 * unaligned accesses and the fault handling code may not be ready by then.
 * Thus, the static calls table should be aligned to avoid any unhandled faults
 * in early init.
 *
 * Each hook gets one member named after the hook, holding MAX_LSM_COUNT
 * slots; every slot records its static call key, its trampoline and the
 * static key that marks the slot as active.
 */
struct lsm_static_calls_table
        static_calls_table __ro_after_init __aligned(sizeof(u64)) = {
/* One slot initializer: key, trampoline and "active" static key. */
#define INIT_LSM_STATIC_CALL(NUM, NAME)                                        \
        (struct lsm_static_call) {                                        \
                .key = &STATIC_CALL_KEY(LSM_STATIC_CALL(NAME, NUM)),        \
                .trampoline = LSM_HOOK_TRAMP(NAME, NUM),                \
                .active = &SECURITY_HOOK_ACTIVE_KEY(NAME, NUM),                \
        },
/* One table member per hook, filled with MAX_LSM_COUNT slot initializers. */
#define LSM_HOOK(RET, DEFAULT, NAME, ...)                                \
        .NAME = {                                                        \
                LSM_DEFINE_UNROLL(INIT_LSM_STATIC_CALL, NAME)                \
        },
#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK
#undef INIT_LSM_STATIC_CALL
        };

/* Set by the "lsm.debug" boot parameter; gates init_debug() output. */
static __initdata bool debug;
/* pr_info() wrapper that only prints when LSM init debugging is enabled. */
#define init_debug(...)                                                \
        do {                                                        \
                if (debug)                                        \
                        pr_info(__VA_ARGS__);                        \
        } while (0)

/*
 * Report whether an LSM is currently marked enabled.
 * An LSM without an enable variable attached counts as disabled.
 */
static bool __init is_enabled(struct lsm_info *lsm)
{
        return lsm->enabled && *lsm->enabled;
}

/*
 * Shared storage for LSMs that did not supply their own enable variable.
 * These are never written through; an LSM is switched between them.
 */
static int lsm_enabled_true __initdata = 1;
static int lsm_enabled_false __initdata = 0;

/*
 * Mark an LSM's enabled flag.
 *
 * An LSM that has no enable variable, or that already points at one of the
 * shared constants above, is (re)pointed at the constant matching @enabled.
 * An LSM with its own variable has @enabled stored into that variable.
 */
static void __init set_enabled(struct lsm_info *lsm, bool enabled)
{
        bool uses_shared_storage = !lsm->enabled ||
                                   lsm->enabled == &lsm_enabled_true ||
                                   lsm->enabled == &lsm_enabled_false;

        if (!uses_shared_storage) {
                *lsm->enabled = enabled;
                return;
        }

        lsm->enabled = enabled ? &lsm_enabled_true : &lsm_enabled_false;
}

/*
 * Is an LSM already listed in the ordered LSMs list?
 * The list is scanned up to its first NULL entry.
 */
static bool __init exists_ordered_lsm(struct lsm_info *lsm)
{
        int slot;

        for (slot = 0; ordered_lsms[slot]; slot++) {
                if (ordered_lsms[slot] == lsm)
                        return true;
        }

        return false;
}

/* Number of entries currently stored in ordered_lsms[]. */
static int last_lsm __initdata;

/*
 * Append an LSM to the list of ordered LSMs to initialize.
 * @lsm: the LSM to add
 * @from: label describing where the request came from (used in messages)
 *
 * Duplicates are ignored, and a full list triggers a warning instead of
 * adding the entry.
 */
static void __init append_ordered_lsm(struct lsm_info *lsm, const char *from)
{
        bool list_full;

        /* Already queued: nothing to do. */
        if (exists_ordered_lsm(lsm))
                return;

        list_full = (last_lsm == MAX_LSM_COUNT);
        if (WARN(list_full, "%s: out of LSM static calls!?\n", from))
                return;

        /* An LSM without an enable variable defaults to enabled. */
        if (!lsm->enabled)
                lsm->enabled = &lsm_enabled_true;

        ordered_lsms[last_lsm] = lsm;
        last_lsm++;

        init_debug("%s ordered: %s (%s)\n", from, lsm->name,
                   is_enabled(lsm) ? "enabled" : "disabled");
}

/*
 * Is an LSM allowed to be initialized?
 * A disabled LSM is never allowed. An exclusive LSM is refused once
 * another exclusive LSM has already been chosen.
 */
static bool __init lsm_allowed(struct lsm_info *lsm)
{
        if (!is_enabled(lsm))
                return false;

        /* Non-exclusive LSMs, and the first exclusive one, may proceed. */
        if (!(lsm->flags & LSM_FLAG_EXCLUSIVE) || !exclusive)
                return true;

        init_debug("exclusive disabled: %s\n", lsm->name);
        return false;
}

/*
 * Reserve room for one LSM inside a composite blob.
 * @need: in: bytes requested by the LSM; out: offset assigned to the LSM
 * @lbs: running total size of the composite blob, grown by the request
 *
 * The assigned offset is the current total rounded up to pointer size.
 * Requests that are zero or negative leave both values untouched.
 */
static void __init lsm_set_blob_size(int *need, int *lbs)
{
        int start;

        if (*need > 0) {
                start = ALIGN(*lbs, sizeof(void *));
                *lbs = start + *need;
                *need = start;
        }
}

/*
 * Fold one LSM's blob requirements into the global blob_sizes totals.
 * @needed: the LSM's requested sizes, rewritten to its assigned offsets;
 *          may be NULL when the LSM needs no blobs
 */
static void __init lsm_set_blob_sizes(struct lsm_blob_sizes *needed)
{
        struct lsm_blob_sizes *total = &blob_sizes;

        if (!needed)
                return;

        lsm_set_blob_size(&needed->lbs_cred, &total->lbs_cred);
        lsm_set_blob_size(&needed->lbs_file, &total->lbs_file);
        lsm_set_blob_size(&needed->lbs_ib, &total->lbs_ib);

        /*
         * The first LSM asking for inode space also causes room for an
         * rcu_head to be reserved at the front of the inode blob.
         */
        if (needed->lbs_inode && total->lbs_inode == 0)
                total->lbs_inode = sizeof(struct rcu_head);
        lsm_set_blob_size(&needed->lbs_inode, &total->lbs_inode);

        lsm_set_blob_size(&needed->lbs_ipc, &total->lbs_ipc);
        lsm_set_blob_size(&needed->lbs_key, &total->lbs_key);
        lsm_set_blob_size(&needed->lbs_msg_msg, &total->lbs_msg_msg);
        lsm_set_blob_size(&needed->lbs_perf_event, &total->lbs_perf_event);
        lsm_set_blob_size(&needed->lbs_sock, &total->lbs_sock);
        lsm_set_blob_size(&needed->lbs_superblock, &total->lbs_superblock);
        lsm_set_blob_size(&needed->lbs_task, &total->lbs_task);
        lsm_set_blob_size(&needed->lbs_tun_dev, &total->lbs_tun_dev);
        lsm_set_blob_size(&needed->lbs_xattr_count, &total->lbs_xattr_count);
        lsm_set_blob_size(&needed->lbs_bdev, &total->lbs_bdev);
}

/*
 * Prepare LSM for initialization.
 * Decides whether the LSM may run, records that decision in its enabled
 * flag and, when it may run, claims the exclusive slot if applicable and
 * reserves its blob space.
 */
static void __init prepare_lsm(struct lsm_info *lsm)
{
        bool allowed = lsm_allowed(lsm);

        /* Record enablement (to handle any following exclusive LSMs). */
        set_enabled(lsm, allowed);
        if (!allowed)
                return;

        /* The first enabled exclusive LSM wins the exclusive slot. */
        if (!exclusive && (lsm->flags & LSM_FLAG_EXCLUSIVE)) {
                exclusive = lsm;
                init_debug("exclusive chosen:   %s\n", lsm->name);
        }

        lsm_set_blob_sizes(lsm->blobs);
}

/*
 * Initialize a given LSM, if it is enabled.
 * A non-zero return from the LSM's init callback triggers a warning.
 */
static void __init initialize_lsm(struct lsm_info *lsm)
{
        int err;

        if (!is_enabled(lsm))
                return;

        init_debug("initializing %s\n", lsm->name);
        err = lsm->init();
        WARN(err, "%s failed to initialize: %d\n", lsm->name, err);
}

/*
 * Current index to use while initializing the lsm id list.
 * security_add_hooks() increments it each time a new lsm_id is recorded.
 */
u32 lsm_active_cnt __ro_after_init;
/* Identification entries recorded by security_add_hooks(). */
const struct lsm_id *lsm_idlist[MAX_LSM_COUNT];

/*
 * Populate ordered LSMs list from comma-separated LSM name list.
 * @order: comma-separated LSM names, in the requested order
 * @origin: label for where @order came from (used in debug messages)
 *
 * The resulting order is: all LSM_ORDER_FIRST LSMs, the LSM_ORDER_MUTABLE
 * LSMs named in @order, the "security=" LSM if it is not already listed,
 * and finally all LSM_ORDER_LAST LSMs. Every LSM that did not make it into
 * the list is disabled.
 */
static void __init ordered_lsm_parse(const char *order, const char *origin)
{
        struct lsm_info *lsm;
        char *order_copy, *cursor, *token;

        /* LSM_ORDER_FIRST is always first. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order != LSM_ORDER_FIRST)
                        continue;
                append_ordered_lsm(lsm, "  first");
        }

        /*
         * Process "security=", if given.
         *
         * To match the original "security=" behavior, this explicitly
         * does NOT fallback to another Legacy Major if the selected one
         * was separately disabled: disable all non-matching Legacy Major
         * LSMs.
         */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (!(lsm->flags & LSM_FLAG_LEGACY_MAJOR))
                                continue;
                        if (strcmp(lsm->name, chosen_major_lsm) == 0)
                                continue;

                        set_enabled(lsm, false);
                        init_debug("security=%s disabled: %s (only one legacy major LSM)\n",
                                   chosen_major_lsm, lsm->name);
                }
        }

        /*
         * strsep() modifies the string it walks, so work on a private copy.
         * NOTE(review): the kstrdup() result is not checked; with a NULL
         * copy strsep() presumably yields no tokens and the walk is skipped.
         */
        order_copy = kstrdup(order, GFP_KERNEL);
        cursor = order_copy;

        /* Walk the list, looking for matching LSMs. */
        for (token = strsep(&cursor, ","); token != NULL;
             token = strsep(&cursor, ",")) {
                bool found = false;

                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (strcmp(lsm->name, token) != 0)
                                continue;

                        /* Only mutable-order LSMs are placed by name. */
                        if (lsm->order == LSM_ORDER_MUTABLE)
                                append_ordered_lsm(lsm, origin);
                        found = true;
                }

                if (!found)
                        init_debug("%s ignored: %s (not built into kernel)\n",
                                   origin, token);
        }

        /* Process "security=", if given. */
        if (chosen_major_lsm) {
                for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                        if (!exists_ordered_lsm(lsm) &&
                            strcmp(lsm->name, chosen_major_lsm) == 0)
                                append_ordered_lsm(lsm, "security=");
                }
        }

        /* LSM_ORDER_LAST is always last. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (lsm->order != LSM_ORDER_LAST)
                        continue;
                append_ordered_lsm(lsm, "   last");
        }

        /* Disable all LSMs not in the ordered list. */
        for (lsm = __start_lsm_info; lsm < __end_lsm_info; lsm++) {
                if (!exists_ordered_lsm(lsm)) {
                        set_enabled(lsm, false);
                        init_debug("%s skipped: %s (not in requested order)\n",
                                   origin, lsm->name);
                }
        }

        kfree(order_copy);
}

/*
 * Bind one hook implementation to the first free static call slot.
 * @hl: the hook to install; hl->scalls points at the slots for its hook
 *
 * The chosen slot's static call is pointed at the hook function, the slot
 * remembers @hl, and its static key is enabled. Panics when all
 * MAX_LSM_COUNT slots are already taken.
 */
static void __init lsm_static_call_init(struct security_hook_list *hl)
{
        struct lsm_static_call *slot = hl->scalls;
        struct lsm_static_call *end = slot + MAX_LSM_COUNT;

        for (; slot < end; slot++) {
                /* Skip slots that already carry a hook. */
                if (slot->hl)
                        continue;

                __static_call_update(slot->key, slot->trampoline,
                                     hl->hook.lsm_func_addr);
                slot->hl = hl;
                static_branch_enable(slot->active);
                return;
        }

        panic("%s - Ran out of static slots.\n", __func__);
}

/* Forward declarations for helpers defined further down in this file. */
static void __init lsm_early_cred(struct cred *cred);
static void __init lsm_early_task(struct task_struct *task);

static int lsm_append(const char *new, char **result);

/*
 * Print a single "initializing lsm=" line naming every enabled LSM:
 * the early LSMs first, then the ordered LSMs, separated by commas.
 */
static void __init report_lsm_order(void)
{
        struct lsm_info *early;
        struct lsm_info **slot;
        const char *sep = "";        /* empty before the first name only */

        pr_info("initializing lsm=");

        for (early = __start_early_lsm_info;
             early < __end_early_lsm_info; early++) {
                if (!is_enabled(early))
                        continue;
                pr_cont("%s%s", sep, early->name);
                sep = ",";
        }

        for (slot = ordered_lsms; *slot; slot++) {
                if (!is_enabled(*slot))
                        continue;
                pr_cont("%s%s", sep, (*slot)->name);
                sep = ",";
        }

        pr_cont("\n");
}

/*
 * Build the ordered LSM list and bring every enabled LSM up.
 *
 * The order comes from "lsm=" when given (which also overrides
 * "security="), otherwise from the built-in default. Each listed LSM is
 * prepared, the blob caches and the initial cred/task blobs are set up,
 * and finally the LSMs are initialized in list order.
 */
static void __init ordered_lsm_init(void)
{
        const char *order = builtin_lsm_order;
        const char *origin = "builtin";
        int idx;

        if (chosen_lsm_order) {
                if (chosen_major_lsm) {
                        pr_warn("security=%s is ignored because it is superseded by lsm=%s\n",
                                chosen_major_lsm, chosen_lsm_order);
                        chosen_major_lsm = NULL;
                }
                order = chosen_lsm_order;
                origin = "cmdline";
        }
        ordered_lsm_parse(order, origin);

        for (idx = 0; ordered_lsms[idx]; idx++)
                prepare_lsm(ordered_lsms[idx]);

        report_lsm_order();

        init_debug("cred blob size       = %d\n", blob_sizes.lbs_cred);
        init_debug("file blob size       = %d\n", blob_sizes.lbs_file);
        init_debug("ib blob size         = %d\n", blob_sizes.lbs_ib);
        init_debug("inode blob size      = %d\n", blob_sizes.lbs_inode);
        init_debug("ipc blob size        = %d\n", blob_sizes.lbs_ipc);
#ifdef CONFIG_KEYS
        init_debug("key blob size        = %d\n", blob_sizes.lbs_key);
#endif /* CONFIG_KEYS */
        init_debug("msg_msg blob size    = %d\n", blob_sizes.lbs_msg_msg);
        init_debug("sock blob size       = %d\n", blob_sizes.lbs_sock);
        init_debug("superblock blob size = %d\n", blob_sizes.lbs_superblock);
        init_debug("perf event blob size = %d\n", blob_sizes.lbs_perf_event);
        init_debug("task blob size       = %d\n", blob_sizes.lbs_task);
        init_debug("tun device blob size = %d\n", blob_sizes.lbs_tun_dev);
        init_debug("xattr slots          = %d\n", blob_sizes.lbs_xattr_count);
        init_debug("bdev blob size       = %d\n", blob_sizes.lbs_bdev);

        /* Create the slab caches only for blobs that are actually needed. */
        if (blob_sizes.lbs_file) {
                lsm_file_cache = kmem_cache_create("lsm_file_cache",
                                                   blob_sizes.lbs_file, 0,
                                                   SLAB_PANIC, NULL);
        }
        if (blob_sizes.lbs_inode) {
                lsm_inode_cache = kmem_cache_create("lsm_inode_cache",
                                                    blob_sizes.lbs_inode, 0,
                                                    SLAB_PANIC, NULL);
        }

        /* The current task and its cred get their blobs before any init. */
        lsm_early_cred((struct cred *) current->cred);
        lsm_early_task(current);

        for (idx = 0; ordered_lsms[idx]; idx++)
                initialize_lsm(ordered_lsms[idx]);
}

/**
 * early_security_init - prepare and initialize the early LSMs
 *
 * Walks the early LSM table; each entry defaults to enabled when it has no
 * enable variable, and is then prepared and initialized.
 *
 * Returns 0.
 */
int __init early_security_init(void)
{
        struct lsm_info *early = __start_early_lsm_info;

        while (early < __end_early_lsm_info) {
                if (!early->enabled)
                        early->enabled = &lsm_enabled_true;

                prepare_lsm(early);
                initialize_lsm(early);
                early++;
        }

        return 0;
}

/**
 * security_init - initializes the security framework
 *
 * This should be called early in the kernel initialization sequence.
 *
 * Records the names of the enabled early LSMs in lsm_names, then prepares
 * and initializes the remaining LSMs in their configured order.
 *
 * Returns 0.
 */
int __init security_init(void)
{
        struct lsm_info *lsm;

        init_debug("legacy security=%s\n", chosen_major_lsm ? : " *unspecified*");
        init_debug("  CONFIG_LSM=%s\n", builtin_lsm_order);
        init_debug("boot arg lsm=%s\n", chosen_lsm_order ? : " *unspecified*");

        /*
         * Append the names of the early LSM modules now that kmalloc() is
         * available
         */
        for (lsm = __start_early_lsm_info; lsm < __end_early_lsm_info; lsm++) {
                init_debug("  early started: %s (%s)\n", lsm->name,
                           is_enabled(lsm) ? "enabled" : "disabled");
                /*
                 * Test the flag, not the pointer: early_security_init()
                 * leaves ->enabled non-NULL for every early LSM, including
                 * the disabled ones.
                 */
                if (is_enabled(lsm))
                        lsm_append(lsm->name, &lsm_names);
        }

        /* Load LSMs in specified order. */
        ordered_lsm_init();

        return 0;
}

/*
 * Save user chosen LSM
 *
 * Handler for the "security=" boot parameter: remembers the given string
 * in chosen_major_lsm for ordered_lsm_parse(). Always returns 1.
 */
static int __init choose_major_lsm(char *str)
{
        chosen_major_lsm = str;
        return 1;
}
__setup("security=", choose_major_lsm);

/*
 * Explicitly choose LSM initialization order.
 *
 * Handler for the "lsm=" boot parameter: remembers the given string in
 * chosen_lsm_order for ordered_lsm_init(). Always returns 1.
 */
static int __init choose_lsm_order(char *str)
{
        chosen_lsm_order = str;
        return 1;
}
__setup("lsm=", choose_lsm_order);

/*
 * Enable LSM order debugging.
 *
 * Handler for the "lsm.debug" boot parameter: turns on init_debug()
 * output. The parameter value in @str is not used. Always returns 1.
 */
static int __init enable_debug(char *str)
{
        debug = true;
        return 1;
}
__setup("lsm.debug", enable_debug);

/*
 * Check whether the final entry of a comma-separated list equals a name.
 * @list: comma-separated list of names
 * @lsm: name to compare against the last entry
 *
 * Returns true on an exact match of the last entry. A NULL argument
 * triggers a warning and yields false.
 */
static bool match_last_lsm(const char *list, const char *lsm)
{
        const char *comma;
        const char *tail;

        if (WARN_ON(!list || !lsm))
                return false;

        /* The last entry starts after the final comma, or at the start. */
        comma = strrchr(list, ',');
        tail = comma ? comma + 1 : list;

        return strcmp(tail, lsm) == 0;
}

/*
 * Append a name to a heap-allocated, comma-separated list.
 * @new: the name to add
 * @result: the list; *result may be NULL for an empty list and is replaced
 *          by a newly allocated string when the list grows
 *
 * A name equal to the current last entry is not added again.
 *
 * Returns 0 on success, or -ENOMEM if memory can't be allocated (the
 * existing list is left untouched in that case).
 */
static int lsm_append(const char *new, char **result)
{
        char *joined;

        /* Empty list: the new name becomes the whole list. */
        if (!*result) {
                *result = kstrdup(new, GFP_KERNEL);
                return *result ? 0 : -ENOMEM;
        }

        /* Check if it is the last registered name */
        if (match_last_lsm(*result, new))
                return 0;

        joined = kasprintf(GFP_KERNEL, "%s,%s", *result, new);
        if (!joined)
                return -ENOMEM;

        kfree(*result);
        *result = joined;
        return 0;
}

/**
 * security_add_hooks - Add a module's hooks to the hook lists.
 * @hooks: the hooks to add
 * @count: the number of hooks to add
 * @lsmid: the identification information for the security module
 *
 * Each LSM has to register its hooks with the infrastructure.
 *
 * Panics when more than MAX_LSM_COUNT distinct LSMs register, or when the
 * LSM name cannot be appended to lsm_names.
 */
void __init security_add_hooks(struct security_hook_list *hooks, int count,
                               const struct lsm_id *lsmid)
{
        bool repeat_call;
        int i;

        /*
         * A security module may call security_add_hooks() more
         * than once during initialization, and LSM initialization
         * is serialized. Landlock is one such case.
         * Look at the previous entry, if there is one, for duplication.
         */
        repeat_call = lsm_active_cnt != 0 &&
                      lsm_idlist[lsm_active_cnt - 1] == lsmid;
        if (!repeat_call) {
                if (lsm_active_cnt >= MAX_LSM_COUNT)
                        panic("%s Too many LSMs registered.\n", __func__);
                lsm_idlist[lsm_active_cnt] = lsmid;
                lsm_active_cnt++;
        }

        for (i = 0; i < count; i++) {
                struct security_hook_list *hl = &hooks[i];

                hl->lsmid = lsmid;
                lsm_static_call_init(hl);
        }

        /*
         * Don't try to append during early_security_init(), we'll come back
         * and fix this up afterwards.
         */
        if (!slab_is_available())
                return;

        if (lsm_append(lsmid->name, &lsm_names) < 0)
                panic("%s - Cannot get early memory.\n", __func__);
}

/**
 * call_blocking_lsm_notifier - run the blocking LSM notifier chain
 * @event: the LSM event being reported
 * @data: opaque pointer handed to each notifier
 *
 * Returns the value returned by blocking_notifier_call_chain().
 */
int call_blocking_lsm_notifier(enum lsm_event event, void *data)
{
        return blocking_notifier_call_chain(&blocking_lsm_notifier_chain,
                                            event, data);
}
EXPORT_SYMBOL(call_blocking_lsm_notifier);

/**
 * register_blocking_lsm_notifier - add a notifier to the LSM chain
 * @nb: the notifier block to add
 *
 * Returns the value returned by blocking_notifier_chain_register().
 */
int register_blocking_lsm_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&blocking_lsm_notifier_chain,
                                                nb);
}
EXPORT_SYMBOL(register_blocking_lsm_notifier);

/**
 * unregister_blocking_lsm_notifier - remove a notifier from the LSM chain
 * @nb: the notifier block to remove
 *
 * Returns the value returned by blocking_notifier_chain_unregister().
 */
int unregister_blocking_lsm_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&blocking_lsm_notifier_chain,
                                                  nb);
}
EXPORT_SYMBOL(unregister_blocking_lsm_notifier);

/**
 * lsm_blob_alloc - allocate a composite blob
 * @dest: the destination for the blob
 * @size: the size of the blob
 * @gfp: allocation type
 *
 * Allocate a zeroed blob for all the modules. A @size of zero stores NULL
 * in @dest and counts as success; a failed allocation also leaves NULL in
 * @dest.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_blob_alloc(void **dest, size_t size, gfp_t gfp)
{
        void *blob = size ? kzalloc(size, gfp) : NULL;

        *dest = blob;
        if (size && !blob)
                return -ENOMEM;

        return 0;
}

/**
 * lsm_cred_alloc - allocate a composite cred blob
 * @cred: the cred that needs a blob
 * @gfp: allocation type
 *
 * Allocate the cred blob for all the modules and store it in
 * @cred->security. When no module asked for cred space the pointer is
 * set to NULL and the call succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
{
        return lsm_blob_alloc(&cred->security, blob_sizes.lbs_cred, gfp);
}

/**
 * lsm_early_cred - during initialization allocate a composite cred blob
 * @cred: the cred that needs a blob
 *
 * Allocate the cred blob for all the modules. Panics if the allocation
 * fails.
 */
static void __init lsm_early_cred(struct cred *cred)
{
        if (lsm_cred_alloc(cred, GFP_KERNEL))
                panic("%s: Early cred alloc failed.\n", __func__);
}

/**
 * lsm_file_alloc - allocate a composite file blob
 * @file: the file that needs a blob
 *
 * Allocate the file blob for all the modules from lsm_file_cache. When
 * that cache does not exist, @file->f_security is set to NULL and the call
 * succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_file_alloc(struct file *file)
{
        if (lsm_file_cache) {
                file->f_security = kmem_cache_zalloc(lsm_file_cache,
                                                     GFP_KERNEL);
                return file->f_security ? 0 : -ENOMEM;
        }

        file->f_security = NULL;
        return 0;
}

/**
 * lsm_inode_alloc - allocate a composite inode blob
 * @inode: the inode that needs a blob
 * @gfp: allocation flags
 *
 * Allocate the inode blob for all the modules from lsm_inode_cache. When
 * that cache does not exist, @inode->i_security is set to NULL and the
 * call succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_inode_alloc(struct inode *inode, gfp_t gfp)
{
        if (lsm_inode_cache) {
                inode->i_security = kmem_cache_zalloc(lsm_inode_cache, gfp);
                return inode->i_security ? 0 : -ENOMEM;
        }

        inode->i_security = NULL;
        return 0;
}

/**
 * lsm_task_alloc - allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules with GFP_KERNEL and store it
 * in @task->security. When no module asked for task space the pointer is
 * set to NULL and the call succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_task_alloc(struct task_struct *task)
{
        return lsm_blob_alloc(&task->security, blob_sizes.lbs_task, GFP_KERNEL);
}

/**
 * lsm_ipc_alloc - allocate a composite ipc blob
 * @kip: the ipc that needs a blob
 *
 * Allocate the ipc blob for all the modules with GFP_KERNEL and store it
 * in @kip->security. When no module asked for ipc space the pointer is
 * set to NULL and the call succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_ipc_alloc(struct kern_ipc_perm *kip)
{
        return lsm_blob_alloc(&kip->security, blob_sizes.lbs_ipc, GFP_KERNEL);
}

#ifdef CONFIG_KEYS
/**
 * lsm_key_alloc - allocate a composite key blob
 * @key: the key that needs a blob
 *
 * Allocate the key blob for all the modules with GFP_KERNEL and store it
 * in @key->security. When no module asked for key space the pointer is
 * set to NULL and the call succeeds. Only built with CONFIG_KEYS.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_key_alloc(struct key *key)
{
        return lsm_blob_alloc(&key->security, blob_sizes.lbs_key, GFP_KERNEL);
}
#endif /* CONFIG_KEYS */

/**
 * lsm_msg_msg_alloc - allocate a composite msg_msg blob
 * @mp: the msg_msg that needs a blob
 *
 * Allocate the msg_msg blob for all the modules with GFP_KERNEL and store
 * it in @mp->security. When no module asked for msg_msg space the pointer
 * is set to NULL and the call succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_msg_msg_alloc(struct msg_msg *mp)
{
        return lsm_blob_alloc(&mp->security, blob_sizes.lbs_msg_msg,
                              GFP_KERNEL);
}

/**
 * lsm_bdev_alloc - allocate a composite block_device blob
 * @bdev: the block_device that needs a blob
 *
 * Allocate the zeroed block_device blob for all the modules. When no
 * module asked for block_device space, @bdev->bd_security is set to NULL
 * and the call succeeds.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_bdev_alloc(struct block_device *bdev)
{
        if (blob_sizes.lbs_bdev != 0) {
                bdev->bd_security = kzalloc(blob_sizes.lbs_bdev, GFP_KERNEL);
                return bdev->bd_security ? 0 : -ENOMEM;
        }

        bdev->bd_security = NULL;
        return 0;
}

/**
 * lsm_early_task - during initialization allocate a composite task blob
 * @task: the task that needs a blob
 *
 * Allocate the task blob for all the modules via lsm_task_alloc().  A failed
 * allocation is fatal at this stage: the function panics instead of
 * returning an error.
 */
static void __init lsm_early_task(struct task_struct *task)
{
        if (lsm_task_alloc(task))
                panic("%s: Early task alloc failed.\n", __func__);
}

/**
 * lsm_superblock_alloc - allocate a composite superblock blob
 * @sb: the superblock that needs a blob
 *
 * Allocate the superblock blob shared by all the modules and attach it to
 * @sb->s_security.  The blob size is taken from blob_sizes.lbs_superblock
 * and the allocation is made with GFP_KERNEL.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_superblock_alloc(struct super_block *sb)
{
        int rc = lsm_blob_alloc(&sb->s_security, blob_sizes.lbs_superblock,
                                GFP_KERNEL);

        return rc;
}

/**
 * lsm_fill_user_ctx - Fill a user space lsm_ctx structure
 * @uctx: a userspace LSM context to be filled
 * @uctx_len: available uctx size (input), used uctx size (output)
 * @val: the new LSM context value
 * @val_len: the size of the new LSM context value
 * @id: LSM id
 * @flags: LSM defined flags
 *
 * Fill all of the fields in a userspace lsm_ctx structure.  The required
 * size is the lsm_ctx header plus @val_len bytes, rounded up to a multiple
 * of sizeof(void *).  That size is always written back through @uctx_len,
 * whatever the return value.
 *
 * The size check is performed before @uctx is examined, so -E2BIG is
 * returned whenever the required size exceeds the incoming *@uctx_len, even
 * if @uctx is NULL.  If the size fits and @uctx is NULL, nothing is copied
 * and success is returned.
 *
 * Returns 0 on success, -E2BIG if userspace buffer is not large enough,
 * -EFAULT on a copyout error, -ENOMEM if memory can't be allocated.
 */
int lsm_fill_user_ctx(struct lsm_ctx __user *uctx, u32 *uctx_len,
                      void *val, size_t val_len,
                      u64 id, u64 flags)
{
        struct lsm_ctx *nctx = NULL;
        size_t needed;
        int rc = 0;

        needed = ALIGN(struct_size(nctx, ctx, val_len), sizeof(void *));

        if (needed > *uctx_len) {
                rc = -E2BIG;
        } else if (uctx) {
                /* Build the record in kernel memory, then copy it out. */
                nctx = kzalloc(needed, GFP_KERNEL);
                if (nctx) {
                        nctx->id = id;
                        nctx->flags = flags;
                        nctx->len = needed;
                        nctx->ctx_len = val_len;
                        memcpy(nctx->ctx, val, val_len);

                        if (copy_to_user(uctx, nctx, needed))
                                rc = -EFAULT;
                        kfree(nctx);
                } else {
                        rc = -ENOMEM;
                }
        }
        /* else: no buffer - return success/0, only the size is reported */

        *uctx_len = needed;
        return rc;
}

/*
 * The default value of the LSM hook is defined in linux/lsm_hook_defs.h and
 * can be accessed with:
 *
 *        LSM_RET_DEFAULT(<hook_name>)
 *
 * The macros below define static constants for the default value of each
 * LSM hook.
 *
 * LSM_RET_DEFAULT(NAME) names the constant NAME_default.  LSM_HOOK() is
 * defined here so that it dispatches on the hook's return type token:
 * DECLARE_LSM_RET_DEFAULT_int emits the constant, while
 * DECLARE_LSM_RET_DEFAULT_void expands to nothing, so void hooks get no
 * default.  Including linux/lsm_hook_defs.h with this LSM_HOOK() definition
 * in effect instantiates the constants; LSM_HOOK is undefined again right
 * afterwards.
 */
#define LSM_RET_DEFAULT(NAME) (NAME##_default)
#define DECLARE_LSM_RET_DEFAULT_void(DEFAULT, NAME)
#define DECLARE_LSM_RET_DEFAULT_int(DEFAULT, NAME) \
        static const int __maybe_unused LSM_RET_DEFAULT(NAME) = (DEFAULT);
#define LSM_HOOK(RET, DEFAULT, NAME, ...) \
        DECLARE_LSM_RET_DEFAULT_##RET(DEFAULT, NAME)

#include <linux/lsm_hook_defs.h>
#undef LSM_HOOK

/*
 * Hook list operation macros.
 *
 * call_void_hook:
 *        This is a hook that does not return a value.
 *
 * call_int_hook:
 *        This is a hook that returns a value.
 *
 * __CALL_STATIC_VOID handles a single slot (NUM) of a hook: the static call
 * is only made when the slot's static key is enabled.  Note that its
 * expansion ends in "while (0);" including the semicolon; presumably
 * LSM_LOOP_UNROLL emits the per-slot expansions back to back without
 * separators - confirm against its definition.
 */
#define __CALL_STATIC_VOID(NUM, HOOK, ...)                                     \
do {                                                                             \
        if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) {    \
                static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__);             \
        }                                                                     \
} while (0);

/* Apply __CALL_STATIC_VOID to HOOK through LSM_LOOP_UNROLL. */
#define call_void_hook(HOOK, ...)                                 \
        do {                                                      \
                LSM_LOOP_UNROLL(__CALL_STATIC_VOID, HOOK, __VA_ARGS__); \
        } while (0)


/*
 * __CALL_STATIC_INT handles a single slot (NUM) of an int hook: when the
 * slot's static key is enabled the static call is made and its result is
 * stored in R.  A result different from LSM_RET_DEFAULT(HOOK) jumps to
 * LABEL, skipping whatever follows.  As with __CALL_STATIC_VOID the
 * expansion ends in "while (0);" including the semicolon.
 */
#define __CALL_STATIC_INT(NUM, R, HOOK, LABEL, ...)                             \
do {                                                                             \
        if (static_branch_unlikely(&SECURITY_HOOK_ACTIVE_KEY(HOOK, NUM))) {  \
                R = static_call(LSM_STATIC_CALL(HOOK, NUM))(__VA_ARGS__);    \
                if (R != LSM_RET_DEFAULT(HOOK))                                     \
                        goto LABEL;                                             \
        }                                                                     \
} while (0);

/*
 * call_int_hook evaluates to an int.  RC starts out as the hook's default
 * value and is returned unchanged if no slot produces a non-default result;
 * otherwise the first non-default result ends the sequence and is the value
 * of the expression.  OUT is a local label (__label__), so the macro can be
 * expanded more than once in the same function.
 */
#define call_int_hook(HOOK, ...)                                        \
({                                                                        \
        __label__ OUT;                                                        \
        int RC = LSM_RET_DEFAULT(HOOK);                                        \
                                                                        \
        LSM_LOOP_UNROLL(__CALL_STATIC_INT, RC, HOOK, OUT, __VA_ARGS__);        \
OUT:                                                                        \
        RC;                                                                \
})

/*
 * lsm_for_each_hook walks the MAX_LSM_COUNT entries of
 * static_calls_table.NAME with the cursor @scall and runs the statement that
 * follows the macro only for entries whose static key is enabled.  The
 * expansion is a for loop followed by an if, so break and continue in the
 * body act on that for loop.
 */
#define lsm_for_each_hook(scall, NAME)                                        \
        for (scall = static_calls_table.NAME;                                \
             scall - static_calls_table.NAME < MAX_LSM_COUNT; scall++)  \
                if (static_key_enabled(&scall->active->key))

/* Security operations */

/**
 * security_binder_set_context_mgr() - Check if becoming binder ctx mgr is ok
 * @mgr: task credentials of current binder process
 *
 * Ask the LSMs whether @mgr is allowed to be the binder context manager.
 *
 * Return: Return 0 if permission is granted.
 */
int security_binder_set_context_mgr(const struct cred *mgr)
{
        int rc = call_int_hook(binder_set_context_mgr, mgr);

        return rc;
}

/**
 * security_binder_transaction() - Check if a binder transaction is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Ask the LSMs whether @from is allowed to invoke a binder transaction call
 * to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transaction(const struct cred *from,
                                const struct cred *to)
{
        int rc = call_int_hook(binder_transaction, from, to);

        return rc;
}

/**
 * security_binder_transfer_binder() - Check if a binder transfer is allowed
 * @from: sending process
 * @to: receiving process
 *
 * Ask the LSMs whether @from is allowed to transfer a binder reference to
 * @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_binder(const struct cred *from,
                                    const struct cred *to)
{
        int rc = call_int_hook(binder_transfer_binder, from, to);

        return rc;
}

/**
 * security_binder_transfer_file() - Check if a binder file xfer is allowed
 * @from: sending process
 * @to: receiving process
 * @file: file being transferred
 *
 * Ask the LSMs whether @from is allowed to transfer @file to @to.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_binder_transfer_file(const struct cred *from,
                                  const struct cred *to, const struct file *file)
{
        int rc = call_int_hook(binder_transfer_file, from, to, file);

        return rc;
}

/**
 * security_ptrace_access_check() - Check if tracing is allowed
 * @child: target process
 * @mode: PTRACE_MODE flags
 *
 * Check permission before allowing the current process to trace the @child
 * process.  Security modules may also want to perform a process tracing check
 * during an execve in the bprm_set_creds hook of binprm_security_ops if the
 * process is being traced and its security attributes would be changed by the
 * execve.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_access_check(struct task_struct *child, unsigned int mode)
{
        int rc = call_int_hook(ptrace_access_check, child, mode);

        return rc;
}

/**
 * security_ptrace_traceme() - Check if tracing is allowed
 * @parent: tracing process
 *
 * Before the current process presents itself to @parent for tracing, check
 * that @parent has sufficient permission to trace the current process.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ptrace_traceme(struct task_struct *parent)
{
        int rc = call_int_hook(ptrace_traceme, parent);

        return rc;
}

/**
 * security_capget() - Get the capability sets for a process
 * @target: target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Retrieve the @effective, @inheritable, and @permitted capability sets of
 * the @target process.  The hook may also perform permission checking to
 * determine if the current process is allowed to see the capability sets of
 * the @target process.
 *
 * Return: Returns 0 if the capability sets were successfully obtained.
 */
int security_capget(const struct task_struct *target,
                    kernel_cap_t *effective,
                    kernel_cap_t *inheritable,
                    kernel_cap_t *permitted)
{
        int rc = call_int_hook(capget, target, effective, inheritable,
                               permitted);

        return rc;
}

/**
 * security_capset() - Set the capability sets for a process
 * @new: new credentials for the target process
 * @old: current credentials of the target process
 * @effective: effective capability set
 * @inheritable: inheritable capability set
 * @permitted: permitted capability set
 *
 * Set the @effective, @inheritable, and @permitted capability sets for the
 * current process.
 *
 * Return: Returns 0 and update @new if permission is granted.
 */
int security_capset(struct cred *new, const struct cred *old,
                    const kernel_cap_t *effective,
                    const kernel_cap_t *inheritable,
                    const kernel_cap_t *permitted)
{
        int rc = call_int_hook(capset, new, old, effective, inheritable,
                               permitted);

        return rc;
}

/**
 * security_capable() - Check if a process has the necessary capability
 * @cred: credentials to examine
 * @ns: user namespace
 * @cap: capability requested
 * @opts: capability check options
 *
 * Check whether the credentials in @cred have the @cap capability.  @cap
 * contains the capability <include/linux/capability.h>.  @opts contains
 * options for the capable check <include/linux/security.h>.
 *
 * Return: Returns 0 if the capability is granted.
 */
int security_capable(const struct cred *cred,
                     struct user_namespace *ns,
                     int cap,
                     unsigned int opts)
{
        int rc = call_int_hook(capable, cred, ns, cap, opts);

        return rc;
}

/**
 * security_quotactl() - Check if a quotactl() syscall is allowed for this fs
 * @cmds: commands
 * @type: type
 * @id: id
 * @sb: filesystem
 *
 * Ask the LSMs whether the quotactl syscall is allowed for this @sb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quotactl(int cmds, int type, int id, const struct super_block *sb)
{
        int rc = call_int_hook(quotactl, cmds, type, id, sb);

        return rc;
}

/**
 * security_quota_on() - Check if QUOTAON is allowed for a dentry
 * @dentry: dentry
 *
 * Ask the LSMs whether QUOTAON is allowed for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_quota_on(struct dentry *dentry)
{
        int rc = call_int_hook(quota_on, dentry);

        return rc;
}

/**
 * security_syslog() - Check if accessing the kernel message ring is allowed
 * @type: SYSLOG_ACTION_* type
 *
 * Check permission before the kernel message ring is accessed or logging to
 * the console is changed.  See the syslog(2) manual page for an explanation
 * of the @type values.
 *
 * Return: Return 0 if permission is granted.
 */
int security_syslog(int type)
{
        int rc = call_int_hook(syslog, type);

        return rc;
}

/**
 * security_settime64() - Check if changing the system time is allowed
 * @ts: new time
 * @tz: timezone
 *
 * Check permission to change the system time.  struct timespec64 is defined
 * in <include/linux/time64.h> and timezone is defined in
 * <include/linux/time.h>.  The LSM hook consulted is named "settime".
 *
 * Return: Returns 0 if permission is granted.
 */
int security_settime64(const struct timespec64 *ts, const struct timezone *tz)
{
        int rc = call_int_hook(settime, ts, tz);

        return rc;
}

/**
 * security_vm_enough_memory_mm() - Check if allocating a new mem map is allowed
 * @mm: mm struct
 * @pages: number of pages
 *
 * Check permissions for allocating a new virtual mapping.  Every active
 * vm_enough_memory hook is consulted in turn.  As long as no hook returns a
 * negative value, __vm_enough_memory() is called with cap_sys_admin set.  The
 * first hook that returns a negative value ends the walk and
 * __vm_enough_memory() is called with cap_sys_admin cleared.
 *
 * Return: Returns the result of __vm_enough_memory(); 0 if permission is
 *         granted by the LSM infrastructure to the caller.
 */
int security_vm_enough_memory_mm(struct mm_struct *mm, long pages)
{
        struct lsm_static_call *slot;

        /*
         * A module answers with a non-negative value when it agrees that
         * cap_sys_admin should be set for the __vm_enough_memory() call.
         * A single negative answer is enough to clear it, and the remaining
         * modules are then not asked.
         */
        lsm_for_each_hook(slot, vm_enough_memory) {
                if (slot->hl->hook.vm_enough_memory(mm, pages) < 0)
                        return __vm_enough_memory(mm, pages, 0);
        }
        return __vm_enough_memory(mm, pages, 1);
}

/**
 * security_bprm_creds_for_exec() - Prepare the credentials for exec()
 * @bprm: binary program information
 *
 * If the setup in prepare_exec_creds did not setup @bprm->cred->security
 * properly for executing @bprm->file, update the LSM's portion of
 * @bprm->cred->security to be what commit_creds needs to install for the new
 * program.  This hook may also optionally check permissions (e.g. for
 * transitions between security domains).  The hook must set @bprm->secureexec
 * to 1 if AT_SECURE should be set to request libc enable secure mode.  @bprm
 * contains the linux_binprm structure.
 *
 * If execveat(2) is called with the AT_EXECVE_CHECK flag, bprm->is_check is
 * set.  The result must be the same as without this flag even if the execution
 * will never really happen and @bprm will always be dropped.
 *
 * This hook must not change current->cred, only @bprm->cred.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        int rc = call_int_hook(bprm_creds_for_exec, bprm);

        return rc;
}

/**
 * security_bprm_creds_from_file() - Update linux_binprm creds based on file
 * @bprm: binary program information
 * @file: associated file
 *
 * If @file is setpcap, suid, sgid or otherwise marked to change privilege upon
 * exec, update @bprm->cred to reflect that change.  This is called after
 * finding the binary that will be executed without an interpreter.  This
 * ensures that the credentials will not be derived from a script that the
 * binary will need to reopen, which when reopened may end up being a
 * completely different file.  This hook may also optionally check permissions
 * (e.g. for transitions between security domains).  The hook must set
 * @bprm->secureexec to 1 if AT_SECURE should be set to request libc enable
 * secure mode.  The hook must add to @bprm->per_clear any personality flags
 * that should be cleared from current->personality.  @bprm contains the
 * linux_binprm structure.
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_creds_from_file(struct linux_binprm *bprm, const struct file *file)
{
        int rc = call_int_hook(bprm_creds_from_file, bprm, file);

        return rc;
}

/**
 * security_bprm_check() - Mediate binary handler search
 * @bprm: binary program information
 *
 * This hook mediates the point when a search for a binary handler will begin.
 * It allows a check against the @bprm->cred->security value which was set in
 * the preceding creds_for_exec call.  The argv list and envp list are reliably
 * available in @bprm.  This hook may be called multiple times during a single
 * execve.  @bprm contains the linux_binprm structure.  The LSM hook consulted
 * is named "bprm_check_security".
 *
 * Return: Returns 0 if the hook is successful and permission is granted.
 */
int security_bprm_check(struct linux_binprm *bprm)
{
        int rc = call_int_hook(bprm_check_security, bprm);

        return rc;
}

/**
 * security_bprm_committing_creds() - Install creds for a process during exec()
 * @bprm: binary program information
 *
 * Prepare to install the new security attributes of a process being
 * transformed by an execve operation, based on the old credentials pointed to
 * by @current->cred and the information set in @bprm->cred by the
 * bprm_creds_for_exec hook.  @bprm points to the linux_binprm structure.  This
 * hook is a good place to perform state changes on the process such as closing
 * open file descriptors to which access will no longer be granted when the
 * attributes are changed.  This is called immediately before commit_creds().
 *
 * The hook has no return value, so it cannot veto the operation.
 */
void security_bprm_committing_creds(const struct linux_binprm *bprm)
{
        call_void_hook(bprm_committing_creds, bprm);
}

/**
 * security_bprm_committed_creds() - Tidy up after cred install during exec()
 * @bprm: binary program information
 *
 * Tidy up after the installation of the new security attributes of a process
 * being transformed by an execve operation.  The new credentials have, by this
 * point, been set to @current->cred.  @bprm points to the linux_binprm
 * structure.  This hook is a good place to perform state changes on the
 * process such as clearing out non-inheritable signal state.  This is called
 * immediately after commit_creds().
 *
 * The hook has no return value, so it cannot report a failure.
 */
void security_bprm_committed_creds(const struct linux_binprm *bprm)
{
        call_void_hook(bprm_committed_creds, bprm);
}

/**
 * security_fs_context_submount() - Initialise fc->security
 * @fc: new filesystem context
 * @reference: reference superblock for submount/remount
 *
 * Fill out the ->security field for a new fs_context.
 *
 * Return: Returns 0 on success or negative error code on failure.
 */
int security_fs_context_submount(struct fs_context *fc, struct super_block *reference)
{
        int rc = call_int_hook(fs_context_submount, fc, reference);

        return rc;
}

/**
 * security_fs_context_dup() - Duplicate a fs_context LSM blob
 * @fc: destination filesystem context
 * @src_fc: source filesystem context
 *
 * Allocate and attach a security structure to @fc->security.  This pointer
 * is initialised to NULL by the caller.  @fc indicates the new filesystem
 * context.  @src_fc indicates the original filesystem context.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
{
        int rc = call_int_hook(fs_context_dup, fc, src_fc);

        return rc;
}

/**
 * security_fs_context_parse_param() - Configure a filesystem context
 * @fc: filesystem context
 * @param: filesystem parameter
 *
 * Userspace provided a parameter to configure a superblock.  The LSM can
 * consume the parameter or return it to the caller for use elsewhere.
 *
 * Return: If the parameter is used by the LSM it should return 0, if it is
 *         returned to the caller -ENOPARAM is returned, otherwise a negative
 *         error code is returned.
 */
int security_fs_context_parse_param(struct fs_context *fc,
                                    struct fs_parameter *param)
{
        struct lsm_static_call *scall;
        int trc;
        int rc = -ENOPARAM;

        lsm_for_each_hook(scall, fs_context_parse_param) {
                trc = scall->hl->hook.fs_context_parse_param(fc, param);
                if (trc == 0)
                        rc = 0;
                else if (trc != -ENOPARAM)
                        return trc;
        }
        return rc;
}

/**
 * security_sb_alloc() - Allocate a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Allocate and attach a security structure to the sb->s_security field.  The
 * s_security field is initialized to NULL when the structure is allocated.
 * @sb contains the super_block structure to be modified.
 *
 * If the blob was allocated but an sb_alloc_security hook then reports an
 * error, security_sb_free() is called before that error is returned.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_sb_alloc(struct super_block *sb)
{
        int rc;

        rc = lsm_superblock_alloc(sb);
        if (unlikely(rc))
                return rc;

        rc = call_int_hook(sb_alloc_security, sb);
        if (unlikely(rc)) {
                security_sb_free(sb);
                return rc;
        }
        return 0;
}

/**
 * security_sb_delete() - Release super_block LSM associated objects
 * @sb: filesystem superblock
 *
 * Release objects tied to a superblock (e.g. inodes).  @sb contains the
 * super_block structure being released.  This only invokes the sb_delete
 * hooks; nothing is freed by this function itself.
 */
void security_sb_delete(struct super_block *sb)
{
        call_void_hook(sb_delete, sb);
}

/**
 * security_sb_free() - Free a super_block LSM blob
 * @sb: filesystem superblock
 *
 * Deallocate and clear the sb->s_security field.  @sb contains the super_block
 * structure to be modified.
 *
 * The sb_free_security hooks run first, while the blob is still attached;
 * only afterwards is the blob freed and the field reset to NULL.
 */
void security_sb_free(struct super_block *sb)
{
        call_void_hook(sb_free_security, sb);
        kfree(sb->s_security);
        sb->s_security = NULL;
}

/**
 * security_free_mnt_opts() - Free memory associated with mount options
 * @mnt_opts: LSM processed mount options
 *
 * Free memory associated with @mnt_opts.  Nothing happens when *@mnt_opts is
 * already NULL; otherwise the sb_free_mnt_opts hooks are invoked and
 * *@mnt_opts is reset to NULL.
 */
void security_free_mnt_opts(void **mnt_opts)
{
        if (*mnt_opts) {
                call_void_hook(sb_free_mnt_opts, *mnt_opts);
                *mnt_opts = NULL;
        }
}
EXPORT_SYMBOL(security_free_mnt_opts);

/**
 * security_sb_eat_lsm_opts() - Consume LSM mount options
 * @options: mount options
 * @mnt_opts: LSM processed mount options
 *
 * Eat (scan) the LSM specific entries in @options and save them in
 * @mnt_opts.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        int rc = call_int_hook(sb_eat_lsm_opts, options, mnt_opts);

        return rc;
}
EXPORT_SYMBOL(security_sb_eat_lsm_opts);

/**
 * security_sb_mnt_opts_compat() - Check if new mount options are allowed
 * @sb: filesystem superblock
 * @mnt_opts: new mount options
 *
 * Determine if the new mount options in @mnt_opts are allowed given the
 * existing mounted filesystem at @sb.  @sb is the superblock being compared.
 *
 * Return: Returns 0 if options are compatible.
 */
int security_sb_mnt_opts_compat(struct super_block *sb,
                                void *mnt_opts)
{
        int rc = call_int_hook(sb_mnt_opts_compat, sb, mnt_opts);

        return rc;
}
EXPORT_SYMBOL(security_sb_mnt_opts_compat);

/**
 * security_sb_remount() - Verify no incompatible mount changes during remount
 * @sb: filesystem superblock
 * @mnt_opts: (re)mount options
 *
 * Extracts security system specific mount options and verifies no changes are
 * being made to those options.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_remount(struct super_block *sb,
                        void *mnt_opts)
{
        int rc = call_int_hook(sb_remount, sb, mnt_opts);

        return rc;
}
EXPORT_SYMBOL(security_sb_remount);

/**
 * security_sb_kern_mount() - Check if a kernel mount is allowed
 * @sb: filesystem superblock
 *
 * Mount this @sb if allowed by permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_kern_mount(const struct super_block *sb)
{
        int rc = call_int_hook(sb_kern_mount, sb);

        return rc;
}

/**
 * security_sb_show_options() - Output the mount options for a superblock
 * @m: output file
 * @sb: filesystem superblock
 *
 * Show (print on @m) the mount options for this @sb.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        int rc = call_int_hook(sb_show_options, m, sb);

        return rc;
}

/**
 * security_sb_statfs() - Check if accessing fs stats is allowed
 * @dentry: superblock handle
 *
 * Check permission before obtaining filesystem statistics.  @dentry is a
 * handle on the superblock for the filesystem.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_statfs(struct dentry *dentry)
{
        int rc = call_int_hook(sb_statfs, dentry);

        return rc;
}

/**
 * security_sb_mount() - Check permission for mounting a filesystem
 * @dev_name: filesystem backing device
 * @path: mount point
 * @type: filesystem type
 * @flags: mount flags
 * @data: filesystem specific data
 *
 * Check permission before an object specified by @dev_name is mounted on the
 * mount point named by @path.  For an ordinary mount, @dev_name identifies a
 * device if the file system type requires a device.  For a remount
 * (@flags & MS_REMOUNT), @dev_name is irrelevant.  For a loopback/bind mount
 * (@flags & MS_BIND), @dev_name identifies the pathname of the object being
 * mounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_mount(const char *dev_name, const struct path *path,
                      const char *type, unsigned long flags, void *data)
{
        int rc = call_int_hook(sb_mount, dev_name, path, type, flags, data);

        return rc;
}

/**
 * security_sb_umount() - Check permission for unmounting a filesystem
 * @mnt: mounted filesystem
 * @flags: unmount flags
 *
 * Check permission before the @mnt file system is unmounted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_umount(struct vfsmount *mnt, int flags)
{
        int rc = call_int_hook(sb_umount, mnt, flags);

        return rc;
}

/**
 * security_sb_pivotroot() - Check permissions for pivoting the rootfs
 * @old_path: new location for current rootfs
 * @new_path: location of the new rootfs
 *
 * Check permission before the root filesystem is pivoted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sb_pivotroot(const struct path *old_path,
                          const struct path *new_path)
{
        int rc = call_int_hook(sb_pivotroot, old_path, new_path);

        return rc;
}

/**
 * security_sb_set_mnt_opts() - Set the mount options for a filesystem
 * @sb: filesystem superblock
 * @mnt_opts: binary mount options
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Set the security relevant mount options used for a superblock.
 *
 * Active sb_set_mnt_opts hooks are called in turn until one returns something
 * other than the hook's default value.  If no hook is active at all, the
 * result is -EOPNOTSUPP when @mnt_opts is non-NULL and the hook's default
 * value otherwise.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_set_mnt_opts(struct super_block *sb,
                             void *mnt_opts,
                             unsigned long kern_flags,
                             unsigned long *set_kern_flags)
{
        struct lsm_static_call *slot;
        int rc;

        /* Result used when there is no active hook to overwrite it. */
        if (mnt_opts)
                rc = -EOPNOTSUPP;
        else
                rc = LSM_RET_DEFAULT(sb_set_mnt_opts);

        lsm_for_each_hook(slot, sb_set_mnt_opts) {
                rc = slot->hl->hook.sb_set_mnt_opts(sb, mnt_opts, kern_flags,
                                                    set_kern_flags);
                if (rc != LSM_RET_DEFAULT(sb_set_mnt_opts))
                        return rc;
        }
        return rc;
}
EXPORT_SYMBOL(security_sb_set_mnt_opts);

/**
 * security_sb_clone_mnt_opts() - Duplicate superblock mount options
 * @oldsb: source superblock
 * @newsb: destination superblock
 * @kern_flags: kernel flags (in)
 * @set_kern_flags: kernel flags (out)
 *
 * Copy all security options from @oldsb to @newsb.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sb_clone_mnt_opts(const struct super_block *oldsb,
                               struct super_block *newsb,
                               unsigned long kern_flags,
                               unsigned long *set_kern_flags)
{
        int rc = call_int_hook(sb_clone_mnt_opts, oldsb, newsb,
                               kern_flags, set_kern_flags);

        return rc;
}
EXPORT_SYMBOL(security_sb_clone_mnt_opts);

/**
 * security_move_mount() - Check permissions for moving a mount
 * @from_path: source mount point
 * @to_path: destination mount point
 *
 * Check permission before a mount is moved from @from_path to @to_path.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_move_mount(const struct path *from_path,
                        const struct path *to_path)
{
        int rc = call_int_hook(move_mount, from_path, to_path);

        return rc;
}

/**
 * security_path_notify() - Check if setting a watch is allowed
 * @path: file path
 * @mask: event mask
 * @obj_type: file path type
 *
 * Check permissions before a watch is set on the events defined by @mask, on
 * an object at @path, whose type is defined by @obj_type.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_notify(const struct path *path, u64 mask,
                         unsigned int obj_type)
{
        int rc = call_int_hook(path_notify, path, mask, obj_type);

        return rc;
}

/**
 * security_inode_alloc() - Allocate an inode LSM blob
 * @inode: the inode
 * @gfp: allocation flags
 *
 * Allocate and attach a security structure to @inode->i_security.  The
 * i_security field is initialized to NULL when the inode structure is
 * allocated.
 *
 * If the blob was allocated but an inode_alloc_security hook then reports an
 * error, security_inode_free() is called before that error is returned.
 *
 * Return: Return 0 if operation was successful.
 */
int security_inode_alloc(struct inode *inode, gfp_t gfp)
{
        int rc;

        rc = lsm_inode_alloc(inode, gfp);
        if (unlikely(rc))
                return rc;

        rc = call_int_hook(inode_alloc_security, inode);
        if (unlikely(rc)) {
                security_inode_free(inode);
                return rc;
        }
        return 0;
}

/*
 * RCU callback queued by security_inode_free().  @head doubles as the
 * address of the inode blob: it is handed to the inode_free_security_rcu
 * hooks and then returned to lsm_inode_cache.
 */
static void inode_free_by_rcu(struct rcu_head *head)
{
        /* The rcu head is at the start of the inode blob */
        call_void_hook(inode_free_security_rcu, head);
        kmem_cache_free(lsm_inode_cache, head);
}

/**
 * security_inode_free() - Free an inode's LSM blob
 * @inode: the inode
 *
 * Release any LSM resources associated with @inode, although due to the
 * inode's RCU protections it is possible that the resources will not be
 * fully released until after the current RCU grace period has elapsed.
 *
 * It is important for LSMs to note that despite being present in a call to
 * security_inode_free(), @inode may still be referenced in a VFS path walk
 * and calls to security_inode_permission() may be made during, or after,
 * a call to security_inode_free().  For this reason the inode->i_security
 * field is released via a call_rcu() callback and any LSMs which need to
 * retain inode state for use in security_inode_permission() should only
 * release that state in the inode_free_security_rcu() LSM hook callback.
 *
 * The inode_free_security hooks are always invoked; the RCU callback is only
 * queued when a blob is attached to @inode.
 */
void security_inode_free(struct inode *inode)
{
        call_void_hook(inode_free_security, inode);
        if (inode->i_security)
                call_rcu((struct rcu_head *)inode->i_security,
                         inode_free_by_rcu);
}

/**
 * security_dentry_init_security() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @xattr_name: name of the security/LSM xattr
 * @lsmctx: pointer to the resulting LSM context
 *
 * Compute a context for a dentry as the inode is not yet available since NFSv4
 * has no label backed by an EA anyway.  Note that @xattr_name is a static
 * string, so the caller does not need to free it.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_dentry_init_security(struct dentry *dentry, int mode,
                                  const struct qstr *name,
                                  const char **xattr_name,
                                  struct lsm_context *lsmctx)
{
        int rc = call_int_hook(dentry_init_security, dentry, mode, name,
                               xattr_name, lsmctx);

        return rc;
}
EXPORT_SYMBOL(security_dentry_init_security);

/**
 * security_dentry_create_files_as() - Perform dentry initialization
 * @dentry: the dentry to initialize
 * @mode: mode used to determine resource type
 * @name: name of the last path component
 * @old: creds to use for LSM context calculations
 * @new: creds to modify
 *
 * Compute a context for a dentry as the inode is not yet available and set
 * that context in passed in creds so that new files are created using that
 * context. Context is calculated using the passed in creds and not the creds
 * of the caller.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_dentry_create_files_as(struct dentry *dentry, int mode,
                                    struct qstr *name,
                                    const struct cred *old, struct cred *new)
{
        int rc;

        /* Every argument is forwarded unchanged to the hook. */
        rc = call_int_hook(dentry_create_files_as, dentry, mode,
                           name, old, new);
        return rc;
}
EXPORT_SYMBOL(security_dentry_create_files_as);

/**
 * security_inode_init_security() - Initialize an inode's LSM context
 * @inode: the inode
 * @dir: parent directory
 * @qstr: last component of the pathname
 * @initxattrs: callback function to write xattrs
 * @fs_data: filesystem specific data
 *
 * Obtain the security attribute name suffix and value to set on a newly
 * created inode and set up the incore security field for the new inode.  This
 * hook is called by the fs code as part of the inode creation transaction and
 * provides for atomic labeling of the inode, unlike the post_create/mkdir/...
 * hooks called by the VFS.
 *
 * The hook function is expected to populate the xattrs array, by calling
 * lsm_get_xattr_slot() to retrieve the slots reserved by the security module
 * with the lbs_xattr_count field of the lsm_blob_sizes structure.  For each
 * slot, the hook function should set ->name to the attribute name suffix
 * (e.g. selinux), to allocate ->value (will be freed by the caller) and set it
 * to the attribute value, to set ->value_len to the length of the value.  If
 * the security module does not use security attributes or does not wish to put
 * a security attribute on this particular inode, then it should return
 * -EOPNOTSUPP to skip this processing.
 *
 * Return: Returns 0 if the LSM successfully initialized all of the inode
 *         security attributes that are required, negative values otherwise.
 */
int security_inode_init_security(struct inode *inode, struct inode *dir,
                                 const struct qstr *qstr,
                                 const initxattrs initxattrs, void *fs_data)
{
        struct lsm_static_call *scall;
        struct xattr *new_xattrs = NULL;
        /* ret starts as -EOPNOTSUPP, which is mapped to 0 on return. */
        int ret = -EOPNOTSUPP, xattr_count = 0;

        /* Private inodes skip all processing and report success. */
        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        /* No xattr slots are reserved, so no hook is called at all. */
        if (!blob_sizes.lbs_xattr_count)
                return 0;

        if (initxattrs) {
                /* Allocate +1 as terminator. */
                new_xattrs = kcalloc(blob_sizes.lbs_xattr_count + 1,
                                     sizeof(*new_xattrs), GFP_NOFS);
                if (!new_xattrs)
                        return -ENOMEM;
        }

        /*
         * Invoke each inode_init_security hook in turn; the first return
         * value that is neither 0 nor -EOPNOTSUPP aborts the walk.
         */
        lsm_for_each_hook(scall, inode_init_security) {
                ret = scall->hl->hook.inode_init_security(inode, dir, qstr, new_xattrs,
                                                  &xattr_count);
                if (ret && ret != -EOPNOTSUPP)
                        goto out;
                /*
                 * As documented in lsm_hooks.h, -EOPNOTSUPP in this context
                 * means that the LSM is not willing to provide an xattr, not
                 * that it wants to signal an error. Thus, continue to invoke
                 * the remaining LSMs.
                 */
        }

        /* If initxattrs() is NULL, xattr_count is zero, skip the call. */
        if (!xattr_count)
                goto out;

        /* Pass the collected xattrs and @fs_data to the caller's callback. */
        ret = initxattrs(inode, new_xattrs, fs_data);
out:
        /*
         * Common exit for success and failure: free every ->value counted by
         * xattr_count, then the array itself (kfree() of NULL is a no-op).
         */
        for (; xattr_count > 0; xattr_count--)
                kfree(new_xattrs[xattr_count - 1].value);
        kfree(new_xattrs);
        /* -EOPNOTSUPP is not an error for the caller; report it as 0. */
        return (ret == -EOPNOTSUPP) ? 0 : ret;
}
EXPORT_SYMBOL(security_inode_init_security);

/**
 * security_inode_init_security_anon() - Initialize an anonymous inode
 * @inode: the inode
 * @name: the anonymous inode class
 * @context_inode: an optional related inode
 *
 * Set up the incore security field for the new anonymous inode and return
 * whether the inode creation is permitted by the security module or not.
 *
 * Return: Returns 0 on success, -EACCES if the security module denies the
 * creation of this inode, or another -errno upon other errors.
 */
int security_inode_init_security_anon(struct inode *inode,
                                      const struct qstr *name,
                                      const struct inode *context_inode)
{
        int rc;

        /* Every argument is forwarded unchanged to the hook. */
        rc = call_int_hook(inode_init_security_anon, inode, name,
                           context_inode);
        return rc;
}

#ifdef CONFIG_SECURITY_PATH
/**
 * security_path_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a file. Note that this hook is called even
 * if mknod operation is being done for a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mknod(const struct path *dir, struct dentry *dentry,
                        umode_t mode, unsigned int dev)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_mknod, dir, dentry, mode, dev);

        return rc;
}
EXPORT_SYMBOL(security_path_mknod);

/**
 * security_path_post_mknod() - Update inode security after reg file creation
 * @idmap: idmap of the mount
 * @dentry: new file
 *
 * Update inode security field after a regular file has been created.
 */
void security_path_post_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        /* The hook is only invoked when the backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(path_post_mknod, idmap, dentry);
}

/**
 * security_path_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_mkdir(const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_mkdir, dir, dentry, mode);

        return rc;
}
EXPORT_SYMBOL(security_path_mkdir);

/**
 * security_path_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to remove
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rmdir(const struct path *dir, struct dentry *dentry)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_rmdir, dir, dentry);

        return rc;
}

/**
 * security_path_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_unlink(const struct path *dir, struct dentry *dentry)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_unlink, dir, dentry);

        return rc;
}
EXPORT_SYMBOL(security_path_unlink);

/**
 * security_path_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: file pathname
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_symlink(const struct path *dir, struct dentry *dentry,
                          const char *old_name)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dir->dentry))))
                rc = call_int_hook(path_symlink, dir, dentry, old_name);

        return rc;
}

/**
 * security_path_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @new_dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
                       struct dentry *new_dentry)
{
        int rc = 0;

        /* The check is keyed on the existing file, not the new parent. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = call_int_hook(path_link, old_dentry, new_dir,
                                   new_dentry);

        return rc;
}

/**
 * security_path_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
                         const struct path *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        /* A private source inode bypasses the hook. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* So does a private target, which is only inspected if it exists. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        return call_int_hook(path_rename, old_dir, old_dentry, new_dir,
                             new_dentry, flags);
}
EXPORT_SYMBOL(security_path_rename);

/**
 * security_path_truncate() - Check if truncating a file is allowed
 * @path: file
 *
 * Check permission before truncating the file indicated by path.  Note that
 * truncation permissions may also be checked based on already opened files,
 * using the security_file_truncate() hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_truncate(const struct path *path)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(path_truncate, path);

        return rc;
}

/**
 * security_path_chmod() - Check if changing the file's mode is allowed
 * @path: file
 * @mode: new mode
 *
 * Check for permission to change a mode of the file @path. The new mode is
 * specified in @mode which is a bitmask of constants from
 * <include/uapi/linux/stat.h>.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chmod(const struct path *path, umode_t mode)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(path_chmod, path, mode);

        return rc;
}

/**
 * security_path_chown() - Check if changing the file's owner/group is allowed
 * @path: file
 * @uid: file owner
 * @gid: file group
 *
 * Check for permission to change owner/group of a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(path_chown, path, uid, gid);

        return rc;
}

/**
 * security_path_chroot() - Check if changing the root directory is allowed
 * @path: directory
 *
 * Check for permission to change root directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_path_chroot(const struct path *path)
{
        int rc;

        /* No private-inode filtering is applied to this hook. */
        rc = call_int_hook(path_chroot, path);
        return rc;
}
#endif /* CONFIG_SECURITY_PATH */

/**
 * security_inode_create() - Check if creating a file is allowed
 * @dir: the parent directory
 * @dentry: the file being created
 * @mode: requested file mode
 *
 * Check permission to create a regular file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_create(struct inode *dir, struct dentry *dentry,
                          umode_t mode)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_create, dir, dentry, mode);

        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_create);

/**
 * security_inode_post_create_tmpfile() - Update inode security of new tmpfile
 * @idmap: idmap of the mount
 * @inode: inode of the new tmpfile
 *
 * Update inode security data after a tmpfile has been created.
 */
void security_inode_post_create_tmpfile(struct mnt_idmap *idmap,
                                        struct inode *inode)
{
        /* The hook is only invoked when the inode is not private. */
        if (!unlikely(IS_PRIVATE(inode)))
                call_void_hook(inode_post_create_tmpfile, idmap, inode);
}

/**
 * security_inode_link() - Check if creating a hard link is allowed
 * @old_dentry: existing file
 * @dir: new parent directory
 * @new_dentry: new link
 *
 * Check permission before creating a new hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                        struct dentry *new_dentry)
{
        int rc = 0;

        /* The check is keyed on the existing file, not the new parent. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                rc = call_int_hook(inode_link, old_dentry, dir, new_dentry);

        return rc;
}

/**
 * security_inode_unlink() - Check if removing a hard link is allowed
 * @dir: parent directory
 * @dentry: file
 *
 * Check the permission to remove a hard link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        int rc = 0;

        /* The check is keyed on the file being unlinked, not on @dir. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_unlink, dir, dentry);

        return rc;
}

/**
 * security_inode_symlink() - Check if creating a symbolic link is allowed
 * @dir: parent directory
 * @dentry: symbolic link
 * @old_name: existing filename
 *
 * Check the permission to create a symbolic link to a file.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_symlink(struct inode *dir, struct dentry *dentry,
                           const char *old_name)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_symlink, dir, dentry, old_name);

        return rc;
}

/**
 * security_inode_mkdir() - Check if creating a new directory is allowed
 * @dir: parent directory
 * @dentry: new directory
 * @mode: new directory mode
 *
 * Check permissions to create a new directory in the existing directory
 * associated with inode structure @dir.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mkdir, dir, dentry, mode);

        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_mkdir);

/**
 * security_inode_rmdir() - Check if removing a directory is allowed
 * @dir: parent directory
 * @dentry: directory to be removed
 *
 * Check the permission to remove a directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        int rc = 0;

        /* The check is keyed on the directory being removed, not on @dir. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_rmdir, dir, dentry);

        return rc;
}

/**
 * security_inode_mknod() - Check if creating a special file is allowed
 * @dir: parent directory
 * @dentry: new file
 * @mode: new file mode
 * @dev: device number
 *
 * Check permissions when creating a special file (or a socket or a fifo file
 * created via the mknod system call).  Note that if mknod operation is being
 * done for a regular file, then the create hook will be called and not this
 * hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_mknod(struct inode *dir, struct dentry *dentry,
                         umode_t mode, dev_t dev)
{
        int rc = 0;

        /* A private parent directory bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(dir)))
                rc = call_int_hook(inode_mknod, dir, dentry, mode, dev);

        return rc;
}

/**
 * security_inode_rename() - Check if renaming a file is allowed
 * @old_dir: parent directory of the old file
 * @old_dentry: the old file
 * @new_dir: parent directory of the new file
 * @new_dentry: the new file
 * @flags: flags
 *
 * Check for permission to rename a file or directory.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry,
                          struct inode *new_dir, struct dentry *new_dentry,
                          unsigned int flags)
{
        int err;

        /* A private source inode bypasses the hook. */
        if (unlikely(IS_PRIVATE(d_backing_inode(old_dentry))))
                return 0;

        /* So does a private target, which is only inspected if it exists. */
        if (unlikely(d_is_positive(new_dentry) &&
                     IS_PRIVATE(d_backing_inode(new_dentry))))
                return 0;

        /*
         * For an exchange, the hook is first called with the two sides
         * swapped, and only then in the normal direction.
         */
        if (flags & RENAME_EXCHANGE) {
                err = call_int_hook(inode_rename, new_dir, new_dentry,
                                    old_dir, old_dentry);
                if (err)
                        return err;
        }

        err = call_int_hook(inode_rename, old_dir, old_dentry,
                            new_dir, new_dentry);
        return err;
}

/**
 * security_inode_readlink() - Check if reading a symbolic link is allowed
 * @dentry: link
 *
 * Check the permission to read the symbolic link.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_readlink(struct dentry *dentry)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_readlink, dentry);

        return rc;
}

/**
 * security_inode_follow_link() - Check if following a symbolic link is allowed
 * @dentry: link dentry
 * @inode: link inode
 * @rcu: true if in RCU-walk mode
 *
 * Check permission to follow a symbolic link when looking up a pathname.  If
 * @rcu is true, @inode is not stable.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
                               bool rcu)
{
        int rc = 0;

        /* The private check uses the @inode argument, not @dentry. */
        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_follow_link, dentry, inode, rcu);

        return rc;
}

/**
 * security_inode_permission() - Check if accessing an inode is allowed
 * @inode: inode
 * @mask: access mask
 *
 * Check permission before accessing an inode.  This hook is called by the
 * existing Linux permission function, so a security module can use it to
 * provide additional checking for existing Linux permission checks.  Notice
 * that this hook is called when a file is opened (as well as many other
 * operations), whereas the file_security_ops permission hook is called when
 * the actual read/write operations are performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_permission(struct inode *inode, int mask)
{
        int rc = 0;

        /* A private inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_permission, inode, mask);

        return rc;
}

/**
 * security_inode_setattr() - Check if setting file attributes is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @attr: new attributes
 *
 * Check permission before setting file attributes.  Note that the kernel call
 * to notify_change is performed from several locations, whenever file
 * attributes change (such as when a file is truncated, chown/chmod operations,
 * transferring disk quotas, etc).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setattr(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_setattr, idmap, dentry, attr);

        return rc;
}
EXPORT_SYMBOL_GPL(security_inode_setattr);

/**
 * security_inode_post_setattr() - Update the inode after a setattr operation
 * @idmap: idmap of the mount
 * @dentry: file
 * @ia_valid: file attributes set
 *
 * Update inode security field after successful setting file attributes.
 */
void security_inode_post_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 int ia_valid)
{
        /* The hook is only invoked when the backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_setattr, idmap, dentry, ia_valid);
}

/**
 * security_inode_getattr() - Check if getting file attributes is allowed
 * @path: file
 *
 * Check permission before obtaining file attributes.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getattr(const struct path *path)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(path->dentry))))
                rc = call_int_hook(inode_getattr, path);

        return rc;
}

/**
 * security_inode_setxattr() - Check if setting file xattrs is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: size of xattr value
 * @flags: flags
 *
 * This hook performs the desired permission checks before setting the extended
 * attributes (xattrs) on @dentry.  It is important to note that we have some
 * additional logic before the main LSM implementation calls to detect if we
 * need to perform an additional capability check at the LSM layer.
 *
 * Normally we enforce a capability check prior to executing the various LSM
 * hook implementations, but if a LSM wants to avoid this capability check,
 * it can register a 'inode_xattr_skipcap' hook and return a value of 1 for
 * xattrs that it wants to avoid the capability check, leaving the LSM fully
 * responsible for enforcing the access control for the specific xattr.  If all
 * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook,
 * or return a 0 (the default return value), the capability check is still
 * performed.  If no 'inode_xattr_skipcap' hooks are registered the capability
 * check is performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_setxattr(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
{
        int rc = 0;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * The capability check runs unless inode_xattr_skipcap returns
         * non-zero for @name; a failed check ends the call here.
         */
        if (call_int_hook(inode_xattr_skipcap, name) == 0)
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
        if (rc)
                return rc;

        rc = call_int_hook(inode_setxattr, idmap, dentry, name, value, size,
                           flags);
        return rc;
}

/**
 * security_inode_set_acl() - Check if setting posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Check permission before setting posix acls, the posix acls in @kacl are
 * identified by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_set_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name,
                           struct posix_acl *kacl)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_set_acl, idmap, dentry, acl_name,
                                   kacl);

        return rc;
}

/**
 * security_inode_post_set_acl() - Update inode security from posix acls set
 * @dentry: file
 * @acl_name: acl name
 * @kacl: acl struct
 *
 * Update inode security data after successfully setting posix acls on @dentry.
 * The posix acls in @kacl are identified by @acl_name.
 */
void security_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                 struct posix_acl *kacl)
{
        /* The hook is only invoked when the backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_set_acl, dentry, acl_name, kacl);
}

/**
 * security_inode_get_acl() - Check if reading posix acls is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before getting posix acls, the posix acls are identified by
 * @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_get_acl(struct mnt_idmap *idmap,
                           struct dentry *dentry, const char *acl_name)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_get_acl, idmap, dentry, acl_name);

        return rc;
}

/**
 * security_inode_remove_acl() - Check if removing a posix acl is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Check permission before removing posix acls, the posix acls are identified
 * by @acl_name.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_remove_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_remove_acl, idmap, dentry, acl_name);

        return rc;
}

/**
 * security_inode_post_remove_acl() - Update inode security after rm posix acls
 * @idmap: idmap of the mount
 * @dentry: file
 * @acl_name: acl name
 *
 * Update inode security data after successfully removing posix acls on
 * @dentry in @idmap. The posix acls are identified by @acl_name.
 */
void security_inode_post_remove_acl(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *acl_name)
{
        /* The hook is only invoked when the backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_remove_acl, idmap, dentry, acl_name);
}

/**
 * security_inode_post_setxattr() - Update the inode after a setxattr operation
 * @dentry: file
 * @name: xattr name
 * @value: xattr value
 * @size: xattr value size
 * @flags: flags
 *
 * Update inode security field after successful setxattr operation.
 */
void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        /* The hook is only invoked when the backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_setxattr, dentry, name, value, size,
                               flags);
}

/**
 * security_inode_getxattr() - Check if xattr access is allowed
 * @dentry: file
 * @name: xattr name
 *
 * Check permission before obtaining the extended attributes identified by
 * @name for @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_getxattr(struct dentry *dentry, const char *name)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_getxattr, dentry, name);

        return rc;
}

/**
 * security_inode_listxattr() - Check if listing xattrs is allowed
 * @dentry: file
 *
 * Check permission before obtaining the list of extended attribute names for
 * @dentry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_listxattr(struct dentry *dentry)
{
        int rc = 0;

        /* A private backing inode bypasses the hook and yields 0. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                rc = call_int_hook(inode_listxattr, dentry);

        return rc;
}

/**
 * security_inode_removexattr() - Check if removing an xattr is allowed
 * @idmap: idmap of the mount
 * @dentry: file
 * @name: xattr name
 *
 * This hook performs the desired permission checks before removing the
 * extended attributes (xattrs) on @dentry.  It is important to note that we
 * have some additional logic before the main LSM implementation calls to
 * detect if we need to perform an additional capability check at the LSM
 * layer.
 *
 * Normally we enforce a capability check prior to executing the various LSM
 * hook implementations, but if a LSM wants to avoid this capability check,
 * it can register a 'inode_xattr_skipcap' hook and return a value of 1 for
 * xattrs that it wants to avoid the capability check, leaving the LSM fully
 * responsible for enforcing the access control for the specific xattr.  If all
 * of the enabled LSMs refrain from registering a 'inode_xattr_skipcap' hook,
 * or return a 0 (the default return value), the capability check is still
 * performed.  If no 'inode_xattr_skipcap' hooks are registered the capability
 * check is performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inode_removexattr(struct mnt_idmap *idmap,
                               struct dentry *dentry, const char *name)
{
        int rc = 0;

        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        /*
         * The capability check runs unless inode_xattr_skipcap returns
         * non-zero for @name; a failed check ends the call here.
         */
        if (call_int_hook(inode_xattr_skipcap, name) == 0)
                rc = cap_inode_removexattr(idmap, dentry, name);
        if (rc)
                return rc;

        rc = call_int_hook(inode_removexattr, idmap, dentry, name);
        return rc;
}

/**
 * security_inode_post_removexattr() - Update the inode after a removexattr op
 * @dentry: file
 * @name: xattr name
 *
 * Update the inode after a successful removexattr operation.
 */
void security_inode_post_removexattr(struct dentry *dentry, const char *name)
{
        /* The hook is only invoked when the backing inode is not private. */
        if (!unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                call_void_hook(inode_post_removexattr, dentry, name);
}

/**
 * security_inode_need_killpriv() - Check if security_inode_killpriv() required
 * @dentry: associated dentry
 *
 * Called when an inode has been changed to determine if
 * security_inode_killpriv() should be called.
 *
 * Return: Return <0 on error to abort the inode change operation, return 0 if
 *         security_inode_killpriv() does not need to be called, return >0 if
 *         security_inode_killpriv() does need to be called.
 */
int security_inode_need_killpriv(struct dentry *dentry)
{
        int rc;

        /* No private-inode filtering is applied to this hook. */
        rc = call_int_hook(inode_need_killpriv, dentry);
        return rc;
}

/**
 * security_inode_killpriv() - The setuid bit is removed, update LSM state
 * @idmap: idmap of the mount
 * @dentry: associated dentry
 *
 * The @dentry's setuid bit is being removed.  Remove similar security labels.
 * Called with the dentry->d_inode->i_mutex held.
 *
 * Return: Return 0 on success.  If error is returned, then the operation
 *         causing setuid bit removal is failed.
 */
int security_inode_killpriv(struct mnt_idmap *idmap,
                            struct dentry *dentry)
{
        int rc;

        /* No private-inode filtering is applied to this hook. */
        rc = call_int_hook(inode_killpriv, idmap, dentry);
        return rc;
}

/**
 * security_inode_getsecurity() - Get the xattr security label of an inode
 * @idmap: idmap of the mount
 * @inode: inode
 * @name: xattr name
 * @buffer: security label buffer
 * @alloc: allocation flag
 *
 * Retrieve a copy of the extended attribute representation of the security
 * label associated with @name for @inode via @buffer.  Note that @name is the
 * remainder of the attribute name after the security prefix has been removed.
 * @alloc is used to specify if the call should return a value via the buffer
 * or just the value length.
 *
 * Return: Returns size of buffer on success.
 */
int security_inode_getsecurity(struct mnt_idmap *idmap,
                               struct inode *inode, const char *name,
                               void **buffer, bool alloc)
{
        /* Private inodes get the hook's default value, with no hook call. */
        int rc = LSM_RET_DEFAULT(inode_getsecurity);

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_getsecurity, idmap, inode, name,
                                   buffer, alloc);

        return rc;
}

/**
 * security_inode_setsecurity() - Set the xattr security label of an inode
 * @inode: inode
 * @name: xattr name
 * @value: security label
 * @size: length of security label
 * @flags: flags
 *
 * Set the security label associated with @name for @inode from the extended
 * attribute value @value.  @size indicates the size of the @value in bytes.
 * @flags may be XATTR_CREATE, XATTR_REPLACE, or 0. Note that @name is the
 * remainder of the attribute name after the security. prefix has been removed.
 *
 * Return: Returns 0 on success.
 */
int security_inode_setsecurity(struct inode *inode, const char *name,
                               const void *value, size_t size, int flags)
{
        /* Private inodes get the hook's default value, with no hook call. */
        int rc = LSM_RET_DEFAULT(inode_setsecurity);

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_setsecurity, inode, name, value,
                                   size, flags);

        return rc;
}

/**
 * security_inode_listsecurity() - List the xattr security label names
 * @inode: inode
 * @buffer: destination buffer, or NULL to query the required size
 * @buffer_size: maximum size of @buffer
 *
 * Copy the extended attribute names for the security labels associated with
 * @inode into @buffer, which can hold at most @buffer_size bytes.  Passing a
 * NULL @buffer requests the size of the buffer required.  Inodes flagged as
 * private are not passed to the hooks and yield 0.
 *
 * Return: Returns number of bytes used/required on success.
 */
int security_inode_listsecurity(struct inode *inode,
                                char *buffer, size_t buffer_size)
{
        int rc = 0;

        if (!unlikely(IS_PRIVATE(inode)))
                rc = call_int_hook(inode_listsecurity, inode, buffer,
                                   buffer_size);

        return rc;
}
EXPORT_SYMBOL(security_inode_listsecurity);

/**
 * security_inode_getlsmprop() - Get an inode's LSM data
 * @inode: inode
 * @prop: lsm specific information to return
 *
 * Get the lsm specific information associated with the inode.  @prop is
 * handed to the inode_getlsmprop hooks as-is; this function does not
 * initialize it before the call.
 */
void security_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
        call_void_hook(inode_getlsmprop, inode, prop);
}

/**
 * security_inode_copy_up() - Create new creds for an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @new: newly created creds
 *
 * Called when a file is about to be copied up from the lower layer to the
 * upper layer of an overlay filesystem.  A security module can prepare a new
 * set of creds, modify them as needed and return them through @new.  The
 * caller switches to the new creds temporarily to create the new file and
 * then releases the newly allocated creds.
 *
 * Return: Returns 0 on success or a negative error code on error.
 */
int security_inode_copy_up(struct dentry *src, struct cred **new)
{
        int rc = call_int_hook(inode_copy_up, src, new);

        return rc;
}
EXPORT_SYMBOL(security_inode_copy_up);

/**
 * security_inode_copy_up_xattr() - Filter xattrs in an overlayfs copy-up op
 * @src: union dentry of copy-up file
 * @name: xattr name
 *
 * Filter the xattrs being copied up when a unioned file is copied up from a
 * lower layer to the union/overlay layer.  This hook is only a filter: the
 * caller remains responsible for reading and writing the xattrs.
 *
 * Return: Returns 0 to accept the xattr, -ECANCELED to discard the xattr,
 *         -EOPNOTSUPP if the security module does not know about attribute,
 *         or a negative error code to abort the copy up.
 */
int security_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        /*
         * Whether the hooks yield the default value or anything else, that
         * value is exactly what the caller receives.
         */
        return call_int_hook(inode_copy_up_xattr, src, name);
}
EXPORT_SYMBOL(security_inode_copy_up_xattr);

/**
 * security_inode_setintegrity() - Set the inode's integrity data
 * @inode: inode
 * @type: type of integrity, e.g. hash digest, signature, etc
 * @value: the integrity value
 * @size: size of the integrity value
 *
 * Register a verified integrity measurement of an inode with LSMs.
 * LSMs should free the previously saved data if @value is NULL.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_inode_setintegrity(const struct inode *inode,
                                enum lsm_integrity_type type, const void *value,
                                size_t size)
{
        int rc = call_int_hook(inode_setintegrity, inode, type, value, size);

        return rc;
}
EXPORT_SYMBOL(security_inode_setintegrity);

/**
 * security_kernfs_init_security() - Init LSM context for a kernfs node
 * @kn_dir: parent kernfs node
 * @kn: the kernfs node to initialize
 *
 * Initialize the security context of the newly created kernfs node @kn from
 * its own attributes and those of its parent @kn_dir.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernfs_init_security(struct kernfs_node *kn_dir,
                                  struct kernfs_node *kn)
{
        int rc = call_int_hook(kernfs_init_security, kn_dir, kn);

        return rc;
}

/**
 * security_file_permission() - Check file permissions
 * @file: file
 * @mask: requested permissions
 *
 * Check file permissions before accessing an open file.  Various operations
 * that read or write files call this hook, so a security module can use it to
 * perform additional checking on those operations, e.g. to revalidate
 * permissions on use in support of privilege bracketing or policy changes.
 *
 * This hook runs when the actual read/write operations are performed, whereas
 * the inode_security_ops hook is called when a file is opened (as well as for
 * many other operations).  It can revalidate permissions for the system call
 * operations that read or write files, but it does not cover memory-mapped
 * files; security modules that need such revalidation must handle it
 * separately.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_permission(struct file *file, int mask)
{
        int rc = call_int_hook(file_permission, file, mask);

        return rc;
}

/**
 * security_file_alloc() - Allocate and init a file's LSM blob
 * @file: the file
 *
 * Allocate and attach a security structure to the file->f_security field.  The
 * security field is initialized to NULL when the structure is first created.
 * If the file_alloc_security hook reports an error, the blob is released
 * again through security_file_free() before returning.
 *
 * Return: Return 0 if the hook is successful and permission is granted.
 */
int security_file_alloc(struct file *file)
{
        int err;

        err = lsm_file_alloc(file);
        if (err)
                return err;

        err = call_int_hook(file_alloc_security, file);
        if (unlikely(err)) {
                /* Undo the blob allocation done above. */
                security_file_free(file);
                return err;
        }

        return 0;
}

/**
 * security_file_release() - Perform actions before releasing the file ref
 * @file: the file
 *
 * Perform actions before releasing the last reference to a file.  This only
 * invokes the file_release hooks; it returns nothing to the caller.
 */
void security_file_release(struct file *file)
{
        call_void_hook(file_release, file);
}

/**
 * security_file_free() - Free a file's LSM blob
 * @file: the file
 *
 * Run the file_free_security hooks, then release the security structure
 * stored in file->f_security (if any) back to lsm_file_cache and clear the
 * field.
 */
void security_file_free(struct file *file)
{
        void *sec;

        call_void_hook(file_free_security, file);

        /* Read the pointer only after the hooks have run. */
        sec = file->f_security;
        if (!sec)
                return;

        file->f_security = NULL;
        kmem_cache_free(lsm_file_cache, sec);
}

/**
 * security_file_ioctl() - Check if an ioctl is allowed
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Check permission for an ioctl operation on @file.  @arg is sometimes a user
 * space pointer and sometimes a simple integer value; when it is a user space
 * pointer, the security module should never use it.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_ioctl, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl);

/**
 * security_file_ioctl_compat() - Check if an ioctl is allowed in compat mode
 * @file: associated file
 * @cmd: ioctl cmd
 * @arg: ioctl arguments
 *
 * Counterpart of security_file_ioctl() for compat mode, correctly handling
 * 32-bit processes running on 64-bit kernels.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_ioctl_compat(struct file *file, unsigned int cmd,
                               unsigned long arg)
{
        int rc = call_int_hook(file_ioctl_compat, file, cmd, arg);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_ioctl_compat);

/*
 * Work out the protection bits to hand to the mmap_file hook for a request
 * of @prot on @file (NULL for an anonymous mapping): either @prot unchanged,
 * or @prot with PROT_EXEC added when READ_IMPLIES_EXEC applies.
 */
static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
{
        /*
         * Do we have PROT_READ without PROT_EXEC, and does the application
         * expect PROT_READ to imply PROT_EXEC?  If not, nothing to talk
         * about...
         */
        if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ ||
            !(current->personality & READ_IMPLIES_EXEC))
                return prot;

        /* An anonymous mapping always gets the implied PROT_EXEC. */
        if (!file)
                return prot | PROT_EXEC;

        /* Anything on a noexec mount won't get PROT_EXEC. */
        if (path_noexec(&file->f_path))
                return prot;

#ifndef CONFIG_MMU
        /*
         * On !MMU, a file that reports its mmap capabilities additionally
         * needs NOMMU_MAP_EXEC (== VM_MAYEXEC) to get PROT_EXEC.
         */
        if (file->f_op->mmap_capabilities &&
            !(file->f_op->mmap_capabilities(file) & NOMMU_MAP_EXEC))
                return prot;
#endif
        return prot | PROT_EXEC;
}

/**
 * security_mmap_file() - Check if mmap'ing a file is allowed
 * @file: file
 * @prot: protection applied by the kernel
 * @flags: flags
 *
 * Check permissions for a mmap operation.  The @file may be NULL, e.g. if
 * mapping anonymous memory.  The hooks receive both @prot as passed in and
 * the value computed from it by mmap_prot().
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_file(struct file *file, unsigned long prot,
                       unsigned long flags)
{
        int rc;

        rc = call_int_hook(mmap_file, file, prot, mmap_prot(file, prot),
                           flags);
        return rc;
}

/**
 * security_mmap_addr() - Check if mmap'ing an address is allowed
 * @addr: address
 *
 * Check permissions for a mmap operation at the address @addr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_mmap_addr(unsigned long addr)
{
        int rc = call_int_hook(mmap_addr, addr);

        return rc;
}

/**
 * security_file_mprotect() - Check if changing memory protections is allowed
 * @vma: memory region
 * @reqprot: application requested protection
 * @prot: protection applied by the kernel
 *
 * Check permissions before the memory access permissions of @vma are changed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                           unsigned long prot)
{
        int rc = call_int_hook(file_mprotect, vma, reqprot, prot);

        return rc;
}

/**
 * security_file_lock() - Check if a file lock is allowed
 * @file: file
 * @cmd: lock operation (e.g. F_RDLCK, F_WRLCK)
 *
 * Check permission before performing file locking operations.  The hook
 * mediates both flock and fcntl style locks.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_lock(struct file *file, unsigned int cmd)
{
        int rc = call_int_hook(file_lock, file, cmd);

        return rc;
}

/**
 * security_file_fcntl() - Check if fcntl() op is allowed
 * @file: file
 * @cmd: fcntl command
 * @arg: command argument
 *
 * Check permission before the file operation specified by @cmd is performed
 * on @file.  @arg is sometimes a user space pointer and sometimes a simple
 * integer value; when it is a user space pointer, the security module should
 * never use it.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        int rc = call_int_hook(file_fcntl, file, cmd, arg);

        return rc;
}

/**
 * security_file_set_fowner() - Set the file owner info in the LSM blob
 * @file: the file
 *
 * Save owner security information (typically from current->security) in
 * file->f_security for later use by the send_sigiotask hook.
 *
 * This hook is called with file->f_owner.lock held.
 *
 * This function returns nothing; any result of the hooks is not reported to
 * the caller.
 */
void security_file_set_fowner(struct file *file)
{
        call_void_hook(file_set_fowner, file);
}

/**
 * security_file_send_sigiotask() - Check if sending SIGIO/SIGURG is allowed
 * @tsk: target task
 * @fown: signal sender
 * @sig: signal to be sent, SIGIO is sent if 0
 *
 * Check permission for the file owner @fown to send SIGIO or SIGURG to the
 * process @tsk.  This hook is sometimes called from interrupt.  The
 * fown_struct @fown is never outside the context of a struct file, so the
 * file structure (and associated security information) can always be
 * obtained: container_of(fown, struct file, f_owner).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_send_sigiotask(struct task_struct *tsk,
                                 struct fown_struct *fown, int sig)
{
        int rc = call_int_hook(file_send_sigiotask, tsk, fown, sig);

        return rc;
}

/**
 * security_file_receive() - Check if receiving a file via IPC is allowed
 * @file: file being received
 *
 * Lets security modules control whether a process may receive an open file
 * descriptor via socket IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_receive(struct file *file)
{
        int rc = call_int_hook(file_receive, file);

        return rc;
}

/**
 * security_file_open() - Save open() time state for later use by the LSM
 * @file: the file
 *
 * Save open-time permission checking state for later use upon file_permission,
 * and recheck access if anything has changed since inode_permission.
 *
 * Whether a file is opened for execution (e.g. execve(2) call), either
 * directly or indirectly (e.g. ELF's ld.so), can be determined by checking
 * file->f_flags & __FMODE_EXEC .
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_open(struct file *file)
{
        int rc = call_int_hook(file_open, file);

        return rc;
}

/**
 * security_file_post_open() - Evaluate a file after it has been opened
 * @file: the file
 * @mask: access mask
 *
 * Evaluate an opened file together with the access mask requested with
 * open().  This is useful for LSMs that need the file content to be available
 * in order to make decisions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_post_open(struct file *file, int mask)
{
        int rc = call_int_hook(file_post_open, file, mask);

        return rc;
}
EXPORT_SYMBOL_GPL(security_file_post_open);

/**
 * security_file_truncate() - Check if truncating a file is allowed
 * @file: file
 *
 * Check permission before truncating a file, i.e. using ftruncate.
 * Truncation permission may also be checked based on the path, using the
 * @path_truncate hook.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_file_truncate(struct file *file)
{
        int rc = call_int_hook(file_truncate, file);

        return rc;
}

/**
 * security_task_alloc() - Allocate a task's LSM blob
 * @task: the task
 * @clone_flags: flags indicating what is being shared
 *
 * Handle allocation of task-related resources.  If the task_alloc hook
 * reports an error, security_task_free() is called on @task before returning.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_task_alloc(struct task_struct *task, unsigned long clone_flags)
{
        int err;

        err = lsm_task_alloc(task);
        if (err)
                return err;

        err = call_int_hook(task_alloc, task, clone_flags);
        if (unlikely(err)) {
                /* Undo the blob allocation done above. */
                security_task_free(task);
                return err;
        }

        return 0;
}

/**
 * security_task_free() - Free a task's LSM blob and related resources
 * @task: task
 *
 * Handle release of task-related resources: run the task_free hooks, then
 * free task->security and clear the field.  This can be called from interrupt
 * context.
 */
void security_task_free(struct task_struct *task)
{
        void *blob;

        call_void_hook(task_free, task);

        /* Read the pointer only after the hooks have run. */
        blob = task->security;
        kfree(blob);
        task->security = NULL;
}

/**
 * security_cred_alloc_blank() - Allocate the min memory to allow cred_transfer
 * @cred: credentials
 * @gfp: gfp flags
 *
 * Only allocate sufficient memory and attach to @cred such that
 * cred_transfer() will not get ENOMEM.  If the cred_alloc_blank hook reports
 * an error, security_cred_free() is called on @cred before returning.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_cred_alloc_blank(struct cred *cred, gfp_t gfp)
{
        int err;

        err = lsm_cred_alloc(cred, gfp);
        if (err)
                return err;

        err = call_int_hook(cred_alloc_blank, cred, gfp);
        if (unlikely(err)) {
                /* Undo the blob allocation done above. */
                security_cred_free(cred);
                return err;
        }

        return 0;
}

/**
 * security_cred_free() - Free the cred's LSM blob and associated resources
 * @cred: credentials
 *
 * Run the cred_free hooks, then deallocate and clear the cred->security field
 * in a set of credentials.  Nothing is done when cred->security is already
 * NULL.
 */
void security_cred_free(struct cred *cred)
{
        /*
         * A failure case in prepare_creds() may lead to a call here
         * while ->security is still NULL; there is nothing to release then.
         */
        if (unlikely(!cred->security))
                return;

        call_void_hook(cred_free, cred);

        kfree(cred->security);
        cred->security = NULL;
}

/**
 * security_prepare_creds() - Prepare a new set of credentials
 * @new: new credentials
 * @old: original credentials
 * @gfp: gfp flags
 *
 * Prepare a new set of credentials by copying the data from the old set.  If
 * the cred_prepare hook reports an error, security_cred_free() is called on
 * @new before returning.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp)
{
        int err;

        err = lsm_cred_alloc(new, gfp);
        if (err)
                return err;

        err = call_int_hook(cred_prepare, new, old, gfp);
        if (unlikely(err)) {
                /* Undo the blob allocation done above. */
                security_cred_free(new);
                return err;
        }

        return 0;
}

/**
 * security_transfer_creds() - Transfer creds
 * @new: target credentials
 * @old: original credentials
 *
 * Transfer data from original creds to new creds.  This only invokes the
 * cred_transfer hooks; it returns nothing to the caller.
 */
void security_transfer_creds(struct cred *new, const struct cred *old)
{
        call_void_hook(cred_transfer, new, old);
}

/**
 * security_cred_getsecid() - Get the secid from a set of credentials
 * @c: credentials
 * @secid: secid value
 *
 * Retrieve the security identifier of the cred structure @c.  In case of
 * failure, @secid will be set to zero.
 */
void security_cred_getsecid(const struct cred *c, u32 *secid)
{
        /* Start from zero so @secid is defined whatever the hooks do. */
        *secid = 0;
        call_void_hook(cred_getsecid, c, secid);
}
EXPORT_SYMBOL(security_cred_getsecid);

/**
 * security_cred_getlsmprop() - Get the LSM data from a set of credentials
 * @c: credentials
 * @prop: destination for the LSM data
 *
 * Retrieve the security data of the cred structure @c.  In case of
 * failure, @prop will be cleared.
 */
void security_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
{
        /* Initialize @prop before the hooks get a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(cred_getlsmprop, c, prop);
}
EXPORT_SYMBOL(security_cred_getlsmprop);

/**
 * security_kernel_act_as() - Set the kernel credentials to act as secid
 * @new: credentials
 * @secid: secid
 *
 * Set the credentials for a kernel service to act as (subjective context).
 * The current task must be the one that nominated @secid.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_act_as(struct cred *new, u32 secid)
{
        int rc = call_int_hook(kernel_act_as, new, secid);

        return rc;
}

/**
 * security_kernel_create_files_as() - Set file creation context using an inode
 * @new: target credentials
 * @inode: reference inode
 *
 * Make the file creation context in the credentials @new the same as the
 * objective context of @inode.  The current task must be the one that
 * nominated @inode.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        int rc = call_int_hook(kernel_create_files_as, new, inode);

        return rc;
}

/**
 * security_kernel_module_request() - Check if loading a module is allowed
 * @kmod_name: module name
 *
 * Check the ability to trigger the kernel to automatically upcall to
 * userspace for userspace to load a kernel module with the given name.
 *
 * Return: Returns 0 if successful.
 */
int security_kernel_module_request(char *kmod_name)
{
        int rc = call_int_hook(kernel_module_request, kmod_name);

        return rc;
}

/**
 * security_kernel_read_file() - Read a file specified by userspace
 * @file: file
 * @id: file identifier
 * @contents: true if security_kernel_post_read_file() will be called
 *
 * Read a file specified by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_read_file(struct file *file, enum kernel_read_file_id id,
                              bool contents)
{
        int rc = call_int_hook(kernel_read_file, file, id, contents);

        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_read_file);

/**
 * security_kernel_post_read_file() - Read a file specified by userspace
 * @file: file
 * @buf: file contents
 * @size: size of file contents
 * @id: file identifier
 *
 * Read a file specified by userspace.  This must be paired with a prior
 * security_kernel_read_file() call that indicated this hook would also be
 * called, see security_kernel_read_file() for more information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_read_file(struct file *file, char *buf, loff_t size,
                                   enum kernel_read_file_id id)
{
        int rc = call_int_hook(kernel_post_read_file, file, buf, size, id);

        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_read_file);

/**
 * security_kernel_load_data() - Load data provided by userspace
 * @id: data identifier
 * @contents: true if security_kernel_post_load_data() will be called
 *
 * Load data provided by userspace.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        int rc = call_int_hook(kernel_load_data, id, contents);

        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_load_data);

/**
 * security_kernel_post_load_data() - Load userspace data from a non-file source
 * @buf: data
 * @size: size of data
 * @id: data identifier
 * @description: text description of data, specific to the id value
 *
 * Load data provided by a non-file source (usually userspace buffer).  This
 * must be paired with a prior security_kernel_load_data() call that indicated
 * this hook would also be called, see security_kernel_load_data() for more
 * information.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_kernel_post_load_data(char *buf, loff_t size,
                                   enum kernel_load_data_id id,
                                   char *description)
{
        int rc;

        rc = call_int_hook(kernel_post_load_data, buf, size, id, description);
        return rc;
}
EXPORT_SYMBOL_GPL(security_kernel_post_load_data);

/**
 * security_task_fix_setuid() - Update LSM with new user id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag values
 *
 * Update the module's state after setting one or more of the user identity
 * attributes of the current process.  The @flags parameter indicates which of
 * the set*uid system calls invoked this hook.  @new is the set of credentials
 * that will be installed.  Modifications should be made to this rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setuid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc = call_int_hook(task_fix_setuid, new, old, flags);

        return rc;
}

/**
 * security_task_fix_setgid() - Update LSM with new group id attributes
 * @new: updated credentials
 * @old: credentials being replaced
 * @flags: LSM_SETID_* flag value
 *
 * Update the module's state after one or more of the group identity
 * attributes of the current process have been set.  @flags indicates which of
 * the set*gid system calls invoked this hook.  @new is the set of credentials
 * that will be installed; modifications should be made to it rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgid(struct cred *new, const struct cred *old,
                             int flags)
{
        int rc = call_int_hook(task_fix_setgid, new, old, flags);

        return rc;
}

/**
 * security_task_fix_setgroups() - Update LSM with new supplementary groups
 * @new: updated credentials
 * @old: credentials being replaced
 *
 * Update the module's state after the supplementary group identity attributes
 * of the current process have been set.  @new is the set of credentials that
 * will be installed; modifications should be made to it rather than to
 * @current->cred.
 *
 * Return: Returns 0 on success.
 */
int security_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        int rc = call_int_hook(task_fix_setgroups, new, old);

        return rc;
}

/**
 * security_task_setpgid() - Check if setting the pgid is allowed
 * @p: task being modified
 * @pgid: new pgid
 *
 * Check permission before the process group identifier of the process @p is
 * set to @pgid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setpgid(struct task_struct *p, pid_t pgid)
{
        int rc = call_int_hook(task_setpgid, p, pgid);

        return rc;
}

/**
 * security_task_getpgid() - Check if getting the pgid is allowed
 * @p: task
 *
 * Check permission before the process group identifier of the process @p is
 * retrieved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getpgid(struct task_struct *p)
{
        int rc = call_int_hook(task_getpgid, p);

        return rc;
}

/**
 * security_task_getsid() - Check if getting the session id is allowed
 * @p: task
 *
 * Check permission before the session identifier of the process @p is
 * retrieved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getsid(struct task_struct *p)
{
        int rc = call_int_hook(task_getsid, p);

        return rc;
}

/**
 * security_current_getlsmprop_subj() - Current task's subjective LSM data
 * @prop: lsm specific information
 *
 * Retrieve the subjective security identifier of the current task and return
 * it in @prop.
 */
void security_current_getlsmprop_subj(struct lsm_prop *prop)
{
        /* Initialize @prop before the hooks get a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(current_getlsmprop_subj, prop);
}
EXPORT_SYMBOL(security_current_getlsmprop_subj);

/**
 * security_task_getlsmprop_obj() - Get a task's objective LSM data
 * @p: target task
 * @prop: lsm specific information
 *
 * Retrieve the objective security identifier of the task_struct in @p and
 * return it in @prop.
 */
void security_task_getlsmprop_obj(struct task_struct *p, struct lsm_prop *prop)
{
        /* Initialize @prop before the hooks get a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(task_getlsmprop_obj, p, prop);
}
EXPORT_SYMBOL(security_task_getlsmprop_obj);

/**
 * security_task_setnice() - Check if setting a task's nice value is allowed
 * @p: target task
 * @nice: nice value
 *
 * Check permission before the nice value of @p is set to @nice.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setnice(struct task_struct *p, int nice)
{
        int rc = call_int_hook(task_setnice, p, nice);

        return rc;
}

/**
 * security_task_setioprio() - Check if setting a task's ioprio is allowed
 * @p: target task
 * @ioprio: ioprio value
 *
 * Check permission before the ioprio value of @p is set to @ioprio.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setioprio(struct task_struct *p, int ioprio)
{
        int rc = call_int_hook(task_setioprio, p, ioprio);

        return rc;
}

/**
 * security_task_getioprio() - Check if getting a task's ioprio is allowed
 * @p: task
 *
 * Check permission before the ioprio value of @p is retrieved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getioprio(struct task_struct *p)
{
        int rc = call_int_hook(task_getioprio, p);

        return rc;
}

/**
 * security_task_prlimit() - Check if get/setting resources limits is allowed
 * @cred: current task credentials
 * @tcred: target task credentials
 * @flags: LSM_PRLIMIT_* flag bits indicating a get/set/both
 *
 * Check permission before the resource limits of another task are read
 * and/or set.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_prlimit(const struct cred *cred, const struct cred *tcred,
                          unsigned int flags)
{
        int rc = call_int_hook(task_prlimit, cred, tcred, flags);

        return rc;
}

/**
 * security_task_setrlimit() - Check if setting a new rlimit value is allowed
 * @p: target task's group leader
 * @resource: resource whose limit is being set
 * @new_rlim: new resource limit
 *
 * Check permission before the resource limits of process @p for @resource
 * are set to @new_rlim.  The old resource limit values can be examined by
 * dereferencing (p->signal->rlim + resource).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setrlimit(struct task_struct *p, unsigned int resource,
                            struct rlimit *new_rlim)
{
        int rc = call_int_hook(task_setrlimit, p, resource, new_rlim);

        return rc;
}

/**
 * security_task_setscheduler() - Check if setting sched policy/param is allowed
 * @p: target task
 *
 * Check permission before the scheduling policy and/or parameters of process
 * @p are set.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_setscheduler(struct task_struct *p)
{
        int rc = call_int_hook(task_setscheduler, p);

        return rc;
}

/**
 * security_task_getscheduler() - Check if getting scheduling info is allowed
 * @p: target task
 *
 * Check permission before scheduling information for process @p is obtained.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_getscheduler(struct task_struct *p)
{
        int rc = call_int_hook(task_getscheduler, p);

        return rc;
}

/**
 * security_task_movememory() - Check if moving memory is allowed
 * @p: task
 *
 * Check permission before memory owned by process @p is moved.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_movememory(struct task_struct *p)
{
        int rc = call_int_hook(task_movememory, p);

        return rc;
}

/**
 * security_task_kill() - Check if sending a signal is allowed
 * @p: target process
 * @info: signal information
 * @sig: signal value
 * @cred: credentials of the signal sender, NULL if @current
 *
 * Check permission before signal @sig is sent to @p.  @info can be NULL, the
 * constant 1, or a pointer to a kernel_siginfo structure.  If @info is 1 or
 * SI_FROMKERNEL(info) is true, the signal should be viewed as coming from the
 * kernel and should typically be permitted.  SIGIO signals are handled
 * separately by the send_sigiotask hook in file_security_ops.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                       int sig, const struct cred *cred)
{
        int rc = call_int_hook(task_kill, p, info, sig, cred);

        return rc;
}

/**
 * security_task_prctl() - Check if a prctl op is allowed
 * @option: operation
 * @arg2: argument
 * @arg3: argument
 * @arg4: argument
 * @arg5: argument
 *
 * Check permission before performing a process control operation on the
 * current process.
 *
 * Return: Return -ENOSYS if no-one wanted to handle this op, any other value
 *         to cause prctl() to return immediately with that value.
 */
int security_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                        unsigned long arg4, unsigned long arg5)
{
        struct lsm_static_call *scall;
        int result = LSM_RET_DEFAULT(task_prctl);

        lsm_for_each_hook(scall, task_prctl) {
                int hook_rc = scall->hl->hook.task_prctl(option, arg2, arg3,
                                                         arg4, arg5);

                /* This hook did not handle the op; try the next one. */
                if (hook_rc == LSM_RET_DEFAULT(task_prctl))
                        continue;

                /*
                 * Remember the answer.  A zero answer still lets the
                 * remaining hooks run; any other value ends the walk.
                 */
                result = hook_rc;
                if (hook_rc != 0)
                        break;
        }

        return result;
}

/**
 * security_task_to_inode() - Set the security attributes of a task's inode
 * @p: task
 * @inode: inode
 *
 * Set the security attributes for an inode based on an associated task's
 * security attributes, e.g. for /proc/pid inodes.  This only invokes the
 * task_to_inode hooks; it returns nothing to the caller.
 */
void security_task_to_inode(struct task_struct *p, struct inode *inode)
{
        call_void_hook(task_to_inode, p, inode);
}

/**
 * security_create_user_ns() - Check if creating a new userns is allowed
 * @cred: prepared creds
 *
 * Check permission prior to the creation of a new user namespace.
 *
 * Return: Returns 0 if successful, otherwise < 0 error code.
 */
int security_create_user_ns(const struct cred *cred)
{
        int rc = call_int_hook(userns_create, cred);

        return rc;
}

/**
 * security_ipc_permission() - Check if sysv ipc access is allowed
 * @ipcp: ipc permission structure
 * @flag: requested permissions
 *
 * Check permissions for access to IPC.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        int rc = call_int_hook(ipc_permission, ipcp, flag);

        return rc;
}

/**
 * security_ipc_getlsmprop() - Get the sysv ipc object LSM data
 * @ipcp: ipc permission structure
 * @prop: pointer to lsm information
 *
 * Get the lsm information associated with the ipc object.
 */

void security_ipc_getlsmprop(struct kern_ipc_perm *ipcp, struct lsm_prop *prop)
{
        /* Initialize @prop before the hooks get a chance to fill it in. */
        lsmprop_init(prop);
        call_void_hook(ipc_getlsmprop, ipcp, prop);
}

/**
 * security_msg_msg_alloc() - Allocate a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Allocate and attach a security structure to the msg->security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Return 0 if operation was successful and permission is granted.
 */
int security_msg_msg_alloc(struct msg_msg *msg)
{
        int err;

        /* Allocate first; the hook is not called if this fails. */
        err = lsm_msg_msg_alloc(msg);
        if (unlikely(err))
                return err;

        err = call_int_hook(msg_msg_alloc_security, msg);
        if (unlikely(err)) {
                /* Hook failure: undo the allocation before reporting it. */
                security_msg_msg_free(msg);
        }

        return err;
}

/**
 * security_msg_msg_free() - Free a sysv ipc message LSM blob
 * @msg: message structure
 *
 * Deallocate the security structure for this message.
 */
void security_msg_msg_free(struct msg_msg *msg)
{
        /* The hook runs first, while msg->security is still attached. */
        call_void_hook(msg_msg_free_security, msg);
        kfree(msg->security);
        /* Clear the pointer so @msg does not keep referring to freed memory. */
        msg->security = NULL;
}

/**
 * security_msg_queue_alloc() - Allocate a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Allocate and attach a security structure to @msq. The security field is
 * initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_msg_queue_alloc(struct kern_ipc_perm *msq)
{
        int err;

        /* Allocate first; the hook is not called if this fails. */
        err = lsm_ipc_alloc(msq);
        if (unlikely(err))
                return err;

        err = call_int_hook(msg_queue_alloc_security, msq);
        if (unlikely(err)) {
                /* Hook failure: undo the allocation before reporting it. */
                security_msg_queue_free(msq);
        }

        return err;
}

/**
 * security_msg_queue_free() - Free a sysv ipc msg queue LSM blob
 * @msq: sysv ipc permission structure
 *
 * Deallocate security field @msq->security for the message queue.
 */
void security_msg_queue_free(struct kern_ipc_perm *msq)
{
        /* The hook runs first, while msq->security is still attached. */
        call_void_hook(msg_queue_free_security, msq);
        kfree(msq->security);
        /* Clear the pointer so @msq does not keep referring to freed memory. */
        msq->security = NULL;
}

/**
 * security_msg_queue_associate() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @msqflg: operation flags
 *
 * Check permission when a message queue is requested through the msgget system
 * call. This hook is only called when returning the message queue identifier
 * for an existing message queue, not when a new message queue is created.
 *
 * Return: Return 0 if permission is granted.
 */
int security_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        int rc;

        /* The msg_queue_associate hook verdict is returned unchanged. */
        rc = call_int_hook(msg_queue_associate, msq, msqflg);
        return rc;
}

/**
 * security_msg_queue_msgctl() - Check if a msg queue operation is allowed
 * @msq: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a message control operation specified by @cmd is to be
 * performed on the message queue with permissions.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        int rc;

        /* The msg_queue_msgctl hook verdict is returned unchanged. */
        rc = call_int_hook(msg_queue_msgctl, msq, cmd);
        return rc;
}

/**
 * security_msg_queue_msgsnd() - Check if sending a sysv ipc message is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @msqflg: operation flags
 *
 * Check permission before a message, @msg, is enqueued on the message queue
 * with permissions specified in @msq.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgsnd(struct kern_ipc_perm *msq,
                              struct msg_msg *msg, int msqflg)
{
        int rc;

        /* The msg_queue_msgsnd hook verdict is returned unchanged. */
        rc = call_int_hook(msg_queue_msgsnd, msq, msg, msqflg);
        return rc;
}

/**
 * security_msg_queue_msgrcv() - Check if receiving a sysv ipc msg is allowed
 * @msq: sysv ipc permission structure
 * @msg: message
 * @target: target task
 * @type: type of message requested
 * @mode: operation flags
 *
 * Check permission before a message, @msg, is removed from the message queue.
 * The @target task structure contains a pointer to the process that will be
 * receiving the message (not equal to the current process when inline receives
 * are being performed).
 *
 * Return: Returns 0 if permission is granted.
 */
int security_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                              struct task_struct *target, long type, int mode)
{
        int rc;

        /* All five arguments reach the msg_queue_msgrcv hook untouched. */
        rc = call_int_hook(msg_queue_msgrcv, msq, msg, target, type, mode);
        return rc;
}

/**
 * security_shm_alloc() - Allocate a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Allocate and attach a security structure to the @shp security field.  The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_shm_alloc(struct kern_ipc_perm *shp)
{
        int err;

        /* Allocate first; the hook is not called if this fails. */
        err = lsm_ipc_alloc(shp);
        if (unlikely(err))
                return err;

        err = call_int_hook(shm_alloc_security, shp);
        if (unlikely(err)) {
                /* Hook failure: undo the allocation before reporting it. */
                security_shm_free(shp);
        }

        return err;
}

/**
 * security_shm_free() - Free a sysv shm LSM blob
 * @shp: sysv ipc permission structure
 *
 * Deallocate the security structure @shp->security for the memory segment.
 */
void security_shm_free(struct kern_ipc_perm *shp)
{
        /* The hook runs first, while shp->security is still attached. */
        call_void_hook(shm_free_security, shp);
        kfree(shp->security);
        /* Clear the pointer so @shp does not keep referring to freed memory. */
        shp->security = NULL;
}

/**
 * security_shm_associate() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @shmflg: operation flags
 *
 * Check permission when a shared memory region is requested through the shmget
 * system call. This hook is only called when returning the shared memory
 * region identifier for an existing region, not when a new shared memory
 * region is created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        int rc;

        /* The shm_associate hook verdict is returned unchanged. */
        rc = call_int_hook(shm_associate, shp, shmflg);
        return rc;
}

/**
 * security_shm_shmctl() - Check if a sysv shm operation is allowed
 * @shp: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a shared memory control operation specified by @cmd is
 * to be performed on the shared memory region with permissions in @shp.
 *
 * Return: Return 0 if permission is granted.
 */
int security_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        int rc;

        /* The shm_shmctl hook verdict is returned unchanged. */
        rc = call_int_hook(shm_shmctl, shp, cmd);
        return rc;
}

/**
 * security_shm_shmat() - Check if a sysv shm attach operation is allowed
 * @shp: sysv ipc permission structure
 * @shmaddr: address of memory region to attach
 * @shmflg: operation flags
 *
 * Check permissions prior to allowing the shmat system call to attach the
 * shared memory segment with permissions @shp to the data segment of the
 * calling process. The attaching address is specified by @shmaddr.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_shm_shmat(struct kern_ipc_perm *shp,
                       char __user *shmaddr, int shmflg)
{
        int rc;

        /* The shm_shmat hook verdict is returned unchanged. */
        rc = call_int_hook(shm_shmat, shp, shmaddr, shmflg);
        return rc;
}

/**
 * security_sem_alloc() - Allocate a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Allocate and attach a security structure to the @sma security field. The
 * security field is initialized to NULL when the structure is first created.
 *
 * Return: Returns 0 if operation was successful and permission is granted.
 */
int security_sem_alloc(struct kern_ipc_perm *sma)
{
        int err;

        /* Allocate first; the hook is not called if this fails. */
        err = lsm_ipc_alloc(sma);
        if (unlikely(err))
                return err;

        err = call_int_hook(sem_alloc_security, sma);
        if (unlikely(err)) {
                /* Hook failure: undo the allocation before reporting it. */
                security_sem_free(sma);
        }

        return err;
}

/**
 * security_sem_free() - Free a sysv semaphore LSM blob
 * @sma: sysv ipc permission structure
 *
 * Deallocate security structure @sma->security for the semaphore.
 */
void security_sem_free(struct kern_ipc_perm *sma)
{
        /* The hook runs first, while sma->security is still attached. */
        call_void_hook(sem_free_security, sma);
        kfree(sma->security);
        /* Clear the pointer so @sma does not keep referring to freed memory. */
        sma->security = NULL;
}

/**
 * security_sem_associate() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @semflg: operation flags
 *
 * Check permission when a semaphore is requested through the semget system
 * call. This hook is only called when returning the semaphore identifier for
 * an existing semaphore, not when a new one must be created.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        int rc;

        /* The sem_associate hook verdict is returned unchanged. */
        rc = call_int_hook(sem_associate, sma, semflg);
        return rc;
}

/**
 * security_sem_semctl() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @cmd: operation
 *
 * Check permission when a semaphore operation specified by @cmd is to be
 * performed on the semaphore.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        int rc;

        /* The sem_semctl hook verdict is returned unchanged. */
        rc = call_int_hook(sem_semctl, sma, cmd);
        return rc;
}

/**
 * security_sem_semop() - Check if a sysv semaphore operation is allowed
 * @sma: sysv ipc permission structure
 * @sops: operations to perform
 * @nsops: number of operations
 * @alter: flag indicating changes will be made
 *
 * Check permissions before performing operations on members of the semaphore
 * set. If the @alter flag is nonzero, the semaphore set may be modified.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sem_semop(struct kern_ipc_perm *sma, struct sembuf *sops,
                       unsigned nsops, int alter)
{
        int rc;

        /* The sem_semop hook verdict is returned unchanged. */
        rc = call_int_hook(sem_semop, sma, sops, nsops, alter);
        return rc;
}

/**
 * security_d_instantiate() - Populate an inode's LSM state based on a dentry
 * @dentry: dentry
 * @inode: inode
 *
 * Fill in @inode security information for a @dentry if allowed.
 */
void security_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (unlikely(inode && IS_PRIVATE(inode)))
                return;
        call_void_hook(d_instantiate, dentry, inode);
}
EXPORT_SYMBOL(security_d_instantiate);

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_getselfattr - Read an LSM attribute of the current process.
 * @attr: which attribute to return
 * @uctx: the user-space destination for the information, or NULL
 * @size: pointer to the size of space available to receive the data
 * @flags: special handling options. LSM_FLAG_SINGLE indicates that only
 * attributes associated with the LSM identified in the passed @uctx be
 * reported.
 *
 * A NULL value for @uctx can be used to get both the number of attributes
 * and the size of the data.
 *
 * Returns the number of attributes found on success, negative value
 * on error. @size is reset to the total size of the data.
 * If @size is insufficient to contain the data -E2BIG is returned.
 */
int security_getselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 __user *size, u32 flags)
{
        struct lsm_static_call *scall;
        struct lsm_ctx lctx = { .id = LSM_ID_UNDEF, };
        u8 __user *base = (u8 __user *)uctx;    /* start of the user buffer */
        u32 entrysize;
        u32 total = 0;          /* bytes accounted for; written back to *size */
        u32 left;               /* space remaining; initially read from *size */
        bool toobig = false;
        bool single = false;
        int count = 0;
        int rc;

        if (attr == LSM_ATTR_UNDEF)
                return -EINVAL;
        if (size == NULL)
                return -EINVAL;
        if (get_user(left, size))
                return -EFAULT;

        if (flags) {
                /*
                 * Only flag supported is LSM_FLAG_SINGLE
                 */
                if (flags != LSM_FLAG_SINGLE || !uctx)
                        return -EINVAL;
                /* In single mode the caller's lsm_ctx header selects the LSM. */
                if (copy_from_user(&lctx, uctx, sizeof(lctx)))
                        return -EFAULT;
                /*
                 * If the LSM ID isn't specified it is an error.
                 */
                if (lctx.id == LSM_ID_UNDEF)
                        return -EINVAL;
                single = true;
        }

        /*
         * In the usual case gather all the data from the LSMs.
         * In the single case only get the data from the LSM specified.
         */
        lsm_for_each_hook(scall, getselfattr) {
                if (single && lctx.id != scall->hl->lsmid->id)
                        continue;
                /*
                 * Offer the hook whatever space is left. entrysize is read
                 * back after the call and added to total.
                 */
                entrysize = left;
                /*
                 * With a user buffer, this LSM's entry goes right after the
                 * bytes already accounted for. Without one, uctx stays NULL.
                 */
                if (base)
                        uctx = (struct lsm_ctx __user *)(base + total);
                rc = scall->hl->hook.getselfattr(attr, uctx, &entrysize, flags);
                /* -EOPNOTSUPP: skip this LSM; neither total nor count change. */
                if (rc == -EOPNOTSUPP)
                        continue;
                /*
                 * -E2BIG is remembered rather than returned at once: the
                 * remaining space is zeroed and the loop goes on, so total
                 * keeps accumulating. Any other negative value aborts here,
                 * before *size is updated.
                 */
                if (rc == -E2BIG) {
                        rc = 0;
                        left = 0;
                        toobig = true;
                } else if (rc < 0)
                        return rc;
                else
                        left -= entrysize;

                total += entrysize;
                count += rc;
                if (single)
                        break;
        }
        /* *size is updated even when -E2BIG is about to be returned. */
        if (put_user(total, size))
                return -EFAULT;
        if (toobig)
                return -E2BIG;
        if (count == 0)
                return LSM_RET_DEFAULT(getselfattr);
        return count;
}

/*
 * Please keep this in sync with its counterpart in security/lsm_syscalls.c
 */

/**
 * security_setselfattr - Set an LSM attribute on the current process.
 * @attr: which attribute to set
 * @uctx: the user-space source for the information
 * @size: the size of the data
 * @flags: reserved for future use, must be 0
 *
 * Set an LSM attribute for the current process. The LSM, attribute
 * and new value are included in @uctx.
 *
 * Returns 0 on success, -EINVAL if the input is inconsistent, -EFAULT
 * if the user buffer is inaccessible, -E2BIG if size is too big, or an
 * LSM specific failure.
 */
int security_setselfattr(unsigned int attr, struct lsm_ctx __user *uctx,
                         u32 size, u32 flags)
{
        struct lsm_static_call *scall;
        struct lsm_ctx *kctx;
        u64 min_len;
        int ret;

        /* Any non-zero @flags value is rejected. */
        if (flags)
                return -EINVAL;
        /* @size must cover the fixed header and may not exceed one page. */
        if (size < sizeof(*kctx))
                return -EINVAL;
        if (size > PAGE_SIZE)
                return -E2BIG;

        kctx = memdup_user(uctx, size);
        if (IS_ERR(kctx))
                return PTR_ERR(kctx);

        /*
         * Check the lengths userspace supplied: the declared entry length
         * must fit in what was copied, and must cover header + ctx_len
         * without that sum overflowing.
         */
        ret = -EINVAL;
        if (size < kctx->len)
                goto out_free;
        if (check_add_overflow(sizeof(*kctx), kctx->ctx_len, &min_len))
                goto out_free;
        if (kctx->len < min_len)
                goto out_free;

        /* Only the LSM whose id matches kctx->id is called. */
        ret = LSM_RET_DEFAULT(setselfattr);
        lsm_for_each_hook(scall, setselfattr) {
                if (scall->hl->lsmid->id != kctx->id)
                        continue;
                ret = scall->hl->hook.setselfattr(attr, kctx, size, flags);
                break;
        }

out_free:
        kfree(kctx);
        return ret;
}

/**
 * security_getprocattr() - Read an attribute for a task
 * @p: the task
 * @lsmid: LSM identification
 * @name: attribute name
 * @value: attribute value
 *
 * Read attribute @name for task @p and store it into @value if allowed.
 *
 * Return: Returns the length of @value on success, a negative value otherwise.
 */
int security_getprocattr(struct task_struct *p, int lsmid, const char *name,
                         char **value)
{
        struct lsm_static_call *scall;

        lsm_for_each_hook(scall, getprocattr) {
                /*
                 * An @lsmid of 0 accepts the first hook the iteration yields;
                 * otherwise only the LSM with the matching id answers.
                 */
                if (!lsmid || lsmid == scall->hl->lsmid->id)
                        return scall->hl->hook.getprocattr(p, name, value);
        }

        return LSM_RET_DEFAULT(getprocattr);
}

/**
 * security_setprocattr() - Set an attribute for a task
 * @lsmid: LSM identification
 * @name: attribute name
 * @value: attribute value
 * @size: attribute value size
 *
 * Write (set) the current task's attribute @name to @value, size @size if
 * allowed.
 *
 * Return: Returns bytes written on success, a negative value otherwise.
 */
int security_setprocattr(int lsmid, const char *name, void *value, size_t size)
{
        struct lsm_static_call *scall;

        lsm_for_each_hook(scall, setprocattr) {
                /*
                 * An @lsmid of 0 accepts the first hook the iteration yields;
                 * otherwise only the LSM with the matching id is called.
                 */
                if (!lsmid || lsmid == scall->hl->lsmid->id)
                        return scall->hl->hook.setprocattr(name, value, size);
        }

        return LSM_RET_DEFAULT(setprocattr);
}

/**
 * security_netlink_send() - Save info and check if netlink sending is allowed
 * @sk: sending socket
 * @skb: netlink message
 *
 * Save security information for a netlink message so that permission checking
 * can be performed when the message is processed.  The security information
 * can be saved using the eff_cap field of the netlink_skb_parms structure.
 * Also may be used to provide fine grained control over message transmission.
 *
 * Return: Returns 0 if the information was successfully saved and message is
 *         allowed to be transmitted.
 */
int security_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        int rc;

        /* The netlink_send hook verdict is returned unchanged. */
        rc = call_int_hook(netlink_send, sk, skb);
        return rc;
}

/**
 * security_ismaclabel() - Check if the named attribute is a MAC label
 * @name: full extended attribute name
 *
 * Check if the extended attribute specified by @name represents a MAC label.
 *
 * Return: Returns 1 if name is a MAC attribute otherwise returns 0.
 */
int security_ismaclabel(const char *name)
{
        int is_mac;

        /* The ismaclabel hook answer is returned unchanged. */
        is_mac = call_int_hook(ismaclabel, name);
        return is_mac;
}
EXPORT_SYMBOL(security_ismaclabel);

/**
 * security_secid_to_secctx() - Convert a secid to a secctx
 * @secid: secid
 * @cp: the LSM context
 *
 * Convert secid to security context.  If @cp is NULL the length of the
 * result will be returned, but no data will be returned.  This
 * does mean that the length could change between calls to check the length and
 * the next call which actually allocates and returns the data.
 *
 * Return: Return length of data on success, error on failure.
 */
int security_secid_to_secctx(u32 secid, struct lsm_context *cp)
{
        int len;

        /* The hook result (a length or an error) is returned unchanged. */
        len = call_int_hook(secid_to_secctx, secid, cp);
        return len;
}
EXPORT_SYMBOL(security_secid_to_secctx);

/**
 * security_lsmprop_to_secctx() - Convert a lsm_prop to a secctx
 * @prop: lsm specific information
 * @cp: the LSM context
 *
 * Convert a @prop entry to security context.  If @cp is NULL the
 * length of the result will be returned. This does mean that the
 * length could change between calls to check the length and the
 * next call which actually allocates and returns the @cp.
 *
 * Return: Return length of data on success, error on failure.
 */
int security_lsmprop_to_secctx(struct lsm_prop *prop, struct lsm_context *cp)
{
        int len;

        /* The hook result (a length or an error) is returned unchanged. */
        len = call_int_hook(lsmprop_to_secctx, prop, cp);
        return len;
}
EXPORT_SYMBOL(security_lsmprop_to_secctx);

/**
 * security_secctx_to_secid() - Convert a secctx to a secid
 * @secdata: secctx
 * @seclen: length of secctx
 * @secid: secid
 *
 * Convert security context to secid.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        /* Zero the output first so it holds a defined value whatever happens. */
        *secid = 0;

        rc = call_int_hook(secctx_to_secid, secdata, seclen, secid);
        return rc;
}
EXPORT_SYMBOL(security_secctx_to_secid);

/**
 * security_release_secctx() - Free a secctx buffer
 * @cp: the security context
 *
 * Release the security context.
 */
void security_release_secctx(struct lsm_context *cp)
{
        /* The hook sees the populated context; only afterwards is @cp wiped. */
        call_void_hook(release_secctx, cp);
        memset(cp, 0, sizeof(*cp));
}
EXPORT_SYMBOL(security_release_secctx);

/**
 * security_inode_invalidate_secctx() - Invalidate an inode's security label
 * @inode: inode
 *
 * Notify the security module that it must revalidate the security context of
 * an inode.
 */
void security_inode_invalidate_secctx(struct inode *inode)
{
        /* Void hook: @inode is passed through and nothing is returned. */
        call_void_hook(inode_invalidate_secctx, inode);
}
EXPORT_SYMBOL(security_inode_invalidate_secctx);

/**
 * security_inode_notifysecctx() - Notify the LSM of an inode's security label
 * @inode: inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Notify the security module of what the security context of an inode should
 * be.  Initializes the incore security context managed by the security module
 * for this inode.  Example usage: NFS client invokes this hook to initialize
 * the security context in its incore inode to the value provided by the server
 * for the file when the server returned the file's attributes to the client.
 * Must be called with inode->i_mutex locked.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc;

        /* The inode_notifysecctx hook result is returned unchanged. */
        rc = call_int_hook(inode_notifysecctx, inode, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_inode_notifysecctx);

/**
 * security_inode_setsecctx() - Change the security label of an inode
 * @dentry: dentry of the inode
 * @ctx: secctx
 * @ctxlen: length of secctx
 *
 * Change the security context of an inode.  Updates the incore security
 * context managed by the security module and invokes the fs code as needed
 * (via __vfs_setxattr_noperm) to update any backing xattrs that represent the
 * context.  Example usage: NFS server invokes this hook to change the security
 * context in its incore inode and on the backing filesystem to a value
 * provided by the client on a SETATTR operation.  Must be called with
 * inode->i_mutex locked.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        /* The inode_setsecctx hook result is returned unchanged. */
        rc = call_int_hook(inode_setsecctx, dentry, ctx, ctxlen);
        return rc;
}
EXPORT_SYMBOL(security_inode_setsecctx);

/**
 * security_inode_getsecctx() - Get the security label of an inode
 * @inode: inode
 * @cp: security context
 *
 * On success, returns 0 and fills out @cp with the security context
 * for the given @inode.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_inode_getsecctx(struct inode *inode, struct lsm_context *cp)
{
        int rc;

        /* Zero @cp before the hook runs, so it starts from a known state. */
        memset(cp, 0, sizeof(*cp));

        rc = call_int_hook(inode_getsecctx, inode, cp);
        return rc;
}
EXPORT_SYMBOL(security_inode_getsecctx);

#ifdef CONFIG_WATCH_QUEUE
/**
 * security_post_notification() - Check if a watch notification can be posted
 * @w_cred: credentials of the task that set the watch
 * @cred: credentials of the task which triggered the watch
 * @n: the notification
 *
 * Check to see if a watch notification can be posted to a particular queue.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_post_notification(const struct cred *w_cred,
                               const struct cred *cred,
                               struct watch_notification *n)
{
        int rc;

        /* The post_notification hook verdict is returned unchanged. */
        rc = call_int_hook(post_notification, w_cred, cred, n);
        return rc;
}
#endif /* CONFIG_WATCH_QUEUE */

#ifdef CONFIG_KEY_NOTIFICATIONS
/**
 * security_watch_key() - Check if a task is allowed to watch for key events
 * @key: the key to watch
 *
 * Check to see if a process is allowed to watch for event notifications from
 * a key or keyring.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_watch_key(struct key *key)
{
        int rc;

        /* The watch_key hook verdict is returned unchanged. */
        rc = call_int_hook(watch_key, key);
        return rc;
}
#endif /* CONFIG_KEY_NOTIFICATIONS */

#ifdef CONFIG_SECURITY_NETWORK
/**
 * security_unix_stream_connect() - Check if an AF_UNIX stream is allowed
 * @sock: originating sock
 * @other: peer sock
 * @newsk: new sock
 *
 * Check permissions before establishing a Unix domain stream connection
 * between @sock and @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_stream_connect(struct sock *sock, struct sock *other,
                                 struct sock *newsk)
{
        int rc;

        /* The unix_stream_connect hook verdict is returned unchanged. */
        rc = call_int_hook(unix_stream_connect, sock, other, newsk);
        return rc;
}
EXPORT_SYMBOL(security_unix_stream_connect);

/**
 * security_unix_may_send() - Check if AF_UNIX socket can send datagrams
 * @sock: originating sock
 * @other: peer sock
 *
 * Check permissions before connecting or sending datagrams from @sock to
 * @other.
 *
 * The @unix_stream_connect and @unix_may_send hooks were necessary because
 * Linux provides an alternative to the conventional file name space for Unix
 * domain sockets.  Whereas binding and connecting to sockets in the file name
 * space is mediated by the typical file permissions (and caught by the mknod
 * and permission hooks in inode_security_ops), binding and connecting to
 * sockets in the abstract name space is completely unmediated.  Sufficient
 * control of Unix domain sockets in the abstract name space isn't possible
 * using only the socket layer hooks, since we need to know the actual target
 * socket, which is not looked up until we are inside the af_unix code.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_unix_may_send(struct socket *sock, struct socket *other)
{
        int rc;

        /* The unix_may_send hook verdict is returned unchanged. */
        rc = call_int_hook(unix_may_send, sock, other);
        return rc;
}
EXPORT_SYMBOL(security_unix_may_send);

/**
 * security_socket_create() - Check if creating a new socket is allowed
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * Check permissions prior to creating a new socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_create(int family, int type, int protocol, int kern)
{
        int rc;

        /* The socket_create hook verdict is returned unchanged. */
        rc = call_int_hook(socket_create, family, type, protocol, kern);
        return rc;
}

/**
 * security_socket_post_create() - Initialize a newly created socket
 * @sock: socket
 * @family: protocol family
 * @type: communications type
 * @protocol: requested protocol
 * @kern: set to 1 if a kernel socket is requested
 *
 * This hook allows a module to update or allocate a per-socket security
 * structure. Note that the security field was not added directly to the socket
 * structure, but rather, the socket security information is stored in the
 * associated inode.  Typically, the inode alloc_security hook will allocate
 * and attach security information to SOCK_INODE(sock)->i_security.  This hook
 * may be used to update the SOCK_INODE(sock)->i_security field with additional
 * information that wasn't available when the inode was allocated.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_post_create(struct socket *sock, int family,
                                int type, int protocol, int kern)
{
        int rc;

        /* The socket_post_create hook result is returned unchanged. */
        rc = call_int_hook(socket_post_create, sock, family, type,
                           protocol, kern);
        return rc;
}

/**
 * security_socket_socketpair() - Check if creating a socketpair is allowed
 * @socka: first socket
 * @sockb: second socket
 *
 * Check permissions before creating a fresh pair of sockets.
 *
 * Return: Returns 0 if permission is granted and the connection was
 *         established.
 */
int security_socket_socketpair(struct socket *socka, struct socket *sockb)
{
        int rc;

        /* The socket_socketpair hook verdict is returned unchanged. */
        rc = call_int_hook(socket_socketpair, socka, sockb);
        return rc;
}
EXPORT_SYMBOL(security_socket_socketpair);

/**
 * security_socket_bind() - Check if a socket bind operation is allowed
 * @sock: socket
 * @address: requested bind address
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer bind operation is performed
 * and the socket @sock is bound to the address specified in the @address
 * parameter.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_bind(struct socket *sock,
                         struct sockaddr *address, int addrlen)
{
        int rc;

        /* The socket_bind hook verdict is returned unchanged. */
        rc = call_int_hook(socket_bind, sock, address, addrlen);
        return rc;
}

/**
 * security_socket_connect() - Check if a socket connect operation is allowed
 * @sock: socket
 * @address: address of remote connection point
 * @addrlen: length of address
 *
 * Check permission before socket protocol layer connect operation attempts to
 * connect socket @sock to a remote address, @address.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_connect(struct socket *sock,
                            struct sockaddr *address, int addrlen)
{
        int rc;

        /* The socket_connect hook verdict is returned unchanged. */
        rc = call_int_hook(socket_connect, sock, address, addrlen);
        return rc;
}

/**
 * security_socket_listen() - Check if a socket is allowed to listen
 * @sock: socket
 * @backlog: connection queue size
 *
 * Check permission before socket protocol layer listen operation.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_listen(struct socket *sock, int backlog)
{
        int rc;

        /* The socket_listen hook verdict is returned unchanged. */
        rc = call_int_hook(socket_listen, sock, backlog);
        return rc;
}

/**
 * security_socket_accept() - Check if a socket is allowed to accept connections
 * @sock: listening socket
 * @newsock: newly created connection socket
 *
 * Check permission before accepting a new connection.  Note that the new
 * socket, @newsock, has been created and some information copied to it, but
 * the accept operation has not actually been performed.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_accept(struct socket *sock, struct socket *newsock)
{
        int rc;

        /* The socket_accept hook verdict is returned unchanged. */
        rc = call_int_hook(socket_accept, sock, newsock);
        return rc;
}

/**
 * security_socket_sendmsg() - Check if sending a message is allowed
 * @sock: sending socket
 * @msg: message to send
 * @size: size of message
 *
 * Check permission before transmitting a message to another socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
        int rc;

        /* The socket_sendmsg hook verdict is returned unchanged. */
        rc = call_int_hook(socket_sendmsg, sock, msg, size);
        return rc;
}

/**
 * security_socket_recvmsg() - Check if receiving a message is allowed
 * @sock: receiving socket
 * @msg: message to receive
 * @size: size of message
 * @flags: operational flags
 *
 * Check permission before receiving a message from a socket.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                            int size, int flags)
{
        int rc;

        /* The socket_recvmsg hook verdict is returned unchanged. */
        rc = call_int_hook(socket_recvmsg, sock, msg, size, flags);
        return rc;
}

/**
 * security_socket_getsockname() - Check if reading the socket addr is allowed
 * @sock: socket
 *
 * Check permission before reading the local address (name) of the socket
 * object.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockname(struct socket *sock)
{
        int rc;

        /* The socket_getsockname hook verdict is returned unchanged. */
        rc = call_int_hook(socket_getsockname, sock);
        return rc;
}

/**
 * security_socket_getpeername() - Check if reading the peer's addr is allowed
 * @sock: socket
 *
 * Check permission before reading the remote address (name) of a socket object.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getpeername(struct socket *sock)
{
        int rc;

        /* The socket_getpeername hook verdict is returned unchanged. */
        rc = call_int_hook(socket_getpeername, sock);
        return rc;
}

/**
 * security_socket_getsockopt() - Check if reading a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before retrieving the options associated with socket
 * @sock.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_getsockopt(struct socket *sock, int level, int optname)
{
        int rc;

        /* The socket_getsockopt hook verdict is returned unchanged. */
        rc = call_int_hook(socket_getsockopt, sock, level, optname);
        return rc;
}

/**
 * security_socket_setsockopt() - Check if setting a socket option is allowed
 * @sock: socket
 * @level: option's protocol level
 * @optname: option name
 *
 * Check permissions before setting the options associated with socket @sock.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc;

        /* The socket_setsockopt hook verdict is returned unchanged. */
        rc = call_int_hook(socket_setsockopt, sock, level, optname);
        return rc;
}

/**
 * security_socket_shutdown() - Check if shutting down the socket is allowed
 * @sock: socket
 * @how: flag indicating how sends and receives are handled
 *
 * Permission check performed before all or part of a connection on the socket
 * @sock is shut down.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_socket_shutdown(struct socket *sock, int how)
{
        int rc;

        rc = call_int_hook(socket_shutdown, sock, how);
        return rc;
}

/**
 * security_sock_rcv_skb() - Check if an incoming network packet is allowed
 * @sk: destination sock
 * @skb: incoming packet
 *
 * Permission check on incoming network packets.  Unlike Netfilter's IP input
 * hooks, this is the first point at which the incoming sk_buff @skb has been
 * associated with a particular socket, @sk.  The hook must not sleep because
 * some callers hold spinlocks.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(socket_sock_rcv_skb, sk, skb);
        return rc;
}
EXPORT_SYMBOL(security_sock_rcv_skb);

/**
 * security_socket_getpeersec_stream() - Get the remote peer label
 * @sock: socket
 * @optval: destination buffer
 * @optlen: size of peer label copied into the buffer
 * @len: maximum size of the destination buffer
 *
 * Lets the security module hand peer socket security state for unix or
 * connected tcp sockets to userspace via getsockopt SO_GETPEERSEC.  For tcp
 * sockets this can be meaningful if the socket is associated with an ipsec SA.
 *
 * Return: Returns 0 if all is well, otherwise, typical getsockopt return
 *         values.
 */
int security_socket_getpeersec_stream(struct socket *sock, sockptr_t optval,
                                      sockptr_t optlen, unsigned int len)
{
        int rc;

        rc = call_int_hook(socket_getpeersec_stream, sock, optval, optlen, len);
        return rc;
}

/**
 * security_socket_getpeersec_dgram() - Get the remote peer label
 * @sock: socket
 * @skb: datagram packet
 * @secid: remote peer label secid
 *
 * Lets the security module hand peer socket security state for udp sockets to
 * userspace on a per-packet basis via getsockopt SO_GETPEERSEC.  The
 * application must first have enabled the IP_PASSSEC socket option.  It can
 * then retrieve the security state returned by this hook for a packet via the
 * SCM_SECURITY ancillary message type.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_socket_getpeersec_dgram(struct socket *sock,
                                     struct sk_buff *skb, u32 *secid)
{
        int rc;

        rc = call_int_hook(socket_getpeersec_dgram, sock, skb, secid);
        return rc;
}
EXPORT_SYMBOL(security_socket_getpeersec_dgram);

/**
 * lsm_sock_alloc - allocate a composite sock blob
 * @sock: the sock that needs a blob
 * @gfp: allocation mode
 *
 * Allocate the sock blob shared by all the modules and store it in
 * @sock->sk_security.  The blob size comes from blob_sizes.lbs_sock.
 *
 * Returns 0, or -ENOMEM if memory can't be allocated.
 */
static int lsm_sock_alloc(struct sock *sock, gfp_t gfp)
{
        int rc;

        rc = lsm_blob_alloc(&sock->sk_security, blob_sizes.lbs_sock, gfp);
        return rc;
}

/**
 * security_sk_alloc() - Allocate and initialize a sock's LSM blob
 * @sk: sock
 * @family: protocol family
 * @priority: gfp flags
 *
 * Allocate a security structure and attach it to the sk->sk_security field,
 * which is used to copy security attributes between local stream sockets.
 * If the sk_alloc_security hook fails after the blob has been allocated, the
 * blob is released again through security_sk_free().
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
{
        int err;

        err = lsm_sock_alloc(sk, priority);
        if (unlikely(err))
                return err;

        err = call_int_hook(sk_alloc_security, sk, family, priority);
        if (unlikely(err)) {
                /* A hook failed: undo the blob allocation before returning. */
                security_sk_free(sk);
                return err;
        }

        return 0;
}

/**
 * security_sk_free() - Free the sock's LSM blob
 * @sk: sock
 *
 * Deallocate the security structure: run the sk_free_security hook, free the
 * blob with kfree() and reset @sk->sk_security to NULL.
 */
void security_sk_free(struct sock *sk)
{
        /* Run the hook first, while the blob is still allocated. */
        call_void_hook(sk_free_security, sk);
        kfree(sk->sk_security);
        sk->sk_security = NULL;
}

/**
 * security_sk_clone() - Clone a sock's LSM state
 * @sk: original sock
 * @newsk: target sock
 *
 * Clone/copy the security structure of @sk into @newsk via the
 * sk_clone_security hook.
 */
void security_sk_clone(const struct sock *sk, struct sock *newsk)
{
        call_void_hook(sk_clone_security, sk, newsk);
}
EXPORT_SYMBOL(security_sk_clone);

/**
 * security_sk_classify_flow() - Set a flow's secid based on socket
 * @sk: original socket
 * @flic: target flow
 *
 * Set the target flow's secid to the socket's secid.  The sk_getsecid hook
 * writes the result into @flic->flowic_secid.
 */
void security_sk_classify_flow(const struct sock *sk, struct flowi_common *flic)
{
        call_void_hook(sk_getsecid, sk, &flic->flowic_secid);
}
EXPORT_SYMBOL(security_sk_classify_flow);

/**
 * security_req_classify_flow() - Set a flow's secid based on request_sock
 * @req: request_sock
 * @flic: target flow
 *
 * Set @flic's secid to @req's secid via the req_classify_flow hook.
 */
void security_req_classify_flow(const struct request_sock *req,
                                struct flowi_common *flic)
{
        call_void_hook(req_classify_flow, req, flic);
}
EXPORT_SYMBOL(security_req_classify_flow);

/**
 * security_sock_graft() - Reconcile LSM state when grafting a sock on a socket
 * @sk: sock being grafted
 * @parent: target parent socket
 *
 * Sets @parent's inode secid to @sk's secid and updates @sk with any necessary
 * LSM state from @parent.
 */
void security_sock_graft(struct sock *sk, struct socket *parent)
{
        call_void_hook(sock_graft, sk, parent);
}
EXPORT_SYMBOL(security_sock_graft);

/**
 * security_inet_conn_request() - Set request_sock state using incoming connect
 * @sk: parent listening sock
 * @skb: incoming connection
 * @req: new request_sock
 *
 * Set up the LSM state of @req from @sk and the incoming connect in @skb.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_inet_conn_request(const struct sock *sk,
                               struct sk_buff *skb, struct request_sock *req)
{
        int rc;

        rc = call_int_hook(inet_conn_request, sk, skb, req);
        return rc;
}
EXPORT_SYMBOL(security_inet_conn_request);

/**
 * security_inet_csk_clone() - Set new sock LSM state based on request_sock
 * @newsk: new sock
 * @req: connection request_sock
 *
 * Set the LSM state of @newsk using the LSM state from @req.
 */
void security_inet_csk_clone(struct sock *newsk,
                             const struct request_sock *req)
{
        call_void_hook(inet_csk_clone, newsk, req);
}

/**
 * security_inet_conn_established() - Update sock's LSM state with connection
 * @sk: sock
 * @skb: connection packet
 *
 * Update @sk's LSM state to represent a new connection from @skb.
 */
void security_inet_conn_established(struct sock *sk,
                                    struct sk_buff *skb)
{
        call_void_hook(inet_conn_established, sk, skb);
}
EXPORT_SYMBOL(security_inet_conn_established);

/**
 * security_secmark_relabel_packet() - Check if setting a secmark is allowed
 * @secid: new secmark value
 *
 * Decide whether the process should be allowed to relabel packets to @secid.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_secmark_relabel_packet(u32 secid)
{
        int rc;

        rc = call_int_hook(secmark_relabel_packet, secid);
        return rc;
}
EXPORT_SYMBOL(security_secmark_relabel_packet);

/**
 * security_secmark_refcount_inc() - Increment the secmark labeling rule count
 *
 * Tells the LSM to increment the number of secmark labeling rules loaded, via
 * the secmark_refcount_inc hook.
 */
void security_secmark_refcount_inc(void)
{
        call_void_hook(secmark_refcount_inc);
}
EXPORT_SYMBOL(security_secmark_refcount_inc);

/**
 * security_secmark_refcount_dec() - Decrement the secmark labeling rule count
 *
 * Tells the LSM to decrement the number of secmark labeling rules loaded, via
 * the secmark_refcount_dec hook.
 */
void security_secmark_refcount_dec(void)
{
        call_void_hook(secmark_refcount_dec);
}
EXPORT_SYMBOL(security_secmark_refcount_dec);

/**
 * security_tun_dev_alloc_security() - Allocate a LSM blob for a TUN device
 * @security: pointer to the LSM blob
 *
 * Allocate a security structure for a TUN device and return the pointer in
 * @security.  The blob is allocated with GFP_KERNEL and then handed to the
 * tun_dev_alloc_security hook; if the hook fails the blob is freed and
 * @security is reset to NULL.
 *
 * Return: Returns a zero on success, negative values on failure.
 */
int security_tun_dev_alloc_security(void **security)
{
        int err = lsm_blob_alloc(security, blob_sizes.lbs_tun_dev, GFP_KERNEL);

        if (err)
                return err;

        err = call_int_hook(tun_dev_alloc_security, *security);
        if (!err)
                return 0;

        /* Hook failure: drop the blob so the caller never sees it. */
        kfree(*security);
        *security = NULL;
        return err;
}
EXPORT_SYMBOL(security_tun_dev_alloc_security);

/**
 * security_tun_dev_free_security() - Free a TUN device LSM blob
 * @security: LSM blob
 *
 * Free the security structure for a TUN device.  The blob is released with
 * kfree(); no LSM hook is invoked from this function.
 */
void security_tun_dev_free_security(void *security)
{
        kfree(security);
}
EXPORT_SYMBOL(security_tun_dev_free_security);

/**
 * security_tun_dev_create() - Check if creating a TUN device is allowed
 *
 * Permission check performed prior to creating a new TUN device.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_create(void)
{
        int rc;

        rc = call_int_hook(tun_dev_create);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_create);

/**
 * security_tun_dev_attach_queue() - Check if attaching a TUN queue is allowed
 * @security: TUN device LSM blob
 *
 * Permission check performed prior to attaching to a TUN device queue.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach_queue(void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_attach_queue, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach_queue);

/**
 * security_tun_dev_attach() - Update TUN device LSM state on attach
 * @sk: associated sock
 * @security: TUN device LSM blob
 *
 * Gives the module a chance to update any security state associated with the
 * TUN device's sock structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_attach(struct sock *sk, void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_attach, sk, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_attach);

/**
 * security_tun_dev_open() - Update TUN device LSM state on open
 * @security: TUN device LSM blob
 *
 * Gives the module a chance to update any security state associated with the
 * TUN device's security structure.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_tun_dev_open(void *security)
{
        int rc;

        rc = call_int_hook(tun_dev_open, security);
        return rc;
}
EXPORT_SYMBOL(security_tun_dev_open);

/**
 * security_sctp_assoc_request() - Update the LSM on a SCTP association req
 * @asoc: SCTP association
 * @skb: packet requesting the association
 *
 * Hands the @asoc and @chunk->skb of the association INIT packet to the LSM.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_assoc_request(struct sctp_association *asoc,
                                struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(sctp_assoc_request, asoc, skb);
        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_request);

/**
 * security_sctp_bind_connect() - Validate a list of addrs for a SCTP option
 * @sk: socket
 * @optname: SCTP option to validate
 * @address: list of IP addresses to validate
 * @addrlen: length of the address list
 *
 * Validate permissions required for each address associated with sock @sk.
 * Depending on @optname, the addresses will be treated as either a connect or
 * bind service. The @addrlen is calculated on each IPv4 and IPv6 address using
 * sizeof(struct sockaddr_in) or sizeof(struct sockaddr_in6).
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_sctp_bind_connect(struct sock *sk, int optname,
                               struct sockaddr *address, int addrlen)
{
        int rc;

        rc = call_int_hook(sctp_bind_connect, sk, optname, address, addrlen);
        return rc;
}
EXPORT_SYMBOL(security_sctp_bind_connect);

/**
 * security_sctp_sk_clone() - Clone a SCTP sock's LSM state
 * @asoc: SCTP association
 * @sk: original sock
 * @newsk: target sock
 *
 * Called whenever a new socket is created by accept(2) (i.e. a TCP style
 * socket) or when a socket is 'peeled off', e.g. userspace calls
 * sctp_peeloff(3).
 */
void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
                            struct sock *newsk)
{
        call_void_hook(sctp_sk_clone, asoc, sk, newsk);
}
EXPORT_SYMBOL(security_sctp_sk_clone);

/**
 * security_sctp_assoc_established() - Update LSM state when assoc established
 * @asoc: SCTP association
 * @skb: packet establishing the association
 *
 * Hands the @asoc and @chunk->skb of the association COOKIE_ACK packet to the
 * security module.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_sctp_assoc_established(struct sctp_association *asoc,
                                    struct sk_buff *skb)
{
        int rc;

        rc = call_int_hook(sctp_assoc_established, asoc, skb);
        return rc;
}
EXPORT_SYMBOL(security_sctp_assoc_established);

/**
 * security_mptcp_add_subflow() - Inherit the LSM label from the MPTCP socket
 * @sk: the owning MPTCP socket
 * @ssk: the new subflow
 *
 * Update the labeling of the given MPTCP subflow so that it matches the
 * owning MPTCP socket.  This hook has to be called after the socket creation
 * and initialization via the security_socket_create() and
 * security_socket_post_create() LSM hooks.
 *
 * Return: Returns 0 on success or a negative error code on failure.
 */
int security_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
        int rc;

        rc = call_int_hook(mptcp_add_subflow, sk, ssk);
        return rc;
}

#endif        /* CONFIG_SECURITY_NETWORK */

#ifdef CONFIG_SECURITY_INFINIBAND
/**
 * security_ib_pkey_access() - Check if access to an IB pkey is allowed
 * @sec: LSM blob
 * @subnet_prefix: subnet prefix of the port
 * @pkey: IB pkey
 *
 * Permission check for accessing a pkey when modifying a QP.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_pkey_access(void *sec, u64 subnet_prefix, u16 pkey)
{
        int rc;

        rc = call_int_hook(ib_pkey_access, sec, subnet_prefix, pkey);
        return rc;
}
EXPORT_SYMBOL(security_ib_pkey_access);

/**
 * security_ib_endport_manage_subnet() - Check if SMPs traffic is allowed
 * @sec: LSM blob
 * @dev_name: IB device name
 * @port_num: port number
 *
 * Permission check for sending and receiving SMPs on an end port.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_ib_endport_manage_subnet(void *sec,
                                      const char *dev_name, u8 port_num)
{
        int rc;

        rc = call_int_hook(ib_endport_manage_subnet, sec, dev_name, port_num);
        return rc;
}
EXPORT_SYMBOL(security_ib_endport_manage_subnet);

/**
 * security_ib_alloc_security() - Allocate an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Allocate a security structure for Infiniband objects.  The blob is
 * allocated with GFP_KERNEL and then handed to the ib_alloc_security hook; if
 * the hook fails the blob is freed and @sec is reset to NULL.
 *
 * Return: Returns 0 on success, non-zero on failure.
 */
int security_ib_alloc_security(void **sec)
{
        int err = lsm_blob_alloc(sec, blob_sizes.lbs_ib, GFP_KERNEL);

        if (err)
                return err;

        err = call_int_hook(ib_alloc_security, *sec);
        if (!err)
                return 0;

        /* Hook failure: drop the blob so the caller never sees it. */
        kfree(*sec);
        *sec = NULL;
        return err;
}
EXPORT_SYMBOL(security_ib_alloc_security);

/**
 * security_ib_free_security() - Free an Infiniband LSM blob
 * @sec: LSM blob
 *
 * Deallocate an Infiniband security structure.  The blob is released with
 * kfree(); no LSM hook is invoked from this function.
 */
void security_ib_free_security(void *sec)
{
        kfree(sec);
}
EXPORT_SYMBOL(security_ib_free_security);
#endif        /* CONFIG_SECURITY_INFINIBAND */

#ifdef CONFIG_SECURITY_NETWORK_XFRM
/**
 * security_xfrm_policy_alloc() - Allocate a xfrm policy LSM blob
 * @ctxp: xfrm security context being added to the SPD
 * @sec_ctx: security label provided by userspace
 * @gfp: gfp flags
 *
 * Allocate a security structure for the xp->security field; that field is
 * initialized to NULL when the xfrm_policy is allocated.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_alloc(struct xfrm_sec_ctx **ctxp,
                               struct xfrm_user_sec_ctx *sec_ctx,
                               gfp_t gfp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_alloc_security, ctxp, sec_ctx, gfp);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_policy_alloc);

/**
 * security_xfrm_policy_clone() - Clone xfrm policy LSM state
 * @old_ctx: xfrm security context
 * @new_ctxp: target xfrm security context
 *
 * Allocate a security structure in @new_ctxp that contains the information
 * from the @old_ctx structure.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_policy_clone(struct xfrm_sec_ctx *old_ctx,
                               struct xfrm_sec_ctx **new_ctxp)
{
        int rc;

        rc = call_int_hook(xfrm_policy_clone_security, old_ctx, new_ctxp);
        return rc;
}

/**
 * security_xfrm_policy_free() - Free a xfrm security context
 * @ctx: xfrm security context
 *
 * Free LSM resources associated with @ctx via the xfrm_policy_free_security
 * hook.
 */
void security_xfrm_policy_free(struct xfrm_sec_ctx *ctx)
{
        call_void_hook(xfrm_policy_free_security, ctx);
}
EXPORT_SYMBOL(security_xfrm_policy_free);

/**
 * security_xfrm_policy_delete() - Check if deleting a xfrm policy is allowed
 * @ctx: xfrm security context
 *
 * Authorize the deletion of a SPD entry.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_policy_delete(struct xfrm_sec_ctx *ctx)
{
        int rc;

        rc = call_int_hook(xfrm_policy_delete_security, ctx);
        return rc;
}

/**
 * security_xfrm_state_alloc() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @sec_ctx: security label provided by userspace
 *
 * Allocate a security structure for the @x->security field; that field is
 * initialized to NULL when the xfrm_state is allocated.  The context is set
 * to correspond to @sec_ctx.
 *
 * Return: Return 0 if operation was successful.
 */
int security_xfrm_state_alloc(struct xfrm_state *x,
                              struct xfrm_user_sec_ctx *sec_ctx)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc, x, sec_ctx);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_alloc);

/**
 * security_xfrm_state_alloc_acquire() - Allocate a xfrm state LSM blob
 * @x: xfrm state being added to the SAD
 * @polsec: associated policy's security context
 * @secid: secid from the flow
 *
 * Allocate a security structure for the @x->security field; that field is
 * initialized to NULL when the xfrm_state is allocated.  The context is set
 * to correspond to @secid.
 *
 * Return: Returns 0 if operation was successful.
 */
int security_xfrm_state_alloc_acquire(struct xfrm_state *x,
                                      struct xfrm_sec_ctx *polsec, u32 secid)
{
        int rc;

        rc = call_int_hook(xfrm_state_alloc_acquire, x, polsec, secid);
        return rc;
}

/**
 * security_xfrm_state_delete() - Check if deleting a xfrm state is allowed
 * @x: xfrm state
 *
 * Authorize the deletion of @x->security.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_xfrm_state_delete(struct xfrm_state *x)
{
        int rc;

        rc = call_int_hook(xfrm_state_delete_security, x);
        return rc;
}
EXPORT_SYMBOL(security_xfrm_state_delete);

/**
 * security_xfrm_state_free() - Free a xfrm state
 * @x: xfrm state
 *
 * Deallocate @x->security via the xfrm_state_free_security hook.
 */
void security_xfrm_state_free(struct xfrm_state *x)
{
        call_void_hook(xfrm_state_free_security, x);
}

/**
 * security_xfrm_policy_lookup() - Check if using a xfrm policy is allowed
 * @ctx: target xfrm security context
 * @fl_secid: flow secid used to authorize access
 *
 * Permission check performed when a flow selects a xfrm_policy for processing
 * XFRMs on a packet.  The hook is called when selecting either a per-socket
 * policy or a generic xfrm policy.
 *
 * Return: Return 0 if permission is granted, -ESRCH otherwise, or -errno on
 *         other errors.
 */
int security_xfrm_policy_lookup(struct xfrm_sec_ctx *ctx, u32 fl_secid)
{
        int rc;

        rc = call_int_hook(xfrm_policy_lookup, ctx, fl_secid);
        return rc;
}

/**
 * security_xfrm_state_pol_flow_match() - Check for a xfrm match
 * @x: xfrm state to match
 * @xp: xfrm policy to check for a match
 * @flic: flow to check for a match.
 *
 * Check @xp and @flic for a match with @x.  Only the first hook found by
 * lsm_for_each_hook() is consulted; if the loop body never runs, the hook's
 * LSM_RET_DEFAULT() value is returned.
 *
 * Return: Returns 1 if there is a match.
 */
int security_xfrm_state_pol_flow_match(struct xfrm_state *x,
                                       struct xfrm_policy *xp,
                                       const struct flowi_common *flic)
{
        struct lsm_static_call *scall;
        int rc = LSM_RET_DEFAULT(xfrm_state_pol_flow_match);

        /*
         * Since this function is expected to return 0 or 1, the judgment
         * becomes difficult if multiple LSMs supply this call. Fortunately,
         * we can use the first LSM's judgment because currently only SELinux
         * supplies this call.
         *
         * For speed optimization, we explicitly break the loop rather than
         * using the macro.
         */
        lsm_for_each_hook(scall, xfrm_state_pol_flow_match) {
                rc = scall->hl->hook.xfrm_state_pol_flow_match(x, xp, flic);
                /* Deliberate: stop after the first hook, see above. */
                break;
        }
        return rc;
}

/**
 * security_xfrm_decode_session() - Determine the xfrm secid for a packet
 * @skb: xfrm packet
 * @secid: secid
 *
 * Decode the packet in @skb and hand back the security label in @secid.  The
 * xfrm_decode_session hook is invoked with a final argument of 1.
 *
 * Return: Return 0 if all xfrms used have the same secid.
 */
int security_xfrm_decode_session(struct sk_buff *skb, u32 *secid)
{
        int rc;

        rc = call_int_hook(xfrm_decode_session, skb, secid, 1);
        return rc;
}

/**
 * security_skb_classify_flow() - Set a flow's secid based on a packet
 * @skb: xfrm packet
 * @flic: target flow
 *
 * Run the xfrm_decode_session hook on @skb and store the resulting secid in
 * @flic->flowic_secid.  The hook is invoked with a final argument of 0,
 * whereas security_xfrm_decode_session() passes 1.  A non-zero hook result
 * trips BUG_ON().
 */
void security_skb_classify_flow(struct sk_buff *skb, struct flowi_common *flic)
{
        int rc = call_int_hook(xfrm_decode_session, skb, &flic->flowic_secid,
                               0);

        /* This function has no way to report an error to its caller. */
        BUG_ON(rc);
}
EXPORT_SYMBOL(security_skb_classify_flow);
#endif        /* CONFIG_SECURITY_NETWORK_XFRM */

#ifdef CONFIG_KEYS
/**
 * security_key_alloc() - Allocate and initialize a kernel key LSM blob
 * @key: key
 * @cred: credentials
 * @flags: allocation flags
 *
 * Permit allocation of a key and assign security data.  Note that the key
 * does not have a serial number assigned at this point.  If the key_alloc
 * hook fails after the blob has been allocated, the blob is released again
 * through security_key_free().
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_alloc(struct key *key, const struct cred *cred,
                       unsigned long flags)
{
        int err;

        err = lsm_key_alloc(key);
        if (unlikely(err))
                return err;

        err = call_int_hook(key_alloc, key, cred, flags);
        if (unlikely(err)) {
                /* A hook failed: undo the blob allocation before returning. */
                security_key_free(key);
                return err;
        }

        return 0;
}

/**
 * security_key_free() - Free a kernel key LSM blob
 * @key: key
 *
 * Notification of destruction; free security data.  The blob is released with
 * kfree() and @key->security is reset to NULL; no LSM hook is invoked from
 * this function.
 */
void security_key_free(struct key *key)
{
        kfree(key->security);
        key->security = NULL;
}

/**
 * security_key_permission() - Check if a kernel key operation is allowed
 * @key_ref: key reference
 * @cred: credentials of actor requesting access
 * @need_perm: requested permissions
 *
 * Determine whether a specific operational right is granted to a process on a
 * key.
 *
 * Return: Return 0 if permission is granted, -ve error otherwise.
 */
int security_key_permission(key_ref_t key_ref, const struct cred *cred,
                            enum key_need_perm need_perm)
{
        int rc;

        rc = call_int_hook(key_permission, key_ref, cred, need_perm);
        return rc;
}

/**
 * security_key_getsecurity() - Get the key's security label
 * @key: key
 * @buffer: security label buffer
 *
 * Fetch a textual representation of the security context attached to a key
 * for the purposes of honouring KEYCTL_GETSECURITY.  This function allocates
 * the storage for the NUL-terminated string and the caller should free it.
 * @buffer is set to NULL before the hook is called.
 *
 * Return: Returns the length of @buffer (including terminating NUL) or -ve if
 *         an error occurs.  May also return 0 (and a NULL buffer pointer) if
 *         there is no security label assigned to the key.
 */
int security_key_getsecurity(struct key *key, char **buffer)
{
        int rc;

        *buffer = NULL;
        rc = call_int_hook(key_getsecurity, key, buffer);
        return rc;
}

/**
 * security_key_post_create_or_update() - Notification of key create or update
 * @keyring: keyring to which the key is linked
 * @key: created or updated key
 * @payload: data used to instantiate or update the key
 * @payload_len: length of payload
 * @flags: key flags
 * @create: flag indicating whether the key was created or updated
 *
 * Notify the LSM, via the key_post_create_or_update hook, of a key creation
 * or update.
 */
void security_key_post_create_or_update(struct key *keyring, struct key *key,
                                        const void *payload, size_t payload_len,
                                        unsigned long flags, bool create)
{
        call_void_hook(key_post_create_or_update, keyring, key, payload,
                       payload_len, flags, create);
}
#endif        /* CONFIG_KEYS */

#ifdef CONFIG_AUDIT
/**
 * security_audit_rule_init() - Allocate and init an LSM audit rule struct
 * @field: audit action
 * @op: rule operator
 * @rulestr: rule context
 * @lsmrule: receive buffer for audit rule struct
 * @gfp: GFP flag used for kmalloc
 *
 * Allocate an LSM audit rule structure and initialize it.
 *
 * Return: Return 0 if @lsmrule has been successfully set, -EINVAL in case of
 *         an invalid rule.
 */
int security_audit_rule_init(u32 field, u32 op, char *rulestr, void **lsmrule,
                             gfp_t gfp)
{
        int rc;

        rc = call_int_hook(audit_rule_init, field, op, rulestr, lsmrule, gfp);
        return rc;
}

/**
 * security_audit_rule_known() - Check if an audit rule contains LSM fields
 * @krule: audit rule
 *
 * Report whether the given @krule contains any fields related to the current
 * LSM.
 *
 * Return: Returns 1 in case of relation found, 0 otherwise.
 */
int security_audit_rule_known(struct audit_krule *krule)
{
        int rc;

        rc = call_int_hook(audit_rule_known, krule);
        return rc;
}

/**
 * security_audit_rule_free() - Free an LSM audit rule struct
 * @lsmrule: audit rule struct
 *
 * Deallocate the LSM audit rule structure previously allocated by
 * security_audit_rule_init(), via the audit_rule_free hook.
 */
void security_audit_rule_free(void *lsmrule)
{
        call_void_hook(audit_rule_free, lsmrule);
}

/**
 * security_audit_rule_match() - Check if a label matches an audit rule
 * @prop: security label
 * @field: LSM audit field
 * @op: matching operator
 * @lsmrule: audit rule
 *
 * Determine whether the given @prop matches a rule previously approved by
 * security_audit_rule_known().
 *
 * Return: Returns 1 if @prop matches the rule, 0 if it does not, -ERRNO on
 *         failure.
 */
int security_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op,
                              void *lsmrule)
{
        int rc;

        rc = call_int_hook(audit_rule_match, prop, field, op, lsmrule);
        return rc;
}
#endif /* CONFIG_AUDIT */

#ifdef CONFIG_BPF_SYSCALL
/**
 * security_bpf() - Check if the bpf syscall operation is allowed
 * @cmd: command
 * @attr: bpf attribute
 * @size: size
 * @kernel: whether or not call originated from kernel
 *
 * Do an initial check for all bpf syscalls after the attribute is copied into
 * the kernel.  The actual security module can implement its own rules to
 * check the specific cmd it needs.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf(int cmd, union bpf_attr *attr, unsigned int size, bool kernel)
{
        int rc;

        rc = call_int_hook(bpf, cmd, attr, size, kernel);
        return rc;
}

/**
 * security_bpf_map() - Check if access to a bpf map is allowed
 * @map: bpf map
 * @fmode: mode
 *
 * Check performed when the kernel generates and returns a file descriptor for
 * eBPF maps.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        int rc;

        rc = call_int_hook(bpf_map, map, fmode);
        return rc;
}

/**
 * security_bpf_prog() - Check if access to a bpf program is allowed
 * @prog: bpf program
 *
 * Check performed when the kernel generates and returns a file descriptor for
 * eBPF programs.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_bpf_prog(struct bpf_prog *prog)
{
        int rc;

        rc = call_int_hook(bpf_prog, prog);
        return rc;
}

/**
 * security_bpf_map_create() - Check if BPF map creation is allowed
 * @map: BPF map object
 * @attr: BPF syscall attributes used to create BPF map
 * @token: BPF token used to grant user access
 * @kernel: whether or not call originated from kernel
 *
 * Check performed when the kernel creates a new BPF map.  This is also the
 * point where the LSM blob is allocated for LSMs that need them.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                            struct bpf_token *token, bool kernel)
{
        int rc;

        rc = call_int_hook(bpf_map_create, map, attr, token, kernel);
        return rc;
}

/**
 * security_bpf_prog_load() - Check if loading of BPF program is allowed
 * @prog: BPF program object
 * @attr: BPF syscall attributes used to create BPF program
 * @token: BPF token used to grant user access to BPF subsystem
 * @kernel: whether or not call originated from kernel
 *
 * Access control check performed when the kernel loads a BPF program and
 * allocates the associated BPF program object.  This hook is also responsible
 * for allocating any required LSM state for the BPF program.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
                           struct bpf_token *token, bool kernel)
{
        int rc;

        rc = call_int_hook(bpf_prog_load, prog, attr, token, kernel);
        return rc;
}

/**
 * security_bpf_token_create() - Check if creating of BPF token is allowed
 * @token: BPF token object
 * @attr: BPF syscall attributes used to create BPF token
 * @path: path pointing to BPF FS mount point from which BPF token is created
 *
 * Check performed when the kernel instantiates a new BPF token object from a
 * BPF FS instance.  This is also the point where the LSM blob can be
 * allocated for LSMs.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
                              const struct path *path)
{
        int rc;

        rc = call_int_hook(bpf_token_create, token, attr, path);
        return rc;
}

/**
 * security_bpf_token_cmd() - Check if a BPF token may delegate a command
 * @token: BPF token object
 * @cmd: BPF syscall command requested to be delegated by BPF token
 *
 * Check performed when the kernel decides whether the provided BPF token
 * should allow delegation of the requested BPF syscall command.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_cmd(const struct bpf_token *token, enum bpf_cmd cmd)
{
        int rc;

        rc = call_int_hook(bpf_token_cmd, token, cmd);
        return rc;
}

/**
 * security_bpf_token_capable() - Check if a BPF token may delegate a capability
 * @token: BPF token object
 * @cap: capabilities requested to be delegated by BPF token
 *
 * Check performed when the kernel decides whether the provided BPF token
 * should allow delegation of the requested BPF-related capabilities.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_bpf_token_capable(const struct bpf_token *token, int cap)
{
        int rc;

        rc = call_int_hook(bpf_token_capable, token, cap);
        return rc;
}

/**
 * security_bpf_map_free() - Free a bpf map's LSM blob
 * @map: bpf map
 *
 * Clean up the security information stored inside the bpf map, via the
 * bpf_map_free hook.
 */
void security_bpf_map_free(struct bpf_map *map)
{
        call_void_hook(bpf_map_free, map);
}

/**
 * security_bpf_prog_free() - Free a BPF program's LSM blob
 * @prog: BPF program struct
 *
 * Clean up the security information stored inside the BPF program, via the
 * bpf_prog_free hook.
 */
void security_bpf_prog_free(struct bpf_prog *prog)
{
        call_void_hook(bpf_prog_free, prog);
}

/**
 * security_bpf_token_free() - Free a BPF token's LSM blob
 * @token: BPF token struct
 *
 * Clean up the security information stored inside the BPF token, via the
 * bpf_token_free hook.
 */
void security_bpf_token_free(struct bpf_token *token)
{
        call_void_hook(bpf_token_free, token);
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * security_locked_down() - Check if a kernel feature is allowed
 * @what: requested kernel feature
 *
 * Decide whether a kernel feature that potentially enables arbitrary code
 * execution in kernel space should be permitted.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_locked_down(enum lockdown_reason what)
{
        int rc;

        rc = call_int_hook(locked_down, what);
        return rc;
}
EXPORT_SYMBOL(security_locked_down);

/**
 * security_bdev_alloc() - Allocate a block device LSM blob
 * @bdev: block device
 *
 * Allocate a security structure and attach it to @bdev->bd_security.  The
 * security field is initialized to NULL when the bdev structure is
 * allocated.
 *
 * The blob is set up by lsm_bdev_alloc() first; the bdev_alloc_security hook
 * runs afterwards.  If the hook reports an error, the blob is torn down again
 * through security_bdev_free() before the error is returned.
 *
 * Return: Return 0 if operation was successful.
 */
int security_bdev_alloc(struct block_device *bdev)
{
        int err;

        err = lsm_bdev_alloc(bdev);
        if (unlikely(err))
                return err;

        err = call_int_hook(bdev_alloc_security, bdev);
        if (unlikely(err)) {
                /* Undo the blob allocation done above. */
                security_bdev_free(bdev);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(security_bdev_alloc);

/**
 * security_bdev_free() - Free a block device's LSM blob
 * @bdev: block device
 *
 * Release the bdev security structure and reset @bdev->bd_security to NULL.
 * Nothing happens when no security structure is attached.  Otherwise the
 * bdev_free_security hook is invoked before the blob itself is freed.
 */
void security_bdev_free(struct block_device *bdev)
{
        if (bdev->bd_security) {
                call_void_hook(bdev_free_security, bdev);

                kfree(bdev->bd_security);
                bdev->bd_security = NULL;
        }
}
EXPORT_SYMBOL(security_bdev_free);

/**
 * security_bdev_setintegrity() - Set the device's integrity data
 * @bdev: block device
 * @type: type of integrity, e.g. hash digest, signature, etc
 * @value: the integrity value
 * @size: size of the integrity value
 *
 * Register a verified integrity measurement of a bdev with the LSMs.  When
 * @value is NULL, LSMs should free the data they saved previously.
 *
 * This hook must be invoked every time the security information is updated so
 * that the stored data stays current.  For example, in dm-verity, reloading
 * the mapping table with a different dm-verity target that carries a new
 * roothash and signing information makes the data previously stored in the
 * LSM blob obsolete; the hook has to be called again to refresh it.
 *
 * The requirement follows from the design of device-mapper: a device-mapper
 * device is created first and targets are loaded into it afterwards, and those
 * targets can be modified multiple times during the device's lifetime.  So
 * although the LSM blob is allocated when the block device is created, its
 * contents are not initialized at that point and can change substantially
 * over time, including from data that the LSMs trust to data that they do
 * not.  Failing to handle these changes correctly could allow LSM checks to
 * be bypassed.
 *
 * Return: Returns 0 on success, negative values on failure.
 */
int security_bdev_setintegrity(struct block_device *bdev,
                               enum lsm_integrity_type type, const void *value,
                               size_t size)
{
        int rc;

        rc = call_int_hook(bdev_setintegrity, bdev, type, value, size);
        return rc;
}
EXPORT_SYMBOL(security_bdev_setintegrity);

#ifdef CONFIG_PERF_EVENTS
/**
 * security_perf_event_open() - Check whether a perf event open is allowed
 * @type: type of event
 *
 * Check whether the @type of perf_event_open syscall is allowed.  The result
 * of the perf_event_open hook is returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_open(int type)
{
        int rc;

        rc = call_int_hook(perf_event_open, type);
        return rc;
}

/**
 * security_perf_event_alloc() - Allocate a perf event LSM blob
 * @event: perf event
 *
 * Allocate and save perf_event security info.  The blob is allocated with
 * lsm_blob_alloc() and the perf_event_alloc hook runs afterwards; if the hook
 * fails, the blob is freed and @event->security is reset to NULL.
 *
 * Return: Returns 0 on success, error on failure.
 */
int security_perf_event_alloc(struct perf_event *event)
{
        int err;

        err = lsm_blob_alloc(&event->security, blob_sizes.lbs_perf_event,
                             GFP_KERNEL);
        if (err)
                return err;

        err = call_int_hook(perf_event_alloc, event);
        if (!err)
                return 0;

        /* The hook rejected the event: drop the blob allocated above. */
        kfree(event->security);
        event->security = NULL;
        return err;
}

/**
 * security_perf_event_free() - Free a perf event LSM blob
 * @event: perf event
 *
 * Release (free) perf_event security info.
 *
 * The blob at @event->security is freed with kfree() and the pointer is reset
 * to NULL afterwards.  No LSM hook is invoked from this function.
 */
void security_perf_event_free(struct perf_event *event)
{
        kfree(event->security);
        /* Clear the pointer so it no longer refers to the freed blob. */
        event->security = NULL;
}

/**
 * security_perf_event_read() - Check whether reading a perf event label is
 * allowed
 * @event: perf event
 *
 * Read perf_event security info if allowed.  The result of the
 * perf_event_read hook is returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_read(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_read, event);
        return rc;
}

/**
 * security_perf_event_write() - Check whether writing a perf event label is
 * allowed
 * @event: perf event
 *
 * Write perf_event security info if allowed.  The result of the
 * perf_event_write hook is returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_perf_event_write(struct perf_event *event)
{
        int rc;

        rc = call_int_hook(perf_event_write, event);
        return rc;
}
#endif /* CONFIG_PERF_EVENTS */

#ifdef CONFIG_IO_URING
/**
 * security_uring_override_creds() - Check whether overriding creds is allowed
 * @new: new credentials
 *
 * Check if the current task, executing an io_uring operation, is allowed to
 * override its credentials with @new.  The result of the
 * uring_override_creds hook is returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_override_creds(const struct cred *new)
{
        int rc;

        rc = call_int_hook(uring_override_creds, new);
        return rc;
}

/**
 * security_uring_sqpoll() - Check whether IORING_SETUP_SQPOLL is allowed
 *
 * Check whether the current task is allowed to spawn an io_uring polling
 * thread (IORING_SETUP_SQPOLL).  The result of the uring_sqpoll hook is
 * returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_sqpoll(void)
{
        int rc;

        rc = call_int_hook(uring_sqpoll);
        return rc;
}

/**
 * security_uring_cmd() - Check whether an io_uring passthrough command is
 * allowed
 * @ioucmd: command
 *
 * Check whether the file_operations uring_cmd is allowed to run.  The result
 * of the uring_cmd hook is returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_cmd(struct io_uring_cmd *ioucmd)
{
        int rc;

        rc = call_int_hook(uring_cmd, ioucmd);
        return rc;
}

/**
 * security_uring_allowed() - Check whether io_uring_setup() is allowed
 *
 * Check whether the current task is allowed to call io_uring_setup().  The
 * result of the uring_allowed hook is returned unchanged.
 *
 * Return: Returns 0 if permission is granted.
 */
int security_uring_allowed(void)
{
        int rc;

        rc = call_int_hook(uring_allowed);
        return rc;
}
#endif /* CONFIG_IO_URING */

/**
 * security_initramfs_populated() - Notify LSMs that initramfs has been loaded
 *
 * Tells the LSMs the initramfs has been unpacked into the rootfs.
 *
 * This is a pure notification: the initramfs_populated hook takes no
 * arguments and nothing is returned to the caller.
 */
void security_initramfs_populated(void)
{
        call_void_hook(initramfs_populated);
}
































































































































































































































































































































































































































































































    4 















    4 









































































































































   11 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H

#ifndef __ASSEMBLY__

#include <linux/align.h>
#include <linux/bitops.h>
#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/find.h>
#include <linux/limits.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bitmap-str.h>

struct device;

/*
 * bitmaps provide bit arrays that consume one or more unsigned
 * longs.  The bitmap interface and available operations are listed
 * here, in bitmap.h
 *
 * Function implementations generic to all architectures are in
 * lib/bitmap.c.  Function implementations that are architecture
 * specific are in various arch/<arch>/include/asm/bitops.h headers
 * and other arch/<arch> specific files.
 *
 * See lib/bitmap.c for more details.
 */

/**
 * DOC: bitmap overview
 *
 * The available bitmap operations and their rough meaning in the
 * case that the bitmap is a single unsigned long are thus:
 *
 * The generated code is more efficient when nbits is known at
 * compile-time and at most BITS_PER_LONG.
 *
 * ::
 *
 *  bitmap_zero(dst, nbits)                     *dst = 0UL
 *  bitmap_fill(dst, nbits)                     *dst = ~0UL
 *  bitmap_copy(dst, src, nbits)                *dst = *src
 *  bitmap_and(dst, src1, src2, nbits)          *dst = *src1 & *src2
 *  bitmap_or(dst, src1, src2, nbits)           *dst = *src1 | *src2
 *  bitmap_xor(dst, src1, src2, nbits)          *dst = *src1 ^ *src2
 *  bitmap_andnot(dst, src1, src2, nbits)       *dst = *src1 & ~(*src2)
 *  bitmap_complement(dst, src, nbits)          *dst = ~(*src)
 *  bitmap_equal(src1, src2, nbits)             Are *src1 and *src2 equal?
 *  bitmap_intersects(src1, src2, nbits)        Do *src1 and *src2 overlap?
 *  bitmap_subset(src1, src2, nbits)            Is *src1 a subset of *src2?
 *  bitmap_empty(src, nbits)                    Are all bits zero in *src?
 *  bitmap_full(src, nbits)                     Are all bits set in *src?
 *  bitmap_weight(src, nbits)                   Hamming Weight: number set bits
 *  bitmap_weight_and(src1, src2, nbits)        Hamming Weight of and'ed bitmap
 *  bitmap_weight_andnot(src1, src2, nbits)     Hamming Weight of andnot'ed bitmap
 *  bitmap_set(dst, pos, nbits)                 Set specified bit area
 *  bitmap_clear(dst, pos, nbits)               Clear specified bit area
 *  bitmap_find_next_zero_area(buf, len, pos, n, mask)  Find bit free area
 *  bitmap_find_next_zero_area_off(buf, len, pos, n, mask, mask_off)  as above
 *  bitmap_shift_right(dst, src, n, nbits)      *dst = *src >> n
 *  bitmap_shift_left(dst, src, n, nbits)       *dst = *src << n
 *  bitmap_cut(dst, src, first, n, nbits)       Cut n bits from first, copy rest
 *  bitmap_replace(dst, old, new, mask, nbits)  *dst = (*old & ~(*mask)) | (*new & *mask)
 *  bitmap_scatter(dst, src, mask, nbits)        *dst = map(dense, sparse)(src)
 *  bitmap_gather(dst, src, mask, nbits)        *dst = map(sparse, dense)(src)
 *  bitmap_remap(dst, src, old, new, nbits)     *dst = map(old, new)(src)
 *  bitmap_bitremap(oldbit, old, new, nbits)    newbit = map(old, new)(oldbit)
 *  bitmap_onto(dst, orig, relmap, nbits)       *dst = orig relative to relmap
 *  bitmap_fold(dst, orig, sz, nbits)           dst bits = orig bits mod sz
 *  bitmap_parse(buf, buflen, dst, nbits)       Parse bitmap dst from kernel buf
 *  bitmap_parse_user(ubuf, ulen, dst, nbits)   Parse bitmap dst from user buf
 *  bitmap_parselist(buf, dst, nbits)           Parse bitmap dst from kernel buf
 *  bitmap_parselist_user(buf, dst, nbits)      Parse bitmap dst from user buf
 *  bitmap_find_free_region(bitmap, bits, order)  Find and allocate bit region
 *  bitmap_release_region(bitmap, pos, order)   Free specified bit region
 *  bitmap_allocate_region(bitmap, pos, order)  Allocate specified bit region
 *  bitmap_from_arr32(dst, buf, nbits)          Copy nbits from u32[] buf to dst
 *  bitmap_from_arr64(dst, buf, nbits)          Copy nbits from u64[] buf to dst
 *  bitmap_to_arr32(buf, src, nbits)            Copy nbits from src to u32[] buf
 *  bitmap_to_arr64(buf, src, nbits)            Copy nbits from src to u64[] buf
 *  bitmap_get_value8(map, start)               Get 8bit value from map at start
 *  bitmap_set_value8(map, value, start)        Set 8bit value to map at start
 *  bitmap_read(map, start, nbits)              Read an nbits-sized value from
 *                                              map at start
 *  bitmap_write(map, value, start, nbits)      Write an nbits-sized value to
 *                                              map at start
 *
 * Note, bitmap_zero() and bitmap_fill() operate over the region of
 * unsigned longs, that is, bits beyond the end of the bitmap up to the
 * unsigned long boundary will be zeroed or filled as well. Consider using
 * bitmap_clear() or bitmap_set() for explicit zeroing or filling,
 * respectively.
 */

/**
 * DOC: bitmap bitops
 *
 * Also the following operations in asm/bitops.h apply to bitmaps.::
 *
 *  set_bit(bit, addr)                  *addr |= bit
 *  clear_bit(bit, addr)                *addr &= ~bit
 *  change_bit(bit, addr)               *addr ^= bit
 *  test_bit(bit, addr)                 Is bit set in *addr?
 *  test_and_set_bit(bit, addr)         Set bit and return old value
 *  test_and_clear_bit(bit, addr)       Clear bit and return old value
 *  test_and_change_bit(bit, addr)      Change bit and return old value
 *  find_first_zero_bit(addr, nbits)    Position first zero bit in *addr
 *  find_first_bit(addr, nbits)         Position first set bit in *addr
 *  find_next_zero_bit(addr, nbits, bit)
 *                                      Position next zero bit in *addr >= bit
 *  find_next_bit(addr, nbits, bit)     Position next set bit in *addr >= bit
 *  find_next_and_bit(addr1, addr2, nbits, bit)
 *                                      Same as find_next_bit, but in
 *                                      (*addr1 & *addr2)
 *
 */

/**
 * DOC: declare bitmap
 * The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
 * to declare an array named 'name' of just enough unsigned longs to
 * contain all bit positions from 0 to 'bits' - 1.
 */

/*
 * Allocation and deallocation of bitmap.
 * Provided in lib/bitmap.c to avoid circular dependency.
 *
 * All allocators take the bitmap size in bits (@nbits) and allocation flags,
 * and return a pointer to the backing unsigned long array.
 * NOTE(review): presumably the "z" variants return zeroed memory and the
 * "_node" variants take a NUMA node hint - confirm against lib/bitmap.c.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags);
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node);
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node);
void bitmap_free(const unsigned long *bitmap);

/*
 * Cleanup action named "bitmap" (see linux/cleanup.h): a non-NULL pointer is
 * handed to bitmap_free(), a NULL pointer is left alone.
 */
DEFINE_FREE(bitmap, unsigned long *, if (_T) bitmap_free(_T))

/* Managed variants of the above; they additionally take the owning device. */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags);
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags);

/*
 * lib/bitmap.c provides these functions:
 *
 * The double-underscore helpers are the out-of-line implementations that the
 * inline wrappers in this header fall back to when @nbits is not a small
 * compile-time constant (see small_const_nbits()).  Callers normally use the
 * wrappers without the underscores.
 */

bool __bitmap_equal(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int nbits);
bool __pure __bitmap_or_equal(const unsigned long *src1,
                              const unsigned long *src2,
                              const unsigned long *src3,
                              unsigned int nbits);
void __bitmap_complement(unsigned long *dst, const unsigned long *src,
                         unsigned int nbits);
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                          unsigned int shift, unsigned int nbits);
void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                         unsigned int shift, unsigned int nbits);
/* bitmap_cut() has no inline wrapper in this header and is called directly. */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits);
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                 const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                 const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                  const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int nbits);
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits);
bool __bitmap_intersects(const unsigned long *bitmap1,
                         const unsigned long *bitmap2, unsigned int nbits);
bool __bitmap_subset(const unsigned long *bitmap1,
                     const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits);
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
                                 const unsigned long *bitmap2, unsigned int nbits);
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
                                    const unsigned long *bitmap2, unsigned int nbits);
/* Note: @len is a signed int here, while the inline wrappers pass unsigned. */
void __bitmap_set(unsigned long *map, unsigned int start, int len);
void __bitmap_clear(unsigned long *map, unsigned int start, int len);

/*
 * Out-of-line search for a contiguous aligned zero area.  Takes the same
 * arguments as bitmap_find_next_zero_area() plus @align_offset; that inline
 * wrapper is this function called with an @align_offset of 0.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset);

/**
 * bitmap_find_next_zero_area - locate a contiguous, aligned run of zero bits
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 *
 * @align_mask should be a power of 2 minus one, so that every zero area found
 * starts at a bit offset that is a multiple of that power of 2.  Pass an
 * @align_mask of 0 when no alignment is required.
 *
 * This is bitmap_find_next_zero_area_off() with an alignment offset of 0.
 */
static __always_inline
unsigned long bitmap_find_next_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned int nr,
                                         unsigned long align_mask)
{
        const unsigned long no_offset = 0;

        return bitmap_find_next_zero_area_off(map, size, start, nr,
                                              align_mask, no_offset);
}

/*
 * Remapping helpers, implemented out of line.  The overview table at the top
 * of this header summarizes them as:
 *   bitmap_remap()    *dst = map(old, new)(src)
 *   bitmap_bitremap() newbit = map(old, new)(oldbit)
 *   bitmap_onto()     *dst = orig relative to relmap
 *   bitmap_fold()     dst bits = orig bits mod sz
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new, unsigned int nbits);
int bitmap_bitremap(int oldbit,
                const unsigned long *old, const unsigned long *new, int bits);
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                const unsigned long *relmap, unsigned int bits);
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                unsigned int sz, unsigned int nbits);

/*
 * BITMAP_FIRST_WORD_MASK(start): word mask with every bit from position
 * (start % BITS_PER_LONG) upwards set.
 *
 * BITMAP_LAST_WORD_MASK(nbits): word mask with the low (nbits % BITS_PER_LONG)
 * bits set; when nbits is a multiple of BITS_PER_LONG the shift count is 0
 * and the whole word is set.
 */
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) & (BITS_PER_LONG - 1)))
#define BITMAP_LAST_WORD_MASK(nbits) (~0UL >> (-(nbits) & (BITS_PER_LONG - 1)))

/*
 * Size in bytes of the storage for an nbits-wide bitmap: nbits is aligned to
 * BITS_PER_LONG with ALIGN() before being converted to bytes.
 */
#define bitmap_size(nbits)        (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE)

/**
 * bitmap_zero - clear the words backing a bitmap
 * @dst: bitmap to clear
 * @nbits: number of bits in @dst
 *
 * When @nbits is a small compile-time constant the single backing word is
 * stored as 0; otherwise bitmap_size(@nbits) bytes are cleared with memset().
 * Either way whole words are written, so bits past @nbits in the last word
 * are cleared as well.
 */
static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits)
{
        unsigned int bytes = bitmap_size(nbits);

        if (!small_const_nbits(nbits)) {
                memset(dst, 0, bytes);
                return;
        }

        *dst = 0;
}

/**
 * bitmap_fill - set the words backing a bitmap to all ones
 * @dst: bitmap to fill
 * @nbits: number of bits in @dst
 *
 * When @nbits is a small compile-time constant the single backing word is
 * stored as ~0UL; otherwise bitmap_size(@nbits) bytes are set to 0xff with
 * memset().  Either way whole words are written, so bits past @nbits in the
 * last word are set as well.
 */
static __always_inline void bitmap_fill(unsigned long *dst, unsigned int nbits)
{
        unsigned int bytes = bitmap_size(nbits);

        if (!small_const_nbits(nbits)) {
                memset(dst, 0xff, bytes);
                return;
        }

        *dst = ~0UL;
}

/**
 * bitmap_copy - copy the words backing a bitmap
 * @dst: destination bitmap
 * @src: source bitmap
 * @nbits: number of bits in each bitmap
 *
 * When @nbits is a small compile-time constant a single word is copied;
 * otherwise bitmap_size(@nbits) bytes are copied with memcpy().  Whole words
 * are transferred, so bits past @nbits in the last word are copied too.
 */
static __always_inline
void bitmap_copy(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        unsigned int bytes = bitmap_size(nbits);

        if (!small_const_nbits(nbits)) {
                memcpy(dst, src, bytes);
                return;
        }

        *dst = *src;
}

/*
 * Copy a bitmap with bitmap_copy(), then, if @nbits is not a multiple of
 * BITS_PER_LONG, mask the last destination word so that the bits at and
 * above @nbits are cleared.
 */
static __always_inline
void bitmap_copy_clear_tail(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        unsigned int tail_bits = nbits % BITS_PER_LONG;

        bitmap_copy(dst, src, nbits);

        /* A bitmap ending on a word boundary has no tail to clear. */
        if (!tail_bits)
                return;

        dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits);
}

/**
 * bitmap_copy_and_extend - copy a bitmap and zero the rest of the destination
 * @to: destination bitmap, @size bits wide
 * @from: source bitmap
 * @count: number of bits to copy from @from
 * @size: size of @to in bits
 *
 * Copies the BITS_TO_LONGS(@count) words holding the first @count bits,
 * clears the bits at and above @count in the last copied word, and zeroes the
 * remaining words of @to up to bitmap_size(@size) bytes.
 */
static inline void bitmap_copy_and_extend(unsigned long *to,
                                          const unsigned long *from,
                                          unsigned int count, unsigned int size)
{
        unsigned int nlongs = BITS_TO_LONGS(count);
        size_t copied = nlongs * sizeof(long);

        memcpy(to, from, copied);

        /* Drop any source bits beyond @count from the last copied word. */
        if (count % BITS_PER_LONG)
                to[nlongs - 1] &= BITMAP_LAST_WORD_MASK(count);

        memset(to + nlongs, 0, bitmap_size(size) - copied);
}

/*
 * Conversion between bitmaps and arrays of u32.
 *
 * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64
 * machines the order of hi and lo parts of numbers match the bitmap structure.
 * In both cases conversion is not needed when copying data from/to arrays of
 * u32. But in LE64 case, typecast in bitmap_copy_clear_tail() may lead
 * to out-of-bound access. To avoid that, both LE and BE variants of 64-bit
 * architectures are not using bitmap_copy_clear_tail().
 *
 * Hence: with BITS_PER_LONG == 64 the two helpers are out-of-line functions;
 * otherwise they are macros that cast both arguments to unsigned long
 * pointers and call bitmap_copy_clear_tail().
 */
#if BITS_PER_LONG == 64
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf,
                                                        unsigned int nbits);
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap,
                                                        unsigned int nbits);
#else
#define bitmap_from_arr32(bitmap, buf, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (bitmap),        \
                        (const unsigned long *) (buf), (nbits))
#define bitmap_to_arr32(buf, bitmap, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *) (buf),                \
                        (const unsigned long *) (bitmap), (nbits))
#endif

/*
 * Conversion between bitmaps and arrays of u64.
 *
 * On 64-bit systems bitmaps are represented as u64 arrays internally. So,
 * the conversion is not needed when copying data from/to arrays of u64.
 *
 * Hence: with BITS_PER_LONG == 32 the two helpers are out-of-line functions;
 * otherwise they are macros that cast both arguments to unsigned long
 * pointers and call bitmap_copy_clear_tail().
 */
#if BITS_PER_LONG == 32
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits);
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits);
#else
#define bitmap_from_arr64(bitmap, buf, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *)(bitmap), (const unsigned long *)(buf), (nbits))
#define bitmap_to_arr64(buf, bitmap, nbits)                        \
        bitmap_copy_clear_tail((unsigned long *)(buf), (const unsigned long *)(bitmap), (nbits))
#endif

/**
 * bitmap_and - intersect two bitmaps
 * @dst: result bitmap
 * @src1: first operand
 * @src2: second operand
 * @nbits: number of bits in each bitmap
 *
 * For a small compile-time constant @nbits the single word is computed
 * inline and masked to @nbits; otherwise __bitmap_and() does the work.
 *
 * Return: in the inline case, true if the result has any bit set; otherwise
 * whatever __bitmap_and() returns.
 */
static __always_inline
bool bitmap_and(unsigned long *dst, const unsigned long *src1,
                const unsigned long *src2, unsigned int nbits)
{
        unsigned long word;

        if (!small_const_nbits(nbits))
                return __bitmap_and(dst, src1, src2, nbits);

        word = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits);
        *dst = word;
        return word != 0;
}

/**
 * bitmap_or - compute the union of two bitmaps
 * @dst: result bitmap
 * @src1: first operand
 * @src2: second operand
 * @nbits: number of bits in each bitmap
 *
 * For a small compile-time constant @nbits the single word is OR-ed inline
 * (the word is not masked to @nbits); otherwise __bitmap_or() does the work.
 */
static __always_inline
void bitmap_or(unsigned long *dst, const unsigned long *src1,
               const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_or(dst, src1, src2, nbits);
                return;
        }

        *dst = *src1 | *src2;
}

/**
 * bitmap_xor - compute the exclusive-or of two bitmaps
 * @dst: result bitmap
 * @src1: first operand
 * @src2: second operand
 * @nbits: number of bits in each bitmap
 *
 * For a small compile-time constant @nbits the single word is XOR-ed inline
 * (the word is not masked to @nbits); otherwise __bitmap_xor() does the work.
 */
static __always_inline
void bitmap_xor(unsigned long *dst, const unsigned long *src1,
                const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_xor(dst, src1, src2, nbits);
                return;
        }

        *dst = *src1 ^ *src2;
}

/**
 * bitmap_andnot - clear in one bitmap the bits set in another
 * @dst: result bitmap
 * @src1: bitmap to take bits from
 * @src2: bitmap whose set bits are removed from @src1
 * @nbits: number of bits in each bitmap
 *
 * For a small compile-time constant @nbits the single word is computed
 * inline and masked to @nbits; otherwise __bitmap_andnot() does the work.
 *
 * Return: in the inline case, true if the result has any bit set; otherwise
 * whatever __bitmap_andnot() returns.
 */
static __always_inline
bool bitmap_andnot(unsigned long *dst, const unsigned long *src1,
                   const unsigned long *src2, unsigned int nbits)
{
        unsigned long word;

        if (!small_const_nbits(nbits))
                return __bitmap_andnot(dst, src1, src2, nbits);

        word = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits);
        *dst = word;
        return word != 0;
}

/**
 * bitmap_complement - invert a bitmap
 * @dst: result bitmap
 * @src: bitmap to invert
 * @nbits: number of bits in each bitmap
 *
 * For a small compile-time constant @nbits the single word is inverted
 * inline (the word is not masked to @nbits); otherwise __bitmap_complement()
 * does the work.
 */
static __always_inline
void bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int nbits)
{
        if (!small_const_nbits(nbits)) {
                __bitmap_complement(dst, src, nbits);
                return;
        }

        *dst = ~(*src);
}

/*
 * Bit granularity at which bitmap_equal(), bitmap_set() and bitmap_clear()
 * may switch to byte-wise memcmp()/memset(): 8 bits on little-endian and the
 * width of an unsigned long otherwise.
 * NOTE(review): presumably the wider value is needed on big-endian because
 * byte order within a word does not follow bit order there - confirm.
 */
#ifdef __LITTLE_ENDIAN
#define BITMAP_MEM_ALIGNMENT 8
#else
#define BITMAP_MEM_ALIGNMENT (8 * sizeof(unsigned long))
#endif
/* Low-bit mask used to test alignment to BITMAP_MEM_ALIGNMENT. */
#define BITMAP_MEM_MASK (BITMAP_MEM_ALIGNMENT - 1)

/**
 * bitmap_equal - compare two bitmaps for equality
 * @src1: first bitmap
 * @src2: second bitmap
 * @nbits: number of bits in each bitmap
 *
 * Three strategies are tried in order: a single masked word compare when
 * @nbits is a small compile-time constant, a memcmp() over @nbits / 8 bytes
 * when @nbits is known at compile time to be aligned to
 * BITMAP_MEM_ALIGNMENT, and __bitmap_equal() otherwise.
 *
 * Return: true if the first @nbits bits of both bitmaps match.
 */
static __always_inline
bool bitmap_equal(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                unsigned long diff = *src1 ^ *src2;

                return (diff & BITMAP_LAST_WORD_MASK(nbits)) == 0;
        }

        if (__builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT))
                return memcmp(src1, src2, nbits / 8) == 0;

        return __bitmap_equal(src1, src2, nbits);
}

/**
 * bitmap_or_equal - Test whether the union of two bitmaps equals a third
 * @src1:        Pointer to bitmap 1
 * @src2:        Pointer to bitmap 2, which is or'ed with bitmap 1
 * @src3:        Pointer to bitmap 3, compared against *@src1 | *@src2
 * @nbits:        number of bits in each of these bitmaps
 *
 * Returns: True if (*@src1 | *@src2) == *@src3, false otherwise
 */
static __always_inline
bool bitmap_or_equal(const unsigned long *src1, const unsigned long *src2,
                     const unsigned long *src3, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                unsigned long diff = (*src1 | *src2) ^ *src3;

                return (diff & BITMAP_LAST_WORD_MASK(nbits)) == 0;
        }

        return __bitmap_or_equal(src1, src2, src3, nbits);
}

/**
 * bitmap_intersects - test whether two bitmaps share a set bit
 * @src1: first bitmap
 * @src2: second bitmap
 * @nbits: number of bits in each bitmap
 *
 * Return: for a small compile-time constant @nbits, true if any of the first
 * @nbits bits is set in both bitmaps; otherwise the result of
 * __bitmap_intersects().
 */
static __always_inline
bool bitmap_intersects(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
{
        unsigned long common;

        if (!small_const_nbits(nbits))
                return __bitmap_intersects(src1, src2, nbits);

        common = (*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits);
        return common != 0;
}

/**
 * bitmap_subset - test whether one bitmap is contained in another
 * @src1: candidate subset
 * @src2: candidate superset
 * @nbits: number of bits in each bitmap
 *
 * Return: for a small compile-time constant @nbits, true if no bit within the
 * first @nbits is set in @src1 but clear in @src2; otherwise the result of
 * __bitmap_subset().
 */
static __always_inline
bool bitmap_subset(const unsigned long *src1, const unsigned long *src2, unsigned int nbits)
{
        unsigned long extra;

        if (!small_const_nbits(nbits))
                return __bitmap_subset(src1, src2, nbits);

        extra = (*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits);
        return extra == 0;
}

/**
 * bitmap_empty - test whether a bitmap has no bits set
 * @src: bitmap to test
 * @nbits: number of bits in @src
 *
 * For a small compile-time constant @nbits the single word is masked to
 * @nbits and tested inline; otherwise the bitmap is scanned with
 * find_first_bit(), which is compared against @nbits.
 *
 * Return: true if none of the first @nbits bits is set.
 */
static __always_inline
bool bitmap_empty(const unsigned long *src, unsigned int nbits)
{
        if (small_const_nbits(nbits))
                return !(*src & BITMAP_LAST_WORD_MASK(nbits));

        return find_first_bit(src, nbits) == nbits;
}

/**
 * bitmap_full - test whether every bit of a bitmap is set
 * @src: bitmap to test
 * @nbits: number of bits in @src
 *
 * For a small compile-time constant @nbits the inverted word is masked to
 * @nbits and tested inline; otherwise the bitmap is scanned with
 * find_first_zero_bit(), which is compared against @nbits.
 *
 * Return: true if all of the first @nbits bits are set.
 */
static __always_inline
bool bitmap_full(const unsigned long *src, unsigned int nbits)
{
        if (small_const_nbits(nbits)) {
                unsigned long missing = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);

                return missing == 0;
        }

        return find_first_zero_bit(src, nbits) == nbits;
}

/**
 * bitmap_weight - count the set bits of a bitmap
 * @src: bitmap to count
 * @nbits: number of bits in @src
 *
 * Return: for a small compile-time constant @nbits, hweight_long() of the
 * word masked to @nbits; otherwise the result of __bitmap_weight().
 */
static __always_inline
unsigned int bitmap_weight(const unsigned long *src, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight(src, nbits);

        return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
}

/**
 * bitmap_weight_and - count the bits set in both of two bitmaps
 * @src1: first bitmap
 * @src2: second bitmap
 * @nbits: number of bits in each bitmap
 *
 * Return: for a small compile-time constant @nbits, hweight_long() of
 * *@src1 & *@src2 masked to @nbits; otherwise the result of
 * __bitmap_weight_and().
 */
static __always_inline
unsigned long bitmap_weight_and(const unsigned long *src1,
                                const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight_and(src1, src2, nbits);

        return hweight_long(*src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits));
}

/**
 * bitmap_weight_andnot - count the bits set in one bitmap but not in another
 * @src1: bitmap whose bits are counted
 * @src2: bitmap whose set bits are excluded
 * @nbits: number of bits in each bitmap
 *
 * Return: for a small compile-time constant @nbits, hweight_long() of
 * *@src1 & ~*@src2 masked to @nbits; otherwise the result of
 * __bitmap_weight_andnot().
 */
static __always_inline
unsigned long bitmap_weight_andnot(const unsigned long *src1,
                                   const unsigned long *src2, unsigned int nbits)
{
        if (!small_const_nbits(nbits))
                return __bitmap_weight_andnot(src1, src2, nbits);

        return hweight_long(*src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits));
}

/**
 * bitmap_set - set a run of bits
 * @map: bitmap to modify
 * @start: first bit to set
 * @nbits: number of bits to set
 *
 * The cheapest applicable strategy is chosen, in this order:
 * a single __set_bit() when @nbits is the compile-time constant 1; one
 * GENMASK() or-assignment on the first word when @start + @nbits is a small
 * compile-time constant; a memset() of 0xff when both @start and @nbits are
 * known at compile time to be aligned to BITMAP_MEM_ALIGNMENT; and
 * __bitmap_set() otherwise.
 */
static __always_inline
void bitmap_set(unsigned long *map, unsigned int start, unsigned int nbits)
{
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __set_bit(start, map);
                return;
        }

        if (small_const_nbits(start + nbits)) {
                *map |= GENMASK(start + nbits - 1, start);
                return;
        }

        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0xff, nbits / 8);
                return;
        }

        __bitmap_set(map, start, nbits);
}

/**
 * bitmap_clear - clear a run of bits
 * @map: bitmap to modify
 * @start: first bit to clear
 * @nbits: number of bits to clear
 *
 * The cheapest applicable strategy is chosen, in this order:
 * a single __clear_bit() when @nbits is the compile-time constant 1; one
 * GENMASK() and-not assignment on the first word when @start + @nbits is a
 * small compile-time constant; a memset() of 0 when both @start and @nbits
 * are known at compile time to be aligned to BITMAP_MEM_ALIGNMENT; and
 * __bitmap_clear() otherwise.
 */
static __always_inline
void bitmap_clear(unsigned long *map, unsigned int start, unsigned int nbits)
{
        if (__builtin_constant_p(nbits) && nbits == 1) {
                __clear_bit(start, map);
                return;
        }

        if (small_const_nbits(start + nbits)) {
                *map &= ~GENMASK(start + nbits - 1, start);
                return;
        }

        if (__builtin_constant_p(start & BITMAP_MEM_MASK) &&
            IS_ALIGNED(start, BITMAP_MEM_ALIGNMENT) &&
            __builtin_constant_p(nbits & BITMAP_MEM_MASK) &&
            IS_ALIGNED(nbits, BITMAP_MEM_ALIGNMENT)) {
                memset((char *)map + start / 8, 0, nbits / 8);
                return;
        }

        __bitmap_clear(map, start, nbits);
}

static __always_inline
void bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        /* Sizes rejected by small_const_nbits() go to the out-of-line helper. */
        if (!small_const_nbits(nbits)) {
                __bitmap_shift_right(dst, src, shift, nbits);
                return;
        }

        /* Single word: apply BITMAP_LAST_WORD_MASK() first, then shift down. */
        *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> shift;
}

static __always_inline
void bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                       unsigned int shift, unsigned int nbits)
{
        /* Sizes rejected by small_const_nbits() go to the out-of-line helper. */
        if (!small_const_nbits(nbits)) {
                __bitmap_shift_left(dst, src, shift, nbits);
                return;
        }

        /* Single word: shift up first, then apply BITMAP_LAST_WORD_MASK(). */
        *dst = (*src << shift) & BITMAP_LAST_WORD_MASK(nbits);
}

static __always_inline
void bitmap_replace(unsigned long *dst,
                    const unsigned long *old,
                    const unsigned long *new,
                    const unsigned long *mask,
                    unsigned int nbits)
{
        /* Sizes rejected by small_const_nbits() go to the out-of-line helper. */
        if (!small_const_nbits(nbits)) {
                __bitmap_replace(dst, old, new, mask, nbits);
                return;
        }

        /*
         * Single word: take bits from @new where @mask is set and from
         * @old where it is clear.
         */
        *dst = (*old & ~(*mask)) | (*new & *mask);
}

/**
 * bitmap_scatter - Scatter a bitmap according to the given mask
 * @dst: scattered bitmap
 * @src: gathered bitmap
 * @mask: mask representing bits to assign to in the scattered bitmap
 * @nbits: number of bits in each of these bitmaps
 *
 * Scatters bitmap with sequential bits according to the given @mask.
 *
 * Example:
 * If @src bitmap = 0x005a, with @mask = 0x1313, @dst will be 0x0302.
 *
 * Or in binary form
 * @src                        @mask                        @dst
 * 0000000001011010        0001001100010011        0000001100000010
 *
 * (Bits 0, 1, 2, 3, 4, 5 are copied to the bits 0, 1, 4, 8, 9, 12)
 *
 * A more 'visual' description of the operation::
 *
 *        src:  0000000001011010
 *                        ||||||
 *                 +------+|||||
 *                 |  +----+||||
 *                 |  |+----+|||
 *                 |  ||   +-+||
 *                 |  ||   |  ||
 *        mask: ...v..vv...v..vv
 *              ...0..11...0..10
 *        dst:  0000001100000010
 *
 * A relationship exists between bitmap_scatter() and bitmap_gather(). See
 * bitmap_gather() for the bitmap gather detailed operations. TL;DR:
 * bitmap_gather() can be seen as the 'reverse' bitmap_scatter() operation.
 */
static __always_inline
void bitmap_scatter(unsigned long *dst, const unsigned long *src,
                    const unsigned long *mask, unsigned int nbits)
{
        unsigned int n = 0;
        unsigned int bit;

        bitmap_zero(dst, nbits);

        for_each_set_bit(bit, mask, nbits)
                __assign_bit(bit, dst, test_bit(n++, src));
}

/**
 * bitmap_gather - Gather a bitmap according to given mask
 * @dst: gathered bitmap
 * @src: scattered bitmap
 * @mask: mask representing bits to extract from in the scattered bitmap
 * @nbits: number of bits in each of these bitmaps
 *
 * Gathers bitmap with sparse bits according to the given @mask.
 *
 * Example:
 * If @src bitmap = 0x0302, with @mask = 0x1313, @dst will be 0x001a.
 *
 * Or in binary form
 * @src                        @mask                        @dst
 * 0000001100000010        0001001100010011        0000000000011010
 *
 * (Bits 0, 1, 4, 8, 9, 12 are copied to the bits 0, 1, 2, 3, 4, 5)
 *
 * A more 'visual' description of the operation::
 *
 *        mask: ...v..vv...v..vv
 *        src:  0000001100000010
 *                 ^  ^^   ^   0
 *                 |  ||   |  10
 *                 |  ||   > 010
 *                 |  |+--> 1010
 *                 |  +--> 11010
 *                 +----> 011010
 *        dst:  0000000000011010
 *
 * A relationship exists between bitmap_gather() and bitmap_scatter(). See
 * bitmap_scatter() for the bitmap scatter detailed operations. TL;DR:
 * bitmap_scatter() can be seen as the 'reverse' bitmap_gather() operation.
 *
 * Suppose scattered computed using bitmap_scatter(scattered, src, mask, n).
 * The operation bitmap_gather(result, scattered, mask, n) leads to a result
 * equal or equivalent to src.
 *
 * The result can be 'equivalent' because bitmap_scatter() and bitmap_gather()
 * are not bijective.
 * The result and src values are equivalent in the sense that a call to
 * bitmap_scatter(res, src, mask, n) and a call to
 * bitmap_scatter(res, result, mask, n) will lead to the same res value.
 */
static __always_inline
void bitmap_gather(unsigned long *dst, const unsigned long *src,
                   const unsigned long *mask, unsigned int nbits)
{
        unsigned int n = 0;
        unsigned int bit;

        bitmap_zero(dst, nbits);

        for_each_set_bit(bit, mask, nbits)
                __assign_bit(n++, dst, test_bit(bit, src));
}

static __always_inline
void bitmap_next_set_region(unsigned long *bitmap, unsigned int *rs,
                            unsigned int *re, unsigned int end)
{
        /* Region start: what find_next_bit() reports from *rs, bounded by @end. */
        unsigned int first = find_next_bit(bitmap, end, *rs);

        *rs = first;
        /* Region end: what find_next_zero_bit() reports from just past the start. */
        *re = find_next_zero_bit(bitmap, end, first + 1);
}

/**
 * bitmap_release_region - release allocated bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to release
 *        @order: region size (log base 2 of number of bits) to release
 *
 * Counterpart of bitmap_find_free_region() and bitmap_allocate_region():
 * clears the BIT(@order) bits starting at @pos in @bitmap.
 */
static __always_inline
void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order)
{
        unsigned int nbits = BIT(order);

        bitmap_clear(bitmap, pos, nbits);
}

/**
 * bitmap_allocate_region - allocate bitmap region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @pos: beginning of bit region to allocate
 *        @order: region size (log base 2 of number of bits) to allocate
 *
 * Sets the BIT(@order) bits starting at @pos, provided none of them is
 * set already. The bitmap is left untouched when the region is busy.
 *
 * Returns: 0 on success, or %-EBUSY if specified region wasn't
 * free (not all bits were zero).
 */
static __always_inline
int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order)
{
        unsigned int nbits = BIT(order);
        unsigned int end = pos + nbits;

        /* No set bit reported below @end: the region is free, so claim it. */
        if (find_next_bit(bitmap, end, pos) >= end) {
                bitmap_set(bitmap, pos, nbits);
                return 0;
        }

        return -EBUSY;
}

/**
 * bitmap_find_free_region - find a contiguous aligned mem region
 *        @bitmap: array of unsigned longs corresponding to the bitmap
 *        @bits: number of bits in the bitmap
 *        @order: region size (log base 2 of number of bits) to find
 *
 * Walks @bitmap in steps of BIT(@order) bits, starting at bit 0, and
 * allocates (sets to one) the first region that is entirely free (zero).
 * Only regions whose length is a power (@order) of two and which are
 * aligned to that power of two are considered, which makes the search
 * algorithm much faster.
 *
 * Returns: the bit offset in bitmap of the allocated region,
 * or -errno on failure.
 */
static __always_inline
int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order)
{
        unsigned int pos = 0;
        unsigned int end;

        /* Try each region that still fits entirely below @bits. */
        while ((end = pos + BIT(order)) <= bits) {
                if (bitmap_allocate_region(bitmap, pos, order) == 0)
                        return pos;
                pos = end;
        }

        return -ENOMEM;
}

/**
 * BITMAP_FROM_U64() - Represent u64 value in the format suitable for bitmap.
 * @n: u64 value
 *
 * Linux bitmaps are internally arrays of unsigned longs, i.e. 32-bit
 * integers in 32-bit environment, and 64-bit integers in 64-bit one.
 *
 * There are four combinations of endianness and length of the word in linux
 * ABIs: LE64, BE64, LE32 and BE32.
 *
 * On 64-bit kernels 64-bit LE and BE numbers are naturally ordered in
 * bitmaps and therefore don't require any special handling.
 *
 * On 32-bit kernels 32-bit LE ABI orders lo word of 64-bit number in memory
 * prior to hi, and 32-bit BE orders hi word prior to lo. The bitmap on the
 * other hand is represented as an array of 32-bit words and the position of
 * bit N may therefore be calculated as: word #(N/32) and bit #(N%32) in that
 * word.  For example, bit #42 is located at 10th position of 2nd word.
 * It matches 32-bit LE ABI, and we can simply let the compiler store 64-bit
 * values in memory as it usually does. But for BE we need to swap hi and lo
 * words manually.
 *
 * With all that, the macro BITMAP_FROM_U64() does explicit reordering of hi and
 * lo parts of u64.  For LE32 it does nothing, and for BE environment it swaps
 * hi and lo words, as is expected by bitmap.
 */
#if __BITS_PER_LONG != 64
/* Two initializer words per u64: the part masked by ULONG_MAX, then n >> 32. */
#define BITMAP_FROM_U64(n) ((unsigned long) ((u64)(n) & ULONG_MAX)), \
                                ((unsigned long) ((u64)(n) >> 32))
#else
/* The value is passed through unchanged. */
#define BITMAP_FROM_U64(n) (n)
#endif

/**
 * bitmap_from_u64 - Check and swap words within u64.
 *  @dst:  destination bitmap
 *  @mask: source 64-bit value
 *
 * In a 32-bit Big Endian kernel, reading a u64 mask through
 * ``(u32 *)(&val)[*]`` yields the wrong word: ``(u32 *)(&val)[0]`` is
 * the upper 32 bits, whereas the lower 32 bits of the u64 are expected.
 * The conversion is delegated to bitmap_from_arr64(), called on @mask
 * with a length of 64 bits.
 */
static __always_inline void bitmap_from_u64(unsigned long *dst, u64 mask)
{
        const u64 *src = &mask;

        bitmap_from_arr64(dst, src, 64);
}

/**
 * bitmap_read - read a value of n-bits from the memory region
 * @map: address to the bitmap memory region
 * @start: bit offset of the n-bit value
 * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG
 *
 * The value may sit inside one word of @map or straddle two adjacent
 * words; in the latter case the two parts are combined.
 *
 * Returns: value of @nbits bits located at the @start bit offset within the
 * @map memory region. For @nbits = 0 and @nbits > BITS_PER_LONG the return
 * value is undefined (this implementation returns 0 without touching @map).
 */
static __always_inline
unsigned long bitmap_read(const unsigned long *map, unsigned long start, unsigned long nbits)
{
        unsigned long offset, space;
        unsigned long low, high;
        size_t index;

        if (unlikely(!nbits || nbits > BITS_PER_LONG))
                return 0;

        index = BIT_WORD(start);
        offset = start % BITS_PER_LONG;
        /* Number of bits from @offset up to the end of the first word. */
        space = BITS_PER_LONG - offset;

        /* The whole value fits into the first word. */
        if (nbits <= space)
                return (map[index] >> offset) & BITMAP_LAST_WORD_MASK(nbits);

        /* Otherwise combine the part in this word with the part in the next. */
        low = map[index] & BITMAP_FIRST_WORD_MASK(start);
        high = map[index + 1] & BITMAP_LAST_WORD_MASK(start + nbits);
        return (low >> offset) | (high << space);
}

/**
 * bitmap_write - write n-bit value within a memory region
 * @map: address to the bitmap memory region
 * @value: value to write, clamped to nbits
 * @start: bit offset of the n-bit value
 * @nbits: size of value in bits, nonzero, up to BITS_PER_LONG.
 *
 * bitmap_write() behaves as-if implemented as @nbits calls of __assign_bit(),
 * i.e. bits beyond @nbits are ignored:
 *
 *   for (bit = 0; bit < nbits; bit++)
 *           __assign_bit(start + bit, bitmap, val & BIT(bit));
 *
 * For @nbits == 0 and @nbits > BITS_PER_LONG no writes are performed.
 */
static __always_inline
void bitmap_write(unsigned long *map, unsigned long value,
                  unsigned long start, unsigned long nbits)
{
        unsigned long offset, space, mask;
        unsigned long *word;

        if (unlikely(!nbits || nbits > BITS_PER_LONG))
                return;

        /* Bits of @value beyond @nbits are ignored. */
        mask = BITMAP_LAST_WORD_MASK(nbits);
        value &= mask;

        offset = start % BITS_PER_LONG;
        /* Number of bits from @offset up to the end of the first word. */
        space = BITS_PER_LONG - offset;
        word = &map[BIT_WORD(start)];

        /* The whole value fits into the first word. */
        if (nbits <= space) {
                word[0] &= ~(mask << offset);
                word[0] |= value << offset;
                return;
        }

        /* Otherwise the value is split between this word and the next one. */
        word[0] &= ~BITMAP_FIRST_WORD_MASK(start);
        word[0] |= value << offset;
        word[1] &= BITMAP_FIRST_WORD_MASK(start + nbits);
        word[1] |= value >> space;
}

/* BITS_PER_BYTE-wide wrappers around bitmap_read() and bitmap_write(). */
#define bitmap_get_value8(map, start) bitmap_read((map), (start), BITS_PER_BYTE)
#define bitmap_set_value8(map, value, start) bitmap_write((map), (value), (start), BITS_PER_BYTE)

#endif /* __ASSEMBLY__ */

#endif /* __LINUX_BITMAP_H */
















































































































































































































































































































































































































































































































































 1258 
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































 1250 















    1 


 1255 









 1251 

























































































































































































































































 1255 





























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/common.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/string_helpers.h>
#include "common.h"

/*
 * String table for operation mode.
 *
 * Indexed by the TOMOYO_CONFIG_* mode value; each entry is the keyword
 * string for that mode. The table has TOMOYO_CONFIG_MAX_MODE slots.
 */
const char * const tomoyo_mode[TOMOYO_CONFIG_MAX_MODE] = {
        [TOMOYO_CONFIG_DISABLED]   = "disabled",
        [TOMOYO_CONFIG_LEARNING]   = "learning",
        [TOMOYO_CONFIG_PERMISSIVE] = "permissive",
        [TOMOYO_CONFIG_ENFORCING]  = "enforcing"
};

/*
 * String table for /sys/kernel/security/tomoyo/profile
 *
 * Layout: the slots below TOMOYO_MAX_MAC_INDEX are indexed by the
 * TOMOYO_MAC_* values and hold one keyword per operation. The category
 * names follow at offset TOMOYO_MAX_MAC_INDEX and are indexed by the
 * TOMOYO_MAC_CATEGORY_* values.
 */
const char * const tomoyo_mac_keywords[TOMOYO_MAX_MAC_INDEX
                                       + TOMOYO_MAX_MAC_CATEGORY_INDEX] = {
        /* CONFIG::file group */
        [TOMOYO_MAC_FILE_EXECUTE]    = "execute",
        [TOMOYO_MAC_FILE_OPEN]       = "open",
        [TOMOYO_MAC_FILE_CREATE]     = "create",
        [TOMOYO_MAC_FILE_UNLINK]     = "unlink",
        [TOMOYO_MAC_FILE_GETATTR]    = "getattr",
        [TOMOYO_MAC_FILE_MKDIR]      = "mkdir",
        [TOMOYO_MAC_FILE_RMDIR]      = "rmdir",
        [TOMOYO_MAC_FILE_MKFIFO]     = "mkfifo",
        [TOMOYO_MAC_FILE_MKSOCK]     = "mksock",
        [TOMOYO_MAC_FILE_TRUNCATE]   = "truncate",
        [TOMOYO_MAC_FILE_SYMLINK]    = "symlink",
        [TOMOYO_MAC_FILE_MKBLOCK]    = "mkblock",
        [TOMOYO_MAC_FILE_MKCHAR]     = "mkchar",
        [TOMOYO_MAC_FILE_LINK]       = "link",
        [TOMOYO_MAC_FILE_RENAME]     = "rename",
        [TOMOYO_MAC_FILE_CHMOD]      = "chmod",
        [TOMOYO_MAC_FILE_CHOWN]      = "chown",
        [TOMOYO_MAC_FILE_CHGRP]      = "chgrp",
        [TOMOYO_MAC_FILE_IOCTL]      = "ioctl",
        [TOMOYO_MAC_FILE_CHROOT]     = "chroot",
        [TOMOYO_MAC_FILE_MOUNT]      = "mount",
        [TOMOYO_MAC_FILE_UMOUNT]     = "unmount",
        [TOMOYO_MAC_FILE_PIVOT_ROOT] = "pivot_root",
        /* CONFIG::network group */
        [TOMOYO_MAC_NETWORK_INET_STREAM_BIND]       = "inet_stream_bind",
        [TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN]     = "inet_stream_listen",
        [TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT]    = "inet_stream_connect",
        [TOMOYO_MAC_NETWORK_INET_DGRAM_BIND]        = "inet_dgram_bind",
        [TOMOYO_MAC_NETWORK_INET_DGRAM_SEND]        = "inet_dgram_send",
        [TOMOYO_MAC_NETWORK_INET_RAW_BIND]          = "inet_raw_bind",
        [TOMOYO_MAC_NETWORK_INET_RAW_SEND]          = "inet_raw_send",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND]       = "unix_stream_bind",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN]     = "unix_stream_listen",
        [TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT]    = "unix_stream_connect",
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND]        = "unix_dgram_bind",
        [TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND]        = "unix_dgram_send",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND]    = "unix_seqpacket_bind",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN]  = "unix_seqpacket_listen",
        [TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT] = "unix_seqpacket_connect",
        /* CONFIG::misc group */
        [TOMOYO_MAC_ENVIRON] = "env",
        /* CONFIG group: category names, stored after the per-operation keywords */
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_FILE] = "file",
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_NETWORK] = "network",
        [TOMOYO_MAX_MAC_INDEX + TOMOYO_MAC_CATEGORY_MISC] = "misc",
};

/*
 * String table for conditions.
 *
 * Maps each condition index named below to its textual keyword. Designated
 * initializers are used, so any index below TOMOYO_MAX_CONDITION_KEYWORD that
 * is not listed here is left as NULL.
 */
const char * const tomoyo_condition_keyword[TOMOYO_MAX_CONDITION_KEYWORD] = {
        [TOMOYO_TASK_UID]             = "task.uid",
        [TOMOYO_TASK_EUID]            = "task.euid",
        [TOMOYO_TASK_SUID]            = "task.suid",
        [TOMOYO_TASK_FSUID]           = "task.fsuid",
        [TOMOYO_TASK_GID]             = "task.gid",
        [TOMOYO_TASK_EGID]            = "task.egid",
        [TOMOYO_TASK_SGID]            = "task.sgid",
        [TOMOYO_TASK_FSGID]           = "task.fsgid",
        [TOMOYO_TASK_PID]             = "task.pid",
        [TOMOYO_TASK_PPID]            = "task.ppid",
        [TOMOYO_EXEC_ARGC]            = "exec.argc",
        [TOMOYO_EXEC_ENVC]            = "exec.envc",
        [TOMOYO_TYPE_IS_SOCKET]       = "socket",
        [TOMOYO_TYPE_IS_SYMLINK]      = "symlink",
        [TOMOYO_TYPE_IS_FILE]         = "file",
        [TOMOYO_TYPE_IS_BLOCK_DEV]    = "block",
        [TOMOYO_TYPE_IS_DIRECTORY]    = "directory",
        [TOMOYO_TYPE_IS_CHAR_DEV]     = "char",
        [TOMOYO_TYPE_IS_FIFO]         = "fifo",
        [TOMOYO_MODE_SETUID]          = "setuid",
        [TOMOYO_MODE_SETGID]          = "setgid",
        [TOMOYO_MODE_STICKY]          = "sticky",
        [TOMOYO_MODE_OWNER_READ]      = "owner_read",
        [TOMOYO_MODE_OWNER_WRITE]     = "owner_write",
        [TOMOYO_MODE_OWNER_EXECUTE]   = "owner_execute",
        [TOMOYO_MODE_GROUP_READ]      = "group_read",
        [TOMOYO_MODE_GROUP_WRITE]     = "group_write",
        [TOMOYO_MODE_GROUP_EXECUTE]   = "group_execute",
        [TOMOYO_MODE_OTHERS_READ]     = "others_read",
        [TOMOYO_MODE_OTHERS_WRITE]    = "others_write",
        [TOMOYO_MODE_OTHERS_EXECUTE]  = "others_execute",
        [TOMOYO_EXEC_REALPATH]        = "exec.realpath",
        [TOMOYO_SYMLINK_TARGET]       = "symlink.target",
        [TOMOYO_PATH1_UID]            = "path1.uid",
        [TOMOYO_PATH1_GID]            = "path1.gid",
        [TOMOYO_PATH1_INO]            = "path1.ino",
        [TOMOYO_PATH1_MAJOR]          = "path1.major",
        [TOMOYO_PATH1_MINOR]          = "path1.minor",
        [TOMOYO_PATH1_PERM]           = "path1.perm",
        [TOMOYO_PATH1_TYPE]           = "path1.type",
        [TOMOYO_PATH1_DEV_MAJOR]      = "path1.dev_major",
        [TOMOYO_PATH1_DEV_MINOR]      = "path1.dev_minor",
        [TOMOYO_PATH2_UID]            = "path2.uid",
        [TOMOYO_PATH2_GID]            = "path2.gid",
        [TOMOYO_PATH2_INO]            = "path2.ino",
        [TOMOYO_PATH2_MAJOR]          = "path2.major",
        [TOMOYO_PATH2_MINOR]          = "path2.minor",
        [TOMOYO_PATH2_PERM]           = "path2.perm",
        [TOMOYO_PATH2_TYPE]           = "path2.type",
        [TOMOYO_PATH2_DEV_MAJOR]      = "path2.dev_major",
        [TOMOYO_PATH2_DEV_MINOR]      = "path2.dev_minor",
        [TOMOYO_PATH1_PARENT_UID]     = "path1.parent.uid",
        [TOMOYO_PATH1_PARENT_GID]     = "path1.parent.gid",
        [TOMOYO_PATH1_PARENT_INO]     = "path1.parent.ino",
        [TOMOYO_PATH1_PARENT_PERM]    = "path1.parent.perm",
        [TOMOYO_PATH2_PARENT_UID]     = "path2.parent.uid",
        [TOMOYO_PATH2_PARENT_GID]     = "path2.parent.gid",
        [TOMOYO_PATH2_PARENT_INO]     = "path2.parent.ino",
        [TOMOYO_PATH2_PARENT_PERM]    = "path2.parent.perm",
};

/*
 * String table for PREFERENCE keyword.
 *
 * Indexed by TOMOYO_PREF_*. tomoyo_write_profile() searches a "PREFERENCE"
 * line for each of these names and tomoyo_read_profile() prints them as
 * "<keyword>=<value>" pairs.
 */
static const char * const tomoyo_pref_keywords[TOMOYO_MAX_PREF] = {
        [TOMOYO_PREF_MAX_AUDIT_LOG]      = "max_audit_log",
        [TOMOYO_PREF_MAX_LEARNING_ENTRY] = "max_learning_entry",
};

/*
 * String table for path operation.
 *
 * Indexed by the TOMOYO_TYPE_* constants listed below; any index below
 * TOMOYO_MAX_PATH_OPERATION that is not listed is left as NULL.
 */
const char * const tomoyo_path_keyword[TOMOYO_MAX_PATH_OPERATION] = {
        [TOMOYO_TYPE_EXECUTE]    = "execute",
        [TOMOYO_TYPE_READ]       = "read",
        [TOMOYO_TYPE_WRITE]      = "write",
        [TOMOYO_TYPE_APPEND]     = "append",
        [TOMOYO_TYPE_UNLINK]     = "unlink",
        [TOMOYO_TYPE_GETATTR]    = "getattr",
        [TOMOYO_TYPE_RMDIR]      = "rmdir",
        [TOMOYO_TYPE_TRUNCATE]   = "truncate",
        [TOMOYO_TYPE_SYMLINK]    = "symlink",
        [TOMOYO_TYPE_CHROOT]     = "chroot",
        [TOMOYO_TYPE_UMOUNT]     = "unmount",
};

/*
 * String table for socket's operation.
 *
 * Indexed by TOMOYO_NETWORK_*; each entry is the operation name as text.
 */
const char * const tomoyo_socket_keyword[TOMOYO_MAX_NETWORK_OPERATION] = {
        [TOMOYO_NETWORK_BIND]    = "bind",
        [TOMOYO_NETWORK_LISTEN]  = "listen",
        [TOMOYO_NETWORK_CONNECT] = "connect",
        [TOMOYO_NETWORK_SEND]    = "send",
};

/*
 * String table for categories.
 *
 * Indexed by TOMOYO_MAC_CATEGORY_*. The names form the middle component of
 * "CONFIG::<category>::<name>" lines parsed by tomoyo_set_mode() and printed
 * by tomoyo_read_profile().
 */
static const char * const tomoyo_category_keywords
[TOMOYO_MAX_MAC_CATEGORY_INDEX] = {
        [TOMOYO_MAC_CATEGORY_FILE]    = "file",
        [TOMOYO_MAC_CATEGORY_NETWORK] = "network",
        [TOMOYO_MAC_CATEGORY_MISC]    = "misc",
};

/*
 * Permit policy management by non-root user?
 *
 * Toggled by writing "manage_by_non_root" through tomoyo_write_manager().
 * While false, tomoyo_manager() rejects callers whose uid or euid is not
 * GLOBAL_ROOT_UID.
 */
static bool tomoyo_manage_by_non_root;

/* Utility functions. */

/**
 * tomoyo_addprintf - strncat()-like-snprintf().
 *
 * @buffer: Buffer to write to. Must be '\0'-terminated.
 * @len:    Size of @buffer.
 * @fmt:    The printf()'s format string, followed by parameters.
 *
 * Appends the formatted text after the current contents of @buffer. Output
 * that does not fit is silently truncated; the resulting string never grows
 * beyond @len - 2 characters plus the terminating '\0'. If @buffer has no
 * room left, it is left untouched.
 *
 * Returns nothing.
 */
__printf(3, 4)
static void tomoyo_addprintf(char *buffer, int len, const char *fmt, ...)
{
        va_list args;
        const int pos = strlen(buffer);
        const int room = len - pos - 1;

        /*
         * Nothing can be appended once the buffer is full. Bail out rather
         * than hand vsnprintf() a zero or negative size: a negative int
         * would be converted to a huge size_t.
         */
        if (room <= 0)
                return;
        va_start(args, fmt);
        vsnprintf(buffer + pos, room, fmt, args);
        va_end(args);
}

/**
 * tomoyo_flush - Flush queued string to userspace's buffer.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 *
 * Copies the queued strings (@head->r.w[], @head->r.w_pos entries) to
 * @head->read_user_buf, front of the queue first. Stops as soon as the
 * userspace buffer has no room left or copy_to_user() reports a failure.
 * When an entry could only be copied in part, its advanced position is
 * stored back in the queue so that the next call continues from there.
 *
 * Returns true if all data was flushed, false otherwise.
 */
static bool tomoyo_flush(struct tomoyo_io_buffer *head)
{
        while (head->r.w_pos) {
                const char *w = head->r.w[0];
                size_t len = strlen(w);

                if (len) {
                        /* Copy no more than userspace can still accept. */
                        if (len > head->read_user_buf_avail)
                                len = head->read_user_buf_avail;
                        if (!len)
                                return false;
                        if (copy_to_user(head->read_user_buf, w, len))
                                return false;
                        head->read_user_buf_avail -= len;
                        head->read_user_buf += len;
                        w += len;
                }
                /* Remember how far the front entry has been consumed. */
                head->r.w[0] = w;
                /* Text remains: the userspace buffer ran out of room. */
                if (*w)
                        return false;
                /* Add '\0' for audit logs and query. */
                if (head->poll) {
                        if (!head->read_user_buf_avail ||
                            copy_to_user(head->read_user_buf, "", 1))
                                return false;
                        head->read_user_buf_avail--;
                        head->read_user_buf++;
                }
                /* Front entry is done: drop it and shift the rest forward. */
                head->r.w_pos--;
                for (len = 0; len < head->r.w_pos; len++)
                        head->r.w[len] = head->r.w[len + 1];
        }
        /* Queue is empty: reset the offset tomoyo_io_printf() appends at. */
        head->r.avail = 0;
        return true;
}

/**
 * tomoyo_set_string - Queue string to "struct tomoyo_io_buffer" structure.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 * @string: String to print.
 *
 * Only the pointer is queued, not a copy of the text, so @string has to be
 * kept valid until @head is kfree()d. A char[] living on the stack therefore
 * must not be passed to this function; use tomoyo_io_printf() for that case.
 *
 * A flush is attempted right after queueing. If the queue already holds
 * TOMOYO_MAX_IO_READ_QUEUE entries, @string is dropped and a warning fires.
 */
static void tomoyo_set_string(struct tomoyo_io_buffer *head, const char *string)
{
        if (head->r.w_pos >= TOMOYO_MAX_IO_READ_QUEUE) {
                /* No free slot left in the queue. */
                WARN_ON(1);
                return;
        }
        head->r.w[head->r.w_pos++] = string;
        tomoyo_flush(head);
}

static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
                             ...) __printf(2, 3);

/**
 * tomoyo_io_printf - printf() to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @fmt:  The printf()'s format string, followed by parameters.
 *
 * Formats into the unused tail of @head->read_buf (starting at
 * @head->r.avail) and queues the result with tomoyo_set_string(). If the
 * buffer has no room left nothing happens; if the formatted text does not
 * fit, a warning fires and nothing is queued.
 */
static void tomoyo_io_printf(struct tomoyo_io_buffer *head, const char *fmt,
                             ...)
{
        const size_t start = head->r.avail;
        const int room = head->readbuf_size - start;
        size_t used;
        va_list ap;

        if (room <= 0)
                return;
        va_start(ap, fmt);
        /* The terminating '\0' counts as consumed space as well. */
        used = vsnprintf(head->read_buf + start, room, fmt, ap) + 1;
        va_end(ap);
        if (start + used >= head->readbuf_size) {
                WARN_ON(1);
                return;
        }
        head->r.avail += used;
        tomoyo_set_string(head, head->read_buf + start);
}

/**
 * tomoyo_set_space - Put a space to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal " " via tomoyo_set_string().
 *
 * Returns nothing.
 */
static void tomoyo_set_space(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, " ");
}

/**
 * tomoyo_set_lf - Put a line feed to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns true if no queued string is left pending after the line feed was
 * queued (@head->r.w_pos is zero), false otherwise.
 */
static bool tomoyo_set_lf(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, "\n");
        return !head->r.w_pos;
}

/**
 * tomoyo_set_slash - Put a slash to "struct tomoyo_io_buffer" structure.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Queues the string literal "/" via tomoyo_set_string().
 *
 * Returns nothing.
 */
static void tomoyo_set_slash(struct tomoyo_io_buffer *head)
{
        tomoyo_set_string(head, "/");
}

/* List of namespaces. Entries are appended by tomoyo_init_policy_namespace(). */
LIST_HEAD(tomoyo_namespace_list);
/*
 * True if namespace other than tomoyo_kernel_namespace is defined.
 * While false, tomoyo_print_namespace() prints nothing.
 */
static bool tomoyo_namespace_enabled;

/**
 * tomoyo_init_policy_namespace - Initialize namespace.
 *
 * @ns: Pointer to "struct tomoyo_policy_namespace".
 *
 * Resets every list head embedded in @ns, stamps the profile version and
 * appends @ns to tomoyo_namespace_list. tomoyo_namespace_enabled becomes
 * true when the list already held an entry before @ns was added.
 *
 * Returns nothing.
 */
void tomoyo_init_policy_namespace(struct tomoyo_policy_namespace *ns)
{
        unsigned int i;
        bool first_namespace;

        for (i = 0; i < TOMOYO_MAX_POLICY; i++) {
                INIT_LIST_HEAD(&ns->policy_list[i]);
        }
        for (i = 0; i < TOMOYO_MAX_GROUP; i++) {
                INIT_LIST_HEAD(&ns->group_list[i]);
        }
        for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++) {
                INIT_LIST_HEAD(&ns->acl_group[i]);
        }
        ns->profile_version = 20150505;
        /* Must be sampled before @ns itself is linked into the list. */
        first_namespace = list_empty(&tomoyo_namespace_list);
        tomoyo_namespace_enabled = !first_namespace;
        list_add_tail_rcu(&ns->namespace_list, &tomoyo_namespace_list);
}

/**
 * tomoyo_print_namespace - Print namespace header.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_print_namespace(struct tomoyo_io_buffer *head)
{
        if (!tomoyo_namespace_enabled)
                return;
        tomoyo_set_string(head,
                          container_of(head->r.ns,
                                       struct tomoyo_policy_namespace,
                                       namespace_list)->name);
        tomoyo_set_space(head);
}

/**
 * tomoyo_print_name_union - Print a tomoyo_name_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * Queues a leading space, then either "@" followed by the group name (when
 * @ptr refers to a group) or the plain filename.
 *
 * Returns nothing.
 */
static void tomoyo_print_name_union(struct tomoyo_io_buffer *head,
                                    const struct tomoyo_name_union *ptr)
{
        tomoyo_set_space(head);
        if (!ptr->group) {
                tomoyo_set_string(head, ptr->filename->name);
                return;
        }
        tomoyo_set_string(head, "@");
        tomoyo_set_string(head, ptr->group->group_name->name);
}

/**
 * tomoyo_print_name_union_quoted - Print a tomoyo_name_union with a quote.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_name_union".
 *
 * A group reference is queued as "@" followed by the group name, without
 * quotes. A plain filename is queued surrounded by double quotes. No
 * leading space is emitted.
 *
 * Returns nothing.
 */
static void tomoyo_print_name_union_quoted(struct tomoyo_io_buffer *head,
                                           const struct tomoyo_name_union *ptr)
{
        if (!ptr->group) {
                tomoyo_set_string(head, "\"");
                tomoyo_set_string(head, ptr->filename->name);
                tomoyo_set_string(head, "\"");
                return;
        }
        tomoyo_set_string(head, "@");
        tomoyo_set_string(head, ptr->group->group_name->name);
}

/**
 * tomoyo_print_number_union_nospace - Print a tomoyo_number_union without a space.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_number_union".
 *
 * A group reference is queued as "@" followed by the group name. Otherwise
 * the numeric range is formatted as "<min>" when both ends have the same
 * value and the same value type, or as "<min>-<max>" when they differ. Each
 * end is rendered in hexadecimal, octal or decimal according to its own
 * value type.
 *
 * Returns nothing.
 */
static void tomoyo_print_number_union_nospace
(struct tomoyo_io_buffer *head, const struct tomoyo_number_union *ptr)
{
        char text[128];
        unsigned long value;
        unsigned long upper;
        u8 type;
        u8 upper_type;
        int pass;

        if (ptr->group) {
                tomoyo_set_string(head, "@");
                tomoyo_set_string(head, ptr->group->group_name->name);
                return;
        }

        value = ptr->values[0];
        upper = ptr->values[1];
        type = ptr->value_type[0];
        upper_type = ptr->value_type[1];
        text[0] = '\0';
        /* First pass prints the lower end, second pass the upper end. */
        for (pass = 0; pass < 2; pass++) {
                switch (type) {
                case TOMOYO_VALUE_TYPE_HEXADECIMAL:
                        tomoyo_addprintf(text, sizeof(text), "0x%lX", value);
                        break;
                case TOMOYO_VALUE_TYPE_OCTAL:
                        tomoyo_addprintf(text, sizeof(text), "0%lo", value);
                        break;
                default:
                        tomoyo_addprintf(text, sizeof(text), "%lu", value);
                        break;
                }
                /* Done once the end just printed equals the upper end. */
                if (value == upper && type == upper_type)
                        break;
                tomoyo_addprintf(text, sizeof(text), "-");
                type = upper_type;
                value = upper;
        }
        tomoyo_io_printf(head, "%s", text);
}

/**
 * tomoyo_print_number_union - Print a tomoyo_number_union.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @ptr:  Pointer to "struct tomoyo_number_union".
 *
 * Same output as tomoyo_print_number_union_nospace(), preceded by a space.
 *
 * Returns nothing.
 */
static void tomoyo_print_number_union(struct tomoyo_io_buffer *head,
                                      const struct tomoyo_number_union *ptr)
{
        tomoyo_set_space(head);
        tomoyo_print_number_union_nospace(head, ptr);
}

/**
 * tomoyo_assign_profile - Create a new profile.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number to create.
 *
 * Returns the profile already registered under @profile if there is one.
 * Otherwise a zeroed profile is allocated, filled with default settings and
 * published in @ns->profile_ptr[] while holding tomoyo_policy_lock.
 *
 * Returns pointer to "struct tomoyo_profile" on success, NULL otherwise
 * (@profile out of range, lock acquisition interrupted, or the new entry
 * was not accepted by tomoyo_memory_ok()).
 */
static struct tomoyo_profile *tomoyo_assign_profile
(struct tomoyo_policy_namespace *ns, const unsigned int profile)
{
        struct tomoyo_profile *ptr;
        struct tomoyo_profile *entry;

        if (profile >= TOMOYO_MAX_PROFILES)
                return NULL;
        /* Fast path: the slot is already populated. */
        ptr = ns->profile_ptr[profile];
        if (ptr)
                return ptr;
        /*
         * Allocate before taking the lock. The result is not checked here;
         * NOTE(review): presumably tomoyo_memory_ok() rejects NULL - confirm.
         */
        entry = kzalloc(sizeof(*entry), GFP_NOFS | __GFP_NOWARN);
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        /* Re-read under the lock: the slot may have been filled meanwhile. */
        ptr = ns->profile_ptr[profile];
        if (!ptr && tomoyo_memory_ok(entry)) {
                ptr = entry;
                ptr->default_config = TOMOYO_CONFIG_DISABLED |
                        TOMOYO_CONFIG_WANT_GRANT_LOG |
                        TOMOYO_CONFIG_WANT_REJECT_LOG;
                memset(ptr->config, TOMOYO_CONFIG_USE_DEFAULT,
                       sizeof(ptr->config));
                ptr->pref[TOMOYO_PREF_MAX_AUDIT_LOG] =
                        CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG;
                ptr->pref[TOMOYO_PREF_MAX_LEARNING_ENTRY] =
                        CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY;
                mb(); /* Avoid out-of-order execution. */
                ns->profile_ptr[profile] = ptr;
                /* Ownership moved to @ns; keep kfree() below from freeing it. */
                entry = NULL;
        }
        mutex_unlock(&tomoyo_policy_lock);
 out:
        /* Frees the allocation only if it was not installed above. */
        kfree(entry);
        return ptr;
}

/**
 * tomoyo_profile - Find a profile.
 *
 * @ns:      Pointer to "struct tomoyo_policy_namespace".
 * @profile: Profile number to find.
 *
 * Never returns NULL: when no profile is registered under @profile, a
 * shared zero-initialized placeholder profile is returned instead.
 *
 * Returns pointer to "struct tomoyo_profile".
 */
struct tomoyo_profile *tomoyo_profile(const struct tomoyo_policy_namespace *ns,
                                      const u8 profile)
{
        static struct tomoyo_profile tomoyo_null_profile;
        struct tomoyo_profile *found = ns->profile_ptr[profile];

        return found ? found : &tomoyo_null_profile;
}

/**
 * tomoyo_find_yesno - Find values for specified keyword.
 *
 * @string: String to check.
 * @find:   Name of keyword.
 *
 * Only the first occurrence of @find inside @string is examined.
 *
 * Returns 1 if "@find=yes" was found, 0 if "@find=no" was found, -1 otherwise.
 */
static s8 tomoyo_find_yesno(const char *string, const char *find)
{
        const char *hit = strstr(string, find);
        const char *tail;

        if (!hit)
                return -1;
        /* Look at whatever directly follows the keyword. */
        tail = hit + strlen(find);
        if (strncmp(tail, "=yes", 4) == 0)
                return 1;
        if (strncmp(tail, "=no", 3) == 0)
                return 0;
        return -1;
}

/**
 * tomoyo_set_uint - Set value for specified preference.
 *
 * @i:      Pointer to "unsigned int".
 * @string: String to check.
 * @find:   Name of keyword.
 *
 * Looks for the first occurrence of @find in @string and, if it is directly
 * followed by "=<unsigned number>", stores that number in @i. In every other
 * case @i keeps its previous value.
 *
 * Returns nothing.
 */
static void tomoyo_set_uint(unsigned int *i, const char *string,
                            const char *find)
{
        const char *hit = strstr(string, find);

        if (!hit)
                return;
        sscanf(hit + strlen(find), "=%u", i);
}

/**
 * tomoyo_set_mode - Set mode for specified profile.
 *
 * @name:    Name of functionality.
 * @value:   Mode for @name.
 * @profile: Pointer to "struct tomoyo_profile".
 *
 * @name is either "CONFIG" (the profile's default configuration),
 * "CONFIG::<category>" or "CONFIG::<category>::<functionality>". @value is
 * searched for "use_default", for one of the tomoyo_mode[] names and for
 * "grant_log=yes|no" / "reject_log=yes|no".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_set_mode(char *name, const char *value,
                           struct tomoyo_profile *profile)
{
        u8 i;
        u8 config;

        if (!strcmp(name, "CONFIG")) {
                /* One past the last valid index selects default_config. */
                i = TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX;
                config = profile->default_config;
        } else if (tomoyo_str_starts(&name, "CONFIG::")) {
                /*
                 * NOTE(review): tomoyo_str_starts() presumably advances @name
                 * past the matched prefix - confirm against its definition.
                 */
                config = 0;
                for (i = 0; i < TOMOYO_MAX_MAC_INDEX
                             + TOMOYO_MAX_MAC_CATEGORY_INDEX; i++) {
                        int len = 0;

                        if (i < TOMOYO_MAX_MAC_INDEX) {
                                /* Individual entry: expect "<category>::". */
                                const u8 c = tomoyo_index2category[i];
                                const char *category =
                                        tomoyo_category_keywords[c];

                                len = strlen(category);
                                if (strncmp(name, category, len) ||
                                    name[len++] != ':' || name[len++] != ':')
                                        continue;
                        }
                        /* The remainder must equal this entry's keyword. */
                        if (strcmp(name + len, tomoyo_mac_keywords[i]))
                                continue;
                        config = profile->config[i];
                        break;
                }
                /* Loop ran to completion: no entry matched @name. */
                if (i == TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX)
                        return -EINVAL;
        } else {
                return -EINVAL;
        }
        if (strstr(value, "use_default")) {
                config = TOMOYO_CONFIG_USE_DEFAULT;
        } else {
                u8 mode;

                for (mode = 0; mode < 4; mode++)
                        if (strstr(value, tomoyo_mode[mode]))
                                /*
                                 * Update lower 3 bits in order to distinguish
                                 * 'config' from 'TOMOYO_CONFIG_USE_DEFAULT'.
                                 */
                                config = (config & ~7) | mode;
                /* Log flags are only applied once a concrete mode is known. */
                if (config != TOMOYO_CONFIG_USE_DEFAULT) {
                        switch (tomoyo_find_yesno(value, "grant_log")) {
                        case 1:
                                config |= TOMOYO_CONFIG_WANT_GRANT_LOG;
                                break;
                        case 0:
                                config &= ~TOMOYO_CONFIG_WANT_GRANT_LOG;
                                break;
                        }
                        switch (tomoyo_find_yesno(value, "reject_log")) {
                        case 1:
                                config |= TOMOYO_CONFIG_WANT_REJECT_LOG;
                                break;
                        case 0:
                                config &= ~TOMOYO_CONFIG_WANT_REJECT_LOG;
                                break;
                        }
                }
        }
        /*
         * Store the result. The default configuration itself is never set to
         * TOMOYO_CONFIG_USE_DEFAULT; such a request is silently ignored.
         */
        if (i < TOMOYO_MAX_MAC_INDEX + TOMOYO_MAX_MAC_CATEGORY_INDEX)
                profile->config[i] = config;
        else if (config != TOMOYO_CONFIG_USE_DEFAULT)
                profile->default_config = config;
        return 0;
}

/**
 * tomoyo_write_profile - Write profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_write_profile(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        unsigned int i;
        char *cp;
        struct tomoyo_profile *profile;

        if (sscanf(data, "PROFILE_VERSION=%u", &head->w.ns->profile_version)
            == 1)
                return 0;
        i = simple_strtoul(data, &cp, 10);
        if (*cp != '-')
                return -EINVAL;
        data = cp + 1;
        profile = tomoyo_assign_profile(head->w.ns, i);
        if (!profile)
                return -EINVAL;
        cp = strchr(data, '=');
        if (!cp)
                return -EINVAL;
        *cp++ = '\0';
        if (!strcmp(data, "COMMENT")) {
                static DEFINE_SPINLOCK(lock);
                const struct tomoyo_path_info *new_comment
                        = tomoyo_get_name(cp);
                const struct tomoyo_path_info *old_comment;

                if (!new_comment)
                        return -ENOMEM;
                spin_lock(&lock);
                old_comment = profile->comment;
                profile->comment = new_comment;
                spin_unlock(&lock);
                tomoyo_put_name(old_comment);
                return 0;
        }
        if (!strcmp(data, "PREFERENCE")) {
                for (i = 0; i < TOMOYO_MAX_PREF; i++)
                        tomoyo_set_uint(&profile->pref[i], cp,
                                        tomoyo_pref_keywords[i]);
                return 0;
        }
        return tomoyo_set_mode(data, cp, profile);
}

/**
 * tomoyo_print_config - Print mode for specified functionality.
 *
 * @head:   Pointer to "struct tomoyo_io_buffer".
 * @config: Mode for that functionality.
 *
 * Emits "={ mode=... grant_log=... reject_log=... }" followed by a newline.
 * The mode name is selected by the lowest two bits of @config.
 *
 * Returns nothing.
 *
 * Caller prints functionality's name.
 */
static void tomoyo_print_config(struct tomoyo_io_buffer *head, const u8 config)
{
        const char *mode = tomoyo_mode[config & 3];
        const char *grant = str_yes_no(config & TOMOYO_CONFIG_WANT_GRANT_LOG);
        const char *reject = str_yes_no(config & TOMOYO_CONFIG_WANT_REJECT_LOG);

        tomoyo_io_printf(head, "={ mode=%s grant_log=%s reject_log=%s }\n",
                         mode, grant, reject);
}

/**
 * tomoyo_read_profile - Read profile table.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Resumable printer driven by @head->r.step:
 *   0 - print "PROFILE_VERSION=";
 *   1 - advance @head->r.index to the next existing profile, or finish;
 *   2 - print the COMMENT and PREFERENCE lines of that profile;
 *   3 - print the profile's default "CONFIG" line;
 *   4 - print one non-default "CONFIG::..." line per pass, then go back to
 *       step 1 for the next profile once all entries were visited.
 * After each step the output is flushed; the function returns as soon as
 * tomoyo_flush() reports that not everything could be delivered, and picks
 * up from the saved state on the next call.
 *
 * Returns nothing.
 */
static void tomoyo_read_profile(struct tomoyo_io_buffer *head)
{
        u8 index;
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        const struct tomoyo_profile *profile;

        if (head->r.eof)
                return;
 next:
        index = head->r.index;
        /* May be NULL here; only dereferenced in steps 2-4. */
        profile = ns->profile_ptr[index];
        switch (head->r.step) {
        case 0:
                tomoyo_print_namespace(head);
                tomoyo_io_printf(head, "PROFILE_VERSION=%u\n",
                                 ns->profile_version);
                head->r.step++;
                break;
        case 1:
                /* Skip over unused profile slots. */
                for ( ; head->r.index < TOMOYO_MAX_PROFILES;
                      head->r.index++)
                        if (ns->profile_ptr[head->r.index])
                                break;
                if (head->r.index == TOMOYO_MAX_PROFILES) {
                        head->r.eof = true;
                        return;
                }
                head->r.step++;
                break;
        case 2:
                {
                        u8 i;
                        const struct tomoyo_path_info *comment =
                                profile->comment;

                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-COMMENT=", index);
                        tomoyo_set_string(head, comment ? comment->name : "");
                        tomoyo_set_lf(head);
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-PREFERENCE={ ", index);
                        for (i = 0; i < TOMOYO_MAX_PREF; i++)
                                tomoyo_io_printf(head, "%s=%u ",
                                                 tomoyo_pref_keywords[i],
                                                 profile->pref[i]);
                        tomoyo_set_string(head, "}\n");
                        head->r.step++;
                }
                break;
        case 3:
                {
                        tomoyo_print_namespace(head);
                        tomoyo_io_printf(head, "%u-%s", index, "CONFIG");
                        tomoyo_print_config(head, profile->default_config);
                        /* Restart the per-entry scan performed by step 4. */
                        head->r.bit = 0;
                        head->r.step++;
                }
                break;
        case 4:
                /* Print at most one entry per pass, then flush. */
                for ( ; head->r.bit < TOMOYO_MAX_MAC_INDEX
                              + TOMOYO_MAX_MAC_CATEGORY_INDEX; head->r.bit++) {
                        const u8 i = head->r.bit;
                        const u8 config = profile->config[i];

                        if (config == TOMOYO_CONFIG_USE_DEFAULT)
                                continue;
                        tomoyo_print_namespace(head);
                        if (i < TOMOYO_MAX_MAC_INDEX)
                                tomoyo_io_printf(head, "%u-CONFIG::%s::%s",
                                                 index,
                                                 tomoyo_category_keywords
                                                 [tomoyo_index2category[i]],
                                                 tomoyo_mac_keywords[i]);
                        else
                                tomoyo_io_printf(head, "%u-CONFIG::%s", index,
                                                 tomoyo_mac_keywords[i]);
                        tomoyo_print_config(head, config);
                        /* The break below skips the loop's own increment. */
                        head->r.bit++;
                        break;
                }
                /* All entries visited: move on to the next profile. */
                if (head->r.bit == TOMOYO_MAX_MAC_INDEX
                    + TOMOYO_MAX_MAC_CATEGORY_INDEX) {
                        head->r.index++;
                        head->r.step = 1;
                }
                break;
        }
        if (tomoyo_flush(head))
                goto next;
}

/**
 * tomoyo_same_manager - Check for duplicated "struct tomoyo_manager" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_manager(const struct tomoyo_acl_head *a,
                                const struct tomoyo_acl_head *b)
{
        return container_of(a, struct tomoyo_manager, head)->manager ==
                container_of(b, struct tomoyo_manager, head)->manager;
}

/**
 * tomoyo_update_manager_entry - Add a manager entry.
 *
 * @manager:   The path to manager or the domainnamme.
 * @is_delete: True if it is a delete request.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_manager_entry(const char *manager,
                                       const bool is_delete)
{
        struct tomoyo_manager e = { };
        struct tomoyo_acl_param param = {
                /* .ns = &tomoyo_kernel_namespace, */
                .is_delete = is_delete,
                .list = &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER],
        };
        int error = is_delete ? -ENOENT : -ENOMEM;

        if (!tomoyo_correct_domain(manager) &&
            !tomoyo_correct_word(manager))
                return -EINVAL;
        e.manager = tomoyo_get_name(manager);
        if (e.manager) {
                error = tomoyo_update_policy(&e.head, sizeof(e), &param,
                                             tomoyo_same_manager);
                tomoyo_put_name(e.manager);
        }
        return error;
}

/**
 * tomoyo_write_manager - Write manager policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * The special line "manage_by_non_root" switches tomoyo_manage_by_non_root
 * on (or off, for a delete request). Any other line is treated as a manager
 * name to add or delete.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_manager(struct tomoyo_io_buffer *head)
{
        char *line = head->write_buf;

        if (strcmp(line, "manage_by_non_root") != 0)
                return tomoyo_update_manager_entry(line, head->w.is_delete);
        tomoyo_manage_by_non_root = !head->w.is_delete;
        return 0;
}

/**
 * tomoyo_read_manager - Read manager policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Prints one manager name per line, skipping deleted entries. The walk is
 * resumable: @head->r.acl remembers the current position, and the function
 * returns early when pending output could not be flushed.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_manager(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        list_for_each_cookie(head->r.acl, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER]) {
                struct tomoyo_manager *entry =
                        list_entry(head->r.acl, struct tomoyo_manager,
                                   head.list);

                if (!entry->head.is_deleted) {
                        /* Make room before queueing another line. */
                        if (!tomoyo_flush(head))
                                return;
                        tomoyo_set_string(head, entry->manager->name);
                        tomoyo_set_lf(head);
                }
        }
        head->r.eof = true;
}

/**
 * tomoyo_manager - Check whether the current process is a policy manager.
 *
 * The checks are applied in this order:
 *   - before the policy is loaded (tomoyo_policy_loaded is false), everyone
 *     is treated as a manager;
 *   - unless tomoyo_manage_by_non_root is set, both uid and euid of the
 *     current task must be GLOBAL_ROOT_UID;
 *   - the current domainname or the string returned by tomoyo_get_exe() must
 *     match a non-deleted entry in the manager list. This last requirement
 *     is waived when CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is
 *     enabled.
 *
 * Returns true if the current process is permitted to modify policy
 * via /sys/kernel/security/tomoyo/ interface.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_manager(void)
{
        struct tomoyo_manager *ptr;
        const char *exe;
        const struct task_struct *task = current;
        const struct tomoyo_path_info *domainname = tomoyo_domain()->domainname;
        /* Starts out true only for the insecure built-in configuration. */
        bool found = IS_ENABLED(CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING);

        if (!tomoyo_policy_loaded)
                return true;
        if (!tomoyo_manage_by_non_root &&
            (!uid_eq(task->cred->uid,  GLOBAL_ROOT_UID) ||
             !uid_eq(task->cred->euid, GLOBAL_ROOT_UID)))
                return false;
        /* The returned string is released with kfree() before returning. */
        exe = tomoyo_get_exe();
        if (!exe)
                return false;
        list_for_each_entry_rcu(ptr, &tomoyo_kernel_namespace.policy_list[TOMOYO_ID_MANAGER], head.list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* An entry matches by domainname or by the exe string. */
                if (!ptr->head.is_deleted &&
                    (!tomoyo_pathcmp(domainname, ptr->manager) ||
                     !strcmp(exe, ptr->manager->name))) {
                        found = true;
                        break;
                }
        }
        if (!found) { /* Reduce error messages. */
                /* Warn only when the pid differs from the last one reported. */
                static pid_t last_pid;
                const pid_t pid = current->pid;

                if (last_pid != pid) {
                        pr_warn("%s ( %s ) is not permitted to update policies.\n",
                                domainname->name, exe);
                        last_pid = pid;
                }
        }
        kfree(exe);
        return found;
}

/*
 * Forward declaration: tomoyo_select_domain() calls this to resolve a
 * "select Q=<serial>" request.
 */
static struct tomoyo_domain_info *tomoyo_find_domain_by_qid
(unsigned int serial);

/**
 * tomoyo_select_domain - Parse select command.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @data: String to parse.
 *
 * Recognized forms are "select pid=<n>", "select global-pid=<n>",
 * "select domain=<name>" and "select Q=<n>". The domain found (possibly
 * NULL) is stored in @head->w.domain. If @head also has a read buffer, the
 * read state is reset so that only that domain is printed, or marked as
 * end-of-file when no domain was found.
 *
 * Returns true on success, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_select_domain(struct tomoyo_io_buffer *head,
                                 const char *data)
{
        unsigned int pid;
        struct tomoyo_domain_info *domain = NULL;
        bool global_pid = false;

        if (strncmp(data, "select ", 7))
                return false;
        data += 7;
        /*
         * The comma expression sets global_pid only when "pid=" did not
         * match; the flag is read nowhere outside this branch.
         */
        if (sscanf(data, "pid=%u", &pid) == 1 ||
            (global_pid = true, sscanf(data, "global-pid=%u", &pid) == 1)) {
                struct task_struct *p;

                /* The task lookup is done inside an RCU read-side section. */
                rcu_read_lock();
                if (global_pid)
                        p = find_task_by_pid_ns(pid, &init_pid_ns);
                else
                        p = find_task_by_vpid(pid);
                if (p)
                        domain = tomoyo_task(p)->domain_info;
                rcu_read_unlock();
        } else if (!strncmp(data, "domain=", 7)) {
                if (tomoyo_domain_def(data + 7))
                        domain = tomoyo_find_domain(data + 7);
        } else if (sscanf(data, "Q=%u", &pid) == 1) {
                /* 'pid' is reused here to hold the query serial number. */
                domain = tomoyo_find_domain_by_qid(pid);
        } else
                return false;
        head->w.domain = domain;
        /* Accessing read_buf is safe because head->io_sem is held. */
        if (!head->read_buf)
                return true; /* Do nothing if open(O_WRONLY). */
        /* Restart reading from scratch, restricted to the selected domain. */
        memset(&head->r, 0, sizeof(head->r));
        head->r.print_this_domain_only = true;
        if (domain)
                head->r.domain = &domain->list;
        else
                head->r.eof = true;
        tomoyo_io_printf(head, "# select %s\n", data);
        if (domain && domain->is_deleted)
                tomoyo_io_printf(head, "# This is a deleted domain.\n");
        return true;
}

/**
 * tomoyo_same_task_acl - Check for duplicated "struct tomoyo_task_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_task_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_task_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_task_acl *p2 = container_of(b, typeof(*p2), head);

        return p1->domainname == p2->domainname;
}

/**
 * tomoyo_write_task - Update task related list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_task(struct tomoyo_acl_param *param)
{
        int error = -EINVAL;

        if (tomoyo_str_starts(&param->data, "manual_domain_transition ")) {
                struct tomoyo_task_acl e = {
                        .head.type = TOMOYO_TYPE_MANUAL_TASK_ACL,
                        .domainname = tomoyo_get_domainname(param),
                };

                if (e.domainname)
                        error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                                     tomoyo_same_task_acl,
                                                     NULL);
                tomoyo_put_name(e.domainname);
        }
        return error;
}

/**
 * tomoyo_delete_domain - Delete a domain.
 *
 * @domainname: The name of domain.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_delete_domain(char *domainname)
{
        struct tomoyo_domain_info *domain;
        struct tomoyo_path_info name;

        name.name = domainname;
        tomoyo_fill_path_info(&name);
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                return -EINTR;
        /* Is there an active domain? */
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                /* Never delete tomoyo_kernel_domain */
                if (domain == &tomoyo_kernel_domain)
                        continue;
                if (domain->is_deleted ||
                    tomoyo_pathcmp(domain->domainname, &name))
                        continue;
                domain->is_deleted = true;
                break;
        }
        mutex_unlock(&tomoyo_policy_lock);
        return 0;
}

/**
 * tomoyo_write_domain2 - Dispatch one ACL line to its category handler.
 *
 * @ns:        Pointer to "struct tomoyo_policy_namespace".
 * @list:      Pointer to "struct list_head".
 * @data:      Policy to be interpreted.
 * @is_delete: True if it is a delete request.
 *
 * The leading category keyword of @data selects the handler; the handler
 * receives the remainder of the line through "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise (-EINVAL when no category
 * keyword matches).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_domain2(struct tomoyo_policy_namespace *ns,
                                struct list_head *list, char *data,
                                const bool is_delete)
{
        static const struct {
                const char *keyword;
                int (*write)(struct tomoyo_acl_param *param);
        } handlers[] = {
                { "file ", tomoyo_write_file },
                { "network inet ", tomoyo_write_inet_network },
                { "network unix ", tomoyo_write_unix_network },
                { "misc ", tomoyo_write_misc },
                { "task ", tomoyo_write_task },
        };
        struct tomoyo_acl_param param = {
                .ns = ns,
                .list = list,
                .data = data,
                .is_delete = is_delete,
        };
        unsigned int n;

        for (n = 0; n < ARRAY_SIZE(handlers); n++) {
                if (tomoyo_str_starts(&param.data, handlers[n].keyword))
                        return handlers[n].write(&param);
        }
        return -EINVAL;
}

/*
 * String table for domain flags, indexed by TOMOYO_DIF_* value.
 *
 * Every entry ends with '\n': tomoyo_read_domain() prints the strings as they
 * are, while tomoyo_write_domain() compares input against all but that last
 * character.
 */
const char * const tomoyo_dif[TOMOYO_MAX_DOMAIN_INFO_FLAGS] = {
        [TOMOYO_DIF_QUOTA_WARNED]      = "quota_exceeded\n",
        [TOMOYO_DIF_TRANSITION_FAILED] = "transition_failed\n",
};

/**
 * tomoyo_write_domain - Write domain policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_domain(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        struct tomoyo_policy_namespace *ns;
        struct tomoyo_domain_info *domain = head->w.domain;
        const bool is_delete = head->w.is_delete;
        bool is_select = !is_delete && tomoyo_str_starts(&data, "select ");
        unsigned int idx;

        if (*data == '<') {
                int ret = 0;

                domain = NULL;
                if (is_delete)
                        ret = tomoyo_delete_domain(data);
                else if (is_select)
                        domain = tomoyo_find_domain(data);
                else
                        domain = tomoyo_assign_domain(data, false);
                head->w.domain = domain;
                return ret;
        }
        if (!domain)
                return -EINVAL;
        ns = domain->ns;
        if (sscanf(data, "use_profile %u", &idx) == 1
            && idx < TOMOYO_MAX_PROFILES) {
                if (!tomoyo_policy_loaded || ns->profile_ptr[idx])
                        if (!is_delete)
                                domain->profile = (u8) idx;
                return 0;
        }
        if (sscanf(data, "use_group %u\n", &idx) == 1
            && idx < TOMOYO_MAX_ACL_GROUPS) {
                if (!is_delete)
                        set_bit(idx, domain->group);
                else
                        clear_bit(idx, domain->group);
                return 0;
        }
        for (idx = 0; idx < TOMOYO_MAX_DOMAIN_INFO_FLAGS; idx++) {
                const char *cp = tomoyo_dif[idx];

                if (strncmp(data, cp, strlen(cp) - 1))
                        continue;
                domain->flags[idx] = !is_delete;
                return 0;
        }
        return tomoyo_write_domain2(ns, &domain->acl_info_list, data,
                                    is_delete);
}

/**
 * tomoyo_print_condition - Print condition part.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @cond: Pointer to "struct tomoyo_condition".
 *
 * Printing is resumable.  head->r.cond_step records the stage to continue
 * from and head->r.cond_index the number of condition elements already
 * handled, so that a later call with the same @cond picks up where a call
 * that returned false stopped.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_print_condition(struct tomoyo_io_buffer *head,
                                   const struct tomoyo_condition *cond)
{
        switch (head->r.cond_step) {
        case 0:
                /* Fresh start: reset the element counter, print transit name if any. */
                head->r.cond_index = 0;
                head->r.cond_step++;
                if (cond->transit) {
                        tomoyo_set_space(head);
                        tomoyo_set_string(head, cond->transit->name);
                }
                fallthrough;
        case 1:
                {
                        /*
                         * The variable-length parts are addressed as arrays
                         * placed back to back right after *cond, in this
                         * order: condition elements, number unions, name
                         * unions, argv entries, envp entries.
                         */
                        const u16 condc = cond->condc;
                        const struct tomoyo_condition_element *condp =
                                (typeof(condp)) (cond + 1);
                        const struct tomoyo_number_union *numbers_p =
                                (typeof(numbers_p)) (condp + condc);
                        const struct tomoyo_name_union *names_p =
                                (typeof(names_p))
                                (numbers_p + cond->numbers_count);
                        const struct tomoyo_argv *argv =
                                (typeof(argv)) (names_p + cond->names_count);
                        const struct tomoyo_envp *envp =
                                (typeof(envp)) (argv + cond->argc);
                        u16 skip;

                        /*
                         * Re-walk the elements handled by earlier calls,
                         * without printing, to move every array cursor to
                         * the position it had when output stopped.
                         */
                        for (skip = 0; skip < head->r.cond_index; skip++) {
                                const u8 left = condp->left;
                                const u8 right = condp->right;

                                condp++;
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
                                        argv++;
                                        continue;
                                case TOMOYO_ENVP_ENTRY:
                                        envp++;
                                        continue;
                                case TOMOYO_NUMBER_UNION:
                                        numbers_p++;
                                        break;
                                }
                                switch (right) {
                                case TOMOYO_NAME_UNION:
                                        names_p++;
                                        break;
                                case TOMOYO_NUMBER_UNION:
                                        numbers_p++;
                                        break;
                                }
                        }
                        /* Print the remaining elements, one per iteration. */
                        while (head->r.cond_index < condc) {
                                const u8 match = condp->equals;
                                const u8 left = condp->left;
                                const u8 right = condp->right;

                                /*
                                 * Flush before consuming the element: on
                                 * failure cond_index still refers to this
                                 * not yet printed element.
                                 */
                                if (!tomoyo_flush(head))
                                        return false;
                                condp++;
                                head->r.cond_index++;
                                tomoyo_set_space(head);
                                switch (left) {
                                case TOMOYO_ARGV_ENTRY:
                                        /*
                                         * argv and envp entries print their
                                         * own operator and value, hence the
                                         * "continue" instead of "break".
                                         */
                                        tomoyo_io_printf(head,
                                                         "exec.argv[%lu]%s=\"",
                                                         argv->index, argv->is_not ? "!" : "");
                                        tomoyo_set_string(head,
                                                          argv->value->name);
                                        tomoyo_set_string(head, "\"");
                                        argv++;
                                        continue;
                                case TOMOYO_ENVP_ENTRY:
                                        tomoyo_set_string(head,
                                                          "exec.envp[\"");
                                        tomoyo_set_string(head,
                                                          envp->name->name);
                                        tomoyo_io_printf(head, "\"]%s=", envp->is_not ? "!" : "");
                                        /* An entry without a value is printed as NULL. */
                                        if (envp->value) {
                                                tomoyo_set_string(head, "\"");
                                                tomoyo_set_string(head, envp->value->name);
                                                tomoyo_set_string(head, "\"");
                                        } else {
                                                tomoyo_set_string(head,
                                                                  "NULL");
                                        }
                                        envp++;
                                        continue;
                                case TOMOYO_NUMBER_UNION:
                                        tomoyo_print_number_union_nospace
                                                (head, numbers_p++);
                                        break;
                                default:
                                        /* Any other value indexes the keyword table. */
                                        tomoyo_set_string(head,
                                               tomoyo_condition_keyword[left]);
                                        break;
                                }
                                /* Operator, then the right-hand operand. */
                                tomoyo_set_string(head, match ? "=" : "!=");
                                switch (right) {
                                case TOMOYO_NAME_UNION:
                                        tomoyo_print_name_union_quoted
                                                (head, names_p++);
                                        break;
                                case TOMOYO_NUMBER_UNION:
                                        tomoyo_print_number_union_nospace
                                                (head, numbers_p++);
                                        break;
                                default:
                                        tomoyo_set_string(head,
                                          tomoyo_condition_keyword[right]);
                                        break;
                                }
                        }
                }
                head->r.cond_step++;
                fallthrough;
        case 2:
                /* On flush failure leave the switch and return false below. */
                if (!tomoyo_flush(head))
                        break;
                head->r.cond_step++;
                fallthrough;
        case 3:
                /* grant_log is printed only when it differs from the AUTO value. */
                if (cond->grant_log != TOMOYO_GRANTLOG_AUTO)
                        tomoyo_io_printf(head, " grant_log=%s",
                                         str_yes_no(cond->grant_log ==
                                                    TOMOYO_GRANTLOG_YES));
                tomoyo_set_lf(head);
                return true;
        }
        return false;
}

/**
 * tomoyo_set_group - Print "acl_group " header keyword and category name.
 *
 * @head:     Pointer to "struct tomoyo_io_buffer".
 * @category: Category name.
 *
 * Returns nothing.
 */
static void tomoyo_set_group(struct tomoyo_io_buffer *head,
                             const char *category)
{
        if (head->type == TOMOYO_EXCEPTIONPOLICY) {
                tomoyo_print_namespace(head);
                tomoyo_io_printf(head, "acl_group %u ",
                                 head->r.acl_group_index);
        }
        tomoyo_set_string(head, category);
}

/**
 * tomoyo_print_entry - Print an ACL entry.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @acl:  Pointer to an ACL entry.
 *
 * One line is produced per entry: a category keyword (preceded by an
 * "acl_group" header when reading the exception policy, see
 * tomoyo_set_group()), the permitted operations, the operands, and finally
 * the condition part if the entry has one.
 *
 * Printing is resumable: while head->r.print_cond_part is set, the next call
 * for the same @acl jumps straight to the condition part.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_print_entry(struct tomoyo_io_buffer *head,
                               struct tomoyo_acl_info *acl)
{
        const u8 acl_type = acl->type;
        /* Stays true until the first operation keyword has been printed. */
        bool first = true;
        u8 bit;

        /* A previous call stopped inside the condition part; continue there. */
        if (head->r.print_cond_part)
                goto print_cond_part;
        /* Deleted entries print nothing and count as success. */
        if (acl->is_deleted)
                return true;
        /* Note: the whole per-type chain below is the "else" of this flush check. */
        if (!tomoyo_flush(head))
                return false;
        else if (acl_type == TOMOYO_TYPE_PATH_ACL) {
                struct tomoyo_path_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u16 perm = ptr->perm;

                /*
                 * Print each operation whose bit is set in perm; operations
                 * after the first are joined with tomoyo_set_slash().
                 */
                for (bit = 0; bit < TOMOYO_MAX_PATH_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        /* In transition-only mode just the execute permission is shown. */
                        if (head->r.print_transition_related_only &&
                            bit != TOMOYO_TYPE_EXECUTE)
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_path_keyword[bit]);
                }
                /* No operation was printed: emit nothing for this entry. */
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
        } else if (acl_type == TOMOYO_TYPE_MANUAL_TASK_ACL) {
                struct tomoyo_task_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "task ");
                tomoyo_set_string(head, "manual_domain_transition ");
                tomoyo_set_string(head, ptr->domainname->name);
        } else if (head->r.print_transition_related_only) {
                /* Every ACL type handled below is suppressed in this mode. */
                return true;
        } else if (acl_type == TOMOYO_TYPE_PATH2_ACL) {
                struct tomoyo_path2_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH2_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        /* The keyword is looked up through the tomoyo_pp2mac[] index table. */
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pp2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name1);
                tomoyo_print_name_union(head, &ptr->name2);
        } else if (acl_type == TOMOYO_TYPE_PATH_NUMBER_ACL) {
                struct tomoyo_path_number_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_PATH_NUMBER_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pn2mac[bit]]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
                tomoyo_print_number_union(head, &ptr->number);
        } else if (acl_type == TOMOYO_TYPE_MKDEV_ACL) {
                struct tomoyo_mkdev_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_MKDEV_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "file ");
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_mac_keywords
                                          [tomoyo_pnnn2mac[bit]]);
                }
                if (first)
                        return true;
                /* Operands: pathname, then mode, major and minor numbers. */
                tomoyo_print_name_union(head, &ptr->name);
                tomoyo_print_number_union(head, &ptr->mode);
                tomoyo_print_number_union(head, &ptr->major);
                tomoyo_print_number_union(head, &ptr->minor);
        } else if (acl_type == TOMOYO_TYPE_INET_ACL) {
                struct tomoyo_inet_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                /* The protocol keyword is printed once, before the first operation. */
                                tomoyo_set_group(head, "network inet ");
                                tomoyo_set_string(head, tomoyo_proto_keyword
                                                  [ptr->protocol]);
                                tomoyo_set_space(head);
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_socket_keyword[bit]);
                }
                if (first)
                        return true;
                tomoyo_set_space(head);
                /* The address is either "@" plus a group name or a formatted IP address. */
                if (ptr->address.group) {
                        tomoyo_set_string(head, "@");
                        tomoyo_set_string(head, ptr->address.group->group_name
                                          ->name);
                } else {
                        char buf[128];

                        tomoyo_print_ip(buf, sizeof(buf), &ptr->address);
                        tomoyo_io_printf(head, "%s", buf);
                }
                tomoyo_print_number_union(head, &ptr->port);
        } else if (acl_type == TOMOYO_TYPE_UNIX_ACL) {
                struct tomoyo_unix_acl *ptr =
                        container_of(acl, typeof(*ptr), head);
                const u8 perm = ptr->perm;

                for (bit = 0; bit < TOMOYO_MAX_NETWORK_OPERATION; bit++) {
                        if (!(perm & (1 << bit)))
                                continue;
                        if (first) {
                                tomoyo_set_group(head, "network unix ");
                                tomoyo_set_string(head, tomoyo_proto_keyword
                                                  [ptr->protocol]);
                                tomoyo_set_space(head);
                                first = false;
                        } else {
                                tomoyo_set_slash(head);
                        }
                        tomoyo_set_string(head, tomoyo_socket_keyword[bit]);
                }
                if (first)
                        return true;
                tomoyo_print_name_union(head, &ptr->name);
        } else if (acl_type == TOMOYO_TYPE_MOUNT_ACL) {
                struct tomoyo_mount_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                /* Mount entries have no permission bits; the operands follow directly. */
                tomoyo_set_group(head, "file mount");
                tomoyo_print_name_union(head, &ptr->dev_name);
                tomoyo_print_name_union(head, &ptr->dir_name);
                tomoyo_print_name_union(head, &ptr->fs_type);
                tomoyo_print_number_union(head, &ptr->flags);
        } else if (acl_type == TOMOYO_TYPE_ENV_ACL) {
                struct tomoyo_env_acl *ptr =
                        container_of(acl, typeof(*ptr), head);

                tomoyo_set_group(head, "misc env ");
                tomoyo_set_string(head, ptr->env->name);
        }
        if (acl->cond) {
                /*
                 * Record the resume state before flushing, so that a failed
                 * flush makes the next call enter at print_cond_part.
                 */
                head->r.print_cond_part = true;
                head->r.cond_step = 0;
                if (!tomoyo_flush(head))
                        return false;
print_cond_part:
                if (!tomoyo_print_condition(head, acl->cond))
                        return false;
                head->r.print_cond_part = false;
        } else {
                /* No condition: terminate the line here. */
                tomoyo_set_lf(head);
        }
        return true;
}

/**
 * tomoyo_read_domain2 - Print every ACL entry of one list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @list: Pointer to "struct list_head".
 *
 * head->r.acl is the iteration cursor.  It is left in place when an entry
 * could not be printed, and reset to NULL once the whole list is done.
 *
 * Caller holds tomoyo_read_lock().
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_read_domain2(struct tomoyo_io_buffer *head,
                                struct list_head *list)
{
        list_for_each_cookie(head->r.acl, list) {
                struct tomoyo_acl_info *entry;

                entry = list_entry(head->r.acl, struct tomoyo_acl_info, list);
                if (tomoyo_print_entry(head, entry))
                        continue;
                return false;
        }
        head->r.acl = NULL;
        return true;
}

/**
 * tomoyo_read_domain - Read domain policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Walks tomoyo_domain_list using head->r.domain as cursor.  Each domain is
 * printed in stages tracked by head->r.step (0: name, profile and flags,
 * 1: "use_group" lines, 2: ACL entries, 3: finished), so that a call which
 * returns early continues at the same stage of the same domain next time.
 * head->r.eof is set once everything has been printed.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_domain(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        list_for_each_cookie(head->r.domain, &tomoyo_domain_list) {
                struct tomoyo_domain_info *domain =
                        list_entry(head->r.domain, typeof(*domain), list);
                u8 i;

                switch (head->r.step) {
                case 0:
                        /* Deleted domains are shown only when explicitly selected. */
                        if (domain->is_deleted &&
                            !head->r.print_this_domain_only)
                                continue;
                        /* Print domainname and flags. */
                        tomoyo_set_string(head, domain->domainname->name);
                        tomoyo_set_lf(head);
                        tomoyo_io_printf(head, "use_profile %u\n",
                                         domain->profile);
                        for (i = 0; i < TOMOYO_MAX_DOMAIN_INFO_FLAGS; i++)
                                if (domain->flags[i])
                                        tomoyo_set_string(head, tomoyo_dif[i]);
                        head->r.index = 0;
                        head->r.step++;
                        fallthrough;
                case 1:
                        /*
                         * head->r.index is advanced before the bit is
                         * tested, so a retry after a failed flush resumes
                         * with the next group.
                         */
                        while (head->r.index < TOMOYO_MAX_ACL_GROUPS) {
                                i = head->r.index++;
                                if (!test_bit(i, domain->group))
                                        continue;
                                tomoyo_io_printf(head, "use_group %u\n", i);
                                if (!tomoyo_flush(head))
                                        return;
                        }
                        head->r.index = 0;
                        head->r.step++;
                        tomoyo_set_lf(head);
                        fallthrough;
                case 2:
                        /* ACL entries; tomoyo_read_domain2() keeps its own cursor. */
                        if (!tomoyo_read_domain2(head, &domain->acl_info_list))
                                return;
                        /* step is already 3 here, so a retry skips the line feed below. */
                        head->r.step++;
                        if (!tomoyo_set_lf(head))
                                return;
                        fallthrough;
                case 3:
                        /* Domain finished: reset for the next one, or stop in select mode. */
                        head->r.step = 0;
                        if (head->r.print_this_domain_only)
                                goto done;
                }
        }
 done:
        head->r.eof = true;
}

/**
 * tomoyo_write_pid - Specify PID to obtain domainname.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Only clears head->r.eof.  The written text itself is parsed from
 * head->write_buf by tomoyo_read_pid().
 *
 * Returns 0.
 */
static int tomoyo_write_pid(struct tomoyo_io_buffer *head)
{
        head->r.eof = false;
        return 0;
}

/**
 * tomoyo_read_pid - Get domainname of the specified PID.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * The request is taken from head->write_buf: a decimal PID, optionally
 * preceded by "global-pid " to look the PID up in the initial pid namespace
 * instead of the caller's one.  On success "<pid> <profile> <domainname>" is
 * printed; if the PID cannot be parsed or no such task exists, nothing is
 * printed.
 *
 * The PID is specified by tomoyo_write_pid() so that the user can obtain
 * using read()/write() interface rather than sysctl() interface.
 */
static void tomoyo_read_pid(struct tomoyo_io_buffer *head)
{
        struct tomoyo_domain_info *domain = NULL;
        struct task_struct *task;
        char *request = head->write_buf;
        unsigned int pid;
        bool use_init_ns;

        /* Accessing write_buf is safe because head->io_sem is held. */
        if (!request) {
                head->r.eof = true;
                return; /* Do nothing if open(O_RDONLY). */
        }
        if (head->r.w_pos || head->r.eof)
                return;

        /* Each request is answered at most once. */
        head->r.eof = true;

        use_init_ns = tomoyo_str_starts(&request, "global-pid ");
        if (kstrtouint(request, 10, &pid) != 0)
                return;

        rcu_read_lock();
        task = use_init_ns ? find_task_by_pid_ns(pid, &init_pid_ns)
                           : find_task_by_vpid(pid);
        if (task)
                domain = tomoyo_task(task)->domain_info;
        rcu_read_unlock();

        if (domain) {
                tomoyo_io_printf(head, "%u %u ", pid, domain->profile);
                tomoyo_set_string(head, domain->domainname->name);
        }
}

/*
 * String table for domain transition control keywords, indexed by
 * TOMOYO_TRANSITION_CONTROL_* value.  Every keyword includes its trailing
 * space.
 *
 * The pointers themselves are const as well (like tomoyo_dif[]), since the
 * table is never modified.
 */
static const char * const tomoyo_transition_type[TOMOYO_MAX_TRANSITION_TYPE] = {
        [TOMOYO_TRANSITION_CONTROL_NO_RESET]      = "no_reset_domain ",
        [TOMOYO_TRANSITION_CONTROL_RESET]         = "reset_domain ",
        [TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE] = "no_initialize_domain ",
        [TOMOYO_TRANSITION_CONTROL_INITIALIZE]    = "initialize_domain ",
        [TOMOYO_TRANSITION_CONTROL_NO_KEEP]       = "no_keep_domain ",
        [TOMOYO_TRANSITION_CONTROL_KEEP]          = "keep_domain ",
};

/*
 * String table for grouping keywords, indexed by TOMOYO_*_GROUP value.
 * Every keyword includes its trailing space.
 *
 * The pointers themselves are const as well (like tomoyo_dif[]), since the
 * table is never modified.
 */
static const char * const tomoyo_group_name[TOMOYO_MAX_GROUP] = {
        [TOMOYO_PATH_GROUP]    = "path_group ",
        [TOMOYO_NUMBER_GROUP]  = "number_group ",
        [TOMOYO_ADDRESS_GROUP] = "address_group ",
};

/**
 * tomoyo_write_exception - Write exception policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * The leading keyword of the line selects the handler: "aggregator ", one of
 * the domain transition control keywords, one of the group keywords, or
 * "acl_group <index> " followed by an ACL entry.
 *
 * Returns 0 on success, negative value otherwise (-EINVAL when the line is
 * not recognized or the acl_group index is out of range).
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_write_exception(struct tomoyo_io_buffer *head)
{
        const bool is_delete = head->w.is_delete;
        struct tomoyo_acl_param param = {
                .ns = head->w.ns,
                .is_delete = is_delete,
                .data = head->write_buf,
        };
        u8 i;

        if (tomoyo_str_starts(&param.data, "aggregator "))
                return tomoyo_write_aggregator(&param);
        for (i = 0; i < TOMOYO_MAX_TRANSITION_TYPE; i++)
                if (tomoyo_str_starts(&param.data, tomoyo_transition_type[i]))
                        return tomoyo_write_transition_control(&param, i);
        for (i = 0; i < TOMOYO_MAX_GROUP; i++)
                if (tomoyo_str_starts(&param.data, tomoyo_group_name[i]))
                        return tomoyo_write_group(&param, i);
        if (tomoyo_str_starts(&param.data, "acl_group ")) {
                /*
                 * Keep the full width returned by simple_strtoul().  Storing
                 * it in an unsigned int would truncate values above UINT_MAX
                 * where long is 64 bits wide, letting e.g. "4294967296" pass
                 * the range check below as group 0.
                 */
                unsigned long group;
                char *data;

                group = simple_strtoul(param.data, &data, 10);
                /* The index must be followed by exactly one space. */
                if (group < TOMOYO_MAX_ACL_GROUPS && *data++ == ' ')
                        return tomoyo_write_domain2
                                (head->w.ns, &head->w.ns->acl_group[group],
                                 data, is_delete);
        }
        return -EINVAL;
}

/**
 * tomoyo_read_group - Read "struct tomoyo_path_group"/"struct tomoyo_number_group"/"struct tomoyo_address_group" list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @idx:  Index number.
 *
 * Prints one line per member which is not marked as deleted, for every group
 * of type @idx in the namespace being read. The position is remembered in
 * @head->r.group and @head->r.acl, and both are reset once the whole list has
 * been printed.
 *
 * Returns true on success, false if tomoyo_flush() could not make room.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_read_group(struct tomoyo_io_buffer *head, const int idx)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *groups = &ns->group_list[idx];

        list_for_each_cookie(head->r.group, groups) {
                struct tomoyo_group *grp =
                        list_entry(head->r.group, typeof(*grp), head.list);

                list_for_each_cookie(head->r.acl, &grp->member_list) {
                        struct tomoyo_acl_head *item =
                                list_entry(head->r.acl, typeof(*item), list);

                        if (item->is_deleted)
                                continue;
                        if (!tomoyo_flush(head))
                                return false;
                        /* Common prefix: namespace, keyword, group name. */
                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head, tomoyo_group_name[idx]);
                        tomoyo_set_string(head, grp->group_name->name);
                        /* Type specific member value. */
                        switch (idx) {
                        case TOMOYO_PATH_GROUP: {
                                struct tomoyo_path_group *path =
                                        container_of(item, typeof(*path),
                                                     head);

                                tomoyo_set_space(head);
                                tomoyo_set_string(head,
                                                  path->member_name->name);
                                break;
                        }
                        case TOMOYO_NUMBER_GROUP: {
                                struct tomoyo_number_group *num =
                                        container_of(item, typeof(*num),
                                                     head);

                                tomoyo_print_number_union(head, &num->number);
                                break;
                        }
                        case TOMOYO_ADDRESS_GROUP: {
                                struct tomoyo_address_group *addr =
                                        container_of(item, typeof(*addr),
                                                     head);
                                char text[128];

                                tomoyo_print_ip(text, sizeof(text),
                                                &addr->address);
                                tomoyo_io_printf(head, " %s", text);
                                break;
                        }
                        default:
                                break;
                        }
                        tomoyo_set_lf(head);
                }
                head->r.acl = NULL;
        }
        head->r.group = NULL;
        return true;
}

/**
 * tomoyo_read_policy - Read "struct tomoyo_..._entry" list.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @idx:  Index number.
 *
 * Prints every entry of policy list @idx which is not marked as deleted.
 * Only transition control and aggregator entries produce output; entries of
 * any other list are skipped. The position is remembered in @head->r.acl and
 * reset once the whole list has been walked.
 *
 * Returns true on success, false if tomoyo_flush() could not make room.
 *
 * Caller holds tomoyo_read_lock().
 */
static bool tomoyo_read_policy(struct tomoyo_io_buffer *head, const int idx)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        struct list_head *entries = &ns->policy_list[idx];

        list_for_each_cookie(head->r.acl, entries) {
                struct tomoyo_acl_head *acl =
                        container_of(head->r.acl, typeof(*acl), list);

                if (acl->is_deleted)
                        continue;
                if (!tomoyo_flush(head))
                        return false;
                if (idx == TOMOYO_ID_TRANSITION_CONTROL) {
                        struct tomoyo_transition_control *ctl =
                                container_of(acl, typeof(*ctl), head);
                        const char *program = "any";
                        const char *domainname = "any";

                        /* A missing program/domainname is shown as "any". */
                        if (ctl->program)
                                program = ctl->program->name;
                        if (ctl->domainname)
                                domainname = ctl->domainname->name;
                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head,
                                          tomoyo_transition_type[ctl->type]);
                        tomoyo_set_string(head, program);
                        tomoyo_set_string(head, " from ");
                        tomoyo_set_string(head, domainname);
                } else if (idx == TOMOYO_ID_AGGREGATOR) {
                        struct tomoyo_aggregator *agg =
                                container_of(acl, typeof(*agg), head);

                        tomoyo_print_namespace(head);
                        tomoyo_set_string(head, "aggregator ");
                        tomoyo_set_string(head, agg->original_name->name);
                        tomoyo_set_space(head);
                        tomoyo_set_string(head, agg->aggregated_name->name);
                } else {
                        /* Nothing to print, so no line feed either. */
                        continue;
                }
                tomoyo_set_lf(head);
        }
        head->r.acl = NULL;
        return true;
}

/**
 * tomoyo_read_exception - Read exception policy.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * @head->r.step is the resume point. It first counts through the policy
 * lists, then through the group lists, then through the acl_group lists.
 * The function returns early, leaving @head->r.step untouched, as soon as one
 * of the readers reports that it could not finish.
 *
 * Caller holds tomoyo_read_lock().
 */
static void tomoyo_read_exception(struct tomoyo_io_buffer *head)
{
        struct tomoyo_policy_namespace *ns =
                container_of(head->r.ns, typeof(*ns), namespace_list);
        /* First step value which no longer belongs to a group list. */
        const int group_end = TOMOYO_MAX_POLICY + TOMOYO_MAX_GROUP;
        /* First step value which no longer belongs to an acl_group list. */
        const int acl_group_end = group_end + TOMOYO_MAX_ACL_GROUPS;

        if (head->r.eof)
                return;

        for (; head->r.step < TOMOYO_MAX_POLICY; head->r.step++)
                if (!tomoyo_read_policy(head, head->r.step))
                        return;

        for (; head->r.step < group_end; head->r.step++)
                if (!tomoyo_read_group(head,
                                       head->r.step - TOMOYO_MAX_POLICY))
                        return;

        for (; head->r.step < acl_group_end; head->r.step++) {
                head->r.acl_group_index = head->r.step - group_end;
                if (!tomoyo_read_domain2(head,
                                         &ns->acl_group
                                         [head->r.acl_group_index]))
                        return;
        }
        head->r.eof = true;
}

/*
 * Wait queue for kernel -> userspace notification.
 * tomoyo_supervisor() wakes it up while a query is pending, and
 * tomoyo_poll_query() registers on it.
 */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_query_wait);
/*
 * Wait queue for userspace -> kernel notification.
 * tomoyo_supervisor() sleeps on it, one second at a time, while waiting for
 * the supervisor's answer.
 */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_answer_wait);

/*
 * Structure for query.
 *
 * tomoyo_supervisor() keeps one instance on its stack for each access request
 * which is waiting for a decision.
 *
 * list:      Link in tomoyo_query_list.
 * domain:    Domain of the request (copied from r->domain).
 * query:     Message text returned by tomoyo_init_log().
 * query_len: strlen(query) + 1.
 * serial:    ID taken from a counter in tomoyo_supervisor(); printed as
 *            "Q<serial>-<retry>" and matched against "A<serial>=<answer>".
 * timer:     Number of one-second waits done so far. tomoyo_supervisor()
 *            gives up at 10; tomoyo_write_answer() resets it to 0 for all
 *            queued entries.
 * answer:    Decision stored by tomoyo_write_answer(). 0 means not answered,
 *            1 means granted, 3 means retry; tomoyo_supervisor() keeps
 *            -EPERM for any other value.
 * retry:     Copy of r->retry at the time the query was queued.
 */
struct tomoyo_query {
        struct list_head list;
        struct tomoyo_domain_info *domain;
        char *query;
        size_t query_len;
        unsigned int serial;
        u8 timer;
        u8 answer;
        u8 retry;
};

/* The list for "struct tomoyo_query". */
static LIST_HEAD(tomoyo_query_list);

/* Lock for manipulating tomoyo_query_list. */
static DEFINE_SPINLOCK(tomoyo_query_list_lock);

/*
 * Number of "struct file" referring /sys/kernel/security/tomoyo/query
 * interface. Incremented by tomoyo_open_control() when the interface type is
 * TOMOYO_QUERY. tomoyo_supervisor() queues an enforcing mode request only
 * while this is not zero.
 */
static atomic_t tomoyo_query_observers = ATOMIC_INIT(0);

/**
 * tomoyo_truncate - Truncate a line.
 *
 * @str: String to truncate.
 *
 * Cuts @str at the first byte which is not greater than ' ' (i.e. a space, a
 * control character or the terminating NUL) by storing a NUL there.
 *
 * Returns length of truncated @str, including the terminating NUL.
 */
static int tomoyo_truncate(char *str)
{
        char *end = str;

        /* Compare as unsigned so that bytes >= 0x80 count as text. */
        while ((unsigned char) *end > (unsigned char) ' ')
                end++;
        *end = '\0';
        /* No NUL lies between str and end, so this equals strlen(str) + 1. */
        return end - str + 1;
}

/**
 * tomoyo_numscan - sscanf() which stores the length of a decimal integer value.
 *
 * @str:   String to scan.
 * @head:  Leading string that must start with.
 * @width: Pointer to "int" for storing length of a decimal integer value after @head.
 * @tail:  Optional character that must match after a decimal integer value.
 *         Pass 0 to accept any character after the digits.
 *
 * Returns true if @str starts with @head and at least one decimal digit
 * (followed by @tail, if given) follows @head. @width is set to 0 whenever
 * the prefix or @tail does not match.
 */
static bool tomoyo_numscan(const char *str, const char *head, int *width, const char tail)
{
        const int prefix_len = strlen(head);
        const char *digits;
        const char *end;

        if (strncmp(str, head, prefix_len) != 0)
                goto no_match;
        digits = str + prefix_len;
        /* The terminating NUL is not a digit, so it ends the scan too. */
        for (end = digits; *end >= '0' && *end <= '9'; end++)
                ;
        if (tail && *end != tail)
                goto no_match;
        *width = end - digits;
        return *width != 0;
no_match:
        *width = 0;
        return false;
}

/**
 * tomoyo_patternize_path - Make patterns for file path. Used by learning mode.
 *
 * @buffer: Destination buffer.
 * @len:    Size of @buffer.
 * @entry:  Original line.
 *
 * Returns nothing.
 */
static void tomoyo_patternize_path(char *buffer, const int len, char *entry)
{
        int width;
        char *cp = entry;

        /* Nothing to do if this line is not for "file" related entry. */
        if (strncmp(entry, "file ", 5))
                goto flush;
        /*
         * Nothing to do if there is no colon in this line, for this rewriting
         * applies to only filesystems where numeric values in the path are volatile.
         */
        cp = strchr(entry + 5, ':');
        if (!cp) {
                cp = entry;
                goto flush;
        }
        /* Flush e.g. "file ioctl" part. */
        while (*cp != ' ')
                cp--;
        *cp++ = '\0';
        tomoyo_addprintf(buffer, len, "%s ", entry);
        /* e.g. file ioctl pipe:[$INO] $CMD */
        if (tomoyo_numscan(cp, "pipe:[", &width, ']')) {
                cp += width + 7;
                tomoyo_addprintf(buffer, len, "pipe:[\\$]");
                goto flush;
        }
        /* e.g. file ioctl socket:[$INO] $CMD */
        if (tomoyo_numscan(cp, "socket:[", &width, ']')) {
                cp += width + 9;
                tomoyo_addprintf(buffer, len, "socket:[\\$]");
                goto flush;
        }
        if (!strncmp(cp, "proc:/self", 10)) {
                /* e.g. file read proc:/self/task/$TID/fdinfo/$FD */
                cp += 10;
                tomoyo_addprintf(buffer, len, "proc:/self");
        } else if (tomoyo_numscan(cp, "proc:/", &width, 0)) {
                /* e.g. file read proc:/$PID/task/$TID/fdinfo/$FD */
                /*
                 * Don't patternize $PID part if $PID == 1, for several
                 * programs access only files in /proc/1/ directory.
                 */
                cp += width + 6;
                if (width == 1 && *(cp - 1) == '1')
                        tomoyo_addprintf(buffer, len, "proc:/1");
                else
                        tomoyo_addprintf(buffer, len, "proc:/\\$");
        } else {
                goto flush;
        }
        /* Patternize $TID part if "/task/" follows. */
        if (tomoyo_numscan(cp, "/task/", &width, 0)) {
                cp += width + 6;
                tomoyo_addprintf(buffer, len, "/task/\\$");
        }
        /* Patternize $FD part if "/fd/" or "/fdinfo/" follows. */
        if (tomoyo_numscan(cp, "/fd/", &width, 0)) {
                cp += width + 4;
                tomoyo_addprintf(buffer, len, "/fd/\\$");
        } else if (tomoyo_numscan(cp, "/fdinfo/", &width, 0)) {
                cp += width + 8;
                tomoyo_addprintf(buffer, len, "/fdinfo/\\$");
        }
flush:
        /* Flush remaining part if any. */
        if (*cp)
                tomoyo_addprintf(buffer, len, "%s", cp);
}

/**
 * tomoyo_add_entry - Add an ACL to current thread's domain. Used by learning mode.
 *
 * @domain: Pointer to "struct tomoyo_domain_info".
 * @header: Lines containing ACL. Must contain at least two '\n'; the text
 *          after the second one is taken as the ACL entry. Modified in place.
 *
 * Builds a policy line from the ACL entry (patternized by
 * tomoyo_patternize_path()) and, for entries starting with 'f', optional
 * "exec.realpath", "exec.argv[0]" and "symlink.target" conditions found in
 * the preceding lines, then appends it to @domain via tomoyo_write_domain2().
 *
 * Returns nothing.
 */
static void tomoyo_add_entry(struct tomoyo_domain_info *domain, char *header)
{
        char *buffer;
        char *realpath = NULL;
        char *argv0 = NULL;
        char *symlink = NULL;
        char *cp = strchr(header, '\n');
        int len;

        if (!cp)
                return;
        cp = strchr(cp + 1, '\n');
        if (!cp)
                return;
        /* Split: @header keeps the first two lines, cp is the ACL entry. */
        *cp++ = '\0';
        /* Reserve some space for potentially using patterns. */
        len = strlen(cp) + 16;
        /*
         * strstr() will return NULL if ordering is wrong, because each
         * tomoyo_truncate() below terminates @header right after the value
         * it found.
         */
        if (*cp == 'f') {
                argv0 = strstr(header, " argv[]={ \"");
                if (argv0) {
                        /* Skip " argv[]={ "; argv0 starts at the quote. */
                        argv0 += 10;
                        /* 14 == strlen(" exec.argv[0]=") */
                        len += tomoyo_truncate(argv0) + 14;
                }
                realpath = strstr(header, " exec={ realpath=\"");
                if (realpath) {
                        /* Skip " exec={ "; realpath starts at "realpath=". */
                        realpath += 8;
                        /* 6 == strlen(" exec.") */
                        len += tomoyo_truncate(realpath) + 6;
                }
                symlink = strstr(header, " symlink.target=\"");
                if (symlink)
                        /* Keep the leading space; it is printed as-is. */
                        len += tomoyo_truncate(symlink + 1) + 1;
        }
        buffer = kmalloc(len, GFP_NOFS | __GFP_ZERO);
        if (!buffer)
                return;
        tomoyo_patternize_path(buffer, len, cp);
        if (realpath)
                tomoyo_addprintf(buffer, len, " exec.%s", realpath);
        if (argv0)
                tomoyo_addprintf(buffer, len, " exec.argv[0]=%s", argv0);
        if (symlink)
                tomoyo_addprintf(buffer, len, "%s", symlink);
        tomoyo_normalize_line(buffer);
        /* Count a policy update only if the entry was accepted. */
        if (!tomoyo_write_domain2(domain->ns, &domain->acl_info_list, buffer,
                                  false))
                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
        kfree(buffer);
}

/**
 * tomoyo_supervisor - Ask for the supervisor's decision.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * Always passes the request to tomoyo_write_log2() first. For a request which
 * was not granted, learning mode appends the request to the domain's policy
 * via tomoyo_add_entry(), and enforcing mode queues the request on
 * tomoyo_query_list and waits up to 10 seconds for an answer if somebody has
 * the query interface open.
 *
 * Returns 0 if the supervisor decided to permit the access request which
 * violated the policy in enforcing mode, TOMOYO_RETRY_REQUEST if the
 * supervisor decided to retry the access request which violated the policy in
 * enforcing mode, 0 if it is not in enforcing mode, -EPERM otherwise.
 */
int tomoyo_supervisor(struct tomoyo_request_info *r, const char *fmt, ...)
{
        va_list args;
        int error;
        int len;
        /* Source of query IDs; advanced under tomoyo_query_list_lock. */
        static unsigned int tomoyo_serial;
        /* Lives on this stack frame while linked into tomoyo_query_list. */
        struct tomoyo_query entry = { };
        bool quota_exceeded = false;

        /* Measure the formatted message, including the terminating NUL. */
        va_start(args, fmt);
        len = vsnprintf(NULL, 0, fmt, args) + 1;
        va_end(args);
        /* Write /sys/kernel/security/tomoyo/audit. */
        va_start(args, fmt);
        tomoyo_write_log2(r, len, fmt, args);
        va_end(args);
        /* Nothing more to do if granted. */
        if (r->granted)
                return 0;
        /* The non-zero mode value is used directly as the statistics index. */
        if (r->mode)
                tomoyo_update_stat(r->mode);
        switch (r->mode) {
        case TOMOYO_CONFIG_ENFORCING:
                error = -EPERM;
                /* Ask only if somebody has the query interface open. */
                if (atomic_read(&tomoyo_query_observers))
                        break;
                goto out;
        case TOMOYO_CONFIG_LEARNING:
                error = 0;
                /* Check max_learning_entry parameter. */
                if (tomoyo_domain_quota_is_ok(r))
                        break;
                fallthrough;
        default:
                return 0;
        }
        /* Get message. */
        va_start(args, fmt);
        entry.query = tomoyo_init_log(r, len, fmt, args);
        va_end(args);
        if (!entry.query)
                goto out;
        entry.query_len = strlen(entry.query) + 1;
        /* error == 0 here means learning mode: just record the entry. */
        if (!error) {
                tomoyo_add_entry(r->domain, entry.query);
                goto out;
        }
        /* From here on, len is the size charged against the query quota. */
        len = kmalloc_size_roundup(entry.query_len);
        entry.domain = r->domain;
        spin_lock(&tomoyo_query_list_lock);
        if (tomoyo_memory_quota[TOMOYO_MEMORY_QUERY] &&
            tomoyo_memory_used[TOMOYO_MEMORY_QUERY] + len
            >= tomoyo_memory_quota[TOMOYO_MEMORY_QUERY]) {
                quota_exceeded = true;
        } else {
                entry.serial = tomoyo_serial++;
                entry.retry = r->retry;
                tomoyo_memory_used[TOMOYO_MEMORY_QUERY] += len;
                list_add_tail(&entry.list, &tomoyo_query_list);
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (quota_exceeded)
                goto out;
        /*
         * Give 10 seconds for supervisor's opinion. tomoyo_write_answer()
         * resets entry.timer, so the countdown restarts on each write to the
         * query interface.
         */
        while (entry.timer < 10) {
                wake_up_all(&tomoyo_query_wait);
                /*
                 * A non-zero result means the wait ended before the one
                 * second timeout: answered, no observer left, or interrupted.
                 */
                if (wait_event_interruptible_timeout
                    (tomoyo_answer_wait, entry.answer ||
                     !atomic_read(&tomoyo_query_observers), HZ))
                        break;
                entry.timer++;
        }
        spin_lock(&tomoyo_query_list_lock);
        /*
         * tomoyo_write_answer() unlinks an answered entry with
         * list_del_init(), so list_del() here is fine in both cases.
         */
        list_del(&entry.list);
        tomoyo_memory_used[TOMOYO_MEMORY_QUERY] -= len;
        spin_unlock(&tomoyo_query_list_lock);
        switch (entry.answer) {
        case 3: /* Asked to retry by administrator. */
                error = TOMOYO_RETRY_REQUEST;
                r->retry++;
                break;
        case 1:
                /* Granted by administrator. */
                error = 0;
                break;
        default:
                /* Timed out or rejected by administrator. */
                break;
        }
out:
        /* entry.query is still NULL if we got here before tomoyo_init_log(). */
        kfree(entry.query);
        return error;
}

/**
 * tomoyo_find_domain_by_qid - Get domain by query id.
 *
 * @serial: Query ID assigned by tomoyo_supervisor().
 *
 * Searches tomoyo_query_list under tomoyo_query_list_lock for the first query
 * whose ID is @serial.
 *
 * Returns pointer to "struct tomoyo_domain_info" if found, NULL otherwise.
 */
static struct tomoyo_domain_info *
tomoyo_find_domain_by_qid(unsigned int serial)
{
        struct tomoyo_domain_info *found = NULL;
        struct tomoyo_query *query;

        spin_lock(&tomoyo_query_list_lock);
        list_for_each_entry(query, &tomoyo_query_list, list) {
                if (query->serial == serial) {
                        found = query->domain;
                        break;
                }
        }
        spin_unlock(&tomoyo_query_list_lock);
        return found;
}

/**
 * tomoyo_poll_query - poll() for /sys/kernel/security/tomoyo/query.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table".
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read, 0 otherwise.
 *
 * Waits for access requests which violated policy in enforcing mode.
 */
static __poll_t tomoyo_poll_query(struct file *file, poll_table *wait)
{
        if (list_empty(&tomoyo_query_list)) {
                poll_wait(file, &tomoyo_query_wait, wait);
                /* Check again, now that we are on the wait queue. */
                if (list_empty(&tomoyo_query_list))
                        return 0;
        }
        return EPOLLIN | EPOLLRDNORM;
}

/**
 * tomoyo_read_query - Read access requests which violated policy in enforcing mode.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 */
static void tomoyo_read_query(struct tomoyo_io_buffer *head)
{
        struct list_head *tmp;
        unsigned int pos = 0;
        size_t len = 0;
        char *buf;

        if (head->r.w_pos)
                return;
        kfree(head->read_buf);
        head->read_buf = NULL;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (pos++ != head->r.query_index)
                        continue;
                len = ptr->query_len;
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (!len) {
                head->r.query_index = 0;
                return;
        }
        buf = kzalloc(len + 32, GFP_NOFS);
        if (!buf)
                return;
        pos = 0;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (pos++ != head->r.query_index)
                        continue;
                /*
                 * Some query can be skipped because tomoyo_query_list
                 * can change, but I don't care.
                 */
                if (len == ptr->query_len)
                        snprintf(buf, len + 31, "Q%u-%hu\n%s", ptr->serial,
                                 ptr->retry, ptr->query);
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (buf[0]) {
                head->read_buf = buf;
                head->r.w[head->r.w_pos++] = buf;
                head->r.query_index++;
        } else {
                kfree(buf);
        }
}

/**
 * tomoyo_write_answer - Write the supervisor's decision.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns 0 on success, -EINVAL otherwise.
 */
static int tomoyo_write_answer(struct tomoyo_io_buffer *head)
{
        char *data = head->write_buf;
        struct list_head *tmp;
        unsigned int serial;
        unsigned int answer;

        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                ptr->timer = 0;
        }
        spin_unlock(&tomoyo_query_list_lock);
        if (sscanf(data, "A%u=%u", &serial, &answer) != 2)
                return -EINVAL;
        spin_lock(&tomoyo_query_list_lock);
        list_for_each(tmp, &tomoyo_query_list) {
                struct tomoyo_query *ptr = list_entry(tmp, typeof(*ptr), list);

                if (ptr->serial != serial)
                        continue;
                ptr->answer = answer;
                /* Remove from tomoyo_query_list. */
                if (ptr->answer)
                        list_del_init(&ptr->list);
                break;
        }
        spin_unlock(&tomoyo_query_list_lock);
        return 0;
}

/**
 * tomoyo_read_version - Get version.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Prints the version string once and then marks @head as fully read.
 *
 * Returns nothing.
 */
static void tomoyo_read_version(struct tomoyo_io_buffer *head)
{
        if (head->r.eof)
                return;
        tomoyo_io_printf(head, "2.6.0");
        head->r.eof = true;
}

/*
 * String table for /sys/kernel/security/tomoyo/stat interface.
 * Labels of the "Policy ..." lines, indexed by policy statistics type.
 */
static const char * const tomoyo_policy_headers[TOMOYO_MAX_POLICY_STAT] = {
        [TOMOYO_STAT_POLICY_UPDATES]    = "update:",
        [TOMOYO_STAT_POLICY_LEARNING]   = "violation in learning mode:",
        [TOMOYO_STAT_POLICY_PERMISSIVE] = "violation in permissive mode:",
        [TOMOYO_STAT_POLICY_ENFORCING]  = "violation in enforcing mode:",
};

/*
 * String table for /sys/kernel/security/tomoyo/stat interface.
 * Labels of the "Memory used by ..." lines, indexed by memory statistics
 * type. tomoyo_write_stat() also matches them when a quota is written.
 */
static const char * const tomoyo_memory_headers[TOMOYO_MAX_MEMORY_STAT] = {
        [TOMOYO_MEMORY_POLICY] = "policy:",
        [TOMOYO_MEMORY_AUDIT]  = "audit log:",
        [TOMOYO_MEMORY_QUERY]  = "query message:",
};

/* Counter for number of updates. Incremented by tomoyo_update_stat(). */
static atomic_t tomoyo_stat_updated[TOMOYO_MAX_POLICY_STAT];
/*
 * Timestamp counter for last updated. Holds ktime_get_real_seconds() of the
 * latest tomoyo_update_stat() call; 0 means never updated.
 */
static time64_t tomoyo_stat_modified[TOMOYO_MAX_POLICY_STAT];

/**
 * tomoyo_update_stat - Update statistic counters.
 *
 * @index: Index for policy type.
 *
 * Returns nothing.
 */
void tomoyo_update_stat(const u8 index)
{
        atomic_inc(&tomoyo_stat_updated[index]);
        tomoyo_stat_modified[index] = ktime_get_real_seconds();
}

/**
 * tomoyo_read_stat - Read statistic data.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
static void tomoyo_read_stat(struct tomoyo_io_buffer *head)
{
        u8 i;
        unsigned int total = 0;

        if (head->r.eof)
                return;
        for (i = 0; i < TOMOYO_MAX_POLICY_STAT; i++) {
                tomoyo_io_printf(head, "Policy %-30s %10u",
                                 tomoyo_policy_headers[i],
                                 atomic_read(&tomoyo_stat_updated[i]));
                if (tomoyo_stat_modified[i]) {
                        struct tomoyo_time stamp;

                        tomoyo_convert_time(tomoyo_stat_modified[i], &stamp);
                        tomoyo_io_printf(head, " (Last: %04u/%02u/%02u %02u:%02u:%02u)",
                                         stamp.year, stamp.month, stamp.day,
                                         stamp.hour, stamp.min, stamp.sec);
                }
                tomoyo_set_lf(head);
        }
        for (i = 0; i < TOMOYO_MAX_MEMORY_STAT; i++) {
                unsigned int used = tomoyo_memory_used[i];

                total += used;
                tomoyo_io_printf(head, "Memory used by %-22s %10u",
                                 tomoyo_memory_headers[i], used);
                used = tomoyo_memory_quota[i];
                if (used)
                        tomoyo_io_printf(head, " (Quota: %10u)", used);
                tomoyo_set_lf(head);
        }
        tomoyo_io_printf(head, "Total memory used:                    %10u\n",
                         total);
        head->r.eof = true;
}

/**
 * tomoyo_write_stat - Set memory quota.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Accepts "Memory used by <label> <number>" where <label> is one of
 * tomoyo_memory_headers[], and stores <number> as the quota for that type.
 * Lines which do not match are silently ignored.
 *
 * Returns 0.
 */
static int tomoyo_write_stat(struct tomoyo_io_buffer *head)
{
        char *cursor = head->write_buf;
        u8 kind;

        if (!tomoyo_str_starts(&cursor, "Memory used by "))
                return 0;
        for (kind = 0; kind < TOMOYO_MAX_MEMORY_STAT; kind++) {
                if (!tomoyo_str_starts(&cursor, tomoyo_memory_headers[kind]))
                        continue;
                sscanf(cursor, "%u", &tomoyo_memory_quota[kind]);
        }
        return 0;
}

/**
 * tomoyo_open_control - open() for /sys/kernel/security/tomoyo/ interface.
 *
 * @type: Type of interface.
 * @file: Pointer to "struct file".
 *
 * Allocates a "struct tomoyo_io_buffer", installs the read/write/poll
 * handlers for @type, allocates the buffers which the open mode of @file
 * requires, and stores the result in @file->private_data.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_open_control(const u8 type, struct file *file)
{
        struct tomoyo_io_buffer *head = kzalloc(sizeof(*head), GFP_NOFS);

        if (!head)
                return -ENOMEM;
        mutex_init(&head->io_sem);
        head->type = type;

        /* Pick the handlers for this interface. */
        switch (type) {
        case TOMOYO_DOMAINPOLICY: /* /sys/kernel/security/tomoyo/domain_policy */
                head->write = tomoyo_write_domain;
                head->read = tomoyo_read_domain;
                break;
        case TOMOYO_EXCEPTIONPOLICY: /* /sys/kernel/security/tomoyo/exception_policy */
                head->write = tomoyo_write_exception;
                head->read = tomoyo_read_exception;
                break;
        case TOMOYO_AUDIT: /* /sys/kernel/security/tomoyo/audit */
                head->poll = tomoyo_poll_log;
                head->read = tomoyo_read_log;
                break;
        case TOMOYO_PROCESS_STATUS: /* /sys/kernel/security/tomoyo/.process_status */
                head->write = tomoyo_write_pid;
                head->read = tomoyo_read_pid;
                break;
        case TOMOYO_VERSION: /* /sys/kernel/security/tomoyo/version */
                head->read = tomoyo_read_version;
                head->readbuf_size = 128;
                break;
        case TOMOYO_STAT: /* /sys/kernel/security/tomoyo/stat */
                head->write = tomoyo_write_stat;
                head->read = tomoyo_read_stat;
                head->readbuf_size = 1024;
                break;
        case TOMOYO_PROFILE: /* /sys/kernel/security/tomoyo/profile */
                head->write = tomoyo_write_profile;
                head->read = tomoyo_read_profile;
                break;
        case TOMOYO_QUERY: /* /sys/kernel/security/tomoyo/query */
                head->poll = tomoyo_poll_query;
                head->write = tomoyo_write_answer;
                head->read = tomoyo_read_query;
                break;
        case TOMOYO_MANAGER: /* /sys/kernel/security/tomoyo/manager */
                head->write = tomoyo_write_manager;
                head->read = tomoyo_read_manager;
                break;
        }

        if (file->f_mode & FMODE_READ) {
                /* Don't allocate read_buf for poll() access. */
                if (!head->poll) {
                        if (!head->readbuf_size)
                                head->readbuf_size = 4096 * 2;
                        head->read_buf = kzalloc(head->readbuf_size,
                                                 GFP_NOFS);
                        if (!head->read_buf)
                                goto out_nomem;
                }
        } else {
                /*
                 * No need to allocate read_buf since it is not opened
                 * for reading.
                 */
                head->read = NULL;
                head->poll = NULL;
        }

        if (file->f_mode & FMODE_WRITE) {
                if (head->write) {
                        head->writebuf_size = 4096 * 2;
                        head->write_buf = kzalloc(head->writebuf_size,
                                                  GFP_NOFS);
                        if (!head->write_buf)
                                goto out_nomem;
                }
        } else {
                /*
                 * No need to allocate write_buf since it is not opened
                 * for writing.
                 */
                head->write = NULL;
        }

        /*
         * If the file is /sys/kernel/security/tomoyo/query , increment the
         * observer counter.
         * The observer counter is used by tomoyo_supervisor() to see if
         * there is some process monitoring /sys/kernel/security/tomoyo/query.
         */
        if (type == TOMOYO_QUERY)
                atomic_inc(&tomoyo_query_observers);
        file->private_data = head;
        tomoyo_notify_gc(head, true);
        return 0;

out_nomem:
        /* read_buf is NULL if its own allocation was the one that failed. */
        kfree(head->read_buf);
        kfree(head);
        return -ENOMEM;
}

/**
 * tomoyo_poll_control - poll() for /sys/kernel/security/tomoyo/ interface.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * The interface is always reported as writable. Readability comes from the
 * interface specific poll handler if there is one; interfaces without a
 * handler are always reported as readable.
 *
 * Returns EPOLLIN | EPOLLRDNORM | EPOLLOUT | EPOLLWRNORM if ready to read/write,
 * EPOLLOUT | EPOLLWRNORM otherwise.
 */
__poll_t tomoyo_poll_control(struct file *file, poll_table *wait)
{
        struct tomoyo_io_buffer *head = file->private_data;
        __poll_t mask = EPOLLOUT | EPOLLWRNORM;

        if (head->poll)
                mask |= head->poll(file, wait);
        else
                mask |= EPOLLIN | EPOLLRDNORM;
        return mask;
}

/**
 * tomoyo_set_namespace_cursor - Set namespace to read.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Only the exception policy and profile interfaces are read per namespace;
 * for every other interface this is a no-op.
 *
 * Returns nothing.
 */
static inline void tomoyo_set_namespace_cursor(struct tomoyo_io_buffer *head)
{
        struct list_head *cursor;
        struct list_head *next;

        if (!(head->type == TOMOYO_EXCEPTIONPOLICY ||
              head->type == TOMOYO_PROFILE))
                return;
        /*
         * If this is the first read, or reading previous namespace finished
         * and has more namespaces to read, update the namespace cursor.
         */
        cursor = head->r.ns;
        if (!cursor)
                next = tomoyo_namespace_list.next;
        else if (head->r.eof && cursor->next != &tomoyo_namespace_list)
                next = cursor->next;
        else
                return;
        /* Clearing is OK because tomoyo_flush() returned true. */
        memset(&head->r, 0, sizeof(head->r));
        head->r.ns = next;
}

/**
 * tomoyo_has_more_namespace - Check for unread namespaces.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns true if we have more entries to print, false otherwise.
 */
static inline bool tomoyo_has_more_namespace(struct tomoyo_io_buffer *head)
{
        return (head->type == TOMOYO_EXCEPTIONPOLICY ||
                head->type == TOMOYO_PROFILE) && head->r.eof &&
                head->r.ns->next != &tomoyo_namespace_list;
}

/**
 * tomoyo_read_control - read() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to write to.
 * @buffer_len: Size of @buffer.
 *
 * Flushes pending output first, then keeps calling the interface's read
 * handler while the output can be flushed and unread namespaces remain.
 *
 * Returns bytes read on success, negative value otherwise.
 */
ssize_t tomoyo_read_control(struct tomoyo_io_buffer *head, char __user *buffer,
                            const int buffer_len)
{
        bool more;
        int copied;
        int idx;

        if (!head->read)
                return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;
        head->read_user_buf = buffer;
        head->read_user_buf_avail = buffer_len;
        idx = tomoyo_read_lock();
        more = tomoyo_flush(head);
        while (more) {
                /* Call the policy handler. */
                tomoyo_set_namespace_cursor(head);
                head->read(head);
                more = tomoyo_flush(head) && tomoyo_has_more_namespace(head);
        }
        tomoyo_read_unlock(idx);
        /* read_user_buf has advanced past everything that was copied. */
        copied = head->read_user_buf - buffer;
        mutex_unlock(&head->io_sem);
        return copied;
}

/**
 * tomoyo_parse_policy - Parse a policy line.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 * @line: Line to parse.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_parse_policy(struct tomoyo_io_buffer *head, char *line)
{
        /* Delete request? */
        head->w.is_delete = !strncmp(line, "delete ", 7);
        if (head->w.is_delete)
                memmove(line, line + 7, strlen(line + 7) + 1);
        /* Selecting namespace to update. */
        if (head->type == TOMOYO_EXCEPTIONPOLICY ||
            head->type == TOMOYO_PROFILE) {
                if (*line == '<') {
                        char *cp = strchr(line, ' ');

                        if (cp) {
                                *cp++ = '\0';
                                head->w.ns = tomoyo_assign_namespace(line);
                                memmove(line, cp, strlen(cp) + 1);
                        } else
                                head->w.ns = NULL;
                } else
                        head->w.ns = &tomoyo_kernel_namespace;
                /* Don't allow updating if namespace is invalid. */
                if (!head->w.ns)
                        return -ENOENT;
        }
        /* Do the update. */
        return head->write(head);
}

/**
 * tomoyo_write_control - write() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head:       Pointer to "struct tomoyo_io_buffer".
 * @buffer:     Pointer to buffer to read from.
 * @buffer_len: Size of @buffer.
 *
 * Copies @buffer one byte at a time into @head->write_buf. Each time a
 * '\n' is seen, the accumulated line is NUL-terminated, normalized and
 * dispatched. A trailing partial line stays in @head->write_buf (its
 * length is kept in @head->w.avail) until a later call completes it.
 *
 * Returns @buffer_len on success, negative value otherwise: -EINVAL if
 * @head has no write handler, -EINTR if waiting for @head->io_sem was
 * interrupted, -ENOMEM if the line buffer could not be grown, -EFAULT if
 * reading from @buffer failed, -EPERM if tomoyo_manager() rejected the
 * caller or tomoyo_parse_policy() returned -EPERM.
 */
ssize_t tomoyo_write_control(struct tomoyo_io_buffer *head,
                             const char __user *buffer, const int buffer_len)
{
        int error = buffer_len;
        size_t avail_len = buffer_len;
        char *cp0;
        int idx;

        if (!head->write)
                return -EINVAL;
        if (mutex_lock_interruptible(&head->io_sem))
                return -EINTR;
        cp0 = head->write_buf;
        head->read_user_buf_avail = 0;
        idx = tomoyo_read_lock();
        /* Read a line and dispatch it to the policy handler. */
        while (avail_len > 0) {
                char c;

                /*
                 * Only the last byte of the line buffer is left: move the
                 * pending bytes into a zeroed buffer of twice the size.
                 */
                if (head->w.avail >= head->writebuf_size - 1) {
                        const int len = head->writebuf_size * 2;
                        char *cp = kzalloc(len, GFP_NOFS | __GFP_NOWARN);

                        if (!cp) {
                                error = -ENOMEM;
                                break;
                        }
                        memmove(cp, cp0, head->w.avail);
                        kfree(cp0);
                        head->write_buf = cp;
                        cp0 = cp;
                        head->writebuf_size = len;
                }
                if (get_user(c, buffer)) {
                        error = -EFAULT;
                        break;
                }
                buffer++;
                avail_len--;
                cp0[head->w.avail++] = c;
                /* Keep collecting until the line is complete. */
                if (c != '\n')
                        continue;
                /* Replace the newline with a terminator and start over. */
                cp0[head->w.avail - 1] = '\0';
                head->w.avail = 0;
                tomoyo_normalize_line(cp0);
                /* "reset" clears the write selection and the read state. */
                if (!strcmp(cp0, "reset")) {
                        head->w.ns = &tomoyo_kernel_namespace;
                        head->w.domain = NULL;
                        memset(&head->r, 0, sizeof(head->r));
                        continue;
                }
                /* Don't allow updating policies by non manager programs. */
                switch (head->type) {
                case TOMOYO_PROCESS_STATUS:
                        /* This does not write anything. */
                        break;
                case TOMOYO_DOMAINPOLICY:
                        /* Line consumed by tomoyo_select_domain(): next. */
                        if (tomoyo_select_domain(head, cp0))
                                continue;
                        fallthrough;
                case TOMOYO_EXCEPTIONPOLICY:
                        /* Handled here, before the manager check below. */
                        if (!strcmp(cp0, "select transition_only")) {
                                head->r.print_transition_related_only = true;
                                continue;
                        }
                        fallthrough;
                default:
                        if (!tomoyo_manager()) {
                                error = -EPERM;
                                goto out;
                        }
                }
                /*
                 * Only -EPERM aborts the write; any other non-zero result
                 * is dropped and the next line is processed.
                 */
                switch (tomoyo_parse_policy(head, cp0)) {
                case -EPERM:
                        error = -EPERM;
                        goto out;
                case 0:
                        /* Count successful updates for these interfaces. */
                        switch (head->type) {
                        case TOMOYO_DOMAINPOLICY:
                        case TOMOYO_EXCEPTIONPOLICY:
                        case TOMOYO_STAT:
                        case TOMOYO_PROFILE:
                        case TOMOYO_MANAGER:
                                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
                                break;
                        default:
                                break;
                        }
                        break;
                }
        }
out:
        tomoyo_read_unlock(idx);
        mutex_unlock(&head->io_sem);
        return error;
}

/**
 * tomoyo_close_control - close() for /sys/kernel/security/tomoyo/ interface.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Always ends by calling tomoyo_notify_gc() for @head.
 */
void tomoyo_close_control(struct tomoyo_io_buffer *head)
{
        /*
         * Closing /sys/kernel/security/tomoyo/query drops one observer;
         * when the counter reaches zero, everyone waiting on
         * tomoyo_answer_wait is woken up.
         */
        if (head->type == TOMOYO_QUERY) {
                if (atomic_dec_and_test(&tomoyo_query_observers))
                        wake_up_all(&tomoyo_answer_wait);
        }
        tomoyo_notify_gc(head, false);
}

/**
 * tomoyo_check_profile - Check all profiles currently assigned to domains are defined.
 *
 * Marks the policy as loaded, upgrades namespaces still at profile
 * version 20110903 to 20150505, and panics if any domain uses an
 * unsupported profile version or an undefined profile.
 */
void tomoyo_check_profile(void)
{
        struct tomoyo_domain_info *domain;
        const int idx = tomoyo_read_lock();

        tomoyo_policy_loaded = true;
        pr_info("TOMOYO: 2.6.0\n");
        list_for_each_entry_rcu(domain, &tomoyo_domain_list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                struct tomoyo_policy_namespace *ns = domain->ns;
                const u8 profile = domain->profile;

                if (ns->profile_version == 20110903) {
                        pr_info_once("Converting profile version from %u to %u.\n",
                                     20110903, 20150505);
                        ns->profile_version = 20150505;
                }

                /* Supported version and a defined profile: domain is fine. */
                if (ns->profile_version == 20150505 &&
                    ns->profile_ptr[profile])
                        continue;

                /* Say which of the two checks failed, then give up. */
                if (ns->profile_version != 20150505)
                        pr_err("Profile version %u is not supported.\n",
                               ns->profile_version);
                else
                        pr_err("Profile %u (used by '%s') is not defined.\n",
                               profile, domain->domainname->name);
                pr_err("Userland tools for TOMOYO 2.6 must be installed and policy must be initialized.\n");
                pr_err("Please see https://tomoyo.sourceforge.net/2.6/ for more information.\n");
                panic("STOP!");
        }
        tomoyo_read_unlock(idx);
        pr_info("Mandatory Access Control activated.\n");
}

/**
 * tomoyo_load_builtin_policy - Load built-in policy.
 *
 * Feeds five built-in policy texts (profile, exception policy, domain
 * policy, manager, stat) line by line to tomoyo_parse_policy(), each with
 * its matching write handler. Only '\n'-terminated lines are processed
 * and the result of tomoyo_parse_policy() is not checked.
 *
 * Returns nothing.
 */
void __init tomoyo_load_builtin_policy(void)
{
#ifdef CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING
        static char tomoyo_builtin_profile[] __initdata =
                "PROFILE_VERSION=20150505\n"
                "0-CONFIG={ mode=learning grant_log=no reject_log=yes }\n";
        static char tomoyo_builtin_exception_policy[] __initdata =
                "aggregator proc:/self/exe /proc/self/exe\n";
        static char tomoyo_builtin_domain_policy[] __initdata = "";
        static char tomoyo_builtin_manager[] __initdata = "";
        static char tomoyo_builtin_stat[] __initdata = "";
#else
        /*
         * The manually created include file below provides the same five
         * "static char [] __initdata" arrays: "tomoyo_builtin_profile",
         * "tomoyo_builtin_exception_policy", "tomoyo_builtin_domain_policy",
         * "tomoyo_builtin_manager" and "tomoyo_builtin_stat".
         */
#include "builtin-policy.h"
#endif
        u8 i;
        const int idx = tomoyo_read_lock();

        for (i = 0; i < 5; i++) {
                struct tomoyo_io_buffer head = { };
                char *start = "";
                char *end;

                /* Pick the policy text and the handler that parses it. */
                switch (i) {
                case 0:
                        head.type = TOMOYO_PROFILE;
                        head.write = tomoyo_write_profile;
                        start = tomoyo_builtin_profile;
                        break;
                case 1:
                        head.type = TOMOYO_EXCEPTIONPOLICY;
                        head.write = tomoyo_write_exception;
                        start = tomoyo_builtin_exception_policy;
                        break;
                case 2:
                        head.type = TOMOYO_DOMAINPOLICY;
                        head.write = tomoyo_write_domain;
                        start = tomoyo_builtin_domain_policy;
                        break;
                case 3:
                        head.type = TOMOYO_MANAGER;
                        head.write = tomoyo_write_manager;
                        start = tomoyo_builtin_manager;
                        break;
                case 4:
                        head.type = TOMOYO_STAT;
                        head.write = tomoyo_write_stat;
                        start = tomoyo_builtin_stat;
                        break;
                }

                /* Cut the text at each newline and parse the line in place. */
                for (end = strchr(start, '\n'); end;
                     end = strchr(start, '\n')) {
                        *end = '\0';
                        tomoyo_normalize_line(start);
                        head.write_buf = start;
                        tomoyo_parse_policy(&head, start);
                        start = end + 1;
                }
        }
        tomoyo_read_unlock(idx);
#ifdef CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER
        tomoyo_check_profile();
#endif
}














  267 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_
#define _ASM_GENERIC_BITOPS_BUILTIN_FLS_H_

/**
 * fls - find last (most-significant) bit set
 * @x: the word to search
 *
 * Bit positions are counted from 1, the same way ffs does, and an all-zero
 * word yields 0: fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32.
 */
static __always_inline int fls(unsigned int x)
{
        const int word_bits = sizeof(x) * 8;

        /* __builtin_clz() is undefined for 0, so handle it up front. */
        if (x == 0)
                return 0;
        return word_bits - __builtin_clz(x);
}

#endif





















































































































































































































































































































































    3 


    3 




























































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* raw.c - Raw sockets for protocol family CAN
 *
 * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/uio.h>
#include <linux/net.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */
#include <linux/can/skb.h>
#include <linux/can/raw.h>
#include <net/sock.h>
#include <net/net_namespace.h>

MODULE_DESCRIPTION("PF_CAN raw protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>");
MODULE_ALIAS("can-proto-1");

/* smallest address length accepted by raw_bind(): everything up to and
 * including the can_ifindex member of struct sockaddr_can
 */
#define RAW_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)

/* mask value used by raw_init() for the default filter (can_id 0) */
#define MASK_ALL 0

/* A raw socket has a list of can_filters attached to it, each receiving
 * the CAN frames matching that filter.  If the filter list is empty,
 * no CAN frames will be received by the socket.  The default after
 * opening the socket, is to have one filter which receives all frames.
 * The filter list is allocated dynamically with the exception of the
 * list containing only one item.  This common case is optimized by
 * storing the single filter in dfilter, to avoid using dynamic memory.
 */

/* Per-CPU record of the skb most recently seen by raw_rcv() for a socket.
 * It is used to recognise repeated deliveries of the same skb when more
 * than one filter of the socket matches.
 */
struct uniqframe {
        int skbcnt;                 /* can_skb_prv(skb)->skbcnt of that skb */
        const struct sk_buff *skb;  /* compared by address only in raw_rcv() */
        unsigned int join_rx_count; /* deliveries of that skb seen so far */
};

/* Per-socket state of a CAN raw socket. */
struct raw_sock {
        struct sock sk;            /* must stay first: raw_sk() casts to it */
        int bound;                 /* non-zero: filters are registered and
                                    * have to be removed on release
                                    */
        int ifindex;               /* compared with can_ifindex in raw_bind() */
        struct net_device *dev;    /* device the filters are registered on,
                                    * NULL when registered without a device
                                    */
        netdevice_tracker dev_tracker; /* passed to netdev_put() for dev */
        struct list_head notifier; /* entry in raw_notifier_list */
        int loopback;              /* raw_init() default: 1 */
        int recv_own_msgs;         /* 0: raw_rcv() drops frames sent by this
                                    * socket itself
                                    */
        int fd_frames;             /* 0: raw_rcv() drops CAN FD skbs */
        int xl_frames;             /* 0: raw_rcv() drops CAN XL skbs */
        struct can_raw_vcid_options raw_vcid_opts; /* flags select the CAN XL
                                                    * VCID rx filter
                                                    */
        canid_t tx_vcid_shifted;   /* NOTE(review): not used in the code
                                    * visible here - presumably the tx VCID
                                    */
        canid_t rx_vcid_shifted;   /* value compared against cxl->prio */
        canid_t rx_vcid_mask_shifted; /* mask applied in that comparison */
        int join_filters;          /* non-zero: deliver a frame only after
                                    * all 'count' filters matched it
                                    */
        int count;                 /* number of active filters */
        struct can_filter dfilter; /* default/single filter */
        struct can_filter *filter; /* pointer to filter(s) */
        can_err_mask_t err_mask;   /* non-zero: error frame filter mask */
        struct uniqframe __percpu *uniq; /* last skb seen, per CPU */
};

/* all raw sockets, linked through raw_sock::notifier by raw_init() */
static LIST_HEAD(raw_notifier_list);
/* taken around every access to raw_notifier_list and raw_busy_notifier */
static DEFINE_SPINLOCK(raw_notifier_lock);
/* socket raw_notifier() is currently working on, NULL when it is idle;
 * raw_release() waits while this points at the socket being released
 */
static struct raw_sock *raw_busy_notifier;

/* Return pointer to store the extra msg flags for raw_recvmsg().
 * We use the space of one unsigned int beyond the 'struct sockaddr_can'
 * in skb->cb.
 */
static inline unsigned int *raw_flags(struct sk_buff *skb)
{
        sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
                               sizeof(unsigned int));

        /* return pointer after struct sockaddr_can */
        return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
}

/* Convert a struct sock pointer into the raw_sock it is embedded in.
 * This is a plain cast, so 'sk' has to be the first member of raw_sock.
 */
static inline struct raw_sock *raw_sk(const struct sock *sk)
{
        struct raw_sock *ro = (struct raw_sock *)sk;

        return ro;
}

/* Receive callback registered with can_rx_register() for every filter of
 * a raw socket; 'data' is the struct sock the filter belongs to.
 *
 * The frame is dropped when the socket options do not allow it (own
 * messages, CAN FD, CAN XL, VCID filter) or when the same skb was already
 * handled for another matching filter. Otherwise a clone is tagged with
 * the receive interface and msg flags and queued on the socket.
 */
static void raw_rcv(struct sk_buff *oskb, void *data)
{
        struct sock *sk = (struct sock *)data;
        struct raw_sock *ro = raw_sk(sk);
        struct sockaddr_can *addr;
        struct sk_buff *skb;
        unsigned int *pflags;

        /* check the received tx sock reference */
        if (!ro->recv_own_msgs && oskb->sk == sk)
                return;

        /* make sure to not pass oversized frames to the socket */
        if (!ro->fd_frames && can_is_canfd_skb(oskb))
                return;

        if (can_is_canxl_skb(oskb)) {
                struct canxl_frame *cxl = (struct canxl_frame *)oskb->data;

                /* make sure to not pass oversized frames to the socket */
                if (!ro->xl_frames)
                        return;

                /* filter CAN XL VCID content */
                if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) {
                        /* apply VCID filter if user enabled the filter */
                        if ((cxl->prio & ro->rx_vcid_mask_shifted) !=
                            (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted))
                                return;
                } else {
                        /* no filter => do not forward VCID tagged frames */
                        if (cxl->prio & CANXL_VCID_MASK)
                                return;
                }
        }

        /* eliminate multiple filter matches for the same skb */
        if (this_cpu_ptr(ro->uniq)->skb == oskb &&
            this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
                /* seen before: without join_filters this is a duplicate */
                if (!ro->join_filters)
                        return;

                this_cpu_inc(ro->uniq->join_rx_count);
                /* drop frame until all enabled filters matched */
                if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count)
                        return;
        } else {
                /* first sighting of this skb on this CPU: remember it */
                this_cpu_ptr(ro->uniq)->skb = oskb;
                this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
                this_cpu_ptr(ro->uniq)->join_rx_count = 1;
                /* drop first frame to check all enabled filters? */
                if (ro->join_filters && ro->count > 1)
                        return;
        }

        /* clone the given skb to be able to enqueue it into the rcv queue */
        skb = skb_clone(oskb, GFP_ATOMIC);
        if (!skb)
                return;

        /* Put the datagram to the queue so that raw_recvmsg() can get
         * it from there. We need to pass the interface index to
         * raw_recvmsg(). We pass a whole struct sockaddr_can in
         * skb->cb containing the interface index.
         */

        sock_skb_cb_check_size(sizeof(struct sockaddr_can));
        addr = (struct sockaddr_can *)skb->cb;
        memset(addr, 0, sizeof(*addr));
        addr->can_family = AF_CAN;
        addr->can_ifindex = skb->dev->ifindex;

        /* add CAN specific message flags for raw_recvmsg() */
        pflags = raw_flags(skb);
        *pflags = 0;
        /* frame has a sending socket attached */
        if (oskb->sk)
                *pflags |= MSG_DONTROUTE;
        /* that sending socket is this very socket */
        if (oskb->sk == sk)
                *pflags |= MSG_CONFIRM;

        /* the clone is ours to free when queueing fails */
        if (sock_queue_rcv_skb(sk, skb) < 0)
                kfree_skb(skb);
}

/* Register raw_rcv() for each of the 'count' entries in 'filter'.
 * This is all-or-nothing: if one registration fails, the entries that
 * were registered before it are unregistered again.
 *
 * Returns 0 on success or the error reported by can_rx_register().
 */
static int raw_enable_filters(struct net *net, struct net_device *dev,
                              struct sock *sk, struct can_filter *filter,
                              int count)
{
        int i;

        for (i = 0; i < count; i++) {
                int err = can_rx_register(net, dev, filter[i].can_id,
                                          filter[i].can_mask,
                                          raw_rcv, sk, "raw", sk);

                if (!err)
                        continue;

                /* clean up successfully registered filters */
                while (--i >= 0)
                        can_rx_unregister(net, dev, filter[i].can_id,
                                          filter[i].can_mask,
                                          raw_rcv, sk);
                return err;
        }

        return 0;
}

/* Register raw_rcv() for error frames selected by 'err_mask'.
 * An empty mask registers nothing and counts as success.
 *
 * Returns 0 on success or the error reported by can_rx_register().
 */
static int raw_enable_errfilter(struct net *net, struct net_device *dev,
                                struct sock *sk, can_err_mask_t err_mask)
{
        if (!err_mask)
                return 0;

        return can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG,
                               raw_rcv, sk, "raw", sk);
}

static void raw_disable_filters(struct net *net, struct net_device *dev,
                                struct sock *sk, struct can_filter *filter,
                                int count)
{
        int i;

        for (i = 0; i < count; i++)
                can_rx_unregister(net, dev, filter[i].can_id,
                                  filter[i].can_mask, raw_rcv, sk);
}

/* Counterpart of raw_enable_errfilter(): remove the error frame
 * registration. With an empty mask nothing was registered, so there is
 * nothing to undo.
 */
static inline void raw_disable_errfilter(struct net *net,
                                         struct net_device *dev,
                                         struct sock *sk,
                                         can_err_mask_t err_mask)

{
        if (!err_mask)
                return;

        can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk);
}

/* Remove everything the socket has registered on 'dev': first its
 * ro->count CAN filters, then its error frame filter.
 */
static inline void raw_disable_allfilters(struct net *net,
                                          struct net_device *dev,
                                          struct sock *sk)
{
        struct raw_sock *rs = raw_sk(sk);
        struct can_filter *filters = rs->filter;
        int nfilters = rs->count;

        raw_disable_filters(net, dev, sk, filters, nfilters);
        raw_disable_errfilter(net, dev, sk, rs->err_mask);
}

/* Register the socket's CAN filters and its error frame filter on 'dev'.
 * If the error filter cannot be registered, the CAN filters are taken
 * down again so that nothing stays registered on failure.
 *
 * Returns 0 on success or the error of the failing registration.
 */
static int raw_enable_allfilters(struct net *net, struct net_device *dev,
                                 struct sock *sk)
{
        struct raw_sock *ro = raw_sk(sk);
        int err;

        err = raw_enable_filters(net, dev, sk, ro->filter, ro->count);
        if (err)
                return err;

        err = raw_enable_errfilter(net, dev, sk, ro->err_mask);
        if (err)
                raw_disable_filters(net, dev, sk, ro->filter, ro->count);

        return err;
}

/* Handle a netdevice event for one raw socket. Sockets in another
 * network namespace, or whose ro->dev is not 'dev', are left alone.
 *
 * NETDEV_UNREGISTER: drop the filters and the device reference, reset the
 * binding state and report ENODEV on the socket.
 * NETDEV_DOWN: report ENETDOWN on the socket.
 * Any other message is ignored.
 */
static void raw_notify(struct raw_sock *ro, unsigned long msg,
                       struct net_device *dev)
{
        struct sock *sk = &ro->sk;
        int sk_err;

        if (!net_eq(dev_net(dev), sock_net(sk)) || ro->dev != dev)
                return;

        if (msg == NETDEV_UNREGISTER) {
                lock_sock(sk);
                /* remove current filters & unregister */
                if (ro->bound) {
                        raw_disable_allfilters(dev_net(dev), dev, sk);
                        netdev_put(dev, &ro->dev_tracker);
                }

                /* a single filter lives in ro->dfilter, nothing to free */
                if (ro->count > 1)
                        kfree(ro->filter);

                ro->ifindex = 0;
                ro->bound = 0;
                ro->dev = NULL;
                ro->count = 0;
                release_sock(sk);

                sk_err = ENODEV;
        } else if (msg == NETDEV_DOWN) {
                sk_err = ENETDOWN;
        } else {
                return;
        }

        sk->sk_err = sk_err;
        if (!sock_flag(sk, SOCK_DEAD))
                sk_error_report(sk);
}

/* Netdevice notifier callback: forward NETDEV_UNREGISTER and NETDEV_DOWN
 * events of CAN devices to every socket on raw_notifier_list.
 *
 * Always returns NOTIFY_DONE.
 */
static int raw_notifier(struct notifier_block *nb, unsigned long msg,
                        void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (dev->type != ARPHRD_CAN)
                return NOTIFY_DONE;
        if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
                return NOTIFY_DONE;
        if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */
                return NOTIFY_DONE;

        /* The global raw_busy_notifier serves as the list cursor, so that
         * raw_release() can see which socket is being handled. The lock is
         * dropped around raw_notify() and retaken before the cursor moves
         * on to the next entry.
         */
        spin_lock(&raw_notifier_lock);
        list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) {
                spin_unlock(&raw_notifier_lock);
                raw_notify(raw_busy_notifier, msg, dev);
                spin_lock(&raw_notifier_lock);
        }
        /* walk finished: mark the notifier as idle again */
        raw_busy_notifier = NULL;
        spin_unlock(&raw_notifier_lock);
        return NOTIFY_DONE;
}

/* Set up the raw socket specific part of a new socket: unbound, one
 * built-in filter, default options, per-CPU duplicate tracking, and
 * finally membership in raw_notifier_list.
 *
 * Returns 0 on success, -ENOMEM if the per-CPU data cannot be allocated.
 */
static int raw_init(struct sock *sk)
{
        struct raw_sock *ro = raw_sk(sk);

        /* not bound to anything yet */
        ro->dev = NULL;
        ro->ifindex = 0;
        ro->bound = 0;

        /* set default filter to single entry dfilter */
        ro->dfilter.can_mask = MASK_ALL;
        ro->dfilter.can_id = 0;
        ro->count = 1;
        ro->filter = &ro->dfilter;

        /* set default loopback behaviour */
        ro->loopback = 1;
        ro->recv_own_msgs = 0;

        /* the remaining options start out disabled */
        ro->fd_frames = 0;
        ro->xl_frames = 0;
        ro->join_filters = 0;

        /* alloc_percpu provides zero'ed memory */
        ro->uniq = alloc_percpu(struct uniqframe);
        if (unlikely(!ro->uniq))
                return -ENOMEM;

        /* set notifier */
        spin_lock(&raw_notifier_lock);
        list_add_tail(&ro->notifier, &raw_notifier_list);
        spin_unlock(&raw_notifier_lock);

        return 0;
}

/* Release a raw socket: take it off the notifier list, unregister its
 * filters, drop the device reference, free the filter array and per-CPU
 * data, and finally drop the socket itself.
 *
 * Always returns 0; a socket without sk is silently accepted.
 */
static int raw_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct raw_sock *ro;
        struct net *net;

        if (!sk)
                return 0;

        ro = raw_sk(sk);
        net = sock_net(sk);

        /* raw_notifier() may be working on this very socket right now:
         * poll, sleeping in between, until it has moved on, and only then
         * unlink the socket from the notifier list
         */
        spin_lock(&raw_notifier_lock);
        while (raw_busy_notifier == ro) {
                spin_unlock(&raw_notifier_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&raw_notifier_lock);
        }
        list_del(&ro->notifier);
        spin_unlock(&raw_notifier_lock);

        rtnl_lock();
        lock_sock(sk);

        /* remove current filters & unregister */
        if (ro->bound) {
                if (ro->dev) {
                        raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk);
                        netdev_put(ro->dev, &ro->dev_tracker);
                } else {
                        /* filters were registered without a device */
                        raw_disable_allfilters(net, NULL, sk);
                }
        }

        /* a single filter lives in ro->dfilter, nothing to free then */
        if (ro->count > 1)
                kfree(ro->filter);

        ro->ifindex = 0;
        ro->bound = 0;
        ro->dev = NULL;
        ro->count = 0;
        free_percpu(ro->uniq);

        /* detach sk from the struct socket */
        sock_orphan(sk);
        sock->sk = NULL;

        release_sock(sk);
        rtnl_unlock();

        sock_prot_inuse_add(net, sk->sk_prot, -1);
        sock_put(sk);

        return 0;
}

/*
 * raw_bind - bind a CAN_RAW socket to a CAN interface index
 * @sock:  socket to bind
 * @uaddr: address, interpreted as struct sockaddr_can
 * @len:   length of @uaddr in bytes
 *
 * A non-zero can_ifindex selects a specific device; can_ifindex == 0
 * registers the filters with a NULL device.  The filters for the new
 * binding are registered first and only on success are the filters of a
 * previous binding removed.
 *
 * Returns 0 on success (including a re-bind to the already bound ifindex),
 * -EINVAL if @len is smaller than RAW_MIN_NAMELEN or the family is not
 * AF_CAN, -ENODEV if the index does not resolve to a device of type
 * ARPHRD_CAN, or the error returned by raw_enable_allfilters().
 */
static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
{
        struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
        struct sock *sk = sock->sk;
        struct raw_sock *ro = raw_sk(sk);
        struct net_device *dev = NULL;
        int ifindex;
        int err = 0;
        int notify_enetdown = 0;

        if (len < RAW_MIN_NAMELEN)
                return -EINVAL;
        if (addr->can_family != AF_CAN)
                return -EINVAL;

        rtnl_lock();
        lock_sock(sk);

        /* already bound to the requested ifindex -> nothing to do */
        if (ro->bound && addr->can_ifindex == ro->ifindex)
                goto out;

        if (addr->can_ifindex) {
                /* takes a device reference that is dropped at out_put_dev */
                dev = dev_get_by_index(sock_net(sk), addr->can_ifindex);
                if (!dev) {
                        err = -ENODEV;
                        goto out;
                }
                if (dev->type != ARPHRD_CAN) {
                        err = -ENODEV;
                        goto out_put_dev;
                }

                /*
                 * Remember that the interface is down; the error is
                 * reported on the socket after the locks are released.
                 * NOTE(review): this flag is set before
                 * raw_enable_allfilters() runs, so ENETDOWN is also
                 * reported when that call fails.
                 */
                if (!(dev->flags & IFF_UP))
                        notify_enetdown = 1;

                ifindex = dev->ifindex;

                /* filters set by default/setsockopt */
                err = raw_enable_allfilters(sock_net(sk), dev, sk);
                if (err)
                        goto out_put_dev;

        } else {
                ifindex = 0;

                /* filters set by default/setsockopt */
                err = raw_enable_allfilters(sock_net(sk), NULL, sk);
        }

        if (!err) {
                if (ro->bound) {
                        /* unregister old filters */
                        if (ro->dev) {
                                raw_disable_allfilters(dev_net(ro->dev),
                                                       ro->dev, sk);
                                /* drop reference to old ro->dev */
                                netdev_put(ro->dev, &ro->dev_tracker);
                        } else {
                                raw_disable_allfilters(sock_net(sk), NULL, sk);
                        }
                }
                ro->ifindex = ifindex;
                ro->bound = 1;
                /* bind() ok -> hold a reference for new ro->dev */
                ro->dev = dev;
                if (ro->dev)
                        netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL);
        }

out_put_dev:
        /* remove potential reference from dev_get_by_index() */
        dev_put(dev);
out:
        release_sock(sk);
        rtnl_unlock();

        /* report the "network down" condition outside of the locks */
        if (notify_enetdown) {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);
        }

        return err;
}

/*
 * raw_getname - report the local address of a CAN_RAW socket
 * @sock:  socket to query
 * @uaddr: output buffer, filled as struct sockaddr_can
 * @peer:  non-zero when the peer address is requested
 *
 * Zeroes the first RAW_MIN_NAMELEN bytes of @uaddr and stores AF_CAN and
 * the ifindex recorded in the socket.
 *
 * Returns RAW_MIN_NAMELEN, or -EOPNOTSUPP when @peer is set.
 */
static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
                       int peer)
{
        struct sockaddr_can *can_addr;
        struct raw_sock *ro;

        /* a peer name lookup is not supported */
        if (peer)
                return -EOPNOTSUPP;

        ro = raw_sk(sock->sk);
        can_addr = (struct sockaddr_can *)uaddr;

        memset(can_addr, 0, RAW_MIN_NAMELEN);
        can_addr->can_ifindex = ro->ifindex;
        can_addr->can_family = AF_CAN;

        return RAW_MIN_NAMELEN;
}

/*
 * raw_setsockopt - set a SOL_CAN_RAW socket option
 * @sock:    socket to configure
 * @level:   option level, must be SOL_CAN_RAW
 * @optname: one of the CAN_RAW_* options handled in the switch below
 * @optval:  option value provided by the caller
 * @optlen:  length of @optval in bytes
 *
 * CAN_RAW_FILTER and CAN_RAW_ERR_FILTER run under rtnl_lock() and the
 * socket lock.  On a bound socket they register the new filter set before
 * the old one is unregistered.  The remaining options copy the value into
 * the socket without taking these locks.
 *
 * Returns 0 on success, -EINVAL for a wrong level, a bad @optlen or an
 * attempt to disable CAN FD while CAN XL is enabled, -EFAULT if the option
 * value cannot be copied, -ENODEV if the bound device is not in state
 * NETREG_REGISTERED, -ENOPROTOOPT for an unknown option, or the error
 * returned by memdup_sockptr() or the filter registration helpers.
 */
static int raw_setsockopt(struct socket *sock, int level, int optname,
                          sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct raw_sock *ro = raw_sk(sk);
        struct can_filter *filter = NULL;  /* dyn. alloc'ed filters */
        struct can_filter sfilter;         /* single filter */
        struct net_device *dev = NULL;
        can_err_mask_t err_mask = 0;
        int fd_frames;
        int count = 0;
        int err = 0;

        if (level != SOL_CAN_RAW)
                return -EINVAL;

        switch (optname) {
        case CAN_RAW_FILTER:
                /* the value must be a whole number of struct can_filter */
                if (optlen % sizeof(struct can_filter) != 0)
                        return -EINVAL;

                if (optlen > CAN_RAW_FILTER_MAX * sizeof(struct can_filter))
                        return -EINVAL;

                count = optlen / sizeof(struct can_filter);

                if (count > 1) {
                        /* filter does not fit into dfilter => alloc space */
                        filter = memdup_sockptr(optval, optlen);
                        if (IS_ERR(filter))
                                return PTR_ERR(filter);
                } else if (count == 1) {
                        if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter)))
                                return -EFAULT;
                }

                rtnl_lock();
                lock_sock(sk);

                /* refuse to touch filters of a device that is not registered */
                dev = ro->dev;
                if (ro->bound && dev) {
                        if (dev->reg_state != NETREG_REGISTERED) {
                                if (count > 1)
                                        kfree(filter);
                                err = -ENODEV;
                                goto out_fil;
                        }
                }

                if (ro->bound) {
                        /* (try to) register the new filters */
                        if (count == 1)
                                err = raw_enable_filters(sock_net(sk), dev, sk,
                                                         &sfilter, 1);
                        else
                                err = raw_enable_filters(sock_net(sk), dev, sk,
                                                         filter, count);
                        if (err) {
                                /* keep the old filters, drop the new copy */
                                if (count > 1)
                                        kfree(filter);
                                goto out_fil;
                        }

                        /* remove old filter registrations */
                        raw_disable_filters(sock_net(sk), dev, sk, ro->filter,
                                            ro->count);
                }

                /* remove old filter space */
                if (ro->count > 1)
                        kfree(ro->filter);

                /* link new filters to the socket */
                if (count == 1) {
                        /* copy filter data for single filter */
                        ro->dfilter = sfilter;
                        filter = &ro->dfilter;
                }
                /* with count == 0 the filter pointer stays NULL */
                ro->filter = filter;
                ro->count  = count;

 out_fil:
                release_sock(sk);
                rtnl_unlock();

                break;

        case CAN_RAW_ERR_FILTER:
                if (optlen != sizeof(err_mask))
                        return -EINVAL;

                if (copy_from_sockptr(&err_mask, optval, optlen))
                        return -EFAULT;

                /* only bits inside CAN_ERR_MASK are kept */
                err_mask &= CAN_ERR_MASK;

                rtnl_lock();
                lock_sock(sk);

                /* refuse to touch filters of a device that is not registered */
                dev = ro->dev;
                if (ro->bound && dev) {
                        if (dev->reg_state != NETREG_REGISTERED) {
                                err = -ENODEV;
                                goto out_err;
                        }
                }

                /* replace the registered error mask of a bound socket */
                if (ro->bound) {
                        /* (try to) register the new err_mask */
                        err = raw_enable_errfilter(sock_net(sk), dev, sk,
                                                   err_mask);

                        if (err)
                                goto out_err;

                        /* remove old err_mask registration */
                        raw_disable_errfilter(sock_net(sk), dev, sk,
                                              ro->err_mask);
                }

                /* link new err_mask to the socket */
                ro->err_mask = err_mask;

 out_err:
                release_sock(sk);
                rtnl_unlock();

                break;

        case CAN_RAW_LOOPBACK:
                if (optlen != sizeof(ro->loopback))
                        return -EINVAL;

                /* the value is copied straight into the socket */
                if (copy_from_sockptr(&ro->loopback, optval, optlen))
                        return -EFAULT;

                break;

        case CAN_RAW_RECV_OWN_MSGS:
                if (optlen != sizeof(ro->recv_own_msgs))
                        return -EINVAL;

                if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen))
                        return -EFAULT;

                break;

        case CAN_RAW_FD_FRAMES:
                if (optlen != sizeof(fd_frames))
                        return -EINVAL;

                /* use a local copy so a rejected value never reaches ro */
                if (copy_from_sockptr(&fd_frames, optval, optlen))
                        return -EFAULT;

                /* Enabling CAN XL includes CAN FD */
                if (ro->xl_frames && !fd_frames)
                        return -EINVAL;

                ro->fd_frames = fd_frames;
                break;

        case CAN_RAW_XL_FRAMES:
                if (optlen != sizeof(ro->xl_frames))
                        return -EINVAL;

                if (copy_from_sockptr(&ro->xl_frames, optval, optlen))
                        return -EFAULT;

                /* Enabling CAN XL includes CAN FD */
                if (ro->xl_frames)
                        ro->fd_frames = ro->xl_frames;
                break;

        case CAN_RAW_XL_VCID_OPTS:
                if (optlen != sizeof(ro->raw_vcid_opts))
                        return -EINVAL;

                if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen))
                        return -EFAULT;

                /* prepare 32 bit values for handling in hot path */
                ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET;
                ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET;
                ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET;
                break;

        case CAN_RAW_JOIN_FILTERS:
                if (optlen != sizeof(ro->join_filters))
                        return -EINVAL;

                if (copy_from_sockptr(&ro->join_filters, optval, optlen))
                        return -EFAULT;

                break;

        default:
                return -ENOPROTOOPT;
        }
        return err;
}

/*
 * raw_getsockopt - read a SOL_CAN_RAW socket option
 * @sock:    socket to query
 * @level:   option level, must be SOL_CAN_RAW
 * @optname: one of the CAN_RAW_* options handled in the switch below
 * @optval:  user space buffer receiving the option value
 * @optlen:  in: size of @optval, out: number of bytes written (or needed)
 *
 * CAN_RAW_FILTER and CAN_RAW_XL_VCID_OPTS return from inside the switch.
 * When the user buffer is too small they fail with -ERANGE and store the
 * required size in @optlen.  All other options select a value pointer and
 * a length clamped to the size of the option, and share the copy-out code
 * at the end of the function.
 *
 * Returns 0 on success, -EINVAL for a wrong level or a negative length,
 * -EFAULT if user memory cannot be accessed, -ERANGE as described above
 * and -ENOPROTOOPT for an unknown option.
 */
static int raw_getsockopt(struct socket *sock, int level, int optname,
                          char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct raw_sock *ro = raw_sk(sk);
        int len;
        void *val;

        if (level != SOL_CAN_RAW)
                return -EINVAL;
        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case CAN_RAW_FILTER: {
                int err = 0;

                /* the socket lock is held while the filter list is read */
                lock_sock(sk);
                if (ro->count > 0) {
                        int fsize = ro->count * sizeof(struct can_filter);

                        /* user space buffer too small for filter list? */
                        if (len < fsize) {
                                /* return -ERANGE and needed space in optlen */
                                err = -ERANGE;
                                if (put_user(fsize, optlen))
                                        err = -EFAULT;
                        } else {
                                if (len > fsize)
                                        len = fsize;
                                if (copy_to_user(optval, ro->filter, len))
                                        err = -EFAULT;
                        }
                } else {
                        /* no filters set -> report a length of zero */
                        len = 0;
                }
                release_sock(sk);

                if (!err)
                        err = put_user(len, optlen);
                return err;
        }
        case CAN_RAW_ERR_FILTER:
                if (len > sizeof(can_err_mask_t))
                        len = sizeof(can_err_mask_t);
                val = &ro->err_mask;
                break;

        case CAN_RAW_LOOPBACK:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = &ro->loopback;
                break;

        case CAN_RAW_RECV_OWN_MSGS:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = &ro->recv_own_msgs;
                break;

        case CAN_RAW_FD_FRAMES:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = &ro->fd_frames;
                break;

        case CAN_RAW_XL_FRAMES:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = &ro->xl_frames;
                break;

        case CAN_RAW_XL_VCID_OPTS: {
                int err = 0;

                /* user space buffer too small for VCID opts? */
                if (len < sizeof(ro->raw_vcid_opts)) {
                        /* return -ERANGE and needed space in optlen */
                        err = -ERANGE;
                        if (put_user(sizeof(ro->raw_vcid_opts), optlen))
                                err = -EFAULT;
                } else {
                        if (len > sizeof(ro->raw_vcid_opts))
                                len = sizeof(ro->raw_vcid_opts);
                        if (copy_to_user(optval, &ro->raw_vcid_opts, len))
                                err = -EFAULT;
                }
                if (!err)
                        err = put_user(len, optlen);
                return err;
        }
        case CAN_RAW_JOIN_FILTERS:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = &ro->join_filters;
                break;

        default:
                return -ENOPROTOOPT;
        }

        /* common copy-out: report the (clamped) length, then the value */
        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, val, len))
                return -EFAULT;
        return 0;
}

/*
 * raw_put_canxl_vcid - apply the socket's VCID options to a CAN XL frame
 * @ro:  socket providing raw_vcid_opts and the pre-shifted tx VCID
 * @skb: buffer whose data is interpreted as struct canxl_frame
 *
 * The prio field is first reduced to its priority and VCID bits.  With
 * CAN_RAW_XL_VCID_TX_SET the VCID bits are replaced by the socket's
 * tx_vcid_shifted value; otherwise they are kept only when
 * CAN_RAW_XL_VCID_TX_PASS is set.
 */
static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb)
{
        struct canxl_frame *frame = (struct canxl_frame *)skb->data;

        /* sanitize non CAN XL bits */
        frame->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK);

        /* an enabled tx VCID overrides whatever the frame carried */
        if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) {
                frame->prio = (frame->prio & CANXL_PRIO_MASK) |
                              ro->tx_vcid_shifted;
                return;
        }

        /* no pass through -> strip the VCID from the frame */
        if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS))
                frame->prio &= CANXL_PRIO_MASK;
}

/*
 * raw_check_txframe - classify an outgoing frame
 * @ro:  socket providing the fd_frames / xl_frames settings
 * @skb: buffer holding the frame to be sent
 * @mtu: MTU of the outgoing device
 *
 * Returns CAN_MTU, CANFD_MTU or CANXL_MTU for a frame that may be sent on
 * this socket and device, or 0 if the frame has to be rejected.
 */
static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu)
{
        /* Classical CAN is accepted regardless of flags and device */
        if (can_is_can_skb(skb))
                return CAN_MTU;

        /* CAN FD: option enabled and a CAN FD or CAN XL capable device */
        if (ro->fd_frames && can_is_canfd_skb(skb)) {
                if (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))
                        return CANFD_MTU;
        }

        /* CAN XL: option enabled and a CAN XL capable device */
        if (ro->xl_frames && can_is_canxl_skb(skb)) {
                if (can_is_canxl_dev_mtu(mtu))
                        return CANXL_MTU;
        }

        return 0;
}

/*
 * raw_sendmsg - send one CAN frame through a CAN_RAW socket
 * @sock: sending socket
 * @msg:  message holding the frame and an optional struct sockaddr_can
 * @size: number of payload bytes in @msg
 *
 * The outgoing interface is taken from msg->msg_name if present, otherwise
 * from the ifindex stored in the socket.  The frame is copied into a newly
 * allocated skb, validated by raw_check_txframe() and handed to can_send().
 *
 * Returns @size on success.  Returns -EINVAL for a size outside of
 * [CANXL_HDR_SIZE + CANXL_MIN_DLEN, CANXL_MTU], a short or non AF_CAN
 * address, or a frame rejected by raw_check_txframe(); -ENXIO if the
 * interface index does not resolve to a device; otherwise the error of
 * sock_alloc_send_skb(), memcpy_from_msg(), sock_cmsg_send() or can_send().
 */
static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        struct raw_sock *ro = raw_sk(sk);
        struct sockcm_cookie sockc;
        struct sk_buff *skb;
        struct net_device *dev;
        unsigned int txmtu;
        int ifindex;
        int err = -EINVAL;

        /* check for valid CAN frame sizes */
        if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU)
                return -EINVAL;

        /* an explicit destination address overrides the bound ifindex */
        if (msg->msg_name) {
                DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);

                if (msg->msg_namelen < RAW_MIN_NAMELEN)
                        return -EINVAL;

                if (addr->can_family != AF_CAN)
                        return -EINVAL;

                ifindex = addr->can_ifindex;
        } else {
                ifindex = ro->ifindex;
        }

        /* takes a device reference that every path below has to drop */
        dev = dev_get_by_index(sock_net(sk), ifindex);
        if (!dev)
                return -ENXIO;

        /* on failure err has been set by sock_alloc_send_skb() */
        skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv),
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb)
                goto put_dev;

        can_skb_reserve(skb);
        can_skb_prv(skb)->ifindex = dev->ifindex;
        can_skb_prv(skb)->skbcnt = 0;

        /* fill the skb before testing for valid CAN frames */
        err = memcpy_from_msg(skb_put(skb, size), msg, size);
        if (err < 0)
                goto free_skb;

        /* default error for the frame content check below */
        err = -EINVAL;

        /* check for valid CAN (CC/FD/XL) frame content */
        txmtu = raw_check_txframe(ro, skb, dev->mtu);
        if (!txmtu)
                goto free_skb;

        /* only CANXL: clear/forward/set VCID value */
        if (txmtu == CANXL_MTU)
                raw_put_canxl_vcid(ro, skb);

        /* start from the socket defaults, then apply control messages */
        sockcm_init(&sockc, sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err))
                        goto free_skb;
        }

        skb->dev = dev;
        skb->priority = sockc.priority;
        skb->mark = sockc.mark;
        skb->tstamp = sockc.transmit_time;

        skb_setup_tx_timestamp(skb, &sockc);

        /*
         * The skb is not freed here when can_send() fails.
         * NOTE(review): presumably can_send() consumes the skb in every
         * case - confirm against its implementation.
         */
        err = can_send(skb, ro->loopback);

        dev_put(dev);

        if (err)
                goto send_failed;

        return size;

free_skb:
        kfree_skb(skb);
put_dev:
        dev_put(dev);
send_failed:
        return err;
}

/*
 * raw_recvmsg - receive one CAN frame from a CAN_RAW socket
 * @sock:  receiving socket
 * @msg:   message receiving the frame, control data and source address
 * @size:  size of the caller's buffer in bytes
 * @flags: MSG_* flags of the receive call
 *
 * With MSG_ERRQUEUE the request is passed on to sock_recv_errqueue().
 * Otherwise one datagram is dequeued and copied to @msg; a frame larger
 * than @size is cut and flagged with MSG_TRUNC.
 *
 * Returns the number of bytes copied, or a negative error from
 * skb_recv_datagram() or memcpy_to_msg().
 */
static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                       int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int ret = 0;

        if (flags & MSG_ERRQUEUE)
                return sock_recv_errqueue(sk, msg, size,
                                          SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE);

        skb = skb_recv_datagram(sk, flags, &ret);
        if (!skb)
                return ret;

        /* never copy more than the frame holds; flag a cut frame */
        if (size >= skb->len)
                size = skb->len;
        else
                msg->msg_flags |= MSG_TRUNC;

        ret = memcpy_to_msg(msg, skb->data, size);
        if (ret < 0)
                goto free_skb;

        sock_recv_cmsgs(msg, sk, skb);

        /* the source address is taken from the skb control buffer */
        if (msg->msg_name) {
                __sockaddr_check_size(RAW_MIN_NAMELEN);
                msg->msg_namelen = RAW_MIN_NAMELEN;
                memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
        }

        /* assign the flags that have been recorded in raw_rcv() */
        msg->msg_flags |= *(raw_flags(skb));

        ret = size;

free_skb:
        skb_free_datagram(sk, skb);
        return ret;
}

/*
 * raw_sock_no_ioctlcmd - ioctl handler of the CAN_RAW socket layer
 * @sock: socket the ioctl was issued on (unused)
 * @cmd:  ioctl command (unused)
 * @arg:  ioctl argument (unused)
 *
 * Always returns -ENOIOCTLCMD.
 */
static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
                                unsigned long arg)
{
        /* no ioctls for socket layer -> hand it down to NIC layer */
        return -ENOIOCTLCMD;
}

/*
 * Socket operations of CAN_RAW sockets.  Operations without a raw_*
 * implementation in this file are wired to generic helpers (sock_no_*,
 * datagram_poll, sock_gettstamp).
 */
static const struct proto_ops raw_ops = {
        .family        = PF_CAN,
        .release       = raw_release,
        .bind          = raw_bind,
        .connect       = sock_no_connect,
        .socketpair    = sock_no_socketpair,
        .accept        = sock_no_accept,
        .getname       = raw_getname,
        .poll          = datagram_poll,
        .ioctl         = raw_sock_no_ioctlcmd,
        .gettstamp     = sock_gettstamp,
        .listen        = sock_no_listen,
        .shutdown      = sock_no_shutdown,
        .setsockopt    = raw_setsockopt,
        .getsockopt    = raw_getsockopt,
        .sendmsg       = raw_sendmsg,
        .recvmsg       = raw_recvmsg,
        .mmap          = sock_no_mmap,
};

/*
 * Protocol descriptor of CAN_RAW: the per-socket object has the size of
 * struct raw_sock and is set up by raw_init().
 */
static struct proto raw_proto __read_mostly = {
        .name       = "CAN_RAW",
        .owner      = THIS_MODULE,
        .obj_size   = sizeof(struct raw_sock),
        .init       = raw_init,
};

/*
 * Ties socket type SOCK_RAW and protocol CAN_RAW to raw_ops and raw_proto;
 * passed to can_proto_register() in raw_module_init().
 */
static const struct can_proto raw_can_proto = {
        .type       = SOCK_RAW,
        .protocol   = CAN_RAW,
        .ops        = &raw_ops,
        .prot       = &raw_proto,
};

/*
 * Notifier block calling raw_notifier(); registered with
 * register_netdevice_notifier() in raw_module_init().
 */
static struct notifier_block canraw_notifier = {
        .notifier_call = raw_notifier
};

/*
 * raw_module_init - module entry point of the CAN raw protocol
 *
 * Registers the netdevice notifier first and the CAN_RAW protocol second.
 * If the protocol registration fails, the notifier is unregistered again.
 *
 * Returns 0 on success or the error of the failing registration.
 */
static __init int raw_module_init(void)
{
        int ret;

        pr_info("can: raw protocol\n");

        ret = register_netdevice_notifier(&canraw_notifier);
        if (ret)
                return ret;

        ret = can_proto_register(&raw_can_proto);
        if (ret >= 0)
                return 0;

        /* roll back the notifier registration done above */
        pr_err("can: registration of raw protocol failed\n");
        unregister_netdevice_notifier(&canraw_notifier);

        return ret;
}

/*
 * raw_module_exit - module exit point of the CAN raw protocol
 *
 * Undoes raw_module_init() in reverse order: the protocol is unregistered
 * before the netdevice notifier.
 */
static __exit void raw_module_exit(void)
{
        can_proto_unregister(&raw_can_proto);
        unregister_netdevice_notifier(&canraw_notifier);
}

/* module entry and exit points */
module_init(raw_module_init);
module_exit(raw_module_exit);

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   34 




   34 




















   34 













































































































































































































































































































































































































































































































































































   34 
   34 
























































  671 




















  202 




  202 





































































    8 










    8 




























 1208 





























 1208 
 1208 
































































  187 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock - Filesystem management and hooks
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 * Copyright © 2021-2025 Microsoft Corporation
 * Copyright © 2022 Günther Noack <gnoack3000@gmail.com>
 * Copyright © 2023-2024 Google LLC
 */

#include <asm/ioctls.h>
#include <kunit/test.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/bits.h>
#include <linux/compiler_types.h>
#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/lsm_audit.h>
#include <linux/lsm_hooks.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/path.h>
#include <linux/pid.h>
#include <linux/rcupdate.h>
#include <linux/sched/signal.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/types.h>
#include <linux/wait_bit.h>
#include <linux/workqueue.h>
#include <uapi/linux/fiemap.h>
#include <uapi/linux/landlock.h>

#include "access.h"
#include "audit.h"
#include "common.h"
#include "cred.h"
#include "domain.h"
#include "fs.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"
#include "setup.h"

/* Underlying object management */

/*
 * release_inode - Drop the inode reference held by a Landlock object
 *
 * @object: Object being released; @object->lock is held on entry.
 *
 * This is the .release callback of landlock_fs_underops.  Whatever path is
 * taken, @object->lock is released before returning, as stated by the
 * __releases() annotation.
 */
static void release_inode(struct landlock_object *const object)
        __releases(object->lock)
{
        struct inode *const inode = object->underobj;
        struct super_block *sb;

        /* No underlying inode left to put: only the lock must be dropped. */
        if (!inode) {
                spin_unlock(&object->lock);
                return;
        }

        /*
         * Protects against concurrent use by hook_sb_delete() of the reference
         * to the underlying inode.
         */
        object->underobj = NULL;
        /*
         * Makes sure that if the filesystem is concurrently unmounted,
         * hook_sb_delete() will wait for us to finish iput().
         */
        sb = inode->i_sb;
        atomic_long_inc(&landlock_superblock(sb)->inode_refs);
        spin_unlock(&object->lock);
        /*
         * Because object->underobj was not NULL, hook_sb_delete() and
         * get_inode_object() guarantee that it is safe to reset
         * landlock_inode(inode)->object while it is not NULL.  It is therefore
         * not necessary to lock inode->i_lock.
         */
        rcu_assign_pointer(landlock_inode(inode)->object, NULL);
        /*
         * Now, new rules can safely be tied to @inode with get_inode_object().
         */

        iput(inode);
        /*
         * Balances the atomic_long_inc() above and, when the counter drops to
         * zero, wakes up whoever waits on inode_refs.
         */
        if (atomic_long_dec_and_test(&landlock_superblock(sb)->inode_refs))
                wake_up_var(&landlock_superblock(sb)->inode_refs);
}

/*
 * Operations attached to inode-backed Landlock objects; passed to
 * landlock_create_object() by get_inode_object().
 */
static const struct landlock_object_underops landlock_fs_underops = {
        .release = release_inode
};

/* IOCTL helpers */

/**
 * is_masked_device_ioctl - Determine whether an IOCTL command is always
 * permitted with Landlock for device files.  These commands cannot be
 * restricted on device files by enforcing a Landlock policy.
 *
 * @cmd: The IOCTL command that is supposed to be run.
 *
 * By default, any IOCTL on a device file requires the
 * LANDLOCK_ACCESS_FS_IOCTL_DEV right.  However, we blanket-permit some
 * commands, if:
 *
 * 1. The command is implemented in fs/ioctl.c's do_vfs_ioctl(),
 *    not in f_ops->unlocked_ioctl() or f_ops->compat_ioctl().
 *
 * 2. The command is harmless when invoked on devices.
 *
 * We also permit commands that do not make sense for devices, but where the
 * do_vfs_ioctl() implementation returns a more conventional error code.
 *
 * Any new IOCTL commands that are implemented in fs/ioctl.c's do_vfs_ioctl()
 * should be considered for inclusion here.
 *
 * Returns: true if the IOCTL @cmd cannot be restricted with Landlock for
 * device files.
 */
static __attribute_const__ bool is_masked_device_ioctl(const unsigned int cmd)
{
        switch (cmd) {
        /*
         * FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's
         * close-on-exec and the file's buffered-IO and async flags.  These
         * operations are also available through fcntl(2), and are
         * unconditionally permitted in Landlock.
         */
        case FIOCLEX:
        case FIONCLEX:
        case FIONBIO:
        case FIOASYNC:
        /*
         * FIOQSIZE queries the size of a regular file, directory, or link.
         *
         * We still permit it, because it always returns -ENOTTY for
         * other file types.
         */
        case FIOQSIZE:
        /*
         * FIFREEZE and FITHAW freeze and thaw the file system which the
         * given file belongs to.  Requires CAP_SYS_ADMIN.
         *
         * These commands operate on the file system's superblock rather
         * than on the file itself.  The same operations can also be
         * done through any other file or directory on the same file
         * system, so it is safe to permit these.
         */
        case FIFREEZE:
        case FITHAW:
        /*
         * FS_IOC_FIEMAP queries information about the allocation of
         * blocks within a file.
         *
         * This IOCTL command only makes sense for regular files and is
         * not implemented by devices. It is harmless to permit.
         */
        case FS_IOC_FIEMAP:
        /*
         * FIGETBSZ queries the file system's block size for a file or
         * directory.
         *
         * This command operates on the file system's superblock rather
         * than on the file itself.  The same operation can also be done
         * through any other file or directory on the same file system,
         * so it is safe to permit it.
         */
        case FIGETBSZ:
        /*
         * FICLONE, FICLONERANGE and FIDEDUPERANGE make files share
         * their underlying storage ("reflink") between source and
         * destination FDs, on file systems which support that.
         *
         * These IOCTL commands only apply to regular files
         * and are harmless to permit for device files.
         */
        case FICLONE:
        case FICLONERANGE:
        case FIDEDUPERANGE:
        /*
         * FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH both operate on
         * the file system superblock, not on the specific file, so
         * these operations are available through any other file on the
         * same file system as well.
         */
        case FS_IOC_GETFSUUID:
        case FS_IOC_GETFSSYSFSPATH:
                /* Every case label above falls through to this verdict. */
                return true;

        /*
         * FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and
         * FS_IOC_FSSETXATTR are forwarded to device implementations.
         */

        /*
         * file_ioctl() commands (FIBMAP, FS_IOC_RESVSP, FS_IOC_RESVSP64,
         * FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and FS_IOC_ZERO_RANGE) are
         * forwarded to device implementations, so not permitted.
         */

        /* Other commands are guarded by the access right. */
        default:
                return false;
        }
}

/*
 * is_masked_device_ioctl_compat - Compat-mode counterpart of
 * is_masked_device_ioctl().
 *
 * @cmd: The "compat" IOCTL command that is supposed to be run.
 *
 * Commands that get special handling in compat mode are decided here so that
 * they behave like their non-compat counterparts; every other command is
 * delegated to is_masked_device_ioctl().
 *
 * Returns: true if the IOCTL @cmd is blanket-permitted for device files,
 * false if it stays guarded by the access right.
 */
static __attribute_const__ bool
is_masked_device_ioctl_compat(const unsigned int cmd)
{
        switch (cmd) {
#if defined(CONFIG_X86_64)
        /*
         * The 32-bit space reservation commands (FS_IOC_RESVSP_32,
         * FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32, FS_IOC_UNRESVSP64_32 and
         * FS_IOC_ZERO_RANGE_32) are not blanket-permitted, for consistency
         * with their non-compat variants.
         */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
        case FS_IOC_ZERO_RANGE_32:
#endif
        /*
         * FS_IOC32_GETFLAGS and FS_IOC32_SETFLAGS are forwarded to their
         * device implementations.
         */
        case FS_IOC32_GETFLAGS:
        case FS_IOC32_SETFLAGS:
                return false;

        /* FICLONE is permitted, same as in the non-compat variant. */
        case FICLONE:
                return true;

        default:
                break;
        }

        /* No compat-specific handling: same verdict as the native command. */
        return is_masked_device_ioctl(cmd);
}

/* Ruleset management */

/*
 * get_inode_object - Get (or create) the Landlock object tied to an inode
 *
 * @inode: Inode to look up.
 *
 * Returns the object already referenced by landlock_inode(@inode)->object
 * with its usage counter incremented, or a freshly created object which is
 * then published in landlock_inode(@inode)->object.  Returns the error
 * pointer from landlock_create_object() if the creation fails.
 */
static struct landlock_object *get_inode_object(struct inode *const inode)
{
        struct landlock_object *object, *new_object;
        struct landlock_inode_security *inode_sec = landlock_inode(inode);

        rcu_read_lock();
        /* Every jump to this label is done with the RCU read lock held. */
retry:
        object = rcu_dereference(inode_sec->object);
        if (object) {
                if (likely(refcount_inc_not_zero(&object->usage))) {
                        rcu_read_unlock();
                        return object;
                }
                /*
                 * We are racing with release_inode(), the object is going
                 * away.  Wait for release_inode(), then retry.
                 */
                spin_lock(&object->lock);
                spin_unlock(&object->lock);
                goto retry;
        }
        rcu_read_unlock();

        /*
         * If there is no object tied to @inode, then create a new one (without
         * holding any locks).
         */
        new_object = landlock_create_object(&landlock_fs_underops, inode);
        if (IS_ERR(new_object))
                return new_object;

        /*
         * Protects against concurrent calls to get_inode_object() or
         * hook_sb_delete().
         */
        spin_lock(&inode->i_lock);
        if (unlikely(rcu_access_pointer(inode_sec->object))) {
                /* Someone else just created the object, bail out and retry. */
                spin_unlock(&inode->i_lock);
                /* The new object was never published, so it is freed directly. */
                kfree(new_object);

                rcu_read_lock();
                goto retry;
        }

        /*
         * @inode will be released by hook_sb_delete() on its superblock
         * shutdown, or by release_inode() when no more ruleset references the
         * related object.
         */
        ihold(inode);
        rcu_assign_pointer(inode_sec->object, new_object);
        spin_unlock(&inode->i_lock);
        return new_object;
}

/*
 * All access rights that can be tied to files.
 *
 * landlock_append_fs_rule() rejects any other right on a non-directory, and
 * no_more_access() uses this mask to tell file accesses apart from accesses
 * that only make sense for directories.
 */
/* clang-format off */
#define ACCESS_FILE ( \
        LANDLOCK_ACCESS_FS_EXECUTE | \
        LANDLOCK_ACCESS_FS_WRITE_FILE | \
        LANDLOCK_ACCESS_FS_READ_FILE | \
        LANDLOCK_ACCESS_FS_TRUNCATE | \
        LANDLOCK_ACCESS_FS_IOCTL_DEV)
/* clang-format on */

/*
 * Adds a rule granting @access_rights on the inode behind @path to @ruleset.
 *
 * @ruleset: Ruleset to extend; it is expected to have exactly one layer.
 * @path: Should have been checked by get_path_from_fd().
 * @access_rights: Rights to grant.  When @path is not a directory, only
 *     rights that are part of ACCESS_FILE are accepted.
 *
 * Returns: -EINVAL if @access_rights does not fit the file type or if
 * @ruleset does not have exactly one layer, the error reported by
 * get_inode_object(), or the result of landlock_insert_rule().
 */
int landlock_append_fs_rule(struct landlock_ruleset *const ruleset,
                            const struct path *const path,
                            access_mask_t access_rights)
{
        struct landlock_id id = {
                .type = LANDLOCK_KEY_INODE,
        };
        int err;

        /* A non-directory cannot carry rights outside of ACCESS_FILE. */
        if (!d_is_dir(path->dentry) && (access_rights & ~ACCESS_FILE))
                return -EINVAL;
        if (WARN_ON_ONCE(ruleset->num_layers != 1))
                return -EINVAL;

        /*
         * Turns relative access rights into absolute ones: every filesystem
         * access that is not part of the ruleset's access mask for layer 0
         * is added to the rule.
         */
        access_rights |= LANDLOCK_MASK_ACCESS_FS &
                         ~landlock_get_fs_access_mask(ruleset, 0);

        id.key.object = get_inode_object(d_backing_inode(path->dentry));
        if (IS_ERR(id.key.object))
                return PTR_ERR(id.key.object);

        mutex_lock(&ruleset->lock);
        err = landlock_insert_rule(ruleset, id, access_rights);
        mutex_unlock(&ruleset->lock);

        /*
         * The reference is dropped whatever @err is: landlock_insert_rule()
         * increments the refcount for the new object if needed.
         */
        landlock_put_object(id.key.object);
        return err;
}

/* Access-control management */

/*
 * Looks up the rule of @domain tied to the inode backing @dentry.
 *
 * The lifetime of the returned rule is tied to @domain.
 *
 * Returns NULL if no rule is found or if @dentry is negative.
 */
static const struct landlock_rule *
find_rule(const struct landlock_ruleset *const domain,
          const struct dentry *const dentry)
{
        const struct landlock_rule *found = NULL;
        const struct inode *backing;
        struct landlock_id id = {
                .type = LANDLOCK_KEY_INODE,
        };

        /* A nonexistent leaf has no inode, hence no rule. */
        if (!d_is_negative(dentry)) {
                backing = d_backing_inode(dentry);

                /* The object pointer is only dereferenced under RCU. */
                rcu_read_lock();
                id.key.object =
                        rcu_dereference(landlock_inode(backing)->object);
                found = landlock_find_rule(domain, id);
                rcu_read_unlock();
        }

        return found;
}

/*
 * Allows access to pseudo filesystems that will never be mountable (e.g.
 * sockfs, pipefs), but can still be reachable through
 * /proc/<pid>/fd/<file-descriptor>
 */
static bool is_nouser_or_private(const struct dentry *dentry)
{
        return (dentry->d_sb->s_flags & SB_NOUSER) ||
               (d_is_positive(dentry) &&
                unlikely(IS_PRIVATE(d_backing_inode(dentry))));
}

/* Access masks with every bit of the filesystem field set. */
static const struct access_masks any_fs = {
        .fs = ~0,
};

/*
 * Checks that a destination file hierarchy has at least the restrictions of
 * a source file hierarchy.  This is only used for link and rename actions.
 *
 * Each matrix entry is a mask of the layers denying the related access.  For
 * every relevant access, the layers denying it on both a parent and its child
 * must also deny it on the other parent.
 *
 * @layer_masks_parent1: Layer masks of the first parent.
 * @layer_masks_child1: Layer masks of the first child.
 * @child1_is_directory: When false, accesses that only make sense for
 *     directories are ignored for the first child.
 * @layer_masks_parent2: Layer masks of the second parent.
 * @layer_masks_child2: Optional child masks; when set, the inverted check
 *     (RENAME_EXCHANGE) is performed as well.
 * @child2_is_directory: Same as @child1_is_directory, for the second child.
 *
 * Returns true if no check fails, false otherwise.
 */
static bool no_more_access(
        const layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
        const layer_mask_t (*const layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS],
        const bool child1_is_directory,
        const layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
        const layer_mask_t (*const layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS],
        const bool child2_is_directory)
{
        unsigned long bit;

        for (bit = 0; bit < ARRAY_SIZE(*layer_masks_parent2); bit++) {
                /* Accesses outside ACCESS_FILE only matter for directories. */
                const bool file_right = (BIT_ULL(bit) & ACCESS_FILE) != 0;

                if (child1_is_directory || file_right) {
                        /*
                         * Layers restricting the source (i.e. inherited
                         * access rights without child exceptions) must all
                         * restrict the destination too:
                         * restrictions(parent2) >= restrictions(child1)
                         */
                        const layer_mask_t source =
                                (*layer_masks_parent1)[bit] &
                                (*layer_masks_child1)[bit];

                        if (source & ~(*layer_masks_parent2)[bit])
                                return false;
                }

                if (layer_masks_child2 &&
                    (child2_is_directory || file_right)) {
                        /*
                         * Inverted restrictions for RENAME_EXCHANGE:
                         * restrictions(parent1) >= restrictions(child2)
                         */
                        const layer_mask_t exchanged =
                                (*layer_masks_parent2)[bit] &
                                (*layer_masks_child2)[bit];

                        if (exchanged & ~(*layer_masks_parent1)[bit])
                                return false;
                }
        }

        return true;
}

/*
 * Shorthands expecting no_more_access() to return true or false.  They expand
 * to KUnit expectations and rely on a variable named "test" being in scope.
 */
#define NMA_TRUE(...) KUNIT_EXPECT_TRUE(test, no_more_access(__VA_ARGS__))
#define NMA_FALSE(...) KUNIT_EXPECT_FALSE(test, no_more_access(__VA_ARGS__))

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * Exercises no_more_access() with hand-built layer-mask matrices.
 *
 * Fixture naming: the letters give the accesses with a non-empty layer mask
 * (x: execute, r: read file, m: make regular file) and the digits give the
 * layer bits set for them.
 */
static void test_no_more_access(struct kunit *const test)
{
        const layer_mask_t rx0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_READ_FILE)] = BIT_ULL(0),
        };
        const layer_mask_t mx0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_MAKE_REG)] = BIT_ULL(0),
        };
        const layer_mask_t x0[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
        };
        const layer_mask_t x1[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(1),
        };
        const layer_mask_t x01[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0) |
                                                          BIT_ULL(1),
        };
        const layer_mask_t allows_all[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Checks without restriction. */
        NMA_TRUE(&x0, &allows_all, false, &allows_all, NULL, false);
        NMA_TRUE(&allows_all, &x0, false, &allows_all, NULL, false);
        NMA_FALSE(&x0, &x0, false, &allows_all, NULL, false);

        /*
         * Checks that we can only refer a file if no more access could be
         * inherited.
         */
        NMA_TRUE(&x0, &x0, false, &rx0, NULL, false);
        NMA_TRUE(&rx0, &rx0, false, &rx0, NULL, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, NULL, false);
        NMA_FALSE(&rx0, &rx0, false, &x1, NULL, false);

        /* Checks allowed referring with different nested domains. */
        NMA_TRUE(&x0, &x1, false, &x0, NULL, false);
        NMA_TRUE(&x1, &x0, false, &x0, NULL, false);
        NMA_TRUE(&x0, &x01, false, &x0, NULL, false);
        NMA_TRUE(&x0, &x01, false, &rx0, NULL, false);
        NMA_TRUE(&x01, &x0, false, &x0, NULL, false);
        NMA_TRUE(&x01, &x0, false, &rx0, NULL, false);
        NMA_FALSE(&x01, &x01, false, &x0, NULL, false);

        /* Checks that file access rights are also enforced for a directory. */
        NMA_FALSE(&rx0, &rx0, true, &x0, NULL, false);

        /* Checks that directory access rights don't impact file referring... */
        NMA_TRUE(&mx0, &mx0, false, &x0, NULL, false);
        /* ...but only directory referring. */
        NMA_FALSE(&mx0, &mx0, true, &x0, NULL, false);

        /* Checks directory exchange. */
        NMA_TRUE(&mx0, &mx0, true, &mx0, &mx0, true);
        NMA_TRUE(&mx0, &mx0, true, &mx0, &x0, true);
        NMA_FALSE(&mx0, &mx0, true, &x0, &mx0, true);
        NMA_FALSE(&mx0, &mx0, true, &x0, &x0, true);
        NMA_FALSE(&mx0, &mx0, true, &x1, &x1, true);

        /* Checks file exchange with directory access rights... */
        NMA_TRUE(&mx0, &mx0, false, &mx0, &mx0, false);
        NMA_TRUE(&mx0, &mx0, false, &mx0, &x0, false);
        NMA_TRUE(&mx0, &mx0, false, &x0, &mx0, false);
        NMA_TRUE(&mx0, &mx0, false, &x0, &x0, false);
        /* ...and with file access rights. */
        NMA_TRUE(&rx0, &rx0, false, &rx0, &rx0, false);
        NMA_TRUE(&rx0, &rx0, false, &rx0, &x0, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, &rx0, false);
        NMA_FALSE(&rx0, &rx0, false, &x0, &x0, false);
        NMA_FALSE(&rx0, &rx0, false, &x1, &x1, false);

        /*
         * Allowing the following requests should not be a security risk
         * because domain 0 denies execute access, and domain 1 is always
         * nested with domain 0.  However, adding an exception for this case
         * would mean to check all nested domains to make sure none can get
         * more privileges (e.g. processes only sandboxed by domain 0).
         * Moreover, this behavior (i.e. composition of N domains) could then
         * be inconsistent compared to domain 1's ruleset alone (e.g. it might
         * be denied to link/rename with domain 1's ruleset, whereas it would
         * be allowed if nested on top of domain 0).  Another drawback would be
         * to create a covert channel that could enable sandboxed processes to
         * infer most of the filesystem restrictions from their domain.  To
         * make it simple, efficient, safe, and more consistent, this case is
         * always denied.
         */
        NMA_FALSE(&x1, &x1, false, &x0, NULL, false);
        NMA_FALSE(&x1, &x1, false, &rx0, NULL, false);
        NMA_FALSE(&x1, &x1, true, &x0, NULL, false);
        NMA_FALSE(&x1, &x1, true, &rx0, NULL, false);

        /* Checks the same case of exclusive domains with a file... */
        NMA_TRUE(&x1, &x1, false, &x01, NULL, false);
        NMA_FALSE(&x1, &x1, false, &x01, &x0, false);
        NMA_FALSE(&x1, &x1, false, &x01, &x01, false);
        NMA_FALSE(&x1, &x1, false, &x0, &x0, false);
        /* ...and with a directory. */
        NMA_FALSE(&x1, &x1, false, &x0, &x0, true);
        NMA_FALSE(&x1, &x1, true, &x0, &x0, false);
        NMA_FALSE(&x1, &x1, true, &x0, &x0, true);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

#undef NMA_TRUE
#undef NMA_FALSE

/*
 * Returns true if @layer_masks only contains zeroed bytes, i.e. if no layer
 * bit is set for any access.
 */
static bool is_layer_masks_allowed(
        layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
        /* memchr_inv() points to the first byte differing from 0, if any. */
        const void *const first_set =
                memchr_inv(layer_masks, 0, sizeof(*layer_masks));

        return !first_set;
}

/*
 * Clears the @layer_masks entries of every access that is not part of
 * @access_request, then checks what remains.
 *
 * Returns true if the request is allowed (i.e. the scoped matrix is empty),
 * false otherwise.  A NULL @layer_masks triggers a warning and is reported as
 * allowed.
 */
static bool
scope_to_request(const access_mask_t access_request,
                 layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS])
{
        const unsigned long requested = access_request;
        unsigned long bit;

        if (WARN_ON_ONCE(!layer_masks))
                return true;

        for (bit = 0; bit < ARRAY_SIZE(*layer_masks); bit++) {
                /* Keeps the masks of requested accesses untouched. */
                if ((requested >> bit) & 1UL)
                        continue;

                (*layer_masks)[bit] = 0;
        }

        return is_layer_masks_allowed(layer_masks);
}

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/*
 * Scoping an empty matrix to an execute request must be allowed and must
 * leave the matrix empty.
 */
static void test_scope_to_request_with_exec_none(struct kunit *const test)
{
        /* Allows everything. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

        /* Checks and scopes with execute. */
        KUNIT_EXPECT_TRUE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
                                                 &layer_masks));
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

/*
 * Scoping to an execute request while execute is denied must be refused: the
 * execute mask is kept and the unrequested write mask is cleared.
 */
static void test_scope_to_request_with_exec_some(struct kunit *const test)
{
        /* Denies execute and write. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
        };

        /* Checks and scopes with execute. */
        KUNIT_EXPECT_FALSE(test, scope_to_request(LANDLOCK_ACCESS_FS_EXECUTE,
                                                  &layer_masks));
        KUNIT_EXPECT_EQ(test, BIT_ULL(0),
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

/*
 * Scoping to an empty request must clear every mask and therefore be
 * allowed, whatever the matrix initially denies.
 */
static void test_scope_to_request_without_access(struct kunit *const test)
{
        /* Denies execute and write. */
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)] = BIT_ULL(0),
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(1),
        };

        /* Checks and scopes without access request. */
        KUNIT_EXPECT_TRUE(test, scope_to_request(0, &layer_masks));
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_EXECUTE)]);
        KUNIT_EXPECT_EQ(test, 0,
                        layer_masks[BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)]);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

/*
 * Returns true if at least one access of @access_request, other than
 * LANDLOCK_ACCESS_FS_REFER, has a non-empty layer mask in @layer_masks.
 *
 * Returns false if @layer_masks is NULL.
 */
static bool
is_eacces(const layer_mask_t (*const layer_masks)[LANDLOCK_NUM_ACCESS_FS],
          const access_mask_t access_request)
{
        /* LANDLOCK_ACCESS_FS_REFER alone must return -EXDEV. */
        const unsigned long checked = access_request &
                                      ~LANDLOCK_ACCESS_FS_REFER;
        unsigned long bit;

        if (!layer_masks)
                return false;

        for (bit = 0; bit < ARRAY_SIZE(*layer_masks); bit++) {
                /* Only accesses that are part of the request matter. */
                if (!((checked >> bit) & 1UL))
                        continue;

                if ((*layer_masks)[bit])
                        return true;
        }

        return false;
}

/*
 * Shorthands expecting is_eacces() to return true or false.  They expand to
 * KUnit expectations and rely on a variable named "test" being in scope.
 */
#define IE_TRUE(...) KUNIT_EXPECT_TRUE(test, is_eacces(__VA_ARGS__))
#define IE_FALSE(...) KUNIT_EXPECT_FALSE(test, is_eacces(__VA_ARGS__))

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/* With an empty matrix, is_eacces() must return false for any request. */
static void test_is_eacces_with_none(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

/*
 * When only LANDLOCK_ACCESS_FS_REFER has a layer mask, is_eacces() must
 * return false for every request, including a refer request.
 */
static void test_is_eacces_with_refer(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_REFER)] = BIT_ULL(0),
        };

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

/*
 * When only LANDLOCK_ACCESS_FS_WRITE_FILE has a layer mask, is_eacces() must
 * return true for a write request only.
 */
static void test_is_eacces_with_write(struct kunit *const test)
{
        const layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {
                [BIT_INDEX(LANDLOCK_ACCESS_FS_WRITE_FILE)] = BIT_ULL(0),
        };

        IE_FALSE(&layer_masks, 0);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_REFER);
        IE_FALSE(&layer_masks, LANDLOCK_ACCESS_FS_EXECUTE);

        IE_TRUE(&layer_masks, LANDLOCK_ACCESS_FS_WRITE_FILE);
}

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */

#undef IE_TRUE
#undef IE_FALSE

/**
 * is_access_to_paths_allowed - Check accesses for requests with a common path
 *
 * @domain: Domain to check against.
 * @path: File hierarchy to walk through.
 * @access_request_parent1: Accesses to check, once @layer_masks_parent1 is
 *     equal to @layer_masks_parent2 (if any).  This is tied to the unique
 *     requested path for most actions, or the source in case of a refer action
 *     (i.e. rename or link), or the source and destination in case of
 *     RENAME_EXCHANGE.
 * @layer_masks_parent1: Pointer to a matrix of layer masks per access
 *     masks, identifying the layers that forbid a specific access.  Bits from
 *     this matrix can be unset according to the @path walk.  An empty matrix
 *     means that @domain allows all possible Landlock accesses (i.e. not only
 *     those identified by @access_request_parent1).  This matrix can
 *     initially refer to domain layer masks and, when the accesses for the
 *     destination and source are the same, to requested layer masks.
 * @log_request_parent1: Audit request to fill if the related access is denied.
 * @dentry_child1: Dentry to the initial child of the parent1 path.  This
 *     pointer must be NULL for non-refer actions (i.e. not link nor rename).
 * @access_request_parent2: Similar to @access_request_parent1 but for a
 *     request involving a source and a destination.  This refers to the
 *     destination, except in case of RENAME_EXCHANGE where it also refers to
 *     the source.  Must be set to 0 when using a simple path request.
 * @layer_masks_parent2: Similar to @layer_masks_parent1 but for a refer
 *     action.  This must be NULL otherwise.
 * @log_request_parent2: Audit request to fill if the related access is denied.
 * @dentry_child2: Dentry to the initial child of the parent2 path.  This
 *     pointer is only set for RENAME_EXCHANGE actions and must be NULL
 *     otherwise.
 *
 * This helper first checks that the destination has a superset of restrictions
 * compared to the source (if any) for a common path.  Because of
 * RENAME_EXCHANGE actions, source and destinations may be swapped.  It then
 * checks that the collected accesses and the remaining ones are enough to
 * allow the request.
 *
 * Returns:
 * - true if the access request is granted;
 * - false otherwise.
 */
static bool is_access_to_paths_allowed(
        const struct landlock_ruleset *const domain,
        const struct path *const path,
        const access_mask_t access_request_parent1,
        layer_mask_t (*const layer_masks_parent1)[LANDLOCK_NUM_ACCESS_FS],
        struct landlock_request *const log_request_parent1,
        struct dentry *const dentry_child1,
        const access_mask_t access_request_parent2,
        layer_mask_t (*const layer_masks_parent2)[LANDLOCK_NUM_ACCESS_FS],
        struct landlock_request *const log_request_parent2,
        struct dentry *const dentry_child2)
{
        bool allowed_parent1 = false, allowed_parent2 = false, is_dom_check,
             child1_is_directory = true, child2_is_directory = true;
        struct path walker_path;
        access_mask_t access_masked_parent1, access_masked_parent2;
        layer_mask_t _layer_masks_child1[LANDLOCK_NUM_ACCESS_FS],
                _layer_masks_child2[LANDLOCK_NUM_ACCESS_FS];
        layer_mask_t(*layer_masks_child1)[LANDLOCK_NUM_ACCESS_FS] = NULL,
        (*layer_masks_child2)[LANDLOCK_NUM_ACCESS_FS] = NULL;

        if (!access_request_parent1 && !access_request_parent2)
                return true;

        if (WARN_ON_ONCE(!path))
                return true;

        if (is_nouser_or_private(path->dentry))
                return true;

        if (WARN_ON_ONCE(!layer_masks_parent1))
                return false;

        allowed_parent1 = is_layer_masks_allowed(layer_masks_parent1);

        if (unlikely(layer_masks_parent2)) {
                if (WARN_ON_ONCE(!dentry_child1))
                        return false;

                allowed_parent2 = is_layer_masks_allowed(layer_masks_parent2);

                /*
                 * For a double request, first check for potential privilege
                 * escalation by looking at domain handled accesses (which are
                 * a superset of the meaningful requested accesses).
                 */
                access_masked_parent1 = access_masked_parent2 =
                        landlock_union_access_masks(domain).fs;
                is_dom_check = true;
        } else {
                if (WARN_ON_ONCE(dentry_child1 || dentry_child2))
                        return false;
                /* For a simple request, only check for requested accesses. */
                access_masked_parent1 = access_request_parent1;
                access_masked_parent2 = access_request_parent2;
                is_dom_check = false;
        }

        if (unlikely(dentry_child1)) {
                landlock_unmask_layers(
                        find_rule(domain, dentry_child1),
                        landlock_init_layer_masks(
                                domain, LANDLOCK_MASK_ACCESS_FS,
                                &_layer_masks_child1, LANDLOCK_KEY_INODE),
                        &_layer_masks_child1, ARRAY_SIZE(_layer_masks_child1));
                layer_masks_child1 = &_layer_masks_child1;
                child1_is_directory = d_is_dir(dentry_child1);
        }
        if (unlikely(dentry_child2)) {
                landlock_unmask_layers(
                        find_rule(domain, dentry_child2),
                        landlock_init_layer_masks(
                                domain, LANDLOCK_MASK_ACCESS_FS,
                                &_layer_masks_child2, LANDLOCK_KEY_INODE),
                        &_layer_masks_child2, ARRAY_SIZE(_layer_masks_child2));
                layer_masks_child2 = &_layer_masks_child2;
                child2_is_directory = d_is_dir(dentry_child2);
        }

        walker_path = *path;
        path_get(&walker_path);
        /*
         * We need to walk through all the hierarchy to not miss any relevant
         * restriction.
         */
        while (true) {
                struct dentry *parent_dentry;
                const struct landlock_rule *rule;

                /*
                 * If at least all accesses allowed on the destination are
                 * already allowed on the source, respectively if there is at
                 * least as much as restrictions on the destination than on the
                 * source, then we can safely refer files from the source to
                 * the destination without risking a privilege escalation.
                 * This also applies in the case of RENAME_EXCHANGE, which
                 * implies checks on both direction.  This is crucial for
                 * standalone multilayered security policies.  Furthermore,
                 * this helps avoid policy writers to shoot themselves in the
                 * foot.
                 */
                if (unlikely(is_dom_check &&
                             no_more_access(
                                     layer_masks_parent1, layer_masks_child1,
                                     child1_is_directory, layer_masks_parent2,
                                     layer_masks_child2,
                                     child2_is_directory))) {
                        /*
                         * Now, downgrades the remaining checks from domain
                         * handled accesses to requested accesses.
                         */
                        is_dom_check = false;
                        access_masked_parent1 = access_request_parent1;
                        access_masked_parent2 = access_request_parent2;

                        allowed_parent1 =
                                allowed_parent1 ||
                                scope_to_request(access_masked_parent1,
                                                 layer_masks_parent1);
                        allowed_parent2 =
                                allowed_parent2 ||
                                scope_to_request(access_masked_parent2,
                                                 layer_masks_parent2);

                        /* Stops when all accesses are granted. */
                        if (allowed_parent1 && allowed_parent2)
                                break;
                }

                rule = find_rule(domain, walker_path.dentry);
                allowed_parent1 = allowed_parent1 ||
                                  landlock_unmask_layers(
                                          rule, access_masked_parent1,
                                          layer_masks_parent1,
                                          ARRAY_SIZE(*layer_masks_parent1));
                allowed_parent2 = allowed_parent2 ||
                                  landlock_unmask_layers(
                                          rule, access_masked_parent2,
                                          layer_masks_parent2,
                                          ARRAY_SIZE(*layer_masks_parent2));

                /* Stops when a rule from each layer grants access. */
                if (allowed_parent1 && allowed_parent2)
                        break;
jump_up:
                if (walker_path.dentry == walker_path.mnt->mnt_root) {
                        if (follow_up(&walker_path)) {
                                /* Ignores hidden mount points. */
                                goto jump_up;
                        } else {
                                /*
                                 * Stops at the real root.  Denies access
                                 * because not all layers have granted access.
                                 */
                                break;
                        }
                }
                if (unlikely(IS_ROOT(walker_path.dentry))) {
                        /*
                         * Stops at disconnected root directories.  Only allows
                         * access to internal filesystems (e.g. nsfs, which is
                         * reachable through /proc/<pid>/ns/<namespace>).
                         */
                        if (walker_path.mnt->mnt_flags & MNT_INTERNAL) {
                                allowed_parent1 = true;
                                allowed_parent2 = true;
                        }
                        break;
                }
                parent_dentry = dget_parent(walker_path.dentry);
                dput(walker_path.dentry);
                walker_path.dentry = parent_dentry;
        }
        path_put(&walker_path);

        if (!allowed_parent1) {
                log_request_parent1->type = LANDLOCK_REQUEST_FS_ACCESS;
                log_request_parent1->audit.type = LSM_AUDIT_DATA_PATH;
                log_request_parent1->audit.u.path = *path;
                log_request_parent1->access = access_masked_parent1;
                log_request_parent1->layer_masks = layer_masks_parent1;
                log_request_parent1->layer_masks_size =
                        ARRAY_SIZE(*layer_masks_parent1);
        }

        if (!allowed_parent2) {
                log_request_parent2->type = LANDLOCK_REQUEST_FS_ACCESS;
                log_request_parent2->audit.type = LSM_AUDIT_DATA_PATH;
                log_request_parent2->audit.u.path = *path;
                log_request_parent2->access = access_masked_parent2;
                log_request_parent2->layer_masks = layer_masks_parent2;
                log_request_parent2->layer_masks_size =
                        ARRAY_SIZE(*layer_masks_parent2);
        }
        return allowed_parent1 && allowed_parent2;
}

/*
 * Checks @access_request on @path for the subject returned by
 * landlock_get_applicable_subject() for the current credentials.
 *
 * Returns 0 when there is no such subject or when
 * is_access_to_paths_allowed() grants the request; otherwise logs the denial
 * and returns -EACCES.
 */
static int current_check_access_path(const struct path *const path,
                                     access_mask_t access_request)
{
        const struct access_masks wanted = {
                .fs = access_request,
        };
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), wanted, NULL);
        layer_mask_t path_masks[LANDLOCK_NUM_ACCESS_FS] = {};
        struct landlock_request denial = {};

        if (!subject)
                return 0;

        /* The checked mask is the one landlock_init_layer_masks() returns. */
        access_request = landlock_init_layer_masks(subject->domain,
                                                   access_request, &path_masks,
                                                   LANDLOCK_KEY_INODE);
        if (!is_access_to_paths_allowed(subject->domain, path, access_request,
                                        &path_masks, &denial, NULL, 0, NULL,
                                        NULL, NULL)) {
                landlock_log_denial(subject, &denial);
                return -EACCES;
        }
        return 0;
}

/*
 * Translates the file-type bits of @mode (mode & S_IFMT) into the matching
 * LANDLOCK_ACCESS_FS_MAKE_* access right.
 */
static __attribute_const__ access_mask_t get_mode_access(const umode_t mode)
{
        const umode_t file_type = mode & S_IFMT;

        if (file_type == S_IFLNK)
                return LANDLOCK_ACCESS_FS_MAKE_SYM;
        if (file_type == S_IFDIR)
                return LANDLOCK_ACCESS_FS_MAKE_DIR;
        if (file_type == S_IFCHR)
                return LANDLOCK_ACCESS_FS_MAKE_CHAR;
        if (file_type == S_IFBLK)
                return LANDLOCK_ACCESS_FS_MAKE_BLOCK;
        if (file_type == S_IFIFO)
                return LANDLOCK_ACCESS_FS_MAKE_FIFO;
        if (file_type == S_IFSOCK)
                return LANDLOCK_ACCESS_FS_MAKE_SOCK;

        /*
         * Everything else is handled as a regular file: S_IFREG itself, a
         * zero type (which translates to S_IFREG), and any weird file type.
         */
        return LANDLOCK_ACCESS_FS_MAKE_REG;
}

/*
 * Returns the removal access right matching @dentry: 0 for a negative
 * dentry, LANDLOCK_ACCESS_FS_REMOVE_DIR for a directory, and
 * LANDLOCK_ACCESS_FS_REMOVE_FILE for anything else.
 */
static access_mask_t maybe_remove(const struct dentry *const dentry)
{
        if (d_is_negative(dentry))
                return 0;
        if (d_is_dir(dentry))
                return LANDLOCK_ACCESS_FS_REMOVE_DIR;
        return LANDLOCK_ACCESS_FS_REMOVE_FILE;
}

/**
 * collect_domain_accesses - Walk through a file path and collect accesses
 *
 * @domain: Domain to check against.
 * @mnt_root: Last directory to check.
 * @dir: Directory to start the walk from.
 * @layer_masks_dom: Where to store the collected accesses.
 *
 * Walks upward from @dir, one parent at a time, until @mnt_root has been
 * checked.  @mnt_root is the mount point that is the common ancestor of the
 * source and the destination of a renamed or linked file.  At each step, the
 * rule found for the visited directory is applied to @layer_masks_dom, which
 * therefore records the domain's allowed accesses along the way.
 *
 * This is similar to is_access_to_paths_allowed() but much simpler because it
 * only handles walking on the same mount point and only checks one set of
 * accesses.
 *
 * Returns:
 * - true if all the domain access rights are allowed for @dir, if any
 *   argument is NULL (after a one-time warning), or if
 *   is_nouser_or_private() holds for @dir;
 * - false if the walk reached @mnt_root, or unexpectedly reached another root
 *   (after a one-time warning), without all accesses being allowed.
 */
static bool collect_domain_accesses(
        const struct landlock_ruleset *const domain,
        const struct dentry *const mnt_root, struct dentry *dir,
        layer_mask_t (*const layer_masks_dom)[LANDLOCK_NUM_ACCESS_FS])
{
        unsigned long access_dom;
        bool all_allowed = false;

        if (WARN_ON_ONCE(!domain || !mnt_root || !dir || !layer_masks_dom))
                return true;
        if (is_nouser_or_private(dir))
                return true;

        access_dom = landlock_init_layer_masks(domain, LANDLOCK_MASK_ACCESS_FS,
                                               layer_masks_dom,
                                               LANDLOCK_KEY_INODE);

        /* A reference is held on whichever dentry @dir currently names. */
        dget(dir);
        for (;;) {
                struct dentry *upper;

                /*
                 * Stops as soon as all handled accesses are allowed by at
                 * least one rule in each layer.
                 */
                if (landlock_unmask_layers(find_rule(domain, dir), access_dom,
                                           layer_masks_dom,
                                           ARRAY_SIZE(*layer_masks_dom))) {
                        all_allowed = true;
                        break;
                }

                /* @mnt_root is the last directory to check. */
                if (dir == mnt_root)
                        break;
                /* We should not reach a root other than @mnt_root. */
                if (WARN_ON_ONCE(IS_ROOT(dir)))
                        break;

                /* Moves the held reference from @dir to its parent. */
                upper = dget_parent(dir);
                dput(dir);
                dir = upper;
        }
        dput(dir);
        return all_allowed;
}

/**
 * current_check_refer_path - Check if a rename or link action is allowed
 *
 * @old_dentry: File or directory requested to be moved or linked.
 * @new_dir: Destination parent directory.
 * @new_dentry: Destination file or directory.
 * @removable: Sets to true if it is a rename operation.
 * @exchange: Sets to true if it is a rename operation with RENAME_EXCHANGE.
 *
 * Because of its unprivileged constraints, Landlock relies on file hierarchies
 * (and not only inodes) to tie access rights to files.  Being able to link or
 * rename a file hierarchy brings some challenges.  Indeed, moving or linking a
 * file (i.e. creating a new reference to an inode) can have an impact on the
 * actions allowed for a set of files if it would change its parent directory
 * (i.e. reparenting).
 *
 * To avoid trivial access right bypasses, Landlock first checks if the file or
 * directory requested to be moved would gain new access rights inherited from
 * its new hierarchy.  Before returning any error, Landlock then checks that
 * the parent source hierarchy and the destination hierarchy would allow the
 * link or rename action.  If it is not the case, an error with EACCES is
 * returned to inform user space that there is no way to remove or create the
 * requested source file type.  If it should be allowed but the new inherited
 * access rights would be greater than the source access rights, then the
 * kernel returns an error with EXDEV.  Prioritizing EACCES over EXDEV enables
 * user space to abort the whole operation if there is no way to do it, or to
 * manually copy the source to the destination if this remains allowed, e.g.
 * because file creation is allowed on the destination directory but not direct
 * linking.
 *
 * To achieve this goal, the kernel needs to compare two file hierarchies: the
 * one identifying the source file or directory (including itself), and the
 * destination one.  This can be seen as a multilayer partial ordering problem.
 * The kernel walks through these paths and collects in a matrix the access
 * rights that are denied per layer.  These matrices are then compared to see
 * if the destination one has more (or the same) restrictions as the source
 * one.  If this is the case, the requested action will not return EXDEV, which
 * doesn't mean the action is allowed.  The parent hierarchy of the source
 * (i.e. parent directory), and the destination hierarchy must also be checked
 * to verify that they explicitly allow such action (i.e.  referencing,
 * creation and potentially removal rights).  The kernel implementation is then
 * required to rely on potentially four matrices of access rights: one for the
 * source file or directory (i.e. the child), a potentially other one for the
 * other source/destination (in case of RENAME_EXCHANGE), one for the source
 * parent hierarchy and a last one for the destination hierarchy.  These
 * ephemeral matrices take some space on the stack, which limits the number of
 * layers to a deemed reasonable number: 16.
 *
 * Returns:
 * - 0 if access is allowed;
 * - -EXDEV if @old_dentry would inherit new access rights from @new_dir;
 * - -EACCES if file removal or creation is denied.
 */
static int current_check_refer_path(struct dentry *const old_dentry,
                                    const struct path *const new_dir,
                                    struct dentry *const new_dentry,
                                    const bool removable, const bool exchange)
{
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), any_fs, NULL);
        bool allow_parent1, allow_parent2;
        access_mask_t access_request_parent1, access_request_parent2;
        struct path mnt_dir;
        struct dentry *old_parent;
        layer_mask_t layer_masks_parent1[LANDLOCK_NUM_ACCESS_FS] = {},
                     layer_masks_parent2[LANDLOCK_NUM_ACCESS_FS] = {};
        struct landlock_request request1 = {}, request2 = {};

        if (!subject)
                return 0;

        if (unlikely(d_is_negative(old_dentry)))
                return -ENOENT;
        if (exchange) {
                if (unlikely(d_is_negative(new_dentry)))
                        return -ENOENT;
                access_request_parent1 =
                        get_mode_access(d_backing_inode(new_dentry)->i_mode);
        } else {
                access_request_parent1 = 0;
        }
        access_request_parent2 =
                get_mode_access(d_backing_inode(old_dentry)->i_mode);
        if (removable) {
                access_request_parent1 |= maybe_remove(old_dentry);
                access_request_parent2 |= maybe_remove(new_dentry);
        }

        /* The mount points are the same for old and new paths, cf. EXDEV. */
        if (old_dentry->d_parent == new_dir->dentry) {
                /*
                 * The LANDLOCK_ACCESS_FS_REFER access right is not required
                 * for same-directory referer (i.e. no reparenting).
                 */
                access_request_parent1 = landlock_init_layer_masks(
                        subject->domain,
                        access_request_parent1 | access_request_parent2,
                        &layer_masks_parent1, LANDLOCK_KEY_INODE);
                if (is_access_to_paths_allowed(subject->domain, new_dir,
                                               access_request_parent1,
                                               &layer_masks_parent1, &request1,
                                               NULL, 0, NULL, NULL, NULL))
                        return 0;

                landlock_log_denial(subject, &request1);
                return -EACCES;
        }

        access_request_parent1 |= LANDLOCK_ACCESS_FS_REFER;
        access_request_parent2 |= LANDLOCK_ACCESS_FS_REFER;

        /* Saves the common mount point. */
        mnt_dir.mnt = new_dir->mnt;
        mnt_dir.dentry = new_dir->mnt->mnt_root;

        /*
         * old_dentry may be the root of the common mount point and
         * !IS_ROOT(old_dentry) at the same time (e.g. with open_tree() and
         * OPEN_TREE_CLONE).  We do not need to call dget(old_parent) because
         * we keep a reference to old_dentry.
         */
        old_parent = (old_dentry == mnt_dir.dentry) ? old_dentry :
                                                      old_dentry->d_parent;

        /* new_dir->dentry is equal to new_dentry->d_parent */
        allow_parent1 = collect_domain_accesses(subject->domain, mnt_dir.dentry,
                                                old_parent,
                                                &layer_masks_parent1);
        allow_parent2 = collect_domain_accesses(subject->domain, mnt_dir.dentry,
                                                new_dir->dentry,
                                                &layer_masks_parent2);

        if (allow_parent1 && allow_parent2)
                return 0;

        /*
         * To be able to compare source and destination domain access rights,
         * take into account the @old_dentry access rights aggregated with its
         * parent access rights.  This will be useful to compare with the
         * destination parent access rights.
         */
        if (is_access_to_paths_allowed(
                    subject->domain, &mnt_dir, access_request_parent1,
                    &layer_masks_parent1, &request1, old_dentry,
                    access_request_parent2, &layer_masks_parent2, &request2,
                    exchange ? new_dentry : NULL))
                return 0;

        if (request1.access) {
                request1.audit.u.path.dentry = old_parent;
                landlock_log_denial(subject, &request1);
        }
        if (request2.access) {
                request2.audit.u.path.dentry = new_dir->dentry;
                landlock_log_denial(subject, &request2);
        }

        /*
         * This prioritizes EACCES over EXDEV for all actions, including
         * renames with RENAME_EXCHANGE.
         */
        if (likely(is_eacces(&layer_masks_parent1, access_request_parent1) ||
                   is_eacces(&layer_masks_parent2, access_request_parent2)))
                return -EACCES;

        /*
         * Gracefully forbids reparenting if the destination directory
         * hierarchy is not a superset of restrictions of the source directory
         * hierarchy, or if LANDLOCK_ACCESS_FS_REFER is not allowed by the
         * source or the destination.
         */
        return -EXDEV;
}

/* Inode hooks */

/*
 * Sanity check on the inode security blob @inode_security: warns once if the
 * Landlock part of the blob still points to an object.
 */
static void hook_inode_free_security_rcu(void *inode_security)
{
        /* Landlock's data sits lbs_inode bytes into the blob. */
        struct landlock_inode_security *const landlock_sec =
                inode_security + landlock_blob_sizes.lbs_inode;

        /*
         * All inodes must already have been untied from their object by
         * release_inode() or hook_sb_delete().
         */
        WARN_ON_ONCE(landlock_sec->object);
}

/* Super-block hooks */

/*
 * Release the inodes used in a security policy.
 *
 * Walks the inode list of @sb and, for each referenced inode still tied to a
 * Landlock object, unties the two and drops the corresponding inode
 * reference.  Finally waits until landlock_superblock(sb)->inode_refs reads
 * zero.  Does nothing if Landlock is not initialized.
 *
 * Cf. fsnotify_unmount_inodes() and evict_inodes()
 */
static void hook_sb_delete(struct super_block *const sb)
{
        struct inode *inode, *prev_inode = NULL;

        if (!landlock_initialized)
                return;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                struct landlock_object *object;

                /* Only handles referenced inodes. */
                if (!atomic_read(&inode->i_count))
                        continue;

                /*
                 * Protects against concurrent modification of inode (e.g.
                 * from get_inode_object()).
                 */
                spin_lock(&inode->i_lock);
                /*
                 * Checks I_FREEING and I_WILL_FREE to protect against a race
                 * condition when release_inode() just called iput(), which
                 * could lead to a NULL dereference of inode->security or a
                 * second call to iput() for the same Landlock object.  Also
                 * checks I_NEW because such inode cannot be tied to an object.
                 */
                if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }

                rcu_read_lock();
                object = rcu_dereference(landlock_inode(inode)->object);
                if (!object) {
                        /* Not tied to any Landlock object: nothing to do. */
                        rcu_read_unlock();
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                /* Keeps a reference to this inode until the next loop walk. */
                __iget(inode);
                spin_unlock(&inode->i_lock);

                /*
                 * If there is no concurrent release_inode() ongoing, then we
                 * are in charge of calling iput() on this inode, otherwise we
                 * will just wait for it to finish.
                 */
                spin_lock(&object->lock);
                if (object->underobj == inode) {
                        object->underobj = NULL;
                        spin_unlock(&object->lock);
                        rcu_read_unlock();

                        /*
                         * Because object->underobj was not NULL,
                         * release_inode() and get_inode_object() guarantee
                         * that it is safe to reset
                         * landlock_inode(inode)->object while it is not NULL.
                         * It is therefore not necessary to lock inode->i_lock.
                         */
                        rcu_assign_pointer(landlock_inode(inode)->object, NULL);
                        /*
                         * At this point, we own the ihold() reference that was
                         * originally set up by get_inode_object() and the
                         * __iget() reference that we just set in this loop
                         * walk.  Therefore the following call to iput() will
                         * not sleep nor drop the inode because there are now
                         * at least two references to it.
                         */
                        iput(inode);
                } else {
                        /*
                         * The object no longer points back to this inode:
                         * only the locks are released here.
                         */
                        spin_unlock(&object->lock);
                        rcu_read_unlock();
                }

                if (prev_inode) {
                        /*
                         * At this point, we still own the __iget() reference
                         * that we just set in this loop walk.  Therefore we
                         * can drop the list lock and know that the inode won't
                         * disappear from under us until the next loop walk.
                         */
                        spin_unlock(&sb->s_inode_list_lock);
                        /*
                         * We can now actually put the inode reference from the
                         * previous loop walk, which is not needed anymore.
                         */
                        iput(prev_inode);
                        cond_resched();
                        spin_lock(&sb->s_inode_list_lock);
                }
                prev_inode = inode;
        }
        spin_unlock(&sb->s_inode_list_lock);

        /* Puts the inode reference from the last loop walk, if any. */
        if (prev_inode)
                iput(prev_inode);
        /* Waits for pending iput() in release_inode(). */
        wait_var_event(&landlock_superblock(sb)->inode_refs,
                       !atomic_long_read(&landlock_superblock(sb)->inode_refs));
}

static void
log_fs_change_topology_path(const struct landlock_cred_security *const subject,
                            size_t handle_layer, const struct path *const path)
{
        landlock_log_denial(subject, &(struct landlock_request) {
                .type = LANDLOCK_REQUEST_FS_CHANGE_TOPOLOGY,
                .audit = {
                        .type = LSM_AUDIT_DATA_PATH,
                        .u.path = *path,
                },
                .layer_plus_one = handle_layer + 1,
        });
}

static void log_fs_change_topology_dentry(
        const struct landlock_cred_security *const subject, size_t handle_layer,
        struct dentry *const dentry)
{
        landlock_log_denial(subject, &(struct landlock_request) {
                .type = LANDLOCK_REQUEST_FS_CHANGE_TOPOLOGY,
                .audit = {
                        .type = LSM_AUDIT_DATA_DENTRY,
                        .u.dentry = dentry,
                },
                .layer_plus_one = handle_layer + 1,
        });
}

/*
 * Because a Landlock security policy is defined according to the filesystem
 * topology (i.e. the mount namespace), changing it may grant access to files
 * not previously allowed.
 *
 * To make it simple, deny any filesystem topology modification by landlocked
 * processes.  Non-landlocked processes may still change the namespace of a
 * landlocked process, but this kind of threat must be handled by a system-wide
 * access-control security policy.
 *
 * This could be lifted in the future if Landlock can safely handle mount
 * namespace updates requested by a landlocked process.  Indeed, we could
 * update the current domain (which is currently read-only) by taking into
 * account the accesses of the source and the destination of a new mount point.
 * However, it would also require making all the child domains dynamically
 * inherit these new constraints.  Anyway, for backward compatibility reasons,
 * a dedicated user space option would be required (e.g. as a ruleset flag).
 */
static int hook_sb_mount(const char *const dev_name,
                         const struct path *const path, const char *const type,
                         const unsigned long flags, void *const data)
{
        size_t layer;
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), any_fs,
                                                &layer);

        /*
         * With an applicable subject, the request is always refused: the
         * denial is logged against @path and -EPERM is returned.  The other
         * arguments are not examined.
         */
        if (subject) {
                log_fs_change_topology_path(subject, layer, path);
                return -EPERM;
        }
        return 0;
}

/*
 * Returns 0 when no subject applies to the current credentials.  Otherwise
 * logs a topology-change denial against @to_path and returns -EPERM;
 * @from_path is not examined.
 */
static int hook_move_mount(const struct path *const from_path,
                           const struct path *const to_path)
{
        size_t layer;
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), any_fs,
                                                &layer);

        if (subject) {
                log_fs_change_topology_path(subject, layer, to_path);
                return -EPERM;
        }
        return 0;
}

/*
 * Removing a mount point may reveal a previously hidden file hierarchy, which
 * may then grant access to files, which may have previously been forbidden.
 */
static int hook_sb_umount(struct vfsmount *const mnt, const int flags)
{
        size_t layer;
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), any_fs,
                                                &layer);

        /*
         * With an applicable subject, the request is always refused: the
         * denial is logged against the root dentry of @mnt and -EPERM is
         * returned.  @flags is not examined.
         */
        if (subject) {
                log_fs_change_topology_dentry(subject, layer, mnt->mnt_root);
                return -EPERM;
        }
        return 0;
}

/*
 * Returns 0 when no subject applies to the current credentials.  Otherwise
 * logs a topology-change denial against the root dentry of @sb and returns
 * -EPERM; @mnt_opts is not examined.
 */
static int hook_sb_remount(struct super_block *const sb, void *const mnt_opts)
{
        size_t layer;
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), any_fs,
                                                &layer);

        if (subject) {
                log_fs_change_topology_dentry(subject, layer, sb->s_root);
                return -EPERM;
        }
        return 0;
}

/*
 * pivot_root(2), like mount(2), changes the current mount namespace.  It must
 * then be forbidden for a landlocked process.
 *
 * However, chroot(2) may be allowed because it only changes the relative root
 * directory of the current process.  Moreover, it can be used to restrict the
 * view of the filesystem.
 */
static int hook_sb_pivotroot(const struct path *const old_path,
                             const struct path *const new_path)
{
        size_t layer;
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(current_cred(), any_fs,
                                                &layer);

        /*
         * With an applicable subject, the request is always refused: the
         * denial is logged against @new_path and -EPERM is returned.
         * @old_path is not examined.
         */
        if (subject) {
                log_fs_change_topology_path(subject, layer, new_path);
                return -EPERM;
        }
        return 0;
}

/* Path hooks */

/*
 * Link hook: forwards to current_check_refer_path() with both the removable
 * and the exchange arguments set to false, and returns its result.
 */
static int hook_path_link(struct dentry *const old_dentry,
                          const struct path *const new_dir,
                          struct dentry *const new_dentry)
{
        int err;

        err = current_check_refer_path(old_dentry, new_dir, new_dentry, false,
                                       false);
        return err;
}

/*
 * Rename hook: forwards to current_check_refer_path() with removable set to
 * true and exchange set when @flags contains RENAME_EXCHANGE.
 *
 * @old_dir is not passed on: it refers to old_dentry->d_parent and
 * new_dir->mnt.
 */
static int hook_path_rename(const struct path *const old_dir,
                            struct dentry *const old_dentry,
                            const struct path *const new_dir,
                            struct dentry *const new_dentry,
                            const unsigned int flags)
{
        int err;

        err = current_check_refer_path(old_dentry, new_dir, new_dentry, true,
                                       (flags & RENAME_EXCHANGE) != 0);
        return err;
}

/*
 * Checks LANDLOCK_ACCESS_FS_MAKE_DIR on the parent directory @dir.
 * @dentry and @mode are not examined.
 */
static int hook_path_mkdir(const struct path *const dir,
                           struct dentry *const dentry, const umode_t mode)
{
        int err;

        err = current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_DIR);
        return err;
}

/*
 * Checks, on the parent directory @dir, the access right that
 * get_mode_access() derives from @mode.  @dentry and @dev are not examined.
 */
static int hook_path_mknod(const struct path *const dir,
                           struct dentry *const dentry, const umode_t mode,
                           const unsigned int dev)
{
        int err;

        err = current_check_access_path(dir, get_mode_access(mode));
        return err;
}

/*
 * Checks LANDLOCK_ACCESS_FS_MAKE_SYM on the parent directory @dir.
 * @dentry and @old_name are not examined.
 */
static int hook_path_symlink(const struct path *const dir,
                             struct dentry *const dentry,
                             const char *const old_name)
{
        int err;

        err = current_check_access_path(dir, LANDLOCK_ACCESS_FS_MAKE_SYM);
        return err;
}

/*
 * Checks LANDLOCK_ACCESS_FS_REMOVE_FILE on the parent directory @dir.
 * @dentry is not examined.
 */
static int hook_path_unlink(const struct path *const dir,
                            struct dentry *const dentry)
{
        int err;

        err = current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_FILE);
        return err;
}

/*
 * Checks LANDLOCK_ACCESS_FS_REMOVE_DIR on the parent directory @dir.
 * @dentry is not examined.
 */
static int hook_path_rmdir(const struct path *const dir,
                           struct dentry *const dentry)
{
        int err;

        err = current_check_access_path(dir, LANDLOCK_ACCESS_FS_REMOVE_DIR);
        return err;
}

/* Checks LANDLOCK_ACCESS_FS_TRUNCATE on @path itself. */
static int hook_path_truncate(const struct path *const path)
{
        int err;

        err = current_check_access_path(path, LANDLOCK_ACCESS_FS_TRUNCATE);
        return err;
}

/* File hooks */

/**
 * get_required_file_open_access - Get access needed to open a file
 *
 * @file: File being opened.
 *
 * Returns the access rights that are required for opening the given file,
 * depending on the file type and open mode:
 * - LANDLOCK_ACCESS_FS_READ_DIR alone for a directory opened with FMODE_READ;
 * - otherwise the union of LANDLOCK_ACCESS_FS_READ_FILE (FMODE_READ),
 *   LANDLOCK_ACCESS_FS_WRITE_FILE (FMODE_WRITE) and
 *   LANDLOCK_ACCESS_FS_EXECUTE (__FMODE_EXEC), which may be 0.
 */
static access_mask_t
get_required_file_open_access(const struct file *const file)
{
        access_mask_t access;

        if (!(file->f_mode & FMODE_READ)) {
                access = 0;
        } else if (S_ISDIR(file_inode(file)->i_mode)) {
                /* A directory can only be opened in read mode. */
                return LANDLOCK_ACCESS_FS_READ_DIR;
        } else {
                access = LANDLOCK_ACCESS_FS_READ_FILE;
        }

        if (file->f_mode & FMODE_WRITE)
                access |= LANDLOCK_ACCESS_FS_WRITE_FILE;

        /* __FMODE_EXEC is indeed part of f_flags, not f_mode. */
        if (file->f_flags & __FMODE_EXEC)
                access |= LANDLOCK_ACCESS_FS_EXECUTE;

        return access;
}

/*
 * Sets the allowed_access field of @file's Landlock data to the full
 * LANDLOCK_MASK_ACCESS_FS mask.  Always returns 0.
 */
static int hook_file_alloc_security(struct file *const file)
{
        /*
         * Grants all access rights, even if most of them are not checked later
         * on. It is more consistent.
         *
         * Notably, file descriptors for regular files can also be acquired
         * without going through the file_open hook, for example when using
         * memfd_create(2).
         */
        landlock_file(file)->allowed_access = LANDLOCK_MASK_ACCESS_FS;
        return 0;
}

/*
 * Returns true if the inode behind @file is a block or a character device,
 * according to its i_mode.
 */
static bool is_device(const struct file *const file)
{
        const struct inode *const node = file_inode(file);

        if (S_ISBLK(node->i_mode))
                return true;
        return S_ISCHR(node->i_mode);
}

/*
 * file_open hook: checks the rights required to open @file and records in the
 * file's Landlock blob which of the requested rights were granted at open
 * time.
 *
 * Returns 0 when no Landlock subject applies or when every right required by
 * the open mode is granted; otherwise logs the denial and returns -EACCES.
 */
static int hook_file_open(struct file *const file)
{
        layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_FS] = {};
        access_mask_t open_access_request, full_access_request, allowed_access,
                optional_access, handled_request;
        struct landlock_request request = {};
        const struct landlock_cred_security *const subject =
                landlock_get_applicable_subject(file->f_cred, any_fs, NULL);

        if (!subject)
                return 0;

        /*
         * get_required_file_open_access() may return 0 because a file can be
         * opened with O_PATH.  A future Landlock evolution will handle this
         * case.
         */
        open_access_request = get_required_file_open_access(file);

        /*
         * More rights than open() strictly needs are looked up, so that later
         * operations on the opened file can be authorized.
         */
        optional_access = LANDLOCK_ACCESS_FS_TRUNCATE;
        if (is_device(file))
                optional_access |= LANDLOCK_ACCESS_FS_IOCTL_DEV;

        full_access_request = open_access_request | optional_access;

        handled_request = landlock_init_layer_masks(subject->domain,
                                                    full_access_request,
                                                    &layer_masks,
                                                    LANDLOCK_KEY_INODE);
        if (!is_access_to_paths_allowed(subject->domain, &file->f_path,
                                        handled_request, &layer_masks,
                                        &request, NULL, 0, NULL, NULL, NULL)) {
                const unsigned long requested_bits = full_access_request;
                unsigned long bit;

                /*
                 * Derive the granted rights from layer_masks: a requested
                 * right is granted only if no layer vetoed it.
                 */
                allowed_access = 0;
                for_each_set_bit(bit, &requested_bits,
                                 ARRAY_SIZE(layer_masks)) {
                        if (layer_masks[bit])
                                continue;
                        allowed_access |= BIT_ULL(bit);
                }
        } else {
                allowed_access = full_access_request;
        }

        /*
         * Operations on an already opened file (i.e. ftruncate()) are decided
         * by the rights held at open() time, hence the relevant subset of
         * access rights is stored in the opened struct file.
         */
        landlock_file(file)->allowed_access = allowed_access;
#ifdef CONFIG_AUDIT
        landlock_file(file)->deny_masks = landlock_get_deny_masks(
                _LANDLOCK_ACCESS_FS_OPTIONAL, optional_access, &layer_masks,
                ARRAY_SIZE(layer_masks));
#endif /* CONFIG_AUDIT */

        if (open_access_request & ~allowed_access) {
                /* Report only what open() itself actually required. */
                request.access = open_access_request;
                landlock_log_denial(subject, &request);
                return -EACCES;
        }

        return 0;
}

static int hook_file_truncate(struct file *const file)
{
        /*
         * Allows truncation if the truncate right was available at the time of
         * opening the file, to get a consistent access check as for read, write
         * and execute operations.
         *
         * Note: For checks done based on the file's Landlock allowed access, we
         * enforce them independently of whether the current thread is in a
         * Landlock domain, so that open files passed between independent
         * processes retain their behaviour.
         */
        if (landlock_file(file)->allowed_access & LANDLOCK_ACCESS_FS_TRUNCATE)
                return 0;

        landlock_log_denial(landlock_cred(file->f_cred), &(struct landlock_request) {
                .type = LANDLOCK_REQUEST_FS_ACCESS,
                .audit = {
                        .type = LSM_AUDIT_DATA_FILE,
                        .u.file = file,
                },
                .all_existing_optional_access = _LANDLOCK_ACCESS_FS_OPTIONAL,
                .access = LANDLOCK_ACCESS_FS_TRUNCATE,
#ifdef CONFIG_AUDIT
                .deny_masks = landlock_file(file)->deny_masks,
#endif /* CONFIG_AUDIT */
        });
        return -EACCES;
}

static int hook_file_ioctl_common(const struct file *const file,
                                  const unsigned int cmd, const bool is_compat)
{
        access_mask_t allowed_access = landlock_file(file)->allowed_access;

        /*
         * It is the access rights at the time of opening the file which
         * determine whether IOCTL can be used on the opened file later.
         *
         * The access right is attached to the opened file in hook_file_open().
         */
        if (allowed_access & LANDLOCK_ACCESS_FS_IOCTL_DEV)
                return 0;

        if (!is_device(file))
                return 0;

        if (unlikely(is_compat) ? is_masked_device_ioctl_compat(cmd) :
                                  is_masked_device_ioctl(cmd))
                return 0;

        landlock_log_denial(landlock_cred(file->f_cred), &(struct landlock_request) {
                .type = LANDLOCK_REQUEST_FS_ACCESS,
                .audit = {
                        .type = LSM_AUDIT_DATA_IOCTL_OP,
                        .u.op = &(struct lsm_ioctlop_audit) {
                                .path = file->f_path,
                                .cmd = cmd,
                        },
                },
                .all_existing_optional_access = _LANDLOCK_ACCESS_FS_OPTIONAL,
                .access = LANDLOCK_ACCESS_FS_IOCTL_DEV,
#ifdef CONFIG_AUDIT
                .deny_masks = landlock_file(file)->deny_masks,
#endif /* CONFIG_AUDIT */
        });
        return -EACCES;
}

static int hook_file_ioctl(struct file *file, unsigned int cmd,
                           unsigned long arg)
{
        return hook_file_ioctl_common(file, cmd, false);
}

static int hook_file_ioctl_compat(struct file *file, unsigned int cmd,
                                  unsigned long arg)
{
        return hook_file_ioctl_common(file, cmd, true);
}

/*
 * Returns false when the task designated by @fown belongs to the same thread
 * group as current, so that sending signals between threads of one process is
 * always allowed (consistent with hook_task_kill()).  Returns true otherwise,
 * including when no task is found for @fown.
 */
static bool control_current_fowner(struct fown_struct *const fown)
{
        struct task_struct *owner;

        /*
         * __f_setown() already holds this lock, see commit 26f204380a3c
         * ("fs: Fix file_set_fowner LSM hook inconsistencies").
         */
        lockdep_assert_held(&fown->lock);

        /*
         * Not every caller (e.g. fcntl_dirnotify) is guaranteed to be in an
         * RCU read-side critical section.
         */
        guard(rcu)();
        owner = pid_task(fown->pid, fown->pid_type);

        return !owner || !same_thread_group(owner, current);
}

/*
 * file_set_fowner hook: stores in @file's Landlock blob the current
 * credential's subject scoped by LANDLOCK_SCOPE_SIGNAL (or an empty subject
 * when none applies or when control_current_fowner() returns false), then
 * releases the previously stored domain.
 */
static void hook_file_set_fowner(struct file *file)
{
        struct landlock_cred_security fown_subject = {};
        struct landlock_ruleset *old_domain;
        size_t fown_layer = 0;

        if (control_current_fowner(file_f_owner(file))) {
                static const struct access_masks signal_scope = {
                        .scope = LANDLOCK_SCOPE_SIGNAL,
                };
                const struct landlock_cred_security *const scoped =
                        landlock_get_applicable_subject(
                                current_cred(), signal_scope, &fown_layer);

                if (scoped) {
                        /*
                         * The copy kept in the file gets its own ruleset
                         * reference, dropped with
                         * landlock_put_ruleset_deferred().
                         */
                        fown_subject = *scoped;
                        landlock_get_ruleset(fown_subject.domain);
                }
        }

        old_domain = landlock_file(file)->fown_subject.domain;
        landlock_file(file)->fown_subject = fown_subject;
#ifdef CONFIG_AUDIT
        landlock_file(file)->fown_layer = fown_layer;
#endif /* CONFIG_AUDIT */

        /* This may run inside an RCU read-side critical section. */
        landlock_put_ruleset_deferred(old_domain);
}

/*
 * file_free_security hook: releases the domain stored in the file's
 * fown_subject through landlock_put_ruleset_deferred().
 */
static void hook_file_free_security(struct file *file)
{
        struct landlock_ruleset *const fown_domain =
                landlock_file(file)->fown_subject.domain;

        landlock_put_ruleset_deferred(fown_domain);
}

/*
 * Table binding each LSM hook name to its Landlock filesystem handler.  It is
 * handed to security_add_hooks() by landlock_add_fs_hooks().
 */
static struct security_hook_list landlock_hooks[] __ro_after_init = {
        /* Inode hooks */
        LSM_HOOK_INIT(inode_free_security_rcu, hook_inode_free_security_rcu),

        /* Super-block and mount hooks */
        LSM_HOOK_INIT(sb_delete, hook_sb_delete),
        LSM_HOOK_INIT(sb_mount, hook_sb_mount),
        LSM_HOOK_INIT(move_mount, hook_move_mount),
        LSM_HOOK_INIT(sb_umount, hook_sb_umount),
        LSM_HOOK_INIT(sb_remount, hook_sb_remount),
        LSM_HOOK_INIT(sb_pivotroot, hook_sb_pivotroot),

        /* Path hooks */
        LSM_HOOK_INIT(path_link, hook_path_link),
        LSM_HOOK_INIT(path_rename, hook_path_rename),
        LSM_HOOK_INIT(path_mkdir, hook_path_mkdir),
        LSM_HOOK_INIT(path_mknod, hook_path_mknod),
        LSM_HOOK_INIT(path_symlink, hook_path_symlink),
        LSM_HOOK_INIT(path_unlink, hook_path_unlink),
        LSM_HOOK_INIT(path_rmdir, hook_path_rmdir),
        LSM_HOOK_INIT(path_truncate, hook_path_truncate),

        /* File hooks */
        LSM_HOOK_INIT(file_alloc_security, hook_file_alloc_security),
        LSM_HOOK_INIT(file_open, hook_file_open),
        LSM_HOOK_INIT(file_truncate, hook_file_truncate),
        LSM_HOOK_INIT(file_ioctl, hook_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, hook_file_ioctl_compat),
        LSM_HOOK_INIT(file_set_fowner, hook_file_set_fowner),
        LSM_HOOK_INIT(file_free_security, hook_file_free_security),
};

/* Registers every entry of landlock_hooks[] under the Landlock LSM id. */
__init void landlock_add_fs_hooks(void)
{
        const int hook_count = ARRAY_SIZE(landlock_hooks);

        security_add_hooks(landlock_hooks, hook_count, &landlock_lsmid);
}

#ifdef CONFIG_SECURITY_LANDLOCK_KUNIT_TEST

/* clang-format off */
static struct kunit_case test_cases[] = {
        KUNIT_CASE(test_no_more_access),
        KUNIT_CASE(test_scope_to_request_with_exec_none),
        KUNIT_CASE(test_scope_to_request_with_exec_some),
        KUNIT_CASE(test_scope_to_request_without_access),
        KUNIT_CASE(test_is_eacces_with_none),
        KUNIT_CASE(test_is_eacces_with_refer),
        KUNIT_CASE(test_is_eacces_with_write),
        {}
};
/* clang-format on */

static struct kunit_suite test_suite = {
        .name = "landlock_fs",
        .test_cases = test_cases,
};

kunit_test_suite(test_suite);

#endif /* CONFIG_SECURITY_LANDLOCK_KUNIT_TEST */


















































































  157 












  155 







  156 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_KERNEL_VTIME_H
#define _LINUX_KERNEL_VTIME_H

#include <linux/context_tracking_state.h>
#include <linux/sched.h>

/*
 * Common vtime APIs
 */
/* Declared only when CONFIG_VIRT_CPU_ACCOUNTING is set; no stubs otherwise. */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
extern void vtime_account_kernel(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */

/*
 * User/guest transition and idle-init entry points: real functions with
 * CONFIG_VIRT_CPU_ACCOUNTING_GEN, empty inline stubs otherwise.
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern void vtime_user_enter(struct task_struct *tsk);
extern void vtime_user_exit(struct task_struct *tsk);
extern void vtime_guest_enter(struct task_struct *tsk);
extern void vtime_guest_exit(struct task_struct *tsk);
extern void vtime_init_idle(struct task_struct *tsk, int cpu);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */
static inline void vtime_user_enter(struct task_struct *tsk) { }
static inline void vtime_user_exit(struct task_struct *tsk) { }
static inline void vtime_guest_enter(struct task_struct *tsk) { }
static inline void vtime_guest_exit(struct task_struct *tsk) { }
static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

/*
 * IRQ accounting and flush entry points: real functions with
 * CONFIG_VIRT_CPU_ACCOUNTING_NATIVE, empty inline stubs otherwise.
 */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset);
extern void vtime_account_softirq(struct task_struct *tsk);
extern void vtime_account_hardirq(struct task_struct *tsk);
extern void vtime_flush(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
static inline void vtime_account_softirq(struct task_struct *tsk) { }
static inline void vtime_account_hardirq(struct task_struct *tsk) { }
static inline void vtime_flush(struct task_struct *tsk) { }
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

/*
 * vtime_accounting_enabled_this_cpu() definitions/declarations
 */
#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)

/* Native accounting: unconditionally reported as enabled on this CPU. */
static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
/* Native accounting: task switch handling is provided out of line. */
extern void vtime_task_switch(struct task_struct *prev);

static __always_inline void vtime_account_guest_enter(void)
{
        vtime_account_kernel(current);
        current->flags |= PF_VCPU;
}

static __always_inline void vtime_account_guest_exit(void)
{
        vtime_account_kernel(current);
        current->flags &= ~PF_VCPU;
}

#elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)

/*
 * Tells whether vtime is enabled on some CPU, in which case cputime readers
 * want to be careful and compute the tickless cputime.
 * The vtime state is currently tied to context tracking; the two may be
 * decoupled later if that becomes necessary.
 */
static inline bool vtime_accounting_enabled(void)
{
        const bool tracking = context_tracking_enabled();

        return tracking;
}

/* Returns what context_tracking_enabled_cpu() reports for @cpu. */
static inline bool vtime_accounting_enabled_cpu(int cpu)
{
        const bool tracking = context_tracking_enabled_cpu(cpu);

        return tracking;
}

/* Returns what context_tracking_enabled_this_cpu() reports. */
static inline bool vtime_accounting_enabled_this_cpu(void)
{
        const bool tracking = context_tracking_enabled_this_cpu();

        return tracking;
}

extern void vtime_task_switch_generic(struct task_struct *prev);

/*
 * Generic variant: forwards @prev to vtime_task_switch_generic() only when
 * vtime accounting is enabled on this CPU.
 */
static inline void vtime_task_switch(struct task_struct *prev)
{
        if (!vtime_accounting_enabled_this_cpu())
                return;

        vtime_task_switch_generic(prev);
}

/*
 * Generic variant: when vtime accounting is enabled on this CPU, defers to
 * vtime_guest_enter(); otherwise only sets PF_VCPU in current's flags.
 */
static __always_inline void vtime_account_guest_enter(void)
{
        if (!vtime_accounting_enabled_this_cpu()) {
                current->flags |= PF_VCPU;
                return;
        }

        vtime_guest_enter(current);
}

/*
 * Generic variant: when vtime accounting is enabled on this CPU, defers to
 * vtime_guest_exit(); otherwise only clears PF_VCPU in current's flags.
 */
static __always_inline void vtime_account_guest_exit(void)
{
        if (!vtime_accounting_enabled_this_cpu()) {
                current->flags &= ~PF_VCPU;
                return;
        }

        vtime_guest_exit(current);
}

#else /* !CONFIG_VIRT_CPU_ACCOUNTING */

/* Fallback branch: vtime accounting is never reported as enabled. */
static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
/* Fallback branch: nothing to do on task switch. */
static inline void vtime_task_switch(struct task_struct *prev) { }

static __always_inline void vtime_account_guest_enter(void)
{
        current->flags |= PF_VCPU;
}

static __always_inline void vtime_account_guest_exit(void)
{
        current->flags &= ~PF_VCPU;
}

#endif


/*
 * irqtime_account_irq() is a real function with CONFIG_IRQ_TIME_ACCOUNTING
 * and an empty inline stub otherwise.
 */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset);
#else /* !CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { }
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

/*
 * Passes @tsk with SOFTIRQ_OFFSET to vtime_account_irq() and then to
 * irqtime_account_irq().
 */
static inline void account_softirq_enter(struct task_struct *tsk)
{
        const unsigned int offset = SOFTIRQ_OFFSET;

        vtime_account_irq(tsk, offset);
        irqtime_account_irq(tsk, offset);
}

/*
 * Calls vtime_account_softirq() for @tsk, then irqtime_account_irq() with a
 * zero offset.
 */
static inline void account_softirq_exit(struct task_struct *tsk)
{
        const unsigned int no_offset = 0;

        vtime_account_softirq(tsk);
        irqtime_account_irq(tsk, no_offset);
}

/*
 * Passes @tsk with HARDIRQ_OFFSET to vtime_account_irq() and then to
 * irqtime_account_irq().
 */
static inline void account_hardirq_enter(struct task_struct *tsk)
{
        const unsigned int offset = HARDIRQ_OFFSET;

        vtime_account_irq(tsk, offset);
        irqtime_account_irq(tsk, offset);
}

/*
 * Calls vtime_account_hardirq() for @tsk, then irqtime_account_irq() with a
 * zero offset.
 */
static inline void account_hardirq_exit(struct task_struct *tsk)
{
        const unsigned int no_offset = 0;

        vtime_account_hardirq(tsk);
        irqtime_account_irq(tsk, no_offset);
}

#endif /* _LINUX_KERNEL_VTIME_H */











    1 

















   10 








































   23 














   21 

































    4 

























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* SPDX-License-Identifier: GPL-2.0 */
#if !defined(_TRACE_HANDLE_EXIT_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_HANDLE_EXIT_ARM64_KVM_H

#include <linux/tracepoint.h>
#include "sys_regs.h"

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm

/*
 * Records the PC at which the guest executed a wfe/wfi instruction;
 * @is_wfe selects which of the two is printed.
 */
TRACE_EVENT(kvm_wfx_arm64,
        TP_PROTO(unsigned long vcpu_pc, bool is_wfe),
        TP_ARGS(vcpu_pc, is_wfe),

        TP_STRUCT__entry(
                __field(unsigned long,        vcpu_pc)
                __field(bool,                is_wfe)
        ),

        TP_fast_assign(
                __entry->vcpu_pc = vcpu_pc;
                __entry->is_wfe  = is_wfe;
        ),

        TP_printk("guest executed wf%c at: 0x%016lx",
                  __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc)
);

/* Records an HVC: the PC it was issued at, the r0 value and the immediate. */
TRACE_EVENT(kvm_hvc_arm64,
        TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm),
        TP_ARGS(vcpu_pc, r0, imm),

        TP_STRUCT__entry(
                __field(unsigned long, vcpu_pc)
                __field(unsigned long, r0)
                __field(unsigned long, imm)
        ),

        TP_fast_assign(
                __entry->vcpu_pc = vcpu_pc;
                __entry->r0 = r0;
                __entry->imm = imm;
        ),

        TP_printk("HVC at 0x%016lx (r0: 0x%016lx, imm: 0x%lx)",
                  __entry->vcpu_pc, __entry->r0, __entry->imm)
);

/*
 * Records a named value as "<name>: 0x<value>".
 *
 * The dreg32 name is a leftover from a distant past. This will really
 * output a 64bit value...
 *
 * Only the @name pointer is stored in the entry, not a copy of the string.
 */
TRACE_EVENT(kvm_arm_set_dreg32,
        TP_PROTO(const char *name, __u64 value),
        TP_ARGS(name, value),

        TP_STRUCT__entry(
                __field(const char *, name)
                __field(__u64, value)
        ),

        TP_fast_assign(
                __entry->name = name;
                __entry->value = value;
        ),

        TP_printk("%s: 0x%llx", __entry->name, __entry->value)
);

/* Records the @hsr value passed in, printed as "HSR 0x%08lx". */
TRACE_EVENT(kvm_handle_sys_reg,
        TP_PROTO(unsigned long hsr),
        TP_ARGS(hsr),

        TP_STRUCT__entry(
                __field(unsigned long,        hsr)
        ),

        TP_fast_assign(
                __entry->hsr = hsr;
        ),

        TP_printk("HSR 0x%08lx", __entry->hsr)
);

/*
 * Records a system register access: the PC, the register name and encoding
 * (Op0, Op1, CRn, CRm, Op2) taken from @reg, and the direction taken from
 * @params->is_write.  A NULL register name is printed as "UNKN".
 *
 * Only the @reg->name pointer is stored in the entry, not a copy of the
 * string.
 */
TRACE_EVENT(kvm_sys_access,
        TP_PROTO(unsigned long vcpu_pc, struct sys_reg_params *params, const struct sys_reg_desc *reg),
        TP_ARGS(vcpu_pc, params, reg),

        TP_STRUCT__entry(
                __field(unsigned long,                        vcpu_pc)
                __field(bool,                                is_write)
                __field(const char *,                        name)
                __field(u8,                                Op0)
                __field(u8,                                Op1)
                __field(u8,                                CRn)
                __field(u8,                                CRm)
                __field(u8,                                Op2)
        ),

        TP_fast_assign(
                __entry->vcpu_pc = vcpu_pc;
                __entry->is_write = params->is_write;
                __entry->name = reg->name;
                __entry->Op0 = reg->Op0;
                __entry->Op1 = reg->Op1;
                __entry->CRn = reg->CRn;
                __entry->CRm = reg->CRm;
                __entry->Op2 = reg->Op2;
        ),

        TP_printk("PC: %lx %s (%d,%d,%d,%d,%d) %s",
                  __entry->vcpu_pc, __entry->name ?: "UNKN",
                  __entry->Op0, __entry->Op1, __entry->CRn,
                  __entry->CRm, __entry->Op2,
                  __entry->is_write ? "write" : "read")
);

/* Records the vcpu pointer and the @guest_debug flags value passed in. */
TRACE_EVENT(kvm_set_guest_debug,
        TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
        TP_ARGS(vcpu, guest_debug),

        TP_STRUCT__entry(
                __field(struct kvm_vcpu *, vcpu)
                __field(__u32, guest_debug)
        ),

        TP_fast_assign(
                __entry->vcpu = vcpu;
                __entry->guest_debug = guest_debug;
        ),

        TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
);

#endif /* _TRACE_HANDLE_EXIT_ARM64_KVM_H */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace_handle_exit

/* This part must be outside protection */
#include <trace/define_trace.h>






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 






    3 




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux IPv6 multicast routing support for BSD pim6sd
 *        Based on net/ipv4/ipmr.c.
 *
 *        (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr>
 *                LSIIT Laboratory, Strasbourg, France
 *        (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com>
 *                6WIND, Paris, France
 *        Copyright (C)2007,2008 USAGI/WIDE Project
 *                YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/rhashtable.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>

#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <linux/mroute6.h>
#include <linux/pim.h>
#include <net/addrconf.h>
#include <linux/netfilter_ipv6.h>
#include <linux/export.h>
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
#include <net/ip_tunnels.h>

#include <linux/nospec.h>

/* Rule object for the IPv6 multicast-routing FIB rules family. It wraps the
 * generic fib_rule and adds no family-specific members.
 */
struct ip6mr_rule {
        struct fib_rule                common;
};

/* Output slot of a rule lookup. In the multiple-tables build,
 * ip6mr_rule_action() stores the selected table here and
 * ip6mr_fib_lookup() hands it back to its caller.
 */
struct ip6mr_result {
        struct mr_table        *mrt;
};

/* Big lock, protecting the vif table, the mrt cache and the mroute socket
 * state. Note that changes are additionally serialized via rtnl_lock.
 */

static DEFINE_SPINLOCK(mrt_lock);

/* Fetch the net_device currently bound to @vif through rcu_dereference().
 * Returns whatever pointer is stored there, which may be NULL.
 */
static struct net_device *vif_dev_read(const struct vif_device *vif)
{
        struct net_device *dev = rcu_dereference(vif->dev);

        return dev;
}

/* Multicast router control variables */

/* Dedicated spinlock protecting the queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and is protected
   by the weak lock mrt_lock. The queue of unresolved entries is
   protected by the strong spinlock mfc_unres_lock.

   With this scheme the data path is entirely free of exclusive locks.
 */

/* Slab cache that mfc6_cache entries are returned to
 * (see ip6mr_cache_free_rcu()).
 */
static struct kmem_cache *mrt_cachep __read_mostly;

/* Forward declarations of file-local helpers that are referenced before
 * their definitions.
 */
static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
static void ip6mr_free_table(struct mr_table *mrt);

static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
                           struct net_device *dev, struct sk_buff *skb,
                           struct mfc6_cache *cache);
static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
                              mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
                              int cmd);
static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
                               struct netlink_callback *cb);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);

#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
/* Walk every mr_table linked on net->ipv6.mr6_tables. The trailing
 * expression is handed to list_for_each_entry_rcu() as its optional
 * condition argument: RTNL held, or the table list being empty.
 */
#define ip6mr_for_each_table(mrt, net) \
        list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
                                lockdep_rtnl_is_held() || \
                                list_empty(&net->ipv6.mr6_tables))

static bool ip6mr_can_free_table(struct net *net)
{
        return !check_net(net) || !net_initialized(net);
}

/* Table iterator: with @mrt == NULL return the first table on the per-netns
 * list, otherwise the one following @mrt. Returns NULL once the walk wraps
 * back to the list head.
 */
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
                                            struct mr_table *mrt)
{
        struct mr_table *next;

        if (mrt)
                next = list_entry_rcu(mrt->list.next,
                                      struct mr_table, list);
        else
                next = list_entry_rcu(net->ipv6.mr6_tables.next,
                                      struct mr_table, list);

        /* Reaching the head again means there are no more tables */
        return &next->list == &net->ipv6.mr6_tables ? NULL : next;
}

/* Look up the table whose id equals @id among the tables of @net.
 * Returns the table, or NULL when none matches. The walk itself takes no
 * lock; see ip6mr_for_each_table() for the expected context.
 */
static struct mr_table *__ip6mr_get_table(struct net *net, u32 id)
{
        struct mr_table *found = NULL;
        struct mr_table *mrt;

        ip6mr_for_each_table(mrt, net) {
                if (mrt->id != id)
                        continue;
                found = mrt;
                break;
        }
        return found;
}

/* RCU-protected wrapper around __ip6mr_get_table(). The read-side section
 * covers only the lookup; the pointer is returned after it has been left.
 */
static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
        struct mr_table *tbl;

        rcu_read_lock();
        tbl = __ip6mr_get_table(net, id);
        rcu_read_unlock();

        return tbl;
}

/* Resolve the multicast routing table for flow @flp6 by running the IP6MR
 * rules. On success stores the table in *@mrt and returns 0; otherwise
 * returns the negative error from fib_rules_lookup() and leaves *@mrt alone.
 */
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
                            struct mr_table **mrt)
{
        struct ip6mr_result res;
        struct fib_lookup_arg arg = {
                .result = &res,
                .flags = FIB_LOOKUP_NOREF,
        };
        struct flowi *fl = flowi6_to_flowi(flp6);
        int err;

        /* update flow if oif or iif point to device enslaved to l3mdev */
        l3mdev_update_flow(net, fl);

        err = fib_rules_lookup(net->ipv6.mr6_rules_ops, fl, 0, &arg);
        if (err < 0)
                return err;

        *mrt = res.mrt;
        return 0;
}

/* fib_rules action callback. Only FR_ACT_TO_TBL proceeds to a table lookup;
 * FR_ACT_UNREACHABLE maps to -ENETUNREACH, FR_ACT_PROHIBIT to -EACCES and
 * every other action to -EINVAL. For a to-table rule the table id is
 * stored in arg->table and the table in the ip6mr_result behind
 * arg->result; -EAGAIN is returned when no such table exists.
 */
static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
                             int flags, struct fib_lookup_arg *arg)
{
        struct ip6mr_result *res = arg->result;
        struct mr_table *mrt;

        if (rule->action == FR_ACT_UNREACHABLE)
                return -ENETUNREACH;
        if (rule->action == FR_ACT_PROHIBIT)
                return -EACCES;
        if (rule->action != FR_ACT_TO_TBL)
                return -EINVAL;

        arg->table = fib_rule_get_table(rule, arg);

        mrt = __ip6mr_get_table(rule->fr_net, arg->table);
        if (mrt) {
                res->mrt = mrt;
                return 0;
        }
        return -EAGAIN;
}

/* fib_rules match callback: unconditionally returns 1, ignoring the flow
 * and the flags.
 */
static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
{
        return 1;
}

/* fib_rules configure callback: reads nothing from the netlink attributes
 * and always returns 0.
 */
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                                struct fib_rule_hdr *frh, struct nlattr **tb,
                                struct netlink_ext_ack *extack)
{
        return 0;
}

/* fib_rules compare callback: always returns 1 without inspecting the
 * rule, header or attributes.
 */
static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                              struct nlattr **tb)
{
        return 1;
}

/* fib_rules fill callback: zeroes the prefix lengths and the tos field of
 * the rule header @frh. Always returns 0.
 */
static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                           struct fib_rule_hdr *frh)
{
        frh->tos = 0;
        frh->src_len = 0;
        frh->dst_len = 0;

        return 0;
}

/* Template handed to fib_rules_register() by ip6mr_rules_init(); wires the
 * ip6mr_rule_* callbacks above into the RTNL_FAMILY_IP6MR rules family.
 */
static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
        .family                = RTNL_FAMILY_IP6MR,
        .rule_size        = sizeof(struct ip6mr_rule),
        .addr_size        = sizeof(struct in6_addr),
        .action                = ip6mr_rule_action,
        .match                = ip6mr_rule_match,
        .configure        = ip6mr_rule_configure,
        .compare        = ip6mr_rule_compare,
        .fill                = ip6mr_rule_fill,
        .nlgroup        = RTNLGRP_IPV6_RULE,
        .owner                = THIS_MODULE,
};

/* Per-netns setup for the multiple-tables build: register the IP6MR rule
 * ops, create the default table and add the default rule pointing at it.
 * Returns 0 on success or a negative errno, undoing the earlier steps on
 * failure.
 */
static int __net_init ip6mr_rules_init(struct net *net)
{
        struct fib_rules_ops *rules_ops;
        struct mr_table *dflt;
        int err;

        rules_ops = fib_rules_register(&ip6mr_rules_ops_template, net);
        if (IS_ERR(rules_ops))
                return PTR_ERR(rules_ops);

        INIT_LIST_HEAD(&net->ipv6.mr6_tables);

        dflt = ip6mr_new_table(net, RT6_TABLE_DFLT);
        if (IS_ERR(dflt)) {
                err = PTR_ERR(dflt);
                goto out_unregister;
        }

        err = fib_default_rule_add(rules_ops, 0x7fff, RT6_TABLE_DFLT);
        if (err >= 0) {
                net->ipv6.mr6_rules_ops = rules_ops;
                return 0;
        }

        /* The table is torn down with RTNL held */
        rtnl_lock();
        ip6mr_free_table(dflt);
        rtnl_unlock();
out_unregister:
        fib_rules_unregister(rules_ops);
        return err;
}

/* Per-netns teardown for the multiple-tables build: unlink and free every
 * table, then unregister the rule ops. Asserts that RTNL is held.
 */
static void __net_exit ip6mr_rules_exit(struct net *net)
{
        struct list_head *tables = &net->ipv6.mr6_tables;
        struct mr_table *cur, *tmp;

        ASSERT_RTNL();
        list_for_each_entry_safe(cur, tmp, tables, list) {
                list_del(&cur->list);
                ip6mr_free_table(cur);
        }
        fib_rules_unregister(net->ipv6.mr6_rules_ops);
}

/* Forward to fib_rules_dump() for the RTNL_FAMILY_IP6MR family and return
 * its result.
 */
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
                            struct netlink_ext_ack *extack)
{
        return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack);
}

/* Return the value of fib_rules_seq_read() for the RTNL_FAMILY_IP6MR
 * family in @net.
 */
static unsigned int ip6mr_rules_seq_read(const struct net *net)
{
        return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
}

bool ip6mr_rule_default(const struct fib_rule *rule)
{
        return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL &&
               rule->table == RT6_TABLE_DFLT && !rule->l3mdev;
}
EXPORT_SYMBOL(ip6mr_rule_default);
#else
/* Single-table build: the "loop" visits net->ipv6.mrt6 at most once (not at
 * all when it is NULL).
 */
#define ip6mr_for_each_table(mrt, net) \
        for (mrt = net->ipv6.mrt6; mrt; mrt = NULL)

/* Single-table build: a table may be freed only when check_net() fails for
 * its namespace.
 */
static bool ip6mr_can_free_table(struct net *net)
{
        return !check_net(net);
}

/* Single-table iterator: the first step (@mrt == NULL) yields the only
 * table, any further step ends the iteration with NULL.
 */
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
                                            struct mr_table *mrt)
{
        return mrt ? NULL : net->ipv6.mrt6;
}

/* Single-table build: @id is ignored and the one per-netns table is
 * returned (NULL if it has not been set up).
 */
static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
{
        return net->ipv6.mrt6;
}

/* No separate lock-free variant is needed here; alias it to the above. */
#define __ip6mr_get_table ip6mr_get_table

/* Single-table build: no rules are consulted; *@mrt is set to the one
 * per-netns table and 0 is always returned. @flp6 is unused.
 */
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
                            struct mr_table **mrt)
{
        *mrt = net->ipv6.mrt6;
        return 0;
}

/* Per-netns setup for the single-table build: create the default table and
 * publish it in net->ipv6.mrt6. Returns 0, or the error encoded in the
 * pointer returned by ip6mr_new_table().
 */
static int __net_init ip6mr_rules_init(struct net *net)
{
        struct mr_table *dflt = ip6mr_new_table(net, RT6_TABLE_DFLT);

        if (!IS_ERR(dflt)) {
                net->ipv6.mrt6 = dflt;
                return 0;
        }
        return PTR_ERR(dflt);
}

/* Per-netns teardown for the single-table build: free the table and clear
 * the per-netns pointer. Asserts that RTNL is held.
 */
static void __net_exit ip6mr_rules_exit(struct net *net)
{
        ASSERT_RTNL();
        ip6mr_free_table(net->ipv6.mrt6);
        net->ipv6.mrt6 = NULL;
}

/* Single-table build: there are no rules to dump, so this is a no-op that
 * returns 0.
 */
static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
                            struct netlink_ext_ack *extack)
{
        return 0;
}

/* Single-table build: no rules exist, so the rules sequence is reported
 * as 0.
 */
static unsigned int ip6mr_rules_seq_read(const struct net *net)
{
        return 0;
}
#endif

/* rhashtable object comparison: returns 0 when the cache entry @ptr has the
 * same multicast group and origin as the key in @arg, and 1 otherwise.
 */
static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg,
                          const void *ptr)
{
        const struct mfc6_cache_cmp_arg *key = arg->key;
        struct mfc6_cache *entry = (struct mfc6_cache *)ptr;

        if (!ipv6_addr_equal(&entry->mf6c_mcastgrp, &key->mf6c_mcastgrp))
                return 1;

        return !ipv6_addr_equal(&entry->mf6c_origin, &key->mf6c_origin);
}

/* Hash table parameters for the MFC cache: entries are linked through
 * mr_mfc.mnode, keyed by the mfc6_cache cmparg (group + origin) and compared
 * with ip6mr_hash_cmp().
 */
static const struct rhashtable_params ip6mr_rht_params = {
        .head_offset = offsetof(struct mr_mfc, mnode),
        .key_offset = offsetof(struct mfc6_cache, cmparg),
        .key_len = sizeof(struct mfc6_cache_cmp_arg),
        .nelem_hint = 3,
        .obj_cmpfn = ip6mr_hash_cmp,
        .automatic_shrinking = true,
};

/* Table-set callback passed to mr_table_alloc() by ip6mr_new_table(). With
 * multiple tables enabled it appends @mrt to the per-netns table list;
 * otherwise it does nothing.
 */
static void ip6mr_new_table_set(struct mr_table *mrt,
                                struct net *net)
{
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
        list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables);
#endif
}

/* Comparison key in which both origin and group are the unspecified (any)
 * address; referenced as .cmparg_any by ip6mr_mr_table_ops.
 */
static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = {
        .mf6c_origin = IN6ADDR_ANY_INIT,
        .mf6c_mcastgrp = IN6ADDR_ANY_INIT,
};

/* Table operations handed to mr_table_alloc(): the hash parameters and the
 * "any" comparison key used for IPv6 tables.
 */
static struct mr_table_ops ip6mr_mr_table_ops = {
        .rht_params = &ip6mr_rht_params,
        .cmparg_any = &ip6mr_mr_table_ops_cmparg_any,
};

/* Get-or-create: return the existing table with id @id if there is one,
 * otherwise allocate a new one through mr_table_alloc() and return whatever
 * that call returns.
 */
static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
{
        struct mr_table *mrt = __ip6mr_get_table(net, id);

        if (!mrt)
                mrt = mr_table_alloc(net, id, &ip6mr_mr_table_ops,
                                     ipmr_expire_process,
                                     ip6mr_new_table_set);
        return mrt;
}

/* Destroy @mrt: warn once if the namespace state says the table should not
 * be freed, shut the expiry timer down, flush all MIFs and MFC entries
 * (static ones included), destroy the MFC hash and free the table memory.
 */
static void ip6mr_free_table(struct mr_table *mrt)
{
        const int flush_all = MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
                              MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC;
        struct net *net = read_pnet(&mrt->net);

        WARN_ON_ONCE(!ip6mr_can_free_table(net));

        timer_shutdown_sync(&mrt->ipmr_expire_timer);
        mroute_clean_tables(mrt, flush_all);
        rhltable_destroy(&mrt->mfc_hash);
        kfree(mrt);
}

#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing
 * /proc/ip6_mr_cache /proc/ip6_mr_vif
 */

/* seq_file start for the vif listing. Enters an RCU read-side section and
 * looks up the default table; on success the section stays held (it is
 * released by ip6mr_vif_seq_stop()) and iteration is delegated to
 * mr_vif_seq_start(). If the table is missing the section is left again
 * and ERR_PTR(-ENOENT) is returned.
 */
static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct net *net = seq_file_net(seq);
        struct mr_vif_iter *iter = seq->private;
        struct mr_table *mrt;

        rcu_read_lock();
        mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
        if (mrt) {
                iter->mrt = mrt;
                return mr_vif_seq_start(seq, pos);
        }

        rcu_read_unlock();
        return ERR_PTR(-ENOENT);
}

/* seq_file stop for the vif listing: leaves the RCU read-side section
 * entered by ip6mr_vif_seq_start().
 */
static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

/* seq_file show for the vif listing: prints the column header for the start
 * token, otherwise one line for the vif @v (index within the table, device
 * name or "none", byte/packet counters and flags). Always returns 0.
 */
static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
{
        struct mr_vif_iter *iter = seq->private;
        struct mr_table *mrt = iter->mrt;
        const struct net_device *vif_dev;
        const struct vif_device *vif;
        const char *name = "none";

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags\n");
                return 0;
        }

        vif = v;
        vif_dev = vif_dev_read(vif);
        if (vif_dev)
                name = vif_dev->name;

        seq_printf(seq,
                   "%2td %-10s %8ld %7ld  %8ld %7ld %05X\n",
                   vif - mrt->vif_table,
                   name, vif->bytes_in, vif->pkt_in,
                   vif->bytes_out, vif->pkt_out,
                   vif->flags);
        return 0;
}

/* seq_file operations for the vif listing; only .next is the generic
 * mr_vif_seq_next(), the rest are the ip6mr-specific handlers above.
 */
static const struct seq_operations ip6mr_vif_seq_ops = {
        .start = ip6mr_vif_seq_start,
        .next  = mr_vif_seq_next,
        .stop  = ip6mr_vif_seq_stop,
        .show  = ip6mr_vif_seq_show,
};

/* seq_file start for the MFC cache listing: look up the default table and
 * delegate to mr_mfc_seq_start() with the unresolved-queue lock. Returns
 * ERR_PTR(-ENOENT) when the default table does not exist.
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct mr_table *mrt;

        mrt = ip6mr_get_table(seq_file_net(seq), RT6_TABLE_DFLT);
        if (mrt)
                return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);

        return ERR_PTR(-ENOENT);
}

/* seq_file show for the MFC cache listing. Prints the column header for the
 * start token; otherwise one line with group, origin and parent vif,
 * followed by the counters and the "vif:ttl" pairs of every existing
 * output vif whose ttl is below 255. Entries still on the unresolved queue
 * get zero counters and no output list. Always returns 0.
 */
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        const struct mr_mfc_iter *it;
        const struct mfc6_cache *mfc;
        struct mr_table *mrt;
        int vifi;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Group                            "
                         "Origin                           "
                         "Iif      Pkts  Bytes     Wrong  Oifs\n");
                return 0;
        }

        mfc = v;
        it = seq->private;
        mrt = it->mrt;

        seq_printf(seq, "%pI6 %pI6 %-3hd",
                   &mfc->mf6c_mcastgrp, &mfc->mf6c_origin,
                   mfc->_c.mfc_parent);

        if (it->cache == &mrt->mfc_unres_queue) {
                /* unresolved mfc_caches don't contain
                 * pkt, bytes and wrong_if values
                 */
                seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
                seq_putc(seq, '\n');
                return 0;
        }

        seq_printf(seq, " %8lu %8lu %8lu",
                   atomic_long_read(&mfc->_c.mfc_un.res.pkt),
                   atomic_long_read(&mfc->_c.mfc_un.res.bytes),
                   atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));

        for (vifi = mfc->_c.mfc_un.res.minvif;
             vifi < mfc->_c.mfc_un.res.maxvif; vifi++) {
                if (!VIF_EXISTS(mrt, vifi))
                        continue;
                if (mfc->_c.mfc_un.res.ttls[vifi] >= 255)
                        continue;
                seq_printf(seq,
                           " %2d:%-3d", vifi,
                           mfc->_c.mfc_un.res.ttls[vifi]);
        }
        seq_putc(seq, '\n');
        return 0;
}

/* seq_file operations for the MFC cache listing; .next and .stop are the
 * generic mr_mfc_seq_* helpers.
 */
static const struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = mr_mfc_seq_next,
        .stop  = mr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};
#endif

#ifdef CONFIG_IPV6_PIMSM_V2

/* Protocol handler for PIM (installed through pim6_protocol). Accepts PIM
 * register messages only: validates the header and the encapsulated IPv6
 * packet, strips the outer headers and re-injects the inner packet as if
 * it had been received on the table's register vif device.
 *
 * The skb is consumed on every path (either passed to netif_rx() or freed)
 * and the function always returns 0.
 */
static int pim6_rcv(struct sk_buff *skb)
{
        struct pimreghdr *pim;
        struct ipv6hdr   *encap;
        struct net_device  *reg_dev = NULL;
        struct net *net = dev_net(skb->dev);
        struct mr_table *mrt;
        struct flowi6 fl6 = {
                .flowi6_iif        = skb->dev->ifindex,
                .flowi6_mark        = skb->mark,
        };
        int reg_vif_num;

        /* Need the PIM register header plus the inner IPv6 header */
        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        /* Drop unless this is a (non-null) register message whose checksum
         * verifies either over the PIM header alone or over the whole
         * packet.
         */
        pim = (struct pimreghdr *)skb_transport_header(skb);
        if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) ||
            (pim->flags & PIM_NULL_REGISTER) ||
            (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
                             sizeof(*pim), IPPROTO_PIM,
                             csum_partial((void *)pim, sizeof(*pim), 0)) &&
             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to mcast group */
        encap = (struct ipv6hdr *)(skb_transport_header(skb) +
                                   sizeof(*pim));

        /* Inner packet must also be non-empty and fit inside this skb */
        if (!ipv6_addr_is_multicast(&encap->daddr) ||
            encap->payload_len == 0 ||
            ntohs(encap->payload_len) + sizeof(*pim) > skb->len)
                goto drop;

        if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
                goto drop;

        /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */
        reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
        if (reg_vif_num >= 0)
                reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);

        /* No register vif (or no device behind it): nothing to deliver to */
        if (!reg_dev)
                goto drop;

        /* Strip everything in front of the inner IPv6 header and make that
         * header the new network header.
         */
        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8 *)encap - skb->data);
        skb_reset_network_header(skb);
        skb->protocol = htons(ETH_P_IPV6);
        skb->ip_summed = CHECKSUM_NONE;

        skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

        netif_rx(skb);

        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}

/* inet6 protocol descriptor whose receive handler is pim6_rcv(). */
static const struct inet6_protocol pim6_protocol = {
        .handler        =        pim6_rcv,
};

/* Service routines creating virtual interfaces: PIMREG */

static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        struct flowi6 fl6 = {
                .flowi6_oif        = dev->ifindex,
                .flowi6_iif        = skb->skb_iif ? : LOOPBACK_IFINDEX,
                .flowi6_mark        = skb->mark,
        };

        if (!pskb_inet_may_pull(skb))
                goto tx_err;

        if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
                goto tx_err;

        DEV_STATS_ADD(dev, tx_bytes, skb->len);
        DEV_STATS_INC(dev, tx_packets);
        rcu_read_lock();
        ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
                           MRT6MSG_WHOLEPKT);
        rcu_read_unlock();
        kfree_skb(skb);
        return NETDEV_TX_OK;

tx_err:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

/* ndo_get_iflink for the register vif device: always reports 0. */
static int reg_vif_get_iflink(const struct net_device *dev)
{
        return 0;
}

/* Device operations of the register vif device (installed by
 * reg_vif_setup()).
 */
static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit        = reg_vif_xmit,
        .ndo_get_iflink = reg_vif_get_iflink,
};

/* alloc_netdev() setup callback for the register vif device: sets the
 * ARPHRD_PIMREG type, MTU, IFF_NOARP, the device ops, and marks the device
 * as self-freeing and not movable between network namespaces.
 */
static void reg_vif_setup(struct net_device *dev)
{
        dev->type                = ARPHRD_PIMREG;
        /* NOTE(review): the extra 8 bytes presumably leave room for the PIM
         * register header in front of the IPv6 header — confirm.
         */
        dev->mtu                = 1500 - sizeof(struct ipv6hdr) - 8;
        dev->flags                = IFF_NOARP;
        dev->netdev_ops                = &reg_vif_netdev_ops;
        dev->needs_free_netdev        = true;
        dev->netns_immutable        = true;
}

/* Create, register and bring up the PIM register pseudo-device for @mrt in
 * @net. The device is named "pim6reg" for the default table and
 * "pim6reg<id>" for any other table.
 *
 * Returns the device with an extra reference taken via dev_hold(), or NULL
 * if allocation, registration or opening fails (a device that was already
 * registered is unregistered again in that case).
 */
static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
{
        struct net_device *dev;
        char name[IFNAMSIZ];

        /* "pim6reg" followed by a full 32-bit decimal id does not fit into
         * IFNAMSIZ bytes, so bound the write to the buffer instead of
         * relying on the table id having been range-checked elsewhere.
         */
        if (mrt->id == RT6_TABLE_DFLT)
                snprintf(name, sizeof(name), "pim6reg");
        else
                snprintf(name, sizeof(name), "pim6reg%u", mrt->id);

        dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
        if (!dev)
                return NULL;

        dev_net_set(dev, net);

        /* Not registered yet: plain free is the right cleanup here */
        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }

        if (dev_open(dev, NULL))
                goto failure;

        dev_hold(dev);
        return dev;

failure:
        unregister_netdevice(dev);
        return NULL;
}
#endif

/* Thin wrapper around mr_call_vif_notifiers() that fills in the
 * RTNL_FAMILY_IP6MR family and the per-netns ipmr_seq counter. Returns the
 * callee's result.
 */
static int call_ip6mr_vif_entry_notifiers(struct net *net,
                                          enum fib_event_type event_type,
                                          struct vif_device *vif,
                                          struct net_device *vif_dev,
                                          mifi_t vif_index, u32 tb_id)
{
        return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
                                     vif, vif_dev, vif_index, tb_id,
                                     &net->ipv6.ipmr_seq);
}

/* Thin wrapper around mr_call_mfc_notifiers() that passes the generic
 * mr_mfc part of @mfc together with the RTNL_FAMILY_IP6MR family and the
 * per-netns ipmr_seq counter. Returns the callee's result.
 */
static int call_ip6mr_mfc_entry_notifiers(struct net *net,
                                          enum fib_event_type event_type,
                                          struct mfc6_cache *mfc, u32 tb_id)
{
        return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
                                     &mfc->_c, tb_id, &net->ipv6.ipmr_seq);
}

/* Delete a VIF entry.
 *
 * @mrt:    table owning the vif
 * @vifi:   index of the vif to delete
 * @notify: when non-zero, a register vif's device is NOT queued for
 *          unregistration here
 * @head:   list on which a register device is queued for unregistration
 *
 * Returns 0 on success, or -EADDRNOTAVAIL if @vifi is out of range or the
 * slot has no device. The device pointer is read with rtnl_dereference(),
 * i.e. the caller is expected to hold RTNL.
 */
static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
                       struct list_head *head)
{
        struct vif_device *v;
        struct net_device *dev;
        struct inet6_dev *in6_dev;

        if (vifi < 0 || vifi >= mrt->maxvif)
                return -EADDRNOTAVAIL;

        v = &mrt->vif_table[vifi];

        dev = rtnl_dereference(v->dev);
        if (!dev)
                return -EADDRNOTAVAIL;

        /* Notifiers run while the slot still references the device */
        call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
                                       FIB_EVENT_VIF_DEL, v, dev,
                                       vifi, mrt->id);
        spin_lock(&mrt_lock);
        RCU_INIT_POINTER(v->dev, NULL);

#ifdef CONFIG_IPV6_PIMSM_V2
        if (vifi == mrt->mroute_reg_vif_num) {
                /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */
                WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
        }
#endif

        /* Deleting the highest vif: shrink maxvif down to just past the
         * highest slot that is still in use (0 if none is).
         */
        if (vifi + 1 == mrt->maxvif) {
                int tmp;
                for (tmp = vifi - 1; tmp >= 0; tmp--) {
                        if (VIF_EXISTS(mrt, tmp))
                                break;
                }
                WRITE_ONCE(mrt->maxvif, tmp + 1);
        }

        spin_unlock(&mrt_lock);

        /* Undo the allmulti reference taken when the vif was added */
        dev_set_allmulti(dev, -1);

        in6_dev = __in6_dev_get(dev);
        if (in6_dev) {
                atomic_dec(&in6_dev->cnf.mc_forwarding);
                inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                             NETCONFA_MC_FORWARDING,
                                             dev->ifindex, &in6_dev->cnf);
        }

        if ((v->flags & MIFF_REGISTER) && !notify)
                unregister_netdevice_queue(dev, head);

        /* Drop the tracked device reference held by the vif slot */
        netdev_put(dev, &v->dev_tracker);
        return 0;
}

/* RCU callback: recover the cache entry from its embedded rcu_head and give
 * it back to the mrt_cachep slab cache.
 */
static inline void ip6mr_cache_free_rcu(struct rcu_head *head)
{
        struct mfc6_cache *mfc;

        mfc = (struct mfc6_cache *)container_of(head, struct mr_mfc, rcu);
        kmem_cache_free(mrt_cachep, mfc);
}

/* Free cache entry @c after an RCU grace period: the actual release is
 * done by ip6mr_cache_free_rcu() via call_rcu().
 */
static inline void ip6mr_cache_free(struct mfc6_cache *c)
{
        call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;

        atomic_dec(&mrt->cache_resolve_queue_len);

        while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) {
                if (ipv6_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
                                                        sizeof(struct ipv6hdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT;
                        rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
                } else
                        kfree_skb(skb);
        }

        ip6mr_cache_free(c);
}


/* Expiry pass over the whole unresolved queue of @mrt. Entries whose
 * deadline has passed are unlinked, announced with RTM_DELROUTE and
 * destroyed. If entries remain, the timer is re-armed for the soonest
 * remaining deadline, capped at 10 seconds from now.
 */

static void ipmr_do_expire_process(struct mr_table *mrt)
{
        unsigned long now = jiffies;
        unsigned long expires = 10 * HZ;
        struct mr_mfc *c, *next;

        list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                struct mfc6_cache *mfc = (struct mfc6_cache *)c;
                unsigned long interval;

                if (!time_after(c->mfc_un.unres.expires, now)) {
                        /* deadline reached: drop the entry */
                        list_del(&c->list);
                        mr6_netlink_event(mrt, mfc, RTM_DELROUTE);
                        ip6mr_destroy_unres(mrt, mfc);
                        continue;
                }

                /* not yet... remember the nearest deadline */
                interval = c->mfc_un.unres.expires - now;
                if (interval < expires)
                        expires = interval;
        }

        if (!list_empty(&mrt->mfc_unres_queue))
                mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
}

/* Timer callback of a table's expiry timer. Tries to take mfc_unres_lock
 * without spinning; if the lock is busy the timer is simply re-armed one
 * jiffy later. Otherwise a non-empty unresolved queue is processed under
 * the lock.
 */
static void ipmr_expire_process(struct timer_list *t)
{
        struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);

        if (spin_trylock(&mfc_unres_lock)) {
                if (!list_empty(&mrt->mfc_unres_queue))
                        ipmr_do_expire_process(mrt);

                spin_unlock(&mfc_unres_lock);
                return;
        }

        /* Lock contended: retry on the next tick */
        mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
}

/* Fill oifs list. It is called under locked mrt_lock.
 *
 * Resets the per-vif ttl array of @cache to 255 and then copies ttls[] for
 * every existing vif whose ttl is non-zero and below 255, tracking the
 * lowest such index in minvif and one past the highest in maxvif. Also
 * stamps the entry's lastuse with the current jiffies.
 */

static void ip6mr_update_thresholds(struct mr_table *mrt,
                                    struct mr_mfc *cache,
                                    unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXMIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXMIFS);

        for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                if (!VIF_EXISTS(mrt, vifi))
                        continue;
                if (!ttls[vifi] || ttls[vifi] >= 255)
                        continue;

                cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                if (cache->mfc_un.res.minvif > vifi)
                        cache->mfc_un.res.minvif = vifi;
                if (cache->mfc_un.res.maxvif <= vifi)
                        cache->mfc_un.res.maxvif = vifi + 1;
        }
        WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}

/*
 * Install multicast interface vifc->mif6c_mifi into table mrt.
 *
 * mrtsock is non-zero when the request was issued through the table's own
 * mroute socket; otherwise the new entry is flagged VIFF_STATIC.
 *
 * Returns 0 on success, -EADDRINUSE if the slot (or, for MIFF_REGISTER,
 * the register interface) is already taken, -ENOBUFS or -EADDRNOTAVAIL if
 * no device could be obtained, -EINVAL for unsupported flags, or the error
 * reported by dev_set_allmulti().
 */
static int mif6_add(struct net *net, struct mr_table *mrt,
                    struct mif6ctl *vifc, int mrtsock)
{
        int vifi = vifc->mif6c_mifi;
        struct vif_device *v = &mrt->vif_table[vifi];
        struct net_device *dev;
        struct inet6_dev *in6_dev;
        int err;

        /* Is vif busy ? */
        if (VIF_EXISTS(mrt, vifi))
                return -EADDRINUSE;

        /* Obtain the backing device and put it into all-multicast mode. */
        switch (vifc->mif6c_flags) {
#ifdef CONFIG_IPV6_PIMSM_V2
        case MIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (mrt->mroute_reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ip6mr_reg_vif(net, mrt);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        /* Undo the register device created just above. */
                        unregister_netdevice(dev);
                        dev_put(dev);
                        return err;
                }
                break;
#endif
        case 0:
                /* Ordinary interface, looked up by its ifindex. */
                dev = dev_get_by_index(net, vifc->mif6c_pifi);
                if (!dev)
                        return -EADDRNOTAVAIL;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        dev_put(dev);
                        return err;
                }
                break;
        default:
                return -EINVAL;
        }

        /* Bump the per-device mc_forwarding counter and announce it. */
        in6_dev = __in6_dev_get(dev);
        if (in6_dev) {
                atomic_inc(&in6_dev->cnf.mc_forwarding);
                inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                             NETCONFA_MC_FORWARDING,
                                             dev->ifindex, &in6_dev->cnf);
        }

        /* Fill in the VIF structures */
        vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold,
                        vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0),
                        MIFF_REGISTER);

        /* And finish update writing critical data */
        spin_lock(&mrt_lock);
        rcu_assign_pointer(v->dev, dev);
        netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
#ifdef CONFIG_IPV6_PIMSM_V2
        if (v->flags & MIFF_REGISTER)
                WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
#endif
        /* Grow the table's high-water mark if this index lies beyond it. */
        if (vifi + 1 > mrt->maxvif)
                WRITE_ONCE(mrt->maxvif, vifi + 1);
        spin_unlock(&mrt_lock);
        call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
                                       v, dev, vifi, mrt->id);
        return 0;
}

static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt,
                                           const struct in6_addr *origin,
                                           const struct in6_addr *mcastgrp)
{
        struct mfc6_cache_cmp_arg arg = {
                .mf6c_origin = *origin,
                .mf6c_mcastgrp = *mcastgrp,
        };

        return mr_mfc_find(mrt, &arg);
}

/* Look for a (*,G) entry.
 *
 * An unspecified group address is delegated to mr_mfc_find_any_parent();
 * any other group is looked up with the origin set to in6addr_any.
 */
static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt,
                                               struct in6_addr *mcastgrp,
                                               mifi_t mifi)
{
        struct mfc6_cache_cmp_arg key = {
                .mf6c_mcastgrp = *mcastgrp,
                .mf6c_origin = in6addr_any,
        };

        if (!ipv6_addr_any(mcastgrp))
                return mr_mfc_find_any(mrt, mifi, &key);

        return mr_mfc_find_any_parent(mrt, mifi);
}

/* Look for a (S,G,iif) entry if parent != -1 */
static struct mfc6_cache *
ip6mr_cache_find_parent(struct mr_table *mrt,
                        const struct in6_addr *origin,
                        const struct in6_addr *mcastgrp,
                        int parent)
{
        struct mfc6_cache_cmp_arg arg = {
                .mf6c_origin = *origin,
                .mf6c_mcastgrp = *mcastgrp,
        };

        return mr_mfc_find_parent(mrt, &arg, parent);
}

/* Allocate a multicast cache entry */
static struct mfc6_cache *ip6mr_cache_alloc(void)
{
        struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (!c)
                return NULL;
        c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
        c->_c.mfc_un.res.minvif = MAXMIFS;
        c->_c.free = ip6mr_cache_free_rcu;
        refcount_set(&c->_c.mfc_un.res.refcount, 1);
        return c;
}

static struct mfc6_cache *ip6mr_cache_alloc_unres(void)
{
        struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (!c)
                return NULL;
        skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
        c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
        return c;
}

/*
 *        A cache entry has gone into a resolved state from queued
 */

static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
                                struct mfc6_cache *uc, struct mfc6_cache *c)
{
        struct sk_buff *skb;

        /*
         *        Play the pending entries through our router
         */

        while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
                if (ipv6_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
                                                        sizeof(struct ipv6hdr));

                        if (mr_fill_mroute(mrt, skb, &c->_c,
                                           nlmsg_data(nlh)) > 0) {
                                nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh;
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
                        }
                        rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
                } else {
                        rcu_read_lock();
                        ip6_mr_forward(net, mrt, skb->dev, skb, c);
                        rcu_read_unlock();
                }
        }
}

/*
 *        Bounce a cache query up to pim6sd and netlink.
 *
 *        Builds a struct mrt6msg describing pkt (message type "assert",
 *        interface mifi) and queues it on the table's mroute socket.
 *        pkt itself is not consumed.
 *
 *        Returns -ENOBUFS if no skb could be allocated, -EINVAL if the
 *        table has no mroute socket, otherwise the result of
 *        sock_queue_rcv_skb().
 *
 *        Called under rcu_read_lock()
 */

static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
                              mifi_t mifi, int assert)
{
        struct sock *mroute6_sk;
        struct sk_buff *skb;
        struct mrt6msg *msg;
        int ret;

        /* Whole-packet reports copy pkt with extra headroom for the
         * message header; all other reports use a fresh, small skb.
         */
#ifdef CONFIG_IPV6_PIMSM_V2
        if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE)
                skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
                                                +sizeof(*msg));
        else
#endif
                skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

        /* I suppose that internal messages
         * do not require checksums */

        skb->ip_summed = CHECKSUM_UNNECESSARY;

#ifdef CONFIG_IPV6_PIMSM_V2
        if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix length etc.
                   And all this only to mangle msg->im6_msgtype and
                   to set msg->im6_mbz to "mbz" :-)
                 */
                __skb_pull(skb, skb_network_offset(pkt));

                /* Prepend the mrt6msg header in front of the copied packet. */
                skb_push(skb, sizeof(*msg));
                skb_reset_transport_header(skb);
                msg = (struct mrt6msg *)skb_transport_header(skb);
                msg->im6_mbz = 0;
                msg->im6_msgtype = assert;
                /* WRMIFWHOLE reports the given mifi, WHOLEPKT the
                 * table's register interface.
                 */
                if (assert == MRT6MSG_WRMIFWHOLE)
                        msg->im6_mif = mifi;
                else
                        msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num);
                msg->im6_pad = 0;
                msg->im6_src = ipv6_hdr(pkt)->saddr;
                msg->im6_dst = ipv6_hdr(pkt)->daddr;

                skb->ip_summed = CHECKSUM_UNNECESSARY;
        } else
#endif
        {
        /*
         *        Copy the IP header
         */

        skb_put(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr));

        /*
         *        Add our header
         */
        skb_put(skb, sizeof(*msg));
        skb_reset_transport_header(skb);
        msg = (struct mrt6msg *)skb_transport_header(skb);

        msg->im6_mbz = 0;
        msg->im6_msgtype = assert;
        msg->im6_mif = mifi;
        msg->im6_pad = 0;
        msg->im6_src = ipv6_hdr(pkt)->saddr;
        msg->im6_dst = ipv6_hdr(pkt)->daddr;

        skb_dst_set(skb, dst_clone(skb_dst(pkt)));
        skb->ip_summed = CHECKSUM_UNNECESSARY;
        }

        /* Without a listening mroute socket there is nobody to report to. */
        mroute6_sk = rcu_dereference(mrt->mroute_sk);
        if (!mroute6_sk) {
                kfree_skb(skb);
                return -EINVAL;
        }

        mrt6msg_netlink_event(mrt, skb);

        /* Deliver to user space multicast routing algorithms */
        ret = sock_queue_rcv_skb(mroute6_sk, skb);

        if (ret < 0) {
                net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 * Queue a packet for resolution.
 *
 * Finds, or creates and reports to user space, the unresolved cache entry
 * matching the packet's (source, destination) and appends skb to its
 * pending queue.  skb is consumed on every path: it is either queued or
 * freed.
 *
 * Returns 0 when the packet was queued, -ENOBUFS if no entry could be
 * allocated or the entry already holds more than 3 packets, or the error
 * from ip6mr_cache_report().
 */
static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
                                  struct sk_buff *skb, struct net_device *dev)
{
        struct mfc6_cache *c;
        bool found = false;
        int err;

        spin_lock_bh(&mfc_unres_lock);
        /* Is there already an unresolved entry for this (S,G)? */
        list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
                if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) &&
                    ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                /*
                 *        Create a new entry if allowable
                 */

                c = ip6mr_cache_alloc_unres();
                if (!c) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /* Fill in the new cache entry */
                c->_c.mfc_parent = -1;
                c->mf6c_origin = ipv6_hdr(skb)->saddr;
                c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr;

                /*
                 *        Reflect first query at pim6sd
                 */
                err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE);
                if (err < 0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        ip6mr_cache_free(c);
                        kfree_skb(skb);
                        return err;
                }

                /* Publish the new entry on the unresolved queue. */
                atomic_inc(&mrt->cache_resolve_queue_len);
                list_add(&c->_c.list, &mrt->mfc_unres_queue);
                mr6_netlink_event(mrt, c, RTM_NEWROUTE);

                /* NOTE(review): presumably this (re)arms the expiry timer
                 * for the now non-empty queue - confirm against
                 * ipmr_do_expire_process().
                 */
                ipmr_do_expire_process(mrt);
        }

        /* See if we can append the packet */
        if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                /* Record the caller-supplied device on the queued skb. */
                if (dev) {
                        skb->dev = dev;
                        skb->skb_iif = dev->ifindex;
                }
                skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *        MFC6 cache manipulation by user space
 */

/* Delete the resolved cache entry matching (origin, group, parent) of mfc.
 * Returns 0 on success or -ENOENT when no such entry exists.
 */
static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc,
                            int parent)
{
        const struct in6_addr *origin = &mfc->mf6cc_origin.sin6_addr;
        const struct in6_addr *group = &mfc->mf6cc_mcastgrp.sin6_addr;
        struct mfc6_cache *entry;

        /* The entries are added/deleted only under RTNL */
        rcu_read_lock();
        entry = ip6mr_cache_find_parent(mrt, origin, group, parent);
        rcu_read_unlock();

        if (!entry)
                return -ENOENT;

        /* Unlink from the hash table and the cache list first ... */
        rhltable_remove(&mrt->mfc_hash, &entry->_c.mnode, ip6mr_rht_params);
        list_del_rcu(&entry->_c.list);

        /* ... then tell FIB notifier listeners and netlink, and drop our ref. */
        call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
                                       FIB_EVENT_ENTRY_DEL, entry, mrt->id);
        mr6_netlink_event(mrt, entry, RTM_DELROUTE);
        mr_cache_put(&entry->_c);

        return 0;
}

/* Netdevice notifier: on NETDEV_UNREGISTER, delete every multicast
 * interface in the device's netns that is backed by the departing device.
 * All other events are ignored.  Always returns NOTIFY_DONE.
 */
static int ip6mr_device_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        int ct;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        ip6mr_for_each_table(mrt, net) {
                for (ct = 0; ct < mrt->maxvif; ct++) {
                        struct vif_device *vif = &mrt->vif_table[ct];

                        if (rcu_access_pointer(vif->dev) != dev)
                                continue;
                        mif6_delete(mrt, ct, 1, NULL);
                }
        }

        return NOTIFY_DONE;
}

/* FIB notifier sequence number: the netns' ipmr_seq counter plus the
 * value reported by ip6mr_rules_seq_read().
 */
static unsigned int ip6mr_seq_read(const struct net *net)
{
        unsigned int seq = READ_ONCE(net->ipv6.ipmr_seq);

        seq += ip6mr_rules_seq_read(net);
        return seq;
}

/* FIB notifier dump callback: delegates to mr_dump() for the
 * RTNL_FAMILY_IP6MR family, passing the IPv6 rules dumper and table
 * iterator.  Returns whatever mr_dump() returns.
 */
static int ip6mr_dump(struct net *net, struct notifier_block *nb,
                      struct netlink_ext_ack *extack)
{
        return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
                       ip6mr_mr_table_iter, extack);
}

/* Netdevice notifier block; registered in ip6_mr_init() and handled by
 * ip6mr_device_event().
 */
static struct notifier_block ip6_mr_notifier = {
        .notifier_call = ip6mr_device_event
};

/* Template passed to fib_notifier_ops_register() for each netns in
 * ip6mr_notifier_init().
 */
static const struct fib_notifier_ops ip6mr_notifier_ops_template = {
        .family                = RTNL_FAMILY_IP6MR,
        .fib_seq_read        = ip6mr_seq_read,
        .fib_dump        = ip6mr_dump,
        .owner                = THIS_MODULE,
};

/* Reset the netns' ipmr_seq counter and register its FIB notifier ops.
 * Returns 0 on success or the error encoded by fib_notifier_ops_register().
 */
static int __net_init ip6mr_notifier_init(struct net *net)
{
        struct fib_notifier_ops *registered;

        net->ipv6.ipmr_seq = 0;

        registered = fib_notifier_ops_register(&ip6mr_notifier_ops_template,
                                               net);
        if (!IS_ERR(registered)) {
                net->ipv6.ip6mr_notifier_ops = registered;
                return 0;
        }

        return PTR_ERR(registered);
}

/* Unregister the netns' FIB notifier ops and clear the stored pointer. */
static void __net_exit ip6mr_notifier_exit(struct net *net)
{
        struct fib_notifier_ops *registered = net->ipv6.ip6mr_notifier_ops;

        fib_notifier_ops_unregister(registered);
        net->ipv6.ip6mr_notifier_ops = NULL;
}

/* Setup for IP multicast routing in one network namespace.
 *
 * Registers the FIB notifier ops, initialises the rules/tables and, with
 * CONFIG_PROC_FS, creates the "ip6_mr_vif" and "ip6_mr_cache" proc
 * entries.  On failure the steps already done are undone in reverse
 * order.  Returns 0 or a negative errno.
 */
static int __net_init ip6mr_net_init(struct net *net)
{
        int err;

        err = ip6mr_notifier_init(net);
        if (err)
                return err;

        err = ip6mr_rules_init(net);
        if (err < 0)
                goto ip6mr_rules_fail;

#ifdef CONFIG_PROC_FS
        /* proc_create_net() reports failure only as NULL; use -ENOMEM. */
        err = -ENOMEM;
        if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops,
                        sizeof(struct mr_vif_iter)))
                goto proc_vif_fail;
        if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
                        sizeof(struct mr_mfc_iter)))
                goto proc_cache_fail;
#endif

        return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
        remove_proc_entry("ip6_mr_vif", net->proc_net);
proc_vif_fail:
        /* ip6mr_rules_exit() is called with RTNL held here. */
        rtnl_lock();
        ip6mr_rules_exit(net);
        rtnl_unlock();
#endif
ip6mr_rules_fail:
        ip6mr_notifier_exit(net);
        return err;
}

/* Per-netns teardown: remove the proc entries (CONFIG_PROC_FS only) and
 * unregister the FIB notifier ops.  The rules/tables are released
 * separately in ip6mr_net_exit_batch().
 */
static void __net_exit ip6mr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ip6_mr_cache", net->proc_net);
        remove_proc_entry("ip6_mr_vif", net->proc_net);
#endif
        ip6mr_notifier_exit(net);
}

/* Batched netns teardown: call ip6mr_rules_exit() for every namespace on
 * net_list while holding RTNL once for the whole batch.
 */
static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
{
        struct net *net;

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list)
                ip6mr_rules_exit(net);
        rtnl_unlock();
}

/* Per-network-namespace hooks; registered by ip6_mr_init() through
 * register_pernet_subsys().
 */
static struct pernet_operations ip6mr_net_ops = {
        .init = ip6mr_net_init,
        .exit = ip6mr_net_exit,
        .exit_batch = ip6mr_net_exit_batch,
};

/* rtnetlink handlers for RTNL_FAMILY_IP6MR: RTM_GETROUTE is served by
 * ip6mr_rtm_getroute (doit) and ip6mr_rtm_dumproute (dumpit).
 */
static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = {
        {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR,
         .msgtype = RTM_GETROUTE,
         .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute},
};

/*
 * Module-level initialisation of IPv6 multicast routing.
 *
 * Creates the cache-entry slab, then registers the per-netns operations,
 * the netdevice notifier, the PIM protocol handler (CONFIG_IPV6_PIMSM_V2
 * only) and the rtnetlink handlers.  On failure everything registered so
 * far is undone in reverse order.
 *
 * Returns 0 on success or a negative errno.
 */
int __init ip6_mr_init(void)
{
        int err;

        mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN);
        if (!mrt_cachep)
                return -ENOMEM;

        err = register_pernet_subsys(&ip6mr_net_ops);
        if (err)
                goto reg_pernet_fail;

        err = register_netdevice_notifier(&ip6_mr_notifier);
        if (err)
                goto reg_notif_fail;
#ifdef CONFIG_IPV6_PIMSM_V2
        if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) {
                pr_err("%s: can't add PIM protocol\n", __func__);
                err = -EAGAIN;
                goto add_proto_fail;
        }
#endif
        err = rtnl_register_many(ip6mr_rtnl_msg_handlers);
        if (!err)
                return 0;

#ifdef CONFIG_IPV6_PIMSM_V2
        inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
add_proto_fail:
#endif
        /* The notifier is registered regardless of CONFIG_IPV6_PIMSM_V2,
         * so it must be unregistered outside the #ifdef too; otherwise a
         * failing rtnl_register_many() would leave it registered when PIM
         * support is compiled out.
         */
        unregister_netdevice_notifier(&ip6_mr_notifier);
reg_notif_fail:
        unregister_pernet_subsys(&ip6mr_net_ops);
reg_pernet_fail:
        kmem_cache_destroy(mrt_cachep);
        return err;
}

/* Undo ip6_mr_init(): unregister the rtnetlink handlers, the PIM protocol
 * handler (CONFIG_IPV6_PIMSM_V2 only), the netdevice notifier and the
 * per-netns operations, then destroy the cache-entry slab.
 */
void __init ip6_mr_cleanup(void)
{
        rtnl_unregister_many(ip6mr_rtnl_msg_handlers);
#ifdef CONFIG_IPV6_PIMSM_V2
        inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
#endif
        unregister_netdevice_notifier(&ip6_mr_notifier);
        unregister_pernet_subsys(&ip6mr_net_ops);
        kmem_cache_destroy(mrt_cachep);
}

/*
 * Add or replace the resolved cache entry described by mfc.
 *
 * If an entry matching (origin, group, parent) already exists, its parent
 * and TTL thresholds are updated in place.  Otherwise a new entry is
 * allocated and inserted; a matching entry on the unresolved queue, if
 * any, is removed and its pending packets are replayed through
 * ip6mr_cache_resolve().  mrtsock == 0 marks the entry MFC_STATIC.
 *
 * Returns 0 on success, -ENFILE if mf6cc_parent is out of range, -EINVAL
 * if the group is neither unspecified nor multicast, -ENOMEM on allocation
 * failure, or the error from rhltable_insert_key().
 */
static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
                         struct mf6cctl *mfc, int mrtsock, int parent)
{
        unsigned char ttls[MAXMIFS];
        struct mfc6_cache *uc, *c;
        struct mr_mfc *_uc;
        bool found;
        int i, err;

        if (mfc->mf6cc_parent >= MAXMIFS)
                return -ENFILE;

        /* Translate the interface bitmap into thresholds: 1 for every
         * interface set in mf6cc_ifset, 255 for all others.
         */
        memset(ttls, 255, MAXMIFS);
        for (i = 0; i < MAXMIFS; i++) {
                if (IF_ISSET(i, &mfc->mf6cc_ifset))
                        ttls[i] = 1;
        }

        /* The entries are added/deleted only under RTNL */
        rcu_read_lock();
        c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr,
                                    &mfc->mf6cc_mcastgrp.sin6_addr, parent);
        rcu_read_unlock();
        if (c) {
                /* Existing entry: update it in place under mrt_lock. */
                spin_lock(&mrt_lock);
                c->_c.mfc_parent = mfc->mf6cc_parent;
                ip6mr_update_thresholds(mrt, &c->_c, ttls);
                if (!mrtsock)
                        c->_c.mfc_flags |= MFC_STATIC;
                spin_unlock(&mrt_lock);
                call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
                                               c, mrt->id);
                mr6_netlink_event(mrt, c, RTM_NEWROUTE);
                return 0;
        }

        /* A new entry must be for the unspecified or a multicast group. */
        if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) &&
            !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr))
                return -EINVAL;

        c = ip6mr_cache_alloc();
        if (!c)
                return -ENOMEM;

        c->mf6c_origin = mfc->mf6cc_origin.sin6_addr;
        c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr;
        c->_c.mfc_parent = mfc->mf6cc_parent;
        ip6mr_update_thresholds(mrt, &c->_c, ttls);
        if (!mrtsock)
                c->_c.mfc_flags |= MFC_STATIC;

        err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
                                  ip6mr_rht_params);
        if (err) {
                pr_err("ip6mr: rhtable insert error %d\n", err);
                ip6mr_cache_free(c);
                return err;
        }
        list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);

        /* Check to see if we resolved a queued list. If so we
         * need to send on the frames and tidy up.
         */
        found = false;
        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
                uc = (struct mfc6_cache *)_uc;
                if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) &&
                    ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) {
                        list_del(&_uc->list);
                        atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
        }
        /* Nothing left to expire: stop the expiry timer. */
        if (list_empty(&mrt->mfc_unres_queue))
                timer_delete(&mrt->ipmr_expire_timer);
        spin_unlock_bh(&mfc_unres_lock);

        /* uc is only meaningful when found is true. */
        if (found) {
                ip6mr_cache_resolve(net, mrt, uc, c);
                ip6mr_cache_free(uc);
        }
        call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD,
                                       c, mrt->id);
        mr6_netlink_event(mrt, c, RTM_NEWROUTE);
        return 0;
}

/*
 *        Flush parts of a multicast routing table, as selected by the
 *        MRT6_FLUSH_* bits in flags: interfaces (static and/or
 *        non-static), resolved cache entries (static and/or non-static)
 *        and, with MRT6_FLUSH_MFC, the unresolved queue.
 */

static void mroute_clean_tables(struct mr_table *mrt, int flags)
{
        struct mr_mfc *c, *tmp;
        LIST_HEAD(list);
        int i;

        /* Shut down all active vif entries */
        if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) {
                for (i = 0; i < mrt->maxvif; i++) {
                        /* A static vif needs the _STATIC flush bit,
                         * any other vif the plain one.
                         */
                        int needed = (mrt->vif_table[i].flags & VIFF_STATIC) ?
                                     MRT6_FLUSH_MIFS_STATIC : MRT6_FLUSH_MIFS;

                        if (flags & needed)
                                mif6_delete(mrt, i, 0, &list);
                }
                unregister_netdevice_many(&list);
        }

        /* Wipe the cache */
        if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) {
                list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
                        int needed = (c->mfc_flags & MFC_STATIC) ?
                                     MRT6_FLUSH_MFC_STATIC : MRT6_FLUSH_MFC;

                        if (!(flags & needed))
                                continue;

                        rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
                        list_del_rcu(&c->list);
                        call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
                                                       FIB_EVENT_ENTRY_DEL,
                                                       (struct mfc6_cache *)c, mrt->id);
                        mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
                        mr_cache_put(c);
                }
        }

        /* The unresolved queue is only touched for MRT6_FLUSH_MFC. */
        if (!(flags & MRT6_FLUSH_MFC))
                return;
        if (atomic_read(&mrt->cache_resolve_queue_len) == 0)
                return;

        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
                list_del(&c->list);
                mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
                ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
        }
        spin_unlock_bh(&mfc_unres_lock);
}

/* Make sk the mroute socket of table mrt.
 *
 * On success the netns-wide mc_forwarding counter is incremented and the
 * change is announced via netconf.  Returns 0, or -EADDRINUSE when the
 * table already has an mroute socket.
 */
static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
{
        struct net *net = sock_net(sk);
        int err = -EADDRINUSE;

        rtnl_lock();

        spin_lock(&mrt_lock);
        if (!rtnl_dereference(mrt->mroute_sk)) {
                rcu_assign_pointer(mrt->mroute_sk, sk);
                sock_set_flag(sk, SOCK_RCU_FREE);
                atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
                err = 0;
        }
        spin_unlock(&mrt_lock);

        /* Only announce mc_forwarding when it actually changed. */
        if (err == 0)
                inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                             NETCONFA_MC_FORWARDING,
                                             NETCONFA_IFINDEX_ALL,
                                             net->ipv6.devconf_all);

        rtnl_unlock();
        return err;
}

/* Detach sk from the table it is the mroute socket of.
 *
 * The table's mroute_sk is cleared, the netns-wide mc_forwarding counter
 * is decremented and announced, and the table's non-static interfaces and
 * cache entries are flushed.
 *
 * Returns 0 on success, -EACCES if sk is not a raw ICMPv6 socket,
 * multicast forwarding is not active in the netns, or sk is not the
 * mroute socket of any table.
 */
int ip6mr_sk_done(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct ipv6_devconf *devconf;
        struct mr_table *mrt;
        int err = -EACCES;

        if (sk->sk_type != SOCK_RAW)
                return err;
        if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
                return err;

        devconf = net->ipv6.devconf_all;
        if (!devconf)
                return err;
        if (!atomic_read(&devconf->mc_forwarding))
                return err;

        rtnl_lock();
        ip6mr_for_each_table(mrt, net) {
                if (sk != rtnl_dereference(mrt->mroute_sk))
                        continue;

                spin_lock(&mrt_lock);
                RCU_INIT_POINTER(mrt->mroute_sk, NULL);
                /* Note that mroute_sk had SOCK_RCU_FREE set,
                 * so the RCU grace period before sk freeing
                 * is guaranteed by sk_destruct()
                 */
                atomic_dec(&devconf->mc_forwarding);
                spin_unlock(&mrt_lock);

                inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                             NETCONFA_MC_FORWARDING,
                                             NETCONFA_IFINDEX_ALL,
                                             net->ipv6.devconf_all);

                mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC);
                err = 0;
                break;
        }
        rtnl_unlock();

        return err;
}

bool mroute6_is_socket(struct net *net, struct sk_buff *skb)
{
        struct mr_table *mrt;
        struct flowi6 fl6 = {
                .flowi6_iif        = skb->skb_iif ? : LOOPBACK_IFINDEX,
                .flowi6_oif        = skb->dev->ifindex,
                .flowi6_mark        = skb->mark,
        };

        if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
                return NULL;

        return rcu_access_pointer(mrt->mroute_sk);
}
EXPORT_SYMBOL(mroute6_is_socket);

/*
 *        Socket options and virtual interface manipulation. The whole
 *        virtual interface system is a complete heap, but unfortunately
 *        that's how BSD mrouted happens to think. Maybe one day with a proper
 *        MOSPF/PIM router set up we can clean this up.
 */

/*
 * Handle the MRT6_* socket options of a multicast routing socket.
 *
 * Only raw ICMPv6 sockets are accepted (-EOPNOTSUPP otherwise).  Apart
 * from MRT6_INIT, the caller must either be the table's mroute socket or
 * hold CAP_NET_ADMIN in the netns' user namespace (-EACCES otherwise).
 *
 * Returns 0 or a negative errno: -ENOENT if the socket's table does not
 * exist, -EINVAL for a bad optlen, -EFAULT if optval cannot be copied,
 * -ENOPROTOOPT for an unknown option, or the result of the per-option
 * helper.
 */
int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
                          unsigned int optlen)
{
        int ret, parent = 0;
        struct mif6ctl vif;
        struct mf6cctl mfc;
        mifi_t mifi;
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        if (sk->sk_type != SOCK_RAW ||
            inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
                return -EOPNOTSUPP;

        /* A socket without an explicit table uses RT6_TABLE_DFLT. */
        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;

        if (optname != MRT6_INIT) {
                if (sk != rcu_access_pointer(mrt->mroute_sk) &&
                    !ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT6_INIT:
                if (optlen < sizeof(int))
                        return -EINVAL;

                return ip6mr_sk_init(mrt, sk);

        case MRT6_DONE:
                return ip6mr_sk_done(sk);

        case MRT6_ADD_MIF:
                if (optlen < sizeof(vif))
                        return -EINVAL;
                if (copy_from_sockptr(&vif, optval, sizeof(vif)))
                        return -EFAULT;
                /* mif6_add() indexes vif_table[] with this value. */
                if (vif.mif6c_mifi >= MAXMIFS)
                        return -ENFILE;
                rtnl_lock();
                ret = mif6_add(net, mrt, &vif,
                               sk == rtnl_dereference(mrt->mroute_sk));
                rtnl_unlock();
                return ret;

        case MRT6_DEL_MIF:
                if (optlen < sizeof(mifi_t))
                        return -EINVAL;
                if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t)))
                        return -EFAULT;
                rtnl_lock();
                ret = mif6_delete(mrt, mifi, 0, NULL);
                rtnl_unlock();
                return ret;

        /*
         *        Manipulate the forwarding caches. These live
         *        in a sort of kernel/user symbiosis.
         */
        case MRT6_ADD_MFC:
        case MRT6_DEL_MFC:
                /* Non-proxy variants always look up with parent == -1. */
                parent = -1;
                fallthrough;
        case MRT6_ADD_MFC_PROXY:
        case MRT6_DEL_MFC_PROXY:
                if (optlen < sizeof(mfc))
                        return -EINVAL;
                if (copy_from_sockptr(&mfc, optval, sizeof(mfc)))
                        return -EFAULT;
                /* parent is still 0 only for the _PROXY variants. */
                if (parent == 0)
                        parent = mfc.mf6cc_parent;
                rtnl_lock();
                if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY)
                        ret = ip6mr_mfc_delete(mrt, &mfc, parent);
                else
                        ret = ip6mr_mfc_add(net, mrt, &mfc,
                                            sk ==
                                            rtnl_dereference(mrt->mroute_sk),
                                            parent);
                rtnl_unlock();
                return ret;

        case MRT6_FLUSH:
        {
                int flags;

                if (optlen != sizeof(flags))
                        return -EINVAL;
                if (copy_from_sockptr(&flags, optval, sizeof(flags)))
                        return -EFAULT;
                rtnl_lock();
                mroute_clean_tables(mrt, flags);
                rtnl_unlock();
                return 0;
        }

        /*
         *        Control PIM assert (to activate pim will activate assert)
         */
        case MRT6_ASSERT:
        {
                int v;

                if (optlen != sizeof(v))
                        return -EINVAL;
                if (copy_from_sockptr(&v, optval, sizeof(v)))
                        return -EFAULT;
                mrt->mroute_do_assert = v;
                return 0;
        }

#ifdef CONFIG_IPV6_PIMSM_V2
        case MRT6_PIM:
        {
                bool do_wrmifwhole;
                int v;

                if (optlen != sizeof(v))
                        return -EINVAL;
                if (copy_from_sockptr(&v, optval, sizeof(v)))
                        return -EFAULT;

                /* Remember the WRMIFWHOLE request before v is reduced to 0/1. */
                do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE);
                v = !!v;
                rtnl_lock();
                ret = 0;
                /* The settings are only rewritten when the PIM on/off state changes. */
                if (v != mrt->mroute_do_pim) {
                        mrt->mroute_do_pim = v;
                        mrt->mroute_do_assert = v;
                        mrt->mroute_do_wrvifwhole = do_wrmifwhole;
                }
                rtnl_unlock();
                return ret;
        }

#endif
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
        case MRT6_TABLE:
        {
                u32 v;

                if (optlen != sizeof(u32))
                        return -EINVAL;
                if (copy_from_sockptr(&v, optval, sizeof(v)))
                        return -EFAULT;
                /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
                if (v != RT_TABLE_DEFAULT && v >= 100000000)
                        return -EINVAL;
                /* The current table's mroute socket may not switch tables. */
                if (sk == rcu_access_pointer(mrt->mroute_sk))
                        return -EBUSY;

                rtnl_lock();
                ret = 0;
                mrt = ip6mr_new_table(net, v);
                if (IS_ERR(mrt))
                        ret = PTR_ERR(mrt);
                else
                        raw6_sk(sk)->ip6mr_table = v;
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *        Spurious command, or MRT6_VERSION which you cannot
         *        set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *        Getsockopt support for the multicast routing system.
 *
 *        Supports MRT6_VERSION, MRT6_ASSERT and, with
 *        CONFIG_IPV6_PIMSM_V2, MRT6_PIM.  The value is an int; at most
 *        sizeof(int) bytes are copied out and the length actually used
 *        is written back through optlen.
 *
 *        Returns 0 on success, -EOPNOTSUPP for non raw-ICMPv6 sockets,
 *        -ENOENT if the socket's table does not exist, -ENOPROTOOPT for
 *        unknown options, -EINVAL for a negative length and -EFAULT on
 *        copy failures.
 */

int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
                          sockptr_t optlen)
{
        struct net *net = sock_net(sk);
        struct mr_table *mrt;
        int user_len;
        int val;

        if (sk->sk_type != SOCK_RAW ||
            inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
                return -EOPNOTSUPP;

        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;

        switch (optname) {
        case MRT6_VERSION:
                val = 0x0305;
                break;
#ifdef CONFIG_IPV6_PIMSM_V2
        case MRT6_PIM:
                val = mrt->mroute_do_pim;
                break;
#endif
        case MRT6_ASSERT:
                val = mrt->mroute_do_assert;
                break;
        default:
                return -ENOPROTOOPT;
        }

        if (copy_from_sockptr(&user_len, optlen, sizeof(int)))
                return -EFAULT;

        /* Reject negative lengths, clamp everything else to sizeof(int). */
        if (user_len < 0)
                return -EINVAL;
        if (user_len > (int)sizeof(int))
                user_len = sizeof(int);

        if (copy_to_sockptr(optlen, &user_len, sizeof(int)))
                return -EFAULT;
        if (copy_to_sockptr(optval, &val, user_len))
                return -EFAULT;

        return 0;
}

/*
 *        The IP multicast ioctl support routines.
 *
 *        SIOCGETMIFCNT_IN6 fills the packet/byte counters of one
 *        multicast interface, SIOCGETSGCNT_IN6 fills the counters of one
 *        (source, group) cache entry.  @arg is dereferenced directly here
 *        (no user copy is done in this function).
 *
 *        Returns 0 on success, -ENOENT when the socket's table does not
 *        exist, -EINVAL for an out-of-range mif index, -EADDRNOTAVAIL
 *        when the mif or cache entry is not present and -ENOIOCTLCMD for
 *        any other command.
 */
int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
{
        struct net *net = sock_net(sk);
        struct sioc_mif_req6 *mif_req;
        struct sioc_sg_req6 *sg_req;
        struct vif_device *vif;
        struct mfc6_cache *mfc;
        struct mr_table *mrt;
        int ret = -EADDRNOTAVAIL;

        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;

        switch (cmd) {
        case SIOCGETMIFCNT_IN6:
                mif_req = arg;
                if (mif_req->mifi >= mrt->maxvif)
                        return -EINVAL;
                /* Clamp the index under speculation before using it. */
                mif_req->mifi = array_index_nospec(mif_req->mifi, mrt->maxvif);

                rcu_read_lock();
                vif = &mrt->vif_table[mif_req->mifi];
                if (VIF_EXISTS(mrt, mif_req->mifi)) {
                        mif_req->icount = READ_ONCE(vif->pkt_in);
                        mif_req->ocount = READ_ONCE(vif->pkt_out);
                        mif_req->ibytes = READ_ONCE(vif->bytes_in);
                        mif_req->obytes = READ_ONCE(vif->bytes_out);
                        ret = 0;
                }
                rcu_read_unlock();
                return ret;

        case SIOCGETSGCNT_IN6:
                sg_req = arg;

                rcu_read_lock();
                mfc = ip6mr_cache_find(mrt, &sg_req->src.sin6_addr,
                                       &sg_req->grp.sin6_addr);
                if (mfc) {
                        sg_req->pktcnt = atomic_long_read(&mfc->_c.mfc_un.res.pkt);
                        sg_req->bytecnt = atomic_long_read(&mfc->_c.mfc_un.res.bytes);
                        sg_req->wrong_if = atomic_long_read(&mfc->_c.mfc_un.res.wrong_if);
                        ret = 0;
                }
                rcu_read_unlock();
                return ret;

        default:
                return -ENOIOCTLCMD;
        }
}

#ifdef CONFIG_COMPAT
/*
 * Request layout copied from/to user space by ip6mr_compat_ioctl() for
 * SIOCGETSGCNT_IN6.  The counters are compat_ulong_t wide.
 */
struct compat_sioc_sg_req6 {
        struct sockaddr_in6 src;        /* source address used for the cache lookup */
        struct sockaddr_in6 grp;        /* group address used for the cache lookup */
        compat_ulong_t pktcnt;          /* out: packet counter of the cache entry */
        compat_ulong_t bytecnt;         /* out: byte counter of the cache entry */
        compat_ulong_t wrong_if;        /* out: wrong-interface counter of the entry */
};

/*
 * Request layout copied from/to user space by ip6mr_compat_ioctl() for
 * SIOCGETMIFCNT_IN6.  The counters are compat_ulong_t wide.
 */
struct compat_sioc_mif_req6 {
        mifi_t        mifi;             /* in: index into the table's vif_table */
        compat_ulong_t icount;          /* out: vif pkt_in */
        compat_ulong_t ocount;          /* out: vif pkt_out */
        compat_ulong_t ibytes;          /* out: vif bytes_in */
        compat_ulong_t obytes;          /* out: vif bytes_out */
};

/*
 * Compat variant of the multicast routing ioctls.
 *
 * The request is copied in from @arg using the compat layouts above,
 * filled from the socket's multicast routing table and copied back.
 *
 * Returns 0 on success, -ENOENT when the table does not exist, -EFAULT
 * on a failed user copy, -EINVAL for an out-of-range mif index,
 * -EADDRNOTAVAIL when the mif or cache entry is not present and
 * -ENOIOCTLCMD for any other command.
 */
int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        struct net *net = sock_net(sk);
        struct compat_sioc_mif_req6 mif_req;
        struct compat_sioc_sg_req6 sg_req;
        struct vif_device *vif;
        struct mfc6_cache *mfc;
        struct mr_table *mrt;

        mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
        if (!mrt)
                return -ENOENT;

        switch (cmd) {
        case SIOCGETMIFCNT_IN6:
                if (copy_from_user(&mif_req, arg, sizeof(mif_req)))
                        return -EFAULT;
                if (mif_req.mifi >= mrt->maxvif)
                        return -EINVAL;
                /* Clamp the index under speculation before using it. */
                mif_req.mifi = array_index_nospec(mif_req.mifi, mrt->maxvif);

                rcu_read_lock();
                vif = &mrt->vif_table[mif_req.mifi];
                if (!VIF_EXISTS(mrt, mif_req.mifi)) {
                        rcu_read_unlock();
                        return -EADDRNOTAVAIL;
                }
                mif_req.icount = READ_ONCE(vif->pkt_in);
                mif_req.ocount = READ_ONCE(vif->pkt_out);
                mif_req.ibytes = READ_ONCE(vif->bytes_in);
                mif_req.obytes = READ_ONCE(vif->bytes_out);
                rcu_read_unlock();

                /* The user copy happens outside the RCU read section. */
                return copy_to_user(arg, &mif_req, sizeof(mif_req)) ? -EFAULT : 0;

        case SIOCGETSGCNT_IN6:
                if (copy_from_user(&sg_req, arg, sizeof(sg_req)))
                        return -EFAULT;

                rcu_read_lock();
                mfc = ip6mr_cache_find(mrt, &sg_req.src.sin6_addr,
                                       &sg_req.grp.sin6_addr);
                if (!mfc) {
                        rcu_read_unlock();
                        return -EADDRNOTAVAIL;
                }
                sg_req.pktcnt = atomic_long_read(&mfc->_c.mfc_un.res.pkt);
                sg_req.bytecnt = atomic_long_read(&mfc->_c.mfc_un.res.bytes);
                sg_req.wrong_if = atomic_long_read(&mfc->_c.mfc_un.res.wrong_if);
                rcu_read_unlock();

                /* The user copy happens outside the RCU read section. */
                return copy_to_user(arg, &sg_req, sizeof(sg_req)) ? -EFAULT : 0;

        default:
                return -ENOIOCTLCMD;
        }
}
#endif

/*
 * Final step of the forward hook: bump the OUTFORWDATAGRAMS counter for
 * the inet6 device behind the skb's dst, then pass the skb to
 * dst_output() and return its result.
 */
static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);

        IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

        return dst_output(net, sk, skb);
}

/*
 *        Processing handlers for ip6mr_forward
 */

/*
 * Send one skb out of the multicast interface @vifi of table @mrt.
 *
 * The skb is always consumed: it is either freed here or handed to the
 * NF_INET_FORWARD hook with ip6mr_forward2_finish() as continuation.
 * Returns 0 on every drop path, otherwise whatever NF_HOOK() returns.
 */
static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
                          struct sk_buff *skb, int vifi)
{
        struct vif_device *vif = &mrt->vif_table[vifi];
        struct net_device *vif_dev;
        struct ipv6hdr *ipv6h;
        struct dst_entry *dst;
        struct flowi6 fl6;

        /* No device behind this vif: nothing to send on. */
        vif_dev = vif_dev_read(vif);
        if (!vif_dev)
                goto out_free;

#ifdef CONFIG_IPV6_PIMSM_V2
        /*
         * Register vif: account the packet on the vif and its device,
         * report it with MRT6MSG_WHOLEPKT via ip6mr_cache_report() and
         * drop the skb instead of transmitting it.
         */
        if (vif->flags & MIFF_REGISTER) {
                WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
                WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
                DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
                DEV_STATS_INC(vif_dev, tx_packets);
                ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
                goto out_free;
        }
#endif

        ipv6h = ipv6_hdr(skb);

        /* Route towards the packet's destination out of the vif's link. */
        fl6 = (struct flowi6) {
                .flowi6_oif = vif->link,
                .daddr = ipv6h->daddr,
        };

        dst = ip6_route_output(net, NULL, &fl6);
        if (dst->error) {
                dst_release(dst);
                goto out_free;
        }

        /* Replace whatever dst the skb carried with the new output route. */
        skb_dst_drop(skb);
        skb_dst_set(skb, dst);

        /*
         * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
         * not only before forwarding, but after forwarding on all output
         * interfaces. It is clear, if mrouter runs a multicasting
         * program, it should receive packets not depending to what interface
         * program is joined.
         * If we will not make it, the program will have to join on all
         * interfaces. On the other hand, multihoming host (or router, but
         * not mrouter) cannot join to more than one interface - it will
         * result in receiving multiple packets.
         */
        skb->dev = vif_dev;
        /* Counters are bumped before skb_cow(), so a failed cow is still counted. */
        WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
        WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);

        /* We are about to write */
        /* XXX: extension headers? */
        if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev)))
                goto out_free;

        /* Re-read the header pointer after skb_cow() before modifying it. */
        ipv6h = ipv6_hdr(skb);
        ipv6h->hop_limit--;

        IP6CB(skb)->flags |= IP6SKB_FORWARDED;

        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
                       net, NULL, skb, skb->dev, vif_dev,
                       ip6mr_forward2_finish);

out_free:
        kfree_skb(skb);
        return 0;
}

/*
 * Called with rcu_read_lock()
 *
 * Scan the vif table of @mrt from the highest index downwards and return
 * the index of the first entry whose device is @dev, or a negative value
 * when no entry matches.
 */
static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
{
        /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */
        int idx = READ_ONCE(mrt->maxvif);

        while (--idx >= 0) {
                if (rcu_access_pointer(mrt->vif_table[idx].dev) == dev)
                        return idx;
        }

        return idx;
}

/*
 * Called under rcu_read_lock()
 *
 * Forward @skb, received on @dev, according to cache entry @c of table
 * @mrt.  The entry's packet/byte counters and lastuse stamp are updated
 * unconditionally.  The skb is consumed on every path: it is either
 * passed to ip6mr_forward2() (cloned for all but the last output vif) or
 * freed.
 */
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
                           struct net_device *dev, struct sk_buff *skb,
                           struct mfc6_cache *c)
{
        int psend = -1;         /* pending output vif, -1 when none yet */
        int vif, ct;
        int true_vifi = ip6mr_find_vif(mrt, dev);       /* vif the packet arrived on */

        vif = c->_c.mfc_parent;
        atomic_long_inc(&c->_c.mfc_un.res.pkt);
        atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
        WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);

        if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) {
                struct mfc6_cache *cache_proxy;

                /* For an (*,G) entry, we only check that the incoming
                 * interface is part of the static tree.
                 */
                cache_proxy = mr_mfc_find_any_parent(mrt, vif);
                if (cache_proxy &&
                    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
                        goto forward;
        }

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
                atomic_long_inc(&c->_c.mfc_un.res.wrong_if);

                /*
                 * Report at most once per MFC_ASSERT_THRESH interval, and
                 * only when asserts are enabled and the arrival vif is known.
                 */
                if (true_vifi >= 0 && mrt->mroute_do_assert &&
                    /* pimsm uses asserts, when switching from RPT to SPT,
                       so that we cannot check that packet arrived on an oif.
                       It is bad, but otherwise we would need to move pretty
                       large chunk of pimd to kernel. Ough... --ANK
                     */
                    (mrt->mroute_do_pim ||
                     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               c->_c.mfc_un.res.last_assert +
                               MFC_ASSERT_THRESH)) {
                        c->_c.mfc_un.res.last_assert = jiffies;
                        ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
                        if (mrt->mroute_do_wrvifwhole)
                                ip6mr_cache_report(mrt, skb, true_vifi,
                                                   MRT6MSG_WRMIFWHOLE);
                }
                goto dont_forward;
        }

forward:
        /* Input accounting is done on the entry's parent vif. */
        WRITE_ONCE(mrt->vif_table[vif].pkt_in,
                   mrt->vif_table[vif].pkt_in + 1);
        WRITE_ONCE(mrt->vif_table[vif].bytes_in,
                   mrt->vif_table[vif].bytes_in + skb->len);

        /*
         *        Forward the frame
         */
        if (ipv6_addr_any(&c->mf6c_origin) &&
            ipv6_addr_any(&c->mf6c_mcastgrp)) {
                if (true_vifi >= 0 &&
                    true_vifi != c->_c.mfc_parent &&
                    ipv6_hdr(skb)->hop_limit >
                                c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
                        /* It's an (*,*) entry and the packet is not coming from
                         * the upstream: forward the packet to the upstream
                         * only.
                         */
                        psend = c->_c.mfc_parent;
                        goto last_forward;
                }
                goto dont_forward;
        }
        /*
         * Walk the entry's vif range from high to low.  Each eligible vif
         * is remembered in psend; the previously remembered one gets a
         * clone, so the original skb is kept for the last eligible vif.
         */
        for (ct = c->_c.mfc_un.res.maxvif - 1;
             ct >= c->_c.mfc_un.res.minvif; ct--) {
                /* For (*,G) entry, don't forward to the incoming interface */
                if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) &&
                    ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                /* A failed clone silently skips that vif. */
                                if (skb2)
                                        ip6mr_forward2(net, mrt, skb2, psend);
                        }
                        psend = ct;
                }
        }
last_forward:
        if (psend != -1) {
                /* The original skb goes out of the last selected vif. */
                ip6mr_forward2(net, mrt, skb, psend);
                return;
        }

dont_forward:
        kfree_skb(skb);
}


/*
 *        Multicast packets for forwarding arrive here
 */

int ip6_mr_input(struct sk_buff *skb)
{
        struct mfc6_cache *cache;
        struct net *net = dev_net(skb->dev);
        struct mr_table *mrt;
        struct flowi6 fl6 = {
                .flowi6_iif        = skb->dev->ifindex,
                .flowi6_mark        = skb->mark,
        };
        int err;
        struct net_device *dev;

        /* skb->dev passed in is the master dev for vrfs.
         * Get the proper interface that does have a vif associated with it.
         */
        dev = skb->dev;
        if (netif_is_l3_master(skb->dev)) {
                dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
                if (!dev) {
                        kfree_skb(skb);
                        return -ENODEV;
                }
        }

        err = ip6mr_fib_lookup(net, &fl6, &mrt);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }

        cache = ip6mr_cache_find(mrt,
                                 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
        if (!cache) {
                int vif = ip6mr_find_vif(mrt, dev);

                if (vif >= 0)
                        cache = ip6mr_cache_find_any(mrt,
                                                     &ipv6_hdr(skb)->daddr,
                                                     vif);
        }

        /*
         *        No usable cache entry
         */
        if (!cache) {
                int vif;

                vif = ip6mr_find_vif(mrt, dev);
                if (vif >= 0) {
                        int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);

                        return err;
                }
                kfree_skb(skb);
                return -ENODEV;
        }

        ip6_mr_forward(net, mrt, dev, skb, cache);

        return 0;
}

/*
 * Fill multicast route information for the route attached to @skb.
 *
 * The default table is searched for a cache entry matching the route's
 * source/destination, with a wildcard fallback on the vif of skb->dev.
 * When an entry exists it is dumped through mr_fill_mroute().  Otherwise
 * a minimal skb holding only an IPv6 header with the two addresses is
 * built and queued through ip6mr_cache_unresolved().
 *
 * Returns -ENOENT without a default table, -ENODEV without a usable
 * device/vif, -ENOMEM on allocation failure, else the result of the
 * helper that handled the request.
 */
int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
                    u32 portid)
{
        struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
        struct mfc6_cache *cache;
        struct net_device *dev;
        struct sk_buff *skb2;
        struct mr_table *mrt;
        struct ipv6hdr *iph;
        int vif;
        int err;

        rcu_read_lock();

        mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
        if (!mrt) {
                err = -ENOENT;
                goto out_unlock;
        }

        cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
        if (!cache && skb->dev) {
                vif = ip6mr_find_vif(mrt, skb->dev);
                if (vif >= 0)
                        cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr,
                                                     vif);
        }

        if (cache) {
                err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
                goto out_unlock;
        }

        /* No entry: the request can only be queued against a known vif. */
        err = -ENODEV;
        dev = skb->dev;
        if (!dev)
                goto out_unlock;
        vif = ip6mr_find_vif(mrt, dev);
        if (vif < 0)
                goto out_unlock;

        /* really correct? */
        skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
        if (!skb2) {
                err = -ENOMEM;
                goto out_unlock;
        }

        NETLINK_CB(skb2).portid = portid;
        skb_reset_transport_header(skb2);

        skb_put(skb2, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb2);

        /* Header carrying only the addresses; version 0 marks it as not a real packet. */
        iph = ipv6_hdr(skb2);
        iph->version = 0;
        iph->priority = 0;
        iph->flow_lbl[0] = 0;
        iph->flow_lbl[1] = 0;
        iph->flow_lbl[2] = 0;
        iph->payload_len = 0;
        iph->nexthdr = IPPROTO_NONE;
        iph->hop_limit = 0;
        iph->saddr = rt->rt6i_src.addr;
        iph->daddr = rt->rt6i_dst.addr;

        err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);

out_unlock:
        rcu_read_unlock();
        return err;
}

/*
 * Append one netlink route message describing cache entry @c to @skb.
 *
 * The message carries an rtmsg header followed by RTA_TABLE, RTA_SRC and
 * RTA_DST, plus whatever mr_fill_mroute() adds.  An -ENOENT result from
 * mr_fill_mroute() is tolerated so that unresolved entries do not abort
 * a dump.
 *
 * Returns 0 on success or -EMSGSIZE when the message could not be built
 * (the partial message is cancelled).
 */
static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                             u32 portid, u32 seq, struct mfc6_cache *c, int cmd,
                             int flags)
{
        struct nlmsghdr *nlh;
        struct rtmsg *hdr;
        int err;

        nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*hdr), flags);
        if (!nlh)
                return -EMSGSIZE;

        /* Fixed part of the route header. */
        hdr = nlmsg_data(nlh);
        hdr->rtm_family   = RTNL_FAMILY_IP6MR;
        hdr->rtm_dst_len  = 128;
        hdr->rtm_src_len  = 128;
        hdr->rtm_tos      = 0;
        hdr->rtm_table    = mrt->id;
        hdr->rtm_type     = RTN_MULTICAST;
        hdr->rtm_scope    = RT_SCOPE_UNIVERSE;
        hdr->rtm_protocol = (c->_c.mfc_flags & MFC_STATIC) ? RTPROT_STATIC
                                                           : RTPROT_MROUTED;
        hdr->rtm_flags    = 0;

        /* Attributes, in the same order as before: table, source, group. */
        if (nla_put_u32(skb, RTA_TABLE, mrt->id))
                goto cancel;
        if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin))
                goto cancel;
        if (nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp))
                goto cancel;

        err = mr_fill_mroute(mrt, skb, &c->_c, hdr);
        /* do not break the dump if cache is unresolved */
        if (err < 0 && err != -ENOENT)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Adapter with the generic struct mr_mfc signature: casts @c to the IPv6
 * cache entry type and forwards everything to ip6mr_fill_mroute().
 */
static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                              u32 portid, u32 seq, struct mr_mfc *c,
                              int cmd, int flags)
{
        struct mfc6_cache *mfc = (struct mfc6_cache *)c;

        return ip6mr_fill_mroute(mrt, skb, portid, seq, mfc, cmd, flags);
}

/*
 * Size of the netlink payload needed for one multicast route message.
 *
 * Every message holds the rtmsg header plus table, source and group
 * attributes.  When @unresolved is false the size also includes the input
 * interface, a multipath nest with @maxvif next hops and the statistics
 * attribute.
 */
static int mr6_msgsize(bool unresolved, int maxvif)
{
        size_t size = NLMSG_ALIGN(sizeof(struct rtmsg));

        size += nla_total_size(4);                              /* RTA_TABLE */
        size += nla_total_size(sizeof(struct in6_addr));        /* RTA_SRC */
        size += nla_total_size(sizeof(struct in6_addr));        /* RTA_DST */

        if (unresolved)
                return size;

        size += nla_total_size(4);                              /* RTA_IIF */
        size += nla_total_size(0);                              /* RTA_MULTIPATH */
        size += maxvif * NLA_ALIGN(sizeof(struct rtnexthop));
                                                                /* RTA_MFC_STATS */
        size += nla_total_size_64bit(sizeof(struct rta_mfc_stats));

        return size;
}

static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
                              int cmd)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
        int err = -ENOBUFS;

        skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif),
                        GFP_ATOMIC);
        if (!skb)
                goto errout;

        err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
        if (err < 0)
                goto errout;

        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC);
        return;

errout:
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err);
}

/*
 * Size of the netlink payload for a cache report that carries
 * @payloadlen bytes of packet data after the fixed attributes.
 */
static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
{
        size_t size = NLMSG_ALIGN(sizeof(struct rtgenmsg));

        size += nla_total_size(1);                        /* IP6MRA_CREPORT_MSGTYPE */
        size += nla_total_size(4);                        /* IP6MRA_CREPORT_MIF_ID */
        size += nla_total_size(sizeof(struct in6_addr));  /* IP6MRA_CREPORT_SRC_ADDR */
        size += nla_total_size(sizeof(struct in6_addr));  /* IP6MRA_CREPORT_DST_ADDR */
        size += nla_total_size(payloadlen);               /* IP6MRA_CREPORT_PKT */

        return size;
}

/*
 * Send a RTM_NEWCACHEREPORT to RTNLGRP_IPV6_MROUTE_R listeners.
 *
 * @pkt holds a struct mrt6msg at its transport header; its fields become
 * the fixed attributes and the bytes of @pkt past sizeof(struct mrt6msg)
 * are copied into IP6MRA_CREPORT_PKT.  On any failure -ENOBUFS is
 * recorded on the group through rtnl_set_sk_err().
 */
static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
        struct net *net = read_pnet(&mrt->net);
        struct rtgenmsg *hdr;
        struct nlmsghdr *nlh;
        struct mrt6msg *msg;
        struct nlattr *attr;
        struct sk_buff *skb;
        int payloadlen;

        payloadlen = pkt->len - sizeof(struct mrt6msg);
        msg = (struct mrt6msg *)skb_transport_header(pkt);

        skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC);
        if (!skb)
                goto errout;

        nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
                        sizeof(struct rtgenmsg), 0);
        if (!nlh)
                goto errout;

        hdr = nlmsg_data(nlh);
        hdr->rtgen_family = RTNL_FAMILY_IP6MR;

        /* Fixed attributes taken from the report header. */
        if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype))
                goto nla_put_failure;
        if (nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif))
                goto nla_put_failure;
        if (nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src))
                goto nla_put_failure;
        if (nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst))
                goto nla_put_failure;

        /* Reserve room for the packet bytes, then copy them in place. */
        attr = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen);
        if (!attr)
                goto nla_put_failure;
        if (skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(attr),
                          payloadlen))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);

        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC);
        return;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
errout:
        /* skb may be NULL when the allocation itself failed. */
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS);
}

/*
 * Attribute policy used by ip6mr_rtm_valid_getroute_req() when parsing a
 * route get request: source and group must be exactly one IPv6 address
 * long, the table id is a u32.
 */
static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = {
        [RTA_SRC]                = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
        [RTA_DST]                = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
        [RTA_TABLE]                = { .type = NLA_U32 },
};

/*
 * Parse and validate a multicast route get request.
 *
 * Attributes are parsed into @tb with ip6mr_getroute_policy.  The rtmsg
 * header may only set rtm_src_len/rtm_dst_len, each to 0 or 128; every
 * other checked field must be zero.  An address attribute is rejected
 * when its matching length field is zero.
 *
 * Returns 0 when the request is acceptable, the nlmsg_parse() error, or
 * -EINVAL with an extack message.
 */
static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb,
                                        const struct nlmsghdr *nlh,
                                        struct nlattr **tb,
                                        struct netlink_ext_ack *extack)
{
        struct rtmsg *hdr;
        int err;

        err = nlmsg_parse(nlh, sizeof(*hdr), tb, RTA_MAX, ip6mr_getroute_policy,
                          extack);
        if (err)
                return err;

        hdr = nlmsg_data(nlh);

        /* Prefix lengths are either absent (0) or a full address (128). */
        if (hdr->rtm_src_len && hdr->rtm_src_len != 128)
                goto bad_header;
        if (hdr->rtm_dst_len && hdr->rtm_dst_len != 128)
                goto bad_header;
        /* Nothing else in the header may be set for a get request. */
        if (hdr->rtm_tos || hdr->rtm_table || hdr->rtm_protocol)
                goto bad_header;
        if (hdr->rtm_scope || hdr->rtm_type || hdr->rtm_flags)
                goto bad_header;

        /* An address attribute needs its length field to be set. */
        if (tb[RTA_SRC] && !hdr->rtm_src_len)
                goto bad_len;
        if (tb[RTA_DST] && !hdr->rtm_dst_len)
                goto bad_len;

        return 0;

bad_header:
        NL_SET_ERR_MSG_MOD(extack,
                           "Invalid values in header for multicast route get request");
        return -EINVAL;

bad_len:
        NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
        return -EINVAL;
}

static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct in6_addr src = {}, grp = {};
        struct nlattr *tb[RTA_MAX + 1];
        struct mfc6_cache *cache;
        struct mr_table *mrt;
        struct sk_buff *skb;
        u32 tableid;
        int err;

        err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        if (tb[RTA_SRC])
                src = nla_get_in6_addr(tb[RTA_SRC]);
        if (tb[RTA_DST])
                grp = nla_get_in6_addr(tb[RTA_DST]);
        tableid = nla_get_u32_default(tb[RTA_TABLE], 0);

        mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
        if (!mrt) {
                NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
                return -ENOENT;
        }

        /* entries are added/deleted only under RTNL */
        rcu_read_lock();
        cache = ip6mr_cache_find(mrt, &src, &grp);
        rcu_read_unlock();
        if (!cache) {
                NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
                return -ENOENT;
        }

        skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
        if (!skb)
                return -ENOBUFS;

        err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
                                nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }

        return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
}

/*
 * Netlink dump callback for multicast routes.
 *
 * With strict checking the request is validated into a dump filter
 * first.  When the filter names a table only that table is dumped;
 * otherwise all tables are walked through mr_rtm_dumproute().
 *
 * For a filtered dump of a table that does not exist, requests of a
 * family other than RTNL_FAMILY_IP6MR just end the dump (skb->len is
 * returned), while IP6MR requests fail with -ENOENT.
 */
static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        struct fib_dump_filter filter = {
                .rtnl_held = true,
        };
        struct mr_table *mrt;
        int err;

        if (cb->strict_check) {
                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
                if (err < 0)
                        return err;
        }

        /* No table filter: dump every table. */
        if (!filter.table_id)
                return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
                                        _ip6mr_fill_mroute, &mfc_unres_lock,
                                        &filter);

        mrt = __ip6mr_get_table(net, filter.table_id);
        if (!mrt) {
                if (rtnl_msg_family(nlh) != RTNL_FAMILY_IP6MR)
                        return skb->len;

                NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
                return -ENOENT;
        }

        err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
                            &mfc_unres_lock, &filter);
        /* Data already placed in the skb takes precedence over the error. */
        if (skb->len)
                return skb->len;

        return err;
}





































































  125 











  125 






  125 






























































































































































































































  387 

   22 
  169 






  509 

  510 







   73 















































































































































































































  509 
















  510 







  510 





  509 


















  509 

  510 














  509 





   73 
   73 
   73 
    3 
   71 



   24 

















  492 






  491 












  493 


  493 
















   52 
   52 


   52 

    4 




    4 
   50 

















































































































































   52 































































































































































  460 













 1306 


























 1309 
    1 













 1315 


 1315 
 1320 
 1314 










   71 


























































































  448 

 1273 



 1214 
 1279 




    5 
    5 














   47 

   28 

















   75 
   76 


   75 







   24 










    3 


    3 




























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/file.c
 *
 *  Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 *  Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <linux/file_ref.h>
#include <net/sock.h>
#include <linux/init_task.h>

#include "internal.h"

/*
 * Repair a reference count after a put() that landed outside the valid
 * zone.  Two cases are handled:
 *
 *  - @cnt is at or beyond FILE_REF_RELEASED: the put() was imbalanced.
 *    Warn once and park the counter at FILE_REF_DEAD.
 *  - @cnt is above FILE_REF_MAXREF: the counter is saturated.  Put it back
 *    to the FILE_REF_SATURATED value.
 *
 * Always returns false: the caller must never deconstruct the object on
 * this path.
 */
static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt)
{
        const bool imbalanced = WARN_ONCE(cnt >= FILE_REF_RELEASED,
                                          "imbalanced put on file reference count");

        if (imbalanced)
                atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
        else if (cnt > FILE_REF_MAXREF)
                atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);

        return false;
}

/**
 * __file_ref_put - Slowpath of file_ref_put()
 * @ref:        Pointer to the reference count
 * @cnt:        Current reference count
 *
 * Invoked when the reference count is outside of the valid zone.
 *
 * Return:
 *        True if this was the last reference with no future references
 *        possible. This signals the caller that it can safely schedule the
 *        object, which is protected by the reference counter, for
 *        deconstruction.
 *
 *        False if there are still active references or the put() raced
 *        with a concurrent get()/put() pair. Caller is not allowed to
 *        deconstruct the protected object. False is also returned for
 *        every value handled by __file_ref_put_badval() (imbalanced put
 *        or saturated counter).
 */
bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
{
        /* Did this drop the last reference? */
        if (likely(cnt == FILE_REF_NOREF)) {
                /*
                 * Carefully try to set the reference count to FILE_REF_DEAD.
                 *
                 * This can fail if a concurrent get() operation has
                 * elevated it again or the corresponding put() even marked
                 * it dead already. Both are valid situations and do not
                 * require a retry. If this fails the caller is not
                 * allowed to deconstruct the object.
                 */
                if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
                        return false;

                /*
                 * The caller can safely schedule the object for
                 * deconstruction. Provide acquire ordering.
                 */
                smp_acquire__after_ctrl_dep();
                return true;
        }

        /*
         * Anything other than FILE_REF_NOREF is a bad value: let the
         * helper warn/repair the counter. It always reports false.
         */
        return __file_ref_put_badval(ref, cnt);
}
EXPORT_SYMBOL_GPL(__file_ref_put);

/*
 * Ceiling on the size of a descriptor table: expand_files() fails with
 * -EMFILE for nr >= sysctl_nr_open and alloc_fdtable() clamps to it.
 * Defaults to 1024*1024.
 */
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
/*
 * NOTE(review): presumably the lower/upper bounds accepted when
 * sysctl_nr_open is written; the code consuming them is not in this part
 * of the file - confirm.
 */
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
/*
 * The smaller of INT_MAX and the number of pointers that fit in size_t,
 * rounded down to a multiple of BITS_PER_LONG by the "& -BITS_PER_LONG".
 */
unsigned int sysctl_nr_open_max =
        __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

/*
 * Release a dynamically allocated fdtable: the fd pointer array, the bitmap
 * buffer and the descriptor itself.  Only ->open_fds is freed for the
 * bitmaps because alloc_fdtable() carves ->close_on_exec and
 * ->full_fds_bits out of that same allocation.
 */
static void __free_fdtable(struct fdtable *fdt)
{
        void *fd_array = fdt->fd;
        void *bitmaps = fdt->open_fds;

        kvfree(fd_array);
        kvfree(bitmaps);
        kfree(fdt);
}

/*
 * RCU callback (queued by expand_fdtable() via call_rcu()): recover the
 * fdtable that embeds @rcu and free it.
 */
static void free_fdtable_rcu(struct rcu_head *rcu)
{
        struct fdtable *stale = container_of(rcu, struct fdtable, rcu);

        __free_fdtable(stale);
}

/*
 * Number of longs needed for a second-level bitmap that carries one bit
 * per long of an nr-bit bitmap.
 */
#define BITBIT_NR(nr)        BITS_TO_LONGS(BITS_TO_LONGS(nr))
/* Size in bytes of that second-level bitmap. */
#define BITBIT_SIZE(nr)        (BITBIT_NR(nr) * sizeof(long))

/* Number of longs covered by ->max_fds descriptors. */
#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds
/*
 * Copy the first 'copy_words' longs worth of descriptor bits from the old
 * table's ->open_fds and ->close_on_exec into the new table, and
 * 'copy_words' bits of ->full_fds_bits, extending each destination bitmap
 * to the new table's full size via bitmap_copy_and_extend().  This does not
 * copy the file pointers.  Called with the files spinlock held for write.
 */
static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
                            unsigned int copy_words)
{
        const unsigned int total_words = fdt_words(nfdt);
        const unsigned int copy_bits = copy_words * BITS_PER_LONG;
        const unsigned int total_bits = total_words * BITS_PER_LONG;

        /* Per-descriptor bitmaps: one bit per fd. */
        bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds,
                        copy_bits, total_bits);
        bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec,
                        copy_bits, total_bits);

        /* Second-level bitmap: one bit per word of ->open_fds. */
        bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits,
                        copy_words, total_words);
}

/*
 * Populate a freshly allocated, larger table from the old one: copy every
 * file pointer, zero the slots the old table did not have, then bring the
 * bitmaps across.  The new table must be at least as big as the old one
 * (BUG otherwise).  Called with the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
        size_t old_bytes, extra_bytes;

        BUG_ON(nfdt->max_fds < ofdt->max_fds);

        old_bytes = ofdt->max_fds * sizeof(struct file *);
        extra_bytes = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);

        memcpy(nfdt->fd, ofdt->fd, old_bytes);
        /* The tail starts right after the last slot copied above. */
        memset(&nfdt->fd[ofdt->max_fds], 0, extra_bytes);

        copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt));
}

/*
 * Allocate an fdtable with room for at least 'slots_wanted' descriptors.
 *
 * Returns the new table, ERR_PTR(-EMFILE) if clamping to sysctl_nr_open
 * leaves fewer than 'slots_wanted' slots, or ERR_PTR(-ENOMEM) if any of the
 * allocations fails.  Nothing here initialises the fd array or the bitmaps;
 * the callers (copy_fdtable(), dup_fd()) fill them in.
 *
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 */
static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
        struct fdtable *fdt;
        unsigned int nr;
        void *data;

        /*
         * Figure out how many fds we actually want to support in this fdtable.
         * Allocation steps are keyed to the size of the fdarray, since it
         * grows far faster than any of the other dynamic data. We try to fit
         * the fdarray into comfortable page-tuned chunks: starting at 1024B
         * and growing in powers of two from there on.  Since we are called only
         * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
         * already gives BITS_PER_LONG slots), the above boils down to
         * 1.  use the smallest power of two large enough to give us that many
         * slots.
         * 2.  on 32bit skip 64 and 128 - the minimal capacity we want there is
         * 256 slots (i.e. 1Kb fd array).
         * 3.  on 64bit don't skip anything, 1Kb fd array means 128 slots there
         * and we are never going to be asked for 64 or less.
         */
        if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
                nr = 256;
        else
                nr = roundup_pow_of_two(slots_wanted);
        /*
         * Note that this can drive nr *below* what we had passed if sysctl_nr_open
         * had been set lower between the check in expand_files() and here.
         *
         * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
         * bitmaps handling below becomes unpleasant, to put it mildly...
         */
        if (unlikely(nr > sysctl_nr_open)) {
                nr = round_down(sysctl_nr_open, BITS_PER_LONG);
                if (nr < slots_wanted)
                        return ERR_PTR(-EMFILE);
        }

        fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
        if (!fdt)
                goto out;
        fdt->max_fds = nr;
        data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_fdt;
        fdt->fd = data;

        /*
         * One buffer holds all three bitmaps back to back: ->open_fds and
         * ->close_on_exec take nr / BITS_PER_BYTE bytes each, followed by
         * BITBIT_SIZE(nr) bytes of ->full_fds_bits.  The buffer is never
         * smaller than L1_CACHE_BYTES.
         */
        data = kvmalloc(max_t(size_t,
                                 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
                                 GFP_KERNEL_ACCOUNT);
        if (!data)
                goto out_arr;
        fdt->open_fds = data;
        data += nr / BITS_PER_BYTE;
        fdt->close_on_exec = data;
        data += nr / BITS_PER_BYTE;
        fdt->full_fds_bits = data;

        return fdt;

        /* Unwind in reverse order of allocation. */
out_arr:
        kvfree(fdt->fd);
out_fdt:
        kfree(fdt);
out:
        return ERR_PTR(-ENOMEM);
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable (fd array plus bitmaps) with
 * room for at least nr + 1 descriptors, copy the current table into it and
 * publish it in files->fdt.  The previous table is freed after an RCU grace
 * period unless it is the one embedded in the files_struct.
 * Return <0 error code on error; 0 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 * It is dropped while allocating, so the table may be observed by others in
 * the meantime.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *new_fdt, *cur_fdt;

        /* The allocation below uses GFP_KERNEL_ACCOUNT: not under the spinlock. */
        spin_unlock(&files->file_lock);
        new_fdt = alloc_fdtable(nr + 1);

        /* make sure all fd_install() have seen resize_in_progress
         * or have finished their rcu_read_lock_sched() section.
         */
        if (atomic_read(&files->count) > 1)
                synchronize_rcu();

        /* Retake the lock before reporting failure: callers expect it held. */
        spin_lock(&files->file_lock);
        if (IS_ERR(new_fdt))
                return PTR_ERR(new_fdt);
        cur_fdt = files_fdtable(files);
        BUG_ON(nr < cur_fdt->max_fds);
        copy_fdtable(new_fdt, cur_fdt);
        rcu_assign_pointer(files->fdt, new_fdt);
        /* The embedded table lives inside files_struct and is never freed here. */
        if (cur_fdt != &files->fdtab)
                call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
        /* coupled with smp_rmb() in fd_install() */
        smp_wmb();
        return 0;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * 'nr' is the descriptor index that must become valid.
 * Return <0 error code on error; 0 on success.  0 is returned both when the
 * table was already large enough and when it has just been grown; -EMFILE
 * when nr >= sysctl_nr_open.
 * The files->file_lock should be held on entry, and will be held on exit.
 * It may be dropped and retaken in between.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
        __releases(files->file_lock)
        __acquires(files->file_lock)
{
        struct fdtable *fdt;
        int error;

repeat:
        fdt = files_fdtable(files);

        /* Do we need to expand? */
        if (nr < fdt->max_fds)
                return 0;

        /*
         * Another resize is already running: sleep without the lock until
         * it finishes, then re-evaluate against the (possibly new) table.
         */
        if (unlikely(files->resize_in_progress)) {
                spin_unlock(&files->file_lock);
                wait_event(files->resize_wait, !files->resize_in_progress);
                spin_lock(&files->file_lock);
                goto repeat;
        }

        /* Can we expand? */
        if (unlikely(nr >= sysctl_nr_open))
                return -EMFILE;

        /* All good, so we try */
        files->resize_in_progress = true;
        error = expand_fdtable(files, nr);
        files->resize_in_progress = false;

        /* Release anybody parked in the wait_event() above. */
        wake_up_all(&files->resize_wait);
        return error;
}

/*
 * Make the close-on-exec bit of @fd match @set.  When clearing, the bit is
 * only written if it is currently set (presumably to avoid a needless
 * store to the bitmap).
 */
static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
                                       bool set)
{
        if (set) {
                __set_bit(fd, fdt->close_on_exec);
                return;
        }

        if (test_bit(fd, fdt->close_on_exec))
                __clear_bit(fd, fdt->close_on_exec);
}

/*
 * Mark @fd as in use, set its close-on-exec state to @set and, if that
 * filled the containing word of ->open_fds completely, flag the word in
 * ->full_fds_bits.
 */
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
        const unsigned int word = fd / BITS_PER_LONG;

        __set_bit(fd, fdt->open_fds);
        __set_close_on_exec(fd, fdt, set);

        /* All bits set in this word? */
        if (fdt->open_fds[word] == ~0UL)
                __set_bit(word, fdt->full_fds_bits);
}

/*
 * Mark @fd as free.  The word holding it can no longer be full, so drop its
 * bit in ->full_fds_bits if it was set.
 */
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
        const unsigned int word = fd / BITS_PER_LONG;

        __clear_bit(fd, fdt->open_fds);

        if (test_bit(word, fdt->full_fds_bits))
                __clear_bit(word, fdt->full_fds_bits);
}

/* Report whether @fd is marked in use in the ->open_fds bitmap of @fdt. */
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
{
        const unsigned long *in_use = fdt->open_fds;

        return test_bit(fd, in_use);
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * punch_hole is optional - when close_range() is asked to unshare
 * and close, we don't need to copy descriptors in that range, so
 * a smaller cloned descriptor table might suffice if the last
 * currently opened descriptor falls into that range.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, struct fd_range *punch_hole)
{
        unsigned int last = find_last_bit(fdt->open_fds, fdt->max_fds);

        if (last == fdt->max_fds)
                return NR_OPEN_DEFAULT;
        if (punch_hole && punch_hole->to >= last && punch_hole->from <= last) {
                last = find_last_bit(fdt->open_fds, punch_hole->from);
                if (last == punch_hole->from)
                        return NR_OPEN_DEFAULT;
        }
        return ALIGN(last + 1, BITS_PER_LONG);
}

/*
 * Allocate a new descriptor table and copy contents from the passed in
 * instance.  Returns a pointer to cloned table on success, ERR_PTR()
 * on failure.  For 'punch_hole' see sane_fdtable_size().
 *
 * oldf:       table to clone; its ->file_lock is taken here.
 * punch_hole: optional range of descriptors that need not be copied.
 *
 * Failure is ERR_PTR(-ENOMEM) if the files_struct cannot be allocated, or
 * whatever alloc_fdtable() reported.  The clone starts with a reference
 * count of 1 and holds its own reference on every file it copied.
 */
struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_hole)
{
        struct files_struct *newf;
        struct file **old_fds, **new_fds;
        unsigned int open_files, i;
        struct fdtable *old_fdt, *new_fdt;

        newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
        if (!newf)
                return ERR_PTR(-ENOMEM);

        atomic_set(&newf->count, 1);

        /* Start out with the table embedded in the new files_struct. */
        spin_lock_init(&newf->file_lock);
        newf->resize_in_progress = false;
        init_waitqueue_head(&newf->resize_wait);
        newf->next_fd = 0;
        new_fdt = &newf->fdtab;
        new_fdt->max_fds = NR_OPEN_DEFAULT;
        new_fdt->close_on_exec = newf->close_on_exec_init;
        new_fdt->open_fds = newf->open_fds_init;
        new_fdt->full_fds_bits = newf->full_fds_bits_init;
        new_fdt->fd = &newf->fd_array[0];

        spin_lock(&oldf->file_lock);
        old_fdt = files_fdtable(oldf);
        open_files = sane_fdtable_size(old_fdt, punch_hole);

        /*
         * Check whether we need to allocate a larger fd array and fd set.
         * The lock is dropped for the allocation, so the required size is
         * recomputed afterwards and the loop repeats if it grew again.
         */
        while (unlikely(open_files > new_fdt->max_fds)) {
                spin_unlock(&oldf->file_lock);

                /* Discard a table from a previous, too small, attempt. */
                if (new_fdt != &newf->fdtab)
                        __free_fdtable(new_fdt);

                new_fdt = alloc_fdtable(open_files);
                if (IS_ERR(new_fdt)) {
                        kmem_cache_free(files_cachep, newf);
                        return ERR_CAST(new_fdt);
                }

                /*
                 * Reacquire the oldf lock and a pointer to its fd table
                 * who knows it may have a new bigger fd table. We need
                 * the latest pointer.
                 */
                spin_lock(&oldf->file_lock);
                old_fdt = files_fdtable(oldf);
                open_files = sane_fdtable_size(old_fdt, punch_hole);
        }

        /* open_files is a multiple of BITS_PER_LONG, see sane_fdtable_size(). */
        copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG);

        old_fds = old_fdt->fd;
        new_fds = new_fdt->fd;

        /*
         * We may be racing against fd allocation from other threads using this
         * files_struct, despite holding ->file_lock.
         *
         * alloc_fd() might have already claimed a slot, while fd_install()
         * did not populate it yet. Note the latter operates locklessly, so
         * the file can show up as we are walking the array below.
         *
         * At the same time we know no files will disappear as all other
         * operations take the lock.
         *
         * Instead of trying to placate userspace racing with itself, we
         * ref the file if we see it and mark the fd slot as unused otherwise.
         */
        for (i = open_files; i != 0; i--) {
                struct file *f = rcu_dereference_raw(*old_fds++);
                if (f) {
                        get_file(f);
                } else {
                        /* open_files - i is the index of the slot just read. */
                        __clear_open_fd(open_files - i, new_fdt);
                }
                rcu_assign_pointer(*new_fds++, f);
        }
        spin_unlock(&oldf->file_lock);

        /* clear the remainder */
        memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

        rcu_assign_pointer(newf->fdt, new_fdt);

        return newf;
}

/*
 * Close every file still installed in @files and hand back the fdtable so
 * the caller can free it.  The ->open_fds bitmap is walked one word at a
 * time; within a word the scan stops as soon as no set bits remain.
 */
static struct fdtable *close_files(struct files_struct * files)
{
        /*
         * It is safe to dereference the fd table without RCU or
         * ->file_lock because this is the last reference to the
         * files structure.
         */
        struct fdtable *fdt = rcu_dereference_raw(files->fdt);
        unsigned int word;

        for (word = 0; word * BITS_PER_LONG < fdt->max_fds; word++) {
                unsigned int fd = word * BITS_PER_LONG;
                unsigned long pending = fdt->open_fds[word];

                for (; pending; fd++, pending >>= 1) {
                        struct file *file;

                        if (!(pending & 1))
                                continue;

                        /* A claimed slot may not have a file installed. */
                        file = fdt->fd[fd];
                        if (!file)
                                continue;

                        filp_close(file, files);
                        cond_resched();
                }
        }

        return fdt;
}

/*
 * Drop one reference on @files.  Whoever drops the last one closes all the
 * files, frees the fdtable unless it is the embedded one, and returns the
 * files_struct to its slab cache.
 */
void put_files_struct(struct files_struct *files)
{
        struct fdtable *fdt;

        if (!atomic_dec_and_test(&files->count))
                return;

        fdt = close_files(files);

        /* free the arrays if they are not embedded */
        if (fdt != &files->fdtab)
                __free_fdtable(fdt);

        kmem_cache_free(files_cachep, files);
}

/*
 * Detach @tsk from its files_struct (under task_lock) and drop the
 * reference the task held.  Does nothing if the task has none.
 */
void exit_files(struct task_struct *tsk)
{
        struct files_struct *detached = tsk->files;

        if (!detached)
                return;

        task_lock(tsk);
        tsk->files = NULL;
        task_unlock(tsk);

        put_files_struct(detached);
}

/*
 * Statically initialised files_struct with one reference.  Its fdtable is
 * the embedded one and points at the structure's own NR_OPEN_DEFAULT-sized
 * arrays, so nothing is dynamically allocated.
 * NOTE(review): presumably the table used by the initial task - confirm
 * against the users of init_files.
 */
struct files_struct init_files = {
        .count                = ATOMIC_INIT(1),
        .fdt                = &init_files.fdtab,
        .fdtab                = {
                .max_fds        = NR_OPEN_DEFAULT,
                .fd                = &init_files.fd_array[0],
                .close_on_exec        = init_files.close_on_exec_init,
                .open_fds        = init_files.open_fds_init,
                .full_fds_bits        = init_files.full_fds_bits_init,
        },
        .file_lock        = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
        .resize_wait        = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};

/*
 * Find the lowest free descriptor at or after @start.  The caller
 * (alloc_fd()) only calls this with start < fdt->max_fds.
 *
 * Returns the descriptor number, or fdt->max_fds when every word from
 * @start's word onwards is marked full.
 */
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
        unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
        unsigned int nr_words = maxfd / BITS_PER_LONG;
        unsigned int word = start / BITS_PER_LONG;
        unsigned int offset, candidate;

        /*
         * Fast path: look only inside the word containing @start, so the
         * second level bitmap need not be touched.
         */
        offset = find_next_zero_bit(&fdt->open_fds[word], BITS_PER_LONG,
                                    start & (BITS_PER_LONG - 1));
        if (offset < BITS_PER_LONG)
                return offset + word * BITS_PER_LONG;

        /* Skip ahead to the first word that is not completely in use. */
        candidate = find_next_zero_bit(fdt->full_fds_bits, nr_words, word) * BITS_PER_LONG;
        if (candidate >= maxfd)
                return maxfd;

        /* Never search below the position the caller asked for. */
        if (start < candidate)
                start = candidate;

        return find_next_zero_bit(fdt->open_fds, maxfd, start);
}

/*
 * allocate a file descriptor, mark it busy.
 *
 * start: lowest descriptor number to consider (the search actually begins
 *        at max(start, files->next_fd)).
 * end:   exclusive upper bound; a candidate >= end fails with -EMFILE.
 * flags: only O_CLOEXEC is looked at, it sets the close-on-exec bit.
 *
 * Operates on current->files and takes its ->file_lock.  Returns the new
 * descriptor, or a negative error code (-EMFILE, or whatever
 * expand_files() returned).  No file is installed in the slot here.
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
        struct files_struct *files = current->files;
        unsigned int fd;
        int error;
        struct fdtable *fdt;

        spin_lock(&files->file_lock);
repeat:
        fdt = files_fdtable(files);
        fd = start;
        if (fd < files->next_fd)
                fd = files->next_fd;

        if (likely(fd < fdt->max_fds))
                fd = find_next_fd(fdt, fd);

        /*
         * N.B. For clone tasks sharing a files structure, this test
         * will limit the total number of files that can be opened.
         */
        error = -EMFILE;
        if (unlikely(fd >= end))
                goto out;

        /*
         * The candidate does not fit in the current table.  expand_files()
         * may drop the lock, so everything is recomputed afterwards.
         */
        if (unlikely(fd >= fdt->max_fds)) {
                error = expand_files(files, fd);
                if (error < 0)
                        goto out;

                goto repeat;
        }

        /* Only advance the search hint if this search started at or below it. */
        if (start <= files->next_fd)
                files->next_fd = fd + 1;

        __set_open_fd(fd, fdt, flags & O_CLOEXEC);
        error = fd;
        VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);

out:
        spin_unlock(&files->file_lock);
        return error;
}

/*
 * Reserve the lowest free descriptor below @nofile in current's table.
 * @flags is passed on to alloc_fd(), which honours O_CLOEXEC.  Returns the
 * descriptor or a negative error code.
 */
int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
        const unsigned int lowest = 0;

        return alloc_fd(lowest, nofile, flags);
}

/*
 * Reserve the lowest free descriptor, bounded by the caller's
 * RLIMIT_NOFILE.  Returns the descriptor or a negative error code.
 */
int get_unused_fd_flags(unsigned flags)
{
        unsigned long nofile_limit = rlimit(RLIMIT_NOFILE);

        return __get_unused_fd_flags(flags, nofile_limit);
}
EXPORT_SYMBOL(get_unused_fd_flags);

/*
 * Give descriptor @fd back: clear its in-use bit and pull the ->next_fd
 * search hint down if @fd lies below it.  Callers in this file hold
 * files->file_lock.
 */
static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
        __clear_open_fd(fd, files_fdtable(files));

        if (files->next_fd > fd)
                files->next_fd = fd;
}

/*
 * Release a descriptor in current's table, taking ->file_lock around
 * __put_unused_fd().
 */
void put_unused_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;

        spin_lock(&cur->file_lock);
        __put_unused_fd(cur, fd);
        spin_unlock(&cur->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/**
 * fd_install - install a file pointer in the fd array
 * @fd: file descriptor to install the file in
 * @file: the file to install
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 *
 * Operates on current->files. The slot at @fd must be empty (VFS_BUG_ON
 * otherwise).
 */
void fd_install(unsigned int fd, struct file *file)
{
        struct files_struct *files = current->files;
        struct fdtable *fdt;

        /*
         * Files flagged FMODE_BACKING are refused with a one-time warning.
         * Note that on this path the file is neither installed nor
         * released here.
         */
        if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING)))
                return;

        rcu_read_lock_sched();

        /*
         * Slow path: a resize is running, so the table pointer may be about
         * to change. Serialise against it with ->file_lock.
         */
        if (unlikely(files->resize_in_progress)) {
                rcu_read_unlock_sched();
                spin_lock(&files->file_lock);
                fdt = files_fdtable(files);
                VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
                rcu_assign_pointer(fdt->fd[fd], file);
                spin_unlock(&files->file_lock);
                return;
        }
        /* coupled with smp_wmb() in expand_fdtable() */
        smp_rmb();
        /* Fast path: no lock taken, only the rcu_read_lock_sched() section. */
        fdt = rcu_dereference_sched(files->fdt);
        VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
        rcu_assign_pointer(fdt->fd[fd], file);
        rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * file_close_fd_locked - detach the file installed at a descriptor
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * If a file is installed at @fd, the slot is emptied and the descriptor is
 * marked unused. The file's reference count is left untouched; it is up
 * to the caller to close the returned file.
 *
 * Context: files_lock must be held.
 *
 * Returns: The file that was installed at @fd, or NULL if @fd is out of
 * range or has no file installed.
 */
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
        struct fdtable *fdt = files_fdtable(files);
        struct file *victim;

        lockdep_assert_held(&files->file_lock);

        if (fd >= fdt->max_fds)
                return NULL;

        /* Clamp the index for speculative execution as well. */
        fd = array_index_nospec(fd, fdt->max_fds);

        victim = rcu_dereference_raw(fdt->fd[fd]);
        if (!victim)
                return NULL;

        rcu_assign_pointer(fdt->fd[fd], NULL);
        __put_unused_fd(files, fd);
        return victim;
}

/*
 * Close descriptor @fd of the current task.  The file is detached from the
 * table under ->file_lock and closed after the lock has been dropped.
 * Returns -EBADF if nothing was installed at @fd, otherwise the result of
 * filp_close().
 */
int close_fd(unsigned fd)
{
        struct files_struct *files = current->files;
        struct file *victim;

        spin_lock(&files->file_lock);
        victim = file_close_fd_locked(files, fd);
        spin_unlock(&files->file_lock);

        return victim ? filp_close(victim, files) : -EBADF;
}
EXPORT_SYMBOL(close_fd);

/**
 * last_fd - highest descriptor number the table has a slot for
 * @fdt: File descriptor table.
 *
 * Context: Either rcu read lock or files_lock must be held.
 *
 * Returns: @fdt->max_fds - 1.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
        unsigned int slots = fdt->max_fds;

        return slots - 1;
}

/*
 * Set the close-on-exec bit for every descriptor in [fd, max_fd], with
 * max_fd clamped to the last slot of the current table.  Nothing is
 * changed when the clamped range is empty.  Takes cur_fds->file_lock.
 */
static inline void __range_cloexec(struct files_struct *cur_fds,
                                   unsigned int fd, unsigned int max_fd)
{
        struct fdtable *fdt;
        unsigned int table_last;

        spin_lock(&cur_fds->file_lock);

        /* make sure we're using the correct maximum value */
        fdt = files_fdtable(cur_fds);
        table_last = last_fd(fdt);
        if (table_last < max_fd)
                max_fd = table_last;

        if (fd <= max_fd)
                bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);

        spin_unlock(&cur_fds->file_lock);
}

/*
 * Close every descriptor in [fd, max_fd], with max_fd clamped to the last
 * slot the table had on entry.  ->file_lock is held while walking, but is
 * dropped around each filp_close() and whenever a reschedule is due.
 * Results of filp_close() are ignored.
 */
static inline void __range_close(struct files_struct *files, unsigned int fd,
                                 unsigned int max_fd)
{
        unsigned int table_last;

        spin_lock(&files->file_lock);
        table_last = last_fd(files_fdtable(files));
        if (table_last < max_fd)
                max_fd = table_last;

        for (; fd <= max_fd; fd++) {
                struct file *victim = file_close_fd_locked(files, fd);

                /* Nothing to close and no need to yield: keep the lock. */
                if (!victim && !need_resched())
                        continue;

                spin_unlock(&files->file_lock);
                if (victim)
                        filp_close(victim, files);
                cond_resched();
                spin_lock(&files->file_lock);
        }

        spin_unlock(&files->file_lock);
}

/**
 * sys_close_range() - Close all file descriptors in a given range.
 *
 * @fd:     starting file descriptor to close
 * @max_fd: last file descriptor to close
 * @flags:  CLOSE_RANGE flags.
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 * Currently, errors to close a given file descriptor are ignored.
 *
 * With CLOSE_RANGE_CLOEXEC the descriptors are not closed; their
 * close-on-exec bit is set instead. With CLOSE_RANGE_UNSHARE, a task whose
 * descriptor table is shared first gets a private copy, and the operation
 * is applied to that copy.
 *
 * Return: 0 on success, -EINVAL for unknown @flags or @fd > @max_fd, or
 * the error from dup_fd() if the table could not be copied.
 */
SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
                unsigned int, flags)
{
        struct task_struct *me = current;
        struct files_struct *cur_fds = me->files, *fds = NULL;

        if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
                return -EINVAL;

        if (fd > max_fd)
                return -EINVAL;

        /* Unsharing is only needed if somebody else holds a reference. */
        if ((flags & CLOSE_RANGE_UNSHARE) && atomic_read(&cur_fds->count) > 1) {
                struct fd_range range = {fd, max_fd}, *punch_hole = &range;

                /*
                 * If the caller requested all fds to be made cloexec we always
                 * copy all of the file descriptors since they still want to
                 * use them.
                 */
                if (flags & CLOSE_RANGE_CLOEXEC)
                        punch_hole = NULL;

                fds = dup_fd(cur_fds, punch_hole);
                if (IS_ERR(fds))
                        return PTR_ERR(fds);
                /*
                 * We used to share our file descriptor table, and have now
                 * created a private one, make sure we're using it below.
                 * After the swap cur_fds is the private copy and fds the
                 * old shared table.
                 */
                swap(cur_fds, fds);
        }

        if (flags & CLOSE_RANGE_CLOEXEC)
                __range_cloexec(cur_fds, fd, max_fd);
        else
                __range_close(cur_fds, fd, max_fd);

        /* fds is only non-NULL if a private copy was made above. */
        if (fds) {
                /*
                 * We're done closing the files we were supposed to. Time to install
                 * the new file descriptor table and drop the old one.
                 */
                task_lock(me);
                me->files = cur_fds;
                task_unlock(me);
                put_files_struct(fds);
        }

        return 0;
}

/**
 * file_close_fd - detach the file installed at a descriptor of current
 * @fd: file descriptor to retrieve file for
 *
 * Takes current's ->file_lock around file_close_fd_locked(). The file's
 * reference count is left untouched.
 *
 * Returns: The file that was installed at @fd (NULL if @fd is not open)
 */
struct file *file_close_fd(unsigned int fd)
{
        struct files_struct *cur = current->files;
        struct file *detached;

        spin_lock(&cur->file_lock);
        detached = file_close_fd_locked(cur, fd);
        spin_unlock(&cur->file_lock);

        return detached;
}

/*
 * Close every descriptor of @files whose close-on-exec bit is set.
 *
 * The close_on_exec bitmap is walked one word (BITS_PER_LONG descriptors)
 * at a time with files->file_lock held. The lock is dropped around each
 * filp_close()/cond_resched() pair and retaken afterwards; the fdtable
 * pointer is re-read at the top of every outer iteration.
 */
void do_close_on_exec(struct files_struct *files)
{
        unsigned i;
        struct fdtable *fdt;

        /* exec unshares first */
        spin_lock(&files->file_lock);
        for (i = 0; ; i++) {
                unsigned long set;
                /* First descriptor number covered by bitmap word i. */
                unsigned fd = i * BITS_PER_LONG;
                fdt = files_fdtable(files);
                if (fd >= fdt->max_fds)
                        break;
                set = fdt->close_on_exec[i];
                if (!set)
                        continue;
                /* Clear the whole word now; 'set' is the snapshot we act on. */
                fdt->close_on_exec[i] = 0;
                /* Walk the snapshot bit by bit; stops once no set bits remain. */
                for ( ; set ; fd++, set >>= 1) {
                        struct file *file;
                        if (!(set & 1))
                                continue;
                        file = fdt->fd[fd];
                        /* Bit was set but no file is installed in the slot. */
                        if (!file)
                                continue;
                        /* Unhook the file and release the fd number before closing. */
                        rcu_assign_pointer(fdt->fd[fd], NULL);
                        __put_unused_fd(files, fd);
                        /* filp_close() is called without the spinlock held. */
                        spin_unlock(&files->file_lock);
                        filp_close(file, files);
                        cond_resched();
                        spin_lock(&files->file_lock);
                }

        }
        spin_unlock(&files->file_lock);
}

/*
 * Single attempt at taking a reference on the file stored in the slot @f.
 *
 * Returns NULL if the slot is empty, ERR_PTR(-EAGAIN) if the reference
 * could not be taken or the slot no longer holds the same file after the
 * reference was taken, and otherwise the file with its reference count
 * raised. Callers decide whether -EAGAIN means "retry" or "give up".
 */
static struct file *__get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;
        struct file __rcu *file_reloaded;
        struct file __rcu *file_reloaded_cmp;

        file = rcu_dereference_raw(*f);
        if (!file)
                return NULL;

        /* file_ref_get() failing means no reference could be obtained. */
        if (unlikely(!file_ref_get(&file->f_ref)))
                return ERR_PTR(-EAGAIN);

        /* Re-read the slot now that we hold a reference. */
        file_reloaded = rcu_dereference_raw(*f);

        /*
         * Ensure that all accesses have a dependency on the load from
         * rcu_dereference_raw() above so we get correct ordering
         * between reuse/allocation and the pointer check below.
         */
        file_reloaded_cmp = file_reloaded;
        OPTIMIZER_HIDE_VAR(file_reloaded_cmp);

        /*
         * file_ref_get() above provided a full memory barrier when we
         * acquired a reference.
         *
         * This is paired with the write barrier from assigning to the
         * __rcu protected file pointer so that if that pointer still
         * matches the current file, we know we have successfully
         * acquired a reference to the right file.
         *
         * If the pointers don't match the file has been reallocated by
         * SLAB_TYPESAFE_BY_RCU.
         */
        if (file == file_reloaded_cmp)
                return file_reloaded;

        /* Wrong file: drop the reference we just took and report a lost race. */
        fput(file);
        return ERR_PTR(-EAGAIN);
}

/**
 * get_file_rcu - try to get a reference to a file under rcu
 * @f: pointer to the __rcu slot holding the file to get a reference on
 *
 * Keeps calling __get_file_rcu() on @f until it stops reporting an error
 * pointer, i.e. until either a reference was taken on the file that is
 * still installed in @f or the slot was found empty.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: The file stored in @f with its reference count increased, or
 * NULL if the slot is empty.
 */
struct file *get_file_rcu(struct file __rcu **f)
{
        struct file __rcu *file;

        /* An error pointer means we lost a race with reuse; go again. */
        do {
                file = __get_file_rcu(f);
        } while (IS_ERR(file));

        return file;
}
EXPORT_SYMBOL_GPL(get_file_rcu);

/**
 * get_file_active - try to get a reference to a file
 * @f: pointer to the slot holding the file to get a reference on
 *
 * In contrast to get_file_rcu() the pointer itself isn't part of the
 * reference counting. A single __get_file_rcu() attempt is made inside an
 * RCU read-side section; there is no retry.
 *
 * This function should rarely have to be used and only by users who
 * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it.
 *
 * Return: The file stored in @f with its reference count increased, or
 * NULL if the slot is empty or the attempt failed.
 */
struct file *get_file_active(struct file **f)
{
        struct file __rcu *file;

        rcu_read_lock();
        file = __get_file_rcu(f);
        rcu_read_unlock();

        /* A failed attempt is reported to the caller as "no file". */
        return IS_ERR(file) ? NULL : file;
}
EXPORT_SYMBOL_GPL(get_file_active);

/*
 * Lockless lookup of descriptor @fd in @files that also takes a reference.
 *
 * Returns NULL if @fd is out of range, if the slot is empty, or if the
 * file's f_mode has any bit of @mask set. Otherwise returns the file with
 * a reference held. Lost races (reference not obtainable, slot or fdtable
 * changed underneath us) are retried internally.
 *
 * The table is read with rcu_dereference_raw() only; the callers in this
 * file wrap the call in rcu_read_lock()/rcu_read_unlock().
 */
static inline struct file *__fget_files_rcu(struct files_struct *files,
       unsigned int fd, fmode_t mask)
{
        for (;;) {
                struct file *file;
                struct fdtable *fdt = rcu_dereference_raw(files->fdt);
                struct file __rcu **fdentry;
                unsigned long nospec_mask;

                /* Mask is a 0 for invalid fd's, ~0 for valid ones */
                nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);

                /*
                 * fdentry points to the 'fd' offset, or fdt->fd[0].
                 * Loading from fdt->fd[0] is always safe, because the
                 * array always exists.
                 */
                fdentry = fdt->fd + (fd & nospec_mask);

                /* Do the load, then mask any invalid result */
                file = rcu_dereference_raw(*fdentry);
                file = (void *)(nospec_mask & (unsigned long)file);
                if (unlikely(!file))
                        return NULL;

                /*
                 * Ok, we have a file pointer that was valid at
                 * some point, but it might have become stale since.
                 *
                 * We need to confirm it by incrementing the refcount
                 * and then check the lookup again.
                 *
                 * file_ref_get() gives us a full memory barrier. We
                 * only really need an 'acquire' one to protect the
                 * loads below, but we don't have that.
                 */
                if (unlikely(!file_ref_get(&file->f_ref)))
                        continue;

                /*
                 * Such a race can take two forms:
                 *
                 *  (a) the file ref already went down to zero and the
                 *      file hasn't been reused yet or the file count
                 *      isn't zero but the file has already been reused.
                 *
                 *  (b) the file table entry has changed under us.
                 *       Note that we don't need to re-check the 'fdt->fd'
                 *       pointer having changed, because it always goes
                 *       hand-in-hand with 'fdt'.
                 *
                 * If so, we need to put our ref and try again.
                 */
                if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
                    unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
                        fput(file);
                        continue;
                }

                /*
                 * This isn't the file we're looking for or we're not
                 * allowed to get a reference to it.
                 */
                if (unlikely(file->f_mode & mask)) {
                        fput(file);
                        return NULL;
                }

                /*
                 * Ok, we have a ref to the file, and checked that it
                 * still exists.
                 */
                return file;
        }
}

/*
 * Look up @fd in @files and take a reference on the file, refusing files
 * whose f_mode intersects @mask. This is __fget_files_rcu() wrapped in its
 * own RCU read-side section. Returns the file or NULL.
 */
static struct file *__fget_files(struct files_struct *files, unsigned int fd,
                                 fmode_t mask)
{
        struct file *found;

        rcu_read_lock();
        found = __fget_files_rcu(files, fd, mask);
        rcu_read_unlock();
        return found;
}

/*
 * __fget_files() on the current task's descriptor table: returns the file
 * at @fd with a reference held, or NULL (including when the file's f_mode
 * has a bit of @mask set).
 */
static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
        return __fget_files(current->files, fd, mask);
}

/*
 * Get a referenced file for @fd from the current task's table. Files with
 * FMODE_PATH set in f_mode are refused: NULL is returned for them, as it
 * is for a descriptor that is not open.
 */
struct file *fget(unsigned int fd)
{
        return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

/*
 * Like fget() but with an empty mode mask, so no file is refused on the
 * basis of its f_mode. Returns NULL only when @fd is not open.
 */
struct file *fget_raw(unsigned int fd)
{
        return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

/*
 * Get a referenced file for descriptor @fd of another task. The lookup is
 * done under task_lock(@task) with an empty mode mask. Returns NULL if the
 * task has no descriptor table or @fd is not open in it.
 */
struct file *fget_task(struct task_struct *task, unsigned int fd)
{
        struct files_struct *task_files;
        struct file *found = NULL;

        task_lock(task);
        task_files = task->files;
        if (task_files)
                found = __fget_files(task_files, fd, 0);
        task_unlock(task);

        return found;
}

/*
 * Find the first open descriptor of @task at or above *@ret_fd and return
 * its file with a reference held.
 *
 * On entry *@ret_fd is the descriptor to start scanning from. On return it
 * holds the descriptor at which the scan stopped: the one whose file is
 * returned, or the first value that was not below max_fds when nothing
 * was found. If @task has no descriptor table, *@ret_fd is written back
 * unchanged and NULL is returned.
 */
struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
        /*
         * NOTE(review): this comment used to read "Must be called with
         * rcu_read_lock held", yet the function takes rcu_read_lock()
         * itself around the scan below. Confirm whether callers still
         * need to hold it.
         */
        struct files_struct *files;
        unsigned int fd = *ret_fd;
        struct file *file = NULL;

        task_lock(task);
        files = task->files;
        if (files) {
                rcu_read_lock();
                /* max_fds is re-read through files_fdtable() on every step. */
                for (; fd < files_fdtable(files)->max_fds; fd++) {
                        file = __fget_files_rcu(files, fd, 0);
                        if (file)
                                break;
                }
                rcu_read_unlock();
        }
        task_unlock(task);
        *ret_fd = fd;
        return file;
}
EXPORT_SYMBOL(fget_task_next);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 *
 * (As an exception to rule 2, you can call filp_close between fget_light and
 * fput_light provided that you capture a real refcount with get_file before
 * the call to filp_close, and ensure that this real refcount is fput *after*
 * the fput_light call.)
 *
 * See also the documentation in rust/kernel/file.rs.
 */
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
        struct files_struct *files = current->files;
        struct file *file;

        /*
         * If another thread is concurrently calling close_fd() followed
         * by put_files_struct(), we must not observe the old table
         * entry combined with the new refcount - otherwise we could
         * return a file that is concurrently being freed.
         *
         * atomic_read_acquire() pairs with atomic_dec_and_test() in
         * put_files_struct().
         */
        if (likely(atomic_read_acquire(&files->count) == 1)) {
                /* Table count is 1: hand the entry out without a refcount. */
                file = files_lookup_fd_raw(files, fd);
                if (file && likely(!(file->f_mode & mask)))
                        return BORROWED_FD(file);
                return EMPTY_FD;
        }

        /* Table count is not 1: take a real reference on the file. */
        file = __fget_files(files, fd, mask);
        if (file)
                return CLONED_FD(file);
        return EMPTY_FD;
}
/*
 * Lightweight lookup of @fd in the current task's table via __fget_light().
 * Files with FMODE_PATH set in f_mode yield an empty struct fd.
 */
struct fd fdget(unsigned int fd)
{
        return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fdget);

/*
 * Like fdget() but with an empty mode mask, so no file is refused on the
 * basis of its f_mode.
 */
struct fd fdget_raw(unsigned int fd)
{
        return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
        /* Without FMODE_ATOMIC_POS the lock is never taken. */
        if (!(file->f_mode & FMODE_ATOMIC_POS))
                return false;

        /*
         * Needed when the raw reference count is anything other than
         * FILE_REF_ONEREF, or when the file provides ->iterate_shared.
         */
        return __file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF ||
               file->f_op->iterate_shared;
}

/*
 * Report whether @file is one for which f_pos_lock matters: it has
 * FMODE_ATOMIC_POS set or provides ->iterate_shared. For such a file a
 * one-time VFS warning fires if the file count is above one while
 * f_pos_lock is not held.
 */
bool file_seek_cur_needs_f_lock(struct file *file)
{
        bool pos_locked_file = (file->f_mode & FMODE_ATOMIC_POS) ||
                               file->f_op->iterate_shared;

        if (!pos_locked_file)
                return false;

        VFS_WARN_ON_ONCE((file_count(file) > 1) &&
                         !mutex_is_locked(&file->f_pos_lock));
        return true;
}

/*
 * fdget() that additionally takes the file's f_pos_lock when
 * file_needs_f_pos_lock() says so. In that case FDPUT_POS_UNLOCK is set in
 * the returned struct fd so the lock taken here is recorded in it.
 */
struct fd fdget_pos(unsigned int fd)
{
        struct fd f = fdget(fd);
        struct file *file = fd_file(f);

        /* Nothing to lock for an empty result. */
        if (unlikely(!file))
                return f;
        if (!file_needs_f_pos_lock(file))
                return f;

        f.word |= FDPUT_POS_UNLOCK;
        mutex_lock(&file->f_pos_lock);
        return f;
}

/* Release @f's f_pos_lock, the mutex that fdget_pos() acquires. */
void __f_unlock_pos(struct file *f)
{
        mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

/*
 * Apply @flag to the close-on-exec state of descriptor @fd in the current
 * task's table via __set_close_on_exec(), with files->file_lock held.
 */
void set_close_on_exec(unsigned int fd, int flag)
{
        struct files_struct *cur_files = current->files;
        struct fdtable *fdt;

        spin_lock(&cur_files->file_lock);
        fdt = files_fdtable(cur_files);
        __set_close_on_exec(fd, fdt, flag);
        spin_unlock(&cur_files->file_lock);
}

/*
 * Return what close_on_exec() reports for descriptor @fd of the current
 * task. The query runs inside an RCU read-side section; no lock is taken.
 */
bool get_close_on_exec(unsigned int fd)
{
        bool cloexec;

        rcu_read_lock();
        cloexec = close_on_exec(fd, current->files);
        rcu_read_unlock();

        return cloexec;
}

/*
 * Install @file at descriptor @fd of @files, closing whatever was there.
 *
 * Entered with files->file_lock held and releases it on every path (see
 * the __releases annotation). Takes its own reference on @file with
 * get_file(). A previously installed file is closed with filp_close()
 * only after the lock has been dropped. O_CLOEXEC in @flags selects the
 * close-on-exec state of the new entry.
 *
 * Returns @fd on success, or -EBUSY when the slot is reserved but not yet
 * populated (see the long comment below).
 */
static int do_dup2(struct files_struct *files,
        struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
        struct file *tofree;
        struct fdtable *fdt;

        /*
         * dup2() is expected to close the file installed in the target fd slot
         * (if any). However, userspace hand-picking a fd may be racing against
         * its own threads which happened to allocate it in open() et al but did
         * not populate it yet.
         *
         * Broadly speaking we may be racing against the following:
         * fd = get_unused_fd_flags();     // fd slot reserved, ->fd[fd] == NULL
         * file = hard_work_goes_here();
         * fd_install(fd, file);           // only now ->fd[fd] == file
         *
         * It is an invariant that a successfully allocated fd has a NULL entry
         * in the array until the matching fd_install().
         *
         * If we fit the window, we have the fd to populate, yet no target file
         * to close. Trying to ignore it and install our new file would violate
         * the invariant and make fd_install() overwrite our file.
         *
         * Things can be done(tm) to handle this. However, the issue does not
         * concern legitimate programs and we only need to make sure the kernel
         * does not trip over it.
         *
         * The simplest way out is to return an error if we find ourselves here.
         *
         * POSIX is silent on the issue, we return -EBUSY.
         */
        fdt = files_fdtable(files);
        /* Clamp the index against speculation before touching the array. */
        fd = array_index_nospec(fd, fdt->max_fds);
        tofree = rcu_dereference_raw(fdt->fd[fd]);
        /* Marked open but still NULL: the reserved-but-unpopulated window. */
        if (!tofree && fd_is_open(fd, fdt))
                goto Ebusy;
        get_file(file);
        rcu_assign_pointer(fdt->fd[fd], file);
        __set_open_fd(fd, fdt, flags & O_CLOEXEC);
        spin_unlock(&files->file_lock);

        /* Close the displaced file outside the spinlock. */
        if (tofree)
                filp_close(tofree, files);

        return fd;

Ebusy:
        spin_unlock(&files->file_lock);
        return -EBUSY;
}

/*
 * Make descriptor @fd of the current task refer to @file, or close @fd
 * when @file is NULL.
 *
 * Returns close_fd()'s result for a NULL @file, -EBADF when @fd is not
 * below RLIMIT_NOFILE, the negative error from expand_files() if growing
 * the table failed, and otherwise whatever do_dup2() returns (the
 * descriptor number on success, -EBUSY on its failure path).
 */
int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
        struct files_struct *files = current->files;
        int err;

        if (!file)
                return close_fd(fd);

        if (fd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        err = expand_files(files, fd);
        if (likely(err >= 0)) {
                /* do_dup2() drops file_lock for us. */
                return do_dup2(files, file, fd, flags);
        }

        spin_unlock(&files->file_lock);
        return err;
}

/**
 * receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Runs the security_file_receive() check, reserves an unused descriptor,
 * optionally reports its number to userspace through @ufd (when non-NULL)
 * and only then installs @file. If the userspace write fails the reserved
 * descriptor is given back and nothing is installed.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
        int error;
        int new_fd;

        error = security_file_receive(file);
        if (error)
                return error;

        new_fd = get_unused_fd_flags(o_flags);
        if (new_fd < 0)
                return new_fd;

        if (ufd) {
                error = put_user(new_fd, ufd);
                if (error)
                        goto out_put_fd;
        }

        /* The table gets its own reference; the caller keeps theirs. */
        fd_install(new_fd, get_file(file));
        __receive_sock(file);
        return new_fd;

out_put_fd:
        put_unused_fd(new_fd);
        return error;
}
EXPORT_SYMBOL_GPL(receive_fd);

/*
 * receive_fd_replace() - install a received file over a chosen descriptor
 * @new_fd: descriptor number to (re)populate in the current task
 * @file: struct file that was received
 * @o_flags: the O_* flags to apply to the descriptor
 *
 * Runs the security_file_receive() check, installs @file at @new_fd with
 * replace_fd() and then calls __receive_sock() on it.
 *
 * Returns @new_fd on success or a negative errno.
 */
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
        int error;

        error = security_file_receive(file);
        if (error)
                return error;

        /*
         * replace_fd() hands back do_dup2()'s result, which on success is
         * the descriptor number rather than 0. Only a negative value is a
         * failure. Testing for "non-zero" would bail out here on every
         * successful replacement of a descriptor other than 0 and skip
         * __receive_sock().
         */
        error = replace_fd(new_fd, file, o_flags);
        if (error < 0)
                return error;
        __receive_sock(file);
        return new_fd;
}

/*
 * Common implementation behind the dup3() and dup2() syscalls: make @newfd
 * refer to the file currently open at @oldfd.
 *
 * Returns -EINVAL for any flag other than O_CLOEXEC or when the two
 * descriptors are equal, -EBADF when @newfd is not below RLIMIT_NOFILE or
 * @oldfd is not open, and otherwise the result of do_dup2().
 */
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
        int err = -EBADF;
        struct file *file;
        struct files_struct *files = current->files;

        if ((flags & ~O_CLOEXEC) != 0)
                return -EINVAL;

        if (unlikely(oldfd == newfd))
                return -EINVAL;

        if (newfd >= rlimit(RLIMIT_NOFILE))
                return -EBADF;

        spin_lock(&files->file_lock);
        /*
         * The table is grown before @oldfd is looked up, but the result
         * of the expansion is only examined afterwards: a bad @oldfd
         * wins over any expansion error.
         */
        err = expand_files(files, newfd);
        file = files_lookup_fd_locked(files, oldfd);
        if (unlikely(!file))
                goto Ebadf;
        if (unlikely(err < 0)) {
                /* -EMFILE from the expansion is reported as -EBADF. */
                if (err == -EMFILE)
                        goto Ebadf;
                goto out_unlock;
        }
        /* do_dup2() releases file_lock on all of its paths. */
        return do_dup2(files, file, newfd, flags);

Ebadf:
        err = -EBADF;
out_unlock:
        spin_unlock(&files->file_lock);
        return err;
}

/* dup3() syscall entry point: forwards its arguments unchanged to ksys_dup3(). */
SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
        return ksys_dup3(oldfd, newfd, flags);
}

/*
 * dup2() syscall entry point. Distinct descriptors go through ksys_dup3()
 * with no flags. When both descriptors are equal nothing is duplicated:
 * the descriptor is merely checked for being open, returning it unchanged
 * if so and -EBADF otherwise.
 */
SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
        struct file *f;
        int retval;

        if (likely(newfd != oldfd))
                return ksys_dup3(oldfd, newfd, 0);

        /* corner case: take a reference only to learn whether oldfd is open */
        retval = oldfd;
        rcu_read_lock();
        f = __fget_files_rcu(current->files, oldfd, 0);
        rcu_read_unlock();
        if (!f)
                return -EBADF;

        fput(f);
        return retval;
}

/*
 * dup() syscall entry point: install the file open at @fildes under a
 * freshly allocated descriptor. Returns the new descriptor, -EBADF if
 * @fildes is not open, or the error from get_unused_fd_flags().
 */
SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
        struct file *file = fget_raw(fildes);
        int newfd;

        if (!file)
                return -EBADF;

        newfd = get_unused_fd_flags(0);
        if (newfd < 0) {
                /* No descriptor to install into: drop fget_raw()'s reference. */
                fput(file);
                return newfd;
        }

        /* The reference obtained by fget_raw() is handed to fd_install(). */
        fd_install(newfd, file);
        return newfd;
}

/*
 * Install @file under a descriptor allocated by alloc_fd() from the range
 * starting at @from and bounded by RLIMIT_NOFILE. A new reference is taken
 * on @file for the installed entry.
 *
 * Returns the new descriptor, -EINVAL if @from is not below the limit, or
 * the negative error from alloc_fd().
 */
int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
        unsigned long nofile = rlimit(RLIMIT_NOFILE);
        int newfd;

        if (from >= nofile)
                return -EINVAL;

        newfd = alloc_fd(from, nofile, flags);
        if (newfd < 0)
                return newfd;

        get_file(file);
        fd_install(newfd, file);
        return newfd;
}

/*
 * Call @f(@p, file, fd) for every populated descriptor of @files numbered
 * @n or higher, in ascending order, with files->file_lock held throughout.
 * The walk stops at the first callback that returns non-zero.
 *
 * Returns that non-zero value, or 0 if @files is NULL or every callback
 * returned 0.
 */
int iterate_fd(struct files_struct *files, unsigned n,
                int (*f)(const void *, struct file *, unsigned),
                const void *p)
{
        struct fdtable *fdt;
        int res = 0;

        if (!files)
                return 0;

        spin_lock(&files->file_lock);
        fdt = files_fdtable(files);
        while (n < fdt->max_fds) {
                struct file *file;

                file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
                if (file) {
                        res = f(p, file, n);
                        if (res)
                                break;
                }
                n++;
        }
        spin_unlock(&files->file_lock);

        return res;
}
EXPORT_SYMBOL(iterate_fd);
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  411 


























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
/*
 * Performance events:
 *
 *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
 *    Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar
 *    Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra
 *
 * Data type definitions, declarations, prototypes.
 *
 *    Started by: Thomas Gleixner and Ingo Molnar
 *
 * For licencing details see kernel-base/COPYING
 */
#ifndef _LINUX_PERF_EVENT_H
#define _LINUX_PERF_EVENT_H

#include <uapi/linux/perf_event.h>
#include <uapi/linux/bpf_perf_event.h>

/*
 * Kernel-internal data types and definitions:
 */

#ifdef CONFIG_PERF_EVENTS
# include <asm/perf_event.h>
# include <asm/local64.h>
#endif

/*
 * Single-bit guest state flags.
 * NOTE(review): presumably the bits returned by
 * perf_guest_info_callbacks::state() - confirm against the users.
 */
#define PERF_GUEST_ACTIVE        0x01
#define PERF_GUEST_USER        0x02

/*
 * Table of argument-less callbacks for querying guest information.
 * NOTE(review): the descriptions below are inferred from the member names
 * only - confirm against the implementers.
 */
struct perf_guest_info_callbacks {
        /* presumably returns a mask of PERF_GUEST_* bits - TODO confirm */
        unsigned int                        (*state)(void);
        /* presumably returns the guest instruction pointer - TODO confirm */
        unsigned long                        (*get_ip)(void);
        /* presumably handles an Intel PT interrupt raised for a guest - TODO confirm */
        unsigned int                        (*handle_intel_pt_intr)(void);
};

#ifdef CONFIG_HAVE_HW_BREAKPOINT
#include <linux/rhashtable-types.h>
#include <asm/hw_breakpoint.h>
#endif

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/fs.h>
#include <linux/pid_namespace.h>
#include <linux/workqueue.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/irq_work.h>
#include <linux/static_key.h>
#include <linux/jump_label_ratelimit.h>
#include <linux/atomic.h>
#include <linux/sysfs.h>
#include <linux/perf_regs.h>
#include <linux/cgroup.h>
#include <linux/refcount.h>
#include <linux/security.h>
#include <linux/static_call.h>
#include <linux/lockdep.h>
#include <asm/local.h>

/*
 * A recorded callchain: a 64-bit count followed by a flexible array of
 * 64-bit instruction pointers.
 */
struct perf_callchain_entry {
        /* presumably the number of valid slots in ip[] -- confirm */
        __u64                                nr;
        /* flexible array member; storage is allocated by the user of this type */
        __u64                                ip[]; /* /proc/sys/kernel/perf_event_max_stack */
};

/*
 * Bookkeeping that accompanies a struct perf_callchain_entry.
 * NOTE(review): the field meanings below are inferred from their names only;
 * the code that maintains them is not part of this header section.
 */
struct perf_callchain_entry_ctx {
        /* the callchain entry this context refers to */
        struct perf_callchain_entry *entry;
        /* presumably the upper bound on stored frames -- confirm */
        u32                            max_stack;
        /* presumably the number of frames stored so far -- confirm */
        u32                            nr;
        /* presumably a count of context markers stored -- confirm */
        short                            contexts;
        /* presumably set once no more context markers are accepted -- confirm */
        bool                            contexts_maxed;
};

/*
 * perf_copy_f - copy callback type stored in perf_raw_frag::copy.
 *
 * Takes a destination, a source, an offset and a length, and returns an
 * unsigned long.
 * NOTE(review): presumably the return value is the number of bytes that could
 * not be copied -- confirm against the implementations.
 */
typedef unsigned long (*perf_copy_f)(void *dst, const void *src,
                                     unsigned long off, unsigned long len);

/*
 * One fragment of raw sample data. Fragments can be chained through @next.
 *
 * @next and @pad share the same storage; perf_raw_frag_last() inspects @pad
 * and treats any value below sizeof(u64) as "this is the last fragment".
 *
 * The structure is __packed, so no padding is inserted between members.
 */
struct perf_raw_frag {
        union {
                /* link to the following fragment */
                struct perf_raw_frag        *next;
                /* integer view of the same storage, see perf_raw_frag_last() */
                unsigned long                pad;
        };
        /* copy callback for this fragment (see perf_copy_f) */
        perf_copy_f                        copy;
        /* fragment payload */
        void                                *data;
        /* presumably the payload size in bytes -- confirm */
        u32                                size;
} __packed;

/*
 * A raw record: the first fragment is embedded, further fragments (if any)
 * are reachable through frag.next.
 */
struct perf_raw_record {
        /* embedded head fragment */
        struct perf_raw_frag                frag;
        /* presumably the total size of the record -- confirm */
        u32                                size;
};

/*
 * Report whether @frag is the final fragment of a raw record.
 *
 * The decision is made purely on frag->pad, which shares its storage with
 * frag->next: a value smaller than sizeof(u64) is treated as "no further
 * fragment follows".
 */
static __always_inline bool perf_raw_frag_last(const struct perf_raw_frag *frag)
{
        const unsigned long link_or_pad = frag->pad;

        return !(link_or_pad >= sizeof(u64));
}

/*
 * branch stack layout:
 *  nr: number of taken branches stored in entries[]
 *  hw_idx: The low level index of raw branch records
 *          for the most recent branch.
 *          -1ULL means invalid/unknown.
 *
 * Note that nr can vary from sample to sample
 * branches (to, from) are stored from most recent
 * to least recent, i.e., entries[0] contains the most
 * recent branch.
 * The entries[] is an abstraction of raw branch records,
 * which may not be stored in age order in HW, e.g. Intel LBR.
 * The hw_idx is to expose the low level index of raw
 * branch record for the most recent branch aka entries[0].
 * The hw_idx index is between -1 (unknown) and max depth,
 * which can be retrieved in /sys/devices/cpu/caps/branches.
 * For the architectures whose raw branch records are
 * already stored in age order, the hw_idx should be 0.
 */
struct perf_branch_stack {
        /* number of taken branches stored in entries[]; may vary per sample */
        __u64                                nr;
        /* low level index of the raw record for entries[0]; -1ULL if unknown */
        __u64                                hw_idx;
        /* branch records, most recent first (flexible array member) */
        struct perf_branch_entry        entries[];
};

struct task_struct;

/*
 * Extra PMU register associated with an event.
 *
 * struct hw_perf_event embeds two of these (extra_reg and branch_reg).
 */
struct hw_perf_event_extra {
        u64                config;        /* register value */
        unsigned int        reg;        /* register address or index */
        int                alloc;        /* extra register already allocated */
        int                idx;        /* index in shared_regs->regs[] */
};

/**
 * hw_perf_event::flags values
 *
 * PERF_EVENT_FLAG_ARCH bits (the low 20 bits) are reserved for
 * architecture-specific usage. PERF_EVENT_FLAG_USER_READ_CNT is bit 31.
 *
 * The static_assert() below guarantees at build time that the generic flag
 * does not overlap the architecture-reserved range.
 */
#define PERF_EVENT_FLAG_ARCH                        0x000fffff
#define PERF_EVENT_FLAG_USER_READ_CNT                0x80000000

static_assert((PERF_EVENT_FLAG_USER_READ_CNT & PERF_EVENT_FLAG_ARCH) == 0);

/**
 * struct hw_perf_event - performance event hardware details:
 *
 * The structure is empty unless CONFIG_PERF_EVENTS is enabled. Its first
 * member is an anonymous union with one anonymous struct per kind of PMU
 * (hardware, aux, software, tracepoint, ...), so those per-kind fields share
 * the same storage. The members after the union are common to all kinds.
 */
struct hw_perf_event {
#ifdef CONFIG_PERF_EVENTS
        union {
                struct { /* hardware */
                        u64                config;
                        u64                last_tag;
                        unsigned long        config_base;
                        unsigned long        event_base;
                        int                event_base_rdpmc;
                        int                idx;
                        int                last_cpu;
                        /* PERF_EVENT_FLAG_* values */
                        int                flags;

                        struct hw_perf_event_extra extra_reg;
                        struct hw_perf_event_extra branch_reg;
                };
                struct { /* aux / Intel-PT */
                        u64                aux_config;
                        /*
                         * For AUX area events, aux_paused cannot be a state
                         * flag because it can be updated asynchronously to
                         * state.
                         */
                        unsigned int        aux_paused;
                };
                struct { /* software */
                        struct hrtimer        hrtimer;
                };
                struct { /* tracepoint */
                        /* for tp_event->class */
                        struct list_head        tp_list;
                };
                struct { /* amd_power */
                        u64        pwr_acc;
                        u64        ptsc;
                };
#ifdef CONFIG_HAVE_HW_BREAKPOINT
                struct { /* breakpoint */
                        /*
                         * Crufty hack to avoid the chicken and egg
                         * problem hw_breakpoint has with context
                         * creation and event initialization.
                         */
                        struct arch_hw_breakpoint        info;
                        struct rhlist_head                bp_list;
                };
#endif
                struct { /* amd_iommu */
                        u8        iommu_bank;
                        u8        iommu_cntr;
                        u16        padding;
                        u64        conf;
                        u64        conf1;
                };
        };
        /*
         * If the event is a per task event, this will point to the task in
         * question. See the comment in perf_event_alloc().
         */
        struct task_struct                *target;

        /*
         * PMU would store hardware filter configuration
         * here.
         */
        void                                *addr_filters;

        /* Last sync'ed generation of filters */
        unsigned long                        addr_filters_gen;

/*
 * hw_perf_event::state flags; used to track the PERF_EF_* state.
 */
#define PERF_HES_STOPPED        0x01 /* the counter is stopped */
#define PERF_HES_UPTODATE        0x02 /* event->count up-to-date */
#define PERF_HES_ARCH                0x04

        /* bitmask of the PERF_HES_* flags defined just above */
        int                                state;

        /*
         * The last observed hardware counter value, updated with a
         * local64_cmpxchg() such that pmu::read() can be called nested.
         */
        local64_t                        prev_count;

        /*
         * The period to start the next sample with.
         */
        u64                                sample_period;

        /*
         * The two anonymous structs below share storage: an event uses
         * either the sampling fields or the topdown fields.
         */
        union {
                struct { /* Sampling */
                        /*
                         * The period we started this sample with.
                         */
                        u64                                last_period;

                        /*
                         * However much is left of the current period;
                         * note that this is a full 64bit value and
                         * allows for generation of periods longer
                         * than hardware might allow.
                         */
                        local64_t                        period_left;
                };
                struct { /* Topdown events counting for context switch */
                        u64                                saved_metric;
                        u64                                saved_slots;
                };
        };

        /*
         * State for throttling the event, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                             interrupts_seq;
        u64                                interrupts;

        /*
         * State for freq target events, see __perf_event_overflow() and
         * perf_adjust_freq_unthr_context().
         */
        u64                                freq_time_stamp;
        u64                                freq_count_stamp;
#endif
};

struct perf_event;
struct perf_event_pmu_context;

/*
 * Common implementation detail of pmu::{start,commit,cancel}_txn
 *
 * The two values are distinct bits.
 * NOTE(review): presumably these are what is passed as the txn_flags
 * argument of pmu::start_txn() -- confirm against the callers.
 */
#define PERF_PMU_TXN_ADD  0x1                /* txn to add/schedule event on PMU */
#define PERF_PMU_TXN_READ 0x2                /* txn to read event group from PMU */

/**
 * pmu::capabilities flags
 *
 * Every value below is a distinct single bit, so any combination can be
 * stored in the pmu::capabilities field.
 * NOTE(review): the meaning of each individual capability is not described
 * in this header section; consult the users of each flag.
 */
#define PERF_PMU_CAP_NO_INTERRUPT                0x0001
#define PERF_PMU_CAP_NO_NMI                        0x0002
#define PERF_PMU_CAP_AUX_NO_SG                        0x0004
#define PERF_PMU_CAP_EXTENDED_REGS                0x0008
#define PERF_PMU_CAP_EXCLUSIVE                        0x0010
#define PERF_PMU_CAP_ITRACE                        0x0020
#define PERF_PMU_CAP_NO_EXCLUDE                        0x0040
#define PERF_PMU_CAP_AUX_OUTPUT                        0x0080
#define PERF_PMU_CAP_EXTENDED_HW_TYPE                0x0100
#define PERF_PMU_CAP_AUX_PAUSE                        0x0200

/**
 * pmu::scope
 *
 * PERF_PMU_SCOPE_NONE is explicitly 0 and the remaining enumerators take the
 * following consecutive values. PERF_PMU_MAX_SCOPE is the last enumerator and
 * therefore equals the number of scopes defined before it.
 */
enum perf_pmu_scope {
        PERF_PMU_SCOPE_NONE        = 0,
        PERF_PMU_SCOPE_CORE,
        PERF_PMU_SCOPE_DIE,
        PERF_PMU_SCOPE_CLUSTER,
        PERF_PMU_SCOPE_PKG,
        PERF_PMU_SCOPE_SYS_WIDE,
        PERF_PMU_MAX_SCOPE,
};

struct perf_output_handle;

/*
 * Sentinel pointer value with all bits set.
 * NOTE(review): presumably stored in pmu::dev to mark "no device" as
 * distinct from NULL -- confirm against the users.
 */
#define PMU_NULL_DEV        ((void *)(~0UL))

/**
 * struct pmu - generic performance monitoring unit
 *
 * Describes one PMU: identification and sysfs attributes, feature flags,
 * and the table of callbacks that implement event handling. Callbacks marked
 * "optional" in their comment may be left unset.
 */
struct pmu {
        struct list_head                entry;

        struct module                        *module;
        struct device                        *dev;
        struct device                        *parent;
        const struct attribute_group        **attr_groups;
        const struct attribute_group        **attr_update;
        const char                        *name;
        int                                type;

        /*
         * various common per-pmu feature flags
         */
        int                                capabilities;

        /*
         * PMU scope
         * NOTE(review): presumably one of enum perf_pmu_scope -- confirm.
         */
        unsigned int                        scope;

        struct perf_cpu_pmu_context * __percpu *cpu_pmu_context;
        atomic_t                        exclusive_cnt; /* < 0: cpu; > 0: tsk */
        int                                task_ctx_nr;
        int                                hrtimer_interval_ms;

        /* number of address filters this PMU can do */
        unsigned int                        nr_addr_filters;

        /*
         * Fully disable/enable this PMU, can be used to protect from the PMI
         * as well as for lazy/batch writing of the MSRs.
         */
        void (*pmu_enable)                (struct pmu *pmu); /* optional */
        void (*pmu_disable)                (struct pmu *pmu); /* optional */

        /*
         * Try and initialize the event for this PMU.
         *
         * Returns:
         *  -ENOENT        -- @event is not for this PMU
         *
         *  -ENODEV        -- @event is for this PMU but PMU not present
         *  -EBUSY        -- @event is for this PMU but PMU temporarily unavailable
         *  -EINVAL        -- @event is for this PMU but @event is not valid
         *  -EOPNOTSUPP -- @event is for this PMU, @event is valid, but not supported
         *  -EACCES        -- @event is for this PMU, @event is valid, but no privileges
         *
         *  0                -- @event is for this PMU and valid
         *
         * Other error return values are allowed.
         */
        int (*event_init)                (struct perf_event *event);

        /*
         * Notification that the event was mapped or unmapped.  Called
         * in the context of the mapping task.
         */
        void (*event_mapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */
        void (*event_unmapped)                (struct perf_event *event, struct mm_struct *mm); /* optional */

        /*
         * Flags for ->add()/->del()/ ->start()/->stop(). There are
         * matching hw_perf_event::state flags.
         */
#define PERF_EF_START        0x01                /* start the counter when adding    */
#define PERF_EF_RELOAD        0x02                /* reload the counter when starting */
#define PERF_EF_UPDATE        0x04                /* update the counter when stopping */
#define PERF_EF_PAUSE        0x08                /* AUX area event, pause tracing */
#define PERF_EF_RESUME        0x10                /* AUX area event, resume tracing */

        /*
         * Adds/Removes a counter to/from the PMU, can be done inside a
         * transaction, see the ->*_txn() methods.
         *
         * The add/del callbacks will reserve all hardware resources required
         * to service the event, this includes any counter constraint
         * scheduling etc.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on.
         *
         * ->add() called without PERF_EF_START should result in the same state
         *  as ->add() followed by ->stop().
         *
         * ->del() must always PERF_EF_UPDATE stop an event. If it calls
         *  ->stop() that must deal with already being stopped without
         *  PERF_EF_UPDATE.
         */
        int  (*add)                        (struct perf_event *event, int flags);
        void (*del)                        (struct perf_event *event, int flags);

        /*
         * Starts/Stops a counter present on the PMU.
         *
         * The PMI handler should stop the counter when perf_event_overflow()
         * returns !0. ->start() will be used to continue.
         *
         * Also used to change the sample period.
         *
         * Called with IRQs disabled and the PMU disabled on the CPU the event
         * is on -- will be called from NMI context when the PMU generates
         * NMIs.
         *
         * ->stop() with PERF_EF_UPDATE will read the counter and update
         *  period/count values like ->read() would.
         *
         * ->start() with PERF_EF_RELOAD will reprogram the counter
         *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
         *
         * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not
         * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with
         * PERF_EF_RESUME.
         *
         * ->start() with PERF_EF_RESUME will start as simply as possible but
         * only if the counter is not otherwise stopped. Will not overlap
         * another ->start() with PERF_EF_RESUME nor ->stop() with
         * PERF_EF_PAUSE.
         *
         * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other
         * ->stop()/->start() invocations, just not itself.
         */
        void (*start)                        (struct perf_event *event, int flags);
        void (*stop)                        (struct perf_event *event, int flags);

        /*
         * Updates the counter value of the event.
         *
         * For sampling capable PMUs this will also update the software period
         * hw_perf_event::period_left field.
         */
        void (*read)                        (struct perf_event *event);

        /*
         * Group events scheduling is treated as a transaction, add
         * group events as a whole and perform one schedulability test.
         * If the test fails, roll back the whole group
         *
         * Start the transaction, after this ->add() doesn't need to
         * do schedulability tests.
         *
         * Optional.
         */
        void (*start_txn)                (struct pmu *pmu, unsigned int txn_flags);
        /*
         * If ->start_txn() disabled the ->add() schedulability test
         * then ->commit_txn() is required to perform one. On success
         * the transaction is closed. On error the transaction is kept
         * open until ->cancel_txn() is called.
         *
         * Optional.
         */
        int  (*commit_txn)                (struct pmu *pmu);
        /*
         * Will cancel the transaction, assumes ->del() is called
         * for each successful ->add() during the transaction.
         *
         * Optional.
         */
        void (*cancel_txn)                (struct pmu *pmu);

        /*
         * Will return the value for perf_event_mmap_page::index for this event,
         * if no implementation is provided it will default to 0 (see
         * perf_event_idx_default).
         */
        int (*event_idx)                (struct perf_event *event); /* optional */

        /*
         * context-switches callback
         */
        void (*sched_task)                (struct perf_event_pmu_context *pmu_ctx,
                                         struct task_struct *task, bool sched_in);

        /*
         * Kmem cache of PMU specific data
         */
        struct kmem_cache                *task_ctx_cache;

        /*
         * Set up pmu-private data structures for an AUX area
         */
        void *(*setup_aux)                (struct perf_event *event, void **pages,
                                         int nr_pages, bool overwrite);
                                        /* optional */

        /*
         * Free pmu-private AUX data structures
         */
        void (*free_aux)                (void *aux); /* optional */

        /*
         * Take a snapshot of the AUX buffer without touching the event
         * state, so that preempting ->start()/->stop() callbacks does
         * not interfere with their logic. Called in PMI context.
         *
         * Returns the size of AUX data copied to the output handle.
         *
         * Optional.
         */
        long (*snapshot_aux)                (struct perf_event *event,
                                         struct perf_output_handle *handle,
                                         unsigned long size);

        /*
         * Validate address range filters: make sure the HW supports the
         * requested configuration and number of filters; return 0 if the
         * supplied filters are valid, -errno otherwise.
         *
         * Runs in the context of the ioctl()ing process and is not serialized
         * with the rest of the PMU callbacks.
         */
        int (*addr_filters_validate)        (struct list_head *filters);
                                        /* optional */

        /*
         * Synchronize address range filter configuration:
         * translate hw-agnostic filters into hardware configuration in
         * event::hw::addr_filters.
         *
         * Runs as a part of filter sync sequence that is done in ->start()
         * callback by calling perf_event_addr_filters_sync().
         *
         * May (and should) traverse event::addr_filters::list, for which its
         * caller provides necessary serialization.
         */
        void (*addr_filters_sync)        (struct perf_event *event);
                                        /* optional */

        /*
         * Check if event can be used for aux_output purposes for
         * events of this PMU.
         *
         * Runs from perf_event_open(). Should return 0 for "no match"
         * or non-zero for "match".
         */
        int (*aux_output_match)                (struct perf_event *event);
                                        /* optional */

        /*
         * Skip programming this PMU on the given CPU. Typically needed for
         * big.LITTLE things.
         */
        bool (*filter)                        (struct pmu *pmu, int cpu); /* optional */

        /*
         * Check period value for PERF_EVENT_IOC_PERIOD ioctl.
         */
        int (*check_period)                (struct perf_event *event, u64 value); /* optional */
};

/*
 * Action attached to an address range filter; this is the type of
 * perf_addr_filter::action. STOP is explicitly 0, START and FILTER take the
 * following consecutive values.
 */
enum perf_addr_filter_action_t {
        PERF_ADDR_FILTER_ACTION_STOP = 0,
        PERF_ADDR_FILTER_ACTION_START,
        PERF_ADDR_FILTER_ACTION_FILTER,
};

/**
 * struct perf_addr_filter - address range filter definition
 * @entry:        event's filter list linkage
 * @path:        object file's path for file-based filters
 * @offset:        filter range offset
 * @size:        filter range size (size==0 means single address trigger)
 * @action:        what to do on a match, one of enum perf_addr_filter_action_t
 *                (filter/start/stop)
 *
 * This is a hardware-agnostic filter configuration as specified by the user.
 */
struct perf_addr_filter {
        struct list_head        entry;
        struct path                path;
        unsigned long                offset;
        unsigned long                size;
        enum perf_addr_filter_action_t        action;
};

/**
 * struct perf_addr_filters_head - container for address range filters
 * @list:        list of filters for this event
 * @lock:        raw spinlock that serializes accesses to the @list and event's
 *                (and its children's) filter generations.
 * @nr_file_filters:        number of file-based filters
 *
 * A child event will use parent's @list (and therefore @lock), so they are
 * bundled together; see perf_event_addr_filters().
 */
struct perf_addr_filters_head {
        struct list_head        list;
        raw_spinlock_t                lock;
        unsigned int                nr_file_filters;
};

/*
 * An address range expressed as a start and a size.
 * struct perf_event keeps a pointer to these in addr_filter_ranges.
 */
struct perf_addr_filter_range {
        unsigned long                start;
        unsigned long                size;
};

/**
 * enum perf_event_state - the states of an event:
 *
 * Every state has an explicit integer value, running from -4 (DEAD) up to
 * 1 (ACTIVE), with INACTIVE being 0.
 * NOTE(review): users presumably rely on this numeric ordering when comparing
 * states (e.g. "state <= OFF") -- confirm before renumbering anything.
 */
enum perf_event_state {
        PERF_EVENT_STATE_DEAD                = -4,
        PERF_EVENT_STATE_EXIT                = -3,
        PERF_EVENT_STATE_ERROR                = -2,
        PERF_EVENT_STATE_OFF                = -1,
        PERF_EVENT_STATE_INACTIVE        =  0,
        PERF_EVENT_STATE_ACTIVE                =  1,
};

struct file;
struct perf_sample_data;

/*
 * perf_overflow_handler_t - type of perf_event::overflow_handler.
 *
 * The handler receives the event, a sample data pointer and a register
 * snapshot, and returns nothing.
 */
typedef void (*perf_overflow_handler_t)(struct perf_event *,
                                        struct perf_sample_data *,
                                        struct pt_regs *regs);

/*
 * Event capabilities. For event_caps and groups caps.
 *
 * Each capability is a single bit (BIT(0)..BIT(3)), so they can be combined.
 *
 * PERF_EV_CAP_SOFTWARE: Is a software event.
 * PERF_EV_CAP_READ_ACTIVE_PKG: A CPU event (or cgroup event) that can be read
 * from any CPU in the package where it is active.
 * PERF_EV_CAP_SIBLING: An event with this flag must be a group sibling and
 * cannot be a group leader. If an event with this flag is detached from the
 * group it is scheduled out and moved into an unrecoverable ERROR state.
 * PERF_EV_CAP_READ_SCOPE: A CPU event that can be read from any CPU of the
 * PMU scope where it is active.
 */
#define PERF_EV_CAP_SOFTWARE                BIT(0)
#define PERF_EV_CAP_READ_ACTIVE_PKG        BIT(1)
#define PERF_EV_CAP_SIBLING                BIT(2)
#define PERF_EV_CAP_READ_SCOPE                BIT(3)

/*
 * Size of the software event hash table: SWEVENT_HLIST_SIZE is
 * 2^SWEVENT_HLIST_BITS, i.e. 256 buckets with the current value of 8.
 */
#define SWEVENT_HLIST_BITS                8
#define SWEVENT_HLIST_SIZE                (1 << SWEVENT_HLIST_BITS)

/*
 * Hash table of SWEVENT_HLIST_SIZE hlist buckets plus an rcu_head.
 * NOTE(review): the rcu_head suggests the table is freed via RCU -- confirm
 * against the code that releases it.
 */
struct swevent_hlist {
        struct hlist_head                heads[SWEVENT_HLIST_SIZE];
        struct rcu_head                        rcu_head;
};

/*
 * PERF_ATTACH_* flags. Every value is a distinct single bit.
 * NOTE(review): presumably these are the bits kept in
 * perf_event::attach_state -- confirm against the users.
 */
#define PERF_ATTACH_CONTEXT        0x0001
#define PERF_ATTACH_GROUP        0x0002
#define PERF_ATTACH_TASK        0x0004
#define PERF_ATTACH_TASK_DATA        0x0008
#define PERF_ATTACH_GLOBAL_DATA        0x0010
#define PERF_ATTACH_SCHED_CB        0x0020
#define PERF_ATTACH_CHILD        0x0040
#define PERF_ATTACH_EXCLUSIVE        0x0080
#define PERF_ATTACH_CALLCHAIN        0x0100
#define PERF_ATTACH_ITRACE        0x0200

struct bpf_prog;
struct perf_cgroup;
struct perf_buffer;

/*
 * A list head bundled with a raw spinlock.
 * NOTE(review): presumably @lock protects @list -- confirm against the users.
 */
struct pmu_event_list {
        raw_spinlock_t                lock;
        struct list_head        list;
};

/*
 * event->sibling_list is modified while holding both ctx->lock and ctx->mutex
 * as such iteration must hold either lock. However, since ctx->lock is an IRQ
 * safe lock, and is only held by the CPU doing the modification, having IRQs
 * disabled is sufficient since it will hold-off the IPIs.
 *
 * With CONFIG_PROVE_LOCKING the macro therefore warns (once) when lockdep is
 * enabled, hardirqs are enabled on this CPU, and event->ctx->mutex is not
 * reported as held. Without CONFIG_PROVE_LOCKING it expands to nothing.
 */
#ifdef CONFIG_PROVE_LOCKING
#define lockdep_assert_event_ctx(event)                                \
        WARN_ON_ONCE(__lockdep_enabled &&                        \
                     (this_cpu_read(hardirqs_enabled) &&        \
                      lockdep_is_held(&(event)->ctx->mutex) != LOCK_STATE_HELD))
#else
#define lockdep_assert_event_ctx(event)
#endif

/*
 * for_each_sibling_event - iterate over the siblings of a group leader
 * @sibling:        struct perf_event * used as the loop cursor
 * @event:        event whose sibling_list is walked
 *
 * The loop body runs once per entry on @event->sibling_list, and only when
 * @event is its own group_leader; for any other event the body is skipped.
 * lockdep_assert_event_ctx() is evaluated first on every execution.
 *
 * The assertion is folded into the if-condition through a statement
 * expression so that the macro expands to exactly one statement. That keeps
 * the group-leader test and the iteration under the caller's control when the
 * macro is used as the unbraced body of an if/else/loop.
 *
 * Note that @event is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define for_each_sibling_event(sibling, event)                        \
        if (({ lockdep_assert_event_ctx(event);                        \
               (event)->group_leader == (event); }))                \
                list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

/**
 * struct perf_event - performance event kernel representation:
 *
 * The structure is empty unless CONFIG_PERF_EVENTS is enabled. Several
 * trailing members additionally depend on CONFIG_EVENT_TRACING,
 * CONFIG_FUNCTION_TRACER, CONFIG_CGROUP_PERF and CONFIG_SECURITY.
 */
struct perf_event {
#ifdef CONFIG_PERF_EVENTS
        /*
         * entry onto perf_event_context::event_list;
         *   modifications require ctx->lock
         *   RCU safe iterations.
         */
        struct list_head                event_entry;

        /*
         * Locked for modification by both ctx->mutex and ctx->lock; holding
         * either suffices for read.
         */
        struct list_head                sibling_list;
        struct list_head                active_list;
        /*
         * Node on the pinned or flexible tree located at the event context;
         */
        struct rb_node                        group_node;
        u64                                group_index;
        /*
         * We need storage to track the entries in perf_pmu_migrate_context; we
         * cannot use the event_entry because of RCU and we want to keep the
         * group intact which avoids us using the other two entries.
         */
        struct list_head                migrate_entry;

        struct hlist_node                hlist_entry;
        struct list_head                active_entry;
        int                                nr_siblings;

        /* Not serialized. Only written during event initialization. */
        int                                event_caps;
        /* The cumulative AND of all event_caps for events in this group. */
        int                                group_caps;

        unsigned int                        group_generation;
        struct perf_event                *group_leader;
        /*
         * event->pmu will always point to pmu in which this event belongs.
         * Whereas event->pmu_ctx->pmu may point to other pmu when group of
         * different pmu events is created.
         */
        struct pmu                        *pmu;
        void                                *pmu_private;

        enum perf_event_state                state;
        unsigned int                        attach_state;
        local64_t                        count;
        atomic64_t                        child_count;

        /*
         * These are the total time in nanoseconds that the event
         * has been enabled (i.e. eligible to run, and the task has
         * been scheduled in, if this is a per-task event)
         * and running (scheduled onto the CPU), respectively.
         */
        u64                                total_time_enabled;
        u64                                total_time_running;
        u64                                tstamp;

        struct perf_event_attr                attr;
        u16                                header_size;
        u16                                id_header_size;
        u16                                read_size;
        struct hw_perf_event                hw;

        struct perf_event_context        *ctx;
        /*
         * event->pmu_ctx points to perf_event_pmu_context in which the event
         * is added. This pmu_ctx can be of other pmu for sw event when that
         * sw event is part of a group which also contains non-sw events.
         */
        struct perf_event_pmu_context        *pmu_ctx;
        atomic_long_t                        refcount;

        /*
         * These accumulate total time (in nanoseconds) that children
         * events have been enabled and running, respectively.
         */
        atomic64_t                        child_total_time_enabled;
        atomic64_t                        child_total_time_running;

        /*
         * Protect attach/detach and child_list:
         */
        struct mutex                        child_mutex;
        struct list_head                child_list;
        struct perf_event                *parent;

        int                                oncpu;
        int                                cpu;

        struct list_head                owner_entry;
        struct task_struct                *owner;

        /* mmap bits */
        struct mutex                        mmap_mutex;
        atomic_t                        mmap_count;

        struct perf_buffer                *rb;
        struct list_head                rb_entry;
        unsigned long                        rcu_batches;
        int                                rcu_pending;

        /* poll related */
        wait_queue_head_t                waitq;
        struct fasync_struct                *fasync;

        /* delayed work for NMIs and such */
        unsigned int                        pending_wakeup;
        unsigned int                        pending_kill;
        unsigned int                        pending_disable;
        unsigned long                        pending_addr;        /* SIGTRAP */
        struct irq_work                        pending_irq;
        struct irq_work                        pending_disable_irq;
        struct callback_head                pending_task;
        unsigned int                        pending_work;

        atomic_t                        event_limit;

        /* address range filters */
        struct perf_addr_filters_head        addr_filters;
        /* vma address array for file-based filters */
        struct perf_addr_filter_range        *addr_filter_ranges;
        unsigned long                        addr_filters_gen;

        /* for aux_output events */
        struct perf_event                *aux_event;

        void (*destroy)(struct perf_event *);
        struct rcu_head                        rcu_head;

        struct pid_namespace                *ns;
        u64                                id;

        atomic64_t                        lost_samples;

        u64                                (*clock)(void);
        perf_overflow_handler_t                overflow_handler;
        void                                *overflow_handler_context;
        struct bpf_prog                        *prog;
        u64                                bpf_cookie;

#ifdef CONFIG_EVENT_TRACING
        struct trace_event_call                *tp_event;
        struct event_filter                *filter;
#ifdef CONFIG_FUNCTION_TRACER
        struct ftrace_ops               ftrace_ops;
#endif
#endif

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp; /* cgroup event is attached to */
#endif

#ifdef CONFIG_SECURITY
        void *security;
#endif
        struct list_head                sb_list;

        /*
         * Certain events gets forwarded to another pmu internally by over-
         * writing kernel copy of event->attr.type without user being aware
         * of it. event->orig_type contains original 'type' requested by
         * user.
         */
        __u32                                orig_type;
#endif /* CONFIG_PERF_EVENTS */
};

/*
 *           ,-----------------------[1:n]------------------------.
 *           V                                                    V
 * perf_event_context <-[1:n]-> perf_event_pmu_context <-[1:n]- perf_event
 *                                        |                       |
 *                                        `--[n:1]-> pmu <-[1:n]--'
 *
 *
 * struct perf_event_pmu_context  lifetime is refcount based and RCU freed
 * (similar to perf_event_context). Locking is as if it were a member of
 * perf_event_context; specifically:
 *
 *   modification, both: ctx->mutex && ctx->lock
 *   reading, either:    ctx->mutex || ctx->lock
 *
 * There is one exception to this; namely put_pmu_ctx() isn't always called
 * with ctx->mutex held; this means that as long as we can guarantee the epc
 * has events the above rules hold.
 *
 * Specifically, sys_perf_event_open()'s group_leader case depends on
 * ctx->mutex pinning the configuration. Since we hold a reference on
 * group_leader (through the filedesc) it can't go away, therefore its
 * associated pmu_ctx must exist and cannot change due to ctx->mutex.
 *
 * perf_event holds a refcount on perf_event_context
 * perf_event holds a refcount on perf_event_pmu_context
 */
struct perf_event_pmu_context {
        /* The pmu and the perf_event_context this pmu-context connects */
        struct pmu                        *pmu;
        struct perf_event_context       *ctx;

        /* NOTE(review): presumably the link on ctx->pmu_ctx_list -- confirm */
        struct list_head                pmu_ctx_entry;

        /* Active lists; both are tested by perf_pmu_ctx_is_active() */
        struct list_head                pinned_active;
        struct list_head                flexible_active;

        /* Used to identify the per-cpu perf_event_pmu_context */
        unsigned int                        embedded : 1;

        unsigned int                        nr_events;
        unsigned int                        nr_cgroups;
        unsigned int                        nr_freq;

        atomic_t                        refcount; /* event <-> epc */
        struct rcu_head                        rcu_head;

        /*
         * Set when one or more (plausibly active) event can't be scheduled
         * due to pmu overcommit or pmu constraints, except tolerant to
         * events not necessary to be active due to scheduling constraints,
         * such as cgroups.
         */
        int                                rotate_necessary;
};

static inline bool perf_pmu_ctx_is_active(struct perf_event_pmu_context *epc)
{
        return !list_empty(&epc->flexible_active) || !list_empty(&epc->pinned_active);
}

/*
 * An rb-tree root paired with a 64-bit index. struct perf_event_context
 * keeps one instance each for its pinned and its flexible groups.
 */
struct perf_event_groups {
        struct rb_root        tree;
        u64                index;
};


/**
 * struct perf_event_context - event context structure
 *
 * Used as a container for task events and CPU events as well:
 */
struct perf_event_context {
        /*
         * Protect the states of the events in the list,
         * nr_active, and the list:
         */
        raw_spinlock_t                        lock;
        /*
         * Protect the list of events.  Locking either mutex or lock
         * is sufficient to ensure the list doesn't change; to change
         * the list you need to lock both the mutex and the spinlock.
         */
        struct mutex                        mutex;

        /* NOTE(review): presumably links perf_event_pmu_context::pmu_ctx_entry -- confirm */
        struct list_head                pmu_ctx_list;
        /* Event groups, kept separately for pinned and flexible events */
        struct perf_event_groups        pinned_groups;
        struct perf_event_groups        flexible_groups;
        struct list_head                event_list;

        int                                nr_events;
        int                                nr_user;
        int                                is_active;

        int                                nr_stat;
        int                                nr_freq;
        int                                rotate_disable;

        refcount_t                        refcount; /* event <-> ctx */
        /* NOTE(review): presumably NULL for CPU contexts -- confirm */
        struct task_struct                *task;

        /*
         * Context clock, runs when context enabled.
         */
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;

        /*
         * These fields let us detect when two contexts have both
         * been cloned (inherited) from a common ancestor.
         */
        struct perf_event_context        *parent_ctx;
        u64                                parent_gen;
        u64                                generation;
        int                                pin_count;
#ifdef CONFIG_CGROUP_PERF
        int                                nr_cgroups;         /* cgroup evts */
#endif
        /* The context is RCU freed (see the lifetime notes on perf_event_pmu_context) */
        struct rcu_head                        rcu_head;

        /*
         * The count of events for which using the switch-out fast path
         * should be avoided.
         *
         * Sum (event->pending_work + events with
         *    (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ)))
         *
         * The SIGTRAP is targeted at ctx->task, as such it won't do changing
         * that until the signal is delivered.
         */
        local_t                                nr_no_switch_fast;
};

/**
 * struct perf_ctx_data - PMU specific data for a task
 * @rcu_head:  To avoid a race when freeing the PMU specific data
 * @refcount:  To track users
 * @global:    To track system-wide users
 * @ctx_cache: Kmem cache of PMU specific data
 * @data:      PMU specific data
 *
 * Currently, the struct is only used in Intel LBR call stack mode to
 * save/restore the call stack of a task on context switches.
 *
 * The rcu_head is used to prevent a race when freeing the data.
 * The data is only allocated when Intel LBR call stack mode is enabled.
 * The data will be freed when the mode is disabled.
 * The content of the data will only be accessed in context switch, which
 * should be protected by rcu_read_lock().
 *
 * Because of the alignment requirement of Intel Arch LBR, a Kmem cache
 * is used to allocate the PMU specific data. The ctx_cache tracks
 * that Kmem cache.
 *
 * Careful: struct perf_ctx_data is added as a pointer in struct task_struct.
 * When system-wide Intel LBR call stack mode is enabled, a buffer with
 * constant size will be allocated for each task.
 * Also, system memory consumption can further grow when the size of
 * struct perf_ctx_data increases.
 */
struct perf_ctx_data {
        struct rcu_head                        rcu_head;
        refcount_t                        refcount;
        int                                global;
        struct kmem_cache                *ctx_cache;
        void                                *data;
};

/*
 * Wraps an embedded perf_event_pmu_context (@epc) together with a pointer to
 * a second one (@task_epc), sched-callback bookkeeping and an hrtimer.
 * NOTE(review): presumably one instance per (pmu, CPU) -- confirm.
 */
struct perf_cpu_pmu_context {
        struct perf_event_pmu_context        epc;
        struct perf_event_pmu_context        *task_epc;

        struct list_head                sched_cb_entry;
        int                                sched_cb_usage;

        int                                active_oncpu;
        int                                exclusive;
        int                                pmu_disable_count;

        /* hrtimer state; NOTE(review): presumably guarded by hrtimer_lock -- confirm */
        raw_spinlock_t                        hrtimer_lock;
        struct hrtimer                        hrtimer;
        ktime_t                                hrtimer_interval;
        unsigned int                        hrtimer_active;
};

/**
 * struct perf_cpu_context - per cpu event context structure
 *
 * Embeds a perf_event_context (@ctx) and additionally points at a task
 * context (@task_ctx).
 */
struct perf_cpu_context {
        struct perf_event_context        ctx;
        struct perf_event_context        *task_ctx;
        int                                online;

#ifdef CONFIG_CGROUP_PERF
        struct perf_cgroup                *cgrp;
#endif

        /*
         * Per-CPU storage for iterators used in visit_groups_merge. The default
         * storage is of size 2 to hold the CPU and any CPU event iterators.
         */
        int                                heap_size;
        struct perf_event                **heap;
        struct perf_event                *heap_default[2];
};

/*
 * Handle passed to the perf_output_begin() and perf_aux_output_*() helpers.
 * It names the event and buffer being written and carries
 * position/size bookkeeping.
 */
struct perf_output_handle {
        struct perf_event                *event;
        struct perf_buffer                *rb;
        unsigned long                        wakeup;
        unsigned long                        size;
        /* The three views below share the same 64 bits of storage */
        union {
                u64                        flags;                /* perf_output*() */
                u64                        aux_flags;        /* perf_aux_output*() */
                struct {
                        u64                skip_read : 1;
                };
        };
        /* Shared storage: either a pointer or a head value */
        union {
                void                        *addr;
                unsigned long                head;
        };
        int                                page;
};

/*
 * Bundles the register state, the sample data and the event.
 * NOTE(review): presumably the kernel-side context handed to BPF programs
 * attached to perf events -- confirm against the BPF callers.
 */
struct bpf_perf_event_data_kern {
        bpf_user_pt_regs_t *regs;
        struct perf_sample_data *data;
        struct perf_event *event;
};

#ifdef CONFIG_CGROUP_PERF

/*
 * perf_cgroup_info keeps track of time_enabled for a cgroup.
 * This is a per-cpu dynamically allocated data structure.
 *
 * @time, @timestamp and @timeoffset carry the same names as the context
 * clock fields of struct perf_event_context.
 */
struct perf_cgroup_info {
        u64                                time;
        u64                                timestamp;
        u64                                timeoffset;
        int                                active;
};

/*
 * perf's per-cgroup state: the embedded cgroup_subsys_state (from which
 * perf_cgroup_from_task() recovers this struct via container_of()) and a
 * per-CPU perf_cgroup_info.
 */
struct perf_cgroup {
        struct cgroup_subsys_state        css;
        struct perf_cgroup_info        __percpu *info;
};

/*
 * Return the perf_cgroup that @task's perf_event css belongs to.
 *
 * The cgroup must already be pinned (css_get) by the caller. Put differently,
 * this must not be called when the current CPU context has no cgroup event.
 *
 * With a non-NULL @ctx the css lookup is checked against ctx->lock being
 * held; with a NULL @ctx the check condition is simply true.
 */
static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
        struct cgroup_subsys_state *state;

        state = task_css_check(task, perf_event_cgrp_id,
                               ctx ? lockdep_is_held(&ctx->lock) : true);

        return container_of(state, struct perf_cgroup, css);
}
#endif /* CONFIG_CGROUP_PERF */

#ifdef CONFIG_PERF_EVENTS

extern struct perf_event_context *perf_cpu_task_ctx(void);

extern void *perf_aux_output_begin(struct perf_output_handle *handle,
                                   struct perf_event *event);
extern void perf_aux_output_end(struct perf_output_handle *handle,
                                unsigned long size);
extern int perf_aux_output_skip(struct perf_output_handle *handle,
                                unsigned long size);
extern void *perf_get_aux(struct perf_output_handle *handle);
extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags);
extern void perf_event_itrace_started(struct perf_event *event);

extern int perf_pmu_register(struct pmu *pmu, const char *name, int type);
extern void perf_pmu_unregister(struct pmu *pmu);

extern void __perf_event_task_sched_in(struct task_struct *prev,
                                       struct task_struct *task);
extern void __perf_event_task_sched_out(struct task_struct *prev,
                                        struct task_struct *next);
extern int perf_event_init_task(struct task_struct *child, u64 clone_flags);
extern void perf_event_exit_task(struct task_struct *child);
extern void perf_event_free_task(struct task_struct *task);
extern void perf_event_delayed_put(struct task_struct *task);
extern struct file *perf_event_get(unsigned int fd);
extern const struct perf_event *perf_get_event(struct file *file);
extern const struct perf_event_attr *perf_event_attrs(struct perf_event *event);
extern void perf_event_print_debug(void);
extern void perf_pmu_disable(struct pmu *pmu);
extern void perf_pmu_enable(struct pmu *pmu);
extern void perf_sched_cb_dec(struct pmu *pmu);
extern void perf_sched_cb_inc(struct pmu *pmu);
extern int perf_event_task_disable(void);
extern int perf_event_task_enable(void);

extern void perf_pmu_resched(struct pmu *pmu);

extern int perf_event_refresh(struct perf_event *event, int refresh);
extern void perf_event_update_userpage(struct perf_event *event);
extern int perf_event_release_kernel(struct perf_event *event);
extern struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr,
                                int cpu,
                                struct task_struct *task,
                                perf_overflow_handler_t callback,
                                void *context);
extern void perf_pmu_migrate_context(struct pmu *pmu,
                                int src_cpu, int dst_cpu);
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running);
extern u64 perf_event_read_value(struct perf_event *event,
                                 u64 *enabled, u64 *running);

extern struct perf_callchain_entry *perf_callchain(struct perf_event *event, struct pt_regs *regs);

/* True when PERF_SAMPLE_BRANCH_NO_FLAGS is set in the event's branch_sample_type. */
static inline bool branch_sample_no_flags(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_NO_FLAGS) != 0;
}

/* True when PERF_SAMPLE_BRANCH_NO_CYCLES is set in the event's branch_sample_type. */
static inline bool branch_sample_no_cycles(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_NO_CYCLES) != 0;
}

/* True when PERF_SAMPLE_BRANCH_TYPE_SAVE is set in the event's branch_sample_type. */
static inline bool branch_sample_type(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_TYPE_SAVE) != 0;
}

/* True when PERF_SAMPLE_BRANCH_HW_INDEX is set in the event's branch_sample_type. */
static inline bool branch_sample_hw_index(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_HW_INDEX) != 0;
}

/* True when PERF_SAMPLE_BRANCH_PRIV_SAVE is set in the event's branch_sample_type. */
static inline bool branch_sample_priv(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_PRIV_SAVE) != 0;
}

/* True when PERF_SAMPLE_BRANCH_COUNTERS is set in the event's branch_sample_type. */
static inline bool branch_sample_counters(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_COUNTERS) != 0;
}

/* True when PERF_SAMPLE_BRANCH_CALL_STACK is set in the event's branch_sample_type. */
static inline bool branch_sample_call_stack(const struct perf_event *event)
{
        return (event->attr.branch_sample_type &
                PERF_SAMPLE_BRANCH_CALL_STACK) != 0;
}

/*
 * Data gathered for one sample.
 *
 * @sample_flags records which PERF_SAMPLE_* pieces have already been filled
 * in (the perf_sample_save_*() helpers set the matching bit after storing
 * their data), and @dyn_size accumulates the size of the variable-length
 * parts those helpers add (callchain, raw data, branch stack).
 */
struct perf_sample_data {
        /*
         * Fields set by perf_sample_data_init() unconditionally,
         * group so as to minimize the cachelines touched.
         */
        u64                                sample_flags;
        u64                                period;
        u64                                dyn_size;

        /*
         * Fields commonly set by __perf_event_header__init_id(),
         * group so as to minimize the cachelines touched.
         */
        u64                                type;
        struct {
                u32        pid;
                u32        tid;
        }                                tid_entry;
        u64                                time;
        u64                                id;
        struct {
                u32        cpu;
                u32        reserved;
        }                                cpu_entry;

        /*
         * The other fields, optionally {set,used} by
         * perf_{prepare,output}_sample().
         */
        u64                                ip;
        struct perf_callchain_entry        *callchain;
        struct perf_raw_record                *raw;
        struct perf_branch_stack        *br_stack;
        /* Optional per-branch counters, stored by perf_sample_save_brstack() */
        u64                                *br_stack_cntr;
        union perf_sample_weight        weight;
        union  perf_mem_data_src        data_src;
        u64                                txn;

        struct perf_regs                regs_user;
        struct perf_regs                regs_intr;
        u64                                stack_user_size;

        u64                                stream_id;
        u64                                cgroup;
        /* Set by perf_sample_data_init() only when a non-zero addr is given */
        u64                                addr;
        u64                                phys_addr;
        u64                                data_page_size;
        u64                                code_page_size;
        u64                                aux_size;
} ____cacheline_aligned;

/*
 * Default value for data source: each of the OP, LVL, SNOOP, LOCK, TLB and
 * LVLNUM fields is set to its NA encoding via PERF_MEM_S().
 */
#define PERF_MEM_NA (PERF_MEM_S(OP, NA)   |\
                    PERF_MEM_S(LVL, NA)   |\
                    PERF_MEM_S(SNOOP, NA) |\
                    PERF_MEM_S(LOCK, NA)  |\
                    PERF_MEM_S(TLB, NA)   |\
                    PERF_MEM_S(LVLNUM, NA))

/*
 * Initialise the always-present part of @data: the period, an empty dynamic
 * size and the sample flags. A non-zero @addr is stored as well and flagged
 * with PERF_SAMPLE_ADDR. Everything else is left for perf_prepare_sample().
 */
static inline void perf_sample_data_init(struct perf_sample_data *data,
                                         u64 addr, u64 period)
{
        u64 flags = PERF_SAMPLE_PERIOD;

        if (addr) {
                data->addr = addr;
                flags |= PERF_SAMPLE_ADDR;
        }

        data->sample_flags = flags;
        data->period = period;
        data->dyn_size = 0;
}

static inline void perf_sample_save_callchain(struct perf_sample_data *data,
                                              struct perf_event *event,
                                              struct pt_regs *regs)
{
        int size = 1;

        if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN))
                return;
        if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_CALLCHAIN))
                return;

        data->callchain = perf_callchain(event, regs);
        size += data->callchain->nr;

        data->dyn_size += size * sizeof(u64);
        data->sample_flags |= PERF_SAMPLE_CALLCHAIN;
}

/*
 * Attach the raw record @raw to @data, if @event requested PERF_SAMPLE_RAW.
 *
 * The fragment sizes are summed, the total (plus a u32) is rounded up to a
 * multiple of u64, and the resulting padding is recorded on the last
 * fragment. Warns (once) and returns if raw data was already saved.
 */
static inline void perf_sample_save_raw_data(struct perf_sample_data *data,
                                             struct perf_event *event,
                                             struct perf_raw_record *raw)
{
        struct perf_raw_frag *frag;
        u32 payload = 0;
        int total;

        if (!(event->attr.sample_type & PERF_SAMPLE_RAW))
                return;
        if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_RAW))
                return;

        /* Walk the fragment chain; @frag ends up on the last fragment. */
        for (frag = &raw->frag; ; frag = frag->next) {
                payload += frag->size;
                if (perf_raw_frag_last(frag))
                        break;
        }

        total = round_up(payload + sizeof(u32), sizeof(u64));
        raw->size = total - sizeof(u32);
        frag->pad = raw->size - payload;

        data->raw = raw;
        data->dyn_size += total;
        data->sample_flags |= PERF_SAMPLE_RAW;
}

/* True when the event's sample_type requests PERF_SAMPLE_BRANCH_STACK. */
static inline bool has_branch_stack(struct perf_event *event)
{
        return (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) != 0;
}

/*
 * Attach the branch stack @brs (and optional counters @brs_cntr) to @data,
 * if @event samples branch stacks.
 *
 * brs->nr is clamped to event->attr.sample_max_stack before the size is
 * computed. Warns (once) and returns if a branch stack was already saved.
 */
static inline void perf_sample_save_brstack(struct perf_sample_data *data,
                                            struct perf_event *event,
                                            struct perf_branch_stack *brs,
                                            u64 *brs_cntr)
{
        int bytes = sizeof(u64); /* nr */

        if (!has_branch_stack(event))
                return;
        if (WARN_ON_ONCE(data->sample_flags & PERF_SAMPLE_BRANCH_STACK))
                return;

        /* One more u64 when the hw index is sampled too. */
        if (branch_sample_hw_index(event))
                bytes += sizeof(u64);

        brs->nr = min_t(u16, event->attr.sample_max_stack, brs->nr);
        bytes += brs->nr * sizeof(struct perf_branch_entry);

        /*
         * The counters live in extension space appended after the
         * struct perf_branch_stack: one u64 per branch, holding the
         * occurrences of events for that branch.
         */
        if (brs_cntr)
                bytes += brs->nr * sizeof(u64);

        data->br_stack_cntr = brs_cntr;
        data->br_stack = brs;
        data->dyn_size += bytes;
        data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
}

/*
 * Total size of a sample record: the perf_event_header, the event's fixed
 * header and id-header sizes, and the dynamic size accumulated in @data.
 */
static inline u32 perf_sample_data_size(struct perf_sample_data *data,
                                        struct perf_event *event)
{
        u32 fixed = sizeof(struct perf_event_header);

        fixed += event->header_size + event->id_header_size;

        return fixed + data->dyn_size;
}

/*
 * Clear all bitfields in the perf_branch_entry.
 * The to and from fields are not cleared because they are
 * systematically modified by caller.
 *
 * Note that 'spec' is reset to PERF_BR_SPEC_NA rather than to 0.
 *
 * NOTE(review): only the fields listed below are written; confirm the list
 * still covers every bitfield of struct perf_branch_entry.
 */
static inline void perf_clear_branch_entry_bitfields(struct perf_branch_entry *br)
{
        br->mispred = 0;
        br->predicted = 0;
        br->in_tx = 0;
        br->abort = 0;
        br->cycles = 0;
        br->type = 0;
        br->spec = PERF_BR_SPEC_NA;
        br->reserved = 0;
}

extern void perf_output_sample(struct perf_output_handle *handle,
                               struct perf_event_header *header,
                               struct perf_sample_data *data,
                               struct perf_event *event);
extern void perf_prepare_sample(struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);
extern void perf_prepare_header(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event,
                                struct pt_regs *regs);

extern int perf_event_overflow(struct perf_event *event,
                                 struct perf_sample_data *data,
                                 struct pt_regs *regs);

extern void perf_event_output_forward(struct perf_event *event,
                                     struct perf_sample_data *data,
                                     struct pt_regs *regs);
extern void perf_event_output_backward(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs);
extern int perf_event_output(struct perf_event *event,
                             struct perf_sample_data *data,
                             struct pt_regs *regs);

/*
 * True when @event uses one of the two built-in output handlers
 * (perf_event_output_forward, the expected case, or
 * perf_event_output_backward).
 */
static inline bool
is_default_overflow_handler(struct perf_event *event)
{
        perf_overflow_handler_t handler = event->overflow_handler;

        return likely(handler == perf_event_output_forward) ||
               unlikely(handler == perf_event_output_backward);
}

extern void
perf_event_header__init_id(struct perf_event_header *header,
                           struct perf_sample_data *data,
                           struct perf_event *event);
extern void
perf_event__output_id_sample(struct perf_event *event,
                             struct perf_output_handle *handle,
                             struct perf_sample_data *sample);

extern void
perf_log_lost_samples(struct perf_event *event, u64 lost);

static inline bool event_has_any_exclude_flag(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        return attr->exclude_idle || attr->exclude_user ||
               attr->exclude_kernel || attr->exclude_hv ||
               attr->exclude_guest || attr->exclude_host;
}

static inline bool is_sampling_event(struct perf_event *event)
{
        return event->attr.sample_period != 0;
}

/*
 * Return non-zero for a software event, 0 for a hardware event.
 *
 * The result is the raw PERF_EV_CAP_SOFTWARE bit masked out of
 * event->event_caps, so it is not necessarily 1; treat it as a truth value.
 */
static inline int is_software_event(struct perf_event *event)
{
        return event->event_caps & PERF_EV_CAP_SOFTWARE;
}

/*
 * Return 1 for event in sw context, 0 for event in hw context
 */
static inline int in_software_context(struct perf_event *event)
{
        return event->pmu_ctx->pmu->task_ctx_nr == perf_sw_context;
}

/*
 * Return non-zero when @pmu has PERF_PMU_CAP_EXCLUSIVE set in its
 * capabilities. The value is the masked bit itself, not normalised to 1.
 */
static inline int is_exclusive_pmu(struct pmu *pmu)
{
        return pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE;
}

extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

extern void ___perf_sw_event(u32, u64, struct pt_regs *, u64);
extern void __perf_sw_event(u32, u64, struct pt_regs *, u64);

/*
 * Fallback used when perf_arch_fetch_caller_regs is not already defined as a
 * macro: an empty function that leaves @regs untouched.
 */
#ifndef perf_arch_fetch_caller_regs
static inline void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip) { }
#endif

/*
 * When generating a perf sample in-line, instead of from an interrupt /
 * exception, we lack a pt_regs. This is typically used from software events
 * like: SW_CONTEXT_SWITCHES, SW_MIGRATIONS and the tie-in with tracepoints.
 *
 * We typically don't need a full set, but (for x86) do require:
 * - ip for PERF_SAMPLE_IP
 * - cs for user_mode() tests
 * - sp for PERF_SAMPLE_CALLCHAIN
 * - eflags for MISC bits and CALLCHAIN (see: perf_hw_regs())
 *
 * NOTE: assumes @regs is otherwise already 0 filled; this is important for
 * things like PERF_SAMPLE_REGS_INTR.
 */
static inline void perf_fetch_caller_regs(struct pt_regs *regs)
{
        /* CALLER_ADDR0 is handed to the arch hook as the ip argument. */
        perf_arch_fetch_caller_regs(regs, CALLER_ADDR0);
}

/*
 * Report software event @event_id. The call into __perf_sw_event() only
 * happens when the static key for that event id is enabled.
 */
static __always_inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        if (!static_key_false(&perf_swevent_enabled[event_id]))
                return;

        __perf_sw_event(event_id, nr, regs, addr);
}

DECLARE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Scheduler-only variant. It hard-assumes there is no recursion, which holds
 * because we never actually schedule inside other swevents (those disable
 * preemption). It therefore always uses slot 0 of the per-CPU __perf_regs
 * array.
 */
static __always_inline void __perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
{
        struct pt_regs *scratch = this_cpu_ptr(&__perf_regs[0]);

        perf_fetch_caller_regs(scratch);
        ___perf_sw_event(event_id, nr, scratch, addr);
}

extern struct static_key_false perf_sched_events;

/* Test the static key perf_swevent_enabled[@swevt] for software event @swevt. */
static __always_inline bool __perf_sw_enabled(int swevt)
{
        return static_key_false(&perf_swevent_enabled[swevt]);
}

/*
 * Mark @task as migrated, but only while the CPU-migrations software event
 * is enabled. perf_event_task_sched_in() consumes and clears the mark.
 */
static inline void perf_event_task_migrate(struct task_struct *task)
{
        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;

        task->sched_migrated = 1;
}

/*
 * Sched-in hook: run the perf sched-in path when perf_sched_events is
 * enabled, then emit a CPU-migrations software event if @task was marked as
 * migrated, clearing the mark afterwards.
 */
static inline void perf_event_task_sched_in(struct task_struct *prev,
                                            struct task_struct *task)
{
        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_in(prev, task);

        if (!__perf_sw_enabled(PERF_COUNT_SW_CPU_MIGRATIONS))
                return;
        if (!task->sched_migrated)
                return;

        __perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
        task->sched_migrated = 0;
}

/*
 * Sched-out hook: emit the context-switch software event, emit the
 * cgroup-switch software event when @prev and @next belong to different perf
 * cgroups (CONFIG_CGROUP_PERF only), and finally run the perf sched-out path
 * when perf_sched_events is enabled.
 */
static inline void perf_event_task_sched_out(struct task_struct *prev,
                                             struct task_struct *next)
{
        if (__perf_sw_enabled(PERF_COUNT_SW_CONTEXT_SWITCHES))
                __perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);

#ifdef CONFIG_CGROUP_PERF
        if (__perf_sw_enabled(PERF_COUNT_SW_CGROUP_SWITCHES)) {
                struct perf_cgroup *from = perf_cgroup_from_task(prev, NULL);
                struct perf_cgroup *to = perf_cgroup_from_task(next, NULL);

                if (from != to)
                        __perf_sw_event_sched(PERF_COUNT_SW_CGROUP_SWITCHES, 1, 0);
        }
#endif

        if (static_branch_unlikely(&perf_sched_events))
                __perf_event_task_sched_out(prev, next);
}

extern void perf_event_mmap(struct vm_area_struct *vma);

extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                               bool unregister, const char *sym);
extern void perf_event_bpf_event(struct bpf_prog *prog,
                                 enum perf_bpf_event_type type,
                                 u16 flags);

/*
 * Guest perf callbacks.
 *
 * With CONFIG_GUEST_PERF_EVENTS the three accessors below dispatch through
 * static calls declared against the members of perf_guest_cbs; without it
 * they are stubs that return 0.
 */
#ifdef CONFIG_GUEST_PERF_EVENTS
extern struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state);
DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/* Guest state bits; tested against PERF_GUEST_ACTIVE / PERF_GUEST_USER by callers. */
static inline unsigned int perf_guest_state(void)
{
        return static_call(__perf_guest_state)();
}
/* Result of the get_ip guest callback. */
static inline unsigned long perf_guest_get_ip(void)
{
        return static_call(__perf_guest_get_ip)();
}
/* Result of the handle_intel_pt_intr guest callback. */
static inline unsigned int perf_guest_handle_intel_pt_intr(void)
{
        return static_call(__perf_guest_handle_intel_pt_intr)();
}
extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
extern void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs);
#else
/* No guest support configured: every accessor reports 0. */
static inline unsigned int perf_guest_state(void)                 { return 0; }
static inline unsigned long perf_guest_get_ip(void)                 { return 0; }
static inline unsigned int perf_guest_handle_intel_pt_intr(void) { return 0; }
#endif /* CONFIG_GUEST_PERF_EVENTS */

extern void perf_event_exec(void);
extern void perf_event_comm(struct task_struct *tsk, bool exec);
extern void perf_event_namespaces(struct task_struct *tsk);
extern void perf_event_fork(struct task_struct *tsk);
extern void perf_event_text_poke(const void *addr,
                                 const void *old_bytes, size_t old_len,
                                 const void *new_bytes, size_t new_len);

/* Callchains */
DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);

extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
                   u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);
extern struct perf_callchain_entry *get_callchain_entry(int *rctx);
extern void put_callchain_entry(int rctx);

extern int sysctl_perf_event_max_stack;
extern int sysctl_perf_event_max_contexts_per_stack;

/*
 * Append a context marker @ip to the callchain.
 *
 * Returns 0 on success. Once the number of stored contexts has reached
 * sysctl_perf_event_max_contexts_per_stack, nothing is stored,
 * ctx->contexts_maxed is set and -1 is returned.
 */
static inline int perf_callchain_store_context(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *entry;

        if (ctx->contexts >= sysctl_perf_event_max_contexts_per_stack) {
                ctx->contexts_maxed = true;
                return -1; /* no more room, stop walking the stack */
        }

        entry = ctx->entry;
        entry->ip[entry->nr++] = ip;
        ++ctx->contexts;

        return 0;
}

/*
 * Append @ip to the callchain.
 *
 * Returns 0 on success, or -1 without storing anything when ctx->max_stack
 * entries have already been stored or the context limit was hit earlier
 * (ctx->contexts_maxed).
 */
static inline int perf_callchain_store(struct perf_callchain_entry_ctx *ctx, u64 ip)
{
        struct perf_callchain_entry *entry;

        if (ctx->nr >= ctx->max_stack || ctx->contexts_maxed)
                return -1; /* no more room, stop walking the stack */

        entry = ctx->entry;
        entry->ip[entry->nr++] = ip;
        ++ctx->nr;

        return 0;
}

extern int sysctl_perf_event_paranoid;
extern int sysctl_perf_event_sample_rate;

extern void perf_sample_event_took(u64 sample_len_ns);

/* Access to perf_event_open(2) syscall. */
#define PERF_SECURITY_OPEN                0

/* Finer grained perf_event_open(2) access control. */
#define PERF_SECURITY_CPU                1
#define PERF_SECURITY_KERNEL                2
#define PERF_SECURITY_TRACEPOINT        3

/* Return 1 when sysctl_perf_event_paranoid is above -1 (i.e. at least 0), else 0. */
static inline int perf_is_paranoid(void)
{
        return sysctl_perf_event_paranoid >= 0;
}

int perf_allow_kernel(void);

static inline int perf_allow_cpu(void)
{
        if (sysctl_perf_event_paranoid > 0 && !perfmon_capable())
                return -EACCES;

        return security_perf_event_open(PERF_SECURITY_CPU);
}

/*
 * Permission check for PERF_SECURITY_TRACEPOINT access.
 *
 * Returns -EPERM when sysctl_perf_event_paranoid is above -1 and the caller
 * is not perfmon_capable(); otherwise returns whatever
 * security_perf_event_open(PERF_SECURITY_TRACEPOINT) returns.
 */
static inline int perf_allow_tracepoint(void)
{
        /* perfmon_capable() is only consulted in paranoid mode. */
        if (perf_is_paranoid() && !perfmon_capable())
                return -EPERM;

        return security_perf_event_open(PERF_SECURITY_TRACEPOINT);
}

extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs);

extern void perf_event_init(void);
extern void perf_tp_event(u16 event_type, u64 count, void *record,
                          int entry_size, struct pt_regs *regs,
                          struct hlist_head *head, int rctx,
                          struct task_struct *task);
extern void perf_bp_event(struct perf_event *event, void *data);

extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs);
extern unsigned long perf_instruction_pointer(struct perf_event *event,
                                              struct pt_regs *regs);

/*
 * Generic fallbacks, used when perf_arch_misc_flags is not already defined:
 * misc flags are derived from user_mode(regs) and the instruction pointer
 * comes from instruction_pointer(regs).
 *
 * Note that perf_arch_instruction_pointer() is defined under the same
 * perf_arch_misc_flags guard; it has no guard of its own.
 */
#ifndef perf_arch_misc_flags
# define perf_arch_misc_flags(regs) \
                (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL)
# define perf_arch_instruction_pointer(regs)        instruction_pointer(regs)
#endif
/* Fallback when perf_arch_bpf_user_pt_regs is not already defined: pass @regs through unchanged. */
#ifndef perf_arch_bpf_user_pt_regs
# define perf_arch_bpf_user_pt_regs(regs) regs
#endif

#ifndef perf_arch_guest_misc_flags
/*
 * Default guest classification, built from perf_guest_state():
 *  - 0 when PERF_GUEST_ACTIVE is not set;
 *  - PERF_RECORD_MISC_GUEST_USER when PERF_GUEST_USER is also set;
 *  - PERF_RECORD_MISC_GUEST_KERNEL otherwise.
 * @regs is not consulted by this default implementation.
 */
static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs)
{
        const unsigned long state = perf_guest_state();

        if (state & PERF_GUEST_ACTIVE) {
                if (state & PERF_GUEST_USER)
                        return PERF_RECORD_MISC_GUEST_USER;

                return PERF_RECORD_MISC_GUEST_KERNEL;
        }

        return 0;
}
# define perf_arch_guest_misc_flags(regs)        perf_arch_guest_misc_flags(regs)
#endif

/* True when @event's attr.branch_sample_type is nonzero. */
static inline bool needs_branch_stack(struct perf_event *event)
{
        return !!event->attr.branch_sample_type;
}

/* True when the PMU behind @event has its setup_aux member set. */
static inline bool has_aux(struct perf_event *event)
{
        if (event->pmu->setup_aux)
                return true;

        return false;
}

/*
 * True when any of attr.aux_sample_size, attr.aux_pause or attr.aux_resume
 * is nonzero for @event.
 */
static inline bool has_aux_action(struct perf_event *event)
{
        if (event->attr.aux_sample_size)
                return true;

        if (event->attr.aux_pause)
                return true;

        return event->attr.aux_resume != 0;
}

/* True when @event's attr.write_backward is set. */
static inline bool is_write_backward(struct perf_event *event)
{
        return event->attr.write_backward != 0;
}

/* True when the PMU behind @event reports a nonzero nr_addr_filters. */
static inline bool has_addr_filter(struct perf_event *event)
{
        return event->pmu->nr_addr_filters != 0;
}

/*
 * Return the address-filters head that applies to @event.  An inherited
 * event (one with a parent) uses its parent's filters rather than its own.
 */
static inline struct perf_addr_filters_head *
perf_event_addr_filters(struct perf_event *event)
{
        struct perf_event *owner = event->parent ? event->parent : event;

        return &owner->addr_filters;
}

/*
 * Return the address of the fasync pointer to use for @event.  Only the
 * parent has fasync state, so an event with a parent resolves to the
 * parent's field.
 */
static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
{
        if (event->parent)
                return &event->parent->fasync;

        return &event->fasync;
}

extern void perf_event_addr_filters_sync(struct perf_event *event);
extern void perf_report_aux_output_id(struct perf_event *event, u64 hw_id);

extern int perf_output_begin(struct perf_output_handle *handle,
                             struct perf_sample_data *data,
                             struct perf_event *event, unsigned int size);
extern int perf_output_begin_forward(struct perf_output_handle *handle,
                                     struct perf_sample_data *data,
                                     struct perf_event *event,
                                     unsigned int size);
extern int perf_output_begin_backward(struct perf_output_handle *handle,
                                      struct perf_sample_data *data,
                                      struct perf_event *event,
                                      unsigned int size);

extern void perf_output_end(struct perf_output_handle *handle);
extern unsigned int perf_output_copy(struct perf_output_handle *handle,
                             const void *buf, unsigned int len);
extern unsigned int perf_output_skip(struct perf_output_handle *handle,
                                     unsigned int len);
extern long perf_output_copy_aux(struct perf_output_handle *aux_handle,
                                 struct perf_output_handle *handle,
                                 unsigned long from, unsigned long to);
extern int perf_swevent_get_recursion_context(void);
extern void perf_swevent_put_recursion_context(int rctx);
extern u64 perf_swevent_set_period(struct perf_event *event);
extern void perf_event_enable(struct perf_event *event);
extern void perf_event_disable(struct perf_event *event);
extern void perf_event_disable_local(struct perf_event *event);
extern void perf_event_disable_inatomic(struct perf_event *event);
extern void perf_event_task_tick(void);
extern int perf_event_account_interrupt(struct perf_event *event);
extern int perf_event_period(struct perf_event *event, u64 value);
extern u64 perf_event_pause(struct perf_event *event, bool reset);
#else /* !CONFIG_PERF_EVENTS: */
/*
 * Stub definitions for the !CONFIG_PERF_EVENTS branch.  Each one ignores its
 * arguments and returns a fixed value (or nothing).
 *
 * AUX output stubs: the pointer-returning helpers yield NULL and
 * perf_aux_output_skip() reports -EINVAL.
 */
static inline void *
perf_aux_output_begin(struct perf_output_handle *handle,
                      struct perf_event *event)                                { return NULL; }
static inline void
perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
                                                                        { }
static inline int
perf_aux_output_skip(struct perf_output_handle *handle,
                     unsigned long size)                                { return -EINVAL; }
static inline void *
perf_get_aux(struct perf_output_handle *handle)                                { return NULL; }
/*
 * Per-task stubs: all are no-ops; perf_event_init_task() returns 0 so that
 * callers see success.
 */
static inline void
perf_event_task_migrate(struct task_struct *task)                        { }
static inline void
perf_event_task_sched_in(struct task_struct *prev,
                         struct task_struct *task)                        { }
static inline void
perf_event_task_sched_out(struct task_struct *prev,
                          struct task_struct *next)                        { }
static inline int perf_event_init_task(struct task_struct *child,
                                       u64 clone_flags)                        { return 0; }
static inline void perf_event_exit_task(struct task_struct *child)        { }
static inline void perf_event_free_task(struct task_struct *task)        { }
static inline void perf_event_delayed_put(struct task_struct *task)        { }
/*
 * !CONFIG_PERF_EVENTS stubs for looking up and querying events: the
 * pointer-returning helpers yield ERR_PTR(-EINVAL) and the int-returning
 * ones yield -EINVAL.
 */
static inline struct file *perf_event_get(unsigned int fd)        { return ERR_PTR(-EINVAL); }
static inline const struct perf_event *perf_get_event(struct file *file)
{
        return ERR_PTR(-EINVAL);
}
static inline const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        return ERR_PTR(-EINVAL);
}
/* @value, @enabled and @running are left untouched by this stub. */
static inline int perf_event_read_local(struct perf_event *event, u64 *value,
                                        u64 *enabled, u64 *running)
{
        return -EINVAL;
}
static inline void perf_event_print_debug(void)                                { }
static inline int perf_event_task_disable(void)                                { return -EINVAL; }
static inline int perf_event_task_enable(void)                                { return -EINVAL; }
static inline int perf_event_refresh(struct perf_event *event, int refresh)
{
        return -EINVAL;
}

/* Event-reporting stubs: every one of these is an empty no-op. */
static inline void
perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)        { }
static inline void
perf_bp_event(struct perf_event *event, void *data)                        { }

static inline void perf_event_mmap(struct vm_area_struct *vma)                { }

/* Function type taking a name buffer, its length, and an opaque data pointer. */
typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data);
static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len,
                                      bool unregister, const char *sym)        { }
static inline void perf_event_bpf_event(struct bpf_prog *prog,
                                        enum perf_bpf_event_type type,
                                        u16 flags)                        { }
static inline void perf_event_exec(void)                                { }
static inline void perf_event_comm(struct task_struct *tsk, bool exec)        { }
static inline void perf_event_namespaces(struct task_struct *tsk)        { }
static inline void perf_event_fork(struct task_struct *tsk)                { }
static inline void perf_event_text_poke(const void *addr,
                                        const void *old_bytes,
                                        size_t old_len,
                                        const void *new_bytes,
                                        size_t new_len)                        { }
/*
 * Remaining !CONFIG_PERF_EVENTS stubs.  The void ones do nothing; the others
 * return the fixed values shown: -1 for the recursion context and
 * __perf_event_disable(), -EINVAL for perf_event_period(), and 0 for
 * perf_swevent_set_period(), perf_event_release_kernel(), perf_event_pause()
 * and perf_exclude_event().
 */
static inline void perf_event_init(void)                                { }
static inline int  perf_swevent_get_recursion_context(void)                { return -1; }
static inline void perf_swevent_put_recursion_context(int rctx)                { }
static inline u64 perf_swevent_set_period(struct perf_event *event)        { return 0; }
static inline void perf_event_enable(struct perf_event *event)                { }
static inline void perf_event_disable(struct perf_event *event)                { }
static inline int __perf_event_disable(void *info)                        { return -1; }
static inline void perf_event_task_tick(void)                                { }
static inline int perf_event_release_kernel(struct perf_event *event)        { return 0; }
static inline int perf_event_period(struct perf_event *event, u64 value)
{
        return -EINVAL;
}
static inline u64 perf_event_pause(struct perf_event *event, bool reset)
{
        return 0;
}
static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
        return 0;
}
#endif

/*
 * perf_restore_debug_store() is declared as an external function only when
 * both CONFIG_PERF_EVENTS and CONFIG_CPU_SUP_INTEL are defined; in every
 * other configuration it is an empty inline stub.
 */
#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
extern void perf_restore_debug_store(void);
#else
static inline void perf_restore_debug_store(void)                        { }
#endif

/*
 * Copy the object @x to @handle by passing its address and sizeof(x) to
 * perf_output_copy().  @x must be an lvalue, since its address is taken.
 */
#define perf_output_put(handle, x) perf_output_copy((handle), &(x), sizeof(x))

/*
 * Device attribute carrying a PMU event description.  Instances are built by
 * the PMU_EVENT_ATTR*() macros in this header.
 */
struct perf_pmu_events_attr {
        struct device_attribute attr;   /* initialised with __ATTR() by the macros */
        u64 id;                         /* set from _id; PMU_EVENT_ATTR_STRING() uses 0 */
        const char *event_str;          /* set by PMU_EVENT_ATTR_STRING() only */
};

/*
 * Event attribute carrying two alternative event strings.
 * NOTE(review): the _ht/_noht suffixes presumably select between
 * hyper-threading enabled and disabled -- confirm against the users.
 */
struct perf_pmu_events_ht_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str_ht;
        const char                                *event_str_noht;
};

/*
 * Event attribute that additionally records a pmu_type value.
 * NOTE(review): presumably used to restrict the event to matching PMUs on
 * hybrid systems -- confirm against the users.
 */
struct perf_pmu_events_hybrid_attr {
        struct device_attribute                        attr;
        u64                                        id;
        const char                                *event_str;
        u64                                        pmu_type;
};

/*
 * Format attribute paired with a pmu_type value.
 * NOTE(review): presumably the format-attribute counterpart of
 * struct perf_pmu_events_hybrid_attr -- confirm against the users.
 */
struct perf_pmu_format_hybrid_attr {
        struct device_attribute                        attr;
        u64                                        pmu_type;
};

ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page);

/*
 * Define a static struct perf_pmu_events_attr named @_var whose attribute is
 * called @_name, has mode 0444, uses @_show as its show callback (no store
 * callback) and carries @_id in .id.  The expansion already ends with ';'.
 */
#define PMU_EVENT_ATTR(_name, _var, _id, _show)                                \
static struct perf_pmu_events_attr _var = {                                \
        .attr = __ATTR(_name, 0444, _show, NULL),                        \
        .id   =  _id,                                                        \
};

/*
 * Define a static struct perf_pmu_events_attr named @_var whose attribute is
 * called @_name, has mode 0444 and is shown by perf_event_sysfs_show().
 * .id is 0 and .event_str is @_str.  The expansion already ends with ';'.
 */
#define PMU_EVENT_ATTR_STRING(_name, _var, _str)                            \
static struct perf_pmu_events_attr _var = {                                    \
        .attr                = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id                = 0,                                                    \
        .event_str        = _str,                                                    \
};

/*
 * Expression form of PMU_EVENT_ATTR(): builds a one-element compound-literal
 * array of struct perf_pmu_events_attr (mode 0444, @_show callback, .id set
 * to @_id) and evaluates to the address of that element's .attr.attr member.
 */
#define PMU_EVENT_ATTR_ID(_name, _show, _id)                                \
        (&((struct perf_pmu_events_attr[]) {                                \
                { .attr = __ATTR(_name, 0444, _show, NULL),                \
                  .id = _id, }                                                \
        })[0].attr.attr)

/*
 * Define a static show function named <_name>_show that writes the string
 * literal @_format followed by a newline into @page and returns sprintf()'s
 * result.
 *
 * - BUILD_BUG_ON() rejects, at build time, a @_format whose sizeof is
 *   PAGE_SIZE or more.
 * - @_format is pasted directly into the sprintf() format string, so it must
 *   be a string literal and any '%' in it would be interpreted as a
 *   conversion specifier.
 * - The closing brace line ends with a line-continuation backslash, so the
 *   line following the macro is part of it and must stay blank.
 */
#define PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
static ssize_t                                                                \
_name##_show(struct device *dev,                                        \
                               struct device_attribute *attr,                \
                               char *page)                                \
{                                                                        \
        BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);                        \
        return sprintf(page, _format "\n");                                \
}                                                                        \

/*
 * Emit the <_name>_show function via PMU_FORMAT_ATTR_SHOW() and then define
 * a static device attribute format_attr_<_name> with __ATTR_RO(_name).
 * The expansion has no trailing ';', so the caller supplies it.
 */
#define PMU_FORMAT_ATTR(_name, _format)                                        \
        PMU_FORMAT_ATTR_SHOW(_name, _format)                                \
                                                                        \
static struct device_attribute format_attr_##_name = __ATTR_RO(_name)

/*
 * Performance counter hotplug functions.
 *
 * With CONFIG_PERF_EVENTS both take a CPU number and return an int.  Without
 * it, both names are defined as NULL; presumably this lets callers still pass
 * them where a callback pointer is expected -- confirm against the users.
 */
#ifdef CONFIG_PERF_EVENTS
int perf_event_init_cpu(unsigned int cpu);
int perf_event_exit_cpu(unsigned int cpu);
#else
#define perf_event_init_cpu        NULL
#define perf_event_exit_cpu        NULL
#endif

extern void arch_perf_update_userpage(struct perf_event *event,
                                      struct perf_event_mmap_page *userpg,
                                      u64 now);

/*
 * Snapshot branch stack on software events.
 *
 * A branch stack can be very useful in understanding software events. For
 * example, when a long function, e.g. sys_perf_event_open, returns an
 * errno, it is not obvious why the function failed. A branch stack can
 * provide very helpful information in this type of scenario.
 *
 * On a software event, it is necessary to stop the hardware branch
 * recorder quickly. Otherwise, the hardware register/buffer will be
 * flushed with entries of the triggering event. Therefore, a static call
 * is used to stop the hardware recorder.
 */

/*
 * @cnt is the number of entries allocated for @entries.
 * Returns the number of entries copied to @entries.
 */
typedef int (perf_snapshot_branch_stack_t)(struct perf_branch_entry *entries,
                                           unsigned int cnt);
DECLARE_STATIC_CALL(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);

/*
 * Default no-op perf_lopwr_cb(), compiled only when PERF_NEEDS_LOPWR_CB is
 * not defined.  @mode is ignored.
 */
#ifndef PERF_NEEDS_LOPWR_CB
static inline void perf_lopwr_cb(bool mode)
{
}
#endif

#endif /* _LINUX_PERF_EVENT_H */






































































































































  265 








  265 



  265 







  265 
  265 
  265 








  264 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 






    1 

    1 










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Linux Socket Filter - Kernel level socket filtering
 *
 * Based on the design of the Berkeley Packet Filter. The new
 * internal format has been designed by PLUMgrid:
 *
 *        Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
 *
 * Authors:
 *
 *        Jay Schulist <jschlst@samba.org>
 *        Alexei Starovoitov <ast@plumgrid.com>
 *        Daniel Borkmann <dborkman@redhat.com>
 *
 * Andi Kleen - Fix a few bad bugs and races.
 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
 */

#include <linux/atomic.h>
#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/sock_diag.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/if_arp.h>
#include <linux/gfp.h>
#include <net/inet_common.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
#include <net/ipv6_stubs.h>
#include <net/bpf_sk_storage.h>
#include <net/transp_v6.h>
#include <linux/btf_ids.h>
#include <net/tls.h>
#include <net/xdp.h>
#include <net/mptcp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netkit.h>
#include <linux/un.h>
#include <net/xdp_sock_drv.h>
#include <net/inet_dscp.h>

#include "dev.h"

/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");

static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);

/*
 * copy_bpf_fprog_from_user - read a struct sock_fprog from a sockptr
 * @dst: descriptor to fill in
 * @src: location the descriptor is copied from
 * @len: size of the buffer at @src
 *
 * Outside of a compat syscall, @len must equal sizeof(struct sock_fprog) and
 * the structure is copied into @dst as is. Inside a compat syscall, @len
 * must equal sizeof(struct compat_sock_fprog); that structure is copied in,
 * @dst is zeroed, and its len and filter members are filled from the compat
 * values (the filter pointer going through compat_ptr()).
 *
 * Returns 0 on success, -EINVAL when @len does not match the expected
 * structure size, or -EFAULT when copy_from_sockptr() fails.
 */
int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
{
        struct compat_sock_fprog compat;

        if (!in_compat_syscall()) {
                /* Native layout: copy straight into the destination. */
                if (len != sizeof(*dst))
                        return -EINVAL;
                if (copy_from_sockptr(dst, src, sizeof(*dst)))
                        return -EFAULT;

                return 0;
        }

        if (len != sizeof(compat))
                return -EINVAL;
        if (copy_from_sockptr(&compat, src, sizeof(compat)))
                return -EFAULT;

        /* Clear everything first so only len and filter carry user data. */
        memset(dst, 0, sizeof(*dst));
        dst->len = compat.len;
        dst->filter = compat_ptr(compat.filter);

        return 0;
}
EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);

/**
 *        sk_filter_trim_cap - run a packet through a socket filter
 *        @sk: sock associated with &sk_buff
 *        @skb: buffer to filter
 *        @cap: lower bound on the length the filter may trim the packet to
 *
 * Socket level wrapper around running the filter attached to @sk. Before
 * the filter runs, the skb has to pass the pfmemalloc check, the cgroup
 * inet ingress hook and security_sock_rcv_skb(); a failure in any of these
 * is returned without running the filter.
 *
 * If @sk has a filter, its program is run with skb->sk temporarily pointing
 * at @sk. The program's return value is the packet length to keep: 0 means
 * the packet is tossed, anything else trims the skb to the larger of @cap
 * and that length. If skb->len is smaller than that length the whole
 * skb->data is kept.
 *
 * Return: 0 if the packet should be accepted, -EPERM if the filter asked
 * for it to be tossed, -ENOMEM if a pfmemalloc skb was refused, or the
 * error reported by the cgroup hook, security_sock_rcv_skb() or
 * pskb_trim().
 */
int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
{
        struct sk_filter *filter;
        int err;

        /*
         * An skb backed by pfmemalloc reserves may only be consumed by a
         * SOCK_MEMALLOC socket, since such a socket is helping to free
         * memory.
         */
        if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
                return -ENOMEM;
        }

        /* The security hook is only consulted once the cgroup hook passed. */
        err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
        if (!err)
                err = security_sock_rcv_skb(sk, skb);
        if (err)
                return err;

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter) {
                struct sock *orig_sk = skb->sk;
                unsigned int keep_len;

                /* Run the program with skb->sk set to @sk, then restore it. */
                skb->sk = sk;
                keep_len = bpf_prog_run_save_cb(filter->prog, skb);
                skb->sk = orig_sk;

                if (keep_len)
                        err = pskb_trim(skb, max(cap, keep_len));
                else
                        err = -EPERM;
        }
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL(sk_filter_trim_cap);

/*
 * BPF helper: hand back whatever skb_get_poff() reports for @skb
 * (presumably the payload offset, going by the helper's name - confirm
 * against skb_get_poff()).
 */
BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb)
{
        return skb_get_poff(skb);
}

/*
 * BPF helper: search the linear data of @skb, starting at byte offset @a,
 * for a netlink attribute of type @x using nla_find().
 *
 * Returns the offset of the attribute found, relative to skb->data, or 0
 * when the skb is non-linear, when no attribute header fits at @a, or when
 * nla_find() comes back empty.
 */
BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *attr;

        /*
         * A whole attribute header must fit at @a. The plain length test
         * runs first so that the subtraction after it cannot wrap.
         */
        if (skb_is_nonlinear(skb) ||
            skb->len < sizeof(*attr) ||
            a > skb->len - sizeof(*attr))
                return 0;

        attr = nla_find((struct nlattr *)&skb->data[a], skb->len - a, x);
        if (!attr)
                return 0;

        return (void *)attr - (void *)skb->data;
}

/*
 * BPF helper: treat the linear data of @skb at byte offset @a as a netlink
 * attribute and search inside it, via nla_find_nested(), for an attribute
 * of type @x.
 *
 * Returns the offset of the attribute found, relative to skb->data, or 0
 * when the skb is non-linear, when no attribute header fits at @a, when
 * nla_ok() rejects the attribute at @a, or when nothing matches.
 */
BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
{
        struct nlattr *outer, *inner;

        /*
         * A whole attribute header must fit at @a. The plain length test
         * runs first so that the subtraction after it cannot wrap.
         */
        if (skb_is_nonlinear(skb) ||
            skb->len < sizeof(*outer) ||
            a > skb->len - sizeof(*outer))
                return 0;

        /* Validate the container against the bytes remaining after @a. */
        outer = (struct nlattr *)&skb->data[a];
        if (!nla_ok(outer, skb->len - a))
                return 0;

        inner = nla_find_nested(outer, x);
        if (!inner)
                return 0;

        return (void *)inner - (void *)skb->data;
}

/* Map a cBPF load offset onto an offset relative to skb->data.
 *
 * Non-negative offsets are returned as-is. Negative offsets at or above
 * SKF_NET_OFF are rebased onto the network header offset; those at or above
 * SKF_LL_OFF are rebased onto the MAC header offset, provided the MAC header
 * was set. Anything else yields INT_MIN, which callers treat as invalid.
 */
static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
{
        int converted = INT_MIN;

        if (likely(offset >= 0))
                converted = offset;
        else if (offset >= SKF_NET_OFF)
                converted = offset - SKF_NET_OFF + skb_network_offset(skb);
        else if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
                converted = offset - SKF_LL_OFF + skb_mac_offset(skb);

        return converted;
}

/* Load one byte of packet data for a cBPF program.
 *
 * @data and @headlen describe the linear part of @skb; @offset is first
 * translated by bpf_skb_load_helper_convert_offset(). If the byte lies
 * within @headlen it is read directly from @data, otherwise it is fetched
 * with skb_copy_bits().
 *
 * Returns the byte value, or -EFAULT if the offset is invalid or the copy
 * fails.
 */
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        u8 val;
        const int width = sizeof(val);

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Fast path: the byte lies inside the linear area. */
        if (headlen - offset >= width)
                return *(u8 *)(data + offset);

        /* Slow path: let skb_copy_bits() fetch it. */
        if (skb_copy_bits(skb, offset, &val, sizeof(val)))
                return -EFAULT;

        return val;
}

/* Variant of bpf_skb_load_helper_8 for callers that have not cached the
 * data pointer and head length: both are derived from @skb on each call.
 */
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_8(skb, skb->data, headlen, offset);
}

/* Load a big-endian 16-bit value of packet data for a cBPF program.
 *
 * @data and @headlen describe the linear part of @skb; @offset is first
 * translated by bpf_skb_load_helper_convert_offset(). If both bytes lie
 * within @headlen they are read directly from @data, otherwise they are
 * fetched with skb_copy_bits().
 *
 * Returns the value in CPU byte order, or -EFAULT if the offset is invalid
 * or the copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be16 val;
        const int width = sizeof(val);

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Fast path: the value lies inside the linear area. */
        if (headlen - offset >= width)
                return get_unaligned_be16(data + offset);

        /* Slow path: let skb_copy_bits() fetch it. */
        if (skb_copy_bits(skb, offset, &val, sizeof(val)))
                return -EFAULT;

        return be16_to_cpu(val);
}

/* Variant of bpf_skb_load_helper_16 for callers that have not cached the
 * data pointer and head length: both are derived from @skb on each call.
 */
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_16(skb, skb->data, headlen, offset);
}

/* Load a big-endian 32-bit value of packet data for a cBPF program.
 *
 * @data and @headlen describe the linear part of @skb; @offset is first
 * translated by bpf_skb_load_helper_convert_offset(). If all four bytes lie
 * within @headlen they are read directly from @data, otherwise they are
 * fetched with skb_copy_bits().
 *
 * Returns the value in CPU byte order, or -EFAULT if the offset is invalid
 * or the copy fails.
 */
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
           data, int, headlen, int, offset)
{
        __be32 val;
        const int width = sizeof(val);

        offset = bpf_skb_load_helper_convert_offset(skb, offset);
        if (offset == INT_MIN)
                return -EFAULT;

        /* Fast path: the value lies inside the linear area. */
        if (headlen - offset >= width)
                return get_unaligned_be32(data + offset);

        /* Slow path: let skb_copy_bits() fetch it. */
        if (skb_copy_bits(skb, offset, &val, sizeof(val)))
                return -EFAULT;

        return be32_to_cpu(val);
}

/* Variant of bpf_skb_load_helper_32 for callers that have not cached the
 * data pointer and head length: both are derived from @skb on each call.
 */
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
           int, offset)
{
        const int headlen = skb->len - skb->data_len;

        return ____bpf_skb_load_helper_32(skb, skb->data, headlen, offset);
}

/* Emit eBPF instructions into @insn_buf that load the sk_buff field selected
 * by @skb_field (one of the SKF_AD_* values handled below) from the skb
 * pointed to by @src_reg into @dst_reg.
 *
 * Returns the number of instructions written; 0 if @skb_field is not one of
 * the handled values.
 */
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
                              struct bpf_insn *insn_buf)
{
        u32 cnt = 0;

        switch (skb_field) {
        case SKF_AD_MARK:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);

                /* dst_reg = *(u32 *) (src_reg + offsetof(mark)) */
                insn_buf[cnt++] = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                              offsetof(struct sk_buff, mark));
                break;

        case SKF_AD_PKTTYPE:
                /* Load the byte holding pkt_type, then isolate its bits. */
                insn_buf[cnt++] = BPF_LDX_MEM(BPF_B, dst_reg, src_reg,
                                              PKT_TYPE_OFFSET);
                insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
#endif
                break;

        case SKF_AD_QUEUE:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);

                /* dst_reg = *(u16 *) (src_reg + offsetof(queue_mapping)) */
                insn_buf[cnt++] = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                              offsetof(struct sk_buff,
                                                       queue_mapping));
                break;

        case SKF_AD_VLAN_TAG:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);

                /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
                insn_buf[cnt++] = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
                                              offsetof(struct sk_buff, vlan_tci));
                break;

        case SKF_AD_VLAN_TAG_PRESENT:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);

                /* dst_reg = vlan_all; if it is non-zero, normalize to 1. */
                insn_buf[cnt++] = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
                                              offsetof(struct sk_buff, vlan_all));
                insn_buf[cnt++] = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
                insn_buf[cnt++] = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
                break;
        }

        return cnt;
}

/* Translate a cBPF ancillary load (fp->k == SKF_AD_OFF + SKF_AD_*) into eBPF
 * instructions written through *insnp.
 *
 * Returns true when fp->k names a handled extension; *insnp is then left
 * pointing at the LAST instruction emitted (not one past it), so the caller
 * is expected to step past it. Returns false, with *insnp untouched, when
 * fp->k is not a handled extension.
 */
static bool convert_bpf_extensions(struct sock_filter *fp,
                                   struct bpf_insn **insnp)
{
        struct bpf_insn *insn = *insnp;
        u32 cnt;

        switch (fp->k) {
        case SKF_AD_OFF + SKF_AD_PROTOCOL:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);

                /* A = *(u16 *) (CTX + offsetof(protocol)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, protocol));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        case SKF_AD_OFF + SKF_AD_PKTTYPE:
                cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn);
                /* Advance to the last emitted insn, per the return convention. */
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_IFINDEX:
        case SKF_AD_OFF + SKF_AD_HATYPE:
                BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
                BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);

                /* tmp = skb->dev */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      BPF_REG_TMP, BPF_REG_CTX,
                                      offsetof(struct sk_buff, dev));
                /* if (tmp != 0) goto pc + 1 */
                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_TMP, 0, 1);
                /* tmp == 0: leave the program instead of dereferencing it. */
                *insn++ = BPF_EXIT_INSN();
                if (fp->k == SKF_AD_OFF + SKF_AD_IFINDEX)
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, ifindex));
                else
                        *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_TMP,
                                            offsetof(struct net_device, type));
                break;

        case SKF_AD_OFF + SKF_AD_MARK:
                cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_RXHASH:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);

                /* A = *(u32 *) (CTX + offsetof(hash)) */
                *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
                                    offsetof(struct sk_buff, hash));
                break;

        case SKF_AD_OFF + SKF_AD_QUEUE:
                cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT:
                cnt = convert_skb_access(SKF_AD_VLAN_TAG_PRESENT,
                                         BPF_REG_A, BPF_REG_CTX, insn);
                insn += cnt - 1;
                break;

        case SKF_AD_OFF + SKF_AD_VLAN_TPID:
                BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);

                /* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
                *insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
                                      offsetof(struct sk_buff, vlan_proto));
                /* A = ntohs(A) [emitting a nop or swap16] */
                *insn = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, 16);
                break;

        /* Extensions implemented as helper calls share one argument setup;
         * all three argument registers are loaded regardless of which
         * helper is called.
         */
        case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
        case SKF_AD_OFF + SKF_AD_NLATTR:
        case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
        case SKF_AD_OFF + SKF_AD_CPU:
        case SKF_AD_OFF + SKF_AD_RANDOM:
                /* arg1 = CTX */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
                /* arg2 = A */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_A);
                /* arg3 = X */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_X);
                /* Emit call(arg1=CTX, arg2=A, arg3=X) */
                switch (fp->k) {
                case SKF_AD_OFF + SKF_AD_PAY_OFFSET:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr);
                        break;
                case SKF_AD_OFF + SKF_AD_NLATTR_NEST:
                        *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest);
                        break;
                case SKF_AD_OFF + SKF_AD_CPU:
                        *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id);
                        break;
                case SKF_AD_OFF + SKF_AD_RANDOM:
                        *insn = BPF_EMIT_CALL(bpf_user_rnd_u32);
                        bpf_user_rnd_init_once();
                        break;
                }
                break;

        case SKF_AD_OFF + SKF_AD_ALU_XOR_X:
                /* A ^= X */
                *insn = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_X);
                break;

        default:
                /* This is just a dummy call to avoid letting the compiler
                 * evict __bpf_call_base() as an optimization. Placed here
                 * where no-one bothers.
                 */
                BUG_ON(__bpf_call_base(0, 0, 0, 0, 0) != 0);
                return false;
        }

        *insnp = insn;
        return true;
}

/* Emit the eBPF sequence for one cBPF packet load (BPF_ABS or BPF_IND mode,
 * size B/H/W) through *insnp.
 *
 * The emitted code relies on BPF_REG_D and BPF_REG_H, which the prologue in
 * bpf_convert_filter() fills with skb->data and the head length
 * (skb->len - skb->data_len) once a packet load has been seen.
 *
 * Layout of the emitted code:
 *   - optional inline fast path (direct loads only): bounds-check the access
 *     against the head length and load straight from BPF_REG_D + offset,
 *     then jump over the slow path;
 *   - slow path: call bpf_skb_load_helper_{8,16,32}(CTX, D, H, offset); if
 *     the result in A is negative, zero A and exit the program.
 *
 * Returns true with *insnp pointing at the LAST instruction emitted (the
 * trailing exit, not one past it). Returns false, without updating *insnp,
 * when the size encoded in fp->code is not B, H or W.
 */
static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
{
        const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS);
        int size = bpf_size_to_bytes(BPF_SIZE(fp->code));
        bool endian = BPF_SIZE(fp->code) == BPF_H ||
                      BPF_SIZE(fp->code) == BPF_W;
        bool indirect = BPF_MODE(fp->code) == BPF_IND;
        const int ip_align = NET_IP_ALIGN;
        struct bpf_insn *insn = *insnp;
        int offset = fp->k;

        /* NOTE(review): '%' binds tighter than '+', so the last test below
         * evaluates as offset + (ip_align % size) == 0, not
         * (offset + ip_align) % size == 0. Confirm which was intended before
         * touching it; changing it alters which loads take the fast path.
         */
        if (!indirect &&
            ((unaligned_ok && offset >= 0) ||
             (!unaligned_ok && offset >= 0 &&
              offset + ip_align >= 0 &&
              offset + ip_align % size == 0))) {
                /* The ldx offset field is 16 bit signed; larger offsets need
                 * an explicit pointer addition (two extra insns).
                 */
                bool ldx_off_ok = offset <= S16_MAX;

                /* tmp = headlen - offset */
                *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
                if (offset)
                        *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
                /* if (tmp < size) goto slow path: skip the load (1 or 3
                 * insns), the optional endian conversion and the jump over
                 * the slow path.
                 */
                *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
                                      size, 2 + endian + (!ldx_off_ok * 2));
                if (ldx_off_ok) {
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_D, offset);
                } else {
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
                                              BPF_REG_TMP, 0);
                }
                if (endian)
                        *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
                /* Skip the slow path, which is exactly 8 insns for a direct
                 * load: 4 argument moves, call, jsge, xor, exit.
                 */
                *insn++ = BPF_JMP_A(8);
        }

        /* Slow path: helper(arg1=CTX, arg2=data, arg3=headlen, arg4=offset) */
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D);
        *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H);
        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset);
        } else {
                /* arg4 = X + k */
                *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X);
                if (fp->k)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset);
        }

        switch (BPF_SIZE(fp->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32);
                break;
        default:
                return false;
        }

        /* if (A >= 0) continue after the exit; else A = 0 and exit. */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
        *insn   = BPF_EXIT_INSN();

        *insnp = insn;
        return true;
}

/**
 *        bpf_convert_filter - convert filter program
 *        @prog: the user passed filter program
 *        @len: the length of the user passed filter program
 *        @new_prog: allocated 'struct bpf_prog' or NULL
 *        @new_len: pointer to store length of converted program
 *        @seen_ld_abs: bool whether we've seen ld_abs/ind
 *
 * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn'
 * style extended BPF (eBPF).
 * Conversion workflow:
 *
 * 1) First pass for calculating the new program length:
 *   bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs)
 *
 * 2) 2nd pass to remap in two passes: 1st pass finds new
 *    jump offsets, 2nd pass remapping:
 *   bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs)
 *
 * When @new_prog is NULL nothing is written except *@new_len and
 * *@seen_ld_abs. When @new_prog is set, *@new_len is only read and must
 * equal the number of instructions produced (enforced with BUG_ON()).
 *
 * Return: 0 on success; -EINVAL if @len is out of range, an instruction is
 * unknown, a jump target lies outside the program, a jump offset does not
 * fit into 16 bits, or the output length has not settled after the third
 * pass; -ENOMEM if the address table cannot be allocated.
 */
static int bpf_convert_filter(struct sock_filter *prog, int len,
                              struct bpf_prog *new_prog, int *new_len,
                              bool *seen_ld_abs)
{
        int new_flen = 0, pass = 0, target, i, stack_off;
        struct bpf_insn *new_insn, *first_insn = NULL;
        struct sock_filter *fp;
        int *addrs = NULL;
        u8 bpf_src;

        BUILD_BUG_ON(BPF_MEMWORDS * sizeof(u32) > MAX_BPF_STACK);
        BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG);

        if (len <= 0 || len > BPF_MAXINSNS)
                return -EINVAL;

        if (new_prog) {
                first_insn = new_prog->insnsi;
                /* addrs[i] records the output index at which the translation
                 * of cBPF insn i starts; it is zero-filled until the first
                 * pass has visited insn i.
                 */
                addrs = kcalloc(len, sizeof(*addrs),
                                GFP_KERNEL | __GFP_NOWARN);
                if (!addrs)
                        return -ENOMEM;
        }

do_pass:
        new_insn = first_insn;
        fp = prog;

        /* Classic BPF related prologue emission. */
        if (new_prog) {
                /* Classic BPF expects A and X to be reset first. These need
                 * to be guaranteed to be the first two instructions.
                 */
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);

                /* All programs must keep CTX in callee saved BPF_REG_CTX.
                 * In eBPF case it's done by the compiler, here we need to
                 * do this ourselves. Initial CTX is present in BPF_REG_ARG1.
                 */
                *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
                if (*seen_ld_abs) {
                        /* For packet access in classic BPF, cache skb->data
                         * in callee-saved BPF R8 and skb->len - skb->data_len
                         * (headlen) in BPF R9. Since classic BPF is read-only
                         * on CTX, we only need to cache it once.
                         */
                        *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                                  BPF_REG_D, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, len));
                        *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX,
                                                  offsetof(struct sk_buff, data_len));
                        *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP);
                }
        } else {
                /* Length-only run: new_insn is never dereferenced and serves
                 * purely as a counter. Account for the three unconditional
                 * prologue insns here; the four packet-access prologue insns
                 * are added after the loop once *seen_ld_abs is known.
                 */
                new_insn += 3;
        }

        for (i = 0; i < len; fp++, i++) {
                /* Each cBPF insn is translated into this scratch buffer and
                 * copied out afterwards.
                 */
                struct bpf_insn tmp_insns[32] = { };
                struct bpf_insn *insn = tmp_insns;

                if (addrs)
                        addrs[i] = new_insn - first_insn;

                switch (fp->code) {
                /* All arithmetic insns and skb loads map as-is. */
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_X:
                case BPF_ALU | BPF_MOD | BPF_K:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_ABS | BPF_W:
                case BPF_LD | BPF_ABS | BPF_H:
                case BPF_LD | BPF_ABS | BPF_B:
                case BPF_LD | BPF_IND | BPF_W:
                case BPF_LD | BPF_IND | BPF_H:
                case BPF_LD | BPF_IND | BPF_B:
                        /* Check for overloaded BPF extension and
                         * directly convert it if found, otherwise
                         * just move on with mapping.
                         */
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            BPF_MODE(fp->code) == BPF_ABS &&
                            convert_bpf_extensions(fp, &insn))
                                break;
                        if (BPF_CLASS(fp->code) == BPF_LD &&
                            convert_bpf_ld_abs(fp, &insn)) {
                                *seen_ld_abs = true;
                                break;
                        }

                        if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) ||
                            fp->code == (BPF_ALU | BPF_MOD | BPF_X)) {
                                *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X);
                                /* Error with exception code on div/mod by 0.
                                 * For cBPF programs, this was always return 0.
                                 */
                                *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2);
                                *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
                                *insn++ = BPF_EXIT_INSN();
                        }

                        *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k);
                        break;

                /* Jump transformation cannot use BPF block macros
                 * everywhere as offset calculation and target updates
                 * require a bit more work than the rest, i.e. jump
                 * opcodes map as-is, but offsets need adjustment.
                 */

/* Fill in insn->off for a jump to cBPF insn 'target'. Without an address
 * table (length-only run) the base offset is 0, so only the correction for
 * the position within tmp_insns is applied.
 */
#define BPF_EMIT_JMP                                                        \
        do {                                                                \
                const s32 off_min = S16_MIN, off_max = S16_MAX;                \
                s32 off;                                                \
                                                                        \
                if (target >= len || target < 0)                        \
                        goto err;                                        \
                off = addrs ? addrs[target] - addrs[i] - 1 : 0;                \
                /* Adjust pc relative offset for 2nd or 3rd insn. */        \
                off -= insn - tmp_insns;                                \
                /* Reject anything not fitting into insn->off. */        \
                if (off < off_min || off > off_max)                        \
                        goto err;                                        \
                insn->off = off;                                        \
        } while (0)

                case BPF_JMP | BPF_JA:
                        target = i + fp->k + 1;
                        insn->code = fp->code;
                        BPF_EMIT_JMP;
                        break;

                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                        if (BPF_SRC(fp->code) == BPF_K && (int) fp->k < 0) {
                                /* BPF immediates are signed, zero extend
                                 * immediate into tmp register and use it
                                 * in compare insn.
                                 */
                                *insn++ = BPF_MOV32_IMM(BPF_REG_TMP, fp->k);

                                insn->dst_reg = BPF_REG_A;
                                insn->src_reg = BPF_REG_TMP;
                                bpf_src = BPF_X;
                        } else {
                                insn->dst_reg = BPF_REG_A;
                                insn->imm = fp->k;
                                bpf_src = BPF_SRC(fp->code);
                                insn->src_reg = bpf_src == BPF_X ? BPF_REG_X : 0;
                        }

                        /* Common case where 'jump_false' is next insn. */
                        if (fp->jf == 0) {
                                insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                                target = i + fp->jt + 1;
                                BPF_EMIT_JMP;
                                break;
                        }

                        /* Convert some jumps when 'jump_true' is next insn. */
                        if (fp->jt == 0) {
                                /* Invert the condition and jump to 'jf'.
                                 * JSET is not inverted here and falls
                                 * through to the two-insn form.
                                 */
                                switch (BPF_OP(fp->code)) {
                                case BPF_JEQ:
                                        insn->code = BPF_JMP | BPF_JNE | bpf_src;
                                        break;
                                case BPF_JGT:
                                        insn->code = BPF_JMP | BPF_JLE | bpf_src;
                                        break;
                                case BPF_JGE:
                                        insn->code = BPF_JMP | BPF_JLT | bpf_src;
                                        break;
                                default:
                                        goto jmp_rest;
                                }

                                target = i + fp->jf + 1;
                                BPF_EMIT_JMP;
                                break;
                        }
jmp_rest:
                        /* Other jumps are mapped into two insns: Jxx and JA. */
                        target = i + fp->jt + 1;
                        insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src;
                        BPF_EMIT_JMP;
                        insn++;

                        insn->code = BPF_JMP | BPF_JA;
                        target = i + fp->jf + 1;
                        BPF_EMIT_JMP;
                        break;

                /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
                case BPF_LDX | BPF_MSH | BPF_B: {
                        struct sock_filter tmp = {
                                .code        = BPF_LD | BPF_ABS | BPF_B,
                                .k        = fp->k,
                        };

                        *seen_ld_abs = true;

                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = BPF_R0 = *(u8 *) (skb->data + K) */
                        convert_bpf_ld_abs(&tmp, &insn);
                        /* convert_bpf_ld_abs() leaves insn on its last
                         * emitted insn; step past it.
                         */
                        insn++;
                        /* A &= 0xf */
                        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf);
                        /* A <<= 2 */
                        *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2);
                        /* tmp = X */
                        *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X);
                        /* X = A */
                        *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        /* A = tmp */
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
                        break;
                }
                /* RET_K is remapped into 2 insns. RET_A case doesn't need an
                 * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
                 */
                case BPF_RET | BPF_A:
                case BPF_RET | BPF_K:
                        if (BPF_RVAL(fp->code) == BPF_K)
                                *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
                                                        0, fp->k);
                        *insn = BPF_EXIT_INSN();
                        break;

                /* Store to stack. */
                case BPF_ST:
                case BPF_STX:
                        /* cBPF mem[k] lives at FP - (k * 4 + 4). */
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) ==
                                            BPF_ST ? BPF_REG_A : BPF_REG_X,
                                            -stack_off);
                        /* check_load_and_stores() verifies that classic BPF can
                         * load from stack only after write, so tracking
                         * stack_depth for ST|STX insns is enough
                         */
                        if (new_prog && new_prog->aux->stack_depth < stack_off)
                                new_prog->aux->stack_depth = stack_off;
                        break;

                /* Load from stack. */
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        stack_off = fp->k * 4  + 4;
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD  ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_FP,
                                            -stack_off);
                        break;

                /* A = K or X = K */
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                        *insn = BPF_MOV32_IMM(BPF_CLASS(fp->code) == BPF_LD ?
                                              BPF_REG_A : BPF_REG_X, fp->k);
                        break;

                /* X = A */
                case BPF_MISC | BPF_TAX:
                        *insn = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A);
                        break;

                /* A = X */
                case BPF_MISC | BPF_TXA:
                        *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_X);
                        break;

                /* A = skb->len or X = skb->len */
                case BPF_LD | BPF_W | BPF_LEN:
                case BPF_LDX | BPF_W | BPF_LEN:
                        *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ?
                                            BPF_REG_A : BPF_REG_X, BPF_REG_CTX,
                                            offsetof(struct sk_buff, len));
                        break;

                /* Access seccomp_data fields. */
                case BPF_LDX | BPF_ABS | BPF_W:
                        /* A = *(u32 *) (ctx + K) */
                        *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, fp->k);
                        break;

                /* Unknown instruction. */
                default:
                        goto err;
                }

                /* Every case above leaves insn on the last insn it emitted;
                 * step past it so insn - tmp_insns is the emitted count.
                 */
                insn++;
                if (new_prog)
                        memcpy(new_insn, tmp_insns,
                               sizeof(*insn) * (insn - tmp_insns));
                new_insn += insn - tmp_insns;
        }

        if (!new_prog) {
                /* Only calculating new length. */
                *new_len = new_insn - first_insn;
                if (*seen_ld_abs)
                        *new_len += 4; /* Prologue bits. */
                return 0;
        }

        /* Jump offsets computed during a pass use the addrs[] values known
         * at that time, so repeat until the total length stops changing.
         * Give up if it is still changing after the third pass.
         */
        pass++;
        if (new_flen != new_insn - first_insn) {
                new_flen = new_insn - first_insn;
                if (pass > 2)
                        goto err;
                goto do_pass;
        }

        kfree(addrs);
        BUG_ON(*new_len != new_flen);
        return 0;
err:
        kfree(addrs);
        return -EINVAL;
}

/* Security:
 *
 * The mem[] scratch array is not cleared for each packet going through
 * __bpf_prog_run(), so a filter loaded by the user must never read a cell
 * it has not written beforehand. All branches are followed so that a
 * malicious user cannot sidestep the check by jumping around a store.
 *
 * One bit per mem[] cell tracks which cells are known to be written. Each
 * instruction has an entry mask that is narrowed by every jump targeting
 * it; the running state is intersected with that mask on arrival.
 *
 * Returns 0 if every load is preceded by a store, -EINVAL on the first
 * offending load, or -ENOMEM if the mask table cannot be allocated.
 */
static int check_load_and_stores(const struct sock_filter *filter, int flen)
{
        u16 *entry_masks;
        u16 valid = 0; /* One bit per cell, 16 cells */
        int ret = 0;
        int pc;

        BUILD_BUG_ON(BPF_MEMWORDS > 16);

        entry_masks = kmalloc_array(flen, sizeof(*entry_masks), GFP_KERNEL);
        if (!entry_masks)
                return -ENOMEM;

        /* Start with all bits set for every instruction. */
        memset(entry_masks, 0xff, flen * sizeof(*entry_masks));

        for (pc = 0; pc < flen; pc++) {
                const struct sock_filter *ins = &filter[pc];

                valid &= entry_masks[pc];

                switch (ins->code) {
                case BPF_ST:
                case BPF_STX:
                        valid |= (1 << ins->k);
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                        if (valid & (1 << ins->k))
                                break;
                        /* Read of a cell that was never written. */
                        ret = -EINVAL;
                        goto out;
                case BPF_JMP | BPF_JA:
                        /* Propagate the current state to the jump target. */
                        entry_masks[pc + 1 + ins->k] &= valid;
                        valid = ~0;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* Propagate the current state to both targets. */
                        entry_masks[pc + 1 + ins->jt] &= valid;
                        entry_masks[pc + 1 + ins->jf] &= valid;
                        valid = ~0;
                        break;
                }
        }
out:
        kfree(entry_masks);
        return ret;
}

/* Tell whether @code_to_probe is one of the classic BPF opcodes we accept. */
static bool chk_code_allowed(u16 code_to_probe)
{
        /* Lookup table indexed by opcode; unlisted slots default to false. */
        static const bool allowed[] = {
                /* 32 bit ALU operations */
                [BPF_ALU | BPF_ADD | BPF_K] = true,
                [BPF_ALU | BPF_ADD | BPF_X] = true,
                [BPF_ALU | BPF_SUB | BPF_K] = true,
                [BPF_ALU | BPF_SUB | BPF_X] = true,
                [BPF_ALU | BPF_MUL | BPF_K] = true,
                [BPF_ALU | BPF_MUL | BPF_X] = true,
                [BPF_ALU | BPF_DIV | BPF_K] = true,
                [BPF_ALU | BPF_DIV | BPF_X] = true,
                [BPF_ALU | BPF_MOD | BPF_K] = true,
                [BPF_ALU | BPF_MOD | BPF_X] = true,
                [BPF_ALU | BPF_AND | BPF_K] = true,
                [BPF_ALU | BPF_AND | BPF_X] = true,
                [BPF_ALU | BPF_OR | BPF_K] = true,
                [BPF_ALU | BPF_OR | BPF_X] = true,
                [BPF_ALU | BPF_XOR | BPF_K] = true,
                [BPF_ALU | BPF_XOR | BPF_X] = true,
                [BPF_ALU | BPF_LSH | BPF_K] = true,
                [BPF_ALU | BPF_LSH | BPF_X] = true,
                [BPF_ALU | BPF_RSH | BPF_K] = true,
                [BPF_ALU | BPF_RSH | BPF_X] = true,
                [BPF_ALU | BPF_NEG] = true,
                /* Load instructions */
                [BPF_LD | BPF_W | BPF_ABS] = true,
                [BPF_LD | BPF_H | BPF_ABS] = true,
                [BPF_LD | BPF_B | BPF_ABS] = true,
                [BPF_LD | BPF_W | BPF_LEN] = true,
                [BPF_LD | BPF_W | BPF_IND] = true,
                [BPF_LD | BPF_H | BPF_IND] = true,
                [BPF_LD | BPF_B | BPF_IND] = true,
                [BPF_LD | BPF_IMM] = true,
                [BPF_LD | BPF_MEM] = true,
                [BPF_LDX | BPF_W | BPF_LEN] = true,
                [BPF_LDX | BPF_B | BPF_MSH] = true,
                [BPF_LDX | BPF_IMM] = true,
                [BPF_LDX | BPF_MEM] = true,
                /* Store instructions */
                [BPF_ST] = true,
                [BPF_STX] = true,
                /* Misc instructions */
                [BPF_MISC | BPF_TAX] = true,
                [BPF_MISC | BPF_TXA] = true,
                /* Return instructions */
                [BPF_RET | BPF_K] = true,
                [BPF_RET | BPF_A] = true,
                /* Jump instructions */
                [BPF_JMP | BPF_JA] = true,
                [BPF_JMP | BPF_JEQ | BPF_K] = true,
                [BPF_JMP | BPF_JEQ | BPF_X] = true,
                [BPF_JMP | BPF_JGE | BPF_K] = true,
                [BPF_JMP | BPF_JGE | BPF_X] = true,
                [BPF_JMP | BPF_JGT | BPF_K] = true,
                [BPF_JMP | BPF_JGT | BPF_X] = true,
                [BPF_JMP | BPF_JSET | BPF_K] = true,
                [BPF_JMP | BPF_JSET | BPF_X] = true,
        };

        /* Opcodes past the end of the table are rejected outright. */
        return code_to_probe < ARRAY_SIZE(allowed) && allowed[code_to_probe];
}

/* A usable filter has a buffer and between 1 and BPF_MAXINSNS instructions. */
static bool bpf_check_basics_ok(const struct sock_filter *filter,
                                unsigned int flen)
{
        return filter != NULL && flen != 0 && flen <= BPF_MAXINSNS;
}

/**
 *        bpf_check_classic - validate a classic BPF program
 *        @filter: instructions to validate
 *        @flen: number of instructions in @filter
 *
 * Scan the user supplied program one instruction at a time and refuse
 * anything that could misbehave at run time: opcodes outside the
 * allowed set, constant division or modulo by zero, constant shifts
 * of 32 or more, scratch memory indexes at or beyond BPF_MEMWORDS,
 * jumps landing outside the program and unknown ancillary loads.
 * The program must also finish with a RET instruction.
 *
 * Jump offsets are unsigned, so every jump goes forward.
 *
 * Returns 0 when the program is acceptable, -EINVAL when it is not,
 * or whatever error check_load_and_stores() reports.
 */
static int bpf_check_classic(const struct sock_filter *filter,
                             unsigned int flen)
{
        bool anc_found;
        int pc;

        for (pc = 0; pc < flen; pc++) {
                const struct sock_filter *insn = &filter[pc];

                /* Refuse opcodes we do not know how to run. */
                if (!chk_code_allowed(insn->code))
                        return -EINVAL;

                /* Per-opcode operand checks */
                switch (insn->code) {
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_MOD | BPF_K:
                        /* Constant divisor must not be zero */
                        if (insn->k == 0)
                                return -EINVAL;
                        break;
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_K:
                        /* Constant shift must stay below the word size */
                        if (insn->k >= 32)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                        /* Scratch cell index must be in range */
                        if (insn->k >= BPF_MEMWORDS)
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JA:
                        /* Note, the large insn->k might cause loops.
                         * Compare this with conditional jumps below,
                         * where offsets are limited. --ANK (981016)
                         */
                        if (insn->k >= (unsigned int)(flen - pc - 1))
                                return -EINVAL;
                        break;
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        /* Taken and not-taken targets must both be inside */
                        if (pc + insn->jt + 1 >= flen ||
                            pc + insn->jf + 1 >= flen)
                                return -EINVAL;
                        break;
                case BPF_LD | BPF_W | BPF_ABS:
                case BPF_LD | BPF_H | BPF_ABS:
                case BPF_LD | BPF_B | BPF_ABS:
                        anc_found = bpf_anc_helper(insn) & BPF_ANC;
                        /* Ancillary operation unknown or unsupported */
                        if (!anc_found && insn->k >= SKF_AD_OFF)
                                return -EINVAL;
                        break;
                }
        }

        /* The program has to end in one of the two RET forms. */
        if (filter[flen - 1].code != (BPF_RET | BPF_K) &&
            filter[flen - 1].code != (BPF_RET | BPF_A))
                return -EINVAL;

        return check_load_and_stores(filter, flen);
}

/*
 * Keep a private copy of the classic program currently held in fp->insns,
 * reachable afterwards through fp->orig_prog.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails. On failure
 * fp->orig_prog is left NULL, so no pointer to freed memory remains.
 */
static int bpf_prog_store_orig_filter(struct bpf_prog *fp,
                                      const struct sock_fprog *fprog)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct sock_fprog_kern *fkprog;

        fp->orig_prog = kmalloc(sizeof(*fkprog), GFP_KERNEL);
        if (!fp->orig_prog)
                return -ENOMEM;

        fkprog = fp->orig_prog;
        fkprog->len = fprog->len;

        fkprog->filter = kmemdup(fp->insns, fsize,
                                 GFP_KERNEL | __GFP_NOWARN);
        if (!fkprog->filter) {
                kfree(fp->orig_prog);
                /* Do not leave a dangling pointer behind for
                 * bpf_release_orig_filter() to trip over.
                 */
                fp->orig_prog = NULL;
                return -ENOMEM;
        }

        return 0;
}

/* Free the saved copy of the original program, if one was kept. */
static void bpf_release_orig_filter(struct bpf_prog *fp)
{
        struct sock_fprog_kern *saved = fp->orig_prog;

        if (!saved)
                return;

        kfree(saved->filter);
        kfree(saved);
}

/*
 * Drop @prog. Programs of type BPF_PROG_TYPE_SOCKET_FILTER are handed to
 * bpf_prog_put(); any other program has its saved original filter released
 * and is then freed with bpf_prog_free().
 */
static void __bpf_prog_release(struct bpf_prog *prog)
{
        if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
                bpf_release_orig_filter(prog);
                bpf_prog_free(prog);
                return;
        }

        bpf_prog_put(prog);
}

/* Release the program behind @fp, then free the sk_filter wrapper itself. */
static void __sk_filter_release(struct sk_filter *fp)
{
        struct bpf_prog *prog = fp->prog;

        __bpf_prog_release(prog);
        kfree(fp);
}

/**
 *        sk_filter_release_rcu - RCU callback that frees a socket filter
 *        @rcu: the rcu_head embedded in the sk_filter to free
 */
static void sk_filter_release_rcu(struct rcu_head *rcu)
{
        /* Recover the enclosing sk_filter from its rcu member. */
        __sk_filter_release(container_of(rcu, struct sk_filter, rcu));
}

/**
 *        sk_filter_release - drop one reference on a socket filter
 *        @fp: filter whose reference is dropped
 *
 *        When the last reference goes away the filter is queued for
 *        release through call_rcu().
 */
static void sk_filter_release(struct sk_filter *fp)
{
        /* Other holders remain: nothing to free yet. */
        if (!refcount_dec_and_test(&fp->refcnt))
                return;

        call_rcu(&fp->rcu, sk_filter_release_rcu);
}

/*
 * Give back the option memory that @fp's program was charged against @sk,
 * then drop the reference on @fp.
 */
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
{
        u32 charged = bpf_prog_size(fp->prog->len);

        atomic_sub(charged, &sk->sk_omem_alloc);

        sk_filter_release(fp);
}

/* Charge the size of @fp's program to the socket's option memory if it
 * fits under sysctl_optmem_max. Returns true when the charge was made.
 */
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        u32 filter_size = bpf_prog_size(fp->prog->len);

        /* same check as in sock_kmalloc() */
        if (filter_size > optmem_max)
                return false;
        if (atomic_read(&sk->sk_omem_alloc) + filter_size >= optmem_max)
                return false;

        atomic_add(filter_size, &sk->sk_omem_alloc);
        return true;
}

/*
 * Take a reference on @fp and charge it to @sk. Returns false, holding no
 * reference, if the filter is already on its way out or the charge fails.
 */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
        if (!refcount_inc_not_zero(&fp->refcnt))
                return false;

        if (__sk_filter_charge(sk, fp))
                return true;

        /* Charging failed: give back the reference taken above. */
        sk_filter_release(fp);
        return false;
}

/*
 * Rewrite the classic program held in @fp into the bpf_insn representation.
 *
 * The classic instructions are duplicated, bpf_convert_filter() is run once
 * to learn the new length and once more to emit the instructions into the
 * reallocated program, and finally bpf_prog_select_runtime() is applied.
 *
 * Returns the (possibly reallocated) program on success. On any failure the
 * program is released with __bpf_prog_release() and an ERR_PTR() is returned.
 */
static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
{
        struct sock_filter *old_prog;
        struct bpf_prog *old_fp;
        int err, new_len, old_len = fp->len;
        bool seen_ld_abs = false;

        /* We are free to overwrite insns et al right here as it won't be used at
         * this point in time anymore internally after the migration to the eBPF
         * instruction representation.
         */
        BUILD_BUG_ON(sizeof(struct sock_filter) !=
                     sizeof(struct bpf_insn));

        /* Conversion cannot happen on overlapping memory areas,
         * so we need to keep the user BPF around until the 2nd
         * pass. At this time, the user BPF is stored in fp->insns.
         */
        old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
                                 GFP_KERNEL | __GFP_NOWARN);
        if (!old_prog) {
                err = -ENOMEM;
                goto out_err;
        }

        /* 1st pass: calculate the new program length. */
        err = bpf_convert_filter(old_prog, old_len, NULL, &new_len,
                                 &seen_ld_abs);
        if (err)
                goto out_err_free;

        /* Expand fp for appending the new filter representation. */
        old_fp = fp;
        fp = bpf_prog_realloc(old_fp, bpf_prog_size(new_len), 0);
        if (!fp) {
                /* The old_fp is still around in case we couldn't
                 * allocate new memory, so uncharge on that one.
                 */
                fp = old_fp;
                err = -ENOMEM;
                goto out_err_free;
        }

        fp->len = new_len;

        /* 2nd pass: remap sock_filter insns into bpf_insn insns. */
        err = bpf_convert_filter(old_prog, old_len, fp, &new_len,
                                 &seen_ld_abs);
        if (err)
                /* 2nd bpf_convert_filter() can fail only if it fails
                 * to allocate memory, remapping must succeed. Note,
                 * that at this time old_fp has already been released
                 * by krealloc().
                 */
                goto out_err_free;

        /* The returned pointer replaces fp; err reports failure. */
        fp = bpf_prog_select_runtime(fp, &err);
        if (err)
                goto out_err_free;

        /* Success: the duplicated classic insns are no longer needed. */
        kfree(old_prog);
        return fp;

out_err_free:
        kfree(old_prog);
out_err:
        /* Whichever program fp currently names is released here. */
        __bpf_prog_release(fp);
        return ERR_PTR(err);
}

/*
 * Validate the classic program in @fp, run the optional @trans hook on it,
 * then try to JIT it and fall back to bpf_migrate_filter() when that did
 * not happen. Returns the resulting program, or an ERR_PTR() after @fp has
 * been released.
 */
static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
                                           bpf_aux_classic_check_t trans)
{
        int err;

        fp->bpf_func = NULL;
        fp->jited = 0;

        err = bpf_check_classic(fp->insns, fp->len);
        if (err)
                goto release;

        /* Some users, f.e. seccomp, need extra checks and
         * transformations on top of the classic verification.
         */
        if (trans) {
                err = trans(fp->insns, fp->len);
                if (err)
                        goto release;
        }

        /* Give the JIT a chance to compile the filter. */
        bpf_jit_compile(fp);

        /* Not JITed: translate to eBPF for the optimized interpreter. */
        if (!fp->jited)
                fp = bpf_migrate_filter(fp);

        return fp;

release:
        __bpf_prog_release(fp);
        return ERR_PTR(err);
}

/**
 *        bpf_prog_create - build a filter that is not attached to a socket
 *        @pfp: where to store the resulting program
 *        @fprog: classic program to build it from
 *
 * The instructions in @fprog are copied into a freshly allocated
 * program and handed to bpf_prepare_filter() for checking and
 * conversion. Returns 0 and sets *@pfp on success; returns -EINVAL
 * for a missing or wrongly sized program, -ENOMEM when allocation
 * fails, or the error reported by bpf_prepare_filter().
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;

        /* Reject a missing filter or an out-of-range length. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        memcpy(prog->insns, fprog->filter, fsize);
        prog->len = fprog->len;

        /* Nothing reads an unattached filter back through
         * sk_get_filter(), so no copy of the original is kept.
         */
        prog->orig_prog = NULL;

        /* On failure bpf_prepare_filter() has freed prog already. */
        prog = bpf_prepare_filter(prog, NULL);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;
}
EXPORT_SYMBOL_GPL(bpf_prog_create);

/**
 *        bpf_prog_create_from_user - build an unattached filter from a user buffer
 *        @pfp: where to store the resulting program
 *        @fprog: classic program whose instructions live in user space
 *        @trans: extra check/transformation run after the classic verifier
 *        @save_orig: keep a copy of the classic program
 *
 * Same job as bpf_prog_create(), except that the instructions are
 * fetched with copy_from_user() and that a bpf_aux_classic_check_t
 * handler can be supplied.
 */
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;
        int err;

        /* Reject a missing filter or an out-of-range length. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return -EINVAL;

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return -ENOMEM;

        err = -EFAULT;
        if (copy_from_user(prog->insns, fprog->filter, fsize))
                goto free_prog;

        prog->len = fprog->len;
        prog->orig_prog = NULL;

        err = -ENOMEM;
        if (save_orig && bpf_prog_store_orig_filter(prog, fprog))
                goto free_prog;

        /* On failure bpf_prepare_filter() has freed prog already. */
        prog = bpf_prepare_filter(prog, trans);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        *pfp = prog;
        return 0;

free_prog:
        __bpf_prog_free(prog);
        return err;
}
EXPORT_SYMBOL_GPL(bpf_prog_create_from_user);

/* Exported wrapper: release @fp through __bpf_prog_release(). */
void bpf_prog_destroy(struct bpf_prog *fp)
{
        __bpf_prog_release(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);

/*
 * Wrap @prog in a new sk_filter, charge it to @sk and publish it as the
 * socket's filter; a previously installed filter is uncharged afterwards.
 * Returns 0 on success, -ENOMEM if allocation or charging fails (in which
 * case @prog is left untouched for the caller to dispose of).
 */
static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
        struct sk_filter *new_filter, *prev_filter;

        new_filter = kmalloc(sizeof(*new_filter), GFP_KERNEL);
        if (!new_filter)
                return -ENOMEM;

        new_filter->prog = prog;

        if (!__sk_filter_charge(sk, new_filter)) {
                kfree(new_filter);
                return -ENOMEM;
        }
        refcount_set(&new_filter->refcnt, 1);

        /* Swap the socket's filter pointer, remembering the old one. */
        prev_filter = rcu_dereference_protected(sk->sk_filter,
                                                lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_filter, new_filter);

        if (prev_filter)
                sk_filter_uncharge(sk, prev_filter);

        return 0;
}

/*
 * Build a prepared program from the user supplied classic filter @fprog,
 * keeping a copy of the original instructions. Returns the program or an
 * ERR_PTR(): -EPERM when the socket's filter is locked, -EINVAL for a bad
 * filter, -ENOMEM, -EFAULT, or the error from bpf_prepare_filter().
 */
static struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
        unsigned int fsize = bpf_classic_proglen(fprog);
        struct bpf_prog *prog;
        int err;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return ERR_PTR(-EPERM);

        /* Reject a missing filter or an out-of-range length. */
        if (!bpf_check_basics_ok(fprog->filter, fprog->len))
                return ERR_PTR(-EINVAL);

        prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0);
        if (!prog)
                return ERR_PTR(-ENOMEM);

        err = -EFAULT;
        if (copy_from_user(prog->insns, fprog->filter, fsize))
                goto free_prog;

        prog->len = fprog->len;

        err = -ENOMEM;
        if (bpf_prog_store_orig_filter(prog, fprog))
                goto free_prog;

        /* On failure bpf_prepare_filter() has freed prog already. */
        return bpf_prepare_filter(prog, NULL);

free_prog:
        __bpf_prog_free(prog);
        return ERR_PTR(err);
}

/**
 *        sk_attach_filter - install a classic filter on a socket
 *        @fprog: the filter program supplied by the user
 *        @sk: the socket that receives the filter
 *
 * The program is built and sanity checked by __get_filter() and then
 * attached with __sk_attach_prog(). Returns 0 on success or a
 * negative errno code if building or attaching the filter fails.
 */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog = __get_filter(fprog, sk);
        int err;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        err = __sk_attach_prog(prog, sk);
        if (err >= 0)
                return 0;

        /* Attaching failed, so the program is still ours to release. */
        __bpf_prog_release(prog);
        return err;
}
EXPORT_SYMBOL_GPL(sk_attach_filter);

/*
 * Build a program from the classic filter @fprog and attach it with
 * reuseport_attach_prog(). Programs larger than sysctl_optmem_max are
 * refused with -ENOMEM. The program is released on any failure.
 */
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
        struct bpf_prog *prog = __get_filter(fprog, sk);
        int err, optmem_max;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        if (bpf_prog_size(prog->len) <= optmem_max) {
                err = reuseport_attach_prog(sk, prog);
                if (!err)
                        return 0;
        } else {
                err = -ENOMEM;
        }

        __bpf_prog_release(prog);
        return err;
}

/*
 * Look up the BPF_PROG_TYPE_SOCKET_FILTER program behind descriptor @ufd,
 * unless the socket's filter is locked (ERR_PTR(-EPERM) in that case).
 */
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog = ERR_PTR(-EPERM);

        if (!sock_flag(sk, SOCK_FILTER_LOCKED))
                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);

        return prog;
}

/*
 * Attach the program referenced by descriptor @ufd to @sk. Returns 0 on
 * success or a negative errno; the program reference is dropped when the
 * attach step fails.
 */
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog = __get_bpf(ufd, sk);
        int err;

        if (IS_ERR(prog))
                return PTR_ERR(prog);

        err = __sk_attach_prog(prog, sk);
        if (err >= 0)
                return 0;

        bpf_prog_put(prog);
        return err;
}

/*
 * Attach the program behind descriptor @ufd as the socket's reuseport
 * program. A BPF_PROG_TYPE_SOCKET_FILTER program is tried first; if that
 * lookup yields -EINVAL a BPF_PROG_TYPE_SK_REUSEPORT program is tried.
 * Returns 0 on success or a negative errno, dropping the program
 * reference on failure.
 */
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
        struct bpf_prog *prog;
        int err, optmem_max;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (PTR_ERR(prog) == -EINVAL)
                prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT) {
                /* Like other non BPF_PROG_TYPE_SOCKET_FILTER
                 * bpf prog (e.g. sockmap).  It depends on the
                 * limitation imposed by bpf_prog_load().
                 * Hence, sysctl_optmem_max is not checked.
                 */
                bool type_ok = sk->sk_type == SOCK_STREAM ||
                               sk->sk_type == SOCK_DGRAM;
                bool proto_ok = sk->sk_protocol == IPPROTO_UDP ||
                                sk->sk_protocol == IPPROTO_TCP;
                bool family_ok = sk->sk_family == AF_INET ||
                                 sk->sk_family == AF_INET6;

                if (!type_ok || !proto_ok || !family_ok) {
                        err = -ENOTSUPP;
                        goto put_prog;
                }
        } else {
                /* BPF_PROG_TYPE_SOCKET_FILTER */
                optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
                if (bpf_prog_size(prog->len) > optmem_max) {
                        err = -ENOMEM;
                        goto put_prog;
                }
        }

        err = reuseport_attach_prog(sk, prog);
        if (!err)
                return 0;

put_prog:
        bpf_prog_put(prog);
        return err;
}

/*
 * Dispose of a reuseport program; NULL is ignored. Programs of type
 * BPF_PROG_TYPE_SK_REUSEPORT go through bpf_prog_put(), all others through
 * bpf_prog_destroy().
 */
void sk_reuseport_prog_free(struct bpf_prog *prog)
{
        if (!prog)
                return;

        if (prog->type != BPF_PROG_TYPE_SK_REUSEPORT) {
                bpf_prog_destroy(prog);
                return;
        }

        bpf_prog_put(prog);
}

/*
 * Ask skb_ensure_writable() to make the first @write_len bytes of @skb
 * writable and return its result. With CONFIG_DEBUG_NET enabled, lengths
 * above INT_MAX are rejected up front with -EINVAL.
 */
static inline int __bpf_try_make_writable(struct sk_buff *skb,
                                          unsigned int write_len)
{
#ifdef CONFIG_DEBUG_NET
        /* Avoid a splat in pskb_may_pull_reason() */
        if (write_len > INT_MAX)
                return -EINVAL;
#endif
        return skb_ensure_writable(skb, write_len);
}

/*
 * Same as __bpf_try_make_writable(), but additionally recomputes the skb
 * data pointers afterwards.
 */
static inline int bpf_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        int ret;

        ret = __bpf_try_make_writable(skb, write_len);

        /* Refresh the data pointers whether or not the call succeeded. */
        bpf_compute_data_pointers(skb);

        return ret;
}

/* Make skb_headlen(skb) bytes of @skb writable via bpf_try_make_writable(). */
static int bpf_try_make_head_writable(struct sk_buff *skb)
{
        return bpf_try_make_writable(skb, skb_headlen(skb));
}

/* At tc ingress, apply skb_postpush_rcsum() over the MAC header bytes. */
static inline void bpf_push_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/* At tc ingress, apply skb_postpull_rcsum() over the MAC header bytes. */
static inline void bpf_pull_mac_rcsum(struct sk_buff *skb)
{
        if (!skb_at_tc_ingress(skb))
                return;

        skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len);
}

/*
 * Helper: copy @len bytes from @from into the packet at @offset.
 * Returns 0 on success, -EINVAL for unknown flag bits, and -EFAULT when
 * @offset exceeds INT_MAX or the range cannot be made writable.
 */
BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len, u64, flags)
{
        void *dst;

        /* Only the checksum and hash flags are understood. */
        if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
                return -EINVAL;
        if (unlikely(offset > INT_MAX))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;

        dst = skb->data + offset;

        if (!(flags & BPF_F_RECOMPUTE_CSUM)) {
                memcpy(dst, from, len);
        } else {
                /* Take the old bytes out of the checksum, write the
                 * new ones, then fold them back in.
                 */
                __skb_postpull_rcsum(skb, dst, len, offset);
                memcpy(dst, from, len);
                __skb_postpush_rcsum(skb, dst, len, offset);
        }

        if (flags & BPF_F_INVALIDATE_HASH)
                skb_clear_hash(skb);

        return 0;
}

/* Descriptor for the bpf_skb_store_bytes helper: integer return; the five
 * argument slots follow the BPF_CALL_5 parameter order (skb context,
 * offset, read-only source buffer, its constant size, flags).
 */
static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
        .func                = bpf_skb_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

/* Plain C entry point that forwards its arguments unchanged to
 * ____bpf_skb_store_bytes().
 * NOTE(review): ____bpf_skb_store_bytes is presumably the typed body
 * emitted by the BPF_CALL_5() definition above - confirm.
 */
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                          u32 len, u64 flags)
{
        return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
}

/*
 * Helper: copy @len bytes found at @offset in the packet into @to.
 * Returns 0 on success. On failure @to is zero-filled and -EFAULT is
 * returned.
 */
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
           void *, to, u32, len)
{
        void *src;

        if (unlikely(offset > INT_MAX))
                src = NULL;
        else
                src = skb_header_pointer(skb, offset, len, to);

        if (unlikely(!src)) {
                /* Never hand back uninitialized bytes. */
                memset(to, 0, len);
                return -EFAULT;
        }

        /* skb_header_pointer() may already have filled @to itself. */
        if (src != to)
                memcpy(to, src, len);

        return 0;
}

/* Descriptor for the bpf_skb_load_bytes helper: integer return; the four
 * argument slots follow the BPF_CALL_4 parameter order (skb context,
 * offset, uninitialized destination buffer, its constant size).
 */
static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
        .func                = bpf_skb_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Plain C entry point that forwards its arguments unchanged to
 * ____bpf_skb_load_bytes().
 * NOTE(review): ____bpf_skb_load_bytes is presumably the typed body
 * emitted by the BPF_CALL_4() definition above - confirm.
 */
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
{
        return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/*
 * Helper: copy @len bytes found at @offset in ctx->skb into @to.
 * Offsets above 0xffff and a context without an skb are rejected.
 * Returns 0 on success; on failure @to is zero-filled and -EFAULT is
 * returned.
 */
BPF_CALL_4(bpf_flow_dissector_load_bytes,
           const struct bpf_flow_dissector *, ctx, u32, offset,
           void *, to, u32, len)
{
        void *src;

        if (unlikely(offset > 0xffff) || unlikely(!ctx->skb))
                goto err_clear;

        src = skb_header_pointer(ctx->skb, offset, len, to);
        if (unlikely(!src))
                goto err_clear;

        /* skb_header_pointer() may already have filled @to itself. */
        if (src != to)
                memcpy(to, src, len);

        return 0;

err_clear:
        memset(to, 0, len);
        return -EFAULT;
}

/* Descriptor for the bpf_flow_dissector_load_bytes helper: integer return;
 * the four argument slots follow the BPF_CALL_4 parameter order (flow
 * dissector context, offset, uninitialized destination buffer, its
 * constant size).
 */
static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
        .func                = bpf_flow_dissector_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/*
 * Helper: copy @len bytes into @to, starting @offset bytes past the header
 * selected by @start_header (MAC or network header). The copy must end at
 * or before the skb tail pointer. Returns 0 on success; on failure @to is
 * zero-filled and -EFAULT is returned.
 */
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
           u32, offset, void *, to, u32, len, u32, start_header)
{
        u8 *end = skb_tail_pointer(skb);
        u8 *base, *src;

        if (unlikely(offset > 0xffff))
                goto err_clear;

        if (start_header == BPF_HDR_START_MAC) {
                /* A MAC-relative read needs the MAC header to be set. */
                if (unlikely(!skb_mac_header_was_set(skb)))
                        goto err_clear;
                base = skb_mac_header(skb);
        } else if (start_header == BPF_HDR_START_NET) {
                base = skb_network_header(skb);
        } else {
                goto err_clear;
        }

        src = base + offset;

        if (likely(src + len <= end)) {
                memcpy(to, src, len);
                return 0;
        }

err_clear:
        memset(to, 0, len);
        return -EFAULT;
}

/* Descriptor for the bpf_skb_load_bytes_relative helper: integer return;
 * the five argument slots follow the BPF_CALL_5 parameter order (skb
 * context, offset, uninitialized destination buffer, its constant size,
 * start header selector).
 */
static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = {
        .func                = bpf_skb_load_bytes_relative,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};

BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        /* Two ways to use this: when a direct read/write bounds test
         * fails at run time, pull in more data and retry, since the
         * earlier checks are implicitly invalidated here. Or, when the
         * amount that must be readable/writable is known up front, do
         * it once at the start of the program for the direct access
         * case. Either way this lifts the limitation of only the
         * current headroom being accessible.
         */

        /* A zero length means "the whole linear head". */
        if (!len)
                len = skb_headlen(skb);

        return bpf_try_make_writable(skb, len);
}

/* Descriptor for the bpf_skb_pull_data helper: integer return; the two
 * argument slots follow the BPF_CALL_2 parameter order (skb context,
 * length).
 */
static const struct bpf_func_proto bpf_skb_pull_data_proto = {
        .func                = bpf_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Helper: hand back @sk when sk_fullsock() accepts it, NULL otherwise. */
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Descriptor for the bpf_sk_fullsock helper: takes a sock_common pointer
 * and returns a socket pointer that may be NULL.
 */
static const struct bpf_func_proto bpf_sk_fullsock_proto = {
        .func                = bpf_sk_fullsock,
        .gpl_only        = false,
        .ret_type        = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type        = ARG_PTR_TO_SOCK_COMMON,
};

/* sk_skb flavour of the writable check: forwards to
 * __bpf_try_make_writable() and, unlike bpf_try_make_writable(), does not
 * call bpf_compute_data_pointers() afterwards.
 */
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
{
        return __bpf_try_make_writable(skb, write_len);
}

BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
{
        /* Two ways to use this: when a direct read/write bounds test
         * fails at run time, pull in more data and retry, since the
         * earlier checks are implicitly invalidated here. Or, when the
         * amount that must be readable/writable is known up front, do
         * it once at the start of the program for the direct access
         * case. Either way this lifts the limitation of only the
         * current headroom being accessible.
         */

        /* A zero length means "the whole linear head". */
        if (!len)
                len = skb_headlen(skb);

        return sk_skb_try_make_writable(skb, len);
}

/* Descriptor for the sk_skb_pull_data helper: integer return; the two
 * argument slots follow the BPF_CALL_2 parameter order (skb context,
 * length).
 */
static const struct bpf_func_proto sk_skb_pull_data_proto = {
        .func                = sk_skb_pull_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/*
 * Helper: update the 16-bit checksum stored at @offset in the packet.
 * The low bits of @flags (BPF_F_HDR_FIELD_MASK) select the mode: 0 applies
 * @to as a checksum difference (and requires @from to be 0), 2 and 4
 * replace a field of that many bytes going from @from to @to.
 * Returns 0 on success, -EINVAL for bad flags or mode, -EFAULT for a bad
 * offset or when the checksum cannot be made writable.
 */
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        __sum16 *csum;

        if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
                return -EINVAL;
        /* The checksum must sit at an even offset within 64K. */
        if (unlikely(offset > 0xffff || offset & 1))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum))))
                return -EFAULT;

        csum = (__sum16 *)(skb->data + offset);

        switch (flags & BPF_F_HDR_FIELD_MASK) {
        case 4:
                csum_replace4(csum, from, to);
                break;
        case 2:
                csum_replace2(csum, from, to);
                break;
        case 0:
                if (unlikely(from != 0))
                        return -EINVAL;

                csum_replace_by_diff(csum, to);
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/* Prototype descriptor for bpf_l3_csum_replace(): context pointer plus
 * four unconstrained scalars, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
        .func      = bpf_l3_csum_replace,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Patch the 16-bit transport checksum stored at @offset in the packet.
 *
 * The low bits of @flags (BPF_F_HDR_FIELD_MASK) select the mode exactly
 * as for the L3 variant (0 = diff in @to, 2 or 4 = field width). The
 * remaining accepted flags are BPF_F_PSEUDO_HDR, BPF_F_MARK_MANGLED_0
 * and BPF_F_MARK_ENFORCE.
 *
 * Returns 0 on success, -EINVAL for unknown flags/mode or a non-zero
 * @from in mode 0, -EFAULT for a bad @offset or when the packet could
 * not be made writable.
 */
BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
           u64, from, u64, to, u64, flags)
{
        bool pseudo_hdr = flags & BPF_F_PSEUDO_HDR;
        bool mangle_zero = flags & BPF_F_MARK_MANGLED_0;
        bool force_mark = flags & BPF_F_MARK_ENFORCE;
        __sum16 *csum_field;

        if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
                               BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
                return -EINVAL;
        /* The checksum must sit at an even offset within 64K. */
        if (unlikely(offset > 0xffff || offset & 1))
                return -EFAULT;
        if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*csum_field))))
                return -EFAULT;

        csum_field = (__sum16 *)(skb->data + offset);

        /* With MARK_MANGLED_0 but without MARK_ENFORCE, a checksum that
         * is currently zero is left untouched.
         */
        if (mangle_zero && !force_mark && !*csum_field)
                return 0;

        switch (flags & BPF_F_HDR_FIELD_MASK) {
        case 4:
                inet_proto_csum_replace4(csum_field, skb, from, to,
                                         pseudo_hdr);
                break;
        case 2:
                inet_proto_csum_replace2(csum_field, skb, from, to,
                                         pseudo_hdr);
                break;
        case 0:
                if (unlikely(from != 0))
                        return -EINVAL;

                inet_proto_csum_replace_by_diff(csum_field, skb, to,
                                                pseudo_hdr);
                break;
        default:
                return -EINVAL;
        }

        /* A result of zero is stored as CSUM_MANGLED_0 when requested. */
        if (mangle_zero && !*csum_field)
                *csum_field = CSUM_MANGLED_0;

        return 0;
}

/* Prototype descriptor for bpf_l4_csum_replace(): context pointer plus
 * four unconstrained scalars, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
        .func      = bpf_l4_csum_replace,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Compute a checksum difference between the @from and @to buffers,
 * starting from @seed, and return it folded to 16 bits.
 *
 * Typical combinations:
 *
 *   from_size == 0, to_size > 0,  seed := csum --> data is pushed
 *   from_size > 0,  to_size == 0, seed := csum --> data is pulled
 *   from_size > 0,  to_size > 0,  seed := 0    --> data is diffed
 *
 * @from_size and @to_size may differ even when diffing. If both sizes
 * are zero the folded @seed is returned unchanged.
 */
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
           __be32 *, to, u32, to_size, __wsum, seed)
{
        __wsum sum = seed;

        if (to_size) {
                /* Add the new data on top of the seed ... */
                sum = csum_partial(to, to_size, sum);
                /* ... and, when diffing, take the old data back out. */
                if (from_size)
                        sum = csum_sub(sum,
                                       csum_partial(from, from_size, 0));
        } else if (from_size) {
                /* Pull only: subtract the old data from the seed. */
                sum = ~csum_partial(from, from_size, ~sum);
        }

        return csum_from32to16((__force unsigned int)sum);
}

/* Prototype descriptor for bpf_csum_diff(): two (pointer, size) pairs,
 * each a nullable read-only memory pointer with a size that may be
 * zero, followed by an unconstrained seed. Packet access is enabled.
 */
static const struct bpf_func_proto bpf_csum_diff_proto = {
        .func       = bpf_csum_diff,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_INTEGER,
        .arg1_type  = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg2_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg3_type  = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg4_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg5_type  = ARG_ANYTHING,
};

/* Fold @csum into skb->csum and return the new value. Only done when
 * skb->ip_summed is CHECKSUM_COMPLETE; otherwise -ENOTSUPP is returned
 * and the skb is left alone.
 */
BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum)
{
        /* Meant to be paired with bpf_csum_diff() when the program
         * writes packet data directly. Rotating the csum for alignment
         * and emulating csum_sub() are left to the eBPF program.
         */
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return -ENOTSUPP;

        skb->csum = csum_add(skb->csum, csum);

        return skb->csum;
}

/* Prototype descriptor for bpf_csum_update(): context pointer plus an
 * unconstrained scalar, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_csum_update_proto = {
        .func      = bpf_csum_update,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Adjust or query the CHECKSUM_UNNECESSARY level of @skb.
 *
 * @level is one of BPF_CSUM_LEVEL_{INC,DEC,RESET,QUERY}. The three
 * modifying operations return 0. QUERY returns skb->csum_level, or
 * -EACCES when skb->ip_summed is not CHECKSUM_UNNECESSARY. Any other
 * @level yields -EINVAL.
 */
BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
{
        /* Intended companion of bpf_skb_adjust_room() for header
         * encap/decap, e.g. when BPF_F_ADJ_ROOM_NO_CSUM_RESET was
         * passed in its flags.
         */
        if (level == BPF_CSUM_LEVEL_QUERY) {
                if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                        return -EACCES;

                return skb->csum_level;
        }

        switch (level) {
        case BPF_CSUM_LEVEL_RESET:
                __skb_reset_checksum_unnecessary(skb);
                break;
        case BPF_CSUM_LEVEL_DEC:
                __skb_decr_checksum_unnecessary(skb);
                break;
        case BPF_CSUM_LEVEL_INC:
                __skb_incr_checksum_unnecessary(skb);
                break;
        default:
                return -EINVAL;
        }

        return 0;
}

/* Prototype descriptor for bpf_csum_level(): context pointer plus an
 * unconstrained scalar, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_csum_level_proto = {
        .func      = bpf_csum_level,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Receive-side redirect for devices that carry a MAC header: hand the
 * skb to dev_forward_skb_nomtu() and pass its result back.
 */
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc = dev_forward_skb_nomtu(dev, skb);

        return rc;
}

/* Receive-side redirect for devices without a MAC header: run
 * ____dev_forward_skb() and, if that succeeds, retarget the skb to
 * @dev and queue it through netif_rx().
 *
 * Returns the error from ____dev_forward_skb() if non-zero, otherwise
 * the netif_rx() result.
 */
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
                                      struct sk_buff *skb)
{
        int err = ____dev_forward_skb(dev, skb, false);

        if (unlikely(err))
                return err;

        skb->dev = dev;

        return netif_rx(skb);
}

/* Transmit-side redirect: queue @skb on @dev via dev_queue_xmit().
 *
 * If dev_xmit_recursion() reports that the recursion limit is hit, the
 * skb is freed and -ENETDOWN is returned. Otherwise the
 * dev_queue_xmit() result is returned.
 */
static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
        int rc;

        if (!dev_xmit_recursion()) {
                skb->dev = dev;
                skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
                skb_clear_tstamp(skb);

                /* Track nesting depth around the actual transmit. */
                dev_xmit_recursion_inc();
                rc = dev_queue_xmit(skb);
                dev_xmit_recursion_dec();

                return rc;
        }

        net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
        kfree_skb(skb);

        return -ENETDOWN;
}

/* Redirect @skb to a device that does not transmit a MAC header.
 *
 * Everything in front of the network header is pulled off, then the
 * skb is handed to the rx or tx path depending on BPF_F_INGRESS in
 * @flags. An skb with no bytes beyond the network offset is freed and
 * -ERANGE is returned.
 */
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        unsigned int l2_len = skb_network_offset(skb);

        if (unlikely(skb->len <= l2_len)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        if (l2_len) {
                __skb_pull(skb, l2_len);

                /* On ingress the mac header was already pulled once. On
                 * egress the skb may have originated from ingress (a
                 * forwarded skb), so skb_postpull_rcsum() is needed to
                 * make rcsum start at the network header.
                 */
                if (!skb_at_tc_ingress(skb))
                        skb_postpull_rcsum(skb, skb_mac_header(skb), l2_len);
        }

        skb_pop_mac_header(skb);
        skb_reset_mac_len(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb_no_mac(dev, skb);

        return __bpf_tx_skb(dev, skb);
}

/* Redirect @skb to a device that transmits a MAC header.
 *
 * The skb must carry a link-layer header (mac_header in front of
 * network_header) and must not be empty; otherwise it is freed and
 * -ERANGE is returned. BPF_F_INGRESS in @flags selects the rx path,
 * otherwise the tx path is used.
 */
static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
                                 u32 flags)
{
        bool no_l2_hdr = skb->mac_header >= skb->network_header;

        if (unlikely(no_l2_hdr || skb->len == 0)) {
                kfree_skb(skb);
                return -ERANGE;
        }

        bpf_push_mac_rcsum(skb);

        if (flags & BPF_F_INGRESS)
                return __bpf_rx_skb(dev, skb);

        return __bpf_tx_skb(dev, skb);
}

/* Dispatch a redirect according to whether dev_is_mac_header_xmit()
 * says @dev transmits a MAC header.
 */
static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
                          u32 flags)
{
        if (!dev_is_mac_header_xmit(dev))
                return __bpf_redirect_no_mac(skb, dev, flags);

        return __bpf_redirect_common(skb, dev, flags);
}

#if IS_ENABLED(CONFIG_IPV6)
static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 hh_len = LL_RESERVED_SPACE(dev);
        const struct in6_addr *nexthop;
        struct dst_entry *dst = NULL;
        struct neighbour *neigh;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb_clear_tstamp(skb);

        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb)
                        return -ENOMEM;
        }

        rcu_read_lock();
        if (!nh) {
                dst = skb_dst(skb);
                nexthop = rt6_nexthop(dst_rt6_info(dst),
                                      &ipv6_hdr(skb)->daddr);
        } else {
                nexthop = &nh->ipv6_nh;
        }
        neigh = ip_neigh_gw6(dev, nexthop);
        if (likely(!IS_ERR(neigh))) {
                int ret;

                sock_confirm_neigh(skb, neigh);
                local_bh_disable();
                dev_xmit_recursion_inc();
                ret = neigh_output(neigh, skb, false);
                dev_xmit_recursion_dec();
                local_bh_enable();
                rcu_read_unlock();
                return ret;
        }
        rcu_read_unlock();
        if (dst)
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct ipv6hdr *ip6h = ipv6_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;

        if (!nh) {
                struct dst_entry *dst;
                struct flowi6 fl6 = {
                        .flowi6_flags = FLOWI_FLAG_ANYSRC,
                        .flowi6_mark  = skb->mark,
                        .flowlabel    = ip6_flowinfo(ip6h),
                        .flowi6_oif   = dev->ifindex,
                        .flowi6_proto = ip6h->nexthdr,
                        .daddr              = ip6h->daddr,
                        .saddr              = ip6h->saddr,
                };

                dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
                if (IS_ERR(dst))
                        goto out_drop;

                skb_dst_set(skb, dst);
        } else if (nh->nh_family != AF_INET6) {
                goto out_drop;
        }

        err = bpf_out_neigh_v6(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                DEV_STATS_INC(dev, tx_errors);
        else
                ret = NET_XMIT_SUCCESS;
        goto out_xmit;
out_drop:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
out_xmit:
        return ret;
}
#else
/* Fallback used when CONFIG_IPV6 is not enabled: free the skb and
 * report a drop.
 */
static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        int verdict = NET_XMIT_DROP;

        kfree_skb(skb);

        return verdict;
}
#endif /* CONFIG_IPV6 */

#if IS_ENABLED(CONFIG_INET)
/* Resolve the neighbour for an IPv4 @skb on @dev and transmit through
 * it.
 *
 * Without @nh the neighbour comes from the rtable attached to the skb.
 * With @nh, nh->nh_family picks an IPv6 or IPv4 gateway lookup; any
 * other family drops the packet.
 *
 * Returns the neigh_output() result, -ENOMEM if headroom expansion
 * fails, or -ENETDOWN on recursion limit, unsupported next-hop family
 * or neighbour lookup failure.
 */
static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
                            struct net_device *dev, struct bpf_nh_params *nh)
{
        u32 needed_headroom = LL_RESERVED_SPACE(dev);
        bool via_v6_gw = false;
        struct neighbour *n;
        int rc;

        if (dev_xmit_recursion()) {
                net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
                goto out_drop;
        }

        skb->dev = dev;
        skb_clear_tstamp(skb);

        if (unlikely(skb_headroom(skb) < needed_headroom && dev->header_ops)) {
                skb = skb_expand_head(skb, needed_headroom);
                if (!skb)
                        return -ENOMEM;
        }

        rcu_read_lock();
        if (!nh) {
                n = ip_neigh_for_gw(skb_rtable(skb), skb, &via_v6_gw);
        } else {
                switch (nh->nh_family) {
                case AF_INET6:
                        n = ip_neigh_gw6(dev, &nh->ipv6_nh);
                        via_v6_gw = true;
                        break;
                case AF_INET:
                        n = ip_neigh_gw4(dev, nh->ipv4_nh);
                        break;
                default:
                        rcu_read_unlock();
                        goto out_drop;
                }
        }

        if (unlikely(IS_ERR(n))) {
                rcu_read_unlock();
                goto out_drop;
        }

        sock_confirm_neigh(skb, n);
        local_bh_disable();
        dev_xmit_recursion_inc();
        rc = neigh_output(n, skb, via_v6_gw);
        dev_xmit_recursion_dec();
        local_bh_enable();
        rcu_read_unlock();

        return rc;

out_drop:
        kfree_skb(skb);
        return -ENETDOWN;
}

static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        const struct iphdr *ip4h = ip_hdr(skb);
        struct net *net = dev_net(dev);
        int err, ret = NET_XMIT_DROP;

        if (!nh) {
                struct flowi4 fl4 = {
                        .flowi4_flags = FLOWI_FLAG_ANYSRC,
                        .flowi4_mark  = skb->mark,
                        .flowi4_tos   = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
                        .flowi4_oif   = dev->ifindex,
                        .flowi4_proto = ip4h->protocol,
                        .daddr              = ip4h->daddr,
                        .saddr              = ip4h->saddr,
                };
                struct rtable *rt;

                rt = ip_route_output_flow(net, &fl4, NULL);
                if (IS_ERR(rt))
                        goto out_drop;
                if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
                        ip_rt_put(rt);
                        goto out_drop;
                }

                skb_dst_set(skb, &rt->dst);
        }

        err = bpf_out_neigh_v4(net, skb, dev, nh);
        if (unlikely(net_xmit_eval(err)))
                DEV_STATS_INC(dev, tx_errors);
        else
                ret = NET_XMIT_SUCCESS;
        goto out_xmit;
out_drop:
        DEV_STATS_INC(dev, tx_errors);
        kfree_skb(skb);
out_xmit:
        return ret;
}
#else
/* Fallback used when CONFIG_INET is not enabled: free the skb and
 * report a drop.
 */
static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
                                   struct bpf_nh_params *nh)
{
        int verdict = NET_XMIT_DROP;

        kfree_skb(skb);

        return verdict;
}
#endif /* CONFIG_INET */

/* Neighbour-based redirect entry point.
 *
 * Requires a link-layer header in front of the network header and a
 * non-multicast Ethernet destination. The Ethernet header is stripped
 * and the skb handed to the IPv4 or IPv6 path based on skb->protocol.
 * In every other case the skb is freed and -ENOTSUPP is returned.
 */
static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
                                struct bpf_nh_params *nh)
{
        struct ethhdr *l2 = eth_hdr(skb);

        if (unlikely(skb->mac_header >= skb->network_header))
                goto out;

        bpf_push_mac_rcsum(skb);
        if (is_multicast_ether_addr(l2->h_dest))
                goto out;

        /* Drop the Ethernet header; data now starts at the L3 header. */
        skb_pull(skb, sizeof(*l2));
        skb_unset_mac_header(skb);
        skb_reset_network_header(skb);

        if (skb->protocol == htons(ETH_P_IP))
                return __bpf_redirect_neigh_v4(skb, dev, nh);

        if (skb->protocol == htons(ETH_P_IPV6))
                return __bpf_redirect_neigh_v6(skb, dev, nh);

out:
        kfree_skb(skb);
        return -ENOTSUPP;
}

/* Redirect flags that are internal to this file and not exposed to
 * BPF programs.
 */
enum {
        BPF_F_NEIGH   = (1ULL << 16),
        BPF_F_PEER    = (1ULL << 17),
        BPF_F_NEXTHOP = (1ULL << 18),
};

/* All internal redirect flags OR-ed together. */
#define BPF_F_REDIRECT_INTERNAL        (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)

/* Clone @skb and redirect the clone to the device with index @ifindex.
 *
 * Only BPF_F_INGRESS is accepted in @flags. Returns -EINVAL for bad
 * flags or an unknown ifindex, -ENOMEM if cloning or un-cloning fails,
 * otherwise the __bpf_redirect() result for the clone.
 */
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
        struct net_device *target;
        struct sk_buff *copy;

        BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);

        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return -EINVAL;

        target = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
        if (unlikely(!target))
                return -EINVAL;

        copy = skb_clone(skb, GFP_ATOMIC);
        if (unlikely(!copy))
                return -ENOMEM;

        /* Direct packet writes rely on the skb the program works with
         * being uncloned. Cloning above broke that, so restore it for
         * the original; if that fails, release the fresh clone.
         */
        if (unlikely(bpf_try_make_head_writable(skb))) {
                kfree_skb(copy);
                return -ENOMEM;
        }

        return __bpf_redirect(copy, target, flags);
}

/* Prototype descriptor for bpf_clone_redirect(): context pointer plus
 * two unconstrained scalars, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_clone_redirect_proto = {
        .func      = bpf_clone_redirect,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
};

/* Return the peer of @dev as reported by its ndo_get_peer_dev()
 * callback, or NULL when the device does not implement the callback.
 */
static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (unlikely(!ops->ndo_get_peer_dev))
                return NULL;

        return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
                               netkit_peer_dev, dev);
}

/* Carry out the redirect previously recorded in the per-context
 * bpf_redirect_info.
 *
 * The recorded target index and flags are consumed (reset to zero).
 * For BPF_F_PEER the skb is retargeted to the peer device and -EAGAIN
 * is returned; this is only allowed at tc ingress, and the peer must
 * be up and in a different netns. For BPF_F_NEIGH the neighbour
 * redirect path is taken, otherwise the plain redirect path.
 *
 * On the failure paths handled here the skb is freed and -EINVAL is
 * returned.
 */
int skb_do_redirect(struct sk_buff *skb)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct net *net = dev_net(skb->dev);
        struct net_device *dev;
        u32 flags = ri->flags;

        dev = dev_get_by_index_rcu(net, ri->tgt_index);
        ri->tgt_index = 0;
        ri->flags = 0;
        if (unlikely(!dev))
                goto out_drop;

        if (flags & BPF_F_PEER) {
                if (unlikely(!skb_at_tc_ingress(skb)))
                        goto out_drop;

                dev = skb_get_peer_dev(dev);
                if (unlikely(!dev))
                        goto out_drop;
                if (unlikely(!(dev->flags & IFF_UP)))
                        goto out_drop;
                if (unlikely(net_eq(net, dev_net(dev))))
                        goto out_drop;

                skb->dev = dev;
                dev_sw_netstats_rx_add(dev, skb->len);
                return -EAGAIN;
        }

        if (flags & BPF_F_NEIGH) {
                struct bpf_nh_params *nh = NULL;

                if (flags & BPF_F_NEXTHOP)
                        nh = &ri->nh;

                return __bpf_redirect_neigh(skb, dev, nh);
        }

        return __bpf_redirect(skb, dev, flags);

out_drop:
        kfree_skb(skb);
        return -EINVAL;
}

BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
                return TC_ACT_SHOT;

        ri->flags = flags;
        ri->tgt_index = ifindex;

        return TC_ACT_REDIRECT;
}

/* Prototype descriptor for bpf_redirect(): two unconstrained scalars,
 * integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_redirect_proto = {
        .func      = bpf_redirect,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_ANYTHING,
        .arg2_type = ARG_ANYTHING,
};

/* Record a peer redirect to @ifindex in the per-context redirect info.
 *
 * @flags must be zero, otherwise TC_ACT_SHOT is returned without
 * touching the redirect info. On success TC_ACT_REDIRECT is returned.
 */
BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (unlikely(flags != 0))
                return TC_ACT_SHOT;

        ri->tgt_index = ifindex;
        ri->flags = BPF_F_PEER;

        return TC_ACT_REDIRECT;
}

/* Prototype descriptor for bpf_redirect_peer(): two unconstrained
 * scalars, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_redirect_peer_proto = {
        .func      = bpf_redirect_peer,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_ANYTHING,
        .arg2_type = ARG_ANYTHING,
};

/* Record a neighbour redirect to @ifindex in the per-context redirect
 * info.
 *
 * A non-zero @plen means @params supplies explicit next-hop data, which
 * is copied into the redirect info and flagged with BPF_F_NEXTHOP.
 * @flags must be zero and a non-zero @plen must cover the whole
 * structure; otherwise TC_ACT_SHOT is returned. On success
 * TC_ACT_REDIRECT is returned.
 */
BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
           int, plen, u64, flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        /* The memcpy() below relies on both layouts having equal size. */
        BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));

        if (unlikely((plen && plen < sizeof(*params)) || flags))
                return TC_ACT_SHOT;

        ri->tgt_index = ifindex;
        ri->flags = BPF_F_NEIGH;
        if (plen) {
                ri->flags |= BPF_F_NEXTHOP;
                memcpy(&ri->nh, params, sizeof(ri->nh));
        }

        return TC_ACT_REDIRECT;
}

/* Prototype descriptor for bpf_redirect_neigh(): an unconstrained
 * scalar, a nullable read-only memory pointer with its size (which may
 * be zero), and a trailing unconstrained scalar.
 */
static const struct bpf_func_proto bpf_redirect_neigh_proto = {
        .func      = bpf_redirect_neigh,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_ANYTHING,
        .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type = ARG_ANYTHING,
};

/* Store @bytes in msg->apply_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
        u32 requested = bytes;

        msg->apply_bytes = requested;

        return 0;
}

/* Prototype descriptor for bpf_msg_apply_bytes(): context pointer plus
 * an unconstrained scalar, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
        .func      = bpf_msg_apply_bytes,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Store @bytes in msg->cork_bytes. Always returns 0. */
BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
        u32 requested = bytes;

        msg->cork_bytes = requested;

        return 0;
}

/* Re-derive msg->sg.curr and msg->sg.copybreak after the scatterlist
 * ring was modified.
 *
 * For an empty message (sg.size == 0), curr is set to sg.start and
 * copybreak to 0. Otherwise curr is set to the element just before
 * sg.end and copybreak to that element's length.
 */
static void sk_msg_reset_curr(struct sk_msg *msg)
{
        u32 last;

        if (!msg->sg.size) {
                msg->sg.copybreak = 0;
                msg->sg.curr = msg->sg.start;
                return;
        }

        last = msg->sg.end;
        sk_msg_iter_var_prev(last);

        msg->sg.curr = last;
        msg->sg.copybreak = msg->sg.data[last].length;
}

/* Prototype descriptor for bpf_msg_cork_bytes(): context pointer plus
 * an unconstrained scalar, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
        .func      = bpf_msg_cork_bytes,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
};

/* Make message bytes [start, end) reachable through msg->data and
 * msg->data_end.
 *
 * If the range already lies inside one scatterlist element whose bit
 * is not set in msg->sg.copy, only the data pointers are updated.
 * Otherwise every element touched by the range is copied into a newly
 * allocated page, that page replaces the first touched element, and
 * the remaining ring entries are shifted down to close the gap.
 *
 * Returns 0 on success, -EINVAL if @flags is non-zero, @end <= @start,
 * or the range is not fully covered by the message, and -ENOMEM if the
 * page allocation fails.
 */
BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
           u32, end, u64, flags)
{
        u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
        u32 first_sge, last_sge, i, shift, bytes_sg_total;
        struct scatterlist *sge;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags || end <= start))
                return -EINVAL;

        /* First find the starting scatterlist element. On exit, offset
         * is the message offset at which element i begins and len is
         * that element's length.
         */
        i = msg->sg.start;
        do {
                offset += len;
                len = sk_msg_elem(msg, i)->length;
                if (start < offset + len)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* The walk ran off the end of the ring without reaching start. */
        if (unlikely(start >= offset + len))
                return -EINVAL;

        first_sge = i;
        /* The start may point into the sg element so we need to also
         * account for the headroom.
         */
        bytes_sg_total = start - offset + bytes;
        /* Fast path: range fits in this element and it is not marked
         * in sg.copy, so no copy is needed.
         */
        if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
                goto out;

        /* At this point we need to linearize multiple scatterlist
         * elements or a single shared page. Either way we need to
         * copy into a linear buffer exclusively owned by BPF. Then
         * place the buffer in the scatterlist and fixup the original
         * entries by removing the entries now in the linear buffer
         * and shifting the remaining entries. For now we do not try
         * to copy partial entries to avoid complexity of running out
         * of sg_entry slots. The downside is reading a single byte
         * will copy the entire sg entry.
         */
        do {
                copy += sk_msg_elem(msg, i)->length;
                sk_msg_iter_var_next(i);
                if (bytes_sg_total <= copy)
                        break;
        } while (i != msg->sg.end);
        /* last_sge is one past the final element to be linearized. */
        last_sge = i;

        /* The requested range extends beyond the data in the ring. */
        if (unlikely(bytes_sg_total > copy))
                return -EINVAL;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy));
        if (unlikely(!page))
                return -ENOMEM;

        /* Copy each element in [first_sge, last_sge) back to back into
         * the new page, emptying the element and dropping its page
         * reference as we go.
         */
        raw = page_address(page);
        i = first_sge;
        do {
                sge = sk_msg_elem(msg, i);
                from = sg_virt(sge);
                len = sge->length;
                to = raw + poffset;

                memcpy(to, from, len);
                poffset += len;
                sge->length = 0;
                put_page(sg_page(sge));

                sk_msg_iter_var_next(i);
        } while (i != last_sge);

        /* The first slot now holds the whole linearized region. */
        sg_set_page(&msg->sg.data[first_sge], page, copy, 0);

        /* To repair sg ring we need to shift entries. If we only
         * had a single entry though we can just replace it and
         * be done. Otherwise walk the ring and shift the entries.
         */
        WARN_ON_ONCE(last_sge == first_sge);
        /* Number of emptied slots after first_sge; the second form
         * handles last_sge having wrapped around the ring.
         */
        shift = last_sge > first_sge ?
                last_sge - first_sge - 1 :
                NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
        if (!shift)
                goto out;

        /* Move every surviving entry down by shift slots, clearing the
         * slot it came from, until the source index reaches sg.end.
         */
        i = first_sge;
        sk_msg_iter_var_next(i);
        do {
                u32 move_from;

                if (i + shift >= NR_MSG_FRAG_IDS)
                        move_from = i + shift - NR_MSG_FRAG_IDS;
                else
                        move_from = i + shift;
                if (move_from == msg->sg.end)
                        break;

                msg->sg.data[i] = msg->sg.data[move_from];
                msg->sg.data[move_from].length = 0;
                msg->sg.data[move_from].page_link = 0;
                msg->sg.data[move_from].offset = 0;
                sk_msg_iter_var_next(i);
        } while (1);

        /* Pull sg.end back by shift; the comparison detects unsigned
         * underflow and wraps the index around the ring.
         */
        msg->sg.end = msg->sg.end - shift > msg->sg.end ?
                      msg->sg.end - shift + NR_MSG_FRAG_IDS :
                      msg->sg.end - shift;
out:
        sk_msg_reset_curr(msg);
        /* offset still refers to the start of first_sge within the
         * message, so start - offset is the position inside it.
         */
        msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
        msg->data_end = msg->data + bytes;
        return 0;
}

/* Prototype descriptor for bpf_msg_pull_data(): context pointer plus
 * three unconstrained scalars, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_msg_pull_data_proto = {
        .func      = bpf_msg_pull_data,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_ANYTHING,
};

/* Insert @len bytes of room into the message at byte offset @start.
 *
 * A new page of (at least) @len bytes is allocated and linked into the
 * scatterlist ring at the insertion point. When @start falls inside an
 * element, that element is split into a front part (kept in place) and
 * a tail part (re-inserted after the new page). When the ring has too
 * few free slots for that, the affected element is instead copied into
 * the new page around the inserted room.
 *
 * Returns 0 on success (also when @len is 0, in which case nothing is
 * done), -EINVAL if @flags is non-zero or @start lies beyond the end
 * of the message, and -ENOMEM if the page allocation fails.
 */
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
        u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
        u8 *raw, *to, *from;
        struct page *page;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element. On exit, offset
         * is the message offset at which the last visited element
         * begins and l is that element's length.
         */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* start == offset + l (append at the very end) is allowed. */
        if (start > offset + l)
                return -EINVAL;

        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        /* If no space available will fallback to copy, we need at
         * least one scatterlist elem available to push data into
         * when start aligns to the beginning of an element or two
         * when it falls inside an element. We handle the start equals
         * offset case because it's the common case for inserting a
         * header.
         */
        if (!space || (space == 1 && start != offset))
                copy = msg->sg.data[i].length;

        page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
                           get_order(copy + len));
        if (unlikely(!page))
                return -ENOMEM;

        if (copy) {
                int front, back;

                raw = page_address(page);

                if (i == msg->sg.end)
                        sk_msg_iter_var_prev(i);
                psge = sk_msg_elem(msg, i);
                front = start - offset;
                back = psge->length - front;
                from = sg_virt(psge);

                /* Layout in the new page: front | len bytes | back */
                if (front)
                        memcpy(raw, from, front);

                if (back) {
                        from += front;
                        to = raw + front + len;

                        memcpy(to, from, back);
                }

                /* The old element's page is replaced by the new one. */
                put_page(sg_page(psge));
                new = i;
                goto place_new;
        }

        if (start - offset) {
                /* Insertion point is inside an element: split it. psge
                 * keeps the front part in place, rsge describes the
                 * tail and is re-inserted after the new page below.
                 */
                if (i == msg->sg.end)
                        sk_msg_iter_var_prev(i);
                psge = sk_msg_elem(msg, i);
                rsge = sk_msg_elem_cpy(msg, i);

                psge->length = start - offset;
                rsge.length -= psge->length;
                /* The sg offset is relative to the element's page, so
                 * the tail begins psge->length bytes after the front.
                 * Advancing by the message-wide 'start' would only be
                 * right when the split element is the first one.
                 */
                rsge.offset += psge->length;

                sk_msg_iter_var_next(i);
                sg_unmark_end(psge);
                sg_unmark_end(&rsge);
        }

        /* Slot(s) to place newly allocated data */
        sk_msg_iter_next(msg, end);
        new = i;
        sk_msg_iter_var_next(i);

        if (i == msg->sg.end) {
                /* Nothing behind the insertion point needs shifting;
                 * only reserve one more slot if a tail piece exists.
                 */
                if (!rsge.length)
                        goto place_new;
                sk_msg_iter_next(msg, end);
                goto place_new;
        }

        /* Shift one or two slots as needed */
        sge = sk_msg_elem_cpy(msg, new);
        sg_unmark_end(&sge);

        nsge = sk_msg_elem_cpy(msg, i);
        if (rsge.length) {
                sk_msg_iter_var_next(i);
                nnsge = sk_msg_elem_cpy(msg, i);
                sk_msg_iter_next(msg, end);
        }

        /* Ripple the saved entries towards sg.end, carrying one entry
         * (or two when a tail piece must also fit) ahead of the write
         * position.
         */
        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sge = nsge;
                sk_msg_iter_var_next(i);
                if (rsge.length) {
                        nsge = nnsge;
                        nnsge = sk_msg_elem_cpy(msg, i);
                } else {
                        nsge = sk_msg_elem_cpy(msg, i);
                }
        }

place_new:
        /* Place newly allocated data buffer */
        sk_mem_charge(msg->sk, len);
        msg->sg.size += len;
        __clear_bit(new, msg->sg.copy);
        sg_set_page(&msg->sg.data[new], page, len + copy, 0);
        if (rsge.length) {
                /* The tail shares its page with the front part, so it
                 * takes its own page reference.
                 */
                get_page(sg_page(&rsge));
                sk_msg_iter_var_next(new);
                msg->sg.data[new] = rsge;
        }

        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Prototype descriptor for bpf_msg_push_data(): context pointer plus
 * three unconstrained scalars, integer return, not marked GPL-only.
 */
static const struct bpf_func_proto bpf_msg_push_data_proto = {
        .func      = bpf_msg_push_data,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
        .arg4_type = ARG_ANYTHING,
};

/* Remove ring element @i: drop the reference on its page, move every
 * following entry (including the one at sg.end) one slot towards the
 * start, then step sg.end back by one.
 */
static void sk_msg_shift_left(struct sk_msg *msg, int i)
{
        int dst = i;

        put_page(sg_page(sk_msg_elem(msg, i)));

        for (;;) {
                sk_msg_iter_var_next(i);
                msg->sg.data[dst] = msg->sg.data[i];
                if (i == msg->sg.end)
                        break;
                dst = i;
        }

        sk_msg_iter_prev(msg, end);
}

static void sk_msg_shift_right(struct sk_msg *msg, int i)
{
        struct scatterlist tmp, sge;

        sk_msg_iter_next(msg, end);
        sge = sk_msg_elem_cpy(msg, i);
        sk_msg_iter_var_next(i);
        tmp = sk_msg_elem_cpy(msg, i);

        while (i != msg->sg.end) {
                msg->sg.data[i] = sge;
                sk_msg_iter_var_next(i);
                sge = tmp;
                tmp = sk_msg_elem_cpy(msg, i);
        }
}

/* BPF helper: remove @len bytes starting at byte offset @start from @msg.
 *
 * @msg:   message whose scatterlist ring is edited in place
 * @start: byte offset of the first byte to remove
 * @len:   number of bytes to remove
 * @flags: must be zero
 *
 * Returns 0 on success (and immediately when @len is 0), -EINVAL when @flags
 * is non-zero or [@start, @start + @len) is not fully inside the message,
 * and -ENOMEM when the compaction page cannot be allocated.
 */
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
           u32, len, u64, flags)
{
        u32 i = 0, l = 0, space, offset = 0;
        /* Widen before adding: start and len are both u32, so a plain
         * 'start + len' would wrap in 32 bits and let an oversized len
         * slip past the bounds check below.
         */
        u64 last = (u64)start + len;
        int pop;

        if (unlikely(flags))
                return -EINVAL;

        if (unlikely(len == 0))
                return 0;

        /* First find the starting scatterlist element */
        i = msg->sg.start;
        do {
                offset += l;
                l = sk_msg_elem(msg, i)->length;

                if (start < offset + l)
                        break;
                sk_msg_iter_var_next(i);
        } while (i != msg->sg.end);

        /* Bounds checks: start and pop must be inside message */
        if (start >= offset + l || last > msg->sg.size)
                return -EINVAL;

        /* Number of ring slots still unused */
        space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);

        pop = len;
        /* --------------| offset
         * -| start      |-------- len -------|
         *
         *  |----- a ----|-------- pop -------|----- b ----|
         *  |______________________________________________| length
         *
         *
         * a:   region at front of scatter element to save
         * b:   region at back of scatter element to save when length > a + pop
         * pop: region to pop from element, same as input 'pop' here will be
         *      decremented below per iteration.
         *
         * Two top-level cases to handle when start != offset, first b is non
         * zero and second b is zero corresponding to when a pop includes more
         * than one element.
         *
         * Then if b is non-zero AND there is no space allocate space and
         * compact a, b regions into page. If there is space shift ring to
         * the right freeing the next element in ring to place b, leaving
         * a untouched except to reduce length.
         */
        if (start != offset) {
                struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
                int a = start - offset;
                int b = sge->length - pop - a;

                sk_msg_iter_var_next(i);

                if (b > 0) {
                        if (space) {
                                /* Split in place: keep a in the current slot
                                 * and point a new slot at b on the same page,
                                 * which therefore needs an extra reference.
                                 */
                                sge->length = a;
                                sk_msg_shift_right(msg, i);
                                nsge = sk_msg_elem(msg, i);
                                get_page(sg_page(sge));
                                sg_set_page(nsge,
                                            sg_page(sge),
                                            b, sge->offset + pop + a);
                        } else {
                                struct page *page, *orig;
                                u8 *to, *from;

                                /* Ring is full: copy a and b back to back
                                 * into a fresh page and swap it in.
                                 */
                                page = alloc_pages(__GFP_NOWARN |
                                                   __GFP_COMP   | GFP_ATOMIC,
                                                   get_order(a + b));
                                if (unlikely(!page))
                                        return -ENOMEM;

                                orig = sg_page(sge);
                                from = sg_virt(sge);
                                to = page_address(page);
                                memcpy(to, from, a);
                                memcpy(to + a, from + a + pop, b);
                                sg_set_page(sge, page, a + b, 0);
                                put_page(orig);
                        }
                        pop = 0;
                } else {
                        /* Pop reaches the end of this element: trim it to a
                         * and carry the remainder into the loop below.
                         */
                        pop -= (sge->length - a);
                        sge->length = a;
                }
        }

        /* From above the current layout _must_ be as follows,
         *
         * -| offset
         * -| start
         *
         *  |---- pop ---|---------------- b ------------|
         *  |____________________________________________| length
         *
         * Offset and start of the current msg elem are equal because in the
         * previous case we handled offset != start and either consumed the
         * entire element and advanced to the next element OR pop == 0.
         *
         * Two cases to handle here are first pop is less than the length
         * leaving some remainder b above. Simply adjust the element's layout
         * in this case. Or pop >= length of the element so that b = 0. In this
         * case advance to next element decrementing pop.
         */
        while (pop) {
                struct scatterlist *sge = sk_msg_elem(msg, i);

                if (pop < sge->length) {
                        sge->length -= pop;
                        sge->offset += pop;
                        pop = 0;
                } else {
                        pop -= sge->length;
                        sk_msg_shift_left(msg, i);
                }
        }

        sk_mem_uncharge(msg->sk, len - pop);
        msg->sg.size -= (len - pop);
        sk_msg_reset_curr(msg);
        sk_msg_compute_data_pointers(msg);
        return 0;
}

/* Helper descriptor for bpf_msg_pop_data(): not GPL-only, integer return,
 * context pointer as first argument followed by three ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_msg_pop_data_proto = {
        .func                = bpf_msg_pop_data,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

#ifdef CONFIG_CGROUP_NET_CLASSID
/* BPF helper taking no arguments: returns the result of
 * __task_get_classid() for the task referenced by 'current'.
 */
BPF_CALL_0(bpf_get_cgroup_classid_curr)
{
        return __task_get_classid(current);
}

/* Helper descriptor for bpf_get_cgroup_classid_curr(): not GPL-only, integer
 * return, no arguments. Unlike its neighbours this one is not static.
 */
const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
        .func                = bpf_get_cgroup_classid_curr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
};

/* BPF helper: return the cgroup classid recorded on the socket that owns
 * @skb. Yields 0 when skb_to_full_sk() finds no socket or the socket does
 * not pass sk_fullsock().
 */
BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
{
        struct sock *owner = skb_to_full_sk(skb);

        if (owner && sk_fullsock(owner))
                return sock_cgroup_classid(&owner->sk_cgrp_data);

        return 0;
}

/* Helper descriptor for bpf_skb_cgroup_classid(): not GPL-only, integer
 * return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
        .func                = bpf_skb_cgroup_classid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};
#endif

/* BPF helper: returns the result of task_get_classid() for @skb. */
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
        return task_get_classid(skb);
}

/* Helper descriptor for bpf_get_cgroup_classid(): not GPL-only, integer
 * return, context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
        .func           = bpf_get_cgroup_classid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: returns the result of dst_tclassid() for @skb. */
BPF_CALL_1(bpf_get_route_realm, const struct sk_buff *, skb)
{
        return dst_tclassid(skb);
}

/* Helper descriptor for bpf_get_route_realm(): not GPL-only, integer return,
 * context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_get_route_realm_proto = {
        .func           = bpf_get_route_realm,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* BPF helper: returns skb_get_hash() for @skb. */
BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb)
{
        /* If skb_clear_hash() was called due to mangling, we can
         * trigger SW recalculation here. Later access to hash
         * can then use the inline skb->hash via context directly
         * instead of calling this helper again.
         */
        return skb_get_hash(skb);
}

/* Helper descriptor for bpf_get_hash_recalc(): not GPL-only, integer return,
 * context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_get_hash_recalc_proto = {
        .func                = bpf_get_hash_recalc,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BPF helper: calls skb_clear_hash() on @skb and always returns 0. */
BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb)
{
        /* After all direct packet write, this can be used once for
         * triggering a lazy recalc on next skb_get_hash() invocation.
         */
        skb_clear_hash(skb);
        return 0;
}

/* Helper descriptor for bpf_set_hash_invalid(): not GPL-only, integer return,
 * context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_set_hash_invalid_proto = {
        .func                = bpf_set_hash_invalid,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* BPF helper: stores the program-supplied @hash on @skb through
 * __skb_set_sw_hash() with its last argument set to true; always returns 0.
 */
BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash)
{
        /* Set user specified hash as L4(+), so that it gets returned
         * on skb_get_hash() call unless BPF prog later on triggers a
         * skb_clear_hash().
         */
        __skb_set_sw_hash(skb, hash, true);
        return 0;
}

/* Helper descriptor for bpf_set_hash(): not GPL-only, integer return,
 * context pointer as first argument followed by one ARG_ANYTHING argument.
 */
static const struct bpf_func_proto bpf_set_hash_proto = {
        .func                = bpf_set_hash,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* BPF helper: push a VLAN tag (@vlan_proto, @vlan_tci) onto @skb.
 *
 * Any @vlan_proto other than 802.1Q or 802.1AD is replaced by 802.1Q.
 * skb_vlan_push() runs between bpf_push_mac_rcsum() and
 * bpf_pull_mac_rcsum(); mac_len and the data pointers are refreshed
 * afterwards regardless of the outcome. Returns skb_vlan_push()'s result.
 */
BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
           u16, vlan_tci)
{
        const bool known_proto = vlan_proto == htons(ETH_P_8021Q) ||
                                 vlan_proto == htons(ETH_P_8021AD);
        int err;

        if (unlikely(!known_proto))
                vlan_proto = htons(ETH_P_8021Q);

        bpf_push_mac_rcsum(skb);
        err = skb_vlan_push(skb, vlan_proto, vlan_tci);
        bpf_pull_mac_rcsum(skb);
        skb_reset_mac_len(skb);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_vlan_push(): not GPL-only, integer return,
 * context pointer as first argument followed by two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_vlan_push_proto = {
        .func           = bpf_skb_vlan_push,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* BPF helper: pop a VLAN tag from @skb.
 *
 * skb_vlan_pop() runs between bpf_push_mac_rcsum() and bpf_pull_mac_rcsum();
 * the data pointers are recomputed afterwards whether or not it succeeded.
 * Returns skb_vlan_pop()'s result.
 */
BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb)
{
        int err;

        bpf_push_mac_rcsum(skb);
        err = skb_vlan_pop(skb);
        bpf_pull_mac_rcsum(skb);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_vlan_pop(): not GPL-only, integer return,
 * context pointer as the only argument.
 */
static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
        .func           = bpf_skb_vlan_pop,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Insert @len zeroed bytes at offset @off from the start of @skb's data.
 *
 * The data start is moved @len bytes earlier, the first @off bytes are
 * moved down to the new start, and the gap left at @off is zero-filled.
 * Always returns 0.
 */
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
        /* Caller already did skb_cow() with len as headroom,
         * so no need to do it here.
         */
        skb_push(skb, len);
        /* Source and destination overlap, hence memmove() */
        memmove(skb->data, skb->data + len, off);
        memset(skb->data + off, 0, len);

        /* No skb_postpush_rcsum(skb, skb->data + off, len)
         * needed here as it does not change the skb->csum
         * result for checksum complete when summing over
         * zeroed blocks.
         */
        return 0;
}

/* Remove @len bytes located at offset @off from the start of @skb's data.
 *
 * Returns -ENOMEM when pskb_may_pull() cannot provide @off + @len bytes,
 * 0 otherwise. The checksum is adjusted for the removed bytes and the
 * leading @off bytes are moved up to the new data start.
 */
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
        void *head_before;

        /* The skb is expected to be uncloned already, so no
         * skb_ensure_writable() call is made here.
         */
        if (unlikely(!pskb_may_pull(skb, off + len)))
                return -ENOMEM;

        head_before = skb->data;
        __skb_pull(skb, len);
        skb_postpull_rcsum(skb, head_before + off, len);
        memmove(skb->data, head_before, off);

        return 0;
}

/* Insert @len bytes at offset @off and move the header offsets with them.
 *
 * On success the mac and network header offsets are reduced by @len; the
 * transport header follows the network header only if the two were equal
 * before the push. Returns bpf_skb_generic_push()'s result.
 */
static int bpf_skb_net_hdr_push(struct sk_buff *skb, u32 off, u32 len)
{
        const bool l4_at_l3 = skb->transport_header == skb->network_header;
        int err;

        /* Under eBPF we always start at the mac header, so no
         * __skb_push()/__skb_pull() pair is required to get there.
         */
        err = bpf_skb_generic_push(skb, off, len);
        if (unlikely(err))
                return err;

        skb->mac_header -= len;
        skb->network_header -= len;
        if (l4_at_l3)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Remove @len bytes at offset @off and move the header offsets with them.
 *
 * On success the mac and network header offsets are increased by @len; the
 * transport header follows the network header only if the two were equal
 * before the pop. Returns bpf_skb_generic_pop()'s result.
 */
static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
{
        const bool l4_at_l3 = skb->transport_header == skb->network_header;
        int err;

        /* As in the push case, no __skb_push()/__skb_pull() pair needed. */
        err = bpf_skb_generic_pop(skb, off, len);
        if (unlikely(err))
                return err;

        skb->mac_header += len;
        skb->network_header += len;
        if (l4_at_l3)
                skb->transport_header = skb->network_header;

        return 0;
}

/* Prepare @skb for IPv4 -> IPv6 translation.
 *
 * Inserts sizeof(struct ipv6hdr) - sizeof(struct iphdr) bytes right after
 * the mac header, flips a TCPv4 GSO type to TCPv6, sets skb->protocol to
 * ETH_P_IPV6 and clears the hash. Returns 0 or the negative error from
 * skb_cow() / bpf_skb_net_hdr_push().
 */
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
        const u32 grow_by = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 l2_len = skb_mac_header_len(skb);
        int err;

        err = skb_cow(skb, grow_by);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_push(skb, l2_len, grow_by);
        if (unlikely(err < 0))
                return err;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *si = skb_shinfo(skb);

                /* Swap SKB_GSO_TCPV4 for SKB_GSO_TCPV6 when it is set. */
                if (si->gso_type & SKB_GSO_TCPV4)
                        si->gso_type = (si->gso_type & ~SKB_GSO_TCPV4) |
                                       SKB_GSO_TCPV6;
        }

        skb->protocol = htons(ETH_P_IPV6);
        skb_clear_hash(skb);

        return 0;
}

/* Prepare @skb for IPv6 -> IPv4 translation.
 *
 * Removes sizeof(struct ipv6hdr) - sizeof(struct iphdr) bytes right after
 * the mac header, flips a TCPv6 GSO type to TCPv4, sets skb->protocol to
 * ETH_P_IP and clears the hash. Returns 0 or the negative error from
 * skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
{
        const u32 shrink_by = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        u32 l2_len = skb_mac_header_len(skb);
        int err;

        err = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_pop(skb, l2_len, shrink_by);
        if (unlikely(err < 0))
                return err;

        if (skb_is_gso(skb)) {
                struct skb_shared_info *si = skb_shinfo(skb);

                /* Swap SKB_GSO_TCPV6 for SKB_GSO_TCPV4 when it is set. */
                if (si->gso_type & SKB_GSO_TCPV6)
                        si->gso_type = (si->gso_type & ~SKB_GSO_TCPV6) |
                                       SKB_GSO_TCPV4;
        }

        skb->protocol = htons(ETH_P_IP);
        skb_clear_hash(skb);

        return 0;
}

/* Dispatch a protocol change of @skb to @to_proto.
 *
 * Only IPv4 -> IPv6 and IPv6 -> IPv4 are handled; every other combination
 * (including "no change") yields -ENOTSUPP.
 */
static int bpf_skb_proto_xlat(struct sk_buff *skb, __be16 to_proto)
{
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                if (to_proto == htons(ETH_P_IPV6))
                        return bpf_skb_proto_4_to_6(skb);
                break;
        case htons(ETH_P_IPV6):
                if (to_proto == htons(ETH_P_IP))
                        return bpf_skb_proto_6_to_4(skb);
                break;
        default:
                break;
        }

        return -ENOTSUPP;
}

/* BPF helper: switch @skb between IPv4 and IPv6 framing.
 *
 * @flags is reserved and must be zero, otherwise -EINVAL is returned
 * before anything is touched. The helper only does the structural
 * groundwork (room adjustment, protocol field, GSO type); the program is
 * expected to fill in the new header and fix checksums itself through
 * bpf_skb_store_bytes(), bpf_lX_csum_replace() and similar helpers, which
 * keeps raw packet buffers out of this interface. Extra options or
 * extension header space are not supported.
 *
 * Data pointers are recomputed after the translation attempt whether or
 * not it succeeded. Returns bpf_skb_proto_xlat()'s result.
 */
BPF_CALL_3(bpf_skb_change_proto, struct sk_buff *, skb, __be16, proto,
           u64, flags)
{
        int err;

        if (unlikely(flags))
                return -EINVAL;

        err = bpf_skb_proto_xlat(skb, proto);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_change_proto(): not GPL-only, integer return,
 * context pointer as first argument followed by two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_change_proto_proto = {
        .func                = bpf_skb_change_proto,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* BPF helper: overwrite skb->pkt_type with @pkt_type.
 *
 * Both the current and the requested type must pass skb_pkt_type_ok();
 * otherwise -EINVAL is returned and the skb is left alone. Returns 0 on
 * success.
 */
BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type)
{
        /* Only a restricted subset of types may be changed for now. */
        if (unlikely(!skb_pkt_type_ok(skb->pkt_type)))
                return -EINVAL;
        if (unlikely(!skb_pkt_type_ok(pkt_type)))
                return -EINVAL;

        skb->pkt_type = pkt_type;

        return 0;
}

/* Helper descriptor for bpf_skb_change_type(): not GPL-only, integer return,
 * context pointer as first argument followed by one ARG_ANYTHING argument.
 */
static const struct bpf_func_proto bpf_skb_change_type_proto = {
        .func                = bpf_skb_change_type,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Size of the fixed network header implied by skb->protocol:
 * sizeof(struct iphdr) for IPv4, sizeof(struct ipv6hdr) for IPv6 and
 * ~0U for anything else.
 */
static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
{
        if (skb->protocol == htons(ETH_P_IP))
                return sizeof(struct iphdr);

        if (skb->protocol == htons(ETH_P_IPV6))
                return sizeof(struct ipv6hdr);

        return ~0U;
}

/* Either of the two outer-L3 encapsulation flags. */
#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK        (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)

/* Either of the two L3 decapsulation flags. */
#define BPF_F_ADJ_ROOM_DECAP_L3_MASK        (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
                                         BPF_F_ADJ_ROOM_DECAP_L3_IPV6)

/* Union of the flag bits listed here. bpf_skb_adjust_room() rejects any
 * flag outside this mask, apart from BPF_F_ADJ_ROOM_NO_CSUM_RESET which it
 * permits separately.
 */
#define BPF_F_ADJ_ROOM_MASK                (BPF_F_ADJ_ROOM_FIXED_GSO | \
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
                                         BPF_F_ADJ_ROOM_DECAP_L3_MASK)

/* Open @len_diff bytes of zeroed room at offset @off in @skb and, when one
 * of the ENCAP_L3 flags is set, record the pre-existing headers as inner
 * headers and mark the skb as encapsulated.
 *
 * @skb:      packet to grow
 * @off:      offset from the data start at which the room is inserted
 * @len_diff: number of bytes to insert
 * @flags:    BPF_F_ADJ_ROOM_* flags; the inner L2 length is taken from the
 *            bits above BPF_ADJ_ROOM_ENCAP_L2_SHIFT
 *
 * Returns 0 on success, -ENOTSUPP / -EINVAL / -EALREADY for rejected
 * requests, or the negative error of skb_cow_head(), bpf_skb_net_hdr_push()
 * or skb_linearize().
 */
static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
                            u64 flags)
{
        u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
        bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
        u16 mac_len = 0, inner_net = 0, inner_trans = 0;
        unsigned int gso_type = SKB_GSO_DODGY;
        int ret;

        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
                /* udp gso_size delineates datagrams, only allow if fixed */
                if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
                    !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
                        return -ENOTSUPP;
        }

        ret = skb_cow_head(skb, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                if (skb->protocol != htons(ETH_P_IP) &&
                    skb->protocol != htons(ETH_P_IPV6))
                        return -ENOTSUPP;

                /* The outer L3 and outer L4 choices are each exclusive */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        return -EINVAL;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
                    inner_mac_len < ETH_HLEN)
                        return -EINVAL;

                if (skb->encapsulation)
                        return -EALREADY;

                /* Snapshot the header offsets before the push moves them */
                mac_len = skb->network_header - skb->mac_header;
                inner_net = skb->network_header;
                if (inner_mac_len > len_diff)
                        return -EINVAL;
                inner_trans = skb->transport_header;
        }

        ret = bpf_skb_net_hdr_push(skb, off, len_diff);
        if (unlikely(ret < 0))
                return ret;

        if (encap) {
                /* The headers present before the push become inner headers */
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
                        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
                else
                        skb_set_inner_protocol(skb, skb->protocol);

                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);

                /* Pick one tunnel GSO type; L4 flags take precedence over L3 */
                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        gso_type |= SKB_GSO_UDP_TUNNEL;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
                        gso_type |= SKB_GSO_GRE;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        gso_type |= SKB_GSO_IPXIP6;
                else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        gso_type |= SKB_GSO_IPXIP4;

                if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
                        int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
                                        sizeof(struct ipv6hdr) :
                                        sizeof(struct iphdr);

                        skb_set_transport_header(skb, mac_len + nh_len);
                }

                /* Match skb->protocol to new outer l3 protocol */
                if (skb->protocol == htons(ETH_P_IP) &&
                    flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
                        skb->protocol = htons(ETH_P_IPV6);
                else if (skb->protocol == htons(ETH_P_IPV6) &&
                         flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
                        skb->protocol = htons(ETH_P_IP);
        }

        if (skb_is_gso(skb)) {
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                /* Header must be checked, and gso_segs recomputed. */
                shinfo->gso_type |= gso_type;
                shinfo->gso_segs = 0;

                /* Due to header growth, MSS needs to be downgraded.
                 * There is a BUG_ON() when segmenting the frag_list with
                 * head_frag true, so linearize the skb after downgrading
                 * the MSS.
                 */
                if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
                        skb_decrease_gso_size(shinfo, len_diff);
                        if (shinfo->frag_list)
                                return skb_linearize(skb);
                }
        }

        return 0;
}

/* Remove @len_diff bytes at offset @off from @skb.
 *
 * Only BPF_F_ADJ_ROOM_FIXED_GSO, the DECAP_L3 flags and
 * BPF_F_ADJ_ROOM_NO_CSUM_RESET are accepted in @flags (-EINVAL otherwise).
 * Non-TCP GSO packets are refused with -ENOTSUPP unless they are UDP L4
 * GSO and FIXED_GSO is set. After the pop, skb->protocol is switched when
 * a DECAP_L3 flag names the other IP family, and GSO metadata is updated.
 * Returns 0 or the negative error of skb_unclone() / bpf_skb_net_hdr_pop().
 */
static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
                              u64 flags)
{
        const bool fixed_gso = flags & BPF_F_ADJ_ROOM_FIXED_GSO;
        struct skb_shared_info *si;
        int err;

        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
                               BPF_F_ADJ_ROOM_DECAP_L3_MASK |
                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
                return -EINVAL;

        /* udp gso_size delineates datagrams, so only allow it when fixed */
        if (skb_is_gso(skb) && !skb_is_gso_tcp(skb) &&
            (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) || !fixed_gso))
                return -ENOTSUPP;

        err = skb_unclone(skb, GFP_ATOMIC);
        if (unlikely(err < 0))
                return err;

        err = bpf_skb_net_hdr_pop(skb, off, len_diff);
        if (unlikely(err < 0))
                return err;

        /* Align skb->protocol with the L3 header that is now outermost */
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
                        skb->protocol = htons(ETH_P_IPV6);
                break;
        case htons(ETH_P_IPV6):
                if (flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
                        skb->protocol = htons(ETH_P_IP);
                break;
        default:
                break;
        }

        if (!skb_is_gso(skb))
                return 0;

        si = skb_shinfo(skb);

        /* Headers got shorter, so the MSS may grow by the same amount */
        if (!fixed_gso)
                skb_increase_gso_size(si, len_diff);

        /* Headers must be re-validated and gso_segs recomputed */
        si->gso_type |= SKB_GSO_DODGY;
        si->gso_segs = 0;

        return 0;
}

/* Upper bound on skb length enforced by the room/head/tail adjusting
 * helpers in this file; defined as SKB_MAX_ALLOC.
 */
#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC

/* BPF helper (sk_skb variant): grow or shrink @skb at its data start.
 *
 * @len_diff: positive to prepend that many zeroed bytes, negative to pull
 *            that many bytes off the front
 * @mode:     must be zero
 * @flags:    must be zero
 *
 * Returns -EINVAL for a non-zero @mode or @flags, -EFAULT when
 * |@len_diff| exceeds 0xfff, -ENOMEM when the bytes to pull are not
 * available, a negative skb_cow() error, or 0 on success. When the owning
 * socket has a TLS software RX context, the strparser message's full_len
 * is adjusted by @len_diff.
 */
BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        const bool grow = len_diff >= 0;
        u32 delta = abs(len_diff);
        int err = 0;

        if (unlikely(flags || mode))
                return -EINVAL;
        if (unlikely(delta > 0xfffU))
                return -EFAULT;

        if (grow) {
                err = skb_cow(skb, len_diff);
                if (unlikely(err < 0))
                        return err;
                __skb_push(skb, delta);
                memset(skb->data, 0, delta);
        } else {
                if (unlikely(!pskb_may_pull(skb, delta)))
                        return -ENOMEM;
                __skb_pull(skb, delta);
        }

        if (tls_sw_has_ctx_rx(skb->sk))
                strp_msg(skb)->full_len += len_diff;

        return err;
}

/* Helper descriptor for sk_skb_adjust_room(): not GPL-only, integer return,
 * context pointer as first argument followed by three ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto sk_skb_adjust_room_proto = {
        .func                = sk_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* BPF helper: grow (@len_diff > 0) or shrink (@len_diff < 0) room inside
 * an IPv4/IPv6 @skb.
 *
 * @mode:  BPF_ADJ_ROOM_MAC adjusts right after the mac header,
 *         BPF_ADJ_ROOM_NET after the base network header as well
 * @flags: subset of BPF_F_ADJ_ROOM_MASK | BPF_F_ADJ_ROOM_NO_CSUM_RESET
 *
 * Returns -EINVAL for unknown flags or for a DECAP_L3 flag that is used
 * while growing or is ambiguous, -EFAULT when |@len_diff| exceeds 0xfff,
 * -ENOTSUPP for non-IP packets, unknown modes, or a resulting length
 * outside the allowed bounds, otherwise the result of
 * bpf_skb_net_shrink() / bpf_skb_net_grow(). Data pointers are recomputed
 * once the grow/shrink step has been attempted.
 */
BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
           u32, mode, u64, flags)
{
        const u64 decap = flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK;
        const __be16 l3_proto = skb->protocol;
        const bool shrink = len_diff < 0;
        u32 delta = abs(len_diff);
        u32 floor_len = bpf_skb_net_base_len(skb);
        u32 ceil_len = BPF_SKB_MAX_LEN;
        u32 cur_len, at;
        int err;

        if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
                               BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
                return -EINVAL;
        if (unlikely(delta > 0xfffU))
                return -EFAULT;
        if (unlikely(l3_proto != htons(ETH_P_IP) &&
                     l3_proto != htons(ETH_P_IPV6)))
                return -ENOTSUPP;

        /* Work out where in the packet the room is added or removed */
        at = skb_mac_header_len(skb);
        if (mode == BPF_ADJ_ROOM_NET)
                at += bpf_skb_net_base_len(skb);
        else if (mode != BPF_ADJ_ROOM_MAC)
                return -ENOTSUPP;

        if (decap) {
                /* Decapsulation only makes sense when removing bytes */
                if (!shrink)
                        return -EINVAL;

                /* The minimum remaining length follows the new L3 header */
                if (decap == BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
                        floor_len = sizeof(struct iphdr);
                else if (decap == BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
                        floor_len = sizeof(struct ipv6hdr);
                else
                        return -EINVAL;
        }

        cur_len = skb->len - skb_network_offset(skb);
        if (shrink) {
                if (delta >= cur_len || cur_len - delta < floor_len)
                        return -ENOTSUPP;
                err = bpf_skb_net_shrink(skb, at, delta, flags);
        } else {
                /* GSO packets are exempt from the upper length bound */
                if (skb->len + delta > ceil_len && !skb_is_gso(skb))
                        return -ENOTSUPP;
                err = bpf_skb_net_grow(skb, at, delta, flags);
        }

        if (!err && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
                __skb_reset_checksum_unnecessary(skb);

        bpf_compute_data_pointers(skb);
        return err;
}

/* Helper descriptor for bpf_skb_adjust_room(): not GPL-only, integer return,
 * context pointer as first argument followed by three ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
        .func                = bpf_skb_adjust_room,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_ANYTHING,
};

/* Smallest length @skb may be trimmed to.
 *
 * Three candidate offsets are examined in order: the network offset, the
 * transport offset (only if a transport header was set) and, for
 * CHECKSUM_PARTIAL packets, the end of the checksum field. Each candidate
 * that is strictly positive replaces the previous result; non-positive
 * candidates are ignored. Returns 0 when none qualifies.
 */
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
        u32 lower = 0;
        int pos;

        pos = skb_network_offset(skb);
        if (pos > 0)
                lower = pos;

        if (skb_transport_header_was_set(skb)) {
                pos = skb_transport_offset(skb);
                if (pos > 0)
                        lower = pos;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                pos = skb_checksum_start_offset(skb) +
                      skb->csum_offset + sizeof(__sum16);
                if (pos > 0)
                        lower = pos;
        }

        return lower;
}

/* Extend @skb to @new_len via __skb_grow_rcsum() and zero the bytes
 * between the old and the new length. Returns __skb_grow_rcsum()'s
 * result; nothing is zeroed when it fails.
 */
static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        const unsigned int prev_len = skb->len;
        int err;

        err = __skb_grow_rcsum(skb, new_len);
        if (err)
                return err;

        memset(skb->data + prev_len, 0, new_len - prev_len);

        return 0;
}

/* Trim @skb to @new_len; thin wrapper returning __skb_trim_rcsum()'s result. */
static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
{
        return __skb_trim_rcsum(skb, new_len);
}

/* Resize @skb to exactly @new_len bytes by growing or trimming its tail.
 *
 * Returns -EINVAL when @flags is non-zero or @new_len lies outside
 * [__bpf_skb_min_len(), BPF_SKB_MAX_LEN], -ENOTSUPP for encapsulated
 * skbs, otherwise the result of making the skb writable and of the
 * grow/trim step.
 *
 * This is a slow-path helper meant for control-message style replies. It
 * deliberately stays protocol agnostic: the program rewrites headers and
 * checksums afterwards with bpf_skb_store_bytes(), bpf_lX_csum_replace()
 * and friends. As a side effect the skb is made writable over its whole
 * length, and GSO state is reset on success.
 */
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
                                        u64 flags)
{
        const u32 upper = BPF_SKB_MAX_LEN;
        const u32 lower = __bpf_skb_min_len(skb);
        int err;

        if (unlikely(flags || new_len > upper || new_len < lower))
                return -EINVAL;
        if (skb->encapsulation)
                return -ENOTSUPP;

        err = __bpf_try_make_writable(skb, skb->len);
        if (err)
                return err;

        if (new_len > skb->len)
                err = bpf_skb_grow_rcsum(skb, new_len);
        else if (new_len < skb->len)
                err = bpf_skb_trim_rcsum(skb, new_len);

        if (!err && skb_is_gso(skb))
                skb_gso_reset(skb);

        return err;
}

/* BPF helper: resize @skb to @new_len through __bpf_skb_change_tail() and
 * recompute the data pointers afterwards, whatever the outcome. Returns
 * __bpf_skb_change_tail()'s result.
 */
BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        int err;

        err = __bpf_skb_change_tail(skb, new_len, flags);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper descriptor for bpf_skb_change_tail(): not GPL-only, integer return,
 * context pointer as first argument followed by two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto bpf_skb_change_tail_proto = {
        .func                = bpf_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* BPF helper (sk_skb variant): returns __bpf_skb_change_tail()'s result
 * directly; unlike bpf_skb_change_tail() it does not call
 * bpf_compute_data_pointers().
 */
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
{
        return __bpf_skb_change_tail(skb, new_len, flags);
}

/* Helper descriptor for sk_skb_change_tail(): not GPL-only, integer return,
 * context pointer as first argument followed by two ARG_ANYTHING arguments.
 */
static const struct bpf_func_proto sk_skb_change_tail_proto = {
        .func                = sk_skb_change_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Grow the skb by @head_room zeroed bytes at the front and make the new
 * start the mac header.
 *
 * Returns -EINVAL when @flags is non-zero, when the resulting length would
 * exceed BPF_SKB_MAX_LEN on a non-GSO skb, or when skb->len + head_room
 * wraps around. Otherwise returns the result of skb_cow() (0 on success).
 */
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
                                        u64 flags)
{
        u32 max_len = BPF_SKB_MAX_LEN;
        u32 new_len = skb->len + head_room;
        int ret;

        /* No flags are defined for this helper. */
        if (unlikely(flags))
                return -EINVAL;
        /* The length cap is only enforced for non-GSO skbs. */
        if (unlikely(!skb_is_gso(skb) && new_len > max_len))
                return -EINVAL;
        /* u32 wrap-around of skb->len + head_room. */
        if (unlikely(new_len < skb->len))
                return -EINVAL;

        ret = skb_cow(skb, head_room);
        if (unlikely(ret))
                return ret;

        /* Idea for this helper is that we currently only
         * allow to expand on mac header. This means that
         * skb->protocol network header, etc, stay as is.
         * Compared to bpf_skb_change_tail(), we're more
         * flexible due to not needing to linearize or
         * reset GSO. Intention for this helper is to be
         * used by an L3 skb that needs to push mac header
         * for redirection into L2 device.
         */
        __skb_push(skb, head_room);
        memset(skb->data, 0, head_room);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}

/* bpf_skb_change_head() helper: push headroom through
 * __bpf_skb_change_head(), then call bpf_compute_data_pointers() on the
 * skb whether or not the push succeeded. Returns the push result.
 */
BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        int err;

        err = __bpf_skb_change_head(skb, head_room, flags);
        bpf_compute_data_pointers(skb);

        return err;
}

/* Helper prototype for bpf_skb_change_head(): integer return; arg1 is the
 * program context (the skb), arg2 (head_room) and arg3 (flags) are typed
 * ARG_ANYTHING. Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_skb_change_head_proto = {
        .func                = bpf_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* sk_skb flavour of the change_head helper: forwards straight to
 * __bpf_skb_change_head() and returns its result. Unlike
 * bpf_skb_change_head(), it does not call bpf_compute_data_pointers().
 */
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
{
        return __bpf_skb_change_head(skb, head_room, flags);
}

/* Helper prototype for sk_skb_change_head(): integer return; arg1 is the
 * program context (the skb), arg2 (head_room) and arg3 (flags) are typed
 * ARG_ANYTHING. Not restricted to GPL programs.
 */
static const struct bpf_func_proto sk_skb_change_head_proto = {
        .func                = sk_skb_change_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* bpf_xdp_get_buff_len() helper: returns xdp_get_buff_len() for the given
 * xdp_buff.
 */
BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
{
        return xdp_get_buff_len(xdp);
}

/* Helper prototype for bpf_xdp_get_buff_len() when the xdp_buff is the
 * program context (arg1 is ARG_PTR_TO_CTX): integer return, not restricted
 * to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
        .func                = bpf_xdp_get_buff_len,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Single-entry BTF ID list for struct xdp_buff; its first element is used
 * as arg1_btf_id by bpf_xdp_get_buff_len_trace_proto.
 */
BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)

const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
        .func                = bpf_xdp_get_buff_len,
        .gpl_only        = false,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_get_buff_len_bpf_ids[0],
};

/* Size of the metadata area in front of xdp->data, i.e.
 * xdp->data - xdp->data_meta, or 0 when xdp_data_meta_unsupported()
 * reports that this buffer has no metadata support.
 */
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
        if (xdp_data_meta_unsupported(xdp))
                return 0;

        return xdp->data - xdp->data_meta;
}

/* bpf_xdp_adjust_head() helper: move xdp->data by @offset bytes (negative
 * grows the packet at the front, positive shrinks it) and move the
 * metadata area along with it.
 *
 * Returns 0 on success, or -EINVAL when the new data pointer would fall
 * below data_hard_start + sizeof(struct xdp_frame) + metadata length, or
 * would leave fewer than ETH_HLEN bytes before data_end.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
        unsigned long meta_sz = xdp_get_metalen(xdp);
        void *new_data = xdp->data + offset;
        void *lowest;

        /* Lowest legal data start: past the xdp_frame area and metadata. */
        lowest = xdp->data_hard_start + sizeof(struct xdp_frame) + meta_sz;

        if (unlikely(new_data < lowest))
                return -EINVAL;
        if (unlikely(new_data > xdp->data_end - ETH_HLEN))
                return -EINVAL;

        /* Old and new metadata ranges may overlap, hence memmove(). */
        if (meta_sz)
                memmove(xdp->data_meta + offset, xdp->data_meta, meta_sz);

        xdp->data_meta += offset;
        xdp->data = new_data;

        return 0;
}

/* Helper prototype for bpf_xdp_adjust_head(): integer return; arg1 is the
 * program context (the xdp_buff), arg2 (offset) is typed ARG_ANYTHING.
 * Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
        .func                = bpf_xdp_adjust_head,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Copy @len bytes between @buf and the xdp buffer, starting @off bytes
 * past xdp->data.
 *
 * @flush selects the direction: true copies from @buf into the xdp buffer,
 * false copies from the xdp buffer into @buf.
 *
 * If the whole range lies in the linear area a single memcpy() is done.
 * Otherwise the linear area and then each fragment are visited in order
 * and the part of the range overlapping each one is copied. Nothing is
 * returned; the walk simply stops when @len is exhausted or there are no
 * more fragments.
 */
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
                      void *buf, unsigned long len, bool flush)
{
        unsigned long ptr_len, ptr_off = 0;
        skb_frag_t *next_frag, *end_frag;
        struct skb_shared_info *sinfo;
        void *src, *dst;
        u8 *ptr_buf;

        /* Fast path: the range is fully inside [data, data_end). */
        if (likely(xdp->data_end - xdp->data >= off + len)) {
                src = flush ? buf : xdp->data + off;
                dst = flush ? xdp->data + off : buf;
                memcpy(dst, src, len);
                return;
        }

        sinfo = xdp_get_shared_info_from_buff(xdp);
        end_frag = &sinfo->frags[sinfo->nr_frags];
        next_frag = &sinfo->frags[0];

        /* Start with the linear area as the current chunk; ptr_off is the
         * offset of the current chunk from the start of the packet data.
         */
        ptr_len = xdp->data_end - xdp->data;
        ptr_buf = xdp->data;

        while (true) {
                /* Does the range start inside the current chunk? */
                if (off < ptr_off + ptr_len) {
                        unsigned long copy_off = off - ptr_off;
                        unsigned long copy_len = min(len, ptr_len - copy_off);

                        src = flush ? buf : ptr_buf + copy_off;
                        dst = flush ? ptr_buf + copy_off : buf;
                        memcpy(dst, src, copy_len);

                        /* Advance past the bytes just handled. */
                        off += copy_len;
                        len -= copy_len;
                        buf += copy_len;
                }

                if (!len || next_frag == end_frag)
                        break;

                /* Move on to the next fragment. */
                ptr_off += ptr_len;
                ptr_buf = skb_frag_address(next_frag);
                ptr_len = skb_frag_size(next_frag);
                next_frag++;
        }
}

/* Find a direct pointer to @len bytes at @offset inside the xdp buffer.
 *
 * Returns:
 *   ERR_PTR(-EFAULT) if @offset or @len is larger than 0xffff,
 *   ERR_PTR(-EINVAL) if offset + len is beyond xdp_get_buff_len(),
 *   NULL             if the range is not contained in a single area
 *                    (the linear area or one fragment),
 *   otherwise the address of the first requested byte.
 */
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
        u32 span = xdp->data_end - xdp->data;
        struct skb_shared_info *sinfo;
        void *base = xdp->data;
        int idx;

        if (unlikely(offset > 0xffff || len > 0xffff))
                return ERR_PTR(-EFAULT);

        if (unlikely(offset + len > xdp_get_buff_len(xdp)))
                return ERR_PTR(-EINVAL);

        if (unlikely(offset >= span)) {
                /* Not in the linear area: find the fragment that holds
                 * @offset and make @offset relative to that fragment.
                 */
                sinfo = xdp_get_shared_info_from_buff(xdp);
                offset -= span;
                for (idx = 0; idx < sinfo->nr_frags; idx++) {
                        u32 frag_len = skb_frag_size(&sinfo->frags[idx]);

                        if (offset < frag_len) {
                                base = skb_frag_address(&sinfo->frags[idx]);
                                span = frag_len;
                                break;
                        }
                        offset -= frag_len;
                }
        }

        /* The range must end inside the area it starts in. */
        if (offset + len > span)
                return NULL;

        return base + offset;
}

/* bpf_xdp_load_bytes() helper: copy @len bytes at @offset of the xdp
 * buffer into @buf.
 *
 * Returns 0 on success, or the error encoded by bpf_xdp_pointer() when the
 * requested range is rejected.
 */
BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
           void *, buf, u32, len)
{
        void *src = bpf_xdp_pointer(xdp, offset, len);

        if (IS_ERR(src))
                return PTR_ERR(src);

        if (src) {
                /* Range sits in one area: a single copy is enough. */
                memcpy(buf, src, len);
                return 0;
        }

        /* NULL: no single area holds the range, copy it piecewise. */
        bpf_xdp_copy_buf(xdp, offset, buf, len, false);
        return 0;
}

/* Helper prototype for bpf_xdp_load_bytes(): integer return; arg1 is the
 * program context (the xdp_buff), arg2 (offset) is ARG_ANYTHING, arg3 (buf)
 * is declared as uninitialized memory since the helper writes into it, and
 * arg4 (len) is its constant size. Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
        .func                = bpf_xdp_load_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static entry point with natural C argument types; forwards to
 * ____bpf_xdp_load_bytes() (presumably the typed body emitted by
 * BPF_CALL_4(bpf_xdp_load_bytes, ...) - confirm against the macro).
 */
int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
}

/* bpf_xdp_store_bytes() helper: copy @len bytes from @buf into the xdp
 * buffer at @offset.
 *
 * Returns 0 on success, or the error encoded by bpf_xdp_pointer() when the
 * requested range is rejected.
 */
BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
           void *, buf, u32, len)
{
        void *dst = bpf_xdp_pointer(xdp, offset, len);

        if (IS_ERR(dst))
                return PTR_ERR(dst);

        if (dst) {
                /* Range sits in one area: a single copy is enough. */
                memcpy(dst, buf, len);
                return 0;
        }

        /* NULL: no single area holds the range, write it piecewise. */
        bpf_xdp_copy_buf(xdp, offset, buf, len, true);
        return 0;
}

static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
        .func                = bpf_xdp_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
};

/* Non-static entry point with natural C argument types; forwards to
 * ____bpf_xdp_store_bytes() (presumably the typed body emitted by
 * BPF_CALL_4(bpf_xdp_store_bytes, ...) - confirm against the macro).
 */
int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
{
        return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
}

/* Grow the last fragment of a multi-fragment xdp buffer by @offset bytes.
 *
 * Returns -EOPNOTSUPP when the rx queue reports no frag_size or one larger
 * than xdp->frame_sz, -EINVAL when @offset exceeds the room left in the
 * last fragment, and 0 after the new bytes were zeroed and the size
 * accounting was updated.
 */
static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        skb_frag_t *last = &sinfo->frags[sinfo->nr_frags - 1];
        struct xdp_rxq_info *rxq = xdp->rxq;
        unsigned int room;

        if (!rxq->frag_size)
                return -EOPNOTSUPP;
        if (rxq->frag_size > xdp->frame_sz)
                return -EOPNOTSUPP;

        /* Bytes left in the fragment after its offset and current size. */
        room = rxq->frag_size - skb_frag_size(last) - skb_frag_off(last);
        if (unlikely(offset > room))
                return -EINVAL;

        /* Zero the area being exposed before accounting for it. */
        memset(skb_frag_address(last) + skb_frag_size(last), 0, offset);
        skb_frag_size_add(last, offset);
        sinfo->xdp_frags_size += offset;

        /* XSK pool buffers also track data_end on the tail xdp_buff. */
        if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
                xsk_buff_get_tail(xdp)->data_end += offset;

        return 0;
}

/* Shrink handling for buffers from an XSK buffer pool, applied to the tail
 * xdp_buff returned by xsk_buff_get_tail(): either pull its data_end back
 * by @shrink bytes, or, when @release is set, unlink it from the tail and
 * hand it back via __xdp_return().
 */
static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
                                   enum xdp_mem_type mem_type, bool release)
{
        struct xdp_buff *tail = xsk_buff_get_tail(xdp);

        if (!release) {
                tail->data_end -= shrink;
                return;
        }

        xsk_buff_del_tail(tail);
        __xdp_return(0, mem_type, false, tail);
}

/* Apply a shrink of @shrink bytes to @frag's backing memory.
 *
 * Returns true when @shrink equals the fragment size, in which case the
 * fragment has been released; false when the fragment survives. The
 * fragment's recorded size is not touched here.
 */
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
                                int shrink)
{
        enum xdp_mem_type mem_type = xdp->rxq->mem.type;
        bool release = skb_frag_size(frag) == shrink;

        if (mem_type == MEM_TYPE_XSK_BUFF_POOL)
                bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
        else if (release)
                __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);

        return release;
}

/* Remove @offset bytes (a positive count) from the end of a
 * multi-fragment xdp buffer.
 *
 * Returns -EINVAL if fewer than ETH_HLEN bytes would remain in the whole
 * buffer, 0 otherwise.
 */
static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        int i, n_frags_free = 0, len_free = 0;

        if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
                return -EINVAL;

        /* Walk the fragments from last to first until @offset is used up. */
        for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
                skb_frag_t *frag = &sinfo->frags[i];
                int shrink = min_t(int, offset, skb_frag_size(frag));

                len_free += shrink;
                offset -= shrink;
                if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
                        /* Whole fragment consumed and released. */
                        n_frags_free++;
                } else {
                        /* Partial trim: this fragment stays, walk ends. */
                        skb_frag_size_sub(frag, shrink);
                        break;
                }
        }
        sinfo->nr_frags -= n_frags_free;
        sinfo->xdp_frags_size -= len_free;

        /* No fragments left: drop the frags flag and take whatever is
         * still outstanding in @offset from the linear area.
         */
        if (unlikely(!sinfo->nr_frags)) {
                xdp_buff_clear_frags_flag(xdp);
                xdp->data_end -= offset;
        }

        return 0;
}

/* bpf_xdp_adjust_tail() helper: move the end of the packet by @offset
 * bytes (positive grows, negative shrinks).
 *
 * Buffers with fragments are delegated to bpf_xdp_frags_shrink_tail() or
 * bpf_xdp_frags_increase_tail(). For linear buffers the new end must not
 * pass xdp_data_hard_end() and must leave at least ETH_HLEN bytes after
 * xdp->data, otherwise -EINVAL is returned. Returns 0 on success.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
        void *hard_limit = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
        void *new_end = xdp->data_end + offset;

        /* non-linear xdp buff
         * NOTE(review): -offset is not representable for offset == INT_MIN;
         * confirm how that input is meant to be handled.
         */
        if (unlikely(xdp_buff_has_frags(xdp)))
                return offset < 0 ? bpf_xdp_frags_shrink_tail(xdp, -offset) :
                                    bpf_xdp_frags_increase_tail(xdp, offset);

        /* Notice that xdp_data_hard_end have reserved some tailroom */
        if (unlikely(new_end > hard_limit ||
                     new_end < xdp->data + ETH_HLEN))
                return -EINVAL;

        /* Clear memory area on grow, can contain uninit kernel memory */
        if (offset > 0)
                memset(xdp->data_end, 0, offset);

        xdp->data_end = new_end;

        return 0;
}

/* Helper prototype for bpf_xdp_adjust_tail(): integer return; arg1 is the
 * program context (the xdp_buff), arg2 (offset) is typed ARG_ANYTHING.
 * Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_adjust_tail_proto = {
        .func                = bpf_xdp_adjust_tail,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* bpf_xdp_adjust_meta() helper: move xdp->data_meta by @offset bytes.
 *
 * Returns:
 *   -ENOTSUPP if xdp_data_meta_unsupported() is true for this buffer,
 *   -EINVAL   if the new pointer falls below
 *             data_hard_start + sizeof(struct xdp_frame) or above
 *             xdp->data,
 *   -EACCES   if xdp_metalen_invalid() rejects the resulting length,
 *   0         once data_meta has been updated.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
        void *meta_floor = xdp->data_hard_start + sizeof(struct xdp_frame);
        void *new_meta = xdp->data_meta + offset;
        unsigned long new_metalen = xdp->data - new_meta;

        if (xdp_data_meta_unsupported(xdp))
                return -ENOTSUPP;
        if (unlikely(new_meta < meta_floor))
                return -EINVAL;
        if (unlikely(new_meta > xdp->data))
                return -EINVAL;
        if (unlikely(xdp_metalen_invalid(new_metalen)))
                return -EACCES;

        xdp->data_meta = new_meta;

        return 0;
}

/* Helper prototype for bpf_xdp_adjust_meta(): integer return; arg1 is the
 * program context (the xdp_buff), arg2 (offset) is typed ARG_ANYTHING.
 * Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
        .func                = bpf_xdp_adjust_meta,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/**
 * DOC: xdp redirect
 *
 * XDP_REDIRECT works by a three-step process, implemented in the functions
 * below:
 *
 * 1. The bpf_redirect() and bpf_redirect_map() helpers will lookup the target
 *    of the redirect and store it (along with some other metadata) in a per-CPU
 *    struct bpf_redirect_info.
 *
 * 2. When the program returns the XDP_REDIRECT return code, the driver will
 *    call xdp_do_redirect() which will use the information in struct
 *    bpf_redirect_info to actually enqueue the frame into a map type-specific
 *    bulk queue structure.
 *
 * 3. Before exiting its NAPI poll loop, the driver will call
 *    xdp_do_flush(), which will flush all the different bulk queues,
 *    thus completing the redirect. Note that xdp_do_flush() must be
 *    called before napi_complete_done() in the driver, as the
 *    XDP_REDIRECT logic relies on being inside a single NAPI instance
 *    through to the xdp_do_flush() call for RCU protection of all
 *    in-kernel data structures.
 */
/*
 * Pointers to the map entries will be kept around for this whole sequence of
 * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
 * the core code; instead, the RCU protection relies on everything happening
 * inside a single NAPI poll sequence, which means it's between a pair of calls
 * to local_bh_disable()/local_bh_enable().
 *
 * The map entries are marked as __rcu and the map code makes sure to
 * dereference those pointers with rcu_dereference_check() in a way that works
 * for both sections that hold an rcu_read_lock() and sections that are
 * called from NAPI without a separate rcu_read_lock(). The code below does not
 * use RCU annotations, but relies on those in the map code.
 */
/* Flush every XDP redirect bulk queue that is in use: fetch the dev, map
 * and xsk flush lists via bpf_net_ctx_get_all_used_flush_lists() and flush
 * each one that came back non-NULL, in the order dev, cpumap, xsk.
 */
void xdp_do_flush(void)
{
        struct list_head *lh_map, *lh_dev, *lh_xsk;

        bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
        if (lh_dev)
                __dev_flush(lh_dev);
        if (lh_map)
                __cpu_map_flush(lh_map);
        if (lh_xsk)
                __xsk_map_flush(lh_xsk);
}
EXPORT_SYMBOL_GPL(xdp_do_flush);

#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
/* Debug check: flush any redirect list that is still in use and warn once,
 * naming the NAPI poll callback, if there was anything left to flush.
 */
void xdp_do_check_flushed(struct napi_struct *napi)
{
        struct list_head *lh_map, *lh_dev, *lh_xsk;

        bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);

        if (lh_dev)
                __dev_flush(lh_dev);
        if (lh_map)
                __cpu_map_flush(lh_map);
        if (lh_xsk)
                __xsk_map_flush(lh_xsk);

        /* Any non-NULL list means a flush was missed. */
        WARN_ONCE(lh_dev || lh_map || lh_xsk,
                  "Missing xdp_do_flush() invocation after NAPI by %ps\n",
                  napi->poll);
}
#endif

/* Static key, defined in the false state and exported to GPL modules.
 * NOTE(review): presumably switched on by master devices implementing
 * ndo_xdp_get_xmit_slave so that xdp_master_redirect() gets used - confirm
 * against the users of this key.
 */
DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);

/* Ask the master (upper) device of the receiving device which slave should
 * transmit this frame.
 *
 * Returns XDP_TX when no slave is reported or the slave is the receiving
 * device itself; otherwise records an ifindex based redirect to the slave
 * in the per-context bpf_redirect_info and returns XDP_REDIRECT.
 */
u32 xdp_master_redirect(struct xdp_buff *xdp)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct net_device *master, *slave;

        master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
        slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);

        if (!slave || slave == xdp->rxq->dev)
                return XDP_TX;

        /* The target device is different from the receiving device, so
         * redirect it to the new device.
         * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
         * drivers to unmap the packet from their rx ring.
         */
        ri->tgt_index = slave->ifindex;
        ri->map_id = INT_MAX;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        return XDP_REDIRECT;
}
EXPORT_SYMBOL_GPL(xdp_master_redirect);

static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
                                        const struct net_device *dev,
                                        struct xdp_buff *xdp,
                                        const struct bpf_prog *xdp_prog)
{
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        err = __xsk_map_redirect(fwd, xdp);
        if (unlikely(err))
                goto err;

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Complete a redirect for an xdp_frame, dispatching on the map type that
 * was stored in @ri.
 *
 * The redirect state (map type, map id, flags, target value) is captured
 * and the id/flags/type in @ri are reset before anything else happens.
 *
 * Returns 0 on success. Errors set in this function: -EOVERFLOW when @xdpf
 * is NULL, -ENOENT when a broadcast redirect finds ri->map cleared,
 * -EINVAL when the ifindex target cannot be found, -EBADRQC for any other
 * map type; enqueue errors are passed through. The matching success or
 * error tracepoint is emitted before returning.
 */
static __always_inline int
__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
                        struct xdp_frame *xdpf,
                        const struct bpf_prog *xdp_prog)
{
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        u32 flags = ri->flags;
        struct bpf_map *map;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->flags = 0;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        if (unlikely(!xdpf)) {
                err = -EOVERFLOW;
                goto err;
        }

        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (unlikely(flags & BPF_F_BROADCAST)) {
                        map = READ_ONCE(ri->map);

                        /* The map pointer is cleared when the map is being torn
                         * down by dev_map_free()
                         */
                        if (unlikely(!map)) {
                                err = -ENOENT;
                                break;
                        }

                        /* Consume the stored map pointer before enqueueing. */
                        WRITE_ONCE(ri->map, NULL);
                        err = dev_map_enqueue_multi(xdpf, dev, map,
                                                    flags & BPF_F_EXCLUDE_INGRESS);
                } else {
                        err = dev_map_enqueue(fwd, xdpf, dev);
                }
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_enqueue(fwd, xdpf, dev);
                break;
        case BPF_MAP_TYPE_UNSPEC:
                /* UNSPEC together with map_id == INT_MAX marks an ifindex
                 * based redirect; the target is looked up by index.
                 */
                if (map_id == INT_MAX) {
                        fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
                        if (unlikely(!fwd)) {
                                err = -EINVAL;
                                break;
                        }
                        err = dev_xdp_enqueue(fwd, xdpf, dev);
                        break;
                }
                fallthrough;
        default:
                err = -EBADRQC;
        }

        if (unlikely(err))
                goto err;

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Carry out the redirect recorded in the current bpf_redirect_info for
 * @xdp. XSKMAP targets are handled on the xdp_buff itself; every other
 * target gets the buffer converted with xdp_convert_buff_to_frame() and is
 * handled by __xdp_do_redirect_frame(). Returns 0 or a negative error.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    const struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (ri->map_type != BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_frame(ri, dev,
                                               xdp_convert_buff_to_frame(xdp),
                                               xdp_prog);

        return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect);

/* Same as xdp_do_redirect(), except that the caller supplies the
 * xdp_frame (@xdpf) to use for non-XSKMAP targets instead of having it
 * converted from @xdp here. Returns 0 or a negative error.
 */
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
                          struct xdp_frame *xdpf,
                          const struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        if (ri->map_type != BPF_MAP_TYPE_XSKMAP)
                return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);

        return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
}
EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);

/* Map based redirect for the generic (skb backed) XDP path.
 *
 * @fwd, @map_type, @map_id and @flags are the redirect state the caller
 * captured from bpf_redirect_info. Dispatches on @map_type:
 *   DEVMAP / DEVMAP_HASH: broadcast via dev_map_redirect_multi() when
 *                         BPF_F_BROADCAST is set, else
 *                         dev_map_generic_redirect();
 *   XSKMAP:               xsk_generic_rcv(), then the skb is consumed;
 *   CPUMAP:               cpu_map_generic_redirect();
 *   anything else:        -EBADRQC.
 *
 * Returns 0 on success or a negative error; the matching success or error
 * tracepoint is emitted before returning.
 */
static int xdp_do_generic_redirect_map(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       const struct bpf_prog *xdp_prog,
                                       void *fwd, enum bpf_map_type map_type,
                                       u32 map_id, u32 flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        struct bpf_map *map;
        int err;

        switch (map_type) {
        case BPF_MAP_TYPE_DEVMAP:
                fallthrough;
        case BPF_MAP_TYPE_DEVMAP_HASH:
                if (unlikely(flags & BPF_F_BROADCAST)) {
                        map = READ_ONCE(ri->map);

                        /* The map pointer is cleared when the map is being torn
                         * down by dev_map_free()
                         */
                        if (unlikely(!map)) {
                                err = -ENOENT;
                                break;
                        }

                        /* Consume the stored map pointer before redirecting. */
                        WRITE_ONCE(ri->map, NULL);
                        err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
                                                     flags & BPF_F_EXCLUDE_INGRESS);
                } else {
                        err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                }
                if (unlikely(err))
                        goto err;
                break;
        case BPF_MAP_TYPE_XSKMAP:
                err = xsk_generic_rcv(fwd, xdp);
                if (err)
                        goto err;
                /* Only consumed here once xsk_generic_rcv() succeeded. */
                consume_skb(skb);
                break;
        case BPF_MAP_TYPE_CPUMAP:
                err = cpu_map_generic_redirect(fwd, skb);
                if (unlikely(err))
                        goto err;
                break;
        default:
                err = -EBADRQC;
                goto err;
        }

        _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
err:
        _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
}

/* Carry out the redirect recorded in the current bpf_redirect_info for the
 * generic (skb backed) XDP path.
 *
 * The redirect state is captured and the id/flags/type in the
 * bpf_redirect_info are reset first. An ifindex based redirect (map type
 * UNSPEC with map_id == INT_MAX) is handled here: the target device is
 * looked up, checked with xdp_ok_fwd_dev() and the skb is sent with
 * generic_xdp_tx(). Everything else goes to xdp_do_generic_redirect_map().
 *
 * Returns 0 on success, -EINVAL when the target ifindex does not exist, or
 * another negative error from the checks and redirect functions called.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp,
                            const struct bpf_prog *xdp_prog)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
        u32 map_id = ri->map_id;
        u32 flags = ri->flags;
        int err;

        ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
        ri->flags = 0;
        ri->map_type = BPF_MAP_TYPE_UNSPEC;

        if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
                fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
                if (unlikely(!fwd)) {
                        err = -EINVAL;
                        goto err;
                }

                err = xdp_ok_fwd_dev(fwd, skb->len);
                if (unlikely(err))
                        goto err;

                /* Retarget the skb at the destination device and send it. */
                skb->dev = fwd;
                _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
                generic_xdp_tx(skb, xdp_prog);
                return 0;
        }

        return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
err:
        _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
        return err;
}

/* bpf_redirect() helper for XDP programs: record an ifindex based redirect
 * target in the per-context bpf_redirect_info.
 *
 * Returns XDP_ABORTED if any flag is set, XDP_REDIRECT otherwise.
 */
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        /* No flags are defined for this helper. */
        if (unlikely(flags != 0))
                return XDP_ABORTED;

        /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
         * by map_idr) is used for ifindex based XDP redirect.
         */
        ri->map_type = BPF_MAP_TYPE_UNSPEC;
        ri->map_id = INT_MAX;
        ri->tgt_index = ifindex;

        return XDP_REDIRECT;
}

/* Helper prototype for bpf_xdp_redirect(): integer return; arg1 (ifindex)
 * and arg2 (flags) are typed ARG_ANYTHING. Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_redirect_proto = {
        .func           = bpf_xdp_redirect,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_ANYTHING,
        .arg2_type      = ARG_ANYTHING,
};

/* bpf_redirect_map() helper for XDP programs: hands @key and @flags to the
 * map's own map_redirect operation and returns whatever it returns.
 */
BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
           u64, flags)
{
        return map->ops->map_redirect(map, key, flags);
}

/* Helper prototype for bpf_xdp_redirect_map(): integer return; arg1 is a
 * constant map pointer, arg2 (key) and arg3 (flags) are typed ARG_ANYTHING.
 * Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
        .func           = bpf_xdp_redirect_map,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_CONST_MAP_PTR,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

/* Copy callback passed to bpf_event_output(): place @len bytes of skb data
 * starting at @off into @dst_buff.
 *
 * Returns 0 on success, or @len when skb_header_pointer() could not
 * provide the data.
 */
static unsigned long bpf_skb_copy(void *dst_buff, const void *skb,
                                  unsigned long off, unsigned long len)
{
        void *src = skb_header_pointer(skb, off, len, dst_buff);

        if (unlikely(!src))
                return len;

        /* skb_header_pointer() may already have filled dst_buff itself. */
        if (src == dst_buff)
                return 0;

        memcpy(dst_buff, src, len);
        return 0;
}

/* bpf_perf_event_output()-style helper for skb programs: emit @meta plus
 * an optional slice of skb data through bpf_event_output().
 *
 * The number of skb bytes to include is carried in the BPF_F_CTXLEN_MASK
 * bits of @flags. Returns -EINVAL for flag bits outside
 * BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK, -EFAULT for a NULL skb or a
 * requested length beyond skb->len, otherwise the bpf_event_output()
 * result.
 */
BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 ctx_len = (flags & BPF_F_CTXLEN_MASK) >> 32;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;
        if (unlikely(!skb))
                return -EFAULT;
        if (unlikely(ctx_len > skb->len))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, skb, ctx_len,
                                bpf_skb_copy);
}

/* Helper prototype for bpf_skb_event_output() with the skb as program
 * context: integer return; arg2 is a constant map pointer, arg3 (flags) is
 * ARG_ANYTHING, arg4 (meta) is read-only memory and arg5 (meta_size) is
 * its constant size, which may be zero. GPL programs only.
 */
static const struct bpf_func_proto bpf_skb_event_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Single-entry BTF ID list for struct sk_buff; its first element is used
 * as arg1_btf_id by bpf_skb_output_proto.
 */
BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)

/* Helper prototype for bpf_skb_event_output() when the skb is passed as a
 * BTF-typed pointer (arg1 is ARG_PTR_TO_BTF_ID for struct sk_buff) rather
 * than as the program context. The remaining arguments match
 * bpf_skb_event_output_proto. GPL programs only.
 */
const struct bpf_func_proto bpf_skb_output_proto = {
        .func                = bpf_skb_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_skb_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Address family selected by the tunnel helper flags: AF_INET6 when
 * BPF_F_TUNINFO_IPV6 is set, AF_INET otherwise.
 */
static unsigned short bpf_tunnel_key_af(u64 flags)
{
        if (flags & BPF_F_TUNINFO_IPV6)
                return AF_INET6;

        return AF_INET;
}

/* bpf_skb_get_tunnel_key() helper: fill @to with the tunnel key taken from
 * the skb's tunnel info.
 *
 * @size is normally sizeof(struct bpf_tunnel_key). A few shorter sizes
 * that end at specific member offsets are also accepted; for those the key
 * is built in an on-stack copy and only the first @size bytes are copied
 * to the caller.
 *
 * Returns 0 on success. On failure the first @size bytes of the caller's
 * buffer are zeroed and the result is -EINVAL (no tunnel info, unknown
 * flag bits, or unsupported @size) or -EPROTO (address family of the
 * tunnel info does not match the family selected by @flags).
 */
BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key *, to,
           u32, size, u64, flags)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        u8 compat[sizeof(struct bpf_tunnel_key)];
        void *to_orig = to;
        int err;

        if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
                                         BPF_F_TUNINFO_FLAGS)))) {
                err = -EINVAL;
                goto err_clear;
        }
        if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags)) {
                err = -EPROTO;
                goto err_clear;
        }
        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                /* Default error for every rejected size below. */
                err = -EINVAL;
                switch (size) {
                case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                        goto set_compat;
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                        /* Fixup deprecated structure layouts here, so we have
                         * a common path later on.
                         */
                        if (ip_tunnel_info_af(info) != AF_INET)
                                goto err_clear;
set_compat:
                        /* Fill the full-size on-stack copy instead. */
                        to = (struct bpf_tunnel_key *)compat;
                        break;
                default:
                        goto err_clear;
                }
        }

        to->tunnel_id = be64_to_cpu(info->key.tun_id);
        to->tunnel_tos = info->key.tos;
        to->tunnel_ttl = info->key.ttl;
        if (flags & BPF_F_TUNINFO_FLAGS)
                to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
        else
                to->tunnel_ext = 0;

        if (flags & BPF_F_TUNINFO_IPV6) {
                memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
                       sizeof(to->remote_ipv6));
                memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
                       sizeof(to->local_ipv6));
                to->tunnel_label = be32_to_cpu(info->key.label);
        } else {
                /* IPv4: zero the three remaining words of each address. */
                to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
                to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
                memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
                to->tunnel_label = 0;
        }

        /* Short layout: hand back only the bytes the caller asked for. */
        if (unlikely(size != sizeof(struct bpf_tunnel_key)))
                memcpy(to_orig, to, size);

        return 0;
err_clear:
        memset(to_orig, 0, size);
        return err;
}

/* Helper prototype for bpf_skb_get_tunnel_key(): integer return; arg1 is
 * the program context (the skb), arg2 (to) is declared as uninitialized
 * memory since the helper fills or zeroes it, arg3 (size) is its constant
 * size and arg4 (flags) is ARG_ANYTHING. Not restricted to GPL programs.
 */
static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
        .func                = bpf_skb_get_tunnel_key,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* bpf_skb_get_tunnel_opt() helper: copy the tunnel options of the skb's
 * tunnel info into @to and zero whatever is left of the @size bytes.
 *
 * Returns the options length on success. On failure the whole buffer is
 * zeroed and the result is -ENOENT (no tunnel info or no options present)
 * or -ENOMEM (@size is smaller than the options length).
 */
BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
{
        const struct ip_tunnel_info *info = skb_tunnel_info(skb);
        int err = 0;

        if (unlikely(!info ||
                     !ip_tunnel_is_options_present(info->key.tun_flags)))
                err = -ENOENT;
        else if (unlikely(size < info->options_len))
                err = -ENOMEM;

        if (err) {
                memset(to, 0, size);
                return err;
        }

        ip_tunnel_info_opts_get(to, info);
        /* Pad the unused remainder of the caller's buffer with zeroes. */
        if (size > info->options_len)
                memset(to + info->options_len, 0, size - info->options_len);

        return info->options_len;
}

static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
        .func                = bpf_skb_get_tunnel_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg3_type        = ARG_CONST_SIZE,
};

/* Per-CPU metadata dst used by the set-tunnel helpers below. It is
 * allocated on demand in bpf_get_skb_set_tunnel_proto() and installed
 * with cmpxchg(), so it is NULL until one of those helpers is first
 * requested.
 */
static struct metadata_dst __percpu *md_dst;

/* Attach TX tunnel metadata described by @from to @skb.
 *
 * @skb:   packet to tag
 * @from:  tunnel key supplied by the program
 * @size:  size of the structure @from points to; either the current
 *         sizeof(struct bpf_tunnel_key) or one of the older, shorter layouts
 * @flags: combination of BPF_F_TUNINFO_IPV6, BPF_F_ZERO_CSUM_TX,
 *         BPF_F_DONT_FRAGMENT, BPF_F_SEQ_NUMBER and BPF_F_NO_TUNNEL_KEY
 *
 * The skb's current dst is dropped and replaced by this CPU's md_dst,
 * whose tunnel info is then rebuilt from scratch.
 *
 * Returns 0 on success, or -EINVAL for unknown flags, an unsupported
 * @size, a non-zero tunnel_ext, or a non-zero tunnel_label without
 * BPF_F_TUNINFO_IPV6.
 */
BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
           const struct bpf_tunnel_key *, from, u32, size, u64, flags)
{
        struct metadata_dst *md = this_cpu_ptr(md_dst);
        const bool want_ipv6 = flags & BPF_F_TUNINFO_IPV6;
        u8 legacy[sizeof(struct bpf_tunnel_key)];
        struct ip_tunnel_info *tun;

        if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
                               BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
                               BPF_F_NO_TUNNEL_KEY)))
                return -EINVAL;

        if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
                switch (size) {
                case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
                case offsetof(struct bpf_tunnel_key, tunnel_label):
                case offsetof(struct bpf_tunnel_key, tunnel_ext):
                case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
                        /* An older, shorter layout was passed in. Expand it
                         * into a zero-padded full-size copy so the rest of
                         * the function only deals with one layout.
                         */
                        memcpy(legacy, from, size);
                        memset(legacy + size, 0, sizeof(legacy) - size);
                        from = (const struct bpf_tunnel_key *)legacy;
                        break;
                default:
                        return -EINVAL;
                }
        }

        /* tunnel_ext must be zero. */
        if (unlikely(from->tunnel_ext))
                return -EINVAL;
        /* A label is only accepted together with the IPv6 flag. */
        if (unlikely(!want_ipv6 && from->tunnel_label))
                return -EINVAL;

        /* Swap the skb's dst for the per-CPU metadata dst. */
        skb_dst_drop(skb);
        dst_hold((struct dst_entry *)md);
        skb_dst_set(skb, (struct dst_entry *)md);

        tun = &md->u.tun_info;
        memset(tun, 0, sizeof(*tun));
        tun->mode = IP_TUNNEL_INFO_TX;

        /* Translate the helper flags into tunnel flag bits. */
        __set_bit(IP_TUNNEL_NOCACHE_BIT, tun->key.tun_flags);
        __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun->key.tun_flags,
                     flags & BPF_F_DONT_FRAGMENT);
        __assign_bit(IP_TUNNEL_CSUM_BIT, tun->key.tun_flags,
                     !(flags & BPF_F_ZERO_CSUM_TX));
        __assign_bit(IP_TUNNEL_SEQ_BIT, tun->key.tun_flags,
                     flags & BPF_F_SEQ_NUMBER);
        __assign_bit(IP_TUNNEL_KEY_BIT, tun->key.tun_flags,
                     !(flags & BPF_F_NO_TUNNEL_KEY));

        tun->key.tun_id = cpu_to_be64(from->tunnel_id);
        tun->key.tos = from->tunnel_tos;
        tun->key.ttl = from->tunnel_ttl;

        if (want_ipv6) {
                tun->mode |= IP_TUNNEL_INFO_IPV6;
                /* The program's "remote" is the tunnel destination and its
                 * "local" is the tunnel source.
                 */
                memcpy(&tun->key.u.ipv6.dst, from->remote_ipv6,
                       sizeof(from->remote_ipv6));
                memcpy(&tun->key.u.ipv6.src, from->local_ipv6,
                       sizeof(from->local_ipv6));
                tun->key.label = cpu_to_be32(from->tunnel_label) &
                                 IPV6_FLOWLABEL_MASK;
        } else {
                tun->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
                tun->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
                tun->key.flow_flags = FLOWI_FLAG_ANYSRC;
        }

        return 0;
}

/* Descriptor for bpf_skb_set_tunnel_key(). */
static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
        .func           = bpf_skb_set_tunnel_key,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,               /* skb */
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,  /* from */
        .arg3_type      = ARG_CONST_SIZE,               /* size */
        .arg4_type      = ARG_ANYTHING,                 /* flags */
};

/* Store tunnel options on an skb whose tunnel info is this CPU's md_dst.
 *
 * @skb:  packet carrying the tunnel metadata
 * @from: option bytes to store
 * @size: number of option bytes; must be a multiple of sizeof(u32)
 *
 * Returns 0 on success, -EINVAL if the skb's tunnel info is not the
 * per-CPU metadata dst's or @size is not u32-aligned, and -ENOMEM if
 * @size exceeds IP_TUNNEL_OPTS_MAX.
 */
BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
           const u8 *, from, u32, size)
{
        const struct metadata_dst *md = this_cpu_ptr(md_dst);
        struct ip_tunnel_info *tun = skb_tunnel_info(skb);
        IP_TUNNEL_DECLARE_FLAGS(opt_flags) = { };

        if (unlikely(tun != &md->u.tun_info))
                return -EINVAL;
        if (unlikely(size % sizeof(u32)))
                return -EINVAL;
        if (unlikely(size > IP_TUNNEL_OPTS_MAX))
                return -ENOMEM;

        ip_tunnel_set_options_present(opt_flags);
        ip_tunnel_info_opts_set(tun, from, size, opt_flags);

        return 0;
}

/* Descriptor for bpf_skb_set_tunnel_opt(). */
static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
        .func           = bpf_skb_set_tunnel_opt,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,               /* skb */
        .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,  /* from */
        .arg3_type      = ARG_CONST_SIZE,               /* size */
};

static const struct bpf_func_proto *
bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
        if (!md_dst) {
                struct metadata_dst __percpu *tmp;

                tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
                                                METADATA_IP_TUNNEL,
                                                GFP_KERNEL);
                if (!tmp)
                        return NULL;
                if (cmpxchg(&md_dst, NULL, tmp))
                        metadata_dst_free_percpu(tmp);
        }

        switch (which) {
        case BPF_FUNC_skb_set_tunnel_key:
                return &bpf_skb_set_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return &bpf_skb_set_tunnel_opt_proto;
        default:
                return NULL;
        }
}

/* Test whether the socket owning @skb sits under the cgroup stored at
 * slot @idx of the array map @map.
 *
 * Returns the result of sk_under_cgroup_hierarchy() on success, -ENOENT
 * when the skb has no full socket, -E2BIG when @idx is outside the map,
 * and -EAGAIN when the slot is empty.
 */
BPF_CALL_3(bpf_skb_under_cgroup, struct sk_buff *, skb, struct bpf_map *, map,
           u32, idx)
{
        struct bpf_array *array = container_of(map, struct bpf_array, map);
        struct sock *sk = skb_to_full_sk(skb);
        struct cgroup *target;

        if (!(sk && sk_fullsock(sk)))
                return -ENOENT;

        if (unlikely(idx >= array->map.max_entries))
                return -E2BIG;

        target = READ_ONCE(array->ptrs[idx]);
        if (unlikely(!target))
                return -EAGAIN;

        return sk_under_cgroup_hierarchy(sk, target);
}

/* Descriptor for bpf_skb_under_cgroup(). */
static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
        .func           = bpf_skb_under_cgroup,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,       /* skb */
        .arg2_type      = ARG_CONST_MAP_PTR,    /* map */
        .arg3_type      = ARG_ANYTHING,         /* idx */
};

#ifdef CONFIG_SOCK_CGROUP_DATA
/* Return the id of the cgroup recorded in @sk's sk_cgrp_data, after
 * resolving @sk to its full socket. Returns 0 when there is no full socket.
 */
static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (full && sk_fullsock(full))
                return cgroup_id(sock_cgroup_ptr(&full->sk_cgrp_data));

        return 0;
}

/* Helper: cgroup id for the socket attached to @skb.
 *
 * Simply forwards skb->sk to __bpf_sk_cgroup_id() and returns its result.
 */
BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
{
        return __bpf_sk_cgroup_id(skb->sk);
}

/* Descriptor for bpf_skb_cgroup_id(): one context argument, integer return. */
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
        .func           = bpf_skb_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
};

/* Return the id of the ancestor, at @ancestor_level, of the cgroup recorded
 * in @sk's sk_cgrp_data (after resolving @sk to its full socket).
 *
 * Returns 0 when there is no full socket or cgroup_ancestor() finds no
 * ancestor at that level.
 */
static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
                                              int ancestor_level)
{
        struct sock *full = sk_to_full_sk(sk);
        struct cgroup *anc;

        if (!(full && sk_fullsock(full)))
                return 0;

        anc = cgroup_ancestor(sock_cgroup_ptr(&full->sk_cgrp_data),
                              ancestor_level);

        return anc ? cgroup_id(anc) : 0;
}

/* Helper: ancestor cgroup id for the socket attached to @skb.
 *
 * Forwards skb->sk and @ancestor_level to __bpf_sk_ancestor_cgroup_id()
 * and returns its result.
 */
BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
           ancestor_level)
{
        return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
}

/* Descriptor for bpf_skb_ancestor_cgroup_id(): context plus an
 * unconstrained level argument, integer return.
 */
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
        .func           = bpf_skb_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
};

/* Helper: cgroup id for a socket passed directly by the program.
 *
 * Forwards @sk to __bpf_sk_cgroup_id() and returns its result.
 */
BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
{
        return __bpf_sk_cgroup_id(sk);
}

/* Descriptor for bpf_sk_cgroup_id(): the argument is a BTF-typed
 * sock_common pointer rather than the program context.
 */
static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
        .func           = bpf_sk_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
};

/* Helper: ancestor cgroup id for a socket passed directly by the program.
 *
 * Forwards @sk and @ancestor_level to __bpf_sk_ancestor_cgroup_id() and
 * returns its result.
 */
BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
{
        return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
}

/* Descriptor for bpf_sk_ancestor_cgroup_id(): a BTF-typed sock_common
 * pointer plus an unconstrained level argument.
 */
static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
        .func           = bpf_sk_ancestor_cgroup_id,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type      = ARG_ANYTHING,
};
#endif

/* Copy callback passed to bpf_event_output() by bpf_xdp_event_output().
 *
 * @dst: destination buffer
 * @ctx: the xdp_buff to read from, passed as an opaque pointer
 * @off: offset into the XDP buffer
 * @len: number of bytes to copy
 *
 * Delegates to bpf_xdp_copy_buf() and always returns 0.
 */
static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
                                  unsigned long off, unsigned long len)
{
        bpf_xdp_copy_buf((struct xdp_buff *)ctx, off, dst, len, false);

        return 0;
}

/* Emit an event carrying @meta and, optionally, leading bytes of the XDP
 * buffer.
 *
 * @xdp:       XDP buffer supplying the packet bytes
 * @map:       map handed through to bpf_event_output()
 * @flags:     BPF_F_INDEX_MASK bits plus, in the BPF_F_CTXLEN_MASK bits
 *             (shifted down by 32), the number of packet bytes to append
 * @meta:      caller-supplied metadata blob
 * @meta_size: size of @meta
 *
 * Returns -EINVAL for flag bits outside the two masks, -EFAULT when @xdp
 * is NULL or more bytes are requested than the buffer holds, otherwise
 * the result of bpf_event_output().
 */
BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
           u64, flags, void *, meta, u64, meta_size)
{
        u64 ctx_len;

        if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
                return -EINVAL;

        if (unlikely(!xdp))
                return -EFAULT;

        ctx_len = (flags & BPF_F_CTXLEN_MASK) >> 32;
        if (unlikely(ctx_len > xdp_get_buff_len(xdp)))
                return -EFAULT;

        return bpf_event_output(map, flags, meta, meta_size, xdp,
                                ctx_len, bpf_xdp_copy);
}

/* Descriptor for bpf_xdp_event_output() taking the program context. */
static const struct bpf_func_proto bpf_xdp_event_output_proto = {
        .func           = bpf_xdp_event_output,
        .gpl_only       = true,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,               /* xdp */
        .arg2_type      = ARG_CONST_MAP_PTR,            /* map */
        .arg3_type      = ARG_ANYTHING,                 /* flags */
        .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,  /* meta */
        .arg5_type      = ARG_CONST_SIZE_OR_ZERO,       /* meta_size */
};

/* BTF id of struct xdp_buff, referenced by bpf_xdp_output_proto below. */
BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)

/* Second descriptor for bpf_xdp_event_output(): same implementation and
 * remaining arguments as bpf_xdp_event_output_proto, but arg1 is typed as
 * a BTF pointer to struct xdp_buff instead of the program context.
 */
const struct bpf_func_proto bpf_xdp_output_proto = {
        .func                = bpf_xdp_event_output,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id        = &bpf_xdp_output_btf_ids[0],
        .arg2_type        = ARG_CONST_MAP_PTR,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Helper: cookie of the socket attached to @skb, or 0 when the skb has no
 * socket.
 */
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
        struct sock *owner = skb->sk;

        if (!owner)
                return 0;

        return __sock_gen_cookie(owner);
}

/* Descriptor for bpf_get_socket_cookie(). */
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
        .func           = bpf_get_socket_cookie,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,       /* skb */
};

/* Helper: socket cookie for a sock_addr program context.
 *
 * Passes ctx->sk to __sock_gen_cookie() without a NULL check.
 */
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        return __sock_gen_cookie(ctx->sk);
}

/* Descriptor for bpf_get_socket_cookie_sock_addr(). */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
        .func                = bpf_get_socket_cookie_sock_addr,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Helper: socket cookie where the program context itself is the socket.
 *
 * Passes @ctx straight to __sock_gen_cookie().
 */
BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
{
        return __sock_gen_cookie(ctx);
}

/* Descriptor for bpf_get_socket_cookie_sock(). */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
        .func                = bpf_get_socket_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

/* Helper: cookie of a socket pointer that may be NULL.
 *
 * Returns 0 for a NULL @sk, otherwise the result of sock_gen_cookie().
 */
BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
{
        if (!sk)
                return 0;

        return sock_gen_cookie(sk);
}

/* Descriptor for bpf_get_socket_ptr_cookie(); the argument may be NULL. */
const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
        .func           = bpf_get_socket_ptr_cookie,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
};

/* Helper: socket cookie for a sock_ops program context.
 *
 * Passes ctx->sk to __sock_gen_cookie() without a NULL check.
 */
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        return __sock_gen_cookie(ctx->sk);
}

/* Descriptor for bpf_get_socket_cookie_sock_ops(). */
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
        .func                = bpf_get_socket_cookie_sock_ops,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
};

static u64 __bpf_get_netns_cookie(struct sock *sk)
{
        const struct net *net = sk ? sock_net(sk) : &init_net;

        return net->net_cookie;
}

/* Helper: netns cookie for the socket attached to @skb.
 *
 * A NULL @skb, or an skb without a socket, is handled by passing NULL to
 * __bpf_get_netns_cookie().
 */
BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
{
        struct sock *owner = NULL;

        if (skb)
                owner = skb->sk;

        return __bpf_get_netns_cookie(owner);
}

/* Descriptor for bpf_get_netns_cookie(); the context may be NULL. */
static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
        .func           = bpf_get_netns_cookie,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,       /* skb */
};

/* Helper: netns cookie where the program context itself is the socket.
 *
 * @ctx is passed unchanged to __bpf_get_netns_cookie(), including when it
 * is NULL.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
{
        return __bpf_get_netns_cookie(ctx);
}

/* Descriptor for bpf_get_netns_cookie_sock(); the context may be NULL. */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
        .func                = bpf_get_netns_cookie_sock,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX_OR_NULL,
};

/* Helper: netns cookie for a sock_addr program context.
 *
 * With a NULL @ctx, NULL is passed to __bpf_get_netns_cookie(); otherwise
 * ctx->sk is.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Descriptor for bpf_get_netns_cookie_sock_addr(); the context may be NULL. */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
        .func           = bpf_get_netns_cookie_sock_addr,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,       /* ctx */
};

/* Helper: netns cookie for a sock_ops program context.
 *
 * With a NULL @ctx, NULL is passed to __bpf_get_netns_cookie(); otherwise
 * ctx->sk is.
 */
BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Descriptor for bpf_get_netns_cookie_sock_ops(); the context may be NULL. */
static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
        .func           = bpf_get_netns_cookie_sock_ops,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,       /* ctx */
};

/* Helper: netns cookie for an sk_msg program context.
 *
 * With a NULL @ctx, NULL is passed to __bpf_get_netns_cookie(); otherwise
 * ctx->sk is.
 */
BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
{
        struct sock *sk = NULL;

        if (ctx)
                sk = ctx->sk;

        return __bpf_get_netns_cookie(sk);
}

/* Descriptor for bpf_get_netns_cookie_sk_msg(); the context may be NULL. */
static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
        .func           = bpf_get_netns_cookie_sk_msg,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX_OR_NULL,       /* ctx */
};

/* Helper: uid owning the socket attached to @skb.
 *
 * The socket is resolved to its full socket first. When there is none,
 * overflowuid is returned; otherwise the socket's kuid is mapped through
 * the user namespace of the socket's network namespace.
 */
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
        struct sock *full = sk_to_full_sk(skb->sk);
        struct net *net;
        kuid_t owner;

        if (!(full && sk_fullsock(full)))
                return overflowuid;

        net = sock_net(full);
        owner = sock_net_uid(net, full);

        return from_kuid_munged(net->user_ns, owner);
}

/* Descriptor for bpf_get_socket_uid(). */
static const struct bpf_func_proto bpf_get_socket_uid_proto = {
        .func           = bpf_get_socket_uid,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,       /* skb */
};

/* Read or write sk->sk_bpf_cb_flags through a u32-sized option buffer.
 *
 * @sk:     socket whose flags are accessed
 * @optval: buffer holding (set) or receiving (get) the u32 flag word
 * @getopt: true to read the flags into @optval, false to set them
 *
 * Returns 0 on success, or -EINVAL when setting bits outside
 * SK_BPF_CB_MASK.
 */
static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
{
        u32 *word = (u32 *)optval;
        u32 requested;

        if (getopt) {
                *word = sk->sk_bpf_cb_flags;
                return 0;
        }

        requested = *word;
        if (requested & ~SK_BPF_CB_MASK)
                return -EINVAL;

        sk->sk_bpf_cb_flags = requested;
        return 0;
}

/* SOL_SOCKET handling shared by the BPF get/setsockopt helpers.
 *
 * @sk:      socket to operate on
 * @optname: option; only the names listed below are accepted
 * @optval:  option buffer
 * @optlen:  in: buffer length; passed on to sk_getsockopt() when reading
 * @getopt:  true for get, false for set
 *
 * All accepted options except SO_BINDTODEVICE must be exactly int-sized.
 * SO_BINDTODEVICE can be set but not read. SK_BPF_CB_FLAGS is served
 * locally; everything else goes to sk_getsockopt() / sk_setsockopt().
 *
 * Returns -EINVAL for unsupported options or bad lengths, otherwise the
 * result of the function the request was dispatched to.
 */
static int sol_socket_sockopt(struct sock *sk, int optname,
                              char *optval, int *optlen,
                              bool getopt)
{
        switch (optname) {
        case SO_BINDTODEVICE:
                /* No fixed length is enforced for this option here. */
                break;
        case SO_REUSEADDR:
        case SO_SNDBUF:
        case SO_RCVBUF:
        case SO_KEEPALIVE:
        case SO_PRIORITY:
        case SO_REUSEPORT:
        case SO_RCVLOWAT:
        case SO_MARK:
        case SO_MAX_PACING_RATE:
        case SO_BINDTOIFINDEX:
        case SO_TXREHASH:
        case SK_BPF_CB_FLAGS:
                if (*optlen != sizeof(int))
                        return -EINVAL;
                break;
        default:
                return -EINVAL;
        }

        if (optname == SK_BPF_CB_FLAGS)
                return sk_bpf_set_get_cb_flags(sk, optval, getopt);

        if (!getopt)
                return sk_setsockopt(sk, SOL_SOCKET, optname,
                                     KERNEL_SOCKPTR(optval), *optlen);

        /* Reading the bound device is not supported from BPF. */
        if (optname == SO_BINDTODEVICE)
                return -EINVAL;

        return sk_getsockopt(sk, SOL_SOCKET, optname,
                             KERNEL_SOCKPTR(optval),
                             KERNEL_SOCKPTR(optlen));
}

/* Read one of the BPF-specific TCP options into an int-sized buffer.
 *
 * @sk:      TCP socket
 * @optname: TCP_BPF_SOCK_OPS_CB_FLAGS, TCP_BPF_RTO_MIN or TCP_BPF_DELACK_MAX
 * @optval:  destination buffer
 * @optlen:  must equal sizeof(int)
 *
 * The two timer options are converted from jiffies to microseconds.
 * Returns 0 on success, -EINVAL for a wrong length or any other option.
 */
static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
                                  char *optval, int optlen)
{
        int val;

        if (optlen != sizeof(int))
                return -EINVAL;

        switch (optname) {
        case TCP_BPF_SOCK_OPS_CB_FLAGS:
                val = tcp_sk(sk)->bpf_sock_ops_cb_flags;
                break;
        case TCP_BPF_RTO_MIN:
                val = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
                break;
        case TCP_BPF_DELACK_MAX:
                val = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
                break;
        default:
                return -EINVAL;
        }

        /* optlen was verified above to be exactly sizeof(val). */
        memcpy(optval, &val, sizeof(val));
        return 0;
}

/* Apply one of the BPF-specific TCP options from an int-sized buffer.
 *
 * @sk:      TCP socket
 * @optname: TCP_BPF_IW, TCP_BPF_SNDCWND_CLAMP, TCP_BPF_DELACK_MAX,
 *           TCP_BPF_RTO_MIN or TCP_BPF_SOCK_OPS_CB_FLAGS
 * @optval:  buffer holding the int value
 * @optlen:  must equal sizeof(int)
 *
 * Returns 0 on success, -EINVAL for a wrong length, an unsupported option
 * or a value rejected by the per-option checks below.
 */
static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
                                  char *optval, int optlen)
{
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned long ticks;
        int value;

        if (optlen != sizeof(int))
                return -EINVAL;

        value = *(int *)optval;

        switch (optname) {
        case TCP_BPF_IW:
                if (value <= 0)
                        return -EINVAL;
                /* Refused once data_segs_out exceeds syn_data. */
                if (tp->data_segs_out > tp->syn_data)
                        return -EINVAL;
                tcp_snd_cwnd_set(tp, value);
                break;
        case TCP_BPF_SNDCWND_CLAMP:
                if (value <= 0)
                        return -EINVAL;
                tp->snd_cwnd_clamp = value;
                tp->snd_ssthresh = value;
                break;
        case TCP_BPF_DELACK_MAX:
                /* Value is in microseconds; stored in jiffies. */
                ticks = usecs_to_jiffies(value);
                if (ticks < TCP_TIMEOUT_MIN || ticks > TCP_DELACK_MAX)
                        return -EINVAL;
                inet_csk(sk)->icsk_delack_max = ticks;
                break;
        case TCP_BPF_RTO_MIN:
                /* Value is in microseconds; stored in jiffies. */
                ticks = usecs_to_jiffies(value);
                if (ticks < TCP_TIMEOUT_MIN || ticks > TCP_RTO_MIN)
                        return -EINVAL;
                inet_csk(sk)->icsk_rto_min = ticks;
                break;
        case TCP_BPF_SOCK_OPS_CB_FLAGS:
                if (value & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
                        return -EINVAL;
                tp->bpf_sock_ops_cb_flags = value;
                break;
        default:
                /* Only the options above are supported. */
                return -EINVAL;
        }

        return 0;
}

/* TCP_CONGESTION handling for the BPF get/setsockopt helpers.
 *
 * @sk:     TCP socket
 * @optval: congestion-control name buffer
 * @optlen: buffer length; must be at least 2
 * @getopt: true to read the current name, false to switch algorithm
 *
 * Returns -EINVAL for a too-short buffer or (on get) a socket without
 * congestion ops, -ENOTSUPP when asked to switch to "cdg", -EBUSY when a
 * BPF-initiated switch is already running on this socket, otherwise the
 * result of do_tcp_getsockopt() / do_tcp_setsockopt().
 */
static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
                                      int *optlen, bool getopt)
{
        struct tcp_sock *tp;
        int err;

        if (*optlen < 2)
                return -EINVAL;

        if (getopt) {
                if (!inet_csk(sk)->icsk_ca_ops)
                        return -EINVAL;

                /* BPF callers expect a NUL-terminated name: terminate the
                 * last byte and expose one byte less to the getter.
                 */
                *optlen -= 1;
                optval[*optlen] = '\0';

                return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                         KERNEL_SOCKPTR(optval),
                                         KERNEL_SOCKPTR(optlen));
        }

        /* "cdg" is the only congestion control that allocates a pointer
         * in the inet_csk_ca area. A BPF congestion control could
         * overwrite that pointer after switching to cdg, so refuse it.
         */
        if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
                return -ENOTSUPP;

        /* Break the recursion
         *
         *   .init => bpf_setsockopt(tcp_cc) => .init =>
         *   bpf_setsockopt(tcp_cc) => .init => ...
         *
         * that arises when both .init callbacks are the same BPF program:
         * a nested bpf_setsockopt(tcp_cc) is rejected.
         *
         * The rejection applies even when the nested call would not loop.
         * Consequently only the first .init may use
         * bpf_setsockopt(TCP_CONGESTION) to pick a fallback algorithm
         * (e.g. because the peer does not support ECN); the second .init
         * cannot fall back to yet another one.
         */
        tp = tcp_sk(sk);
        if (tp->bpf_chg_cc_inprogress)
                return -EBUSY;

        tp->bpf_chg_cc_inprogress = 1;
        err = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
                                KERNEL_SOCKPTR(optval), *optlen);
        tp->bpf_chg_cc_inprogress = 0;

        return err;
}

/* SOL_TCP handling shared by the BPF get/setsockopt helpers.
 *
 * @sk:      socket; must have sk_protocol == IPPROTO_TCP
 * @optname: TCP option
 * @optval:  option buffer
 * @optlen:  in: buffer length; passed on to do_tcp_getsockopt() when reading
 * @getopt:  true for get, false for set
 *
 * The int-sized options listed below go to do_tcp_getsockopt() /
 * do_tcp_setsockopt(). TCP_CONGESTION has its own handler. TCP_SAVED_SYN
 * reads are served from tp->saved_syn directly. Any other option is
 * passed to the BPF-specific getter/setter.
 *
 * Returns -EINVAL for non-TCP sockets, bad lengths, or a TCP_SAVED_SYN
 * read with no saved SYN or a buffer longer than it; otherwise the result
 * of the function the request was dispatched to.
 */
static int sol_tcp_sockopt(struct sock *sk, int optname,
                           char *optval, int *optlen,
                           bool getopt)
{
        struct tcp_sock *tp;

        if (sk->sk_protocol != IPPROTO_TCP)
                return -EINVAL;

        switch (optname) {
        case TCP_NODELAY:
        case TCP_MAXSEG:
        case TCP_KEEPIDLE:
        case TCP_KEEPINTVL:
        case TCP_KEEPCNT:
        case TCP_SYNCNT:
        case TCP_WINDOW_CLAMP:
        case TCP_THIN_LINEAR_TIMEOUTS:
        case TCP_USER_TIMEOUT:
        case TCP_NOTSENT_LOWAT:
        case TCP_SAVE_SYN:
        case TCP_RTO_MAX_MS:
                if (*optlen != sizeof(int))
                        return -EINVAL;
                break;
        case TCP_CONGESTION:
                return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
        case TCP_SAVED_SYN:
                if (*optlen < 1)
                        return -EINVAL;
                break;
        default:
                if (getopt)
                        return bpf_sol_tcp_getsockopt(sk, optname, optval,
                                                      *optlen);
                return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
        }

        if (!getopt)
                return do_tcp_setsockopt(sk, SOL_TCP, optname,
                                         KERNEL_SOCKPTR(optval), *optlen);

        if (optname != TCP_SAVED_SYN)
                return do_tcp_getsockopt(sk, SOL_TCP, optname,
                                         KERNEL_SOCKPTR(optval),
                                         KERNEL_SOCKPTR(optlen));

        tp = tcp_sk(sk);
        if (!tp->saved_syn ||
            *optlen > tcp_saved_syn_len(tp->saved_syn))
                return -EINVAL;

        /* The saved SYN is copied but deliberately not freed here: there
         * is no way to know whether user space still needs it.
         */
        memcpy(optval, tp->saved_syn->data, *optlen);
        return 0;
}

/* SOL_IP handling shared by the BPF get/setsockopt helpers.
 *
 * Only IP_TOS on AF_INET sockets, with an int-sized buffer, is accepted;
 * everything else yields -EINVAL. Accepted requests are forwarded to
 * do_ip_getsockopt() / do_ip_setsockopt().
 */
static int sol_ip_sockopt(struct sock *sk, int optname,
                          char *optval, int *optlen,
                          bool getopt)
{
        if (sk->sk_family != AF_INET)
                return -EINVAL;

        if (optname != IP_TOS)
                return -EINVAL;

        if (*optlen != sizeof(int))
                return -EINVAL;

        if (!getopt)
                return do_ip_setsockopt(sk, SOL_IP, optname,
                                        KERNEL_SOCKPTR(optval), *optlen);

        return do_ip_getsockopt(sk, SOL_IP, optname,
                                KERNEL_SOCKPTR(optval),
                                KERNEL_SOCKPTR(optlen));
}

/* SOL_IPV6 handling shared by the BPF get/setsockopt helpers.
 *
 * Only IPV6_TCLASS and IPV6_AUTOFLOWLABEL on AF_INET6 sockets, with an
 * int-sized buffer, are accepted; everything else yields -EINVAL.
 * Accepted requests are forwarded through ipv6_bpf_stub.
 */
static int sol_ipv6_sockopt(struct sock *sk, int optname,
                            char *optval, int *optlen,
                            bool getopt)
{
        if (sk->sk_family != AF_INET6)
                return -EINVAL;

        if (optname != IPV6_TCLASS && optname != IPV6_AUTOFLOWLABEL)
                return -EINVAL;

        if (*optlen != sizeof(int))
                return -EINVAL;

        if (!getopt)
                return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
                                                      KERNEL_SOCKPTR(optval),
                                                      *optlen);

        return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
                                              KERNEL_SOCKPTR(optval),
                                              KERNEL_SOCKPTR(optlen));
}

/* Dispatch a BPF setsockopt request to the handler for @level.
 *
 * Returns -EINVAL when @sk is not a full socket, or when @level is not
 * handled (SOL_IP/SOL_TCP need CONFIG_INET, SOL_IPV6 needs CONFIG_IPV6);
 * otherwise the handler's result.
 */
static int __bpf_setsockopt(struct sock *sk, int level, int optname,
                            char *optval, int optlen)
{
        if (!sk_fullsock(sk))
                return -EINVAL;

        if (level == SOL_SOCKET)
                return sol_socket_sockopt(sk, optname, optval, &optlen, false);

        if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
                return sol_ip_sockopt(sk, optname, optval, &optlen, false);

        if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
                return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);

        if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
                return sol_tcp_sockopt(sk, optname, optval, &optlen, false);

        return -EINVAL;
}

/* Return true when the sock_ops callback in @bpf_sock has an op value at
 * or below BPF_SOCK_OPS_WRITE_HDR_OPT_CB.
 *
 * NOTE(review): the name suggests these are the callbacks that run with
 * the socket lock held - confirm against the sock_ops call sites.
 */
static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
{
        return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
}

/* Variant of __bpf_setsockopt() that first calls sock_owned_by_me() on
 * full sockets, then forwards all arguments unchanged.
 *
 * NOTE(review): sock_owned_by_me() is presumably a lock-ownership
 * assertion - confirm in its definition.
 */
static int _bpf_setsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (sk_fullsock(sk))
                sock_owned_by_me(sk);
        return __bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Dispatch a BPF getsockopt request to the handler for @level and make
 * sure the caller's buffer is fully initialized afterwards.
 *
 * On error the whole @optval buffer is zeroed. On success, any bytes past
 * the length reported by the handler are zeroed.
 *
 * Returns -EINVAL when @sk is not a full socket or @level is not handled
 * (SOL_IP/SOL_TCP need CONFIG_INET, SOL_IPV6 needs CONFIG_IPV6);
 * otherwise the handler's result.
 */
static int __bpf_getsockopt(struct sock *sk, int level, int optname,
                            char *optval, int optlen)
{
        const int buf_len = optlen;
        int err = -EINVAL;

        if (sk_fullsock(sk)) {
                if (level == SOL_SOCKET)
                        err = sol_socket_sockopt(sk, optname, optval,
                                                 &optlen, true);
                else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
                        err = sol_tcp_sockopt(sk, optname, optval,
                                              &optlen, true);
                else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
                        err = sol_ip_sockopt(sk, optname, optval,
                                             &optlen, true);
                else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
                        err = sol_ipv6_sockopt(sk, optname, optval,
                                               &optlen, true);
                /* Any other level leaves err at -EINVAL. */
        }

        /* Nothing valid was written on failure: wipe the entire buffer. */
        if (err)
                optlen = 0;

        if (optlen < buf_len)
                memset(optval + optlen, 0, buf_len - optlen);

        return err;
}

/* Variant of __bpf_getsockopt() that first calls sock_owned_by_me() on
 * full sockets, then forwards all arguments unchanged.
 *
 * NOTE(review): sock_owned_by_me() is presumably a lock-ownership
 * assertion - confirm in its definition.
 */
static int _bpf_getsockopt(struct sock *sk, int level, int optname,
                           char *optval, int optlen)
{
        if (sk_fullsock(sk))
                sock_owned_by_me(sk);
        return __bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* Helper: setsockopt on a socket pointer supplied by the program.
 *
 * Forwards every argument to _bpf_setsockopt() and returns its result.
 */
BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        return _bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_sk_setsockopt(): BTF-typed socket pointer, two
 * unconstrained integers (level, optname) and a read-only buffer with
 * its constant size.
 */
const struct bpf_func_proto bpf_sk_setsockopt_proto = {
        .func                = bpf_sk_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: getsockopt on a socket pointer supplied by the program.
 *
 * Forwards every argument to _bpf_getsockopt() and returns its result.
 */
BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        return _bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_sk_getsockopt(): BTF-typed socket pointer, two
 * unconstrained integers (level, optname) and an output buffer with its
 * constant size.
 */
const struct bpf_func_proto bpf_sk_getsockopt_proto = {
        .func                = bpf_sk_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: setsockopt variant that calls __bpf_setsockopt() directly,
 * i.e. without the sock_owned_by_me() call that _bpf_setsockopt() makes.
 */
BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        return __bpf_setsockopt(sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_unlocked_sk_setsockopt(); argument types match
 * bpf_sk_setsockopt_proto.
 */
const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
        .func                = bpf_unlocked_sk_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: getsockopt variant that calls __bpf_getsockopt() directly,
 * i.e. without the sock_owned_by_me() call that _bpf_getsockopt() makes.
 */
BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
{
        return __bpf_getsockopt(sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_unlocked_sk_getsockopt(); argument types match
 * bpf_sk_getsockopt_proto.
 */
const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
        .func                = bpf_unlocked_sk_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: setsockopt for sock_addr programs.
 *
 * Forwards ctx->sk and the remaining arguments to _bpf_setsockopt().
 */
BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_sock_addr_setsockopt(): program context, two
 * unconstrained integers and a read-only buffer with its constant size.
 */
static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
        .func                = bpf_sock_addr_setsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: getsockopt for sock_addr programs.
 *
 * Forwards ctx->sk and the remaining arguments to _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
           int, level, int, optname, char *, optval, int, optlen)
{
        return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
}

/* Descriptor for bpf_sock_addr_getsockopt(): program context, two
 * unconstrained integers and an output buffer with its constant size.
 */
static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
        .func                = bpf_sock_addr_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: setsockopt for sock_ops programs.
 *
 * The request is honoured only when is_locked_tcp_sock_ops() accepts the
 * current callback; in that case bpf_sock->sk and the remaining arguments
 * go to _bpf_setsockopt(). Otherwise -EOPNOTSUPP is returned.
 */
BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        if (is_locked_tcp_sock_ops(bpf_sock))
                return _bpf_setsockopt(bpf_sock->sk, level, optname,
                                       optval, optlen);

        return -EOPNOTSUPP;
}

/* Descriptor for bpf_sock_ops_setsockopt(). */
static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
        .func           = bpf_sock_ops_setsockopt,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,               /* bpf_sock */
        .arg2_type      = ARG_ANYTHING,                 /* level */
        .arg3_type      = ARG_ANYTHING,                 /* optname */
        .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,  /* optval */
        .arg5_type      = ARG_CONST_SIZE,               /* optlen */
};

/* Locate the headers of the SYN associated with a sock_ops context.
 *
 * On success, *start is set to the first byte of the requested header range
 * and the length of that range is returned:
 *   TCP_BPF_SYN     - TCP header only
 *   TCP_BPF_SYN_IP  - network header + TCP header
 *   anything else   - MAC + network + TCP header (treated as TCP_BPF_SYN_MAC)
 *
 * If bpf_sock->syn_skb is set the headers are taken from that skb, otherwise
 * from the SYN copy saved on the socket. Returns -ENOENT, leaving *start
 * untouched, when no saved SYN exists or when the MAC header is requested but
 * the saved copy has none.
 */
static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
                                int optname, const u8 **start)
{
        struct sk_buff *syn_skb = bpf_sock->syn_skb;
        struct saved_syn *saved_syn;
        struct sock *sk;

        if (syn_skb) {
                /* sk is a request_sock here */
                switch (optname) {
                case TCP_BPF_SYN:
                        *start = syn_skb->data;
                        return tcp_hdrlen(syn_skb);
                case TCP_BPF_SYN_IP:
                        *start = skb_network_header(syn_skb);
                        return skb_network_header_len(syn_skb) +
                                tcp_hdrlen(syn_skb);
                default:
                        /* optname == TCP_BPF_SYN_MAC */
                        *start = skb_mac_header(syn_skb);
                        return skb_mac_header_len(syn_skb) +
                                skb_network_header_len(syn_skb) +
                                tcp_hdrlen(syn_skb);
                }
        }

        sk = bpf_sock->sk;

        /* On a synack retransmit bpf_sock->syn_skb is not available, so
         * fall back to the request sock's saved_syn (if it was saved).
         */
        if (sk->sk_state == TCP_NEW_SYN_RECV)
                saved_syn = inet_reqsk(sk)->saved_syn;
        else
                saved_syn = tcp_sk(sk)->saved_syn;

        if (!saved_syn)
                return -ENOENT;

        switch (optname) {
        case TCP_BPF_SYN:
                *start = saved_syn->data +
                        saved_syn->mac_hdrlen +
                        saved_syn->network_hdrlen;
                return saved_syn->tcp_hdrlen;
        case TCP_BPF_SYN_IP:
                *start = saved_syn->data + saved_syn->mac_hdrlen;
                return saved_syn->network_hdrlen + saved_syn->tcp_hdrlen;
        default:
                /* optname == TCP_BPF_SYN_MAC */

                /* TCP_SAVE_SYN may not have saved the mac hdr */
                if (!saved_syn->mac_hdrlen)
                        return -ENOENT;

                *start = saved_syn->data;
                return saved_syn->mac_hdrlen +
                        saved_syn->network_hdrlen +
                        saved_syn->tcp_hdrlen;
        }
}

/* Helper: read a socket option from a sock_ops program.
 *
 * Returns -EOPNOTSUPP unless is_locked_tcp_sock_ops() accepts the context.
 *
 * SOL_TCP options in the range TCP_BPF_SYN..TCP_BPF_SYN_MAC are served here
 * (only when CONFIG_INET is enabled): the SYN headers are copied into optval,
 * the rest of the buffer is zero-filled, and the header length is returned.
 * If the buffer is too small, the headers are truncated to optlen bytes and
 * -ENOSPC is returned. If bpf_sock_ops_get_syn() fails, the whole buffer is
 * zeroed and its error is returned.
 *
 * Every other option is forwarded to _bpf_getsockopt().
 */
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
           int, level, int, optname, char *, optval, int, optlen)
{
        const u8 *syn_hdr;
        int copied = 0;
        int ret;

        if (!is_locked_tcp_sock_ops(bpf_sock))
                return -EOPNOTSUPP;

        if (!IS_ENABLED(CONFIG_INET) || level != SOL_TCP ||
            optname < TCP_BPF_SYN || optname > TCP_BPF_SYN_MAC)
                return _bpf_getsockopt(bpf_sock->sk, level, optname, optval,
                                       optlen);

        ret = bpf_sock_ops_get_syn(bpf_sock, optname, &syn_hdr);
        if (ret > 0) {
                copied = ret;
                if (optlen < copied) {
                        /* Buffer too small: copy what fits, report it */
                        copied = optlen;
                        ret = -ENOSPC;
                }

                memcpy(optval, syn_hdr, copied);
        }

        /* Zero out unused buffer at the end */
        memset(optval + copied, 0, optlen - copied);

        return ret;
}

/* Argument/return description of bpf_sock_ops_getsockopt(): a context
 * pointer, two unconstrained scalars (level, optname), and an output buffer
 * declared as possibly-uninitialized memory whose length is given by the
 * following constant-size argument. The helper returns an integer.
 */
static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
        .func                = bpf_sock_ops_getsockopt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
        .arg4_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Helper: replace the sock_ops callback flags stored on a TCP socket.
 *
 * Only the bits covered by BPF_SOCK_OPS_ALL_CB_FLAGS are stored. The return
 * value is the set of requested bits outside that mask (0 when every
 * requested bit was accepted).
 *
 * Returns -EOPNOTSUPP unless is_locked_tcp_sock_ops() accepts the context,
 * and -EINVAL when CONFIG_INET is disabled or the socket is not a full
 * socket.
 */
BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
           int, argval)
{
        struct sock *sk = bpf_sock->sk;

        if (!is_locked_tcp_sock_ops(bpf_sock))
                return -EOPNOTSUPP;

        if (!IS_ENABLED(CONFIG_INET))
                return -EINVAL;
        if (!sk_fullsock(sk))
                return -EINVAL;

        tcp_sk(sk)->bpf_sock_ops_cb_flags = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;

        /* Hand back the bits that were requested but are not supported */
        return argval & ~BPF_SOCK_OPS_ALL_CB_FLAGS;
}

/* Argument/return description of bpf_sock_ops_cb_flags_set(): a context
 * pointer followed by one unconstrained scalar (the requested flag bits).
 * The helper returns an integer.
 */
static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = {
        .func                = bpf_sock_ops_cb_flags_set,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
};

/* Table of IPv6 entry points that BPF helpers in this file call through
 * (e.g. ->inet6_bind() and ->ipv6_dev_get_saddr()). Exported to GPL modules.
 * NOTE(review): presumably assigned by the IPv6 code when it is loaded;
 * the assignment is not visible here - confirm.
 */
const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly;
EXPORT_SYMBOL_GPL(ipv6_bpf_stub);

/* Helper: bind the socket of a sock_addr context to @addr.
 *
 * @addr_len must at least cover sa_family, and must cover a full
 * struct sockaddr_in for AF_INET or SIN6_LEN_RFC2133 bytes for AF_INET6;
 * otherwise -EINVAL is returned. The bind is always flagged BIND_FROM_BPF,
 * and a zero port additionally sets BIND_FORCE_ADDRESS_NO_PORT.
 *
 * Returns the result of __inet_bind() / ->inet6_bind(), or -EAFNOSUPPORT for
 * any other address family (including AF_INET6 when CONFIG_IPV6 is disabled
 * and everything when CONFIG_INET is disabled).
 */
BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
           int, addr_len)
{
#ifdef CONFIG_INET
        struct sock *sk = ctx->sk;
        u32 flags = BIND_FROM_BPF;
        int err;

        err = -EINVAL;
        /* sa_family must be readable before dispatching on it */
        if (addr_len < offsetofend(struct sockaddr, sa_family))
                return err;
        if (addr->sa_family == AF_INET) {
                if (addr_len < sizeof(struct sockaddr_in))
                        return err;
                if (((struct sockaddr_in *)addr)->sin_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                return __inet_bind(sk, addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (addr->sa_family == AF_INET6) {
                if (addr_len < SIN6_LEN_RFC2133)
                        return err;
                if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
                        flags |= BIND_FORCE_ADDRESS_NO_PORT;
                /* ipv6_bpf_stub cannot be NULL, since it's called from
                 * bpf_cgroup_inet6_connect hook and ipv6 is already loaded
                 */
                return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags);
#endif /* CONFIG_IPV6 */
        }
#endif /* CONFIG_INET */

        return -EAFNOSUPPORT;
}

/* Argument/return description of bpf_bind(): a context pointer and an
 * address buffer declared read-only whose length is given by the following
 * constant-size argument. The helper returns an integer.
 */
static const struct bpf_func_proto bpf_bind_proto = {
        .func                = bpf_bind,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
};

#ifdef CONFIG_XFRM

#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
    (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))

/* Per-CPU metadata_dst pointer, exported to GPL modules. Only defined when
 * the xfrm interface is built with matching BTF support (see the enclosing
 * preprocessor condition).
 * NOTE(review): presumably used by the xfrm interface BPF code; no user is
 * visible in this part of the file - confirm.
 */
struct metadata_dst __percpu *xfrm_bpf_md_dst;
EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);

#endif

/* Helper: copy selected fields of the xfrm state at @index in the skb's
 * sec_path into @to.
 *
 * Fails with -EINVAL, after zeroing @size bytes of @to, when the skb has no
 * sec_path, @index is out of range, @flags is non-zero, or @size is not
 * sizeof(struct bpf_xfrm_state). On success reqid, spi, family and the
 * remote address are filled in, ext is cleared, and 0 is returned.
 */
BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
           struct bpf_xfrm_state *, to, u32, size, u64, flags)
{
        const struct sec_path *sp = skb_sec_path(skb);
        const struct xfrm_state *x;

        if (!sp || unlikely(index >= sp->len || flags) ||
            unlikely(size != sizeof(struct bpf_xfrm_state))) {
                memset(to, 0, size);
                return -EINVAL;
        }

        x = sp->xvec[index];

        to->reqid = x->props.reqid;
        to->spi = x->id.spi;
        to->family = x->props.family;
        to->ext = 0;

        if (to->family != AF_INET6) {
                to->remote_ipv4 = x->props.saddr.a4;
                /* Clear the three remaining words of the address storage */
                memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
        } else {
                memcpy(to->remote_ipv6, x->props.saddr.a6,
                       sizeof(to->remote_ipv6));
        }

        return 0;
}

/* Argument/return description of bpf_skb_get_xfrm_state(): a context
 * pointer, an unconstrained scalar (index), an output buffer declared as
 * possibly-uninitialized memory with its constant size, and an unconstrained
 * scalar (flags). The helper returns an integer.
 */
static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
        .func                = bpf_skb_get_xfrm_state,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type        = ARG_CONST_SIZE,
        .arg5_type        = ARG_ANYTHING,
};
#endif

#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
/* Final step of a successful FIB lookup: clear the VLAN output fields and,
 * when an MTU was computed (non-zero @mtu), report it in params->mtu_result.
 * Always returns 0.
 */
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
{
        params->h_vlan_proto = 0;
        params->h_vlan_TCI = 0;

        if (mtu != 0) {
                /* mtu_result shares storage with tot_len */
                params->mtu_result = mtu;
        }

        return 0;
}
#endif

#if IS_ENABLED(CONFIG_INET)
/* IPv4 route (and optionally neighbour) lookup behind the fib_lookup helpers.
 *
 * @net:       network namespace in which to resolve params->ifindex and routes
 * @params:    in/out lookup parameters supplied by the BPF program
 * @flags:     BPF_FIB_LOOKUP_* flags
 * @check_mtu: when true, compare params->tot_len with the route MTU
 *
 * Returns -ENODEV if params->ifindex does not name a device in @net, a
 * BPF_FIB_LKUP_RET_* code for the various non-forwardable outcomes, or the
 * result of bpf_fib_set_fwd_params() (0) on success.
 *
 * On success params is updated with the route metric, the egress ifindex,
 * the next-hop address when the route has a gateway (family switched to
 * AF_INET6 for an IPv6 gateway), optionally the preferred source address
 * (BPF_FIB_LOOKUP_SRC) and, unless BPF_FIB_LOOKUP_SKIP_NEIGH is set, the
 * destination and source MAC addresses.
 */
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        struct fib_nh_common *nhc;
        struct in_device *in_dev;
        struct neighbour *neigh;
        struct net_device *dev;
        struct fib_result res;
        struct flowi4 fl4;
        u32 mtu = 0;
        int err;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* verify forwarding is enabled on this interface */
        in_dev = __in_dev_get_rcu(dev);
        if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* params->ifindex is the egress device for OUTPUT lookups and the
         * ingress device otherwise.
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl4.flowi4_iif = 1;
                fl4.flowi4_oif = params->ifindex;
        } else {
                fl4.flowi4_iif = params->ifindex;
                fl4.flowi4_oif = 0;
        }
        fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;

        fl4.flowi4_proto = params->l4_protocol;
        fl4.daddr = params->ipv4_dst;
        fl4.saddr = params->ipv4_src;
        fl4.fl4_sport = params->sport;
        fl4.fl4_dport = params->dport;
        fl4.flowi4_multipath_hash = 0;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* Direct mode: query one table (the device's l3mdev table,
                 * else main, else the caller-provided tbid) directly.
                 */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib_table *tb;

                if (flags & BPF_FIB_LOOKUP_TBID) {
                        tbid = params->tbid;
                        /* zero out for vlan output */
                        params->tbid = 0;
                }

                tb = fib_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
        } else {
                /* Full lookup via fib_lookup(); the mark is only honoured
                 * when BPF_FIB_LOOKUP_MARK is set.
                 */
                if (flags & BPF_FIB_LOOKUP_MARK)
                        fl4.flowi4_mark = params->mark;
                else
                        fl4.flowi4_mark = 0;
                fl4.flowi4_secid = 0;
                fl4.flowi4_tun_key.tun_id = 0;
                fl4.flowi4_uid = sock_net_uid(net, NULL);

                err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
        }

        if (err) {
                /* map fib lookup errors to RTN_ type */
                if (err == -EINVAL)
                        return BPF_FIB_LKUP_RET_BLACKHOLE;
                if (err == -EHOSTUNREACH)
                        return BPF_FIB_LKUP_RET_UNREACHABLE;
                if (err == -EACCES)
                        return BPF_FIB_LKUP_RET_PROHIBIT;

                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        if (res.type != RTN_UNICAST)
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        /* Multipath route: pick one of the paths */
        if (fib_info_num_path(res.fi) > 1)
                fib_select_path(net, &res, &fl4, NULL);

        if (check_mtu) {
                mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
                if (params->tot_len > mtu) {
                        params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
                }
        }

        nhc = res.nhc;

        /* do not handle lwt encaps right now */
        if (nhc->nhc_lwtstate)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        dev = nhc->nhc_dev;

        params->rt_metric = res.fi->fib_priority;
        params->ifindex = dev->ifindex;

        if (flags & BPF_FIB_LOOKUP_SRC)
                params->ipv4_src = fib_result_prefsrc(net, &res);

        /* xdp and cls_bpf programs are run in RCU-bh so
         * rcu_read_lock_bh is not needed here
         */
        if (likely(nhc->nhc_gw_family != AF_INET6)) {
                if (nhc->nhc_gw_family)
                        params->ipv4_dst = nhc->nhc_gw.ipv4;
        } else {
                /* IPv4 route via an IPv6 gateway: report the IPv6 next hop */
                struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;

                params->family = AF_INET6;
                *dst = nhc->nhc_gw.ipv6;
        }

        if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
                goto set_fwd_params;

        if (likely(nhc->nhc_gw_family != AF_INET6))
                neigh = __ipv4_neigh_lookup_noref(dev,
                                                  (__force u32)params->ipv4_dst);
        else
                neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);

        /* Only neighbours in a NUD_VALID state provide a usable MAC */
        if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
        return bpf_fib_set_fwd_params(params, mtu);
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 route (and optionally neighbour) lookup behind the fib_lookup helpers.
 *
 * @net:       network namespace in which to resolve params->ifindex and routes
 * @params:    in/out lookup parameters supplied by the BPF program
 * @flags:     BPF_FIB_LOOKUP_* flags
 * @check_mtu: when true, compare params->tot_len with the route MTU
 *
 * Returns -ENODEV if params->ifindex does not name a device in @net, a
 * BPF_FIB_LKUP_RET_* code for the various non-forwardable outcomes, or the
 * result of bpf_fib_set_fwd_params() (0) on success.
 *
 * On success params is updated with the route metric, the egress ifindex,
 * the gateway address when the next hop has one, optionally a source address
 * (BPF_FIB_LOOKUP_SRC) and, unless BPF_FIB_LOOKUP_SKIP_NEIGH is set, the
 * destination and source MAC addresses.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
                               u32 flags, bool check_mtu)
{
        /* src/dst alias the address storage inside params, so writes through
         * them update the caller-visible parameters.
         */
        struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
        struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
        struct fib6_result res = {};
        struct neighbour *neigh;
        struct net_device *dev;
        struct inet6_dev *idev;
        struct flowi6 fl6;
        int strict = 0;
        int oif, err;
        u32 mtu = 0;

        /* link local addresses are never forwarded */
        if (rt6_need_strict(dst) || rt6_need_strict(src))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        dev = dev_get_by_index_rcu(net, params->ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        /* verify forwarding is enabled on this interface */
        idev = __in6_dev_get_safely(dev);
        if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
                return BPF_FIB_LKUP_RET_FWD_DISABLED;

        /* params->ifindex is the egress device for OUTPUT lookups and the
         * ingress device otherwise.
         */
        if (flags & BPF_FIB_LOOKUP_OUTPUT) {
                fl6.flowi6_iif = 1;
                oif = fl6.flowi6_oif = params->ifindex;
        } else {
                oif = fl6.flowi6_iif = params->ifindex;
                fl6.flowi6_oif = 0;
                strict = RT6_LOOKUP_F_HAS_SADDR;
        }
        fl6.flowlabel = params->flowinfo;
        fl6.flowi6_scope = 0;
        fl6.flowi6_flags = 0;
        fl6.mp_hash = 0;

        fl6.flowi6_proto = params->l4_protocol;
        fl6.daddr = *dst;
        fl6.saddr = *src;
        fl6.fl6_sport = params->sport;
        fl6.fl6_dport = params->dport;

        if (flags & BPF_FIB_LOOKUP_DIRECT) {
                /* Direct mode: query one table (the device's l3mdev table,
                 * else main, else the caller-provided tbid) directly.
                 */
                u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
                struct fib6_table *tb;

                if (flags & BPF_FIB_LOOKUP_TBID) {
                        tbid = params->tbid;
                        /* zero out for vlan output */
                        params->tbid = 0;
                }

                tb = ipv6_stub->fib6_get_table(net, tbid);
                if (unlikely(!tb))
                        return BPF_FIB_LKUP_RET_NOT_FWDED;

                err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
                                                   strict);
        } else {
                /* Full lookup via ->fib6_lookup(); the mark is only honoured
                 * when BPF_FIB_LOOKUP_MARK is set.
                 */
                if (flags & BPF_FIB_LOOKUP_MARK)
                        fl6.flowi6_mark = params->mark;
                else
                        fl6.flowi6_mark = 0;
                fl6.flowi6_secid = 0;
                fl6.flowi6_tun_key.tun_id = 0;
                fl6.flowi6_uid = sock_net_uid(net, NULL);

                err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
        }

        /* A lookup error, a missing entry or the namespace's null entry all
         * mean there is nothing to forward to.
         */
        if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
                     res.f6i == net->ipv6.fib6_null_entry))
                return BPF_FIB_LKUP_RET_NOT_FWDED;

        switch (res.fib6_type) {
        /* only unicast is forwarded */
        case RTN_UNICAST:
                break;
        case RTN_BLACKHOLE:
                return BPF_FIB_LKUP_RET_BLACKHOLE;
        case RTN_UNREACHABLE:
                return BPF_FIB_LKUP_RET_UNREACHABLE;
        case RTN_PROHIBIT:
                return BPF_FIB_LKUP_RET_PROHIBIT;
        default:
                return BPF_FIB_LKUP_RET_NOT_FWDED;
        }

        ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
                                    fl6.flowi6_oif != 0, NULL, strict);

        if (check_mtu) {
                mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
                if (params->tot_len > mtu) {
                        params->mtu_result = mtu; /* union with tot_len */
                        return BPF_FIB_LKUP_RET_FRAG_NEEDED;
                }
        }

        /* lwt encaps are not handled */
        if (res.nh->fib_nh_lws)
                return BPF_FIB_LKUP_RET_UNSUPP_LWT;

        /* Route via a gateway: report the gateway as the next-hop address */
        if (res.nh->fib_nh_gw_family)
                *dst = res.nh->fib_nh_gw6;

        dev = res.nh->fib_nh_dev;
        params->rt_metric = res.f6i->fib6_metric;
        params->ifindex = dev->ifindex;

        if (flags & BPF_FIB_LOOKUP_SRC) {
                /* Use the route's preferred source if it has one, otherwise
                 * ask ->ipv6_dev_get_saddr() to pick one for the egress
                 * device.
                 */
                if (res.f6i->fib6_prefsrc.plen) {
                        *src = res.f6i->fib6_prefsrc.addr;
                } else {
                        err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
                                                                &fl6.daddr, 0,
                                                                src);
                        if (err)
                                return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
                }
        }

        if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
                goto set_fwd_params;

        /* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
         * not needed here.
         */
        neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
        /* Only neighbours in a NUD_VALID state provide a usable MAC */
        if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
                return BPF_FIB_LKUP_RET_NO_NEIGH;
        memcpy(params->dmac, neigh->ha, ETH_ALEN);
        memcpy(params->smac, dev->dev_addr, ETH_ALEN);

set_fwd_params:
        return bpf_fib_set_fwd_params(params, mtu);
}
#endif

/* All flag bits accepted by the fib lookup helpers; a call with any bit
 * outside this mask is rejected with -EINVAL.
 */
#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
                             BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
                             BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)

/* Helper: FIB lookup for XDP programs.
 *
 * Returns -EINVAL when @plen is smaller than struct bpf_fib_lookup or @flags
 * has bits outside BPF_FIB_LOOKUP_MASK, and -EAFNOSUPPORT when params->family
 * is not a compiled-in address family. Otherwise dispatches to the IPv4 or
 * IPv6 lookup in the namespace of the receiving device, always with the MTU
 * check enabled.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        if (plen < sizeof(*params) || (flags & ~BPF_FIB_LOOKUP_MASK))
                return -EINVAL;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
                                           flags, true);
#endif
        default:
                return -EAFNOSUPPORT;
        }
}

static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
        .func                = bpf_xdp_fib_lookup,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Helper: FIB lookup for skb-based programs.
 *
 * Returns -EINVAL when @plen is smaller than struct bpf_fib_lookup or @flags
 * has bits outside BPF_FIB_LOOKUP_MASK, and -EAFNOSUPPORT when params->family
 * is not a compiled-in address family. Otherwise returns the result of the
 * IPv4/IPv6 lookup in the namespace of skb->dev.
 *
 * The MTU check inside the lookup is only requested when the caller supplied
 * a non-zero params->tot_len. Without it, a successful lookup is followed by
 * a check of the skb itself against the resulting device, and that device's
 * MTU is reported in params->mtu_result.
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
           struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
        struct net *net = dev_net(skb->dev);
        struct net_device *out_dev;
        bool check_mtu;
        int rc;

        if (plen < sizeof(*params) || (flags & ~BPF_FIB_LOOKUP_MASK))
                return -EINVAL;

        check_mtu = params->tot_len != 0;

        switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
        case AF_INET:
                rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
                break;
#endif
        default:
                rc = -EAFNOSUPPORT;
                break;
        }

        if (rc != BPF_FIB_LKUP_RET_SUCCESS || check_mtu)
                return rc;

        /* tot_len was not provided by the user, so check the skb against
         * the MTU of the net_device the FIB lookup resolved to.
         * NOTE(review): the result of dev_get_by_index_rcu() is used without
         * a NULL check - confirm the device cannot disappear from the index
         * between the lookup above and this point.
         */
        out_dev = dev_get_by_index_rcu(net, params->ifindex);
        if (!is_skb_forwardable(out_dev, skb))
                rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;

        params->mtu_result = out_dev->mtu; /* union with tot_len */

        return rc;
}

static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
        .func                = bpf_skb_fib_lookup,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_MEM,
        .arg3_type      = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Resolve the device an MTU check should run against.
 *
 * An @ifindex of 0 selects @dev_curr itself, so non-redirect use-cases can
 * skip the lookup. Any other value is looked up in @dev_curr's network
 * namespace; the result is NULL when no such device exists.
 */
static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
                                            u32 ifindex)
{
        struct net *netns = dev_net(dev_curr);
        struct net_device *dev = dev_curr;

        if (ifindex != 0)
                dev = dev_get_by_index_rcu(netns, ifindex);

        return dev;
}

/* Helper: check a packet length against a device MTU for skb-based programs.
 *
 * The length checked is *mtu_len plus the device's hard header length when
 * *mtu_len is non-zero, otherwise skb->len; @len_diff is added in both cases.
 * On every path that reaches the device lookup successfully, *mtu_len is
 * overwritten with the device MTU.
 *
 * Returns -EINVAL for unknown flags, or when BPF_MTU_CHK_SEGS is combined
 * with a non-zero @len_diff or *mtu_len; -ENODEV when @ifindex does not
 * resolve; otherwise one of the BPF_MTU_CHK_RET_* codes.
 */
BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
           u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
        struct net_device *dev = skb->dev;
        int mtu, dev_len, skb_len;
        int ret;

        if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
                return -EINVAL;
        if (unlikely((flags & BPF_MTU_CHK_SEGS) && (len_diff || *mtu_len)))
                return -EINVAL;

        dev = __dev_via_ifindex(dev, ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        mtu = READ_ONCE(dev->mtu);
        dev_len = mtu + dev->hard_header_len;

        /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
        if (*mtu_len)
                skb_len = *mtu_len + dev->hard_header_len;
        else
                skb_len = skb->len;

        skb_len += len_diff; /* minus result pass check */

        if (skb_len <= dev_len) {
                ret = BPF_MTU_CHK_RET_SUCCESS;
        } else if (!skb_is_gso(skb)) {
                ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
        } else if ((flags & BPF_MTU_CHK_SEGS) &&
                   !skb_gso_validate_network_len(skb, mtu)) {
                /* skb->len exceeds the MTU, but it includes the length of
                 * all segments. The SKB can possibly get re-segmented in the
                 * transmit path (see validate_xmit_skb), so the individual
                 * segments are only MTU checked when the user asked for it.
                 */
                ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
        } else {
                ret = BPF_MTU_CHK_RET_SUCCESS;
        }

        *mtu_len = mtu;
        return ret;
}

/* Helper: check a packet length against a device MTU for XDP programs.
 *
 * The length checked is *mtu_len plus the device's hard header length when
 * *mtu_len is non-zero, otherwise the span between xdp->data and
 * xdp->data_end; @len_diff is added in both cases. *mtu_len is overwritten
 * with the device MTU before returning a BPF_MTU_CHK_RET_* code.
 *
 * Returns -EINVAL when any flag is set and -ENODEV when @ifindex does not
 * resolve.
 */
BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
           u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
{
        struct net_device *dev = xdp->rxq->dev;
        int mtu, dev_len, pkt_len;

        /* XDP variant doesn't support multi-buffer segment check (yet) */
        if (unlikely(flags))
                return -EINVAL;

        dev = __dev_via_ifindex(dev, ifindex);
        if (unlikely(!dev))
                return -ENODEV;

        mtu = READ_ONCE(dev->mtu);
        dev_len = mtu + dev->hard_header_len;

        /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
        if (*mtu_len)
                pkt_len = *mtu_len + dev->hard_header_len;
        else
                pkt_len = xdp->data_end - xdp->data;

        pkt_len += len_diff; /* minus result pass check */

        *mtu_len = mtu;

        if (pkt_len > dev_len)
                return BPF_MTU_CHK_RET_FRAG_NEEDED;

        return BPF_MTU_CHK_RET_SUCCESS;
}

/* Argument/return description of bpf_skb_check_mtu(): a context pointer, an
 * unconstrained scalar (ifindex), a pointer to an aligned, writable,
 * fixed-size region of sizeof(u32) bytes (mtu_len, used as input and output),
 * and two unconstrained scalars (len_diff, flags). The helper returns an
 * integer and is restricted to GPL-compatible programs.
 */
static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
        .func                = bpf_skb_check_mtu,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
        .arg3_size        = sizeof(u32),
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

/* Argument/return description of bpf_xdp_check_mtu(): a context pointer, an
 * unconstrained scalar (ifindex), a pointer to an aligned, writable,
 * fixed-size region of sizeof(u32) bytes (mtu_len, used as input and output),
 * and two unconstrained scalars (len_diff, flags). The helper returns an
 * integer and is restricted to GPL-compatible programs.
 */
static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
        .func                = bpf_xdp_check_mtu,
        .gpl_only        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
        .arg3_size        = sizeof(u32),
        .arg4_type      = ARG_ANYTHING,
        .arg5_type      = ARG_ANYTHING,
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Push a segment routing header supplied by a BPF program onto @skb.
 *
 * @type selects inline insertion (BPF_LWT_ENCAP_SEG6_INLINE, IPv6 packets
 * only) or encapsulation (BPF_LWT_ENCAP_SEG6). @hdr/@len describe the SRH,
 * which must pass seg6_validate_srh().
 *
 * Returns -EINVAL for an invalid SRH or unknown @type, -EBADMSG for inline
 * mode on a non-IPv6 skb, the error of the seg6 push routine if it fails,
 * and otherwise the result of seg6_lookup_nexthop().
 */
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
        struct ipv6_sr_hdr *sr_hdr = hdr;
        int ret;

        if (!seg6_validate_srh(sr_hdr, len, false))
                return -EINVAL;

        if (type == BPF_LWT_ENCAP_SEG6_INLINE) {
                if (skb->protocol != htons(ETH_P_IPV6))
                        return -EBADMSG;

                ret = seg6_do_srh_inline(skb, sr_hdr);
        } else if (type == BPF_LWT_ENCAP_SEG6) {
                skb_reset_inner_headers(skb);
                skb->encapsulation = 1;
                ret = seg6_do_srh_encap(skb, sr_hdr, IPPROTO_IPV6);
        } else {
                return -EINVAL;
        }

        /* Refresh the data pointers before looking at the result, so they
         * are recomputed on the failure path as well.
         */
        bpf_compute_data_pointers(skb);
        if (ret)
                return ret;

        skb_set_transport_header(skb, sizeof(struct ipv6hdr));

        return seg6_lookup_nexthop(skb, NULL, 0);
}
#endif /* CONFIG_IPV6_SEG6_BPF */

#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
/* Thin wrapper: push an IP encapsulation header via bpf_lwt_push_ip_encap()
 * and return its result unchanged.
 */
static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
                             bool ingress)
{
        int err = bpf_lwt_push_ip_encap(skb, hdr, len, ingress);

        return err;
}
#endif

/* Helper: push an encapsulation header from an LWT input program.
 *
 * Dispatches on @type: the two seg6 modes go to bpf_push_seg6_encap() (only
 * with CONFIG_IPV6_SEG6_BPF), BPF_LWT_ENCAP_IP goes to bpf_push_ip_encap()
 * in ingress mode (only with CONFIG_LWTUNNEL_BPF). Any other or compiled-out
 * type returns -EINVAL.
 */
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
           u32, len)
{
        switch (type) {
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        case BPF_LWT_ENCAP_SEG6:
        case BPF_LWT_ENCAP_SEG6_INLINE:
                return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        case BPF_LWT_ENCAP_IP:
                return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
        default:
                return -EINVAL;
        }
}

/* Helper: push an encapsulation header from an LWT xmit program.
 *
 * Only BPF_LWT_ENCAP_IP is handled here (with CONFIG_LWTUNNEL_BPF), via
 * bpf_push_ip_encap() in egress mode. Every other type, including the seg6
 * modes, returns -EINVAL.
 */
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
           void *, hdr, u32, len)
{
        switch (type) {
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
        case BPF_LWT_ENCAP_IP:
                return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
        default:
                return -EINVAL;
        }
}

/* Argument/return description of bpf_lwt_in_push_encap(): a context pointer,
 * an unconstrained scalar (type), and a header buffer declared read-only
 * whose length is given by the following constant-size argument. The helper
 * returns an integer.
 */
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
        .func                = bpf_lwt_in_push_encap,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE
};

/* Argument/return description of bpf_lwt_xmit_push_encap(): a context
 * pointer, an unconstrained scalar (type), and a header buffer declared
 * read-only whose length is given by the following constant-size argument.
 * The helper returns an integer.
 */
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
        .func                = bpf_lwt_xmit_push_encap,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE
};

#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
/* Helper: overwrite @len bytes at @offset (relative to skb->data) inside the
 * segment routing header tracked in this CPU's seg6_bpf_srh_states.
 *
 * Two write windows are accepted:
 *  - a range lying entirely within [srh_tlvs, srh_end]; this marks the
 *    tracked SRH state as not valid;
 *  - a range lying between the SRH flags field and the start of the segments
 *    field.
 * Anything else returns -EFAULT.
 *
 * Returns 0 on success, -EINVAL when no SRH is tracked or the routing header
 * cannot be found again after the skb was made writable, and -EFAULT when the
 * range is rejected or the skb cannot be made writable.
 */
BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
           const void *, from, u32, len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = srh_state->srh;
        void *srh_tlvs, *srh_end, *ptr;
        int srhoff = 0;

        lockdep_assert_held(&srh_state->bh_lock);
        if (srh == NULL)
                return -EINVAL;

        /* NOTE(review): srh_tlvs is computed without adding sizeof(*srh),
         * whereas srh_end includes it - confirm this offset is intended.
         */
        srh_tlvs = (void *)((char *)srh + ((srh->first_segment + 1) << 4));
        srh_end = (void *)((char *)srh + sizeof(*srh) + srh_state->hdrlen);

        ptr = skb->data + offset;
        if (ptr >= srh_tlvs && ptr + len <= srh_end)
                srh_state->valid = false;
        else if (ptr < (void *)&srh->flags ||
                 ptr + len > (void *)&srh->segments)
                return -EFAULT;

        if (unlikely(bpf_try_make_writable(skb, offset + len)))
                return -EFAULT;
        /* Re-locate the SRH and refresh the cached pointer before writing,
         * as the previous pointer is not reused past bpf_try_make_writable().
         */
        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;
        srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);

        memcpy(skb->data + offset, from, len);
        return 0;
}

/* Argument/return description of bpf_lwt_seg6_store_bytes(): a context
 * pointer, an unconstrained scalar (offset), and a source buffer declared
 * read-only whose length is given by the following constant-size argument.
 * The helper returns an integer.
 */
static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
        .func                = bpf_lwt_seg6_store_bytes,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type        = ARG_CONST_SIZE
};

static void bpf_update_srh_state(struct sk_buff *skb)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int srhoff = 0;

        if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) {
                srh_state->srh = NULL;
        } else {
                srh_state->srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
                srh_state->hdrlen = srh_state->srh->hdrlen << 3;
                srh_state->valid = true;
        }
}

/* Helper: apply a seg6local-style action to @skb from a BPF program.
 *
 * @action selects the behaviour; @param/@param_len carry its argument:
 *  - END_X:        @param is a struct in6_addr next hop
 *  - END_T:        @param is an int passed to seg6_lookup_nexthop()
 *  - END_DT6:      @param is an int; the outer headers up to the inner IPv6
 *                  header are pulled before the lookup
 *  - END_B6:       @param is an SRH pushed inline
 *  - END_B6_ENCAP: @param is an SRH pushed with encapsulation
 *
 * Returns -EBADMSG when the current SRH fails seg6_bpf_has_valid_srh() (for
 * the B6 actions only if an SRH is tracked) or the inner header cannot be
 * found/pulled, -EINVAL for a wrong @param_len or unknown action, and
 * otherwise the result of the next-hop lookup or the seg6 push.
 */
BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
           u32, action, void *, param, u32, param_len)
{
        struct seg6_bpf_srh_state *srh_state =
                this_cpu_ptr(&seg6_bpf_srh_states);
        int hdroff = 0;
        int err;

        lockdep_assert_held(&srh_state->bh_lock);
        switch (action) {
        case SEG6_LOCAL_ACTION_END_X:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(struct in6_addr))
                        return -EINVAL;
                return seg6_lookup_nexthop(skb, (struct in6_addr *)param, 0);
        case SEG6_LOCAL_ACTION_END_T:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;
                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
        case SEG6_LOCAL_ACTION_END_DT6:
                if (!seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                if (param_len != sizeof(int))
                        return -EINVAL;

                /* Strip everything in front of the inner IPv6 header */
                if (ipv6_find_hdr(skb, &hdroff, IPPROTO_IPV6, NULL, NULL) < 0)
                        return -EBADMSG;
                if (!pskb_pull(skb, hdroff))
                        return -EBADMSG;

                skb_postpull_rcsum(skb, skb_network_header(skb), hdroff);
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                skb->encapsulation = 0;

                /* The packet layout changed: refresh the data pointers and
                 * the tracked SRH state before the lookup.
                 */
                bpf_compute_data_pointers(skb);
                bpf_update_srh_state(skb);
                return seg6_lookup_nexthop(skb, NULL, *(int *)param);
        case SEG6_LOCAL_ACTION_END_B6:
                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6_INLINE,
                                          param, param_len);
                /* Track the newly pushed SRH only if the push succeeded */
                if (!err)
                        bpf_update_srh_state(skb);

                return err;
        case SEG6_LOCAL_ACTION_END_B6_ENCAP:
                if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
                        return -EBADMSG;
                err = bpf_push_seg6_encap(skb, BPF_LWT_ENCAP_SEG6,
                                          param, param_len);
                /* Track the newly pushed SRH only if the push succeeded */
                if (!err)
                        bpf_update_srh_state(skb);

                return err;
        default:
                return -EINVAL;
        }
}

/* Prototype descriptor for bpf_lwt_seg6_action(): a context pointer, an
 * ARG_ANYTHING value, then a read-only memory pointer followed by its
 * constant size. The helper yields an integer and is not GPL-only.
 */
static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
        .func      = bpf_lwt_seg6_action,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg4_type = ARG_CONST_SIZE,
};

/* Resize the packet by @len bytes at @offset, where @offset must point into
 * the part of the SRH that follows the segment list, as tracked by the
 * per-CPU seg6_bpf_srh_state.
 *
 * For len > 0 head room is ensured and bpf_skb_net_hdr_push() is called;
 * otherwise bpf_skb_net_hdr_pop() is called with -len. On success the IPv6
 * payload length is rewritten, the cached SRH pointer and header length are
 * refreshed and the state is marked as not valid.
 *
 * Returns 0 on success, -EINVAL when no SRH is recorded or no routing header
 * can be found afterwards, -EFAULT when the requested range is outside the
 * permitted area, or the negative error from the skb helpers.
 */
BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
           s32, len)
{
        struct seg6_bpf_srh_state *state = this_cpu_ptr(&seg6_bpf_srh_states);
        struct ipv6_sr_hdr *srh = state->srh;
        void *tlv_begin, *tlv_limit, *where;
        struct ipv6hdr *ip6h;
        int rt_off = 0;
        int err;

        lockdep_assert_held(&state->bh_lock);
        if (unlikely(!srh))
                return -EINVAL;

        /* The adjustable area starts after the fixed header plus
         * (first_segment + 1) entries of 16 bytes each.
         */
        tlv_begin = (void *)((unsigned char *)srh + sizeof(*srh) +
                             ((srh->first_segment + 1) << 4));
        tlv_limit = (void *)((unsigned char *)srh + sizeof(*srh) +
                             state->hdrlen);
        where = skb->data + offset;

        if (unlikely(where < tlv_begin || where > tlv_limit))
                return -EFAULT;
        /* When shrinking, the removed span must not extend past the limit. */
        if (unlikely(len < 0 && (void *)((char *)where - len) > tlv_limit))
                return -EFAULT;

        if (len > 0) {
                err = skb_cow_head(skb, len);
                if (unlikely(err < 0))
                        return err;

                err = bpf_skb_net_hdr_push(skb, offset, len);
        } else {
                err = bpf_skb_net_hdr_pop(skb, offset, -len);
        }

        /* Data pointers are recomputed even when the resize failed. */
        bpf_compute_data_pointers(skb);
        if (unlikely(err < 0))
                return err;

        ip6h = (struct ipv6hdr *)skb->data;
        ip6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr));

        if (ipv6_find_hdr(skb, &rt_off, IPPROTO_ROUTING, NULL, NULL) < 0)
                return -EINVAL;

        state->srh = (struct ipv6_sr_hdr *)(skb->data + rt_off);
        state->hdrlen += len;
        state->valid = false;
        return 0;
}

/* Prototype descriptor for bpf_lwt_seg6_adjust_srh(): a context pointer
 * followed by two ARG_ANYTHING values; integer result, not GPL-only.
 */
static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
        .func      = bpf_lwt_seg6_adjust_srh,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_ANYTHING,
};
#endif /* CONFIG_IPV6_SEG6_BPF */

#ifdef CONFIG_INET
/* Find a socket in @net matching the addresses and ports in @tuple.
 *
 * AF_INET tuples use __inet_lookup() for TCP and __udp4_lib_lookup() for any
 * other protocol. When IPv6 is enabled, every other family is handled as
 * IPv6: __inet6_lookup() for TCP, otherwise ipv6_bpf_stub->udp6_lib_lookup()
 * if the stub is available.
 *
 * A result that is neither reported as refcounted by the lookup nor flagged
 * SOCK_RCU_FREE triggers a one-time warning and is dropped.
 *
 * Returns the socket found, or NULL.
 */
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
                              int dif, int sdif, u8 family, u8 proto)
{
        struct inet_hashinfo *tcp_hash = net->ipv4.tcp_death_row.hashinfo;
        const bool is_tcp = proto == IPPROTO_TCP;
        struct sock *found = NULL;
        bool has_ref = false;

        if (family == AF_INET) {
                __be32 saddr = tuple->ipv4.saddr;
                __be32 daddr = tuple->ipv4.daddr;

                if (is_tcp)
                        found = __inet_lookup(net, tcp_hash, NULL, 0,
                                              saddr, tuple->ipv4.sport,
                                              daddr, tuple->ipv4.dport,
                                              dif, sdif, &has_ref);
                else
                        found = __udp4_lib_lookup(net, saddr, tuple->ipv4.sport,
                                                  daddr, tuple->ipv4.dport,
                                                  dif, sdif, net->ipv4.udp_table, NULL);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                struct in6_addr *saddr6 = (struct in6_addr *)&tuple->ipv6.saddr;
                struct in6_addr *daddr6 = (struct in6_addr *)&tuple->ipv6.daddr;

                if (is_tcp)
                        found = __inet6_lookup(net, tcp_hash, NULL, 0,
                                               saddr6, tuple->ipv6.sport,
                                               daddr6, ntohs(tuple->ipv6.dport),
                                               dif, sdif, &has_ref);
                else if (likely(ipv6_bpf_stub))
                        found = ipv6_bpf_stub->udp6_lib_lookup(net,
                                                               saddr6, tuple->ipv6.sport,
                                                               daddr6, tuple->ipv6.dport,
                                                               dif, sdif,
                                                               net->ipv4.udp_table, NULL);
#endif
        }

        /* Never hand out a socket that is neither held nor RCU-freed. */
        if (unlikely(found && !(has_ref || sock_flag(found, SOCK_RCU_FREE)))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                found = NULL;
        }
        return found;
}

/* __bpf_skc_lookup is the common lookup path for the socket lookup helpers;
 * it takes a reference on the socket if it doesn't have the flag
 * SOCK_RCU_FREE.
 *
 * @len selects the address family: it must equal the size of either the
 * ipv4 or the ipv6 member of @tuple, otherwise NULL is returned. @flags must
 * be zero, and @netns_id must either be negative when viewed as s32 (search
 * @caller_net) or not exceed S32_MAX (search the namespace resolved through
 * get_net_ns_by_id()). A negative @sdif is replaced by inet_sdif() or
 * inet6_sdif() of @skb.
 *
 * Returns the socket found, or NULL.
 */
static struct sock *
__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                 struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                 u64 flags, int sdif)
{
        struct sock *sk;
        struct net *net;
        u8 family;

        if (len == sizeof(tuple->ipv4))
                family = AF_INET;
        else if (len == sizeof(tuple->ipv6))
                family = AF_INET6;
        else
                return NULL;

        if (unlikely(flags || ((s32)netns_id >= 0 && netns_id > S32_MAX)))
                return NULL;

        if (sdif < 0)
                sdif = family == AF_INET ? inet_sdif(skb) : inet6_sdif(skb);

        if ((s32)netns_id < 0)
                return sk_lookup(caller_net, tuple, ifindex, sdif, family,
                                 proto);

        net = get_net_ns_by_id(caller_net, netns_id);
        if (unlikely(!net))
                return NULL;

        sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
        /* Drop the namespace obtained from get_net_ns_by_id(). */
        put_net(net);

        return sk;
}

/* Run __bpf_skc_lookup() and convert the result with sk_to_full_sk().
 *
 * When the conversion yields a different socket, the socket originally found
 * is released and the converted one is returned, unless it lacks
 * SOCK_RCU_FREE, in which case a one-time warning is issued and NULL is
 * returned.
 */
static struct sock *
__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
                struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
                u64 flags, int sdif)
{
        struct sock *sk, *full;

        sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                              netns_id, flags, sdif);
        if (!sk)
                return NULL;

        full = sk_to_full_sk(sk);
        if (full == sk)
                return sk;

        /* sk_to_full_sk() may return (sk)->rsk_listener, so the reference on
         * the original sk is dropped here to prevent a request_sock leak.
         */
        sock_gen_put(sk);

        /* Ensure there is no need to bump the refcnt of the full socket. */
        if (unlikely(full && !sock_flag(full, SOCK_RCU_FREE))) {
                WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                return NULL;
        }

        return full;
}

/* skb-based front end to __bpf_skc_lookup().
 *
 * The calling namespace and interface index come from skb->dev when it is
 * set; otherwise the namespace of skb->sk is used with ifindex 0. sdif is
 * passed as -1 so that __bpf_skc_lookup() derives it from the skb.
 */
static struct sock *
bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
               u8 proto, u64 netns_id, u64 flags)
{
        struct net *caller_net;
        int ifindex = 0;

        if (!skb->dev) {
                caller_net = sock_net(skb->sk);
        } else {
                ifindex = skb->dev->ifindex;
                caller_net = dev_net(skb->dev);
        }

        return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
                                netns_id, flags, -1);
}

/* Run bpf_skc_lookup() and convert the result with sk_to_full_sk().
 *
 * If the conversion returns a different socket, the one originally found is
 * released. The converted socket is then returned, except that one lacking
 * SOCK_RCU_FREE produces a one-time warning and a NULL result.
 */
static struct sock *
bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
              u8 proto, u64 netns_id, u64 flags)
{
        struct sock *full;
        struct sock *sk;

        sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id, flags);
        if (!sk)
                return NULL;

        full = sk_to_full_sk(sk);
        if (full != sk) {
                /* sk_to_full_sk() may return (sk)->rsk_listener; release the
                 * original sk so that no request_sock is leaked.
                 */
                sock_gen_put(sk);

                /* Ensure there is no need to bump the refcnt of full. */
                if (unlikely(full && !sock_flag(full, SOCK_RCU_FREE))) {
                        WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
                        return NULL;
                }
        }

        return full;
}

/* Helper entry point: TCP lookup through bpf_skc_lookup(); the socket
 * pointer (or NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_skc_lookup_tcp(): context, read-only tuple
 * memory with its size (ARG_CONST_SIZE_OR_ZERO), then two ARG_ANYTHING
 * values. Result type is RET_PTR_TO_SOCK_COMMON_OR_NULL.
 */
static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
        .func       = bpf_skc_lookup_tcp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup through bpf_sk_lookup(); the socket
 * pointer (or NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_sk_lookup_tcp(): context, read-only tuple
 * memory with its size (ARG_CONST_SIZE_OR_ZERO), then two ARG_ANYTHING
 * values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
        .func       = bpf_sk_lookup_tcp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: UDP lookup through bpf_sk_lookup(); the socket
 * pointer (or NULL) is returned as an unsigned long.
 */
BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct sock *sk;

        sk = bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_sk_lookup_udp(): context, read-only tuple
 * memory with its size (ARG_CONST_SIZE_OR_ZERO), then two ARG_ANYTHING
 * values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
        .func       = bpf_sk_lookup_udp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup via __bpf_skc_lookup(), with the interface
 * index, sdif (from dev_sdif()) and calling namespace all taken from
 * skb->dev.
 */
BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct net *caller_net = dev_net(dev);
        struct sock *sk;

        sk = __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex,
                              IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_tc_skc_lookup_tcp(): context, read-only
 * tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCK_COMMON_OR_NULL.
 */
static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
        .func       = bpf_tc_skc_lookup_tcp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup via __bpf_sk_lookup(), with the interface
 * index, sdif (from dev_sdif()) and calling namespace all taken from
 * skb->dev.
 */
BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct net *caller_net = dev_net(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
                             IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_tc_sk_lookup_tcp(): context, read-only tuple
 * memory with its size (ARG_CONST_SIZE_OR_ZERO), then two ARG_ANYTHING
 * values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
        .func       = bpf_tc_sk_lookup_tcp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: UDP lookup via __bpf_sk_lookup(), with the interface
 * index, sdif (from dev_sdif()) and calling namespace all taken from
 * skb->dev.
 */
BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net_device *dev = skb->dev;
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct net *caller_net = dev_net(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(skb, tuple, len, caller_net, ifindex,
                             IPPROTO_UDP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_tc_sk_lookup_udp(): context, read-only tuple
 * memory with its size (ARG_CONST_SIZE_OR_ZERO), then two ARG_ANYTHING
 * values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
        .func       = bpf_tc_sk_lookup_udp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Release @sk with sock_gen_put() when it is non-NULL and
 * sk_is_refcounted() says it holds a reference. Always returns 0.
 */
BPF_CALL_1(bpf_sk_release, struct sock *, sk)
{
        if (!sk || !sk_is_refcounted(sk))
                return 0;

        sock_gen_put(sk);
        return 0;
}

/* Prototype descriptor for bpf_sk_release(): a single sock_common BTF
 * pointer argument tagged OBJ_RELEASE; integer result, not GPL-only.
 */
static const struct bpf_func_proto bpf_sk_release_proto = {
        .func      = bpf_sk_release,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
};

/* Helper entry point: UDP lookup via __bpf_sk_lookup() with no skb. The
 * interface index, sdif (from dev_sdif()) and calling namespace come from
 * the receive queue's device, ctx->rxq->dev.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct net *caller_net = dev_net(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
                             IPPROTO_UDP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_xdp_sk_lookup_udp(): context, read-only
 * tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
        .func       = bpf_xdp_sk_lookup_udp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup via __bpf_skc_lookup() with no skb. The
 * interface index, sdif (from dev_sdif()) and calling namespace come from
 * the receive queue's device, ctx->rxq->dev.
 */
BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct net *caller_net = dev_net(dev);
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, caller_net, ifindex,
                              IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_xdp_skc_lookup_tcp(): context, read-only
 * tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCK_COMMON_OR_NULL.
 */
static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
        .func       = bpf_xdp_skc_lookup_tcp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup via __bpf_sk_lookup() with no skb. The
 * interface index, sdif (from dev_sdif()) and calling namespace come from
 * the receive queue's device, ctx->rxq->dev.
 */
BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
{
        struct net_device *dev = ctx->rxq->dev;
        int ifindex = dev->ifindex;
        int sdif = dev_sdif(dev);
        struct net *caller_net = dev_net(dev);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, ifindex,
                             IPPROTO_TCP, netns_id, flags, sdif);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_xdp_sk_lookup_tcp(): context, read-only
 * tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
        .func       = bpf_xdp_sk_lookup_tcp,
        .gpl_only   = false,
        .pkt_access = true,
        .ret_type   = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type  = ARG_PTR_TO_CTX,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type  = ARG_ANYTHING,
        .arg5_type  = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup via __bpf_skc_lookup() with no skb, using
 * the namespace of ctx->sk, ifindex 0 and sdif -1.
 */
BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_skc_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                              netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_sock_addr_skc_lookup_tcp(): context,
 * read-only tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCK_COMMON_OR_NULL.
 */
static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
        .func      = bpf_sock_addr_skc_lookup_tcp,
        .gpl_only  = false,
        .ret_type  = RET_PTR_TO_SOCK_COMMON_OR_NULL,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Helper entry point: TCP lookup via __bpf_sk_lookup() with no skb, using
 * the namespace of ctx->sk, ifindex 0 and sdif -1.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_TCP,
                             netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_sock_addr_sk_lookup_tcp(): context,
 * read-only tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
        .func      = bpf_sock_addr_sk_lookup_tcp,
        .gpl_only  = false,
        .ret_type  = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Helper entry point: UDP lookup via __bpf_sk_lookup() with no skb, using
 * the namespace of ctx->sk, ifindex 0 and sdif -1.
 */
BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
           struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
{
        struct net *caller_net = sock_net(ctx->sk);
        struct sock *sk;

        sk = __bpf_sk_lookup(NULL, tuple, len, caller_net, 0, IPPROTO_UDP,
                             netns_id, flags, -1);

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_sock_addr_sk_lookup_udp(): context,
 * read-only tuple memory with its size (ARG_CONST_SIZE_OR_ZERO), then two
 * ARG_ANYTHING values. Result type is RET_PTR_TO_SOCKET_OR_NULL.
 */
static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
        .func      = bpf_sock_addr_sk_lookup_udp,
        .gpl_only  = false,
        .ret_type  = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type = ARG_CONST_SIZE_OR_ZERO,
        .arg4_type = ARG_ANYTHING,
        .arg5_type = ARG_ANYTHING,
};

/* Decide whether an access of @size bytes at offset @off into struct
 * bpf_tcp_sock is permitted.
 *
 * The offset must be non-negative, lie before the end of icsk_retransmits
 * and be a multiple of @size. bytes_received and bytes_acked must be read
 * as __u64; every other offset must be read as __u32. @type and @info are
 * not consulted.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;
        if (off >= offsetofend(struct bpf_tcp_sock, icsk_retransmits))
                return false;
        if (off % size != 0)
                return false;

        if (off == offsetof(struct bpf_tcp_sock, bytes_received) ||
            off == offsetof(struct bpf_tcp_sock, bytes_acked))
                return size == sizeof(__u64);

        return size == sizeof(__u32);
}

/* Translate a load from struct bpf_tcp_sock at offset si->off into a load
 * from the corresponding kernel structure, written into @insn_buf.
 *
 * Each handled field emits a single BPF_LDX_MEM instruction that reads from
 * si->src_reg into si->dst_reg. Offsets without a matching case emit
 * nothing.
 *
 * @type, @prog and @target_size are not referenced directly in this body.
 *
 * Returns the number of instructions written to @insn_buf.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Emit a load of struct tcp_sock's FIELD; the build fails if that field is
 * larger than the same-named field of struct bpf_tcp_sock.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) >        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct tcp_sock, FIELD)); \
        } while (0)

/* Same as above, but the field is read from struct inet_connection_sock. */
#define BPF_INET_SOCK_GET_COMMON(FIELD)                                        \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct inet_connection_sock,        \
                                          FIELD) >                        \
                             sizeof_field(struct bpf_tcp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                        \
                                        struct inet_connection_sock,        \
                                        FIELD),                                \
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(                                \
                                        struct inet_connection_sock,        \
                                        FIELD));                        \
        } while (0)

        BTF_TYPE_EMIT(struct bpf_tcp_sock);

        switch (si->off) {
        case offsetof(struct bpf_tcp_sock, rtt_min):
                /* rtt_min is a struct minmax in tcp_sock; the compile-time
                 * checks pin its size and ensure it can hold at least one
                 * struct minmax_sample.
                 */
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                /* Load the 32-bit 'v' member, treating the start of rtt_min
                 * as a struct minmax_sample.
                 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      offsetof(struct minmax_sample, v));
                break;
        /* The remaining tcp_sock fields map one-to-one by name. */
        case offsetof(struct bpf_tcp_sock, snd_cwnd):
                BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
                break;
        case offsetof(struct bpf_tcp_sock, srtt_us):
                BPF_TCP_SOCK_GET_COMMON(srtt_us);
                break;
        case offsetof(struct bpf_tcp_sock, snd_ssthresh):
                BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
                break;
        case offsetof(struct bpf_tcp_sock, rcv_nxt):
                BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
                break;
        case offsetof(struct bpf_tcp_sock, snd_nxt):
                BPF_TCP_SOCK_GET_COMMON(snd_nxt);
                break;
        case offsetof(struct bpf_tcp_sock, snd_una):
                BPF_TCP_SOCK_GET_COMMON(snd_una);
                break;
        case offsetof(struct bpf_tcp_sock, mss_cache):
                BPF_TCP_SOCK_GET_COMMON(mss_cache);
                break;
        case offsetof(struct bpf_tcp_sock, ecn_flags):
                BPF_TCP_SOCK_GET_COMMON(ecn_flags);
                break;
        case offsetof(struct bpf_tcp_sock, rate_delivered):
                BPF_TCP_SOCK_GET_COMMON(rate_delivered);
                break;
        case offsetof(struct bpf_tcp_sock, rate_interval_us):
                BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
                break;
        case offsetof(struct bpf_tcp_sock, packets_out):
                BPF_TCP_SOCK_GET_COMMON(packets_out);
                break;
        case offsetof(struct bpf_tcp_sock, retrans_out):
                BPF_TCP_SOCK_GET_COMMON(retrans_out);
                break;
        case offsetof(struct bpf_tcp_sock, total_retrans):
                BPF_TCP_SOCK_GET_COMMON(total_retrans);
                break;
        case offsetof(struct bpf_tcp_sock, segs_in):
                BPF_TCP_SOCK_GET_COMMON(segs_in);
                break;
        case offsetof(struct bpf_tcp_sock, data_segs_in):
                BPF_TCP_SOCK_GET_COMMON(data_segs_in);
                break;
        case offsetof(struct bpf_tcp_sock, segs_out):
                BPF_TCP_SOCK_GET_COMMON(segs_out);
                break;
        case offsetof(struct bpf_tcp_sock, data_segs_out):
                BPF_TCP_SOCK_GET_COMMON(data_segs_out);
                break;
        case offsetof(struct bpf_tcp_sock, lost_out):
                BPF_TCP_SOCK_GET_COMMON(lost_out);
                break;
        case offsetof(struct bpf_tcp_sock, sacked_out):
                BPF_TCP_SOCK_GET_COMMON(sacked_out);
                break;
        case offsetof(struct bpf_tcp_sock, bytes_received):
                BPF_TCP_SOCK_GET_COMMON(bytes_received);
                break;
        case offsetof(struct bpf_tcp_sock, bytes_acked):
                BPF_TCP_SOCK_GET_COMMON(bytes_acked);
                break;
        case offsetof(struct bpf_tcp_sock, dsack_dups):
                BPF_TCP_SOCK_GET_COMMON(dsack_dups);
                break;
        case offsetof(struct bpf_tcp_sock, delivered):
                BPF_TCP_SOCK_GET_COMMON(delivered);
                break;
        case offsetof(struct bpf_tcp_sock, delivered_ce):
                BPF_TCP_SOCK_GET_COMMON(delivered_ce);
                break;
        /* icsk_retransmits is read from inet_connection_sock instead. */
        case offsetof(struct bpf_tcp_sock, icsk_retransmits):
                BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
                break;
        }

        /* Number of instructions emitted (0 when no case matched). */
        return insn - insn_buf;
}

/* Return @sk unchanged when sk_fullsock() accepts it and its protocol is
 * IPPROTO_TCP; otherwise return NULL.
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
        if (!sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Prototype descriptor for bpf_tcp_sock(): takes a sock_common pointer and
 * yields RET_PTR_TO_TCP_SOCK_OR_NULL; not GPL-only.
 */
const struct bpf_func_proto bpf_tcp_sock_proto = {
        .func      = bpf_tcp_sock,
        .gpl_only  = false,
        .ret_type  = RET_PTR_TO_TCP_SOCK_OR_NULL,
        .arg1_type = ARG_PTR_TO_SOCK_COMMON,
};

/* Convert @sk with sk_to_full_sk() and return the result only when it is
 * non-NULL, in state TCP_LISTEN and flagged SOCK_RCU_FREE; otherwise return
 * NULL.
 */
BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
{
        struct sock *full = sk_to_full_sk(sk);

        if (!full || full->sk_state != TCP_LISTEN ||
            !sock_flag(full, SOCK_RCU_FREE))
                return (unsigned long)NULL;

        return (unsigned long)full;
}

/* Prototype descriptor for bpf_get_listener_sock(): takes a sock_common
 * pointer and yields RET_PTR_TO_SOCKET_OR_NULL; not GPL-only.
 */
static const struct bpf_func_proto bpf_get_listener_sock_proto = {
        .func      = bpf_get_listener_sock,
        .gpl_only  = false,
        .ret_type  = RET_PTR_TO_SOCKET_OR_NULL,
        .arg1_type = ARG_PTR_TO_SOCK_COMMON,
};

/* Call INET_ECN_set_ce() on an IPv4 or IPv6 @skb and return its result.
 *
 * Returns 0 without touching the packet when the protocol is neither
 * ETH_P_IP nor ETH_P_IPV6, when the linear area is shorter than the IP
 * header, or when the skb is cloned and skb_clone_writable() refuses the
 * header range.
 */
BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
{
        unsigned int hdr_len;

        switch (skb_protocol(skb, true)) {
        case cpu_to_be16(ETH_P_IP):
                hdr_len = sizeof(struct iphdr);
                break;
        case cpu_to_be16(ETH_P_IPV6):
                hdr_len = sizeof(struct ipv6hdr);
                break;
        default:
                return 0;
        }

        /* The header must sit in the linear area and be writable. */
        if (skb_headlen(skb) < hdr_len ||
            (skb_cloned(skb) && !skb_clone_writable(skb, hdr_len)))
                return 0;

        return INET_ECN_set_ce(skb);
}

/* Decide whether an access of @size bytes at offset @off into struct
 * bpf_xdp_sock is permitted.
 *
 * The offset must be non-negative, lie before the end of queue_id and be a
 * multiple of @size, and the access must be exactly __u32 wide. @type and
 * @info are not consulted.
 */
bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                                  struct bpf_insn_access_aux *info)
{
        if (off < 0)
                return false;
        if (off >= offsetofend(struct bpf_xdp_sock, queue_id))
                return false;
        if (off % size != 0)
                return false;

        return size == sizeof(__u32);
}

/* Translate a load from struct bpf_xdp_sock at offset si->off into a load
 * from struct xdp_sock, written into @insn_buf.
 *
 * Only queue_id is handled; any other offset emits nothing. @type, @prog
 * and @target_size are not referenced directly in this body.
 *
 * Returns the number of instructions written to @insn_buf.
 */
u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
                                    const struct bpf_insn *si,
                                    struct bpf_insn *insn_buf,
                                    struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

/* Emit a load of struct xdp_sock's FIELD; the build fails if that field is
 * larger than the same-named field of struct bpf_xdp_sock.
 */
#define BPF_XDP_SOCK_GET(FIELD)                                                \
        do {                                                                \
                BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) >        \
                             sizeof_field(struct bpf_xdp_sock, FIELD));        \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
                                      si->dst_reg, si->src_reg,                \
                                      offsetof(struct xdp_sock, FIELD)); \
        } while (0)

        if (si->off == offsetof(struct bpf_xdp_sock, queue_id))
                BPF_XDP_SOCK_GET(queue_id);

        return insn - insn_buf;
}

/* Prototype descriptor for bpf_skb_ecn_set_ce(): a single context pointer
 * argument; integer result, not GPL-only.
 */
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
        .func      = bpf_skb_ecn_set_ce,
        .gpl_only  = false,
        .ret_type  = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
};

/* Check the SYN cookie carried by the segment described by @iph and @th
 * against the listening TCP socket @sk.
 *
 * Returns 0 when __cookie_v4_check()/__cookie_v6_check() returns a positive
 * value. Returns -ENOENT when the header is not an ACK without RST/SYN, when
 * tcp_synq_no_recent_overflow() is true, or when the cookie check is not
 * positive. Returns -EINVAL for a NULL @sk, too-short headers, a socket that
 * is not TCP in TCP_LISTEN, syncookies disabled by sysctl, or a socket
 * family that does not fit the IP version. Returns -EPROTONOSUPPORT for any
 * other IP version. Without CONFIG_SYN_COOKIES, returns -ENOTSUPP.
 */
BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        int cookie_ret;

        if (unlikely(!sk || th_len < sizeof(*th)))
                return -EINVAL;

        /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here,
         * so the state is compared against TCP_LISTEN directly.
         */
        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -EINVAL;

        /* Only an ACK with neither RST nor SYN set is examined. */
        if (!th->ack || th->rst || th->syn)
                return -ENOENT;

        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        if (tcp_synq_no_recent_overflow(sk))
                return -ENOENT;

        /* struct iphdr and struct ipv6hdr keep the version field at the same
         * offset, so reading it through the shorter struct iphdr is fine.
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                cookie_ret = __cookie_v4_check((struct iphdr *)iph, th);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                cookie_ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        return cookie_ret > 0 ? 0 : -ENOENT;
#else
        return -ENOTSUPP;
#endif
}

/* Prototype descriptor for bpf_tcp_check_syncookie(): a sock_common BTF
 * pointer followed by two read-only memory regions, each paired with its
 * constant size. Integer result; the helper is GPL-only.
 */
static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
        .func       = bpf_tcp_check_syncookie,
        .gpl_only   = true,
        .pkt_access = true,
        .ret_type   = RET_INTEGER,
        .arg1_type  = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type  = ARG_CONST_SIZE,
        .arg4_type  = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type  = ARG_CONST_SIZE,
};

/* Generate a SYN cookie for the SYN described by @iph and @th, addressed to
 * the listening TCP socket @sk.
 *
 * On success the return value holds the cookie in the low 32 bits and the
 * MSS reported by tcp_v4_get_syncookie()/tcp_v6_get_syncookie() shifted
 * into the upper 32 bits. Returns -EINVAL for a NULL @sk, a @th_len that is
 * too short or differs from th->doff * 4, a socket that is not TCP in
 * TCP_LISTEN, a header that is not a SYN without ACK/FIN/RST, a too-short IP
 * header, or a socket family that does not fit the IP version. Returns
 * -ENOENT when syncookies are disabled by sysctl or the reported MSS is 0,
 * and -EPROTONOSUPPORT for any other IP version. Without
 * CONFIG_SYN_COOKIES, returns -EOPNOTSUPP.
 */
BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
           struct tcphdr *, th, u32, th_len)
{
#ifdef CONFIG_SYN_COOKIES
        u32 cookie;
        u16 mss;

        if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
                return -EINVAL;

        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
                return -EINVAL;

        if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
                return -ENOENT;

        /* Only a SYN with none of ACK, FIN or RST set is accepted. */
        if (!th->syn || th->ack || th->fin || th->rst)
                return -EINVAL;

        if (unlikely(iph_len < sizeof(struct iphdr)))
                return -EINVAL;

        /* struct iphdr and struct ipv6hdr keep the version field at the same
         * offset, so reading it through the shorter struct iphdr is fine.
         */
        switch (((struct iphdr *)iph)->version) {
        case 4:
                if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
                        return -EINVAL;

                mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
                break;

#if IS_BUILTIN(CONFIG_IPV6)
        case 6:
                if (unlikely(iph_len < sizeof(struct ipv6hdr)))
                        return -EINVAL;

                if (sk->sk_family != AF_INET6)
                        return -EINVAL;

                mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
                break;
#endif /* CONFIG_IPV6 */

        default:
                return -EPROTONOSUPPORT;
        }

        if (!mss)
                return -ENOENT;

        /* Cookie in the low half, MSS in the high half. */
        return cookie | ((u64)mss << 32);
#else
        return -EOPNOTSUPP;
#endif /* CONFIG_SYN_COOKIES */
}

/* Descriptor for the bpf_tcp_gen_syncookie() helper: GPL-only, with
 * .pkt_access set and an integer return.  arg1 is a sock_common BTF
 * pointer (sk); (arg2, arg3) is the read-only IP header and its size,
 * (arg4, arg5) the read-only TCP header and its size.
 */
static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
        .func                = bpf_tcp_gen_syncookie,
        .gpl_only        = true, /* __cookie_v*_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg5_type        = ARG_CONST_SIZE,
};

/* Attach socket @sk to @skb.
 *
 * Returns 0 on success, or:
 *   -EINVAL       @sk is NULL or @flags is non-zero
 *   -EOPNOTSUPP   @skb is not at tc ingress, or @sk is unhashed
 *   -ENETUNREACH  @skb's device and @sk belong to different netns
 *   -ENOENT       @sk is refcounted and its refcount already hit zero
 */
BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
{
        if (!sk || flags != 0)
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EOPNOTSUPP;

        if (unlikely(dev_net(skb->dev) != sock_net(sk)))
                return -ENETUNREACH;

        if (sk_unhashed(sk))
                return -EOPNOTSUPP;

        /* Take a reference only for sockets that are refcounted */
        if (sk_is_refcounted(sk)) {
                if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
                        return -ENOENT;
        }

        /* Drop any previous owner before installing the new one */
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_pfree;

        return 0;
}

/* Descriptor for the bpf_sk_assign() helper: not GPL-only, integer
 * return.  arg1 is the program context (the skb), arg2 a sock_common
 * BTF pointer, arg3 an unconstrained scalar (flags).
 */
static const struct bpf_func_proto bpf_sk_assign_proto = {
        .func                = bpf_sk_assign,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .arg3_type        = ARG_ANYTHING,
};

/* Walk the TCP options in [@op, @opend) looking for kind @search_kind.
 *
 * When @magic_len is non-zero the option must also carry @magic in the
 * @magic_len bytes that follow its kind and length bytes.
 *
 * Returns a pointer to the matching option (at its kind byte), or
 *   ERR_PTR(-ENOMSG)  no match; *@eol is set to true when the walk was
 *                     stopped by TCPOPT_EOL, and also returned when an
 *                     option of the right kind is too short for @magic
 *   ERR_PTR(-EFAULT)  an option's length byte is missing, is below 2 or
 *                     runs past @opend
 */
static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
                                    u8 search_kind, const u8 *magic,
                                    u8 magic_len, bool *eol)
{
        u8 opt_kind, opt_len;

        *eol = false;

        while (op < opend) {
                opt_kind = op[0];

                switch (opt_kind) {
                case TCPOPT_EOL:
                        *eol = true;
                        return ERR_PTR(-ENOMSG);
                case TCPOPT_NOP:
                        /* Single-byte padding, no length byte */
                        op++;
                        continue;
                default:
                        break;
                }

                /* Something is wrong in the received header.
                 * Follow the TCP stack's tcp_parse_options()
                 * and just bail here.
                 */
                if (opend - op < 2)
                        return ERR_PTR(-EFAULT);

                opt_len = op[1];
                if (opend - op < opt_len || opt_len < 2)
                        return ERR_PTR(-EFAULT);

                if (opt_kind == search_kind) {
                        if (!magic_len)
                                return op;

                        /* Payload too short to hold the magic */
                        if (magic_len > opt_len - 2)
                                return ERR_PTR(-ENOMSG);

                        if (!memcmp(&op[2], magic, magic_len))
                                return op;
                }

                op += opt_len;
        }

        return ERR_PTR(-ENOMSG);
}

/* Search a TCP header for an option and copy it into @search_res.
 *
 * On entry search_res[0] is the option kind to look for and
 * search_res[1] a length: for TCPOPT_EXP and kind 253 it must be 4 or 6
 * (a 2 or 4 byte magic follows at search_res[2]); for every other kind
 * it must be 0.  TCPOPT_NOP and TCPOPT_EOL cannot be searched for.
 *
 * With BPF_LOAD_HDR_OPT_TCP_SYN in @flags the header is the one returned
 * by bpf_sock_ops_get_syn(TCP_BPF_SYN); otherwise it is bpf_sock->skb.
 *
 * Returns the length byte of the option found, after copying the option
 * into @search_res.  If the option is longer than @len, only @len bytes
 * are copied and -ENOSPC is returned.  Other errors:
 *   -EOPNOTSUPP  is_locked_tcp_sock_ops() is false for @bpf_sock
 *   -EINVAL      @len < 2, unknown flag bits, or bad kind/length input
 *   -EPERM       no skb, or called from BPF_SOCK_OPS_HDR_OPT_LEN_CB
 *   other        as returned by bpf_sock_ops_get_syn() or
 *                bpf_search_tcp_opt()
 */
BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           void *, search_res, u32, len, u64, flags)
{
        bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
        const u8 *op, *opend, *magic, *search = search_res;
        u8 search_kind, search_len, copy_len, magic_len;
        int ret;

        if (!is_locked_tcp_sock_ops(bpf_sock))
                return -EOPNOTSUPP;

        /* 2 byte is the minimal option len except TCPOPT_NOP and
         * TCPOPT_EOL which are useless for the bpf prog to learn
         * and this helper disallow loading them also.
         */
        if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
                return -EINVAL;

        /* Safe to read two bytes: len >= 2 was checked above */
        search_kind = search[0];
        search_len = search[1];

        if (search_len > len || search_kind == TCPOPT_NOP ||
            search_kind == TCPOPT_EOL)
                return -EINVAL;

        if (search_kind == TCPOPT_EXP || search_kind == 253) {
                /* 16 or 32 bit magic.  +2 for kind and kind length */
                if (search_len != 4 && search_len != 6)
                        return -EINVAL;
                magic = &search[2];
                magic_len = search_len - 2;
        } else {
                /* Non-experimental kinds are matched by kind only */
                if (search_len)
                        return -EINVAL;
                magic = NULL;
                magic_len = 0;
        }

        if (load_syn) {
                /* ret is the byte count stored at op on success */
                ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
                if (ret < 0)
                        return ret;

                opend = op + ret;
                /* Options start right after the fixed TCP header */
                op += sizeof(struct tcphdr);
        } else {
                if (!bpf_sock->skb ||
                    bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                        /* This bpf_sock->op cannot call this helper */
                        return -EPERM;

                opend = bpf_sock->skb_data_end;
                op = bpf_sock->skb->data + sizeof(struct tcphdr);
        }

        op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
                                &eol);
        if (IS_ERR(op))
                return PTR_ERR(op);

        /* op[1] is the total length of the option that was found */
        copy_len = op[1];
        ret = copy_len;
        if (copy_len > len) {
                /* Truncate the copy but still report the shortfall */
                ret = -ENOSPC;
                copy_len = len;
        }

        memcpy(search_res, op, copy_len);
        return ret;
}

/* Descriptor for the bpf_sock_ops_load_hdr_opt() helper: not GPL-only,
 * integer return.  arg1 is the program context, (arg2, arg3) a writable
 * memory buffer and its constant size, arg4 an unconstrained scalar
 * (flags).
 */
static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
        .func                = bpf_sock_ops_load_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_WRITE,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Append the TCP option at @from to the header held in bpf_sock->skb.
 *
 * from[0] is the option kind and from[1] its total length; that many
 * bytes are copied to bpf_sock->skb_data_end, which is then advanced,
 * and bpf_sock->remaining_opt_len is reduced by the same amount.
 *
 * Returns 0 on success, or:
 *   -EPERM   not called from BPF_SOCK_OPS_WRITE_HDR_OPT_CB
 *   -EINVAL  @len < 2, non-zero @flags, option length above @len, kind
 *            is TCPOPT_NOP/TCPOPT_EOL, or an experimental option shorter
 *            than 4 bytes
 *   -ENOSPC  option does not fit in remaining_opt_len, or the existing
 *            options already end with TCPOPT_EOL
 *   -EEXIST  an option of the same kind (and, for experimental kinds,
 *            the same first two magic bytes) is already present
 *   other    any error other than -ENOMSG from bpf_search_tcp_opt()
 */
BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           const void *, from, u32, len, u64, flags)
{
        u8 new_kind, new_kind_len, magic_len = 0, *opend;
        const u8 *op, *new_op, *magic = NULL;
        struct sk_buff *skb;
        bool eol;

        if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
                return -EPERM;

        if (len < 2 || flags)
                return -EINVAL;

        /* Safe to read two bytes: len >= 2 was checked above */
        new_op = from;
        new_kind = new_op[0];
        new_kind_len = new_op[1];

        if (new_kind_len > len || new_kind == TCPOPT_NOP ||
            new_kind == TCPOPT_EOL)
                return -EINVAL;

        if (new_kind_len > bpf_sock->remaining_opt_len)
                return -ENOSPC;

        /* 253 is another experimental kind */
        if (new_kind == TCPOPT_EXP || new_kind == 253)  {
                if (new_kind_len < 4)
                        return -EINVAL;
                /* Match for the 2 byte magic also.
                 * RFC 6994: the magic could be 2 or 4 bytes.
                 * Hence, matching by 2 byte only is on the
                 * conservative side but it is the right
                 * thing to do for the 'search-for-duplication'
                 * purpose.
                 */
                magic = &new_op[2];
                magic_len = 2;
        }

        /* Check for duplication */
        skb = bpf_sock->skb;
        op = skb->data + sizeof(struct tcphdr);
        opend = bpf_sock->skb_data_end;

        op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
                                &eol);
        /* A successful search means the option is already there */
        if (!IS_ERR(op))
                return -EEXIST;

        /* -ENOMSG is the only "not found" result; pass on anything else */
        if (PTR_ERR(op) != -ENOMSG)
                return PTR_ERR(op);

        if (eol)
                /* The option has been ended.  Treat it as no more
                 * header option can be written.
                 */
                return -ENOSPC;

        /* No duplication found.  Store the header option. */
        memcpy(opend, from, new_kind_len);

        bpf_sock->remaining_opt_len -= new_kind_len;
        bpf_sock->skb_data_end += new_kind_len;

        return 0;
}

/* Descriptor for the bpf_sock_ops_store_hdr_opt() helper: not GPL-only,
 * integer return.  arg1 is the program context, (arg2, arg3) a read-only
 * memory buffer and its constant size, arg4 an unconstrained scalar
 * (flags).
 */
static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
        .func                = bpf_sock_ops_store_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_PTR_TO_MEM | MEM_RDONLY,
        .arg3_type        = ARG_CONST_SIZE,
        .arg4_type        = ARG_ANYTHING,
};

/* Reserve @len bytes of TCP option space by deducting them from
 * bpf_sock->remaining_opt_len.
 *
 * Returns 0 on success, or:
 *   -EPERM   not called from BPF_SOCK_OPS_HDR_OPT_LEN_CB
 *   -EINVAL  non-zero @flags or @len below 2
 *   -ENOSPC  @len exceeds the remaining option space
 */
BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
           u32, len, u64, flags)
{
        if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
                return -EPERM;

        if (flags)
                return -EINVAL;

        if (len < 2)
                return -EINVAL;

        if (bpf_sock->remaining_opt_len < len)
                return -ENOSPC;

        bpf_sock->remaining_opt_len -= len;
        return 0;
}

/* Descriptor for the bpf_sock_ops_reserve_hdr_opt() helper: not
 * GPL-only, integer return.  arg1 is the program context; arg2 (len) and
 * arg3 (flags) are unconstrained scalars validated by the helper itself.
 */
static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
        .func                = bpf_sock_ops_reserve_hdr_opt,
        .gpl_only        = false,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_CTX,
        .arg2_type        = ARG_ANYTHING,
        .arg3_type        = ARG_ANYTHING,
};

/* Set @skb's timestamp to @tstamp and record which clock it refers to.
 *
 * @tstamp_type is one of BPF_SKB_CLOCK_REALTIME, BPF_SKB_CLOCK_MONOTONIC
 * or BPF_SKB_CLOCK_TAI.  A zero @tstamp is accepted only for the
 * realtime clock.
 *
 * Returns 0 on success, -EOPNOTSUPP when @skb is neither IPv4 nor IPv6,
 * and -EINVAL for an unknown clock or a zero monotonic/TAI timestamp.
 */
BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
           u64, tstamp, u32, tstamp_type)
{
        u8 clock;

        /* skb_clear_delivery_time() is done for inet protocol */
        if (skb->protocol != htons(ETH_P_IP) &&
            skb->protocol != htons(ETH_P_IPV6))
                return -EOPNOTSUPP;

        /* Translate the BPF-facing clock id into the skb one */
        switch (tstamp_type) {
        case BPF_SKB_CLOCK_REALTIME:
                clock = SKB_CLOCK_REALTIME;
                break;
        case BPF_SKB_CLOCK_MONOTONIC:
                clock = SKB_CLOCK_MONOTONIC;
                break;
        case BPF_SKB_CLOCK_TAI:
                clock = SKB_CLOCK_TAI;
                break;
        default:
                return -EINVAL;
        }

        /* Only the realtime clock may carry a zero timestamp */
        if (!tstamp && clock != SKB_CLOCK_REALTIME)
                return -EINVAL;

        skb->tstamp = tstamp;
        skb->tstamp_type = clock;

        return 0;
}

/* Descriptor for the bpf_skb_set_tstamp() helper: not GPL-only, integer
 * return.  arg1 is the program context (the skb); arg2 (tstamp) and arg3
 * (tstamp_type) are unconstrained scalars validated by the helper.
 */
static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
        .func           = bpf_skb_set_tstamp,
        .gpl_only       = false,
        .ret_type       = RET_INTEGER,
        .arg1_type      = ARG_PTR_TO_CTX,
        .arg2_type      = ARG_ANYTHING,
        .arg3_type      = ARG_ANYTHING,
};

#ifdef CONFIG_SYN_COOKIES
/* Generate an IPv4 SYN cookie from raw headers, without a socket.
 *
 * @th_len must be at least sizeof(*th) and equal th->doff * 4, otherwise
 * -EINVAL is returned.  The MSS comes from the TCP MSS option, falling
 * back to TCP_MSS_DEFAULT when tcp_parse_mss_option() returns 0.
 *
 * Returns the cookie in the low 32 bits and the MSS (as left in place by
 * __cookie_v4_init_sequence()) in the bits above.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
           struct tcphdr *, th, u32, th_len)
{
        u32 isn;
        u16 mss_val;

        if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
                return -EINVAL;

        mss_val = tcp_parse_mss_option(th, 0);
        if (!mss_val)
                mss_val = TCP_MSS_DEFAULT;

        isn = __cookie_v4_init_sequence(iph, th, &mss_val);

        return ((u64)mss_val << 32) | isn;
}

/* Descriptor for the bpf_tcp_raw_gen_syncookie_ipv4() helper: GPL-only,
 * with .pkt_access set and an integer return.  arg1 is a fixed-size
 * buffer of sizeof(struct iphdr) bytes; (arg2, arg3) is a memory pointer
 * and its constant size, where a size of zero is permitted by the
 * argument type.
 */
static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
        .func                = bpf_tcp_raw_gen_syncookie_ipv4,
        .gpl_only        = true, /* __cookie_v4_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct iphdr),
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Generate an IPv6 SYN cookie from raw headers, without a socket.
 *
 * @th_len must be at least sizeof(*th) and equal th->doff * 4, otherwise
 * -EINVAL is returned.  The MSS comes from the TCP MSS option, falling
 * back to IPV6_MIN_MTU minus the TCP and IPv6 header sizes when
 * tcp_parse_mss_option() returns 0.
 *
 * Returns the cookie in the low 32 bits and the MSS (as left in place by
 * __cookie_v6_init_sequence()) in the bits above, or -EPROTONOSUPPORT
 * when IPv6 is not built in.
 */
BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
           struct tcphdr *, th, u32, th_len)
{
#if IS_BUILTIN(CONFIG_IPV6)
        const u16 fallback_mss = IPV6_MIN_MTU - sizeof(struct tcphdr) -
                sizeof(struct ipv6hdr);
        u32 isn;
        u16 mss_val;

        if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
                return -EINVAL;

        mss_val = tcp_parse_mss_option(th, 0);
        if (!mss_val)
                mss_val = fallback_mss;

        isn = __cookie_v6_init_sequence(iph, th, &mss_val);

        return ((u64)mss_val << 32) | isn;
#else
        return -EPROTONOSUPPORT;
#endif
}

/* Descriptor for the bpf_tcp_raw_gen_syncookie_ipv6() helper: GPL-only,
 * with .pkt_access set and an integer return.  arg1 is a fixed-size
 * buffer of sizeof(struct ipv6hdr) bytes; (arg2, arg3) is a memory
 * pointer and its constant size, where a size of zero is permitted by
 * the argument type.
 */
static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
        .func                = bpf_tcp_raw_gen_syncookie_ipv6,
        .gpl_only        = true, /* __cookie_v6_init_sequence() is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct ipv6hdr),
        .arg2_type        = ARG_PTR_TO_MEM,
        .arg3_type        = ARG_CONST_SIZE_OR_ZERO,
};

/* Validate an IPv4 SYN cookie from raw headers, without a socket.
 *
 * Returns 0 when __cookie_v4_check() yields a positive value and
 * -EACCES otherwise.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
           struct tcphdr *, th)
{
        return __cookie_v4_check(iph, th) > 0 ? 0 : -EACCES;
}

/* Descriptor for the bpf_tcp_raw_check_syncookie_ipv4() helper:
 * GPL-only, with .pkt_access set and an integer return.  Both arguments
 * are fixed-size buffers: sizeof(struct iphdr) bytes for arg1 and
 * sizeof(struct tcphdr) bytes for arg2.
 */
static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
        .func                = bpf_tcp_raw_check_syncookie_ipv4,
        .gpl_only        = true, /* __cookie_v4_check is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct iphdr),
        .arg2_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg2_size        = sizeof(struct tcphdr),
};

/* Validate an IPv6 SYN cookie from raw headers, without a socket.
 *
 * Returns 0 when __cookie_v6_check() yields a positive value, -EACCES
 * otherwise, and -EPROTONOSUPPORT when IPv6 is not built in.
 */
BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
           struct tcphdr *, th)
{
#if IS_BUILTIN(CONFIG_IPV6)
        return __cookie_v6_check(iph, th) > 0 ? 0 : -EACCES;
#else
        return -EPROTONOSUPPORT;
#endif
}

/* Descriptor for the bpf_tcp_raw_check_syncookie_ipv6() helper:
 * GPL-only, with .pkt_access set and an integer return.  Both arguments
 * are fixed-size buffers: sizeof(struct ipv6hdr) bytes for arg1 and
 * sizeof(struct tcphdr) bytes for arg2.
 */
static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
        .func                = bpf_tcp_raw_check_syncookie_ipv6,
        .gpl_only        = true, /* __cookie_v6_check is GPL */
        .pkt_access        = true,
        .ret_type        = RET_INTEGER,
        .arg1_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg1_size        = sizeof(struct ipv6hdr),
        .arg2_type        = ARG_PTR_TO_FIXED_SIZE_MEM,
        .arg2_size        = sizeof(struct tcphdr),
};
#endif /* CONFIG_SYN_COOKIES */

#endif /* CONFIG_INET */

/* Tell whether the helper identified by @func_id is one that changes
 * packet data.
 *
 * Returns true for every helper id listed below, and for
 * BPF_FUNC_tail_call because the tail-called program could itself call
 * any of them; returns false for all other ids.
 */
bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
        switch (func_id) {
        case BPF_FUNC_clone_redirect:
        case BPF_FUNC_l3_csum_replace:
        case BPF_FUNC_l4_csum_replace:
        case BPF_FUNC_lwt_push_encap:
        case BPF_FUNC_lwt_seg6_action:
        case BPF_FUNC_lwt_seg6_adjust_srh:
        case BPF_FUNC_lwt_seg6_store_bytes:
        case BPF_FUNC_msg_pop_data:
        case BPF_FUNC_msg_pull_data:
        case BPF_FUNC_msg_push_data:
        case BPF_FUNC_skb_adjust_room:
        case BPF_FUNC_skb_change_head:
        case BPF_FUNC_skb_change_proto:
        case BPF_FUNC_skb_change_tail:
        case BPF_FUNC_skb_pull_data:
        case BPF_FUNC_skb_store_bytes:
        case BPF_FUNC_skb_vlan_pop:
        case BPF_FUNC_skb_vlan_push:
        case BPF_FUNC_store_hdr_opt:
        case BPF_FUNC_xdp_adjust_head:
        case BPF_FUNC_xdp_adjust_meta:
        case BPF_FUNC_xdp_adjust_tail:
        /* tail-called program could call any of the above */
        case BPF_FUNC_tail_call:
                return true;
        default:
                return false;
        }
}

/* Weak definitions of protos that the resolver functions in this file
 * return by address.
 * NOTE(review): presumably overridden by strong definitions when the
 * code providing these helpers is built in - confirm.
 */
const struct bpf_func_proto bpf_event_output_data_proto __weak;
const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;

/* Resolve helper @func_id for @prog.
 *
 * Lookup order: cgroup_common_func_proto(), then
 * cgroup_current_func_proto(), then the helpers listed here, and finally
 * bpf_base_func_proto() for anything else.
 */
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        /* Try the shared cgroup tables first; the first hit wins */
        proto = cgroup_common_func_proto(func_id, prog);
        if (!proto)
                proto = cgroup_current_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_cg_sock_proto;
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        default:
                break;
        }

        return bpf_base_func_proto(func_id, prog);
}

/* Resolve helper @func_id for @prog.
 *
 * Lookup order: cgroup_common_func_proto(), then
 * cgroup_current_func_proto(), then the helpers listed here, and finally
 * bpf_sk_base_func_proto() for anything else.
 *
 * Some helpers depend on prog->expected_attach_type: bind is offered
 * only for the INET4/INET6 connect attach types, and setsockopt /
 * getsockopt only for the attach types enumerated in their cases.  NULL
 * is returned when the attach type does not qualify.
 */
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        func_proto = cgroup_current_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
        case BPF_FUNC_bind:
                /* bind is only available at the connect hooks */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                        return &bpf_bind_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_sock_addr_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_sock_addr_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_event_output_data_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sock_addr_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sock_addr_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_setsockopt:
                /* Same attach-type list as BPF_FUNC_getsockopt below */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return &bpf_sock_addr_setsockopt_proto;
                default:
                        return NULL;
                }
        case BPF_FUNC_getsockopt:
                /* Same attach-type list as BPF_FUNC_setsockopt above */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_UNIX_CONNECT:
                case BPF_CGROUP_UDP4_RECVMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                case BPF_CGROUP_UNIX_RECVMSG:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UNIX_SENDMSG:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_UNIX_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UNIX_GETSOCKNAME:
                        return &bpf_sock_addr_getsockopt_proto;
                default:
                        return NULL;
                }
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Resolve helper @func_id for @prog.
 *
 * Returns the proto for the helpers listed here and defers to
 * bpf_sk_base_func_proto() for every other id.  cg_skb_func_proto() in
 * this file uses this function as its fallback.
 */
static const struct bpf_func_proto *
sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Weak definitions of the sk_storage protos that the resolver functions
 * in this file return by address.
 * NOTE(review): presumably overridden by strong definitions when the
 * sk_storage code is built in - confirm.
 */
const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;

/* Resolve helper @func_id for @prog.
 *
 * Lookup order: cgroup_common_func_proto(), then the helpers listed here
 * (some only with CONFIG_SOCK_CGROUP_DATA or CONFIG_INET), and finally
 * sk_filter_func_proto() for anything else.
 */
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *func_proto;

        func_proto = cgroup_common_func_proto(func_id, prog);
        if (func_proto)
                return func_proto;

        switch (func_id) {
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
        case BPF_FUNC_sk_cgroup_id:
                return &bpf_sk_cgroup_id_proto;
        case BPF_FUNC_sk_ancestor_cgroup_id:
                return &bpf_sk_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
#endif
        default:
                /* Anything not listed here is resolved by the sk_filter set */
                return sk_filter_func_proto(func_id, prog);
        }
}

/* Resolve helper @func_id for @prog.
 *
 * Returns the proto for the helpers listed here and defers to
 * bpf_sk_base_func_proto() for every other id.  Several groups are only
 * compiled in with CONFIG_XFRM, CONFIG_CGROUP_NET_CLASSID,
 * CONFIG_SOCK_CGROUP_DATA, CONFIG_INET and, nested inside CONFIG_INET,
 * CONFIG_SYN_COOKIES.
 */
static const struct bpf_func_proto *
tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                return &bpf_skb_store_bytes_proto;
        case BPF_FUNC_skb_load_bytes:
                return &bpf_skb_load_bytes_proto;
        case BPF_FUNC_skb_load_bytes_relative:
                return &bpf_skb_load_bytes_relative_proto;
        case BPF_FUNC_skb_pull_data:
                return &bpf_skb_pull_data_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_csum_update:
                return &bpf_csum_update_proto;
        case BPF_FUNC_csum_level:
                return &bpf_csum_level_proto;
        case BPF_FUNC_l3_csum_replace:
                return &bpf_l3_csum_replace_proto;
        case BPF_FUNC_l4_csum_replace:
                return &bpf_l4_csum_replace_proto;
        case BPF_FUNC_clone_redirect:
                return &bpf_clone_redirect_proto;
        case BPF_FUNC_get_cgroup_classid:
                return &bpf_get_cgroup_classid_proto;
        case BPF_FUNC_skb_vlan_push:
                return &bpf_skb_vlan_push_proto;
        case BPF_FUNC_skb_vlan_pop:
                return &bpf_skb_vlan_pop_proto;
        case BPF_FUNC_skb_change_proto:
                return &bpf_skb_change_proto_proto;
        case BPF_FUNC_skb_change_type:
                return &bpf_skb_change_type_proto;
        case BPF_FUNC_skb_adjust_room:
                return &bpf_skb_adjust_room_proto;
        case BPF_FUNC_skb_change_tail:
                return &bpf_skb_change_tail_proto;
        case BPF_FUNC_skb_change_head:
                return &bpf_skb_change_head_proto;
        case BPF_FUNC_skb_get_tunnel_key:
                return &bpf_skb_get_tunnel_key_proto;
        case BPF_FUNC_skb_set_tunnel_key:
                /* The tunnel "set" protos come from a lookup function */
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_skb_get_tunnel_opt:
                return &bpf_skb_get_tunnel_opt_proto;
        case BPF_FUNC_skb_set_tunnel_opt:
                return bpf_get_skb_set_tunnel_proto(func_id);
        case BPF_FUNC_redirect:
                return &bpf_redirect_proto;
        case BPF_FUNC_redirect_neigh:
                return &bpf_redirect_neigh_proto;
        case BPF_FUNC_redirect_peer:
                return &bpf_redirect_peer_proto;
        case BPF_FUNC_get_route_realm:
                return &bpf_get_route_realm_proto;
        case BPF_FUNC_get_hash_recalc:
                return &bpf_get_hash_recalc_proto;
        case BPF_FUNC_set_hash_invalid:
                return &bpf_set_hash_invalid_proto;
        case BPF_FUNC_set_hash:
                return &bpf_set_hash_proto;
        case BPF_FUNC_perf_event_output:
                return &bpf_skb_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_skb_under_cgroup:
                return &bpf_skb_under_cgroup_proto;
        case BPF_FUNC_get_socket_cookie:
                return &bpf_get_socket_cookie_proto;
        case BPF_FUNC_get_netns_cookie:
                return &bpf_get_netns_cookie_proto;
        case BPF_FUNC_get_socket_uid:
                return &bpf_get_socket_uid_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_skb_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
                return &bpf_skb_check_mtu_proto;
        case BPF_FUNC_sk_fullsock:
                return &bpf_sk_fullsock_proto;
        case BPF_FUNC_sk_storage_get:
                return &bpf_sk_storage_get_proto;
        case BPF_FUNC_sk_storage_delete:
                return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM
        case BPF_FUNC_skb_get_xfrm_state:
                return &bpf_skb_get_xfrm_state_proto;
#endif
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_skb_cgroup_classid:
                return &bpf_skb_cgroup_classid_proto;
#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
        case BPF_FUNC_skb_cgroup_id:
                return &bpf_skb_cgroup_id_proto;
        case BPF_FUNC_skb_ancestor_cgroup_id:
                return &bpf_skb_ancestor_cgroup_id_proto;
#endif
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_tc_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_tc_sk_lookup_udp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_tcp_sock:
                return &bpf_tcp_sock_proto;
        case BPF_FUNC_get_listener_sock:
                return &bpf_get_listener_sock_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_tc_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_skb_ecn_set_ce:
                return &bpf_skb_ecn_set_ce_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
        case BPF_FUNC_sk_assign:
                return &bpf_sk_assign_proto;
        case BPF_FUNC_skb_set_tstamp:
                return &bpf_skb_set_tstamp_proto;
#ifdef CONFIG_SYN_COOKIES
        /* The raw syncookie helpers are only defined with SYN cookies */
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
                return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
                return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
                return &bpf_tcp_raw_check_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
                return &bpf_tcp_raw_check_syncookie_ipv6_proto;
#endif
#endif
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }
}

/* Resolve helper @func_id for @prog.
 *
 * Returns the proto for the helpers listed here and defers to
 * bpf_sk_base_func_proto() for every other id.  The socket lookup and
 * syncookie helpers are only compiled in with CONFIG_INET (and, for the
 * raw syncookie ones, CONFIG_SYN_COOKIES).
 */
static const struct bpf_func_proto *
xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        switch (func_id) {
        case BPF_FUNC_perf_event_output:
                return &bpf_xdp_event_output_proto;
        case BPF_FUNC_get_smp_processor_id:
                return &bpf_get_smp_processor_id_proto;
        case BPF_FUNC_csum_diff:
                return &bpf_csum_diff_proto;
        case BPF_FUNC_xdp_adjust_head:
                return &bpf_xdp_adjust_head_proto;
        case BPF_FUNC_xdp_adjust_meta:
                return &bpf_xdp_adjust_meta_proto;
        case BPF_FUNC_redirect:
                return &bpf_xdp_redirect_proto;
        case BPF_FUNC_redirect_map:
                return &bpf_xdp_redirect_map_proto;
        case BPF_FUNC_xdp_adjust_tail:
                return &bpf_xdp_adjust_tail_proto;
        case BPF_FUNC_xdp_get_buff_len:
                return &bpf_xdp_get_buff_len_proto;
        case BPF_FUNC_xdp_load_bytes:
                return &bpf_xdp_load_bytes_proto;
        case BPF_FUNC_xdp_store_bytes:
                return &bpf_xdp_store_bytes_proto;
        case BPF_FUNC_fib_lookup:
                return &bpf_xdp_fib_lookup_proto;
        case BPF_FUNC_check_mtu:
                return &bpf_xdp_check_mtu_proto;
#ifdef CONFIG_INET
        case BPF_FUNC_sk_lookup_udp:
                return &bpf_xdp_sk_lookup_udp_proto;
        case BPF_FUNC_sk_lookup_tcp:
                return &bpf_xdp_sk_lookup_tcp_proto;
        case BPF_FUNC_sk_release:
                return &bpf_sk_release_proto;
        case BPF_FUNC_skc_lookup_tcp:
                return &bpf_xdp_skc_lookup_tcp_proto;
        case BPF_FUNC_tcp_check_syncookie:
                return &bpf_tcp_check_syncookie_proto;
        case BPF_FUNC_tcp_gen_syncookie:
                return &bpf_tcp_gen_syncookie_proto;
#ifdef CONFIG_SYN_COOKIES
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
                return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
                return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
                return &bpf_tcp_raw_check_syncookie_ipv4_proto;
        case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
                return &bpf_tcp_raw_check_syncookie_ipv6_proto;
#endif
#endif
        default:
                return bpf_sk_base_func_proto(func_id, prog);
        }

        /* Every path through the switch above returns, so the statement
         * below is never reached at run time; it is here only for the
         * type reference it emits, as explained in the next comment.
         */
#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
        /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
         * kfuncs are defined in two different modules, and we want to be able
         * to use them interchangeably with the same BTF type ID. Because modules
         * can't de-duplicate BTF IDs between each other, we need the type to be
         * referenced in the vmlinux BTF or the verifier will get confused about
         * the different types. So we add this dummy type reference which will
         * be included in vmlinux BTF, allowing both modules to refer to the
         * same type ID.
         */
        BTF_TYPE_EMIT(struct nf_conn___init);
#endif
}

/* Weak definitions of the sock map/hash update helper prototypes. Because
 * they are __weak, a non-weak definition elsewhere in the kernel overrides
 * them at link time; without one, these zero-initialised objects remain.
 */
const struct bpf_func_proto bpf_sock_map_update_proto __weak;
const struct bpf_func_proto bpf_sock_hash_update_proto __weak;

/* Look up the helper prototype for @func_id.
 *
 * cgroup_common_func_proto() gets the first say; a non-NULL answer from it
 * is returned as is. Helper IDs that are not listed in the switch are
 * forwarded to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        proto = cgroup_common_func_proto(func_id, prog);
        if (proto)
                return proto;

        switch (func_id) {
        case BPF_FUNC_setsockopt:
                proto = &bpf_sock_ops_setsockopt_proto;
                break;
        case BPF_FUNC_getsockopt:
                proto = &bpf_sock_ops_getsockopt_proto;
                break;
        case BPF_FUNC_sock_ops_cb_flags_set:
                proto = &bpf_sock_ops_cb_flags_set_proto;
                break;
        case BPF_FUNC_sock_map_update:
                proto = &bpf_sock_map_update_proto;
                break;
        case BPF_FUNC_sock_hash_update:
                proto = &bpf_sock_hash_update_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_cookie_sock_ops_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_sk_storage_get:
                proto = &bpf_sk_storage_get_proto;
                break;
        case BPF_FUNC_sk_storage_delete:
                proto = &bpf_sk_storage_delete_proto;
                break;
        case BPF_FUNC_get_netns_cookie:
                proto = &bpf_get_netns_cookie_sock_ops_proto;
                break;
#ifdef CONFIG_INET
        /* Header option and tcp_sock helpers exist only with CONFIG_INET. */
        case BPF_FUNC_load_hdr_opt:
                proto = &bpf_sock_ops_load_hdr_opt_proto;
                break;
        case BPF_FUNC_store_hdr_opt:
                proto = &bpf_sock_ops_store_hdr_opt_proto;
                break;
        case BPF_FUNC_reserve_hdr_opt:
                proto = &bpf_sock_ops_reserve_hdr_opt_proto;
                break;
        case BPF_FUNC_tcp_sock:
                proto = &bpf_tcp_sock_proto;
                break;
#endif /* CONFIG_INET */
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Weak definitions of the msg redirect helper prototypes; a non-weak
 * definition elsewhere in the kernel takes precedence at link time.
 */
const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;

/* Look up the helper prototype for @func_id.
 *
 * Helper IDs that are not listed in the switch are forwarded to
 * bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_msg_redirect_map:
                proto = &bpf_msg_redirect_map_proto;
                break;
        case BPF_FUNC_msg_redirect_hash:
                proto = &bpf_msg_redirect_hash_proto;
                break;
        case BPF_FUNC_msg_apply_bytes:
                proto = &bpf_msg_apply_bytes_proto;
                break;
        case BPF_FUNC_msg_cork_bytes:
                proto = &bpf_msg_cork_bytes_proto;
                break;
        case BPF_FUNC_msg_pull_data:
                proto = &bpf_msg_pull_data_proto;
                break;
        case BPF_FUNC_msg_push_data:
                proto = &bpf_msg_push_data_proto;
                break;
        case BPF_FUNC_msg_pop_data:
                proto = &bpf_msg_pop_data_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_event_output_data_proto;
                break;
        case BPF_FUNC_get_current_uid_gid:
                proto = &bpf_get_current_uid_gid_proto;
                break;
        case BPF_FUNC_sk_storage_get:
                proto = &bpf_sk_storage_get_proto;
                break;
        case BPF_FUNC_sk_storage_delete:
                proto = &bpf_sk_storage_delete_proto;
                break;
        case BPF_FUNC_get_netns_cookie:
                proto = &bpf_get_netns_cookie_sk_msg_proto;
                break;
#ifdef CONFIG_CGROUP_NET_CLASSID
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_curr_proto;
                break;
#endif
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Weak definitions of the sk redirect helper prototypes; a non-weak
 * definition elsewhere in the kernel takes precedence at link time.
 */
const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;

/* Look up the helper prototype for @func_id.
 *
 * Several skb helpers map to sk_skb_* specific prototypes rather than the
 * bpf_skb_* ones. Helper IDs that are not listed in the switch are forwarded
 * to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_store_bytes:
                proto = &bpf_skb_store_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &sk_skb_pull_data_proto;
                break;
        case BPF_FUNC_skb_change_tail:
                proto = &sk_skb_change_tail_proto;
                break;
        case BPF_FUNC_skb_change_head:
                proto = &sk_skb_change_head_proto;
                break;
        case BPF_FUNC_skb_adjust_room:
                proto = &sk_skb_adjust_room_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_cookie_proto;
                break;
        case BPF_FUNC_get_socket_uid:
                proto = &bpf_get_socket_uid_proto;
                break;
        case BPF_FUNC_sk_redirect_map:
                proto = &bpf_sk_redirect_map_proto;
                break;
        case BPF_FUNC_sk_redirect_hash:
                proto = &bpf_sk_redirect_hash_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
#ifdef CONFIG_INET
        /* Socket lookup helpers exist only with CONFIG_INET. */
        case BPF_FUNC_sk_lookup_tcp:
                proto = &bpf_sk_lookup_tcp_proto;
                break;
        case BPF_FUNC_sk_lookup_udp:
                proto = &bpf_sk_lookup_udp_proto;
                break;
        case BPF_FUNC_sk_release:
                proto = &bpf_sk_release_proto;
                break;
        case BPF_FUNC_skc_lookup_tcp:
                proto = &bpf_skc_lookup_tcp_proto;
                break;
#endif
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Look up the helper prototype for @func_id.
 *
 * Only BPF_FUNC_skb_load_bytes gets a dedicated prototype here; every other
 * helper ID is forwarded to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_skb_load_bytes)
                return &bpf_flow_dissector_load_bytes_proto;

        return bpf_sk_base_func_proto(func_id, prog);
}

/* Look up the helper prototype for @func_id.
 *
 * This table is also the fallback of lwt_in_func_proto(),
 * lwt_xmit_func_proto() and lwt_seg6local_func_proto(). Helper IDs that are
 * not listed in the switch are forwarded to bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_load_bytes:
                proto = &bpf_skb_load_bytes_proto;
                break;
        case BPF_FUNC_skb_pull_data:
                proto = &bpf_skb_pull_data_proto;
                break;
        case BPF_FUNC_csum_diff:
                proto = &bpf_csum_diff_proto;
                break;
        case BPF_FUNC_get_cgroup_classid:
                proto = &bpf_get_cgroup_classid_proto;
                break;
        case BPF_FUNC_get_route_realm:
                proto = &bpf_get_route_realm_proto;
                break;
        case BPF_FUNC_get_hash_recalc:
                proto = &bpf_get_hash_recalc_proto;
                break;
        case BPF_FUNC_perf_event_output:
                proto = &bpf_skb_event_output_proto;
                break;
        case BPF_FUNC_get_smp_processor_id:
                proto = &bpf_get_smp_processor_id_proto;
                break;
        case BPF_FUNC_skb_under_cgroup:
                proto = &bpf_skb_under_cgroup_proto;
                break;
        default:
                proto = bpf_sk_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Look up the helper prototype for @func_id.
 *
 * BPF_FUNC_lwt_push_encap resolves to the "in" flavour of the push_encap
 * prototype; every other helper ID is resolved by lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_lwt_push_encap)
                return &bpf_lwt_in_push_encap_proto;

        return lwt_out_func_proto(func_id, prog);
}

/* Look up the helper prototype for @func_id.
 *
 * The two tunnel "set" helpers are resolved through
 * bpf_get_skb_set_tunnel_proto(). Helper IDs that are not listed in the
 * switch are resolved by lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_skb_get_tunnel_key:
                proto = &bpf_skb_get_tunnel_key_proto;
                break;
        case BPF_FUNC_skb_get_tunnel_opt:
                proto = &bpf_skb_get_tunnel_opt_proto;
                break;
        case BPF_FUNC_skb_set_tunnel_key:
        case BPF_FUNC_skb_set_tunnel_opt:
                proto = bpf_get_skb_set_tunnel_proto(func_id);
                break;
        case BPF_FUNC_redirect:
                proto = &bpf_redirect_proto;
                break;
        case BPF_FUNC_clone_redirect:
                proto = &bpf_clone_redirect_proto;
                break;
        case BPF_FUNC_skb_change_tail:
                proto = &bpf_skb_change_tail_proto;
                break;
        case BPF_FUNC_skb_change_head:
                proto = &bpf_skb_change_head_proto;
                break;
        case BPF_FUNC_skb_store_bytes:
                proto = &bpf_skb_store_bytes_proto;
                break;
        case BPF_FUNC_csum_update:
                proto = &bpf_csum_update_proto;
                break;
        case BPF_FUNC_csum_level:
                proto = &bpf_csum_level_proto;
                break;
        case BPF_FUNC_l3_csum_replace:
                proto = &bpf_l3_csum_replace_proto;
                break;
        case BPF_FUNC_l4_csum_replace:
                proto = &bpf_l4_csum_replace_proto;
                break;
        case BPF_FUNC_set_hash_invalid:
                proto = &bpf_set_hash_invalid_proto;
                break;
        case BPF_FUNC_lwt_push_encap:
                proto = &bpf_lwt_xmit_push_encap_proto;
                break;
        default:
                proto = lwt_out_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Look up the helper prototype for @func_id.
 *
 * The three lwt_seg6 helpers are only offered when CONFIG_IPV6_SEG6_BPF is
 * enabled; every other helper ID is resolved by lwt_out_func_proto().
 */
static const struct bpf_func_proto *
lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
        if (func_id == BPF_FUNC_lwt_seg6_store_bytes)
                return &bpf_lwt_seg6_store_bytes_proto;
        if (func_id == BPF_FUNC_lwt_seg6_action)
                return &bpf_lwt_seg6_action_proto;
        if (func_id == BPF_FUNC_lwt_seg6_adjust_srh)
                return &bpf_lwt_seg6_adjust_srh_proto;
#endif
        return lwt_out_func_proto(func_id, prog);
}

/* Common validity check for an access to struct __sk_buff.
 *
 * @off and @size describe the accessed bytes and @type says whether it is a
 * read or a write; @prog is not examined here. Returns true when the access
 * is acceptable and false otherwise. For the sk field the resulting register
 * type is stored in @info, and for fields handled by the default case the
 * full field size is recorded through bpf_ctx_record_field_size().
 *
 * The program-type specific *_is_valid_access() callbacks in this file apply
 * their own restrictions first and then defer to this function.
 */
static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type,
                                    const struct bpf_prog *prog,
                                    struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        /* The access must start inside struct __sk_buff. */
        if (off < 0 || off >= sizeof(struct __sk_buff))
                return false;

        /* The verifier guarantees that size > 0. */
        if (off % size != 0)
                return false;

        switch (off) {
        case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                /* Any aligned size, as long as it does not run past cb[4]. */
                if (off + size > offsetofend(struct __sk_buff, cb[4]))
                        return false;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
                /* Full 32-bit access only, and not one flagged is_ldsx. */
                if (info->is_ldsx || size != size_default)
                        return false;
                break;
        case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
        case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
                /* Address words must be accessed as whole 32-bit values. */
                if (size != size_default)
                        return false;
                break;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                /* flow_keys is never accessible through this check. */
                return false;
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                /* Read-only, full 64-bit access only. */
                if (type == BPF_WRITE || size != sizeof(__u64))
                        return false;
                break;
        case bpf_ctx_range(struct __sk_buff, tstamp):
                /* Full 64-bit access only, for reads and writes alike. */
                if (size != sizeof(__u64))
                        return false;
                break;
        case offsetof(struct __sk_buff, sk):
                /* Read-only 64-bit load yielding a possibly-NULL socket. */
                if (type == BPF_WRITE || size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                break;
        case offsetof(struct __sk_buff, tstamp_type):
                /* Rejected here; tc_cls_act_is_valid_access() handles this
                 * offset itself before it would call into this function.
                 */
                return false;
        case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
                /* Explicitly prohibit access to padding in __sk_buff. */
                return false;
        default:
                /* Only narrow read access allowed for now. */
                if (type == BPF_WRITE) {
                        /* Writes must cover exactly one 32-bit word. */
                        if (size != size_default)
                                return false;
                } else {
                        bpf_ctx_record_field_size(info, size_default);
                        if (!bpf_ctx_narrow_access_ok(off, size, size_default))
                                return false;
                }
        }

        return true;
}

static bool sk_filter_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, data_end):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Validate an access to struct __sk_buff on top of the common rules.
 *
 * The check runs in three steps, after which bpf_skb_is_valid_access() has
 * the final word:
 *  1. hide tc_classid, data_meta and wire_len, and require CAP_BPF (checked
 *     against the program's token) for data and data_end;
 *  2. for writes, accept only mark, priority, cb[0]..cb[4] and, with
 *     CAP_BPF, tstamp;
 *  3. record the packet pointer register type for data and data_end.
 */
static bool cg_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        /* Step 1: hidden fields and capability-gated packet pointers. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (bpf_token_capable(prog->aux->token, CAP_BPF))
                        break;
                return false;
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, wire_len):
                return false;
        default:
                break;
        }

        /* Step 2: write whitelist. */
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, tstamp):
                        if (bpf_token_capable(prog->aux->token, CAP_BPF))
                                break;
                        return false;
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        /* Step 3: type the packet pointers for the verifier. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        default:
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

static bool lwt_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
                return false;
        }

        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Attach type specific access rules for struct bpf_sock.
 *
 *  - bound_dev_if, mark, priority: read and write, but only for the
 *    BPF_CGROUP_INET_SOCK_CREATE and BPF_CGROUP_INET_SOCK_RELEASE attach
 *    types;
 *  - src_ip4: read-only, BPF_CGROUP_INET4_POST_BIND only;
 *  - src_ip6[0..3]: read-only, BPF_CGROUP_INET6_POST_BIND only;
 *  - src_port: read-only, either POST_BIND attach type;
 *  - every other offset: read-only regardless of attach type.
 */
static bool __sock_filter_check_attach_type(int off,
                                            enum bpf_access_type access_type,
                                            enum bpf_attach_type attach_type)
{
        const bool is_read = access_type == BPF_READ;

        switch (off) {
        case offsetof(struct bpf_sock, bound_dev_if):
        case offsetof(struct bpf_sock, mark):
        case offsetof(struct bpf_sock, priority):
                return attach_type == BPF_CGROUP_INET_SOCK_CREATE ||
                       attach_type == BPF_CGROUP_INET_SOCK_RELEASE;
        case bpf_ctx_range(struct bpf_sock, src_ip4):
                return attach_type == BPF_CGROUP_INET4_POST_BIND && is_read;
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
                return attach_type == BPF_CGROUP_INET6_POST_BIND && is_read;
        case bpf_ctx_range(struct bpf_sock, src_port):
                return (attach_type == BPF_CGROUP_INET4_POST_BIND ||
                        attach_type == BPF_CGROUP_INET6_POST_BIND) && is_read;
        default:
                return is_read;
        }
}

/* Validate an access to struct bpf_sock with a reduced field set.
 *
 * Offsets from the start of the type field through the end of the priority
 * field are refused outright; everything else is judged by
 * bpf_sock_is_valid_access().
 */
bool bpf_sock_common_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     struct bpf_insn_access_aux *info)
{
        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock, type, priority):
                return false;
        }

        return bpf_sock_is_valid_access(off, size, type, info);
}

/* Validate the offset, alignment and size of an access to struct bpf_sock.
 *
 * Fields listed in the switch may be accessed narrowly, as far as
 * bpf_ctx_narrow_access_ok() permits; their full width is recorded in @info.
 * dst_port is treated as a 32-bit field for a 4-byte access and as its
 * declared width otherwise. The gap between the end of dst_port and the
 * start of dst_ip4 is never accessible. All remaining offsets require an
 * access of exactly 4 bytes. @type is not examined here.
 */
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
                              struct bpf_insn_access_aux *info)
{
        const int word = sizeof(__u32);
        int width;

        if (off < 0 || off >= sizeof(struct bpf_sock))
                return false;
        if (off % size != 0)
                return false;

        switch (off) {
        case offsetofend(struct bpf_sock, dst_port) ...
             offsetof(struct bpf_sock, dst_ip4) - 1:
                return false;
        case bpf_ctx_range(struct bpf_sock, dst_port):
                width = sizeof_field(struct bpf_sock, dst_port);
                if (size == word)
                        width = word;
                break;
        case offsetof(struct bpf_sock, state):
        case offsetof(struct bpf_sock, family):
        case offsetof(struct bpf_sock, type):
        case offsetof(struct bpf_sock, protocol):
        case offsetof(struct bpf_sock, src_port):
        case offsetof(struct bpf_sock, rx_queue_mapping):
        case bpf_ctx_range(struct bpf_sock, src_ip4):
        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock, dst_ip4):
        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
                width = word;
                break;
        default:
                return size == word;
        }

        bpf_ctx_record_field_size(info, width);
        return bpf_ctx_narrow_access_ok(off, size, width);
}

/* Validate an access to struct bpf_sock: it has to pass the generic
 * bpf_sock_is_valid_access() rules first and, if it does, the rules tied to
 * the program's expected attach type.
 */
static bool sock_filter_is_valid_access(int off, int size,
                                        enum bpf_access_type type,
                                        const struct bpf_prog *prog,
                                        struct bpf_insn_access_aux *info)
{
        return bpf_sock_is_valid_access(off, size, type, info) &&
               __sock_filter_check_attach_type(off, type,
                                               prog->expected_attach_type);
}

/* Prologue generator that emits nothing.
 *
 * No setup is needed ahead of the program for either direct reads or direct
 * writes, so @insn_buf is left untouched and the reported instruction count
 * is zero.
 */
static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
                             const struct bpf_prog *prog)
{
        return 0;
}

/* Emit a prologue that runs bpf_skb_pull_data(skb, 0) on entry when the skb
 * is flagged as cloned.
 *
 * Nothing is emitted unless @direct_write is set. Otherwise the sequence
 * written to @insn_buf tests the cloned bit, calls the pull_data helper if it
 * is set, exits the program with @drop_verdict when the helper fails, and
 * ends with a copy of the program's first instruction (prog->insnsi[0]).
 *
 * Returns the number of instructions written to @insn_buf.
 */
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
                                const struct bpf_prog *prog, int drop_verdict)
{
        struct bpf_insn *insn = insn_buf;

        if (!direct_write)
                return 0;

        /* if (!skb->cloned)
         *       goto start;
         *
         * (Fast-path, otherwise approximation that we might be
         *  a clone, do the rest in helper.)
         */
        *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
        *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
        /* Offset 7 skips the seven instructions below, landing on "start". */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);

        /* ret = bpf_skb_pull_data(skb, 0); */
        /* R6 keeps the ctx pointer across the call; R2 = 0 is the length. */
        *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1);
        *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2);
        *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
                               BPF_FUNC_skb_pull_data);
        /* if (!ret)
         *      goto restore;
         * return drop_verdict;
         */
        *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict);
        *insn++ = BPF_EXIT_INSN();

        /* restore: */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6);
        /* start: */
        *insn++ = prog->insnsi[0];

        return insn - insn_buf;
}

/* Expand the LD_ABS / LD_IND instruction @orig into a helper-call sequence.
 *
 * The sequence written to @insn_buf loads the packet offset into R2 (the
 * immediate for the absolute form, src_reg plus the immediate for the
 * indirect form), copies the context into R1, calls the load helper that
 * matches the access size, and exits the program with R0 = 0 when the helper
 * result is negative.
 *
 * Returns the number of instructions written to @insn_buf.
 */
static int bpf_gen_ld_abs(const struct bpf_insn *orig,
                          struct bpf_insn *insn_buf)
{
        bool indirect = BPF_MODE(orig->code) == BPF_IND;
        struct bpf_insn *insn = insn_buf;

        if (!indirect) {
                *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
        } else {
                *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
                /* The add is only emitted when there is something to add. */
                if (orig->imm)
                        *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
        }
        /* We're guaranteed here that CTX is in R6. */
        *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);

        /* Pick the helper by access width; other sizes emit no call. */
        switch (BPF_SIZE(orig->code)) {
        case BPF_B:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache);
                break;
        case BPF_H:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache);
                break;
        case BPF_W:
                *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache);
                break;
        }

        /* if (R0 >= 0) skip the next two instructions;
         * otherwise: R0 = 0; exit.
         */
        *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2);
        *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
        *insn++ = BPF_EXIT_INSN();

        return insn - insn_buf;
}

/* Prologue generator: the shared unclone prologue with TC_ACT_SHOT as the
 * value the program exits with when the unclone step fails.
 */
static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write,
                               const struct bpf_prog *prog)
{
        const int verdict_on_failure = TC_ACT_SHOT;

        return bpf_unclone_prologue(insn_buf, direct_write, prog,
                                    verdict_on_failure);
}

static bool tc_cls_act_is_valid_access(int off, int size,
                                       enum bpf_access_type type,
                                       const struct bpf_prog *prog,
                                       struct bpf_insn_access_aux *info)
{
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, mark):
                case bpf_ctx_range(struct __sk_buff, tc_index):
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range(struct __sk_buff, tc_classid):
                case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
                case bpf_ctx_range(struct __sk_buff, tstamp):
                case bpf_ctx_range(struct __sk_buff, queue_mapping):
                        break;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        case bpf_ctx_range(struct __sk_buff, data_meta):
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range_till(struct __sk_buff, family, local_port):
                return false;
        case offsetof(struct __sk_buff, tstamp_type):
                /* The convert_ctx_access() on reading and writing
                 * __sk_buff->tstamp depends on whether the bpf prog
                 * has used __sk_buff->tstamp_type or not.
                 * Thus, we need to set prog->tstamp_type_access
                 * earlier during is_valid_access() here.
                 */
                ((struct bpf_prog *)prog)->tstamp_type_access = 1;
                return size == sizeof(__u8);
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/* Lock held around every read of the nfct_btf_struct_access hook below.
 * Both symbols are exported (GPL-only).
 */
DEFINE_MUTEX(nf_conn_btf_access_lock);
EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);

/* Optional hook that decides BTF struct accesses; NULL when not provided.
 * NOTE(review): presumably installed by the conntrack BPF code - confirm.
 */
int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
                              const struct bpf_reg_state *reg,
                              int off, int size);
EXPORT_SYMBOL_GPL(nfct_btf_struct_access);

/* Decide a BTF struct access by delegating to the nfct_btf_struct_access
 * hook under nf_conn_btf_access_lock.
 *
 * Returns the hook's result, or -EACCES when no hook is installed.
 */
static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
                                        const struct bpf_reg_state *reg,
                                        int off, int size)
{
        int err;

        mutex_lock(&nf_conn_btf_access_lock);
        err = nfct_btf_struct_access ?
              nfct_btf_struct_access(log, reg, off, size) : -EACCES;
        mutex_unlock(&nf_conn_btf_access_lock);

        return err;
}

/* Basic shape check for an access to struct xdp_md: the offset has to lie
 * inside the structure and be a multiple of @size, and @size has to be
 * exactly that of a __u32.
 */
static bool __is_valid_xdp_access(int off, int size)
{
        return off >= 0 && off < sizeof(struct xdp_md) &&
               off % size == 0 &&
               size == sizeof(__u32);
}

/* Validate an access to struct xdp_md.
 *
 *  - egress_ifindex is only visible when the expected attach type is
 *    BPF_XDP_DEVMAP;
 *  - the only permitted write is to rx_queue_index, and only for offloaded
 *    programs;
 *  - loads of data, data_meta and data_end must not be flagged is_ldsx and
 *    get their packet pointer register type recorded in @info.
 *
 * Whatever is not refused by these rules still has to pass
 * __is_valid_xdp_access().
 */
static bool xdp_is_valid_access(int off, int size,
                                enum bpf_access_type type,
                                const struct bpf_prog *prog,
                                struct bpf_insn_access_aux *info)
{
        const bool is_devmap = prog->expected_attach_type == BPF_XDP_DEVMAP;

        if (!is_devmap && off == offsetof(struct xdp_md, egress_ifindex))
                return false;

        if (type == BPF_WRITE) {
                if (bpf_prog_is_offloaded(prog->aux) &&
                    off == offsetof(struct xdp_md, rx_queue_index))
                        return __is_valid_xdp_access(off, size);
                return false;
        }

        /* Reads from here on. */
        switch (off) {
        case offsetof(struct xdp_md, data):
                if (info->is_ldsx)
                        return false;
                info->reg_type = PTR_TO_PACKET;
                break;
        case offsetof(struct xdp_md, data_meta):
                if (info->is_ldsx)
                        return false;
                info->reg_type = PTR_TO_PACKET_META;
                break;
        case offsetof(struct xdp_md, data_end):
                if (info->is_ldsx)
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                break;
        default:
                break;
        }

        return __is_valid_xdp_access(off, size);
}

/* Emit a once-only warning about an XDP return value that cannot be acted on.
 *
 * @act values above XDP_REDIRECT are reported as "Illegal", everything else
 * as "Driver unsupported". @dev may be NULL, in which case the device name is
 * printed as "N/A".
 */
void bpf_warn_invalid_xdp_action(const struct net_device *dev,
                                 const struct bpf_prog *prog, u32 act)
{
        const u32 highest_known = XDP_REDIRECT;
        const char *verdict = "Driver unsupported";
        const char *dev_name = "N/A";

        if (act > highest_known)
                verdict = "Illegal";
        if (dev)
                dev_name = dev->name;

        pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
                     verdict, act, prog->aux->name, prog->aux->id, dev_name);
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);

/* Decide a BTF struct access by delegating to the nfct_btf_struct_access
 * hook while holding nf_conn_btf_access_lock.
 *
 * Returns the hook's result, or -EACCES when no hook is installed.
 */
static int xdp_btf_struct_access(struct bpf_verifier_log *log,
                                 const struct bpf_reg_state *reg,
                                 int off, int size)
{
        int err = -EACCES;

        mutex_lock(&nf_conn_btf_access_lock);
        if (!nfct_btf_struct_access)
                goto unlock;
        err = nfct_btf_struct_access(log, reg, off, size);
unlock:
        mutex_unlock(&nf_conn_btf_access_lock);

        return err;
}

/* Validate an access to struct bpf_sock_addr.
 *
 * The check has two stages. The first hides address fields that do not match
 * the program's expected attach type. The second enforces per-field size and
 * read/write rules:
 *  - user_ip4, user_ip6[], msg_src_ip4, msg_src_ip6[] and user_port may be
 *    read narrowly and written as whole 32-bit words; in addition, accesses
 *    accepted by bpf_ctx_wide_access_ok() on user_ip6 / msg_src_ip6 are
 *    allowed in both directions;
 *  - sk may only be read, as a 64-bit value, and yields PTR_TO_SOCKET;
 *  - every other field is read-only and must be read as exactly 32 bits.
 *
 * Returns true when the access is acceptable, false otherwise.
 */
static bool sock_addr_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        const int size_default = sizeof(__u32);

        /* The access must start inside the structure and be aligned. */
        if (off < 0 || off >= sizeof(struct bpf_sock_addr))
                return false;
        if (off % size != 0)
                return false;

        /* Disallow access to fields not belonging to the attach type's address
         * family.
         */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET4_BIND:
                case BPF_CGROUP_INET4_CONNECT:
                case BPF_CGROUP_INET4_GETPEERNAME:
                case BPF_CGROUP_INET4_GETSOCKNAME:
                case BPF_CGROUP_UDP4_SENDMSG:
                case BPF_CGROUP_UDP4_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_INET6_BIND:
                case BPF_CGROUP_INET6_CONNECT:
                case BPF_CGROUP_INET6_GETPEERNAME:
                case BPF_CGROUP_INET6_GETSOCKNAME:
                case BPF_CGROUP_UDP6_SENDMSG:
                case BPF_CGROUP_UDP6_RECVMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
                /* The message source address is exposed to sendmsg only. */
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_UDP4_SENDMSG:
                        break;
                default:
                        return false;
                }
                break;
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                switch (prog->expected_attach_type) {
                case BPF_CGROUP_UDP6_SENDMSG:
                        break;
                default:
                        return false;
                }
                break;
        }

        /* Second stage: size and direction rules per field. */
        switch (off) {
        case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
        case bpf_ctx_range(struct bpf_sock_addr, user_port):
                if (type == BPF_READ) {
                        bpf_ctx_record_field_size(info, size_default);

                        /* Wide loads of the IPv6 arrays are accepted before
                         * the narrow-access rule is applied.
                         */
                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   user_ip6))
                                return true;

                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   msg_src_ip6))
                                return true;

                        if (!bpf_ctx_narrow_access_ok(off, size, size_default))
                                return false;
                } else {
                        /* Writes: wide stores to the IPv6 arrays, or exactly
                         * one 32-bit word.
                         */
                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   user_ip6))
                                return true;

                        if (bpf_ctx_wide_access_ok(off, size,
                                                   struct bpf_sock_addr,
                                                   msg_src_ip6))
                                return true;

                        if (size != size_default)
                                return false;
                }
                break;
        case offsetof(struct bpf_sock_addr, sk):
                /* Read-only 64-bit load yielding a socket pointer. */
                if (type != BPF_READ)
                        return false;
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                break;
        default:
                /* Remaining fields: full 32-bit reads only, no writes. */
                if (type == BPF_READ) {
                        if (size != size_default)
                                return false;
                } else {
                        return false;
                }
        }

        return true;
}

/*
 * Decide whether an access of @size bytes at byte offset @off into
 * struct bpf_sock_ops is allowed for access type @type.
 *
 * Stores are accepted only as 32-bit writes to 'reply' or 'sk_txhash'.
 * Loads must use the width expected for the field at @off; for 'sk',
 * 'skb_data' and 'skb_data_end' the resulting register type is recorded
 * in @info.  'skb_tcp_flags' may also be read with a narrower width.
 * @prog is not consulted.
 *
 * Returns true when the access is permitted, false otherwise.
 */
static bool sock_ops_is_valid_access(int off, int size,
                                     enum bpf_access_type type,
                                     const struct bpf_prog *prog,
                                     struct bpf_insn_access_aux *info)
{
        const int word = sizeof(__u32);

        /* Offsets outside the context structure are never valid. */
        if (off < 0 || off >= sizeof(struct bpf_sock_ops))
                return false;

        /* The verifier guarantees size > 0, so the modulo is well defined. */
        if (off % size != 0)
                return false;

        if (type == BPF_WRITE) {
                switch (off) {
                case offsetof(struct bpf_sock_ops, reply):
                case offsetof(struct bpf_sock_ops, sk_txhash):
                        return size == word;
                default:
                        return false;
                }
        }

        switch (off) {
        case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received,
                                bytes_acked):
        case offsetof(struct bpf_sock_ops, skb_hwtstamp):
                /* Plain 64-bit fields. */
                return size == sizeof(__u64);
        case offsetof(struct bpf_sock_ops, sk):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return true;
        case offsetof(struct bpf_sock_ops, skb_data):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET;
                return true;
        case offsetof(struct bpf_sock_ops, skb_data_end):
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_PACKET_END;
                return true;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                bpf_ctx_record_field_size(info, word);
                return bpf_ctx_narrow_access_ok(off, size, word);
        default:
                /* Every remaining field is read as a full 32-bit word. */
                return size == word;
        }
}

/*
 * Thin wrapper around bpf_unclone_prologue(): forwards all three arguments
 * and supplies SK_DROP as the fourth, returning that call's result unchanged.
 */
static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write,
                           const struct bpf_prog *prog)
{
        int ret;

        ret = bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP);

        return ret;
}

/*
 * Access filter for struct __sk_buff that is applied before deferring to
 * bpf_skb_is_valid_access().
 *
 * The fields tc_classid, data_meta, tstamp, wire_len, hwtstamp and mark are
 * rejected for every access type.  Stores are limited to tc_index and
 * priority.  For accesses to data / data_end that get this far, @info is
 * tagged with PTR_TO_PACKET / PTR_TO_PACKET_END.  Whatever passes these
 * checks is decided by bpf_skb_is_valid_access().
 */
static bool sk_skb_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        /* Fields that are off limits no matter how they are accessed. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, tc_classid):
        case bpf_ctx_range(struct __sk_buff, data_meta):
        case bpf_ctx_range(struct __sk_buff, tstamp):
        case bpf_ctx_range(struct __sk_buff, wire_len):
        case bpf_ctx_range(struct __sk_buff, hwtstamp):
        case bpf_ctx_range(struct __sk_buff, mark):
                return false;
        }

        /* Only tc_index and priority may be stored to. */
        if (type == BPF_WRITE) {
                switch (off) {
                case bpf_ctx_range(struct __sk_buff, priority):
                case bpf_ctx_range(struct __sk_buff, tc_index):
                        break;
                default:
                        return false;
                }
        }

        /* Record the register type produced by the packet pointer fields. */
        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                break;
        case bpf_ctx_range(struct __sk_buff, data):
                info->reg_type = PTR_TO_PACKET;
                break;
        }

        return bpf_skb_is_valid_access(off, size, type, prog, info);
}

/*
 * Decide whether a context access of @size bytes at offset @off into
 * struct sk_msg_md is allowed.
 *
 * All stores are refused.  'data', 'data_end' and 'sk' must be read as
 * 64-bit values and record a register type in @info; the listed address,
 * port, family and size fields must be read as 32-bit values.  Any other
 * offset is refused.  @prog is not consulted.
 */
static bool sk_msg_is_valid_access(int off, int size,
                                   enum bpf_access_type type,
                                   const struct bpf_prog *prog,
                                   struct bpf_insn_access_aux *info)
{
        /* The context is read-only. */
        if (type == BPF_WRITE)
                return false;

        /* The offset has to be a multiple of the access size. */
        if (off % size != 0)
                return false;

        switch (off) {
        case offsetof(struct sk_msg_md, data):
                /* reg_type is written even when the width check fails. */
                info->reg_type = PTR_TO_PACKET;
                return size == sizeof(__u64);
        case offsetof(struct sk_msg_md, data_end):
                /* reg_type is written even when the width check fails. */
                info->reg_type = PTR_TO_PACKET_END;
                return size == sizeof(__u64);
        case offsetof(struct sk_msg_md, sk):
                /* Here reg_type is only written for a valid width. */
                if (size != sizeof(__u64))
                        return false;
                info->reg_type = PTR_TO_SOCKET;
                return true;
        case bpf_ctx_range(struct sk_msg_md, family):
        case bpf_ctx_range(struct sk_msg_md, remote_ip4):
        case bpf_ctx_range(struct sk_msg_md, local_ip4):
        case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct sk_msg_md, remote_port):
        case bpf_ctx_range(struct sk_msg_md, local_port):
        case bpf_ctx_range(struct sk_msg_md, size):
                return size == sizeof(__u32);
        default:
                return false;
        }
}

/*
 * Decide whether a context access of @size bytes at offset @off into
 * struct __sk_buff is allowed here.
 *
 * Only loads of three fields are accepted:
 *   - data and data_end: 32-bit, and not when info->is_ldsx is set;
 *   - flow_keys: 64-bit.
 * On success the matching register type is stored in @info.  Stores and
 * all other offsets are refused.  @prog is not consulted.
 */
static bool flow_dissector_is_valid_access(int off, int size,
                                           enum bpf_access_type type,
                                           const struct bpf_prog *prog,
                                           struct bpf_insn_access_aux *info)
{
        const int word = sizeof(__u32);

        if (off < 0 || off >= sizeof(struct __sk_buff))
                return false;

        if (type == BPF_WRITE)
                return false;

        switch (off) {
        case bpf_ctx_range(struct __sk_buff, data):
                if (!info->is_ldsx && size == word) {
                        info->reg_type = PTR_TO_PACKET;
                        return true;
                }
                break;
        case bpf_ctx_range(struct __sk_buff, data_end):
                if (!info->is_ldsx && size == word) {
                        info->reg_type = PTR_TO_PACKET_END;
                        return true;
                }
                break;
        case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
                if (size == sizeof(__u64)) {
                        info->reg_type = PTR_TO_FLOW_KEYS;
                        return true;
                }
                break;
        default:
                break;
        }

        /* Wrong width, sign-extending load, or a field that is not exposed. */
        return false;
}

/*
 * Rewrite a load of __sk_buff data / data_end / flow_keys into a load of the
 * same-named member of struct bpf_flow_dissector, using the member's own
 * width and keeping the original destination and source registers.
 *
 * Returns the number of instructions placed in @insn_buf: 1 for the three
 * handled offsets, 0 for anything else.  @type, @prog and @target_size are
 * not used.
 */
static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
                                             const struct bpf_insn *si,
                                             struct bpf_insn *insn_buf,
                                             struct bpf_prog *prog,
                                             u32 *target_size)
{
        switch (si->off) {
        case offsetof(struct __sk_buff, data):
                insn_buf[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
                                          si->dst_reg, si->src_reg,
                                          offsetof(struct bpf_flow_dissector, data));
                return 1;

        case offsetof(struct __sk_buff, data_end):
                insn_buf[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
                                          si->dst_reg, si->src_reg,
                                          offsetof(struct bpf_flow_dissector, data_end));
                return 1;

        case offsetof(struct __sk_buff, flow_keys):
                insn_buf[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
                                          si->dst_reg, si->src_reg,
                                          offsetof(struct bpf_flow_dissector, flow_keys));
                return 1;

        default:
                /* Nothing to emit for any other offset. */
                return 0;
        }
}

/*
 * Emit the instructions for a read of __sk_buff->tstamp_type.
 *
 * The byte at SKB_BF_MONO_TC_OFFSET in the skb is loaded into the
 * destination register of @si and masked with SKB_TSTAMP_TYPE_MASK; with
 * __BIG_ENDIAN_BITFIELD the result is additionally shifted right by
 * SKB_TSTAMP_TYPE_RSHIFT.
 *
 * Returns the position in the output buffer after the last instruction
 * written.
 */
static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
                                                     struct bpf_insn *insn)
{
        const __u8 dst = si->dst_reg;
        const __u8 skb = si->src_reg;

        /* The in-kernel clock ids must agree with the BPF-visible ones. */
        BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
        BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
        BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
        BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);

        *insn++ = BPF_LDX_MEM(BPF_B, dst, skb, SKB_BF_MONO_TC_OFFSET);
        *insn++ = BPF_ALU32_IMM(BPF_AND, dst, SKB_TSTAMP_TYPE_MASK);

#ifdef __BIG_ENDIAN_BITFIELD
        *insn++ = BPF_ALU32_IMM(BPF_RSH, dst, SKB_TSTAMP_TYPE_RSHIFT);
#else
        /* No shift is emitted here, so the mask must start at bit 0. */
        BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
#endif

        return insn;
}

/*
 * Emit instructions that leave skb_shinfo(skb) in @dst_reg, where @skb_reg
 * holds the skb pointer.
 *
 * With NET_SKBUFF_DATA_USES_OFFSET three instructions are written and
 * BPF_REG_AX is used as scratch (dst = skb->head + skb->end); otherwise a
 * single load of skb->end is written.
 *
 * Returns the position in the output buffer after the last instruction
 * written.
 */
static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
                                                  struct bpf_insn *insn)
{
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        /* AX = skb->end */
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              BPF_REG_AX, skb_reg,
                              offsetof(struct sk_buff, end));
        /* dst = skb->head */
        insn[1] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
                              dst_reg, skb_reg,
                              offsetof(struct sk_buff, head));
        /* dst += AX */
        insn[2] = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);

        return insn + 3;
#else
        /* dst = skb->end */
        insn[0] = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
                              dst_reg, skb_reg,
                              offsetof(struct sk_buff, end));

        return insn + 1;
#endif
}

/*
 * Emit the instruction sequence for a read of __sk_buff->tstamp.
 *
 * @prog: program being rewritten; when its tstamp_type_access flag is clear
 *        (and CONFIG_NET_XGRESS is enabled) extra instructions are emitted
 *        that make the read yield 0 if both the ingress bit and a
 *        tstamp_type bit are set in the skb.
 * @si:   original load; dst_reg receives the value, src_reg is the skb.
 * @insn: position in the output buffer to start writing at.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
                                                const struct bpf_insn *si,
                                                struct bpf_insn *insn)
{
        __u8 value_reg = si->dst_reg;
        __u8 skb_reg = si->src_reg;

#ifdef CONFIG_NET_XGRESS
        /* If the tstamp_type is read,
         * the bpf prog is aware the tstamp could have delivery time.
         * Thus, read skb->tstamp as is if tstamp_type_access is true.
         */
        if (!prog->tstamp_type_access) {
                /* AX is needed because src_reg and dst_reg could be the same */
                __u8 tmp_reg = BPF_REG_AX;

                *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
                /* check if ingress mask bits is set */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
                /* ingress bit clear: jump to the skb->tstamp load below */
                *insn++ = BPF_JMP_A(4);
                /* ingress bit set: now check the tstamp_type bits */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
                /* tstamp_type bits clear: jump to the skb->tstamp load below */
                *insn++ = BPF_JMP_A(2);
                /* skb->tc_at_ingress && skb->tstamp_type,
                 * read 0 as the (rcv) timestamp.
                 */
                *insn++ = BPF_MOV64_IMM(value_reg, 0);
                /* skip the skb->tstamp load so that the 0 is kept */
                *insn++ = BPF_JMP_A(1);
        }
#endif

        /* value_reg = skb->tstamp */
        *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
                              offsetof(struct sk_buff, tstamp));
        return insn;
}

/*
 * Emit the instruction sequence for a write to __sk_buff->tstamp.
 *
 * @prog: program being rewritten; when its tstamp_type_access flag is clear
 *        (and CONFIG_NET_XGRESS is enabled) extra instructions are emitted
 *        that clear the tstamp_type bits of the skb if its ingress bit is set.
 * @si:   original store; dst_reg is the skb, src_reg / imm carry the value.
 * @insn: position in the output buffer to start writing at.
 *
 * Returns the position just past the last emitted instruction.
 */
static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
                                                 const struct bpf_insn *si,
                                                 struct bpf_insn *insn)
{
        __u8 value_reg = si->src_reg;
        __u8 skb_reg = si->dst_reg;

#ifdef CONFIG_NET_XGRESS
        /* If the tstamp_type is read,
         * the bpf prog is aware the tstamp could have delivery time.
         * Thus, write skb->tstamp as is if tstamp_type_access is true.
         * Otherwise, writing at ingress will have to clear the
         * skb->tstamp_type bit also.
         */
        if (!prog->tstamp_type_access) {
                /* AX serves as scratch for the read-modify-write of the flag byte */
                __u8 tmp_reg = BPF_REG_AX;

                *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
                /* Writing __sk_buff->tstamp as ingress, goto <clear> */
                *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
                /* goto <store> */
                *insn++ = BPF_JMP_A(2);
                /* <clear>: skb->tstamp_type */
                *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
                *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
        }
#endif

        /* <store>: skb->tstamp = tstamp */
        /* The class and immediate of the original instruction are carried over. */
        *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
                               skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
        return insn;
}

/*
 * Build a BPF_MEM-mode store instruction that keeps the instruction class,
 * both register fields and the immediate of the original instruction @si,
 * while using @size as the access size and @off as the offset.
 */
#define BPF_EMIT_STORE(size, si, off)                                        \
        BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM,                \
                     (si)->dst_reg, (si)->src_reg, (off), (si)->imm)

/*
 * Rewrite one access to a struct __sk_buff field into instructions that
 * operate on the underlying struct sk_buff (and objects reachable from it).
 *
 * @type:        BPF_WRITE selects the store form for the fields that have
 *               one; any other value produces the load form.
 * @si:          original instruction; si->off selects the field.
 * @insn_buf:    output buffer for the replacement instructions.
 * @prog:        program being rewritten; cb_access is set when the cb[] area
 *               is touched, and it is passed on to the tstamp helpers.
 * @target_size: set (directly or through bpf_target_off()) for the fields
 *               where the code below does so.
 *
 * Returns the number of instructions written to @insn_buf.  Offsets that
 * match no case emit nothing, so 0 is returned for them.
 */
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct __sk_buff, len):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, len, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, protocol):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, protocol, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, vlan_proto):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_proto, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, priority):
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 bpf_target_off(struct sk_buff, priority, 4,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, priority, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, skb_iif, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, ifindex):
                /* dst = skb->dev; when that is 0 the dereference is skipped
                 * and 0 stays in dst as the result.
                 */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, dev));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct net_device, ifindex, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, hash):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, hash, 4,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, mark):
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 bpf_target_off(struct sk_buff, mark, 4,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, mark, 4,
                                                             target_size));
                break;

        case offsetof(struct __sk_buff, pkt_type):
                /* Load the byte at PKT_TYPE_OFFSET and mask it with
                 * PKT_TYPE_MAX; big-endian bitfield builds also shift.
                 */
                *target_size = 1;
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
                                      PKT_TYPE_OFFSET);
                *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
#endif
                break;

        case offsetof(struct __sk_buff, queue_mapping):
                if (type == BPF_WRITE) {
                        u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);

                        /* An immediate store of NO_QUEUE_MAPPING or more is
                         * replaced by a jump-to-next, i.e. dropped.
                         */
                        if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
                                *insn++ = BPF_JMP_A(0); /* noop */
                                break;
                        }

                        /* For a register store the same bound is checked at
                         * run time: the store is jumped over when it is hit.
                         */
                        if (BPF_CLASS(si->code) == BPF_STX)
                                *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
                        *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
                } else {
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff,
                                                             queue_mapping,
                                                             2, target_size));
                }
                break;

        case offsetof(struct __sk_buff, vlan_present):
                /* dst = skb->vlan_all, then any non-zero value becomes 1 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff,
                                                     vlan_all, 4, target_size));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
                break;

        case offsetof(struct __sk_buff, vlan_tci):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, vlan_tci, 2,
                                                     target_size));
                break;

        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct qdisc_skb_cb, data)) %
                             sizeof(__u64));

                /* Translate the offset within __sk_buff->cb[] into the
                 * offset of qdisc_skb_cb->data inside skb->cb, keeping the
                 * access size of the original instruction.
                 */
                prog->cb_access = 1;
                off  = si->off;
                off -= offsetof(struct __sk_buff, cb[0]);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, data);
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_classid):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);

                off  = si->off;
                off -= offsetof(struct __sk_buff, tc_classid);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, tc_classid);
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
                                              si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, data));
                break;

        case offsetof(struct __sk_buff, data_meta):
                /* read from struct bpf_skb_data_end laid over skb->cb */
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_meta);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_meta);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, data_end):
                /* read from struct bpf_skb_data_end laid over skb->cb */
                off  = si->off;
                off -= offsetof(struct __sk_buff, data_end);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct bpf_skb_data_end, data_end);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
                                      si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_H, si,
                                                 bpf_target_off(struct sk_buff, tc_index, 2,
                                                                target_size));
                else
                        *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                              bpf_target_off(struct sk_buff, tc_index, 2,
                                                             target_size));
#else
                /* Without CONFIG_NET_SCHED a write becomes a register
                 * self-move and a read yields 0.
                 */
                *target_size = 2;
                if (type == BPF_WRITE)
                        *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg);
                else
                        *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, napi_id):
#if defined(CONFIG_NET_RX_BUSY_POLL)
                /* values below MIN_NAPI_ID are reported as 0 */
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct sk_buff, napi_id, 4,
                                                     target_size));
                *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#else
                *target_size = 4;
                *insn++ = BPF_MOV64_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                /* dst = skb->sk, then dst = dst->skc_family */
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_family,
                                                     2, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_daddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_rcv_saddr,
                                                     4, target_size));
                break;
        case offsetof(struct __sk_buff, remote_ip6[0]) ...
             offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                /* off becomes the byte offset within remote_ip6[] */
                off = si->off;
                off -= offsetof(struct __sk_buff, remote_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        case offsetof(struct __sk_buff, local_ip6[0]) ...
             offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                /* off becomes the byte offset within local_ip6[] */
                off = si->off;
                off -= offsetof(struct __sk_buff, local_ip6[0]);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct __sk_buff, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_dport,
                                                     2, target_size));
#ifndef __BIG_ENDIAN_BITFIELD
                /* move the 16-bit value into the upper half of the word */
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct __sk_buff, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct sock_common,
                                                     skc_num, 2, target_size));
                break;

        case offsetof(struct __sk_buff, tstamp):
                BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);

                if (type == BPF_WRITE)
                        insn = bpf_convert_tstamp_write(prog, si, insn);
                else
                        insn = bpf_convert_tstamp_read(prog, si, insn);
                break;

        case offsetof(struct __sk_buff, tstamp_type):
                insn = bpf_convert_tstamp_type_read(si, insn);
                break;

        case offsetof(struct __sk_buff, gso_segs):
                /* dst = skb_shinfo(skb), then dst = dst->gso_segs */
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_segs, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, gso_size):
                /* dst = skb_shinfo(skb), then dst = dst->gso_size */
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     gso_size, 2,
                                                     target_size));
                break;
        case offsetof(struct __sk_buff, wire_len):
                BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);

                /* read qdisc_skb_cb->pkt_len out of skb->cb */
                off = si->off;
                off -= offsetof(struct __sk_buff, wire_len);
                off += offsetof(struct sk_buff, cb);
                off += offsetof(struct qdisc_skb_cb, pkt_len);
                *target_size = 4;
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
                break;

        case offsetof(struct __sk_buff, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_buff, sk));
                break;
        case offsetof(struct __sk_buff, hwtstamp):
                BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
                BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);

                /* The checks above ensure that an 8-byte load at the offset
                 * of skb_shared_info->hwtstamps reads its hwtstamp member.
                 */
                insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_DW,
                                      si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     hwtstamps, 8,
                                                     target_size));
                break;
        }

        return insn - insn_buf;
}

u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
                                const struct bpf_insn *si,
                                struct bpf_insn *insn_buf,
                                struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        case offsetof(struct bpf_sock, bound_dev_if):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_bound_dev_if));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_bound_dev_if));
                break;

        case offsetof(struct bpf_sock, mark):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_mark));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_mark));
                break;

        case offsetof(struct bpf_sock, priority):
                BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);

                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si,
                                                 offsetof(struct sock, sk_priority));
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      offsetof(struct sock, sk_priority));
                break;

        case offsetof(struct bpf_sock, family):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_family),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_family,
                                       sizeof_field(struct sock_common,
                                                    skc_family),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, type):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_type),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_type,
                                       sizeof_field(struct sock, sk_type),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, protocol):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_protocol),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_protocol,
                                       sizeof_field(struct sock, sk_protocol),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, src_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_rcv_saddr,
                                       sizeof_field(struct sock_common,
                                                    skc_rcv_saddr),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_ip4):
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_daddr,
                                       sizeof_field(struct sock_common,
                                                    skc_daddr),
                                       target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                off = si->off;
                off -= offsetof(struct bpf_sock, src_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(
                                struct sock_common,
                                skc_v6_rcv_saddr.s6_addr32[0],
                                sizeof_field(struct sock_common,
                                             skc_v6_rcv_saddr.s6_addr32[0]),
                                target_size) + off);
#else
                (void)off;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                off = si->off;
                off -= offsetof(struct bpf_sock, dst_ip6[0]);
                *insn++ = BPF_LDX_MEM(
                        BPF_SIZE(si->code), si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common,
                                       skc_v6_daddr.s6_addr32[0],
                                       sizeof_field(struct sock_common,
                                                    skc_v6_daddr.s6_addr32[0]),
                                       target_size) + off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                *target_size = 4;
#endif
                break;

        case offsetof(struct bpf_sock, src_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_num),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_num,
                                       sizeof_field(struct sock_common,
                                                    skc_num),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, dst_port):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_dport,
                                       sizeof_field(struct sock_common,
                                                    skc_dport),
                                       target_size));
                break;

        case offsetof(struct bpf_sock, state):
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock_common, skc_state),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock_common, skc_state,
                                       sizeof_field(struct sock_common,
                                                    skc_state),
                                       target_size));
                break;
        case offsetof(struct bpf_sock, rx_queue_mapping):
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
                        si->dst_reg, si->src_reg,
                        bpf_target_off(struct sock, sk_rx_queue_mapping,
                                       sizeof_field(struct sock,
                                                    sk_rx_queue_mapping),
                                       target_size));
                *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
                                      1);
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
#else
                *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
                *target_size = 2;
#endif
                break;
        }

        return insn - insn_buf;
}

/* Context-access rewriter for tc classifier/action programs.
 *
 * Only an access at offsetof(struct __sk_buff, ifindex) is translated here;
 * any other offset is handed to bpf_convert_ctx_access() unchanged and its
 * result is returned as is.
 *
 * For ifindex two loads are emitted: the first reads the 'dev' pointer at
 * offsetof(struct sk_buff, dev) relative to si->src_reg into si->dst_reg,
 * the second reads net_device::ifindex (BPF_W) through that pointer.
 *
 * Returns the number of instructions written to insn_buf.
 */
static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type,
                                         const struct bpf_insn *si,
                                         struct bpf_insn *insn_buf,
                                         struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *out = insn_buf;

        /* Everything except ifindex goes through the generic converter. */
        if (si->off != offsetof(struct __sk_buff, ifindex))
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);

        /* dst_reg = pointer stored in the sk_buff 'dev' member */
        *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
                             si->dst_reg, si->src_reg,
                             offsetof(struct sk_buff, dev));

        /* dst_reg = 32-bit ifindex read through that pointer */
        *out++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                             bpf_target_off(struct net_device, ifindex, 4,
                                            target_size));

        return out - insn_buf;
}

/* Context-access rewriter for XDP programs.
 *
 * si->off selects a struct xdp_md member; the access is replaced by one or
 * more loads that start at si->src_reg, which is dereferenced here with
 * struct xdp_buff offsets, and leave the result in si->dst_reg.
 *
 * type, prog and target_size are not consulted by this converter.
 *
 * Returns the number of instructions written to insn_buf, which is 0 when
 * si->off matches none of the members handled below.
 */
static u32 xdp_convert_ctx_access(enum bpf_access_type type,
                                  const struct bpf_insn *si,
                                  struct bpf_insn *insn_buf,
                                  struct bpf_prog *prog, u32 *target_size)
{
        const int dst_reg = si->dst_reg;
        const int ctx_reg = si->src_reg;
        struct bpf_insn *out = insn_buf;

        switch (si->off) {
        case offsetof(struct xdp_md, data):
                /* dst = xdp_buff 'data' member */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data),
                                     dst_reg, ctx_reg,
                                     offsetof(struct xdp_buff, data));
                break;

        case offsetof(struct xdp_md, data_meta):
                /* dst = xdp_buff 'data_meta' member */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_meta),
                                     dst_reg, ctx_reg,
                                     offsetof(struct xdp_buff, data_meta));
                break;

        case offsetof(struct xdp_md, data_end):
                /* dst = xdp_buff 'data_end' member */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, data_end),
                                     dst_reg, ctx_reg,
                                     offsetof(struct xdp_buff, data_end));
                break;

        case offsetof(struct xdp_md, ingress_ifindex):
                /* Three-step chain: rxq pointer, then its dev pointer, then
                 * the 32-bit ifindex of that device.
                 */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
                                     dst_reg, ctx_reg,
                                     offsetof(struct xdp_buff, rxq));
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev),
                                     dst_reg, dst_reg,
                                     offsetof(struct xdp_rxq_info, dev));
                *out++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
                                     offsetof(struct net_device, ifindex));
                break;

        case offsetof(struct xdp_md, rx_queue_index):
                /* rxq pointer first, then its 32-bit queue_index. */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq),
                                     dst_reg, ctx_reg,
                                     offsetof(struct xdp_buff, rxq));
                *out++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
                                     offsetof(struct xdp_rxq_info,
                                              queue_index));
                break;

        case offsetof(struct xdp_md, egress_ifindex):
                /* Same chain as ingress_ifindex, but starting from txq. */
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
                                     dst_reg, ctx_reg,
                                     offsetof(struct xdp_buff, txq));
                *out++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
                                     dst_reg, dst_reg,
                                     offsetof(struct xdp_txq_info, dev));
                *out++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg,
                                     offsetof(struct net_device, ifindex));
                break;
        }

        return out - insn_buf;
}

/* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() loads Nested Field S.F.NF where S is
 * the type of the context Structure, F is the Field in the context structure
 * that contains a pointer to a Nested Structure of type NS, and NF is the
 * field of NS to read.
 *
 * Two instructions are appended through 'insn': the first loads S.F
 * (relative to si->src_reg) into si->dst_reg, the second loads NF through
 * that pointer and leaves the value in si->dst_reg.
 *
 * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to the caller to
 * make sure that SIZE is not greater than the actual size of S.F.NF.
 *
 * If offset OFF is provided, the load happens from that offset relative to
 * the offset of NF.
 *
 * The macro relies on 'insn', 'si' and 'target_size' being in scope at the
 * expansion site.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF)               \
        do {                                                                       \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg,     \
                                      si->src_reg, offsetof(S, F));               \
                *insn++ = BPF_LDX_MEM(                                               \
                        SIZE, si->dst_reg, si->dst_reg,                               \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                + OFF);                                               \
        } while (0)

/* SOCK_ADDR_LOAD_NESTED_FIELD() is SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF()
 * with the load size taken from BPF_FIELD_SIZEOF(NS, NF) and an extra
 * offset of 0.
 */
#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF)                               \
        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF,                       \
                                             BPF_FIELD_SIZEOF(NS, NF), 0)

/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for the store operation.
 *
 * In addition it uses Temporary Field TF (member of struct S) as the 3rd
 * "register" since the two registers available in convert_ctx_access are not
 * enough: we can override neither SRC, since it contains the value to store,
 * nor DST, since it contains the pointer to context that may be used by later
 * instructions. But we need a temporary place to save the pointer to the
 * nested structure whose field we want to store to.
 *
 * The scratch register starts at BPF_REG_9 and is decremented (at most
 * twice) until it differs from both si->src_reg and si->dst_reg. Four
 * instructions are then appended through 'insn':
 *   1. save the scratch register into S.TF (relative to si->dst_reg);
 *   2. load the pointer S.F into the scratch register;
 *   3. store to NF (+ OFF) through the scratch register, reusing the
 *      instruction class of si->code, si->src_reg and si->imm;
 *   4. reload the scratch register from S.TF.
 *
 * The macro relies on 'insn', 'si' and 'target_size' being in scope at the
 * expansion site.
 */
#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF)               \
        do {                                                                       \
                int tmp_reg = BPF_REG_9;                                       \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg)               \
                        --tmp_reg;                                               \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg,               \
                                      offsetof(S, TF));                               \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg,               \
                                      si->dst_reg, offsetof(S, F));               \
                *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code),   \
                                       tmp_reg, si->src_reg,                       \
                        bpf_target_off(NS, NF, sizeof_field(NS, NF),               \
                                       target_size)                               \
                                       + OFF,                                       \
                                       si->imm);                               \
                *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg,               \
                                      offsetof(S, TF));                               \
        } while (0)

/* SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF() picks the store or the load
 * variant depending on 'type', which must be in scope at the expansion site:
 * BPF_WRITE expands SOCK_ADDR_STORE_NESTED_FIELD_OFF(), anything else expands
 * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(). TF is only used by the store
 * variant.
 */
#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \
                                                      TF)                       \
        do {                                                                       \
                if (type == BPF_WRITE) {                                       \
                        SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE,   \
                                                         OFF, TF);               \
                } else {                                                       \
                        SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(                       \
                                S, NS, F, NF, SIZE, OFF);  \
                }                                                               \
        } while (0)

/* Context-access rewriter for programs whose context is struct bpf_sock_addr.
 *
 * si->off selects the bpf_sock_addr member being accessed. Each handled
 * member is redirected to a field reached through a pointer stored in
 * struct bpf_sock_addr_kern (uaddr, sk or t_ctx), using the SOCK_ADDR_*
 * helper macros; those macros append to 'insn' and read 'si', 'type' and
 * 'target_size' from this scope. Members converted with the LOAD_OR_STORE
 * variant honour 'type' (BPF_WRITE emits a store); the others are always
 * converted to loads.
 *
 * Returns the number of instructions written to insn_buf, which is 0 when
 * si->off matches none of the members handled below.
 */
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog, u32 *target_size)
{
        int port_size = sizeof_field(struct sockaddr_in6, sin6_port);
        struct bpf_insn *insn = insn_buf;
        int off;

        switch (si->off) {
        /* Fields read through the uaddr pointer. */
        case offsetof(struct bpf_sock_addr, user_family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sockaddr,
                                            uaddr, sa_family);
                break;

        case offsetof(struct bpf_sock_addr, user_ip4):
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in,
                        uaddr, sin_addr,
                        BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]):
                /* Byte distance of the access from the start of user_ip6. */
                off = si->off - offsetof(struct bpf_sock_addr, user_ip6[0]);
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6,
                        uaddr, sin6_addr.s6_addr32[0],
                        BPF_SIZE(si->code), off, tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, user_port):
                /* The port lives in either a sockaddr_in or a sockaddr_in6
                 * depending on sa_family. Rather than testing the family,
                 * rely on the port having the same offset and size in both
                 * layouts; the two build-time checks below enforce that, so
                 * the sockaddr_in6 layout can be used unconditionally.
                 */
                BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
                             offsetof(struct sockaddr_in6, sin6_port));
                BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
                             sizeof_field(struct sockaddr_in6, sin6_port));
                /* user_port may be wider than sin6_port: cap the access at
                 * the size of sin6_port.
                 */
                port_size = min(port_size, BPF_LDST_BYTES(si));
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct sockaddr_in6,
                        uaddr, sin6_port,
                        bytes_to_bpf_size(port_size), 0, tmp_reg);
                break;

        /* Fields read through the sk pointer. */
        case offsetof(struct bpf_sock_addr, family):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock,
                                            sk, sk_family);
                break;

        case offsetof(struct bpf_sock_addr, type):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock,
                                            sk, sk_type);
                break;

        case offsetof(struct bpf_sock_addr, protocol):
                SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
                                            struct sock,
                                            sk, sk_protocol);
                break;

        /* Fields accessed through the t_ctx pointer. */
        case offsetof(struct bpf_sock_addr, msg_src_ip4):
                /* For msg_src_ip4, t_ctx is interpreted as struct in_addr. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in_addr,
                        t_ctx, s_addr,
                        BPF_SIZE(si->code), 0, tmp_reg);
                break;

        case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
                                msg_src_ip6[3]):
                /* Byte distance of the access from the start of msg_src_ip6. */
                off = si->off - offsetof(struct bpf_sock_addr, msg_src_ip6[0]);
                /* For msg_src_ip6, t_ctx is interpreted as struct in6_addr. */
                SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
                        struct bpf_sock_addr_kern, struct in6_addr,
                        t_ctx, s6_addr32[0],
                        BPF_SIZE(si->code), off, tmp_reg);
                break;

        case offsetof(struct bpf_sock_addr, sk):
                /* Plain load of the sk pointer itself. */
                *insn++ = BPF_LDX_MEM(
                        BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
                        si->dst_reg, si->src_reg,
                        offsetof(struct bpf_sock_addr_kern, sk));
                break;
        }

        return insn - insn_buf;
}

static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
                                       const struct bpf_insn *si,
                                       struct bpf_insn *insn_buf,
                                       struct bpf_prog *prog,
                                       u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
        int off;

/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                              \
        do {                                                                      \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;     \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                      \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == si->src_reg) {                              \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,              \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                              \
                        fullsock_reg = reg;                                      \
                        jmp += 2;                                              \
                }                                                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_locked_tcp_sock),              \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_locked_tcp_sock));              \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);              \
                if (si->dst_reg == si->src_reg)                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ,                      \
                                                       OBJ_FIELD),              \
                                      si->dst_reg, si->dst_reg,                      \
                                      offsetof(OBJ, OBJ_FIELD));              \
                if (si->dst_reg == si->src_reg)        {                              \
                        *insn++ = BPF_JMP_A(1);                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                }                                                              \
        } while (0)

#define SOCK_OPS_GET_SK()                                                              \
        do {                                                                      \
                int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1;     \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == si->src_reg) {                              \
                        *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,              \
                                          offsetof(struct bpf_sock_ops_kern,  \
                                          temp));                              \
                        fullsock_reg = reg;                                      \
                        jmp += 2;                                              \
                }                                                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_fullsock),                      \
                                      fullsock_reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_fullsock));                      \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);              \
                if (si->dst_reg == si->src_reg)                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      si->dst_reg, si->src_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                if (si->dst_reg == si->src_reg)        {                              \
                        *insn++ = BPF_JMP_A(1);                                      \
                        *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                      temp));                                      \
                }                                                              \
        } while (0)

#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
                SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)

/* Helper macro for adding write access to tcp_sock or sock fields.
 * The macro is called with two registers, dst_reg which contains a pointer
 * to ctx (context) and src_reg which contains the value that should be
 * stored. However, we need an additional register since we cannot overwrite
 * dst_reg because it may be used later in the program.
 * Instead we "borrow" one of the other registers. We first save its value
 * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
 * it at the end of the macro.
 */
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ)                              \
        do {                                                                      \
                int reg = BPF_REG_9;                                              \
                BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) >                      \
                             sizeof_field(struct bpf_sock_ops, BPF_FIELD));   \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                if (si->dst_reg == reg || si->src_reg == reg)                      \
                        reg--;                                                      \
                *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg,                      \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                              \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern,     \
                                                is_locked_tcp_sock),              \
                                      reg, si->dst_reg,                              \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               is_locked_tcp_sock));              \
                *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2);                      \
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(                              \
                                                struct bpf_sock_ops_kern, sk),\
                                      reg, si->dst_reg,                              \
                                      offsetof(struct bpf_sock_ops_kern, sk));\
                *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) |     \
                                       BPF_MEM | BPF_CLASS(si->code),              \
                                       reg, si->src_reg,                      \
                                       offsetof(OBJ, OBJ_FIELD),              \
                                       si->imm);                              \
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg,                      \
                                      offsetof(struct bpf_sock_ops_kern,      \
                                               temp));                              \
        } while (0)

#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE)              \
        do {                                                                      \
                if (TYPE == BPF_WRITE)                                              \
                        SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);              \
                else                                                              \
                        SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);              \
        } while (0)

        switch (si->off) {
        case offsetof(struct bpf_sock_ops, op):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       op),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, op));
                break;

        case offsetof(struct bpf_sock_ops, replylong[0]) ...
             offsetof(struct bpf_sock_ops, replylong[3]):
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
                             sizeof_field(struct bpf_sock_ops_kern, reply));
                BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
                             sizeof_field(struct bpf_sock_ops_kern, replylong));
                off = si->off;
                off -= offsetof(struct bpf_sock_ops, replylong[0]);
                off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
                if (type == BPF_WRITE)
                        *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
                else
                        *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                              off);
                break;

        case offsetof(struct bpf_sock_ops, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct bpf_sock_ops, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
             offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
             offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct bpf_sock_ops, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct bpf_sock_ops, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct bpf_sock_ops, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct bpf_sock_ops, is_fullsock):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern,
                                                is_fullsock),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               is_fullsock));
                break;

        case offsetof(struct bpf_sock_ops, state):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_state));
                break;

        case offsetof(struct bpf_sock_ops, rtt_min):
                BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
                             sizeof(struct minmax));
                BUILD_BUG_ON(sizeof(struct minmax) <
                             sizeof(struct minmax_sample));

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct bpf_sock_ops_kern, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct tcp_sock, rtt_min) +
                                      sizeof_field(struct minmax_sample, t));
                break;

        case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
                SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
                                   struct tcp_sock);
                break;

        case offsetof(struct bpf_sock_ops, sk_txhash):
                SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
                                          struct sock, type);
                break;
        case offsetof(struct bpf_sock_ops, snd_cwnd):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
                break;
        case offsetof(struct bpf_sock_ops, srtt_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
                break;
        case offsetof(struct bpf_sock_ops, snd_ssthresh):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
                break;
        case offsetof(struct bpf_sock_ops, rcv_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_nxt):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
                break;
        case offsetof(struct bpf_sock_ops, snd_una):
                SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
                break;
        case offsetof(struct bpf_sock_ops, mss_cache):
                SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
                break;
        case offsetof(struct bpf_sock_ops, ecn_flags):
                SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
                break;
        case offsetof(struct bpf_sock_ops, rate_delivered):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
                break;
        case offsetof(struct bpf_sock_ops, rate_interval_us):
                SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
                break;
        case offsetof(struct bpf_sock_ops, packets_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
                break;
        case offsetof(struct bpf_sock_ops, retrans_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
                break;
        case offsetof(struct bpf_sock_ops, total_retrans):
                SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
                break;
        case offsetof(struct bpf_sock_ops, segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_in):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
                break;
        case offsetof(struct bpf_sock_ops, segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
                break;
        case offsetof(struct bpf_sock_ops, data_segs_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
                break;
        case offsetof(struct bpf_sock_ops, lost_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
                break;
        case offsetof(struct bpf_sock_ops, sacked_out):
                SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
                break;
        case offsetof(struct bpf_sock_ops, bytes_received):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
                break;
        case offsetof(struct bpf_sock_ops, bytes_acked):
                SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
                break;
        case offsetof(struct bpf_sock_ops, sk):
                SOCK_OPS_GET_SK();
                break;
        case offsetof(struct bpf_sock_ops, skb_data_end):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb_data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb_data_end));
                break;
        case offsetof(struct bpf_sock_ops, skb_data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, data));
                break;
        case offsetof(struct bpf_sock_ops, skb_len):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
                                      si->dst_reg, si->dst_reg,
                                      offsetof(struct sk_buff, len));
                break;
        case offsetof(struct bpf_sock_ops, skb_tcp_flags):
                off = offsetof(struct sk_buff, cb);
                off += offsetof(struct tcp_skb_cb, tcp_flags);
                *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
                                                       tcp_flags),
                                      si->dst_reg, si->dst_reg, off);
                break;
        case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
                struct bpf_insn *jmp_on_null_skb;

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
                                                       skb),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sock_ops_kern,
                                               skb));
                /* Reserve one insn to test skb == NULL */
                jmp_on_null_skb = insn++;
                insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
                *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
                                      bpf_target_off(struct skb_shared_info,
                                                     hwtstamps, 8,
                                                     target_size));
                *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
                                               insn - jmp_on_null_skb - 1);
                break;
        }
        }
        return insn - insn_buf;
}

/* data_end = skb->data + skb_headlen() */
static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
                                                    struct bpf_insn *insn)
{
        int reg;
        int temp_reg_off = offsetof(struct sk_buff, cb) +
                           offsetof(struct sk_skb_cb, temp_reg);

        if (si->src_reg == si->dst_reg) {
                /* We need an extra register, choose and save a register. */
                reg = BPF_REG_9;
                if (si->src_reg == reg || si->dst_reg == reg)
                        reg--;
                if (si->src_reg == reg || si->dst_reg == reg)
                        reg--;
                *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
        } else {
                reg = si->dst_reg;
        }

        /* reg = skb->data */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
                              reg, si->src_reg,
                              offsetof(struct sk_buff, data));
        /* AX = skb->len */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
                              BPF_REG_AX, si->src_reg,
                              offsetof(struct sk_buff, len));
        /* reg = skb->data + skb->len */
        *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
        /* AX = skb->data_len */
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
                              BPF_REG_AX, si->src_reg,
                              offsetof(struct sk_buff, data_len));

        /* reg = skb->data + skb->len - skb->data_len */
        *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);

        if (si->src_reg == si->dst_reg) {
                /* Restore the saved register */
                *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
                *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
                *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
        }

        return insn;
}

/*
 * Context-access rewriter used by sk_skb_verifier_ops.
 *
 * Two regions of struct __sk_buff are handled locally:
 *   - data_end, computed by bpf_convert_data_end_access();
 *   - cb[0]..cb[4], redirected to the data area of struct sk_skb_cb that
 *     lives inside skb->cb (loads and stores both supported).
 * Every other offset is handed to bpf_convert_ctx_access().
 *
 * Returns the number of instructions placed in @insn_buf.
 */
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *out = insn_buf;
        int cb_off;

        switch (si->off) {
        case offsetof(struct __sk_buff, cb[0]) ...
             offsetofend(struct __sk_buff, cb[4]) - 1:
                /* The backing area must hold at least 20 bytes ... */
                BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
                /* ... and must start on a sizeof(__u64) boundary in sk_buff. */
                BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
                              offsetof(struct sk_skb_cb, data)) %
                             sizeof(__u64));

                prog->cb_access = 1;

                /* Translate the __sk_buff-relative offset to an sk_buff one. */
                cb_off = si->off
                         - (int)offsetof(struct __sk_buff, cb[0])
                         + (int)offsetof(struct sk_buff, cb)
                         + (int)offsetof(struct sk_skb_cb, data);

                if (type != BPF_WRITE)
                        *out++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
                                             si->src_reg, cb_off);
                else
                        *out++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si,
                                                cb_off);
                break;

        case offsetof(struct __sk_buff, data_end):
                out = bpf_convert_data_end_access(si, out);
                break;

        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
                                              target_size);
        }

        return out - insn_buf;
}

static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;
#if IS_ENABLED(CONFIG_IPV6)
        int off;
#endif

        /* convert ctx uses the fact sg element is first in struct */
        BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

        switch (si->off) {
        case offsetof(struct sk_msg_md, data):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, data));
                break;
        case offsetof(struct sk_msg_md, data_end):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, data_end));
                break;
        case offsetof(struct sk_msg_md, family):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_family));
                break;

        case offsetof(struct sk_msg_md, remote_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_daddr));
                break;

        case offsetof(struct sk_msg_md, local_ip4):
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_rcv_saddr) != 4);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                              struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_rcv_saddr));
                break;

        case offsetof(struct sk_msg_md, remote_ip6[0]) ...
             offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_daddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct sk_msg_md, remote_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_daddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, local_ip6[0]) ...
             offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
                BUILD_BUG_ON(sizeof_field(struct sock_common,
                                          skc_v6_rcv_saddr.s6_addr32[0]) != 4);

                off = si->off;
                off -= offsetof(struct sk_msg_md, local_ip6[0]);
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common,
                                               skc_v6_rcv_saddr.s6_addr32[0]) +
                                      off);
#else
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;

        case offsetof(struct sk_msg_md, remote_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
                *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16);
#endif
                break;

        case offsetof(struct sk_msg_md, local_port):
                BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);

                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
                                                struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
                                      offsetof(struct sock_common, skc_num));
                break;

        case offsetof(struct sk_msg_md, size):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg_sg, size));
                break;

        case offsetof(struct sk_msg_md, sk):
                *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
                                      si->dst_reg, si->src_reg,
                                      offsetof(struct sk_msg, sk));
                break;
        }

        return insn - insn_buf;
}

/*
 * Verifier callbacks for the sk_filter program flavour.  Context accesses
 * go through the shared bpf_convert_ctx_access(); bpf_gen_ld_abs is wired
 * in as the gen_ld_abs hook.
 */
const struct bpf_verifier_ops sk_filter_verifier_ops = {
        .get_func_proto     = sk_filter_func_proto,
        .is_valid_access    = sk_filter_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
        .gen_ld_abs         = bpf_gen_ld_abs,
};

/* Program ops for sk_filter: only test_run is set, to bpf_prog_test_run_skb. */
const struct bpf_prog_ops sk_filter_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for the tc_cls_act program flavour.  Besides its own
 * access checks and context rewriter it supplies a prologue generator, the
 * bpf_gen_ld_abs hook and a BTF struct-access callback.
 */
const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
        .get_func_proto     = tc_cls_act_func_proto,
        .is_valid_access    = tc_cls_act_is_valid_access,
        .convert_ctx_access = tc_cls_act_convert_ctx_access,
        .gen_prologue       = tc_cls_act_prologue,
        .gen_ld_abs         = bpf_gen_ld_abs,
        .btf_struct_access  = tc_cls_act_btf_struct_access,
};

/* Program ops for tc_cls_act: only test_run is set, to bpf_prog_test_run_skb. */
const struct bpf_prog_ops tc_cls_act_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for the xdp program flavour.  The prologue hook is
 * bpf_noop_prologue; a BTF struct-access callback is provided as well.
 */
const struct bpf_verifier_ops xdp_verifier_ops = {
        .get_func_proto     = xdp_func_proto,
        .is_valid_access    = xdp_is_valid_access,
        .convert_ctx_access = xdp_convert_ctx_access,
        .gen_prologue       = bpf_noop_prologue,
        .btf_struct_access  = xdp_btf_struct_access,
};

/* Program ops for xdp: only test_run is set, to bpf_prog_test_run_xdp. */
const struct bpf_prog_ops xdp_prog_ops = {
        .test_run                = bpf_prog_test_run_xdp,
};

/*
 * Verifier callbacks for the cg_skb program flavour; context accesses go
 * through the shared bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops cg_skb_verifier_ops = {
        .get_func_proto     = cg_skb_func_proto,
        .is_valid_access    = cg_skb_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for cg_skb: only test_run is set, to bpf_prog_test_run_skb. */
const struct bpf_prog_ops cg_skb_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for the lwt_in program flavour.  Access validation is
 * the common lwt_is_valid_access(); context accesses go through the shared
 * bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops lwt_in_verifier_ops = {
        .get_func_proto     = lwt_in_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for lwt_in: only test_run is set, to bpf_prog_test_run_skb. */
const struct bpf_prog_ops lwt_in_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for the lwt_out program flavour.  Access validation is
 * the common lwt_is_valid_access(); context accesses go through the shared
 * bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops lwt_out_verifier_ops = {
        .get_func_proto     = lwt_out_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for lwt_out: only test_run is set, to bpf_prog_test_run_skb. */
const struct bpf_prog_ops lwt_out_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for the lwt_xmit program flavour.  Same shape as the
 * other lwt tables, plus tc_cls_act_prologue as the prologue generator.
 */
const struct bpf_verifier_ops lwt_xmit_verifier_ops = {
        .get_func_proto     = lwt_xmit_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
        .gen_prologue       = tc_cls_act_prologue,
};

/* Program ops for lwt_xmit: only test_run is set, to bpf_prog_test_run_skb. */
const struct bpf_prog_ops lwt_xmit_prog_ops = {
        .test_run                = bpf_prog_test_run_skb,
};

/*
 * Verifier callbacks for the lwt_seg6local program flavour.  Access
 * validation is the common lwt_is_valid_access(); context accesses go
 * through the shared bpf_convert_ctx_access().
 */
const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
        .get_func_proto     = lwt_seg6local_func_proto,
        .is_valid_access    = lwt_is_valid_access,
        .convert_ctx_access = bpf_convert_ctx_access,
};

/* Program ops for lwt_seg6local: intentionally empty, no callbacks are set. */
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
};

/*
 * Verifier callbacks for the cg_sock program flavour, built from the
 * sock_filter_* helpers and bpf_sock_convert_ctx_access().
 */
const struct bpf_verifier_ops cg_sock_verifier_ops = {
        .get_func_proto     = sock_filter_func_proto,
        .is_valid_access    = sock_filter_is_valid_access,
        .convert_ctx_access = bpf_sock_convert_ctx_access,
};

/* Program ops for cg_sock: intentionally empty, no callbacks are set. */
const struct bpf_prog_ops cg_sock_prog_ops = {
};

/* Verifier callbacks for the cg_sock_addr program flavour. */
const struct bpf_verifier_ops cg_sock_addr_verifier_ops = {
        .get_func_proto     = sock_addr_func_proto,
        .is_valid_access    = sock_addr_is_valid_access,
        .convert_ctx_access = sock_addr_convert_ctx_access,
};

/* Program ops for cg_sock_addr: intentionally empty, no callbacks are set. */
const struct bpf_prog_ops cg_sock_addr_prog_ops = {
};

/* Verifier callbacks for the sock_ops program flavour. */
const struct bpf_verifier_ops sock_ops_verifier_ops = {
        .get_func_proto     = sock_ops_func_proto,
        .is_valid_access    = sock_ops_is_valid_access,
        .convert_ctx_access = sock_ops_convert_ctx_access,
};

/* Program ops for sock_ops: intentionally empty, no callbacks are set. */
const struct bpf_prog_ops sock_ops_prog_ops = {
};

/*
 * Verifier callbacks for the sk_skb program flavour, including its own
 * context rewriter and the sk_skb_prologue prologue generator.
 */
const struct bpf_verifier_ops sk_skb_verifier_ops = {
        .get_func_proto     = sk_skb_func_proto,
        .is_valid_access    = sk_skb_is_valid_access,
        .convert_ctx_access = sk_skb_convert_ctx_access,
        .gen_prologue       = sk_skb_prologue,
};

/* Program ops for sk_skb: intentionally empty, no callbacks are set. */
const struct bpf_prog_ops sk_skb_prog_ops = {
};

/*
 * Verifier callbacks for the sk_msg program flavour.  The prologue hook is
 * bpf_noop_prologue.
 */
const struct bpf_verifier_ops sk_msg_verifier_ops = {
        .get_func_proto     = sk_msg_func_proto,
        .is_valid_access    = sk_msg_is_valid_access,
        .convert_ctx_access = sk_msg_convert_ctx_access,
        .gen_prologue       = bpf_noop_prologue,
};

/* Program ops for sk_msg: intentionally empty, no callbacks are set. */
const struct bpf_prog_ops sk_msg_prog_ops = {
};

/* Verifier callbacks for the flow_dissector program flavour. */
const struct bpf_verifier_ops flow_dissector_verifier_ops = {
        .get_func_proto     = flow_dissector_func_proto,
        .is_valid_access    = flow_dissector_is_valid_access,
        .convert_ctx_access = flow_dissector_convert_ctx_access,
};

/*
 * Program ops for flow_dissector: only test_run is set, to
 * bpf_prog_test_run_flow_dissector.
 */
const struct bpf_prog_ops flow_dissector_prog_ops = {
        .test_run                = bpf_prog_test_run_flow_dissector,
};

/*
 * Remove the filter currently attached to @sk, if any.
 *
 * sk->sk_filter is read with rcu_dereference_protected() under the
 * lockdep_sock_is_held() condition, i.e. the access is asserted to be
 * protected by the socket lock.
 *
 * Returns 0 when a filter was detached and uncharged, -ENOENT when none
 * was attached, and -EPERM when the socket has SOCK_FILTER_LOCKED set.
 */
int sk_detach_filter(struct sock *sk)
{
        struct sk_filter *old;

        if (sock_flag(sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        old = rcu_dereference_protected(sk->sk_filter,
                                        lockdep_sock_is_held(sk));
        if (!old)
                return -ENOENT;

        /* Unpublish first, then drop the socket's charge for the filter. */
        RCU_INIT_POINTER(sk->sk_filter, NULL);
        sk_filter_uncharge(sk, old);

        return 0;
}
EXPORT_SYMBOL_GPL(sk_detach_filter);

/*
 * Copy the originally attached classic filter of @sk to @optval.
 *
 * @sk:     socket to query; locked via sockopt_lock_sock() for the duration.
 * @optval: destination for the filter blocks.
 * @len:    capacity of @optval counted in filter blocks; 0 means the caller
 *          only wants to learn how many blocks there are.
 *
 * Returns:
 *   0        no filter is attached;
 *   > 0      number of filter blocks (also returned for the @len == 0 query);
 *   -EACCES  the attached program has no original program to dump;
 *   -EINVAL  @len is smaller than the number of blocks;
 *   -EFAULT  copying to @optval failed.
 */
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
        struct sock_fprog_kern *orig;
        struct sk_filter *filter;
        int ret;

        sockopt_lock_sock(sk);

        filter = rcu_dereference_protected(sk->sk_filter,
                                           lockdep_sock_is_held(sk));
        if (!filter) {
                ret = 0;
                goto unlock;
        }

        /*
         * Only the program as it was originally attached is handed back, so
         * nothing needs converting or decoding.  Programs without a saved
         * original (eBPF) cannot be dumped this way.
         */
        orig = filter->prog->orig_prog;
        if (!orig) {
                ret = -EACCES;
                goto unlock;
        }

        /* The API reports filter blocks, not bytes. */
        ret = orig->len;

        /* A zero length is a pure "how many blocks?" query. */
        if (!len)
                goto unlock;

        if (len < orig->len) {
                ret = -EINVAL;
                goto unlock;
        }

        if (copy_to_sockptr(optval, orig->filter, bpf_classic_proglen(orig)))
                ret = -EFAULT;

unlock:
        sockopt_release_sock(sk);
        return ret;
}

#ifdef CONFIG_INET
/*
 * Fill in @reuse_kern, the context handed to a reuseport BPF program.
 *
 * The packet (@skb), the sockets (@sk, @migrating_sk) and @hash are stored
 * as given; data_end is set to the end of the skb's linear data
 * (skb->data + skb_headlen()); reuseport_id and bind_inany are copied from
 * @reuse; selected_sk starts out as NULL.
 */
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
                                    struct sock_reuseport *reuse,
                                    struct sock *sk, struct sk_buff *skb,
                                    struct sock *migrating_sk,
                                    u32 hash)
{
        /* Nothing has been selected yet. */
        reuse_kern->selected_sk = NULL;

        /* Packet view. */
        reuse_kern->skb = skb;
        reuse_kern->data_end = skb->data + skb_headlen(skb);
        reuse_kern->hash = hash;

        /* Sockets involved. */
        reuse_kern->sk = sk;
        reuse_kern->migrating_sk = migrating_sk;

        /* Properties of the reuseport group. */
        reuse_kern->reuseport_id = reuse->reuseport_id;
        reuse_kern->bind_inany = reuse->bind_inany;
}

/*
 * Run reuseport program @prog against a freshly initialised context.
 *
 * Returns the context's selected_sk (NULL unless the program set it) when
 * the program's verdict is SK_PASS, and ERR_PTR(-ECONNREFUSED) for any
 * other verdict.
 */
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  struct sock *migrating_sk,
                                  u32 hash)
{
        struct sk_reuseport_kern ctx;
        enum sk_action verdict;

        bpf_init_reuseport_kern(&ctx, reuse, sk, skb, migrating_sk, hash);

        verdict = bpf_prog_run(prog, &ctx);
        if (verdict != SK_PASS)
                return ERR_PTR(-ECONNREFUSED);

        return ctx.selected_sk;
}

/*
 * BPF helper: look up @key in @map and record the socket found there as
 * the selection in @reuse_kern.  @flags is not examined.
 *
 * Returns 0 on success, or:
 *   -ENOENT        no entry for @key, or a reuseport sockarray entry whose
 *                  socket no longer has a reuseport group;
 *   -EINVAL        socket from another map type without a reuseport group;
 *   -EPROTOTYPE    socket belongs to another group and its protocol differs;
 *   -EAFNOSUPPORT  socket belongs to another group and its family differs;
 *   -EBADFD        socket belongs to another group for any other reason.
 *
 * On the error paths taken after a successful lookup, the looked-up socket
 * is released with sock_put() if sk_is_refcounted() says it holds a
 * reference.
 */
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
           struct bpf_map *, map, void *, key, u32, flags)
{
        struct sock_reuseport *group;
        struct sock *cand;
        int err;

        cand = map->ops->map_lookup_elem(map, key);
        if (!cand)
                return -ENOENT;

        group = rcu_dereference(cand->sk_reuseport_cb);
        if (!group) {
                /* reuseport_array has only sk with non NULL sk_reuseport_cb.
                 * The only (!group) case here is - the sk has already been
                 * unhashed (e.g. by close()), so treat it as -ENOENT.
                 *
                 * Other maps (e.g. sock_map) do not provide this guarantee and
                 * the sk may never be in the reuseport group to begin with.
                 */
                if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY)
                        err = -ENOENT;
                else
                        err = -EINVAL;
                goto drop_ref;
        }

        if (unlikely(group->reuseport_id != reuse_kern->reuseport_id)) {
                struct sock *self = reuse_kern->sk;

                /* Report the most specific reason for the group mismatch. */
                if (self->sk_protocol != cand->sk_protocol)
                        err = -EPROTOTYPE;
                else if (self->sk_family != cand->sk_family)
                        err = -EAFNOSUPPORT;
                else
                        /* Catch all. Likely bound to a different sockaddr. */
                        err = -EBADFD;
                goto drop_ref;
        }

        reuse_kern->selected_sk = cand;
        return 0;

drop_ref:
        /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
        if (sk_is_refcounted(cand))
                sock_put(cand);

        return err;
}

/* Verifier signature for sk_select_reuseport(): (ctx, const map pointer,
 * map key, flags) -> integer. Not restricted to GPL programs.
 */
static const struct bpf_func_proto sk_select_reuseport_proto = {
        .func = sk_select_reuseport,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_CONST_MAP_PTR,
        .arg3_type = ARG_PTR_TO_MAP_KEY,
        .arg4_type = ARG_ANYTHING,
};

/* skb_load_bytes variant for reuseport programs: forwards to
 * ____bpf_skb_load_bytes() using the skb stored in the reuseport context.
 */
BPF_CALL_4(sk_reuseport_load_bytes,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len)
{
        struct sk_buff *skb = reuse_kern->skb;

        return ____bpf_skb_load_bytes(skb, offset, to, len);
}

/* Verifier signature for sk_reuseport_load_bytes(): (ctx, offset,
 * uninitialized destination buffer, buffer size) -> integer.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_proto = {
        .func = sk_reuseport_load_bytes,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type = ARG_CONST_SIZE,
};

/* skb_load_bytes_relative variant for reuseport programs: forwards to
 * ____bpf_skb_load_bytes_relative() using the skb stored in the reuseport
 * context.
 */
BPF_CALL_5(sk_reuseport_load_bytes_relative,
           const struct sk_reuseport_kern *, reuse_kern, u32, offset,
           void *, to, u32, len, u32, start_header)
{
        struct sk_buff *skb = reuse_kern->skb;

        return ____bpf_skb_load_bytes_relative(skb, offset, to, len,
                                               start_header);
}

/* Verifier signature for sk_reuseport_load_bytes_relative(): (ctx, offset,
 * uninitialized destination buffer, buffer size, start_header) -> integer.
 */
static const struct bpf_func_proto sk_reuseport_load_bytes_relative_proto = {
        .func = sk_reuseport_load_bytes_relative,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_ANYTHING,
        .arg3_type = ARG_PTR_TO_UNINIT_MEM,
        .arg4_type = ARG_CONST_SIZE,
        .arg5_type = ARG_ANYTHING,
};

/* Map a helper id to the prototype exposed to reuseport programs. Ids that are
 * not handled here are resolved by bpf_base_func_proto().
 */
static const struct bpf_func_proto *
sk_reuseport_func_proto(enum bpf_func_id func_id,
                        const struct bpf_prog *prog)
{
        const struct bpf_func_proto *proto;

        switch (func_id) {
        case BPF_FUNC_sk_select_reuseport:
                proto = &sk_select_reuseport_proto;
                break;
        case BPF_FUNC_skb_load_bytes:
                proto = &sk_reuseport_load_bytes_proto;
                break;
        case BPF_FUNC_skb_load_bytes_relative:
                proto = &sk_reuseport_load_bytes_relative_proto;
                break;
        case BPF_FUNC_get_socket_cookie:
                proto = &bpf_get_socket_ptr_cookie_proto;
                break;
        case BPF_FUNC_ktime_get_coarse_ns:
                proto = &bpf_ktime_get_coarse_ns_proto;
                break;
        default:
                proto = bpf_base_func_proto(func_id, prog);
                break;
        }

        return proto;
}

/* Decide whether a reuseport program may access struct sk_reuseport_md at
 * byte offset @off with an access of @size bytes.
 *
 * Only reads are permitted. Pointer-like fields (data, data_end, sk,
 * migrating_sk) must be read as a full __u64 and additionally set
 * info->reg_type; hash must be read as exactly 4 bytes; the remaining fields
 * accept narrow loads relative to a 4-byte default.
 *
 * @prog is not used. Returns true when the access is allowed.
 */
static bool
sk_reuseport_is_valid_access(int off, int size,
                             enum bpf_access_type type,
                             const struct bpf_prog *prog,
                             struct bpf_insn_access_aux *info)
{
        const u32 size_default = sizeof(__u32);

        /* Reject out-of-bounds offsets, offsets that are not a multiple of
         * the access size, and anything that is not a read.
         */
        if (off < 0 || off >= sizeof(struct sk_reuseport_md) ||
            off % size || type != BPF_READ)
                return false;

        switch (off) {
        case offsetof(struct sk_reuseport_md, data):
                info->reg_type = PTR_TO_PACKET;
                return size == sizeof(__u64);

        case offsetof(struct sk_reuseport_md, data_end):
                info->reg_type = PTR_TO_PACKET_END;
                return size == sizeof(__u64);

        case offsetof(struct sk_reuseport_md, hash):
                /* No narrowing: the full 4-byte value only. */
                return size == size_default;

        case offsetof(struct sk_reuseport_md, sk):
                info->reg_type = PTR_TO_SOCKET;
                return size == sizeof(__u64);

        case offsetof(struct sk_reuseport_md, migrating_sk):
                info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
                return size == sizeof(__u64);

        /* Fields that allow narrowing */
        case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
                /* The access must be at least as wide as skb->protocol. */
                if (size < sizeof_field(struct sk_buff, protocol))
                        return false;
                fallthrough;
        case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
        case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
        case bpf_ctx_range(struct sk_reuseport_md, len):
                bpf_ctx_record_field_size(info, size_default);
                return bpf_ctx_narrow_access_ok(off, size, size_default);

        default:
                return false;
        }
}

/* Emit one BPF_LDX_MEM that loads member F of struct sk_reuseport_kern from
 * si->src_reg into si->dst_reg, sized to the member. The expansion relies on
 * `insn`, `si` and `target_size` being defined in the calling scope.
 */
#define SK_REUSEPORT_LOAD_FIELD(F) ({                                        \
        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
                              si->dst_reg, si->src_reg,                        \
                              bpf_target_off(struct sk_reuseport_kern, F, \
                                             sizeof_field(struct sk_reuseport_kern, F), \
                                             target_size));                \
        })

/* Load SKB_FIELD of the struct sk_buff reached through
 * sk_reuseport_kern->skb, via SOCK_ADDR_LOAD_NESTED_FIELD (defined outside
 * this section).
 */
#define SK_REUSEPORT_LOAD_SKB_FIELD(SKB_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sk_buff,                        \
                                    skb,                                \
                                    SKB_FIELD)

/* Load SK_FIELD of the struct sock reached through sk_reuseport_kern->sk,
 * via SOCK_ADDR_LOAD_NESTED_FIELD (defined outside this section).
 */
#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD)                                \
        SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern,                \
                                    struct sock,                        \
                                    sk,                                        \
                                    SK_FIELD)

/* Rewrite a load from struct sk_reuseport_md (selected by si->off) into
 * instruction(s) that read the backing data from struct sk_reuseport_kern.
 *
 * data, len and eth_protocol are taken from the skb; ip_protocol from
 * sk->sk_protocol; data_end, hash, bind_inany, sk and migrating_sk are read
 * directly from sk_reuseport_kern. Offsets not listed emit nothing.
 *
 * @type and @prog are not used. Returns the number of instructions written to
 * @insn_buf.
 */
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
                                           const struct bpf_insn *si,
                                           struct bpf_insn *insn_buf,
                                           struct bpf_prog *prog,
                                           u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct sk_reuseport_md, data):
                SK_REUSEPORT_LOAD_SKB_FIELD(data);
                break;

        case offsetof(struct sk_reuseport_md, len):
                SK_REUSEPORT_LOAD_SKB_FIELD(len);
                break;

        case offsetof(struct sk_reuseport_md, eth_protocol):
                SK_REUSEPORT_LOAD_SKB_FIELD(protocol);
                break;

        case offsetof(struct sk_reuseport_md, ip_protocol):
                SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
                break;

        case offsetof(struct sk_reuseport_md, data_end):
                SK_REUSEPORT_LOAD_FIELD(data_end);
                break;

        case offsetof(struct sk_reuseport_md, hash):
                SK_REUSEPORT_LOAD_FIELD(hash);
                break;

        case offsetof(struct sk_reuseport_md, bind_inany):
                SK_REUSEPORT_LOAD_FIELD(bind_inany);
                break;

        case offsetof(struct sk_reuseport_md, sk):
                SK_REUSEPORT_LOAD_FIELD(sk);
                break;

        case offsetof(struct sk_reuseport_md, migrating_sk):
                SK_REUSEPORT_LOAD_FIELD(migrating_sk);
                break;
        }

        /* Number of instructions emitted by the macros above. */
        return insn - insn_buf;
}

/* Verifier callbacks for reuseport programs: helper lookup, context access
 * validation and context access rewriting.
 */
const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
        .get_func_proto = sk_reuseport_func_proto,
        .is_valid_access = sk_reuseport_is_valid_access,
        .convert_ctx_access = sk_reuseport_convert_ctx_access,
};

/* Reuseport programs define no program ops; every callback is left unset. */
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};

/* Static key, initially false, exported for use outside this file.
 * NOTE(review): presumably flipped when an sk_lookup program is attached;
 * the code that toggles it is not in this section.
 */
DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
EXPORT_SYMBOL(bpf_sk_lookup_enabled);

/* Select @sk (which may be NULL) as the result of the socket lookup held in
 * @ctx.
 *
 * Returns 0 on success, otherwise a negative errno:
 *   -EINVAL          unknown bits set in @flags
 *   -ESOCKTNOSUPPORT @sk is refcounted, a TCP socket not in LISTEN, or a UDP
 *                    socket not in CLOSE
 *   -EPROTOTYPE      @sk protocol differs from the packet protocol
 *   -EAFNOSUPPORT    @sk family cannot serve the packet family
 *   -EEXIST          a socket is already selected and
 *                    BPF_SK_LOOKUP_F_REPLACE was not passed
 */
BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
           struct sock *, sk, u64, flags)
{
        const u64 known_flags = BPF_SK_LOOKUP_F_REPLACE |
                                BPF_SK_LOOKUP_F_NO_REUSEPORT;

        if (unlikely(flags & ~known_flags))
                return -EINVAL;

        if (sk) {
                /* reject non-RCU freed sockets */
                if (unlikely(sk_is_refcounted(sk)))
                        return -ESOCKTNOSUPPORT;
                /* only accept TCP socket in LISTEN */
                if (unlikely(sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
                        return -ESOCKTNOSUPPORT;
                /* only accept UDP socket in CLOSE */
                if (unlikely(sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
                        return -ESOCKTNOSUPPORT;

                /* Check if socket is suitable for packet L3/L4 protocol */
                if (sk->sk_protocol != ctx->protocol)
                        return -EPROTOTYPE;
                if (sk->sk_family != ctx->family &&
                    (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
                        return -EAFNOSUPPORT;
        }

        /* An earlier selection may only be overridden on explicit request. */
        if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
                return -EEXIST;

        /* Select socket as lookup result */
        ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
        ctx->selected_sk = sk;

        return 0;
}

/* Verifier signature for bpf_sk_lookup_assign(): (ctx, socket or NULL, flags)
 * -> integer.
 */
static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
        .func = bpf_sk_lookup_assign,
        .gpl_only = false,
        .ret_type = RET_INTEGER,
        .arg1_type = ARG_PTR_TO_CTX,
        .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
        .arg3_type = ARG_ANYTHING,
};

/* Map a helper id to the prototype exposed to sk_lookup programs. Ids that are
 * not handled here are resolved by bpf_sk_base_func_proto().
 */
static const struct bpf_func_proto *
sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        if (func_id == BPF_FUNC_perf_event_output)
                return &bpf_event_output_data_proto;

        if (func_id == BPF_FUNC_sk_assign)
                return &bpf_sk_lookup_assign_proto;

        if (func_id == BPF_FUNC_sk_release)
                return &bpf_sk_release_proto;

        return bpf_sk_base_func_proto(func_id, prog);
}

/* Decide whether an sk_lookup program may access struct bpf_sk_lookup at byte
 * offset @off with an access of @size bytes.
 *
 * Only reads at offsets that are a multiple of the access size are permitted.
 * The sk field must be read as a full __u64 and is typed
 * PTR_TO_SOCKET_OR_NULL; the other listed fields accept narrow loads.
 *
 * @prog is not used. Returns true when the access is allowed.
 */
static bool sk_lookup_is_valid_access(int off, int size,
                                      enum bpf_access_type type,
                                      const struct bpf_prog *prog,
                                      struct bpf_insn_access_aux *info)
{
        if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
                return false;
        if (off % size != 0)
                return false;
        if (type != BPF_READ)
                return false;

        switch (off) {
        case offsetof(struct bpf_sk_lookup, sk):
                info->reg_type = PTR_TO_SOCKET_OR_NULL;
                return size == sizeof(__u64);

        /* Fields exposed as 4 bytes wide; narrower loads are accepted. */
        case bpf_ctx_range(struct bpf_sk_lookup, family):
        case bpf_ctx_range(struct bpf_sk_lookup, protocol):
        case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
        case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
        case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
        case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
        case bpf_ctx_range(struct bpf_sk_lookup, local_port):
        case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
                bpf_ctx_record_field_size(info, sizeof(__u32));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));

        case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
                /* Allow 4-byte access to 2-byte field for backward compatibility */
                if (size == sizeof(__u32))
                        return true;
                bpf_ctx_record_field_size(info, sizeof(__be16));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));

        /* Bytes between the end of remote_port and the start of local_ip4. */
        case offsetofend(struct bpf_sk_lookup, remote_port) ...
             offsetof(struct bpf_sk_lookup, local_ip4) - 1:
                /* Allow access to zero padding for backward compatibility */
                bpf_ctx_record_field_size(info, sizeof(__u16));
                return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));

        default:
                return false;
        }
}

/* Rewrite a load from struct bpf_sk_lookup (selected by si->off) into
 * instruction(s) that read the backing data from struct bpf_sk_lookup_kern.
 *
 * Scalar fields become a single sized load. The IPv6 address words are read
 * through the v6.saddr / v6.daddr pointers with a NULL check, or replaced by a
 * constant 0 when IPv6 is not enabled. The padding right after remote_port
 * always reads as 0. Offsets not listed emit nothing.
 *
 * @type and @prog are not used. Returns the number of instructions written to
 * @insn_buf.
 */
static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
                                        const struct bpf_insn *si,
                                        struct bpf_insn *insn_buf,
                                        struct bpf_prog *prog,
                                        u32 *target_size)
{
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct bpf_sk_lookup, sk):
                /* The context's sk field is backed by selected_sk. */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, selected_sk));
                break;

        case offsetof(struct bpf_sk_lookup, family):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     family, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, protocol):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     protocol, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, remote_ip4):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.saddr, 4, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, local_ip4):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     v4.daddr, 4, target_size));
                break;

        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                remote_ip6[0], remote_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Translate the context offset into an offset within
                 * struct in6_addr.
                 */
                off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                /* Load the v6.saddr pointer; when it is NULL, skip the
                 * dereference so dst_reg stays 0.
                 */
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.saddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* IPv6 not enabled: the field reads as 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case bpf_ctx_range_till(struct bpf_sk_lookup,
                                local_ip6[0], local_ip6[3]): {
#if IS_ENABLED(CONFIG_IPV6)
                int off = si->off;

                /* Same scheme as remote_ip6, but through v6.daddr. */
                off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
                off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
                *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
                                      offsetof(struct bpf_sk_lookup_kern, v6.daddr));
                *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
#else
                /* IPv6 not enabled: the field reads as 0. */
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
#endif
                break;
        }
        case offsetof(struct bpf_sk_lookup, remote_port):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     sport, 2, target_size));
                break;

        case offsetofend(struct bpf_sk_lookup, remote_port):
                /* Padding after remote_port: report a 2-byte target and
                 * yield a constant 0.
                 */
                *target_size = 2;
                *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
                break;

        case offsetof(struct bpf_sk_lookup, local_port):
                *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     dport, 2, target_size));
                break;

        case offsetof(struct bpf_sk_lookup, ingress_ifindex):
                *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                      bpf_target_off(struct bpf_sk_lookup_kern,
                                                     ingress_ifindex, 4, target_size));
                break;
        }

        /* Number of instructions emitted above. */
        return insn - insn_buf;
}

/* Program ops for sk_lookup programs: only a test_run hook is provided. */
const struct bpf_prog_ops sk_lookup_prog_ops = {
        .test_run = bpf_prog_test_run_sk_lookup,
};

/* Verifier callbacks for sk_lookup programs: helper lookup, context access
 * validation and context access rewriting.
 */
const struct bpf_verifier_ops sk_lookup_verifier_ops = {
        .get_func_proto = sk_lookup_func_proto,
        .is_valid_access = sk_lookup_is_valid_access,
        .convert_ctx_access = sk_lookup_convert_ctx_access,
};

#endif /* CONFIG_INET */

/* Instantiate the BPF dispatcher named "xdp"; it is referenced below through
 * BPF_DISPATCHER_PTR(xdp).
 */
DEFINE_BPF_DISPATCHER(xdp)

/* Tell the xdp dispatcher that @prev_prog is being replaced by @prog by
 * forwarding both to bpf_dispatcher_change_prog().
 */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
{
        bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
}

/* Global BTF id list btf_sock_ids, sized MAX_BTF_SOCK_TYPE. BTF_SOCK_TYPE is
 * defined temporarily so that BTF_SOCK_TYPE_xxx expands to one
 * BTF_ID(struct, type) per entry; the `name` argument is ignored here.
 */
BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
BTF_SOCK_TYPE_xxx
#undef BTF_SOCK_TYPE

/* Return @sk (as unsigned long) when it is a full socket speaking TCP over
 * AF_INET6, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
{
        /* tcp6_sock type is not generated in dwarf and hence btf,
         * trigger an explicit type generation here.
         */
        BTF_TYPE_EMIT(struct tcp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP || sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Verifier signature for bpf_skc_to_tcp6_sock(): takes a sock_common BTF
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_TCP6, or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
        .func = bpf_skc_to_tcp6_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
};

/* Return @sk (as unsigned long) when it is a full socket speaking TCP,
 * otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
{
        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_TCP)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Verifier signature for bpf_skc_to_tcp_sock(): takes a sock_common BTF
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_TCP, or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
        .func = bpf_skc_to_tcp_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
};

/* Return @sk (as unsigned long) when it uses tcp_prot (CONFIG_INET) or
 * tcpv6_prot (built-in IPv6) and is in TCP_TIME_WAIT, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
{
        /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
         * generated if CONFIG_INET=n. Trigger an explicit generation here.
         */
        BTF_TYPE_EMIT(struct inet_timewait_sock);
        BTF_TYPE_EMIT(struct tcp_timewait_sock);

        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/* Verifier signature for bpf_skc_to_tcp_timewait_sock(): takes a sock_common
 * BTF pointer and returns a BTF pointer of type BTF_SOCK_TYPE_TCP_TW, or NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
        .func = bpf_skc_to_tcp_timewait_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
};

/* Return @sk (as unsigned long) when it uses tcp_prot (CONFIG_INET) or
 * tcpv6_prot (built-in IPv6) and is in TCP_NEW_SYN_RECV, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
{
        if (!sk)
                return (unsigned long)NULL;

#ifdef CONFIG_INET
        if (sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

#if IS_BUILTIN(CONFIG_IPV6)
        if (sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
                return (unsigned long)sk;
#endif

        return (unsigned long)NULL;
}

/* Verifier signature for bpf_skc_to_tcp_request_sock(): takes a sock_common
 * BTF pointer and returns a BTF pointer of type BTF_SOCK_TYPE_TCP_REQ, or
 * NULL.
 */
const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
        .func = bpf_skc_to_tcp_request_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
};

/* Return @sk (as unsigned long) when it is a full SOCK_DGRAM socket speaking
 * UDP over AF_INET6, otherwise NULL.
 */
BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
{
        /* udp6_sock type is not generated in dwarf and hence btf,
         * trigger an explicit type generation here.
         */
        BTF_TYPE_EMIT(struct udp6_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_protocol != IPPROTO_UDP || sk->sk_type != SOCK_DGRAM)
                return (unsigned long)NULL;

        if (sk->sk_family != AF_INET6)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Verifier signature for bpf_skc_to_udp6_sock(): takes a sock_common BTF
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_UDP6, or NULL.
 */
const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
        .func = bpf_skc_to_udp6_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
};

/* Return @sk (as unsigned long) when it is a full AF_UNIX socket, otherwise
 * NULL.
 */
BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
{
        /* unix_sock type is not generated in dwarf and hence btf,
         * trigger an explicit type generation here.
         */
        BTF_TYPE_EMIT(struct unix_sock);

        if (!sk || !sk_fullsock(sk))
                return (unsigned long)NULL;

        if (sk->sk_family != AF_UNIX)
                return (unsigned long)NULL;

        return (unsigned long)sk;
}

/* Verifier signature for bpf_skc_to_unix_sock(): takes a sock_common BTF
 * pointer and returns a BTF pointer of type BTF_SOCK_TYPE_UNIX, or NULL.
 */
const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
        .func = bpf_skc_to_unix_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
};

/* Return, as unsigned long, whatever bpf_mptcp_sock_from_subflow() yields for
 * @sk. No checks are performed here; validation, if any, is left to that
 * helper.
 */
BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
{
        /* NOTE(review): presumably forces BTF generation for mptcp_sock, as
         * the sibling bpf_skc_to_*() helpers do for their types - confirm.
         */
        BTF_TYPE_EMIT(struct mptcp_sock);
        return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
}

/* Verifier signature for bpf_skc_to_mptcp_sock(): takes a sock_common pointer
 * (ARG_PTR_TO_SOCK_COMMON, unlike the BTF-id argument of the sibling
 * helpers) and returns a BTF pointer of type BTF_SOCK_TYPE_MPTCP, or NULL.
 */
const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
        .func = bpf_skc_to_mptcp_sock,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .arg1_type = ARG_PTR_TO_SOCK_COMMON,
        .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
};

/* Return, as unsigned long, the result of sock_from_file() for @file. */
BPF_CALL_1(bpf_sock_from_file, struct file *, file)
{
        return (unsigned long)sock_from_file(file);
}

/* BTF ids used by the bpf_sock_from_file prototype:
 * entry 0 is struct socket, entry 1 is struct file.
 */
BTF_ID_LIST(bpf_sock_from_file_btf_ids)
BTF_ID(struct, socket)
BTF_ID(struct, file)

/* Verifier signature for bpf_sock_from_file(): the argument is a BTF pointer
 * identified by bpf_sock_from_file_btf_ids[1]; the return value is a BTF
 * pointer identified by bpf_sock_from_file_btf_ids[0], or NULL.
 */
const struct bpf_func_proto bpf_sock_from_file_proto = {
        .func = bpf_sock_from_file,
        .gpl_only = false,
        .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
        .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
        .arg1_type = ARG_PTR_TO_BTF_ID,
        .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
};

/* Resolve helper ids shared by socket-related program types.
 *
 * The bpf_skc_to_*() cast helpers are handed out only when the program's token
 * is capable of CAP_PERFMON; otherwise NULL is returned for them.
 * bpf_ktime_get_coarse_ns is returned unconditionally, and every other id is
 * resolved by bpf_base_func_proto().
 */
static const struct bpf_func_proto *
bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
        const struct bpf_func_proto *cast_proto;

        switch (func_id) {
        case BPF_FUNC_ktime_get_coarse_ns:
                return &bpf_ktime_get_coarse_ns_proto;
        case BPF_FUNC_skc_to_tcp6_sock:
                cast_proto = &bpf_skc_to_tcp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_sock:
                cast_proto = &bpf_skc_to_tcp_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_timewait_sock:
                cast_proto = &bpf_skc_to_tcp_timewait_sock_proto;
                break;
        case BPF_FUNC_skc_to_tcp_request_sock:
                cast_proto = &bpf_skc_to_tcp_request_sock_proto;
                break;
        case BPF_FUNC_skc_to_udp6_sock:
                cast_proto = &bpf_skc_to_udp6_sock_proto;
                break;
        case BPF_FUNC_skc_to_unix_sock:
                cast_proto = &bpf_skc_to_unix_sock_proto;
                break;
        case BPF_FUNC_skc_to_mptcp_sock:
                cast_proto = &bpf_skc_to_mptcp_sock_proto;
                break;
        default:
                return bpf_base_func_proto(func_id, prog);
        }

        /* Only the cast helpers reach this point. */
        return bpf_token_capable(prog->aux->token, CAP_PERFMON) ?
               cast_proto : NULL;
}

__bpf_kfunc_start_defs();
/* Initialise @ptr__uninit as an skb dynptr over @s, starting at offset 0 and
 * sized skb->len.
 *
 * @flags must be 0; otherwise the dynptr is set to null and -EINVAL is
 * returned. Returns 0 on success.
 */
__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
                                    struct bpf_dynptr *ptr__uninit)
{
        struct bpf_dynptr_kern *dynptr = (struct bpf_dynptr_kern *)ptr__uninit;
        struct sk_buff *skb = (struct sk_buff *)s;

        if (!flags) {
                bpf_dynptr_init(dynptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
                return 0;
        }

        bpf_dynptr_set_null(dynptr);
        return -EINVAL;
}

/* Initialise @ptr__uninit as an xdp dynptr over @x, starting at offset 0 and
 * sized xdp_get_buff_len().
 *
 * @flags must be 0; otherwise the dynptr is set to null and -EINVAL is
 * returned. Returns 0 on success.
 */
__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
                                    struct bpf_dynptr *ptr__uninit)
{
        struct bpf_dynptr_kern *dynptr = (struct bpf_dynptr_kern *)ptr__uninit;
        struct xdp_buff *xdp = (struct xdp_buff *)x;

        if (!flags) {
                bpf_dynptr_init(dynptr, xdp, BPF_DYNPTR_TYPE_XDP, 0,
                                xdp_get_buff_len(xdp));
                return 0;
        }

        bpf_dynptr_set_null(dynptr);
        return -EINVAL;
}

/* Overwrite the sun_path of the unix socket address held in @sa_kern with
 * @sun_path__sz bytes from @sun_path and update the stored address length.
 *
 * Returns 0 on success, or -EINVAL when the socket is not AF_UNIX or the size
 * is 0 or larger than UNIX_PATH_MAX.
 */
__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
                                           const u8 *sun_path, u32 sun_path__sz)
{
        struct sockaddr_un *addr;

        if (sa_kern->sk->sk_family != AF_UNIX)
                return -EINVAL;

        /* We do not allow changing the address to unnamed or larger than the
         * maximum allowed address size for a unix sockaddr.
         */
        if (!sun_path__sz)
                return -EINVAL;
        if (sun_path__sz > UNIX_PATH_MAX)
                return -EINVAL;

        addr = (struct sockaddr_un *)sa_kern->uaddr;
        memcpy(addr->sun_path, sun_path, sun_path__sz);

        /* The new length covers the sockaddr_un header plus the copied path. */
        sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;

        return 0;
}

/* Build a request socket for listener @sk from the TCP options described in
 * @attrs and attach it to the skb @s.
 *
 * Returns 0 on success, otherwise a negative errno:
 *   -EINVAL      bad @attrs__sz, non-zero reserved fields, skb not at tc
 *                ingress, unsupported skb protocol, @sk not a non-MPTCP
 *                SOCK_STREAM listener, mss below the per-family minimum, or a
 *                requested option that is out of range or disabled by sysctl
 *   -ENETUNREACH the skb's device and @sk are in different network namespaces
 *   -ENOMEM      request socket allocation failed
 *   -EOPNOTSUPP  built without CONFIG_SYN_COOKIES
 */
__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
                                        struct bpf_tcp_req_attrs *attrs, int attrs__sz)
{
#if IS_ENABLED(CONFIG_SYN_COOKIES)
        struct sk_buff *skb = (struct sk_buff *)s;
        const struct request_sock_ops *ops;
        struct inet_request_sock *ireq;
        struct tcp_request_sock *treq;
        struct request_sock *req;
        struct net *net;
        __u16 min_mss;
        u32 tsoff = 0;

        /* The caller must pass exactly this struct layout with all reserved
         * fields zeroed.
         */
        if (attrs__sz != sizeof(*attrs) ||
            attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
                return -EINVAL;

        if (!skb_at_tc_ingress(skb))
                return -EINVAL;

        net = dev_net(skb->dev);
        if (net != sock_net(sk))
                return -ENETUNREACH;

        /* Pick the request sock ops and the minimum acceptable MSS by L3
         * protocol. IPv6 is handled only when built in.
         */
        switch (skb->protocol) {
        case htons(ETH_P_IP):
                ops = &tcp_request_sock_ops;
                min_mss = 536;
                break;
#if IS_BUILTIN(CONFIG_IPV6)
        case htons(ETH_P_IPV6):
                ops = &tcp6_request_sock_ops;
                /* NOTE(review): 60 is presumably the 40-byte IPv6 header plus
                 * the 20-byte TCP header - confirm.
                 */
                min_mss = IPV6_MIN_MTU - 60;
                break;
#endif
        default:
                return -EINVAL;
        }

        if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
            sk_is_mptcp(sk))
                return -EINVAL;

        if (attrs->mss < min_mss)
                return -EINVAL;

        /* Each requested option must also be enabled in this netns. */
        if (attrs->wscale_ok) {
                if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
                        return -EINVAL;

                if (attrs->snd_wscale > TCP_MAX_WSCALE ||
                    attrs->rcv_wscale > TCP_MAX_WSCALE)
                        return -EINVAL;
        }

        if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
                return -EINVAL;

        if (attrs->tstamp_ok) {
                if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
                        return -EINVAL;

                /* Offset between the echoed timestamp and the current TCP
                 * clock, in the unit selected by usec_ts_ok.
                 */
                tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
        }

        req = inet_reqsk_alloc(ops, sk, false);
        if (!req)
                return -ENOMEM;

        ireq = inet_rsk(req);
        treq = tcp_rsk(req);

        /* Populate the request sock from the validated attributes. */
        req->rsk_listener = sk;
        req->syncookie = 1;
        req->mss = attrs->mss;
        req->ts_recent = attrs->rcv_tsval;

        ireq->snd_wscale = attrs->snd_wscale;
        ireq->rcv_wscale = attrs->rcv_wscale;
        ireq->tstamp_ok        = !!attrs->tstamp_ok;
        ireq->sack_ok = !!attrs->sack_ok;
        ireq->wscale_ok = !!attrs->wscale_ok;
        ireq->ecn_ok = !!attrs->ecn_ok;

        treq->req_usec_ts = !!attrs->usec_ts_ok;
        treq->ts_off = tsoff;

        /* Detach any previous owner, then hand the request sock to the skb
         * with sock_pfree as its destructor.
         */
        skb_orphan(skb);
        skb->sk = req_to_sk(req);
        skb->destructor = sock_pfree;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}

/* Mark the skb carried by @skops for BPF tx timestamping.
 *
 * Only valid from the BPF_SOCK_OPS_TSTAMP_SENDMSG_CB callback (-EOPNOTSUPP
 * otherwise); @flags must be 0 (-EINVAL otherwise). Returns 0 on success.
 */
__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
                                              u64 flags)
{
        struct skb_shared_info *shinfo;
        struct tcp_skb_cb *tcb;
        struct sk_buff *skb;

        if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
                return -EOPNOTSUPP;

        if (flags)
                return -EINVAL;

        skb = skops->skb;
        shinfo = skb_shinfo(skb);
        tcb = TCP_SKB_CB(skb);

        shinfo->tx_flags |= SKBTX_BPF;
        tcb->txstamp_ack |= TSTAMP_ACK_BPF;
        /* The timestamp key is the sequence number of the skb's last byte. */
        shinfo->tskey = tcb->seq + skb->len - 1;

        return 0;
}

__bpf_kfunc_end_defs();

/* Same as bpf_dynptr_from_skb(), but on success the resulting dynptr is also
 * marked read-only. Returns that function's error code unchanged on failure.
 */
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
                               struct bpf_dynptr *ptr__uninit)
{
        int ret = bpf_dynptr_from_skb(skb, flags, ptr__uninit);

        if (!ret)
                bpf_dynptr_set_rdonly((struct bpf_dynptr_kern *)ptr__uninit);

        return ret;
}

/* kfunc id sets; each one is wrapped in a btf_kfunc_id_set below and
 * registered per program type in bpf_kfunc_init().
 */

/* skb dynptr kfunc, flagged KF_TRUSTED_ARGS. */
BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)

/* xdp dynptr kfunc, registered without extra flags. */
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)

/* Unix socket address rewriting kfunc, registered without extra flags. */
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)

/* TCP request sock assignment kfunc, flagged KF_TRUSTED_ARGS. */
BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)

/* sock_ops tx timestamp kfunc, flagged KF_TRUSTED_ARGS. */
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)

/* Registration descriptors: each pairs one of the BTF kfunc id sets above
 * with THIS_MODULE as owner. They are passed to register_btf_kfunc_id_set()
 * in bpf_kfunc_init().
 */
static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_skb,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_xdp,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_sock_addr,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_tcp_reqsk,
};

static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
        .owner = THIS_MODULE,
        .set = &bpf_kfunc_check_set_sock_ops,
};

/* Late initcall that attaches the networking kfunc sets to their program
 * types. Registrations run in a fixed order and stop at the first failure,
 * whose error code is returned; 0 means every registration succeeded.
 */
static int __init bpf_kfunc_init(void)
{
        int err;

        /* skb dynptr kfuncs: one registration per skb-based program type. */
        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
        if (err)
                return err;

        /* Remaining sets each belong to a single program type. */
        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
                                        &bpf_kfunc_set_sock_addr);
        if (err)
                return err;

        err = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
        if (err)
                return err;

        return register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
}
late_initcall(bpf_kfunc_init);

__bpf_kfunc_start_defs();

/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
 *
 * The function expects a non-NULL pointer to a socket, and invokes the
 * protocol specific socket destroy handler (sk_prot->diag_destroy).
 *
 * The helper can only be called from BPF contexts that have acquired the socket
 * locks.
 *
 * Parameters:
 * @sock: Pointer to socket to be destroyed
 *
 * Return:
 * -EOPNOTSUPP if the protocol has no diag_destroy handler, or if the socket
 * is neither TCP nor UDP.
 * Otherwise the value returned by the protocol's diag_destroy handler.
 */
__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
{
        struct sock *sk = (struct sock *)sock;

        if (!sk->sk_prot->diag_destroy)
                return -EOPNOTSUPP;

        /* The locking semantics that allow the destroy handlers to run
         * synchronously are only supported for TCP and UDP. Protocols that
         * gain support will need to take the sock lock in the BPF context
         * before invoking this kfunc.
         */
        if (sk->sk_protocol != IPPROTO_TCP && sk->sk_protocol != IPPROTO_UDP)
                return -EOPNOTSUPP;

        return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
}

__bpf_kfunc_end_defs();

/* kfuncs that tracing programs may only use when attached as iterators;
 * the restriction is enforced by tracing_iter_filter().
 */
BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)

/* kfunc filter for the socket-iterator set.
 *
 * Returns -EACCES when @kfunc_id belongs to bpf_sk_iter_kfunc_ids and @prog
 * is not attached as BPF_TRACE_ITER; returns 0 in every other case.
 */
static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
{
        /* kfuncs outside the iterator set are not restricted here. */
        if (!btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id))
                return 0;

        if (prog->expected_attach_type == BPF_TRACE_ITER)
                return 0;

        return -EACCES;
}

/* Registration descriptor for the iterator-only kfuncs; .filter rejects
 * programs whose expected attach type is not BPF_TRACE_ITER.
 */
static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &bpf_sk_iter_kfunc_ids,
        .filter = tracing_iter_filter,
};

static int init_subsystem(void)
{
        return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
}
late_initcall(init_subsystem);



















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_NULLS_H
#define _LINUX_LIST_NULLS_H

#include <linux/poison.h>
#include <linux/const.h>

/*
 * Special version of lists, where end of list is not a NULL pointer,
 * but a 'nulls' marker, which can have many different values.
 * (up to 2^31 different values guaranteed on all platforms)
 *
 * In the standard hlist, termination of a list is the NULL pointer.
 * In this special 'nulls' variant, we use the fact that objects stored in
 * a list are aligned on a word (4 or 8 bytes alignment).
 * We therefore use the least significant bit of 'ptr' :
 * Set to 1 : This is a 'nulls' end-of-list marker (ptr >> 1)
 * Set to 0 : This is a pointer to some object (ptr)
 */

/* List head: @first is either a pointer to the first node or, when the list
 * is empty, a 'nulls' marker (low bit set).
 */
struct hlist_nulls_head {
        struct hlist_nulls_node *first;
};

/* List node: @next is the following node or the terminating 'nulls' marker;
 * @pprev points at whichever pointer refers to this node (the head's ->first
 * or the previous node's ->next).
 */
struct hlist_nulls_node {
        struct hlist_nulls_node *next, **pprev;
};
/* Encode @value as an end-of-list marker: shifted left by one, low bit set. */
#define NULLS_MARKER(value) (1UL | (((long)value) << 1))
/* Run-time initializer: make @ptr an empty list terminated by marker @nulls. */
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
        ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls))
/* Static initializer equivalent of INIT_HLIST_NULLS_HEAD(). */
#define HLIST_NULLS_HEAD_INIT(nulls) {.first = (struct hlist_nulls_node *)NULLS_MARKER(nulls)}

/* Map a node pointer back to the structure of @type that embeds it as @member. */
#define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member)

/* Like hlist_nulls_entry(), but evaluates @ptr once and yields NULL when it
 * is a 'nulls' marker rather than a real node.
 */
#define hlist_nulls_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           !is_a_nulls(____ptr) ? hlist_nulls_entry(____ptr, type, member) : NULL; \
        })
/**
 * is_a_nulls - Test if a ptr is a nulls marker
 * @ptr: ptr to be tested
 *
 * Return: 1 if the low bit of @ptr is set (end-of-list marker), 0 if it is
 * clear (pointer to a real node).
 */
static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        return raw & 1UL;
}

/**
 * get_nulls_value - Get the 'nulls' value of the end of chain
 * @ptr: end of chain
 *
 * Should be called only if is_a_nulls(ptr). The marker bit is dropped by
 * shifting the pointer value right by one.
 *
 * Return: the value that was encoded in the marker.
 */
static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        return raw >> 1;
}

/**
 * hlist_nulls_unhashed - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Not that not all removal functions will leave a node in unhashed state.
 * For example, hlist_del_init_rcu() leaves the node in unhashed state,
 * but hlist_nulls_del() does not.
 */
static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
{
        return !h->pprev;
}

/**
 * hlist_nulls_unhashed_lockless - Has node been removed and reinitialized?
 * @h: Node to be checked
 *
 * Note that not all removal functions will leave a node in unhashed state.
 * For example, hlist_del_init_rcu() leaves the node in unhashed state,
 * but hlist_nulls_del() does not.  Unlike hlist_nulls_unhashed(), this
 * function may be used locklessly: ->pprev is sampled with READ_ONCE().
 *
 * Return: non-zero if the sampled @h->pprev is NULL, zero otherwise.
 */
static inline int hlist_nulls_unhashed_lockless(const struct hlist_nulls_node *h)
{
        struct hlist_nulls_node **pprev = READ_ONCE(h->pprev);

        return !pprev;
}

/* Return non-zero when the list headed by @h is empty, i.e. when ->first
 * (sampled once with READ_ONCE()) is a 'nulls' marker rather than a node.
 */
static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
{
        const struct hlist_nulls_node *first = READ_ONCE(h->first);

        return is_a_nulls(first);
}

/* Insert @n at the front of the list headed by @h.
 *
 * @n takes over the old first element (or the nulls marker) as its ->next,
 * and the old first node, if there was one, gets its back-link redirected
 * to @n->next.
 */
static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *first = h->first;

        n->next = first;
        WRITE_ONCE(n->pprev, &h->first);
        h->first = n;
        /* A nulls marker is not a node, so it has no ->pprev to update. */
        if (!is_a_nulls(first))
                WRITE_ONCE(first->pprev, &n->next);
}

/* Unlink @n from its list without touching @n's own ->next/->pprev fields.
 *
 * The pointer that referred to @n is made to refer to @n's successor, and
 * the successor (when it is a real node) inherits @n's back-link.
 */
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node *next = n->next;
        struct hlist_nulls_node **pprev = n->pprev;

        WRITE_ONCE(*pprev, next);
        /* Skip the back-link fix-up when @n was the last real node. */
        if (!is_a_nulls(next))
                WRITE_ONCE(next->pprev, pprev);
}

/* Unlink @n and poison its ->pprev with LIST_POISON2. The node is therefore
 * not left in the "unhashed" (->pprev == NULL) state; ->next is left as is.
 */
static inline void hlist_nulls_del(struct hlist_nulls_node *n)
{
        __hlist_nulls_del(n);
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * Iteration stops when @pos becomes a 'nulls' marker; @tpos is only
 * assigned while @pos refers to a real node.
 */
#define hlist_nulls_for_each_entry(tpos, pos, head, member)                       \
        for (pos = (head)->first;                                               \
             (!is_a_nulls(pos)) &&                                               \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_nulls_for_each_entry_from - iterate over a hlist continuing from current point
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos must already be set by the caller; iteration starts at @pos itself
 * and stops when @pos becomes a 'nulls' marker.
 */
#define hlist_nulls_for_each_entry_from(tpos, pos, member)        \
        for (; (!is_a_nulls(pos)) &&                                 \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

#endif









































































   53 



















   53 


















    7 































   53 
























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_PGALLOC_H
#define __ASM_GENERIC_PGALLOC_H

#ifdef CONFIG_MMU

/* GFP masks for page-table pages: kernel tables are zeroed GFP_KERNEL
 * allocations; user tables additionally carry __GFP_ACCOUNT.
 */
#define GFP_PGTABLE_KERNEL        (GFP_KERNEL | __GFP_ZERO)
#define GFP_PGTABLE_USER        (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)

/**
 * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context (not used by this helper)
 *
 * Allocates one order-0 page table with %GFP_PGTABLE_KERNEL, with
 * %__GFP_HIGHMEM masked out.
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *__pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        struct ptdesc *ptdesc;

        ptdesc = pagetable_alloc_noprof(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, 0);
        if (ptdesc)
                return ptdesc_address(ptdesc);

        return NULL;
}
#define __pte_alloc_one_kernel(...)        alloc_hooks(__pte_alloc_one_kernel_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
/**
 * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
 * @mm: the mm_struct of the current context
 *
 * Generic default, used when the architecture does not define
 * __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL; simply forwards to
 * __pte_alloc_one_kernel_noprof().
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pte_t *pte_alloc_one_kernel_noprof(struct mm_struct *mm)
{
        return __pte_alloc_one_kernel_noprof(mm);
}
#define pte_alloc_one_kernel(...)        alloc_hooks(pte_alloc_one_kernel_noprof(__VA_ARGS__))
#endif

/**
 * pte_free_kernel - free PTE-level kernel page table memory
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pte: pointer to the memory containing the page table
 *
 * Converts @pte back to its ptdesc and releases it with pagetable_free().
 */
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
        pagetable_free(virt_to_ptdesc(pte));
}

/**
 * __pte_alloc_one - allocate memory for a PTE-level user page table
 * @mm: the mm_struct of the current context (not used by this helper)
 * @gfp: GFP flags to use for the allocation
 *
 * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
 * If the constructor fails, the freshly allocated table is freed again.
 *
 * This function is intended for architectures that need
 * anything beyond simple page allocation or must have custom GFP flags.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
{
        struct ptdesc *ptdesc = pagetable_alloc_noprof(gfp, 0);

        if (!ptdesc)
                return NULL;

        if (pagetable_pte_ctor(ptdesc))
                return ptdesc_page(ptdesc);

        /* Constructor failed: undo the allocation. */
        pagetable_free(ptdesc);
        return NULL;
}
#define __pte_alloc_one(...)        alloc_hooks(__pte_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PTE_ALLOC_ONE
/**
 * pte_alloc_one - allocate a page for PTE-level user page table
 * @mm: the mm_struct of the current context
 *
 * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor().
 * Generic default, used when the architecture does not define
 * __HAVE_ARCH_PTE_ALLOC_ONE; calls __pte_alloc_one_noprof() with
 * %GFP_PGTABLE_USER.
 *
 * Return: `struct page` referencing the ptdesc or %NULL on error
 */
static inline pgtable_t pte_alloc_one_noprof(struct mm_struct *mm)
{
        return __pte_alloc_one_noprof(mm, GFP_PGTABLE_USER);
}
#define pte_alloc_one(...)        alloc_hooks(pte_alloc_one_noprof(__VA_ARGS__))
#endif

/*
 * Should really implement gc for free page table pages. This could be
 * done with a reference count in struct page.
 */

/**
 * pte_free - free PTE-level user page table memory
 * @mm: the mm_struct of the current context (not used by this helper)
 * @pte_page: the `struct page` referencing the ptdesc
 *
 * Looks up the ptdesc behind @pte_page and hands it to pagetable_dtor_free().
 */
static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
        pagetable_dtor_free(page_ptdesc(pte_page));
}


#if CONFIG_PGTABLE_LEVELS > 2

#ifndef __HAVE_ARCH_PMD_ALLOC_ONE
/**
 * pmd_alloc_one - allocate memory for a PMD-level page table
 * @mm: the mm_struct of the current context
 * @addr: not used by this generic implementation
 *
 * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor().
 * If the constructor fails, the freshly allocated table is freed again.
 *
 * Allocations use %GFP_PGTABLE_USER in user context and
 * %GFP_PGTABLE_KERNEL in kernel context (@mm == &init_mm).
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *ptdesc = pagetable_alloc_noprof(gfp, 0);

        if (!ptdesc)
                return NULL;

        if (pagetable_pmd_ctor(ptdesc))
                return ptdesc_address(ptdesc);

        /* Constructor failed: undo the allocation. */
        pagetable_free(ptdesc);
        return NULL;
}
#define pmd_alloc_one(...)        alloc_hooks(pmd_alloc_one_noprof(__VA_ARGS__))
#endif

#ifndef __HAVE_ARCH_PMD_FREE
/* Free a PMD-level page table. @pmd must be page aligned (enforced with
 * BUG_ON()); @mm is not used by this generic implementation.
 */
static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
        struct ptdesc *pt = virt_to_ptdesc(pmd);

        BUG_ON((unsigned long)pmd & (PAGE_SIZE - 1));
        pagetable_dtor_free(pt);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

/* Allocate one order-0 PUD-level page table and run pagetable_pud_ctor().
 *
 * Uses GFP_PGTABLE_KERNEL for init_mm and GFP_PGTABLE_USER otherwise, with
 * __GFP_HIGHMEM masked out. @addr is not used here.
 *
 * Returns the table's address, or NULL if the allocation failed.
 */
static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *ptdesc;

        ptdesc = pagetable_alloc_noprof(gfp & ~__GFP_HIGHMEM, 0);
        if (!ptdesc)
                return NULL;

        pagetable_pud_ctor(ptdesc);
        return ptdesc_address(ptdesc);
}
#define __pud_alloc_one(...)        alloc_hooks(__pud_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_PUD_ALLOC_ONE
/**
 * pud_alloc_one - allocate memory for a PUD-level page table
 * @mm: the mm_struct of the current context
 * @addr: passed through unchanged to __pud_alloc_one_noprof()
 *
 * Allocate memory for a page table using %GFP_PGTABLE_USER for user context
 * and %GFP_PGTABLE_KERNEL for kernel context.
 * Generic default, used when the architecture does not define
 * __HAVE_ARCH_PUD_ALLOC_ONE.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
static inline pud_t *pud_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        return __pud_alloc_one_noprof(mm, addr);
}
#define pud_alloc_one(...)        alloc_hooks(pud_alloc_one_noprof(__VA_ARGS__))
#endif

/* Free a PUD-level page table. @pud must be page aligned (enforced with
 * BUG_ON()); @mm is not used here.
 */
static inline void __pud_free(struct mm_struct *mm, pud_t *pud)
{
        struct ptdesc *pt = virt_to_ptdesc(pud);

        BUG_ON((unsigned long)pud & (PAGE_SIZE - 1));
        pagetable_dtor_free(pt);
}

#ifndef __HAVE_ARCH_PUD_FREE
/* Generic pud_free(), used when the architecture does not define
 * __HAVE_ARCH_PUD_FREE: forwards to __pud_free().
 */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        __pud_free(mm, pud);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 3 */

#if CONFIG_PGTABLE_LEVELS > 4

/* Allocate one order-0 P4D-level page table and run pagetable_p4d_ctor().
 *
 * Uses GFP_PGTABLE_KERNEL for init_mm and GFP_PGTABLE_USER otherwise, with
 * __GFP_HIGHMEM masked out. @addr is not used here.
 *
 * Returns the table's address, or NULL if the allocation failed.
 */
static inline p4d_t *__p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *ptdesc;

        ptdesc = pagetable_alloc_noprof(gfp & ~__GFP_HIGHMEM, 0);
        if (!ptdesc)
                return NULL;

        pagetable_p4d_ctor(ptdesc);
        return ptdesc_address(ptdesc);
}
#define __p4d_alloc_one(...)        alloc_hooks(__p4d_alloc_one_noprof(__VA_ARGS__))

#ifndef __HAVE_ARCH_P4D_ALLOC_ONE
/* Generic p4d_alloc_one(), used when the architecture does not define
 * __HAVE_ARCH_P4D_ALLOC_ONE: forwards to __p4d_alloc_one_noprof().
 * Returns the table's address, or NULL if the allocation failed.
 */
static inline p4d_t *p4d_alloc_one_noprof(struct mm_struct *mm, unsigned long addr)
{
        return __p4d_alloc_one_noprof(mm, addr);
}
#define p4d_alloc_one(...)        alloc_hooks(p4d_alloc_one_noprof(__VA_ARGS__))
#endif

/* Free a P4D-level page table. @p4d must be page aligned (enforced with
 * BUG_ON()); @mm is not used here.
 */
static inline void __p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
        struct ptdesc *pt = virt_to_ptdesc(p4d);

        BUG_ON((unsigned long)p4d & (PAGE_SIZE - 1));
        pagetable_dtor_free(pt);
}

#ifndef __HAVE_ARCH_P4D_FREE
/* Generic p4d_free(), used when the architecture does not define
 * __HAVE_ARCH_P4D_FREE. Nothing is freed when the P4D level is folded
 * for @mm; otherwise the table is released via __p4d_free().
 */
static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
        if (mm_p4d_folded(mm))
                return;

        __p4d_free(mm, p4d);
}
#endif

#endif /* CONFIG_PGTABLE_LEVELS > 4 */

/* Allocate a PGD-level page table of 2^@order pages and run
 * pagetable_pgd_ctor().
 *
 * Uses GFP_PGTABLE_KERNEL for init_mm and GFP_PGTABLE_USER otherwise, with
 * __GFP_HIGHMEM masked out.
 *
 * Returns the table's address, or NULL if the allocation failed.
 */
static inline pgd_t *__pgd_alloc_noprof(struct mm_struct *mm, unsigned int order)
{
        gfp_t gfp = (mm == &init_mm) ? GFP_PGTABLE_KERNEL : GFP_PGTABLE_USER;
        struct ptdesc *ptdesc;

        ptdesc = pagetable_alloc_noprof(gfp & ~__GFP_HIGHMEM, order);
        if (!ptdesc)
                return NULL;

        pagetable_pgd_ctor(ptdesc);
        return ptdesc_address(ptdesc);
}
#define __pgd_alloc(...)        alloc_hooks(__pgd_alloc_noprof(__VA_ARGS__))

/* Free a PGD-level page table. @pgd must be page aligned (enforced with
 * BUG_ON()); @mm is not used here.
 */
static inline void __pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        struct ptdesc *pt = virt_to_ptdesc(pgd);

        BUG_ON((unsigned long)pgd & (PAGE_SIZE - 1));
        pagetable_dtor_free(pt);
}

#ifndef __HAVE_ARCH_PGD_FREE
/* Generic pgd_free(), used when the architecture does not define
 * __HAVE_ARCH_PGD_FREE: forwards to __pgd_free().
 */
static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
        __pgd_free(mm, pgd);
}
#endif

#endif /* CONFIG_MMU */

#endif /* __ASM_GENERIC_PGALLOC_H */























































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
/* SPDX-License-Identifier: GPL-2.0-only */
/* include/net/xdp.h
 *
 * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
 */
#ifndef __LINUX_NET_XDP_H__
#define __LINUX_NET_XDP_H__

#include <linux/bitfield.h>
#include <linux/filter.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h> /* skb_shared_info */

#include <net/page_pool/types.h>

/**
 * DOC: XDP RX-queue information
 *
 * The XDP RX-queue info (xdp_rxq_info) is associated with the driver
 * level RX-ring queues.  It is information that is specific to how
 * the driver has configured a given RX-ring queue.
 *
 * Each xdp_buff frame received in the driver carries a (pointer)
 * reference to this xdp_rxq_info structure.  This provides the XDP
 * data-path read-access to RX-info for both kernel and bpf-side
 * (limited subset).
 *
 * For now, direct access is only safe while running in NAPI/softirq
 * context.  Contents are read-mostly and must not be updated during
 * driver NAPI/softirq poll.
 *
 * The driver usage API is a register and unregister API.
 *
 * The struct is not directly tied to the XDP prog.  A new XDP prog
 * can be attached as long as it doesn't change the underlying
 * RX-ring.  If the RX-ring does change significantly, the NIC driver
 * naturally needs to stop the RX-ring before purging and reallocating
 * memory.  In that process the driver MUST call unregister (which
 * also applies for driver shutdown and unload).  The register API is
 * also mandatory during RX-ring setup.
 */

/* Memory model behind an XDP buffer. The value is carried in
 * xdp_mem_info.type and in xdp_frame.mem_type.
 */
enum xdp_mem_type {
        MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
        MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
        MEM_TYPE_PAGE_POOL,
        MEM_TYPE_XSK_BUFF_POOL,
        MEM_TYPE_MAX, /* NOTE(review): presumably a count sentinel; keep last */
};

/* XDP flags for ndo_xdp_xmit. XDP_XMIT_FLAGS_MASK is the set of all defined
 * flags; currently that is XDP_XMIT_FLUSH alone.
 */
#define XDP_XMIT_FLUSH                (1U << 0)        /* doorbell signal consumer */
#define XDP_XMIT_FLAGS_MASK        XDP_XMIT_FLUSH

/* Describes the memory backing a buffer: its model plus an identifier.
 * NOTE(review): the exact meaning of @id is not visible in this header.
 */
struct xdp_mem_info {
        u32 type; /* enum xdp_mem_type, but known size type */
        u32 id;
};

struct page_pool;

/* Per RX-ring queue information; see "DOC: XDP RX-queue information" above.
 * Every xdp_buff points at one of these through xdp_buff.rxq.
 */
struct xdp_rxq_info {
        struct net_device *dev;
        u32 queue_index;
        u32 reg_state;
        struct xdp_mem_info mem;
        u32 frag_size;
} ____cacheline_aligned; /* perf critical, avoid false-sharing */

/* TX-side counterpart referenced by xdp_buff.txq; holds only the device. */
struct xdp_txq_info {
        struct net_device *dev;
};

/* Bit flags stored in xdp_buff.flags and xdp_frame.flags. */
enum xdp_buff_flags {
        XDP_FLAGS_HAS_FRAGS                = BIT(0), /* non-linear xdp buff */
        XDP_FLAGS_FRAGS_PF_MEMALLOC        = BIT(1), /* xdp paged memory is under
                                                   * pressure
                                                   */
};

/* Descriptor of one packet buffer.
 *
 * xdp_prepare_buff() places @data at data_hard_start + headroom and
 * @data_end at data + data_len. @data_meta equals @data when metadata is
 * valid and data + 1 otherwise.
 */
struct xdp_buff {
        void *data;
        void *data_end;
        void *data_meta;
        void *data_hard_start;
        struct xdp_rxq_info *rxq;
        struct xdp_txq_info *txq;
        u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
        u32 flags; /* supported values defined in xdp_buff_flags */
};

/* Return true if @xdp has XDP_FLAGS_HAS_FRAGS set (non-linear buffer). */
static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp)
{
        return (xdp->flags & XDP_FLAGS_HAS_FRAGS) != 0;
}

/* Mark @xdp as non-linear by setting XDP_FLAGS_HAS_FRAGS. */
static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
{
        xdp->flags = xdp->flags | XDP_FLAGS_HAS_FRAGS;
}

/* Clear XDP_FLAGS_HAS_FRAGS on @xdp; other flag bits are left untouched. */
static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
{
        xdp->flags = xdp->flags & ~XDP_FLAGS_HAS_FRAGS;
}

/* Return true if @xdp has XDP_FLAGS_FRAGS_PF_MEMALLOC set. */
static __always_inline bool
xdp_buff_is_frag_pfmemalloc(const struct xdp_buff *xdp)
{
        return (xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC) != 0;
}

/* Set XDP_FLAGS_FRAGS_PF_MEMALLOC on @xdp. */
static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp)
{
        xdp->flags = xdp->flags | XDP_FLAGS_FRAGS_PF_MEMALLOC;
}

/* Initialize the per-queue part of @xdp: record @frame_sz and @rxq and
 * clear all flags. The data pointers are not touched here; they are set
 * by xdp_prepare_buff().
 */
static __always_inline void
xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
{
        xdp->flags = 0;
        xdp->rxq = rxq;
        xdp->frame_sz = frame_sz;
}

/* Point @xdp at a packet inside the buffer starting at @hard_start.
 *
 * The packet begins @headroom bytes into the buffer and is @data_len bytes
 * long. With @meta_valid the metadata pointer is set to the packet start;
 * otherwise it is set one byte past the packet start.
 */
static __always_inline void
xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start,
                 int headroom, int data_len, const bool meta_valid)
{
        unsigned char *pkt = hard_start + headroom;

        xdp->data_hard_start = hard_start;
        xdp->data = pkt;
        xdp->data_end = pkt + data_len;

        if (meta_valid)
                xdp->data_meta = pkt;
        else
                xdp->data_meta = pkt + 1;
}

/* Reserve memory area at end-of data area.
 *
 * This macro reserves tailroom in the XDP buffer by limiting the
 * XDP/BPF data access to data_hard_end.  Notice same area (and size)
 * is used for XDP_PASS, when constructing the SKB via build_skb().
 *
 * The result is data_hard_start + frame_sz minus the aligned size of
 * struct skb_shared_info. @xdp is evaluated twice.
 */
#define xdp_data_hard_end(xdp)                                \
        ((xdp)->data_hard_start + (xdp)->frame_sz -        \
         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* Return the skb_shared_info area of @xdp, which starts at
 * xdp_data_hard_end() in the reserved tailroom.
 */
static inline struct skb_shared_info *
xdp_get_shared_info_from_buff(const struct xdp_buff *xdp)
{
        void *hard_end = xdp_data_hard_end(xdp);

        return (struct skb_shared_info *)hard_end;
}

static __always_inline unsigned int
xdp_get_buff_len(const struct xdp_buff *xdp)
{
        unsigned int len = xdp->data_end - xdp->data;
        const struct skb_shared_info *sinfo;

        if (likely(!xdp_buff_has_frags(xdp)))
                goto out;

        sinfo = xdp_get_shared_info_from_buff(xdp);
        len += sinfo->xdp_frags_size;
out:
        return len;
}

void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp);

/**
 * __xdp_buff_add_frag - attach frag to &xdp_buff
 * @xdp: XDP buffer to attach the frag to
 * @netmem: network memory containing the frag
 * @offset: offset at which the frag starts
 * @size: size of the frag
 * @truesize: total memory size occupied by the frag
 * @try_coalesce: whether to try coalescing the frags (not valid for XSk)
 *
 * Attach frag to the XDP buffer. If it currently has no frags attached,
 * initialize the related fields, otherwise check that the frag number
 * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing
 * the frag with the previous one.
 * The function doesn't check/update the pfmemalloc bit. Please use the
 * non-underscored wrapper in drivers.
 *
 * Return: true on success, false if there's no space for the frag in
 * the shared info struct.
 */
static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem,
                                       u32 offset, u32 size, u32 truesize,
                                       bool try_coalesce)
{
        struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
        skb_frag_t *prev;
        u32 nr_frags;

        /* First frag: flag the buff as non-linear and start the frag
         * accounting from zero, then jump straight to filling slot 0.
         */
        if (!xdp_buff_has_frags(xdp)) {
                xdp_buff_set_frags_flag(xdp);

                nr_frags = 0;
                sinfo->xdp_frags_size = 0;
                sinfo->xdp_frags_truesize = 0;

                goto fill;
        }

        nr_frags = sinfo->nr_frags;
        prev = &sinfo->frags[nr_frags - 1];

        /* The new chunk lives in the same netmem as the previous frag and
         * starts exactly where that frag ends: grow the previous frag
         * instead of consuming a new slot.
         */
        if (try_coalesce && netmem == skb_frag_netmem(prev) &&
            offset == skb_frag_off(prev) + skb_frag_size(prev)) {
                skb_frag_size_add(prev, size);
                /* Guaranteed to only decrement the refcount */
                xdp_return_frag(netmem, xdp);
        } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) {
                /* No free slot left; nothing has been modified. */
                return false;
        } else {
                /* Also entered via the goto from the first-frag path above. */
fill:
                __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem,
                                             offset, size);
        }

        /* Common accounting; nr_frags is unchanged when the frag was
         * coalesced.
         */
        sinfo->nr_frags = nr_frags;
        sinfo->xdp_frags_size += size;
        sinfo->xdp_frags_truesize += truesize;

        return true;
}

/**
 * xdp_buff_add_frag - attach frag to &xdp_buff
 * @xdp: XDP buffer to attach the frag to
 * @netmem: network memory containing the frag
 * @offset: offset at which the frag starts
 * @size: size of the frag
 * @truesize: total memory size occupied by the frag
 *
 * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit.
 * Coalescing with the previous frag is always attempted.
 *
 * Return: true on success, false if there's no space for the frag in
 * the shared info struct.
 */
static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem,
                                     u32 offset, u32 size, u32 truesize)
{
        bool added = __xdp_buff_add_frag(xdp, netmem, offset, size, truesize,
                                         true);

        /* Only propagate the pfmemalloc state for frags actually attached. */
        if (added && unlikely(netmem_is_pfmemalloc(netmem)))
                xdp_buff_set_frag_pfmemalloc(xdp);

        return added;
}

/* Frame descriptor for a packet that outlives the RX path.
 *
 * The helpers in this header compute the buffer start as
 * data - headroom - sizeof(struct xdp_frame).
 */
struct xdp_frame {
        void *data;
        u32 len;
        u32 headroom;
        u32 metasize; /* uses lower 8-bits */
        /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
         * while mem_type is valid on remote CPU.
         */
        enum xdp_mem_type mem_type:32;
        struct net_device *dev_rx; /* used by cpumap */
        u32 frame_sz;
        u32 flags; /* supported values defined in xdp_buff_flags */
};

/* Return true if @frame has XDP_FLAGS_HAS_FRAGS set (non-linear frame). */
static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame)
{
        return (frame->flags & XDP_FLAGS_HAS_FRAGS) != 0;
}

/* Return true if @frame has XDP_FLAGS_FRAGS_PF_MEMALLOC set. */
static __always_inline bool
xdp_frame_is_frag_pfmemalloc(const struct xdp_frame *frame)
{
        return (frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC) != 0;
}

/* Fixed-capacity batch of netmem references: @count entries of @q are in
 * use, and the capacity is XDP_BULK_QUEUE_SIZE.
 * NOTE(review): presumably used to return frames in bulk; the consumers are
 * not visible in this header.
 */
#define XDP_BULK_QUEUE_SIZE        16
struct xdp_frame_bulk {
        int count;
        netmem_ref q[XDP_BULK_QUEUE_SIZE];
};

/* Reset @bq to the empty state by zeroing its entry count; the queue slots
 * themselves are not cleared.
 */
static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq)
{
        bq->count = 0;
}

/* Return the skb_shared_info area belonging to @frame.
 *
 * The buffer start is recovered as data - headroom - sizeof(*frame); the
 * shared info sits at the end of the frame_sz bytes that follow, in the
 * aligned tail reserved for struct skb_shared_info.
 */
static inline struct skb_shared_info *
xdp_get_shared_info_from_frame(const struct xdp_frame *frame)
{
        void *hard_start = frame->data - frame->headroom - sizeof(*frame);
        void *hard_end = hard_start + frame->frame_sz -
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        return (struct skb_shared_info *)hard_end;
}

/* Counters for cpumap processing.
 * NOTE(review): presumably one counter per XDP verdict (redirect/pass/drop);
 * the code that updates them is not visible in this header.
 */
struct xdp_cpumap_stats {
        unsigned int redirect;
        unsigned int pass;
        unsigned int drop;
};

/* Clear the kernel pointers held in @frame (->data and ->dev_rx); every
 * other field is left as is.
 */
static inline void xdp_scrub_frame(struct xdp_frame *frame)
{
        frame->dev_rx = NULL;
        frame->data = NULL;
}

/*
 * Fold fragment accounting into @skb: record @nr_frags in the shared info,
 * grow len/data_len by @size and truesize by @truesize, and OR @pfmemalloc
 * into the skb's pfmemalloc marker.
 */
static inline void
xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags,
                           unsigned int size, unsigned int truesize,
                           bool pfmemalloc)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        /* Byte accounting on the skb itself. */
        skb->data_len += size;
        skb->len += size;
        skb->truesize += truesize;
        skb->pfmemalloc |= pfmemalloc;

        shinfo->nr_frags = nr_frags;
        /*
         * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``;
         * reset it now that those fields are no longer used.
         */
        shinfo->destructor_arg = NULL;
}

/* Avoids inlining WARN macro in fast-path */
void xdp_warn(const char *msg, const char *func, const int line);
/* Calls xdp_warn() with the call site's __func__ and __LINE__ filled in. */
#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__)

/*
 * Helpers that produce an sk_buff or xdp_frame from an xdp_buff/xdp_frame.
 * Only declared here; the bodies live outside this header.
 * NOTE(review): ownership of the input on failure is not visible from these
 * prototypes -- check the definitions.
 */
struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp);
struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp);
struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp);
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                           struct sk_buff *skb,
                                           struct net_device *dev);
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
                                         struct net_device *dev);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);

/*
 * Rebuild the pointer view of an xdp_buff from @frame. The rxq member of
 * @xdp is not written here.
 */
static inline
void xdp_convert_frame_to_buff(const struct xdp_frame *frame,
                               struct xdp_buff *xdp)
{
        void *pkt = frame->data;

        /* The frame header and the headroom sit in front of the packet. */
        xdp->data_hard_start = pkt - frame->headroom - sizeof(*frame);
        xdp->data = pkt;
        xdp->data_end = pkt + frame->len;
        xdp->data_meta = pkt - frame->metasize;
        xdp->frame_sz = frame->frame_sz;
        xdp->flags = frame->flags;
}

/*
 * Fill @xdp_frame from @xdp.
 *
 * Returns 0 on success, or -ENOSPC when the headroom left after the metadata
 * cannot hold a struct xdp_frame, or when data_end lies beyond
 * xdp_data_hard_end() (the latter also emits an XDP_WARN).
 * mem_type and dev_rx are not written here.
 */
static inline
int xdp_update_frame_from_buff(const struct xdp_buff *xdp,
                               struct xdp_frame *xdp_frame)
{
        int headroom = xdp->data - xdp->data_hard_start;
        int metasize = xdp->data - xdp->data_meta;

        /* data_meta > data (metadata unsupported) counts as no metadata */
        if (metasize < 0)
                metasize = 0;

        /* Assure headroom is available for storing info */
        if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
                return -ENOSPC;

        /* Catch if driver didn't reserve tailroom for skb_shared_info */
        if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
                XDP_WARN("Driver BUG: missing reserved tailroom");
                return -ENOSPC;
        }

        xdp_frame->data = xdp->data;
        xdp_frame->len = xdp->data_end - xdp->data;
        /* the frame header itself is carved out of the headroom */
        xdp_frame->headroom = headroom - sizeof(*xdp_frame);
        xdp_frame->metasize = metasize;
        xdp_frame->frame_sz = xdp->frame_sz;
        xdp_frame->flags = xdp->flags;

        return 0;
}

/*
 * Convert xdp_buff to xdp_frame.
 *
 * MEM_TYPE_XSK_BUFF_POOL buffers are handed to xdp_convert_zc_to_xdp_frame().
 * For every other memory type the frame header is written at
 * data_hard_start; NULL is returned if that fails.
 */
static inline
struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
{
        struct xdp_frame *frame;
        int err;

        if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
                return xdp_convert_zc_to_xdp_frame(xdp);

        /* Store info in top of packet */
        frame = xdp->data_hard_start;
        err = xdp_update_frame_from_buff(xdp, frame);
        if (unlikely(err < 0))
                return NULL;

        /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */
        frame->mem_type = xdp->rxq->mem.type;

        return frame;
}

/*
 * "Return" entry points for frames and buffers, declared here and defined
 * outside this header.
 * NOTE(review): judging by the names these give memory back to its
 * allocator; the exact context requirements of the _rx_napi and _bulk
 * variants are not visible here -- confirm against the definitions.
 */
void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
                  bool napi_direct, struct xdp_buff *xdp);
void xdp_return_frame(struct xdp_frame *xdpf);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
void xdp_return_buff(struct xdp_buff *xdp);
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
                           struct xdp_frame_bulk *bq);

/*
 * Hand every queued entry of @bq to page_pool_put_netmem_bulk() and mark the
 * queue empty. Does nothing when the queue holds no entries.
 */
static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
{
        if (likely(bq->count)) {
                page_pool_put_netmem_bulk(bq->q, bq->count);
                bq->count = 0;
        }
}

/*
 * Length of @xdpf: its own len, plus xdp_frags_size from the shared info
 * when the frame carries fragments.
 */
static __always_inline unsigned int
xdp_get_frame_len(const struct xdp_frame *xdpf)
{
        unsigned int total = xdpf->len;

        if (unlikely(xdp_frame_has_frags(xdpf))) {
                const struct skb_shared_info *shinfo;

                shinfo = xdp_get_shared_info_from_frame(xdpf);
                total += shinfo->xdp_frags_size;
        }

        return total;
}

int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
                       struct net_device *dev, u32 queue_index,
                       unsigned int napi_id, u32 frag_size);
/*
 * Convenience wrapper around __xdp_rxq_info_reg() that passes 0 for
 * frag_size and returns its result unchanged.
 */
static inline int
xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
                 struct net_device *dev, u32 queue_index,
                 unsigned int napi_id)
{
        return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0);
}

/*
 * RxQ info and memory-model (un)registration API, declared here and defined
 * outside this header. The xdp_rxq_info_*_mem_model() variants take the RxQ
 * info, while xdp_{reg,unreg}_mem_model() operate on a bare xdp_mem_info.
 */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
                               enum xdp_mem_type type, void *allocator);
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq);
int xdp_reg_mem_model(struct xdp_mem_info *mem,
                      enum xdp_mem_type type, void *allocator);
void xdp_unreg_mem_model(struct xdp_mem_info *mem);
int xdp_reg_page_pool(struct page_pool *pool);
void xdp_unreg_page_pool(const struct page_pool *pool);
void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
                                   const struct page_pool *pool);

/**
 * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info
 * @xdp_rxq: XDP RxQ info to attach the memory info to
 * @mem: already registered memory info
 *
 * If the driver registers its memory providers manually, it must use this
 * function instead of xdp_rxq_info_reg_mem_model().
 *
 * The memory info is copied by value into @xdp_rxq; no reference to @mem is
 * kept.
 */
static inline void
xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq,
                              const struct xdp_mem_info *mem)
{
        xdp_rxq->mem = *mem;
}

/**
 * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info
 * @xdp_rxq: XDP RxQ info to detach the memory info from
 *
 * If the driver registers its memory providers manually and then attaches it
 * via xdp_rxq_info_attach_mem_model(), it must call this function before
 * xdp_rxq_info_unreg().
 *
 * Detaching only zero-initializes the embedded memory info; nothing is
 * unregistered here.
 */
static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq)
{
        xdp_rxq->mem = (struct xdp_mem_info){ };
}

/* Drivers not supporting XDP metadata can use this helper, which
 * rejects any room expansion for metadata as a result.
 *
 * The marker is data_meta == data + 1, i.e. data_meta > data, which is the
 * condition xdp_data_meta_unsupported() tests for.
 */
static __always_inline void
xdp_set_data_meta_invalid(struct xdp_buff *xdp)
{
        xdp->data_meta = xdp->data + 1;
}

/*
 * True when data_meta lies past data, which is the state
 * xdp_set_data_meta_invalid() establishes. Hinted as the unlikely case.
 */
static __always_inline bool
xdp_data_meta_unsupported(const struct xdp_buff *xdp)
{
        return unlikely(xdp->data_meta > xdp->data);
}

static inline bool xdp_metalen_invalid(unsigned long metalen)
{
        unsigned long meta_max;

        meta_max = type_max(typeof_member(struct skb_shared_info, meta_len));
        BUILD_BUG_ON(!__builtin_constant_p(meta_max));

        return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max;
}

/*
 * A BPF program pointer paired with a flags word; filled in through
 * xdp_attachment_setup().
 * NOTE(review): the meaning of @flags is not defined in this header.
 */
struct xdp_attachment_info {
        struct bpf_prog *prog;
        u32 flags;
};

/* Forward declaration: only a pointer to struct netdev_bpf is needed below. */
struct netdev_bpf;
void xdp_attachment_setup(struct xdp_attachment_info *info,
                          struct netdev_bpf *bpf);

/* Alias of XDP_BULK_QUEUE_SIZE. */
#define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE

/* Define the relationship between xdp-rx-metadata kfunc and
 * various other entities:
 * - xdp_rx_metadata enum
 * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml)
 * - kfunc name
 * - xdp_metadata_ops field
 *
 * This is an X-macro list: a user defines XDP_METADATA_KFUNC(a, b, c, d)
 * to pick the columns it needs, expands XDP_METADATA_KFUNC_xxx, and then
 * undefines XDP_METADATA_KFUNC again (see enum xdp_rx_metadata).
 */
#define XDP_METADATA_KFUNC_xxx        \
        XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \
                           NETDEV_XDP_RX_METADATA_TIMESTAMP, \
                           bpf_xdp_metadata_rx_timestamp, \
                           xmo_rx_timestamp) \
        XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \
                           NETDEV_XDP_RX_METADATA_HASH, \
                           bpf_xdp_metadata_rx_hash, \
                           xmo_rx_hash) \
        XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \
                           NETDEV_XDP_RX_METADATA_VLAN_TAG, \
                           bpf_xdp_metadata_rx_vlan_tag, \
                           xmo_rx_vlan_tag) \

/*
 * One enumerator per XDP_METADATA_KFUNC_xxx entry (the entry's first
 * argument), in list order starting at 0. MAX_XDP_METADATA_KFUNC follows
 * the last entry and therefore equals the number of entries.
 */
enum xdp_rx_metadata {
#define XDP_METADATA_KFUNC(name, _, __, ___) name,
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC
MAX_XDP_METADATA_KFUNC,
};

/*
 * RSS hash type reported through xmo_rx_hash(). The values are a bitmask:
 * the first part defines the individual bits, the second part names the
 * combinations built from them.
 */
enum xdp_rss_hash_type {
        /* First part: Individual bits for L3/L4 types */
        XDP_RSS_L3_IPV4                = BIT(0),
        XDP_RSS_L3_IPV6                = BIT(1),

        /* The fixed (L3) IPv4 and IPv6 headers can both be followed by
         * variable/dynamic headers, called Options for IPv4 and Extension
         * Headers for IPv6. HW RSS type can contain this info.
         */
        XDP_RSS_L3_DYNHDR        = BIT(2),

        /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in
         * addition to the protocol specific bit.  This eases interaction with
         * SKBs and avoids reserving a fixed mask for future L4 protocol bits.
         */
        XDP_RSS_L4                = BIT(3), /* L4 based hash, proto can be unknown */
        XDP_RSS_L4_TCP                = BIT(4),
        XDP_RSS_L4_UDP                = BIT(5),
        XDP_RSS_L4_SCTP                = BIT(6),
        XDP_RSS_L4_IPSEC        = BIT(7), /* L4 based hash include IPSEC SPI */
        XDP_RSS_L4_ICMP                = BIT(8),

        /* Second part: RSS hash type combinations used for driver HW mapping */
        XDP_RSS_TYPE_NONE            = 0,
        XDP_RSS_TYPE_L2              = XDP_RSS_TYPE_NONE,

        XDP_RSS_TYPE_L3_IPV4         = XDP_RSS_L3_IPV4,
        XDP_RSS_TYPE_L3_IPV6         = XDP_RSS_L3_IPV6,
        XDP_RSS_TYPE_L3_IPV4_OPT     = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR,
        XDP_RSS_TYPE_L3_IPV6_EX      = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR,

        XDP_RSS_TYPE_L4_ANY          = XDP_RSS_L4,
        XDP_RSS_TYPE_L4_IPV4_TCP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
        XDP_RSS_TYPE_L4_IPV4_UDP     = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
        XDP_RSS_TYPE_L4_IPV4_SCTP    = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
        XDP_RSS_TYPE_L4_IPV4_IPSEC   = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC,
        XDP_RSS_TYPE_L4_IPV4_ICMP    = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP,

        XDP_RSS_TYPE_L4_IPV6_TCP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP,
        XDP_RSS_TYPE_L4_IPV6_UDP     = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP,
        XDP_RSS_TYPE_L4_IPV6_SCTP    = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP,
        XDP_RSS_TYPE_L4_IPV6_IPSEC   = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC,
        XDP_RSS_TYPE_L4_IPV6_ICMP    = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP,

        /* IPv6 L4 combinations with extension headers present */
        XDP_RSS_TYPE_L4_IPV6_TCP_EX  = XDP_RSS_TYPE_L4_IPV6_TCP  | XDP_RSS_L3_DYNHDR,
        XDP_RSS_TYPE_L4_IPV6_UDP_EX  = XDP_RSS_TYPE_L4_IPV6_UDP  | XDP_RSS_L3_DYNHDR,
        XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR,
};

/*
 * Callback table for RX metadata. The member names are the fourth column of
 * XDP_METADATA_KFUNC_xxx. Each callback receives the XDP context and writes
 * its result through the output pointer(s), returning an int status.
 * NOTE(review): the status convention (presumably 0 / negative errno) is not
 * stated in this header.
 */
struct xdp_metadata_ops {
        int        (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp);
        int        (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash,
                               enum xdp_rss_hash_type *rss_type);
        int        (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto,
                                   u16 *vlan_tci);
};

/*
 * With CONFIG_NET these are real functions defined elsewhere. Without it
 * they collapse to stubs: the id lookups return 0 / false and the feature
 * setters do nothing.
 */
#ifdef CONFIG_NET
u32 bpf_xdp_metadata_kfunc_id(int id);
bool bpf_dev_bound_kfunc_id(u32 btf_id);
void xdp_set_features_flag(struct net_device *dev, xdp_features_t val);
void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg);
void xdp_features_clear_redirect_target(struct net_device *dev);
#else
static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; }
static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; }

static inline void
xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
{
}

static inline void
xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
{
}

static inline void
xdp_features_clear_redirect_target(struct net_device *dev)
{
}
#endif

/* Shorthand for xdp_set_features_flag(dev, 0). */
static inline void xdp_clear_features_flag(struct net_device *dev)
{
        xdp_set_features_flag(dev, 0);
}

/*
 * Run @prog on @xdp and return its verdict.
 *
 * When bpf_master_redirect_enabled_key is enabled and the program returned
 * XDP_TX for a packet received on a bond slave device, the verdict is
 * replaced by the result of xdp_master_redirect().
 */
static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
                                            struct xdp_buff *xdp)
{
        /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus
         * under local_bh_disable(), which provides the needed RCU protection
         * for accessing map entries.
         */
        u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp));

        /* Static branch: the bond check is skipped unless the key is on. */
        if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) {
                if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev))
                        act = xdp_master_redirect(xdp);
        }

        return act;
}
#endif /* __LINUX_NET_XDP_H__ */








































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BITOPS_H
#define _LINUX_BITOPS_H

#include <asm/types.h>
#include <linux/bits.h>
#include <linux/typecheck.h>

#include <uapi/linux/kernel.h>

/*
 * BITS_PER_TYPE(type): width of @type in bits.
 * BITS_TO_{LONGS,U64,U32,BYTES}(nr): number of longs/u64s/u32s/bytes needed
 * to hold @nr bits, rounded up.
 */
#define BITS_PER_TYPE(type)        (sizeof(type) * BITS_PER_BYTE)
#define BITS_TO_LONGS(nr)        __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long))
#define BITS_TO_U64(nr)                __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64))
#define BITS_TO_U32(nr)                __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32))
#define BITS_TO_BYTES(nr)        __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char))

/* BYTES_TO_BITS(nb): number of bits in @nb bytes. */
#define BYTES_TO_BITS(nb)        ((nb) * BITS_PER_BYTE)

/*
 * Out-of-line population-count helpers for 8/16/32/64-bit values, defined
 * elsewhere. NOTE(review): "sw" presumably means the software fallback used
 * when no hardware popcount is available -- confirm at the definition.
 */
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned long __sw_hweight64(__u64 w);

/*
 * Defined here because those may be needed by architecture-specific static
 * inlines.
 */

#include <asm-generic/bitops/generic-non-atomic.h>

/*
 * Many architecture-specific non-atomic bitops contain inline asm code and,
 * because of that, the compiler can't optimize them to compile-time
 * expressions or constants. In contrast, the generic_*() helpers are defined
 * in pure C and compilers optimize them just fine.
 * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively
 * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when
 * the arguments can be resolved at compile time. The selecting expression is
 * itself a compile-time constant, so it brings no functional change to all
 * other cases.
 * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when
 * passing a bitmap from .bss or .data (-> `!!addr` is always true).
 *
 * bitop(op, nr, addr) expands to const##op(nr, addr) when @nr, the non-NULL
 * @addr and the word @addr points to are all compile-time constants, and to
 * op(nr, addr) otherwise.
 */
#define bitop(op, nr, addr)                                                \
        ((__builtin_constant_p(nr) &&                                        \
          __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) &&        \
          (uintptr_t)(addr) != (uintptr_t)NULL &&                        \
          __builtin_constant_p(*(const unsigned long *)(addr))) ?        \
         const##op(nr, addr) : op(nr, addr))

/*
 * The following macros are non-atomic versions of their non-underscored
 * counterparts. Each one goes through bitop(), so it resolves to the
 * const_*() helper for compile-time-constant arguments and to the _*()
 * helper otherwise.
 */
#define __set_bit(nr, addr)                bitop(___set_bit, nr, addr)
#define __clear_bit(nr, addr)                bitop(___clear_bit, nr, addr)
#define __change_bit(nr, addr)                bitop(___change_bit, nr, addr)
#define __test_and_set_bit(nr, addr)        bitop(___test_and_set_bit, nr, addr)
#define __test_and_clear_bit(nr, addr)        bitop(___test_and_clear_bit, nr, addr)
#define __test_and_change_bit(nr, addr)        bitop(___test_and_change_bit, nr, addr)

/* Bit tests use the same compile-time/run-time selection. */
#define test_bit(nr, addr)                bitop(_test_bit, nr, addr)
#define test_bit_acquire(nr, addr)        bitop(_test_bit_acquire, nr, addr)

/*
 * Include this here because some architectures need generic_ffs/fls in
 * scope
 */
#include <asm/bitops.h>

/*
 * Check that the bitops prototypes are sane: for every operation the
 * arch_*(), const_*() and _*() variants must have the same type as the
 * generic_*() one. The helper macro is undefined again after use.
 */
#define __check_bitop_pr(name)                                                \
        static_assert(__same_type(arch_##name, generic_##name) &&        \
                      __same_type(const_##name, generic_##name) &&        \
                      __same_type(_##name, generic_##name))

__check_bitop_pr(__set_bit);
__check_bitop_pr(__clear_bit);
__check_bitop_pr(__change_bit);
__check_bitop_pr(__test_and_set_bit);
__check_bitop_pr(__test_and_clear_bit);
__check_bitop_pr(__test_and_change_bit);
__check_bitop_pr(test_bit);
__check_bitop_pr(test_bit_acquire);

#undef __check_bitop_pr

/* Return fls(@count) unchanged. */
static inline int get_bitmask_order(unsigned int count)
{
        /* We could be slightly more clever with -1 here... */
        return fls(count);
}

/* Population count of an unsigned long, dispatched on the width of long. */
static __always_inline unsigned long hweight_long(unsigned long w)
{
        if (sizeof(w) == 4)
                return hweight32(w);

        return hweight64((__u64)w);
}

/**
 * rol64 - rotate a 64-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 6 bits are used
 */
static inline __u64 rol64(__u64 word, unsigned int shift)
{
        const unsigned int up = shift & 63;
        const unsigned int down = (-shift) & 63;

        return (word << up) | (word >> down);
}

/**
 * ror64 - rotate a 64-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 6 bits are used
 */
static inline __u64 ror64(__u64 word, unsigned int shift)
{
        const unsigned int down = shift & 63;
        const unsigned int up = (-shift) & 63;

        return (word >> down) | (word << up);
}

/**
 * rol32 - rotate a 32-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 5 bits are used
 */
static inline __u32 rol32(__u32 word, unsigned int shift)
{
        const unsigned int up = shift & 31;
        const unsigned int down = (-shift) & 31;

        return (word << up) | (word >> down);
}

/**
 * ror32 - rotate a 32-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 5 bits are used
 */
static inline __u32 ror32(__u32 word, unsigned int shift)
{
        const unsigned int down = shift & 31;
        const unsigned int up = (-shift) & 31;

        return (word >> down) | (word << up);
}

/**
 * rol16 - rotate a 16-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 4 bits are used
 */
static inline __u16 rol16(__u16 word, unsigned int shift)
{
        const unsigned int up = shift & 15;
        const unsigned int down = (-shift) & 15;

        /* Bits shifted above bit 15 are dropped by the __u16 return type. */
        return (word << up) | (word >> down);
}

/**
 * ror16 - rotate a 16-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 4 bits are used
 */
static inline __u16 ror16(__u16 word, unsigned int shift)
{
        const unsigned int down = shift & 15;
        const unsigned int up = (-shift) & 15;

        /* Bits shifted above bit 15 are dropped by the __u16 return type. */
        return (word >> down) | (word << up);
}

/**
 * rol8 - rotate an 8-bit value left
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 3 bits are used
 */
static inline __u8 rol8(__u8 word, unsigned int shift)
{
        const unsigned int up = shift & 7;
        const unsigned int down = (-shift) & 7;

        /* Bits shifted above bit 7 are dropped by the __u8 return type. */
        return (word << up) | (word >> down);
}

/**
 * ror8 - rotate an 8-bit value right
 * @word: value to rotate
 * @shift: number of bit positions to rotate by; only the low 3 bits are used
 */
static inline __u8 ror8(__u8 word, unsigned int shift)
{
        const unsigned int down = shift & 7;
        const unsigned int up = (-shift) & 7;

        /* Bits shifted above bit 7 are dropped by the __u8 return type. */
        return (word >> down) | (word << up);
}

/**
 * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit
 * @value: value to sign extend
 * @index: 0 based bit index (0<=index<32) to sign bit
 *
 * Bits above @index in @value are discarded. This is safe to use for 16- and
 * 8-bit types as well.
 */
static __always_inline __s32 sign_extend32(__u32 value, int index)
{
        const __u8 shift = 31 - index;
        /* Move the chosen sign bit up to bit 31 ... */
        const __s32 aligned = (__s32)(value << shift);

        /* ... and let the signed right shift replicate it back down. */
        return aligned >> shift;
}

/**
 * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit
 * @value: value to sign extend
 * @index: 0 based bit index (0<=index<64) to sign bit
 *
 * Bits above @index in @value are discarded.
 */
static __always_inline __s64 sign_extend64(__u64 value, int index)
{
        const __u8 shift = 63 - index;
        /* Move the chosen sign bit up to bit 63 ... */
        const __s64 aligned = (__s64)(value << shift);

        /* ... and let the signed right shift replicate it back down. */
        return aligned >> shift;
}

/* fls() for an unsigned long: fls() when long is 4 bytes wide, fls64() otherwise. */
static inline unsigned int fls_long(unsigned long l)
{
        if (sizeof(l) != 4)
                return fls64(l);

        return fls(l);
}

/*
 * Return fls(@count - 1) for a non-zero @count, and -1 when @count is 0.
 */
static inline int get_count_order(unsigned int count)
{
        return count ? fls(count - 1) : -1;
}

/**
 * get_count_order_long - get order after rounding @l up to power of 2
 * @l: parameter
 *
 * Same as get_count_order(), but for an unsigned long argument: returns
 * fls_long(@l - 1), or -1 when @l is 0.
 */
static inline int get_count_order_long(unsigned long l)
{
        return l ? (int)fls_long(l - 1) : -1;
}

/**
 * parity8 - get the parity of an u8 value
 * @val: the value to be examined
 *
 * Determine the parity of the u8 argument.
 *
 * Returns:
 * 0 for even parity, 1 for odd parity
 *
 * Note: This function reports the current parity. Example to bail out when
 * parity is odd:
 *
 *        if (parity8(val) == 1)
 *                return -EBADMSG;
 *
 * If you need to calculate a parity bit, you have to derive it from this
 * result yourself. Example to enforce odd parity, parity bit is bit 7:
 *
 *        if (parity8(val) == 0)
 *                val ^= BIT(7);
 */
static inline int parity8(u8 val)
{
        /*
         * One explanation of this algorithm:
         * https://funloop.org/codex/problem/parity/README.html
         *
         * Fold the high nibble onto the low one, then use the folded nibble
         * as an index into the 16-entry bit table 0x6996.
         */
        const u8 nibble = (val ^ (val >> 4)) & 0xf;

        return (0x6996 >> nibble) & 1;
}

/**
 * __ffs64 - find first set bit in a 64 bit word
 * @word: The 64 bit word
 *
 * On 64 bit arches this is a synonym for __ffs
 * The result is not defined if no bits are set, so check that @word
 * is non-zero before calling this.
 */
static inline unsigned int __ffs64(u64 word)
{
#if BITS_PER_LONG == 32
        /* Low half empty: search the high half and offset the result by 32. */
        if (((u32)word) == 0UL)
                return __ffs((u32)(word >> 32)) + 32;
#elif BITS_PER_LONG != 64
#error BITS_PER_LONG not 32 or 64
#endif
        /* On 32-bit this cast keeps the (non-zero) low half only. */
        return __ffs((unsigned long)word);
}

/**
 * fns - find N'th set bit in a word
 * @word: The word to search
 * @n: Bit to find (0 selects the lowest set bit)
 *
 * Returns the bit number of the selected set bit, or BITS_PER_LONG when
 * @word has no more than @n bits set.
 */
static inline unsigned int fns(unsigned long word, unsigned int n)
{
        /* Strip the lowest set bit @n times, or until nothing is left. */
        for (; word && n; n--)
                word &= word - 1;

        if (!word)
                return BITS_PER_LONG;

        return __ffs(word);
}

/**
 * assign_bit - Assign value to a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 * @value: the value to assign
 *
 * Expands to set_bit() when @value is non-zero and to clear_bit() otherwise.
 */
#define assign_bit(nr, addr, value)                                        \
        ((value) ? set_bit((nr), (addr)) : clear_bit((nr), (addr)))

/* Same selection as assign_bit(), using __set_bit() / __clear_bit(). */
#define __assign_bit(nr, addr, value)                                        \
        ((value) ? __set_bit((nr), (addr)) : __clear_bit((nr), (addr)))

/**
 * __ptr_set_bit - Set bit in a pointer's value
 * @nr: the bit to set
 * @addr: the address of the pointer variable
 *
 * The pointer variable is reinterpreted as an unsigned long and passed to
 * __set_bit(). typecheck_pointer() is applied to *@addr first (presumably to
 * reject non-pointer objects at build time -- see its definition).
 *
 * Example:
 *        void *p = foo();
 *        __ptr_set_bit(bit, &p);
 */
#define __ptr_set_bit(nr, addr)                         \
        ({                                              \
                typecheck_pointer(*(addr));             \
                __set_bit(nr, (unsigned long *)(addr)); \
        })

/**
 * __ptr_clear_bit - Clear bit in a pointer's value
 * @nr: the bit to clear
 * @addr: the address of the pointer variable
 *
 * The pointer variable is reinterpreted as an unsigned long and passed to
 * __clear_bit(). typecheck_pointer() is applied to *@addr first (presumably
 * to reject non-pointer objects at build time -- see its definition).
 *
 * Example:
 *        void *p = foo();
 *        __ptr_clear_bit(bit, &p);
 */
#define __ptr_clear_bit(nr, addr)                         \
        ({                                                \
                typecheck_pointer(*(addr));               \
                __clear_bit(nr, (unsigned long *)(addr)); \
        })

/**
 * __ptr_test_bit - Test bit in a pointer's value
 * @nr: the bit to test
 * @addr: the address of the pointer variable
 *
 * The pointer variable is reinterpreted as an unsigned long and passed to
 * test_bit(); the macro evaluates to test_bit()'s result.
 *
 * Example:
 *        void *p = foo();
 *        if (__ptr_test_bit(bit, &p)) {
 *                ...
 *        } else {
 *                ...
 *        }
 */
#define __ptr_test_bit(nr, addr)                       \
        ({                                             \
                typecheck_pointer(*(addr));            \
                test_bit(nr, (unsigned long *)(addr)); \
        })

#ifdef __KERNEL__

/*
 * set_mask_bits(ptr, mask, bits): replace *@ptr with
 * (old & ~@mask) | @bits using a try_cmpxchg() retry loop, and evaluate to
 * the old value. @bits is OR-ed in as given and is not masked with @mask.
 * @mask and @bits are each evaluated once.
 * NOTE(review): the loop relies on try_cmpxchg() refreshing old__ when it
 * fails -- confirm against its definition.
 * An architecture may provide its own definition; this one is the fallback.
 */
#ifndef set_mask_bits
#define set_mask_bits(ptr, mask, bits)        \
({                                                                \
        const typeof(*(ptr)) mask__ = (mask), bits__ = (bits);        \
        typeof(*(ptr)) old__, new__;                                \
                                                                \
        old__ = READ_ONCE(*(ptr));                                \
        do {                                                        \
                new__ = (old__ & ~mask__) | bits__;                \
        } while (!try_cmpxchg(ptr, &old__, new__));                \
                                                                \
        old__;                                                        \
})
#endif

/*
 * bit_clear_unless(ptr, clear, test): clear the @clear bits in *@ptr with a
 * try_cmpxchg() retry loop, unless any @test bit is set in the current
 * value, in which case nothing is written. Evaluates to true when no @test
 * bit was set (the clear was performed) and to false otherwise.
 * NOTE(review): the loop relies on try_cmpxchg() refreshing old__ when it
 * fails -- confirm against its definition.
 * An architecture may provide its own definition; this one is the fallback.
 */
#ifndef bit_clear_unless
#define bit_clear_unless(ptr, clear, test)        \
({                                                                \
        const typeof(*(ptr)) clear__ = (clear), test__ = (test);\
        typeof(*(ptr)) old__, new__;                                \
                                                                \
        old__ = READ_ONCE(*(ptr));                                \
        do {                                                        \
                if (old__ & test__)                                \
                        break;                                        \
                new__ = old__ & ~clear__;                        \
        } while (!try_cmpxchg(ptr, &old__, new__));                \
                                                                \
        !(old__ & test__);                                        \
})
#endif

#endif /* __KERNEL__ */
#endif




















































































  185 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Prevent the compiler from merging or refetching reads or writes. The
 * compiler is also forbidden from reordering successive instances of
 * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
 * particular ordering. One way to make the compiler aware of ordering is to
 * put the two invocations of READ_ONCE or WRITE_ONCE in different C
 * statements.
 *
 * These two macros will also work on aggregate data types like structs or
 * unions.
 *
 * Their two major use cases are: (1) Mediating communication between
 * process-level code and irq/NMI handlers, all running on the same CPU,
 * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
 * mutilate accesses that either do not require ordering or that interact
 * with an explicit memory barrier or atomic instruction that provides the
 * required ordering.
 */
#ifndef __ASM_GENERIC_RWONCE_H
#define __ASM_GENERIC_RWONCE_H

#ifndef __ASSEMBLY__

#include <linux/compiler_types.h>
#include <linux/kasan-checks.h>
#include <linux/kcsan-checks.h>

/*
 * Build-time size check used by READ_ONCE() and WRITE_ONCE(): the operand
 * must satisfy __native_word() or be exactly sizeof(long long) bytes.
 *
 * Yes, this permits 64-bit accesses on 32-bit architectures. These will
 * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
 * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
 * (e.g. a virtual address) and a strong prevailing wind.
 */
#define compiletime_assert_rwonce_type(t)                                        \
        compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),        \
                "Unsupported access size for {READ,WRITE}_ONCE().")

/*
 * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
 * atomicity. Note that this may result in tears!
 *
 * The load goes through a const volatile lvalue of the operand's
 * unqualified scalar type; no size check is performed here. The #ifndef
 * lets an earlier definition take precedence.
 */
#ifndef __READ_ONCE
#define __READ_ONCE(x)        (*(const volatile __unqual_scalar_typeof(x) *)&(x))
#endif

/* Size-checked volatile load: compiletime_assert_rwonce_type() followed by __READ_ONCE(). */
#define READ_ONCE(x)                                                        \
({                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __READ_ONCE(x);                                                        \
})

/* Store @val to @x through a volatile lvalue of @x's type; no size check is performed. */
#define __WRITE_ONCE(x, val)                                                \
do {                                                                        \
        *(volatile typeof(x) *)&(x) = (val);                                \
} while (0)

/* Size-checked volatile store: compiletime_assert_rwonce_type() followed by __WRITE_ONCE(). */
#define WRITE_ONCE(x, val)                                                \
do {                                                                        \
        compiletime_assert_rwonce_type(x);                                \
        __WRITE_ONCE(x, val);                                                \
} while (0)

/*
 * Load one unsigned long from @addr with __READ_ONCE(). Marked
 * __no_sanitize_or_inline; this is the helper behind READ_ONCE_NOCHECK().
 */
static __no_sanitize_or_inline
unsigned long __read_once_word_nocheck(const void *addr)
{
        return __READ_ONCE(*(unsigned long *)addr);
}

/*
 * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
 * word from memory atomically but without telling KASAN/KCSAN. This is
 * usually used by unwinding code when walking the stack of a running process.
 *
 * The operand must be exactly sizeof(unsigned long) bytes (checked at build
 * time); the loaded word is cast back to the operand's type.
 */
#define READ_ONCE_NOCHECK(x)                                                \
({                                                                        \
        compiletime_assert(sizeof(x) == sizeof(unsigned long),                \
                "Unsupported access size for READ_ONCE_NOCHECK().");        \
        (typeof(x))__read_once_word_nocheck(&(x));                        \
})

/*
 * Load a whole unsigned long from @addr with a plain (non-volatile) load.
 * Only the first byte is reported to KASAN/KCSAN, via the explicit
 * single-byte checks below.
 */
static __no_sanitize_or_inline
unsigned long read_word_at_a_time(const void *addr)
{
        /* open-coded instrument_read(addr, 1) */
        kasan_check_read(addr, 1);
        kcsan_check_read(addr, 1);

        /*
         * This load can race with concurrent stores to out-of-bounds memory,
         * but READ_ONCE() can't be used because it requires higher alignment
         * than plain loads in arm64 builds with LTO.
         */
        return *(unsigned long *)addr;
}

#endif /* __ASSEMBLY__ */
#endif        /* __ASM_GENERIC_RWONCE_H */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 














    3 




















    3 























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                IPv4 Forwarding Information Base: FIB frontend.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <trace/events/fib.h>

#ifndef CONFIG_IP_MULTIPLE_TABLES

/*
 * Create the main and local FIB tables for @net and publish both in the
 * per-namespace table hash. The local table is created with the main table
 * passed as its alias.
 *
 * Returns 0 on success or -ENOMEM if either table cannot be created.
 */
static int __net_init fib4_rules_init(struct net *net)
{
        struct hlist_head *buckets = net->ipv4.fib_table_hash;
        struct fib_table *tb_main, *tb_local;

        tb_main = fib_trie_table(RT_TABLE_MAIN, NULL);
        if (!tb_main)
                return -ENOMEM;

        tb_local = fib_trie_table(RT_TABLE_LOCAL, tb_main);
        if (!tb_local) {
                /* Undo the first allocation before bailing out. */
                fib_free_table(tb_main);
                return -ENOMEM;
        }

        hlist_add_head_rcu(&tb_local->tb_hlist, &buckets[TABLE_LOCAL_INDEX]);
        hlist_add_head_rcu(&tb_main->tb_hlist, &buckets[TABLE_MAIN_INDEX]);

        return 0;
}
#else

/*
 * Return the FIB table with the given @id, creating and publishing it if it
 * does not exist yet. An @id of 0 is treated as RT_TABLE_MAIN.
 *
 * Returns the table, or NULL if a new table could not be created.
 */
struct fib_table *fib_new_table(struct net *net, u32 id)
{
        struct fib_table *merged_into = NULL;
        struct fib_table *table;

        if (!id)
                id = RT_TABLE_MAIN;

        table = fib_get_table(net, id);
        if (table)
                return table;

        /*
         * Without custom rules the local table is created as an alias of
         * the main table, so make sure the main table exists first.
         */
        if (id == RT_TABLE_LOCAL && !net->ipv4.fib_has_custom_rules)
                merged_into = fib_new_table(net, RT_TABLE_MAIN);

        table = fib_trie_table(id, merged_into);
        if (!table)
                return NULL;

        /* The main and default tables also have dedicated shortcut pointers. */
        if (id == RT_TABLE_MAIN)
                rcu_assign_pointer(net->ipv4.fib_main, table);
        else if (id == RT_TABLE_DEFAULT)
                rcu_assign_pointer(net->ipv4.fib_default, table);

        hlist_add_head_rcu(&table->tb_hlist,
                           &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)]);

        return table;
}
EXPORT_SYMBOL_GPL(fib_new_table);

/*
 * Look up the FIB table with the given @id in @net's table hash. An @id of
 * 0 is treated as RT_TABLE_MAIN. Returns NULL when no such table exists.
 *
 * The caller must hold either rtnl or the rcu read lock.
 */
struct fib_table *fib_get_table(struct net *net, u32 id)
{
        struct hlist_head *bucket;
        struct fib_table *entry;

        if (!id)
                id = RT_TABLE_MAIN;

        bucket = &net->ipv4.fib_table_hash[id & (FIB_TABLE_HASHSZ - 1)];

        hlist_for_each_entry_rcu(entry, bucket, tb_hlist,
                                 lockdep_rtnl_is_held()) {
                if (entry->tb_id != id)
                        continue;

                return entry;
        }

        return NULL;
}
#endif /* CONFIG_IP_MULTIPLE_TABLES */

/*
 * Swap @old for @new in the table hash list. With multiple tables enabled,
 * the main/default shortcut pointers are repointed first when @new is one
 * of those tables.
 */
static void fib_replace_table(struct net *net, struct fib_table *old,
                              struct fib_table *new)
{
#ifdef CONFIG_IP_MULTIPLE_TABLES
        if (new->tb_id == RT_TABLE_MAIN)
                rcu_assign_pointer(net->ipv4.fib_main, new);
        else if (new->tb_id == RT_TABLE_DEFAULT)
                rcu_assign_pointer(net->ipv4.fib_default, new);
#endif

        /* Put the new table in the old one's place in the hlist. */
        hlist_replace_rcu(&old->tb_hlist, &new->tb_hlist);
}

/*
 * Split the local table away from the main table, if the two are merged.
 *
 * Returns 0 on success (including when there is nothing to do) or -ENOMEM
 * when the unmerged copy cannot be created.
 */
int fib_unmerge(struct net *net)
{
        struct fib_table *merged, *split, *main_tb;

        /* Nothing to do if the local table was never allocated. */
        merged = fib_get_table(net, RT_TABLE_LOCAL);
        if (!merged)
                return 0;

        split = fib_trie_unmerge(merged);
        if (!split)
                return -ENOMEM;

        /* Getting the same table back means it was already unmerged. */
        if (split != merged) {
                /* Publish the clean table and drop the merged one. */
                fib_replace_table(net, merged, split);
                fib_free_table(merged);

                /* Flush local entries from the main table, if it exists. */
                main_tb = fib_get_table(net, RT_TABLE_MAIN);
                if (main_tb)
                        fib_table_flush_external(main_tb);
        }

        return 0;
}

void fib_flush(struct net *net)
{
        int flushed = 0;
        unsigned int h;

        for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
                struct hlist_head *head = &net->ipv4.fib_table_hash[h];
                struct hlist_node *tmp;
                struct fib_table *tb;

                hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
                        flushed += fib_table_flush(net, tb, false);
        }

        if (flushed)
                rt_cache_flush(net);
}

/*
 * Find address type as if only "dev" was present in the system. If
 * dev is NULL then all interfaces are taken into consideration.
 *
 * Zeronet and limited-broadcast addresses yield RTN_BROADCAST and multicast
 * addresses yield RTN_MULTICAST without any table lookup. Otherwise table
 * @tb_id is consulted under the RCU read lock:
 *  - no such table: RTN_BROADCAST
 *  - lookup fails, or the first nexthop is on another device: RTN_UNICAST
 *  - otherwise: the type of the matching route
 */
static inline unsigned int __inet_dev_addr_type(struct net *net,
                                                const struct net_device *dev,
                                                __be32 addr, u32 tb_id)
{
        struct flowi4 key = { .daddr = addr };
        unsigned int type = RTN_BROADCAST;
        struct fib_nh_common *nhc;
        struct fib_result res;
        struct fib_table *tb;

        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
                return RTN_BROADCAST;
        if (ipv4_is_multicast(addr))
                return RTN_MULTICAST;

        rcu_read_lock();

        tb = fib_get_table(net, tb_id);
        if (!tb)
                goto unlock;

        type = RTN_UNICAST;
        if (fib_table_lookup(tb, &key, &res, FIB_LOOKUP_NOREF))
                goto unlock;

        nhc = fib_info_nhc(res.fi, 0);
        if (!dev || nhc->nhc_dev == dev)
                type = res.type;

unlock:
        rcu_read_unlock();
        return type;
}

/* Classify @addr using FIB table @tb_id, with no device restriction
 * (a NULL device is passed to __inet_dev_addr_type()).
 */
unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id)
{
        return __inet_dev_addr_type(net, NULL, addr, tb_id);
}
EXPORT_SYMBOL(inet_addr_type_table);

/* Classify @addr using the RT_TABLE_LOCAL table, with no device
 * restriction (a NULL device is passed to __inet_dev_addr_type()).
 */
unsigned int inet_addr_type(struct net *net, __be32 addr)
{
        return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
}
EXPORT_SYMBOL(inet_addr_type);

/* Classify @addr as seen from @dev only. The table is the one reported by
 * l3mdev_fib_table() for @dev, or RT_TABLE_LOCAL when that returns 0.
 */
unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
                                __be32 addr)
{
        u32 tb_id = l3mdev_fib_table(dev);

        if (!tb_id)
                tb_id = RT_TABLE_LOCAL;

        return __inet_dev_addr_type(net, dev, addr, tb_id);
}
EXPORT_SYMBOL(inet_dev_addr_type);

/* Same as inet_addr_type() (no device restriction on the result), except
 * that the table is taken from @dev when l3mdev_fib_table() reports one;
 * RT_TABLE_LOCAL is used when it returns 0.
 */
unsigned int inet_addr_type_dev_table(struct net *net,
                                      const struct net_device *dev,
                                      __be32 addr)
{
        u32 tb_id = l3mdev_fib_table(dev);

        if (!tb_id)
                tb_id = RT_TABLE_LOCAL;

        return __inet_dev_addr_type(net, NULL, addr, tb_id);
}
EXPORT_SYMBOL(inet_addr_type_dev_table);

/*
 * Compute the "specific destination" address for a received @skb.
 *
 * - If the attached route is flagged local but neither broadcast nor
 *   multicast, the packet's own destination address is returned.
 * - Otherwise, unless the packet source is a zeronet address, a FIB lookup
 *   is done towards the packet's source and the preferred source of the
 *   result is returned.
 * - If that lookup fails, or the source is zeronet, the result comes from
 *   inet_select_addr() on the receiving device (link scope for a zeronet
 *   source, universe scope otherwise).
 */
__be32 fib_compute_spec_dst(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct rtable *rt;
        struct net *net;
        int scope;

        rt = skb_rtable(skb);
        /* Exactly RTCF_LOCAL set out of the three flags: plain local unicast. */
        if ((rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST | RTCF_LOCAL)) ==
            RTCF_LOCAL)
                return ip_hdr(skb)->daddr;

        in_dev = __in_dev_get_rcu(dev);

        net = dev_net(dev);

        scope = RT_SCOPE_UNIVERSE;
        if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
                /* The skb mark is used in the lookup only if in_dev exists
                 * and has SRC_VMARK set.
                 */
                bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
                /* Flow key describing a reply: destination is the packet's source. */
                struct flowi4 fl4 = {
                        .flowi4_iif = LOOPBACK_IFINDEX,
                        .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
                        .daddr = ip_hdr(skb)->saddr,
                        .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))),
                        .flowi4_scope = scope,
                        .flowi4_mark = vmark ? skb->mark : 0,
                };
                if (!fib_lookup(net, &fl4, &res, 0))
                        return fib_result_prefsrc(net, &res);
        } else {
                scope = RT_SCOPE_LINK;
        }

        /* Fallback: let the device pick an address of the chosen scope. */
        return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
}

/*
 * Report whether @fi has a nexthop that uses @dev.
 *
 * With multipath support, a nexthop object (fi->nh) is delegated to
 * nexthop_uses_dev(); otherwise every path of @fi is tested with
 * nhc_l3mdev_matches_dev(). Without multipath support only the device of
 * the first nexthop is compared against @dev.
 */
bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        int idx;

        if (unlikely(fi->nh))
                return nexthop_uses_dev(fi->nh, dev);

        for (idx = 0; idx < fib_info_num_path(fi); idx++) {
                if (nhc_l3mdev_matches_dev(fib_info_nhc(fi, idx), dev))
                        return true;
        }

        return false;
#else
        return fib_info_nhc(fi, 0)->nhc_dev == dev;
#endif
}
EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);

/* Given (packet source, input interface) and optional (dst, oif, tos):
 * - (main) check, that source is valid i.e. not broadcast or our local
 *   address.
 * - figure out what "logical" interface this packet arrived
 *   and calculate "specific destination" address.
 * - check, that packet arrived from expected physical interface.
 * called with rcu_read_lock()
 *
 * @rpf is the reverse-path filter setting computed by the caller; a value
 * of 1 rejects as soon as the reverse route does not use @dev, any other
 * non-zero value rejects only when no usable reverse route is found.
 *
 * Return: 0 or 1 on success (1 when the nexthop scope of the reverse route
 * is >= RT_SCOPE_HOST), or a negated SKB_DROP_REASON_* value on failure.
 * *@itag is updated via fib_combine_itag() after a successful first lookup
 * and is cleared on the last_resort path.
 */
static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                                 dscp_t dscp, int oif, struct net_device *dev,
                                 int rpf, struct in_device *idev, u32 *itag)
{
        struct net *net = dev_net(dev);
        enum skb_drop_reason reason;
        struct flow_keys flkeys;
        int ret, no_addr;
        struct fib_result res;
        struct flowi4 fl4;
        bool dev_match;

        /* Build the reverse flow: route towards the packet's source. */
        fl4.flowi4_oif = 0;
        fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
        fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
        fl4.daddr = src;
        fl4.saddr = dst;
        fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_tun_key.tun_id = 0;
        fl4.flowi4_flags = 0;
        fl4.flowi4_uid = sock_net_uid(net, NULL);
        fl4.flowi4_multipath_hash = 0;

        /* Remember whether the input device has no IPv4 address at all. */
        no_addr = idev->ifa_list == NULL;

        fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
        if (!fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys)) {
                fl4.flowi4_proto = 0;
                fl4.fl4_sport = 0;
                fl4.fl4_dport = 0;
        } else {
                /* Ports were dissected from the packet; mirror them for
                 * the reverse direction.
                 */
                swap(fl4.fl4_sport, fl4.fl4_dport);
        }

        if (fib_lookup(net, &fl4, &res, 0))
                goto last_resort;
        /* Only unicast routes, or local routes when ACCEPT_LOCAL is set on
         * the device, are acceptable for the source.
         */
        if (res.type != RTN_UNICAST) {
                if (res.type != RTN_LOCAL) {
                        reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
                        goto e_inval;
                } else if (!IN_DEV_ACCEPT_LOCAL(idev)) {
                        reason = SKB_DROP_REASON_IP_LOCAL_SOURCE;
                        goto e_inval;
                }
        }
        fib_combine_itag(itag, &res);

        dev_match = fib_info_nh_uses_dev(res.fi, dev);
        /* This is not common, loopback packets retain skb_dst so normally they
         * would not even hit this slow path.
         */
        dev_match = dev_match || (res.type == RTN_LOCAL &&
                                  dev == net->loopback_dev);
        if (dev_match) {
                ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
                return ret;
        }
        if (no_addr)
                goto last_resort;
        if (rpf == 1)
                goto e_rpf;
        /* Retry the lookup restricted to the input device. */
        fl4.flowi4_oif = dev->ifindex;

        ret = 0;
        if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
                if (res.type == RTN_UNICAST)
                        ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
        }
        return ret;

last_resort:
        /* No usable reverse route: accept only when filtering is off. */
        if (rpf)
                goto e_rpf;
        *itag = 0;
        return 0;

e_inval:
        return -reason;
e_rpf:
        return -SKB_DROP_REASON_IP_RPFILTER;
}

/*
 * Validate the source address of a received packet.
 *
 * rp_filter is ignored for packets protected by IPsec. When reverse-path
 * filtering is off, nobody uses tclassid, and the packet is not a candidate
 * for a redirect on its own input device, a cheap check is done instead of
 * the full reverse lookup.
 *
 * Returns a non-negative value on success or a negated SKB_DROP_REASON_*
 * value on failure.
 */
int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
                        dscp_t dscp, int oif, struct net_device *dev,
                        struct in_device *idev, u32 *itag)
{
        int rpf = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
        struct net *net = dev_net(dev);

        if (rpf || fib_num_tclassid_users(net) ||
            (dev->ifindex == oif && IN_DEV_TX_REDIRECTS(idev)))
                goto full_check;

        if (!IN_DEV_ACCEPT_LOCAL(idev)) {
                /* with custom local routes in place, checking local addresses
                 * only will be too optimistic, with custom rules, checking
                 * local addresses only can be too strict, e.g. due to vrf
                 */
                if (net->ipv4.fib_has_custom_local_routes ||
                    fib4_has_custom_rules(net))
                        goto full_check;

                /* Within the same container, it is regarded as a martian source,
                 * and the same host but different containers are not.
                 */
                if (inet_lookup_ifaddr_rcu(net, src))
                        return -SKB_DROP_REASON_IP_LOCAL_SOURCE;
        }

        *itag = 0;
        return 0;

full_check:
        return __fib_validate_source(skb, src, dst, dscp, oif, dev, rpf, idev,
                                     itag);
}

static inline __be32 sk_extract_addr(struct sockaddr *addr)
{
        return ((struct sockaddr_in *) addr)->sin_addr.s_addr;
}

/*
 * Write one netlink attribute of type @type carrying the 32-bit @value into
 * the buffer @mx at byte offset @len.
 *
 * Returns the offset at which the next attribute should be written.
 */
static int put_rtax(struct nlattr *mx, int len, int type, u32 value)
{
        struct nlattr *attr = (struct nlattr *)((char *)mx + len);
        u32 *payload;

        attr->nla_type = type;
        attr->nla_len = nla_attr_size(sizeof(u32));

        payload = nla_data(attr);
        *payload = value;

        return len + nla_total_size(sizeof(u32));
}

/*
 * Translate a struct rtentry (the argument of the SIOCADDRT/SIOCDELRT
 * ioctls) into a struct fib_config.
 *
 * @net: network namespace the request applies to
 * @cmd: SIOCADDRT or SIOCDELRT
 * @rt:  route description to convert
 * @cfg: output; fully zeroed before being filled in
 *
 * Returns 0 on success or a negative errno. When any of RTF_MTU, RTF_WINDOW
 * or RTF_IRTT is set on an add request, cfg->fc_mx points to a kcalloc()'ed
 * attribute buffer that the caller has to free.
 */
static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
                                 struct fib_config *cfg)
{
        __be32 addr;
        int plen;

        memset(cfg, 0, sizeof(*cfg));
        cfg->fc_nlinfo.nl_net = net;

        if (rt->rt_dst.sa_family != AF_INET)
                return -EAFNOSUPPORT;

        /*
         * Check mask for validity:
         * a) it must be contiguous.
         * b) destination must have all host bits clear.
         * c) if application forgot to set correct family (AF_INET),
         *    reject request unless it is absolutely clear i.e.
         *    both family and mask are zero.
         */
        plen = 32;
        addr = sk_extract_addr(&rt->rt_dst);
        if (!(rt->rt_flags & RTF_HOST)) {
                __be32 mask = sk_extract_addr(&rt->rt_genmask);

                if (rt->rt_genmask.sa_family != AF_INET) {
                        if (mask || rt->rt_genmask.sa_family)
                                return -EAFNOSUPPORT;
                }

                if (bad_mask(mask, addr))
                        return -EINVAL;

                plen = inet_mask_len(mask);
        }

        cfg->fc_dst_len = plen;
        cfg->fc_dst = addr;

        if (cmd != SIOCDELRT) {
                cfg->fc_nlflags = NLM_F_CREATE;
                cfg->fc_protocol = RTPROT_BOOT;
        }

        /* A non-zero rtentry metric m maps to priority m - 1. */
        if (rt->rt_metric)
                cfg->fc_priority = rt->rt_metric - 1;

        if (rt->rt_flags & RTF_REJECT) {
                cfg->fc_scope = RT_SCOPE_HOST;
                cfg->fc_type = RTN_UNREACHABLE;
                return 0;
        }

        /* RT_SCOPE_NOWHERE acts as "not decided yet"; it is resolved below. */
        cfg->fc_scope = RT_SCOPE_NOWHERE;
        cfg->fc_type = RTN_UNICAST;

        if (rt->rt_dev) {
                char *colon;
                struct net_device *dev;
                char devname[IFNAMSIZ];

                /* Copy at most IFNAMSIZ-1 bytes and terminate explicitly. */
                if (copy_from_user(devname, rt->rt_dev, IFNAMSIZ-1))
                        return -EFAULT;

                devname[IFNAMSIZ-1] = 0;
                /* "dev:label" form: cut at the colon to look up the device. */
                colon = strchr(devname, ':');
                if (colon)
                        *colon = 0;
                dev = __dev_get_by_name(net, devname);
                if (!dev)
                        return -ENODEV;
                cfg->fc_oif = dev->ifindex;
                cfg->fc_table = l3mdev_fib_table(dev);
                if (colon) {
                        const struct in_ifaddr *ifa;
                        struct in_device *in_dev;

                        in_dev = __in_dev_get_rtnl_net(dev);
                        if (!in_dev)
                                return -ENODEV;

                        /* Restore the full name so it can match ifa_label. */
                        *colon = ':';

                        in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) {
                                if (strcmp(ifa->ifa_label, devname) == 0)
                                        break;
                        }

                        if (!ifa)
                                return -ENODEV;
                        cfg->fc_prefsrc = ifa->ifa_local;
                }
        }

        addr = sk_extract_addr(&rt->rt_gateway);
        if (rt->rt_gateway.sa_family == AF_INET && addr) {
                unsigned int addr_type;

                cfg->fc_gw4 = addr;
                cfg->fc_gw_family = AF_INET;
                addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
                /* A unicast gateway with RTF_GATEWAY set gives universe scope. */
                if (rt->rt_flags & RTF_GATEWAY &&
                    addr_type == RTN_UNICAST)
                        cfg->fc_scope = RT_SCOPE_UNIVERSE;
        }

        if (!cfg->fc_table)
                cfg->fc_table = RT_TABLE_MAIN;

        /* The remaining checks and the metrics apply to route additions only. */
        if (cmd == SIOCDELRT)
                return 0;

        if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
                return -EINVAL;

        if (cfg->fc_scope == RT_SCOPE_NOWHERE)
                cfg->fc_scope = RT_SCOPE_LINK;

        if (rt->rt_flags & (RTF_MTU | RTF_WINDOW | RTF_IRTT)) {
                struct nlattr *mx;
                int len = 0;

                /* Room for up to three 32-bit attributes, one per flag. */
                mx = kcalloc(3, nla_total_size(4), GFP_KERNEL);
                if (!mx)
                        return -ENOMEM;

                /* NOTE(review): the 40 presumably stands for IPv4 + TCP
                 * header bytes when deriving ADVMSS from the MTU - confirm.
                 */
                if (rt->rt_flags & RTF_MTU)
                        len = put_rtax(mx, len, RTAX_ADVMSS, rt->rt_mtu - 40);

                if (rt->rt_flags & RTF_WINDOW)
                        len = put_rtax(mx, len, RTAX_WINDOW, rt->rt_window);

                /* NOTE(review): the << 3 presumably converts to the scaled
                 * unit RTAX_RTT expects - confirm.
                 */
                if (rt->rt_flags & RTF_IRTT)
                        len = put_rtax(mx, len, RTAX_RTT, rt->rt_irtt << 3);

                cfg->fc_mx = mx;
                cfg->fc_mx_len = len;
        }

        return 0;
}

/*
 * Handle IP routing ioctl calls.
 * These are used to manipulate the routing tables
 *
 * Only SIOCADDRT and SIOCDELRT are supported; any other @cmd yields -EINVAL.
 * The caller needs CAP_NET_ADMIN in the namespace's user namespace, else
 * -EPERM is returned. The table operation runs under rtnl_net_lock().
 */
int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
{
        struct fib_config cfg;
        struct fib_table *tb;
        int err;

        if (cmd != SIOCADDRT && cmd != SIOCDELRT)
                return -EINVAL;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        rtnl_net_lock(net);

        err = rtentry_to_fib_config(net, cmd, rt, &cfg);
        if (err)
                goto unlock;

        if (cmd == SIOCDELRT) {
                /* Delete a route: the table must already exist. */
                tb = fib_get_table(net, cfg.fc_table);
                err = tb ? fib_table_delete(net, tb, &cfg, NULL) : -ESRCH;
        } else {
                /* Add a route: create the table on demand. */
                tb = fib_new_table(net, cfg.fc_table);
                err = tb ? fib_table_insert(net, tb, &cfg, NULL) : -ENOBUFS;
        }

        /* allocated by rtentry_to_fib_config() */
        kfree(cfg.fc_mx);

unlock:
        rtnl_net_unlock(net);
        return err;
}

/*
 * Netlink attribute policy for IPv4 route messages, indexed by RTA_*
 * attribute type. Attribute types above RTA_DPORT are subject to strict
 * validation (see .strict_start_type on the RTA_UNSPEC entry).
 */
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
        [RTA_UNSPEC]                = { .strict_start_type = RTA_DPORT + 1 },
        [RTA_DST]                = { .type = NLA_U32 },
        [RTA_SRC]                = { .type = NLA_U32 },
        [RTA_IIF]                = { .type = NLA_U32 },
        [RTA_OIF]                = { .type = NLA_U32 },
        [RTA_GATEWAY]                = { .type = NLA_U32 },
        [RTA_PRIORITY]                = { .type = NLA_U32 },
        [RTA_PREFSRC]                = { .type = NLA_U32 },
        [RTA_METRICS]                = { .type = NLA_NESTED },
        [RTA_MULTIPATH]                = { .len = sizeof(struct rtnexthop) },
        [RTA_FLOW]                = { .type = NLA_U32 },
        [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [RTA_ENCAP]                = { .type = NLA_NESTED },
        [RTA_UID]                = { .type = NLA_U32 },
        [RTA_MARK]                = { .type = NLA_U32 },
        [RTA_TABLE]                = { .type = NLA_U32 },
        [RTA_IP_PROTO]                = { .type = NLA_U8 },
        [RTA_SPORT]                = { .type = NLA_U16 },
        [RTA_DPORT]                = { .type = NLA_U16 },
        [RTA_NH_ID]                = { .type = NLA_U32 },
};

int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
                    struct netlink_ext_ack *extack)
{
        struct rtvia *via;
        int alen;

        if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
                NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
                return -EINVAL;
        }

        via = nla_data(nla);
        alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);

        switch (via->rtvia_family) {
        case AF_INET:
                if (alen != sizeof(__be32)) {
                        NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
                        return -EINVAL;
                }
                cfg->fc_gw_family = AF_INET;
                cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
                break;
        case AF_INET6:
#if IS_ENABLED(CONFIG_IPV6)
                if (alen != sizeof(struct in6_addr)) {
                        NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
                        return -EINVAL;
                }
                cfg->fc_gw_family = AF_INET6;
                cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
#else
                NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
                return -EINVAL;
#endif
                break;
        default:
                NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
                return -EINVAL;
        }

        return 0;
}

/* Parse a route netlink message into @cfg.
 *
 * The message is validated against rtm_ipv4_policy first. @cfg is then
 * zeroed and filled from the rtmsg header and from every RTA_* attribute
 * handled below; other attribute types are skipped. Used by
 * inet_rtm_newroute() and inet_rtm_delroute().
 *
 * fc_mx, fc_mp, fc_encap and fc_nlinfo.nlh are set to point into @nlh;
 * nothing is copied.
 *
 * Returns 0 on success or a negative errno. For the checks made in this
 * function a description of the failure is recorded in @extack.
 */
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
                             struct nlmsghdr *nlh, struct fib_config *cfg,
                             struct netlink_ext_ack *extack)
{
        bool seen_gateway = false;
        bool seen_via = false;
        struct nlattr *nla;
        struct rtmsg *rtm;
        int rem;
        int err;

        err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
                                        rtm_ipv4_policy, extack);
        if (err < 0)
                return err;

        memset(cfg, 0, sizeof(*cfg));
        rtm = nlmsg_data(nlh);

        if (!inet_validate_dscp(rtm->rtm_tos)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid dsfield (tos): ECN bits must be 0");
                return -EINVAL;
        }

        /* Values taken straight from the rtmsg header. */
        cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
        cfg->fc_dst_len = rtm->rtm_dst_len;
        cfg->fc_table = rtm->rtm_table;
        cfg->fc_protocol = rtm->rtm_protocol;
        cfg->fc_scope = rtm->rtm_scope;
        cfg->fc_type = rtm->rtm_type;
        cfg->fc_flags = rtm->rtm_flags;
        cfg->fc_nlflags = nlh->nlmsg_flags;

        /* Information about the requester. */
        cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->fc_nlinfo.nlh = nlh;
        cfg->fc_nlinfo.nl_net = net;

        if (cfg->fc_type > RTN_MAX) {
                NL_SET_ERR_MSG(extack, "Invalid route type");
                return -EINVAL;
        }

        nlmsg_for_each_attr(nla, nlh, sizeof(struct rtmsg), rem) {
                switch (nla_type(nla)) {
                case RTA_DST:
                        cfg->fc_dst = nla_get_be32(nla);
                        break;
                case RTA_OIF:
                        cfg->fc_oif = nla_get_u32(nla);
                        break;
                case RTA_GATEWAY:
                        seen_gateway = true;
                        cfg->fc_gw4 = nla_get_be32(nla);
                        /* A zero gateway leaves the gateway family unset. */
                        if (cfg->fc_gw4)
                                cfg->fc_gw_family = AF_INET;
                        break;
                case RTA_VIA:
                        seen_via = true;
                        err = fib_gw_from_via(cfg, nla, extack);
                        if (err)
                                return err;
                        break;
                case RTA_PRIORITY:
                        cfg->fc_priority = nla_get_u32(nla);
                        break;
                case RTA_PREFSRC:
                        cfg->fc_prefsrc = nla_get_be32(nla);
                        break;
                case RTA_METRICS:
                        cfg->fc_mx = nla_data(nla);
                        cfg->fc_mx_len = nla_len(nla);
                        break;
                case RTA_MULTIPATH:
                        err = lwtunnel_valid_encap_type_attr(nla_data(nla),
                                                             nla_len(nla),
                                                             extack, false);
                        if (err < 0)
                                return err;
                        cfg->fc_mp = nla_data(nla);
                        cfg->fc_mp_len = nla_len(nla);
                        break;
                case RTA_FLOW:
                        cfg->fc_flow = nla_get_u32(nla);
                        break;
                case RTA_TABLE:
                        /* Overrides the table id from the header. */
                        cfg->fc_table = nla_get_u32(nla);
                        break;
                case RTA_ENCAP:
                        cfg->fc_encap = nla;
                        break;
                case RTA_ENCAP_TYPE:
                        cfg->fc_encap_type = nla_get_u16(nla);
                        err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
                                                        extack, false);
                        if (err < 0)
                                return err;
                        break;
                case RTA_NH_ID:
                        cfg->fc_nh_id = nla_get_u32(nla);
                        break;
                default:
                        break;
                }
        }

        if (cfg->fc_dst_len > 32) {
                NL_SET_ERR_MSG(extack, "Invalid prefix length");
                return -EINVAL;
        }

        /* Bits of the destination below the prefix length must be zero. */
        if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) {
                NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
                return -EINVAL;
        }

        if (cfg->fc_nh_id &&
            (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp)) {
                NL_SET_ERR_MSG(extack,
                               "Nexthop specification and nexthop id are mutually exclusive");
                return -EINVAL;
        }

        if (seen_gateway && seen_via) {
                NL_SET_ERR_MSG(extack,
                               "Nexthop configuration can not contain both GATEWAY and VIA");
                return -EINVAL;
        }

        /* A table id of zero selects the main table. */
        if (!cfg->fc_table)
                cfg->fc_table = RT_TABLE_MAIN;

        return 0;
}

/* Handle a route deletion request.
 *
 * The message is parsed with rtm_to_fib_config(); the remaining work is
 * done between rtnl_net_lock() and rtnl_net_unlock().
 *
 * Returns the result of fib_table_delete(), the parse error, -EINVAL when
 * a nexthop id was given but is not found, or -ESRCH when the requested
 * table is not found.
 */
static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct fib_table *table;
        struct fib_config cfg;
        int rc;

        rc = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
        if (rc < 0)
                return rc;

        rtnl_net_lock(net);

        if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
                NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                rc = -EINVAL;
                goto out_unlock;
        }

        table = fib_get_table(net, cfg.fc_table);
        if (!table) {
                NL_SET_ERR_MSG(extack, "FIB table does not exist");
                rc = -ESRCH;
                goto out_unlock;
        }

        rc = fib_table_delete(net, table, &cfg, extack);

out_unlock:
        rtnl_net_unlock(net);
        return rc;
}

/* Handle a route insertion request.
 *
 * The message is parsed with rtm_to_fib_config(); the table is obtained
 * with fib_new_table() and the route inserted between rtnl_net_lock() and
 * rtnl_net_unlock(). A successful insert of an RTN_LOCAL route sets
 * net->ipv4.fib_has_custom_local_routes.
 *
 * Returns the result of fib_table_insert(), the parse error, or -ENOBUFS
 * when fib_new_table() returns NULL.
 */
static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct fib_table *table;
        struct fib_config cfg;
        int rc;

        rc = rtm_to_fib_config(net, skb, nlh, &cfg, extack);
        if (rc < 0)
                return rc;

        rtnl_net_lock(net);

        table = fib_new_table(net, cfg.fc_table);
        if (table) {
                rc = fib_table_insert(net, table, &cfg, extack);
                if (rc == 0 && cfg.fc_type == RTN_LOCAL)
                        net->ipv4.fib_has_custom_local_routes = true;
        } else {
                rc = -ENOBUFS;
        }

        rtnl_net_unlock(net);
        return rc;
}

/* Validate a FIB dump request and derive the dump filter from it.
 *
 * @net:    namespace used to resolve an RTA_OIF interface index
 * @nlh:    the dump request
 * @filter: updated from the rtmsg header and the RTA_TABLE / RTA_OIF
 *          attributes; filter->rtnl_held selects how the device is looked up
 * @cb:     supplies the extack; answer_flags gets NLM_F_DUMP_FILTERED when
 *          any filter criterion ends up set
 *
 * Returns 0 on success, -EINVAL for a short header, non-zero
 * dst_len/src_len/tos/scope, flags other than RTM_F_CLONED and
 * RTM_F_PREFIX, or any attribute other than RTA_TABLE and RTA_OIF,
 * -ENODEV when the RTA_OIF device is not found, or the error returned by
 * attribute parsing.
 */
int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
                          struct fib_dump_filter *filter,
                          struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *attrs[RTA_MAX + 1];
        struct rtmsg *rtm;
        int type;
        int rc;

        if (filter->rtnl_held)
                ASSERT_RTNL();

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
                return -EINVAL;
        }

        rtm = nlmsg_data(nlh);
        if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
            rtm->rtm_scope) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
                return -EINVAL;
        }

        if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
                NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
                return -EINVAL;
        }

        /* RTM_F_CLONED turns off the route dump; otherwise the exception
         * dump is turned off.
         */
        if (rtm->rtm_flags & RTM_F_CLONED)
                filter->dump_routes = false;
        else
                filter->dump_exceptions = false;

        filter->flags    = rtm->rtm_flags;
        filter->protocol = rtm->rtm_protocol;
        filter->rt_type  = rtm->rtm_type;
        filter->table_id = rtm->rtm_table;

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), attrs, RTA_MAX,
                                           rtm_ipv4_policy, extack);
        if (rc < 0)
                return rc;

        /* Walk the parsed attributes in ascending type order. */
        for (type = 0; type <= RTA_MAX; type++) {
                struct nlattr *nla = attrs[type];
                int ifindex;

                if (!nla)
                        continue;

                if (type == RTA_TABLE) {
                        /* Overrides the table id from the header. */
                        filter->table_id = nla_get_u32(nla);
                } else if (type == RTA_OIF) {
                        ifindex = nla_get_u32(nla);
                        if (filter->rtnl_held)
                                filter->dev = __dev_get_by_index(net, ifindex);
                        else
                                filter->dev = dev_get_by_index_rcu(net, ifindex);
                        if (!filter->dev)
                                return -ENODEV;
                } else {
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
                        return -EINVAL;
                }
        }

        if (filter->flags || filter->protocol || filter->rt_type ||
            filter->table_id || filter->dev) {
                filter->filter_set = 1;
                cb->answer_flags = NLM_F_DUMP_FILTERED;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);

static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct fib_dump_filter filter = {
                .dump_routes = true,
                .dump_exceptions = true,
                .rtnl_held = false,
        };
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int h, s_h;
        unsigned int e = 0, s_e;
        struct fib_table *tb;
        struct hlist_head *head;
        int dumped = 0, err = 0;

        rcu_read_lock();
        if (cb->strict_check) {
                err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
                if (err < 0)
                        goto unlock;
        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
                struct rtmsg *rtm = nlmsg_data(nlh);

                filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
        }

        /* ipv4 does not use prefix flag */
        if (filter.flags & RTM_F_PREFIX)
                goto unlock;

        if (filter.table_id) {
                tb = fib_get_table(net, filter.table_id);
                if (!tb) {
                        if (rtnl_msg_family(cb->nlh) != PF_INET)
                                goto unlock;

                        NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
                        err = -ENOENT;
                        goto unlock;
                }
                err = fib_table_dump(tb, skb, cb, &filter);
                goto unlock;
        }

        s_h = cb->args[0];
        s_e = cb->args[1];

        err = 0;
        for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
                e = 0;
                head = &net->ipv4.fib_table_hash[h];
                hlist_for_each_entry_rcu(tb, head, tb_hlist) {
                        if (e < s_e)
                                goto next;
                        if (dumped)
                                memset(&cb->args[2], 0, sizeof(cb->args) -
                                                 2 * sizeof(cb->args[0]));
                        err = fib_table_dump(tb, skb, cb, &filter);
                        if (err < 0)
                                goto out;
                        dumped = 1;
next:
                        e++;
                }
        }
out:

        cb->args[1] = e;
        cb->args[0] = h;

unlock:
        rcu_read_unlock();
        return err;
}

/* Build a kernel-originated (RTPROT_KERNEL) route request for the device
 * of @ifa and apply it directly to the FIB.
 *
 * Ideally this would be a netlink message, but netlink may not be
 * configured, so the request is fed straight to the fib engine. That is
 * legal because all events occur only when netlink is already locked.
 *
 * @cmd:         RTM_NEWROUTE inserts the route, anything else deletes it
 * @type:        route type; also selects the default table and the scope
 * @dst:         destination prefix
 * @dst_len:     prefix length
 * @ifa:         supplies the device (oif), preferred source and namespace
 * @rt_priority: route priority
 *
 * The table is the one returned by l3mdev_fib_table() for the device or,
 * when that is zero, RT_TABLE_MAIN for RTN_UNICAST and RT_TABLE_LOCAL for
 * every other type. Nothing is done when the table cannot be obtained,
 * and the result of the insert/delete is not reported.
 */
static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
                      struct in_ifaddr *ifa, u32 rt_priority)
{
        struct net_device *dev = ifa->ifa_dev->dev;
        struct net *net = dev_net(dev);
        u32 table_id = l3mdev_fib_table(dev);
        struct fib_config cfg = {
                .fc_protocol = RTPROT_KERNEL,
                .fc_type = type,
                .fc_dst = dst,
                .fc_dst_len = dst_len,
                .fc_priority = rt_priority,
                .fc_prefsrc = ifa->ifa_local,
                .fc_oif = dev->ifindex,
                .fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
                .fc_nlinfo = {
                        .nl_net = net,
                },
        };
        struct fib_table *table;

        if (!table_id) {
                if (type == RTN_UNICAST)
                        table_id = RT_TABLE_MAIN;
                else
                        table_id = RT_TABLE_LOCAL;
        }

        table = fib_new_table(net, table_id);
        if (!table)
                return;

        cfg.fc_table = table->tb_id;
        /* Local routes get host scope, everything else link scope. */
        cfg.fc_scope = (type == RTN_LOCAL) ? RT_SCOPE_HOST : RT_SCOPE_LINK;

        if (cmd == RTM_NEWROUTE)
                fib_table_insert(net, table, &cfg, NULL);
        else
                fib_table_delete(net, table, &cfg, NULL);
}

/* Install the FIB routes that go with interface address @ifa.
 *
 * Always adds the /32 local route. When the device is up it also adds:
 *  - a broadcast route for an explicitly assigned broadcast address other
 *    than 255.255.255.255;
 *  - for a primary address with a non-zero network whose prefix is
 *    shorter than 32 or differs from the local address: the prefix route
 *    (unless IFA_F_NOPREFIXROUTE is set) and, for prefixes shorter than
 *    31, the network broadcast route.
 *
 * For a secondary address the routes are attributed to the primary
 * address found with inet_ifa_byprefix(); nothing is added when no
 * primary is found.
 */
void fib_add_ifaddr(struct in_ifaddr *ifa)
{
        struct in_device *in_dev = ifa->ifa_dev;
        struct net_device *dev = in_dev->dev;
        __be32 mask = ifa->ifa_mask;
        __be32 local = ifa->ifa_local;
        __be32 network = ifa->ifa_address & mask;
        struct in_ifaddr *prim = ifa;

        if (ifa->ifa_flags & IFA_F_SECONDARY) {
                prim = inet_ifa_byprefix(in_dev, network, mask);
                if (!prim) {
                        pr_warn("%s: bug: prim == NULL\n", __func__);
                        return;
                }
        }

        fib_magic(RTM_NEWROUTE, RTN_LOCAL, local, 32, prim, 0);

        if (!(dev->flags & IFF_UP))
                return;

        /* Add broadcast address, if it is explicitly assigned. */
        if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
                          prim, 0);
                arp_invalidate(dev, ifa->ifa_broadcast, false);
        }

        /* The remaining routes only exist for a primary address on a
         * non-zero network that is not a host route to itself.
         */
        if (ipv4_is_zeronet(network) || (ifa->ifa_flags & IFA_F_SECONDARY))
                return;
        if (network == local && ifa->ifa_prefixlen >= 32)
                return;

        if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
                fib_magic(RTM_NEWROUTE,
                          dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
                          network, ifa->ifa_prefixlen, prim,
                          ifa->ifa_rt_priority);

        /* Add the network broadcast address, when it makes sense */
        if (ifa->ifa_prefixlen < 31) {
                fib_magic(RTM_NEWROUTE, RTN_BROADCAST, network | ~mask,
                          32, prim, 0);
                arp_invalidate(dev, network | ~mask, false);
        }
}

/* Change the metric of the prefix route belonging to @ifa.
 *
 * A route with @new_metric is added first and the route with the current
 * ifa->ifa_rt_priority is deleted afterwards. Nothing happens when the
 * device is down, the address is secondary or has IFA_F_NOPREFIXROUTE,
 * the prefix is in the zero network, or the address is a /32 equal to its
 * own prefix.
 */
void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
{
        struct net_device *dev = ifa->ifa_dev->dev;
        __be32 prefix = ifa->ifa_address & ifa->ifa_mask;
        int rt_type;

        if (!(dev->flags & IFF_UP))
                return;
        if (ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE))
                return;
        if (ipv4_is_zeronet(prefix))
                return;
        if (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32)
                return;

        rt_type = dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST;

        /* add the new */
        fib_magic(RTM_NEWROUTE, rt_type, prefix, ifa->ifa_prefixlen, ifa,
                  new_metric);

        /* delete the old */
        fib_magic(RTM_DELROUTE, rt_type, prefix, ifa->ifa_prefixlen, ifa,
                  ifa->ifa_rt_priority);
}

/* Remove the FIB routes that belong to address @ifa (primary or
 * secondary).
 *
 * @ifa:   the address being deleted
 * @iprim: optional. On secondary address promotion, the addresses from
 *         the subnet of @iprim are considered deleted even if they are
 *         still in the device list. In this case the secondary @ifa can
 *         be in the device list.
 *
 * Unless the in_device is marked dead, the other addresses of the device
 * are scanned first; a local or broadcast route is only deleted when no
 * remaining address still accounts for it (see the `ok` bits below).
 */
void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
{
        struct in_device *in_dev = ifa->ifa_dev;
        struct net_device *dev = in_dev->dev;
        struct in_ifaddr *ifa1;
        struct in_ifaddr *prim = ifa, *prim1 = NULL;
        /* All-ones and all-zeroes host part of @ifa's subnet. */
        __be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
        __be32 any = ifa->ifa_address & ifa->ifa_mask;
/* Bits of `ok`: set when another address still needs the route, so it
 * must be kept. LOCAL_OK covers ifa_local, BRD_OK covers ifa_broadcast,
 * BRD1_OK covers `brd` and BRD0_OK covers `any`.
 */
#define LOCAL_OK        1
#define BRD_OK                2
#define BRD0_OK                4
#define BRD1_OK                8
        unsigned int ok = 0;
        int subnet = 0;                /* Primary network */
        int gone = 1;                /* Address is missing */
        int same_prefsrc = 0;        /* Another primary with same IP */

        if (ifa->ifa_flags & IFA_F_SECONDARY) {
                /* Routes of a secondary are attributed to its primary. */
                prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
                if (!prim) {
                        /* if the device has been deleted, we don't perform
                         * address promotion
                         */
                        if (!in_dev->dead)
                                pr_warn("%s: bug: prim == NULL\n", __func__);
                        return;
                }
                if (iprim && iprim != prim) {
                        pr_warn("%s: bug: iprim != prim\n", __func__);
                        return;
                }
        } else if (!ipv4_is_zeronet(any) &&
                   (any != ifa->ifa_local || ifa->ifa_prefixlen < 32)) {
                /* Primary address: drop its prefix route right away. */
                if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
                        fib_magic(RTM_DELROUTE,
                                  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
                                  any, ifa->ifa_prefixlen, prim, 0);
                subnet = 1;
        }

        /* With a dead in_device the scan is skipped, so `ok` stays 0 and
         * every route considered below is deleted.
         */
        if (in_dev->dead)
                goto no_promotions;

        /* Deletion is more complicated than add.
         * We should take care of not to delete too much :-)
         *
         * Scan address list to be sure that addresses are really gone.
         */
        rcu_read_lock();
        in_dev_for_each_ifa_rcu(ifa1, in_dev) {
                if (ifa1 == ifa) {
                        /* promotion, keep the IP */
                        gone = 0;
                        continue;
                }
                /* Ignore IFAs from our subnet */
                if (iprim && ifa1->ifa_mask == iprim->ifa_mask &&
                    inet_ifa_match(ifa1->ifa_address, iprim))
                        continue;

                /* Ignore ifa1 if it uses different primary IP (prefsrc) */
                if (ifa1->ifa_flags & IFA_F_SECONDARY) {
                        /* Another address from our subnet? */
                        if (ifa1->ifa_mask == prim->ifa_mask &&
                            inet_ifa_match(ifa1->ifa_address, prim))
                                prim1 = prim;
                        else {
                                /* We reached the secondaries, so
                                 * same_prefsrc should be determined.
                                 */
                                if (!same_prefsrc)
                                        continue;
                                /* Search new prim1 if ifa1 is not
                                 * using the current prim1
                                 */
                                if (!prim1 ||
                                    ifa1->ifa_mask != prim1->ifa_mask ||
                                    !inet_ifa_match(ifa1->ifa_address, prim1))
                                        prim1 = inet_ifa_byprefix(in_dev,
                                                        ifa1->ifa_address,
                                                        ifa1->ifa_mask);
                                if (!prim1)
                                        continue;
                                if (prim1->ifa_local != prim->ifa_local)
                                        continue;
                        }
                } else {
                        /* A primary only matters when it has the same
                         * local address as our primary.
                         */
                        if (prim->ifa_local != ifa1->ifa_local)
                                continue;
                        prim1 = ifa1;
                        if (prim != prim1)
                                same_prefsrc = 1;
                }
                /* ifa1 is relevant: record which routes it still needs. */
                if (ifa->ifa_local == ifa1->ifa_local)
                        ok |= LOCAL_OK;
                if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
                        ok |= BRD_OK;
                if (brd == ifa1->ifa_broadcast)
                        ok |= BRD1_OK;
                if (any == ifa1->ifa_broadcast)
                        ok |= BRD0_OK;
                /* primary has network specific broadcasts */
                if (prim1 == ifa1 && ifa1->ifa_prefixlen < 31) {
                        __be32 brd1 = ifa1->ifa_address | ~ifa1->ifa_mask;
                        __be32 any1 = ifa1->ifa_address & ifa1->ifa_mask;

                        if (!ipv4_is_zeronet(any1)) {
                                if (ifa->ifa_broadcast == brd1 ||
                                    ifa->ifa_broadcast == any1)
                                        ok |= BRD_OK;
                                if (brd == brd1 || brd == any1)
                                        ok |= BRD1_OK;
                                if (any == brd1 || any == any1)
                                        ok |= BRD0_OK;
                        }
                }
        }
        rcu_read_unlock();

no_promotions:
        /* Delete every route whose bit was not set during the scan. */
        if (!(ok & BRD_OK))
                fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
                          prim, 0);
        if (subnet && ifa->ifa_prefixlen < 31) {
                if (!(ok & BRD1_OK))
                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
                                  prim, 0);
                if (!(ok & BRD0_OK))
                        fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
                                  prim, 0);
        }
        if (!(ok & LOCAL_OK)) {
                unsigned int addr_type;

                fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);

                /* Check, that this local address finally disappeared. */
                addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
                                                     ifa->ifa_local);
                if (gone && addr_type != RTN_LOCAL) {
                        /* And the last, but not the least thing.
                         * We must flush stray FIB entries.
                         *
                         * First of all, we scan fib_info list searching
                         * for stray nexthop entries, then ignite fib_flush.
                         */
                        if (fib_sync_down_addr(dev, ifa->ifa_local))
                                fib_flush(dev_net(dev));
                }
        }
#undef LOCAL_OK
#undef BRD_OK
#undef BRD0_OK
#undef BRD1_OK
}

/* Perform the FIB lookup described by @frn and store the outcome in it.
 *
 * The flow key is built from frn->fl_mark, fl_addr, fl_tos (masked with
 * INET_DSCP_MASK) and fl_scope, and looked up in table frn->tb_id_in.
 * frn->err is -ENOENT when the table is not found, otherwise the result
 * of fib_table_lookup(); on success prefixlen, nh_sel, type and scope are
 * filled in as well. The lookup runs under rcu_read_lock() with bottom
 * halves disabled.
 */
static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
{
        struct flowi4 fl4 = {
                .flowi4_mark = frn->fl_mark,
                .daddr = frn->fl_addr,
                .flowi4_tos = frn->fl_tos & INET_DSCP_MASK,
                .flowi4_scope = frn->fl_scope,
        };
        struct fib_result res;
        struct fib_table *tb;

        rcu_read_lock();

        frn->err = -ENOENT;
        tb = fib_get_table(net, frn->tb_id_in);
        if (!tb)
                goto out;

        local_bh_disable();

        frn->tb_id = tb->tb_id;
        frn->err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
        if (frn->err == 0) {
                frn->prefixlen = res.prefixlen;
                frn->nh_sel = res.nh_sel;
                frn->type = res.type;
                frn->scope = res.scope;
        }

        local_bh_enable();
out:
        rcu_read_unlock();
}

/* Input handler of the FIB lookup netlink socket.
 *
 * Messages too short to hold a struct fib_result_nl, or whose declared
 * length exceeds the skb, are dropped silently. Otherwise the skb is
 * cloned, the lookup result is written into the clone by nl_fib_lookup(),
 * and the clone is unicast back to the sending portid.
 */
static void nl_fib_input(struct sk_buff *skb)
{
        struct net *net = sock_net(skb->sk);
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        struct fib_result_nl *frn;
        struct sk_buff *reply;
        u32 sender;

        if (skb->len < nlmsg_total_size(sizeof(*frn)))
                return;
        if (skb->len < nlh->nlmsg_len)
                return;
        if (nlmsg_len(nlh) < sizeof(*frn))
                return;

        reply = netlink_skb_clone(skb, GFP_KERNEL);
        if (!reply)
                return;

        /* From here on work on the clone's header, not the original's. */
        nlh = nlmsg_hdr(reply);
        frn = nlmsg_data(nlh);
        nl_fib_lookup(net, frn);

        sender = NETLINK_CB(reply).portid;      /* netlink portid */
        NETLINK_CB(reply).portid = 0;           /* from kernel */
        NETLINK_CB(reply).dst_group = 0;        /* unicast */
        nlmsg_unicast(net->ipv4.fibnl, reply, sender);
}

/* Create the NETLINK_FIB_LOOKUP kernel socket for @net, with
 * nl_fib_input() as its input handler, and store it in net->ipv4.fibnl.
 *
 * Returns 0 on success or -EAFNOSUPPORT when the socket cannot be created.
 */
static int __net_init nl_fib_lookup_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .input        = nl_fib_input,
        };
        struct sock *sk;

        sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, &cfg);
        if (sk) {
                net->ipv4.fibnl = sk;
                return 0;
        }

        return -EAFNOSUPPORT;
}

/* Release the FIB lookup netlink socket of @net and clear the pointer. */
static void nl_fib_lookup_exit(struct net *net)
{
        struct sock *sk = net->ipv4.fibnl;

        netlink_kernel_release(sk);
        net->ipv4.fibnl = NULL;
}

/* Take the routes using @dev down via fib_sync_down_dev().
 *
 * When that call returns non-zero the FIB of the device's namespace is
 * flushed, otherwise only the route cache is flushed. arp_ifdown() is
 * called for the device afterwards in either case.
 */
static void fib_disable_ip(struct net_device *dev, unsigned long event,
                           bool force)
{
        int changed = fib_sync_down_dev(dev, event, force);

        if (changed)
                fib_flush(dev_net(dev));
        else
                rt_cache_flush(dev_net(dev));

        arp_ifdown(dev);
}

/* Notifier callback for address events; @ptr is the struct in_ifaddr.
 *
 * NETDEV_UP:   add the routes for the address, bring multipath nexthops
 *              back up (CONFIG_IP_ROUTE_MULTIPATH), bump dev_addr_genid
 *              and flush the route cache.
 * NETDEV_DOWN: remove the routes for the address and bump dev_addr_genid;
 *              if the device has no address left IP is disabled on it,
 *              otherwise the route cache is flushed.
 *
 * Other events are ignored. Always returns NOTIFY_DONE.
 */
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct in_ifaddr *ifa = ptr;
        struct net_device *dev = ifa->ifa_dev->dev;
        struct net *net = dev_net(dev);

        if (event == NETDEV_UP) {
                fib_add_ifaddr(ifa);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                fib_sync_up(dev, RTNH_F_DEAD);
#endif
                atomic_inc(&net->ipv4.dev_addr_genid);
                rt_cache_flush(net);
        } else if (event == NETDEV_DOWN) {
                fib_del_ifaddr(ifa, NULL);
                atomic_inc(&net->ipv4.dev_addr_genid);
                if (ifa->ifa_dev->ifa_list) {
                        rt_cache_flush(net);
                } else {
                        /* Last address was deleted from this interface.
                         * Disable IP.
                         */
                        fib_disable_ip(dev, event, true);
                }
        }

        return NOTIFY_DONE;
}

/* Notifier callback for net device events; @ptr is the notifier info.
 *
 * NETDEV_UNREGISTER is handled before anything else: IP is disabled on
 * the device and rt_flush_dev() is called. All other events are ignored
 * when the device has no in_device.
 *
 * NETDEV_UP:          re-add the routes of every address on the device.
 * NETDEV_DOWN:        disable IP on the device (not forced).
 * NETDEV_CHANGE:      sync nexthops up or down depending on whether
 *                     IFF_RUNNING or IFF_LOWER_UP is set.
 * NETDEV_CHANGEMTU:   propagate the MTU carried in the notifier info.
 * NETDEV_CHANGEUPPER: disable IP when the upper device is an L3 master.
 *
 * Always returns NOTIFY_DONE.
 */
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct in_device *in_dev;

        if (event == NETDEV_UNREGISTER) {
                fib_disable_ip(dev, event, true);
                rt_flush_dev(dev);
                return NOTIFY_DONE;
        }

        in_dev = __in_dev_get_rtnl(dev);
        if (!in_dev)
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_UP: {
                struct in_ifaddr *ifa;

                in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                        fib_add_ifaddr(ifa);
                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                fib_sync_up(dev, RTNH_F_DEAD);
#endif
                atomic_inc(&net->ipv4.dev_addr_genid);
                rt_cache_flush(net);
                break;
        }
        case NETDEV_DOWN:
                fib_disable_ip(dev, event, false);
                break;
        case NETDEV_CHANGE: {
                unsigned int dev_flags = dev_get_flags(dev);

                if (dev_flags & (IFF_RUNNING | IFF_LOWER_UP))
                        fib_sync_up(dev, RTNH_F_LINKDOWN);
                else
                        fib_sync_down_dev(dev, event, false);
                rt_cache_flush(net);
                break;
        }
        case NETDEV_CHANGEMTU: {
                struct netdev_notifier_info_ext *ext_info = ptr;

                fib_sync_mtu(dev, ext_info->ext.mtu);
                rt_cache_flush(net);
                break;
        }
        case NETDEV_CHANGEUPPER: {
                struct netdev_notifier_changeupper_info *upper = ptr;

                /* flush all routes if dev is linked to or unlinked from
                 * an L3 master device (e.g., VRF)
                 */
                if (upper->upper_dev && netif_is_l3_master(upper->upper_dev))
                        fib_disable_ip(dev, NETDEV_DOWN, true);
                break;
        }
        }

        return NOTIFY_DONE;
}

/* Notifier block that dispatches to fib_inetaddr_event().
 * NOTE(review): registration is not visible in this part of the file --
 * presumably registered on the inet address notifier chain; confirm.
 */
static struct notifier_block fib_inetaddr_notifier = {
        .notifier_call = fib_inetaddr_event,
};

/* Notifier block that dispatches to fib_netdev_event().
 * NOTE(review): registration is not visible in this part of the file --
 * presumably registered on the net device notifier chain; confirm.
 */
static struct notifier_block fib_netdev_notifier = {
        .notifier_call = fib_netdev_event,
};

/* Set up the per-namespace FIB state: the fib4 notifier, the default
 * multipath hash fields (CONFIG_IP_ROUTE_MULTIPATH), the zeroed table
 * hash array and the fib4 rules.
 *
 * Returns 0 on success. On failure everything set up so far is undone and
 * the error is returned (-ENOMEM when the hash array cannot be allocated).
 */
static int __net_init ip_fib_net_init(struct net *net)
{
        size_t hash_bytes;
        int rc;

        rc = fib4_notifier_init(net);
        if (rc)
                return rc;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        /* Default to 3-tuple */
        net->ipv4.sysctl_fib_multipath_hash_fields =
                FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
#endif

        /* Avoid false sharing : Use at least a full cache line */
        hash_bytes = max_t(size_t,
                           sizeof(struct hlist_head) * FIB_TABLE_HASHSZ,
                           L1_CACHE_BYTES);

        net->ipv4.fib_table_hash = kzalloc(hash_bytes, GFP_KERNEL);
        if (!net->ipv4.fib_table_hash) {
                rc = -ENOMEM;
                goto undo_notifier;
        }

        rc = fib4_rules_init(net);
        if (rc < 0)
                goto free_hash;

        return 0;

free_hash:
        kfree(net->ipv4.fib_table_hash);
undo_notifier:
        fib4_notifier_exit(net);
        return rc;
}

/*
 * Tear down the per-netns FIB state created by ip_fib_net_init(): flush and
 * free every table, release the rules (when multiple tables are configured),
 * free the hash array and shut down the fib4 notifier.
 *
 * The per-netns RTNL lock must be held (checked by ASSERT_RTNL_NET()).
 */
static void ip_fib_net_exit(struct net *net)
{
        int bucket = FIB_TABLE_HASHSZ;

        ASSERT_RTNL_NET(net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
        RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
        RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
#endif
        /*
         * Walk the buckets from the highest index down to 0 so that the
         * local table (ID 255) is destroyed before the main table (ID 254):
         * the local table may contain references to data contained in the
         * main table.
         */
        while (bucket-- > 0) {
                struct hlist_head *chain = &net->ipv4.fib_table_hash[bucket];
                struct hlist_node *next;
                struct fib_table *table;

                hlist_for_each_entry_safe(table, next, chain, tb_hlist) {
                        hlist_del(&table->tb_hlist);
                        fib_table_flush(net, table, true);
                        fib_free_table(table);
                }
        }

#ifdef CONFIG_IP_MULTIPLE_TABLES
        fib4_rules_exit(net);
#endif

        kfree(net->ipv4.fib_table_hash);
        fib4_notifier_exit(net);
}

/*
 * Per-netns ->init hook for the IPv4 FIB.
 *
 * Runs, in order: ip_fib_net_init(), fib4_semantics_init(),
 * nl_fib_lookup_init() and fib_proc_init(). If a step fails, the steps that
 * already succeeded are undone in reverse order and the error is returned.
 * On success the return value of fib_proc_init() is passed through.
 */
static int __net_init fib_net_init(struct net *net)
{
        int error;

#ifdef CONFIG_IP_ROUTE_CLASSID
        atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
#endif
        error = ip_fib_net_init(net);
        if (error < 0)
                return error;

        error = fib4_semantics_init(net);
        if (error)
                goto undo_fib;

        error = nl_fib_lookup_init(net);
        if (error < 0)
                goto undo_semantics;

        error = fib_proc_init(net);
        if (error < 0)
                goto undo_lookup;

        return error;

undo_lookup:
        nl_fib_lookup_exit(net);
undo_semantics:
        fib4_semantics_exit(net);
undo_fib:
        /* ip_fib_net_exit() asserts that the per-netns RTNL lock is held */
        rtnl_net_lock(net);
        ip_fib_net_exit(net);
        rtnl_net_unlock(net);
        return error;
}

/*
 * Per-netns ->exit hook: undoes fib_proc_init() and nl_fib_lookup_init(),
 * in the reverse of the order in which fib_net_init() performed them. The
 * remaining teardown is done in fib_net_exit_batch().
 */
static void __net_exit fib_net_exit(struct net *net)
{
        fib_proc_exit(net);
        nl_fib_lookup_exit(net);
}

/*
 * Per-netns ->exit_batch hook: finishes FIB teardown for every namespace on
 * @net_list.
 *
 * rtnl_lock() is taken once around the whole list; inside it each namespace
 * is additionally locked with __rtnl_net_lock() while ip_fib_net_exit() runs.
 * fib4_semantics_exit() is called for each namespace in a second pass, after
 * the RTNL lock has been released.
 */
static void __net_exit fib_net_exit_batch(struct list_head *net_list)
{
        struct net *net;

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                __rtnl_net_lock(net);
                ip_fib_net_exit(net);
                __rtnl_net_unlock(net);
        }
        rtnl_unlock();

        list_for_each_entry(net, net_list, exit_list)
                fib4_semantics_exit(net);
}

/*
 * Per-network-namespace operations for the IPv4 FIB, registered with
 * register_pernet_subsys() from ip_fib_init().
 */
static struct pernet_operations fib_net_ops = {
        .init = fib_net_init,
        .exit = fib_net_exit,
        .exit_batch = fib_net_exit_batch,
};

/*
 * rtnetlink handlers for PF_INET route messages, registered with
 * rtnl_register_many() from ip_fib_init(): RTM_NEWROUTE and RTM_DELROUTE
 * get ->doit handlers, RTM_GETROUTE gets a ->dumpit handler.
 */
static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = {
        {.protocol = PF_INET, .msgtype = RTM_NEWROUTE,
         .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET},
        {.protocol = PF_INET, .msgtype = RTM_DELROUTE,
         .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET},
        {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib,
         .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};

/*
 * Boot-time initialization of the IPv4 FIB frontend: initializes the FIB
 * trie, registers the per-netns operations, the netdevice and inetaddr
 * notifiers, and the rtnetlink route message handlers.
 *
 * The return values of the registration calls are not checked here.
 */
void __init ip_fib_init(void)
{
        fib_trie_init();

        register_pernet_subsys(&fib_net_ops);

        register_netdevice_notifier(&fib_netdev_notifier);
        register_inetaddr_notifier(&fib_inetaddr_notifier);

        rtnl_register_many(fib_rtnl_msg_handlers);
}









































































































    1 










































































































































































































































































































































































































































































































































































































  147 




  147 
  148 








  147 






















































































































































    1 
    1 







    1 






























































  146 






  142 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

/*
 * fsnotify inode mark locking/lifetime/and refcnting
 *
 * REFCNT:
 * The group->refcnt and mark->refcnt tell how many "things" in the kernel
 * currently are referencing the objects. Both kinds of objects typically will
 * live inside the kernel with a refcnt of 2, one for its creation and one for
 * the reference a group and a mark hold to each other.
 * If you are holding the appropriate locks, you can take a reference and the
 * object itself is guaranteed to survive until the reference is dropped.
 *
 * LOCKING:
 * There are 3 locks involved with fsnotify inode marks and they MUST be taken
 * in order as follows:
 *
 * group->mark_mutex
 * mark->lock
 * mark->connector->lock
 *
 * group->mark_mutex protects the marks_list anchored inside a given group and
 * each mark is hooked via the g_list.  It also protects the group's private
 * data (i.e. group limits).
 *
 * mark->lock protects the mark's attributes like its masks and flags.
 * Furthermore it protects the access to a reference of the group that the mark
 * is assigned to as well as the access to a reference of the inode/vfsmount
 * that is being watched by the mark.
 *
 * mark->connector->lock protects the list of marks anchored inside an
 * inode / vfsmount and each mark is hooked via the i_list.
 *
 * A list of notification marks relating to inode / mnt is contained in
 * fsnotify_mark_connector. That structure is alive as long as there are any
 * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets
 * detached from fsnotify_mark_connector when last reference to the mark is
 * dropped.  Thus having mark reference is enough to protect mark->connector
 * pointer and to make sure fsnotify_mark_connector cannot disappear. Also
 * because we remove mark from g_list before dropping mark reference associated
 * with that, any mark found through g_list is guaranteed to have
 * mark->connector set until we drop group->mark_mutex.
 *
 * LIFETIME:
 * Inode marks survive between when they are added to an inode and when their
 * refcnt==0. Marks are also protected by fsnotify_mark_srcu.
 *
 * The inode mark can be cleared for a number of different reasons including:
 * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
 * - The inode is being evicted from cache. (fsnotify_inode_delete)
 * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
 * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark)
 * - The fsnotify_group associated with the mark is going away and all such marks
 *   need to be cleaned up. (fsnotify_clear_marks_by_group)
 *
 * This has the very interesting property of being able to run concurrently with
 * any (or all) other directions.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>
#include <linux/ratelimit.h>

#include <linux/atomic.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/* Delay passed to queue_delayed_work() when scheduling reaper_work */
#define FSNOTIFY_REAPER_DELAY        (1)        /* 1 jiffy */

struct srcu_struct fsnotify_mark_srcu;
struct kmem_cache *fsnotify_mark_connector_cachep;

/* destroy_lock protects both destroy_list and connector_destroy_list */
static DEFINE_SPINLOCK(destroy_lock);
/* Marks whose last reference was dropped, linked through mark->g_list */
static LIST_HEAD(destroy_list);
/* Detached connectors awaiting freeing, chained through ->destroy_next */
static struct fsnotify_mark_connector *connector_destroy_list;

static void fsnotify_mark_destroy_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn);

static void fsnotify_connector_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn);

/*
 * Take an additional reference on @mark. Warns (once) if the refcount reads
 * as zero at the time of the call; the increment is attempted regardless.
 */
void fsnotify_get_mark(struct fsnotify_mark *mark)
{
        WARN_ON_ONCE(!refcount_read(&mark->refcnt));
        refcount_inc(&mark->refcnt);
}

/*
 * Return the address of the connector pointer embedded in the watched
 * object @obj, interpreting @obj according to @obj_type. Returns NULL for
 * any object type not handled here.
 */
static fsnotify_connp_t *fsnotify_object_connp(void *obj,
                                enum fsnotify_obj_type obj_type)
{
        if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
                struct inode *inode = obj;

                return &inode->i_fsnotify_marks;
        }
        if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT)
                return &real_mount(obj)->mnt_fsnotify_marks;
        if (obj_type == FSNOTIFY_OBJ_TYPE_SB)
                return fsnotify_sb_marks(obj);
        if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
                struct mnt_namespace *mntns = obj;

                return &mntns->n_fsnotify_marks;
        }
        return NULL;
}

/*
 * Return a pointer to the event mask stored in the object that @conn is
 * attached to, or NULL when conn->type is none of the handled object types.
 */
static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn)
{
        switch (conn->type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                return &fsnotify_conn_inode(conn)->i_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_SB:
                return &fsnotify_conn_sb(conn)->s_fsnotify_mask;
        case FSNOTIFY_OBJ_TYPE_MNTNS:
                return &fsnotify_conn_mntns(conn)->n_fsnotify_mask;
        default:
                return NULL;
        }
}

/*
 * Read the event mask of the object @conn is attached to. Warns and returns
 * 0 when the connector's type is not a valid object type.
 */
__u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
{
        __u32 *maskp;

        if (WARN_ON(!fsnotify_valid_obj_type(conn->type)))
                return 0;

        /* Single READ_ONCE() load; pairs with the WRITE_ONCE() updater */
        maskp = fsnotify_conn_mask_p(conn);
        return READ_ONCE(*maskp);
}

/* Increment the watched-objects counter returned for @sb. */
static void fsnotify_get_sb_watched_objects(struct super_block *sb)
{
        atomic_long_inc(fsnotify_sb_watched_objects(sb));
}

/*
 * Decrement the watched-objects counter of @sb and, when it drops to zero,
 * wake anyone waiting on that counter variable.
 */
static void fsnotify_put_sb_watched_objects(struct super_block *sb)
{
        atomic_long_t *watched_objects = fsnotify_sb_watched_objects(sb);

        /* the superblock can go away after this decrement */
        if (atomic_long_dec_and_test(watched_objects))
                wake_up_var(watched_objects);
}

/*
 * Pin @inode with ihold() and account for it in the watched-objects counter
 * of its superblock. Undone by fsnotify_put_inode_ref().
 */
static void fsnotify_get_inode_ref(struct inode *inode)
{
        ihold(inode);
        fsnotify_get_sb_watched_objects(inode->i_sb);
}

/*
 * Drop the inode reference and the superblock watched-objects count taken
 * by fsnotify_get_inode_ref().
 */
static void fsnotify_put_inode_ref(struct inode *inode)
{
        /* read ->i_sb before the inode can go away */
        struct super_block *sb = inode->i_sb;

        iput(inode);
        fsnotify_put_sb_watched_objects(sb);
}

/*
 * Grab or drop watched objects reference depending on whether the connector
 * is attached and has any marks attached.
 *
 * @sb:   superblock whose per-priority watched_objects counters are updated
 * @conn: connector whose ->prio and IS_WATCHED flag are brought in sync
 *        with the marks currently on its list
 */
static void fsnotify_update_sb_watchers(struct super_block *sb,
                                        struct fsnotify_mark_connector *conn)
{
        struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb);
        bool is_watched = conn->flags & FSNOTIFY_CONN_FLAG_IS_WATCHED;
        struct fsnotify_mark *first_mark = NULL;
        unsigned int highest_prio = 0;

        /* A connector without an object is treated as having no marks */
        if (conn->obj)
                first_mark = hlist_entry_safe(conn->list.first,
                                              struct fsnotify_mark, obj_list);
        /* The head of the list supplies the highest group priority */
        if (first_mark)
                highest_prio = first_mark->group->priority;
        /* Guard the watched_objects[] index against an out-of-range prio */
        if (WARN_ON(highest_prio >= __FSNOTIFY_PRIO_NUM))
                highest_prio = 0;

        /*
         * If the highest priority of group watching this object is prio,
         * then watched object has a reference on counters [0..prio].
         * Update priority >= 1 watched objects counters.
         */
        for (unsigned int p = conn->prio + 1; p <= highest_prio; p++)
                atomic_long_inc(&sbinfo->watched_objects[p]);
        for (unsigned int p = conn->prio; p > highest_prio; p--)
                atomic_long_dec(&sbinfo->watched_objects[p]);
        conn->prio = highest_prio;

        /* Update priority >= 0 (a.k.a total) watched objects counter */
        BUILD_BUG_ON(FSNOTIFY_PRIO_NORMAL != 0);
        if (first_mark && !is_watched) {
                conn->flags |= FSNOTIFY_CONN_FLAG_IS_WATCHED;
                fsnotify_get_sb_watched_objects(sb);
        } else if (!first_mark && is_watched) {
                conn->flags &= ~FSNOTIFY_CONN_FLAG_IS_WATCHED;
                fsnotify_put_sb_watched_objects(sb);
        }
}

/*
 * Grab or drop inode reference for the connector if needed.
 *
 * When the reference has to be taken, it is taken here and the HAS_IREF flag
 * is set. When it has to be dropped, only the HAS_IREF flag is cleared and
 * the inode is returned: fsnotify_drop_object() is responsible for doing
 * iput() outside of spinlocks. This happens when the last mark that wanted
 * iref is detached.
 *
 * Returns the inode whose reference the caller must drop, or NULL when there
 * is nothing to drop (non-inode connector, state already matches, or the
 * reference was just taken).
 */
static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
                                          bool want_iref)
{
        bool holds_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
        struct inode *to_unpin;

        if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
                return NULL;
        if (want_iref == holds_iref)
                return NULL;

        if (!want_iref) {
                /* Unpin inode after detach of last mark that wanted iref */
                to_unpin = fsnotify_conn_inode(conn);
                conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
                return to_unpin;
        }

        /* Pin inode if any mark wants inode refcount held */
        fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
        conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
        return NULL;
}

/*
 * Recompute the combined event mask of all attached marks on @conn and store
 * it in the mask field of the object the connector is attached to.
 *
 * The caller must hold conn->lock (asserted below).
 *
 * Returns the value of fsnotify_update_iref(): an inode whose reference the
 * caller has to drop, or NULL. Also returns NULL, without touching any mask,
 * when the connector's type is not a valid object type.
 */
static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        u32 new_mask = 0;
        bool want_iref = false;
        struct fsnotify_mark *mark;

        assert_spin_locked(&conn->lock);
        /* We can get detached connector here when inode is getting unlinked. */
        if (!fsnotify_valid_obj_type(conn->type))
                return NULL;
        hlist_for_each_entry(mark, &conn->list, obj_list) {
                /* Marks no longer attached do not contribute to the mask */
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                        continue;
                new_mask |= fsnotify_calc_mask(mark);
                /* One inode mark without NO_IREF is enough to want the pin */
                if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
                    !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
                        want_iref = true;
        }
        /*
         * We use WRITE_ONCE() to prevent silly compiler optimizations from
         * confusing readers not holding conn->lock with partial updates.
         */
        WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);

        return fsnotify_update_iref(conn, want_iref);
}

/*
 * Report whether @conn is an inode connector whose inode watches its
 * children. Non-inode connectors never do.
 */
static bool fsnotify_conn_watches_children(
                                        struct fsnotify_mark_connector *conn)
{
        return conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
               fsnotify_inode_watches_children(fsnotify_conn_inode(conn));
}

/*
 * For an inode connector, set the dentry flags of the inode's children via
 * fsnotify_set_children_dentry_flags(). Does nothing for other object types.
 */
static void fsnotify_conn_set_children_dentry_flags(
                                        struct fsnotify_mark_connector *conn)
{
        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
                fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
}

/*
 * Calculate mask of events for a list of marks. The caller must make sure
 * connector and connector->obj cannot disappear under us.  Callers achieve
 * this by holding a mark->lock or mark->group->mark_mutex for a mark on this
 * list.
 *
 * A NULL @conn is accepted and ignored. The return value of
 * __fsnotify_recalc_mask() is discarded here.
 */
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
        bool update_children;

        if (!conn)
                return;

        spin_lock(&conn->lock);
        /* True only if children were not watched before the recalculation... */
        update_children = !fsnotify_conn_watches_children(conn);
        __fsnotify_recalc_mask(conn);
        /* ...and are watched after it */
        update_children &= fsnotify_conn_watches_children(conn);
        spin_unlock(&conn->lock);
        /*
         * Set children's PARENT_WATCHED flags only if parent started watching.
         * When parent stops watching, we clear false positive PARENT_WATCHED
         * flags lazily in __fsnotify_parent().
         */
        if (update_children)
                fsnotify_conn_set_children_dentry_flags(conn);
}

/*
 * Work function for connector_reaper_work: take over the whole list of
 * connectors queued for freeing, wait for an SRCU grace period on
 * fsnotify_mark_srcu, then free every connector on that list.
 */
static void fsnotify_connector_destroy_workfn(struct work_struct *work)
{
        struct fsnotify_mark_connector *pending, *next;

        /* Detach the current list so new entries start a fresh one */
        spin_lock(&destroy_lock);
        pending = connector_destroy_list;
        connector_destroy_list = NULL;
        spin_unlock(&destroy_lock);

        synchronize_srcu(&fsnotify_mark_srcu);

        for (; pending; pending = next) {
                /* Fetch the link before the connector is freed */
                next = pending->destroy_next;
                kmem_cache_free(fsnotify_mark_connector_cachep, pending);
        }
}

/*
 * Disconnect @conn from the object it is attached to: zero the object's
 * event mask, clear the object's connector pointer, mark the connector as
 * DETACHED and update the superblock watcher accounting.
 *
 * @type receives the connector's type as it was on entry.
 *
 * Returns the inode whose reference the caller must drop (only for an inode
 * connector that has FSNOTIFY_CONN_FLAG_HAS_IREF set), otherwise NULL. An
 * already detached connector is left alone and NULL is returned.
 */
static void *fsnotify_detach_connector_from_object(
                                        struct fsnotify_mark_connector *conn,
                                        unsigned int *type)
{
        fsnotify_connp_t *connp = fsnotify_object_connp(conn->obj, conn->type);
        struct super_block *sb = fsnotify_connector_sb(conn);
        struct inode *inode = NULL;

        *type = conn->type;
        if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED)
                return NULL;

        if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
                inode = fsnotify_conn_inode(conn);
                inode->i_fsnotify_mask = 0;

                /* Unpin inode when detaching from connector */
                if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
                        inode = NULL;
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
                fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
                fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
        } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) {
                fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0;
        }

        rcu_assign_pointer(*connp, NULL);
        conn->obj = NULL;
        conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
        /* sb was looked up before conn->obj was cleared above */
        if (sb)
                fsnotify_update_sb_watchers(sb, conn);

        return inode;
}

/*
 * Final destruction of @mark: hand it to the owning group's ->free_mark()
 * callback and drop a reference on that group. Warns once and does nothing
 * if the mark has no group.
 */
static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group = mark->group;

        if (WARN_ON_ONCE(!group))
                return;
        group->ops->free_mark(mark);
        fsnotify_put_group(group);
}

/*
 * Drop object reference originally held by a connector.
 *
 * A NULL @objp means there is nothing to drop. Currently only inode
 * references are passed to be dropped; any other @type triggers a one-time
 * warning and is ignored.
 */
static void fsnotify_drop_object(unsigned int type, void *objp)
{
        if (!objp)
                return;

        if (!WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
                fsnotify_put_inode_ref(objp);
}

/*
 * Drop a reference on @mark.
 *
 * When this was the last reference, the mark is removed from its connector's
 * list, the connector is detached from its object if the list became empty
 * (and queued for freeing), any inode reference that is no longer needed is
 * dropped, and the mark itself is put on destroy_list for reaper_work.
 *
 * A mark that was never attached to an object (no connector) is destroyed
 * directly once its refcount reaches zero.
 */
void fsnotify_put_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_mark_connector *conn = READ_ONCE(mark->connector);
        void *objp = NULL;
        unsigned int type = FSNOTIFY_OBJ_TYPE_DETACHED;
        bool free_conn = false;

        /* Catch marks that were actually never attached to object */
        if (!conn) {
                if (refcount_dec_and_test(&mark->refcnt))
                        fsnotify_final_mark_destroy(mark);
                return;
        }

        /*
         * We have to be careful so that traversals of obj_list under lock can
         * safely grab mark reference.
         */
        if (!refcount_dec_and_lock(&mark->refcnt, &conn->lock))
                return;

        /* Last reference gone; conn->lock is held from here on */
        hlist_del_init_rcu(&mark->obj_list);
        if (hlist_empty(&conn->list)) {
                objp = fsnotify_detach_connector_from_object(conn, &type);
                free_conn = true;
        } else {
                struct super_block *sb = fsnotify_connector_sb(conn);

                /* Update watched objects after detaching mark */
                if (sb)
                        fsnotify_update_sb_watchers(sb, conn);
                objp = __fsnotify_recalc_mask(conn);
                type = conn->type;
        }
        WRITE_ONCE(mark->connector, NULL);
        spin_unlock(&conn->lock);

        /* Drops the inode reference, if any, outside of conn->lock */
        fsnotify_drop_object(type, objp);

        if (free_conn) {
                /* Push the connector onto the list drained by the reaper */
                spin_lock(&destroy_lock);
                conn->destroy_next = connector_destroy_list;
                connector_destroy_list = conn;
                spin_unlock(&destroy_lock);
                queue_work(system_unbound_wq, &connector_reaper_work);
        }
        /*
         * Note that we didn't update flags telling whether inode cares about
         * what's happening with children. We update these flags from
         * __fsnotify_parent() lazily when next event happens on one of our
         * children.
         */
        spin_lock(&destroy_lock);
        list_add(&mark->g_list, &destroy_list);
        spin_unlock(&destroy_lock);
        queue_delayed_work(system_unbound_wq, &reaper_work,
                           FSNOTIFY_REAPER_DELAY);
}
EXPORT_SYMBOL_GPL(fsnotify_put_mark);

/*
 * Get mark reference when we found the mark via lockless traversal of object
 * list. Mark can be already removed from the list by now and on its way to be
 * destroyed once SRCU period ends.
 *
 * Also pin the group so it doesn't disappear under us.
 */
static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark)
{
        bool attached;

        /* Nothing to pin for an empty slot; treated as success */
        if (!mark)
                return true;

        /* The mark may already be on its way to destruction */
        if (!refcount_inc_not_zero(&mark->refcnt))
                return false;

        spin_lock(&mark->lock);
        attached = mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED;
        /* mark is attached, group is still alive then */
        if (attached)
                atomic_inc(&mark->group->user_waits);
        spin_unlock(&mark->lock);

        /* Detached meanwhile: give back the reference taken above */
        if (!attached)
                fsnotify_put_mark(mark);

        return attached;
}

/*
 * Puts marks and wakes up group destruction if necessary.
 *
 * Pairs with fsnotify_get_mark_safe()
 */
static void fsnotify_put_mark_wake(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group;

        if (!mark)
                return;

        /* Remember the group before the mark reference is dropped */
        group = mark->group;
        fsnotify_put_mark(mark);

        /*
         * We abuse notification_waitq on group shutdown for waiting for
         * all marks pinned when waiting for userspace.
         */
        if (atomic_dec_and_test(&group->user_waits) && group->shutdown)
                wake_up(&group->notification_waitq);
}

/*
 * Pin every mark in @iter_info with fsnotify_get_mark_safe() and, if all of
 * them could be pinned, drop the SRCU read lock identified by
 * iter_info->srcu_idx.
 *
 * Returns true on success. Returns false if any mark could not be pinned;
 * the marks pinned so far are released again and srcu_read_unlock() is not
 * called on that path.
 */
bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
        __releases(&fsnotify_mark_srcu)
{
        int type;

        fsnotify_foreach_iter_type(type) {
                /* This can fail if mark is being removed */
                if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
                        /* NOTE(review): presumably only balances the static-checker annotation */
                        __release(&fsnotify_mark_srcu);
                        goto fail;
                }
        }

        /*
         * Now that both marks are pinned by refcount in the inode / vfsmount
         * lists, we can drop SRCU lock, and safely resume the list iteration
         * once userspace returns.
         */
        srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx);

        return true;

fail:
        /* Undo the pins taken for the types before the one that failed */
        for (type--; type >= 0; type--)
                fsnotify_put_mark_wake(iter_info->marks[type]);
        return false;
}

/*
 * Counterpart of fsnotify_prepare_user_wait(): re-take the SRCU read lock,
 * record its index in iter_info->srcu_idx, then release the pin on every
 * mark in @iter_info.
 */
void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
        __acquires(&fsnotify_mark_srcu)
{
        int type;

        iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
        fsnotify_foreach_iter_type(type)
                fsnotify_put_mark_wake(iter_info->marks[type]);
}

/*
 * Mark mark as detached, remove it from group list. Mark still stays in object
 * list until its last reference is dropped. Note that we rely on mark being
 * removed from group list before corresponding reference to it is dropped. In
 * particular we rely on mark->connector being valid while we hold
 * group->mark_mutex if we found the mark through g_list.
 *
 * Must be called with group->mark_mutex held. The caller must either hold
 * reference to the mark or be protected by fsnotify_mark_srcu.
 *
 * Calling this on a mark that is no longer attached is a no-op.
 */
void fsnotify_detach_mark(struct fsnotify_mark *mark)
{
        fsnotify_group_assert_locked(mark->group);
        /*
         * Outside of an SRCU read section, the caller's reference must exist
         * in addition to the one an attached mark carries.
         */
        WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
                     refcount_read(&mark->refcnt) < 1 +
                        !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED;
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        /* Drop mark reference acquired in fsnotify_add_mark_locked() */
        fsnotify_put_mark(mark);
}

/*
 * Free fsnotify mark. The mark is actually only marked as being freed.  The
 * freeing is actually happening only once last reference to the mark is
 * dropped from a workqueue which first waits for srcu period end.
 *
 * Caller must have a reference to the mark or be protected by
 * fsnotify_mark_srcu.
 *
 * Only the first call on a given mark has any effect; later calls see the
 * ALIVE flag already cleared and return early.
 */
void fsnotify_free_mark(struct fsnotify_mark *mark)
{
        struct fsnotify_group *group = mark->group;

        spin_lock(&mark->lock);
        /* something else already called this function on this mark */
        if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) {
                spin_unlock(&mark->lock);
                return;
        }
        mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
        spin_unlock(&mark->lock);

        /*
         * Some groups like to know that marks are being freed.  This is a
         * callback to the group function to let it know that this mark
         * is being freed.
         */
        if (group->ops->freeing_mark)
                group->ops->freeing_mark(mark, group);
}

/*
 * Detach @mark while holding the lock of @group, then call
 * fsnotify_free_mark() on it after that lock has been released.
 */
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                           struct fsnotify_group *group)
{
        fsnotify_group_lock(group);
        fsnotify_detach_mark(mark);
        fsnotify_group_unlock(group);
        fsnotify_free_mark(mark);
}
EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);

/*
 * Sorting function for lists of fsnotify marks.
 *
 * Fanotify supports different notification classes (reflected as priority of
 * notification group). Events shall be passed to notification groups in
 * decreasing priority order. To achieve this marks in notification lists for
 * inodes and vfsmounts are sorted so that priorities of corresponding groups
 * are descending.
 *
 * Furthermore correct handling of the ignore mask requires processing inode
 * and vfsmount marks of each group together. Using the group address as
 * further sort criterion provides a unique sorting order and thus we can
 * merge inode and vfsmount lists of marks in linear time and find groups
 * present in both lists.
 *
 * A return value of 1 signifies that b has priority over a.
 * A return value of 0 signifies that the two marks have to be handled together.
 * A return value of -1 signifies that a has priority over b.
 */
int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
{
        /* Same group (or both NULL): handle together */
        if (a == b)
                return 0;

        /* A missing group always loses against a present one */
        if (!a)
                return 1;
        if (!b)
                return -1;

        /* Higher group priority wins */
        if (a->priority != b->priority)
                return a->priority < b->priority ? 1 : -1;

        /* Equal priority: the group address breaks the tie */
        return a < b ? 1 : -1;
}

/*
 * Allocate a zeroed fsnotify_sb_info and try to install it in
 * sb->s_fsnotify_info.
 *
 * Returns -ENOMEM if the allocation fails and 0 otherwise, including the
 * case where an sbinfo was already installed and the new one is discarded.
 */
static int fsnotify_attach_info_to_sb(struct super_block *sb)
{
        struct fsnotify_sb_info *fresh;

        /* sb info is freed on fsnotify_sb_delete() */
        fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        /*
         * cmpxchg() provides the barrier so that callers of fsnotify_sb_info()
         * will observe an initialized structure. A non-NULL old value means
         * someone else created sbinfo for us, so ours is not needed.
         */
        if (cmpxchg(&sb->s_fsnotify_info, NULL, fresh) != NULL)
                kfree(fresh);

        return 0;
}

/*
 * Allocate and initialise a mark connector describing @obj / @obj_type and
 * publish it through @connp if no connector is installed there yet.
 *
 * Returns 0 on success (including when another caller attached a connector
 * first and ours was discarded), -ENOMEM when allocation fails.
 */
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
                                               void *obj, unsigned int obj_type)
{
        struct fsnotify_mark_connector *new_conn;

        new_conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
        if (!new_conn)
                return -ENOMEM;

        /* Describe the object the connector belongs to ... */
        new_conn->obj = obj;
        new_conn->type = obj_type;
        new_conn->prio = 0;
        new_conn->flags = 0;
        /* ... and start with an empty mark list and a fresh lock. */
        INIT_HLIST_HEAD(&new_conn->list);
        spin_lock_init(&new_conn->lock);

        /*
         * Publish with cmpxchg(): it provides the barrier so that readers
         * of *connp can only ever see an initialised connector.
         */
        if (cmpxchg(connp, NULL, new_conn) != NULL) {
                /* A concurrent caller already attached a connector. */
                kmem_cache_free(fsnotify_mark_connector_cachep, new_conn);
        }

        return 0;
}

/*
 * Look up the mark connector behind @connp, verify that it has not been
 * detached and hand it back with connector->lock held.
 *
 * Intended for users that obtain the connector pointer from an inode or a
 * mount. Users that hold a reference to a mark on the list may lock
 * connector->lock directly, since the list cannot go away under them.
 *
 * Returns the locked connector, or NULL (no lock held) when there is no
 * connector or it is of type FSNOTIFY_OBJ_TYPE_DETACHED.
 */
static struct fsnotify_mark_connector *fsnotify_grab_connector(
                                                fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        int idx;

        idx = srcu_read_lock(&fsnotify_mark_srcu);
        conn = srcu_dereference(*connp, &fsnotify_mark_srcu);
        if (conn) {
                spin_lock(&conn->lock);
                if (conn->type == FSNOTIFY_OBJ_TYPE_DETACHED) {
                        /* Detached connector: report it as absent. */
                        spin_unlock(&conn->lock);
                        conn = NULL;
                }
        }
        srcu_read_unlock(&fsnotify_mark_srcu, idx);

        return conn;
}

/*
 * Add mark into proper place in given list of marks. These marks may be used
 * for the fsnotify backend to determine which event types should be delivered
 * to which group and for which inodes. These marks are ordered according to
 * priority, highest number first, and then by the group's location in memory.
 *
 * @mark:      mark to insert; mark->group must already be set
 * @obj:       object the mark is attached to
 * @obj_type:  type of @obj, checked with fsnotify_valid_obj_type()
 * @add_flags: not used by this function
 *
 * Takes mark->lock and then the connector lock; both are released before
 * returning.
 *
 * Returns 0 on success, -EINVAL for an invalid @obj_type, -EEXIST when the
 * mark's group already has an attached mark on the object and the group does
 * not have FSNOTIFY_GROUP_DUPS set, or the error from attaching the sb info
 * or the connector.
 */
static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj,
                                  unsigned int obj_type, int add_flags)
{
        struct super_block *sb = fsnotify_object_sb(obj, obj_type);
        struct fsnotify_mark *lmark, *last = NULL;
        struct fsnotify_mark_connector *conn;
        fsnotify_connp_t *connp;
        int cmp;
        int err = 0;

        if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
                return -EINVAL;

        /*
         * Attach the sb info before attaching a connector to any object on sb.
         * The sb info will remain attached as long as sb lives.
         */
        if (sb && !fsnotify_sb_info(sb)) {
                err = fsnotify_attach_info_to_sb(sb);
                if (err)
                        return err;
        }

        connp = fsnotify_object_connp(obj, obj_type);
restart:
        spin_lock(&mark->lock);
        /* On success the connector is returned with conn->lock held. */
        conn = fsnotify_grab_connector(connp);
        if (!conn) {
                /*
                 * No usable connector: drop mark->lock before allocating
                 * one, then retry from the top.
                 */
                spin_unlock(&mark->lock);
                err = fsnotify_attach_connector_to_object(connp, obj, obj_type);
                if (err)
                        return err;
                goto restart;
        }

        /* is mark the first mark? */
        if (hlist_empty(&conn->list)) {
                hlist_add_head_rcu(&mark->obj_list, &conn->list);
                goto added;
        }

        /* should mark be in the middle of the current list? */
        hlist_for_each_entry(lmark, &conn->list, obj_list) {
                last = lmark;

                /*
                 * Refuse a second attached mark of the same group unless the
                 * group has FSNOTIFY_GROUP_DUPS set.
                 */
                if ((lmark->group == mark->group) &&
                    (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
                    !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
                        err = -EEXIST;
                        goto out_err;
                }

                /*
                 * Insert in front of the first existing mark whose group
                 * does not take precedence over the new mark's group.
                 */
                cmp = fsnotify_compare_groups(lmark->group, mark->group);
                if (cmp >= 0) {
                        hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list);
                        goto added;
                }
        }

        /* The list was not empty, so the loop must have set last. */
        BUG_ON(last == NULL);
        /* mark should be the last entry.  last is the current last entry */
        hlist_add_behind_rcu(&mark->obj_list, &last->obj_list);
added:
        if (sb)
                fsnotify_update_sb_watchers(sb, conn);
        /*
         * Since connector is attached to object using cmpxchg() we are
         * guaranteed that connector initialization is fully visible by anyone
         * seeing mark->connector set.
         */
        WRITE_ONCE(mark->connector, conn);
out_err:
        /* Release the locks in the reverse order of acquisition. */
        spin_unlock(&conn->lock);
        spin_unlock(&mark->lock);
        return err;
}

/*
 * Attach an initialized mark to its group and to the fs object @obj.
 * The fsnotify backend may use these marks to determine which event types
 * should be delivered to which group.
 *
 * The group must already be locked by the caller (asserted below).
 *
 * Returns 0 on success. On failure the error from fsnotify_add_mark_list()
 * is returned after the mark has been taken off the group's list again and
 * the reference taken for g_list has been dropped.
 */
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
                             void *obj, unsigned int obj_type,
                             int add_flags)
{
        struct fsnotify_group *group = mark->group;
        int ret;

        fsnotify_group_assert_locked(group);

        /*
         * Lock ordering, outermost first:
         *   group->mark_mutex
         *   mark->lock
         *   mark->connector->lock
         */
        spin_lock(&mark->lock);
        mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;

        list_add(&mark->g_list, &group->marks_list);
        fsnotify_get_mark(mark); /* reference for g_list */
        spin_unlock(&mark->lock);

        ret = fsnotify_add_mark_list(mark, obj, obj_type, add_flags);
        if (!ret) {
                fsnotify_recalc_mask(mark->connector);
                return 0;
        }

        /* Attaching to the object failed: undo the group-side setup. */
        spin_lock(&mark->lock);
        mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE |
                         FSNOTIFY_MARK_FLAG_ATTACHED);
        list_del_init(&mark->g_list);
        spin_unlock(&mark->lock);

        fsnotify_put_mark(mark);
        return ret;
}

/*
 * Locking wrapper around fsnotify_add_mark_locked(): takes the lock of the
 * mark's group for the duration of the call and passes its result through.
 */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags)
{
        struct fsnotify_group *grp = mark->group;
        int err;

        fsnotify_group_lock(grp);
        err = fsnotify_add_mark_locked(mark, obj, obj_type, add_flags);
        fsnotify_group_unlock(grp);

        return err;
}
EXPORT_SYMBOL_GPL(fsnotify_add_mark);

/*
 * Look up the attached mark that @group holds on the object identified by
 * @obj and @obj_type.
 *
 * Returns the mark with a reference taken for the caller, or NULL when the
 * object has no connector pointer, no usable connector, or no attached mark
 * belonging to @group.
 */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group)
{
        fsnotify_connp_t *connp = fsnotify_object_connp(obj, obj_type);
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *cur, *found = NULL;

        if (!connp)
                return NULL;

        /* On success the connector comes back with conn->lock held. */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return NULL;

        hlist_for_each_entry(cur, &conn->list, obj_list) {
                if (cur->group != group)
                        continue;
                if (!(cur->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
                        continue;

                /* Take the reference before the list lock is released. */
                fsnotify_get_mark(cur);
                found = cur;
                break;
        }
        spin_unlock(&conn->lock);

        return found;
}
EXPORT_SYMBOL_GPL(fsnotify_find_mark);

/*
 * Clear the marks of a group that sit on connectors of the given type.
 *
 * @group:    group whose marks are removed
 * @obj_type: connector type to match (compared for equality with
 *            mark->connector->type), or FSNOTIFY_OBJ_TYPE_ANY to clear
 *            every mark of the group
 */
void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                   unsigned int obj_type)
{
        struct fsnotify_mark *lmark, *mark;
        LIST_HEAD(to_free);
        struct list_head *head = &to_free;

        /* Skip selection step if we want to clear all marks. */
        if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) {
                head = &group->marks_list;
                goto clear;
        }
        /*
         * We have to be really careful here. Anytime we drop mark_mutex, e.g.
         * fsnotify_clear_marks_by_inode() can come and free marks. Even in our
         * to_free list so we have to use mark_mutex even when accessing that
         * list. And freeing mark requires us to drop mark_mutex. So we can
         * reliably free only the first mark in the list. That's why we first
         * move the marks to be freed onto the to_free list in one go and then
         * free the marks on the to_free list one by one.
         */
        fsnotify_group_lock(group);
        list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
                if (mark->connector->type == obj_type)
                        list_move(&mark->g_list, &to_free);
        }
        fsnotify_group_unlock(group);

clear:
        /*
         * Each iteration re-takes the group lock, picks the current first
         * entry of the chosen list and stops once the list is empty.
         */
        while (1) {
                fsnotify_group_lock(group);
                if (list_empty(head)) {
                        fsnotify_group_unlock(group);
                        break;
                }
                mark = list_first_entry(head, struct fsnotify_mark, g_list);
                /* Hold our own reference across detach and free. */
                fsnotify_get_mark(mark);
                fsnotify_detach_mark(mark);
                fsnotify_group_unlock(group);
                /* The mark is freed only after the group lock is dropped. */
                fsnotify_free_mark(mark);
                fsnotify_put_mark(mark);
        }
}

/*
 * Destroy all marks attached to an object via connector.
 *
 * @connp: location of the object's connector pointer
 *
 * Does nothing when fsnotify_grab_connector() finds no usable connector.
 * Otherwise every mark on the connector's list is destroyed, the connector
 * is detached from the object and the object is dropped.
 */
void fsnotify_destroy_marks(fsnotify_connp_t *connp)
{
        struct fsnotify_mark_connector *conn;
        struct fsnotify_mark *mark, *old_mark = NULL;
        void *objp;
        unsigned int type;

        /* On success the connector is returned with conn->lock held. */
        conn = fsnotify_grab_connector(connp);
        if (!conn)
                return;
        /*
         * We have to be careful since we can race with e.g.
         * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the
         * list can get modified. However we are holding mark reference and
         * thus our mark cannot be removed from obj_list so we can continue
         * iteration after regaining conn->lock.
         */
        hlist_for_each_entry(mark, &conn->list, obj_list) {
                fsnotify_get_mark(mark);
                spin_unlock(&conn->lock);
                /*
                 * The reference on the previous mark is dropped only now,
                 * after the current mark has been pinned.
                 */
                if (old_mark)
                        fsnotify_put_mark(old_mark);
                old_mark = mark;
                fsnotify_destroy_mark(mark, mark->group);
                spin_lock(&conn->lock);
        }
        /*
         * Detach list from object now so that we don't pin inode until all
         * mark references get dropped. It would lead to strange results such
         * as delaying inode deletion or blocking unmount.
         */
        objp = fsnotify_detach_connector_from_object(conn, &type);
        spin_unlock(&conn->lock);
        /* Drop the reference still held on the last mark visited. */
        if (old_mark)
                fsnotify_put_mark(old_mark);
        fsnotify_drop_object(type, objp);
}

/*
 * Prepare a caller-provided mark for use: zero it, set up its lock, start
 * its reference count at one and bind it to @group, taking a reference on
 * the group. The mark starts out without a connector.
 */
void fsnotify_init_mark(struct fsnotify_mark *mark,
                        struct fsnotify_group *group)
{
        /* Start from an all-zero mark. */
        memset(mark, 0, sizeof(*mark));

        refcount_set(&mark->refcnt, 1);
        spin_lock_init(&mark->lock);

        /* Record the group and take a reference on it. */
        mark->group = group;
        fsnotify_get_group(group);

        WRITE_ONCE(mark->connector, NULL);
}
EXPORT_SYMBOL_GPL(fsnotify_init_mark);

/*
 * Work function that reaps the marks queued on destroy_list. The queued
 * marks are moved to a private list under destroy_lock, an SRCU grace period
 * on fsnotify_mark_srcu is waited out, and only then is each mark finally
 * destroyed.
 */
static void fsnotify_mark_destroy_workfn(struct work_struct *work)
{
        struct list_head private_destroy_list;
        struct fsnotify_mark *victim, *tmp;

        /* Take over the whole pending list, leaving destroy_list empty. */
        spin_lock(&destroy_lock);
        list_replace_init(&destroy_list, &private_destroy_list);
        spin_unlock(&destroy_lock);

        /* Wait for the SRCU period to finish before freeing anything. */
        synchronize_srcu(&fsnotify_mark_srcu);

        list_for_each_entry_safe(victim, tmp, &private_destroy_list, g_list) {
                list_del_init(&victim->g_list);
                fsnotify_final_mark_destroy(victim);
        }
}

/*
 * Wait for all marks queued for destruction to be actually destroyed.
 * Implemented by flushing the delayed work item reaper_work.
 */
void fsnotify_wait_marks_destroyed(void)
{
        flush_delayed_work(&reaper_work);
}
EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed);










  166 


1
2
3
4
5
6
7
8
9
10
11
12
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012-2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <asm/kvm_hyp.h>

/* Write @cntvoff to the cntvoff_el2 system register. */
void __kvm_timer_set_cntvoff(u64 cntvoff)
{
        write_sysreg(cntvoff, cntvoff_el2);
}





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Type definitions for the multi-level security (MLS) policy.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *          Support for enhanced MLS infrastructure.
 *          Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 */

#ifndef _SS_MLS_TYPES_H_
#define _SS_MLS_TYPES_H_

#include "security.h"
#include "ebitmap.h"

/* One MLS level: a sensitivity value together with a set of categories. */
struct mls_level {
        u32 sens; /* sensitivity */
        struct ebitmap cat; /* category set */
};

/* An MLS range, stored as a pair of levels: the low and the high one. */
struct mls_range {
        struct mls_level level[2]; /* low == level[0], high == level[1] */
};

/*
 * Return 1 when the two levels are equal, i.e. they have the same
 * sensitivity and ebitmap_equal() reports their category sets as equal;
 * return 0 otherwise.
 */
static inline int mls_level_eq(const struct mls_level *l1,
                               const struct mls_level *l2)
{
        /* Different sensitivities: no need to look at the categories. */
        if (l1->sens != l2->sens)
                return 0;

        return ebitmap_equal(&l1->cat, &l2->cat) ? 1 : 0;
}

/*
 * Return 1 when level l1 dominates level l2: l1's sensitivity is at least
 * l2's and ebitmap_contains() reports that l1's category set contains l2's.
 * Return 0 otherwise.
 */
static inline int mls_level_dom(const struct mls_level *l1,
                                const struct mls_level *l2)
{
        /* A lower sensitivity can never dominate. */
        if (l1->sens < l2->sens)
                return 0;

        return ebitmap_contains(&l1->cat, &l2->cat, 0) ? 1 : 0;
}

/*
 * True when the two levels are incomparable: neither l1 dominates l2 nor
 * l2 dominates l1. Both arguments appear twice in the expansion.
 */
#define mls_level_incomp(l1, l2) \
        (!mls_level_dom((l1), (l2)) && !mls_level_dom((l2), (l1)))

/*
 * True when l1 lies between l2 and l3: l1 dominates l2 and l3 dominates l1.
 * l1 appears twice in the expansion.
 */
#define mls_level_between(l1, l2, l3) \
        (mls_level_dom((l1), (l2)) && mls_level_dom((l3), (l1)))

/*
 * True when range r1 contains range r2: r2's low level dominates r1's low
 * level and r1's high level dominates r2's high level. r1 and r2 are
 * struct mls_range lvalues (not pointers); each appears twice in the
 * expansion.
 */
#define mls_range_contains(r1, r2)                        \
        (mls_level_dom(&(r2).level[0], &(r1).level[0]) && \
         mls_level_dom(&(r1).level[1], &(r2).level[1]))

#endif /* _SS_MLS_TYPES_H_ */
















































































































































































































   76 





   76 

















   12 









































































































   68 
   12 


































































































  285 






  285 









































  255 











  255 









  321 

  321 

  321 








































   76 




   76 
   76 
   76 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */

#include <linux/anon_inodes.h>
#include <linux/slab.h>
#include <linux/sched/autogroup.h>
#include <linux/sched/mm.h>
#include <linux/sched/user.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/stat.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/personality.h>
#include <linux/mempolicy.h>
#include <linux/sem.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/iocontext.h>
#include <linux/key.h>
#include <linux/kmsan.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/memblock.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/cgroup.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/seccomp.h>
#include <linux/swap.h>
#include <linux/syscalls.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/compat.h>
#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/proc_fs.h>
#include <linux/profile.h>
#include <linux/rmap.h>
#include <linux/ksm.h>
#include <linux/acct.h>
#include <linux/userfaultfd_k.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/tty.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
#include <linux/kcov.h>
#include <linux/livepatch.h>
#include <linux/thread_info.h>
#include <linux/stackleak.h>
#include <linux/kasan.h>
#include <linux/scs.h>
#include <linux/io_uring.h>
#include <linux/bpf.h>
#include <linux/stackprotector.h>
#include <linux/user_events.h>
#include <linux/iommu.h>
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
#include <linux/tick.h>

#include <asm/pgalloc.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#include <trace/events/sched.h>

#define CREATE_TRACE_POINTS
#include <trace/events/task.h>

#include <kunit/visibility.h>

/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Counters protected by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;        /* Handle normal Linux uptimes. */
int nr_threads;                        /* The idle threads do not count. */

static int max_threads;                /* tunable limit on nr_threads */

/* Designated initializer: slot [x] holds the stringified name of x. */
#define NAMED_ARRAY_INDEX(x)        [x] = __stringify(x)

/*
 * Printable names of the rss counter types, indexed by counter id.
 * check_mm() uses this table when reporting a bad counter and asserts at
 * build time that it has exactly NR_MM_COUNTERS entries.
 */
static const char * const resident_page_types[] = {
        NAMED_ARRAY_INDEX(MM_FILEPAGES),
        NAMED_ARRAY_INDEX(MM_ANONPAGES),
        NAMED_ARRAY_INDEX(MM_SWAPENTS),
        NAMED_ARRAY_INDEX(MM_SHMEMPAGES),
};

/* Per-CPU process counter; nr_processes() sums it over all possible CPUs. */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;

__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */

#ifdef CONFIG_PROVE_RCU
/*
 * Report lockdep's view of whether tasklist_lock is currently held.
 * Returns whatever lockdep_is_held() returns for the lock.
 */
int lockdep_tasklist_lock_is_held(void)
{
        return lockdep_is_held(&tasklist_lock);
}
EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
#endif /* #ifdef CONFIG_PROVE_RCU */

int nr_processes(void)
{
        int cpu;
        int total = 0;

        for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);

        return total;
}

/*
 * Weak default: does nothing. free_task() calls this hook before the
 * task_struct is freed; being __weak, another definition may replace it.
 */
void __weak arch_release_task_struct(struct task_struct *tsk)
{
}

static struct kmem_cache *task_struct_cachep;

/*
 * Allocate a task_struct from task_struct_cachep with GFP_KERNEL, passing
 * @node as the node argument. Returns the result of
 * kmem_cache_alloc_node() unchanged.
 */
static inline struct task_struct *alloc_task_struct_node(int node)
{
        return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node);
}

/* Return @tsk to task_struct_cachep. */
static inline void free_task_struct(struct task_struct *tsk)
{
        kmem_cache_free(task_struct_cachep, tsk);
}

/*
 * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
 * kmemcache based allocator.
 */
# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)

#  ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

/*
 * Bookkeeping for a vmapped stack whose freeing is deferred through RCU.
 * thread_stack_delayed_free() treats tsk->stack itself as a struct vm_stack,
 * so this record is stored in the stack memory that is being released.
 */
struct vm_stack {
        struct rcu_head rcu;                /* handed to call_rcu() */
        struct vm_struct *stack_vm_area;    /* copy of tsk->stack_vm_area */
};

/*
 * Try to park @vm in an empty slot of this CPU's cached_stacks array.
 *
 * Each slot is claimed with a compare-and-exchange against NULL. Returns
 * true if @vm was stored in a slot, false if every slot was occupied.
 */
static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
{
        unsigned int slot = 0;

        while (slot < NR_CACHED_STACKS) {
                /* try_cmpxchg may overwrite 'expected', so reset it per slot */
                struct vm_struct *expected = NULL;

                if (this_cpu_try_cmpxchg(cached_stacks[slot], &expected, vm))
                        return true;

                slot++;
        }

        return false;
}

/*
 * RCU callback for a deferred vmapped-stack free.
 *
 * @rh is embedded in a struct vm_stack. The stack is first offered to the
 * per-CPU cache; only when no slot is free is it released with vfree().
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        struct vm_stack *pending = container_of(rh, struct vm_stack, rcu);

        if (!try_release_thread_stack_to_cache(pending->stack_vm_area))
                vfree(pending);
}

/*
 * Queue @tsk's vmapped stack for freeing from an RCU callback.
 *
 * The stack memory itself is used as the struct vm_stack that carries the
 * rcu_head and the vm_struct pointer to thread_stack_free_rcu().
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct vm_stack *vm_stack = tsk->stack;

        /* Record the vm_struct before handing the rcu_head to call_rcu(). */
        vm_stack->stack_vm_area = tsk->stack_vm_area;
        call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
}

static int free_vm_stack_cache(unsigned int cpu)
{
        struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *vm_stack = cached_vm_stacks[i];

                if (!vm_stack)
                        continue;

                vfree(vm_stack->addr);
                cached_vm_stacks[i] = NULL;
        }

        return 0;
}

/*
 * Charge every page of the stack described by @vm to the memory cgroup.
 *
 * @vm must consist of exactly THREAD_SIZE / PAGE_SIZE pages (BUG otherwise).
 * If charging a page fails, the pages charged so far are uncharged again
 * and the error from memcg_kmem_charge_page() is returned. Returns 0 when
 * all pages were charged.
 */
static int memcg_charge_kernel_stack(struct vm_struct *vm)
{
        int charged;
        int undo;
        int ret;

        BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);

        for (charged = 0; charged < THREAD_SIZE / PAGE_SIZE; charged++) {
                ret = memcg_kmem_charge_page(vm->pages[charged], GFP_KERNEL, 0);
                if (ret) {
                        /* Roll back the pages charged before the failure. */
                        for (undo = 0; undo < charged; undo++)
                                memcg_kmem_uncharge_page(vm->pages[undo], 0);
                        return ret;
                }
        }

        return 0;
}

/*
 * Give @tsk a THREAD_SIZE kernel stack (CONFIG_VMAP_STACK variant).
 *
 * A stack from this CPU's cached_stacks array is preferred; otherwise a new
 * one is obtained from __vmalloc_node() with @node as the node argument.
 * Either way the stack pages are charged to the memory cgroup before the
 * stack is attached to the task.
 *
 * On success tsk->stack and tsk->stack_vm_area are set and 0 is returned.
 * Returns -ENOMEM if the vmalloc allocation or the memcg charge fails; in
 * both paths a stack whose charge failed is released with vfree().
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct vm_struct *vm;
        void *stack;
        int i;

        for (i = 0; i < NR_CACHED_STACKS; i++) {
                struct vm_struct *s;

                /* Take the slot's contents and leave the slot empty. */
                s = this_cpu_xchg(cached_stacks[i], NULL);

                if (!s)
                        continue;

                /* Reset stack metadata. */
                kasan_unpoison_range(s->addr, THREAD_SIZE);

                stack = kasan_reset_tag(s->addr);

                /* Clear stale pointers from reused stack. */
                memset(stack, 0, THREAD_SIZE);

                /* A cached stack that cannot be charged is freed, not re-cached. */
                if (memcg_charge_kernel_stack(s)) {
                        vfree(s->addr);
                        return -ENOMEM;
                }

                tsk->stack_vm_area = s;
                tsk->stack = stack;
                return 0;
        }

        /*
         * Allocated stacks are cached and later reused by new threads,
         * so memcg accounting is performed manually on assigning/releasing
         * stacks to tasks. Drop __GFP_ACCOUNT.
         */
        stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
                                     THREADINFO_GFP & ~__GFP_ACCOUNT,
                                     node, __builtin_return_address(0));
        if (!stack)
                return -ENOMEM;

        vm = find_vm_area(stack);
        if (memcg_charge_kernel_stack(vm)) {
                vfree(stack);
                return -ENOMEM;
        }
        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
        tsk->stack_vm_area = vm;
        stack = kasan_reset_tag(stack);
        tsk->stack = stack;
        return 0;
}

static void free_thread_stack(struct task_struct *tsk)
{
        if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
                thread_stack_delayed_free(tsk);

        tsk->stack = NULL;
        tsk->stack_vm_area = NULL;
}

#  else /* !CONFIG_VMAP_STACK */

/*
 * RCU callback: free the order-THREAD_SIZE_ORDER page block containing @rh.
 * thread_stack_delayed_free() passes tsk->stack as the rcu_head, so @rh is
 * the address of the stack being released.
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
}

/*
 * Defer freeing of @tsk's page-allocated stack to an RCU callback. The
 * stack memory itself serves as the rcu_head passed to call_rcu().
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *rh = tsk->stack;

        call_rcu(rh, thread_stack_free_rcu);
}

/*
 * Allocate @tsk's stack as one order-THREAD_SIZE_ORDER page block obtained
 * from alloc_pages_node() for @node.
 *
 * Returns 0 with tsk->stack set, or -ENOMEM if no pages could be allocated
 * (tsk->stack is left untouched in that case).
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        struct page *stack_page;

        stack_page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
        if (!likely(stack_page))
                return -ENOMEM;

        tsk->stack = kasan_reset_tag(page_address(stack_page));
        return 0;
}

/*
 * Queue @tsk's stack for RCU-deferred freeing and clear tsk->stack so the
 * task no longer refers to it.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

#  endif /* CONFIG_VMAP_STACK */
# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */

static struct kmem_cache *thread_stack_cache;

/*
 * RCU callback: return the stack object to thread_stack_cache. @rh is the
 * stack address itself (see thread_stack_delayed_free()).
 */
static void thread_stack_free_rcu(struct rcu_head *rh)
{
        kmem_cache_free(thread_stack_cache, rh);
}

/*
 * Defer freeing of @tsk's slab-allocated stack to an RCU callback. The
 * stack memory itself serves as the rcu_head passed to call_rcu().
 */
static void thread_stack_delayed_free(struct task_struct *tsk)
{
        struct rcu_head *rh = tsk->stack;

        call_rcu(rh, thread_stack_free_rcu);
}

/*
 * Allocate @tsk's stack from thread_stack_cache for @node.
 *
 * tsk->stack is always written, so it is NULL after a failed allocation.
 * Returns 0 on success and -ENOMEM when the cache allocation fails.
 */
static int alloc_thread_stack_node(struct task_struct *tsk, int node)
{
        unsigned long *stack = kasan_reset_tag(
                kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node));

        tsk->stack = stack;
        if (!stack)
                return -ENOMEM;

        return 0;
}

/*
 * Queue @tsk's stack for RCU-deferred freeing and clear tsk->stack so the
 * task no longer refers to it.
 */
static void free_thread_stack(struct task_struct *tsk)
{
        thread_stack_delayed_free(tsk);
        tsk->stack = NULL;
}

/*
 * Create the "thread_stack" slab cache: objects of THREAD_SIZE bytes,
 * aligned to THREAD_SIZE, with a usercopy region covering the whole object.
 * Failure to create the cache is fatal (BUG).
 */
void thread_stack_cache_init(void)
{
        struct kmem_cache *cache;

        cache = kmem_cache_create_usercopy("thread_stack",
                                           THREAD_SIZE, THREAD_SIZE, 0, 0,
                                           THREAD_SIZE, NULL);
        thread_stack_cache = cache;
        BUG_ON(thread_stack_cache == NULL);
}

# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */

/* SLAB cache for signal_struct structures (tsk->signal) */
static struct kmem_cache *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
struct kmem_cache *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
struct kmem_cache *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;

/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;

/*
 * Allocate a vm_area_struct from vm_area_cachep and initialise it for @mm
 * with vma_init().
 *
 * Returns the new vma, or NULL if the allocation failed.
 */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
        struct vm_area_struct *vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

        if (vma)
                vma_init(vma, mm);

        return vma;
}

/*
 * Initialise @dest as a field-by-field copy of @src.
 *
 * Only the members listed here are copied; anything else in @dest is left
 * for the caller to set up (vm_area_dup() re-initialises several members
 * right after calling this). Config-dependent members are copied only when
 * the corresponding option is enabled.
 */
static void vm_area_init_from(const struct vm_area_struct *src,
                              struct vm_area_struct *dest)
{
        dest->vm_mm = src->vm_mm;
        dest->vm_ops = src->vm_ops;
        dest->vm_start = src->vm_start;
        dest->vm_end = src->vm_end;
        dest->anon_vma = src->anon_vma;
        dest->vm_pgoff = src->vm_pgoff;
        dest->vm_file = src->vm_file;
        dest->vm_private_data = src->vm_private_data;
        vm_flags_init(dest, src->vm_flags);
        memcpy(&dest->vm_page_prot, &src->vm_page_prot,
               sizeof(dest->vm_page_prot));
        /*
         * src->shared.rb may be modified concurrently when called from
         * dup_mmap(), but the clone will reinitialize it.
         */
        data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
        memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
               sizeof(dest->vm_userfaultfd_ctx));
#ifdef CONFIG_ANON_VMA_NAME
        dest->anon_name = src->anon_name;
#endif
#ifdef CONFIG_SWAP
        memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
               sizeof(dest->swap_readahead_info));
#endif
#ifndef CONFIG_MMU
        dest->vm_region = src->vm_region;
#endif
#ifdef CONFIG_NUMA
        dest->vm_policy = src->vm_policy;
#endif
}

/*
 * Allocate a new vma from vm_area_cachep and make it a copy of @orig.
 *
 * After the field copy, the new vma gets its own lock state, an empty
 * anon_vma_chain list, fresh NUMA-balancing state and its own handling of
 * the anon vma name via dup_anon_vma_name().
 *
 * Returns the new vma, or NULL if the allocation failed.
 */
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
        struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);

        if (!new)
                return NULL;

        /* Assert that nobody else writes these fields while they are copied. */
        ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
        ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
        vm_area_init_from(orig, new);
        vma_lock_init(new, true);
        INIT_LIST_HEAD(&new->anon_vma_chain);
        vma_numab_state_init(new);
        dup_anon_vma_name(orig, new);

        /* track_pfn_copy() will later take care of copying internal state. */
        if (unlikely(new->vm_flags & VM_PFNMAP))
                untrack_pfn_clear(new);

        return new;
}

/*
 * Free @vma: release its NUMA-balancing state and anon vma name, then
 * return the structure to vm_area_cachep.
 */
void vm_area_free(struct vm_area_struct *vma)
{
        /* The vma should be detached while being destroyed. */
        vma_assert_detached(vma);
        vma_numab_state_free(vma);
        free_anon_vma_name(vma);
        kmem_cache_free(vm_area_cachep, vma);
}

/*
 * Adjust the NR_KERNEL_STACK_KB statistic for @tsk's stack.
 *
 * @account is a multiplier applied to the stack size in KB (the caller in
 * this file passes -1 to subtract the stack). With CONFIG_VMAP_STACK the
 * adjustment is made page by page; otherwise one update covers the whole
 * stack.
 */
static void account_kernel_stack(struct task_struct *tsk, int account)
{
        struct vm_struct *vm;
        int idx;

        if (!IS_ENABLED(CONFIG_VMAP_STACK)) {
                /* All stack pages are in the same node. */
                mod_lruvec_kmem_state(task_stack_page(tsk), NR_KERNEL_STACK_KB,
                                      account * (THREAD_SIZE / 1024));
                return;
        }

        vm = task_stack_vm_area(tsk);
        for (idx = 0; idx < THREAD_SIZE / PAGE_SIZE; idx++)
                mod_lruvec_page_state(vm->pages[idx], NR_KERNEL_STACK_KB,
                                      account * (PAGE_SIZE / 1024));
}

/*
 * Undo the stack accounting for @tsk: subtract the stack from the
 * NR_KERNEL_STACK_KB statistic and, with CONFIG_VMAP_STACK, uncharge each
 * stack page from the memory cgroup.
 */
void exit_task_stack_account(struct task_struct *tsk)
{
        struct vm_struct *vm;
        int idx;

        account_kernel_stack(tsk, -1);

        if (!IS_ENABLED(CONFIG_VMAP_STACK))
                return;

        vm = task_stack_vm_area(tsk);
        for (idx = 0; idx < THREAD_SIZE / PAGE_SIZE; idx++)
                memcg_kmem_uncharge_page(vm->pages[idx], 0);
}

/*
 * Free @tsk's stack, but only if the task is in state TASK_DEAD. For any
 * other state a warning is emitted and the stack is deliberately leaked:
 * better to leak the stack than to free it prematurely.
 */
static void release_task_stack(struct task_struct *tsk)
{
        if (!WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
                free_thread_stack(tsk);
}

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * Drop one reference on @tsk's stack; the holder of the last reference
 * releases the stack via release_task_stack().
 */
void put_task_stack(struct task_struct *tsk)
{
        if (refcount_dec_and_test(&tsk->stack_refcount))
                release_task_stack(tsk);
}
#endif

/*
 * Release what is still attached to @tsk and free the task_struct itself.
 *
 * Without CONFIG_THREAD_INFO_IN_TASK the stack is released here as well;
 * with it, the stack is expected to be gone already (its refcount must be
 * zero). @tsk must not be used after this returns.
 */
void free_task(struct task_struct *tsk)
{
#ifdef CONFIG_SECCOMP
        /* A seccomp filter still attached at this point is unexpected. */
        WARN_ON_ONCE(tsk->seccomp.filter);
#endif
        release_user_cpus_ptr(tsk);
        scs_release(tsk);

#ifndef CONFIG_THREAD_INFO_IN_TASK
        /*
         * The task is finally done with both the stack and thread_info,
         * so free both.
         */
        release_task_stack(tsk);
#else
        /*
         * If the task had a separate stack allocation, it should be gone
         * by now.
         */
        WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
#endif
        rt_mutex_debug_task_free(tsk);
        ftrace_graph_exit_task(tsk);
        arch_release_task_struct(tsk);
        if (tsk->flags & PF_KTHREAD)
                free_kthread_struct(tsk);
        bpf_task_storage_free(tsk);
        free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);

/*
 * Make @mm refer to the same executable file as @oldmm.
 *
 * The file obtained from get_mm_exe_file(oldmm) is stored in mm->exe_file.
 * If denying write access on it fails, a one-time warning is printed; the
 * pointer stays installed either way.
 */
static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
{
        struct file *exe_file;

        exe_file = get_mm_exe_file(oldmm);
        RCU_INIT_POINTER(mm->exe_file, exe_file);
        /*
         * We depend on the oldmm having properly denied write access to the
         * exe_file already.
         */
        if (exe_file && exe_file_deny_write_access(exe_file))
                pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}

#ifdef CONFIG_MMU
/*
 * Duplicate the memory mappings of @oldmm into the new @mm.
 *
 * Both mmap locks are write-held for the duration (oldmm first, killable;
 * then mm, nested). The maple tree of @oldmm is duplicated wholesale with
 * __mt_dup(), after which every vma is visited: VM_DONTCOPY vmas are
 * removed from the new tree, all others are cloned with vm_area_dup(),
 * wired up (policy, userfaultfd, anon_vma, file mapping) and stored over
 * the duplicated entry, and their pages are copied with copy_page_range()
 * unless the vma is VM_WIPEONFORK.
 *
 * Returns 0 on success. Returns -EINTR if taking oldmm's lock is
 * interrupted or a fatal signal is pending, -ENOMEM for any failure routed
 * through the fail_nomem* labels (the helper's own error code is
 * overwritten there), or the error returned by __mt_dup(),
 * vma_iter_clear_gfp(), copy_page_range() or arch_dup_mmap().
 */
static __latent_entropy int dup_mmap(struct mm_struct *mm,
                                        struct mm_struct *oldmm)
{
        struct vm_area_struct *mpnt, *tmp;
        int retval;
        unsigned long charge = 0;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, 0);

        if (mmap_write_lock_killable(oldmm))
                return -EINTR;
        flush_cache_dup_mm(oldmm);
        uprobe_dup_mmap(oldmm, mm);
        /*
         * Not linked in yet - no deadlock potential:
         */
        mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING);

        /* No ordering required: file already has been exposed. */
        dup_mm_exe_file(mm, oldmm);

        /* Start from the parent's totals; skipped vmas are subtracted below. */
        mm->total_vm = oldmm->total_vm;
        mm->data_vm = oldmm->data_vm;
        mm->exec_vm = oldmm->exec_vm;
        mm->stack_vm = oldmm->stack_vm;

        /* Use __mt_dup() to efficiently build an identical maple tree. */
        retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL);
        if (unlikely(retval))
                goto out;

        mt_clear_in_rcu(vmi.mas.tree);
        for_each_vma(vmi, mpnt) {
                struct file *file;

                vma_start_write(mpnt);
                if (mpnt->vm_flags & VM_DONTCOPY) {
                        /* Drop this range from the child's tree and totals. */
                        retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start,
                                                    mpnt->vm_end, GFP_KERNEL);
                        if (retval)
                                goto loop_out;

                        vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt));
                        continue;
                }
                charge = 0;
                /*
                 * Don't duplicate many vmas if we've been oom-killed (for
                 * example)
                 */
                if (fatal_signal_pending(current)) {
                        retval = -EINTR;
                        goto loop_out;
                }
                if (mpnt->vm_flags & VM_ACCOUNT) {
                        unsigned long len = vma_pages(mpnt);

                        if (security_vm_enough_memory_mm(oldmm, len)) /* sic */
                                goto fail_nomem;
                        /* Remember the charge so the error path can undo it. */
                        charge = len;
                }
                tmp = vm_area_dup(mpnt);
                if (!tmp)
                        goto fail_nomem;
                retval = vma_dup_policy(mpnt, tmp);
                if (retval)
                        goto fail_nomem_policy;
                tmp->vm_mm = mm;
                retval = dup_userfaultfd(tmp, &uf);
                if (retval)
                        goto fail_nomem_anon_vma_fork;
                if (tmp->vm_flags & VM_WIPEONFORK) {
                        /*
                         * VM_WIPEONFORK gets a clean slate in the child.
                         * Don't prepare anon_vma until fault since we don't
                         * copy page for current vma.
                         */
                        tmp->anon_vma = NULL;
                } else if (anon_vma_fork(tmp, mpnt))
                        goto fail_nomem_anon_vma_fork;
                vm_flags_clear(tmp, VM_LOCKED_MASK);
                /*
                 * Copy/update hugetlb private vma information.
                 */
                if (is_vm_hugetlb_page(tmp))
                        hugetlb_dup_vma_private(tmp);

                /*
                 * Link the vma into the MT. After using __mt_dup(), memory
                 * allocation is not necessary here, so it cannot fail.
                 */
                vma_iter_bulk_store(&vmi, tmp);

                mm->map_count++;

                if (tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                file = tmp->vm_file;
                if (file) {
                        struct address_space *mapping = file->f_mapping;

                        get_file(file);
                        i_mmap_lock_write(mapping);
                        if (vma_is_shared_maywrite(tmp))
                                mapping_allow_writable(mapping);
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_interval_tree_insert_after(tmp, mpnt,
                                        &mapping->i_mmap);
                        flush_dcache_mmap_unlock(mapping);
                        i_mmap_unlock_write(mapping);
                }

                if (!(tmp->vm_flags & VM_WIPEONFORK))
                        retval = copy_page_range(tmp, mpnt);

                if (retval) {
                        /*
                         * tmp is already stored in the tree, so the failure
                         * marker set below must start at the next vma.
                         */
                        mpnt = vma_next(&vmi);
                        goto loop_out;
                }
        }
        /* a new mm has just been created */
        retval = arch_dup_mmap(oldmm, mm);
loop_out:
        vma_iter_free(&vmi);
        if (!retval) {
                mt_set_in_rcu(vmi.mas.tree);
                ksm_fork(mm, oldmm);
                khugepaged_fork(mm, oldmm);
        } else {

                /*
                 * The entire maple tree has already been duplicated. If the
                 * mmap duplication fails, mark the failure point with
                 * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered,
                 * stop releasing VMAs that have not been duplicated after this
                 * point.
                 */
                if (mpnt) {
                        mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
                        mas_store(&vmi.mas, XA_ZERO_ENTRY);
                        /* Avoid OOM iterating a broken tree */
                        set_bit(MMF_OOM_SKIP, &mm->flags);
                }
                /*
                 * The mm_struct is going to exit, but the locks will be dropped
                 * first.  Marking the mm_struct as unstable is advisable as it
                 * is not fully initialised.
                 */
                set_bit(MMF_UNSTABLE, &mm->flags);
        }
out:
        mmap_write_unlock(mm);
        flush_tlb_mm(oldmm);
        mmap_write_unlock(oldmm);
        if (!retval)
                dup_userfaultfd_complete(&uf);
        else
                dup_userfaultfd_fail(&uf);
        return retval;

        /*
         * Error unwinding for a partially set-up tmp; each label falls
         * through to the next and ends by rejoining loop_out with -ENOMEM.
         */
fail_nomem_anon_vma_fork:
        mpol_put(vma_policy(tmp));
fail_nomem_policy:
        vm_area_free(tmp);
fail_nomem:
        retval = -ENOMEM;
        vm_unacct_memory(charge);
        goto loop_out;
}

/*
 * Allocate the page global directory for @mm and store it in mm->pgd.
 *
 * Returns 0 on success, -ENOMEM if pgd_alloc() returned NULL.
 */
static inline int mm_alloc_pgd(struct mm_struct *mm)
{
        mm->pgd = pgd_alloc(mm);

        return unlikely(!mm->pgd) ? -ENOMEM : 0;
}

/* Free the page global directory of @mm with pgd_free(). */
static inline void mm_free_pgd(struct mm_struct *mm)
{
        pgd_free(mm, mm->pgd);
}
#else
/*
 * !CONFIG_MMU variant: only the exe_file reference is duplicated, under
 * oldmm's mmap write lock. Always returns 0.
 */
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
        mmap_write_lock(oldmm);
        dup_mm_exe_file(mm, oldmm);
        mmap_write_unlock(oldmm);
        return 0;
}
#define mm_alloc_pgd(mm)        (0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

#ifdef CONFIG_MM_ID
static DEFINE_IDA(mm_ida);

/*
 * Allocate an id in [MM_ID_MIN, MM_ID_MAX] from mm_ida and store it in
 * mm->mm_id.
 *
 * Returns 0 on success or the negative error from ida_alloc_range(), in
 * which case mm->mm_id is left unchanged.
 */
static inline int mm_alloc_id(struct mm_struct *mm)
{
        int id = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);

        if (id >= 0) {
                mm->mm_id = id;
                return 0;
        }

        return id;
}

/*
 * Release the id held by @mm, if any.
 *
 * mm->mm_id is always reset to MM_ID_DUMMY. The previous id is returned to
 * mm_ida unless it was MM_ID_DUMMY already, or lies outside
 * [MM_ID_MIN, MM_ID_MAX] (which triggers a one-time warning).
 */
static inline void mm_free_id(struct mm_struct *mm)
{
        const mm_id_t id = mm->mm_id;

        mm->mm_id = MM_ID_DUMMY;

        if (id != MM_ID_DUMMY &&
            !WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX))
                ida_free(&mm_ida, id);
}
#else /* !CONFIG_MM_ID */
static inline int mm_alloc_id(struct mm_struct *mm) { return 0; }
static inline void mm_free_id(struct mm_struct *mm) {}
#endif /* CONFIG_MM_ID */

/*
 * Consistency checks on an mm that is about to be freed (called from
 * __mmdrop()): every rss counter and the page-table byte count are
 * expected to be zero; anything else is reported with pr_alert().
 */
static void check_mm(struct mm_struct *mm)
{
        int i;

        /* resident_page_types[] must name every counter type. */
        BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
                         "Please make sure 'struct resident_page_types[]' is updated as well");

        for (i = 0; i < NR_MM_COUNTERS; i++) {
                long x = percpu_counter_sum(&mm->rss_stat[i]);

                if (unlikely(x))
                        pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
                                 mm, resident_page_types[i], x);
        }

        if (mm_pgtables_bytes(mm))
                pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
                                mm_pgtables_bytes(mm));

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
        VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}

#define allocate_mm()        (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
#define free_mm(mm)        (kmem_cache_free(mm_cachep, (mm)))

/*
 * Per-CPU debug check run by cleanup_lazy_tlbs(): warn once if the task
 * current on this CPU still has @arg (a struct mm_struct *) as its
 * active_mm.
 */
static void do_check_lazy_tlb(void *arg)
{
        struct mm_struct *mm = arg;

        WARN_ON_ONCE(current->active_mm == mm);
}

/*
 * Per-CPU callback run by cleanup_lazy_tlbs(): if the task current on this
 * CPU has @arg (a struct mm_struct *) as its active_mm, switch it over to
 * init_mm. The task is not expected to have an mm of its own in that case
 * (warns once otherwise).
 */
static void do_shoot_lazy_tlb(void *arg)
{
        struct mm_struct *lazy_mm = arg;

        if (current->active_mm != lazy_mm)
                return;

        WARN_ON_ONCE(current->mm);
        current->active_mm = &init_mm;
        switch_mm(lazy_mm, &init_mm, current);
}

/*
 * Make sure no CPU keeps @mm as its lazy tlb mm before the mm is freed.
 *
 * Does nothing unless CONFIG_MMU_LAZY_TLB_SHOOTDOWN is enabled. Otherwise
 * do_shoot_lazy_tlb() is run on every CPU in mm_cpumask(mm), and with
 * CONFIG_DEBUG_VM_SHOOT_LAZIES the result is verified on all CPUs.
 */
static void cleanup_lazy_tlbs(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
                /*
                 * In this case, lazy tlb mms are refcounted and would not reach
                 * __mmdrop until all CPUs have switched away and mmdrop()ed.
                 */
                return;
        }

        /*
         * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
         * requires lazy mm users to switch to another mm when the refcount
         * drops to zero, before the mm is freed. This requires IPIs here to
         * switch kernel threads to init_mm.
         *
         * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
         * switch with the final userspace teardown TLB flush which leaves the
         * mm lazy on this CPU but no others, reducing the need for additional
         * IPIs here. There are cases where a final IPI is still required here,
         * such as the final mmdrop being performed on a different CPU than the
         * one exiting, or kernel threads using the mm when userspace exits.
         *
         * IPI overheads have not been found to be expensive, but they could be
         * reduced in a number of possible ways, for example (roughly
         * increasing order of complexity):
         * - The last lazy reference created by exit_mm() could instead switch
         *   to init_mm, however it's probable this will run on the same CPU
         *   immediately afterwards, so this may not reduce IPIs much.
         * - A batch of mms requiring IPIs could be gathered and freed at once.
         * - CPUs store active_mm where it can be remotely checked without a
         *   lock, to filter out false-positives in the cpumask.
         * - After mm_users or mm_count reaches zero, switching away from the
         *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
         *   with some batching or delaying of the final IPIs.
         * - A delayed freeing and RCU-like quiescing sequence based on mm
         *   switching to avoid IPIs completely.
         */
        on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
        if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
                on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
}

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 *
 * @mm must not be init_mm (BUG) and is not expected to be the caller's
 * current->mm or, once lazy tlb users have been cleaned up, its
 * current->active_mm (both warn once). @mm is freed on return.
 */
void __mmdrop(struct mm_struct *mm)
{
        BUG_ON(mm == &init_mm);
        WARN_ON_ONCE(mm == current->mm);

        /* Ensure no CPUs are using this as their lazy tlb mm */
        cleanup_lazy_tlbs(mm);

        WARN_ON_ONCE(mm == current->active_mm);
        mm_free_pgd(mm);
        mm_free_id(mm);
        destroy_context(mm);
        mmu_notifier_subscriptions_destroy(mm);
        /* Report leftover rss counters or page-table bytes. */
        check_mm(mm);
        put_user_ns(mm->user_ns);
        mm_pasid_drop(mm);
        mm_destroy_cid(mm);
        percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);

        free_mm(mm);
}
EXPORT_SYMBOL_GPL(__mmdrop);

/*
 * Work handler queued by mmdrop_async(): recover the mm_struct that embeds
 * @work as its async_put_work member and drop it with __mmdrop().
 */
static void mmdrop_async_fn(struct work_struct *work)
{
        __mmdrop(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Drop one mm_count reference on @mm. If that was the last reference, the
 * actual __mmdrop() is not done here but scheduled as work via
 * mm->async_put_work.
 */
static void mmdrop_async(struct mm_struct *mm)
{
        if (!unlikely(atomic_dec_and_test(&mm->mm_count)))
                return;

        INIT_WORK(&mm->async_put_work, mmdrop_async_fn);
        schedule_work(&mm->async_put_work);
}

/*
 * Tear down and free @sig: release its taskstats and autogroup state, drop
 * the reference on sig->oom_mm (if set) asynchronously, and return the
 * structure to signal_cachep.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
        taskstats_tgid_free(sig);
        sched_autogroup_exit(sig);
        /*
         * __mmdrop is not safe to call from softirq context on x86 due to
         * pgd_dtor so postpone it to the async context
         */
        if (sig->oom_mm)
                mmdrop_async(sig->oom_mm);
        kmem_cache_free(signal_cachep, sig);
}

/*
 * Drop one sigcnt reference on @sig; the holder of the last reference
 * frees it via free_signal_struct().
 */
static inline void put_signal_struct(struct signal_struct *sig)
{
        if (refcount_dec_and_test(&sig->sigcnt))
                free_signal_struct(sig);
}

/*
 * Final teardown of @tsk: release each subsystem's per-task state, drop
 * the reference on its signal_struct, and free the task via free_task().
 *
 * Warns if the task has no exit_state, still has a non-zero usage count,
 * or is the currently running task.
 */
void __put_task_struct(struct task_struct *tsk)
{
        WARN_ON(!tsk->exit_state);
        WARN_ON(refcount_read(&tsk->usage));
        WARN_ON(tsk == current);

        sched_ext_free(tsk);
        io_uring_free(tsk);
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
        sched_core_free(tsk);
        free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);

/*
 * RCU callback form of __put_task_struct(): @rhp is the rcu member
 * embedded in the task_struct to be torn down.
 */
void __put_task_struct_rcu_cb(struct rcu_head *rhp)
{
        __put_task_struct(container_of(rhp, struct task_struct, rcu));
}
EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb);

void __init __weak arch_task_cache_init(void) { }

/*
 * set_max_threads - compute the max_threads limit
 *
 * The limit is derived from the estimated number of free pages so that
 * thread structures may only consume a small part of the available memory
 * (memory size divided by eight times THREAD_SIZE), capped at
 * @max_threads_suggested and finally clamped to
 * [MIN_THREADS, MAX_THREADS].
 */
static void __init set_max_threads(unsigned int max_threads_suggested)
{
        unsigned long nr_pages = memblock_estimated_nr_free_pages();
        u64 limit = MAX_THREADS;

        /*
         * Only do the division when nr_pages * PAGE_SIZE fits in 64 bits;
         * otherwise keep the MAX_THREADS default.
         */
        if (fls64(nr_pages) + fls64(PAGE_SIZE) <= 64)
                limit = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
                                  (u64) THREAD_SIZE * 8UL);

        limit = (limit > max_threads_suggested) ? max_threads_suggested : limit;

        max_threads = clamp_t(u64, limit, MIN_THREADS, MAX_THREADS);
}

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
/* Initialized by the architecture: */
int arch_task_struct_size __read_mostly;
#endif

/*
 * Report the usercopy whitelist window of task_struct via @offset/@size.
 *
 * The architecture describes the window relative to thread_struct.  A
 * non-empty window is rebased to where 'thread' sits inside task_struct;
 * a zero-sized one is reported at offset 0.
 */
static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size)
{
        /* Fetch thread_struct whitelist for the architecture. */
        arch_thread_struct_whitelist(offset, size);

        /* Zero-sized whitelist or empty thread_struct: nothing to rebase. */
        if (unlikely(*size == 0)) {
                *offset = 0;
                return;
        }

        *offset += offsetof(struct task_struct, thread);
}

/*
 * Boot-time setup for task creation: create the task_struct slab cache,
 * compute max_threads, and seed the init task's rlimits and the initial
 * user namespace's ucount limits from it.
 */
void __init fork_init(void)
{
        int i;
#ifndef ARCH_MIN_TASKALIGN
#define ARCH_MIN_TASKALIGN        0
#endif
        /* Align to at least a cache line, more if the architecture asks. */
        int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN);
        unsigned long useroffset, usersize;

        /* create a slab on which task_structs can be allocated */
        task_struct_whitelist(&useroffset, &usersize);
        task_struct_cachep = kmem_cache_create_usercopy("task_struct",
                        arch_task_struct_size, align,
                        SLAB_PANIC|SLAB_ACCOUNT,
                        useroffset, usersize, NULL);

        /* do the arch specific task caches init */
        arch_task_cache_init();

        set_max_threads(MAX_THREADS);

        /* init's RLIMIT_NPROC is half of max_threads; SIGPENDING copies it. */
        init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];

        for (i = 0; i < UCOUNT_COUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;

        /* These four rlimit-style counters are unlimited in init_user_ns. */
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC,      RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE,   RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
        set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK,    RLIM_INFINITY);

#ifdef CONFIG_VMAP_STACK
        /* Register free_vm_stack_cache as the teardown hook of this state. */
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
#endif

        scs_init();

        lockdep_init_task(&init_task);
        uprobes_init();
}

/*
 * Default task_struct duplication: a plain structure assignment of @src
 * to @dst that always returns 0.  Declared __weak so it can be replaced.
 */
int __weak arch_dup_task_struct(struct task_struct *dst,
                                               struct task_struct *src)
{
        *dst = *src;
        return 0;
}

/* Write STACK_END_MAGIC at the end of @tsk's stack. */
void set_task_stack_end_magic(struct task_struct *tsk)
{
        unsigned long *magic_slot = end_of_stack(tsk);

        /* for overflow detection */
        *magic_slot = STACK_END_MAGIC;
}

/*
 * Allocate a new task_struct as a copy of @orig, give it a thread stack of
 * its own, and reset the fields that must not carry over from @orig.
 *
 * @orig: task to duplicate.
 * @node: NUMA node for the allocations; with NUMA_NO_NODE the node is
 *        picked by tsk_fork_get_node().
 *
 * Returns the new task, or NULL on failure after releasing whatever had
 * already been allocated.
 */
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
        struct task_struct *tsk;
        int err;

        if (node == NUMA_NO_NODE)
                node = tsk_fork_get_node(orig);
        tsk = alloc_task_struct_node(node);
        if (!tsk)
                return NULL;

        err = arch_dup_task_struct(tsk, orig);
        if (err)
                goto free_tsk;

        err = alloc_thread_stack_node(tsk, node);
        if (err)
                goto free_tsk;

#ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
#endif
        account_kernel_stack(tsk, 1);

        /* From here on a failure must also undo the stack accounting. */
        err = scs_prepare(tsk, node);
        if (err)
                goto free_stack;

#ifdef CONFIG_SECCOMP
        /*
         * We must handle setting up seccomp filters once we're under
         * the sighand lock in case orig has changed between now and
         * then. Until then, filter must be NULL to avoid messing up
         * the usage counts on the error path calling free_task.
         */
        tsk->seccomp.filter = NULL;
#endif

        setup_thread_stack(tsk, orig);
        clear_user_return_notifier(tsk);
        clear_tsk_need_resched(tsk);
        set_task_stack_end_magic(tsk);
        clear_syscall_work_syscall_user_dispatch(tsk);

#ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
#endif
        /*
         * The structure copy left cpus_ptr pointing into @orig; if @orig
         * uses its own embedded mask, point the child at its own copy.
         */
        if (orig->cpus_ptr == &orig->cpus_mask)
                tsk->cpus_ptr = &tsk->cpus_mask;
        dup_user_cpus_ptr(tsk, orig, node);

        /*
         * One for the user space visible state that goes away when reaped.
         * One for the scheduler.
         */
        refcount_set(&tsk->rcu_users, 2);
        /* One for the rcu users */
        refcount_set(&tsk->usage, 1);
#ifdef CONFIG_BLK_DEV_IO_TRACE
        tsk->btrace_seq = 0;
#endif
        /* Clear per-task pointers that were copied from @orig. */
        tsk->splice_pipe = NULL;
        tsk->task_frag.page = NULL;
        tsk->wake_q.next = NULL;
        tsk->worker_private = NULL;

        kcov_task_init(tsk);
        kmsan_task_create(tsk);
        kmap_local_fork(tsk);

#ifdef CONFIG_FAULT_INJECTION
        tsk->fail_nth = 0;
#endif

#ifdef CONFIG_BLK_CGROUP
        tsk->throttle_disk = NULL;
        tsk->use_memdelay = 0;
#endif

#ifdef CONFIG_ARCH_HAS_CPU_PASID
        tsk->pasid_activated = 0;
#endif

#ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
#endif

#ifdef CONFIG_X86_BUS_LOCK_DETECT
        tsk->reported_split_lock = 0;
#endif

#ifdef CONFIG_SCHED_MM_CID
        tsk->mm_cid = -1;
        tsk->last_mm_cid = -1;
        tsk->mm_cid_active = 0;
        tsk->migrate_from_cpu = -1;
#endif
        return tsk;

        /* Unwind in reverse order of acquisition. */
free_stack:
        exit_task_stack_account(tsk);
        free_thread_stack(tsk);
free_tsk:
        free_task_struct(tsk);
        return NULL;
}

/* Held in __mmput() while an mm is unlinked from its mmlist linkage. */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);

/*
 * Initial mm->flags value used by mm_init() when the current task has no
 * mm.  Can be changed with the "coredump_filter=" boot parameter.
 */
static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;

/*
 * Handler for the "coredump_filter=" boot parameter.  The parsed number
 * (base auto-detected) is shifted into the dump-filter bit range, masked,
 * and stored in default_dump_filter.  Always returns 1.
 */
static int __init coredump_filter_setup(char *s)
{
        unsigned long filter = simple_strtoul(s, NULL, 0);

        default_dump_filter = (filter << MMF_DUMP_FILTER_SHIFT) &
                              MMF_DUMP_FILTER_MASK;
        return 1;
}

__setup("coredump_filter=", coredump_filter_setup);

#include <linux/init_task.h>

/* Reset the AIO state of @mm; a no-op without CONFIG_AIO. */
static void mm_init_aio(struct mm_struct *mm)
{
#ifdef CONFIG_AIO
        mm->ioctx_table = NULL;
        spin_lock_init(&mm->ioctx_lock);
#endif
}

/* Clear @mm's owner, but only if it is currently @p (CONFIG_MEMCG only). */
static __always_inline void mm_clear_owner(struct mm_struct *mm,
                                           struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        if (mm->owner != p)
                return;

        WRITE_ONCE(mm->owner, NULL);
#endif
}

/* Record @p as the owner of @mm; a no-op without CONFIG_MEMCG. */
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
#ifdef CONFIG_MEMCG
        mm->owner = p;
#endif
}

/* Start @mm with no uprobes xol_area; a no-op without CONFIG_UPROBES. */
static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
        mm->uprobes_state.xol_area = NULL;
#endif
}

/*
 * Initialize the locking state of @mm: the mmap_lock rwsem, the mm lock
 * sequence counter and, with CONFIG_PER_VMA_LOCK, the vma_writer_wait
 * rcuwait.
 */
static void mmap_init_lock(struct mm_struct *mm)
{
        init_rwsem(&mm->mmap_lock);
        mm_lock_seqcount_init(mm);
#ifdef CONFIG_PER_VMA_LOCK
        rcuwait_init(&mm->vma_writer_wait);
#endif
}

/*
 * Initialize @mm for task @p in user namespace @user_ns.
 *
 * Resets counters, locks and lists, derives flags/def_flags from
 * current->mm when there is one (otherwise from default_dump_filter), and
 * then sets up the resources that can fail: mm_alloc_pgd(), mm_alloc_id(),
 * init_new_context(), mm_alloc_cid() and the per-CPU rss_stat counters.
 *
 * Returns @mm on success.  On failure everything acquired so far is
 * released, @mm itself is freed with free_mm(), and NULL is returned, so
 * the caller must not touch @mm afterwards.
 */
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        struct user_namespace *user_ns)
{
        mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
        mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
        /* A fresh mm starts with one user and one mm_count reference. */
        atomic_set(&mm->mm_users, 1);
        atomic_set(&mm->mm_count, 1);
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
        atomic64_set(&mm->pinned_vm, 0);
        memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
        spin_lock_init(&mm->page_table_lock);
        spin_lock_init(&mm->arg_lock);
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
        mm_pasid_init(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS)
        mm->pmd_huge_pte = NULL;
#endif
        mm_init_uprobes_state(mm);
        hugetlb_count_init(mm);

        /* Flags come from the creating task's mm, or from the defaults. */
        if (current->mm) {
                mm->flags = mmf_init_flags(current->mm->flags);
                mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
        } else {
                mm->flags = default_dump_filter;
                mm->def_flags = 0;
        }

        if (mm_alloc_pgd(mm))
                goto fail_nopgd;

        if (mm_alloc_id(mm))
                goto fail_noid;

        if (init_new_context(p, mm))
                goto fail_nocontext;

        if (mm_alloc_cid(mm, p))
                goto fail_cid;

        if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
                                     NR_MM_COUNTERS))
                goto fail_pcpu;

        mm->user_ns = get_user_ns(user_ns);
        lru_gen_init_mm(mm);
        return mm;

        /* Each label undoes the step that succeeded just before the failure. */
fail_pcpu:
        mm_destroy_cid(mm);
fail_cid:
        destroy_context(mm);
fail_nocontext:
        mm_free_id(mm);
fail_noid:
        mm_free_pgd(mm);
fail_nopgd:
        free_mm(mm);
        return NULL;
}

/*
 * Allocate a zeroed mm_struct and initialize it for the current task and
 * its user namespace.  Returns NULL when either the allocation or the
 * initialization fails.
 */
struct mm_struct *mm_alloc(void)
{
        struct mm_struct *mm = allocate_mm();

        if (mm) {
                memset(mm, 0, sizeof(*mm));
                mm = mm_init(mm, current, current_user_ns());
        }
        return mm;
}
EXPORT_SYMBOL_IF_KUNIT(mm_alloc);

/*
 * Tear down the user-visible parts of @mm once mm_users has reached zero
 * (asserted below), then drop a reference with mmdrop().
 */
static inline void __mmput(struct mm_struct *mm)
{
        VM_BUG_ON(atomic_read(&mm->mm_users));

        uprobe_clear_state(mm);
        exit_aio(mm);
        ksm_exit(mm);
        khugepaged_exit(mm); /* must run before exit_mmap */
        exit_mmap(mm);
        mm_put_huge_zero_folio(mm);
        /* Drops the exe_file reference, if any. */
        set_mm_exe_file(mm, NULL);
        if (!list_empty(&mm->mmlist)) {
                spin_lock(&mmlist_lock);
                list_del(&mm->mmlist);
                spin_unlock(&mmlist_lock);
        }
        /* Release the binfmt module reference held on behalf of this mm. */
        if (mm->binfmt)
                module_put(mm->binfmt->module);
        lru_gen_del_mm(mm);
        mmdrop(mm);
}

/*
 * Drop one user reference on @mm.  The caller that drops the last one
 * releases all resources of the mm via __mmput().  May sleep.
 */
void mmput(struct mm_struct *mm)
{
        might_sleep();

        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);

#ifdef CONFIG_MMU
/* Work handler for mmput_async(): tear down the mm embedding @work. */
static void mmput_async_fn(struct work_struct *work)
{
        __mmput(container_of(work, struct mm_struct, async_put_work));
}

/*
 * Like mmput(), but when the last user reference is dropped the teardown
 * is handed to a work item instead of running in the caller's context.
 */
void mmput_async(struct mm_struct *mm)
{
        if (!atomic_dec_and_test(&mm->mm_users))
                return;

        INIT_WORK(&mm->async_put_work, mmput_async_fn);
        schedule_work(&mm->async_put_work);
}
EXPORT_SYMBOL_GPL(mmput_async);
#endif

/**
 * set_mm_exe_file - change a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use, or %NULL to just drop the old one.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve it happens before
 * the new mm is made visible to anyone.
 *
 * Can only fail if new_exe_file != NULL.
 *
 * Return: 0 on success, -EACCES if write access to @new_exe_file could not
 *         be denied; in that case @mm is left unchanged.
 */
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct file *old_exe_file;

        /*
         * It is safe to dereference the exe_file without RCU as
         * this function is only called if nobody else can access
         * this mm -- see comment above for justification.
         */
        old_exe_file = rcu_dereference_raw(mm->exe_file);

        if (new_exe_file) {
                /*
                 * We expect the caller (i.e., sys_execve) to have already
                 * denied write access, so this is unlikely to fail.
                 */
                if (unlikely(exe_file_deny_write_access(new_exe_file)))
                        return -EACCES;
                /* The mm holds its own reference on the new file. */
                get_file(new_exe_file);
        }
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        /* Undo the write denial and the reference held for the old file. */
        if (old_exe_file) {
                exe_file_allow_write_access(old_exe_file);
                fput(old_exe_file);
        }
        return 0;
}

/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 *
 * Return: 0 on success, -EBUSY if a VMA of @mm still maps a file with the
 *         same path as the current executable file, -EACCES if write
 *         access to @new_exe_file could not be denied.
 */
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
        struct vm_area_struct *vma;
        struct file *old_exe_file;
        int ret = 0;

        /* Forbid mm->exe_file change if old file still mapped. */
        old_exe_file = get_mm_exe_file(mm);
        if (old_exe_file) {
                VMA_ITERATOR(vmi, mm, 0);
                mmap_read_lock(mm);
                for_each_vma(vmi, vma) {
                        if (!vma->vm_file)
                                continue;
                        if (path_equal(&vma->vm_file->f_path,
                                       &old_exe_file->f_path)) {
                                ret = -EBUSY;
                                break;
                        }
                }
                mmap_read_unlock(mm);
                /* Drop the temporary reference taken by get_mm_exe_file(). */
                fput(old_exe_file);
                if (ret)
                        return ret;
        }

        ret = exe_file_deny_write_access(new_exe_file);
        if (ret)
                return -EACCES;
        get_file(new_exe_file);

        /* set the new file */
        mmap_write_lock(mm);
        /* Re-read under the write lock; this is the pointer being replaced. */
        old_exe_file = rcu_dereference_raw(mm->exe_file);
        rcu_assign_pointer(mm->exe_file, new_exe_file);
        mmap_write_unlock(mm);

        /* Release what the mm held on behalf of the previous file. */
        if (old_exe_file) {
                exe_file_allow_write_access(old_exe_file);
                fput(old_exe_file);
        }
        return 0;
}

/**
 * get_mm_exe_file - acquire a reference to the mm's executable file
 * @mm: The mm of interest.
 *
 * Returns %NULL if mm has no associated executable file.
 * User must release file via fput().
 */
struct file *get_mm_exe_file(struct mm_struct *mm)
{
        struct file *exe_file;

        /* The reference is taken with get_file_rcu() inside an RCU read section. */
        rcu_read_lock();
        exe_file = get_file_rcu(&mm->exe_file);
        rcu_read_unlock();
        return exe_file;
}

/**
 * get_task_exe_file - acquire a reference to the task's executable file
 * @task: The task.
 *
 * Returns %NULL for a kernel thread (even one running with a borrowed mm,
 * see the comment above get_task_mm), for a task without an mm, and for
 * an mm that has no associated executable file.
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
        struct mm_struct *task_mm;
        struct file *exe = NULL;

        if (task->flags & PF_KTHREAD)
                return NULL;

        /* task->mm is sampled and used under task_lock(). */
        task_lock(task);
        task_mm = task->mm;
        if (task_mm)
                exe = get_mm_exe_file(task_mm);
        task_unlock(task);

        return exe;
}

/**
 * get_task_mm - acquire a reference to the task's mm
 * @task: The task.
 *
 * Returns %NULL if the task has no mm, or if PF_KTHREAD is set (meaning
 * this is a kernel thread, which may only have transiently adopted a user
 * mm).  Otherwise returns the task's mm after bumping up its use count.
 * User must release the mm via mmput() after use.  Typically used by
 * /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
        struct mm_struct *mm;

        if (task->flags & PF_KTHREAD)
                return NULL;

        /* task->mm is read and pinned under task_lock(). */
        task_lock(task);
        mm = task->mm;
        if (mm)
                mmget(mm);
        task_unlock(task);
        return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);

/*
 * Decide whether the current task may access @mm, the mm of @task, with
 * ptrace access mode @mode.  Access is granted for the caller's own mm,
 * when ptrace_may_access() allows it, or for read-mode requests from a
 * perfmon-capable caller.
 */
static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode)
{
        /* The caller's own mm is always accessible. */
        if (mm == current->mm)
                return true;

        return ptrace_may_access(task, mode) ||
               ((mode & PTRACE_MODE_READ) && perfmon_capable());
}

/*
 * Get a reference to @task's mm after checking that the caller may access
 * it with ptrace access mode @mode.
 *
 * The lookup and the permission check both run with @task's
 * exec_update_lock held for reading.
 *
 * Returns the mm with a user reference held (release with mmput()), or an
 * ERR_PTR(): the error from down_read_killable() if taking the lock
 * failed, -ESRCH if the task has no mm, -EACCES if access is denied.
 */
struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
{
        struct mm_struct *mm;
        int err;

        err =  down_read_killable(&task->signal->exec_update_lock);
        if (err)
                return ERR_PTR(err);

        mm = get_task_mm(task);
        if (!mm) {
                mm = ERR_PTR(-ESRCH);
        } else if (!may_access_mm(mm, task, mode)) {
                /* Not allowed: give back the reference just taken. */
                mmput(mm);
                mm = ERR_PTR(-EACCES);
        }
        up_read(&task->signal->exec_update_lock);

        return mm;
}

/*
 * Signal @tsk's vfork_done completion, if one is still set, and clear the
 * pointer.  Both steps happen under task_lock(), the same lock
 * wait_for_vfork_done() takes before clearing the pointer.
 */
static void complete_vfork_done(struct task_struct *tsk)
{
        struct completion *vfork;

        task_lock(tsk);
        vfork = tsk->vfork_done;
        if (likely(vfork)) {
                tsk->vfork_done = NULL;
                complete(vfork);
        }
        task_unlock(tsk);
}

/*
 * Wait (killable, freezable) for @child to signal @vfork.
 *
 * If the wait ends with a non-zero result, @child's vfork_done pointer is
 * cleared under task_lock() so the child will not complete @vfork later.
 * The caller's reference on @child is dropped in every case.
 *
 * Returns the result of wait_for_completion_state().
 */
static int wait_for_vfork_done(struct task_struct *child,
                                struct completion *vfork)
{
        unsigned int state = TASK_KILLABLE|TASK_FREEZABLE;
        int killed;

        cgroup_enter_frozen();
        killed = wait_for_completion_state(vfork, state);
        cgroup_leave_frozen(false);

        if (killed) {
                task_lock(child);
                child->vfork_done = NULL;
                task_unlock(child);
        }

        put_task_struct(child);
        return killed;
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * on error, on success, whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        uprobe_free_utask(tsk);

        /* Get rid of any cached register state */
        deactivate_mm(tsk, mm);

        /*
         * Signal userspace if we're not exiting with a core dump
         * because we want to leave the value intact for debugging
         * purposes.
         *
         * NOTE(review): the code below only tests mm_users > 1; presumably
         * that is how the core-dump case is excluded -- confirm.
         */
        if (tsk->clear_child_tid) {
                if (atomic_read(&mm->mm_users) > 1) {
                        /*
                         * We don't check the error code - if userspace has
                         * not set up a proper pointer then tough luck.
                         */
                        put_user(0, tsk->clear_child_tid);
                        do_futex(tsk->clear_child_tid, FUTEX_WAKE,
                                        1, NULL, NULL, 0, 0);
                }
                /* The pointer is dropped whether or not it was written. */
                tsk->clear_child_tid = NULL;
        }

        /*
         * All done, finally we can wake up parent and return this mm to him.
         * Also kthread_stop() uses this completion for synchronization.
         */
        if (tsk->vfork_done)
                complete_vfork_done(tsk);
}

/* Exit-side release of @mm from @tsk: futex exit handling, then mm_release(). */
void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exit_release(tsk);
        mm_release(tsk, mm);
}

/* Exec-side release of @mm from @tsk: futex exec handling, then mm_release(). */
void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
        futex_exec_release(tsk);
        mm_release(tsk, mm);
}

/**
 * dup_mm() - duplicates an existing mm structure
 * @tsk: the task_struct with which the new mm will be associated.
 * @oldmm: the mm to duplicate.
 *
 * Allocates a new mm structure and duplicates the provided @oldmm structure
 * content into it.
 *
 * Return: the duplicated mm or NULL on failure.
 */
static struct mm_struct *dup_mm(struct task_struct *tsk,
                                struct mm_struct *oldmm)
{
        struct mm_struct *mm;
        int err;

        mm = allocate_mm();
        if (!mm)
                goto fail_nomem;

        /* Start from a byte copy; mm_init() then resets the per-mm state. */
        memcpy(mm, oldmm, sizeof(*mm));

        /*
         * mm->user_ns here is the value just copied from @oldmm.  On
         * failure mm_init() has already freed @mm.
         */
        if (!mm_init(mm, tsk, mm->user_ns))
                goto fail_nomem;

        uprobe_start_dup_mmap();
        err = dup_mmap(mm, oldmm);
        if (err)
                goto free_pt;
        uprobe_end_dup_mmap();

        mm->hiwater_rss = get_mm_rss(mm);
        mm->hiwater_vm = mm->total_vm;

        /* err is 0 on this path, so free_pt will not end dup_mmap twice. */
        if (mm->binfmt && !try_module_get(mm->binfmt->module))
                goto free_pt;

        return mm;

free_pt:
        /* don't put binfmt in mmput, we haven't got module yet */
        mm->binfmt = NULL;
        mm_init_owner(mm, NULL);
        mmput(mm);
        /* Only the dup_mmap() failure path still has the section open. */
        if (err)
                uprobe_end_dup_mmap();

fail_nomem:
        return NULL;
}

/*
 * Set up the mm of the new task @tsk according to @clone_flags.
 *
 * Resets @tsk's fault and context-switch counters, then either leaves
 * @tsk without an mm (current has none), shares current's mm (CLONE_VM)
 * or duplicates it with dup_mm().
 *
 * Returns 0 on success, -ENOMEM if the mm could not be duplicated.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
        struct mm_struct *mm, *oldmm;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
        tsk->last_switch_time = 0;
#endif

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal an active VM for that..
         */
        oldmm = current->mm;
        if (!oldmm)
                return 0;

        if (clone_flags & CLONE_VM) {
                /* Share the parent's mm: just take another user reference. */
                mmget(oldmm);
                mm = oldmm;
        } else {
                mm = dup_mm(tsk, current->mm);
                if (!mm)
                        return -ENOMEM;
        }

        tsk->mm = mm;
        tsk->active_mm = mm;
        sched_mm_cid_fork(tsk);
        return 0;
}

/*
 * Give the new task @tsk its fs_struct.
 *
 * With CLONE_FS the parent's fs_struct is shared and its user count is
 * bumped, unless an exec is in progress on it (-EAGAIN).  Otherwise @tsk
 * gets a private copy (-ENOMEM if that fails).  Returns 0 on success.
 */
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
        struct fs_struct *fs = current->fs;
        int ret = 0;

        if (!(clone_flags & CLONE_FS)) {
                tsk->fs = copy_fs_struct(fs);
                if (!tsk->fs)
                        return -ENOMEM;
                return 0;
        }

        /* tsk->fs is already what we want */
        spin_lock(&fs->lock);
        /* "users" and "in_exec" locked for check_unsafe_exec() */
        if (fs->in_exec)
                ret = -EAGAIN;
        else
                fs->users++;
        spin_unlock(&fs->lock);

        return ret;
}

/*
 * Give the new task @tsk its open-file table.
 *
 * Nothing is done when current has no table.  With @no_files set the
 * child gets none; with CLONE_FILES the parent's table is shared and its
 * count bumped; otherwise the table is duplicated with dup_fd().
 *
 * Returns 0 on success or the error reported by dup_fd().
 */
static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
                      int no_files)
{
        struct files_struct *oldf = current->files;
        struct files_struct *newf;

        /* A background process may not have any files ... */
        if (!oldf)
                return 0;

        if (no_files) {
                tsk->files = NULL;
                return 0;
        }

        /* Shared table: tsk->files already points at it, take a reference. */
        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                return 0;
        }

        newf = dup_fd(oldf, NULL);
        if (IS_ERR(newf))
                return PTR_ERR(newf);

        tsk->files = newf;
        return 0;
}

/*
 * Give the new task @tsk its signal-handler table.
 *
 * With CLONE_SIGHAND the parent's table is shared (its count is bumped).
 * Otherwise a new table is allocated and filled with a copy of the
 * parent's actions; CLONE_CLEAR_SIGHAND additionally resets the copied
 * handlers.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
        struct sighand_struct *sig;

        if (clone_flags & CLONE_SIGHAND) {
                refcount_inc(&current->sighand->count);
                return 0;
        }
        sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        /*
         * Stored before the NULL check, so on failure tsk->sighand is NULL
         * rather than the parent's pointer.
         */
        RCU_INIT_POINTER(tsk->sighand, sig);
        if (!sig)
                return -ENOMEM;

        refcount_set(&sig->count, 1);
        /* Copy the parent's actions under the parent's siglock. */
        spin_lock_irq(&current->sighand->siglock);
        memcpy(sig->action, current->sighand->action, sizeof(sig->action));
        spin_unlock_irq(&current->sighand->siglock);

        /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
        if (clone_flags & CLONE_CLEAR_SIGHAND)
                flush_signal_handlers(tsk, 0);

        return 0;
}

/*
 * Drop one reference on @sighand; the last reference cleans up signalfd
 * state and frees the structure.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
        if (!refcount_dec_and_test(&sighand->count))
                return;

        signalfd_cleanup(sighand);
        /*
         * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
         * without an RCU grace period, see __lock_task_sighand().
         */
        kmem_cache_free(sighand_cachep, sighand);
}

/*
 * Initialize POSIX CPU timer handling for a thread group, using the
 * group's current RLIMIT_CPU soft limit.
 */
static void posix_cpu_timers_init_group(struct signal_struct *sig)
{
        unsigned long cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);

        posix_cputimers_group_init(&sig->posix_cputimers, cpu_limit);
}

/*
 * Set up the signal_struct of the new task @tsk.
 *
 * With CLONE_THREAD nothing is done here and 0 is returned.  Otherwise a
 * zeroed signal_struct is allocated for a new single-thread group with
 * @tsk as its only member, and the resource limits and OOM score
 * adjustments are inherited from current.
 *
 * Returns 0 on success, -ENOMEM if the allocation failed.
 */
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
        struct signal_struct *sig;

        if (clone_flags & CLONE_THREAD)
                return 0;

        sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
        /* Assigned before the NULL check: on failure tsk->signal is NULL. */
        tsk->signal = sig;
        if (!sig)
                return -ENOMEM;

        sig->nr_threads = 1;
        sig->quick_threads = 1;
        atomic_set(&sig->live, 1);
        refcount_set(&sig->sigcnt, 1);

        /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */
        sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node);
        tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);

        init_waitqueue_head(&sig->wait_chldexit);
        sig->curr_target = tsk;
        init_sigpending(&sig->shared_pending);
        INIT_HLIST_HEAD(&sig->multiprocess);
        seqlock_init(&sig->stats_lock);
        prev_cputime_init(&sig->prev_cputime);

#ifdef CONFIG_POSIX_TIMERS
        INIT_HLIST_HEAD(&sig->posix_timers);
        INIT_HLIST_HEAD(&sig->ignored_posix_timers);
        hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
#endif

        /* The rlimits are copied under the group leader's task_lock(). */
        task_lock(current->group_leader);
        memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
        task_unlock(current->group_leader);

        /* Uses the RLIMIT_CPU value that was just copied above. */
        posix_cpu_timers_init_group(sig);

        tty_audit_fork(sig);
        sched_autogroup_fork(sig);

        sig->oom_score_adj = current->signal->oom_score_adj;
        sig->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_init(&sig->cred_guard_mutex);
        init_rwsem(&sig->exec_update_lock);

        return 0;
}

/*
 * Give the new task @p the current task's seccomp state, taking a
 * reference on the filter, and bring no_new_privs and the seccomp syscall
 * work flag in line with it.  A no-op without CONFIG_SECCOMP.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
        /*
         * Must be called with sighand->lock held, which is common to
         * all threads in the group. Holding cred_guard_mutex is not
         * needed because this new task is not yet running and cannot
         * be racing exec.
         */
        assert_spin_locked(&current->sighand->siglock);

        /* Ref-count the new filter user, and assign it. */
        get_seccomp_filter(current);
        p->seccomp = current->seccomp;

        /*
         * Explicitly enable no_new_privs here in case it got set
         * between the task_struct being duplicated and holding the
         * sighand lock. The seccomp state and nnp must be in sync.
         */
        if (task_no_new_privs(current))
                task_set_no_new_privs(p);

        /*
         * If the parent gained a seccomp mode after copying thread
         * flags and between before we held the sighand lock, we have
         * to manually enable the seccomp thread flag here.
         */
        if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
                set_task_syscall_work(p, SECCOMP);
#endif
}

/*
 * set_tid_address system call: record @tidptr as the caller's
 * clear_child_tid pointer (the location mm_release() may zero and wake)
 * and return the caller's id as reported by task_pid_vnr().
 */
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
        current->clear_child_tid = tidptr;

        return task_pid_vnr(current);
}

/*
 * Initialize @p's pi_lock and, with CONFIG_RT_MUTEXES, start it with an
 * empty pi_waiters tree, no pi_top_task and no lock it is blocked on.
 */
static void rt_mutex_init_task(struct task_struct *p)
{
        raw_spin_lock_init(&p->pi_lock);
#ifdef CONFIG_RT_MUTEXES
        p->pi_waiters = RB_ROOT_CACHED;
        p->pi_top_task = NULL;
        p->pi_blocked_on = NULL;
#endif
}

static inline void init_task_pid_links(struct task_struct *task)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_NODE(&task->pid_links[type]);
}

/*
 * Record @pid as @task's pid of the given @type.  PIDTYPE_PID is stored
 * in the task itself; every other type is stored in its signal_struct.
 */
static inline void
init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
{
        if (type != PIDTYPE_PID) {
                task->signal->pids[type] = pid;
                return;
        }

        task->thread_pid = pid;
}

/*
 * Reset the per-task RCU bookkeeping of the new task @p, for each RCU
 * variant that is configured in, so that none of it carries over from
 * the task it was copied from.
 */
static inline void rcu_copy_process(struct task_struct *p)
{
#ifdef CONFIG_PREEMPT_RCU
        p->rcu_read_lock_nesting = 0;
        p->rcu_read_unlock_special.s = 0;
        p->rcu_blocked_node = NULL;
        INIT_LIST_HEAD(&p->rcu_node_entry);
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
        p->rcu_tasks_holdout = false;
        INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
        p->rcu_tasks_idle_cpu = -1;
        INIT_LIST_HEAD(&p->rcu_tasks_exit_list);
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifdef CONFIG_TASKS_TRACE_RCU
        p->trc_reader_nesting = 0;
        p->trc_reader_special.s = 0;
        INIT_LIST_HEAD(&p->trc_holdout_list);
        INIT_LIST_HEAD(&p->trc_blkd_node);
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}

/**
 * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the file for the pidfd.
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper doesn't perform checks on @pid which makes it useful for pidfds
 * created via CLONE_PIDFD where @pid has no task attached when the pidfd and
 * pidfd file are prepared.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged.
 */
static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        struct file *pidfd_file;

        /* The descriptor is always reserved with O_CLOEXEC. */
        CLASS(get_unused_fd, pidfd)(O_CLOEXEC);
        if (pidfd < 0)
                return pidfd;

        /*
         * NOTE(review): there is no explicit put_unused_fd() on the error
         * return below; presumably the CLASS() guard releases the reserved
         * descriptor when it goes out of scope -- confirm.
         */
        pidfd_file = pidfs_alloc_file(pid, flags | O_RDWR);
        if (IS_ERR(pidfd_file))
                return PTR_ERR(pidfd_file);

        *ret = pidfd_file;
        return take_fd(pidfd);
}

/**
 * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd
 * @pid:   the struct pid for which to create a pidfd
 * @flags: flags of the new @pidfd
 * @ret: Where to return the file for the pidfd.
 *
 * Allocate a new file that stashes @pid and reserve a new pidfd number in the
 * caller's file descriptor table. The pidfd is reserved but not installed yet.
 *
 * The helper verifies that @pid is still in use, without PIDFD_THREAD the
 * task identified by @pid must be a thread-group leader.
 *
 * If this function returns successfully the caller is responsible to either
 * call fd_install() passing the returned pidfd and pidfd file as arguments in
 * order to install the pidfd into its file descriptor table or they must use
 * put_unused_fd() and fput() on the returned pidfd and pidfd file
 * respectively.
 *
 * This function is useful when a pidfd must already be reserved but there
 * might still be points of failure afterwards and the caller wants to ensure
 * that no pidfd is leaked into its file descriptor table.
 *
 * Return: On success, a reserved pidfd is returned from the function and a new
 *         pidfd file is returned in the last argument to the function. On
 *         error, a negative error code is returned from the function and the
 *         last argument remains unchanged. -EINVAL is returned when @pid is
 *         NULL or has no task of the required type attached.
 */
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret)
{
        bool thread = flags & PIDFD_THREAD;

        /* Thread pidfds check PIDTYPE_PID, all others check PIDTYPE_TGID. */
        if (!pid || !pid_has_task(pid, thread ? PIDTYPE_PID : PIDTYPE_TGID))
                return -EINVAL;

        return __pidfd_prepare(pid, flags, ret);
}

/* RCU callback: free the task_struct that embeds @rhp as its ->rcu member. */
static void __delayed_free_task(struct rcu_head *rhp)
{
        free_task(container_of(rhp, struct task_struct, rcu));
}

/*
 * Free @tsk: immediately when CONFIG_MEMCG is disabled, otherwise deferred
 * through call_rcu().
 */
static __always_inline void delayed_free_task(struct task_struct *tsk)
{
        if (!IS_ENABLED(CONFIG_MEMCG)) {
                free_task(tsk);
                return;
        }

        call_rcu(&tsk->rcu, __delayed_free_task);
}

/*
 * Re-copy the parent's oom_score_adj values into @tsk when the child shares
 * the VM without being a thread or a vfork child, and flag the mm as
 * MMF_MULTIPROCESS.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
        const u64 relevant = CLONE_VM | CLONE_THREAD | CLONE_VFORK;

        /*
         * Nothing to do for kernel threads (no mm), nor when spawning a
         * thread or using vfork: only a plain CLONE_VM child qualifies.
         */
        if (!tsk->mm || (clone_flags & relevant) != CLONE_VM)
                return;

        /* Serialize against __set_oom_adj */
        mutex_lock(&oom_adj_mutex);

        set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);

        /* The values may have changed since copy_signal ran; refresh them */
        tsk->signal->oom_score_adj = current->signal->oom_score_adj;
        tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;

        mutex_unlock(&oom_adj_mutex);
}

#ifdef CONFIG_RV
/* Mark every per-task RV monitor slot of @p as not monitoring. */
static void rv_task_fork(struct task_struct *p)
{
        int slot = 0;

        while (slot < RV_PER_TASK_MONITORS) {
                p->rv[slot].da_mon.monitoring = false;
                slot++;
        }
}
#else
/* Without CONFIG_RV there is nothing to reset. */
#define rv_task_fork(p) do {} while (0)
#endif

/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 *
 * @pid:   pass &init_struct_pid to reuse that pid (as fork_idle() does);
 *         any other value is replaced by a pid freshly allocated with
 *         alloc_pid().
 * @trace: non-zero requests ptrace initialisation of the child; it is
 *         ORed with CLONE_PTRACE when calling ptrace_init_task().
 * @node:  forwarded to dup_task_struct().
 * @args:  clone arguments; args->flags holds the CLONE_* flags.
 *
 * Returns the new task on success, or an ERR_PTR() encoded negative error
 * code on failure. When CLONE_PIDFD is set, the pidfd number is written to
 * args->pidfd and the pidfd is installed once the task has been linked in.
 */
__latent_entropy struct task_struct *copy_process(
                                        struct pid *pid,
                                        int trace,
                                        int node,
                                        struct kernel_clone_args *args)
{
        int pidfd = -1, retval;
        struct task_struct *p;
        struct multiprocess_signals delayed;
        struct file *pidfile = NULL;
        const u64 clone_flags = args->flags;
        struct nsproxy *nsp = current->nsproxy;

        /*
         * Don't allow sharing the root directory with processes in a different
         * namespace
         */
        if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
                return ERR_PTR(-EINVAL);

        if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
                return ERR_PTR(-EINVAL);

        /*
         * Thread groups must share signals as well, and detached threads
         * can only be started up within the thread group.
         */
        if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
                return ERR_PTR(-EINVAL);

        /*
         * Shared signal handlers imply shared VM. By way of the above,
         * thread groups also imply shared VM. Blocking this case allows
         * for various simplifications in other code.
         */
        if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                return ERR_PTR(-EINVAL);

        /*
         * Siblings of global init remain as zombies on exit since they are
         * not reaped by their parent (swapper). To solve this and to avoid
         * multi-rooted process trees, prevent global and container-inits
         * from creating siblings.
         */
        if ((clone_flags & CLONE_PARENT) &&
                                current->signal->flags & SIGNAL_UNKILLABLE)
                return ERR_PTR(-EINVAL);

        /*
         * If the new process will be in a different pid or user namespace
         * do not allow it to share a thread group with the forking task.
         */
        if (clone_flags & CLONE_THREAD) {
                if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
                    (task_active_pid_ns(current) != nsp->pid_ns_for_children))
                        return ERR_PTR(-EINVAL);
        }

        if (clone_flags & CLONE_PIDFD) {
                /*
                 * - CLONE_DETACHED is blocked so that we can potentially
                 *   reuse it later for CLONE_PIDFD.
                 */
                if (clone_flags & CLONE_DETACHED)
                        return ERR_PTR(-EINVAL);
        }

        /*
         * Force any signals received before this point to be delivered
         * before the fork happens.  Collect up signals sent to multiple
         * processes that happen during the fork and delay them so that
         * they appear to happen after the fork.
         */
        sigemptyset(&delayed.signal);
        INIT_HLIST_NODE(&delayed.node);

        spin_lock_irq(&current->sighand->siglock);
        if (!(clone_flags & CLONE_THREAD))
                hlist_add_head(&delayed.node, &current->signal->multiprocess);
        recalc_sigpending();
        spin_unlock_irq(&current->sighand->siglock);
        retval = -ERESTARTNOINTR;
        if (task_sigpending(current))
                goto fork_out;

        retval = -ENOMEM;
        p = dup_task_struct(current, node);
        if (!p)
                goto fork_out;
        p->flags &= ~PF_KTHREAD;
        if (args->kthread)
                p->flags |= PF_KTHREAD;
        if (args->user_worker) {
                /*
                 * Mark us a user worker, and block any signal that isn't
                 * fatal or STOP
                 */
                p->flags |= PF_USER_WORKER;
                siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
        }
        if (args->io_thread)
                p->flags |= PF_IO_WORKER;

        if (args->name)
                strscpy_pad(p->comm, args->name, sizeof(p->comm));

        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
        /*
         * Clear TID on mm_release()?
         */
        p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;

        ftrace_graph_init_task(p);

        rt_mutex_init_task(p);

        lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
        retval = copy_creds(p, clone_flags);
        if (retval < 0)
                goto bad_fork_free;

        retval = -EAGAIN;
        if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_cleanup_count;
        }
        current->flags &= ~PF_NPROC_EXCEEDED;

        /*
         * If multiple threads are within copy_process(), then this check
         * triggers too late. This doesn't hurt, the check is only there
         * to stop root fork bombs.
         */
        retval = -EAGAIN;
        if (data_race(nr_threads >= max_threads))
                goto bad_fork_cleanup_count;

        delayacct_tsk_init(p);        /* Must remain after dup_task_struct() */
        p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY);
        p->flags |= PF_FORKNOEXEC;
        INIT_LIST_HEAD(&p->children);
        INIT_LIST_HEAD(&p->sibling);
        rcu_copy_process(p);
        p->vfork_done = NULL;
        spin_lock_init(&p->alloc_lock);

        init_sigpending(&p->pending);

        p->utime = p->stime = p->gtime = 0;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        p->utimescaled = p->stimescaled = 0;
#endif
        prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        seqcount_init(&p->vtime.seqcount);
        p->vtime.starttime = 0;
        p->vtime.state = VTIME_INACTIVE;
#endif

#ifdef CONFIG_IO_URING
        p->io_uring = NULL;
#endif

        p->default_timer_slack_ns = current->timer_slack_ns;

#ifdef CONFIG_PSI
        p->psi_flags = 0;
#endif

        task_io_accounting_init(&p->ioac);
        acct_clear_integrals(p);

        posix_cputimers_init(&p->posix_cputimers);
        tick_dep_init_task(p);

        p->io_context = NULL;
        audit_set_context(p, NULL);
        cgroup_fork(p);
        /*
         * NOTE(review): retval is not reassigned here, so a failure of
         * set_kthread_struct() is reported with the -EAGAIN left over from
         * the nr_threads check above.
         */
        if (args->kthread) {
                if (!set_kthread_struct(p))
                        goto bad_fork_cleanup_delayacct;
        }
#ifdef CONFIG_NUMA
        p->mempolicy = mpol_dup(p->mempolicy);
        if (IS_ERR(p->mempolicy)) {
                retval = PTR_ERR(p->mempolicy);
                p->mempolicy = NULL;
                goto bad_fork_cleanup_delayacct;
        }
#endif
#ifdef CONFIG_CPUSETS
        p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
        seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
        memset(&p->irqtrace, 0, sizeof(p->irqtrace));
        p->irqtrace.hardirq_disable_ip        = _THIS_IP_;
        p->irqtrace.softirq_enable_ip        = _THIS_IP_;
        p->softirqs_enabled                = 1;
        p->softirq_context                = 0;
#endif

        p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
        lockdep_init_task(p);
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
        p->sequential_io        = 0;
        p->sequential_io_avg        = 0;
#endif
#ifdef CONFIG_BPF_SYSCALL
        RCU_INIT_POINTER(p->bpf_storage, NULL);
        p->bpf_ctx = NULL;
#endif

        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_policy;

        retval = perf_event_init_task(p, clone_flags);
        if (retval)
                goto bad_fork_sched_cancel_fork;
        retval = audit_alloc(p);
        if (retval)
                goto bad_fork_cleanup_perf;
        /* copy all the process information */
        shm_init_task(p);
        retval = security_task_alloc(p, clone_flags);
        if (retval)
                goto bad_fork_cleanup_audit;
        retval = copy_semundo(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_security;
        retval = copy_files(clone_flags, p, args->no_files);
        if (retval)
                goto bad_fork_cleanup_semundo;
        retval = copy_fs(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_files;
        retval = copy_sighand(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_fs;
        retval = copy_signal(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_sighand;
        retval = copy_mm(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_signal;
        retval = copy_namespaces(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_mm;
        retval = copy_io(clone_flags, p);
        if (retval)
                goto bad_fork_cleanup_namespaces;
        retval = copy_thread(p, args);
        if (retval)
                goto bad_fork_cleanup_io;

        stackleak_task_init(p);

        /*
         * Callers that passed &init_struct_pid keep it; everyone else gets
         * a new pid allocated in the child's pid namespace for children.
         */
        if (pid != &init_struct_pid) {
                pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
                                args->set_tid_size);
                if (IS_ERR(pid)) {
                        retval = PTR_ERR(pid);
                        goto bad_fork_cleanup_thread;
                }
        }

        /*
         * This has to happen after we've potentially unshared the file
         * descriptor table (so that the pidfd doesn't leak into the child
         * if the fd table isn't shared).
         */
        if (clone_flags & CLONE_PIDFD) {
                int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;

                /*
                 * Note that no task has been attached to @pid yet; indicate
                 * that via PIDFD_CLONE.
                 */
                retval = __pidfd_prepare(pid, flags | PIDFD_CLONE, &pidfile);
                if (retval < 0)
                        goto bad_fork_free_pid;
                pidfd = retval;

                /* Report the reserved fd number; it is installed further down. */
                retval = put_user(pidfd, args->pidfd);
                if (retval)
                        goto bad_fork_put_pidfd;
        }

#ifdef CONFIG_BLOCK
        p->plug = NULL;
#endif
        futex_init_task(p);

        /*
         * sigaltstack should be cleared when sharing the same VM
         */
        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
                sas_ss_reset(p);

        /*
         * Syscall tracing and stepping should be turned off in the
         * child regardless of CLONE_PTRACE.
         */
        user_disable_single_step(p);
        clear_task_syscall_work(p, SYSCALL_TRACE);
#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
        clear_task_syscall_work(p, SYSCALL_EMU);
#endif
        clear_tsk_latency_tracing(p);

        /* ok, now we should be set up.. */
        p->pid = pid_nr(pid);
        if (clone_flags & CLONE_THREAD) {
                p->group_leader = current->group_leader;
                p->tgid = current->tgid;
        } else {
                p->group_leader = p;
                p->tgid = p->pid;
        }

        p->nr_dirtied = 0;
        p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
        p->dirty_paused_when = 0;

        p->pdeath_signal = 0;
        p->task_works = NULL;
        clear_posix_cputimers_work(p);

#ifdef CONFIG_KRETPROBES
        p->kretprobe_instances.first = NULL;
#endif
#ifdef CONFIG_RETHOOK
        p->rethooks.first = NULL;
#endif

        /*
         * Ensure that the cgroup subsystem policies allow the new process to be
         * forked. It should be noted that the new process's css_set can be changed
         * between here and cgroup_post_fork() if an organisation operation is in
         * progress.
         */
        retval = cgroup_can_fork(p, args);
        if (retval)
                goto bad_fork_put_pidfd;

        /*
         * Now that the cgroups are pinned, re-clone the parent cgroup and put
         * the new task on the correct runqueue. All this *before* the task
         * becomes visible.
         *
         * This isn't part of ->can_fork() because while the re-cloning is
         * cgroup specific, it unconditionally needs to place the task on a
         * runqueue.
         */
        retval = sched_cgroup_fork(p, args);
        if (retval)
                goto bad_fork_cancel_cgroup;

        /*
         * From this point on we must avoid any synchronous user-space
         * communication until we take the tasklist-lock. In particular, we do
         * not want user-space to be able to predict the process start-time by
         * stalling fork(2) after we recorded the start_time but before it is
         * visible to the system.
         */

        p->start_time = ktime_get_ns();
        p->start_boottime = ktime_get_boottime_ns();

        /*
         * Make it visible to the rest of the system, but don't wake it up yet.
         * Need tasklist lock for parent etc handling!
         */
        write_lock_irq(&tasklist_lock);

        /* CLONE_PARENT re-uses the old parent */
        if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
                p->real_parent = current->real_parent;
                p->parent_exec_id = current->parent_exec_id;
                if (clone_flags & CLONE_THREAD)
                        p->exit_signal = -1;
                else
                        p->exit_signal = current->group_leader->exit_signal;
        } else {
                p->real_parent = current;
                p->parent_exec_id = current->self_exec_id;
                p->exit_signal = args->exit_signal;
        }

        klp_copy_process(p);

        sched_core_fork(p);

        spin_lock(&current->sighand->siglock);

        rv_task_fork(p);

        rseq_fork(p, clone_flags);

        /* Don't start children in a dying pid namespace */
        if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
                retval = -ENOMEM;
                goto bad_fork_core_free;
        }

        /* Let kill terminate clone/fork in the middle */
        if (fatal_signal_pending(current)) {
                retval = -EINTR;
                goto bad_fork_core_free;
        }

        /* No more failure paths after this point. */

        /*
         * Copy seccomp details explicitly here, in case they were changed
         * before holding sighand lock.
         */
        copy_seccomp(p);

        init_task_pid_links(p);
        if (likely(p->pid)) {
                ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

                init_task_pid(p, PIDTYPE_PID, pid);
                if (thread_group_leader(p)) {
                        init_task_pid(p, PIDTYPE_TGID, pid);
                        init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
                        init_task_pid(p, PIDTYPE_SID, task_session(current));

                        if (is_child_reaper(pid)) {
                                ns_of_pid(pid)->child_reaper = p;
                                p->signal->flags |= SIGNAL_UNKILLABLE;
                        }
                        /* Hand over the signals collected in @delayed during the fork. */
                        p->signal->shared_pending.signal = delayed.signal;
                        p->signal->tty = tty_kref_get(current->signal->tty);
                        /*
                         * Inherit has_child_subreaper flag under the same
                         * tasklist_lock with adding child to the process tree
                         * for propagate_has_child_subreaper optimization.
                         */
                        p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper ||
                                                         p->real_parent->signal->is_child_subreaper;
                        list_add_tail(&p->sibling, &p->real_parent->children);
                        list_add_tail_rcu(&p->tasks, &init_task.tasks);
                        attach_pid(p, PIDTYPE_TGID);
                        attach_pid(p, PIDTYPE_PGID);
                        attach_pid(p, PIDTYPE_SID);
                        __this_cpu_inc(process_counts);
                } else {
                        current->signal->nr_threads++;
                        current->signal->quick_threads++;
                        atomic_inc(&current->signal->live);
                        refcount_inc(&current->signal->sigcnt);
                        task_join_group_stop(p);
                        list_add_tail_rcu(&p->thread_node,
                                          &p->signal->thread_head);
                }
                attach_pid(p, PIDTYPE_PID);
                nr_threads++;
        }
        total_forks++;
        hlist_del_init(&delayed.node);
        spin_unlock(&current->sighand->siglock);
        syscall_tracepoint_update(p);
        write_unlock_irq(&tasklist_lock);

        /* pidfile is only non-NULL when CLONE_PIDFD reserved a pidfd above. */
        if (pidfile)
                fd_install(pidfd, pidfile);

        proc_fork_connector(p);
        sched_post_fork(p);
        cgroup_post_fork(p, args);
        perf_event_fork(p);

        trace_task_newtask(p, clone_flags);
        uprobe_copy_process(p, clone_flags);
        user_events_fork(p, clone_flags);

        copy_oom_score_adj(clone_flags, p);

        return p;

        /*
         * Error unwinding: every label below undoes one setup step and then
         * falls through into the labels that follow it.
         */
bad_fork_core_free:
        sched_core_free(p);
        spin_unlock(&current->sighand->siglock);
        write_unlock_irq(&tasklist_lock);
bad_fork_cancel_cgroup:
        cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
        if (clone_flags & CLONE_PIDFD) {
                fput(pidfile);
                put_unused_fd(pidfd);
        }
bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
bad_fork_cleanup_thread:
        exit_thread(p);
bad_fork_cleanup_io:
        if (p->io_context)
                exit_io_context(p);
bad_fork_cleanup_namespaces:
        exit_task_namespaces(p);
bad_fork_cleanup_mm:
        if (p->mm) {
                mm_clear_owner(p->mm, p);
                mmput(p->mm);
        }
bad_fork_cleanup_signal:
        if (!(clone_flags & CLONE_THREAD))
                free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
        __cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
        exit_sem(p);
bad_fork_cleanup_security:
        security_task_free(p);
bad_fork_cleanup_audit:
        audit_free(p);
bad_fork_cleanup_perf:
        perf_event_free_task(p);
bad_fork_sched_cancel_fork:
        sched_cancel_fork(p);
bad_fork_cleanup_policy:
        lockdep_free_task(p);
#ifdef CONFIG_NUMA
        mpol_put(p->mempolicy);
#endif
bad_fork_cleanup_delayacct:
        delayacct_tsk_free(p);
bad_fork_cleanup_count:
        dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
        exit_task_stack_account(p);
        put_task_stack(p);
        delayed_free_task(p);
        /* Also reached directly by the failures that happen before @p exists. */
fork_out:
        spin_lock_irq(&current->sighand->siglock);
        hlist_del_init(&delayed.node);
        spin_unlock_irq(&current->sighand->siglock);
        return ERR_PTR(retval);
}

static inline void init_idle_pids(struct task_struct *idle)
{
        enum pid_type type;

        for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
                INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */
                init_task_pid(idle, type, &init_struct_pid);
        }
}

/* Placeholder thread function for idle tasks; it is never actually called. */
static int idle_dummy(void *dummy)
{
        (void)dummy;
        return 0;
}

/*
 * Create the idle task for @cpu: a CLONE_VM kernel thread that reuses
 * init_struct_pid. Returns the task, or the ERR_PTR() from copy_process().
 */
struct task_struct * __init fork_idle(int cpu)
{
        struct kernel_clone_args idle_args = {
                .flags          = CLONE_VM,
                .fn             = &idle_dummy,
                .fn_arg         = NULL,
                .kthread        = 1,
                .idle           = 1,
        };
        struct task_struct *idle;

        idle = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &idle_args);
        if (IS_ERR(idle))
                return idle;

        init_idle_pids(idle);
        init_idle(idle, cpu);

        return idle;
}

/*
 * This is like kernel_clone(), but shaved down and tailored to just
 * creating io_uring workers. It returns a created task, or an error pointer.
 * The returned task is inactive, and the caller must fire it up through
 * wake_up_new_task(p). All signals are blocked in the created task.
 */
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{
        unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
                                CLONE_IO;
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
                .io_thread        = 1,
                .user_worker        = 1,
        };

        return copy_process(NULL, 0, node, &args);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 *
 * args->exit_signal is expected to be checked for sanity by the caller.
 */
pid_t kernel_clone(struct kernel_clone_args *args)
{
        u64 clone_flags = args->flags;
        struct completion vfork;
        struct pid *pid;
        struct task_struct *p;
        int trace = 0;
        pid_t nr;

        /*
         * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
         * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
         * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
         * field in struct clone_args and it still doesn't make sense to have
         * them both point at the same memory location. Performing this check
         * here has the advantage that we don't need to have a separate helper
         * to check for legacy clone().
         */
        if ((clone_flags & CLONE_PIDFD) &&
            (clone_flags & CLONE_PARENT_SETTID) &&
            (args->pidfd == args->parent_tid))
                return -EINVAL;

        /*
         * Determine whether and which event to report to ptracer.  When
         * called from kernel_thread or CLONE_UNTRACED is explicitly
         * requested, no event is reported; otherwise, report if the event
         * for the type of forking is enabled.
         */
        if (!(clone_flags & CLONE_UNTRACED)) {
                if (clone_flags & CLONE_VFORK)
                        trace = PTRACE_EVENT_VFORK;
                else if (args->exit_signal != SIGCHLD)
                        trace = PTRACE_EVENT_CLONE;
                else
                        trace = PTRACE_EVENT_FORK;

                if (likely(!ptrace_event_enabled(current, trace)))
                        trace = 0;
        }

        p = copy_process(NULL, trace, NUMA_NO_NODE, args);
        add_latent_entropy();

        if (IS_ERR(p))
                return PTR_ERR(p);

        /*
         * Do this prior waking up the new thread - the thread pointer
         * might get invalid after that point, if the thread exits quickly.
         */
        trace_sched_process_fork(current, p);

        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        if (clone_flags & CLONE_PARENT_SETTID)
                put_user(nr, args->parent_tid);

        if (clone_flags & CLONE_VFORK) {
                p->vfork_done = &vfork;
                init_completion(&vfork);
                get_task_struct(p);
        }

        if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) {
                /* lock the task to synchronize with memcg migration */
                task_lock(p);
                lru_gen_add_mm(p->mm);
                task_unlock(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
                ptrace_event_pid(trace, pid);

        if (clone_flags & CLONE_VFORK) {
                if (!wait_for_vfork_done(p, &vfork))
                        ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
        return nr;
}

/*
 * Create a kernel thread.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                    unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
                .name                = name,
                .kthread        = 1,
        };

        return kernel_clone(&args);
}

/*
 * Create a user mode thread.
 */
pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
        struct kernel_clone_args args = {
                .flags                = ((lower_32_bits(flags) | CLONE_VM |
                                    CLONE_UNTRACED) & ~CSIGNAL),
                .exit_signal        = (lower_32_bits(flags) & CSIGNAL),
                .fn                = fn,
                .fn_arg                = arg,
        };

        return kernel_clone(&args);
}

#ifdef __ARCH_WANT_SYS_FORK
/* fork(): plain kernel_clone() with SIGCHLD as the exit signal. */
SYSCALL_DEFINE0(fork)
{
#ifndef CONFIG_MMU
        /* fork() can not be supported in nommu mode */
        return -EINVAL;
#else
        struct kernel_clone_args fork_args = {
                .exit_signal = SIGCHLD,
        };

        return kernel_clone(&fork_args);
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/* vfork(): kernel_clone() with CLONE_VFORK | CLONE_VM and SIGCHLD. */
SYSCALL_DEFINE0(vfork)
{
        struct kernel_clone_args vfork_args = {
                .exit_signal    = SIGCHLD,
                .flags          = CLONE_VFORK | CLONE_VM,
        };

        return kernel_clone(&vfork_args);
}
#endif

#ifdef __ARCH_WANT_SYS_CLONE
/*
 * Legacy clone(). The argument order depends on the architecture's
 * CONFIG_CLONE_BACKWARDS* selection; the body is shared.
 */
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 unsigned long, tls,
                 int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
                int, stack_size,
                int __user *, parent_tidptr,
                int __user *, child_tidptr,
                unsigned long, tls)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
                 int __user *, parent_tidptr,
                 int __user *, child_tidptr,
                 unsigned long, tls)
#endif
{
        const unsigned int low = lower_32_bits(clone_flags);
        struct kernel_clone_args clone_args = {
                /* The CSIGNAL bits carry the exit signal, not a flag. */
                .flags          = (low & ~CSIGNAL),
                .exit_signal    = (low & CSIGNAL),
                /* Legacy clone() shares one pointer for pidfd and parent tid. */
                .pidfd          = parent_tidptr,
                .parent_tid     = parent_tidptr,
                .child_tid      = child_tidptr,
                .stack          = newsp,
                .tls            = tls,
        };

        return kernel_clone(&clone_args);
}
#endif

/*
 * Copy a userspace struct clone_args of @usize bytes into @kargs.
 *
 * kargs->set_tid must point at a caller-provided buffer on entry; the
 * requested tids are copied into it and the pointer is restored after
 * @kargs has been overwritten.
 *
 * Returns 0 on success, -E2BIG if @usize exceeds PAGE_SIZE, -EINVAL for
 * inconsistent arguments, -EFAULT if the set_tid array cannot be read, or
 * the error from copy_struct_from_user().
 */
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                                              struct clone_args __user *uargs,
                                              size_t usize)
{
        pid_t *tid_buf = kargs->set_tid;
        struct clone_args uc;
        int ret;

        /* The versioned sizes must match the struct layout. */
        BUILD_BUG_ON(offsetofend(struct clone_args, tls) !=
                     CLONE_ARGS_SIZE_VER0);
        BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) !=
                     CLONE_ARGS_SIZE_VER1);
        BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) !=
                     CLONE_ARGS_SIZE_VER2);
        BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < CLONE_ARGS_SIZE_VER0))
                return -EINVAL;

        ret = copy_struct_from_user(&uc, sizeof(uc), uargs, usize);
        if (ret)
                return ret;

        if (unlikely(uc.set_tid_size > MAX_PID_NS_LEVEL))
                return -EINVAL;

        /* set_tid and set_tid_size are either both given or both zero. */
        if (unlikely(!uc.set_tid != !uc.set_tid_size))
                return -EINVAL;

        /*
         * exit_signal must fit in the CSIGNAL bits (no higher bits set)
         * and must be a valid signal.
         */
        if (unlikely((uc.exit_signal & ~((u64)CSIGNAL)) ||
                     !valid_signal(uc.exit_signal)))
                return -EINVAL;

        /* CLONE_INTO_CGROUP needs the VER2 layout and an int-sized fd. */
        if ((uc.flags & CLONE_INTO_CGROUP) &&
            (uc.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2))
                return -EINVAL;

        *kargs = (struct kernel_clone_args){
                .flags          = uc.flags,
                .pidfd          = u64_to_user_ptr(uc.pidfd),
                .child_tid      = u64_to_user_ptr(uc.child_tid),
                .parent_tid     = u64_to_user_ptr(uc.parent_tid),
                .exit_signal    = uc.exit_signal,
                .stack          = uc.stack,
                .stack_size     = uc.stack_size,
                .tls            = uc.tls,
                .set_tid_size   = uc.set_tid_size,
                .cgroup         = uc.cgroup,
        };

        if (uc.set_tid) {
                size_t nbytes = kargs->set_tid_size * sizeof(pid_t);

                if (copy_from_user(tid_buf, u64_to_user_ptr(uc.set_tid),
                                   nbytes))
                        return -EFAULT;
        }

        /* The compound-literal assignment above cleared this pointer. */
        kargs->set_tid = tid_buf;

        return 0;
}

/**
 * clone3_stack_valid - check and prepare stack
 * @kargs: kernel clone args
 *
 * Check that the stack and stack_size passed in from userspace are
 * consistent: either both are zero, or both are set and the range passes
 * access_ok(). Unless CONFIG_STACK_GROWSUP is defined, a valid stack is
 * advanced by stack_size so that userspace does not have to work out the
 * stack direction itself.
 *
 * Return: true if the stack arguments are acceptable, false otherwise.
 */
static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
{
        /* Without a stack, a size on its own is not acceptable. */
        if (kargs->stack == 0)
                return !(kargs->stack_size > 0);

        if (kargs->stack_size == 0)
                return false;

        if (!access_ok((void __user *)kargs->stack, kargs->stack_size))
                return false;

#if !defined(CONFIG_STACK_GROWSUP)
        kargs->stack += kargs->stack_size;
#endif

        return true;
}

/*
 * Validate the flags, exit signal and stack of a clone3() request.
 *
 * Returns true when the request may proceed, false when it must be
 * rejected.  The stack check may adjust kargs->stack (see
 * clone3_stack_valid()).
 */
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
        /*
         * Reject flags clone3() does not know, and additionally:
         * - keep the CLONE_DETACHED bit reusable for clone3
         * - keep the CSIGNAL bits reusable for clone3
         */
        if ((kargs->flags &
             ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) ||
            (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))))
                return false;

        /* CLONE_SIGHAND and CLONE_CLEAR_SIGHAND must not both be set. */
        if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
            (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
                return false;

        /* An exit signal is not accepted with CLONE_THREAD or CLONE_PARENT. */
        if (kargs->exit_signal &&
            (kargs->flags & (CLONE_THREAD | CLONE_PARENT)))
                return false;

        return clone3_stack_valid(kargs);
}

/**
 * sys_clone3 - create a new process with specific properties
 * @uargs: argument structure
 * @size:  size of @uargs
 *
 * clone3() is the extensible successor to clone()/clone2().
 * Its single argument is a structure whose version is implied by @size.
 * The user structure is copied in and validated before kernel_clone() is
 * called with the resulting kernel arguments.
 *
 * Return: On success, a positive PID for the child process.
 *         On error, a negative errno number.
 */
SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
{
        pid_t set_tid[MAX_PID_NS_LEVEL];
        struct kernel_clone_args kargs;
        int err;

#ifdef __ARCH_BROKEN_SYS_CLONE3
#warning clone3() entry point is missing, please fix
        return -ENOSYS;
#endif

        /* Point set_tid at the on-stack array before copying the args in. */
        kargs.set_tid = set_tid;

        err = copy_clone_args_from_user(&kargs, uargs, size);
        if (!err && !clone3_args_valid(&kargs))
                err = -EINVAL;
        if (err)
                return err;

        return kernel_clone(&kargs);
}

/*
 * walk_process_tree - visit the descendants of a process, depth first
 * @top:     task whose thread group forms the root of the walk
 * @visitor: callback invoked as visitor(child, data) for each child found
 * @data:    opaque pointer passed through to @visitor unchanged
 *
 * The whole walk runs under read_lock(&tasklist_lock).  The value returned
 * by @visitor steers the traversal:
 *   < 0   abort the walk immediately,
 *   > 0   descend below the task that was just visited,
 *   == 0  do not descend; carry on with the next sibling.
 */
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
        struct task_struct *leader, *parent, *child;
        int res;

        read_lock(&tasklist_lock);
        /* Always start from the group leader of @top. */
        leader = top = top->group_leader;
down:
        /* Walk the children lists of every thread in leader's group. */
        for_each_thread(leader, parent) {
                list_for_each_entry(child, &parent->children, sibling) {
                        res = visitor(child, data);
                        if (res) {
                                if (res < 0)
                                        goto out;
                                /* Positive result: restart the loops one level down. */
                                leader = child;
                                goto down;
                        }
up:
                        /*
                         * Re-entry point used when climbing back up: child,
                         * parent and leader were restored by the code below,
                         * so the list iteration resumes after child.
                         */
                        ;
                }
        }

        /* This level is exhausted; climb to the parent unless back at top. */
        if (leader != top) {
                child = leader;
                parent = child->real_parent;
                leader = parent->group_leader;
                goto up;
        }
out:
        read_unlock(&tasklist_lock);
}

/*
 * Fallback alignment for the mm_struct slab cache: 0 unless something
 * earlier has already defined ARCH_MIN_MMSTRUCT_ALIGN.
 */
#ifndef ARCH_MIN_MMSTRUCT_ALIGN
#define ARCH_MIN_MMSTRUCT_ALIGN 0
#endif

/*
 * Constructor handed to kmem_cache_create() for "sighand_cache": initialises
 * the siglock spinlock and the signalfd wait queue head of a sighand_struct.
 */
static void sighand_ctor(void *data)
{
        struct sighand_struct *sh = (struct sighand_struct *)data;

        spin_lock_init(&sh->siglock);
        init_waitqueue_head(&sh->signalfd_wqh);
}

/*
 * Create the "mm_struct" slab cache.
 *
 * Objects are larger than sizeof(struct mm_struct): the mm_cpumask lives at
 * the end of mm_struct and is dynamically sized based on the maximum CPU
 * number this system can have, taking hotplug into account (nr_cpu_ids);
 * mm_cid_size() bytes are added on top of that.  The usercopy window handed
 * to the cache covers exactly the saved_auxv field.
 */
void __init mm_cache_init(void)
{
        unsigned int mm_size = sizeof(struct mm_struct);

        mm_size += cpumask_size();
        mm_size += mm_cid_size();

        mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size,
                        ARCH_MIN_MMSTRUCT_ALIGN,
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        offsetof(struct mm_struct, saved_auxv),
                        sizeof_field(struct mm_struct, saved_auxv),
                        NULL);
}

/*
 * Create the slab caches for sighand, signal, files, fs and vm_area
 * structures, then run mmap_init() and nsproxy_cache_init().
 */
void __init proc_caches_init(void)
{
        /* vm_area_struct objects keep their free pointer in vm_freeptr. */
        struct kmem_cache_args vma_args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
        };

        sighand_cachep = kmem_cache_create("sighand_cache",
                                           sizeof(struct sighand_struct), 0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                           SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT,
                                           sighand_ctor);

        signal_cachep = kmem_cache_create("signal_cache",
                                          sizeof(struct signal_struct), 0,
                                          SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                          SLAB_ACCOUNT,
                                          NULL);

        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct), 0,
                                         SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                         SLAB_ACCOUNT,
                                         NULL);

        fs_cachep = kmem_cache_create("fs_cache",
                                      sizeof(struct fs_struct), 0,
                                      SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                      SLAB_ACCOUNT,
                                      NULL);

        vm_area_cachep = kmem_cache_create("vm_area_struct",
                                           sizeof(struct vm_area_struct),
                                           &vma_args,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                           SLAB_TYPESAFE_BY_RCU | SLAB_ACCOUNT);

        mmap_init();
        nsproxy_cache_init();
}

/*
 * Validate the flags passed to the unshare system call.
 *
 * Returns 0 when the request can be honoured and -EINVAL when it contains
 * an unknown flag or asks to unshare something that is actually shared but
 * cannot be split.
 */
static int check_unshare_flags(unsigned long unshare_flags)
{
        /* Only this set of flags is understood. */
        if (unshare_flags & ~(CLONE_THREAD | CLONE_FS | CLONE_NEWNS |
                              CLONE_SIGHAND | CLONE_VM | CLONE_FILES |
                              CLONE_SYSVSEM | CLONE_NEWUTS | CLONE_NEWIPC |
                              CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID |
                              CLONE_NEWCGROUP | CLONE_NEWTIME))
                return -EINVAL;

        /*
         * The checks below cover things that are not implemented; the call
         * pretends to work as long as there is nothing to unshare.  Note
         * that unsharing the address space or the signal handlers also
         * needs to unshare the signal queues (aka CLONE_THREAD).
         */
        if ((unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) &&
            !thread_group_empty(current))
                return -EINVAL;

        if ((unshare_flags & (CLONE_SIGHAND | CLONE_VM)) &&
            refcount_read(&current->sighand->count) > 1)
                return -EINVAL;

        if ((unshare_flags & CLONE_VM) && !current_is_single_threaded())
                return -EINVAL;

        return 0;
}

/*
 * Produce a private copy of the filesystem structure if it is being shared.
 *
 * Nothing is done (and 0 is returned) when CLONE_FS is not requested, when
 * the current task has no fs_struct, or when the fs_struct has one user.
 * Otherwise *new_fsp receives the result of copy_fs_struct() and -ENOMEM is
 * returned if that result is NULL.
 */
static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
        struct fs_struct *cur_fs = current->fs;
        struct fs_struct *copy;

        if (!cur_fs || !(unshare_flags & CLONE_FS))
                return 0;

        /* don't need lock here; in the worst case we'll do useless copy */
        if (cur_fs->users == 1)
                return 0;

        copy = copy_fs_struct(cur_fs);
        *new_fsp = copy;

        return copy ? 0 : -ENOMEM;
}

/*
 * Produce a private copy of the file descriptor table if it is being shared.
 *
 * Only acts when CLONE_FILES is requested and the current files_struct
 * exists with a count above one.  On success *new_fdp receives the
 * duplicate; otherwise it is left untouched.  Returns 0, or the error
 * encoded in the pointer returned by dup_fd().
 */
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
{
        struct files_struct *cur = current->files;
        struct files_struct *dup;

        if (!(unshare_flags & CLONE_FILES))
                return 0;

        if (!cur || atomic_read(&cur->count) <= 1)
                return 0;

        dup = dup_fd(cur, NULL);
        if (IS_ERR(dup))
                return PTR_ERR(dup);

        *new_fdp = dup;
        return 0;
}

/*
 * unshare allows a process to 'unshare' part of the process
 * context which was originally shared using clone.  copy_*
 * functions used by kernel_clone() cannot be used here directly
 * because they modify an inactive task_struct that is being
 * constructed. Here we are modifying the current, active,
 * task_struct.
 *
 * @unshare_flags: CLONE_* flags naming what to unshare; implied flags are
 *                 added below before validation.
 *
 * The function works in two phases: first every replacement object (fs,
 * files, creds, nsproxy) is prepared, with each failure unwinding through
 * the bad_unshare_* labels; only then are the new objects installed on
 * current.
 *
 * Returns 0 on success or the negative error of the step that failed.
 */
int ksys_unshare(unsigned long unshare_flags)
{
        struct fs_struct *fs, *new_fs = NULL;
        struct files_struct *new_fd = NULL;
        struct cred *new_cred = NULL;
        struct nsproxy *new_nsproxy = NULL;
        int do_sysvsem = 0;
        int err;

        /*
         * If unsharing a user namespace must also unshare the thread group
         * and unshare the filesystem root and working directories.
         */
        if (unshare_flags & CLONE_NEWUSER)
                unshare_flags |= CLONE_THREAD | CLONE_FS;
        /*
         * If unsharing vm, must also unshare signal handlers.
         */
        if (unshare_flags & CLONE_VM)
                unshare_flags |= CLONE_SIGHAND;
        /*
         * If unsharing a signal handlers, must also unshare the signal queues.
         */
        if (unshare_flags & CLONE_SIGHAND)
                unshare_flags |= CLONE_THREAD;
        /*
         * If unsharing namespace, must also unshare filesystem information.
         */
        if (unshare_flags & CLONE_NEWNS)
                unshare_flags |= CLONE_FS;

        /* Validate the flags only after the implied ones have been added. */
        err = check_unshare_flags(unshare_flags);
        if (err)
                goto bad_unshare_out;
        /*
         * CLONE_NEWIPC must also detach from the undolist: after switching
         * to a new ipc namespace, the semaphore arrays from the old
         * namespace are unreachable.
         */
        if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
                do_sysvsem = 1;
        /* Preparation phase: each step may leave a new object to install. */
        err = unshare_fs(unshare_flags, &new_fs);
        if (err)
                goto bad_unshare_out;
        err = unshare_fd(unshare_flags, &new_fd);
        if (err)
                goto bad_unshare_cleanup_fs;
        err = unshare_userns(unshare_flags, &new_cred);
        if (err)
                goto bad_unshare_cleanup_fd;
        err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
                                         new_cred, new_fs);
        if (err)
                goto bad_unshare_cleanup_cred;

        /*
         * NOTE(review): if set_cred_ucounts() fails here, none of the cleanup
         * labels below touches new_nsproxy — confirm whether it needs to be
         * released on this path.
         */
        if (new_cred) {
                err = set_cred_ucounts(new_cred);
                if (err)
                        goto bad_unshare_cleanup_cred;
        }

        /* Commit phase: install whatever was prepared above. */
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
                         * CLONE_SYSVSEM is equivalent to sys_exit().
                         */
                        exit_sem(current);
                }
                if (unshare_flags & CLONE_NEWIPC) {
                        /* Orphan segments in old ns (see sem above). */
                        exit_shm(current);
                        shm_init_task(current);
                }

                if (new_nsproxy)
                        switch_task_namespaces(current, new_nsproxy);

                task_lock(current);

                if (new_fs) {
                        /*
                         * Swap in the new fs_struct.  Afterwards new_fs holds
                         * the old one only if its user count dropped to zero,
                         * so the cleanup below frees it; otherwise NULL.
                         */
                        fs = current->fs;
                        spin_lock(&fs->lock);
                        current->fs = new_fs;
                        if (--fs->users)
                                new_fs = NULL;
                        else
                                new_fs = fs;
                        spin_unlock(&fs->lock);
                }

                /* After the swap new_fd is the old table, dropped below. */
                if (new_fd)
                        swap(current->files, new_fd);

                task_unlock(current);

                if (new_cred) {
                        /* Install the new user namespace */
                        commit_creds(new_cred);
                        new_cred = NULL;
                }
        }

        perf_event_namespaces(current);

        /*
         * The success path falls through these labels too; by now each
         * pointer is either NULL or refers to an object that must be dropped.
         */
bad_unshare_cleanup_cred:
        if (new_cred)
                put_cred(new_cred);
bad_unshare_cleanup_fd:
        if (new_fd)
                put_files_struct(new_fd);

bad_unshare_cleanup_fs:
        if (new_fs)
                free_fs_struct(new_fs);

bad_unshare_out:
        return err;
}

/*
 * unshare() system call entry point: forwards the flags to ksys_unshare()
 * and returns its result unchanged.
 */
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
        return ksys_unshare(unshare_flags);
}

/*
 *        Helper to unshare the files of the current task.
 *        It exists so that the exec layer of the kernel does not need to
 *        know about copy_files internals.
 *
 *        Returns 0 when the table was not shared or was replaced by a
 *        private copy, or the error reported by unshare_fd().
 */

int unshare_files(void)
{
        struct task_struct *tsk = current;
        struct files_struct *fresh = NULL;
        struct files_struct *prev;
        int rc;

        rc = unshare_fd(CLONE_FILES, &fresh);
        if (rc)
                return rc;

        /* No copy was made: there is nothing to install. */
        if (!fresh)
                return 0;

        /* Install the copy under task_lock, then drop the old table. */
        prev = tsk->files;
        task_lock(tsk);
        tsk->files = fresh;
        task_unlock(tsk);
        put_files_struct(prev);

        return 0;
}

/*
 * sysctl handler for max_threads.
 *
 * Runs proc_dointvec_minmax() on a private copy of @table that points at a
 * local integer bounded to [1, MAX_THREADS].  max_threads itself is only
 * updated after a successful write.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
int sysctl_max_threads(const struct ctl_table *table, int write,
                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int lo = 1;
        int hi = MAX_THREADS;
        int val = max_threads;
        struct ctl_table tmp;
        int rc;

        tmp = *table;
        tmp.data = &val;
        tmp.extra1 = &lo;
        tmp.extra2 = &hi;

        rc = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
        if (!rc && write)
                max_threads = val;

        return rc;
}




































































































  149 














  149 














    1 









  148 












   61 






   93 
    1 














  153 





    1 






  152 































































































































































































































































































































































































   86 










   85 























   86 




   86 


   86 











   86 






































































































































































































































































































  153 







  153 






  153 















  153 















  153 





































































































































































































  152 














   86 










  153 



  152 


  153 













   84 
   86 








































































































































































































































































































  153 













  152 





  153 






















  153 














  153 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
/*
 * hugetlbpage-backed filesystem.  Based on ramfs.
 *
 * Nadia Yvette Chambers, 2002
 *
 * Copyright (C) 2002 Linus Torvalds.
 * License: GPL
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/thread_info.h>
#include <asm/current.h>
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/capability.h>
#include <linux/ctype.h>
#include <linux/backing-dev.h>
#include <linux/hugetlb.h>
#include <linux/pagevec.h>
#include <linux/fs_parser.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/dnotify.h>
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <linux/uio.h>

#include <linux/uaccess.h>
#include <linux/sched/mm.h>

#define CREATE_TRACE_POINTS
#include <trace/events/hugetlbfs.h>

/*
 * Forward declarations of the operation tables; their definitions are not
 * in this part of the file.
 */
static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
static const struct inode_operations hugetlbfs_inode_operations;

/*
 * Tag recorded alongside a size option (see max_val_type/min_val_type).
 * NOTE(review): presumably NO_SIZE = option absent, SIZE_STD = plain byte
 * size, SIZE_PERCENT = percentage — confirm against the option parser.
 */
enum hugetlbfs_size_type { NO_SIZE, SIZE_STD, SIZE_PERCENT };

/*
 * Per-mount configuration state for hugetlbfs.
 * NOTE(review): presumably filled in while parsing the options listed in
 * hugetlb_fs_parameters; the code that reads and writes these fields is
 * outside this part of the file — confirm there.
 */
struct hugetlbfs_fs_context {
        struct hstate                *hstate;
        unsigned long long        max_size_opt;
        unsigned long long        min_size_opt;
        long                        max_hpages;
        long                        nr_inodes;
        long                        min_hpages;
        enum hugetlbfs_size_type max_val_type;
        enum hugetlbfs_size_type min_val_type;
        kuid_t                        uid;
        kgid_t                        gid;
        umode_t                        mode;
};

/*
 * Global tunable, zero-initialised; not referenced in this part of the file.
 * NOTE(review): presumably a group id exposed through sysctl — confirm at
 * the users of this variable.
 */
int sysctl_hugetlb_shm_group;

/*
 * Identifiers for the hugetlbfs mount options; each one is bound to its
 * option name in hugetlb_fs_parameters.
 */
enum hugetlb_param {
        Opt_gid,
        Opt_min_size,
        Opt_mode,
        Opt_nr_inodes,
        Opt_pagesize,
        Opt_size,
        Opt_uid,
};

/*
 * Mount option table: binds each option name to its enum hugetlb_param
 * value.  "gid" and "uid" use the dedicated gid/uid parameter kinds, "mode"
 * is declared as an octal u32, and the remaining options are declared as
 * strings.  The empty entry terminates the table.
 */
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
        fsparam_gid   ("gid",                Opt_gid),
        fsparam_string("min_size",        Opt_min_size),
        fsparam_u32oct("mode",                Opt_mode),
        fsparam_string("nr_inodes",        Opt_nr_inodes),
        fsparam_string("pagesize",        Opt_pagesize),
        fsparam_string("size",                Opt_size),
        fsparam_uid   ("uid",                Opt_uid),
        {}
};

/*
 * Mask used when checking the page offset value passed in via system
 * calls.  The page offset is later shifted left by PAGE_SHIFT and stored in
 * a loff_t, which is signed, so none of the upper PAGE_SHIFT + 1 bits may be
 * set: PAGE_SHIFT bits would be shifted out, and one more would land in the
 * sign bit.  The mask therefore has exactly the top PAGE_SHIFT + 1 bits of
 * an unsigned long set.
 */
#define PGOFF_LOFFT_MAX \
        (((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))

/*
 * mmap handler for hugetlbfs files.
 *
 * Marks the vma as a hugetlb mapping, validates the page offset and length,
 * and then, under the inode lock, reserves huge pages for the mapped range
 * and extends i_size for writable mappings that reach past it.
 *
 * Returns 0 on success, -EINVAL for an offset that is out of range or not
 * huge page aligned (or a length that overflows), and -ENOMEM when
 * hugetlb_reserve_pages() fails.
 */
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(file);
        loff_t len, vma_len;
        int ret;
        struct hstate *h = hstate_file(file);
        vm_flags_t vm_flags;

        /*
         * vma address alignment (but not the pgoff alignment) has
         * already been checked by prepare_hugepage_range.  If you add
         * any error returns here, do so after setting VM_HUGETLB, so
         * is_vm_hugetlb_page tests below unmap_region go the right
         * way when do_mmap unwinds (may be important on powerpc
         * and ia64).
         */
        vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
        vma->vm_ops = &hugetlb_vm_ops;

        /*
         * page based offset in vm_pgoff could be sufficiently large to
         * overflow a loff_t when converted to byte offset.  This can
         * only happen on architectures where sizeof(loff_t) ==
         * sizeof(unsigned long).  So, only check in those instances.
         */
        if (sizeof(unsigned long) == sizeof(loff_t)) {
                if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
                        return -EINVAL;
        }

        /* must be huge page aligned */
        if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;

        /* len is the byte offset of the end of the mapping within the file. */
        vma_len = (loff_t)(vma->vm_end - vma->vm_start);
        len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
        /* check for overflow */
        if (len < vma_len)
                return -EINVAL;

        inode_lock(inode);
        file_accessed(file);

        /* Assume failure until the reservation below succeeds. */
        ret = -ENOMEM;

        vm_flags = vma->vm_flags;
        /*
         * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
         * reserving here. Note: only for SHM hugetlbfs file, the inode
         * flag S_PRIVATE is set.
         */
        if (inode->i_flags & S_PRIVATE)
                vm_flags |= VM_NORESERVE;

        /* Both range arguments are expressed in units of huge pages. */
        if (!hugetlb_reserve_pages(inode,
                                vma->vm_pgoff >> huge_page_order(h),
                                len >> huge_page_shift(h), vma,
                                vm_flags))
                goto out;

        ret = 0;
        /* A writable mapping reaching past EOF grows the file to cover it. */
        if (vma->vm_flags & VM_WRITE && inode->i_size < len)
                i_size_write(inode, len);
out:
        inode_unlock(inode);

        return ret;
}

/*
 * Pick an address range for a hugetlb mapping.
 *
 * Called under mmap_write_lock(mm).
 *
 * @len must be a multiple of the huge page size.  With MAP_FIXED, @addr must
 * be huge page aligned as well and pass prepare_hugepage_range().  A
 * non-zero @addr is rounded up to a huge page boundary before being passed
 * on as the hint to mm_get_unmapped_area_vmflags().
 *
 * Returns the value of mm_get_unmapped_area_vmflags(), or -EINVAL when one
 * of the checks above fails.
 */

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                            unsigned long len, unsigned long pgoff,
                            unsigned long flags)
{
        struct hstate *h = hstate_file(file);
        unsigned long hint = 0;

        if (len & ~huge_page_mask(h))
                return -EINVAL;

        if ((flags & MAP_FIXED) &&
            ((addr & ~huge_page_mask(h)) ||
             prepare_hugepage_range(file, addr, len)))
                return -EINVAL;

        if (addr)
                hint = ALIGN(addr, huge_page_size(h));

        return mm_get_unmapped_area_vmflags(current->mm, file, hint, len,
                                            pgoff, flags, 0);
}

/*
 * A reader wants @bytes from a HWPOISON hugetlb @folio, starting at @offset.
 * Work out how many of those bytes can be read before the first raw
 * HWPOISON page would be touched, and return that count (0 if the very
 * first page is already poisoned).
 *
 * The page-by-page stepping is borrowed from copy_page_to_iter*.
 */
static size_t adjust_range_hwpoison(struct folio *folio, size_t offset,
                size_t bytes)
{
        /* Page containing @offset, and the position inside that page. */
        struct page *pg = folio_page(folio, offset / PAGE_SIZE);
        size_t in_page = offset % PAGE_SIZE;
        size_t safe = 0;

        while (!is_raw_hwpoison_page_in_hugepage(pg)) {
                /* The rest of this page can be read without hitting poison. */
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - in_page);

                safe += chunk;
                bytes -= chunk;
                if (!bytes || !chunk)
                        break;

                in_page += chunk;
                if (in_page == PAGE_SIZE) {
                        pg = nth_page(pg, 1);
                        in_page = 0;
                }
        }

        return safe;
}

/*
 * Support for read() - Find the page attached to f_mapping and copy out the
 * data. This provides functionality similar to filemap_read().
 *
 * @iocb: supplies the file and the starting position (ki_pos); ki_pos is
 *        updated to the position reached before returning.
 * @to:   destination iterator.
 *
 * The file is processed one huge page at a time.  Holes are returned as
 * zeroes; a HWPOISON folio is only read up to its first raw poisoned page.
 *
 * Returns the number of bytes copied, or -EIO / -EFAULT when nothing could
 * be copied because of a poisoned page or a failed copy respectively.
 */
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct hstate *h = hstate_file(file);
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        /* Split the position into a huge page index and an offset within it. */
        unsigned long index = iocb->ki_pos >> huge_page_shift(h);
        unsigned long offset = iocb->ki_pos & ~huge_page_mask(h);
        unsigned long end_index;
        loff_t isize;
        ssize_t retval = 0;

        while (iov_iter_count(to)) {
                struct folio *folio;
                size_t nr, copied, want;

                /* nr is the maximum number of bytes to copy from this page */
                nr = huge_page_size(h);
                /* i_size is re-read on every iteration. */
                isize = i_size_read(inode);
                if (!isize)
                        break;
                end_index = (isize - 1) >> huge_page_shift(h);
                if (index > end_index)
                        break;
                if (index == end_index) {
                        /* Last huge page: only the part below i_size counts. */
                        nr = ((isize - 1) & ~huge_page_mask(h)) + 1;
                        if (nr <= offset)
                                break;
                }
                nr = nr - offset;

                /* Find the folio */
                folio = filemap_lock_hugetlb_folio(h, mapping, index);
                if (IS_ERR(folio)) {
                        /*
                         * We have a HOLE, zero out the user-buffer for the
                         * length of the hole or request.
                         */
                        copied = iov_iter_zero(nr, to);
                } else {
                        folio_unlock(folio);

                        if (!folio_test_hwpoison(folio))
                                want = nr;
                        else {
                                /*
                                 * Adjust how many bytes safe to read without
                                 * touching the 1st raw HWPOISON page after
                                 * offset.
                                 */
                                want = adjust_range_hwpoison(folio, offset, nr);
                                if (want == 0) {
                                        folio_put(folio);
                                        retval = -EIO;
                                        break;
                                }
                        }

                        /*
                         * We have the folio, copy it to user space buffer.
                         */
                        copied = copy_folio_to_iter(folio, offset, want, to);
                        folio_put(folio);
                }
                offset += copied;
                retval += copied;
                /*
                 * Fewer bytes than nr were copied although the destination
                 * still has room: stop here.  -EFAULT is reported only if
                 * nothing at all has been copied so far.
                 */
                if (copied != nr && iov_iter_count(to)) {
                        if (!retval)
                                retval = -EFAULT;
                        break;
                }
                /* Carry any overflow of offset into the huge page index. */
                index += offset >> huge_page_shift(h);
                offset &= ~huge_page_mask(h);
        }
        iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset;
        return retval;
}

/*
 * ->write_begin hook wired into hugetlbfs_aops.  It never prepares a folio:
 * every call fails with -EINVAL and none of the arguments are examined.
 */
static int hugetlbfs_write_begin(struct file *file,
                                 struct address_space *mapping,
                                 loff_t pos, unsigned len,
                                 struct folio **foliop, void **fsdata)
{
        return -EINVAL;
}

/*
 * ->write_end hook wired into hugetlbfs_aops.  hugetlbfs_write_begin()
 * always fails, so getting here is treated as a kernel bug: BUG() fires
 * ahead of the nominal -EINVAL return.
 */
static int hugetlbfs_write_end(struct file *file,
                               struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned copied,
                               struct folio *folio, void *fsdata)
{
        BUG();
        return -EINVAL;
}

/*
 * Drop a hugetlb folio from the page cache: clear its dirty and uptodate
 * flags, then detach it from its mapping with filemap_remove_folio().
 * The caller in this file (remove_inode_single_folio()) holds the folio
 * lock across this call.
 */
static void hugetlb_delete_from_page_cache(struct folio *folio)
{
        folio_clear_dirty(folio);
        folio_clear_uptodate(folio);
        filemap_remove_folio(folio);
}

/*
 * Called with i_mmap_rwsem held for inode based vma maps.  This makes
 * sure vma (and vm_mm) will not go away.  We also hold the hugetlb fault
 * mutex for the page in the mapping.  So, we can not race with page being
 * faulted into the vma.
 */
static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma,
                                unsigned long addr, unsigned long pfn)
{
        pte_t *ptep, pte;

        ptep = hugetlb_walk(vma, addr, huge_page_size(hstate_vma(vma)));
        if (!ptep)
                return false;

        pte = huge_ptep_get(vma->vm_mm, addr, ptep);
        if (huge_pte_none(pte) || !pte_present(pte))
                return false;

        if (pte_pfn(pte) == pfn)
                return true;

        return false;
}

/*
 * Translate the file page offset @start (PAGE_SIZE units) into a user
 * virtual address inside @vma.  When the vma begins at or after @start in
 * the file, the result is simply vma->vm_start.
 *
 * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
 * No, because the interval tree returns us only those vmas
 * which overlap the truncated area starting at pgoff,
 * and no vma on a 32-bit arch can span beyond 4GB.
 */
static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
{
        if (vma->vm_pgoff >= start)
                return vma->vm_start;

        return vma->vm_start + ((start - vma->vm_pgoff) << PAGE_SHIFT);
}

/*
 * Translate the exclusive file page offset @end (PAGE_SIZE units) into a
 * user virtual address inside @vma, clamped to vma->vm_end.  @end == 0
 * means "to the end of the vma" and yields vma->vm_end directly.
 */
static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
{
        unsigned long limit = vma->vm_end;
        unsigned long mapped_end;

        if (end == 0)
                return limit;

        mapped_end = vma->vm_start + ((end - vma->vm_pgoff) << PAGE_SHIFT);

        return mapped_end > limit ? limit : mapped_end;
}

/*
 * Unmap @folio, which backs huge page @index of @mapping, from every vma
 * in the mapping's interval tree that currently maps its pfn.
 *
 * Called with hugetlb fault mutex held.  Therefore, no more mappings to
 * this folio can be created while executing the routine.
 *
 * The tree is walked under i_mmap_rwsem (write).  A vma whose lock cannot
 * be taken with a trylock is handled out of line: i_mmap_rwsem is dropped,
 * the vma lock is acquired first, i_mmap_rwsem is re-taken, and the walk
 * restarts from the top.
 */
static void hugetlb_unmap_file_folio(struct hstate *h,
                                        struct address_space *mapping,
                                        struct folio *folio, pgoff_t index)
{
        struct rb_root_cached *root = &mapping->i_mmap;
        struct hugetlb_vma_lock *vma_lock;
        unsigned long pfn = folio_pfn(folio);
        struct vm_area_struct *vma;
        unsigned long v_start;
        unsigned long v_end;
        pgoff_t start, end;

        /* Range of the huge page in PAGE_SIZE units; end is exclusive. */
        start = index * pages_per_huge_page(h);
        end = (index + 1) * pages_per_huge_page(h);

        i_mmap_lock_write(mapping);
retry:
        /* Every pass through here runs with i_mmap_rwsem held for write. */
        vma_lock = NULL;
        vma_interval_tree_foreach(vma, root, start, end - 1) {
                v_start = vma_offset_start(vma, start);
                v_end = vma_offset_end(vma, end);

                if (!hugetlb_vma_maps_pfn(vma, v_start, pfn))
                        continue;

                if (!hugetlb_vma_trylock_write(vma)) {
                        vma_lock = vma->vm_private_data;
                        /*
                         * If we can not get vma lock, we need to drop
                         * i_mmap_rwsem and take locks in order.  First,
                         * take a ref on the vma_lock structure so that
                         * we can be guaranteed it will not go away when
                         * dropping i_mmap_rwsem.
                         */
                        kref_get(&vma_lock->refs);
                        break;
                }

                unmap_hugepage_range(vma, v_start, v_end, NULL,
                                     ZAP_FLAG_DROP_MARKER);
                hugetlb_vma_unlock_write(vma);
        }

        i_mmap_unlock_write(mapping);

        if (vma_lock) {
                /*
                 * Wait on vma_lock.  We know it is still valid as we have
                 * a reference.  We must 'open code' vma locking as we do
                 * not know if vma_lock is still attached to vma.
                 */
                down_write(&vma_lock->rw_sema);
                i_mmap_lock_write(mapping);

                vma = vma_lock->vma;
                if (!vma) {
                        /*
                         * If lock is no longer attached to vma, then just
                         * unlock, drop our reference and retry looking for
                         * other vmas.
                         */
                        up_write(&vma_lock->rw_sema);
                        kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
                        goto retry;
                }

                /*
                 * vma_lock is still attached to vma.  Check to see if vma
                 * still maps page and if so, unmap.
                 */
                v_start = vma_offset_start(vma, start);
                v_end = vma_offset_end(vma, end);
                if (hugetlb_vma_maps_pfn(vma, v_start, pfn))
                        unmap_hugepage_range(vma, v_start, v_end, NULL,
                                             ZAP_FLAG_DROP_MARKER);

                /*
                 * Drop the reference taken above, then release the vma lock
                 * through the regular helper (presumably the rw_sema taken
                 * by the open-coded down_write() -- confirm).
                 */
                kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
                hugetlb_vma_unlock_write(vma);

                /* Other vmas may still map the folio: scan the tree again. */
                goto retry;
        }
}

/*
 * Unmap the file range [start, end), expressed in PAGE_SIZE units, from
 * each vma in interval tree @root that overlaps it.  @zap_flags is handed
 * through to unmap_hugepage_range().  Vmas whose write lock cannot be
 * obtained with a trylock are skipped.
 */
static void hugetlb_vmdelete_list(struct rb_root_cached *root,
                                  pgoff_t start, pgoff_t end,
                                  zap_flags_t zap_flags)
{
        /*
         * end == 0 indicates that the entire range after start should be
         * unmapped.  Note, end is exclusive, whereas the interval tree takes
         * an inclusive "last".
         */
        const pgoff_t last = end ? end - 1 : ULONG_MAX;
        struct vm_area_struct *vma;

        vma_interval_tree_foreach(vma, root, start, last) {
                if (!hugetlb_vma_trylock_write(vma))
                        continue;

                unmap_hugepage_range(vma, vma_offset_start(vma, start),
                                     vma_offset_end(vma, end), NULL,
                                     zap_flags);

                /*
                 * Note that vma lock only exists for shared/non-private
                 * vmas.  Therefore, lock is not held when calling
                 * unmap_hugepage_range for private vmas.
                 */
                hugetlb_vma_unlock_write(vma);
        }
}

/*
 * Remove one folio (huge page @index) from @inode's page cache.
 *
 * Called with hugetlb fault mutex held.  For hole punch (@truncate_op
 * false) the matching reserve map entry is dropped here as well; for
 * truncation the caller does that in bulk afterwards.
 *
 * Returns true if the folio was removed.  As written the removal is
 * unconditional, so the function always returns true.
 */
static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
                                        struct address_space *mapping,
                                        struct folio *folio, pgoff_t index,
                                        bool truncate_op)
{
        /*
         * A mapped folio here means it was faulted back in after the
         * caller unmapped it.  Unmap it once more; the fault mutex we hold
         * keeps new faults out until the folio is gone.
         */
        if (unlikely(folio_mapped(folio)))
                hugetlb_unmap_file_folio(h, mapping, folio, index);

        folio_lock(folio);

        /*
         * The folio has to leave the page cache before the region/reserve
         * map is touched (hugetlb_unreserve_pages).  Under rare out of
         * memory conditions removing the region/reserve map entry can
         * fail, in which case the subpool and global reserve counts need
         * to be fixed up.
         */
        VM_BUG_ON_FOLIO(folio_test_hugetlb_restore_reserve(folio), folio);
        hugetlb_delete_from_page_cache(folio);

        if (!truncate_op &&
            unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
                hugetlb_fix_reserve_counts(inode);

        folio_unlock(folio);

        return true;
}

/*
 * remove_inode_hugepages() serves both truncation and hole punch; the two
 * differ in how reservations are cleaned up.
 *
 * Truncation (lend == LLONG_MAX):
 *        The range is scanned and every folio found is released.  Once all
 *        folios are gone, a single hugetlb_unreserve_pages() call cleans up
 *        region/reserve maps and global counts.  Page faults can race with
 *        truncation.  During faults, hugetlb_no_page() checks i_size before
 *        page allocation, and again after obtaining page table lock.  It
 *        will 'back out' allocations in the truncated range.
 *
 * Hole punch (lend != LLONG_MAX):
 *        The range is scanned and every folio found is released.  A
 *        region/reserve map entry is deleted only together with the folio
 *        it belongs to; entries for ranges without a folio are left alone.
 *        Page faults can race with hole punch, which shows up as finding a
 *        mapped folio.
 *
 * Note: an end of range beyond end of file that is not LLONG_MAX is still
 * handled as a hole punch.
 */
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                                   loff_t lend)
{
        const bool truncate_op = (lend == LLONG_MAX);
        const pgoff_t end = lend >> PAGE_SHIFT;
        struct address_space *mapping = &inode->i_data;
        struct hstate *h = hstate_inode(inode);
        struct folio_batch fbatch;
        pgoff_t next = lstart >> PAGE_SHIFT;
        int freed = 0;

        folio_batch_init(&fbatch);

        while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
                int i;

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        pgoff_t index = folio->index >> huge_page_order(h);
                        u32 hash = hugetlb_fault_mutex_hash(mapping, index);

                        /*
                         * Remove the folio under its fault mutex so that
                         * no fault can map it while it is being freed.
                         */
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        if (remove_inode_single_folio(h, inode, mapping,
                                                      folio, index,
                                                      truncate_op))
                                freed++;
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                }

                folio_batch_release(&fbatch);
                cond_resched();
        }

        /* Truncation settles all reservations in one go at the end. */
        if (truncate_op)
                (void)hugetlb_unreserve_pages(inode,
                                lstart >> huge_page_shift(h),
                                LONG_MAX, freed);
}

static void hugetlbfs_evict_inode(struct inode *inode)
{
        struct resv_map *resv_map;

        trace_hugetlbfs_evict_inode(inode);
        remove_inode_hugepages(inode, 0, LLONG_MAX);

        /*
         * Get the resv_map from the address space embedded in the inode.
         * This is the address space which points to any resv_map allocated
         * at inode creation time.  If this is a device special inode,
         * i_mapping may not point to the original address space.
         */
        resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
        /* Only regular and link inodes have associated reserve maps */
        if (resv_map)
                resv_map_release(&resv_map->refs);
        clear_inode(inode);
}

/*
 * Set the size of @inode to @offset, which must be huge page aligned
 * (BUG otherwise).  The new i_size is published first, then all user
 * mappings from @offset onwards are zapped under i_mmap_rwsem, and
 * finally the page cache beyond @offset is released.
 */
static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
        struct hstate *h = hstate_inode(inode);
        struct address_space *mapping = inode->i_mapping;
        struct rb_root_cached *vmas = &mapping->i_mmap;
        const pgoff_t first_pgoff = offset >> PAGE_SHIFT;

        BUG_ON(offset & ~huge_page_mask(h));

        i_size_write(inode, offset);

        i_mmap_lock_write(mapping);
        if (!RB_EMPTY_ROOT(&vmas->rb_root))
                hugetlb_vmdelete_list(vmas, first_pgoff, 0,
                                      ZAP_FLAG_DROP_MARKER);
        i_mmap_unlock_write(mapping);

        remove_inode_hugepages(inode, offset, LLONG_MAX);
}

/*
 * Zero the byte range [start, end) of the file, which must lie within the
 * single huge page containing @start.  An @end that falls exactly on a
 * huge page boundary is taken to mean "through the end of that page".
 * Nothing is done when no folio is present at that index.
 */
static void hugetlbfs_zero_partial_page(struct hstate *h,
                                        struct address_space *mapping,
                                        loff_t start,
                                        loff_t end)
{
        struct folio *folio;
        size_t from, to;

        folio = filemap_lock_hugetlb_folio(h, mapping,
                                           start >> huge_page_shift(h));
        if (IS_ERR(folio))
                return;

        /* Reduce both file offsets to offsets within the huge page. */
        from = start & ~huge_page_mask(h);
        to = end & ~huge_page_mask(h);
        if (to == 0)
                to = huge_page_size(h);

        folio_zero_segment(folio, from, to);

        folio_unlock(folio);
        folio_put(folio);
}

/*
 * Punch a hole over the byte range [offset, offset + len) of @inode.
 *
 * Huge pages lying entirely inside the range are unmapped from user space
 * and removed from the file; partially covered pages at either edge are
 * only zeroed over the covered part.
 *
 * Returns 0 on success, or -EPERM when the inode carries F_SEAL_WRITE or
 * F_SEAL_FUTURE_WRITE.
 */
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
        struct address_space *mapping = inode->i_mapping;
        struct hstate *h = hstate_inode(inode);
        loff_t hpage_size = huge_page_size(h);
        loff_t range_end = offset + len;
        loff_t hole_start, hole_end;
        bool has_full_pages;

        /*
         * hole_start and hole_end delimit the full huge pages within the
         * hole.
         */
        hole_start = round_up(offset, hpage_size);
        hole_end = round_down(range_end, hpage_size);
        has_full_pages = hole_end > hole_start;

        inode_lock(inode);

        /* protected by i_rwsem */
        if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
                inode_unlock(inode);
                return -EPERM;
        }

        i_mmap_lock_write(mapping);

        /* Leading partial page: zero from offset up to the first boundary. */
        if (offset < hole_start)
                hugetlbfs_zero_partial_page(h, mapping,
                                offset, min(range_end, hole_start));

        /* Unmap users of full pages in the hole. */
        if (has_full_pages && !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                hugetlb_vmdelete_list(&mapping->i_mmap,
                                      hole_start >> PAGE_SHIFT,
                                      hole_end >> PAGE_SHIFT, 0);

        /* Trailing partial page: zero from the last boundary to range end. */
        if (range_end > hole_end && range_end > hole_start)
                hugetlbfs_zero_partial_page(h, mapping,
                                hole_end, range_end);

        i_mmap_unlock_write(mapping);

        /* Remove full pages from the file. */
        if (has_full_pages)
                remove_inode_hugepages(inode, hole_start, hole_end);

        inode_unlock(inode);

        return 0;
}

/*
 * ->fallocate handler installed in hugetlbfs_file_operations.
 *
 * Only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE are accepted; any other
 * mode bit yields -EOPNOTSUPP.  FALLOC_FL_PUNCH_HOLE is delegated to
 * hugetlbfs_punch_hole().  Otherwise, for every huge page index covering
 * [offset, offset + len) that is not already in the page cache, a folio is
 * allocated, zeroed, marked uptodate and added to the mapping.  This runs
 * under the inode lock, with the per-index hugetlb fault mutex held around
 * each lookup/insert.
 *
 * Returns 0 on success or a negative errno: -EPERM when F_SEAL_GROW is set
 * and the range extends past i_size, -EINTR when a signal is pending, or
 * the error reported by inode_newsize_ok(), alloc_hugetlb_folio() or
 * hugetlb_add_to_page_cache().
 */
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
                                loff_t len)
{
        struct inode *inode = file_inode(file);
        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
        struct address_space *mapping = inode->i_mapping;
        struct hstate *h = hstate_inode(inode);
        struct vm_area_struct pseudo_vma;
        struct mm_struct *mm = current->mm;
        loff_t hpage_size = huge_page_size(h);
        unsigned long hpage_shift = huge_page_shift(h);
        pgoff_t start, index, end;
        int error;
        u32 hash;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        /* Hole punch does its own locking, hence the out_nolock exit. */
        if (mode & FALLOC_FL_PUNCH_HOLE) {
                error = hugetlbfs_punch_hole(inode, offset, len);
                goto out_nolock;
        }

        /*
         * Default preallocate case.
         * For this range, start is rounded down and end is rounded up
         * as well as being converted to page offsets.
         */
        start = offset >> hpage_shift;
        end = (offset + len + hpage_size - 1) >> hpage_shift;

        inode_lock(inode);

        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
        error = inode_newsize_ok(inode, offset + len);
        if (error)
                goto out;

        if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
                error = -EPERM;
                goto out;
        }

        /*
         * Initialize a pseudo vma as this is required by the huge page
         * allocation routines.
         */
        vma_init(&pseudo_vma, mm);
        vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
        pseudo_vma.vm_file = file;

        for (index = start; index < end; index++) {
                /*
                 * This is supposed to be the vaddr where the page is being
                 * faulted in, but we have no vaddr here.
                 */
                struct folio *folio;
                unsigned long addr;

                cond_resched();

                /*
                 * fallocate(2) manpage permits EINTR; we may have been
                 * interrupted because we are using up too much memory.
                 *
                 * NOTE(review): unlike the 'goto out' error paths below,
                 * this break still falls through to the i_size/ctime update
                 * after the loop -- confirm that is intended.
                 */
                if (signal_pending(current)) {
                        error = -EINTR;
                        break;
                }

                /* addr is the offset within the file (zero based) */
                addr = index * hpage_size;

                /* mutex taken here, fault path and hole punch */
                hash = hugetlb_fault_mutex_hash(mapping, index);
                mutex_lock(&hugetlb_fault_mutex_table[hash]);

                /* See if already present in mapping to avoid alloc/free */
                folio = filemap_get_folio(mapping, index << huge_page_order(h));
                if (!IS_ERR(folio)) {
                        folio_put(folio);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        continue;
                }

                /*
                 * Allocate folio without setting the avoid_reserve argument.
                 * There certainly are no reserves associated with the
                 * pseudo_vma.  However, there could be shared mappings with
                 * reserves for the file at the inode level.  If we fallocate
                 * folios in these areas, we need to consume the reserves
                 * to keep reservation accounting consistent.
                 */
                folio = alloc_hugetlb_folio(&pseudo_vma, addr, false);
                if (IS_ERR(folio)) {
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        error = PTR_ERR(folio);
                        goto out;
                }
                folio_zero_user(folio, addr);
                __folio_mark_uptodate(folio);
                error = hugetlb_add_to_page_cache(folio, mapping, index);
                if (unlikely(error)) {
                        /* Undo the reservation consumed by the allocation. */
                        restore_reserve_on_error(h, &pseudo_vma, addr, folio);
                        folio_put(folio);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out;
                }

                mutex_unlock(&hugetlb_fault_mutex_table[hash]);

                folio_set_hugetlb_migratable(folio);
                /*
                 * folio_unlock because locked by hugetlb_add_to_page_cache()
                 * folio_put() due to reference from alloc_hugetlb_folio()
                 */
                folio_unlock(folio);
                folio_put(folio);
        }

        /* Grow i_size only when the caller did not ask to keep it. */
        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
                i_size_write(inode, offset + len);
        inode_set_ctime_current(inode);
out:
        inode_unlock(inode);

out_nolock:
        trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
        return error;
}

/*
 * ->setattr handler.  Validates the request with setattr_prepare(), then,
 * for a size change, truncates/extends the file via hugetlb_vmtruncate()
 * before copying the remaining attributes into the inode.
 *
 * Returns 0 on success, the setattr_prepare() error, -EINVAL when the new
 * size is not a multiple of the huge page size, or -EPERM when the size
 * change is blocked by F_SEAL_SHRINK / F_SEAL_GROW.
 */
static int hugetlbfs_setattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
        struct hstate *h = hstate_inode(inode);
        const unsigned int ia_valid = attr->ia_valid;
        int error;

        error = setattr_prepare(idmap, dentry, attr);
        if (error)
                return error;

        trace_hugetlbfs_setattr(inode, dentry, attr);

        if (ia_valid & ATTR_SIZE) {
                const loff_t cur_size = inode->i_size;
                const loff_t new_size = attr->ia_size;

                if (new_size & ~huge_page_mask(h))
                        return -EINVAL;

                /* protected by i_rwsem */
                if (new_size < cur_size && (info->seals & F_SEAL_SHRINK))
                        return -EPERM;
                if (new_size > cur_size && (info->seals & F_SEAL_GROW))
                        return -EPERM;

                hugetlb_vmtruncate(inode, new_size);
        }

        setattr_copy(idmap, inode, attr);
        mark_inode_dirty(inode);

        return 0;
}

static struct inode *hugetlbfs_get_root(struct super_block *sb,
                                        struct hugetlbfs_fs_context *ctx)
{
        struct inode *inode;

        inode = new_inode(sb);
        if (inode) {
                inode->i_ino = get_next_ino();
                inode->i_mode = S_IFDIR | ctx->mode;
                inode->i_uid = ctx->uid;
                inode->i_gid = ctx->gid;
                simple_inode_init_ts(inode);
                inode->i_op = &hugetlbfs_dir_inode_operations;
                inode->i_fop = &simple_dir_operations;
                /* directory inodes start off with i_nlink == 2 (for "." entry) */
                inc_nlink(inode);
                lockdep_annotate_inode_mutex_key(inode);
        }
        return inode;
}

/*
 * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
 * be taken from reclaim -- unlike regular filesystems. This needs an
 * annotation because huge_pmd_share() does an allocation under hugetlb's
 * i_mmap_rwsem.
 *
 * hugetlbfs_get_inode() assigns this lockdep class to the i_mmap_rwsem of
 * every inode it creates.
 */
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;

/*
 * Allocate and initialise a hugetlbfs inode of type @mode under @dir.
 *
 * Regular files and symlinks get a reserve map, stored in the mapping's
 * i_private_data.  The inode starts out with F_SEAL_SEAL set.
 *
 * Returns the new inode, or NULL when either the reserve map or the inode
 * could not be allocated.
 */
static struct inode *hugetlbfs_get_inode(struct super_block *sb,
                                        struct mnt_idmap *idmap,
                                        struct inode *dir,
                                        umode_t mode, dev_t dev)
{
        struct hugetlbfs_inode_info *info;
        struct resv_map *resv_map = NULL;
        struct inode *inode;

        /*
         * Reserve maps are only needed for inodes that can have associated
         * page allocations.
         */
        if (S_ISREG(mode) || S_ISLNK(mode)) {
                resv_map = resv_map_alloc();
                if (!resv_map)
                        return NULL;
        }

        inode = new_inode(sb);
        if (!inode) {
                /* Do not leak the reserve map allocated above. */
                if (resv_map)
                        kref_put(&resv_map->refs, resv_map_release);
                return NULL;
        }

        info = HUGETLBFS_I(inode);

        inode->i_ino = get_next_ino();
        inode_init_owner(idmap, inode, dir, mode);
        lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
                        &hugetlbfs_i_mmap_rwsem_key);
        inode->i_mapping->a_ops = &hugetlbfs_aops;
        simple_inode_init_ts(inode);
        inode->i_mapping->i_private_data = resv_map;
        info->seals = F_SEAL_SEAL;

        switch (mode & S_IFMT) {
        case S_IFREG:
                inode->i_op = &hugetlbfs_inode_operations;
                inode->i_fop = &hugetlbfs_file_operations;
                break;
        case S_IFDIR:
                inode->i_op = &hugetlbfs_dir_inode_operations;
                inode->i_fop = &simple_dir_operations;

                /* directory inodes start off with i_nlink == 2 (for "." entry) */
                inc_nlink(inode);
                break;
        case S_IFLNK:
                inode->i_op = &page_symlink_inode_operations;
                inode_nohighmem(inode);
                break;
        default:
                init_special_inode(inode, mode, dev);
                break;
        }

        lockdep_annotate_inode_mutex_key(inode);
        trace_hugetlbfs_alloc_inode(inode, dir, mode);

        return inode;
}

/*
 * File creation: allocate an inode, attach it to @dentry and bump the
 * parent directory's mtime/ctime.  An extra dentry reference is taken so
 * the entry stays pinned in core.
 *
 * Returns 0 on success, -ENOSPC when no inode could be obtained.
 */
static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
                           struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir,
                                                  mode, dev);

        if (inode == NULL)
                return -ENOSPC;

        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        d_instantiate(dentry, inode);
        /* Extra count - pin the dentry in core */
        dget(dentry);

        return 0;
}

/*
 * Create a directory through hugetlbfs_mknod() and, on success, account
 * for the new child in the parent's link count.
 *
 * Returns NULL on success (ERR_PTR(0)) or ERR_PTR(-errno) on failure.
 */
static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                                      struct dentry *dentry, umode_t mode)
{
        int err = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);

        if (err)
                return ERR_PTR(err);

        inc_nlink(dir);
        return NULL;
}

/*
 * Create a regular file: a thin wrapper that forwards to hugetlbfs_mknod()
 * with S_IFREG added to @mode.  @excl is not used.
 */
static int hugetlbfs_create(struct mnt_idmap *idmap,
                            struct inode *dir, struct dentry *dentry,
                            umode_t mode, bool excl)
{
        const umode_t file_mode = mode | S_IFREG;

        return hugetlbfs_mknod(idmap, dir, dentry, file_mode, 0);
}

/*
 * Create an unnamed regular file in @dir and open it on @file.
 *
 * Returns -ENOSPC when no inode could be obtained, otherwise the result
 * of finish_open_simple().
 */
static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
                             struct inode *dir, struct file *file,
                             umode_t mode)
{
        struct inode *inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir,
                                                  mode | S_IFREG, 0);

        if (inode == NULL)
                return -ENOSPC;

        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        d_tmpfile(file, inode);

        return finish_open_simple(file, 0);
}

static int hugetlbfs_symlink(struct mnt_idmap *idmap,
                             struct inode *dir, struct dentry *dentry,
                             const char *symname)
{
        const umode_t mode = S_IFLNK|S_IRWXUGO;
        struct inode *inode;
        int error = -ENOSPC;

        inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
        if (inode) {
                int l = strlen(symname)+1;
                error = page_symlink(inode, symname, l);
                if (!error) {
                        d_instantiate(dentry, inode);
                        dget(dentry);
                } else
                        iput(inode);
        }
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));

        return error;
}

#ifdef CONFIG_MIGRATION
/*
 * ->migrate_folio hook: move the page cache entry from @src to @dst, hand
 * the subpool association (if any) over to @dst, and copy the folio flags.
 * Returns MIGRATEPAGE_SUCCESS, or the failure code from
 * migrate_huge_page_move_mapping().  @mode is not used.
 */
static int hugetlbfs_migrate_folio(struct address_space *mapping,
                                struct folio *dst, struct folio *src,
                                enum migrate_mode mode)
{
        struct hugepage_subpool *spool;
        int rc = migrate_huge_page_move_mapping(mapping, dst, src);

        if (rc != MIGRATEPAGE_SUCCESS)
                return rc;

        spool = hugetlb_folio_subpool(src);
        if (spool) {
                hugetlb_set_folio_subpool(dst, spool);
                hugetlb_set_folio_subpool(src, NULL);
        }

        folio_migrate_flags(dst, src);

        return MIGRATEPAGE_SUCCESS;
}
#else
#define hugetlbfs_migrate_folio NULL
#endif

/*
 * ->error_remove_folio hook.  Performs no work on @mapping or @folio and
 * reports success (0) unconditionally.
 */
static int hugetlbfs_error_remove_folio(struct address_space *mapping,
                                        struct folio *folio)
{
        return 0;
}

/*
 * Display the mount options in /proc/mounts.
 *
 * uid, gid, mode and nr_inodes are emitted only when they differ from
 * root / root / 0755 / -1 respectively.  pagesize is always emitted, in K
 * or, from 1024K upwards, in M.  size and min_size are emitted when a
 * subpool exists and the corresponding limit is not -1.
 */
static int hugetlbfs_show_options(struct seq_file *m, struct dentry *root)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(root->d_sb);
        struct hugepage_subpool *spool = sbinfo->spool;
        const unsigned shift = huge_page_shift(sbinfo->hstate);
        unsigned long scaled = huge_page_size(sbinfo->hstate) / 1024;
        char unit = 'K';

        if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
                seq_printf(m, ",uid=%u",
                           from_kuid_munged(&init_user_ns, sbinfo->uid));
        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(m, ",gid=%u",
                           from_kgid_munged(&init_user_ns, sbinfo->gid));
        if (sbinfo->mode != 0755)
                seq_printf(m, ",mode=%o", sbinfo->mode);
        if (sbinfo->max_inodes != -1)
                seq_printf(m, ",nr_inodes=%lu", sbinfo->max_inodes);

        /* Huge page size, already in KiB; switch to MiB when large enough. */
        if (scaled >= 1024) {
                scaled /= 1024;
                unit = 'M';
        }
        seq_printf(m, ",pagesize=%lu%c", scaled, unit);

        if (!spool)
                return 0;

        /* Subpool limits are kept in huge pages; print them in bytes. */
        if (spool->max_hpages != -1)
                seq_printf(m, ",size=%llu",
                           (unsigned long long)spool->max_hpages << shift);
        if (spool->min_hpages != -1)
                seq_printf(m, ",min_size=%llu",
                           (unsigned long long)spool->min_hpages << shift);

        return 0;
}

static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
        struct hstate *h = hstate_inode(d_inode(dentry));
        u64 id = huge_encode_dev(dentry->d_sb->s_dev);

        buf->f_fsid = u64_to_fsid(id);
        buf->f_type = HUGETLBFS_MAGIC;
        buf->f_bsize = huge_page_size(h);
        if (sbinfo) {
                spin_lock(&sbinfo->stat_lock);
                /* If no limits set, just report 0 or -1 for max/free/used
                 * blocks, like simple_statfs() */
                if (sbinfo->spool) {
                        long free_pages;

                        spin_lock_irq(&sbinfo->spool->lock);
                        buf->f_blocks = sbinfo->spool->max_hpages;
                        free_pages = sbinfo->spool->max_hpages
                                - sbinfo->spool->used_hpages;
                        buf->f_bavail = buf->f_bfree = free_pages;
                        spin_unlock_irq(&sbinfo->spool->lock);
                        buf->f_files = sbinfo->max_inodes;
                        buf->f_ffree = sbinfo->free_inodes;
                }
                spin_unlock(&sbinfo->stat_lock);
        }
        buf->f_namelen = NAME_MAX;
        return 0;
}

/*
 * ->put_super handler: detach the hugetlbfs private info from @sb, drop
 * the subpool reference if there is one, and free the info structure.
 * Does nothing when the superblock carries no private info.
 */
static void hugetlbfs_put_super(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbi = HUGETLBFS_SB(sb);

        if (!sbi)
                return;

        sb->s_fs_info = NULL;

        if (sbi->spool)
                hugepage_put_subpool(sbi->spool);

        kfree(sbi);
}

/*
 * Claim one inode from the superblock's free-inode budget.
 *
 * A negative free_inodes means no accounting is done and the claim always
 * succeeds.  Otherwise the counter is decremented under stat_lock unless
 * it is already zero.
 *
 * Returns 1 when the inode may be allocated, 0 when the budget is spent.
 */
static inline int hugetlbfs_dec_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
        int granted = 1;

        if (sbinfo->free_inodes < 0)
                return 1;

        spin_lock(&sbinfo->stat_lock);
        if (unlikely(!sbinfo->free_inodes))
                granted = 0;
        else
                sbinfo->free_inodes--;
        spin_unlock(&sbinfo->stat_lock);

        return granted;
}

/*
 * Give one inode back to the superblock's free-inode budget.  Nothing is
 * done when free_inodes is negative; otherwise the counter is incremented
 * under stat_lock.
 */
static void hugetlbfs_inc_free_inodes(struct hugetlbfs_sb_info *sbinfo)
{
        if (sbinfo->free_inodes < 0)
                return;

        spin_lock(&sbinfo->stat_lock);
        sbinfo->free_inodes++;
        spin_unlock(&sbinfo->stat_lock);
}


/*
 * Slab cache backing struct hugetlbfs_inode_info: hugetlbfs_alloc_inode()
 * allocates from it and hugetlbfs_free_inode() returns objects to it.
 */
static struct kmem_cache *hugetlbfs_inode_cachep;

/*
 * ->alloc_inode: reserve a slot in the inode budget, then allocate the
 * hugetlbfs inode container from hugetlbfs_inode_cachep.
 *
 * Returns the embedded VFS inode, or NULL when the budget is exhausted or
 * the allocation fails (the budget slot is returned in the latter case).
 */
static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
{
        struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(sb);
        struct hugetlbfs_inode_info *info;

        if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
                return NULL;

        info = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
        if (unlikely(!info)) {
                /* Undo the budget claim taken above. */
                hugetlbfs_inc_free_inodes(sbinfo);
                return NULL;
        }

        return &info->vfs_inode;
}

/*
 * ->free_inode: emit the free-inode tracepoint, then return the containing
 * hugetlbfs_inode_info to hugetlbfs_inode_cachep.
 */
static void hugetlbfs_free_inode(struct inode *inode)
{
        /* Trace first: the inode memory is gone after kmem_cache_free(). */
        trace_hugetlbfs_free_inode(inode);
        kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}

/*
 * ->destroy_inode: return the inode's slot to the superblock's inode
 * budget (the counterpart of the claim made in hugetlbfs_alloc_inode()).
 */
static void hugetlbfs_destroy_inode(struct inode *inode)
{
        hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
}

/*
 * Address-space operations table for hugetlbfs.  dirty_folio uses the
 * generic noop_dirty_folio; the other hooks are hugetlbfs-specific.
 */
static const struct address_space_operations hugetlbfs_aops = {
        .write_begin        = hugetlbfs_write_begin,
        .write_end        = hugetlbfs_write_end,
        .dirty_folio        = noop_dirty_folio,
        .migrate_folio  = hugetlbfs_migrate_folio,
        .error_remove_folio        = hugetlbfs_error_remove_folio,
};


/*
 * Slab constructor passed to kmem_cache_create() in init_hugetlbfs_fs().
 * @foo: the object being constructed, a struct hugetlbfs_inode_info.
 */
static void init_once(void *foo)
{
        struct hugetlbfs_inode_info *ei = foo;

        inode_init_once(&ei->vfs_inode);
}

/*
 * File operations table for hugetlbfs files; also handed to
 * alloc_file_pseudo() by hugetlb_file_setup().
 */
static const struct file_operations hugetlbfs_file_operations = {
        .read_iter                = hugetlbfs_read_iter,
        .mmap                        = hugetlbfs_file_mmap,
        .fsync                        = noop_fsync,
        .get_unmapped_area        = hugetlb_get_unmapped_area,
        .llseek                        = default_llseek,
        .fallocate                = hugetlbfs_fallocate,
        .fop_flags                = FOP_HUGE_PAGES,
};

/*
 * Inode operations table for directories (per its name).  Lookup, link,
 * unlink, rmdir and rename use the generic simple_* helpers; the creating
 * operations and setattr are hugetlbfs-specific.
 */
static const struct inode_operations hugetlbfs_dir_inode_operations = {
        .create                = hugetlbfs_create,
        .lookup                = simple_lookup,
        .link                = simple_link,
        .unlink                = simple_unlink,
        .symlink        = hugetlbfs_symlink,
        .mkdir                = hugetlbfs_mkdir,
        .rmdir                = simple_rmdir,
        .mknod                = hugetlbfs_mknod,
        .rename                = simple_rename,
        .setattr        = hugetlbfs_setattr,
        .tmpfile        = hugetlbfs_tmpfile,
};

/* Inode operations table that overrides only ->setattr. */
static const struct inode_operations hugetlbfs_inode_operations = {
        .setattr        = hugetlbfs_setattr,
};

/* Superblock operations; installed as sb->s_op by hugetlbfs_fill_super(). */
static const struct super_operations hugetlbfs_ops = {
        .alloc_inode    = hugetlbfs_alloc_inode,
        .free_inode     = hugetlbfs_free_inode,
        .destroy_inode  = hugetlbfs_destroy_inode,
        .evict_inode        = hugetlbfs_evict_inode,
        .statfs                = hugetlbfs_statfs,
        .put_super        = hugetlbfs_put_super,
        .show_options        = hugetlbfs_show_options,
};

/*
 * Translate a size mount option into a count of huge pages for hstate @h.
 *
 * @size_opt is a byte count when @val_type is SIZE_STD and a percentage of
 * the hstate's pool (h->max_huge_pages) when it is SIZE_PERCENT.  With
 * NO_SIZE the option was not given and -1 is returned.
 */
static long
hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt,
                         enum hugetlbfs_size_type val_type)
{
        unsigned int shift;

        if (val_type == NO_SIZE)
                return -1;

        shift = huge_page_shift(h);

        if (val_type == SIZE_PERCENT) {
                /* percent -> bytes: pct * page_size * pool_pages / 100 */
                size_opt <<= shift;
                size_opt *= h->max_huge_pages;
                do_div(size_opt, 100);
        }

        /* bytes -> huge pages */
        return size_opt >> shift;
}

/*
 * Parse one mount parameter.
 *
 * @fc:    filesystem context; fc->fs_private is the hugetlbfs_fs_context
 *         that receives the parsed value.
 * @param: the parameter to parse.
 *
 * Returns 0 on success.  A negative result from fs_parse() is returned
 * unchanged; an unsupported page size or unrecognised option yields
 * -EINVAL; a malformed size/nr_inodes/min_size value yields the result of
 * invalfc().
 */
static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct hugetlbfs_fs_context *ctx = fc->fs_private;
        struct fs_parse_result result;
        struct hstate *h;
        char *rest;
        unsigned long ps;
        int opt;

        opt = fs_parse(fc, hugetlb_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_uid:
                ctx->uid = result.uid;
                return 0;

        case Opt_gid:
                ctx->gid = result.gid;
                return 0;

        case Opt_mode:
                /* Only the bits covered by 01777 are kept. */
                ctx->mode = result.uint_32 & 01777U;
                return 0;

        case Opt_size:
                /* memparse() will accept a K/M/G without a digit */
                if (!param->string || !isdigit(param->string[0]))
                        goto bad_val;
                ctx->max_size_opt = memparse(param->string, &rest);
                ctx->max_val_type = SIZE_STD;
                /* A '%' right after the number makes it a percentage. */
                if (*rest == '%')
                        ctx->max_val_type = SIZE_PERCENT;
                return 0;

        case Opt_nr_inodes:
                /* memparse() will accept a K/M/G without a digit */
                if (!param->string || !isdigit(param->string[0]))
                        goto bad_val;
                ctx->nr_inodes = memparse(param->string, &rest);
                return 0;

        case Opt_pagesize:
                /* The size must match an existing hstate. */
                ps = memparse(param->string, &rest);
                h = size_to_hstate(ps);
                if (!h) {
                        pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
                        return -EINVAL;
                }
                ctx->hstate = h;
                return 0;

        case Opt_min_size:
                /* memparse() will accept a K/M/G without a digit */
                if (!param->string || !isdigit(param->string[0]))
                        goto bad_val;
                ctx->min_size_opt = memparse(param->string, &rest);
                ctx->min_val_type = SIZE_STD;
                /* A '%' right after the number makes it a percentage. */
                if (*rest == '%')
                        ctx->min_val_type = SIZE_PERCENT;
                return 0;

        default:
                return -EINVAL;
        }

        /* Shared exit for values that do not start with a digit. */
bad_val:
        return invalfc(fc, "Bad value '%s' for mount option '%s'\n",
                      param->string, param->key);
}

/*
 * Validate the parsed options.
 *
 * Converts the size and min_size options into huge page counts using the
 * selected hstate (-1 when an option was not given) and rejects a minimum
 * that exceeds an explicitly specified maximum.
 *
 * Returns 0 on success, -EINVAL when min_size > size.
 */
static int hugetlbfs_validate(struct fs_context *fc)
{
        struct hugetlbfs_fs_context *ctx = fc->fs_private;
        struct hstate *h = ctx->hstate;

        ctx->max_hpages = hugetlbfs_size_to_hpages(h, ctx->max_size_opt,
                                                   ctx->max_val_type);
        ctx->min_hpages = hugetlbfs_size_to_hpages(h, ctx->min_size_opt,
                                                   ctx->min_val_type);

        /* Without an explicit maximum there is nothing to compare against. */
        if (ctx->max_val_type <= NO_SIZE)
                return 0;

        if (ctx->min_hpages <= ctx->max_hpages)
                return 0;

        pr_err("Minimum size can not be greater than maximum size\n");
        return -EINVAL;
}

static int
hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct hugetlbfs_fs_context *ctx = fc->fs_private;
        struct hugetlbfs_sb_info *sbinfo;

        sbinfo = kmalloc(sizeof(struct hugetlbfs_sb_info), GFP_KERNEL);
        if (!sbinfo)
                return -ENOMEM;
        sb->s_fs_info = sbinfo;
        spin_lock_init(&sbinfo->stat_lock);
        sbinfo->hstate                = ctx->hstate;
        sbinfo->max_inodes        = ctx->nr_inodes;
        sbinfo->free_inodes        = ctx->nr_inodes;
        sbinfo->spool                = NULL;
        sbinfo->uid                = ctx->uid;
        sbinfo->gid                = ctx->gid;
        sbinfo->mode                = ctx->mode;

        /*
         * Allocate and initialize subpool if maximum or minimum size is
         * specified.  Any needed reservations (for minimum size) are taken
         * when the subpool is created.
         */
        if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
                sbinfo->spool = hugepage_new_subpool(ctx->hstate,
                                                     ctx->max_hpages,
                                                     ctx->min_hpages);
                if (!sbinfo->spool)
                        goto out_free;
        }
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = huge_page_size(ctx->hstate);
        sb->s_blocksize_bits = huge_page_shift(ctx->hstate);
        sb->s_magic = HUGETLBFS_MAGIC;
        sb->s_op = &hugetlbfs_ops;
        sb->s_time_gran = 1;

        /*
         * Due to the special and limited functionality of hugetlbfs, it does
         * not work well as a stacking filesystem.
         */
        sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
        sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
        if (!sb->s_root)
                goto out_free;
        return 0;
out_free:
        kfree(sbinfo->spool);
        kfree(sbinfo);
        return -ENOMEM;
}

/*
 * ->get_tree: validate the parsed options, then build the superblock via
 * get_tree_nodev() with hugetlbfs_fill_super() as the fill callback.
 *
 * Returns the validation error if there is one, otherwise whatever
 * get_tree_nodev() returns.
 */
static int hugetlbfs_get_tree(struct fs_context *fc)
{
        int err;

        err = hugetlbfs_validate(fc);
        if (!err)
                err = get_tree_nodev(fc, hugetlbfs_fill_super);

        return err;
}

/*
 * ->free: release the hugetlbfs_fs_context allocated by
 * hugetlbfs_init_fs_context() and stored in fc->fs_private.
 */
static void hugetlbfs_fs_context_free(struct fs_context *fc)
{
        kfree(fc->fs_private);
}

/* Context operations; installed as fc->ops by hugetlbfs_init_fs_context(). */
static const struct fs_context_operations hugetlbfs_fs_context_ops = {
        .free                = hugetlbfs_fs_context_free,
        .parse_param        = hugetlbfs_parse_param,
        .get_tree        = hugetlbfs_get_tree,
};

/*
 * ->init_fs_context: allocate a zeroed hugetlbfs context, load the default
 * mount options into it and attach it, together with the context
 * operations, to @fc.
 *
 * Returns 0 on success, -ENOMEM if the context cannot be allocated.
 */
static int hugetlbfs_init_fs_context(struct fs_context *fc)
{
        struct hugetlbfs_fs_context *ctx;

        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        /* Default huge page size and ownership. */
        ctx->hstate = &default_hstate;
        ctx->uid = current_fsuid();
        ctx->gid = current_fsgid();
        ctx->mode = 0755;

        /* No size limit, no minimum size and no inode limit by default. */
        ctx->max_hpages = -1;
        ctx->min_hpages = -1;
        ctx->nr_inodes = -1;
        ctx->max_val_type = NO_SIZE;
        ctx->min_val_type = NO_SIZE;

        fc->fs_private = ctx;
        fc->ops = &hugetlbfs_fs_context_ops;
        return 0;
}

/*
 * Filesystem type descriptor for "hugetlbfs"; registered by
 * init_hugetlbfs_fs() and used for the internal mounts created by
 * mount_one_hugetlbfs().
 */
static struct file_system_type hugetlbfs_fs_type = {
        .name                        = "hugetlbfs",
        .init_fs_context        = hugetlbfs_init_fs_context,
        .parameters                = hugetlb_fs_parameters,
        .kill_sb                = kill_litter_super,
        .fs_flags               = FS_ALLOW_IDMAP,
};

/*
 * Internal mounts, one slot per hstate index.  Filled in by
 * init_hugetlbfs_fs(); a slot is NULL when that hstate could not be
 * mounted.  Consulted by hugetlb_file_setup().
 */
static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];

/*
 * Report whether the current task may create hugetlb-backed SHM segments:
 * either it has CAP_IPC_LOCK or it belongs to the group configured in
 * sysctl_hugetlb_shm_group.
 *
 * Returns 1 if allowed, 0 otherwise.
 */
static int can_do_hugetlb_shm(void)
{
        kgid_t shm_group = make_kgid(&init_user_ns, sysctl_hugetlb_shm_group);

        if (capable(CAP_IPC_LOCK))
                return 1;

        return in_group_p(shm_group) != 0;
}

/*
 * Map a page-size log value to the index of the matching hstate.
 *
 * Returns the hstate index, or -1 when hstate_sizelog() finds no hstate
 * for @page_size_log.
 */
static int get_hstate_idx(int page_size_log)
{
        struct hstate *h;

        h = hstate_sizelog(page_size_log);

        return h ? hstate_index(h) : -1;
}

/*
 * Create an unlinked hugetlbfs file on the internal mount for the given
 * page size and reserve huge pages for it.
 *
 * @name:          name passed to alloc_file_pseudo().
 * @size:          file size in bytes; stored in i_size.
 * @acctflag:      passed through to hugetlb_reserve_pages().
 * @creat_flags:   HUGETLB_SHMFS_INODE triggers the permission check and
 *                 marks the inode S_PRIVATE.
 * @page_size_log: selects the hstate (and thereby the internal mount).
 *
 * Returns the new file, or an ERR_PTR: -ENODEV when no hstate matches
 * @page_size_log, -ENOENT when that hstate has no internal mount, -EPERM
 * when the SHM permission check fails, -ENOSPC when no inode can be
 * obtained, -ENOMEM when the page reservation fails, or the error from
 * alloc_file_pseudo().
 *
 * Note that size should be aligned to proper hugepage size in caller side,
 * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
 */
struct file *hugetlb_file_setup(const char *name, size_t size,
                                vm_flags_t acctflag, int creat_flags,
                                int page_size_log)
{
        struct inode *inode;
        struct vfsmount *mnt;
        int hstate_idx;
        struct file *file;

        hstate_idx = get_hstate_idx(page_size_log);
        if (hstate_idx < 0)
                return ERR_PTR(-ENODEV);

        mnt = hugetlbfs_vfsmount[hstate_idx];
        if (!mnt)
                return ERR_PTR(-ENOENT);

        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
                struct ucounts *ucounts = current_ucounts();

                /*
                 * The request is refused either way; the lock attempt only
                 * decides whether the one-time warning is printed, and is
                 * undone immediately.
                 */
                if (user_shm_lock(size, ucounts)) {
                        pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
                                current->comm, current->pid);
                        user_shm_unlock(size, ucounts);
                }
                return ERR_PTR(-EPERM);
        }

        /* Result if no inode can be obtained below. */
        file = ERR_PTR(-ENOSPC);
        /* hugetlbfs_vfsmount[] mounts do not use idmapped mounts.  */
        inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
                                    S_IFREG | S_IRWXUGO, 0);
        if (!inode)
                goto out;
        if (creat_flags == HUGETLB_SHMFS_INODE)
                inode->i_flags |= S_PRIVATE;

        inode->i_size = size;
        /* The file is never linked into a directory. */
        clear_nlink(inode);

        /* Reserve size >> huge_page_shift pages starting at offset 0. */
        if (!hugetlb_reserve_pages(inode, 0,
                        size >> huge_page_shift(hstate_inode(inode)), NULL,
                        acctflag))
                file = ERR_PTR(-ENOMEM);
        else
                file = alloc_file_pseudo(inode, mnt, name, O_RDWR,
                                        &hugetlbfs_file_operations);
        if (!IS_ERR(file))
                return file;

        /* Failure after the inode was obtained: drop our reference. */
        iput(inode);
out:
        return file;
}

/*
 * Create the internal (SB_KERNMOUNT) hugetlbfs mount for hstate @h.
 *
 * A fresh filesystem context is created, its hstate is overridden with
 * @h, and the context is mounted and then released.
 *
 * Returns the new vfsmount, or an ERR_PTR if creating the context or
 * mounting fails (an error is logged in that case).
 */
static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
{
        struct fs_context *fc;
        struct vfsmount *mnt;

        fc = fs_context_for_mount(&hugetlbfs_fs_type, SB_KERNMOUNT);
        if (IS_ERR(fc)) {
                mnt = ERR_CAST(fc);
        } else {
                struct hugetlbfs_fs_context *ctx = fc->fs_private;

                /* Replace the default hstate chosen by init_fs_context. */
                ctx->hstate = h;
                mnt = fc_mount(fc);
                put_fs_context(fc);
        }
        if (IS_ERR(mnt))
                /* Terminate the message so it is emitted as a full line. */
                pr_err("Cannot mount internal hugetlbfs for page size %luK\n",
                       huge_page_size(h) / SZ_1K);
        return mnt;
}

/*
 * Initcall: set up hugetlbfs.
 *
 * Creates the inode slab cache, registers the filesystem type and creates
 * the internal mounts.  The mount for the default hstate is mandatory;
 * mounts for the remaining hstates are best-effort and leave a NULL slot
 * in hugetlbfs_vfsmount[] on failure.
 *
 * Returns 0 on success, -ENOTSUPP when huge pages are not supported, or
 * the error from the failing setup step.
 */
static int __init init_hugetlbfs_fs(void)
{
        struct vfsmount *mnt;
        struct hstate *h;
        int error;
        int i;

        if (!hugepages_supported()) {
                pr_info("disabling because there are no supported hugepage sizes\n");
                return -ENOTSUPP;
        }

        hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
                                        sizeof(struct hugetlbfs_inode_info),
                                        0, SLAB_ACCOUNT, init_once);
        if (!hugetlbfs_inode_cachep) {
                error = -ENOMEM;
                goto out;
        }

        error = register_filesystem(&hugetlbfs_fs_type);
        if (error)
                goto out_free;

        /* The default hstate must have a mount. */
        mnt = mount_one_hugetlbfs(&default_hstate);
        if (IS_ERR(mnt)) {
                error = PTR_ERR(mnt);
                goto out_unreg;
        }
        hugetlbfs_vfsmount[default_hstate_idx] = mnt;

        /* Every other hstate is optional; i tracks the hstate index. */
        i = 0;
        for_each_hstate(h) {
                if (i != default_hstate_idx) {
                        mnt = mount_one_hugetlbfs(h);
                        hugetlbfs_vfsmount[i] = IS_ERR(mnt) ? NULL : mnt;
                }
                i++;
        }

        return 0;

 out_unreg:
        (void)unregister_filesystem(&hugetlbfs_fs_type);
 out_free:
        kmem_cache_destroy(hugetlbfs_inode_cachep);
 out:
        return error;
}
fs_initcall(init_hugetlbfs_fs)













































   11 
































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_POLL_H
#define _LINUX_POLL_H


#include <linux/compiler.h>
#include <linux/ktime.h>
#include <linux/wait.h>
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <uapi/linux/poll.h>
#include <uapi/linux/eventpoll.h>

/*
 * At most ~832 bytes of stack space are used in sys_select/sys_poll before
 * additional memory is allocated.
 *
 * FRONTEND_STACK_ALLOC bytes of that budget go to the select/poll front
 * ends (SELECT_STACK_ALLOC and POLL_STACK_ALLOC are both aliases for it);
 * the remainder, WQUEUES_STACK_ALLOC, determines how many
 * struct poll_table_entry fit inline in struct poll_wqueues
 * (N_INLINE_POLL_ENTRIES).
 */
#define MAX_STACK_ALLOC 832
#define FRONTEND_STACK_ALLOC        256
#define SELECT_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define POLL_STACK_ALLOC        FRONTEND_STACK_ALLOC
#define WQUEUES_STACK_ALLOC        (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC)
#define N_INLINE_POLL_ENTRIES        (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry))

/* Mask returned by vfs_poll() for files that have no ->poll method. */
#define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

/* Forward declaration; the structure is defined below. */
struct poll_table_struct;

/*
 * structures and helpers for f_op->poll implementations
 *
 * poll_queue_proc is the callback type stored in a poll table; poll_wait()
 * calls it with the file, the wait queue head and the poll table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/*
 * Poll table handed to f_op->poll implementations.
 *
 * Do not touch the structure directly, use the access function
 * poll_requested_events() instead.
 */
typedef struct poll_table_struct {
        poll_queue_proc _qproc;        /* callback invoked by poll_wait(); may be NULL */
        __poll_t _key;                /* event mask returned by poll_requested_events() */
} poll_table;

/*
 * Invoke the poll table's queueing callback for @wait_address.
 *
 * Does nothing when @p is NULL or carries no callback.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
        if (!p || !p->_qproc)
                return;

        p->_qproc(filp, wait_address, p);
        /*
         * This memory barrier is paired with the one in wq_has_sleeper().
         * See the comment above prepare_to_wait(): we need to ensure
         * that subsequent tests in this thread can't be reordered with
         * __add_wait_queue() in _qproc() paths.
         */
        smp_mb();
}

/*
 * Return the set of events that the application wants to poll for.
 * This is useful for drivers that need to know whether a DMA transfer has
 * to be started implicitly on poll(). You typically only want to do that
 * if the application is actually polling for POLLIN and/or POLLOUT.
 *
 * A NULL poll table is treated as "all events requested".
 */
static inline __poll_t poll_requested_events(const poll_table *p)
{
        if (!p)
                return ~(__poll_t)0;

        return p->_key;
}

/*
 * Initialise a poll table: enable every event bit in the key and install
 * @qproc as the queueing callback.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
        pt->_key = ~(__poll_t)0; /* all events enabled */
        pt->_qproc = qproc;
}

/* True when the file's operations table provides a ->poll method. */
static inline bool file_can_poll(struct file *file)
{
        return !!file->f_op->poll;
}

/*
 * Poll @file through its ->poll method.  Files without one report
 * DEFAULT_POLLMASK.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
        __poll_t mask;

        if (unlikely(!file->f_op->poll))
                mask = DEFAULT_POLLMASK;
        else
                mask = file->f_op->poll(file, pt);

        return mask;
}

/*
 * One entry of a select/poll wait table; struct poll_wqueues embeds
 * N_INLINE_POLL_ENTRIES of these.
 * NOTE(review): the field roles below are inferred from names and types;
 * confirm against fs/select.c.
 */
struct poll_table_entry {
        struct file *filp;                /* presumably the file being waited on */
        __poll_t key;                        /* presumably the events of interest */
        wait_queue_entry_t wait;        /* wait queue entry */
        wait_queue_head_t *wait_address;        /* presumably the head `wait` is queued on */
};

/*
 * Structures and helpers for select/poll syscall
 *
 * struct poll_wqueues bundles a poll_table with a fixed number of inline
 * wait entries; it is set up and torn down with poll_initwait() and
 * poll_freewait().
 */
struct poll_wqueues {
        poll_table pt;
        struct poll_table_page *table;
        struct task_struct *polling_task;
        int triggered;
        int error;
        int inline_index;
        /* Sized so the entries fit in WQUEUES_STACK_ALLOC bytes. */
        struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/*
 * Helpers implemented outside this header.
 * NOTE(review): going by the names, poll_initwait()/poll_freewait() set up
 * and release a struct poll_wqueues - confirm against the definitions.
 */
extern void poll_initwait(struct poll_wqueues *pwq);
extern void poll_freewait(struct poll_wqueues *pwq);
extern u64 select_estimate_accuracy(struct timespec64 *tv);

/* (largest positive s64 value / HZ) - 1 */
#define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)

extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time);

extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
                                   long nsec);

/*
 * __MAP() isolates the bits of @v selected by @from and rescales them to
 * the position of @to: multiplied by to/from when @from is the smaller
 * value, divided by from/to otherwise.
 * NOTE(review): this only relocates a flag correctly if @from and @to are
 * single-bit values - presumably true for the POLL and EPOLL constants.
 * The macro is private to the two helpers below and is undefined after them.
 */
#define __MAP(v, from, to) \
        (from < to ? (v & from) * (to/from) : (v & from) / (from/to))

/*
 * Convert a kernel-internal __poll_t mask (EPOLL* bits) to the 16-bit
 * POLL* representation, one flag at a time.
 */
static inline __u16 mangle_poll(__poll_t val)
{
        __u16 v = (__force __u16)val;
#define M(X) __MAP(v, (__force __u16)EPOLL##X, POLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}

/*
 * Inverse of mangle_poll(): convert a 16-bit POLL* mask to the
 * kernel-internal __poll_t (EPOLL* bits) representation.
 */
static inline __poll_t demangle_poll(u16 val)
{
#define M(X) (__force __poll_t)__MAP(val, POLL##X, (__force __u16)EPOLL##X)
        return M(IN) | M(OUT) | M(PRI) | M(ERR) | M(NVAL) |
                M(RDNORM) | M(RDBAND) | M(WRNORM) | M(WRBAND) |
                M(HUP) | M(RDHUP) | M(MSG);
#undef M
}
#undef __MAP


#endif /* _LINUX_POLL_H */





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
/* Copyright (c) 2018, Mellanox Technologies All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <crypto/aead.h>
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <net/dst.h>
#include <net/inet_connection_sock.h>
#include <net/tcp.h>
#include <net/tls.h>
#include <linux/skbuff_ref.h>

#include "tls.h"
#include "trace.h"

/* device_offload_lock is used to synchronize tls_dev_add
 * against NETDEV_DOWN notifications.
 */
static DECLARE_RWSEM(device_offload_lock);

/* Workqueue on which the destruct_work of TX offload contexts is queued. */
static struct workqueue_struct *destruct_wq __read_mostly;

/* Lists of offload contexts.  A context is unlinked from whichever list it
 * is on in tls_device_queue_ctx_destruction(), under tls_device_lock.
 */
static LIST_HEAD(tls_device_list);
static LIST_HEAD(tls_device_down_list);
static DEFINE_SPINLOCK(tls_device_lock);

/* NOTE(review): not referenced in this part of the file; see its users
 * further down for its purpose.
 */
static struct page *dummy_page;

/* Free the TX and/or RX offload context for each direction configured as
 * TLS_HW, then free the TLS context itself via tls_ctx_free() (called
 * with a NULL socket).
 */
static void tls_device_free_ctx(struct tls_context *ctx)
{
        if (ctx->tx_conf == TLS_HW)
                kfree(tls_offload_ctx_tx(ctx));

        if (ctx->rx_conf == TLS_HW)
                kfree(tls_offload_ctx_rx(ctx));

        tls_ctx_free(NULL, ctx);
}

/* Work handler for a TX offload context's destruct_work (queued by
 * tls_device_queue_ctx_destruction()).  Asks the device to delete its TX
 * state, drops the netdev reference, and frees the context.
 */
static void tls_device_tx_del_task(struct work_struct *work)
{
        struct tls_offload_context_tx *offload_ctx =
                container_of(work, struct tls_offload_context_tx, destruct_work);
        struct tls_context *ctx = offload_ctx->ctx;
        struct net_device *netdev;

        /* Safe, because this is the destroy flow, refcount is 0, so
         * tls_device_down can't store this field in parallel.
         */
        netdev = rcu_dereference_protected(ctx->netdev,
                                           !refcount_read(&ctx->refcount));

        /* Device teardown comes first; ctx is freed last. */
        netdev->tlsdev_ops->tls_dev_del(netdev, ctx, TLS_OFFLOAD_CTX_DIR_TX);
        dev_put(netdev);
        ctx->netdev = NULL;
        tls_device_free_ctx(ctx);
}

/* Drop one reference on @ctx and destroy it when that was the last one.
 *
 * The context is unlinked from its list under tls_device_lock.  If it
 * still has a netdev and TX is offloaded (TLS_HW), the rest of the
 * teardown is deferred to tls_device_tx_del_task() on destruct_wq;
 * otherwise the context is freed right here, after the lock is dropped.
 */
static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
{
        struct net_device *netdev;
        unsigned long flags;
        bool async_cleanup;

        spin_lock_irqsave(&tls_device_lock, flags);
        /* Not the last reference: nothing more to do. */
        if (unlikely(!refcount_dec_and_test(&ctx->refcount))) {
                spin_unlock_irqrestore(&tls_device_lock, flags);
                return;
        }

        list_del(&ctx->list); /* Remove from tls_device_list / tls_device_down_list */

        /* Safe, because this is the destroy flow, refcount is 0, so
         * tls_device_down can't store this field in parallel.
         */
        netdev = rcu_dereference_protected(ctx->netdev,
                                           !refcount_read(&ctx->refcount));

        async_cleanup = netdev && ctx->tx_conf == TLS_HW;
        if (async_cleanup) {
                struct tls_offload_context_tx *offload_ctx = tls_offload_ctx_tx(ctx);

                /* queue_work inside the spinlock
                 * to make sure tls_device_down waits for that work.
                 */
                queue_work(destruct_wq, &offload_ctx->destruct_work);
        }
        spin_unlock_irqrestore(&tls_device_lock, flags);

        /* Synchronous path: no device-side TX state to delete. */
        if (!async_cleanup)
                tls_device_free_ctx(ctx);
}

/* Look up the lowest-level net device behind the socket's cached route and
 * take a reference on it.  We assume that the socket is already connected.
 *
 * Returns the held device, or NULL when the socket has no dst entry.
 */
static struct net_device *get_netdev_for_sock(struct sock *sk)
{
        struct net_device *lowest = NULL;
        struct dst_entry *route;

        route = sk_dst_get(sk);
        if (likely(route)) {
                lowest = netdev_sk_get_lowest_dev(route->dev, sk);
                dev_hold(lowest);
        }

        /* Drop the dst reference obtained by sk_dst_get(). */
        dst_release(route);

        return lowest;
}

/* Release every frag referenced by @record, then free the record itself. */
static void destroy_record(struct tls_record_info *record)
{
        int idx = 0;

        while (idx < record->num_frags) {
                __skb_frag_unref(&record->frags[idx], false);
                idx++;
        }

        kfree(record);
}

/* Unlink and destroy every record on @offload_ctx->records_list and
 * invalidate the retransmit hint, which would otherwise dangle.
 */
static void delete_all_records(struct tls_offload_context_tx *offload_ctx)
{
        struct tls_record_info *rec, *next;

        list_for_each_entry_safe(rec, next, &offload_ctx->records_list, list) {
                list_del(&rec->list);
                destroy_record(rec);
        }

        offload_ctx->retransmit_hint = NULL;
}

/* Free the TX records that end at or before @acked_seq.
 *
 * Runs under the TX offload context spinlock. The number of records
 * released is added to unacked_record_sn so it keeps naming the first
 * record still on the list.
 */
static void tls_tcp_clean_acked(struct sock *sk, u32 acked_seq)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_offload_context_tx *tx_ctx;
        struct tls_record_info *rec, *next;
        unsigned long irq_flags;
        u64 n_freed = 0;

        if (!tls_ctx)
                return;

        tx_ctx = tls_offload_ctx_tx(tls_ctx);

        spin_lock_irqsave(&tx_ctx->lock, irq_flags);

        /* forget the hint if the record it points at is about to go away */
        rec = tx_ctx->retransmit_hint;
        if (rec && !before(acked_seq, rec->end_seq))
                tx_ctx->retransmit_hint = NULL;

        list_for_each_entry_safe(rec, next, &tx_ctx->records_list, list) {
                /* stop at the first record ending past acked_seq */
                if (before(acked_seq, rec->end_seq))
                        break;

                list_del(&rec->list);
                destroy_record(rec);
                n_freed++;
        }

        tx_ctx->unacked_record_sn += n_freed;
        spin_unlock_irqrestore(&tx_ctx->lock, irq_flags);
}

/* Socket destructor installed by tls_device_attach().
 *
 * By the time this runs there should be no references on the socket and
 * no in-flight SKBs associated with it, so all resources can be freed.
 * The destructor saved in the TLS context is called first; then, if TX is
 * in TLS_HW mode, the open record, the record list and the AEAD transform
 * are released and clean-acked handling is disabled. Finally the context
 * reference is dropped.
 */
void tls_device_sk_destruct(struct sock *sk)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_offload_context_tx *tx_ctx = tls_offload_ctx_tx(tls_ctx);

        tls_ctx->sk_destruct(sk);

        if (tls_ctx->tx_conf != TLS_HW)
                goto drop_ctx_ref;

        if (tx_ctx->open_record)
                destroy_record(tx_ctx->open_record);
        delete_all_records(tx_ctx);
        crypto_free_aead(tx_ctx->aead_send);
        clean_acked_data_disable(tcp_sk(sk));

drop_ctx_ref:
        tls_device_queue_ctx_destruction(tls_ctx);
}
EXPORT_SYMBOL_GPL(tls_device_sk_destruct);

/* Free the partially sent record, if any, held by @sk's TLS context. */
void tls_device_free_resources_tx(struct sock *sk)
{
        tls_free_partial_record(sk, tls_get_ctx(sk));
}

/* Record a TX resync request.
 *
 * Sets TLS_TX_SYNC_SCHED in the context flags; tls_push_record() checks the
 * bit and performs the resync. @got_seq and @exp_seq are only passed to the
 * tracepoint. Warns if a resync was already scheduled.
 */
void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq)
{
        struct tls_context *tls_ctx;

        tls_ctx = tls_get_ctx(sk);
        trace_tls_device_tx_resync_req(sk, got_seq, exp_seq);

        /* the bit is expected to be clear when a new request arrives */
        WARN_ON(test_and_set_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags));
}
EXPORT_SYMBOL_GPL(tls_offload_tx_resync_request);

/* Send the current TX record sequence number to the device for TCP
 * sequence @seq.
 *
 * The driver callback runs under device_offload_lock (read side) and only
 * if the context still has a netdev. TLS_TX_SYNC_SCHED is cleared only
 * when the callback did not report an error, so a failed resync is tried
 * again on the next tls_push_record().
 */
static void tls_device_resync_tx(struct sock *sk, struct tls_context *tls_ctx,
                                 u32 seq)
{
        struct net_device *dev;
        u8 *rec_sn;
        int ret = 0;

        tcp_write_collapse_fence(sk);
        rec_sn = tls_ctx->tx.rec_seq;

        trace_tls_device_tx_resync_send(sk, seq, rec_sn);

        down_read(&device_offload_lock);
        dev = rcu_dereference_protected(tls_ctx->netdev,
                                        lockdep_is_held(&device_offload_lock));
        if (dev)
                ret = dev->tlsdev_ops->tls_dev_resync(dev, sk, seq, rec_sn,
                                                      TLS_OFFLOAD_CTX_DIR_TX);
        up_read(&device_offload_lock);

        if (!ret)
                clear_bit_unlock(TLS_TX_SYNC_SCHED, &tls_ctx->flags);
}

/* Append @size bytes located at @pfrag->offset in @pfrag->page to @record.
 *
 * If the bytes directly follow the record's last frag in the same page,
 * that frag is grown; otherwise a new frag is added and a page reference
 * is taken. @pfrag->offset and @record->len advance by @size either way.
 */
static void tls_append_frag(struct tls_record_info *record,
                            struct page_frag *pfrag,
                            int size)
{
        skb_frag_t *tail = &record->frags[record->num_frags - 1];

        if (skb_frag_page(tail) != pfrag->page ||
            skb_frag_off(tail) + skb_frag_size(tail) != pfrag->offset) {
                /* not contiguous with the last frag: start a new one */
                tail++;
                skb_frag_fill_page_desc(tail, pfrag->page, pfrag->offset,
                                        size);
                record->num_frags++;
                get_page(pfrag->page);
        } else {
                skb_frag_size_add(tail, size);
        }

        record->len += size;
        pfrag->offset += size;
}

/* Close the bookkeeping for @record and hand its frags to tls_push_sg().
 *
 * The record is stamped with its end sequence number, appended to the
 * records list and detached from open_record. A pending TX resync is
 * performed before the record sequence number advances. Each frag is then
 * mirrored into sg_tx_data, charged to the socket and given an extra page
 * reference.
 *
 * Returns whatever tls_push_sg() returns.
 */
static int tls_push_record(struct sock *sk,
                           struct tls_context *ctx,
                           struct tls_offload_context_tx *offload_ctx,
                           struct tls_record_info *record,
                           int flags)
{
        struct tls_prot_info *prot = &ctx->prot_info;
        struct tcp_sock *tp = tcp_sk(sk);
        skb_frag_t *cur;
        int idx;

        record->end_seq = tp->write_seq + record->len;
        list_add_tail_rcu(&record->list, &offload_ctx->records_list);
        offload_ctx->open_record = NULL;

        if (test_bit(TLS_TX_SYNC_SCHED, &ctx->flags))
                tls_device_resync_tx(sk, ctx, tp->write_seq);

        tls_advance_record_sn(sk, prot, &ctx->tx);

        idx = 0;
        while (idx < record->num_frags) {
                cur = &record->frags[idx];

                sg_unmark_end(&offload_ctx->sg_tx_data[idx]);
                sg_set_page(&offload_ctx->sg_tx_data[idx], skb_frag_page(cur),
                            skb_frag_size(cur), skb_frag_off(cur));
                sk_mem_charge(sk, skb_frag_size(cur));
                get_page(skb_frag_page(cur));

                idx++;
        }
        sg_mark_end(&offload_ctx->sg_tx_data[record->num_frags - 1]);

        /* scatterlist is complete, transmit it */
        return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}

static void tls_device_record_close(struct sock *sk,
                                    struct tls_context *ctx,
                                    struct tls_record_info *record,
                                    struct page_frag *pfrag,
                                    unsigned char record_type)
{
        struct tls_prot_info *prot = &ctx->prot_info;
        struct page_frag dummy_tag_frag;

        /* append tag
         * device will fill in the tag, we just need to append a placeholder
         * use socket memory to improve coalescing (re-using a single buffer
         * increases frag count)
         * if we can't allocate memory now use the dummy page
         */
        if (unlikely(pfrag->size - pfrag->offset < prot->tag_size) &&
            !skb_page_frag_refill(prot->tag_size, pfrag, sk->sk_allocation)) {
                dummy_tag_frag.page = dummy_page;
                dummy_tag_frag.offset = 0;
                pfrag = &dummy_tag_frag;
        }
        tls_append_frag(record, pfrag, prot->tag_size);

        /* fill prepend */
        tls_fill_prepend(ctx, skb_frag_address(&record->frags[0]),
                         record->len - prot->overhead_size,
                         record_type);
}

/* Allocate a record and make it @offload_ctx->open_record.
 *
 * Frag 0 covers @prepend_size bytes at the current position of @pfrag
 * (the prepend is written later by tls_device_record_close()); a page
 * reference is taken and @pfrag->offset advances past those bytes.
 *
 * Returns 0 on success or -ENOMEM if the record cannot be allocated.
 */
static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
                                 struct page_frag *pfrag,
                                 size_t prepend_size)
{
        struct tls_record_info *rec;

        rec = kmalloc(sizeof(*rec), GFP_KERNEL);
        if (!rec)
                return -ENOMEM;

        skb_frag_fill_page_desc(&rec->frags[0], pfrag->page, pfrag->offset,
                                prepend_size);
        get_page(pfrag->page);
        pfrag->offset += prepend_size;

        rec->len = prepend_size;
        rec->num_frags = 1;

        offload_ctx->open_record = rec;
        return 0;
}

/* Make sure there is an open record and page-frag space to append to it.
 *
 * Without an open record, @pfrag is refilled for at least @prepend_size
 * bytes and a new record is created; on refill failure the protocol is
 * told about memory pressure and the send buffer is moderated. If the
 * record already existed, or the new prepend used up the frag, @pfrag is
 * refilled through sk_page_frag_refill().
 *
 * Returns 0 on success, -ENOMEM (or the tls_create_new_record() error) on
 * failure.
 */
static int tls_do_allocation(struct sock *sk,
                             struct tls_offload_context_tx *offload_ctx,
                             struct page_frag *pfrag,
                             size_t prepend_size)
{
        int err;

        if (offload_ctx->open_record)
                goto refill;

        if (unlikely(!skb_page_frag_refill(prepend_size, pfrag,
                                           sk->sk_allocation))) {
                READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk);
                sk_stream_moderate_sndbuf(sk);
                return -ENOMEM;
        }

        err = tls_create_new_record(offload_ctx, pfrag, prepend_size);
        if (err)
                return err;

        /* space remains behind the prepend, no further refill needed */
        if (pfrag->size > pfrag->offset)
                return 0;

refill:
        if (!sk_page_frag_refill(sk, pfrag))
                return -ENOMEM;

        return 0;
}

/* Copy @bytes bytes from iterator @i to @addr in three steps:
 * a head copy of up to the distance from @addr to the next multiple of
 * SMP_CACHE_BYTES (assuming that constant is a power of two), a
 * copy_from_iter_nocache() of the whole SMP_CACHE_BYTES multiples, and a
 * regular copy of the remaining tail.
 *
 * Returns 0 on success or -EFAULT if any step copies fewer bytes than asked.
 */
static int tls_device_copy_data(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t head, body;

        head = ~((unsigned long)addr - 1) & (SMP_CACHE_BYTES - 1);
        if (head) {
                head = min(head, bytes);
                if (copy_from_iter(addr, head, i) != head)
                        return -EFAULT;
                addr += head;
                bytes -= head;
        }

        body = round_down(bytes, SMP_CACHE_BYTES);
        if (copy_from_iter_nocache(addr, body, i) != body)
                return -EFAULT;
        addr += body;
        bytes -= body;

        if (!bytes)
                return 0;

        return copy_from_iter(addr, bytes, i) == bytes ? 0 : -EFAULT;
}

/* Turn payload from @iter into TLS records and push them for transmission.
 *
 * @sk:          socket whose TX path is offloaded
 * @iter:        source of payload bytes (callers in this file also pass an
 *               empty iterator with @size == 0 to flush an open record)
 * @size:        number of payload bytes to consume from @iter
 * @flags:       MSG_* flags; anything other than MSG_MORE, MSG_DONTWAIT,
 *               MSG_NOSIGNAL, MSG_SPLICE_PAGES and MSG_EOR is rejected
 * @record_type: TLS record type passed on when a record is closed
 *
 * Payload is appended to ctx->open_record (created on demand), either by
 * copying into the socket page frag or, with MSG_SPLICE_PAGES, by
 * referencing the pages extracted from @iter. A record is closed and pushed
 * when it reaches max_open_record_len, when it has MAX_SKB_FRAGS - 1 frags,
 * or when all data has been consumed and MSG_MORE is not set. With MSG_MORE
 * the last record stays open and pending_open_record_frags is set.
 *
 * Returns the number of bytes consumed when that is non-zero; otherwise the
 * status of the last operation (0 or a negative errno on the paths visible
 * here).
 */
static int tls_push_data(struct sock *sk,
                         struct iov_iter *iter,
                         size_t size, int flags,
                         unsigned char record_type)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_prot_info *prot = &tls_ctx->prot_info;
        struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
        struct tls_record_info *record;
        int tls_push_record_flags;
        struct page_frag *pfrag;
        size_t orig_size = size;
        u32 max_open_record_len;
        bool more = false;
        bool done = false;
        int copy, rc = 0;
        long timeo;

        /* reject flags this path does not handle */
        if (flags &
            ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL |
              MSG_SPLICE_PAGES | MSG_EOR))
                return -EOPNOTSUPP;

        /* MSG_MORE and MSG_EOR contradict each other */
        if ((flags & (MSG_MORE | MSG_EOR)) == (MSG_MORE | MSG_EOR))
                return -EINVAL;

        if (unlikely(sk->sk_err))
                return -sk->sk_err;

        /* Every record but the last is pushed with MSG_MORE; the last one
         * uses the caller's flags (see last_record below).
         */
        flags |= MSG_SENDPAGE_DECRYPTED;
        tls_push_record_flags = flags | MSG_MORE;

        timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
        /* finish a previously interrupted push before adding new data */
        if (tls_is_partially_sent_record(tls_ctx)) {
                rc = tls_push_partial_record(sk, tls_ctx, flags);
                if (rc < 0)
                        return rc;
        }

        pfrag = sk_page_frag(sk);

        /* TLS_HEADER_SIZE is not counted as part of the TLS record, and
         * we need to leave room for an authentication tag.
         */
        max_open_record_len = TLS_MAX_PAYLOAD_SIZE +
                              prot->prepend_size;
        do {
                rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size);
                if (unlikely(rc)) {
                        /* out of memory: wait and retry the allocation */
                        rc = sk_stream_wait_memory(sk, &timeo);
                        if (!rc)
                                continue;

                        record = ctx->open_record;
                        if (!record)
                                break;
handle_error:
                        /* Also reached by goto from the copy/splice paths
                         * below, with rc holding the error and record
                         * pointing at the open record.
                         */
                        if (record_type != TLS_RECORD_TYPE_DATA) {
                                /* avoid sending partial
                                 * record with type !=
                                 * application_data
                                 */
                                size = orig_size;
                                destroy_record(record);
                                ctx->open_record = NULL;
                        } else if (record->len > prot->prepend_size) {
                                /* data record with payload: push what we
                                 * have as the last record
                                 */
                                goto last_record;
                        }

                        break;
                }

                record = ctx->open_record;

                /* never let the open record exceed max_open_record_len */
                copy = min_t(size_t, size, max_open_record_len - record->len);
                if (copy && (flags & MSG_SPLICE_PAGES)) {
                        /* zero-copy: reference one page from the iterator */
                        struct page_frag zc_pfrag;
                        struct page **pages = &zc_pfrag.page;
                        size_t off;

                        rc = iov_iter_extract_pages(iter, &pages,
                                                    copy, 1, 0, &off);
                        if (rc <= 0) {
                                if (rc == 0)
                                        rc = -EIO;
                                goto handle_error;
                        }
                        copy = rc;

                        if (WARN_ON_ONCE(!sendpage_ok(zc_pfrag.page))) {
                                /* give the extracted bytes back */
                                iov_iter_revert(iter, copy);
                                rc = -EIO;
                                goto handle_error;
                        }

                        zc_pfrag.offset = off;
                        zc_pfrag.size = copy;
                        tls_append_frag(record, &zc_pfrag, copy);
                } else if (copy) {
                        /* copy path: limited by the space left in pfrag */
                        copy = min_t(size_t, copy, pfrag->size - pfrag->offset);

                        rc = tls_device_copy_data(page_address(pfrag->page) +
                                                  pfrag->offset, copy,
                                                  iter);
                        if (rc)
                                goto handle_error;
                        tls_append_frag(record, pfrag, copy);
                }

                size -= copy;
                if (!size) {
last_record:
                        /* the final record goes out with the caller's flags */
                        tls_push_record_flags = flags;
                        if (flags & MSG_MORE) {
                                /* keep the record open for more data */
                                more = true;
                                break;
                        }

                        done = true;
                }

                /* close and push when finished, full, or nearly out of
                 * frag slots
                 */
                if (done || record->len >= max_open_record_len ||
                    (record->num_frags >= MAX_SKB_FRAGS - 1)) {
                        tls_device_record_close(sk, tls_ctx, record,
                                                pfrag, record_type);

                        rc = tls_push_record(sk,
                                             tls_ctx,
                                             ctx,
                                             record,
                                             tls_push_record_flags);
                        if (rc < 0)
                                break;
                }
        } while (!done);

        tls_ctx->pending_open_record_frags = more;

        /* report progress in preference to the last status code */
        if (orig_size - size > 0)
                rc = orig_size - size;

        return rc;
}

/* sendmsg() entry point for sockets with TX device offload.
 *
 * MSG_SPLICE_PAGES is stripped unless zerocopy_sendfile is enabled on the
 * context. Control messages, if present, are processed first and may
 * change the record type; a failure there is returned without pushing any
 * data. The work is done under tx_lock and the socket lock.
 *
 * Returns the result of tls_process_cmsg() on failure, otherwise the
 * result of tls_push_data().
 */
int tls_device_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        unsigned char rec_type = TLS_RECORD_TYPE_DATA;
        int ret = 0;

        if (!tls_ctx->zerocopy_sendfile)
                msg->msg_flags &= ~MSG_SPLICE_PAGES;

        mutex_lock(&tls_ctx->tx_lock);
        lock_sock(sk);

        if (unlikely(msg->msg_controllen))
                ret = tls_process_cmsg(sk, msg, &rec_type);

        if (!ret)
                ret = tls_push_data(sk, &msg->msg_iter, size, msg->msg_flags,
                                    rec_type);

        release_sock(sk);
        mutex_unlock(&tls_ctx->tx_lock);
        return ret;
}

void tls_device_splice_eof(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct iov_iter iter = {};

        if (!tls_is_partially_sent_record(tls_ctx))
                return;

        mutex_lock(&tls_ctx->tx_lock);
        lock_sock(sk);

        if (tls_is_partially_sent_record(tls_ctx)) {
                iov_iter_bvec(&iter, ITER_SOURCE, NULL, 0, 0);
                tls_push_data(sk, &iter, 0, 0, TLS_RECORD_TYPE_DATA);
        }

        release_sock(sk);
        mutex_unlock(&tls_ctx->tx_lock);
}

/* Find the TX record that contains TCP sequence number @seq.
 *
 * @context:     TX offload context holding the records list
 * @seq:         TCP sequence number to look up
 * @p_record_sn: on success, receives the record sequence number of the
 *               returned record
 *
 * The search starts at context->retransmit_hint when the hint exists and
 * @seq is not before the hinted record's start; otherwise it restarts from
 * the head of the list. The hint (and hint_record_sn) is moved forward when
 * the match ends later than the current hint.
 *
 * Returns the matching record, or NULL if the list is empty or @seq does
 * not fall inside it. NOTE(review): nothing here takes context->lock;
 * presumably the caller holds it - confirm against callers.
 */
struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
                                       u32 seq, u64 *p_record_sn)
{
        u64 record_sn = context->hint_record_sn;
        struct tls_record_info *info, *last;

        info = context->retransmit_hint;
        if (!info ||
            before(seq, info->end_seq - info->len)) {
                /* if retransmit_hint is irrelevant start
                 * from the beginning of the list
                 */
                info = list_first_entry_or_null(&context->records_list,
                                                struct tls_record_info, list);
                if (!info)
                        return NULL;
                /* send the start_marker record if seq number is before the
                 * tls offload start marker sequence number. This record is
                 * required to handle TCP packets which are before TLS offload
                 * started.
                 *  And if it's not start marker, look if this seq number
                 * belongs to the list.
                 */
                if (likely(!tls_record_is_start_marker(info))) {
                        /* we have the first record, get the last record to see
                         * if this seq number belongs to the list.
                         */
                        last = list_last_entry(&context->records_list,
                                               struct tls_record_info, list);

                        if (!between(seq, tls_record_start_seq(info),
                                     last->end_seq))
                                return NULL;
                }
                /* counting restarts from the first record on the list */
                record_sn = context->unacked_record_sn;
        }

        /* We just need the _rcu for the READ_ONCE() */
        rcu_read_lock();
        list_for_each_entry_from_rcu(info, &context->records_list, list) {
                /* first record ending after seq is the one we want */
                if (before(seq, info->end_seq)) {
                        /* only ever move the hint forward */
                        if (!context->retransmit_hint ||
                            after(info->end_seq,
                                  context->retransmit_hint->end_seq)) {
                                context->hint_record_sn = record_sn;
                                context->retransmit_hint = info;
                        }
                        *p_record_sn = record_sn;
                        goto exit_rcu_unlock;
                }
                record_sn++;
        }
        /* walked off the end of the list without a match */
        info = NULL;

exit_rcu_unlock:
        rcu_read_unlock();
        return info;
}
EXPORT_SYMBOL(tls_get_record);

static int tls_device_push_pending_record(struct sock *sk, int flags)
{
        struct iov_iter iter;

        iov_iter_kvec(&iter, ITER_SOURCE, NULL, 0, 0);
        return tls_push_data(sk, &iter, 0, flags, TLS_RECORD_TYPE_DATA);
}

/* Write-space handler: retry sending a partially sent record, if any.
 *
 * The socket allocation mode is switched to GFP_ATOMIC for the duration of
 * the push and restored afterwards. The result of the push is ignored.
 */
void tls_device_write_space(struct sock *sk, struct tls_context *ctx)
{
        gfp_t saved_gfp;

        if (!tls_is_partially_sent_record(ctx))
                return;

        saved_gfp = sk->sk_allocation;

        WARN_ON_ONCE(sk->sk_write_pending);

        sk->sk_allocation = GFP_ATOMIC;
        tls_push_partial_record(sk, ctx,
                                MSG_DONTWAIT | MSG_NOSIGNAL |
                                MSG_SENDPAGE_DECRYPTED);
        sk->sk_allocation = saved_gfp;
}

/* Pass an RX resync (TCP sequence @seq, record sequence number @rcd_sn) to
 * the driver.
 *
 * The netdev is looked up under rcu_read_lock() and the callback is skipped
 * when the context no longer has one. LINUX_MIB_TLSRXDEVICERESYNC is
 * incremented whether or not the callback ran.
 */
static void tls_device_resync_rx(struct tls_context *tls_ctx,
                                 struct sock *sk, u32 seq, u8 *rcd_sn)
{
        struct tls_offload_context_rx *rx_off = tls_offload_ctx_rx(tls_ctx);
        struct net_device *dev;

        trace_tls_device_rx_resync_send(sk, seq, rcd_sn, rx_off->resync_type);

        rcu_read_lock();
        dev = rcu_dereference(tls_ctx->netdev);
        if (dev)
                dev->tlsdev_ops->tls_dev_resync(dev, sk, seq, rcd_sn,
                                                TLS_OFFLOAD_CTX_DIR_RX);
        rcu_read_unlock();

        TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICERESYNC);
}

/* Handle one record header for the asynchronous driver-requested RX resync.
 *
 * @resync_async: per-socket async resync state (log, loglen, rcd_delta, req)
 * @resync_req:   snapshot of resync_async->req; as decoded below, the upper
 *                32 bits hold the requested sequence number, bits 16-31 a
 *                length added to it to form req_end, and RESYNC_REQ_ASYNC
 *                marks the asynchronous stage
 * @seq:          in: sequence number of the current record header;
 *                out: replaced by the requested sequence number when a
 *                logged entry matched
 * @rcd_delta:    out: how many records back the matched header is
 *                (0 unless a logged entry matched)
 *
 * Returns true when the request matched and was claimed (req atomically
 * reset to 0), in which case the caller should resync; false otherwise.
 */
static bool
tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async,
                           s64 resync_req, u32 *seq, u16 *rcd_delta)
{
        u32 is_async = resync_req & RESYNC_REQ_ASYNC;
        u32 req_seq = resync_req >> 32;
        u32 req_end = req_seq + ((resync_req >> 16) & 0xffff);
        u16 i;

        *rcd_delta = 0;

        if (is_async) {
                /* shouldn't get to wraparound:
                 * too long in async stage, something bad happened
                 */
                if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX))
                        return false;

                /* asynchronous stage: log all headers seq such that
                 * req_seq <= seq <= end_seq, and wait for real resync request
                 */
                if (before(*seq, req_seq))
                        return false;
                if (!after(*seq, req_end) &&
                    resync_async->loglen < TLS_DEVICE_RESYNC_ASYNC_LOGMAX)
                        resync_async->log[resync_async->loglen++] = *seq;

                /* counts every record seen from req_seq on, logged or not */
                resync_async->rcd_delta++;

                return false;
        }

        /* synchronous stage: check against the logged entries and
         * proceed to check the next entries if no match was found
         */
        for (i = 0; i < resync_async->loglen; i++)
                if (req_seq == resync_async->log[i] &&
                    atomic64_try_cmpxchg(&resync_async->req, &resync_req, 0)) {
                        /* entry i was logged (rcd_delta - i) records ago */
                        *rcd_delta = resync_async->rcd_delta - i;
                        *seq = req_seq;
                        resync_async->loglen = 0;
                        resync_async->rcd_delta = 0;
                        return true;
                }

        /* no logged header matched: discard the log */
        resync_async->loglen = 0;
        resync_async->rcd_delta = 0;

        /* the current record itself may be the requested one */
        if (req_seq == *seq &&
            atomic64_try_cmpxchg(&resync_async->req,
                                 &resync_req, 0))
                return true;

        return false;
}

/* Decide, for a newly parsed RX record, whether the device must be resynced
 * and if so send the resync.
 *
 * @sk:      socket with RX device offload
 * @rcd_len: length of the record
 * @seq:     TCP sequence number of the record (the arithmetic below treats
 *           it as the position of the record's first byte - TODO confirm
 *           against callers)
 *
 * Does nothing unless RX is in TLS_HW mode and the device is not degraded.
 * The decision depends on rx_ctx->resync_type:
 *  - DRIVER_REQ: resync only if a request is pending for exactly this
 *    record and it can be claimed atomically;
 *  - CORE_NEXT_HINT: resync only if one was scheduled and no data beyond
 *    this record is queued yet; the resync then targets the next record;
 *  - DRIVER_REQ_ASYNC: delegated to tls_device_rx_resync_async().
 */
void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct tls_offload_context_rx *rx_ctx;
        u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE];
        u32 sock_data, is_req_pending;
        struct tls_prot_info *prot;
        s64 resync_req;
        u16 rcd_delta;
        u32 req_seq;

        if (tls_ctx->rx_conf != TLS_HW)
                return;
        if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags)))
                return;

        prot = &tls_ctx->prot_info;
        rx_ctx = tls_offload_ctx_rx(tls_ctx);
        /* work on a private copy; some cases below adjust it */
        memcpy(rcd_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);

        switch (rx_ctx->resync_type) {
        case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ:
                resync_req = atomic64_read(&rx_ctx->resync_req);
                /* requested sequence number is in the upper 32 bits */
                req_seq = resync_req >> 32;
                seq += TLS_HEADER_SIZE - 1;
                /* truncation to u32 keeps only the low half of the request */
                is_req_pending = resync_req;

                if (likely(!is_req_pending) || req_seq != seq ||
                    !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
                        return;
                break;
        case TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT:
                if (likely(!rx_ctx->resync_nh_do_now))
                        return;

                /* head of next rec is already in, note that the sock_inq will
                 * include the currently parsed message when called from parser
                 */
                sock_data = tcp_inq(sk);
                if (sock_data > rcd_len) {
                        trace_tls_device_rx_resync_nh_delay(sk, sock_data,
                                                            rcd_len);
                        return;
                }

                rx_ctx->resync_nh_do_now = 0;
                /* resync at the start of the next record */
                seq += rcd_len;
                tls_bigint_increment(rcd_sn, prot->rec_seq_size);
                break;
        case TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC:
                resync_req = atomic64_read(&rx_ctx->resync_async->req);
                is_req_pending = resync_req;
                if (likely(!is_req_pending))
                        return;

                if (!tls_device_rx_resync_async(rx_ctx->resync_async,
                                                resync_req, &seq, &rcd_delta))
                        return;
                /* rewind to the record number of the matched header */
                tls_bigint_subtract(rcd_sn, rcd_delta);
                break;
        }

        tls_device_resync_rx(tls_ctx, sk, seq, rcd_sn);
}

/* Core-driven ("next hint") RX resync, called for a record the device left
 * fully encrypted.
 *
 * Only active for TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT; for other types
 * the device requests resyncs itself. Failed records are counted and a
 * resync is attempted once the count exceeds decrypted_tgt, which is then
 * raised so repeated failures resync less often. If the head of the next
 * record is already queued, the resync is left to the parser via
 * resync_nh_do_now; otherwise it is sent here for the next record.
 */
static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx,
                                           struct tls_offload_context_rx *ctx,
                                           struct sock *sk, struct sk_buff *skb)
{
        u8 next_sn[TLS_MAX_REC_SEQ_SIZE];
        struct tls_prot_info *prot;
        struct strp_msg *msg;

        /* device will request resyncs by itself based on stream scan */
        if (ctx->resync_type != TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT)
                return;

        /* already scheduled */
        if (ctx->resync_nh_do_now)
                return;

        /* seen decrypted fragments since last fully-failed record */
        if (ctx->resync_nh_reset) {
                ctx->resync_nh_reset = 0;
                ctx->resync_nh.decrypted_failed = 1;
                ctx->resync_nh.decrypted_tgt = TLS_DEVICE_RESYNC_NH_START_IVAL;
                return;
        }

        ctx->resync_nh.decrypted_failed++;
        if (ctx->resync_nh.decrypted_failed <= ctx->resync_nh.decrypted_tgt)
                return;

        /* doing resync, bump the next target in case it fails */
        if (ctx->resync_nh.decrypted_tgt < TLS_DEVICE_RESYNC_NH_MAX_IVAL)
                ctx->resync_nh.decrypted_tgt *= 2;
        else
                ctx->resync_nh.decrypted_tgt += TLS_DEVICE_RESYNC_NH_MAX_IVAL;

        msg = strp_msg(skb);

        if (tcp_inq(sk) > msg->full_len) {
                /* head of next rec is already in, parser will sync for us */
                trace_tls_device_rx_resync_nh_schedule(sk);
                ctx->resync_nh_do_now = 1;
                return;
        }

        /* resync now, using the sequence number of the next record */
        prot = &tls_ctx->prot_info;
        memcpy(next_sn, tls_ctx->rx.rec_seq, prot->rec_seq_size);
        tls_bigint_increment(next_sn, prot->rec_seq_size);

        tls_device_resync_rx(tls_ctx, sk, tcp_sk(sk)->copied_seq, next_sn);
}

/* Re-encrypt the device-decrypted parts of the current RX record so that
 * the whole record is ciphertext again.
 *
 * A scratch buffer sized for the full record plus header and IV is
 * allocated, the header and IV are copied into it, and decrypt_skb() is run
 * with the buffer as its scatterlist. The buffer contents are then written
 * back over exactly those parts of the skb (linear/page area and each frag
 * skb) whose ->decrypted flag is set.
 *
 * decrypt_skb() is expected to fail with -EBADMSG, which is treated as
 * success (only the data matters, not the authentication result). Any
 * other result, including 0, ends the function with that value and nothing
 * is written back.
 *
 * Returns 0 on success or a negative errno.
 */
static int
tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx)
{
        struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx);
        const struct tls_cipher_desc *cipher_desc;
        int err, offset, copy, data_len, pos;
        struct sk_buff *skb, *skb_iter;
        struct scatterlist sg[1];
        struct strp_msg *rxm;
        char *orig_buf, *buf;

        cipher_desc = get_cipher_desc(tls_ctx->crypto_recv.info.cipher_type);
        DEBUG_NET_WARN_ON_ONCE(!cipher_desc || !cipher_desc->offloadable);

        rxm = strp_msg(tls_strp_msg(sw_ctx));
        orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv,
                           sk->sk_allocation);
        if (!orig_buf)
                return -ENOMEM;
        /* buf advances as data is written back; orig_buf is kept for kfree */
        buf = orig_buf;

        err = tls_strp_msg_cow(sw_ctx);
        if (unlikely(err))
                goto free_buf;

        /* fetch the message again after tls_strp_msg_cow() */
        skb = tls_strp_msg(sw_ctx);
        rxm = strp_msg(skb);
        offset = rxm->offset;

        sg_init_table(sg, 1);
        sg_set_buf(&sg[0], buf,
                   rxm->full_len + TLS_HEADER_SIZE + cipher_desc->iv);
        /* seed the scratch buffer with the record header and IV */
        err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_desc->iv);
        if (err)
                goto free_buf;

        /* We are interested only in the decrypted data not the auth */
        err = decrypt_skb(sk, sg);
        if (err != -EBADMSG)
                goto free_buf;
        else
                err = 0;

        /* the tag is not written back */
        data_len = rxm->full_len - cipher_desc->tag;

        /* first the part of the record held in the head skb itself */
        if (skb_pagelen(skb) > offset) {
                copy = min_t(int, skb_pagelen(skb) - offset, data_len);

                if (skb->decrypted) {
                        err = skb_store_bits(skb, offset, buf, copy);
                        if (err)
                                goto free_buf;
                }

                offset += copy;
                buf += copy;
        }

        /* pos tracks where each frag skb starts; offset is the next byte
         * of the record still to be handled
         */
        pos = skb_pagelen(skb);
        skb_walk_frags(skb, skb_iter) {
                int frag_pos;

                /* Practically all frags must belong to msg if reencrypt
                 * is needed with current strparser and coalescing logic,
                 * but strparser may "get optimized", so let's be safe.
                 */
                if (pos + skb_iter->len <= offset)
                        goto done_with_frag;
                if (pos >= data_len + rxm->offset)
                        break;

                frag_pos = offset - pos;
                copy = min_t(int, skb_iter->len - frag_pos,
                             data_len + rxm->offset - offset);

                /* only overwrite frags the device actually decrypted */
                if (skb_iter->decrypted) {
                        err = skb_store_bits(skb_iter, frag_pos, buf, copy);
                        if (err)
                                goto free_buf;
                }

                offset += copy;
                buf += copy;
done_with_frag:
                pos += skb_iter->len;
        }

free_buf:
        kfree(orig_buf);
        return err;
}

/* Classify the current RX record by what the device did to it and act on it.
 *
 * A record is fully decrypted, fully encrypted, or mixed. Returns non-zero
 * (skb->decrypted) for a fully decrypted record, 0 for a fully encrypted
 * one, and the result of tls_device_reencrypt() for a mixed one. Outside
 * degraded mode the "next hint" resync state is also updated: decrypted or
 * mixed records set resync_nh_reset, encrypted ones go through
 * tls_device_core_ctrl_rx_resync().
 */
int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx)
{
        struct tls_offload_context_rx *rx_off = tls_offload_ctx_rx(tls_ctx);
        struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx);
        struct sk_buff *skb = tls_strp_msg(sw_ctx);
        struct strp_msg *rxm = strp_msg(skb);
        int is_decrypted = 0;
        int is_encrypted = 0;

        /* both flags stay 0 for a record with mixed decryption state */
        if (!tls_strp_msg_mixed_decrypted(sw_ctx)) {
                is_decrypted = skb->decrypted;
                is_encrypted = !is_decrypted;
        }

        trace_tls_device_decrypted(sk, tcp_sk(sk)->copied_seq - rxm->full_len,
                                   tls_ctx->rx.rec_seq, rxm->full_len,
                                   is_encrypted, is_decrypted);

        if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) {
                /* After tls_device_down disables the offload, the next SKB
                 * will likely have initial fragments decrypted, and final
                 * ones not decrypted. That single SKB must be reencrypted.
                 */
                if (unlikely(!is_encrypted && !is_decrypted))
                        return tls_device_reencrypt(sk, tls_ctx);

                return is_decrypted;
        }

        /* Entirely ciphertext: nothing to fix up, only feed the resync
         * logic. is_encrypted and is_decrypted are never both set.
         */
        if (is_encrypted) {
                tls_device_core_ctrl_rx_resync(tls_ctx, rx_off, sk, skb);
                return 0;
        }

        /* Entirely plaintext or partially decrypted from here on */
        rx_off->resync_nh_reset = 1;
        if (is_decrypted)
                return is_decrypted;

        return tls_device_reencrypt(sk, tls_ctx);
}

/*
 * Bind a TLS context to @netdev and hook the socket destructor.
 *
 * Does nothing when the socket destructor is already
 * tls_device_sk_destruct. Otherwise the context refcount is set to 1, a
 * reference on @netdev is taken, the context is put on tls_device_list and
 * the previous destructor is saved in ctx->sk_destruct before being replaced.
 */
static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
                              struct net_device *netdev)
{
        if (sk->sk_destruct == tls_device_sk_destruct)
                return;

        refcount_set(&ctx->refcount, 1);
        dev_hold(netdev);
        RCU_INIT_POINTER(ctx->netdev, netdev);

        spin_lock_irq(&tls_device_lock);
        list_add_tail(&ctx->list, &tls_device_list);
        spin_unlock_irq(&tls_device_lock);

        /* Save the old destructor first, then publish the new one. */
        ctx->sk_destruct = sk->sk_destruct;
        smp_store_release(&sk->sk_destruct, tls_device_sk_destruct);
}

/*
 * Allocate and initialise a zeroed TX offload context for @ctx.
 *
 * Returns the new context, or NULL if the allocation fails.
 */
static struct tls_offload_context_tx *alloc_offload_ctx_tx(struct tls_context *ctx)
{
        struct tls_offload_context_tx *tx_ctx;
        __be64 seq_be;

        tx_ctx = kzalloc(sizeof(*tx_ctx), GFP_KERNEL);
        if (!tx_ctx)
                return NULL;

        tx_ctx->ctx = ctx;

        spin_lock_init(&tx_ctx->lock);
        INIT_LIST_HEAD(&tx_ctx->records_list);
        INIT_WORK(&tx_ctx->destruct_work, tls_device_tx_del_task);
        sg_init_table(tx_ctx->sg_tx_data, ARRAY_SIZE(tx_ctx->sg_tx_data));

        /* start at rec_seq - 1 to account for the start marker record */
        memcpy(&seq_be, ctx->tx.rec_seq, sizeof(seq_be));
        tx_ctx->unacked_record_sn = be64_to_cpu(seq_be) - 1;

        return tx_ctx;
}

/*
 * Set up TX device offload for @sk.
 *
 * Returns 0 on success, or:
 *   -EEXIST      if the context already has a TX private context,
 *   -EINVAL      if no netdev is found, the cipher is unknown or not
 *                offloadable, or the device is not IFF_UP,
 *   -EOPNOTSUPP  if the device lacks NETIF_F_HW_TLS_TX or the TLS version
 *                is not TLS_1_2_VERSION,
 *   -ENOMEM      on allocation failure,
 *   or the error returned by init_prot_info(), tls_sw_fallback_init() or
 *   the driver's tls_dev_add().
 *
 * dev_put(netdev) is called on every exit path once the lookup succeeded;
 * tls_device_attach() takes its own reference on the success path.
 */
int tls_set_device_offload(struct sock *sk)
{
        struct tls_record_info *start_marker_record;
        struct tls_offload_context_tx *offload_ctx;
        const struct tls_cipher_desc *cipher_desc;
        struct tls_crypto_info *crypto_info;
        struct tls_prot_info *prot;
        struct net_device *netdev;
        struct tls_context *ctx;
        char *iv, *rec_seq;
        int rc;

        ctx = tls_get_ctx(sk);
        prot = &ctx->prot_info;

        /* A TX private context is already installed. */
        if (ctx->priv_ctx_tx)
                return -EEXIST;

        netdev = get_netdev_for_sock(sk);
        if (!netdev) {
                pr_err_ratelimited("%s: netdev not found\n", __func__);
                return -EINVAL;
        }

        if (!(netdev->features & NETIF_F_HW_TLS_TX)) {
                rc = -EOPNOTSUPP;
                goto release_netdev;
        }

        /* Only TLS 1.2 is accepted here. */
        crypto_info = &ctx->crypto_send.info;
        if (crypto_info->version != TLS_1_2_VERSION) {
                rc = -EOPNOTSUPP;
                goto release_netdev;
        }

        cipher_desc = get_cipher_desc(crypto_info->cipher_type);
        if (!cipher_desc || !cipher_desc->offloadable) {
                rc = -EINVAL;
                goto release_netdev;
        }

        rc = init_prot_info(prot, crypto_info, cipher_desc);
        if (rc)
                goto release_netdev;

        iv = crypto_info_iv(crypto_info, cipher_desc);
        rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc);

        /* The IV is stored after the salt in ctx->tx.iv. */
        memcpy(ctx->tx.iv + cipher_desc->salt, iv, cipher_desc->iv);
        memcpy(ctx->tx.rec_seq, rec_seq, cipher_desc->rec_seq);

        start_marker_record = kmalloc(sizeof(*start_marker_record), GFP_KERNEL);
        if (!start_marker_record) {
                rc = -ENOMEM;
                goto release_netdev;
        }

        offload_ctx = alloc_offload_ctx_tx(ctx);
        if (!offload_ctx) {
                rc = -ENOMEM;
                goto free_marker_record;
        }

        rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
        if (rc)
                goto free_offload_ctx;

        /* Zero-length marker record ending at the current write sequence. */
        start_marker_record->end_seq = tcp_sk(sk)->write_seq;
        start_marker_record->len = 0;
        start_marker_record->num_frags = 0;
        list_add_tail(&start_marker_record->list, &offload_ctx->records_list);

        clean_acked_data_enable(tcp_sk(sk), &tls_tcp_clean_acked);
        ctx->push_pending_record = tls_device_push_pending_record;

        /* TLS offload is greatly simplified if we don't send
         * SKBs where only part of the payload needs to be encrypted.
         * So mark the last skb in the write queue as end of record.
         */
        tcp_write_collapse_fence(sk);

        /* Avoid offloading if the device is down
         * We don't want to offload new flows after
         * the NETDEV_DOWN event
         *
         * device_offload_lock is taken in tls_device's NETDEV_DOWN
         * handler thus protecting from the device going down before
         * ctx was added to tls_device_list.
         */
        down_read(&device_offload_lock);
        if (!(netdev->flags & IFF_UP)) {
                rc = -EINVAL;
                goto release_lock;
        }

        /* Install the private context before calling into the driver; the
         * error path below clears it again.
         */
        ctx->priv_ctx_tx = offload_ctx;
        rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_TX,
                                             &ctx->crypto_send.info,
                                             tcp_sk(sk)->write_seq);
        trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_TX,
                                     tcp_sk(sk)->write_seq, rec_seq, rc);
        if (rc)
                goto release_lock;

        tls_device_attach(ctx, sk, netdev);
        up_read(&device_offload_lock);

        /* following this assignment tls_is_skb_tx_device_offloaded
         * will return true and the context might be accessed
         * by the netdev's xmit function.
         */
        smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
        dev_put(netdev);

        return 0;

        /* Error unwinding: each label undoes the steps taken before the
         * corresponding failure point, in reverse order.
         */
release_lock:
        up_read(&device_offload_lock);
        clean_acked_data_disable(tcp_sk(sk));
        crypto_free_aead(offload_ctx->aead_send);
free_offload_ctx:
        kfree(offload_ctx);
        ctx->priv_ctx_tx = NULL;
free_marker_record:
        kfree(start_marker_record);
release_netdev:
        dev_put(netdev);
        return rc;
}

/*
 * Set up RX device offload for @sk.
 *
 * Returns 0 on success, or:
 *   -EOPNOTSUPP  if the TLS version is not TLS_1_2_VERSION or the device
 *                lacks NETIF_F_HW_TLS_RX,
 *   -EINVAL      if no netdev is found or the device is not IFF_UP,
 *   -ENOMEM      on allocation failure,
 *   or the error returned by tls_set_sw_offload() or the driver's
 *   tls_dev_add().
 */
int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
{
        struct tls12_crypto_info_aes_gcm_128 *info;
        struct tls_offload_context_rx *context;
        struct net_device *netdev;
        int rc = 0;

        if (ctx->crypto_recv.info.version != TLS_1_2_VERSION)
                return -EOPNOTSUPP;

        netdev = get_netdev_for_sock(sk);
        if (!netdev) {
                pr_err_ratelimited("%s: netdev not found\n", __func__);
                return -EINVAL;
        }

        if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
                rc = -EOPNOTSUPP;
                goto release_netdev;
        }

        /* Avoid offloading if the device is down
         * We don't want to offload new flows after
         * the NETDEV_DOWN event
         *
         * device_offload_lock is taken in tls_device's NETDEV_DOWN
         * handler thus protecting from the device going down before
         * ctx was added to tls_device_list.
         */
        down_read(&device_offload_lock);
        if (!(netdev->flags & IFF_UP)) {
                rc = -EINVAL;
                goto release_lock;
        }

        context = kzalloc(sizeof(*context), GFP_KERNEL);
        if (!context) {
                rc = -ENOMEM;
                goto release_lock;
        }
        context->resync_nh_reset = 1;

        /* Install the private context before setting up the software path.
         * NOTE(review): no kfree(context) is visible on the error paths
         * below; presumably tls_set_sw_offload() and
         * tls_sw_free_resources_rx() release ctx->priv_ctx_rx - confirm.
         */
        ctx->priv_ctx_rx = context;
        rc = tls_set_sw_offload(sk, 0, NULL);
        if (rc)
                goto release_ctx;

        rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
                                             &ctx->crypto_recv.info,
                                             tcp_sk(sk)->copied_seq);
        /* The cast is only used to read rec_seq for the tracepoint. */
        info = (void *)&ctx->crypto_recv.info;
        trace_tls_device_offload_set(sk, TLS_OFFLOAD_CTX_DIR_RX,
                                     tcp_sk(sk)->copied_seq, info->rec_seq, rc);
        if (rc)
                goto free_sw_resources;

        tls_device_attach(ctx, sk, netdev);
        up_read(&device_offload_lock);

        dev_put(netdev);

        return 0;

free_sw_resources:
        /* tls_sw_free_resources_rx() is called with device_offload_lock
         * dropped; the lock is retaken so the fall-through to release_lock
         * stays balanced.
         */
        up_read(&device_offload_lock);
        tls_sw_free_resources_rx(sk);
        down_read(&device_offload_lock);
release_ctx:
        ctx->priv_ctx_rx = NULL;
release_lock:
        up_read(&device_offload_lock);
release_netdev:
        dev_put(netdev);
        return rc;
}

/*
 * Tear down RX device offload for @sk.
 *
 * If the context still has a netdev, the driver's tls_dev_del() is called
 * for the RX direction. When TX is not in TLS_HW mode the netdev reference
 * is dropped and the pointer cleared; otherwise TLS_RX_DEV_CLOSED is set in
 * the context flags. The software RX resources are released in all cases.
 */
void tls_device_offload_cleanup_rx(struct sock *sk)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        struct net_device *netdev;

        down_read(&device_offload_lock);
        netdev = rcu_dereference_protected(tls_ctx->netdev,
                                           lockdep_is_held(&device_offload_lock));
        if (netdev) {
                netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
                                                TLS_OFFLOAD_CTX_DIR_RX);

                if (tls_ctx->tx_conf == TLS_HW) {
                        /* TX still uses the device: only mark RX as closed. */
                        set_bit(TLS_RX_DEV_CLOSED, &tls_ctx->flags);
                } else {
                        dev_put(netdev);
                        rcu_assign_pointer(tls_ctx->netdev, NULL);
                }
        }
        up_read(&device_offload_lock);

        tls_sw_release_resources_rx(sk);
}

/*
 * Handle NETDEV_DOWN for @netdev: move every offloaded context bound to
 * this device to the software fallback and release the driver state.
 *
 * Holds device_offload_lock for writing throughout, then flushes
 * destruct_wq. Always returns NOTIFY_DONE.
 */
static int tls_device_down(struct net_device *netdev)
{
        struct tls_context *ctx, *tmp;
        unsigned long flags;
        LIST_HEAD(list);

        /* Request a write lock to block new offload attempts */
        down_write(&device_offload_lock);

        /* Collect the contexts bound to this netdev on a private list,
         * taking a reference on each; contexts whose refcount already
         * dropped to zero are skipped.
         */
        spin_lock_irqsave(&tls_device_lock, flags);
        list_for_each_entry_safe(ctx, tmp, &tls_device_list, list) {
                struct net_device *ctx_netdev =
                        rcu_dereference_protected(ctx->netdev,
                                                  lockdep_is_held(&device_offload_lock));

                if (ctx_netdev != netdev ||
                    !refcount_inc_not_zero(&ctx->refcount))
                        continue;

                list_move(&ctx->list, &list);
        }
        spin_unlock_irqrestore(&tls_device_lock, flags);

        list_for_each_entry_safe(ctx, tmp, &list, list)        {
                /* Stop offloaded TX and switch to the fallback.
                 * tls_is_skb_tx_device_offloaded will return false.
                 */
                WRITE_ONCE(ctx->sk->sk_validate_xmit_skb, tls_validate_xmit_skb_sw);

                /* Stop the RX and TX resync.
                 * tls_dev_resync must not be called after tls_dev_del.
                 */
                rcu_assign_pointer(ctx->netdev, NULL);

                /* Start skipping the RX resync logic completely. */
                set_bit(TLS_RX_DEV_DEGRADED, &ctx->flags);

                /* Sync with inflight packets. After this point:
                 * TX: no non-encrypted packets will be passed to the driver.
                 * RX: resync requests from the driver will be ignored.
                 */
                synchronize_net();

                /* Release the offload context on the driver side. */
                if (ctx->tx_conf == TLS_HW)
                        netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
                                                        TLS_OFFLOAD_CTX_DIR_TX);
                if (ctx->rx_conf == TLS_HW &&
                    !test_bit(TLS_RX_DEV_CLOSED, &ctx->flags))
                        netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
                                                        TLS_OFFLOAD_CTX_DIR_RX);

                /* Drop the netdev reference taken in tls_device_attach(). */
                dev_put(netdev);

                /* Move the context to a separate list for two reasons:
                 * 1. When the context is deallocated, list_del is called.
                 * 2. It's no longer an offloaded context, so we don't want to
                 *    run offload-specific code on this context.
                 */
                spin_lock_irqsave(&tls_device_lock, flags);
                list_move_tail(&ctx->list, &tls_device_down_list);
                spin_unlock_irqrestore(&tls_device_lock, flags);

                /* Device contexts for RX and TX will be freed on sk_destruct
                 * by tls_device_free_ctx. rx_conf and tx_conf stay in TLS_HW.
                 * Now release the ref taken above.
                 */
                if (refcount_dec_and_test(&ctx->refcount)) {
                        /* sk_destruct ran after tls_device_down took a ref, and
                         * it returned early. Complete the destruction here.
                         */
                        list_del(&ctx->list);
                        tls_device_free_ctx(ctx);
                }
        }

        up_write(&device_offload_lock);

        flush_workqueue(destruct_wq);

        return NOTIFY_DONE;
}

/*
 * Netdevice notifier callback for TLS device offload.
 *
 * Devices that have neither tlsdev_ops nor any HW TLS feature bit are
 * ignored. On NETDEV_REGISTER / NETDEV_FEAT_CHANGE the device's offload
 * callbacks are validated (bond masters are exempt); on NETDEV_DOWN all
 * offloaded contexts on the device are torn down.
 *
 * Returns NOTIFY_DONE when the event is accepted or not relevant, and
 * NOTIFY_BAD when the device advertises TLS offload without providing the
 * required callbacks.
 */
static int tls_dev_event(struct notifier_block *this, unsigned long event,
                         void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (!dev->tlsdev_ops &&
            !(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_REGISTER:
        case NETDEV_FEAT_CHANGE:
                if (netif_is_bond_master(dev))
                        return NOTIFY_DONE;

                /* A device can get here with a HW TLS feature bit set but
                 * no tlsdev_ops at all. Validate the ops table before
                 * looking at any individual callback so that such a device
                 * is rejected instead of dereferencing a NULL pointer.
                 */
                if (!dev->tlsdev_ops ||
                    !dev->tlsdev_ops->tls_dev_add ||
                    !dev->tlsdev_ops->tls_dev_del)
                        return NOTIFY_BAD;

                /* RX offload additionally requires the resync callback. */
                if ((dev->features & NETIF_F_HW_TLS_RX) &&
                    !dev->tlsdev_ops->tls_dev_resync)
                        return NOTIFY_BAD;

                return NOTIFY_DONE;
        case NETDEV_DOWN:
                return tls_device_down(dev);
        }
        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to tls_dev_event(). */
static struct notifier_block tls_dev_notifier = {
        .notifier_call        = tls_dev_event,
};

/*
 * Module init for TLS device offload: allocate the dummy page, create the
 * destruct workqueue and register the netdevice notifier.
 *
 * Returns 0 on success, -ENOMEM if either allocation fails, or the error
 * from register_netdevice_notifier(). Everything set up before a failure
 * is released again.
 */
int __init tls_device_init(void)
{
        int err;

        dummy_page = alloc_page(GFP_KERNEL);
        if (!dummy_page)
                return -ENOMEM;

        destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
        if (!destruct_wq) {
                put_page(dummy_page);
                return -ENOMEM;
        }

        err = register_netdevice_notifier(&tls_dev_notifier);
        if (err) {
                /* Unwind in reverse order of setup. */
                destroy_workqueue(destruct_wq);
                put_page(dummy_page);
        }

        return err;
}

/*
 * Module exit for TLS device offload: unregister the netdevice notifier,
 * destroy the destruct workqueue, flush the clean-acked-data state and
 * release the dummy page.
 */
void __exit tls_device_cleanup(void)
{
        unregister_netdevice_notifier(&tls_dev_notifier);
        destroy_workqueue(destruct_wq);
        clean_acked_data_flush();
        put_page(dummy_page);
}




































   35 



   35 






   35 
   35 



   35 
   35 
   35 





   35 





















   35 
   35 


   35 
   35 





























   35 











   35 

























































































































































































































































































































































































































































































































   35 











   34 




































































































































































































































































































































































































   35 










   35 

   35 
























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2002,2003 by Andreas Gruenbacher <a.gruenbacher@computer.org>
 *
 * Fixes from William Schumacher incorporated on 15 March 2001.
 *    (Reported by Charles Bertsch, <CBertsch@microtest.com>).
 */

/*
 *  This file contains generic functions for manipulating
 *  POSIX 1003.1e draft standard 17 ACLs.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/xattr.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/mnt_idmapping.h>
#include <linux/iversion.h>
#include <linux/security.h>
#include <linux/fsnotify.h>
#include <linux/filelock.h>

#include "internal.h"

/*
 * Return the address of the inode's cached-ACL slot for @type:
 * i_acl for ACL_TYPE_ACCESS, i_default_acl for ACL_TYPE_DEFAULT.
 * Any other type is a BUG().
 */
static struct posix_acl **acl_by_type(struct inode *inode, int type)
{
        if (type == ACL_TYPE_ACCESS)
                return &inode->i_acl;
        if (type == ACL_TYPE_DEFAULT)
                return &inode->i_default_acl;
        BUG();
}

/*
 * Look up the cached ACL of @type for @inode.
 *
 * Returns NULL, an "uncached" marker value (see is_uncached_acl()), or a
 * real ACL on which a reference has been taken. If the reference cannot be
 * taken because the refcount already hit zero, the lookup is retried.
 */
struct posix_acl *get_cached_acl(struct inode *inode, int type)
{
        struct posix_acl **slot = acl_by_type(inode, type);
        struct posix_acl *acl;

        rcu_read_lock();
        for (;;) {
                acl = rcu_dereference(*slot);
                if (!acl || is_uncached_acl(acl))
                        break;
                if (refcount_inc_not_zero(&acl->a_refcount))
                        break;

                /* Refcount was zero: leave the RCU section and try again. */
                rcu_read_unlock();
                cpu_relax();
                rcu_read_lock();
        }
        rcu_read_unlock();

        return acl;
}
EXPORT_SYMBOL(get_cached_acl);

/*
 * RCU-walk variant of the cached ACL lookup; no reference is taken here.
 *
 * When the slot holds ACL_DONT_CACHE, the filesystem's ->get_inode_acl()
 * is asked with LOOKUP_RCU; its result replaces the slot value unless it
 * is an error pointer, in which case ACL_DONT_CACHE is returned.
 */
struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type)
{
        struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type));
        struct posix_acl *fetched;

        if (acl != ACL_DONT_CACHE)
                return acl;

        fetched = inode->i_op->get_inode_acl(inode, type, LOOKUP_RCU);

        return IS_ERR(fetched) ? acl : fetched;
}
EXPORT_SYMBOL(get_cached_acl_rcu);

/*
 * Install @acl as the cached ACL of @type for @inode.
 *
 * The cache stores the result of posix_acl_dup(@acl); the previous slot
 * value is released unless it was an "uncached" marker.
 */
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl)
{
        struct posix_acl **slot = acl_by_type(inode, type);
        struct posix_acl *fresh = posix_acl_dup(acl);
        struct posix_acl *prev = xchg(slot, fresh);

        if (is_uncached_acl(prev))
                return;

        posix_acl_release(prev);
}
EXPORT_SYMBOL(set_cached_acl);

/*
 * Reset the cache slot @p to ACL_NOT_CACHED and release whatever real ACL
 * was stored there before.
 */
static void __forget_cached_acl(struct posix_acl **p)
{
        struct posix_acl *prev = xchg(p, ACL_NOT_CACHED);

        if (is_uncached_acl(prev))
                return;

        posix_acl_release(prev);
}

/*
 * Drop the cached ACL of the given @type for @inode, leaving the slot in
 * the ACL_NOT_CACHED state.
 */
void forget_cached_acl(struct inode *inode, int type)
{
        __forget_cached_acl(acl_by_type(inode, type));
}
EXPORT_SYMBOL(forget_cached_acl);

/*
 * Drop both cached ACLs (access and default) of @inode, leaving both
 * slots in the ACL_NOT_CACHED state.
 */
void forget_all_cached_acls(struct inode *inode)
{
        __forget_cached_acl(&inode->i_acl);
        __forget_cached_acl(&inode->i_default_acl);
}
EXPORT_SYMBOL(forget_all_cached_acls);

/*
 * Return the ACL of @type for @inode, consulting the cache first and
 * falling back to the filesystem's ->get_acl() (when @dentry is given and
 * the op exists) or ->get_inode_acl().
 *
 * Returns the ACL, NULL when the inode is not IS_POSIXACL() or the
 * filesystem has neither op (a negative entry is cached in that case), or
 * the error pointer returned by the filesystem.
 */
static struct posix_acl *__get_acl(struct mnt_idmap *idmap,
                                   struct dentry *dentry, struct inode *inode,
                                   int type)
{
        struct posix_acl *sentinel;
        struct posix_acl **p;
        struct posix_acl *acl;

        /*
         * The sentinel is used to detect when another operation like
         * set_cached_acl() or forget_cached_acl() races with get_inode_acl().
         * It is guaranteed that is_uncached_acl(sentinel) is true.
         */

        /* Fast path: anything other than an "uncached" marker is the answer. */
        acl = get_cached_acl(inode, type);
        if (!is_uncached_acl(acl))
                return acl;

        if (!IS_POSIXACL(inode))
                return NULL;

        sentinel = uncached_acl_sentinel(current);
        p = acl_by_type(inode, type);

        /*
         * If the ACL isn't being read yet, set our sentinel.  Otherwise, the
         * current value of the ACL will not be ACL_NOT_CACHED and so our own
         * sentinel will not be set; another task will update the cache.  We
         * could wait for that other task to complete its job, but it's easier
         * to just call ->get_inode_acl to fetch the ACL ourself.  (This is
         * going to be an unlikely race.)
         */
        cmpxchg(p, ACL_NOT_CACHED, sentinel);

        /*
         * Normally, the ACL returned by ->get{_inode}_acl will be cached.
         * A filesystem can prevent that by calling
         * forget_cached_acl(inode, type) in ->get{_inode}_acl.
         *
         * If the filesystem doesn't have a ->get{_inode}_acl() function at
         * all, we'll just create the negative cache entry.
         */
        if (dentry && inode->i_op->get_acl) {
                acl = inode->i_op->get_acl(idmap, dentry, type);
        } else if (inode->i_op->get_inode_acl) {
                acl = inode->i_op->get_inode_acl(inode, type, false);
        } else {
                set_cached_acl(inode, type, NULL);
                return NULL;
        }
        if (IS_ERR(acl)) {
                /*
                 * Remove our sentinel so that we don't block future attempts
                 * to cache the ACL.
                 */
                cmpxchg(p, sentinel, ACL_NOT_CACHED);
                return acl;
        }

        /*
         * Cache the result, but only if our sentinel is still in place.
         * The extra reference taken here belongs to the cache; it is dropped
         * again if the slot no longer holds our sentinel.
         */
        posix_acl_dup(acl);
        if (unlikely(!try_cmpxchg(p, &sentinel, acl)))
                posix_acl_release(acl);
        return acl;
}

/*
 * Return the ACL of @type for @inode via __get_acl(), using nop_mnt_idmap
 * and no dentry, so only the ->get_inode_acl() path of the filesystem can
 * be taken.
 */
struct posix_acl *get_inode_acl(struct inode *inode, int type)
{
        return __get_acl(&nop_mnt_idmap, NULL, inode, type);
}
EXPORT_SYMBOL(get_inode_acl);

/*
 * Initialise a freshly allocated posix_acl: record the number of entries
 * and set the reference count to 1.
 */
void
posix_acl_init(struct posix_acl *acl, int count)
{
        acl->a_count = count;
        refcount_set(&acl->a_refcount, 1);
}
EXPORT_SYMBOL(posix_acl_init);

/*
 * Allocate a new ACL with room for @count entries using @flags.
 * Returns the initialised ACL, or NULL if the allocation fails.
 */
struct posix_acl *
posix_acl_alloc(unsigned int count, gfp_t flags)
{
        struct posix_acl *acl;

        acl = kmalloc(struct_size(acl, a_entries, count), flags);
        if (!acl)
                return NULL;

        posix_acl_init(acl, count);
        return acl;
}
EXPORT_SYMBOL(posix_acl_alloc);

/*
 * Duplicate @acl, including all of its entries, into a new allocation
 * whose reference count is 1.
 * Returns NULL when @acl is NULL or the allocation fails.
 */
struct posix_acl *
posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
{
        struct posix_acl *copy;

        if (!acl)
                return NULL;

        copy = kmemdup(acl, struct_size(acl, a_entries, acl->a_count), flags);
        if (copy)
                refcount_set(&copy->a_refcount, 1);

        return copy;
}
EXPORT_SYMBOL_GPL(posix_acl_clone);

/*
 * Validate an ACL. Returns 0 if it is valid, -EINVAL otherwise.
 *
 * Entries must carry only read/write/execute permission bits and must
 * appear in the order ACL_USER_OBJ, ACL_USER*, ACL_GROUP_OBJ, ACL_GROUP*,
 * optional ACL_MASK, ACL_OTHER. ACL_MASK is mandatory once any ACL_USER or
 * ACL_GROUP entry is present, and every such entry's id must have a
 * mapping in @user_ns.
 */
int
posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
        const struct posix_acl_entry *pa, *pe;
        /* Tag class the walk currently expects; 0 once ACL_OTHER was seen. */
        int phase = ACL_USER_OBJ;
        int mask_required = 0;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                if (pa->e_perm & ~(ACL_READ|ACL_WRITE|ACL_EXECUTE))
                        return -EINVAL;

                switch (pa->e_tag) {
                case ACL_USER_OBJ:
                        if (phase != ACL_USER_OBJ)
                                return -EINVAL;
                        phase = ACL_USER;
                        break;

                case ACL_USER:
                        if (phase != ACL_USER)
                                return -EINVAL;
                        if (!kuid_has_mapping(user_ns, pa->e_uid))
                                return -EINVAL;
                        mask_required = 1;
                        break;

                case ACL_GROUP_OBJ:
                        if (phase != ACL_USER)
                                return -EINVAL;
                        phase = ACL_GROUP;
                        break;

                case ACL_GROUP:
                        if (phase != ACL_GROUP)
                                return -EINVAL;
                        if (!kgid_has_mapping(user_ns, pa->e_gid))
                                return -EINVAL;
                        mask_required = 1;
                        break;

                case ACL_MASK:
                        if (phase != ACL_GROUP)
                                return -EINVAL;
                        phase = ACL_OTHER;
                        break;

                case ACL_OTHER:
                        /* Allowed right after the mask, or straight after
                         * the group section when no mask is needed.
                         */
                        if (phase != ACL_OTHER &&
                            (phase != ACL_GROUP || mask_required))
                                return -EINVAL;
                        phase = 0;
                        break;

                default:
                        return -EINVAL;
                }
        }

        return phase == 0 ? 0 : -EINVAL;
}
EXPORT_SYMBOL(posix_acl_valid);

/*
 * Decide whether @acl can be expressed purely as file mode permission bits.
 *
 * Returns 0 if it can (a NULL acl always can), 1 if it cannot, and -EINVAL
 * if an entry has an unknown tag. When @mode_p is non-NULL and no error
 * occurred, the rwx bits of *@mode_p are replaced by the ones derived from
 * the ACL; the other bits of *@mode_p are kept.
 */
int
posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
{
        const struct posix_acl_entry *pa, *pe;
        umode_t bits = 0;
        int extended = 0;

        /* A null ACL can always be presented as mode bits. */
        if (!acl)
                return 0;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                umode_t rwx = pa->e_perm & S_IRWXO;

                switch (pa->e_tag) {
                case ACL_USER_OBJ:
                        bits |= rwx << 6;
                        break;
                case ACL_GROUP_OBJ:
                        bits |= rwx << 3;
                        break;
                case ACL_OTHER:
                        bits |= rwx;
                        break;
                case ACL_MASK:
                        /* The mask overrides the group bits gathered so far. */
                        bits = (bits & ~S_IRWXG) | (rwx << 3);
                        extended = 1;
                        break;
                case ACL_USER:
                case ACL_GROUP:
                        extended = 1;
                        break;
                default:
                        return -EINVAL;
                }
        }

        if (mode_p)
                *mode_p = (*mode_p & ~S_IRWXUGO) | bits;

        return extended;
}
EXPORT_SYMBOL(posix_acl_equiv_mode);

/*
 * Build a three-entry ACL (ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_OTHER) that
 * mirrors the permission bits of @mode.
 * Returns the new ACL, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
struct posix_acl *
posix_acl_from_mode(umode_t mode, gfp_t flags)
{
        struct posix_acl_entry *entry;
        struct posix_acl *acl;

        acl = posix_acl_alloc(3, flags);
        if (!acl)
                return ERR_PTR(-ENOMEM);

        entry = acl->a_entries;

        entry[0].e_tag  = ACL_USER_OBJ;
        entry[0].e_perm = (mode & S_IRWXU) >> 6;

        entry[1].e_tag  = ACL_GROUP_OBJ;
        entry[1].e_perm = (mode & S_IRWXG) >> 3;

        entry[2].e_tag  = ACL_OTHER;
        entry[2].e_perm = (mode & S_IRWXO);

        return acl;
}
EXPORT_SYMBOL(posix_acl_from_mode);

/*
 * Check whether the ACL grants the current task the requested access.
 *
 * @idmap: idmap used to map the inode's owner/group and the ids stored in
 *         ACL_USER / ACL_GROUP entries
 * @inode: inode the ACL belongs to
 * @acl:   ACL to evaluate
 * @want:  requested access; only MAY_READ, MAY_WRITE and MAY_EXEC are
 *         considered
 *
 * Entries are visited in the order in which they are stored in @acl.
 *
 * Return 0 if current is granted want access to the inode
 * by the acl. Returns -EACCES if the deciding entry (or the mask) does not
 * grant it, and -EIO if an unknown tag is met or the entries run out
 * before an ACL_OTHER entry has been reached.
 */
int
posix_acl_permission(struct mnt_idmap *idmap, struct inode *inode,
                     const struct posix_acl *acl, int want)
{
        const struct posix_acl_entry *pa, *pe, *mask_obj;
        struct user_namespace *fs_userns = i_user_ns(inode);
        /* Set once the caller is a member of any group named by an entry. */
        int found = 0;
        vfsuid_t vfsuid;
        vfsgid_t vfsgid;

        /* Ignore every request bit other than read/write/exec. */
        want &= MAY_READ | MAY_WRITE | MAY_EXEC;

        FOREACH_ACL_ENTRY(pa, acl, pe) {
                switch(pa->e_tag) {
                        case ACL_USER_OBJ:
                                /* (May have been checked already) */
                                /* Owner match: decided by this entry alone, no mask. */
                                vfsuid = i_uid_into_vfsuid(idmap, inode);
                                if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                                        goto check_perm;
                                break;
                        case ACL_USER:
                                /* Named-user match: this entry, limited by the mask. */
                                vfsuid = make_vfsuid(idmap, fs_userns,
                                                     pa->e_uid);
                                if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                                        goto mask;
                                break;
                        case ACL_GROUP_OBJ:
                                /*
                                 * Group membership only ends the scan when this
                                 * entry grants everything wanted; otherwise keep
                                 * looking at later entries.
                                 */
                                vfsgid = i_gid_into_vfsgid(idmap, inode);
                                if (vfsgid_in_group_p(vfsgid)) {
                                        found = 1;
                                        if ((pa->e_perm & want) == want)
                                                goto mask;
                                }
                                break;
                        case ACL_GROUP:
                                /* Same rule as ACL_GROUP_OBJ, for a named group. */
                                vfsgid = make_vfsgid(idmap, fs_userns,
                                                     pa->e_gid);
                                if (vfsgid_in_group_p(vfsgid)) {
                                        found = 1;
                                        if ((pa->e_perm & want) == want)
                                                goto mask;
                                }
                                break;
                        case ACL_MASK:
                                /* Skipped here; only consulted from "mask:" below. */
                                break;
                        case ACL_OTHER:
                                /*
                                 * A group entry matched but none of them granted
                                 * the access: deny. Otherwise this entry decides.
                                 */
                                if (found)
                                        return -EACCES;
                                else
                                        goto check_perm;
                        default:
                                return -EIO;
                }
        }
        /* Ran off the end without reaching an ACL_OTHER entry. */
        return -EIO;

mask:
        /* Search the entries after the matched one for an ACL_MASK entry. */
        for (mask_obj = pa+1; mask_obj != pe; mask_obj++) {
                if (mask_obj->e_tag == ACL_MASK) {
                        if ((pa->e_perm & mask_obj->e_perm & want) == want)
                                return 0;
                        return -EACCES;
                }
        }
        /* No mask entry follows: fall through and use pa's own permissions. */

check_perm:
        if ((pa->e_perm & want) == want)
                return 0;
        return -EACCES;
}

/*
 * Reconcile an inherited ACL with the mode requested for a new inode.
 *
 * The caller must hold the only reference to @acl, because its entries are
 * modified in place.
 *
 * On entry *@mode_p holds the mode passed to open() / creat(). Permission
 * bits the ACL does not grant are cleared from the mode, and the ACL's
 * ACL_USER_OBJ, ACL_OTHER and group-class (ACL_MASK if present, otherwise
 * ACL_GROUP_OBJ) permissions are reduced to what the mode allows.
 *
 * Returns 1 if the ACL contains ACL_USER, ACL_GROUP or ACL_MASK entries,
 * 0 if it does not, and -EIO for an unknown tag or when neither an ACL_MASK
 * nor an ACL_GROUP_OBJ entry exists.
 */
static int posix_acl_create_masq(struct posix_acl *acl, umode_t *mode_p)
{
        struct posix_acl_entry *group_entry = NULL, *mask_entry = NULL;
        struct posix_acl_entry *entry, *end;
        struct posix_acl_entry *group_class;
        umode_t mode = *mode_p;
        int extended = 0;

        FOREACH_ACL_ENTRY(entry, acl, end) {
                switch (entry->e_tag) {
                case ACL_USER_OBJ:
                        entry->e_perm &= (mode >> 6) | ~S_IRWXO;
                        mode &= (entry->e_perm << 6) | ~S_IRWXU;
                        break;
                case ACL_OTHER:
                        entry->e_perm &= mode | ~S_IRWXO;
                        mode &= entry->e_perm | ~S_IRWXO;
                        break;
                case ACL_GROUP_OBJ:
                        group_entry = entry;
                        break;
                case ACL_MASK:
                        mask_entry = entry;
                        extended = 1;
                        break;
                case ACL_USER:
                case ACL_GROUP:
                        extended = 1;
                        break;
                default:
                        return -EIO;
                }
        }

        /* The mask, when present, stands in for the group class bits. */
        group_class = mask_entry ? mask_entry : group_entry;
        if (!group_class)
                return -EIO;

        group_class->e_perm &= (mode >> 3) | ~S_IRWXO;
        mode &= (group_class->e_perm << 3) | ~S_IRWXG;

        *mode_p = (*mode_p & ~S_IRWXUGO) | mode;
        return extended;
}

/*
 * Rewrite an ACL in place so that it reflects a chmod() to @mode.
 *
 * ACL_USER_OBJ and ACL_OTHER take the owner and other bits of @mode. The
 * group bits go to the ACL_MASK entry when one exists, otherwise to the
 * ACL_GROUP_OBJ entry. ACL_USER and ACL_GROUP entries are left alone.
 *
 * Returns 0 on success, or -EIO for an unknown tag or when neither an
 * ACL_MASK nor an ACL_GROUP_OBJ entry exists.
 */
static int __posix_acl_chmod_masq(struct posix_acl *acl, umode_t mode)
{
        struct posix_acl_entry *group_entry = NULL, *mask_entry = NULL;
        struct posix_acl_entry *entry, *end;
        struct posix_acl_entry *group_class;

        FOREACH_ACL_ENTRY(entry, acl, end) {
                switch (entry->e_tag) {
                case ACL_USER_OBJ:
                        entry->e_perm = (mode & S_IRWXU) >> 6;
                        break;
                case ACL_OTHER:
                        entry->e_perm = (mode & S_IRWXO);
                        break;
                case ACL_GROUP_OBJ:
                        group_entry = entry;
                        break;
                case ACL_MASK:
                        mask_entry = entry;
                        break;
                case ACL_USER:
                case ACL_GROUP:
                        break;
                default:
                        return -EIO;
                }
        }

        group_class = mask_entry ? mask_entry : group_entry;
        if (!group_class)
                return -EIO;

        group_class->e_perm = (mode & S_IRWXG) >> 3;
        return 0;
}

/*
 * Swap *@acl for a private copy that has been adjusted for inode creation
 * by posix_acl_create_masq().
 *
 * The reference on the incoming ACL is always dropped. On failure *@acl is
 * set to NULL. Returns the non-negative result of posix_acl_create_masq(),
 * its negative error, or -ENOMEM when the copy could not be allocated.
 */
int
__posix_acl_create(struct posix_acl **acl, gfp_t gfp, umode_t *mode_p)
{
        struct posix_acl *copy;
        int ret;

        copy = posix_acl_clone(*acl, gfp);
        if (!copy) {
                ret = -ENOMEM;
        } else {
                ret = posix_acl_create_masq(copy, mode_p);
                if (ret < 0) {
                        posix_acl_release(copy);
                        copy = NULL;
                }
        }

        posix_acl_release(*acl);
        *acl = copy;
        return ret;
}
EXPORT_SYMBOL(__posix_acl_create);

/*
 * Swap *@acl for a private copy that has been rewritten for a chmod() to
 * @mode by __posix_acl_chmod_masq().
 *
 * The reference on the incoming ACL is always dropped. On failure *@acl is
 * set to NULL. Returns 0 on success, the error from
 * __posix_acl_chmod_masq(), or -ENOMEM when the copy could not be allocated.
 */
int
__posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
{
        struct posix_acl *copy;
        int ret;

        copy = posix_acl_clone(*acl, gfp);
        if (!copy) {
                ret = -ENOMEM;
        } else {
                ret = __posix_acl_chmod_masq(copy, mode);
                if (ret) {
                        posix_acl_release(copy);
                        copy = NULL;
                }
        }

        posix_acl_release(*acl);
        *acl = copy;
        return ret;
}
EXPORT_SYMBOL(__posix_acl_chmod);

/**
 * posix_acl_chmod - bring the access ACL in line with a new file mode
 *
 * @idmap:        idmap of the mount @inode was found from
 * @dentry:        dentry whose inode is being chmod'ed
 * @mode:        the new mode of @inode
 *
 * Nothing is done (and 0 is returned) when the inode does not use POSIX
 * ACLs. Otherwise the access ACL is fetched, a modified copy is built with
 * __posix_acl_chmod() and stored through the inode's ->set_acl() operation.
 * An -EOPNOTSUPP lookup result is treated as success; any other NULL or
 * error lookup result is returned through PTR_ERR().
 *
 * If the dentry has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success, -EOPNOTSUPP if the inode has no ->set_acl()
 * operation, or another negative errno.
 */
int
posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry,
                umode_t mode)
{
        struct inode *inode = d_inode(dentry);
        struct posix_acl *acl;
        int err;

        if (!IS_POSIXACL(inode))
                return 0;
        if (!inode->i_op->set_acl)
                return -EOPNOTSUPP;

        acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
        if (IS_ERR_OR_NULL(acl))
                return acl == ERR_PTR(-EOPNOTSUPP) ? 0 : PTR_ERR(acl);

        /* On failure __posix_acl_chmod() has already dropped the ACL. */
        err = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
        if (!err) {
                err = inode->i_op->set_acl(idmap, dentry, acl,
                                           ACL_TYPE_ACCESS);
                posix_acl_release(acl);
        }

        return err;
}
EXPORT_SYMBOL(posix_acl_chmod);

/*
 * Compute the ACLs a new inode inherits from its parent directory @dir.
 *
 * *@acl and *@default_acl start out NULL. For symlinks, and when @dir does
 * not use POSIX ACLs, nothing else happens. If @dir has no default ACL (or
 * the lookup reports -EOPNOTSUPP) the umask is applied to *@mode instead.
 *
 * Otherwise a copy of the default ACL is masked against *@mode by
 * posix_acl_create_masq(); it is handed back in *@acl only when it carries
 * more than the mode bits can express. The default ACL itself is handed
 * back in *@default_acl only when the new inode is a directory. References
 * that are not handed back are dropped here.
 *
 * Returns 0 on success or a negative errno.
 */
int
posix_acl_create(struct inode *dir, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        struct posix_acl *inherited;
        struct posix_acl *access;
        int rc;

        *acl = NULL;
        *default_acl = NULL;

        if (S_ISLNK(*mode) || !IS_POSIXACL(dir))
                return 0;

        inherited = get_inode_acl(dir, ACL_TYPE_DEFAULT);
        if (!inherited || inherited == ERR_PTR(-EOPNOTSUPP)) {
                *mode &= ~current_umask();
                return 0;
        }
        if (IS_ERR(inherited))
                return PTR_ERR(inherited);

        access = posix_acl_clone(inherited, GFP_NOFS);
        if (!access) {
                rc = -ENOMEM;
                goto out_put_inherited;
        }

        rc = posix_acl_create_masq(access, mode);
        if (rc < 0)
                goto out_put_access;

        /* A copy that adds nothing to the mode bits is not worth keeping. */
        if (rc)
                *acl = access;
        else
                posix_acl_release(access);

        /* Only directories pass the default ACL on. */
        if (S_ISDIR(*mode))
                *default_acl = inherited;
        else
                posix_acl_release(inherited);

        return 0;

out_put_access:
        posix_acl_release(access);
out_put_inherited:
        posix_acl_release(inherited);
        return rc;
}
EXPORT_SYMBOL_GPL(posix_acl_create);

/**
 * posix_acl_update_mode  -  derive the file mode that goes with a new ACL
 * @idmap:        idmap of the mount @inode was found from
 * @inode:        target inode
 * @mode_p:        where the resulting mode is stored
 * @acl:        in/out pointer to the ACL being set
 *
 * Starting from the inode's current mode, replace the permission bits with
 * the ones implied by *@acl. When the ACL can be expressed entirely by those
 * bits, *@acl is set to NULL to tell the caller that no ACL needs storing.
 *
 * As with chmod, clear the setgid bit if the caller is not in the owning group
 * or capable of CAP_FSETID (see inode_change_ok).
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before checking
 * permissions. On non-idmapped mounts or if permission checking is to be
 * performed on the raw inode simply pass @nop_mnt_idmap.
 *
 * Called from set_acl inode operations.
 *
 * Return: 0 on success, or the negative errno from posix_acl_equiv_mode(),
 * in which case neither *@mode_p nor *@acl is modified.
 */
int posix_acl_update_mode(struct mnt_idmap *idmap,
                          struct inode *inode, umode_t *mode_p,
                          struct posix_acl **acl)
{
        /* Work on a copy: @mode_p may point at inode->i_mode itself. */
        umode_t new_mode = inode->i_mode;
        int equiv;

        equiv = posix_acl_equiv_mode(*acl, &new_mode);
        if (equiv < 0)
                return equiv;
        if (!equiv)
                *acl = NULL;

        if (!in_group_or_capable(idmap, inode,
                                 i_gid_into_vfsgid(idmap, inode)))
                new_mode &= ~S_ISGID;

        *mode_p = new_mode;
        return 0;
}
EXPORT_SYMBOL(posix_acl_update_mode);

/*
 * Validate the header of a POSIX ACL xattr blob and report how many ACL
 * entries follow it. Nothing is modified; despite the historical name no
 * ids are fixed up here.
 *
 * @value: start of the xattr blob
 * @size:  length of the blob in bytes
 *
 * Returns the entry count (0 when posix_acl_xattr_count() reports no
 * entries), -EINVAL if @value is NULL, @size is smaller than the header or
 * posix_acl_xattr_count() rejects @size, and -EOPNOTSUPP if the header
 * version is not POSIX_ACL_XATTR_VERSION.
 */
static int posix_acl_fix_xattr_common(const void *value, size_t size)
{
        const struct posix_acl_xattr_header *header = value;
        int count;

        if (!header)
                return -EINVAL;
        if (size < sizeof(struct posix_acl_xattr_header))
                return -EINVAL;
        if (header->a_version != cpu_to_le32(POSIX_ACL_XATTR_VERSION))
                return -EOPNOTSUPP;

        /* Any negative count is reported uniformly as -EINVAL. */
        count = posix_acl_xattr_count(size);
        if (count < 0)
                return -EINVAL;

        return count;
}

/**
 * posix_acl_from_xattr - convert POSIX ACLs from backing store to VFS format
 * @userns: the filesystem's idmapping
 * @value: the uapi representation of POSIX ACLs
 * @size: the size of @value
 *
 * Filesystems that store POSIX ACLs in the unaltered uapi format should use
 * posix_acl_from_xattr() when reading them from the backing store and
 * converting them into the struct posix_acl VFS format. The helper is
 * specifically intended to be called from the acl inode operation.
 *
 * The posix_acl_from_xattr() function will map the raw {g,u}id values stored
 * in ACL_{GROUP,USER} entries into the idmapping in @userns.
 *
 * Note that posix_acl_from_xattr() does not take idmapped mounts into account.
 * If it did, calling it from the get acl inode operation would return POSIX
 * ACLs mapped according to an idmapped mount, which would mean that the value
 * couldn't be cached for the filesystem. Idmapped mounts are taken into
 * account on the fly during permission checking or right at the VFS -
 * userspace boundary before reporting them to the user.
 *
 * Return: Allocated struct posix_acl on success, NULL for a valid header but
 *         without actual POSIX ACL entries, or ERR_PTR() encoded error code.
 *         An unknown tag, or a {g,u}id that has no mapping in @userns, yields
 *         ERR_PTR(-EINVAL).
 */
struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
                                       const void *value, size_t size)
{
        const struct posix_acl_xattr_header *header = value;
        const struct posix_acl_xattr_entry *entry, *end;
        int count;
        struct posix_acl *acl;
        struct posix_acl_entry *acl_e;

        count = posix_acl_fix_xattr_common(value, size);
        if (count < 0)
                return ERR_PTR(count);
        if (count == 0)
                return NULL;

        acl = posix_acl_alloc(count, GFP_NOFS);
        if (!acl)
                return ERR_PTR(-ENOMEM);
        acl_e = acl->a_entries;

        /*
         * Only step past the header once @value has been validated, so that
         * no pointer arithmetic is done on a NULL @value.
         */
        entry = (const void *)(header + 1);

        for (end = entry + count; entry != end; acl_e++, entry++) {
                acl_e->e_tag  = le16_to_cpu(entry->e_tag);
                acl_e->e_perm = le16_to_cpu(entry->e_perm);

                switch(acl_e->e_tag) {
                        case ACL_USER_OBJ:
                        case ACL_GROUP_OBJ:
                        case ACL_MASK:
                        case ACL_OTHER:
                                /* These tags carry no id. */
                                break;

                        case ACL_USER:
                                acl_e->e_uid = make_kuid(userns,
                                                le32_to_cpu(entry->e_id));
                                if (!uid_valid(acl_e->e_uid))
                                        goto fail;
                                break;
                        case ACL_GROUP:
                                acl_e->e_gid = make_kgid(userns,
                                                le32_to_cpu(entry->e_id));
                                if (!gid_valid(acl_e->e_gid))
                                        goto fail;
                                break;

                        default:
                                goto fail;
                }
        }
        return acl;

fail:
        posix_acl_release(acl);
        return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL (posix_acl_from_xattr);

/*
 * Serialize an in-memory ACL into the extended attribute layout.
 *
 * The ids of ACL_USER and ACL_GROUP entries are translated with
 * from_kuid() / from_kgid() using @user_ns; every other entry gets
 * ACL_UNDEFINED_ID as its id.
 *
 * Returns the number of bytes the xattr occupies. With a NULL @buffer only
 * that size is computed. Returns -ERANGE when @size is too small.
 */
int
posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
                   void *buffer, size_t size)
{
        struct posix_acl_xattr_header *hdr = buffer;
        struct posix_acl_xattr_entry *out;
        int needed, i;

        needed = posix_acl_xattr_size(acl->a_count);
        if (!buffer)
                return needed;
        if (needed > size)
                return -ERANGE;

        hdr->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
        /* The entries start immediately after the header. */
        out = (void *)(hdr + 1);

        for (i = 0; i < acl->a_count; i++) {
                const struct posix_acl_entry *in = &acl->a_entries[i];

                out[i].e_tag = cpu_to_le16(in->e_tag);
                out[i].e_perm = cpu_to_le16(in->e_perm);

                if (in->e_tag == ACL_USER)
                        out[i].e_id =
                                cpu_to_le32(from_kuid(user_ns, in->e_uid));
                else if (in->e_tag == ACL_GROUP)
                        out[i].e_id =
                                cpu_to_le32(from_kgid(user_ns, in->e_gid));
                else
                        out[i].e_id = cpu_to_le32(ACL_UNDEFINED_ID);
        }

        return needed;
}
EXPORT_SYMBOL (posix_acl_to_xattr);

/**
 * vfs_posix_acl_to_xattr - convert from kernel to userspace representation
 * @idmap: idmap of the mount
 * @inode: inode the posix acls are set on
 * @acl: the posix acls as represented by the vfs
 * @buffer: the buffer into which to convert @acl
 * @size: size of @buffer
 *
 * Serializes @acl into the uapi xattr layout. The ids of ACL_USER and
 * ACL_GROUP entries are first mapped through @idmap relative to the
 * filesystem's user namespace and then expressed in the caller's user
 * namespace; every other entry gets ACL_UNDEFINED_ID as its id.
 *
 * With a NULL @buffer only the required size is computed.
 *
 * Return: On success, the size of the stored uapi posix acls, on error a
 * negative errno (-ERANGE when @size is too small).
 */
static ssize_t vfs_posix_acl_to_xattr(struct mnt_idmap *idmap,
                                      struct inode *inode,
                                      const struct posix_acl *acl, void *buffer,
                                      size_t size)
{
        struct posix_acl_xattr_header *hdr = buffer;
        struct posix_acl_xattr_entry *out;
        struct user_namespace *fs_userns, *caller_userns;
        ssize_t needed, i;
        vfsuid_t vfsuid;
        vfsgid_t vfsgid;

        needed = posix_acl_xattr_size(acl->a_count);
        if (!buffer)
                return needed;
        if (needed > size)
                return -ERANGE;

        hdr->a_version = cpu_to_le32(POSIX_ACL_XATTR_VERSION);
        /* The entries start immediately after the header. */
        out = (void *)(hdr + 1);

        fs_userns = i_user_ns(inode);
        caller_userns = current_user_ns();

        for (i = 0; i < acl->a_count; i++) {
                const struct posix_acl_entry *in = &acl->a_entries[i];

                out[i].e_tag = cpu_to_le16(in->e_tag);
                out[i].e_perm = cpu_to_le16(in->e_perm);

                if (in->e_tag == ACL_USER) {
                        vfsuid = make_vfsuid(idmap, fs_userns, in->e_uid);
                        out[i].e_id = cpu_to_le32(from_kuid(
                                caller_userns, vfsuid_into_kuid(vfsuid)));
                } else if (in->e_tag == ACL_GROUP) {
                        vfsgid = make_vfsgid(idmap, fs_userns, in->e_gid);
                        out[i].e_id = cpu_to_le32(from_kgid(
                                caller_userns, vfsgid_into_kgid(vfsgid)));
                } else {
                        out[i].e_id = cpu_to_le32(ACL_UNDEFINED_ID);
                }
        }

        return needed;
}

/*
 * Validate a request to set (or, with a NULL @acl, remove) an ACL of the
 * given @type and pass it to the inode's ->set_acl() operation.
 *
 * Returns -EOPNOTSUPP when the inode does not use POSIX ACLs or has no
 * ->set_acl() operation, and -EPERM when inode_owner_or_capable() refuses
 * the caller. A default ACL on a non-directory is rejected with -EACCES,
 * while removing one there succeeds without doing anything. A non-NULL
 * @acl must pass posix_acl_valid() against the superblock's user namespace.
 * Otherwise the result of ->set_acl() is returned.
 */
int
set_posix_acl(struct mnt_idmap *idmap, struct dentry *dentry,
              int type, struct posix_acl *acl)
{
        struct inode *inode = d_inode(dentry);
        int err;

        if (!IS_POSIXACL(inode) || !inode->i_op->set_acl)
                return -EOPNOTSUPP;

        if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
                if (acl)
                        return -EACCES;
                return 0;
        }

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        if (acl) {
                err = posix_acl_valid(inode->i_sb->s_user_ns, acl);
                if (err)
                        return err;
        }

        return inode->i_op->set_acl(idmap, dentry, acl, type);
}
EXPORT_SYMBOL(set_posix_acl);

/*
 * List the POSIX ACL xattr names that apply to @inode.
 *
 * For each of inode->i_acl and inode->i_default_acl that is non-NULL, the
 * matching xattr name is passed to xattr_list_one() together with @buffer
 * and @remaining_size. Nothing is listed when the inode does not use POSIX
 * ACLs.
 *
 * Returns 0 on success, or the first non-zero result of xattr_list_one().
 */
int posix_acl_listxattr(struct inode *inode, char **buffer,
                        ssize_t *remaining_size)
{
        int ret = 0;

        if (!IS_POSIXACL(inode))
                return 0;

        if (inode->i_acl)
                ret = xattr_list_one(buffer, remaining_size,
                                     XATTR_NAME_POSIX_ACL_ACCESS);

        /* Skip the default ACL if listing the access ACL already failed. */
        if (!ret && inode->i_default_acl)
                ret = xattr_list_one(buffer, remaining_size,
                                     XATTR_NAME_POSIX_ACL_DEFAULT);

        return ret;
}

/*
 * ->list() callback shared by the legacy POSIX ACL xattr handlers: report
 * whether the dentry's backing inode uses POSIX ACLs.
 */
static bool
posix_acl_xattr_list(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        return IS_POSIXACL(inode);
}

/*
 * nop_posix_acl_access - legacy xattr handler for access POSIX ACLs
 *
 * Legacy handler for the access ACL xattr name. It provides only a name and
 * a ->list() callback. Some filesystems use it to implement their
 * ->listxattr() inode operation; new code should never use it.
 */
const struct xattr_handler nop_posix_acl_access = {
        .list = posix_acl_xattr_list,
        .name = XATTR_NAME_POSIX_ACL_ACCESS,
};
EXPORT_SYMBOL_GPL(nop_posix_acl_access);

/*
 * nop_posix_acl_default - legacy xattr handler for default POSIX ACLs
 *
 * Legacy handler for the default ACL xattr name. It provides only a name
 * and a ->list() callback. Some filesystems use it to implement their
 * ->listxattr() inode operation; new code should never use it.
 */
const struct xattr_handler nop_posix_acl_default = {
        .list = posix_acl_xattr_list,
        .name = XATTR_NAME_POSIX_ACL_DEFAULT,
};
EXPORT_SYMBOL_GPL(nop_posix_acl_default);

/*
 * ->set_acl() implementation that keeps the ACL only in the inode's ACL
 * cache.
 *
 * For an access ACL the inode mode is first updated through
 * posix_acl_update_mode(), which may also reset @acl to NULL. The ctime is
 * then set to the current time, i_version is bumped when the inode tracks
 * it, and the ACL is stored with set_cached_acl().
 *
 * Returns 0 on success or the error from posix_acl_update_mode().
 */
int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   struct posix_acl *acl, int type)
{
        struct inode *inode = d_inode(dentry);

        if (type == ACL_TYPE_ACCESS) {
                int err = posix_acl_update_mode(idmap, inode,
                                                &inode->i_mode, &acl);

                if (err)
                        return err;
        }

        inode_set_ctime_current(inode);
        if (IS_I_VERSION(inode))
                inode_inc_iversion(inode);

        set_cached_acl(inode, type, acl);
        return 0;
}

/*
 * Give a newly created @inode the ACLs it inherits from its parent @dir.
 *
 * posix_acl_create() computes the default and access ACLs (adjusting
 * inode->i_mode as it goes); both are then stored with set_cached_acl() and
 * the references obtained from posix_acl_create() are dropped.
 *
 * Returns 0 on success or the error from posix_acl_create().
 */
int simple_acl_create(struct inode *dir, struct inode *inode)
{
        struct posix_acl *default_acl, *acl;
        int error;

        error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
        if (error)
                return error;

        set_cached_acl(inode, ACL_TYPE_DEFAULT, default_acl);
        set_cached_acl(inode, ACL_TYPE_ACCESS, acl);

        /*
         * Either ACL may be NULL here. posix_acl_release() accepts NULL
         * (do_set_acl() relies on that as well), so no guards are needed.
         */
        posix_acl_release(default_acl);
        posix_acl_release(acl);
        return 0;
}

/*
 * Rewrite, in place, the ids of all ACL_USER and ACL_GROUP entries of @acl
 * with from_vfsuid() / from_vfsgid(), using @idmap and @fs_userns. Entries
 * with any other tag are left untouched.
 *
 * Always returns 0.
 */
static int vfs_set_acl_idmapped_mnt(struct mnt_idmap *idmap,
                                    struct user_namespace *fs_userns,
                                    struct posix_acl *acl)
{
        int i;

        for (i = 0; i < acl->a_count; i++) {
                struct posix_acl_entry *entry = &acl->a_entries[i];

                if (entry->e_tag == ACL_USER)
                        entry->e_uid = from_vfsuid(idmap, fs_userns,
                                                   VFSUIDT_INIT(entry->e_uid));
                else if (entry->e_tag == ACL_GROUP)
                        entry->e_gid = from_vfsgid(idmap, fs_userns,
                                                   VFSGIDT_INIT(entry->e_gid));
        }

        return 0;
}

/**
 * vfs_set_acl - set posix acls
 * @idmap: idmap of the mount
 * @dentry: the dentry based on which to set the posix acls
 * @acl_name: the name of the posix acl
 * @kacl: the posix acls in the appropriate VFS format
 *
 * This function sets @kacl. The caller must call posix_acl_release() on @kacl
 * afterwards.
 *
 * The ids stored in @kacl are rewritten in place by
 * vfs_set_acl_idmapped_mnt() before the inode lock is taken. The permission
 * checks and the actual update then run under the inode lock.
 *
 * Return: On success 0, on error negative errno. -EINVAL is returned when
 * @acl_name does not name a POSIX ACL type, -EIO for a bad inode.
 */
int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *acl_name, struct posix_acl *kacl)
{
        int acl_type;
        int error;
        struct inode *inode = d_inode(dentry);
        struct inode *delegated_inode = NULL;

        acl_type = posix_acl_type(acl_name);
        if (acl_type < 0)
                return -EINVAL;

        if (kacl) {
                /*
                 * If we're on an idmapped mount translate from mount specific
                 * vfs{g,u}id_t into global filesystem k{g,u}id_t.
                 * Afterwards we can cache the POSIX ACLs filesystem wide and -
                 * if this is a filesystem with a backing store - ultimately
                 * translate them to backing store values.
                 */
                /* Done before retry_deleg, so a retry does not translate twice. */
                error = vfs_set_acl_idmapped_mnt(idmap, i_user_ns(inode), kacl);
                if (error)
                        return error;
        }

retry_deleg:
        inode_lock(inode);

        /*
         * We only care about restrictions the inode struct itself places upon
         * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
         */
        error = may_write_xattr(idmap, inode);
        if (error)
                goto out_inode_unlock;

        /* Give the security modules a chance to veto the change. */
        error = security_inode_set_acl(idmap, dentry, acl_name, kacl);
        if (error)
                goto out_inode_unlock;

        error = try_break_deleg(inode, &delegated_inode);
        if (error)
                goto out_inode_unlock;

        if (likely(!is_bad_inode(inode)))
                error = set_posix_acl(idmap, dentry, acl_type, kacl);
        else
                error = -EIO;
        /* Notifications are sent only after a successful update. */
        if (!error) {
                fsnotify_xattr(dentry);
                security_inode_post_set_acl(dentry, acl_name, kacl);
        }

out_inode_unlock:
        inode_unlock(inode);

        /*
         * If try_break_deleg() recorded an inode in delegated_inode, wait
         * for it with the inode lock dropped. The result of the wait
         * replaces @error, and the whole operation is retried when the
         * wait succeeded.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_set_acl);

/**
 * vfs_get_acl - get posix acls
 * @idmap: idmap of the mount
 * @dentry: the dentry based on which to retrieve the posix acls
 * @acl_name: the name of the posix acl
 *
 * This function retrieves the POSIX ACL named by @acl_name. The caller must
 * call posix_acl_release() on the returned ACL.
 *
 * Return: On success POSIX ACLs in VFS format, on error an ERR_PTR() encoded
 * negative errno: -EINVAL if @acl_name does not name a POSIX ACL type,
 * -EOPNOTSUPP if the inode does not use POSIX ACLs or is a symlink, and
 * -ENODATA if no such ACL is set. NULL is never returned.
 */
struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name)
{
        struct inode *inode = d_inode(dentry);
        struct posix_acl *acl;
        int type, err;

        type = posix_acl_type(acl_name);
        if (type < 0)
                return ERR_PTR(-EINVAL);

        /*
         * The VFS has no restrictions on reading POSIX ACLs so calling
         * something like xattr_permission() isn't needed. Only LSMs get a say.
         */
        err = security_inode_get_acl(idmap, dentry, acl_name);
        if (err)
                return ERR_PTR(err);

        if (!IS_POSIXACL(inode) || S_ISLNK(inode->i_mode))
                return ERR_PTR(-EOPNOTSUPP);

        /* Error pointers from the lookup are handed back unchanged. */
        acl = __get_acl(idmap, dentry, inode, type);
        if (!acl)
                return ERR_PTR(-ENODATA);

        return acl;
}
EXPORT_SYMBOL_GPL(vfs_get_acl);

/**
 * vfs_remove_acl - remove posix acls
 * @idmap: idmap of the mount
 * @dentry: the dentry based on which to retrieve the posix acls
 * @acl_name: the name of the posix acl
 *
 * This function removes posix acls. The removal is carried out by calling
 * set_posix_acl() with a NULL ACL while the inode lock is held.
 *
 * Return: On success 0, on error negative errno. -EINVAL is returned when
 * @acl_name does not name a POSIX ACL type, -EIO for a bad inode.
 */
int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name)
{
        int acl_type;
        int error;
        struct inode *inode = d_inode(dentry);
        struct inode *delegated_inode = NULL;

        acl_type = posix_acl_type(acl_name);
        if (acl_type < 0)
                return -EINVAL;

retry_deleg:
        inode_lock(inode);

        /*
         * We only care about restrictions the inode struct itself places upon
         * us otherwise POSIX ACLs aren't subject to any VFS restrictions.
         */
        error = may_write_xattr(idmap, inode);
        if (error)
                goto out_inode_unlock;

        /* Give the security modules a chance to veto the removal. */
        error = security_inode_remove_acl(idmap, dentry, acl_name);
        if (error)
                goto out_inode_unlock;

        error = try_break_deleg(inode, &delegated_inode);
        if (error)
                goto out_inode_unlock;

        if (likely(!is_bad_inode(inode)))
                error = set_posix_acl(idmap, dentry, acl_type, NULL);
        else
                error = -EIO;
        /* Notifications are sent only after a successful removal. */
        if (!error) {
                fsnotify_xattr(dentry);
                security_inode_post_remove_acl(idmap, dentry, acl_name);
        }

out_inode_unlock:
        inode_unlock(inode);

        /*
         * If try_break_deleg() recorded an inode in delegated_inode, wait
         * for it with the inode lock dropped. The result of the wait
         * replaces @error, and the whole operation is retried when the
         * wait succeeded.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_remove_acl);

/*
 * Set the POSIX ACL named @acl_name from its uapi xattr representation.
 *
 * A non-zero @size means @kvalue is converted with posix_acl_from_xattr()
 * relative to the caller's user namespace; a zero @size passes a NULL ACL
 * on to vfs_set_acl(). The converted ACL is released before returning.
 *
 * Returns the result of vfs_set_acl(), or the conversion error.
 */
int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
               const char *acl_name, const void *kvalue, size_t size)
{
        struct posix_acl *acl = NULL;
        int ret;

        if (size != 0) {
                /*
                 * Note that posix_acl_from_xattr() uses GFP_NOFS when it
                 * probably doesn't need to here.
                 */
                acl = posix_acl_from_xattr(current_user_ns(), kvalue, size);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
        }

        ret = vfs_set_acl(idmap, dentry, acl_name, acl);
        posix_acl_release(acl);

        return ret;
}

/*
 * Fetch the POSIX ACL named @acl_name and write its uapi xattr
 * representation into @kvalue (of @size bytes).
 *
 * The ACL obtained from vfs_get_acl() is released before returning.
 *
 * Returns the result of vfs_posix_acl_to_xattr(), or the negative errno
 * encoded in the vfs_get_acl() result when the lookup fails.
 */
ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name, void *kvalue, size_t size)
{
        struct posix_acl *acl;
        ssize_t ret;

        acl = vfs_get_acl(idmap, dentry, acl_name);
        if (IS_ERR(acl))
                return PTR_ERR(acl);

        ret = vfs_posix_acl_to_xattr(idmap, d_inode(dentry),
                                     acl, kvalue, size);
        posix_acl_release(acl);

        return ret;
}












































































































































  202 






    4 







    4 






    4 











    4 



    4 








    4 





















    4 






























    4 










    4 


















    4 














    4 
    4 
    4 




































    4 








































































































































    4 















    4 






















































    4 




























































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002        Andrew Morton
 *                Initial version.
 */

/**
 * DOC: Readahead Overview
 *
 * Readahead is used to read content into the page cache before it is
 * explicitly requested by the application.  Readahead only ever
 * attempts to read folios that are not yet in the page cache.  If a
 * folio is present but not up-to-date, readahead will not try to read
 * it. In that case a simple ->read_folio() will be requested.
 *
 * Readahead is triggered when an application read request (whether a
 * system call or a page fault) finds that the requested folio is not in
 * the page cache, or that it is in the page cache and has the
 * readahead flag set.  This flag indicates that the folio was read
 * as part of a previous readahead request and now that it has been
 * accessed, it is time for the next readahead.
 *
 * Each readahead request is partly a synchronous read, and partly an
 * async readahead.  This is reflected in the struct file_ra_state which
 * contains ->size being the total number of pages, and ->async_size
 * which is the number of pages in the async section.  The readahead
 * flag will be set on the first folio in this async section to trigger
 * a subsequent readahead.  Once a series of sequential reads has been
 * established, there should be no need for a synchronous component and
 * all readahead requests will be fully asynchronous.
 *
 * When either of the triggers causes a readahead, three numbers need
 * to be determined: the start of the region to read, the size of the
 * region, and the size of the async tail.
 *
 * The start of the region is simply the first page address at or after
 * the accessed address, which is not currently populated in the page
 * cache.  This is found with a simple search in the page cache.
 *
 * The size of the async tail is determined by subtracting the size that
 * was explicitly requested from the determined request size, unless
 * this would be less than zero - then zero is used.  NOTE THIS
 * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 * PAGE.  ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
 *
 * The size of the region is normally determined from the size of the
 * previous readahead which loaded the preceding pages.  This may be
 * discovered from the struct file_ra_state for simple sequential reads,
 * or from examining the state of the page cache when multiple
 * sequential reads are interleaved.  Specifically: where the readahead
 * was triggered by the readahead flag, the size of the previous
 * readahead is assumed to be the number of pages from the triggering
 * page to the start of the new readahead.  In these cases, the size of
 * the previous readahead is scaled, often doubled, for the new
 * readahead, though see get_next_ra_size() for details.
 *
 * If the size of the previous read cannot be determined, the number of
 * preceding pages in the page cache is used to estimate the size of
 * a previous read.  This estimate could easily be misled by random
 * reads being coincidentally adjacent, so it is ignored unless it is
 * larger than the current request, and it is not scaled up, unless it
 * is at the start of file.
 *
 * In general readahead is accelerated at the start of the file, as
 * reads from there are often sequential.  There are other minor
 * adjustments to the readahead size in various special cases and these
 * are best discovered by reading the code.
 *
 * The above calculation, based on the previous readahead size,
 * determines the size of the readahead, to which any requested read
 * size may be added.
 *
 * Readahead requests are sent to the filesystem using the ->readahead()
 * address space operation, for which mpage_readahead() is a canonical
 * implementation.  ->readahead() should normally initiate reads on all
 * folios, but may fail to read any or all folios without causing an I/O
 * error.  The page cache reading code will issue a ->read_folio() request
 * for any folio which ->readahead() did not read, and only an error
 * from this will be final.
 *
 * ->readahead() will generally call readahead_folio() repeatedly to get
 * each folio from those prepared for readahead.  It may fail to read a
 * folio by:
 *
 * * not calling readahead_folio() sufficiently many times, effectively
 *   ignoring some folios, as might be appropriate if the path to
 *   storage is congested.
 *
 * * failing to actually submit a read request for a given folio,
 *   possibly due to insufficient resources, or
 *
 * * getting an error during subsequent processing of a request.
 *
 * In the last two cases, the folio should be unlocked by the filesystem
 * to indicate that the read attempt has failed.  In the first case the
 * folio will be unlocked by the VFS.
 *
 * Those folios not in the final ``async_size`` of the request should be
 * considered to be important and ->readahead() should not fail them due
 * to congestion or temporary resource unavailability, but should wait
 * for necessary resources (e.g.  memory or indexing information) to
 * become available.  Folios in the final ``async_size`` may be
 * considered less urgent and failure to read them is more acceptable.
 * In this case it is best to use filemap_remove_folio() to remove the
 * folios from the page cache as is automatically done for folios that
 * were not fetched with readahead_folio().  This will allow a
 * subsequent synchronous readahead request to try them again.  If they
 * are left in the page cache, then they will be read individually using
 * ->read_folio() which may be less efficient.
 */

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagemap.h>
#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

/*
 * Submit the folios currently queued in @rac for reading.
 *
 * An empty request is a no-op.  Otherwise the batch is given to the
 * mapping's ->readahead() when it has one; folios that ->readahead() left
 * unconsumed are then removed from the page cache and unlocked here.
 * Without ->readahead(), every folio is passed to ->read_folio() in turn.
 *
 * Submission happens under a block plug and, when the batch is flagged as
 * workingset, inside a PSI memstall section.  The workingset flag is
 * re-read after submission and cleared before returning.  The request must
 * be empty on exit; this is enforced with BUG_ON().
 */
static void read_pages(struct readahead_control *rac)
{
        const struct address_space_operations *ops = rac->mapping->a_ops;
        struct blk_plug plug;
        struct folio *f;

        if (readahead_count(rac) == 0)
                return;

        if (unlikely(rac->_workingset))
                psi_memstall_enter(&rac->_pflags);
        blk_start_plug(&plug);

        if (!ops->readahead) {
                for (f = readahead_folio(rac); f; f = readahead_folio(rac))
                        ops->read_folio(rac->file, f);
        } else {
                ops->readahead(rac);
                /*
                 * Whatever the filesystem did not take is dropped from the
                 * page cache; an extra reference is held across the
                 * removal and unlock.
                 */
                for (f = readahead_folio(rac); f; f = readahead_folio(rac)) {
                        folio_get(f);
                        filemap_remove_folio(f);
                        folio_unlock(f);
                        folio_put(f);
                }
        }

        blk_finish_plug(&plug);
        if (unlikely(rac->_workingset))
                psi_memstall_leave(&rac->_pflags);
        rac->_workingset = false;

        BUG_ON(readahead_count(rac));
}

/*
 * Allocate a folio of the given @order for a readahead request.
 *
 * The folio comes from filemap_alloc_folio(); when the request has
 * ->dropbehind set, the new folio is flagged dropbehind as well.
 * Returns NULL if the allocation fails.
 */
static struct folio *ractl_alloc_folio(struct readahead_control *ractl,
                                       gfp_t gfp_mask, unsigned int order)
{
        struct folio *fresh = filemap_alloc_folio(gfp_mask, order);

        if (!fresh)
                return NULL;

        if (ractl->dropbehind)
                __folio_set_dropbehind(fresh);

        return fresh;
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead, counted back from the
 *      end of the request.  The folio added at that position gets the
 *      readahead flag.  No position is computed when this is larger than
 *      @nr_to_read.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * The range is walked in steps of the mapping's minimum folio size.  For
 * each step a folio is allocated and added to the page cache.  When a folio
 * is already present, or adding fails with anything other than -ENOMEM, the
 * batch collected so far is submitted and the walk resumes after the
 * occupied slot.  Allocation failure or -ENOMEM ends the walk early.
 * Whatever has been collected is submitted at the end.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        /* mark == ULONG_MAX means "no folio gets the readahead flag". */
        unsigned long mark = ULONG_MAX, i = 0;
        unsigned int min_nrpages = mapping_min_folio_nrpages(mapping);

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O.  Adding another page may need to allocate memory,
         * which can trigger memory reclaim.  Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock.  Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        filemap_invalidate_lock_shared(mapping);
        index = mapping_align_index(mapping, index);

        /*
         * As iterator `i` is aligned to min_nrpages, round_up the
         * difference between nr_to_read and lookahead_size to mark the
         * index that only has lookahead or "async_region" to set the
         * readahead flag.
         */
        if (lookahead_size <= nr_to_read) {
                unsigned long ra_folio_index;

                ra_folio_index = round_up(readahead_index(ractl) +
                                          nr_to_read - lookahead_size,
                                          min_nrpages);
                /* mark is an offset from the aligned index, like `i`. */
                mark = ra_folio_index - index;
        }
        /*
         * Extend the count by the distance between the caller's index and
         * the aligned one, so the end of the range stays where it was, and
         * start the request at the aligned index.
         */
        nr_to_read += readahead_index(ractl) - index;
        ractl->_index = index;

        /*
         * Preallocate as many pages as we will need.
         */
        while (i < nr_to_read) {
                struct folio *folio = xa_load(&mapping->i_pages, index + i);
                int ret;

                if (folio && !xa_is_value(folio)) {
                        /*
                         * Page already present?  Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch.  This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl);
                        /*
                         * Step the request past the occupied slot and
                         * recompute `i` from the request's new position.
                         */
                        ractl->_index += min_nrpages;
                        i = ractl->_index + ractl->_nr_pages - index;
                        continue;
                }

                folio = ractl_alloc_folio(ractl, gfp_mask,
                                        mapping_min_folio_order(mapping));
                if (!folio)
                        break;

                ret = filemap_add_folio(mapping, folio, index + i, gfp_mask);
                if (ret < 0) {
                        folio_put(folio);
                        /* Out of memory: stop adding, submit what we have. */
                        if (ret == -ENOMEM)
                                break;
                        /*
                         * Any other failure is handled like an occupied
                         * slot: submit the batch and skip this position.
                         */
                        read_pages(ractl);
                        ractl->_index += min_nrpages;
                        i = ractl->_index + ractl->_nr_pages - index;
                        continue;
                }
                if (i == mark)
                        folio_set_readahead(folio);
                ractl->_workingset |= folio_test_workingset(folio);
                ractl->_nr_pages += min_nrpages;
                i += min_nrpages;
        }

        /*
         * Now start the IO.  We ignore I/O errors - if the folio is not
         * uptodate then the caller will launch read_folio again, and
         * will then handle the error.
         */
        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);

/*
 * do_page_cache_ra() clamps a readahead request to the file size and then
 * hands it to page_cache_ra_unbounded(), which allocates the pages first
 * and submits them for I/O afterwards.  Allocating up front avoids the very
 * bad behaviour which would occur if page allocations were causing VM
 * writeback: we really don't want to intermingle reads and writes like
 * that.
 *
 * Nothing is read for an empty file or when the request starts beyond the
 * page holding the last byte of the file.
 */
static void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        unsigned long first = readahead_index(ractl);
        loff_t isize = i_size_read(ractl->mapping->host);
        pgoff_t last;        /* index of the page holding the final byte */

        if (!isize)
                return;

        last = (isize - 1) >> PAGE_SHIFT;
        if (first > last)
                return;

        /* Trim the request so it ends on the last page of the file. */
        if (nr_to_read > last - first)
                nr_to_read = last - first + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Read @nr_to_read pages unconditionally, issuing the readahead in units of
 * at most 2 megabytes so that we don't pin too much memory at once.
 *
 * The request is first capped at the larger of the backing device's
 * io_pages and the file's ra_pages.  No lookahead is requested for any of
 * the chunks.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read)
{
        const unsigned long chunk_pages = (2 * 1024 * 1024) / PAGE_SIZE;
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long cap, remaining, batch;

        /* Without either read method there is nothing we can do. */
        if (unlikely(!(mapping->a_ops->read_folio ||
                       mapping->a_ops->readahead)))
                return;

        /*
         * A request larger than the readahead window may still go up to
         * the optimal hardware IO size.
         */
        cap = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        remaining = min_t(unsigned long, nr_to_read, cap);

        for (; remaining; remaining -= batch) {
                batch = min_t(unsigned long, remaining, chunk_pages);
                do_page_cache_ra(ractl, batch, 0);
        }
}

/*
 * Pick the initial readahead window size for a request of @size pages.
 *
 * @size is rounded up to a power of two, then:
 *   - quadrupled if the rounded value is at most max / 32,
 *   - doubled if it is at most max / 4,
 *   - replaced by @max otherwise.
 *
 * With a 128k (32 page) maximum this gives:
 * 1-2 pages = 16k, 3-4 pages = 32k, 5-8 pages = 64k, > 8 pages = 128k.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long pow2 = roundup_pow_of_two(size);

        if (pow2 > max / 4)
                return max;
        if (pow2 > max / 32)
                return pow2 * 2;
        return pow2 * 4;
}

/*
 * Compute the next window size from the previous one (ra->size):
 * four times as large while it is below max / 16, twice as large while it
 * is at most max / 2, and @max beyond that.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long prev = ra->size;

        if (prev > max / 2)
                return max;
        if (prev >= max / 16)
                return prev * 2;
        return prev * 4;
}

/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */

/*
 * Allocate one folio of @order and add it to the page cache at @index as
 * part of @ractl.
 *
 * The folio gets the readahead flag when @index equals @mark rounded down
 * to the folio size.  On success the request grows by the folio's page
 * count and its workingset state is folded into the request.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
 * filemap_add_folio().
 */
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
                pgoff_t mark, unsigned int order, gfp_t gfp)
{
        struct folio *folio = ractl_alloc_folio(ractl, gfp, order);
        unsigned long nr = 1UL << order;
        int ret;

        if (!folio)
                return -ENOMEM;

        if (index == round_down(mark, nr))
                folio_set_readahead(folio);

        ret = filemap_add_folio(ractl->mapping, folio, index, gfp);
        if (ret) {
                folio_put(folio);
                return ret;
        }

        ractl->_workingset |= folio_test_workingset(folio);
        ractl->_nr_pages += nr;

        return 0;
}

/*
 * page_cache_ra_order - populate the readahead window using large folios
 * @ractl: Readahead control; its index is the first page of the window.
 * @ra: Readahead state supplying ->size and ->async_size for this window.
 * @new_order: Folio order hint.  It is raised by two when below the
 *             mapping's maximum order and then clamped (see below).
 *
 * Folios are added from the (aligned) start of the window up to the smaller
 * of the window end and the last page of the file, then submitted with
 * read_pages().
 *
 * Falls back to do_page_cache_ra() when the mapping has no large folio
 * support, when the window is smaller than min_ra_size, or when adding a
 * folio fails part-way through.
 */
void page_cache_ra_order(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned int new_order)
{
        struct address_space *mapping = ractl->mapping;
        pgoff_t start = readahead_index(ractl);
        pgoff_t index = start;
        unsigned int min_order = mapping_min_folio_order(mapping);
        /* Index of the page holding the last byte of the file. */
        pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
        /* Page index that should carry the readahead flag. */
        pgoff_t mark = index + ra->size - ra->async_size;
        unsigned int nofs;
        int err = 0;
        gfp_t gfp = readahead_gfp_mask(mapping);
        unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping));

        /*
         * Fallback when size < min_nrpages as each folio should be
         * at least min_nrpages anyway.
         */
        if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size)
                goto fallback;

        /* Do not go past the end of the readahead window either. */
        limit = min(limit, index + ra->size - 1);

        if (new_order < mapping_max_folio_order(mapping))
                new_order += 2;

        /*
         * Clamp the order: no larger than the mapping's maximum, no larger
         * than ilog2 of the window size, and no smaller than the mapping's
         * minimum.
         */
        new_order = min(mapping_max_folio_order(mapping), new_order);
        new_order = min_t(unsigned int, new_order, ilog2(ra->size));
        new_order = max(new_order, min_order);

        /* See comment in page_cache_ra_unbounded() */
        nofs = memalloc_nofs_save();
        filemap_invalidate_lock_shared(mapping);
        /*
         * If the new_order is greater than min_order and index is
         * already aligned to new_order, then this will be noop as index
         * aligned to new_order should also be aligned to min_order.
         */
        ractl->_index = mapping_align_index(mapping, index);
        index = readahead_index(ractl);

        while (index <= limit) {
                unsigned int order = new_order;

                /*
                 * Align with smaller pages if needed: when index is not a
                 * multiple of the folio size, use the order given by its
                 * lowest set bit instead.
                 */
                if (index & ((1UL << order) - 1))
                        order = __ffs(index);
                /* Don't allocate pages past EOF */
                while (order > min_order && index + (1UL << order) - 1 > limit)
                        order--;
                err = ra_alloc_folio(ractl, index, mark, order, gfp);
                if (err)
                        break;
                index += 1UL << order;
        }

        /* Submit whatever was added, even if the loop stopped on an error. */
        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);

        /*
         * If there were already pages in the page cache, then we may have
         * left some gaps.  Let the regular readahead code take care of this
         * situation below.
         */
        if (!err)
                return;
fallback:
        /*
         * ->readahead() may have updated readahead window size so we have to
         * check there's still something to read.
         */
        if (ra->size > index - start)
                do_page_cache_ra(ractl, ra->size - (index - start),
                                 ra->async_size);
}

/*
 * Return the maximum number of pages a readahead for @ractl may cover.
 *
 * This is normally the file's ra_pages.  Only when both the request and
 * the backing device's io_pages exceed that window is the limit raised, to
 * the smaller of @req_size and io_pages, so that the read may be up to the
 * optimal hardware IO size.
 */
static unsigned long ractl_max_pages(struct readahead_control *ractl,
                unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        unsigned long window = ractl->ra->ra_pages;

        if (req_size <= window || bdi->io_pages <= window)
                return window;

        return min(req_size, bdi->io_pages);
}

/*
 * page_cache_sync_ra - decide on and issue a synchronous readahead
 * @ractl: Readahead control; its index is the first page wanted and ->ra is
 *         the readahead state that may be updated.
 * @req_count: Number of pages requested.
 *
 * One of the following happens:
 *  - forced readahead of the request via force_page_cache_ra(), for files
 *    with FMODE_RANDOM, or (limited to one page) when readahead is disabled
 *    or the block cgroup is congested;
 *  - a fresh initial window, for a read at the start of the file, an
 *    oversized read, or one that continues from the previous position;
 *  - a plain read of @req_count pages that leaves the readahead state
 *    untouched, when too few preceding pages are cached;
 *  - a window sized from the run of cached pages preceding the index.
 */
void page_cache_sync_ra(struct readahead_control *ractl,
                unsigned long req_count)
{
        pgoff_t index = readahead_index(ractl);
        bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);
        struct file_ra_state *ra = ractl->ra;
        unsigned long max_pages, contig_count;
        pgoff_t prev_index, miss;

        /*
         * Even if readahead is disabled, issue this request as readahead
         * as we'll need it to satisfy the requested range. The forced
         * readahead will do the right thing and limit the read to just the
         * requested range, which we'll set to 1 page for this case.
         */
        if (!ra->ra_pages || blk_cgroup_congested()) {
                /* Without a file nothing is read at all in this case. */
                if (!ractl->file)
                        return;
                req_count = 1;
                do_forced_ra = true;
        }

        /* be dumb */
        if (do_forced_ra) {
                force_page_cache_ra(ractl, req_count);
                return;
        }

        max_pages = ractl_max_pages(ractl, req_count);
        /*
         * Page index of the previous read position.  prev_pos may be -1
         * (see file_ra_state_init()), hence the unsigned cast before the
         * shift.
         */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        /*
         * A start of file, oversized read, or sequential cache miss:
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        if (!index || req_count > max_pages || index - prev_index <= 1UL) {
                ra->start = index;
                ra->size = get_init_ra_size(req_count, max_pages);
                /*
                 * Async part: what the window holds beyond the request, or
                 * half the window when the request fills it.
                 */
                ra->async_size = ra->size > req_count ? ra->size - req_count :
                                                        ra->size >> 1;
                goto readit;
        }

        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
        rcu_read_lock();
        miss = page_cache_prev_miss(ractl->mapping, index - 1, max_pages);
        rcu_read_unlock();
        /* Number of cached pages between the miss and index. */
        contig_count = index - miss - 1;
        /*
         * Standalone, small random read. Read as is, and do not pollute the
         * readahead state.
         */
        if (contig_count <= req_count) {
                do_page_cache_ra(ractl, req_count, 0);
                return;
        }
        /*
         * File cached from the beginning:
         * it is a strong indication of long-run stream (or whole-file-read)
         */
        if (miss == ULONG_MAX)
                contig_count *= 2;
        ra->start = index;
        ra->size = min(contig_count + req_count, max_pages);
        ra->async_size = 1;
readit:
        /* Issue the window just set up, starting from folio order 0. */
        ractl->_index = ra->start;
        page_cache_ra_order(ractl, ra, 0);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

/*
 * page_cache_async_ra - issue the next readahead after hitting a marked folio
 * @ractl: Readahead control; its index is the page that was accessed and
 *         ->ra is the readahead state to advance.
 * @folio: The folio that was hit; its readahead flag is cleared here and
 *         its order is passed on as the order hint for the new window.
 * @req_count: Number of pages requested.
 *
 * Does nothing when readahead is disabled or @folio is under writeback.
 * When the block cgroup is congested, the flag is cleared but no readahead
 * is issued.
 *
 * If the index matches the position where the current window expected its
 * marker, the window is pushed forward and ramped up.  Otherwise the page
 * cache is searched for the next missing page and a new window is built
 * from there.
 */
void page_cache_async_ra(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_count)
{
        unsigned long max_pages;
        struct file_ra_state *ra = ractl->ra;
        pgoff_t index = readahead_index(ractl);
        pgoff_t expected, start;
        unsigned int order = folio_order(folio);

        /* no readahead */
        if (!ra->ra_pages)
                return;

        /*
         * Same bit is used for PG_readahead and PG_reclaim.
         */
        if (folio_test_writeback(folio))
                return;

        folio_clear_readahead(folio);

        if (blk_cgroup_congested())
                return;

        max_pages = ractl_max_pages(ractl, req_count);
        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        expected = round_down(ra->start + ra->size - ra->async_size,
                        1UL << order);
        if (index == expected) {
                /* The new window begins where the old one ended. */
                ra->start += ra->size;
                /*
                 * In the case of MADV_HUGEPAGE, the actual size might exceed
                 * the readahead window.
                 */
                ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked folio without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        rcu_read_lock();
        start = page_cache_next_miss(ractl->mapping, index + 1, max_pages);
        rcu_read_unlock();

        /* Give up when the result is 0 or lies more than max_pages ahead. */
        if (!start || start - index > max_pages)
                return;

        ra->start = start;
        ra->size = start - index;        /* old async_size */
        ra->size += req_count;
        /* get_next_ra_size() reads ra->size, so this ramps the sum above. */
        ra->size = get_next_ra_size(ra, max_pages);
        ra->async_size = ra->size;
readit:
        ractl->_index = ra->start;
        page_cache_ra_order(ractl, ra, order);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);

/*
 * ksys_readahead - common implementation of the readahead system call
 *
 * Returns -EBADF when @fd does not refer to an open file or the file is
 * not open for reading, and -EINVAL when the file cannot do readahead (no
 * mapping, no address space operations, or neither a regular file nor a
 * block device).  Otherwise the request is passed to vfs_fadvise() as
 * POSIX_FADV_WILLNEED and its result is returned.
 */
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        CLASS(fd, f)(fd);
        struct file *file;
        umode_t mode;

        if (fd_empty(f))
                return -EBADF;

        file = fd_file(f);
        if (!(file->f_mode & FMODE_READ))
                return -EBADF;

        /*
         * The readahead() syscall is intended to run only on files
         * that can execute readahead. If readahead is not possible
         * on this file, then we must return -EINVAL.
         */
        if (!file->f_mapping || !file->f_mapping->a_ops)
                return -EINVAL;

        mode = file_inode(file)->i_mode;
        if (!S_ISREG(mode) && !S_ISBLK(mode))
                return -EINVAL;

        return vfs_fadvise(file, offset, count, POSIX_FADV_WILLNEED);
}

/*
 * readahead system call entry point: passes @fd, @offset and @count
 * unchanged to ksys_readahead() and returns its result.
 */
SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD)
/*
 * Compat entry point for readahead, built only when CONFIG_COMPAT is set
 * and the architecture defines __ARCH_WANT_COMPAT_READAHEAD.  The offset is
 * declared with compat_arg_u64_dual() and recombined with
 * compat_arg_u64_glue() before calling ksys_readahead().
 * NOTE(review): presumably the 64-bit offset arrives as two 32-bit halves;
 * confirm against the macro definitions.
 */
COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count)
{
        return ksys_readahead(fd, compat_arg_u64_glue(offset), count);
}
#endif

/**
 * readahead_expand - Expand a readahead request
 * @ractl: The request to be expanded
 * @new_start: The revised start
 * @new_len: The revised size of the request
 *
 * Attempt to expand a readahead request outwards from the current size to the
 * specified size by inserting locked pages before and after the current window
 * to increase the size to the new window.  This may involve the insertion of
 * THPs, in which case the window may get expanded even beyond what was
 * requested.
 *
 * The algorithm will stop if it encounters a conflicting page already in the
 * pagecache and leave a smaller expansion than requested.
 *
 * The caller must check for this by examining the revised @ractl object for a
 * different expansion than was requested.
 */
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len)
{
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        pgoff_t new_index, new_nr_pages;
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        /* Minimum folio size supported by this mapping, in pages and as an order. */
        unsigned long min_nrpages = mapping_min_folio_nrpages(mapping);
        unsigned int min_order = mapping_min_folio_order(mapping);

        /* Page index that contains the requested new start offset. */
        new_index = new_start / PAGE_SIZE;
        /*
         * Readahead code should have aligned the ractl->_index to
         * min_nrpages before calling readahead aops.
         */
        VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages));

        /* Expand the leading edge downwards */
        while (ractl->_index > new_index) {
                /* Look at the slot immediately below the current window start. */
                unsigned long index = ractl->_index - 1;
                struct folio *folio = xa_load(&mapping->i_pages, index);

                /* A non-value entry means something is already cached there: stop. */
                if (folio && !xa_is_value(folio))
                        return; /* Folio apparently present */

                folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
                if (!folio)
                        return;

                /* Insert at the index as aligned by mapping_align_index(). */
                index = mapping_align_index(mapping, index);
                if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
                        /* Insertion failed: drop our reference and give up expanding. */
                        folio_put(folio);
                        return;
                }
                /* Enter the PSI memstall section only once per request. */
                if (unlikely(folio_test_workingset(folio)) &&
                                !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                /* Account for the new folio and move the window start down to it. */
                ractl->_nr_pages += min_nrpages;
                ractl->_index = folio->index;
        }

        /*
         * Re-base the requested length on the current readahead position
         * (as reported by readahead_pos()) and convert it to whole pages.
         */
        new_len += new_start - readahead_pos(ractl);
        new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);

        /* Expand the trailing edge upwards */
        while (ractl->_nr_pages < new_nr_pages) {
                /* First index past the end of the current window. */
                unsigned long index = ractl->_index + ractl->_nr_pages;
                struct folio *folio = xa_load(&mapping->i_pages, index);

                /* A non-value entry means something is already cached there: stop. */
                if (folio && !xa_is_value(folio))
                        return; /* Folio apparently present */

                folio = ractl_alloc_folio(ractl, gfp_mask, min_order);
                if (!folio)
                        return;

                index = mapping_align_index(mapping, index);
                if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) {
                        /* Insertion failed: drop our reference and give up expanding. */
                        folio_put(folio);
                        return;
                }
                /* Enter the PSI memstall section only once per request. */
                if (unlikely(folio_test_workingset(folio)) &&
                                !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                ractl->_nr_pages += min_nrpages;
                /* Keep the file's readahead state in step when one is attached. */
                if (ra) {
                        ra->size += min_nrpages;
                        ra->async_size += min_nrpages;
                }
        }
}
EXPORT_SYMBOL(readahead_expand);













































































































































































  189 























  265 




  265 



















































































































































































    7 


























































    5 


















    7 


    7 



























































































    8 


















    7 

    1 


























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/bitmap.h>
#include <linux/bug.h>
#include <linux/export.h>
#include <linux/idr.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>

/**
 * idr_alloc_u32() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @nextid: Pointer to an ID.
 * @max: The maximum ID to allocate (inclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @nextid and @max.
 * Note that @max is inclusive whereas the @end parameter to idr_alloc()
 * is exclusive.  The new ID is assigned to @nextid before the pointer
 * is inserted into the IDR, so if @nextid points into the object pointed
 * to by @ptr, a concurrent lookup will not find an uninitialised ID.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.  If an error occurred,
 * @nextid is unchanged.
 */
int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid,
                        unsigned long max, gfp_t gfp)
{
        const unsigned int offset = idr->idr_base;
        unsigned int first = *nextid;
        struct radix_tree_iter iter;
        void __rcu **slot;

        /* Complain once, then repair, a tree that was not set up as an IDR. */
        if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR)))
                idr->idr_rt.xa_flags |= IDR_RT_MARKER;

        /* Translate the requested ID into a tree index, clamping at 0. */
        if (first < offset)
                first = 0;
        else
                first -= offset;

        radix_tree_iter_init(&iter, first);
        slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - offset);
        if (IS_ERR(slot))
                return PTR_ERR(slot);

        /* Publish the ID before the pointer becomes visible in the tree. */
        *nextid = iter.index + offset;
        /* there is a memory barrier inside radix_tree_iter_replace() */
        radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr);
        radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE);

        return 0;
}
EXPORT_SYMBOL_GPL(idr_alloc_u32);

/**
 * idr_alloc() - Allocate an ID.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @start and @end.  If
 * @end is <= 0, it is treated as one larger than %INT_MAX.  This allows
 * callers to use @start + N as @end as long as N is within integer range.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        int last = INT_MAX;
        u32 id;
        int err;

        /* Negative starting IDs are a caller bug. */
        if (WARN_ON_ONCE(start < 0))
                return -EINVAL;

        /* @end is exclusive; a non-positive @end means "up to INT_MAX". */
        if (end > 0)
                last = end - 1;

        id = start;
        err = idr_alloc_u32(idr, ptr, &id, last, gfp);

        return err ? err : (int)id;
}
EXPORT_SYMBOL_GPL(idr_alloc);

/**
 * idr_alloc_cyclic() - Allocate an ID cyclically.
 * @idr: IDR handle.
 * @ptr: Pointer to be associated with the new ID.
 * @start: The minimum ID (inclusive).
 * @end: The maximum ID (exclusive).
 * @gfp: Memory allocation flags.
 *
 * Allocates an unused ID in the range specified by @start and @end.  If
 * @end is <= 0, it is treated as one larger than %INT_MAX.  This allows
 * callers to use @start + N as @end as long as N is within integer range.
 * The search for an unused ID will start at the last ID allocated and will
 * wrap around to @start if no free IDs are found before reaching @end.
 *
 * The caller should provide their own locking to ensure that two
 * concurrent modifications to the IDR are not possible.  Read-only
 * accesses to the IDR may be done under the RCU read lock or may
 * exclude simultaneous writers.
 *
 * Return: The newly allocated ID, -ENOMEM if memory allocation failed,
 * or -ENOSPC if no free IDs could be found.
 */
int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp)
{
        /* @end is exclusive; a non-positive @end means "up to INT_MAX". */
        const int max = (end <= 0) ? INT_MAX : end - 1;
        u32 id = idr->idr_next;
        int err;

        /* Never begin the search below the caller's lower bound. */
        if ((int)id < start)
                id = start;

        err = idr_alloc_u32(idr, ptr, &id, max, gfp);
        if (err == -ENOSPC && id > start) {
                /* Nothing free from the cursor upwards: wrap to @start. */
                id = start;
                err = idr_alloc_u32(idr, ptr, &id, max, gfp);
        }

        if (!err) {
                /* Remember where the next cyclic search should begin. */
                idr->idr_next = id + 1;
                return id;
        }

        return err;
}
EXPORT_SYMBOL(idr_alloc_cyclic);

/**
 * idr_remove() - Remove an ID from the IDR.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Removes this ID from the IDR.  If the ID was not previously in the IDR,
 * this function returns %NULL.
 *
 * Since this function modifies the IDR, the caller should provide their
 * own locking to ensure that concurrent modification of the same IDR is
 * not possible.
 *
 * Return: The pointer formerly associated with this ID.
 */
void *idr_remove(struct idr *idr, unsigned long id)
{
        /* IDs are stored in the tree relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_delete_item(&idr->idr_rt, index, NULL);
}
EXPORT_SYMBOL_GPL(idr_remove);

/**
 * idr_find() - Return pointer for given ID.
 * @idr: IDR handle.
 * @id: Pointer ID.
 *
 * Looks up the pointer associated with this ID.  A %NULL pointer may
 * indicate that @id is not allocated or that the %NULL pointer was
 * associated with this ID.
 *
 * This function can be called under rcu_read_lock(), provided that the
 * lifetimes of the leaf pointers are correctly managed.
 *
 * Return: The pointer associated with this ID.
 */
void *idr_find(const struct idr *idr, unsigned long id)
{
        /* IDs are stored in the tree relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;

        return radix_tree_lookup(&idr->idr_rt, index);
}
EXPORT_SYMBOL_GPL(idr_find);

/**
 * idr_for_each() - Iterate through all stored pointers.
 * @idr: IDR handle.
 * @fn: Function to be called for each pointer.
 * @data: Data passed to callback function.
 *
 * The callback function will be called for each entry in @idr, passing
 * the ID, the entry and @data.
 *
 * If @fn returns anything other than %0, the iteration stops and that
 * value is returned from this function.
 *
 * idr_for_each() can be called concurrently with idr_alloc() and
 * idr_remove() if protected by RCU.  Newly added entries may not be
 * seen and deleted entries may be seen, but adding and removing entries
 * will not cause other entries to be skipped, nor spurious ones to be seen.
 */
int idr_for_each(const struct idr *idr,
                int (*fn)(int id, void *p, void *data), void *data)
{
        int offset = idr->idr_base;
        struct radix_tree_iter iter;
        void __rcu **slot;

        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) {
                /* Convert the tree index back into the caller-visible ID. */
                unsigned long id = iter.index + offset;
                int rc;

                /* The callback takes an int; stop once IDs no longer fit. */
                if (WARN_ON_ONCE(id > INT_MAX))
                        break;

                rc = fn(id, rcu_dereference_raw(*slot), data);
                if (rc != 0)
                        return rc;
        }

        return 0;
}
EXPORT_SYMBOL(idr_for_each);

/**
 * idr_get_next_ul() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 */
void *idr_get_next_ul(struct idr *idr, unsigned long *nextid)
{
        struct radix_tree_iter iter;
        void __rcu **slot;
        void *entry = NULL;
        unsigned long base = idr->idr_base;
        unsigned long id = *nextid;

        /* Convert the caller's ID to a tree index; IDs below the base start at index 0. */
        id = (id < base) ? 0 : id - base;
        radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) {
                entry = rcu_dereference_raw(*slot);
                /* Empty slot: keep scanning. */
                if (!entry)
                        continue;
                /* An ordinary (non-internal) entry is the result. */
                if (!xa_is_internal(entry))
                        break;
                /*
                 * An internal entry is still accepted unless it was read from
                 * the root slot or is a retry entry.
                 */
                if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry))
                        break;
                /* Otherwise hand the iterator back to radix_tree_iter_retry(). */
                slot = radix_tree_iter_retry(&iter);
        }
        /*
         * NOTE(review): slot is presumably NULL here only when the iteration
         * ran out of slots without breaking -- confirm against the
         * radix_tree_for_each_slot() definition.
         */
        if (!slot)
                return NULL;

        /* Report the ID (tree index plus base) of the entry that was found. */
        *nextid = iter.index + base;
        return entry;
}
EXPORT_SYMBOL(idr_get_next_ul);

/**
 * idr_get_next() - Find next populated entry.
 * @idr: IDR handle.
 * @nextid: Pointer to an ID.
 *
 * Returns the next populated entry in the tree with an ID greater than
 * or equal to the value pointed to by @nextid.  On exit, @nextid is updated
 * to the ID of the found value.  To use in a loop, the value pointed to by
 * nextid must be incremented by the user.
 */
void *idr_get_next(struct idr *idr, int *nextid)
{
        unsigned long wide_id = *nextid;
        void *found;

        found = idr_get_next_ul(idr, &wide_id);

        /* Only IDs representable as an int may be handed back to the caller. */
        if (!WARN_ON_ONCE(wide_id > INT_MAX)) {
                *nextid = wide_id;
                return found;
        }

        return NULL;
}
EXPORT_SYMBOL(idr_get_next);

/**
 * idr_replace() - replace pointer for given ID.
 * @idr: IDR handle.
 * @ptr: New pointer to associate with the ID.
 * @id: ID to change.
 *
 * Replace the pointer registered with an ID and return the old value.
 * This function can be called under the RCU read lock concurrently with
 * idr_alloc() and idr_remove() (as long as the ID being removed is not
 * the one being replaced!).
 *
 * Returns: the old value on success.  %-ENOENT indicates that @id was not
 * found.  %-EINVAL indicates that @ptr was not valid.
 */
void *idr_replace(struct idr *idr, void *ptr, unsigned long id)
{
        /* IDs are stored in the tree relative to the IDR's base. */
        unsigned long index = id - idr->idr_base;
        struct radix_tree_node *node;
        void __rcu **slot = NULL;
        void *old;

        old = __radix_tree_lookup(&idr->idr_rt, index, &node, &slot);

        /* No slot at all, or a slot still tagged free: the ID is not in use. */
        if (!slot)
                return ERR_PTR(-ENOENT);
        if (radix_tree_tag_get(&idr->idr_rt, index, IDR_FREE))
                return ERR_PTR(-ENOENT);

        __radix_tree_replace(&idr->idr_rt, node, slot, ptr);

        return old;
}
EXPORT_SYMBOL(idr_replace);

/**
 * DOC: IDA description
 *
 * The IDA is an ID allocator which does not provide the ability to
 * associate an ID with a pointer.  As such, it only needs to store one
 * bit per ID, and so is more space efficient than an IDR.  To use an IDA,
 * define it using DEFINE_IDA() (or embed a &struct ida in a data structure,
 * then initialise it using ida_init()).  To allocate a new ID, call
 * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range().
 * To free an ID, call ida_free().
 *
 * ida_destroy() can be used to dispose of an IDA without needing to
 * free the individual IDs in it.  You can use ida_is_empty() to find
 * out whether the IDA has any IDs currently allocated.
 *
 * The IDA handles its own locking.  It is safe to call any of the IDA
 * functions without synchronisation in your code.
 *
 * IDs are currently limited to the range [0-INT_MAX].  If this is an awkward
 * limitation, it should be quite straightforward to raise the maximum.
 */

/*
 * Developer's notes:
 *
 * The IDA uses the functionality provided by the XArray to store bitmaps in
 * each entry.  The XA_FREE_MARK is only cleared when all bits in the bitmap
 * have been set.
 *
 * I considered telling the XArray that each slot is an order-10 node
 * and indexing by bit number, but the XArray can't allow a single multi-index
 * entry in the head, which would significantly increase memory consumption
 * for the IDA.  So instead we divide the index by the number of bits in the
 * leaf bitmap before doing a radix tree lookup.
 *
 * As an optimisation, if there are only a few low bits set in any given
 * leaf, instead of allocating a 128-byte bitmap, we store the bits
 * as a value entry.  Value entries never have the XA_FREE_MARK cleared
 * because we can always convert them into a bitmap entry.
 *
 * It would be possible to optimise further; once we've run out of a
 * single 128-byte bitmap, we currently switch to a 576-byte node, put
 * the 128-byte bitmap in the first entry and then start allocating extra
 * 128-byte entries.  We could instead use the 512 bytes of the node's
 * data as a bitmap before moving to that scheme.  I do not believe this
 * is a worthwhile optimisation; Rasmus Villemoes surveyed the current
 * users of the IDA and almost none of them use more than 1024 entries.
 * Those that do use more than the 8192 IDs that the 512 bytes would
 * provide.
 *
 * The IDA always uses a lock to alloc/free.  If we add a 'test_bit'
 * equivalent, it will still need locking.  Going to RCU lookup would require
 * using RCU to free bitmaps, and that's not trivial without embedding an
 * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte
 * bitmap, which is excessive.
 */

/**
 * ida_alloc_range() - Allocate an unused ID.
 * @ida: IDA handle.
 * @min: Lowest ID to allocate.
 * @max: Highest ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Allocate an ID between @min and @max, inclusive.  The allocated ID will
 * not exceed %INT_MAX, even if @max is larger.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 * Return: The allocated ID, or %-ENOMEM if memory could not be allocated,
 * or %-ENOSPC if there are no free IDs.
 */
int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max,
                        gfp_t gfp)
{
        /* The XArray is indexed by chunk: ID / IDA_BITMAP_BITS. */
        XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS);
        /* Bit position within the chunk at which the search starts. */
        unsigned bit = min % IDA_BITMAP_BITS;
        unsigned long flags;
        /* alloc holds a bitmap preallocated outside the lock (see the alloc: label). */
        struct ida_bitmap *bitmap, *alloc = NULL;

        /* A @min above INT_MAX can never be satisfied. */
        if ((int)min < 0)
                return -ENOSPC;

        /* Clamp @max so that no ID above INT_MAX is handed out. */
        if ((int)max < 0)
                max = INT_MAX;

retry:
        xas_lock_irqsave(&xas, flags);
next:
        /* Find the next chunk, up to the one containing @max, that is marked as having free bits. */
        bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK);
        /* Once past the chunk containing @min, search each chunk from its first bit. */
        if (xas.xa_index > min / IDA_BITMAP_BITS)
                bit = 0;
        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                goto nospc;

        if (xa_is_value(bitmap)) {
                /* The chunk's bits are packed into a value entry. */
                unsigned long tmp = xa_to_value(bitmap);

                if (bit < BITS_PER_XA_VALUE) {
                        bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit);
                        if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                                goto nospc;
                        if (bit < BITS_PER_XA_VALUE) {
                                /* Room in the value entry: set the bit and store it back. */
                                tmp |= 1UL << bit;
                                xas_store(&xas, xa_mk_value(tmp));
                                goto out;
                        }
                }
                /*
                 * The bit does not fit in a value entry: convert the chunk to
                 * a full bitmap, using the preallocated one if there is one,
                 * else a non-blocking allocation under the lock.
                 */
                bitmap = alloc;
                if (!bitmap)
                        bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                if (!bitmap)
                        goto alloc;
                /* Carry the bits of the old value entry over into the bitmap. */
                bitmap->bitmap[0] = tmp;
                xas_store(&xas, bitmap);
                if (xas_error(&xas)) {
                        /*
                         * The store failed; clear the copied bits again
                         * (presumably so the bitmap is clean if it is reused
                         * on a retry) and let the out: path deal with the error.
                         */
                        bitmap->bitmap[0] = 0;
                        goto out;
                }
        }

        if (bitmap) {
                /* A bitmap is in place for this chunk: claim its next free bit. */
                bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit);
                if (xas.xa_index * IDA_BITMAP_BITS + bit > max)
                        goto nospc;
                /* No free bit at or after the start position: move on to the next chunk. */
                if (bit == IDA_BITMAP_BITS)
                        goto next;

                __set_bit(bit, bitmap->bitmap);
                /* A completely full bitmap no longer carries the free mark. */
                if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } else {
                /* Nothing stored for this chunk yet: create a value entry or a bitmap. */
                if (bit < BITS_PER_XA_VALUE) {
                        bitmap = xa_mk_value(1UL << bit);
                } else {
                        bitmap = alloc;
                        if (!bitmap)
                                bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT);
                        if (!bitmap)
                                goto alloc;
                        __set_bit(bit, bitmap->bitmap);
                }
                xas_store(&xas, bitmap);
        }
out:
        xas_unlock_irqrestore(&xas, flags);
        /* If xas_nomem() reports that a retry is warranted, restart the search from @min. */
        if (xas_nomem(&xas, gfp)) {
                xas.xa_index = min / IDA_BITMAP_BITS;
                bit = min % IDA_BITMAP_BITS;
                goto retry;
        }
        /* Free the preallocated bitmap unless it is the one that was used. */
        if (bitmap != alloc)
                kfree(alloc);
        if (xas_error(&xas))
                return xas_error(&xas);
        /* Rebuild the ID from the chunk index and the bit within the chunk. */
        return xas.xa_index * IDA_BITMAP_BITS + bit;
alloc:
        /* Allocate a bitmap with the caller's gfp flags outside the lock, then start over. */
        xas_unlock_irqrestore(&xas, flags);
        alloc = kzalloc(sizeof(*bitmap), gfp);
        if (!alloc)
                return -ENOMEM;
        xas_set(&xas, min / IDA_BITMAP_BITS);
        bit = min % IDA_BITMAP_BITS;
        goto retry;
nospc:
        xas_unlock_irqrestore(&xas, flags);
        kfree(alloc);
        return -ENOSPC;
}
EXPORT_SYMBOL(ida_alloc_range);

/**
 * ida_find_first_range - Get the lowest used ID.
 * @ida: IDA handle.
 * @min: Lowest ID to get.
 * @max: Highest ID to get.
 *
 * Get the lowest used ID between @min and @max, inclusive.  The returned
 * ID will not exceed %INT_MAX, even if @max is larger.
 *
 * Context: Any context. Takes and releases the xa_lock.
 * Return: The lowest used ID, %-EINVAL if @min is larger than %INT_MAX,
 * or %-ENOENT if no used ID is found.
 */
int ida_find_first_range(struct ida *ida, unsigned int min, unsigned int max)
{
        /* The XArray is indexed by chunk: ID / IDA_BITMAP_BITS. */
        unsigned long index = min / IDA_BITMAP_BITS;
        /* Bit position within the chunk at which the search starts. */
        unsigned int offset = min % IDA_BITMAP_BITS;
        unsigned long *addr, size, bit;
        unsigned long tmp = 0;
        unsigned long flags;
        void *entry;
        int ret = -ENOENT;

        /* A @min above INT_MAX is rejected outright. */
        if ((int)min < 0)
                return -EINVAL;
        /* Clamp @max so that no ID above INT_MAX is reported. */
        if ((int)max < 0)
                max = INT_MAX;

        xa_lock_irqsave(&ida->xa, flags);

        /*
         * Walk the present chunks in ascending order.  The first chunk found
         * may only have bits set below @min (for example IDs 0 and 2000 are
         * allocated and @min is 1), so a miss in one chunk must not end the
         * search: carry on with the following chunk until @max is passed.
         */
        for (;;) {
                entry = xa_find(&ida->xa, &index, max / IDA_BITMAP_BITS,
                                XA_PRESENT);
                if (!entry)
                        break;

                /* Past the chunk containing @min, every bit is a candidate. */
                if (index > min / IDA_BITMAP_BITS)
                        offset = 0;
                if (index * IDA_BITMAP_BITS + offset > max)
                        break;

                if (xa_is_value(entry)) {
                        /* The chunk's bits are packed into a value entry. */
                        tmp = xa_to_value(entry);
                        addr = &tmp;
                        size = BITS_PER_XA_VALUE;
                } else {
                        addr = ((struct ida_bitmap *)entry)->bitmap;
                        size = IDA_BITMAP_BITS;
                }

                bit = find_next_bit(addr, size, offset);
                if (bit < size) {
                        /* Lowest used ID at or above @min; it must not exceed @max. */
                        if (index * IDA_BITMAP_BITS + bit <= max)
                                ret = index * IDA_BITMAP_BITS + bit;
                        break;
                }

                /* Nothing at or above @offset here; try the next chunk, if any. */
                if (index >= max / IDA_BITMAP_BITS)
                        break;
                index++;
        }

        xa_unlock_irqrestore(&ida->xa, flags);
        return ret;
}
EXPORT_SYMBOL(ida_find_first_range);

/**
 * ida_free() - Release an allocated ID.
 * @ida: IDA handle.
 * @id: Previously allocated ID.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_free(struct ida *ida, unsigned int id)
{
        /* The XArray is indexed by chunk; 'bit' is the position inside it. */
        XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS);
        unsigned bit = id % IDA_BITMAP_BITS;
        struct ida_bitmap *bitmap;
        unsigned long flags;

        /* IDs above INT_MAX are never allocated; silently ignore them. */
        if ((int)id < 0)
                return;

        xas_lock_irqsave(&xas, flags);
        bitmap = xas_load(&xas);

        if (xa_is_value(bitmap)) {
                /* The chunk's bits are packed into a value entry. */
                unsigned long bits = xa_to_value(bitmap);
                unsigned long mask;

                if (bit >= BITS_PER_XA_VALUE)
                        goto err;
                mask = 1UL << bit;
                if (!(bits & mask))
                        goto err;

                bits &= ~mask;
                /* Store the reduced value, or erase the entry once it is empty. */
                if (bits)
                        xas_store(&xas, xa_mk_value(bits));
                else
                        xas_store(&xas, NULL);
        } else {
                if (!bitmap)
                        goto err;
                if (!test_bit(bit, bitmap->bitmap))
                        goto err;

                __clear_bit(bit, bitmap->bitmap);
                /* The chunk has at least one free bit again. */
                xas_set_mark(&xas, XA_FREE_MARK);
                if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) {
                        /* Last ID of the chunk released: free and erase it. */
                        kfree(bitmap);
                        xas_store(&xas, NULL);
                }
        }

        xas_unlock_irqrestore(&xas, flags);
        return;

 err:
        xas_unlock_irqrestore(&xas, flags);
        WARN(1, "ida_free called for id=%d which is not allocated.\n", id);
}
EXPORT_SYMBOL(ida_free);

/**
 * ida_destroy() - Free all IDs.
 * @ida: IDA handle.
 *
 * Calling this function frees all IDs and releases all resources used
 * by an IDA.  When this call returns, the IDA is empty and can be reused
 * or freed.  If the IDA is already empty, there is no need to call this
 * function.
 *
 * Context: Any context. It is safe to call this function without
 * locking in your code.
 */
void ida_destroy(struct ida *ida)
{
        XA_STATE(xas, &ida->xa, 0);
        struct ida_bitmap *chunk;
        unsigned long flags;

        xas_lock_irqsave(&xas, flags);

        /* Visit every stored chunk and erase it from the array. */
        xas_for_each(&xas, chunk, ULONG_MAX) {
                /* Only real bitmaps own memory; value entries do not. */
                if (!xa_is_value(chunk))
                        kfree(chunk);
                xas_store(&xas, NULL);
        }

        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(ida_destroy);

#ifndef __KERNEL__
/* Defined outside this file; called by ida_dump_entry() before each dumped entry. */
extern void xa_dump_index(unsigned long index, unsigned int shift);
/* log2 of the number of IDs covered by one IDA bitmap chunk. */
#define IDA_CHUNK_SHIFT                ilog2(IDA_BITMAP_BITS)

/* Recursively print one IDA entry: an interior node, a value entry or a bitmap. */
static void ida_dump_entry(void *entry, unsigned long index)
{
        struct ida_bitmap *bitmap;
        unsigned long i;

        if (!entry)
                return;

        /* Interior node: print it, then descend into each of its slots. */
        if (xa_is_node(entry)) {
                struct xa_node *node = xa_to_node(entry);
                unsigned int shift = node->shift + IDA_CHUNK_SHIFT +
                        XA_CHUNK_SHIFT;

                xa_dump_index(index * IDA_BITMAP_BITS, shift);
                xa_dump_node(node);
                for (i = 0; i < XA_CHUNK_SIZE; i++)
                        ida_dump_entry(node->slots[i],
                                        index | (i << node->shift));
                return;
        }

        /* Value entry: the bits are packed into the entry itself. */
        if (xa_is_value(entry)) {
                xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG));
                pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry);
                return;
        }

        /* Anything else is a pointer to a full bitmap: print every word. */
        bitmap = entry;
        xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT);
        pr_cont("bitmap: %p data", bitmap);
        for (i = 0; i < IDA_BITMAP_LONGS; i++)
                pr_cont(" %lx", bitmap->bitmap[i]);
        pr_cont("\n");
}

/* Print a one-line summary of the IDA, then dump its tree from the root. */
static void ida_dump(struct ida *ida)
{
        pr_debug("ida: %p node %p free %d\n", ida, ida->xa.xa_head,
                                ida->xa.xa_flags >> ROOT_TAG_SHIFT);
        ida_dump_entry(ida->xa.xa_head, 0);
}
#endif





























































































































































































































































































































































































































































































































































































































    3 



    3 

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include <linux/skbuff.h>
#include <linux/if_arp.h>
#include <linux/netdevice.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <net/udp_tunnel.h>
#include <net/sch_generic.h>
#include <linux/netfilter.h>
#include <rdma/ib_addr.h>

#include "rxe.h"
#include "rxe_net.h"
#include "rxe_loc.h"

static struct rxe_recv_sockets recv_sockets;

/*
 * Look up an IPv4 output route in init_net for a UDP flow from @saddr to
 * @daddr that leaves through @ndev.
 *
 * Returns the dst_entry embedded in the route on success, or NULL (after
 * a debug message tagged with @qp) when ip_route_output_key() fails.
 */
static struct dst_entry *rxe_find_route4(struct rxe_qp *qp,
                                         struct net_device *ndev,
                                         struct in_addr *saddr,
                                         struct in_addr *daddr)
{
        struct flowi4 fl;
        struct rtable *rt;

        /* start from an all-zero flow key, then fill in what we know */
        memset(&fl, 0, sizeof(fl));
        fl.flowi4_proto = IPPROTO_UDP;
        fl.flowi4_oif = ndev->ifindex;
        memcpy(&fl.saddr, saddr, sizeof(*saddr));
        memcpy(&fl.daddr, daddr, sizeof(*daddr));

        rt = ip_route_output_key(&init_net, &fl);
        if (!IS_ERR(rt))
                return &rt->dst;

        rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr);
        return NULL;
}

#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
                                         struct net_device *ndev,
                                         struct in6_addr *saddr,
                                         struct in6_addr *daddr)
{
        struct dst_entry *ndst;
        struct flowi6 fl6 = { { 0 } };

        memset(&fl6, 0, sizeof(fl6));
        fl6.flowi6_oif = ndev->ifindex;
        memcpy(&fl6.saddr, saddr, sizeof(*saddr));
        memcpy(&fl6.daddr, daddr, sizeof(*daddr));
        fl6.flowi6_proto = IPPROTO_UDP;

        ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk),
                                               recv_sockets.sk6->sk, &fl6,
                                               NULL);
        if (IS_ERR(ndst)) {
                rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
                return NULL;
        }

        if (unlikely(ndst->error)) {
                rxe_dbg_qp(qp, "no route to %pI6\n", daddr);
                goto put;
        }

        return ndst;
put:
        dst_release(ndst);
        return NULL;
}

#else

/*
 * Stub used when CONFIG_IPV6 is not enabled: there is no IPv6 routing,
 * so every lookup reports "no route" by returning NULL.
 */
static struct dst_entry *rxe_find_route6(struct rxe_qp *qp,
                                         struct net_device *ndev,
                                         struct in6_addr *saddr,
                                         struct in6_addr *daddr)
{
        return NULL;
}

#endif

/*
 * Find the route to use for a packet sent on @qp with address vector @av.
 *
 * For RC QPs the dst cached on the QP's socket is tried first and reused
 * when dst_check() still accepts it. Otherwise a fresh IPv4 or IPv6
 * lookup is done according to av->network_type and, for RC QPs, the
 * result is stored back on the socket with an extra reference.
 *
 * Returns a dst_entry that the caller releases with dst_release(), or
 * NULL when no route was found.
 */
static struct dst_entry *rxe_find_route(struct net_device *ndev,
                                        struct rxe_qp *qp,
                                        struct rxe_av *av)
{
        struct dst_entry *dst = NULL;

        if (qp_type(qp) == IB_QPT_RC)
                dst = sk_dst_get(qp->sk->sk);

        if (!dst || !dst_check(dst, qp->dst_cookie)) {
                if (dst) {
                        dst_release(dst);
                        /*
                         * Forget the released entry so it can never be
                         * re-held, cached or returned below if neither
                         * network type branch assigns a new one.
                         */
                        dst = NULL;
                }

                if (av->network_type == RXE_NETWORK_TYPE_IPV4) {
                        struct in_addr *saddr;
                        struct in_addr *daddr;

                        saddr = &av->sgid_addr._sockaddr_in.sin_addr;
                        daddr = &av->dgid_addr._sockaddr_in.sin_addr;
                        dst = rxe_find_route4(qp, ndev, saddr, daddr);
                } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) {
                        struct in6_addr *saddr6;
                        struct in6_addr *daddr6;

                        saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr;
                        daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr;
                        dst = rxe_find_route6(qp, ndev, saddr6, daddr6);
#if IS_ENABLED(CONFIG_IPV6)
                        /* remember the cookie used by the next dst_check() */
                        if (dst)
                                qp->dst_cookie =
                                        rt6_get_cookie((struct rt6_info *)dst);
#endif
                }

                /* one reference for the socket cache, one for the caller */
                if (dst && (qp_type(qp) == IB_QPT_RC)) {
                        dst_hold(dst);
                        sk_dst_set(qp->sk->sk, dst);
                }
        }
        return dst;
}

/*
 * encap_rcv callback of the RoCEv2 UDP tunnel sockets.
 *
 * Resolves the rxe device behind the receiving net_device (falling back
 * to the real device of a VLAN), linearizes the skb, fills in the
 * per-packet info stored in the skb, strips the UDP header and passes the
 * packet to rxe_rcv(). Packets that cannot be handled are freed.
 *
 * Always returns 0.
 */
static int rxe_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
        struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
        struct net_device *ndev = skb->dev;
        struct udphdr *udph;
        struct rxe_dev *rxe;

        /* takes a reference on rxe->ib_dev
         * drop when skb is freed
         */
        rxe = rxe_get_dev_from_net(ndev);
        if (!rxe && is_vlan_dev(ndev))
                rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
        if (!rxe) {
                kfree_skb(skb);
                return 0;
        }

        if (skb_linearize(skb)) {
                /* not handed to rxe_rcv(), so give the device back here */
                ib_device_put(&rxe->ib_dev);
                kfree_skb(skb);
                return 0;
        }

        udph = udp_hdr(skb);

        pkt->rxe = rxe;
        pkt->port_num = 1;
        pkt->mask = RXE_GRH_MASK;
        pkt->hdr = (u8 *)(udph + 1);
        pkt->paylen = be16_to_cpu(udph->len) - sizeof(*udph);

        /* remove udp header */
        skb_pull(skb, sizeof(struct udphdr));

        rxe_rcv(skb);

        return 0;
}

/*
 * Create a kernel UDP socket in @net bound to @port and configure it as
 * a tunnel socket whose encapsulated packets go to rxe_udp_encap_recv().
 * With @ipv6 set the socket is AF_INET6 and v6-only, otherwise AF_INET.
 *
 * Returns the socket, or an ERR_PTR() carrying the udp_sock_create()
 * error.
 */
static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
                                           bool ipv6)
{
        struct udp_tunnel_sock_cfg tnl_cfg = {
                .encap_type = 1,
                .encap_rcv = rxe_udp_encap_recv,
        };
        struct udp_port_cfg udp_cfg = {
                .family = ipv6 ? AF_INET6 : AF_INET,
                .local_udp_port = port,
        };
        struct socket *sock;
        int err;

        if (ipv6)
                udp_cfg.ipv6_v6only = 1;

        /* Create UDP socket */
        err = udp_sock_create(net, &udp_cfg, &sock);
        if (err < 0)
                return ERR_PTR(err);

        /* Setup UDP tunnel */
        setup_udp_tunnel_sock(net, sock, &tnl_cfg);

        return sock;
}

/*
 * Release a tunnel socket created by rxe_setup_udp_tunnel().
 * A NULL @sk is accepted and ignored.
 */
static void rxe_release_udp_tunnel(struct socket *sk)
{
        if (sk)
                udp_tunnel_sock_release(sk);
}

/*
 * Push a UDP header in front of the current skb data and mark it as the
 * transport header. The length field covers the header plus everything
 * already in the skb; the checksum is left as zero.
 */
static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
                            __be16 dst_port)
{
        struct udphdr *hdr;

        __skb_push(skb, sizeof(*hdr));
        skb_reset_transport_header(skb);

        hdr = udp_hdr(skb);
        hdr->source = src_port;
        hdr->dest = dst_port;
        hdr->check = 0;
        /* skb->len already includes the header pushed above */
        hdr->len = htons(skb->len);
}

/*
 * Scrub the skb, attach a reference to @dst and push an IPv4 header
 * (without options) in front of the current data, marking it as the
 * network header. The total length covers the header plus everything
 * already in the skb, and an IP ident is selected last.
 */
static void prepare_ipv4_hdr(struct dst_entry *dst, struct sk_buff *skb,
                             __be32 saddr, __be32 daddr, __u8 proto,
                             __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
        struct iphdr *hdr;

        skb_scrub_packet(skb, xnet);

        skb_clear_hash(skb);
        /* the skb gets its own reference; the caller keeps its one */
        skb_dst_set(skb, dst_clone(dst));
        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));

        skb_push(skb, sizeof(struct iphdr));
        skb_reset_network_header(skb);

        hdr = ip_hdr(skb);

        hdr->version = IPVERSION;
        hdr->ihl = sizeof(struct iphdr) >> 2;
        hdr->tos = tos;
        hdr->tot_len = htons(skb->len);
        hdr->frag_off = df;
        hdr->ttl = ttl;
        hdr->protocol = proto;
        hdr->saddr = saddr;
        hdr->daddr = daddr;

        __ip_select_ident(dev_net(dst->dev), hdr,
                          skb_shinfo(skb)->gso_segs ?: 1);
}

/*
 * Clear the relevant control-block state, attach a reference to @dst and
 * push an IPv6 header in front of the current skb data, marking it as
 * the network header.
 *
 * @prio is written as the traffic class with a zero flow label, and @ttl
 * as the hop limit. payload_len excludes the IPv6 header itself.
 */
static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
                             struct in6_addr *saddr, struct in6_addr *daddr,
                             __u8 proto, __u8 prio, __u8 ttl)
{
        struct ipv6hdr *ip6h;

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
                            | IPSKB_REROUTED);
        /* the skb gets its own reference; the caller keeps its one */
        skb_dst_set(skb, dst_clone(dst));

        __skb_push(skb, sizeof(*ip6h));
        skb_reset_network_header(skb);
        ip6h                  = ipv6_hdr(skb);
        ip6_flow_hdr(ip6h, prio, htonl(0));
        ip6h->nexthdr     = proto;
        ip6h->hop_limit   = ttl;
        ip6h->daddr          = *daddr;
        ip6h->saddr          = *saddr;
        /* skb->len now includes the header just pushed; subtract it */
        ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
}

/*
 * Build the UDP and IPv4 headers of an outgoing packet in @skb.
 *
 * The route comes from rxe_find_route(); addresses, traffic class and
 * hop limit come from @av, the source port from the packet's QP, and the
 * destination port is ROCE_V2_UDP_DPORT. The DF bit is always set.
 *
 * Returns 0 on success or -EHOSTUNREACH when no route was found.
 */
static int prepare4(struct rxe_av *av, struct rxe_pkt_info *pkt,
                    struct sk_buff *skb)
{
        struct rxe_qp *qp = pkt->qp;
        struct dst_entry *route;

        route = rxe_find_route(skb->dev, qp, av);
        if (!route) {
                rxe_dbg_qp(qp, "Host not reachable\n");
                return -EHOSTUNREACH;
        }

        prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
                        cpu_to_be16(ROCE_V2_UDP_DPORT));

        prepare_ipv4_hdr(route, skb,
                         av->sgid_addr._sockaddr_in.sin_addr.s_addr,
                         av->dgid_addr._sockaddr_in.sin_addr.s_addr,
                         IPPROTO_UDP, av->grh.traffic_class,
                         av->grh.hop_limit, htons(IP_DF), false);

        /* the skb holds its own reference now; drop the lookup's one */
        dst_release(route);
        return 0;
}

/*
 * Build the UDP and IPv6 headers of an outgoing packet in @skb.
 *
 * The route comes from rxe_find_route(); addresses, traffic class and
 * hop limit come from @av, the source port from the packet's QP, and the
 * destination port is ROCE_V2_UDP_DPORT.
 *
 * Returns 0 on success or -EHOSTUNREACH when no route was found.
 */
static int prepare6(struct rxe_av *av, struct rxe_pkt_info *pkt,
                    struct sk_buff *skb)
{
        struct rxe_qp *qp = pkt->qp;
        struct dst_entry *route;

        route = rxe_find_route(skb->dev, qp, av);
        if (!route) {
                rxe_dbg_qp(qp, "Host not reachable\n");
                return -EHOSTUNREACH;
        }

        prepare_udp_hdr(skb, cpu_to_be16(qp->src_port),
                        cpu_to_be16(ROCE_V2_UDP_DPORT));

        prepare_ipv6_hdr(route, skb,
                         &av->sgid_addr._sockaddr_in6.sin6_addr,
                         &av->dgid_addr._sockaddr_in6.sin6_addr,
                         IPPROTO_UDP, av->grh.traffic_class,
                         av->grh.hop_limit);

        /* the skb holds its own reference now; drop the lookup's one */
        dst_release(route);
        return 0;
}

/*
 * Add the network and UDP headers matching skb->protocol to @skb and
 * flag the packet for loopback when the destination MAC in @av equals
 * the sending device's own address.
 *
 * Returns the result of prepare4()/prepare6(), or 0 when skb->protocol
 * is neither IPv4 nor IPv6. The loopback check runs in every case.
 */
int rxe_prepare(struct rxe_av *av, struct rxe_pkt_info *pkt,
                struct sk_buff *skb)
{
        int ret;

        if (skb->protocol == htons(ETH_P_IP))
                ret = prepare4(av, pkt, skb);
        else if (skb->protocol == htons(ETH_P_IPV6))
                ret = prepare6(av, pkt, skb);
        else
                ret = 0;

        if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
                pkt->mask |= RXE_LOOPBACK_MASK;

        return ret;
}

/*
 * skb destructor installed by rxe_send() and rxe_loopback().
 *
 * Looks up the QP whose index is stored in the socket's sk_user_data,
 * decrements its count of in-flight skbs and reschedules the send task
 * when the QP is waiting for skbs and the count fell below
 * RXE_INFLIGHT_SKBS_PER_QP_LOW.
 *
 * Every exit path drops the socket reference taken with sock_hold() when
 * the destructor was installed, and the device reference taken by
 * rxe_get_dev_from_net() whenever that lookup succeeded.
 */
static void rxe_skb_tx_dtor(struct sk_buff *skb)
{
        struct net_device *ndev = skb->dev;
        struct rxe_dev *rxe;
        unsigned int qp_index;
        struct rxe_qp *qp;
        int skb_out;

        rxe = rxe_get_dev_from_net(ndev);
        if (!rxe && is_vlan_dev(ndev))
                rxe = rxe_get_dev_from_net(vlan_dev_real_dev(ndev));
        if (WARN_ON(!rxe))
                goto put_sock;

        qp_index = (int)(uintptr_t)skb->sk->sk_user_data;
        if (!qp_index)
                goto put_dev;

        qp = rxe_pool_get_index(&rxe->qp_pool, qp_index);
        if (!qp)
                goto put_dev;

        skb_out = atomic_dec_return(&qp->skb_out);
        if (qp->need_req_skb && skb_out < RXE_INFLIGHT_SKBS_PER_QP_LOW)
                rxe_sched_task(&qp->send_task);

        rxe_put(qp);
put_dev:
        ib_device_put(&rxe->ib_dev);
put_sock:
        sock_put(skb->sk);
}

/*
 * Hand a fully built packet to the IP stack.
 *
 * The skb is tied to the QP's socket (with a held reference) and gets
 * rxe_skb_tx_dtor() as destructor; the QP's in-flight skb count is
 * incremented before the packet is passed on.
 *
 * Returns the result of ip_local_out() or ip6_local_out().
 */
static int rxe_send(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
        struct sock *sk = pkt->qp->sk->sk;
        struct net *net;

        sock_hold(sk);
        skb->sk = sk;
        skb->destructor = rxe_skb_tx_dtor;
        atomic_inc(&pkt->qp->skb_out);

        net = dev_net(skb_dst(skb)->dev);

        if (skb->protocol == htons(ETH_P_IP))
                return ip_local_out(net, skb->sk, skb);

        return ip6_local_out(net, skb->sk, skb);
}

/* fix up a send packet to match the packets
 * received from UDP before looping them back
 */
static int rxe_loopback(struct sk_buff *skb, struct rxe_pkt_info *pkt)
{
        struct sock *sk = pkt->qp->sk->sk;

        memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));

        sock_hold(sk);
        skb->sk = sk;
        skb->destructor = rxe_skb_tx_dtor;
        atomic_inc(&pkt->qp->skb_out);

        if (skb->protocol == htons(ETH_P_IP))
                skb_pull(skb, sizeof(struct iphdr));
        else
                skb_pull(skb, sizeof(struct ipv6hdr));

        if (WARN_ON(!ib_device_try_get(&pkt->rxe->ib_dev))) {
                kfree_skb(skb);
                return -EIO;
        }

        /* remove udp header */
        skb_pull(skb, sizeof(struct udphdr));

        rxe_rcv(skb);

        return 0;
}

int rxe_xmit_packet(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
                    struct sk_buff *skb)
{
        int err;
        int is_request = pkt->mask & RXE_REQ_MASK;
        struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
        unsigned long flags;

        spin_lock_irqsave(&qp->state_lock, flags);
        if ((is_request && (qp_state(qp) < IB_QPS_RTS)) ||
            (!is_request && (qp_state(qp) < IB_QPS_RTR))) {
                spin_unlock_irqrestore(&qp->state_lock, flags);
                rxe_dbg_qp(qp, "Packet dropped. QP is not in ready state\n");
                goto drop;
        }
        spin_unlock_irqrestore(&qp->state_lock, flags);

        rxe_icrc_generate(skb, pkt);

        if (pkt->mask & RXE_LOOPBACK_MASK)
                err = rxe_loopback(skb, pkt);
        else
                err = rxe_send(skb, pkt);
        if (err) {
                rxe_counter_inc(rxe, RXE_CNT_SEND_ERR);
                return err;
        }

        rxe_counter_inc(rxe, RXE_CNT_SENT_PKTS);
        goto done;

drop:
        kfree_skb(skb);
        err = 0;
done:
        return err;
}

/*
 * Allocate and initialize an skb for an outgoing packet of @paylen bytes.
 *
 * Headroom is reserved for the Ethernet, IP (v4 or v6 according to
 * av->network_type) and UDP headers plus the device's link-layer
 * reserve. The net_device comes from the source GID attribute selected
 * by av->grh.sgid_index. On success @pkt is filled in and pkt->hdr
 * points at the @paylen bytes put into the skb.
 *
 * Returns the skb, or NULL when the GID attribute, its net_device or the
 * skb allocation is unavailable.
 */
struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
                                int paylen, struct rxe_pkt_info *pkt)
{
        const int port_num = 1;
        const struct ib_gid_attr *attr;
        struct sk_buff *skb = NULL;
        struct net_device *ndev;
        unsigned int hdr_len;

        attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index);
        if (IS_ERR(attr))
                return NULL;

        hdr_len = ETH_HLEN + sizeof(struct udphdr);
        if (av->network_type == RXE_NETWORK_TYPE_IPV4)
                hdr_len += sizeof(struct iphdr);
        else
                hdr_len += sizeof(struct ipv6hdr);

        rcu_read_lock();
        ndev = rdma_read_gid_attr_ndev_rcu(attr);
        if (IS_ERR(ndev)) {
                rcu_read_unlock();
                goto out;
        }

        skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev),
                        GFP_ATOMIC);
        if (unlikely(!skb)) {
                rcu_read_unlock();
                goto out;
        }

        skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));

        /* FIXME: hold reference to this netdev until life of this skb. */
        skb->dev = ndev;
        rcu_read_unlock();

        skb->protocol = (av->network_type == RXE_NETWORK_TYPE_IPV4) ?
                        htons(ETH_P_IP) : htons(ETH_P_IPV6);

        pkt->rxe = rxe;
        pkt->port_num = port_num;
        pkt->hdr = skb_put(skb, paylen);
        pkt->mask |= RXE_GRH_MASK;

out:
        /* the GID attribute is only needed during setup */
        rdma_put_gid_attr(attr);
        return skb;
}

/*
 * Return the name of the net_device underlying @rxe, or NULL when
 * rxe_ib_device_get_netdev() yields no device. @port_num is not used.
 *
 * This is required by rxe_cfg to match rxe devices in
 * /sys/class/infiniband up with their underlying ethernet devices.
 *
 * NOTE(review): the returned pointer refers to ndev->name, but the
 * reference on ndev is dropped with dev_put() before returning; confirm
 * that callers cannot use the name after the net_device has gone away.
 */
const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num)
{
        struct net_device *ndev;
        char *ndev_name;

        ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);
        if (!ndev)
                return NULL;
        ndev_name = ndev->name;
        dev_put(ndev);

        return ndev_name;
}

/*
 * Allocate an rxe device named @ibdev_name on top of @ndev and add it
 * with rxe_add(), using the net_device's current MTU.
 *
 * Returns 0 on success, -ENOMEM when the device cannot be allocated, or
 * the rxe_add() error (the device is deallocated again in that case).
 */
int rxe_net_add(const char *ibdev_name, struct net_device *ndev)
{
        struct rxe_dev *rxe;
        int err;

        rxe = ib_alloc_device(rxe_dev, ib_dev);
        if (!rxe)
                return -ENOMEM;

        ib_mark_name_assigned_by_user(&rxe->ib_dev);

        err = rxe_add(rxe, ndev->mtu, ibdev_name, ndev);
        if (err)
                ib_dealloc_device(&rxe->ib_dev);

        return err;
}

static void rxe_port_event(struct rxe_dev *rxe,
                           enum ib_event_type event)
{
        struct ib_event ev;

        ev.device = &rxe->ib_dev;
        ev.element.port_num = 1;
        ev.event = event;

        ib_dispatch_event(&ev);
}

/*
 * Dispatch IB_EVENT_PORT_ACTIVE for @rxe and log the transition.
 * Caller must hold net_info_lock.
 */
void rxe_port_up(struct rxe_dev *rxe)
{
        rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE);
        dev_info(&rxe->ib_dev.dev, "set active\n");
}

/*
 * Dispatch IB_EVENT_PORT_ERR for @rxe, count the link-down event and
 * log the transition.
 * Caller must hold net_info_lock.
 */
void rxe_port_down(struct rxe_dev *rxe)
{
        rxe_port_event(rxe, IB_EVENT_PORT_ERR);
        rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
        dev_info(&rxe->ib_dev.dev, "set down\n");
}

/*
 * Report the current state of the underlying net_device as a port
 * event: up when ib_get_curr_port_state() says IB_PORT_ACTIVE, down
 * otherwise. Does nothing when @rxe has no net_device.
 */
void rxe_set_port_state(struct rxe_dev *rxe)
{
        struct net_device *ndev = rxe_ib_device_get_netdev(&rxe->ib_dev);

        if (!ndev)
                return;

        if (ib_get_curr_port_state(ndev) != IB_PORT_ACTIVE)
                rxe_port_down(rxe);
        else
                rxe_port_up(rxe);

        /* drop the reference obtained with the lookup above */
        dev_put(ndev);
}

/*
 * Netdevice notifier callback.
 *
 * Events for net_devices without an rxe device are ignored. Otherwise:
 * NETDEV_UNREGISTER queues unregistration of the ib device,
 * NETDEV_CHANGEMTU propagates the new MTU, NETDEV_DOWN/NETDEV_CHANGE
 * count a link-down when the port state is IB_PORT_DOWN, and every
 * other event is only logged.
 *
 * Always returns NOTIFY_OK.
 */
static int rxe_notify(struct notifier_block *not_blk,
                      unsigned long event,
                      void *arg)
{
        struct net_device *ndev = netdev_notifier_info_to_dev(arg);
        struct rxe_dev *rxe;

        rxe = rxe_get_dev_from_net(ndev);
        if (!rxe)
                return NOTIFY_OK;

        if (event == NETDEV_UNREGISTER) {
                ib_unregister_device_queued(&rxe->ib_dev);
        } else if (event == NETDEV_CHANGEMTU) {
                rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu);
                rxe_set_mtu(rxe, ndev->mtu);
        } else if (event == NETDEV_DOWN || event == NETDEV_CHANGE) {
                if (ib_get_curr_port_state(ndev) == IB_PORT_DOWN)
                        rxe_counter_inc(rxe, RXE_CNT_LINK_DOWNED);
        } else {
                /* REBOOT, GOING_DOWN, CHANGEADDR, CHANGENAME, FEAT_CHANGE, ... */
                rxe_dbg_dev(rxe, "ignoring netdev event = %ld for %s\n",
                        event, ndev->name);
        }

        /* balance the reference taken by rxe_get_dev_from_net() */
        ib_device_put(&rxe->ib_dev);
        return NOTIFY_OK;
}

/* Netdevice notifier registered by rxe_net_init(); events go to rxe_notify(). */
static struct notifier_block rxe_net_notifier = {
        .notifier_call = rxe_notify,
};

/*
 * Create the IPv4 RoCEv2 UDP tunnel socket in init_net and store it in
 * recv_sockets.sk4.
 *
 * Returns 0 on success. On failure recv_sockets.sk4 is left NULL and the
 * negative errno reported by rxe_setup_udp_tunnel() is returned.
 */
static int rxe_net_ipv4_init(void)
{
        struct socket *sock;

        sock = rxe_setup_udp_tunnel(&init_net,
                                    htons(ROCE_V2_UDP_DPORT), false);
        if (IS_ERR(sock)) {
                recv_sockets.sk4 = NULL;
                pr_err("Failed to create IPv4 UDP tunnel\n");
                /* hand the real cause to the caller instead of a bare -1 */
                return PTR_ERR(sock);
        }

        recv_sockets.sk4 = sock;
        return 0;
}

/*
 * Create the IPv6 RoCEv2 UDP tunnel socket in init_net and store it in
 * recv_sockets.sk6. Compiles to a no-op returning 0 without CONFIG_IPV6.
 *
 * A missing IPv6 stack (-EAFNOSUPPORT) is not treated as an error: a
 * warning is printed, recv_sockets.sk6 stays NULL and 0 is returned. Any
 * other failure leaves recv_sockets.sk6 NULL and returns the negative
 * errno reported by rxe_setup_udp_tunnel().
 */
static int rxe_net_ipv6_init(void)
{
#if IS_ENABLED(CONFIG_IPV6)
        struct socket *sock;

        sock = rxe_setup_udp_tunnel(&init_net,
                                    htons(ROCE_V2_UDP_DPORT), true);
        if (IS_ERR(sock)) {
                recv_sockets.sk6 = NULL;

                if (PTR_ERR(sock) == -EAFNOSUPPORT) {
                        pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n");
                        return 0;
                }

                pr_err("Failed to create IPv6 UDP tunnel\n");
                /* hand the real cause to the caller instead of a bare -1 */
                return PTR_ERR(sock);
        }

        recv_sockets.sk6 = sock;
#endif
        return 0;
}

/*
 * Tear down what rxe_net_init() set up: release both tunnel sockets
 * (NULL ones are skipped by rxe_release_udp_tunnel()) and unregister the
 * netdevice notifier.
 */
void rxe_net_exit(void)
{
        rxe_release_udp_tunnel(recv_sockets.sk6);
        rxe_release_udp_tunnel(recv_sockets.sk4);
        unregister_netdevice_notifier(&rxe_net_notifier);
}

/*
 * Set up the network side of rxe: the IPv4 tunnel socket, the IPv6
 * tunnel socket and the netdevice notifier, in that order.
 *
 * Returns 0 on success. A failure of the IPv4 step is returned directly;
 * a failure of a later step runs rxe_net_exit() before returning the
 * error.
 */
int rxe_net_init(void)
{
        int err;

        recv_sockets.sk6 = NULL;

        err = rxe_net_ipv4_init();
        if (err)
                return err;

        err = rxe_net_ipv6_init();
        if (!err) {
                err = register_netdevice_notifier(&rxe_net_notifier);
                if (!err)
                        return 0;

                pr_err("Failed to register netdev notifier\n");
        }

        /* undo whatever was set up before the failing step */
        rxe_net_exit();
        return err;
}







































































































































































  806 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux security data structures for kernel objects.
 *
 *  Author(s):  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *                Chris Vance, <cvance@nai.com>
 *                Wayne Salamon, <wsalamon@nai.com>
 *                James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *  Copyright (C) 2016 Mellanox Technologies
 */

#ifndef _SELINUX_OBJSEC_H_
#define _SELINUX_OBJSEC_H_

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/binfmts.h>
#include <linux/in.h>
#include <linux/spinlock.h>
#include <linux/lsm_hooks.h>
#include <linux/msg.h>
#include <net/net_namespace.h>
#include "flask.h"
#include "avc.h"

/* SELinux state kept in a credential's security blob (see selinux_cred()). */
struct task_security_struct {
        u32 osid; /* SID prior to last execve */
        u32 sid; /* current SID */
        u32 exec_sid; /* exec SID */
        u32 create_sid; /* fscreate SID */
        u32 keycreate_sid; /* keycreate SID */
        u32 sockcreate_sid; /* sockcreate SID */
} __randomize_layout;

enum label_initialized {
        LABEL_INVALID, /* invalid or not initialized */
        LABEL_INITIALIZED, /* initialized */
        LABEL_PENDING
};

/* SELinux state kept in an inode's security blob (see selinux_inode()). */
struct inode_security_struct {
        struct inode *inode; /* back pointer to inode object */
        struct list_head list; /* list of inode_security_struct */
        u32 task_sid; /* SID of creating task */
        u32 sid; /* SID of this object */
        u16 sclass; /* security class of this object */
        unsigned char initialized; /* initialization flag */
        spinlock_t lock; /* NOTE(review): protected fields are not stated here - confirm */
};

/* SELinux state kept in a file's security blob (see selinux_file()). */
struct file_security_struct {
        u32 sid; /* SID of open file description */
        u32 fown_sid; /* SID of file owner (for SIGIO) */
        u32 isid; /* SID of inode at the time of file open */
        u32 pseqno; /* Policy seqno at the time of file open */
};

/* SELinux state kept in a superblock's security blob (see selinux_superblock()). */
struct superblock_security_struct {
        u32 sid; /* SID of file system superblock */
        u32 def_sid; /* default SID for labeling */
        u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */
        unsigned short behavior; /* labeling behavior */
        unsigned short flags; /* which mount options were specified */
        struct mutex lock;
        struct list_head isec_head; /* presumably heads the inode_security_struct list - confirm */
        spinlock_t isec_lock; /* presumably guards isec_head - confirm */
};

/* SELinux state kept in a message's security blob (see selinux_msg_msg()). */
struct msg_security_struct {
        u32 sid; /* SID of message */
};

/* SELinux state kept in an IPC object's security blob (see selinux_ipc()). */
struct ipc_security_struct {
        u16 sclass; /* security class of this object */
        u32 sid; /* SID of IPC resource */
};

/* SID of a network interface, identified by namespace and device index. */
struct netif_security_struct {
        struct net *ns; /* network namespace */
        int ifindex; /* device index */
        u32 sid; /* SID for this interface */
};

/* SID of a network node, identified by its IPv4 or IPv6 address. */
struct netnode_security_struct {
        union {
                __be32 ipv4; /* IPv4 node address */
                struct in6_addr ipv6; /* IPv6 node address */
        } addr;
        u32 sid; /* SID for this node */
        u16 family; /* address family */
};

/* SID of a network port, identified by port number and transport protocol. */
struct netport_security_struct {
        u32 sid; /* SID for this port */
        u16 port; /* port number */
        u8 protocol; /* transport protocol */
};

/*
 * SELinux state kept in a socket's security blob (see selinux_sock()).
 * The NetLabel fields exist only when CONFIG_NETLABEL is set.
 */
struct sk_security_struct {
#ifdef CONFIG_NETLABEL
        enum { /* NetLabel state */
               NLBL_UNSET = 0,
               NLBL_REQUIRE,
               NLBL_LABELED,
               NLBL_REQSKB,
               NLBL_CONNLABELED,
        } nlbl_state;
        struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */
#endif
        u32 sid; /* SID of this object */
        u32 peer_sid; /* SID of peer */
        u16 sclass; /* sock security class */
        enum { /* SCTP association state */
               SCTP_ASSOC_UNSET = 0,
               SCTP_ASSOC_SET,
        } sctp_assoc_state;
};

/* SELinux state kept in a tun device's security blob (see selinux_tun_dev()). */
struct tun_security_struct {
        u32 sid; /* SID for the tun device sockets */
};

/* SELinux state kept in a key's security blob (see selinux_key()). */
struct key_security_struct {
        u32 sid; /* SID of key */
};

/* SELinux state kept in an InfiniBand security blob (see selinux_ib()). */
struct ib_security_struct {
        u32 sid; /* SID of the queue pair or MAD agent */
};

/* SID of a partition key, identified by subnet prefix and PKey number. */
struct pkey_security_struct {
        u64 subnet_prefix; /* Port subnet prefix */
        u16 pkey; /* PKey number */
        u32 sid; /* SID of pkey */
};

/* SELinux state recorded for a bpf object. */
struct bpf_security_struct {
        u32 sid; /* SID of bpf obj creator */
};

/* SELinux state kept in a perf_event security blob (see selinux_perf_event()). */
struct perf_event_security_struct {
        u32 sid; /* SID of perf_event obj creator */
};

extern struct lsm_blob_sizes selinux_blob_sizes;
/* Return SELinux's part of @cred's security blob, at offset lbs_cred. */
static inline struct task_security_struct *selinux_cred(const struct cred *cred)
{
        return cred->security + selinux_blob_sizes.lbs_cred;
}

/* Return SELinux's part of @file's security blob, at offset lbs_file. */
static inline struct file_security_struct *selinux_file(const struct file *file)
{
        return file->f_security + selinux_blob_sizes.lbs_file;
}

/*
 * Return SELinux's part of @inode's security blob, at offset lbs_inode,
 * or NULL when the inode has no security blob at all.
 */
static inline struct inode_security_struct *
selinux_inode(const struct inode *inode)
{
        if (unlikely(!inode->i_security))
                return NULL;
        return inode->i_security + selinux_blob_sizes.lbs_inode;
}

/* Return SELinux's part of @msg_msg's security blob, at offset lbs_msg_msg. */
static inline struct msg_security_struct *
selinux_msg_msg(const struct msg_msg *msg_msg)
{
        return msg_msg->security + selinux_blob_sizes.lbs_msg_msg;
}

/* Return SELinux's part of @ipc's security blob, at offset lbs_ipc. */
static inline struct ipc_security_struct *
selinux_ipc(const struct kern_ipc_perm *ipc)
{
        return ipc->security + selinux_blob_sizes.lbs_ipc;
}

/*
 * Get the subjective security ID of the current task, i.e. the "sid"
 * field of the SELinux blob attached to current_cred().
 */
static inline u32 current_sid(void)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());

        return tsec->sid;
}

/* Return SELinux's part of @superblock's security blob, at offset lbs_superblock. */
static inline struct superblock_security_struct *
selinux_superblock(const struct super_block *superblock)
{
        return superblock->s_security + selinux_blob_sizes.lbs_superblock;
}

#ifdef CONFIG_KEYS
/* Return SELinux's part of @key's security blob, at offset lbs_key. */
static inline struct key_security_struct *selinux_key(const struct key *key)
{
        return key->security + selinux_blob_sizes.lbs_key;
}
#endif /* CONFIG_KEYS */

/* Return SELinux's part of @sock's security blob, at offset lbs_sock. */
static inline struct sk_security_struct *selinux_sock(const struct sock *sock)
{
        return sock->sk_security + selinux_blob_sizes.lbs_sock;
}

/* Return SELinux's part of the tun device blob @security, at offset lbs_tun_dev. */
static inline struct tun_security_struct *selinux_tun_dev(void *security)
{
        return security + selinux_blob_sizes.lbs_tun_dev;
}

/* Return SELinux's part of the InfiniBand blob @ib_sec, at offset lbs_ib. */
static inline struct ib_security_struct *selinux_ib(void *ib_sec)
{
        return ib_sec + selinux_blob_sizes.lbs_ib;
}

/* Return SELinux's part of the perf_event blob @perf_event, at offset lbs_perf_event. */
static inline struct perf_event_security_struct *
selinux_perf_event(void *perf_event)
{
        return perf_event + selinux_blob_sizes.lbs_perf_event;
}

#endif /* _SELINUX_OBJSEC_H_ */



































































































































































































































































































































































































































































































































































































































































































































































































































































































   22 

























































































































































   22 
   22 























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
// SPDX-License-Identifier: GPL-2.0-only
/*
 * SMP initialisation and IPI support
 * Based on arch/arm/kernel/smp.c
 *
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/arm_sdei.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/sched/mm.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/task_stack.h>
#include <linux/interrupt.h>
#include <linux/cache.h>
#include <linux/profile.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/seq_file.h>
#include <linux/irq.h>
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/percpu.h>
#include <linux/clockchips.h>
#include <linux/completion.h>
#include <linux/of.h>
#include <linux/irq_work.h>
#include <linux/kernel_stat.h>
#include <linux/kexec.h>
#include <linux/kgdb.h>
#include <linux/kvm_host.h>
#include <linux/nmi.h>

#include <asm/alternative.h>
#include <asm/atomic.h>
#include <asm/cacheflush.h>
#include <asm/cpu.h>
#include <asm/cputype.h>
#include <asm/cpu_ops.h>
#include <asm/daifflags.h>
#include <asm/kvm_mmu.h>
#include <asm/mmu_context.h>
#include <asm/numa.h>
#include <asm/processor.h>
#include <asm/smp_plat.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/ptrace.h>
#include <asm/virt.h>

#include <trace/events/ipi.h>

/*
 * Hand-over area between the CPU requesting a bring-up and the secondary
 * being booted. As from 2.5, kernels no longer have an init_tasks structure,
 * so we need some other way of telling a new secondary core where to place
 * its stack: __cpu_up() stores the idle task in .task before starting the
 * CPU and reads .status back if the CPU fails to come online.
 */
struct secondary_data secondary_data;
/*
 * Number of CPUs which aren't online, but looping in kernel text.
 * Incremented by __cpu_up() when a secondary that failed to come online is
 * found in the "stuck in kernel" state or in an unknown state.
 */
static int cpus_stuck_in_kernel;

/*
 * IPI message numbers. ipi_handler() derives the message from the Linux IRQ
 * number as (irq - ipi_irq_base), so the enumerator value is the offset of
 * the IPI's interrupt from ipi_irq_base.
 */
enum ipi_msg_type {
        IPI_RESCHEDULE,
        IPI_CALL_FUNC,
        IPI_CPU_STOP,
        IPI_CPU_STOP_NMI,
        IPI_TIMER,
        IPI_IRQ_WORK,
        NR_IPI,
        /*
         * Any enum >= NR_IPI and < MAX_IPI is special and not traceable
         * with trace_ipi_*
         */
        IPI_CPU_BACKTRACE = NR_IPI,
        IPI_KGDB_ROUNDUP,
        MAX_IPI
};

/* Linux IRQ number of IPI 0; IPI number n uses ipi_irq_base + n. */
static int ipi_irq_base __ro_after_init;
/* Number of IPIs that ipi_setup() enables on each CPU. */
static int nr_ipi __ro_after_init = NR_IPI;
/* Interrupt descriptor of each IPI, indexed by enum ipi_msg_type. */
static struct irq_desc *ipi_desc[MAX_IPI] __ro_after_init;

/*
 * When true (and CONFIG_KEXEC_CORE is enabled), a stop IPI takes the crash
 * path in do_handle_IPI() instead of the plain local_cpu_stop() path.
 */
static bool crash_stop;

static void ipi_setup(int cpu);

#ifdef CONFIG_HOTPLUG_CPU
static void ipi_teardown(int cpu);
static int op_cpu_kill(unsigned int cpu);
#else
/* Without CPU hotplug there is no way to kill a CPU: always fails. */
static inline int op_cpu_kill(unsigned int cpu)
{
        return -ENOSYS;
}
#endif


/*
 * Start a secondary CPU through the cpu_boot() callback of its operations.
 *
 * @cpu:  logical id of the CPU to start.
 * @idle: idle task chosen for that CPU; not used by this function.
 *
 * Returns whatever cpu_boot() returns, or -EOPNOTSUPP when the CPU's
 * operations do not provide a cpu_boot() callback.
 */
static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
        const struct cpu_operations *boot_ops = get_cpu_ops(cpu);

        /* Nothing we can do without a boot method. */
        if (!boot_ops->cpu_boot)
                return -EOPNOTSUPP;

        return boot_ops->cpu_boot(cpu);
}

/* Completed by secondary_start_kernel() once the new CPU is marked online. */
static DECLARE_COMPLETION(cpu_running);

/*
 * Bring a secondary CPU up and wait for it to come online.
 *
 * @cpu:  logical id of the CPU to boot.
 * @idle: idle task the new CPU will run; published via secondary_data.task.
 *
 * Returns 0 once the CPU is online, the error from boot_secondary() if the
 * CPU could not be started, or -EIO if it was started but is not online
 * after the wait. In the -EIO case the boot status left behind by the
 * secondary is decoded and reported. Panics if the secondary reported
 * CPU_PANIC_KERNEL.
 */
int __cpu_up(unsigned int cpu, struct task_struct *idle)
{
        int ret;
        long status;

        /*
         * We need to tell the secondary core where to find its stack and the
         * page tables.
         */
        secondary_data.task = idle;
        update_cpu_boot_status(CPU_MMU_OFF);

        /* Now bring the CPU into our world */
        ret = boot_secondary(cpu, idle);
        if (ret) {
                /* -EPERM is deliberately not logged as a boot failure. */
                if (ret != -EPERM)
                        pr_err("CPU%u: failed to boot: %d\n", cpu, ret);
                return ret;
        }

        /*
         * CPU was successfully started, wait for it to come online or
         * time out. The wait result itself is ignored: cpu_online() below is
         * what decides success.
         */
        wait_for_completion_timeout(&cpu_running,
                                    msecs_to_jiffies(5000));
        if (cpu_online(cpu))
                return 0;

        pr_crit("CPU%u: failed to come online\n", cpu);
        secondary_data.task = NULL;
        status = READ_ONCE(secondary_data.status);
        /*
         * Status still at the value set above: fall back to the status
         * recorded in __early_cpu_boot_status.
         */
        if (status == CPU_MMU_OFF)
                status = READ_ONCE(__early_cpu_boot_status);

        switch (status & CPU_BOOT_STATUS_MASK) {
        default:
                pr_err("CPU%u: failed in unknown state : 0x%lx\n",
                       cpu, status);
                cpus_stuck_in_kernel++;
                break;
        case CPU_KILL_ME:
                /* The secondary asked to be killed; 0 means it is gone. */
                if (!op_cpu_kill(cpu)) {
                        pr_crit("CPU%u: died during early boot\n", cpu);
                        break;
                }
                pr_crit("CPU%u: may not have shut down cleanly\n", cpu);
                fallthrough;
        case CPU_STUCK_IN_KERNEL:
                pr_crit("CPU%u: is stuck in kernel\n", cpu);
                /* Bits outside CPU_BOOT_STATUS_MASK carry the reason. */
                if (status & CPU_STUCK_REASON_52_BIT_VA)
                        pr_crit("CPU%u: does not support 52-bit VAs\n", cpu);
                if (status & CPU_STUCK_REASON_NO_GRAN) {
                        pr_crit("CPU%u: does not support %luK granule\n",
                                cpu, PAGE_SIZE / SZ_1K);
                }
                cpus_stuck_in_kernel++;
                break;
        case CPU_PANIC_KERNEL:
                panic("CPU%u detected unsupported configuration\n", cpu);
        }

        return -EIO;
}

/*
 * Set up the calling CPU for interrupt masking through the GIC priority
 * mask register (PMR).
 *
 * Bails out with a warning if gic_enable_sre() fails. Otherwise warns if
 * the I or F bit is clear in DAIF on entry, then writes
 * GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET to the PMR.
 */
static void init_gic_priority_masking(void)
{
        u32 cpuflags;

        if (WARN_ON(!gic_enable_sre()))
                return;

        cpuflags = read_sysreg(daif);

        /* Both IRQ and FIQ are expected to be masked in DAIF here. */
        WARN_ON(!(cpuflags & PSR_I_BIT));
        WARN_ON(!(cpuflags & PSR_F_BIT));

        gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET);
}

/*
 * This is the secondary CPU boot entry.  We're using this CPU's
 * idle thread stack, but a set of temporary page tables.
 *
 * Runs on the freshly started CPU. Once the CPU has been marked online and
 * cpu_running has been completed (which releases the waiter in __cpu_up()),
 * it unmasks exceptions and enters the idle loop via cpu_startup_entry().
 */
asmlinkage notrace void secondary_start_kernel(void)
{
        u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;
        struct mm_struct *mm = &init_mm;
        const struct cpu_operations *ops;
        unsigned int cpu = smp_processor_id();

        /*
         * All kernel threads share the same mm context; grab a
         * reference and switch to it.
         */
        mmgrab(mm);
        current->active_mm = mm;

        /*
         * TTBR0 is only used for the identity mapping at this stage. Make it
         * point to zero page to avoid speculatively fetching new entries.
         */
        cpu_uninstall_idmap();

        if (system_uses_irq_prio_masking())
                init_gic_priority_masking();

        rcutree_report_cpu_starting(cpu);
        trace_hardirqs_off();

        /*
         * If the system has established the capabilities, make sure
         * this CPU ticks all of those. If it doesn't, the CPU will
         * fail to come online.
         */
        check_local_cpu_capabilities();

        /* Optional enable-method hook run on the new CPU itself. */
        ops = get_cpu_ops(cpu);
        if (ops->cpu_postboot)
                ops->cpu_postboot();

        /*
         * Log the CPU info before it is marked online and might get read.
         */
        cpuinfo_store_cpu();
        store_cpu_topology(cpu);

        /*
         * Enable GIC and timers.
         */
        notify_cpu_starting(cpu);

        ipi_setup(cpu);

        numa_add_cpu(cpu);

        /*
         * OK, now it's safe to let the boot CPU continue.  Wait for
         * the CPU migration code to notice that the CPU is online
         * before we continue.
         */
        pr_info("CPU%u: Booted secondary processor 0x%010lx [0x%08x]\n",
                                         cpu, (unsigned long)mpidr,
                                         read_cpuid_id());
        update_cpu_boot_status(CPU_BOOT_SUCCESS);
        set_cpu_online(cpu, true);
        complete(&cpu_running);

        /*
         * Secondary CPUs enter the kernel with all DAIF exceptions masked.
         *
         * As with setup_arch() we must unmask Debug and SError exceptions, and
         * as the root irqchip has already been detected and initialized we can
         * unmask IRQ and FIQ at the same time.
         */
        local_daif_restore(DAIF_PROCCTX);

        /*
         * OK, it's off to the idle thread for us
         */
        cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Ask the enable-method whether @cpu may be taken down.
 *
 * Returns -EOPNOTSUPP when the CPU has no operations or no cpu_die()
 * callback, the result of cpu_disable() when that callback exists, and 0
 * otherwise.
 */
static int op_cpu_disable(unsigned int cpu)
{
        const struct cpu_operations *hp_ops = get_cpu_ops(cpu);

        /*
         * Without a cpu_die method we must refuse before reaching the point
         * of no return. CPU0 may not have a cpu_ops at all, hence the test
         * on the pointer itself.
         */
        if (!hp_ops)
                return -EOPNOTSUPP;
        if (!hp_ops->cpu_die)
                return -EOPNOTSUPP;

        /*
         * The mechanism may still want to veto the unplug for a reason of
         * its own; if it has no opinion, the unplug goes ahead.
         */
        return hp_ops->cpu_disable ? hp_ops->cpu_disable(cpu) : 0;
}

/*
 * __cpu_disable runs on the processor to be shutdown.
 *
 * Returns 0 on success, or the error from op_cpu_disable() if the unplug is
 * refused; in that case nothing has been changed yet.
 */
int __cpu_disable(void)
{
        unsigned int cpu = smp_processor_id();
        int ret;

        ret = op_cpu_disable(cpu);
        if (ret)
                return ret;

        remove_cpu_topology(cpu);
        numa_remove_cpu(cpu);

        /*
         * Take this CPU offline.  Once we clear this, we can't return,
         * and we must not schedule until we're ready to give up the cpu.
         */
        set_cpu_online(cpu, false);
        ipi_teardown(cpu);

        /*
         * OK - migrate IRQs away from this CPU
         */
        irq_migrate_all_off_this_cpu();

        return 0;
}

/*
 * Ask the enable-method to confirm that @cpu is dead.
 *
 * Returns the result of the cpu_kill() callback, or 0 when the CPU's
 * operations do not provide one.
 */
static int op_cpu_kill(unsigned int cpu)
{
        const struct cpu_operations *kill_ops = get_cpu_ops(cpu);

        if (kill_ops->cpu_kill)
                return kill_ops->cpu_kill(cpu);

        /*
         * No means of synchronising with the dying CPU: all we could do is
         * wait for an arbitrary time and hope, so skip the wait, assume it
         * really is dead and just hope.
         */
        return 0;
}

/*
 * Called on the thread which is asking for a CPU to be shutdown after the
 * shutdown completed.
 *
 * @cpu: logical id of the CPU that has been shut down.
 *
 * Asks the CPU's enable-method, via op_cpu_kill(), to confirm that the CPU
 * has left the kernel. A failure is only reported with a warning; nothing
 * is returned to the caller.
 */
void arch_cpuhp_cleanup_dead_cpu(unsigned int cpu)
{
        int err;

        pr_debug("CPU%u: shutdown\n", cpu);

        /*
         * Now that the dying CPU is beyond the point of no return w.r.t.
         * in-kernel synchronisation, try to get the firmware to help us to
         * verify that it has really left the kernel before we consider
         * clobbering anything it might still be using.
         */
        err = op_cpu_kill(cpu);
        if (err) {
                /* cpu is unsigned: print it with %u, as the message above does */
                pr_warn("CPU%u may not have shut down cleanly: %d\n", cpu, err);
        }
}

/*
 * Called from the idle thread for the CPU which has been shutdown.
 *
 * Never returns: after reporting itself dead, the CPU is handed to the
 * cpu_die() callback of its operations. If that callback returns anyway,
 * BUG() is hit.
 */
void __noreturn cpu_die(void)
{
        unsigned int cpu = smp_processor_id();
        const struct cpu_operations *ops = get_cpu_ops(cpu);

        idle_task_exit();

        /* Mask all DAIF exceptions on this CPU from here on. */
        local_daif_mask();

        /* Tell cpuhp_bp_sync_dead() that this CPU is now safe to dispose of */
        cpuhp_ap_report_dead();

        /*
         * Actually shutdown the CPU. This must never fail. The specific hotplug
         * mechanism must perform all required cache maintenance to ensure that
         * no dirty lines are lost in the process of shutting down the CPU.
         */
        ops->cpu_die(cpu);

        BUG();
}
#endif

/*
 * Best-effort attempt to power @cpu down through the cpu_die() callback of
 * its operations. Does nothing when CONFIG_HOTPLUG_CPU is disabled, or when
 * the CPU has no operations or no cpu_die() callback.
 */
static void __cpu_try_die(int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        const struct cpu_operations *die_ops = get_cpu_ops(cpu);

        if (!die_ops || !die_ops->cpu_die)
                return;

        die_ops->cpu_die(cpu);
#endif
}

/*
 * Kill the calling secondary CPU, early in bringup before it is turned
 * online.
 *
 * Never returns. The boot status written here (CPU_KILL_ME, then
 * CPU_STUCK_IN_KERNEL) is what __cpu_up() decodes on the CPU that
 * requested the bring-up.
 */
void __noreturn cpu_die_early(void)
{
        int cpu = smp_processor_id();

        pr_crit("CPU%d: will not boot\n", cpu);

        /* Mark this CPU absent */
        set_cpu_present(cpu, 0);
        rcutree_report_cpu_dead();

        if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
                update_cpu_boot_status(CPU_KILL_ME);
                __cpu_try_die(cpu);
        }

        /* Still running: the CPU could not be powered off, so park it. */
        update_cpu_boot_status(CPU_STUCK_IN_KERNEL);

        cpu_park_loop();
}

/*
 * Report the exception level the CPUs were started at (warning and tainting
 * the kernel if they disagree), then, when KVM is built in and the kernel is
 * not itself running in hyp mode, compute the KVM layout and apply the hyp
 * relocations.
 */
static void __init hyp_mode_check(void)
{
        if (is_hyp_mode_available()) {
                pr_info("CPU: All CPU(s) started at EL2\n");
        } else if (!is_hyp_mode_mismatched()) {
                pr_info("CPU: All CPU(s) started at EL1\n");
        } else {
                WARN_TAINT(1, TAINT_CPU_OUT_OF_SPEC,
                           "CPU: CPUs started in inconsistent modes");
        }

        /* Nothing more to do without KVM, or when already in hyp mode. */
        if (!IS_ENABLED(CONFIG_KVM) || is_kernel_in_hyp_mode())
                return;

        kvm_compute_layout();
        kvm_apply_hyp_relocations();
}

/*
 * Final SMP boot step: log the number of online CPUs, check the exception
 * level the CPUs started at, set up system-wide and user-visible features,
 * and mark the linear alias of the kernel text read-only.
 *
 * @max_cpus: not used here.
 */
void __init smp_cpus_done(unsigned int max_cpus)
{
        pr_info("SMP: Total of %d processors activated.\n", num_online_cpus());
        hyp_mode_check();
        setup_system_features();
        setup_user_features();
        mark_linear_text_alias_ro();
}

/*
 * Boot-CPU preparation: switch to the runtime per-cpu area, record the boot
 * CPU's info and features, optionally enable GIC priority masking, and
 * initialise the KASAN tag modes.
 */
void __init smp_prepare_boot_cpu(void)
{
        /*
         * The runtime per-cpu areas have been allocated by
         * setup_per_cpu_areas(), and CPU0's boot time per-cpu area will be
         * freed shortly, so we must move over to the runtime per-cpu area.
         */
        set_my_cpu_offset(per_cpu_offset(smp_processor_id()));

        cpuinfo_store_boot_cpu();
        setup_boot_cpu_features();

        /* Conditionally switch to GIC PMR for interrupt masking */
        if (system_uses_irq_prio_masking())
                init_gic_priority_masking();

        kasan_init_hw_tags();
        /* Init percpu seeds for random tags after cpus are set up. */
        kasan_init_sw_tags();
}

/*
 * Duplicate MPIDRs are a recipe for disaster, so callers ignore any CPU for
 * which this returns true.
 *
 * @cpu:  number of logical map entries filled in so far; entries
 *        1 .. cpu - 1 (capped at NR_CPUS - 1) are compared.
 * @hwid: MPIDR value to look for.
 *
 * Entry 0 (the boot CPU) is not examined here. cpu_logical_map was
 * initialized to INVALID_HWID to avoid matching valid MPIDR values.
 *
 * Returns true if @hwid is already present in the scanned entries.
 */
static bool __init is_mpidr_duplicate(unsigned int cpu, u64 hwid)
{
        unsigned int idx = 1;

        while (idx < cpu && idx < NR_CPUS) {
                if (cpu_logical_map(idx) == hwid)
                        return true;
                idx++;
        }

        return false;
}

/*
 * Look up and initialise the cpu operations of logical CPU @cpu; on success
 * the CPU is added to the possible mask.
 *
 * Returns 0 on success, or -ENODEV if either init_cpu_ops() or the
 * cpu_init() callback of the operations fails.
 */
static int __init smp_cpu_setup(int cpu)
{
        const struct cpu_operations *ops;
        int err;

        err = init_cpu_ops(cpu);
        if (!err) {
                /* The operations only exist once init_cpu_ops() succeeded. */
                ops = get_cpu_ops(cpu);
                err = ops->cpu_init(cpu);
        }
        if (err)
                return -ENODEV;

        set_cpu_possible(cpu, true);
        return 0;
}

/* Set once the boot CPU's MPIDR has been found in the DT or the MADT. */
static bool bootcpu_valid __initdata;
/*
 * Next logical CPU id to hand out while enumerating CPUs; starts at 1
 * because logical id 0 belongs to the boot CPU.
 */
static unsigned int cpu_count = 1;

int arch_register_cpu(int cpu)
{
        acpi_handle acpi_handle = acpi_get_processor_handle(cpu);
        struct cpu *c = &per_cpu(cpu_devices, cpu);

        if (!acpi_disabled && !acpi_handle &&
            IS_ENABLED(CONFIG_ACPI_HOTPLUG_CPU))
                return -EPROBE_DEFER;

#ifdef CONFIG_ACPI_HOTPLUG_CPU
        /* For now block anything that looks like physical CPU Hotplug */
        if (invalid_logical_cpuid(cpu) || !cpu_present(cpu)) {
                pr_err_once("Changing CPU present bit is not supported\n");
                return -ENODEV;
        }
#endif

        /*
         * Availability of the acpi handle is sufficient to establish
         * that _STA has aleady been checked. No need to recheck here.
         */
        c->hotpluggable = arch_cpu_is_hotpluggable(cpu);

        return register_cpu(c, cpu);
}

#ifdef CONFIG_ACPI_HOTPLUG_CPU
void arch_unregister_cpu(int cpu)
{
        acpi_handle acpi_handle = acpi_get_processor_handle(cpu);
        struct cpu *c = &per_cpu(cpu_devices, cpu);
        acpi_status status;
        unsigned long long sta;

        if (!acpi_handle) {
                pr_err_once("Removing a CPU without associated ACPI handle\n");
                return;
        }

        status = acpi_evaluate_integer(acpi_handle, "_STA", NULL, &sta);
        if (ACPI_FAILURE(status))
                return;

        /* For now do not allow anything that looks like physical CPU HP */
        if (cpu_present(cpu) && !(sta & ACPI_STA_DEVICE_PRESENT)) {
                pr_err_once("Changing CPU present bit is not supported\n");
                return;
        }

        unregister_cpu(c);
}
#endif /* CONFIG_ACPI_HOTPLUG_CPU */

#ifdef CONFIG_ACPI
/* Copy of each CPU's MADT GICC entry, indexed by logical CPU id. */
static struct acpi_madt_generic_interrupt cpu_madt_gicc[NR_CPUS];

/*
 * Return a pointer to the stored MADT GICC entry of logical CPU @cpu.
 * @cpu is used as an array index without any range check.
 */
struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu)
{
        return cpu_madt_gicc + cpu;
}
EXPORT_SYMBOL_GPL(acpi_cpu_get_madt_gicc);

/*
 * acpi_map_gic_cpu_interface - parse processor MADT entry
 *
 * Carry out sanity checks on MADT processor entry and initialize
 * cpu_logical_map on success
 *
 * @processor: MADT GICC entry to examine.
 *
 * An entry is silently or noisily skipped when it is neither enabled nor
 * online-capable, when its MPIDR is invalid or duplicated, or when NR_CPUS
 * logical ids have already been handed out. The boot CPU's entry only sets
 * bootcpu_valid and is stored in slot 0; every other accepted entry takes
 * the next logical id (cpu_count).
 */
static void __init
acpi_map_gic_cpu_interface(struct acpi_madt_generic_interrupt *processor)
{
        u64 hwid = processor->arm_mpidr;

        if (!(processor->flags &
              (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE))) {
                pr_debug("skipping disabled CPU entry with 0x%llx MPIDR\n", hwid);
                return;
        }

        if (hwid & ~MPIDR_HWID_BITMASK || hwid == INVALID_HWID) {
                pr_err("skipping CPU entry with invalid MPIDR 0x%llx\n", hwid);
                return;
        }

        if (is_mpidr_duplicate(cpu_count, hwid)) {
                pr_err("duplicate CPU MPIDR 0x%llx in MADT\n", hwid);
                return;
        }

        /* Check if GICC structure of boot CPU is available in the MADT */
        if (cpu_logical_map(0) == hwid) {
                if (bootcpu_valid) {
                        pr_err("duplicate boot CPU MPIDR: 0x%llx in MADT\n",
                               hwid);
                        return;
                }
                bootcpu_valid = true;
                cpu_madt_gicc[0] = *processor;
                /* The boot CPU does not consume a new logical id. */
                return;
        }

        /* No room left in the logical map: ignore further entries. */
        if (cpu_count >= NR_CPUS)
                return;

        /* map the logical cpu id to cpu MPIDR */
        set_cpu_logical_map(cpu_count, hwid);

        cpu_madt_gicc[cpu_count] = *processor;

        /*
         * Set-up the ACPI parking protocol cpu entries
         * while initializing the cpu_logical_map to
         * avoid parsing MADT entries multiple times for
         * nothing (ie a valid cpu_logical_map entry should
         * contain a valid parking protocol data set to
         * initialize the cpu if the parking protocol is
         * the only available enable method).
         */
        acpi_set_mailbox_entry(cpu_count, processor);

        cpu_count++;
}

/*
 * MADT walk callback for GICC (generic interrupt) entries.
 *
 * @header: subtable header of the entry being visited.
 * @end:    end address used by BAD_MADT_GICC_ENTRY() to validate the entry.
 *
 * Returns -EINVAL for an entry rejected by BAD_MADT_GICC_ENTRY(); otherwise
 * prints the entry, passes it to acpi_map_gic_cpu_interface() and returns 0.
 */
static int __init
acpi_parse_gic_cpu_interface(union acpi_subtable_headers *header,
                             const unsigned long end)
{
        struct acpi_madt_generic_interrupt *gicc =
                (struct acpi_madt_generic_interrupt *)header;

        if (BAD_MADT_GICC_ENTRY(gicc, end))
                return -EINVAL;

        acpi_table_print_madt_entry(&header->common);
        acpi_map_gic_cpu_interface(gicc);

        return 0;
}

/*
 * Enumerate CPUs from the ACPI MADT, then assign every logical CPU id below
 * nr_cpu_ids to a NUMA node.
 */
static void __init acpi_parse_and_init_cpus(void)
{
        int i;

        /*
         * do a walk of MADT to determine how many CPUs
         * we have including disabled CPUs, and get information
         * we need for SMP init.
         */
        acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
                                      acpi_parse_gic_cpu_interface, 0);

        /*
         * In ACPI, SMP and CPU NUMA information is provided in separate
         * static tables, namely the MADT and the SRAT.
         *
         * Thus, it is simpler to first create the cpu logical map through
         * an MADT walk and then map the logical cpus to their node ids
         * as separate steps.
         */
        acpi_map_cpus_to_nodes();

        for (i = 0; i < nr_cpu_ids; i++)
                early_map_cpu_to_node(i, acpi_numa_get_nid(i));
}
#else
/* Without CONFIG_ACPI there is nothing to parse: expands to a no-op. */
#define acpi_parse_and_init_cpus(...)        do { } while (0)
#endif

/*
 * Enumerate the possible CPU set from the device tree and build the
 * cpu logical map array containing MPIDR values related to logical
 * cpus. Assumes that cpu_logical_map(0) has already been initialized.
 *
 * Note on cpu_count: every cpu node other than the boot CPU's first match
 * advances cpu_count, including nodes that are rejected via "goto next".
 * Only the boot CPU's node leaves it untouched (it uses "continue").
 */
static void __init of_parse_and_init_cpus(void)
{
        struct device_node *dn;

        for_each_of_cpu_node(dn) {
                u64 hwid = of_get_cpu_hwid(dn, 0);

                /* Reject ids with bits set outside the MPIDR hwid mask. */
                if (hwid & ~MPIDR_HWID_BITMASK)
                        goto next;

                if (is_mpidr_duplicate(cpu_count, hwid)) {
                        pr_err("%pOF: duplicate cpu reg properties in the DT\n",
                                dn);
                        goto next;
                }

                /*
                 * The numbering scheme requires that the boot CPU
                 * must be assigned logical id 0. Record it so that
                 * the logical map built from DT is validated and can
                 * be used.
                 */
                if (hwid == cpu_logical_map(0)) {
                        if (bootcpu_valid) {
                                pr_err("%pOF: duplicate boot cpu reg property in DT\n",
                                        dn);
                                goto next;
                        }

                        bootcpu_valid = true;
                        early_map_cpu_to_node(0, of_node_to_nid(dn));

                        /*
                         * cpu_logical_map has already been
                         * initialized and the boot cpu doesn't need
                         * the enable-method so continue without
                         * incrementing cpu.
                         */
                        continue;
                }

                /* Logical map is full: count the node but do not map it. */
                if (cpu_count >= NR_CPUS)
                        goto next;

                pr_debug("cpu logical map 0x%llx\n", hwid);
                set_cpu_logical_map(cpu_count, hwid);

                early_map_cpu_to_node(cpu_count, of_node_to_nid(dn));
next:
                cpu_count++;
        }
}

/*
 * Enumerate the possible CPU set from the device tree or ACPI and build the
 * cpu logical map array containing MPIDR values related to logical
 * cpus. Assumes that cpu_logical_map(0) has already been initialized.
 *
 * The device tree is used when ACPI is disabled, the MADT otherwise. If the
 * boot CPU's MPIDR was not found in the firmware description, no secondary
 * is set up. Otherwise every logical id from 1 up to nr_cpu_ids - 1 with a
 * valid map entry is passed to smp_cpu_setup().
 */
void __init smp_init_cpus(void)
{
        int i;

        if (acpi_disabled)
                of_parse_and_init_cpus();
        else
                acpi_parse_and_init_cpus();

        /* cpu_count is unsigned, so it is printed with %u. */
        if (cpu_count > nr_cpu_ids)
                pr_warn("Number of cores (%u) exceeds configured maximum of %u - clipping\n",
                        cpu_count, nr_cpu_ids);

        if (!bootcpu_valid) {
                pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
                return;
        }

        /*
         * We need to set the cpu_logical_map entries before enabling
         * the cpus so that cpu processor description entries (DT cpu nodes
         * and ACPI MADT entries) can be retrieved by matching the cpu hwid
         * with entries in cpu_logical_map while initializing the cpus.
         * If the cpu set-up fails, invalidate the cpu_logical_map entry.
         */
        for (i = 1; i < nr_cpu_ids; i++) {
                if (cpu_logical_map(i) != INVALID_HWID) {
                        if (smp_cpu_setup(i))
                                set_cpu_logical_map(i, INVALID_HWID);
                }
        }
}

/*
 * Record topology and NUMA information for the calling CPU, then prepare
 * every other possible CPU and mark it present.
 *
 * @max_cpus: when 0, secondaries are left untouched.
 *
 * A secondary is skipped (not marked present) when it has no cpu operations
 * or when its cpu_prepare() callback reports an error.
 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
        const struct cpu_operations *ops;
        unsigned int boot_cpu, cpu;

        init_cpu_topology();

        boot_cpu = smp_processor_id();
        store_cpu_topology(boot_cpu);
        numa_store_cpu_info(boot_cpu);
        numa_add_cpu(boot_cpu);

        /*
         * "nosmp" (which implies "maxcpus=0") mandates UP: leave the
         * secondary CPUs out of the present map.
         */
        if (!max_cpus)
                return;

        /*
         * Build the present map (the set of CPUs actually populated right
         * now) and release the secondaries from the bootloader.
         */
        for_each_possible_cpu(cpu) {
                if (cpu == smp_processor_id())
                        continue;

                ops = get_cpu_ops(cpu);
                if (!ops || ops->cpu_prepare(cpu))
                        continue;

                set_cpu_present(cpu, true);
                numa_store_cpu_info(cpu);
        }
}

/*
 * Human-readable name of each IPI, indexed by enum ipi_msg_type. Used for
 * the IPI tracepoints and for the per-IPI rows printed by
 * arch_show_interrupts().
 */
static const char *ipi_types[MAX_IPI] __tracepoint_string = {
        [IPI_RESCHEDULE]        = "Rescheduling interrupts",
        [IPI_CALL_FUNC]                = "Function call interrupts",
        [IPI_CPU_STOP]                = "CPU stop interrupts",
        [IPI_CPU_STOP_NMI]        = "CPU stop NMIs",
        [IPI_TIMER]                = "Timer broadcast interrupts",
        [IPI_IRQ_WORK]                = "IRQ work interrupts",
        [IPI_CPU_BACKTRACE]        = "CPU backtrace interrupts",
        [IPI_KGDB_ROUNDUP]        = "KGDB roundup interrupts",
};

static void smp_cross_call(const struct cpumask *target, unsigned int ipinr);

/* Printed as the "Err" row by arch_show_interrupts(). */
unsigned long irq_err_count;

int arch_show_interrupts(struct seq_file *p, int prec)
{
        unsigned int cpu, i;

        for (i = 0; i < MAX_IPI; i++) {
                seq_printf(p, "%*s%u:%s", prec - 1, "IPI", i,
                           prec >= 4 ? " " : "");
                for_each_online_cpu(cpu)
                        seq_printf(p, "%10u ", irq_desc_kstat_cpu(ipi_desc[i], cpu));
                seq_printf(p, "      %s\n", ipi_types[i]);
        }

        seq_printf(p, "%*s: %10lu\n", prec, "Err", irq_err_count);
        return 0;
}

/* Send the function-call IPI (IPI_CALL_FUNC) to every CPU in @mask. */
void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
        smp_cross_call(mask, IPI_CALL_FUNC);
}

/* Send the function-call IPI (IPI_CALL_FUNC) to the single CPU @cpu. */
void arch_send_call_function_single_ipi(int cpu)
{
        smp_cross_call(cpumask_of(cpu), IPI_CALL_FUNC);
}

#ifdef CONFIG_IRQ_WORK
/* Raise IPI_IRQ_WORK on the calling CPU itself. */
void arch_irq_work_raise(void)
{
        smp_cross_call(cpumask_of(smp_processor_id()), IPI_IRQ_WORK);
}
#endif

/*
 * Stop the calling CPU for good: mark @cpu offline, mask all DAIF
 * exceptions, mask SDEI on this CPU and park. Never returns.
 *
 * @cpu: logical id to clear from the online mask; callers in this file pass
 *       the id of the calling CPU.
 */
static void __noreturn local_cpu_stop(unsigned int cpu)
{
        set_cpu_online(cpu, false);

        local_daif_mask();
        sdei_mask_local_cpu();
        cpu_park_loop();
}

/*
 * panic_smp_self_stop() is needed for parallel panic() calls: stopping
 * through local_cpu_stop() keeps cpu_online_mask up to date, which lets
 * smp_send_stop() skip CPUs that have already stopped themselves.
 *
 * Never returns.
 */
void __noreturn panic_smp_self_stop(void)
{
        unsigned int this_cpu = smp_processor_id();

        local_cpu_stop(this_cpu);
}

/*
 * Crash variant of stopping a CPU, run on the CPU being stopped.
 *
 * @cpu:  logical id of the calling CPU.
 * @regs: register state passed on to crash_save_cpu().
 *
 * With CONFIG_KEXEC_CORE: saves this CPU's state, marks it offline, masks
 * SDEI, tries to power the CPU off (CONFIG_HOTPLUG_CPU only) and parks it
 * if that did not take effect. Without CONFIG_KEXEC_CORE this must never be
 * reached and hits BUG(). Never returns.
 */
static void __noreturn ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs)
{
#ifdef CONFIG_KEXEC_CORE
        /*
         * Use local_daif_mask() instead of local_irq_disable() to make sure
         * that pseudo-NMIs are disabled. The "crash stop" code starts with
         * an IRQ and falls back to NMI (which might be pseudo). If the IRQ
         * finally goes through right as we're timing out then the NMI could
         * interrupt us. It's better to prevent the NMI and let the IRQ
         * finish since the pt_regs will be better.
         */
        local_daif_mask();

        crash_save_cpu(regs, cpu);

        set_cpu_online(cpu, false);

        sdei_mask_local_cpu();

        if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
                __cpu_try_die(cpu);

        /* just in case */
        cpu_park_loop();
#else
        BUG();
#endif
}

/*
 * Send IPI_CPU_BACKTRACE to every CPU in @mask. Passed as the "raise"
 * callback to nmi_trigger_cpumask_backtrace().
 */
static void arm64_backtrace_ipi(cpumask_t *mask)
{
        __ipi_send_mask(ipi_desc[IPI_CPU_BACKTRACE], mask);
}

/*
 * Request a backtrace from the CPUs in @mask by handing the request to
 * nmi_trigger_cpumask_backtrace() with arm64_backtrace_ipi() as the raise
 * callback.
 *
 * @mask:        CPUs to backtrace.
 * @exclude_cpu: forwarded unchanged to nmi_trigger_cpumask_backtrace().
 */
void arch_trigger_cpumask_backtrace(const cpumask_t *mask, int exclude_cpu)
{
        /*
         * NOTE: though nmi_trigger_cpumask_backtrace() has "nmi_" in the name,
         * nothing about it truly needs to be implemented using an NMI, it's
         * just that it's _allowed_ to work with NMIs. If ipi_should_be_nmi()
         * returned false our backtrace attempt will just use a regular IPI.
         */
        nmi_trigger_cpumask_backtrace(mask, exclude_cpu, arm64_backtrace_ipi);
}

#ifdef CONFIG_KGDB
/*
 * Send IPI_KGDB_ROUNDUP to every online CPU except the calling one.
 */
void kgdb_roundup_cpus(void)
{
        int self = raw_smp_processor_id();
        int cpu;

        for_each_online_cpu(cpu) {
                /* The caller does not need to round itself up. */
                if (cpu != self)
                        __ipi_send_single(ipi_desc[IPI_KGDB_ROUNDUP], cpu);
        }
}
#endif

/*
 * Main handler for inter-processor interrupts
 *
 * @ipinr: IPI number (enum ipi_msg_type), as computed by ipi_handler().
 *
 * Only IPIs below NR_IPI are bracketed by the ipi_entry/ipi_exit
 * tracepoints. The stop IPIs never return from here. IPI_TIMER and
 * IPI_IRQ_WORK are handled only when their config option is enabled;
 * otherwise they fall through to the "unknown IPI" message.
 */
static void do_handle_IPI(int ipinr)
{
        unsigned int cpu = smp_processor_id();

        if ((unsigned)ipinr < NR_IPI)
                trace_ipi_entry(ipi_types[ipinr]);

        switch (ipinr) {
        case IPI_RESCHEDULE:
                scheduler_ipi();
                break;

        case IPI_CALL_FUNC:
                generic_smp_call_function_interrupt();
                break;

        case IPI_CPU_STOP:
        case IPI_CPU_STOP_NMI:
                /* Crash path when crash_stop is set, plain stop otherwise. */
                if (IS_ENABLED(CONFIG_KEXEC_CORE) && crash_stop) {
                        ipi_cpu_crash_stop(cpu, get_irq_regs());
                        unreachable();
                } else {
                        local_cpu_stop(cpu);
                }
                break;

#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
        case IPI_TIMER:
                tick_receive_broadcast();
                break;
#endif

#ifdef CONFIG_IRQ_WORK
        case IPI_IRQ_WORK:
                irq_work_run();
                break;
#endif

        case IPI_CPU_BACKTRACE:
                /*
                 * NOTE: in some cases this _won't_ be NMI context. See the
                 * comment in arch_trigger_cpumask_backtrace().
                 */
                nmi_cpu_backtrace(get_irq_regs());
                break;

        case IPI_KGDB_ROUNDUP:
                kgdb_nmicallback(cpu, get_irq_regs());
                break;

        default:
                pr_crit("CPU%u: Unknown IPI message 0x%x\n", cpu, ipinr);
                break;
        }

        if ((unsigned)ipinr < NR_IPI)
                trace_ipi_exit(ipi_types[ipinr]);
}

/*
 * Interrupt handler registered for every IPI in set_smp_ipi_range().
 * Converts the IRQ number into an IPI number relative to ipi_irq_base and
 * hands it to do_handle_IPI(). @data is not used. Always returns
 * IRQ_HANDLED.
 */
static irqreturn_t ipi_handler(int irq, void *data)
{
        int ipinr = irq - ipi_irq_base;

        do_handle_IPI(ipinr);

        return IRQ_HANDLED;
}

/*
 * Send IPI number @ipinr to all CPUs in @target, emitting the ipi_raise
 * tracepoint first. @ipinr indexes ipi_types[] and ipi_desc[] directly and
 * is not range-checked here.
 */
static void smp_cross_call(const struct cpumask *target, unsigned int ipinr)
{
        trace_ipi_raise(target, ipi_types[ipinr]);
        __ipi_send_mask(ipi_desc[ipinr], target);
}

/*
 * Decide whether IPI @ipi is to be set up and delivered as an NMI.
 *
 * Returns false for every IPI when system_uses_irq_prio_masking() is false.
 * Otherwise returns true only for IPI_CPU_STOP_NMI, IPI_CPU_BACKTRACE and
 * IPI_KGDB_ROUNDUP.
 */
static bool ipi_should_be_nmi(enum ipi_msg_type ipi)
{
        if (!system_uses_irq_prio_masking())
                return false;

        return ipi == IPI_CPU_STOP_NMI ||
               ipi == IPI_CPU_BACKTRACE ||
               ipi == IPI_KGDB_ROUNDUP;
}

/*
 * Enable all nr_ipi IPI interrupts, starting at ipi_irq_base. IPIs selected
 * by ipi_should_be_nmi() are prepared and enabled as per-CPU NMIs, the rest
 * as per-CPU IRQs. Warns once and does nothing if ipi_irq_base has not been
 * set yet. @cpu is not referenced by the body.
 */
static void ipi_setup(int cpu)
{
        int idx;

        if (WARN_ON_ONCE(!ipi_irq_base))
                return;

        for (idx = 0; idx < nr_ipi; idx++) {
                int irq = ipi_irq_base + idx;

                if (!ipi_should_be_nmi(idx)) {
                        enable_percpu_irq(irq, 0);
                        continue;
                }

                prepare_percpu_nmi(irq);
                enable_percpu_nmi(irq, 0);
        }
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Counterpart of ipi_setup(): disable all nr_ipi IPI interrupts. IPIs
 * selected by ipi_should_be_nmi() are disabled and torn down as per-CPU
 * NMIs, the rest are disabled as per-CPU IRQs. Warns once and does nothing
 * if ipi_irq_base has not been set. @cpu is not referenced by the body.
 */
static void ipi_teardown(int cpu)
{
        int idx;

        if (WARN_ON_ONCE(!ipi_irq_base))
                return;

        for (idx = 0; idx < nr_ipi; idx++) {
                int irq = ipi_irq_base + idx;

                if (!ipi_should_be_nmi(idx)) {
                        disable_percpu_irq(irq);
                        continue;
                }

                disable_percpu_nmi(irq);
                teardown_percpu_nmi(irq);
        }
}
#endif

/*
 * Register the IPI interrupt range.
 *
 * @ipi_base: first IRQ number of the range
 * @n:        number of IRQs offered by the caller
 *
 * Warns if fewer than MAX_IPI interrupts are offered and uses at most
 * MAX_IPI of them. Each one is requested with ipi_handler() as its handler,
 * as an NMI or a regular per-CPU IRQ depending on ipi_should_be_nmi(); a
 * failed request only produces a warning. The descriptor is cached in
 * ipi_desc[] and the IRQ is flagged IRQ_HIDDEN. Finally the base is
 * recorded in ipi_irq_base and the IPIs are enabled on the calling CPU.
 */
void __init set_smp_ipi_range(int ipi_base, int n)
{
        int idx;

        WARN_ON(n < MAX_IPI);
        nr_ipi = min(n, MAX_IPI);

        for (idx = 0; idx < nr_ipi; idx++) {
                int irq = ipi_base + idx;
                int ret;

                if (ipi_should_be_nmi(idx)) {
                        ret = request_percpu_nmi(irq, ipi_handler,
                                                 "IPI", &irq_stat);
                        WARN(ret, "Could not request IPI %d as NMI, err=%d\n",
                             idx, ret);
                } else {
                        ret = request_percpu_irq(irq, ipi_handler,
                                                 "IPI", &irq_stat);
                        WARN(ret, "Could not request IPI %d as IRQ, err=%d\n",
                             idx, ret);
                }

                ipi_desc[idx] = irq_to_desc(irq);
                irq_set_status_flags(irq, IRQ_HIDDEN);
        }

        ipi_irq_base = ipi_base;

        /* The boot CPU is running this code, so enable its IPIs right away */
        ipi_setup(smp_processor_id());
}

/* Send the IPI_RESCHEDULE interrupt to @cpu. */
void arch_smp_send_reschedule(int cpu)
{
        smp_cross_call(cpumask_of(cpu), IPI_RESCHEDULE);
}

#ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
/* Wake @cpu by sending it a reschedule IPI via smp_send_reschedule(). */
void arch_send_wakeup_ipi(unsigned int cpu)
{
        /*
         * We use a scheduler IPI to wake the CPU as this avoids the need for a
         * dedicated IPI and we can safely handle spurious scheduler IPIs.
         */
        smp_send_reschedule(cpu);
}
#endif

#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
/*
 * Send the IPI_TIMER interrupt to every CPU in @mask. On the receiving
 * side do_handle_IPI() answers it by calling tick_receive_broadcast().
 */
void tick_broadcast(const struct cpumask *mask)
{
        smp_cross_call(mask, IPI_TIMER);
}
#endif

/*
 * Count the online CPUs other than the one running this code. The calling
 * CPU is only subtracted if it is itself marked online, since it may not be
 * fully online and therefore may not be included in num_online_cpus().
 */
static inline unsigned int num_other_online_cpus(void)
{
        unsigned int self_counted = cpu_online(smp_processor_id()) ? 1 : 0;

        return num_online_cpus() - self_counted;
}

/*
 * Stop all other online CPUs.
 *
 * Sends IPI_CPU_STOP to every other online CPU and polls for up to one
 * second. If some are still online and ipi_should_be_nmi(IPI_CPU_STOP_NMI)
 * holds, retries with IPI_CPU_STOP_NMI and polls for a further 10 ms. CPUs
 * still online after that are reported with pr_warn().
 *
 * Only the first caller to get past the stop_in_progress test sends IPIs;
 * any later caller returns immediately, without reaching skip_ipi.
 * sdei_mask_local_cpu() is called on the way out, including when there
 * were no other online CPUs to stop.
 */
void smp_send_stop(void)
{
        static unsigned long stop_in_progress;
        cpumask_t mask;
        unsigned long timeout;

        /*
         * If this cpu is the only one alive at this point in time, online or
         * not, there are no stop messages to be sent around, so just back out.
         */
        if (num_other_online_cpus() == 0)
                goto skip_ipi;

        /* Only proceed if this is the first CPU to reach this code */
        if (test_and_set_bit(0, &stop_in_progress))
                return;

        /*
         * Send an IPI to all currently online CPUs except the CPU running
         * this code.
         *
         * NOTE: we don't do anything here to prevent other CPUs from coming
         * online after we snapshot `cpu_online_mask`. Ideally, the calling code
         * should do something to prevent other CPUs from coming up. This code
         * can be called in the panic path and thus it doesn't seem wise to
         * grab the CPU hotplug mutex ourselves. Worst case:
         * - If a CPU comes online as we're running, we'll likely notice it
         *   during the 1 second wait below and then we'll catch it when we try
         *   with an NMI (assuming NMIs are enabled) since we re-snapshot the
         *   mask before sending an NMI.
         * - If we leave the function and see that CPUs are still online we'll
         *   at least print a warning. Especially without NMIs this function
         *   isn't foolproof anyway so calling code will just have to accept
         *   the fact that there could be cases where a CPU can't be stopped.
         */
        cpumask_copy(&mask, cpu_online_mask);
        cpumask_clear_cpu(smp_processor_id(), &mask);

        if (system_state <= SYSTEM_RUNNING)
                pr_crit("SMP: stopping secondary CPUs\n");

        /*
         * Start with a normal IPI and wait up to one second for other CPUs to
         * stop. We do this first because it gives other processors a chance
         * to exit critical sections / drop locks and makes the rest of the
         * stop process (especially console flush) more robust.
         */
        smp_cross_call(&mask, IPI_CPU_STOP);
        timeout = USEC_PER_SEC;
        while (num_other_online_cpus() && timeout--)
                udelay(1);

        /*
         * If CPUs are still online, try an NMI. There's no excuse for this to
         * be slow, so we only give them an extra 10 ms to respond.
         */
        if (num_other_online_cpus() && ipi_should_be_nmi(IPI_CPU_STOP_NMI)) {
                /* Re-snapshot the online mask so late arrivals are included */
                smp_rmb();
                cpumask_copy(&mask, cpu_online_mask);
                cpumask_clear_cpu(smp_processor_id(), &mask);

                pr_info("SMP: retry stop with NMI for CPUs %*pbl\n",
                        cpumask_pr_args(&mask));

                smp_cross_call(&mask, IPI_CPU_STOP_NMI);
                timeout = USEC_PER_MSEC * 10;
                while (num_other_online_cpus() && timeout--)
                        udelay(1);
        }

        /* Anything still online at this point could not be stopped */
        if (num_other_online_cpus()) {
                smp_rmb();
                cpumask_copy(&mask, cpu_online_mask);
                cpumask_clear_cpu(smp_processor_id(), &mask);

                pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
                        cpumask_pr_args(&mask));
        }

skip_ipi:
        sdei_mask_local_cpu();
}

#ifdef CONFIG_KEXEC_CORE
/*
 * Crash variant of smp_send_stop().
 *
 * The panic path may call this twice, so the work is done only on the
 * first call. The crash_stop flag that guards against the second call is
 * also what do_handle_IPI() tests to tell a plain stop IPI from a
 * "crash stop" IPI, so it must be set before the IPIs are sent.
 */
void crash_smp_send_stop(void)
{
        if (!crash_stop) {
                crash_stop = 1;

                smp_send_stop();

                sdei_handler_abort();
        }
}

/* Returns true if any CPU other than the calling one is still online. */
bool smp_crash_stop_failed(void)
{
        return num_other_online_cpus() > 0;
}
#endif

/*
 * Report whether the cpu_operations of the calling CPU provide a cpu_die
 * callback. Always false when CONFIG_HOTPLUG_CPU is not set.
 */
static bool have_cpu_die(void)
{
#ifdef CONFIG_HOTPLUG_CPU
        const struct cpu_operations *ops;

        /* Whichever CPU we happen to be running on is queried */
        ops = get_cpu_ops(raw_smp_processor_id());

        return ops && ops->cpu_die;
#else
        return false;
#endif
}

/*
 * Returns true if any of the following holds:
 *  - cpus_stuck_in_kernel is non-zero,
 *  - more than one CPU is possible and have_cpu_die() is false,
 *  - is_protected_kvm_enabled() is true.
 */
bool cpus_are_stuck_in_kernel(void)
{
        bool no_die_on_smp = num_possible_cpus() > 1 && !have_cpu_die();

        if (cpus_stuck_in_kernel)
                return true;

        if (no_die_on_smp)
                return true;

        return is_protected_kvm_enabled();
}




























































































































































































































































































































































































































  155 































































































  156 

























































   31 



















































































































































































































































































































































































































  134 













   95 


   72 












  113 



    1 

    5 
   73 
   51 

   83 
   13 












































   85 

  426 





  222 
  221 























   87 



   82 
   81 
















   61 


   58 

















   74 













   61 





   61 







   68 
   61 










   59 
   63 

















































































    6 








  134 















    6 

  120 

  130 























































































































































































































































































































































































































































































































  173 


  150 




















  168 

  134 
  134 















  183 



  168 

  163 














  186 











    5 










































































































































































































































    1 











































































































































   51 












   45 














   25 




  266 
   25 









































































































   41 




















































































  382 














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __KVM_HOST_H
#define __KVM_HOST_H


#include <linux/types.h>
#include <linux/hardirq.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/stat.h>
#include <linux/bug.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/preempt.h>
#include <linux/msi.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/rcupdate.h>
#include <linux/ratelimit.h>
#include <linux/err.h>
#include <linux/irqflags.h>
#include <linux/context_tracking.h>
#include <linux/irqbypass.h>
#include <linux/rcuwait.h>
#include <linux/refcount.h>
#include <linux/nospec.h>
#include <linux/notifier.h>
#include <linux/ftrace.h>
#include <linux/hashtable.h>
#include <linux/instrumentation.h>
#include <linux/interval_tree.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <asm/signal.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>

#include <linux/kvm_types.h>

#include <asm/kvm_host.h>
#include <linux/kvm_dirty_ring.h>

#ifndef KVM_MAX_VCPU_IDS
#define KVM_MAX_VCPU_IDS KVM_MAX_VCPUS
#endif

/*
 * Bits 16 ~ 31 of kvm_userspace_memory_region::flags are used internally
 * by kvm; the other bits are visible to userspace and are defined in
 * <linux/kvm.h>.
 */
#define KVM_MEMSLOT_INVALID        (1UL << 16)

/*
 * Bit 63 of the memslot generation number is an "update in-progress flag",
 * e.g. is temporarily set for the duration of kvm_swap_active_memslots().
 * This flag effectively creates a unique generation number that is used to
 * mark cached memslot data, e.g. MMIO accesses, as potentially being stale,
 * i.e. may (or may not) have come from the previous memslots generation.
 *
 * This is necessary because the actual memslots update is not atomic with
 * respect to the generation number update.  Updating the generation number
 * first would allow a vCPU to cache a spte from the old memslots using the
 * new generation number, and updating the generation number after switching
 * to the new memslots would allow cache hits using the old generation number
 * to reference the defunct memslots.
 *
 * This mechanism is used to prevent getting hits in KVM's caches while a
 * memslot update is in-progress, and to prevent cache hits *after* updating
 * the actual generation number against accesses that were inserted into the
 * cache *before* the memslots were updated.
 */
#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS        BIT_ULL(63)

/* Two fragments for cross MMIO pages. */
#define KVM_MAX_MMIO_FRAGMENTS        2

#ifndef KVM_MAX_NR_ADDRESS_SPACES
#define KVM_MAX_NR_ADDRESS_SPACES        1
#endif

/*
 * For the normal pfn, the highest 12 bits should be zero,
 * so we can mask bit 62 ~ bit 52  to indicate the error pfn,
 * mask bit 63 to indicate the noslot pfn.
 */
/* Bits 52-62 set: mask identifying an error pfn. */
#define KVM_PFN_ERR_MASK        (0x7ffULL << 52)
/* Bits 52-63 set: the error bits plus the noslot bit. */
#define KVM_PFN_ERR_NOSLOT_MASK        (0xfffULL << 52)
/* Bit 63 alone: the noslot pfn. */
#define KVM_PFN_NOSLOT                (0x1ULL << 63)

/*
 * Individual error pfns: every bit of KVM_PFN_ERR_MASK set, distinguished
 * by a small code added in the low bits.
 */
#define KVM_PFN_ERR_FAULT        (KVM_PFN_ERR_MASK)
#define KVM_PFN_ERR_HWPOISON        (KVM_PFN_ERR_MASK + 1)
#define KVM_PFN_ERR_RO_FAULT        (KVM_PFN_ERR_MASK + 2)
#define KVM_PFN_ERR_SIGPENDING        (KVM_PFN_ERR_MASK + 3)
#define KVM_PFN_ERR_NEEDS_IO        (KVM_PFN_ERR_MASK + 4)

/*
 * An error pfn indicates that the gfn is in a slot but could not be
 * translated to a pfn on the host. Returns true if any bit covered by
 * KVM_PFN_ERR_MASK is set in @pfn.
 */
static inline bool is_error_pfn(kvm_pfn_t pfn)
{
        return (pfn & KVM_PFN_ERR_MASK) != 0;
}

/*
 * KVM_PFN_ERR_SIGPENDING indicates that fetching the PFN was interrupted
 * by a pending signal.  Note, the signal may or may not be fatal.
 *
 * Returns true only if @pfn is exactly KVM_PFN_ERR_SIGPENDING.
 */
static inline bool is_sigpending_pfn(kvm_pfn_t pfn)
{
        return pfn == KVM_PFN_ERR_SIGPENDING;
}

/*
 * An error_noslot pfn indicates that the gfn cannot be translated to a
 * pfn, either because it is not in a slot or because the translation
 * failed. Returns true if any bit covered by KVM_PFN_ERR_NOSLOT_MASK is
 * set in @pfn.
 */
static inline bool is_error_noslot_pfn(kvm_pfn_t pfn)
{
        return (pfn & KVM_PFN_ERR_NOSLOT_MASK) != 0;
}

/*
 * A noslot pfn indicates that the gfn is not in a slot.
 * Returns true only if @pfn is exactly KVM_PFN_NOSLOT.
 */
static inline bool is_noslot_pfn(kvm_pfn_t pfn)
{
        return pfn == KVM_PFN_NOSLOT;
}

/*
 * architectures with KVM_HVA_ERR_BAD other than PAGE_OFFSET (e.g. s390)
 * provide own defines and kvm_is_error_hva
 */
#ifndef KVM_HVA_ERR_BAD

#define KVM_HVA_ERR_BAD                (PAGE_OFFSET)
#define KVM_HVA_ERR_RO_BAD        (PAGE_OFFSET + PAGE_SIZE)

/*
 * Default implementation, used when the architecture has not defined
 * KVM_HVA_ERR_BAD: any address at or above PAGE_OFFSET is treated as an
 * error hva.
 */
static inline bool kvm_is_error_hva(unsigned long addr)
{
        return addr >= PAGE_OFFSET;
}

#endif

/* Returns true only if @gpa equals INVALID_GPA. */
static inline bool kvm_is_error_gpa(gpa_t gpa)
{
        return gpa == INVALID_GPA;
}

#define KVM_REQUEST_MASK           GENMASK(7,0)
#define KVM_REQUEST_NO_WAKEUP      BIT(8)
#define KVM_REQUEST_WAIT           BIT(9)
#define KVM_REQUEST_NO_ACTION      BIT(10)
/*
 * Architecture-independent vcpu->requests bit members
 * Bits 4-7 are reserved for more arch-independent bits.
 */
#define KVM_REQ_TLB_FLUSH                (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_VM_DEAD                        (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_UNBLOCK                        2
#define KVM_REQ_DIRTY_RING_SOFT_FULL        3
#define KVM_REQUEST_ARCH_BASE                8

/*
 * KVM_REQ_OUTSIDE_GUEST_MODE exists purely as a way to force the vCPU to
 * OUTSIDE_GUEST_MODE.  KVM_REQ_OUTSIDE_GUEST_MODE differs from a vCPU "kick"
 * in that it ensures the vCPU has reached OUTSIDE_GUEST_MODE before continuing
 * on.  A kick only guarantees that the vCPU is on its way out, e.g. a previous
 * kick may have set vcpu->mode to EXITING_GUEST_MODE, and so there's no
 * guarantee the vCPU received an IPI and has actually exited guest mode.
 */
#define KVM_REQ_OUTSIDE_GUEST_MODE        (KVM_REQUEST_NO_ACTION | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)

/*
 * Build an architecture-specific request value: request number @nr is
 * offset by KVM_REQUEST_ARCH_BASE and OR-ed with @flags. The BUILD_BUG_ON
 * rejects at compile time an @nr that would not fit in the bits of
 * kvm_vcpu::requests that remain above the arch base.
 *
 * KVM_ARCH_REQ(nr) is the same with no flags.
 */
#define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
        BUILD_BUG_ON((unsigned)(nr) >= (sizeof_field(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \
        (unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \
})
#define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
                                 unsigned long *vcpu_bitmap);
bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req);

#define KVM_USERSPACE_IRQ_SOURCE_ID                0
#define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID        1

extern struct mutex kvm_lock;
extern struct list_head vm_list;

/*
 * One address range and the device attached to it; this is the element
 * type of kvm_io_bus::range[].
 * NOTE(review): addr/len presumably give the start and size of the range
 * the device claims on its bus — confirm against kvm_io_bus_register_dev().
 */
struct kvm_io_range {
        gpa_t addr;
        int len;
        struct kvm_io_device *dev;
};

#define NR_IOBUS_DEVS 1000

/*
 * An I/O bus: two counters followed by a flexible array of kvm_io_range
 * entries.
 * NOTE(review): dev_count is presumably the number of valid entries in
 * range[] — confirm against the registration code.
 */
struct kvm_io_bus {
        int dev_count;
        int ioeventfd_count;
        struct kvm_io_range range[];
};

/* Bus identifiers, used as the bus_idx argument of the kvm_io_bus_*() helpers. */
enum kvm_bus {
        KVM_MMIO_BUS,
        KVM_PIO_BUS,
        KVM_VIRTIO_CCW_NOTIFY_BUS,
        KVM_FAST_MMIO_BUS,
        KVM_IOCSR_BUS,
        /* Being the last enumerator, this equals the number of buses above. */
        KVM_NR_BUSES
};

int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val);
int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
                            gpa_t addr, int len, const void *val, long cookie);
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val);
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev);
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr);

#ifdef CONFIG_KVM_ASYNC_PF
/*
 * State kept for one async page fault request; only built when
 * CONFIG_KVM_ASYNC_PF is enabled.
 * NOTE(review): the roles of the individual fields are not visible from
 * this header. Presumably cr2_or_gpa, addr and arch carry the matching
 * arguments of kvm_setup_async_pf() — confirm against its implementation.
 */
struct kvm_async_pf {
        struct work_struct work;
        struct list_head link;
        struct list_head queue;
        struct kvm_vcpu *vcpu;
        gpa_t cr2_or_gpa;
        unsigned long addr;
        struct kvm_arch_async_pf arch;
        bool   wakeup_all;
        bool notpresent_injected;
};

void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
                        unsigned long hva, struct kvm_arch_async_pf *arch);
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
/* Extra argument carried in kvm_gfn_range::arg; currently one member. */
union kvm_mmu_notifier_arg {
        unsigned long attributes;
};

/*
 * Values for kvm_gfn_range::attr_filter. Each value is a distinct bit.
 * NOTE(review): presumably the two may be OR-ed together — confirm.
 */
enum kvm_gfn_range_filter {
        KVM_FILTER_SHARED                = BIT(0),
        KVM_FILTER_PRIVATE                = BIT(1),
};

/*
 * Describes a range of guest frame numbers within a memslot; this is the
 * argument type of kvm_unmap_gfn_range(), kvm_age_gfn() and
 * kvm_test_age_gfn().
 * NOTE(review): whether @end is inclusive or exclusive is not visible
 * here — confirm against the callers.
 */
struct kvm_gfn_range {
        struct kvm_memory_slot *slot;
        gfn_t start;
        gfn_t end;
        union kvm_mmu_notifier_arg arg;
        enum kvm_gfn_range_filter attr_filter;
        bool may_block;
        bool lockless;
};
bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range);
#endif

/*
 * Values held in kvm_vcpu.mode.  kvm_vcpu_exiting_guest_mode() uses
 * cmpxchg() to move a vCPU from IN_GUEST_MODE to EXITING_GUEST_MODE.
 */
enum {
        OUTSIDE_GUEST_MODE,
        IN_GUEST_MODE,
        EXITING_GUEST_MODE,
        READING_SHADOW_PAGE_TABLES,
};

struct kvm_host_map {
        /*
         * Only valid if the 'pfn' is managed by the host kernel (i.e. There is
         * a 'struct page' for it. When using mem= kernel parameter some memory
         * can be used as guest memory but they are not managed by host
         * kernel).
         */
        struct page *pinned_page;
        struct page *page;
        void *hva;
        kvm_pfn_t pfn;
        kvm_pfn_t gfn;
        bool writable;
};

/*
 * Tell whether @map currently describes a valid mapping.  Always go through
 * this helper instead of peeking into 'struct kvm_host_map' directly.
 */
static inline bool kvm_vcpu_mapped(struct kvm_host_map *map)
{
        /* A mapping is considered valid exactly when it has a host address. */
        return map->hva ? true : false;
}

/*
 * Polling may go on only while single_task_running() holds, no reschedule
 * is pending and @cur is still before @stop.
 */
static inline bool kvm_vcpu_can_poll(ktime_t cur, ktime_t stop)
{
        if (!single_task_running())
                return false;

        if (need_resched())
                return false;

        return ktime_before(cur, stop);
}

/*
 * Sometimes a large or cross-page mmio needs to be broken up into separate
 * exits for userspace servicing.  One fragment describes one such piece;
 * struct kvm_vcpu keeps an array of them (mmio_fragments[]).
 */
struct kvm_mmio_fragment {
        gpa_t gpa;      /* guest physical address of this piece */
        void *data;     /* buffer for this piece */
        unsigned len;   /* NOTE(review): presumably length in bytes -- confirm */
};

/*
 * Architecture-independent per-vCPU state.  Architecture-specific state is
 * embedded in the 'arch' member.
 */
struct kvm_vcpu {
        struct kvm *kvm;        /* VM this vCPU belongs to */
#ifdef CONFIG_PREEMPT_NOTIFIERS
        struct preempt_notifier preempt_notifier;
#endif
        int cpu;
        int vcpu_id; /* id given by userspace at creation */
        int vcpu_idx; /* index into kvm->vcpu_array */
        /* Written by kvm_vcpu_srcu_read_lock(), consumed by the unlock. */
        int ____srcu_idx; /* Don't use this directly.  You've been warned. */
#ifdef CONFIG_PROVE_RCU
        /* Nesting counter checked by kvm_vcpu_srcu_read_{lock,unlock}(). */
        int srcu_depth;
#endif
        int mode;       /* OUTSIDE_GUEST_MODE, IN_GUEST_MODE, ... */
        u64 requests;
        unsigned long guest_debug;

        struct mutex mutex;
        struct kvm_run *run;

#ifndef __KVM_HAVE_ARCH_WQP
        struct rcuwait wait;
#endif
        struct pid *pid;
        rwlock_t pid_lock;
        int sigset_active;
        sigset_t sigset;
        unsigned int halt_poll_ns;
        bool valid_wakeup;

#ifdef CONFIG_HAS_IOMEM
        /* State of an MMIO access split into fragments, see kvm_mmio_fragment. */
        int mmio_needed;
        int mmio_read_completed;
        int mmio_is_write;
        int mmio_cur_fragment;
        int mmio_nr_fragments;
        struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
#endif

#ifdef CONFIG_KVM_ASYNC_PF
        /* Per-vCPU asynchronous page fault bookkeeping. */
        struct {
                u32 queued;
                struct list_head queue;
                struct list_head done;
                spinlock_t lock;
        } async_pf;
#endif

#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
        /*
         * Cpu relax intercept or pause loop exit optimization
         * in_spin_loop: set when a vcpu does a pause loop exit
         *  or cpu relax intercepted.
         * dy_eligible: indicates whether vcpu is eligible for directed yield.
         */
        struct {
                bool in_spin_loop;
                bool dy_eligible;
        } spin_loop;
#endif
        bool wants_to_run;
        bool preempted;
        bool ready;
        bool scheduled_out;
        struct kvm_vcpu_arch arch;      /* architecture-specific vCPU state */
        struct kvm_vcpu_stat stat;
        char stats_id[KVM_STATS_NAME_SIZE];
        struct kvm_dirty_ring dirty_ring;

        /*
         * The most recently used memslot by this vCPU and the slots generation
         * for which it is valid.
         * No wraparound protection is needed since generations won't overflow in
         * thousands of years, even assuming 1M memslot operations per second.
         */
        struct kvm_memory_slot *last_used_slot;
        u64 last_used_slot_gen;
};

/*
 * Start accounting time towards a guest.
 * Must be called before entering guest context.
 */
static __always_inline void guest_timing_enter_irqoff(void)
{
        /*
         * This is running in ioctl context so it's safe to assume that it's
         * the stime pending cputime to flush.
         */
        instrumentation_begin();
        vtime_account_guest_enter();
        instrumentation_end();
}

/*
 * Switch into guest context, which is also an RCU extended quiescent state.
 *
 * From guest_context_enter_irqoff() until guest_context_exit_irqoff() no code
 * may directly or indirectly use RCU, tracing (including IRQ flag tracing) or
 * lockdep; everything executed in that window must be non-instrumentable.
 */
static __always_inline void guest_context_enter_irqoff(void)
{
        /*
         * KVM holds no references to RCU protected data while the CPU runs
         * in guest mode.  From RCU's point of view entering a guest closely
         * resembles returning to userspace, and a CPU may remain in guest
         * mode for a long time (up to one time slice).  Guest mode is
         * therefore treated as a quiescent state, the same way user-mode
         * execution is.
         */
        if (context_tracking_guest_enter())
                return;

        /* Context tracking reported false: note the switch to RCU here. */
        instrumentation_begin();
        rcu_virt_note_context_switch();
        instrumentation_end();
}

/*
 * Deprecated. Architectures should move to guest_timing_enter_irqoff() and
 * guest_state_enter_irqoff().
 *
 * Starts guest time accounting first, then enters guest context.
 */
static __always_inline void guest_enter_irqoff(void)
{
        guest_timing_enter_irqoff();
        guest_context_enter_irqoff();
}

/**
 * guest_state_enter_irqoff - Fixup state when entering a guest
 *
 * Entry to a guest will enable interrupts, but the kernel state is interrupts
 * disabled when this is invoked. Also tell RCU about it.
 *
 * 1) Trace interrupts on state
 * 2) Invoke context tracking if enabled to adjust RCU state
 * 3) Tell lockdep that interrupts are enabled
 *
 * Invoked from architecture specific code before entering a guest.
 * Must be called with interrupts disabled and the caller must be
 * non-instrumentable.
 * The caller has to invoke guest_timing_enter_irqoff() before this.
 *
 * Note: this is analogous to exit_to_user_mode().
 */
static __always_inline void guest_state_enter_irqoff(void)
{
        /* Step 1, plus the lockdep preparation; both may be instrumented. */
        instrumentation_begin();
        trace_hardirqs_on_prepare();
        lockdep_hardirqs_on_prepare();
        instrumentation_end();

        /* Step 2: enter guest context, then step 3: interrupts are "on". */
        guest_context_enter_irqoff();
        lockdep_hardirqs_on(CALLER_ADDR0);
}

/*
 * Leave guest context, which also ends the RCU extended quiescent state.
 *
 * From guest_context_enter_irqoff() until guest_context_exit_irqoff() no code
 * may directly or indirectly use RCU, tracing (including IRQ flag tracing) or
 * lockdep; everything executed in that window must be non-instrumentable.
 */
static __always_inline void guest_context_exit_irqoff(void)
{
        /*
         * Guest mode counts as a quiescent state; the reasoning is given in
         * guest_context_enter_irqoff().
         */
        if (context_tracking_guest_exit())
                return;

        /* Context tracking reported false: note the switch to RCU here. */
        instrumentation_begin();
        rcu_virt_note_context_switch();
        instrumentation_end();
}

/*
 * Stop accounting time towards a guest.
 * Must be called after exiting guest context.
 *
 * Counterpart of guest_timing_enter_irqoff().
 */
static __always_inline void guest_timing_exit_irqoff(void)
{
        instrumentation_begin();
        /* Flush the guest cputime we spent on the guest */
        vtime_account_guest_exit();
        instrumentation_end();
}

/*
 * Deprecated. Architectures should move to guest_state_exit_irqoff() and
 * guest_timing_exit_irqoff().
 *
 * Leaves guest context first, then stops guest time accounting, i.e. the
 * reverse order of guest_enter_irqoff().
 */
static __always_inline void guest_exit_irqoff(void)
{
        guest_context_exit_irqoff();
        guest_timing_exit_irqoff();
}

/*
 * Variant of guest_exit_irqoff() for callers whose interrupts may be
 * enabled: local interrupts are disabled around the call and the previous
 * interrupt state is restored afterwards.
 */
static inline void guest_exit(void)
{
        unsigned long flags;

        local_irq_save(flags);
        guest_exit_irqoff();
        local_irq_restore(flags);
}

/**
 * guest_state_exit_irqoff - Establish state when returning from guest mode
 *
 * Entry from a guest disables interrupts, but guest mode is traced as
 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
 *
 * 1) Tell lockdep that interrupts are disabled
 * 2) Invoke context tracking if enabled to reactivate RCU
 * 3) Trace interrupts off state
 *
 * Invoked from architecture specific code after exiting a guest.
 * Must be invoked with interrupts disabled and the caller must be
 * non-instrumentable.
 * The caller has to invoke guest_timing_exit_irqoff() after this.
 *
 * Note: this is analogous to enter_from_user_mode().
 */
static __always_inline void guest_state_exit_irqoff(void)
{
        /* Steps 1 and 2, in that order. */
        lockdep_hardirqs_off(CALLER_ADDR0);
        guest_context_exit_irqoff();

        /* Step 3: tracing is done last, inside an instrumentable section. */
        instrumentation_begin();
        trace_hardirqs_off_finish();
        instrumentation_end();
}

/*
 * Atomically switch @vcpu from IN_GUEST_MODE to EXITING_GUEST_MODE.
 *
 * Returns the result of cmpxchg(), i.e. the mode the vCPU was in before the
 * call; the mode is only changed if that value was IN_GUEST_MODE.
 */
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
{
        /*
         * The memory barrier ensures a previous write to vcpu->requests cannot
         * be reordered with the read of vcpu->mode.  It pairs with the general
         * memory barrier following the write of vcpu->mode in VCPU RUN.
         */
        smp_mb__before_atomic();
        return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);
}

/*
 * Some of the bitops functions do not support bitmaps that are too long.
 * This number must be chosen so that it does not exceed such limits.
 * The value is 2^31 - 1 pages.
 */
#define KVM_MEM_MAX_NR_PAGES ((1UL << 31) - 1)

/*
 * Since at idle each memslot belongs to two memslot sets it has to contain
 * two embedded nodes for each data structure that it forms a part of.
 *
 * Two memslot sets (one active and one inactive) are necessary so the VM
 * continues to run on one memslot set while the other is being modified.
 *
 * These two memslot sets normally point to the same set of memslots.
 * They can, however, be desynchronized when performing a memslot management
 * operation by replacing the memslot to be modified by its copy.
 * After the operation is complete, both memslot sets once again point to
 * the same, common set of memslot data.
 *
 * The memslots themselves are independent of each other so they can be
 * individually added or deleted.
 */
struct kvm_memory_slot {
        /* The [2] arrays below are indexed by kvm_memslots.node_idx. */
        struct hlist_node id_node[2];           /* link in kvm_memslots.id_hash */
        struct interval_tree_node hva_node[2];
        struct rb_node gfn_node[2];             /* link in kvm_memslots.gfn_tree */
        gfn_t base_gfn;         /* first gfn covered by the slot */
        unsigned long npages;   /* slot covers [base_gfn, base_gfn + npages) */
        /* See kvm_second_dirty_bitmap() for the area following the bitmap. */
        unsigned long *dirty_bitmap;
        struct kvm_arch_memory_slot arch;
        unsigned long userspace_addr;
        u32 flags;              /* KVM_MEM_* bits, e.g. KVM_MEM_LOG_DIRTY_PAGES */
        short id;
        u16 as_id;

#ifdef CONFIG_KVM_PRIVATE_MEM
        struct {
                /*
                 * Writes protected by kvm->slots_lock.  Acquiring a
                 * reference via kvm_gmem_get_file() is protected by
                 * either kvm->slots_lock or kvm->srcu.
                 */
                struct file *file;
                pgoff_t pgoff;
        } gmem;
#endif
};

/*
 * A slot can be private when it exists and carries KVM_MEM_GUEST_MEMFD.
 * A NULL @slot yields false.
 */
static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
{
        if (!slot)
                return false;

        return (slot->flags & KVM_MEM_GUEST_MEMFD) != 0;
}

/* True when @slot has KVM_MEM_LOG_DIRTY_PAGES set; @slot must not be NULL. */
static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
{
        return (slot->flags & KVM_MEM_LOG_DIRTY_PAGES) != 0;
}

/*
 * Size in bytes of one dirty bitmap for @memslot: one bit per page, with the
 * bit count rounded up to a whole number of longs.
 */
static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
{
        unsigned long nr_bits = ALIGN(memslot->npages, BITS_PER_LONG);

        return nr_bits / 8;
}

/*
 * Address of the area located kvm_dirty_bitmap_bytes() bytes past the start
 * of @memslot's dirty bitmap, i.e. directly behind the first bitmap.
 */
static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot)
{
        unsigned long nr_longs;

        nr_longs = kvm_dirty_bitmap_bytes(memslot) / sizeof(*memslot->dirty_bitmap);

        return &memslot->dirty_bitmap[nr_longs];
}

/*
 * Default for KVM_DIRTY_LOG_MANUAL_CAPS, used only when nothing earlier
 * (e.g. architecture code) has defined it.
 */
#ifndef KVM_DIRTY_LOG_MANUAL_CAPS
#define KVM_DIRTY_LOG_MANUAL_CAPS KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE
#endif

/*
 * Payload of the 'adapter' member of the union in
 * struct kvm_kernel_irq_routing_entry.
 * NOTE(review): field meanings are s390 specific and not visible here.
 */
struct kvm_s390_adapter_int {
        u64 ind_addr;
        u64 summary_addr;
        u64 ind_offset;
        u32 summary_offset;
        u32 adapter_id;
};

/*
 * Payload of the 'hv_sint' member of the union in
 * struct kvm_kernel_irq_routing_entry.
 */
struct kvm_hv_sint {
        u32 vcpu;
        u32 sint;
};

/*
 * Payload of the 'xen_evtchn' member of the union in
 * struct kvm_kernel_irq_routing_entry.
 */
struct kvm_xen_evtchn {
        u32 port;
        u32 vcpu_id;
        int vcpu_idx;
        u32 priority;
};

/*
 * One in-kernel interrupt routing entry.  The anonymous union holds the
 * target description.
 * NOTE(review): which union member is valid is presumably selected by
 * 'type' -- confirm against the routing setup code.
 */
struct kvm_kernel_irq_routing_entry {
        u32 gsi;
        u32 type;
        /* Callback invoked with this entry to deliver the interrupt. */
        int (*set)(struct kvm_kernel_irq_routing_entry *e,
                   struct kvm *kvm, int irq_source_id, int level,
                   bool line_status);
        union {
                struct {
                        unsigned irqchip;
                        unsigned pin;
                } irqchip;
                struct {
                        u32 address_lo;
                        u32 address_hi;
                        u32 data;
                        u32 flags;
                        u32 devid;
                } msi;
                struct kvm_s390_adapter_int adapter;
                struct kvm_hv_sint hv_sint;
                struct kvm_xen_evtchn xen_evtchn;
        };
        struct hlist_node link; /* list linkage, cf. kvm_irq_routing_table.map[] */
};

#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
/*
 * Interrupt routing table.  'map' is a flexible array with nr_rt_entries
 * elements (annotated via __counted_by).
 */
struct kvm_irq_routing_table {
        /* Indexed by [irqchip][pin]. */
        int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
        u32 nr_rt_entries;
        /*
         * Array indexed by gsi. Each entry contains list of irq chips
         * the gsi is connected to.
         */
        struct hlist_head map[] __counted_by(nr_rt_entries);
};
#endif

bool kvm_arch_irqchip_in_kernel(struct kvm *kvm);

/* Number of memslots reserved for internal use; defaults to none. */
#ifndef KVM_INTERNAL_MEM_SLOTS
#define KVM_INTERNAL_MEM_SLOTS 0
#endif

/*
 * Total number of memslots; SHRT_MAX matches the 'short id' member of
 * struct kvm_memory_slot.  The user-visible count is the total minus the
 * internal slots.
 */
#define KVM_MEM_SLOTS_NUM SHRT_MAX
#define KVM_USER_MEM_SLOTS (KVM_MEM_SLOTS_NUM - KVM_INTERNAL_MEM_SLOTS)

#if KVM_MAX_NR_ADDRESS_SPACES == 1
/*
 * Default used when KVM_MAX_NR_ADDRESS_SPACES == 1: the number of memslot
 * address spaces is that constant; @kvm is not consulted.
 */
static inline int kvm_arch_nr_memslot_as_ids(struct kvm *kvm)
{
        return KVM_MAX_NR_ADDRESS_SPACES;
}

/*
 * Default used when KVM_MAX_NR_ADDRESS_SPACES == 1: every vCPU uses
 * address space 0; @vcpu is not consulted.
 */
static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
{
        return 0;
}
#endif

/*
 * Arch code must define kvm_arch_has_private_mem if support for private memory
 * is enabled.
 */
#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
/*
 * Fallback compiled only when the architecture provides no
 * kvm_arch_has_private_mem and CONFIG_KVM_PRIVATE_MEM is disabled:
 * no VM has private memory.
 */
static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
{
        return false;
}
#endif

#ifndef kvm_arch_has_readonly_mem
/*
 * Fallback used unless the architecture defines kvm_arch_has_readonly_mem:
 * the answer depends only on CONFIG_HAVE_KVM_READONLY_MEM, not on @kvm.
 */
static inline bool kvm_arch_has_readonly_mem(struct kvm *kvm)
{
        return IS_ENABLED(CONFIG_HAVE_KVM_READONLY_MEM);
}
#endif

/*
 * One set of memslots.  struct kvm embeds two sets per address space
 * (active and inactive).
 */
struct kvm_memslots {
        u64 generation;
        atomic_long_t last_used_slot;
        struct rb_root_cached hva_tree;
        struct rb_root gfn_tree;        /* slots linked through gfn_node[node_idx] */
        /*
         * The mapping table from slot id to memslot.
         *
         * 7-bit bucket count matches the size of the old id to index array for
         * 512 slots, while giving good performance with this slot count.
         * Higher bucket counts bring only small performance improvements but
         * always result in higher memory usage (even for lower memslot counts).
         */
        DECLARE_HASHTABLE(id_hash, 7);
        /*
         * Which of the two embedded nodes of each struct kvm_memory_slot
         * (id_node[], gfn_node[], ...) belongs to this set.
         */
        int node_idx;
};

/*
 * Architecture-independent per-VM state.  Architecture-specific state is
 * embedded in the 'arch' member.
 */
struct kvm {
#ifdef KVM_HAVE_MMU_RWLOCK
        rwlock_t mmu_lock;
#else
        spinlock_t mmu_lock;
#endif /* KVM_HAVE_MMU_RWLOCK */

        struct mutex slots_lock;

        /*
         * Protects the arch-specific fields of struct kvm_memory_slot in
         * use by the VM. To be used under the slots_lock (above) or in a
         * kvm->srcu critical section where acquiring the slots_lock would
         * lead to deadlock with the synchronize_srcu in
         * kvm_swap_active_memslots().
         */
        struct mutex slots_arch_lock;
        struct mm_struct *mm; /* userspace tied to this vm */
        unsigned long nr_memslot_pages;
        /* The two memslot sets - active and inactive (per address space) */
        struct kvm_memslots __memslots[KVM_MAX_NR_ADDRESS_SPACES][2];
        /* The current active memslot set for each address space */
        struct kvm_memslots __rcu *memslots[KVM_MAX_NR_ADDRESS_SPACES];
        /* vCPUs, indexed by kvm_vcpu.vcpu_idx; see kvm_get_vcpu(). */
        struct xarray vcpu_array;
        /*
         * Protected by slots_lock, but can be read outside if an
         * incorrect answer is acceptable.
         */
        atomic_t nr_memslots_dirty_logging;

        /* Used to wait for completion of MMU notifiers.  */
        spinlock_t mn_invalidate_lock;
        unsigned long mn_active_invalidate_count;
        struct rcuwait mn_memslots_update_rcuwait;

        /* For management / invalidation of gfn_to_pfn_caches */
        spinlock_t gpc_lock;
        struct list_head gpc_list;

        /*
         * created_vcpus is protected by kvm->lock, and is incremented
         * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
         * incremented after storing the kvm_vcpu pointer in vcpu_array,
         * and is accessed atomically.
         */
        atomic_t online_vcpus;
        int max_vcpus;
        int created_vcpus;
        int last_boosted_vcpu;
        struct list_head vm_list;
        struct mutex lock;
        /* One bus per enum kvm_bus value; read via kvm_get_bus(). */
        struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        struct {
                spinlock_t        lock;
                struct list_head  items;
                /* resampler_list update side is protected by resampler_lock. */
                struct list_head  resampler_list;
                struct mutex      resampler_lock;
        } irqfds;
#endif
        struct list_head ioeventfds;
        struct kvm_vm_stat stat;
        struct kvm_arch arch;   /* architecture-specific VM state */
        refcount_t users_count;
#ifdef CONFIG_KVM_MMIO
        struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
        spinlock_t ring_lock;
        struct list_head coalesced_zones;
#endif

        struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        /*
         * Update side is protected by irq_lock.
         */
        struct kvm_irq_routing_table __rcu *irq_routing;

        struct hlist_head irq_ack_notifier_list;
#endif

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
        struct mmu_notifier mmu_notifier;
        unsigned long mmu_invalidate_seq;
        long mmu_invalidate_in_progress;
        gfn_t mmu_invalidate_range_start;
        gfn_t mmu_invalidate_range_end;
#endif
        struct list_head devices;
        /* Tested against KVM_DIRTY_LOG_INITIALLY_SET by the helper below. */
        u64 manual_dirty_log_protect;
        struct dentry *debugfs_dentry;
        struct kvm_stat_data **debugfs_stat_data;
        struct srcu_struct srcu;
        struct srcu_struct irq_srcu;
        pid_t userspace_pid;
        bool override_halt_poll_ns;
        unsigned int max_halt_poll_ns;
        u32 dirty_ring_size;
        bool dirty_ring_with_bitmap;
        bool vm_bugged; /* set by kvm_vm_bugged() */
        bool vm_dead;   /* set by kvm_vm_dead() */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
        struct notifier_block pm_notifier;
#endif
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        /* Protected by slots_lock (for writes) and RCU (for reads) */
        struct xarray mem_attr_array;
#endif
        char stats_id[KVM_STATS_NAME_SIZE];
};

/*
 * Logging helpers that prefix every message with "kvm [<id>]: ".
 * The id is task_pid_nr(current), except for kvm_pr_unimpl(), which prints
 * task_tgid_nr(current) and is rate limited.
 */
#define kvm_err(fmt, ...) \
        pr_err("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
#define kvm_info(fmt, ...) \
        pr_info("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
#define kvm_debug(fmt, ...) \
        pr_debug("kvm [%i]: " fmt, task_pid_nr(current), ## __VA_ARGS__)
#define kvm_debug_ratelimited(fmt, ...) \
        pr_debug_ratelimited("kvm [%i]: " fmt, task_pid_nr(current), \
                             ## __VA_ARGS__)
#define kvm_pr_unimpl(fmt, ...) \
        pr_err_ratelimited("kvm [%i]: " fmt, \
                           task_tgid_nr(current), ## __VA_ARGS__)

/*
 * Per-vCPU logging helpers built on the kvm_*() macros; each one adds a
 * "vcpu<id> " prefix taken from (vcpu)->vcpu_id.
 */

/*
 * The guest did something we don't support.  Also prints the value of
 * kvm_rip_read(vcpu) as the guest rIP.
 */
#define vcpu_unimpl(vcpu, fmt, ...)                                        \
        kvm_pr_unimpl("vcpu%i, guest rIP: 0x%lx " fmt,                        \
                        (vcpu)->vcpu_id, kvm_rip_read(vcpu), ## __VA_ARGS__)

#define vcpu_debug(vcpu, fmt, ...)                                        \
        kvm_debug("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
#define vcpu_debug_ratelimited(vcpu, fmt, ...)                                \
        kvm_debug_ratelimited("vcpu%i " fmt, (vcpu)->vcpu_id,           \
                              ## __VA_ARGS__)
#define vcpu_err(vcpu, fmt, ...)                                        \
        kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)

/*
 * Flag @kvm as dead and issue KVM_REQ_VM_DEAD through
 * kvm_make_all_cpus_request().
 */
static inline void kvm_vm_dead(struct kvm *kvm)
{
        kvm->vm_dead = true;
        kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD);
}

/*
 * Flag @kvm as bugged and, as a consequence, as dead (see kvm_vm_dead()).
 * Used by the KVM_BUG*() macros.
 */
static inline void kvm_vm_bugged(struct kvm *kvm)
{
        kvm->vm_bugged = true;
        kvm_vm_dead(kvm);
}


/*
 * If @cond is true and @kvm is not yet flagged vm_bugged, warn with the
 * given format (WARN_ONCE) and call kvm_vm_bugged().  The expression
 * evaluates to unlikely(!!(cond)) so it can be used inside an if ().
 * @cond is evaluated once; @kvm may be evaluated more than once.
 */
#define KVM_BUG(cond, kvm, fmt...)                                \
({                                                                \
        bool __ret = !!(cond);                                        \
                                                                \
        if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt))                \
                kvm_vm_bugged(kvm);                                \
        unlikely(__ret);                                        \
})

/*
 * Like KVM_BUG() but without a custom message (WARN_ON_ONCE).  Evaluates to
 * unlikely(!!(cond)).  @cond is evaluated once; @kvm may be evaluated more
 * than once.
 */
#define KVM_BUG_ON(cond, kvm)                                        \
({                                                                \
        bool __ret = !!(cond);                                        \
                                                                \
        if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))                \
                kvm_vm_bugged(kvm);                                \
        unlikely(__ret);                                        \
})

/*
 * Note, "data corruption" refers to corruption of host kernel data structures,
 * not guest data.  Guest data corruption, suspected or confirmed, that is tied
 * and contained to a single VM should *never* BUG() and potentially panic the
 * host, i.e. use this variant of KVM_BUG() if and only if a KVM data structure
 * is corrupted and that corruption can have a cascading effect to other parts
 * of the hosts and/or to other VMs.
 *
 * With CONFIG_BUG_ON_DATA_CORRUPTION enabled this is BUG_ON(cond); otherwise
 * it behaves like KVM_BUG_ON().  Evaluates to unlikely(!!(cond)).
 */
#define KVM_BUG_ON_DATA_CORRUPTION(cond, kvm)                        \
({                                                                \
        bool __ret = !!(cond);                                        \
                                                                \
        if (IS_ENABLED(CONFIG_BUG_ON_DATA_CORRUPTION))                \
                BUG_ON(__ret);                                        \
        else if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged))        \
                kvm_vm_bugged(kvm);                                \
        unlikely(__ret);                                        \
})

/*
 * Enter an SRCU read-side critical section on vcpu->kvm->srcu and remember
 * the returned index in vcpu->____srcu_idx for the matching
 * kvm_vcpu_srcu_read_unlock().
 *
 * With CONFIG_PROVE_RCU a warning is emitted if the vCPU already holds the
 * lock, i.e. nesting is not allowed.
 */
static inline void kvm_vcpu_srcu_read_lock(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_PROVE_RCU
        /* Post-increment: warn when the depth was already non-zero. */
        WARN_ONCE(vcpu->srcu_depth++,
                  "KVM: Illegal vCPU srcu_idx LOCK, depth=%d", vcpu->srcu_depth - 1);
#endif
        vcpu->____srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
}

/*
 * Leave the SRCU read-side critical section entered by
 * kvm_vcpu_srcu_read_lock(), using the index stored in vcpu->____srcu_idx.
 *
 * With CONFIG_PROVE_RCU a warning is emitted if the depth counter does not
 * drop back to zero.
 */
static inline void kvm_vcpu_srcu_read_unlock(struct kvm_vcpu *vcpu)
{
        srcu_read_unlock(&vcpu->kvm->srcu, vcpu->____srcu_idx);

#ifdef CONFIG_PROVE_RCU
        /* Pre-decrement: warn when the resulting depth is non-zero. */
        WARN_ONCE(--vcpu->srcu_depth,
                  "KVM: Illegal vCPU srcu_idx UNLOCK, depth=%d", vcpu->srcu_depth);
#endif
}

static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm)
{
        return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET);
}

/*
 * Dereference kvm->buses[@idx] under kvm->srcu.
 *
 * The check passed to srcu_dereference_check() additionally accepts callers
 * that hold kvm->slots_lock, or a VM whose users_count reads as zero.
 */
static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
{
        return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
                                      lockdep_is_held(&kvm->slots_lock) ||
                                      !refcount_read(&kvm->users_count));
}

/*
 * Look up the vCPU stored at index @i of kvm->vcpu_array.
 *
 * Returns NULL when @i is negative or not below the number of online vCPUs;
 * otherwise returns whatever xa_load() finds at that index.
 */
static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
{
        int num_vcpus = atomic_read(&kvm->online_vcpus);

        /*
         * Explicitly verify the target vCPU is online, as the anti-speculation
         * logic only limits the CPU's ability to speculate, e.g. given a "bad"
         * index, clamping the index to 0 would return vCPU0, not NULL.
         *
         * Negative indices are "bad" as well: array_index_nospec() treats the
         * index as unsigned and would clamp them to 0 too.
         */
        if (i < 0 || i >= num_vcpus)
                return NULL;

        i = array_index_nospec(i, num_vcpus);

        /* Pairs with smp_wmb() in kvm_vm_ioctl_create_vcpu.  */
        smp_rmb();
        return xa_load(&kvm->vcpu_array, i);
}

/*
 * Iterate over the online vCPUs of @kvm: @idx receives the xarray index and
 * @vcpup the vCPU pointer.  Nothing is visited when online_vcpus reads as 0.
 *
 * @kvm is evaluated more than once; it is parenthesized so that any pointer
 * expression can be passed.  The expansion starts with a bare 'if', so do
 * not follow a use of this macro with 'else'.
 */
#define kvm_for_each_vcpu(idx, vcpup, kvm)                                \
        if (atomic_read(&(kvm)->online_vcpus))                                \
                xa_for_each_range(&(kvm)->vcpu_array, idx, vcpup, 0,        \
                                  (atomic_read(&(kvm)->online_vcpus) - 1))

static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id)
{
        struct kvm_vcpu *vcpu = NULL;
        unsigned long i;

        if (id < 0)
                return NULL;
        if (id < KVM_MAX_VCPUS)
                vcpu = kvm_get_vcpu(kvm, id);
        if (vcpu && vcpu->vcpu_id == id)
                return vcpu;
        kvm_for_each_vcpu(i, vcpu, kvm)
                if (vcpu->vcpu_id == id)
                        return vcpu;
        return NULL;
}

void kvm_destroy_vcpus(struct kvm *kvm);

void vcpu_load(struct kvm_vcpu *vcpu);
void vcpu_put(struct kvm_vcpu *vcpu);

#ifdef __KVM_HAVE_IOAPIC
void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm);
void kvm_arch_post_irq_routing_update(struct kvm *kvm);
#else
/* No-op stub used when __KVM_HAVE_IOAPIC is not defined. */
static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm)
{
}
/* No-op stub used when __KVM_HAVE_IOAPIC is not defined. */
static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
}
#endif

#ifdef CONFIG_HAVE_KVM_IRQCHIP
int kvm_irqfd_init(void);
void kvm_irqfd_exit(void);
#else
/* Stub used without CONFIG_HAVE_KVM_IRQCHIP: nothing to set up, returns 0. */
static inline int kvm_irqfd_init(void)
{
        return 0;
}

/* Stub used without CONFIG_HAVE_KVM_IRQCHIP: nothing to tear down. */
static inline void kvm_irqfd_exit(void)
{
}
#endif
int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module);
void kvm_exit(void);

void kvm_get_kvm(struct kvm *kvm);
bool kvm_get_kvm_safe(struct kvm *kvm);
void kvm_put_kvm(struct kvm *kvm);
bool file_is_kvm(struct file *file);
void kvm_put_kvm_no_destroy(struct kvm *kvm);

/*
 * Return the currently active memslot set of address space @as_id.
 *
 * The pointer is dereferenced under kvm->srcu; the check additionally
 * accepts callers holding kvm->slots_lock, or a VM whose users_count reads
 * as zero.
 */
static inline struct kvm_memslots *__kvm_memslots(struct kvm *kvm, int as_id)
{
        /* Bound the index under speculation before using it. */
        as_id = array_index_nospec(as_id, KVM_MAX_NR_ADDRESS_SPACES);
        return srcu_dereference_check(kvm->memslots[as_id], &kvm->srcu,
                        lockdep_is_held(&kvm->slots_lock) ||
                        !refcount_read(&kvm->users_count));
}

/* Active memslot set of address space 0; see __kvm_memslots(). */
static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
{
        return __kvm_memslots(kvm, 0);
}

/*
 * Active memslot set of the address space that @vcpu uses, as reported by
 * kvm_arch_vcpu_memslots_id().
 */
static inline struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu)
{
        return __kvm_memslots(vcpu->kvm, kvm_arch_vcpu_memslots_id(vcpu));
}

/* True when @slots holds no memslot, judged by its gfn tree being empty. */
static inline bool kvm_memslots_empty(struct kvm_memslots *slots)
{
        return RB_EMPTY_ROOT(&slots->gfn_tree);
}

bool kvm_are_all_memslots_empty(struct kvm *kvm);

/*
 * Iterate over every memslot in @slots via its id hash table; @bkt is the
 * bucket cursor.  A slot with npages == 0 triggers a one-time warning and
 * is skipped.
 *
 * @slots and @memslot are evaluated more than once; they are parenthesized
 * where they are dereferenced so that any expression can be passed.
 */
#define kvm_for_each_memslot(memslot, bkt, slots)                              \
        hash_for_each((slots)->id_hash, bkt, memslot, id_node[(slots)->node_idx]) \
                if (WARN_ON_ONCE(!(memslot)->npages)) {                              \
                } else

static inline
struct kvm_memory_slot *id_to_memslot(struct kvm_memslots *slots, int id)
{
        struct kvm_memory_slot *slot;
        int idx = slots->node_idx;

        hash_for_each_possible(slots->id_hash, slot, id_node[idx], id) {
                if (slot->id == id)
                        return slot;
        }

        return NULL;
}

/*
 * Iterator used for walking memslots that overlap a gfn range.
 * Driven by kvm_memslot_iter_start(), kvm_memslot_iter_is_valid() and
 * kvm_memslot_iter_next().
 */
struct kvm_memslot_iter {
        struct kvm_memslots *slots;     /* memslot set being walked */
        struct rb_node *node;           /* current gfn_tree node, NULL at the end */
        struct kvm_memory_slot *slot;   /* slot containing 'node'; only updated while node != NULL */
};

/*
 * Advance @iter to the next node of the gfn tree.  When the end is reached
 * iter->node becomes NULL and iter->slot is left untouched.
 */
static inline void kvm_memslot_iter_next(struct kvm_memslot_iter *iter)
{
        struct rb_node *next = rb_next(iter->node);

        iter->node = next;
        if (next)
                iter->slot = container_of(next, struct kvm_memory_slot,
                                          gfn_node[iter->slots->node_idx]);
}

/*
 * Position @iter on the first memslot of @slots that can overlap a gfn range
 * beginning at @start.  iter->node is NULL afterwards if there is none.
 * The caller checks the range end with kvm_memslot_iter_is_valid().
 */
static inline void kvm_memslot_iter_start(struct kvm_memslot_iter *iter,
                                          struct kvm_memslots *slots,
                                          gfn_t start)
{
        int idx = slots->node_idx;
        struct rb_node *tmp;
        struct kvm_memory_slot *slot;

        iter->slots = slots;

        /*
         * Find the so called "upper bound" of a key - the first node that has
         * its key strictly greater than the searched one (the start gfn in our case).
         */
        iter->node = NULL;
        for (tmp = slots->gfn_tree.rb_node; tmp; ) {
                slot = container_of(tmp, struct kvm_memory_slot, gfn_node[idx]);
                if (start < slot->base_gfn) {
                        /* Candidate found; look for a smaller one on the left. */
                        iter->node = tmp;
                        tmp = tmp->rb_left;
                } else {
                        tmp = tmp->rb_right;
                }
        }

        /*
         * Find the slot with the lowest gfn that can possibly intersect with
         * the range, so we'll ideally have slot start <= range start
         */
        if (iter->node) {
                /*
                 * A NULL previous node means that the very first slot
                 * already has a higher start gfn.
                 * In this case slot start > range start.
                 */
                tmp = rb_prev(iter->node);
                if (tmp)
                        iter->node = tmp;
        } else {
                /*
                 * No slot starts above the range start, so take the last
                 * slot.  A NULL result here means there are no slots at all.
                 */
                iter->node = rb_last(&slots->gfn_tree);
        }

        if (iter->node) {
                iter->slot = container_of(iter->node, struct kvm_memory_slot, gfn_node[idx]);

                /*
                 * It is possible in the slot start < range start case that the
                 * found slot ends before or at range start (slot end <= range start)
                 * and so it does not overlap the requested range.
                 *
                 * In such non-overlapping case the next slot (if it exists) will
                 * already have slot start > range start, otherwise the logic above
                 * would have found it instead of the current slot.
                 */
                if (iter->slot->base_gfn + iter->slot->npages <= start)
                        kvm_memslot_iter_next(iter);
        }
}

/*
 * Tell whether @iter still points at a slot that starts below @end.
 *
 * Once a slot starts at or beyond the end of the range, so does every slot
 * after it, which is why the walk can stop there.
 */
static inline bool kvm_memslot_iter_is_valid(struct kvm_memslot_iter *iter, gfn_t end)
{
        return iter->node && iter->slot->base_gfn < end;
}

/*
 * Iterate over each memslot at least partially intersecting [start, end) range.
 * @iter is a pointer to a struct kvm_memslot_iter; the current slot is
 * available as iter->slot inside the loop body.
 */
#define kvm_for_each_memslot_in_gfn_range(iter, slots, start, end)        \
        for (kvm_memslot_iter_start(iter, slots, start);                \
             kvm_memslot_iter_is_valid(iter, end);                        \
             kvm_memslot_iter_next(iter))

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn);
struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu);
struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn);

/*
 * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations:
 * - create a new memory slot
 * - delete an existing memory slot
 * - modify an existing memory slot
 *   -- move it in the guest physical memory space
 *   -- just change its flags
 *
 * Since flags can be changed by some of these operations, the following
 * differentiation is the best we can do for kvm_set_memory_region():
 */
enum kvm_mr_change {
        KVM_MR_CREATE,          /* create a new memory slot */
        KVM_MR_DELETE,          /* delete an existing memory slot */
        KVM_MR_MOVE,            /* move a slot in guest physical memory space */
        KVM_MR_FLAGS_ONLY,      /* only the flags of a slot change */
};

int kvm_set_internal_memslot(struct kvm *kvm,
                             const struct kvm_userspace_memory_region2 *mem);
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot);
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                const struct kvm_memory_slot *old,
                                struct kvm_memory_slot *new,
                                enum kvm_mr_change change);
void kvm_arch_commit_memory_region(struct kvm *kvm,
                                struct kvm_memory_slot *old,
                                const struct kvm_memory_slot *new,
                                enum kvm_mr_change change);
/* flush all memory translations */
void kvm_arch_flush_shadow_all(struct kvm *kvm);
/* flush memory translations pointing to 'slot' */
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot);

int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
                       struct page **pages, int nr_pages);

struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write);
/* Convenience wrapper around __gfn_to_page() that always passes @write = true. */
static inline struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
        const bool write = true;

        return __gfn_to_page(kvm, gfn, write);
}

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable);
unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn);
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, gfn_t gfn,
                                      bool *writable);

/* Drop the reference on @page via put_page(); a NULL @page is a no-op. */
static inline void kvm_release_page_unused(struct page *page)
{
        if (page)
                put_page(page);
}

void kvm_release_page_clean(struct page *page);
void kvm_release_page_dirty(struct page *page);

/*
 * Release @page using the helper that matches how the page ended up being
 * used: the "unused" variant if @unused, the "dirty" variant if @dirty, and
 * the "clean" variant otherwise.  A NULL @page is a no-op.
 *
 * Unless @unused is set, kvm->mmu_lock is asserted (once) to be held.
 */
static inline void kvm_release_faultin_page(struct kvm *kvm, struct page *page,
                                            bool unused, bool dirty)
{
        lockdep_assert_once(lockdep_is_held(&kvm->mmu_lock) || unused);

        if (!page)
                return;

        if (unused) {
                kvm_release_page_unused(page);
                return;
        }

        /*
         * When the page KVM got from the *primary MMU* is writable and KVM
         * installed or reused a SPTE, the page/folio is marked dirty.  That
         * can happen even if KVM created a read-only SPTE, e.g. because the
         * GFN is write-protected.  Marking folios dirty outside of mmu_lock
         * is unsafe as it could race with writeback on the folio, which means
         * the fast page fault handler can't do it.  KVM therefore has to
         * (somewhat) speculatively mark the folio dirty here whenever the
         * SPTE could locklessly be made writable.
         */
        if (dirty) {
                kvm_release_page_dirty(page);
                return;
        }

        kvm_release_page_clean(page);
}

kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
                            unsigned int foll, bool *writable,
                            struct page **refcounted_page);

/*
 * vCPU flavour of __kvm_faultin_pfn(): resolves the memslot for @gfn through
 * kvm_vcpu_gfn_to_memslot() and translates @write into FOLL_WRITE (or no
 * flags at all) before handing off to __kvm_faultin_pfn().
 */
static inline kvm_pfn_t kvm_faultin_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
                                        bool write, bool *writable,
                                        struct page **refcounted_page)
{
        const struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
        unsigned int foll = 0;

        if (write)
                foll = FOLL_WRITE;

        return __kvm_faultin_pfn(slot, gfn, foll, writable, refcounted_page);
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
                        int len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                           void *data, unsigned long len);
int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                                 void *data, unsigned int offset,
                                 unsigned long len);
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
                         int offset, int len);
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
                    unsigned long len);
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                           void *data, unsigned long len);
int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                                  void *data, unsigned int offset,
                                  unsigned long len);
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                              gpa_t gpa, unsigned long len);

/*
 * Read one value from guest memory into @v.
 *
 * @kvm:    VM whose memory is accessed.
 * @gfn:    guest frame number of the page to read from.
 * @offset: byte offset added to the host virtual address of @gfn.
 * @v:      lvalue receiving the value; its type selects the access width.
 *
 * Evaluates to -EFAULT if gfn_to_hva() yields an error HVA, otherwise to the
 * result of get_user().
 *
 * @offset is parenthesized so that compound expressions (e.g. "a | b") are
 * added as a whole rather than being split by operator precedence.
 */
#define __kvm_get_guest(kvm, gfn, offset, v)                                \
({                                                                        \
        unsigned long __addr = gfn_to_hva(kvm, gfn);                        \
        typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + (offset));        \
        int __ret = -EFAULT;                                                \
                                                                        \
        if (!kvm_is_error_hva(__addr))                                        \
                __ret = get_user(v, __uaddr);                                \
        __ret;                                                                \
})

/*
 * Read one value from guest physical address @gpa into @v.
 *
 * @gpa and @kvm are captured in locals so each is evaluated exactly once;
 * the address is then split into a frame number (@gpa >> PAGE_SHIFT) and an
 * in-page offset before being passed to __kvm_get_guest(), whose result is
 * the value of this expression.
 */
#define kvm_get_guest(kvm, gpa, v)                                        \
({                                                                        \
        gpa_t __gpa = gpa;                                                \
        struct kvm *__kvm = kvm;                                        \
                                                                        \
        __kvm_get_guest(__kvm, __gpa >> PAGE_SHIFT,                        \
                        offset_in_page(__gpa), v);                        \
})

/*
 * Write the value @v into guest memory.
 *
 * @kvm:    VM whose memory is accessed.
 * @gfn:    guest frame number of the page to write to.
 * @offset: byte offset added to the host virtual address of @gfn.
 * @v:      value to store; its type selects the access width.
 *
 * Evaluates to -EFAULT if gfn_to_hva() yields an error HVA, otherwise to the
 * result of put_user().  On success the page is reported through
 * mark_page_dirty().
 *
 * @kvm and @gfn are needed both for the address lookup and for the dirty
 * marking, so they are captured in locals to guarantee each argument is
 * evaluated exactly once.  @offset is parenthesized so that compound
 * expressions are added as a whole.
 */
#define __kvm_put_guest(kvm, gfn, offset, v)                                \
({                                                                        \
        struct kvm *__put_kvm = (kvm);                                        \
        gfn_t __put_gfn = (gfn);                                        \
        unsigned long __addr = gfn_to_hva(__put_kvm, __put_gfn);        \
        typeof(v) __user *__uaddr = (typeof(__uaddr))(__addr + (offset));        \
        int __ret = -EFAULT;                                                \
                                                                        \
        if (!kvm_is_error_hva(__addr))                                        \
                __ret = put_user(v, __uaddr);                                \
        if (!__ret)                                                        \
                mark_page_dirty(__put_kvm, __put_gfn);                        \
        __ret;                                                                \
})

/*
 * Write the value @v to guest physical address @gpa.
 *
 * @gpa and @kvm are captured in locals so each is evaluated exactly once;
 * the address is then split into a frame number (@gpa >> PAGE_SHIFT) and an
 * in-page offset before being passed to __kvm_put_guest(), whose result is
 * the value of this expression.
 */
#define kvm_put_guest(kvm, gpa, v)                                        \
({                                                                        \
        gpa_t __gpa = gpa;                                                \
        struct kvm *__kvm = kvm;                                        \
                                                                        \
        __kvm_put_guest(__kvm, __gpa >> PAGE_SHIFT,                        \
                        offset_in_page(__gpa), v);                        \
})

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len);
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn);
bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn);
void mark_page_dirty_in_slot(struct kvm *kvm, const struct kvm_memory_slot *memslot, gfn_t gfn);
void mark_page_dirty(struct kvm *kvm, gfn_t gfn);

int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_host_map *map,
                   bool writable);
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map);

/* Map @gpa into @map via __kvm_vcpu_map() with the writable flag set. */
static inline int kvm_vcpu_map(struct kvm_vcpu *vcpu, gpa_t gpa,
                               struct kvm_host_map *map)
{
        const bool writable = true;

        return __kvm_vcpu_map(vcpu, gpa, map, writable);
}

/* Map @gpa into @map via __kvm_vcpu_map() with the writable flag cleared. */
static inline int kvm_vcpu_map_readonly(struct kvm_vcpu *vcpu, gpa_t gpa,
                                        struct kvm_host_map *map)
{
        const bool writable = false;

        return __kvm_vcpu_map(vcpu, gpa, map, writable);
}

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn);
unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable);
int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, int offset,
                             int len);
int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
                               unsigned long len);
int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data,
                        unsigned long len);
int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, const void *data,
                              int offset, int len);
int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
                         unsigned long len);
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);

/**
 * kvm_gpc_init - initialize gfn_to_pfn_cache.
 *
 * @gpc:           struct gfn_to_pfn_cache object.
 * @kvm:           pointer to kvm instance.
 *
 * This sets up a gfn_to_pfn_cache by initializing locks and assigning the
 * immutable attributes.  Note, the cache must be zero-allocated (or zeroed by
 * the caller before init).
 */
void kvm_gpc_init(struct gfn_to_pfn_cache *gpc, struct kvm *kvm);

/**
 * kvm_gpc_activate - prepare a cached kernel mapping and HPA for a given guest
 *                    physical address.
 *
 * @gpc:           struct gfn_to_pfn_cache object.
 * @gpa:           guest physical address to map.
 * @len:           sanity check; the range being accessed must fit a single page.
 *
 * @return:           0 for success.
 *                   -EINVAL for a mapping which would cross a page boundary.
 *                   -EFAULT for an untranslatable guest physical address.
 *
 * This primes a gfn_to_pfn_cache and links it into the @gpc->kvm's list for
 * invalidations to be processed.  Callers are required to use kvm_gpc_check()
 * to ensure that the cache is valid before accessing the target page.
 */
int kvm_gpc_activate(struct gfn_to_pfn_cache *gpc, gpa_t gpa, unsigned long len);

/**
 * kvm_gpc_activate_hva - prepare a cached kernel mapping and HPA for a given HVA.
 *
 * @gpc:          struct gfn_to_pfn_cache object.
 * @hva:          userspace virtual address to map.
 * @len:          sanity check; the range being accessed must fit a single page.
 *
 * @return:       0 for success.
 *                -EINVAL for a mapping which would cross a page boundary.
 *                -EFAULT for an untranslatable guest physical address.
 *
 * The semantics of this function are the same as those of kvm_gpc_activate(). It
 * merely bypasses a layer of address translation.
 */
int kvm_gpc_activate_hva(struct gfn_to_pfn_cache *gpc, unsigned long hva, unsigned long len);

/**
 * kvm_gpc_check - check validity of a gfn_to_pfn_cache.
 *
 * @gpc:           struct gfn_to_pfn_cache object.
 * @len:           sanity check; the range being accessed must fit a single page.
 *
 * @return:           %true if the cache is still valid and the address matches.
 *                   %false if the cache is not valid.
 *
 * Callers outside IN_GUEST_MODE context should hold a read lock on @gpc->lock
 * while calling this function, and then continue to hold the lock until the
 * access is complete.
 *
 * Callers in IN_GUEST_MODE may do so without locking, although they should
 * still hold a read lock on kvm->srcu for the memslot checks.
 */
bool kvm_gpc_check(struct gfn_to_pfn_cache *gpc, unsigned long len);

/**
 * kvm_gpc_refresh - update a previously initialized cache.
 *
 * @gpc:           struct gfn_to_pfn_cache object.
 * @len:           sanity check; the range being accessed must fit a single page.
 *
 * @return:           0 for success.
 *                   -EINVAL for a mapping which would cross a page boundary.
 *                   -EFAULT for an untranslatable guest physical address.
 *
 * This will attempt to refresh a gfn_to_pfn_cache. Note that a successful
 * return from this function does not mean the page can be immediately
 * accessed because it may have raced with an invalidation. Callers must
 * still lock and check the cache status, as this function does not return
 * with the lock still held to permit access.
 */
int kvm_gpc_refresh(struct gfn_to_pfn_cache *gpc, unsigned long len);

/**
 * kvm_gpc_deactivate - deactivate and unlink a gfn_to_pfn_cache.
 *
 * @gpc:           struct gfn_to_pfn_cache object.
 *
 * This removes a cache from the VM's list to be processed on MMU notifier
 * invocation.
 */
void kvm_gpc_deactivate(struct gfn_to_pfn_cache *gpc);

static inline bool kvm_gpc_is_gpa_active(struct gfn_to_pfn_cache *gpc)
{
        return gpc->active && !kvm_is_error_gpa(gpc->gpa);
}

static inline bool kvm_gpc_is_hva_active(struct gfn_to_pfn_cache *gpc)
{
        return gpc->active && kvm_is_error_gpa(gpc->gpa);
}

void kvm_sigset_activate(struct kvm_vcpu *vcpu);
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu);

void kvm_vcpu_halt(struct kvm_vcpu *vcpu);
bool kvm_vcpu_block(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
int kvm_vcpu_yield_to(struct kvm_vcpu *target);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool yield_to_kernel_mode);

void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot);

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc);
void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc);
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
#endif

void kvm_mmu_invalidate_begin(struct kvm *kvm);
void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
void kvm_mmu_invalidate_end(struct kvm *kvm);
bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);

long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg);
long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg);
vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf);

int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext);

void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                                        struct kvm_memory_slot *slot,
                                        gfn_t gfn_offset,
                                        unsigned long mask);
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot);

#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log);
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
                      int *is_dirty, struct kvm_memory_slot **memslot);
#endif

int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
                        bool line_status);
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                            struct kvm_enable_cap *cap);
int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg);
long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
                              unsigned long arg);

int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu);

int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                    struct kvm_translation *tr);

int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs);
int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs);
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
                                  struct kvm_sregs *sregs);
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state);
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state);
int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
                                        struct kvm_guest_debug *dbg);
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu);

void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id);
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu);

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state);
#endif

/*
 * Per-vCPU debugfs hooks.  With __KVM_HAVE_ARCH_VCPU_DEBUGFS the architecture
 * provides kvm_arch_create_vcpu_debugfs(); otherwise an empty
 * kvm_create_vcpu_debugfs() stub is defined here.
 *
 * NOTE(review): the stub's name and parameter list differ from the arch hook
 * declared in the other branch; presumably the non-arch name is what generic
 * code calls when the arch hook is absent -- confirm against the callers
 * before "fixing" the apparent mismatch.
 */
#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry);
#else
static inline void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
#endif

#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
/*
 * kvm_arch_{enable,disable}_virtualization() are called on one CPU, under
 * kvm_usage_lock, immediately after/before 0=>1 and 1=>0 transitions of
 * kvm_usage_count, i.e. at the beginning of the generic hardware enabling
 * sequence, and at the end of the generic hardware disabling sequence.
 */
void kvm_arch_enable_virtualization(void);
void kvm_arch_disable_virtualization(void);
/*
 * kvm_arch_{enable,disable}_virtualization_cpu() are called on "every" CPU to
 * do the actual twiddling of hardware bits.  The hooks are called on all
 * online CPUs when KVM enables/disables virtualization, and on a single CPU
 * when that CPU is onlined/offlined (including for Resume/Suspend).
 */
int kvm_arch_enable_virtualization_cpu(void);
void kvm_arch_disable_virtualization_cpu(void);
#endif
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu);
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu);
bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu);
bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu);
void kvm_arch_pre_destroy_vm(struct kvm *kvm);
void kvm_arch_create_vm_debugfs(struct kvm *kvm);

#ifndef __KVM_HAVE_ARCH_VM_ALLOC
/*
 * Default VM allocation: a zeroed, accounted struct kvm from kzalloc().
 * Architectures that want vzalloc instead currently also have to supply
 * their own kvm_arch_alloc_vm().
 */
static inline struct kvm *kvm_arch_alloc_vm(void)
{
        struct kvm *kvm = kzalloc(sizeof(*kvm), GFP_KERNEL_ACCOUNT);

        return kvm;
}
#endif

/*
 * Free a struct kvm with kvfree().
 * NOTE(review): presumably kvfree() is used so the same helper works for both
 * kmalloc- and vmalloc-backed allocations -- confirm against arch allocators.
 */
static inline void __kvm_arch_free_vm(struct kvm *kvm)
{
        kvfree(kvm);
}

#ifndef __KVM_HAVE_ARCH_VM_FREE
/*
 * Default VM free hook, used unless the architecture defines
 * __KVM_HAVE_ARCH_VM_FREE: simply forwards to __kvm_arch_free_vm().
 */
static inline void kvm_arch_free_vm(struct kvm *kvm)
{
        __kvm_arch_free_vm(kvm);
}
#endif

#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS
/*
 * Default stub used when the architecture does not define
 * __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS: always reports "not supported".
 *
 * Returns -EOPNOTSUPP, matching the kvm_arch_flush_remote_tlbs_range() stub,
 * so both hooks signal the missing arch implementation with the same errno.
 */
static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
        return -EOPNOTSUPP;
}
#else
int kvm_arch_flush_remote_tlbs(struct kvm *kvm);
#endif

#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE
/*
 * Default stub used when the architecture does not define
 * __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE: ignores @gfn/@nr_pages and always
 * returns -EOPNOTSUPP.
 */
static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
                                                    gfn_t gfn, u64 nr_pages)
{
        return -EOPNOTSUPP;
}
#else
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages);
#endif

/*
 * Non-coherent DMA hooks.  Architectures defining
 * __KVM_HAVE_ARCH_NONCOHERENT_DMA provide real implementations; everyone else
 * gets no-op register/unregister stubs and a "has" query that is always false.
 */
#ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
bool kvm_arch_has_noncoherent_dma(struct kvm *kvm);
#else
static inline void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
}

static inline void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
{
}

static inline bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
{
        return false;
}
#endif
/*
 * Assigned-device hooks.  Architectures defining
 * __KVM_HAVE_ARCH_ASSIGNED_DEVICE provide real implementations; everyone else
 * gets no-op start/end stubs and a "has" query that is always false.
 */
#ifdef __KVM_HAVE_ARCH_ASSIGNED_DEVICE
void kvm_arch_start_assignment(struct kvm *kvm);
void kvm_arch_end_assignment(struct kvm *kvm);
bool kvm_arch_has_assigned_device(struct kvm *kvm);
#else
static inline void kvm_arch_start_assignment(struct kvm *kvm)
{
}

static inline void kvm_arch_end_assignment(struct kvm *kvm)
{
}

static __always_inline bool kvm_arch_has_assigned_device(struct kvm *kvm)
{
        return false;
}
#endif

/*
 * Return the rcuwait object associated with @vcpu: the arch-provided
 * vcpu->arch.waitp pointer when __KVM_HAVE_ARCH_WQP is defined, otherwise the
 * address of the generic vcpu->wait member.
 */
static inline struct rcuwait *kvm_arch_vcpu_get_wait(struct kvm_vcpu *vcpu)
{
#ifdef __KVM_HAVE_ARCH_WQP
        return vcpu->arch.waitp;
#else
        return &vcpu->wait;
#endif
}

/*
 * Wake @vcpu if it needs waking, without touching any stats/metadata.
 * Returns true when the vCPU was blocking and has been awakened, false
 * otherwise.
 */
static inline bool __kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
{
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);

        return rcuwait_wake_up(wait) != 0;
}

/* Report whether the rcuwait object of @vcpu is currently active. */
static inline bool kvm_vcpu_is_blocking(struct kvm_vcpu *vcpu)
{
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);

        return rcuwait_active(wait);
}

#ifdef __KVM_HAVE_ARCH_INTC_INITIALIZED
/*
 * Returns true if the virtual interrupt controller is initialized and
 * ready to accept virtual IRQs. On some architectures the virtual interrupt
 * controller is dynamically instantiated and this is not always true.
 */
bool kvm_arch_intc_initialized(struct kvm *kvm);
#else
/* Without __KVM_HAVE_ARCH_INTC_INITIALIZED the answer is unconditionally true. */
static inline bool kvm_arch_intc_initialized(struct kvm *kvm)
{
        return true;
}
#endif

/*
 * Guest perf-event hooks.  With CONFIG_GUEST_PERF_EVENTS the real
 * register/unregister functions (and kvm_arch_vcpu_get_ip()) are declared;
 * otherwise register/unregister collapse to empty stubs.  The stub takes an
 * untyped pointer so that any handler argument is accepted and ignored.
 */
#ifdef CONFIG_GUEST_PERF_EVENTS
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu);

void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void));
void kvm_unregister_perf_callbacks(void);
#else
static inline void kvm_register_perf_callbacks(void *ign) {}
static inline void kvm_unregister_perf_callbacks(void) {}
#endif /* CONFIG_GUEST_PERF_EVENTS */

int kvm_arch_init_vm(struct kvm *kvm, unsigned long type);
void kvm_arch_destroy_vm(struct kvm *kvm);

int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);

/*
 * IRQ acknowledgement notifier, registered and unregistered through
 * kvm_register_irq_ack_notifier() / kvm_unregister_irq_ack_notifier().
 */
struct kvm_irq_ack_notifier {
        /* hlist linkage; presumably into a per-VM notifier list -- confirm */
        struct hlist_node link;
        /* GSI this notifier is associated with */
        unsigned gsi;
        /* callback, handed the notifier itself as its only argument */
        void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
};

int kvm_irq_map_gsi(struct kvm *kvm,
                    struct kvm_kernel_irq_routing_entry *entries, int gsi);
int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);

int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
                bool line_status);
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
                int irq_source_id, int level, bool line_status);
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
                               struct kvm *kvm, int irq_source_id,
                               int level, bool line_status);
bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian);
int kvm_request_irq_source_id(struct kvm *kvm);
void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args);

/*
 * Hand back @slot when it is non-NULL and @gfn lies inside
 * [base_gfn, base_gfn + npages); otherwise hand back NULL.
 */
static inline struct kvm_memory_slot *
try_get_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
        if (slot && gfn >= slot->base_gfn &&
            gfn < slot->base_gfn + slot->npages)
                return slot;

        return NULL;
}

/*
 * Walk the gfn tree of @slots looking for the memslot that contains @gfn.
 * Returns that memslot, or NULL if there is none.
 *
 * When "approx" is set and @gfn falls into a hole, the last memslot visited
 * during the walk (one of the memslots bordering the hole) is returned
 * instead of NULL.  With an empty tree the result is NULL either way.
 */
static inline struct kvm_memory_slot *
search_memslots(struct kvm_memslots *slots, gfn_t gfn, bool approx)
{
        const int idx = slots->node_idx;
        struct kvm_memory_slot *candidate = NULL;
        struct rb_node *cur = slots->gfn_tree.rb_node;

        while (cur) {
                candidate = container_of(cur, struct kvm_memory_slot,
                                         gfn_node[idx]);

                /* @gfn sorts before this slot: descend to the left. */
                if (gfn < candidate->base_gfn) {
                        cur = cur->rb_left;
                        continue;
                }

                if (gfn < candidate->base_gfn + candidate->npages)
                        return candidate;

                /* @gfn sorts after this slot: descend to the right. */
                cur = cur->rb_right;
        }

        if (!approx)
                return NULL;

        return candidate;
}

/*
 * Look up the memslot for @gfn, trying the cached ->last_used_slot first and
 * falling back to search_memslots().  A slot found by the fallback search
 * (including an approximate match when @approx is set) replaces the cached
 * entry.  Returns NULL when nothing is found.
 */
static inline struct kvm_memory_slot *
____gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn, bool approx)
{
        struct kvm_memory_slot *cached, *found;

        cached = (struct kvm_memory_slot *)atomic_long_read(&slots->last_used_slot);
        found = try_get_memslot(cached, gfn);
        if (found)
                return found;

        found = search_memslots(slots, gfn, approx);
        if (!found)
                return NULL;

        atomic_long_set(&slots->last_used_slot, (unsigned long)found);
        return found;
}

/*
 * __gfn_to_memslot() and the helpers below it live in this header so that
 * arch code can inline memslot lookups in hot paths.  gfn_to_memslot() is
 * deliberately not an inline here, as that would bloat other code too much.
 *
 * This variant performs an exact lookup (no approximate matching).
 */
static inline struct kvm_memory_slot *
__gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
{
        const bool approx = false;

        return ____gfn_to_memslot(slots, gfn, approx);
}

/*
 * Compute the host virtual address for @gfn inside @slot:
 * slot->userspace_addr plus the page-sized offset of @gfn from
 * slot->base_gfn.  No architectural bounds check is done here.
 */
static inline unsigned long
__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
        /*
         * The index was checked originally in search_memslots.  To avoid
         * that a malicious guest builds a Spectre gadget out of e.g. page
         * table walks, do not let the processor speculate loads outside
         * the guest's registered memslots.
         */
        unsigned long offset = gfn - slot->base_gfn;
        /* Clamp the page index against slot->npages before it is used. */
        offset = array_index_nospec(offset, slot->npages);
        return slot->userspace_addr + offset * PAGE_SIZE;
}

/*
 * Return the id of the memslot that gfn_to_memslot() finds for @gfn.
 *
 * The lookup result is dereferenced unconditionally.
 * NOTE(review): presumably callers only pass gfns that are backed by a
 * memslot -- confirm.
 */
static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

        return slot->id;
}

/*
 * Convert host virtual address @hva to a guest frame number relative to
 * @slot: the page offset of @hva from slot->userspace_addr, added to
 * slot->base_gfn.
 */
static inline gfn_t
hva_to_gfn_memslot(unsigned long hva, struct kvm_memory_slot *slot)
{
        unsigned long byte_offset = hva - slot->userspace_addr;
        gfn_t page_offset = byte_offset >> PAGE_SHIFT;

        return slot->base_gfn + page_offset;
}

/* Guest frame number -> guest physical address (widened to gpa_t, then shifted). */
static inline gpa_t gfn_to_gpa(gfn_t gfn)
{
        gpa_t gpa = gfn;

        return gpa << PAGE_SHIFT;
}

/* Guest physical address -> guest frame number (in-page offset bits dropped). */
static inline gfn_t gpa_to_gfn(gpa_t gpa)
{
        gfn_t gfn = gpa >> PAGE_SHIFT;

        return gfn;
}

/* Page frame number -> host physical address (widened to hpa_t, then shifted). */
static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn)
{
        hpa_t hpa = pfn;

        return hpa << PAGE_SHIFT;
}

/*
 * True when the frame containing @gpa translates, via gfn_to_hva(), to
 * something other than an error HVA.
 */
static inline bool kvm_is_gpa_in_memslot(struct kvm *kvm, gpa_t gpa)
{
        const gfn_t gfn = gpa_to_gfn(gpa);

        return !kvm_is_error_hva(gfn_to_hva(kvm, gfn));
}

/*
 * Mark the page cached by @gpc dirty in its memslot.  Requires gpc->lock to
 * be held (asserted via lockdep); does nothing when the cache has no memslot.
 */
static inline void kvm_gpc_mark_dirty_in_slot(struct gfn_to_pfn_cache *gpc)
{
        lockdep_assert_held(&gpc->lock);

        if (gpc->memslot)
                mark_page_dirty_in_slot(gpc->kvm, gpc->memslot,
                                        gpa_to_gfn(gpc->gpa));
}

/* Discriminates the two kinds of statistics: per-VM and per-vCPU. */
enum kvm_stat_kind {
        KVM_STAT_VM,
        KVM_STAT_VCPU,
};

/* Ties one statistic descriptor to a VM and records which kind of stat it is. */
struct kvm_stat_data {
        struct kvm *kvm;                        /* VM the statistic belongs to */
        const struct _kvm_stats_desc *desc;     /* descriptor of the statistic */
        enum kvm_stat_kind kind;                /* KVM_STAT_VM or KVM_STAT_VCPU */
};

/*
 * In-kernel statistic descriptor: a struct kvm_stats_desc followed by a
 * fixed-size name buffer of KVM_STATS_NAME_SIZE bytes.
 */
struct _kvm_stats_desc {
        struct kvm_stats_desc desc;
        char name[KVM_STATS_NAME_SIZE];
};

/*
 * Designated initializers for the fields shared by every stats descriptor:
 * .flags (type | unit | base), .exponent, .size and .bucket_size.  Each of
 * @type, @unit and @base is checked at build time against its mask.
 *
 * Every argument is parenthesized so that compound expressions are handled
 * as a whole: '&' binds tighter than '|', so with an unparenthesized "A | B"
 * only the last operand would be tested against the mask.
 */
#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz)                       \
        .flags = (type) | (unit) | (base) |                                       \
                 BUILD_BUG_ON_ZERO((type) & ~KVM_STATS_TYPE_MASK) |               \
                 BUILD_BUG_ON_ZERO((unit) & ~KVM_STATS_UNIT_MASK) |               \
                 BUILD_BUG_ON_ZERO((base) & ~KVM_STATS_BASE_MASK),               \
        .exponent = (exp),                                                       \
        .size = (sz),                                                               \
        .bucket_size = (bsz)

/*
 * Initializers for one struct _kvm_stats_desc.  All four variants fill the
 * common fields through STATS_DESC_COMMON() and set .name to the stringified
 * @stat; they differ only in how .offset is computed.
 */

/* Per-VM generic stat: offset of generic.@stat within struct kvm_vm_stat. */
#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz)               \
        {                                                                       \
                {                                                               \
                        STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vm_stat, generic.stat)   \
                },                                                               \
                .name = #stat,                                                       \
        }
/* Per-vCPU generic stat: offset of generic.@stat within struct kvm_vcpu_stat. */
#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz)               \
        {                                                                       \
                {                                                               \
                        STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \
                },                                                               \
                .name = #stat,                                                       \
        }
/* Per-VM stat: offset of @stat within struct kvm_vm_stat. */
#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz)                       \
        {                                                                       \
                {                                                               \
                        STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vm_stat, stat)               \
                },                                                               \
                .name = #stat,                                                       \
        }
/* Per-vCPU stat: offset of @stat within struct kvm_vcpu_stat. */
#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz)                       \
        {                                                                       \
                {                                                               \
                        STATS_DESC_COMMON(type, unit, base, exp, sz, bsz),     \
                        .offset = offsetof(struct kvm_vcpu_stat, stat)               \
                },                                                               \
                .name = #stat,                                                       \
        }
/*
 * Dispatch to the <SCOPE>_STATS_DESC() macro by token pasting.
 * SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC
 */
#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz)                       \
        SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz)

/*
 * Per-type wrappers around STATS_DESC().  The scalar types (cumulative,
 * instant, peak) pass a size of 1 and a bucket size of 0; the histogram types
 * take the size from the caller, and only the linear histogram takes a bucket
 * size.
 */
#define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent)               \
        STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE,                       \
                unit, base, exponent, 1, 0)
#define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent)                       \
        STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT,                               \
                unit, base, exponent, 1, 0)
#define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent)                       \
        STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK,                               \
                unit, base, exponent, 1, 0)
#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz)     \
        STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST,                       \
                unit, base, exponent, sz, bsz)
#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz)               \
        STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST,                       \
                unit, base, exponent, sz, 0)

/* Cumulative counter, read/write */
#define STATS_DESC_COUNTER(SCOPE, name)                                               \
        STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_NONE,                       \
                KVM_STATS_BASE_POW10, 0)
/* Instantaneous counter, read only */
#define STATS_DESC_ICOUNTER(SCOPE, name)                                       \
        STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_NONE,                       \
                KVM_STATS_BASE_POW10, 0)
/* Peak counter, read/write */
#define STATS_DESC_PCOUNTER(SCOPE, name)                                       \
        STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_NONE,                       \
                KVM_STATS_BASE_POW10, 0)

/* Instantaneous boolean value, read only */
#define STATS_DESC_IBOOLEAN(SCOPE, name)                                       \
        STATS_DESC_INSTANT(SCOPE, name, KVM_STATS_UNIT_BOOLEAN,                       \
                KVM_STATS_BASE_POW10, 0)
/* Peak (sticky) boolean value, read/write */
#define STATS_DESC_PBOOLEAN(SCOPE, name)                                       \
        STATS_DESC_PEAK(SCOPE, name, KVM_STATS_UNIT_BOOLEAN,                       \
                KVM_STATS_BASE_POW10, 0)

/* Cumulative time in nanoseconds (unit: seconds, base 10, exponent -9) */
#define STATS_DESC_TIME_NSEC(SCOPE, name)                                       \
        STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS,               \
                KVM_STATS_BASE_POW10, -9)
/* Linear histogram for time in nanoseconds */
#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz)                       \
        STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS,               \
                KVM_STATS_BASE_POW10, -9, sz, bsz)
/* Logarithmic histogram for time in nanoseconds */
#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz)                               \
        STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS,               \
                KVM_STATS_BASE_POW10, -9, sz)

/*
 * Comma-separated descriptor initializers for the VM_GENERIC-scope stats
 * (two cumulative counters).  The expansion has no trailing comma.
 * NOTE(review): presumably expanded inside an arch's kvm_vm_stats_desc[]
 * initializer - confirm against the users.
 */
#define KVM_GENERIC_VM_STATS()                                                       \
        STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush),                       \
        STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests)

/*
 * Comma-separated descriptor initializers for the VCPU_GENERIC-scope stats:
 * four halt-polling counters, three cumulative nanosecond times, three
 * logarithmic nanosecond histograms of HALT_POLL_HIST_COUNT buckets each,
 * and one instantaneous boolean.  The expansion has no trailing comma.
 * NOTE(review): presumably expanded inside an arch's kvm_vcpu_stats_desc[]
 * initializer - confirm against the users.
 */
#define KVM_GENERIC_VCPU_STATS()                                               \
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll),                       \
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_attempted_poll),                       \
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid),                       \
        STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup),                               \
        STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns),               \
        STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns),                       \
        STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns),                       \
        STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist,     \
                        HALT_POLL_HIST_COUNT),                                       \
        STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist,               \
                        HALT_POLL_HIST_COUNT),                                       \
        STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist,               \
                        HALT_POLL_HIST_COUNT),                                       \
        STATS_DESC_IBOOLEAN(VCPU_GENERIC, blocking)

ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header,
                       const struct _kvm_stats_desc *desc,
                       void *stats, size_t size_stats,
                       char __user *user_buffer, size_t size, loff_t *offset);

/**
 * kvm_stats_linear_hist_update() - Update bucket value for linear histogram
 * statistics data.
 *
 * @data: start address of the stats data
 * @size: the number of buckets of the stats data
 * @value: the new value used to update the linear histogram's bucket
 * @bucket_size: the size (width) of a bucket
 *
 * The bucket index is @value / @bucket_size, clamped to the last bucket
 * (@size - 1).  The quotient is kept in a u64 until after the clamp so that
 * it is not truncated where size_t is narrower than 64 bits; truncating first
 * could wrap a huge quotient around to a small, wrong bucket index.
 *
 * A histogram with no buckets or with zero-width buckets cannot be updated;
 * such calls are ignored instead of dividing by zero or writing out of
 * bounds.
 */
static inline void kvm_stats_linear_hist_update(u64 *data, size_t size,
                                                u64 value, size_t bucket_size)
{
        u64 index;

        if (!size || !bucket_size)
                return;

        index = div64_u64(value, bucket_size);

        /* Everything at or beyond the last bucket lands in the last bucket. */
        if (index >= size)
                index = size - 1;

        ++data[index];
}

/**
 * kvm_stats_log_hist_update() - Account a new sample in a logarithmic
 * histogram.
 *
 * @data: start address of the stats data
 * @size: the number of buckets of the stats data
 * @value: the new sample to account in the logarithmic histogram
 *
 * The bucket is chosen as fls64(@value); any choice past the final bucket is
 * clamped to the final bucket (@size - 1).
 */
static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value)
{
        size_t last = size - 1;
        size_t bucket = fls64(value);

        if (bucket > last)
                bucket = last;

        data[bucket]++;
}

/*
 * Convenience wrappers that derive the bucket count from the histogram array
 * itself via ARRAY_SIZE().  @array must therefore be a real array, not a
 * pointer, at the point of use.
 */
#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize)                       \
        kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize)
#define KVM_STATS_LOG_HIST_UPDATE(array, value)                                       \
        kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value)


extern const struct kvm_stats_header kvm_vm_stats_header;
extern const struct _kvm_stats_desc kvm_vm_stats_desc[];
extern const struct kvm_stats_header kvm_vcpu_stats_header;
extern const struct _kvm_stats_desc kvm_vcpu_stats_desc[];

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
/*
 * Returns 1 if the caller must retry because an MMU invalidation is in
 * progress or has completed since @mmu_seq was sampled, 0 otherwise.
 */
static inline int mmu_invalidate_retry(struct kvm *kvm, unsigned long mmu_seq)
{
        /* An invalidation is running right now: always retry. */
        if (unlikely(kvm->mmu_invalidate_in_progress))
                return 1;

        /*
         * Order the read of mmu_invalidate_in_progress before the read of
         * mmu_invalidate_seq.  This pairs with the smp_wmb() in
         * mmu_notifier_invalidate_range_end so that the caller observes
         * either the old (non-zero) mmu_invalidate_in_progress or the new
         * (incremented) mmu_invalidate_seq.
         *
         * PowerPC Book3s HV KVM calls this under a per-page lock rather
         * than under kvm->mmu_lock, for scalability, so kvm->mmu_lock
         * cannot be relied upon to keep things ordered.
         */
        smp_rmb();

        return kvm->mmu_invalidate_seq != mmu_seq ? 1 : 0;
}

/*
 * Range-aware variant of the retry check; must be called with kvm->mmu_lock
 * held.  Returns 1 if the caller must retry for @gfn, 0 otherwise.
 */
static inline int mmu_invalidate_retry_gfn(struct kvm *kvm,
                                           unsigned long mmu_seq,
                                           gfn_t gfn)
{
        lockdep_assert_held(&kvm->mmu_lock);

        /*
         * While mmu_invalidate_in_progress is non-zero, the range maintained
         * by kvm_mmu_notifier_invalidate_range_start covers every address
         * that might be being invalidated.  It may contain false positives,
         * due to shortcuts taken when handling concurrent invalidations.
         */
        if (unlikely(kvm->mmu_invalidate_in_progress)) {
                /*
                 * It is a KVM bug to drop mmu_lock after bumping
                 * mmu_invalidate_in_progress but before the range has been
                 * updated.
                 */
                if (WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA ||
                                 kvm->mmu_invalidate_range_end == INVALID_GPA))
                        return 1;

                /* @gfn falls inside [start, end): it may be going away. */
                if (gfn >= kvm->mmu_invalidate_range_start &&
                    gfn < kvm->mmu_invalidate_range_end)
                        return 1;
        }

        /* An invalidation completed since the caller sampled the sequence. */
        return kvm->mmu_invalidate_seq != mmu_seq ? 1 : 0;
}

/*
 * Lockless flavour of the range-based retry check.  It *must* be followed by
 * a call to the locked version once mmu_lock has been acquired, i.e. it is
 * only safe as a pre-check that avoids contending mmu_lock.  This version
 * *will* produce both false negatives and false positives.
 */
static inline bool mmu_invalidate_retry_gfn_unsafe(struct kvm *kvm,
                                                   unsigned long mmu_seq,
                                                   gfn_t gfn)
{
        /*
         * READ_ONCE() forces the in-progress flag and the sequence counter to
         * be re-read from memory on every call, e.g. so that polling for
         * retry in a loop cannot spin forever.  The start/end loads are
         * deliberately not forced: what prevents an infinite retry loop is
         * observing the 1=>0 transition of in-progress, so false negatives
         * caused by stale start+end values are acceptable.
         */
        if (unlikely(READ_ONCE(kvm->mmu_invalidate_in_progress))) {
                if (gfn >= kvm->mmu_invalidate_range_start &&
                    gfn < kvm->mmu_invalidate_range_end)
                        return true;
        }

        return READ_ONCE(kvm->mmu_invalidate_seq) != mmu_seq;
}
#endif

#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING

/* Might need extension/rework in the future. */
#define KVM_MAX_IRQ_ROUTES 4096

bool kvm_arch_can_set_irq_routing(struct kvm *kvm);
int kvm_set_irq_routing(struct kvm *kvm,
                        const struct kvm_irq_routing_entry *entries,
                        unsigned nr,
                        unsigned flags);
int kvm_init_irq_routing(struct kvm *kvm);
int kvm_set_routing_entry(struct kvm *kvm,
                          struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue);
void kvm_free_irq_routing(struct kvm *kvm);

#else /* !CONFIG_HAVE_KVM_IRQ_ROUTING */

/* Stub: does nothing when IRQ routing is not configured. */
static inline void kvm_free_irq_routing(struct kvm *kvm)
{
}

/* Stub: always reports success (0) when IRQ routing is not configured. */
static inline int kvm_init_irq_routing(struct kvm *kvm)
{
        return 0;
}

#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */

int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);

void kvm_eventfd_init(struct kvm *kvm);
int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);

#ifdef CONFIG_HAVE_KVM_IRQCHIP
int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args);
void kvm_irqfd_release(struct kvm *kvm);
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
                                unsigned int irqchip,
                                unsigned int pin);
void kvm_irq_routing_update(struct kvm *);
#else /* !CONFIG_HAVE_KVM_IRQCHIP */
/* Stub: irqfd requests are rejected with -EINVAL without an irqchip. */
static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        return -EINVAL;
}

/* Stub: nothing to release without an irqchip. */
static inline void kvm_irqfd_release(struct kvm *kvm)
{
}

/* Stub: always returns false without an irqchip. */
static inline bool kvm_notify_irqfd_resampler(struct kvm *kvm,
                                              unsigned int irqchip,
                                              unsigned int pin)
{
        return false;
}
#endif /* CONFIG_HAVE_KVM_IRQCHIP */

void kvm_arch_irq_routing_update(struct kvm *kvm);

/*
 * Set the bit for @req (masked with KVM_REQUEST_MASK) in vcpu->requests.
 */
static inline void __kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
        const int bit = req & KVM_REQUEST_MASK;

        /*
         * Publish the rest of the request to kvm_check_request's caller
         * before the bit becomes visible.  Paired with the
         * smp_mb__after_atomic in kvm_check_request.
         */
        smp_wmb();
        set_bit(bit, (void *)&vcpu->requests);
}

/*
 * Build-time checked wrapper around __kvm_make_request(): @req must be a
 * compile-time constant and must not carry KVM_REQUEST_NO_ACTION.
 */
static __always_inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
        /* The check below only works when @req is known at build time. */
        BUILD_BUG_ON(!__builtin_constant_p(req));

        /*
         * Requests that don't require vCPU action should never be logged in
         * vcpu->requests.  The vCPU won't clear such a request, so it would
         * stay logged indefinitely and prevent the vCPU from entering the
         * guest.
         */
        BUILD_BUG_ON(req & KVM_REQUEST_NO_ACTION);

        __kvm_make_request(req, vcpu);
}

/* True if any bit is set in vcpu->requests (read once). */
static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
{
        return READ_ONCE(vcpu->requests) != 0;
}

/* Test, without clearing, the vcpu->requests bit that corresponds to @req. */
static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
{
        const int bit = req & KVM_REQUEST_MASK;

        return test_bit(bit, (void *)&vcpu->requests);
}

/* Clear the vcpu->requests bit that corresponds to @req. */
static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
{
        const int bit = req & KVM_REQUEST_MASK;

        clear_bit(bit, (void *)&vcpu->requests);
}

/*
 * Consume @req if it is pending: returns true and clears the request bit when
 * it was set, returns false (leaving everything untouched) otherwise.
 */
static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{
        if (!kvm_test_request(req, vcpu))
                return false;

        kvm_clear_request(req, vcpu);

        /*
         * Make the rest of the request visible to kvm_check_request's
         * caller.  Paired with the smp_wmb in kvm_make_request.
         */
        smp_mb__after_atomic();

        return true;
}

#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
extern bool kvm_rebooting;
#endif

extern unsigned int halt_poll_ns;
extern unsigned int halt_poll_ns_grow;
extern unsigned int halt_poll_ns_grow_start;
extern unsigned int halt_poll_ns_shrink;

/*
 * An instance of an in-kernel device, passed to every kvm_device_ops
 * callback.
 */
struct kvm_device {
        /* Callback table implementing this device. */
        const struct kvm_device_ops *ops;
        /* NOTE(review): presumably the VM that owns the device - confirm. */
        struct kvm *kvm;
        /* Opaque pointer; NOTE(review): presumably owned by the ops implementation. */
        void *private;
        /* List linkage; NOTE(review): presumably the VM's device list - confirm. */
        struct list_head vm_node;
};

/*
 * Callback table for a device type.
 *
 * create, destroy, and name are mandatory; the remaining callbacks are
 * NOTE(review): presumably optional and checked for NULL by the caller -
 * confirm against the device code.
 */
struct kvm_device_ops {
        const char *name;

        /*
         * create is called holding kvm->lock and any operations not suitable
         * to do while holding the lock should be deferred to init (see
         * below).
         */
        int (*create)(struct kvm_device *dev, u32 type);

        /*
         * init is called after create if create is successful and is called
         * outside of holding kvm->lock.
         */
        void (*init)(struct kvm_device *dev);

        /*
         * Destroy is responsible for freeing dev.
         *
         * Destroy may be called before or after destructors are called
         * on emulated I/O regions, depending on whether a reference is
         * held by a vcpu or other kvm component that gets destroyed
         * after the emulated I/O.
         */
        void (*destroy)(struct kvm_device *dev);

        /*
         * Release is an alternative method to free the device. It is
         * called when the device file descriptor is closed. Once
         * release is called, the destroy method will not be called
         * anymore as the device is removed from the device list of
         * the VM. kvm->lock is held.
         */
        void (*release)(struct kvm_device *dev);

        /*
         * Attribute accessors operating on a struct kvm_device_attr.
         * NOTE(review): presumably back the KVM_{SET,GET,HAS}_DEVICE_ATTR
         * ioctls - confirm.
         */
        int (*set_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
        int (*get_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
        int (*has_attr)(struct kvm_device *dev, struct kvm_device_attr *attr);
        /* Device-specific ioctl handler taking the ioctl number and its raw argument. */
        long (*ioctl)(struct kvm_device *dev, unsigned int ioctl,
                      unsigned long arg);
        /* mmap handler for the device; receives the target vma. */
        int (*mmap)(struct kvm_device *dev, struct vm_area_struct *vma);
};

struct kvm_device *kvm_device_from_filp(struct file *filp);
int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type);
void kvm_unregister_device_ops(u32 type);

extern struct kvm_device_ops kvm_mpic_ops;
extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
extern struct kvm_device_ops kvm_arm_vgic_v3_ops;

#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT

/* Record @val in vcpu->spin_loop.in_spin_loop. */
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
{
        vcpu->spin_loop.in_spin_loop = val;
}

/* Record @val in vcpu->spin_loop.dy_eligible. */
static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
        vcpu->spin_loop.dy_eligible = val;
}

#else /* !CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */

/* Stub: no spin-loop state is tracked in this configuration. */
static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
{
}

/* Stub: no spin-loop state is tracked in this configuration. */
static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
{
}

#endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */

/*
 * A memslot is visible when it exists, its id is below KVM_USER_MEM_SLOTS,
 * and it is not flagged KVM_MEMSLOT_INVALID.
 */
static inline bool kvm_is_visible_memslot(struct kvm_memory_slot *memslot)
{
        if (!memslot)
                return false;

        if (memslot->id >= KVM_USER_MEM_SLOTS)
                return false;

        return !(memslot->flags & KVM_MEMSLOT_INVALID);
}

struct kvm_vcpu *kvm_get_running_vcpu(void);
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
bool kvm_arch_has_irq_bypass(void);
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
                           struct irq_bypass_producer *);
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
                           struct irq_bypass_producer *);
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
                                  uint32_t guest_irq, bool set);
bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *,
                                  struct kvm_kernel_irq_routing_entry *);
#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */

#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS
/*
 * If we wake up during the poll time, was it a successful poll?  The answer
 * is whatever has been recorded in vcpu->valid_wakeup.
 */
static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
{
        return vcpu->valid_wakeup;
}

#else /* !CONFIG_HAVE_KVM_INVALID_WAKEUPS */
/* Stub: every wakeup is reported as valid in this configuration. */
static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu)
{
        return true;
}
#endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */

#ifdef CONFIG_HAVE_KVM_NO_POLL
/* Arch callback that tells whether polling must not be done for @vcpu. */
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu);
#else /* !CONFIG_HAVE_KVM_NO_POLL */
/* Stub: polling is never forbidden in this configuration. */
static inline bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
        return false;
}
#endif /* CONFIG_HAVE_KVM_NO_POLL */

#ifdef CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL
long kvm_arch_vcpu_async_ioctl(struct file *filp,
                               unsigned int ioctl, unsigned long arg);
#else /* !CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */
/* Stub: no ioctl is handled here; always returns -ENOIOCTLCMD. */
static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
                                             unsigned int ioctl,
                                             unsigned long arg)
{
        return -ENOIOCTLCMD;
}
#endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */

void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);

#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
#else /* !CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */
/* Stub: nothing to do in this configuration; always returns 0. */
static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
{
        return 0;
}
#endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */

#ifdef CONFIG_KVM_XFER_TO_GUEST_WORK
/*
 * Report KVM_EXIT_INTR as the exit reason in vcpu->run and bump the vCPU's
 * signal_exits statistic.
 */
static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
{
        vcpu->run->exit_reason = KVM_EXIT_INTR;
        ++vcpu->stat.signal_exits;
}
#endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */

/*
 * Adjust the NR_SECONDARY_PAGETABLE counter by @nr for the page backing
 * @virt.
 *
 * If more than one page is being (un)accounted, @virt must be the address of
 * the first page of a block of pages that were allocated together (i.e.
 * accounted together).
 *
 * kvm_account_pgtable_pages() is thread-safe because mod_lruvec_page_state()
 * is thread-safe.
 */
static inline void kvm_account_pgtable_pages(void *virt, int nr)
{
        mod_lruvec_page_state(virt_to_page(virt),
                              NR_SECONDARY_PAGETABLE,
                              nr);
}

/*
 * This defines how many reserved entries we want to keep before we
 * kick the vcpu to userspace, to avoid the dirty ring becoming full.
 * This value can be tuned higher if e.g. PML is enabled on the host.
 */
#define  KVM_DIRTY_RING_RSVD_ENTRIES  64

/* Max number of entries allowed for each kvm dirty ring */
#define  KVM_DIRTY_RING_MAX_ENTRIES  65536

/*
 * Fill vcpu->run for a KVM_EXIT_MEMORY_FAULT exit covering @size bytes at
 * @gpa.  Only @is_private is reflected in the flags; @is_write and @is_exec
 * are accepted but currently not reported.
 */
static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
                                                 gpa_t gpa, gpa_t size,
                                                 bool is_write, bool is_exec,
                                                 bool is_private)
{
        vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
        vcpu->run->memory_fault.gpa = gpa;
        vcpu->run->memory_fault.size = size;

        /* RWX flags are not (yet) defined or communicated to userspace. */
        vcpu->run->memory_fault.flags =
                is_private ? KVM_MEMORY_EXIT_FLAG_PRIVATE : 0;
}

#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/* Look up the attribute value stored for @gfn in kvm->mem_attr_array. */
static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
{
        return xa_to_value(
                xa_load(&kvm->mem_attr_array, gfn));
}

bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
                                     unsigned long mask, unsigned long attrs);
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
                                        struct kvm_gfn_range *range);
bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
                                         struct kvm_gfn_range *range);

/*
 * True when CONFIG_KVM_PRIVATE_MEM is enabled and @gfn carries the
 * KVM_MEMORY_ATTRIBUTE_PRIVATE attribute.
 */
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
        if (!IS_ENABLED(CONFIG_KVM_PRIVATE_MEM))
                return false;

        return (kvm_get_memory_attributes(kvm, gfn) &
                KVM_MEMORY_ATTRIBUTE_PRIVATE) != 0;
}
#else /* !CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
/* Stub: without memory attributes no gfn is ever private. */
static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
{
        return false;
}
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */

#ifdef CONFIG_KVM_PRIVATE_MEM
int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                     gfn_t gfn, kvm_pfn_t *pfn, struct page **page,
                     int *max_order);
#else /* !CONFIG_KVM_PRIVATE_MEM */
/*
 * Stub: reaching this without CONFIG_KVM_PRIVATE_MEM is flagged with
 * KVM_BUG_ON() and fails with -EIO.  None of the output parameters are
 * written.
 */
static inline int kvm_gmem_get_pfn(struct kvm *kvm,
                                   struct kvm_memory_slot *slot, gfn_t gfn,
                                   kvm_pfn_t *pfn, struct page **page,
                                   int *max_order)
{
        KVM_BUG_ON(1, kvm);

        return -EIO;
}
#endif /* CONFIG_KVM_PRIVATE_MEM */

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_PREPARE
int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_order);
#endif

#ifdef CONFIG_KVM_GENERIC_PRIVATE_MEM
/*
 * Callback type taken by kvm_gmem_populate() as its @post_populate argument.
 * It receives the kvm, a gfn and pfn, a userspace source pointer, an order,
 * and the caller's opaque pointer, and returns an int.
 */
typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                                    void __user *src, int order, void *opaque);

/**
 * kvm_gmem_populate() - Populate/prepare a GPA range with guest data
 *
 * @kvm: KVM instance
 * @gfn: starting GFN to be populated
 * @src: userspace-provided buffer containing data to copy into GFN range
 *       (passed to @post_populate, and incremented on each iteration
 *       if not NULL)
 * @npages: number of pages to copy from userspace-buffer
 * @post_populate: callback to issue for each gmem page that backs the GPA
 *                 range
 * @opaque: opaque data to pass to @post_populate callback
 *
 * This is primarily intended for cases where a gmem-backed GPA range needs
 * to be initialized with userspace-provided data prior to being mapped into
 * the guest as a private page. This should be called with the slots->lock
 * held so that caller-enforced invariants regarding the expected memory
 * attributes of the GPA range do not race with KVM_SET_MEMORY_ATTRIBUTES.
 *
 * Returns the number of pages that were populated.
 */
long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
                       kvm_gmem_populate_cb post_populate, void *opaque);
#endif

#ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end);
#endif

#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
                                    struct kvm_pre_fault_memory *range);
#endif

#endif









































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_FLOW_DISSECTOR_H
#define _NET_FLOW_DISSECTOR_H

#include <linux/types.h>
#include <linux/in6.h>
#include <linux/siphash.h>
#include <linux/string.h>
#include <uapi/linux/if_ether.h>
#include <uapi/linux/pkt_cls.h>

struct bpf_prog;
struct net;
struct sk_buff;

/**
 * struct flow_dissector_key_control - control information for a dissected flow
 * @thoff:     Transport header offset
 * @addr_type: Type of key. One of FLOW_DISSECTOR_KEY_*
 * @flags:     Key flags.
 *             Any of FLOW_DIS_(IS_FRAGMENT|FIRST_FRAG|ENCAPSULATION|F_*),
 *             see enum flow_dissector_ctrl_flags
 */
struct flow_dissector_key_control {
        u16        thoff;
        u16        addr_type;
        u32        flags;
};

/* The control flags are kept in sync with TCA_FLOWER_KEY_FLAGS_*, as those
 * flags are exposed to userspace in some error paths, i.e. unsupported flags.
 *
 * Kernel-internal flags are defined relative to TCA_FLOWER_KEY_FLAGS_MAX
 * (shifted left by one) rather than to a TCA_FLOWER_KEY_FLAGS_* value.
 * NOTE(review): presumably so they never collide with a userspace-visible
 * flag - confirm against the uapi header.
 */
enum flow_dissector_ctrl_flags {
        FLOW_DIS_IS_FRAGMENT                = TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT,
        FLOW_DIS_FIRST_FRAG                = TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST,
        FLOW_DIS_F_TUNNEL_CSUM                = TCA_FLOWER_KEY_FLAGS_TUNNEL_CSUM,
        FLOW_DIS_F_TUNNEL_DONT_FRAGMENT        = TCA_FLOWER_KEY_FLAGS_TUNNEL_DONT_FRAGMENT,
        FLOW_DIS_F_TUNNEL_OAM                = TCA_FLOWER_KEY_FLAGS_TUNNEL_OAM,
        FLOW_DIS_F_TUNNEL_CRIT_OPT        = TCA_FLOWER_KEY_FLAGS_TUNNEL_CRIT_OPT,

        /* These flags are internal to the kernel */
        FLOW_DIS_ENCAPSULATION                = (TCA_FLOWER_KEY_FLAGS_MAX << 1),
};

/*
 * Result codes for flow dissection steps.
 * NOTE(review): the exact meaning of each value is defined by the dissector
 * code that returns and consumes them, which is not visible in this header;
 * presumably OUT_GOOD/OUT_BAD end dissection, the *_AGAIN values restart at
 * the named layer, and CONTINUE proceeds - confirm against the dissector.
 */
enum flow_dissect_ret {
        FLOW_DISSECT_RET_OUT_GOOD,
        FLOW_DISSECT_RET_OUT_BAD,
        FLOW_DISSECT_RET_PROTO_AGAIN,
        FLOW_DISSECT_RET_IPPROTO_AGAIN,
        FLOW_DISSECT_RET_CONTINUE,
};

/**
 * struct flow_dissector_key_basic - network and transport protocol identifiers
 * @n_proto:  Network header protocol (e.g. IPv4/IPv6), big-endian
 * @ip_proto: Transport header protocol (e.g. TCP/UDP)
 * @padding:  Unused
 */
struct flow_dissector_key_basic {
        __be16        n_proto;
        u8        ip_proto;
        u8        padding;
};

/**
 * struct flow_dissector_key_tags - flow label key
 * @flow_label: flow label value.  This struct is the payload of
 *              FLOW_DISSECTOR_KEY_FLOW_LABEL.
 *              NOTE(review): presumably the IPv6 flow label - confirm.
 */
struct flow_dissector_key_tags {
        u32        flow_label;
};

/**
 * struct flow_dissector_key_vlan - VLAN tag key
 * @vlan_id:       12-bit VLAN identifier
 * @vlan_dei:      1-bit field (NOTE(review): presumably the Drop Eligible
 *                 Indicator - confirm)
 * @vlan_priority: 3-bit priority
 * @vlan_tci:      16-bit big-endian value sharing storage (union) with the
 *                 three bitfields above
 * @vlan_tpid:     tag protocol identifier, big-endian
 * @vlan_eth_type: big-endian ethertype (NOTE(review): presumably that of the
 *                 encapsulated payload - confirm)
 * @padding:       NOTE(review): presumably unused - confirm
 */
struct flow_dissector_key_vlan {
        union {
                struct {
                        u16        vlan_id:12,
                                vlan_dei:1,
                                vlan_priority:3;
                };
                __be16        vlan_tci;
        };
        __be16        vlan_tpid;
        __be16        vlan_eth_type;
        u16        padding;
};

/**
 * struct flow_dissector_mpls_lse - one MPLS Label Stack Entry
 * @mpls_ttl:   8-bit TTL
 * @mpls_bos:   1-bit bottom-of-stack flag
 * @mpls_tc:    3-bit traffic class
 * @mpls_label: 20-bit label
 *
 * The four bitfields together fill exactly one u32.
 */
struct flow_dissector_mpls_lse {
        u32        mpls_ttl:8,
                mpls_bos:1,
                mpls_tc:3,
                mpls_label:20;
};

/* Maximum number of Label Stack Entries stored per flow key. */
#define FLOW_DIS_MPLS_MAX 7
/**
 * struct flow_dissector_key_mpls - MPLS label stack key
 * @ls:        the Label Stack, up to FLOW_DIS_MPLS_MAX entries
 * @used_lses: bitmask with one bit set for each entry of @ls that is in use
 *             (bit N corresponds to ls[N]; see dissector_set_mpls_lse())
 */
struct flow_dissector_key_mpls {
        struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */
        u8 used_lses; /* One bit set for each Label Stack Entry in use */
};

/*
 * Mark Label Stack Entry @lse_index as in use by setting the matching bit of
 * mpls->used_lses.  No range check is performed on @lse_index.
 */
static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls,
                                          int lse_index)
{
        mpls->used_lses = mpls->used_lses | (1 << lse_index);
}

/* Size in bytes of the tunnel option data buffer below. */
#define FLOW_DIS_TUN_OPTS_MAX 255
/**
 * struct flow_dissector_key_enc_opts - tunnel (encapsulation) options key
 * @data: tunnel option data, at most FLOW_DIS_TUN_OPTS_MAX bytes
 * @len: length of tunnel option data
 * @dst_opt_type: tunnel option type
 */
struct flow_dissector_key_enc_opts {
        u8 data[FLOW_DIS_TUN_OPTS_MAX];        /* Using IP_TUNNEL_OPTS_MAX is desired
                                         * here but seems difficult to #include
                                         */
        u8 len;
        u32 dst_opt_type;
};

/**
 * struct flow_dissector_key_keyid - generic 32-bit key identifier
 * @keyid: big-endian identifier.  This struct is the payload of both
 *         FLOW_DISSECTOR_KEY_GRE_KEYID and FLOW_DISSECTOR_KEY_MPLS_ENTROPY.
 */
struct flow_dissector_key_keyid {
        __be32        keyid;
};

/**
 * struct flow_dissector_key_ipv4_addrs - IPv4 address pair
 * @src: source ip address, big-endian
 * @dst: destination ip address, big-endian
 */
struct flow_dissector_key_ipv4_addrs {
        /* (src,dst) must be grouped, in the same way as in the IP header */
        __be32 src;
        __be32 dst;
};

/**
 * struct flow_dissector_key_ipv6_addrs - IPv6 address pair
 * @src: source ip address
 * @dst: destination ip address
 */
struct flow_dissector_key_ipv6_addrs {
        /* (src,dst) must be grouped, in the same way as in the IP header */
        struct in6_addr src;
        struct in6_addr dst;
};

/**
 * struct flow_dissector_key_tipc - TIPC key
 * @key: source node address combined with selector, big-endian
 */
struct flow_dissector_key_tipc {
        __be32 key;
};

/**
 * struct flow_dissector_key_addrs - address key, one of several families
 * @v4addrs: IPv4 addresses
 * @v6addrs: IPv6 addresses
 * @tipckey: TIPC key
 *
 * The three members share storage (anonymous union), so only one of them is
 * meaningful at a time.  NOTE(review): presumably selected by the control
 * key's addr_type - confirm against the dissector.
 */
struct flow_dissector_key_addrs {
        union {
                struct flow_dissector_key_ipv4_addrs v4addrs;
                struct flow_dissector_key_ipv6_addrs v6addrs;
                struct flow_dissector_key_tipc tipckey;
        };
};

/**
 * struct flow_dissector_key_arp - ARP key
 * @sip: Sender IP address
 * @tip: Target IP address
 * @op:  Operation
 * @sha: Sender hardware address (ETH_ALEN bytes)
 * @tha: Target hardware address (ETH_ALEN bytes)
 */
struct flow_dissector_key_arp {
        __u32 sip;
        __u32 tip;
        __u8 op;
        unsigned char sha[ETH_ALEN];
        unsigned char tha[ETH_ALEN];
};

/**
 * struct flow_dissector_key_ports - transport port pair
 * @ports: port numbers of Transport header, as one 32-bit big-endian value
 *         sharing storage (union) with @src and @dst
 * @src: source port number, big-endian
 * @dst: destination port number, big-endian
 */
struct flow_dissector_key_ports {
        union {
                __be32 ports;
                struct {
                        __be16 src;
                        __be16 dst;
                };
        };
};

/**
 * struct flow_dissector_key_ports_range - transport port range key
 * @tp: port number from packet
 * @tp_min: min port number in range
 * @tp_max: max port number in range
 *
 * @tp shares storage (union) with the {@tp_min, @tp_max} pair, so @tp
 * overlays @tp_min.
 */
struct flow_dissector_key_ports_range {
        union {
                struct flow_dissector_key_ports tp;
                struct {
                        struct flow_dissector_key_ports tp_min;
                        struct flow_dissector_key_ports tp_max;
                };
        };
};

/**
 * struct flow_dissector_key_icmp - ICMP key
 * @type: ICMP type
 * @code: ICMP code
 * @id:   Session identifier
 *
 * @type and @code are grouped in an anonymous struct ahead of @id.
 */
struct flow_dissector_key_icmp {
        struct {
                u8 type;
                u8 code;
        };
        u16 id;
};

/**
 * struct flow_dissector_key_eth_addrs - Ethernet address pair
 * @dst: destination Ethernet address (ETH_ALEN bytes)
 * @src: source Ethernet address (ETH_ALEN bytes)
 */
struct flow_dissector_key_eth_addrs {
        /* (dst,src) must be grouped, in the same way as in the ETH header */
        unsigned char dst[ETH_ALEN];
        unsigned char src[ETH_ALEN];
};

/**
 * struct flow_dissector_key_tcp - TCP key
 * @flags: flags, stored as a 16-bit big-endian value
 *         (NOTE(review): presumably the TCP header flag bits - confirm)
 */
struct flow_dissector_key_tcp {
        __be16 flags;
};

/**
 * struct flow_dissector_key_ip - IP header fields key
 * @tos: tos (NOTE(review): presumably also carries the IPv6 traffic class -
 *       confirm)
 * @ttl: ttl (NOTE(review): presumably also carries the IPv6 hop limit -
 *       confirm)
 */
struct flow_dissector_key_ip {
        __u8        tos;
        __u8        ttl;
};

/**
 * struct flow_dissector_key_meta - packet metadata key
 * @ingress_ifindex: ingress ifindex
 * @ingress_iftype: ingress interface type
 * @l2_miss: packet did not match an L2 entry during forwarding
 */
struct flow_dissector_key_meta {
        int ingress_ifindex;
        u16 ingress_iftype;
        u8 l2_miss;
};

/**
 * struct flow_dissector_key_ct - connection tracking key
 * @ct_state: conntrack state after converting with map
 * @ct_zone: conntrack zone
 * @ct_mark: conntrack mark
 * @ct_labels: conntrack labels (four 32-bit words)
 */
struct flow_dissector_key_ct {
        u16        ct_state;
        u16        ct_zone;
        u32        ct_mark;
        u32        ct_labels[4];
};

/**
 * struct flow_dissector_key_hash - hash key
 * @hash: hash value (32 bits)
 */
struct flow_dissector_key_hash {
        u32 hash;
};

/**
 * struct flow_dissector_key_num_of_vlans - VLAN tag count key
 * @num_of_vlans: number of VLAN tags, held in 8 bits
 */
struct flow_dissector_key_num_of_vlans {
        u8 num_of_vlans;
};

/**
 * struct flow_dissector_key_pppoe - PPPoE key
 * @session_id: pppoe session id, big-endian
 * @ppp_proto: ppp protocol, big-endian
 * @type: pppoe eth type, big-endian
 */
struct flow_dissector_key_pppoe {
        __be16 session_id;
        __be16 ppp_proto;
        __be16 type;
};

/**
 * struct flow_dissector_key_l2tpv3:
 * @session_id: identifier for an l2tp session (big-endian, __be32)
 *
 * Used with %FLOW_DISSECTOR_KEY_L2TPV3.
 */
struct flow_dissector_key_l2tpv3 {
        __be32 session_id;
};

/**
 * struct flow_dissector_key_ipsec:
 * @spi: identifier for an ipsec connection (big-endian, __be32)
 *
 * Used with %FLOW_DISSECTOR_KEY_IPSEC.
 */
struct flow_dissector_key_ipsec {
        __be32 spi;
};

/**
 * struct flow_dissector_key_cfm:
 * @mdl_ver: maintenance domain level (mdl) and cfm protocol version
 * @opcode: code specifying a type of cfm protocol packet
 *
 * Used with %FLOW_DISSECTOR_KEY_CFM. As the diagram shows, @mdl_ver packs
 * the mdl into bits 7..5 and the version into bits 4..0.
 *
 * See 802.1ag, ITU-T G.8013/Y.1731
 *         1               2
 * |7 6 5 4 3 2 1 0|7 6 5 4 3 2 1 0|
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * | mdl | version |     opcode    |
 * +-----+---------+-+-+-+-+-+-+-+-+
 */
struct flow_dissector_key_cfm {
        u8        mdl_ver;
        u8        opcode;
};

/* Bits 7..5 of flow_dissector_key_cfm.mdl_ver hold the maintenance domain level */
#define FLOW_DIS_CFM_MDL_MASK GENMASK(7, 5)
/* Largest value that fits in the 3-bit mdl field */
#define FLOW_DIS_CFM_MDL_MAX 7

/*
 * Identifiers of the keys a flow dissector can fill in. The trailing
 * comment on each enumerator names the structure that holds that key.
 *
 * A key id is used as a bit position in flow_dissector.used_keys (see
 * dissector_uses_key()) and as an index into flow_dissector.offset[] (see
 * skb_flow_dissector_target()). FLOW_DISSECTOR_KEY_MAX is not a key; it
 * sizes the offset[] array and must stay last.
 */
enum flow_dissector_key_id {
        FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */
        FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */
        FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
        FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
        FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */
        FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */
        FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */
        FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */
        FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */
        FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */
        FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */
        FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */
        FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */
        FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */
        FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */
        FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */
        FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */
        FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */
        FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */
        FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */
        FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */
        FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */
        FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */
        FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */
        FLOW_DISSECTOR_KEY_NUM_OF_VLANS, /* struct flow_dissector_key_num_of_vlans */
        FLOW_DISSECTOR_KEY_PPPOE, /* struct flow_dissector_key_pppoe */
        FLOW_DISSECTOR_KEY_L2TPV3, /* struct flow_dissector_key_l2tpv3 */
        FLOW_DISSECTOR_KEY_CFM, /* struct flow_dissector_key_cfm */
        FLOW_DISSECTOR_KEY_IPSEC, /* struct flow_dissector_key_ipsec */

        FLOW_DISSECTOR_KEY_MAX,
};

/*
 * Single-bit flags, BIT(0) through BIT(3), so they can be OR-ed together.
 * NOTE(review): the names suggest they control first-fragment parsing and
 * where dissection stops (flow label / encapsulation) - confirm against
 * the dissector implementation, which is not part of this header.
 */
#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG                BIT(0)
#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL        BIT(1)
#define FLOW_DISSECTOR_F_STOP_AT_ENCAP                BIT(2)
#define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP        BIT(3)

/* Pairs one key id with the location of that key's structure in a target. */
struct flow_dissector_key {
        enum flow_dissector_key_id key_id;
        size_t offset; /* offset of struct flow_dissector_key_*
                          in the target struct */
};

/*
 * Describes which keys a dissector uses and where each one is stored.
 * @used_keys is tested by dissector_uses_key(); @offset is indexed by key
 * id and added to the target container pointer by
 * skb_flow_dissector_target().
 */
struct flow_dissector {
        unsigned long long  used_keys;
                /* each bit represents presence of one key id */
        unsigned short int offset[FLOW_DISSECTOR_KEY_MAX];
};

/* Minimal key container: only the control and basic keys. */
struct flow_keys_basic {
        struct flow_dissector_key_control control;
        struct flow_dissector_key_basic basic;
};

/*
 * Key container covering the control, basic, tags, vlan, cvlan, keyid,
 * ports, icmp and addrs keys.
 *
 * FLOW_KEYS_HASH_START_FIELD names the member ('basic') whose offset
 * defines FLOW_KEYS_HASH_OFFSET; 'control' sits before that offset and
 * 'basic' is aligned to SIPHASH_ALIGNMENT.
 */
struct flow_keys {
        struct flow_dissector_key_control control;
#define FLOW_KEYS_HASH_START_FIELD basic
        struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT);
        struct flow_dissector_key_tags tags;
        struct flow_dissector_key_vlan vlan;
        struct flow_dissector_key_vlan cvlan;
        struct flow_dissector_key_keyid keyid;
        struct flow_dissector_key_ports ports;
        struct flow_dissector_key_icmp icmp;
        /* 'addrs' must be the last member */
        struct flow_dissector_key_addrs addrs;
};

/* Byte offset within struct flow_keys of the FLOW_KEYS_HASH_START_FIELD member */
#define FLOW_KEYS_HASH_OFFSET                \
        offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)

/*
 * Declarations only; the definitions are outside this header.
 * NOTE(review): the names suggest these return a 32-bit big-endian value
 * derived from the source / destination part of @flow - confirm against
 * the implementation.
 */
__be32 flow_get_u32_src(const struct flow_keys *flow);
__be32 flow_get_u32_dst(const struct flow_keys *flow);

/* Dissector instances defined elsewhere */
extern struct flow_dissector flow_keys_dissector;
extern struct flow_dissector flow_keys_basic_dissector;

/* struct flow_keys_digest:
 *
 * This structure is used to hold a digest of the full flow keys. This is a
 * larger "hash" of a flow to allow definitively matching specific flows where
 * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so
 * that it can be used in CB of skb (see sch_choke for an example).
 *
 * @data holds the FLOW_KEYS_DIGEST_LEN digest bytes.
 */
#define FLOW_KEYS_DIGEST_LEN        16
struct flow_keys_digest {
        u8        data[FLOW_KEYS_DIGEST_LEN];
};

/* Fills @digest from @flow; defined outside this header. */
void make_flow_keys_digest(struct flow_keys_digest *digest,
                           const struct flow_keys *flow);

static inline bool flow_keys_have_l4(const struct flow_keys *keys)
{
        return (keys->ports.ports || keys->tags.flow_label);
}

/*
 * Declarations only; the definitions are outside this header.
 * NOTE(review): presumably these compute a 32-bit hash over @keys, the
 * second with a caller-supplied siphash key - confirm against the
 * implementation. @keys is not const, so callers should assume it may be
 * modified.
 */
u32 flow_hash_from_keys(struct flow_keys *keys);
u32 flow_hash_from_keys_seed(struct flow_keys *keys,
                             const siphash_key_t *keyval);
/*
 * NOTE(review): presumably fills @key_icmp from the packet at offset
 * @thoff within @data / @skb - confirm against the implementation.
 */
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
                           const void *data, int thoff, int hlen);

/*
 * Tests whether the bit for @key_id is set in @flow_dissector->used_keys.
 * Returns true if it is set, false otherwise.
 */
static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector,
                                      enum flow_dissector_key_id key_id)
{
        const unsigned long long key_bit = 1ULL << key_id;

        return (flow_dissector->used_keys & key_bit) != 0;
}

/*
 * Returns the address inside @target_container where the structure for
 * @key_id is stored: the container address plus the byte offset recorded
 * in @flow_dissector->offset[@key_id]. No bounds check is done on @key_id.
 */
static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
                                              enum flow_dissector_key_id key_id,
                                              void *target_container)
{
        char *container_base = target_container;

        return container_base + flow_dissector->offset[key_id];
}

/*
 * Bundles a bpf_flow_keys pointer, an skb pointer and a pair of data
 * pointers.
 * NOTE(review): presumably this is the context handed to a BPF flow
 * dissector program, with [data, data_end) delimiting the packet bytes -
 * confirm against the users of this structure.
 */
struct bpf_flow_dissector {
        struct bpf_flow_keys        *flow_keys;
        const struct sk_buff        *skb;
        const void                *data;
        const void                *data_end;
};

/*
 * Zero-fills both key structures (every byte, via memset) so that a
 * dissection starts from a clean state.
 */
static inline void
flow_dissector_init_keys(struct flow_dissector_key_control *key_control,
                         struct flow_dissector_key_basic *key_basic)
{
        memset(key_basic, 0, sizeof *key_basic);
        memset(key_control, 0, sizeof *key_control);
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Only declared when CONFIG_BPF_SYSCALL is enabled; defined elsewhere.
 * NOTE(review): presumably returns 0 when @prog may be attached in @net
 * and a negative errno otherwise - confirm against the implementation.
 */
int flow_dissector_bpf_prog_attach_check(struct net *net,
                                         struct bpf_prog *prog);
#endif /* CONFIG_BPF_SYSCALL */

#endif
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  419 




  419 














































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Procedures for maintaining information about logical memory blocks.
 *
 * Peter Bergner, IBM Corp.        June 2001.
 * Copyright (C) 2001 Peter Bergner.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/poison.h>
#include <linux/pfn.h>
#include <linux/debugfs.h>
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/mutex.h>

#include <asm/sections.h>
#include <linux/io.h>

#include "internal.h"

#define INIT_MEMBLOCK_REGIONS                        128
#define INIT_PHYSMEM_REGIONS                        4

#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
# define INIT_MEMBLOCK_RESERVED_REGIONS                INIT_MEMBLOCK_REGIONS
#endif

#ifndef INIT_MEMBLOCK_MEMORY_REGIONS
#define INIT_MEMBLOCK_MEMORY_REGIONS                INIT_MEMBLOCK_REGIONS
#endif

/**
 * DOC: memblock overview
 *
 * Memblock is a method of managing memory regions during the early
 * boot period when the usual kernel memory allocators are not up and
 * running.
 *
 * Memblock views the system memory as collections of contiguous
 * regions. There are several types of these collections:
 *
 * * ``memory`` - describes the physical memory available to the
 *   kernel; this may differ from the actual physical memory installed
 *   in the system, for instance when the memory is restricted with
 *   ``mem=`` command line parameter
 * * ``reserved`` - describes the regions that were allocated
 * * ``physmem`` - describes the actual physical memory available during
 *   boot regardless of the possible restrictions and memory hot(un)plug;
 *   the ``physmem`` type is only available on some architectures.
 *
 * Each region is represented by struct memblock_region that
 * defines the region extents, its attributes and NUMA node id on NUMA
 * systems. Every memory type is described by the struct memblock_type
 * which contains an array of memory regions along with
 * the allocator metadata. The "memory" and "reserved" types are nicely
 * wrapped with struct memblock. This structure is statically
 * initialized at build time. The region arrays are initially sized to
 * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and
 * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array
 * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS.
 * The memblock_allow_resize() enables automatic resizing of the region
 * arrays during addition of new regions. This feature should be used
 * with care so that memory allocated for the region array will not
 * overlap with areas that should be reserved, for example initrd.
 *
 * The early architecture setup should tell memblock what the physical
 * memory layout is by using memblock_add() or memblock_add_node()
 * functions. The first function does not assign the region to a NUMA
 * node and it is appropriate for UMA systems. Yet, it is possible to
 * use it on NUMA systems as well and assign the region to a NUMA node
 * later in the setup process using memblock_set_node(). The
 * memblock_add_node() performs such an assignment directly.
 *
 * Once memblock is set up, the memory can be allocated using one of the
 * API variants:
 *
 * * memblock_phys_alloc*() - these functions return the **physical**
 *   address of the allocated memory
 * * memblock_alloc*() - these functions return the **virtual** address
 *   of the allocated memory.
 *
 * Note that both API variants use implicit assumptions about allowed
 * memory ranges and the fallback methods. Consult the documentation
 * of memblock_alloc_internal() and memblock_alloc_range_nid()
 * functions for more elaborate description.
 *
 * As the system boot progresses, the architecture specific mem_init()
 * function frees all the memory to the buddy page allocator.
 *
 * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
 * memblock data structures (except "physmem") will be discarded after the
 * system initialization completes.
 */

#ifndef CONFIG_NUMA
/*
 * Only defined (and exported) when CONFIG_NUMA is not set.
 * NOTE(review): presumably the single node descriptor used on non-NUMA
 * builds - confirm against the users of contig_page_data.
 */
struct pglist_data __refdata contig_page_data;
EXPORT_SYMBOL(contig_page_data);
#endif

/*
 * Global page frame number bounds; they are only defined here, not set.
 * NOTE(review): the names suggest lowest/highest low-memory pfn, highest
 * pfn, and highest possible pfn - confirm against the code that sets them.
 * max_possible_pfn is deliberately wider (unsigned long long) than the rest.
 */
unsigned long max_low_pfn;
unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;

/*
 * Statically allocated initial region arrays. They back the "memory",
 * "reserved" and (with CONFIG_HAVE_MEMBLOCK_PHYS_MAP) "physmem" types
 * through the initializers of 'memblock' and 'physmem'. Unlike the first
 * two, the physmem array is not marked __initdata_memblock.
 */
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
#endif

/*
 * The global memblock state, initialized at build time: the "memory" and
 * "reserved" types point at their static region arrays, allocation runs
 * top-down (bottom_up is false) and the allocation limit starts out as
 * MEMBLOCK_ALLOC_ANYWHERE. Members not named here are zero-initialized.
 */
struct memblock memblock __initdata_memblock = {
        .memory.regions                = memblock_memory_init_regions,
        .memory.max                = INIT_MEMBLOCK_MEMORY_REGIONS,
        .memory.name                = "memory",

        .reserved.regions        = memblock_reserved_init_regions,
        .reserved.max                = INIT_MEMBLOCK_RESERVED_REGIONS,
        .reserved.name                = "reserved",

        .bottom_up                = false,
        .current_limit                = MEMBLOCK_ALLOC_ANYWHERE,
};

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
/*
 * The "physmem" type, available only with CONFIG_HAVE_MEMBLOCK_PHYS_MAP.
 * It is a standalone memblock_type backed by its own static array and is
 * not a member of struct memblock.
 */
struct memblock_type physmem = {
        .regions                = memblock_physmem_init_regions,
        .max                        = INIT_PHYSMEM_REGIONS,
        .name                        = "physmem",
};
#endif

/*
 * Keep a pointer to &memblock.memory in the text section so that
 * __next_mem_range() and its helpers can use it.
 * For architectures that do not keep memblock data after init, this
 * pointer will be reset to NULL at memblock_discard().
 */
static __refdata struct memblock_type *memblock_memory = &memblock.memory;

/*
 * for_each_memblock_type - walk every region of a memblock_type
 * @i: index variable (lvalue); runs from 0 up to @memblock_type->cnt - 1
 * @memblock_type: pointer to the struct memblock_type to iterate over
 * @rgn: region pointer variable (lvalue); set to &regions[@i] on each pass
 *
 * Every argument is parenthesized in the expansion, so an expression such
 * as &memblock.memory may be passed for @memblock_type. Note that @i and
 * @memblock_type are evaluated more than once, so they must not have side
 * effects. When the loop ends, @rgn points one element past the last
 * region and must not be dereferenced.
 */
#define for_each_memblock_type(i, memblock_type, rgn)                        \
        for ((i) = 0, (rgn) = &(memblock_type)->regions[0];                \
             (i) < (memblock_type)->cnt;                                \
             (i)++, (rgn) = &(memblock_type)->regions[i])

/*
 * Emit a pr_info() message only when memblock_debug is non-zero. Wrapped
 * in do { } while (0) so it behaves as a single statement.
 */
#define memblock_dbg(fmt, ...)                                                \
        do {                                                                \
                if (memblock_debug)                                        \
                        pr_info(fmt, ##__VA_ARGS__);                        \
        } while (0)

/* non-zero enables memblock_dbg() output */
static int memblock_debug __initdata_memblock;
/* reported by memblock_has_mirror(); selects MEMBLOCK_MIRROR in choose_memblock_flags() */
static bool system_has_some_mirror __initdata_memblock;
/* NOTE(review): presumably set by memblock_allow_resize() to permit growing the region arrays - confirm */
static int memblock_can_resize __initdata_memblock;
/* NOTE(review): presumably record whether the respective region array was allocated from slab - confirm */
static int memblock_memory_in_slab __initdata_memblock;
static int memblock_reserved_in_slab __initdata_memblock;

/* Returns the current value of the system_has_some_mirror flag. */
bool __init_memblock memblock_has_mirror(void)
{
        return system_has_some_mirror;
}

/*
 * Pick the default allocation flags: MEMBLOCK_MIRROR when
 * system_has_some_mirror is set, MEMBLOCK_NONE otherwise.
 */
static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
        if (system_has_some_mirror)
                return MEMBLOCK_MIRROR;

        return MEMBLOCK_NONE;
}

/*
 * Clamp *@size so that (@base + *@size) cannot exceed PHYS_ADDR_MAX.
 * The clamped value is written back through @size and also returned.
 */
static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size)
{
        phys_addr_t room = PHYS_ADDR_MAX - base;

        *size = min(*size, room);
        return *size;
}

/*
 * Address comparison utilities
 */

/*
 * Returns non-zero when the half-open ranges [@base1, @base1 + @size1) and
 * [@base2, @base2 + @size2) intersect, zero otherwise. The end addresses
 * are computed without overflow checks; memblock_overlaps_region() caps
 * its own size with memblock_cap_size() before calling this.
 */
unsigned long __init_memblock
memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2,
                       phys_addr_t size2)
{
        phys_addr_t end1 = base1 + size1;
        phys_addr_t end2 = base2 + size2;

        return base1 < end2 && base2 < end1;
}

/*
 * Returns true when [@base, @base + @size) intersects any region of
 * @type, false otherwise. @size is first capped so that @base + @size
 * does not overflow.
 */
bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
                                        phys_addr_t base, phys_addr_t size)
{
        unsigned long idx = 0;

        memblock_cap_size(base, &size);

        while (idx < type->cnt) {
                struct memblock_region *rgn = &type->regions[idx];

                if (memblock_addrs_overlap(base, size, rgn->base, rgn->size))
                        return true;
                idx++;
        }

        return false;
}

/**
 * __memblock_find_range_bottom_up - find free area utility in bottom-up
 * @start: start of candidate range
 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
 *       %MEMBLOCK_ALLOC_ACCESSIBLE
 * @size: size of free area to find
 * @align: alignment of free area to find
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 * @flags: pick from blocks based on memory attributes
 *
 * Utility called from memblock_find_in_range_node(). Walks the free ranges
 * in ascending order and returns the first @align-aligned address at which
 * @size bytes fit inside both the free range and [@start, @end].
 *
 * Return:
 * Found address on success, 0 on failure.
 */
static phys_addr_t __init_memblock
__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
                                phys_addr_t size, phys_addr_t align, int nid,
                                enum memblock_flags flags)
{
        phys_addr_t lo, hi, addr;
        u64 idx;

        for_each_free_mem_range(idx, nid, flags, &lo, &hi, NULL) {
                /* restrict the free range to the [start, end] window */
                lo = clamp(lo, start, end);
                hi = clamp(hi, start, end);

                /* lowest aligned address inside the clamped range */
                addr = round_up(lo, align);
                if (addr >= hi)
                        continue;
                if (hi - addr < size)
                        continue;

                return addr;
        }

        return 0;
}

/**
 * __memblock_find_range_top_down - find free area utility, in top-down
 * @start: start of candidate range
 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
 *       %MEMBLOCK_ALLOC_ACCESSIBLE
 * @size: size of free area to find
 * @align: alignment of free area to find
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 * @flags: pick from blocks based on memory attributes
 *
 * Utility called from memblock_find_in_range_node(). Walks the free ranges
 * in reverse order and returns the highest @align-aligned address at which
 * @size bytes fit inside both the free range and [@start, @end].
 *
 * Return:
 * Found address on success, 0 on failure.
 */
static phys_addr_t __init_memblock
__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
                               phys_addr_t size, phys_addr_t align, int nid,
                               enum memblock_flags flags)
{
        phys_addr_t lo, hi, addr;
        u64 idx;

        for_each_free_mem_range_reverse(idx, nid, flags, &lo, &hi,
                                        NULL) {
                /* restrict the free range to the [start, end] window */
                lo = clamp(lo, start, end);
                hi = clamp(hi, start, end);

                /* only subtract @size when that cannot underflow */
                if (hi >= size) {
                        addr = round_down(hi - size, align);
                        if (addr >= lo)
                                return addr;
                }
        }

        return 0;
}

/**
 * memblock_find_in_range_node - find free area in given range and node
 * @size: size of free area to find
 * @align: alignment of free area to find
 * @start: start of candidate range
 * @end: end of candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
 *       %MEMBLOCK_ALLOC_ACCESSIBLE
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 * @flags: pick from blocks based on memory attributes
 *
 * Find @size free area aligned to @align in the specified range and node.
 * An @end of %MEMBLOCK_ALLOC_ACCESSIBLE or %MEMBLOCK_ALLOC_NOLEAKTRACE is
 * replaced by memblock.current_limit, and @start is raised to at least
 * %PAGE_SIZE so the first page is never returned. The search direction
 * follows memblock_bottom_up().
 *
 * Return:
 * Found address on success, 0 on failure.
 */
static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
                                        phys_addr_t align, phys_addr_t start,
                                        phys_addr_t end, int nid,
                                        enum memblock_flags flags)
{
        bool use_current_limit = (end == MEMBLOCK_ALLOC_ACCESSIBLE) ||
                                 (end == MEMBLOCK_ALLOC_NOLEAKTRACE);

        /* the special @end values stand for the current allocation limit */
        if (use_current_limit)
                end = memblock.current_limit;

        /* never hand out the first page, and keep end >= start */
        start = max_t(phys_addr_t, start, PAGE_SIZE);
        end = max(start, end);

        if (!memblock_bottom_up())
                return __memblock_find_range_top_down(start, end, size, align,
                                                      nid, flags);

        return __memblock_find_range_bottom_up(start, end, size, align,
                                               nid, flags);
}

/**
 * memblock_find_in_range - locate a free area within a range on any node
 * @start: lower bound of the candidate range
 * @end: upper bound of the candidate range, can be %MEMBLOCK_ALLOC_ANYWHERE or
 *       %MEMBLOCK_ALLOC_ACCESSIBLE
 * @size: number of bytes required
 * @align: required alignment of the returned address
 *
 * Searches with the flags returned by choose_memblock_flags().  If that
 * search fails while %MEMBLOCK_MIRROR is set, a warning is printed and the
 * search is repeated with the mirror requirement dropped.
 *
 * Return:
 * Found address on success, 0 on failure.
 */
static phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
                                        phys_addr_t end, phys_addr_t size,
                                        phys_addr_t align)
{
        enum memblock_flags flags = choose_memblock_flags();
        phys_addr_t found;

        for (;;) {
                found = memblock_find_in_range_node(size, align, start, end,
                                                    NUMA_NO_NODE, flags);

                /* done on success, or when there is no mirror flag to drop */
                if (found || !(flags & MEMBLOCK_MIRROR))
                        return found;

                pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
                        &size);
                flags &= ~MEMBLOCK_MIRROR;
        }
}

/*
 * Drop the region at index @r from @type: subtract its size from the running
 * total, close the gap in the array, and decrement the count.  When the last
 * region is removed, slot 0 is reset to an empty placeholder.
 */
static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
        struct memblock_region *victim = &type->regions[r];

        type->total_size -= victim->size;

        /* slide every entry above @r down by one slot */
        memmove(victim, victim + 1,
                (type->cnt - (r + 1)) * sizeof(*victim));
        type->cnt--;

        if (type->cnt != 0)
                return;

        /* the array just became empty: re-create the empty placeholder */
        WARN_ON(type->total_size != 0);
        type->regions[0].base = 0;
        type->regions[0].size = 0;
        type->regions[0].flags = 0;
        memblock_set_region_node(&type->regions[0], MAX_NUMNODES);
}

#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
/**
 * memblock_discard - release the reserved and memory region arrays
 *
 * An array is released only if it is no longer the static initial array:
 * with kfree() when the matching *_in_slab flag is set, otherwise with
 * memblock_free_late() on its page-aligned size.  Finally the
 * memblock_memory pointer is cleared.
 */
void __init memblock_discard(void)
{
        phys_addr_t addr, size;

        if (memblock.reserved.regions != memblock_reserved_init_regions) {
                size = PAGE_ALIGN(memblock.reserved.max *
                                  sizeof(struct memblock_region));
                addr = __pa(memblock.reserved.regions);
                if (!memblock_reserved_in_slab)
                        memblock_free_late(addr, size);
                else
                        kfree(memblock.reserved.regions);
        }

        if (memblock.memory.regions != memblock_memory_init_regions) {
                size = PAGE_ALIGN(memblock.memory.max *
                                  sizeof(struct memblock_region));
                addr = __pa(memblock.memory.regions);
                if (!memblock_memory_in_slab)
                        memblock_free_late(addr, size);
                else
                        kfree(memblock.memory.regions);
        }

        memblock_memory = NULL;
}
#endif

/**
 * memblock_double_array - double the size of the memblock regions array
 * @type: memblock type of the regions array being doubled
 * @new_area_start: starting address of memory range to avoid overlap with
 * @new_area_size: size of memory range to avoid overlap with
 *
 * Double the size of the @type regions array. If memblock is being used to
 * allocate memory for a new reserved regions array and there is a previously
 * allocated memory range [@new_area_start, @new_area_start + @new_area_size]
 * waiting to be reserved, ensure the memory used by the new array does
 * not overlap.
 *
 * The new array is obtained with kmalloc() when slab_is_available() reports
 * true, otherwise it is carved out of memblock itself and then reserved.
 * Panics if memblock_can_resize is not set.
 *
 * Return:
 * 0 on success, -1 on failure.
 */
static int __init_memblock memblock_double_array(struct memblock_type *type,
                                                phys_addr_t new_area_start,
                                                phys_addr_t new_area_size)
{
        struct memblock_region *new_array, *old_array;
        phys_addr_t old_alloc_size, new_alloc_size;
        phys_addr_t old_size, new_size, addr, new_end;
        /* sampled once so the allocation path and the bookkeeping below agree */
        int use_slab = slab_is_available();
        int *in_slab;

        /*
         * Resizing is not allowed until we know about the reserved regions
         * of memory that aren't suitable for allocation.
         */
        if (!memblock_can_resize)
                panic("memblock: cannot resize %s array\n", type->name);

        /* Calculate new doubled size */
        old_size = type->max * sizeof(struct memblock_region);
        new_size = old_size << 1;
        /*
         * The new array must be allocated in PAGE_SIZE-aligned units so that
         * it can be freed completely later.
         */
        old_alloc_size = PAGE_ALIGN(old_size);
        new_alloc_size = PAGE_ALIGN(new_size);

        /* Retrieve the slab flag */
        if (type == &memblock.memory)
                in_slab = &memblock_memory_in_slab;
        else
                in_slab = &memblock_reserved_in_slab;

        /* Try to find some space for it */
        if (use_slab) {
                new_array = kmalloc(new_size, GFP_KERNEL);
                /* addr doubles as the success indicator tested below */
                addr = new_array ? __pa(new_array) : 0;
        } else {
                /* only exclude range when trying to double reserved.regions */
                if (type != &memblock.reserved)
                        new_area_start = new_area_size = 0;

                /* first look above the range that must be avoided ... */
                addr = memblock_find_in_range(new_area_start + new_area_size,
                                                memblock.current_limit,
                                                new_alloc_size, PAGE_SIZE);
                /* ... and, if there is such a range, fall back to below it */
                if (!addr && new_area_size)
                        addr = memblock_find_in_range(0,
                                min(new_area_start, memblock.current_limit),
                                new_alloc_size, PAGE_SIZE);

                new_array = addr ? __va(addr) : NULL;
        }
        if (!addr) {
                pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
                       type->name, type->max, type->max * 2);
                return -1;
        }

        /* inclusive end address, used only for the debug message */
        new_end = addr + new_size - 1;
        memblock_dbg("memblock: %s is doubled to %ld at [%pa-%pa]",
                        type->name, type->max * 2, &addr, &new_end);

        /*
         * Found space, we now need to move the array over before we add the
         * reserved region since it may be our reserved array itself that is
         * full.
         */
        memcpy(new_array, type->regions, old_size);
        /* zero the upper half, i.e. the entries that did not exist before */
        memset(new_array + type->max, 0, old_size);
        old_array = type->regions;
        type->regions = new_array;
        type->max <<= 1;

        /*
         * Free old array. We needn't free it if the array is the static one.
         * *in_slab still holds the value recorded for the old array here; it
         * is only updated at the end of this function.
         */
        if (*in_slab)
                kfree(old_array);
        else if (old_array != memblock_memory_init_regions &&
                 old_array != memblock_reserved_init_regions)
                memblock_free(old_array, old_alloc_size);

        /*
         * Reserve the new array if that comes from the memblock.  Otherwise, we
         * needn't do it
         */
        if (!use_slab)
                BUG_ON(memblock_reserve(addr, new_alloc_size));

        /* Update slab flag */
        *in_slab = use_slab;

        return 0;
}

/**
 * memblock_merge_regions - coalesce adjacent compatible regions
 * @type: memblock type whose regions array is scanned
 * @start_rgn: scanning begins at index (@start_rgn - 1), or 0 if @start_rgn is 0
 * @end_rgn: scanning stops before this index (capped at the last region)
 *
 * Two neighbours are merged when the first ends exactly where the second
 * begins and both carry the same node id and the same flags.  Neighbours
 * that are not merged must not overlap; that is enforced with BUG_ON().
 */
static void __init_memblock memblock_merge_regions(struct memblock_type *type,
                                                   unsigned long start_rgn,
                                                   unsigned long end_rgn)
{
        int i = start_rgn ? start_rgn - 1 : 0;

        end_rgn = min(end_rgn, type->cnt - 1);
        while (i < end_rgn) {
                struct memblock_region *cur = &type->regions[i];
                struct memblock_region *nxt = &type->regions[i + 1];

                if (cur->base + cur->size == nxt->base &&
                    memblock_get_region_node(cur) ==
                    memblock_get_region_node(nxt) &&
                    cur->flags == nxt->flags) {
                        /* absorb nxt into cur and stay on the same index */
                        cur->size += nxt->size;
                        /* entries from index i + 2 onwards move down one slot */
                        memmove(nxt, nxt + 1,
                                (type->cnt - (i + 2)) * sizeof(*nxt));
                        type->cnt--;
                        end_rgn--;
                        continue;
                }

                /* not mergeable: the pair must at least be non-overlapping */
                BUG_ON(cur->base + cur->size > nxt->base);
                i++;
        }
}

/**
 * memblock_insert_region - place a new region at a given array index
 * @type:        memblock type receiving the region
 * @idx:        array index at which the region is inserted
 * @base:        base address of the new region
 * @size:        size of the new region
 * @nid:        node id of the new region
 * @flags:        flags of the new region
 *
 * Insert new memblock region [@base, @base + @size) into @type at @idx,
 * shifting the entries from @idx onwards up by one.  @type must already
 * have a spare slot; this is enforced with BUG_ON().
 */
static void __init_memblock memblock_insert_region(struct memblock_type *type,
                                                   int idx, phys_addr_t base,
                                                   phys_addr_t size,
                                                   int nid,
                                                   enum memblock_flags flags)
{
        struct memblock_region *slot;

        BUG_ON(type->cnt >= type->max);

        /* open a hole at @idx */
        slot = &type->regions[idx];
        memmove(slot + 1, slot, (type->cnt - idx) * sizeof(*slot));

        /* fill it in */
        slot->base = base;
        slot->size = size;
        slot->flags = flags;
        memblock_set_region_node(slot, nid);

        /* account for the new entry */
        type->total_size += size;
        type->cnt++;
}

/**
 * memblock_add_range - add new memblock region
 * @type: memblock type to add new region into
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: nid of the new region
 * @flags: flags of the new region
 *
 * Add new memblock region [@base, @base + @size) into @type.  The new region
 * is allowed to overlap with existing ones - overlaps don't affect already
 * existing regions.  @type is guaranteed to be minimal (all neighbouring
 * compatible regions are merged) after the addition.
 *
 * Only the parts of the new range that are not already covered by an
 * existing region are inserted.  If nothing needs inserting, the function
 * returns 0 without modifying @type.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
static int __init_memblock memblock_add_range(struct memblock_type *type,
                                phys_addr_t base, phys_addr_t size,
                                int nid, enum memblock_flags flags)
{
        bool insert = false;
        /* @base is advanced during each scan; keep the original for the rescan */
        phys_addr_t obase = base;
        /* memblock_cap_size() may adjust @size through the pointer */
        phys_addr_t end = base + memblock_cap_size(base, &size);
        /* start_rgn == -1 means "no region inserted yet" */
        int idx, nr_new, start_rgn = -1, end_rgn;
        struct memblock_region *rgn;

        if (!size)
                return 0;

        /* special case for empty array */
        if (type->regions[0].size == 0) {
                WARN_ON(type->cnt != 0 || type->total_size);
                type->regions[0].base = base;
                type->regions[0].size = size;
                type->regions[0].flags = flags;
                memblock_set_region_node(&type->regions[0], nid);
                type->total_size = size;
                type->cnt = 1;
                return 0;
        }

        /*
         * The worst case is when new range overlaps all existing regions,
         * then we'll need type->cnt + 1 empty regions in @type. So if
         * type->cnt * 2 + 1 is less than or equal to type->max, we know
         * that there is enough empty regions in @type, and we can insert
         * regions directly.
         */
        if (type->cnt * 2 + 1 <= type->max)
                insert = true;

repeat:
        /*
         * The following is executed twice.  Once with %false @insert and
         * then with %true.  The first counts the number of regions needed
         * to accommodate the new area.  The second actually inserts them.
         * (When @insert was already set above, a single pass does the
         * insertion directly.)
         */
        base = obase;
        nr_new = 0;

        for_each_memblock_type(idx, type, rgn) {
                phys_addr_t rbase = rgn->base;
                phys_addr_t rend = rbase + rgn->size;

                /* region lies entirely above the new range: nothing more to do */
                if (rbase >= end)
                        break;
                /* region lies entirely below what is left of the new range */
                if (rend <= base)
                        continue;
                /*
                 * @rgn overlaps.  If it separates the lower part of new
                 * area, insert that portion.
                 */
                if (rbase > base) {
#ifdef CONFIG_NUMA
                        WARN_ON(nid != memblock_get_region_node(rgn));
#endif
                        WARN_ON(flags != rgn->flags);
                        nr_new++;
                        if (insert) {
                                if (start_rgn == -1)
                                        start_rgn = idx;
                                end_rgn = idx + 1;
                                /*
                                 * The insertion shifts @rgn up by one slot;
                                 * idx++ keeps idx pointing at it.
                                 */
                                memblock_insert_region(type, idx++, base,
                                                       rbase - base, nid,
                                                       flags);
                        }
                }
                /* area below @rend is dealt with, forget about it */
                base = min(rend, end);
        }

        /* insert the remaining portion */
        if (base < end) {
                nr_new++;
                if (insert) {
                        if (start_rgn == -1)
                                start_rgn = idx;
                        end_rgn = idx + 1;
                        memblock_insert_region(type, idx, base, end - base,
                                               nid, flags);
                }
        }

        /* the whole new range was already covered by existing regions */
        if (!nr_new)
                return 0;

        /*
         * If this was the first round, resize array and repeat for actual
         * insertions; otherwise, merge and return.
         */
        if (!insert) {
                /* grow until all nr_new additional regions fit */
                while (type->cnt + nr_new > type->max)
                        if (memblock_double_array(type, obase, size) < 0)
                                return -ENOMEM;
                insert = true;
                goto repeat;
        } else {
                memblock_merge_regions(type, start_rgn, end_rgn);
                return 0;
        }
}

/**
 * memblock_add_node - register a memory range that belongs to a NUMA node
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: nid of the new region
 * @flags: flags of the new region
 *
 * Add new memblock region [@base, @base + @size) to the "memory"
 * type. See memblock_add_range() description for more details.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
                                      int nid, enum memblock_flags flags)
{
        phys_addr_t last;
        int err;

        /* inclusive end address, only needed for the debug message */
        last = base + size - 1;
        memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
                     &base, &last, nid, flags, (void *)_RET_IP_);

        err = memblock_add_range(&memblock.memory, base, size, nid, flags);
        return err;
}

/**
 * memblock_add - register a memory range without a node assignment
 * @base: base address of the new region
 * @size: size of the new region
 *
 * Add new memblock region [@base, @base + @size) to the "memory"
 * type, passing %MAX_NUMNODES as the node id and no flags. See
 * memblock_add_range() description for more details.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
        phys_addr_t last;
        int err;

        /* inclusive end address, only needed for the debug message */
        last = base + size - 1;
        memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
                     &base, &last, (void *)_RET_IP_);

        err = memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
        return err;
}

/**
 * memblock_validate_numa_coverage - check that memory lacking a node ID
 * does not exceed a threshold
 * @threshold_bytes: maximal memory size that can have unassigned node
 * ID (in bytes).
 *
 * A buggy firmware may report memory that does not belong to any node.
 * Sum up all such memory and compare it against @threshold_bytes; an
 * error is logged when the threshold is exceeded.
 *
 * Return: true on success, false on failure.
 */
bool __init_memblock memblock_validate_numa_coverage(unsigned long threshold_bytes)
{
        unsigned long pfn_lo, pfn_hi, total_mb, orphan_bytes;
        unsigned long orphan_pages = 0;
        int node, iter;

        /* count the pages whose range has no valid node */
        for_each_mem_pfn_range(iter, MAX_NUMNODES, &pfn_lo, &pfn_hi, &node) {
                if (numa_valid_node(node))
                        continue;
                orphan_pages += pfn_hi - pfn_lo;
        }

        orphan_bytes = orphan_pages << PAGE_SHIFT;
        if (orphan_bytes <= threshold_bytes)
                return true;

        total_mb = memblock_phys_mem_size() >> 20;
        pr_err("NUMA: no nodes coverage for %luMB of %luMB RAM\n",
               orphan_bytes >> 20, total_mb);
        return false;
}


/**
 * memblock_isolate_range - isolate given range into disjoint memblocks
 * @type: memblock type to isolate range for
 * @base: base of range to isolate
 * @size: size of range to isolate
 * @start_rgn: out parameter for the start of isolated region
 * @end_rgn: out parameter for the end of isolated region
 *
 * Walk @type and ensure that regions don't cross the boundaries defined by
 * [@base, @base + @size).  Crossing regions are split at the boundaries,
 * which may create at most two more regions.  The index of the first
 * region inside the range is returned in *@start_rgn and the index of the
 * first region after the range is returned in *@end_rgn.
 *
 * Both out parameters are set to 0 when @size is zero or when no region
 * falls inside the range.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
static int __init_memblock memblock_isolate_range(struct memblock_type *type,
                                        phys_addr_t base, phys_addr_t size,
                                        int *start_rgn, int *end_rgn)
{
        /* memblock_cap_size() may adjust @size through the pointer */
        phys_addr_t end = base + memblock_cap_size(base, &size);
        int idx;
        struct memblock_region *rgn;

        *start_rgn = *end_rgn = 0;

        if (!size)
                return 0;

        /* we'll create at most two more regions */
        while (type->cnt + 2 > type->max)
                if (memblock_double_array(type, base, size) < 0)
                        return -ENOMEM;

        for_each_memblock_type(idx, type, rgn) {
                phys_addr_t rbase = rgn->base;
                phys_addr_t rend = rbase + rgn->size;

                /* region starts at or past the end of the range: done */
                if (rbase >= end)
                        break;
                /* region ends at or before the start of the range: skip */
                if (rend <= base)
                        continue;

                if (rbase < base) {
                        /*
                         * @rgn intersects from below.  Split and continue
                         * to process the next region - the new top half.
                         *
                         * total_size is reduced here because
                         * memblock_insert_region() adds the size of the
                         * inserted piece back.
                         */
                        rgn->base = base;
                        rgn->size -= base - rbase;
                        type->total_size -= base - rbase;
                        memblock_insert_region(type, idx, rbase, base - rbase,
                                               memblock_get_region_node(rgn),
                                               rgn->flags);
                } else if (rend > end) {
                        /*
                         * @rgn intersects from above.  Split and redo the
                         * current region - the new bottom half.
                         *
                         * The post-decrement of idx makes the next loop
                         * step land on the freshly inserted bottom half.
                         */
                        rgn->base = end;
                        rgn->size -= end - rbase;
                        type->total_size -= end - rbase;
                        memblock_insert_region(type, idx--, rbase, end - rbase,
                                               memblock_get_region_node(rgn),
                                               rgn->flags);
                } else {
                        /* @rgn is fully contained, record it */
                        /* *end_rgn is still 0 only for the first contained region */
                        if (!*end_rgn)
                                *start_rgn = idx;
                        *end_rgn = idx + 1;
                }
        }

        return 0;
}

/*
 * Remove [@base, @base + @size) from @type: isolate the range into whole
 * regions first, then delete every region that falls inside it.
 * Returns 0 on success or the error from memblock_isolate_range().
 */
static int __init_memblock memblock_remove_range(struct memblock_type *type,
                                          phys_addr_t base, phys_addr_t size)
{
        int first, past_last;
        int err;

        err = memblock_isolate_range(type, base, size, &first, &past_last);
        if (err)
                return err;

        /* delete from the top so the lower indices are not shifted */
        while (past_last > first)
                memblock_remove_region(type, --past_last);

        return 0;
}

/*
 * Remove [@base, @base + @size) from the "memory" type.
 * Returns the result of memblock_remove_range().
 */
int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
        phys_addr_t last;
        int err;

        /* inclusive end address, only needed for the debug message */
        last = base + size - 1;
        memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
                     &base, &last, (void *)_RET_IP_);

        err = memblock_remove_range(&memblock.memory, base, size);
        return err;
}

/**
 * memblock_free - release a boot memory allocation given by virtual address
 * @ptr: starting address of the boot memory allocation
 * @size: size of the boot memory block in bytes
 *
 * Free boot memory block previously allocated by memblock_alloc_xx() API.
 * The freed memory will not be released to the buddy allocator.
 * A %NULL @ptr is ignored.
 */
void __init_memblock memblock_free(void *ptr, size_t size)
{
        if (!ptr)
                return;

        memblock_phys_free(__pa(ptr), size);
}

/**
 * memblock_phys_free - release a boot memory block given by physical address
 * @base: phys starting address of the boot memory block
 * @size: size of the boot memory block in bytes
 *
 * Free boot memory block previously allocated by memblock_phys_alloc_xx() API.
 * The freed memory will not be released to the buddy allocator.
 *
 * kmemleak is notified first, then the range is removed from the
 * "reserved" type.
 *
 * Return: the result of memblock_remove_range().
 */
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
{
        phys_addr_t last;
        int err;

        /* inclusive end address, only needed for the debug message */
        last = base + size - 1;
        memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
                     &base, &last, (void *)_RET_IP_);

        kmemleak_free_part_phys(base, size);

        err = memblock_remove_range(&memblock.reserved, base, size);
        return err;
}

/*
 * Add [@base, @base + @size) to the "reserved" type, with %MAX_NUMNODES as
 * the node id and no flags.  Returns the result of memblock_add_range().
 */
int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
        phys_addr_t last;
        int err;

        /* inclusive end address, only needed for the debug message */
        last = base + size - 1;
        memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
                     &base, &last, (void *)_RET_IP_);

        err = memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
        return err;
}

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
/*
 * Add [@base, @base + @size) to the physmem type, with %MAX_NUMNODES as the
 * node id and no flags.  Returns the result of memblock_add_range().
 */
int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
{
        phys_addr_t last;
        int err;

        /* inclusive end address, only needed for the debug message */
        last = base + size - 1;
        memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
                     &base, &last, (void *)_RET_IP_);

        err = memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
        return err;
}
#endif

/**
 * memblock_setclr_flag - set or clear a flag on every region in a range
 * @type: memblock type to set/clear flag for
 * @base: base address of the region
 * @size: size of the region
 * @set: non-zero to set the flag, zero to clear it
 * @flag: the flag to update
 *
 * Isolates [@base, @base + @size) into whole regions, updates the flag on
 * each of them, and then merges neighbours that have become compatible.
 *
 * Return: 0 on success, -errno on failure.
 */
static int __init_memblock memblock_setclr_flag(struct memblock_type *type,
                                phys_addr_t base, phys_addr_t size, int set, int flag)
{
        int first, past_last, idx, err;

        err = memblock_isolate_range(type, base, size, &first, &past_last);
        if (err)
                return err;

        idx = first;
        while (idx < past_last) {
                struct memblock_region *rgn = &type->regions[idx++];

                if (!set)
                        rgn->flags &= ~flag;
                else
                        rgn->flags |= flag;
        }

        memblock_merge_regions(type, first, past_last);
        return 0;
}

/**
 * memblock_mark_hotplug - set %MEMBLOCK_HOTPLUG on a range of "memory"
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * Return: 0 on success, -errno on failure.
 */
int __init_memblock memblock_mark_hotplug(phys_addr_t base, phys_addr_t size)
{
        int err;

        err = memblock_setclr_flag(&memblock.memory, base, size, 1,
                                   MEMBLOCK_HOTPLUG);
        return err;
}

/**
 * memblock_clear_hotplug - clear %MEMBLOCK_HOTPLUG on a range of "memory"
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * Return: 0 on success, -errno on failure.
 */
int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
{
        int err;

        err = memblock_setclr_flag(&memblock.memory, base, size, 0,
                                   MEMBLOCK_HOTPLUG);
        return err;
}

/**
 * memblock_mark_mirror - set %MEMBLOCK_MIRROR on a range of "memory"
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * Does nothing and reports success when mirrored_kernelcore is not set.
 * Otherwise records that the system has some mirrored memory and marks
 * the range.
 *
 * Return: 0 on success, -errno on failure.
 */
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
        if (mirrored_kernelcore) {
                system_has_some_mirror = true;

                return memblock_setclr_flag(&memblock.memory, base, size, 1,
                                            MEMBLOCK_MIRROR);
        }

        return 0;
}

/**
 * memblock_mark_nomap - set %MEMBLOCK_NOMAP on a range of "memory"
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * The memory regions marked with %MEMBLOCK_NOMAP will not be added to the
 * direct mapping of the physical memory. These regions will still be
 * covered by the memory map. The struct page representing NOMAP memory
 * frames in the memory map will be PageReserved()
 *
 * Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from
 * memblock, the caller must inform kmemleak to ignore that memory
 *
 * Return: 0 on success, -errno on failure.
 */
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
{
        int err;

        err = memblock_setclr_flag(&memblock.memory, base, size, 1,
                                   MEMBLOCK_NOMAP);
        return err;
}

/**
 * memblock_clear_nomap - clear %MEMBLOCK_NOMAP on a range of "memory"
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * Return: 0 on success, -errno on failure.
 */
int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
{
        int err;

        err = memblock_setclr_flag(&memblock.memory, base, size, 0,
                                   MEMBLOCK_NOMAP);
        return err;
}

/**
 * memblock_reserved_mark_noinit - set %MEMBLOCK_RSRV_NOINIT on a range of
 * the "reserved" type
 * @base: the base phys addr of the region
 * @size: the size of the region
 *
 * struct pages will not be initialized for reserved memory regions marked with
 * %MEMBLOCK_RSRV_NOINIT.
 *
 * Return: 0 on success, -errno on failure.
 */
int __init_memblock memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size)
{
        int err;

        err = memblock_setclr_flag(&memblock.reserved, base, size, 1,
                                   MEMBLOCK_RSRV_NOINIT);
        return err;
}

/*
 * Decide whether region @m of @type must be left out of an iteration that
 * was requested for node @nid with attribute mask @flags.
 *
 * Only regions of the type that memblock_memory points to are ever
 * filtered.  For those, a region is skipped when any of these holds:
 *  - @nid is a valid node and differs from the region's node;
 *  - the region is hotpluggable, movable_node is enabled and
 *    %MEMBLOCK_HOTPLUG was not requested;
 *  - %MEMBLOCK_MIRROR was requested and the region is not mirrored;
 *  - the region is nomap and %MEMBLOCK_NOMAP was not requested;
 *  - the region is driver-managed and %MEMBLOCK_DRIVER_MANAGED was not
 *    requested.
 */
static bool should_skip_region(struct memblock_type *type,
                               struct memblock_region *m,
                               int nid, int flags)
{
        int m_nid = memblock_get_region_node(m);

        /* regions of any other type are never skipped */
        if (type != memblock_memory)
                return false;

        return (numa_valid_node(nid) && nid != m_nid) ||
               (movable_node_is_enabled() && memblock_is_hotpluggable(m) &&
                !(flags & MEMBLOCK_HOTPLUG)) ||
               ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m)) ||
               (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m)) ||
               (!(flags & MEMBLOCK_DRIVER_MANAGED) &&
                memblock_is_driver_managed(m));
}

/**
 * __next_mem_range - next function for for_each_free_mem_range() etc.
 * @idx: pointer to u64 loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @type_a: pointer to memblock_type from where the range is taken
 * @type_b: pointer to memblock_type which excludes memory from being taken
 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @out_nid: ptr to int for nid of the range, can be %NULL
 *
 * Find the first area from *@idx which matches @nid, fill the out
 * parameters, and update *@idx for the next iteration.  The lower 32bit of
 * *@idx contains index into type_a and the upper 32bit indexes the
 * areas before each region in type_b.  For example, if type_b regions
 * look like the following,
 *
 *        0:[0-16), 1:[32-48), 2:[128-130)
 *
 * The upper 32bit indexes the following regions.
 *
 *        0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX)
 *
 * As both region arrays are sorted, the function advances the two indices
 * in lockstep and returns each intersection.
 *
 * When @type_b is %NULL, each non-skipped region of @type_a is returned
 * as is.  *@idx is set to %ULLONG_MAX once the iteration is exhausted.
 */
void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
                      struct memblock_type *type_a,
                      struct memblock_type *type_b, phys_addr_t *out_start,
                      phys_addr_t *out_end, int *out_nid)
{
        /* unpack the two 32-bit cursors from the combined loop variable */
        int idx_a = *idx & 0xffffffff;
        int idx_b = *idx >> 32;

        for (; idx_a < type_a->cnt; idx_a++) {
                struct memblock_region *m = &type_a->regions[idx_a];

                phys_addr_t m_start = m->base;
                phys_addr_t m_end = m->base + m->size;
                int            m_nid = memblock_get_region_node(m);

                if (should_skip_region(type_a, m, nid, flags))
                        continue;

                /* no exclusion list: hand back the whole region */
                if (!type_b) {
                        if (out_start)
                                *out_start = m_start;
                        if (out_end)
                                *out_end = m_end;
                        if (out_nid)
                                *out_nid = m_nid;
                        idx_a++;
                        *idx = (u32)idx_a | (u64)idx_b << 32;
                        return;
                }

                /*
                 * scan areas before each reservation; the extra iteration at
                 * idx_b == type_b->cnt covers the area after the last one
                 */
                for (; idx_b < type_b->cnt + 1; idx_b++) {
                        struct memblock_region *r;
                        phys_addr_t r_start;
                        phys_addr_t r_end;

                        r = &type_b->regions[idx_b];
                        /* the gap starts where the previous type_b region ends */
                        r_start = idx_b ? r[-1].base + r[-1].size : 0;
                        /* and ends where this one begins, or at the maximum */
                        r_end = idx_b < type_b->cnt ?
                                r->base : PHYS_ADDR_MAX;

                        /*
                         * if idx_b advanced past idx_a,
                         * break out to advance idx_a
                         */
                        if (r_start >= m_end)
                                break;
                        /* if the two regions intersect, we're done */
                        if (m_start < r_end) {
                                if (out_start)
                                        *out_start =
                                                max(m_start, r_start);
                                if (out_end)
                                        *out_end = min(m_end, r_end);
                                if (out_nid)
                                        *out_nid = m_nid;
                                /*
                                 * The region which ends first is
                                 * advanced for the next iteration.
                                 */
                                if (m_end <= r_end)
                                        idx_a++;
                                else
                                        idx_b++;
                                /* repack both cursors for the next call */
                                *idx = (u32)idx_a | (u64)idx_b << 32;
                                return;
                        }
                }
        }

        /* signal end of iteration */
        *idx = ULLONG_MAX;
}

/**
 * __next_mem_range_rev - generic next function for for_each_*_range_rev()
 *
 * @idx: pointer to u64 loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @type_a: pointer to memblock_type from where the range is taken
 * @type_b: pointer to memblock_type which excludes memory from being taken
 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @out_nid: ptr to int for nid of the range, can be %NULL
 *
 * Finds the next range from type_a which is not marked as unsuitable
 * in type_b.
 *
 * Reverse of __next_mem_range().
 */
void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
                                          enum memblock_flags flags,
                                          struct memblock_type *type_a,
                                          struct memblock_type *type_b,
                                          phys_addr_t *out_start,
                                          phys_addr_t *out_end, int *out_nid)
{
        /*
         * *idx packs both cursors: the low 32 bits index type_a, the high
         * 32 bits index type_b.
         */
        int idx_a = *idx & 0xffffffff;
        int idx_b = *idx >> 32;

        /*
         * ULLONG_MAX marks a fresh iteration: start at the last type_a
         * region and, when type_b is given, at index type_b->cnt, i.e. the
         * gap that follows the last type_b region.
         */
        if (*idx == (u64)ULLONG_MAX) {
                idx_a = type_a->cnt - 1;
                if (type_b != NULL)
                        idx_b = type_b->cnt;
                else
                        idx_b = 0;
        }

        /* walk type_a from the highest index down to 0 */
        for (; idx_a >= 0; idx_a--) {
                struct memblock_region *m = &type_a->regions[idx_a];

                phys_addr_t m_start = m->base;
                phys_addr_t m_end = m->base + m->size;
                int m_nid = memblock_get_region_node(m);

                if (should_skip_region(type_a, m, nid, flags))
                        continue;

                /*
                 * No exclusion list: report the whole type_a region and
                 * store the next lower index for the following call.
                 */
                if (!type_b) {
                        if (out_start)
                                *out_start = m_start;
                        if (out_end)
                                *out_end = m_end;
                        if (out_nid)
                                *out_nid = m_nid;
                        idx_a--;
                        *idx = (u32)idx_a | (u64)idx_b << 32;
                        return;
                }

                /* scan areas before each reservation */
                for (; idx_b >= 0; idx_b--) {
                        struct memblock_region *r;
                        phys_addr_t r_start;
                        phys_addr_t r_end;

                        /*
                         * [r_start, r_end) is the gap in front of type_b
                         * region idx_b: it begins where region idx_b - 1
                         * ends (or at 0 for idx_b == 0) and stops at the
                         * base of region idx_b (or at PHYS_ADDR_MAX when
                         * idx_b == type_b->cnt).
                         */
                        r = &type_b->regions[idx_b];
                        r_start = idx_b ? r[-1].base + r[-1].size : 0;
                        r_end = idx_b < type_b->cnt ?
                                r->base : PHYS_ADDR_MAX;
                        /*
                         * if idx_b advanced past idx_a,
                         * break out to advance idx_a
                         */

                        if (r_end <= m_start)
                                break;
                        /* if the two regions intersect, we're done */
                        if (m_end > r_start) {
                                if (out_start)
                                        *out_start = max(m_start, r_start);
                                if (out_end)
                                        *out_end = min(m_end, r_end);
                                if (out_nid)
                                        *out_nid = m_nid;
                                /*
                                 * Step whichever cursor is exhausted: the
                                 * type_a region if it starts inside this
                                 * gap, otherwise the gap itself.
                                 */
                                if (m_start >= r_start)
                                        idx_a--;
                                else
                                        idx_b--;
                                *idx = (u32)idx_a | (u64)idx_b << 32;
                                return;
                        }
                }
        }
        /* signal end of iteration */
        *idx = ULLONG_MAX;
}

/*
 * Common iterator interface used to define for_each_mem_pfn_range().
 */
void __init_memblock __next_mem_pfn_range(int *idx, int nid,
                                unsigned long *out_start_pfn,
                                unsigned long *out_end_pfn, int *out_nid)
{
        struct memblock_type *mem = &memblock.memory;
        struct memblock_region *rgn;
        int rgn_nid;

        for (;;) {
                /* advance the caller's cursor; -1 reports exhaustion */
                ++*idx;
                if (*idx >= mem->cnt) {
                        *idx = -1;
                        return;
                }

                rgn = &mem->regions[*idx];
                rgn_nid = memblock_get_region_node(rgn);

                /* ignore regions that do not contain a complete page */
                if (PFN_UP(rgn->base) >= PFN_DOWN(rgn->base + rgn->size))
                        continue;

                /* accept on an invalid (wildcard) nid or an exact match */
                if (!numa_valid_node(nid) || nid == rgn_nid)
                        break;
        }

        /* report the page-aligned interior of the selected region */
        if (out_start_pfn)
                *out_start_pfn = PFN_UP(rgn->base);
        if (out_end_pfn)
                *out_end_pfn = PFN_DOWN(rgn->base + rgn->size);
        if (out_nid)
                *out_nid = rgn_nid;
}

/**
 * memblock_set_node - set node ID on memblock regions
 * @base: base of area to set node ID for
 * @size: size of area to set node ID for
 * @type: memblock type to set node ID for
 * @nid: node ID to set
 *
 * Set the nid of memblock @type regions in [@base, @base + @size) to @nid.
 * Regions which cross the area boundaries are split as necessary.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
                                      struct memblock_type *type, int nid)
{
#ifdef CONFIG_NUMA
        int first, past_last, rgn;
        int err;

        /* split regions so that [first, past_last) covers exactly the area */
        err = memblock_isolate_range(type, base, size, &first, &past_last);
        if (err)
                return err;

        rgn = first;
        while (rgn < past_last) {
                memblock_set_region_node(&type->regions[rgn], nid);
                rgn++;
        }

        /* neighbours may have become mergeable after the nid change */
        memblock_merge_regions(type, first, past_last);
#endif
        /* without CONFIG_NUMA this is a no-op that reports success */
        return 0;
}

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/**
 * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
 *
 * @idx: pointer to u64 loop variable
 * @zone: zone in which all of the memory blocks reside
 * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
 * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
 *
 * This function is meant to be a zone/pfn specific wrapper for the
 * for_each_mem_range type iterators. Specifically they are used in the
 * deferred memory init routines and as such we were duplicating much of
 * this logic throughout the code. So instead of having it in multiple
 * locations it seemed like it would make more sense to centralize this to
 * one new iterator that does everything they need.
 */
void __init_memblock
__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
                             unsigned long *out_spfn, unsigned long *out_epfn)
{
        int zone_nid = zone_to_nid(zone);
        phys_addr_t spa, epa;

        for (;;) {
                unsigned long spfn, epfn;

                /* fetch the next memory range not covered by a reservation */
                __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
                                 &memblock.memory, &memblock.reserved,
                                 &spa, &epa, NULL);
                if (*idx == U64_MAX)
                        break;

                epfn = PFN_DOWN(epa);
                spfn = PFN_UP(spa);

                /*
                 * Skip ranges that end at or before the zone start, and
                 * ranges that hold no complete page.
                 */
                if (epfn <= zone->zone_start_pfn || epfn <= spfn)
                        continue;

                /* the range starts at or after the zone end: stop searching */
                if (zone_end_pfn(zone) <= spfn) {
                        *idx = U64_MAX;
                        break;
                }

                /* clamp the range to the zone boundaries */
                if (out_spfn)
                        *out_spfn = max(zone->zone_start_pfn, spfn);
                if (out_epfn)
                        *out_epfn = min(zone_end_pfn(zone), epfn);

                return;
        }

        /* end of iteration: hand back an empty (start > end) range */
        if (out_spfn)
                *out_spfn = ULONG_MAX;
        if (out_epfn)
                *out_epfn = 0;
}

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/**
 * memblock_alloc_range_nid - allocate boot memory block
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @start: the lower bound of the memory region to allocate (phys address)
 * @end: the upper bound of the memory region to allocate (phys address)
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 * @exact_nid: control the allocation fall back to other nodes
 *
 * The allocation is performed from memory region limited by
 * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
 *
 * If the specified node can not hold the requested memory and @exact_nid
 * is false, the allocation falls back to any node in the system.
 *
 * For systems with memory mirroring, the allocation is attempted first
 * from the regions with mirroring enabled and then retried from any
 * memory region.
 *
 * In addition, the function calls kmemleak_alloc_phys() for the allocated
 * boot memory block, so it is never reported as a leak.
 *
 * Return:
 * Physical address of allocated memory block on success, %0 on failure.
 */
phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
                                        phys_addr_t align, phys_addr_t start,
                                        phys_addr_t end, int nid,
                                        bool exact_nid)
{
        enum memblock_flags flags = choose_memblock_flags();
        phys_addr_t found;

        /*
         * Using this API once slab is available is a caller bug: memblock
         * may already have been torn down (after memblock_free_all) and
         * its internal data destroyed. Warn once and serve the request
         * from slab instead.
         */
        if (WARN_ON_ONCE(slab_is_available())) {
                void *vaddr = kzalloc_node(size, GFP_NOWAIT, nid);

                if (!vaddr)
                        return 0;
                return virt_to_phys(vaddr);
        }

        if (!align) {
                /* Can't use WARNs this early in boot on powerpc */
                dump_stack();
                align = SMP_CACHE_BYTES;
        }

        /*
         * Each pass tries the requested node, then (unless @exact_nid) any
         * node. If both fail while MEMBLOCK_MIRROR is set, the flag is
         * dropped and one more pass is made.
         */
        for (;;) {
                found = memblock_find_in_range_node(size, align, start, end,
                                                    nid, flags);
                if (found && !memblock_reserve(found, size))
                        break;

                if (numa_valid_node(nid) && !exact_nid) {
                        found = memblock_find_in_range_node(size, align,
                                                            start, end,
                                                            NUMA_NO_NODE,
                                                            flags);
                        if (found && !memblock_reserve(found, size))
                                break;
                }

                /* nothing left to relax: report failure */
                if (!(flags & MEMBLOCK_MIRROR))
                        return 0;

                flags &= ~MEMBLOCK_MIRROR;
                pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
                        &size);
        }

        /*
         * Memblock allocated blocks are never reported as leaks, because
         * many of them are only referred to via the physical address,
         * which kmemleak does not look up. High-volume callers such as
         * kasan_init() and early_pgtable_alloc() pass
         * MEMBLOCK_ALLOC_NOLEAKTRACE to skip kmemleak entirely.
         */
        if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
                kmemleak_alloc_phys(found, size, 0);

        /*
         * Some Virtual Machine platforms, such as Intel TDX or AMD SEV-SNP,
         * require memory to be accepted before the guest can use it, so
         * accept the allocated buffer here.
         */
        accept_memory(found, size);

        return found;
}

/**
 * memblock_phys_alloc_range - allocate a memory block inside specified range
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @start: the lower bound of the memory region to allocate (physical address)
 * @end: the upper bound of the memory region to allocate (physical address)
 *
 * Allocate @size bytes between @start and @end.
 *
 * Return: physical address of the allocated memory block on success,
 * %0 on failure.
 */
phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
                                             phys_addr_t align,
                                             phys_addr_t start,
                                             phys_addr_t end)
{
        phys_addr_t pa;

        memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
                     __func__, (u64)size, (u64)align, &start, &end,
                     (void *)_RET_IP_);

        /* no node preference (NUMA_NO_NODE), exact_nid is false */
        pa = memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
                                      false);
        return pa;
}

/**
 * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 *
 * Allocates a memory block from the specified NUMA node. If the node
 * has no available memory, attempts to allocate from any node in the
 * system.
 *
 * Return: physical address of the allocated memory block on success,
 * %0 on failure.
 */
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
        phys_addr_t pa;

        /*
         * No lower bound, upper bound MEMBLOCK_ALLOC_ACCESSIBLE, and
         * exact_nid false so that other nodes may be used as a fallback.
         */
        pa = memblock_alloc_range_nid(size, align, 0,
                                      MEMBLOCK_ALLOC_ACCESSIBLE, nid, false);
        return pa;
}

/**
 * memblock_alloc_internal - allocate boot memory block
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @min_addr: the lower bound of the memory region to allocate (phys address)
 * @max_addr: the upper bound of the memory region to allocate (phys address)
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 * @exact_nid: control the allocation fall back to other nodes
 *
 * Allocates memory block using memblock_alloc_range_nid() and
 * converts the returned physical address to virtual.
 *
 * The @min_addr limit is dropped if it can not be satisfied and the allocation
 * will fall back to memory below @min_addr. Other constraints, such
 * as node and mirrored memory will be handled again in
 * memblock_alloc_range_nid().
 *
 * Return:
 * Virtual address of allocated memory block on success, NULL on failure.
 */
static void * __init memblock_alloc_internal(
                                phys_addr_t size, phys_addr_t align,
                                phys_addr_t min_addr, phys_addr_t max_addr,
                                int nid, bool exact_nid)
{
        phys_addr_t pa;

        /* never search above memblock.current_limit */
        max_addr = min(max_addr, memblock.current_limit);

        pa = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
                                      exact_nid);

        /* a non-zero lower bound that cannot be met is dropped for a retry */
        if (!pa && min_addr)
                pa = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
                                              exact_nid);

        /* physical address 0 is the failure indication */
        return pa ? phys_to_virt(pa) : NULL;
}

/**
 * memblock_alloc_exact_nid_raw - allocate boot memory block on the exact node
 * without zeroing memory
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @min_addr: the lower bound of the memory region from where the allocation
 *          is preferred (phys address)
 * @max_addr: the upper bound of the memory region from where the allocation
 *              is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
 *              allocate only from memory limited by memblock.current_limit value
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 *
 * Public function, provides additional debug information (including caller
 * info), if enabled. Does not zero allocated memory.
 *
 * Return:
 * Virtual address of allocated memory block on success, NULL on failure.
 */
void * __init memblock_alloc_exact_nid_raw(
                        phys_addr_t size, phys_addr_t align,
                        phys_addr_t min_addr, phys_addr_t max_addr,
                        int nid)
{
        void *mem;

        memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
                     __func__, (u64)size, (u64)align, nid, &min_addr,
                     &max_addr, (void *)_RET_IP_);

        /* exact_nid is true: no fallback to other nodes; memory is not zeroed */
        mem = memblock_alloc_internal(size, align, min_addr, max_addr, nid,
                                      true);
        return mem;
}

/**
 * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
 * memory and without panicking
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @min_addr: the lower bound of the memory region from where the allocation
 *          is preferred (phys address)
 * @max_addr: the upper bound of the memory region from where the allocation
 *              is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
 *              allocate only from memory limited by memblock.current_limit value
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 *
 * Public function, provides additional debug information (including caller
 * info), if enabled. Does not zero allocated memory, does not panic if request
 * cannot be satisfied.
 *
 * Return:
 * Virtual address of allocated memory block on success, NULL on failure.
 */
void * __init memblock_alloc_try_nid_raw(
                        phys_addr_t size, phys_addr_t align,
                        phys_addr_t min_addr, phys_addr_t max_addr,
                        int nid)
{
        void *mem;

        memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
                     __func__, (u64)size, (u64)align, nid, &min_addr,
                     &max_addr, (void *)_RET_IP_);

        /* exact_nid is false: other nodes may be used; memory is not zeroed */
        mem = memblock_alloc_internal(size, align, min_addr, max_addr, nid,
                                      false);
        return mem;
}

/**
 * memblock_alloc_try_nid - allocate boot memory block
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @min_addr: the lower bound of the memory region from where the allocation
 *          is preferred (phys address)
 * @max_addr: the upper bound of the memory region from where the allocation
 *              is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
 *              allocate only from memory limited by memblock.current_limit value
 * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
 *
 * Public function, provides additional debug information (including caller
 * info), if enabled. This function zeroes the allocated memory.
 *
 * Return:
 * Virtual address of allocated memory block on success, NULL on failure.
 */
void * __init memblock_alloc_try_nid(
                        phys_addr_t size, phys_addr_t align,
                        phys_addr_t min_addr, phys_addr_t max_addr,
                        int nid)
{
        void *mem;

        memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
                     __func__, (u64)size, (u64)align, nid, &min_addr,
                     &max_addr, (void *)_RET_IP_);

        mem = memblock_alloc_internal(size, align, min_addr, max_addr, nid,
                                      false);
        if (!mem)
                return NULL;

        /* unlike the _raw variants, hand out zeroed memory */
        memset(mem, 0, size);
        return mem;
}

/**
 * __memblock_alloc_or_panic - Try to allocate memory and panic on failure
 * @size: size of memory block to be allocated in bytes
 * @align: alignment of the region and block's size
 * @func: caller func name
 *
 * This function attempts to allocate memory using memblock_alloc,
 * and in case of failure, it calls panic with the formatted message.
 * This function should not be used directly, please use the macro memblock_alloc_or_panic.
 */
void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
                                       const char *func)
{
        void *mem;

        mem = memblock_alloc(size, align);

        /* failure is not returned to the caller; report it under @func */
        if (unlikely(mem == NULL))
                panic("%s: Failed to allocate %pap bytes\n", func, &size);

        return mem;
}

/**
 * memblock_free_late - free pages directly to buddy allocator
 * @base: phys starting address of the boot memory block
 * @size: size of the boot memory block in bytes
 *
 * This is only useful when the memblock allocator has already been torn
 * down, but we are still initializing the system.  Pages are released directly
 * to the buddy allocator.
 */
void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
{
        /* inclusive last byte, used only for the debug message */
        phys_addr_t last = base + size - 1;
        phys_addr_t pfn, end_pfn;

        memblock_dbg("%s: [%pa-%pa] %pS\n",
                     __func__, &base, &last, (void *)_RET_IP_);
        kmemleak_free_part_phys(base, size);

        /* only pages lying completely inside [base, base + size) are freed */
        end_pfn = PFN_DOWN(base + size);
        for (pfn = PFN_UP(base); pfn < end_pfn; pfn++) {
                memblock_free_pages(pfn_to_page(pfn), pfn, 0);
                totalram_pages_inc();
        }
}

/*
 * Remaining API functions
 */

/* Return the total_size recorded for the memblock.memory type. */
phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
        return memblock.memory.total_size;
}

/* Return the total_size recorded for the memblock.reserved type. */
phys_addr_t __init_memblock memblock_reserved_size(void)
{
        return memblock.reserved.total_size;
}

/**
 * memblock_estimated_nr_free_pages - return estimated number of free pages
 * from memblock point of view
 *
 * During bootup, subsystems might need a rough estimate of the number of free
 * pages in the whole system, before precise numbers are available from the
 * buddy. Especially with CONFIG_DEFERRED_STRUCT_PAGE_INIT, the numbers
 * obtained from the buddy might be very imprecise during bootup.
 *
 * Return:
 * An estimated number of free pages from memblock point of view.
 */
unsigned long __init memblock_estimated_nr_free_pages(void)
{
        /* bytes of registered memory that are not reserved */
        phys_addr_t free_bytes = memblock_phys_mem_size() -
                                 memblock_reserved_size();

        return PHYS_PFN(free_bytes);
}

/* Lowest address: the base of the first memblock.memory region. */
phys_addr_t __init_memblock memblock_start_of_DRAM(void)
{
        struct memblock_region *first = &memblock.memory.regions[0];

        return first->base;
}

/* End address (base + size) of the last memblock.memory region. */
phys_addr_t __init_memblock memblock_end_of_DRAM(void)
{
        int last_idx = memblock.memory.cnt - 1;
        struct memblock_region *last = &memblock.memory.regions[last_idx];

        return last->base + last->size;
}

static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit)
{
        struct memblock_region *rgn;

        /*
         * Translate the memory size @limit into an address: consume whole
         * regions in order until the remaining budget fits in one of them.
         */
        for_each_mem_region(rgn) {
                if (limit > rgn->size) {
                        /* region is used completely; move on to the next */
                        limit -= rgn->size;
                        continue;
                }
                return rgn->base + limit;
        }

        /* @limit exceeds the total size of all memory regions */
        return PHYS_ADDR_MAX;
}

void __init memblock_enforce_memory_limit(phys_addr_t limit)
{
        phys_addr_t cutoff;

        /* a zero limit means there is nothing to enforce */
        if (limit == 0)
                return;

        cutoff = __find_max_addr(limit);

        /* PHYS_ADDR_MAX: @limit exceeds the total memory size, do nothing */
        if (cutoff != PHYS_ADDR_MAX) {
                /* truncate both memory and reserved regions at the cutoff */
                memblock_remove_range(&memblock.memory, cutoff,
                                      PHYS_ADDR_MAX);
                memblock_remove_range(&memblock.reserved, cutoff,
                                      PHYS_ADDR_MAX);
        }
}

void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
{
        struct memblock_type *mem = &memblock.memory;
        int first, past_last;
        int rgn;

        if (!size)
                return;

        if (!memblock_memory->total_size) {
                pr_warn("%s: No memory registered yet\n", __func__);
                return;
        }

        /* make [base, base + size) correspond to regions [first, past_last) */
        if (memblock_isolate_range(mem, base, size, &first, &past_last))
                return;

        /*
         * Remove every region outside the range that is not marked NOMAP.
         * Both passes run from high index to low.
         */
        rgn = mem->cnt - 1;
        while (rgn >= past_last) {
                if (!memblock_is_nomap(&mem->regions[rgn]))
                        memblock_remove_region(mem, rgn);
                rgn--;
        }

        rgn = first - 1;
        while (rgn >= 0) {
                if (!memblock_is_nomap(&mem->regions[rgn]))
                        memblock_remove_region(mem, rgn);
                rgn--;
        }

        /* truncate the reserved regions to the same range */
        memblock_remove_range(&memblock.reserved, 0, base);
        memblock_remove_range(&memblock.reserved,
                        base + size, PHYS_ADDR_MAX);
}

void __init memblock_mem_limit_remove_map(phys_addr_t limit)
{
        phys_addr_t cutoff;

        if (limit == 0)
                return;

        cutoff = __find_max_addr(limit);

        /* PHYS_ADDR_MAX: @limit exceeds the total memory size, do nothing */
        if (cutoff != PHYS_ADDR_MAX)
                memblock_cap_memory_range(0, cutoff);
}

static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
{
        unsigned int lo = 0;
        unsigned int hi = type->cnt;

        /*
         * Binary search over type->regions for the region containing
         * @addr. The body runs at least once, so regions[0] is always
         * inspected.
         */
        do {
                unsigned int mid = (hi + lo) / 2;
                struct memblock_region *rgn = &type->regions[mid];

                if (addr < rgn->base)
                        hi = mid;
                else if (addr < rgn->base + rgn->size)
                        return mid;
                else
                        lo = mid + 1;
        } while (lo < hi);

        /* no region contains @addr */
        return -1;
}

/* Return true if @addr lies inside one of the memblock.reserved regions. */
bool __init_memblock memblock_is_reserved(phys_addr_t addr)
{
        /* memblock_search() yields -1 when no region contains @addr */
        return memblock_search(&memblock.reserved, addr) != -1;
}

/* Return true if @addr lies inside one of the memblock.memory regions. */
bool __init_memblock memblock_is_memory(phys_addr_t addr)
{
        /* memblock_search() yields -1 when no region contains @addr */
        return memblock_search(&memblock.memory, addr) != -1;
}

/*
 * Return true if @addr lies inside a memblock.memory region for which
 * memblock_is_nomap() is false.
 */
bool __init_memblock memblock_is_map_memory(phys_addr_t addr)
{
        int rgn = memblock_search(&memblock.memory, addr);

        /* -1: @addr is not in any memory region */
        return rgn != -1 &&
               !memblock_is_nomap(&memblock.memory.regions[rgn]);
}

/*
 * Look up the memblock.memory region containing @pfn. On a hit, store the
 * region's PFN bounds in @start_pfn/@end_pfn and return its node id; on a
 * miss return NUMA_NO_NODE and leave the outputs untouched.
 */
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
                         unsigned long *start_pfn, unsigned long *end_pfn)
{
        struct memblock_type *mem = &memblock.memory;
        struct memblock_region *rgn;
        int found = memblock_search(mem, PFN_PHYS(pfn));

        if (found == -1)
                return NUMA_NO_NODE;

        rgn = &mem->regions[found];
        *start_pfn = PFN_DOWN(rgn->base);
        *end_pfn = PFN_DOWN(rgn->base + rgn->size);

        return memblock_get_region_node(rgn);
}

/**
 * memblock_is_region_memory - check if a region is a subset of memory
 * @base: base of region to check
 * @size: size of region to check
 *
 * Check if the region [@base, @base + @size) is a subset of a memory block.
 *
 * Return:
 * 0 if false, non-zero if true
 */
bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
{
        struct memblock_region *rgn;
        phys_addr_t end;
        int idx = memblock_search(&memblock.memory, base);

        /* @base itself is not inside any memory region */
        if (idx == -1)
                return false;

        end = base + memblock_cap_size(base, &size);
        rgn = &memblock.memory.regions[idx];

        /* the region holding @base must also reach the end of the range */
        return (rgn->base + rgn->size) >= end;
}

/**
 * memblock_is_region_reserved - check if a region intersects reserved memory
 * @base: base of region to check
 * @size: size of region to check
 *
 * Check if the region [@base, @base + @size) intersects a reserved
 * memory block.
 *
 * Return:
 * True if they intersect, false if not.
 */
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
        /* delegate the overlap test to the generic helper on memblock.reserved */
        return memblock_overlaps_region(&memblock.reserved, base, size);
}

/*
 * Shrink every memblock.memory region so that its start and end are
 * multiples of @align; regions left empty by the trimming are removed.
 */
void __init_memblock memblock_trim_memory(phys_addr_t align)
{
        struct memblock_region *rgn;

        for_each_mem_region(rgn) {
                phys_addr_t old_start = rgn->base;
                phys_addr_t old_end = rgn->base + rgn->size;
                phys_addr_t new_start = round_up(old_start, align);
                phys_addr_t new_end = round_down(old_end, align);

                /* already aligned at both ends */
                if (new_start == old_start && new_end == old_end)
                        continue;

                if (new_start >= new_end) {
                        /*
                         * Nothing remains after trimming. Remove the
                         * region and step the cursor back, so that the
                         * iterator's increment revisits this slot.
                         */
                        memblock_remove_region(&memblock.memory,
                                               rgn - memblock.memory.regions);
                        rgn--;
                        continue;
                }

                rgn->base = new_start;
                rgn->size = new_end - new_start;
        }
}

/*
 * Set memblock.current_limit, the upper bound that
 * memblock_alloc_internal() clamps max_addr to.
 */
void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
        memblock.current_limit = limit;
}

/* Return the current value of memblock.current_limit. */
phys_addr_t __init_memblock memblock_get_current_limit(void)
{
        return memblock.current_limit;
}

/* Print the region count of @type, followed by one line per region. */
static void __init_memblock memblock_dump(struct memblock_type *type)
{
        struct memblock_region *rgn;
        int idx;

        pr_info(" %s.cnt  = 0x%lx\n", type->name, type->cnt);

        for_each_memblock_type(idx, type, rgn) {
                phys_addr_t base = rgn->base;
                phys_addr_t size = rgn->size;
                /* inclusive last byte of the region */
                phys_addr_t end = base + size - 1;
                enum memblock_flags flags = rgn->flags;
                /* stays empty unless a valid node id is known */
                char nid_buf[32] = "";
#ifdef CONFIG_NUMA
                int rgn_nid = memblock_get_region_node(rgn);

                if (numa_valid_node(rgn_nid))
                        snprintf(nid_buf, sizeof(nid_buf), " on node %d",
                                 rgn_nid);
#endif
                pr_info(" %s[%#x]\t[%pa-%pa], %pa bytes%s flags: %#x\n",
                        type->name, idx, &base, &end, &size, nid_buf, flags);
        }
}

/*
 * Unconditionally print the memblock configuration: the total memory and
 * reserved sizes, followed by a per-region dump of each memblock type.
 */
static void __init_memblock __memblock_dump_all(void)
{
        pr_info("MEMBLOCK configuration:\n");
        pr_info(" memory size = %pa reserved size = %pa\n",
                &memblock.memory.total_size,
                &memblock.reserved.total_size);

        memblock_dump(&memblock.memory);
        memblock_dump(&memblock.reserved);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
        /* physmem is only dumped when CONFIG_HAVE_MEMBLOCK_PHYS_MAP is set */
        memblock_dump(&physmem);
#endif
}

/* Dump the memblock configuration, but only when memblock_debug is set. */
void __init_memblock memblock_dump_all(void)
{
        if (!memblock_debug)
                return;

        __memblock_dump_all();
}

/*
 * Set the memblock_can_resize flag.
 * NOTE(review): the consumer of this flag is outside this view; it
 * presumably permits growing the region arrays - confirm.
 */
void __init memblock_allow_resize(void)
{
        memblock_can_resize = 1;
}

/*
 * Handler for the "memblock" early parameter: enable memblock_debug when
 * the parameter value contains "debug". Always returns 0.
 */
static int __init early_memblock(char *p)
{
        if (!p)
                return 0;

        if (strstr(p, "debug"))
                memblock_debug = 1;

        return 0;
}
early_param("memblock", early_memblock);

/*
 * Free the whole pages of the memmap array that hold the struct page
 * entries for PFNs [start_pfn, end_pfn).
 */
static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
{
        /*
         * Convert start_pfn/end_pfn to struct page pointers, going through
         * the preceding PFN and stepping one entry forward.
         */
        struct page *first_pg = pfn_to_page(start_pfn - 1) + 1;
        struct page *past_pg = pfn_to_page(end_pfn - 1) + 1;

        /*
         * Convert to physical addresses, rounding the start upwards and
         * the end downwards, so that only complete pages are freed.
         */
        phys_addr_t lo = PAGE_ALIGN(__pa(first_pg));
        phys_addr_t hi = PAGE_ALIGN_DOWN(__pa(past_pg));

        /* no complete page lies between the two addresses */
        if (lo >= hi)
                return;

        memblock_phys_free(lo, hi - lo);
}

/*
 * The mem_map array can get very big.  Free the unused area of the memory map.
 */
static void __init free_unused_memmap(void)
{
        /* prev_end == 0 means "no previous bank seen yet" */
        unsigned long start, end, prev_end = 0;
        int i;

        /*
         * Do nothing unless CONFIG_HAVE_ARCH_PFN_VALID is enabled and
         * CONFIG_SPARSEMEM_VMEMMAP is not.
         */
        if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) ||
            IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
                return;

        /*
         * This relies on each bank being in address order.
         * The banks are sorted previously in bootmem_init().
         */
        for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
#ifdef CONFIG_SPARSEMEM
                /*
                 * Take care not to free memmap entries that don't exist
                 * due to SPARSEMEM sections which aren't present.
                 */
                start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
#endif
                /*
                 * Align down here since many operations in VM subsystem
                 * presume that there are no holes in the memory map inside
                 * a pageblock
                 */
                start = pageblock_start_pfn(start);

                /*
                 * If we had a previous bank, and there is a space
                 * between the current bank and the previous, free it.
                 */
                if (prev_end && prev_end < start)
                        free_memmap(prev_end, start);

                /*
                 * Align up here since many operations in VM subsystem
                 * presume that there are no holes in the memory map inside
                 * a pageblock
                 */
                prev_end = pageblock_align(end);
        }

#ifdef CONFIG_SPARSEMEM
        /*
         * Free the memmap between the end of the last bank and the end of
         * its section. prev_end is only non-zero (and thus possibly
         * unaligned) if the loop above ran, in which case it already holds
         * pageblock_align(end); the reassignment below recomputes that
         * same value.
         */
        if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) {
                prev_end = pageblock_align(end);
                free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
        }
#endif
}

/*
 * Release the pfn range [start, end) via memblock_free_pages(), using the
 * largest power-of-two chunks that the alignment of the current pfn and
 * the remaining length allow.
 */
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
        int order;

        for (; start < end; start += 1UL << order) {
                /*
                 * The alignment of 'start' bounds the chunk order.
                 * __ffs() is undefined for 0, and pfn 0 is aligned to
                 * every order, so it simply gets MAX_PAGE_ORDER.
                 */
                order = MAX_PAGE_ORDER;
                if (start)
                        order = min_t(int, MAX_PAGE_ORDER, __ffs(start));

                /* Shrink the chunk until it no longer overshoots 'end'. */
                while (start + (1UL << order) > end)
                        order--;

                memblock_free_pages(pfn_to_page(start), start, order);
        }
}

/*
 * Free the whole pages contained in the physical range [start, end).
 *
 * The start is rounded up and the end rounded down to page frames; without
 * CONFIG_HIGHMEM the end is additionally clamped to max_low_pfn.
 *
 * Returns the number of page frames handed to __free_pages_memory(), or 0
 * if the range contains no whole page.
 */
static unsigned long __init __free_memory_core(phys_addr_t start,
                                 phys_addr_t end)
{
        unsigned long first_pfn = PFN_UP(start);
        unsigned long limit_pfn = PFN_DOWN(end);

        if (limit_pfn > max_low_pfn && !IS_ENABLED(CONFIG_HIGHMEM))
                limit_pfn = max_low_pfn;

        if (first_pfn < limit_pfn) {
                __free_pages_memory(first_pfn, limit_pfn);
                return limit_pfn - first_pfn;
        }

        return 0;
}

/*
 * Prepare the struct pages backing reserved memory.
 *
 * First pass (over memblock.memory): propagate each memory region's node
 * id to the overlapping part of memblock.reserved, and mark NOMAP regions
 * via reserve_bootmem_region().
 *
 * Second pass (over memblock.reserved): call reserve_bootmem_region() for
 * every reserved region that does not carry MEMBLOCK_RSRV_NOINIT.
 */
static void __init memmap_init_reserved_pages(void)
{
        struct memblock_region *region;
        phys_addr_t start, end;
        int nid;

        /*
         * set nid on all reserved pages and also treat struct
         * pages for the NOMAP regions as PageReserved
         */
        for_each_mem_region(region) {
                nid = memblock_get_region_node(region);
                start = region->base;
                end = start + region->size;

                if (memblock_is_nomap(region))
                        reserve_bootmem_region(start, end, nid);

                /*
                 * memblock_set_node() takes a base and a *size*, not an
                 * end address.  Passing 'end' here would apply the node
                 * id to [start, start + end), far beyond this region.
                 */
                memblock_set_node(start, region->size, &memblock.reserved, nid);
        }

        /*
         * initialize struct pages for reserved regions that don't have
         * the MEMBLOCK_RSRV_NOINIT flag set
         */
        for_each_reserved_mem_region(region) {
                if (!memblock_is_reserved_noinit(region)) {
                        nid = memblock_get_region_node(region);
                        start = region->base;
                        end = start + region->size;

                        /* Fall back to a pfn-based lookup if no valid nid is recorded. */
                        if (!numa_valid_node(nid))
                                nid = early_pfn_to_nid(PFN_DOWN(start));

                        reserve_bootmem_region(start, end, nid);
                }
        }
}

/*
 * Clear the hotplug flag on all of memblock, initialise the struct pages
 * of reserved memory, then release every free memblock range.
 *
 * Returns the total number of page frames released.
 */
static unsigned long __init free_low_memory_core_early(void)
{
        phys_addr_t range_start, range_end;
        unsigned long freed = 0;
        u64 idx;

        memblock_clear_hotplug(0, -1);

        memmap_init_reserved_pages();

        /*
         * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
         *  because in some case like Node0 doesn't have RAM installed
         *  low ram will be on Node1
         */
        for_each_free_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE,
                                &range_start, &range_end, NULL) {
                freed += __free_memory_core(range_start, range_end);
        }

        return freed;
}

/* Non-zero once reset_all_zones_managed_pages() has done its work. */
static int reset_managed_pages_done __initdata;

/* Zero the managed_pages counter of every zone slot of @pgdat. */
static void __init reset_node_managed_pages(pg_data_t *pgdat)
{
        int zid;

        for (zid = 0; zid < MAX_NR_ZONES; zid++)
                atomic_long_set(&pgdat->node_zones[zid].managed_pages, 0);
}

/*
 * Zero the managed_pages counters of all zones on all online nodes.
 * Only the first call has any effect; later calls return immediately.
 */
void __init reset_all_zones_managed_pages(void)
{
        struct pglist_data *node;

        if (!reset_managed_pages_done) {
                for_each_online_pgdat(node)
                        reset_node_managed_pages(node);

                reset_managed_pages_done = 1;
        }
}

/**
 * memblock_free_all - release free pages to the buddy allocator
 *
 * Frees the unused parts of the memory map, resets the per-zone
 * managed_pages counters, releases all free memblock ranges and adds the
 * number of released pages to the total RAM page count.
 */
void __init memblock_free_all(void)
{
        free_unused_memmap();
        reset_all_zones_managed_pages();

        totalram_pages_add(free_low_memory_core_early());
}

/*
 * Keep a table to reserve named memory.
 *
 * The table has a fixed number of slots.  Each slot records the name, the
 * physical start address and the size of one region; a slot whose size is
 * 0 is skipped by name lookups.
 */
#define RESERVE_MEM_MAX_ENTRIES                8
#define RESERVE_MEM_NAME_SIZE                16
struct reserve_mem_table {
        /* NUL-terminated lookup name, at most RESERVE_MEM_NAME_SIZE - 1 chars */
        char                        name[RESERVE_MEM_NAME_SIZE];
        /* physical start address of the region */
        phys_addr_t                start;
        /* size of the region in bytes; 0 once the region has been released */
        phys_addr_t                size;
};
static struct reserve_mem_table reserved_mem_table[RESERVE_MEM_MAX_ENTRIES];
/* Number of slots of reserved_mem_table that have been filled in. */
static int reserved_mem_count;
/* Taken by the by-name lookup and release functions. */
static DEFINE_MUTEX(reserve_mem_lock);

/*
 * Add wildcard region with a lookup name.
 *
 * Fills the next unused table slot and bumps the entry count.  No bounds
 * check is done here; the caller must make sure a slot is available.
 */
static void __init reserved_mem_add(phys_addr_t start, phys_addr_t size,
                                   const char *name)
{
        struct reserve_mem_table *slot = &reserved_mem_table[reserved_mem_count];

        reserved_mem_count++;

        slot->start = start;
        slot->size = size;
        strscpy(slot->name, name);
}

/*
 * Look up a table entry by exact name without taking reserve_mem_lock.
 * Entries with a size of 0 are ignored.
 *
 * Returns the matching entry, or NULL if there is none.
 */
static struct reserve_mem_table *reserve_mem_find_by_name_nolock(const char *name)
{
        int idx;

        for (idx = 0; idx < reserved_mem_count; idx++) {
                struct reserve_mem_table *entry = &reserved_mem_table[idx];

                if (entry->size && strcmp(name, entry->name) == 0)
                        return entry;
        }

        return NULL;
}

/**
 * reserve_mem_find_by_name - Find reserved memory region with a given name
 * @name: The name that is attached to a reserved memory region
 * @start: If found, holds the start address of the region
 * @size: If found, holds the size of the region
 *
 * The lookup is done with reserve_mem_lock held.
 * @start and @size are only updated if @name is found.
 *
 * Returns: 1 if found or 0 if not found.
 */
int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *size)
{
        struct reserve_mem_table *entry;

        guard(mutex)(&reserve_mem_lock);

        entry = reserve_mem_find_by_name_nolock(name);
        if (entry) {
                *start = entry->start;
                *size = entry->size;
                return 1;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);

/**
 * reserve_mem_release_by_name - Release reserved memory region with a given name
 * @name: The name that is attached to a reserved memory region
 *
 * Forcibly release the pages in the reserved memory region so that the memory
 * can be used as free memory. After the release the size of the reserved
 * region becomes 0, so later lookups by name no longer find it.
 *
 * The lookup and release are done with reserve_mem_lock held.
 *
 * Returns: 1 if released or 0 if not found.
 */
int reserve_mem_release_by_name(const char *name)
{
        /* Room for the "reserve_mem:" prefix (12 chars) plus the name and NUL. */
        char buf[RESERVE_MEM_NAME_SIZE + 12];
        struct reserve_mem_table *map;
        void *start, *end;

        guard(mutex)(&reserve_mem_lock);
        map = reserve_mem_find_by_name_nolock(name);
        if (!map)
                return 0;

        start = phys_to_virt(map->start);
        /*
         * NOTE(review): 'end' is the address of the last byte of the region
         * (inclusive).  If free_reserved_area() treats its end argument as
         * exclusive and rounds it down to a page boundary, the final page
         * of a page-aligned region would not be freed - confirm.
         */
        end = start + map->size - 1;
        snprintf(buf, sizeof(buf), "reserve_mem:%s", name);
        free_reserved_area(start, end, 0, buf);
        /* Mark the slot as released; lookups skip zero-sized entries. */
        map->size = 0;

        return 1;
}

/*
 * Parse reserve_mem=nn:align:name
 *
 * nn is the size to reserve, align the requested alignment (raised to
 * SMP_CACHE_BYTES if smaller) and name the label under which the region
 * can later be looked up.  The name must be non-empty, shorter than
 * RESERVE_MEM_NAME_SIZE, contain at least one non-space character and
 * must not already be in use.
 *
 * Returns 1 once the region has been allocated and recorded, otherwise a
 * negative errno (-EINVAL, -EBUSY or -ENOMEM).
 */
static int __init reserve_mem(char *p)
{
        phys_addr_t start, size, align, tmp;
        char *size_str, *name, *scan;
        int len;

        if (!p)
                return -EINVAL;

        /* The table is fixed size: refuse once every slot is taken */
        if (reserved_mem_count >= RESERVE_MEM_MAX_ENTRIES)
                return -EBUSY;

        /* "nn": must parse to something and must not be zero */
        size_str = p;
        size = memparse(size_str, &p);
        if (p == size_str || !size)
                return -EINVAL;

        if (*p != ':')
                return -EINVAL;

        /* "align": may legitimately be zero, see below */
        align = memparse(p + 1, &p);
        if (*p != ':')
                return -EINVAL;

        /*
         * memblock_phys_alloc() doesn't like a zero size align,
         * but it is OK for this command to have it.
         */
        if (align < SMP_CACHE_BYTES)
                align = SMP_CACHE_BYTES;

        /* "name": everything after the second colon */
        name = p + 1;
        len = strlen(name);

        /* name needs to have length but not too big */
        if (len == 0 || len >= RESERVE_MEM_NAME_SIZE)
                return -EINVAL;

        /* Make sure that name has text, i.e. is not all white space */
        scan = name;
        while (*scan && isspace(*scan))
                scan++;
        if (*scan == '\0')
                return -EINVAL;

        /* Make sure the name is not already used */
        if (reserve_mem_find_by_name(name, &start, &tmp))
                return -EBUSY;

        start = memblock_phys_alloc(size, align);
        if (!start)
                return -ENOMEM;

        reserved_mem_add(start, size, name);

        return 1;
}
__setup("reserve_mem=", reserve_mem);

#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
/*
 * Printable names for the memblock region flags, indexed by the bit number
 * (ilog2) of each flag.  Bit positions without an initialiser are NULL.
 */
static const char * const flagname[] = {
        [ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",
        [ilog2(MEMBLOCK_MIRROR)] = "MIRROR",
        [ilog2(MEMBLOCK_NOMAP)] = "NOMAP",
        [ilog2(MEMBLOCK_DRIVER_MANAGED)] = "DRV_MNG",
        [ilog2(MEMBLOCK_RSRV_NOINIT)] = "RSV_NIT",
};

/*
 * seq_file show callback: print one line per region of the memblock type
 * stored in m->private.
 *
 * Each line holds the region index, its inclusive physical address range,
 * its node id ('x' if the node id is not valid) and a flag name: "NONE"
 * when no flag is set, otherwise the name of the lowest set flag bit
 * covered by flagname[], or "UNKNOWN" if no such bit is set.
 */
static int memblock_debug_show(struct seq_file *m, void *private)
{
        struct memblock_type *type = m->private;
        unsigned int count = ARRAY_SIZE(flagname);
        int i, j;

        for (i = 0; i < type->cnt; i++) {
                struct memblock_region *reg = &type->regions[i];
                phys_addr_t end = reg->base + reg->size - 1;
                int nid = memblock_get_region_node(reg);
                const char *flag = "NONE";

                seq_printf(m, "%4d: ", i);
                seq_printf(m, "%pa..%pa ", &reg->base, &end);

                if (!numa_valid_node(nid))
                        seq_printf(m, "%4c ", 'x');
                else
                        seq_printf(m, "%4d ", nid);

                if (reg->flags) {
                        /* Report only the lowest flag bit that is set. */
                        flag = "UNKNOWN";
                        for (j = 0; j < count; j++) {
                                if (reg->flags & (1U << j)) {
                                        flag = flagname[j];
                                        break;
                                }
                        }
                }

                seq_printf(m, "%s\n", flag);
        }

        return 0;
}
DEFINE_SHOW_ATTRIBUTE(memblock_debug);

/*
 * Create the "memblock" debugfs directory with one read-only file per
 * memblock type ("memory", "reserved" and, when
 * CONFIG_HAVE_MEMBLOCK_PHYS_MAP is set, "physmem").  Always returns 0.
 */
static int __init memblock_init_debugfs(void)
{
        struct dentry *dir;

        dir = debugfs_create_dir("memblock", NULL);

        debugfs_create_file("memory", 0444, dir, &memblock.memory,
                            &memblock_debug_fops);
        debugfs_create_file("reserved", 0444, dir, &memblock.reserved,
                            &memblock_debug_fops);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
        debugfs_create_file("physmem", 0444, dir, &physmem,
                            &memblock_debug_fops);
#endif

        return 0;
}
__initcall(memblock_init_debugfs);

#endif /* CONFIG_DEBUG_FS */










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_UIDGID_H
#define _LINUX_UIDGID_H

/*
 * A set of types for the internal kernel types representing uids and gids.
 *
 * The types defined in this header allow distinguishing which uids and gids in
 * the kernel are values used by userspace and which uid and gid values are
 * the internal kernel values.  With the addition of user namespaces the values
 * can be different.  Using the type system makes it possible for the compiler
 * to detect when we overlook these differences.
 *
 */
#include <linux/uidgid_types.h>
#include <linux/highuid.h>

struct user_namespace;
extern struct user_namespace init_user_ns;
struct uid_gid_map;

/*
 * Build a kuid_t / kgid_t value (as a compound literal) whose first member
 * is initialised to @value.
 */
#define KUIDT_INIT(value) (kuid_t){ value }
#define KGIDT_INIT(value) (kgid_t){ value }

#ifdef CONFIG_MULTIUSER
/* Return the raw numeric value wrapped in @uid. */
static inline uid_t __kuid_val(kuid_t uid)
{
        return uid.val;
}

/* Return the raw numeric value wrapped in @gid. */
static inline gid_t __kgid_val(kgid_t gid)
{
        return gid.val;
}
#else
/* Without CONFIG_MULTIUSER every kuid_t reads back as 0; @uid is ignored. */
static inline uid_t __kuid_val(kuid_t uid)
{
        return 0;
}

/* Without CONFIG_MULTIUSER every kgid_t reads back as 0; @gid is ignored. */
static inline gid_t __kgid_val(kgid_t gid)
{
        return 0;
}
#endif

/* Kernel-internal ids with the value 0. */
#define GLOBAL_ROOT_UID KUIDT_INIT(0)
#define GLOBAL_ROOT_GID KGIDT_INIT(0)

/*
 * Sentinel ids with the value -1; uid_valid() and gid_valid() reject
 * exactly this value.
 */
#define INVALID_UID KUIDT_INIT(-1)
#define INVALID_GID KGIDT_INIT(-1)

/* True if @left and @right hold the same numeric uid. */
static inline bool uid_eq(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs == rhs;
}

/* True if @left and @right hold the same numeric gid. */
static inline bool gid_eq(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs == rhs;
}

/* True if the numeric value of @left is greater than that of @right. */
static inline bool uid_gt(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs > rhs;
}

/* True if the numeric value of @left is greater than that of @right. */
static inline bool gid_gt(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs > rhs;
}

/* True if the numeric value of @left is greater than or equal to that of @right. */
static inline bool uid_gte(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs >= rhs;
}

/* True if the numeric value of @left is greater than or equal to that of @right. */
static inline bool gid_gte(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs >= rhs;
}

/* True if the numeric value of @left is less than that of @right. */
static inline bool uid_lt(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs < rhs;
}

/* True if the numeric value of @left is less than that of @right. */
static inline bool gid_lt(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs < rhs;
}

/* True if the numeric value of @left is less than or equal to that of @right. */
static inline bool uid_lte(kuid_t left, kuid_t right)
{
        const uid_t lhs = __kuid_val(left);
        const uid_t rhs = __kuid_val(right);

        return lhs <= rhs;
}

/* True if the numeric value of @left is less than or equal to that of @right. */
static inline bool gid_lte(kgid_t left, kgid_t right)
{
        const gid_t lhs = __kgid_val(left);
        const gid_t rhs = __kgid_val(right);

        return lhs <= rhs;
}

/* True unless @uid holds the invalid-id sentinel, (uid_t)-1. */
static inline bool uid_valid(kuid_t uid)
{
        const uid_t raw = __kuid_val(uid);

        return raw != (uid_t) -1;
}

/* True unless @gid holds the invalid-id sentinel, (gid_t)-1. */
static inline bool gid_valid(kgid_t gid)
{
        const gid_t raw = __kgid_val(gid);

        return raw != (gid_t) -1;
}

#ifdef CONFIG_USER_NS

/*
 * Out-of-line conversions used when CONFIG_USER_NS is enabled.
 *
 * make_k*id() take an id together with a user namespace and yield the
 * kernel-internal value; from_k*id() go the other way.  A from_k*id()
 * result of -1 is what k*id_has_mapping() treats as "no mapping".
 * NOTE(review): the _munged variants presumably substitute the overflow id
 * for that -1, as the !CONFIG_USER_NS inlines in this header do - confirm
 * against the definitions.
 */
extern kuid_t make_kuid(struct user_namespace *from, uid_t uid);
extern kgid_t make_kgid(struct user_namespace *from, gid_t gid);

extern uid_t from_kuid(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid(struct user_namespace *to, kgid_t gid);
extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid);
extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid);

/* True if converting @uid into @ns yields something other than (uid_t)-1. */
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
        const uid_t mapped = from_kuid(ns, uid);

        return mapped != (uid_t) -1;
}

/* True if converting @gid into @ns yields something other than (gid_t)-1. */
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
        const gid_t mapped = from_kgid(ns, gid);

        return mapped != (gid_t) -1;
}

/*
 * Id translation through a struct uid_gid_map, implemented out of line
 * when CONFIG_USER_NS is enabled.
 * NOTE(review): the direction implied by "down"/"up" and the meaning of
 * @count are not visible from this header - see the definitions.
 */
u32 map_id_down(struct uid_gid_map *map, u32 id);
u32 map_id_up(struct uid_gid_map *map, u32 id);
u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count);

#else

/* Without user namespaces @uid is wrapped unchanged; @from is ignored. */
static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid)
{
        kuid_t kuid = KUIDT_INIT(uid);

        return kuid;
}

/* Without user namespaces @gid is wrapped unchanged; @from is ignored. */
static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid)
{
        kgid_t kgid = KGIDT_INIT(gid);

        return kgid;
}

/* Without user namespaces the raw value of @kuid is returned; @to is ignored. */
static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid)
{
        const uid_t raw = __kuid_val(kuid);

        return raw;
}

/* Without user namespaces the raw value of @kgid is returned; @to is ignored. */
static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid)
{
        const gid_t raw = __kgid_val(kgid);

        return raw;
}

/*
 * Like from_kuid(), except that a result of (uid_t)-1 is replaced by
 * overflowuid.
 */
static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid)
{
        const uid_t uid = from_kuid(to, kuid);

        if (uid != (uid_t)-1)
                return uid;

        return overflowuid;
}

/*
 * Like from_kgid(), except that a result of (gid_t)-1 is replaced by
 * overflowgid.
 */
static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid)
{
        const gid_t gid = from_kgid(to, kgid);

        if (gid != (gid_t)-1)
                return gid;

        return overflowgid;
}

/* Without user namespaces every valid uid counts as mapped; @ns is ignored. */
static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid)
{
        const bool mapped = uid_valid(uid);

        return mapped;
}

/* Without user namespaces every valid gid counts as mapped; @ns is ignored. */
static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid)
{
        const bool mapped = gid_valid(gid);

        return mapped;
}

/* Stub for !CONFIG_USER_NS: returns @id unchanged; @map is ignored. */
static inline u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        return id;
}

/* Stub for !CONFIG_USER_NS: returns @id unchanged; @map and @count are ignored. */
static inline u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count)
{
        return id;
}

/* Stub for !CONFIG_USER_NS: returns @id unchanged; @map is ignored. */
static inline u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        return id;
}
#endif /* CONFIG_USER_NS */

#endif /* _LINUX_UIDGID_H */







































































































































































































































































































































































  253 





  253 






  129 
  252 






  272 





  272 






  257 
  269 



































































































































































































































































































































    7 

























   24 










































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
// SPDX-License-Identifier: GPL-2.0-only
/*
 * lib/bitmap.c
 * Helper functions for bitmap.h.
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/slab.h>

/**
 * DOC: bitmap introduction
 *
 * bitmaps provide an array of bits, implemented using an
 * array of unsigned longs.  The number of valid bits in a
 * given bitmap does _not_ need to be an exact multiple of
 * BITS_PER_LONG.
 *
 * The possible unused bits in the last, partially used word
 * of a bitmap are 'don't care'.  The implementation makes
 * no particular effort to keep them zero.  It ensures that
 * their value will not affect the results of any operation.
 * The bitmap operations that return Boolean (bitmap_empty,
 * for example) or scalar (bitmap_weight, for example) results
 * carefully filter out these unused bits from impacting their
 * results.
 *
 * The byte ordering of bitmaps is more natural on little
 * endian architectures.  See the big-endian headers
 * include/asm-ppc64/bitops.h and include/asm-s390/bitops.h
 * for the best explanations of this ordering.
 */

/*
 * Compare the first @bits bits of two bitmaps.  Bits beyond @bits in a
 * partially used last word are masked out before comparing.
 *
 * Returns true if the bitmaps are equal over that range.
 */
bool __bitmap_equal(const unsigned long *bitmap1,
                    const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                if (bitmap1[idx] != bitmap2[idx])
                        return false;
        }

        /* No partially used trailing word: everything matched. */
        if (bits % BITS_PER_LONG == 0)
                return true;

        return !((bitmap1[idx] ^ bitmap2[idx]) & BITMAP_LAST_WORD_MASK(bits));
}
EXPORT_SYMBOL(__bitmap_equal);

/*
 * Check whether (@bitmap1 | @bitmap2) equals @bitmap3 over the first
 * @bits bits.  Bits beyond @bits in a partially used last word are
 * ignored.
 */
bool __bitmap_or_equal(const unsigned long *bitmap1,
                       const unsigned long *bitmap2,
                       const unsigned long *bitmap3,
                       unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++)
                if ((bitmap1[idx] | bitmap2[idx]) != bitmap3[idx])
                        return false;

        if (bits % BITS_PER_LONG) {
                unsigned long diff;

                diff = (bitmap1[idx] | bitmap2[idx]) ^ bitmap3[idx];
                if (diff & BITMAP_LAST_WORD_MASK(bits))
                        return false;
        }

        return true;
}

/*
 * Store the bitwise complement of @src in @dst.  Every word covering
 * @bits bits is inverted in full, including any unused bits of the last
 * word.
 */
void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits)
{
        const unsigned int words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < words) {
                dst[idx] = ~src[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_complement);

/**
 * __bitmap_shift_right - logical right shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting right (dividing) means moving bits in the MS -> LS bit
 * direction.  Zeros are fed into the vacated MS positions and the
 * LS bits shifted off the bottom are lost.
 */
void __bitmap_shift_right(unsigned long *dst, const unsigned long *src,
                        unsigned shift, unsigned nbits)
{
        unsigned k, lim = BITS_TO_LONGS(nbits);
        /* Split the shift into whole words (off) and remaining bits (rem). */
        unsigned off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        /* Applied to the last source word so bits past nbits are not shifted in. */
        unsigned long mask = BITMAP_LAST_WORD_MASK(nbits);
        for (k = 0; off + k < lim; ++k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take lower rem bits of
                 * word above and make them the top rem bits of result.
                 */
                if (!rem || off + k + 1 >= lim)
                        upper = 0;
                else {
                        upper = src[off + k + 1];
                        if (off + k + 1 == lim - 1)
                                upper &= mask;
                        upper <<= (BITS_PER_LONG - rem);
                }
                lower = src[off + k];
                if (off + k == lim - 1)
                        lower &= mask;
                lower >>= rem;
                dst[k] = lower | upper;
        }
        /* Zero the top 'off' words, which received no source data. */
        if (off)
                memset(&dst[lim - off], 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_right);


/**
 * __bitmap_shift_left - logical left shift of the bits in a bitmap
 *   @dst : destination bitmap
 *   @src : source bitmap
 *   @shift : shift by this many bits
 *   @nbits : bitmap size, in bits
 *
 * Shifting left (multiplying) means moving bits in the LS -> MS
 * direction.  Zeros are fed into the vacated LS bit positions
 * and those MS bits shifted off the top are lost.
 */

void __bitmap_shift_left(unsigned long *dst, const unsigned long *src,
                        unsigned int shift, unsigned int nbits)
{
        int k;
        unsigned int lim = BITS_TO_LONGS(nbits);
        /* Split the shift into whole words (off) and remaining bits (rem). */
        unsigned int off = shift/BITS_PER_LONG, rem = shift % BITS_PER_LONG;
        /*
         * Walk from the highest source word down to word 0.
         * NOTE(review): presumably the descending order is what keeps an
         * in-place shift (dst == src) from overwriting words that are
         * still to be read - confirm.
         */
        for (k = lim - off - 1; k >= 0; --k) {
                unsigned long upper, lower;

                /*
                 * If shift is not word aligned, take upper rem bits of
                 * word below and make them the bottom rem bits of result.
                 */
                if (rem && k > 0)
                        lower = src[k - 1] >> (BITS_PER_LONG - rem);
                else
                        lower = 0;
                upper = src[k] << rem;
                dst[k + off] = lower | upper;
        }
        /* Zero the bottom 'off' words, which received no source data. */
        if (off)
                memset(dst, 0, off*sizeof(unsigned long));
}
EXPORT_SYMBOL(__bitmap_shift_left);

/**
 * bitmap_cut() - remove bit region from bitmap and right shift remaining bits
 * @dst: destination bitmap, might overlap with src
 * @src: source bitmap
 * @first: start bit of region to be removed
 * @cut: number of bits to remove
 * @nbits: bitmap size, in bits
 *
 * Set the n-th bit of @dst iff the n-th bit of @src is set and
 * n is less than @first, or the m-th bit of @src is set for any
 * m such that @first <= n < nbits, and m = n + @cut.
 *
 * In pictures, example for a big-endian 32-bit architecture:
 *
 * The @src bitmap is::
 *
 *   31                                   63
 *   |                                    |
 *   10000000 11000001 11110010 00010101  10000000 11000001 01110010 00010101
 *                   |  |              |                                    |
 *                  16  14             0                                   32
 *
 * if @cut is 3, and @first is 14, bits 14-16 in @src are cut and @dst is::
 *
 *   31                                   63
 *   |                                    |
 *   10110000 00011000 00110010 00010101  00010000 00011000 00101110 01000010
 *                      |              |                                    |
 *                      14 (bit 17     0                                   32
 *                          from @src)
 *
 * Note that @dst and @src might overlap partially or entirely.
 *
 * This is implemented in the obvious way, with a shift and carry
 * step for each moved bit. Optimisation is left as an exercise
 * for the compiler.
 */
void bitmap_cut(unsigned long *dst, const unsigned long *src,
                unsigned int first, unsigned int cut, unsigned int nbits)
{
        unsigned int len = BITS_TO_LONGS(nbits);
        unsigned long keep = 0, carry;
        int i;

        /*
         * Save the bits below @first in the word that contains @first;
         * the per-bit shift below moves that whole word, so they are
         * restored at the end.
         */
        if (first % BITS_PER_LONG) {
                keep = src[first / BITS_PER_LONG] &
                       (~0UL >> (BITS_PER_LONG - first % BITS_PER_LONG));
        }

        /* memmove() rather than memcpy() because dst and src may overlap. */
        memmove(dst, src, len * sizeof(*dst));

        /*
         * Shift everything from the word containing @first upwards right
         * by one bit, @cut times, carrying bit 0 of each higher word into
         * the top bit of the word below it.
         */
        while (cut--) {
                for (i = first / BITS_PER_LONG; i < len; i++) {
                        if (i < len - 1)
                                carry = dst[i + 1] & 1UL;
                        else
                                carry = 0;

                        dst[i] = (dst[i] >> 1) | (carry << (BITS_PER_LONG - 1));
                }
        }

        /* Drop the shifted-in low bits and put the saved ones back. */
        dst[first / BITS_PER_LONG] &= ~0UL << (first % BITS_PER_LONG);
        dst[first / BITS_PER_LONG] |= keep;
}
EXPORT_SYMBOL(bitmap_cut);

/*
 * Store (@bitmap1 & @bitmap2) in @dst for the first @bits bits.  In a
 * partially used last word the bits beyond @bits are written as zero.
 *
 * Returns true if any bit of the result is set.
 */
bool __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned long word;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                word = bitmap1[idx] & bitmap2[idx];
                dst[idx] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[idx] & bitmap2[idx] &
                       BITMAP_LAST_WORD_MASK(bits);
                dst[idx] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_and);

/*
 * Store (@bitmap1 | @bitmap2) in @dst, word by word, for every word that
 * covers @bits bits.
 */
void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < words) {
                dst[idx] = bitmap1[idx] | bitmap2[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_or);

/*
 * Store (@bitmap1 ^ @bitmap2) in @dst, word by word, for every word that
 * covers @bits bits.
 */
void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int words = BITS_TO_LONGS(bits);
        unsigned int idx = 0;

        while (idx < words) {
                dst[idx] = bitmap1[idx] ^ bitmap2[idx];
                idx++;
        }
}
EXPORT_SYMBOL(__bitmap_xor);

/*
 * Store (@bitmap1 & ~@bitmap2) in @dst for the first @bits bits.  In a
 * partially used last word the bits beyond @bits are written as zero.
 *
 * Returns true if any bit of the result is set.
 */
bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned long seen = 0;
        unsigned long word;
        unsigned int idx;

        for (idx = 0; idx < full_words; idx++) {
                word = bitmap1[idx] & ~bitmap2[idx];
                dst[idx] = word;
                seen |= word;
        }

        if (bits % BITS_PER_LONG) {
                word = bitmap1[idx] & ~bitmap2[idx] &
                       BITMAP_LAST_WORD_MASK(bits);
                dst[idx] = word;
                seen |= word;
        }

        return seen != 0;
}
EXPORT_SYMBOL(__bitmap_andnot);

/*
 * For every word covering @nbits bits, build @dst by taking the bits of
 * @new where @mask is set and the bits of @old where @mask is clear.
 */
void __bitmap_replace(unsigned long *dst,
                      const unsigned long *old, const unsigned long *new,
                      const unsigned long *mask, unsigned int nbits)
{
        const unsigned int words = BITS_TO_LONGS(nbits);
        unsigned int idx;

        for (idx = 0; idx < words; idx++) {
                const unsigned long sel = mask[idx];

                dst[idx] = (old[idx] & ~sel) | (new[idx] & sel);
        }
}
EXPORT_SYMBOL(__bitmap_replace);

/*
 * __bitmap_intersects - test whether two bitmaps share any set bit.
 *
 * Only the first @bits bits are considered: a trailing partial word is
 * masked with BITMAP_LAST_WORD_MASK() before it is tested.
 *
 * Returns true as soon as a common set bit is found, false if there is none.
 */
bool __bitmap_intersects(const unsigned long *bitmap1,
                         const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                if ((bitmap1[i] & bitmap2[i]) != 0)
                        return true;
        }

        /* No partial word left to look at. */
        if (bits % BITS_PER_LONG == 0)
                return false;

        return ((bitmap1[i] & bitmap2[i]) & BITMAP_LAST_WORD_MASK(bits)) != 0;
}
EXPORT_SYMBOL(__bitmap_intersects);

/*
 * __bitmap_subset - test whether @bitmap1 is a subset of @bitmap2.
 *
 * Only the first @bits bits are considered: a trailing partial word is
 * masked with BITMAP_LAST_WORD_MASK() before it is tested.
 *
 * Returns false as soon as a bit is found that is set in @bitmap1 but
 * clear in @bitmap2, true otherwise.
 */
bool __bitmap_subset(const unsigned long *bitmap1,
                     const unsigned long *bitmap2, unsigned int bits)
{
        const unsigned int full_words = bits / BITS_PER_LONG;
        unsigned int i;

        for (i = 0; i < full_words; i++) {
                if ((bitmap1[i] & ~bitmap2[i]) != 0)
                        return false;
        }

        /* No partial word left to look at. */
        if (bits % BITS_PER_LONG == 0)
                return true;

        return ((bitmap1[i] & ~bitmap2[i]) & BITMAP_LAST_WORD_MASK(bits)) == 0;
}
EXPORT_SYMBOL(__bitmap_subset);

/*
 * BITMAP_WEIGHT - sum hweight_long() over the words of a bitmap expression.
 * @FETCH: expression yielding one word; it is expanded inside the loop
 *         body, so it may refer to the loop index 'idx' declared here
 *         (the callers below do exactly that).
 * @bits:  number of bits to consider; evaluated once into __bits.
 *
 * Whole words are counted as they are; a trailing partial word is masked
 * with BITMAP_LAST_WORD_MASK() first.  The statement expression evaluates
 * to the accumulated count 'w' (unsigned int).
 */
#define BITMAP_WEIGHT(FETCH, bits)        \
({                                                                                \
        unsigned int __bits = (bits), idx, w = 0;                                \
                                                                                \
        for (idx = 0; idx < __bits / BITS_PER_LONG; idx++)                        \
                w += hweight_long(FETCH);                                        \
                                                                                \
        if (__bits % BITS_PER_LONG)                                                \
                w += hweight_long((FETCH) & BITMAP_LAST_WORD_MASK(__bits));        \
                                                                                \
        w;                                                                        \
})

/* Count the set bits among the first @bits bits of @bitmap. */
unsigned int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
{
        /* 'idx' is the word index declared inside BITMAP_WEIGHT(). */
        const unsigned int weight = BITMAP_WEIGHT(bitmap[idx], bits);

        return weight;
}
EXPORT_SYMBOL(__bitmap_weight);

/*
 * Count the bits set in both @bitmap1 and @bitmap2 among the first
 * @bits bits, without materialising the intersection.
 */
unsigned int __bitmap_weight_and(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        /* 'idx' is the word index declared inside BITMAP_WEIGHT(). */
        const unsigned int weight = BITMAP_WEIGHT(bitmap1[idx] & bitmap2[idx], bits);

        return weight;
}
EXPORT_SYMBOL(__bitmap_weight_and);

/*
 * Count the bits that are set in @bitmap1 and clear in @bitmap2 among
 * the first @bits bits, without materialising the difference.
 */
unsigned int __bitmap_weight_andnot(const unsigned long *bitmap1,
                                const unsigned long *bitmap2, unsigned int bits)
{
        /* 'idx' is the word index declared inside BITMAP_WEIGHT(). */
        const unsigned int weight = BITMAP_WEIGHT(bitmap1[idx] & ~bitmap2[idx], bits);

        return weight;
}
EXPORT_SYMBOL(__bitmap_weight_andnot);

/*
 * __bitmap_set - set @len consecutive bits of @map, beginning at bit @start.
 *
 * The range is handled one word at a time: the first word uses
 * BITMAP_FIRST_WORD_MASK(@start), subsequent whole words are set entirely,
 * and a final partial word is additionally limited by
 * BITMAP_LAST_WORD_MASK(@start + @len).
 */
void __bitmap_set(unsigned long *map, unsigned int start, int len)
{
        unsigned long *word = map + BIT_WORD(start);
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        const unsigned int end = start + len;
        /* number of bits from @start up to the end of its word */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* Consume every word that the remaining length reaches the end of. */
        for (; len - chunk >= 0; word++) {
                *word |= mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Leftover bits end inside *word: trim the mask at the range end. */
        if (len != 0)
                *word |= mask & BITMAP_LAST_WORD_MASK(end);
}
EXPORT_SYMBOL(__bitmap_set);

/*
 * __bitmap_clear - clear @len consecutive bits of @map, beginning at bit @start.
 *
 * The range is handled one word at a time: the first word uses
 * BITMAP_FIRST_WORD_MASK(@start), subsequent whole words are cleared
 * entirely, and a final partial word is additionally limited by
 * BITMAP_LAST_WORD_MASK(@start + @len).
 */
void __bitmap_clear(unsigned long *map, unsigned int start, int len)
{
        unsigned long *word = map + BIT_WORD(start);
        unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
        const unsigned int end = start + len;
        /* number of bits from @start up to the end of its word */
        int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);

        /* Consume every word that the remaining length reaches the end of. */
        for (; len - chunk >= 0; word++) {
                *word &= ~mask;
                len -= chunk;
                chunk = BITS_PER_LONG;
                mask = ~0UL;
        }

        /* Leftover bits end inside *word: trim the mask at the range end. */
        if (len != 0)
                *word &= ~(mask & BITMAP_LAST_WORD_MASK(end));
}
EXPORT_SYMBOL(__bitmap_clear);

/**
 * bitmap_find_next_zero_area_off - find a contiguous aligned zero area
 * @map: The address to base the search on
 * @size: The bitmap size in bits
 * @start: The bitnumber to start searching at
 * @nr: The number of zeroed bits we're looking for
 * @align_mask: Alignment mask for zero area
 * @align_offset: Alignment offset for zero area.
 *
 * The @align_mask should be one less than a power of 2; the effect is that
 * the bit offset of all zero areas this function finds plus @align_offset
 * is multiple of that power of 2.
 */
unsigned long bitmap_find_next_zero_area_off(unsigned long *map,
                                             unsigned long size,
                                             unsigned long start,
                                             unsigned int nr,
                                             unsigned long align_mask,
                                             unsigned long align_offset)
{
        unsigned long candidate, limit, busy;

        for (;;) {
                /* First clear bit at or after @start, rounded up to the alignment. */
                candidate = find_next_zero_bit(map, size, start);
                candidate = __ALIGN_MASK(candidate + align_offset, align_mask) - align_offset;

                /*
                 * The area would run past the bitmap: give up.  The value
                 * returned is then greater than @size, which is how the
                 * caller can tell that nothing was found.
                 */
                limit = candidate + nr;
                if (limit > size)
                        return limit;

                /* Is any bit set inside [candidate, limit)? */
                busy = find_next_bit(map, limit, candidate);
                if (busy >= limit)
                        return candidate;

                /* Yes: restart the search just past the offending bit. */
                start = busy + 1;
        }
}
EXPORT_SYMBOL(bitmap_find_next_zero_area_off);

/**
 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap
 *        @buf: pointer to a bitmap
 *        @pos: a bit position in @buf (0 <= @pos < @nbits)
 *        @nbits: number of valid bit positions in @buf
 *
 * Map the bit at position @pos in @buf (of length @nbits) to the
 * ordinal of which set bit it is.  If it is not set or if @pos
 * is not a valid bit position, map to -1.
 *
 * If for example, just bits 4 through 7 are set in @buf, then @pos
 * values 4 through 7 will get mapped to 0 through 3, respectively,
 * and other @pos values will get mapped to -1.  When @pos value 7
 * gets mapped to (returns) @ord value 3 in this example, that means
 * that bit 7 is the 3rd (starting with 0th) set bit in @buf.
 *
 * The bit positions 0 through @nbits - 1 are valid positions in @buf.
 */
static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits)
{
        /*
         * The ordinal of a set bit is the number of set bits below it,
         * i.e. the weight of the first @pos bits.
         */
        if (pos < nbits && test_bit(pos, buf))
                return bitmap_weight(buf, pos);

        /* Out of range, or the bit is not set. */
        return -1;
}

/**
 * bitmap_remap - Apply map defined by a pair of bitmaps to another bitmap
 *        @dst: remapped result
 *        @src: subset to be remapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @nbits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * If either of the @old and @new bitmaps are empty, or if @src and
 * @dst point to the same location, then this routine copies @src
 * to @dst.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to @src, placing the result in
 * @dst, clearing any bits previously set in @dst.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @src comes into this routine
 * with bits 1, 5 and 7 set, then @dst should leave with bits 1,
 * 13 and 15 set.
 */
void bitmap_remap(unsigned long *dst, const unsigned long *src,
                const unsigned long *old, const unsigned long *new,
                unsigned int nbits)
{
        unsigned int pos, new_weight;

        /* In-place remapping is not supported: leave @dst (== @src) as is. */
        if (dst == src)
                return;

        bitmap_zero(dst, nbits);
        new_weight = bitmap_weight(new, nbits);

        for_each_set_bit(pos, src, nbits) {
                /* Default to the identity map. */
                unsigned long target = pos;
                int ord = bitmap_pos_to_ord(old, pos, nbits);

                /*
                 * @pos is the ord-th set bit of @old and @new is not empty:
                 * map it to the (ord % weight)-th set bit of @new.
                 */
                if (ord >= 0 && new_weight != 0)
                        target = find_nth_bit(new, nbits, ord % new_weight);

                set_bit(target, dst);
        }
}
EXPORT_SYMBOL(bitmap_remap);

/**
 * bitmap_bitremap - Apply map defined by a pair of bitmaps to a single bit
 *        @oldbit: bit position to be mapped
 *        @old: defines domain of map
 *        @new: defines range of map
 *        @bits: number of bits in each of these bitmaps
 *
 * Let @old and @new define a mapping of bit positions, such that
 * whatever position is held by the n-th set bit in @old is mapped
 * to the n-th set bit in @new.  In the more general case, allowing
 * for the possibility that the weight 'w' of @new is less than the
 * weight of @old, map the position of the n-th set bit in @old to
 * the position of the m-th set bit in @new, where m == n % w.
 *
 * The positions of unset bits in @old are mapped to themselves
 * (the identity map).
 *
 * Apply the above specified mapping to bit position @oldbit, returning
 * the new bit position.
 *
 * For example, lets say that @old has bits 4 through 7 set, and
 * @new has bits 12 through 15 set.  This defines the mapping of bit
 * position 4 to 12, 5 to 13, 6 to 14 and 7 to 15, and of all other
 * bit positions unchanged.  So if say @oldbit is 5, then this routine
 * returns 13.
 */
int bitmap_bitremap(int oldbit, const unsigned long *old,
                                const unsigned long *new, int bits)
{
        const int new_weight = bitmap_weight(new, bits);
        const int ord = bitmap_pos_to_ord(old, oldbit, bits);

        /*
         * @oldbit is the ord-th set bit of @old and @new is not empty:
         * return the position of the (ord % weight)-th set bit of @new.
         */
        if (ord >= 0 && new_weight != 0)
                return find_nth_bit(new, bits, ord % new_weight);

        /* Identity map: @oldbit is not set in @old, or @new is empty. */
        return oldbit;
}
EXPORT_SYMBOL(bitmap_bitremap);

#ifdef CONFIG_NUMA
/**
 * bitmap_onto - translate one bitmap relative to another
 *        @dst: resulting translated bitmap
 *         @orig: original untranslated bitmap
 *         @relmap: bitmap relative to which translated
 *        @bits: number of bits in each of these bitmaps
 *
 * Set the n-th bit of @dst iff there exists some m such that the
 * n-th bit of @relmap is set, the m-th bit of @orig is set, and
 * the n-th bit of @relmap is also the m-th _set_ bit of @relmap.
 * (If you understood the previous sentence the first time you
 * read it, you're overqualified for your current job.)
 *
 * In other words, @orig is mapped onto (surjectively) @dst,
 * using the map { <n, m> | the n-th bit of @relmap is the
 * m-th set bit of @relmap }.
 *
 * Any set bits in @orig above bit number W, where W is the
 * weight of (number of set bits in) @relmap are mapped nowhere.
 * In particular, if for all bits m set in @orig, m >= W, then
 * @dst will end up empty.  In situations where the possibility
 * of such an empty result is not desired, one way to avoid it is
 * to use the bitmap_fold() operator, below, to first fold the
 * @orig bitmap over itself so that all its set bits x are in the
 * range 0 <= x < W.  The bitmap_fold() operator does this by
 * setting the bit (m % W) in @dst, for each bit (m) set in @orig.
 *
 * Example [1] for bitmap_onto():
 *  Let's say @relmap has bits 30-39 set, and @orig has bits
 *  1, 3, 5, 7, 9 and 11 set.  Then on return from this routine,
 *  @dst will have bits 31, 33, 35, 37 and 39 set.
 *
 *  When bit 0 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the first bit (if any)
 *  that is turned on in @relmap.  Since bit 0 was off in the
 *  above example, we leave off that bit (bit 30) in @dst.
 *
 *  When bit 1 is set in @orig (as in the above example), it
 *  means turn on the bit in @dst corresponding to whatever
 *  is the second bit that is turned on in @relmap.  The second
 *  bit in @relmap that was turned on in the above example was
 *  bit 31, so we turned on bit 31 in @dst.
 *
 *  Similarly, we turned on bits 33, 35, 37 and 39 in @dst,
 *  because they were the 4th, 6th, 8th and 10th set bits
 *  set in @relmap, and the 4th, 6th, 8th and 10th bits of
 *  @orig (i.e. bits 3, 5, 7 and 9) were also set.
 *
 *  When bit 11 is set in @orig, it means turn on the bit in
 *  @dst corresponding to whatever is the twelfth bit that is
 *  turned on in @relmap.  In the above example, there were
 *  only ten bits turned on in @relmap (30..39), so the fact that
 *  bit 11 was set in @orig had no effect on @dst.
 *
 * Example [2] for bitmap_fold() + bitmap_onto():
 *  Let's say @relmap has these ten bits set::
 *
 *                40 41 42 43 45 48 53 61 74 95
 *
 *  (for the curious, that's 40 plus the first ten terms of the
 *  Fibonacci sequence.)
 *
 *  Further lets say we use the following code, invoking
 *  bitmap_fold() then bitmap_onto, as suggested above to
 *  avoid the possibility of an empty @dst result::
 *
 *        unsigned long *tmp;        // a temporary bitmap's bits
 *
 *        bitmap_fold(tmp, orig, bitmap_weight(relmap, bits), bits);
 *        bitmap_onto(dst, tmp, relmap, bits);
 *
 *  Then this table shows what various values of @dst would be, for
 *  various @orig's.  I list the zero-based positions of each set bit.
 *  The tmp column shows the intermediate result, as computed by
 *  using bitmap_fold() to fold the @orig bitmap modulo ten
 *  (the weight of @relmap):
 *
 *      =============== ============== =================
 *      @orig           tmp            @dst
 *      0                0             40
 *      1                1             41
 *      9                9             95
 *      10               0             40 [#f1]_
 *      1 3 5 7          1 3 5 7       41 43 48 61
 *      0 1 2 3 4        0 1 2 3 4     40 41 42 43 45
 *      0 9 18 27        0 9 8 7       40 61 74 95
 *      0 10 20 30       0             40
 *      0 11 22 33       0 1 2 3       40 41 42 43
 *      0 12 24 36       0 2 4 6       40 42 45 53
 *      78 102 211       1 2 8         41 42 74 [#f1]_
 *      =============== ============== =================
 *
 * .. [#f1]
 *
 *     For these marked lines, if we hadn't first done bitmap_fold()
 *     into tmp, then the @dst result would have been empty.
 *
 * If either of @orig or @relmap is empty (no set bits), then @dst
 * will be returned empty.
 *
 * If (as explained above) the only set bits in @orig are in positions
 * m where m >= W, (where W is the weight of @relmap) then @dst will
 * once again be returned empty.
 *
 * All bits in @dst not set by the above rule are cleared.
 */
void bitmap_onto(unsigned long *dst, const unsigned long *orig,
                        const unsigned long *relmap, unsigned int bits)
{
        unsigned int relbit;        /* 'n' in the kernel-doc: a set bit of @relmap */
        unsigned int ord = 0;        /* 'm' in the kernel-doc: ordinal of that bit */

        /* In-place mapping is not supported: leave @dst (== @orig) as is. */
        if (dst == orig)
                return;

        bitmap_zero(dst, bits);

        /*
         * Walk the set bits of @relmap in increasing order while counting
         * them.  This is a cheaper equivalent of:
         *        for (m = 0; m < bitmap_weight(relmap, bits); m++) {
         *                n = find_nth_bit(relmap, bits, m);
         *                if (test_bit(m, orig))
         *                        set_bit(n, dst);
         *        }
         */
        for_each_set_bit(relbit, relmap, bits) {
                /* ord == bitmap_pos_to_ord(relmap, relbit, bits) */
                if (test_bit(ord, orig))
                        set_bit(relbit, dst);
                ord++;
        }
}

/**
 * bitmap_fold - fold larger bitmap into smaller, modulo specified size
 *        @dst: resulting smaller bitmap
 *        @orig: original larger bitmap
 *        @sz: specified size
 *        @nbits: number of bits in each of these bitmaps
 *
 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst.
 * Clear all other bits in @dst.  See further the comment and
 * Example [2] for bitmap_onto() for why and how to use this.
 */
void bitmap_fold(unsigned long *dst, const unsigned long *orig,
                        unsigned int sz, unsigned int nbits)
{
        unsigned int oldbit;

        if (dst == orig)        /* following doesn't handle inplace mappings */
                return;
        bitmap_zero(dst, nbits);

        for_each_set_bit(oldbit, orig, nbits)
                set_bit(oldbit % sz, dst);
}
#endif /* CONFIG_NUMA */

/*
 * bitmap_alloc - allocate storage for a bitmap of @nbits bits.
 *
 * Requests BITS_TO_LONGS(@nbits) unsigned longs from kmalloc_array() with
 * the given @flags and returns whatever that call returns.
 */
unsigned long *bitmap_alloc(unsigned int nbits, gfp_t flags)
{
        unsigned long *bitmap;

        bitmap = kmalloc_array(BITS_TO_LONGS(nbits), sizeof(*bitmap), flags);

        return bitmap;
}
EXPORT_SYMBOL(bitmap_alloc);

/* bitmap_zalloc - same as bitmap_alloc(), with __GFP_ZERO added to @flags. */
unsigned long *bitmap_zalloc(unsigned int nbits, gfp_t flags)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return bitmap_alloc(nbits, zeroing);
}
EXPORT_SYMBOL(bitmap_zalloc);

/*
 * bitmap_alloc_node - allocate storage for a bitmap of @nbits bits,
 * passing @node through to kmalloc_array_node().
 *
 * Requests BITS_TO_LONGS(@nbits) unsigned longs and returns whatever the
 * allocator returns.
 */
unsigned long *bitmap_alloc_node(unsigned int nbits, gfp_t flags, int node)
{
        unsigned long *bitmap;

        bitmap = kmalloc_array_node(BITS_TO_LONGS(nbits), sizeof(*bitmap),
                                    flags, node);

        return bitmap;
}
EXPORT_SYMBOL(bitmap_alloc_node);

/* bitmap_zalloc_node - same as bitmap_alloc_node(), with __GFP_ZERO added to @flags. */
unsigned long *bitmap_zalloc_node(unsigned int nbits, gfp_t flags, int node)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return bitmap_alloc_node(nbits, zeroing, node);
}
EXPORT_SYMBOL(bitmap_zalloc_node);

/*
 * bitmap_free - release a bitmap by handing @bitmap to kfree().
 * Counterpart of the bitmap_alloc() family above, which obtains its
 * memory from kmalloc_array()/kmalloc_array_node().
 */
void bitmap_free(const unsigned long *bitmap)
{
        kfree(bitmap);
}
EXPORT_SYMBOL(bitmap_free);

/*
 * Action callback registered by devm_bitmap_alloc(): @data is the bitmap
 * pointer that was passed to devm_add_action_or_reset().
 */
static void devm_bitmap_free(void *data)
{
        bitmap_free(data);
}

/*
 * devm_bitmap_alloc - allocate a bitmap and register devm_bitmap_free()
 * as an action on @dev for it.
 *
 * Returns the bitmap, or NULL if either the allocation or the action
 * registration fails.
 */
unsigned long *devm_bitmap_alloc(struct device *dev,
                                 unsigned int nbits, gfp_t flags)
{
        unsigned long *bitmap = bitmap_alloc(nbits, flags);

        if (!bitmap)
                return NULL;

        /*
         * NOTE(review): no explicit bitmap_free() on this failure path;
         * presumably the _or_reset variant has already run the action
         * (and thus freed the bitmap) -- confirm against devres docs.
         */
        if (devm_add_action_or_reset(dev, devm_bitmap_free, bitmap))
                return NULL;

        return bitmap;
}
EXPORT_SYMBOL_GPL(devm_bitmap_alloc);

/* devm_bitmap_zalloc - same as devm_bitmap_alloc(), with __GFP_ZERO added to @flags. */
unsigned long *devm_bitmap_zalloc(struct device *dev,
                                  unsigned int nbits, gfp_t flags)
{
        const gfp_t zeroing = flags | __GFP_ZERO;

        return devm_bitmap_alloc(dev, nbits, zeroing);
}
EXPORT_SYMBOL_GPL(devm_bitmap_zalloc);

#if BITS_PER_LONG == 64
/**
 * bitmap_from_arr32 - copy the contents of u32 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u32 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_from_arr32(unsigned long *bitmap, const u32 *buf, unsigned int nbits)
{
        const unsigned int halfwords = DIV_ROUND_UP(nbits, 32);
        unsigned int i;

        /* Two u32 elements make up one 64-bit word: low half first. */
        for (i = 0; i < halfwords; i += 2) {
                unsigned long *word = &bitmap[i / 2];

                *word = (unsigned long)buf[i];
                if (i + 1 < halfwords)
                        *word |= (unsigned long)buf[i + 1] << 32;
        }

        /* Drop whatever the last source element carried beyond @nbits. */
        if (nbits % BITS_PER_LONG)
                bitmap[(halfwords - 1) / 2] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr32);

/**
 * bitmap_to_arr32 - copy the contents of bitmap to a u32 array of bits
 *        @buf: array of u32 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_to_arr32(u32 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned int halfwords = DIV_ROUND_UP(nbits, 32);
        unsigned int i;

        /* Each 64-bit word yields up to two u32 elements: low half first. */
        for (i = 0; i < halfwords; i += 2) {
                buf[i] = (u32)(bitmap[i / 2] & UINT_MAX);
                if (i + 1 < halfwords)
                        buf[i + 1] = (u32)(bitmap[i / 2] >> 32);
        }

        /* Drop the bits of the last element that lie beyond @nbits. */
        if (nbits % BITS_PER_LONG)
                buf[halfwords - 1] &= (u32)(UINT_MAX >> ((-nbits) & 31));
}
EXPORT_SYMBOL(bitmap_to_arr32);
#endif

#if BITS_PER_LONG == 32
/**
 * bitmap_from_arr64 - copy the contents of u64 array of bits to bitmap
 *        @bitmap: array of unsigned longs, the destination bitmap
 *        @buf: array of u64 (in host byte order), the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_from_arr64(unsigned long *bitmap, const u64 *buf, unsigned int nbits)
{
        int remaining = nbits;

        /* Each u64 supplies up to two words of the bitmap: low half first. */
        while (remaining > 0) {
                const u64 chunk = *buf;

                buf++;
                *bitmap = chunk;
                bitmap++;

                /* The high half is needed only if more than 32 bits remain. */
                if (remaining > 32) {
                        *bitmap = chunk >> 32;
                        bitmap++;
                }

                remaining -= 64;
        }

        /*
         * Mask off the bits of the last word that lie beyond @nbits.
         *
         * @bitmap now points one word past the last word written, hence
         * the index of -1.  That is safe: the condition can only hold for
         * a non-zero @nbits, in which case at least one word was written.
         */
        if (nbits % BITS_PER_LONG)
                bitmap[-1] &= BITMAP_LAST_WORD_MASK(nbits);
}
EXPORT_SYMBOL(bitmap_from_arr64);

/**
 * bitmap_to_arr64 - copy the contents of bitmap to a u64 array of bits
 *        @buf: array of u64 (in host byte order), the dest bitmap
 *        @bitmap: array of unsigned longs, the source bitmap
 *        @nbits: number of bits in @bitmap
 */
void bitmap_to_arr64(u64 *buf, const unsigned long *bitmap, unsigned int nbits)
{
        const unsigned long *const stop = bitmap + BITS_TO_LONGS(nbits);

        /* Pack up to two words into each u64: low half first. */
        for (; bitmap < stop; buf++) {
                *buf = *bitmap;
                bitmap++;

                if (bitmap < stop) {
                        *buf |= (u64)(*bitmap) << 32;
                        bitmap++;
                }
        }

        /*
         * Mask off the bits of the last element that lie beyond @nbits.
         * @buf points one element past the last one written here.
         */
        if (nbits % 64)
                buf[-1] &= GENMASK_ULL((nbits - 1) % 64, 0);
}
EXPORT_SYMBOL(bitmap_to_arr64);
#endif
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_SCHED_GENERIC_H
#define __NET_SCHED_GENERIC_H

#include <linux/netdevice.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/pkt_sched.h>
#include <linux/pkt_cls.h>
#include <linux/percpu.h>
#include <linux/dynamic_queue_limits.h>
#include <linux/list.h>
#include <linux/refcount.h>
#include <linux/workqueue.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
#include <linux/hashtable.h>
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>
#include <linux/xarray.h>

struct Qdisc_ops;
struct qdisc_walker;
struct tcf_walker;
struct module;
struct bpf_flow_keys;

/*
 * struct qdisc_rate_table - a rate specification plus a 256-entry table
 * of u32 values, chained through @next and counted by @refcnt.
 * NOTE(review): presumably these tables are shared between users with
 * identical rate specs and the list is a global cache -- confirm in the
 * code that allocates and releases them.
 */
struct qdisc_rate_table {
        struct tc_ratespec rate;
        u32                data[256];
        struct qdisc_rate_table *next;
        int                refcnt;
};

/*
 * Bit numbers (not masks): the QDISC_STATE_* masks defined below are
 * built from them with BIT().
 * NOTE(review): presumably these index bits in Qdisc::state -- confirm.
 */
enum qdisc_state_t {
        __QDISC_STATE_SCHED,
        __QDISC_STATE_DEACTIVATED,
        __QDISC_STATE_MISSED,
        __QDISC_STATE_DRAINING,
};

/*
 * Bit numbers of a second state word.
 * NOTE(review): presumably these index bits in Qdisc::state2 -- confirm.
 */
enum qdisc_state2_t {
        /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly.
         * Use qdisc_run_begin/end() or qdisc_is_running() instead.
         */
        __QDISC_STATE2_RUNNING,
};

/* Mask forms of the corresponding __QDISC_STATE_* bit numbers. */
#define QDISC_STATE_MISSED        BIT(__QDISC_STATE_MISSED)
#define QDISC_STATE_DRAINING        BIT(__QDISC_STATE_DRAINING)

/* Union of the MISSED and DRAINING masks, for testing either in one go. */
#define QDISC_STATE_NON_EMPTY        (QDISC_STATE_MISSED | \
                                        QDISC_STATE_DRAINING)

/*
 * struct qdisc_size_table - size specification (@szopts) followed by a
 * variable-length table of u16 values in the flexible array @data.
 * The object carries an rcu_head, a list linkage and a plain int
 * reference count.
 * NOTE(review): presumably freed via RCU and kept on a global list of
 * size tables -- confirm in the code that manages them.
 */
struct qdisc_size_table {
        struct rcu_head                rcu;
        struct list_head        list;
        struct tc_sizespec        szopts;
        int                        refcnt;
        u16                        data[];
};

/*
 * Packet queue head used by qdiscs: @head and @tail pointers, the queue
 * length @qlen and a spinlock.
 * Similar to sk_buff_head, but skb->prev pointer is undefined.
 */
struct qdisc_skb_head {
        struct sk_buff        *head;
        struct sk_buff        *tail;
        __u32                qlen;
        spinlock_t        lock;
};

/*
 * struct Qdisc - state of one queueing discipline instance.
 *
 * The structure starts with the enqueue/dequeue hooks and the @flags
 * word; the TCQ_F_* constants defined inline below are the bits stored
 * in @flags.  Fields that are modified often are grouped from @gso_skb
 * onwards, which is cache-line aligned on SMP (see the comment in the
 * body); @busylock starts another cache-line aligned group.
 * @privdata is a trailing flexible array for discipline-private data.
 */
struct Qdisc {
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *sch);
        unsigned int                flags;
#define TCQ_F_BUILTIN                1
#define TCQ_F_INGRESS                2
#define TCQ_F_CAN_BYPASS        4
#define TCQ_F_MQROOT                8
#define TCQ_F_ONETXQUEUE        0x10 /* dequeue_skb() can assume all skbs are for
                                      * q->dev_queue : It can test
                                      * netif_xmit_frozen_or_stopped() before
                                      * dequeueing next packet.
                                      * Its true for MQ/MQPRIO slaves, or non
                                      * multiqueue device.
                                      */
#define TCQ_F_WARN_NONWC        (1 << 16)
#define TCQ_F_CPUSTATS                0x20 /* run using percpu statistics */
#define TCQ_F_NOPARENT                0x40 /* root of its hierarchy :
                                      * qdisc_tree_decrease_qlen() should stop.
                                      */
#define TCQ_F_INVISIBLE                0x80 /* invisible by default in dump */
#define TCQ_F_NOLOCK                0x100 /* qdisc does not require locking */
#define TCQ_F_OFFLOADED                0x200 /* qdisc is offloaded to HW */
        u32                        limit;
        const struct Qdisc_ops        *ops;
        struct qdisc_size_table        __rcu *stab;
        struct hlist_node       hash;
        u32                        handle;
        u32                        parent;

        struct netdev_queue        *dev_queue;

        struct net_rate_estimator __rcu *rate_est;
        struct gnet_stats_basic_sync __percpu *cpu_bstats;
        struct gnet_stats_queue        __percpu *cpu_qstats;
        int                        pad;
        refcount_t                refcnt;

        /*
         * For performance sake on SMP, we put highly modified fields at the end
         */
        struct sk_buff_head        gso_skb ____cacheline_aligned_in_smp;
        struct qdisc_skb_head        q;
        struct gnet_stats_basic_sync bstats;
        struct gnet_stats_queue        qstats;
        int                     owner;
        unsigned long                state;
        unsigned long                state2; /* must be written under qdisc spinlock */
        struct Qdisc            *next_sched;
        struct sk_buff_head        skb_bad_txq;

        spinlock_t                busylock ____cacheline_aligned_in_smp;
        spinlock_t                seqlock;

        struct rcu_head                rcu;
        netdevice_tracker        dev_tracker;
        struct lock_class_key        root_lock_key;
        /* private data */
        long privdata[] ____cacheline_aligned;
};

/* Take one reference on @qdisc.  Qdiscs flagged TCQ_F_BUILTIN are not
 * reference counted, so the call is a no-op for them.
 */
static inline void qdisc_refcount_inc(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN))
                refcount_inc(&qdisc->refcnt);
}

/* Drop the reference on @qdisc only if it is the last one.
 *
 * Returns true for TCQ_F_BUILTIN qdiscs (whose refcount is never touched),
 * otherwise whatever refcount_dec_if_one() reports.
 */
static inline bool qdisc_refcount_dec_if_one(struct Qdisc *qdisc)
{
        const bool builtin = qdisc->flags & TCQ_F_BUILTIN;

        return builtin || refcount_dec_if_one(&qdisc->refcnt);
}

/* Intended for unlocked users that may race with a concurrent qdisc
 * release.
 *
 * Returns @qdisc when a reference was obtained (or when the qdisc is
 * TCQ_F_BUILTIN and therefore not refcounted), NULL when the refcount had
 * already dropped to zero.
 */
static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_BUILTIN) &&
            !refcount_inc_not_zero(&qdisc->refcnt))
                return NULL;

        return qdisc;
}

/* Report whether @qdisc is currently being run.
 *
 * For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc
 * root_lock section, or provide their own memory barriers -- ordering
 * against qdisc_run_begin/end() atomic bit operations.
 *
 * For TCQ_F_NOLOCK qdisc the answer is whether seqlock is held.
 */
static inline bool qdisc_is_running(struct Qdisc *qdisc)
{
        if (!(qdisc->flags & TCQ_F_NOLOCK))
                return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);

        return spin_is_locked(&qdisc->seqlock);
}

/* True when neither the MISSED nor the DRAINING bit is set in a single
 * READ_ONCE() snapshot of @qdisc->state.
 */
static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc)
{
        unsigned long snapshot = READ_ONCE(qdisc->state);

        return (snapshot & QDISC_STATE_NON_EMPTY) == 0;
}

/* True when @q has TCQ_F_CPUSTATS set, i.e. runs using percpu statistics. */
static inline bool qdisc_is_percpu_stats(const struct Qdisc *q)
{
        return (q->flags & TCQ_F_CPUSTATS) != 0;
}

/* Emptiness test for @qdisc: qdiscs without percpu stats are judged by
 * q.qlen, percpu-stats qdiscs by their state bits.
 */
static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
{
        if (!qdisc_is_percpu_stats(qdisc))
                return READ_ONCE(qdisc->q.qlen) == 0;

        return nolock_qdisc_is_empty(qdisc);
}

/* Try to become the runner of @qdisc.
 *
 * For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with
 * the qdisc root lock acquired.
 *
 * Returns true when the caller now owns the running state (seqlock taken
 * for TCQ_F_NOLOCK, RUNNING bit newly set otherwise) and false when
 * somebody else already does.  On the TCQ_F_NOLOCK failure path the MISSED
 * bit is left set, which qdisc_run_end() checks after unlocking.
 */
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                /* Fast path: uncontended. */
                if (spin_trylock(&qdisc->seqlock))
                        return true;

                /* No need to insist if the MISSED flag was already set.
                 * Note that test_and_set_bit() also gives us memory ordering
                 * guarantees wrt potential earlier enqueue() and below
                 * spin_trylock(), both of which are necessary to prevent races
                 */
                if (test_and_set_bit(__QDISC_STATE_MISSED, &qdisc->state))
                        return false;

                /* Try to take the lock again to make sure that we will either
                 * grab it or the CPU that still has it will see MISSED set
                 * when testing it in qdisc_run_end()
                 */
                return spin_trylock(&qdisc->seqlock);
        }
        /* Non-atomic bitop: state2 is only written under the qdisc spinlock. */
        return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
}

/* Give up the running state obtained through qdisc_run_begin().
 *
 * TCQ_F_NOLOCK: release seqlock, then reschedule the qdisc through
 * __netif_schedule() if MISSED is set.
 * Otherwise: clear the RUNNING bit in state2.
 */
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
        if (qdisc->flags & TCQ_F_NOLOCK) {
                spin_unlock(&qdisc->seqlock);

                /* spin_unlock() only has store-release semantic. The unlock
                 * and test_bit() ordering is a store-load ordering, so a full
                 * memory barrier is needed here.
                 */
                smp_mb();

                if (unlikely(test_bit(__QDISC_STATE_MISSED,
                                      &qdisc->state)))
                        __netif_schedule(qdisc);
        } else {
                /* Non-atomic bitop, matching the one in qdisc_run_begin(). */
                __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2);
        }
}

/* True when @qdisc has TCQ_F_ONETXQUEUE set. */
static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
{
        return (qdisc->flags & TCQ_F_ONETXQUEUE) != 0;
}

/* Thin wrapper: forwards to netdev_queue_dql_avail() for @txq. */
static inline int qdisc_avail_bulklimit(const struct netdev_queue *txq)
{
        int avail = netdev_queue_dql_avail(txq);

        return avail;
}

/* Class-level callback table, referenced from Qdisc_ops::cl_ops.
 * The unsigned long arguments appear to be the opaque class value that
 * find() returns -- NOTE(review): confirm against the implementations.
 */
struct Qdisc_class_ops {
        unsigned int                flags;        /* enum qdisc_class_ops_flags */
        /* Child qdisc manipulation */
        struct netdev_queue *        (*select_queue)(struct Qdisc *, struct tcmsg *);
        int                        (*graft)(struct Qdisc *, unsigned long cl,
                                        struct Qdisc *, struct Qdisc **,
                                        struct netlink_ext_ack *extack);
        struct Qdisc *                (*leaf)(struct Qdisc *, unsigned long cl);
        void                        (*qlen_notify)(struct Qdisc *, unsigned long);

        /* Class manipulation routines */
        unsigned long                (*find)(struct Qdisc *, u32 classid);
        int                        (*change)(struct Qdisc *, u32, u32,
                                        struct nlattr **, unsigned long *,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct Qdisc *, unsigned long,
                                          struct netlink_ext_ack *);
        void                        (*walk)(struct Qdisc *, struct qdisc_walker * arg);

        /* Filter manipulation */
        struct tcf_block *        (*tcf_block)(struct Qdisc *sch,
                                             unsigned long arg,
                                             struct netlink_ext_ack *extack);
        unsigned long                (*bind_tcf)(struct Qdisc *, unsigned long,
                                        u32 classid);
        void                        (*unbind_tcf)(struct Qdisc *, unsigned long);

        /* rtnetlink specific */
        int                        (*dump)(struct Qdisc *, unsigned long,
                                        struct sk_buff *skb, struct tcmsg*);
        int                        (*dump_stats)(struct Qdisc *, unsigned long,
                                        struct gnet_dump *);
};

/* Flag values for Qdisc_class_ops::flags. */

enum qdisc_class_ops_flags {
        /* The class ops implement an API that doesn't require the rtnl lock. */
        QDISC_CLASS_OPS_DOIT_UNLOCKED = 1,
};

/* Per-discipline operations table, pointed to by Qdisc::ops.
 * The enqueue/dequeue members have the same signatures as the function
 * pointers of the same name in struct Qdisc.
 */
struct Qdisc_ops {
        struct Qdisc_ops        *next;
        const struct Qdisc_class_ops        *cl_ops;
        char                        id[IFNAMSIZ];
        int                        priv_size;
        unsigned int                static_flags;

        /* Datapath */
        int                         (*enqueue)(struct sk_buff *skb,
                                           struct Qdisc *sch,
                                           struct sk_buff **to_free);
        struct sk_buff *        (*dequeue)(struct Qdisc *);
        struct sk_buff *        (*peek)(struct Qdisc *);

        /* Lifecycle and configuration */
        int                        (*init)(struct Qdisc *sch, struct nlattr *arg,
                                        struct netlink_ext_ack *extack);
        void                        (*reset)(struct Qdisc *);
        void                        (*destroy)(struct Qdisc *);
        int                        (*change)(struct Qdisc *sch,
                                          struct nlattr *arg,
                                          struct netlink_ext_ack *extack);
        void                        (*attach)(struct Qdisc *sch);
        int                        (*change_tx_queue_len)(struct Qdisc *, unsigned int);
        void                        (*change_real_num_tx)(struct Qdisc *sch,
                                                      unsigned int new_real_tx);

        /* Dumping */
        int                        (*dump)(struct Qdisc *, struct sk_buff *);
        int                        (*dump_stats)(struct Qdisc *, struct gnet_dump *);

        /* Block index accessors */
        void                        (*ingress_block_set)(struct Qdisc *sch,
                                                     u32 block_index);
        void                        (*egress_block_set)(struct Qdisc *sch,
                                                    u32 block_index);
        u32                        (*ingress_block_get)(struct Qdisc *sch);
        u32                        (*egress_block_get)(struct Qdisc *sch);

        struct module                *owner;
};

/* Output parameter of the classify() callbacks.  The two views share
 * storage: either a (class, classid) pair or a goto_tp pointer.
 */
struct tcf_result {
        union {
                struct {
                        unsigned long        class;
                        u32                classid;
                };
                const struct tcf_proto *goto_tp;
        };
};

struct tcf_chain;

/* Per-classifier-kind operations table, pointed to by tcf_proto::ops.
 * classify() has the same signature as tcf_proto::classify.
 */
struct tcf_proto_ops {
        struct list_head        head;
        char                        kind[IFNAMSIZ];

        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        int                        (*init)(struct tcf_proto*);
        void                        (*destroy)(struct tcf_proto *tp, bool rtnl_held,
                                           struct netlink_ext_ack *extack);

        /* Filter handle management */
        void*                        (*get)(struct tcf_proto*, u32 handle);
        void                        (*put)(struct tcf_proto *tp, void *f);
        int                        (*change)(struct net *net, struct sk_buff *,
                                        struct tcf_proto*, unsigned long,
                                        u32 handle, struct nlattr **,
                                        void **, u32,
                                        struct netlink_ext_ack *);
        int                        (*delete)(struct tcf_proto *tp, void *arg,
                                          bool *last, bool rtnl_held,
                                          struct netlink_ext_ack *);
        /* Expected from TCF_PROTO_OPS_DOIT_UNLOCKED classifiers. */
        bool                        (*delete_empty)(struct tcf_proto *tp);
        void                        (*walk)(struct tcf_proto *tp,
                                        struct tcf_walker *arg, bool rtnl_held);
        /* Offload hooks */
        int                        (*reoffload)(struct tcf_proto *tp, bool add,
                                             flow_setup_cb_t *cb, void *cb_priv,
                                             struct netlink_ext_ack *extack);
        void                        (*hw_add)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*hw_del)(struct tcf_proto *tp,
                                          void *type_data);
        void                        (*bind_class)(void *, u32, unsigned long,
                                              void *, unsigned long);
        /* Chain template hooks */
        void *                        (*tmplt_create)(struct net *net,
                                                struct tcf_chain *chain,
                                                struct nlattr **tca,
                                                struct netlink_ext_ack *extack);
        void                        (*tmplt_destroy)(void *tmplt_priv);
        void                        (*tmplt_reoffload)(struct tcf_chain *chain,
                                                   bool add,
                                                   flow_setup_cb_t *cb,
                                                   void *cb_priv);
        struct tcf_exts *        (*get_exts)(const struct tcf_proto *tp,
                                            u32 handle);

        /* rtnetlink specific */
        int                        (*dump)(struct net*, struct tcf_proto*, void *,
                                        struct sk_buff *skb, struct tcmsg*,
                                        bool);
        int                        (*terse_dump)(struct net *net,
                                              struct tcf_proto *tp, void *fh,
                                              struct sk_buff *skb,
                                              struct tcmsg *t, bool rtnl_held);
        int                        (*tmplt_dump)(struct sk_buff *skb,
                                              struct net *net,
                                              void *tmplt_priv);

        struct module                *owner;
        int                        flags;        /* enum tcf_proto_ops_flags */
};

/* Flag values for tcf_proto_ops->flags.
 *
 * Classifiers setting TCF_PROTO_OPS_DOIT_UNLOCKED in tcf_proto_ops->flags
 * are expected to implement tcf_proto_ops->delete_empty(), otherwise race
 * conditions can occur when filters are inserted/deleted simultaneously.
 */
enum tcf_proto_ops_flags {
        TCF_PROTO_OPS_DOIT_UNLOCKED = 1,
};

/* One classifier instance.  Instances are linked through the __rcu
 * 'next' pointer; struct tcf_chain::filter_chain points at a tcf_proto.
 */
struct tcf_proto {
        /* Fast access part */
        struct tcf_proto __rcu        *next;
        void __rcu                *root;

        /* called under RCU BH lock */
        int                        (*classify)(struct sk_buff *,
                                            const struct tcf_proto *,
                                            struct tcf_result *);
        __be16                        protocol;

        /* All the rest */
        u32                        prio;
        void                        *data;
        const struct tcf_proto_ops        *ops;
        struct tcf_chain        *chain;
        /* Lock protects tcf_proto shared state and can be used by unlocked
         * classifiers to protect their private data.
         */
        spinlock_t                lock;
        bool                        deleting;
        bool                        counted;
        bool                        usesw;
        refcount_t                refcnt;
        struct rcu_head                rcu;
        struct hlist_node        destroy_ht_node;
};

/* Layout the qdisc layer imposes on skb->cb (see qdisc_skb_cb()).
 * The fixed fields come first, followed by QDISC_CB_PRIV_LEN bytes of
 * private scratch space; qdisc_cb_private_validate() checks requested
 * sizes against data[] at build time.
 */
struct qdisc_skb_cb {
        struct {
                unsigned int                pkt_len;        /* returned by qdisc_pkt_len() */
                u16                        slave_dev_queue_mapping;
                u16                        tc_classid;
        };
#define QDISC_CB_PRIV_LEN 20
        unsigned char                data[QDISC_CB_PRIV_LEN];
};

/* Callback type taking a chain head (@tp_head) and caller-private data.
 * NOTE(review): presumably invoked when a chain's head filter changes;
 * the call sites are not visible here.
 */
typedef void tcf_chain_head_change_t(struct tcf_proto *tp_head, void *priv);

/* A chain of filters belonging to a tcf_block.  Per the comment in
 * struct tcf_block, refcnt, action_refcnt and explicitly_created are
 * protected by the owning block's lock.
 */
struct tcf_chain {
        /* Protects filter_chain. */
        struct mutex filter_chain_lock;
        struct tcf_proto __rcu *filter_chain;
        struct list_head list;
        struct tcf_block *block;
        u32 index; /* chain index */
        unsigned int refcnt;
        unsigned int action_refcnt;
        bool explicitly_created;
        bool flushing;
        const struct tcf_proto_ops *tmplt_ops;
        void *tmplt_priv;
        struct rcu_head rcu;
};

/* A block of filter chains (chain_list), with its own offload
 * bookkeeping and a hashtable of protos pending destruction.
 */
struct tcf_block {
        struct xarray ports; /* datapath accessible */
        /* Lock protects tcf_block and lifetime-management data of chains
         * attached to the block (refcnt, action_refcnt, explicitly_created).
         */
        struct mutex lock;
        struct list_head chain_list;
        u32 index; /* block index for shared blocks */
        u32 classid; /* which class this block belongs to */
        refcount_t refcnt;
        struct net *net;
        struct Qdisc *q;
        struct rw_semaphore cb_lock; /* protects cb_list and offload counters */
        struct flow_block flow_block;
        struct list_head owner_list;
        bool keep_dst;
        atomic_t useswcnt;
        atomic_t offloadcnt; /* Number of offloaded filters */
        unsigned int nooffloaddevcnt; /* Number of devs unable to do offload */
        unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */
        struct {
                struct tcf_chain *chain;
                struct list_head filter_chain_list;
        } chain0;
        struct rcu_head rcu;
        DECLARE_HASHTABLE(proto_destroy_ht, 7);
        struct mutex proto_destroy_lock; /* Lock for proto_destroy hashtable. */
};

/* Look up a tcf_block in @net by @block_index (defined elsewhere). */
struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index);

/* Lockdep predicate: is @chain->filter_chain_lock held?  Used as the
 * condition of tcf_chain_dereference().
 */
static inline bool lockdep_tcf_chain_is_locked(struct tcf_chain *chain)
{
        struct mutex *chain_lock = &chain->filter_chain_lock;

        return lockdep_is_held(chain_lock);
}

static inline bool lockdep_tcf_proto_is_locked(struct tcf_proto *tp)
{
        return lockdep_is_held(&tp->lock);
}

/* Dereference the RCU pointer @p; the lockdep condition is that
 * @chain->filter_chain_lock is held.
 */
#define tcf_chain_dereference(p, chain)                                        \
        rcu_dereference_protected(p, lockdep_tcf_chain_is_locked(chain))

/* Dereference the RCU pointer @p; the lockdep condition is that
 * @tp->lock is held.
 */
#define tcf_proto_dereference(p, tp)                                        \
        rcu_dereference_protected(p, lockdep_tcf_proto_is_locked(tp))

static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
{
        struct qdisc_skb_cb *qcb;

        BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*qcb));
        BUILD_BUG_ON(sizeof(qcb->data) < sz);
}

static inline int qdisc_qlen(const struct Qdisc *q)
{
        return q->q.qlen;
}

static inline int qdisc_qlen_sum(const struct Qdisc *q)
{
        __u32 qlen = q->qstats.qlen;
        int i;

        if (qdisc_is_percpu_stats(q)) {
                for_each_possible_cpu(i)
                        qlen += per_cpu_ptr(q->cpu_qstats, i)->qlen;
        } else {
                qlen += q->q.qlen;
        }

        return qlen;
}

/* View @skb's control buffer as a struct qdisc_skb_cb. */
static inline struct qdisc_skb_cb *qdisc_skb_cb(const struct sk_buff *skb)
{
        const void *cb = skb->cb;

        return (struct qdisc_skb_cb *)cb;
}

static inline spinlock_t *qdisc_lock(struct Qdisc *qdisc)
{
        return &qdisc->q.lock;
}

/* Qdisc currently installed on @qdisc's device queue, fetched with
 * rcu_dereference_rtnl().
 */
static inline struct Qdisc *qdisc_root(const struct Qdisc *qdisc)
{
        return rcu_dereference_rtnl(qdisc->dev_queue->qdisc);
}

/* Same lookup as qdisc_root(), but fetched with rcu_dereference_bh(). */
static inline struct Qdisc *qdisc_root_bh(const struct Qdisc *qdisc)
{
        struct Qdisc *root = rcu_dereference_bh(qdisc->dev_queue->qdisc);

        return root;
}

/* The qdisc_sleeping pointer of @qdisc's device queue, fetched with
 * rcu_dereference_rtnl().
 */
static inline struct Qdisc *qdisc_root_sleeping(const struct Qdisc *qdisc)
{
        struct Qdisc *sleeping = rcu_dereference_rtnl(qdisc->dev_queue->qdisc_sleeping);

        return sleeping;
}

/* Lock of the sleeping root qdisc of @qdisc's device queue.
 * Asserts that RTNL is held.
 */
static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
{
        struct Qdisc *sleeping_root = qdisc_root_sleeping(qdisc);
        spinlock_t *root_lock;

        ASSERT_RTNL();
        root_lock = qdisc_lock(sleeping_root);

        return root_lock;
}

static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
{
        return qdisc->dev_queue->dev;
}

static inline void sch_tree_lock(struct Qdisc *q)
{
        if (q->flags & TCQ_F_MQROOT)
                spin_lock_bh(qdisc_lock(q));
        else
                spin_lock_bh(qdisc_root_sleeping_lock(q));
}

static inline void sch_tree_unlock(struct Qdisc *q)
{
        if (q->flags & TCQ_F_MQROOT)
                spin_unlock_bh(qdisc_lock(q));
        else
                spin_unlock_bh(qdisc_root_sleeping_lock(q));
}

/* Qdisc instances and ops tables defined elsewhere. */
extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1];
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
/* Ops handed out by get_default_qdisc_ops() for in-range tx queues. */
extern const struct Qdisc_ops *default_qdisc_ops;
/* Pick the ops for tx queue index @ntx of @dev: default_qdisc_ops when
 * @ntx is below dev->real_num_tx_queues, pfifo_fast_ops otherwise.
 */
static inline const struct Qdisc_ops *
get_default_qdisc_ops(const struct net_device *dev, int ntx)
{
        if (ntx < dev->real_num_tx_queues)
                return default_qdisc_ops;

        return &pfifo_fast_ops;
}

/* Common part of a class, as stored in a Qdisc_class_hash. */
struct Qdisc_class_common {
        u32                        classid;        /* lookup key for qdisc_class_find() */
        unsigned int                filter_cnt;        /* see qdisc_class_get()/_put() */
        struct hlist_node        hnode;                /* linkage in a hash bucket */
};

/* Hash table of Qdisc_class_common entries keyed by classid. */
struct Qdisc_class_hash {
        struct hlist_head        *hash;                /* array of buckets */
        unsigned int                hashsize;
        unsigned int                hashmask;        /* mask given to qdisc_class_hash() */
        unsigned int                hashelems;
};

/* Fold @id with two xor-shift steps (by 8, then by 4) and reduce it
 * with @mask to obtain a bucket index.
 */
static inline unsigned int qdisc_class_hash(u32 id, u32 mask)
{
        u32 folded = id ^ (id >> 8);

        folded ^= folded >> 4;

        return folded & mask;
}

/* Find the class with classid @id in @hash.
 *
 * Returns the matching entry, or NULL when @id is 0 or no entry matches.
 */
static inline struct Qdisc_class_common *
qdisc_class_find(const struct Qdisc_class_hash *hash, u32 id)
{
        struct Qdisc_class_common *entry;
        struct hlist_head *bucket;

        if (id == 0)
                return NULL;

        bucket = &hash->hash[qdisc_class_hash(id, hash->hashmask)];
        hlist_for_each_entry(entry, bucket, hnode) {
                if (entry->classid != id)
                        continue;
                return entry;
        }

        return NULL;
}

/* True while @cl's filter_cnt is non-zero. */
static inline bool qdisc_class_in_use(const struct Qdisc_class_common *cl)
{
        return cl->filter_cnt != 0;
}

/* Increment @cl->filter_cnt.  An overflow triggers a WARN; the wrapped
 * result is stored regardless.
 */
static inline void qdisc_class_get(struct Qdisc_class_common *cl)
{
        unsigned int next;

        WARN(check_add_overflow(cl->filter_cnt, 1, &next),
             "Qdisc class overflow");

        cl->filter_cnt = next;
}

/* Decrement @cl->filter_cnt.  An underflow triggers a WARN; the wrapped
 * result is stored regardless.
 */
static inline void qdisc_class_put(struct Qdisc_class_common *cl)
{
        unsigned int next;

        WARN(check_sub_overflow(cl->filter_cnt, 1, &next),
             "Qdisc class underflow");

        cl->filter_cnt = next;
}

/* Map @classid to a traffic class index of @dev.
 *
 * The index is the minor part of @classid minus TC_H_MIN_PRIORITY.
 * Returns it when it is below netdev_get_num_tc(@dev), else -EINVAL.
 */
static inline int tc_classid_to_hwtc(struct net_device *dev, u32 classid)
{
        u32 hwtc = TC_H_MIN(classid) - TC_H_MIN_PRIORITY;

        if (hwtc >= netdev_get_num_tc(dev))
                return -EINVAL;

        return hwtc;
}

/* Qdisc_class_hash management (defined elsewhere). */
int qdisc_class_hash_init(struct Qdisc_class_hash *);
void qdisc_class_hash_insert(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_remove(struct Qdisc_class_hash *,
                             struct Qdisc_class_common *);
void qdisc_class_hash_grow(struct Qdisc *, struct Qdisc_class_hash *);
void qdisc_class_hash_destroy(struct Qdisc_class_hash *);

/* Device-level scheduler entry points (defined elsewhere). */
int dev_qdisc_change_tx_queue_len(struct net_device *dev);
void dev_qdisc_change_real_num_tx(struct net_device *dev,
                                  unsigned int new_real_tx);
void dev_init_scheduler(struct net_device *dev);
void dev_shutdown(struct net_device *dev);
void dev_activate(struct net_device *dev);
void dev_deactivate(struct net_device *dev);
void dev_deactivate_many(struct list_head *head);
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
                              struct Qdisc *qdisc);
/* Qdisc reset and release (defined elsewhere). */
void qdisc_reset(struct Qdisc *qdisc);
void qdisc_destroy(struct Qdisc *qdisc);
void qdisc_put(struct Qdisc *qdisc);
void qdisc_put_unlocked(struct Qdisc *qdisc);
void qdisc_tree_reduce_backlog(struct Qdisc *qdisc, int n, int len);
/* Offload helpers: real implementations with CONFIG_NET_SCHED, inline
 * stubs without it.
 */
#ifdef CONFIG_NET_SCHED
int qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                              void *type_data);
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                                struct Qdisc *new, struct Qdisc *old,
                                enum tc_setup_type type, void *type_data,
                                struct netlink_ext_ack *extack);
#else
/* Stub: clears TCQ_F_OFFLOADED on @q and reports success. */
static inline int
qdisc_offload_dump_helper(struct Qdisc *q, enum tc_setup_type type,
                          void *type_data)
{
        q->flags &= ~TCQ_F_OFFLOADED;
        return 0;
}

/* Stub: does nothing. */
static inline void
qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
                           struct Qdisc *new, struct Qdisc *old,
                           enum tc_setup_type type, void *type_data,
                           struct netlink_ext_ack *extack)
{
}
#endif
/* Offload capability query (defined elsewhere). */
void qdisc_offload_query_caps(struct net_device *dev,
                              enum tc_setup_type type,
                              void *caps, size_t caps_len);
/* Qdisc allocation and release (defined elsewhere). */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
                          const struct Qdisc_ops *ops,
                          struct netlink_ext_ack *extack);
void qdisc_free(struct Qdisc *qdisc);
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
                                const struct Qdisc_ops *ops, u32 parentid,
                                struct netlink_ext_ack *extack);
/* Out-of-line worker behind qdisc_calculate_pkt_len(). */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
                               const struct qdisc_size_table *stab);
int skb_do_redirect(struct sk_buff *);

/* Value of skb->tc_at_ingress; always false without CONFIG_NET_XGRESS. */
static inline bool skb_at_tc_ingress(const struct sk_buff *skb)
{
        bool at_ingress = false;

#ifdef CONFIG_NET_XGRESS
        at_ingress = skb->tc_at_ingress;
#endif
        return at_ingress;
}

/* Test-and-clear of skb->tc_skip_classify.
 *
 * Returns true (after clearing the flag) when it was set; false
 * otherwise, and always false without CONFIG_NET_CLS_ACT.
 */
static inline bool skb_skip_tc_classify(struct sk_buff *skb)
{
#ifdef CONFIG_NET_CLS_ACT
        if (!skb->tc_skip_classify)
                return false;

        skb->tc_skip_classify = 0;
        return true;
#else
        return false;
#endif
}

/* Reset all TX qdiscs greater than index of a device.  */
static inline void qdisc_reset_all_tx_gt(struct net_device *dev, unsigned int i)
{
        struct Qdisc *qdisc;

        for (; i < dev->num_tx_queues; i++) {
                qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                if (qdisc) {
                        spin_lock_bh(qdisc_lock(qdisc));
                        qdisc_reset(qdisc);
                        spin_unlock_bh(qdisc_lock(qdisc));
                }
        }
}

/* Are all TX queues of the device empty?  Walks the queues under
 * rcu_read_lock() and stops at the first non-empty qdisc.
 */
static inline bool qdisc_all_tx_empty(const struct net_device *dev)
{
        bool all_empty = true;
        unsigned int i;

        rcu_read_lock();
        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
                const struct Qdisc *q = rcu_dereference(txq->qdisc);

                if (!qdisc_is_empty(q)) {
                        all_empty = false;
                        break;
                }
        }
        rcu_read_unlock();

        return all_empty;
}

/* Are any of the TX qdiscs changing?  A queue counts as changing when
 * its qdisc pointer differs from its qdisc_sleeping pointer.
 */
static inline bool qdisc_tx_changing(const struct net_device *dev)
{
        unsigned int i;

        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
                bool same = rcu_access_pointer(txq->qdisc) ==
                            rcu_access_pointer(txq->qdisc_sleeping);

                if (!same)
                        return true;
        }

        return false;
}

/* Is the device using the noop qdisc on all queues?  Also true for a
 * device with zero TX queues, since the loop body never runs.
 */
static inline bool qdisc_tx_is_noop(const struct net_device *dev)
{
        unsigned int i;

        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
                bool is_noop = rcu_access_pointer(txq->qdisc) == &noop_qdisc;

                if (!is_noop)
                        return false;
        }

        return true;
}

static inline unsigned int qdisc_pkt_len(const struct sk_buff *skb)
{
        return qdisc_skb_cb(skb)->pkt_len;
}

/* Additional qdisc xmit flags (NET_XMIT_MASK in linux/netdevice.h).
 * __NET_XMIT_STOLEN is tested by net_xmit_drop_count().
 */
enum net_xmit_qdisc_t {
        __NET_XMIT_STOLEN = 0x00010000,
        __NET_XMIT_BYPASS = 0x00020000,
};

/* Number of drops to account for xmit result @e: 0 when
 * __NET_XMIT_STOLEN is set (only checked with CONFIG_NET_CLS_ACT),
 * 1 otherwise.
 */
#ifdef CONFIG_NET_CLS_ACT
#define net_xmit_drop_count(e)        ((e) & __NET_XMIT_STOLEN ? 0 : 1)
#else
#define net_xmit_drop_count(e)        (1)
#endif

/* When @sch has a size table, hand @skb to __qdisc_calculate_pkt_len().
 * Compiles to nothing without CONFIG_NET_SCHED.
 */
static inline void qdisc_calculate_pkt_len(struct sk_buff *skb,
                                           const struct Qdisc *sch)
{
#ifdef CONFIG_NET_SCHED
        struct qdisc_size_table *table = rcu_dereference_bh(sch->stab);

        if (!table)
                return;

        __qdisc_calculate_pkt_len(skb, table);
#endif
}

/* Invoke @sch's enqueue callback on @skb and return its result. */
static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                                struct sk_buff **to_free)
{
        int ret = sch->enqueue(skb, sch, to_free);

        return ret;
}

/* Add @bytes and @packets to @bstats inside a u64_stats update section. */
static inline void _bstats_update(struct gnet_stats_basic_sync *bstats,
                                  __u64 bytes, __u64 packets)
{
        struct u64_stats_sync *sync = &bstats->syncp;

        u64_stats_update_begin(sync);
        u64_stats_add(&bstats->bytes, bytes);
        u64_stats_add(&bstats->packets, packets);
        u64_stats_update_end(sync);
}

static inline void bstats_update(struct gnet_stats_basic_sync *bstats,
                                 const struct sk_buff *skb)
{
        _bstats_update(bstats,
                       qdisc_pkt_len(skb),
                       skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1);
}

/* Account @skb in this CPU's slot of @sch->cpu_bstats. */
static inline void qdisc_bstats_cpu_update(struct Qdisc *sch,
                                           const struct sk_buff *skb)
{
        struct gnet_stats_basic_sync *local = this_cpu_ptr(sch->cpu_bstats);

        bstats_update(local, skb);
}

/* Account @skb in @sch's own (non-percpu) bstats. */
static inline void qdisc_bstats_update(struct Qdisc *sch,
                                       const struct sk_buff *skb)
{
        struct gnet_stats_basic_sync *stats = &sch->bstats;

        bstats_update(stats, skb);
}

/* Subtract @skb's packet length from @sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_dec(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog -= len;
}

/* Subtract @skb's packet length from this CPU's percpu backlog counter. */
static inline void qdisc_qstats_cpu_backlog_dec(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        this_cpu_sub(sch->cpu_qstats->backlog, len);
}

/* Add @skb's packet length to @sch->qstats.backlog. */
static inline void qdisc_qstats_backlog_inc(struct Qdisc *sch,
                                            const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        sch->qstats.backlog += len;
}

/* Add @skb's packet length to this CPU's percpu backlog counter. */
static inline void qdisc_qstats_cpu_backlog_inc(struct Qdisc *sch,
                                                const struct sk_buff *skb)
{
        unsigned int len = qdisc_pkt_len(skb);

        this_cpu_add(sch->cpu_qstats->backlog, len);
}

/* Bump this CPU's percpu qlen counter of @sch by one. */
static inline void qdisc_qstats_cpu_qlen_inc(struct Qdisc *sch)
{
        this_cpu_add(sch->cpu_qstats->qlen, 1);
}

/* Lower this CPU's percpu qlen counter of @sch by one. */
static inline void qdisc_qstats_cpu_qlen_dec(struct Qdisc *sch)
{
        this_cpu_sub(sch->cpu_qstats->qlen, 1);
}

/* Bump this CPU's percpu requeues counter of @sch by one. */
static inline void qdisc_qstats_cpu_requeues_inc(struct Qdisc *sch)
{
        this_cpu_add(sch->cpu_qstats->requeues, 1);
}

static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count)
{
        sch->qstats.drops += count;
}

/* Count one drop in @qstats. */
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
        qstats->drops += 1;
}

/* Count one overlimit event in @qstats. */
static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats)
{
        qstats->overlimits += 1;
}

/* Count one drop in @sch's own (non-percpu) queue stats. */
static inline void qdisc_qstats_drop(struct Qdisc *sch)
{
        struct gnet_stats_queue *stats = &sch->qstats;

        qstats_drop_inc(stats);
}

/* Count one drop in this CPU's percpu queue stats of @sch. */
static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch)
{
        this_cpu_add(sch->cpu_qstats->drops, 1);
}

/* Count one overlimit event in @sch's own (non-percpu) queue stats. */
static inline void qdisc_qstats_overlimit(struct Qdisc *sch)
{
        qstats_overlimit_inc(&sch->qstats);
}

/* Copy @sch's queue statistics into dump @d, passing the summed queue
 * length from qdisc_qlen_sum().  Returns gnet_stats_copy_queue()'s result.
 */
static inline int qdisc_qstats_copy(struct gnet_dump *d, struct Qdisc *sch)
{
        __u32 total_qlen;

        total_qlen = qdisc_qlen_sum(sch);

        return gnet_stats_copy_queue(d, sch->cpu_qstats, &sch->qstats,
                                     total_qlen);
}

/* Report @sch's current queue length and backlog.
 *
 * Stats are accumulated into a zeroed local via gnet_stats_add_queue();
 * *@qlen additionally includes qdisc_qlen(@sch).
 */
static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch,  __u32 *qlen,
                                             __u32 *backlog)
{
        struct gnet_stats_queue sum = { 0 };

        gnet_stats_add_queue(&sum, sch->cpu_qstats, &sch->qstats);

        *qlen = sum.qlen + qdisc_qlen(sch);
        *backlog = sum.backlog;
}

/* Pass @sch's whole current qlen and backlog to
 * qdisc_tree_reduce_backlog().
 */
static inline void qdisc_tree_flush_backlog(struct Qdisc *sch)
{
        __u32 pkts, bytes;

        qdisc_qstats_qlen_backlog(sch, &pkts, &bytes);
        qdisc_tree_reduce_backlog(sch, pkts, bytes);
}

/* Reset @sch, then pass the qlen/backlog it held to
 * qdisc_tree_reduce_backlog().  The counters are sampled before the
 * reset.
 */
static inline void qdisc_purge_queue(struct Qdisc *sch)
{
        __u32 pkts, bytes;

        qdisc_qstats_qlen_backlog(sch, &pkts, &bytes);
        qdisc_reset(sch);
        qdisc_tree_reduce_backlog(sch, pkts, bytes);
}

/* Append @skb to the singly linked list @qh and bump qlen.
 *
 * skb->next is only cleared when the list was non-empty; on an empty
 * list it is left as the caller passed it.
 */
static inline void __qdisc_enqueue_tail(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        struct sk_buff *old_tail = qh->tail;

        if (!old_tail) {
                qh->head = skb;
        } else {
                skb->next = NULL;
                old_tail->next = skb;
        }
        qh->tail = skb;
        qh->qlen++;
}

/*
 * Append @skb to @sch's own queue (sch->q) and add it to the backlog
 * statistics.  Always returns NET_XMIT_SUCCESS.
 */
static inline int qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch)
{
        __qdisc_enqueue_tail(skb, &sch->q);
        qdisc_qstats_backlog_inc(sch, skb);
        return NET_XMIT_SUCCESS;
}

static inline void __qdisc_enqueue_head(struct sk_buff *skb,
                                        struct qdisc_skb_head *qh)
{
        skb->next = qh->head;

        if (!qh->head)
                qh->tail = skb;
        qh->head = skb;
        qh->qlen++;
}

/*
 * Unlink and return the first skb of the list described by @qh, or NULL
 * when the list is empty.
 *
 * On success qh->qlen is decremented, qh->tail is cleared if the list
 * became empty, and the returned skb has its ->next pointer reset to NULL.
 */
static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh)
{
        struct sk_buff *first = qh->head;

        if (unlikely(first == NULL))
                return NULL;

        qh->head = first->next;
        qh->qlen--;
        if (!qh->head)
                qh->tail = NULL;

        /* Detach the skb from whatever is still queued behind it. */
        first->next = NULL;
        return first;
}

/*
 * Dequeue the first skb of @sch's own queue (sch->q).
 *
 * Returns NULL when the queue is empty.  Otherwise the skb is removed from
 * the backlog statistics and accounted via qdisc_bstats_update() before
 * being returned.
 */
static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch)
{
        struct sk_buff *head = __qdisc_dequeue_head(&sch->q);

        if (unlikely(!head))
                return NULL;

        qdisc_qstats_backlog_dec(sch, head);
        qdisc_bstats_update(sch, head);
        return head;
}

/*
 * Per-skb control block layout used by the tc_skb_cb() accessor, which
 * overlays it on skb->cb.  The accessor contains a BUILD_BUG_ON() that
 * keeps this structure from growing beyond the size of skb->cb.
 */
struct tc_skb_cb {
        struct qdisc_skb_cb qdisc_cb;   /* must stay the first member */
        u32 drop_reason;                /* see tcf_{get,set}_drop_reason() */

        u16 zone; /* Only valid if post_ct = true */
        u16 mru;
        u8 post_ct:1;
        u8 post_ct_snat:1;
        u8 post_ct_dnat:1;
};

/*
 * Return @skb's control buffer viewed as a struct tc_skb_cb.
 *
 * The build fails if struct tc_skb_cb no longer fits into skb->cb.  The
 * returned pointer is not const-qualified even though @skb is.
 */
static inline struct tc_skb_cb *tc_skb_cb(const struct sk_buff *skb)
{
        BUILD_BUG_ON(sizeof(struct tc_skb_cb) >
                     sizeof_field(struct sk_buff, cb));

        return (struct tc_skb_cb *)skb->cb;
}

static inline enum skb_drop_reason
tcf_get_drop_reason(const struct sk_buff *skb)
{
        return tc_skb_cb(skb)->drop_reason;
}

static inline void tcf_set_drop_reason(const struct sk_buff *skb,
                                       enum skb_drop_reason reason)
{
        tc_skb_cb(skb)->drop_reason = reason;
}

/*
 * Defer freeing of @skb: rather than calling kfree_skb() while the root
 * qdisc lock is held, push the skb onto the front of the *@to_free list so
 * it can be freed later, at the end of __dev_xmit_skb().
 */
static inline void __qdisc_drop(struct sk_buff *skb, struct sk_buff **to_free)
{
        struct sk_buff *pending = *to_free;

        skb->next = pending;
        *to_free = skb;
}

/*
 * Push @skb onto the front of the *@to_free list.
 *
 * If skb->prev is set, the existing *@to_free list is chained behind
 * skb->prev; otherwise it is chained directly behind @skb.
 */
static inline void __qdisc_drop_all(struct sk_buff *skb,
                                    struct sk_buff **to_free)
{
        struct sk_buff *last = skb->prev ? skb->prev : skb;

        last->next = *to_free;
        *to_free = skb;
}

/*
 * Drop the first skb of the list described by @qh.
 *
 * The skb is removed from @sch's backlog statistics and queued on
 * *@to_free.  Returns qdisc_pkt_len() of the dropped skb, or 0 if the list
 * was empty.
 */
static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch,
                                                   struct qdisc_skb_head *qh,
                                                   struct sk_buff **to_free)
{
        struct sk_buff *victim = __qdisc_dequeue_head(qh);
        unsigned int len;

        if (unlikely(!victim))
                return 0;

        len = qdisc_pkt_len(victim);
        qdisc_qstats_backlog_dec(sch, victim);
        __qdisc_drop(victim, to_free);

        return len;
}

/* Return the first skb of @sch's own queue without removing it (may be NULL). */
static inline struct sk_buff *qdisc_peek_head(struct Qdisc *sch)
{
        return sch->q.head;
}

/* generic pseudo peek method for non-work-conserving qdisc */
static inline struct sk_buff *qdisc_peek_dequeued(struct Qdisc *sch)
{
        struct sk_buff *skb = skb_peek(&sch->gso_skb);

        /* we can reuse ->gso_skb because peek isn't called for root qdiscs */
        if (!skb) {
                skb = sch->dequeue(sch);

                if (skb) {
                        __skb_queue_head(&sch->gso_skb, skb);
                        /* it's still part of the queue */
                        qdisc_qstats_backlog_inc(sch, skb);
                        sch->q.qlen++;
                }
        }

        return skb;
}

/*
 * Account the dequeue of @skb from @sch: lower backlog and queue length
 * and update the byte/packet statistics.  The per-cpu counters are used
 * when qdisc_is_percpu_stats() says so, the plain ones otherwise.
 */
static inline void qdisc_update_stats_at_dequeue(struct Qdisc *sch,
                                                 struct sk_buff *skb)
{
        if (!qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_backlog_dec(sch, skb);
                qdisc_bstats_update(sch, skb);
                sch->q.qlen--;
                return;
        }

        qdisc_qstats_cpu_backlog_dec(sch, skb);
        qdisc_bstats_cpu_update(sch, skb);
        qdisc_qstats_cpu_qlen_dec(sch);
}

/*
 * Account the enqueue of a packet of @pkt_len bytes on @sch: raise the
 * queue length by one and the backlog by @pkt_len.  The per-cpu counters
 * are used when qdisc_is_percpu_stats() says so, the plain ones otherwise.
 */
static inline void qdisc_update_stats_at_enqueue(struct Qdisc *sch,
                                                 unsigned int pkt_len)
{
        if (!qdisc_is_percpu_stats(sch)) {
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
                return;
        }

        qdisc_qstats_cpu_qlen_inc(sch);
        this_cpu_add(sch->cpu_qstats->backlog, pkt_len);
}

/*
 * Use instead of qdisc->dequeue() for all qdiscs queried with ->peek().
 *
 * If an skb is parked on sch->gso_skb, it is taken from there and removed
 * from the backlog/qlen accounting (per-cpu or plain, depending on
 * qdisc_is_percpu_stats()).  Otherwise the call falls through to
 * sch->dequeue().
 */
static inline struct sk_buff *qdisc_dequeue_peeked(struct Qdisc *sch)
{
        struct sk_buff *skb;

        if (!skb_peek(&sch->gso_skb))
                return sch->dequeue(sch);

        skb = __skb_dequeue(&sch->gso_skb);
        if (qdisc_is_percpu_stats(sch)) {
                qdisc_qstats_cpu_backlog_dec(sch, skb);
                qdisc_qstats_cpu_qlen_dec(sch);
        } else {
                qdisc_qstats_backlog_dec(sch, skb);
                sch->q.qlen--;
        }

        return skb;
}

/*
 * Release every skb on the list described by @qh and leave it empty.
 *
 * Asserts that RTNL is held.  The byte backlog of the list is not known
 * here, so correcting it is the caller's job.
 */
static inline void __qdisc_reset_queue(struct qdisc_skb_head *qh)
{
        ASSERT_RTNL();

        if (!qh->qlen)
                return;

        rtnl_kfree_skbs(qh->head, qh->tail);

        qh->qlen = 0;
        qh->head = NULL;
        qh->tail = NULL;
}

/*
 * Empty @sch's own queue (sch->q) via __qdisc_reset_queue().  Backlog
 * statistics are not adjusted here.
 */
static inline void qdisc_reset_queue(struct Qdisc *sch)
{
        __qdisc_reset_queue(&sch->q);
}

/*
 * Swap the qdisc stored in *@pold for @new under @sch's tree lock.
 *
 * If a previous qdisc was installed, it is purged with qdisc_purge_queue()
 * before the lock is dropped.  Returns the previous qdisc (possibly NULL);
 * it is not released here.
 */
static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new,
                                          struct Qdisc **pold)
{
        struct Qdisc *prev;

        sch_tree_lock(sch);

        prev = *pold;
        *pold = new;
        if (prev)
                qdisc_purge_queue(prev);

        sch_tree_unlock(sch);

        return prev;
}

/*
 * Hand the single skb @skb to rtnl_kfree_skbs() (as both head and tail of
 * the list to free) and count one drop on @sch.
 */
static inline void rtnl_qdisc_drop(struct sk_buff *skb, struct Qdisc *sch)
{
        rtnl_kfree_skbs(skb, skb);
        qdisc_qstats_drop(sch);
}

/*
 * Queue @skb on *@to_free for deferred freeing and count the drop in the
 * per-cpu statistics of @sch.  Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_cpu(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_cpu_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Queue @skb on *@to_free for deferred freeing and count the drop in the
 * plain (non per-cpu) statistics of @sch.  Always returns NET_XMIT_DROP.
 */
static inline int qdisc_drop(struct sk_buff *skb, struct Qdisc *sch,
                             struct sk_buff **to_free)
{
        __qdisc_drop(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Like qdisc_drop(), but first records @reason in @skb's tc control block
 * via tcf_set_drop_reason().  Returns what qdisc_drop() returns.
 */
static inline int qdisc_drop_reason(struct sk_buff *skb, struct Qdisc *sch,
                                    struct sk_buff **to_free,
                                    enum skb_drop_reason reason)
{
        tcf_set_drop_reason(skb, reason);
        return qdisc_drop(skb, sch, to_free);
}

/*
 * Queue @skb on *@to_free using __qdisc_drop_all() (which honours
 * skb->prev when chaining) and count a single drop on @sch.  Always
 * returns NET_XMIT_DROP.
 */
static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
                                 struct sk_buff **to_free)
{
        __qdisc_drop_all(skb, to_free);
        qdisc_qstats_drop(sch);

        return NET_XMIT_DROP;
}

/*
 * Rate configuration consumed by psched_l2t_ns(), which turns a length
 * into a time as ((adjusted length) * mult) >> shift.
 */
struct psched_ratecfg {
        u64        rate_bytes_ps; /* bytes per second */
        u32        mult;          /* multiplier applied to the length */
        u16        overhead;      /* added to every length before scaling */
        u16        mpu;           /* lower bound for the adjusted length */
        u8        linklayer;     /* compared against TC_LINKLAYER_ATM */
        u8        shift;         /* right shift applied after multiplying */
};

static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
                                unsigned int len)
{
        len += r->overhead;

        if (len < r->mpu)
                len = r->mpu;

        if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
                return ((u64)(DIV_ROUND_UP(len,48)*53) * r->mult) >> r->shift;

        return ((u64)len * r->mult) >> r->shift;
}

/*
 * Fill in @r from @conf and @rate64.  Implemented out of line; the exact
 * derivation of the fields is not visible in this header.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
                               const struct tc_ratespec *conf,
                               u64 rate64);

/*
 * Export rate configuration @r into the legacy struct tc_ratespec @res.
 *
 * @res is zeroed first; overhead, mpu and the masked link layer are then
 * copied over, and the rate is clamped to what fits in 32 bits.
 */
static inline void psched_ratecfg_getrate(struct tc_ratespec *res,
                                          const struct psched_ratecfg *r)
{
        memset(res, 0, sizeof(*res));

        res->linklayer = r->linklayer & TC_LINKLAYER_MASK;
        res->mpu = r->mpu;
        res->overhead = r->overhead;

        /* legacy struct tc_ratespec has a 32bit @rate field
         * Qdisc using 64bit rate should add new attributes
         * in order to maintain compatibility.
         */
        res->rate = min_t(u64, r->rate_bytes_ps, ~0U);
}

/*
 * Packet-rate configuration consumed by psched_pkt2t_ns(), which computes
 * (packet count * mult) >> shift.
 */
struct psched_pktrate {
        u64        rate_pkts_ps; /* packets per second */
        u32        mult;         /* multiplier applied to the packet count */
        u8        shift;        /* right shift applied after multiplying */
};

static inline u64 psched_pkt2t_ns(const struct psched_pktrate *r,
                                  unsigned int pkt_num)
{
        return ((u64)pkt_num * r->mult) >> r->shift;
}

/* Fill in @r from @pktrate64.  Implemented out of line. */
void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64);

/* Mini Qdisc serves the specific needs of the ingress/clsact Qdisc.
 * The fast path only needs to access the filter list and to update stats.
 */
struct mini_Qdisc {
        struct tcf_proto *filter_list;
        struct tcf_block *block;
        /* per-cpu stats updated by mini_qdisc_bstats_cpu_update() */
        struct gnet_stats_basic_sync __percpu *cpu_bstats;
        /* per-cpu stats; ->drops is bumped by mini_qdisc_qstats_cpu_drop() */
        struct gnet_stats_queue        __percpu *cpu_qstats;
        unsigned long rcu_state;
};

/* Account @skb in the current CPU's copy of miniq->cpu_bstats. */
static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq,
                                                const struct sk_buff *skb)
{
        bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb);
}

/* Increment the drop counter in the current CPU's copy of miniq->cpu_qstats. */
static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq)
{
        this_cpu_inc(miniq->cpu_qstats->drops);
}

/*
 * Two mini_Qdisc instances plus the RCU-annotated location that points at
 * a mini_Qdisc.  NOTE(review): presumably mini_qdisc_pair_swap() flips
 * *p_miniq between miniq1 and miniq2 -- confirm against its definition.
 */
struct mini_Qdisc_pair {
        struct mini_Qdisc miniq1;
        struct mini_Qdisc miniq2;
        struct mini_Qdisc __rcu **p_miniq;
};

/* Helpers operating on a mini_Qdisc_pair; implemented out of line. */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
                          struct tcf_proto *tp_head);
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
                          struct mini_Qdisc __rcu **p_miniq);
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
                                struct tcf_block *block);

/* Implemented out of line; takes the qdisc and the new real TX queue count. */
void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx);

/* Implemented out of line; @xmit is the callback used for transmission. */
int sch_frag_xmit_hook(struct sk_buff *skb, int (*xmit)(struct sk_buff *skb));

/*
 * Make sure qdisc is no longer in SCHED state.
 *
 * Polls the __QDISC_STATE_SCHED bit of q->state, sleeping 1 ms between
 * checks, and returns once the bit is observed clear.
 */
static inline void qdisc_synchronize(const struct Qdisc *q)
{
        for (;;) {
                if (!test_bit(__QDISC_STATE_SCHED, &q->state))
                        break;
                msleep(1);
        }
}

#endif
























































































































































































    3 
















    3 





    3 




















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
// SPDX-License-Identifier: GPL-2.0

#include <linux/debugfs.h>

#include "netdevsim.h"

/* Delay, in milliseconds, between two runs of the simulated-traffic work. */
#define NSIM_DEV_HWSTATS_TRAFFIC_MS        100

/*
 * Map an offload xstats @type to the list in @hwstats that tracks netdevs
 * for that type.  Only NETDEV_OFFLOAD_XSTATS_TYPE_L3 is handled; any other
 * type triggers a one-time warning and yields NULL.
 */
static struct list_head *
nsim_dev_hwstats_get_list_head(struct nsim_dev_hwstats *hwstats,
                               enum netdev_offload_xstats_type type)
{
        struct list_head *head = NULL;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                head = &hwstats->l3_list;
                break;
        }

        WARN_ON_ONCE(!head);
        return head;
}

/*
 * Simulate one tick of traffic for every enabled netdev tracked for
 * @type: +1 RX packet / +100 RX bytes and +2 TX packets / +300 TX bytes.
 * Entries that are not enabled are left untouched.
 */
static void nsim_dev_hwstats_traffic_bump(struct nsim_dev_hwstats *hwstats,
                                          enum netdev_offload_xstats_type type)
{
        struct list_head *tracked = nsim_dev_hwstats_get_list_head(hwstats,
                                                                   type);
        struct nsim_dev_hwstats_netdev *entry;

        if (WARN_ON(!tracked))
                return;

        list_for_each_entry(entry, tracked, list) {
                if (!entry->enabled)
                        continue;

                entry->stats.rx_packets += 1;
                entry->stats.tx_packets += 2;
                entry->stats.rx_bytes += 100;
                entry->stats.tx_bytes += 300;
        }
}

/*
 * Delayed-work handler: bump the simulated L3 counters while holding
 * hwsdev_list_lock, then re-arm itself to run again after
 * NSIM_DEV_HWSTATS_TRAFFIC_MS milliseconds.
 */
static void nsim_dev_hwstats_traffic_work(struct work_struct *work)
{
        struct nsim_dev_hwstats *hwstats =
                container_of(work, struct nsim_dev_hwstats, traffic_dw.work);
        unsigned long delay = msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS);

        mutex_lock(&hwstats->hwsdev_list_lock);
        nsim_dev_hwstats_traffic_bump(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
        mutex_unlock(&hwstats->hwsdev_list_lock);

        schedule_delayed_work(&hwstats->traffic_dw, delay);
}

/*
 * Look up the entry on @hwsdev_list whose netdev has interface index
 * @ifindex.  Returns the first match, or NULL if there is none.
 */
static struct nsim_dev_hwstats_netdev *
nsim_dev_hwslist_find_hwsdev(struct list_head *hwsdev_list,
                             int ifindex)
{
        struct nsim_dev_hwstats_netdev *cur;

        list_for_each_entry(cur, hwsdev_list, list) {
                if (cur->netdev->ifindex != ifindex)
                        continue;
                return cur;
        }

        return NULL;
}

/*
 * Mark @hwsdev as enabled.
 *
 * If a failure was armed through fail_enable, the flag is consumed, an
 * extack message is set and -ECANCELED is returned without enabling.
 * Returns 0 on success.
 */
static int nsim_dev_hwsdev_enable(struct nsim_dev_hwstats_netdev *hwsdev,
                                  struct netlink_ext_ack *extack)
{
        if (!hwsdev->fail_enable) {
                hwsdev->enabled = true;
                return 0;
        }

        /* The injected failure is one-shot. */
        hwsdev->fail_enable = false;
        NL_SET_ERR_MSG_MOD(extack, "Stats enablement set to fail");
        return -ECANCELED;
}

/* Mark @hwsdev as disabled and zero its accumulated statistics. */
static void nsim_dev_hwsdev_disable(struct nsim_dev_hwstats_netdev *hwsdev)
{
        hwsdev->enabled = false;
        memset(&hwsdev->stats, 0, sizeof(hwsdev->stats));
}

/*
 * Report the statistics accumulated in @hwsdev through
 * netdev_offload_xstats_report_delta() and then zero them, so each report
 * carries only what was gathered since the previous one.  Always returns 0.
 */
static int
nsim_dev_hwsdev_report_delta(struct nsim_dev_hwstats_netdev *hwsdev,
                             struct netdev_notifier_offload_xstats_info *info)
{
        netdev_offload_xstats_report_delta(info->report_delta, &hwsdev->stats);
        memset(&hwsdev->stats, 0, sizeof(hwsdev->stats));
        return 0;
}

/*
 * Call netdev_offload_xstats_report_used() on info->report_used, but only
 * when @hwsdev is currently enabled.
 */
static void
nsim_dev_hwsdev_report_used(struct nsim_dev_hwstats_netdev *hwsdev,
                            struct netdev_notifier_offload_xstats_info *info)
{
        if (!hwsdev->enabled)
                return;

        netdev_offload_xstats_report_used(info->report_used);
}

/*
 * Handle one of the NETDEV_OFFLOAD_XSTATS_* notifier events for @dev.
 *
 * @ptr is the struct netdev_notifier_offload_xstats_info of the event.
 * Events for an unsupported stats type or for a netdev that is not being
 * tracked are ignored (return 0).  Otherwise the matching per-entry
 * handler runs under hwsdev_list_lock and its error code, if any, is
 * returned.
 */
static int nsim_dev_hwstats_event_off_xstats(struct nsim_dev_hwstats *hwstats,
                                             struct net_device *dev,
                                             unsigned long event, void *ptr)
{
        struct netdev_notifier_offload_xstats_info *info = ptr;
        struct nsim_dev_hwstats_netdev *hwsdev;
        struct list_head *hwsdev_list;
        int err = 0;

        hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, info->type);
        if (!hwsdev_list)
                return 0;

        mutex_lock(&hwstats->hwsdev_list_lock);

        hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, dev->ifindex);
        if (hwsdev) {
                switch (event) {
                case NETDEV_OFFLOAD_XSTATS_ENABLE:
                        err = nsim_dev_hwsdev_enable(hwsdev,
                                                     info->info.extack);
                        break;
                case NETDEV_OFFLOAD_XSTATS_DISABLE:
                        nsim_dev_hwsdev_disable(hwsdev);
                        break;
                case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
                        nsim_dev_hwsdev_report_used(hwsdev, info);
                        break;
                case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
                        err = nsim_dev_hwsdev_report_delta(hwsdev, info);
                        break;
                }
        }

        mutex_unlock(&hwstats->hwsdev_list_lock);
        return err;
}

/*
 * Destroy @hwsdev: drop the netdev reference it holds and free it.  The
 * caller must already have unlinked it from its list.
 */
static void nsim_dev_hwsdev_fini(struct nsim_dev_hwstats_netdev *hwsdev)
{
        dev_put(hwsdev->netdev);
        kfree(hwsdev);
}

/*
 * Stop tracking @dev for stats @type: if an entry for it exists on the
 * corresponding list, unlink and destroy it.  No locking is done here;
 * nsim_dev_hwstats_event_unregister() calls this with hwsdev_list_lock
 * held.
 */
static void
__nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats,
                                    struct net_device *dev,
                                    enum netdev_offload_xstats_type type)
{
        struct list_head *tracked = nsim_dev_hwstats_get_list_head(hwstats,
                                                                   type);
        struct nsim_dev_hwstats_netdev *entry;

        if (WARN_ON(!tracked))
                return;

        entry = nsim_dev_hwslist_find_hwsdev(tracked, dev->ifindex);
        if (entry) {
                list_del(&entry->list);
                nsim_dev_hwsdev_fini(entry);
        }
}

/*
 * NETDEV_UNREGISTER handling: drop any L3 tracking entry for @dev while
 * holding hwsdev_list_lock.
 */
static void nsim_dev_hwstats_event_unregister(struct nsim_dev_hwstats *hwstats,
                                              struct net_device *dev)
{
        mutex_lock(&hwstats->hwsdev_list_lock);
        __nsim_dev_hwstats_event_unregister(hwstats, dev,
                                            NETDEV_OFFLOAD_XSTATS_TYPE_L3);
        mutex_unlock(&hwstats->hwsdev_list_lock);
}

/*
 * Dispatch a netdevice notifier @event for @dev.
 *
 * Offload xstats events are forwarded to
 * nsim_dev_hwstats_event_off_xstats() and its result is returned;
 * NETDEV_UNREGISTER drops the tracking entry.  Every other event is
 * ignored.  Returns 0 unless the xstats handler reported an error.
 */
static int nsim_dev_hwstats_event(struct nsim_dev_hwstats *hwstats,
                                  struct net_device *dev,
                                  unsigned long event, void *ptr)
{
        int err = 0;

        switch (event) {
        case NETDEV_UNREGISTER:
                nsim_dev_hwstats_event_unregister(hwstats, dev);
                break;
        case NETDEV_OFFLOAD_XSTATS_ENABLE:
        case NETDEV_OFFLOAD_XSTATS_DISABLE:
        case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
        case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
                err = nsim_dev_hwstats_event_off_xstats(hwstats, dev,
                                                        event, ptr);
                break;
        }

        return err;
}

/*
 * Netdevice notifier callback.  Recovers the owning nsim_dev_hwstats from
 * @nb, handles the event and translates the outcome into a notifier
 * return value: NOTIFY_OK on success, notifier_from_errno() on error.
 */
static int nsim_dev_netdevice_event(struct notifier_block *nb,
                                    unsigned long event, void *ptr)
{
        struct nsim_dev_hwstats *hwstats =
                container_of(nb, struct nsim_dev_hwstats, netdevice_nb);
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        int err;

        err = nsim_dev_hwstats_event(hwstats, dev, event, ptr);

        return err ? notifier_from_errno(err) : NOTIFY_OK;
}

/*
 * Start tracking the netdev with index @ifindex on @hwsdev_list.
 *
 * Takes RTNL and then hwsdev_list_lock.  If the netdev is already tracked
 * this is a no-op that returns 0.  Otherwise the netdev is looked up in
 * the network namespace of the owning nsim_dev, a new entry is allocated
 * and appended to the list.  If offload xstats of @type are already
 * enabled on the netdev, the entry is marked enabled and an xstats
 * notification is sent.
 *
 * Returns 0 on success, -ENODEV if no netdev has that index, or -ENOMEM
 * if the entry could not be allocated.
 */
static int
nsim_dev_hwstats_enable_ifindex(struct nsim_dev_hwstats *hwstats,
                                int ifindex,
                                enum netdev_offload_xstats_type type,
                                struct list_head *hwsdev_list)
{
        struct nsim_dev_hwstats_netdev *hwsdev;
        struct nsim_dev *nsim_dev;
        struct net_device *netdev;
        bool notify = false;
        struct net *net;
        int err = 0;

        nsim_dev = container_of(hwstats, struct nsim_dev, hwstats);
        net = nsim_dev_net(nsim_dev);

        rtnl_lock();
        mutex_lock(&hwstats->hwsdev_list_lock);
        hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex);
        if (hwsdev)
                goto out_unlock_list;   /* already tracked; err is still 0 */

        netdev = dev_get_by_index(net, ifindex);
        if (!netdev) {
                err = -ENODEV;
                goto out_unlock_list;
        }

        hwsdev = kzalloc(sizeof(*hwsdev), GFP_KERNEL);
        if (!hwsdev) {
                err = -ENOMEM;
                goto out_put_netdev;
        }

        /* The entry keeps the netdev; nsim_dev_hwsdev_fini() does the dev_put(). */
        hwsdev->netdev = netdev;
        list_add_tail(&hwsdev->list, hwsdev_list);
        mutex_unlock(&hwstats->hwsdev_list_lock);

        /* From here on only RTNL is held. */
        if (netdev_offload_xstats_enabled(netdev, type)) {
                nsim_dev_hwsdev_enable(hwsdev, NULL);
                notify = true;
        }

        if (notify)
                rtnl_offload_xstats_notify(netdev);
        rtnl_unlock();
        return err;

out_put_netdev:
        dev_put(netdev);
out_unlock_list:
        mutex_unlock(&hwstats->hwsdev_list_lock);
        rtnl_unlock();
        return err;
}

/*
 * Stop tracking the netdev with index @ifindex on @hwsdev_list.
 *
 * Takes RTNL and then hwsdev_list_lock.  The entry is unlinked under the
 * list lock; the remaining work runs with only RTNL held.  If offload
 * xstats of @type are enabled on the netdev, the statistics still held in
 * the entry are pushed with netdev_offload_xstats_push_delta() and an
 * xstats notification is sent before the entry is destroyed.
 *
 * Returns 0 on success or -ENOENT if the netdev was not tracked.
 */
static int
nsim_dev_hwstats_disable_ifindex(struct nsim_dev_hwstats *hwstats,
                                 int ifindex,
                                 enum netdev_offload_xstats_type type,
                                 struct list_head *hwsdev_list)
{
        struct nsim_dev_hwstats_netdev *hwsdev;
        int err = 0;

        rtnl_lock();
        mutex_lock(&hwstats->hwsdev_list_lock);
        hwsdev = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex);
        if (hwsdev)
                list_del(&hwsdev->list);
        mutex_unlock(&hwstats->hwsdev_list_lock);

        if (!hwsdev) {
                err = -ENOENT;
                goto unlock_out;
        }

        if (netdev_offload_xstats_enabled(hwsdev->netdev, type)) {
                netdev_offload_xstats_push_delta(hwsdev->netdev, type,
                                                 &hwsdev->stats);
                rtnl_offload_xstats_notify(hwsdev->netdev);
        }
        nsim_dev_hwsdev_fini(hwsdev);

unlock_out:
        rtnl_unlock();
        return err;
}

/*
 * Arm a one-shot failure for the next enable request of the tracked
 * netdev with index @ifindex.  @type is not used here; the list to search
 * is passed in as @hwsdev_list.
 *
 * Returns 0 on success or -ENOENT if the netdev is not tracked.
 */
static int
nsim_dev_hwstats_fail_ifindex(struct nsim_dev_hwstats *hwstats,
                              int ifindex,
                              enum netdev_offload_xstats_type type,
                              struct list_head *hwsdev_list)
{
        struct nsim_dev_hwstats_netdev *entry;
        int err = -ENOENT;

        mutex_lock(&hwstats->hwsdev_list_lock);

        entry = nsim_dev_hwslist_find_hwsdev(hwsdev_list, ifindex);
        if (entry) {
                entry->fail_enable = true;
                err = 0;
        }

        mutex_unlock(&hwstats->hwsdev_list_lock);
        return err;
}

/* Action a hwstats debugfs file performs on the ifindex written to it. */
enum nsim_dev_hwstats_do {
        NSIM_DEV_HWSTATS_DO_DISABLE,    /* stop tracking the netdev */
        NSIM_DEV_HWSTATS_DO_ENABLE,     /* start tracking the netdev */
        NSIM_DEV_HWSTATS_DO_FAIL,       /* make the next enable fail */
};

/*
 * Per-file descriptor attached to each hwstats debugfs file as auxiliary
 * data; the write handler uses it to pick the action and the stats type.
 */
struct nsim_dev_hwstats_fops {
        enum nsim_dev_hwstats_do action;
        enum netdev_offload_xstats_type type;
};

/*
 * Write handler shared by the hwstats debugfs files.
 *
 * Parses an interface index from the user buffer and performs the action
 * described by the file's auxiliary nsim_dev_hwstats_fops on it.  Returns
 * @count on success, or a negative errno from parsing, from the list
 * lookup (-EINVAL) or from the action itself.
 */
static ssize_t
nsim_dev_hwstats_do_write(struct file *file,
                          const char __user *data,
                          size_t count, loff_t *ppos)
{
        const struct nsim_dev_hwstats_fops *hwsfops = debugfs_get_aux(file);
        struct nsim_dev_hwstats *hwstats = file->private_data;
        enum netdev_offload_xstats_type type;
        struct list_head *hwsdev_list;
        int ifindex;
        int err;

        err = kstrtoint_from_user(data, count, 0, &ifindex);
        if (err)
                return err;

        type = hwsfops->type;
        hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type);
        if (WARN_ON(!hwsdev_list))
                return -EINVAL;

        switch (hwsfops->action) {
        case NSIM_DEV_HWSTATS_DO_ENABLE:
                err = nsim_dev_hwstats_enable_ifindex(hwstats, ifindex, type,
                                                      hwsdev_list);
                break;
        case NSIM_DEV_HWSTATS_DO_DISABLE:
                err = nsim_dev_hwstats_disable_ifindex(hwstats, ifindex, type,
                                                       hwsdev_list);
                break;
        case NSIM_DEV_HWSTATS_DO_FAIL:
                err = nsim_dev_hwstats_fail_ifindex(hwstats, ifindex, type,
                                                    hwsdev_list);
                break;
        }

        if (err)
                return err;

        return count;
}

/* File operations shared by all hwstats debugfs files (write-only use). */
static struct debugfs_short_fops debugfs_ops = {
        .write = nsim_dev_hwstats_do_write,
        .llseek = generic_file_llseek,
};

/*
 * Initializer for a struct nsim_dev_hwstats_fops with the given action
 * and stats type.  Only used for the descriptors below and undefined
 * right after them.
 */
#define NSIM_DEV_HWSTATS_FOPS(ACTION, TYPE)                        \
        {                                                        \
                .action = ACTION,                                \
                .type = TYPE,                                        \
        }

/* Descriptor for the L3 file that stops tracking a netdev. */
static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_disable_fops =
        NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_DISABLE,
                              NETDEV_OFFLOAD_XSTATS_TYPE_L3);

/* Descriptor for the L3 file that starts tracking a netdev. */
static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_enable_fops =
        NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_ENABLE,
                              NETDEV_OFFLOAD_XSTATS_TYPE_L3);

/* Descriptor for the L3 file that arms a failure of the next enable. */
static const struct nsim_dev_hwstats_fops nsim_dev_hwstats_l3_fail_fops =
        NSIM_DEV_HWSTATS_FOPS(NSIM_DEV_HWSTATS_DO_FAIL,
                              NETDEV_OFFLOAD_XSTATS_TYPE_L3);

#undef NSIM_DEV_HWSTATS_FOPS

/*
 * Set up hwstats simulation for @nsim_dev.
 *
 * Initializes the list lock and the L3 list, registers a netdevice
 * notifier in the device's network namespace, creates the "hwstats/l3"
 * debugfs directory with its three write-only control files, and starts
 * the periodic simulated-traffic work.
 *
 * Returns 0 on success or a negative errno; on failure everything set up
 * so far is undone in reverse order.
 */
int nsim_dev_hwstats_init(struct nsim_dev *nsim_dev)
{
        struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats;
        struct net *net = nsim_dev_net(nsim_dev);
        int err;

        mutex_init(&hwstats->hwsdev_list_lock);
        INIT_LIST_HEAD(&hwstats->l3_list);

        hwstats->netdevice_nb.notifier_call = nsim_dev_netdevice_event;
        err = register_netdevice_notifier_net(net, &hwstats->netdevice_nb);
        if (err)
                goto err_mutex_destroy;

        hwstats->ddir = debugfs_create_dir("hwstats", nsim_dev->ddir);
        if (IS_ERR(hwstats->ddir)) {
                err = PTR_ERR(hwstats->ddir);
                goto err_unregister_notifier;
        }

        hwstats->l3_ddir = debugfs_create_dir("l3", hwstats->ddir);
        if (IS_ERR(hwstats->l3_ddir)) {
                err = PTR_ERR(hwstats->l3_ddir);
                goto err_remove_hwstats_recursive;
        }

        /* Return values of the file creations are not checked. */
        debugfs_create_file_aux("enable_ifindex", 0200, hwstats->l3_ddir, hwstats,
                            &nsim_dev_hwstats_l3_enable_fops, &debugfs_ops);
        debugfs_create_file_aux("disable_ifindex", 0200, hwstats->l3_ddir, hwstats,
                            &nsim_dev_hwstats_l3_disable_fops, &debugfs_ops);
        debugfs_create_file_aux("fail_next_enable", 0200, hwstats->l3_ddir, hwstats,
                            &nsim_dev_hwstats_l3_fail_fops, &debugfs_ops);

        INIT_DELAYED_WORK(&hwstats->traffic_dw,
                          &nsim_dev_hwstats_traffic_work);
        schedule_delayed_work(&hwstats->traffic_dw,
                              msecs_to_jiffies(NSIM_DEV_HWSTATS_TRAFFIC_MS));
        return 0;

err_remove_hwstats_recursive:
        debugfs_remove_recursive(hwstats->ddir);
err_unregister_notifier:
        unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb);
err_mutex_destroy:
        mutex_destroy(&hwstats->hwsdev_list_lock);
        return err;
}

/*
 * Unlink and destroy every entry tracked for stats @type, holding
 * hwsdev_list_lock for the whole walk.
 */
static void nsim_dev_hwsdev_list_wipe(struct nsim_dev_hwstats *hwstats,
                                      enum netdev_offload_xstats_type type)
{
        struct nsim_dev_hwstats_netdev *hwsdev, *tmp;
        struct list_head *hwsdev_list;

        hwsdev_list = nsim_dev_hwstats_get_list_head(hwstats, type);
        if (WARN_ON(!hwsdev_list))
                return;

        mutex_lock(&hwstats->hwsdev_list_lock);
        /* _safe variant: entries are freed while iterating */
        list_for_each_entry_safe(hwsdev, tmp, hwsdev_list, list) {
                list_del(&hwsdev->list);
                nsim_dev_hwsdev_fini(hwsdev);
        }
        mutex_unlock(&hwstats->hwsdev_list_lock);
}

/*
 * Tear down what nsim_dev_hwstats_init() set up: stop the traffic work,
 * remove the debugfs tree, unregister the netdevice notifier, destroy all
 * remaining L3 tracking entries and finally destroy the list lock.
 */
void nsim_dev_hwstats_exit(struct nsim_dev *nsim_dev)
{
        struct nsim_dev_hwstats *hwstats = &nsim_dev->hwstats;
        struct net *net = nsim_dev_net(nsim_dev);

        /* The work re-arms itself, so it is cancelled synchronously first. */
        cancel_delayed_work_sync(&hwstats->traffic_dw);
        debugfs_remove_recursive(hwstats->ddir);
        unregister_netdevice_notifier_net(net, &hwstats->netdevice_nb);
        nsim_dev_hwsdev_list_wipe(hwstats, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
        mutex_destroy(&hwstats->hwsdev_list_lock);
}




























































































































































   24 





   24 

   24 
   24 







   24 


   17 









    7 




































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Ethernet-type device handling.
 *
 * Version:        @(#)eth.c        1.0.7        05/25/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *                Florian  La Roche, <rzsfl@rz.uni-sb.de>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *                Mr Linux        : Arp problems
 *                Alan Cox        : Generic queue tidyup (very tiny here)
 *                Alan Cox        : eth_header ntohs should be htons
 *                Alan Cox        : eth_rebuild_header missing an htons and
 *                                  minor other things.
 *                Tegge                : Arp bug fixes.
 *                Florian                : Removed many unnecessary functions, code cleanup
 *                                  and changes for new arp and skbuff.
 *                Alan Cox        : Redid header building to reflect new format.
 *                Alan Cox        : ARP only when compiled with CONFIG_INET
 *                Greg Page        : 802.2 and SNAP stuff.
 *                Alan Cox        : MAC layer pointers/new format.
 *                Paul Gortmaker        : eth_copy_and_sum shouldn't csum padding.
 *                Alan Cox        : Protect against forwarding explosions with
 *                                  older network drivers and IFF_ALLMULTI.
 *        Christer Weinigel        : Better rebuild header message.
 *             Andrew Morton    : 26Feb01: kill ether_setup() - use netdev_boot_setup().
 */
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/nvmem-consumer.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/of_net.h>
#include <linux/pci.h>
#include <linux/property.h>
#include <net/dst.h>
#include <net/arp.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/ip.h>
#include <net/dsa.h>
#include <net/flow_dissector.h>
#include <net/gro.h>
#include <linux/uaccess.h>
#include <net/pkt_sched.h>

/**
 * eth_header - build an Ethernet header in front of the packet data
 * @skb:        buffer the header is pushed onto
 * @dev:        device supplying the default source address and the flags
 * @type:        Ethernet type field, in host byte order
 * @daddr: destination address, or NULL if none is supplied
 * @saddr: source address, or NULL to use the device address
 * @len:   packet length (<= skb->len)
 *
 * Pushes ETH_HLEN bytes onto @skb and fills them in. For ETH_P_802_3 and
 * ETH_P_802_2 the protocol field carries @len instead of @type.
 *
 * Return: ETH_HLEN when the destination was written (copied from @daddr, or
 * zeroed for IFF_LOOPBACK/IFF_NOARP devices); -ETH_HLEN when @daddr is NULL
 * and the destination field was left unfilled.
 */
int eth_header(struct sk_buff *skb, struct net_device *dev,
               unsigned short type,
               const void *daddr, const void *saddr, unsigned int len)
{
        struct ethhdr *hdr = skb_push(skb, ETH_HLEN);
        const void *src = saddr ? saddr : dev->dev_addr;

        /* Length-style frames store the length where others store the type. */
        if (type == ETH_P_802_3 || type == ETH_P_802_2)
                hdr->h_proto = htons(len);
        else
                hdr->h_proto = htons(type);

        memcpy(hdr->h_source, src, ETH_ALEN);

        if (daddr) {
                memcpy(hdr->h_dest, daddr, ETH_ALEN);
                return ETH_HLEN;
        }

        /*
         * No destination given. The loopback device should never get here,
         * but it and NOARP devices are given an all-zero destination;
         * every other device reports the header as incomplete.
         */
        if (!(dev->flags & (IFF_LOOPBACK | IFF_NOARP)))
                return -ETH_HLEN;

        eth_zero_addr(hdr->h_dest);
        return ETH_HLEN;
}
EXPORT_SYMBOL(eth_header);

/**
 * eth_get_headlen - best-effort length of all headers in an Ethernet frame
 * @dev: pointer to network device
 * @data: pointer to start of frame (linear buffer)
 * @len: total length of frame
 *
 * Return: @len itself when the frame is shorter than an Ethernet header;
 * the larger of the dissected transport offset and the Ethernet header size
 * when the flow dissector fails; otherwise the offset reported by
 * __skb_get_poff(), capped at @len.
 */
u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)
{
        const struct ethhdr *hdr = (const struct ethhdr *)data;
        const unsigned int dissect_flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
        struct flow_keys_basic fkeys;

        /* Should never happen, but do not read past a truncated frame. */
        if (unlikely(len < sizeof(*hdr)))
                return len;

        /* Walk the remaining L2/L3 headers; on success also account for L4. */
        if (skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &fkeys, data,
                                             hdr->h_proto, sizeof(*hdr),
                                             len, dissect_flags))
                return min_t(u32, __skb_get_poff(NULL, data, &fkeys, len), len);

        /* Dissection failed: never report less than the Ethernet header. */
        return max_t(u32, fkeys.control.thoff, sizeof(*hdr));
}
EXPORT_SYMBOL(eth_get_headlen);

/**
 * eth_type_trans - determine the packet's protocol ID.
 * @skb: received socket data
 * @dev: receiving network device
 *
 * Records @dev in @skb, marks the MAC header, pulls the Ethernet header and
 * classifies the packet type. The rule applied afterwards is that 802.3 is
 * assumed if the type field is short enough to be a length, which is normal
 * practice and works for any 'now in use' protocol.
 *
 * Return: the protocol ID in network byte order.
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
        unsigned short sap_copy;
        const unsigned short *sap;
        const struct ethhdr *hdr;

        skb->dev = dev;
        skb_reset_mac_header(skb);

        hdr = eth_skb_pull_mac(skb);
        eth_skb_pkt_type(skb, dev);

        /*
         * Some DSA tagging variants carry no ethertype field at all. If
         * the receiving interface uses DSA, set the protocol without
         * looking at the packet.
         */
        if (unlikely(netdev_uses_dsa(dev)))
                return htons(ETH_P_XDSA);

        /* Common case: the field is accepted as-is and returned unchanged. */
        if (likely(eth_proto_is_802_3(hdr->h_proto)))
                return hdr->h_proto;

        /*
         * Magic hack to spot IPX packets. Older Novell breaks the protocol
         * design and runs IPX over 802.3 without an 802.2 LLC layer. FFFF
         * is not a used 802.2 SSAP/DSAP, so it identifies those frames.
         * This won't work for fault tolerant netware but does for the rest.
         * Anything else (including an unreadable payload) is real 802.2 LLC.
         */
        sap = skb_header_pointer(skb, 0, sizeof(*sap), &sap_copy);
        if (!sap || *sap != 0xFFFF)
                return htons(ETH_P_802_2);

        return htons(ETH_P_802_3);
}
EXPORT_SYMBOL(eth_type_trans);

/**
 * eth_header_parse - extract hardware address from packet
 * @skb: packet to extract header from
 * @haddr: destination buffer
 */
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct ethhdr *eth = eth_hdr(skb);
        memcpy(haddr, eth->h_source, ETH_ALEN);
        return ETH_ALEN;
}
EXPORT_SYMBOL(eth_header_parse);

/**
 * eth_header_cache - fill cache entry from neighbour
 * @neigh: source neighbour
 * @hh: destination cache entry
 * @type: Ethernet type field, in network byte order
 *
 * Create an Ethernet header template from the neighbour: the destination is
 * the neighbour's hardware address and the source is the address of the
 * neighbour's device.
 *
 * Return: 0 on success, -1 when @type is ETH_P_802_3 (no template is built).
 */
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
{
        const struct net_device *dev = neigh->dev;
        struct ethhdr *tmpl;

        if (type == htons(ETH_P_802_3))
                return -1;

        /* The template sits at HH_DATA_OFF() inside the cache buffer. */
        tmpl = (struct ethhdr *)((u8 *)hh->hh_data +
                                 HH_DATA_OFF(sizeof(*tmpl)));

        tmpl->h_proto = type;
        memcpy(tmpl->h_dest, neigh->ha, ETH_ALEN);
        memcpy(tmpl->h_source, dev->dev_addr, ETH_ALEN);

        /* Pairs with READ_ONCE() in neigh_resolve_output(),
         * neigh_hh_output() and neigh_update_hhs().
         */
        smp_store_release(&hh->hh_len, ETH_HLEN);

        return 0;
}
EXPORT_SYMBOL(eth_header_cache);

/**
 * eth_header_cache_update - update cache entry
 * @hh: destination cache entry
 * @dev: network device
 * @haddr: new hardware address
 *
 * Called by Address Resolution module to notify changes in address.
 */
void eth_header_cache_update(struct hh_cache *hh,
                             const struct net_device *dev,
                             const unsigned char *haddr)
{
        memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)),
               haddr, ETH_ALEN);
}
EXPORT_SYMBOL(eth_header_cache_update);

/**
 * eth_header_parse_protocol - extract protocol from L2 header
 * @skb: packet to extract protocol from
 */
__be16 eth_header_parse_protocol(const struct sk_buff *skb)
{
        const struct ethhdr *eth = eth_hdr(skb);

        return eth->h_proto;
}
EXPORT_SYMBOL(eth_header_parse_protocol);

/**
 * eth_prepare_mac_addr_change - prepare for mac change
 * @dev: network device
 * @p: socket address (a struct sockaddr carrying the new MAC in sa_data)
 *
 * Return: 0 if the change may proceed, -EBUSY if the device is running and
 * does not have IFF_LIVE_ADDR_CHANGE set, -EADDRNOTAVAIL if the requested
 * address fails is_valid_ether_addr().
 */
int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
{
        struct sockaddr *sa = p;

        /* A running device must explicitly allow live address changes. */
        if (netif_running(dev) && !(dev->priv_flags & IFF_LIVE_ADDR_CHANGE))
                return -EBUSY;

        if (!is_valid_ether_addr(sa->sa_data))
                return -EADDRNOTAVAIL;

        return 0;
}
EXPORT_SYMBOL(eth_prepare_mac_addr_change);

/**
 * eth_commit_mac_addr_change - commit mac change
 * @dev: network device
 * @p: socket address (a struct sockaddr carrying the new MAC in sa_data)
 *
 * Installs the address with eth_hw_addr_set(). No validation is done here.
 */
void eth_commit_mac_addr_change(struct net_device *dev, void *p)
{
        struct sockaddr *sa = p;

        eth_hw_addr_set(dev, sa->sa_data);
}
EXPORT_SYMBOL(eth_commit_mac_addr_change);

/**
 * eth_mac_addr - set new Ethernet hardware address
 * @dev: network device
 * @p: socket address
 *
 * Change hardware address of device: validate the request with
 * eth_prepare_mac_addr_change(), then apply it with
 * eth_commit_mac_addr_change().
 *
 * This doesn't change hardware matching, so needs to be overridden
 * for most real devices.
 *
 * Return: 0 on success, or the negative error from the prepare step.
 */
int eth_mac_addr(struct net_device *dev, void *p)
{
        int err = eth_prepare_mac_addr_change(dev, p);

        if (err < 0)
                return err;

        eth_commit_mac_addr_change(dev, p);

        return 0;
}
EXPORT_SYMBOL(eth_mac_addr);

/**
 * eth_validate_addr - check the device's current hardware address
 * @dev: network device
 *
 * Return: 0 if dev->dev_addr passes is_valid_ether_addr(),
 * -EADDRNOTAVAIL otherwise.
 */
int eth_validate_addr(struct net_device *dev)
{
        return is_valid_ether_addr(dev->dev_addr) ? 0 : -EADDRNOTAVAIL;
}
EXPORT_SYMBOL(eth_validate_addr);

/*
 * Header operations for Ethernet-type devices, wired to the eth_header*()
 * helpers defined in this file. ether_setup() installs this table as
 * dev->header_ops.
 */
const struct header_ops eth_header_ops ____cacheline_aligned = {
        .create                = eth_header,
        .parse                = eth_header_parse,
        .cache                = eth_header_cache,
        .cache_update        = eth_header_cache_update,
        .parse_protocol        = eth_header_parse_protocol,
};

/**
 * ether_setup - setup Ethernet network device
 * @dev: network device
 *
 * Fill in the fields of the device structure with Ethernet-generic values:
 * header handling, address and header sizes, MTU limits, queue length,
 * interface flags and the broadcast address.
 */
void ether_setup(struct net_device *dev)
{
        /* Link type and header handling. */
        dev->type = ARPHRD_ETHER;
        dev->header_ops = &eth_header_ops;
        dev->hard_header_len = ETH_HLEN;
        dev->min_header_len = ETH_HLEN;
        dev->addr_len = ETH_ALEN;

        /* MTU defaults and limits. */
        dev->mtu = ETH_DATA_LEN;
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = ETH_DATA_LEN;

        dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;

        /* flags is overwritten; priv_flags only gains one bit. */
        dev->flags = IFF_BROADCAST | IFF_MULTICAST;
        dev->priv_flags |= IFF_TX_SKB_SHARING;

        eth_broadcast_addr(dev->broadcast);
}
EXPORT_SYMBOL(ether_setup);

/**
 * alloc_etherdev_mqs - Allocates and sets up an Ethernet device
 * @sizeof_priv: Size of additional driver-private structure to be allocated
 *        for this Ethernet device
 * @txqs: The number of TX queues this device has.
 * @rxqs: The number of RX queues this device has.
 *
 * Constructs a new net device named from the "eth%d" template, with a
 * private data area of size (sizeof_priv), and initialises it with
 * ether_setup(). Basically does everything except registering the device.
 * A 32-byte (not bit) alignment is enforced for the private data area.
 *
 * Return: the result of alloc_netdev_mqs().
 */
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                      unsigned int rxqs)
{
        struct net_device *ndev;

        ndev = alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_ENUM,
                                ether_setup, txqs, rxqs);

        return ndev;
}
EXPORT_SYMBOL(alloc_etherdev_mqs);

/**
 * sysfs_format_mac - format a hardware address for a sysfs attribute
 * @buf: sysfs output buffer
 * @addr: address bytes to print
 * @len: number of bytes in @addr
 *
 * Emits @addr with the "%*phC" format (hex bytes separated by colons, per
 * the kernel printk-formats documentation) followed by a newline.
 *
 * Return: the value returned by sysfs_emit().
 */
ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
{
        int emitted = sysfs_emit(buf, "%*phC\n", len, addr);

        return emitted;
}
EXPORT_SYMBOL(sysfs_format_mac);

/**
 * eth_gro_receive - GRO receive handler for an Ethernet header
 * @head: list of packets currently held for aggregation
 * @skb: newly received packet
 *
 * Reads the Ethernet header at the current GRO offset of @skb, clears
 * same_flow on every held packet whose Ethernet header does not match, and
 * then hands @skb to the gro_receive callback registered for the frame's
 * h_proto.
 *
 * Return: the value returned by the inner gro_receive callback, or NULL
 * when the header could not be obtained or no offload is registered for
 * the protocol.
 */
struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
{
        const struct packet_offload *ptype;
        unsigned int hlen, off_eth;
        struct sk_buff *pp = NULL;
        struct ethhdr *eh, *eh2;
        struct sk_buff *p;
        __be16 type;
        /* Request a flush by default; cleared once the header is available. */
        int flush = 1;

        /* The Ethernet header starts at the current GRO offset. */
        off_eth = skb_gro_offset(skb);
        hlen = off_eth + sizeof(*eh);
        eh = skb_gro_header(skb, hlen, off_eth);
        if (unlikely(!eh))
                goto out;

        flush = 0;

        /*
         * Drop the same_flow mark from held packets whose Ethernet header
         * differs; a non-zero compare_ether_header() is treated as a mismatch.
         */
        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                eh2 = (struct ethhdr *)(p->data + off_eth);
                if (compare_ether_header(eh, eh2)) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }
        }

        type = eh->h_proto;

        /* Without an offload for the encapsulated protocol, just flush. */
        ptype = gro_find_receive_by_type(type);
        if (ptype == NULL) {
                flush = 1;
                goto out;
        }

        /* Step over the Ethernet header before calling the inner handler. */
        skb_gro_pull(skb, sizeof(*eh));
        skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));

        pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
                                            ipv6_gro_receive, inet_gro_receive,
                                            head, skb);

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}
EXPORT_SYMBOL(eth_gro_receive);

/**
 * eth_gro_complete - GRO completion handler for an Ethernet header
 * @skb: aggregated packet
 * @nhoff: offset of the Ethernet header from skb->data
 *
 * For encapsulated packets the inner MAC header offset is set to @nhoff.
 * Completion is then delegated to the gro_complete callback registered for
 * the frame's h_proto, starting just past the Ethernet header.
 *
 * Return: the callback's result, or -ENOSYS when no offload is registered
 * for the protocol.
 */
int eth_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct ethhdr *hdr = (struct ethhdr *)(skb->data + nhoff);
        __be16 proto = hdr->h_proto;
        struct packet_offload *offload;

        if (skb->encapsulation)
                skb_set_inner_mac_header(skb, nhoff);

        offload = gro_find_complete_by_type(proto);
        if (!offload)
                return -ENOSYS;

        return INDIRECT_CALL_INET(offload->callbacks.gro_complete,
                                  ipv6_gro_complete, inet_gro_complete,
                                  skb, nhoff + sizeof(*hdr));
}
EXPORT_SYMBOL(eth_gro_complete);

/*
 * Offload descriptor that routes frames of type ETH_P_TEB to the Ethernet
 * GRO handlers above. Registered by eth_offload_init().
 */
static struct packet_offload eth_packet_offload __read_mostly = {
        .type = cpu_to_be16(ETH_P_TEB),
        .priority = 10,
        .callbacks = {
                .gro_receive = eth_gro_receive,
                .gro_complete = eth_gro_complete,
        },
};

/*
 * Register eth_packet_offload with dev_add_offload(). Always returns 0.
 */
static int __init eth_offload_init(void)
{
        dev_add_offload(&eth_packet_offload);

        return 0;
}

/* Run the registration at the fs_initcall level. */
fs_initcall(eth_offload_init);

/*
 * Default implementation: reports that there is no platform-provided MAC
 * address by returning NULL. Marked __weak so that a non-weak definition
 * elsewhere takes precedence over this one.
 */
unsigned char * __weak arch_get_platform_mac_address(void)
{
        return NULL;
}

/**
 * eth_platform_get_mac_address - look up a platform-provided MAC address
 * @dev:        Pointer to the device
 * @mac_addr:        Buffer the address is written to on success
 *
 * Tries of_get_mac_address() on the device's of_node first and falls back
 * to arch_get_platform_mac_address().
 *
 * Return: 0 on success, -ENODEV when neither source yields an address.
 */
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
{
        unsigned char *arch_addr;

        /* A zero return means @mac_addr has already been filled in. */
        if (!of_get_mac_address(dev->of_node, mac_addr))
                return 0;

        arch_addr = arch_get_platform_mac_address();
        if (!arch_addr)
                return -ENODEV;

        ether_addr_copy(mac_addr, arch_addr);

        return 0;
}
EXPORT_SYMBOL(eth_platform_get_mac_address);

/**
 * platform_get_ethdev_address - Set netdev's MAC address from a given device
 * @dev:        Pointer to the device
 * @netdev:        Pointer to netdev to write the address to
 *
 * Wrapper around eth_platform_get_mac_address() which writes the address
 * directly to netdev->dev_addr.
 */
int platform_get_ethdev_address(struct device *dev, struct net_device *netdev)
{
        u8 addr[ETH_ALEN] __aligned(2);
        int ret;

        ret = eth_platform_get_mac_address(dev, addr);
        if (!ret)
                eth_hw_addr_set(netdev, addr);
        return ret;
}
EXPORT_SYMBOL(platform_get_ethdev_address);

/**
 * nvmem_get_mac_address - read the MAC address from an nvmem cell
 * @dev:        Device with which the "mac-address" cell is associated.
 * @addrbuf:        Buffer to which the MAC address will be copied on success.
 *
 * Reads the nvmem cell named "mac-address" and accepts its contents only if
 * they are exactly ETH_ALEN bytes long and pass is_valid_ether_addr().
 *
 * Return: 0 on success; the error from nvmem_cell_get() or
 * nvmem_cell_read() if either fails; -EINVAL if the cell contents are
 * rejected.
 */
int nvmem_get_mac_address(struct device *dev, void *addrbuf)
{
        struct nvmem_cell *cell = nvmem_cell_get(dev, "mac-address");
        const void *cell_data;
        size_t cell_len;
        int err = 0;

        if (IS_ERR(cell))
                return PTR_ERR(cell);

        /* The cell handle is released as soon as it has been read. */
        cell_data = nvmem_cell_read(cell, &cell_len);
        nvmem_cell_put(cell);

        if (IS_ERR(cell_data))
                return PTR_ERR(cell_data);

        if (cell_len == ETH_ALEN && is_valid_ether_addr(cell_data))
                ether_addr_copy(addrbuf, cell_data);
        else
                err = -EINVAL;

        /* The read buffer is freed here on both outcomes. */
        kfree(cell_data);

        return err;
}

/*
 * Read ETH_ALEN bytes from the firmware node property @name into @addr.
 * Returns 0 if the property was read and passes is_valid_ether_addr(),
 * the read error if the property could not be read, or -EINVAL if the
 * address is not valid.
 */
static int fwnode_get_mac_addr(struct fwnode_handle *fwnode,
                               const char *name, char *addr)
{
        int err = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN);

        if (err)
                return err;

        return is_valid_ether_addr(addr) ? 0 : -EINVAL;
}

/**
 * fwnode_get_mac_address - Get the MAC from the firmware node
 * @fwnode:        Pointer to the firmware node
 * @addr:        Address of buffer to store the MAC in
 *
 * Search the firmware node for the best MAC address to use.  'mac-address' is
 * checked first, because that is supposed to contain the "most recent" MAC
 * address. If that isn't set, then 'local-mac-address' is checked next,
 * because that is the default address.  If that isn't set, then the obsolete
 * 'address' is checked, just in case we're using an old device tree.
 *
 * Note that the 'address' property is supposed to contain a virtual address of
 * the register set, but some DTS files have redefined that property to be the
 * MAC address.
 *
 * All-zero MAC addresses are rejected, because those could be properties that
 * exist in the firmware tables, but were not updated by the firmware.  For
 * example, the DTS could define 'mac-address' and 'local-mac-address', with
 * zero MAC addresses.  Some older U-Boots only initialized 'local-mac-address'.
 * In this case, the real MAC is in 'local-mac-address', and 'mac-address'
 * exists but is all zeros.
 *
 * Return: 0 once one of the properties yields a valid address, -ENOENT if
 * none of them does.
 */
int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr)
{
        /* Properties are tried in order of preference; first hit wins. */
        if (!fwnode_get_mac_addr(fwnode, "mac-address", addr))
                return 0;

        if (!fwnode_get_mac_addr(fwnode, "local-mac-address", addr))
                return 0;

        if (!fwnode_get_mac_addr(fwnode, "address", addr))
                return 0;

        return -ENOENT;
}
EXPORT_SYMBOL(fwnode_get_mac_address);

/**
 * device_get_mac_address - Get the MAC for a given device
 * @dev:        Pointer to the device
 * @addr:        Address of buffer to store the MAC in
 *
 * Looks up the device's firmware node and delegates to
 * fwnode_get_mac_address().
 *
 * Return: the result of fwnode_get_mac_address().
 */
int device_get_mac_address(struct device *dev, char *addr)
{
        struct fwnode_handle *fwnode = dev_fwnode(dev);

        return fwnode_get_mac_address(fwnode, addr);
}
EXPORT_SYMBOL(device_get_mac_address);

/**
 * device_get_ethdev_address - Set netdev's MAC address from a given device
 * @dev:        Pointer to the device
 * @netdev:        Pointer to netdev to write the address to
 *
 * Wrapper around device_get_mac_address() which writes the address
 * directly to netdev->dev_addr.
 */
int device_get_ethdev_address(struct device *dev, struct net_device *netdev)
{
        u8 addr[ETH_ALEN];
        int ret;

        ret = device_get_mac_address(dev, addr);
        if (!ret)
                eth_hw_addr_set(netdev, addr);
        return ret;
}
EXPORT_SYMBOL(device_get_ethdev_address);
































































































































































































































































  165 
  166 
























  165 



























































































































  165 
















































    2 



    2 


    2 








  165 



  165 


  165 












































  166 


























    2 





























































































































































































































































   22 






































































































































































































































































































































































































































































































































































































   22 




   22 
   22 



































   22 











   22 
   22 


   22 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013-2017 ARM Limited, All Rights Reserved.
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

/* Prefix prepended to the format string of the pr_*() messages in this file. */
#define pr_fmt(fmt)        "GICv3: " fmt

#include <linux/acpi.h>
#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>
#include <linux/kstrtox.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
#include <linux/percpu.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/iopoll.h>

#include <linux/irqchip.h>
#include <linux/irqchip/arm-gic-common.h>
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/irqchip/arm-gic-v3-prio.h>
#include <linux/irqchip/irq-partition-percpu.h>
#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/arm-smccc.h>

#include <asm/cputype.h>
#include <asm/exception.h>
#include <asm/smp_plat.h>
#include <asm/virt.h>

#include "irq-gic-common.h"

/*
 * Priority values written to the (re)distributor priority registers for
 * regular interrupts and for pseudo-NMIs. gic_prio_init() may convert both
 * with __gicv3_prio_to_ns() before they are used.
 */
static u8 dist_prio_irq __ro_after_init = GICV3_PRIO_IRQ;
static u8 dist_prio_nmi __ro_after_init = GICV3_PRIO_NMI;

/*
 * Bits of gic_data.flags.
 *  - GICR_WAKER_MSM8996: gic_enable_redist() leaves GICR_WAKER untouched
 *  - CAVIUM_ERRATUM_38539: not referenced in this part of the file
 *  - ASR_ERRATUM_8601001: gic_cpu_to_affinity() shifts the affinity levels
 *  - INSECURE: gic_prio_init() tries to set GICD_CTLR.DS, or forbids pNMIs
 */
#define FLAGS_WORKAROUND_GICR_WAKER_MSM8996        (1ULL << 0)
#define FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539        (1ULL << 1)
#define FLAGS_WORKAROUND_ASR_ERRATUM_8601001        (1ULL << 2)
#define FLAGS_WORKAROUND_INSECURE                (1ULL << 3)

/* Driver-private IRQ type value, placed right after GIC_IRQ_TYPE_LPI */
#define GIC_IRQ_TYPE_PARTITION        (GIC_IRQ_TYPE_LPI + 1)

/* NOTE(review): not referenced in this part of the file -- see its users */
static struct cpumask broken_rdists __read_mostly __maybe_unused;

/*
 * One redistributor region. The members are not used in this part of the
 * file; judging by their names and types they hold the mapped base, the
 * physical base and a "single redistributor" flag -- TODO confirm against
 * the probing code.
 */
struct redist_region {
        void __iomem                *redist_base;
        phys_addr_t                phys_base;
        bool                        single_redist;
};

/*
 * Driver-wide GIC state.
 *
 * Members referenced in this part of the file:
 *  - dist_base: MMIO base used for the GICD_* register accesses
 *  - rdists:    per-CPU redistributor data (rdists.rdist) and the
 *               gicd_typer value decoded by GIC_ID_NR, GIC_LINE_NR and
 *               GIC_ESPI_NR
 *  - domain:    IRQ domain that acknowledged INTIDs are dispatched through
 *  - flags:     bitmask of FLAGS_WORKAROUND_* values
 *
 * The remaining members are not used in this part of the file and are not
 * described here -- TODO document alongside the probing code.
 */
struct gic_chip_data {
        struct fwnode_handle        *fwnode;
        phys_addr_t                dist_phys_base;
        void __iomem                *dist_base;
        struct redist_region        *redist_regions;
        struct rdists                rdists;
        struct irq_domain        *domain;
        u64                        redist_stride;
        u32                        nr_redist_regions;
        u64                        flags;
        bool                        has_rss;
        unsigned int                ppi_nr;
        struct partition_desc        **ppi_descs;
};

/*
 * NVIDIA T241 erratum support: when the static key is enabled,
 * gic_dist_base_alias() picks one of these per-chip distributor aliases
 * instead of gic_data.dist_base.
 */
#define T241_CHIPS_MAX                4
static void __iomem *t241_dist_base_alias[T241_CHIPS_MAX] __read_mostly;
static DEFINE_STATIC_KEY_FALSE(gic_nvidia_t241_erratum);

/* Gates gic_arm64_erratum_2941627_needed() */
static DEFINE_STATIC_KEY_FALSE(gic_arm64_2941627_erratum);

/* The single instance of the driver state */
static struct gic_chip_data gic_data __read_mostly;
/*
 * When enabled, gic_complete_ack() writes ICC_EOIR1_EL1 right after the
 * acknowledge and gic_deactivate_unhandled() deactivates with
 * gic_write_dir(); when disabled, gic_deactivate_unhandled() writes
 * ICC_EOIR1_EL1 instead.
 */
static DEFINE_STATIC_KEY_TRUE(supports_deactivate_key);

/* Values decoded from gic_data.rdists.gicd_typer; GIC_LINE_NR is capped at 1020 */
#define GIC_ID_NR        (1U << GICD_TYPER_ID_BITS(gic_data.rdists.gicd_typer))
#define GIC_LINE_NR        min(GICD_TYPER_SPIS(gic_data.rdists.gicd_typer), 1020U)
#define GIC_ESPI_NR        GICD_TYPER_ESPIS(gic_data.rdists.gicd_typer)

/* Set by gic_prio_init() when the "insecure" workaround cannot disable security */
static bool nmi_support_forbidden;

/*
 * There are 16 SGIs, though we only actually use 8 in Linux. The other 8 SGIs
 * are potentially stolen by the secure side. Some code, especially code dealing
 * with hwirq IDs, is simplified by accounting for all 16.
 */
#define SGI_NR                16

/*
 * The behaviours of RPR and PMR registers differ depending on the value of
 * SCR_EL3.FIQ, and the behaviour of non-secure priority registers of the
 * distributor and redistributors depends on whether security is enabled in the
 * GIC.
 *
 * When security is enabled, non-secure priority values from the (re)distributor
 * are presented to the GIC CPUIF as follows:
 *     (GIC_(R)DIST_PRI[irq] >> 1) | 0x80;
 *
 * If SCR_EL3.FIQ == 1, the values written to/read from PMR and RPR at non-secure
 * EL1 are subject to a similar operation thus matching the priorities presented
 * from the (re)distributor when security is enabled. When SCR_EL3.FIQ == 0,
 * these values are unchanged by the GIC.
 *
 * see GICv3/GICv4 Architecture Specification (IHI0069D):
 * - section 4.8.1 Non-secure accesses to register fields for Secure interrupt
 *   priorities.
 * - Figure 4-7 Secure read of the priority field for a Non-secure Group 1
 *   interrupt.
 *
 * supports_pseudo_nmis is the static key tested by gic_supports_nmi().
 */
static DEFINE_STATIC_KEY_FALSE(supports_pseudo_nmis);

/*
 * Return the number of priority bits reported by the CPU interface control
 * register. The PRI_BITS field stores that number minus one.
 */
static u32 gic_get_pribits(void)
{
        u32 ctlr = gic_read_ctlr();
        u32 field = (ctlr & ICC_CTLR_EL1_PRI_BITS_MASK) >>
                    ICC_CTLR_EL1_PRI_BITS_SHIFT;

        return field + 1;
}

/*
 * Probe whether Group0 is accessible from this CPU by writing a test
 * priority to PMR and reading it back. PMR is restored before returning.
 *
 * Returns true if the test value reads back non-zero.
 */
static bool gic_has_group0(void)
{
        u32 saved_pmr = gic_read_pmr();
        u32 readback;

        /*
         * Let's find out if Group0 is under control of EL3 or not by
         * setting the highest possible, non-zero priority in PMR.
         *
         * If SCR_EL3.FIQ is set, the priority gets shifted down in
         * order for the CPU interface to set bit 7, and keep the
         * actual priority in the non-secure range. In the process, it
         * loses the least significant bit and the actual priority
         * becomes 0x80. Reading it back returns 0, indicating that
         * we don't have access to Group0.
         */
        gic_write_pmr(BIT(8 - gic_get_pribits()));
        readback = gic_read_pmr();

        gic_write_pmr(saved_pmr);

        return readback != 0;
}

static inline bool gic_dist_security_disabled(void)
{
        return readl_relaxed(gic_data.dist_base + GICD_CTLR) & GICD_CTLR_DS;
}

/*
 * Recorded by gic_prio_init(): whether GICD_CTLR.DS reads as set, and
 * whether gic_has_group0() found Group0 accessible.
 */
static bool cpus_have_security_disabled __ro_after_init;
static bool cpus_have_group0 __ro_after_init;

/*
 * Work out which priority view applies and adjust the priorities that will
 * be programmed into the (re)distributor accordingly.
 *
 * Records cpus_have_group0 and cpus_have_security_disabled, applies the
 * FLAGS_WORKAROUND_INSECURE handling, and converts dist_prio_irq and
 * dist_prio_nmi to their non-secure form when Group0 is accessible but
 * security is still enabled.
 */
static void __init gic_prio_init(void)
{
        bool ds;

        cpus_have_group0 = gic_has_group0();

        ds = gic_dist_security_disabled();
        if ((gic_data.flags & FLAGS_WORKAROUND_INSECURE) && !ds) {
                if (cpus_have_group0) {
                        u32 val;

                        /* Try to set DS, then re-read to see whether it stuck */
                        val = readl_relaxed(gic_data.dist_base + GICD_CTLR);
                        val |= GICD_CTLR_DS;
                        writel_relaxed(val, gic_data.dist_base + GICD_CTLR);

                        ds = gic_dist_security_disabled();
                        if (ds)
                                pr_warn("Broken GIC integration, security disabled\n");
                } else {
                        pr_warn("Broken GIC integration, pNMI forbidden\n");
                        nmi_support_forbidden = true;
                }
        }

        cpus_have_security_disabled = ds;

        /*
         * How priority values are used by the GIC depends on two things:
         * the security state of the GIC (controlled by the GICD_CTLR.DS bit)
         * and if Group 0 interrupts can be delivered to Linux in the non-secure
         * world as FIQs (controlled by the SCR_EL3.FIQ bit). These affect the
         * way priorities are presented in ICC_PMR_EL1 and in the distributor:
         *
         * GICD_CTLR.DS | SCR_EL3.FIQ | ICC_PMR_EL1 | Distributor
         * -------------------------------------------------------
         *      1       |      -      |  unchanged  |  unchanged
         * -------------------------------------------------------
         *      0       |      1      |  non-secure |  non-secure
         * -------------------------------------------------------
         *      0       |      0      |  unchanged  |  non-secure
         *
         * In the non-secure view reads and writes are modified:
         *
         * - A value written is right-shifted by one and the MSB is set,
         *   forcing the priority into the non-secure range.
         *
         * - A value read is left-shifted by one.
         *
         * In the first two cases, where ICC_PMR_EL1 and the interrupt priority
         * are both either modified or unchanged, we can use the same set of
         * priorities.
         *
         * In the last case, where only the interrupt priorities are modified to
         * be in the non-secure range, we program the non-secure values into
         * the distributor to match the PMR values we want.
         */
        if (cpus_have_group0 && !cpus_have_security_disabled) {
                dist_prio_irq = __gicv3_prio_to_ns(dist_prio_irq);
                dist_prio_nmi = __gicv3_prio_to_ns(dist_prio_nmi);
        }

        /* SCR_EL3.FIQ is reported as the inverse of "Group0 is accessible" */
        pr_info("GICD_CTRL.DS=%d, SCR_EL3.FIQ=%d\n",
                cpus_have_security_disabled,
                !cpus_have_group0);
}

/* rdist_nmi_refs[n] == number of cpus having the rdist interrupt n set as NMI */
static refcount_t *rdist_nmi_refs;

/* NOTE(review): the two objects below are not referenced in this part of the file */
static struct gic_kvm_info gic_v3_kvm_info __initdata;
static DEFINE_PER_CPU(bool, has_rss);

/* Bits [7:4] of an MPIDR value */
#define MPIDR_RS(mpidr)                        (((mpidr) & 0xF0UL) >> 4)
/* Redistributor data of the CPU executing the code */
#define gic_data_rdist()                (this_cpu_ptr(gic_data.rdists.rdist))
/* Its rd_base mapping ... */
#define gic_data_rdist_rd_base()        (gic_data_rdist()->rd_base)
/* ... and the SGI base, located SZ_64K above rd_base */
#define gic_data_rdist_sgi_base()        (gic_data_rdist_rd_base() + SZ_64K)

/* Our default, arbitrary priority value. Linux only uses one anyway. */
#define DEFAULT_PMR_VALUE        0xf0

/* INTID classes, as decoded from a hwirq number by __get_intid_range(). */
enum gic_intid_range {
        SGI_RANGE,                /* 0 .. 15 */
        PPI_RANGE,                /* 16 .. 31 */
        SPI_RANGE,                /* 32 .. 1019 */
        EPPI_RANGE,                /* EPPI_BASE_INTID .. EPPI_BASE_INTID + 63 */
        ESPI_RANGE,                /* ESPI_BASE_INTID .. ESPI_BASE_INTID + 1023 */
        LPI_RANGE,                /* 8192 .. GENMASK(23, 0) */
        __INVALID_RANGE__        /* anything else */
};

/*
 * Classify a hardware INTID into one of the gic_intid_range classes.
 * Values outside every known window yield __INVALID_RANGE__.
 */
static enum gic_intid_range __get_intid_range(irq_hw_number_t hwirq)
{
        if (hwirq <= 15)
                return SGI_RANGE;
        if (hwirq <= 31)
                return PPI_RANGE;
        if (hwirq <= 1019)
                return SPI_RANGE;
        if (hwirq >= EPPI_BASE_INTID && hwirq <= (EPPI_BASE_INTID + 63))
                return EPPI_RANGE;
        if (hwirq >= ESPI_BASE_INTID && hwirq <= (ESPI_BASE_INTID + 1023))
                return ESPI_RANGE;
        if (hwirq >= 8192 && hwirq <= GENMASK(23, 0))
                return LPI_RANGE;

        return __INVALID_RANGE__;
}

static enum gic_intid_range get_intid_range(struct irq_data *d)
{
        return __get_intid_range(d->hwirq);
}

static inline bool gic_irq_in_rdist(struct irq_data *d)
{
        switch (get_intid_range(d)) {
        case SGI_RANGE:
        case PPI_RANGE:
        case EPPI_RANGE:
                return true;
        default:
                return false;
        }
}

/*
 * Return the distributor base to use for accesses to @d's registers.
 *
 * This is gic_data.dist_base unless the NVIDIA T241 erratum key is enabled,
 * in which case the per-chip alias owning the (E)SPI is returned. Must only
 * be called for SPIs and ESPIs when that key is enabled.
 */
static inline void __iomem *gic_dist_base_alias(struct irq_data *d)
{
        irq_hw_number_t hwirq;
        u32 chip;

        if (!static_branch_unlikely(&gic_nvidia_t241_erratum))
                return gic_data.dist_base;

        /*
         * For the erratum T241-FABRIC-4, read accesses to GICD_In{E}
         * registers are directed to the chip that owns the SPI. The
         * alias region can also be used for writes to the
         * GICD_In{E} except GICD_ICENABLERn. Each chip has support
         * for 320 {E}SPIs. Mappings for all 4 chips:
         *    Chip0 = 32-351
         *    Chip1 = 352-671
         *    Chip2 = 672-991
         *    Chip3 = 4096-4415
         */
        hwirq = irqd_to_hwirq(d);

        switch (__get_intid_range(hwirq)) {
        case SPI_RANGE:
                chip = (hwirq - 32) / 320;
                break;
        case ESPI_RANGE:
                chip = 3;
                break;
        default:
                unreachable();
        }

        return t241_dist_base_alias[chip];
}

/*
 * Return the register base that holds @d's configuration: the SGI base of
 * the current CPU's redistributor for SGIs/PPIs/EPPIs, the distributor for
 * SPIs/ESPIs, and NULL for anything else.
 */
static inline void __iomem *gic_dist_base(struct irq_data *d)
{
        enum gic_intid_range range = get_intid_range(d);

        /* SGI+PPI -> SGI_base for this CPU */
        if (range == SGI_RANGE || range == PPI_RANGE || range == EPPI_RANGE)
                return gic_data_rdist_sgi_base();

        /* SPI -> dist_base */
        if (range == SPI_RANGE || range == ESPI_RANGE)
                return gic_data.dist_base;

        return NULL;
}

/*
 * Poll the control register at @base until @bit reads as zero, checking
 * every microsecond for up to USEC_PER_SEC microseconds. A timeout is only
 * logged (rate-limited); nothing is returned to the caller.
 */
static void gic_do_wait_for_rwp(void __iomem *base, u32 bit)
{
        u32 ctlr;
        int err;

        err = readl_relaxed_poll_timeout_atomic(base + GICD_CTLR, ctlr,
                                                !(ctlr & bit),
                                                1, USEC_PER_SEC);
        if (err != -ETIMEDOUT)
                return;

        pr_err_ratelimited("RWP timeout, gone fishing\n");
}

/*
 * Wait for completion of a distributor change: poll the distributor
 * control register until GICD_CTLR_RWP clears (see gic_do_wait_for_rwp()).
 */
static void gic_dist_wait_for_rwp(void)
{
        gic_do_wait_for_rwp(gic_data.dist_base, GICD_CTLR_RWP);
}

/*
 * Wait for completion of a redistributor change: poll the control register
 * of the current CPU's redistributor until GICR_CTLR_RWP clears (see
 * gic_do_wait_for_rwp()).
 */
static void gic_redist_wait_for_rwp(void)
{
        gic_do_wait_for_rwp(gic_data_rdist_rd_base(), GICR_CTLR_RWP);
}

/*
 * Wake up (@enable == true) or put to sleep (@enable == false) the
 * redistributor of the calling CPU through GICR_WAKER, then wait for
 * GICR_WAKER_ChildrenAsleep to reflect the requested state.
 *
 * Nothing is done when FLAGS_WORKAROUND_GICR_WAKER_MSM8996 is set. A
 * timeout while waiting is only logged.
 */
static void gic_enable_redist(bool enable)
{
        void __iomem *rbase;
        u32 val;
        int ret;

        /* GICR_WAKER is left alone when the MSM8996 workaround is active */
        if (gic_data.flags & FLAGS_WORKAROUND_GICR_WAKER_MSM8996)
                return;

        rbase = gic_data_rdist_rd_base();

        val = readl_relaxed(rbase + GICR_WAKER);
        if (enable)
                /* Wake up this CPU redistributor */
                val &= ~GICR_WAKER_ProcessorSleep;
        else
                val |= GICR_WAKER_ProcessorSleep;
        writel_relaxed(val, rbase + GICR_WAKER);

        if (!enable) {                /* Check that GICR_WAKER is writeable */
                val = readl_relaxed(rbase + GICR_WAKER);
                if (!(val & GICR_WAKER_ProcessorSleep))
                        return;        /* No PM support in this redistributor */
        }

        /*
         * Poll every microsecond, for up to USEC_PER_SEC microseconds,
         * until ChildrenAsleep is clear (when waking up) or set (when
         * going to sleep).
         */
        ret = readl_relaxed_poll_timeout_atomic(rbase + GICR_WAKER, val,
                                                enable ^ (bool)(val & GICR_WAKER_ChildrenAsleep),
                                                1, USEC_PER_SEC);
        if (ret == -ETIMEDOUT) {
                pr_err_ratelimited("redistributor failed to %s...\n",
                                   enable ? "wakeup" : "sleep");
        }
}

/*
 * Routines to disable, enable, EOI and route interrupts
 */
/*
 * Translate a base register @offset and @d's INTID into the register bank
 * offset and the index to use within that bank.
 *
 * SGIs, PPIs and SPIs use @offset and the INTID unchanged. EPPIs keep
 * @offset but are indexed from 32. ESPIs are indexed from 0 and use the
 * matching extended ("nE") register bank.
 *
 * Any other combination triggers a WARN and falls back to @offset and the
 * raw INTID.
 */
static u32 convert_offset_index(struct irq_data *d, u32 offset, u32 *index)
{
        enum gic_intid_range range = get_intid_range(d);

        if (range == SGI_RANGE || range == PPI_RANGE || range == SPI_RANGE) {
                *index = d->hwirq;
                return offset;
        }

        if (range == EPPI_RANGE) {
                /*
                 * Contrary to the ESPI range, the EPPI range is contiguous
                 * to the PPI range in the registers, so let's adjust the
                 * displacement accordingly. Consistency is overrated.
                 */
                *index = d->hwirq - EPPI_BASE_INTID + 32;
                return offset;
        }

        if (range == ESPI_RANGE) {
                *index = d->hwirq - ESPI_BASE_INTID;

                switch (offset) {
                case GICD_ISENABLER:
                        return GICD_ISENABLERnE;
                case GICD_ICENABLER:
                        return GICD_ICENABLERnE;
                case GICD_ISPENDR:
                        return GICD_ISPENDRnE;
                case GICD_ICPENDR:
                        return GICD_ICPENDRnE;
                case GICD_ISACTIVER:
                        return GICD_ISACTIVERnE;
                case GICD_ICACTIVER:
                        return GICD_ICACTIVERnE;
                case GICD_IPRIORITYR:
                        return GICD_IPRIORITYRnE;
                case GICD_ICFGR:
                        return GICD_ICFGRnE;
                case GICD_IROUTER:
                        return GICD_IROUTERnE;
                default:
                        break;
                }
        }

        /* Unknown INTID class, or an ESPI register without an extended bank */
        WARN_ON(1);
        *index = d->hwirq;
        return offset;
}

/*
 * Read @d's bit in the one-bit-per-interrupt register bank starting at
 * @offset (32 interrupts per 32-bit register).
 *
 * Redistributor interrupts are read from the current CPU's SGI base, all
 * others through gic_dist_base_alias().
 *
 * Returns 1 if the bit is set, 0 otherwise.
 */
static int gic_peek_irq(struct irq_data *d, u32 offset)
{
        void __iomem *base;
        u32 index, mask;

        offset = convert_offset_index(d, offset, &index);
        /* Unsigned constant: bit 31 must not come from a signed shift */
        mask = BIT(index % 32);

        if (gic_irq_in_rdist(d))
                base = gic_data_rdist_sgi_base();
        else
                base = gic_dist_base_alias(d);

        return !!(readl_relaxed(base + offset + (index / 32) * 4) & mask);
}

/*
 * Write @d's bit to the one-bit-per-interrupt register bank starting at
 * @offset (32 interrupts per 32-bit register). Only that single bit is
 * written; no read-modify-write is performed.
 *
 * Redistributor interrupts are written through the current CPU's SGI base,
 * all others through gic_data.dist_base (not the per-chip alias used for
 * reads).
 */
static void gic_poke_irq(struct irq_data *d, u32 offset)
{
        void __iomem *base;
        u32 index, mask;

        offset = convert_offset_index(d, offset, &index);
        /* Unsigned constant: bit 31 must not come from a signed shift */
        mask = BIT(index % 32);

        if (gic_irq_in_rdist(d))
                base = gic_data_rdist_sgi_base();
        else
                base = gic_data.dist_base;

        writel_relaxed(mask, base + offset + (index / 32) * 4);
}

/*
 * Mask @d by writing its clear-enable bit, then wait for the write to
 * complete on whichever block (redistributor or distributor) owns it.
 */
static void gic_mask_irq(struct irq_data *d)
{
        gic_poke_irq(d, GICD_ICENABLER);

        if (!gic_irq_in_rdist(d)) {
                gic_dist_wait_for_rwp();
                return;
        }

        gic_redist_wait_for_rwp();
}

/* Mask @d and, if it is forwarded to a vcpu, clear its active state too. */
static void gic_eoimode1_mask_irq(struct irq_data *d)
{
        gic_mask_irq(d);

        if (!irqd_is_forwarded_to_vcpu(d))
                return;

        /*
         * When masking a forwarded interrupt, make sure it is
         * deactivated as well.
         *
         * This ensures that an interrupt that is getting
         * disabled/masked will not get "stuck", because there is
         * no one to deactivate it (guest is being terminated).
         */
        gic_poke_irq(d, GICD_ICACTIVER);
}

/*
 * Unmask @d by writing its set-enable bit. Unlike gic_mask_irq(), no wait
 * for completion follows the write.
 */
static void gic_unmask_irq(struct irq_data *d)
{
        gic_poke_irq(d, GICD_ISENABLER);
}

static inline bool gic_supports_nmi(void)
{
        return IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) &&
               static_branch_likely(&supports_pseudo_nmis);
}

/*
 * Set or clear the pending, active or masked state of @d.
 *
 * Returns 0 on success, -EINVAL for INTIDs at or above 8192 (the LPI
 * range) and for any other @which.
 */
static int gic_irq_set_irqchip_state(struct irq_data *d,
                                     enum irqchip_irq_state which, bool val)
{
        u32 bank;

        /* SGI/PPI/SPI only */
        if (d->hwirq >= 8192)
                return -EINVAL;

        switch (which) {
        case IRQCHIP_STATE_PENDING:
                if (val)
                        bank = GICD_ISPENDR;
                else
                        bank = GICD_ICPENDR;
                break;

        case IRQCHIP_STATE_ACTIVE:
                if (val)
                        bank = GICD_ISACTIVER;
                else
                        bank = GICD_ICACTIVER;
                break;

        case IRQCHIP_STATE_MASKED:
                /* Masking goes through gic_mask_irq() to get the RWP wait */
                if (val) {
                        gic_mask_irq(d);
                        return 0;
                }
                bank = GICD_ISENABLER;
                break;

        default:
                return -EINVAL;
        }

        gic_poke_irq(d, bank);

        /*
         * Force read-back to guarantee that the active state has taken
         * effect, and won't race with a guest-driven deactivation.
         */
        if (bank == GICD_ISACTIVER)
                gic_peek_irq(d, bank);

        return 0;
}

/*
 * Report the pending, active or masked state of @d through @val.
 *
 * Returns 0 on success, -EINVAL for INTIDs at or above 8192 (the LPI
 * range) and for any other @which; @val is left untouched on error.
 */
static int gic_irq_get_irqchip_state(struct irq_data *d,
                                     enum irqchip_irq_state which, bool *val)
{
        int state;

        /* SGI/PPI/SPI only */
        if (d->hwirq >= 8192)
                return -EINVAL;

        switch (which) {
        case IRQCHIP_STATE_PENDING:
                state = gic_peek_irq(d, GICD_ISPENDR);
                break;

        case IRQCHIP_STATE_ACTIVE:
                state = gic_peek_irq(d, GICD_ISACTIVER);
                break;

        case IRQCHIP_STATE_MASKED:
                /* Masked is the inverse of the enable bit */
                state = !gic_peek_irq(d, GICD_ISENABLER);
                break;

        default:
                return -EINVAL;
        }

        *val = state;
        return 0;
}

/*
 * Program @prio as the priority of @d. The priority bank holds one byte
 * per interrupt, so the converted index is used directly as a byte offset.
 */
static void gic_irq_set_prio(struct irq_data *d, u8 prio)
{
        void __iomem *bank = gic_dist_base(d);
        u32 reg_off, idx;

        reg_off = convert_offset_index(d, GICD_IPRIORITYR, &idx);
        bank += reg_off;

        writeb_relaxed(prio, bank + idx);
}

/*
 * Map a PPI or EPPI INTID to a zero-based index: PPIs map to 0..15 and
 * EPPIs follow from 16. Must not be called with any other INTID class.
 */
static u32 __gic_get_ppi_index(irq_hw_number_t hwirq)
{
        enum gic_intid_range range = __get_intid_range(hwirq);

        if (range == PPI_RANGE)
                return hwirq - 16;

        if (range == EPPI_RANGE)
                return hwirq - EPPI_BASE_INTID + 16;

        unreachable();
}

/*
 * Map a redistributor INTID to a zero-based index: SGIs and PPIs keep their
 * INTID, EPPIs follow from 32. Must not be called with any other class.
 */
static u32 __gic_get_rdist_index(irq_hw_number_t hwirq)
{
        enum gic_intid_range range = __get_intid_range(hwirq);

        if (range == SGI_RANGE || range == PPI_RANGE)
                return hwirq;

        if (range == EPPI_RANGE)
                return hwirq - EPPI_BASE_INTID + 32;

        unreachable();
}

static u32 gic_get_rdist_index(struct irq_data *d)
{
        return __gic_get_rdist_index(d->hwirq);
}

/*
 * Turn @d into a pseudo-NMI: install the NMI flow handler and program the
 * NMI priority.
 *
 * For redistributor interrupts, rdist_nmi_refs[] counts the users and the
 * handler is only switched for the first one.
 *
 * Returns 0 on success, -EINVAL if pseudo-NMIs are not supported, if @d is
 * in the LPI range, or if the interrupt is currently enabled.
 */
static int gic_irq_nmi_setup(struct irq_data *d)
{
        struct irq_desc *desc = irq_to_desc(d->irq);

        if (!gic_supports_nmi())
                return -EINVAL;

        /*
         * A secondary irq_chip should be in charge of LPI request,
         * it should not be possible to get there.
         *
         * Check this before touching any register: the enable-bit
         * lookup below only understands SGI/PPI/SPI (and extended)
         * INTIDs.
         */
        if (WARN_ON(irqd_to_hwirq(d) >= 8192))
                return -EINVAL;

        if (gic_peek_irq(d, GICD_ISENABLER)) {
                pr_err("Cannot set NMI property of enabled IRQ %u\n", d->irq);
                return -EINVAL;
        }

        /* desc lock should already be held */
        if (gic_irq_in_rdist(d)) {
                u32 idx = gic_get_rdist_index(d);

                /*
                 * Setting up a percpu interrupt as NMI, only switch handler
                 * for first NMI
                 */
                if (!refcount_inc_not_zero(&rdist_nmi_refs[idx])) {
                        refcount_set(&rdist_nmi_refs[idx], 1);
                        desc->handle_irq = handle_percpu_devid_fasteoi_nmi;
                }
        } else {
                desc->handle_irq = handle_fasteoi_nmi;
        }

        gic_irq_set_prio(d, dist_prio_nmi);

        return 0;
}

/*
 * Revert @d from pseudo-NMI to a regular interrupt: restore the regular
 * flow handler and program the regular priority.
 *
 * For redistributor interrupts, the handler is only switched back when the
 * last user recorded in rdist_nmi_refs[] goes away.
 *
 * Does nothing (beyond a warning or an error message) if pseudo-NMIs are
 * not supported, if @d is in the LPI range, or if the interrupt is
 * currently enabled.
 */
static void gic_irq_nmi_teardown(struct irq_data *d)
{
        struct irq_desc *desc = irq_to_desc(d->irq);

        if (WARN_ON(!gic_supports_nmi()))
                return;

        /*
         * A secondary irq_chip should be in charge of LPI request,
         * it should not be possible to get there.
         *
         * Check this before touching any register: the enable-bit
         * lookup below only understands SGI/PPI/SPI (and extended)
         * INTIDs.
         */
        if (WARN_ON(irqd_to_hwirq(d) >= 8192))
                return;

        if (gic_peek_irq(d, GICD_ISENABLER)) {
                pr_err("Cannot set NMI property of enabled IRQ %u\n", d->irq);
                return;
        }

        /* desc lock should already be held */
        if (gic_irq_in_rdist(d)) {
                u32 idx = gic_get_rdist_index(d);

                /* Tearing down NMI, only switch handler for last NMI */
                if (refcount_dec_and_test(&rdist_nmi_refs[idx]))
                        desc->handle_irq = handle_percpu_devid_irq;
        } else {
                desc->handle_irq = handle_fasteoi_irq;
        }

        gic_irq_set_prio(d, dist_prio_irq);
}

/*
 * Tell whether the erratum 2941627 workaround applies to @d on the CPU
 * executing this code. Always false when the erratum key is disabled.
 */
static bool gic_arm64_erratum_2941627_needed(struct irq_data *d)
{
        enum gic_intid_range range;

        if (!static_branch_unlikely(&gic_arm64_2941627_erratum))
                return false;

        /*
         * The workaround is needed if the IRQ is an SPI and
         * the target cpu is different from the one we are
         * executing on.
         */
        range = get_intid_range(d);
        if (range != SPI_RANGE && range != ESPI_RANGE)
                return false;

        return !cpumask_test_cpu(raw_smp_processor_id(),
                                 irq_data_get_effective_affinity_mask(d));
}

/*
 * Signal end-of-interrupt for @d through ICC_EOIR1_EL1 and, when the
 * erratum 2941627 workaround applies, also clear its active bit through
 * the distributor.
 */
static void gic_eoi_irq(struct irq_data *d)
{
        write_gicreg(irqd_to_hwirq(d), ICC_EOIR1_EL1);
        isb();

        if (!gic_arm64_erratum_2941627_needed(d))
                return;

        /*
         * Make sure the GIC stream deactivate packet
         * issued by ICC_EOIR1_EL1 has completed before
         * deactivating through GICD_ICACTIVER.
         */
        dsb(sy);
        gic_poke_irq(d, GICD_ICACTIVER);
}

static void gic_eoimode1_eoi_irq(struct irq_data *d)
{
        /*
         * No need to deactivate an LPI, or an interrupt that
         * is is getting forwarded to a vcpu.
         */
        if (irqd_to_hwirq(d) >= 8192 || irqd_is_forwarded_to_vcpu(d))
                return;

        if (!gic_arm64_erratum_2941627_needed(d))
                gic_write_dir(irqd_to_hwirq(d));
        else
                gic_poke_irq(d, GICD_ICACTIVER);
}

/*
 * Configure the trigger type of @d.
 *
 * SGIs cannot be reconfigured: only IRQ_TYPE_EDGE_RISING is accepted and
 * no register is written. SPIs and ESPIs only accept IRQ_TYPE_LEVEL_HIGH
 * and IRQ_TYPE_EDGE_RISING. A gic_configure_irq() failure on a PPI or EPPI
 * is downgraded to a warning.
 *
 * Returns 0 on success, -EINVAL for an unsupported type, or the error
 * returned by gic_configure_irq().
 */
static int gic_set_type(struct irq_data *d, unsigned int type)
{
        irq_hw_number_t irq = irqd_to_hwirq(d);
        enum gic_intid_range range;
        void __iomem *base;
        u32 offset, index;
        int ret;

        range = get_intid_range(d);

        /* Interrupt configuration for SGIs can't be changed */
        if (range == SGI_RANGE)
                return type != IRQ_TYPE_EDGE_RISING ? -EINVAL : 0;

        /* SPIs have restrictions on the supported types */
        if ((range == SPI_RANGE || range == ESPI_RANGE) &&
            type != IRQ_TYPE_LEVEL_HIGH && type != IRQ_TYPE_EDGE_RISING)
                return -EINVAL;

        if (gic_irq_in_rdist(d))
                base = gic_data_rdist_sgi_base();
        else
                base = gic_dist_base_alias(d);

        offset = convert_offset_index(d, GICD_ICFGR, &index);

        ret = gic_configure_irq(index, type, base + offset);
        if (ret && (range == PPI_RANGE || range == EPPI_RANGE)) {
                /* Misconfigured PPIs are usually not fatal */
                pr_warn("GIC: PPI INTID%lu is secure or misconfigured\n", irq);
                ret = 0;
        }

        return ret;
}

/*
 * Mark @d as forwarded to a vcpu (@vcpu non-NULL) or clear that mark
 * (@vcpu NULL). SGIs are rejected.
 *
 * Returns 0 on success, -EINVAL for an SGI.
 */
static int gic_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
{
        if (get_intid_range(d) == SGI_RANGE)
                return -EINVAL;

        if (!vcpu)
                irqd_clr_forwarded_to_vcpu(d);
        else
                irqd_set_forwarded_to_vcpu(d);

        return 0;
}

/*
 * Build the affinity value for @cpu from its MPIDR: Aff3 at bits [39:32],
 * Aff2 at [23:16], Aff1 at [15:8] and Aff0 at [7:0].
 */
static u64 gic_cpu_to_affinity(int cpu)
{
        u64 mpidr = cpu_logical_map(cpu);
        u64 aff0, aff1, aff2, aff3;

        /* ASR8601 needs to have its affinities shifted down... */
        if (unlikely(gic_data.flags & FLAGS_WORKAROUND_ASR_ERRATUM_8601001))
                mpidr = (MPIDR_AFFINITY_LEVEL(mpidr, 1)        |
                         (MPIDR_AFFINITY_LEVEL(mpidr, 2) << 8));

        aff3 = MPIDR_AFFINITY_LEVEL(mpidr, 3);
        aff2 = MPIDR_AFFINITY_LEVEL(mpidr, 2);
        aff1 = MPIDR_AFFINITY_LEVEL(mpidr, 1);
        aff0 = MPIDR_AFFINITY_LEVEL(mpidr, 0);

        return aff3 << 32 | aff2 << 16 | aff1 << 8 | aff0;
}

/*
 * Retire an acknowledged interrupt that nobody handled.
 *
 * With supports_deactivate_key enabled, INTIDs below 8192 are deactivated
 * with gic_write_dir() and anything else is left alone. Otherwise
 * ICC_EOIR1_EL1 is written.
 */
static void gic_deactivate_unhandled(u32 irqnr)
{
        if (!static_branch_likely(&supports_deactivate_key)) {
                write_gicreg(irqnr, ICC_EOIR1_EL1);
                isb();
                return;
        }

        if (irqnr < 8192)
                gic_write_dir(irqnr);
}

/*
 * Follow a read of the IAR with any HW maintenance that needs to happen prior
 * to invoking the relevant IRQ handler. We must do two things:
 *
 * (1) Ensure instruction ordering between a read of IAR and subsequent
 *     instructions in the IRQ handler using an ISB.
 *
 *     It is possible for the IAR to report an IRQ which was signalled *after*
 *     the CPU took an IRQ exception as multiple interrupts can race to be
 *     recognized by the GIC, earlier interrupts could be withdrawn, and/or
 *     later interrupts could be prioritized by the GIC.
 *
 *     For devices which are tightly coupled to the CPU, such as PMUs, a
 *     context synchronization event is necessary to ensure that system
 *     register state is not stale, as these may have been indirectly written
 *     *after* exception entry.
 *
 * (2) Execute an interrupt priority drop when EOI mode 1 is in use.
 */
static inline void gic_complete_ack(u32 irqnr)
{
        /* (2): only done while supports_deactivate_key is enabled */
        if (static_branch_likely(&supports_deactivate_key))
                write_gicreg(irqnr, ICC_EOIR1_EL1);

        /* (1): issued unconditionally */
        isb();
}

/*
 * Tell whether the running priority equals GICV3_PRIO_NMI. Always false,
 * without reading RPR, when pseudo-NMIs are not supported.
 */
static bool gic_rpr_is_nmi_prio(void)
{
        return gic_supports_nmi() &&
               unlikely(gic_read_rpr() == GICV3_PRIO_NMI);
}

/*
 * INTIDs 1020 to 1023 are treated as special: the handlers below return
 * without acknowledging or dispatching them.
 */
static bool gic_irqnr_is_special(u32 irqnr)
{
        if (irqnr < 1020)
                return false;

        return irqnr <= 1023;
}

/*
 * Complete the acknowledge of @irqnr and dispatch it as a regular IRQ
 * through the GIC domain. Special INTIDs are ignored. If the dispatch
 * fails, warn once and retire the interrupt by hand.
 */
static void __gic_handle_irq(u32 irqnr, struct pt_regs *regs)
{
        int err;

        if (gic_irqnr_is_special(irqnr))
                return;

        gic_complete_ack(irqnr);

        err = generic_handle_domain_irq(gic_data.domain, irqnr);
        if (!err)
                return;

        WARN_ONCE(true, "Unexpected interrupt (irqnr %u)\n", irqnr);
        gic_deactivate_unhandled(irqnr);
}

/*
 * Complete the acknowledge of @irqnr and dispatch it as a pseudo-NMI
 * through the GIC domain. Special INTIDs are ignored. If the dispatch
 * fails, warn once and retire the interrupt by hand.
 */
static void __gic_handle_nmi(u32 irqnr, struct pt_regs *regs)
{
        int err;

        if (gic_irqnr_is_special(irqnr))
                return;

        gic_complete_ack(irqnr);

        err = generic_handle_domain_nmi(gic_data.domain, irqnr);
        if (!err)
                return;

        WARN_ONCE(true, "Unexpected pseudo-NMI (irqnr %u)\n", irqnr);
        gic_deactivate_unhandled(irqnr);
}

/*
 * An exception has been taken from a context with IRQs enabled, and this could
 * be an IRQ or an NMI.
 *
 * The entry code called us with DAIF.IF set to keep NMIs masked. We must clear
 * DAIF.IF (and update ICC_PMR_EL1 to mask regular IRQs) prior to returning,
 * after handling any NMI but before handling any IRQ.
 *
 * The entry code has performed IRQ entry, and if an NMI is detected we must
 * perform NMI entry/exit around invoking the handler.
 */
static void __gic_handle_irq_from_irqson(struct pt_regs *regs)
{
        bool is_nmi;
        u32 irqnr;

        irqnr = gic_read_iar();

        /* Classify by running priority, sampled right after the acknowledge */
        is_nmi = gic_rpr_is_nmi_prio();

        if (is_nmi) {
                nmi_enter();
                __gic_handle_nmi(irqnr, regs);
                nmi_exit();
        }

        /* Done in both cases: after an NMI was handled, before an IRQ is */
        if (gic_prio_masking_enabled()) {
                gic_pmr_mask_irqs();
                gic_arch_enable_irqs();
        }

        if (!is_nmi)
                __gic_handle_irq(irqnr, regs);
}

/*
 * An exception has been taken from a context with IRQs disabled, which can only
 * be an NMI.
 *
 * The entry code called us with DAIF.IF set to keep NMIs masked. We must leave
 * DAIF.IF (and ICC_PMR_EL1) unchanged.
 *
 * The entry code has performed NMI entry.
 */
static void __gic_handle_irq_from_irqsoff(struct pt_regs *regs)
{
        u64 pmr;
        u32 irqnr;

        /*
         * We were in a context with IRQs disabled. However, the
         * entry code has set PMR to a value that allows any
         * interrupt to be acknowledged, and not just NMIs. This can
         * lead to surprising effects if the NMI has been retired in
         * the meantime, and there is an IRQ pending. The IRQ
         * would then be taken in NMI context, something that nobody
         * wants to debug twice.
         *
         * Until we sort this, drop PMR again to a level that will
         * actually only allow NMIs before reading IAR, and then
         * restore it to what it was.
         */
        pmr = gic_read_pmr();
        gic_pmr_mask_irqs();
        /* Barrier between the PMR update and the IAR read. */
        isb();
        irqnr = gic_read_iar();
        /* Put back the PMR value saved above, so PMR ends up unchanged. */
        gic_write_pmr(pmr);

        /* Whatever was acknowledged is handled as an NMI. */
        __gic_handle_nmi(irqnr, regs);
}

/*
 * Interrupt entry point. Picks the "IRQs off" path only when NMIs are
 * supported and the interrupted context had interrupts disabled; every
 * other case goes through the "IRQs on" path.
 */
static void __exception_irq_entry gic_handle_irq(struct pt_regs *regs)
{
        if (unlikely(gic_supports_nmi() && !interrupts_enabled(regs))) {
                __gic_handle_irq_from_irqsoff(regs);
                return;
        }

        __gic_handle_irq_from_irqson(regs);
}

/*
 * Boot-time distributor setup: disable the distributor, program group,
 * enable, active, configuration and priority registers for the SPI and
 * extended SPI ranges, re-enable it with ARE, and finally route every
 * global interrupt to the CPU running this code.
 */
static void __init gic_dist_init(void)
{
        unsigned int i;
        u64 affinity;
        void __iomem *base = gic_data.dist_base;
        u32 val;

        /* Disable the distributor */
        writel_relaxed(0, base + GICD_CTLR);
        gic_dist_wait_for_rwp();

        /*
         * Configure SPIs as non-secure Group-1. This will only matter
         * if the GIC only has a single security state. This will not
         * do the right thing if the kernel is running in secure mode,
         * but that's not the intended use case anyway.
         */
        for (i = 32; i < GIC_LINE_NR; i += 32)
                writel_relaxed(~0, base + GICD_IGROUPR + i / 8);

        /* Extended SPI range, not handled by the GICv2/GICv3 common code */
        for (i = 0; i < GIC_ESPI_NR; i += 32) {
                writel_relaxed(~0U, base + GICD_ICENABLERnE + i / 8);
                writel_relaxed(~0U, base + GICD_ICACTIVERnE + i / 8);
        }

        /* Group registers: one 32-bit register per 32 extended SPIs */
        for (i = 0; i < GIC_ESPI_NR; i += 32)
                writel_relaxed(~0U, base + GICD_IGROUPRnE + i / 8);

        /* Configuration registers: one 32-bit register per 16 extended SPIs */
        for (i = 0; i < GIC_ESPI_NR; i += 16)
                writel_relaxed(0, base + GICD_ICFGRnE + i / 4);

        /* Priority registers: one byte per extended SPI, four per write */
        for (i = 0; i < GIC_ESPI_NR; i += 4)
                writel_relaxed(REPEAT_BYTE_U32(dist_prio_irq),
                               base + GICD_IPRIORITYRnE + i);

        /* Now do the common stuff */
        gic_dist_config(base, GIC_LINE_NR, dist_prio_irq);

        val = GICD_CTLR_ARE_NS | GICD_CTLR_ENABLE_G1A | GICD_CTLR_ENABLE_G1;
        if (gic_data.rdists.gicd_typer2 & GICD_TYPER2_nASSGIcap) {
                pr_info("Enabling SGIs without active state\n");
                val |= GICD_CTLR_nASSGIreq;
        }

        /* Enable distributor with ARE, Group1, and wait for it to drain */
        writel_relaxed(val, base + GICD_CTLR);
        gic_dist_wait_for_rwp();

        /*
         * Set all global interrupts to the boot CPU only. ARE must be
         * enabled.
         */
        affinity = gic_cpu_to_affinity(smp_processor_id());
        for (i = 32; i < GIC_LINE_NR; i++)
                gic_write_irouter(affinity, base + GICD_IROUTER + i * 8);

        /* Same routing for the extended SPI range */
        for (i = 0; i < GIC_ESPI_NR; i++)
                gic_write_irouter(affinity, base + GICD_IROUTERnE + i * 8);
}

/*
 * Walk the redistributor frames of every known region, calling @fn on each.
 *
 * @fn returns 0 to stop the walk, in which case this function returns 0
 * immediately; any other value moves on to the next frame. If no call
 * returns 0 (or no frame could be visited), -ENODEV is returned.
 */
static int gic_iterate_rdists(int (*fn)(struct redist_region *, void __iomem *))
{
        int ret = -ENODEV;
        int i;

        for (i = 0; i < gic_data.nr_redist_regions; i++) {
                void __iomem *ptr = gic_data.redist_regions[i].redist_base;
                u64 typer;
                u32 reg;

                /* Sanity-check the architecture revision; give up entirely on mismatch */
                reg = readl_relaxed(ptr + GICR_PIDR2) & GIC_PIDR2_ARCH_MASK;
                if (reg != GIC_PIDR2_ARCH_GICv3 &&
                    reg != GIC_PIDR2_ARCH_GICv4) { /* We're in trouble... */
                        pr_warn("No redistributor present @%p\n", ptr);
                        break;
                }

                do {
                        /* Read TYPER up front: it drives both the stride and loop exit */
                        typer = gic_read_typer(ptr + GICR_TYPER);
                        ret = fn(gic_data.redist_regions + i, ptr);
                        if (!ret)
                                return 0;

                        /* Region holds exactly one redistributor: nothing to step to */
                        if (gic_data.redist_regions[i].single_redist)
                                break;

                        /* An explicit stride, when set, overrides the computed layout */
                        if (gic_data.redist_stride) {
                                ptr += gic_data.redist_stride;
                        } else {
                                ptr += SZ_64K * 2; /* Skip RD_base + SGI_base */
                                if (typer & GICR_TYPER_VLPIS)
                                        ptr += SZ_64K * 2; /* Skip VLPI_base + reserved page */
                        }
                } while (!(typer & GICR_TYPER_LAST));
        }

        /* Reaching this point means no callback returned 0 */
        return ret ? -ENODEV : 0;
}

/*
 * gic_iterate_rdists() callback: claim the redistributor frame at @ptr for
 * the calling CPU if the affinity in its GICR_TYPER matches this CPU.
 *
 * Returns 0 on a match (which ends the iteration), 1 to try the next frame.
 */
static int __gic_populate_rdist(struct redist_region *region, void __iomem *ptr)
{
        unsigned long mpidr;
        u64 typer;
        u32 aff;

        /*
         * Convert affinity to a 32bit value that can be matched to
         * GICR_TYPER bits [63:32].
         */
        mpidr = gic_cpu_to_affinity(smp_processor_id());

        aff = (MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24 |
               MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16 |
               MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8 |
               MPIDR_AFFINITY_LEVEL(mpidr, 0));

        typer = gic_read_typer(ptr + GICR_TYPER);
        if ((typer >> 32) == aff) {
                /* Byte offset of this frame within its region */
                u64 offset = ptr - region->redist_base;
                raw_spin_lock_init(&gic_data_rdist()->rd_lock);
                /* Record both the mapped and the physical base of our frame */
                gic_data_rdist_rd_base() = ptr;
                gic_data_rdist()->phys_base = region->phys_base + offset;

                pr_info("CPU%d: found redistributor %lx region %d:%pa\n",
                        smp_processor_id(), mpidr,
                        (int)(region - gic_data.redist_regions),
                        &gic_data_rdist()->phys_base);
                return 0;
        }

        /* Try next one */
        return 1;
}

/*
 * Locate the redistributor belonging to the calling CPU.
 *
 * Returns 0 when one was found, otherwise warns and returns -ENODEV.
 */
static int gic_populate_rdist(void)
{
        int err = gic_iterate_rdists(__gic_populate_rdist);

        if (err) {
                /* We couldn't even deal with ourselves... */
                WARN(true, "CPU%d: mpidr %lx has no re-distributor!\n",
                     smp_processor_id(),
                     (unsigned long)cpu_logical_map(smp_processor_id()));
                return -ENODEV;
        }

        return 0;
}

/*
 * gic_iterate_rdists() callback: fold the capabilities advertised by the
 * redistributor at @ptr into the global gic_data.rdists feature flags, so
 * that a feature stays set only if every redistributor has it.
 *
 * Always returns 1 so that the iteration visits every redistributor.
 */
static int __gic_update_rdist_properties(struct redist_region *region,
                                         void __iomem *ptr)
{
        u64 typer = gic_read_typer(ptr + GICR_TYPER);
        u32 ctlr = readl_relaxed(ptr + GICR_CTLR);

        /* Boot-time cleanup */
        if ((typer & GICR_TYPER_VLPIS) && (typer & GICR_TYPER_RVPEID)) {
                u64 val;

                /* Deactivate any present vPE */
                val = gicr_read_vpendbaser(ptr + SZ_128K + GICR_VPENDBASER);
                if (val & GICR_VPENDBASER_Valid)
                        gicr_write_vpendbaser(GICR_VPENDBASER_PendingLast,
                                              ptr + SZ_128K + GICR_VPENDBASER);

                /* Mark the VPE table as invalid */
                val = gicr_read_vpropbaser(ptr + SZ_128K + GICR_VPROPBASER);
                val &= ~GICR_VPROPBASER_4_1_VALID;
                gicr_write_vpropbaser(val, ptr + SZ_128K + GICR_VPROPBASER);
        }

        gic_data.rdists.has_vlpis &= !!(typer & GICR_TYPER_VLPIS);

        /*
         * TYPER.RVPEID implies some form of DirectLPI, no matter what the
         * doc says... :-/ And CTLR.IR implies another subset of DirectLPI
         * that the ITS driver can make use of for LPIs (and not VLPIs).
         *
         * These are 3 different ways to express the same thing, depending
         * on the revision of the architecture and its relaxations over
         * time. Just group them under the 'direct_lpi' banner.
         */
        gic_data.rdists.has_rvpeid &= !!(typer & GICR_TYPER_RVPEID);
        gic_data.rdists.has_direct_lpi &= (!!(typer & GICR_TYPER_DirectLPIS) |
                                           !!(ctlr & GICR_CTLR_IR) |
                                           gic_data.rdists.has_rvpeid);
        gic_data.rdists.has_vpend_valid_dirty &= !!(typer & GICR_TYPER_DIRTY);

        /* Detect nonsensical configurations */
        if (WARN_ON_ONCE(gic_data.rdists.has_rvpeid && !gic_data.rdists.has_vlpis)) {
                gic_data.rdists.has_direct_lpi = false;
                gic_data.rdists.has_vlpis = false;
                gic_data.rdists.has_rvpeid = false;
        }

        /* Keep the smallest PPI count seen across all redistributors */
        gic_data.ppi_nr = min(GICR_TYPER_NR_PPIS(typer), gic_data.ppi_nr);

        return 1;
}

/*
 * Collect the feature set common to all redistributors and report it in
 * the kernel log.
 */
static void gic_update_rdist_properties(void)
{
        /* Start from the maximum so the per-redistributor min() can only lower it */
        gic_data.ppi_nr = UINT_MAX;
        gic_iterate_rdists(__gic_update_rdist_properties);
        /* Still UINT_MAX: the callback never updated it; fall back to no PPIs */
        if (WARN_ON(gic_data.ppi_nr == UINT_MAX))
                gic_data.ppi_nr = 0;
        pr_info("GICv3 features: %d PPIs%s%s\n",
                gic_data.ppi_nr,
                gic_data.has_rss ? ", RSS" : "",
                gic_data.rdists.has_direct_lpi ? ", DirectLPI" : "");

        if (gic_data.rdists.has_vlpis)
                pr_info("GICv4 features: %s%s%s\n",
                        gic_data.rdists.has_direct_lpi ? "DirectLPI " : "",
                        gic_data.rdists.has_rvpeid ? "RVPEID " : "",
                        gic_data.rdists.has_vpend_valid_dirty ? "Valid+Dirty " : "");
}

/*
 * Enable system register access to the GIC CPU interface (SRE) on the
 * calling CPU. Failure is only logged; there is no recovery path.
 */
static void gic_cpu_sys_reg_enable(void)
{
        /*
         * Need to check that the SRE bit has actually been set. If
         * not, it means that SRE is disabled at EL2. We're going to
         * die painfully, and there is nothing we can do about it.
         *
         * Kindly inform the user.
         */
        if (!gic_enable_sre())
                pr_err("GIC: unable to set SRE (disabled at EL2), panic ahead\n");

}

/*
 * Program the GIC CPU interface system registers of the calling CPU:
 * priority mask, binary point, EOI mode, active priority registers and
 * Group-1 enable. Also records this CPU's RSS capability and complains
 * if SGIs cannot be delivered between this CPU and the online CPUs.
 */
static void gic_cpu_sys_reg_init(void)
{
        int i, cpu = smp_processor_id();
        u64 mpidr = gic_cpu_to_affinity(cpu);
        u64 need_rss = MPIDR_RS(mpidr);
        bool group0;
        u32 pribits;

        pribits = gic_get_pribits();

        group0 = gic_has_group0();

        /* Set priority mask register */
        if (!gic_prio_masking_enabled()) {
                write_gicreg(DEFAULT_PMR_VALUE, ICC_PMR_EL1);
        } else if (gic_supports_nmi()) {
                /*
                 * Check that all CPUs use the same priority space.
                 *
                 * If there's a mismatch with the boot CPU, the system is
                 * likely to die as interrupt masking will not work properly on
                 * all CPUs.
                 */
                WARN_ON(group0 != cpus_have_group0);
                WARN_ON(gic_dist_security_disabled() != cpus_have_security_disabled);
        }

        /*
         * Some firmwares hand over to the kernel with the BPR changed from
         * its reset value (and with a value large enough to prevent
         * any pre-emptive interrupts from working at all). Writing a zero
         * to BPR restores its reset value.
         */
        gic_write_bpr1(0);

        if (static_branch_likely(&supports_deactivate_key)) {
                /* EOI drops priority only (mode 1) */
                gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop);
        } else {
                /* EOI deactivates interrupt too (mode 0) */
                gic_write_ctlr(ICC_CTLR_EL1_EOImode_drop_dir);
        }

        /* Always whack Group0 before Group1 */
        if (group0) {
                /* More priority bits means more AP0Rn registers to clear */
                switch(pribits) {
                case 8:
                case 7:
                        write_gicreg(0, ICC_AP0R3_EL1);
                        write_gicreg(0, ICC_AP0R2_EL1);
                        fallthrough;
                case 6:
                        write_gicreg(0, ICC_AP0R1_EL1);
                        fallthrough;
                case 5:
                case 4:
                        write_gicreg(0, ICC_AP0R0_EL1);
                }

                isb();
        }

        /* Same clearing scheme for the Group1 active priority registers */
        switch(pribits) {
        case 8:
        case 7:
                write_gicreg(0, ICC_AP1R3_EL1);
                write_gicreg(0, ICC_AP1R2_EL1);
                fallthrough;
        case 6:
                write_gicreg(0, ICC_AP1R1_EL1);
                fallthrough;
        case 5:
        case 4:
                write_gicreg(0, ICC_AP1R0_EL1);
        }

        isb();

        /* ... and let's hit the road... */
        gic_write_grpen1(1);

        /* Keep the RSS capability status in per_cpu variable */
        per_cpu(has_rss, cpu) = !!(gic_read_ctlr() & ICC_CTLR_EL1_RSS);

        /* Check that all the CPUs are capable of sending SGIs to other CPUs */
        for_each_online_cpu(i) {
                bool have_rss = per_cpu(has_rss, i) && per_cpu(has_rss, cpu);

                /* need_rss accumulates: once any CPU needs RSS it stays set */
                need_rss |= MPIDR_RS(gic_cpu_to_affinity(i));
                if (need_rss && (!have_rss))
                        pr_crit("CPU%d (%lx) can't SGI CPU%d (%lx), no RSS\n",
                                cpu, (unsigned long)mpidr,
                                i, (unsigned long)gic_cpu_to_affinity(i));
        }

        /*
         * GIC spec says, when ICC_CTLR_EL1.RSS==1 and GICD_TYPER.RSS==0,
         * writing ICC_ASGI1R_EL1 register with RS != 0 is a CONSTRAINED
         * UNPREDICTABLE choice of :
         *   - The write is ignored.
         *   - The RS field is treated as 0.
         */
        if (need_rss && (!gic_data.has_rss))
                pr_crit_once("RSS is required but GICD doesn't support it\n");
}

/* Set from the "irqchip.gicv3_nolpi" early parameter; true disables LPI use. */
static bool gicv3_nolpi;

/*
 * Early parameter handler: parse @buf as a boolean into gicv3_nolpi.
 * Returns whatever kstrtobool() returns.
 */
static int __init gicv3_nolpi_cfg(char *buf)
{
        return kstrtobool(buf, &gicv3_nolpi);
}
early_param("irqchip.gicv3_nolpi", gicv3_nolpi_cfg);

static int gic_dist_supports_lpis(void)
{
        return (IS_ENABLED(CONFIG_ARM_GIC_V3_ITS) &&
                !!(readl_relaxed(gic_data.dist_base + GICD_TYPER) & GICD_TYPER_LPIS) &&
                !gicv3_nolpi);
}

/*
 * Per-CPU GIC bring-up: find and enable this CPU's redistributor, put its
 * SGIs/PPIs into Group-1 with the default configuration, then initialise
 * the CPU interface system registers. Bails out silently (beyond the
 * warning issued by gic_populate_rdist()) if no redistributor is found.
 */
static void gic_cpu_init(void)
{
        void __iomem *rbase;
        int i;

        /* Register ourselves with the rest of the world */
        if (gic_populate_rdist())
                return;

        gic_enable_redist(true);

        /* Extended PPIs/SPIs in use require ExtRange support in the CPU interface */
        WARN((gic_data.ppi_nr > 16 || GIC_ESPI_NR != 0) &&
             !(gic_read_ctlr() & ICC_CTLR_EL1_ExtRange),
             "Distributor has extended ranges, but CPU%d doesn't\n",
             smp_processor_id());

        rbase = gic_data_rdist_sgi_base();

        /* Configure SGIs/PPIs as non-secure Group-1 */
        for (i = 0; i < gic_data.ppi_nr + SGI_NR; i += 32)
                writel_relaxed(~0, rbase + GICR_IGROUPR0 + i / 8);

        gic_cpu_config(rbase, gic_data.ppi_nr + SGI_NR, dist_prio_irq);
        gic_redist_wait_for_rwp();

        /* initialise system registers */
        gic_cpu_sys_reg_init();
}

#ifdef CONFIG_SMP

/* Range selector of @mpidr, shifted into its ICC_SGI1R field position */
#define MPIDR_TO_SGI_RS(mpidr)        (MPIDR_RS(mpidr) << ICC_SGI1R_RS_SHIFT)
/* @mpidr with its low four bits cleared; CPUs sharing this value share a target list */
#define MPIDR_TO_SGI_CLUSTER_ID(mpidr)        ((mpidr) & ~0xFUL)

/*
 * cpuhp can no longer fail by the time gic_starting_cpu() runs, so reject
 * CPUs with a known-broken redistributor in this earlier callback instead.
 *
 * Returns -EINVAL if @cpu is in broken_rdists, 0 otherwise.
 */
static int gic_check_rdist(unsigned int cpu)
{
        return cpumask_test_cpu(cpu, &broken_rdists) ? -EINVAL : 0;
}

/*
 * cpuhp "starting" callback: bring up the GIC CPU interface and
 * redistributor for the CPU coming online, plus the ITS per-CPU state when
 * LPIs are usable. Always returns 0.
 */
static int gic_starting_cpu(unsigned int cpu)
{
        /* System register access must be enabled before gic_cpu_init() uses it */
        gic_cpu_sys_reg_enable();
        gic_cpu_init();

        if (gic_dist_supports_lpis())
                its_cpu_init();

        return 0;
}

/*
 * Build the SGI target list for the run of CPUs in @mask that starts at
 * *@base_cpu and shares @cluster_id.
 *
 * Each CPU contributes bit (mpidr & 0xf) to the returned list. The walk
 * ends when @mask is exhausted or when the next CPU in @mask belongs to a
 * different cluster.
 *
 * On return *@base_cpu holds the last CPU consumed, or, when a CPU of
 * another cluster was hit, that CPU's number minus one - presumably so the
 * caller's for_each_cpu() step lands on it next; confirm against
 * cpumask_next() semantics.
 */
static u16 gic_compute_target_list(int *base_cpu, const struct cpumask *mask,
                                   unsigned long cluster_id)
{
        int next_cpu, cpu = *base_cpu;
        unsigned long mpidr;
        u16 tlist = 0;

        mpidr = gic_cpu_to_affinity(cpu);

        while (cpu < nr_cpu_ids) {
                tlist |= 1 << (mpidr & 0xf);

                next_cpu = cpumask_next(cpu, mask);
                /* No more CPUs in the mask: cpu stays at the last one handled */
                if (next_cpu >= nr_cpu_ids)
                        goto out;
                cpu = next_cpu;

                mpidr = gic_cpu_to_affinity(cpu);

                /* Different cluster: step back so this CPU is not skipped */
                if (cluster_id != MPIDR_TO_SGI_CLUSTER_ID(mpidr)) {
                        cpu--;
                        goto out;
                }
        }
out:
        *base_cpu = cpu;
        return tlist;
}

/*
 * Extract affinity @level from @cluster_id and shift it into the matching
 * ICC_SGI1R_AFFINITY_<level>_SHIFT position. @level must be a literal, as
 * it is token-pasted into the shift constant's name.
 */
#define MPIDR_TO_SGI_AFFINITY(cluster_id, level) \
        (MPIDR_AFFINITY_LEVEL(cluster_id, level) \
                << ICC_SGI1R_AFFINITY_## level ##_SHIFT)

/*
 * Compose an ICC_SGI1R value from the cluster affinity (levels 3, 2, 1 and
 * range selector), the SGI number @irq and the target list @tlist, then
 * write it to trigger the SGI.
 */
static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq)
{
        u64 val;

        val = (MPIDR_TO_SGI_AFFINITY(cluster_id, 3)        |
               MPIDR_TO_SGI_AFFINITY(cluster_id, 2)        |
               irq << ICC_SGI1R_SGI_ID_SHIFT                |
               MPIDR_TO_SGI_AFFINITY(cluster_id, 1)        |
               MPIDR_TO_SGI_RS(cluster_id)                |
               tlist << ICC_SGI1R_TARGET_LIST_SHIFT);

        pr_devel("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val);
        gic_write_sgi1r(val);
}

/*
 * irq_chip .ipi_send_mask callback: send SGI @d->hwirq to every CPU in
 * @mask, issuing one SGI register write per cluster.
 */
static void gic_ipi_send_mask(struct irq_data *d, const struct cpumask *mask)
{
        int cpu;

        /* Only hwirqs 0-15 are valid here */
        if (WARN_ON(d->hwirq >= 16))
                return;

        /*
         * Ensure that stores to Normal memory are visible to the
         * other CPUs before issuing the IPI.
         */
        dsb(ishst);

        for_each_cpu(cpu, mask) {
                u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(gic_cpu_to_affinity(cpu));
                u16 tlist;

                /* Also updates cpu, moving the loop cursor past this cluster's CPUs */
                tlist = gic_compute_target_list(&cpu, mask, cluster_id);
                gic_send_sgi(cluster_id, tlist, d->hwirq);
        }

        /* Force the above writes to ICC_SGI1R_EL1 to be executed */
        isb();
}

/*
 * SMP setup: install the CPU hotplug callbacks and allocate the eight SGIs
 * used as IPIs, handing their Linux IRQ range to the arch IPI layer.
 */
static void __init gic_smp_init(void)
{
        /* param_count 1 with param[0] left at 0: allocation starts at SGI 0 */
        struct irq_fwspec sgi_fwspec = {
                .fwnode                = gic_data.fwnode,
                .param_count        = 1,
        };
        int base_sgi;

        /* Runs in the prepare stage, where a hotplug operation may still fail */
        cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN,
                                  "irqchip/arm/gicv3:checkrdist",
                                  gic_check_rdist, NULL);

        cpuhp_setup_state_nocalls(CPUHP_AP_IRQ_GIC_STARTING,
                                  "irqchip/arm/gicv3:starting",
                                  gic_starting_cpu, NULL);

        /* Register all 8 non-secure SGIs */
        base_sgi = irq_domain_alloc_irqs(gic_data.domain, 8, NUMA_NO_NODE, &sgi_fwspec);
        if (WARN_ON(base_sgi <= 0))
                return;

        set_smp_ipi_range(base_sgi, 8);
}

/*
 * irq_chip .irq_set_affinity callback: route the interrupt to a single CPU
 * picked from @mask_val.
 *
 * With @force the first CPU of @mask_val is used as is; otherwise any CPU
 * of @mask_val that is also online.
 *
 * Returns IRQ_SET_MASK_OK_DONE on success, -EINVAL if no suitable CPU
 * exists or if the interrupt lives in the redistributor.
 */
static int gic_set_affinity(struct irq_data *d, const struct cpumask *mask_val,
                            bool force)
{
        unsigned int cpu;
        u32 offset, index;
        void __iomem *reg;
        int enabled;
        u64 val;

        if (force)
                cpu = cpumask_first(mask_val);
        else
                cpu = cpumask_any_and(mask_val, cpu_online_mask);

        if (cpu >= nr_cpu_ids)
                return -EINVAL;

        /* Interrupts handled by the redistributor cannot be re-routed */
        if (gic_irq_in_rdist(d))
                return -EINVAL;

        /* If interrupt was enabled, disable it first */
        enabled = gic_peek_irq(d, GICD_ISENABLER);
        if (enabled)
                gic_mask_irq(d);

        /* Each IROUTER entry is 8 bytes wide */
        offset = convert_offset_index(d, GICD_IROUTER, &index);
        reg = gic_dist_base(d) + offset + (index * 8);
        val = gic_cpu_to_affinity(cpu);

        gic_write_irouter(val, reg);

        /*
         * If the interrupt was enabled, enable it again. Otherwise,
         * just wait for the distributor to have digested our changes.
         *
         * NOTE(review): no explicit wait is visible here for the
         * not-enabled case - confirm whether one is still needed.
         */
        if (enabled)
                gic_unmask_irq(d);

        irq_data_update_effective_affinity(d, cpumask_of(cpu));

        return IRQ_SET_MASK_OK_DONE;
}
#else
/* !CONFIG_SMP: no affinity or IPI callbacks, and no SMP setup to perform */
#define gic_set_affinity        NULL
#define gic_ipi_send_mask        NULL
#define gic_smp_init()                do { } while(0)
#endif

/*
 * irq_chip .irq_retrigger callback: re-raise the interrupt by setting its
 * pending state. Returns 1 when the state was set, 0 when that failed.
 */
static int gic_retrigger(struct irq_data *data)
{
        int err = gic_irq_set_irqchip_state(data, IRQCHIP_STATE_PENDING, true);

        return !err;
}

#ifdef CONFIG_CPU_PM
/*
 * CPU PM notifier.
 *
 * On CPU_PM_EXIT / CPU_PM_ENTER_FAILED: re-enable the redistributor (only
 * when distributor security is disabled) and redo the CPU interface
 * system register setup.
 * On CPU_PM_ENTER with distributor security disabled: turn off Group-1
 * and the redistributor.
 *
 * Always returns NOTIFY_OK.
 */
static int gic_cpu_pm_notifier(struct notifier_block *self,
                               unsigned long cmd, void *v)
{
        if (cmd == CPU_PM_EXIT || cmd == CPU_PM_ENTER_FAILED) {
                if (gic_dist_security_disabled())
                        gic_enable_redist(true);
                gic_cpu_sys_reg_enable();
                gic_cpu_sys_reg_init();
        } else if (cmd == CPU_PM_ENTER && gic_dist_security_disabled()) {
                gic_write_grpen1(0);
                gic_enable_redist(false);
        }
        return NOTIFY_OK;
}

/* Notifier block registered by gic_cpu_pm_init() */
static struct notifier_block gic_cpu_pm_notifier_block = {
        .notifier_call = gic_cpu_pm_notifier,
};

/* Hook the GIC into CPU PM notifications; the registration result is ignored. */
static void gic_cpu_pm_init(void)
{
        cpu_pm_register_notifier(&gic_cpu_pm_notifier_block);
}

#else
/* !CONFIG_CPU_PM: nothing to register */
static inline void gic_cpu_pm_init(void) { }
#endif /* CONFIG_CPU_PM */

/*
 * irq_chip selected by gic_irq_domain_map() when supports_deactivate_key
 * is off.
 */
static struct irq_chip gic_chip = {
        .name                        = "GICv3",
        .irq_mask                = gic_mask_irq,
        .irq_unmask                = gic_unmask_irq,
        .irq_eoi                = gic_eoi_irq,
        .irq_set_type                = gic_set_type,
        .irq_set_affinity        = gic_set_affinity,
        .irq_retrigger          = gic_retrigger,
        .irq_get_irqchip_state        = gic_irq_get_irqchip_state,
        .irq_set_irqchip_state        = gic_irq_set_irqchip_state,
        .irq_nmi_setup                = gic_irq_nmi_setup,
        .irq_nmi_teardown        = gic_irq_nmi_teardown,
        .ipi_send_mask                = gic_ipi_send_mask,
        .flags                        = IRQCHIP_SET_TYPE_MASKED |
                                  IRQCHIP_SKIP_SET_WAKE |
                                  IRQCHIP_MASK_ON_SUSPEND,
};

/*
 * irq_chip selected by gic_irq_domain_map() when supports_deactivate_key
 * is on. Differs from gic_chip in its mask and EOI callbacks and in
 * additionally providing .irq_set_vcpu_affinity.
 */
static struct irq_chip gic_eoimode1_chip = {
        .name                        = "GICv3",
        .irq_mask                = gic_eoimode1_mask_irq,
        .irq_unmask                = gic_unmask_irq,
        .irq_eoi                = gic_eoimode1_eoi_irq,
        .irq_set_type                = gic_set_type,
        .irq_set_affinity        = gic_set_affinity,
        .irq_retrigger          = gic_retrigger,
        .irq_get_irqchip_state        = gic_irq_get_irqchip_state,
        .irq_set_irqchip_state        = gic_irq_set_irqchip_state,
        .irq_set_vcpu_affinity        = gic_irq_set_vcpu_affinity,
        .irq_nmi_setup                = gic_irq_nmi_setup,
        .irq_nmi_teardown        = gic_irq_nmi_teardown,
        .ipi_send_mask                = gic_ipi_send_mask,
        .flags                        = IRQCHIP_SET_TYPE_MASKED |
                                  IRQCHIP_SKIP_SET_WAKE |
                                  IRQCHIP_MASK_ON_SUSPEND,
};

/*
 * Bind Linux interrupt @irq to hardware interrupt @hw in domain @d,
 * choosing the irq_chip and flow handler from the INTID range of @hw.
 *
 * Returns 0 on success, -EPERM for an INTID outside the known ranges or
 * for an LPI when LPIs are not supported.
 */
static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq,
                              irq_hw_number_t hw)
{
        struct irq_chip *chip = &gic_chip;
        struct irq_data *irqd = irq_desc_get_irq_data(irq_to_desc(irq));

        if (static_branch_likely(&supports_deactivate_key))
                chip = &gic_eoimode1_chip;

        switch (__get_intid_range(hw)) {
        /* Per-CPU interrupts */
        case SGI_RANGE:
        case PPI_RANGE:
        case EPPI_RANGE:
                irq_set_percpu_devid(irq);
                irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_percpu_devid_irq, NULL, NULL);
                break;

        /* Global interrupts, marked as delivered to a single target */
        case SPI_RANGE:
        case ESPI_RANGE:
                irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_fasteoi_irq, NULL, NULL);
                irq_set_probe(irq);
                irqd_set_single_target(irqd);
                break;

        case LPI_RANGE:
                if (!gic_dist_supports_lpis())
                        return -EPERM;
                irq_domain_set_info(d, irq, hw, chip, d->host_data,
                                    handle_fasteoi_irq, NULL, NULL);
                break;

        default:
                return -EPERM;
        }

        /* Prevents SW retriggers which mess up the ACK/EOI ordering */
        irqd_set_handle_enforce_irqctx(irqd);
        return 0;
}

/*
 * Translate a firmware interrupt specifier into a hardware INTID and a
 * trigger type.
 *
 * Three specifier forms are accepted:
 *  - a single cell below 16: an SGI, reported as edge-rising;
 *  - a devicetree specifier of at least three cells (class, number, flags);
 *  - a two-cell specifier on an irqchip fwnode (INTID, trigger type), where
 *    INTIDs below 16 are rejected.
 *
 * Returns 0 on success with *@hwirq and *@type set, -EINVAL otherwise.
 */
static int gic_irq_domain_translate(struct irq_domain *d,
                                    struct irq_fwspec *fwspec,
                                    unsigned long *hwirq,
                                    unsigned int *type)
{
        if (fwspec->param_count == 1 && fwspec->param[0] < 16) {
                *hwirq = fwspec->param[0];
                *type = IRQ_TYPE_EDGE_RISING;
                return 0;
        }

        if (is_of_node(fwspec->fwnode)) {
                if (fwspec->param_count < 3)
                        return -EINVAL;

                /* param[0] selects the interrupt class, param[1] the index within it */
                switch (fwspec->param[0]) {
                case 0:                        /* SPI */
                        *hwirq = fwspec->param[1] + 32;
                        break;
                case 1:                        /* PPI */
                        *hwirq = fwspec->param[1] + 16;
                        break;
                case 2:                        /* ESPI */
                        *hwirq = fwspec->param[1] + ESPI_BASE_INTID;
                        break;
                case 3:                        /* EPPI */
                        *hwirq = fwspec->param[1] + EPPI_BASE_INTID;
                        break;
                case GIC_IRQ_TYPE_LPI:        /* LPI */
                        *hwirq = fwspec->param[1];
                        break;
                case GIC_IRQ_TYPE_PARTITION:
                        /* Indices 0-15 map to PPIs, 16 and up to extended PPIs */
                        *hwirq = fwspec->param[1];
                        if (fwspec->param[1] >= 16)
                                *hwirq += EPPI_BASE_INTID - 16;
                        else
                                *hwirq += 16;
                        break;
                default:
                        return -EINVAL;
                }

                *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;

                /*
                 * Make it clear that broken DTs are... broken.
                 * Partitioned PPIs are an unfortunate exception.
                 */
                WARN_ON(*type == IRQ_TYPE_NONE &&
                        fwspec->param[0] != GIC_IRQ_TYPE_PARTITION);
                return 0;
        }

        if (is_fwnode_irqchip(fwspec->fwnode)) {
                if(fwspec->param_count != 2)
                        return -EINVAL;

                /* INTIDs below 16 are only reachable through the one-cell form */
                if (fwspec->param[0] < 16) {
                        pr_err(FW_BUG "Illegal GSI%d translation request\n",
                               fwspec->param[0]);
                        return -EINVAL;
                }

                *hwirq = fwspec->param[0];
                *type = fwspec->param[1];

                WARN_ON(*type == IRQ_TYPE_NONE);
                return 0;
        }

        return -EINVAL;
}

/*
 * irq_domain .alloc callback: translate the specifier passed in @arg, then
 * map @nr_irqs consecutive hardware interrupts starting at the translated
 * INTID onto Linux interrupts @virq, @virq + 1, ...
 *
 * Returns 0 on success or the first error from translation or mapping.
 */
static int gic_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
                                unsigned int nr_irqs, void *arg)
{
        struct irq_fwspec *fwspec = arg;
        unsigned int type = IRQ_TYPE_NONE;
        irq_hw_number_t hwirq;
        unsigned int n;
        int err;

        err = gic_irq_domain_translate(domain, fwspec, &hwirq, &type);
        if (err)
                return err;

        for (n = 0; n < nr_irqs; n++) {
                err = gic_irq_domain_map(domain, virq + n, hwirq + n);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * irq_domain .free callback: for each of the @nr_irqs interrupts starting
 * at @virq, remove the flow handler and reset the domain's irq_data.
 */
static void gic_irq_domain_free(struct irq_domain *domain, unsigned int virq,
                                unsigned int nr_irqs)
{
        unsigned int n;

        for (n = 0; n < nr_irqs; n++) {
                unsigned int irq = virq + n;
                struct irq_data *d = irq_domain_get_irq_data(domain, irq);

                irq_set_handler(irq, NULL);
                irq_domain_reset_irq_data(d);
        }
}

/*
 * Tell whether @fwspec describes a partitioned PPI: partition descriptors
 * must exist, the specifier must come from devicetree with a non-zero
 * fourth cell, and @hwirq must fall in the PPI or extended PPI range.
 */
static bool fwspec_is_partitioned_ppi(struct irq_fwspec *fwspec,
                                      irq_hw_number_t hwirq)
{
        if (!gic_data.ppi_descs || !is_of_node(fwspec->fwnode))
                return false;

        if (fwspec->param_count < 4 || !fwspec->param[3])
                return false;

        switch (__get_intid_range(hwirq)) {
        case PPI_RANGE:
        case EPPI_RANGE:
                return true;
        default:
                return false;
        }
}

/*
 * irq_domain .select callback, shared by the main GIC domain and the PPI
 * partition domains: decide whether domain @d is the one that should
 * handle @fwspec.
 *
 * Returns non-zero when @d matches, 0 otherwise.
 */
static int gic_irq_domain_select(struct irq_domain *d,
                                 struct irq_fwspec *fwspec,
                                 enum irq_domain_bus_token bus_token)
{
        unsigned int type, ret, ppi_idx;
        irq_hw_number_t hwirq;

        /* Not for us */
        if (fwspec->fwnode != d->fwnode)
                return 0;

        /* Handle pure domain searches */
        if (!fwspec->param_count)
                return d->bus_token == bus_token;

        /* If this is not DT, then we have a single domain */
        if (!is_of_node(fwspec->fwnode))
                return 1;

        /* An untranslatable specifier matches no domain */
        ret = gic_irq_domain_translate(d, fwspec, &hwirq, &type);
        if (WARN_ON_ONCE(ret))
                return 0;

        /* Everything but partitioned PPIs belongs to the main GIC domain */
        if (!fwspec_is_partitioned_ppi(fwspec, hwirq))
                return d == gic_data.domain;

        /*
         * If this is a PPI and we have a 4th (non-null) parameter,
         * then we need to match the partition domain.
         */
        ppi_idx = __gic_get_ppi_index(hwirq);
        return d == partition_get_domain(gic_data.ppi_descs[ppi_idx]);
}

/* Domain operations built from the gic_irq_domain_* callbacks */
static const struct irq_domain_ops gic_irq_domain_ops = {
        .translate = gic_irq_domain_translate,
        .alloc = gic_irq_domain_alloc,
        .free = gic_irq_domain_free,
        .select = gic_irq_domain_select,
};

/*
 * Translate a partitioned-PPI devicetree specifier into a hwirq of the
 * partition domain attached to the PPI it designates.
 *
 * @d:      domain the translation is performed for
 * @fwspec: specifier; param[3] holds the phandle of the partition node
 * @hwirq:  on success, the partition index from partition_translate_id()
 * @type:   on success, the trigger type taken from param[2]
 *
 * Returns 0 on success, -ENOMEM when no PPI partition descriptors exist,
 * -EINVAL when the phandle does not resolve, or the error reported by
 * gic_irq_domain_translate() / partition_translate_id().
 */
static int partition_domain_translate(struct irq_domain *d,
                                      struct irq_fwspec *fwspec,
                                      unsigned long *hwirq,
                                      unsigned int *type)
{
        unsigned long ppi_intid;
        struct device_node *np;
        unsigned int ppi_idx;
        int ret;

        if (!gic_data.ppi_descs)
                return -ENOMEM;

        /* Takes a reference on the node, released at out_put below */
        np = of_find_node_by_phandle(fwspec->param[3]);
        if (WARN_ON(!np))
                return -EINVAL;

        /*
         * Propagate the failure: reporting success here would leave
         * *hwirq unset for the caller.
         */
        ret = gic_irq_domain_translate(d, fwspec, &ppi_intid, type);
        if (WARN_ON_ONCE(ret))
                goto out_put;

        ppi_idx = __gic_get_ppi_index(ppi_intid);
        ret = partition_translate_id(gic_data.ppi_descs[ppi_idx],
                                     of_node_to_fwnode(np));
        if (ret < 0)
                goto out_put;

        *hwirq = ret;
        *type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK;
        ret = 0;

out_put:
        /* The node was only needed for the lookup; drop our reference */
        of_node_put(np);
        return ret;
}

/*
 * Domain operations using the partition-specific translate callback; the
 * select callback is shared with gic_irq_domain_ops.
 */
static const struct irq_domain_ops partition_domain_ops = {
        .translate = partition_domain_translate,
        .select = gic_irq_domain_select,
};

/*
 * Quirk hook: flag the GICR_WAKER workaround for MSM8996 on the
 * gic_chip_data passed in @data. Always reports the quirk as applied.
 */
static bool gic_enable_quirk_msm8996(void *data)
{
        struct gic_chip_data *gic = data;

        gic->flags |= FLAGS_WORKAROUND_GICR_WAKER_MSM8996;
        return true;
}

/*
 * Quirk hook: flag the Cavium erratum 38539 workaround on the
 * gic_chip_data passed in @data. Always reports the quirk as applied.
 */
static bool gic_enable_quirk_cavium_38539(void *data)
{
        struct gic_chip_data *gic = data;

        gic->flags |= FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539;
        return true;
}

/*
 * Quirk hook for HIP06/HIP07: if the cached GICD_TYPER claims ESPI
 * support, strip that claim. Returns true only when something was fixed up.
 */
static bool gic_enable_quirk_hip06_07(void *data)
{
        struct gic_chip_data *gic = data;

        /*
         * HIP06 GICD_IIDR clashes with GIC-600 product number (despite
         * not being an actual ARM implementation). The saving grace is
         * that GIC-600 doesn't have ESPI, so nothing to do in that case.
         * HIP07 doesn't even have a proper IIDR, and still pretends to
         * have ESPI. In both cases, put them right.
         */
        if (!(gic->rdists.gicd_typer & GICD_TYPER_ESPI))
                return false;

        /* Clear the ESPI bit together with the RES0 field next to it */
        gic->rdists.gicd_typer &= ~GENMASK(9, 8);
        return true;
}

/* Bits 45:44 of a physical address, used below as the T241 chip number */
#define T241_CHIPN_MASK                GENMASK_ULL(45, 44)
/* Offset added to the distributor physical base to reach a GICD alias */
#define T241_CHIP_GICDA_OFFSET        0x1580000
/* Value of arm_smccc_get_soc_id_version() that identifies the T241 */
#define SMCCC_SOC_ID_T241        0x036b0241

static bool gic_enable_quirk_nvidia_t241(void *data)
{
        s32 soc_id = arm_smccc_get_soc_id_version();
        unsigned long chip_bmask = 0;
        phys_addr_t phys;
        u32 i;

        /* Check JEP106 code for NVIDIA T241 chip (036b:0241) */
        if ((soc_id < 0) || (soc_id != SMCCC_SOC_ID_T241))
                return false;

        /* Find the chips based on GICR regions PHYS addr */
        for (i = 0; i < gic_data.nr_redist_regions; i++) {
                chip_bmask |= BIT(FIELD_GET(T241_CHIPN_MASK,
                                  (u64)gic_data.redist_regions[i].phys_base));
        }

        if (hweight32(chip_bmask) < 3)
                return false;

        /* Setup GICD alias regions */
        for (i = 0; i < ARRAY_SIZE(t241_dist_base_alias); i++) {
                if (chip_bmask & BIT(i)) {
                        phys = gic_data.dist_phys_base + T241_CHIP_GICDA_OFFSET;
                        phys |= FIELD_PREP(T241_CHIPN_MASK, i);
                        t241_dist_base_alias[i] = ioremap(phys, SZ_64K);
                        WARN_ON_ONCE(!t241_dist_base_alias[i]);
                }
        }
        static_branch_enable(&gic_nvidia_t241_erratum);
        return true;
}

/*
 * Quirk hook: mark the GIC instance passed in @data as affected by
 * ASR erratum 8601001. Always reports the quirk as applied.
 */
static bool gic_enable_quirk_asr8601(void *data)
{
        struct gic_chip_data *gic = data;

        gic->flags = gic->flags | FLAGS_WORKAROUND_ASR_ERRATUM_8601001;
        return true;
}

/*
 * Quirk hook: enable the gic_arm64_2941627_erratum static key.
 * @data is not used. Always reports the quirk as applied.
 */
static bool gic_enable_quirk_arm64_2941627(void *data)
{
        static_branch_enable(&gic_arm64_2941627_erratum);
        return true;
}

/*
 * Quirk hook: on machines compatible with "rockchip,rk3399", flag the GIC
 * instance passed in @data with FLAGS_WORKAROUND_INSECURE.
 * Returns true only when the machine matched.
 */
static bool gic_enable_quirk_rk3399(void *data)
{
        struct gic_chip_data *gic = data;

        if (!of_machine_is_compatible("rockchip,rk3399"))
                return false;

        gic->flags |= FLAGS_WORKAROUND_INSECURE;
        return true;
}

/*
 * Quirk hook: force the redistributors of the GIC instance passed in
 * @data to be treated as non-shareable. Always reports the quirk as applied.
 */
static bool rd_set_non_coherent(void *data)
{
        struct gic_chip_data *gic = data;

        gic->rdists.flags = gic->rdists.flags | RDIST_FLAGS_FORCE_NON_SHAREABLE;
        return true;
}

/*
 * Table of known GICv3 quirks. It is passed to gic_enable_quirks()
 * together with the GICD_IIDR value, and to gic_enable_of_quirks()
 * together with the DT node. Entries carry either an .iidr/.mask pair or
 * a .compatible/.property string, presumably as the respective match key
 * (confirm against those helpers). Each entry's .init hook applies the
 * workaround. The table ends with an empty sentinel entry.
 */
static const struct gic_quirk gic_quirks[] = {
        {
                .desc        = "GICv3: Qualcomm MSM8996 broken firmware",
                .compatible = "qcom,msm8996-gic-v3",
                .init        = gic_enable_quirk_msm8996,
        },
        {
                .desc        = "GICv3: ASR erratum 8601001",
                .compatible = "asr,asr8601-gic-v3",
                .init        = gic_enable_quirk_asr8601,
        },
        {
                .desc        = "GICv3: HIP06 erratum 161010803",
                .iidr        = 0x0204043b,
                .mask        = 0xffffffff,
                .init        = gic_enable_quirk_hip06_07,
        },
        {
                .desc        = "GICv3: HIP07 erratum 161010803",
                .iidr        = 0x00000000,
                .mask        = 0xffffffff,
                .init        = gic_enable_quirk_hip06_07,
        },
        {
                /*
                 * Reserved register accesses generate a Synchronous
                 * External Abort. This erratum applies to:
                 * - ThunderX: CN88xx
                 * - OCTEON TX: CN83xx, CN81xx
                 * - OCTEON TX2: CN93xx, CN96xx, CN98xx, CNF95xx*
                 */
                .desc        = "GICv3: Cavium erratum 38539",
                .iidr        = 0xa000034c,
                .mask        = 0xe8f00fff,
                .init        = gic_enable_quirk_cavium_38539,
        },
        {
                .desc        = "GICv3: NVIDIA erratum T241-FABRIC-4",
                .iidr        = 0x0402043b,
                .mask        = 0xffffffff,
                .init        = gic_enable_quirk_nvidia_t241,
        },
        {
                /*
                 * GIC-700: 2941627 workaround - IP variant [0,1]
                 */
                .desc        = "GICv3: ARM64 erratum 2941627",
                .iidr        = 0x0400043b,
                .mask        = 0xff0e0fff,
                .init        = gic_enable_quirk_arm64_2941627,
        },
        {
                /*
                 * GIC-700: 2941627 workaround - IP variant [2]
                 */
                .desc        = "GICv3: ARM64 erratum 2941627",
                .iidr        = 0x0402043b,
                .mask        = 0xff0f0fff,
                .init        = gic_enable_quirk_arm64_2941627,
        },
        {
                .desc   = "GICv3: non-coherent attribute",
                .property = "dma-noncoherent",
                .init   = rd_set_non_coherent,
        },
        {
                .desc        = "GICv3: Insecure RK3399 integration",
                .iidr        = 0x0000043b,
                .mask        = 0xff000fff,
                .init        = gic_enable_quirk_rk3399,
        },
        {
                /* sentinel */
        }
};

static void gic_enable_nmi_support(void)
{
        int i;

        if (!gic_prio_masking_enabled() || nmi_support_forbidden)
                return;

        rdist_nmi_refs = kcalloc(gic_data.ppi_nr + SGI_NR,
                                 sizeof(*rdist_nmi_refs), GFP_KERNEL);
        if (!rdist_nmi_refs)
                return;

        for (i = 0; i < gic_data.ppi_nr + SGI_NR; i++)
                refcount_set(&rdist_nmi_refs[i], 0);

        pr_info("Pseudo-NMIs enabled using %s ICC_PMR_EL1 synchronisation\n",
                gic_has_relaxed_pmr_sync() ? "relaxed" : "forced");

        static_branch_enable(&supports_pseudo_nmis);

        if (static_branch_likely(&supports_deactivate_key))
                gic_eoimode1_chip.flags |= IRQCHIP_SUPPORTS_NMI;
        else
                gic_chip.flags |= IRQCHIP_SUPPORTS_NMI;
}

/*
 * Common initialisation shared by the DT and ACPI probe paths, run once
 * the distributor and redistributor regions have been mapped.
 *
 * Records the bases in gic_data, applies the IIDR-matched quirks, creates
 * the irq domain and the per-CPU redistributor data, installs the root
 * IRQ handler, initialises the distributor and CPU interface, and finally
 * brings up either the ITS (when the distributor supports LPIs) or GICv2m
 * (when configured).
 *
 * Returns 0 on success. The only failure handled here is the irq domain
 * or the per-CPU allocation failing: -ENOMEM is returned after releasing
 * whichever of the two was obtained.
 */
static int __init gic_init_bases(phys_addr_t dist_phys_base,
                                 void __iomem *dist_base,
                                 struct redist_region *rdist_regs,
                                 u32 nr_redist_regions,
                                 u64 redist_stride,
                                 struct fwnode_handle *handle)
{
        u32 typer;
        int err;

        /* Split EOI/Deactivate is only kept when hyp mode is available */
        if (!is_hyp_mode_available())
                static_branch_disable(&supports_deactivate_key);

        if (static_branch_likely(&supports_deactivate_key))
                pr_info("GIC: Using split EOI/Deactivate mode\n");

        gic_data.fwnode = handle;
        gic_data.dist_phys_base = dist_phys_base;
        gic_data.dist_base = dist_base;
        gic_data.redist_regions = rdist_regs;
        gic_data.nr_redist_regions = nr_redist_regions;
        gic_data.redist_stride = redist_stride;

        /*
         * Find out how many interrupts are supported.
         */
        typer = readl_relaxed(gic_data.dist_base + GICD_TYPER);
        gic_data.rdists.gicd_typer = typer;

        /*
         * Quirks run after GICD_TYPER has been cached: a quirk hook may
         * edit the cached copy (see gic_enable_quirk_hip06_07()).
         */
        gic_enable_quirks(readl_relaxed(gic_data.dist_base + GICD_IIDR),
                          gic_quirks, &gic_data);

        pr_info("%d SPIs implemented\n", GIC_LINE_NR - 32);
        pr_info("%d Extended SPIs implemented\n", GIC_ESPI_NR);

        /*
         * ThunderX1 explodes on reading GICD_TYPER2, in violation of the
         * architecture spec (which says that reserved registers are RES0).
         */
        if (!(gic_data.flags & FLAGS_WORKAROUND_CAVIUM_ERRATUM_38539))
                gic_data.rdists.gicd_typer2 = readl_relaxed(gic_data.dist_base + GICD_TYPER2);

        /* Both results are checked together further down */
        gic_data.domain = irq_domain_create_tree(handle, &gic_irq_domain_ops,
                                                 &gic_data);
        gic_data.rdists.rdist = alloc_percpu(typeof(*gic_data.rdists.rdist));
        if (!static_branch_unlikely(&gic_nvidia_t241_erratum)) {
                /*
                 * These GICv4.x features are only advertised when erratum
                 * T241-FABRIC-4 is not in effect; with the erratum they
                 * are left unset, i.e. disabled.
                 */
                gic_data.rdists.has_rvpeid = true;
                gic_data.rdists.has_vlpis = true;
                gic_data.rdists.has_direct_lpi = true;
                gic_data.rdists.has_vpend_valid_dirty = true;
        }

        if (WARN_ON(!gic_data.domain) || WARN_ON(!gic_data.rdists.rdist)) {
                err = -ENOMEM;
                goto out_free;
        }

        irq_domain_update_bus_token(gic_data.domain, DOMAIN_BUS_WIRED);

        gic_data.has_rss = !!(typer & GICD_TYPER_RSS);

        /* An MBI initialisation failure is logged but is not fatal */
        if (typer & GICD_TYPER_MBIS) {
                err = mbi_init(handle, gic_data.domain);
                if (err)
                        pr_err("Failed to initialize MBIs\n");
        }

        set_handle_irq(gic_handle_irq);

        gic_update_rdist_properties();

        gic_cpu_sys_reg_enable();
        gic_prio_init();
        gic_dist_init();
        gic_cpu_init();
        gic_enable_nmi_support();
        gic_smp_init();
        gic_cpu_pm_init();

        if (gic_dist_supports_lpis()) {
                its_init(handle, &gic_data.rdists, gic_data.domain, dist_prio_irq);
                its_cpu_init();
                its_lpi_memreserve_init();
        } else {
                if (IS_ENABLED(CONFIG_ARM_GIC_V2M))
                        gicv2m_init(handle, gic_data.domain);
        }

        return 0;

out_free:
        /* Only one of the two allocations may have succeeded */
        if (gic_data.domain)
                irq_domain_remove(gic_data.domain);
        free_percpu(gic_data.rdists.rdist);
        return err;
}

/*
 * Check that the distributor mapped at @dist_base reports a GICv3 or
 * GICv4 architecture revision in GICD_PIDR2.
 *
 * Returns 0 when it does, -ENODEV otherwise.
 */
static int __init gic_validate_dist_version(void __iomem *dist_base)
{
        u32 arch_rev = readl_relaxed(dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;

        if (arch_rev == GIC_PIDR2_ARCH_GICv3)
                return 0;

        if (arch_rev == GIC_PIDR2_ARCH_GICv4)
                return 0;

        return -ENODEV;
}

/*
 * Create all possible partitions at boot time.
 *
 * Looks for a "ppi-partitions" child of @gic_node. If present, every child
 * of that node becomes one partition whose CPU mask is built from the
 * phandles in its "affinity" property. A partition descriptor is then
 * created for each PPI and stored in gic_data.ppi_descs[].
 *
 * Failures are reported through WARN_ON() and the affected CPU/PPI is
 * skipped; the function itself returns nothing.
 */
static void __init gic_populate_ppi_partitions(struct device_node *gic_node)
{
        struct device_node *parts_node, *child_part;
        int part_idx = 0, i;
        int nr_parts;
        struct partition_affinity *parts;

        parts_node = of_get_child_by_name(gic_node, "ppi-partitions");
        if (!parts_node)
                return;

        gic_data.ppi_descs = kcalloc(gic_data.ppi_nr, sizeof(*gic_data.ppi_descs), GFP_KERNEL);
        if (!gic_data.ppi_descs)
                goto out_put_node;

        nr_parts = of_get_child_count(parts_node);

        if (!nr_parts)
                goto out_put_node;

        /*
         * NOTE(review): parts is handed to partition_create_desc() below
         * and never freed here - presumably the descriptors keep using it;
         * confirm.
         */
        parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL);
        if (WARN_ON(!parts))
                goto out_put_node;

        /* One partition_affinity entry per child node, in iteration order */
        for_each_child_of_node(parts_node, child_part) {
                struct partition_affinity *part;
                int n;

                part = &parts[part_idx];

                part->partition_id = of_node_to_fwnode(child_part);

                pr_info("GIC: PPI partition %pOFn[%d] { ",
                        child_part, part_idx);

                n = of_property_count_elems_of_size(child_part, "affinity",
                                                    sizeof(u32));
                /* A non-positive count only warns; the loop below is then skipped */
                WARN_ON(n <= 0);

                for (i = 0; i < n; i++) {
                        int err, cpu;
                        u32 cpu_phandle;
                        struct device_node *cpu_node;

                        err = of_property_read_u32_index(child_part, "affinity",
                                                         i, &cpu_phandle);
                        if (WARN_ON(err))
                                continue;

                        cpu_node = of_find_node_by_phandle(cpu_phandle);
                        if (WARN_ON(!cpu_node))
                                continue;

                        cpu = of_cpu_node_to_id(cpu_node);
                        if (WARN_ON(cpu < 0)) {
                                of_node_put(cpu_node);
                                continue;
                        }

                        pr_cont("%pOF[%d] ", cpu_node, cpu);

                        cpumask_set_cpu(cpu, &part->mask);
                        /* cpu_node is not needed once the CPU id is recorded */
                        of_node_put(cpu_node);
                }

                pr_cont("}\n");
                part_idx++;
        }

        /* Map every PPI and attach a partition descriptor to it */
        for (i = 0; i < gic_data.ppi_nr; i++) {
                unsigned int irq;
                struct partition_desc *desc;
                struct irq_fwspec ppi_fwspec = {
                        .fwnode                = gic_data.fwnode,
                        .param_count        = 3,
                        .param                = {
                                [0]        = GIC_IRQ_TYPE_PARTITION,
                                [1]        = i,
                                [2]        = IRQ_TYPE_NONE,
                        },
                };

                irq = irq_create_fwspec_mapping(&ppi_fwspec);
                if (WARN_ON(!irq))
                        continue;
                desc = partition_create_desc(gic_data.fwnode, parts, nr_parts,
                                             irq, &partition_domain_ops);
                if (WARN_ON(!desc))
                        continue;

                gic_data.ppi_descs[i] = desc;
        }

out_put_node:
        /* Drop the reference obtained from of_get_child_by_name() */
        of_node_put(parts_node);
}

/*
 * Fill in gic_v3_kvm_info from the device tree and hand it to
 * vgic_set_kvm_info().
 *
 * Nothing is registered when the maintenance interrupt (interrupt index 0
 * of @node) cannot be mapped. The vcpu resource is only recorded when the
 * corresponding "reg" entry can be resolved.
 */
static void __init gic_of_setup_kvm_info(struct device_node *node, u32 nr_redist_regions)
{
        struct resource vcpu_res;

        gic_v3_kvm_info.type = GIC_V3;

        gic_v3_kvm_info.maint_irq = irq_of_parse_and_map(node, 0);
        if (gic_v3_kvm_info.maint_irq == 0)
                return;

        /* Also skip GICD, GICC, GICH */
        if (of_address_to_resource(node, nr_redist_regions + 3, &vcpu_res) == 0)
                gic_v3_kvm_info.vcpu = vcpu_res;

        gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis;
        gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid;
        vgic_set_kvm_info(&gic_v3_kvm_info);
}

/*
 * Claim the memory region [@base, @base + @size) under @name.
 * A failed claim is not an error: it is only reported, once, as a
 * firmware bug.
 */
static void gic_request_region(resource_size_t base, resource_size_t size,
                               const char *name)
{
        if (request_mem_region(base, size, name))
                return;

        pr_warn_once(FW_BUG "%s region %pa has overlapping address\n",
                     name, &base);
}

/*
 * Resolve "reg" entry @idx of @node into @res, claim the region under
 * @name and map it.
 *
 * Returns the mapping, or an ERR_PTR: the of_address_to_resource() error
 * when the entry cannot be resolved, -ENOMEM when the mapping fails.
 */
static void __iomem *gic_of_iomap(struct device_node *node, int idx,
                                  const char *name, struct resource *res)
{
        void __iomem *regs;
        int err;

        err = of_address_to_resource(node, idx, res);
        if (err)
                return IOMEM_ERR_PTR(err);

        gic_request_region(res->start, resource_size(res), name);

        regs = of_iomap(node, idx);
        if (!regs)
                return IOMEM_ERR_PTR(-ENOMEM);

        return regs;
}

/*
 * Device-tree probe entry point for "arm,gic-v3".
 *
 * Maps the distributor ("reg" entry 0) and the redistributor regions
 * (entries 1..n, n taken from "#redistributor-regions", default 1),
 * applies DT-matched quirks, runs the common initialisation and then sets
 * up PPI partitions and, when split EOI/Deactivate is in use, the KVM info.
 *
 * Returns 0 on success or a negative error code; on failure every region
 * mapped so far is unmapped again.
 */
static int __init gic_of_init(struct device_node *node, struct device_node *parent)
{
        struct redist_region *regions;
        void __iomem *gicd;
        phys_addr_t gicd_phys;
        struct resource res;
        u64 stride;
        u32 nr_regions;
        int err, i;

        gicd = gic_of_iomap(node, 0, "GICD", &res);
        if (IS_ERR(gicd)) {
                pr_err("%pOF: unable to map gic dist registers\n", node);
                return PTR_ERR(gicd);
        }

        gicd_phys = res.start;

        err = gic_validate_dist_version(gicd);
        if (err) {
                pr_err("%pOF: no distributor detected, giving up\n", node);
                goto out_unmap_dist;
        }

        if (of_property_read_u32(node, "#redistributor-regions", &nr_regions))
                nr_regions = 1;

        regions = kcalloc(nr_regions, sizeof(*regions), GFP_KERNEL);
        if (!regions) {
                err = -ENOMEM;
                goto out_unmap_dist;
        }

        for (i = 0; i < nr_regions; i++) {
                void __iomem *gicr = gic_of_iomap(node, 1 + i, "GICR", &res);

                /* Stored even on failure; the cleanup loop filters ERR_PTRs */
                regions[i].redist_base = gicr;
                if (IS_ERR(gicr)) {
                        pr_err("%pOF: couldn't map region %d\n", node, i);
                        err = -ENODEV;
                        goto out_unmap_rdist;
                }
                regions[i].phys_base = res.start;
        }

        if (of_property_read_u64(node, "redistributor-stride", &stride))
                stride = 0;

        gic_enable_of_quirks(node, gic_quirks, &gic_data);

        err = gic_init_bases(gicd_phys, gicd, regions, nr_regions, stride,
                             &node->fwnode);
        if (err)
                goto out_unmap_rdist;

        gic_populate_ppi_partitions(node);

        if (static_branch_likely(&supports_deactivate_key))
                gic_of_setup_kvm_info(node, nr_regions);
        return 0;

out_unmap_rdist:
        for (i = 0; i < nr_regions; i++) {
                void __iomem *gicr = regions[i].redist_base;

                /* Skip slots never reached (NULL) and the failed one (ERR_PTR) */
                if (!gicr || IS_ERR(gicr))
                        continue;
                iounmap(gicr);
        }
        kfree(regions);
out_unmap_dist:
        iounmap(gicd);
        return err;
}

IRQCHIP_DECLARE(gic_v3, "arm,gic-v3", gic_of_init);

#ifdef CONFIG_ACPI
/* Init-time scratch state gathered from the MADT during the ACPI probe */
static struct
{
        /* Mapping of the distributor, set up in gic_acpi_init() */
        void __iomem *dist_base;
        /* Array of nr_redist_regions entries, filled by gic_acpi_register_redist() */
        struct redist_region *redist_regs;
        /* Region count computed by gic_acpi_count_gicr_regions() */
        u32 nr_redist_regions;
        /* True when redistributors are described by GICC entries rather than GICR ones */
        bool single_redist;
        /* Number of enabled GICC entries that carry a GICR base address */
        int enabled_rdists;
        /* Maintenance interrupt, its trigger mode and the GICV base from the GICC entries */
        u32 maint_irq;
        int maint_irq_mode;
        phys_addr_t vcpu_base;
} acpi_data __initdata;

/*
 * Record one redistributor region (physical base plus its mapping) in the
 * next free slot of acpi_data.redist_regs[].
 *
 * The array was sized from a separate counting pass over the MADT. Should
 * the collection pass yield more regions than were counted, the surplus
 * region is rejected with a warning and unmapped instead of being written
 * past the end of the array.
 */
static void __init
gic_acpi_register_redist(phys_addr_t phys_base, void __iomem *redist_base)
{
        static u32 count;

        if (WARN_ON(count >= acpi_data.nr_redist_regions)) {
                iounmap(redist_base);
                return;
        }

        acpi_data.redist_regs[count].phys_base = phys_base;
        acpi_data.redist_regs[count].redist_base = redist_base;
        acpi_data.redist_regs[count].single_redist = acpi_data.single_redist;
        count++;
}

/*
 * MADT handler for GICR (generic redistributor) subtables: map the region
 * described by the entry, note a non-coherent redistributor (MADT
 * revision 7 and later), claim the region and register it.
 *
 * Returns 0 on success, -ENOMEM when the region cannot be mapped.
 */
static int __init
gic_acpi_parse_madt_redist(union acpi_subtable_headers *header,
                           const unsigned long end)
{
        struct acpi_madt_generic_redistributor *gicr;
        void __iomem *regs;

        gicr = (struct acpi_madt_generic_redistributor *)header;

        regs = ioremap(gicr->base_address, gicr->length);
        if (!regs) {
                pr_err("Couldn't map GICR region @%llx\n", gicr->base_address);
                return -ENOMEM;
        }

        if (acpi_get_madt_revision() >= 7) {
                if (gicr->flags & ACPI_MADT_GICR_NON_COHERENT)
                        gic_data.rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE;
        }

        gic_request_region(gicr->base_address, gicr->length, "GICR");
        gic_acpi_register_redist(gicr->base_address, regs);

        return 0;
}

/*
 * MADT handler for GICC (generic interrupt) subtables, used when each CPU
 * describes its own redistributor.
 *
 * Entries that are neither enabled nor online capable are ignored.
 * Online-capable but disabled entries get their CPU (when it can be
 * resolved) added to broken_rdists. Enabled entries have their
 * redistributor mapped, claimed and registered; the mapping covers four
 * 64K frames on GICv4 and two otherwise.
 *
 * Returns 0 on success or when the entry is skipped, -ENOMEM when the
 * redistributor cannot be mapped.
 */
static int __init
gic_acpi_parse_madt_gicc(union acpi_subtable_headers *header,
                         const unsigned long end)
{
        struct acpi_madt_generic_interrupt *gicc =
                                (struct acpi_madt_generic_interrupt *)header;
        u32 reg = readl_relaxed(acpi_data.dist_base + GICD_PIDR2) & GIC_PIDR2_ARCH_MASK;
        u32 size = reg == GIC_PIDR2_ARCH_GICv4 ? SZ_64K * 4 : SZ_64K * 2;
        void __iomem *redist_base;

        /* Neither enabled or online capable means it doesn't exist, skip it */
        if (!(gicc->flags & (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE)))
                return 0;

        /*
         * Capable but disabled CPUs can be brought online later. What about
         * the redistributor? ACPI doesn't want to say!
         * Virtual hotplug systems can use the MADT's "always-on" GICR entries.
         * Otherwise, prevent such CPUs from being brought online.
         */
        if (!(gicc->flags & ACPI_MADT_ENABLED)) {
                int cpu = get_cpu_for_acpi_id(gicc->uid);

                /* cpu may be negative when the lookup fails, so print it signed */
                pr_warn("CPU %d's redistributor is inaccessible: this CPU can't be brought online\n", cpu);
                if (cpu >= 0)
                        cpumask_set_cpu(cpu, &broken_rdists);
                return 0;
        }

        redist_base = ioremap(gicc->gicr_base_address, size);
        if (!redist_base)
                return -ENOMEM;
        gic_request_region(gicc->gicr_base_address, size, "GICR");

        if (acpi_get_madt_revision() >= 7 &&
            (gicc->flags & ACPI_MADT_GICC_NON_COHERENT))
                gic_data.rdists.flags |= RDIST_FLAGS_FORCE_NON_SHAREABLE;

        gic_acpi_register_redist(gicc->gicr_base_address, redist_base);
        return 0;
}

/*
 * Walk the MADT and register every redistributor region, using the GICC
 * entries when acpi_data.single_redist is set and the GICR entries
 * otherwise.
 *
 * Returns 0 when at least one entry was parsed, -ENODEV otherwise.
 */
static int __init gic_acpi_collect_gicr_base(void)
{
        acpi_tbl_entry_handler handler = gic_acpi_parse_madt_redist;
        enum acpi_madt_type subtable = ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR;

        if (acpi_data.single_redist) {
                subtable = ACPI_MADT_TYPE_GENERIC_INTERRUPT;
                handler = gic_acpi_parse_madt_gicc;
        }

        /* Collect redistributor base addresses in GICR entries */
        if (acpi_table_parse_madt(subtable, handler, 0) <= 0) {
                pr_info("No valid GICR entries exist\n");
                return -ENODEV;
        }

        return 0;
}

/*
 * MADT handler used only to count GICR subtables: it accepts every entry
 * unconditionally, so the caller's return value is the number of entries.
 */
static int __init gic_acpi_match_gicr(union acpi_subtable_headers *header,
                                  const unsigned long end)
{
        /* Subtable presence means that redist exists, that's it */
        return 0;
}

/*
 * MADT handler used while counting redistributors described through GICC
 * entries: bumps acpi_data.enabled_rdists for each usable entry.
 * Always returns 0.
 */
static int __init gic_acpi_match_gicc(union acpi_subtable_headers *header,
                                      const unsigned long end)
{
        struct acpi_madt_generic_interrupt *gicc;

        gicc = (struct acpi_madt_generic_interrupt *)header;

        /*
         * If GICC is enabled and has valid gicr base address, then it means
         * GICR base is presented via GICC. The redistributor is only known to
         * be accessible if the GICC is marked as enabled. If this bit is not
         * set, we'd need to add the redistributor at runtime, which isn't
         * supported.
         */
        if (!(gicc->flags & ACPI_MADT_ENABLED))
                return 0;

        if (!gicc->gicr_base_address)
                return 0;

        acpi_data.enabled_rdists++;
        return 0;
}

/*
 * Work out how many redistributor regions the MADT describes and record
 * in acpi_data.single_redist which kind of subtable describes them.
 *
 * Returns the number of GICR entries if there are any; otherwise the
 * number of usable GICC entries, or the non-positive result of the GICC
 * walk when that finds nothing.
 */
static int __init gic_acpi_count_gicr_regions(void)
{
        int nr;

        /*
         * Count how many redistributor regions we have. It is not allowed
         * to mix redistributor description, GICR and GICC subtables have to be
         * mutually exclusive.
         */
        nr = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_REDISTRIBUTOR,
                                   gic_acpi_match_gicr, 0);
        if (nr > 0) {
                acpi_data.single_redist = false;
                return nr;
        }

        nr = acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
                                   gic_acpi_match_gicc, 0);
        if (nr <= 0)
                return nr;

        acpi_data.single_redist = true;
        return acpi_data.enabled_rdists;
}

/*
 * Decide whether a MADT distributor entry is handled by this driver: the
 * entry's version must equal the probe entry's driver_data and the MADT
 * must describe at least one redistributor region. On success the region
 * count is stored in acpi_data.nr_redist_regions.
 */
static bool __init acpi_validate_gic_table(struct acpi_subtable_header *header,
                                           struct acpi_probe_entry *ape)
{
        struct acpi_madt_generic_distributor *gicd =
                        (struct acpi_madt_generic_distributor *)header;
        int nr_regions;

        if (gicd->version != ape->driver_data)
                return false;

        /* We need to do that exercise anyway, the sooner the better */
        nr_regions = gic_acpi_count_gicr_regions();
        if (nr_regions > 0) {
                acpi_data.nr_redist_regions = nr_regions;
                return true;
        }

        return false;
}

/*
 * MADT handler collecting the virtualisation details from GICC entries.
 *
 * The first usable entry seeds acpi_data with the maintenance interrupt,
 * its trigger mode and the GICV base. Every later entry must carry the
 * same values.
 *
 * Returns 0 for skipped or consistent entries, -EINVAL on a mismatch.
 */
static int __init gic_acpi_parse_virt_madt_gicc(union acpi_subtable_headers *header,
                                                const unsigned long end)
{
        struct acpi_madt_generic_interrupt *gicc =
                (struct acpi_madt_generic_interrupt *)header;
        static int first_madt = true;
        int mode;

        if (!(gicc->flags &
              (ACPI_MADT_ENABLED | ACPI_MADT_GICC_ONLINE_CAPABLE)))
                return 0;

        if (gicc->flags & ACPI_MADT_VGIC_IRQ_MODE)
                mode = ACPI_EDGE_SENSITIVE;
        else
                mode = ACPI_LEVEL_SENSITIVE;

        if (!first_madt) {
                /*
                 * The maintenance interrupt and GICV should be the same
                 * for every CPU
                 */
                if (acpi_data.maint_irq != gicc->vgic_interrupt ||
                    acpi_data.maint_irq_mode != mode ||
                    acpi_data.vcpu_base != gicc->gicv_base_address)
                        return -EINVAL;

                return 0;
        }

        first_madt = false;

        acpi_data.maint_irq = gicc->vgic_interrupt;
        acpi_data.maint_irq_mode = mode;
        acpi_data.vcpu_base = gicc->gicv_base_address;

        return 0;
}

/*
 * Run gic_acpi_parse_virt_madt_gicc() over every GICC entry of the MADT.
 * Returns true when the walk reports a positive entry count.
 */
static bool __init gic_acpi_collect_virt_info(void)
{
        return acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_INTERRUPT,
                                     gic_acpi_parse_virt_madt_gicc, 0) > 0;
}

/* Size of the distributor region mapped by the ACPI probe */
#define ACPI_GICV3_DIST_MEM_SIZE (SZ_64K)
/* NOTE(review): not referenced in the visible part of this file; confirm use */
#define ACPI_GICV2_VCTRL_MEM_SIZE        (SZ_4K)
/* Size of the vcpu resource reported to KVM by the ACPI probe */
#define ACPI_GICV2_VCPU_MEM_SIZE        (SZ_8K)

/*
 * Fill in gic_v3_kvm_info from the data gathered from the MADT and hand
 * it to vgic_set_kvm_info().
 *
 * Nothing is registered when the virtualisation data cannot be collected
 * or the maintenance interrupt cannot be registered. The vcpu resource is
 * only described when a GICV base address was found.
 */
static void __init gic_acpi_setup_kvm_info(void)
{
        struct resource *vcpu = &gic_v3_kvm_info.vcpu;
        int virq;

        if (!gic_acpi_collect_virt_info()) {
                pr_warn("Unable to get hardware information used for virtualization\n");
                return;
        }

        gic_v3_kvm_info.type = GIC_V3;

        virq = acpi_register_gsi(NULL, acpi_data.maint_irq,
                                 acpi_data.maint_irq_mode,
                                 ACPI_ACTIVE_HIGH);
        if (virq <= 0)
                return;

        gic_v3_kvm_info.maint_irq = virq;

        if (acpi_data.vcpu_base) {
                vcpu->flags = IORESOURCE_MEM;
                vcpu->start = acpi_data.vcpu_base;
                vcpu->end = vcpu->start + ACPI_GICV2_VCPU_MEM_SIZE - 1;
        }

        gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis;
        gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid;
        vgic_set_kvm_info(&gic_v3_kvm_info);
}

/* fwnode allocated in gic_acpi_init() to identify the GIC irq domain under ACPI */
static struct fwnode_handle *gsi_domain_handle;

/*
 * Callback registered with acpi_set_irq_model(): every GSI maps to the
 * same domain, so @gsi is ignored.
 */
static struct fwnode_handle *gic_v3_get_gsi_domain_id(u32 gsi)
{
        return gsi_domain_handle;
}

/*
 * ACPI probe entry point, called for the MADT generic distributor entry.
 *
 * Maps and validates the distributor, allocates and fills the
 * redistributor region array from the MADT, allocates the fwnode that
 * identifies the irq domain, runs the common initialisation, registers
 * the GIC as the ACPI interrupt model and, when split EOI/Deactivate is in
 * use, sets up the KVM info.
 *
 * Returns 0 on success or a negative error code; on failure the fwnode,
 * the redistributor mappings and the distributor mapping are released.
 */
static int __init
gic_acpi_init(union acpi_subtable_headers *header, const unsigned long end)
{
        struct acpi_madt_generic_distributor *dist;
        int i, err;

        /* Get distributor base address */
        dist = (struct acpi_madt_generic_distributor *)header;
        acpi_data.dist_base = ioremap(dist->base_address,
                                      ACPI_GICV3_DIST_MEM_SIZE);
        if (!acpi_data.dist_base) {
                pr_err("Unable to map GICD registers\n");
                return -ENOMEM;
        }
        gic_request_region(dist->base_address, ACPI_GICV3_DIST_MEM_SIZE, "GICD");

        err = gic_validate_dist_version(acpi_data.dist_base);
        if (err) {
                pr_err("No distributor detected at @%p, giving up\n",
                       acpi_data.dist_base);
                goto out_dist_unmap;
        }

        /* kcalloc() checks the count * size product for overflow */
        acpi_data.redist_regs = kcalloc(acpi_data.nr_redist_regions,
                                        sizeof(*acpi_data.redist_regs),
                                        GFP_KERNEL);
        if (!acpi_data.redist_regs) {
                err = -ENOMEM;
                goto out_dist_unmap;
        }

        err = gic_acpi_collect_gicr_base();
        if (err)
                goto out_redist_unmap;

        gsi_domain_handle = irq_domain_alloc_fwnode(&dist->base_address);
        if (!gsi_domain_handle) {
                err = -ENOMEM;
                goto out_redist_unmap;
        }

        err = gic_init_bases(dist->base_address, acpi_data.dist_base,
                             acpi_data.redist_regs, acpi_data.nr_redist_regions,
                             0, gsi_domain_handle);
        if (err)
                goto out_fwhandle_free;

        acpi_set_irq_model(ACPI_IRQ_MODEL_GIC, gic_v3_get_gsi_domain_id);

        if (static_branch_likely(&supports_deactivate_key))
                gic_acpi_setup_kvm_info();

        return 0;

out_fwhandle_free:
        irq_domain_free_fwnode(gsi_domain_handle);
out_redist_unmap:
        /* The array is zero-initialised, so unused slots are NULL */
        for (i = 0; i < acpi_data.nr_redist_regions; i++)
                if (acpi_data.redist_regs[i].redist_base)
                        iounmap(acpi_data.redist_regs[i].redist_base);
        kfree(acpi_data.redist_regs);
out_dist_unmap:
        iounmap(acpi_data.dist_base);
        return err;
}
/*
 * Register gic_acpi_init() for MADT generic distributor entries, once per
 * accepted version value (V3, V4 and NONE). acpi_validate_gic_table()
 * compares the entry's version with the probe entry's driver_data,
 * presumably the version constant given here (confirm against the macro).
 */
IRQCHIP_ACPI_DECLARE(gic_v3, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
                     acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V3,
                     gic_acpi_init);
IRQCHIP_ACPI_DECLARE(gic_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
                     acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_V4,
                     gic_acpi_init);
IRQCHIP_ACPI_DECLARE(gic_v3_or_v4, ACPI_MADT_TYPE_GENERIC_DISTRIBUTOR,
                     acpi_validate_gic_table, ACPI_MADT_GIC_VERSION_NONE,
                     gic_acpi_init);
#endif































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.h - generic kernel object infrastructure.
 *
 * Copyright (c) 2002-2003 Patrick Mochel
 * Copyright (c) 2002-2003 Open Source Development Labs
 * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2008 Novell Inc.
 *
 * Please read Documentation/core-api/kobject.rst before using the kobject
 * interface, ESPECIALLY the parts about reference counts and object
 * destructors.
 */

#ifndef _KOBJECT_H_
#define _KOBJECT_H_

#include <linux/types.h>
#include <linux/list.h>
#include <linux/sysfs.h>
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/spinlock.h>
#include <linux/kref.h>
#include <linux/kobject_ns.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/workqueue.h>
#include <linux/uidgid.h>

#define UEVENT_HELPER_PATH_LEN                256
#define UEVENT_NUM_ENVP                        64        /* number of env pointers */
#define UEVENT_BUFFER_SIZE                2048        /* buffer for the variables */

#ifdef CONFIG_UEVENT_HELPER
/* path to the userspace helper executed on an event */
extern char uevent_helper[];
#endif

/* counter to tag the uevent, read only except for the kobject core */
extern atomic64_t uevent_seqnum;

/*
 * The actions here must match the index to the string array
 * in lib/kobject_uevent.c
 *
 * Do not add new actions here without checking with the driver-core
 * maintainers. Action strings are not meant to express subsystem
 * or device specific properties. In most cases you want to send a
 * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event
 * specific variables added to the event environment.
 */
enum kobject_action {
        KOBJ_ADD,        /* 0: no explicit values, so numbering follows declaration order */
        KOBJ_REMOVE,        /* 1 */
        KOBJ_CHANGE,        /* 2 */
        KOBJ_MOVE,        /* 3 */
        KOBJ_ONLINE,        /* 4 */
        KOBJ_OFFLINE,        /* 5 */
        KOBJ_BIND,        /* 6 */
        KOBJ_UNBIND,        /* 7 */
};

/*
 * Generic kernel object.  It can be embedded in a larger structure (struct
 * kset below embeds one).  See Documentation/core-api/kobject.rst for the
 * reference-counting and release rules.
 */
struct kobject {
        const char                *name;        /* returned by kobject_name() */
        struct list_head        entry;        /* list linkage; presumably on kset->list - confirm in lib/kobject.c */
        struct kobject                *parent;
        struct kset                *kset;
        const struct kobj_type        *ktype;        /* returned by get_ktype() */
        struct kernfs_node        *sd; /* sysfs directory entry */
        struct kref                kref;

        /* One-bit state flags */
        unsigned int state_initialized:1;
        unsigned int state_in_sysfs:1;
        unsigned int state_add_uevent_sent:1;
        unsigned int state_remove_uevent_sent:1;
        unsigned int uevent_suppress:1;

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        /* Only present when CONFIG_DEBUG_KOBJECT_RELEASE is enabled */
        struct delayed_work        release;
#endif
};

/*
 * Name setters.  In both, argument 2 is a printf-style format string (even
 * though kobject_set_name() calls it @name).  __printf(2, 3) has the compiler
 * check it against the variadic arguments; __printf(2, 0) is the va_list
 * form, where the arguments themselves cannot be checked.
 */
__printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...);
__printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs);

/*
 * Return the name pointer stored in @kobj.  @kobj is dereferenced without a
 * NULL check.
 */
static inline const char *kobject_name(const struct kobject *kobj)
{
        return kobj->name;
}

/*
 * Initialisation / registration entry points (implemented outside this
 * header).  The __must_check ones return a value the caller is expected to
 * examine; @fmt is a printf-style format checked against the variadic
 * arguments that follow it.
 */
void kobject_init(struct kobject *kobj, const struct kobj_type *ktype);
__printf(3, 4) __must_check int kobject_add(struct kobject *kobj,
                                            struct kobject *parent,
                                            const char *fmt, ...);
__printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj,
                                                     const struct kobj_type *ktype,
                                                     struct kobject *parent,
                                                     const char *fmt, ...);

void kobject_del(struct kobject *kobj);

/* Returns a kobject pointer that must be checked by the caller */
struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent);

/*
 * Rename / re-parent operations on @kobj.  Both return an int that the
 * caller must check (__must_check).
 * NOTE(review): parameter names follow the apparent intent (object first,
 * then new name / new parent) - confirm against the definitions.
 */
int __must_check kobject_rename(struct kobject *kobj, const char *new_name);
int __must_check kobject_move(struct kobject *kobj, struct kobject *new_parent);

/*
 * Reference helpers.  Only kobject_get_unless_zero() is __must_check: its
 * return value must be inspected by the caller.
 */
struct kobject *kobject_get(struct kobject *kobj);
struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj);
void kobject_put(struct kobject *kobj);

/* Read-only queries: none of these take a mutable kobject */
const void *kobject_namespace(const struct kobject *kobj);
void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid);
char *kobject_get_path(const struct kobject *kobj, gfp_t flag);

/*
 * Per-type operations for a kobject; a kobject points at one through its
 * ->ktype member.
 */
struct kobj_type {
        /* destructor hook; see kobject.rst on object destructors */
        void (*release)(struct kobject *kobj);
        const struct sysfs_ops *sysfs_ops;
        const struct attribute_group **default_groups;
        /* namespace / ownership hooks; presumably back kobject_namespace() and kobject_get_ownership() - confirm */
        const struct kobj_ns_type_operations *(*child_ns_type)(const struct kobject *kobj);
        const void *(*namespace)(const struct kobject *kobj);
        void (*get_ownership)(const struct kobject *kobj, kuid_t *uid, kgid_t *gid);
};

/*
 * Environment for one uevent: up to UEVENT_NUM_ENVP environment pointers and
 * a UEVENT_BUFFER_SIZE-byte buffer for the variables.
 */
struct kobj_uevent_env {
        char *argv[3];
        char *envp[UEVENT_NUM_ENVP];
        int envp_idx;        /* presumably the next free slot in envp[] - confirm in lib/kobject_uevent.c */
        char buf[UEVENT_BUFFER_SIZE];
        int buflen;        /* presumably the number of bytes of buf[] in use - confirm */
};

/*
 * Uevent callbacks a kset may supply (see the kset documentation below: they
 * let the kset filter uevents or add environment variables).  The function
 * pointers themselves are const, so they can only be set when the structure
 * is initialised.
 */
struct kset_uevent_ops {
        int (* const filter)(const struct kobject *kobj);
        const char *(* const name)(const struct kobject *kobj);
        int (* const uevent)(const struct kobject *kobj, struct kobj_uevent_env *env);
};

/*
 * A struct attribute paired with kobject-level show/store callbacks.
 * show() is handed a writable buffer; store() is handed a read-only buffer
 * and its byte count.  Both return ssize_t.
 */
struct kobj_attribute {
        struct attribute attr;
        ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr,
                         const char *buf, size_t count);
};

extern const struct sysfs_ops kobj_sysfs_ops;

struct sock;

/**
 * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem.
 *
 * A kset defines a group of kobjects.  They can be individually
 * different "types" but overall these kobjects all want to be grouped
 * together and operated on in the same manner.  ksets are used to
 * define the attribute callbacks and other common events that happen to
 * a kobject.
 *
 * @list: the list of all kobjects for this kset
 * @list_lock: a lock for iterating over the kobjects
 * @kobj: the embedded kobject for this kset (recursion, isn't it fun...)
 * @uevent_ops: the set of uevent operations for this kset.  These are
 * called whenever a kobject has something happen to it so that the kset
 * can add new environment variables, or filter out the uevents if so
 * desired.
 */
struct kset {
        struct list_head list;
        spinlock_t list_lock;
        struct kobject kobj;        /* to_kset() maps a pointer to this member back to the kset */
        const struct kset_uevent_ops *uevent_ops;
} __randomize_layout;        /* NOTE(review): field order may be randomised at build time - do not rely on offsets */

/*
 * kset lifecycle entry points (implemented outside this header).  The
 * __must_check ones return a value the caller is expected to examine.
 */
void kset_init(struct kset *kset);
int __must_check kset_register(struct kset *kset);
void kset_unregister(struct kset *kset);
struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u,
                                               struct kobject *parent_kobj);

/* Map a kobject embedded in a kset back to that kset; NULL maps to NULL. */
static inline struct kset *to_kset(struct kobject *kobj)
{
        if (!kobj)
                return NULL;

        return container_of(kobj, struct kset, kobj);
}

/*
 * Call kobject_get() on the kobject embedded in @k and return the kset that
 * contains the result.  A NULL @k yields NULL.
 */
static inline struct kset *kset_get(struct kset *k)
{
        if (!k)
                return NULL;

        return to_kset(kobject_get(&k->kobj));
}

/*
 * Call kobject_put() on the kobject embedded in @k.
 *
 * A NULL @k is ignored, matching kset_get() and to_kset().  Without the
 * check, &k->kobj would be computed from a NULL base and a bogus non-NULL
 * pointer would be handed to kobject_put().
 */
static inline void kset_put(struct kset *k)
{
        if (k)
                kobject_put(&k->kobj);
}

/*
 * Return the kobj_type pointer stored in @kobj.  @kobj is dereferenced
 * without a NULL check.
 */
static inline const struct kobj_type *get_ktype(const struct kobject *kobj)
{
        return kobj->ktype;
}

/*
 * Look up a kobject in @kset by @name.
 * NOTE(review): lookup and reference semantics are defined with the
 * implementation - confirm there before relying on them.
 */
struct kobject *kset_find_obj(struct kset *kset, const char *name);

/* The global /sys/kernel/ kobject for people to chain off of */
extern struct kobject *kernel_kobj;
/* The global /sys/kernel/mm/ kobject for people to chain off of */
extern struct kobject *mm_kobj;
/* The global /sys/hypervisor/ kobject for people to chain off of */
extern struct kobject *hypervisor_kobj;
/* The global /sys/power/ kobject for people to chain off of */
extern struct kobject *power_kobj;
/* The global /sys/firmware/ kobject for people to chain off of */
extern struct kobject *firmware_kobj;

/*
 * Uevent emission.  kobject_uevent_env() additionally takes an array of
 * environment strings (see the note above enum kobject_action on using
 * KOBJ_CHANGE with extra variables).
 */
int kobject_uevent(struct kobject *kobj, enum kobject_action action);
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
                        char *envp[]);
int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count);

/* @format is a printf-style format checked against the variadic arguments */
__printf(2, 3)
int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);

#endif /* _KOBJECT_H_ */























   19 



    3 







   19 



















   32 


   16 



   32 



   32 
















   32 



   32 
   32 
    6 
























  131 

































  321 
  131 







  322 









  131 

















  131 









  131 
   19 



  131 






  130 














































































































   82 




































  312 
   82 






   82 


   13 






   82 



   82 

















  322 

  321 




  322 

  322 























  321 
  322 











  321 
  322 


























  321 












  322 
















  322 

  322 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
#include <linux/swap.h>
#include <linux/rmap.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>

#ifndef CONFIG_MMU_GATHER_NO_GATHER

/*
 * Make another batch the active one for @tlb.
 *
 * Reuses the batch already linked after the active one if there is one;
 * otherwise tries to allocate a fresh page (GFP_NOWAIT, no warning) and
 * links it in.  Returns false when no batch could be made available: delayed
 * rmaps are pending and the active batch is not the embedded local one, the
 * batch count limit has been reached, or the allocation failed.
 */
static bool tlb_next_batch(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur, *fresh;

        /* Limit batching if we have delayed rmaps pending */
        if (tlb->delayed_rmap && tlb->active != &tlb->local)
                return false;

        cur = tlb->active;

        /* A successor is already linked: just advance to it */
        if (cur->next) {
                tlb->active = cur->next;
                return true;
        }

        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
                return false;

        fresh = (void *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
        if (!fresh)
                return false;

        fresh->next = NULL;
        fresh->nr = 0;
        fresh->max = MAX_GATHER_BATCH;

        /* Append behind the current batch and switch over to it */
        cur->next = fresh;
        tlb->active = fresh;
        tlb->batch_count++;

        return true;
}

#ifdef CONFIG_SMP
/*
 * Walk the entries of @batch and call folio_remove_rmap_ptes() for every
 * entry flagged ENCODED_PAGE_BIT_DELAY_RMAP.  When such an entry is also
 * flagged ENCODED_PAGE_BIT_NR_PAGES_NEXT, the following slot holds the page
 * count and is consumed together with it.
 */
static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
{
        struct encoded_page **entries = batch->encoded_pages;

        for (int idx = 0; idx < batch->nr; idx++) {
                struct encoded_page *entry = entries[idx];
                unsigned int count = 1;
                struct page *page;

                if (!(encoded_page_flags(entry) & ENCODED_PAGE_BIT_DELAY_RMAP))
                        continue;

                page = encoded_page_ptr(entry);

                if (unlikely(encoded_page_flags(entry) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT)) {
                        /* step onto the slot carrying the page count */
                        idx++;
                        count = encoded_nr_pages(entries[idx]);
                }

                folio_remove_rmap_ptes(page_folio(page), page, count, vma);
        }
}

/**
 * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
 * @tlb: the current mmu_gather
 * @vma: The memory area from which the pages are being removed.
 *
 * Does nothing unless @tlb has delayed rmaps pending.  tlb_next_batch()
 * refuses to start further batches while delayed rmaps are pending, so only
 * the embedded local batch and the currently active batch need to be
 * walked.  The pending flag is cleared afterwards.
 */
void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        struct mmu_gather_batch *cur;

        if (!tlb->delayed_rmap)
                return;

        tlb_flush_rmap_batch(&tlb->local, vma);

        cur = tlb->active;
        if (cur != &tlb->local)
                tlb_flush_rmap_batch(cur, vma);

        tlb->delayed_rmap = 0;
}
#endif

/*
 * We might end up freeing a lot of pages. Reschedule on a regular
 * basis to avoid soft lockups in configurations without full
 * preemption enabled. The magic number of 512 folios seems to work.
 */
#define MAX_NR_FOLIOS_PER_FREE                512

/*
 * Free every entry recorded in @batch, one chunk at a time, calling
 * cond_resched() after each chunk.  batch->nr is 0 on return.
 *
 * Chunk sizing:
 *  - without page poisoning and init_on_free: up to MAX_NR_FOLIOS_PER_FREE
 *    entries, plus one more if the last entry is flagged as being followed
 *    by an nr_pages slot;
 *  - otherwise: entries are accumulated until the summed page count reaches
 *    MAX_NR_FOLIOS_PER_FREE.
 */
static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
{
        struct encoded_page **pages = batch->encoded_pages;
        unsigned int nr, nr_pages;

        while (batch->nr) {
                if (!page_poisoning_enabled_static() && !want_init_on_free()) {
                        nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);

                        /*
                         * Make sure we cover page + nr_pages, and don't leave
                         * nr_pages behind when capping the number of entries.
                         */
                        if (unlikely(encoded_page_flags(pages[nr - 1]) &
                                     ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                nr++;
                } else {
                        /*
                         * With page poisoning and init_on_free, the time it
                         * takes to free memory grows proportionally with the
                         * actual memory size. Therefore, limit based on the
                         * actual memory size and not the number of involved
                         * folios.
                         */
                        for (nr = 0, nr_pages = 0;
                             nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
                             nr++) {
                                /* a flagged entry consumes two slots: page + count */
                                if (unlikely(encoded_page_flags(pages[nr]) &
                                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                                        nr_pages += encoded_nr_pages(pages[++nr]);
                                else
                                        nr_pages++;
                        }
                }

                /* release this chunk and advance past it */
                free_pages_and_swap_cache(pages, nr);
                pages += nr;
                batch->nr -= nr;

                cond_resched();
        }
}

/*
 * Free the pages recorded in @tlb's batches, starting with the embedded
 * local batch and stopping at the first batch that is empty (or at the end
 * of the chain).  The local batch becomes the active one again.
 */
static void tlb_batch_pages_flush(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur = &tlb->local;

        while (cur && cur->nr) {
                __tlb_batch_free_encoded_pages(cur);
                cur = cur->next;
        }

        tlb->active = &tlb->local;
}

/*
 * Release the pages backing every batch chained behind the embedded local
 * batch and leave the chain empty.
 */
static void tlb_batch_list_free(struct mmu_gather *tlb)
{
        struct mmu_gather_batch *cur = tlb->local.next;

        while (cur) {
                /* fetch the successor before the page holding it is freed */
                struct mmu_gather_batch *following = cur->next;

                free_pages((unsigned long)cur, 0);
                cur = following;
        }

        tlb->local.next = NULL;
}

/*
 * Record @page, covering @nr_pages pages, in the active batch of @tlb.
 *
 * A single page occupies one slot.  A multi-page range occupies two: the
 * encoded page flagged ENCODED_PAGE_BIT_NR_PAGES_NEXT, followed by the
 * encoded page count.  With @delay_rmap the entry also carries
 * ENCODED_PAGE_BIT_DELAY_RMAP.
 *
 * Returns true when, after adding, the batch has fewer than two free slots
 * and tlb_next_batch() could not supply another batch; false otherwise.
 */
static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
                struct page *page, unsigned int nr_pages, bool delay_rmap,
                int page_size)
{
        int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
        struct mmu_gather_batch *batch;

        VM_BUG_ON(!tlb->end);

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        /* debug-only consistency checks on the page size and folio span */
        VM_WARN_ON(tlb->page_size != page_size);
        VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
        VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
#endif

        batch = tlb->active;
        /*
         * Add the page and check if we are full. If so
         * force a flush.
         */
        if (likely(nr_pages == 1)) {
                batch->encoded_pages[batch->nr++] = encode_page(page, flags);
        } else {
                flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
                batch->encoded_pages[batch->nr++] = encode_page(page, flags);
                batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
        }
        /*
         * Make sure that we can always add another "page" + "nr_pages",
         * requiring two entries instead of only a single one.
         */
        if (batch->nr >= batch->max - 1) {
                if (!tlb_next_batch(tlb))
                        return true;
                batch = tlb->active;
        }
        VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);

        return false;
}

/*
 * Record @nr_pages pages starting at @page, using PAGE_SIZE as the page
 * size.  Return value is that of __tlb_remove_folio_pages_size().
 */
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap)
{
        return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
                                             PAGE_SIZE);
}

/*
 * Record a single page of size @page_size.  Return value is that of
 * __tlb_remove_folio_pages_size().
 */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size)
{
        return __tlb_remove_folio_pages_size(tlb, page, 1, delay_rmap, page_size);
}

#endif /* MMU_GATHER_NO_GATHER */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * Pass every table recorded in @batch to __tlb_remove_table(), then free the
 * page that holds the batch itself.
 */
static void __tlb_remove_table_free(struct mmu_table_batch *batch)
{
        int idx = 0;

        while (idx < batch->nr) {
                __tlb_remove_table(batch->tables[idx]);
                idx++;
        }

        free_page((unsigned long)batch);
}

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE

/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore needs some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Not all systems IPI every CPU for this purpose:
 *
 * - Some architectures have HW support for cross-CPU synchronisation of TLB
 *   flushes, so there's no IPI at all.
 *
 * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
 *   with the hypervisor to defer flushing on preempted vCPUs.
 *
 * Such systems need to delay the freeing by some other means, this is that
 * means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage, this
 * allocation is deep inside the MM code and can thus easily fail on memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */

/*
 * Deliberately empty cross-CPU callback used by tlb_remove_table_sync_one();
 * @arg is unused.
 */
static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

/*
 * Run the empty tlb_remove_table_smp_sync() callback through
 * smp_call_function() with the wait argument set, i.e. do not return until
 * the callback has completed.
 */
void tlb_remove_table_sync_one(void)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely on
         * IRQ disabling.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}

/*
 * RCU callback: recover the mmu_table_batch that embeds @head and free its
 * tables together with the batch page.
 */
static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;

        batch = container_of(head, struct mmu_table_batch, rcu);
        __tlb_remove_table_free(batch);
}

/*
 * CONFIG_MMU_GATHER_RCU_TABLE_FREE variant: hand @batch to call_rcu() so
 * that tlb_remove_table_rcu() frees it from the RCU callback.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        call_rcu(&batch->rcu, tlb_remove_table_rcu);
}

#else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Variant used without CONFIG_MMU_GATHER_RCU_TABLE_FREE: free @batch
 * immediately.
 */
static void tlb_remove_table_free(struct mmu_table_batch *batch)
{
        __tlb_remove_table_free(batch);
}

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */

/*
 * Used when tlb_remove_table() should imply a TLB invalidate: if
 * tlb_needs_table_invalidate() says so, flush the TLB (without freeing
 * anything) before page tables are handed on for freeing.
 */
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
        if (!tlb_needs_table_invalidate())
                return;

        /*
         * Invalidate page-table caches used by hardware walkers. Then
         * we still need to RCU-sched wait while freeing the pages
         * because software walkers can still be in-flight.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

#ifdef CONFIG_PT_RECLAIM
/*
 * RCU callback: recover the ptdesc that embeds @head (as pt_rcu_head) and
 * pass it to __tlb_remove_table().
 */
static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
{
        struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);

        __tlb_remove_table(ptdesc);
}

/*
 * CONFIG_PT_RECLAIM variant: treat @table as a ptdesc and queue it with
 * call_rcu(), with __tlb_remove_table_one_rcu() as the callback.
 */
static inline void __tlb_remove_table_one(void *table)
{
        struct ptdesc *ptdesc = table;

        call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
}
#else
/*
 * Variant used without CONFIG_PT_RECLAIM: call tlb_remove_table_sync_one()
 * first, then pass @table to __tlb_remove_table() right away.
 */
static inline void __tlb_remove_table_one(void *table)
{
        tlb_remove_table_sync_one();
        __tlb_remove_table(table);
}
#endif /* CONFIG_PT_RECLAIM */

/*
 * Free a single table without batching.  tlb_remove_table() falls back to
 * this when it cannot allocate a batch page.
 */
static void tlb_remove_table_one(void *table)
{
        __tlb_remove_table_one(table);
}

/*
 * If @tlb has a pending table batch: invalidate as needed, hand the batch
 * to tlb_remove_table_free() and forget it.  Does nothing when no batch is
 * pending.
 */
static void tlb_table_flush(struct mmu_gather *tlb)
{
        if (!tlb->batch)
                return;

        tlb_table_invalidate(tlb);
        tlb_remove_table_free(tlb->batch);
        tlb->batch = NULL;
}

/*
 * Queue page table @table for freeing through @tlb.
 *
 * Tables are collected in a batch page allocated on demand (GFP_NOWAIT, no
 * warning).  If that allocation fails, the table is freed on its own after
 * an invalidate.  A batch that reaches MAX_TABLE_BATCH entries is flushed
 * immediately.
 */
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch *cur = tlb->batch;

        if (!cur) {
                cur = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                tlb->batch = cur;
                if (!cur) {
                        /* no memory for a batch: free this table by itself */
                        tlb_table_invalidate(tlb);
                        tlb_remove_table_one(table);
                        return;
                }
                cur->nr = 0;
        }

        cur->tables[cur->nr] = table;
        cur->nr++;

        if (cur->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}

/* Start @tlb with no pending table batch. */
static inline void tlb_table_init(struct mmu_gather *tlb)
{
        tlb->batch = NULL;
}

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

/* Without CONFIG_MMU_GATHER_TABLE_FREE there is no table batch: both are no-ops. */
static inline void tlb_table_flush(struct mmu_gather *tlb) { }
static inline void tlb_table_init(struct mmu_gather *tlb) { }

#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

/*
 * Free what @tlb has batched: the pending table batch first, then (unless
 * CONFIG_MMU_GATHER_NO_GATHER) the batched pages.
 */
static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
        tlb_table_flush(tlb);
#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb_batch_pages_flush(tlb);
#endif
}

/*
 * Flush the TLB for @tlb and then free everything that was batched, in that
 * order.
 */
void tlb_flush_mmu(struct mmu_gather *tlb)
{
        tlb_flush_mmu_tlbonly(tlb);
        tlb_flush_mmu_free(tlb);
}

/*
 * Common initialisation behind tlb_gather_mmu() and tlb_gather_mmu_fullmm().
 *
 * Records @mm and @fullmm, resets the page-batching state (unless
 * CONFIG_MMU_GATHER_NO_GATHER), clears the delayed-rmap flag, the table
 * batch, the page size (with CONFIG_MMU_GATHER_PAGE_SIZE) and the range,
 * and finally calls inc_tlb_flush_pending() on @mm.  tlb_finish_mmu() makes
 * the matching dec_tlb_flush_pending() call.
 */
static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
                             bool fullmm)
{
        tlb->mm = mm;
        tlb->fullmm = fullmm;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        tlb->need_flush_all = 0;
        /* the embedded local batch is the only, empty, active batch */
        tlb->local.next = NULL;
        tlb->local.nr   = 0;
        tlb->local.max  = ARRAY_SIZE(tlb->__pages);
        tlb->active     = &tlb->local;
        tlb->batch_count = 0;
#endif
        tlb->delayed_rmap = 0;

        tlb_table_init(tlb);
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        tlb->page_size = 0;
#endif

        __tlb_reset_range(tlb);
        inc_tlb_flush_pending(tlb->mm);
}

/**
 * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.
 *
 * The structure is set up with fullmm cleared; use tlb_gather_mmu_fullmm()
 * when the whole address space is being destroyed.
 */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, false);
}

/**
 * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 * @tlb: the mmu_gather structure to initialize
 * @mm: the mm_struct of the target address space
 *
 * In this case, @mm is without users and we're going to destroy the
 * full address space (exit/execve).
 *
 * Called to initialize an (on-stack) mmu_gather structure for page-table
 * tear-down from @mm.  The structure is set up with fullmm set.
 */
void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
{
        __tlb_gather_mmu(tlb, mm, true);
}

/**
 * tlb_finish_mmu - finish an mmu_gather structure
 * @tlb: the mmu_gather structure to finish
 *
 * Called at the end of the shootdown operation to free up any resources that
 * were required.
 */
void tlb_finish_mmu(struct mmu_gather *tlb)
{
        /*
         * If parallel threads are doing PTE changes on the same range
         * under a non-exclusive lock (e.g., mmap_lock read-side) but defer
         * the TLB flush by batching, one thread may end up seeing
         * inconsistent PTEs, resulting in stale TLB entries.  So flush the
         * TLB forcefully if we detect parallel PTE batching threads.
         *
         * However, some syscalls, e.g. munmap(), may free page tables; this
         * requires force-flushing everything in the given range. Otherwise
         * stale TLB entries may remain on some architectures, e.g. aarch64,
         * that can specify which TLB level to flush.
         */
        if (mm_tlb_flush_nested(tlb->mm)) {
                /*
                 * The aarch64 yields better performance with fullmm by
                 * avoiding multiple CPUs spamming TLBI messages at the
                 * same time.
                 *
                 * On x86 non-fullmm doesn't yield significant difference
                 * against fullmm.
                 */
                tlb->fullmm = 1;
                __tlb_reset_range(tlb);
                tlb->freed_tables = 1;
        }

        tlb_flush_mmu(tlb);

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /* release the extra batch pages chained behind the local batch */
        tlb_batch_list_free(tlb);
#endif
        /* pairs with inc_tlb_flush_pending() in __tlb_gather_mmu() */
        dec_tlb_flush_pending(tlb->mm);
}





















  329 
  513 






  595 
    5 



















  599 







  597 

















  599 
























  540 





  540 







  511 






  535 










  540 

























  541 




  541 

  540 

  540 


  539 
  542 

  538 
  538 






  540 
  540 





  539 


  541 
    1 



































  536 



  535 
   33 

  508 







   14 





















   25 














   25 



   14 
   14 
   14 


   14 

   13 

   14 







  999 








  998 







    4 




    5 







  329 




  330 


    5 
  328 
  329 








  329 
  330 



    5 


  329 



  329 


  330 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/fs_struct.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
#include "internal.h"

/*
 * Cursor for building a string backwards, from the end of a caller-supplied
 * buffer towards its start.
 *
 * @buf: current write position; new bytes are stored just below it
 * @len: bytes still free in front of @buf; set negative once a prepend did
 *       not fit
 */
struct prepend_buffer {
        char *buf;
        int len;
};

/*
 * Declare a prepend_buffer called @__name positioned one past the end of the
 * @__len-byte buffer @__buf.  The arguments are parenthesized so that
 * expression arguments (e.g. "a ? b : c") expand with the intended
 * precedence.  @__len is expanded twice and must be free of side effects.
 */
#define DECLARE_BUFFER(__name, __buf, __len) \
        struct prepend_buffer __name = {.buf = (__buf) + (__len), .len = (__len)}

/*
 * Return the start of the string built in @p, or ERR_PTR(-ENAMETOOLONG) if
 * @p has been marked as overflowed (negative length).
 */
static char *extract_string(struct prepend_buffer *p)
{
        if (unlikely(p->len < 0))
                return ERR_PTR(-ENAMETOOLONG);

        return p->buf;
}

/*
 * Store @c immediately in front of the current position of @p.
 *
 * Returns true on success.  If no room is left (or @p has already
 * overflowed), @p is marked as overflowed and false is returned.
 */
static bool prepend_char(struct prepend_buffer *p, unsigned char c)
{
        if (unlikely(p->len <= 0)) {
                p->len = -1;
                return false;
        }

        p->len--;
        p->buf--;
        *p->buf = c;
        return true;
}

/*
 * Copy @len bytes from @src to @dst with a fault-tolerant kernel copy.
 *
 * The source may be an optimistic, lockless load of a dentry name and
 * length.  A concurrent rename can leave the pointer and the length out of
 * sync, so the copy may fault.  The rename sequence count check in the
 * caller corrects the final result; here the fault only has to be survived.
 *
 * On a fault the destination is filled with 'x' and false is returned;
 * otherwise true.
 */
static bool prepend_copy(void *dst, const void *src, int len)
{
        if (likely(!copy_from_kernel_nofault(dst, src, len)))
                return true;

        memset(dst, 'x', len);
        return false;
}

/*
 * Prepend the @namelen bytes at @str to @p.
 *
 * If the whole string fits, it is copied in front of the current position
 * and the result of the copy is returned.  If it does not fit, as much of
 * its tail as there is room for is copied, @p is marked as overflowed and
 * false is returned.  An already overflowed @p is left untouched.
 */
static bool prepend(struct prepend_buffer *p, const char *str, int namelen)
{
        int room = p->len;

        /* an earlier prepend already overflowed */
        if (room < 0)
                return false;

        if (namelen <= room) {
                /* the whole name fits */
                p->len = room - namelen;
                p->buf -= namelen;
                return prepend_copy(p->buf, str, namelen);
        }

        /* too long: keep only the last @room bytes of the name */
        p->buf -= room;
        prepend_copy(p->buf, str + (namelen - room), room);
        p->len = -1;
        return false;
}

/**
 * prepend_name - prepend a pathname in front of current buffer pointer
 * @p: prepend buffer which contains buffer pointer and allocated length
 * @name: name string and length qstr structure
 *
 * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to
 * make sure that either the old or the new name pointer and length are
 * fetched. However, there may be mismatch between length and pointer.
 * But since the length cannot be trusted, we need to copy the name very
 * carefully when doing the prepend_copy(). It also prepends "/" at
 * the beginning of the name. The sequence number check at the caller will
 * retry it again when a d_move() does happen. So any garbage in the buffer
 * due to mismatched pointer and length will be discarded.
 *
 * Load acquire is needed to make sure that we see the new name data even
 * if we might get the length wrong.
 *
 * Return: true if both the name and the leading "/" were prepended,
 * false otherwise.
 */
static bool prepend_name(struct prepend_buffer *p, const struct qstr *name)
{
        const char *dname = smp_load_acquire(&name->name); /* ^^^ */
        u32 dlen = READ_ONCE(name->len);

        /* the name goes in first, so the '/' ends up in front of it */
        return prepend(p, dname, dlen) && prepend_char(p, '/');
}

/*
 * Walk from @dentry/@mnt upwards towards @root, prepending "/name" to @p for
 * every dentry visited.  When the root of a mount is reached, the walk
 * continues from that mount's mountpoint in the parent mount.
 *
 * Returns:
 *   0 - arrived at @root, or stopped early because a prepend failed
 *   1 - reached a global root whose mount namespace is set and not anonymous
 *   2 - reached a global root that is detached or not attached yet
 *   3 - hit a dentry that is its own parent without being a mount root
 */
static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
                          const struct path *root, struct prepend_buffer *p)
{
        while (dentry != root->dentry || &mnt->mnt != root->mnt) {
                const struct dentry *parent = READ_ONCE(dentry->d_parent);

                if (dentry == mnt->mnt.mnt_root) {
                        struct mount *m = READ_ONCE(mnt->mnt_parent);
                        struct mnt_namespace *mnt_ns;

                        /* not the topmost mount: cross to the parent mount */
                        if (likely(mnt != m)) {
                                dentry = READ_ONCE(mnt->mnt_mountpoint);
                                mnt = m;
                                continue;
                        }
                        /* Global root */
                        mnt_ns = READ_ONCE(mnt->mnt_ns);
                        /* open-coded is_mounted() to use local mnt_ns */
                        if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
                                return 1;        // absolute root
                        else
                                return 2;        // detached or not attached yet
                }

                if (unlikely(dentry == parent))
                        /* Escaped? */
                        return 3;

                prefetch(parent);
                /* stop walking as soon as a component could not be prepended */
                if (!prepend_name(p, &dentry->d_name))
                        break;
                dentry = parent;
        }
        return 0;
}

/**
 * prepend_path - Prepend path string to a buffer
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @p: prepend buffer which contains buffer pointer and allocated length
 *
 * The function will first try to write out the pathname without taking any
 * lock other than the RCU read lock to make sure that dentries won't go away.
 * It only checks the sequence number of the global rename_lock as any change
 * in the dentry's d_seq will be preceded by changes in the rename_lock
 * sequence number. If the sequence number had been changed, it will restart
 * the whole pathname back-tracing sequence again by taking the rename_lock.
 * In this case, there is no need to take the RCU read lock as the recursive
 * parent pointer references will keep the dentry chain alive as long as no
 * rename operation is performed.
 *
 * Return: the value returned by __prepend_path() (0 to 3).  @p is updated
 * with the result; for return value 3 only a single "/" is prepended.
 */
static int prepend_path(const struct path *path,
                        const struct path *root,
                        struct prepend_buffer *p)
{
        unsigned seq, m_seq = 0;
        struct prepend_buffer b;
        int error;

        rcu_read_lock();
restart_mnt:
        read_seqbegin_or_lock(&mount_lock, &m_seq);
        seq = 0;
        rcu_read_lock();
restart:
        /* work on a private copy so that every retry starts from the caller's state */
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        error = __prepend_path(path->dentry, real_mount(path->mnt), root, &b);
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                /* odd sequence value: the retry takes rename_lock instead of running lockless */
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);

        if (!(m_seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&mount_lock, m_seq)) {
                /* same scheme for mount_lock */
                m_seq = 1;
                goto restart_mnt;
        }
        done_seqretry(&mount_lock, m_seq);

        /* "escaped" walk: throw away whatever was built */
        if (unlikely(error == 3))
                b = *p;

        /* nothing was prepended: the path is just "/" */
        if (b.len == p->len)
                prepend_char(&b, '/');

        *p = b;
        return error;
}

/**
 * __d_path - return the path of a dentry
 * @path: the dentry/vfsmount to report
 * @root: root vfsmnt/dentry
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name.
 *
 * Returns a pointer into the buffer or an error code if the
 * path was too long.
 *
 * "buflen" should be positive.
 *
 * If the path is not reachable from the supplied root, return %NULL.
 */
char *__d_path(const struct path *path,
               const struct path *root,
               char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);
        if (unlikely(prepend_path(path, root, &b) > 0))
                return NULL;
        return extract_string(&b);
}

char *d_absolute_path(const struct path *path,
               char *buf, int buflen)
{
        struct path root = {};
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);
        if (unlikely(prepend_path(path, &root, &b) > 1))
                return ERR_PTR(-EINVAL);
        return extract_string(&b);
}

/*
 * Copy fs->root into *root, repeating the copy until it was taken without
 * fs->seq reporting a retry.
 */
static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
{
        unsigned seq;

        for (;;) {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
                if (!read_seqcount_retry(&fs->seq, seq))
                        break;
        }
}

/**
 * d_path - return the path of a dentry
 * @path: path to report
 * @buf: buffer to return value in
 * @buflen: buffer length
 *
 * Convert a dentry into an ASCII path name. If the entry has been deleted
 * the string " (deleted)" is appended. Note that this is ambiguous.
 *
 * Returns a pointer into the buffer or an error code if the path was
 * too long. Note: Callers should use the returned pointer, not the passed
 * in buffer, to use the name! The implementation often starts at an offset
 * into the buffer, and may leave 0 bytes at the start.
 *
 * "buflen" should be positive.
 */
char *d_path(const struct path *path, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);
        struct path root;

        /*
         * We have various synthetic filesystems that never get mounted.  On
         * these filesystems dentries are never used for lookup purposes, and
         * thus don't need to be hashed.  They also don't need a name until a
         * user wants to identify the object in /proc/pid/fd/.  The little hack
         * below allows us to generate a name for these objects on demand:
         *
         * Some pseudo inodes are mountable.  When they are mounted
         * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
         * and instead have d_path return the mounted path.
         */
        if (path->dentry->d_op && path->dentry->d_op->d_dname &&
            (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
                return path->dentry->d_op->d_dname(path->dentry, buf, buflen);

        rcu_read_lock();
        get_fs_root_rcu(current->fs, &root);
        if (unlikely(d_unlinked(path->dentry)))
                prepend(&b, " (deleted)", 11);
        else
                prepend_char(&b, 0);
        prepend_path(path, &root, &b);
        rcu_read_unlock();

        return extract_string(&b);
}
EXPORT_SYMBOL(d_path);

/*
 * Helper function for dentry_operations.d_dname() members
 *
 * Formats @fmt into a 64-byte scratch array and copies the result,
 * terminating NUL included, to the tail end of @buffer.  Returns a pointer
 * to the copied string, or ERR_PTR(-ENAMETOOLONG) when the formatted
 * name (plus NUL) fits neither the scratch array nor @buflen.
 */
char *dynamic_dname(char *buffer, int buflen, const char *fmt, ...)
{
        char scratch[64];
        va_list ap;
        int needed;

        va_start(ap, fmt);
        /* +1 accounts for the terminating NUL */
        needed = vsnprintf(scratch, sizeof(scratch), fmt, ap) + 1;
        va_end(ap);

        if (needed > sizeof(scratch))
                return ERR_PTR(-ENAMETOOLONG);
        if (needed > buflen)
                return ERR_PTR(-ENAMETOOLONG);

        /* Place the string so that it ends exactly at buffer + buflen. */
        buffer += buflen - needed;
        memcpy(buffer, scratch, needed);
        return buffer;
}

char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
{
        DECLARE_BUFFER(b, buffer, buflen);
        /* these dentries are never renamed, so d_lock is not needed */
        prepend(&b, " (deleted)", 11);
        prepend(&b, dentry->d_name.name, dentry->d_name.len);
        prepend_char(&b, '/');
        return extract_string(&b);
}

/*
 * Write full pathname from the root of the filesystem into the buffer.
 *
 * Walks d_parent links from @d until IS_ROOT() holds, prepending each
 * component name to a private copy of *@p.  The walk is first done under
 * the RCU read lock with rename_lock only sampled; if a retry is needed it
 * is repeated with seq forced to 1.  If nothing was prepended a single '/'
 * is emitted.  Returns extract_string() of the private copy; *@p itself is
 * not written back.
 */
static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p)
{
        const struct dentry *dentry;
        struct prepend_buffer b;
        int seq = 0;

        rcu_read_lock();
restart:
        /* Each pass starts over from the original dentry and buffer state. */
        dentry = d;
        b = *p;
        read_seqbegin_or_lock(&rename_lock, &seq);
        while (!IS_ROOT(dentry)) {
                const struct dentry *parent = dentry->d_parent;

                prefetch(parent);
                /* A zero return from prepend_name() ends the walk early. */
                if (!prepend_name(&b, &dentry->d_name))
                        break;
                dentry = parent;
        }
        /*
         * The RCU read lock is not re-taken for the retry pass (seq == 1),
         * so it is only dropped when seq is even.
         * NOTE(review): assumes read_seqbegin_or_lock() leaves seq even after
         * a lockless pass -- confirm against its definition.
         */
        if (!(seq & 1))
                rcu_read_unlock();
        if (need_seqretry(&rename_lock, seq)) {
                seq = 1;
                goto restart;
        }
        done_seqretry(&rename_lock, seq);
        /* Nothing was prepended: report a lone '/'. */
        if (b.len == p->len)
                prepend_char(&b, '/');
        return extract_string(&b);
}

char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        prepend_char(&b, 0);
        return __dentry_path(dentry, &b);
}
EXPORT_SYMBOL(dentry_path_raw);

char *dentry_path(const struct dentry *dentry, char *buf, int buflen)
{
        DECLARE_BUFFER(b, buf, buflen);

        if (unlikely(d_unlinked(dentry)))
                prepend(&b, "//deleted", 10);
        else
                prepend_char(&b, 0);
        return __dentry_path(dentry, &b);
}

/*
 * Copy fs->root and fs->pwd into *root and *pwd as one snapshot: both
 * copies are repeated until they were taken without fs->seq reporting a
 * retry.
 */
static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
                                    struct path *pwd)
{
        unsigned seq;

        for (;;) {
                seq = read_seqcount_begin(&fs->seq);
                *root = fs->root;
                *pwd = fs->pwd;
                if (!read_seqcount_retry(&fs->seq, seq))
                        break;
        }
}

/*
 * NOTE! The user-level library version returns a
 * character pointer. The kernel system call just
 * returns the length of the buffer filled (which
 * includes the ending '\0' character), or a negative
 * error value. So libc would do something like
 *
 *        char *getcwd(char * buf, size_t size)
 *        {
 *                int retval;
 *
 *                retval = sys_getcwd(buf, size);
 *                if (retval >= 0)
 *                        return buf;
 *                errno = -retval;
 *                return NULL;
 *        }
 *
 * The path is assembled in a temporary buffer obtained from __getname()
 * and then copied out to @buf.  Errors:
 *   -ENOMEM        no temporary buffer could be obtained
 *   -ENOENT        the current working directory is unlinked
 *   -ENAMETOOLONG  the computed length exceeds PATH_MAX
 *   -ERANGE        the result needs more than @size bytes
 *   -EFAULT        copy_to_user() reported a failure
 */
SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
        int error;
        struct path pwd, root;
        char *page = __getname();

        if (!page)
                return -ENOMEM;

        rcu_read_lock();
        /* Take root and pwd as one consistent snapshot. */
        get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);

        if (unlikely(d_unlinked(pwd.dentry))) {
                rcu_read_unlock();
                error = -ENOENT;
        } else {
                unsigned len;
                DECLARE_BUFFER(b, page, PATH_MAX);

                /* The terminating NUL goes in first; the path is prepended. */
                prepend_char(&b, 0);
                /*
                 * A positive status means pwd was not found under root;
                 * tag the result with "(unreachable)" (13 characters, no
                 * NUL) in front of the path.
                 */
                if (unlikely(prepend_path(&pwd, &root, &b) > 0))
                        prepend(&b, "(unreachable)", 13);
                rcu_read_unlock();

                /* Bytes consumed out of the PATH_MAX-sized buffer. */
                len = PATH_MAX - b.len;
                /*
                 * len is unsigned, so it can only exceed PATH_MAX if b.len
                 * went negative -- presumably how the prepend helpers
                 * signal that the path did not fit.
                 */
                if (unlikely(len > PATH_MAX))
                        error = -ENAMETOOLONG;
                else if (unlikely(len > size))
                        error = -ERANGE;
                else if (copy_to_user(buf, b.buf, len))
                        error = -EFAULT;
                else
                        error = len;
        }
        __putname(page);
        return error;
}





























































































































































































    2 
    8 




    7 


























































  315 
   81 





  316 
   81 








































































































































































































































































































































































































































































































   78 


   81 











  152 













   95 




   86 




   95 




   35 
























































































































   95 









   95 






































































    8 
   40 















































































































































































































































































































   48 




































   20 





















   17 
    8 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_H
#define _LINUX_HUGETLB_H

#include <linux/mm.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/fs.h>
#include <linux/hugetlb_inline.h>
#include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h>
#include <linux/kref.h>
#include <linux/pgtable.h>
#include <linux/gfp.h>
#include <linux/userfaultfd_k.h>

struct ctl_table;
struct user_struct;
struct mmu_gather;
struct node;

void free_huge_folio(struct folio *folio);

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/pagemap.h>
#include <linux/shm.h>
#include <asm/tlbflush.h>

/*
 * A HugeTLB page has more metadata than fits in its head struct page, so
 * the remaining metadata is stored in (i.e. we abuse) some of the tail
 * struct pages.
 */
#define __NR_USED_SUBPAGE 3

/*
 * Accounting state of one huge page subpool: optional maximum and minimum
 * page limits, the counters charged against them, and the hstate the
 * subpool is associated with.
 *
 * NOTE(review): @lock presumably protects the counters and @count looks
 * like a reference count (see hugepage_put_subpool()) -- neither is
 * visible in this header, confirm against the implementation.
 */
struct hugepage_subpool {
        spinlock_t lock;
        long count;
        long max_hpages;        /* Maximum huge pages or -1 if no maximum. */
        long used_hpages;        /* Used count against maximum, includes */
                                /* both allocated and reserved pages. */
        struct hstate *hstate;
        long min_hpages;        /* Minimum huge pages or -1 if no minimum. */
        long rsv_hpages;        /* Pages reserved against global pool to */
                                /* satisfy minimum size. */
};

/*
 * Reservation map: a kref-counted (@refs) container for region tracking.
 * The struct file_region entries described in the "Region tracking"
 * comment are embedded here and protected by @lock.
 *
 * NOTE(review): the exact roles of @adds_in_progress, @region_cache,
 * @region_cache_count and @rw_sema are not visible in this header; see
 * their users in the hugetlb implementation.
 */
struct resv_map {
        struct kref refs;
        spinlock_t lock;
        struct list_head regions;
        long adds_in_progress;
        struct list_head region_cache;
        long region_cache_count;
        struct rw_semaphore rw_sema;
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On private mappings, the counter to uncharge reservations is stored
         * here. If these fields are 0, then either the mapping is shared, or
         * cgroup accounting is disabled for this resv_map.
         */
        struct page_counter *reservation_counter;
        unsigned long pages_per_hpage;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Region tracking -- allows tracking of reservations and instantiated pages
 *                    across the pages in a mapping.
 *
 * The region data structures are embedded into a resv_map and protected
 * by a resv_map's lock.  The set of regions within the resv_map represent
 * reservations for huge pages, or huge pages that have already been
 * instantiated within the map.  The from and to elements are huge page
 * indices into the associated mapping.  from indicates the starting index
 * of the region.  to represents the first index past the end of the region.
 *
 * For example, a file region structure with from == 0 and to == 4 represents
 * four huge pages in a mapping.  It is important to note that the to element
 * represents the first element past the end of the region. This is used in
 * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
 *
 * Interval notation of the form [from, to) will be used to indicate that
 * the endpoint from is inclusive and to is exclusive.
 */
struct file_region {
        /* List linkage; presumably on one of the resv_map lists -- confirm. */
        struct list_head link;
        /* First huge page index of the region (inclusive). */
        long from;
        /* First huge page index past the region (exclusive). */
        long to;
#ifdef CONFIG_CGROUP_HUGETLB
        /*
         * On shared mappings, each reserved region appears as a struct
         * file_region in resv_map. These fields hold the info needed to
         * uncharge each reservation.
         */
        struct page_counter *reservation_counter;
        struct cgroup_subsys_state *css;
#endif
};

/*
 * Per-vma hugetlb lock object: a kref, a read/write semaphore and a
 * pointer to a vm_area_struct.
 *
 * NOTE(review): presumably released through hugetlb_vma_lock_release()
 * and taken by the hugetlb_vma_lock_*() helpers declared in this header;
 * the implementation is not visible here -- confirm.
 */
struct hugetlb_vma_lock {
        struct kref refs;
        struct rw_semaphore rw_sema;
        struct vm_area_struct *vma;
};

extern struct resv_map *resv_map_alloc(void);
void resv_map_release(struct kref *ref);

extern spinlock_t hugetlb_lock;
extern int hugetlb_max_hstate __read_mostly;
/*
 * Iterate @h over the entries hstates[0] .. hstates[hugetlb_max_hstate - 1].
 * @h must be an lvalue of pointer type; it is evaluated several times.
 */
#define for_each_hstate(h) \
        for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++)

struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages);
void hugepage_put_subpool(struct hugepage_subpool *spool);

void hugetlb_dup_vma_private(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int move_hugetlb_page_tables(struct vm_area_struct *vma,
                             struct vm_area_struct *new_vma,
                             unsigned long old_addr, unsigned long new_addr,
                             unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
                            struct vm_area_struct *, struct vm_area_struct *);
void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *,
                          zap_flags_t);
void __unmap_hugepage_range(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
                          struct page *ref_page, zap_flags_t zap_flags);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo_node(int nid);
unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr,
                             unsigned long src_addr,
                             uffd_flags_t flags,
                             struct folio **foliop);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
                                                struct vm_area_struct *vma,
                                                vm_flags_t vm_flags);
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison);
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                bool *migratable_cleared);
void folio_putback_hugetlb(struct folio *folio);
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason);
void hugetlb_fix_reserve_counts(struct inode *inode);
extern struct mutex *hugetlb_fault_mutex_table;
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx);

pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud);
bool hugetlbfs_pagecache_present(struct hstate *h,
                                 struct vm_area_struct *vma,
                                 unsigned long address);

struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio);

extern int sysctl_hugetlb_shm_group;
extern struct list_head huge_boot_pages[MAX_NUMNODES];

void hugetlb_bootmem_alloc(void);
bool hugetlb_bootmem_allocated(void);

/* arch callbacks */

#ifndef CONFIG_HIGHPTE
/*
 * pte_offset_huge() and pte_alloc_huge() are helpers for those architectures
 * which may go down to the lowest PTE level in their huge_pte_offset() and
 * huge_pte_alloc(): to avoid reliance on pte_offset_map() without pte_unmap().
 */
/* Look up the pte for @address under @pmd via pte_offset_kernel(). */
static inline pte_t *pte_offset_huge(pmd_t *pmd, unsigned long address)
{
        pte_t *ptep = pte_offset_kernel(pmd, address);

        return ptep;
}
/*
 * Run pte_alloc() for @pmd and, when it reports success (zero), return
 * the pte for @address; return NULL when pte_alloc() reports failure.
 */
static inline pte_t *pte_alloc_huge(struct mm_struct *mm, pmd_t *pmd,
                                    unsigned long address)
{
        if (pte_alloc(mm, pmd))
                return NULL;

        return pte_offset_huge(pmd, address);
}
#endif

pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz);
/*
 * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE.
 * Returns the pte_t* if found, or NULL if the address is not mapped.
 *
 * IMPORTANT: we should normally not directly call this function, instead
 * this is only a common interface to implement arch-specific
 * walker. Please use hugetlb_walk() instead, because that will attempt to
 * verify the locking for you.
 *
 * Since this function will walk all the pgtable pages (including not only
 * high-level pgtable pages, but also PUD entries that can be unshared
 * concurrently for VM_SHARED), the caller of this function is
 * responsible for its thread safety.  One can follow this rule:
 *
 *  (1) For private mappings: pmd unsharing is not possible, so holding the
 *      mmap_lock for either read or write is sufficient. Most callers
 *      already hold the mmap_lock, so normally, no special action is
 *      required.
 *
 *  (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged
 *      pgtable page can go away from under us!  It can be done by a pmd
 *      unshare with a follow up munmap() on the other process), then we
 *      need either:
 *
 *     (2.1) hugetlb vma lock read or write held, to make sure pmd unshare
 *           won't happen upon the range (it also makes sure the pte_t we
 *           read is the right and stable one), or,
 *
 *     (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make
 *           sure even if unshare happened the racy unmap() will wait until
 *           i_mmap_rwsem is released.
 *
 * Option (2.1) is the safest: it guarantees pte stability from the pmd
 * sharing point of view until the vma lock is released.  Option (2.2) doesn't
 * protect against a concurrent pmd unshare, but it makes sure the pgtable
 * page is safe to access.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz);
unsigned long hugetlb_mask_last_page(struct hstate *h);
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep);
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end);

extern void __hugetlb_zap_begin(struct vm_area_struct *vma,
                                unsigned long *begin, unsigned long *end);
extern void __hugetlb_zap_end(struct vm_area_struct *vma,
                              struct zap_details *details);

/*
 * Forward to __hugetlb_zap_begin() for hugetlb vmas; do nothing for any
 * other vma.
 */
static inline void hugetlb_zap_begin(struct vm_area_struct *vma,
                                     unsigned long *start, unsigned long *end)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        __hugetlb_zap_begin(vma, start, end);
}

/*
 * Forward to __hugetlb_zap_end() for hugetlb vmas; do nothing for any
 * other vma.
 */
static inline void hugetlb_zap_end(struct vm_area_struct *vma,
                                   struct zap_details *details)
{
        if (!is_vm_hugetlb_page(vma))
                return;

        __hugetlb_zap_end(vma, details);
}

void hugetlb_vma_lock_read(struct vm_area_struct *vma);
void hugetlb_vma_unlock_read(struct vm_area_struct *vma);
void hugetlb_vma_lock_write(struct vm_area_struct *vma);
void hugetlb_vma_unlock_write(struct vm_area_struct *vma);
int hugetlb_vma_trylock_write(struct vm_area_struct *vma);
void hugetlb_vma_assert_locked(struct vm_area_struct *vma);
void hugetlb_vma_lock_release(struct kref *kref);
long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot,
                unsigned long cp_flags);
bool is_hugetlb_entry_migration(pte_t pte);
bool is_hugetlb_entry_hwpoisoned(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

#else /* !CONFIG_HUGETLB_PAGE */

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: there are no huge pages, so the total is 0. */
static inline unsigned long hugetlb_total_pages(void)
{
        return 0UL;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct address_space *hugetlb_folio_mapping_lock_write(
                                                        struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing and always returns 0. */
static inline int huge_pmd_unshare(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: leaves *start and *end untouched. */
static inline void adjust_range_if_pmd_sharing_possible(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op; *start and *end are left untouched. */
static inline void hugetlb_zap_begin(
                                struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_zap_end(
                                struct vm_area_struct *vma,
                                struct zap_details *details)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: calls BUG(); the return statement only
 * satisfies the signature.
 * NOTE(review): presumably never reached without hugetlb support.
 */
static inline int copy_hugetlb_page_range(struct mm_struct *dst,
                                          struct mm_struct *src,
                                          struct vm_area_struct *dst_vma,
                                          struct vm_area_struct *src_vma)
{
        BUG();
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: calls BUG(); the return statement only
 * satisfies the signature.
 * NOTE(review): presumably never reached without hugetlb support.
 */
static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
                                           struct vm_area_struct *new_vma,
                                           unsigned long old_addr,
                                           unsigned long new_addr,
                                           unsigned long len)
{
        BUG();
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: writes nothing to @m. */
static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: writes nothing to @buf and always returns 0.
 */
static inline int hugetlb_report_node_meminfo(char *buf,
                                              int len,
                                              int nid)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void hugetlb_show_meminfo_node(int nid)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always fails with -EINVAL. */
static inline int prepare_hugepage_range(struct file *file,
                                unsigned long addr, unsigned long len)
{
        return -EINVAL;
}

/* !CONFIG_HUGETLB_PAGE stub: no lock is taken. */
static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no lock is released. */
static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no lock is taken. */
static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no lock is released. */
static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: takes no lock and always returns 1.
 * NOTE(review): presumably 1 is the "lock acquired" result of the real
 * implementation -- confirm.
 */
static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
        return 1;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing is checked. */
static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0 (no range is hugepage-only). */
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: calls BUG().
 * NOTE(review): presumably never reached without hugetlb support.
 */
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                                unsigned long addr, unsigned long end,
                                unsigned long floor, unsigned long ceiling)
{
        BUG();
}

#ifdef CONFIG_USERFAULTFD
/*
 * !CONFIG_HUGETLB_PAGE stub (only built with CONFIG_USERFAULTFD): calls
 * BUG(); the return statement only satisfies the signature.
 * NOTE(review): presumably never reached without hugetlb support.
 */
static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                                           struct vm_area_struct *dst_vma,
                                           unsigned long dst_addr,
                                           unsigned long src_addr,
                                           uffd_flags_t flags,
                                           struct folio **foliop)
{
        BUG();
        return 0;
}
#endif /* CONFIG_USERFAULTFD */

/* !CONFIG_HUGETLB_PAGE stub: no huge pte exists, always returns NULL. */
static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr,
                                        unsigned long sz)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: isolates nothing and always returns false. */
static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
        return false;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns 0 and does not write to
 * *hugetlb.
 */
static inline int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns 0 and does not write to
 * *migratable_cleared.
 */
static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                        bool *migratable_cleared)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void folio_putback_hugetlb(struct folio *folio)
{
}

/* !CONFIG_HUGETLB_PAGE stub: no-op. */
static inline void move_hugetlb_state(struct folio *old_folio,
                                        struct folio *new_folio, int reason)
{
}

/* !CONFIG_HUGETLB_PAGE stub: changes nothing and returns 0. */
static inline long hugetlb_change_protection(
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned long end, pgprot_t newprot,
                        unsigned long cp_flags)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: calls BUG() unconditionally. */
static inline void __unmap_hugepage_range(struct mmu_gather *tlb,
                        struct vm_area_struct *vma, unsigned long start,
                        unsigned long end, struct page *ref_page,
                        zap_flags_t zap_flags)
{
        BUG();
}

/* !CONFIG_HUGETLB_PAGE stub: calls BUG() and then returns 0. */
static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
                        struct vm_area_struct *vma, unsigned long address,
                        unsigned int flags)
{
        BUG();
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing. */
static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

#endif /* !CONFIG_HUGETLB_PAGE */

#ifndef pgd_write
/*
 * Fallback used when pgd_write is not already defined as a macro at this
 * point: calls BUG() and then returns 0; @pgd is not inspected.
 */
static inline int pgd_write(pgd_t pgd)
{
        BUG();
        return 0;
}
#endif

#define HUGETLB_ANON_FILE "anon_hugepage"

/*
 * Kinds of hugetlbfs inode, distinguished by whether shmfs accounting rules
 * apply.
 * NOTE(review): presumably passed as the creat_flags argument of
 * hugetlb_file_setup() -- confirm against callers.
 */
enum {
        /*
         * The file will be used as an shm file so shmfs accounting rules
         * apply
         */
        HUGETLB_SHMFS_INODE     = 1,
        /*
         * The file is being created on the internal vfs mount and shmfs
         * accounting rules do not apply
         */
        HUGETLB_ANONHUGE_INODE  = 2,
};

#ifdef CONFIG_HUGETLBFS
/*
 * Per-superblock hugetlbfs data.  HUGETLBFS_SB() retrieves it from
 * sb->s_fs_info.
 */
struct hugetlbfs_sb_info {
        long        max_inodes;   /* inodes allowed */
        long        free_inodes;  /* inodes free */
        spinlock_t        stat_lock;
        struct hstate *hstate;        /* returned by hstate_inode() for inodes on this sb */
        struct hugepage_subpool *spool;
        kuid_t        uid;
        kgid_t        gid;
        umode_t mode;
};

/* Return the hugetlbfs private data stored in @sb->s_fs_info. */
static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
{
        return sb->s_fs_info;
}

/*
 * hugetlbfs inode container: embeds the VFS inode so that HUGETLBFS_I() can
 * recover the container from a struct inode pointer via container_of().
 */
struct hugetlbfs_inode_info {
        struct inode vfs_inode;
        unsigned int seals;        /* NOTE(review): presumably file-seal bits -- confirm */
};

/*
 * Map a VFS inode back to the hugetlbfs_inode_info that embeds it as
 * vfs_inode.  @inode must actually be embedded in such a structure.
 */
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
{
        return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}

extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
                                int creat_flags, int page_size_log);

static inline bool is_file_hugepages(const struct file *file)
{
        return file->f_op->fop_flags & FOP_HUGE_PAGES;
}

static inline struct hstate *hstate_inode(struct inode *i)
{
        return HUGETLBFS_SB(i->i_sb)->hstate;
}
#else /* !CONFIG_HUGETLBFS */

/* !CONFIG_HUGETLBFS: no file is ever a hugetlbfs file. */
#define is_file_hugepages(file)                        false
/* !CONFIG_HUGETLBFS stub: always fails with ERR_PTR(-ENOSYS). */
static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
                int creat_flags, int page_size_log)
{
        return ERR_PTR(-ENOSYS);
}

/* !CONFIG_HUGETLBFS stub: always returns NULL. */
static inline struct hstate *hstate_inode(struct inode *i)
{
        return NULL;
}
#endif /* !CONFIG_HUGETLBFS */

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                    unsigned long len, unsigned long pgoff,
                                    unsigned long flags);

/*
 * hugetlb page specific state flags.  These flags are located in page.private
 * of the hugetlb head page.  Functions created via the below macros should be
 * used to manipulate these flags.
 *
 * HPG_restore_reserve - Set when a hugetlb page consumes a reservation at
 *        allocation time.  Cleared when page is fully instantiated.  Free
 *        routine checks flag to restore a reservation on error paths.
 *        Synchronization:  Examined or modified by code that knows it has
 *        the only reference to page.  i.e. After allocation but before use
 *        or when the page is being freed.
 * HPG_migratable  - Set after a newly allocated page is added to the page
 *        cache and/or page tables.  Indicates the page is a candidate for
 *        migration.
 *        Synchronization:  Initially set after new page allocation with no
 *        locking.  When examined and modified during migration processing
 *        (isolate, migrate, putback) the hugetlb_lock is held.
 * HPG_temporary - Set on a page that is temporarily allocated from the buddy
 *        allocator.  Typically used for migration target pages when no pages
 *        are available in the pool.  The hugetlb free page path will
 *        immediately free pages with this flag set to the buddy allocator.
 *        Synchronization: Can be set after huge page allocation from buddy when
 *        code knows it has only reference.  All other examinations and
 *        modifications require hugetlb_lock.
 * HPG_freed - Set when page is on the free lists.
 *        Synchronization: hugetlb_lock held for examination and modification.
 * HPG_vmemmap_optimized - Set when the vmemmap pages of the page are freed.
 * HPG_raw_hwp_unreliable - Set when the hugetlb page has a hwpoison sub-page
 *     that is not tracked by raw_hwp_page list.
 */
/*
 * Bit numbers used by the folio_{test,set,clear}_hugetlb_*() helpers
 * generated by HPAGEFLAG().  __NR_HPAGEFLAGS is the number of flags defined.
 */
enum hugetlb_page_flags {
        HPG_restore_reserve = 0,
        HPG_migratable,
        HPG_temporary,
        HPG_freed,
        HPG_vmemmap_optimized,
        HPG_raw_hwp_unreliable,
        /* NOTE(review): presumably set on folios allocated from CMA -- confirm */
        HPG_cma,
        __NR_HPAGEFLAGS,
};

/*
 * Macros to create test, set and clear function definitions for
 * hugetlb specific page flags.
 */
#ifdef CONFIG_HUGETLB_PAGE
/*
 * TESTHPAGEFLAG - define folio_test_hugetlb_<flname>(), which tests bit
 * HPG_<flname> in the word at &folio->private.  @uname is not used.
 */
#define TESTHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
bool folio_test_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                return test_bit(HPG_##flname, private);                \
        }

/*
 * SETHPAGEFLAG - define folio_set_hugetlb_<flname>(), which sets bit
 * HPG_<flname> in the word at &folio->private with set_bit().  @uname is
 * not used.
 */
#define SETHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
void folio_set_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                set_bit(HPG_##flname, private);                        \
        }

/*
 * CLEARHPAGEFLAG - define folio_clear_hugetlb_<flname>(), which clears bit
 * HPG_<flname> in the word at &folio->private with clear_bit().  @uname is
 * not used.
 */
#define CLEARHPAGEFLAG(uname, flname)                                \
static __always_inline                                                \
void folio_clear_hugetlb_##flname(struct folio *folio)                \
        {        void *private = &folio->private;                \
                clear_bit(HPG_##flname, private);                \
        }
#else
/*
 * !CONFIG_HUGETLB_PAGE variants: the generated test helper always returns 0
 * and the set/clear helpers do nothing.
 */
#define TESTHPAGEFLAG(uname, flname)                                \
static inline bool                                                \
folio_test_hugetlb_##flname(struct folio *folio)                \
        { return 0; }

#define SETHPAGEFLAG(uname, flname)                                \
static inline void                                                \
folio_set_hugetlb_##flname(struct folio *folio)                 \
        { }

#define CLEARHPAGEFLAG(uname, flname)                                \
static inline void                                                \
folio_clear_hugetlb_##flname(struct folio *folio)                \
        { }
#endif

/*
 * HPAGEFLAG - emit the test, set and clear helpers for one hugetlb flag by
 * expanding TESTHPAGEFLAG, SETHPAGEFLAG and CLEARHPAGEFLAG in turn.
 */
#define HPAGEFLAG(uname, flname)                                \
        TESTHPAGEFLAG(uname, flname)                                \
        SETHPAGEFLAG(uname, flname)                                \
        CLEARHPAGEFLAG(uname, flname)                                \

/*
 * Create functions associated with hugetlb page flags
 */
HPAGEFLAG(RestoreReserve, restore_reserve)
HPAGEFLAG(Migratable, migratable)
HPAGEFLAG(Temporary, temporary)
HPAGEFLAG(Freed, freed)
HPAGEFLAG(VmemmapOptimized, vmemmap_optimized)
HPAGEFLAG(RawHwpUnreliable, raw_hwp_unreliable)
HPAGEFLAG(Cma, cma)

#ifdef CONFIG_HUGETLB_PAGE

#define HSTATE_NAME_LEN 32
/* Defines one hugetlb page size */
struct hstate {
        struct mutex resize_lock;
        struct lock_class_key resize_key;
        int next_nid_to_alloc;
        int next_nid_to_free;
        unsigned int order;        /* huge page size is PAGE_SIZE << order */
        unsigned int demote_order;
        unsigned long mask;        /* value returned by huge_page_mask() */
        unsigned long max_huge_pages;
        unsigned long nr_huge_pages;
        unsigned long free_huge_pages;
        unsigned long resv_huge_pages;
        unsigned long surplus_huge_pages;
        unsigned long nr_overcommit_huge_pages;
        struct list_head hugepage_activelist;
        /* The arrays below have one entry per possible node (MAX_NUMNODES). */
        struct list_head hugepage_freelists[MAX_NUMNODES];
        unsigned int max_huge_pages_node[MAX_NUMNODES];
        unsigned int nr_huge_pages_node[MAX_NUMNODES];
        unsigned int free_huge_pages_node[MAX_NUMNODES];
        unsigned int surplus_huge_pages_node[MAX_NUMNODES];
        char name[HSTATE_NAME_LEN];
};

struct cma;

/*
 * List entry that ties a boot-time huge page to its hstate.
 * NOTE(review): @flags presumably holds the HUGE_BOOTMEM_* bits and @cma the
 * CMA area used when HUGE_BOOTMEM_CMA is set -- confirm against the users.
 */
struct huge_bootmem_page {
        struct list_head list;
        struct hstate *hstate;
        unsigned long flags;
        struct cma *cma;
};

#define HUGE_BOOTMEM_HVO                0x0001
#define HUGE_BOOTMEM_ZONES_VALID        0x0002
#define HUGE_BOOTMEM_CMA                0x0004

bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m);

int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list);
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn);
void wait_for_freed_hugetlb_folios(void);
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                unsigned long addr, bool cow_from_owner);
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                                nodemask_t *nmask, gfp_t gfp_mask,
                                bool allow_alloc_fallback);
struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
                                          nodemask_t *nmask, gfp_t gfp_mask);

int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
                        pgoff_t idx);
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
                                unsigned long address, struct folio *folio);

/* arch callback */
int __init __alloc_bootmem_huge_page(struct hstate *h, int nid);
int __init alloc_bootmem_huge_page(struct hstate *h, int nid);
bool __init hugetlb_node_alloc_supported(void);

void __init hugetlb_add_hstate(unsigned order);
bool __init arch_hugetlb_valid_size(unsigned long size);
struct hstate *size_to_hstate(unsigned long size);

#ifndef HUGE_MAX_HSTATE
#define HUGE_MAX_HSTATE 1
#endif

extern struct hstate hstates[HUGE_MAX_HSTATE];
extern unsigned int default_hstate_idx;

#define default_hstate (hstates[default_hstate_idx])

/* Return the subpool pointer stored in @folio->_hugetlb_subpool. */
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
        return folio->_hugetlb_subpool;
}

/* Store @subpool in @folio->_hugetlb_subpool. */
static inline void hugetlb_set_folio_subpool(struct folio *folio,
                                        struct hugepage_subpool *subpool)
{
        folio->_hugetlb_subpool = subpool;
}

/* Return the hstate of the inode that backs file @f. */
static inline struct hstate *hstate_file(struct file *f)
{
        struct inode *inode = file_inode(f);

        return hstate_inode(inode);
}

/**
 * hstate_sizelog - look up the hstate for a log2-encoded huge page size
 * @page_size_log: log2 of the requested huge page size in bytes, or 0 to
 *                 select the default hstate.
 *
 * Return: &default_hstate when @page_size_log is 0, the result of
 * size_to_hstate(1UL << @page_size_log) when the shift count is valid, and
 * NULL when @page_size_log is negative or not below BITS_PER_LONG.
 */
static inline struct hstate *hstate_sizelog(int page_size_log)
{
        if (!page_size_log)
                return &default_hstate;

        /* A negative or >= BITS_PER_LONG shift count is undefined behaviour. */
        if (page_size_log < 0 || page_size_log >= BITS_PER_LONG)
                return NULL;

        return size_to_hstate(1UL << page_size_log);
}

/*
 * Return the hstate of the file backing @vma.  vma->vm_file is passed to
 * hstate_file() without a NULL check, so the VMA must be file-backed.
 */
static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return hstate_file(vma->vm_file);
}

/* Size in bytes of one huge page of @h: PAGE_SIZE shifted left by h->order. */
static inline unsigned long huge_page_size(const struct hstate *h)
{
        return (unsigned long)PAGE_SIZE << h->order;
}

extern unsigned long vma_kernel_pagesize(struct vm_area_struct *vma);

extern unsigned long vma_mmu_pagesize(struct vm_area_struct *vma);

/* Return the mask stored in @h->mask. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return h->mask;
}

/* Return the page order stored in @h->order. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return h->order;
}

/* Return log2 of the huge page size in bytes: h->order + PAGE_SHIFT. */
static inline unsigned huge_page_shift(struct hstate *h)
{
        return h->order + PAGE_SHIFT;
}

/* A huge page size is "gigantic" when its order exceeds MAX_PAGE_ORDER. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return huge_page_order(h) > MAX_PAGE_ORDER;
}

/**
 * pages_per_huge_page - number of base pages in one huge page of @h
 * @h: hstate describing the huge page size.
 *
 * Return: 2^(h->order).  The shift is done on an unsigned constant so that
 * the result type matches the return type and the top bit cannot overflow a
 * signed int.
 */
static inline unsigned int pages_per_huge_page(const struct hstate *h)
{
        return 1U << h->order;
}

/*
 * Huge page size of @h divided by 512.
 * NOTE(review): 512 is presumably the byte size of one block in the unit
 * used for block counts -- confirm against callers.
 */
static inline unsigned int blocks_per_huge_page(struct hstate *h)
{
        return huge_page_size(h) / 512;
}

/*
 * Look up and lock the folio for huge-page index @idx in @mapping.  @idx is
 * shifted left by the huge page order before being handed to
 * filemap_lock_folio(), whose return value is passed through unchanged.
 */
static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
                                struct address_space *mapping, pgoff_t idx)
{
        return filemap_lock_folio(mapping, idx << huge_page_order(h));
}

#include <asm/hugetlb.h>

#ifndef is_hugepage_only_range
/*
 * Default used when <asm/hugetlb.h> does not define is_hugepage_only_range:
 * returns 0 for every range.
 */
static inline int is_hugepage_only_range(struct mm_struct *mm,
                                        unsigned long addr, unsigned long len)
{
        return 0;
}
#define is_hugepage_only_range is_hugepage_only_range
#endif

#ifndef arch_clear_hugetlb_flags
/* Default when the architecture provides no override: does nothing. */
static inline void arch_clear_hugetlb_flags(struct folio *folio) { }
#define arch_clear_hugetlb_flags arch_clear_hugetlb_flags
#endif

#ifndef arch_make_huge_pte
/*
 * Default when the architecture does not define arch_make_huge_pte: returns
 * pte_mkhuge(@entry).  @shift and @flags are ignored here.
 */
static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift,
                                       vm_flags_t flags)
{
        return pte_mkhuge(entry);
}
#endif

#ifndef arch_has_huge_bootmem_alloc
/*
 * Some architectures do their own bootmem allocation, so they can't use
 * early CMA allocation.
 *
 * This is the default used when the architecture does not define
 * arch_has_huge_bootmem_alloc: it always returns false.
 */
static inline bool arch_has_huge_bootmem_alloc(void)
{
        return false;
}
#endif

/*
 * Return the hstate whose huge page size equals the size of @folio.
 * VM_BUG_ON_FOLIO() is applied to folios that are not hugetlb folios.
 */
static inline struct hstate *folio_hstate(struct folio *folio)
{
        unsigned long size;

        VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        size = folio_size(folio);

        return size_to_hstate(size);
}

/*
 * Return log2 of the huge page size in bytes for hstates[@index].  @index is
 * not range-checked here.
 */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return hstates[index].order + PAGE_SHIFT;
}

/* Return the position of @h within the hstates[] array. */
static inline int hstate_index(struct hstate *h)
{
        return h - hstates;
}

int dissolve_free_hugetlb_folio(struct folio *folio);
int dissolve_free_hugetlb_folios(unsigned long start_pfn,
                                    unsigned long end_pfn);

#ifdef CONFIG_MEMORY_FAILURE
extern void folio_clear_hugetlb_hwpoison(struct folio *folio);
#else
/* !CONFIG_MEMORY_FAILURE stub: does nothing. */
static inline void folio_clear_hugetlb_hwpoison(struct folio *folio)
{
}
#endif

#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
#ifndef arch_hugetlb_migration_supported
/*
 * Default migration check: a huge page size is treated as migratable only
 * when its shift equals PMD_SHIFT, PUD_SHIFT or PGDIR_SHIFT.
 */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        const unsigned int shift = huge_page_shift(h);

        return shift == PMD_SHIFT || shift == PUD_SHIFT ||
               shift == PGDIR_SHIFT;
}
#endif
#else
/* !CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION: migration is never supported. */
static inline bool arch_hugetlb_migration_supported(struct hstate *h)
{
        return false;
}
#endif

/*
 * Whether huge pages of @h can be migrated; defers entirely to
 * arch_hugetlb_migration_supported().
 */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return arch_hugetlb_migration_supported(h);
}

/*
 * Movability check is different as compared to migration check.
 * It determines whether or not a huge page should be placed on
 * movable zone or not. Movability of any huge page should be
 * required only if huge page size is supported for migration.
 * There won't be any reason for the huge page to be movable if
 * it is not migratable to start with. Also the size of the huge
 * page should be large enough to be placed under a movable zone
 * and still feasible enough to be migratable. Just the presence
 * in movable zone does not make the migration feasible.
 *
 * So even though large huge page sizes like the gigantic ones
 * are migratable they should not be movable because it's not
 * feasible to migrate them from movable zone.
 */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        /* Movable means: migratable and not a gigantic page size. */
        return hugepage_migration_supported(h) && !hstate_is_gigantic(h);
}

/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        /*
         * __GFP_COMP and __GFP_NOWARN are always set; the base mask is
         * GFP_HIGHUSER_MOVABLE only when @h supports movable huge pages,
         * and GFP_HIGHUSER otherwise.
         */
        if (hugepage_movable_supported(h))
                return GFP_HIGHUSER_MOVABLE | __GFP_COMP | __GFP_NOWARN;

        return GFP_HIGHUSER | __GFP_COMP | __GFP_NOWARN;
}

/*
 * Build the allocation mask for @h and carry over only the __GFP_THISNODE
 * and __GFP_NOWARN bits from the caller's @gfp_mask; all other caller bits
 * are dropped.
 */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        /* Some callers might want to enforce the node, or silence warnings. */
        const gfp_t inherited = gfp_mask & (__GFP_THISNODE | __GFP_NOWARN);

        return htlb_alloc_mask(h) | inherited;
}

/*
 * Decide whether an allocation made for migration @reason may fall back to
 * other nodes.
 *
 * Memory offline, memory failure and the migration syscalls are allowed to
 * fall back for lack of a better choice, even though that might break the
 * per-node hugetlb pool.  All other reasons are expected to set
 * __GFP_THISNODE to avoid breaking the per-node hugetlb pool.
 */
static inline bool htlb_allow_alloc_fallback(int reason)
{
        switch (reason) {
        case MR_MEMORY_HOTPLUG:
        case MR_MEMORY_FAILURE:
        case MR_SYSCALL:
        case MR_MEMPOLICY_MBIND:
                return true;
        default:
                return false;
        }
}

/*
 * Return the page table lock that protects the hugetlb entry at @pte for
 * huge page size @h in @mm.  The lock is selected by huge page size so that
 * it matches the lock core-mm page table walkers would take.
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        const unsigned long sz = huge_page_size(h);

        VM_WARN_ON(sz == PAGE_SIZE);

        /*
         * hugetlb has to use exactly the PT locks that core-mm page table
         * walkers use: the PTE PT lock when a PTE table is modified, the
         * PMD PT lock when a PMD table is modified, and so on.
         *
         * This relies on every hugetlb folio smaller than a PMD being mapped
         * into a single PTE table, and every hugetlb folio smaller than a
         * PUD (but at least PMD sized) being mapped into a single PMD table.
         *
         * An architecture for which that does not hold must disable split
         * PT locks, so that all *_lockptr() functions return the same
         * per-MM PT lock.
         *
         * With e.g. CONFIG_PGTABLE_LEVELS=2, where
         * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, pud_lockptr() is used
         * here while core-mm would use pmd_lockptr().  Split PMD locks are
         * disabled in such configurations -- they make no sense on a single
         * PGDIR page table -- so the result is the same.
         */
        if (sz >= PUD_SIZE)
                return pud_lockptr(mm, (pud_t *) pte);

        if (sz >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE))
                return pmd_lockptr(mm, (pmd_t *) pte);

        /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */
        return ptep_lockptr(mm, pte);
}

#ifndef hugepages_supported
/*
 * Some platform decide whether they support huge pages at boot
 * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
 * when there is no such support
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif

void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm);

/* Reset the atomic counter @mm->hugetlb_usage to 0. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
        atomic_long_set(&mm->hugetlb_usage, 0);
}

/* Atomically add @l to @mm->hugetlb_usage. */
static inline void hugetlb_count_add(long l, struct mm_struct *mm)
{
        atomic_long_add(l, &mm->hugetlb_usage);
}

/* Atomically subtract @l from @mm->hugetlb_usage. */
static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
        atomic_long_sub(l, &mm->hugetlb_usage);
}

#ifndef huge_ptep_modify_prot_start
#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start
/*
 * Generic start of a protection change on a huge PTE: fetch and clear the
 * entry at @ptep, using the huge page size of @vma, and return the old value.
 */
static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep)
{
        struct hstate *h = hstate_vma(vma);

        return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep,
                                       huge_page_size(h));
}
#endif

#ifndef huge_ptep_modify_prot_commit
#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit
/*
 * Generic commit of a protection change on a huge PTE: install @pte at
 * @ptep using the huge page size of @vma.  @old_pte is not used here.
 */
static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma,
                                                unsigned long addr, pte_t *ptep,
                                                pte_t old_pte, pte_t pte)
{
        struct hstate *h = hstate_vma(vma);

        set_huge_pte_at(vma->vm_mm, addr, ptep, pte, huge_page_size(h));
}
#endif

#ifdef CONFIG_NUMA
void hugetlb_register_node(struct node *node);
void hugetlb_unregister_node(struct node *node);
#endif

/*
 * Check if a given raw @page in a hugepage is HWPOISON.
 */
bool is_raw_hwpoison_page_in_hugepage(struct page *page);

/*
 * Return PAGE_MASK & ~huge_page_mask() for the hstate of @file.
 * NOTE(review): assuming h->mask is ~(huge_page_size - 1), this is the set of
 * page-aligned offset bits inside one huge page -- confirm.
 */
static inline unsigned long huge_page_mask_align(struct file *file)
{
        return PAGE_MASK & ~huge_page_mask(hstate_file(file));
}

#else        /* CONFIG_HUGETLB_PAGE */
/* !CONFIG_HUGETLB_PAGE: hstate is an empty placeholder type. */
struct hstate {};

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline unsigned long huge_page_mask_align(struct file *file)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns NULL. */
static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h,
                                struct address_space *mapping, pgoff_t idx)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: always fails with -ENOMEM. */
static inline int isolate_or_dissolve_huge_page(struct page *page,
                                                struct list_head *list)
{
        return -ENOMEM;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline int replace_free_hugepage_folios(unsigned long start_pfn,
                unsigned long end_pfn)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing. */
static inline void wait_for_freed_hugetlb_folios(void)
{
}

/* !CONFIG_HUGETLB_PAGE stub: never allocates; always returns NULL. */
static inline struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           bool cow_from_owner)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: never allocates; always returns NULL. */
static inline struct folio *
alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
                            nodemask_t *nmask, gfp_t gfp_mask)
{
        return NULL;
}

/* !CONFIG_HUGETLB_PAGE stub: never allocates; always returns NULL. */
static inline struct folio *
alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                        nodemask_t *nmask, gfp_t gfp_mask,
                        bool allow_alloc_fallback)
{
        return NULL;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns 0.
 * NOTE(review): the CONFIG_HUGETLB_PAGE prototype of this function takes
 * (struct hstate *h, int nid) while this stub takes only @h -- confirm
 * whether the mismatch is intentional.
 */
static inline int __alloc_bootmem_huge_page(struct hstate *h)
{
        return 0;
}

/*
 * !CONFIG_HUGETLB_PAGE stubs: every hstate lookup below returns NULL,
 * whatever its argument.
 */
static inline struct hstate *hstate_file(struct file *f)
{
        return NULL;
}

static inline struct hstate *hstate_sizelog(int page_size_log)
{
        return NULL;
}

static inline struct hstate *hstate_vma(struct vm_area_struct *vma)
{
        return NULL;
}

static inline struct hstate *folio_hstate(struct folio *folio)
{
        return NULL;
}

static inline struct hstate *size_to_hstate(unsigned long size)
{
        return NULL;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: every "huge" page is one base page, so this
 * returns PAGE_SIZE.  @h is not dereferenced; it is const-qualified to match
 * the CONFIG_HUGETLB_PAGE version so that callers holding a const hstate
 * pointer build in both configurations.
 */
static inline unsigned long huge_page_size(const struct hstate *h)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns PAGE_MASK. */
static inline unsigned long huge_page_mask(struct hstate *h)
{
        return PAGE_MASK;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns PAGE_SIZE. */
static inline unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns PAGE_SIZE. */
static inline unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        return PAGE_SIZE;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns order 0. */
static inline unsigned int huge_page_order(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns PAGE_SHIFT. */
static inline unsigned int huge_page_shift(struct hstate *h)
{
        return PAGE_SHIFT;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hstate_is_gigantic(struct hstate *h)
{
        return false;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns 1.  @h is not dereferenced; it
 * is const-qualified to match the CONFIG_HUGETLB_PAGE version so that
 * callers holding a const hstate pointer build in both configurations.
 */
static inline unsigned int pages_per_huge_page(const struct hstate *h)
{
        return 1;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline unsigned hstate_index_to_shift(unsigned index)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns 0. */
static inline int hstate_index(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to dissolve; always returns 0. */
static inline int dissolve_free_hugetlb_folio(struct folio *folio)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: nothing to dissolve; always returns 0. */
static inline int dissolve_free_hugetlb_folios(unsigned long start_pfn,
                                           unsigned long end_pfn)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hugepage_migration_supported(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hugepage_movable_supported(struct hstate *h)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns an empty (0) mask. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: ignores @gfp_mask and returns 0. */
static inline gfp_t htlb_modify_alloc_mask(struct hstate *h, gfp_t gfp_mask)
{
        return 0;
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool htlb_allow_alloc_fallback(int reason)
{
        return false;
}

/*
 * !CONFIG_HUGETLB_PAGE stub: always returns the per-mm lock
 * &mm->page_table_lock; @h and @pte are ignored.
 */
static inline spinlock_t *huge_pte_lockptr(struct hstate *h,
                                           struct mm_struct *mm, pte_t *pte)
{
        return &mm->page_table_lock;
}

/* !CONFIG_HUGETLB_PAGE stubs: the usage accounting helpers below do nothing. */
static inline void hugetlb_count_init(struct mm_struct *mm)
{
}

static inline void hugetlb_report_usage(struct seq_file *f, struct mm_struct *m)
{
}

static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
}

/*
 * !CONFIG_HUGETLB_PAGE stub: returns the entry currently stored at @ptep
 * (read with ptep_get() under CONFIG_MMU, by plain dereference otherwise).
 * Nothing is cleared or flushed here.
 */
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep)
{
#ifdef CONFIG_MMU
        return ptep_get(ptep);
#else
        return *ptep;
#endif
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing; @ptep is not written. */
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                                   pte_t *ptep, pte_t pte, unsigned long sz)
{
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing. */
static inline void hugetlb_register_node(struct node *node)
{
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing. */
static inline void hugetlb_unregister_node(struct node *node)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hugetlbfs_pagecache_present(
    struct hstate *h, struct vm_area_struct *vma, unsigned long address)
{
        return false;
}

/* !CONFIG_HUGETLB_PAGE stub: does nothing. */
static inline void hugetlb_bootmem_alloc(void)
{
}

/* !CONFIG_HUGETLB_PAGE stub: always returns false. */
static inline bool hugetlb_bootmem_allocated(void)
{
        return false;
}
#endif        /* CONFIG_HUGETLB_PAGE */

/*
 * Look up the page table lock for @pte with huge_pte_lockptr(), acquire it
 * and return it.  The lock is held on return; the caller releases it.
 */
static inline spinlock_t *huge_pte_lock(struct hstate *h,
                                        struct mm_struct *mm, pte_t *pte)
{
        spinlock_t *lock = huge_pte_lockptr(h, mm, pte);

        spin_lock(lock);
        return lock;
}

#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
extern void __init hugetlb_cma_reserve(int order);
#else
/* Stub used unless both CONFIG_HUGETLB_PAGE and CONFIG_CMA are set: no-op. */
static inline __init void hugetlb_cma_reserve(int order)
{
}
#endif

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
/*
 * Return true when the page that contains @pte has a page_count() greater
 * than 1.
 */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        return page_count(virt_to_page(pte)) > 1;
}
#else
/* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING stub: always returns false. */
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
        return false;
}
#endif

bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);

#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
/*
 * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
 * implement this.
 */
#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#endif

static inline bool __vma_shareable_lock(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data;
}

bool __vma_private_lock(struct vm_area_struct *vma);

/*
 * Safe version of huge_pte_offset() to check the locks.  See comments
 * above huge_pte_offset().
 */
static inline pte_t *
hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz)
{
#if defined(CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING) && defined(CONFIG_LOCKDEP)
        /*
         * When pmd sharing is possible, the hugetlb page tables may only be
         * walked with a lock held; see the comment above huge_pte_offset()
         * for details.  Warn once if neither the hugetlb vma lock nor the
         * mapping's i_mmap_rwsem is held.
         *
         * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP.
         */
        if (__vma_shareable_lock(vma)) {
                struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

                WARN_ON_ONCE(!(lockdep_is_held(&vma_lock->rw_sema) ||
                               lockdep_is_held(
                                   &vma->vm_file->f_mapping->i_mmap_rwsem)));
        }
#endif
        return huge_pte_offset(vma->vm_mm, addr, sz);
}

#endif /* _LINUX_HUGETLB_H */




































































































































































 1259 






 1260 

 1255 

 1253 



    1 



 1261 



































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/domain.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"

#include <linux/binfmts.h>
#include <linux/slab.h>
#include <linux/rculist.h>

/* Variable definitions. */

/*
 * The initial domain. tomoyo_find_next_domain() in this file compares the
 * current domain against this object to decide whether a process is still
 * running in the kernel domain.
 */
struct tomoyo_domain_info tomoyo_kernel_domain;

/**
 * tomoyo_update_policy - Update an entry for exception policy.
 *
 * @new_entry:       Pointer to "struct tomoyo_acl_info".
 * @size:            Size of @new_entry in bytes.
 * @param:           Pointer to "struct tomoyo_acl_param".
 * @check_duplicate: Callback function to find duplicated entry.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_update_policy(struct tomoyo_acl_head *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)(const struct tomoyo_acl_head
                                                 *,
                                                 const struct tomoyo_acl_head
                                                 *))
{
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        struct tomoyo_acl_head *entry;
        struct list_head *list = param->list;

        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                return -ENOMEM;
        list_for_each_entry_rcu(entry, list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS)
                        continue;
                if (!check_duplicate(entry, new_entry))
                        continue;
                entry->is_deleted = param->is_delete;
                error = 0;
                break;
        }
        if (error && !param->is_delete) {
                entry = tomoyo_commit_ok(new_entry, size);
                if (entry) {
                        list_add_tail_rcu(&entry->list, list);
                        error = 0;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
        return error;
}

/**
 * tomoyo_same_acl_head - Compare the common part of two ACL entries.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a and @b have the same type and the same condition
 * pointer, false otherwise.
 */
static inline bool tomoyo_same_acl_head(const struct tomoyo_acl_info *a,
                                        const struct tomoyo_acl_info *b)
{
        if (a->type != b->type)
                return false;
        return a->cond == b->cond;
}

/**
 * tomoyo_update_domain - Update an entry for domain policy.
 *
 * @new_entry:       Pointer to "struct tomoyo_acl_info".
 * @size:            Size of @new_entry in bytes.
 * @param:           Pointer to "struct tomoyo_acl_param".
 * @check_duplicate: Callback function to find duplicated entry.
 * @merge_duplicate: Callback function to merge duplicated entry.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_update_domain(struct tomoyo_acl_info *new_entry, const int size,
                         struct tomoyo_acl_param *param,
                         bool (*check_duplicate)(const struct tomoyo_acl_info
                                                 *,
                                                 const struct tomoyo_acl_info
                                                 *),
                         bool (*merge_duplicate)(struct tomoyo_acl_info *,
                                                 struct tomoyo_acl_info *,
                                                 const bool))
{
        const bool is_delete = param->is_delete;
        int error = is_delete ? -ENOENT : -ENOMEM;
        struct tomoyo_acl_info *entry;
        struct list_head * const list = param->list;

        if (param->data[0]) {
                new_entry->cond = tomoyo_get_condition(param);
                if (!new_entry->cond)
                        return -EINVAL;
                /*
                 * Domain transition preference is allowed for only
                 * "file execute" entries.
                 */
                if (new_entry->cond->transit &&
                    !(new_entry->type == TOMOYO_TYPE_PATH_ACL &&
                      container_of(new_entry, struct tomoyo_path_acl, head)
                      ->perm == 1 << TOMOYO_TYPE_EXECUTE))
                        goto out;
        }
        if (mutex_lock_interruptible(&tomoyo_policy_lock))
                goto out;
        list_for_each_entry_rcu(entry, list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (entry->is_deleted == TOMOYO_GC_IN_PROGRESS)
                        continue;
                if (!tomoyo_same_acl_head(entry, new_entry) ||
                    !check_duplicate(entry, new_entry))
                        continue;
                if (merge_duplicate)
                        entry->is_deleted = merge_duplicate(entry, new_entry,
                                                            is_delete);
                else
                        entry->is_deleted = is_delete;
                error = 0;
                break;
        }
        if (error && !is_delete) {
                entry = tomoyo_commit_ok(new_entry, size);
                if (entry) {
                        list_add_tail_rcu(&entry->list, list);
                        error = 0;
                }
        }
        mutex_unlock(&tomoyo_policy_lock);
out:
        tomoyo_put_condition(new_entry->cond);
        return error;
}

/**
 * tomoyo_check_acl - Do permission check.
 *
 * @r:           Pointer to "struct tomoyo_request_info".
 * @check_entry: Callback function to check type specific parameters.
 *
 * Searches the ACL list of @r->domain first, then each "acl_group" list of
 * the domain's namespace whose bit is set in the domain's group bitmap.
 * An entry matches when it is not marked deleted, its type equals
 * @r->param_type, @check_entry accepts it and tomoyo_condition() accepts
 * its condition.
 *
 * Returns nothing. The result is stored in @r: on a match @r->matched_acl
 * points to the entry and @r->granted is true, otherwise @r->granted is
 * false.
 *
 * Caller holds tomoyo_read_lock().
 */
void tomoyo_check_acl(struct tomoyo_request_info *r,
                      bool (*check_entry)(struct tomoyo_request_info *,
                                          const struct tomoyo_acl_info *))
{
        const struct tomoyo_domain_info *domain = r->domain;
        struct tomoyo_acl_info *ptr;
        /* Start with the domain's own list; later passes use group lists. */
        const struct list_head *list = &domain->acl_info_list;
        /* Index of the next acl_group to try; keeps its value across retry. */
        u16 i = 0;

retry:
        list_for_each_entry_rcu(ptr, list, list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (ptr->is_deleted || ptr->type != r->param_type)
                        continue;
                if (!check_entry(r, ptr))
                        continue;
                if (!tomoyo_condition(r, ptr->cond))
                        continue;
                /* First matching entry wins. */
                r->matched_acl = ptr;
                r->granted = true;
                return;
        }
        /* Nothing matched in the current list: try the next enabled group. */
        for (; i < TOMOYO_MAX_ACL_GROUPS; i++) {
                if (!test_bit(i, domain->group))
                        continue;
                /*
                 * Post-increment here because "goto retry" leaves this loop
                 * without executing its own i++.
                 */
                list = &domain->ns->acl_group[i++];
                goto retry;
        }
        r->granted = false;
}

/* The list for "struct tomoyo_domain_info". */
LIST_HEAD(tomoyo_domain_list);

/**
 * tomoyo_last_word - Get last component of a domainname.
 *
 * @name: Domainname to check.
 *
 * Returns a pointer to the text following the last ' ' in @name, or @name
 * itself if it contains no ' '.
 */
static const char *tomoyo_last_word(const char *name)
{
        const char *last_space = strrchr(name, ' ');

        return last_space ? last_space + 1 : name;
}

/**
 * tomoyo_same_transition_control - Check for duplicated "struct tomoyo_transition_control" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Both arguments must be the "head" member of a
 * "struct tomoyo_transition_control".
 *
 * Returns true if type, is_last_name, domainname pointer and program pointer
 * of the two entries are all equal, false otherwise.
 */
static bool tomoyo_same_transition_control(const struct tomoyo_acl_head *a,
                                           const struct tomoyo_acl_head *b)
{
        const struct tomoyo_transition_control *lhs =
                container_of(a, typeof(*lhs), head);
        const struct tomoyo_transition_control *rhs =
                container_of(b, typeof(*rhs), head);

        if (lhs->type != rhs->type)
                return false;
        if (lhs->is_last_name != rhs->is_last_name)
                return false;
        /* Names are compared by pointer, not by content. */
        if (lhs->domainname != rhs->domainname)
                return false;
        return lhs->program == rhs->program;
}

/**
 * tomoyo_write_transition_control - Write "struct tomoyo_transition_control" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @type:  Type of this entry.
 *
 * Parses @param->data, which has the form "program from domainname" or a
 * single word. The buffer is modified in place: the " from " separator is
 * cut with a NUL byte. For the keep/no_keep types a single word is taken
 * as the domainname rather than the program. The word "any" on either side
 * leaves the corresponding field unset.
 *
 * Returns 0 on success, negative value otherwise. -EINVAL is returned when
 * the program part is not a correct path.
 */
int tomoyo_write_transition_control(struct tomoyo_acl_param *param,
                                    const u8 type)
{
        static const char separator[] = " from ";
        struct tomoyo_transition_control e = { .type = type };
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        char *program = param->data;
        char *domainname = strstr(program, separator);

        if (domainname) {
                /* Terminate the program part and step over the separator. */
                *domainname = '\0';
                domainname += sizeof(separator) - 1;
        } else {
                switch (type) {
                case TOMOYO_TRANSITION_CONTROL_NO_KEEP:
                case TOMOYO_TRANSITION_CONTROL_KEEP:
                        /* A lone word names the domain for these types. */
                        domainname = program;
                        program = NULL;
                        break;
                default:
                        break;
                }
        }
        if (program && strcmp(program, "any") != 0) {
                if (!tomoyo_correct_path(program))
                        return -EINVAL;
                e.program = tomoyo_get_name(program);
                if (!e.program)
                        goto out;
        }
        if (domainname && strcmp(domainname, "any") != 0) {
                if (!tomoyo_correct_domain(domainname)) {
                        /*
                         * Not a full domainname; accept a pathname and
                         * remember that only the last name is given.
                         */
                        if (!tomoyo_correct_path(domainname))
                                goto out;
                        e.is_last_name = true;
                }
                e.domainname = tomoyo_get_name(domainname);
                if (!e.domainname)
                        goto out;
        }
        param->list = &param->ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
        error = tomoyo_update_policy(&e.head, sizeof(e), param,
                                     tomoyo_same_transition_control);
out:
        tomoyo_put_name(e.domainname);
        tomoyo_put_name(e.program);
        return error;
}

/**
 * tomoyo_scan_transition - Try to find specific domain transition type.
 *
 * @list:       Pointer to "struct list_head".
 * @domainname: The name of current domain.
 * @program:    The name of requested program.
 * @last_name:  The last component of @domainname.
 * @type:       One of values in "enum tomoyo_transition_type".
 *
 * An entry matches when it is not deleted, has the requested @type, its
 * domainname (if set) matches either @domainname by pointer or @last_name
 * by string, and its program (if set) compares equal to @program.
 *
 * Returns true if found one, false otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static inline bool tomoyo_scan_transition
(const struct list_head *list, const struct tomoyo_path_info *domainname,
 const struct tomoyo_path_info *program, const char *last_name,
 const enum tomoyo_transition_type type)
{
        const struct tomoyo_transition_control *ctl;

        list_for_each_entry_rcu(ctl, list, head.list,
                                srcu_read_lock_held(&tomoyo_ss)) {
                if (ctl->head.is_deleted || ctl->type != type)
                        continue;
                if (ctl->domainname) {
                        if (ctl->is_last_name) {
                                /*
                                 * Plain strcmp() is fine; this form is
                                 * rarely used.
                                 */
                                if (strcmp(ctl->domainname->name, last_name))
                                        continue;
                        } else if (ctl->domainname != domainname) {
                                continue;
                        }
                }
                if (!ctl->program || !tomoyo_pathcmp(ctl->program, program))
                        return true;
        }
        return false;
}

/**
 * tomoyo_transition_type - Get domain transition type.
 *
 * @ns:         Pointer to "struct tomoyo_policy_namespace".
 * @domainname: The name of current domain.
 * @program:    The name of requested program.
 *
 * Tries each transition type in increasing order, starting from
 * TOMOYO_TRANSITION_CONTROL_NO_RESET, against the transition control list
 * of @ns and stops at the first type that matches. A match on one of the
 * two "no_*" types does not stop the search; it skips the type that
 * directly follows it instead.
 *
 * Returns TOMOYO_TRANSITION_CONTROL_TRANSIT if executing @program causes
 * domain transition across namespaces, TOMOYO_TRANSITION_CONTROL_INITIALIZE if
 * executing @program reinitializes domain transition within that namespace,
 * TOMOYO_TRANSITION_CONTROL_KEEP if executing @program stays at @domainname ,
 * others otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static enum tomoyo_transition_type tomoyo_transition_type
(const struct tomoyo_policy_namespace *ns,
 const struct tomoyo_path_info *domainname,
 const struct tomoyo_path_info *program)
{
        const struct list_head * const controls =
                &ns->policy_list[TOMOYO_ID_TRANSITION_CONTROL];
        const char *last_name = tomoyo_last_word(domainname->name);
        enum tomoyo_transition_type type = TOMOYO_TRANSITION_CONTROL_NO_RESET;

        while (type < TOMOYO_MAX_TRANSITION_TYPE) {
                if (!tomoyo_scan_transition(controls, domainname, program,
                                            last_name, type)) {
                        type++;
                } else if (type == TOMOYO_TRANSITION_CONTROL_NO_RESET ||
                           type == TOMOYO_TRANSITION_CONTROL_NO_INITIALIZE) {
                        /*
                         * no_reset_domain matched: do not check
                         * reset_domain. no_initialize_domain matched: do
                         * not check initialize_domain.
                         */
                        type += 2;
                } else {
                        break;
                }
        }
        return type;
}

/**
 * tomoyo_same_aggregator - Check for duplicated "struct tomoyo_aggregator" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_head".
 * @b: Pointer to "struct tomoyo_acl_head".
 *
 * Both arguments must be the "head" member of a "struct tomoyo_aggregator".
 *
 * Returns true if the original_name pointers and the aggregated_name
 * pointers of the two entries are equal, false otherwise.
 */
static bool tomoyo_same_aggregator(const struct tomoyo_acl_head *a,
                                   const struct tomoyo_acl_head *b)
{
        const struct tomoyo_aggregator *lhs =
                container_of(a, typeof(*lhs), head);
        const struct tomoyo_aggregator *rhs =
                container_of(b, typeof(*rhs), head);

        if (lhs->original_name != rhs->original_name)
                return false;
        return lhs->aggregated_name == rhs->aggregated_name;
}

/**
 * tomoyo_write_aggregator - Write "struct tomoyo_aggregator" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads two tokens from @param: the original name (which must be a correct
 * word) followed by the aggregated name (which must be a correct path and
 * must not be patterned).
 *
 * Returns 0 on success, negative value otherwise. -EINVAL is returned when
 * either token fails validation.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_aggregator(struct tomoyo_acl_param *param)
{
        struct tomoyo_aggregator e = { };
        int error = param->is_delete ? -ENOENT : -ENOMEM;
        const char *original;
        const char *aggregated;

        /* Tokens are taken from @param in this order. */
        original = tomoyo_read_token(param);
        aggregated = tomoyo_read_token(param);
        if (!tomoyo_correct_word(original) ||
            !tomoyo_correct_path(aggregated))
                return -EINVAL;
        e.original_name = tomoyo_get_name(original);
        e.aggregated_name = tomoyo_get_name(aggregated);
        /* Both names are required, and no patterns allowed in the latter. */
        if (e.original_name && e.aggregated_name &&
            !e.aggregated_name->is_patterned) {
                param->list = &param->ns->policy_list[TOMOYO_ID_AGGREGATOR];
                error = tomoyo_update_policy(&e.head, sizeof(e), param,
                                             tomoyo_same_aggregator);
        }
        tomoyo_put_name(e.original_name);
        tomoyo_put_name(e.aggregated_name);
        return error;
}

/**
 * tomoyo_find_namespace - Find specified namespace.
 *
 * @name: Name of namespace to find.
 * @len:  Length of @name.
 *
 * A namespace is selected when the first @len bytes of @name equal the
 * start of its name and @name[@len] is either the end of the string or a
 * space.
 *
 * NOTE(review): the comparison does not check that the stored namespace
 * name ends after @len bytes; presumably name validation makes a prefix
 * collision impossible - confirm.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" if found,
 * NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static struct tomoyo_policy_namespace *tomoyo_find_namespace
(const char *name, const unsigned int len)
{
        struct tomoyo_policy_namespace *ns;

        list_for_each_entry(ns, &tomoyo_namespace_list, namespace_list) {
                if (strncmp(name, ns->name, len) != 0)
                        continue;
                if (name[len] == '\0' || name[len] == ' ')
                        return ns;
        }
        return NULL;
}

/**
 * tomoyo_assign_namespace - Create a new namespace.
 *
 * @domainname: Name of namespace to create.
 *
 * The namespace name is the part of @domainname up to the first space (or
 * the whole string if there is none). An existing namespace with that name
 * is returned as is. Otherwise a new one is allocated, initialized with
 * tomoyo_init_policy_namespace() and returned; the lookup is repeated under
 * tomoyo_policy_lock so a namespace created in the meantime is returned
 * instead.
 *
 * NOTE(review): the kzalloc() result is not checked here; presumably
 * tomoyo_memory_ok() rejects a NULL pointer - confirm.
 *
 * Returns pointer to "struct tomoyo_policy_namespace" on success,
 * NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_policy_namespace *tomoyo_assign_namespace(const char *domainname)
{
        struct tomoyo_policy_namespace *ns;
        struct tomoyo_policy_namespace *fresh;
        unsigned int len;

        /* Length of the leading word of @domainname. */
        for (len = 0; domainname[len] && domainname[len] != ' '; len++)
                ;
        ns = tomoyo_find_namespace(domainname, len);
        if (ns)
                return ns;
        if (len >= TOMOYO_EXEC_TMPSIZE - 10)
                return NULL;
        if (!tomoyo_domain_def(domainname))
                return NULL;
        /* The name is stored right behind the structure itself. */
        fresh = kzalloc(sizeof(*fresh) + len + 1, GFP_NOFS | __GFP_NOWARN);
        if (!mutex_lock_interruptible(&tomoyo_policy_lock)) {
                ns = tomoyo_find_namespace(domainname, len);
                if (!ns && tomoyo_memory_ok(fresh)) {
                        char *name = (char *) (fresh + 1);

                        memmove(name, domainname, len);
                        name[len] = '\0';
                        fresh->name = name;
                        tomoyo_init_policy_namespace(fresh);
                        ns = fresh;
                        /* Now in use; must not be freed below. */
                        fresh = NULL;
                }
                mutex_unlock(&tomoyo_policy_lock);
        }
        kfree(fresh);
        return ns;
}

/**
 * tomoyo_namespace_jump - Check for namespace jump.
 *
 * @domainname: Name of domain.
 *
 * Compares the start of @domainname with the name of the current namespace;
 * the namespace part of @domainname must end with a space or the end of the
 * string to count as the same namespace.
 *
 * Returns true if namespace differs, false otherwise.
 */
static bool tomoyo_namespace_jump(const char *domainname)
{
        const char *ns_name = tomoyo_current_namespace()->name;
        const int ns_len = strlen(ns_name);

        if (strncmp(domainname, ns_name, ns_len))
                return true;
        return domainname[ns_len] != '\0' && domainname[ns_len] != ' ';
}

/**
 * tomoyo_assign_domain - Create a domain or a namespace.
 *
 * @domainname: The name of domain.
 * @transit:    True if transit to domain found or created.
 *
 * Looks up @domainname and returns the existing domain if there is one.
 * Otherwise the name is validated, its namespace is looked up or created,
 * and a new domain is committed and appended to tomoyo_domain_list. When
 * @transit is true the new domain inherits the profile and group bitmap of
 * the current domain, and its creation is logged.
 *
 * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
struct tomoyo_domain_info *tomoyo_assign_domain(const char *domainname,
                                                const bool transit)
{
        struct tomoyo_domain_info e = { };
        struct tomoyo_domain_info *dom = tomoyo_find_domain(domainname);
        bool created = false;

        if (dom) {
                /*
                 * Namespaces are created at runtime, so the profile of the
                 * target domain may not exist yet when a process wants to
                 * transit into it. Refuse the transition in that case.
                 */
                if (transit && tomoyo_policy_loaded &&
                    !dom->ns->profile_ptr[dom->profile])
                        return NULL;
                return dom;
        }
        /* The domain does not exist. Never create one from a bad name. */
        if (strlen(domainname) >= TOMOYO_EXEC_TMPSIZE - 10)
                return NULL;
        if (!tomoyo_correct_domain(domainname))
                return NULL;
        /*
         * Profiles and acl_groups may be defined differently in each
         * namespace, so "use_profile" and "use_group" must not be inherited
         * across namespaces: do not auto-create on such a transition.
         */
        if (transit && tomoyo_namespace_jump(domainname))
                return NULL;
        e.ns = tomoyo_assign_namespace(domainname);
        if (!e.ns)
                return NULL;
        /*
         * Automatically created domains inherit "use_profile" and
         * "use_group" from the current domain. Manually created domains
         * keep them at 0.
         */
        if (transit) {
                const struct tomoyo_domain_info *cur = tomoyo_domain();

                e.profile = cur->profile;
                memcpy(e.group, cur->group, sizeof(e.group));
        }
        e.domainname = tomoyo_get_name(domainname);
        if (!e.domainname)
                return NULL;
        if (!mutex_lock_interruptible(&tomoyo_policy_lock)) {
                /* Look again; the domain may have appeared meanwhile. */
                dom = tomoyo_find_domain(domainname);
                if (!dom) {
                        dom = tomoyo_commit_ok(&e, sizeof(e));
                        if (dom) {
                                INIT_LIST_HEAD(&dom->acl_info_list);
                                list_add_tail_rcu(&dom->list,
                                                  &tomoyo_domain_list);
                                created = true;
                        }
                }
                mutex_unlock(&tomoyo_policy_lock);
        }
        tomoyo_put_name(e.domainname);
        /* Log the inherited settings of a domain created by a transition. */
        if (transit && created) {
                struct tomoyo_request_info r;
                int i;

                tomoyo_init_request_info(&r, dom, TOMOYO_MAC_FILE_EXECUTE);
                r.granted = false;
                tomoyo_write_log(&r, "use_profile %u\n", dom->profile);
                for (i = 0; i < TOMOYO_MAX_ACL_GROUPS; i++) {
                        if (test_bit(i, dom->group))
                                tomoyo_write_log(&r, "use_group %u\n", i);
                }
                tomoyo_update_stat(TOMOYO_STAT_POLICY_UPDATES);
        }
        return dom;
}

/**
 * tomoyo_environ - Check permission for environment variable names.
 *
 * @ee: Pointer to "struct tomoyo_execve".
 *
 * Reads the argument area of @ee->bprm page by page starting at bprm->p,
 * skips the bprm->argc argument strings, then builds an escaped copy of
 * each of the bprm->envc environment strings and passes it to
 * tomoyo_env_perm(). Because '=' is stored as a NUL byte, the buffer read
 * as a C string ends at the variable name.
 *
 * Returns 0 on success, negative value otherwise. A negative value is
 * returned only when the mode is TOMOYO_CONFIG_ENFORCING; in any other mode
 * failures are turned into 0.
 */
static int tomoyo_environ(struct tomoyo_execve *ee)
{
        struct tomoyo_request_info *r = &ee->r;
        struct linux_binprm *bprm = ee->bprm;
        /* env_page.data is allocated by tomoyo_dump_page(). */
        struct tomoyo_page_dump env_page = { };
        char *arg_ptr; /* Size is TOMOYO_EXEC_TMPSIZE bytes */
        int arg_len = 0;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        /* Stays -ENOMEM until a verdict (0 or -EPERM) has been reached. */
        int error = -ENOMEM;

        ee->r.type = TOMOYO_MAC_ENVIRON;
        ee->r.profile = r->domain->profile;
        ee->r.mode = tomoyo_get_mode(r->domain->ns, ee->r.profile,
                                     TOMOYO_MAC_ENVIRON);
        /* Nothing to do if checking is off or there are no variables. */
        if (!r->mode || !envp_count)
                return 0;
        arg_ptr = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS);
        if (!arg_ptr)
                goto out;
        while (error == -ENOMEM) {
                if (!tomoyo_dump_page(bprm, pos, &env_page))
                        goto out;
                /* Advance to the start of the next page. */
                pos += PAGE_SIZE - offset;
                /* Read. */
                /* Skip the argv strings by counting their NUL terminators. */
                while (argv_count && offset < PAGE_SIZE) {
                        if (!env_page.data[offset++])
                                argv_count--;
                }
                /* argv continues on the next page. */
                if (argv_count) {
                        offset = 0;
                        continue;
                }
                while (offset < PAGE_SIZE) {
                        const unsigned char c = env_page.data[offset++];

                        /*
                         * Append the byte in escaped form while there is
                         * room; each byte needs at most four output bytes.
                         * Once the limit is reached the rest of the string
                         * is dropped.
                         */
                        if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) {
                                if (c == '=') {
                                        /* Ends the name part of the string. */
                                        arg_ptr[arg_len++] = '\0';
                                } else if (c == '\\') {
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = '\\';
                                } else if (c > ' ' && c < 127) {
                                        arg_ptr[arg_len++] = c;
                                } else {
                                        /* Three-digit octal escape. */
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = (c >> 6) + '0';
                                        arg_ptr[arg_len++]
                                                = ((c >> 3) & 7) + '0';
                                        arg_ptr[arg_len++] = (c & 7) + '0';
                                }
                        } else {
                                arg_ptr[arg_len] = '\0';
                        }
                        /* Keep reading until the end of this string. */
                        if (c)
                                continue;
                        if (tomoyo_env_perm(r, arg_ptr)) {
                                error = -EPERM;
                                break;
                        }
                        /* All environment strings have been checked. */
                        if (!--envp_count) {
                                error = 0;
                                break;
                        }
                        arg_len = 0;
                }
                offset = 0;
        }
out:
        /* Failures are fatal only in enforcing mode. */
        if (r->mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        kfree(env_page.data);
        kfree(arg_ptr);
        return error;
}

/**
 * tomoyo_find_next_domain - Find a domain.
 *
 * @bprm: Pointer to "struct linux_binprm".
 *
 * Determines the pathname of the program being executed, applies the
 * 'aggregator' directive, checks execute permission, and then selects the
 * domain the task should run in: either according to a domain transition
 * preference attached to the matched entry, or according to the transition
 * control list of the current namespace. The selected domain (or the old
 * domain on failure) is stored in the task's security blob and its user
 * count is incremented. If everything succeeded, environment variable names
 * are checked with tomoyo_environ().
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_find_next_domain(struct linux_binprm *bprm)
{
        struct tomoyo_domain_info *old_domain = tomoyo_domain();
        struct tomoyo_domain_info *domain = NULL;
        const char *original_name = bprm->filename;
        int retval = -ENOMEM;
        bool reject_on_transition_failure = false;
        const struct tomoyo_path_info *candidate;
        struct tomoyo_path_info exename;
        struct tomoyo_execve *ee = kzalloc(sizeof(*ee), GFP_NOFS);

        if (!ee)
                return -ENOMEM;
        /* Scratch buffer in which the destination domainname is built. */
        ee->tmp = kzalloc(TOMOYO_EXEC_TMPSIZE, GFP_NOFS);
        if (!ee->tmp) {
                kfree(ee);
                return -ENOMEM;
        }
        /* ee->dump.data is allocated by tomoyo_dump_page(). */
        tomoyo_init_request_info(&ee->r, NULL, TOMOYO_MAC_FILE_EXECUTE);
        ee->r.ee = ee;
        ee->bprm = bprm;
        ee->r.obj = &ee->obj;
        ee->obj.path1 = bprm->file->f_path;
        /*
         * Get symlink's pathname of program, but fallback to realpath if
         * symlink's pathname does not exist or symlink's pathname refers
         * to proc filesystem (e.g. /dev/fd/<num> or /proc/self/fd/<num> ).
         */
        exename.name = tomoyo_realpath_nofollow(original_name);
        if (exename.name && !strncmp(exename.name, "proc:/", 6)) {
                kfree(exename.name);
                exename.name = NULL;
        }
        if (!exename.name) {
                exename.name = tomoyo_realpath_from_path(&bprm->file->f_path);
                /* retval is still -ENOMEM here. */
                if (!exename.name)
                        goto out;
        }
        tomoyo_fill_path_info(&exename);
retry:
        /* Check 'aggregator' directive. */
        {
                struct tomoyo_aggregator *ptr;
                struct list_head *list =
                        &old_domain->ns->policy_list[TOMOYO_ID_AGGREGATOR];

                /*
                 * Use the aggregated name of the first matching entry, or
                 * the program's own name if none matches.
                 */
                candidate = &exename;
                list_for_each_entry_rcu(ptr, list, head.list,
                                        srcu_read_lock_held(&tomoyo_ss)) {
                        if (ptr->head.is_deleted ||
                            !tomoyo_path_matches_pattern(&exename,
                                                         ptr->original_name))
                                continue;
                        candidate = ptr->aggregated_name;
                        break;
                }
        }

        /* Check execute permission. */
        retval = tomoyo_execute_permission(&ee->r, candidate);
        /* Start over from the aggregator lookup when asked to retry. */
        if (retval == TOMOYO_RETRY_REQUEST)
                goto retry;
        if (retval < 0)
                goto out;
        /*
         * To be able to specify domainnames with wildcards, use the
         * pathname specified in the policy (which may contain
         * wildcard) rather than the pathname passed to execve()
         * (which never contains wildcard).
         */
        if (ee->r.param.path.matched_path)
                candidate = ee->r.param.path.matched_path;

        /*
         * Check for domain transition preference if "file execute" matched.
         * If preference is given, make execve() fail if domain transition
         * has failed, for domain transition preference should be used with
         * destination domain defined.
         */
        if (ee->transition) {
                const char *domainname = ee->transition->name;

                reject_on_transition_failure = true;
                /*
                 * The keywords below jump to the matching label inside the
                 * switch statement further down.
                 */
                if (!strcmp(domainname, "keep"))
                        goto force_keep_domain;
                if (!strcmp(domainname, "child"))
                        goto force_child_domain;
                if (!strcmp(domainname, "reset"))
                        goto force_reset_domain;
                if (!strcmp(domainname, "initialize"))
                        goto force_initialize_domain;
                if (!strcmp(domainname, "parent")) {
                        char *cp;

                        /* Drop the last word of the current domainname. */
                        strscpy(ee->tmp, old_domain->domainname->name, TOMOYO_EXEC_TMPSIZE);
                        cp = strrchr(ee->tmp, ' ');
                        if (cp)
                                *cp = '\0';
                } else if (*domainname == '<')
                        /* A name starting with '<' is used as given. */
                        strscpy(ee->tmp, domainname, TOMOYO_EXEC_TMPSIZE);
                else
                        /* Otherwise it is appended to the current domain. */
                        snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                                 old_domain->domainname->name, domainname);
                goto force_jump_domain;
        }
        /*
         * No domain transition preference specified.
         * Calculate domain to transit to.
         */
        switch (tomoyo_transition_type(old_domain->ns, old_domain->domainname,
                                       candidate)) {
        case TOMOYO_TRANSITION_CONTROL_RESET:
force_reset_domain:
                /* Transit to the root of specified namespace. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "<%s>",
                         candidate->name);
                /*
                 * Make execve() fail if domain transition across namespaces
                 * has failed.
                 */
                reject_on_transition_failure = true;
                break;
        case TOMOYO_TRANSITION_CONTROL_INITIALIZE:
force_initialize_domain:
                /* Transit to the child of current namespace's root. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                         old_domain->ns->name, candidate->name);
                break;
        case TOMOYO_TRANSITION_CONTROL_KEEP:
force_keep_domain:
                /* Keep current domain. */
                domain = old_domain;
                break;
        default:
                if (old_domain == &tomoyo_kernel_domain &&
                    !tomoyo_policy_loaded) {
                        /*
                         * Needn't to transit from kernel domain before
                         * starting /sbin/init. But transit from kernel domain
                         * if executing initializers because they might start
                         * before /sbin/init.
                         */
                        domain = old_domain;
                        break;
                }
force_child_domain:
                /* Normal domain transition. */
                snprintf(ee->tmp, TOMOYO_EXEC_TMPSIZE - 1, "%s %s",
                         old_domain->domainname->name, candidate->name);
                break;
        }
force_jump_domain:
        /* Unless the current domain is kept, look up or create ee->tmp. */
        if (!domain)
                domain = tomoyo_assign_domain(ee->tmp, true);
        if (domain)
                retval = 0;
        else if (reject_on_transition_failure) {
                pr_warn("ERROR: Domain '%s' not ready.\n", ee->tmp);
                retval = -ENOMEM;
        } else if (ee->r.mode == TOMOYO_CONFIG_ENFORCING)
                retval = -ENOMEM;
        else {
                /*
                 * Not enforcing: let execve() continue in the old domain
                 * and report the failed transition once per domain.
                 */
                retval = 0;
                if (!old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED]) {
                        old_domain->flags[TOMOYO_DIF_TRANSITION_FAILED] = true;
                        ee->r.granted = false;
                        tomoyo_write_log(&ee->r, "%s", tomoyo_dif
                                         [TOMOYO_DIF_TRANSITION_FAILED]);
                        pr_warn("ERROR: Domain '%s' not defined.\n", ee->tmp);
                }
        }
 out:
        /* Fall back to the old domain if no destination was determined. */
        if (!domain)
                domain = old_domain;
        /* Update reference count on "struct tomoyo_domain_info". */
        {
                struct tomoyo_task *s = tomoyo_task(current);

                /* This runs on the error paths as well. */
                s->old_domain_info = s->domain_info;
                s->domain_info = domain;
                atomic_inc(&domain->users);
        }
        kfree(exename.name);
        /* Check environment variable names against the chosen domain. */
        if (!retval) {
                ee->r.domain = domain;
                retval = tomoyo_environ(ee);
        }
        kfree(ee->tmp);
        kfree(ee->dump.data);
        kfree(ee);
        return retval;
}

/**
 * tomoyo_dump_page - Dump a page to buffer.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @pos:  Location to dump.
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * Copies the part of the page containing @pos, from @pos's offset within
 * the page up to the end of the page, into the same offset of @dump->data.
 * @dump->data is allocated (PAGE_SIZE bytes, zeroed) on first use.
 * @dump->page remembers the page that was copied last; if the same page is
 * requested again the copy is skipped.
 *
 * Returns true on success, false otherwise.
 */
bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos,
                      struct tomoyo_page_dump *dump)
{
        struct page *page;
#ifdef CONFIG_MMU
        int ret;
#endif

        /*
         * dump->data is released by the owner of @dump, e.g.
         * tomoyo_find_next_domain() for ee->dump and tomoyo_environ() for
         * its local env_page.
         */
        if (!dump->data) {
                dump->data = kzalloc(PAGE_SIZE, GFP_NOFS);
                if (!dump->data)
                        return false;
        }
        /* Same with get_arg_page(bprm, pos, 0) in fs/exec.c */
#ifdef CONFIG_MMU
        /*
         * This is called at execve() time in order to dig around
         * in the argv/environment of the new process
         * (represented by bprm).
         */
        mmap_read_lock(bprm->mm);
        ret = get_user_pages_remote(bprm->mm, pos, 1,
                                    FOLL_FORCE, &page, NULL);
        mmap_read_unlock(bprm->mm);
        if (ret <= 0)
                return false;
#else
        /* Without an MMU the argument pages are kept in bprm->page[]. */
        page = bprm->page[pos / PAGE_SIZE];
#endif
        if (page != dump->page) {
                const unsigned int offset = pos % PAGE_SIZE;
                /*
                 * Maybe kmap()/kunmap() should be used here.
                 * But remove_arg_zero() uses kmap_atomic()/kunmap_atomic().
                 * So do I.
                 */
                char *kaddr = kmap_atomic(page);

                dump->page = page;
                /* Bytes before @offset in dump->data are left untouched. */
                memcpy(dump->data + offset, kaddr + offset,
                       PAGE_SIZE - offset);
                kunmap_atomic(kaddr);
        }
        /* Same with put_arg_page(page) in fs/exec.c */
#ifdef CONFIG_MMU
        /* Drop the page reference taken by get_user_pages_remote(). */
        put_page(page);
#endif
        return true;
}

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































  106 







  105 

  106 









   46 









   46 

   46 






    1 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/stat.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/compat.h>
#include <linux/iversion.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>

#include <trace/events/timestamp.h>

#include "internal.h"
#include "mount.h"

/**
 * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
 * @stat: where to store the resulting values
 * @request_mask: STATX_* values requested
 * @inode: inode from which to grab the c/mtime
 *
 * Given @inode, grab the ctime and mtime out of it and store the result
 * in @stat. When fetching the value, flag it as QUERIED (if not already)
 * so the next write will record a distinct timestamp.
 *
 * NB: The QUERIED flag is tracked in the ctime, but we set it there even
 * if only the mtime was requested, as that ensures that the next mtime
 * change will be distinct.
 */
void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
{
        atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
        u32 nsec;

        /* Neither time was asked for: clear both from the result mask. */
        if ((request_mask & (STATX_CTIME|STATX_MTIME)) == 0) {
                stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
                return;
        }

        stat->mtime = inode_get_mtime(inode);
        stat->ctime.tv_sec = inode->i_ctime_sec;

        /* Only do the atomic OR when the flag is not set yet. */
        nsec = (u32)atomic_read(pcn);
        if (!(nsec & I_CTIME_QUERIED))
                nsec = (u32)atomic_fetch_or(I_CTIME_QUERIED, pcn);

        /* The flag bit is never part of the reported nanoseconds. */
        stat->ctime.tv_nsec = nsec & ~I_CTIME_QUERIED;
        trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime);
}
EXPORT_SYMBOL(fill_mg_cmtime);

/**
 * generic_fillattr - Fill in the basic attributes from the inode struct
 * @idmap:                idmap of the mount the inode was found from
 * @request_mask:        statx request_mask
 * @inode:                Inode to use as the source
 * @stat:                Where to fill in the attributes
 *
 * Fill in the basic attributes in the kstat structure from data that's to be
 * found on the VFS inode structure.  This is the default if no getattr inode
 * operation is supplied.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then
 * take care to map the inode according to @idmap before filling in the
 * uid and gid fields. On non-idmapped mounts or if permission checking is to
 * be performed on the raw inode simply pass @nop_mnt_idmap.
 */
void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
                      struct inode *inode, struct kstat *stat)
{
        const vfsuid_t owner = i_uid_into_vfsuid(idmap, inode);
        const vfsgid_t group = i_gid_into_vfsgid(idmap, inode);

        stat->dev = inode->i_sb->s_dev;
        stat->ino = inode->i_ino;
        stat->mode = inode->i_mode;
        stat->nlink = inode->i_nlink;
        stat->uid = vfsuid_into_kuid(owner);
        stat->gid = vfsgid_into_kgid(group);
        stat->rdev = inode->i_rdev;
        stat->size = i_size_read(inode);
        stat->atime = inode_get_atime(inode);

        /* Multigrain inodes get their c/mtime through fill_mg_cmtime(). */
        if (!is_mgtime(inode)) {
                stat->ctime = inode_get_ctime(inode);
                stat->mtime = inode_get_mtime(inode);
        } else {
                fill_mg_cmtime(stat, request_mask, inode);
        }

        stat->blksize = i_blocksize(inode);
        stat->blocks = inode->i_blocks;

        /* The change cookie is reported only when asked for and available. */
        if (IS_I_VERSION(inode) && (request_mask & STATX_CHANGE_COOKIE)) {
                stat->result_mask |= STATX_CHANGE_COOKIE;
                stat->change_cookie = inode_query_iversion(inode);
        }
}
EXPORT_SYMBOL(generic_fillattr);

/**
 * generic_fill_statx_attr - Fill in the statx attributes from the inode flags
 * @inode:        Inode to use as the source
 * @stat:        Where to fill in the attribute flags
 *
 * Translate S_IMMUTABLE and S_APPEND in @inode->i_flags into the matching
 * STATX_ATTR_* bits of @stat->attributes, and add KSTAT_ATTR_VFS_FLAGS to
 * @stat->attributes_mask.
 */
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat)
{
        /* Always advertised, whether or not the flags are set. */
        stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS;

        if (inode->i_flags & S_IMMUTABLE)
                stat->attributes |= STATX_ATTR_IMMUTABLE;

        if (inode->i_flags & S_APPEND)
                stat->attributes |= STATX_ATTR_APPEND;
}
EXPORT_SYMBOL(generic_fill_statx_attr);

/**
 * generic_fill_statx_atomic_writes - Fill in atomic writes statx attributes
 * @stat:        Where to fill in the attribute flags
 * @unit_min:        Minimum supported atomic write length in bytes
 * @unit_max:        Maximum supported atomic write length in bytes
 *
 * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from
 * atomic write unit_min and unit_max values.  The unit and segment fields
 * and the STATX_ATTR_WRITE_ATOMIC attribute are only set when @unit_min is
 * non-zero.
 */
void generic_fill_statx_atomic_writes(struct kstat *stat,
                                      unsigned int unit_min,
                                      unsigned int unit_max)
{
        /* The request type and the attribute type are both known. */
        stat->result_mask |= STATX_WRITE_ATOMIC;
        stat->attributes_mask |= STATX_ATTR_WRITE_ATOMIC;

        /* A zero minimum leaves the attribute and the limits unset. */
        if (!unit_min)
                return;

        /* Atomic writes are actually supported. */
        stat->attributes |= STATX_ATTR_WRITE_ATOMIC;
        stat->atomic_write_unit_min = unit_min;
        stat->atomic_write_unit_max = unit_max;
        /* Initially only allow 1x segment */
        stat->atomic_write_segments_max = 1;
}
EXPORT_SYMBOL_GPL(generic_fill_statx_atomic_writes);

/**
 * vfs_getattr_nosec - getattr without security checks
 * @path: file to get attributes from
 * @stat: structure to return attributes in
 * @request_mask: STATX_xxx flags indicating what the caller wants
 * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
 *
 * Get attributes without calling security_inode_getattr.
 *
 * Currently the only caller other than vfs_getattr is internal to the
 * filehandle lookup code, which uses only the inode number and returns no
 * attributes to any user.  Any other code probably wants vfs_getattr.
 *
 * Returns 0 on success, or the error returned by the inode's ->getattr().
 */
int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_backing_inode(path->dentry);
        struct mnt_idmap *idmap;

        /* Start from a zeroed result with the basic stats claimed. */
        memset(stat, 0, sizeof(*stat));
        stat->result_mask = STATX_BASIC_STATS;

        /* Only the sync-type bits of the query flags are passed on. */
        query_flags &= AT_STATX_SYNC_TYPE;

        /* allow the fs to override these if it really wants to */
        /* SB_NOATIME means filesystem supplies dummy atime value */
        if (inode->i_sb->s_flags & SB_NOATIME)
                stat->result_mask &= ~STATX_ATIME;

        /*
         * Note: If you add another clause to set an attribute flag, please
         * update attributes_mask below.
         */
        if (IS_AUTOMOUNT(inode))
                stat->attributes |= STATX_ATTR_AUTOMOUNT;
        if (IS_DAX(inode))
                stat->attributes |= STATX_ATTR_DAX;
        stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT |
                                  STATX_ATTR_DAX);

        idmap = mnt_idmap(path->mnt);
        if (!inode->i_op->getattr) {
                generic_fillattr(idmap, request_mask, inode, stat);
        } else {
                int err = inode->i_op->getattr(idmap, path, stat,
                                               request_mask, query_flags);

                if (err)
                        return err;
        }

        /*
         * If this is a block device inode, override the filesystem attributes
         * with the block device specific parameters that need to be obtained
         * from the bdev backing inode.
         */
        if (S_ISBLK(stat->mode))
                bdev_statx(path, stat, request_mask);

        return 0;
}
EXPORT_SYMBOL(vfs_getattr_nosec);

/*
 * vfs_getattr - Get the enhanced basic attributes of a file
 * @path: The file of interest
 * @stat: Where to return the statistics
 * @request_mask: STATX_xxx flags indicating what the caller wants
 * @query_flags: Query mode (AT_STATX_SYNC_TYPE)
 *
 * Ask the filesystem for a file's attributes.  The caller must use
 * request_mask and query_flags to indicate what they want.
 *
 * If the file is remote, the filesystem can be forced to update the attributes
 * from the backing store by passing AT_STATX_FORCE_SYNC in query_flags or can
 * suppress the update by passing AT_STATX_DONT_SYNC.
 *
 * Bits must have been set in request_mask to indicate which attributes the
 * caller wants retrieving.  Any such attribute not requested may be returned
 * anyway, but the value may be approximate, and, if remote, may not have been
 * synchronised with the server.
 *
 * security_inode_getattr() is consulted first; if it fails, its error is
 * returned and the attributes are not fetched.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
int vfs_getattr(const struct path *path, struct kstat *stat,
                u32 request_mask, unsigned int query_flags)
{
        int retval = security_inode_getattr(path);

        if (!retval)
                retval = vfs_getattr_nosec(path, stat, request_mask,
                                           query_flags);
        return retval;
}
EXPORT_SYMBOL(vfs_getattr);

/**
 * vfs_fstat - Get the basic attributes by file descriptor
 * @fd: The file descriptor referring to the file of interest
 * @stat: The result structure to fill in.
 *
 * This function is a wrapper around vfs_getattr().  The main difference is
 * that it uses a file descriptor to determine the file location.
 *
 * 0 will be returned on success, -EBADF if @fd does not yield a file, and
 * another -ve error code if vfs_getattr() fails.
 */
int vfs_fstat(int fd, struct kstat *stat)
{
        CLASS(fd_raw, f)(fd);

        if (!fd_empty(f))
                return vfs_getattr(&fd_file(f)->f_path, stat,
                                   STATX_BASIC_STATS, 0);
        return -EBADF;
}

static int statx_lookup_flags(int flags)
{
        int lookup_flags = 0;

        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;

        return lookup_flags;
}

/*
 * Fetch the attributes of @path with vfs_getattr() and then add the
 * mount-related information: the mount ID (unique variant when
 * STATX_MNT_ID_UNIQUE was requested) and the STATX_ATTR_MOUNT_ROOT
 * attribute.  Returns 0 or the error from vfs_getattr().
 */
static int vfs_statx_path(struct path *path, int flags, struct kstat *stat,
                          u32 request_mask)
{
        int error;

        error = vfs_getattr(path, stat, request_mask, flags);
        if (error)
                return error;

        if (!(request_mask & STATX_MNT_ID_UNIQUE)) {
                stat->mnt_id = real_mount(path->mnt)->mnt_id;
                stat->result_mask |= STATX_MNT_ID;
        } else {
                stat->mnt_id = real_mount(path->mnt)->mnt_id_unique;
                stat->result_mask |= STATX_MNT_ID_UNIQUE;
        }

        /* The attribute is always supported; set only on a mounted path. */
        stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT;
        if (path_mounted(path))
                stat->attributes |= STATX_ATTR_MOUNT_ROOT;

        return 0;
}

/*
 * statx on an open file descriptor: resolve @fd and hand its path to
 * vfs_statx_path().  Returns -EBADF when @fd does not yield a file.
 */
static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
                          u32 request_mask)
{
        CLASS(fd_raw, f)(fd);

        if (!fd_empty(f))
                return vfs_statx_path(&fd_file(f)->f_path, flags, stat,
                                      request_mask);
        return -EBADF;
}

/**
 * vfs_statx - Get basic and extra attributes by filename
 * @dfd: A file descriptor representing the base dir for a relative filename
 * @filename: The name of the file of interest
 * @flags: Flags to control the query
 * @stat: The result structure to fill in.
 * @request_mask: STATX_xxx flags indicating what the caller wants
 *
 * This function is a wrapper around vfs_getattr().  The main difference is
 * that it uses a filename and base directory to determine the file location.
 * Additionally, the use of AT_SYMLINK_NOFOLLOW in flags will prevent a symlink
 * at the given name from being referenced.
 *
 * -EINVAL is returned if @flags contains anything other than
 * AT_SYMLINK_NOFOLLOW, AT_NO_AUTOMOUNT, AT_EMPTY_PATH and the
 * AT_STATX_SYNC_TYPE bits.
 *
 * 0 will be returned on success, and a -ve error code if unsuccessful.
 */
static int vfs_statx(int dfd, struct filename *filename, int flags,
              struct kstat *stat, u32 request_mask)
{
        const int allowed = AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
                            AT_EMPTY_PATH | AT_STATX_SYNC_TYPE;
        unsigned int lookup_flags;
        struct path path;
        int error;

        if (flags & ~allowed)
                return -EINVAL;

        lookup_flags = statx_lookup_flags(flags);

        /* Repeat the lookup with LOOKUP_REVAL while retry_estale() asks to. */
        for (;;) {
                error = filename_lookup(dfd, filename, lookup_flags, &path,
                                        NULL);
                if (error)
                        return error;

                error = vfs_statx_path(&path, flags, stat, request_mask);
                path_put(&path);
                if (!retry_estale(error, lookup_flags))
                        return error;

                lookup_flags |= LOOKUP_REVAL;
        }
}

/*
 * Get the basic attributes of @filename relative to @dfd.
 *
 * When getname_maybe_null() yields no name and @dfd is non-negative, the
 * call is served by vfs_fstat() on @dfd.  Otherwise the lookup goes through
 * vfs_statx() with AT_NO_AUTOMOUNT added to @flags.
 */
int vfs_fstatat(int dfd, const char __user *filename,
                              struct kstat *stat, int flags)
{
        struct filename *name = getname_maybe_null(filename, flags);
        int ret;

        if (dfd >= 0 && !name)
                return vfs_fstat(dfd, stat);

        ret = vfs_statx(dfd, name, flags | AT_NO_AUTOMOUNT, stat,
                        STATX_BASIC_STATS);
        putname(name);

        return ret;
}

#ifdef __ARCH_WANT_OLD_STAT

/*
 * For backward compatibility?  Maybe this should be moved
 * into arch/i386 instead?
 *
 * Convert @stat into a struct __old_kernel_stat and copy it to @statbuf.
 * Returns 0 on success, -EOVERFLOW when the inode number, link count or
 * (on 32-bit) the size does not fit, and -EFAULT when the copy to user
 * space fails.  The first few calls also log a warning.
 */
static int cp_old_stat(struct kstat *stat, struct __old_kernel_stat __user * statbuf)
{
        static int warncount = 5;
        struct __old_kernel_stat old;

        if (warncount < 0) {
                /* it's laughable, but... */
                warncount = 0;
        } else if (warncount > 0) {
                warncount--;
                printk(KERN_WARNING "VFS: Warning: %s using old stat() call. Recompile your binary.\n",
                        current->comm);
        }

        /* Zero everything first so that no stale bytes reach user space. */
        memset(&old, 0, sizeof(old));

        old.st_dev = old_encode_dev(stat->dev);

        /* Detect truncation of the inode number. */
        old.st_ino = stat->ino;
        if (sizeof(old.st_ino) < sizeof(stat->ino) && old.st_ino != stat->ino)
                return -EOVERFLOW;

        old.st_mode = stat->mode;

        /* Detect truncation of the link count. */
        old.st_nlink = stat->nlink;
        if (old.st_nlink != stat->nlink)
                return -EOVERFLOW;

        SET_UID(old.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(old.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        old.st_rdev = old_encode_dev(stat->rdev);
#if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
#endif
        old.st_size = stat->size;
        old.st_atime = stat->atime.tv_sec;
        old.st_mtime = stat->mtime.tv_sec;
        old.st_ctime = stat->ctime.tv_sec;

        if (copy_to_user(statbuf, &old, sizeof(old)))
                return -EFAULT;
        return 0;
}

/* Old stat(): vfs_stat() on @filename, result copied out by cp_old_stat(). */
SYSCALL_DEFINE2(stat, const char __user *, filename,
                struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_stat(filename, &stat);

        if (!error)
                error = cp_old_stat(&stat, statbuf);

        return error;
}

/* Old lstat(): vfs_lstat() on @filename, result copied out by cp_old_stat(). */
SYSCALL_DEFINE2(lstat, const char __user *, filename,
                struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_old_stat(&stat, statbuf);

        return error;
}

/* Old fstat(): vfs_fstat() on @fd, result copied out by cp_old_stat(). */
SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_old_stat(&stat, statbuf);
}

#endif /* __ARCH_WANT_OLD_STAT */

#ifdef __ARCH_WANT_NEW_STAT

/* Default: clear the whole structure unless an override is already defined. */
#ifndef INIT_STRUCT_STAT_PADDING
#  define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif

/*
 * Convert @stat into a struct stat and copy it to @statbuf.
 *
 * Returns 0 on success, -EOVERFLOW when a device number, the inode number,
 * the link count or (on 32-bit) the size cannot be represented, and -EFAULT
 * when the copy to user space fails.
 */
static int cp_new_stat(struct kstat *stat, struct stat __user *statbuf)
{
        struct stat out;

        /* Narrow device fields can only carry old-style device numbers. */
        if (sizeof(out.st_dev) < 4 && !old_valid_dev(stat->dev))
                return -EOVERFLOW;
        if (sizeof(out.st_rdev) < 4 && !old_valid_dev(stat->rdev))
                return -EOVERFLOW;
#if BITS_PER_LONG == 32
        if (stat->size > MAX_NON_LFS)
                return -EOVERFLOW;
#endif

        INIT_STRUCT_STAT_PADDING(out);
        out.st_dev = new_encode_dev(stat->dev);

        /* Detect truncation of the inode number. */
        out.st_ino = stat->ino;
        if (sizeof(out.st_ino) < sizeof(stat->ino) && out.st_ino != stat->ino)
                return -EOVERFLOW;

        out.st_mode = stat->mode;

        /* Detect truncation of the link count. */
        out.st_nlink = stat->nlink;
        if (out.st_nlink != stat->nlink)
                return -EOVERFLOW;

        SET_UID(out.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(out.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        out.st_rdev = new_encode_dev(stat->rdev);
        out.st_size = stat->size;
        out.st_atime = stat->atime.tv_sec;
        out.st_mtime = stat->mtime.tv_sec;
        out.st_ctime = stat->ctime.tv_sec;
#ifdef STAT_HAVE_NSEC
        out.st_atime_nsec = stat->atime.tv_nsec;
        out.st_mtime_nsec = stat->mtime.tv_nsec;
        out.st_ctime_nsec = stat->ctime.tv_nsec;
#endif
        out.st_blocks = stat->blocks;
        out.st_blksize = stat->blksize;

        if (copy_to_user(statbuf, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/* newstat(): vfs_stat() on @filename, result copied out by cp_new_stat(). */
SYSCALL_DEFINE2(newstat, const char __user *, filename,
                struct stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_stat(filename, &stat);
        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}

/* newlstat(): vfs_lstat() on @filename, result copied out by cp_new_stat(). */
SYSCALL_DEFINE2(newlstat, const char __user *, filename,
                struct stat __user *, statbuf)
{
        struct kstat stat;
        int error = vfs_lstat(filename, &stat);

        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}

#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT)
/* newfstatat(): vfs_fstatat() lookup, result copied out by cp_new_stat(). */
SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename,
                struct stat __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (!error)
                error = cp_new_stat(&stat, statbuf);

        return error;
}
#endif

/* newfstat(): vfs_fstat() on @fd, result copied out by cp_new_stat(). */
SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_new_stat(&stat, statbuf);
}
#endif

/*
 * Common implementation of readlink(2) and readlinkat(2).
 *
 * Looks up @pathname relative to @dfd (an empty path is accepted by the
 * lookup) and, if the result is a symlink or its inode has a ->readlink
 * operation, reads the link into @buf through vfs_readlink().
 *
 * Returns the result of vfs_readlink() on success, -EINVAL when @bufsiz is
 * not positive or the target cannot be read as a link, -ENOENT when the
 * target cannot be read as a link and the name is empty, or the error from
 * the lookup or from security_inode_readlink().
 */
static int do_readlinkat(int dfd, const char __user *pathname,
                         char __user *buf, int bufsiz)
{
        unsigned int lookup_flags = LOOKUP_EMPTY;
        struct filename *name;
        struct path path;
        int error;

        if (bufsiz <= 0)
                return -EINVAL;

        /* Repeat the lookup with LOOKUP_REVAL while retry_estale() asks to. */
        for (;;) {
                name = getname_flags(pathname, lookup_flags);
                error = filename_lookup(dfd, name, lookup_flags, &path, NULL);
                if (unlikely(error)) {
                        putname(name);
                        return error;
                }

                /*
                 * AFS mountpoints allow readlink(2) but are not symlinks
                 */
                if (!d_is_symlink(path.dentry) &&
                    !d_backing_inode(path.dentry)->i_op->readlink) {
                        error = (name->name[0] == '\0') ? -ENOENT : -EINVAL;
                } else {
                        error = security_inode_readlink(path.dentry);
                        if (!error) {
                                touch_atime(&path);
                                error = vfs_readlink(path.dentry, buf, bufsiz);
                        }
                }
                path_put(&path);
                putname(name);

                if (!retry_estale(error, lookup_flags))
                        return error;

                lookup_flags |= LOOKUP_REVAL;
        }
}

/* readlinkat(2): all arguments are passed unchanged to do_readlinkat(). */
SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
                char __user *, buf, int, bufsiz)
{
        return do_readlinkat(dfd, pathname, buf, bufsiz);
}

/* readlink(2): do_readlinkat() with AT_FDCWD as the directory descriptor. */
SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf,
                int, bufsiz)
{
        return do_readlinkat(AT_FDCWD, path, buf, bufsiz);
}


/* ---------- LFS-64 ----------- */
#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)

/* Default: clear the whole structure unless an override is already defined. */
#ifndef INIT_STRUCT_STAT64_PADDING
#  define INIT_STRUCT_STAT64_PADDING(st) memset(&st, 0, sizeof(st))
#endif

/*
 * Convert @stat into a struct stat64 and copy it to @statbuf.
 *
 * Returns 0 on success, -EOVERFLOW when the inode number does not fit in
 * st_ino, and -EFAULT when the copy to user space fails.
 */
static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
{
        struct stat64 out;

        INIT_STRUCT_STAT64_PADDING(out);
#ifdef CONFIG_MIPS
        /* mips has weird padding, so we don't get 64 bits there */
        out.st_dev = new_encode_dev(stat->dev);
        out.st_rdev = new_encode_dev(stat->rdev);
#else
        out.st_dev = huge_encode_dev(stat->dev);
        out.st_rdev = huge_encode_dev(stat->rdev);
#endif
        /* Detect truncation of the inode number. */
        out.st_ino = stat->ino;
        if (sizeof(out.st_ino) < sizeof(stat->ino) && out.st_ino != stat->ino)
                return -EOVERFLOW;
#ifdef STAT64_HAS_BROKEN_ST_INO
        out.__st_ino = stat->ino;
#endif
        out.st_mode = stat->mode;
        out.st_nlink = stat->nlink;
        out.st_uid = from_kuid_munged(current_user_ns(), stat->uid);
        out.st_gid = from_kgid_munged(current_user_ns(), stat->gid);

        /* Timestamps, seconds and nanoseconds. */
        out.st_atime = stat->atime.tv_sec;
        out.st_atime_nsec = stat->atime.tv_nsec;
        out.st_mtime = stat->mtime.tv_sec;
        out.st_mtime_nsec = stat->mtime.tv_nsec;
        out.st_ctime = stat->ctime.tv_sec;
        out.st_ctime_nsec = stat->ctime.tv_nsec;

        out.st_size = stat->size;
        out.st_blocks = stat->blocks;
        out.st_blksize = stat->blksize;

        if (copy_to_user(statbuf, &out, sizeof(out)))
                return -EFAULT;
        return 0;
}

/* stat64(): vfs_stat() on @filename, result copied out by cp_new_stat64(). */
SYSCALL_DEFINE2(stat64, const char __user *, filename,
                struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_stat(filename, &stat);
        if (error)
                return error;

        return cp_new_stat64(&stat, statbuf);
}

/* lstat64(): vfs_lstat() on @filename, result copied out by cp_new_stat64(). */
SYSCALL_DEFINE2(lstat64, const char __user *, filename,
                struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_lstat(filename, &stat);
        if (error)
                return error;

        return cp_new_stat64(&stat, statbuf);
}

/* fstat64(): vfs_fstat() on @fd, result copied out by cp_new_stat64(). */
SYSCALL_DEFINE2(fstat64, unsigned long, fd, struct stat64 __user *, statbuf)
{
        struct kstat stat;
        int error;

        error = vfs_fstat(fd, &stat);
        if (error)
                return error;

        return cp_new_stat64(&stat, statbuf);
}

/* fstatat64(): vfs_fstatat() lookup, result copied out by cp_new_stat64(). */
SYSCALL_DEFINE4(fstatat64, int, dfd, const char __user *, filename,
                struct stat64 __user *, statbuf, int, flag)
{
        struct kstat stat;
        int error = vfs_fstatat(dfd, filename, &stat, flag);

        if (!error)
                error = cp_new_stat64(&stat, statbuf);

        return error;
}
#endif /* __ARCH_WANT_STAT64 || __ARCH_WANT_COMPAT_STAT64 */

/*
 * Convert the kernel-side @stat into a struct statx and copy it to the user
 * buffer.  The kernel-only STATX_CHANGE_COOKIE and
 * STATX_ATTR_CHANGE_MONOTONIC bits are stripped from what is reported.
 *
 * Returns 0 on success or -EFAULT when the copy to user space fails.
 */
static noinline_for_stack int
cp_statx(const struct kstat *stat, struct statx __user *buffer)
{
        struct statx sx;

        /* Fields not assigned below are copied out as zero. */
        memset(&sx, 0, sizeof(sx));

        /* Masks and attributes. */
        /* STATX_CHANGE_COOKIE is kernel-only for now */
        sx.stx_mask = stat->result_mask & ~STATX_CHANGE_COOKIE;
        /* STATX_ATTR_CHANGE_MONOTONIC is kernel-only for now */
        sx.stx_attributes = stat->attributes & ~STATX_ATTR_CHANGE_MONOTONIC;
        sx.stx_attributes_mask = stat->attributes_mask;

        /* Basic inode information. */
        sx.stx_blksize = stat->blksize;
        sx.stx_nlink = stat->nlink;
        sx.stx_uid = from_kuid_munged(current_user_ns(), stat->uid);
        sx.stx_gid = from_kgid_munged(current_user_ns(), stat->gid);
        sx.stx_mode = stat->mode;
        sx.stx_ino = stat->ino;
        sx.stx_size = stat->size;
        sx.stx_blocks = stat->blocks;

        /* Timestamps. */
        sx.stx_atime.tv_sec = stat->atime.tv_sec;
        sx.stx_atime.tv_nsec = stat->atime.tv_nsec;
        sx.stx_btime.tv_sec = stat->btime.tv_sec;
        sx.stx_btime.tv_nsec = stat->btime.tv_nsec;
        sx.stx_ctime.tv_sec = stat->ctime.tv_sec;
        sx.stx_ctime.tv_nsec = stat->ctime.tv_nsec;
        sx.stx_mtime.tv_sec = stat->mtime.tv_sec;
        sx.stx_mtime.tv_nsec = stat->mtime.tv_nsec;

        /* Device numbers, split into major and minor. */
        sx.stx_rdev_major = MAJOR(stat->rdev);
        sx.stx_rdev_minor = MINOR(stat->rdev);
        sx.stx_dev_major = MAJOR(stat->dev);
        sx.stx_dev_minor = MINOR(stat->dev);

        /* Mount ID, direct I/O alignment, subvolume and atomic-write limits. */
        sx.stx_mnt_id = stat->mnt_id;
        sx.stx_dio_mem_align = stat->dio_mem_align;
        sx.stx_dio_offset_align = stat->dio_offset_align;
        sx.stx_dio_read_offset_align = stat->dio_read_offset_align;
        sx.stx_subvol = stat->subvol;
        sx.stx_atomic_write_unit_min = stat->atomic_write_unit_min;
        sx.stx_atomic_write_unit_max = stat->atomic_write_unit_max;
        sx.stx_atomic_write_segments_max = stat->atomic_write_segments_max;

        if (copy_to_user(buffer, &sx, sizeof(sx)))
                return -EFAULT;
        return 0;
}

/*
 * statx by name: validate @mask and @flags, fetch the attributes through
 * vfs_statx() and copy them to @buffer with cp_statx().
 *
 * Returns -EINVAL when @mask has STATX__RESERVED set or when all
 * AT_STATX_SYNC_TYPE bits are set in @flags; otherwise the result of
 * vfs_statx() or cp_statx().
 */
int do_statx(int dfd, struct filename *filename, unsigned int flags,
             unsigned int mask, struct statx __user *buffer)
{
        struct kstat stat;
        int error;

        if ((mask & STATX__RESERVED) ||
            (flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
                return -EINVAL;

        /*
         * STATX_CHANGE_COOKIE is kernel-only for now. Ignore requests
         * from userland.
         */
        mask &= ~STATX_CHANGE_COOKIE;

        error = vfs_statx(dfd, filename, flags, &stat, mask);
        if (!error)
                error = cp_statx(&stat, buffer);

        return error;
}

/*
 * statx() back end for a bare file descriptor: validate @mask and @flags,
 * collect the attributes with vfs_statx_fd() and hand them to cp_statx().
 *
 * Returns 0 on success, -EINVAL when @mask has reserved bits set or when
 * every AT_STATX_SYNC_TYPE bit is set in @flags, otherwise the error
 * returned by vfs_statx_fd() or cp_statx().
 */
int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
             struct statx __user *buffer)
{
        struct kstat kst;
        int ret;

        if ((mask & STATX__RESERVED) ||
            (flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE)
                return -EINVAL;

        /*
         * STATX_CHANGE_COOKIE is kernel-only for now, so requests for it
         * coming from userland are silently dropped from the mask.
         */
        mask &= ~STATX_CHANGE_COOKIE;

        ret = vfs_statx_fd(fd, flags, &kst, mask);
        if (!ret)
                ret = cp_statx(&kst, buffer);

        return ret;
}

/**
 * sys_statx - System call to get enhanced stats
 * @dfd: Base directory to pathwalk from *or* fd to stat.
 * @filename: File to stat or either NULL or "" with AT_EMPTY_PATH
 * @flags: AT_* flags to control pathwalk.
 * @mask: Parts of statx struct actually required.
 * @buffer: Result buffer.
 *
 * fstat() can be emulated by passing the fd of interest as @dfd, supplying
 * "" (or preferably NULL) as @filename and setting AT_EMPTY_PATH in @flags.
 *
 * When getname_maybe_null() yields no name and @dfd is non-negative the
 * request is served by do_statx_fd() with AT_NO_AUTOMOUNT cleared from
 * @flags; every other case goes through do_statx() and the name is
 * released with putname() afterwards.
 */
SYSCALL_DEFINE5(statx,
                int, dfd, const char __user *, filename, unsigned, flags,
                unsigned int, mask,
                struct statx __user *, buffer)
{
        struct filename *name = getname_maybe_null(filename, flags);
        int ret;

        if (name || dfd < 0) {
                ret = do_statx(dfd, name, flags, mask, buffer);
                putname(name);
                return ret;
        }

        return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_STAT)
/*
 * Convert a kernel struct kstat into the struct compat_stat layout and
 * copy the result to user space at @ubuf.
 *
 * Returns 0 on success, -EOVERFLOW when a device number, the inode number,
 * the link count or the file size does not fit the compat layout, or
 * -EFAULT when the copy to user space fails.
 */
static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf)
{
        struct compat_stat cs;

        /* Device fields narrower than 4 bytes must pass old_valid_dev(). */
        if ((sizeof(cs.st_dev) < 4 && !old_valid_dev(stat->dev)) ||
            (sizeof(cs.st_rdev) < 4 && !old_valid_dev(stat->rdev)))
                return -EOVERFLOW;
        if ((u64) stat->size > MAX_NON_LFS)
                return -EOVERFLOW;

        /* Start from an all-zero structure before filling in the fields. */
        memset(&cs, 0, sizeof(cs));

        /* Store, then compare back to detect truncation. */
        cs.st_ino = stat->ino;
        if (sizeof(cs.st_ino) < sizeof(stat->ino) && cs.st_ino != stat->ino)
                return -EOVERFLOW;
        cs.st_nlink = stat->nlink;
        if (cs.st_nlink != stat->nlink)
                return -EOVERFLOW;

        cs.st_dev = new_encode_dev(stat->dev);
        cs.st_rdev = new_encode_dev(stat->rdev);
        cs.st_mode = stat->mode;
        SET_UID(cs.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
        SET_GID(cs.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
        cs.st_size = stat->size;
        cs.st_blocks = stat->blocks;
        cs.st_blksize = stat->blksize;

        cs.st_atime = stat->atime.tv_sec;
        cs.st_atime_nsec = stat->atime.tv_nsec;
        cs.st_mtime = stat->mtime.tv_sec;
        cs.st_mtime_nsec = stat->mtime.tv_nsec;
        cs.st_ctime = stat->ctime.tv_sec;
        cs.st_ctime_nsec = stat->ctime.tv_nsec;

        if (copy_to_user(ubuf, &cs, sizeof(cs)))
                return -EFAULT;
        return 0;
}

/*
 * Compat newstat: fetch attributes of @filename with vfs_stat() and copy
 * them out in struct compat_stat format.
 */
COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename,
                       struct compat_stat __user *, statbuf)
{
        struct kstat kst;
        int ret = vfs_stat(filename, &kst);

        if (!ret)
                ret = cp_compat_stat(&kst, statbuf);
        return ret;
}

/*
 * Compat newlstat: fetch attributes of @filename with vfs_lstat() and copy
 * them out in struct compat_stat format.
 */
COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename,
                       struct compat_stat __user *, statbuf)
{
        struct kstat kst;
        int ret = vfs_lstat(filename, &kst);

        if (!ret)
                ret = cp_compat_stat(&kst, statbuf);
        return ret;
}

#ifndef __ARCH_WANT_STAT64
/*
 * Compat newfstatat: fetch attributes with vfs_fstatat() using @dfd,
 * @filename and @flag, then copy them out in struct compat_stat format.
 */
COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd,
                       const char __user *, filename,
                       struct compat_stat __user *, statbuf, int, flag)
{
        struct kstat kst;
        int ret = vfs_fstatat(dfd, filename, &kst, flag);

        if (!ret)
                ret = cp_compat_stat(&kst, statbuf);
        return ret;
}
#endif

/*
 * Compat newfstat: fetch attributes of the open file @fd with vfs_fstat()
 * and copy them out in struct compat_stat format.
 */
COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd,
                       struct compat_stat __user *, statbuf)
{
        struct kstat kst;
        int ret;

        ret = vfs_fstat(fd, &kst);
        if (ret)
                return ret;
        return cp_compat_stat(&kst, statbuf);
}
#endif

/*
 * Add @bytes to the inode's usage, kept as whole 512-byte units in
 * i_blocks plus a sub-512 remainder in i_bytes.
 *
 * The caller is responsible for sufficient locking (i.e. inode->i_lock).
 */
void __inode_add_bytes(struct inode *inode, loff_t bytes)
{
        loff_t tail = bytes & 511;

        inode->i_blocks += bytes >> 9;
        inode->i_bytes += tail;

        /* Carry a full unit out of the remainder if it overflowed. */
        if (inode->i_bytes < 512)
                return;

        inode->i_blocks++;
        inode->i_bytes -= 512;
}
EXPORT_SYMBOL(__inode_add_bytes);

/*
 * Locked variant of __inode_add_bytes(): takes inode->i_lock around the
 * update so callers do not have to provide their own locking.
 */
void inode_add_bytes(struct inode *inode, loff_t bytes)
{
        spin_lock(&inode->i_lock);
        __inode_add_bytes(inode, bytes);
        spin_unlock(&inode->i_lock);
}

EXPORT_SYMBOL(inode_add_bytes);

/*
 * Subtract @bytes from the inode's usage, kept as whole 512-byte units in
 * i_blocks plus a sub-512 remainder in i_bytes.  No locking is taken here.
 */
void __inode_sub_bytes(struct inode *inode, loff_t bytes)
{
        loff_t tail = bytes & 511;

        inode->i_blocks -= bytes >> 9;

        /* Borrow one unit from i_blocks when the remainder is too small. */
        if (inode->i_bytes < tail) {
                inode->i_blocks--;
                inode->i_bytes += 512;
        }
        inode->i_bytes -= tail;
}

EXPORT_SYMBOL(__inode_sub_bytes);

/*
 * Locked variant of __inode_sub_bytes(): takes inode->i_lock around the
 * update so callers do not have to provide their own locking.
 */
void inode_sub_bytes(struct inode *inode, loff_t bytes)
{
        spin_lock(&inode->i_lock);
        __inode_sub_bytes(inode, bytes);
        spin_unlock(&inode->i_lock);
}

EXPORT_SYMBOL(inode_sub_bytes);

/*
 * Return the value of __inode_get_bytes() for @inode, read while holding
 * inode->i_lock.
 */
loff_t inode_get_bytes(struct inode *inode)
{
        loff_t total;

        spin_lock(&inode->i_lock);
        total = __inode_get_bytes(inode);
        spin_unlock(&inode->i_lock);

        return total;
}

EXPORT_SYMBOL(inode_get_bytes);

/*
 * Set the inode's usage to exactly @bytes: whole 512-byte units go to
 * i_blocks and the remainder to i_bytes.
 *
 * The caller is responsible for sufficient locking (i.e. inode->i_lock).
 */
void inode_set_bytes(struct inode *inode, loff_t bytes)
{
        const loff_t units = bytes >> 9;
        const loff_t tail = bytes & 511;

        inode->i_blocks = units;
        inode->i_bytes = tail;
}

EXPORT_SYMBOL(inode_set_bytes);





























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMEKEEPING_H
#define _LINUX_TIMEKEEPING_H

#include <linux/errno.h>
#include <linux/clocksource_ids.h>
#include <linux/ktime.h>

/* Included from linux/ktime.h */

void timekeeping_init(void);
extern int timekeeping_suspended;

/* Architecture timer tick functions: */
extern void legacy_timer_tick(unsigned long ticks);

/*
 * Get and set timeofday
 */
extern int do_settimeofday64(const struct timespec64 *ts);
extern int do_sys_settimeofday64(const struct timespec64 *tv,
                                 const struct timezone *tz);

/*
 * ktime_get() family - read the current time in a multitude of ways.
 *
 * The default time reference is CLOCK_MONOTONIC, starting at
 * boot time but not counting the time spent in suspend.
 * For other references, use the functions with "real", "clocktai",
 * "boottime" and "raw" suffixes.
 *
 * To get the time in a different format, use the ones with
 * "ns", "ts64" and "seconds" suffix.
 *
 * See Documentation/core-api/timekeeping.rst for more details.
 */


/*
 * timespec64 based interfaces
 */
extern void ktime_get_raw_ts64(struct timespec64 *ts);
extern void ktime_get_ts64(struct timespec64 *ts);
extern void ktime_get_real_ts64(struct timespec64 *tv);
extern void ktime_get_coarse_ts64(struct timespec64 *ts);
extern void ktime_get_coarse_real_ts64(struct timespec64 *ts);

/* Multigrain timestamp interfaces */
extern void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts);
extern void ktime_get_real_ts64_mg(struct timespec64 *ts);
extern unsigned long timekeeping_get_mg_floor_swaps(void);

void getboottime64(struct timespec64 *ts);

/*
 * time64_t base interfaces
 */
extern time64_t ktime_get_seconds(void);
extern time64_t __ktime_get_real_seconds(void);
extern time64_t ktime_get_real_seconds(void);

/*
 * ktime_t based interfaces
 */

/*
 * Selector passed to ktime_get_with_offset(), ktime_get_coarse_with_offset()
 * and ktime_mono_to_any().  The inline wrappers in this header use
 * TK_OFFS_REAL for real (wall) time, TK_OFFS_BOOT for boottime and
 * TK_OFFS_TAI for TAI time.
 */
enum tk_offsets {
        TK_OFFS_REAL,
        TK_OFFS_BOOT,
        TK_OFFS_TAI,
        /* NOTE(review): presumably a count of the selectors above, not a valid selector itself — confirm */
        TK_OFFS_MAX,
};

extern ktime_t ktime_get(void);
extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
extern ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs);
extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
extern ktime_t ktime_get_raw(void);
extern u32 ktime_get_resolution_ns(void);

/**
 * ktime_get_real - read the real (wall-) time as a ktime_t
 *
 * Wrapper around ktime_get_with_offset() using TK_OFFS_REAL.
 *
 * Returns: real (wall) time in ktime_t format
 */
static inline ktime_t ktime_get_real(void)
{
        const ktime_t wall = ktime_get_with_offset(TK_OFFS_REAL);

        return wall;
}

/* Coarse counterpart of ktime_get_real(): TK_OFFS_REAL via the coarse getter. */
static inline ktime_t ktime_get_coarse_real(void)
{
        const ktime_t wall = ktime_get_coarse_with_offset(TK_OFFS_REAL);

        return wall;
}

/**
 * ktime_get_boottime - Get monotonic time since boot in ktime_t format
 *
 * This is similar to CLOCK_MONOTONIC/ktime_get, but also includes the
 * time spent in suspend.  Implemented as ktime_get_with_offset() using
 * TK_OFFS_BOOT.
 *
 * Returns: monotonic time since boot in ktime_t format
 */
static inline ktime_t ktime_get_boottime(void)
{
        const ktime_t since_boot = ktime_get_with_offset(TK_OFFS_BOOT);

        return since_boot;
}

/* Coarse counterpart of ktime_get_boottime(): TK_OFFS_BOOT via the coarse getter. */
static inline ktime_t ktime_get_coarse_boottime(void)
{
        const ktime_t since_boot = ktime_get_coarse_with_offset(TK_OFFS_BOOT);

        return since_boot;
}

/**
 * ktime_get_clocktai - Get the TAI time of day in ktime_t format
 *
 * Wrapper around ktime_get_with_offset() using TK_OFFS_TAI.
 *
 * Returns: the TAI time of day in ktime_t format
 */
static inline ktime_t ktime_get_clocktai(void)
{
        const ktime_t tai = ktime_get_with_offset(TK_OFFS_TAI);

        return tai;
}

/* Coarse counterpart of ktime_get_clocktai(): TK_OFFS_TAI via the coarse getter. */
static inline ktime_t ktime_get_coarse_clocktai(void)
{
        const ktime_t tai = ktime_get_coarse_with_offset(TK_OFFS_TAI);

        return tai;
}

static inline ktime_t ktime_get_coarse(void)
{
        struct timespec64 ts;

        ktime_get_coarse_ts64(&ts);
        return timespec64_to_ktime(ts);
}

static inline u64 ktime_get_coarse_ns(void)
{
        return ktime_to_ns(ktime_get_coarse());
}

static inline u64 ktime_get_coarse_real_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_real());
}

static inline u64 ktime_get_coarse_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_boottime());
}

static inline u64 ktime_get_coarse_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_coarse_clocktai());
}

/**
 * ktime_mono_to_real - Convert monotonic time to clock realtime
 * @mono: monotonic time to convert
 *
 * Wrapper around ktime_mono_to_any() using TK_OFFS_REAL.
 *
 * Returns: time converted to realtime clock
 */
static inline ktime_t ktime_mono_to_real(ktime_t mono)
{
        const ktime_t real = ktime_mono_to_any(mono, TK_OFFS_REAL);

        return real;
}

/**
 * ktime_get_ns - Get the current time in nanoseconds
 *
 * Returns: current time converted to nanoseconds
 */
static inline u64 ktime_get_ns(void)
{
        return ktime_to_ns(ktime_get());
}

/**
 * ktime_get_real_ns - Get the current real/wall time in nanoseconds
 *
 * Returns: current real time converted to nanoseconds
 */
static inline u64 ktime_get_real_ns(void)
{
        return ktime_to_ns(ktime_get_real());
}

/**
 * ktime_get_boottime_ns - Get the monotonic time since boot in nanoseconds
 *
 * Returns: current boottime converted to nanoseconds
 */
static inline u64 ktime_get_boottime_ns(void)
{
        return ktime_to_ns(ktime_get_boottime());
}

/**
 * ktime_get_clocktai_ns - Get the current TAI time of day in nanoseconds
 *
 * Returns: current TAI time converted to nanoseconds
 */
static inline u64 ktime_get_clocktai_ns(void)
{
        return ktime_to_ns(ktime_get_clocktai());
}

/**
 * ktime_get_raw_ns - Get the raw monotonic time in nanoseconds
 *
 * Returns: current raw monotonic time converted to nanoseconds
 */
static inline u64 ktime_get_raw_ns(void)
{
        return ktime_to_ns(ktime_get_raw());
}

extern u64 ktime_get_mono_fast_ns(void);
extern u64 ktime_get_raw_fast_ns(void);
extern u64 ktime_get_boot_fast_ns(void);
extern u64 ktime_get_tai_fast_ns(void);
extern u64 ktime_get_real_fast_ns(void);

/*
 * timespec64/time64_t interfaces built on top of the ktime based ones
 * for API completeness. These could be implemented more efficiently
 * if needed.
 */
static inline void ktime_get_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_boottime());
}

static inline void ktime_get_coarse_boottime_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_boottime());
}

static inline time64_t ktime_get_boottime_seconds(void)
{
        return ktime_divns(ktime_get_coarse_boottime(), NSEC_PER_SEC);
}

static inline void ktime_get_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_clocktai());
}

static inline void ktime_get_coarse_clocktai_ts64(struct timespec64 *ts)
{
        *ts = ktime_to_timespec64(ktime_get_coarse_clocktai());
}

static inline time64_t ktime_get_clocktai_seconds(void)
{
        return ktime_divns(ktime_get_coarse_clocktai(), NSEC_PER_SEC);
}

/*
 * RTC specific
 */
extern bool timekeeping_rtc_skipsuspend(void);
extern bool timekeeping_rtc_skipresume(void);

extern void timekeeping_inject_sleeptime64(const struct timespec64 *delta);

/**
 * struct system_time_snapshot - simultaneous raw/real/boot time capture
 *                                 with counter value
 * @cycles:        Clocksource counter value to produce the system times
 * @real:        Realtime system time
 * @boot:        Boot time
 * @raw:        Monotonic raw system time
 * @cs_id:        Clocksource ID
 * @clock_was_set_seq:        The sequence number of clock-was-set events
 * @cs_was_changed_seq:        The sequence number of clocksource change events
 */
struct system_time_snapshot {
        u64                        cycles;
        ktime_t                        real;
        ktime_t                        boot;
        ktime_t                        raw;
        enum clocksource_ids        cs_id;
        unsigned int                clock_was_set_seq;
        u8                        cs_was_changed_seq;
};

/**
 * struct system_device_crosststamp - system/device cross-timestamp
 *                                      (synchronized capture)
 * @device:                Device time
 * @sys_realtime:        Realtime simultaneous with device time
 * @sys_monoraw:        Monotonic raw simultaneous with device time
 *
 * All three timestamps are stored as ktime_t values.
 */
struct system_device_crosststamp {
        ktime_t device;
        ktime_t sys_realtime;
        ktime_t sys_monoraw;
};

/**
 * struct system_counterval_t - system counter value with the ID of the
 *                                corresponding clocksource
 * @cycles:        System counter value
 * @cs_id:        Clocksource ID corresponding to system counter value. Used by
 *                timekeeping code to verify comparability of two cycle values.
 *                The default ID, CSID_GENERIC, does not identify a specific
 *                clocksource.
 * @use_nsecs:        True if @cycles is expressed in nanoseconds.
 */
struct system_counterval_t {
        u64                        cycles;
        enum clocksource_ids        cs_id;
        bool                        use_nsecs;
};

extern bool ktime_real_to_base_clock(ktime_t treal,
                                     enum clocksource_ids base_id, u64 *cycles);
extern bool timekeeping_clocksource_has_base(enum clocksource_ids id);

/*
 * Get cross timestamp between system clock and device clock
 */
extern int get_device_system_crosststamp(
                        int (*get_time_fn)(ktime_t *device_time,
                                struct system_counterval_t *system_counterval,
                                void *ctx),
                        void *ctx,
                        struct system_time_snapshot *history,
                        struct system_device_crosststamp *xtstamp);

/*
 * Simultaneously snapshot realtime and monotonic raw clocks
 */
extern void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot);

/*
 * Persistent clock related interfaces
 */
extern int persistent_clock_is_local;

extern void read_persistent_clock64(struct timespec64 *ts);
void read_persistent_wall_and_boot_offset(struct timespec64 *wall_clock,
                                          struct timespec64 *boot_offset);
#ifdef CONFIG_GENERIC_CMOS_UPDATE
extern int update_persistent_clock64(struct timespec64 now);
#endif

#endif













































































































































   22 








   22 






















































































   22 






   22 





   22 

   22 































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * internal.h - printk internal definitions
 */
#include <linux/console.h>
#include <linux/percpu.h>
#include <linux/types.h>

#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
struct ctl_table;
void __init printk_sysctl_init(void);
int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos);
#else
#define printk_sysctl_init() do { } while (0)
#endif

/*
 * Emit a printk at level @lvl whose message is prefixed with a description
 * of console @con: "legacy " unless CON_NBCON is set in its flags, "boot"
 * if CON_BOOT is set, then "console [<name><index>] ", followed by @fmt.
 *
 * @con is parenthesized at each use so that any pointer-valued expression
 * (e.g. &some_console) expands correctly.  Note that @con is evaluated
 * more than once, so it must not have side effects.
 */
#define con_printk(lvl, con, fmt, ...)                                \
        printk(lvl pr_fmt("%s%sconsole [%s%d] " fmt),                \
                ((con)->flags & CON_NBCON) ? "" : "legacy ",        \
                ((con)->flags & CON_BOOT) ? "boot" : "",        \
                (con)->name, (con)->index, ##__VA_ARGS__)

/*
 * Identify if legacy printing is forced in a dedicated kthread. If
 * true, all printing via console lock occurs within a dedicated
 * legacy printer thread. The only exception is on panic, after the
 * nbcon consoles have had their chance to print the panic messages
 * first.
 */
#ifdef CONFIG_PREEMPT_RT
# define force_legacy_kthread()        (true)
#else
# define force_legacy_kthread()        (false)
#endif

#ifdef CONFIG_PRINTK

#ifdef CONFIG_PRINTK_CALLER
#define PRINTK_PREFIX_MAX        48
#else
#define PRINTK_PREFIX_MAX        32
#endif

/*
 * the maximum size of a formatted record (i.e. with prefix added
 * per line and dropped messages or in extended message format)
 */
#define PRINTK_MESSAGE_MAX        2048

/* the maximum size allowed to be reserved for a record */
#define PRINTKRB_RECORD_MAX        1024

/*
 * Flags for a single printk record.
 *
 * Each value is a distinct single bit (presumably so that flags can be
 * OR-ed together); the value 4 is not assigned here.
 */
enum printk_info_flags {
        /* always show on console, ignore console_loglevel */
        LOG_FORCE_CON        = 1,
        LOG_NEWLINE        = 2,        /* text ended with a newline */
        LOG_CONT        = 8,        /* text is a fragment of a continuation line */
};

struct printk_ringbuffer;
struct dev_printk_info;

extern struct printk_ringbuffer *prb;
extern bool printk_kthreads_running;
extern bool debug_non_panic_cpus;

__printf(4, 0)
int vprintk_store(int facility, int level,
                  const struct dev_printk_info *dev_info,
                  const char *fmt, va_list args);

__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);

void __printk_safe_enter(void);
void __printk_safe_exit(void);

bool printk_percpu_data_ready(void);

/*
 * Enter a printk-safe section: first local_irq_save(@flags), then
 * __printk_safe_enter().  Must be paired with
 * printk_safe_exit_irqrestore() using the same @flags.
 */
#define printk_safe_enter_irqsave(flags)        \
        do {                                        \
                local_irq_save(flags);                \
                __printk_safe_enter();                \
        } while (0)

/*
 * Leave a printk-safe section in the reverse order of entry: first
 * __printk_safe_exit(), then local_irq_restore(@flags).
 */
#define printk_safe_exit_irqrestore(flags)        \
        do {                                        \
                __printk_safe_exit();                \
                local_irq_restore(flags);        \
        } while (0)

void defer_console_output(void);
bool is_printk_legacy_deferred(void);
bool is_printk_force_console(void);

u16 printk_parse_prefix(const char *text, int *level,
                        enum printk_info_flags *flags);
void console_lock_spinning_enable(void);
int console_lock_spinning_disable_and_check(int cookie);

u64 nbcon_seq_read(struct console *con);
void nbcon_seq_force(struct console *con, u64 seq);
bool nbcon_alloc(struct console *con);
void nbcon_free(struct console *con);
enum nbcon_prio nbcon_get_default_prio(void);
void nbcon_atomic_flush_pending(void);
bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
                                   int cookie, bool use_atomic);
bool nbcon_kthread_create(struct console *con);
void nbcon_kthread_stop(struct console *con);
void nbcon_kthreads_wake(void);

/*
 * Decide whether console @con, whose flags were sampled into @flags, is
 * currently capable of and allowed to print records.  @use_atomic selects
 * whether an nbcon console needs its write_atomic() callback.
 *
 * The current context is not considered here, although it can also play a
 * role in deciding if @con can be used to print records.
 */
static inline bool console_is_usable(struct console *con, short flags, bool use_atomic)
{
        /* The console must be enabled and must not be suspended. */
        if (!(flags & CON_ENABLED) || (flags & CON_SUSPENDED))
                return false;

        if (!(flags & CON_NBCON)) {
                /* Legacy consoles need a write() callback. */
                if (!con->write)
                        return false;
        } else if (use_atomic && !con->write_atomic) {
                /* The write_atomic() callback is optional. */
                return false;
        }

        /*
         * For the nbcon !use_atomic case, @printk_kthreads_running is not
         * checked because the write_thread() callback is also used via the
         * legacy loop when the printer threads are not available.
         */

        /*
         * Console drivers may assume that per-cpu resources have been
         * allocated. So unless they're explicitly marked as being able to
         * cope (CON_ANYTIME) don't call them until this CPU is officially up.
         */
        return cpu_online(raw_smp_processor_id()) || (flags & CON_ANYTIME);
}

/**
 * nbcon_kthread_wake - Wake up a console printing thread
 * @con:        Console to operate on
 *
 * Wakes the waiter registered on @con->rcuwait via rcuwait_wake_up().
 */
static inline void nbcon_kthread_wake(struct console *con)
{
        /*
         * Guarantee any new records can be seen by tasks preparing to wait
         * before this context checks if the rcuwait is empty.
         *
         * The full memory barrier in rcuwait_wake_up() pairs with the full
         * memory barrier within set_current_state() of
         * ___rcuwait_wait_event(), which is called after prepare_to_rcuwait()
         * adds the waiter but before it has checked the wait condition.
         *
         * This pairs with nbcon_kthread_func:A.
         */
        rcuwait_wake_up(&con->rcuwait); /* LMM(nbcon_kthread_wake:A) */
}

#else

#define PRINTK_PREFIX_MAX        0
#define PRINTK_MESSAGE_MAX        0
#define PRINTKRB_RECORD_MAX        0

#define printk_kthreads_running (false)

/*
 * In !PRINTK builds we still export console_sem
 * semaphore and some of console functions (console_unlock()/etc.), so
 * printk-safe must preserve the existing local IRQ guarantees.
 */
#define printk_safe_enter_irqsave(flags) local_irq_save(flags)
#define printk_safe_exit_irqrestore(flags) local_irq_restore(flags)

/*
 * Stubs used when CONFIG_PRINTK is disabled: each helper is either a no-op
 * or returns a fixed value (false, 0 or NBCON_PRIO_NONE), so callers need
 * no #ifdefs of their own.
 */
static inline bool printk_percpu_data_ready(void) { return false; }
static inline void defer_console_output(void) { }
static inline bool is_printk_legacy_deferred(void) { return false; }
static inline u64 nbcon_seq_read(struct console *con) { return 0; }
static inline void nbcon_seq_force(struct console *con, u64 seq) { }
static inline bool nbcon_alloc(struct console *con) { return false; }
static inline void nbcon_free(struct console *con) { }
static inline enum nbcon_prio nbcon_get_default_prio(void) { return NBCON_PRIO_NONE; }
static inline void nbcon_atomic_flush_pending(void) { }
static inline bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
                                                 int cookie, bool use_atomic) { return false; }
static inline void nbcon_kthread_wake(struct console *con) { }
static inline void nbcon_kthreads_wake(void) { }

/* Without printk no console is ever reported as usable. */
static inline bool console_is_usable(struct console *con, short flags,
                                     bool use_atomic) { return false; }

#endif /* CONFIG_PRINTK */

extern bool have_boot_console;
extern bool have_nbcon_console;
extern bool have_legacy_console;
extern bool legacy_allow_panic_sync;

/**
 * struct console_flush_type - Define available console flush methods
 * @nbcon_atomic:        Flush directly using the nbcon write_atomic() callback
 * @nbcon_offload:        Offload flush to printer thread
 * @legacy_direct:        Call the legacy loop in this context
 * @legacy_offload:        Offload the legacy loop into IRQ or legacy thread
 *
 * Filled in by printk_get_console_flush_type(), which clears every field
 * before setting the methods that apply.
 *
 * Note that the legacy loop also flushes the nbcon consoles.
 */
struct console_flush_type {
        bool        nbcon_atomic;
        bool        nbcon_offload;
        bool        legacy_direct;
        bool        legacy_offload;
};

/*
 * Identify which console flushing methods should be used in the context of
 * the caller.
 */
static inline void printk_get_console_flush_type(struct console_flush_type *ft)
{
        memset(ft, 0, sizeof(*ft));

        switch (nbcon_get_default_prio()) {
        case NBCON_PRIO_NORMAL:
                if (have_nbcon_console && !have_boot_console) {
                        if (printk_kthreads_running)
                                ft->nbcon_offload = true;
                        else
                                ft->nbcon_atomic = true;
                }

                /* Legacy consoles are flushed directly when possible. */
                if (have_legacy_console || have_boot_console) {
                        if (!is_printk_legacy_deferred())
                                ft->legacy_direct = true;
                        else
                                ft->legacy_offload = true;
                }
                break;

        case NBCON_PRIO_EMERGENCY:
                if (have_nbcon_console && !have_boot_console)
                        ft->nbcon_atomic = true;

                /* Legacy consoles are flushed directly when possible. */
                if (have_legacy_console || have_boot_console) {
                        if (!is_printk_legacy_deferred())
                                ft->legacy_direct = true;
                        else
                                ft->legacy_offload = true;
                }
                break;

        case NBCON_PRIO_PANIC:
                /*
                 * In panic, the nbcon consoles will directly print. But
                 * only allowed if there are no boot consoles.
                 */
                if (have_nbcon_console && !have_boot_console)
                        ft->nbcon_atomic = true;

                if (have_legacy_console || have_boot_console) {
                        /*
                         * This is the same decision as NBCON_PRIO_NORMAL
                         * except that offloading never occurs in panic.
                         *
                         * Note that console_flush_on_panic() will flush
                         * legacy consoles anyway, even if unsafe.
                         */
                        if (!is_printk_legacy_deferred())
                                ft->legacy_direct = true;

                        /*
                         * In panic, if nbcon atomic printing occurs,
                         * the legacy consoles must remain silent until
                         * explicitly allowed.
                         */
                        if (ft->nbcon_atomic && !legacy_allow_panic_sync)
                                ft->legacy_direct = false;
                }
                break;

        default:
                WARN_ON_ONCE(1);
                break;
        }
}

extern struct printk_buffers printk_shared_pbufs;

/**
 * struct printk_buffers - Buffers to read/format/output printk messages.
 * @outbuf:        After formatting, contains text to output.
 * @scratchbuf:        Used as temporary ringbuffer reading and string-print space.
 *
 * The buffer sizes are PRINTK_MESSAGE_MAX and PRINTKRB_RECORD_MAX bytes;
 * both constants are defined as 0 when CONFIG_PRINTK is disabled.
 */
struct printk_buffers {
        char        outbuf[PRINTK_MESSAGE_MAX];
        char        scratchbuf[PRINTKRB_RECORD_MAX];
};

/**
 * struct printk_message - Container for a prepared printk message.
 * @pbufs:        printk buffers used to prepare the message.
 * @outbuf_len:        The length of prepared text in @pbufs->outbuf to output. This
 *                does not count the terminator. A value of 0 means there is
 *                nothing to output and this record should be skipped.
 * @seq:        The sequence number of the record used for @pbufs->outbuf.
 * @dropped:        The number of dropped records from reading @seq.
 *
 * Only a pointer to the buffers is held here; the buffers themselves are
 * not embedded in this structure.
 */
struct printk_message {
        struct printk_buffers        *pbufs;
        unsigned int                outbuf_len;
        u64                        seq;
        unsigned long                dropped;
};

bool other_cpu_in_panic(void);
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
                             bool is_extended, bool may_supress);

#ifdef CONFIG_PRINTK
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
void console_prepend_replay(struct printk_message *pmsg);
#endif

/* Without CONFIG_SMP the check is stubbed out and always reports false. */
#ifdef CONFIG_SMP
bool is_printk_cpu_sync_owner(void);
#else
static inline bool is_printk_cpu_sync_owner(void) { return false; }
#endif







































































  703 



































































































































































































  672 




  188 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>

/* percpu_counter batch for local add or sub */
#define PERCPU_COUNTER_LOCAL_BATCH        INT_MAX

#ifdef CONFIG_SMP

/* CONFIG_SMP representation of a percpu_counter. */
struct percpu_counter {
        /* NOTE(review): presumably serializes updates of @count — confirm in lib/percpu_counter.c */
        raw_spinlock_t lock;
        /* Central value; this is what percpu_counter_read() returns. */
        s64 count;
#ifdef CONFIG_HOTPLUG_CPU
        struct list_head list;        /* All percpu_counters are on a list */
#endif
        /* Per-CPU s32 slots; percpu_counter_initialized() tests this for NULL. */
        s32 __percpu *counters;
};

extern int percpu_counter_batch;

int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key);

/*
 * Statement-expression wrapper around __percpu_counter_init_many().
 * Every expansion site gets its own static lock_class_key, whose address is
 * passed as the final argument. The expression evaluates to the int returned
 * by __percpu_counter_init_many().
 */
#define percpu_counter_init_many(fbc, value, gfp, nr_counters)                \
        ({                                                                \
                static struct lock_class_key __key;                        \
                                                                        \
                __percpu_counter_init_many(fbc, value, gfp, nr_counters,\
                                           &__key);                        \
        })


/* Single-counter convenience form: percpu_counter_init_many() with nr_counters == 1. */
#define percpu_counter_init(fbc, value, gfp)                                \
        percpu_counter_init_many(fbc, value, gfp, 1)

void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters);
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
        percpu_counter_destroy_many(fbc, 1);
}

void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
                              s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
bool __percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit,
                                  s64 amount, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

/* Compare @fbc against @rhs using the global default batch size. */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        int result = __percpu_counter_compare(fbc, rhs, percpu_counter_batch);

        return result;
}

/* Add @amount to @fbc using the global default batch size. */
static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        const s32 batch = percpu_counter_batch;

        percpu_counter_add_batch(fbc, amount, batch);
}

/*
 * Limited add with the global default batch size; the result is whatever
 * __percpu_counter_limited_add() reports.
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        const s32 batch = percpu_counter_batch;

        return __percpu_counter_limited_add(fbc, limit, amount, batch);
}

/*
 * With percpu_counter_add_local() and percpu_counter_sub_local(), counts
 * are accumulated in local per cpu counter and not in fbc->count until
 * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter
 * write efficient.
 * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be
 * used to add up the counts from each CPU to account for all the local
 * counts. So percpu_counter_add_local() and percpu_counter_sub_local()
 * should be used when a counter is updated frequently and read rarely.
 */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        /* Use the local batch threshold (PERCPU_COUNTER_LOCAL_BATCH). */
        const s32 local_batch = PERCPU_COUNTER_LOCAL_BATCH;

        percpu_counter_add_batch(fbc, amount, local_batch);
}

/* Sum of the counter as computed by __percpu_counter_sum(), clamped at zero. */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        s64 total = __percpu_counter_sum(fbc);

        if (total < 0)
                return 0;

        return total;
}

/* Thin wrapper returning the result of __percpu_counter_sum() unchanged. */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 total = __percpu_counter_sum(fbc);

        return total;
}

/* Return the central count field only. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        s64 central = fbc->count;

        return central;
}

/*
 * It is possible for the percpu_counter_read() to return a small negative
 * number for some counter which should never be negative.
 *
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* Load once so the sign test and the returned value cannot differ. */
        s64 snapshot = READ_ONCE(fbc->count);

        return snapshot < 0 ? 0 : snapshot;
}

/* A counter counts as initialized once its per-CPU pointer is non-NULL. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        if (fbc->counters == NULL)
                return false;

        return true;
}

#else /* !CONFIG_SMP */

/* !CONFIG_SMP representation: a single plain 64-bit count. */
struct percpu_counter {
        /* The counter value; read and written directly by the helpers below. */
        s64 count;
};

/*
 * Set the first @nr_counters entries of the @fbc array to @amount.
 * @gfp is unused in this variant. Always returns 0.
 */
static inline int percpu_counter_init_many(struct percpu_counter *fbc,
                                           s64 amount, gfp_t gfp,
                                           u32 nr_counters)
{
        u32 idx = 0;

        while (idx < nr_counters) {
                fbc[idx].count = amount;
                idx++;
        }

        return 0;
}

/* Initialize a single counter via percpu_counter_init_many(). */
static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
                                      gfp_t gfp)
{
        const u32 single = 1;

        return percpu_counter_init_many(fbc, amount, gfp, single);
}

/* !CONFIG_SMP stub: intentionally empty, both arguments are ignored. */
static inline void percpu_counter_destroy_many(struct percpu_counter *fbc,
                                               u32 nr_counters)
{
}

/* !CONFIG_SMP stub: intentionally empty, @fbc is ignored. */
static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

/* Overwrite the counter value with @amount. */
static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        s64 *slot = &fbc->count;

        *slot = amount;
}

/* Three-way compare of the counter against @rhs: 1 if greater, -1 if less, 0 if equal. */
static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
        const s64 current_count = fbc->count;

        if (current_count == rhs)
                return 0;

        return current_count > rhs ? 1 : -1;
}

/* @batch is ignored in this variant; this is plain percpu_counter_compare(). */
static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        int result = percpu_counter_compare(fbc, rhs);

        return result;
}

/* Add @amount to the counter with local interrupts disabled around the update. */
static inline void
percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irqflags;

        local_irq_save(irqflags);
        fbc->count = fbc->count + amount;
        local_irq_restore(irqflags);
}

/*
 * Apply @amount only if the resulting count stays on the permitted side of
 * @limit: at or below it for a positive @amount, at or above it for a
 * negative one. Returns true if the add was applied (or @amount is 0).
 */
static inline bool
percpu_counter_limited_add(struct percpu_counter *fbc, s64 limit, s64 amount)
{
        unsigned long irqflags;
        bool accepted;
        s64 proposed;

        /* A zero add trivially succeeds and touches nothing. */
        if (!amount)
                return true;

        local_irq_save(irqflags);
        proposed = fbc->count + amount;
        /* amount is non-zero here, so the else branch means amount < 0. */
        if (amount > 0)
                accepted = proposed <= limit;
        else
                accepted = proposed >= limit;
        if (accepted)
                fbc->count = proposed;
        local_irq_restore(irqflags);

        return accepted;
}

/* non-SMP percpu_counter_add_local is the same as percpu_counter_add */
static inline void
percpu_counter_add_local(struct percpu_counter *fbc, s64 amount)
{
        /* No separate local path in this variant: forward to the plain add. */
        percpu_counter_add(fbc, amount);
}

/* @batch is ignored in this variant; the add is applied directly. */
static inline void
percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        percpu_counter_add(fbc, amount);
}

/* Return the current counter value. */
static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
        s64 value = fbc->count;

        return value;
}

/*
 * percpu_counter is intended to track positive numbers. In the UP case the
 * number should never be negative.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
        /* Returned as-is: no clamping is applied in this variant. */
        s64 value = fbc->count;

        return value;
}

/* Same value as percpu_counter_read_positive() in this variant. */
static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
        s64 value = percpu_counter_read_positive(fbc);

        return value;
}

/* Same value as percpu_counter_read() in this variant. */
static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 value = percpu_counter_read(fbc);

        return value;
}

/* This variant unconditionally reports the counter as initialized. */
static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
        const bool initialized = true;

        return initialized;
}

/* !CONFIG_SMP stub: intentionally empty, @fbc is ignored. */
static inline void percpu_counter_sync(struct percpu_counter *fbc)
{
}
#endif        /* CONFIG_SMP */

static inline void percpu_counter_inc(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, 1);
}

static inline void percpu_counter_dec(struct percpu_counter *fbc)
{
        percpu_counter_add(fbc, -1);
}

/* Subtract @amount by adding its negation. */
static inline void percpu_counter_sub(struct percpu_counter *fbc, s64 amount)
{
        const s64 negated = -amount;

        percpu_counter_add(fbc, negated);
}

/* Local-batch subtract: percpu_counter_add_local() with the negated amount. */
static inline void
percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount)
{
        const s64 negated = -amount;

        percpu_counter_add_local(fbc, negated);
}

#endif /* _LINUX_PERCPU_COUNTER_H */

















































 1511 









 1508 
















 1509 


 1517 




 1508 





















 1512 

































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Common arm64 stack unwinder code.
 *
 * See: arch/arm64/kernel/stacktrace.c for the reference implementation.
 *
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_STACKTRACE_COMMON_H
#define __ASM_STACKTRACE_COMMON_H

#include <linux/types.h>

struct stack_info {
        unsigned long low;
        unsigned long high;
};

/**
 * struct unwind_state - state used for robust unwinding.
 *
 * @fp:          The fp value in the frame record (or the real fp)
 * @pc:          The lr value in the frame record (or the real lr)
 *
 * @stack:       The stack currently being unwound.
 * @stacks:      An array of stacks which can be unwound.
 * @nr_stacks:   The number of stacks in @stacks.
 */
struct unwind_state {
        /* Address of the frame record that unwind_next_frame_record() reads next. */
        unsigned long fp;
        /* Loaded from the lr slot of the most recently consumed frame record. */
        unsigned long pc;

        /* Current stack; reset to "unknown" by unwind_init_common(). */
        struct stack_info stack;
        /* Candidate stacks searched by unwind_find_stack() after @stack. */
        struct stack_info *stacks;
        /* Number of entries in @stacks. */
        int nr_stacks;
};

static inline struct stack_info stackinfo_get_unknown(void)
{
        return (struct stack_info) {
                .low = 0,
                .high = 0,
        };
}

/*
 * Report whether the object [sp, sp + size) lies entirely within @info.
 * An entry with a zero lower bound never matches.
 */
static inline bool stackinfo_on_stack(const struct stack_info *info,
                                      unsigned long sp, unsigned long size)
{
        const unsigned long end = sp + size;

        if (info->low == 0)
                return false;

        /* end < sp means sp + size wrapped around. */
        return sp >= info->low && end >= sp && end <= info->high;
}

static inline void unwind_init_common(struct unwind_state *state)
{
        state->stack = stackinfo_get_unknown();
}

/**
 * unwind_find_stack() - Find the accessible stack which entirely contains an
 * object.
 *
 * @state: the current unwind state.
 * @sp:    the base address of the object.
 * @size:  the size of the object.
 *
 * Return: a pointer to the relevant stack_info if found; NULL otherwise.
 */
/*
 * Defined in a header, so it is marked inline like the other helpers here.
 * The current stack (state->stack) is tried first; only if the object is not
 * on it are the entries of state->stacks checked, in array order.
 */
static inline struct stack_info *unwind_find_stack(struct unwind_state *state,
                                                   unsigned long sp,
                                                   unsigned long size)
{
        struct stack_info *info = &state->stack;

        /* Fast path: the object is on the stack already being unwound. */
        if (stackinfo_on_stack(info, sp, size))
                return info;

        for (int i = 0; i < state->nr_stacks; i++) {
                info = &state->stacks[i];
                if (stackinfo_on_stack(info, sp, size))
                        return info;
        }

        /* No accessible stack entirely contains the object. */
        return NULL;
}

/**
 * unwind_consume_stack() - Update stack boundaries so that future unwind steps
 * cannot consume this object again.
 *
 * @state: the current unwind state.
 * @info:  the stack_info of the stack containing the object.
 * @sp:    the base address of the object.
 * @size:  the size of the object.
 *
 * Return: nothing; @state and @info are updated in place.
 */
static inline void unwind_consume_stack(struct unwind_state *state,
                                        struct stack_info *info,
                                        unsigned long sp,
                                        unsigned long size)
{
        struct stack_info tmp;

        /*
         * Stack transitions are strictly one-way, and once we've
         * transitioned from one stack to another, it's never valid to
         * unwind back to the old stack.
         *
         * Destroy the old stack info so that it cannot be found upon a
         * subsequent transition. If the stack has not changed, we'll
         * immediately restore the current stack info.
         *
         * Note that stacks can nest in several valid orders, e.g.
         *
         *   TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL
         *   TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW
         *   HYP -> OVERFLOW
         *
         * ... so we do not check the specific order of stack
         * transitions.
         */
        /*
         * Copy first: @info may point at state->stack itself, in which case
         * the clear below would otherwise lose the bounds being restored.
         */
        tmp = *info;
        *info = stackinfo_get_unknown();
        state->stack = tmp;

        /*
         * Future unwind steps can only consume stack above this frame record.
         * Update the current stack to start immediately above it.
         */
        state->stack.low = sp + size;
}

/**
 * unwind_next_frame_record() - Unwind to the next frame record.
 *
 * @state:        the current unwind state.
 *
 * Return: 0 upon success, an error code otherwise.
 */
static inline int
unwind_next_frame_record(struct unwind_state *state)
{
        const unsigned long fp = state->fp;
        struct frame_record *record;
        struct stack_info *stack;

        /* Reject a frame pointer whose low three bits are not all clear. */
        if ((fp & 0x7) != 0)
                return -EINVAL;

        /* The whole record must sit on one accessible stack. */
        stack = unwind_find_stack(state, fp, sizeof(*record));
        if (!stack)
                return -EINVAL;

        /* Mark the record as consumed before reading it. */
        unwind_consume_stack(state, stack, fp, sizeof(*record));

        /* Pick up the next fp/pc pair from the record. */
        record = (struct frame_record *)fp;
        state->fp = READ_ONCE(record->fp);
        state->pc = READ_ONCE(record->lr);

        return 0;
}

#endif        /* __ASM_STACKTRACE_COMMON_H */

















































   77 
















    4 














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIME64_H
#define _LINUX_TIME64_H

#include <linux/math64.h>
#include <vdso/time64.h>

typedef __s64 time64_t;
typedef __u64 timeu64_t;

#include <uapi/linux/time.h>

/* A point in time or interval as 64-bit seconds plus a nanosecond part. */
struct timespec64 {
        time64_t        tv_sec;                        /* seconds */
        long                tv_nsec;                /* nanoseconds */
};

/*
 * Pair of timespec64 values for an interval timer.
 * NOTE(review): presumably mirrors struct itimerspec (reload interval and
 * time until next expiry) — confirm against the timer code.
 */
struct itimerspec64 {
        struct timespec64 it_interval;
        struct timespec64 it_value;
};

/* Parameters used to convert the timespec values: */
#define PSEC_PER_NSEC                        1000L

/* Located here for timespec[64]_valid_strict */
/* Largest s64 value (all bits set except bit 63) and its negative counterpart. */
#define TIME64_MAX                        ((s64)~((u64)1 << 63))
#define TIME64_MIN                        (-TIME64_MAX - 1)

/* Same s64 limits, plus those limits divided by NSEC_PER_SEC for whole seconds. */
#define KTIME_MAX                        ((s64)~((u64)1 << 63))
#define KTIME_MIN                        (-KTIME_MAX - 1)
#define KTIME_SEC_MAX                        (KTIME_MAX / NSEC_PER_SEC)
#define KTIME_SEC_MIN                        (KTIME_MIN / NSEC_PER_SEC)

/*
 * Limits for settimeofday():
 *
 * To prevent setting the time close to the wraparound point, time setting
 * is limited so a reasonable uptime can be accommodated. Uptime of 30 years
 * should be really sufficient, which means the cutoff is 2232. At that
 * point the cutoff is just a small part of the larger problem.
 */
#define TIME_UPTIME_SEC_MAX                (30LL * 365 * 24 *3600)
#define TIME_SETTOD_SEC_MAX                (KTIME_SEC_MAX - TIME_UPTIME_SEC_MAX)

/* Returns 1 when both the seconds and nanoseconds fields match, 0 otherwise. */
static inline int timespec64_equal(const struct timespec64 *a,
                                   const struct timespec64 *b)
{
        if (a->tv_sec != b->tv_sec)
                return 0;

        return a->tv_nsec == b->tv_nsec;
}

/* True only when both fields are zero. */
static inline bool timespec64_is_epoch(const struct timespec64 *ts)
{
        if (ts->tv_sec != 0)
                return false;

        return ts->tv_nsec == 0;
}

/*
 * lhs < rhs:  return <0
 * lhs == rhs: return 0
 * lhs > rhs:  return >0
 */
static inline int timespec64_compare(const struct timespec64 *lhs, const struct timespec64 *rhs)
{
        /* Seconds dominate; only fall through to nanoseconds on a tie. */
        if (lhs->tv_sec < rhs->tv_sec)
                return -1;
        if (lhs->tv_sec > rhs->tv_sec)
                return 1;

        /*
         * Compare the nanosecond fields instead of subtracting them: the
         * difference of two longs does not necessarily fit in the int return
         * type, so for denormalized inputs a subtraction could be truncated
         * to 0 or to a value with the wrong sign.
         */
        if (lhs->tv_nsec < rhs->tv_nsec)
                return -1;
        if (lhs->tv_nsec > rhs->tv_nsec)
                return 1;

        return 0;
}

extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);

static inline struct timespec64 timespec64_add(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec + rhs.tv_sec,
                                lhs.tv_nsec + rhs.tv_nsec);
        return ts_delta;
}

/*
 * sub = lhs - rhs, in normalized form
 */
static inline struct timespec64 timespec64_sub(struct timespec64 lhs,
                                                struct timespec64 rhs)
{
        struct timespec64 ts_delta;
        set_normalized_timespec64(&ts_delta, lhs.tv_sec - rhs.tv_sec,
                                lhs.tv_nsec - rhs.tv_nsec);
        return ts_delta;
}

/*
 * Returns true if the timespec64 is normalized, false if it is denormalized:
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
        /*
         * Two conditions must hold: the seconds are not negative (dates
         * before 1970 are rejected), and the nanoseconds, viewed as
         * unsigned, are less than a full second.
         */
        return ts->tv_sec >= 0 &&
               (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
}

/* timespec64_valid() plus a bound that keeps the value representable as ktime_t. */
static inline bool timespec64_valid_strict(const struct timespec64 *ts)
{
        /* Seconds at or beyond KTIME_SEC_MAX could overflow ktime_t. */
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < KTIME_SEC_MAX;
}

/* timespec64_valid() plus the TIME_SETTOD_SEC_MAX upper bound on seconds. */
static inline bool timespec64_valid_settod(const struct timespec64 *ts)
{
        /* Larger values cause overflow issues vs. CLOCK_REALTIME. */
        return timespec64_valid(ts) &&
               (unsigned long long)ts->tv_sec < TIME_SETTOD_SEC_MAX;
}

/**
 * timespec64_to_ns - Convert timespec64 to nanoseconds
 * @ts:                pointer to the timespec64 variable to be converted
 *
 * Returns the scalar nanosecond representation of the timespec64
 * parameter.
 */
static inline s64 timespec64_to_ns(const struct timespec64 *ts)
{
        /* Prevent multiplication overflow / underflow */
        if (ts->tv_sec >= KTIME_SEC_MAX)
                return KTIME_MAX;

        if (ts->tv_sec <= KTIME_SEC_MIN)
                return KTIME_MIN;

        return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
}

/**
 * ns_to_timespec64 - Convert nanoseconds to timespec64
 * @nsec:        the nanoseconds value to be converted
 *
 * Returns the timespec64 representation of the nsec parameter.
 */
extern struct timespec64 ns_to_timespec64(s64 nsec);

/**
 * timespec64_add_ns - Adds nanoseconds to a timespec64
 * @a:                pointer to timespec64 to be incremented
 * @ns:                unsigned nanoseconds value to be added
 *
 * This must always be inlined because it's used from the x86-64 vdso,
 * which cannot call other kernel functions.
 */
static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
{
        /* Combined nanoseconds: the existing fraction plus the increment. */
        const u64 total_ns = a->tv_nsec + ns;
        u64 leftover_ns;

        /* The quotient goes to seconds, the remainder becomes tv_nsec. */
        a->tv_sec += __iter_div_u64_rem(total_ns, NSEC_PER_SEC, &leftover_ns);
        a->tv_nsec = leftover_ns;
}

/*
 * timespec64_add_safe assumes both values are positive and checks for
 * overflow. It will return TIME64_MAX in case of overflow.
 */
extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
                                         const struct timespec64 rhs);

#endif /* _LINUX_TIME64_H */







































  135 


  135 

































  212 





  135 




  136 









  134 








  135 






















  213 
















  185 
    1 












  213 
















   23 
  110 




  213 










  213 


  213 


  212 





  136 

  136 


  134 
   14 


  213 







  213 







  213 


  190 


  213 



















  213 












  213 








  213 



  213 
  213 











    1 










    1 












    1 

    1 



































































































































































































































































  246 



  246 




































  246 















  136 




   68 








   97 

   97 










  134 


















  135 











  135 
























































  134 


















  135 











  135 












  134 





  135 









  135 


  135 
































   14 

   14 











  136 





















  110 





  135 




  133 






  135 



  135 









    1 






















  135 
  135 




  135 

  135 








  110 


  110 


















  135 






















  134 
  134 















  135 




  110 

  135 








  135 





  134 
















  134 































   45 







   44 







   14 

   14 

   14 
   14 







   14 

   14 



   14 
   14 





   45 










   45 













   75 




   68 












    1 





    7 



    7 













    7 






















   68 




































































    7 
















    7 

    7 

































































































































































































  246 




  246 





























   24 


   24 











   24 







   24 

   24 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stand-alone page-table allocator for hyp stage-1 and guest stage-2.
 * No bombay mix was harmed in the writing of this file.
 *
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#include <linux/bitfield.h>
#include <asm/kvm_pgtable.h>
#include <asm/stage2_pgtable.h>

/* Per-walk bookkeeping: the walker plus the address range being walked. */
struct kvm_pgtable_walk_data {
        /* Walker associated with this walk. */
        struct kvm_pgtable_walker        *walker;

        /* NOTE(review): presumably the first address of the walk — confirm at the init sites. */
        const u64                        start;
        /* Current address; kvm_pgtable_idx() derives the table index from it. */
        u64                                addr;
        /* NOTE(review): presumably the exclusive end of the walk — confirm at the init sites. */
        const u64                        end;
};

/* True when the walk flags carry KVM_PGTABLE_WALK_SKIP_BBM_TLBI (hinted as unlikely). */
static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx)
{
        const bool skip = ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI;

        return unlikely(skip);
}

/* True when the walk flags carry KVM_PGTABLE_WALK_SKIP_CMO (hinted as unlikely). */
static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx)
{
        const bool skip = ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO;

        return unlikely(skip);
}

/*
 * Decide whether a block mapping of @phys can be installed at the current
 * level: the level must allow block mappings, a full granule must fit in
 * the remaining range, and both @phys and the current address must be
 * granule-aligned.
 */
static bool kvm_block_mapping_supported(const struct kvm_pgtable_visit_ctx *ctx, u64 phys)
{
        const u64 granule = kvm_granule_size(ctx->level);

        if (!kvm_level_supports_block_mapping(ctx->level))
                return false;

        /* Not enough of the range left to cover a whole granule. */
        if ((ctx->end - ctx->addr) < granule)
                return false;

        return IS_ALIGNED(phys, granule) && IS_ALIGNED(ctx->addr, granule);
}

/*
 * Index of the entry covering data->addr within a table at @level: the
 * address is shifted down by the level's granule shift and the low
 * (PAGE_SHIFT - 3) bits are kept.
 */
static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, s8 level)
{
        u64 shifted = data->addr >> kvm_granule_shift(level);

        return shifted & (BIT(PAGE_SHIFT - 3) - 1);
}

/*
 * Index of the PGD page that covers @addr. The address is first truncated
 * to pgt->ia_bits and then shifted by the granule shift of the level above
 * the start level.
 */
static u32 kvm_pgd_page_idx(struct kvm_pgtable *pgt, u64 addr)
{
        u64 ia_mask = BIT(pgt->ia_bits) - 1;
        u64 in_range = addr & ia_mask;

        /* start_level - 1 may underflow */
        return in_range >> kvm_granule_shift(pgt->start_level - 1);
}

/*
 * Number of PGD pages for the given input-address width and start level:
 * one more than the PGD page index of the highest possible address.
 */
static u32 kvm_pgd_pages(u32 ia_bits, s8 start_level)
{
        struct kvm_pgtable probe = {
                .start_level = start_level,
                .ia_bits = ia_bits,
        };
        u32 last_idx = kvm_pgd_page_idx(&probe, -1ULL);

        return last_idx + 1;
}

/*
 * True when @pte, found at @level, is a valid table descriptor. Entries at
 * the last level are never tables, and invalid entries are never tables.
 */
static bool kvm_pte_table(kvm_pte_t pte, s8 level)
{
        bool candidate = level != KVM_PGTABLE_LAST_LEVEL && kvm_pte_valid(pte);

        return candidate && FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE;
}

/*
 * Translate the physical address encoded in @pte into a pointer, using the
 * phys_to_virt() hook of @mm_ops.
 */
static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops)
{
        kvm_pte_t *target = mm_ops->phys_to_virt(kvm_pte_to_phys(pte));

        return target;
}

/* Zero the entry at @ptep with a single WRITE_ONCE() store. */
static void kvm_clear_pte(kvm_pte_t *ptep)
{
        kvm_pte_t cleared = 0;

        WRITE_ONCE(*ptep, cleared);
}

/*
 * Build a valid table descriptor pointing at the table page @childp. The
 * page's physical address comes from the virt_to_phys() hook of @mm_ops.
 */
static kvm_pte_t kvm_init_table_pte(kvm_pte_t *childp, struct kvm_pgtable_mm_ops *mm_ops)
{
        kvm_pte_t desc = kvm_phys_to_pte(mm_ops->virt_to_phys(childp));

        return desc | FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE) | KVM_PTE_VALID;
}

/*
 * Build a valid leaf descriptor mapping @pa with the leaf attribute bits
 * taken from @attr. The descriptor type is PAGE at the last level and BLOCK
 * at any other level. Bits of @attr outside the leaf attribute fields are
 * dropped.
 */
static kvm_pte_t kvm_init_valid_leaf_pte(u64 pa, kvm_pte_t attr, s8 level)
{
        kvm_pte_t leaf;
        u64 type;

        if (level == KVM_PGTABLE_LAST_LEVEL)
                type = KVM_PTE_TYPE_PAGE;
        else
                type = KVM_PTE_TYPE_BLOCK;

        leaf = kvm_phys_to_pte(pa) | KVM_PTE_VALID;
        leaf |= FIELD_PREP(KVM_PTE_TYPE, type);
        leaf |= attr & (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI);

        return leaf;
}

/*
 * Build an invalid entry that carries only @owner_id, placed in the
 * KVM_INVALID_PTE_OWNER_MASK field.
 */
static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id)
{
        kvm_pte_t annotation = FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id);

        return annotation;
}

/*
 * Invoke the walker callback for one visit of one entry and hand back its
 * return code unchanged.
 */
static int kvm_pgtable_visitor_cb(struct kvm_pgtable_walk_data *data,
                                  const struct kvm_pgtable_visit_ctx *ctx,
                                  enum kvm_pgtable_walk_flags visit)
{
        /* A shared walk must hold the walk lock (e.g. RCU lock for stage-2 MMU) */
        WARN_ON_ONCE(kvm_pgtable_walk_shared(ctx) && !kvm_pgtable_walk_lock_held());

        return data->walker->cb(ctx, visit);
}

/*
 * Decide from a visitor return code @r whether the walk should carry on.
 *
 * Any code other than -EAGAIN continues the walk only when it is zero.
 *
 * -EAGAIN is what visitor callbacks return when a race to update a PTE
 * means the page tables no longer reflect the conditions that led to a
 * fault. A walker flagged KVM_PGTABLE_WALK_HANDLE_FAULT stops on it, which
 * is taken as a signal to retry guest execution. Every other walker (e.g.
 * one write-protecting a range of memory) disregards it and keeps going.
 */
static bool kvm_pgtable_walk_continue(const struct kvm_pgtable_walker *walker,
                                      int r)
{
        if (r != -EAGAIN)
                return r == 0;

        return (walker->flags & KVM_PGTABLE_WALK_HANDLE_FAULT) == 0;
}

static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
                              struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level);

/*
 * Visit a single page-table entry: invoke the walker callback for whichever
 * of the TABLE_PRE, LEAF and TABLE_POST visits the walker's flags request,
 * and recurse into the next level when the entry is (or has become) a table.
 *
 * Returns 0 when the walk should carry on, as decided by
 * kvm_pgtable_walk_continue(), otherwise the code that stopped it.
 */
static inline int __kvm_pgtable_visit(struct kvm_pgtable_walk_data *data,
                                      struct kvm_pgtable_mm_ops *mm_ops,
                                      kvm_pteref_t pteref, s8 level)
{
        enum kvm_pgtable_walk_flags flags = data->walker->flags;
        kvm_pte_t *ptep = kvm_dereference_pteref(data->walker, pteref);
        /* Snapshot of the entry and walk position handed to the callbacks. */
        struct kvm_pgtable_visit_ctx ctx = {
                .ptep        = ptep,
                .old        = READ_ONCE(*ptep),
                .arg        = data->walker->arg,
                .mm_ops        = mm_ops,
                .start        = data->start,
                .addr        = data->addr,
                .end        = data->end,
                .level        = level,
                .flags        = flags,
        };
        int ret = 0;
        bool reload = false;
        kvm_pteref_t childp;
        bool table = kvm_pte_table(ctx.old, level);

        if (table && (ctx.flags & KVM_PGTABLE_WALK_TABLE_PRE)) {
                ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_PRE);
                reload = true;
        }

        if (!table && (ctx.flags & KVM_PGTABLE_WALK_LEAF)) {
                ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_LEAF);
                reload = true;
        }

        /*
         * Reload the page table after invoking the walker callback for leaf
         * entries or after pre-order traversal, to allow the walker to descend
         * into a newly installed or replaced table.
         */
        if (reload) {
                ctx.old = READ_ONCE(*ptep);
                table = kvm_pte_table(ctx.old, level);
        }

        if (!kvm_pgtable_walk_continue(data->walker, ret))
                goto out;

        /* Not a table: step the cursor to the start of the next entry at this level. */
        if (!table) {
                data->addr = ALIGN_DOWN(data->addr, kvm_granule_size(level));
                data->addr += kvm_granule_size(level);
                goto out;
        }

        /* Table entry: walk the next-level table it points to. */
        childp = (kvm_pteref_t)kvm_pte_follow(ctx.old, mm_ops);
        ret = __kvm_pgtable_walk(data, mm_ops, childp, level + 1);
        if (!kvm_pgtable_walk_continue(data->walker, ret))
                goto out;

        if (ctx.flags & KVM_PGTABLE_WALK_TABLE_POST)
                ret = kvm_pgtable_visitor_cb(data, &ctx, KVM_PGTABLE_WALK_TABLE_POST);

out:
        /* Codes that kvm_pgtable_walk_continue() tolerates are reported as success. */
        if (kvm_pgtable_walk_continue(data->walker, ret))
                return 0;

        return ret;
}

/*
 * Walk the entries of one table page at @level, starting from the entry
 * that covers data->addr. Stops when the table is exhausted, when the
 * cursor reaches data->end, or when a visit returns non-zero.
 *
 * Returns -EINVAL (with a one-time warning) for an out-of-range @level,
 * otherwise the result of the last visit, or 0 if nothing was visited.
 */
static int __kvm_pgtable_walk(struct kvm_pgtable_walk_data *data,
                              struct kvm_pgtable_mm_ops *mm_ops, kvm_pteref_t pgtable, s8 level)
{
        int ret = 0;
        u32 idx;

        if (WARN_ON_ONCE(level < KVM_PGTABLE_FIRST_LEVEL ||
                         level > KVM_PGTABLE_LAST_LEVEL))
                return -EINVAL;

        idx = kvm_pgtable_idx(data, level);
        while (idx < PTRS_PER_PTE && data->addr < data->end) {
                ret = __kvm_pgtable_visit(data, mm_ops, &pgtable[idx], level);
                if (ret)
                        break;

                idx++;
        }

        return ret;
}

/*
 * Walk the range described by @data, starting from the PGD page that covers
 * data->addr and moving to the next PGD page until the cursor reaches
 * data->end or a table walk returns non-zero.
 *
 * Returns -ERANGE when the cursor or the end lies beyond BIT(pgt->ia_bits),
 * -EINVAL when there is no PGD, otherwise the result of the last table walk.
 */
static int _kvm_pgtable_walk(struct kvm_pgtable *pgt, struct kvm_pgtable_walk_data *data)
{
        u64 limit = BIT(pgt->ia_bits);
        int ret = 0;
        u32 idx;

        if (data->addr > limit || data->end > limit)
                return -ERANGE;

        if (!pgt->pgd)
                return -EINVAL;

        idx = kvm_pgd_page_idx(pgt, data->addr);
        while (data->addr < data->end) {
                ret = __kvm_pgtable_walk(data, pgt->mm_ops,
                                         &pgt->pgd[idx * PTRS_PER_PTE],
                                         pgt->start_level);
                if (ret)
                        break;

                idx++;
        }

        return ret;
}

/*
 * Walk the page table @pgt over [@addr, @addr + @size), invoking @walker for
 * each visited entry.
 *
 * @addr is rounded down to a page boundary and the end of the range is
 * rounded up to one. The walk is bracketed by kvm_pgtable_walk_begin() and
 * kvm_pgtable_walk_end().
 *
 * Returns the non-zero result of kvm_pgtable_walk_begin() if it fails,
 * otherwise the result of the walk itself.
 */
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
                     struct kvm_pgtable_walker *walker)
{
        /*
         * Compute the aligned start once, up front. The initializer below
         * must not read walk_data's own members: initializer expressions are
         * indeterminately sequenced, so a member may not be set yet when a
         * sibling initializer reads it.
         */
        u64 start = ALIGN_DOWN(addr, PAGE_SIZE);
        struct kvm_pgtable_walk_data walk_data = {
                .start        = start,
                .addr        = start,
                .end        = PAGE_ALIGN(start + size),
                .walker        = walker,
        };
        int r;

        r = kvm_pgtable_walk_begin(walker);
        if (r)
                return r;

        r = _kvm_pgtable_walk(pgt, &walk_data);
        kvm_pgtable_walk_end(walker);

        return r;
}

/* Result slot filled in by leaf_walker() for kvm_pgtable_get_leaf(). */
struct leaf_walk_data {
        /* Value of the entry the walker visited. */
        kvm_pte_t        pte;
        /* Page-table level at which that entry was found. */
        s8                level;
};

/*
 * Walker callback that records the visited entry's value and level in the
 * struct leaf_walk_data passed as the walker argument. Always returns 0.
 */
static int leaf_walker(const struct kvm_pgtable_visit_ctx *ctx,
                       enum kvm_pgtable_walk_flags visit)
{
        struct leaf_walk_data *out = ctx->arg;

        out->level = ctx->level;
        out->pte = ctx->old;

        return 0;
}

/*
 * Look up the leaf entry covering @addr in @pgt by walking the single page
 * that contains it.
 *
 * On success returns 0 and stores the entry value in @ptep and its level in
 * @level; either pointer may be NULL to skip that output. On failure returns
 * the walk's error code and leaves both outputs untouched.
 */
int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
                         kvm_pte_t *ptep, s8 *level)
{
        struct leaf_walk_data data;
        struct kvm_pgtable_walker walker = {
                .cb = leaf_walker,
                .flags = KVM_PGTABLE_WALK_LEAF,
                .arg = &data,
        };
        int ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE),
                                   PAGE_SIZE, &walker);

        if (ret)
                return ret;

        if (ptep)
                *ptep = data.pte;
        if (level)
                *level = data.level;

        return 0;
}

/* Walker argument for the hyp stage-1 map walk. */
struct hyp_map_data {
        /* Physical address backing the start of the walked range. */
        const u64                        phys;
        /* Leaf attributes applied to every entry installed by the walk. */
        kvm_pte_t                        attr;
};

/*
 * Convert @prot into hyp stage-1 leaf attributes and store them in @ptep.
 *
 * Returns -EINVAL, leaving @ptep untouched, when @prot is not readable, or
 * when it is executable and also writable or device memory. Returns 0 on
 * success.
 */
static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep)
{
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
        bool writable = prot & KVM_PGTABLE_PROT_W;
        bool exec = prot & KVM_PGTABLE_PROT_X;
        u32 sh = KVM_PTE_LEAF_ATTR_LO_S1_SH_IS;
        u32 mtype, ap;
        kvm_pte_t attr;

        if (!(prot & KVM_PGTABLE_PROT_R))
                return -EINVAL;

        if (exec && (writable || device))
                return -EINVAL;

        mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL;
        attr = FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, mtype);

        /* Non-executable mappings get XN; executable ones get GP when BTI is in use. */
        if (!exec)
                attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN;
        else if (system_supports_bti_kernel())
                attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP;

        if (writable)
                ap = KVM_PTE_LEAF_ATTR_LO_S1_AP_RW;
        else
                ap = KVM_PTE_LEAF_ATTR_LO_S1_AP_RO;
        attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap);

        /* The shareability field is only filled in when LPA2 is not enabled. */
        if (!kvm_lpa2_is_enabled())
                attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh);

        attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF;
        attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;

        *ptep = attr;
        return 0;
}

/*
 * Decode the protection encoded in a hyp stage-1 @pte. The software bits
 * are always returned; the X, R and W bits are only decoded for a valid
 * entry.
 */
enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte)
{
        enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;
        u32 ap;

        if (!kvm_pte_valid(pte))
                return prot;

        ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte);
        if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO)
                prot |= KVM_PGTABLE_PROT_R;
        else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW)
                prot |= KVM_PGTABLE_PROT_RW;

        /* Executable unless XN is set */
        if ((pte & KVM_PTE_LEAF_ATTR_HI_S1_XN) == 0)
                prot |= KVM_PGTABLE_PROT_X;

        return prot;
}

/*
 * Try to install a leaf entry at the current level for the hyp mapping
 * described by @data.
 *
 * Returns true when the entry already holds, or now holds, the wanted
 * mapping. Returns false when a leaf cannot be placed at this level, or when
 * a valid entry differs from the wanted one in more than its software bits.
 */
static bool hyp_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
                                    struct hyp_map_data *data)
{
        /* PA for this entry: base PA plus how far the walk has progressed. */
        u64 phys = data->phys + (ctx->addr - ctx->start);
        kvm_pte_t new;

        if (!kvm_block_mapping_supported(ctx, phys))
                return false;

        new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
        if (ctx->old == new)
                return true;
        /* Populating a previously invalid entry takes a reference on its table page. */
        if (!kvm_pte_valid(ctx->old))
                ctx->mm_ops->get_page(ctx->ptep);
        /* A live entry may only be rewritten if just the software bits change. */
        else if (WARN_ON((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW))
                return false;

        smp_store_release(ctx->ptep, new);
        return true;
}

/*
 * Walker callback for kvm_pgtable_hyp_map(), registered for LEAF visits.
 *
 * First tries to map the current entry as a leaf. If that is not possible,
 * allocates a new table page and installs it in the entry; the generic walk
 * code reloads the entry after the callback, so it can then descend into
 * the new table.
 *
 * Returns 0 on success, -EINVAL if a leaf could not be installed at the
 * last level, or -ENOMEM if the table page could not be allocated.
 */
static int hyp_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
                          enum kvm_pgtable_walk_flags visit)
{
        kvm_pte_t *childp, new;
        struct hyp_map_data *data = ctx->arg;
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        if (hyp_map_walker_try_leaf(ctx, data))
                return 0;

        /* There is no level below the last one to fall back to. */
        if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
                return -EINVAL;

        childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL);
        if (!childp)
                return -ENOMEM;

        new = kvm_init_table_pte(childp, mm_ops);
        /* The new table entry is accounted against the page that contains it. */
        mm_ops->get_page(ctx->ptep);
        smp_store_release(ctx->ptep, new);

        return 0;
}

/*
 * Map [@addr, @addr + @size) to @phys in the hyp stage-1 table @pgt with
 * protection @prot. @phys is rounded down to a page boundary.
 *
 * Returns the error from hyp_set_prot_attr() when @prot is rejected, in
 * which case nothing is walked. Otherwise returns the result of the walk,
 * after issuing dsb(ishst) and isb().
 */
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                        enum kvm_pgtable_prot prot)
{
        struct hyp_map_data map_data = {
                .phys = ALIGN_DOWN(phys, PAGE_SIZE),
        };
        struct kvm_pgtable_walker walker = {
                .cb = hyp_map_walker,
                .flags = KVM_PGTABLE_WALK_LEAF,
                .arg = &map_data,
        };
        int ret = hyp_set_prot_attr(prot, &map_data.attr);

        if (ret)
                return ret;

        ret = kvm_pgtable_walk(pgt, addr, size, &walker);

        /* The barriers are issued whether or not the walk succeeded */
        dsb(ishst);
        isb();

        return ret;
}

/*
 * Walker callback for kvm_pgtable_hyp_unmap(), registered for LEAF and
 * TABLE_POST visits. ctx->arg points at a running count of unmapped bytes.
 *
 * Leaf entries are cleared and their granule added to the count. Table
 * entries are cleared, and the table page released, only once page_count()
 * of the table page is 1.
 *
 * Returns 0 on success or when a table is left in place, and -EINVAL for an
 * invalid entry or a leaf that extends past the end of the walk range.
 */
static int hyp_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
                            enum kvm_pgtable_walk_flags visit)
{
        kvm_pte_t *childp = NULL;
        u64 granule = kvm_granule_size(ctx->level);
        u64 *unmapped = ctx->arg;
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        if (!kvm_pte_valid(ctx->old))
                return -EINVAL;

        if (kvm_pte_table(ctx->old, ctx->level)) {
                childp = kvm_pte_follow(ctx->old, mm_ops);

                /* NOTE(review): presumably a count above 1 means the table still holds entries - confirm */
                if (mm_ops->page_count(childp) != 1)
                        return 0;

                kvm_clear_pte(ctx->ptep);
                dsb(ishst);
                __tlbi_level(vae2is, __TLBI_VADDR(ctx->addr, 0), TLBI_TTL_UNKNOWN);
        } else {
                /* Refuse to unmap a leaf that is only partly covered by the range. */
                if (ctx->end - ctx->addr < granule)
                        return -EINVAL;

                kvm_clear_pte(ctx->ptep);
                dsb(ishst);
                __tlbi_level(vale2is, __TLBI_VADDR(ctx->addr, 0), ctx->level);
                *unmapped += granule;
        }

        dsb(ish);
        isb();
        /* Drop the reference the cleared entry held on its containing table page. */
        mm_ops->put_page(ctx->ptep);

        /* A table entry was removed: also release the table page it pointed to. */
        if (childp)
                mm_ops->put_page(childp);

        return 0;
}

/*
 * Unmap [@addr, @addr + @size) from the hyp stage-1 table @pgt.
 *
 * Returns the number of bytes unmapped, as accumulated by
 * hyp_unmap_walker(). The walk's own return code is not reported. When the
 * mm_ops provide no page_count() hook, nothing is walked and 0 is returned.
 */
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        u64 unmapped = 0;
        struct kvm_pgtable_walker walker = {
                .cb = hyp_unmap_walker,
                .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
                .arg = &unmapped,
        };

        if (pgt->mm_ops->page_count)
                kvm_pgtable_walk(pgt, addr, size, &walker);

        return unmapped;
}

/*
 * Initialise @pgt as a hyp stage-1 page table covering @va_bits of virtual
 * address space, using @mm_ops for memory management.
 *
 * Returns -EINVAL when the start level derived from @va_bits is out of
 * range, -ENOMEM when the PGD page cannot be allocated, and 0 on success.
 */
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
                         struct kvm_pgtable_mm_ops *mm_ops)
{
        s8 start_level = KVM_PGTABLE_LAST_LEVEL + 1 -
                         ARM64_HW_PGTABLE_LEVELS(va_bits);
        kvm_pteref_t pgd;

        if (start_level < KVM_PGTABLE_FIRST_LEVEL ||
            start_level > KVM_PGTABLE_LAST_LEVEL)
                return -EINVAL;

        /* pgt->pgd is written even when the allocation fails */
        pgd = (kvm_pteref_t)mm_ops->zalloc_page(NULL);
        pgt->pgd = pgd;
        if (!pgd)
                return -ENOMEM;

        pgt->mm_ops = mm_ops;
        pgt->ia_bits = va_bits;
        pgt->start_level = start_level;
        pgt->force_pte_cb = NULL;
        pgt->mmu = NULL;

        return 0;
}

/*
 * Walker callback for kvm_pgtable_hyp_destroy(). For every valid entry it
 * drops the reference held on the containing table page, and for a table
 * entry it also releases the table page the entry points to. Invalid
 * entries are skipped. Always returns 0.
 */
static int hyp_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
                           enum kvm_pgtable_walk_flags visit)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        if (kvm_pte_valid(ctx->old)) {
                mm_ops->put_page(ctx->ptep);

                if (kvm_pte_table(ctx->old, ctx->level))
                        mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
        }

        return 0;
}

/*
 * Tear down the hyp stage-1 page table @pgt: walk the whole input address
 * range with hyp_free_walker(), warn if that walk fails, then release the
 * PGD page and clear pgt->pgd.
 */
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt)
{
        struct kvm_pgtable_walker walker = {
                .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
                .cb = hyp_free_walker,
        };
        int err = kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker);

        WARN_ON(err);
        pgt->mm_ops->put_page(kvm_dereference_pteref(&walker, pgt->pgd));
        pgt->pgd = NULL;
}

/* Walker argument shared by the stage-2 map and set-owner walks. */
struct stage2_map_data {
        /* Physical address backing the start of the walked range. */
        const u64                        phys;
        /* Leaf attributes for valid mappings installed by the walk. */
        kvm_pte_t                        attr;
        /* Owner encoded into invalid entries when 'annotation' is set. */
        u8                                owner_id;

        /* NOTE(review): anchor and childp are not referenced by the code visible here - confirm whether still used */
        kvm_pte_t                        *anchor;
        kvm_pte_t                        *childp;

        /* Stage-2 MMU handed to the TLB invalidation calls. */
        struct kvm_s2_mmu                *mmu;
        /* Passed to mm_ops->zalloc_page() when a new table page is needed. */
        void                                *memcache;

        /* Force mappings to page granularity */
        bool                                force_pte;

        /* Walk should update owner_id only */
        bool                                annotation;
};

/*
 * Compute the VTCR_EL2 value for a stage-2 configuration with a
 * @phys_shift-bit IPA space, based on the ID register values @mmfr0 and
 * @mmfr1.
 */
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift)
{
        u64 vtcr = VTCR_EL2_FLAGS;
        s8 lvls = stage2_pgtable_levels(phys_shift);

        vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT;
        vtcr |= VTCR_EL2_T0SZ(phys_shift);

        /*
         * Never go below a 2 level page table, so that host PMD huge pages
         * do not have to be split at stage2.
         */
        if (lvls < 2)
                lvls = 2;

        /*
         * With LPA2 enabled, the HW supports an extra level of translation
         * (5 in total) when using 4K pages, and VTCR_EL2.SL2 is introduced
         * in addition to SL0 to encode that extra start level. Because
         * concatenated pages are always used for the first level lookup,
         * the extra level is never needed and SL2 is left untouched.
         */
        vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);

#ifdef CONFIG_ARM64_HW_AFDBM
        /*
         * Turn on Hardware Access Flag management unconditionally on all
         * CPUs. On systems with asymmetric support this lets KVM use the
         * hardware feature on the cores that implement it.
         *
         * The architecture requires VTCR_EL2.HA to be RES0 (and therefore
         * ignored by hardware) on implementations that do not advertise the
         * feature, so setting HA everywhere is safe - unless the design has
         * unadvertised support for HAFDBS. Here be dragons.
         */
        if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
                vtcr |= VTCR_EL2_HA;
#endif /* CONFIG_ARM64_HW_AFDBM */

        if (kvm_lpa2_is_enabled())
                vtcr |= VTCR_EL2_DS;

        /* Set the vmid bits */
        if (get_vmid_bits(mmfr1) == 16)
                vtcr |= VTCR_EL2_VS_16BIT;
        else
                vtcr |= VTCR_EL2_VS_8BIT;

        return vtcr;
}

/*
 * True when the CPUs have the ARM64_HAS_STAGE2_FWB capability and @pgt has
 * not opted out through KVM_PGTABLE_S2_NOFWB.
 */
static bool stage2_has_fwb(struct kvm_pgtable *pgt)
{
        return cpus_have_final_cap(ARM64_HAS_STAGE2_FWB) &&
               !(pgt->flags & KVM_PGTABLE_S2_NOFWB);
}

/*
 * Invalidate the TLB entries of @mmu for [@addr, @addr + @size).
 *
 * Without range-based TLBI support the whole VMID is flushed instead.
 * Otherwise the range is invalidated in chunks of at most
 * MAX_TLBI_RANGE_PAGES pages per hypercall.
 */
void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
                                phys_addr_t addr, size_t size)
{
        unsigned long pages, inval_pages;

        if (!system_supports_tlb_range()) {
                kvm_call_hyp(__kvm_tlb_flush_vmid, mmu);
                return;
        }

        for (pages = size >> PAGE_SHIFT; pages > 0; pages -= inval_pages) {
                inval_pages = min(pages, MAX_TLBI_RANGE_PAGES);
                kvm_call_hyp(__kvm_tlb_flush_vmid_range, mmu, addr, inval_pages);

                addr += inval_pages << PAGE_SHIFT;
        }
}

/*
 * Stage-2 memory attribute encoding for @attr, as produced by
 * PAGE_S2_MEMATTR() given whether FWB is in use for @pgt.
 */
#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))

/*
 * Convert @prot into stage-2 leaf attributes for @pgt and store them in
 * @ptep.
 *
 * Returns -EINVAL, leaving @ptep untouched, when @prot asks for both DEVICE
 * and NORMAL_NC, or for an executable DEVICE or NORMAL_NC mapping. Returns
 * 0 on success.
 */
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
                                kvm_pte_t *ptep)
{
        bool device = prot & KVM_PGTABLE_PROT_DEVICE;
        bool normal_nc = prot & KVM_PGTABLE_PROT_NORMAL_NC;
        u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
        kvm_pte_t attr;

        if (device && normal_nc)
                return -EINVAL;

        if (device) {
                if (prot & KVM_PGTABLE_PROT_X)
                        return -EINVAL;
                attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE);
        } else if (normal_nc) {
                if (prot & KVM_PGTABLE_PROT_X)
                        return -EINVAL;
                attr = KVM_S2_MEMATTR(pgt, NORMAL_NC);
        } else {
                attr = KVM_S2_MEMATTR(pgt, NORMAL);
        }

        if (prot & KVM_PGTABLE_PROT_R)
                attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;

        if (prot & KVM_PGTABLE_PROT_W)
                attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

        if (!(prot & KVM_PGTABLE_PROT_X))
                attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

        /* The shareability field is only filled in when LPA2 is not enabled. */
        if (!kvm_lpa2_is_enabled())
                attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh);

        attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
        attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW;

        *ptep = attr;
        return 0;
}

/*
 * Decode the protection encoded in a stage-2 @pte. The software bits are
 * always returned; the R, W and X bits are only decoded for a valid entry.
 */
enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
{
        enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW;

        if (!kvm_pte_valid(pte))
                return prot;

        /* Executable unless XN is set */
        if ((pte & KVM_PTE_LEAF_ATTR_HI_S2_XN) == 0)
                prot |= KVM_PGTABLE_PROT_X;
        if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
                prot |= KVM_PGTABLE_PROT_W;
        if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R)
                prot |= KVM_PGTABLE_PROT_R;

        return prot;
}

/*
 * Tell whether replacing @old with @new is more than a permission change.
 * If either entry is invalid an update is always needed. Otherwise one is
 * needed only when the entries differ outside KVM_PTE_LEAF_ATTR_S2_PERMS.
 */
static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new)
{
        if (kvm_pte_valid(old) && kvm_pte_valid(new))
                return ((old ^ new) & ~KVM_PTE_LEAF_ATTR_S2_PERMS) != 0;

        return true;
}

/*
 * Tell whether @pte contributes to the refcount of its table page.
 *
 * The refcount covers valid entries, and also invalid entries that record
 * ownership of a page by an entity other than the page-table owner (whose
 * id is 0). Any non-zero entry is therefore counted.
 */
static bool stage2_pte_is_counted(kvm_pte_t pte)
{
        return pte != 0;
}

/* True when @pte is an invalid entry with KVM_INVALID_PTE_LOCKED set. */
static bool stage2_pte_is_locked(kvm_pte_t pte)
{
        if (kvm_pte_valid(pte))
                return false;

        return (pte & KVM_INVALID_PTE_LOCKED) != 0;
}

/*
 * Store @new into the entry being visited.
 *
 * For a shared walk the store is a cmpxchg() against ctx->old and succeeds
 * only if the entry still held that value. Otherwise the entry is simply
 * overwritten, which always succeeds.
 *
 * Returns true when the entry now holds @new.
 */
static bool stage2_try_set_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
        if (kvm_pgtable_walk_shared(ctx))
                return cmpxchg(ctx->ptep, ctx->old, new) == ctx->old;

        WRITE_ONCE(*ctx->ptep, new);
        return true;
}

/**
 * stage2_try_break_pte() - Invalidates a pte according to the
 *                            'break-before-make' requirements of the
 *                            architecture.
 *
 * @ctx: context of the visited pte.
 * @mmu: stage-2 mmu
 *
 * Returns: true if the pte was successfully broken.
 *
 * If the removed pte was valid, performs the necessary serialization and TLB
 * invalidation for the old value. For counted ptes, drops the reference count
 * on the containing table page.
 *
 * On success the pte is left holding KVM_INVALID_PTE_LOCKED. Returns false
 * without touching the pte if it was already locked, or if the pte changed
 * under a shared walk.
 */
static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx,
                                 struct kvm_s2_mmu *mmu)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        if (stage2_pte_is_locked(ctx->old)) {
                /*
                 * Should never occur if this walker has exclusive access to the
                 * page tables.
                 */
                WARN_ON(!kvm_pgtable_walk_shared(ctx));
                return false;
        }

        /* Replace the old value with the locked marker. */
        if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED))
                return false;

        if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) {
                /*
                 * Perform the appropriate TLB invalidation based on the
                 * evicted pte value (if any).
                 */
                if (kvm_pte_table(ctx->old, ctx->level)) {
                        /* A table covered a whole granule of this level: flush all of it. */
                        u64 size = kvm_granule_size(ctx->level);
                        u64 addr = ALIGN_DOWN(ctx->addr, size);

                        kvm_tlb_flush_vmid_range(mmu, addr, size);
                } else if (kvm_pte_valid(ctx->old)) {
                        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu,
                                     ctx->addr, ctx->level);
                }
        }

        if (stage2_pte_is_counted(ctx->old))
                mm_ops->put_page(ctx->ptep);

        return true;
}

/*
 * Install @new into the entry being visited. The entry is expected to hold
 * the locked marker left behind by stage2_try_break_pte(); a warning is
 * raised otherwise. If @new is a counted value, a reference is taken on the
 * containing table page before the entry is published.
 */
static void stage2_make_pte(const struct kvm_pgtable_visit_ctx *ctx, kvm_pte_t new)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        WARN_ON(!stage2_pte_is_locked(*ctx->ptep));

        if (stage2_pte_is_counted(new))
                mm_ops->get_page(ctx->ptep);

        /* Publish the new entry with release semantics. */
        smp_store_release(ctx->ptep, new);
}

static bool stage2_unmap_defer_tlb_flush(struct kvm_pgtable *pgt)
{
        /*
         * If FEAT_TLBIRANGE is implemented, defer the individual
         * TLB invalidations until the entire walk is finished, and
         * then use the range-based TLBI instructions to do the
         * invalidations. Condition deferred TLB invalidation on the
         * system supporting FWB as the optimization is entirely
         * pointless when the unmap walker needs to perform CMOs.
         */
        return system_supports_tlb_range() && stage2_has_fwb(pgt);
}

/*
 * Remove the entry being visited during a stage-2 unmap walk and drop the
 * reference it held on its containing table page. ctx->arg is the page
 * table being walked.
 */
static void stage2_unmap_put_pte(const struct kvm_pgtable_visit_ctx *ctx,
                                struct kvm_s2_mmu *mmu,
                                struct kvm_pgtable_mm_ops *mm_ops)
{
        struct kvm_pgtable *pgt = ctx->arg;

        /*
         * Clear the existing PTE, and perform break-before-make if it was
         * valid. Depending on the system support, defer the TLB maintenance
         * for the same until the entire unmap walk is completed.
         */
        if (kvm_pte_valid(ctx->old)) {
                kvm_clear_pte(ctx->ptep);

                /* Table entries are always invalidated here; only leaf invalidation can be deferred. */
                if (kvm_pte_table(ctx->old, ctx->level)) {
                        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
                                     TLBI_TTL_UNKNOWN);
                } else if (!stage2_unmap_defer_tlb_flush(pgt)) {
                        kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr,
                                     ctx->level);
                }
        }

        mm_ops->put_page(ctx->ptep);
}

/*
 * True when @pte is valid and its stage-2 memory attribute field equals the
 * NORMAL encoding for @pgt.
 */
static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte)
{
        if (!kvm_pte_valid(pte))
                return false;

        return (pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR) == KVM_S2_MEMATTR(pgt, NORMAL);
}

/* True when @pte is valid and does not have the stage-2 XN bit set. */
static bool stage2_pte_executable(kvm_pte_t pte)
{
        if (!kvm_pte_valid(pte))
                return false;

        return (pte & KVM_PTE_LEAF_ATTR_HI_S2_XN) == 0;
}

/*
 * Physical address that corresponds to the walk's current position: the
 * base PA of the mapping plus the distance already walked from the start.
 */
static u64 stage2_map_walker_phys_addr(const struct kvm_pgtable_visit_ctx *ctx,
                                       const struct stage2_map_data *data)
{
        u64 walked = ctx->addr - ctx->start;

        return data->phys + walked;
}

/*
 * Decide whether a leaf entry may be installed at the current level.
 *
 * With force_pte set, only the last level qualifies. Annotation walks are
 * otherwise always allowed. Regular mappings must also satisfy
 * kvm_block_mapping_supported() for the current physical address.
 */
static bool stage2_leaf_mapping_allowed(const struct kvm_pgtable_visit_ctx *ctx,
                                        struct stage2_map_data *data)
{
        if (data->force_pte && ctx->level < KVM_PGTABLE_LAST_LEVEL)
                return false;

        if (data->annotation)
                return true;

        return kvm_block_mapping_supported(ctx,
                                           stage2_map_walker_phys_addr(ctx, data));
}

/*
 * Try to install the leaf entry (a valid mapping, or an owner annotation
 * when data->annotation is set) for the walk's current position.
 *
 * Returns 0 when the entry was written, -E2BIG when a leaf is not allowed
 * at this level, and -EAGAIN when there is nothing to update or the entry
 * could not be broken.
 */
static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx,
                                      struct stage2_map_data *data)
{
        kvm_pte_t new;
        u64 phys = stage2_map_walker_phys_addr(ctx, data);
        u64 granule = kvm_granule_size(ctx->level);
        struct kvm_pgtable *pgt = data->mmu->pgt;
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        if (!stage2_leaf_mapping_allowed(ctx, data))
                return -E2BIG;

        if (!data->annotation)
                new = kvm_init_valid_leaf_pte(phys, data->attr, ctx->level);
        else
                new = kvm_init_invalid_leaf_owner(data->owner_id);

        /*
         * Skip updating the PTE if we are trying to recreate the exact
         * same mapping or only change the access permissions. Instead,
         * the vCPU will exit one more time from guest if still needed
         * and then go through the path of relaxing permissions.
         */
        if (!stage2_pte_needs_update(ctx->old, new))
                return -EAGAIN;

        /* If we're only changing software bits, then store them and go! */
        if (!kvm_pgtable_walk_shared(ctx) &&
            !((ctx->old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) {
                bool old_is_counted = stage2_pte_is_counted(ctx->old);

                /* Keep the table page refcount in step if the counted state flips. */
                if (old_is_counted != stage2_pte_is_counted(new)) {
                        if (old_is_counted)
                                mm_ops->put_page(ctx->ptep);
                        else
                                mm_ops->get_page(ctx->ptep);
                }
                WARN_ON_ONCE(!stage2_try_set_pte(ctx, new));
                return 0;
        }

        if (!stage2_try_break_pte(ctx, data->mmu))
                return -EAGAIN;

        /* Perform CMOs before installation of the guest stage-2 PTE */
        if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc &&
            stage2_pte_cacheable(pgt, new))
                mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops),
                                               granule);

        if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou &&
            stage2_pte_executable(new))
                mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule);

        stage2_make_pte(ctx, new);

        return 0;
}

/*
 * TABLE_PRE handler for the stage-2 map walk. If a leaf may be installed in
 * place of the table entry being visited, install it and hand the detached
 * table to mm_ops->free_unlinked_table().
 *
 * Returns 0 when the table is left alone or was replaced, otherwise the
 * error from stage2_map_walker_try_leaf().
 */
static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx,
                                     struct stage2_map_data *data)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
        kvm_pte_t *detached = kvm_pte_follow(ctx->old, mm_ops);
        int ret;

        if (!stage2_leaf_mapping_allowed(ctx, data))
                return 0;

        ret = stage2_map_walker_try_leaf(ctx, data);
        if (!ret)
                mm_ops->free_unlinked_table(detached, ctx->level);

        return ret;
}

/*
 * LEAF handler for the stage-2 map walk. Installs the leaf if possible;
 * when the leaf does not fit at this level (-E2BIG), replaces the entry
 * with a freshly allocated table for the walk to descend into.
 *
 * Returns 0 on success, -EINVAL if a leaf did not fit at the last level,
 * -ENOMEM if there is no memcache or the allocation fails, -EAGAIN if the
 * entry could not be broken, or any other code returned by
 * stage2_map_walker_try_leaf().
 */
static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx,
                                struct stage2_map_data *data)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
        kvm_pte_t *childp, new;
        int ret;

        ret = stage2_map_walker_try_leaf(ctx, data);
        if (ret != -E2BIG)
                return ret;

        if (WARN_ON(ctx->level == KVM_PGTABLE_LAST_LEVEL))
                return -EINVAL;

        if (!data->memcache)
                return -ENOMEM;

        childp = mm_ops->zalloc_page(data->memcache);
        if (!childp)
                return -ENOMEM;

        /* Give the new page back if the entry cannot be broken. */
        if (!stage2_try_break_pte(ctx, data->mmu)) {
                mm_ops->put_page(childp);
                return -EAGAIN;
        }

        /*
         * If we've run into an existing block mapping then replace it with
         * a table. Accesses beyond 'end' that fall within the new table
         * will be mapped lazily.
         */
        new = kvm_init_table_pte(childp, mm_ops);
        stage2_make_pte(ctx, new);

        return 0;
}

/*
 * Walker callback for the stage-2 map walk.
 *
 * On the way down, the TABLE_PRE visit looks at each table entry to see
 * whether it could be replaced by a block entry for this mapping. When it
 * can, the entry is replaced and the detached table is torn down through
 * kvm_pgtable_mm_ops::free_unlinked_table().
 *
 * Otherwise the LEAF visit performs the mapping at the existing leaves.
 *
 * Any other visit type is rejected with -EINVAL.
 */
static int stage2_map_walker(const struct kvm_pgtable_visit_ctx *ctx,
                             enum kvm_pgtable_walk_flags visit)
{
        struct stage2_map_data *data = ctx->arg;

        if (visit == KVM_PGTABLE_WALK_TABLE_PRE)
                return stage2_map_walk_table_pre(ctx, data);

        if (visit == KVM_PGTABLE_WALK_LEAF)
                return stage2_map_walk_leaf(ctx, data);

        return -EINVAL;
}

/*
 * Install a stage-2 mapping of [addr, addr + size) onto @phys with
 * protection @prot.
 *
 * @phys is rounded down to a page boundary. @mc is the memory cache used
 * for any table page that has to be allocated, and @flags are extra walk
 * flags OR-ed on top of TABLE_PRE | LEAF. PTE-granularity mappings are
 * forced when the page-table has a force_pte_cb and it returns true for
 * this range.
 *
 * Returns 0 on success, -EINVAL for a non-identity request on an IDMAP
 * page-table, or the error reported by stage2_set_prot_attr() or the walk.
 * A dsb(ishst) is issued once the walk has run, whatever its result.
 */
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
                           void *mc, enum kvm_pgtable_walk_flags flags)
{
        struct stage2_map_data map = {
                .phys                = ALIGN_DOWN(phys, PAGE_SIZE),
                .mmu                = pgt->mmu,
                .memcache        = mc,
                .force_pte        = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot),
        };
        struct kvm_pgtable_walker walker = {
                .cb                = stage2_map_walker,
                .flags                = flags |
                                  KVM_PGTABLE_WALK_TABLE_PRE |
                                  KVM_PGTABLE_WALK_LEAF,
                .arg                = &map,
        };
        int err;

        if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys)))
                return -EINVAL;

        err = stage2_set_prot_attr(pgt, prot, &map.attr);
        if (!err) {
                err = kvm_pgtable_walk(pgt, addr, size, &walker);
                dsb(ishst);
        }

        return err;
}

/*
 * Annotate [addr, addr + size) in the stage-2 page-table with @owner_id.
 *
 * The range is walked with the map walker in annotation mode and with
 * PTE-granularity forced. @mc supplies pages for any table that has to
 * be created along the way.
 *
 * Returns -EINVAL if @owner_id exceeds KVM_MAX_OWNER_ID, otherwise the
 * result of the walk.
 */
int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
                                 void *mc, u8 owner_id)
{
        struct stage2_map_data annotation = {
                .mmu                = pgt->mmu,
                .memcache        = mc,
                .owner_id        = owner_id,
                .force_pte        = true,
                .annotation        = true,
        };
        struct kvm_pgtable_walker walker = {
                .cb                = stage2_map_walker,
                .flags                = KVM_PGTABLE_WALK_TABLE_PRE |
                                  KVM_PGTABLE_WALK_LEAF,
                .arg                = &annotation,
        };

        if (owner_id > KVM_MAX_OWNER_ID)
                return -EINVAL;

        return kvm_pgtable_walk(pgt, addr, size, &walker);
}

/*
 * Walker callback for kvm_pgtable_stage2_unmap(); ctx->arg is the
 * struct kvm_pgtable being unmapped. It is registered for LEAF and
 * TABLE_POST visits and always returns 0.
 */
static int stage2_unmap_walker(const struct kvm_pgtable_visit_ctx *ctx,
                               enum kvm_pgtable_walk_flags visit)
{
        struct kvm_pgtable *pgt = ctx->arg;
        struct kvm_s2_mmu *mmu = pgt->mmu;
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
        kvm_pte_t *childp = NULL;
        bool need_flush = false;

        /*
         * Invalid entries need no unmapping, but a counted one is cleared
         * and the reference on the page holding it is dropped.
         */
        if (!kvm_pte_valid(ctx->old)) {
                if (stage2_pte_is_counted(ctx->old)) {
                        kvm_clear_pte(ctx->ptep);
                        mm_ops->put_page(ctx->ptep);
                }
                return 0;
        }

        if (kvm_pte_table(ctx->old, ctx->level)) {
                childp = kvm_pte_follow(ctx->old, mm_ops);

                /*
                 * Only tear down a table whose page count is exactly 1
                 * (presumably meaning nothing else references it any more).
                 */
                if (mm_ops->page_count(childp) != 1)
                        return 0;
        } else if (stage2_pte_cacheable(pgt, ctx->old)) {
                /* Cacheable leaf: cache maintenance is needed unless FWB is in use. */
                need_flush = !stage2_has_fwb(pgt);
        }

        /*
         * This is similar to the map() path in that we unmap the entire
         * block entry and rely on the remaining portions being faulted
         * back lazily.
         */
        stage2_unmap_put_pte(ctx, mmu, mm_ops);

        /* Clean+invalidate the memory the old leaf pointed at, after the unmap. */
        if (need_flush && mm_ops->dcache_clean_inval_poc)
                mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
                                               kvm_granule_size(ctx->level));

        /* Finally drop the reference on the detached child table, if any. */
        if (childp)
                mm_ops->put_page(childp);

        return 0;
}

/*
 * Remove the stage-2 mappings covering [addr, addr + size).
 *
 * The range is walked with LEAF and TABLE_POST visits. If TLB
 * invalidation was deferred for this page-table, a single ranged
 * invalidation over the whole span is issued after the walk, regardless
 * of the walk's result.
 *
 * Returns the result of the walk.
 */
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        struct kvm_pgtable_walker walker = {
                .cb        = stage2_unmap_walker,
                .arg        = pgt,
                .flags        = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST,
        };
        int err = kvm_pgtable_walk(pgt, addr, size, &walker);

        if (stage2_unmap_defer_tlb_flush(pgt)) {
                /* Perform the deferred TLB invalidations */
                kvm_tlb_flush_vmid_range(pgt->mmu, addr, size);
        }

        return err;
}

/*
 * Argument block handed to stage2_attr_walker() through the walker's
 * ->arg pointer.
 */
struct stage2_attr_data {
        /* in: attribute bits the walker ORs into the visited PTE */
        kvm_pte_t                        attr_set;
        /* in: attribute bits the walker clears from the visited PTE */
        kvm_pte_t                        attr_clr;
        /* out: value of the visited PTE before any modification */
        kvm_pte_t                        pte;
        /* out: page-table level of the visited PTE */
        s8                                level;
};

/*
 * LEAF walker callback that rewrites the attribute bits of a valid PTE.
 *
 * ctx->arg is a struct stage2_attr_data: its attr_clr bits are cleared
 * from the PTE and its attr_set bits are set. The pre-update PTE and its
 * level are recorded in the same structure.
 *
 * Returns -EAGAIN if the PTE is invalid or could not be updated, and 0
 * otherwise (including when no bit needed changing).
 */
static int stage2_attr_walker(const struct kvm_pgtable_visit_ctx *ctx,
                              enum kvm_pgtable_walk_flags visit)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
        struct stage2_attr_data *attrs = ctx->arg;
        kvm_pte_t updated;

        if (!kvm_pte_valid(ctx->old))
                return -EAGAIN;

        attrs->level = ctx->level;
        attrs->pte = ctx->old;
        updated = (ctx->old & ~attrs->attr_clr) | attrs->attr_set;

        /*
         * We may race with the CPU trying to set the access flag here,
         * but worst-case the access flag update gets lost and will be
         * set on the next access instead.
         */
        if (updated == attrs->pte)
                return 0;

        /*
         * Executable permission is about to be granted: invalidate the
         * instruction cache before the guest stage-2 PTE is updated.
         */
        if (mm_ops->icache_inval_pou &&
            stage2_pte_executable(updated) && !stage2_pte_executable(ctx->old))
                mm_ops->icache_inval_pou(kvm_pte_follow(updated, mm_ops),
                                          kvm_granule_size(ctx->level));

        return stage2_try_set_pte(ctx, updated) ? 0 : -EAGAIN;
}

/*
 * Set and clear leaf attribute bits over [addr, addr + size).
 *
 * @attr_set and @attr_clr are both restricted to the leaf attribute
 * fields (KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI) before use.
 * @flags are extra walk flags OR-ed on top of KVM_PGTABLE_WALK_LEAF.
 *
 * On success, and only then, *@orig_pte and *@level (each optional)
 * receive the pre-update PTE value and level recorded by the walker.
 *
 * Returns 0 on success or the error reported by the walk.
 */
static int stage2_update_leaf_attrs(struct kvm_pgtable *pgt, u64 addr,
                                    u64 size, kvm_pte_t attr_set,
                                    kvm_pte_t attr_clr, kvm_pte_t *orig_pte,
                                    s8 *level, enum kvm_pgtable_walk_flags flags)
{
        const kvm_pte_t leaf_attrs = KVM_PTE_LEAF_ATTR_LO | KVM_PTE_LEAF_ATTR_HI;
        struct stage2_attr_data result = {
                .attr_set        = attr_set & leaf_attrs,
                .attr_clr        = attr_clr & leaf_attrs,
        };
        struct kvm_pgtable_walker walker = {
                .cb                = stage2_attr_walker,
                .arg                = &result,
                .flags                = flags | KVM_PGTABLE_WALK_LEAF,
        };
        int err;

        err = kvm_pgtable_walk(pgt, addr, size, &walker);
        if (err)
                return err;

        if (orig_pte)
                *orig_pte = result.pte;
        if (level)
                *level = result.level;

        return 0;
}

/*
 * Write-protect [addr, addr + size) by clearing the stage-2 write
 * permission bit of every leaf in the range. No attribute is set and no
 * extra walk flag is used.
 *
 * Returns the result of stage2_update_leaf_attrs().
 */
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        const kvm_pte_t no_set = 0;
        const kvm_pte_t clr_write = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;

        return stage2_update_leaf_attrs(pgt, addr, size, no_set, clr_write,
                                        NULL, NULL, 0);
}

/*
 * Set the access flag on the leaf mapping @addr.
 *
 * The update is requested over a one-byte range starting at @addr, with
 * @flags passed through as extra walk flags. A dsb(ishst) follows only
 * when the update succeeded; failures are otherwise ignored.
 */
void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
                                enum kvm_pgtable_walk_flags flags)
{
        if (stage2_update_leaf_attrs(pgt, addr, 1, KVM_PTE_LEAF_ATTR_LO_S2_AF, 0,
                                     NULL, NULL, flags))
                return;

        dsb(ishst);
}

/*
 * Argument block handed to stage2_age_walker() through the walker's
 * ->arg pointer.
 */
struct stage2_age_data {
        /* in: when true, the walker also clears the access flag it finds set */
        bool        mkold;
        /* out: set to true once a valid PTE with the access flag set is seen */
        bool        young;
};

/*
 * LEAF walker callback that tests, and optionally clears, the access flag.
 *
 * ctx->arg is a struct stage2_age_data. Invalid PTEs and PTEs whose
 * access flag is already clear are skipped. Otherwise ->young is set and,
 * if ->mkold is requested, the PTE is rewritten with the flag cleared.
 *
 * Returns 0, or -EAGAIN if the PTE could not be rewritten.
 */
static int stage2_age_walker(const struct kvm_pgtable_visit_ctx *ctx,
                             enum kvm_pgtable_walk_flags visit)
{
        struct stage2_age_data *age = ctx->arg;
        kvm_pte_t aged;

        if (!kvm_pte_valid(ctx->old))
                return 0;

        aged = ctx->old & ~KVM_PTE_LEAF_ATTR_LO_S2_AF;
        if (aged == ctx->old)
                return 0;

        age->young = true;

        if (!age->mkold)
                return 0;

        /*
         * stage2_age_walker() is always called while holding the MMU lock for
         * write, so this will always succeed. Nonetheless, this deliberately
         * follows the race detection pattern of the other stage-2 walkers in
         * case the locking mechanics of the MMU notifiers is ever changed.
         *
         * No TLB invalidation is issued here: that is left to the core
         * code, see the '->clear_flush_young()' callback on the KVM mmu
         * notifier.
         */
        return stage2_try_set_pte(ctx, aged) ? 0 : -EAGAIN;
}

/*
 * Report whether any valid leaf in [addr, addr + size) has its access
 * flag set, clearing the flag on the way when @mkold is true.
 *
 * A failing walk triggers a WARN but does not change the return value.
 *
 * Returns true if at least one such leaf was seen by the walker.
 */
bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
                                         u64 size, bool mkold)
{
        struct stage2_age_data age = {
                .mkold                = mkold,
        };
        struct kvm_pgtable_walker walker = {
                .cb                = stage2_age_walker,
                .arg                = &age,
                .flags                = KVM_PGTABLE_WALK_LEAF,
        };

        WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));

        return age.young;
}

/*
 * Grant additional permissions on the leaf mapping @addr.
 *
 * R and W in @prot translate into the stage-2 read/write permission bits
 * being set; X translates into the execute-never bit being cleared.
 * @flags are passed through as extra walk flags.
 *
 * When the attribute update returns 0 or -EAGAIN, a TLB invalidation for
 * @addr is requested through __kvm_tlb_flush_vmid_ipa_nsh.
 *
 * Returns -EINVAL if @prot has any bit of KVM_PTE_LEAF_ATTR_HI_SW set,
 * otherwise the result of the attribute update.
 */
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
                                   enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
        kvm_pte_t set = 0, clr = 0;
        s8 level;
        int err;

        if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
                return -EINVAL;

        if (prot & KVM_PGTABLE_PROT_R)
                set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
        if (prot & KVM_PGTABLE_PROT_W)
                set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
        if (prot & KVM_PGTABLE_PROT_X)
                clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;

        err = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);

        /*
         * NOTE(review): stage2_update_leaf_attrs() only writes 'level' when
         * it returns 0, so on the -EAGAIN path below 'level' is passed on
         * without ever having been assigned. Confirm whether an explicit
         * initial value is wanted here.
         */
        switch (err) {
        case 0:
        case -EAGAIN:
                kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level);
                break;
        default:
                break;
        }

        return err;
}

/*
 * LEAF walker callback for kvm_pgtable_stage2_flush(); ctx->arg is the
 * struct kvm_pgtable being flushed.
 *
 * For a cacheable PTE, and provided the mm_ops supply
 * dcache_clean_inval_poc(), that hook is called on the memory the PTE
 * points at, for one granule of the current level. Always returns 0.
 */
static int stage2_flush_walker(const struct kvm_pgtable_visit_ctx *ctx,
                               enum kvm_pgtable_walk_flags visit)
{
        struct kvm_pgtable *pgt = ctx->arg;
        struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;

        if (stage2_pte_cacheable(pgt, ctx->old) &&
            mm_ops->dcache_clean_inval_poc)
                mm_ops->dcache_clean_inval_poc(kvm_pte_follow(ctx->old, mm_ops),
                                               kvm_granule_size(ctx->level));

        return 0;
}

/*
 * Perform data-cache maintenance for the cacheable leaves mapped in
 * [addr, addr + size).
 *
 * Nothing is done, and 0 is returned, when stage2_has_fwb() reports true
 * for this page-table. Otherwise returns the result of the walk.
 */
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
{
        struct kvm_pgtable_walker walker = {
                .cb        = stage2_flush_walker,
                .flags        = KVM_PGTABLE_WALK_LEAF,
                .arg        = pgt,
        };

        if (!stage2_has_fwb(pgt))
                return kvm_pgtable_walk(pgt, addr, size, &walker);

        return 0;
}

/*
 * Build a page-table, not linked into @pgt, that maps
 * kvm_granule_size(@level) bytes of memory starting at @phys with
 * protection @prot.
 *
 * @mc supplies the root page of the new table and any further table page
 * the walk needs; @force_pte is forwarded to the map walker. The walk runs
 * with LEAF visits only and with the SKIP_BBM_TLBI and SKIP_CMO flags.
 *
 * Returns the new table on success, or an ERR_PTR(): -EINVAL if @phys is
 * not aligned to the granule of @level, -ENOMEM if the root page cannot be
 * allocated, or the error from stage2_set_prot_attr() or from the walk (in
 * which case the partially built table is freed first).
 */
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
                                              u64 phys, s8 level,
                                              enum kvm_pgtable_prot prot,
                                              void *mc, bool force_pte)
{
        struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
        struct stage2_map_data map = {
                .phys                = phys,
                .mmu                = pgt->mmu,
                .memcache        = mc,
                .force_pte        = force_pte,
        };
        struct kvm_pgtable_walker walker = {
                .cb                = stage2_map_walker,
                .flags                = KVM_PGTABLE_WALK_LEAF |
                                  KVM_PGTABLE_WALK_SKIP_BBM_TLBI |
                                  KVM_PGTABLE_WALK_SKIP_CMO,
                .arg                = &map,
        };
        /*
         * The input address (.addr) is irrelevant for walking an
         * unlinked table. Construct an ambiguous IA range to map
         * kvm_granule_size(level) worth of memory.
         */
        struct kvm_pgtable_walk_data walk = {
                .walker        = &walker,
                .addr        = 0,
                .end        = kvm_granule_size(level),
        };
        kvm_pte_t *table;
        int err;

        if (!IS_ALIGNED(phys, kvm_granule_size(level)))
                return ERR_PTR(-EINVAL);

        err = stage2_set_prot_attr(pgt, prot, &map.attr);
        if (err)
                return ERR_PTR(err);

        table = mm_ops->zalloc_page(mc);
        if (!table)
                return ERR_PTR(-ENOMEM);

        /* The new page holds entries of the level below @level. */
        err = __kvm_pgtable_walk(&walk, mm_ops, (kvm_pteref_t)table,
                                 level + 1);
        if (!err)
                return table;

        kvm_pgtable_stage2_free_unlinked(mm_ops, table, level);
        return ERR_PTR(err);
}

/*
 * Number of page-table pages needed to replace a block with a tree that
 * is fully populated down to the PTE entries. @level is read as in
 * "level @level entry".
 *
 * Levels 1, 2 and 3 yield PTRS_PER_PTE + 1, 1 and 0 respectively. Any
 * other level yields -EINVAL, with a one-time warning if it also lies
 * outside [KVM_PGTABLE_MIN_BLOCK_LEVEL, KVM_PGTABLE_LAST_LEVEL].
 */
static int stage2_block_get_nr_page_tables(s8 level)
{
        if (level == 1)
                return PTRS_PER_PTE + 1;

        if (level == 2)
                return 1;

        if (level == 3)
                return 0;

        WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL ||
                     level > KVM_PGTABLE_LAST_LEVEL);
        return -EINVAL;
}

/*
 * LEAF walker callback that replaces a valid block mapping with a table
 * mapping the same memory with the same protection.
 *
 * ctx->arg is the struct kvm_mmu_memory_cache embedded as
 * 'split_page_cache' in the owning struct kvm_s2_mmu; it supplies the
 * pages of the replacement table.
 *
 * Returns 0 when there is nothing to split or the split succeeded,
 * -ENOMEM when the cache cannot cover the pages needed, -EAGAIN when the
 * block entry could not be broken, or the error reported while sizing or
 * building the replacement table.
 */
static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx,
                               enum kvm_pgtable_walk_flags visit)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
        struct kvm_mmu_memory_cache *mc = ctx->arg;
        const kvm_pte_t block = ctx->old;
        const s8 level = ctx->level;
        enum kvm_pgtable_prot prot;
        struct kvm_s2_mmu *mmu;
        kvm_pte_t *table;
        bool force_pte;
        int nr_pages;
        u64 phys;

        /*
         * No huge-pages exist at the last level, and only valid block
         * mappings are split.
         */
        if (level == KVM_PGTABLE_LAST_LEVEL || !kvm_pte_valid(block))
                return 0;

        nr_pages = stage2_block_get_nr_page_tables(level);
        if (nr_pages < 0)
                return nr_pages;

        /*
         * With enough cached pages, build a tree mapped down to the PTE
         * granularity. Otherwise don't force PTEs, so create_unlinked()
         * below does not populate the tree up to the PTE level. The
         * consequence is that the call will require a single page of
         * level 2 entries at level 1, or a single page of PTEs at level 2.
         * If we are at level 1, the PTEs will be created recursively.
         */
        force_pte = mc->nobjs >= nr_pages;
        if (!force_pte)
                nr_pages = 1;

        if (mc->nobjs < nr_pages)
                return -ENOMEM;

        mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache);
        phys = kvm_pte_to_phys(block);
        prot = kvm_pgtable_stage2_pte_prot(block);

        table = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys,
                                                   level, prot, mc, force_pte);
        if (IS_ERR(table))
                return PTR_ERR(table);

        if (!stage2_try_break_pte(ctx, mmu)) {
                kvm_pgtable_stage2_free_unlinked(mm_ops, table, level);
                return -EAGAIN;
        }

        /*
         * Note, the contents of the page table are guaranteed to be made
         * visible before the new PTE is assigned because stage2_make_pte()
         * writes the PTE using smp_store_release().
         */
        stage2_make_pte(ctx, kvm_init_table_pte(table, mm_ops));
        return 0;
}

/*
 * Split the block mappings found in [addr, addr + size) into tables,
 * drawing the table pages from @mc.
 *
 * A dsb(ishst) is issued after the walk whatever its outcome.
 *
 * Returns the result of the walk.
 */
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
                             struct kvm_mmu_memory_cache *mc)
{
        struct kvm_pgtable_walker walker = {
                .cb        = stage2_split_walker,
                .flags        = KVM_PGTABLE_WALK_LEAF,
                .arg        = mc,
        };
        int err = kvm_pgtable_walk(pgt, addr, size, &walker);

        dsb(ishst);

        return err;
}

/*
 * Initialise @pgt as a stage-2 page-table for @mmu.
 *
 * The input-address size and the start level are derived from mmu->vtcr,
 * and a zeroed PGD of the matching size is allocated through
 * mm_ops->zalloc_pages_exact(). @mm_ops, @flags and @force_pte_cb are
 * recorded in @pgt for later use.
 *
 * Returns 0 on success or -ENOMEM if the PGD cannot be allocated, in
 * which case only pgt->pgd has been written.
 */
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                              struct kvm_pgtable_mm_ops *mm_ops,
                              enum kvm_pgtable_stage2_flags flags,
                              kvm_pgtable_force_pte_cb_t force_pte_cb)
{
        const u64 vtcr = mmu->vtcr;
        const u32 ia_bits = VTCR_EL2_IPA(vtcr);
        const u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        const s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
        const size_t pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;

        pgt->pgd = (kvm_pteref_t)mm_ops->zalloc_pages_exact(pgd_sz);
        if (!pgt->pgd)
                return -ENOMEM;

        pgt->mmu                = mmu;
        pgt->mm_ops                = mm_ops;
        pgt->flags                = flags;
        pgt->force_pte_cb        = force_pte_cb;
        pgt->ia_bits                = ia_bits;
        pgt->start_level        = start_level;

        /* Ensure zeroed PGD pages are visible to the hardware walker */
        dsb(ishst);

        return 0;
}

/*
 * Size in bytes of the stage-2 PGD implied by @vtcr: the number of PGD
 * pages for the encoded input-address size and start level, multiplied
 * by PAGE_SIZE.
 */
size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
{
        const u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);
        const s8 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0;
        const u32 ia_bits = VTCR_EL2_IPA(vtcr);
        size_t nr_bytes;

        nr_bytes = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;

        return nr_bytes;
}

/*
 * Walker callback used when tearing down a stage-2 tree (LEAF and
 * TABLE_POST visits).
 *
 * For every counted entry, the reference on the page holding the entry
 * is dropped; if the entry is a table, the reference on the table page it
 * points to is dropped as well. Always returns 0.
 */
static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
                              enum kvm_pgtable_walk_flags visit)
{
        struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;

        if (stage2_pte_is_counted(ctx->old)) {
                mm_ops->put_page(ctx->ptep);

                if (kvm_pte_table(ctx->old, ctx->level))
                        mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
        }

        return 0;
}

/*
 * Tear down the stage-2 page-table @pgt.
 *
 * The whole input-address space, [0, BIT(ia_bits)), is walked with the
 * free walker (a failing walk triggers a WARN), then the PGD pages are
 * released through mm_ops->free_pages_exact() and pgt->pgd is reset to
 * NULL.
 */
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
{
        struct kvm_pgtable_walker walker = {
                .cb        = stage2_free_walker,
                .flags        = KVM_PGTABLE_WALK_LEAF |
                          KVM_PGTABLE_WALK_TABLE_POST,
        };
        size_t nr_bytes;

        WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));

        nr_bytes = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
        pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd),
                                      nr_bytes);
        pgt->pgd = NULL;
}

/*
 * Free the page-table @pgtable, which used to sit behind a level @level
 * entry and is no longer part of any stage-2.
 *
 * The table is walked from level @level + 1 with the free walker, then
 * the reference on @pgtable itself is dropped. A WARN fires if the walk
 * fails or if the page count of @pgtable is not exactly 1 at that point.
 */
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
        struct kvm_pgtable_walker walker = {
                .cb        = stage2_free_walker,
                .flags        = KVM_PGTABLE_WALK_LEAF |
                          KVM_PGTABLE_WALK_TABLE_POST,
        };
        /*
         * At this point the IPA really doesn't matter, as the page
         * table being traversed has already been removed from the stage
         * 2. Set an appropriate range to cover the entire page table.
         */
        struct kvm_pgtable_walk_data walk = {
                .walker        = &walker,
                .addr        = 0,
                .end        = kvm_granule_size(level),
        };
        kvm_pteref_t root = (kvm_pteref_t)pgtable;

        WARN_ON(__kvm_pgtable_walk(&walk, mm_ops, root, level + 1));

        WARN_ON(mm_ops->page_count(pgtable) != 1);
        mm_ops->put_page(pgtable);
}































   99 













   49 


















  138 
   11 

































































  184 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * kref.h - library routines for handling generic reference counted objects
 *
 * Copyright (C) 2004 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2004 IBM Corp.
 *
 * based on kobject.h which was:
 * Copyright (C) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (C) 2002-2003 Open Source Development Labs
 */

#ifndef _KREF_H_
#define _KREF_H_

#include <linux/spinlock.h>
#include <linux/refcount.h>

/*
 * Reference-counted object handle: a thin wrapper around a refcount_t
 * that the kref_*() helpers in this header operate on.
 */
struct kref {
        refcount_t refcount;
};

/* Static initializer for a struct kref whose count starts at @n. */
#define KREF_INIT(n)        { .refcount = REFCOUNT_INIT(n), }

/**
 * kref_init - initialize object.
 * @kref: object in question.
 */
static inline void kref_init(struct kref *kref)
{
        refcount_set(&kref->refcount, 1);
}

static inline unsigned int kref_read(const struct kref *kref)
{
        return refcount_read(&kref->refcount);
}

/**
 * kref_get - increment refcount for object.
 * @kref: object.
 */
static inline void kref_get(struct kref *kref)
{
        refcount_inc(&kref->refcount);
}

/**
 * kref_put - Decrement refcount for object
 * @kref: Object
 * @release: Pointer to the function that will clean up the object when the
 *             last reference to the object is released.
 *
 * Drops one reference and, if that was the last one, calls @release on
 * @kref.  The caller may not pass NULL or kfree() as the release function.
 *
 * Return: 1 if this call removed the object, otherwise return 0.  Beware,
 * if this function returns 0, another caller may have removed the object
 * by the time this function returns.  The return value is only certain
 * if you want to see if the object is definitely released.
 */
static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref))
{
        if (!refcount_dec_and_test(&kref->refcount))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_mutex - Decrement refcount for object
 * @kref: Object
 * @release: Pointer to the function that will clean up the object when the
 *             last reference to the object is released.
 * @mutex: Mutex which protects the release function.
 *
 * This variant of kref_put() calls the @release function with the @mutex
 * held.  The @release function will release the mutex.
 *
 * Return: 1 if @release was called, otherwise 0.
 */
static inline int kref_put_mutex(struct kref *kref,
                                 void (*release)(struct kref *kref),
                                 struct mutex *mutex)
{
        if (!refcount_dec_and_mutex_lock(&kref->refcount, mutex))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_put_lock - Decrement refcount for object
 * @kref: Object
 * @release: Pointer to the function that will clean up the object when the
 *             last reference to the object is released.
 * @lock: Spinlock which protects the release function.
 *
 * This variant of kref_put() calls the @release function with the @lock
 * held.  The @release function will release the lock.
 *
 * Return: 1 if @release was called, otherwise 0.
 */
static inline int kref_put_lock(struct kref *kref,
                                void (*release)(struct kref *kref),
                                spinlock_t *lock)
{
        if (!refcount_dec_and_lock(&kref->refcount, lock))
                return 0;

        release(kref);
        return 1;
}

/**
 * kref_get_unless_zero - Increment refcount for object unless it is zero.
 * @kref: object.
 *
 * This helper exists to simplify locking around refcounting for objects
 * that are found through a lookup structure and that are removed from
 * that structure in their destructor.  Operations on such objects
 * otherwise require at least a read lock around lookup + kref_get, and a
 * write lock around kref_put + remove from lookup structure, and RCU
 * implementations become extremely tricky.  With a lookup followed by a
 * kref_get_unless_zero *with return value check*, locking in the kref_put
 * path can be deferred to the actual removal from the lookup structure
 * and RCU lookups become trivial.
 *
 * Return: non-zero if the increment succeeded. Otherwise return 0.
 */
static inline int __must_check kref_get_unless_zero(struct kref *kref)
{
        refcount_t *count = &kref->refcount;

        return refcount_inc_not_zero(count);
}
#endif /* _KREF_H_ */































































































































































































































































































































































































































































































































  188 
  188 




























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Landlock LSM - Ruleset management
 *
 * Copyright © 2016-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2018-2020 ANSSI
 */

#include <linux/bits.h>
#include <linux/bug.h>
#include <linux/cleanup.h>
#include <linux/compiler_types.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>
#include <linux/overflow.h>
#include <linux/rbtree.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

#include "access.h"
#include "audit.h"
#include "domain.h"
#include "limits.h"
#include "object.h"
#include "ruleset.h"

/*
 * Allocate a zeroed ruleset with room for @num_layers access masks.
 *
 * The new ruleset starts with a usage count of 1, an initialised lock and
 * empty rule trees. Returns the ruleset, or ERR_PTR(-ENOMEM) if the
 * allocation fails.
 */
static struct landlock_ruleset *create_ruleset(const u32 num_layers)
{
        struct landlock_ruleset *ruleset =
                kzalloc(struct_size(ruleset, access_masks, num_layers),
                        GFP_KERNEL_ACCOUNT);

        if (!ruleset)
                return ERR_PTR(-ENOMEM);

        /*
         * Thanks to the zeroed allocation, these are already set:
         * hierarchy = NULL
         * num_rules = 0
         * access_masks[] = 0
         */
        ruleset->num_layers = num_layers;
        refcount_set(&ruleset->usage, 1);
        mutex_init(&ruleset->lock);
        ruleset->root_inode = RB_ROOT;

#if IS_ENABLED(CONFIG_INET)
        ruleset->root_net_port = RB_ROOT;
#endif /* IS_ENABLED(CONFIG_INET) */

        return ruleset;
}

/*
 * Create a single-layer ruleset handling the given access and scope masks.
 *
 * Each non-zero mask is recorded for layer 0 of the new ruleset.
 *
 * Returns the ruleset, ERR_PTR(-ENOMSG) if all three masks are zero, or
 * the error pointer returned by create_ruleset().
 */
struct landlock_ruleset *
landlock_create_ruleset(const access_mask_t fs_access_mask,
                        const access_mask_t net_access_mask,
                        const access_mask_t scope_mask)
{
        struct landlock_ruleset *ruleset;

        /* Informs about useless ruleset. */
        if (!(fs_access_mask || net_access_mask || scope_mask))
                return ERR_PTR(-ENOMSG);

        ruleset = create_ruleset(1);
        if (IS_ERR(ruleset))
                return ruleset;

        if (fs_access_mask)
                landlock_add_fs_access_mask(ruleset, fs_access_mask, 0);

        if (net_access_mask)
                landlock_add_net_access_mask(ruleset, net_access_mask, 0);

        if (scope_mask)
                landlock_add_scope_mask(ruleset, scope_mask, 0);

        return ruleset;
}

static void build_check_rule(void)
{
        const struct landlock_rule rule = {
                .num_layers = ~0,
        };

        BUILD_BUG_ON(rule.num_layers < LANDLOCK_MAX_NUM_LAYERS);
}

/*
 * Tell whether keys of type @key_type are object pointers.
 *
 * True for LANDLOCK_KEY_INODE only. LANDLOCK_KEY_NET_PORT (when
 * CONFIG_INET is enabled) yields false; any other type triggers a
 * one-time warning and also yields false.
 */
static bool is_object_pointer(const enum landlock_key_type key_type)
{
        if (key_type == LANDLOCK_KEY_INODE)
                return true;

#if IS_ENABLED(CONFIG_INET)
        if (key_type == LANDLOCK_KEY_NET_PORT)
                return false;
#endif /* IS_ENABLED(CONFIG_INET) */

        WARN_ON_ONCE(1);
        return false;
}

/*
 * Allocate a rule for @id holding a copy of the @num_layers entries of
 * @layers, with a copy of @new_layer appended on top when it is not NULL.
 *
 * For object-pointer keys, a reference is taken on the underlying object.
 * The returned rule is not linked into any tree.
 *
 * Returns the rule, ERR_PTR(-E2BIG) if appending @new_layer would exceed
 * LANDLOCK_MAX_NUM_LAYERS, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct landlock_rule *
create_rule(const struct landlock_id id,
            const struct landlock_layer (*const layers)[], const u32 num_layers,
            const struct landlock_layer *const new_layer)
{
        struct landlock_rule *rule;
        u32 total_layers = num_layers;

        build_check_rule();

        if (new_layer) {
                /* Should already be checked by landlock_merge_ruleset(). */
                if (WARN_ON_ONCE(num_layers >= LANDLOCK_MAX_NUM_LAYERS))
                        return ERR_PTR(-E2BIG);
                total_layers++;
        }

        rule = kzalloc(struct_size(rule, layers, total_layers),
                       GFP_KERNEL_ACCOUNT);
        if (!rule)
                return ERR_PTR(-ENOMEM);

        RB_CLEAR_NODE(&rule->node);

        if (is_object_pointer(id.type)) {
                /* This should have been caught by insert_rule(). */
                WARN_ON_ONCE(!id.key.object);
                landlock_get_object(id.key.object);
        }

        rule->key = id.key;
        rule->num_layers = total_layers;

        /* Copies the original layer stack. */
        memcpy(rule->layers, layers,
               flex_array_size(rule, layers, num_layers));

        /* Adds a copy of @new_layer on the layer stack. */
        if (new_layer)
                rule->layers[total_layers - 1] = *new_layer;

        return rule;
}

/*
 * Returns the red-black tree root of @ruleset that stores rules of type
 * @key_type, or ERR_PTR(-EINVAL) (with a one-time warning) when the key type
 * is not handled.
 */
static struct rb_root *get_root(struct landlock_ruleset *const ruleset,
                                const enum landlock_key_type key_type)
{
        if (key_type == LANDLOCK_KEY_INODE)
                return &ruleset->root_inode;

#if IS_ENABLED(CONFIG_INET)
        if (key_type == LANDLOCK_KEY_NET_PORT)
                return &ruleset->root_net_port;
#endif /* IS_ENABLED(CONFIG_INET) */

        WARN_ON_ONCE(1);
        return ERR_PTR(-EINVAL);
}

/*
 * Releases @rule: drops the object reference held through its key when
 * @key_type designates an object, then frees the rule itself.  A NULL @rule
 * is accepted and ignored.  May sleep.
 */
static void free_rule(struct landlock_rule *const rule,
                      const enum landlock_key_type key_type)
{
        might_sleep();
        if (rule) {
                if (is_object_pointer(key_type))
                        landlock_put_object(rule->key.object);
                kfree(rule);
        }
}

static void build_check_ruleset(void)
{
        const struct landlock_ruleset ruleset = {
                .num_rules = ~0,
                .num_layers = ~0,
        };

        BUILD_BUG_ON(ruleset.num_rules < LANDLOCK_MAX_NUM_RULES);
        BUILD_BUG_ON(ruleset.num_layers < LANDLOCK_MAX_NUM_LAYERS);
}

/**
 * insert_rule - Create and insert a rule in a ruleset
 *
 * @ruleset: The ruleset to be updated.
 * @id: The ID to build the new rule with.  The underlying kernel object, if
 *      any, must be held by the caller.
 * @layers: One or multiple layers to be copied into the new rule.
 * @num_layers: The number of @layers entries.
 *
 * When user space requests to add a new rule to a ruleset, @layers only
 * contains one entry and this entry is not assigned to any level.  In this
 * case, the new rule will extend @ruleset, similarly to a boolean OR between
 * access rights.
 *
 * When merging a ruleset in a domain, or copying a domain, @layers will be
 * added to @ruleset as new constraints, similarly to a boolean AND between
 * access rights.
 */
static int insert_rule(struct landlock_ruleset *const ruleset,
                       const struct landlock_id id,
                       const struct landlock_layer (*const layers)[],
                       const size_t num_layers)
{
        struct rb_node **walker_node;
        struct rb_node *parent_node = NULL;
        struct landlock_rule *new_rule;
        struct rb_root *root;

        might_sleep();
        lockdep_assert_held(&ruleset->lock);
        if (WARN_ON_ONCE(!layers))
                return -ENOENT;

        if (is_object_pointer(id.type) && WARN_ON_ONCE(!id.key.object))
                return -ENOENT;

        root = get_root(ruleset, id.type);
        if (IS_ERR(root))
                return PTR_ERR(root);

        walker_node = &root->rb_node;
        while (*walker_node) {
                struct landlock_rule *const this =
                        rb_entry(*walker_node, struct landlock_rule, node);

                if (this->key.data != id.key.data) {
                        parent_node = *walker_node;
                        if (this->key.data < id.key.data)
                                walker_node = &((*walker_node)->rb_right);
                        else
                                walker_node = &((*walker_node)->rb_left);
                        continue;
                }

                /* Only a single-level layer should match an existing rule. */
                if (WARN_ON_ONCE(num_layers != 1))
                        return -EINVAL;

                /* If there is a matching rule, updates it. */
                if ((*layers)[0].level == 0) {
                        /*
                         * Extends access rights when the request comes from
                         * landlock_add_rule(2), i.e. @ruleset is not a domain.
                         */
                        if (WARN_ON_ONCE(this->num_layers != 1))
                                return -EINVAL;
                        if (WARN_ON_ONCE(this->layers[0].level != 0))
                                return -EINVAL;
                        this->layers[0].access |= (*layers)[0].access;
                        return 0;
                }

                if (WARN_ON_ONCE(this->layers[0].level == 0))
                        return -EINVAL;

                /*
                 * Intersects access rights when it is a merge between a
                 * ruleset and a domain.
                 */
                new_rule = create_rule(id, &this->layers, this->num_layers,
                                       &(*layers)[0]);
                if (IS_ERR(new_rule))
                        return PTR_ERR(new_rule);
                rb_replace_node(&this->node, &new_rule->node, root);
                free_rule(this, id.type);
                return 0;
        }

        /* There is no match for @id. */
        build_check_ruleset();
        if (ruleset->num_rules >= LANDLOCK_MAX_NUM_RULES)
                return -E2BIG;
        new_rule = create_rule(id, layers, num_layers, NULL);
        if (IS_ERR(new_rule))
                return PTR_ERR(new_rule);
        rb_link_node(&new_rule->node, parent_node, walker_node);
        rb_insert_color(&new_rule->node, root);
        ruleset->num_rules++;
        return 0;
}

static void build_check_layer(void)
{
        const struct landlock_layer layer = {
                .level = ~0,
                .access = ~0,
        };

        BUILD_BUG_ON(layer.level < LANDLOCK_MAX_NUM_LAYERS);
        BUILD_BUG_ON(layer.access < LANDLOCK_MASK_ACCESS_FS);
}

/*
 * Inserts in @ruleset a rule granting @access to @id, or extends the rule
 * already matching @id.  @ruleset must be locked by the caller.
 *
 * Returns the value returned by insert_rule().
 */
int landlock_insert_rule(struct landlock_ruleset *const ruleset,
                         const struct landlock_id id,
                         const access_mask_t access)
{
        struct landlock_layer layers[1] = {
                [0] = {
                        /* A zero @level makes insert_rule() extend @ruleset. */
                        .level = 0,
                        .access = access,
                },
        };

        build_check_layer();
        return insert_rule(ruleset, id, &layers, ARRAY_SIZE(layers));
}

/*
 * Inserts every rule of the @key_type tree of @src into @dst.  Each source
 * rule must have exactly one layer with a level of zero; it is inserted with
 * its access rights and a level equal to the number of layers of @dst.
 *
 * Both @dst and @src must be locked by the caller.
 *
 * Returns 0 on success, -EINVAL if a source rule is malformed or the key type
 * is unknown, or the error returned by insert_rule().
 */
static int merge_tree(struct landlock_ruleset *const dst,
                      struct landlock_ruleset *const src,
                      const enum landlock_key_type key_type)
{
        struct landlock_rule *walker_rule, *next_rule;
        struct rb_root *src_root;
        int err;

        might_sleep();
        lockdep_assert_held(&dst->lock);
        lockdep_assert_held(&src->lock);

        src_root = get_root(src, key_type);
        if (IS_ERR(src_root))
                return PTR_ERR(src_root);

        rbtree_postorder_for_each_entry_safe(walker_rule, next_rule, src_root,
                                             node) {
                const struct landlock_id id = {
                        .type = key_type,
                        .key = walker_rule->key,
                };
                struct landlock_layer layers[1] = {};

                if (WARN_ON_ONCE(walker_rule->num_layers != 1))
                        return -EINVAL;

                if (WARN_ON_ONCE(walker_rule->layers[0].level != 0))
                        return -EINVAL;

                /* The merged layer is tagged with the top level of @dst. */
                layers[0].level = dst->num_layers;
                layers[0].access = walker_rule->layers[0].access;

                err = insert_rule(dst, id, &layers, ARRAY_SIZE(layers));
                if (err)
                        return err;
        }
        return 0;
}

/*
 * Stacks @src, a single-layer ruleset, on top of the domain @dst: stores the
 * handled access masks of @src in the last layer slot of @dst, then merges
 * each rule tree of @src into @dst.  Both rulesets are locked while merging.
 *
 * Returns 0 on success (also when @src is NULL, which triggers a warning),
 * -EINVAL if @dst is not a domain or if the layer counts are inconsistent, or
 * the error returned by merge_tree().
 */
static int merge_ruleset(struct landlock_ruleset *const dst,
                         struct landlock_ruleset *const src)
{
        int err;

        might_sleep();
        /* Should already be checked by landlock_merge_ruleset() */
        if (WARN_ON_ONCE(!src))
                return 0;
        /* Only merge into a domain. */
        if (WARN_ON_ONCE(!dst || !dst->hierarchy))
                return -EINVAL;

        /* @dst is locked first because we are its only owner. */
        mutex_lock(&dst->lock);
        mutex_lock_nested(&src->lock, SINGLE_DEPTH_NESTING);

        if (WARN_ON_ONCE(src->num_layers != 1 || dst->num_layers < 1)) {
                err = -EINVAL;
                goto out_unlock;
        }

        /* The new layer takes the last slot of the @dst layer stack. */
        dst->access_masks[dst->num_layers - 1] =
                landlock_upgrade_handled_access_masks(src->access_masks[0]);

        /* Merges the @src inode tree... */
        err = merge_tree(dst, src, LANDLOCK_KEY_INODE);

#if IS_ENABLED(CONFIG_INET)
        /* ...and then, unless that failed, the @src network port tree. */
        if (!err)
                err = merge_tree(dst, src, LANDLOCK_KEY_NET_PORT);
#endif /* IS_ENABLED(CONFIG_INET) */

out_unlock:
        mutex_unlock(&src->lock);
        mutex_unlock(&dst->lock);
        return err;
}

/*
 * Copies every rule of the @key_type tree of @parent, with its whole layer
 * stack, into @child.  Both rulesets must be locked by the caller.
 *
 * Returns 0 on success, or the error returned by get_root() or insert_rule().
 */
static int inherit_tree(struct landlock_ruleset *const parent,
                        struct landlock_ruleset *const child,
                        const enum landlock_key_type key_type)
{
        struct landlock_rule *walker_rule, *next_rule;
        struct rb_root *parent_root;

        might_sleep();
        lockdep_assert_held(&parent->lock);
        lockdep_assert_held(&child->lock);

        parent_root = get_root(parent, key_type);
        if (IS_ERR(parent_root))
                return PTR_ERR(parent_root);

        /* Duplicates each rule of the @parent inode or network tree. */
        rbtree_postorder_for_each_entry_safe(walker_rule, next_rule,
                                             parent_root, node) {
                const struct landlock_id id = {
                        .type = key_type,
                        .key = walker_rule->key,
                };
                const int err = insert_rule(child, id, &walker_rule->layers,
                                            walker_rule->num_layers);

                if (err)
                        return err;
        }
        return 0;
}

/*
 * Makes @child a descendant of @parent: copies the rule trees and the layer
 * access masks of @parent into @child, takes a reference on the hierarchy of
 * @parent and records it as the parent of the hierarchy of @child.  Nothing
 * is done when @parent is NULL.
 *
 * Returns 0 on success, -EINVAL if @child has no room for an extra layer or
 * if @parent has no hierarchy, or the error returned by inherit_tree().
 */
static int inherit_ruleset(struct landlock_ruleset *const parent,
                           struct landlock_ruleset *const child)
{
        int err;

        might_sleep();
        if (!parent)
                return 0;

        /* @child is locked first because we are its only owner. */
        mutex_lock(&child->lock);
        mutex_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);

        /* Copies the @parent inode tree... */
        err = inherit_tree(parent, child, LANDLOCK_KEY_INODE);

#if IS_ENABLED(CONFIG_INET)
        /* ...and then, unless that failed, the @parent network port tree. */
        if (!err)
                err = inherit_tree(parent, child, LANDLOCK_KEY_NET_PORT);
#endif /* IS_ENABLED(CONFIG_INET) */

        if (err)
                goto out_unlock;

        if (WARN_ON_ONCE(child->num_layers <= parent->num_layers)) {
                err = -EINVAL;
                goto out_unlock;
        }
        /* Copies the parent layer stack; the slot of the new layer is left. */
        memcpy(child->access_masks, parent->access_masks,
               flex_array_size(parent, access_masks, parent->num_layers));

        if (WARN_ON_ONCE(!parent->hierarchy)) {
                err = -EINVAL;
                goto out_unlock;
        }
        landlock_get_hierarchy(parent->hierarchy);
        child->hierarchy->parent = parent->hierarchy;

out_unlock:
        mutex_unlock(&parent->lock);
        mutex_unlock(&child->lock);
        return err;
}

static void free_ruleset(struct landlock_ruleset *const ruleset)
{
        struct landlock_rule *freeme, *next;

        might_sleep();
        rbtree_postorder_for_each_entry_safe(freeme, next, &ruleset->root_inode,
                                             node)
                free_rule(freeme, LANDLOCK_KEY_INODE);

#if IS_ENABLED(CONFIG_INET)
        rbtree_postorder_for_each_entry_safe(freeme, next,
                                             &ruleset->root_net_port, node)
                free_rule(freeme, LANDLOCK_KEY_NET_PORT);
#endif /* IS_ENABLED(CONFIG_INET) */

        landlock_put_hierarchy(ruleset->hierarchy);
        kfree(ruleset);
}

/*
 * Drops one reference on @ruleset and frees it when that was the last one.
 * A NULL @ruleset is ignored.  May sleep.
 */
void landlock_put_ruleset(struct landlock_ruleset *const ruleset)
{
        might_sleep();
        if (!ruleset)
                return;
        if (refcount_dec_and_test(&ruleset->usage))
                free_ruleset(ruleset);
}

/*
 * Work handler: frees the ruleset that embeds @work as its work_free member.
 */
static void free_ruleset_work(struct work_struct *const work)
{
        free_ruleset(container_of(work, struct landlock_ruleset, work_free));
}

/*
 * Drops one reference on @ruleset; when that was the last one, the ruleset is
 * freed later from a work item instead of being freed by the caller.  A NULL
 * @ruleset is ignored.
 *
 * Only called by hook_cred_free().
 */
void landlock_put_ruleset_deferred(struct landlock_ruleset *const ruleset)
{
        if (!ruleset)
                return;
        if (!refcount_dec_and_test(&ruleset->usage))
                return;

        INIT_WORK(&ruleset->work_free, free_ruleset_work);
        schedule_work(&ruleset->work_free);
}

/**
 * landlock_merge_ruleset - Merge a ruleset with a domain
 *
 * @parent: Parent domain, or NULL if there is none.
 * @ruleset: New ruleset to be merged.
 *
 * The current task is requesting to be restricted.  The subjective credentials
 * must not be in an overridden state. cf. landlock_init_hierarchy_log().
 *
 * A new domain is created with one more layer than @parent (or a single layer
 * without @parent).  It inherits the rules of @parent, if any, and @ruleset is
 * then merged into it as the new layer.
 *
 * Return: The new domain on success, or an ERR_PTR(): -EINVAL if @ruleset is
 * NULL or is @parent itself, -E2BIG if @parent already has
 * LANDLOCK_MAX_NUM_LAYERS layers, -ENOMEM, or the error reported by one of the
 * creation, inheritance, merge or log initialization steps.
 */
struct landlock_ruleset *
landlock_merge_ruleset(struct landlock_ruleset *const parent,
                       struct landlock_ruleset *const ruleset)
{
        struct landlock_ruleset *new_dom __free(landlock_put_ruleset) = NULL;
        u32 num_layers = 1;
        int err;

        might_sleep();
        if (WARN_ON_ONCE(!ruleset || parent == ruleset))
                return ERR_PTR(-EINVAL);

        if (parent) {
                if (parent->num_layers >= LANDLOCK_MAX_NUM_LAYERS)
                        return ERR_PTR(-E2BIG);
                num_layers = parent->num_layers + 1;
        }

        /* Creates a new domain... */
        new_dom = create_ruleset(num_layers);
        if (IS_ERR(new_dom))
                return new_dom;

        new_dom->hierarchy =
                kzalloc(sizeof(*new_dom->hierarchy), GFP_KERNEL_ACCOUNT);
        if (!new_dom->hierarchy)
                return ERR_PTR(-ENOMEM);

        refcount_set(&new_dom->hierarchy->usage, 1);

        /* ...as a child of @parent... */
        err = inherit_ruleset(parent, new_dom);

        /* ...and including @ruleset. */
        if (!err)
                err = merge_ruleset(new_dom, ruleset);

        if (!err)
                err = landlock_init_hierarchy_log(new_dom->hierarchy);

        if (err)
                return ERR_PTR(err);

        return no_free_ptr(new_dom);
}

/*
 * The returned access has the same lifetime as @ruleset.
 */
const struct landlock_rule *
landlock_find_rule(const struct landlock_ruleset *const ruleset,
                   const struct landlock_id id)
{
        const struct rb_root *root;
        const struct rb_node *node;

        root = get_root((struct landlock_ruleset *)ruleset, id.type);
        if (IS_ERR(root))
                return NULL;
        node = root->rb_node;

        while (node) {
                struct landlock_rule *this =
                        rb_entry(node, struct landlock_rule, node);

                if (this->key.data == id.key.data)
                        return this;
                if (this->key.data < id.key.data)
                        node = node->rb_right;
                else
                        node = node->rb_left;
        }
        return NULL;
}

/*
 * Clears from @layer_masks the layers of @rule that grant the requested
 * access rights.
 *
 * @layer_masks is read and may be updated according to the access request and
 * the matching rule.
 * @masks_array_size must be equal to ARRAY_SIZE(*layer_masks).
 *
 * Returns true if the request is allowed (i.e. relevant layer masks for the
 * request are empty), which is always the case for an empty request or a NULL
 * @layer_masks.  Returns false otherwise, including when @rule is NULL.
 */
bool landlock_unmask_layers(const struct landlock_rule *const rule,
                            const access_mask_t access_request,
                            layer_mask_t (*const layer_masks)[],
                            const size_t masks_array_size)
{
        /* for_each_set_bit() needs an unsigned long bitmap to walk. */
        const unsigned long access_req = access_request;
        size_t i;

        if (!access_request || !layer_masks)
                return true;
        if (!rule)
                return false;

        /*
         * An access is granted if, for each policy layer, at least one rule
         * encountered on the pathwalk grants the requested access,
         * regardless of its position in the layer stack.  We must then check
         * the remaining layers for each inode, from the first added layer to
         * the last one.  When there are multiple requested accesses, for each
         * policy layer, the full set of requested accesses may not be granted
         * by only one rule, but by the union (binary OR) of multiple rules.
         * E.g. /a/b <execute> + /a <read> => /a/b <execute + read>
         */
        for (i = 0; i < rule->num_layers; i++) {
                const struct landlock_layer *const layer = &rule->layers[i];
                const layer_mask_t layer_bit = BIT_ULL(layer->level - 1);
                unsigned long access_bit;
                bool is_empty = true;

                /*
                 * Drops this layer from the mask of each requested access it
                 * grants, and notes whether any requested access still has a
                 * layer left in its mask.
                 */
                for_each_set_bit(access_bit, &access_req, masks_array_size) {
                        if (layer->access & BIT_ULL(access_bit))
                                (*layer_masks)[access_bit] &= ~layer_bit;
                        if ((*layer_masks)[access_bit])
                                is_empty = false;
                }
                if (is_empty)
                        return true;
        }
        return false;
}

/*
 * Signature of the accessors returning the access mask handled by the layer
 * @layer_level of @ruleset.  landlock_init_layer_masks() selects one such
 * accessor according to the key type it is given.
 */
typedef access_mask_t
get_access_mask_t(const struct landlock_ruleset *const ruleset,
                  const u16 layer_level);

/**
 * landlock_init_layer_masks - Initialize layer masks from an access request
 *
 * Zeroes @layer_masks, then, for each access right of @access_request, sets
 * the bit of every layer of @domain that handles this access right.
 *
 * @domain: The domain that defines the current restrictions.
 * @access_request: The requested access rights to check.
 * @layer_masks: It must contain %LANDLOCK_NUM_ACCESS_FS or
 * %LANDLOCK_NUM_ACCESS_NET elements according to @key_type.
 * @key_type: The key type to switch between access masks of different types.
 *
 * Returns: An access mask where each access right bit is set which is handled
 * in any of the active layers in @domain.  Zero is returned, without touching
 * @layer_masks, when @key_type is unknown.
 */
access_mask_t
landlock_init_layer_masks(const struct landlock_ruleset *const domain,
                          const access_mask_t access_request,
                          layer_mask_t (*const layer_masks)[],
                          const enum landlock_key_type key_type)
{
        /* for_each_set_bit() needs an unsigned long bitmap to walk. */
        const unsigned long access_req = access_request;
        access_mask_t handled_accesses = 0;
        get_access_mask_t *get_access_mask;
        size_t num_access, layer_level;

        switch (key_type) {
        case LANDLOCK_KEY_INODE:
                num_access = LANDLOCK_NUM_ACCESS_FS;
                get_access_mask = landlock_get_fs_access_mask;
                break;

#if IS_ENABLED(CONFIG_INET)
        case LANDLOCK_KEY_NET_PORT:
                num_access = LANDLOCK_NUM_ACCESS_NET;
                get_access_mask = landlock_get_net_access_mask;
                break;
#endif /* IS_ENABLED(CONFIG_INET) */

        default:
                WARN_ON_ONCE(1);
                return 0;
        }

        memset(layer_masks, 0,
               array_size(sizeof((*layer_masks)[0]), num_access));

        /* An empty access request can happen because of O_WRONLY | O_RDWR. */
        if (!access_request)
                return 0;

        /* Records, layer after layer, which requested accesses are handled. */
        for (layer_level = 0; layer_level < domain->num_layers; layer_level++) {
                const access_mask_t access_mask =
                        get_access_mask(domain, layer_level);
                unsigned long access_bit;

                for_each_set_bit(access_bit, &access_req, num_access) {
                        if (!(BIT_ULL(access_bit) & access_mask))
                                continue;

                        (*layer_masks)[access_bit] |= BIT_ULL(layer_level);
                        handled_accesses |= BIT_ULL(access_bit);
                }
        }
        return handled_accesses;
}




























































































































































































































































































































































































































































































































































































































































































































    3 




    1 


































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * VLAN                An implementation of 802.1Q VLAN tagging.
 *
 * Authors:        Ben Greear <greearb@candelatech.com>
 */
#ifndef _LINUX_IF_VLAN_H_
#define _LINUX_IF_VLAN_H_

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/bug.h>
#include <uapi/linux/if_vlan.h>

#define VLAN_HLEN        4                /* The additional bytes required by VLAN
                                         * (in addition to the Ethernet header)
                                         */
#define VLAN_ETH_HLEN        18                /* Total octets in header.         */
#define VLAN_ETH_ZLEN        64                /* Min. octets in frame sans FCS */

/*
 * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan
 *
 * VLAN_ETH_FRAME_LEN is the sum of the maximum payload and of the VLAN
 * Ethernet header: VLAN_ETH_DATA_LEN + VLAN_ETH_HLEN (1500 + 18).
 */
#define VLAN_ETH_DATA_LEN        1500        /* Max. octets in payload         */
#define VLAN_ETH_FRAME_LEN        1518        /* Max. octets in frame sans FCS */

#define VLAN_MAX_DEPTH        8                /* Max. number of nested VLAN tags parsed */

/**
 *        struct vlan_hdr - vlan header
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 *
 *        Both fields are big-endian (__be16) 16-bit values.
 */
struct vlan_hdr {
        __be16        h_vlan_TCI;
        __be16        h_vlan_encapsulated_proto;
};

/**
 *        struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr)
 *        @h_dest: destination ethernet address
 *        @h_source: source ethernet address
 *        @h_vlan_proto: ethernet protocol
 *        @h_vlan_TCI: priority and VLAN ID
 *        @h_vlan_encapsulated_proto: packet type ID or len
 *
 *        The last two members have the same names, types and order as the
 *        members of struct vlan_hdr.
 */
struct vlan_ethhdr {
        /* struct_group() also exposes both addresses as one "addrs" member. */
        struct_group(addrs,
                unsigned char        h_dest[ETH_ALEN];
                unsigned char        h_source[ETH_ALEN];
        );
        __be16                h_vlan_proto;
        __be16                h_vlan_TCI;
        __be16                h_vlan_encapsulated_proto;
};

#include <linux/skbuff.h>

/* Returns the MAC header of @skb viewed as a VLAN ethernet header. */
static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb)
{
        struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);

        return veth;
}

/*
 * Returns the data pointer of @skb viewed as a VLAN ethernet header.
 *
 * In the TX path, prefer this helper to the sequence
 * skb_reset_mac_header() + vlan_eth_hdr().
 */
static inline struct vlan_ethhdr *skb_vlan_eth_hdr(const struct sk_buff *skb)
{
        struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data;

        return veth;
}

/*
 * Layout of the 16-bit TCI.  The three masks do not overlap and together
 * cover all 16 bits (0xe000 | 0x1000 | 0x0fff == 0xffff).
 */
#define VLAN_PRIO_MASK                0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT                13
#define VLAN_CFI_MASK                0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */
#define VLAN_VID_MASK                0x0fff /* VLAN Identifier */
#define VLAN_N_VID                4096 /* Number of VLAN IDs: VLAN_VID_MASK + 1 */

/* found in socket.c */
extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *));

/* Tells whether @dev has the IFF_802_1Q_VLAN private flag set. */
static inline bool is_vlan_dev(const struct net_device *dev)
{
        return (dev->priv_flags & IFF_802_1Q_VLAN) != 0;
}

/*
 * Accessors for the VLAN tag fields of an skb:
 * - skb_vlan_tag_present(): 1 if the vlan_all field is non-zero, else 0;
 * - skb_vlan_tag_get(): the whole vlan_tci field;
 * - skb_vlan_tag_get_id(): vlan_tci masked with VLAN_VID_MASK;
 * - skb_vlan_tag_get_cfi(): 1 if the VLAN_CFI_MASK bit of vlan_tci is set,
 *   else 0;
 * - skb_vlan_tag_get_prio(): the VLAN_PRIO_MASK bits of vlan_tci, shifted
 *   down by VLAN_PRIO_SHIFT.
 */
#define skb_vlan_tag_present(__skb)        (!!(__skb)->vlan_all)
#define skb_vlan_tag_get(__skb)                ((__skb)->vlan_tci)
#define skb_vlan_tag_get_id(__skb)        ((__skb)->vlan_tci & VLAN_VID_MASK)
#define skb_vlan_tag_get_cfi(__skb)        (!!((__skb)->vlan_tci & VLAN_CFI_MASK))
#define skb_vlan_tag_get_prio(__skb)        (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT)

/*
 * Fires the NETDEV_CVLAN_FILTER_PUSH_INFO netdevice notification for @dev and
 * returns the notifier result converted to an errno.  RTNL must be held.
 */
static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev)
{
        int ret;

        ASSERT_RTNL();
        ret = call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev);
        return notifier_to_errno(ret);
}

/*
 * Fires the NETDEV_CVLAN_FILTER_DROP_INFO netdevice notification for @dev.
 * The notifier result is ignored.  RTNL must be held.
 */
static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        (void)call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev);
}

/*
 * Fires the NETDEV_SVLAN_FILTER_PUSH_INFO netdevice notification for @dev and
 * returns the notifier result converted to an errno.  RTNL must be held.
 */
static inline int vlan_get_rx_stag_filter_info(struct net_device *dev)
{
        int ret;

        ASSERT_RTNL();
        ret = call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev);
        return notifier_to_errno(ret);
}

/*
 * Fires the NETDEV_SVLAN_FILTER_DROP_INFO netdevice notification for @dev.
 * The notifier result is ignored.  RTNL must be held.
 */
static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev)
{
        ASSERT_RTNL();
        (void)call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev);
}

/**
 *        struct vlan_pcpu_stats - VLAN percpu rx/tx stats
 *        @rx_packets: number of received packets
 *        @rx_bytes: number of received bytes
 *        @rx_multicast: number of received multicast packets
 *        @tx_packets: number of transmitted packets
 *        @tx_bytes: number of transmitted bytes
 *        @syncp: synchronization point for 64bit counters
 *        @rx_errors: number of rx errors
 *        @tx_dropped: number of tx drops
 *
 *        The packet and byte counters are u64_stats_t; @rx_errors and
 *        @tx_dropped are plain 32-bit counters.
 */
struct vlan_pcpu_stats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                rx_multicast;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync        syncp;
        u32                        rx_errors;
        u32                        tx_dropped;
};

#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE)

extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id);
extern int vlan_for_each(struct net_device *dev,
                         int (*action)(struct net_device *dev, int vid,
                                       void *arg), void *arg);
extern struct net_device *vlan_dev_real_dev(const struct net_device *dev);
extern u16 vlan_dev_vlan_id(const struct net_device *dev);
extern __be16 vlan_dev_vlan_proto(const struct net_device *dev);

/**
 *        struct vlan_priority_tci_mapping - vlan egress priority mappings
 *        @priority: skb priority
 *        @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000
 *        @next: pointer to the next mapping of the same list, or NULL at the
 *               end (vlan_dev_get_egress_qos_mask() walks this list)
 */
struct vlan_priority_tci_mapping {
        u32                                        priority;
        u16                                        vlan_qos;
        struct vlan_priority_tci_mapping        *next;
};

struct proc_dir_entry;
struct netpoll;

/**
 *        struct vlan_dev_priv - VLAN private device data
 *        @nr_ingress_mappings: number of ingress priority mappings
 *        @ingress_priority_map: ingress priority mappings
 *        @nr_egress_mappings: number of egress priority mappings
 *        @egress_priority_map: hash of egress priority mappings; the bucket is
 *                selected with (skb priority & 0xF), see
 *                vlan_dev_get_egress_qos_mask()
 *        @vlan_proto: VLAN encapsulation protocol
 *        @vlan_id: VLAN identifier
 *        @flags: device flags
 *        @real_dev: underlying netdevice
 *        @dev_tracker: refcount tracker for @real_dev reference
 *        @real_dev_addr: address of underlying netdevice
 *        @dent: proc dir entry
 *        @vlan_pcpu_stats: ptr to percpu rx/tx stats
 *        @netpoll: netpoll instance "propagated" down to @real_dev (only
 *                present with CONFIG_NET_POLL_CONTROLLER)
 */
struct vlan_dev_priv {
        unsigned int                                nr_ingress_mappings;
        u32                                        ingress_priority_map[8];
        unsigned int                                nr_egress_mappings;
        struct vlan_priority_tci_mapping        *egress_priority_map[16];

        __be16                                        vlan_proto;
        u16                                        vlan_id;
        u16                                        flags;

        struct net_device                        *real_dev;
        netdevice_tracker                        dev_tracker;

        unsigned char                                real_dev_addr[ETH_ALEN];

        struct proc_dir_entry                        *dent;
        struct vlan_pcpu_stats __percpu                *vlan_pcpu_stats;
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll                                *netpoll;
#endif
};

/*
 * Typed accessor: hands back what netdev_priv() yields for @dev as a
 * struct vlan_dev_priv pointer.
 */
static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev)
{
        struct vlan_dev_priv *priv = netdev_priv(dev);

        return priv;
}

/*
 * Look up the egress QoS value configured for @skprio on VLAN device @dev.
 *
 * The bucket is picked with the low four bits of @skprio, then the chain is
 * searched for an entry whose priority equals @skprio exactly.
 *
 * Returns the entry's vlan_qos (expected to be already shifted so that it
 * masks correctly with the VLAN's TCI), or 0 when no entry matches.
 */
static inline u16
vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio)
{
        struct vlan_priority_tci_mapping *node;

        smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */

        for (node = vlan_dev_priv(dev)->egress_priority_map[skprio & 0xF];
             node; node = node->next) {
                if (node->priority == skprio)
                        return node->vlan_qos;
        }

        return 0;
}

/*
 * Helpers implemented outside this header.  The #else branch that follows
 * provides inline stubs with the same names and signatures for the
 * configuration in which these are not available.
 */
extern bool vlan_do_receive(struct sk_buff **skb);

extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid);
extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid);

extern int vlan_vids_add_by_dev(struct net_device *dev,
                                const struct net_device *by_dev);
extern void vlan_vids_del_by_dev(struct net_device *dev,
                                 const struct net_device *by_dev);

extern bool vlan_uses_dev(const struct net_device *dev);

#else
/* Stub (#else branch): the lookup never finds a device, always NULL. */
static inline struct net_device *
__vlan_find_dev_deep_rcu(struct net_device *real_dev,
                     __be16 vlan_proto, u16 vlan_id)
{
        return NULL;
}

/* Stub (#else branch): @action is never invoked; always returns 0. */
static inline int
vlan_for_each(struct net_device *dev,
              int (*action)(struct net_device *dev, int vid, void *arg),
              void *arg)
{
        return 0;
}

/*
 * Stub (#else branch): calling this is treated as a bug, it hits BUG().
 * The NULL return after it is presumably only there for the return type.
 */
static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
        BUG();
        return NULL;
}

/*
 * Stub (#else branch): calling this is treated as a bug, it hits BUG().
 * The 0 return after it is presumably only there for the return type.
 */
static inline u16 vlan_dev_vlan_id(const struct net_device *dev)
{
        BUG();
        return 0;
}

/*
 * Stub (#else branch): calling this is treated as a bug, it hits BUG().
 * The 0 return after it is presumably only there for the return type.
 */
static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
        BUG();
        return 0;
}

/*
 * Stub (#else branch): always reports 0, the same value the real lookup
 * returns when no egress mapping matches @skprio.
 */
static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev,
                                               u32 skprio)
{
        return 0;
}

/* Stub (#else branch): does not touch *@skb and always returns false. */
static inline bool vlan_do_receive(struct sk_buff **skb)
{
        return false;
}

/* Stub (#else branch): records nothing and always returns 0. */
static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
        return 0;
}

/* Stub (#else branch): intentionally empty, there is nothing to delete. */
static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
}

/* Stub (#else branch): copies nothing from @by_dev and always returns 0. */
static inline int vlan_vids_add_by_dev(struct net_device *dev,
                                       const struct net_device *by_dev)
{
        return 0;
}

/* Stub (#else branch): intentionally empty. */
static inline void vlan_vids_del_by_dev(struct net_device *dev,
                                        const struct net_device *by_dev)
{
}

/* Stub (#else branch): always answers false. */
static inline bool vlan_uses_dev(const struct net_device *dev)
{
        return false;
}
#endif

/**
 * eth_type_vlan - check for valid vlan ether type.
 * @ethertype: ether type to check (big-endian, compared against htons() values)
 *
 * Returns: true if @ethertype equals ETH_P_8021Q or ETH_P_8021AD,
 * false for every other value.
 */
static inline bool eth_type_vlan(__be16 ethertype)
{
        return ethertype == htons(ETH_P_8021Q) ||
               ethertype == htons(ETH_P_8021AD);
}

/*
 * Tell whether @features contains the TX VLAN offload bit that belongs to
 * @proto: NETIF_F_HW_VLAN_CTAG_TX for ETH_P_8021Q, NETIF_F_HW_VLAN_STAG_TX
 * for ETH_P_8021AD.  Any other @proto yields false.
 */
static inline bool vlan_hw_offload_capable(netdev_features_t features,
                                           __be16 proto)
{
        netdev_features_t wanted;

        if (proto == htons(ETH_P_8021Q))
                wanted = NETIF_F_HW_VLAN_CTAG_TX;
        else if (proto == htons(ETH_P_8021AD))
                wanted = NETIF_F_HW_VLAN_STAG_TX;
        else
                return false;

        return (features & wanted) != 0;
}

/**
 * __vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert (host byte order, converted with htons() here)
 * @mac_len: MAC header length including outer vlan headers
 *
 * Inserts the VLAN tag into @skb as part of the payload at offset mac_len
 * Does not change skb->protocol so this function can be used during receive.
 *
 * This function never frees @skb; on error the caller still owns it.
 *
 * Returns: 0 on success, -ENOMEM if skb_cow_head() fails.
 */
static inline int __vlan_insert_inner_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci,
                                          unsigned int mac_len)
{
        struct vlan_ethhdr *veth;

        /* any negative result from skb_cow_head() is reported as -ENOMEM */
        if (skb_cow_head(skb, VLAN_HLEN) < 0)
                return -ENOMEM;

        skb_push(skb, VLAN_HLEN);

        /* Move the mac header sans proto to the beginning of the new header. */
        /* (mac_len == ETH_TLEN would move zero bytes, hence the strict '>') */
        if (likely(mac_len > ETH_TLEN))
                memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN);
        /* the header moved VLAN_HLEN towards the front; follow it if tracked */
        if (skb_mac_header_was_set(skb))
                skb->mac_header -= VLAN_HLEN;

        /*
         * Overlay a vlan_ethhdr so that its tag fields line up with the
         * insertion point.  NOTE(review): relies on the vlan_ethhdr field
         * layout, which is declared outside this view - confirm.
         */
        veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN);

        /* first, the ethernet type */
        if (likely(mac_len >= ETH_TLEN)) {
                /* h_vlan_encapsulated_proto should already be populated, and
                 * skb->data has space for h_vlan_proto
                 */
                veth->h_vlan_proto = vlan_proto;
        } else {
                /* h_vlan_encapsulated_proto should not be populated, and
                 * skb->data has no space for h_vlan_proto
                 */
                veth->h_vlan_encapsulated_proto = skb->protocol;
        }

        /* now, the TCI */
        veth->h_vlan_TCI = htons(vlan_tci);

        return 0;
}

/**
 * __vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Inserts the VLAN tag into @skb as part of the payload
 * Does not change skb->protocol so this function can be used during receive.
 *
 * Shorthand for __vlan_insert_inner_tag() with mac_len fixed to ETH_HLEN.
 *
 * Returns: 0 on success, a negative error if skb_cow_head fails.
 */
static inline int __vlan_insert_tag(struct sk_buff *skb,
                                    __be16 vlan_proto, u16 vlan_tci)
{
        return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
}

/**
 * vlan_insert_inner_tag - inner VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 * @mac_len: MAC header length including outer vlan headers
 *
 * Freeing wrapper around __vlan_insert_inner_tag(): the tag is inserted
 * into @skb as part of the payload at offset mac_len, which might change
 * skb->head.
 *
 * Following the skb_unshare() example, the caller does not have to free
 * the original skb when this fails; it is released here with
 * dev_kfree_skb_any().
 *
 * Does not change skb->protocol so this function can be used during receive.
 *
 * Return: modified @skb on success, NULL on error (@skb is freed).
 */
static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb,
                                                    __be16 vlan_proto,
                                                    u16 vlan_tci,
                                                    unsigned int mac_len)
{
        if (__vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len)) {
                dev_kfree_skb_any(skb);
                return NULL;
        }

        return skb;
}

/**
 * vlan_insert_tag - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Inserts the VLAN tag into @skb as part of the payload
 * Returns a VLAN tagged skb. This might change skb->head.
 *
 * Shorthand for vlan_insert_inner_tag() with mac_len fixed to ETH_HLEN.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Does not change skb->protocol so this function can be used during receive.
 *
 * Return: modified @skb on success, NULL on error (@skb is freed).
 */
static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb,
                                              __be16 vlan_proto, u16 vlan_tci)
{
        return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN);
}

/**
 * vlan_insert_tag_set_proto - regular VLAN tag inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Same as vlan_insert_tag(), and on success additionally stores
 * @vlan_proto in skb->protocol.  This might change skb->head.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Return: modified @skb on success, NULL on error (@skb is freed).
 */
static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb,
                                                        __be16 vlan_proto,
                                                        u16 vlan_tci)
{
        struct sk_buff *tagged = vlan_insert_tag(skb, vlan_proto, vlan_tci);

        if (tagged)
                tagged->protocol = vlan_proto;

        return tagged;
}

/**
 * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info
 * @skb: skbuff to clear
 *
 * Clears the VLAN information from @skb by zeroing @skb->vlan_all in a
 * single store.
 */
static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb)
{
        skb->vlan_all = 0;
}

/**
 * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb
 * @dst: skbuff to copy to
 * @src: skbuff to copy from
 *
 * Copies VLAN information from @src to @dst (for branchless code): the
 * whole @src->vlan_all is assigned unconditionally, whether or not @src
 * carries a tag.
 */
static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src)
{
        dst->vlan_all = src->vlan_all;
}

/**
 * __vlan_hwaccel_push_inside - pushes vlan tag to the payload
 * @skb: skbuff to tag
 *
 * Pushes the VLAN tag from @skb->vlan_tci inside to the payload, using
 * @skb->vlan_proto as the encapsulation protocol, and clears the
 * hardware accelerated tag once the insertion has succeeded.
 *
 * Following the skb_unshare() example, in case of error, the calling function
 * doesn't have to worry about freeing the original skb.
 *
 * Return: the tagged skb, or NULL when vlan_insert_tag_set_proto() failed.
 */
static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
{
        struct sk_buff *tagged;

        tagged = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
                                           skb_vlan_tag_get(skb));
        if (likely(tagged))
                __vlan_hwaccel_clear_tag(tagged);

        return tagged;
}

/**
 * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
 * @skb: skbuff to tag
 * @vlan_proto: VLAN encapsulation protocol
 * @vlan_tci: VLAN TCI to insert
 *
 * Puts the VLAN TCI in @skb->vlan_tci (and @vlan_proto in @skb->vlan_proto)
 * and lets the device do the rest.  The packet data itself is not modified.
 */
static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb,
                                          __be16 vlan_proto, u16 vlan_tci)
{
        skb->vlan_proto = vlan_proto;
        skb->vlan_tci = vlan_tci;
}

/**
 * __vlan_get_tag - get the VLAN ID that is part of the payload
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Reads the header returned by skb_vlan_eth_hdr().  *@vlan_tci is written
 * (in host byte order) only when that header carries a VLAN ether type;
 * otherwise it is left untouched.
 *
 * Returns: 0 on success, -ENODATA if the skb is not of VLAN type
 */
static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        const struct vlan_ethhdr *hdr = skb_vlan_eth_hdr(skb);

        if (eth_type_vlan(hdr->h_vlan_proto)) {
                *vlan_tci = ntohs(hdr->h_vlan_TCI);
                return 0;
        }

        return -ENODATA;
}

/**
 * __vlan_hwaccel_get_tag - get the hardware accelerated VLAN tag of @skb
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * *@vlan_tci is always written: the value from skb_vlan_tag_get() when a
 * tag is present, 0 otherwise.
 *
 * Returns: 0 when skb_vlan_tag_present() holds, -ENODATA when it does not
 */
static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb,
                                         u16 *vlan_tci)
{
        if (!skb_vlan_tag_present(skb)) {
                *vlan_tci = 0;
                return -ENODATA;
        }

        *vlan_tci = skb_vlan_tag_get(skb);
        return 0;
}

/**
 * vlan_get_tag - get the VLAN ID from the skb
 * @skb: skbuff to query
 * @vlan_tci: buffer to store value
 *
 * Picks the source of the tag from the features of @skb->dev: with
 * NETIF_F_HW_VLAN_CTAG_TX set the hardware accelerated tag is queried,
 * otherwise the tag is read from the packet payload.
 *
 * Returns: error if the skb is not VLAN tagged
 */
static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci)
{
        if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX)
                return __vlan_hwaccel_get_tag(skb, vlan_tci);

        return __vlan_get_tag(skb, vlan_tci);
}

/**
 * __vlan_get_protocol_offset() - get protocol EtherType.
 * @skb: skbuff to query
 * @type: first vlan protocol
 * @mac_offset: MAC offset
 * @depth: buffer to store length of eth and vlan tags in bytes; may be NULL.
 *         It is only written when the function does not bail out with 0.
 *
 * Returns: the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.  0 is
 * returned when a VLAN header cannot be read, when skb->mac_len is non-zero
 * but smaller than VLAN_HLEN, or when the nesting limit derived from
 * VLAN_MAX_DEPTH is hit.
 */
static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb,
                                                __be16 type,
                                                int mac_offset,
                                                int *depth)
{
        unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;

        /* if type is 802.1Q/AD then the header should already be
         * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
         * ETH_HLEN otherwise
         */
        if (eth_type_vlan(type)) {
                if (vlan_depth) {
                        /* vlan_depth is unsigned: refuse to underflow below */
                        if (WARN_ON(vlan_depth < VLAN_HLEN))
                                return 0;
                        vlan_depth -= VLAN_HLEN;
                } else {
                        vlan_depth = ETH_HLEN;
                }
                do {
                        struct vlan_hdr vhdr, *vh;

                        /* vhdr is the bounce buffer handed to skb_header_pointer() */
                        vh = skb_header_pointer(skb, mac_offset + vlan_depth,
                                                sizeof(vhdr), &vhdr);
                        /* the pre-decrement bounds the number of headers walked */
                        if (unlikely(!vh || !--parse_depth))
                                return 0;

                        type = vh->h_vlan_encapsulated_proto;
                        vlan_depth += VLAN_HLEN;
                } while (eth_type_vlan(type));
        }

        if (depth)
                *depth = vlan_depth;

        return type;
}

/*
 * Same as __vlan_get_protocol_offset() with a MAC offset of 0.
 * @depth may be NULL when the header length is not needed.
 */
static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type,
                                         int *depth)
{
        return __vlan_get_protocol_offset(skb, type, 0, depth);
}

/**
 * vlan_get_protocol - get protocol EtherType.
 * @skb: skbuff to query
 *
 * Starts the lookup from @skb->protocol and does not ask for the header
 * depth.
 *
 * Returns: the EtherType of the packet, regardless of whether it is
 * vlan encapsulated (normal or hardware accelerated) or not.
 */
static inline __be16 vlan_get_protocol(const struct sk_buff *skb)
{
        return __vlan_get_protocol(skb, skb->protocol, NULL);
}

/* This version of __vlan_get_protocol() also pulls mac header in skb->head.
 *
 * Returns 0 when the protocol lookup fails or when pskb_may_pull() cannot
 * provide the header length that was found.  *@depth (optional, may be
 * NULL) is only written on success.
 */
static inline __be16 vlan_get_protocol_and_depth(struct sk_buff *skb,
                                                 __be16 type, int *depth)
{
        int hdr_len;
        __be16 proto = __vlan_get_protocol(skb, type, &hdr_len);

        if (!proto)
                return proto;

        if (!pskb_may_pull(skb, hdr_len))
                return 0;

        if (depth)
                *depth = hdr_len;

        return proto;
}

/* A getter for the SKB protocol field which will handle VLAN tags consistently
 * whether VLAN acceleration is enabled or not.
 *
 * With @skip_vlan set the result comes from vlan_get_protocol(); without it
 * the outermost protocol is reported.
 */
static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan)
{
        if (skip_vlan)
                return vlan_get_protocol(skb);

        /* VLAN acceleration strips the VLAN header from the skb and
         * moves it to skb->vlan_proto
         */
        if (skb_vlan_tag_present(skb))
                return skb->vlan_proto;

        return skb->protocol;
}

/*
 * Set skb->protocol from the protocol encapsulated by VLAN header @vhdr,
 * i.e. the one layer three protocols care about.
 *
 * If eth_proto_is_802_3() accepts the encapsulated value it is used as is.
 * Otherwise the 16 bits right behind @vhdr decide between ETH_P_802_3 and
 * ETH_P_802_2.
 */
static inline void vlan_set_encap_proto(struct sk_buff *skb,
                                        struct vlan_hdr *vhdr)
{
        __be16 encap = vhdr->h_vlan_encapsulated_proto;
        const unsigned short *after_hdr;

        if (eth_proto_is_802_3(encap)) {
                skb->protocol = encap;
                return;
        }

        /*
         * This is a magic hack to spot IPX packets. Older Novell
         * breaks the protocol design and runs IPX over 802.3 without
         * an 802.2 LLC layer. We look for FFFF which isn't a used
         * 802.2 SSAP/DSAP. This won't work for fault tolerant netware
         * but does for the rest.  Everything else is real 802.2 LLC.
         */
        after_hdr = (const unsigned short *)(vhdr + 1);
        skb->protocol = (*after_hdr == 0xFFFF) ? htons(ETH_P_802_3)
                                               : htons(ETH_P_802_2);
}

/**
 * vlan_remove_tag - remove outer VLAN tag from payload
 * @skb: skbuff to remove tag from
 * @vlan_tci: buffer to store value (TCI of the removed tag, host byte order)
 *
 * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
 * pointing at the MAC header.
 *
 * Also updates skb->protocol through vlan_set_encap_proto().
 *
 * Returns: a new pointer to skb->data, or NULL on failure to pull.
 */
static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
{
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);

        /*
         * Fetch the TCI first.  NOTE(review): with the usual header sizes the
         * memmove() below overwrites the bytes holding it - confirm.
         */
        *vlan_tci = ntohs(vhdr->h_vlan_TCI);

        /* slide both MAC addresses VLAN_HLEN bytes towards the payload */
        memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
        vlan_set_encap_proto(skb, vhdr);
        return __skb_pull(skb, VLAN_HLEN);
}

/**
 * skb_vlan_tagged - check if skb is vlan tagged.
 * @skb: skbuff to query
 *
 * A hardware accelerated tag is checked first; only without one is
 * @skb->protocol examined.
 *
 * Returns: true if the skb is tagged, regardless of whether it is hardware
 * accelerated or not.
 */
static inline bool skb_vlan_tagged(const struct sk_buff *skb)
{
        if (skb_vlan_tag_present(skb))
                return true;

        if (likely(!eth_type_vlan(skb->protocol)))
                return false;

        return true;
}

/**
 * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers.
 * @skb: skbuff to query
 *
 * Returns: true if the skb is tagged with multiple vlan headers, regardless
 * of whether it is hardware accelerated or not.
 */
static inline bool skb_vlan_tagged_multi(struct sk_buff *skb)
{
        __be16 protocol = skb->protocol;

        if (!skb_vlan_tag_present(skb)) {
                struct vlan_ethhdr *veh;

                if (likely(!eth_type_vlan(protocol)))
                        return false;

                if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN)))
                        return false;

                veh = skb_vlan_eth_hdr(skb);
                protocol = veh->h_vlan_encapsulated_proto;
        }

        if (!eth_type_vlan(protocol))
                return false;

        return true;
}

/**
 * vlan_features_check - drop unsafe features for skb with multiple tags.
 * @skb: skbuff to query
 * @features: features to be checked
 *
 * Returns: @features unchanged for skbs without multiple tags, otherwise
 * @features restricted to the set that is kept for multi-tagged packets.
 */
static inline netdev_features_t vlan_features_check(struct sk_buff *skb,
                                                    netdev_features_t features)
{
        /* In the case of multi-tagged packets, use a direct mask
         * instead of using netdev_intersect_features(), to make
         * sure that only devices supporting NETIF_F_HW_CSUM will
         * have checksum offloading support.
         */
        const netdev_features_t multi_tag_mask =
                NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM |
                NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX |
                NETIF_F_HW_VLAN_STAG_TX;

        if (!skb_vlan_tagged_multi(skb))
                return features;

        return features & multi_tag_mask;
}

/**
 * compare_vlan_header - Compare two vlan headers
 * @h1: Pointer to vlan header
 * @h2: Pointer to vlan header
 *
 * Compare two vlan headers.
 *
 * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits.
 *
 * Return: 0 if equal, arbitrary non-zero value if not equal.
 */
static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1,
                                                const struct vlan_hdr *h2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        return *(u32 *)h1 ^ *(u32 *)h2;
#else
        return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) |
               ((__force u32)h1->h_vlan_encapsulated_proto ^
                (__force u32)h2->h_vlan_encapsulated_proto);
#endif
}
#endif /* !(_LINUX_IF_VLAN_H_) */
























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BLK_CGROUP_PRIVATE_H
#define _BLK_CGROUP_PRIVATE_H
/*
 * block cgroup private header
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 */

#include <linux/blk-cgroup.h>
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
#include "blk.h"

struct blkcg_gq;
struct blkg_policy_data;


/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
#define BLKG_STAT_CPU_BATCH        (INT_MAX / 2)

#ifdef CONFIG_BLK_CGROUP

/* Kinds of I/O accounted separately; used to size the arrays in struct blkg_iostat. */
enum blkg_iostat_type {
        BLKG_IOSTAT_READ,
        BLKG_IOSTAT_WRITE,
        BLKG_IOSTAT_DISCARD,

        BLKG_IOSTAT_NR,        /* number of kinds above, not a kind itself */
};

/* One set of I/O counters, with one slot per enum blkg_iostat_type value. */
struct blkg_iostat {
        u64                                bytes[BLKG_IOSTAT_NR];
        u64                                ios[BLKG_IOSTAT_NR];
};

/*
 * I/O counters plus the bookkeeping kept with them.  struct blkcg_gq embeds
 * one instance and additionally points to a percpu one.
 */
struct blkg_iostat_set {
        struct u64_stats_sync                sync;
        struct blkcg_gq                       *blkg;
        struct llist_node                lnode;
        int                                lqueued;        /* queued in llist */
        struct blkg_iostat                cur;
        /* NOTE(review): presumably the previously propagated snapshot of @cur - confirm */
        struct blkg_iostat                last;
};

/* association between a blk cgroup and a request queue */
struct blkcg_gq {
        /* Pointer to the associated request_queue */
        struct request_queue                *q;
        struct list_head                q_node;
        struct hlist_node                blkcg_node;
        struct blkcg                        *blkcg;

        /* all non-root blkcg_gq's are guaranteed to have access to parent */
        struct blkcg_gq                        *parent;

        /* reference count */
        struct percpu_ref                refcnt;

        /* is this blkg online? protected by both blkcg and q locks */
        bool                                online;

        /* I/O statistics: a percpu set and one embedded set */
        struct blkg_iostat_set __percpu        *iostat_cpu;
        struct blkg_iostat_set                iostat;

        /* per-policy data, indexed by the policy's plid (see blkg_to_pd()) */
        struct blkg_policy_data                *pd[BLKCG_MAX_POLS];
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spinlock_t                        async_bio_lock;
        struct bio_list                        async_bios;
#endif
        /* the two work items share storage */
        union {
                struct work_struct        async_bio_work;
                struct work_struct        free_work;
        };

        atomic_t                        use_delay;
        atomic64_t                        delay_nsec;
        atomic64_t                        delay_start;
        u64                                last_delay;
        int                                last_use;

        struct rcu_head                        rcu_head;
};

/*
 * Per-cgroup state of the block controller.  The cgroup_subsys_state is
 * embedded as @css, and css_to_blkcg() converts back from it.
 */
struct blkcg {
        struct cgroup_subsys_state        css;
        spinlock_t                        lock;
        refcount_t                        online_pin;
        /* If there is block congestion on this cgroup. */
        atomic_t                        congestion_count;

        /* blkgs of this blkcg, looked up by request_queue id in blkg_lookup() */
        struct radix_tree_root                blkg_tree;
        /* checked by blkg_lookup() before falling back to @blkg_tree */
        struct blkcg_gq        __rcu                *blkg_hint;
        struct hlist_head                blkg_list;

        /* per-policy data, indexed by the policy's plid (see blkcg_to_cpd()) */
        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];

        struct list_head                all_blkcgs_node;

        /*
         * List of updated percpu blkg_iostat_set's since the last flush.
         */
        struct llist_head __percpu        *lhead;

#ifdef CONFIG_BLK_CGROUP_FC_APPID
        char                            fc_app_id[FC_APPID_LEN];
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head                cgwb_list;
#endif
};

/*
 * Convert a cgroup_subsys_state pointer into the blkcg that embeds it.
 * A NULL @css is passed through as NULL.
 */
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
        if (!css)
                return NULL;

        return container_of(css, struct blkcg, css);
}

/*
 * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a
 * request_queue (q).  This is used by blkcg policies which need to track
 * information per blkcg - q pair.
 *
 * There can be multiple active blkcg policies and each blkg:policy pair is
 * represented by a blkg_policy_data which is allocated and freed by each
 * policy's pd_alloc/free_fn() methods.  A policy can allocate a private data
 * area by allocating a larger data structure which embeds blkg_policy_data
 * at the beginning.
 */
struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                        *blkg;
        int                                plid;
        bool                                online;
};

/*
 * Policies that need to keep per-blkcg data which is independent from any
 * request_queue associated to it should implement cpd_alloc/free_fn()
 * methods.  A policy can allocate a private data area by allocating a larger
 * data structure which embeds blkcg_policy_data at the beginning.
 * cpd_init() is invoked to let each policy handle per-blkcg data.
 *
 * NOTE(review): struct blkcg_policy as declared in this header has no
 * cpd_init member; the last sentence above may be stale - confirm.
 */
struct blkcg_policy_data {
        /* the blkcg and policy id this per-policy data belongs to */
        struct blkcg                        *blkcg;
        int                                plid;
};

/*
 * Function types of the policy callbacks.  struct blkcg_policy stores
 * pointers to them.  The *_cpd_* types operate on per-blkcg data
 * (struct blkcg_policy_data), the *_pd_* types on per-blkg data
 * (struct blkg_policy_data).
 *
 * NOTE(review): blkcg_pol_init_cpd_fn and blkcg_pol_bind_cpd_fn have no
 * matching member in struct blkcg_policy in this header - confirm whether
 * they are still needed.
 */
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk,
                struct blkcg *blkcg, gfp_t gfp);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
                                struct seq_file *s);

/*
 * Description of one blkcg policy: its id, its cgroup files and its
 * callbacks.  Passed to blkcg_policy_register()/blkcg_policy_unregister().
 */
struct blkcg_policy {
        /* policy id; used as the index into blkcg_gq.pd[] and blkcg.cpd[] */
        int                                plid;
        /* cgroup files for the policy */
        struct cftype                        *dfl_cftypes;
        struct cftype                        *legacy_cftypes;

        /* operations */
        /* per-blkcg data (struct blkcg_policy_data) */
        blkcg_pol_alloc_cpd_fn                *cpd_alloc_fn;
        blkcg_pol_free_cpd_fn                *cpd_free_fn;

        /* per-blkg data (struct blkg_policy_data) */
        blkcg_pol_alloc_pd_fn                *pd_alloc_fn;
        blkcg_pol_init_pd_fn                *pd_init_fn;
        blkcg_pol_online_pd_fn                *pd_online_fn;
        blkcg_pol_offline_pd_fn                *pd_offline_fn;
        blkcg_pol_free_pd_fn                *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn        *pd_reset_stats_fn;
        blkcg_pol_stat_pd_fn                *pd_stat_fn;
};

extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats;

void blkg_init_queue(struct request_queue *q);
int blkcg_init_disk(struct gendisk *disk);
void blkcg_exit_disk(struct gendisk *disk);

/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol);

const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);

/*
 * Context handed to the blkg_conf_*() helpers declared right after this
 * structure.
 * NOTE(review): field meanings are not visible from this header; @input is
 * presumably the string given to blkg_conf_init() - confirm.
 */
struct blkg_conf_ctx {
        char                                *input;
        char                                *body;
        struct block_device                *bdev;
        struct blkcg_gq                        *blkg;
};

void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
unsigned long blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx);
void blkg_conf_exit(struct blkg_conf_ctx *ctx);
void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags);

/**
 * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
 * @bio: the target &bio
 *
 * Return: true if this bio needs to be submitted with the root blkg context.
 *
 * In order to avoid priority inversions we sometimes need to issue a bio as if
 * it were attached to the root blkg, and then backcharge to the actual owning
 * blkg.  The idea is we do bio_blkcg_css() to look up the actual context for
 * the bio and attach the appropriate blkg to the bio.  Then we call this helper
 * and if it is true run with the root blkg for that queue and then do any
 * backcharging to the originating cgroup once the io is complete.
 */
static inline bool bio_issue_as_root_blkg(struct bio *bio)
{
        return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
}

/**
 * blkg_lookup - lookup blkg for the specified blkcg - q pair
 * @blkcg: blkcg of interest
 * @q: request_queue of interest
 *
 * Lookup blkg for the @blkcg - @q pair.  The root blkcg is answered from
 * @q->root_blkg; otherwise @blkcg->blkg_hint is tried first and the radix
 * tree, keyed by @q->id, second.
 *
 * Must be called in a RCU critical section.
 *
 * Return: the matching blkg, or NULL if there is none for @q.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q)
{
        struct blkcg_gq *hint, *found;

        if (blkcg == &blkcg_root)
                return q->root_blkg;

        hint = rcu_dereference_check(blkcg->blkg_hint,
                        lockdep_is_held(&q->queue_lock));
        if (hint && hint->q == q)
                return hint;

        found = radix_tree_lookup(&blkcg->blkg_tree, q->id);
        if (!found || found->q != q)
                return NULL;

        return found;
}

/**
 * blkg_to_pd - fetch the per-policy private data of a blkg
 * @blkg: blkg of interest (may be NULL)
 * @pol: policy of interest
 *
 * Return: the private data stored for the @blkg-@pol pair, indexed by
 * @pol->plid, or NULL when @blkg is NULL.
 */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol)
{
        if (!blkg)
                return NULL;

        return blkg->pd[pol->plid];
}

/*
 * Fetch the per-policy data of a blkcg: the entry of @blkcg->cpd indexed by
 * @pol->plid, or NULL when @blkcg is NULL.
 */
static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
                                                     struct blkcg_policy *pol)
{
        if (!blkcg)
                return NULL;

        return blkcg->cpd[pol->plid];
}

/**
 * pd_to_blkg - map policy private data back to its blkg
 * @pd: policy private data of interest (may be NULL)
 *
 * Return: the blkg recorded in @pd, or NULL when @pd is NULL.
 */
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
{
        if (!pd)
                return NULL;

        return pd->blkg;
}

/* Map per-blkcg policy data back to its blkcg; NULL in gives NULL out. */
static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
{
        if (!cpd)
                return NULL;

        return cpd->blkcg;
}

/**
 * blkg_get - get a blkg reference
 * @blkg: blkg to get; must not be NULL (it is dereferenced unconditionally)
 *
 * Takes one more reference on @blkg's percpu refcount.  The caller should
 * already be holding an existing reference; use blkg_tryget() when that is
 * not guaranteed.
 */
static inline void blkg_get(struct blkcg_gq *blkg)
{
        percpu_ref_get(&blkg->refcnt);
}

/**
 * blkg_tryget - try and get a blkg reference
 * @blkg: blkg to get
 *
 * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
 * of freeing this blkg, so we can only use it if the refcnt is not zero.
 */
static inline bool blkg_tryget(struct blkcg_gq *blkg)
{
        return blkg && percpu_ref_tryget(&blkg->refcnt);
}

/**
 * blkg_put - put a blkg reference
 * @blkg: blkg to put; must not be NULL (it is dereferenced unconditionally)
 *
 * Drops one reference on @blkg's percpu refcount.
 */
static inline void blkg_put(struct blkcg_gq *blkg)
{
        percpu_ref_put(&blkg->refcnt);
}

/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
 * read locked.  If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs.  The caller may
 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
 * @p_blkg is included in the iteration and the first node to be visited.
 *
 * The macro expands to a css iteration followed by an unbraced "if", so
 * descendant csses that have no blkg on @p_blkg's queue are skipped and the
 * loop body must be a single statement or a braced block.
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead.  Synchronization rules are the same.  @p_blkg is
 * included in the iteration and the last node to be visited.
 *
 * As with the pre-order variant, the expansion ends in an unbraced "if"
 * that skips csses without a blkg on @p_blkg's queue.
 */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)                \
        css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)        \
                if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css),        \
                                            (p_blkg)->q)))

/*
 * Initialise @bio's issue record (bio->bi_issue) via bio_issue_init(),
 * passing the bio's size in sectors.
 * NOTE(review): bio_issue_init() presumably also stamps the issue time -
 * confirm against its definition.
 */
static inline void blkcg_bio_issue_init(struct bio *bio)
{
        bio_issue_init(&bio->bi_issue, bio_sectors(bio));
}

/*
 * Take one "use delay" reference on @blkg.  A negative use_delay means the
 * blkg is in blkcg_set_delay() mode, which must not be mixed with this
 * interface, so warn once and bail out.  The caller that raises the count
 * from 0 to 1 also bumps the owning blkcg's congestion count.
 */
static inline void blkcg_use_delay(struct blkcg_gq *blkg)
{
        if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
                return;

        /* Only the first user marks the blkcg as congested. */
        if (atomic_add_return(1, &blkg->use_delay) != 1)
                return;

        atomic_inc(&blkg->blkcg->congestion_count);
}

/*
 * Drop one "use delay" reference from @blkg.
 *
 * Returns 1 if a reference was dropped and 0 if there was nothing to drop
 * (the count was already zero, or the blkg is in blkcg_set_delay() mode, in
 * which case a warning is emitted once).  The caller that takes the count
 * from 1 to 0 also drops the owning blkcg's congestion count.
 */
static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
{
        int old = atomic_read(&blkg->use_delay);

        if (WARN_ON_ONCE(old < 0))
                return 0;

        /*
         * A plain atomic_dec() could race with other adders/removers and
         * push the count negative.  Instead retry a compare-and-exchange of
         * "old -> old - 1"; a failed attempt refreshes @old, and we give up
         * as soon as the observed value is zero.
         */
        for (;;) {
                if (!old)
                        return 0;
                if (atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1))
                        break;
        }

        /* We removed the last delay user: the blkcg is less congested. */
        if (old == 1)
                atomic_dec(&blkg->blkcg->congestion_count);

        return 1;
}

/**
 * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
 * @blkg: target blkg
 * @delay: delay duration in nsecs
 *
 * A delay enabled through this function is not decayed; it stays in effect
 * until blkcg_clear_delay() is called.  Do not mix with blkcg_[un]use_delay()
 * and blkcg_add_delay() on the same blkg.
 */
static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
{
        int expected = atomic_read(&blkg->use_delay);

        /*
         * Only the caller that flips use_delay from 0 to -1 accounts the
         * blkg in the blkcg's congestion count, so it is counted once.
         */
        if (expected == 0 &&
            atomic_try_cmpxchg(&blkg->use_delay, &expected, -1))
                atomic_inc(&blkg->blkcg->congestion_count);

        atomic64_set(&blkg->delay_nsec, delay);
}

/**
 * blkcg_clear_delay - Disable allocator delay mechanism
 * @blkg: target blkg
 *
 * Counterpart of blkcg_set_delay(): resets use_delay to zero.
 */
static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
{
        int seen = atomic_read(&blkg->use_delay);

        if (seen == 0)
                return;

        /*
         * Only the caller whose exchange to 0 succeeds removes the blkg from
         * the blkcg's congestion count, so it is dropped once.
         */
        if (atomic_try_cmpxchg(&blkg->use_delay, &seen, 0))
                atomic_dec(&blkg->blkcg->congestion_count);
}

/**
 * blk_cgroup_mergeable - Determine whether to allow or disallow merges
 * @rq: request to merge into
 * @bio: bio to merge
 *
 * @bio and @rq should belong to the same cgroup and their issue_as_root should
 * match. The latter is necessary as we don't want to throttle e.g. a metadata
 * update because it happens to be next to a regular IO.
 */
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
{
        return rq->bio->bi_blkg == bio->bi_blkg &&
                bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
}

/*
 * Out-of-line helpers, only declared here.
 * NOTE(review): blkcg_add_delay() presumably takes @now and @delta in
 * nanoseconds, matching blkcg_set_delay() - confirm against the definition.
 */
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#else        /* CONFIG_BLK_CGROUP */

/*
 * !CONFIG_BLK_CGROUP: empty placeholder types so that code using pointers to
 * these structures still compiles when blk-cgroup support is built out.
 */
struct blkg_policy_data {
};

struct blkcg_policy_data {
};

struct blkcg_policy {
};

struct blkcg {
};

/*
 * !CONFIG_BLK_CGROUP: no-op stand-ins for the blk-cgroup API.  Lookups
 * return NULL, init/register/activate calls report success (0), and
 * blk_cgroup_mergeable() always allows the merge.
 */
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline void blkg_init_queue(struct request_queue *q) { }
static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
static inline void blkcg_exit_disk(struct gendisk *disk) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct gendisk *disk,
                                        const struct blkcg_policy *pol) { return 0; }
static inline void blkcg_deactivate_policy(struct gendisk *disk,
                                           const struct blkcg_policy *pol) { }

/* Accessors and per-bio hooks: nothing to look up or account. */
static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
                                                  struct blkcg_policy *pol) { return NULL; }
static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline void blk_cgroup_bio_start(struct bio *bio) { }
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }

/*
 * !CONFIG_BLK_CGROUP: iterate exactly once, with @rl pointing at the queue's
 * root_rl member.
 * NOTE(review): nothing else in this part of the header refers to
 * request lists or q->root_rl - verify this macro still has users and that
 * the member still exists; it may be a leftover.
 */
#define blk_queue_for_each_rl(rl, q)        \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)

#endif        /* CONFIG_BLK_CGROUP */

#endif /* _BLK_CGROUP_PRIVATE_H */






























































































  900 



  899 


  137 










  900 
























































































































































































































   25 








   25 




   19 
   25 

   25 
   25 



















   19 






















   19 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
// SPDX-License-Identifier: GPL-2.0
/*
 * Fast batching percpu counters.
 */

#include <linux/percpu_counter.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/module.h>
#include <linux/debugobjects.h>

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Global list of initialised counters.  Entries are added by
 * __percpu_counter_init_many(), removed by percpu_counter_destroy_many() and
 * walked by percpu_counter_cpu_dead(); all three hold percpu_counters_lock.
 */
static LIST_HEAD(percpu_counters);
static DEFINE_SPINLOCK(percpu_counters_lock);
#endif

#ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER

static const struct debug_obj_descr percpu_counter_debug_descr;

/*
 * debugobjects fixup_free callback: invoked with the tracked object's
 * address and state.  A counter that is still active is destroyed and its
 * debug object released; any other state is left alone.
 *
 * Returns true if a fixup was performed, false otherwise.
 */
static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state)
{
        struct percpu_counter *fbc = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        percpu_counter_destroy(fbc);
        debug_object_free(fbc, &percpu_counter_debug_descr);
        return true;
}

/*
 * debugobjects descriptor for percpu counters; only the fixup_free hook is
 * provided.
 */
static const struct debug_obj_descr percpu_counter_debug_descr = {
        .name                = "percpu_counter",
        .fixup_free        = percpu_counter_fixup_free,
};

/* Register @fbc with debugobjects and mark it active (init, then activate). */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{
        debug_object_init(fbc, &percpu_counter_debug_descr);
        debug_object_activate(fbc, &percpu_counter_debug_descr);
}

/* Mark @fbc inactive in debugobjects, then drop its tracking entry. */
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{
        debug_object_deactivate(fbc, &percpu_counter_debug_descr);
        debug_object_free(fbc, &percpu_counter_debug_descr);
}

#else        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */
/* Object debugging disabled: the activate/deactivate hooks do nothing. */
static inline void debug_percpu_counter_activate(struct percpu_counter *fbc)
{ }
static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc)
{ }
#endif        /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */

/*
 * Set the counter to @amount: with the counter lock held and interrupts
 * disabled, clear the per-CPU delta of every possible CPU and store @amount
 * in the global count.
 */
void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
        unsigned long irqflags;
        int cpu;

        raw_spin_lock_irqsave(&fbc->lock, irqflags);
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(fbc->counters, cpu) = 0;
        fbc->count = amount;
        raw_spin_unlock_irqrestore(&fbc->lock, irqflags);
}
EXPORT_SYMBOL(percpu_counter_set);

/*
 * Add to a counter while respecting batch size.
 *
 * There are 2 implementations, both dealing with the following problem:
 *
 * The decision slow path/fast path and the actual update must be atomic.
 * Otherwise a call in process context could check the current values and
 * decide that the fast path can be used. If now an interrupt occurs before
 * the this_cpu_add(), and the interrupt updates this_cpu(*fbc->counters),
 * then the this_cpu_add() that is executed after the interrupt has completed
 * can produce values larger than "batch" or even overflows.
 */
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
/*
 * Safety against interrupts is achieved in 2 ways:
 * 1. the fast path uses local cmpxchg (note: no lock prefix)
 * 2. the slow path operates with interrupts disabled
 *
 * @fbc:    counter to update
 * @amount: signed value to add
 * @batch:  once the magnitude of this CPU's delta plus @amount reaches
 *          @batch, the delta is folded into the global count under the lock
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        s64 count;
        unsigned long flags;

        count = this_cpu_read(*fbc->counters);
        do {
                /* Slow path: the new local value would reach the batch size. */
                if (unlikely(abs(count + amount) >= batch)) {
                        raw_spin_lock_irqsave(&fbc->lock, flags);
                        /*
                         * Note: by now we might have migrated to another CPU
                         * or the value might have changed.
                         */
                        count = __this_cpu_read(*fbc->counters);
                        /* Fold local delta and @amount into the global count... */
                        fbc->count += count + amount;
                        /* ...and remove exactly what was folded from the local delta. */
                        __this_cpu_sub(*fbc->counters, count);
                        raw_spin_unlock_irqrestore(&fbc->lock, flags);
                        return;
                }
                /*
                 * Fast path: publish count + amount only if the local delta is
                 * still @count; on failure @count is refreshed and the batch
                 * check above is redone.
                 */
        } while (!this_cpu_try_cmpxchg(*fbc->counters, &count, count + amount));
}
#else
/*
 * Fallback without a local cmpxchg: the whole "decide, then update" sequence
 * runs under local_irq_save() so an interrupt cannot slip in between.
 * - The slow path on its own would be fine, being protected by an irq-safe
 *   spinlock.
 * - this_cpu_add on its own would be fine, being irq-safe by definition.
 */
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
{
        unsigned long flags;
        s64 local;
        s64 total;

        local_irq_save(flags);
        local = __this_cpu_read(*fbc->counters);
        total = local + amount;
        if (abs(total) < batch) {
                /* Still within the batch: stay on the per-CPU delta. */
                this_cpu_add(*fbc->counters, amount);
        } else {
                /* Fold the local delta plus @amount into the global count. */
                raw_spin_lock(&fbc->lock);
                fbc->count += total;
                __this_cpu_sub(*fbc->counters, local);
                raw_spin_unlock(&fbc->lock);
        }
        local_irq_restore(flags);
}
#endif
EXPORT_SYMBOL(percpu_counter_add_batch);

/*
 * With a big batch the deviation of a percpu_counter's global count can be
 * large.  When that deviation has to be reduced, for instance because the
 * batch is lowered at runtime to gain accuracy, run this function on each
 * CPU: it folds the calling CPU's delta into the global count.
 */
void percpu_counter_sync(struct percpu_counter *fbc)
{
        unsigned long irqflags;
        s64 delta;

        raw_spin_lock_irqsave(&fbc->lock, irqflags);
        delta = __this_cpu_read(*fbc->counters);
        __this_cpu_sub(*fbc->counters, delta);
        fbc->count += delta;
        raw_spin_unlock_irqrestore(&fbc->lock, irqflags);
}
EXPORT_SYMBOL(percpu_counter_sync);

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive().
 *
 * We use the cpu mask of (cpu_online_mask | cpu_dying_mask) to capture sums
 * from CPUs that are in the process of being taken offline. Dying cpus have
 * been removed from the online mask, but may not have had the hotplug dead
 * notifier called to fold the percpu count back into the global counter sum.
 * By including dying CPUs in the iteration mask, we avoid this race condition
 * so __percpu_counter_sum() just does the right thing when CPUs are being taken
 * offline.
 */
s64 __percpu_counter_sum(struct percpu_counter *fbc)
{
        s64 ret;
        int cpu;
        unsigned long flags;

        raw_spin_lock_irqsave(&fbc->lock, flags);
        ret = fbc->count;
        for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
        raw_spin_unlock_irqrestore(&fbc->lock, flags);
        return ret;
}
EXPORT_SYMBOL(__percpu_counter_sum);

/*
 * Initialise @nr_counters counters stored contiguously at @fbc.
 *
 * @fbc:         array of counters to initialise
 * @amount:      initial global count given to every counter
 * @gfp:         allocation flags for the per-CPU storage
 * @nr_counters: number of elements in @fbc
 * @key:         lockdep class applied to every counter's lock
 *
 * All counters share ONE per-CPU allocation; counter i uses the slot at
 * offset i * counter_size.  percpu_counter_destroy_many() therefore frees
 * only fbc[0].counters.
 *
 * Returns 0 on success.  On allocation failure returns -ENOMEM with
 * fbc[0].counters set to NULL, which percpu_counter_destroy_many() treats
 * as "nothing to tear down".
 */
int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
                               gfp_t gfp, u32 nr_counters,
                               struct lock_class_key *key)
{
        unsigned long flags __maybe_unused;
        size_t counter_size;
        s32 __percpu *counters;
        u32 i;

        /* Per-counter stride inside the shared per-CPU allocation. */
        counter_size = ALIGN(sizeof(*counters), __alignof__(*counters));
        counters = __alloc_percpu_gfp(nr_counters * counter_size,
                                      __alignof__(*counters), gfp);
        if (!counters) {
                fbc[0].counters = NULL;
                return -ENOMEM;
        }

        for (i = 0; i < nr_counters; i++) {
                raw_spin_lock_init(&fbc[i].lock);
                lockdep_set_class(&fbc[i].lock, key);
#ifdef CONFIG_HOTPLUG_CPU
                INIT_LIST_HEAD(&fbc[i].list);
#endif
                fbc[i].count = amount;
                /* Carve this counter's slot out of the shared allocation. */
                fbc[i].counters = (void __percpu *)counters + i * counter_size;

                debug_percpu_counter_activate(&fbc[i]);
        }

#ifdef CONFIG_HOTPLUG_CPU
        /* Make the counters visible to the CPU-dead callback. */
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (i = 0; i < nr_counters; i++)
                list_add(&fbc[i].list, &percpu_counters);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif
        return 0;
}
EXPORT_SYMBOL(__percpu_counter_init_many);

/*
 * Tear down @nr_counters counters that were set up together by
 * __percpu_counter_init_many().
 *
 * A NULL @fbc triggers a one-time warning and is ignored.  If the first
 * counter has no per-CPU storage (initialisation failed, or the array was
 * already destroyed) the call is a no-op.  Otherwise the counters are
 * dropped from debugobjects and from the hotplug list, the shared per-CPU
 * allocation (owned by fbc[0]) is freed, and every ->counters pointer is
 * cleared.
 */
void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters)
{
        unsigned long flags __maybe_unused;
        u32 idx;

        if (WARN_ON_ONCE(!fbc) || !fbc[0].counters)
                return;

        for (idx = 0; idx < nr_counters; idx++)
                debug_percpu_counter_deactivate(&fbc[idx]);

#ifdef CONFIG_HOTPLUG_CPU
        spin_lock_irqsave(&percpu_counters_lock, flags);
        for (idx = 0; idx < nr_counters; idx++)
                list_del(&fbc[idx].list);
        spin_unlock_irqrestore(&percpu_counters_lock, flags);
#endif

        /* One allocation backs the whole array; it hangs off fbc[0]. */
        free_percpu(fbc[0].counters);

        for (idx = 0; idx < nr_counters; idx++) {
                struct percpu_counter *c = &fbc[idx];

                c->counters = NULL;
        }
}
EXPORT_SYMBOL(percpu_counter_destroy_many);

/*
 * Default batch size.  Starts at 32 and is recomputed by
 * compute_batch_value() as max(32, 2 * number of online CPUs).
 */
int percpu_counter_batch __read_mostly = 32;
EXPORT_SYMBOL(percpu_counter_batch);

/*
 * CPU hotplug callback: recompute percpu_counter_batch as twice the number
 * of online CPUs, but never less than 32.  @cpu is unused.  Always returns 0.
 */
static int compute_batch_value(unsigned int cpu)
{
        int nr = num_online_cpus();
        int scaled = nr * 2;

        percpu_counter_batch = scaled > 32 ? scaled : 32;
        return 0;
}

/*
 * CPU hotplug "dead" callback.  With CONFIG_HOTPLUG_CPU it refreshes the
 * default batch size and then, for every registered counter, moves the dead
 * CPU's delta into the global count and zeroes the per-CPU slot.  Without
 * hotplug support it does nothing.  Always returns 0.
 */
static int percpu_counter_cpu_dead(unsigned int cpu)
{
#ifdef CONFIG_HOTPLUG_CPU
        struct percpu_counter *pos;

        compute_batch_value(cpu);

        spin_lock_irq(&percpu_counters_lock);
        list_for_each_entry(pos, &percpu_counters, list) {
                s32 *slot;

                raw_spin_lock(&pos->lock);
                slot = per_cpu_ptr(pos->counters, cpu);
                pos->count += *slot;
                *slot = 0;
                raw_spin_unlock(&pos->lock);
        }
        spin_unlock_irq(&percpu_counters_lock);
#endif
        return 0;
}

/*
 * Compare counter against given value.
 * Return 1 if greater, 0 if equal and -1 if less
 *
 * @fbc:   counter to compare
 * @rhs:   value to compare against
 * @batch: batch size in use for @fbc; together with the number of online
 *         CPUs it bounds how far the rough count is assumed to be from the
 *         precise sum
 *
 * The cheap percpu_counter_read() value is used when it differs from @rhs by
 * more than that bound; otherwise the precise (and slow) percpu_counter_sum()
 * is taken.
 */
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
        s64 count;
        s64 slack;

        count = percpu_counter_read(fbc);

        /*
         * Widen before multiplying: "batch * num_online_cpus()" is otherwise
         * evaluated in 32 bits and can wrap for large batches, shrinking the
         * bound and making the rough count look trustworthy when it is not.
         */
        slack = (s64)batch * num_online_cpus();

        /* Check to see if rough count will be sufficient for comparison */
        if (abs(count - rhs) > slack) {
                if (count > rhs)
                        return 1;
                else
                        return -1;
        }
        /* Need to use precise count */
        count = percpu_counter_sum(fbc);
        if (count > rhs)
                return 1;
        else if (count < rhs)
                return -1;
        else
                return 0;
}
EXPORT_SYMBOL(__percpu_counter_compare);

/*
 * Compare counter, and add amount if total is: less than or equal to limit if
 * amount is positive, or greater than or equal to limit if amount is negative.
 * Return true if amount is added, or false if total would be beyond the limit.
 *
 * Negative limit is allowed, but unusual.
 * When negative amounts (subs) are given to percpu_counter_limited_add(),
 * the limit would most naturally be 0 - but other limits are also allowed.
 *
 * Overflow beyond S64_MAX is not allowed for: counter, limit and amount
 * are all assumed to be sane (far from S64_MIN and S64_MAX).
 *
 * The function works in three stages of increasing cost:
 *  1. lockless: if the per-CPU delta stays within @batch and the global
 *     count is further than "unknown" from @limit, add to the per-CPU delta;
 *  2. under the lock: decide from the global count alone when it is further
 *     than "unknown" from @limit in either direction;
 *  3. under the lock: otherwise sum the per-CPU deltas for a precise answer.
 * "unknown" is @batch times the number of online CPUs, the assumed bound on
 * what the per-CPU deltas can hide.
 */
bool __percpu_counter_limited_add(struct percpu_counter *fbc,
                                  s64 limit, s64 amount, s32 batch)
{
        s64 count;
        s64 unknown;
        unsigned long flags;
        bool good = false;

        if (amount == 0)
                return true;

        local_irq_save(flags);
        /*
         * Widen before multiplying: the product is otherwise computed in
         * 32 bits and can wrap for large batches before being stored in the
         * 64-bit "unknown", which would understate the uncertainty.
         */
        unknown = (s64)batch * num_online_cpus();
        count = __this_cpu_read(*fbc->counters);

        /* Skip taking the lock when safe */
        if (abs(count + amount) <= batch &&
            ((amount > 0 && fbc->count + unknown <= limit) ||
             (amount < 0 && fbc->count - unknown >= limit))) {
                this_cpu_add(*fbc->counters, amount);
                local_irq_restore(flags);
                return true;
        }

        raw_spin_lock(&fbc->lock);
        count = fbc->count + amount;

        /* Skip percpu_counter_sum() when safe */
        if (amount > 0) {
                if (count - unknown > limit)
                        goto out;
                if (count + unknown <= limit)
                        good = true;
        } else {
                if (count + unknown < limit)
                        goto out;
                if (count - unknown >= limit)
                        good = true;
        }

        if (!good) {
                s32 *pcount;
                int cpu;

                /* Too close to call: add in every per-CPU delta. */
                for_each_cpu_or(cpu, cpu_online_mask, cpu_dying_mask) {
                        pcount = per_cpu_ptr(fbc->counters, cpu);
                        count += *pcount;
                }
                if (amount > 0) {
                        if (count > limit)
                                goto out;
                } else {
                        if (count < limit)
                                goto out;
                }
                good = true;
        }

        /* Accepted: fold this CPU's delta and @amount into the global count. */
        count = __this_cpu_read(*fbc->counters);
        fbc->count += count + amount;
        __this_cpu_sub(*fbc->counters, count);
out:
        raw_spin_unlock(&fbc->lock);
        local_irq_restore(flags);
        return good;
}

/*
 * Register the CPU hotplug callbacks: compute_batch_value() as the startup
 * callback of a dynamically allocated online state, and
 * percpu_counter_cpu_dead() as the teardown callback of
 * CPUHP_PERCPU_CNT_DEAD.  A registration failure only triggers a warning;
 * the function returns 0 regardless.
 */
static int __init percpu_counter_startup(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online",
                                compute_batch_value, NULL);
        WARN_ON(ret < 0);
        /* The "_nocalls" variant is used here, unlike the online state above. */
        ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD,
                                        "lib/percpu_cnt:dead", NULL,
                                        percpu_counter_cpu_dead);
        WARN_ON(ret < 0);
        return 0;
}
module_init(percpu_counter_startup);

































    6 





    6 
















































    6 





    6 































































































  179 


  179 

































































































































































































































































































  179 





  179 




























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the multi-level security (MLS) policy.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *          Support for enhanced MLS infrastructure.
 *          Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *          Added support to import/export the MLS label from NetLabel
 *          Copyright (C) Hewlett-Packard Development Company, L.P., 2006
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <net/netlabel.h>
#include "sidtab.h"
#include "mls.h"
#include "policydb.h"
#include "services.h"

/*
 * Return the length in bytes for the MLS fields of the
 * security context string representation of `context'.
 *
 * Returns 0 when MLS is disabled in the policy.  The computation mirrors,
 * character for character, what mls_sid_to_context() writes: every emitted
 * name is accounted together with the one-character delimiter preceding it.
 * The terminating NUL is not included.
 */
int mls_compute_context_len(struct policydb *p, struct context *context)
{
        int i, l, len, head, prev;
        char *nm;
        struct ebitmap *e;
        struct ebitmap_node *node;

        if (!p->mls_enabled)
                return 0;

        len = 1; /* for the beginning ":" */
        for (l = 0; l < 2; l++) {
                u32 index_sens = context->range.level[l].sens;
                len += strlen(sym_name(p, SYM_LEVELS, index_sens - 1));

                /* categories */
                /*
                 * head is the first bit of the current run of consecutive
                 * categories, prev the most recent set bit; -2 means "none
                 * yet" and guarantees i - prev > 1 for the first bit.
                 */
                head = -2;
                prev = -2;
                e = &context->range.level[l].cat;
                ebitmap_for_each_positive_bit(e, node, i)
                {
                        if (i - prev > 1) {
                                /* one or more negative bits are skipped */
                                if (head != prev) {
                                        /* close the previous run: delimiter + last name */
                                        nm = sym_name(p, SYM_CATS, prev);
                                        len += strlen(nm) + 1;
                                }
                                /* start a new run: delimiter + first name */
                                nm = sym_name(p, SYM_CATS, i);
                                len += strlen(nm) + 1;
                                head = i;
                        }
                        prev = i;
                }
                /* close a run still open at the end of the bitmap */
                if (prev != head) {
                        nm = sym_name(p, SYM_CATS, prev);
                        len += strlen(nm) + 1;
                }
                if (l == 0) {
                        /* identical low/high levels are printed only once */
                        if (mls_level_eq(&context->range.level[0],
                                         &context->range.level[1]))
                                break;
                        else
                                len++; /* for the "-" between the two levels */
                }
        }

        return len;
}

/*
 * Write the security context string representation of
 * the MLS fields of `context' into the string `*scontext'.
 * Update `*scontext' to point to the end of the MLS fields.
 *
 * Does nothing when MLS is disabled in the policy.  The output has the form
 * ":sens[:cats][-sens[:cats]]"; the second level is omitted when both levels
 * are equal.  No bounds checking is done here.
 * NOTE(review): the caller is presumably expected to have sized the buffer
 * with mls_compute_context_len() - confirm at the call sites.
 */
void mls_sid_to_context(struct policydb *p, struct context *context,
                        char **scontext)
{
        char *scontextp, *nm;
        int i, l, head, prev;
        struct ebitmap *e;
        struct ebitmap_node *node;

        if (!p->mls_enabled)
                return;

        scontextp = *scontext;

        *scontextp = ':';
        scontextp++;

        for (l = 0; l < 2; l++) {
                strcpy(scontextp, sym_name(p, SYM_LEVELS,
                                           context->range.level[l].sens - 1));
                scontextp += strlen(scontextp);

                /* categories */
                /*
                 * head is the first bit of the current run of consecutive
                 * categories, prev the most recent set bit; -2 means "none
                 * yet".
                 */
                head = -2;
                prev = -2;
                e = &context->range.level[l].cat;
                ebitmap_for_each_positive_bit(e, node, i)
                {
                        if (i - prev > 1) {
                                /* one or more negative bits are skipped */
                                if (prev != head) {
                                        /*
                                         * Close the previous run: '.' when
                                         * the run's ends are more than one
                                         * apart, ',' when they are adjacent.
                                         */
                                        if (prev - head > 1)
                                                *scontextp++ = '.';
                                        else
                                                *scontextp++ = ',';
                                        nm = sym_name(p, SYM_CATS, prev);
                                        strcpy(scontextp, nm);
                                        scontextp += strlen(nm);
                                }
                                /* ':' introduces the first category, ',' later ones */
                                if (prev < 0)
                                        *scontextp++ = ':';
                                else
                                        *scontextp++ = ',';
                                nm = sym_name(p, SYM_CATS, i);
                                strcpy(scontextp, nm);
                                scontextp += strlen(nm);
                                head = i;
                        }
                        prev = i;
                }

                /* close a run still open at the end of the bitmap */
                if (prev != head) {
                        if (prev - head > 1)
                                *scontextp++ = '.';
                        else
                                *scontextp++ = ',';
                        nm = sym_name(p, SYM_CATS, prev);
                        strcpy(scontextp, nm);
                        scontextp += strlen(nm);
                }

                if (l == 0) {
                        /* identical low/high levels are printed only once */
                        if (mls_level_eq(&context->range.level[0],
                                         &context->range.level[1]))
                                break;
                        else
                                *scontextp++ = '-';
                }
        }

        *scontext = scontextp;
}

int mls_level_isvalid(struct policydb *p, struct mls_level *l)
{
        struct level_datum *levdatum;

        if (!l->sens || l->sens > p->p_levels.nprim)
                return 0;
        levdatum = symtab_search(&p->p_levels,
                                 sym_name(p, SYM_LEVELS, l->sens - 1));
        if (!levdatum)
                return 0;

        /*
         * Return 1 iff all the bits set in l->cat are also be set in
         * levdatum->level->cat and no bit in l->cat is larger than
         * p->p_cats.nprim.
         */
        return ebitmap_contains(&levdatum->level.cat, &l->cat,
                                p->p_cats.nprim);
}

/*
 * Check the MLS range `r' against the policy `p'.
 *
 * Returns 1 when both levels of the range are valid and level[1]
 * dominates level[0]; returns 0 otherwise.
 */
int mls_range_isvalid(struct policydb *p, struct mls_range *r)
{
        if (!mls_level_isvalid(p, &r->level[0]))
                return 0;

        if (!mls_level_isvalid(p, &r->level[1]))
                return 0;

        return mls_level_dom(&r->level[1], &r->level[0]) ? 1 : 0;
}

/*
 * Validate the MLS fields of the security context `c' against the
 * policy `p'.
 *
 * Returns 1 if the fields are valid (or if MLS is disabled in the
 * policy) and 0 otherwise.
 */
int mls_context_isvalid(struct policydb *p, struct context *c)
{
        struct user_datum *usrdatum;

        /* Nothing to validate when the policy has MLS disabled. */
        if (!p->mls_enabled)
                return 1;

        if (!mls_range_isvalid(p, &c->range))
                return 0;

        /* Contexts with the object_r role skip the per-user range check. */
        if (c->role == OBJECT_R_VAL)
                return 1;

        /* The user value is 1-based and must be within the policy. */
        if (c->user == 0 || c->user > p->p_users.nprim)
                return 0;

        /*
         * User must be authorized for the MLS range; otherwise the
         * user may not be associated with it.
         */
        usrdatum = p->user_val_to_struct[c->user - 1];

        return mls_range_contains(usrdatum->range, c->range) ? 1 : 0;
}

/*
 * Set the MLS fields in the security context structure
 * `context' based on the string representation in
 * the string `scontext'.
 *
 * This function modifies the string in place, inserting
 * NUL characters to terminate the MLS fields.
 *
 * If a def_sid is provided and no MLS field is present,
 * copy the MLS field of the associated default context.
 * Used for upgraded to MLS systems where objects may lack
 * MLS fields.
 *
 * `oldc' is zero when the context string carries no MLS component and
 * non-zero when one is present.
 *
 * Returns 0 on success, -EINVAL when the string cannot be mapped to
 * the policy, or the error returned by an ebitmap or context copy
 * helper.  Error paths return immediately; this function does not
 * release any category bits it has already set in `context'.
 *
 * Policy read-lock must be held for sidtab lookup.
 *
 */
int mls_context_to_sid(struct policydb *pol, char oldc, char *scontext,
                       struct context *context, struct sidtab *s, u32 def_sid)
{
        char *sensitivity, *cur_cat, *next_cat, *rngptr;
        struct level_datum *levdatum;
        struct cat_datum *catdatum, *rngdatum;
        u32 i;
        int l, rc;
        char *rangep[2];

        if (!pol->mls_enabled) {
                /*
                 * With no MLS, only return -EINVAL if there is a MLS field
                 * and it did not come from an xattr.
                 */
                if (oldc && def_sid == SECSID_NULL)
                        return -EINVAL;
                return 0;
        }

        /*
         * No MLS component to the security context, try and map to
         * default if provided.
         */
        if (!oldc) {
                struct context *defcon;

                if (def_sid == SECSID_NULL)
                        return -EINVAL;

                defcon = sidtab_search(s, def_sid);
                if (!defcon)
                        return -EINVAL;

                return mls_context_cpy(context, defcon);
        }

        /*
         * If we're dealing with a range, figure out where the two parts
         * of the range begin.
         */
        rangep[0] = scontext;
        rangep[1] = strchr(scontext, '-');
        if (rangep[1]) {
                /* Split "low-high" in place; rangep[1] now points at "high". */
                rangep[1][0] = '\0';
                rangep[1]++;
        }

        /* For each part of the range: */
        for (l = 0; l < 2; l++) {
                /* Split sensitivity and category set. */
                sensitivity = rangep[l];
                /* rangep[1] is NULL when the string held a single level. */
                if (sensitivity == NULL)
                        break;
                next_cat = strchr(sensitivity, ':');
                if (next_cat)
                        *(next_cat++) = '\0';

                /* Parse sensitivity. */
                levdatum = symtab_search(&pol->p_levels, sensitivity);
                if (!levdatum)
                        return -EINVAL;
                context->range.level[l].sens = levdatum->level.sens;

                /* Extract category set. */
                while (next_cat != NULL) {
                        /* Take the next comma-separated item and terminate it. */
                        cur_cat = next_cat;
                        next_cat = strchr(next_cat, ',');
                        if (next_cat != NULL)
                                *(next_cat++) = '\0';

                        /* Separate into range if exists */
                        rngptr = strchr(cur_cat, '.');
                        if (rngptr != NULL) {
                                /* Remove '.' */
                                *rngptr++ = '\0';
                        }

                        catdatum = symtab_search(&pol->p_cats, cur_cat);
                        if (!catdatum)
                                return -EINVAL;

                        /* Category values are 1-based; bitmap bits are 0-based. */
                        rc = ebitmap_set_bit(&context->range.level[l].cat,
                                             catdatum->value - 1, 1);
                        if (rc)
                                return rc;

                        /* If range, set all categories in range */
                        if (rngptr == NULL)
                                continue;

                        rngdatum = symtab_search(&pol->p_cats, rngptr);
                        if (!rngdatum)
                                return -EINVAL;

                        /* The end of a category range must be above its start. */
                        if (catdatum->value >= rngdatum->value)
                                return -EINVAL;

                        /*
                         * The first category's bit (value - 1) is already
                         * set above, so start at bit `value' and stop
                         * after the bit of the last category in the range.
                         */
                        for (i = catdatum->value; i < rngdatum->value; i++) {
                                rc = ebitmap_set_bit(
                                        &context->range.level[l].cat, i, 1);
                                if (rc)
                                        return rc;
                        }
                }
        }

        /* If we didn't see a '-', the range start is also the range end. */
        if (rangep[1] == NULL) {
                context->range.level[1].sens = context->range.level[0].sens;
                rc = ebitmap_cpy(&context->range.level[1].cat,
                                 &context->range.level[0].cat);
                if (rc)
                        return rc;
        }

        return 0;
}

/*
 * Parse the MLS string `str' into the MLS fields of `context'.
 *
 * mls_context_to_sid() edits its input in place, so the parsing is done
 * on a temporary copy of `str' allocated with `gfp_mask' and freed
 * before returning.
 *
 * Returns -EINVAL if MLS is disabled in the policy, -ENOMEM if the copy
 * cannot be allocated, and the result of mls_context_to_sid() otherwise.
 */
int mls_from_string(struct policydb *p, char *str, struct context *context,
                    gfp_t gfp_mask)
{
        char *copy;
        int err;

        if (!p->mls_enabled)
                return -EINVAL;

        copy = kstrdup(str, gfp_mask);
        if (copy == NULL)
                return -ENOMEM;

        err = mls_context_to_sid(p, ':', copy, context, NULL, SECSID_NULL);
        kfree(copy);

        return err;
}

/*
 * Copies the MLS range `range' into `context'.
 */
int mls_range_set(struct context *context, struct mls_range *range)
{
        int l, rc = 0;

        /* Copy the MLS range into the  context */
        for (l = 0; l < 2; l++) {
                context->range.level[l].sens = range->level[l].sens;
                rc = ebitmap_cpy(&context->range.level[l].cat,
                                 &range->level[l].cat);
                if (rc)
                        break;
        }

        return rc;
}

/*
 * Compute the MLS range of `usercon' for `user' given the context
 * `fromcon'.
 *
 * Does nothing and returns 0 when MLS is disabled in the policy.
 * Returns -EINVAL when no suitable low level or clearance can be chosen.
 * Levels are assigned by structure copy.  The low level of `usercon' may
 * already have been written when the clearance step fails.
 */
int mls_setup_user_range(struct policydb *p, struct context *fromcon,
                         struct user_datum *user, struct context *usercon)
{
        struct mls_level *from_low, *from_high;
        struct mls_level *usr_low, *usr_high, *usr_dflt;
        struct mls_level *out_low, *out_high;

        if (!p->mls_enabled)
                return 0;

        from_low = &fromcon->range.level[0];
        from_high = &fromcon->range.level[1];
        usr_low = &user->range.level[0];
        usr_high = &user->range.level[1];
        usr_dflt = &user->dfltlevel;
        out_low = &usercon->range.level[0];
        out_high = &usercon->range.level[1];

        /* Honor the user's default level if we can */
        if (mls_level_between(usr_dflt, from_low, from_high)) {
                *out_low = *usr_dflt;
        } else if (mls_level_between(from_low, usr_dflt, usr_high)) {
                *out_low = *from_low;
        } else if (mls_level_between(from_high, usr_low, usr_dflt)) {
                *out_low = *usr_low;
        } else {
                return -EINVAL;
        }

        /*
         * Lower the clearance of available contexts if the clearance
         * of "fromcon" is lower than that of the user's default
         * clearance (but only if the "fromcon" clearance dominates
         * the user's computed sensitivity level).
         */
        if (mls_level_dom(usr_high, from_high)) {
                *out_high = *from_high;
        } else if (mls_level_dom(from_high, usr_high)) {
                *out_high = *usr_high;
        } else {
                return -EINVAL;
        }

        return 0;
}

/*
 * Convert the MLS fields in the security context
 * structure `oldc' from the values specified in the
 * policy `oldp' to the values specified in the policy `newp',
 * storing the resulting context in `newc'.
 */
int mls_convert_context(struct policydb *oldp, struct policydb *newp,
                        struct context *oldc, struct context *newc)
{
        struct level_datum *levdatum;
        struct cat_datum *catdatum;
        struct ebitmap_node *node;
        u32 i;
        int l;

        if (!oldp->mls_enabled || !newp->mls_enabled)
                return 0;

        for (l = 0; l < 2; l++) {
                char *name = sym_name(oldp, SYM_LEVELS,
                                      oldc->range.level[l].sens - 1);

                levdatum = symtab_search(&newp->p_levels, name);

                if (!levdatum)
                        return -EINVAL;
                newc->range.level[l].sens = levdatum->level.sens;

                ebitmap_for_each_positive_bit(&oldc->range.level[l].cat, node,
                                              i)
                {
                        int rc;

                        catdatum = symtab_search(&newp->p_cats,
                                                 sym_name(oldp, SYM_CATS, i));
                        if (!catdatum)
                                return -EINVAL;
                        rc = ebitmap_set_bit(&newc->range.level[l].cat,
                                             catdatum->value - 1, 1);
                        if (rc)
                                return rc;
                }
        }

        return 0;
}

/*
 * Fill in the MLS fields of `newcontext' for an object of class `tclass'
 * derived from `scontext' and `tcontext'.
 *
 * `specified' selects the kind of computation:
 *   AVTAB_TRANSITION - use a matching range transition rule if one
 *                      exists, else the class's default_range setting
 *                      if it names a known default, else behave like
 *                      AVTAB_CHANGE;
 *   AVTAB_CHANGE     - copy the full source range for the process class
 *                      or when `sock' is set, else the source low level;
 *   AVTAB_MEMBER     - copy the source low level.
 *
 * Returns 0 when MLS is disabled, -EINVAL for any other `specified'
 * value, and otherwise the result of the copy helper that was used.
 */
int mls_compute_sid(struct policydb *p, struct context *scontext,
                    struct context *tcontext, u16 tclass, u32 specified,
                    struct context *newcontext, bool sock)
{
        struct range_trans key;
        struct mls_range *range;
        struct class_datum *cls;
        char dflt = 0;

        if (!p->mls_enabled)
                return 0;

        switch (specified) {
        case AVTAB_MEMBER:
                /* Use the process effective MLS attributes. */
                return mls_context_cpy_low(newcontext, scontext);
        case AVTAB_TRANSITION:
        case AVTAB_CHANGE:
                break;
        default:
                return -EINVAL;
        }

        if (specified == AVTAB_TRANSITION) {
                /* Look for a range transition rule. */
                key.source_type = scontext->type;
                key.target_type = tcontext->type;
                key.target_class = tclass;
                range = policydb_rangetr_search(p, &key);
                if (range)
                        return mls_range_set(newcontext, range);

                /* No rule: consult the class's default_range, if any. */
                if (tclass && tclass <= p->p_classes.nprim) {
                        cls = p->class_val_to_struct[tclass - 1];
                        if (cls)
                                dflt = cls->default_range;
                }

                switch (dflt) {
                case DEFAULT_SOURCE_LOW:
                        return mls_context_cpy_low(newcontext, scontext);
                case DEFAULT_SOURCE_HIGH:
                        return mls_context_cpy_high(newcontext, scontext);
                case DEFAULT_SOURCE_LOW_HIGH:
                        return mls_context_cpy(newcontext, scontext);
                case DEFAULT_TARGET_LOW:
                        return mls_context_cpy_low(newcontext, tcontext);
                case DEFAULT_TARGET_HIGH:
                        return mls_context_cpy_high(newcontext, tcontext);
                case DEFAULT_TARGET_LOW_HIGH:
                        return mls_context_cpy(newcontext, tcontext);
                case DEFAULT_GLBLUB:
                        return mls_context_glblub(newcontext, scontext,
                                                  tcontext);
                }
                /* No usable default: handle like AVTAB_CHANGE below. */
        }

        if (tclass == p->process_class || sock) {
                /* Use the process MLS attributes. */
                return mls_context_cpy(newcontext, scontext);
        }

        /* Use the process effective MLS attributes. */
        return mls_context_cpy_low(newcontext, scontext);
}

#ifdef CONFIG_NETLABEL
/**
 * mls_export_netlbl_lvl - Export the MLS sensitivity levels to NetLabel
 * @p: the policy
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Store the low sensitivity of @context, converted from SELinux's 1-based
 * value to a 0-based value, in the NetLabel MLS level field and flag the
 * field as present.  Does nothing when MLS is disabled in the policy.
 *
 */
void mls_export_netlbl_lvl(struct policydb *p, struct context *context,
                           struct netlbl_lsm_secattr *secattr)
{
        if (p->mls_enabled) {
                secattr->attr.mls.lvl = context->range.level[0].sens - 1;
                secattr->flags |= NETLBL_SECATTR_MLS_LVL;
        }
}

/**
 * mls_import_netlbl_lvl - Import the NetLabel MLS sensitivity levels
 * @p: the policy
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Given the security context and the NetLabel security attributes, copy the
 * NetLabel MLS sensitivity level into the context.
 *
 */
void mls_import_netlbl_lvl(struct policydb *p, struct context *context,
                           struct netlbl_lsm_secattr *secattr)
{
        if (!p->mls_enabled)
                return;

        context->range.level[0].sens = secattr->attr.mls.lvl + 1;
        context->range.level[1].sens = context->range.level[0].sens;
}

/**
 * mls_export_netlbl_cat - Export the MLS categories to NetLabel
 * @p: the policy
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Export the low-level category bitmap of @context into the NetLabel MLS
 * category field.  The category flag is only raised when the export
 * succeeded and produced a non-NULL category map.  Returns zero on success
 * (and when MLS is disabled), negative values on failure.
 *
 */
int mls_export_netlbl_cat(struct policydb *p, struct context *context,
                          struct netlbl_lsm_secattr *secattr)
{
        int err;

        if (!p->mls_enabled)
                return 0;

        err = ebitmap_netlbl_export(&context->range.level[0].cat,
                                    &secattr->attr.mls.cat);
        if (err)
                return err;

        if (secattr->attr.mls.cat != NULL)
                secattr->flags |= NETLBL_SECATTR_MLS_CAT;

        return 0;
}

/**
 * mls_import_netlbl_cat - Import the MLS categories from NetLabel
 * @p: the policy
 * @context: the security context
 * @secattr: the NetLabel security attributes
 *
 * Description:
 * Copy the NetLabel security attributes into the SELinux context; since the
 * NetLabel security attribute only contains a single MLS category use it for
 * both the low and high categories of the context.  Returns zero on success,
 * negative values on failure.
 *
 */
int mls_import_netlbl_cat(struct policydb *p, struct context *context,
                          struct netlbl_lsm_secattr *secattr)
{
        int rc;

        if (!p->mls_enabled)
                return 0;

        rc = ebitmap_netlbl_import(&context->range.level[0].cat,
                                   secattr->attr.mls.cat);
        if (rc)
                goto import_netlbl_cat_failure;
        memcpy(&context->range.level[1].cat, &context->range.level[0].cat,
               sizeof(context->range.level[0].cat));

        return 0;

import_netlbl_cat_failure:
        ebitmap_destroy(&context->range.level[0].cat);
        return rc;
}
#endif /* CONFIG_NETLABEL */




























  209 



  209 


  209 

















































































































































  209 










  209 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
#include <linux/uaccess.h>

#include <asm/cacheflush.h>
#include <asm/fixmap.h>
#include <asm/insn.h>
#include <asm/kprobes.h>
#include <asm/text-patching.h>
#include <asm/sections.h>

static DEFINE_RAW_SPINLOCK(patch_lock);

static bool is_exit_text(unsigned long addr)
{
        /* discarded with init text/data */
        return system_state < SYSTEM_RUNNING &&
                addr >= (unsigned long)__exittext_begin &&
                addr < (unsigned long)__exittext_end;
}

static bool is_image_text(unsigned long addr)
{
        return core_kernel_text(addr) || is_exit_text(addr);
}

/*
 * Map the physical location backing @addr at fixmap slot @fixmap and
 * return the resulting address, including @addr's offset within its page.
 *
 * Image text is translated with __pa_symbol(); any other address is
 * resolved through vmalloc_to_page(), and failure to find a page is a
 * BUG.
 */
static void __kprobes *patch_map(void *addr, int fixmap)
{
        phys_addr_t pa;

        if (!is_image_text((unsigned long)addr)) {
                struct page *pg = vmalloc_to_page(addr);

                BUG_ON(!pg);
                pa = page_to_phys(pg) + offset_in_page(addr);
        } else {
                pa = __pa_symbol(addr);
        }

        return (void *)set_fixmap_offset(fixmap, pa);
}

/* Tear down the mapping installed at fixmap slot @fixmap by patch_map(). */
static void __kprobes patch_unmap(int fixmap)
{
        clear_fixmap(fixmap);
}
/*
 * Read one instruction from kernel address @addr into *@insnp.
 *
 * In ARMv8-A, A64 instructions have a fixed length of 32 bits and are
 * always little-endian, so the raw value is converted to CPU byte order.
 * Returns 0 on success; on failure the error from
 * copy_from_kernel_nofault() is returned and *@insnp is left untouched.
 */
int __kprobes aarch64_insn_read(void *addr, u32 *insnp)
{
        __le32 raw;
        int err;

        err = copy_from_kernel_nofault(&raw, addr, AARCH64_INSN_SIZE);
        if (err)
                return err;

        *insnp = le32_to_cpu(raw);

        return 0;
}

/*
 * Write the already little-endian instruction @insn to @addr through a
 * temporary FIX_TEXT_POKE0 mapping.
 *
 * The map/copy/unmap sequence runs under patch_lock with interrupts
 * disabled.  Returns the result of copy_to_kernel_nofault().
 */
static int __kprobes __aarch64_insn_write(void *addr, __le32 insn)
{
        unsigned long irq_flags;
        void *alias;
        int err;

        raw_spin_lock_irqsave(&patch_lock, irq_flags);

        alias = patch_map(addr, FIX_TEXT_POKE0);
        err = copy_to_kernel_nofault(alias, &insn, AARCH64_INSN_SIZE);
        patch_unmap(FIX_TEXT_POKE0);

        raw_spin_unlock_irqrestore(&patch_lock, irq_flags);

        return err;
}

/*
 * Write the instruction @insn (CPU byte order) to @addr.  The value is
 * converted to little-endian before being stored.  Returns the result of
 * __aarch64_insn_write().
 */
int __kprobes aarch64_insn_write(void *addr, u32 insn)
{
        __le32 le_insn = cpu_to_le32(insn);

        return __aarch64_insn_write(addr, le_insn);
}

/*
 * Store the 64-bit value @val at @addr through a temporary
 * FIX_TEXT_POKE0 mapping, under patch_lock with interrupts disabled.
 * The value is written as-is, without byte-order conversion.
 * Returns the result of copy_to_kernel_nofault().
 */
noinstr int aarch64_insn_write_literal_u64(void *addr, u64 val)
{
        unsigned long irq_flags;
        u64 *alias;
        int err;

        raw_spin_lock_irqsave(&patch_lock, irq_flags);

        alias = patch_map(addr, FIX_TEXT_POKE0);
        err = copy_to_kernel_nofault(alias, &val, sizeof(val));
        patch_unmap(FIX_TEXT_POKE0);

        raw_spin_unlock_irqrestore(&patch_lock, irq_flags);

        return err;
}

/*
 * Per-chunk writer used by __text_poke(): fill @len bytes at @dst.  @src
 * is the caller's source argument and @patched the number of bytes that
 * earlier chunks have already handled.
 */
typedef void text_poke_f(void *dst, void *src, size_t patched, size_t len);

/*
 * Apply @func to the @len bytes starting at @addr, one page-bounded chunk
 * at a time.
 *
 * Each chunk is mapped at FIX_TEXT_POKE0, handed to @func and unmapped
 * again; the whole loop runs under patch_lock with interrupts disabled.
 * The instruction cache is flushed for the full range once the lock has
 * been dropped.  Returns @addr.
 */
static void *__text_poke(text_poke_f func, void *addr, void *src, size_t len)
{
        unsigned long irq_flags;
        size_t done = 0;

        raw_spin_lock_irqsave(&patch_lock, irq_flags);

        while (done < len) {
                void *cur = addr + done;
                void *alias;
                size_t chunk;

                /* Never let a chunk cross a page boundary. */
                chunk = min_t(size_t, PAGE_SIZE - offset_in_page(cur),
                              len - done);

                alias = patch_map(cur, FIX_TEXT_POKE0);
                func(alias, src, done, chunk);
                patch_unmap(FIX_TEXT_POKE0);

                done += chunk;
        }

        raw_spin_unlock_irqrestore(&patch_lock, irq_flags);

        flush_icache_range((uintptr_t)addr, (uintptr_t)addr + len);

        return addr;
}

/*
 * text_poke_f writer: copy @len bytes from @src + @patched to @dst.
 * NOTE(review): the result of copy_to_kernel_nofault() is discarded, so
 * a failed copy is not reported to the caller.
 */
static void text_poke_memcpy(void *dst, void *src, size_t patched, size_t len)
{
        void *from = src + patched;

        copy_to_kernel_nofault(dst, from, len);
}

/*
 * text_poke_f writer: fill @dst with the 32-bit value that @src points
 * to.  @len is in bytes and is converted to a count of 32-bit words;
 * @patched is unused because the fill pattern does not depend on offset.
 */
static void text_poke_memset(void *dst, void *src, size_t patched, size_t len)
{
        u32 pattern = *(u32 *)src;
        size_t words = len / 4;

        memset32(dst, pattern, words);
}

/**
 * aarch64_insn_copy - Copy instructions into (an unused part of) RX memory
 * @dst: address to modify
 * @src: source of the copy
 * @len: length to copy
 *
 * Useful for JITs to dump new code blocks into unused regions of RX memory.
 *
 * Return: @dst on success, NULL if @dst is not 4-byte aligned.
 */
noinstr void *aarch64_insn_copy(void *dst, void *src, size_t len)
{
        uintptr_t dst_addr = (uintptr_t)dst;

        /* A64 instructions must be word aligned */
        if ((dst_addr & 0x3) != 0)
                return NULL;

        return __text_poke(text_poke_memcpy, dst, src, len);
}

/**
 * aarch64_insn_set - memset for RX memory regions.
 * @dst: address to modify
 * @insn: value to set
 * @len: length of memory region.
 *
 * Useful for JITs to fill regions of RX memory with illegal instructions.
 *
 * Return: @dst on success, NULL if @dst is not 4-byte aligned.
 */
noinstr void *aarch64_insn_set(void *dst, u32 insn, size_t len)
{
        uintptr_t dst_addr = (uintptr_t)dst;

        /* A64 instructions must be word aligned */
        if ((dst_addr & 0x3) != 0)
                return NULL;

        return __text_poke(text_poke_memset, dst, &insn, len);
}

/*
 * Replace the instruction at @addr with @insn and, if the write
 * succeeded, perform cache maintenance on the patched word via
 * caches_clean_inval_pou().
 *
 * Returns -EINVAL if @addr is not 4-byte aligned, otherwise the result of
 * aarch64_insn_write().
 */
int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn)
{
        u32 *tp = addr;
        int err;

        /* A64 instructions must be word aligned */
        if ((uintptr_t)tp & 0x3)
                return -EINVAL;

        err = aarch64_insn_write(tp, insn);
        if (err)
                return err;

        caches_clean_inval_pou((uintptr_t)tp,
                               (uintptr_t)tp + AARCH64_INSN_SIZE);

        return 0;
}

/*
 * Work description handed to aarch64_insn_patch_text_cb():
 *   text_addrs - array of insn_cnt addresses to patch
 *   new_insns  - array of insn_cnt replacement instructions
 *   insn_cnt   - number of entries in the two arrays
 *   cpu_count  - rendezvous counter incremented by every CPU that enters
 *                the callback, and once more by the patching CPU when it
 *                has finished
 */
struct aarch64_insn_patch {
        void                **text_addrs;
        u32                *new_insns;
        int                insn_cnt;
        atomic_t        cpu_count;
};

/*
 * Callback run on the CPUs taking part in a text patching operation;
 * @arg points to the struct aarch64_insn_patch describing the work.
 *
 * The last CPU to arrive performs all the writes, stopping at the first
 * failure, and then bumps cpu_count once more to release the others.
 * Every other CPU spins until cpu_count exceeds the number of online
 * CPUs, executes an ISB and returns 0.  The patching CPU returns the
 * result of the last write attempted.
 */
static int __kprobes aarch64_insn_patch_text_cb(void *arg)
{
        struct aarch64_insn_patch *patch = arg;
        int idx, err = 0;

        if (atomic_inc_return(&patch->cpu_count) != num_online_cpus()) {
                /* Not the last arrival: wait for the master to finish. */
                while (atomic_read(&patch->cpu_count) <= num_online_cpus())
                        cpu_relax();
                isb();
                return 0;
        }

        /* The last CPU becomes master */
        for (idx = 0; idx < patch->insn_cnt; idx++) {
                err = aarch64_insn_patch_text_nosync(patch->text_addrs[idx],
                                                     patch->new_insns[idx]);
                if (err)
                        break;
        }

        /* Notify other processors with an additional increment. */
        atomic_inc(&patch->cpu_count);

        return err;
}

/*
 * Patch @cnt instructions: for each i, the instruction at @addrs[i] is
 * replaced with @insns[i].  The work is carried out by
 * aarch64_insn_patch_text_cb() via stop_machine_cpuslocked() on the
 * online CPUs.
 *
 * Returns -EINVAL if @cnt is not positive, otherwise the result of
 * stop_machine_cpuslocked().
 */
int __kprobes aarch64_insn_patch_text(void *addrs[], u32 insns[], int cnt)
{
        struct aarch64_insn_patch work = {
                .text_addrs = addrs,
                .new_insns = insns,
                .insn_cnt = cnt,
                .cpu_count = ATOMIC_INIT(0),
        };

        if (cnt < 1)
                return -EINVAL;

        return stop_machine_cpuslocked(aarch64_insn_patch_text_cb, &work,
                                       cpu_online_mask);
}




















































































































    6 







    5 



    5 







    5 


    1 
    4 






















    8 












    2 




    5 

    6 





    1 


































    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
/*
 * llc_input.c - Minimal input path for LLC
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *                  2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/llc_sap.h>

/*
 * Debug tracing for this file.  Change the "#if 0" to "#if 1" to have
 * dprintk() emit KERN_DEBUG messages; as built, it expands to nothing.
 */
#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
#else
#define dprintk(args...)
#endif

/*
 * Packet handler for the station, registerable because in the minimal
 * LLC core that is taking shape only the very minimal subset of LLC that
 * is needed for things like IPX, Appletalk, etc will stay, with all the
 * rest in the llc1 and llc2 modules.
 *
 * Installed and cleared through llc_set_station_handler(); NULL when no
 * station handler is registered.
 */
static void (*llc_station_handler)(struct sk_buff *skb);

/*
 * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN.
 *
 * Indexed by destination type minus one; entries are set by
 * llc_add_pack() and reset to NULL by llc_remove_pack().
 */
static void (*llc_type_handlers[2])(struct llc_sap *sap,
                                    struct sk_buff *skb);

/*
 * Register @handler as the packet handler for destination type @type.
 * Types other than LLC_DEST_SAP and LLC_DEST_CONN are ignored, although
 * the write barrier is still issued.
 */
void llc_add_pack(int type, void (*handler)(struct llc_sap *sap,
                                            struct sk_buff *skb))
{
        smp_wmb(); /* ensure initialisation is complete before it's called */

        switch (type) {
        case LLC_DEST_SAP:
        case LLC_DEST_CONN:
                llc_type_handlers[type - 1] = handler;
                break;
        default:
                break;
        }
}

/*
 * Unregister the packet handler for destination type @type and then call
 * synchronize_net().  The synchronize_net() call is made even when @type
 * is neither LLC_DEST_SAP nor LLC_DEST_CONN.
 */
void llc_remove_pack(int type)
{
        switch (type) {
        case LLC_DEST_SAP:
        case LLC_DEST_CONN:
                llc_type_handlers[type - 1] = NULL;
                break;
        default:
                break;
        }

        synchronize_net();
}

void llc_set_station_handler(void (*handler)(struct sk_buff *skb))
{
        /* Ensure initialisation is complete before it's called */
        if (handler)
                smp_wmb();

        llc_station_handler = handler;

        if (!handler)
                synchronize_net();
}

/**
 *        llc_pdu_type - returns which LLC component must handle for PDU
 *        @skb: input skb
 *
 *        Classifies the PDU by its first control byte.  PDUs that are not
 *        U-format (I-PDU or S-PDU) go to LLC_DEST_CONN.  U-format XID, UI
 *        and TEST commands go to LLC_DEST_SAP; SABME, DISC, UA, DM and FRMR
 *        go to LLC_DEST_CONN; any other U-format command yields
 *        LLC_DEST_INVALID.
 */
static __inline__ int llc_pdu_type(struct sk_buff *skb)
{
        struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);

        /* I-PDU or S-PDU type */
        if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U)
                return LLC_DEST_CONN;

        switch (LLC_U_PDU_CMD(pdu)) {
        case LLC_1_PDU_CMD_XID:
        case LLC_1_PDU_CMD_UI:
        case LLC_1_PDU_CMD_TEST:
                return LLC_DEST_SAP;
        case LLC_2_PDU_CMD_SABME:
        case LLC_2_PDU_CMD_DISC:
        case LLC_2_PDU_RSP_UA:
        case LLC_2_PDU_RSP_DM:
        case LLC_2_PDU_RSP_FRMR:
                return LLC_DEST_CONN;
        default:
                return LLC_DEST_INVALID;
        }
}

/**
 *        llc_fixup_skb - initializes skb pointers
 *        @skb: This argument points to incoming skb
 *
 *        Pulls the LLC header off @skb and sets the transport header to the
 *        data that follows it.  The header length is derived from the first
 *        control byte: 3 bytes for U-format PDUs, 4 bytes otherwise.
 *
 *        For ETH_P_802_2 frames the payload length is additionally taken
 *        from the Ethernet header's length field and the skb is trimmed to
 *        it.
 *
 *        Returns 1 on success and 0 if the frame is too short, malformed or
 *        cannot be trimmed.
 */
static inline int llc_fixup_skb(struct sk_buff *skb)
{
        struct llc_pdu_un *pdu;
        s32 payload_len;
        u8 hdr_len;

        if (unlikely(!pskb_may_pull(skb, sizeof(*pdu))))
                return 0;

        pdu = (struct llc_pdu_un *)skb->data;
        if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U)
                hdr_len = 3;
        else
                hdr_len = 4;

        if (unlikely(!pskb_may_pull(skb, hdr_len)))
                return 0;

        skb_pull(skb, hdr_len);
        skb_reset_transport_header(skb);

        /* Only 802.2 frames carry a length to validate and trim to. */
        if (skb->protocol != htons(ETH_P_802_2))
                return 1;

        if (skb->mac_len < ETH_HLEN)
                return 0;

        /* The 802.3 length field counts the LLC header as well. */
        payload_len = ntohs(eth_hdr(skb)->h_proto) - hdr_len;
        if (payload_len < 0)
                return 0;

        if (!pskb_may_pull(skb, payload_len))
                return 0;

        if (unlikely(pskb_trim_rcsum(skb, payload_len)))
                return 0;

        return 1;
}

/**
 *        llc_rcv - 802.2 entry point from net lower layers
 *        @skb: received pdu
 *        @dev: device that receive pdu
 *        @pt: packet type
 *        @orig_dev: the original receive net device
 *
 *        When the system receives a 802.2 frame this function is called. It
 *        checks SAP and connection of received pdu and passes frame to
 *        llc_{station,sap,conn}_rcv for sending to proper state machine. If
 *        the frame is related to a busy connection (a connection is sending
 *        data now), it queues this frame in the connection's backlog.
 */
int llc_rcv(struct sk_buff *skb, struct net_device *dev,
            struct packet_type *pt, struct net_device *orig_dev)
{
        struct llc_sap *sap;
        struct llc_pdu_sn *pdu;
        int dest;
        int (*rcv)(struct sk_buff *, struct net_device *,
                   struct packet_type *, struct net_device *);
        void (*sta_handler)(struct sk_buff *skb);
        void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb);

        /*
         * When the interface is in promisc. mode, drop all the crap that it
         * receives, do not try to analyse it.
         */
        if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
                dprintk("%s: PACKET_OTHERHOST\n", __func__);
                goto drop;
        }
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                goto out;
        if (unlikely(!llc_fixup_skb(skb)))
                goto drop;
        pdu = llc_pdu_sn_hdr(skb);
        if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */
               goto handle_station;
        sap = llc_sap_find(pdu->dsap);
        if (unlikely(!sap)) {/* unknown SAP */
                dprintk("%s: llc_sap_find(%02X) failed!\n", __func__,
                        pdu->dsap);
                goto drop;
        }
        /*
         * First the upper layer protocols that don't need the full
         * LLC functionality
         */
        rcv = rcu_dereference(sap->rcv_func);
        dest = llc_pdu_type(skb);
        sap_handler = dest ? READ_ONCE(llc_type_handlers[dest - 1]) : NULL;
        if (unlikely(!sap_handler)) {
                if (rcv)
                        rcv(skb, dev, pt, orig_dev);
                else
                        kfree_skb(skb);
        } else {
                if (rcv) {
                        struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC);
                        if (cskb)
                                rcv(cskb, dev, pt, orig_dev);
                }
                sap_handler(sap, skb);
        }
        llc_sap_put(sap);
out:
        return 0;
drop:
        kfree_skb(skb);
        goto out;
handle_station:
        sta_handler = READ_ONCE(llc_station_handler);
        if (!sta_handler)
                goto drop;
        sta_handler(skb);
        goto out;
}

/* Symbols exported for use outside this file. */
EXPORT_SYMBOL(llc_add_pack);
EXPORT_SYMBOL(llc_remove_pack);
EXPORT_SYMBOL(llc_set_station_handler);


















































































































































































    7 





    2 






    6 

















   75 








   39 
    8 



  106 




  106 









   25 





   82 


















   40 
    6 
   46 













  209 





  168 















































   50 


























   35 



   35 

   35 































   89 









   89 








   50 



















  133 



















  117 



   46 












































































































































   17 
















   42 























   42 





































































































































































































































  106 











  106 
  106 
  106 





  105 




  106 




    1 


  106 






  106 





  106 

    1 



































































































































   56 







   56 
























   56 















   56 




   56 

   46 










   46 




   46 














   56 












   56 

   56 
















   56 































































    6 









































    8 










    8 














    2 
    6 










    2 
    6 

    2 
    6 


    2 
    6 




    8 

    8 




    8 
    8 






    8 

    2 


    6 






   50 





   50 

   50 

   50 







   50 












   50 
   50 



































































































































































































































































































  122 















































































  106 





   68 


   25 




















   17 

   17 


   17 

























































  106 

  106 







  106 





























  106 

  105 








  106 















    1 



  105 
































    1 






























































































































































































































































































































































































































































































  110 
  110 












  110 




  109 


  109 

















   28 







    6 
   23 
   28 






















  106 






  105 

















  105 

    1 







  106 





















   82 
   25 












   82 

   82 
   82 

   82 



   82 






























































































































   88 










   88 





   88 



    3 

   84 








   97 







   97 











   97 


   97 




























































































    2 























  106 







































    5 


    5 









   39 

































    5 






















    5 



    5 







































  122 





















   89 

   35 





  122 



  122 












  122 


























  122 



  122 








  122 















































































































































   27 







   27 











   27 













   27 


   23 
   26 

   24 
   25 
   25 

   25 

   25 



   27 





























































































































   28 






    2 







   27 

















































































































































































































































































































































































   33 



   33 











   33 













   33 

   33 










    2 










    2 




    2 






















   33 















































































































































































































































































   19 









   11 




   11 

    1 










    5 


















   35 

















   35 






   35 




































   26 















































































































































































































































































































































































































































































































































































































































































































































































































































































  122 















   50 

   50 
   50 






























































































































































































































































































































































































































































































































































































































































   89 









   89 


















   89 



















































   89 













   89 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
/*
 * Resizable virtual memory filesystem for Linux.
 *
 * Copyright (C) 2000 Linus Torvalds.
 *                 2000 Transmeta Corp.
 *                 2000-2001 Christoph Rohland
 *                 2000-2001 SAP AG
 *                 2002 Red Hat Inc.
 * Copyright (C) 2002-2011 Hugh Dickins.
 * Copyright (C) 2011 Google Inc.
 * Copyright (C) 2002-2005 VERITAS Software Corporation.
 * Copyright (C) 2004 Andi Kleen, SuSE Labs
 *
 * Extended attribute support for tmpfs:
 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *
 * tiny-shmem:
 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/vfs.h>
#include <linux/mount.h>
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/shmem_fs.h>
#include <linux/swap.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
#include "swap.h"

static struct vfsmount *shm_mnt __ro_after_init;

#ifdef CONFIG_SHMEM
/*
 * This virtual memory filesystem is heavily based on the ramfs. It
 * extends ramfs by the ability to use swap and honor resource limits
 * which makes it a completely usable filesystem.
 */

#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/mman.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
#include <linux/splice.h>
#include <linux/security.h>
#include <linux/swapops.h>
#include <linux/mempolicy.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/migrate.h>
#include <linux/highmem.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
#include <linux/rmap.h>
#include <linux/uuid.h>
#include <linux/quotaops.h>
#include <linux/rcupdate_wait.h>

#include <linux/uaccess.h>

#include "internal.h"

/* Convert a byte count to a page count: PAGE_ALIGN() it, then shift by PAGE_SHIFT */
#define VM_ACCT(size)    (PAGE_ALIGN(size) >> PAGE_SHIFT)

/* Pretend that each entry is of this size in directory's i_size */
#define BOGO_DIRENT_SIZE 20

/*
 * Pretend that one inode + its dentry occupy this much memory; this is the
 * unit in which shmem_reserve_inode() and shmem_free_inode() adjust
 * sbinfo->free_ispace.
 */
#define BOGO_INODE_SIZE 1024

/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
#define SHORT_SYMLINK_LEN 128

/*
 * State describing a fallocate operation that is currently in progress.
 *
 * shmem_fallocate communicates with shmem_fault or shmem_writepage via
 * inode->i_private (with i_rwsem making sure that it has only one user at
 * a time): we would prefer not to enlarge the shmem inode just for that.
 */
struct shmem_falloc {
        wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
        pgoff_t start;                /* start of range currently being fallocated */
        pgoff_t next;                /* the next page offset to be fallocated */
        pgoff_t nr_falloced;        /* how many new pages have been fallocated */
        pgoff_t nr_unswapped;        /* how often writepage refused to swap out */
};

/*
 * Collected option values for a shmem mount.
 *
 * NOTE(review): the code that fills this structure in is not part of this
 * section of the file; presumably it is the mount option parser - confirm.
 */
struct shmem_options {
        unsigned long long blocks;
        unsigned long long inodes;
        struct mempolicy *mpol;
        kuid_t uid;
        kgid_t gid;
        umode_t mode;
        bool full_inums;
        int huge;
        int seen;        /* NOTE(review): presumably a mask of SHMEM_SEEN_* bits - confirm */
        bool noswap;
        unsigned short quota_types;
        struct shmem_quota_limits qlimits;
#if IS_ENABLED(CONFIG_UNICODE)
        struct unicode_map *encoding;
        bool strict_encoding;
#endif
/* Distinct single-bit values, one per option category */
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
#define SHMEM_SEEN_NOSWAP 16
#define SHMEM_SEEN_QUOTA 32
};

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Per-policy huge page order settings.
 *
 * NOTE(review): shmem_parse_huge() compares huge_shmem_orders_inherit with
 * BIT(HPAGE_PMD_ORDER), so these look like bitmaps with one bit per folio
 * order; the code that sets them is outside this section - confirm.
 */
static unsigned long huge_shmem_orders_always __read_mostly;
static unsigned long huge_shmem_orders_madvise __read_mostly;
static unsigned long huge_shmem_orders_inherit __read_mostly;
static unsigned long huge_shmem_orders_within_size __read_mostly;
static bool shmem_orders_configured __initdata;
#endif

#ifdef CONFIG_TMPFS
/* Default block limit: half of the total number of RAM pages. */
static unsigned long shmem_default_max_blocks(void)
{
        unsigned long ram_pages = totalram_pages();

        return ram_pages / 2;
}

/*
 * Default inode limit: the smallest of the non-highmem page count, half of
 * all RAM pages, and the largest count whose BOGO_INODE_SIZE multiple still
 * fits in an unsigned long.
 */
static unsigned long shmem_default_max_inodes(void)
{
        unsigned long total = totalram_pages();
        unsigned long lowmem = total - totalhigh_pages();
        unsigned long half = total / 2;
        unsigned long cap = ULONG_MAX / BOGO_INODE_SIZE;

        return min3(lowmem, half, cap);
}
#endif

static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                        struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
                        struct vm_area_struct *vma, vm_fault_t *fault_type);

/* Fetch the shmem private superblock info stored in sb->s_fs_info. */
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = sb->s_fs_info;

        return sbinfo;
}

/*
 * shmem_file_setup pre-accounts the whole fixed size of a VM object,
 * for shared memory and for shared anonymous (/dev/zero) mappings
 * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
 * consistent with the pre-accounting of private mappings ...
 */
static inline int shmem_acct_size(unsigned long flags, loff_t size)
{
        /* VM_NORESERVE objects are not pre-accounted */
        if (flags & VM_NORESERVE)
                return 0;

        /* Otherwise charge the whole size, in pages, up front */
        return security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
}

static inline void shmem_unacct_size(unsigned long flags, loff_t size)
{
        /* Nothing was pre-accounted for VM_NORESERVE objects */
        if (flags & VM_NORESERVE)
                return;

        vm_unacct_memory(VM_ACCT(size));
}

static inline int shmem_reacct_size(unsigned long flags,
                loff_t oldsize, loff_t newsize)
{
        loff_t old_pages, new_pages;

        /* VM_NORESERVE objects carry no pre-accounting to adjust */
        if (flags & VM_NORESERVE)
                return 0;

        old_pages = VM_ACCT(oldsize);
        new_pages = VM_ACCT(newsize);

        /* Growing: the extra pages must pass the enough-memory check */
        if (new_pages > old_pages)
                return security_vm_enough_memory_mm(current->mm,
                                                    new_pages - old_pages);

        /* Shrinking: hand the difference back */
        if (new_pages < old_pages)
                vm_unacct_memory(old_pages - new_pages);

        return 0;
}

/*
 * ... whereas tmpfs objects are accounted incrementally as
 * pages are allocated, in order to allow large sparse files.
 * shmem_get_folio reports shmem_acct_blocks failure as -ENOSPC not -ENOMEM,
 * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
 */
static inline int shmem_acct_blocks(unsigned long flags, long pages)
{
        /* Only VM_NORESERVE objects are accounted incrementally here */
        return (flags & VM_NORESERVE) ?
                security_vm_enough_memory_mm(current->mm,
                                             pages * VM_ACCT(PAGE_SIZE)) :
                0;
}

static inline void shmem_unacct_blocks(unsigned long flags, long pages)
{
        /* Mirror of shmem_acct_blocks(): only VM_NORESERVE was charged */
        if (!(flags & VM_NORESERVE))
                return;

        vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
}

static int shmem_inode_acct_blocks(struct inode *inode, long pages)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        int err = -ENOSPC;

        if (shmem_acct_blocks(info->flags, pages))
                return err;

        might_sleep();        /* when quotas */
        if (sbinfo->max_blocks) {
                if (!percpu_counter_limited_add(&sbinfo->used_blocks,
                                                sbinfo->max_blocks, pages))
                        goto unacct;

                err = dquot_alloc_block_nodirty(inode, pages);
                if (err) {
                        percpu_counter_sub(&sbinfo->used_blocks, pages);
                        goto unacct;
                }
        } else {
                err = dquot_alloc_block_nodirty(inode, pages);
                if (err)
                        goto unacct;
        }

        return 0;

unacct:
        shmem_unacct_blocks(info->flags, pages);
        return err;
}

/*
 * Release @pages blocks previously charged to @inode: the quota first, then
 * the superblock's used_blocks counter (when max_blocks is set), then the
 * VM accounting.
 */
static void shmem_inode_unacct_blocks(struct inode *inode, long pages)
{
        struct shmem_inode_info *shi = SHMEM_I(inode);
        struct shmem_sb_info *sbi = SHMEM_SB(inode->i_sb);

        might_sleep();        /* when quotas */
        dquot_free_block_nodirty(inode, pages);

        if (sbi->max_blocks != 0)
                percpu_counter_sub(&sbi->used_blocks, pages);

        shmem_unacct_blocks(shi->flags, pages);
}

static const struct super_operations shmem_ops;
static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
static const struct vm_operations_struct shmem_anon_vm_ops;
static struct file_system_type shmem_fs_type;

/* True when @mapping uses the shmem address_space operations. */
bool shmem_mapping(struct address_space *mapping)
{
        const struct address_space_operations *ops = mapping->a_ops;

        return ops == &shmem_aops;
}
EXPORT_SYMBOL_GPL(shmem_mapping);

/* True when @vma uses shmem_anon_vm_ops. */
bool vma_is_anon_shmem(struct vm_area_struct *vma)
{
        const struct vm_operations_struct *ops = vma->vm_ops;

        return ops == &shmem_anon_vm_ops;
}

bool vma_is_shmem(struct vm_area_struct *vma)
{
        return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops;
}

static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);

#ifdef CONFIG_TMPFS_QUOTA

/*
 * Load quota for every type whose bit is set in @quota_types.
 *
 * On the first failure a warning is logged, quota is switched off for every
 * lower-numbered type, and the error is returned.  Returns 0 on success.
 */
static int shmem_enable_quotas(struct super_block *sb,
                               unsigned short quota_types)
{
        int type;

        sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;

        for (type = 0; type < SHMEM_MAXQUOTAS; type++) {
                int err;

                if ((quota_types & (1 << type)) == 0)
                        continue;

                err = dquot_load_quota_sb(sb, type, QFMT_SHMEM,
                                          DQUOT_USAGE_ENABLED |
                                          DQUOT_LIMITS_ENABLED);
                if (!err)
                        continue;

                pr_warn("tmpfs: failed to enable quota tracking (type=%d, err=%d)\n",
                        type, err);

                /* Unwind the types below the one that failed */
                while (--type >= 0)
                        dquot_quota_off(sb, type);
                return err;
        }

        return 0;
}

/* Switch quota off for every quota type, from 0 up to SHMEM_MAXQUOTAS - 1. */
static void shmem_disable_quotas(struct super_block *sb)
{
        int type = 0;

        while (type < SHMEM_MAXQUOTAS) {
                dquot_quota_off(sb, type);
                type++;
        }
}

/* Return the i_dquot array kept in the shmem inode info of @inode. */
static struct dquot __rcu **shmem_get_dquots(struct inode *inode)
{
        struct shmem_inode_info *info = SHMEM_I(inode);

        return info->i_dquot;
}
#endif /* CONFIG_TMPFS_QUOTA */

/*
 * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
 * produces a novel ino for the newly allocated inode.
 *
 * It may also be called when making a hard link to permit the space needed by
 * each dentry. However, in that case, no new inode number is needed since that
 * internally draws from another pool of inode numbers (currently global
 * get_next_ino()). This case is indicated by passing NULL as inop.
 */
/*
 * Number of inode numbers taken from sbinfo->next_ino at a time by the
 * per-cpu allocator used for SB_KERNMOUNT superblocks.
 */
#define SHMEM_INO_BATCH 1024
static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        ino_t ino;

        /*
         * Returns 0 on success, or -ENOSPC when max_inodes is set and less
         * than BOGO_INODE_SIZE of free_ispace remains.
         */
        if (!(sb->s_flags & SB_KERNMOUNT)) {
                /* Not a kernel-internal mount: everything under stat_lock */
                raw_spin_lock(&sbinfo->stat_lock);
                if (sbinfo->max_inodes) {
                        /* Inode space is limited: charge one inode's worth */
                        if (sbinfo->free_ispace < BOGO_INODE_SIZE) {
                                raw_spin_unlock(&sbinfo->stat_lock);
                                return -ENOSPC;
                        }
                        sbinfo->free_ispace -= BOGO_INODE_SIZE;
                }
                if (inop) {
                        ino = sbinfo->next_ino++;
                        /* Take the following number if is_zero_ino() rejects this one */
                        if (unlikely(is_zero_ino(ino)))
                                ino = sbinfo->next_ino++;
                        if (unlikely(!sbinfo->full_inums &&
                                     ino > UINT_MAX)) {
                                /*
                                 * Emulate get_next_ino uint wraparound for
                                 * compatibility
                                 */
                                if (IS_ENABLED(CONFIG_64BIT))
                                        pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
                                                __func__, MINOR(sb->s_dev));
                                /* Restart numbering from 1 */
                                sbinfo->next_ino = 1;
                                ino = sbinfo->next_ino++;
                        }
                        *inop = ino;
                }
                raw_spin_unlock(&sbinfo->stat_lock);
        } else if (inop) {
                /*
                 * __shmem_file_setup, one of our callers, is lock-free: it
                 * doesn't hold stat_lock in shmem_reserve_inode since
                 * max_inodes is always 0, and is called from potentially
                 * unknown contexts. As such, use a per-cpu batched allocator
                 * which doesn't require the per-sb stat_lock unless we are at
                 * the batch boundary.
                 *
                 * We don't need to worry about inode{32,64} since SB_KERNMOUNT
                 * shmem mounts are not exposed to userspace, so we don't need
                 * to worry about things like glibc compatibility.
                 */
                ino_t *next_ino;

                /* get_cpu()/put_cpu() bracket the use of this CPU's counter */
                next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
                ino = *next_ino;
                if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
                        /* Batch boundary: claim the next SHMEM_INO_BATCH numbers */
                        raw_spin_lock(&sbinfo->stat_lock);
                        ino = sbinfo->next_ino;
                        sbinfo->next_ino += SHMEM_INO_BATCH;
                        raw_spin_unlock(&sbinfo->stat_lock);
                        if (unlikely(is_zero_ino(ino)))
                                ino++;
                }
                *inop = ino;
                /* Remember the number this CPU hands out next */
                *next_ino = ++ino;
                put_cpu();
        }

        return 0;
}

/*
 * Return one inode's worth of space (BOGO_INODE_SIZE) plus @freed_ispace to
 * the superblock's free_ispace.  Does nothing when max_inodes is not set.
 */
static void shmem_free_inode(struct super_block *sb, size_t freed_ispace)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (!sbinfo->max_inodes)
                return;

        raw_spin_lock(&sbinfo->stat_lock);
        sbinfo->free_ispace += BOGO_INODE_SIZE + freed_ispace;
        raw_spin_unlock(&sbinfo->stat_lock);
}

/**
 * shmem_recalc_inode - recalculate the block usage of an inode
 * @inode: inode to recalc
 * @alloced: the change in number of pages allocated to inode
 * @swapped: the change in number of pages swapped from inode
 *
 * The mm can drop undirtied hole pages behind our back, so the number of
 * freed blocks has to be worked out here.
 *
 * Normally info->alloced == inode->i_mapping->nrpages + info->swapped, so
 * whatever info->alloced holds beyond that sum was freed by the mm; it is
 * removed from info->alloced and unaccounted.
 */
static void shmem_recalc_inode(struct inode *inode, long alloced, long swapped)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        long surplus;

        spin_lock(&info->lock);
        info->alloced += alloced;
        info->swapped += swapped;
        surplus = info->alloced - info->swapped -
                  READ_ONCE(inode->i_mapping->nrpages);
        /*
         * Usually i_mapping->nrpages has already been adjusted (up or down)
         * by the time we are called.  shmem_writepage() is the exception: it
         * must raise swapped before nrpages is lowered, so that a racing
         * shmem_recalc_inode() does not conclude that a page was freed.
         * Compensate for that here rather than needing a followup call.
         */
        if (swapped > 0)
                surplus += swapped;
        if (surplus > 0)
                info->alloced -= surplus;
        spin_unlock(&info->lock);

        if (surplus <= 0)
                return;

        /* Done outside the spinlock: the quota case may block */
        shmem_inode_unacct_blocks(inode, surplus);
}

/*
 * Account @pages additional pages to @inode and add them to its mapping's
 * nrpages.  Returns false, changing nothing, when the block accounting
 * fails; true otherwise.
 */
bool shmem_charge(struct inode *inode, long pages)
{
        struct address_space *as = inode->i_mapping;

        if (shmem_inode_acct_blocks(inode, pages) != 0)
                return false;

        /* Raise nrpages first, so shmem_recalc_inode() sees it balanced */
        xa_lock_irq(&as->i_pages);
        as->nrpages += pages;
        xa_unlock_irq(&as->i_pages);

        shmem_recalc_inode(inode, pages, 0);

        return true;
}

void shmem_uncharge(struct inode *inode, long pages)
{
        /*
         * @pages is currently unused and kept only to help debugging.  The
         * nrpages adjustment is done by __filemap_remove_folio() or by the
         * caller, so all that is left is to recalculate the inode's usage.
         */
        shmem_recalc_inode(inode, 0, 0);
}

/*
 * Swap the xarray item at @index from @expected to @replacement; the caller
 * holds xa_lock.  Returns -ENOENT, leaving the xarray untouched, when the
 * item currently stored there is not @expected; returns 0 otherwise.
 */
static int shmem_replace_entry(struct address_space *mapping,
                        pgoff_t index, void *expected, void *replacement)
{
        XA_STATE(xas, &mapping->i_pages, index);

        VM_BUG_ON(!expected);
        VM_BUG_ON(!replacement);

        if (xas_load(&xas) != expected)
                return -ENOENT;

        xas_store(&xas, replacement);
        return 0;
}

/*
 * Sometimes, before we decide whether to proceed or to fail, we must check
 * that an entry was not already brought back from swap by a racing thread.
 *
 * Checking folio is not enough: by the time a swapcache folio is locked, it
 * might be reused, and again be swapcache, using the same swap as before.
 */
static bool shmem_confirm_swap(struct address_space *mapping,
                               pgoff_t index, swp_entry_t swap)
{
        /* The value the slot holds while it still refers to this swap entry */
        void *wanted = swp_to_radix_entry(swap);

        return xa_load(&mapping->i_pages, index) == wanted;
}

/*
 * Definitions for "huge tmpfs": tmpfs mounted with the huge= option
 *
 * SHMEM_HUGE_NEVER:
 *        disables huge pages for the mount;
 * SHMEM_HUGE_ALWAYS:
 *        enables huge pages for the mount;
 * SHMEM_HUGE_WITHIN_SIZE:
 *        only allocate huge pages if the page will be fully within i_size,
 *        also respect madvise() hints;
 * SHMEM_HUGE_ADVISE:
 *        only allocate huge pages if requested with madvise();
 */

#define SHMEM_HUGE_NEVER        0
#define SHMEM_HUGE_ALWAYS        1
#define SHMEM_HUGE_WITHIN_SIZE        2
#define SHMEM_HUGE_ADVISE        3

/*
 * Special values.
 * Can only be set via /sys/kernel/mm/transparent_hugepage/shmem_enabled:
 *
 * SHMEM_HUGE_DENY:
 *        disables huge on shm_mnt and all mounts, for emergency use;
 * SHMEM_HUGE_FORCE:
 *        enables huge on shm_mnt and all mounts, without needing the mount
 *        option, for testing;
 */
#define SHMEM_HUGE_DENY                (-1)
#define SHMEM_HUGE_FORCE        (-2)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */

/*
 * shmem_huge is the global setting that shmem_huge_global_enabled() checks
 * against SHMEM_HUGE_DENY and SHMEM_HUGE_FORCE before it looks at the
 * per-mount value.  Both variables start out as SHMEM_HUGE_NEVER.
 * NOTE(review): tmpfs_huge is not referenced in this part of the file;
 * presumably it is the default for tmpfs mounts - confirm.
 */
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
static int tmpfs_huge __read_mostly = SHMEM_HUGE_NEVER;

/**
 * shmem_mapping_size_orders - Get allowable folio orders for the given file size.
 * @mapping: Target address_space.
 * @index: The page index.
 * @write_end: end of a write, could extend inode size.
 *
 * Derives the largest folio order from the distance between @index and
 * @write_end, lowers it when @index is not aligned to that order, and caps
 * it at MAX_PAGECACHE_ORDER.
 *
 * Return: a mask with bits 0 through the chosen order set, or 0 when the
 * mapping does not support large folios, @write_end is 0, or the chosen
 * order is 0.
 */
static inline unsigned int
shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t write_end)
{
        unsigned long align_mask;
        unsigned int order;
        size_t size;

        if (!mapping_large_folio_support(mapping))
                return 0;
        if (!write_end)
                return 0;

        /* Size of the write, measured from the start of page @index */
        size = write_end - (index << PAGE_SHIFT);
        order = filemap_get_order(size);
        if (order == 0)
                return 0;

        /* A misaligned index only allows the order of its lowest set bit */
        align_mask = (1UL << order) - 1;
        if (index & align_mask)
                order = __ffs(index);

        order = min_t(size_t, order, MAX_PAGECACHE_ORDER);
        if (order == 0)
                return 0;

        return BIT(order + 1) - 1;
}

/*
 * Walk @within_size_orders from the highest order downwards and return the
 * remaining mask as soon as a folio of the current order, aligned around
 * @index, ends within the page-rounded larger of @write_end and i_size.
 * Returns 0 when no order qualifies.
 */
static unsigned int shmem_get_orders_within_size(struct inode *inode,
                unsigned long within_size_orders, pgoff_t index,
                loff_t write_end)
{
        unsigned long order;

        for (order = highest_order(within_size_orders);
             within_size_orders;
             order = next_order(&within_size_orders, order)) {
                pgoff_t aligned_index;
                loff_t i_size;

                aligned_index = round_up(index + 1, 1 << order);
                i_size = max(write_end, i_size_read(inode));
                i_size = round_up(i_size, PAGE_SIZE);

                if ((i_size >> PAGE_SHIFT) >= aligned_index)
                        return within_size_orders;
        }

        return 0;
}

/*
 * Work out which huge folio orders the global and per-mount huge settings
 * allow for @inode at @index.
 *
 * Returns a mask of allowed orders, or 0 when huge folios are not allowed:
 * for non-regular files, when shmem_huge is SHMEM_HUGE_DENY, or when the
 * mount's huge setting does not permit them for this request.
 */
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
                                              loff_t write_end, bool shmem_huge_force,
                                              struct vm_area_struct *vma,
                                              unsigned long vm_flags)
{
        /* Only the PMD order bit, and only if the page cache can hold it */
        unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ?
                0 : BIT(HPAGE_PMD_ORDER);
        unsigned long within_size_orders;

        if (!S_ISREG(inode->i_mode))
                return 0;
        if (shmem_huge == SHMEM_HUGE_DENY)
                return 0;
        /* A forced request or the global "force" setting skips the mount option */
        if (shmem_huge_force || shmem_huge == SHMEM_HUGE_FORCE)
                return maybe_pmd_order;

        /*
         * The huge order allocation for anon shmem is controlled through
         * the mTHP interface, so we still use PMD-sized huge order to
         * check whether global control is enabled.
         *
         * For tmpfs mmap()'s huge order, we still use PMD-sized order to
         * allocate huge pages due to lack of a write size hint.
         *
         * Otherwise, tmpfs will allow getting a highest order hint based on
         * the size of write and fallocate paths, then will try each allowable
         * huge orders.
         */
        switch (SHMEM_SB(inode->i_sb)->huge) {
        case SHMEM_HUGE_ALWAYS:
                if (vma)
                        return maybe_pmd_order;

                return shmem_mapping_size_orders(inode->i_mapping, index, write_end);
        case SHMEM_HUGE_WITHIN_SIZE:
                if (vma)
                        within_size_orders = maybe_pmd_order;
                else
                        within_size_orders = shmem_mapping_size_orders(inode->i_mapping,
                                                                       index, write_end);

                /* Keep only the orders that fit within the file size */
                within_size_orders = shmem_get_orders_within_size(inode, within_size_orders,
                                                                  index, write_end);
                if (within_size_orders > 0)
                        return within_size_orders;

                /* Nothing fits: fall back to the madvise() check below */
                fallthrough;
        case SHMEM_HUGE_ADVISE:
                if (vm_flags & VM_HUGEPAGE)
                        return maybe_pmd_order;
                fallthrough;
        default:
                return 0;
        }
}

/*
 * Translate a huge= policy name into its SHMEM_HUGE_* value.
 *
 * Returns -EINVAL for a NULL or unknown name, for any policy other than
 * "never" or "deny" when has_transparent_hugepage() is false, and for
 * "force" when huge_shmem_orders_inherit is not exactly BIT(HPAGE_PMD_ORDER).
 */
static int shmem_parse_huge(const char *str)
{
        static const struct {
                const char *name;
                int huge;
        } policies[] = {
                { "never",        SHMEM_HUGE_NEVER },
                { "always",        SHMEM_HUGE_ALWAYS },
                { "within_size", SHMEM_HUGE_WITHIN_SIZE },
                { "advise",        SHMEM_HUGE_ADVISE },
                { "deny",        SHMEM_HUGE_DENY },
                { "force",        SHMEM_HUGE_FORCE },
                { NULL,                0 },        /* sentinel */
        };
        int i, huge;

        if (!str)
                return -EINVAL;

        for (i = 0; policies[i].name; i++) {
                if (strcmp(str, policies[i].name) == 0)
                        break;
        }
        if (!policies[i].name)
                return -EINVAL;
        huge = policies[i].huge;

        if (!has_transparent_hugepage() &&
            huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
                return -EINVAL;

        /* Do not override huge allocation policy with non-PMD sized mTHP */
        if (huge == SHMEM_HUGE_FORCE &&
            huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
                return -EINVAL;

        return huge;
}

#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
/*
 * Map a SHMEM_HUGE_* value to its policy name.  An unknown value triggers
 * VM_BUG_ON() and yields "bad_val".
 */
static const char *shmem_format_huge(int huge)
{
        if (huge == SHMEM_HUGE_NEVER)
                return "never";
        if (huge == SHMEM_HUGE_ALWAYS)
                return "always";
        if (huge == SHMEM_HUGE_WITHIN_SIZE)
                return "within_size";
        if (huge == SHMEM_HUGE_ADVISE)
                return "advise";
        if (huge == SHMEM_HUGE_DENY)
                return "deny";
        if (huge == SHMEM_HUGE_FORCE)
                return "force";

        VM_BUG_ON(1);
        return "bad_val";
}
#endif

/*
 * Split large folios that straddle the end of file for inodes queued on
 * @sbinfo->shrinklist, so that the pages beyond EOF can be freed.
 *
 * @sbinfo:     superblock info owning the shrinklist.
 * @sc:         shrinker control; when NULL a default batch of 128 inodes is
 *              taken off the shrinklist, otherwise @sc->nr_to_scan.
 * @nr_to_free: stop splitting once this many pages have been freed; 0 means
 *              no limit.
 *
 * Returns the number of folios successfully split, or SHRINK_STOP when the
 * shrinklist is empty on entry.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_free)
{
        LIST_HEAD(list), *pos, *next;
        struct inode *inode;
        struct shmem_inode_info *info;
        struct folio *folio;
        unsigned long batch = sc ? sc->nr_to_scan : 128;
        unsigned long split = 0, freed = 0;

        if (list_empty(&sbinfo->shrinklist))
                return SHRINK_STOP;

        /*
         * Phase 1: under shrinklist_lock, move up to @batch pinned inodes
         * onto a private list. Every entry taken off the shrinklist, whether
         * pinned or dropped, decrements shrinklist_len.
         */
        spin_lock(&sbinfo->shrinklist_lock);
        list_for_each_safe(pos, next, &sbinfo->shrinklist) {
                info = list_entry(pos, struct shmem_inode_info, shrinklist);

                /* pin the inode */
                inode = igrab(&info->vfs_inode);

                /* inode is about to be evicted */
                if (!inode) {
                        list_del_init(&info->shrinklist);
                        goto next;
                }

                list_move(&info->shrinklist, &list);
next:
                sbinfo->shrinklist_len--;
                if (!--batch)
                        break;
        }
        spin_unlock(&sbinfo->shrinklist_lock);

        /*
         * Phase 2: without the lock, try to split the folio at EOF of each
         * pinned inode. Each inode ends up either dropped from the private
         * list or moved back onto the shrinklist, and is then unpinned.
         */
        list_for_each_safe(pos, next, &list) {
                /*
                 * Named next_index (not "next") so that it does not shadow
                 * the list cursor used by list_for_each_safe() above.
                 */
                pgoff_t next_index, end;
                loff_t i_size;
                int ret;

                info = list_entry(pos, struct shmem_inode_info, shrinklist);
                inode = &info->vfs_inode;

                /* Freed enough already: requeue the remaining inodes */
                if (nr_to_free && freed >= nr_to_free)
                        goto move_back;

                i_size = i_size_read(inode);
                folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
                if (!folio || xa_is_value(folio))
                        goto drop;

                /* No large folio at the end of the file: nothing to split */
                if (!folio_test_large(folio)) {
                        folio_put(folio);
                        goto drop;
                }

                /* Check if there is anything to gain from splitting */
                next_index = folio_next_index(folio);
                end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
                if (end <= folio->index || end >= next_index) {
                        folio_put(folio);
                        goto drop;
                }

                /*
                 * Move the inode on the list back to shrinklist if we failed
                 * to lock the page at this time.
                 *
                 * Waiting for the lock may lead to deadlock in the
                 * reclaim path.
                 */
                if (!folio_trylock(folio)) {
                        folio_put(folio);
                        goto move_back;
                }

                ret = split_folio(folio);
                folio_unlock(folio);
                folio_put(folio);

                /* If split failed move the inode on the list back to shrinklist */
                if (ret)
                        goto move_back;

                /* Pages between the fallocate-aware EOF and the folio end */
                freed += next_index - end;
                split++;
drop:
                list_del_init(&info->shrinklist);
                goto put;
move_back:
                /*
                 * Make sure the inode is either on the global list or deleted
                 * from any local list before iput() since it could be deleted
                 * in another thread once we put the inode (then the local list
                 * is corrupted).
                 */
                spin_lock(&sbinfo->shrinklist_lock);
                list_move(&info->shrinklist, &sbinfo->shrinklist);
                sbinfo->shrinklist_len++;
                spin_unlock(&sbinfo->shrinklist_lock);
put:
                iput(inode);
        }

        return split;
}

/*
 * Scan side of the unused-huge shrinker: run the shrink with no page limit
 * (nr_to_free == 0), or report SHRINK_STOP when nothing is queued.
 */
static long shmem_unused_huge_scan(struct super_block *sb,
                struct shrink_control *sc)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

        if (READ_ONCE(sbinfo->shrinklist_len))
                return shmem_unused_huge_shrink(sbinfo, sc, 0);

        return SHRINK_STOP;
}

static long shmem_unused_huge_count(struct super_block *sb,
                struct shrink_control *sc)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        return READ_ONCE(sbinfo->shrinklist_len);
}
#else /* !CONFIG_TRANSPARENT_HUGEPAGE */

#define shmem_huge SHMEM_HUGE_DENY

/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE: there are no huge
 * folios to split, so nothing is ever shrunk and 0 is returned.
 */
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
                struct shrink_control *sc, unsigned long nr_to_free)
{
        return 0;
}

/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE: always returns 0,
 * i.e. no huge orders are enabled, regardless of the arguments.
 */
static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
                                              loff_t write_end, bool shmem_huge_force,
                                              struct vm_area_struct *vma,
                                              unsigned long vm_flags)
{
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Adjust the lruvec statistics for @folio by @nr_pages (callers pass a
 * negative count when removing). NR_FILE_PAGES and NR_SHMEM are always
 * updated; NR_SHMEM_THPS only when the folio is PMD-mappable.
 */
static void shmem_update_stats(struct folio *folio, int nr_pages)
{
        const bool pmd_mappable = folio_test_pmd_mappable(folio);

        if (pmd_mappable)
                __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr_pages);

        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr_pages);
        __lruvec_stat_mod_folio(folio, NR_SHMEM, nr_pages);
}

/*
 * Somewhat like filemap_add_folio, but error if expected item has gone.
 *
 * Insert @folio into @mapping at @index. The folio must be locked and
 * swap-backed, and @index must be aligned to the folio's page count.
 * @expected is the entry the caller believes currently occupies the slot
 * (NULL for an empty slot); @gfp is masked down to GFP_RECLAIM_MASK before
 * use.
 *
 * Returns 0 on success. On failure returns the xarray error (-EEXIST when
 * the slot content does not match @expected), with folio->mapping reset to
 * NULL and the extra references dropped again.
 */
static int shmem_add_to_page_cache(struct folio *folio,
                                   struct address_space *mapping,
                                   pgoff_t index, void *expected, gfp_t gfp)
{
        XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
        long nr = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);

        /* Take one reference per page up front; undone on the error path */
        folio_ref_add(folio, nr);
        folio->mapping = mapping;
        folio->index = index;

        gfp &= GFP_RECLAIM_MASK;
        folio_throttle_swaprate(folio, gfp);

        /*
         * Retry loop: xas_nomem() decides whether to go round again after a
         * failed store (presumably after allocating memory outside the lock
         * - confirm against the XArray API).
         */
        do {
                xas_lock_irq(&xas);
                /* The first conflicting entry must be exactly @expected */
                if (expected != xas_find_conflict(&xas)) {
                        xas_set_err(&xas, -EEXIST);
                        goto unlock;
                }
                /* With an expected entry, any further conflict is an error */
                if (expected && xas_find_conflict(&xas)) {
                        xas_set_err(&xas, -EEXIST);
                        goto unlock;
                }
                xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;
                /* Stored: account the new pages while still under the lock */
                shmem_update_stats(folio, nr);
                mapping->nrpages += nr;
unlock:
                xas_unlock_irq(&xas);
        } while (xas_nomem(&xas, gfp));

        if (xas_error(&xas)) {
                /* Roll back the setup done before the loop */
                folio->mapping = NULL;
                folio_ref_sub(folio, nr);
                return xas_error(&xas);
        }

        return 0;
}

/*
 * Somewhat like filemap_remove_folio, but substitutes swap for @folio.
 *
 * Under the i_pages lock, replace @folio's page cache entry with @radswap,
 * detach the folio from its mapping and subtract its pages from
 * mapping->nrpages and the shmem statistics. The folio's per-page
 * references are dropped after the lock is released. A failure to replace
 * the entry is treated as fatal (BUG_ON).
 */
static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
        struct address_space *mapping = folio->mapping;
        long nr = folio_nr_pages(folio);
        int error;

        xa_lock_irq(&mapping->i_pages);
        error = shmem_replace_entry(mapping, folio->index, folio, radswap);
        folio->mapping = NULL;
        mapping->nrpages -= nr;
        shmem_update_stats(folio, -nr);
        xa_unlock_irq(&mapping->i_pages);
        /* Drop the nr references held on behalf of the page cache */
        folio_put_refs(folio, nr);
        BUG_ON(error);
}

/*
 * Drop the swap entry @radswap stored at @index in @mapping and release the
 * swap space (and swap cache) behind it.
 *
 * Returns the number of pages freed, i.e. 1 << order of the entry, or 0
 * when the slot no longer holds @radswap and nothing was freed.
 */
static long shmem_free_swap(struct address_space *mapping,
                            pgoff_t index, void *radswap)
{
        /* Sample the order before the entry is erased from the XArray */
        int order = xa_get_order(&mapping->i_pages, index);

        if (xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0) != radswap)
                return 0;

        free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
        return 1 << order;
}

/*
 * Report, in bytes, how much of the page range [@start, @end) of the shmem
 * object behind @mapping is currently swapped out.
 *
 * The walk runs under RCU only, without i_rwsem or the i_pages lock, so the
 * inode must stay alive and the caller must tolerate a racy result.
 */
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned long last = end - 1;
        unsigned long nr_swapped = 0;
        struct page *entry;

        rcu_read_lock();
        xas_for_each(&xas, entry, last) {
                if (xas_retry(&xas, entry))
                        continue;

                /* Value entries are swap entries; count every page covered */
                if (xa_is_value(entry))
                        nr_swapped += 1 << xas_get_order(&xas);

                if (xas.xa_index == last)
                        break;

                if (!need_resched())
                        continue;

                xas_pause(&xas);
                cond_resched_rcu();
        }
        rcu_read_unlock();

        return nr_swapped << PAGE_SHIFT;
}

/*
 * Determine (in bytes) how many of the shmem object's pages mapped by the
 * given vma is swapped out.
 *
 * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
 * as long as the inode doesn't go away and racy results are not a problem.
 */
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct address_space *mapping = inode->i_mapping;
        unsigned long swapped;

        /* Be careful as we don't hold info->lock */
        swapped = READ_ONCE(info->swapped);

        /*
         * The easier cases are when the shmem object has nothing in swap, or
         * the vma maps it whole. Then we can simply use the stats that we
         * already track.
         */
        if (!swapped)
                return 0;

        if (!vma->vm_pgoff && vma->vm_end - vma->vm_start >= inode->i_size)
                return swapped << PAGE_SHIFT;

        /* Here comes the more involved part */
        return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
                                        vma->vm_pgoff + vma_pages(vma));
}

/*
 * SysV IPC SHM_UNLOCK: walk every folio of @mapping and let
 * check_move_unevictable_folios() move them back to evictable lists.
 */
void shmem_unlock_mapping(struct address_space *mapping)
{
        struct folio_batch fbatch;
        pgoff_t index = 0;

        folio_batch_init(&fbatch);

        for (;;) {
                /* Minor point, but stop early if someone SHM_LOCKs it again */
                if (mapping_unevictable(mapping))
                        break;

                if (!filemap_get_folios(mapping, &index, ~0UL, &fbatch))
                        break;

                check_move_unevictable_folios(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }
}

/*
 * Look up the folio covering @index for partial truncation.
 *
 * Returns a locked folio with a reference held when one is present in the
 * page cache and still belongs to this inode's mapping; otherwise whatever
 * shmem_get_folio(SGP_READ) leaves in the folio pointer, which stays NULL
 * if it stores nothing. Returns NULL when the slot is empty.
 */
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
        struct folio *cached;
        struct folio *from_swap;

        /*
         * Try the page cache directly first rather than
         * shmem_get_folio(,,,SGP_READ): that fails beyond i_size, and
         * reports fallocated folios as holes.
         */
        cached = filemap_get_entry(inode->i_mapping, index);
        if (!cached)
                return NULL;

        if (!xa_is_value(cached)) {
                folio_lock(cached);
                if (cached->mapping == inode->i_mapping)
                        return cached;

                /* No longer ours: the folio has been swapped out */
                folio_unlock(cached);
                folio_put(cached);
        }

        /*
         * Read the folio back from swap if any of it is within i_size
         * (although in some cases this is just a waste of time).
         */
        from_swap = NULL;
        shmem_get_folio(inode, index, 0, &from_swap, SGP_READ);
        return from_swap;
}

/*
 * Remove range of pages and swap entries from page cache, and free them.
 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
 *
 * @lstart and @lend are byte offsets, @lend inclusive; @lend == -1 means
 * "to the end of the file". The work is done in three stages:
 *   1. a pass over entries that find_lock_entries() hands back,
 *   2. partial-folio handling at both edges of the range (skipped when
 *      @unfalloc), and
 *   3. a second pass over the remaining whole folios, retrying when an
 *      entry changed underneath us.
 * Freed swap is accounted once at the end via shmem_recalc_inode().
 */
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                                                                 bool unfalloc)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        /* First whole page at or after lstart; page index just past lend */
        pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        pgoff_t end = (lend + 1) >> PAGE_SHIFT;
        struct folio_batch fbatch;
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio *folio;
        bool same_folio;
        long nr_swaps_freed = 0;
        pgoff_t index;
        int i;

        if (lend == -1)
                end = -1;        /* unsigned, so actually very big */

        /* Truncation/hole-punch reaching fallocend pulls fallocend back */
        if (info->fallocend > start && info->fallocend <= end && !unfalloc)
                info->fallocend = start;

        /* Stage 1: entries returned (folios already locked) by the lookup */
        folio_batch_init(&fbatch);
        index = start;
        while (index < end && find_lock_entries(mapping, &index, end - 1,
                        &fbatch, indices)) {
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        folio = fbatch.folios[i];

                        /* Swap entry: free it unless undoing a fallocate */
                        if (xa_is_value(folio)) {
                                if (unfalloc)
                                        continue;
                                nr_swaps_freed += shmem_free_swap(mapping,
                                                        indices[i], folio);
                                continue;
                        }

                        /* When unfalloc, only !uptodate folios are removed */
                        if (!unfalloc || !folio_test_uptodate(folio))
                                truncate_inode_folio(mapping, folio);
                        folio_unlock(folio);
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        /*
         * When undoing a failed fallocate, we want none of the partial folio
         * zeroing and splitting below, but shall want to truncate the whole
         * folio when !uptodate indicates that it was added by this fallocate,
         * even when [lstart, lend] covers only a part of the folio.
         */
        if (unfalloc)
                goto whole_folios;

        /* Stage 2: the folio containing lstart (possibly also lend) */
        same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
        if (folio) {
                /* Recompute using the real folio size, which may be large */
                same_folio = lend < folio_pos(folio) + folio_size(folio);
                folio_mark_dirty(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend)) {
                        /* Folio kept: exclude it from the whole-folio pass */
                        start = folio_next_index(folio);
                        if (same_folio)
                                end = folio->index;
                }
                folio_unlock(folio);
                folio_put(folio);
                folio = NULL;
        }

        /* ... and, if different, the folio containing lend */
        if (!same_folio)
                folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
        if (folio) {
                folio_mark_dirty(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend))
                        end = folio->index;
                folio_unlock(folio);
                folio_put(folio);
        }

whole_folios:

        /* Stage 3: whatever is left in [start, end), with retries */
        index = start;
        while (index < end) {
                cond_resched();

                if (!find_get_entries(mapping, &index, end - 1, &fbatch,
                                indices)) {
                        /* If all gone or hole-punch or unfalloc, we're done */
                        if (index == start || end != -1)
                                break;
                        /* But if truncating, restart to make sure all gone */
                        index = start;
                        continue;
                }
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        folio = fbatch.folios[i];

                        if (xa_is_value(folio)) {
                                long swaps_freed;

                                if (unfalloc)
                                        continue;
                                swaps_freed = shmem_free_swap(mapping, indices[i], folio);
                                if (!swaps_freed) {
                                        /* Swap was replaced by page: retry */
                                        index = indices[i];
                                        break;
                                }
                                nr_swaps_freed += swaps_freed;
                                continue;
                        }

                        folio_lock(folio);

                        if (!unfalloc || !folio_test_uptodate(folio)) {
                                if (folio_mapping(folio) != mapping) {
                                        /* Page was replaced by swap: retry */
                                        folio_unlock(folio);
                                        index = indices[i];
                                        break;
                                }
                                VM_BUG_ON_FOLIO(folio_test_writeback(folio),
                                                folio);

                                if (!folio_test_large(folio)) {
                                        truncate_inode_folio(mapping, folio);
                                } else if (truncate_inode_partial_folio(folio, lstart, lend)) {
                                        /*
                                         * If we split a page, reset the loop so
                                         * that we pick up the new sub pages.
                                         * Otherwise the THP was entirely
                                         * dropped or the target range was
                                         * zeroed, so just continue the loop as
                                         * is.
                                         */
                                        if (!folio_test_large(folio)) {
                                                folio_unlock(folio);
                                                index = start;
                                                break;
                                        }
                                }
                        }
                        folio_unlock(folio);
                }
                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
        }

        /* Account all swap freed above in one go */
        shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}

/*
 * Truncate or punch a hole in the byte range [@lstart, @lend] of @inode
 * (shmem_undo_range() with unfalloc == false), then set ctime to the current
 * time, copy that timestamp into mtime, and bump the inode version.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        shmem_undo_range(inode, lstart, lend, false);
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

/*
 * Fill @stat with the attributes of the shmem inode behind @path.
 *
 * Besides the generic attributes this reports the append/immutable/nodump
 * flags from info->fsflags, a PMD-sized blksize when huge pages are enabled
 * for the inode, and the creation time when STATX_BTIME is requested.
 * @query_flags is not used. Always returns 0.
 */
static int shmem_getattr(struct mnt_idmap *idmap,
                         const struct path *path, struct kstat *stat,
                         u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = path->dentry->d_inode;
        struct shmem_inode_info *info = SHMEM_I(inode);

        /* Resync the accounting if it disagrees with the page cache size */
        if (info->alloced - info->swapped != inode->i_mapping->nrpages)
                shmem_recalc_inode(inode, 0, 0);

        if (info->fsflags & FS_APPEND_FL)
                stat->attributes |= STATX_ATTR_APPEND;
        if (info->fsflags & FS_IMMUTABLE_FL)
                stat->attributes |= STATX_ATTR_IMMUTABLE;
        if (info->fsflags & FS_NODUMP_FL)
                stat->attributes |= STATX_ATTR_NODUMP;
        stat->attributes_mask |= (STATX_ATTR_APPEND |
                        STATX_ATTR_IMMUTABLE |
                        STATX_ATTR_NODUMP);

        /*
         * generic_fillattr() reads a number of inode fields (timestamps,
         * size, ...) that other tasks update while holding the inode lock.
         * Take the lock shared so those reads do not race with a concurrent
         * writer and observe a half-updated inode.
         */
        inode_lock_shared(inode);
        generic_fillattr(idmap, request_mask, inode, stat);
        inode_unlock_shared(inode);

        if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
                stat->blksize = HPAGE_PMD_SIZE;

        if (request_mask & STATX_BTIME) {
                stat->result_mask |= STATX_BTIME;
                stat->btime.tv_sec = info->i_crtime.tv_sec;
                stat->btime.tv_nsec = info->i_crtime.tv_nsec;
        }

        return 0;
}

/*
 * Apply the attribute changes in @attr to the shmem inode behind @dentry.
 *
 * Enforces the exec/shrink/grow seals, re-accounts and applies size changes
 * (truncating the page cache and unmapping when the file does not grow),
 * transfers quota on owner changes, then copies the remaining attributes
 * and updates the ACL on mode changes.
 *
 * Returns 0 on success, -EPERM when a seal forbids the change, or the error
 * from setattr_prepare(), shmem_reacct_size(), dquot_initialize(),
 * dquot_transfer() or posix_acl_chmod().
 */
static int shmem_setattr(struct mnt_idmap *idmap,
                         struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);
        int error;
        bool update_mtime = false;
        bool update_ctime = true;

        error = setattr_prepare(idmap, dentry, attr);
        if (error)
                return error;

        /* F_SEAL_EXEC forbids toggling any of the execute bits */
        if ((info->seals & F_SEAL_EXEC) && (attr->ia_valid & ATTR_MODE)) {
                if ((inode->i_mode ^ attr->ia_mode) & 0111) {
                        return -EPERM;
                }
        }

        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;

                /* protected by i_rwsem */
                if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
                    (newsize > oldsize && (info->seals & F_SEAL_GROW)))
                        return -EPERM;

                if (newsize != oldsize) {
                        error = shmem_reacct_size(SHMEM_I(inode)->flags,
                                        oldsize, newsize);
                        if (error)
                                return error;
                        i_size_write(inode, newsize);
                        update_mtime = true;
                } else {
                        /* Size unchanged: leave the timestamps alone */
                        update_ctime = false;
                }
                /* Also runs when newsize == oldsize */
                if (newsize <= oldsize) {
                        loff_t holebegin = round_up(newsize, PAGE_SIZE);
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);
                        if (info->alloced)
                                shmem_truncate_range(inode,
                                                        newsize, (loff_t)-1);
                        /* unmap again to remove racily COWed private pages */
                        if (oldsize > holebegin)
                                unmap_mapping_range(inode->i_mapping,
                                                        holebegin, 0, 1);
                }
        }

        if (is_quota_modification(idmap, inode, attr)) {
                error = dquot_initialize(inode);
                if (error)
                        return error;
        }

        /* Transfer quota accounting */
        if (i_uid_needs_update(idmap, attr, inode) ||
            i_gid_needs_update(idmap, attr, inode)) {
                error = dquot_transfer(idmap, inode, attr);
                if (error)
                        return error;
        }

        setattr_copy(idmap, inode, attr);
        if (attr->ia_valid & ATTR_MODE)
                error = posix_acl_chmod(idmap, dentry, inode->i_mode);
        /* Timestamps and i_version are only touched on success */
        if (!error && update_ctime) {
                inode_set_ctime_current(inode);
                if (update_mtime)
                        inode_set_mtime_to_ts(inode, inode_get_ctime(inode));
                inode_inc_iversion(inode);
        }
        return error;
}

/*
 * Tear down a shmem inode that is being evicted.
 *
 * For inodes whose mapping is a shmem mapping this releases the size
 * accounting, truncates all pages and swap, and unhooks the inode from the
 * superblock shrinklist and the global swaplist. All inodes then have their
 * xattrs freed, the inode accounting released and, with CONFIG_TMPFS_QUOTA,
 * their quota references dropped.
 */
static void shmem_evict_inode(struct inode *inode)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        size_t freed = 0;

        if (shmem_mapping(inode->i_mapping)) {
                shmem_unacct_size(info->flags, inode->i_size);
                inode->i_size = 0;
                mapping_set_exiting(inode->i_mapping);
                shmem_truncate_range(inode, 0, (loff_t)-1);
                /* Unlocked peek first, rechecked under shrinklist_lock */
                if (!list_empty(&info->shrinklist)) {
                        spin_lock(&sbinfo->shrinklist_lock);
                        if (!list_empty(&info->shrinklist)) {
                                list_del_init(&info->shrinklist);
                                sbinfo->shrinklist_len--;
                        }
                        spin_unlock(&sbinfo->shrinklist_lock);
                }
                while (!list_empty(&info->swaplist)) {
                        /* Wait while shmem_unuse() is scanning this inode... */
                        wait_var_event(&info->stop_eviction,
                                       !atomic_read(&info->stop_eviction));
                        mutex_lock(&shmem_swaplist_mutex);
                        /* ...but beware of the race if we peeked too early */
                        if (!atomic_read(&info->stop_eviction))
                                list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
        }

        /* The freed xattr count is only collected when max_inodes is set */
        simple_xattrs_free(&info->xattrs, sbinfo->max_inodes ? &freed : NULL);
        shmem_free_inode(inode->i_sb, freed);
        WARN_ON(inode->i_blocks);
        clear_inode(inode);
#ifdef CONFIG_TMPFS_QUOTA
        dquot_free_inode(inode);
        dquot_drop(inode);
#endif
}

/*
 * Collect swap entries of swap device @type found in @mapping at or after
 * index @start. Each hit is appended to @fbatch and its index recorded at
 * the same position in @indices; the walk stops once the batch is full.
 *
 * Returns the number of entries now held in @fbatch.
 */
static unsigned int shmem_find_swap_entries(struct address_space *mapping,
                                pgoff_t start, struct folio_batch *fbatch,
                                pgoff_t *indices, unsigned int type)
{
        XA_STATE(xas, &mapping->i_pages, start);
        struct folio *folio;

        rcu_read_lock();
        xas_for_each(&xas, folio, ULONG_MAX) {
                /* Only value (swap) entries are of interest here */
                if (xas_retry(&xas, folio) || !xa_is_value(folio))
                        continue;

                /*
                 * swapin error entries can be found in the mapping. But they're
                 * deliberately ignored here as we've done everything we can do.
                 */
                if (swp_type(radix_to_swp_entry(folio)) != type)
                        continue;

                indices[folio_batch_count(fbatch)] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;

                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();

        return folio_batch_count(fbatch);
}

/*
 * Swap the entries collected in @fbatch (located at @indices) back into the
 * page cache of @inode.
 *
 * Returns the number of entries swapped in. Processing stops early and
 * -ENOMEM is returned if a swap-in runs out of memory; any other swap-in
 * error just skips that entry.
 */
static int shmem_unuse_swap_entries(struct inode *inode,
                struct folio_batch *fbatch, pgoff_t *indices)
{
        struct address_space *mapping = inode->i_mapping;
        int nr_swapped_in = 0;
        int i;

        for (i = 0; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];
                int error;

                if (!xa_is_value(folio))
                        continue;

                error = shmem_swapin_folio(inode, indices[i], &folio, SGP_CACHE,
                                        mapping_gfp_mask(mapping), NULL, NULL);
                if (error == -ENOMEM)
                        return error;
                if (error)
                        continue;

                /* Success: release the folio handed back by the swap-in */
                folio_unlock(folio);
                folio_put(folio);
                nr_swapped_in++;
        }

        return nr_swapped_in;
}

/*
 * Bring every swap entry of swap device @type found in @inode back into the
 * page cache, one batch at a time.
 *
 * Returns 0 once no matching entries remain, or the negative error from
 * shmem_unuse_swap_entries().
 */
static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        pgoff_t start = 0;
        int ret;

        for (;;) {
                folio_batch_init(&fbatch);
                if (!shmem_find_swap_entries(mapping, start, &fbatch,
                                             indices, type))
                        return 0;

                ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
                if (ret < 0)
                        return ret;

                /* Resume the search from the last index of this batch */
                start = indices[folio_batch_count(&fbatch) - 1];
        }
}

/*
 * Read all the shared memory data that resides in the swap
 * device 'type' back into memory, so the swap device can be
 * unused.
 *
 * Walks shmem_swaplist under shmem_swaplist_mutex, dropping the mutex while
 * each inode is scanned. Inodes found to have nothing swapped are removed
 * from the list along the way.
 *
 * Returns 0 on success, or the first error from shmem_unuse_inode(), at
 * which point the walk stops.
 */
int shmem_unuse(unsigned int type)
{
        struct shmem_inode_info *info, *next;
        int error = 0;

        /* Unlocked fast path: nothing has ever been put on the swaplist */
        if (list_empty(&shmem_swaplist))
                return 0;

        mutex_lock(&shmem_swaplist_mutex);
        list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
                if (!info->swapped) {
                        list_del_init(&info->swaplist);
                        continue;
                }
                /*
                 * Drop the swaplist mutex while searching the inode for swap;
                 * but before doing so, make sure shmem_evict_inode() will not
                 * remove placeholder inode from swaplist, nor let it be freed
                 * (igrab() would protect from unlink, but not from unmount).
                 */
                atomic_inc(&info->stop_eviction);
                mutex_unlock(&shmem_swaplist_mutex);

                error = shmem_unuse_inode(&info->vfs_inode, type);
                cond_resched();

                mutex_lock(&shmem_swaplist_mutex);
                /* The list may have changed while unlocked: refetch next */
                next = list_next_entry(info, swaplist);
                if (!info->swapped)
                        list_del_init(&info->swaplist);
                /* Last holder wakes an evictor waiting on stop_eviction */
                if (atomic_dec_and_test(&info->stop_eviction))
                        wake_up_var(&info->stop_eviction);
                if (error)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);

        return error;
}

/*
 * Move the page from the page cache to the swap cache.
 *
 * Returns the result of swap_writepage() once the folio has been moved to
 * swap.  If the folio cannot be swapped it is redirtied: the function then
 * returns AOP_WRITEPAGE_ACTIVATE with the folio still locked when called
 * for reclaim, or 0 with the folio unlocked otherwise.
 */
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
        struct folio *folio = page_folio(page);
        struct address_space *mapping = folio->mapping;
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        pgoff_t index;
        int nr_pages;
        bool split = false;

        /*
         * Our capabilities prevent regular writeback or sync from ever calling
         * shmem_writepage; but a stacking filesystem might use ->writepage of
         * its underlying filesystem, in which case tmpfs should write out to
         * swap only in response to memory pressure, and not for the writeback
         * threads or sync.
         */
        if (WARN_ON_ONCE(!wbc->for_reclaim))
                goto redirty;

        if ((info->flags & VM_LOCKED) || sbinfo->noswap)
                goto redirty;

        if (!total_swap_pages)
                goto redirty;

        /*
         * If CONFIG_THP_SWAP is not enabled, the large folio should be
         * split when swapping.
         *
         * And shrinkage of pages beyond i_size does not split swap, so
         * swapout of a large folio crossing i_size needs to split too
         * (unless fallocate has been used to preallocate beyond EOF).
         */
        if (folio_test_large(folio)) {
                index = shmem_fallocend(inode,
                        DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
                if ((index > folio->index && index < folio_next_index(folio)) ||
                    !IS_ENABLED(CONFIG_THP_SWAP))
                        split = true;
        }

        if (split) {
try_split:
                /* Ensure the subpages are still dirty */
                folio_test_set_dirty(folio);
                if (split_huge_page_to_list_to_order(page, wbc->list, 0))
                        goto redirty;
                folio = page_folio(page);
                folio_clear_dirty(folio);
        }

        index = folio->index;
        nr_pages = folio_nr_pages(folio);

        /*
         * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
         * value into swapfile.c, the only way we can correctly account for a
         * fallocated folio arriving here is now to initialize it and write it.
         *
         * That's okay for a folio already fallocated earlier, but if we have
         * not yet completed the fallocation, then (a) we want to keep track
         * of this folio in case we have to undo it, and (b) it may not be a
         * good idea to continue anyway, once we're pushing into swap.  So
         * reactivate the folio, and let shmem_fallocate() quit when too many.
         */
        if (!folio_test_uptodate(folio)) {
                if (inode->i_private) {
                        struct shmem_falloc *shmem_falloc;
                        spin_lock(&inode->i_lock);
                        shmem_falloc = inode->i_private;
                        if (shmem_falloc &&
                            !shmem_falloc->waitq &&
                            index >= shmem_falloc->start &&
                            index < shmem_falloc->next)
                                shmem_falloc->nr_unswapped += nr_pages;
                        else
                                shmem_falloc = NULL;
                        spin_unlock(&inode->i_lock);
                        if (shmem_falloc)
                                goto redirty;
                }
                folio_zero_range(folio, 0, folio_size(folio));
                flush_dcache_folio(folio);
                folio_mark_uptodate(folio);
        }

        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
         * if it's not already there.  Do it now before the folio is
         * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
         * we've incremented swapped, because shmem_unuse() will
         * prune a !swapped inode from the swaplist under this mutex.
         */
        mutex_lock(&shmem_swaplist_mutex);
        if (list_empty(&info->swaplist))
                list_add(&info->swaplist, &shmem_swaplist);

        if (!folio_alloc_swap(folio, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN)) {
                shmem_recalc_inode(inode, 0, nr_pages);
                swap_shmem_alloc(folio->swap, nr_pages);
                shmem_delete_from_page_cache(folio, swp_to_radix_entry(folio->swap));

                mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(folio_mapped(folio));
                return swap_writepage(&folio->page, wbc);
        }

        /*
         * Swap allocation failed for this folio, but the inode may already
         * have other folios out on swap: only take it off the swaplist when
         * nothing of it is swapped, otherwise shmem_unuse() would no longer
         * find the entries it still holds.
         */
        if (!info->swapped)
                list_del_init(&info->swaplist);
        mutex_unlock(&shmem_swaplist_mutex);
        if (nr_pages > 1)
                goto try_split;
redirty:
        folio_mark_dirty(folio);
        if (wbc->for_reclaim)
                return AOP_WRITEPAGE_ACTIVATE;        /* Return with folio locked */
        folio_unlock(folio);
        return 0;
}

#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
/*
 * Emit ",mpol=<policy>" into @seq, unless there is no policy or the
 * policy mode is MPOL_DEFAULT, in which case nothing is printed.
 */
static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
        char policy_str[64];

        if (mpol && mpol->mode != MPOL_DEFAULT) {
                mpol_to_str(policy_str, sizeof(policy_str), mpol);
                seq_printf(seq, ",mpol=%s", policy_str);
        }
}

/*
 * Return the superblock's mempolicy with a reference taken via mpol_get(),
 * or NULL when the superblock has none.
 */
static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
        struct mempolicy *policy;

        if (!sbinfo->mpol)
                return NULL;

        /* stat_lock prevents replace/use races on sbinfo->mpol */
        raw_spin_lock(&sbinfo->stat_lock);
        policy = sbinfo->mpol;
        mpol_get(policy);
        raw_spin_unlock(&sbinfo->stat_lock);

        return policy;
}
#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
/* Stub used when CONFIG_NUMA or CONFIG_TMPFS is not set: prints nothing. */
static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
{
}
/* Stub used when CONFIG_NUMA or CONFIG_TMPFS is not set: never returns a policy. */
static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
        return NULL;
}
#endif /* CONFIG_NUMA && CONFIG_TMPFS */

/*
 * Forward declaration, so that the swapin and folio allocation helpers
 * below can call it ahead of its definition.
 */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
                        pgoff_t index, unsigned int order, pgoff_t *ilx);

/*
 * Read the folio for @swap through swap_cluster_readahead(), using the
 * mempolicy looked up for @index of @info (order 0).  Returns whatever
 * swap_cluster_readahead() returns.
 */
static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
                        struct shmem_inode_info *info, pgoff_t index)
{
        pgoff_t interleave_idx;
        struct mempolicy *policy;
        struct folio *result;

        policy = shmem_get_pgoff_policy(info, index, 0, &interleave_idx);
        result = swap_cluster_readahead(swap, gfp, policy, interleave_idx);
        /* Done with the policy: release it via mpol_cond_put() */
        mpol_cond_put(policy);

        return result;
}

/*
 * Combine @huge_gfp with @limit_gfp so that the result is never more
 * permissive than @limit_gfp.  Some gfp flags grant permissions, others
 * impose limitations, so they are merged differently:
 *  - zone bits come from @limit_gfp only;
 *  - "deny" flags (__GFP_NOWARN, __GFP_NORETRY) set in @limit_gfp are added;
 *  - "allow" flags (__GFP_IO, __GFP_FS, __GFP_RECLAIM) survive only when set
 *    in both masks;
 *  - every other bit is taken from @huge_gfp.
 */
static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
        const gfp_t allow = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
        const gfp_t deny = __GFP_NOWARN | __GFP_NORETRY;
        gfp_t merged;

        /* Start from huge_gfp, stripped of the allow flags and zone bits */
        merged = huge_gfp & ~(allow | GFP_ZONEMASK);

        /* Zones as originally specified, plus any limiting deny flags */
        merged |= limit_gfp & (GFP_ZONEMASK | deny);

        /* Allow flags only where both masks agree */
        merged |= huge_gfp & limit_gfp & allow;

        return merged;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Report whether the HPAGE_PMD_ORDER bit is enabled for shmem: false when
 * shmem_huge is SHMEM_HUGE_DENY; otherwise true if the bit is set in the
 * always, madvise or within_size order masks, or if it is set in the
 * inherit mask while shmem_huge is not SHMEM_HUGE_NEVER.
 */
bool shmem_hpage_pmd_enabled(void)
{
        if (shmem_huge == SHMEM_HUGE_DENY)
                return false;

        if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always) ||
            test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise) ||
            test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size))
                return true;

        return test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) &&
               shmem_huge != SHMEM_HUGE_NEVER;
}

/*
 * Compute the bitmask of folio orders that may be used for @index of
 * @inode.
 *
 * Returns 0 when THP is disabled by hardware or for @vma.  For anything
 * other than an anonymous shmem vma (including @vma == NULL) the result
 * of shmem_huge_global_enabled() is returned directly.  For anonymous
 * shmem the per-order always/within_size/madvise/inherit masks are
 * combined, subject to the top-level shmem_huge 'deny' and 'force'
 * settings.
 */
unsigned long shmem_allowable_huge_orders(struct inode *inode,
                                struct vm_area_struct *vma, pgoff_t index,
                                loff_t write_end, bool shmem_huge_force)
{
        unsigned long allowed = READ_ONCE(huge_shmem_orders_always);
        unsigned long within_size = READ_ONCE(huge_shmem_orders_within_size);
        unsigned long vm_flags = 0;
        unsigned int mount_orders;

        if (vma)
                vm_flags = vma->vm_flags;

        if (thp_disabled_by_hw())
                return 0;
        if (vma && vma_thp_disabled(vma, vm_flags))
                return 0;

        mount_orders = shmem_huge_global_enabled(inode, index, write_end,
                                                 shmem_huge_force, vma, vm_flags);

        /* tmpfs huge page allocation: decided by the global setting alone */
        if (!(vma && vma_is_anon_shmem(vma)))
                return mount_orders;

        /* Top-level 'deny' switches the huge option off for all mounts */
        if (shmem_huge == SHMEM_HUGE_DENY)
                return 0;

        /*
         * Top-level 'force' permits just the inherit orders, so non-PMD
         * sized THP can not override the 'huge' mount option for now.
         */
        if (shmem_huge == SHMEM_HUGE_FORCE)
                return READ_ONCE(huge_shmem_orders_inherit);

        /* mTHP orders that would lie fully within i_size */
        allowed |= shmem_get_orders_within_size(inode, within_size, index, 0);

        if (vm_flags & VM_HUGEPAGE)
                allowed |= READ_ONCE(huge_shmem_orders_madvise);

        if (mount_orders)
                allowed |= READ_ONCE(huge_shmem_orders_inherit);

        return allowed & THP_ORDERS_ALL_FILE_DEFAULT;
}

/*
 * Narrow @orders down to those usable at @index: first by what the vma
 * (if any) permits at the fault address, then by dropping orders, highest
 * first, until one is found whose aligned range has nothing present in the
 * page cache.  Returns the remaining order mask, 0 if none is suitable.
 */
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
                                           struct address_space *mapping, pgoff_t index,
                                           unsigned long orders)
{
        struct vm_area_struct *vma = NULL;
        unsigned long nr;
        pgoff_t first, last;
        int order;

        if (vmf)
                vma = vmf->vma;

        if (vma) {
                orders = thp_vma_suitable_orders(vma, vmf->address, orders);
                if (!orders)
                        return 0;
        }

        /* Walk down from the highest order that could go into the page cache */
        for (order = highest_order(orders); orders;
             order = next_order(&orders, order)) {
                nr = 1UL << order;
                first = round_down(index, nr);
                last = first + nr - 1;
                /*
                 * Look for a conflict before waiting on a huge allocation:
                 * a racing thread may just have added a huge page here, or
                 * there may already be a small page somewhere in the extent.
                 * An empty range means this order is usable.
                 */
                if (!xa_find(&mapping->i_pages, &first, last, XA_PRESENT))
                        break;
        }

        return orders;
}
#else
/* Stub used without CONFIG_TRANSPARENT_HUGEPAGE: no order is ever suitable. */
static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault *vmf,
                                           struct address_space *mapping, pgoff_t index,
                                           unsigned long orders)
{
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Allocate a folio of the given @order with folio_alloc_mpol(), using the
 * mempolicy looked up for @index of @info and the current NUMA node.
 * Returns whatever folio_alloc_mpol() returns.
 */
static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
                struct shmem_inode_info *info, pgoff_t index)
{
        pgoff_t interleave_idx;
        struct mempolicy *policy;
        struct folio *result;

        policy = shmem_get_pgoff_policy(info, index, order, &interleave_idx);
        result = folio_alloc_mpol(gfp, order, policy, interleave_idx,
                                  numa_node_id());
        /* Done with the policy: release it via mpol_cond_put() */
        mpol_cond_put(policy);

        return result;
}

/*
 * Allocate a new folio for @index of @inode, charge it to the memcg of
 * @fault_mm, insert it into the page cache and account its blocks.
 *
 * When @orders is non-zero (and CONFIG_TRANSPARENT_HUGEPAGE is enabled),
 * the suitable orders are tried from the highest downwards; otherwise a
 * single order-0 folio is allocated.
 *
 * Returns the new folio, locked and added to the LRU, or an ERR_PTR():
 * -ENOMEM if no folio could be allocated, -EEXIST if the memcg charge
 * failed while something is already present in the range, or the error
 * returned by mem_cgroup_charge(), shmem_add_to_page_cache() or
 * shmem_inode_acct_blocks().
 */
static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
                gfp_t gfp, struct inode *inode, pgoff_t index,
                struct mm_struct *fault_mm, unsigned long orders)
{
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
        unsigned long suitable_orders = 0;
        struct folio *folio = NULL;
        long pages;
        int error, order;

        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                orders = 0;

        if (orders > 0) {
                suitable_orders = shmem_suitable_orders(inode, vmf,
                                                        mapping, index, orders);

                /* Try each suitable order in turn, starting from the highest */
                order = highest_order(suitable_orders);
                while (suitable_orders) {
                        pages = 1UL << order;
                        index = round_down(index, pages);
                        folio = shmem_alloc_folio(gfp, order, info, index);
                        if (folio)
                                goto allocated;

                        /* Allocation at this order failed: count the fallback */
                        if (pages == HPAGE_PMD_NR)
                                count_vm_event(THP_FILE_FALLBACK);
                        count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
                        order = next_order(&suitable_orders, order);
                }
        } else {
                pages = 1;
                folio = shmem_alloc_folio(gfp, 0, info, index);
        }
        if (!folio)
                return ERR_PTR(-ENOMEM);

allocated:
        __folio_set_locked(folio);
        __folio_set_swapbacked(folio);

        gfp &= GFP_RECLAIM_MASK;
        error = mem_cgroup_charge(folio, fault_mm, gfp);
        if (error) {
                /*
                 * If something is now present in the range, report -EEXIST
                 * instead of the charge error; otherwise count a large
                 * folio charge failure as a fallback.
                 */
                if (xa_find(&mapping->i_pages, &index,
                                index + pages - 1, XA_PRESENT)) {
                        error = -EEXIST;
                } else if (pages > 1) {
                        if (pages == HPAGE_PMD_NR) {
                                count_vm_event(THP_FILE_FALLBACK);
                                count_vm_event(THP_FILE_FALLBACK_CHARGE);
                        }
                        count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
                        count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
                }
                goto unlock;
        }

        error = shmem_add_to_page_cache(folio, mapping, index, NULL, gfp);
        if (error)
                goto unlock;

        error = shmem_inode_acct_blocks(inode, pages);
        if (error) {
                struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
                long freed;
                /*
                 * Try to reclaim some space by splitting a few
                 * large folios beyond i_size on the filesystem.
                 */
                shmem_unused_huge_shrink(sbinfo, NULL, pages);
                /*
                 * And do a shmem_recalc_inode() to account for freed pages:
                 * except our folio is there in cache, so not quite balanced.
                 */
                spin_lock(&info->lock);
                freed = pages + info->alloced - info->swapped -
                        READ_ONCE(mapping->nrpages);
                if (freed > 0)
                        info->alloced -= freed;
                spin_unlock(&info->lock);
                if (freed > 0)
                        shmem_inode_unacct_blocks(inode, freed);
                /* Retry the accounting once after the shrink attempt */
                error = shmem_inode_acct_blocks(inode, pages);
                if (error) {
                        filemap_remove_folio(folio);
                        goto unlock;
                }
        }

        shmem_recalc_inode(inode, pages, 0);
        folio_add_lru(folio);
        return folio;

unlock:
        folio_unlock(folio);
        folio_put(folio);
        return ERR_PTR(error);
}

/*
 * Allocate a folio of @order for swapping in @entry without going through
 * the swap cache, charge it, claim the swap entries with
 * swapcache_prepare() and start the read with swap_read_folio().
 *
 * Returns the new folio, or ERR_PTR(-ENOMEM) if allocation or the memcg
 * charge failed, or ERR_PTR(-EEXIST) if swapcache_prepare() failed.
 */
static struct folio *shmem_swap_alloc_folio(struct inode *inode,
                struct vm_area_struct *vma, pgoff_t index,
                swp_entry_t entry, int order, gfp_t gfp)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct folio *new;
        void *shadow;
        int nr_pages;

        /*
         * We have arrived here because our zones are constrained, so don't
         * limit chance of success with further cpuset and node constraints.
         */
        gfp &= ~GFP_CONSTRAINT_MASK;
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && order > 0) {
                gfp_t huge_gfp = vma_thp_gfp_mask(vma);

                /* Large folio: merge the THP gfp mask with the caller's limits */
                gfp = limit_gfp_mask(huge_gfp, gfp);
        }

        new = shmem_alloc_folio(gfp, order, info, index);
        if (!new)
                return ERR_PTR(-ENOMEM);

        nr_pages = folio_nr_pages(new);
        if (mem_cgroup_swapin_charge_folio(new, vma ? vma->vm_mm : NULL,
                                           gfp, entry)) {
                folio_put(new);
                return ERR_PTR(-ENOMEM);
        }

        /*
         * Prevent parallel swapin from proceeding with the swap cache flag.
         *
         * Of course there is another possible concurrent scenario as well,
         * that is to say, the swap cache flag of a large folio has already
         * been set by swapcache_prepare(), while another thread may have
         * already split the large swap entry stored in the shmem mapping.
         * In this case, shmem_add_to_page_cache() will help identify the
         * concurrent swapin and return -EEXIST.
         */
        if (swapcache_prepare(entry, nr_pages)) {
                folio_put(new);
                return ERR_PTR(-EEXIST);
        }

        __folio_set_locked(new);
        __folio_set_swapbacked(new);
        new->swap = entry;

        memcg1_swapin(entry, nr_pages);
        /* Feed any shadow entry found for this swap entry to workingset_refault() */
        shadow = get_shadow_from_swap_cache(entry);
        if (shadow)
                workingset_refault(new, shadow);
        folio_add_lru(new);
        swap_read_folio(new, NULL);
        return new;
}

/*
 * A page moving from swapcache to shmem filecache (through the usual
 * swapin of shmem_get_folio_gfp(), or through the rarer swapoff path of
 * shmem_unuse_inode()) may have been read in from swap earlier, without
 * knowledge of the mapping it belongs to.  If that mapping has special
 * constraints (the gma500 GEM driver, for example, needs RAM below 4GB),
 * the data may have to be copied to a suitable page before it is moved
 * to filecache.
 *
 * A future release may well extend this to respect cpuset and NUMA
 * mempolicy, and apply it to anonymous pages in do_swap_page() too;
 * for now it is purely a question of zone: the folio must be replaced
 * when it sits in a higher zone than @gfp allows.
 */
static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
        return gfp_zone(gfp) < folio_zonenum(folio);
}

/*
 * Replace the swap cache folio *@foliop with a newly allocated copy of the
 * same order, allocated under @gfp (without cpuset/node constraints).
 *
 * On success, *@foliop is updated to the new folio, which is left locked,
 * while the old folio is unlocked and its references dropped; returns 0.
 * Returns -ENOMEM if the new folio cannot be allocated, or -ENOENT if the
 * swap cache no longer holds the old folio; in both cases *@foliop is left
 * unchanged.
 */
static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
                                struct shmem_inode_info *info, pgoff_t index,
                                struct vm_area_struct *vma)
{
        struct folio *new, *old = *foliop;
        swp_entry_t entry = old->swap;
        struct address_space *swap_mapping = swap_address_space(entry);
        pgoff_t swap_index = swap_cache_index(entry);
        XA_STATE(xas, &swap_mapping->i_pages, swap_index);
        int nr_pages = folio_nr_pages(old);
        int error = 0, i;

        /*
         * We have arrived here because our zones are constrained, so don't
         * limit chance of success by further cpuset and node constraints.
         */
        gfp &= ~GFP_CONSTRAINT_MASK;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (nr_pages > 1) {
                gfp_t huge_gfp = vma_thp_gfp_mask(vma);

                gfp = limit_gfp_mask(huge_gfp, gfp);
        }
#endif

        new = shmem_alloc_folio(gfp, folio_order(old), info, index);
        if (!new)
                return -ENOMEM;

        /* One extra reference per page, dropped again below on the retired folio */
        folio_ref_add(new, nr_pages);
        folio_copy(new, old);
        flush_dcache_folio(new);

        __folio_set_locked(new);
        __folio_set_swapbacked(new);
        folio_mark_uptodate(new);
        new->swap = entry;
        folio_set_swapcache(new);

        /* Swap cache still stores N entries instead of a high-order entry */
        xa_lock_irq(&swap_mapping->i_pages);
        for (i = 0; i < nr_pages; i++) {
                void *item = xas_load(&xas);

                /* The slot must still point at the old folio */
                if (item != old) {
                        error = -ENOENT;
                        break;
                }

                xas_store(&xas, new);
                xas_next(&xas);
        }
        if (!error) {
                mem_cgroup_replace_folio(old, new);
                shmem_update_stats(new, nr_pages);
                shmem_update_stats(old, -nr_pages);
        }
        xa_unlock_irq(&swap_mapping->i_pages);

        if (unlikely(error)) {
                /*
                 * Is this possible?  I think not, now that our callers
                 * check both the swapcache flag and folio->private
                 * after getting the folio lock; but be defensive.
                 * Reverse old to newpage for clear and free.
                 */
                old = new;
        } else {
                folio_add_lru(new);
                *foliop = new;
        }

        /* From here on 'old' is whichever folio is being discarded */
        folio_clear_swapcache(old);
        old->private = NULL;

        folio_unlock(old);
        /*
         * The old folio has been removed from the swap cache: drop the
         * 'nr_pages' references, as well as the one temporary reference
         * obtained from the swap cache.
         */
        folio_put_refs(old, nr_pages + 1);
        return error;
}

/*
 * Record a swapin failure for @index of @inode: replace the @swap entry in
 * the mapping with a poisoned swap entry, then release the swap that
 * backed @folio.  Does nothing if the mapping no longer holds @swap at
 * @index.  With @skip_swapcache set, the folio is not deleted from the
 * swap cache.
 */
static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
                                         struct folio *folio, swp_entry_t swap,
                                         bool skip_swapcache)
{
        struct address_space *mapping = inode->i_mapping;
        swp_entry_t poisoned = make_poisoned_swp_entry();
        void *expected = swp_to_radix_entry(swap);
        void *found;
        int nr;

        found = xa_cmpxchg_irq(&mapping->i_pages, index, expected,
                               swp_to_radix_entry(poisoned), 0);
        if (found != expected)
                return;

        nr = folio_nr_pages(folio);
        folio_wait_writeback(folio);
        if (!skip_swapcache)
                delete_from_swap_cache(folio);
        /*
         * A folio that failed swapin must not stay counted as alloced:
         * otherwise inode->i_blocks would be non-zero when the inode is
         * released, triggering WARN_ON(i_blocks) in shmem_evict_inode().
         */
        shmem_recalc_inode(inode, -nr, -nr);
        swap_free_nr(swap, nr);
}

/*
 * If the swap entry @swap stored at @index of the mapping is a large
 * (order > 0) entry, split it so that @index ends up covered by an
 * order-0 entry, re-storing swap entries with consecutive offsets for the
 * pieces produced by each split step.
 *
 * Returns the order the entry had before splitting (0 if it was already
 * order 0 and nothing was done), or a negative error: -EEXIST if the
 * mapping does not hold @swap at @index, otherwise the error recorded in
 * the xa_state.
 */
static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
                                   swp_entry_t swap, gfp_t gfp)
{
        struct address_space *mapping = inode->i_mapping;
        XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
        int split_order = 0, entry_order;
        int i;

        /* Convert user data gfp flags to xarray node gfp flags */
        gfp &= GFP_RECLAIM_MASK;

        /* Repeat for as long as xas_nomem() below asks for a retry */
        for (;;) {
                void *old = NULL;
                int cur_order;
                pgoff_t swap_index;

                xas_lock_irq(&xas);
                old = xas_load(&xas);
                /* The slot must still hold exactly the expected swap entry */
                if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
                        xas_set_err(&xas, -EEXIST);
                        goto unlock;
                }

                entry_order = xas_get_order(&xas);

                /* Already an order-0 entry: nothing to split */
                if (!entry_order)
                        goto unlock;

                /* Try to split large swap entry in pagecache */
                cur_order = entry_order;
                swap_index = round_down(index, 1 << entry_order);

                split_order = xas_try_split_min_order(cur_order);

                /* Split step by step until the entry at index is order 0 */
                while (cur_order > 0) {
                        pgoff_t aligned_index =
                                round_down(index, 1 << cur_order);
                        pgoff_t swap_offset = aligned_index - swap_index;

                        xas_set_order(&xas, index, split_order);
                        xas_try_split(&xas, old, cur_order);
                        if (xas_error(&xas))
                                goto unlock;

                        /*
                         * Re-set the swap entry after splitting, and the swap
                         * offset of the original large entry must be continuous.
                         */
                        for (i = 0; i < 1 << cur_order;
                             i += (1 << split_order)) {
                                swp_entry_t tmp;

                                tmp = swp_entry(swp_type(swap),
                                                swp_offset(swap) + swap_offset +
                                                        i);
                                __xa_store(&mapping->i_pages, aligned_index + i,
                                           swp_to_radix_entry(tmp), 0);
                        }
                        cur_order = split_order;
                        split_order = xas_try_split_min_order(split_order);
                }

unlock:
                xas_unlock_irq(&xas);

                if (!xas_nomem(&xas, gfp))
                        break;
        }

        if (xas_error(&xas))
                return xas_error(&xas);

        return entry_order;
}

/*
 * Swap in the folio for the swap entry passed in *foliop.
 * Caller has to make sure that *foliop holds a swap entry, i.e. the
 * xarray value found in the mapping at @index.
 * Returns 0 and the locked folio in *foliop on success.  On failure,
 * returns the error code and NULL in *foliop; -EEXIST means the mapping
 * changed underneath us and the caller should look the index up again.
 */
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
                             struct folio **foliop, enum sgp_type sgp,
                             gfp_t gfp, struct vm_area_struct *vma,
                             vm_fault_t *fault_type)
{
        struct address_space *mapping = inode->i_mapping;
        struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct swap_info_struct *si;
        struct folio *folio = NULL;
        bool skip_swapcache = false;
        /* Set once we hold the folio lock, so error paths unlock only then */
        bool folio_locked = false;
        swp_entry_t swap;
        int error, nr_pages, order, split_order;

        VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
        swap = radix_to_swp_entry(*foliop);
        *foliop = NULL;

        if (is_poisoned_swp_entry(swap))
                return -EIO;

        si = get_swap_device(swap);
        if (!si) {
                if (!shmem_confirm_swap(mapping, index, swap))
                        return -EEXIST;
                else
                        return -EINVAL;
        }

        /* Look it up and read it in.. */
        folio = swap_cache_get_folio(swap, NULL, 0);
        order = xa_get_order(&mapping->i_pages, index);
        if (!folio) {
                bool fallback_order0 = false;

                /* Or update major stats only when swapin succeeds?? */
                if (fault_type) {
                        *fault_type |= VM_FAULT_MAJOR;
                        count_vm_event(PGMAJFAULT);
                        count_memcg_event_mm(fault_mm, PGMAJFAULT);
                }

                /*
                 * If uffd is active for the vma, we need per-page fault
                 * fidelity to maintain the uffd semantics, then fallback
                 * to swapin order-0 folio, as well as for zswap case.
                 */
                if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
                                  !zswap_never_enabled()))
                        fallback_order0 = true;

                /* Skip swapcache for synchronous device. */
                if (!fallback_order0 && data_race(si->flags & SWP_SYNCHRONOUS_IO)) {
                        folio = shmem_swap_alloc_folio(inode, vma, index, swap, order, gfp);
                        if (!IS_ERR(folio)) {
                                skip_swapcache = true;
                                goto alloced;
                        }

                        /*
                         * Fallback to swapin order-0 folio unless the swap entry
                         * already exists.
                         */
                        error = PTR_ERR(folio);
                        folio = NULL;
                        if (error == -EEXIST)
                                goto failed;
                }

                /*
                 * Now swap device can only swap in order 0 folio, then we
                 * should split the large swap entry stored in the pagecache
                 * if necessary.
                 */
                split_order = shmem_split_large_entry(inode, index, swap, gfp);
                if (split_order < 0) {
                        error = split_order;
                        goto failed;
                }

                /*
                 * If the large swap entry has already been split, it is
                 * necessary to recalculate the new swap entry based on
                 * the old order alignment.
                 */
                if (split_order > 0) {
                        pgoff_t offset = index - round_down(index, 1 << split_order);

                        swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
                }

                /* Here we actually start the io */
                folio = shmem_swapin_cluster(swap, gfp, info, index);
                if (!folio) {
                        error = -ENOMEM;
                        goto failed;
                }
        } else if (order != folio_order(folio)) {
                /*
                 * Swap readahead may swap in order 0 folios into swapcache
                 * asynchronously, while the shmem mapping can still store
                 * large swap entries. In such cases, we should split the
                 * large swap entry to prevent possible data corruption.
                 */
                split_order = shmem_split_large_entry(inode, index, swap, gfp);
                if (split_order < 0) {
                        /*
                         * We hold a reference on the swap cache folio here,
                         * but not its lock: the error path must only drop
                         * the reference.
                         */
                        error = split_order;
                        goto failed;
                }

                /*
                 * If the large swap entry has already been split, it is
                 * necessary to recalculate the new swap entry based on
                 * the old order alignment.
                 */
                if (split_order > 0) {
                        pgoff_t offset = index - round_down(index, 1 << split_order);

                        swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
                }
        }

alloced:
        /* We have to do this with folio locked to prevent races */
        folio_lock(folio);
        folio_locked = true;
        if ((!skip_swapcache && !folio_test_swapcache(folio)) ||
            folio->swap.val != swap.val ||
            !shmem_confirm_swap(mapping, index, swap) ||
            xa_get_order(&mapping->i_pages, index) != folio_order(folio)) {
                error = -EEXIST;
                goto unlock;
        }
        if (!folio_test_uptodate(folio)) {
                error = -EIO;
                goto failed;
        }
        folio_wait_writeback(folio);
        nr_pages = folio_nr_pages(folio);

        /*
         * Some architectures may have to restore extra metadata to the
         * folio after reading from swap.
         */
        arch_swap_restore(folio_swap(swap, folio), folio);

        if (shmem_should_replace_folio(folio, gfp)) {
                error = shmem_replace_folio(&folio, gfp, info, index, vma);
                if (error)
                        goto failed;
        }

        error = shmem_add_to_page_cache(folio, mapping,
                                        round_down(index, nr_pages),
                                        swp_to_radix_entry(swap), gfp);
        if (error)
                goto failed;

        shmem_recalc_inode(inode, 0, -nr_pages);

        if (sgp == SGP_WRITE)
                folio_mark_accessed(folio);

        if (skip_swapcache) {
                folio->swap.val = 0;
                swapcache_clear(si, swap, nr_pages);
        } else {
                delete_from_swap_cache(folio);
        }
        folio_mark_dirty(folio);
        swap_free_nr(swap, nr_pages);
        put_swap_device(si);

        *foliop = folio;
        return 0;
failed:
        /* If the mapping no longer holds this swap entry, ask for a retry */
        if (!shmem_confirm_swap(mapping, index, swap))
                error = -EEXIST;
        if (error == -EIO)
                shmem_set_folio_swapin_error(inode, index, folio, swap,
                                             skip_swapcache);
unlock:
        if (skip_swapcache)
                swapcache_clear(si, swap, folio_nr_pages(folio));
        if (folio) {
                /* Never unlock a folio whose lock we did not take */
                if (folio_locked)
                        folio_unlock(folio);
                folio_put(folio);
        }
        put_swap_device(si);

        return error;
}

/*
 * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
 *
 * If we allocate a new one we do not mark it dirty. That's up to the
 * vm. If we swap it in we mark it dirty since we also free the swap
 * entry since a page cannot live in both the swap and page cache.
 *
 * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
 *
 * Returns 0 on success, with the folio (or NULL for a hole under SGP_READ)
 * stored in *foliop; a folio found in the page cache is returned still
 * locked.  When the fault is handed to userfaultfd, 0 is returned and the
 * outcome is reported through *fault_type instead.  Otherwise returns a
 * negative errno: -EINVAL, -EFBIG, -ENOENT (hole under SGP_NOALLOC), or
 * the error reported by swapin or allocation.
 */
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
                loff_t write_end, struct folio **foliop, enum sgp_type sgp,
                gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
{
        struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
        struct mm_struct *fault_mm;
        struct folio *folio;
        int error;
        bool alloced;
        unsigned long orders = 0;

        if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
                return -EINVAL;

        if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
                return -EFBIG;
repeat:
        /* Modes up to SGP_CACHE refuse any index at or beyond i_size. */
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode))
                return -EINVAL;

        alloced = false;
        fault_mm = vma ? vma->vm_mm : NULL;

        /*
         * The entry may be a real folio, a value entry (handled as a swap
         * entry below), or NULL.
         */
        folio = filemap_get_entry(inode->i_mapping, index);
        if (folio && vma && userfaultfd_minor(vma)) {
                /* Only a real folio carries a reference to drop. */
                if (!xa_is_value(folio))
                        folio_put(folio);
                *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
                return 0;
        }

        if (xa_is_value(folio)) {
                error = shmem_swapin_folio(inode, index, &folio,
                                           sgp, gfp, vma, fault_type);
                /* -EEXIST: the entry changed under us, look it up again. */
                if (error == -EEXIST)
                        goto repeat;

                *foliop = folio;
                return error;
        }

        if (folio) {
                folio_lock(folio);

                /* Has the folio been truncated or swapped out? */
                if (unlikely(folio->mapping != inode->i_mapping)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto repeat;
                }
                if (sgp == SGP_WRITE)
                        folio_mark_accessed(folio);
                if (folio_test_uptodate(folio))
                        goto out;
                /* fallocated folio */
                if (sgp != SGP_READ)
                        goto clear;
                /* SGP_READ treats a not-uptodate folio like a hole. */
                folio_unlock(folio);
                folio_put(folio);
        }

        /*
         * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
         * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
         */
        *foliop = NULL;
        if (sgp == SGP_READ)
                return 0;
        if (sgp == SGP_NOALLOC)
                return -ENOENT;

        /*
         * Fast cache lookup and swap lookup did not find it: allocate.
         */

        if (vma && userfaultfd_missing(vma)) {
                *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
                return 0;
        }

        /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
        orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
        if (orders > 0) {
                gfp_t huge_gfp;

                huge_gfp = vma_thp_gfp_mask(vma);
                huge_gfp = limit_gfp_mask(huge_gfp, gfp);
                folio = shmem_alloc_and_add_folio(vmf, huge_gfp,
                                inode, index, fault_mm, orders);
                if (!IS_ERR(folio)) {
                        if (folio_test_pmd_mappable(folio))
                                count_vm_event(THP_FILE_ALLOC);
                        count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
                        goto alloced;
                }
                if (PTR_ERR(folio) == -EEXIST)
                        goto repeat;
                /* Any other large-folio failure falls back to order 0 below. */
        }

        folio = shmem_alloc_and_add_folio(vmf, gfp, inode, index, fault_mm, 0);
        if (IS_ERR(folio)) {
                error = PTR_ERR(folio);
                if (error == -EEXIST)
                        goto repeat;
                /* Nothing to unlock or put in the error path. */
                folio = NULL;
                goto unlock;
        }

alloced:
        /* From here on the error path must remove the folio it inserted. */
        alloced = true;
        if (folio_test_large(folio) &&
            DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
                                        folio_next_index(folio)) {
                struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
                struct shmem_inode_info *info = SHMEM_I(inode);
                /*
                 * Part of the large folio is beyond i_size: subject
                 * to shrink under memory pressure.
                 */
                spin_lock(&sbinfo->shrinklist_lock);
                /*
                 * _careful to defend against unlocked access to
                 * ->shrink_list in shmem_unused_huge_shrink()
                 */
                if (list_empty_careful(&info->shrinklist)) {
                        list_add_tail(&info->shrinklist,
                                      &sbinfo->shrinklist);
                        sbinfo->shrinklist_len++;
                }
                spin_unlock(&sbinfo->shrinklist_lock);
        }

        if (sgp == SGP_WRITE)
                folio_set_referenced(folio);
        /*
         * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
         */
        if (sgp == SGP_FALLOC)
                sgp = SGP_WRITE;
clear:
        /*
         * Let SGP_WRITE caller clear ends if write does not fill folio;
         * but SGP_FALLOC on a folio fallocated earlier must initialize
         * it now, lest undo on failure cancel our earlier guarantee.
         */
        if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
                long i, n = folio_nr_pages(folio);

                /* Zero every page of the folio before marking it uptodate. */
                for (i = 0; i < n; i++)
                        clear_highpage(folio_page(folio, i));
                flush_dcache_folio(folio);
                folio_mark_uptodate(folio);
        }

        /* Perhaps the file has been truncated since we checked */
        if (sgp <= SGP_CACHE &&
            ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
                error = -EINVAL;
                goto unlock;
        }
out:
        *foliop = folio;
        return 0;

        /*
         * Error recovery.
         */
unlock:
        /*
         * alloced is only true when this call inserted the folio itself;
         * folio is NULL when the allocation failed outright.
         */
        if (alloced)
                filemap_remove_folio(folio);
        shmem_recalc_inode(inode, 0, 0);
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
        return error;
}

/**
 * shmem_get_folio - find, and lock a shmem folio.
 * @inode:        inode to search
 * @index:        the page index.
 * @write_end:        end of a write, could extend inode size
 * @foliop:        pointer to the folio if found
 * @sgp:        SGP_* flags to control behavior
 *
 * Looks up the page cache entry at @inode & @index.  If a folio is
 * present, it is returned locked with an increased refcount.
 *
 * If the caller modifies data in the folio, it must call folio_mark_dirty()
 * before unlocking the folio to ensure that the folio is not reclaimed.
 * There is no need to reserve space before calling folio_mark_dirty().
 *
 * When no folio is found, the behavior depends on @sgp:
 *  - for SGP_READ, *@foliop is %NULL and 0 is returned
 *  - for SGP_NOALLOC, *@foliop is %NULL and -ENOENT is returned
 *  - for all other flags a new folio is allocated, inserted into the
 *    page cache and returned locked in @foliop.
 *
 * Context: May sleep.
 * Return: 0 if successful, else a negative error code.
 */
int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
                    struct folio **foliop, enum sgp_type sgp)
{
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);

        /* Not called from a fault: no vm_fault and no fault_type to fill in. */
        return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
                                   gfp, NULL, NULL);
}
EXPORT_SYMBOL_GPL(shmem_get_folio);

/*
 * This is like autoremove_wake_function, but it removes the wait queue
 * entry unconditionally - even if something else had already woken the
 * target.
 */
static int synchronous_wake_function(wait_queue_entry_t *wait,
                        unsigned int mode, int sync, void *key)
{
        const int woken = default_wake_function(wait, mode, sync, key);

        /* Dequeue regardless of what default_wake_function() reported. */
        list_del_init(&wait->entry);
        return woken;
}

/*
 * Trinity finds that probing a hole which tmpfs is punching can
 * prevent the hole-punch from ever completing: which in turn
 * locks writers out with its hold on i_rwsem.  So refrain from
 * faulting pages into the hole while it's being punched.  Although
 * shmem_undo_range() does remove the additions, it may be unable to
 * keep up, as each new page needs its own unmap_mapping_range() call,
 * and the i_mmap tree grows ever slower to scan if new vmas are added.
 *
 * It does not matter if we sometimes reach this check just before the
 * hole-punch begins, so that one fault then races with the punch:
 * we just need to make racing faults a rare case.
 *
 * The implementation below would be much simpler if we just used a
 * standard mutex or completion: but we cannot take i_rwsem in fault,
 * and bloating every shmem inode for this unlikely case would be sad.
 */
static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
{
        struct shmem_falloc *shmem_falloc;
        struct file *fpin = NULL;
        vm_fault_t ret = 0;

        /*
         * Returns 0 when the fault need not wait (caller carries on),
         * VM_FAULT_NOPAGE after waiting, or VM_FAULT_RETRY after waiting
         * when maybe_unlock_mmap_for_io() handed back a pinned file.
         */
        spin_lock(&inode->i_lock);
        shmem_falloc = inode->i_private;
        /* Wait only if a waitq is published and pgoff is in [start, next). */
        if (shmem_falloc &&
            shmem_falloc->waitq &&
            vmf->pgoff >= shmem_falloc->start &&
            vmf->pgoff < shmem_falloc->next) {
                wait_queue_head_t *shmem_falloc_waitq;
                DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

                ret = VM_FAULT_NOPAGE;
                fpin = maybe_unlock_mmap_for_io(vmf, NULL);
                /* Copy the pointer while still under i_lock. */
                shmem_falloc_waitq = shmem_falloc->waitq;
                prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
                                TASK_UNINTERRUPTIBLE);
                spin_unlock(&inode->i_lock);
                schedule();

                /*
                 * shmem_falloc_waitq points into the shmem_fallocate()
                 * stack of the hole-punching task: shmem_falloc_waitq
                 * is usually invalid by the time we reach here, but
                 * finish_wait() does not dereference it in that case;
                 * though i_lock needed lest racing with wake_up_all().
                 */
                spin_lock(&inode->i_lock);
                finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
        }
        spin_unlock(&inode->i_lock);
        if (fpin) {
                /* Drop the file reference and ask for the fault to be retried. */
                fput(fpin);
                ret = VM_FAULT_RETRY;
        }
        return ret;
}

static vm_fault_t shmem_fault(struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
        struct folio *folio = NULL;
        vm_fault_t ret = 0;
        int err;

        /*
         * Trinity finds that probing a hole which tmpfs is punching can
         * prevent the hole-punch from ever completing: noted in i_private.
         */
        if (unlikely(inode->i_private)) {
                ret = shmem_falloc_wait(vmf, inode);
                if (ret)
                        return ret;
        }

        WARN_ON_ONCE(vmf->page != NULL);
        err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
                                  gfp, vmf, &ret);
        if (err)
                return vmf_error(err);
        if (folio) {
                vmf->page = folio_file_page(folio, vmf->pgoff);
                ret |= VM_FAULT_LOCKED;
        }
        return ret;
}

/*
 * Pick an unmapped area for a shmem mapping, preferring an address whose
 * offset within a huge page matches that of the file offset.
 *
 * An address is first obtained from mm_get_unmapped_area().  That address
 * is returned as is whenever any of the checks below rules out the
 * realignment, or the realignment attempt fails; otherwise a larger area is
 * requested and an address inside it, congruent to the file offset modulo
 * hpage_size, is returned.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long uaddr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        unsigned long addr;
        unsigned long offset;
        unsigned long inflated_len;
        unsigned long inflated_addr;
        unsigned long inflated_offset;
        unsigned long hpage_size;

        if (len > TASK_SIZE)
                return -ENOMEM;

        addr = mm_get_unmapped_area(current->mm, file, uaddr, len, pgoff,
                                    flags);

        /* Every early return below keeps the address found above. */
        if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
                return addr;
        if (IS_ERR_VALUE(addr))
                return addr;
        if (addr & ~PAGE_MASK)
                return addr;
        if (addr > TASK_SIZE - len)
                return addr;

        if (shmem_huge == SHMEM_HUGE_DENY)
                return addr;
        if (flags & MAP_FIXED)
                return addr;
        /*
         * Our priority is to support MAP_SHARED mapped hugely;
         * and support MAP_PRIVATE mapped hugely too, until it is COWed.
         * But if caller specified an address hint and we allocated area there
         * successfully, respect that as before.
         */
        if (uaddr == addr)
                return addr;

        /* Default alignment target; may be lowered for anonymous shmem. */
        hpage_size = HPAGE_PMD_SIZE;
        if (shmem_huge != SHMEM_HUGE_FORCE) {
                struct super_block *sb;
                unsigned long __maybe_unused hpage_orders;
                int order = 0;

                if (file) {
                        VM_BUG_ON(file->f_op != &shmem_file_operations);
                        sb = file_inode(file)->i_sb;
                } else {
                        /*
                         * Called directly from mm/mmap.c, or drivers/char/mem.c
                         * for "/dev/zero", to create a shared anonymous object.
                         */
                        if (IS_ERR(shm_mnt))
                                return addr;
                        sb = shm_mnt->mnt_sb;

                        /*
                         * Find the highest mTHP order used for anonymous shmem to
                         * provide a suitable alignment address.
                         */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
                        hpage_orders = READ_ONCE(huge_shmem_orders_always);
                        hpage_orders |= READ_ONCE(huge_shmem_orders_within_size);
                        hpage_orders |= READ_ONCE(huge_shmem_orders_madvise);
                        if (SHMEM_SB(sb)->huge != SHMEM_HUGE_NEVER)
                                hpage_orders |= READ_ONCE(huge_shmem_orders_inherit);

                        if (hpage_orders > 0) {
                                order = highest_order(hpage_orders);
                                hpage_size = PAGE_SIZE << order;
                        }
#endif
                }
                /* huge=never and no mTHP order found: nothing to align for. */
                if (SHMEM_SB(sb)->huge == SHMEM_HUGE_NEVER && !order)
                        return addr;
        }

        if (len < hpage_size)
                return addr;

        /* Byte offset of the mapping's start within a huge page of the file. */
        offset = (pgoff << PAGE_SHIFT) & (hpage_size - 1);
        if (offset && offset + len < 2 * hpage_size)
                return addr;
        /* Already aligned as wanted. */
        if ((addr & (hpage_size - 1)) == offset)
                return addr;

        /* Ask for enough slack to slide the start by up to hpage_size - PAGE_SIZE. */
        inflated_len = len + hpage_size - PAGE_SIZE;
        if (inflated_len > TASK_SIZE)
                return addr;
        /* Wrapped around. */
        if (inflated_len < len)
                return addr;

        inflated_addr = mm_get_unmapped_area(current->mm, NULL, uaddr,
                                             inflated_len, 0, flags);
        if (IS_ERR_VALUE(inflated_addr))
                return addr;
        if (inflated_addr & ~PAGE_MASK)
                return addr;

        /*
         * Move to the first address at or above inflated_addr whose offset
         * within a huge page equals offset.
         */
        inflated_offset = inflated_addr & (hpage_size - 1);
        inflated_addr += offset - inflated_offset;
        if (inflated_offset > offset)
                inflated_addr += hpage_size;

        if (inflated_addr > TASK_SIZE - len)
                return addr;
        return inflated_addr;
}

#ifdef CONFIG_NUMA
static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
{
        struct inode *inode = file_inode(vma->vm_file);
        return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
}

static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                                          unsigned long addr, pgoff_t *ilx)
{
        struct inode *inode = file_inode(vma->vm_file);
        /* File page index corresponding to addr within this vma. */
        pgoff_t pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);

        /*
         * Only the inode-number bias is supplied here, to spread interleave
         * across nodes: this interface does not know the page order, so the
         * caller applies the order-adjusted offset itself (as is done in
         * shmem_get_pgoff_policy() and get_vma_policy()).
         */
        *ilx = inode->i_ino;

        return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, pgoff);
}

static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
                        pgoff_t index, unsigned int order, pgoff_t *ilx)
{
        struct mempolicy *policy;

        /* Interleave bias: inode number plus the order-scaled index. */
        *ilx = info->vfs_inode.i_ino + (index >> order);

        /* Fall back to the current task's policy when the inode has none. */
        policy = mpol_shared_policy_lookup(&info->policy, index);
        if (!policy)
                policy = get_task_policy(current);
        return policy;
}
#else
/* !CONFIG_NUMA stub: reports a zero interleave index and no mempolicy. */
static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
                        pgoff_t index, unsigned int order, pgoff_t *ilx)
{
        *ilx = 0;
        return NULL;
}
#endif /* CONFIG_NUMA */

int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
        struct inode *inode = file_inode(file);
        struct shmem_inode_info *info = SHMEM_I(inode);

        /*
         * What serializes the accesses to info->flags?
         * ipc_lock_object() when called from shmctl_do_lock(),
         * no serialization needed when called from shm_destroy().
         */
        if (lock) {
                /* Locking an already locked object is a no-op. */
                if (!(info->flags & VM_LOCKED)) {
                        if (!user_shm_lock(inode->i_size, ucounts))
                                return -ENOMEM;
                        info->flags |= VM_LOCKED;
                        mapping_set_unevictable(file->f_mapping);
                }
        } else if ((info->flags & VM_LOCKED) && ucounts) {
                /* Unlock only happens when ucounts is supplied. */
                user_shm_unlock(inode->i_size, ucounts);
                info->flags &= ~VM_LOCKED;
                mapping_clear_unevictable(file->f_mapping);
        }

        return 0;
}

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(file);

        file_accessed(file);

        /*
         * An inode with no links at mmap time is treated as anonymous
         * shared memory and gets the anon vm_ops.
         */
        vma->vm_ops = inode->i_nlink ? &shmem_vm_ops : &shmem_anon_vm_ops;
        return 0;
}

static int shmem_file_open(struct inode *inode, struct file *file)
{
        /* Flag the file as O_DIRECT-capable before the generic open checks. */
        file->f_mode = file->f_mode | FMODE_CAN_ODIRECT;

        return generic_file_open(inode, file);
}

#ifdef CONFIG_TMPFS_XATTR
static int shmem_initxattrs(struct inode *, const struct xattr *, void *);

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * shmem_inode_casefold_flags - Deal with casefold file attribute flag
 *
 * The casefold file attribute needs some special checks. It can only be added
 * to an empty dir, and can't be removed from a non-empty dir.
 */
static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
                                      struct dentry *dentry, unsigned int *i_flags)
{
        unsigned int old = inode->i_flags;
        struct super_block *sb = inode->i_sb;

        if (fsflags & FS_CASEFOLD_FL) {
                if (!(old & S_CASEFOLD)) {
                        if (!sb->s_encoding)
                                return -EOPNOTSUPP;

                        if (!S_ISDIR(inode->i_mode))
                                return -ENOTDIR;

                        if (dentry && !simple_empty(dentry))
                                return -ENOTEMPTY;
                }

                *i_flags = *i_flags | S_CASEFOLD;
        } else if (old & S_CASEFOLD) {
                if (dentry && !simple_empty(dentry))
                        return -ENOTEMPTY;
        }

        return 0;
}
#else
static int shmem_inode_casefold_flags(struct inode *inode, unsigned int fsflags,
                                      struct dentry *dentry, unsigned int *i_flags)
{
        /* Built without CONFIG_UNICODE: a casefold request cannot be honoured. */
        return (fsflags & FS_CASEFOLD_FL) ? -EOPNOTSUPP : 0;
}
#endif

/*
 * chattr's fsflags are unrelated to extended attributes,
 * but tmpfs has chosen to enable them under the same config option.
 */
static int shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
        const unsigned int mask = S_NOATIME | S_APPEND | S_IMMUTABLE | S_CASEFOLD;
        unsigned int new_flags = 0;
        int err;

        /* Casefold has its own preconditions and may veto the whole update. */
        err = shmem_inode_casefold_flags(inode, fsflags, dentry, &new_flags);
        if (err)
                return err;

        /*
         * Translate each remaining fsflag into its i_flags counterpart;
         * FS_NODUMP_FL has none and needs no action here.
         */
        new_flags |= (fsflags & FS_NOATIME_FL) ? S_NOATIME : 0;
        new_flags |= (fsflags & FS_APPEND_FL) ? S_APPEND : 0;
        new_flags |= (fsflags & FS_IMMUTABLE_FL) ? S_IMMUTABLE : 0;

        inode_set_flags(inode, new_flags, mask);

        return 0;
}
#else
/* CONFIG_TMPFS_XATTR disabled: fsflags are not translated into i_flags. */
static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags, struct dentry *dentry)
{
}
#define shmem_initxattrs NULL
#endif

static struct offset_ctx *shmem_get_offset_ctx(struct inode *inode)
{
        return &SHMEM_I(inode)->dir_offsets;
}

/*
 * Allocate and initialise a new shmem inode on @sb.
 *
 * Reserves an inode number, allocates the VFS inode, resets the shmem
 * private part, inherits fsflags from @dir (when given) and installs the
 * operations matching the file type in @mode.
 *
 * Returns the new inode, or an ERR_PTR(): the error from
 * shmem_reserve_inode(), or -ENOSPC when new_inode() fails.
 */
static struct inode *__shmem_get_inode(struct mnt_idmap *idmap,
                                             struct super_block *sb,
                                             struct inode *dir, umode_t mode,
                                             dev_t dev, unsigned long flags)
{
        struct inode *inode;
        struct shmem_inode_info *info;
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
        ino_t ino;
        int err;

        err = shmem_reserve_inode(sb, &ino);
        if (err)
                return ERR_PTR(err);

        inode = new_inode(sb);
        if (!inode) {
                /* Presumably gives back the reservation made above. */
                shmem_free_inode(sb, 0);
                return ERR_PTR(-ENOSPC);
        }

        inode->i_ino = ino;
        inode_init_owner(idmap, inode, dir, mode);
        inode->i_blocks = 0;
        simple_inode_init_ts(inode);
        inode->i_generation = get_random_u32();
        info = SHMEM_I(inode);
        /* Zero the shmem_inode_info bytes that precede the embedded inode. */
        memset(info, 0, (char *)inode - (char *)info);
        spin_lock_init(&info->lock);
        atomic_set(&info->stop_eviction, 0);
        info->seals = F_SEAL_SEAL;
        /* Of the caller's flags only VM_NORESERVE is kept. */
        info->flags = flags & VM_NORESERVE;
        info->i_crtime = inode_get_mtime(inode);
        /* Inherit the inheritable fsflags from the parent directory, if any. */
        info->fsflags = (dir == NULL) ? 0 :
                SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
        if (info->fsflags)
                shmem_set_inode_flags(inode, info->fsflags, NULL);
        INIT_LIST_HEAD(&info->shrinklist);
        INIT_LIST_HEAD(&info->swaplist);
        simple_xattrs_init(&info->xattrs);
        cache_no_acl(inode);
        if (sbinfo->noswap)
                mapping_set_unevictable(inode->i_mapping);

        /* Don't consider 'deny' for emergencies and 'force' for testing */
        if (sbinfo->huge)
                mapping_set_large_folios(inode->i_mapping);

        /* Install the operations that match the file type. */
        switch (mode & S_IFMT) {
        default:
                inode->i_op = &shmem_special_inode_operations;
                init_special_inode(inode, mode, dev);
                break;
        case S_IFREG:
                inode->i_mapping->a_ops = &shmem_aops;
                inode->i_op = &shmem_inode_operations;
                inode->i_fop = &shmem_file_operations;
                mpol_shared_policy_init(&info->policy,
                                         shmem_get_sbmpol(sbinfo));
                break;
        case S_IFDIR:
                inc_nlink(inode);
                /* Some things misbehave if size == 0 on a directory */
                inode->i_size = 2 * BOGO_DIRENT_SIZE;
                inode->i_op = &shmem_dir_inode_operations;
                inode->i_fop = &simple_offset_dir_operations;
                simple_offset_init(shmem_get_offset_ctx(inode));
                break;
        case S_IFLNK:
                /*
                 * Must not load anything in the rbtree,
                 * mpol_free_shared_policy will not be called.
                 */
                mpol_shared_policy_init(&info->policy, NULL);
                break;
        }

        lockdep_annotate_inode_mutex_key(inode);
        return inode;
}

#ifdef CONFIG_TMPFS_QUOTA
static struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                     struct super_block *sb, struct inode *dir,
                                     umode_t mode, dev_t dev, unsigned long flags)
{
        struct inode *inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
        int error;

        if (IS_ERR(inode))
                return inode;

        /* Attach quota state, then charge the new inode against it. */
        error = dquot_initialize(inode);
        if (!error) {
                error = dquot_alloc_inode(inode);
                if (!error)
                        return inode;
                /* Charging failed: release what dquot_initialize() set up. */
                dquot_drop(inode);
        }

        /* Mark the inode S_NOQUOTA before dropping it. */
        inode->i_flags |= S_NOQUOTA;
        iput(inode);
        return ERR_PTR(error);
}
#else
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                     struct super_block *sb, struct inode *dir,
                                     umode_t mode, dev_t dev, unsigned long flags)
{
        struct inode *inode;

        /* Without CONFIG_TMPFS_QUOTA there is nothing to add to the bare inode. */
        inode = __shmem_get_inode(idmap, sb, dir, mode, dev, flags);
        return inode;
}
#endif /* CONFIG_TMPFS_QUOTA */

#ifdef CONFIG_USERFAULTFD
/*
 * Fill one page of a shmem mapping on behalf of userfaultfd and map it at
 * @dst_addr.
 *
 * With *@foliop NULL a fresh order-0 folio is allocated and either copied
 * from @src_addr (MFILL_ATOMIC_COPY) or zeroed.  If the copy cannot complete
 * with page faults disabled, the folio is handed back through *@foliop and
 * -ENOENT is returned so the caller can retry; on that retry the folio is
 * passed in again via *@foliop and used as is.
 *
 * Returns 0 on success.  Otherwise returns -ENOMEM (block accounting or
 * folio allocation failed), -ENOENT (retry requested), -EFAULT (page offset
 * at or beyond i_size), or the error from charging, page cache insertion or
 * pte installation.
 */
int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
                           struct vm_area_struct *dst_vma,
                           unsigned long dst_addr,
                           unsigned long src_addr,
                           uffd_flags_t flags,
                           struct folio **foliop)
{
        struct inode *inode = file_inode(dst_vma->vm_file);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct address_space *mapping = inode->i_mapping;
        gfp_t gfp = mapping_gfp_mask(mapping);
        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
        void *page_kaddr;
        struct folio *folio;
        int ret;
        pgoff_t max_off;

        /* Account one block up front; undone at out_unacct_blocks on failure. */
        if (shmem_inode_acct_blocks(inode, 1)) {
                /*
                 * We may have got a page, returned -ENOENT triggering a retry,
                 * and now we find ourselves with -ENOMEM. Release the page, to
                 * avoid a BUG_ON in our caller.
                 */
                if (unlikely(*foliop)) {
                        folio_put(*foliop);
                        *foliop = NULL;
                }
                return -ENOMEM;
        }

        if (!*foliop) {
                ret = -ENOMEM;
                folio = shmem_alloc_folio(gfp, 0, info, pgoff);
                if (!folio)
                        goto out_unacct_blocks;

                if (uffd_flags_mode_is(flags, MFILL_ATOMIC_COPY)) {
                        page_kaddr = kmap_local_folio(folio, 0);
                        /*
                         * The read mmap_lock is held here.  Despite the
                         * mmap_lock being read recursive a deadlock is still
                         * possible if a writer has taken a lock.  For example:
                         *
                         * process A thread 1 takes read lock on own mmap_lock
                         * process A thread 2 calls mmap, blocks taking write lock
                         * process B thread 1 takes page fault, read lock on own mmap lock
                         * process B thread 2 calls mmap, blocks taking write lock
                         * process A thread 1 blocks taking read lock on process B
                         * process B thread 1 blocks taking read lock on process A
                         *
                         * Disable page faults to prevent potential deadlock
                         * and retry the copy outside the mmap_lock.
                         */
                        pagefault_disable();
                        ret = copy_from_user(page_kaddr,
                                             (const void __user *)src_addr,
                                             PAGE_SIZE);
                        pagefault_enable();
                        kunmap_local(page_kaddr);

                        /* fallback to copy_from_user outside mmap_lock */
                        if (unlikely(ret)) {
                                *foliop = folio;
                                ret = -ENOENT;
                                /* don't free the page */
                                goto out_unacct_blocks;
                        }

                        flush_dcache_folio(folio);
                } else {                /* ZEROPAGE */
                        clear_user_highpage(&folio->page, dst_addr);
                }
        } else {
                /* Retry: take over the folio the caller kept from last time. */
                folio = *foliop;
                VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
                *foliop = NULL;
        }

        /* The folio is not in the page cache yet: non-atomic flag setters. */
        VM_BUG_ON(folio_test_locked(folio));
        VM_BUG_ON(folio_test_swapbacked(folio));
        __folio_set_locked(folio);
        __folio_set_swapbacked(folio);
        __folio_mark_uptodate(folio);

        /* Refuse to populate a page at or beyond the current end of file. */
        ret = -EFAULT;
        max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(pgoff >= max_off))
                goto out_release;

        ret = mem_cgroup_charge(folio, dst_vma->vm_mm, gfp);
        if (ret)
                goto out_release;
        ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp);
        if (ret)
                goto out_release;

        ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
                                       &folio->page, true, flags);
        if (ret)
                goto out_delete_from_cache;

        shmem_recalc_inode(inode, 1, 0);
        folio_unlock(folio);
        return 0;
        /* Unwind in reverse order of setup; each label falls into the next. */
out_delete_from_cache:
        filemap_remove_folio(folio);
out_release:
        folio_unlock(folio);
        folio_put(folio);
out_unacct_blocks:
        shmem_inode_unacct_blocks(inode, 1);
        return ret;
}
#endif /* CONFIG_USERFAULTFD */

#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;

static int
shmem_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct folio **foliop, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct shmem_inode_info *info = SHMEM_I(inode);
        const loff_t write_end = pos + len;
        struct folio *folio;
        int error;

        /*
         * i_rwsem is held by caller.  Write seals forbid any write; the
         * grow seal only forbids a write that would end beyond i_size.
         */
        if (unlikely(info->seals & (F_SEAL_GROW |
                                   F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
                if ((info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) ||
                    ((info->seals & F_SEAL_GROW) && write_end > inode->i_size))
                        return -EPERM;
        }

        error = shmem_get_folio(inode, pos >> PAGE_SHIFT, write_end, &folio,
                                SGP_WRITE);
        if (error)
                return error;

        /* Refuse to write into a folio containing a hwpoisoned page. */
        if (folio_contain_hwpoisoned_page(folio)) {
                folio_unlock(folio);
                folio_put(folio);
                return -EIO;
        }

        *foliop = folio;
        return 0;
}

static int
shmem_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio, void *fsdata)
{
        struct inode *inode = mapping->host;
        const loff_t copied_end = pos + copied;

        /* Extend i_size when the copied data reaches past it. */
        if (copied_end > inode->i_size)
                i_size_write(inode, copied_end);

        if (!folio_test_uptodate(folio)) {
                /*
                 * A partial write into a not-yet-uptodate folio: zero the
                 * parts before and after the copied range first.
                 */
                if (copied < folio_size(folio)) {
                        size_t start = offset_in_folio(folio, pos);

                        folio_zero_segments(folio, 0, start,
                                        start + copied, folio_size(folio));
                }
                folio_mark_uptodate(folio);
        }

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);

        return copied;
}

/*
 * Read from a shmem file into the iterator @to, starting at iocb->ki_pos.
 *
 * Each loop iteration looks up the folio at the current position with
 * SGP_READ and copies as much of it as lies below i_size and fits in @to,
 * then advances iocb->ki_pos.  When the lookup succeeds but yields no folio,
 * zeroes are supplied instead.  The loop ends at i_size, when @to has no
 * room left, or on the first error.
 *
 * Returns the number of bytes read if any were read; otherwise 0 or a
 * negative errno (-EIO for a hwpoisoned page, -EFAULT for a short copy, or
 * the error from shmem_get_folio()).
 */
static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        pgoff_t index;
        unsigned long offset;
        int error = 0;
        ssize_t retval = 0;

        for (;;) {
                struct folio *folio = NULL;
                struct page *page = NULL;
                unsigned long nr, ret;
                loff_t end_offset, i_size = i_size_read(inode);
                bool fallback_page_copy = false;
                size_t fsize;

                if (unlikely(iocb->ki_pos >= i_size))
                        break;

                index = iocb->ki_pos >> PAGE_SHIFT;
                error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
                if (error) {
                        /* -EINVAL just ends the read; it is not reported */
                        if (error == -EINVAL)
                                error = 0;
                        break;
                }
                if (folio) {
                        folio_unlock(folio);

                        /* The page we want to read must itself be healthy */
                        page = folio_file_page(folio, index);
                        if (PageHWPoison(page)) {
                                folio_put(folio);
                                error = -EIO;
                                break;
                        }

                        /*
                         * A large folio with a poisoned page elsewhere in it:
                         * copy a single page at a time instead of the folio.
                         */
                        if (folio_test_large(folio) &&
                            folio_test_has_hwpoisoned(folio))
                                fallback_page_copy = true;
                }

                /*
                 * We must evaluate after, since reads (unlike writes)
                 * are called without i_rwsem protection against truncate
                 */
                i_size = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= i_size)) {
                        if (folio)
                                folio_put(folio);
                        break;
                }
                end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count);
                /* Copy granule: the whole folio, or one page (fallback / hole) */
                if (folio && likely(!fallback_page_copy))
                        fsize = folio_size(folio);
                else
                        fsize = PAGE_SIZE;
                /* fsize - 1 is used as a mask: assumes fsize is a power of two */
                offset = iocb->ki_pos & (fsize - 1);
                nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset);

                if (folio) {
                        /*
                         * If users can be writing to this page using arbitrary
                         * virtual addresses, take care about potential aliasing
                         * before reading the page on the kernel side.
                         */
                        if (mapping_writably_mapped(mapping)) {
                                if (likely(!fallback_page_copy))
                                        flush_dcache_folio(folio);
                                else
                                        flush_dcache_page(page);
                        }

                        /*
                         * Mark the folio accessed if we read the beginning.
                         */
                        if (!offset)
                                folio_mark_accessed(folio);
                        /*
                         * Ok, we have the page, and it's up-to-date, so
                         * now we can copy it to user space...
                         */
                        if (likely(!fallback_page_copy))
                                ret = copy_folio_to_iter(folio, offset, nr, to);
                        else
                                ret = copy_page_to_iter(page, offset, nr, to);
                        folio_put(folio);
                } else if (user_backed_iter(to)) {
                        /*
                         * Copy to user tends to be so well optimized, but
                         * clear_user() not so much, that it is noticeably
                         * faster to copy the zero page instead of clearing.
                         */
                        ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
                } else {
                        /*
                         * But submitting the same page twice in a row to
                         * splice() - or others? - can result in confusion:
                         * so don't attempt that optimization on pipes etc.
                         */
                        ret = iov_iter_zero(nr, to);
                }

                retval += ret;
                iocb->ki_pos += ret;

                if (!iov_iter_count(to))
                        break;
                /* Short copy although @to still has room: report a fault */
                if (ret < nr) {
                        error = -EFAULT;
                        break;
                }
                cond_resched();
        }

        file_accessed(file);
        /* Partial progress takes precedence over the error */
        return retval ? retval : error;
}

/*
 * Buffered write to a shmem file.
 *
 * Under inode_lock(): runs generic_write_checks(), strips privileges with
 * file_remove_privs(), updates the timestamps with file_update_time() and
 * then performs the write with generic_perform_write().  The first step that
 * fails (or a write-check result of 0) ends the sequence.
 *
 * Returns the result of the last step executed.
 */
static ssize_t shmem_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
        ssize_t ret;

        inode_lock(inode);

        ret = generic_write_checks(iocb, from);
        if (ret > 0) {
                ret = file_remove_privs(file);
                if (!ret)
                        ret = file_update_time(file);
                if (!ret)
                        ret = generic_perform_write(iocb, from);
        }

        inode_unlock(inode);
        return ret;
}

/*
 * ->get for zero_pipe_buf_ops: does nothing and always reports success.
 * These ops are attached to pipe buffers that point at ZERO_PAGE(0), see
 * splice_zeropage_into_pipe().
 */
static bool zero_pipe_buf_get(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        return true;
}

/*
 * ->release for zero_pipe_buf_ops: intentionally empty, nothing is dropped
 * when a zero-page pipe buffer is released.
 */
static void zero_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
}

/*
 * ->try_steal for zero_pipe_buf_ops: always refuses, so the page behind a
 * zero-page pipe buffer is never handed over to the consumer.
 */
static bool zero_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        return false;
}

/*
 * Pipe buffer operations used by splice_zeropage_into_pipe() for buffers
 * that reference ZERO_PAGE(0): get and release are no-ops, stealing is
 * refused.
 */
static const struct pipe_buf_operations zero_pipe_buf_ops = {
        .release        = zero_pipe_buf_release,
        .try_steal        = zero_pipe_buf_try_steal,
        .get                = zero_pipe_buf_get,
};

/*
 * Add one pipe buffer of zeroes for file position @fpos.
 *
 * @size is clamped so the buffer does not run past the end of the page that
 * contains @fpos.  If the pipe has room, its head buffer is set up to point
 * at ZERO_PAGE(0) with zero_pipe_buf_ops and the head is advanced.
 *
 * Returns the clamped size, whether or not a buffer was actually added.
 */
static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe,
                                        loff_t fpos, size_t size)
{
        size_t offset = fpos & ~PAGE_MASK;
        struct pipe_buffer *buf;

        size = min_t(size_t, size, PAGE_SIZE - offset);

        if (pipe_is_full(pipe))
                return size;

        buf = pipe_head_buf(pipe);
        *buf = (struct pipe_buffer) {
                .ops        = &zero_pipe_buf_ops,
                .page        = ZERO_PAGE(0),
                .offset        = offset,
                .len        = size,
        };
        pipe->head++;

        return size;
}

/*
 * Splice up to @len bytes from shmem file @in at *@ppos into @pipe.
 *
 * @len is first limited to the free space in the pipe.  Each iteration looks
 * up the folio at *@ppos with SGP_READ and splices it (or, when there is no
 * folio, a zero page) into the pipe, advancing *@ppos.  The loop ends at
 * i_size, when the pipe is full, when nothing more could be spliced, or on
 * error.
 *
 * Returns the number of bytes spliced if any were; otherwise 0 or a negative
 * errno (-EIO for a hwpoisoned page, or the error from shmem_get_folio()).
 */
static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
                                      struct pipe_inode_info *pipe,
                                      size_t len, unsigned int flags)
{
        struct inode *inode = file_inode(in);
        struct address_space *mapping = inode->i_mapping;
        struct folio *folio = NULL;
        size_t total_spliced = 0, used, npages, n, part;
        loff_t isize;
        int error = 0;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_buf_usage(pipe);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);

        do {
                bool fallback_page_splice = false;
                struct page *page = NULL;
                pgoff_t index;
                size_t size;

                if (*ppos >= i_size_read(inode))
                        break;

                index = *ppos >> PAGE_SHIFT;
                error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
                if (error) {
                        /* -EINVAL just ends the splice; it is not reported */
                        if (error == -EINVAL)
                                error = 0;
                        break;
                }
                if (folio) {
                        folio_unlock(folio);

                        page = folio_file_page(folio, index);
                        if (PageHWPoison(page)) {
                                /* folio reference is dropped after the loop */
                                error = -EIO;
                                break;
                        }

                        if (folio_test_large(folio) &&
                            folio_test_has_hwpoisoned(folio))
                                fallback_page_splice = true;
                }

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                /* A folio still held here is released after the loop */
                if (unlikely(*ppos >= isize))
                        break;
                /*
                 * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned
                 * pages.
                 */
                size = len;
                if (unlikely(fallback_page_splice)) {
                        size_t offset = *ppos & ~PAGE_MASK;

                        size = umin(size, PAGE_SIZE - offset);
                }
                part = min_t(loff_t, isize - *ppos, size);

                if (folio) {
                        /*
                         * If users can be writing to this page using arbitrary
                         * virtual addresses, take care about potential aliasing
                         * before reading the page on the kernel side.
                         */
                        if (mapping_writably_mapped(mapping)) {
                                if (likely(!fallback_page_splice))
                                        flush_dcache_folio(folio);
                                else
                                        flush_dcache_page(page);
                        }
                        folio_mark_accessed(folio);
                        /*
                         * Ok, we have the page, and it's up-to-date, so we can
                         * now splice it into the pipe.
                         */
                        n = splice_folio_into_pipe(pipe, folio, *ppos, part);
                        folio_put(folio);
                        /* Cleared so the folio_put() after the loop is skipped */
                        folio = NULL;
                } else {
                        n = splice_zeropage_into_pipe(pipe, *ppos, part);
                }

                if (!n)
                        break;
                len -= n;
                total_spliced += n;
                *ppos += n;
                in->f_ra.prev_pos = *ppos;
                if (pipe_is_full(pipe))
                        break;

                cond_resched();
        } while (len);

        /* Drop the reference left over from a break with the folio still held */
        if (folio)
                folio_put(folio);

        file_accessed(in);
        /* Partial progress takes precedence over the error */
        return total_spliced ? total_spliced : error;
}

/*
 * llseek for shmem files.
 *
 * Every @whence other than SEEK_DATA and SEEK_HOLE is forwarded to
 * generic_file_llseek_size().  For SEEK_DATA/SEEK_HOLE a negative @offset
 * yields -ENXIO; otherwise the mapping is searched under inode_lock() with
 * mapping_seek_hole_data() and, if that succeeds, the file position is set
 * with vfs_setpos().
 *
 * Returns the new file position or a negative errno.
 */
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        loff_t pos;

        switch (whence) {
        case SEEK_DATA:
        case SEEK_HOLE:
                break;
        default:
                return generic_file_llseek_size(file, offset, whence,
                                        MAX_LFS_FILESIZE, i_size_read(inode));
        }

        if (offset < 0)
                return -ENXIO;

        inode_lock(inode);
        /* We're holding i_rwsem so we can access i_size directly */
        pos = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
        if (pos >= 0)
                pos = vfs_setpos(file, pos, MAX_LFS_FILESIZE);
        inode_unlock(inode);

        return pos;
}

/*
 * fallocate() for shmem files.  Only FALLOC_FL_KEEP_SIZE and
 * FALLOC_FL_PUNCH_HOLE are accepted; any other mode bit yields -EOPNOTSUPP.
 *
 * Hole punching: refused with -EPERM on write-sealed inodes.  Otherwise the
 * range is unmapped and truncated while an on-stack struct shmem_falloc is
 * published through inode->i_private (set and cleared under i_lock).
 *
 * Preallocation: after the size and F_SEAL_GROW checks, folios are
 * instantiated with SGP_FALLOC over [offset, offset + len).  If that fails
 * part-way, the folios added so far are removed again with
 * shmem_undo_range().  On success i_size is extended unless
 * FALLOC_FL_KEEP_SIZE was given.
 *
 * The whole operation runs under inode_lock().  Returns 0 or a negative
 * errno; file_modified() is called only on success.
 */
static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                                                         loff_t len)
{
        struct inode *inode = file_inode(file);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_falloc shmem_falloc;
        pgoff_t start, index, end, undo_fallocend;
        int error;

        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                return -EOPNOTSUPP;

        inode_lock(inode);

        if (mode & FALLOC_FL_PUNCH_HOLE) {
                struct address_space *mapping = file->f_mapping;
                /*
                 * Only whole pages inside the range are unmapped, hence the
                 * inward rounding; shmem_truncate_range() below still gets
                 * the exact byte range.
                 */
                loff_t unmap_start = round_up(offset, PAGE_SIZE);
                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
                DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);

                /* protected by i_rwsem */
                if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
                        error = -EPERM;
                        goto out;
                }

                /* Publish the range being punched, with a waitqueue attached */
                shmem_falloc.waitq = &shmem_falloc_waitq;
                shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
                shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
                spin_lock(&inode->i_lock);
                inode->i_private = &shmem_falloc;
                spin_unlock(&inode->i_lock);

                if ((u64)unmap_end > (u64)unmap_start)
                        unmap_mapping_range(mapping, unmap_start,
                                            1 + unmap_end - unmap_start, 0);
                shmem_truncate_range(inode, offset, offset + len - 1);
                /* No need to unmap again: hole-punching leaves COWed pages */

                /* Unpublish and wake everyone queued on the on-stack waitqueue */
                spin_lock(&inode->i_lock);
                inode->i_private = NULL;
                wake_up_all(&shmem_falloc_waitq);
                WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.head));
                spin_unlock(&inode->i_lock);
                error = 0;
                goto out;
        }

        /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
        error = inode_newsize_ok(inode, offset + len);
        if (error)
                goto out;

        if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) {
                error = -EPERM;
                goto out;
        }

        /* Page index range [start, end) covering the byte range */
        start = offset >> PAGE_SHIFT;
        end = (offset + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
        /* Try to avoid a swapstorm if len is impossible to satisfy */
        if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
                error = -ENOSPC;
                goto out;
        }

        /* Publish preallocation progress; no waitqueue in this mode */
        shmem_falloc.waitq = NULL;
        shmem_falloc.start = start;
        shmem_falloc.next  = start;
        shmem_falloc.nr_falloced = 0;
        shmem_falloc.nr_unswapped = 0;
        spin_lock(&inode->i_lock);
        inode->i_private = &shmem_falloc;
        spin_unlock(&inode->i_lock);

        /*
         * info->fallocend is only relevant when huge pages might be
         * involved: to prevent split_huge_page() freeing fallocated
         * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
         */
        undo_fallocend = info->fallocend;
        if (info->fallocend < end)
                info->fallocend = end;

        for (index = start; index < end; ) {
                struct folio *folio;

                /*
                 * Check for fatal signal so that we abort early in OOM
                 * situations. We don't want to abort in case of non-fatal
                 * signals as large fallocate can take noticeable time and
                 * e.g. periodic timers may result in fallocate constantly
                 * restarting.
                 */
                if (fatal_signal_pending(current))
                        error = -EINTR;
                else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
                        error = -ENOMEM;
                else
                        error = shmem_get_folio(inode, index, offset + len,
                                                &folio, SGP_FALLOC);
                if (error) {
                        /* Roll back: restore fallocend and drop what we added */
                        info->fallocend = undo_fallocend;
                        /* Remove the !uptodate folios we added */
                        if (index > start) {
                                shmem_undo_range(inode,
                                    (loff_t)start << PAGE_SHIFT,
                                    ((loff_t)index << PAGE_SHIFT) - 1, true);
                        }
                        goto undone;
                }

                /*
                 * Here is a more important optimization than it appears:
                 * a second SGP_FALLOC on the same large folio will clear it,
                 * making it uptodate and un-undoable if we fail later.
                 */
                index = folio_next_index(folio);
                /* Beware 32-bit wraparound */
                if (!index)
                        index--;

                /*
                 * Inform shmem_writepage() how far we have reached.
                 * No need for lock or barrier: we have the page lock.
                 */
                if (!folio_test_uptodate(folio))
                        shmem_falloc.nr_falloced += index - shmem_falloc.next;
                shmem_falloc.next = index;

                /*
                 * If !uptodate, leave it that way so that freeable folios
                 * can be recognized if we need to rollback on error later.
                 * But mark it dirty so that memory pressure will swap rather
                 * than free the folios we are allocating (and SGP_CACHE folios
                 * might still be clean: we now need to mark those dirty too).
                 */
                folio_mark_dirty(folio);
                folio_unlock(folio);
                folio_put(folio);
                cond_resched();
        }

        if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
                i_size_write(inode, offset + len);
undone:
        /* Unpublish the on-stack shmem_falloc before it goes out of scope */
        spin_lock(&inode->i_lock);
        inode->i_private = NULL;
        spin_unlock(&inode->i_lock);
out:
        if (!error)
                file_modified(file);
        inode_unlock(inode);
        return error;
}

static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);

        buf->f_type = TMPFS_MAGIC;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        if (sbinfo->max_blocks) {
                buf->f_blocks = sbinfo->max_blocks;
                buf->f_bavail =
                buf->f_bfree  = sbinfo->max_blocks -
                                percpu_counter_sum(&sbinfo->used_blocks);
        }
        if (sbinfo->max_inodes) {
                buf->f_files = sbinfo->max_inodes;
                buf->f_ffree = sbinfo->free_ispace / BOGO_INODE_SIZE;
        }
        /* else leave those fields 0 like simple_statfs */

        buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);

        return 0;
}

/*
 * File creation: allocate a new inode of type @mode (device number @dev)
 * and attach it to @dentry inside @dir.
 *
 * Returns 0 on success; -EINVAL if generic_ci_validate_strict_name() rejects
 * the name; otherwise the error from inode allocation, ACL setup, security
 * initialisation (except -EOPNOTSUPP, which is tolerated) or directory
 * offset allocation.  On any failure after allocation the inode is released
 * with iput().
 */
static int
shmem_mknod(struct mnt_idmap *idmap, struct inode *dir,
            struct dentry *dentry, umode_t mode, dev_t dev)
{
        struct inode *inode;
        int error;

        if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
                return -EINVAL;

        inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        error = simple_acl_create(dir, inode);
        if (!error) {
                error = security_inode_init_security(inode, dir,
                                                     &dentry->d_name,
                                                     shmem_initxattrs, NULL);
                /* -EOPNOTSUPP from the security hook is not a failure here */
                if (error == -EOPNOTSUPP)
                        error = 0;
        }
        if (!error)
                error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
        if (error) {
                iput(inode);
                return error;
        }

        /* Account for the new entry in the parent directory */
        dir->i_size += BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_inc_iversion(dir);

        if (!IS_ENABLED(CONFIG_UNICODE) || !IS_CASEFOLDED(dir))
                d_instantiate(dentry, inode);
        else
                d_add(dentry, inode);

        dget(dentry); /* Extra count - pin the dentry in core */
        return 0;
}

/*
 * Create an unnamed temporary file in @dir and attach it to @file.
 *
 * A failed inode allocation is reported through finish_open_simple().  A
 * failure of security initialisation (other than -EOPNOTSUPP) or of ACL
 * setup releases the inode and returns the error directly.  On success the
 * inode is bound with d_tmpfile() and the open is completed with
 * finish_open_simple().
 */
static int
shmem_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
              struct file *file, umode_t mode)
{
        struct inode *inode;
        int error;

        inode = shmem_get_inode(idmap, dir->i_sb, dir, mode, 0, VM_NORESERVE);
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                return finish_open_simple(file, error);
        }

        error = security_inode_init_security(inode, dir, NULL,
                                             shmem_initxattrs, NULL);
        /* -EOPNOTSUPP from the security hook is tolerated */
        if (!error || error == -EOPNOTSUPP)
                error = simple_acl_create(dir, inode);
        if (error) {
                iput(inode);
                return error;
        }

        d_tmpfile(file, inode);
        return finish_open_simple(file, error);
}

/*
 * Create directory @dentry in @dir via shmem_mknod() with S_IFDIR set, then
 * bump the parent's link count.
 *
 * Returns NULL on success or an ERR_PTR() carrying shmem_mknod()'s error.
 */
static struct dentry *shmem_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                                  struct dentry *dentry, umode_t mode)
{
        int error = shmem_mknod(idmap, dir, dentry, mode | S_IFDIR, 0);

        if (!error) {
                inc_nlink(dir);
                return NULL;
        }
        return ERR_PTR(error);
}

/*
 * Create a regular file: forwards to shmem_mknod() with S_IFREG added to
 * @mode and a device number of 0.  @excl is not used here.
 */
static int shmem_create(struct mnt_idmap *idmap, struct inode *dir,
                        struct dentry *dentry, umode_t mode, bool excl)
{
        return shmem_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
}

/*
 * Link a file: make @dentry in @dir a new hard link to the inode behind
 * @old_dentry.
 *
 * Returns 0 on success, or the error from shmem_reserve_inode() or
 * simple_offset_add().
 */
static int shmem_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
{
        struct inode *inode = d_inode(old_dentry);
        int ret;

        /*
         * No ordinary (disk based) filesystem counts links as inodes;
         * but each new link needs a new dentry, pinning lowmem, and
         * tmpfs dentries cannot be pruned until they are unlinked.
         * But if an O_TMPFILE file is linked into the tmpfs, the
         * first link must skip that, to get the accounting right.
         */
        if (inode->i_nlink) {
                ret = shmem_reserve_inode(inode->i_sb, NULL);
                if (ret)
                        return ret;
        }

        ret = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
        if (ret) {
                /* Give back the reservation taken above, if there was one */
                if (inode->i_nlink)
                        shmem_free_inode(inode->i_sb, 0);
                return ret;
        }

        /* Account for the new entry in the parent directory */
        dir->i_size += BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);

        inc_nlink(inode);
        ihold(inode);        /* New dentry reference */
        dget(dentry);        /* Extra pinning count for the created dentry */

        if (!IS_ENABLED(CONFIG_UNICODE) || !IS_CASEFOLDED(dir))
                d_instantiate(dentry, inode);
        else
                d_add(dentry, inode);

        return 0;
}

/*
 * Remove the entry @dentry from @dir and drop one link from its inode.
 * Always returns 0.
 */
static int shmem_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        /*
         * A non-directory with more than one link gives one inode
         * reservation back (shmem_link() takes one for each additional link).
         */
        if (!S_ISDIR(inode->i_mode) && inode->i_nlink > 1)
                shmem_free_inode(inode->i_sb, 0);

        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);

        /* The parent directory loses one entry */
        dir->i_size -= BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inode_inc_iversion(dir);

        drop_nlink(inode);
        dput(dentry);        /* Undo the count from "create" - does all the work */

        /*
         * For now, VFS can't deal with case-insensitive negative dentries, so
         * we invalidate them
         */
        if (IS_ENABLED(CONFIG_UNICODE)) {
                if (IS_CASEFOLDED(dir))
                        d_invalidate(dentry);
        }

        return 0;
}

/*
 * Remove the directory @dentry from @dir.
 *
 * Returns -ENOTEMPTY if simple_empty() says the directory still has entries.
 * Otherwise drops one link from the directory and one from its parent, and
 * lets shmem_unlink() do the rest.
 */
static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (simple_empty(dentry)) {
                drop_nlink(d_inode(dentry));
                drop_nlink(dir);
                return shmem_unlink(dir, dentry);
        }

        return -ENOTEMPTY;
}

/*
 * Create a whiteout entry under the name of @old_dentry in @old_dir.
 *
 * A fresh dentry with the same parent and name is allocated and turned into
 * a whiteout device node by shmem_mknod().
 *
 * Returns 0 on success, -ENOMEM if the dentry cannot be allocated, or the
 * error from shmem_mknod().
 */
static int shmem_whiteout(struct mnt_idmap *idmap,
                          struct inode *old_dir, struct dentry *old_dentry)
{
        struct dentry *whiteout;
        int error;

        whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name);
        if (!whiteout)
                return -ENOMEM;

        error = shmem_mknod(idmap, old_dir, whiteout,
                            S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
        /* Drop the d_alloc() reference whether or not mknod succeeded */
        dput(whiteout);

        if (!error) {
                /*
                 * Cheat and hash the whiteout while the old dentry is still
                 * in place, instead of playing games with
                 * FS_RENAME_DOES_D_MOVE.
                 *
                 * d_lookup() will consistently find one of them at this
                 * point, not sure which one, but that isn't even important.
                 */
                d_rehash(whiteout);
        }

        return error;
}

/*
 * The VFS layer already does all the dentry stuff for rename,
 * we just have to decrement the usage count for the target if
 * it exists so that the VFS layer correctly frees it when it
 * gets overwritten.
 *
 * Accepts RENAME_NOREPLACE, RENAME_EXCHANGE and RENAME_WHITEOUT; any other
 * flag yields -EINVAL.  RENAME_EXCHANGE is handed to
 * simple_offset_rename_exchange() in full.  Returns 0 or a negative errno.
 */
static int shmem_rename2(struct mnt_idmap *idmap,
                         struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry,
                         unsigned int flags)
{
        struct inode *inode = d_inode(old_dentry);
        int they_are_dirs = S_ISDIR(inode->i_mode);
        int error;

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;

        if (flags & RENAME_EXCHANGE)
                return simple_offset_rename_exchange(old_dir, old_dentry,
                                                     new_dir, new_dentry);

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        /* Leave a whiteout behind under the old name before moving it */
        if (flags & RENAME_WHITEOUT) {
                error = shmem_whiteout(idmap, old_dir, old_dentry);
                if (error)
                        return error;
        }

        error = simple_offset_rename(old_dir, old_dentry, new_dir, new_dentry);
        if (error)
                return error;

        if (d_really_is_positive(new_dentry)) {
                /* An existing target is replaced: unlink it first */
                (void) shmem_unlink(new_dir, new_dentry);
                if (they_are_dirs) {
                        /* The replaced directory and old_dir each lose a link */
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                }
        } else if (they_are_dirs) {
                /* A directory moved: one link moves from old_dir to new_dir */
                drop_nlink(old_dir);
                inc_nlink(new_dir);
        }

        /* One entry leaves old_dir and one is added to new_dir */
        old_dir->i_size -= BOGO_DIRENT_SIZE;
        new_dir->i_size += BOGO_DIRENT_SIZE;
        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
        return 0;
}

/*
 * Create the symlink @dentry in @dir pointing at @symname.
 *
 * A target of at most SHORT_SYMLINK_LEN bytes (including the terminating
 * NUL) is duplicated with kmemdup() and attached to the inode with
 * inode_set_cached_link().  A longer target, up to PAGE_SIZE, is copied
 * into folio 0 of the inode's mapping.
 *
 * Returns 0 on success, -ENAMETOOLONG if the target including its NUL
 * exceeds PAGE_SIZE, -ENOMEM if the short target cannot be duplicated, or
 * the error from one of the helpers.
 */
static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
                         struct dentry *dentry, const char *symname)
{
        int error;
        int len;
        struct inode *inode;
        struct folio *folio;
        char *link;

        /* len counts the terminating NUL */
        len = strlen(symname) + 1;
        if (len > PAGE_SIZE)
                return -ENAMETOOLONG;

        inode = shmem_get_inode(idmap, dir->i_sb, dir, S_IFLNK | 0777, 0,
                                VM_NORESERVE);
        if (IS_ERR(inode))
                return PTR_ERR(inode);

        error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             shmem_initxattrs, NULL);
        /* -EOPNOTSUPP from the security hook is tolerated */
        if (error && error != -EOPNOTSUPP)
                goto out_iput;

        error = simple_offset_add(shmem_get_offset_ctx(dir), dentry);
        if (error)
                goto out_iput;

        /* i_size is the target length without the NUL */
        inode->i_size = len-1;
        if (len <= SHORT_SYMLINK_LEN) {
                /* Short target: keep a private copy attached to the inode */
                link = kmemdup(symname, len, GFP_KERNEL);
                if (!link) {
                        error = -ENOMEM;
                        goto out_remove_offset;
                }
                inode->i_op = &shmem_short_symlink_operations;
                inode_set_cached_link(inode, link, len - 1);
        } else {
                /* Long target: store it (with its NUL) in the first folio */
                inode_nohighmem(inode);
                inode->i_mapping->a_ops = &shmem_aops;
                error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
                if (error)
                        goto out_remove_offset;
                inode->i_op = &shmem_symlink_inode_operations;
                memcpy(folio_address(folio), symname, len);
                folio_mark_uptodate(folio);
                folio_mark_dirty(folio);
                folio_unlock(folio);
                folio_put(folio);
        }
        /* Account for the new entry in the parent directory */
        dir->i_size += BOGO_DIRENT_SIZE;
        inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
        inode_inc_iversion(dir);
        if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
                d_add(dentry, inode);
        else
                d_instantiate(dentry, inode);
        /* Extra reference, as taken in shmem_mknod() to pin the dentry */
        dget(dentry);
        return 0;

out_remove_offset:
        /* Undo simple_offset_add() before dropping the inode */
        simple_offset_remove(shmem_get_offset_ctx(dir), dentry);
out_iput:
        iput(inode);
        return error;
}

/*
 * Delayed-call callback registered by shmem_get_link(): @arg is the folio
 * holding the link target.  Marks it accessed and drops the reference.
 */
static void shmem_put_link(void *arg)
{
        struct folio *folio = arg;

        folio_mark_accessed(folio);
        folio_put(folio);
}

static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
                                  struct delayed_call *done)
{
        struct folio *folio = NULL;
        int error;

        if (!dentry) {
                folio = filemap_get_folio(inode->i_mapping, 0);
                if (IS_ERR(folio))
                        return ERR_PTR(-ECHILD);
                if (PageHWPoison(folio_page(folio, 0)) ||
                    !folio_test_uptodate(folio)) {
                        folio_put(folio);
                        return ERR_PTR(-ECHILD);
                }
        } else {
                error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
                if (error)
                        return ERR_PTR(error);
                if (!folio)
                        return ERR_PTR(-ECHILD);
                if (PageHWPoison(folio_page(folio, 0))) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ERR_PTR(-ECHILD);
                }
                folio_unlock(folio);
        }
        set_delayed_call(done, shmem_put_link, folio);
        return folio_address(folio);
}

#ifdef CONFIG_TMPFS_XATTR

/*
 * Report the inode's file attribute flags in @fa, restricted to the bits in
 * SHMEM_FL_USER_VISIBLE.  Always returns 0.
 */
static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);

        fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
        return 0;
}

/*
 * Update the inode's file attribute flags from @fa.
 *
 * Requests using the fsx form, or touching any bit outside
 * SHMEM_FL_USER_MODIFIABLE, are rejected with -EOPNOTSUPP.  Otherwise the
 * modifiable bits are replaced with those from @fa, the result is applied
 * with shmem_set_inode_flags(), and on success it is stored in the inode
 * info and ctime and i_version are updated.
 *
 * Returns 0 on success or a negative errno.
 */
static int shmem_fileattr_set(struct mnt_idmap *idmap,
                              struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct shmem_inode_info *info = SHMEM_I(inode);
        int flags, ret;

        if (fileattr_has_fsx(fa) || (fa->flags & ~SHMEM_FL_USER_MODIFIABLE))
                return -EOPNOTSUPP;

        /* Keep the fixed bits, take the modifiable ones from the request */
        flags = info->fsflags & ~SHMEM_FL_USER_MODIFIABLE;
        flags |= fa->flags & SHMEM_FL_USER_MODIFIABLE;

        ret = shmem_set_inode_flags(inode, flags, dentry);
        if (!ret) {
                info->fsflags = flags;
                inode_set_ctime_current(inode);
                inode_inc_iversion(inode);
        }

        return ret;
}

/*
 * Superblocks without xattr inode operations may get some security.* xattr
 * support from the LSM "for free". As soon as we have any other xattrs
 * like ACLs, we also need to implement the security.* handlers at
 * filesystem level, though.
 */

/*
 * Callback for security_inode_init_security() for acquiring xattrs.
 *
 * @xattr_array is walked until an entry with a NULL name.  Each entry is
 * copied into a freshly allocated simple_xattr whose name is
 * XATTR_SECURITY_PREFIX followed by the entry's name, and is added to the
 * inode's xattr list.  @fs_info is not used.
 *
 * When the superblock limits inodes (max_inodes != 0), the space for all
 * entries is reserved from sbinfo->free_ispace up front, under stat_lock.
 *
 * Returns 0 on success, -ENOSPC if the reservation does not fit, or -ENOMEM
 * if an allocation fails (the reservation is given back and info->xattrs is
 * freed in that case).
 */
static int shmem_initxattrs(struct inode *inode,
                            const struct xattr *xattr_array, void *fs_info)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        const struct xattr *src;
        struct simple_xattr *copy;
        size_t reserved = 0;
        size_t name_len;

        if (sbinfo->max_inodes) {
                /* The stored name gains the prefix, so account for it too. */
                for (src = xattr_array; src->name; src++)
                        reserved += simple_xattr_space(src->name,
                                src->value_len + XATTR_SECURITY_PREFIX_LEN);

                if (reserved) {
                        int fits;

                        raw_spin_lock(&sbinfo->stat_lock);
                        fits = sbinfo->free_ispace >= reserved;
                        if (fits)
                                sbinfo->free_ispace -= reserved;
                        raw_spin_unlock(&sbinfo->stat_lock);
                        if (!fits)
                                return -ENOSPC;
                }
        }

        for (src = xattr_array; src->name; src++) {
                copy = simple_xattr_alloc(src->value, src->value_len);
                if (!copy)
                        goto nomem;

                /* name_len includes the terminating NUL. */
                name_len = strlen(src->name) + 1;
                copy->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + name_len,
                                     GFP_KERNEL_ACCOUNT);
                if (!copy->name) {
                        kvfree(copy);
                        goto nomem;
                }

                memcpy(copy->name, XATTR_SECURITY_PREFIX,
                       XATTR_SECURITY_PREFIX_LEN);
                memcpy(copy->name + XATTR_SECURITY_PREFIX_LEN,
                       src->name, name_len);

                simple_xattr_add(&info->xattrs, copy);
        }

        return 0;

nomem:
        /* Undo the up-front reservation and drop what was already added. */
        if (reserved) {
                raw_spin_lock(&sbinfo->stat_lock);
                sbinfo->free_ispace += reserved;
                raw_spin_unlock(&sbinfo->stat_lock);
        }
        simple_xattrs_free(&info->xattrs, NULL);
        return -ENOMEM;
}

/*
 * xattr_handler ->get: look up @name, expanded to its full name via
 * xattr_full_name(), in the inode's simple xattr list and return whatever
 * simple_xattr_get() returns.  The dentry argument is not used.
 */
static int shmem_xattr_handler_get(const struct xattr_handler *handler,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, void *buffer, size_t size)
{
        const char *full_name = xattr_full_name(handler, name);

        return simple_xattr_get(&SHMEM_I(inode)->xattrs, full_name,
                                buffer, size);
}

/*
 * xattr_handler ->set: create, replace or remove (when @value is NULL) the
 * xattr @name on @inode.
 *
 * On superblocks with an inode limit, space for a new value is reserved from
 * sbinfo->free_ispace before the update; -ENOSPC is returned if it does not
 * fit.  Afterwards exactly one amount is handed back: the reservation if
 * simple_xattr_set() failed, or the space of the xattr that was replaced or
 * removed if it succeeded.  A successful update also refreshes ctime and
 * bumps i_version.
 *
 * Returns 0 on success or a negative errno.
 */
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
                                   struct mnt_idmap *idmap,
                                   struct dentry *unused, struct inode *inode,
                                   const char *name, const void *value,
                                   size_t size, int flags)
{
        struct shmem_inode_info *info = SHMEM_I(inode);
        struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
        struct simple_xattr *old;
        size_t refund = 0;
        int ret;

        name = xattr_full_name(handler, name);

        if (value && sbinfo->max_inodes) {
                refund = simple_xattr_space(name, size);
                raw_spin_lock(&sbinfo->stat_lock);
                if (sbinfo->free_ispace >= refund)
                        sbinfo->free_ispace -= refund;
                else
                        refund = 0;
                raw_spin_unlock(&sbinfo->stat_lock);
                if (refund == 0)
                        return -ENOSPC;
        }

        old = simple_xattr_set(&info->xattrs, name, value, size, flags);
        if (IS_ERR(old)) {
                /* Nothing changed: the reservation (if any) is refunded. */
                ret = PTR_ERR(old);
        } else {
                /* The reservation is now in use; refund the old xattr. */
                if (old && sbinfo->max_inodes)
                        refund = simple_xattr_space(old->name, old->size);
                else
                        refund = 0;
                simple_xattr_free(old);
                inode_set_ctime_current(inode);
                inode_inc_iversion(inode);
                ret = 0;
        }

        if (refund) {
                raw_spin_lock(&sbinfo->stat_lock);
                sbinfo->free_ispace += refund;
                raw_spin_unlock(&sbinfo->stat_lock);
        }
        return ret;
}

/*
 * The security.*, trusted.* and user.* namespaces all share the same
 * get/set callbacks; only the prefix differs.
 */
static const struct xattr_handler shmem_security_xattr_handler = {
        .get    = shmem_xattr_handler_get,
        .set    = shmem_xattr_handler_set,
        .prefix = XATTR_SECURITY_PREFIX,
};

static const struct xattr_handler shmem_trusted_xattr_handler = {
        .get    = shmem_xattr_handler_get,
        .set    = shmem_xattr_handler_set,
        .prefix = XATTR_TRUSTED_PREFIX,
};

static const struct xattr_handler shmem_user_xattr_handler = {
        .get    = shmem_xattr_handler_get,
        .set    = shmem_xattr_handler_set,
        .prefix = XATTR_USER_PREFIX,
};

/* NULL-terminated handler list installed as sb->s_xattr. */
static const struct xattr_handler * const shmem_xattr_handlers[] = {
        &shmem_security_xattr_handler,
        &shmem_trusted_xattr_handler,
        &shmem_user_xattr_handler,
        NULL,
};

/*
 * ->listxattr: list the xattrs kept in the inode's simple xattr list into
 * @buffer; the result is whatever simple_xattr_list() returns.
 */
static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
        struct inode *inode = d_inode(dentry);

        return simple_xattr_list(inode, &SHMEM_I(inode)->xattrs, buffer, size);
}
#endif /* CONFIG_TMPFS_XATTR */

/* Inode operations for symlinks resolved through simple_get_link(). */
static const struct inode_operations shmem_short_symlink_operations = {
        .get_link       = simple_get_link,
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr      = shmem_listxattr,
#endif
};

/* Inode operations for symlinks resolved through shmem_get_link(). */
static const struct inode_operations shmem_symlink_inode_operations = {
        .get_link       = shmem_get_link,
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr      = shmem_listxattr,
#endif
};

/*
 * export_operations ->get_parent: no parent is ever looked up for @child;
 * the call always fails with ERR_PTR(-ESTALE).
 */
static struct dentry *shmem_get_parent(struct dentry *child)
{
        return ERR_PTR(-ESTALE);
}

static int shmem_match(struct inode *ino, void *vfh)
{
        __u32 *fh = vfh;
        __u64 inum = fh[2];
        inum = (inum << 32) | fh[1];
        return ino->i_ino == inum && fh[0] == ino->i_generation;
}

/*
 * Find any alias of inode, but prefer a hashed alias: try d_find_alias()
 * first and fall back to d_find_any_alias() only if that returns NULL.
 */
static struct dentry *shmem_find_alias(struct inode *inode)
{
        struct dentry *alias = d_find_alias(inode);

        if (alias)
                return alias;
        return d_find_any_alias(inode);
}

static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
                struct fid *fid, int fh_len, int fh_type)
{
        struct inode *inode;
        struct dentry *dentry = NULL;
        u64 inum;

        if (fh_len < 3)
                return NULL;

        inum = fid->raw[2];
        inum = (inum << 32) | fid->raw[1];

        inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]),
                        shmem_match, fid->raw);
        if (inode) {
                dentry = shmem_find_alias(inode);
                iput(inode);
        }

        return dentry;
}

/*
 * export_operations ->encode_fh: encode @inode as a three-word file handle.
 *
 * Layout: fh[0] = generation, fh[1] = low 32 bits of the inode number,
 * fh[2] = high 32 bits.  If the caller's buffer holds fewer than three
 * words, *len is set to the required size and FILEID_INVALID is returned.
 * Otherwise *len is set to 3 and 1 is returned.  @parent is not used.
 */
static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
                                struct inode *parent)
{
        /*
         * Unfortunately insert_inode_hash is not idempotent, so as we hash
         * inodes here rather than at creation time, we need a lock to
         * ensure we only try to do it once.
         */
        static DEFINE_SPINLOCK(lock);

        if (*len < 3) {
                *len = 3;
                return FILEID_INVALID;
        }

        if (inode_unhashed(inode)) {
                spin_lock(&lock);
                /* Re-check under the lock: someone may have hashed it. */
                if (inode_unhashed(inode))
                        __insert_inode_hash(inode,
                                            inode->i_ino + inode->i_generation);
                spin_unlock(&lock);
        }

        *len = 3;
        fh[0] = inode->i_generation;
        fh[1] = inode->i_ino;
        fh[2] = ((__u64)inode->i_ino) >> 32;

        return 1;
}

/* File-handle export operations, installed as sb->s_export_op. */
static const struct export_operations shmem_export_ops = {
        .encode_fh      = shmem_encode_fh,
        .fh_to_dentry   = shmem_fh_to_dentry,
        .get_parent     = shmem_get_parent,
};

enum shmem_param {
        Opt_gid,
        Opt_huge,
        Opt_mode,
        Opt_mpol,
        Opt_nr_blocks,
        Opt_nr_inodes,
        Opt_size,
        Opt_uid,
        Opt_inode32,
        Opt_inode64,
        Opt_noswap,
        Opt_quota,
        Opt_usrquota,
        Opt_grpquota,
        Opt_usrquota_block_hardlimit,
        Opt_usrquota_inode_hardlimit,
        Opt_grpquota_block_hardlimit,
        Opt_grpquota_inode_hardlimit,
        Opt_casefold_version,
        Opt_casefold,
        Opt_strict_encoding,
};

/* Values accepted by the "huge" mount option and the SHMEM_HUGE_* they select. */
static const struct constant_table shmem_param_enums_huge[] = {
        {"never",        SHMEM_HUGE_NEVER },
        {"always",        SHMEM_HUGE_ALWAYS },
        {"within_size",        SHMEM_HUGE_WITHIN_SIZE },
        {"advise",        SHMEM_HUGE_ADVISE },
        {}
};

/*
 * Mount option table handed to fs_parse() by shmem_parse_one().
 *
 * The quota options are only recognised when CONFIG_TMPFS_QUOTA is set.
 * "casefold" is listed twice on purpose: once as a string option (a value
 * is given) and once as a flag (no value).
 */
const struct fs_parameter_spec shmem_fs_parameters[] = {
        fsparam_gid   ("gid",                Opt_gid),
        fsparam_enum  ("huge",                Opt_huge,  shmem_param_enums_huge),
        fsparam_u32oct("mode",                Opt_mode),
        fsparam_string("mpol",                Opt_mpol),
        fsparam_string("nr_blocks",        Opt_nr_blocks),
        fsparam_string("nr_inodes",        Opt_nr_inodes),
        fsparam_string("size",                Opt_size),
        fsparam_uid   ("uid",                Opt_uid),
        fsparam_flag  ("inode32",        Opt_inode32),
        fsparam_flag  ("inode64",        Opt_inode64),
        fsparam_flag  ("noswap",        Opt_noswap),
#ifdef CONFIG_TMPFS_QUOTA
        fsparam_flag  ("quota",                Opt_quota),
        fsparam_flag  ("usrquota",        Opt_usrquota),
        fsparam_flag  ("grpquota",        Opt_grpquota),
        fsparam_string("usrquota_block_hardlimit", Opt_usrquota_block_hardlimit),
        fsparam_string("usrquota_inode_hardlimit", Opt_usrquota_inode_hardlimit),
        fsparam_string("grpquota_block_hardlimit", Opt_grpquota_block_hardlimit),
        fsparam_string("grpquota_inode_hardlimit", Opt_grpquota_inode_hardlimit),
#endif
        fsparam_string("casefold",        Opt_casefold_version),
        fsparam_flag  ("casefold",        Opt_casefold),
        fsparam_flag  ("strict_encoding", Opt_strict_encoding),
        {}
};

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Handle the "casefold" mount option and store the loaded UTF-8 map in
 * ctx->encoding.
 *
 * @latest_version: true when the option was given as a bare flag, in which
 *	case UTF8_LATEST is loaded and param->string is not looked at; false
 *	when a value was given, which must have the form
 *	"utf8-<version number>".
 *
 * Returns 0 on success or the result of invalfc() if the value is malformed
 * or the requested version cannot be loaded.
 */
static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
                                    bool latest_version)
{
        struct shmem_options *ctx = fc->fs_private;
        int version = UTF8_LATEST;
        struct unicode_map *encoding;

        if (!latest_version) {
                char *version_str;

                if (strncmp(param->string, "utf8-", 5))
                        return invalfc(fc, "Only UTF-8 encodings are supported "
                                       "in the format: utf8-<version number>");

                /*
                 * Only step past the prefix once it is known to be there;
                 * for the flag form param->string is not set, so the
                 * offset must not be computed unconditionally.
                 */
                version_str = param->string + 5;

                version = utf8_parse_version(version_str);
                if (version < 0)
                        return invalfc(fc, "Invalid UTF-8 version: %s", version_str);
        }

        encoding = utf8_load(version);

        if (IS_ERR(encoding)) {
                return invalfc(fc, "Failed loading UTF-8 version: utf8-%u.%u.%u\n",
                               unicode_major(version), unicode_minor(version),
                               unicode_rev(version));
        }

        pr_info("tmpfs: Using encoding : utf8-%u.%u.%u\n",
                unicode_major(version), unicode_minor(version), unicode_rev(version));

        ctx->encoding = encoding;

        return 0;
}
#else
/* Without CONFIG_UNICODE the "casefold" option is always rejected. */
static int shmem_parse_opt_casefold(struct fs_context *fc, struct fs_parameter *param,
                                    bool latest_version)
{
        return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
}
#endif

/*
 * fs_context ->parse_param: parse one mount parameter into the
 * shmem_options kept in fc->fs_private.
 *
 * The parameter is looked up in shmem_fs_parameters[] via fs_parse().
 * Options that shmem_reconfigure()/shmem_fill_super() need to tell apart
 * from "not given" also set a SHMEM_SEEN_* bit in ctx->seen.
 *
 * Returns 0 on success, the negative value from fs_parse() if the lookup
 * fails, or the result of invalfc() for a bad or unsupported value.
 */
static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
        struct shmem_options *ctx = fc->fs_private;
        struct fs_parse_result result;
        unsigned long long size;
        char *rest;
        int opt;
        kuid_t kuid;
        kgid_t kgid;

        opt = fs_parse(fc, shmem_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        switch (opt) {
        case Opt_size:
                size = memparse(param->string, &rest);
                /* "N%": take N percent of totalram_pages(), in bytes. */
                if (*rest == '%') {
                        size <<= PAGE_SHIFT;
                        size *= totalram_pages();
                        do_div(size, 100);
                        rest++;
                }
                if (*rest)
                        goto bad_value;
                /* Stored as a page count, rounded up. */
                ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
                ctx->seen |= SHMEM_SEEN_BLOCKS;
                break;
        case Opt_nr_blocks:
                ctx->blocks = memparse(param->string, &rest);
                if (*rest || ctx->blocks > LONG_MAX)
                        goto bad_value;
                ctx->seen |= SHMEM_SEEN_BLOCKS;
                break;
        case Opt_nr_inodes:
                ctx->inodes = memparse(param->string, &rest);
                /* Later code multiplies the count by BOGO_INODE_SIZE. */
                if (*rest || ctx->inodes > ULONG_MAX / BOGO_INODE_SIZE)
                        goto bad_value;
                ctx->seen |= SHMEM_SEEN_INODES;
                break;
        case Opt_mode:
                ctx->mode = result.uint_32 & 07777;
                break;
        case Opt_uid:
                kuid = result.uid;

                /*
                 * The requested uid must be representable in the
                 * filesystem's idmapping.
                 */
                if (!kuid_has_mapping(fc->user_ns, kuid))
                        goto bad_value;

                ctx->uid = kuid;
                break;
        case Opt_gid:
                kgid = result.gid;

                /*
                 * The requested gid must be representable in the
                 * filesystem's idmapping.
                 */
                if (!kgid_has_mapping(fc->user_ns, kgid))
                        goto bad_value;

                ctx->gid = kgid;
                break;
        case Opt_huge:
                ctx->huge = result.uint_32;
                /* Anything but "never" needs transparent hugepage support. */
                if (ctx->huge != SHMEM_HUGE_NEVER &&
                    !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
                      has_transparent_hugepage()))
                        goto unsupported_parameter;
                ctx->seen |= SHMEM_SEEN_HUGE;
                break;
        case Opt_mpol:
                if (IS_ENABLED(CONFIG_NUMA)) {
                        /* Drop a policy parsed from an earlier mpol= first. */
                        mpol_put(ctx->mpol);
                        ctx->mpol = NULL;
                        if (mpol_parse_str(param->string, &ctx->mpol))
                                goto bad_value;
                        break;
                }
                goto unsupported_parameter;
        case Opt_inode32:
                ctx->full_inums = false;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
        case Opt_inode64:
                if (sizeof(ino_t) < 8) {
                        return invalfc(fc,
                                       "Cannot use inode64 with <64bit inums in kernel\n");
                }
                ctx->full_inums = true;
                ctx->seen |= SHMEM_SEEN_INUMS;
                break;
        case Opt_noswap:
                /* Only for the initial user namespace with CAP_SYS_ADMIN. */
                if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN)) {
                        return invalfc(fc,
                                       "Turning off swap in unprivileged tmpfs mounts unsupported");
                }
                ctx->noswap = true;
                ctx->seen |= SHMEM_SEEN_NOSWAP;
                break;
        case Opt_quota:
                /* Plain "quota" enables both user and group quota types. */
                if (fc->user_ns != &init_user_ns)
                        return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
                ctx->seen |= SHMEM_SEEN_QUOTA;
                ctx->quota_types |= (QTYPE_MASK_USR | QTYPE_MASK_GRP);
                break;
        case Opt_usrquota:
                if (fc->user_ns != &init_user_ns)
                        return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
                ctx->seen |= SHMEM_SEEN_QUOTA;
                ctx->quota_types |= QTYPE_MASK_USR;
                break;
        case Opt_grpquota:
                if (fc->user_ns != &init_user_ns)
                        return invalfc(fc, "Quotas in unprivileged tmpfs mounts are unsupported");
                ctx->seen |= SHMEM_SEEN_QUOTA;
                ctx->quota_types |= QTYPE_MASK_GRP;
                break;
        /* The four hard limits below must be non-zero and within range. */
        case Opt_usrquota_block_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
                        return invalfc(fc,
                                       "User quota block hardlimit too large.");
                ctx->qlimits.usrquota_bhardlimit = size;
                break;
        case Opt_grpquota_block_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_SPC_LIMIT)
                        return invalfc(fc,
                                       "Group quota block hardlimit too large.");
                ctx->qlimits.grpquota_bhardlimit = size;
                break;
        case Opt_usrquota_inode_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
                        return invalfc(fc,
                                       "User quota inode hardlimit too large.");
                ctx->qlimits.usrquota_ihardlimit = size;
                break;
        case Opt_grpquota_inode_hardlimit:
                size = memparse(param->string, &rest);
                if (*rest || !size)
                        goto bad_value;
                if (size > SHMEM_QUOTA_MAX_INO_LIMIT)
                        return invalfc(fc,
                                       "Group quota inode hardlimit too large.");
                ctx->qlimits.grpquota_ihardlimit = size;
                break;
        case Opt_casefold_version:
                /* "casefold=<value>": the value names the version to load. */
                return shmem_parse_opt_casefold(fc, param, false);
        case Opt_casefold:
                /* Bare "casefold": use the latest version. */
                return shmem_parse_opt_casefold(fc, param, true);
        case Opt_strict_encoding:
#if IS_ENABLED(CONFIG_UNICODE)
                ctx->strict_encoding = true;
                break;
#else
                return invalfc(fc, "tmpfs: Kernel not built with CONFIG_UNICODE\n");
#endif
        }
        return 0;

unsupported_parameter:
        return invalfc(fc, "Unsupported parameter '%s'", param->key);
bad_value:
        return invalfc(fc, "Bad value for '%s'", param->key);
}

/*
 * Split the next option off the comma-separated option string *@s.
 *
 * Returns the start of the option, NUL-terminated in place, and advances
 * *@s past it; *@s becomes NULL once the last option has been handed out.
 * Returns NULL when *@s is already NULL.
 *
 * Mount options form a comma-separated list, but mpol's nodelist may also
 * contain commas, so a comma directly followed by a digit is treated as
 * part of the current option rather than as a separator.
 */
static char *shmem_next_opt(char **s)
{
        char *start = *s;
        char *comma;

        if (!start)
                return NULL;

        while ((comma = strchr(*s, ',')) != NULL) {
                *s = comma + 1;
                if (isdigit(comma[1]))
                        continue;        /* still inside this option */
                *comma = '\0';
                return start;
        }

        /* No separator left: this is the final option. */
        *s = NULL;
        return start;
}

/*
 * fs_context ->parse_monolithic: hand the option string @data to
 * vfs_parse_monolithic_sep(), using shmem_next_opt() as the splitter.
 */
static int shmem_parse_monolithic(struct fs_context *fc, void *data)
{
        return vfs_parse_monolithic_sep(fc, data, shmem_next_opt);
}

/*
 * Reconfigure a shmem filesystem.
 *
 * All requested changes in fc->fs_private are validated first, under
 * sbinfo->stat_lock; if any of them is not allowed the lock is dropped and
 * the result of invalfc() with an explanatory message is returned without
 * changing anything.  Only after every check has passed are the new
 * settings copied into sbinfo.  Returns 0 on success.
 */
static int shmem_reconfigure(struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;
        struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
        unsigned long used_isp;
        struct mempolicy *mpol = NULL;
        const char *err;

        raw_spin_lock(&sbinfo->stat_lock);
        /* Inode space currently in use: total minus what is still free. */
        used_isp = sbinfo->max_inodes * BOGO_INODE_SIZE - sbinfo->free_ispace;

        if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
                if (!sbinfo->max_blocks) {
                        err = "Cannot retroactively limit size";
                        goto out;
                }
                if (percpu_counter_compare(&sbinfo->used_blocks,
                                           ctx->blocks) > 0) {
                        err = "Too small a size for current use";
                        goto out;
                }
        }
        if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
                if (!sbinfo->max_inodes) {
                        err = "Cannot retroactively limit inodes";
                        goto out;
                }
                if (ctx->inodes * BOGO_INODE_SIZE < used_isp) {
                        err = "Too few inodes for current use";
                        goto out;
                }
        }

        if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
            sbinfo->next_ino > UINT_MAX) {
                err = "Current inum too high to switch to 32-bit inums";
                goto out;
        }
        if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
                err = "Cannot disable swap on remount";
                goto out;
        }
        if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
                err = "Cannot enable swap on remount if it was disabled on first mount";
                goto out;
        }

        if (ctx->seen & SHMEM_SEEN_QUOTA &&
            !sb_any_quota_loaded(fc->root->d_sb)) {
                err = "Cannot enable quota on remount";
                goto out;
        }

#ifdef CONFIG_TMPFS_QUOTA
/* True if a non-zero limit was requested and differs from the current one. */
#define CHANGED_LIMIT(name)                                                \
        (ctx->qlimits.name## hardlimit &&                                \
        (ctx->qlimits.name## hardlimit != sbinfo->qlimits.name## hardlimit))

        if (CHANGED_LIMIT(usrquota_b) || CHANGED_LIMIT(usrquota_i) ||
            CHANGED_LIMIT(grpquota_b) || CHANGED_LIMIT(grpquota_i)) {
                err = "Cannot change global quota limit on remount";
                goto out;
        }
#endif /* CONFIG_TMPFS_QUOTA */

        /* Every check passed: apply the options that were actually given. */
        if (ctx->seen & SHMEM_SEEN_HUGE)
                sbinfo->huge = ctx->huge;
        if (ctx->seen & SHMEM_SEEN_INUMS)
                sbinfo->full_inums = ctx->full_inums;
        if (ctx->seen & SHMEM_SEEN_BLOCKS)
                sbinfo->max_blocks  = ctx->blocks;
        if (ctx->seen & SHMEM_SEEN_INODES) {
                sbinfo->max_inodes  = ctx->inodes;
                sbinfo->free_ispace = ctx->inodes * BOGO_INODE_SIZE - used_isp;
        }

        /*
         * Preserve previous mempolicy unless mpol remount option was specified.
         */
        if (ctx->mpol) {
                mpol = sbinfo->mpol;
                sbinfo->mpol = ctx->mpol;        /* transfers initial ref */
                ctx->mpol = NULL;
        }

        if (ctx->noswap)
                sbinfo->noswap = true;

        raw_spin_unlock(&sbinfo->stat_lock);
        /* The replaced policy (if any) is released after the lock is dropped. */
        mpol_put(mpol);
        return 0;
out:
        raw_spin_unlock(&sbinfo->stat_lock);
        return invalfc(fc, "%s", err);
}

/*
 * Write the mount options of the superblock behind @root to @seq, each one
 * prefixed with a comma.  size, nr_inodes, mode, uid and gid are printed
 * only when they differ from the defaults compared against below.  Always
 * returns 0.
 */
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(root->d_sb);
        struct mempolicy *mpol;

        if (sbinfo->max_blocks != shmem_default_max_blocks())
                seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks));
        if (sbinfo->max_inodes != shmem_default_max_inodes())
                seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
        if (sbinfo->mode != (0777 | S_ISVTX))
                seq_printf(seq, ",mode=%03ho", sbinfo->mode);
        if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
                seq_printf(seq, ",uid=%u",
                                from_kuid_munged(&init_user_ns, sbinfo->uid));
        if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
                seq_printf(seq, ",gid=%u",
                                from_kgid_munged(&init_user_ns, sbinfo->gid));

        /*
         * Showing inode{64,32} might be useful even if it's the system default,
         * since then people don't have to resort to checking both here and
         * /proc/config.gz to confirm 64-bit inums were successfully applied
         * (which may not even exist if IKCONFIG_PROC isn't enabled).
         *
         * We hide it when inode64 isn't the default and we are using 32-bit
         * inodes, since that probably just means the feature isn't even under
         * consideration.
         *
         * As such:
         *
         *                     +-----------------+-----------------+
         *                     | TMPFS_INODE64=y | TMPFS_INODE64=n |
         *  +------------------+-----------------+-----------------+
         *  | full_inums=true  | show            | show            |
         *  | full_inums=false | show            | hide            |
         *  +------------------+-----------------+-----------------+
         *
         */
        if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
                seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
        if (sbinfo->huge)
                seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
#endif
        /* The policy obtained from shmem_get_sbmpol() is put again after printing. */
        mpol = shmem_get_sbmpol(sbinfo);
        shmem_show_mpol(seq, mpol);
        mpol_put(mpol);
        if (sbinfo->noswap)
                seq_printf(seq, ",noswap");
#ifdef CONFIG_TMPFS_QUOTA
        if (sb_has_quota_active(root->d_sb, USRQUOTA))
                seq_printf(seq, ",usrquota");
        if (sb_has_quota_active(root->d_sb, GRPQUOTA))
                seq_printf(seq, ",grpquota");
        /* Hard limits are shown only when non-zero. */
        if (sbinfo->qlimits.usrquota_bhardlimit)
                seq_printf(seq, ",usrquota_block_hardlimit=%lld",
                           sbinfo->qlimits.usrquota_bhardlimit);
        if (sbinfo->qlimits.grpquota_bhardlimit)
                seq_printf(seq, ",grpquota_block_hardlimit=%lld",
                           sbinfo->qlimits.grpquota_bhardlimit);
        if (sbinfo->qlimits.usrquota_ihardlimit)
                seq_printf(seq, ",usrquota_inode_hardlimit=%lld",
                           sbinfo->qlimits.usrquota_ihardlimit);
        if (sbinfo->qlimits.grpquota_ihardlimit)
                seq_printf(seq, ",grpquota_inode_hardlimit=%lld",
                           sbinfo->qlimits.grpquota_ihardlimit);
#endif
        return 0;
}

#endif /* CONFIG_TMPFS */

/*
 * Release the per-superblock state hanging off @sb: unload the encoding (if
 * any), disable quotas, free the per-cpu inode batch and the used_blocks
 * counter, drop the mempolicy reference, then free sbinfo itself and clear
 * sb->s_fs_info.  Also used by shmem_fill_super() as its failure path.
 */
static void shmem_put_super(struct super_block *sb)
{
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);

#if IS_ENABLED(CONFIG_UNICODE)
        if (sb->s_encoding)
                utf8_unload(sb->s_encoding);
#endif

#ifdef CONFIG_TMPFS_QUOTA
        shmem_disable_quotas(sb);
#endif
        free_percpu(sbinfo->ino_batch);
        percpu_counter_destroy(&sbinfo->used_blocks);
        mpol_put(sbinfo->mpol);
        kfree(sbinfo);
        sb->s_fs_info = NULL;
}

#if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_TMPFS)
/* Dentry operations installed as sb->s_d_op when an encoding is set. */
static const struct dentry_operations shmem_ci_dentry_ops = {
        .d_delete       = always_delete_dentry,
        .d_compare      = generic_ci_d_compare,
        .d_hash         = generic_ci_d_hash,
};
#endif

/*
 * Set up a new shmem superblock from the options parsed into
 * fc->fs_private.
 *
 * Allocates and fills the shmem_sb_info, initialises the superblock fields,
 * optionally enables quotas, and creates the root directory inode and
 * dentry.  On any failure after sbinfo has been attached, shmem_put_super()
 * undoes the setup and a negative errno is returned (-ENOMEM unless a more
 * specific error was recorded).  Returns 0 on success.
 */
static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
        struct shmem_options *ctx = fc->fs_private;
        struct inode *inode;
        struct shmem_sb_info *sbinfo;
        int error = -ENOMEM;

        /* Round up to L1_CACHE_BYTES to resist false sharing */
        sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
                                L1_CACHE_BYTES), GFP_KERNEL);
        if (!sbinfo)
                return error;

        sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
        /*
         * Per default we only allow half of the physical ram per
         * tmpfs instance, limiting inodes to one per page of lowmem;
         * but the internal instance is left unlimited.
         */
        if (!(sb->s_flags & SB_KERNMOUNT)) {
                /* Fill in defaults for whatever was not given at mount time. */
                if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
                        ctx->blocks = shmem_default_max_blocks();
                if (!(ctx->seen & SHMEM_SEEN_INODES))
                        ctx->inodes = shmem_default_max_inodes();
                if (!(ctx->seen & SHMEM_SEEN_INUMS))
                        ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
                sbinfo->noswap = ctx->noswap;
        } else {
                sb->s_flags |= SB_NOUSER;
        }
        sb->s_export_op = &shmem_export_ops;
        sb->s_flags |= SB_NOSEC | SB_I_VERSION;

#if IS_ENABLED(CONFIG_UNICODE)
        if (!ctx->encoding && ctx->strict_encoding) {
                pr_err("tmpfs: strict_encoding option without encoding is forbidden\n");
                error = -EINVAL;
                goto failed;
        }

        if (ctx->encoding) {
                sb->s_encoding = ctx->encoding;
                sb->s_d_op = &shmem_ci_dentry_ops;
                if (ctx->strict_encoding)
                        sb->s_encoding_flags = SB_ENC_STRICT_MODE_FL;
        }
#endif

#else
        sb->s_flags |= SB_NOUSER;
#endif /* CONFIG_TMPFS */
        sbinfo->max_blocks = ctx->blocks;
        sbinfo->max_inodes = ctx->inodes;
        sbinfo->free_ispace = sbinfo->max_inodes * BOGO_INODE_SIZE;
        /* Only kernel-internal mounts get a per-cpu inode number batch. */
        if (sb->s_flags & SB_KERNMOUNT) {
                sbinfo->ino_batch = alloc_percpu(ino_t);
                if (!sbinfo->ino_batch)
                        goto failed;
        }
        sbinfo->uid = ctx->uid;
        sbinfo->gid = ctx->gid;
        sbinfo->full_inums = ctx->full_inums;
        sbinfo->mode = ctx->mode;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (ctx->seen & SHMEM_SEEN_HUGE)
                sbinfo->huge = ctx->huge;
        else
                sbinfo->huge = tmpfs_huge;
#endif
        /* The mempolicy moves from the context to the superblock info. */
        sbinfo->mpol = ctx->mpol;
        ctx->mpol = NULL;

        raw_spin_lock_init(&sbinfo->stat_lock);
        if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
                goto failed;
        spin_lock_init(&sbinfo->shrinklist_lock);
        INIT_LIST_HEAD(&sbinfo->shrinklist);

        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_SIZE;
        sb->s_blocksize_bits = PAGE_SHIFT;
        sb->s_magic = TMPFS_MAGIC;
        sb->s_op = &shmem_ops;
        sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
        sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        sb->s_flags |= SB_POSIXACL;
#endif
        /* Each superblock gets a freshly generated UUID. */
        uuid_t uuid;
        uuid_gen(&uuid);
        super_set_uuid(sb, uuid.b, sizeof(uuid));

#ifdef CONFIG_TMPFS_QUOTA
        if (ctx->seen & SHMEM_SEEN_QUOTA) {
                sb->dq_op = &shmem_quota_operations;
                sb->s_qcop = &dquot_quotactl_sysfile_ops;
                sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;

                /* Copy the default limits from ctx into sbinfo */
                memcpy(&sbinfo->qlimits, &ctx->qlimits,
                       sizeof(struct shmem_quota_limits));

                if (shmem_enable_quotas(sb, ctx->quota_types))
                        goto failed;
        }
#endif /* CONFIG_TMPFS_QUOTA */

        /* Create the root directory, owned by the configured uid/gid. */
        inode = shmem_get_inode(&nop_mnt_idmap, sb, NULL,
                                S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
        if (IS_ERR(inode)) {
                error = PTR_ERR(inode);
                goto failed;
        }
        inode->i_uid = sbinfo->uid;
        inode->i_gid = sbinfo->gid;
        sb->s_root = d_make_root(inode);
        if (!sb->s_root)
                goto failed;
        return 0;

failed:
        shmem_put_super(sb);
        return error;
}

/*
 * fs_context ->get_tree() handler (see shmem_fs_context_ops): delegates to
 * get_tree_nodev() with shmem_fill_super() as the superblock fill callback
 * and returns its result.
 */
static int shmem_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, shmem_fill_super);
}

/*
 * fs_context ->free() handler: if an options structure is attached to @fc,
 * put the mempolicy recorded in it and free the structure.
 */
static void shmem_free_fc(struct fs_context *fc)
{
        struct shmem_options *opts = fc->fs_private;

        /* Nothing was attached to this context. */
        if (!opts)
                return;

        mpol_put(opts->mpol);
        kfree(opts);
}

/*
 * fs_context operations installed by shmem_init_fs_context().  Option
 * parsing and reconfiguration are only wired up when CONFIG_TMPFS is set.
 */
static const struct fs_context_operations shmem_fs_context_ops = {
        .free                        = shmem_free_fc,
        .get_tree                = shmem_get_tree,
#ifdef CONFIG_TMPFS
        .parse_monolithic        = shmem_parse_monolithic,
        .parse_param                = shmem_parse_one,
        .reconfigure                = shmem_reconfigure,
#endif
};

/* Slab cache for struct shmem_inode_info; created by shmem_init_inodecache(). */
static struct kmem_cache *shmem_inode_cachep __ro_after_init;

/*
 * super_operations ->alloc_inode(): allocate a shmem_inode_info from
 * shmem_inode_cachep and hand back its embedded VFS inode, or NULL if the
 * allocation fails.
 */
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
        struct shmem_inode_info *shmem_info =
                alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);

        return shmem_info ? &shmem_info->vfs_inode : NULL;
}

static void shmem_free_in_core_inode(struct inode *inode)
{
        if (S_ISLNK(inode->i_mode))
                kfree(inode->i_link);
        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
}

static void shmem_destroy_inode(struct inode *inode)
{
        if (S_ISREG(inode->i_mode))
                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
        if (S_ISDIR(inode->i_mode))
                simple_offset_destroy(shmem_get_offset_ctx(inode));
}

/*
 * Callback passed to kmem_cache_create() for shmem_inode_cachep: @foo is a
 * struct shmem_inode_info whose embedded VFS inode gets its one-time init.
 */
static void shmem_init_inode(void *foo)
{
        struct shmem_inode_info *info = foo;
        inode_init_once(&info->vfs_inode);
}

/*
 * Create the "shmem_inode_cache" slab cache, sized for struct
 * shmem_inode_info, with shmem_init_inode() as its init callback.
 * SLAB_PANIC is passed, so no failure handling is done here.
 */
static void __init shmem_init_inodecache(void)
{
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
                                0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
}

/* Destroy shmem_inode_cachep; used on the shmem_init() error path. */
static void __init shmem_destroy_inodecache(void)
{
        kmem_cache_destroy(shmem_inode_cachep);
}

/*
 * ->error_remove_folio() handler for shmem_aops.  Always returns 0 without
 * touching @mapping or @folio: the page is kept in the page cache instead
 * of being truncated.
 */
static int shmem_error_remove_folio(struct address_space *mapping,
                                   struct folio *folio)
{
        return 0;
}

/*
 * Address-space operations for shmem mappings.  The write_begin/write_end
 * pair is only provided with CONFIG_TMPFS, and folio migration only with
 * CONFIG_MIGRATION.
 */
static const struct address_space_operations shmem_aops = {
        .writepage        = shmem_writepage,
        .dirty_folio        = noop_dirty_folio,
#ifdef CONFIG_TMPFS
        .write_begin        = shmem_write_begin,
        .write_end        = shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
        .migrate_folio        = migrate_folio,
#endif
        .error_remove_folio = shmem_error_remove_folio,
};

/*
 * File operations for shmem regular files.  mmap, open and
 * get_unmapped_area are always available; the read/write/seek/splice/
 * fallocate entry points exist only with CONFIG_TMPFS.
 */
static const struct file_operations shmem_file_operations = {
        .mmap                = shmem_mmap,
        .open                = shmem_file_open,
        .get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
        .llseek                = shmem_file_llseek,
        .read_iter        = shmem_file_read_iter,
        .write_iter        = shmem_file_write_iter,
        .fsync                = noop_fsync,
        .splice_read        = shmem_file_splice_read,
        .splice_write        = iter_file_splice_write,
        .fallocate        = shmem_fallocate,
#endif
};

/*
 * Inode operations for shmem regular files.  xattr listing, ACL setting and
 * fileattr get/set are only present with CONFIG_TMPFS_XATTR.
 */
static const struct inode_operations shmem_inode_operations = {
        .getattr        = shmem_getattr,
        .setattr        = shmem_setattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
        .set_acl        = simple_set_acl,
        .fileattr_get        = shmem_fileattr_get,
        .fileattr_set        = shmem_fileattr_set,
#endif
};

/*
 * Inode operations for shmem directories.  Namespace operations require
 * CONFIG_TMPFS, xattr/fileattr support requires CONFIG_TMPFS_XATTR, and
 * setattr/set_acl are only set with CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
        .getattr        = shmem_getattr,
        .create                = shmem_create,
        .lookup                = simple_lookup,
        .link                = shmem_link,
        .unlink                = shmem_unlink,
        .symlink        = shmem_symlink,
        .mkdir                = shmem_mkdir,
        .rmdir                = shmem_rmdir,
        .mknod                = shmem_mknod,
        .rename                = shmem_rename2,
        .tmpfile        = shmem_tmpfile,
        .get_offset_ctx        = shmem_get_offset_ctx,
#endif
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
        .fileattr_get        = shmem_fileattr_get,
        .fileattr_set        = shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Inode operations for shmem special inodes.  getattr is always set;
 * listxattr needs CONFIG_TMPFS_XATTR and setattr/set_acl need
 * CONFIG_TMPFS_POSIX_ACL.
 */
static const struct inode_operations shmem_special_inode_operations = {
        .getattr        = shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
        .listxattr        = shmem_listxattr,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
        .set_acl        = simple_set_acl,
#endif
};

/*
 * Superblock operations, installed as sb->s_op by shmem_fill_super().
 * statfs/show_options need CONFIG_TMPFS, get_dquots needs
 * CONFIG_TMPFS_QUOTA, and the cached-object hooks for unused huge pages
 * need CONFIG_TRANSPARENT_HUGEPAGE.
 */
static const struct super_operations shmem_ops = {
        .alloc_inode        = shmem_alloc_inode,
        .free_inode        = shmem_free_in_core_inode,
        .destroy_inode        = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
        .statfs                = shmem_statfs,
        .show_options        = shmem_show_options,
#endif
#ifdef CONFIG_TMPFS_QUOTA
        .get_dquots        = shmem_get_dquots,
#endif
        .evict_inode        = shmem_evict_inode,
        .drop_inode        = generic_delete_inode,
        .put_super        = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        .nr_cached_objects        = shmem_unused_huge_count,
        .free_cached_objects        = shmem_unused_huge_scan,
#endif
};

/*
 * VM operations for shmem mappings; the NUMA policy hooks are only present
 * with CONFIG_NUMA.
 */
static const struct vm_operations_struct shmem_vm_ops = {
        .fault                = shmem_fault,
        .map_pages        = filemap_map_pages,
#ifdef CONFIG_NUMA
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
#endif
};

/*
 * VM operations installed by shmem_zero_setup() for shared anonymous
 * mappings.  The handlers are the same as in shmem_vm_ops; it is a separate
 * object, so the two can be told apart by address.
 */
static const struct vm_operations_struct shmem_anon_vm_ops = {
        .fault                = shmem_fault,
        .map_pages        = filemap_map_pages,
#ifdef CONFIG_NUMA
        .set_policy     = shmem_set_policy,
        .get_policy     = shmem_get_policy,
#endif
};

/*
 * ->init_fs_context() for tmpfs: allocate a zeroed shmem_options, seed it
 * with the default mode (0777 plus sticky bit) and the caller's fsuid/fsgid,
 * and attach it to @fc together with shmem_fs_context_ops.
 *
 * Returns 0 on success or -ENOMEM if the options cannot be allocated.
 */
int shmem_init_fs_context(struct fs_context *fc)
{
        struct shmem_options *opts = kzalloc(sizeof(*opts), GFP_KERNEL);

        if (!opts)
                return -ENOMEM;

        opts->uid = current_fsuid();
        opts->gid = current_fsgid();
        opts->mode = 0777 | S_ISVTX;

#if IS_ENABLED(CONFIG_UNICODE)
        opts->encoding = NULL;
#endif

        fc->ops = &shmem_fs_context_ops;
        fc->fs_private = opts;
        return 0;
}

/*
 * The "tmpfs" filesystem type registered by shmem_init().  The mount
 * parameter table is only provided with CONFIG_TMPFS.
 */
static struct file_system_type shmem_fs_type = {
        .owner                = THIS_MODULE,
        .name                = "tmpfs",
        .init_fs_context = shmem_init_fs_context,
#ifdef CONFIG_TMPFS
        .parameters        = shmem_fs_parameters,
#endif
        .kill_sb        = kill_litter_super,
        .fs_flags        = FS_USERNS_MOUNT | FS_ALLOW_IDMAP | FS_MGTIME,
};

#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)

/*
 * Initializer for a struct kobj_attribute: the sysfs file name is the
 * stringified _name, with the given mode and show/store callbacks.
 */
#define __INIT_KOBJ_ATTR(_name, _mode, _show, _store)                        \
{                                                                        \
        .attr        = { .name = __stringify(_name), .mode = _mode },        \
        .show        = _show,                                                \
        .store        = _store,                                                \
}

/* Define tmpfs_attr_<name> as a write-only (0200) attribute. */
#define TMPFS_ATTR_W(_name, _store)                                \
        static struct kobj_attribute tmpfs_attr_##_name =        \
                        __INIT_KOBJ_ATTR(_name, 0200, NULL, _store)

/* Define tmpfs_attr_<name> as a read-write (0644) attribute. */
#define TMPFS_ATTR_RW(_name, _show, _store)                        \
        static struct kobj_attribute tmpfs_attr_##_name =        \
                        __INIT_KOBJ_ATTR(_name, 0644, _show, _store)

/* Define tmpfs_attr_<name> as a read-only (0444) attribute. */
#define TMPFS_ATTR_RO(_name, _show)                                \
        static struct kobj_attribute tmpfs_attr_##_name =        \
                        __INIT_KOBJ_ATTR(_name, 0444, _show, NULL)

#if IS_ENABLED(CONFIG_UNICODE)
/* sysfs show handler for the read-only "casefold" attribute: emits "supported". */
static ssize_t casefold_show(struct kobject *kobj, struct kobj_attribute *a,
                             char *buf)
{
        return sysfs_emit(buf, "supported\n");
}
TMPFS_ATTR_RO(casefold, casefold_show);
#endif

/* NULL-terminated attribute list; "casefold" is listed only with CONFIG_UNICODE. */
static struct attribute *tmpfs_attributes[] = {
#if IS_ENABLED(CONFIG_UNICODE)
        &tmpfs_attr_casefold.attr,
#endif
        NULL
};

/* Attribute group named "features", created under tmpfs_kobj. */
static const struct attribute_group tmpfs_attribute_group = {
        .attrs = tmpfs_attributes,
        .name = "features"
};

/* The "tmpfs" kobject created under fs_kobj by tmpfs_sysfs_init(). */
static struct kobject *tmpfs_kobj;

/*
 * Create the "tmpfs" kobject under fs_kobj and populate it with
 * tmpfs_attribute_group.
 *
 * Returns 0 on success, -ENOMEM if the kobject cannot be created, or the
 * error from sysfs_create_group() (after putting the kobject again).
 */
static int __init tmpfs_sysfs_init(void)
{
        int err;

        tmpfs_kobj = kobject_create_and_add("tmpfs", fs_kobj);
        if (!tmpfs_kobj)
                return -ENOMEM;

        err = sysfs_create_group(tmpfs_kobj, &tmpfs_attribute_group);
        if (!err)
                return 0;

        kobject_put(tmpfs_kobj);
        return err;
}
#endif /* CONFIG_SYSFS && CONFIG_TMPFS */

/*
 * Boot-time initialisation of shmem/tmpfs: create the inode cache, register
 * the quota format (CONFIG_TMPFS_QUOTA) and the "tmpfs" filesystem type,
 * create the kernel-internal mount shm_mnt, and set up the sysfs attributes
 * (CONFIG_SYSFS && CONFIG_TMPFS).
 *
 * On failure the earlier registrations are undone in reverse order and
 * shm_mnt is left holding an ERR_PTR() of the error, which
 * __shmem_file_setup() later passes back to its callers.
 */
void __init shmem_init(void)
{
        int error;

        shmem_init_inodecache();

#ifdef CONFIG_TMPFS_QUOTA
        register_quota_format(&shmem_quota_format);
#endif

        error = register_filesystem(&shmem_fs_type);
        if (error) {
                pr_err("Could not register tmpfs\n");
                goto out2;
        }

        shm_mnt = kern_mount(&shmem_fs_type);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                pr_err("Could not kern_mount tmpfs\n");
                goto out1;
        }

#if defined(CONFIG_SYSFS) && defined(CONFIG_TMPFS)
        error = tmpfs_sysfs_init();
        if (error) {
                pr_err("Could not init tmpfs sysfs\n");
                /*
                 * NOTE(review): shm_mnt is a live mount at this point, but
                 * out1 does not unmount it before the inode cache is
                 * destroyed - confirm whether it should be dropped here.
                 */
                goto out1;
        }
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Apply the global shmem_huge setting to the internal mount. */
        if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
        else
                shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */

        /*
         * Default to setting PMD-sized THP to inherit the global setting and
         * disable all other multi-size THPs.
         */
        if (!shmem_orders_configured)
                huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
#endif
        return;

out1:
        unregister_filesystem(&shmem_fs_type);
out2:
#ifdef CONFIG_TMPFS_QUOTA
        unregister_quota_format(&shmem_quota_format);
#endif
        shmem_destroy_inodecache();
        /* Record the failure so later users of shm_mnt see the error. */
        shm_mnt = ERR_PTR(error);
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
/*
 * sysfs show handler for "shmem_enabled": prints every huge-page policy
 * name on one space-separated line, with the one matching the current
 * shmem_huge value wrapped in square brackets.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t shmem_enabled_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        static const int policies[] = {
                SHMEM_HUGE_ALWAYS,
                SHMEM_HUGE_WITHIN_SIZE,
                SHMEM_HUGE_ADVISE,
                SHMEM_HUGE_NEVER,
                SHMEM_HUGE_DENY,
                SHMEM_HUGE_FORCE,
        };
        int written = 0;
        int idx = 0;

        while (idx < ARRAY_SIZE(policies)) {
                /* No separator before the first entry. */
                written += sysfs_emit_at(buf, written,
                                shmem_huge == policies[idx] ? "%s[%s]" : "%s%s",
                                idx ? " " : "",
                                shmem_format_huge(policies[idx]));
                idx++;
        }

        written += sysfs_emit_at(buf, written, "\n");
        return written;
}

/*
 * sysfs store handler for "shmem_enabled": parse the policy name written by
 * the user (an optional trailing newline is stripped), make it the global
 * shmem_huge value and, when it is above SHMEM_HUGE_DENY, apply it to the
 * internal shm_mnt superblock as well.
 *
 * Returns @count on success, -EINVAL for an over-long or unparsable value,
 * or the error from start_stop_khugepaged().
 */
static ssize_t shmem_enabled_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        char word[16];
        int policy, ret;

        /* Need room for the input plus the terminating NUL. */
        if (count + 1 > sizeof(word))
                return -EINVAL;

        memcpy(word, buf, count);
        word[count] = '\0';
        if (count && word[count - 1] == '\n')
                word[count - 1] = '\0';

        policy = shmem_parse_huge(word);
        if (policy == -EINVAL)
                return policy;

        shmem_huge = policy;
        if (shmem_huge > SHMEM_HUGE_DENY)
                SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;

        ret = start_stop_khugepaged();
        if (ret)
                return ret;

        return count;
}

/* Read-write attribute backed by shmem_enabled_show()/shmem_enabled_store(). */
struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
/* Taken by thpsize_shmem_enabled_store() while updating the huge_shmem_orders_* bitmaps. */
static DEFINE_SPINLOCK(huge_shmem_orders_lock);

/*
 * Per-size sysfs show handler: reports which policy bitmap has the bit for
 * this kobject's order set, checked in the order always, inherit,
 * within_size, advise.  If none is set, "never" is shown as selected.
 */
static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
                                          struct kobj_attribute *attr, char *buf)
{
        int order = to_thpsize(kobj)->order;

        if (test_bit(order, &huge_shmem_orders_always))
                return sysfs_emit(buf, "%s\n",
                                  "[always] inherit within_size advise never");

        if (test_bit(order, &huge_shmem_orders_inherit))
                return sysfs_emit(buf, "%s\n",
                                  "always [inherit] within_size advise never");

        if (test_bit(order, &huge_shmem_orders_within_size))
                return sysfs_emit(buf, "%s\n",
                                  "always inherit [within_size] advise never");

        if (test_bit(order, &huge_shmem_orders_madvise))
                return sysfs_emit(buf, "%s\n",
                                  "always inherit within_size [advise] never");

        return sysfs_emit(buf, "%s\n",
                          "always inherit within_size advise [never]");
}

/*
 * Per-size sysfs store handler: select one of "always", "inherit",
 * "within_size", "advise" or "never" for this kobject's order.
 *
 * Under huge_shmem_orders_lock the order's bit is cleared in the other
 * policy bitmaps and set in the selected one ("never" clears all four).
 *
 * Returns @count on success, -EINVAL for an unknown policy or for "inherit"
 * on a non-PMD order while shmem_huge is SHMEM_HUGE_FORCE, or the error
 * from start_stop_khugepaged().
 */
static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
                                           struct kobj_attribute *attr,
                                           const char *buf, size_t count)
{
        int order = to_thpsize(kobj)->order;
        ssize_t ret = count;

        if (sysfs_streq(buf, "always")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_madvise);
                clear_bit(order, &huge_shmem_orders_within_size);
                set_bit(order, &huge_shmem_orders_always);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "inherit")) {
                /* Do not override huge allocation policy with non-PMD sized mTHP */
                if (shmem_huge == SHMEM_HUGE_FORCE &&
                    order != HPAGE_PMD_ORDER)
                        return -EINVAL;

                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_madvise);
                clear_bit(order, &huge_shmem_orders_within_size);
                set_bit(order, &huge_shmem_orders_inherit);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "within_size")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_madvise);
                set_bit(order, &huge_shmem_orders_within_size);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "advise")) {
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_within_size);
                set_bit(order, &huge_shmem_orders_madvise);
                spin_unlock(&huge_shmem_orders_lock);
        } else if (sysfs_streq(buf, "never")) {
                /* "never" is represented by the bit being clear everywhere. */
                spin_lock(&huge_shmem_orders_lock);
                clear_bit(order, &huge_shmem_orders_always);
                clear_bit(order, &huge_shmem_orders_inherit);
                clear_bit(order, &huge_shmem_orders_within_size);
                clear_bit(order, &huge_shmem_orders_madvise);
                spin_unlock(&huge_shmem_orders_lock);
        } else {
                ret = -EINVAL;
        }

        /* Only call start_stop_khugepaged() when a policy was applied. */
        if (ret > 0) {
                int err = start_stop_khugepaged();

                if (err)
                        ret = err;
        }
        return ret;
}

/* Per-size "shmem_enabled" attribute (mode 0644) using the thpsize_* handlers. */
struct kobj_attribute thpsize_shmem_enabled_attr =
        __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */

#if defined(CONFIG_TRANSPARENT_HUGEPAGE)

/*
 * Handler for the "transparent_hugepage_shmem=" boot parameter: parse @str
 * with shmem_parse_huge() and store the result in shmem_huge.
 *
 * Only -EINVAL is treated as a parse failure here; in that case a warning
 * is printed, shmem_huge is left alone and -EINVAL is returned.  Returns 1
 * on success.
 */
static int __init setup_transparent_hugepage_shmem(char *str)
{
        int policy = shmem_parse_huge(str);

        if (policy != -EINVAL) {
                shmem_huge = policy;
                return 1;
        }

        pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n");
        return policy;
}
__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem);

/*
 * Handler for the "transparent_hugepage_tmpfs=" boot parameter: parse @str
 * with shmem_parse_huge() and store the result in tmpfs_huge.
 *
 * Any negative result is rejected here: a warning is printed, tmpfs_huge is
 * left alone and the negative value is returned.  Returns 1 on success.
 */
static int __init setup_transparent_hugepage_tmpfs(char *str)
{
        int policy = shmem_parse_huge(str);

        if (policy >= 0) {
                tmpfs_huge = policy;
                return 1;
        }

        pr_warn("transparent_hugepage_tmpfs= cannot parse, ignored\n");
        return policy;
}
__setup("transparent_hugepage_tmpfs=", setup_transparent_hugepage_tmpfs);

/* Scratch copy of the boot parameter; strsep() below modifies it in place. */
static char str_dup[PAGE_SIZE] __initdata;

/*
 * Handler for the "thp_shmem=" boot parameter.
 *
 * The value is a ';'-separated list of "<range>:<policy>" entries.  <range>
 * is a ','-separated list of sizes, each either a single size or a
 * "<start>-<end>" span; <policy> is one of "always", "advise", "inherit",
 * "within_size" or "never".
 *
 * Parsing works on local copies of the four order bitmaps, so the global
 * huge_shmem_orders_* values are only updated if the whole string parses.
 *
 * Returns 1 on success (and sets shmem_orders_configured), or 0 after
 * warning when the string is missing, too long or malformed.
 */
static int __init setup_thp_shmem(char *str)
{
        char *token, *range, *policy, *subtoken;
        unsigned long always, inherit, madvise, within_size;
        char *start_size, *end_size;
        int start, end, nr;
        char *p;

        /* The string plus its NUL must fit into str_dup. */
        if (!str || strlen(str) + 1 > PAGE_SIZE)
                goto err;
        strscpy(str_dup, str);

        always = huge_shmem_orders_always;
        inherit = huge_shmem_orders_inherit;
        madvise = huge_shmem_orders_madvise;
        within_size = huge_shmem_orders_within_size;
        p = str_dup;
        while ((token = strsep(&p, ";")) != NULL) {
                /* Split "<range>:<policy>"; policy is NULL if ':' is missing. */
                range = strsep(&token, ":");
                policy = token;

                if (!policy)
                        goto err;

                while ((subtoken = strsep(&range, ",")) != NULL) {
                        if (strchr(subtoken, '-')) {
                                start_size = strsep(&subtoken, "-");
                                end_size = subtoken;

                                start = get_order_from_str(start_size,
                                                           THP_ORDERS_ALL_FILE_DEFAULT);
                                end = get_order_from_str(end_size,
                                                         THP_ORDERS_ALL_FILE_DEFAULT);
                        } else {
                                /* A single size is a span of one order. */
                                start_size = end_size = subtoken;
                                start = end = get_order_from_str(subtoken,
                                                                 THP_ORDERS_ALL_FILE_DEFAULT);
                        }

                        if (start < 0) {
                                pr_err("invalid size %s in thp_shmem boot parameter\n",
                                       start_size);
                                goto err;
                        }

                        if (end < 0) {
                                pr_err("invalid size %s in thp_shmem boot parameter\n",
                                       end_size);
                                goto err;
                        }

                        if (start > end)
                                goto err;

                        /*
                         * Set the span in the selected policy's bitmap and
                         * clear it in the others, so each order ends up in
                         * at most one bitmap.
                         */
                        nr = end - start + 1;
                        if (!strcmp(policy, "always")) {
                                bitmap_set(&always, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else if (!strcmp(policy, "advise")) {
                                bitmap_set(&madvise, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&always, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else if (!strcmp(policy, "inherit")) {
                                bitmap_set(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else if (!strcmp(policy, "within_size")) {
                                bitmap_set(&within_size, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                        } else if (!strcmp(policy, "never")) {
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                                bitmap_clear(&within_size, start, nr);
                        } else {
                                pr_err("invalid policy %s in thp_shmem boot parameter\n", policy);
                                goto err;
                        }
                }
        }

        /* Everything parsed: commit the local bitmaps. */
        huge_shmem_orders_always = always;
        huge_shmem_orders_madvise = madvise;
        huge_shmem_orders_inherit = inherit;
        huge_shmem_orders_within_size = within_size;
        shmem_orders_configured = true;
        return 1;

err:
        pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str);
        return 0;
}
__setup("thp_shmem=", setup_thp_shmem);

#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#else /* !CONFIG_SHMEM */

/*
 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
 *
 * This is intended for small system where the benefits of the full
 * shmem code (swap-backed and resource-limited) are outweighed by
 * their complexity. On systems without swap this code should be
 * effectively equivalent, but much lighter weight.
 */

/* !CONFIG_SHMEM: "tmpfs" is provided entirely by the ramfs implementation. */
static struct file_system_type shmem_fs_type = {
        .name                = "tmpfs",
        .init_fs_context = ramfs_init_fs_context,
        .parameters        = ramfs_fs_parameters,
        .kill_sb        = ramfs_kill_sb,
        .fs_flags        = FS_USERNS_MOUNT,
};

/*
 * !CONFIG_SHMEM: register the ramfs-backed "tmpfs" type and create the
 * internal mount shm_mnt.  Either step failing triggers BUG_ON().
 */
void __init shmem_init(void)
{
        BUG_ON(register_filesystem(&shmem_fs_type) != 0);

        shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));
}

/* !CONFIG_SHMEM stub: does nothing and returns 0. */
int shmem_unuse(unsigned int type)
{
        return 0;
}

/* !CONFIG_SHMEM stub: does nothing and returns 0. */
int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
        return 0;
}

/* !CONFIG_SHMEM stub: does nothing. */
void shmem_unlock_mapping(struct address_space *mapping)
{
}

#ifdef CONFIG_MMU
/*
 * !CONFIG_SHMEM (CONFIG_MMU only): forward to mm_get_unmapped_area() for the
 * current task's mm and return its result.
 */
unsigned long shmem_get_unmapped_area(struct file *file,
                                      unsigned long addr, unsigned long len,
                                      unsigned long pgoff, unsigned long flags)
{
        return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags);
}
#endif

/*
 * !CONFIG_SHMEM: truncate the byte range [@lstart, @lend] of @inode by
 * handing it to truncate_inode_pages_range() on the inode's mapping.
 */
void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);

/*
 * !CONFIG_SHMEM: map the shmem names used by the common code below onto
 * their generic/ramfs equivalents.  Size accounting becomes a no-op:
 * shmem_acct_size() always evaluates to 0 and shmem_unacct_size() to an
 * empty statement.
 */
#define shmem_vm_ops                                generic_file_vm_ops
#define shmem_anon_vm_ops                        generic_file_vm_ops
#define shmem_file_operations                        ramfs_file_operations
#define shmem_acct_size(flags, size)                0
#define shmem_unacct_size(flags, size)                do {} while (0)

/*
 * !CONFIG_SHMEM: create the inode via ramfs_get_inode().  @idmap and @flags
 * are unused.  A NULL result is converted to ERR_PTR(-ENOSPC), so callers
 * can test the return value with IS_ERR().
 */
static inline struct inode *shmem_get_inode(struct mnt_idmap *idmap,
                                struct super_block *sb, struct inode *dir,
                                umode_t mode, dev_t dev, unsigned long flags)
{
        struct inode *new_inode = ramfs_get_inode(sb, dir, mode, dev);

        if (!new_inode)
                return ERR_PTR(-ENOSPC);

        return new_inode;
}

#endif /* CONFIG_SHMEM */

/* common code */

/*
 * Common helper behind the shmem_*file_setup() entry points: create an
 * unlinked regular file of @size bytes on the tmpfs mount @mnt.
 *
 * @mnt:     mount to create the file on; an ERR_PTR() value is passed back
 * @name:    name for the file's dentry
 * @size:    i_size for the new file; must lie in [0, MAX_LFS_FILESIZE]
 * @flags:   passed to shmem_acct_size()/shmem_get_inode()
 * @i_flags: extra inode flags OR-ed into inode->i_flags (e.g. S_PRIVATE)
 *
 * Returns the new file or an ERR_PTR(): -EINVAL for a bad @size or an
 * idmapped @mnt, -ENOMEM if size accounting fails, otherwise the error from
 * shmem_get_inode(), ramfs_nommu_expand_for_mapping() or
 * alloc_file_pseudo().
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name,
                        loff_t size, unsigned long flags, unsigned int i_flags)
{
        struct inode *inode;
        struct file *res;

        if (IS_ERR(mnt))
                return ERR_CAST(mnt);

        if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);

        if (shmem_acct_size(flags, size))
                return ERR_PTR(-ENOMEM);

        if (is_idmapped_mnt(mnt)) {
                /*
                 * Give back what shmem_acct_size() charged above, exactly as
                 * the shmem_get_inode() failure path below does; otherwise
                 * the accounted size is leaked.
                 */
                shmem_unacct_size(flags, size);
                return ERR_PTR(-EINVAL);
        }

        inode = shmem_get_inode(&nop_mnt_idmap, mnt->mnt_sb, NULL,
                                S_IFREG | S_IRWXUGO, 0, flags);
        if (IS_ERR(inode)) {
                shmem_unacct_size(flags, size);
                return ERR_CAST(inode);
        }
        inode->i_flags |= i_flags;
        inode->i_size = size;
        clear_nlink(inode);        /* It is unlinked */

        /* A zero return becomes a NULL res, which IS_ERR() treats as success. */
        res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
        if (!IS_ERR(res))
                res = alloc_file_pseudo(inode, mnt, name, O_RDWR,
                                &shmem_file_operations);
        /* On failure only the inode reference is dropped here. */
        if (IS_ERR(res))
                iput(inode);
        return res;
}

/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 *        kernel internal.  There will be NO LSM permission checks against the
 *        underlying inode.  So users of this interface must do LSM checks at a
 *        higher layer.  The users are the big_key and shm implementations.  LSM
 *        checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with S_PRIVATE set on
 * its inode.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
        return __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
}
EXPORT_SYMBOL_GPL(shmem_kernel_file_setup);

/**
 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * The file is created on the internal shm_mnt mount with no extra inode
 * flags.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags)
{
        return __shmem_file_setup(shm_mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup);

/**
 * shmem_file_setup_with_mnt - get an unlinked file living in tmpfs
 * @mnt: the tmpfs mount where the file will be created
 * @name: name for dentry (to be seen in /proc/<pid>/maps)
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 *
 * Same as shmem_file_setup(), but on the caller-supplied mount @mnt instead
 * of shm_mnt.
 *
 * Return: the new file, or an ERR_PTR() on failure.
 */
struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
                                       loff_t size, unsigned long flags)
{
        return __shmem_file_setup(mnt, name, size, flags, 0);
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);

/**
 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped is prepared by do_mmap
 *
 * Backs @vma with a new kernel-internal shmem file sized to the vma,
 * replacing (and releasing) any file already attached to it, and switches
 * the vma to shmem_anon_vm_ops.
 *
 * Return: 0 on success, or the error from shmem_kernel_file_setup().
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
        loff_t len = vma->vm_end - vma->vm_start;
        struct file *zero_file;

        /*
         * Cloning a new file under mmap_lock leads to a lock ordering conflict
         * between XFS directory reading and selinux.  As this file can only be
         * reached by the user through its mapping, it is created with
         * S_PRIVATE via shmem_kernel_file_setup() to bypass file security.
         */
        zero_file = shmem_kernel_file_setup("dev/zero", len, vma->vm_flags);
        if (IS_ERR(zero_file))
                return PTR_ERR(zero_file);

        /* Drop the reference to whatever file the vma carried before. */
        if (vma->vm_file)
                fput(vma->vm_file);

        vma->vm_file = zero_file;
        vma->vm_ops = &shmem_anon_vm_ops;

        return 0;
}

/**
 * shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the folio's address_space
 * @index:        the folio index
 * @gfp:        the page allocator flags to use if allocating
 *
 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
 * with any new page allocations done using the specified allocation flags.
 * But read_cache_page_gfp() uses the ->read_folio() method: which does not
 * suit tmpfs, since it may have pages in swapcache, and needs to find those
 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
 *
 * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
 *
 * Return: with CONFIG_SHMEM, the folio (unlocked before returning) or an
 * ERR_PTR() carrying the error from shmem_get_folio_gfp(); without it,
 * whatever mapping_read_folio_gfp() returns.
 */
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
#ifdef CONFIG_SHMEM
        struct inode *inode = mapping->host;
        struct folio *folio;
        int error;

        error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
                                    gfp, NULL, NULL);
        if (error)
                return ERR_PTR(error);

        /* Hand the folio back unlocked. */
        folio_unlock(folio);
        return folio;
#else
        /*
         * The tiny !SHMEM case uses ramfs without swap
         */
        return mapping_read_folio_gfp(mapping, index, gfp);
#endif
}
EXPORT_SYMBOL_GPL(shmem_read_folio_gfp);

/**
 * shmem_read_mapping_page_gfp - page-based wrapper around shmem_read_folio_gfp()
 * @mapping:        the page's address_space
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * Return: the page for @index within the folio obtained from
 * shmem_read_folio_gfp(); the ERR_PTR() from that call if it failed; or
 * ERR_PTR(-EIO) if the page is marked hardware-poisoned, in which case the
 * folio is put before returning.
 */
struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                         pgoff_t index, gfp_t gfp)
{
        struct folio *folio = shmem_read_folio_gfp(mapping, index, gfp);
        struct page *page;

        /*
         * Forward the error code explicitly instead of taking the address
         * of a member of an error pointer, which is only correct while
         * 'page' sits at offset 0 of struct folio.
         */
        if (IS_ERR(folio))
                return ERR_CAST(folio);

        page = folio_file_page(folio, index);
        if (PageHWPoison(page)) {
                folio_put(folio);
                return ERR_PTR(-EIO);
        }

        return page;
}
EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  253 


  253 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork()
 *
 * Copyright (C) 2008-2009 Red Hat, Inc.
 * Authors:
 *        Izik Eidus
 *        Andrea Arcangeli
 *        Chris Wright
 *        Hugh Dickins
 */

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/cputime.h>
#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/memory.h>
#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/ksm.h>
#include <linux/hashtable.h>
#include <linux/freezer.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/pagewalk.h>

#include <asm/tlbflush.h>
#include "internal.h"
#include "mm_slot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/ksm.h>

/*
 * Helpers for code that only matters on NUMA builds:
 * NUMA(x) evaluates to (x) when CONFIG_NUMA is enabled and to 0 otherwise;
 * DO_NUMA(x) evaluates (x) as a statement when CONFIG_NUMA is enabled and
 * expands to an empty statement otherwise.
 */
#ifdef CONFIG_NUMA
#define NUMA(x)                (x)
#define DO_NUMA(x)        do { (x); } while (0)
#else
#define NUMA(x)                (0)
#define DO_NUMA(x)        do { } while (0)
#endif

/* Type of the age and remaining_skips counters in struct ksm_rmap_item */
typedef u8 rmap_age_t;

/**
 * DOC: Overview
 *
 * A few notes about the KSM scanning process,
 * to make it easier to understand the data structures below:
 *
 * In order to reduce excessive scanning, KSM sorts the memory pages by their
 * contents into a data structure that holds pointers to the pages' locations.
 *
 * Since the contents of the pages may change at any moment, KSM cannot just
 * insert the pages into a normal sorted tree and expect it to find anything.
 * Therefore KSM uses two data structures - the stable and the unstable tree.
 *
 * The stable tree holds pointers to all the merged pages (ksm pages), sorted
 * by their contents.  Because each such page is write-protected, searching on
 * this tree is fully assured to be working (except when pages are unmapped),
 * and therefore this tree is called the stable tree.
 *
 * The stable tree node includes information required for reverse
 * mapping from a KSM page to virtual addresses that map this page.
 *
 * In order to avoid large latencies of the rmap walks on KSM pages,
 * KSM maintains two types of nodes in the stable tree:
 *
 * * the regular nodes that keep the reverse mapping structures in a
 *   linked list
 * * the "chains" that link nodes ("dups") that represent the same
 *   write protected memory content, but each "dup" corresponds to a
 *   different KSM page copy of that content
 *
 * Internally, the regular nodes, "dups" and "chains" are represented
 * using the same struct ksm_stable_node structure.
 *
 * In addition to the stable tree, KSM uses a second data structure called the
 * unstable tree: this tree holds pointers to pages which have been found to
 * be "unchanged for a period of time".  The unstable tree sorts these pages
 * by their contents, but since they are not write-protected, KSM cannot rely
 * upon the unstable tree to work correctly - the unstable tree is liable to
 * be corrupted as its contents are modified, and so it is called unstable.
 *
 * KSM solves this problem by several techniques:
 *
 * 1) The unstable tree is flushed every time KSM completes scanning all
 *    memory areas, and then the tree is rebuilt again from the beginning.
 * 2) KSM will only insert into the unstable tree, pages whose hash value
 *    has not changed since the previous scan of all memory areas.
 * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the
 *    colors of the nodes and not on their contents, assuring that even when
 *    the tree gets "corrupted" it won't get out of balance, so scanning time
 *    remains the same (also, searching and inserting nodes in an rbtree uses
 *    the same algorithm, so we have no overhead when we flush and rebuild).
 * 4) KSM never flushes the stable tree, which means that even if it were to
 *    take 10 attempts to find a page in the unstable tree, once it is found,
 *    it is secured in the stable tree.  (When we scan a new page, we first
 *    compare it against the stable tree, and then against the unstable tree.)
 *
 * If the merge_across_nodes tunable is unset, then KSM maintains multiple
 * stable trees and multiple unstable trees: one of each for each NUMA node.
 */

/**
 * struct ksm_mm_slot - ksm information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 *
 * The rmap_item list is chained through ksm_rmap_item.rmap_list.
 */
struct ksm_mm_slot {
        struct mm_slot slot;
        struct ksm_rmap_item *rmap_list;
};

/**
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
 *
 * Note that @rmap_list is a pointer to a link (a struct ksm_rmap_item *
 * slot), not a pointer to an rmap_item itself.
 */
struct ksm_scan {
        struct ksm_mm_slot *mm_slot;
        unsigned long address;
        struct ksm_rmap_item **rmap_list;        /* address of a list link */
        unsigned long seqnr;
};

/**
 * struct ksm_stable_node - node of the stable rbtree
 * @node: rb node of this ksm page in the stable tree
 * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
 * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
 * @list: linked into migrate_nodes, pending placement in the proper node tree
 * @hlist: hlist head of rmap_items using this ksm page
 * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
 * @chain_prune_time: time of the last full garbage collection
 * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
 * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
 *
 * @node shares its storage with @head, @hlist_dup and @list; @kpfn shares
 * its storage with @chain_prune_time.  @nid exists only when CONFIG_NUMA is
 * enabled.
 */
struct ksm_stable_node {
        /* Tree linkage and list linkage overlay each other */
        union {
                struct rb_node node;        /* when node of stable tree */
                struct {                /* when listed for migration */
                        struct list_head *head;
                        struct {
                                struct hlist_node hlist_dup;
                                struct list_head list;
                        };
                };
        };
        struct hlist_head hlist;
        /*
         * Only one of the two is valid at a time.
         * NOTE(review): presumably chain_prune_time is the one used when
         * rmap_hlist_len == STABLE_NODE_CHAIN - confirm against the users.
         */
        union {
                unsigned long kpfn;
                unsigned long chain_prune_time;
        };
        /*
         * STABLE_NODE_CHAIN can be any negative number in
         * rmap_hlist_len negative range, but better not -1 to be able
         * to reliably detect underflows.
         */
#define STABLE_NODE_CHAIN -1024
        int rmap_hlist_len;
#ifdef CONFIG_NUMA
        int nid;
#endif
};

/**
 * struct ksm_rmap_item - reverse mapping item for virtual addresses
 * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
 * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @nid: NUMA node id of unstable tree in which linked (may not match page)
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
 * @node: rb node of this rmap_item in the unstable tree
 * @head: pointer to stable_node heading this list in the stable tree
 * @hlist: link into hlist of rmap_items hanging off that stable_node
 * @age: number of scan iterations since creation
 * @remaining_skips: how many scans to skip
 *
 * @anon_vma and @nid share storage, as do @node and the (@head, @hlist)
 * pair; which member is valid depends on whether the item is in the stable
 * or the unstable tree, as noted on each member.  @age and @remaining_skips
 * are rmap_age_t (u8) counters.
 */
struct ksm_rmap_item {
        struct ksm_rmap_item *rmap_list;
        union {
                struct anon_vma *anon_vma;        /* when stable */
#ifdef CONFIG_NUMA
                int nid;                /* when node of unstable tree */
#endif
        };
        struct mm_struct *mm;
        unsigned long address;                /* + low bits used for flags below */
        unsigned int oldchecksum;        /* when unstable */
        rmap_age_t age;
        rmap_age_t remaining_skips;
        /* Unstable-tree linkage overlays stable-tree linkage */
        union {
                struct rb_node node;        /* when node of unstable tree */
                struct {                /* when listed from stable tree */
                        struct ksm_stable_node *head;
                        struct hlist_node hlist;
                };
        };
};

/*
 * Bits kept in the low part of ksm_rmap_item.address (its field comment
 * says the low bits are used for the flags defined here).
 */
#define SEQNR_MASK        0x0ff        /* low bits of unstable tree seqnr */
#define UNSTABLE_FLAG        0x100        /* is a node of the unstable tree */
#define STABLE_FLAG        0x200        /* is listed from the stable tree */

/*
 * The stable and unstable tree heads.
 *
 * one_stable_tree/one_unstable_tree are single-element arrays holding an
 * empty root each; root_stable_tree/root_unstable_tree initially point at
 * them.  NOTE(review): the overview comment says one tree of each kind is
 * kept per NUMA node when merge_across_nodes is unset, so presumably these
 * pointers are redirected to larger arrays elsewhere - not visible here.
 */
static struct rb_root one_stable_tree[1] = { RB_ROOT };
static struct rb_root one_unstable_tree[1] = { RB_ROOT };
static struct rb_root *root_stable_tree = one_stable_tree;
static struct rb_root *root_unstable_tree = one_unstable_tree;

/* Recently migrated nodes of stable tree, pending proper placement */
static LIST_HEAD(migrate_nodes);
/*
 * The address of migrate_nodes.prev, cast to a struct list_head pointer.
 * NOTE(review): presumably stored in ksm_stable_node.head purely as a
 * marker value for dups and never walked as a list - confirm against users.
 */
#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)

/* Hash table of mm slots, sized by MM_SLOTS_HASH_BITS */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/*
 * ksm_mm_head's slot.mm_node is initialised as an empty list head.
 * NOTE(review): presumably the slots of all mms being scanned are linked
 * onto it - confirm against the code that adds slots.
 */
static struct ksm_mm_slot ksm_mm_head = {
        .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node),
};
/* The single scan cursor; it starts out pointing at ksm_mm_head */
static struct ksm_scan ksm_scan = {
        .mm_slot = &ksm_mm_head,
};

/*
 * Slab caches, set up elsewhere in the file.
 * NOTE(review): by their names, presumably one each for struct
 * ksm_rmap_item, struct ksm_stable_node and struct ksm_mm_slot.
 */
static struct kmem_cache *rmap_item_cache;
static struct kmem_cache *stable_node_cache;
static struct kmem_cache *mm_slot_cache;

/* Default number of pages to scan per batch */
#define DEFAULT_PAGES_TO_SCAN 100

/* The number of pages scanned */
static unsigned long ksm_pages_scanned;

/* The number of nodes in the stable tree */
static unsigned long ksm_pages_shared;

/* The number of page slots additionally sharing those nodes */
static unsigned long ksm_pages_sharing;

/* The number of nodes in the unstable tree */
static unsigned long ksm_pages_unshared;

/* The number of rmap_items in use: to calculate pages_volatile */
static unsigned long ksm_rmap_items;

/* The number of stable_node chains */
static unsigned long ksm_stable_node_chains;

/* The number of stable_node dups linked to the stable_node chains */
static unsigned long ksm_stable_node_dups;

/* Delay in pruning stale stable_node_dups in the stable_node_chains */
static unsigned int ksm_stable_node_chains_prune_millisecs = 2000;

/* Maximum number of page slots sharing a stable node */
static int ksm_max_page_sharing = 256;

/* Number of pages ksmd should scan in one batch */
static unsigned int ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;

/* Milliseconds ksmd should sleep between batches */
static unsigned int ksm_thread_sleep_millisecs = 20;

/* Checksum of an empty (zeroed) page */
static unsigned int zero_checksum __read_mostly;

/* Whether to merge empty (zeroed) pages with actual zero pages */
static bool ksm_use_zero_pages __read_mostly;

/*
 * Skip pages that couldn't be de-duplicated previously.
 * Defaults to true, at least temporarily, for testing.
 */
static bool ksm_smart_scan = true;

/*
 * The number of zero pages that have been placed by KSM.
 * Unlike the other counters here this one is atomic and not static.
 */
atomic_long_t ksm_zero_pages = ATOMIC_LONG_INIT(0);

/* The number of pages that have been skipped due to "smart scanning" */
static unsigned long ksm_pages_skipped;

/* Don't scan more than max pages per batch. */
static unsigned long ksm_advisor_max_pages_to_scan = 30000;

/*
 * Min CPU for scanning pages per scan.
 * NOTE(review): presumably a percentage of CPU time - confirm at the user.
 */
#define KSM_ADVISOR_MIN_CPU 10

/*
 * Max CPU for scanning pages per scan.
 * NOTE(review): presumably a percentage, like KSM_ADVISOR_MIN_CPU - confirm.
 */
static unsigned int ksm_advisor_max_cpu =  70;

/* Target scan time in seconds to analyze all KSM candidate pages. */
static unsigned long ksm_advisor_target_scan_time = 200;

/*
 * Weight, in percent, that ewma() gives to the current sample when
 * computing the exponentially weighted moving average; the previous value
 * gets the remaining (100 - EWMA_WEIGHT) percent.
 */
#define EWMA_WEIGHT 30

/**
 * struct advisor_ctx - metadata for KSM advisor
 * @start_scan: start time of the current scan, sampled with ktime_get()
 * @scan_time: duration of the previous scan in seconds; 0 means that no
 *             scan has been recorded since the context was last reset
 *             (the scan-time advisor itself never stores less than 1)
 * @change: smoothed (EWMA) duration of a scan expressed as a percentage of
 *          the duration of the scan before it
 * @cpu_time: cumulative task_sched_runtime() of the ksmd thread as sampled
 *            at the end of the previous scan; the next scan subtracts it
 *            from a fresh sample to get the CPU time it consumed
 */
struct advisor_ctx {
        ktime_t start_scan;
        unsigned long scan_time;
        unsigned long change;
        unsigned long long cpu_time;
};
static struct advisor_ctx advisor_ctx;

/* The different advisors that can tune pages_to_scan */
enum ksm_advisor_type {
        /* No advisor: the start/stop scan hooks do nothing. */
        KSM_ADVISOR_NONE,
        /* scan_time_advisor() recomputes pages_to_scan when a scan stops. */
        KSM_ADVISOR_SCAN_TIME,
};
/* Currently selected advisor; zero-initialised, i.e. KSM_ADVISOR_NONE. */
static enum ksm_advisor_type ksm_advisor;

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */

/* At least scan this many pages per batch. */
static unsigned long ksm_advisor_min_pages_to_scan = 500;

/*
 * Put pages_to_scan back to the starting value of the currently selected
 * advisor.  The scan-time advisor additionally restarts from a zeroed
 * context so that no stale history influences its first adjustment.
 */
static void set_advisor_defaults(void)
{
        switch (ksm_advisor) {
        case KSM_ADVISOR_NONE:
                ksm_thread_pages_to_scan = DEFAULT_PAGES_TO_SCAN;
                break;
        case KSM_ADVISOR_SCAN_TIME:
                advisor_ctx = (const struct advisor_ctx){ 0 };
                ksm_thread_pages_to_scan = ksm_advisor_min_pages_to_scan;
                break;
        }
}
#endif /* CONFIG_SYSFS */

/* Record when a scan begins; only the scan-time advisor needs it. */
static inline void advisor_start_scan(void)
{
        if (ksm_advisor != KSM_ADVISOR_SCAN_TIME)
                return;

        advisor_ctx.start_scan = ktime_get();
}

/*
 * Return the duration of the previous scan.  When none has been recorded
 * yet (ctx->scan_time is 0) fall back to @scan_time, the duration of the
 * current scan, as the best available approximation.
 */
static inline unsigned long prev_scan_time(struct advisor_ctx *ctx,
                                           unsigned long scan_time)
{
        if (ctx->scan_time)
                return ctx->scan_time;

        return scan_time;
}

/*
 * Exponentially weighted moving average: blend @curr into @prev, giving
 * @curr a weight of EWMA_WEIGHT percent and @prev the remaining percent.
 */
static unsigned long ewma(unsigned long prev, unsigned long curr)
{
        unsigned long weighted_prev = (100 - EWMA_WEIGHT) * prev;
        unsigned long weighted_curr = EWMA_WEIGHT * curr;

        return (weighted_prev + weighted_curr) / 100;
}

/*
 * The scan time advisor is based on the current scan rate and the target
 * scan rate.
 *
 *      new_pages_to_scan = pages_to_scan * (scan_time / target_scan_time)
 *
 * To avoid perturbations it calculates a change factor of previous changes.
 * A new change factor is calculated for each iteration and it uses an
 * exponentially weighted moving average. The new pages_to_scan value is
 * multiplied with that change factor:
 *
 *      new_pages_to_scan *= change facor
 *
 * The new_pages_to_scan value is limited by the cpu min and max values. It
 * calculates the cpu percent for the last scan and calculates the new
 * estimated cpu percent cost for the next scan. That value is capped by the
 * cpu min and max setting.
 *
 * In addition the new pages_to_scan value is capped by the max and min
 * limits.
 */
static void scan_time_advisor(void)
{
        unsigned int cpu_percent;
        unsigned long cpu_time;
        unsigned long cpu_time_diff;
        unsigned long cpu_time_diff_ms;
        unsigned long pages;
        unsigned long per_page_cost;
        unsigned long factor;
        unsigned long change;
        unsigned long last_scan_time;
        unsigned long scan_time;

        /* Convert scan time to seconds */
        scan_time = div_s64(ktime_ms_delta(ktime_get(), advisor_ctx.start_scan),
                            MSEC_PER_SEC);
        scan_time = scan_time ? scan_time : 1;

        /* Calculate CPU consumption of ksmd background thread */
        cpu_time = task_sched_runtime(current);
        cpu_time_diff = cpu_time - advisor_ctx.cpu_time;
        cpu_time_diff_ms = cpu_time_diff / 1000 / 1000;

        cpu_percent = (cpu_time_diff_ms * 100) / (scan_time * 1000);
        cpu_percent = cpu_percent ? cpu_percent : 1;
        last_scan_time = prev_scan_time(&advisor_ctx, scan_time);

        /* Calculate scan time as percentage of target scan time */
        factor = ksm_advisor_target_scan_time * 100 / scan_time;
        factor = factor ? factor : 1;

        /*
         * Calculate scan time as percentage of last scan time and use
         * exponentially weighted average to smooth it
         */
        change = scan_time * 100 / last_scan_time;
        change = change ? change : 1;
        change = ewma(advisor_ctx.change, change);

        /* Calculate new scan rate based on target scan rate. */
        pages = ksm_thread_pages_to_scan * 100 / factor;
        /* Update pages_to_scan by weighted change percentage. */
        pages = pages * change / 100;

        /* Cap new pages_to_scan value */
        per_page_cost = ksm_thread_pages_to_scan / cpu_percent;
        per_page_cost = per_page_cost ? per_page_cost : 1;

        pages = min(pages, per_page_cost * ksm_advisor_max_cpu);
        pages = max(pages, per_page_cost * KSM_ADVISOR_MIN_CPU);
        pages = min(pages, ksm_advisor_max_pages_to_scan);

        /* Update advisor context */
        advisor_ctx.change = change;
        advisor_ctx.scan_time = scan_time;
        advisor_ctx.cpu_time = cpu_time;

        ksm_thread_pages_to_scan = pages;
        trace_ksm_advisor(scan_time, pages, cpu_percent);
}

/* A scan has finished: let the scan-time advisor, if selected, retune. */
static void advisor_stop_scan(void)
{
        if (ksm_advisor != KSM_ADVISOR_SCAN_TIME)
                return;

        scan_time_advisor();
}

#ifdef CONFIG_NUMA
/* Zeroed when merging across nodes is not allowed */
static unsigned int ksm_merge_across_nodes = 1;
/* Number of stable tree roots to walk, see remove_all_stable_nodes() */
static int ksm_nr_node_ids = 1;
#else
/* Without NUMA both are compile-time constants: always merge, one root. */
#define ksm_merge_across_nodes        1U
#define ksm_nr_node_ids                1
#endif

/*
 * Values held in ksm_run.
 * NOTE(review): 0/1/2/4 look like combinable bit flags - confirm against
 * the code that reads and writes ksm_run.
 */
#define KSM_RUN_STOP        0
#define KSM_RUN_MERGE        1
#define KSM_RUN_UNMERGE        2
#define KSM_RUN_OFFLINE        4
static unsigned long ksm_run = KSM_RUN_STOP;
static void wait_while_offlining(void);

static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
/* Taken around walks/updates of the mm_slot list and ksm_scan.mm_slot */
static DEFINE_SPINLOCK(ksm_mmlist_lock);

/*
 * Create the three slab caches used by KSM.  Returns 0 on success; on
 * failure every cache created so far is destroyed and -ENOMEM is returned.
 */
static int __init ksm_slab_init(void)
{
        rmap_item_cache = KMEM_CACHE(ksm_rmap_item, 0);
        if (!rmap_item_cache)
                return -ENOMEM;

        stable_node_cache = KMEM_CACHE(ksm_stable_node, 0);
        if (!stable_node_cache)
                goto destroy_rmap_item_cache;

        mm_slot_cache = KMEM_CACHE(ksm_mm_slot, 0);
        if (mm_slot_cache)
                return 0;

        /* Unwind in reverse order of creation. */
        kmem_cache_destroy(stable_node_cache);
destroy_rmap_item_cache:
        kmem_cache_destroy(rmap_item_cache);
        return -ENOMEM;
}

/*
 * Destroy the slab caches created by ksm_slab_init(), newest first, and
 * clear the mm_slot_cache pointer.
 */
static void __init ksm_slab_free(void)
{
        kmem_cache_destroy(mm_slot_cache);
        mm_slot_cache = NULL;

        kmem_cache_destroy(stable_node_cache);
        kmem_cache_destroy(rmap_item_cache);
}

/* A chain head is recognised by the STABLE_NODE_CHAIN marker in its length. */
static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain)
{
        if (chain->rmap_hlist_len == STABLE_NODE_CHAIN)
                return true;

        return false;
}

/* A dup linked into a chain is recognised by its head == STABLE_NODE_DUP_HEAD. */
static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup)
{
        if (dup->head == STABLE_NODE_DUP_HEAD)
                return true;

        return false;
}

/*
 * Link @dup into @chain's list of dups and account for it.  @dup must not
 * already be a dup and @chain must be a chain head.
 */
static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup,
                                             struct ksm_stable_node *chain)
{
        VM_BUG_ON(is_stable_node_dup(dup));
        VM_BUG_ON(!is_stable_node_chain(chain));

        /* Mark it as a dup, then queue it at the front of the chain. */
        dup->head = STABLE_NODE_DUP_HEAD;
        hlist_add_head(&dup->hlist_dup, &chain->hlist);

        ksm_stable_node_dups += 1;
}

/* Unlink @dup, which must be a dup, from its chain and drop the dup count. */
static inline void __stable_node_dup_del(struct ksm_stable_node *dup)
{
        VM_BUG_ON(!is_stable_node_dup(dup));

        hlist_del(&dup->hlist_dup);
        ksm_stable_node_dups -= 1;
}

/*
 * Unlink a stable node that is not a chain head: a dup is taken off its
 * chain, anything else is erased from the stable rbtree of its node id.
 */
static inline void stable_node_dup_del(struct ksm_stable_node *dup)
{
        VM_BUG_ON(is_stable_node_chain(dup));

        if (!is_stable_node_dup(dup))
                rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
        else
                __stable_node_dup_del(dup);

#ifdef CONFIG_DEBUG_VM
        /* Debug builds: leave no stale linkage marker behind. */
        dup->head = NULL;
#endif
}

static inline struct ksm_rmap_item *alloc_rmap_item(void)
{
        struct ksm_rmap_item *rmap_item;

        rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
                                                __GFP_NORETRY | __GFP_NOWARN);
        if (rmap_item)
                ksm_rmap_items++;
        return rmap_item;
}

static inline void free_rmap_item(struct ksm_rmap_item *rmap_item)
{
        ksm_rmap_items--;
        rmap_item->mm->ksm_rmap_items--;
        rmap_item->mm = NULL;        /* debug safety */
        kmem_cache_free(rmap_item_cache, rmap_item);
}

/* Allocate an (uninitialised) stable node; returns NULL on failure. */
static inline struct ksm_stable_node *alloc_stable_node(void)
{
        struct ksm_stable_node *node;

        /*
         * Under memory pressure a plain GFP_KERNEL allocation can take so
         * long that hung task warnings are triggered.  __GFP_HIGH grants
         * access to memory reserves, which helps to avoid this problem.
         */
        node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH);

        return node;
}

/*
 * Return @stable_node to its slab cache.  Apart from chain heads, a node
 * must have no rmap_items accounted to it any more.
 */
static inline void free_stable_node(struct ksm_stable_node *stable_node)
{
        VM_BUG_ON(!is_stable_node_chain(stable_node) &&
                  stable_node->rmap_hlist_len);

        kmem_cache_free(stable_node_cache, stable_node);
}

/*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
 * takes mmap_lock briefly to serialize against them.  ksm_exit() does not set
 * a special flag: they can just back out as soon as mm_users goes to zero.
 * ksm_test_exit() is used throughout to make this test for exit: in some
 * places for correctness, in some places just to avoid unnecessary work.
 */
static inline bool ksm_test_exit(struct mm_struct *mm)
{
        /* No users left means the mm is on its way out. */
        return !atomic_read(&mm->mm_users);
}

/*
 * We use break_ksm to break COW on a ksm page by triggering unsharing,
 * such that the ksm page will get replaced by an exclusive anonymous page.
 *
 * We take great care only to touch a ksm page, in a VM_MERGEABLE vma,
 * in case the application has unmapped and remapped mm,addr meanwhile.
 * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
 * mmap of /dev/mem, where we would not want to touch it.
 *
 * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context
 * of the process that owns 'vma'.  We also do not want to enforce
 * protection keys here anyway.
 */
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
        vm_fault_t ret = 0;

        /* Callers that ask for it get the vma write-locked first. */
        if (lock_vma)
                vma_start_write(vma);

        do {
                bool ksm_page = false;
                struct folio_walk fw;
                struct folio *folio;

                cond_resched();
                folio = folio_walk_start(&fw, vma, addr,
                                         FW_MIGRATION | FW_ZEROPAGE);
                if (folio) {
                        /* Small folio implies FW_LEVEL_PTE. */
                        if (!folio_test_large(folio) &&
                            (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
                                ksm_page = true;
                        folio_walk_end(&fw, vma);
                }

                /* Nothing (left) to unshare at this address: done. */
                if (!ksm_page)
                        return 0;
                /* Ask the fault handler to replace the ksm page by a private copy. */
                ret = handle_mm_fault(vma, addr,
                                      FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE,
                                      NULL);
        } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
        /*
         * We must loop until we no longer find a KSM page because
         * handle_mm_fault() may back out if there's any difficulty e.g. if
         * pte accessed bit gets updated concurrently.
         *
         * VM_FAULT_SIGBUS could occur if we race with truncation of the
         * backing file, which also invalidates anonymous pages: that's
         * okay, that truncation will have unmapped the KSM page for us.
         *
         * VM_FAULT_OOM: at the time of writing (late July 2009), setting
         * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
         * current task has TIF_MEMDIE set, and will be OOM killed on return
         * to user; and ksmd, having no mm, would never be chosen for that.
         *
         * But if the mm is in a limited mem_cgroup, then the fault may fail
         * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
         * even ksmd can fail in this way - though it's usually breaking ksm
         * just to undo a merge it made a moment before, so unlikely to oom.
         *
         * That's a pity: we might therefore have more kernel pages allocated
         * than we're counting as nodes in the stable tree; but ksm_do_scan
         * will retry to break_cow on each pass, so should recover the page
         * in due course.  The important thing is to not let VM_MERGEABLE
         * be cleared while any such pages might remain in the area.
         */
        /* Only OOM is reported to the caller; SIGBUS/SIGSEGV count as success. */
        return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
}

/*
 * Tell whether KSM may operate on @vma: DAX mappings and mappings carrying
 * any of the flags tested below are refused.
 */
static bool vma_ksm_compatible(struct vm_area_struct *vma)
{
        if (vma_is_dax(vma))
                return false;

        /* just ignore the advice */
        if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE | VM_PFNMAP |
                             VM_IO | VM_DONTEXPAND | VM_HUGETLB |
                             VM_MIXEDMAP | VM_DROPPABLE))
                return false;

        /* Architecture specific flags, tested only where they exist. */
#ifdef VM_SAO
        if (vma->vm_flags & VM_SAO)
                return false;
#endif
#ifdef VM_SPARC_ADI
        if (vma->vm_flags & VM_SPARC_ADI)
                return false;
#endif

        return true;
}

/*
 * Look up the vma covering @addr in @mm.  Returns it only if the mm is not
 * exiting and the vma is VM_MERGEABLE with an anon_vma; otherwise NULL.
 */
static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm,
                unsigned long addr)
{
        struct vm_area_struct *vma;

        if (ksm_test_exit(mm))
                return NULL;

        vma = vma_lookup(mm, addr);
        if (vma && (vma->vm_flags & VM_MERGEABLE) && vma->anon_vma)
                return vma;

        return NULL;
}

/*
 * Undo a merge for @rmap_item: drop its anon_vma reference and, if the
 * address still lies in a mergeable vma, unshare the ksm page there.
 */
static void break_cow(struct ksm_rmap_item *rmap_item)
{
        unsigned long address = rmap_item->address;
        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *area;

        /*
         * Breaking COW to undo a merge always goes hand in hand with
         * dropping a reference to the anon_vma - that is no accident.
         */
        put_anon_vma(rmap_item->anon_vma);

        mmap_read_lock(mm);
        area = find_mergeable_vma(mm, address);
        if (area)
                break_ksm(area, address, false);
        mmap_read_unlock(mm);
}

/*
 * Return, with a folio reference taken, the anonymous non-zone-device page
 * mapped at @rmap_item's address in a mergeable vma, or NULL if there is
 * none.  The page's caches are flushed before mmap_lock is dropped.
 */
static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
{
        unsigned long address = rmap_item->address;
        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *vma;
        struct page *found = NULL;
        struct folio_walk fw;
        struct folio *folio;

        mmap_read_lock(mm);

        vma = find_mergeable_vma(mm, address);
        if (vma) {
                folio = folio_walk_start(&fw, vma, address, 0);
                if (folio) {
                        if (folio_test_anon(folio) &&
                            !folio_is_zone_device(folio)) {
                                folio_get(folio);
                                found = fw.page;
                        }
                        folio_walk_end(&fw, vma);
                }
        }

        /* A page can only have been found when vma is valid. */
        if (found) {
                flush_anon_page(vma, found, address);
                flush_dcache_page(found);
        }

        mmap_read_unlock(mm);
        return found;
}

/*
 * This helper is used for getting right index into array of tree roots.
 * When merge_across_nodes knob is set to 1, there are only two rb-trees for
 * stable and unstable pages from all nodes with roots in index 0. Otherwise,
 * every node has its own stable and unstable tree.
 */
static inline int get_kpfn_nid(unsigned long kpfn)
{
        /* Merging across nodes: everything lives in the trees at index 0. */
        if (ksm_merge_across_nodes)
                return 0;

        return NUMA(pfn_to_nid(kpfn));
}

static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup,
                                                   struct rb_root *root)
{
        struct ksm_stable_node *chain = alloc_stable_node();
        VM_BUG_ON(is_stable_node_chain(dup));
        if (likely(chain)) {
                INIT_HLIST_HEAD(&chain->hlist);
                chain->chain_prune_time = jiffies;
                chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
                chain->nid = NUMA_NO_NODE; /* debug */
#endif
                ksm_stable_node_chains++;

                /*
                 * Put the stable node chain in the first dimension of
                 * the stable tree and at the same time remove the old
                 * stable node.
                 */
                rb_replace_node(&dup->node, &chain->node, root);

                /*
                 * Move the old stable node to the second dimension
                 * queued in the hlist_dup. The invariant is that all
                 * dup stable_nodes in the chain->hlist point to pages
                 * that are write protected and have the exact same
                 * content.
                 */
                stable_node_chain_add_dup(dup, chain);
        }
        return chain;
}

/* Erase the chain head @chain from @root, uncount it and free it. */
static inline void free_stable_node_chain(struct ksm_stable_node *chain,
                                          struct rb_root *root)
{
        rb_erase(&chain->node, root);
        ksm_stable_node_chains--;
        free_stable_node(chain);
}

/*
 * Tear down @stable_node: fix up the accounting of every rmap_item still
 * hanging off it, unlink the node from wherever it is linked (the
 * migrate_nodes list, a chain's dup list or the stable rbtree) and free it.
 * Must not be called on a chain head.
 */
static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node)
{
        struct ksm_rmap_item *rmap_item;

        /* check it's not STABLE_NODE_CHAIN or negative */
        BUG_ON(stable_node->rmap_hlist_len < 0);

        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                /*
                 * Every entry but the last one in the list is accounted
                 * as "sharing"; the last one as "shared".
                 */
                if (rmap_item->hlist.next) {
                        ksm_pages_sharing--;
                        trace_ksm_remove_rmap_item(stable_node->kpfn, rmap_item, rmap_item->mm);
                } else {
                        ksm_pages_shared--;
                }

                rmap_item->mm->ksm_merging_pages--;

                VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
                stable_node->rmap_hlist_len--;
                put_anon_vma(rmap_item->anon_vma);
                /* Keep only the page address, dropping the bits below it. */
                rmap_item->address &= PAGE_MASK;
                cond_resched();
        }

        /*
         * We need the second aligned pointer of the migrate_nodes
         * list_head to stay clear from the rb_parent_color union
         * (aligned and different than any node) and also different
         * from &migrate_nodes. This will verify that future list.h changes
         * don't break STABLE_NODE_DUP_HEAD. Only recent gcc can handle it.
         */
        BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
        BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);

        trace_ksm_remove_ksm_page(stable_node->kpfn);
        /* head tells which structure the node is currently linked into. */
        if (stable_node->head == &migrate_nodes)
                list_del(&stable_node->list);
        else
                stable_node_dup_del(stable_node);
        free_stable_node(stable_node);
}

/* How ksm_get_folio() should treat the folio lock. */
enum ksm_get_folio_flags {
        /* Return the folio with a reference only, not locked. */
        KSM_GET_FOLIO_NOLOCK,
        /* Take the folio lock with folio_lock() before returning. */
        KSM_GET_FOLIO_LOCK,
        /* Use folio_trylock(); ERR_PTR(-EBUSY) is returned if it fails. */
        KSM_GET_FOLIO_TRYLOCK
};

/*
 * ksm_get_folio: checks if the page indicated by the stable node
 * is still its ksm page, despite having held no reference to it.
 * In which case we can trust the content of the page, and it
 * returns the gotten page; but if the page has now been zapped,
 * remove the stale node from the stable tree and return NULL.
 * But beware, the stable node's page might be being migrated.
 *
 * You would expect the stable_node to hold a reference to the ksm page.
 * But if it increments the page's count, swapping out has to wait for
 * ksmd to come around again before it can free the page, which may take
 * seconds or even minutes: much too unresponsive.  So instead we use a
 * "keyhole reference": access to the ksm page from the stable node peeps
 * out through its keyhole to see if that page still holds the right key,
 * pointing back to this stable node.  This relies on freeing a PageAnon
 * page to reset its page->mapping to NULL, and relies on no other use of
 * a page to put something that might look like our key in page->mapping.
 * is on its way to being freed; but it is an anomaly to bear in mind.
 */
static struct folio *ksm_get_folio(struct ksm_stable_node *stable_node,
                                 enum ksm_get_folio_flags flags)
{
        struct folio *folio;
        void *expected_mapping;
        unsigned long kpfn;

        /* The "key": what folio->mapping holds while the folio is ours. */
        expected_mapping = (void *)((unsigned long)stable_node |
                                        PAGE_MAPPING_KSM);
again:
        kpfn = READ_ONCE(stable_node->kpfn); /* Address dependency. */
        folio = pfn_folio(kpfn);
        if (READ_ONCE(folio->mapping) != expected_mapping)
                goto stale;

        /*
         * We cannot do anything with the page while its refcount is 0.
         * Usually 0 means free, or tail of a higher-order page: in which
         * case this node is no longer referenced, and should be freed;
         * however, it might mean that the page is under page_ref_freeze().
         * The __remove_mapping() case is easy, again the node is now stale;
         * the same is in reuse_ksm_page() case; but if page is swapcache
         * in folio_migrate_mapping(), it might still be our page,
         * in which case it's essential to keep the node.
         */
        while (!folio_try_get(folio)) {
                /*
                 * Another check for folio->mapping != expected_mapping
                 * would work here too.  We have chosen to test the
                 * swapcache flag to optimize the common case, when the
                 * folio is or is about to be freed: the swapcache flag
                 * is cleared (under spin_lock_irq) in the ref_freeze
                 * section of __remove_mapping(); but anon folio->mapping
                 * is reset to NULL later, in free_pages_prepare().
                 */
                if (!folio_test_swapcache(folio))
                        goto stale;
                cpu_relax();
        }

        /* Recheck the key now that a reference pins the folio. */
        if (READ_ONCE(folio->mapping) != expected_mapping) {
                folio_put(folio);
                goto stale;
        }

        if (flags == KSM_GET_FOLIO_TRYLOCK) {
                if (!folio_trylock(folio)) {
                        /* Not stale, just contended: the node is kept. */
                        folio_put(folio);
                        return ERR_PTR(-EBUSY);
                }
        } else if (flags == KSM_GET_FOLIO_LOCK)
                folio_lock(folio);

        /* And once more under the folio lock, when one was taken. */
        if (flags != KSM_GET_FOLIO_NOLOCK) {
                if (READ_ONCE(folio->mapping) != expected_mapping) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto stale;
                }
        }
        /* Success: referenced, and locked unless KSM_GET_FOLIO_NOLOCK. */
        return folio;

stale:
        /*
         * We come here from above when folio->mapping or the swapcache flag
         * suggests that the node is stale; but it might be under migration.
         * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
         * before checking whether node->kpfn has been changed.
         */
        smp_rmb();
        if (READ_ONCE(stable_node->kpfn) != kpfn)
                goto again;
        /* Really stale: the node is removed and freed, nothing to return. */
        remove_node_from_stable_tree(stable_node);
        return NULL;
}

/*
 * Removing rmap_item from stable or unstable tree.
 * This function will clean the information from the stable/unstable tree.
 */
static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item)
{
        if (rmap_item->address & STABLE_FLAG) {
                struct ksm_stable_node *stable_node;
                struct folio *folio;

                stable_node = rmap_item->head;
                folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
                /*
                 * NULL means the node was stale and ksm_get_folio() has
                 * already removed it, doing this item's accounting.
                 */
                if (!folio)
                        goto out;

                /* Unlink from the stable node while the folio is locked. */
                hlist_del(&rmap_item->hlist);
                folio_unlock(folio);
                folio_put(folio);

                /* Other items remain: this one was "sharing", else "shared". */
                if (!hlist_empty(&stable_node->hlist))
                        ksm_pages_sharing--;
                else
                        ksm_pages_shared--;

                rmap_item->mm->ksm_merging_pages--;

                VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
                stable_node->rmap_hlist_len--;

                put_anon_vma(rmap_item->anon_vma);
                rmap_item->head = NULL;
                /* Keep only the page address: clears STABLE_FLAG too. */
                rmap_item->address &= PAGE_MASK;

        } else if (rmap_item->address & UNSTABLE_FLAG) {
                unsigned char age;
                /*
                 * Usually ksmd can and must skip the rb_erase, because
                 * root_unstable_tree was already reset to RB_ROOT.
                 * But be careful when an mm is exiting: do the rb_erase
                 * if this rmap_item was inserted by this scan, rather
                 * than left over from before.
                 */
                /*
                 * NOTE(review): this presumably relies on the low byte of
                 * address holding the seqnr of the inserting scan - confirm.
                 */
                age = (unsigned char)(ksm_scan.seqnr - rmap_item->address);
                BUG_ON(age > 1);
                if (!age)
                        rb_erase(&rmap_item->node,
                                 root_unstable_tree + NUMA(rmap_item->nid));
                ksm_pages_unshared--;
                rmap_item->address &= PAGE_MASK;
        }
out:
        cond_resched();                /* we're called from many long loops */
}

static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
{
        while (*rmap_list) {
                struct ksm_rmap_item *rmap_item = *rmap_list;
                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
                free_rmap_item(rmap_item);
        }
}

/*
 * Though it's very tempting to unmerge rmap_items from stable tree rather
 * than check every pte of a given vma, the locking doesn't quite work for
 * that - an rmap_item is assigned to the stable tree after inserting ksm
 * page and upping mmap_lock.  Nor does it fit with the way we skip dup'ing
 * rmap_items from parent to child at fork time (so as not to waste time
 * if exit comes before the next scan reaches it).
 *
 * Similarly, although we'd like to remove rmap_items (so updating counts
 * and freeing memory) when unmerging an area, it's easier to leave that
 * to the next pass of ksmd - consider, for example, how ksmd might be
 * in cmp_and_merge_page on one of the rmap_items we would be removing.
 */
/*
 * Unshare every ksm page mapped in [@start, @end) of @vma, one page at a
 * time.  Stops quietly if the mm is exiting.  Returns 0 on success,
 * -ERESTARTSYS if a signal is pending, or the error from break_ksm().
 */
static int unmerge_ksm_pages(struct vm_area_struct *vma,
                             unsigned long start, unsigned long end, bool lock_vma)
{
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                int err;

                if (ksm_test_exit(vma->vm_mm))
                        break;
                if (signal_pending(current))
                        return -ERESTARTSYS;

                err = break_ksm(vma, addr, lock_vma);
                if (err)
                        return err;
        }

        return 0;
}

/* Return the stable node a ksm folio points back to, NULL for other folios. */
static inline
struct ksm_stable_node *folio_stable_node(const struct folio *folio)
{
        if (!folio_test_ksm(folio))
                return NULL;

        return folio_raw_mapping(folio);
}

/* Same as folio_stable_node(), for the folio that @page belongs to. */
static inline struct ksm_stable_node *page_stable_node(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_stable_node(folio);
}

/*
 * Make @folio point back to @stable_node by storing the node's address,
 * tagged with PAGE_MAPPING_KSM, in folio->mapping.
 */
static inline void folio_set_stable_node(struct folio *folio,
                                         struct ksm_stable_node *stable_node)
{
        unsigned long tagged = (unsigned long)stable_node | PAGE_MAPPING_KSM;

        VM_WARN_ON_FOLIO(folio_test_anon(folio) && PageAnonExclusive(&folio->page), folio);
        folio->mapping = (void *)tagged;
}

#ifdef CONFIG_SYSFS
/*
 * Only called through the sysfs control interface:
 */
/*
 * Remove @stable_node unless its folio is still mapped.  Returns 0 when
 * the node is gone (possibly removed by ksm_get_folio() itself), -EBUSY
 * when the folio is still mapped and the node had to be kept.
 */
static int remove_stable_node(struct ksm_stable_node *stable_node)
{
        struct folio *folio;
        int ret;

        folio = ksm_get_folio(stable_node, KSM_GET_FOLIO_LOCK);
        /* A NULL folio: ksm_get_folio did remove_node_from_stable_tree itself. */
        if (!folio)
                return 0;

        if (folio_mapped(folio)) {
                /*
                 * Page could be still mapped if this races with __mmput()
                 * running in between ksm_exit() and exit_mmap(). Just
                 * refuse to let merge_across_nodes/max_page_sharing be
                 * switched.
                 */
                ret = -EBUSY;
        } else {
                /*
                 * The stable node did not yet appear stale to
                 * ksm_get_folio(), since that allows for an unmapped ksm
                 * folio to be recognized right up until it is freed; but
                 * the node is safe to remove.  This folio might be in an
                 * LRU cache waiting to be freed, or it might be in the
                 * swapcache (perhaps under writeback), or it might have
                 * been removed from swapcache a moment ago.
                 */
                folio_set_stable_node(folio, NULL);
                remove_node_from_stable_tree(stable_node);
                ret = 0;
        }

        folio_unlock(folio);
        folio_put(folio);
        return ret;
}

/*
 * Remove @stable_node from @root.  A plain node is removed directly; for a
 * chain every dup is removed and then the chain head itself is freed.
 * Returns nonzero (true) as soon as a node could not be removed, 0 (false)
 * when everything is gone.
 */
static int remove_stable_node_chain(struct ksm_stable_node *stable_node,
                                    struct rb_root *root)
{
        struct hlist_node *next;
        struct ksm_stable_node *dup;

        if (!is_stable_node_chain(stable_node)) {
                VM_BUG_ON(is_stable_node_dup(stable_node));
                return remove_stable_node(stable_node) != 0;
        }

        hlist_for_each_entry_safe(dup, next, &stable_node->hlist, hlist_dup) {
                VM_BUG_ON(!is_stable_node_dup(dup));
                if (remove_stable_node(dup))
                        return true;
        }

        /* Every dup is gone, so the chain head can go as well. */
        BUG_ON(!hlist_empty(&stable_node->hlist));
        free_stable_node_chain(stable_node, root);
        return false;
}

static int remove_all_stable_nodes(void)
{
        struct ksm_stable_node *stable_node, *next;
        int nid;
        int err = 0;

        for (nid = 0; nid < ksm_nr_node_ids; nid++) {
                while (root_stable_tree[nid].rb_node) {
                        stable_node = rb_entry(root_stable_tree[nid].rb_node,
                                                struct ksm_stable_node, node);
                        if (remove_stable_node_chain(stable_node,
                                                     root_stable_tree + nid)) {
                                err = -EBUSY;
                                break;        /* proceed to next nid */
                        }
                        cond_resched();
                }
        }
        list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
                if (remove_stable_node(stable_node))
                        err = -EBUSY;
                cond_resched();
        }
        return err;
}

/*
 * Walk every mm slot on the KSM mm list, unmerge the KSM pages of each
 * mergeable anonymous vma, and drop the rmap items of each mm.  Slots whose
 * mm is exiting are unlinked and freed along the way.  Afterwards the stable
 * nodes are cleaned up and the scan sequence number is reset.
 *
 * ksm_scan.mm_slot is used as the walk cursor and is updated under
 * ksm_mmlist_lock.
 *
 * Returns 0 on success, or the error from unmerge_ksm_pages(); in the error
 * case the cursor is reset to &ksm_mm_head.
 */
static int unmerge_and_remove_all_rmap_items(void)
{
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int err = 0;

        /* Start the cursor at the first slot after the list head. */
        spin_lock(&ksm_mmlist_lock);
        slot = list_entry(ksm_mm_head.slot.mm_node.next,
                          struct mm_slot, mm_node);
        ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
        spin_unlock(&ksm_mmlist_lock);

        for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head;
             mm_slot = ksm_scan.mm_slot) {
                VMA_ITERATOR(vmi, mm_slot->slot.mm, 0);

                mm = mm_slot->slot.mm;
                mmap_read_lock(mm);

                /*
                 * Exit right away if mm is exiting to avoid lockdep issue in
                 * the maple tree
                 */
                if (ksm_test_exit(mm))
                        goto mm_exiting;

                for_each_vma(vmi, vma) {
                        /* Only mergeable vmas that have an anon_vma matter. */
                        if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
                                continue;
                        err = unmerge_ksm_pages(vma,
                                                vma->vm_start, vma->vm_end, false);
                        if (err)
                                goto error;
                }

mm_exiting:
                remove_trailing_rmap_items(&mm_slot->rmap_list);
                mmap_read_unlock(mm);

                /* Advance the cursor before possibly freeing this slot. */
                spin_lock(&ksm_mmlist_lock);
                slot = list_entry(mm_slot->slot.mm_node.next,
                                  struct mm_slot, mm_node);
                ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
                if (ksm_test_exit(mm)) {
                        /* The mm is exiting: unlink and free its slot. */
                        hash_del(&mm_slot->slot.hash);
                        list_del(&mm_slot->slot.mm_node);
                        spin_unlock(&ksm_mmlist_lock);

                        mm_slot_free(mm_slot_cache, mm_slot);
                        clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                        clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                        mmdrop(mm);
                } else
                        spin_unlock(&ksm_mmlist_lock);
        }

        /* Clean up stable nodes, but don't worry if some are still busy */
        remove_all_stable_nodes();
        ksm_scan.seqnr = 0;
        return 0;

error:
        /* Still holding the mmap lock of the mm that failed to unmerge. */
        mmap_read_unlock(mm);
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = &ksm_mm_head;
        spin_unlock(&ksm_mmlist_lock);
        return err;
}
#endif /* CONFIG_SYSFS */

/*
 * Compute the xxhash (seed 0) of the PAGE_SIZE bytes of @page, using a
 * temporary local kernel mapping of the page.
 */
static u32 calc_checksum(struct page *page)
{
        void *kaddr = kmap_local_page(page);
        u32 sum = xxhash(kaddr, PAGE_SIZE, 0);

        kunmap_local(kaddr);
        return sum;
}

/*
 * write_protect_page - make the pte mapping a folio in a vma clean and
 * read-only
 * @vma:      vma in which the folio is expected to be mapped
 * @folio:    the (small) folio to write-protect; large folios are rejected
 * @orig_pte: on success, receives the pte value left in the page table
 *
 * If the pte is writable or dirty, the page is anon-exclusive, or a TLB
 * flush is pending for the mm, the pte is cleared and flushed, the folio's
 * reference count is checked against its mapcount, and the pte is
 * reinstalled clean and write-protected.  Otherwise the pte is left as is.
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int write_protect_page(struct vm_area_struct *vma, struct folio *folio,
                              pte_t *orig_pte)
{
        struct mm_struct *mm = vma->vm_mm;
        DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, 0, 0);
        int swapped;
        int err = -EFAULT;
        struct mmu_notifier_range range;
        bool anon_exclusive;
        pte_t entry;

        if (WARN_ON_ONCE(folio_test_large(folio)))
                return err;

        pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma);
        if (pvmw.address == -EFAULT)
                goto out;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, pvmw.address,
                                pvmw.address + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (!page_vma_mapped_walk(&pvmw))
                goto out_mn;
        if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?"))
                goto out_unlock;

        entry = ptep_get(pvmw.pte);
        /*
         * Handle PFN swap PTEs, such as device-exclusive ones, that actually
         * map pages: give up just like the next folio_walk would.
         */
        if (unlikely(!pte_present(entry)))
                goto out_unlock;

        anon_exclusive = PageAnonExclusive(&folio->page);
        if (pte_write(entry) || pte_dirty(entry) ||
            anon_exclusive || mm_tlb_flush_pending(mm)) {
                swapped = folio_test_swapcache(folio);
                flush_cache_page(vma, pvmw.address, folio_pfn(folio));
                /*
                 * Ok this is tricky: when get_user_pages_fast() runs it
                 * doesn't take any lock, therefore the check that we are
                 * going to make with the pagecount against the mapcount is
                 * racy and O_DIRECT can happen right after the check.
                 * So we clear the pte and flush the tlb before the check;
                 * this assures us that no O_DIRECT can happen after the
                 * check or in the middle of the check.
                 *
                 * No need to notify as we are downgrading page table to read
                 * only not changing it to point to a new page.
                 *
                 * See Documentation/mm/mmu_notifier.rst
                 */
                entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
                /*
                 * Check that no O_DIRECT or similar I/O is in progress on the
                 * page
                 *
                 * NOTE(review): the "+ 1" presumably accounts for the
                 * reference held by our caller - confirm against callers.
                 */
                if (folio_mapcount(folio) + 1 + swapped != folio_ref_count(folio)) {
                        /* Unexpected extra reference: restore the pte. */
                        set_pte_at(mm, pvmw.address, pvmw.pte, entry);
                        goto out_unlock;
                }

                /* See folio_try_share_anon_rmap_pte(): clear PTE first. */
                if (anon_exclusive &&
                    folio_try_share_anon_rmap_pte(folio, &folio->page)) {
                        set_pte_at(mm, pvmw.address, pvmw.pte, entry);
                        goto out_unlock;
                }

                /* Carry the pte dirty bit over to the folio before cleaning. */
                if (pte_dirty(entry))
                        folio_mark_dirty(folio);
                entry = pte_mkclean(entry);

                if (pte_write(entry))
                        entry = pte_wrprotect(entry);

                set_pte_at(mm, pvmw.address, pvmw.pte, entry);
        }
        *orig_pte = entry;
        err = 0;

out_unlock:
        page_vma_mapped_walk_done(&pvmw);
out_mn:
        mmu_notifier_invalidate_range_end(&range);
out:
        return err;
}

/**
 * replace_page - replace page in vma by new ksm page
 * @vma:      vma that holds the pte pointing to page
 * @page:     the page we are replacing by kpage
 * @kpage:    the ksm page we replace page by (may be a zero page, in which
 *            case a special, dirty-marked pte is installed instead)
 * @orig_pte: the original value of the pte
 *
 * The replacement only happens if the pte currently in the page table is
 * still identical to @orig_pte.
 *
 * Returns 0 on success, -EFAULT on failure.
 */
static int replace_page(struct vm_area_struct *vma, struct page *page,
                        struct page *kpage, pte_t orig_pte)
{
        struct folio *kfolio = page_folio(kpage);
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio = page_folio(page);
        pmd_t *pmd;
        pmd_t pmde;
        pte_t *ptep;
        pte_t newpte;
        spinlock_t *ptl;
        unsigned long addr;
        int err = -EFAULT;
        struct mmu_notifier_range range;

        addr = page_address_in_vma(folio, page, vma);
        if (addr == -EFAULT)
                goto out;

        pmd = mm_find_pmd(mm, addr);
        if (!pmd)
                goto out;
        /*
         * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together.
         */
        pmde = pmdp_get_lockless(pmd);
        if (!pmd_present(pmde) || pmd_trans_huge(pmde))
                goto out;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr,
                                addr + PAGE_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!ptep)
                goto out_mn;
        /* Bail out if the pte changed since it was write-protected. */
        if (!pte_same(ptep_get(ptep), orig_pte)) {
                pte_unmap_unlock(ptep, ptl);
                goto out_mn;
        }
        VM_BUG_ON_PAGE(PageAnonExclusive(page), page);
        VM_BUG_ON_FOLIO(folio_test_anon(kfolio) && PageAnonExclusive(kpage),
                        kfolio);

        /*
         * No need to check ksm_use_zero_pages here: we can only have a
         * zero_page here if ksm_use_zero_pages was enabled already.
         */
        if (!is_zero_pfn(page_to_pfn(kpage))) {
                /* Regular ksm page: take a reference and add the rmap. */
                folio_get(kfolio);
                folio_add_anon_rmap_pte(kfolio, kpage, vma, addr, RMAP_NONE);
                newpte = mk_pte(kpage, vma->vm_page_prot);
        } else {
                /*
                 * Use pte_mkdirty to mark the zero page mapped by KSM, and then
                 * we can easily track all KSM-placed zero pages by checking if
                 * the dirty bit in zero page's PTE is set.
                 */
                newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot)));
                ksm_map_zero_page(mm);
                /*
                 * We're replacing an anonymous page with a zero page, which is
                 * not anonymous. We need to do proper accounting otherwise we
                 * will get wrong values in /proc, and a BUG message in dmesg
                 * when tearing down the mm.
                 */
                dec_mm_counter(mm, MM_ANONPAGES);
        }

        flush_cache_page(vma, addr, pte_pfn(ptep_get(ptep)));
        /*
         * No need to notify as we are replacing a read only page with another
         * read only page with the same content.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        ptep_clear_flush(vma, addr, ptep);
        set_pte_at(mm, addr, ptep, newpte);

        /* The old page is no longer mapped here: drop its rmap and a ref. */
        folio_remove_rmap_pte(folio, page, vma);
        if (!folio_mapped(folio))
                folio_free_swap(folio);
        folio_put(folio);

        pte_unmap_unlock(ptep, ptl);
        err = 0;
out_mn:
        mmu_notifier_invalidate_range_end(&range);
out:
        return err;
}

/*
 * try_to_merge_one_page - merge one anonymous page into a KSM page
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the KSM page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 *
 * Returns 0 if the pages were merged (or @page already is @kpage),
 * -EFAULT otherwise.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
                                 struct page *page, struct page *kpage)
{
        struct folio *folio = page_folio(page);
        pte_t orig_pte = __pte(0);
        int err = -EFAULT;

        /* ksm page forked: nothing to do */
        if (page == kpage)
                return 0;

        if (!folio_test_anon(folio))
                goto out;

        /*
         * The folio lock gives write_protect_page() a stable view of the
         * swapcache flag.  Only trylock: rather than sleeping here we move
         * on to other pages and revisit this one once it is unlocked.
         */
        if (!folio_trylock(folio))
                goto out;

        if (folio_test_large(folio)) {
                if (split_huge_page(page))
                        goto out_unlock;
                /* After a successful split, look the folio up again. */
                folio = page_folio(page);
        }

        /*
         * A page mapped only here may still need its pte write-protected;
         * a page mapped elsewhere already has all its ptes write-protected.
         * Either way the page must be locked and its page_count verified.
         */
        if (write_protect_page(vma, folio, &orig_pte) != 0)
                goto out_unlock;

        if (kpage) {
                /* Merge into the existing ksm page, if contents match. */
                if (pages_identical(page, kpage))
                        err = replace_page(vma, page, kpage, orig_pte);
                goto out_unlock;
        }

        /*
         * No kpage yet: with the folio lock held, turn this anon folio into
         * a KSM folio with a NULL stable_node; stable_tree_insert() will
         * fill the stable_node in later.
         */
        folio_set_stable_node(folio, NULL);
        folio_mark_accessed(folio);
        /*
         * Reclaim simply frees a clean folio that has no dirty ptes, so
         * mark it dirty to make sure the ksm page would be swapped.
         */
        if (!folio_test_dirty(folio))
                folio_mark_dirty(folio);
        err = 0;

out_unlock:
        folio_unlock(folio);
out:
        return err;
}

/*
 * Attempt to merge @page with the zero page when its checksum equals that
 * of an empty page and the user enabled zero-page merging via sysfs.
 *
 * Returns 0 if the pages were merged or if they are no longer merging
 * candidates (e.g., VMA stale), -EFAULT otherwise.
 */
static int try_to_merge_with_zero_page(struct ksm_rmap_item *rmap_item,
                                       struct page *page)
{
        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *vma;
        int err;

        /* Feature disabled, or the page does not look like an empty page. */
        if (!ksm_use_zero_pages || rmap_item->oldchecksum != zero_checksum)
                return -EFAULT;

        mmap_read_lock(mm);
        vma = find_mergeable_vma(mm, rmap_item->address);
        if (!vma) {
                /* The vma is out of date: no point in continuing. */
                err = 0;
        } else {
                err = try_to_merge_one_page(vma, page,
                                ZERO_PAGE(rmap_item->address));
                trace_ksm_merge_one_page(
                        page_to_pfn(ZERO_PAGE(rmap_item->address)),
                        rmap_item, mm, err);
        }
        mmap_read_unlock(mm);

        return err;
}

/*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
 *
 * On success the rmap_item is taken out of its tree and records (with a
 * reference) the anon_vma of the vma the page was found in.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item,
                                      struct page *page, struct page *kpage)
{
        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *vma;
        int err = -EFAULT;

        mmap_read_lock(mm);
        vma = find_mergeable_vma(mm, rmap_item->address);
        if (vma) {
                err = try_to_merge_one_page(vma, page, kpage);
                if (!err) {
                        /*
                         * Unstable nid is in union with stable anon_vma:
                         * remove first
                         */
                        remove_rmap_item_from_tree(rmap_item);

                        /*
                         * Must get reference to anon_vma while still
                         * holding mmap_lock
                         */
                        rmap_item->anon_vma = vma->anon_vma;
                        get_anon_vma(vma->anon_vma);
                }
        }
        mmap_read_unlock(mm);

        trace_ksm_merge_with_ksm_page(kpage, page_to_pfn(kpage ? kpage : page),
                                rmap_item, mm, err);
        return err;
}

/*
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
 * @page is first upgraded to a ksm page, then @tree_page is merged into it.
 *
 * Returns the folio of @page if the two identical pages were successfully
 * merged into one ksm page, NULL otherwise.
 *
 * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item,
                                           struct page *page,
                                           struct ksm_rmap_item *tree_rmap_item,
                                           struct page *tree_page)
{
        /* Step one: turn @page itself into a ksm page. */
        if (try_to_merge_with_ksm_page(rmap_item, page, NULL))
                return NULL;

        /* Step two: map @tree_page's pte onto the new ksm page. */
        if (try_to_merge_with_ksm_page(tree_rmap_item, tree_page, page)) {
                /*
                 * That failed, leaving a ksm page with only one pte
                 * pointing to it: so break it.
                 */
                break_cow(rmap_item);
                return NULL;
        }

        return page_folio(page);
}

/*
 * Tell whether @stable_node can take further sharers: it must still have at
 * least one mapping, and its rmap_hlist_len plus @offset must stay below
 * ksm_max_page_sharing.
 */
static __always_inline
bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset)
{
        VM_BUG_ON(stable_node->rmap_hlist_len < 0);

        /*
         * Without at least one remaining mapping there's not much point
         * in merging and sharing with this stable_node, as the underlying
         * tree_page of the other sharer is going to be freed soon.
         */
        if (!stable_node->rmap_hlist_len)
                return false;

        return stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
}

/*
 * Convenience wrapper: __is_page_sharing_candidate() with an offset of 0,
 * i.e. without reserving room for any additional pending merge.
 */
static __always_inline
bool is_page_sharing_candidate(struct ksm_stable_node *stable_node)
{
        return __is_page_sharing_candidate(stable_node, 0);
}

/*
 * stable_node_dup - pick a dup out of a stable_node chain
 * @_stable_node_dup: output, set to the chosen dup (NULL if none was found)
 * @_stable_node: in/out, the chain; overwritten with the chosen dup if the
 *                chain gets collapsed
 * @root: the stable tree root the chain lives in
 * @prune_stale_stable_nodes: request a walk of the whole chain, honoured
 *                only once ksm_stable_node_chains_prune_millisecs have
 *                passed since the chain's chain_prune_time
 *
 * Walks the dups of the chain, taking a folio reference on each through
 * ksm_get_folio(), and keeps the best candidate.  When not pruning, the
 * walk stops at the first dup that is a page sharing candidate.
 *
 * Afterwards, if the chosen dup is the only entry left, the chain is
 * collapsed into it; otherwise, if it is not at the head and can accept
 * one more merge, it is moved to the head.  If no dup was found, the chain
 * itself is freed.
 *
 * Returns the folio obtained for the chosen dup (not put by this
 * function), or NULL if none was found.
 */
static struct folio *stable_node_dup(struct ksm_stable_node **_stable_node_dup,
                                     struct ksm_stable_node **_stable_node,
                                     struct rb_root *root,
                                     bool prune_stale_stable_nodes)
{
        struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node;
        struct hlist_node *hlist_safe;
        struct folio *folio, *tree_folio = NULL;
        /* Only read after "found" has been set, which also sets this. */
        int found_rmap_hlist_len;

        if (!prune_stale_stable_nodes ||
            time_before(jiffies, stable_node->chain_prune_time +
                        msecs_to_jiffies(
                                ksm_stable_node_chains_prune_millisecs)))
                prune_stale_stable_nodes = false;
        else
                stable_node->chain_prune_time = jiffies;

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                cond_resched();
                /*
                 * We must walk all stable_node_dup to prune the stale
                 * stable nodes during lookup.
                 *
                 * ksm_get_folio can drop the nodes from the
                 * stable_node->hlist if they point to freed pages
                 * (that's why we do a _safe walk). The "dup"
                 * stable_node parameter itself will be freed from
                 * under us if it returns NULL.
                 */
                folio = ksm_get_folio(dup, KSM_GET_FOLIO_NOLOCK);
                if (!folio)
                        continue;
                /* Pick the best candidate if possible. */
                if (!found || (is_page_sharing_candidate(dup) &&
                    (!is_page_sharing_candidate(found) ||
                     dup->rmap_hlist_len > found_rmap_hlist_len))) {
                        /* Drop the reference on the previous candidate. */
                        if (found)
                                folio_put(tree_folio);
                        found = dup;
                        found_rmap_hlist_len = found->rmap_hlist_len;
                        tree_folio = folio;
                        /* skip put_page for found candidate */
                        if (!prune_stale_stable_nodes &&
                            is_page_sharing_candidate(found))
                                break;
                        continue;
                }
                folio_put(folio);
        }

        if (found) {
                if (hlist_is_singular_node(&found->hlist_dup, &stable_node->hlist)) {
                        /*
                         * If there's not just one entry it would
                         * corrupt memory, better BUG_ON. In KSM
                         * context with no lock held it's not even
                         * fatal.
                         */
                        BUG_ON(stable_node->hlist.first->next);

                        /*
                         * There's just one entry and it is below the
                         * deduplication limit so drop the chain.
                         */
                        rb_replace_node(&stable_node->node, &found->node,
                                        root);
                        free_stable_node(stable_node);
                        ksm_stable_node_chains--;
                        ksm_stable_node_dups--;
                        /*
                         * NOTE: the caller depends on the stable_node
                         * to be equal to stable_node_dup if the chain
                         * was collapsed.
                         */
                        *_stable_node = found;
                        /*
                         * Just for robustness, as stable_node is
                         * otherwise left as a stale pointer, the
                         * compiler shall optimize it away at build
                         * time.
                         */
                        stable_node = NULL;
                } else if (stable_node->hlist.first != &found->hlist_dup &&
                           __is_page_sharing_candidate(found, 1)) {
                        /*
                         * If the found stable_node dup can accept one
                         * more future merge (in addition to the one
                         * that is underway) and is not at the head of
                         * the chain, put it there so next search will
                         * be quicker in the !prune_stale_stable_nodes
                         * case.
                         *
                         * NOTE: it would be inaccurate to use nr > 1
                         * instead of checking the hlist.first pointer
                         * directly, because in the
                         * prune_stale_stable_nodes case "nr" isn't
                         * the position of the found dup in the chain,
                         * but the total number of dups in the chain.
                         */
                        hlist_del(&found->hlist_dup);
                        hlist_add_head(&found->hlist_dup,
                                       &stable_node->hlist);
                }
        } else {
                /* Its hlist must be empty if no one found. */
                free_stable_node_chain(stable_node, root);
        }

        *_stable_node_dup = found;
        return tree_folio;
}

/*
 * Resolve *_stable_node to a dup and its folio.
 *
 * As with ksm_get_folio, *_stable_node and *_stable_node_dup may have been
 * freed when the returned folio is NULL.
 *
 * If the chain is collapsed, *_stable_node is freed and overwritten with
 * the found stable_node_dup (so *_stable_node equals *_stable_node_dup, as
 * if the chain had never existed). The caller must verify that the
 * returned folio is not NULL before dereferencing *_stable_node or
 * *_stable_node_dup.
 *
 * *_stable_node_dup is purely an output parameter: it is overwritten in
 * all cases and the caller doesn't need to initialize it.
 */
static struct folio *__stable_node_chain(struct ksm_stable_node **_stable_node_dup,
                                         struct ksm_stable_node **_stable_node,
                                         struct rb_root *root,
                                         bool prune_stale_stable_nodes)
{
        struct ksm_stable_node *node = *_stable_node;

        /* A real chain: let stable_node_dup() choose among its dups. */
        if (is_stable_node_chain(node))
                return stable_node_dup(_stable_node_dup, _stable_node, root,
                                       prune_stale_stable_nodes);

        /* A regular stable node acts as its own dup. */
        *_stable_node_dup = node;
        return ksm_get_folio(node, KSM_GET_FOLIO_NOLOCK);
}

/*
 * Wrapper around __stable_node_chain() that requests pruning of stale
 * stable nodes (prune_stale_stable_nodes == true).
 */
static __always_inline struct folio *chain_prune(struct ksm_stable_node **s_n_d,
                                                 struct ksm_stable_node **s_n,
                                                 struct rb_root *root)
{
        return __stable_node_chain(s_n_d, s_n, root, true);
}

/*
 * Wrapper around __stable_node_chain() that does not request pruning of
 * stale stable nodes (prune_stale_stable_nodes == false).
 */
static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d,
                                           struct ksm_stable_node **s_n,
                                           struct rb_root *root)
{
        return __stable_node_chain(s_n_d, s_n, root, false);
}

/*
 * stable_tree_search - search for page inside the stable tree
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
 * If @page is itself a KSM page whose stable node sits on the migrate_nodes
 * list, that node is (re)inserted into the stable tree of the page's
 * current nid along the way.
 *
 * This function returns, with a reference held, the folio of the stable
 * tree node of identical content if found, ERR_PTR(-EBUSY) if the stable
 * node's page is being migrated, NULL otherwise.
 */
static struct folio *stable_tree_search(struct page *page)
{
        int nid;
        struct rb_root *root;
        struct rb_node **new;
        struct rb_node *parent;
        struct ksm_stable_node *stable_node, *stable_node_dup;
        struct ksm_stable_node *page_node;
        struct folio *folio;

        folio = page_folio(page);
        page_node = folio_stable_node(folio);
        if (page_node && page_node->head != &migrate_nodes) {
                /* ksm page forked */
                folio_get(folio);
                return folio;
        }

        nid = get_kpfn_nid(folio_pfn(folio));
        root = root_stable_tree + nid;
again:
        new = &root->rb_node;
        parent = NULL;

        while (*new) {
                struct folio *tree_folio;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct ksm_stable_node, node);
                tree_folio = chain_prune(&stable_node_dup, &stable_node, root);
                if (!tree_folio) {
                        /*
                         * If we walked over a stale stable_node,
                         * ksm_get_folio() will call rb_erase() and it
                         * may rebalance the tree from under us. So
                         * restart the search from scratch. Returning
                         * NULL would be safe too, but we'd generate
                         * false negative insertions just because some
                         * stable_node was stale.
                         */
                        goto again;
                }

                /* The tree is ordered by page content. */
                ret = memcmp_pages(page, &tree_folio->page);
                folio_put(tree_folio);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else {
                        if (page_node) {
                                VM_BUG_ON(page_node->head != &migrate_nodes);
                                /*
                                 * If the mapcount of our migrated KSM folio is
                                 * at most 1, we can merge it with another
                                 * KSM folio where we know that we have space
                                 * for one more mapping without exceeding the
                                 * ksm_max_page_sharing limit: see
                                 * chain_prune(). This way, we can avoid adding
                                 * this stable node to the chain.
                                 */
                                if (folio_mapcount(folio) > 1)
                                        goto chain_append;
                        }

                        if (!is_page_sharing_candidate(stable_node_dup)) {
                                /*
                                 * If the stable_node is a chain and
                                 * we got a payload match in memcmp
                                 * but we cannot merge the scanned
                                 * page in any of the existing
                                 * stable_node dups because they're
                                 * all full, we need to wait the
                                 * scanned page to find itself a match
                                 * in the unstable tree to create a
                                 * brand new KSM page to add later to
                                 * the dups of this stable_node.
                                 */
                                return NULL;
                        }

                        /*
                         * Lock and unlock the stable_node's page (which
                         * might already have been migrated) so that page
                         * migration is sure to notice its raised count.
                         * It would be more elegant to return stable_node
                         * than kpage, but that involves more changes.
                         */
                        tree_folio = ksm_get_folio(stable_node_dup,
                                                   KSM_GET_FOLIO_TRYLOCK);

                        if (PTR_ERR(tree_folio) == -EBUSY)
                                return ERR_PTR(-EBUSY);

                        if (unlikely(!tree_folio))
                                /*
                                 * The tree may have been rebalanced,
                                 * so re-evaluate parent and new.
                                 */
                                goto again;
                        folio_unlock(tree_folio);

                        /* The matching dup now lives on a different nid. */
                        if (get_kpfn_nid(stable_node_dup->kpfn) !=
                            NUMA(stable_node_dup->nid)) {
                                folio_put(tree_folio);
                                goto replace;
                        }
                        return tree_folio;
                }
        }

        /* No match in the tree, and nothing of our own to insert. */
        if (!page_node)
                return NULL;

        /* Move our migrated stable node into the tree at the found slot. */
        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        rb_link_node(&page_node->node, parent, new);
        rb_insert_color(&page_node->node, root);
out:
        if (is_page_sharing_candidate(page_node)) {
                folio_get(folio);
                return folio;
        } else
                return NULL;

replace:
        /*
         * If stable_node was a chain and chain_prune collapsed it,
         * stable_node has been updated to be the new regular
         * stable_node. A collapse of the chain is indistinguishable
         * from the case there was no chain in the stable
         * rbtree. Otherwise stable_node is the chain and
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
                VM_BUG_ON(is_stable_node_chain(stable_node_dup));
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* there is no chain */
                if (page_node) {
                        VM_BUG_ON(page_node->head != &migrate_nodes);
                        list_del(&page_node->list);
                        DO_NUMA(page_node->nid = nid);
                        rb_replace_node(&stable_node_dup->node,
                                        &page_node->node,
                                        root);
                        if (is_page_sharing_candidate(page_node))
                                folio_get(folio);
                        else
                                folio = NULL;
                } else {
                        rb_erase(&stable_node_dup->node, root);
                        folio = NULL;
                }
        } else {
                VM_BUG_ON(!is_stable_node_chain(stable_node));
                __stable_node_dup_del(stable_node_dup);
                if (page_node) {
                        VM_BUG_ON(page_node->head != &migrate_nodes);
                        list_del(&page_node->list);
                        DO_NUMA(page_node->nid = nid);
                        stable_node_chain_add_dup(page_node, stable_node);
                        if (is_page_sharing_candidate(page_node))
                                folio_get(folio);
                        else
                                folio = NULL;
                } else {
                        folio = NULL;
                }
        }
        /* Park the displaced dup on the migrate_nodes list. */
        stable_node_dup->head = &migrate_nodes;
        list_add(&stable_node_dup->list, stable_node_dup->head);
        return folio;

chain_append:
        /*
         * If stable_node was a chain and chain_prune collapsed it,
         * stable_node has been updated to be the new regular
         * stable_node. A collapse of the chain is indistinguishable
         * from the case there was no chain in the stable
         * rbtree. Otherwise stable_node is the chain and
         * stable_node_dup is the dup to replace.
         */
        if (stable_node_dup == stable_node) {
                VM_BUG_ON(is_stable_node_dup(stable_node_dup));
                /* chain is missing so create it */
                stable_node = alloc_stable_node_chain(stable_node_dup,
                                                      root);
                if (!stable_node)
                        return NULL;
        }
        /*
         * Add this stable_node dup that was
         * migrated to the stable_node chain
         * of the current nid for this page
         * content.
         */
        VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
        VM_BUG_ON(page_node->head != &migrate_nodes);
        list_del(&page_node->list);
        DO_NUMA(page_node->nid = nid);
        stable_node_chain_add_dup(page_node, stable_node);
        goto out;
}

/*
 * stable_tree_insert - insert stable tree node pointing to new ksm page
 * into the stable tree.
 *
 * The tree is selected by the NUMA node derived from the folio's pfn
 * (root_stable_tree + nid) and is ordered by page content as compared by
 * memcmp_pages(). If an entry with identical content already exists, the
 * new node is not linked into the rbtree itself but added as a duplicate
 * to that entry's chain (creating the chain first when it is missing).
 *
 * On success the new node is also recorded on @kfolio through
 * folio_set_stable_node().
 *
 * This function returns the stable tree node just allocated on success,
 * NULL otherwise (allocation of the node or of the chain failed).
 */
static struct ksm_stable_node *stable_tree_insert(struct folio *kfolio)
{
        int nid;
        unsigned long kpfn;
        struct rb_root *root;
        struct rb_node **new;
        struct rb_node *parent;
        struct ksm_stable_node *stable_node, *stable_node_dup;
        bool need_chain = false;

        kpfn = folio_pfn(kfolio);
        nid = get_kpfn_nid(kpfn);
        root = root_stable_tree + nid;
again:
        /* (Re)start the descent from the root of the tree. */
        parent = NULL;
        new = &root->rb_node;

        while (*new) {
                struct folio *tree_folio;
                int ret;

                cond_resched();
                stable_node = rb_entry(*new, struct ksm_stable_node, node);
                /*
                 * chain() may update both stable_node_dup and stable_node;
                 * the folio it returns carries a reference that is dropped
                 * with folio_put() right after the comparison below.
                 */
                tree_folio = chain(&stable_node_dup, &stable_node, root);
                if (!tree_folio) {
                        /*
                         * If we walked over a stale stable_node,
                         * ksm_get_folio() will call rb_erase() and it
                         * may rebalance the tree from under us. So
                         * restart the search from scratch. Returning
                         * NULL would be safe too, but we'd generate
                         * false negative insertions just because some
                         * stable_node was stale.
                         */
                        goto again;
                }

                ret = memcmp_pages(&kfolio->page, &tree_folio->page);
                folio_put(tree_folio);

                parent = *new;
                if (ret < 0)
                        new = &parent->rb_left;
                else if (ret > 0)
                        new = &parent->rb_right;
                else {
                        /*
                         * Identical content already in the tree: the new
                         * node has to go on that entry's chain instead.
                         * stable_node keeps pointing at the matching entry.
                         */
                        need_chain = true;
                        break;
                }
        }

        stable_node_dup = alloc_stable_node();
        if (!stable_node_dup)
                return NULL;

        /* Fresh node: no rmap_items hang off it yet. */
        INIT_HLIST_HEAD(&stable_node_dup->hlist);
        stable_node_dup->kpfn = kpfn;
        stable_node_dup->rmap_hlist_len = 0;
        DO_NUMA(stable_node_dup->nid = nid);
        if (!need_chain) {
                /* No match found: link at the leaf position reached above. */
                rb_link_node(&stable_node_dup->node, parent, new);
                rb_insert_color(&stable_node_dup->node, root);
        } else {
                if (!is_stable_node_chain(stable_node)) {
                        struct ksm_stable_node *orig = stable_node;
                        /* chain is missing so create it */
                        stable_node = alloc_stable_node_chain(orig, root);
                        if (!stable_node) {
                                /* Undo the node allocation made above. */
                                free_stable_node(stable_node_dup);
                                return NULL;
                        }
                }
                stable_node_chain_add_dup(stable_node_dup, stable_node);
        }

        folio_set_stable_node(kfolio, stable_node_dup);

        return stable_node_dup;
}

/*
 * unstable_tree_search_insert - find a page with identical content in the
 * unstable tree, or link rmap_item into the tree when there is none.
 *
 * The tree for the NUMA node of @page is walked, comparing @page against
 * the page behind each visited rmap_item with memcmp_pages(). Lookup and
 * insertion are combined because both need the very same descent.
 *
 * On a match the matching rmap_item is returned and its page is stored in
 * *@tree_pagep, still holding the reference taken during the walk. NULL is
 * returned when @rmap_item was inserted instead, and also when the walk is
 * abandoned: a tree page could not be obtained, the tree page is @page
 * itself, or the match lives on another node while merging across nodes
 * is disabled.
 */
static
struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item,
                                              struct page *page,
                                              struct page **tree_pagep)
{
        struct rb_node *parent = NULL;
        struct rb_node **node_link;
        struct rb_root *root;
        int nid;

        nid = get_kpfn_nid(page_to_pfn(page));
        root = root_unstable_tree + nid;

        for (node_link = &root->rb_node; *node_link; ) {
                struct ksm_rmap_item *candidate;
                struct page *candidate_page;
                int cmp;

                cond_resched();
                candidate = rb_entry(*node_link, struct ksm_rmap_item, node);
                candidate_page = get_mergeable_page(candidate);
                if (!candidate_page)
                        return NULL;

                /* Don't substitute a ksm page for a forked page. */
                if (candidate_page == page) {
                        put_page(candidate_page);
                        return NULL;
                }

                cmp = memcmp_pages(page, candidate_page);
                parent = *node_link;

                if (cmp) {
                        /* Content differs: release and keep descending. */
                        put_page(candidate_page);
                        node_link = cmp < 0 ? &parent->rb_left
                                            : &parent->rb_right;
                        continue;
                }

                if (!ksm_merge_across_nodes &&
                    page_to_nid(candidate_page) != nid) {
                        /*
                         * The tree page has been migrated to another NUMA
                         * node; it will be flushed out and put in the right
                         * unstable tree next time. Only merge with it when
                         * merging across nodes.
                         */
                        put_page(candidate_page);
                        return NULL;
                }

                /* Identical content: hand the referenced page to the caller. */
                *tree_pagep = candidate_page;
                return candidate;
        }

        /* Nothing identical in the tree: insert rmap_item at the leaf. */
        rmap_item->address |= UNSTABLE_FLAG | (ksm_scan.seqnr & SEQNR_MASK);
        DO_NUMA(rmap_item->nid = nid);
        rb_link_node(&rmap_item->node, parent, node_link);
        rb_insert_color(&rmap_item->node, root);

        ksm_pages_unshared++;
        return NULL;
}

/*
 * stable_tree_append - hook one more rmap_item onto the hlist of a stable
 * tree node, i.e. add another mapping that shares the node's ksm page.
 *
 * @rmap_item: item to attach; it is flagged STABLE_FLAG and pointed at
 *             @stable_node
 * @stable_node: node whose hlist and rmap_hlist_len are updated
 * @max_page_sharing_bypass: when true, exceeding ksm_max_page_sharing is
 *             not warned about
 *
 * Also maintains ksm_pages_sharing / ksm_pages_shared and the per-mm
 * ksm_merging_pages counter.
 */
static void stable_tree_append(struct ksm_rmap_item *rmap_item,
                               struct ksm_stable_node *stable_node,
                               bool max_page_sharing_bypass)
{
        /*
         * rmap won't find this mapping unless the rmap_item is inserted in
         * the right stable_node duplicate, and page migration could break
         * later if rmap breaks, so crashing here is no worse. Strictly only
         * rmap_hlist_len == STABLE_NODE_CHAIN needs checking, but any other
         * negative value seen here for the first time (rather than when
         * rmap_hlist_len is decreased) would be an underflow and a sign of
         * memory corruption in the stable_node.
         */
        BUG_ON(stable_node->rmap_hlist_len < 0);

        stable_node->rmap_hlist_len++;

        /* possibly non fatal but unexpected overflow, only warn */
        WARN_ON_ONCE(!max_page_sharing_bypass &&
                     stable_node->rmap_hlist_len > ksm_max_page_sharing);

        rmap_item->address |= STABLE_FLAG;
        rmap_item->head = stable_node;
        hlist_add_head(&rmap_item->hlist, &stable_node->hlist);

        /* An item with nothing after it on the hlist is the first sharer. */
        if (!rmap_item->hlist.next)
                ksm_pages_shared++;
        else
                ksm_pages_sharing++;

        rmap_item->mm->ksm_merging_pages++;
}

/*
 * cmp_and_merge_page - first see if page can be merged into the stable tree;
 * if not, compare checksum to previous and if it's the same, see if page can
 * be inserted into the unstable tree, or merged with a page already there and
 * both transferred to the stable tree.
 *
 * @page: the page that we are searching identical page to.
 * @rmap_item: the reverse mapping into the virtual address of this page
 *
 * Nothing is returned: every outcome (merged, inserted, skipped, failed)
 * is reflected only in the trees, the rmap_item and the ksm counters.
 */
static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item)
{
        struct ksm_rmap_item *tree_rmap_item;
        struct page *tree_page = NULL;
        struct ksm_stable_node *stable_node;
        struct folio *kfolio;
        unsigned int checksum;
        int err;
        bool max_page_sharing_bypass = false;

        stable_node = page_stable_node(page);
        if (stable_node) {
                /*
                 * The page already has a stable node. If that node is
                 * filed under a different NUMA node than its current kpfn
                 * belongs to, take it off its tree and park it on the
                 * migrate_nodes list.
                 */
                if (stable_node->head != &migrate_nodes &&
                    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
                    NUMA(stable_node->nid)) {
                        stable_node_dup_del(stable_node);
                        stable_node->head = &migrate_nodes;
                        list_add(&stable_node->list, stable_node->head);
                }
                /* Already attached to this in-tree stable node: done. */
                if (stable_node->head != &migrate_nodes &&
                    rmap_item->head == stable_node)
                        return;
                /*
                 * If it's a KSM fork, allow it to go over the sharing limit
                 * without warnings.
                 */
                if (!is_page_sharing_candidate(stable_node))
                        max_page_sharing_bypass = true;
        } else {
                remove_rmap_item_from_tree(rmap_item);

                /*
                 * If the hash value of the page has changed from the last time
                 * we calculated it, this page is changing frequently: therefore we
                 * don't want to insert it in the unstable tree, and we don't want
                 * to waste our time searching for something identical to it there.
                 */
                checksum = calc_checksum(page);
                if (rmap_item->oldchecksum != checksum) {
                        rmap_item->oldchecksum = checksum;
                        return;
                }

                /*
                 * A zero return ends processing of this page here
                 * (presumably the zero-page merge succeeded - confirm
                 * against try_to_merge_with_zero_page()).
                 */
                if (!try_to_merge_with_zero_page(rmap_item, page))
                        return;
        }

        /* Start by searching for the folio in the stable tree */
        kfolio = stable_tree_search(page);
        /* The search handed back our own page and we are already attached. */
        if (&kfolio->page == page && rmap_item->head == stable_node) {
                folio_put(kfolio);
                return;
        }

        remove_rmap_item_from_tree(rmap_item);

        if (kfolio) {
                /* -EBUSY is an error pointer, not a folio: nothing to put. */
                if (kfolio == ERR_PTR(-EBUSY))
                        return;

                err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page);
                if (!err) {
                        /*
                         * The page was successfully merged:
                         * add its rmap_item to the stable tree.
                         */
                        folio_lock(kfolio);
                        stable_tree_append(rmap_item, folio_stable_node(kfolio),
                                           max_page_sharing_bypass);
                        folio_unlock(kfolio);
                }
                /* Drop the reference obtained from stable_tree_search(). */
                folio_put(kfolio);
                return;
        }

        /* No stable match: fall back to the unstable tree. */
        tree_rmap_item =
                unstable_tree_search_insert(rmap_item, page, &tree_page);
        if (tree_rmap_item) {
                bool split;

                kfolio = try_to_merge_two_pages(rmap_item, page,
                                                tree_rmap_item, tree_page);
                /*
                 * If both pages we tried to merge belong to the same compound
                 * page, then we actually ended up increasing the reference
                 * count of the same compound page twice, and split_huge_page
                 * failed.
                 * Here we set a flag if that happened, and we use it later to
                 * try split_huge_page again. Since we call put_page right
                 * afterwards, the reference count will be correct and
                 * split_huge_page should succeed.
                 */
                split = PageTransCompound(page)
                        && compound_head(page) == compound_head(tree_page);
                /* Release the reference unstable_tree_search_insert() left us. */
                put_page(tree_page);
                if (kfolio) {
                        /*
                         * The pages were successfully merged: insert new
                         * node in the stable tree and add both rmap_items.
                         */
                        folio_lock(kfolio);
                        stable_node = stable_tree_insert(kfolio);
                        if (stable_node) {
                                stable_tree_append(tree_rmap_item, stable_node,
                                                   false);
                                stable_tree_append(rmap_item, stable_node,
                                                   false);
                        }
                        folio_unlock(kfolio);

                        /*
                         * If we fail to insert the page into the stable tree,
                         * we will have 2 virtual addresses that are pointing
                         * to a ksm page left outside the stable tree,
                         * in which case we need to break_cow on both.
                         */
                        if (!stable_node) {
                                break_cow(tree_rmap_item);
                                break_cow(rmap_item);
                        }
                } else if (split) {
                        /*
                         * We are here if we tried to merge two pages and
                         * failed because they both belonged to the same
                         * compound page. We will split the page now, but no
                         * merging will take place.
                         * We do not want to add the cost of a full lock; if
                         * the page is locked, it is better to skip it and
                         * perhaps try again later.
                         */
                        if (!trylock_page(page))
                                return;
                        split_huge_page(page);
                        unlock_page(page);
                }
        }
}

static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
                                            struct ksm_rmap_item **rmap_list,
                                            unsigned long addr)
{
        struct ksm_rmap_item *rmap_item;

        while (*rmap_list) {
                rmap_item = *rmap_list;
                if ((rmap_item->address & PAGE_MASK) == addr)
                        return rmap_item;
                if (rmap_item->address > addr)
                        break;
                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
                free_rmap_item(rmap_item);
        }

        rmap_item = alloc_rmap_item();
        if (rmap_item) {
                /* It has already been zeroed */
                rmap_item->mm = mm_slot->slot.mm;
                rmap_item->mm->ksm_rmap_items++;
                rmap_item->address = addr;
                rmap_item->rmap_list = *rmap_list;
                *rmap_list = rmap_item;
        }
        return rmap_item;
}

/*
 * Translate the age of an rmap_item into the number of scans its page may
 * be skipped for. The age reflects how often de-duplication has already
 * been tried without success; the older the item, the more scans are
 * skipped: 1 up to age 3, 2 up to age 5, 4 up to age 8, and 8 beyond that.
 *
 * @age: rmap_item age of page
 */
static unsigned int skip_age(rmap_age_t age)
{
        unsigned int skips;

        if (age > 8)
                skips = 8;
        else if (age > 5)
                skips = 4;
        else if (age > 3)
                skips = 2;
        else
                skips = 1;

        return skips;
}

/*
 * Decide whether the page behind @rmap_item is left out of the current scan.
 *
 * @folio: folio containing the page to check
 * @rmap_item: associated rmap_item of page
 *
 * Returns true when the page is skipped; in that case the skip is counted
 * in ksm_pages_skipped, one remaining skip is consumed and the rmap_item is
 * removed from its tree. The item's age is advanced (saturating at U8_MAX)
 * whenever smart scan is enabled and the folio is not a KSM folio.
 */
static bool should_skip_rmap_item(struct folio *folio,
                                  struct ksm_rmap_item *rmap_item)
{
        rmap_age_t prev_age;

        /*
         * Nothing is skipped without smart scan. Folios that are already
         * KSM are never skipped either: cmp_and_merge_page() will
         * essentially ignore them, but they still have to be processed
         * properly.
         */
        if (!ksm_smart_scan || folio_test_ksm(folio))
                return false;

        prev_age = rmap_item->age;
        if (prev_age != U8_MAX)
                rmap_item->age++;

        /*
         * Young items are not skipped, they need a chance to go through
         * the different phases of the KSM merging.
         */
        if (prev_age < 3)
                return false;

        if (rmap_item->remaining_skips) {
                /* Skip this page */
                ksm_pages_skipped++;
                rmap_item->remaining_skips--;
                remove_rmap_item_from_tree(rmap_item);
                return true;
        }

        /*
         * The skip budget is used up: scan the page this time and work out
         * how many of the following scans may skip it.
         */
        rmap_item->remaining_skips = skip_age(prev_age);
        return false;
}

/*
 * scan_get_next_rmap_item - advance the scan cursor (ksm_scan) to the next
 * page worth examining and return its rmap_item.
 *
 * @page: on a non-NULL return, set to the page to examine; a folio
 *        reference is held on it, which the caller has to drop (the
 *        visible caller ksm_do_scan() does so with put_page()).
 *
 * Returns NULL when the mm list is empty, when an rmap_item could not be
 * allocated, or when a full pass over all mms has just been completed (in
 * which case ksm_scan.seqnr has been incremented).
 */
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
        struct mm_struct *mm;
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct vm_area_struct *vma;
        struct ksm_rmap_item *rmap_item;
        struct vma_iterator vmi;
        int nid;

        if (list_empty(&ksm_mm_head.slot.mm_node))
                return NULL;

        mm_slot = ksm_scan.mm_slot;
        /* Cursor sitting on the list head: a new full pass begins here. */
        if (mm_slot == &ksm_mm_head) {
                advisor_start_scan();
                trace_ksm_start_scan(ksm_scan.seqnr, ksm_rmap_items);

                /*
                 * A number of pages can hang around indefinitely in per-cpu
                 * LRU cache, raised page count preventing write_protect_page
                 * from merging them.  Though it doesn't really matter much,
                 * it is puzzling to see some stuck in pages_volatile until
                 * other activity jostles them out, and they also prevented
                 * LTP's KSM test from succeeding deterministically; so drain
                 * them here (here rather than on entry to ksm_do_scan(),
                 * so we don't IPI too often when pages_to_scan is set low).
                 */
                lru_add_drain_all();

                /*
                 * Whereas stale stable_nodes on the stable_tree itself
                 * get pruned in the regular course of stable_tree_search(),
                 * those moved out to the migrate_nodes list can accumulate:
                 * so prune them once before each full scan.
                 */
                if (!ksm_merge_across_nodes) {
                        struct ksm_stable_node *stable_node, *next;
                        struct folio *folio;

                        list_for_each_entry_safe(stable_node, next,
                                                 &migrate_nodes, list) {
                                folio = ksm_get_folio(stable_node,
                                                      KSM_GET_FOLIO_NOLOCK);
                                if (folio)
                                        folio_put(folio);
                                cond_resched();
                        }
                }

                /* Every pass starts with empty unstable trees. */
                for (nid = 0; nid < ksm_nr_node_ids; nid++)
                        root_unstable_tree[nid] = RB_ROOT;

                /* Step the cursor from the list head to the first mm. */
                spin_lock(&ksm_mmlist_lock);
                slot = list_entry(mm_slot->slot.mm_node.next,
                                  struct mm_slot, mm_node);
                mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
                ksm_scan.mm_slot = mm_slot;
                spin_unlock(&ksm_mmlist_lock);
                /*
                 * Although we tested list_empty() above, a racing __ksm_exit
                 * of the last mm on the list may have removed it since then.
                 */
                if (mm_slot == &ksm_mm_head)
                        return NULL;
next_mm:
                /* Also entered from the bottom of the function for each further mm. */
                ksm_scan.address = 0;
                ksm_scan.rmap_list = &mm_slot->rmap_list;
        }

        slot = &mm_slot->slot;
        mm = slot->mm;
        vma_iter_init(&vmi, mm, ksm_scan.address);

        mmap_read_lock(mm);
        if (ksm_test_exit(mm))
                goto no_vmas;

        for_each_vma(vmi, vma) {
                if (!(vma->vm_flags & VM_MERGEABLE))
                        continue;
                if (ksm_scan.address < vma->vm_start)
                        ksm_scan.address = vma->vm_start;
                /* No anon_vma: jump to the end so the loop below is skipped. */
                if (!vma->anon_vma)
                        ksm_scan.address = vma->vm_end;

                while (ksm_scan.address < vma->vm_end) {
                        struct page *tmp_page = NULL;
                        struct folio_walk fw;
                        struct folio *folio;

                        if (ksm_test_exit(mm))
                                break;

                        folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
                        if (folio) {
                                /*
                                 * Only anonymous, non zone-device folios are
                                 * candidates; take a reference before the
                                 * walk is ended.
                                 */
                                if (!folio_is_zone_device(folio) &&
                                     folio_test_anon(folio)) {
                                        folio_get(folio);
                                        tmp_page = fw.page;
                                }
                                folio_walk_end(&fw, vma);
                        }

                        if (tmp_page) {
                                flush_anon_page(vma, tmp_page, ksm_scan.address);
                                flush_dcache_page(tmp_page);
                                rmap_item = get_next_rmap_item(mm_slot,
                                        ksm_scan.rmap_list, ksm_scan.address);
                                if (rmap_item) {
                                        ksm_scan.rmap_list =
                                                        &rmap_item->rmap_list;

                                        if (should_skip_rmap_item(folio, rmap_item)) {
                                                folio_put(folio);
                                                goto next_page;
                                        }

                                        /* Hand the page, with its reference, to the caller. */
                                        ksm_scan.address += PAGE_SIZE;
                                        *page = tmp_page;
                                } else {
                                        /*
                                         * Allocation failed: drop the
                                         * reference; NULL is returned below
                                         * with the address not advanced.
                                         */
                                        folio_put(folio);
                                }
                                mmap_read_unlock(mm);
                                return rmap_item;
                        }
next_page:
                        ksm_scan.address += PAGE_SIZE;
                        cond_resched();
                }
        }

        if (ksm_test_exit(mm)) {
no_vmas:
                ksm_scan.address = 0;
                ksm_scan.rmap_list = &mm_slot->rmap_list;
        }
        /*
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
        remove_trailing_rmap_items(ksm_scan.rmap_list);

        /* Move the cursor on to the following mm_slot. */
        spin_lock(&ksm_mmlist_lock);
        slot = list_entry(mm_slot->slot.mm_node.next,
                          struct mm_slot, mm_node);
        ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
        if (ksm_scan.address == 0) {
                /*
                 * We've completed a full scan of all vmas, holding mmap_lock
                 * throughout, and found no VM_MERGEABLE: so do the same as
                 * __ksm_exit does to remove this mm from all our lists now.
                 * This applies either when cleaning up after __ksm_exit
                 * (but beware: we can reach here even before __ksm_exit),
                 * or when all VM_MERGEABLE areas have been unmapped (and
                 * mmap_lock then protects against race with MADV_MERGEABLE).
                 */
                hash_del(&mm_slot->slot.hash);
                list_del(&mm_slot->slot.mm_node);
                spin_unlock(&ksm_mmlist_lock);

                mm_slot_free(mm_slot_cache, mm_slot);
                clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                mmap_read_unlock(mm);
                mmdrop(mm);
        } else {
                mmap_read_unlock(mm);
                /*
                 * mmap_read_unlock(mm) first because after
                 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
                 * already have been freed under us by __ksm_exit()
                 * because the "mm_slot" is still hashed and
                 * ksm_scan.mm_slot doesn't point to it anymore.
                 */
                spin_unlock(&ksm_mmlist_lock);
        }

        /* Repeat until we've completed scanning the whole list */
        mm_slot = ksm_scan.mm_slot;
        if (mm_slot != &ksm_mm_head)
                goto next_mm;

        /* Back at the list head: this pass is complete. */
        advisor_stop_scan();

        trace_ksm_stop_scan(ksm_scan.seqnr, ksm_rmap_items);
        ksm_scan.seqnr++;
        return NULL;
}

/**
 * ksm_do_scan  - the ksm scanner main worker function.
 * @scan_npages:  number of pages we want to scan before we return.
 *
 * Stops early when the current task is being frozen or when
 * scan_get_next_rmap_item() has nothing more to hand out. The page
 * reference obtained for each scanned page is dropped here.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
        for (; scan_npages && likely(!freezing(current)); scan_npages--) {
                struct ksm_rmap_item *item;
                struct page *page;

                cond_resched();
                item = scan_get_next_rmap_item(&page);
                if (!item)
                        return;

                cmp_and_merge_page(page, item);
                put_page(page);
                ksm_pages_scanned++;
        }
}

/*
 * Nonzero when scanning should run: KSM_RUN_MERGE is set in ksm_run and
 * at least one mm is registered on the ksm mm list.
 */
static int ksmd_should_run(void)
{
        if (!(ksm_run & KSM_RUN_MERGE))
                return 0;

        return !list_empty(&ksm_mm_head.slot.mm_node);
}

/*
 * Main loop of the scanning kthread. Each round scans a batch of
 * ksm_thread_pages_to_scan pages under ksm_thread_mutex, then either
 * sleeps for ksm_thread_sleep_millisecs (cut short when that setting
 * changes) or, when there is nothing to do, waits on ksm_thread_wait
 * until scanning is wanted again or the thread is asked to stop.
 *
 * @nothing: unused
 *
 * Returns 0 once kthread_should_stop() is true.
 */
static int ksm_scan_thread(void *nothing)
{
        set_freezable();
        set_user_nice(current, 5);

        while (!kthread_should_stop()) {
                unsigned int timeout_ms;

                mutex_lock(&ksm_thread_mutex);
                wait_while_offlining();
                if (ksmd_should_run())
                        ksm_do_scan(ksm_thread_pages_to_scan);
                mutex_unlock(&ksm_thread_mutex);

                if (!ksmd_should_run()) {
                        /* Idle: block until there is work or we must stop. */
                        wait_event_freezable(ksm_thread_wait,
                                ksmd_should_run() || kthread_should_stop());
                        continue;
                }

                /* Pause between batches; wake early if the interval changes. */
                timeout_ms = READ_ONCE(ksm_thread_sleep_millisecs);
                wait_event_freezable_timeout(ksm_iter_wait,
                        timeout_ms != READ_ONCE(ksm_thread_sleep_millisecs),
                        msecs_to_jiffies(timeout_ms));
        }
        return 0;
}

/*
 * Set VM_MERGEABLE on @vma unless it is already set or the vma is not
 * KSM compatible according to vma_ksm_compatible().
 */
static void __ksm_add_vma(struct vm_area_struct *vma)
{
        if (vma->vm_flags & VM_MERGEABLE)
                return;

        if (!vma_ksm_compatible(vma))
                return;

        vm_flags_set(vma, VM_MERGEABLE);
}

/*
 * Clear VM_MERGEABLE from @vma, unmerging its KSM pages first when the vma
 * has an anon_vma. A vma without VM_MERGEABLE is left alone.
 *
 * Returns 0 on success, or the error from unmerge_ksm_pages(), in which
 * case the flag stays set.
 */
static int __ksm_del_vma(struct vm_area_struct *vma)
{
        int err = 0;

        if (!(vma->vm_flags & VM_MERGEABLE))
                return 0;

        if (vma->anon_vma)
                err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
        if (err)
                return err;

        vm_flags_clear(vma, VM_MERGEABLE);
        return 0;
}
/**
 * ksm_add_vma - Mark vma as mergeable if compatible
 *
 * @vma:  Pointer to vma
 *
 * Does nothing unless the owning mm has MMF_VM_MERGE_ANY set.
 */
void ksm_add_vma(struct vm_area_struct *vma)
{
        if (!test_bit(MMF_VM_MERGE_ANY, &vma->vm_mm->flags))
                return;

        __ksm_add_vma(vma);
}

static void ksm_add_vmas(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        VMA_ITERATOR(vmi, mm, 0);
        for_each_vma(vmi, vma)
                __ksm_add_vma(vma);
}

static int ksm_del_vmas(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        int err;

        VMA_ITERATOR(vmi, mm, 0);
        for_each_vma(vmi, vma) {
                err = __ksm_del_vma(vma);
                if (err)
                        return err;
        }
        return 0;
}

/**
 * ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
 *                        compatible VMA's
 *
 * @mm:  Pointer to mm
 *
 * A no-op when MMF_VM_MERGE_ANY is already set. The mm is registered via
 * __ksm_enter() only if MMF_VM_MERGEABLE is not set yet.
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_enable_merge_any(struct mm_struct *mm)
{
        if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
                return 0;

        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
                int err = __ksm_enter(mm);

                if (err)
                        return err;
        }

        set_bit(MMF_VM_MERGE_ANY, &mm->flags);
        ksm_add_vmas(mm);

        return 0;
}

/**
 * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
 *                           previously enabled via ksm_enable_merge_any().
 *
 * Disabling merging implies unmerging any merged pages, like setting
 * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
 * merging on all compatible VMA's remains enabled.
 *
 * @mm: Pointer to mm
 *
 * Returns 0 on success, otherwise error code
 */
int ksm_disable_merge_any(struct mm_struct *mm)
{
        int err;

        if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
                return 0;

        err = ksm_del_vmas(mm);
        if (!err) {
                clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                return 0;
        }

        /* Unmerging failed part-way: mark the vmas mergeable again. */
        ksm_add_vmas(mm);
        return err;
}

/*
 * ksm_disable - turn KSM merging off for all vmas of @mm.
 *
 * The mmap lock must be held for writing (asserted). Nothing is done when
 * MMF_VM_MERGEABLE is not set. With MMF_VM_MERGE_ANY set the work is left
 * to ksm_disable_merge_any(), otherwise to ksm_del_vmas().
 *
 * Returns 0 on success, otherwise the error from the helper used.
 */
int ksm_disable(struct mm_struct *mm)
{
        mmap_assert_write_locked(mm);

        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
                return 0;

        return test_bit(MMF_VM_MERGE_ANY, &mm->flags) ?
                ksm_disable_merge_any(mm) : ksm_del_vmas(mm);
}

/*
 * ksm_madvise - apply MADV_MERGEABLE / MADV_UNMERGEABLE advice to a vma.
 *
 * @vma: vma the advice applies to
 * @start: start of the range, used when unmerging
 * @end: end of the range, used when unmerging
 * @advice: MADV_MERGEABLE or MADV_UNMERGEABLE; other values are ignored
 * @vm_flags: flags word in which VM_MERGEABLE is set or cleared; the vma's
 *            own vm_flags are not written here
 *
 * Returns 0 on success or when there was nothing to do, otherwise the error
 * from __ksm_enter() or unmerge_ksm_pages(), with *@vm_flags left unchanged.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        int err = 0;

        if (advice == MADV_MERGEABLE) {
                /* Already mergeable, or not eligible at all: nothing to do. */
                if ((vma->vm_flags & VM_MERGEABLE) || !vma_ksm_compatible(vma))
                        return 0;

                /* First mergeable area of this mm: register it with KSM. */
                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
                        err = __ksm_enter(mm);
                if (err)
                        return err;

                *vm_flags |= VM_MERGEABLE;
        } else if (advice == MADV_UNMERGEABLE) {
                if (!(*vm_flags & VM_MERGEABLE))
                        return 0;                /* just ignore the advice */

                if (vma->anon_vma)
                        err = unmerge_ksm_pages(vma, start, end, true);
                if (err)
                        return err;

                *vm_flags &= ~VM_MERGEABLE;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(ksm_madvise);

/*
 * __ksm_enter - register @mm with KSM.
 *
 * Allocates an mm_slot for @mm, inserts it into mm_slots_hash and into the
 * ksm mm list under ksm_mmlist_lock, sets MMF_VM_MERGEABLE and takes a
 * reference on the mm with mmgrab(). ksm_thread_wait is woken if the mm
 * list was seen empty before the insertion.
 *
 * Returns 0 on success, -ENOMEM if the mm_slot could not be allocated.
 */
int __ksm_enter(struct mm_struct *mm)
{
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        int needs_wakeup;

        mm_slot = mm_slot_alloc(mm_slot_cache);
        if (!mm_slot)
                return -ENOMEM;

        slot = &mm_slot->slot;

        /* Check ksm_run too?  Would need tighter locking */
        /* Note: sampled before ksm_mmlist_lock is taken. */
        needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node);

        spin_lock(&ksm_mmlist_lock);
        mm_slot_insert(mm_slots_hash, mm, slot);
        /*
         * When KSM_RUN_MERGE (or KSM_RUN_STOP),
         * insert just behind the scanning cursor, to let the area settle
         * down a little; when fork is followed by immediate exec, we don't
         * want ksmd to waste time setting up and tearing down an rmap_list.
         *
         * But when KSM_RUN_UNMERGE, it's important to insert ahead of its
         * scanning cursor, otherwise KSM pages in newly forked mms will be
         * missed: then we might as well insert at the end of the list.
         */
        if (ksm_run & KSM_RUN_UNMERGE)
                list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node);
        else
                list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
        spin_unlock(&ksm_mmlist_lock);

        set_bit(MMF_VM_MERGEABLE, &mm->flags);
        /* Reference on the mm; the paths that free the slot call mmdrop(). */
        mmgrab(mm);

        if (needs_wakeup)
                wake_up_interruptible(&ksm_thread_wait);

        trace_ksm_enter(mm);
        return 0;
}

void __ksm_exit(struct mm_struct *mm)
{
        struct ksm_mm_slot *mm_slot;
        struct mm_slot *slot;
        int easy_to_free = 0;

        /*
         * This process is exiting: if it's straightforward (as is the
         * case when ksmd was never running), free mm_slot immediately.
         * But if it's at the cursor or has rmap_items linked to it, use
         * mmap_lock to synchronize with any break_cows before pagetables
         * are freed, and leave the mm_slot on the list for ksmd to free.
         * Beware: ksm may already have noticed it exiting and freed the slot.
         */

        spin_lock(&ksm_mmlist_lock);
        slot = mm_slot_lookup(mm_slots_hash, mm);
        mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot);
        if (mm_slot && ksm_scan.mm_slot != mm_slot) {
                if (!mm_slot->rmap_list) {
                        hash_del(&slot->hash);
                        list_del(&slot->mm_node);
                        easy_to_free = 1;
                } else {
                        list_move(&slot->mm_node,
                                  &ksm_scan.mm_slot->slot.mm_node);
                }
        }
        spin_unlock(&ksm_mmlist_lock);

        if (easy_to_free) {
                mm_slot_free(mm_slot_cache, mm_slot);
                clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
                clear_bit(MMF_VM_MERGEABLE, &mm->flags);
                mmdrop(mm);
        } else if (mm_slot) {
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }

        trace_ksm_exit(mm);
}

/*
 * ksm_might_need_to_copy - decide whether @folio can be mapped at @addr in
 * @vma as it is, or whether a private copy has to be made first.
 *
 * Returns @folio itself when no copy is needed (or when it is not uptodate,
 * leaving the error report to do_swap_page), a freshly allocated, charged,
 * dirty, uptodate and locked copy otherwise, NULL if allocating or charging
 * the copy failed, or ERR_PTR(-EHWPOISON) if the source page is hardware
 * poisoned or the copy itself failed.
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page = folio_page(folio, 0);
        struct anon_vma *anon_vma = folio_anon_vma(folio);
        struct folio *copy;

        if (folio_test_large(folio))
                return folio;

        if (folio_test_ksm(folio)) {
                /* A stable KSM folio is fine unless we are unmerging. */
                if (folio_stable_node(folio) &&
                    !(ksm_run & KSM_RUN_UNMERGE))
                        return folio;
        } else {
                /* Not anonymous: nothing to copy. */
                if (!anon_vma)
                        return folio;
                /* Same index and same anon_vma root: still no copy needed. */
                if (folio->index == linear_page_index(vma, addr) &&
                    anon_vma->root == vma->anon_vma->root)
                        return folio;
        }

        if (PageHWPoison(page))
                return ERR_PTR(-EHWPOISON);
        if (!folio_test_uptodate(folio))
                return folio;        /* let do_swap_page report the error */

        copy = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
        if (!copy)
                return NULL;

        if (mem_cgroup_charge(copy, vma->vm_mm, GFP_KERNEL)) {
                folio_put(copy);
                return NULL;
        }

        if (copy_mc_user_highpage(folio_page(copy, 0), page, addr, vma)) {
                folio_put(copy);
                return ERR_PTR(-EHWPOISON);
        }

        folio_set_dirty(copy);
        __folio_mark_uptodate(copy);
        __folio_set_locked(copy);
#ifdef CONFIG_SWAP
        count_vm_event(KSM_SWPIN_COPY);
#endif

        return copy;
}

/*
 * rmap_walk_ksm - visit every mapping of a KSM folio via rwc->rmap_one().
 *
 * @folio must be a KSM folio and must be locked; both are asserted.  For
 * each rmap_item on the folio's stable node, the rmap_item's anon_vma is
 * read-locked and every vma in its interval tree that covers the
 * rmap_item's address is passed to rwc->rmap_one(), unless
 * rwc->invalid_vma() rejects that vma.
 *
 * The rmap_items are walked twice: the first pass only considers vmas that
 * belong to the rmap_item's own mm, the second pass only vmas of other mms.
 *
 * The walk ends early when rwc->rmap_one() returns false, when rwc->done()
 * returns true, or when an anon_vma lock is contended while rwc->try_lock
 * is set (rwc->contended is then set to true).
 */
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc)
{
        struct ksm_stable_node *stable_node;
        struct ksm_rmap_item *rmap_item;
        int search_new_forks = 0;        /* 0 on the first pass, 1 on the second */

        VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio);

        /*
         * Rely on the page lock to protect against concurrent modifications
         * to that page's node of the stable tree.
         */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        stable_node = folio_stable_node(folio);
        if (!stable_node)
                return;
again:
        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                struct anon_vma *anon_vma = rmap_item->anon_vma;
                struct anon_vma_chain *vmac;
                struct vm_area_struct *vma;

                cond_resched();
                /* Try the lock first so that try_lock callers never block. */
                if (!anon_vma_trylock_read(anon_vma)) {
                        if (rwc->try_lock) {
                                rwc->contended = true;
                                return;
                        }
                        anon_vma_lock_read(anon_vma);
                }
                anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
                                               0, ULONG_MAX) {
                        unsigned long addr;

                        cond_resched();
                        vma = vmac->vma;

                        /* Ignore the stable/unstable/sqnr flags */
                        addr = rmap_item->address & PAGE_MASK;

                        if (addr < vma->vm_start || addr >= vma->vm_end)
                                continue;
                        /*
                         * Initially we examine only the vma which covers this
                         * rmap_item; but later, if there is still work to do,
                         * we examine covering vmas in other mms: in case they
                         * were forked from the original since ksmd passed.
                         */
                        if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
                                continue;

                        if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                                continue;

                        /* Both early exits must drop the anon_vma lock. */
                        if (!rwc->rmap_one(folio, vma, addr, rwc->arg)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
                        if (rwc->done && rwc->done(folio)) {
                                anon_vma_unlock_read(anon_vma);
                                return;
                        }
                }
                anon_vma_unlock_read(anon_vma);
        }
        /* Run exactly one more pass, this time over the other mms. */
        if (!search_new_forks++)
                goto again;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Collect the processes to act on when a memory error hits a KSM page.
 *
 * For every rmap_item on the folio's stable node, and for every process for
 * which task_early_kill() returns a task, each vma on the rmap_item's
 * anon_vma that belongs to that task's mm is handed to add_to_kill_ksm()
 * together with the page-aligned address of the rmap_item.
 */
void collect_procs_ksm(const struct folio *folio, const struct page *page,
                struct list_head *to_kill, int force_early)
{
        struct ksm_stable_node *stable_node = folio_stable_node(folio);
        struct ksm_rmap_item *rmap_item;
        struct task_struct *tsk;

        if (!stable_node)
                return;

        hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
                struct anon_vma *anon_vma = rmap_item->anon_vma;

                anon_vma_lock_read(anon_vma);
                rcu_read_lock();
                for_each_process(tsk) {
                        struct task_struct *victim =
                                task_early_kill(tsk, force_early);
                        struct anon_vma_chain *avc;

                        if (!victim)
                                continue;

                        anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
                                                       0, ULONG_MAX) {
                                struct vm_area_struct *vma = avc->vma;

                                if (vma->vm_mm != victim->mm)
                                        continue;

                                add_to_kill_ksm(victim, page, vma, to_kill,
                                                rmap_item->address & PAGE_MASK);
                        }
                }
                rcu_read_unlock();
                anon_vma_unlock_read(anon_vma);
        }
}
#endif

#ifdef CONFIG_MIGRATION
/*
 * folio_migrate_ksm - retarget a stable node from @folio to @newfolio.
 *
 * Both folios must be locked and must already share the same ->mapping
 * (asserted).  If @folio has a stable node, its kpfn is switched to
 * @newfolio's pfn and only afterwards is @folio's stable node cleared.
 */
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
        struct ksm_stable_node *node;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
        VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);

        node = folio_stable_node(folio);
        if (!node)
                return;

        VM_BUG_ON_FOLIO(node->kpfn != folio_pfn(folio), folio);
        node->kpfn = folio_pfn(newfolio);
        /*
         * newfolio->mapping was set in advance; the write barrier makes
         * sure the new kpfn is visible to ksm_get_folio() before it can
         * observe that folio->mapping has gone stale (or that the
         * swapcache flag has been cleared).
         */
        smp_wmb();
        folio_set_stable_node(folio, NULL);
}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Block for as long as KSM_RUN_OFFLINE is set in ksm_run.
 *
 * ksm_thread_mutex is released around each wait and re-acquired before the
 * flag is rechecked; the callers in this file invoke this right after
 * taking ksm_thread_mutex, and it returns with the mutex held.
 */
static void wait_while_offlining(void)
{
        while (ksm_run & KSM_RUN_OFFLINE) {
                mutex_unlock(&ksm_thread_mutex);
                /* Sleep on the KSM_RUN_OFFLINE bit of ksm_run. */
                wait_on_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE),
                            TASK_UNINTERRUPTIBLE);
                mutex_lock(&ksm_thread_mutex);
        }
}

/*
 * Remove @stable_node from the stable tree if its kpfn lies within
 * [@start_pfn, @end_pfn).  Returns true if the node was removed.
 */
static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node,
                                         unsigned long start_pfn,
                                         unsigned long end_pfn)
{
        if (stable_node->kpfn < start_pfn || stable_node->kpfn >= end_pfn)
                return false;

        /*
         * No ksm_get_folio here, the page has already gone:
         * that is why kpfn is kept instead of a page pointer.
         */
        remove_node_from_stable_tree(stable_node);
        return true;
}

static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node,
                                           unsigned long start_pfn,
                                           unsigned long end_pfn,
                                           struct rb_root *root)
{
        struct ksm_stable_node *dup;
        struct hlist_node *hlist_safe;

        if (!is_stable_node_chain(stable_node)) {
                VM_BUG_ON(is_stable_node_dup(stable_node));
                return stable_node_dup_remove_range(stable_node, start_pfn,
                                                    end_pfn);
        }

        hlist_for_each_entry_safe(dup, hlist_safe,
                                  &stable_node->hlist, hlist_dup) {
                VM_BUG_ON(!is_stable_node_dup(dup));
                stable_node_dup_remove_range(dup, start_pfn, end_pfn);
        }
        if (hlist_empty(&stable_node->hlist)) {
                free_stable_node_chain(stable_node, root);
                return true; /* notify caller that tree was rebalanced */
        } else
                return false;
}

static void ksm_check_stable_tree(unsigned long start_pfn,
                                  unsigned long end_pfn)
{
        struct ksm_stable_node *stable_node, *next;
        struct rb_node *node;
        int nid;

        for (nid = 0; nid < ksm_nr_node_ids; nid++) {
                node = rb_first(root_stable_tree + nid);
                while (node) {
                        stable_node = rb_entry(node, struct ksm_stable_node, node);
                        if (stable_node_chain_remove_range(stable_node,
                                                           start_pfn, end_pfn,
                                                           root_stable_tree +
                                                           nid))
                                node = rb_first(root_stable_tree + nid);
                        else
                                node = rb_next(node);
                        cond_resched();
                }
        }
        list_for_each_entry_safe(stable_node, next, &migrate_nodes, list) {
                if (stable_node->kpfn >= start_pfn &&
                    stable_node->kpfn < end_pfn)
                        remove_node_from_stable_tree(stable_node);
                cond_resched();
        }
}

/*
 * Memory hotplug notifier for KSM.
 *
 * MEM_GOING_OFFLINE sets KSM_RUN_OFFLINE in ksm_run.  MEM_OFFLINE prunes
 * stable nodes in the offlined pfn range and then, like MEM_CANCEL_OFFLINE,
 * clears KSM_RUN_OFFLINE and wakes anyone waiting on that bit.  All other
 * actions are ignored.  Always returns NOTIFY_OK.
 */
static int ksm_memory_callback(struct notifier_block *self,
                               unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        if (action == MEM_GOING_OFFLINE) {
                /*
                 * Prevent ksm_do_scan(), unmerge_and_remove_all_rmap_items()
                 * and remove_all_stable_nodes() while memory is going offline:
                 * it is unsafe for them to touch the stable tree at this time.
                 * But unmerge_ksm_pages(), rmap lookups and other entry points
                 * which do not need the ksm_thread_mutex are all safe.
                 */
                mutex_lock(&ksm_thread_mutex);
                ksm_run |= KSM_RUN_OFFLINE;
                mutex_unlock(&ksm_thread_mutex);
                return NOTIFY_OK;
        }

        if (action != MEM_OFFLINE && action != MEM_CANCEL_OFFLINE)
                return NOTIFY_OK;

        if (action == MEM_OFFLINE) {
                /*
                 * Most of the work is done by page migration; but there might
                 * be a few stable_nodes left over, still pointing to struct
                 * pages which have been offlined: prune those from the tree,
                 * otherwise ksm_get_folio() might later try to access a
                 * non-existent struct page.
                 */
                ksm_check_stable_tree(mn->start_pfn,
                                      mn->start_pfn + mn->nr_pages);
        }

        /* Common tail for MEM_OFFLINE and MEM_CANCEL_OFFLINE. */
        mutex_lock(&ksm_thread_mutex);
        ksm_run &= ~KSM_RUN_OFFLINE;
        mutex_unlock(&ksm_thread_mutex);

        smp_mb();        /* wake_up_bit advises this */
        wake_up_bit(&ksm_run, ilog2(KSM_RUN_OFFLINE));

        return NOTIFY_OK;
}
#else
/* No-op variant used when CONFIG_MEMORY_HOTREMOVE is not set. */
static void wait_while_offlining(void)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

#ifdef CONFIG_PROC_FS
/*
 * The process is mergeable only if any VMA is currently
 * applicable to KSM.
 *
 * The mmap lock must be held in read mode.
 */
bool ksm_process_mergeable(struct mm_struct *mm)
{
        struct vm_area_struct *vma;

        mmap_assert_locked(mm);
        VMA_ITERATOR(vmi, mm, 0);
        for_each_vma(vmi, vma)
                if (vma->vm_flags & VM_MERGEABLE)
                        return true;

        return false;
}

/*
 * Per-process KSM profit in bytes: the merged pages plus KSM zero pages of
 * @mm, times PAGE_SIZE, minus the space taken by its rmap_items.
 */
long ksm_process_profit(struct mm_struct *mm)
{
        long saved_pages = (long)(mm->ksm_merging_pages + mm_ksm_zero_pages(mm));

        return saved_pages * PAGE_SIZE -
                mm->ksm_rmap_items * sizeof(struct ksm_rmap_item);
}
#endif /* CONFIG_PROC_FS */

#ifdef CONFIG_SYSFS
/*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
 */

/*
 * KSM_ATTR_RO(name) defines a static struct kobj_attribute called
 * name_attr, initialised with __ATTR_RO(name); KSM_ATTR(name) does the same
 * with __ATTR_RW(name).  The handlers below follow the name_show /
 * name_store naming that those initialisers presumably expect.
 */
#define KSM_ATTR_RO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
#define KSM_ATTR(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RW(_name)

/* sysfs read: emit ksm_thread_sleep_millisecs as an unsigned decimal line. */
static ssize_t sleep_millisecs_show(struct kobject *kobj,
                                    struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
        return len;
}

/*
 * sysfs write: parse a base-10 unsigned int into ksm_thread_sleep_millisecs
 * and wake waiters on ksm_iter_wait.  Returns @count, or -EINVAL if the
 * input does not parse.
 */
static ssize_t sleep_millisecs_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned int msecs;

        if (kstrtouint(buf, 10, &msecs))
                return -EINVAL;

        ksm_thread_sleep_millisecs = msecs;
        wake_up_interruptible(&ksm_iter_wait);

        return count;
}
KSM_ATTR(sleep_millisecs);

/* sysfs read: emit ksm_thread_pages_to_scan as an unsigned decimal line. */
static ssize_t pages_to_scan_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
        return len;
}

/*
 * sysfs write: set ksm_thread_pages_to_scan from a base-10 unsigned int.
 * Rejected with -EINVAL while an advisor other than KSM_ADVISOR_NONE is
 * selected, or when the input does not parse.  Returns @count on success.
 */
static ssize_t pages_to_scan_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned int nr_pages;

        /* The advisor check comes first, before the input is even parsed. */
        if (ksm_advisor != KSM_ADVISOR_NONE)
                return -EINVAL;

        if (kstrtouint(buf, 10, &nr_pages))
                return -EINVAL;

        ksm_thread_pages_to_scan = nr_pages;
        return count;
}
KSM_ATTR(pages_to_scan);

/* sysfs read: emit the current ksm_run value as an unsigned long line. */
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
                        char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_run);
        return len;
}

/*
 * sysfs write: change the KSM run state.
 *
 * Accepts a base-10 value no greater than KSM_RUN_UNMERGE; anything else
 * yields -EINVAL.  KSM_RUN_MERGE sets ksmd running and 0 stops it.
 * KSM_RUN_UNMERGE stops it and unmerges all rmap_items, breaking COW to
 * free the pages_shared (but leaves mm_slots on the list for when ksmd may
 * be set running again).  If unmerging fails, ksm_run falls back to
 * KSM_RUN_STOP and the error is returned instead of @count.
 */
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
                         const char *buf, size_t count)
{
        unsigned int flags;
        int err;

        err = kstrtouint(buf, 10, &flags);
        if (err || flags > KSM_RUN_UNMERGE)
                return -EINVAL;

        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();
        if (ksm_run == flags)
                goto unlock;

        ksm_run = flags;
        if (flags & KSM_RUN_UNMERGE) {
                set_current_oom_origin();
                err = unmerge_and_remove_all_rmap_items();
                clear_current_oom_origin();
                if (err) {
                        ksm_run = KSM_RUN_STOP;
                        count = err;
                }
        }

unlock:
        mutex_unlock(&ksm_thread_mutex);

        /* The wakeup depends only on the requested flags. */
        if (flags & KSM_RUN_MERGE)
                wake_up_interruptible(&ksm_thread_wait);

        return count;
}
KSM_ATTR(run);

#ifdef CONFIG_NUMA
/* sysfs read: emit ksm_merge_across_nodes as an unsigned decimal line. */
static ssize_t merge_across_nodes_show(struct kobject *kobj,
                                       struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
        return len;
}

/*
 * sysfs write: switch ksm_merge_across_nodes between 0 and 1.
 *
 * Returns the kstrtoul() error for unparsable input, -EINVAL for values
 * above 1, -EBUSY if pages are still shared or the stable nodes could not
 * all be removed, -ENOMEM if the per-node root array cannot be allocated,
 * and @count otherwise (including when the value is unchanged).
 */
static ssize_t merge_across_nodes_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long knob;
        int err;

        err = kstrtoul(buf, 10, &knob);
        if (err)
                return err;
        if (knob > 1)
                return -EINVAL;

        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();

        if (ksm_merge_across_nodes == knob)
                goto unlock;

        if (ksm_pages_shared || remove_all_stable_nodes()) {
                err = -EBUSY;
                goto unlock;
        }

        if (root_stable_tree == one_stable_tree) {
                struct rb_root *roots;

                /*
                 * This is the first time that we switch away from the
                 * default of merging across nodes: must now allocate
                 * a buffer to hold as many roots as may be needed.
                 * Allocate stable and unstable together:
                 * MAXSMP NODES_SHIFT 10 will use 16kB.
                 */
                roots = kcalloc(nr_node_ids + nr_node_ids, sizeof(*roots),
                                GFP_KERNEL);
                /* Relies on an empty RB_ROOT being all-zero (NULL) */
                if (!roots) {
                        err = -ENOMEM;
                        goto unlock;
                }

                root_stable_tree = roots;
                root_unstable_tree = roots + nr_node_ids;
                /* Stable tree is empty but not the unstable */
                root_unstable_tree[0] = one_unstable_tree[0];
        }

        ksm_merge_across_nodes = knob;
        ksm_nr_node_ids = knob ? 1 : nr_node_ids;

unlock:
        mutex_unlock(&ksm_thread_mutex);

        return err ? err : count;
}
KSM_ATTR(merge_across_nodes);
#endif

/* sysfs read: emit ksm_use_zero_pages as an unsigned decimal line. */
static ssize_t use_zero_pages_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
        return len;
}
/*
 * sysfs write: set ksm_use_zero_pages from a boolean string as understood
 * by kstrtobool().  Returns @count, or -EINVAL if the input does not parse.
 */
static ssize_t use_zero_pages_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        bool value;

        if (kstrtobool(buf, &value))
                return -EINVAL;

        ksm_use_zero_pages = value;
        return count;
}
KSM_ATTR(use_zero_pages);

/* sysfs read: emit ksm_max_page_sharing as an unsigned decimal line. */
static ssize_t max_page_sharing_show(struct kobject *kobj,
                                     struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
        return len;
}

/*
 * sysfs write: change ksm_max_page_sharing.
 *
 * Returns the kstrtoint() error for unparsable input, -EINVAL for values
 * below 2, -EBUSY if pages are still shared or the stable nodes could not
 * all be removed, and @count otherwise (including when the value is
 * unchanged).
 */
static ssize_t max_page_sharing_store(struct kobject *kobj,
                                      struct kobj_attribute *attr,
                                      const char *buf, size_t count)
{
        int knob;
        int err;

        err = kstrtoint(buf, 10, &knob);
        if (err)
                return err;

        /*
         * A freshly created KSM page is already shared by 2 mappings.
         * The comparison is signed, so it also rejects negative input.
         */
        if (knob < 2)
                return -EINVAL;

        /* Cheap unlocked check: nothing to do if the value is unchanged. */
        if (READ_ONCE(ksm_max_page_sharing) == knob)
                return count;

        mutex_lock(&ksm_thread_mutex);
        wait_while_offlining();

        if (ksm_max_page_sharing == knob)
                goto unlock;

        if (ksm_pages_shared || remove_all_stable_nodes())
                err = -EBUSY;
        else
                ksm_max_page_sharing = knob;

unlock:
        mutex_unlock(&ksm_thread_mutex);

        return err ? err : count;
}
KSM_ATTR(max_page_sharing);

/* sysfs read: emit ksm_pages_scanned as an unsigned long line. */
static ssize_t pages_scanned_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_pages_scanned);
        return len;
}
KSM_ATTR_RO(pages_scanned);

/* sysfs read: emit ksm_pages_shared as an unsigned long line. */
static ssize_t pages_shared_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_pages_shared);
        return len;
}
KSM_ATTR_RO(pages_shared);

/* sysfs read: emit ksm_pages_sharing as an unsigned long line. */
static ssize_t pages_sharing_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
        return len;
}
KSM_ATTR_RO(pages_sharing);

/* sysfs read: emit ksm_pages_unshared as an unsigned long line. */
static ssize_t pages_unshared_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
        return len;
}
KSM_ATTR_RO(pages_unshared);

static ssize_t pages_volatile_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        long ksm_pages_volatile;

        ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared
                                - ksm_pages_sharing - ksm_pages_unshared;
        /*
         * It was not worth any locking to calculate that statistic,
         * but it might therefore sometimes be negative: conceal that.
         */
        if (ksm_pages_volatile < 0)
                ksm_pages_volatile = 0;
        return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);

/* sysfs read: emit ksm_pages_skipped as an unsigned long line. */
static ssize_t pages_skipped_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_pages_skipped);
        return len;
}
KSM_ATTR_RO(pages_skipped);

/* sysfs read: emit the current value of the ksm_zero_pages atomic counter. */
static ssize_t ksm_zero_pages_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%ld\n", atomic_long_read(&ksm_zero_pages));
        return len;
}
KSM_ATTR_RO(ksm_zero_pages);

/*
 * sysfs read: emit the system-wide KSM profit in bytes, i.e. sharing pages
 * plus KSM zero pages times PAGE_SIZE, minus the space used by rmap_items.
 */
static ssize_t general_profit_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        long profit = (ksm_pages_sharing + atomic_long_read(&ksm_zero_pages)) * PAGE_SIZE -
                        ksm_rmap_items * sizeof(struct ksm_rmap_item);

        return sysfs_emit(buf, "%ld\n", profit);
}
KSM_ATTR_RO(general_profit);

/* sysfs read: emit ksm_stable_node_dups as an unsigned long line. */
static ssize_t stable_node_dups_show(struct kobject *kobj,
                                     struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
        return len;
}
KSM_ATTR_RO(stable_node_dups);

/* sysfs read: emit ksm_stable_node_chains as an unsigned long line. */
static ssize_t stable_node_chains_show(struct kobject *kobj,
                                       struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
        return len;
}
KSM_ATTR_RO(stable_node_chains);

/* sysfs read: emit ksm_stable_node_chains_prune_millisecs as "%u\n". */
static ssize_t
stable_node_chains_prune_millisecs_show(struct kobject *kobj,
                                        struct kobj_attribute *attr,
                                        char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
        return len;
}

/*
 * sysfs write: set ksm_stable_node_chains_prune_millisecs from a base-10
 * unsigned int.  Returns @count, or -EINVAL if the input does not parse.
 */
static ssize_t
stable_node_chains_prune_millisecs_store(struct kobject *kobj,
                                         struct kobj_attribute *attr,
                                         const char *buf, size_t count)
{
        unsigned int msecs;

        if (kstrtouint(buf, 10, &msecs))
                return -EINVAL;

        ksm_stable_node_chains_prune_millisecs = msecs;
        return count;
}
KSM_ATTR(stable_node_chains_prune_millisecs);

/* sysfs read: emit ksm_scan.seqnr as an unsigned long line. */
static ssize_t full_scans_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
        return len;
}
KSM_ATTR_RO(full_scans);

/* sysfs read: emit ksm_smart_scan as an unsigned decimal line. */
static ssize_t smart_scan_show(struct kobject *kobj,
                               struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_smart_scan);
        return len;
}

/*
 * sysfs write: set ksm_smart_scan from a boolean string as understood by
 * kstrtobool().  Returns @count, or -EINVAL if the input does not parse.
 */
static ssize_t smart_scan_store(struct kobject *kobj,
                                struct kobj_attribute *attr,
                                const char *buf, size_t count)
{
        bool value;

        if (kstrtobool(buf, &value))
                return -EINVAL;

        ksm_smart_scan = value;
        return count;
}
KSM_ATTR(smart_scan);

/*
 * sysfs read: list the available advisor modes, with the currently selected
 * one in square brackets.
 *
 * If ksm_advisor ever holds a value other than the two handled here, both
 * modes are listed with none of them bracketed, rather than passing an
 * uninitialised pointer to sysfs_emit().
 */
static ssize_t advisor_mode_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
{
        const char *output;

        switch (ksm_advisor) {
        case KSM_ADVISOR_NONE:
                output = "[none] scan-time";
                break;
        case KSM_ADVISOR_SCAN_TIME:
                output = "none [scan-time]";
                break;
        default:
                /* Unexpected value: show the choices, select none. */
                output = "none scan-time";
                break;
        }

        return sysfs_emit(buf, "%s\n", output);
}

/*
 * sysfs write: select the advisor mode, "scan-time" or "none".
 *
 * Any other input returns -EINVAL and leaves ksm_advisor untouched.  When
 * the mode actually changes, set_advisor_defaults() is called.  Returns
 * @count on success.
 */
static ssize_t advisor_mode_store(struct kobject *kobj,
                                  struct kobj_attribute *attr, const char *buf,
                                  size_t count)
{
        enum ksm_advisor_type prev = ksm_advisor;
        enum ksm_advisor_type next;

        if (sysfs_streq("scan-time", buf))
                next = KSM_ADVISOR_SCAN_TIME;
        else if (sysfs_streq("none", buf))
                next = KSM_ADVISOR_NONE;
        else
                return -EINVAL;

        ksm_advisor = next;

        /* Only reset to the advisor defaults on a real mode switch. */
        if (ksm_advisor != prev)
                set_advisor_defaults();

        return count;
}
KSM_ATTR(advisor_mode);

/* sysfs read: emit ksm_advisor_max_cpu as an unsigned decimal line. */
static ssize_t advisor_max_cpu_show(struct kobject *kobj,
                                    struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%u\n", ksm_advisor_max_cpu);
        return len;
}

/*
 * sysfs write: set ksm_advisor_max_cpu from a base-10 unsigned long.
 * Returns @count, or -EINVAL if the input does not parse.  No range check
 * is applied to the parsed value.
 */
static ssize_t advisor_max_cpu_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned long value;

        if (kstrtoul(buf, 10, &value))
                return -EINVAL;

        ksm_advisor_max_cpu = value;
        return count;
}
KSM_ATTR(advisor_max_cpu);

/* sysfs read: emit ksm_advisor_min_pages_to_scan as an unsigned long line. */
static ssize_t advisor_min_pages_to_scan_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_advisor_min_pages_to_scan);
        return len;
}

/*
 * sysfs write: set ksm_advisor_min_pages_to_scan from a base-10 unsigned
 * long.  Returns @count, or -EINVAL if the input does not parse.
 */
static ssize_t advisor_min_pages_to_scan_store(struct kobject *kobj,
                                        struct kobj_attribute *attr,
                                        const char *buf, size_t count)
{
        unsigned long value;

        if (kstrtoul(buf, 10, &value))
                return -EINVAL;

        ksm_advisor_min_pages_to_scan = value;
        return count;
}
KSM_ATTR(advisor_min_pages_to_scan);

/* sysfs read: emit ksm_advisor_max_pages_to_scan as an unsigned long line. */
static ssize_t advisor_max_pages_to_scan_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_advisor_max_pages_to_scan);
        return len;
}

/*
 * sysfs write: set ksm_advisor_max_pages_to_scan from a base-10 unsigned
 * long.  Returns @count, or -EINVAL if the input does not parse.
 */
static ssize_t advisor_max_pages_to_scan_store(struct kobject *kobj,
                                        struct kobj_attribute *attr,
                                        const char *buf, size_t count)
{
        unsigned long value;

        if (kstrtoul(buf, 10, &value))
                return -EINVAL;

        ksm_advisor_max_pages_to_scan = value;
        return count;
}
KSM_ATTR(advisor_max_pages_to_scan);

/* sysfs read: emit ksm_advisor_target_scan_time as an unsigned long line. */
static ssize_t advisor_target_scan_time_show(struct kobject *kobj,
                                             struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = sysfs_emit(buf, "%lu\n", ksm_advisor_target_scan_time);
        return len;
}

/*
 * sysfs write: set ksm_advisor_target_scan_time from a base-10 unsigned
 * long.  Returns @count, or -EINVAL if the input does not parse or is zero.
 */
static ssize_t advisor_target_scan_time_store(struct kobject *kobj,
                                              struct kobj_attribute *attr,
                                              const char *buf, size_t count)
{
        unsigned long value;

        /* Zero is rejected: the target scan time must be at least 1. */
        if (kstrtoul(buf, 10, &value) || value < 1)
                return -EINVAL;

        ksm_advisor_target_scan_time = value;
        return count;
}
KSM_ATTR(advisor_target_scan_time);

/*
 * NULL-terminated table of every KSM sysfs attribute defined above via
 * KSM_ATTR()/KSM_ATTR_RO(); it is what ksm_attr_group exposes.  The
 * merge_across_nodes entry only exists with CONFIG_NUMA.
 */
static struct attribute *ksm_attrs[] = {
        &sleep_millisecs_attr.attr,
        &pages_to_scan_attr.attr,
        &run_attr.attr,
        &pages_scanned_attr.attr,
        &pages_shared_attr.attr,
        &pages_sharing_attr.attr,
        &pages_unshared_attr.attr,
        &pages_volatile_attr.attr,
        &pages_skipped_attr.attr,
        &ksm_zero_pages_attr.attr,
        &full_scans_attr.attr,
#ifdef CONFIG_NUMA
        &merge_across_nodes_attr.attr,
#endif
        &max_page_sharing_attr.attr,
        &stable_node_chains_attr.attr,
        &stable_node_dups_attr.attr,
        &stable_node_chains_prune_millisecs_attr.attr,
        &use_zero_pages_attr.attr,
        &general_profit_attr.attr,
        &smart_scan_attr.attr,
        &advisor_mode_attr.attr,
        &advisor_max_cpu_attr.attr,
        &advisor_min_pages_to_scan_attr.attr,
        &advisor_max_pages_to_scan_attr.attr,
        &advisor_target_scan_time_attr.attr,
        NULL,        /* terminator */
};

/*
 * Attribute group named "ksm" holding ksm_attrs; ksm_init() registers it
 * on mm_kobj with sysfs_create_group().
 */
static const struct attribute_group ksm_attr_group = {
        .attrs = ksm_attrs,
        .name = "ksm",
};
#endif /* CONFIG_SYSFS */

/*
 * ksm_init - one-time KSM initialisation, run as a subsys_initcall.
 *
 * Computes the zero page checksum, calls ksm_slab_init(), starts the "ksmd"
 * kernel thread and, with CONFIG_SYSFS, creates the ksm attribute group on
 * mm_kobj.  Without CONFIG_SYSFS, ksm_run is set to KSM_RUN_MERGE instead.
 * With CONFIG_MEMORY_HOTREMOVE, ksm_memory_callback is registered as a
 * memory hotplug notifier.
 *
 * Returns 0 on success or a negative errno.  A failure after
 * ksm_slab_init() succeeded goes through ksm_slab_free(); a sysfs failure
 * additionally stops the ksmd thread first.
 */
static int __init ksm_init(void)
{
        struct task_struct *ksm_thread;
        int err;

        /* The correct value depends on page size and endianness */
        zero_checksum = calc_checksum(ZERO_PAGE(0));
        /* Default to false for backwards compatibility */
        ksm_use_zero_pages = false;

        err = ksm_slab_init();
        if (err)
                goto out;

        ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
        if (IS_ERR(ksm_thread)) {
                pr_err("ksm: creating kthread failed\n");
                err = PTR_ERR(ksm_thread);
                goto out_free;
        }

#ifdef CONFIG_SYSFS
        err = sysfs_create_group(mm_kobj, &ksm_attr_group);
        if (err) {
                pr_err("ksm: register sysfs failed\n");
                /* ksmd was already started above: stop it before unwinding. */
                kthread_stop(ksm_thread);
                goto out_free;
        }
#else
        ksm_run = KSM_RUN_MERGE;        /* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
        /* There is no significance to the notifier priority KSM_CALLBACK_PRI */
        hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI);
#endif
        return 0;

out_free:
        ksm_slab_free();
out:
        return err;
}
subsys_initcall(ksm_init);




























































    8 










































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_NS_H
#define _LINUX_PID_NS_H

#include <linux/sched.h>
#include <linux/bug.h>
#include <linux/mm.h>
#include <linux/workqueue.h>
#include <linux/threads.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/idr.h>

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

struct fs_pin;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/* modes for vm.memfd_noexec sysctl */
#define MEMFD_NOEXEC_SCOPE_EXEC                        0 /* MFD_EXEC implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL                1 /* MFD_NOEXEC_SEAL implied if unset */
#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED        2 /* same as 1, except MFD_EXEC rejected */
#endif

/*
 * State kept for one PID namespace.
 *
 * Namespaces link to their ancestor through ->parent (this is the chain
 * pidns_memfd_noexec_scope() walks), and get_pid_ns() takes references by
 * incrementing ns.count.  The struct is marked __randomize_layout, so code
 * must not depend on the member order.
 */
struct pid_namespace {
        struct idr idr;
        struct rcu_head rcu;
        unsigned int pid_allocated;
        struct task_struct *child_reaper;
        struct kmem_cache *pid_cachep;
        /* presumably the nesting depth, cf. MAX_PID_NS_LEVEL -- TODO confirm */
        unsigned int level;
        int pid_max;
        /* enclosing namespace; the walk in pidns_memfd_noexec_scope() ends at NULL */
        struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
        struct fs_pin *bacct;
#endif
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        int reboot;        /* group exit code if this pidns was rebooted */
        /* common namespace part; ns.count is the reference count used by get_pid_ns() */
        struct ns_common ns;
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#if defined(CONFIG_MEMFD_CREATE)
        /* presumably one of the MEMFD_NOEXEC_SCOPE_* modes (vm.memfd_noexec) */
        int memfd_noexec_scope;
#endif
#endif
} __randomize_layout;

extern struct pid_namespace init_pid_ns;

#define PIDNS_ADDING (1U << 31)

#ifdef CONFIG_PID_NS
/*
 * Take a reference on @ns and return the same pointer.
 *
 * The reference is an increment of ns->ns.count.  init_pid_ns is special:
 * its count is left alone and the pointer is simply handed back.
 */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        if (ns == &init_pid_ns)
                return ns;

        refcount_inc(&ns->ns.count);
        return ns;
}

#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
/*
 * Return the numerically largest memfd_noexec_scope found in @ns and all
 * of its ancestors (following ->parent until NULL).
 *
 * Each level is sampled exactly once with READ_ONCE().  The search starts
 * at MEMFD_NOEXEC_SCOPE_EXEC, which is also the result for a NULL @ns.
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        int strictest = MEMFD_NOEXEC_SCOPE_EXEC;

        while (ns) {
                int cur = READ_ONCE(ns->memfd_noexec_scope);

                if (cur > strictest)
                        strictest = cur;
                ns = ns->parent;
        }

        return strictest;
}
#else
/*
 * Fallback used when CONFIG_SYSCTL and CONFIG_MEMFD_CREATE are not both
 * enabled: @ns is ignored and 0 is returned unconditionally.
 */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}
#endif

extern struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);

#else /* !CONFIG_PID_NS */
#include <linux/err.h>

/* !CONFIG_PID_NS stub: no reference is taken, @ns is returned unchanged. */
static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
{
        return ns;
}

/* !CONFIG_PID_NS stub: @ns is ignored and 0 is returned unconditionally. */
static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns)
{
        return 0;
}

/*
 * !CONFIG_PID_NS stub.
 *
 * A request for a new PID namespace (CLONE_NEWPID set in @flags) cannot be
 * honoured and yields ERR_PTR(-EINVAL).  Any other request returns @ns
 * itself; @user_ns is unused.
 */
static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
        struct user_namespace *user_ns, struct pid_namespace *ns)
{
        if (flags & CLONE_NEWPID)
                return ERR_PTR(-EINVAL);

        return ns;
}

/*
 * !CONFIG_PID_NS stub: does nothing (the matching get_pid_ns() stub takes
 * no reference either).
 */
static inline void put_pid_ns(struct pid_namespace *ns)
{
}

/* !CONFIG_PID_NS stub: hits BUG() if it is ever called. */
static inline void zap_pid_ns_processes(struct pid_namespace *ns)
{
        BUG();
}

/* !CONFIG_PID_NS stub: both arguments are ignored; always returns 0. */
static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
        return 0;
}
#endif /* CONFIG_PID_NS */

extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
int register_pidns_sysctls(struct pid_namespace *pidns);
void unregister_pidns_sysctls(struct pid_namespace *pidns);

/*
 * Return true when the namespace reported by task_active_pid_ns() for @tsk
 * is init_pid_ns, false otherwise.
 */
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
        struct pid_namespace *active = task_active_pid_ns(tsk);

        return active == &init_pid_ns;
}

#endif /* _LINUX_PID_NS_H */









































































































































































































































































































































































































































  231 
















































  232 

  231 
  232 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/mm_inline.h>
#include <linux/kthread.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/rcupdate_wait.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/dax.h>
#include <linux/ksm.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "mm_slot.h"

/*
 * Result codes used by the collapse/scan code in this file.  SCAN_FAIL is 0
 * and SCAN_SUCCEED is 1; every further enumerator names one specific outcome.
 *
 * NOTE(review): trace/events/huge_memory.h is included right after this enum
 * with CREATE_TRACE_POINTS defined -- presumably the trace events depend on
 * these values, so confirm before inserting or reordering entries.
 */
enum scan_result {
        SCAN_FAIL,
        SCAN_SUCCEED,
        SCAN_PMD_NULL,
        SCAN_PMD_NONE,
        SCAN_PMD_MAPPED,
        SCAN_EXCEED_NONE_PTE,
        SCAN_EXCEED_SWAP_PTE,
        SCAN_EXCEED_SHARED_PTE,
        SCAN_PTE_NON_PRESENT,
        SCAN_PTE_UFFD_WP,
        SCAN_PTE_MAPPED_HUGEPAGE,
        SCAN_PAGE_RO,
        SCAN_LACK_REFERENCED_PAGE,
        SCAN_PAGE_NULL,
        SCAN_SCAN_ABORT,
        SCAN_PAGE_COUNT,
        SCAN_PAGE_LRU,
        SCAN_PAGE_LOCK,
        SCAN_PAGE_ANON,
        SCAN_PAGE_COMPOUND,
        SCAN_ANY_PROCESS,
        SCAN_VMA_NULL,
        SCAN_VMA_CHECK,
        SCAN_ADDRESS_RANGE,
        SCAN_DEL_PAGE_LRU,
        SCAN_ALLOC_HUGE_PAGE_FAIL,
        SCAN_CGROUP_CHARGE_FAIL,
        SCAN_TRUNCATED,
        SCAN_PAGE_HAS_PRIVATE,
        SCAN_STORE_FAILED,
        SCAN_COPY_MC,
        SCAN_PAGE_FILLED,
};

#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>

static struct task_struct *khugepaged_thread __read_mostly;
static DEFINE_MUTEX(khugepaged_mutex);

/*
 * default scan 8*HPAGE_PMD_NR pte (or vmas) every 10 seconds: pages_to_scan
 * is set in khugepaged_init(), scan_sleep_millisecs defaults to 10000 below
 */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
static unsigned int khugepaged_full_scans;
static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
/* during fragmentation poll the hugepage allocator once every minute */
static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
static unsigned long khugepaged_sleep_expire;
static DEFINE_SPINLOCK(khugepaged_mm_lock);
static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
/*
 * default collapse hugepages if there is at least one pte mapped like
 * it would have happened if the vma was large enough during page
 * fault.
 *
 * Note that these are only respected if collapse was initiated by khugepaged.
 */
unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;

/* mm_slots_hash is sized with 10 hash bits */
#define MM_SLOTS_HASH_BITS 10
static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);

/* kmem cache for struct khugepaged_mm_slot, created in khugepaged_init() */
static struct kmem_cache *mm_slot_cache __ro_after_init;

/*
 * Control state handed to the collapse code.
 *
 * NOTE(review): the users of this struct are outside this view.  The member
 * name suggests is_khugepaged marks collapses initiated by the khugepaged
 * thread as opposed to other callers -- confirm against the call sites.
 */
struct collapse_control {
        bool is_khugepaged;

        /* Num pages scanned per node */
        u32 node_load[MAX_NUMNODES];

        /* nodemask for allocation fallback */
        nodemask_t alloc_nmask;
};

/**
 * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
 * @slot: hash lookup from mm to mm_slot
 *
 * mm_slot_cache, created in khugepaged_init(), is the kmem cache for this
 * type.
 */
struct khugepaged_mm_slot {
        struct mm_slot slot;
};

/**
 * struct khugepaged_scan - cursor for scanning
 * @mm_head: the head of the mm list to scan
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
 *
 * There is only the one khugepaged_scan instance of this cursor structure.
 */
struct khugepaged_scan {
        struct list_head mm_head;
        struct khugepaged_mm_slot *mm_slot;
        unsigned long address;
};

/*
 * The single cursor instance.  Only mm_head is initialized explicitly (as a
 * list head pointing at itself); mm_slot and address start out zeroed
 * because the object has static storage duration.
 */
static struct khugepaged_scan khugepaged_scan = {
        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};

#ifdef CONFIG_SYSFS
/*
 * sysfs read handler: writes khugepaged_scan_sleep_millisecs as an unsigned
 * decimal followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
                                         struct kobj_attribute *attr,
                                         char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}

/*
 * sysfs write handler for scan_sleep_millisecs.
 *
 * @buf is parsed as an unsigned base-10 integer and stored in
 * khugepaged_scan_sleep_millisecs.  Afterwards khugepaged_sleep_expire is
 * reset to 0 and waiters on khugepaged_wait are woken.
 *
 * Returns @count on success and -EINVAL when @buf cannot be parsed.
 */
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
                                          struct kobj_attribute *attr,
                                          const char *buf, size_t count)
{
        unsigned int new_msecs;

        if (kstrtouint(buf, 10, &new_msecs))
                return -EINVAL;

        khugepaged_scan_sleep_millisecs = new_msecs;

        /* Reset khugepaged_sleep_expire and wake anyone on khugepaged_wait. */
        khugepaged_sleep_expire = 0;
        wake_up_interruptible(&khugepaged_wait);

        return count;
}
/* Read/write attribute backed by the _show/_store pair above. */
static struct kobj_attribute scan_sleep_millisecs_attr =
        __ATTR_RW(scan_sleep_millisecs);

/*
 * sysfs read handler: writes khugepaged_alloc_sleep_millisecs as an unsigned
 * decimal followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
                                          struct kobj_attribute *attr,
                                          char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}

/*
 * sysfs write handler for alloc_sleep_millisecs.
 *
 * @buf is parsed as an unsigned base-10 integer and stored in
 * khugepaged_alloc_sleep_millisecs.  Afterwards khugepaged_sleep_expire is
 * reset to 0 and waiters on khugepaged_wait are woken.
 *
 * Returns @count on success and -EINVAL when @buf cannot be parsed.
 */
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
                                           struct kobj_attribute *attr,
                                           const char *buf, size_t count)
{
        unsigned int new_msecs;

        if (kstrtouint(buf, 10, &new_msecs))
                return -EINVAL;

        khugepaged_alloc_sleep_millisecs = new_msecs;

        /* Reset khugepaged_sleep_expire and wake anyone on khugepaged_wait. */
        khugepaged_sleep_expire = 0;
        wake_up_interruptible(&khugepaged_wait);

        return count;
}
/* Read/write attribute backed by the _show/_store pair above. */
static struct kobj_attribute alloc_sleep_millisecs_attr =
        __ATTR_RW(alloc_sleep_millisecs);

/*
 * sysfs read handler: writes khugepaged_pages_to_scan as an unsigned decimal
 * followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t pages_to_scan_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
/*
 * sysfs write handler for pages_to_scan.
 *
 * @buf is parsed as an unsigned base-10 integer; a value of 0 is rejected.
 * A valid value is stored in khugepaged_pages_to_scan.
 *
 * Returns @count on success and -EINVAL for unparsable input or 0.
 */
static ssize_t pages_to_scan_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned int nr_pages;

        if (kstrtouint(buf, 10, &nr_pages))
                return -EINVAL;
        /* Zero is not an acceptable setting. */
        if (nr_pages == 0)
                return -EINVAL;

        khugepaged_pages_to_scan = nr_pages;
        return count;
}
/* Read/write attribute backed by the _show/_store pair above. */
static struct kobj_attribute pages_to_scan_attr =
        __ATTR_RW(pages_to_scan);

/*
 * sysfs read handler: writes khugepaged_pages_collapsed as an unsigned
 * decimal followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t pages_collapsed_show(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
/* Read-only attribute: there is no matching store handler. */
static struct kobj_attribute pages_collapsed_attr =
        __ATTR_RO(pages_collapsed);

/*
 * sysfs read handler: writes khugepaged_full_scans as an unsigned decimal
 * followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t full_scans_show(struct kobject *kobj,
                               struct kobj_attribute *attr,
                               char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
/* Read-only attribute: there is no matching store handler. */
static struct kobj_attribute full_scans_attr =
        __ATTR_RO(full_scans);

/*
 * sysfs read handler for "defrag": delegates to single_hugepage_flag_show()
 * for TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG and returns its result.
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        return single_hugepage_flag_show(kobj, attr, buf,
                                         TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
/*
 * sysfs write handler for "defrag": delegates to single_hugepage_flag_store()
 * for TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG and returns its result.
 */
static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        return single_hugepage_flag_store(kobj, attr, buf, count,
                                 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
/* Read/write attribute for defrag_show()/defrag_store(). */
static struct kobj_attribute khugepaged_defrag_attr =
        __ATTR_RW(defrag);

/*
 * max_ptes_none controls if khugepaged should collapse hugepages over
 * any unmapped ptes in turn potentially increasing the memory
 * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
 * reduce the available free memory in the system as it
 * runs. Increasing max_ptes_none will instead potentially reduce the
 * free memory in the system during the khugepaged scan.
 */
/*
 * sysfs read handler: writes khugepaged_max_ptes_none as an unsigned decimal
 * followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t max_ptes_none_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
/*
 * sysfs write handler for max_ptes_none.
 *
 * @buf is parsed as an unsigned base-10 integer.  Values above
 * HPAGE_PMD_NR - 1 are rejected; anything else is stored in
 * khugepaged_max_ptes_none.
 *
 * Returns @count on success and -EINVAL for unparsable or too large input.
 */
static ssize_t max_ptes_none_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long val;

        if (kstrtoul(buf, 10, &val))
                return -EINVAL;
        /* Accept 0 .. HPAGE_PMD_NR - 1 only. */
        if (val > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_none = val;
        return count;
}
/* Read/write attribute for max_ptes_none_show()/max_ptes_none_store(). */
static struct kobj_attribute khugepaged_max_ptes_none_attr =
        __ATTR_RW(max_ptes_none);

/*
 * sysfs read handler: writes khugepaged_max_ptes_swap as an unsigned decimal
 * followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t max_ptes_swap_show(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}

/*
 * sysfs write handler for max_ptes_swap.
 *
 * @buf is parsed as an unsigned base-10 integer.  Values above
 * HPAGE_PMD_NR - 1 are rejected; anything else is stored in
 * khugepaged_max_ptes_swap.
 *
 * Returns @count on success and -EINVAL for unparsable or too large input.
 */
static ssize_t max_ptes_swap_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count)
{
        unsigned long val;

        if (kstrtoul(buf, 10, &val))
                return -EINVAL;
        /* Accept 0 .. HPAGE_PMD_NR - 1 only. */
        if (val > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_swap = val;
        return count;
}

/* Read/write attribute for max_ptes_swap_show()/max_ptes_swap_store(). */
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
        __ATTR_RW(max_ptes_swap);

/*
 * sysfs read handler: writes khugepaged_max_ptes_shared as an unsigned
 * decimal followed by a newline into @buf and returns sysfs_emit()'s result.
 */
static ssize_t max_ptes_shared_show(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    char *buf)
{
        return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}

/*
 * sysfs write handler for max_ptes_shared.
 *
 * @buf is parsed as an unsigned base-10 integer.  Values above
 * HPAGE_PMD_NR - 1 are rejected; anything else is stored in
 * khugepaged_max_ptes_shared.
 *
 * Returns @count on success and -EINVAL for unparsable or too large input.
 */
static ssize_t max_ptes_shared_store(struct kobject *kobj,
                                     struct kobj_attribute *attr,
                                     const char *buf, size_t count)
{
        unsigned long val;

        if (kstrtoul(buf, 10, &val))
                return -EINVAL;
        /* Accept 0 .. HPAGE_PMD_NR - 1 only. */
        if (val > HPAGE_PMD_NR - 1)
                return -EINVAL;

        khugepaged_max_ptes_shared = val;
        return count;
}

/* Read/write attribute for max_ptes_shared_show()/max_ptes_shared_store(). */
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
        __ATTR_RW(max_ptes_shared);

/* NULL-terminated table of the attributes exported via khugepaged_attr_group. */
static struct attribute *khugepaged_attr[] = {
        &khugepaged_defrag_attr.attr,
        &khugepaged_max_ptes_none_attr.attr,
        &khugepaged_max_ptes_swap_attr.attr,
        &khugepaged_max_ptes_shared_attr.attr,
        &pages_to_scan_attr.attr,
        &pages_collapsed_attr.attr,
        &full_scans_attr.attr,
        &scan_sleep_millisecs_attr.attr,
        &alloc_sleep_millisecs_attr.attr,
        NULL,
};

/*
 * Attribute group named "khugepaged" built from khugepaged_attr.  It has
 * external linkage; where it gets registered is not visible in this part
 * of the file.
 */
struct attribute_group khugepaged_attr_group = {
        .attrs = khugepaged_attr,
        .name = "khugepaged",
};
#endif /* CONFIG_SYSFS */

/*
 * hugepage_madvise - apply MADV_HUGEPAGE / MADV_NOHUGEPAGE to a set of flags
 * @vma:      the vma the advice is given for
 * @vm_flags: flags word that is updated in place
 * @advice:   the madvise advice value
 *
 * MADV_HUGEPAGE clears VM_NOHUGEPAGE, sets VM_HUGEPAGE and then calls
 * khugepaged_enter_vma() with the updated flags.  MADV_NOHUGEPAGE does the
 * opposite flag update and nothing else.  Any other advice leaves
 * *@vm_flags untouched.
 *
 * Always returns 0.
 */
int hugepage_madvise(struct vm_area_struct *vma,
                     unsigned long *vm_flags, int advice)
{
        if (advice == MADV_HUGEPAGE) {
#ifdef CONFIG_S390
                /*
                 * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
                 * can't handle this properly after s390_enable_sie, so we simply
                 * ignore the madvise to prevent qemu from causing a SIGSEGV.
                 */
                if (mm_has_pgste(vma->vm_mm))
                        return 0;
#endif
                *vm_flags = (*vm_flags & ~VM_NOHUGEPAGE) | VM_HUGEPAGE;
                /*
                 * The vma may just have become eligible for khugepaged:
                 * register it right away rather than waiting for a page
                 * fault that may not happen any time soon.
                 */
                khugepaged_enter_vma(vma, *vm_flags);
        } else if (advice == MADV_NOHUGEPAGE) {
                /*
                 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
                 * this vma even if we leave the mm registered in khugepaged if
                 * it got registered before VM_NOHUGEPAGE was set.
                 */
                *vm_flags = (*vm_flags & ~VM_HUGEPAGE) | VM_NOHUGEPAGE;
        }

        return 0;
}

/*
 * Create the mm_slot cache and set the default khugepaged tunables, all of
 * which are derived from HPAGE_PMD_NR.
 *
 * Returns 0 on success or -ENOMEM if the cache could not be created (the
 * tunables are left untouched in that case).
 */
int __init khugepaged_init(void)
{
        mm_slot_cache = KMEM_CACHE(khugepaged_mm_slot, 0);
        if (mm_slot_cache == NULL)
                return -ENOMEM;

        /* Scan budget per pass. */
        khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;

        /* Per-PMD limits on none, swap and shared PTEs. */
        khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
        khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
        khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;

        return 0;
}

/* Destroy the mm_slot cache created by khugepaged_init(). */
void __init khugepaged_destroy(void)
{
        kmem_cache_destroy(mm_slot_cache);
}

/* Returns 1 when @mm has no users left (mm_users reads as zero), else 0. */
static inline int hpage_collapse_test_exit(struct mm_struct *mm)
{
        return !atomic_read(&mm->mm_users);
}

/*
 * Returns 1 when @mm has no users left or has MMF_DISABLE_THP set in its
 * flags, else 0.
 */
static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm)
{
        if (hpage_collapse_test_exit(mm))
                return 1;

        return test_bit(MMF_DISABLE_THP, &mm->flags) ? 1 : 0;
}

/*
 * Report whether PMD-sized huge pages are enabled for any of the anon, shmem
 * or file-backed cases:
 *  - file-backed (only with CONFIG_READ_ONLY_THP_FOR_FS): global control;
 *  - anon: PMD_ORDER bit in the always / madvise masks, or in the inherit
 *    mask together with the global control;
 *  - shmem (only with CONFIG_SHMEM): shmem_hpage_pmd_enabled().
 *
 * The checks are evaluated in that order and stop at the first hit.
 */
static bool hugepage_pmd_enabled(void)
{
        return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
                hugepage_global_enabled()) ||
               test_bit(PMD_ORDER, &huge_anon_orders_always) ||
               test_bit(PMD_ORDER, &huge_anon_orders_madvise) ||
               (test_bit(PMD_ORDER, &huge_anon_orders_inherit) &&
                hugepage_global_enabled()) ||
               (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled());
}

/*
 * Register @mm with khugepaged: allocate an mm_slot, insert it into
 * mm_slots_hash and at the tail of khugepaged_scan.mm_head, and take a
 * reference on the mm with mmgrab().
 *
 * Returns without doing anything if MMF_VM_HUGEPAGE was already set, or if
 * the slot allocation fails.
 * NOTE(review): on allocation failure MMF_VM_HUGEPAGE stays set, so later
 * calls for this mm return early - confirm this is intended.
 */
void __khugepaged_enter(struct mm_struct *mm)
{
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        int wakeup;

        /* __khugepaged_exit() must not run from under us */
        VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
        /* Only the caller that flips MMF_VM_HUGEPAGE from 0 to 1 proceeds. */
        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags)))
                return;

        mm_slot = mm_slot_alloc(mm_slot_cache);
        if (!mm_slot)
                return;

        slot = &mm_slot->slot;

        spin_lock(&khugepaged_mm_lock);
        mm_slot_insert(mm_slots_hash, mm, slot);
        /*
         * Insert just behind the scanning cursor, to let the area settle
         * down a little.
         */
        /* Sample emptiness before adding: a wakeup is issued only then. */
        wakeup = list_empty(&khugepaged_scan.mm_head);
        list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
        spin_unlock(&khugepaged_mm_lock);

        /* Reference dropped with mmdrop() when the slot is freed. */
        mmgrab(mm);
        if (wakeup)
                wake_up_interruptible(&khugepaged_wait);
}

/*
 * Register @vma's mm with khugepaged if it is not registered yet
 * (MMF_VM_HUGEPAGE clear), PMD-sized huge pages are enabled, and the vma
 * allows a PMD_ORDER huge page under the sysfs policy for @vm_flags.
 */
void khugepaged_enter_vma(struct vm_area_struct *vma,
                          unsigned long vm_flags)
{
        if (test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
                return;

        if (!hugepage_pmd_enabled())
                return;

        if (!thp_vma_allowable_order(vma, vm_flags, TVA_ENFORCE_SYSFS,
                                     PMD_ORDER))
                return;

        __khugepaged_enter(vma->vm_mm);
}

/*
 * Unregister @mm from khugepaged.
 *
 * If @mm has a slot and that slot is not the one khugepaged is currently
 * scanning (khugepaged_scan.mm_slot), the slot is unhashed, unlinked and
 * freed, MMF_VM_HUGEPAGE is cleared and the mm reference is dropped.
 * If the slot is the one being scanned, it is left in place and this
 * function only waits by taking and releasing mmap_lock in write mode.
 */
void __khugepaged_exit(struct mm_struct *mm)
{
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        int free = 0;

        spin_lock(&khugepaged_mm_lock);
        slot = mm_slot_lookup(mm_slots_hash, mm);
        mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
                hash_del(&slot->hash);
                list_del(&slot->mm_node);
                free = 1;
        }
        spin_unlock(&khugepaged_mm_lock);

        if (free) {
                /* Slot is off both structures; release it outside the lock. */
                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
                mm_slot_free(mm_slot_cache, mm_slot);
                mmdrop(mm);
        } else if (mm_slot) {
                /*
                 * This is required to serialize against
                 * hpage_collapse_test_exit() (which is guaranteed to run
                 * under mmap sem read mode). Stop here (after we return all
                 * pagetables will be destroyed) until khugepaged has finished
                 * working on the pagetables under the mmap_lock.
                 */
                mmap_write_lock(mm);
                mmap_write_unlock(mm);
        }
}

/*
 * Undo the isolation of @folio: subtract its pages from the isolated
 * counter (NR_ISOLATED_ANON, offset by folio_is_file_lru()), unlock it and
 * put it back on the LRU.
 */
static void release_pte_folio(struct folio *folio)
{
        const long nr_pages = folio_nr_pages(folio);
        const int stat_item = NR_ISOLATED_ANON + folio_is_file_lru(folio);

        node_stat_mod_folio(folio, stat_item, -nr_pages);
        folio_unlock(folio);
        folio_putback_lru(folio);
}

/*
 * Release the folios isolated for the PTE range [@pte, @_pte).
 *
 * The range is walked backwards from @_pte - 1 down to @pte. Small folios
 * are released straight from their PTE; none PTEs, zero-pfn PTEs and large
 * folios are skipped in that walk. Large folios are released afterwards
 * from @compound_pagelist, which is emptied in the process.
 */
static void release_pte_pages(pte_t *pte, pte_t *_pte,
                struct list_head *compound_pagelist)
{
        struct folio *folio, *next;

        for (--_pte; _pte >= pte; --_pte) {
                pte_t entry = ptep_get(_pte);
                unsigned long pfn;

                if (pte_none(entry))
                        continue;

                pfn = pte_pfn(entry);
                if (is_zero_pfn(pfn))
                        continue;

                folio = pfn_folio(pfn);
                if (!folio_test_large(folio))
                        release_pte_folio(folio);
        }

        list_for_each_entry_safe(folio, next, compound_pagelist, lru) {
                list_del(&folio->lru);
                release_pte_folio(folio);
        }
}

/*
 * Check that @folio's reference count is exactly what its visible users
 * account for: one per mapping, plus one per page unless the folio is anon
 * and not in the swap cache, plus one if the private flag is set.
 *
 * Returns true when the counts match.
 */
static bool is_refcount_suitable(struct folio *folio)
{
        int expected = folio_mapcount(folio);

        if (!(folio_test_anon(folio) && !folio_test_swapcache(folio)))
                expected += folio_nr_pages(folio);

        if (folio_test_private(folio))
                expected += 1;

        return expected == folio_ref_count(folio);
}

/*
 * Walk the HPAGE_PMD_NR PTEs starting at @pte and lock + isolate from the
 * LRU every folio they map, so the range can be copied into a huge page.
 *
 * @vma: vma covering the range
 * @address: virtual address mapped by the first PTE
 * @pte: first PTE of the range
 * @cc: collapse control; the khugepaged_max_ptes_* limits and the
 *      "referenced" requirement are only enforced when cc->is_khugepaged
 * @compound_pagelist: list that collects each isolated large folio once
 *
 * Returns SCAN_SUCCEED when every PTE passed and at least one was writable
 * (and, for khugepaged, at least one was referenced). On any failure the
 * folios isolated so far are released with release_pte_pages() and the
 * corresponding scan result code is returned.
 */
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pte_t *pte,
                                        struct collapse_control *cc,
                                        struct list_head *compound_pagelist)
{
        struct page *page = NULL;
        struct folio *folio = NULL;
        pte_t *_pte;
        int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
        bool writable = false;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                /* Empty and zero-page PTEs are tolerated up to a limit. */
                if (pte_none(pteval) || (pte_present(pteval) &&
                                is_zero_pfn(pte_pfn(pteval)))) {
                        ++none_or_zero;
                        if (!userfaultfd_armed(vma) &&
                            (!cc->is_khugepaged ||
                             none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
                                count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                                goto out;
                        }
                }
                if (!pte_present(pteval)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto out;
                }
                if (pte_uffd_wp(pteval)) {
                        result = SCAN_PTE_UFFD_WP;
                        goto out;
                }
                page = vm_normal_page(vma, address, pteval);
                if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out;
                }

                folio = page_folio(page);
                VM_BUG_ON_FOLIO(!folio_test_anon(folio), folio);

                /* See hpage_collapse_scan_pmd(). */
                if (folio_maybe_mapped_shared(folio)) {
                        ++shared;
                        if (cc->is_khugepaged &&
                            shared > khugepaged_max_ptes_shared) {
                                result = SCAN_EXCEED_SHARED_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
                                goto out;
                        }
                }

                if (folio_test_large(folio)) {
                        struct folio *f;

                        /*
                         * Check if we have dealt with the compound page
                         * already
                         */
                        list_for_each_entry(f, compound_pagelist, lru) {
                                if (folio == f)
                                        goto next;
                        }
                }

                /*
                 * We can do it before folio_isolate_lru because the
                 * folio can't be freed from under us. NOTE: PG_lock
                 * is needed to serialize against split_huge_page
                 * when invoked from the VM.
                 */
                if (!folio_trylock(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }

                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
                 * The page table that maps the page has been already unlinked
                 * from the page table tree and this process cannot get
                 * an additional pin on the page.
                 *
                 * New pins can come later if the page is shared across fork,
                 * but not from this process. The other process cannot write to
                 * the page, only trigger CoW.
                 */
                if (!is_refcount_suitable(folio)) {
                        folio_unlock(folio);
                        result = SCAN_PAGE_COUNT;
                        goto out;
                }

                /*
                 * Isolate the page to avoid collapsing an hugepage
                 * currently in use by the VM.
                 */
                if (!folio_isolate_lru(folio)) {
                        folio_unlock(folio);
                        result = SCAN_DEL_PAGE_LRU;
                        goto out;
                }
                /* Account the folio as isolated; undone in release_pte_folio(). */
                node_stat_mod_folio(folio,
                                NR_ISOLATED_ANON + folio_is_file_lru(folio),
                                folio_nr_pages(folio));
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
                VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

                /* Large folios are tracked on the list so they are handled once. */
                if (folio_test_large(folio))
                        list_add_tail(&folio->lru, compound_pagelist);
next:
                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page
                 */
                if (cc->is_khugepaged &&
                    (pte_young(pteval) || folio_test_young(folio) ||
                     folio_test_referenced(folio) || mmu_notifier_test_young(vma->vm_mm,
                                                                     address)))
                        referenced++;

                if (pte_write(pteval))
                        writable = true;
        }

        if (unlikely(!writable)) {
                result = SCAN_PAGE_RO;
        } else if (unlikely(cc->is_khugepaged && !referenced)) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
                trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                                    referenced, writable, result);
                return result;
        }
out:
        /* Release everything isolated for the PTEs before _pte. */
        release_pte_pages(pte, _pte, compound_pagelist);
        trace_mm_collapse_huge_page_isolate(&folio->page, none_or_zero,
                                            referenced, writable, result);
        return result;
}

/*
 * Tear down the HPAGE_PMD_NR small-page PTEs starting at @pte after their
 * contents have been copied.
 *
 * For none and zero-pfn PTEs, MM_ANONPAGES is incremented by one and a
 * zero-pfn PTE is cleared. For every other PTE the entry is cleared, the
 * rmap is removed and the page (with its swap cache) is freed; small folios
 * are additionally released from isolation first. Large folios are released
 * from isolation at the end via @compound_pagelist, which is emptied.
 *
 * @pte: first PTE of the range
 * @vma: vma covering the range
 * @address: virtual address mapped by the first PTE
 * @ptl: lock taken around each PTE clear
 * @compound_pagelist: large folios isolated for this range
 */
static void __collapse_huge_page_copy_succeeded(pte_t *pte,
                                                struct vm_area_struct *vma,
                                                unsigned long address,
                                                spinlock_t *ptl,
                                                struct list_head *compound_pagelist)
{
        struct folio *src, *tmp;
        pte_t *_pte;
        pte_t pteval;

        for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, address += PAGE_SIZE) {
                pteval = ptep_get(_pte);
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
                        if (is_zero_pfn(pte_pfn(pteval))) {
                                /*
                                 * ptl mostly unnecessary.
                                 */
                                spin_lock(ptl);
                                ptep_clear(vma->vm_mm, address, _pte);
                                spin_unlock(ptl);
                                ksm_might_unmap_zero_page(vma->vm_mm, pteval);
                        }
                } else {
                        struct page *src_page = pte_page(pteval);

                        src = page_folio(src_page);
                        /* Large folios are released once, from the list below. */
                        if (!folio_test_large(src))
                                release_pte_folio(src);
                        /*
                         * ptl mostly unnecessary, but preempt has to
                         * be disabled to update the per-cpu stats
                         * inside folio_remove_rmap_pte().
                         */
                        spin_lock(ptl);
                        ptep_clear(vma->vm_mm, address, _pte);
                        folio_remove_rmap_pte(src, src_page, vma);
                        spin_unlock(ptl);
                        free_page_and_swap_cache(src_page);
                }
        }

        list_for_each_entry_safe(src, tmp, compound_pagelist, lru) {
                list_del(&src->lru);
                node_stat_sub_folio(src, NR_ISOLATED_ANON +
                                folio_is_file_lru(src));
                folio_unlock(src);
                free_swap_cache(src);
                folio_putback_lru(src);
        }
}

/*
 * Roll back after a failed copy: point @pmd back at the page table taken
 * from @orig_pmd, then release every folio isolated for the range.
 *
 * @pte: first PTE of the range
 * @pmd: PMD slot to repopulate
 * @orig_pmd: PMD value saved before the collapse attempt
 * @vma: vma covering the range
 * @compound_pagelist: large folios isolated for this range
 */
static void __collapse_huge_page_copy_failed(pte_t *pte,
                                             pmd_t *pmd,
                                             pmd_t orig_pmd,
                                             struct vm_area_struct *vma,
                                             struct list_head *compound_pagelist)
{
        spinlock_t *pmd_ptl;

        /*
         * Re-establish the PMD to point to the original page table
         * entry. Restoring PMD needs to be done prior to releasing
         * pages. Since pages are still isolated and locked here,
         * acquiring anon_vma_lock_write is unnecessary.
         */
        pmd_ptl = pmd_lock(vma->vm_mm, pmd);
        pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
        spin_unlock(pmd_ptl);
        /*
         * Release both raw and compound pages isolated
         * in __collapse_huge_page_isolate.
         */
        release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
}

/*
 * __collapse_huge_page_copy - copy the contents of the raw pages into a
 * hugepage. On success the raw pages are cleaned up; on failure the original
 * page table is restored and the isolated raw pages are released.
 * Returns SCAN_SUCCEED if every page was copied, SCAN_COPY_MC otherwise.
 *
 * @pte: first of the PTEs to copy from
 * @folio: the new hugepage that receives the contents
 * @pmd: pointer to the new hugepage's PMD
 * @orig_pmd: the original raw pages' PMD
 * @vma: the original raw pages' virtual memory area
 * @address: starting address to copy
 * @ptl: lock on raw pages' PTEs
 * @compound_pagelist: list that stores compound pages
 */
static int __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
                pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
                unsigned long address, spinlock_t *ptl,
                struct list_head *compound_pagelist)
{
        unsigned int idx;

        /*
         * Any iteration may hit poisoned memory, so each copy is checked and
         * the whole operation is rolled back on the first failure.
         */
        for (idx = 0; idx < HPAGE_PMD_NR; idx++) {
                pte_t entry = ptep_get(pte + idx);
                struct page *dst_page = folio_page(folio, idx);
                unsigned long src_addr = address + idx * PAGE_SIZE;

                /* Nothing mapped (or the zero page): the target is zeroed. */
                if (pte_none(entry) || is_zero_pfn(pte_pfn(entry))) {
                        clear_user_highpage(dst_page, src_addr);
                        continue;
                }

                if (copy_mc_user_highpage(dst_page, pte_page(entry),
                                          src_addr, vma) > 0) {
                        __collapse_huge_page_copy_failed(pte, pmd, orig_pmd,
                                                         vma,
                                                         compound_pagelist);
                        return SCAN_COPY_MC;
                }
        }

        __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
                                            compound_pagelist);
        return SCAN_SUCCEED;
}

/*
 * Sleep on khugepaged_wait for up to khugepaged_alloc_sleep_millisecs, in
 * an interruptible, freezable state.
 */
static void khugepaged_alloc_sleep(void)
{
        DEFINE_WAIT(wait);

        add_wait_queue(&khugepaged_wait, &wait);
        /* Set the task state before sleeping so schedule_timeout() blocks. */
        __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
        remove_wait_queue(&khugepaged_wait, &wait);
}

/*
 * Collapse control with is_khugepaged set; code in this file applies the
 * khugepaged_max_ptes_* limits and sysfs policy only when that flag is true.
 */
struct collapse_control khugepaged_collapse_control = {
        .is_khugepaged = true,
};

/*
 * Decide whether a scan should be aborted because a page on node @nid is
 * too far from a node that already has pages counted in cc->node_load.
 *
 * Returns false when node reclaim is disabled (no extra effort is made to
 * allocate locally) or when @nid already has a count (it must then be
 * acceptable). Otherwise returns true if any node with a non-zero count is
 * further than node_reclaim_distance from @nid.
 */
static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
{
        int other;

        if (!node_reclaim_enabled() || cc->node_load[nid])
                return false;

        for (other = 0; other < MAX_NUMNODES; other++) {
                if (cc->node_load[other] &&
                    node_distance(nid, other) > node_reclaim_distance)
                        return true;
        }

        return false;
}

/*
 * Non-zero when the TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG bit is set
 * in transparent_hugepage_flags.
 */
#define khugepaged_defrag()                                        \
        (transparent_hugepage_flags &                                \
         (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))

/*
 * GFP mask for khugepaged allocations: GFP_TRANSHUGE when khugepaged defrag
 * is enabled (allocation will enter direct reclaim/compaction if necessary),
 * GFP_TRANSHUGE_LIGHT otherwise.
 */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
        if (khugepaged_defrag())
                return GFP_TRANSHUGE;

        return GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
/*
 * Choose the allocation node from the per-node hit counts in cc->node_load.
 *
 * Returns the lowest-numbered node with the highest count (0 if all counts
 * are zero) and adds every online node sharing that count to
 * cc->alloc_nmask.
 */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
        int node, best_node = 0, best_load = 0;

        /* Strict comparison keeps the first node among equal maxima. */
        for (node = 0; node < MAX_NUMNODES; node++) {
                if (cc->node_load[node] <= best_load)
                        continue;
                best_load = cc->node_load[node];
                best_node = node;
        }

        for_each_online_node(node) {
                if (cc->node_load[node] == best_load)
                        node_set(node, cc->alloc_nmask);
        }

        return best_node;
}
#else
/* Without CONFIG_NUMA the target node is always 0. */
static int hpage_collapse_find_target_node(struct collapse_control *cc)
{
        return 0;
}
#endif

/*
 * Revalidate the vma at @address after mmap_lock was temporarily dropped
 * and has been taken again.
 *
 * @mm: address space to look in
 * @address: address whose vma is re-checked
 * @expect_anon: also require an anonymous vma with an anon_vma attached
 * @vmap: out parameter, receives the result of find_vma() (may be NULL)
 * @cc: collapse control; sysfs policy is enforced only for khugepaged
 *
 * Returns an enum scan_result value, SCAN_SUCCEED if the vma is still fit.
 */
static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
                                   bool expect_anon,
                                   struct vm_area_struct **vmap,
                                   struct collapse_control *cc)
{
        struct vm_area_struct *vma;
        unsigned long tva_flags = 0;

        if (cc->is_khugepaged)
                tva_flags = TVA_ENFORCE_SYSFS;

        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                return SCAN_ANY_PROCESS;

        vma = find_vma(mm, address);
        *vmap = vma;
        if (!vma)
                return SCAN_VMA_NULL;

        if (!thp_vma_suitable_order(vma, address, PMD_ORDER))
                return SCAN_ADDRESS_RANGE;
        if (!thp_vma_allowable_order(vma, vma->vm_flags, tva_flags, PMD_ORDER))
                return SCAN_VMA_CHECK;

        if (!expect_anon)
                return SCAN_SUCCEED;

        /*
         * Anon VMA expected: the address may have been unmapped and then
         * remapped to a file after khugepaged reacquired the mmap_lock, and
         * thp_vma_allowable_order() may return true for qualified file vmas.
         */
        if (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))
                return SCAN_PAGE_ANON;

        return SCAN_SUCCEED;
}

/*
 * Classify the PMD entry read locklessly from @pmd.
 *
 * Returns SCAN_PMD_NONE for an empty entry, SCAN_PMD_MAPPED for a
 * transparent huge mapping, SCAN_PMD_NULL for a non-present, devmap or bad
 * entry, and SCAN_SUCCEED otherwise.
 */
static inline int check_pmd_state(pmd_t *pmd)
{
        pmd_t entry = pmdp_get_lockless(pmd);

        if (pmd_none(entry))
                return SCAN_PMD_NONE;

        if (!pmd_present(entry))
                return SCAN_PMD_NULL;

        if (pmd_trans_huge(entry))
                return SCAN_PMD_MAPPED;

        if (pmd_devmap(entry) || pmd_bad(entry))
                return SCAN_PMD_NULL;

        return SCAN_SUCCEED;
}

/*
 * Look up the PMD for @address in @mm, store it in *@pmd and classify it.
 *
 * Returns SCAN_PMD_NULL if no PMD was found (*@pmd is then NULL), otherwise
 * the result of check_pmd_state() on the PMD.
 */
static int find_pmd_or_thp_or_none(struct mm_struct *mm,
                                   unsigned long address,
                                   pmd_t **pmd)
{
        pmd_t *found = mm_find_pmd(mm, address);

        *pmd = found;
        if (found == NULL)
                return SCAN_PMD_NULL;

        return check_pmd_state(found);
}

/*
 * Re-run the PMD lookup for @address and verify it still yields @pmd.
 *
 * Returns the lookup's result if that is not SCAN_SUCCEED, SCAN_FAIL if the
 * lookup now finds a different PMD, and SCAN_SUCCEED otherwise.
 */
static int check_pmd_still_valid(struct mm_struct *mm,
                                 unsigned long address,
                                 pmd_t *pmd)
{
        pmd_t *cur_pmd;
        int result;

        result = find_pmd_or_thp_or_none(mm, address, &cur_pmd);
        if (result != SCAN_SUCCEED)
                return result;

        return cur_pmd == pmd ? SCAN_SUCCEED : SCAN_FAIL;
}

/*
 * Bring missing pages in from swap, to complete THP collapse.
 * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
 *
 * Called and returns without pte mapped or spinlocks held.
 * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
 *
 * @mm: address space being collapsed
 * @vma: vma covering the range
 * @haddr: start of the range; HPAGE_PMD_NR pages are walked from here
 * @pmd: PMD covering the range
 * @referenced: only forwarded to the tracepoint
 */
static int __collapse_huge_page_swapin(struct mm_struct *mm,
                                       struct vm_area_struct *vma,
                                       unsigned long haddr, pmd_t *pmd,
                                       int referenced)
{
        int swapped_in = 0;
        vm_fault_t ret = 0;
        unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
        int result;
        pte_t *pte = NULL;
        spinlock_t *ptl;

        for (address = haddr; address < end; address += PAGE_SIZE) {
                struct vm_fault vmf = {
                        .vma = vma,
                        .address = address,
                        .pgoff = linear_page_index(vma, address),
                        .flags = FAULT_FLAG_ALLOW_RETRY,
                        .pmd = pmd,
                };

                /*
                 * pte is NULL on the first pass and after each
                 * do_swap_page() call: (re)map it for this address.
                 * Otherwise the post-increment steps to the next entry.
                 */
                if (!pte++) {
                        /*
                         * Here the ptl is only used to check pte_same() in
                         * do_swap_page(), so readonly version is enough.
                         */
                        pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl);
                        if (!pte) {
                                mmap_read_unlock(mm);
                                result = SCAN_PMD_NULL;
                                goto out;
                        }
                }

                vmf.orig_pte = ptep_get_lockless(pte);
                if (!is_swap_pte(vmf.orig_pte))
                        continue;

                vmf.pte = pte;
                vmf.ptl = ptl;
                ret = do_swap_page(&vmf);
                /* Which unmaps pte (after perhaps re-checking the entry) */
                pte = NULL;

                /*
                 * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
                 * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
                 * we do not retry here and swap entry will remain in pagetable
                 * resulting in later failure.
                 */
                if (ret & VM_FAULT_RETRY) {
                        /* Likely, but not guaranteed, that page lock failed */
                        result = SCAN_PAGE_LOCK;
                        goto out;
                }
                if (ret & VM_FAULT_ERROR) {
                        /* mmap_lock is still held on this path: drop it. */
                        mmap_read_unlock(mm);
                        result = SCAN_FAIL;
                        goto out;
                }
                swapped_in++;
        }

        /* Still mapped if the last entry walked was not a swap entry. */
        if (pte)
                pte_unmap(pte);

        /* Drain LRU cache to remove extra pin on the swapped in pages */
        if (swapped_in)
                lru_add_drain();

        result = SCAN_SUCCEED;
out:
        trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, result);
        return result;
}

/*
 * Allocate a PMD-order folio for a collapse and charge it to @mm's memcg.
 *
 * @foliop: out parameter; receives the folio on success, NULL on failure
 * @mm: mm the folio is charged to
 * @cc: collapse control; selects the GFP mask and supplies the node hints
 *
 * Returns SCAN_SUCCEED, SCAN_ALLOC_HUGE_PAGE_FAIL if the allocation failed,
 * or SCAN_CGROUP_CHARGE_FAIL if the charge failed (the folio is then put).
 */
static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
                              struct collapse_control *cc)
{
        struct folio *new_folio;
        gfp_t gfp = GFP_TRANSHUGE;
        int node;

        /* khugepaged honours its defrag setting; other callers do not. */
        if (cc->is_khugepaged)
                gfp = alloc_hugepage_khugepaged_gfpmask();

        node = hpage_collapse_find_target_node(cc);

        new_folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node,
                                  &cc->alloc_nmask);
        if (!new_folio) {
                *foliop = NULL;
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                return SCAN_ALLOC_HUGE_PAGE_FAIL;
        }
        count_vm_event(THP_COLLAPSE_ALLOC);

        if (unlikely(mem_cgroup_charge(new_folio, mm, gfp))) {
                folio_put(new_folio);
                *foliop = NULL;
                return SCAN_CGROUP_CHARGE_FAIL;
        }
        count_memcg_folio_events(new_folio, THP_COLLAPSE_ALLOC, 1);

        *foliop = new_folio;
        return SCAN_SUCCEED;
}

/*
 * Collapse the PMD-sized range at @address in @mm into one huge page.
 *
 * Entered with mmap_lock held for read (it is dropped as the first step)
 * and returns with mmap_lock released on every path.
 *
 * @mm: address space to operate on
 * @address: start of the range; must be HPAGE_PMD aligned
 * @referenced: only forwarded to __collapse_huge_page_swapin()
 * @unmapped: non-zero requests a swap-in pass before the collapse
 * @cc: collapse control
 *
 * Returns SCAN_SUCCEED once the huge PMD is installed, otherwise the scan
 * result code of the step that failed; the new folio is put on failure.
 */
static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
                              int referenced, int unmapped,
                              struct collapse_control *cc)
{
        LIST_HEAD(compound_pagelist);
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pgtable_t pgtable;
        struct folio *folio;
        spinlock_t *pmd_ptl, *pte_ptl;
        int result = SCAN_FAIL;
        struct vm_area_struct *vma;
        struct mmu_notifier_range range;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        /*
         * Before allocating the hugepage, release the mmap_lock read lock.
         * The allocation can take potentially a long time if it involves
         * sync compaction, and we do not need to hold the mmap_lock during
         * that. We will recheck the vma after taking it again in write mode.
         */
        mmap_read_unlock(mm);

        result = alloc_charge_folio(&folio, mm, cc);
        if (result != SCAN_SUCCEED)
                goto out_nolock;

        /* The lock was dropped above: look the vma and pmd up afresh. */
        mmap_read_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }

        result = find_pmd_or_thp_or_none(mm, address, &pmd);
        if (result != SCAN_SUCCEED) {
                mmap_read_unlock(mm);
                goto out_nolock;
        }

        if (unmapped) {
                /*
                 * __collapse_huge_page_swapin will return with mmap_lock
                 * released when it fails. So we jump out_nolock directly in
                 * that case.  Continuing to collapse causes inconsistency.
                 */
                result = __collapse_huge_page_swapin(mm, vma, address, pmd,
                                                     referenced);
                if (result != SCAN_SUCCEED)
                        goto out_nolock;
        }

        mmap_read_unlock(mm);
        /*
         * Prevent all access to pagetables with the exception of
         * gup_fast later handled by the ptep_clear_flush and the VM
         * handled by the anon_vma lock + PG_lock.
         *
         * UFFDIO_MOVE is prevented to race as well thanks to the
         * mmap_lock.
         */
        mmap_write_lock(mm);
        result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
        if (result != SCAN_SUCCEED)
                goto out_up_write;
        /* check if the pmd is still valid */
        result = check_pmd_still_valid(mm, address, pmd);
        if (result != SCAN_SUCCEED)
                goto out_up_write;

        vma_start_write(vma);
        anon_vma_lock_write(vma->anon_vma);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
                                address + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
        /*
         * This removes any huge TLB entry from the CPU so we won't allow
         * huge and small TLB entries for the same virtual address to
         * avoid the risk of CPU bugs in that area.
         *
         * Parallel GUP-fast is fine since GUP-fast will back off when
         * it detects PMD is changed.
         */
        _pmd = pmdp_collapse_flush(vma, address, pmd);
        spin_unlock(pmd_ptl);
        mmu_notifier_invalidate_range_end(&range);
        tlb_remove_table_sync_one();

        /* Walk the page table through the saved copy of the old PMD. */
        pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
        if (pte) {
                result = __collapse_huge_page_isolate(vma, address, pte, cc,
                                                      &compound_pagelist);
                spin_unlock(pte_ptl);
        } else {
                result = SCAN_PMD_NULL;
        }

        if (unlikely(result != SCAN_SUCCEED)) {
                if (pte)
                        pte_unmap(pte);
                spin_lock(pmd_ptl);
                BUG_ON(!pmd_none(*pmd));
                /*
                 * We can only use set_pmd_at when establishing
                 * hugepmds and never for establishing regular pmds that
                 * points to regular pagetables. Use pmd_populate for that
                 */
                pmd_populate(mm, pmd, pmd_pgtable(_pmd));
                spin_unlock(pmd_ptl);
                anon_vma_unlock_write(vma->anon_vma);
                goto out_up_write;
        }

        /*
         * All pages are isolated and locked so anon_vma rmap
         * can't run anymore.
         */
        anon_vma_unlock_write(vma->anon_vma);

        /* On failure the copy helper restores the PMD and releases pages. */
        result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
                                           vma, address, pte_ptl,
                                           &compound_pagelist);
        pte_unmap(pte);
        if (unlikely(result != SCAN_SUCCEED))
                goto out_up_write;

        /*
         * The smp_wmb() inside __folio_mark_uptodate() ensures the
         * copy_huge_page writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);
        /* Take the old page table out of _pmd before _pmd is overwritten. */
        pgtable = pmd_pgtable(_pmd);

        _pmd = mk_huge_pmd(&folio->page, vma->vm_page_prot);
        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);

        spin_lock(pmd_ptl);
        BUG_ON(!pmd_none(*pmd));
        folio_add_new_anon_rmap(folio, vma, address, RMAP_EXCLUSIVE);
        folio_add_lru_vma(folio, vma);
        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, address, pmd, _pmd);
        update_mmu_cache_pmd(vma, address, pmd);
        deferred_split_folio(folio, false);
        spin_unlock(pmd_ptl);

        /* The folio is now mapped: keep the exit path from putting it. */
        folio = NULL;

        result = SCAN_SUCCEED;
out_up_write:
        mmap_write_unlock(mm);
out_nolock:
        if (folio)
                folio_put(folio);
        trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
        return result;
}

/*
 * hpage_collapse_scan_pmd - scan one PMD-sized range of PTEs and, if it
 * qualifies, try to collapse it into a huge page.
 *
 * @mm:          address space being scanned
 * @vma:         VMA that contains @address
 * @address:     start of the range; must be aligned to HPAGE_PMD_SIZE
 * @mmap_locked: set to false once collapse_huge_page() has been called,
 *               because that function returns with mmap_lock released;
 *               left untouched on every other path
 * @cc:          collapse context; cc->node_load[] and cc->alloc_nmask are
 *               reset here and node_load[] is refilled while scanning
 *
 * All HPAGE_PMD_NR PTEs are inspected under the PTE lock.  The scan bails
 * out on the first PTE that disqualifies the range.  The max_ptes_swap,
 * max_ptes_none and max_ptes_shared limits and the "enough referenced pages"
 * heuristic are only applied when cc->is_khugepaged is set.
 *
 * Return: a SCAN_* code.  When the scan itself succeeds the value is whatever
 * collapse_huge_page() returns.
 */
static int hpage_collapse_scan_pmd(struct mm_struct *mm,
                                   struct vm_area_struct *vma,
                                   unsigned long address, bool *mmap_locked,
                                   struct collapse_control *cc)
{
        pmd_t *pmd;
        pte_t *pte, *_pte;
        int result = SCAN_FAIL, referenced = 0;
        int none_or_zero = 0, shared = 0;
        struct page *page = NULL;
        struct folio *folio = NULL;
        unsigned long _address;
        spinlock_t *ptl;
        int node = NUMA_NO_NODE, unmapped = 0;
        bool writable = false;

        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        result = find_pmd_or_thp_or_none(mm, address, &pmd);
        if (result != SCAN_SUCCEED)
                goto out;

        /* Start every scan with fresh per-node statistics. */
        memset(cc->node_load, 0, sizeof(cc->node_load));
        nodes_clear(cc->alloc_nmask);
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte) {
                result = SCAN_PMD_NULL;
                goto out;
        }

        /* _pte and _address advance together: one PTE per PAGE_SIZE step. */
        for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
             _pte++, _address += PAGE_SIZE) {
                pte_t pteval = ptep_get(_pte);
                if (is_swap_pte(pteval)) {
                        ++unmapped;
                        if (!cc->is_khugepaged ||
                            unmapped <= khugepaged_max_ptes_swap) {
                                /*
                                 * Always be strict with uffd-wp
                                 * enabled swap entries.  Please see
                                 * comment below for pte_uffd_wp().
                                 */
                                if (pte_swp_uffd_wp_any(pteval)) {
                                        result = SCAN_PTE_UFFD_WP;
                                        goto out_unmap;
                                }
                                continue;
                        } else {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                goto out_unmap;
                        }
                }
                if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
                        ++none_or_zero;
                        /*
                         * Empty/zero-page PTEs are never tolerated in a
                         * userfaultfd-armed VMA; otherwise khugepaged is
                         * bounded by max_ptes_none and other callers are not.
                         */
                        if (!userfaultfd_armed(vma) &&
                            (!cc->is_khugepaged ||
                             none_or_zero <= khugepaged_max_ptes_none)) {
                                continue;
                        } else {
                                result = SCAN_EXCEED_NONE_PTE;
                                count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                                goto out_unmap;
                        }
                }
                if (pte_uffd_wp(pteval)) {
                        /*
                         * Don't collapse the page if any of the small
                         * PTEs are armed with uffd write protection.
                         * Here we can also mark the new huge pmd as
                         * write protected if any of the small ones is
                         * marked but that could bring unknown
                         * userfault messages that falls outside of
                         * the registered range.  So, just be simple.
                         */
                        result = SCAN_PTE_UFFD_WP;
                        goto out_unmap;
                }
                if (pte_write(pteval))
                        writable = true;

                page = vm_normal_page(vma, _address, pteval);
                if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
                        result = SCAN_PAGE_NULL;
                        goto out_unmap;
                }
                folio = page_folio(page);

                if (!folio_test_anon(folio)) {
                        result = SCAN_PAGE_ANON;
                        goto out_unmap;
                }

                /*
                 * We treat a single page as shared if any part of the THP
                 * is shared.
                 */
                if (folio_maybe_mapped_shared(folio)) {
                        ++shared;
                        if (cc->is_khugepaged &&
                            shared > khugepaged_max_ptes_shared) {
                                result = SCAN_EXCEED_SHARED_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
                                goto out_unmap;
                        }
                }

                /*
                 * Record which node the original page is from and save this
                 * information to cc->node_load[].
                 * Khugepaged will allocate hugepage from the node has the max
                 * hit record.
                 */
                node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        goto out_unmap;
                }
                cc->node_load[node]++;
                if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        goto out_unmap;
                }
                if (folio_test_locked(folio)) {
                        result = SCAN_PAGE_LOCK;
                        goto out_unmap;
                }

                /*
                 * Check if the page has any GUP (or other external) pins.
                 *
                 * Here the check may be racy:
                 * it may see folio_mapcount() > folio_ref_count().
                 * But such case is ephemeral we could always retry collapse
                 * later.  However it may report false positive if the page
                 * has excessive GUP pins (i.e. 512).  Anyway the same check
                 * will be done again later the risk seems low.
                 */
                if (!is_refcount_suitable(folio)) {
                        result = SCAN_PAGE_COUNT;
                        goto out_unmap;
                }

                /*
                 * If collapse was initiated by khugepaged, check that there is
                 * enough young pte to justify collapsing the page.
                 *
                 * The secondary-MMU young test must be given the address of
                 * the PTE currently being examined (_address), not the start
                 * of the range, or every iteration would query the first page.
                 */
                if (cc->is_khugepaged &&
                    (pte_young(pteval) || folio_test_young(folio) ||
                     folio_test_referenced(folio) ||
                     mmu_notifier_test_young(vma->vm_mm, _address)))
                        referenced++;
        }
        /* Whole range scanned without a disqualifying PTE: final verdict. */
        if (!writable) {
                result = SCAN_PAGE_RO;
        } else if (cc->is_khugepaged &&
                   (!referenced ||
                    (unmapped && referenced < HPAGE_PMD_NR / 2))) {
                result = SCAN_LACK_REFERENCED_PAGE;
        } else {
                result = SCAN_SUCCEED;
        }
out_unmap:
        pte_unmap_unlock(pte, ptl);
        if (result == SCAN_SUCCEED) {
                result = collapse_huge_page(mm, address, referenced,
                                            unmapped, cc);
                /* collapse_huge_page will return with the mmap_lock released */
                *mmap_locked = false;
        }
out:
        /* folio is still NULL here if no page was examined. */
        trace_mm_khugepaged_scan_pmd(mm, &folio->page, writable, referenced,
                                     none_or_zero, result, unmapped);
        return result;
}

/*
 * Tear down @mm_slot once its mm has exited: unhash it, unlink it from the
 * mm list, free the slot and drop the mm reference.  Does nothing while
 * hpage_collapse_test_exit() still reports the mm as live.
 *
 * Caller must hold khugepaged_mm_lock.
 */
static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
{
        struct mm_slot *base = &mm_slot->slot;
        struct mm_struct *owner = base->mm;

        lockdep_assert_held(&khugepaged_mm_lock);

        if (!hpage_collapse_test_exit(owner))
                return;

        /* Unpublish the slot before it is freed. */
        hash_del(&base->hash);
        list_del(&base->mm_node);

        /*
         * Clearing MMF_VM_HUGEPAGE in owner->flags is not strictly needed
         * because the mm exited already, so it is deliberately left out.
         */

        /* khugepaged_mm_lock actually not necessary for the below */
        mm_slot_free(mm_slot_cache, mm_slot);
        mmdrop(owner);
}

#ifdef CONFIG_SHMEM
/*
 * Install a huge PMD for @hpage at @addr in @vma through do_set_pmd().
 *
 * hpage must be locked, and mmap_lock must be held.
 *
 * Returns SCAN_SUCCEED after taking an extra reference on @hpage, or
 * SCAN_FAIL (no reference taken) when do_set_pmd() reports failure.
 */
static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
                        pmd_t *pmdp, struct page *hpage)
{
        struct vm_fault fault = {
                .vma = vma,
                .address = addr,
                .flags = 0,
                .pmd = pmdp,
        };
        int outcome = SCAN_FAIL;

        VM_BUG_ON(!PageTransHuge(hpage));
        mmap_assert_locked(vma->vm_mm);

        if (!do_set_pmd(&fault, hpage)) {
                /* The new PMD mapping holds its own page reference. */
                get_page(hpage);
                outcome = SCAN_SUCCEED;
        }

        return outcome;
}

/**
 * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
 * the huge-page-aligned address that contains @addr.
 *
 * @mm: process address space where collapse happens
 * @addr: THP collapse address; rounded down with HPAGE_PMD_MASK before use
 * @install_pmd: If a huge PMD should be installed
 *
 * This function checks whether all the PTEs in the PMD are pointing to the
 * right THP. If so, retract the page table so the THP can refault in with
 * as pmd-mapped. Possibly install a huge PMD mapping the THP.
 *
 * The caller must hold mmap_lock (checked with mmap_assert_locked()).
 *
 * Return: a SCAN_* code. SCAN_SUCCEED when the page table was retracted and
 * @install_pmd is false; the result of set_huge_pmd() when @install_pmd is
 * true; otherwise the code describing why the collapse was not performed.
 */
int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
                            bool install_pmd)
{
        struct mmu_notifier_range range;
        /* True once invalidate_range_start() was issued and needs its end. */
        bool notified = false;
        unsigned long haddr = addr & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vma_lookup(mm, haddr);
        struct folio *folio;
        pte_t *start_pte, *pte;
        pmd_t *pmd, pgt_pmd;
        /* pml stays NULL until the PMD lock has actually been taken. */
        spinlock_t *pml = NULL, *ptl;
        /* nr_ptes counts the PTEs cleared in step 2. */
        int nr_ptes = 0, result = SCAN_FAIL;
        int i;

        mmap_assert_locked(mm);

        /* First check VMA found, in case page tables are being torn down */
        if (!vma || !vma->vm_file ||
            !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
                return SCAN_VMA_CHECK;

        /* Fast check before locking page if already PMD-mapped */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        if (result == SCAN_PMD_MAPPED)
                return result;

        /*
         * If we are here, we've succeeded in replacing all the native pages
         * in the page cache with a single hugepage. If a mm were to fault-in
         * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
         * and map it by a PMD, regardless of sysfs THP settings. As such, let's
         * analogously elide sysfs THP settings here.
         */
        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                return SCAN_VMA_CHECK;

        /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
        if (userfaultfd_wp(vma))
                return SCAN_PTE_UFFD_WP;

        /* From here on the folio is locked and referenced: exit via drop_folio. */
        folio = filemap_lock_folio(vma->vm_file->f_mapping,
                               linear_page_index(vma, haddr));
        if (IS_ERR(folio))
                return SCAN_PAGE_NULL;

        if (folio_order(folio) != HPAGE_PMD_ORDER) {
                result = SCAN_PAGE_COMPOUND;
                goto drop_folio;
        }

        /* Look the pmd up again now that the folio lock is held. */
        result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
        switch (result) {
        case SCAN_SUCCEED:
                break;
        case SCAN_PMD_NONE:
                /*
                 * All pte entries have been removed and pmd cleared.
                 * Skip all the pte checks and just update the pmd mapping.
                 */
                goto maybe_install_pmd;
        default:
                goto drop_folio;
        }

        /* Default for the "goto abort" paths below that set no specific code. */
        result = SCAN_FAIL;
        start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
        if (!start_pte)                /* mmap_lock + page lock should prevent this */
                goto drop_folio;

        /* step 1: check all mapped PTEs are to the right huge page */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
                pte_t ptent = ptep_get(pte);

                /* empty pte, skip */
                if (pte_none(ptent))
                        continue;

                /* page swapped out, abort */
                if (!pte_present(ptent)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto abort;
                }

                page = vm_normal_page(vma, addr, ptent);
                if (WARN_ON_ONCE(page && is_zone_device_page(page)))
                        page = NULL;
                /*
                 * Note that uprobe, debugger, or MAP_PRIVATE may change the
                 * page table, but the new page will not be a subpage of hpage.
                 */
                if (folio_page(folio, i) != page)
                        goto abort;
        }

        /* ptl is dropped before the mmu notifier is started. */
        pte_unmap_unlock(start_pte, ptl);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        notified = true;

        /*
         * pmd_lock covers a wider range than ptl, and (if split from mm's
         * page_table_lock) ptl nests inside pml. The less time we hold pml,
         * the better; but userfaultfd's mfill_atomic_pte() on a private VMA
         * inserts a valid as-if-COWed PTE without even looking up page cache.
         * So page lock of folio does not protect from it, so we must not drop
         * ptl before pgt_pmd is removed, so uffd private needs pml taken now.
         */
        if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
                pml = pmd_lock(mm, pmd);

        start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
        if (!start_pte)                /* mmap_lock + page lock should prevent this */
                goto abort;
        /* Take ptl unless it is the very lock already held as pml. */
        if (!pml)
                spin_lock(ptl);
        else if (ptl != pml)
                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

        /* The pmd entry must still be the one sampled by the nolock map above. */
        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
                goto abort;

        /* step 2: clear page table and adjust rmap */
        for (i = 0, addr = haddr, pte = start_pte;
             i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
                struct page *page;
                pte_t ptent = ptep_get(pte);

                if (pte_none(ptent))
                        continue;
                /*
                 * We dropped ptl after the first scan, to do the mmu_notifier:
                 * page lock stops more PTEs of the folio being faulted in, but
                 * does not stop write faults COWing anon copies from existing
                 * PTEs; and does not stop those being swapped out or migrated.
                 */
                if (!pte_present(ptent)) {
                        result = SCAN_PTE_NON_PRESENT;
                        goto abort;
                }
                page = vm_normal_page(vma, addr, ptent);
                if (folio_page(folio, i) != page)
                        goto abort;

                /*
                 * Must clear entry, or a racing truncate may re-remove it.
                 * TLB flush can be left until pmdp_collapse_flush() does it.
                 * PTE dirty? Shmem page is already dirty; file is read-only.
                 */
                ptep_clear(mm, addr, pte);
                folio_remove_rmap_pte(folio, page, vma);
                nr_ptes++;
        }

        /* Without pml, ptl is released here and retaken nested under pml in step 4. */
        if (!pml)
                spin_unlock(ptl);

        /* step 3: set proper refcount and mm_counters. */
        if (nr_ptes) {
                folio_ref_sub(folio, nr_ptes);
                add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
        }

        /* step 4: remove empty page table */
        if (!pml) {
                pml = pmd_lock(mm, pmd);
                if (ptl != pml) {
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
                        /*
                         * The pmd entry changed while no lock was held.
                         * Step 3 has already settled refcount and counters,
                         * so only flush the TLB and skip the abort accounting.
                         */
                        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
                                flush_tlb_mm(mm);
                                goto unlock;
                        }
                }
        }
        pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd);
        pmdp_get_lockless_sync();
        pte_unmap_unlock(start_pte, ptl);
        if (ptl != pml)
                spin_unlock(pml);

        mmu_notifier_invalidate_range_end(&range);

        /* The detached page table is accounted for and freed via pte_free_defer(). */
        mm_dec_nr_ptes(mm);
        page_table_check_pte_clear_range(mm, haddr, pgt_pmd);
        pte_free_defer(mm, pmd_pgtable(pgt_pmd));

maybe_install_pmd:
        /* step 5: install pmd entry */
        result = install_pmd
                        ? set_huge_pmd(vma, haddr, pmd, &folio->page)
                        : SCAN_SUCCEED;
        goto drop_folio;
abort:
        /*
         * PTEs already cleared in step 2 stay cleared: flush the TLB for
         * them and give back the references and counters they accounted for.
         */
        if (nr_ptes) {
                flush_tlb_mm(mm);
                folio_ref_sub(folio, nr_ptes);
                add_mm_counter(mm, mm_counter_file(folio), -nr_ptes);
        }
unlock:
        /* Release whichever of ptl/pml is held and close the notifier range. */
        if (start_pte)
                pte_unmap_unlock(start_pte, ptl);
        if (pml && pml != ptl)
                spin_unlock(pml);
        if (notified)
                mmu_notifier_invalidate_range_end(&range);
drop_folio:
        folio_unlock(folio);
        folio_put(folio);
        return result;
}

/*
 * retract_page_tables - detach and free the page table that maps @pgoff in
 * each eligible VMA of @mapping.
 *
 * @mapping: address space whose i_mmap interval tree is walked
 * @pgoff:   file page offset; only VMAs in which it lands on an
 *           HPAGE_PMD_SIZE-aligned address with the whole huge range
 *           inside the VMA are considered
 *
 * The walk runs under i_mmap_lock_read(); mmap_lock is not taken here (see
 * the comment before the anon_vma/uffd-wp recheck below).  VMAs that have an
 * anon_vma, are registered with uffd-wp, belong to an exiting mm, or whose
 * pmd does not pass find_pmd_or_thp_or_none() are skipped.
 */
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
        struct vm_area_struct *vma;

        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                struct mmu_notifier_range range;
                struct mm_struct *mm;
                unsigned long addr;
                pmd_t *pmd, pgt_pmd;
                spinlock_t *pml;
                spinlock_t *ptl;
                /* Set once the pmd entry has been cleared for this VMA. */
                bool success = false;

                /*
                 * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
                 * got written to. These VMAs are likely not worth removing
                 * page tables from, as PMD-mapping is likely to be split later.
                 */
                if (READ_ONCE(vma->anon_vma))
                        continue;

                /* Virtual address in this VMA that corresponds to pgoff. */
                addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
                if (addr & ~HPAGE_PMD_MASK ||
                    vma->vm_end < addr + HPAGE_PMD_SIZE)
                        continue;

                mm = vma->vm_mm;
                if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED)
                        continue;

                if (hpage_collapse_test_exit(mm))
                        continue;
                /*
                 * When a vma is registered with uffd-wp, we cannot recycle
                 * the page table because there may be pte markers installed.
                 * Other vmas can still have the same file mapped hugely, but
                 * skip this one: it will always be mapped in small page size
                 * for uffd-wp registered ranges.
                 */
                if (userfaultfd_wp(vma))
                        continue;

                /* PTEs were notified when unmapped; but now for the PMD? */
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                        addr, addr + HPAGE_PMD_SIZE);
                mmu_notifier_invalidate_range_start(&range);

                pml = pmd_lock(mm, pmd);
                /*
                 * The lock of new_folio is still held, we will be blocked in
                 * the page fault path, which prevents the pte entries from
                 * being set again. So even though the old empty PTE page may be
                 * concurrently freed and a new PTE page is filled into the pmd
                 * entry, it is still empty and can be removed.
                 *
                 * So here we only need to recheck if the state of pmd entry
                 * still meets our requirements, rather than checking pmd_same()
                 * like elsewhere.
                 */
                if (check_pmd_state(pmd) != SCAN_SUCCEED)
                        goto drop_pml;
                /* ptl nests inside pml; it is taken only when it is a different lock. */
                ptl = pte_lockptr(mm, pmd);
                if (ptl != pml)
                        spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);

                /*
                 * Huge page lock is still held, so normally the page table
                 * must remain empty; and we have already skipped anon_vma
                 * and userfaultfd_wp() vmas.  But since the mmap_lock is not
                 * held, it is still possible for a racing userfaultfd_ioctl()
                 * to have inserted ptes or markers.  Now that we hold ptlock,
                 * repeating the anon_vma check protects from one category,
                 * and repeating the userfaultfd_wp() check from another.
                 */
                if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
                        pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
                        pmdp_get_lockless_sync();
                        success = true;
                }

                if (ptl != pml)
                        spin_unlock(ptl);
drop_pml:
                spin_unlock(pml);

                /* Always pairs with the invalidate_range_start() above. */
                mmu_notifier_invalidate_range_end(&range);

                /* Accounting and freeing happen after all locks are dropped. */
                if (success) {
                        mm_dec_nr_ptes(mm);
                        page_table_check_pte_clear_range(mm, addr, pgt_pmd);
                        pte_free_defer(mm, pmd_pgtable(pgt_pmd));
                }
        }
        i_mmap_unlock_read(mapping);
}

/**
 * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
 *
 * @mm: process address space where collapse happens
 * @addr: virtual collapse start address
 * @file: file that collapse on
 * @start: collapse start address
 * @cc: collapse context and scratchpad
 *
 * Basic scheme is simple, details are more complex:
 *  - allocate and lock a new huge page;
 *  - scan page cache, locking old pages
 *    + swap/gup in pages if necessary;
 *  - copy data to new page
 *  - handle shmem holes
 *    + re-validate that holes weren't filled by someone else
 *    + check for userfaultfd
 *  - finalize updates to the page cache;
 *  - if replacing succeeds:
 *    + unlock huge page;
 *    + free old pages;
 *  - if replacing failed;
 *    + unlock old pages
 *    + unlock and free huge page;
 */
static int collapse_file(struct mm_struct *mm, unsigned long addr,
                         struct file *file, pgoff_t start,
                         struct collapse_control *cc)
{
        struct address_space *mapping = file->f_mapping;
        struct page *dst;
        struct folio *folio, *tmp, *new_folio;
        pgoff_t index = 0, end = start + HPAGE_PMD_NR;
        LIST_HEAD(pagelist);
        XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
        int nr_none = 0, result = SCAN_SUCCEED;
        bool is_shmem = shmem_file(file);

        VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
        VM_BUG_ON(start & (HPAGE_PMD_NR - 1));

        result = alloc_charge_folio(&new_folio, mm, cc);
        if (result != SCAN_SUCCEED)
                goto out;

        mapping_set_update(&xas, mapping);

        __folio_set_locked(new_folio);
        if (is_shmem)
                __folio_set_swapbacked(new_folio);
        new_folio->index = start;
        new_folio->mapping = mapping;

        /*
         * Ensure we have slots for all the pages in the range.  This is
         * almost certainly a no-op because most of the pages must be present
         */
        do {
                xas_lock_irq(&xas);
                xas_create_range(&xas);
                if (!xas_error(&xas))
                        break;
                xas_unlock_irq(&xas);
                if (!xas_nomem(&xas, GFP_KERNEL)) {
                        result = SCAN_FAIL;
                        goto rollback;
                }
        } while (1);

        for (index = start; index < end;) {
                xas_set(&xas, index);
                folio = xas_load(&xas);

                VM_BUG_ON(index != xas.xa_index);
                if (is_shmem) {
                        if (!folio) {
                                /*
                                 * Stop if extent has been truncated or
                                 * hole-punched, and is now completely
                                 * empty.
                                 */
                                if (index == start) {
                                        if (!xas_next_entry(&xas, end - 1)) {
                                                result = SCAN_TRUNCATED;
                                                goto xa_locked;
                                        }
                                }
                                nr_none++;
                                index++;
                                continue;
                        }

                        if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
                                xas_unlock_irq(&xas);
                                /* swap in or instantiate fallocated page */
                                if (shmem_get_folio(mapping->host, index, 0,
                                                &folio, SGP_NOALLOC)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
                                /* drain lru cache to help folio_isolate_lru() */
                                lru_add_drain();
                        } else if (folio_trylock(folio)) {
                                folio_get(folio);
                                xas_unlock_irq(&xas);
                        } else {
                                result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
                } else {        /* !is_shmem */
                        if (!folio || xa_is_value(folio)) {
                                xas_unlock_irq(&xas);
                                page_cache_sync_readahead(mapping, &file->f_ra,
                                                          file, index,
                                                          end - index);
                                /* drain lru cache to help folio_isolate_lru() */
                                lru_add_drain();
                                folio = filemap_lock_folio(mapping, index);
                                if (IS_ERR(folio)) {
                                        result = SCAN_FAIL;
                                        goto xa_unlocked;
                                }
                        } else if (folio_test_dirty(folio)) {
                                /*
                                 * khugepaged only works on read-only fd,
                                 * so this page is dirty because it hasn't
                                 * been flushed since first write. There
                                 * won't be new dirty pages.
                                 *
                                 * Trigger async flush here and hope the
                                 * writeback is done when khugepaged
                                 * revisits this page.
                                 *
                                 * This is a one-off situation. We are not
                                 * forcing writeback in loop.
                                 */
                                xas_unlock_irq(&xas);
                                filemap_flush(mapping);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
                        } else if (folio_test_writeback(folio)) {
                                xas_unlock_irq(&xas);
                                result = SCAN_FAIL;
                                goto xa_unlocked;
                        } else if (folio_trylock(folio)) {
                                folio_get(folio);
                                xas_unlock_irq(&xas);
                        } else {
                                result = SCAN_PAGE_LOCK;
                                goto xa_locked;
                        }
                }

                /*
                 * The folio must be locked, so we can drop the i_pages lock
                 * without racing with truncate.
                 */
                VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

                /* make sure the folio is up to date */
                if (unlikely(!folio_test_uptodate(folio))) {
                        result = SCAN_FAIL;
                        goto out_unlock;
                }

                /*
                 * If file was truncated then extended, or hole-punched, before
                 * we locked the first folio, then a THP might be there already.
                 * This will be discovered on the first iteration.
                 */
                if (folio_order(folio) == HPAGE_PMD_ORDER &&
                    folio->index == start) {
                        /* Maybe PMD-mapped */
                        result = SCAN_PTE_MAPPED_HUGEPAGE;
                        goto out_unlock;
                }

                if (folio_mapping(folio) != mapping) {
                        result = SCAN_TRUNCATED;
                        goto out_unlock;
                }

                if (!is_shmem && (folio_test_dirty(folio) ||
                                  folio_test_writeback(folio))) {
                        /*
                         * khugepaged only works on read-only fd, so this
                         * folio is dirty because it hasn't been flushed
                         * since first write.
                         */
                        result = SCAN_FAIL;
                        goto out_unlock;
                }

                if (!folio_isolate_lru(folio)) {
                        result = SCAN_DEL_PAGE_LRU;
                        goto out_unlock;
                }

                if (!filemap_release_folio(folio, GFP_KERNEL)) {
                        result = SCAN_PAGE_HAS_PRIVATE;
                        folio_putback_lru(folio);
                        goto out_unlock;
                }

                if (folio_mapped(folio))
                        try_to_unmap(folio,
                                        TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);

                xas_lock_irq(&xas);

                VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);

                /*
                 * We control 2 + nr_pages references to the folio:
                 *  - we hold a pin on it;
                 *  - nr_pages reference from page cache;
                 *  - one from lru_isolate_folio;
                 * If those are the only references, then any new usage
                 * of the folio will have to fetch it from the page
                 * cache. That requires locking the folio to handle
                 * truncate, so any new usage will be blocked until we
                 * unlock folio after collapse/during rollback.
                 */
                if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
                        result = SCAN_PAGE_COUNT;
                        xas_unlock_irq(&xas);
                        folio_putback_lru(folio);
                        goto out_unlock;
                }

                /*
                 * Accumulate the folios that are being collapsed.
                 */
                list_add_tail(&folio->lru, &pagelist);
                index += folio_nr_pages(folio);
                continue;
out_unlock:
                folio_unlock(folio);
                folio_put(folio);
                goto xa_unlocked;
        }

        if (!is_shmem) {
                filemap_nr_thps_inc(mapping);
                /*
                 * Paired with the fence in do_dentry_open() -> get_write_access()
                 * to ensure i_writecount is up to date and the update to nr_thps
                 * is visible. Ensures the page cache will be truncated if the
                 * file is opened writable.
                 */
                smp_mb();
                if (inode_is_open_for_write(mapping->host)) {
                        result = SCAN_FAIL;
                        filemap_nr_thps_dec(mapping);
                }
        }

xa_locked:
        xas_unlock_irq(&xas);
xa_unlocked:

        /*
         * If collapse is successful, flush must be done now before copying.
         * If collapse is unsuccessful, does flush actually need to be done?
         * Do it anyway, to clear the state.
         */
        try_to_unmap_flush();

        if (result == SCAN_SUCCEED && nr_none &&
            !shmem_charge(mapping->host, nr_none))
                result = SCAN_FAIL;
        if (result != SCAN_SUCCEED) {
                nr_none = 0;
                goto rollback;
        }

        /*
         * The old folios are locked, so they won't change anymore.
         */
        index = start;
        dst = folio_page(new_folio, 0);
        list_for_each_entry(folio, &pagelist, lru) {
                int i, nr_pages = folio_nr_pages(folio);

                while (index < folio->index) {
                        clear_highpage(dst);
                        index++;
                        dst++;
                }

                for (i = 0; i < nr_pages; i++) {
                        if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
                                result = SCAN_COPY_MC;
                                goto rollback;
                        }
                        index++;
                        dst++;
                }
        }
        while (index < end) {
                clear_highpage(dst);
                index++;
                dst++;
        }

        if (nr_none) {
                struct vm_area_struct *vma;
                int nr_none_check = 0;

                i_mmap_lock_read(mapping);
                xas_lock_irq(&xas);

                xas_set(&xas, start);
                for (index = start; index < end; index++) {
                        if (!xas_next(&xas)) {
                                xas_store(&xas, XA_RETRY_ENTRY);
                                if (xas_error(&xas)) {
                                        result = SCAN_STORE_FAILED;
                                        goto immap_locked;
                                }
                                nr_none_check++;
                        }
                }

                if (nr_none != nr_none_check) {
                        result = SCAN_PAGE_FILLED;
                        goto immap_locked;
                }

                /*
                 * If userspace observed a missing page in a VMA with
                 * a MODE_MISSING userfaultfd, then it might expect a
                 * UFFD_EVENT_PAGEFAULT for that page. If so, we need to
                 * roll back to avoid suppressing such an event. Since
                 * wp/minor userfaultfds don't give userspace any
                 * guarantees that the kernel doesn't fill a missing
                 * page with a zero page, so they don't matter here.
                 *
                 * Any userfaultfds registered after this point will
                 * not be able to observe any missing pages due to the
                 * previously inserted retry entries.
                 */
                vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
                        if (userfaultfd_missing(vma)) {
                                result = SCAN_EXCEED_NONE_PTE;
                                goto immap_locked;
                        }
                }

immap_locked:
                i_mmap_unlock_read(mapping);
                if (result != SCAN_SUCCEED) {
                        xas_set(&xas, start);
                        for (index = start; index < end; index++) {
                                if (xas_next(&xas) == XA_RETRY_ENTRY)
                                        xas_store(&xas, NULL);
                        }

                        xas_unlock_irq(&xas);
                        goto rollback;
                }
        } else {
                xas_lock_irq(&xas);
        }

        if (is_shmem)
                __lruvec_stat_mod_folio(new_folio, NR_SHMEM_THPS, HPAGE_PMD_NR);
        else
                __lruvec_stat_mod_folio(new_folio, NR_FILE_THPS, HPAGE_PMD_NR);

        if (nr_none) {
                __lruvec_stat_mod_folio(new_folio, NR_FILE_PAGES, nr_none);
                /* nr_none is always 0 for non-shmem. */
                __lruvec_stat_mod_folio(new_folio, NR_SHMEM, nr_none);
        }

        /*
         * Mark new_folio as uptodate before inserting it into the
         * page cache so that it isn't mistaken for an fallocated but
         * unwritten page.
         */
        folio_mark_uptodate(new_folio);
        folio_ref_add(new_folio, HPAGE_PMD_NR - 1);

        if (is_shmem)
                folio_mark_dirty(new_folio);
        folio_add_lru(new_folio);

        /* Join all the small entries into a single multi-index entry. */
        xas_set_order(&xas, start, HPAGE_PMD_ORDER);
        xas_store(&xas, new_folio);
        WARN_ON_ONCE(xas_error(&xas));
        xas_unlock_irq(&xas);

        /*
         * Remove pte page tables, so we can re-fault the page as huge.
         * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp().
         */
        retract_page_tables(mapping, start);
        if (cc && !cc->is_khugepaged)
                result = SCAN_PTE_MAPPED_HUGEPAGE;
        folio_unlock(new_folio);

        /*
         * The collapse has succeeded, so free the old folios.
         */
        list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
                list_del(&folio->lru);
                folio->mapping = NULL;
                folio_clear_active(folio);
                folio_clear_unevictable(folio);
                folio_unlock(folio);
                folio_put_refs(folio, 2 + folio_nr_pages(folio));
        }

        goto out;

rollback:
        /* Something went wrong: roll back page cache changes */
        if (nr_none) {
                xas_lock_irq(&xas);
                mapping->nrpages -= nr_none;
                xas_unlock_irq(&xas);
                shmem_uncharge(mapping->host, nr_none);
        }

        list_for_each_entry_safe(folio, tmp, &pagelist, lru) {
                list_del(&folio->lru);
                folio_unlock(folio);
                folio_putback_lru(folio);
                folio_put(folio);
        }
        /*
         * Undo the updates of filemap_nr_thps_inc for non-SHMEM
         * file only. This undo is not needed unless failure is
         * due to SCAN_COPY_MC.
         */
        if (!is_shmem && result == SCAN_COPY_MC) {
                filemap_nr_thps_dec(mapping);
                /*
                 * Paired with the fence in do_dentry_open() -> get_write_access()
                 * to ensure the update to nr_thps is visible.
                 */
                smp_mb();
        }

        new_folio->mapping = NULL;

        folio_unlock(new_folio);
        folio_put(new_folio);
out:
        VM_BUG_ON(!list_empty(&pagelist));
        trace_mm_khugepaged_collapse_file(mm, new_folio, index, addr, is_shmem, file, HPAGE_PMD_NR, result);
        return result;
}

/*
 * hpage_collapse_scan_file - scan one PMD-sized page cache range and, if it
 * looks suitable, hand it to collapse_file().
 * @mm:    passed through to collapse_file() and to the tracepoint
 * @addr:  passed through to collapse_file()
 * @file:  file whose ->f_mapping page cache is walked
 * @start: first page cache index of the range; entries up to and including
 *         start + HPAGE_PMD_NR - 1 are visited
 * @cc:    collapse control; node_load and alloc_nmask are reset here, and
 *         is_khugepaged decides whether the max_ptes_swap/max_ptes_none
 *         limits are enforced
 *
 * The walk runs under rcu_read_lock().  It stops at the first entry that
 * rules the range out and reports the matching SCAN_* code.  If the walk
 * finishes with SCAN_SUCCEED and the "present" threshold is satisfied (or
 * not enforced), the return value is whatever collapse_file() returns.
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                                    struct file *file, pgoff_t start,
                                    struct collapse_control *cc)
{
        struct folio *folio = NULL;
        struct address_space *mapping = file->f_mapping;
        XA_STATE(xas, &mapping->i_pages, start);
        int present, swap;
        int node = NUMA_NO_NODE;
        int result = SCAN_SUCCEED;

        present = 0;
        swap = 0;
        /* Start the per-node accounting from scratch for this range. */
        memset(cc->node_load, 0, sizeof(cc->node_load));
        nodes_clear(cc->alloc_nmask);
        rcu_read_lock();
        xas_for_each(&xas, folio, start + HPAGE_PMD_NR - 1) {
                if (xas_retry(&xas, folio))
                        continue;

                if (xa_is_value(folio)) {
                        /*
                         * Value entry: account every index it covers
                         * (1 << order) in the swap counter.  The limit is
                         * only enforced for khugepaged.
                         */
                        swap += 1 << xas_get_order(&xas);
                        if (cc->is_khugepaged &&
                            swap > khugepaged_max_ptes_swap) {
                                result = SCAN_EXCEED_SWAP_PTE;
                                count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
                                break;
                        }
                        continue;
                }

                /* A PMD-order folio already sits exactly at @start. */
                if (folio_order(folio) == HPAGE_PMD_ORDER &&
                    folio->index == start) {
                        /* Maybe PMD-mapped */
                        result = SCAN_PTE_MAPPED_HUGEPAGE;
                        /*
                         * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
                         * by the caller won't touch the page cache, and so
                         * it's safe to skip LRU and refcount checks before
                         * returning.
                         */
                        break;
                }

                /* Record which node this folio lives on, unless told to abort. */
                node = folio_nid(folio);
                if (hpage_collapse_scan_abort(node, cc)) {
                        result = SCAN_SCAN_ABORT;
                        break;
                }
                cc->node_load[node]++;

                if (!folio_test_lru(folio)) {
                        result = SCAN_PAGE_LRU;
                        break;
                }

                if (!is_refcount_suitable(folio)) {
                        result = SCAN_PAGE_COUNT;
                        break;
                }

                /*
                 * We probably should check if the folio is referenced
                 * here, but nobody would transfer pte_young() to
                 * folio_test_referenced() for us.  And rmap walk here
                 * is just too costly...
                 */

                present += folio_nr_pages(folio);

                /* Pause the xarray walk before yielding the CPU. */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();

        if (result == SCAN_SUCCEED) {
                /*
                 * khugepaged only: refuse the collapse when too few pages
                 * of the range are present in the page cache.
                 */
                if (cc->is_khugepaged &&
                    present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
                        result = SCAN_EXCEED_NONE_PTE;
                        count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
                } else {
                        result = collapse_file(mm, addr, file, start, cc);
                }
        }

        /* folio is the last entry the walk looked at (NULL if none). */
        trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result);
        return result;
}
#else
/*
 * Stub used in the #else branch of the surrounding conditional.  Its body is
 * only BUILD_BUG(), so it is not meant to survive into a built kernel; the
 * callers visible in this file guard their calls with
 * IS_ENABLED(CONFIG_SHMEM).
 */
static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
                                    struct file *file, pgoff_t start,
                                    struct collapse_control *cc)
{
        BUILD_BUG();
}
#endif

/*
 * khugepaged_scan_mm_slot - scan part of one mm for collapse candidates.
 * @pages:  scan budget; must be non-zero.  Scanning of the current mm stops
 *          once the accumulated progress reaches it.
 * @result: out parameter; set to SCAN_FAIL on entry and then overwritten with
 *          the outcome of each range that gets scanned
 * @cc:     collapse control handed to the scan helpers
 *
 * Must be called with khugepaged_mm_lock held.  The lock is dropped while the
 * mm is scanned and taken again before returning.  The scan position is kept
 * in khugepaged_scan (mm_slot and address) so a later call resumes where this
 * one stopped.
 *
 * Returns the progress made, which is counted in two ways: one unit per
 * skipped VMA or bookkeeping step, and HPAGE_PMD_NR per scanned range.
 */
static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
                                            struct collapse_control *cc)
        __releases(&khugepaged_mm_lock)
        __acquires(&khugepaged_mm_lock)
{
        struct vma_iterator vmi;
        struct khugepaged_mm_slot *mm_slot;
        struct mm_slot *slot;
        struct mm_struct *mm;
        struct vm_area_struct *vma;
        int progress = 0;

        VM_BUG_ON(!pages);
        lockdep_assert_held(&khugepaged_mm_lock);
        *result = SCAN_FAIL;

        /* Resume the slot in progress, or start at the head of the list. */
        if (khugepaged_scan.mm_slot) {
                mm_slot = khugepaged_scan.mm_slot;
                slot = &mm_slot->slot;
        } else {
                slot = list_entry(khugepaged_scan.mm_head.next,
                                     struct mm_slot, mm_node);
                mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                khugepaged_scan.address = 0;
                khugepaged_scan.mm_slot = mm_slot;
        }
        spin_unlock(&khugepaged_mm_lock);

        mm = slot->mm;
        /*
         * Don't wait for semaphore (to avoid long wait times).  Just move to
         * the next mm on the list.
         */
        vma = NULL;
        if (unlikely(!mmap_read_trylock(mm)))
                goto breakouterloop_mmap_lock;

        progress++;
        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                goto breakouterloop;

        /* Walk the VMAs starting from the saved scan address. */
        vma_iter_init(&vmi, mm, khugepaged_scan.address);
        for_each_vma(vmi, vma) {
                unsigned long hstart, hend;

                cond_resched();
                if (unlikely(hpage_collapse_test_exit_or_disable(mm))) {
                        progress++;
                        break;
                }
                if (!thp_vma_allowable_order(vma, vma->vm_flags,
                                        TVA_ENFORCE_SYSFS, PMD_ORDER)) {
                        /* Also reached by goto from the address check below. */
skip:
                        progress++;
                        continue;
                }
                /* Huge-page aligned window that fits inside this VMA. */
                hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
                hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
                if (khugepaged_scan.address > hend)
                        goto skip;
                if (khugepaged_scan.address < hstart)
                        khugepaged_scan.address = hstart;
                VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);

                while (khugepaged_scan.address < hend) {
                        bool mmap_locked = true;

                        cond_resched();
                        if (unlikely(hpage_collapse_test_exit_or_disable(mm)))
                                goto breakouterloop;

                        VM_BUG_ON(khugepaged_scan.address < hstart ||
                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
                                  hend);
                        if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
                                /*
                                 * Non-anonymous VMA: pin the file and work out
                                 * the page cache index while mmap_lock is
                                 * still held, then scan without the lock.
                                 */
                                struct file *file = get_file(vma->vm_file);
                                pgoff_t pgoff = linear_page_index(vma,
                                                khugepaged_scan.address);

                                mmap_read_unlock(mm);
                                mmap_locked = false;
                                *result = hpage_collapse_scan_file(mm,
                                        khugepaged_scan.address, file, pgoff, cc);
                                fput(file);
                                if (*result == SCAN_PTE_MAPPED_HUGEPAGE) {
                                        /*
                                         * Retake mmap_lock for
                                         * collapse_pte_mapped_thp().  On the
                                         * exit/disable path the lock is
                                         * released at breakouterloop.
                                         */
                                        mmap_read_lock(mm);
                                        if (hpage_collapse_test_exit_or_disable(mm))
                                                goto breakouterloop;
                                        *result = collapse_pte_mapped_thp(mm,
                                                khugepaged_scan.address, false);
                                        /* Already PMD mapped counts as success. */
                                        if (*result == SCAN_PMD_MAPPED)
                                                *result = SCAN_SUCCEED;
                                        mmap_read_unlock(mm);
                                }
                        } else {
                                /* May clear mmap_locked if it drops the lock. */
                                *result = hpage_collapse_scan_pmd(mm, vma,
                                        khugepaged_scan.address, &mmap_locked, cc);
                        }

                        if (*result == SCAN_SUCCEED)
                                ++khugepaged_pages_collapsed;

                        /* move to next address */
                        khugepaged_scan.address += HPAGE_PMD_SIZE;
                        progress += HPAGE_PMD_NR;
                        if (!mmap_locked)
                                /*
                                 * We released mmap_lock so break loop.  Note
                                 * that we drop mmap_lock before all hugepage
                                 * allocations, so if allocation fails, we are
                                 * guaranteed to break here and report the
                                 * correct result back to caller.
                                 */
                                goto breakouterloop_mmap_lock;
                        if (progress >= pages)
                                goto breakouterloop;
                }
        }
breakouterloop:
        mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
breakouterloop_mmap_lock:

        spin_lock(&khugepaged_mm_lock);
        VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
        /*
         * Release the current mm_slot if this mm is about to die, or
         * if we scanned all vmas of this mm.
         */
        if (hpage_collapse_test_exit(mm) || !vma) {
                /*
                 * Make sure that if mm_users is reaching zero while
                 * khugepaged runs here, khugepaged_exit will find
                 * mm_slot not pointing to the exiting mm.
                 */
                if (slot->mm_node.next != &khugepaged_scan.mm_head) {
                        /* Advance to the next mm on the list. */
                        slot = list_entry(slot->mm_node.next,
                                          struct mm_slot, mm_node);
                        khugepaged_scan.mm_slot =
                                mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
                        khugepaged_scan.address = 0;
                } else {
                        /* End of the list: one full pass is complete. */
                        khugepaged_scan.mm_slot = NULL;
                        khugepaged_full_scans++;
                }

                collect_mm_slot(mm_slot);
        }

        return progress;
}

/*
 * Returns 1 when the khugepaged mm list is non-empty and
 * hugepage_pmd_enabled() holds, 0 otherwise.
 */
static int khugepaged_has_work(void)
{
        if (list_empty(&khugepaged_scan.mm_head))
                return 0;

        return hugepage_pmd_enabled() ? 1 : 0;
}

/*
 * Wake-up condition for the untimed wait: returns 1 when the mm list is
 * non-empty or the kthread has been asked to stop, 0 otherwise.
 */
static int khugepaged_wait_event(void)
{
        if (!list_empty(&khugepaged_scan.mm_head))
                return 1;

        return kthread_should_stop() ? 1 : 0;
}

/*
 * khugepaged_do_scan - run one scan round.
 * @cc: collapse control handed down to khugepaged_scan_mm_slot()
 *
 * The budget is sampled once from khugepaged_pages_to_scan.  The round ends
 * when the budget is used up, when there is no work left, when the head of
 * the mm list has been passed twice, when the kthread should stop, or on the
 * second SCAN_ALLOC_HUGE_PAGE_FAIL.  The first such failure only triggers
 * khugepaged_alloc_sleep().
 */
static void khugepaged_do_scan(struct collapse_control *cc)
{
        unsigned int budget = READ_ONCE(khugepaged_pages_to_scan);
        unsigned int scanned = 0;
        unsigned int head_passes = 0;
        bool may_sleep = true;
        int result = SCAN_SUCCEED;

        lru_add_drain_all();

        for (;;) {
                cond_resched();

                if (unlikely(kthread_should_stop()))
                        return;

                spin_lock(&khugepaged_mm_lock);
                /* No slot in progress means we are about to start at the head. */
                if (!khugepaged_scan.mm_slot)
                        head_passes++;
                if (!khugepaged_has_work() || head_passes >= 2)
                        scanned = budget;
                else
                        scanned += khugepaged_scan_mm_slot(budget - scanned,
                                                           &result, cc);
                spin_unlock(&khugepaged_mm_lock);

                if (scanned >= budget)
                        return;

                if (result != SCAN_ALLOC_HUGE_PAGE_FAIL)
                        continue;

                /*
                 * First allocation failure: sleep for a while and retry.
                 * Second one: give up on this round.
                 */
                if (!may_sleep)
                        return;
                may_sleep = false;
                khugepaged_alloc_sleep();
        }
}

static bool khugepaged_should_wakeup(void)
{
        return kthread_should_stop() ||
               time_after_eq(jiffies, khugepaged_sleep_expire);
}

/*
 * Sleep between scan rounds.
 *
 * With work pending, sleep for khugepaged_scan_sleep_millisecs (no sleep at
 * all when that converts to zero jiffies).  Without work, and only if
 * hugepage_pmd_enabled(), wait until khugepaged_wait_event() becomes true.
 */
static void khugepaged_wait_work(void)
{
        unsigned long timeout;

        if (!khugepaged_has_work()) {
                if (hugepage_pmd_enabled())
                        wait_event_freezable(khugepaged_wait,
                                             khugepaged_wait_event());
                return;
        }

        timeout = msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
        if (!timeout)
                return;

        /* Record the deadline that khugepaged_should_wakeup() checks. */
        khugepaged_sleep_expire = jiffies + timeout;
        wait_event_freezable_timeout(khugepaged_wait,
                                     khugepaged_should_wakeup(),
                                     timeout);
}

/*
 * Thread function of the khugepaged kthread.
 * @none: unused
 *
 * Marks the thread freezable and sets its nice value to MAX_NICE.  It then
 * alternates between a scan round and a wait until asked to stop.  On the
 * way out, any mm slot still referenced by khugepaged_scan is detached and
 * collected under khugepaged_mm_lock.  Always returns 0.
 */
static int khugepaged(void *none)
{
        struct khugepaged_mm_slot *leftover;

        set_freezable();
        set_user_nice(current, MAX_NICE);

        for (;;) {
                if (kthread_should_stop())
                        break;

                khugepaged_do_scan(&khugepaged_collapse_control);
                khugepaged_wait_work();
        }

        spin_lock(&khugepaged_mm_lock);
        leftover = khugepaged_scan.mm_slot;
        khugepaged_scan.mm_slot = NULL;
        if (leftover)
                collect_mm_slot(leftover);
        spin_unlock(&khugepaged_mm_lock);

        return 0;
}

/*
 * Raise min_free_kbytes to a value that helps huge page allocations, then
 * refresh the per-zone watermarks.
 *
 * When hugepage_pmd_enabled() is false, the value is recomputed through
 * calculate_min_free_kbytes() instead.  setup_per_zone_wmarks() runs in both
 * cases.
 */
static void set_recommended_min_free_kbytes(void)
{
        unsigned long recommended_min;
        struct zone *zone;
        int nr_zones = 0;

        if (hugepage_pmd_enabled()) {
                /*
                 * Count populated zones up to gfp_zone(GFP_USER).  We don't
                 * need to worry about fragmentation of ZONE_MOVABLE since
                 * it only has movable pages.
                 */
                for_each_populated_zone(zone) {
                        if (zone_idx(zone) <= gfp_zone(GFP_USER))
                                nr_zones++;
                }

                /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
                recommended_min = pageblock_nr_pages * nr_zones * 2;

                /*
                 * Make sure that on average at least two pageblocks are
                 * almost free of another type: one for a migratetype to
                 * fall back to, and a second to avoid subsequent fallbacks
                 * of other types.  There are 3 MIGRATE_TYPES we care about.
                 */
                recommended_min += pageblock_nr_pages * nr_zones *
                                   MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;

                /* Never reserve more than 5% of the lowmem. */
                recommended_min = min(recommended_min,
                                      (unsigned long) nr_free_buffer_pages() / 20);
                /* Convert from pages to kilobytes. */
                recommended_min <<= (PAGE_SHIFT-10);

                if (recommended_min > min_free_kbytes) {
                        if (user_min_free_kbytes >= 0)
                                pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
                                        min_free_kbytes, recommended_min);

                        min_free_kbytes = recommended_min;
                }
        } else {
                calculate_min_free_kbytes();
        }

        setup_per_zone_wmarks();
}

/*
 * start_stop_khugepaged - bring the khugepaged kthread in line with the
 * current hugepage_pmd_enabled() state.
 *
 * When enabled, the thread is created if it does not exist yet and woken if
 * the mm list is non-empty.  When disabled, a running thread is stopped.
 * Unless thread creation failed, min_free_kbytes is then re-evaluated.
 * Everything runs under khugepaged_mutex.
 *
 * Returns 0 on success or the error from kthread_run().
 */
int start_stop_khugepaged(void)
{
        int err = 0;

        mutex_lock(&khugepaged_mutex);

        if (!hugepage_pmd_enabled()) {
                if (khugepaged_thread) {
                        kthread_stop(khugepaged_thread);
                        khugepaged_thread = NULL;
                }
        } else {
                if (!khugepaged_thread)
                        khugepaged_thread = kthread_run(khugepaged, NULL,
                                                        "khugepaged");
                if (IS_ERR(khugepaged_thread)) {
                        pr_err("khugepaged: kthread_run(khugepaged) failed\n");
                        err = PTR_ERR(khugepaged_thread);
                        khugepaged_thread = NULL;
                        goto out_unlock;
                }

                if (!list_empty(&khugepaged_scan.mm_head))
                        wake_up_interruptible(&khugepaged_wait);
        }

        set_recommended_min_free_kbytes();

out_unlock:
        mutex_unlock(&khugepaged_mutex);
        return err;
}

/*
 * Re-evaluate the recommended min_free_kbytes, but only while
 * hugepage_pmd_enabled() holds and the khugepaged thread exists.
 * Serialised by khugepaged_mutex.
 */
void khugepaged_min_free_kbytes_update(void)
{
        mutex_lock(&khugepaged_mutex);
        if (hugepage_pmd_enabled()) {
                if (khugepaged_thread)
                        set_recommended_min_free_kbytes();
        }
        mutex_unlock(&khugepaged_mutex);
}

bool current_is_khugepaged(void)
{
        return kthread_func(current) == khugepaged;
}

/*
 * Translate a scan result into the errno reported by MADV_COLLAPSE.
 * @r: scan result of the last failed range
 *
 * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
 * actionable feedback to the caller, so it can pick a fallback that fits the
 * kind of failure:
 *   -ENOMEM  huge page allocation failed
 *   -EBUSY   cgroup charge failed or too many none PTEs
 *   -EAGAIN  resource temporarily unavailable; trying again might succeed
 *   -EINVAL  anything else: retrying is unlikely to help, the error is
 *            intrinsic to the range, and khugepaged likely won't be able to
 *            collapse it either
 */
static int madvise_collapse_errno(enum scan_result r)
{
        int err;

        switch (r) {
        case SCAN_ALLOC_HUGE_PAGE_FAIL:
                err = -ENOMEM;
                break;
        case SCAN_CGROUP_CHARGE_FAIL:
        case SCAN_EXCEED_NONE_PTE:
                err = -EBUSY;
                break;
        case SCAN_PAGE_COUNT:
        case SCAN_PAGE_LOCK:
        case SCAN_PAGE_LRU:
        case SCAN_DEL_PAGE_LRU:
        case SCAN_PAGE_FILLED:
                err = -EAGAIN;
                break;
        default:
                err = -EINVAL;
                break;
        }

        return err;
}

/*
 * madvise_collapse - try to collapse every huge-page aligned range inside
 * [start, end) of @vma.
 * @vma:   VMA that must fully contain [start, end) (enforced with BUG_ON)
 * @prev:  out parameter; set to @vma on entry and to NULL once mmap_lock has
 *         been dropped at any point
 * @start: start address of the range
 * @end:   end address of the range
 *
 * mmap_lock is treated as held on entry (mmap_locked starts out true) and is
 * held again on return, although it may be dropped and retaken in between.
 *
 * Returns -EINVAL if the VMA does not allow PMD-order THP and -ENOMEM if the
 * collapse_control cannot be allocated.  Otherwise returns 0 when every
 * aligned range ended as SCAN_SUCCEED or SCAN_PMD_MAPPED, and
 * madvise_collapse_errno() of the last failure when not.
 */
int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
                     unsigned long start, unsigned long end)
{
        struct collapse_control *cc;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long hstart, hend, addr;
        int thps = 0, last_fail = SCAN_FAIL;
        bool mmap_locked = true;

        BUG_ON(vma->vm_start > start);
        BUG_ON(vma->vm_end < end);

        *prev = vma;

        if (!thp_vma_allowable_order(vma, vma->vm_flags, 0, PMD_ORDER))
                return -EINVAL;

        cc = kmalloc(sizeof(*cc), GFP_KERNEL);
        if (!cc)
                return -ENOMEM;
        /* node_load and alloc_nmask are initialised per range in the loop. */
        cc->is_khugepaged = false;

        /* Keep the mm_struct alive across the points where mmap_lock is dropped. */
        mmgrab(mm);
        lru_add_drain_all();

        /* Round start up and end down to huge-page boundaries. */
        hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = end & HPAGE_PMD_MASK;

        for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
                int result = SCAN_FAIL;

                if (!mmap_locked) {
                        /*
                         * The previous iteration dropped mmap_lock, so the
                         * VMA must be looked up and validated again.
                         */
                        cond_resched();
                        mmap_read_lock(mm);
                        mmap_locked = true;
                        result = hugepage_vma_revalidate(mm, addr, false, &vma,
                                                         cc);
                        if (result  != SCAN_SUCCEED) {
                                last_fail = result;
                                goto out_nolock;
                        }

                        /* The revalidated VMA may end earlier than before. */
                        hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
                }
                mmap_assert_locked(mm);
                memset(cc->node_load, 0, sizeof(cc->node_load));
                nodes_clear(cc->alloc_nmask);
                if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) {
                        /*
                         * Non-anonymous VMA: pin the file and compute the page
                         * cache index under mmap_lock, then scan without it.
                         */
                        struct file *file = get_file(vma->vm_file);
                        pgoff_t pgoff = linear_page_index(vma, addr);

                        mmap_read_unlock(mm);
                        mmap_locked = false;
                        result = hpage_collapse_scan_file(mm, addr, file, pgoff,
                                                          cc);
                        fput(file);
                } else {
                        /* May clear mmap_locked if it drops the lock. */
                        result = hpage_collapse_scan_pmd(mm, vma, addr,
                                                         &mmap_locked, cc);
                }
                if (!mmap_locked)
                        *prev = NULL;  /* Tell caller we dropped mmap_lock */

                /* Re-entered after collapse_pte_mapped_thp() yields a new result. */
handle_result:
                switch (result) {
                case SCAN_SUCCEED:
                case SCAN_PMD_MAPPED:
                        ++thps;
                        break;
                case SCAN_PTE_MAPPED_HUGEPAGE:
                        BUG_ON(mmap_locked);
                        BUG_ON(*prev);
                        mmap_read_lock(mm);
                        result = collapse_pte_mapped_thp(mm, addr, true);
                        mmap_read_unlock(mm);
                        goto handle_result;
                /* Whitelisted set of results where continuing OK */
                case SCAN_PMD_NULL:
                case SCAN_PTE_NON_PRESENT:
                case SCAN_PTE_UFFD_WP:
                case SCAN_PAGE_RO:
                case SCAN_LACK_REFERENCED_PAGE:
                case SCAN_PAGE_NULL:
                case SCAN_PAGE_COUNT:
                case SCAN_PAGE_LOCK:
                case SCAN_PAGE_COMPOUND:
                case SCAN_PAGE_LRU:
                case SCAN_DEL_PAGE_LRU:
                        last_fail = result;
                        break;
                default:
                        last_fail = result;
                        /* Other error, exit */
                        goto out_maybelock;
                }
        }

out_maybelock:
        /* Caller expects us to hold mmap_lock on return */
        if (!mmap_locked)
                mmap_read_lock(mm);
out_nolock:
        mmap_assert_locked(mm);
        mmdrop(mm);
        kfree(cc);

        /* Success only if every aligned range in [hstart, hend) was collapsed. */
        return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
                        : madvise_collapse_errno(last_fail);
}






























































































































































































































































































































   21 





















   21 











   21 















   21 








   21 















   21 



   21 












   21 








   21 





































































































































  122 
    1 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
 * Copyright (C) 2017-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright Matt Mackall <mpm@selenic.com>, 2003, 2004, 2005
 * Copyright Theodore Ts'o, 1994, 1995, 1996, 1997, 1998, 1999. All rights reserved.
 *
 * This driver produces cryptographically secure pseudorandom data. It is divided
 * into roughly six sections, each with a section header:
 *
 *   - Initialization and readiness waiting.
 *   - Fast key erasure RNG, the "crng".
 *   - Entropy accumulation and extraction routines.
 *   - Entropy collection routines.
 *   - Userspace reader/writer interfaces.
 *   - Sysctl interface.
 *
 * The high level overview is that there is one input pool, into which
 * various pieces of data are hashed. Prior to initialization, some of that
 * data is then "credited" as having a certain number of bits of entropy.
 * When enough bits of entropy are available, the hash is finalized and
 * handed as a key to a stream cipher that expands it indefinitely for
 * various consumers. This key is periodically refreshed as the various
 * entropy collectors, described below, add data to the input pool.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/utsname.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/workqueue.h>
#include <linux/irq.h>
#include <linux/ratelimit.h>
#include <linux/syscalls.h>
#include <linux/completion.h>
#include <linux/uuid.h>
#include <linux/uaccess.h>
#include <linux/suspend.h>
#include <linux/siphash.h>
#include <linux/sched/isolation.h>
#include <crypto/chacha.h>
#include <crypto/blake2s.h>
#ifdef CONFIG_VDSO_GETRANDOM
#include <vdso/getrandom.h>
#include <vdso/datapage.h>
#include <vdso/vsyscall.h>
#endif
#include <asm/archrandom.h>
#include <asm/processor.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/io.h>

/*********************************************************************
 *
 * Initialization and readiness waiting.
 *
 * Much of the RNG infrastructure is devoted to various dependencies
 * being able to wait until the RNG has collected enough entropy and
 * is ready for safe consumption.
 *
 *********************************************************************/

/*
 * crng_init is protected by base_crng.lock, and only increases
 * its value (from empty->early->ready).
 */
static enum {
        CRNG_EMPTY = 0, /* Little to no entropy collected */
        CRNG_EARLY = 1, /* At least POOL_EARLY_BITS collected */
        CRNG_READY = 2  /* Fully initialized with POOL_READY_BITS collected */
} crng_init __read_mostly = CRNG_EMPTY;
/* Static branch that starts out false; crng_set_ready() enables it. */
static DEFINE_STATIC_KEY_FALSE(crng_is_ready);
/*
 * True once either the static key has been enabled or crng_init has reached
 * CRNG_READY. The second test covers the time during which crng_init is
 * already CRNG_READY but the static key has not been switched on yet.
 */
#define crng_ready() (static_branch_likely(&crng_is_ready) || crng_init >= CRNG_READY)
/*
 * Various types of waiters for crng_init->CRNG_READY transition:
 * crng_init_wait is slept on by wait_for_random_bytes(), and
 * random_ready_notifier holds the callbacks registered through
 * execute_with_initialized_rng(). The fasync list is presumably for
 * userspace SIGIO-style notification; its users are further down the file.
 */
static DECLARE_WAIT_QUEUE_HEAD(crng_init_wait);
static struct fasync_struct *fasync;
static ATOMIC_NOTIFIER_HEAD(random_ready_notifier);

/*
 * Control how we warn userspace.
 *
 * urandom_warning is the rate limiter for those warnings; it is set up with
 * HZ and 3 as its limits (presumably interval and burst, in that order) and
 * with RATELIMIT_MSG_ON_RELEASE.
 *
 * ratelimit_disable defaults to 1 when CONFIG_WARN_ALL_UNSEEDED_RANDOM is
 * enabled and to 0 otherwise, and is exposed as a module parameter with
 * mode 0644.
 */
static struct ratelimit_state urandom_warning =
        RATELIMIT_STATE_INIT_FLAGS("urandom_warning", HZ, 3, RATELIMIT_MSG_ON_RELEASE);
static int ratelimit_disable __read_mostly =
        IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM);
module_param_named(ratelimit_disable, ratelimit_disable, int, 0644);
MODULE_PARM_DESC(ratelimit_disable, "Disable random ratelimit suppression");

/*
 * Report whether the input pool has been seeded, i.e. whether the RNG is
 * guaranteed to hand out cryptographically secure random numbers. The
 * answer covers the /dev/urandom device, get_random_bytes(), and the
 * get_random_{u8,u16,u32,u64,long} family of functions.
 *
 * This is a non-blocking snapshot of crng_ready().
 *
 * Returns: true if the input pool has been seeded.
 *          false if the input pool has not been seeded.
 */
bool rng_is_initialized(void)
{
        const bool seeded = crng_ready();

        return seeded;
}
EXPORT_SYMBOL(rng_is_initialized);

/*
 * Switch on the crng_is_ready static key, so that crng_ready() is satisfied
 * by its static-branch test alone.
 *
 * @work is not used by the body; the signature matches a work item callback,
 * so this is presumably run from a workqueue (the scheduling site is not in
 * this part of the file).
 */
static void __cold crng_set_ready(struct work_struct *work)
{
        static_branch_enable(&crng_is_ready);
}

/* Used by wait_for_random_bytes(), and considered an entropy collector, below. */
static void try_to_generate_entropy(void);

/*
 * Block until the input pool is seeded, so that the RNG is guaranteed to
 * supply cryptographically secure random numbers. This matters for the
 * /dev/urandom device, get_random_bytes(), and the
 * get_random_{u8,u16,u32,u64,long} family of functions: using any of them
 * without first calling this function forfeits the guarantee of security.
 *
 * While waiting, the function actively tries to generate entropy itself
 * before each sleep of at most HZ jiffies.
 *
 * Returns: 0 if the input pool has been seeded.
 *          -ERESTARTSYS if the function was interrupted by a signal.
 */
int wait_for_random_bytes(void)
{
        int status;

        while (!crng_ready()) {
                try_to_generate_entropy();
                status = wait_event_interruptible_timeout(crng_init_wait,
                                                          crng_ready(), HZ);
                /* Negative: the sleep was interrupted; hand the error back. */
                if (status < 0)
                        return status;
                /* Positive: the crng became ready while we were waiting. */
                if (status > 0)
                        return 0;
                /* Zero: not ready yet, so try collecting entropy again. */
        }
        return 0;
}
EXPORT_SYMBOL(wait_for_random_bytes);

/*
 * Arrange for the notifier block's callback to run once the crng is
 * initialised: if it already is, the callback is invoked right away (with
 * action 0 and a NULL data pointer); otherwise the block is added to the
 * random_ready_notifier chain. Both cases happen under the chain's spinlock
 * with interrupts disabled, so the readiness check and the registration
 * cannot be separated.
 *
 * Only use this if you are absolutely sure it is required. Most users should
 * instead be able to test `rng_is_initialized()` on demand, or make use of
 * `get_random_bytes_wait()`.
 *
 * Returns: 0 if the callback was invoked immediately, otherwise the result
 *          of raw_notifier_chain_register().
 */
int __cold execute_with_initialized_rng(struct notifier_block *nb)
{
        struct raw_notifier_head *chain =
                (struct raw_notifier_head *)&random_ready_notifier.head;
        unsigned long irq_flags;
        int err = 0;

        spin_lock_irqsave(&random_ready_notifier.lock, irq_flags);
        if (!crng_ready())
                err = raw_notifier_chain_register(chain, nb);
        else
                nb->notifier_call(nb, 0, NULL);
        spin_unlock_irqrestore(&random_ready_notifier.lock, irq_flags);

        return err;
}

/*
 * When CONFIG_WARN_ALL_UNSEEDED_RANDOM is enabled and the crng is not yet
 * ready, print a deferred notice naming the calling function, its return
 * address and the current crng_init value.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement: a bare `if` here would capture the `else` of any
 * if/else the macro is used in.
 */
#define warn_unseeded_randomness() \
        do { \
                if (IS_ENABLED(CONFIG_WARN_ALL_UNSEEDED_RANDOM) && !crng_ready()) \
                        printk_deferred(KERN_NOTICE "random: %s called from %pS with crng_init=%d\n", \
                                        __func__, (void *)_RET_IP_, crng_init); \
        } while (0)


/*********************************************************************
 *
 * Fast key erasure RNG, the "crng".
 *
 * These functions expand entropy from the entropy extractor into
 * long streams for external consumption using the "fast key erasure"
 * RNG described at <https://blog.cr.yp.to/20170723-random.html>.
 *
 * There are a few exported interfaces for use by other drivers:
 *
 *        void get_random_bytes(void *buf, size_t len)
 *        u8 get_random_u8()
 *        u16 get_random_u16()
 *        u32 get_random_u32()
 *        u32 get_random_u32_below(u32 ceil)
 *        u32 get_random_u32_above(u32 floor)
 *        u32 get_random_u32_inclusive(u32 floor, u32 ceil)
 *        u64 get_random_u64()
 *        unsigned long get_random_long()
 *
 * These interfaces will return the requested number of random bytes
 * into the given buffer or as a return value. This is equivalent to
 * a read from /dev/urandom. The u8, u16, u32, u64, long family of
 * functions may be higher performance for one-off random integers,
 * because they do a bit of buffering and do not invoke reseeding
 * until the buffer is emptied.
 *
 *********************************************************************/

/*
 * Reseed timing, in jiffies. crng_reseed_interval() returns
 * CRNG_RESEED_INTERVAL in the normal case and never returns less than
 * CRNG_RESEED_START_INTERVAL during early boot.
 */
enum {
        CRNG_RESEED_START_INTERVAL = HZ,
        CRNG_RESEED_INTERVAL = 60 * HZ
};

/*
 * The base crng. crng_reseed() installs a fresh key here and bumps the
 * generation; the per-cpu crngs rekey themselves from it whenever their own
 * generation differs.
 */
static struct {
        /* Current key; overwritten by every fast key erasure done on it. */
        u8 key[CHACHA_KEY_SIZE] __aligned(__alignof__(long));
        /* Reseed counter; crng_reseed() never lets it take the value ULONG_MAX. */
        unsigned long generation;
        /* Guards key and generation, and also crng_init. */
        spinlock_t lock;
} base_crng = {
        .lock = __SPIN_LOCK_UNLOCKED(base_crng.lock)
};

/* Per-cpu crng state, rekeyed from base_crng by crng_make_state(). */
struct crng {
        /* This CPU's key; replaced on every fast key erasure. */
        u8 key[CHACHA_KEY_SIZE];
        /* The base_crng.generation value this key was last derived under. */
        unsigned long generation;
        /* Local lock taken (with interrupts off) around use of this state. */
        local_lock_t lock;
};

/*
 * The generation starts out as ULONG_MAX, a value base_crng.generation is
 * never given by crng_reseed(), so the first use on each CPU sees a mismatch
 * and pulls a key from base_crng.
 */
static DEFINE_PER_CPU(struct crng, crngs) = {
        .generation = ULONG_MAX,
        .lock = INIT_LOCAL_LOCK(crngs.lock),
};

/*
 * Compute how long to wait before the next reseed, in jiffies.
 *
 * Once the early-boot phase is over this is always CRNG_RESEED_INTERVAL.
 * During early boot, i.e. until the uptime reaches twice that interval, the
 * delay is half the uptime (whole seconds, converted to jiffies), but never
 * less than CRNG_RESEED_START_INTERVAL. The early-boot flag is cleared the
 * first time the uptime threshold is seen and is never set again.
 */
static unsigned int crng_reseed_interval(void)
{
        static bool early_boot = true;
        time64_t uptime;

        if (likely(!READ_ONCE(early_boot)))
                return CRNG_RESEED_INTERVAL;

        uptime = ktime_get_seconds();
        if (uptime < CRNG_RESEED_INTERVAL / HZ * 2)
                return max_t(unsigned int, CRNG_RESEED_START_INTERVAL,
                             (unsigned int)uptime / 2 * HZ);

        WRITE_ONCE(early_boot, false);
        return CRNG_RESEED_INTERVAL;
}

/* Used by crng_reseed() and crng_make_state() to extract a new seed from the input pool. */
static void extract_entropy(void *buf, size_t len);

/*
 * This extracts a new crng key from the input pool and installs it in
 * base_crng, advancing the generation so that the per-cpu crngs notice.
 *
 * The function is also the callback of its own delayed work item
 * (next_reseed), which it re-queues on every call; @work is not used by
 * the body.
 */
static void crng_reseed(struct work_struct *work)
{
        static DECLARE_DELAYED_WORK(next_reseed, crng_reseed);
        unsigned long flags;
        unsigned long next_gen;
        u8 key[CHACHA_KEY_SIZE];

        /* Immediately schedule the next reseeding, so that it fires sooner rather than later. */
        if (likely(system_unbound_wq))
                queue_delayed_work(system_unbound_wq, &next_reseed, crng_reseed_interval());

        /* The new key is extracted into a stack buffer before base_crng.lock is taken. */
        extract_entropy(key, sizeof(key));

        /*
         * We copy the new key into the base_crng, overwriting the old one,
         * and update the generation counter. We avoid hitting ULONG_MAX,
         * because the per-cpu crngs are initialized to ULONG_MAX, so this
         * forces new CPUs that come online to always initialize.
         */
        spin_lock_irqsave(&base_crng.lock, flags);
        memcpy(base_crng.key, key, sizeof(base_crng.key));
        next_gen = base_crng.generation + 1;
        if (next_gen == ULONG_MAX)
                ++next_gen;
        WRITE_ONCE(base_crng.generation, next_gen);
#ifdef CONFIG_VDSO_GETRANDOM
        /* base_crng.generation's invalid value is ULONG_MAX, while
         * vdso_k_rng_data->generation's invalid value is 0, so add one to the
         * former to arrive at the latter. Use smp_store_release so that this
         * is ordered with the write above to base_crng.generation. Pairs with
         * the smp_rmb() before the syscall in the vDSO code.
         *
         * Cast to unsigned long for 32-bit architectures, since atomic 64-bit
         * operations are not supported on those architectures. This is safe
         * because base_crng.generation is a 32-bit value. On big-endian
         * architectures it will be stored in the upper 32 bits, but that's okay
         * because the vDSO side only checks whether the value changed, without
         * actually using or interpreting the value.
         */
        smp_store_release((unsigned long *)&vdso_k_rng_data->generation, next_gen + 1);
#endif
        /*
         * Until the static key is enabled, crng_ready() depends on crng_init,
         * so mark the crng ready here, while base_crng.lock is held.
         */
        if (!static_branch_likely(&crng_is_ready))
                crng_init = CRNG_READY;
        spin_unlock_irqrestore(&base_crng.lock, flags);
        /* Do not leave a copy of the new key on the stack. */
        memzero_explicit(key, sizeof(key));
}

/*
 * Perform one fast key erasure step on @key.
 *
 * A ChaCha state is built from the constants, @key (placed starting at
 * word 4) and zeroed words 12..15, and a single block is generated from
 * it. The first CHACHA_KEY_SIZE bytes of that block immediately replace
 * @key; the first @random_data_len of the bytes that follow are copied to
 * @random_data for the caller to use. @random_data_len may not be greater
 * than 32.
 *
 * @chacha_state is handed back to the caller and still contains a copy of
 * the old key value, at index 4, so the state should always be zeroed out
 * immediately after use in order to maintain forward secrecy. If the state
 * cannot be erased in a timely manner, then it is safer to pass
 * &chacha_state[4] as @random_data, so that this function overwrites the
 * old key before returning.
 */
static void crng_fast_key_erasure(u8 key[CHACHA_KEY_SIZE],
                                  u32 chacha_state[CHACHA_STATE_WORDS],
                                  u8 *random_data, size_t random_data_len)
{
        u8 block[CHACHA_BLOCK_SIZE];
        const u8 *next_key = block;
        const u8 *spare = block + CHACHA_KEY_SIZE;

        BUG_ON(random_data_len > 32);

        chacha_init_consts(chacha_state);
        memcpy(&chacha_state[4], key, CHACHA_KEY_SIZE);
        memset(&chacha_state[12], 0, 4 * sizeof(u32));
        chacha20_block(chacha_state, block);

        /* Erase the old key first, then hand out the rest of the block. */
        memcpy(key, next_key, CHACHA_KEY_SIZE);
        memcpy(random_data, spare, random_data_len);
        memzero_explicit(block, sizeof(block));
}

/*
 * This function returns a ChaCha state that you may use for generating
 * random data. It also returns up to 32 bytes on its own of random data
 * that may be used; random_data_len may not be greater than 32.
 *
 * @chacha_state:    filled in by crng_fast_key_erasure(); per that
 *                   function's contract it holds a copy of the key that was
 *                   just erased, so the caller must zero it when done.
 * @random_data:     receives @random_data_len bytes of output.
 * @random_data_len: number of bytes wanted, at most 32 (enforced by BUG_ON).
 *
 * Before the crng is ready, the key is taken from base_crng under
 * base_crng.lock. Afterwards the per-cpu crng is used under its local lock,
 * after first being rekeyed from base_crng if its generation differs.
 */
static void crng_make_state(u32 chacha_state[CHACHA_STATE_WORDS],
                            u8 *random_data, size_t random_data_len)
{
        unsigned long flags;
        struct crng *crng;

        BUG_ON(random_data_len > 32);

        /*
         * For the fast path, we check whether we're ready, unlocked first, and
         * then re-check once locked later. In the case where we're really not
         * ready, we do fast key erasure with the base_crng directly, extracting
         * when crng_init is CRNG_EMPTY.
         */
        if (!crng_ready()) {
                bool ready;

                spin_lock_irqsave(&base_crng.lock, flags);
                ready = crng_ready();
                if (!ready) {
                        if (crng_init == CRNG_EMPTY)
                                extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_fast_key_erasure(base_crng.key, chacha_state,
                                              random_data, random_data_len);
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
                /* Not ready: the output above came from base_crng, so we are done. */
                if (!ready)
                        return;
        }

        local_lock_irqsave(&crngs.lock, flags);
        crng = raw_cpu_ptr(&crngs);

        /*
         * If our per-cpu crng is older than the base_crng, then it means
         * somebody reseeded the base_crng. In that case, we do fast key
         * erasure on the base_crng, and use its output as the new key
         * for our per-cpu crng. This brings us up to date with base_crng.
         */
        if (unlikely(crng->generation != READ_ONCE(base_crng.generation))) {
                spin_lock(&base_crng.lock);
                crng_fast_key_erasure(base_crng.key, chacha_state,
                                      crng->key, sizeof(crng->key));
                crng->generation = base_crng.generation;
                spin_unlock(&base_crng.lock);
        }

        /*
         * Finally, when we've made it this far, our per-cpu crng has an up
         * to date key, and we can do fast key erasure with it to produce
         * some random data and a ChaCha state for the caller. All other
         * branches of this function are "unlikely", so most of the time we
         * should wind up here immediately.
         */
        crng_fast_key_erasure(crng->key, chacha_state, random_data, random_data_len);
        local_unlock_irqrestore(&crngs.lock, flags);
}

/*
 * Fill @buf with @len random bytes, without any unseeded-use warning.
 *
 * Up to the first 32 bytes come straight out of crng_make_state(). Anything
 * beyond that is produced by running the returned ChaCha state: whole blocks
 * are generated directly into the destination, and a trailing partial block
 * goes through a stack buffer that is wiped afterwards. The ChaCha state is
 * wiped before returning.
 */
static void _get_random_bytes(void *buf, size_t len)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 tmp[CHACHA_BLOCK_SIZE];
        u8 *out = buf;
        size_t head_len;

        if (!len)
                return;

        head_len = min_t(size_t, 32, len);
        crng_make_state(chacha_state, out, head_len);
        out += head_len;
        len -= head_len;

        /* Whole blocks are written straight into the caller's buffer. */
        while (len >= CHACHA_BLOCK_SIZE) {
                chacha20_block(chacha_state, out);
                /* Word 12 reading zero after a block is carried into word 13. */
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];
                out += CHACHA_BLOCK_SIZE;
                len -= CHACHA_BLOCK_SIZE;
        }

        /* A final short piece is cut from one more block via a scratch buffer. */
        if (len) {
                chacha20_block(chacha_state, tmp);
                memcpy(out, tmp, len);
                memzero_explicit(tmp, sizeof(tmp));
        }

        memzero_explicit(chacha_state, sizeof(chacha_state));
}

/*
 * This returns random bytes in arbitrary quantities. The quality of the
 * random bytes is as good as /dev/urandom. In order to ensure that the
 * randomness provided by this function is okay, the function
 * wait_for_random_bytes() should be called and return 0 at least once
 * at any point prior.
 *
 * @buf: destination buffer.
 * @len: number of bytes to write to @buf.
 *
 * This is _get_random_bytes() preceded by warn_unseeded_randomness().
 */
void get_random_bytes(void *buf, size_t len)
{
        warn_unseeded_randomness();
        _get_random_bytes(buf, len);
}
EXPORT_SYMBOL(get_random_bytes);

static ssize_t get_random_bytes_user(struct iov_iter *iter)
{
        u32 chacha_state[CHACHA_STATE_WORDS];
        u8 block[CHACHA_BLOCK_SIZE];
        size_t ret = 0, copied;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /*
         * Immediately overwrite the ChaCha key at index 4 with random
         * bytes, in case userspace causes copy_to_iter() below to sleep
         * forever, so that we still retain forward secrecy in that case.
         */
        crng_make_state(chacha_state, (u8 *)&chacha_state[4], CHACHA_KEY_SIZE);
        /*
         * However, if we're doing a read of len <= 32, we don't need to
         * use chacha_state after, so we can simply return those bytes to
         * the user directly.
         */
        if (iov_iter_count(iter) <= CHACHA_KEY_SIZE) {
                ret = copy_to_iter(&chacha_state[4], CHACHA_KEY_SIZE, iter);
                goto out_zero_chacha;
        }

        for (;;) {
                chacha20_block(chacha_state, block);
                if (unlikely(chacha_state[12] == 0))
                        ++chacha_state[13];

                copied = copy_to_iter(block, sizeof(block), iter);
                ret += copied;
                if (!iov_iter_count(iter) || copied != sizeof(block))
                        break;

                BUILD_BUG_ON(PAGE_SIZE % sizeof(block) != 0);
                if (ret % PAGE_SIZE == 0) {
                        if (signal_pending(current))
                                break;
                        cond_resched();
                }
        }

        memzero_explicit(block, sizeof(block));
out_zero_chacha:
        memzero_explicit(chacha_state, sizeof(chacha_state));
        return ret ? ret : -EFAULT;
}

/*
 * Batched entropy returns random integers. The quality of the random
 * number is as good as /dev/urandom. In order to ensure that the randomness
 * provided by this function is okay, the function wait_for_random_bytes()
 * should be called and return 0 at least once at any point prior.
 *
 * DEFINE_BATCHED_ENTROPY(type) generates, for the given integer type:
 *
 *   - struct batch_<type>: an array of ready-made values plus a local lock,
 *     the base_crng generation the array was filled under, and the index of
 *     the next unused value;
 *   - batched_entropy_<type>: one such batch per CPU, whose position starts
 *     at UINT_MAX so that the first use refills it;
 *   - get_random_<type>(): the exported accessor.
 *
 * get_random_<type>() bypasses the batch and calls _get_random_bytes()
 * directly while the crng is not ready. Otherwise it refills the batch when
 * it is used up or when base_crng.generation has changed since the last
 * refill, and then hands out the next value, zeroing its slot in the batch.
 */

#define DEFINE_BATCHED_ENTROPY(type)                                                \
struct batch_ ##type {                                                                \
        /*                                                                        \
         * We make this 1.5x a ChaCha block, so that we get the                        \
         * remaining 32 bytes from fast key erasure, plus one full                \
         * block from the detached ChaCha state. We can increase                \
         * the size of this later if needed so long as we keep the                \
         * formula of (integer_blocks + 0.5) * CHACHA_BLOCK_SIZE.                \
         */                                                                        \
        type entropy[CHACHA_BLOCK_SIZE * 3 / (2 * sizeof(type))];                \
        local_lock_t lock;                                                        \
        unsigned long generation;                                                \
        unsigned int position;                                                        \
};                                                                                \
                                                                                \
static DEFINE_PER_CPU(struct batch_ ##type, batched_entropy_ ##type) = {        \
        .lock = INIT_LOCAL_LOCK(batched_entropy_ ##type.lock),                        \
        .position = UINT_MAX                                                        \
};                                                                                \
                                                                                \
type get_random_ ##type(void)                                                        \
{                                                                                \
        type ret;                                                                \
        unsigned long flags;                                                        \
        struct batch_ ##type *batch;                                                \
        unsigned long next_gen;                                                        \
                                                                                \
        warn_unseeded_randomness();                                                \
                                                                                \
        if  (!crng_ready()) {                                                        \
                _get_random_bytes(&ret, sizeof(ret));                                \
                return ret;                                                        \
        }                                                                        \
                                                                                \
        local_lock_irqsave(&batched_entropy_ ##type.lock, flags);                \
        batch = raw_cpu_ptr(&batched_entropy_##type);                                \
                                                                                \
        next_gen = READ_ONCE(base_crng.generation);                                \
        if (batch->position >= ARRAY_SIZE(batch->entropy) ||                        \
            next_gen != batch->generation) {                                        \
                _get_random_bytes(batch->entropy, sizeof(batch->entropy));        \
                batch->position = 0;                                                \
                batch->generation = next_gen;                                        \
        }                                                                        \
                                                                                \
        ret = batch->entropy[batch->position];                                        \
        batch->entropy[batch->position] = 0;                                        \
        ++batch->position;                                                        \
        local_unlock_irqrestore(&batched_entropy_ ##type.lock, flags);                \
        return ret;                                                                \
}                                                                                \
EXPORT_SYMBOL(get_random_ ##type);

/* Instantiate the batches and the get_random_{u8,u16,u32,u64}() accessors. */
DEFINE_BATCHED_ENTROPY(u8)
DEFINE_BATCHED_ENTROPY(u16)
DEFINE_BATCHED_ENTROPY(u32)
DEFINE_BATCHED_ENTROPY(u64)

/*
 * Return a random u32 below @ceil, for a @ceil that is not a compile-time
 * constant.
 *
 * This is the slow path for variable ceil. It is still fast, most of the
 * time: it does traditional reciprocal multiplication and opportunistically
 * compares the lower half of the product to ceil itself. Only when that
 * quick test fails does it compute the exact bound and reject samples whose
 * lower half would indicate a range indivisible by ceil. `-ceil % ceil` is
 * analogous to `2^32 % ceil`, but is computable in 32 bits.
 *
 * The function is technically undefined for ceil == 0, and the
 * non-underscored constant version in the header build-bugs on that. For
 * the non-constant case it is convenient to have ceil == 0 evaluate to a
 * straight call to get_random_u32(), so that get_random_u32_inclusive() can
 * work over its whole range without undefined behavior.
 */
u32 __get_random_u32_below(u32 ceil)
{
        u32 sample = get_random_u32();
        u32 reject_below;
        u64 product;

        if (unlikely(!ceil))
                return sample;

        product = (u64)ceil * sample;
        if (likely((u32)product >= ceil))
                return product >> 32;

        /* Rare case: work out the exact bound and redraw until it is met. */
        reject_below = -ceil % ceil;
        while (unlikely((u32)product < reject_below))
                product = (u64)ceil * get_random_u32();

        return product >> 32;
}
EXPORT_SYMBOL(__get_random_u32_below);

#ifdef CONFIG_SMP
/*
 * CPU hotplug callback, called when the CPU is coming up, with entry
 * CPUHP_RANDOM_PREPARE, which comes before CPUHP_WORKQUEUE_PREP.
 *
 * It invalidates the per-cpu crng and every batch belonging to @cpu, so
 * that the CPU serves fresh randomness when it comes back online: the crng
 * generation is reset to ULONG_MAX and each batch position to UINT_MAX,
 * which are the same values these objects are statically initialised with.
 *
 * Returns: always 0.
 */
int __cold random_prepare_cpu(unsigned int cpu)
{
        struct crng *cpu_crng = per_cpu_ptr(&crngs, cpu);
        struct batch_u8 *batch8 = per_cpu_ptr(&batched_entropy_u8, cpu);
        struct batch_u16 *batch16 = per_cpu_ptr(&batched_entropy_u16, cpu);
        struct batch_u32 *batch32 = per_cpu_ptr(&batched_entropy_u32, cpu);
        struct batch_u64 *batch64 = per_cpu_ptr(&batched_entropy_u64, cpu);

        cpu_crng->generation = ULONG_MAX;
        batch8->position = UINT_MAX;
        batch16->position = UINT_MAX;
        batch32->position = UINT_MAX;
        batch64->position = UINT_MAX;

        return 0;
}
#endif


/**********************************************************************
 *
 * Entropy accumulation and extraction routines.
 *
 * Callers may add entropy via:
 *
 *     static void mix_pool_bytes(const void *buf, size_t len)
 *
 * After which, if added entropy should be credited:
 *
 *     static void credit_init_bits(size_t bits)
 *
 * Finally, extract entropy via:
 *
 *     static void extract_entropy(void *buf, size_t len)
 *
 **********************************************************************/

/*
 * Thresholds, in bits, that input_pool.init_bits is compared against.
 * POOL_BITS is the BLAKE2s digest size expressed in bits and is also the
 * cap that _credit_init_bits() applies to the counter.
 */
enum {
        POOL_BITS = BLAKE2S_HASH_SIZE * 8,
        POOL_READY_BITS = POOL_BITS, /* When crng_init->CRNG_READY */
        POOL_EARLY_BITS = POOL_READY_BITS / 2 /* When crng_init->CRNG_EARLY */
};

/*
 * The input pool: a running BLAKE2s state that all collected input is
 * hashed into, the spinlock that mix_pool_bytes() and extract_entropy()
 * hold while touching that state, and the number of bits credited so far
 * (maintained by _credit_init_bits()).
 */
static struct {
        struct blake2s_state hash;
        spinlock_t lock;
        unsigned int init_bits;
} input_pool = {
        /*
         * NOTE(review): presumably the initial BLAKE2s chaining value for an
         * unkeyed digest of BLAKE2S_HASH_SIZE bytes -- confirm against the
         * BLAKE2s headers.
         */
        .hash.h = { BLAKE2S_IV0 ^ (0x01010000 | BLAKE2S_HASH_SIZE),
                    BLAKE2S_IV1, BLAKE2S_IV2, BLAKE2S_IV3, BLAKE2S_IV4,
                    BLAKE2S_IV5, BLAKE2S_IV6, BLAKE2S_IV7 },
        .hash.outlen = BLAKE2S_HASH_SIZE,
        .lock = __SPIN_LOCK_UNLOCKED(input_pool.lock),
};

/*
 * Hash @len bytes starting at @buf into the input pool. This helper does
 * no locking of its own; mix_pool_bytes() is the variant that takes
 * input_pool.lock around the update.
 */
static void _mix_pool_bytes(const void *buf, size_t len)
{
        blake2s_update(&input_pool.hash, buf, len);
}

/*
 * Mix @len bytes from @buf into the input pool while holding the pool
 * lock with interrupts disabled. The initialization bit counter is left
 * alone; a caller that wants the input credited must call
 * credit_init_bits itself.
 */
static void mix_pool_bytes(const void *buf, size_t len)
{
        unsigned long irq_state;

        spin_lock_irqsave(&input_pool.lock, irq_state);
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irq_state);
}

/*
 * This is an HKDF-like construction for using the hashed collected entropy
 * as a PRF key, that's then expanded block-by-block.
 *
 * @buf receives @len output bytes. While holding input_pool.lock the pool
 * hash is finalized into a seed and the pool is re-keyed from that seed;
 * the requested output is then generated after the lock has been dropped.
 * The seed, the next key and the rdseed/counter block are all wiped before
 * returning.
 */
static void extract_entropy(void *buf, size_t len)
{
        unsigned long flags;
        u8 seed[BLAKE2S_HASH_SIZE], next_key[BLAKE2S_HASH_SIZE];
        struct {
                unsigned long rdseed[32 / sizeof(long)];
                size_t counter;
        } block;
        size_t i, longs;

        /*
         * Fill block.rdseed. Arch seed words are tried first, then arch
         * random words; if neither source returns anything for the
         * current slot, a single random_get_entropy() sample is stored
         * there and the loop moves on to the next slot.
         */
        for (i = 0; i < ARRAY_SIZE(block.rdseed);) {
                longs = arch_get_random_seed_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                longs = arch_get_random_longs(&block.rdseed[i], ARRAY_SIZE(block.rdseed) - i);
                if (longs) {
                        i += longs;
                        continue;
                }
                block.rdseed[i++] = random_get_entropy();
        }

        spin_lock_irqsave(&input_pool.lock, flags);

        /* seed = HASHPRF(last_key, entropy_input) */
        blake2s_final(&input_pool.hash, seed);

        /* next_key = HASHPRF(seed, RDSEED || 0) */
        block.counter = 0;
        blake2s(next_key, (u8 *)&block, seed, sizeof(next_key), sizeof(block), sizeof(seed));
        /* Restart the pool hash, keyed with next_key, before releasing the lock. */
        blake2s_init_key(&input_pool.hash, BLAKE2S_HASH_SIZE, next_key, sizeof(next_key));

        spin_unlock_irqrestore(&input_pool.lock, flags);
        memzero_explicit(next_key, sizeof(next_key));

        /* Emit the output in chunks of at most BLAKE2S_HASH_SIZE bytes. */
        while (len) {
                i = min_t(size_t, len, BLAKE2S_HASH_SIZE);
                /* output = HASHPRF(seed, RDSEED || ++counter) */
                ++block.counter;
                blake2s(buf, (u8 *)&block, seed, i, sizeof(block), sizeof(seed));
                len -= i;
                buf += i;
        }

        memzero_explicit(seed, sizeof(seed));
        memzero_explicit(&block, sizeof(block));
}

/*
 * Credit @bits to the input pool, but only while the crng is not yet
 * ready; once it is ready this expands to a cheap no-op check.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like
 * a single statement: without the wrapper, an `else` written after
 * `if (cond) credit_init_bits(n);` would silently attach to the macro's
 * own hidden `if` instead of the caller's.
 */
#define credit_init_bits(bits)                          \
        do {                                            \
                if (!crng_ready())                      \
                        _credit_init_bits(bits);        \
        } while (0)

/*
 * Add @bits (capped at POOL_BITS) to input_pool.init_bits. If this call
 * moves the counter across POOL_READY_BITS, the crng is reseeded and all
 * waiters and notifiers are informed; if it only moves the counter across
 * POOL_EARLY_BITS, the base crng key is seeded early. A @bits of zero is
 * a no-op.
 */
static void __cold _credit_init_bits(size_t bits)
{
        static DECLARE_WORK(set_ready, crng_set_ready);
        unsigned int new, orig, add;
        unsigned long flags;

        if (!bits)
                return;

        add = min_t(size_t, bits, POOL_BITS);

        /*
         * Lock-free saturating add: compute the capped sum from the value
         * last observed in orig and retry until the exchange succeeds.
         */
        orig = READ_ONCE(input_pool.init_bits);
        do {
                new = min_t(unsigned int, POOL_BITS, orig + add);
        } while (!try_cmpxchg(&input_pool.init_bits, &orig, new));

        if (orig < POOL_READY_BITS && new >= POOL_READY_BITS) {
                crng_reseed(NULL); /* Sets crng_init to CRNG_READY under base_crng.lock. */
                /* Only queue the work item if the facilities it needs are up. */
                if (static_key_initialized && system_unbound_wq)
                        queue_work(system_unbound_wq, &set_ready);
                atomic_notifier_call_chain(&random_ready_notifier, 0, NULL);
#ifdef CONFIG_VDSO_GETRANDOM
                WRITE_ONCE(vdso_k_rng_data->is_ready, true);
#endif
                wake_up_interruptible(&crng_init_wait);
                kill_fasync(&fasync, SIGIO, POLL_IN);
                pr_notice("crng init done\n");
                if (urandom_warning.missed)
                        pr_notice("%d urandom warning(s) missed due to ratelimiting\n",
                                  urandom_warning.missed);
        } else if (orig < POOL_EARLY_BITS && new >= POOL_EARLY_BITS) {
                spin_lock_irqsave(&base_crng.lock, flags);
                /* Check if crng_init is CRNG_EMPTY, to avoid race with crng_reseed(). */
                if (crng_init == CRNG_EMPTY) {
                        /* extract_entropy() takes input_pool.lock inside base_crng.lock here. */
                        extract_entropy(base_crng.key, sizeof(base_crng.key));
                        crng_init = CRNG_EARLY;
                }
                spin_unlock_irqrestore(&base_crng.lock, flags);
        }
}


/**********************************************************************
 *
 * Entropy collection routines.
 *
 * The following exported functions are used for pushing entropy into
 * the above entropy accumulation routines:
 *
 *        void add_device_randomness(const void *buf, size_t len);
 *        void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after);
 *        void add_bootloader_randomness(const void *buf, size_t len);
 *        void add_vmfork_randomness(const void *unique_vm_id, size_t len);
 *        void add_interrupt_randomness(int irq);
 *        void add_input_randomness(unsigned int type, unsigned int code, unsigned int value);
 *        void add_disk_randomness(struct gendisk *disk);
 *
 * add_device_randomness() adds data to the input pool that
 * is likely to differ between two devices (or possibly even per boot).
 * This would be things like MAC addresses or serial numbers, or the
 * read-out of the RTC. This does *not* credit any actual entropy to
 * the pool, but it initializes the pool to different values for devices
 * that might otherwise be identical and have very little entropy
 * available to them (particularly common in the embedded world).
 *
 * add_hwgenerator_randomness() is for true hardware RNGs, and will credit
 * entropy as specified by the caller. If the entropy pool is full it will
 * block until more entropy is needed.
 *
 * add_bootloader_randomness() is called by bootloader drivers, such as EFI
 * and device tree, and credits its input only if the command line option
 * 'random.trust_bootloader' (enabled by default) allows it.
 *
 * add_vmfork_randomness() adds a unique (but not necessarily secret) ID
 * representing the current instance of a VM to the pool, without crediting,
 * and then force-reseeds the crng so that it takes effect immediately.
 *
 * add_interrupt_randomness() uses the interrupt timing as random
 * inputs to the entropy pool. Using the cycle counters and the irq source
 * as inputs, it feeds the input pool roughly once a second or after 1024
 * counted events, whichever comes first, crediting 1 bit of entropy per
 * 64 counted events, with a minimum of 1 bit.
 *
 * add_input_randomness() uses the input layer interrupt timing, as well
 * as the event type information from the hardware.
 *
 * add_disk_randomness() uses what amounts to the seek time of block
 * layer request events, on a per-disk_devt basis, as input to the
 * entropy pool. Note that high-speed solid state drives with very low
 * seek times do not make for good sources of entropy, as their seek
 * times are usually fairly consistent.
 *
 * The last two routines try to estimate how many bits of entropy
 * to credit. They do this by keeping track of the first, second and
 * third order deltas of the event timings.
 *
 **********************************************************************/

/*
 * "random.trust_cpu": whether random_init_early() credits what it
 * collected from the arch random sources. Defaults to true.
 */
static bool trust_cpu __initdata = true;
static int __init parse_trust_cpu(char *arg)
{
        return kstrtobool(arg, &trust_cpu);
}
early_param("random.trust_cpu", parse_trust_cpu);

/*
 * "random.trust_bootloader": whether add_bootloader_randomness() credits
 * the seed handed over by the bootloader. Defaults to true.
 */
static bool trust_bootloader __initdata = true;
static int __init parse_trust_bootloader(char *arg)
{
        return kstrtobool(arg, &trust_bootloader);
}
early_param("random.trust_bootloader", parse_trust_bootloader);

/*
 * Power-management notifier callback. Every notification mixes the
 * action code, three clock readings and a cycle-counter sample into the
 * input pool. On PM_RESTORE_PREPARE, or on PM_POST_SUSPEND when neither
 * autosleep option is enabled, an already-ready crng is also reseeded.
 * Always returns 0.
 */
static int random_pm_notification(struct notifier_block *nb, unsigned long action, void *data)
{
        unsigned long cycles = random_get_entropy();
        /*
         * Encode a representation of how long the system has been suspended,
         * in a way that is distinct from prior system suspends.
         */
        ktime_t stamps[] = { ktime_get(), ktime_get_boottime(), ktime_get_real() };
        unsigned long irq_state;

        spin_lock_irqsave(&input_pool.lock, irq_state);
        _mix_pool_bytes(&action, sizeof(action));
        _mix_pool_bytes(stamps, sizeof(stamps));
        _mix_pool_bytes(&cycles, sizeof(cycles));
        spin_unlock_irqrestore(&input_pool.lock, irq_state);

        if (!crng_ready())
                return 0;

        if (action == PM_RESTORE_PREPARE ||
            (action == PM_POST_SUSPEND && !IS_ENABLED(CONFIG_PM_AUTOSLEEP) &&
             !IS_ENABLED(CONFIG_PM_USERSPACE_AUTOSLEEP))) {
                crng_reseed(NULL);
                pr_notice("crng reseeded on system resumption\n");
        }
        return 0;
}

/* Registered from random_init(). */
static struct notifier_block pm_notifier = { .notifier_call = random_pm_notification };

/*
 * First-stage RNG initialization. It runs extremely early: arch
 * randomness is usable, time keeping is not, and interrupts are still
 * disabled. @command_line is mixed into the pool along with the utsname.
 */
void __init random_init_early(const char *command_line)
{
        unsigned long entropy[BLAKE2S_BLOCK_SIZE / sizeof(long)];
        size_t i, longs, arch_bits;

#if defined(LATENT_ENTROPY_PLUGIN)
        static const u8 compiletime_seed[BLAKE2S_BLOCK_SIZE] __initconst __latent_entropy;
        _mix_pool_bytes(compiletime_seed, sizeof(compiletime_seed));
#endif

        /*
         * Try to gather one BLAKE2s block worth of arch randomness,
         * preferring seed words over random words. Every slot that
         * neither source could fill is subtracted from the number of
         * bits that may be credited afterwards.
         */
        arch_bits = sizeof(entropy) * 8;
        i = 0;
        while (i < ARRAY_SIZE(entropy)) {
                longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i);
                if (!longs)
                        longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i);
                if (!longs) {
                        arch_bits -= sizeof(*entropy) * 8;
                        ++i;
                        continue;
                }
                _mix_pool_bytes(entropy, sizeof(*entropy) * longs);
                i += longs;
        }

        _mix_pool_bytes(init_utsname(), sizeof(*(init_utsname())));
        _mix_pool_bytes(command_line, strlen(command_line));

        /* Reseed if already seeded by earlier phases. */
        if (crng_ready())
                crng_reseed(NULL);
        else if (trust_cpu)
                _credit_init_bits(arch_bits);
}

/*
 * Second-stage RNG initialization, called a little after
 * random_init_early(). Timestamp counters are available by now, but
 * interrupts are still not enabled.
 */
void __init random_init(void)
{
        unsigned long cycles = random_get_entropy();
        ktime_t wall_clock = ktime_get_real();

        _mix_pool_bytes(&wall_clock, sizeof(wall_clock));
        _mix_pool_bytes(&cycles, sizeof(cycles));
        add_latent_entropy();

        /*
         * The cpu or the bootloader may have initialized us before jump
         * labels or workqueues existed. Both are guaranteed to be set up
         * by now, so this is the place to flip the static branch.
         */
        if (!static_branch_likely(&crng_is_ready) && crng_init >= CRNG_READY)
                crng_set_ready(NULL);

        /* Reseed if already seeded by earlier phases. */
        if (crng_ready())
                crng_reseed(NULL);

        WARN_ON(register_pm_notifier(&pm_notifier));

        /* A zero sample is taken to mean there is no usable counter at all. */
        WARN(!cycles, "Missing cycle counter and fallback timer; RNG "
                      "entropy collection will consequently suffer.");
}

/*
 * Mix device- or boot-specific data (@len bytes at @buf) into the input
 * pool, preceded by a cycle-counter sample taken on entry.
 *
 * Nothing is credited for this: the point is only to keep largely
 * identical devices from starting out with similar pool state.
 */
void add_device_randomness(const void *buf, size_t len)
{
        unsigned long cycles = random_get_entropy();
        unsigned long irq_state;

        spin_lock_irqsave(&input_pool.lock, irq_state);
        _mix_pool_bytes(&cycles, sizeof(cycles));
        _mix_pool_bytes(buf, len);
        spin_unlock_irqrestore(&input_pool.lock, irq_state);
}
EXPORT_SYMBOL(add_device_randomness);

/*
 * Entry point for in-kernel drivers of true hardware RNGs: mixes @len
 * bytes from @buf into the pool and credits @entropy bits for them.
 * Such devices may produce endless random bits, so when @sleep_after is
 * true the caller may be put to sleep for a while afterwards.
 */
void add_hwgenerator_randomness(const void *buf, size_t len, size_t entropy, bool sleep_after)
{
        mix_pool_bytes(buf, len);
        credit_init_bits(entropy);

        if (!sleep_after || kthread_should_stop())
                return;

        /*
         * Sleep for one reseed interval when the crng is already ready or
         * when this call credited nothing. While the crng is still being
         * initialized from credited input, return immediately.
         */
        if (crng_ready() || !entropy)
                schedule_timeout_interruptible(crng_reseed_interval());
}
EXPORT_SYMBOL_GPL(add_hwgenerator_randomness);

/*
 * Mix in a random seed handed over by the bootloader (@len bytes at
 * @buf). It is credited at 8 bits per byte only when the
 * 'random.trust_bootloader' command line option leaves trust enabled.
 */
void __init add_bootloader_randomness(const void *buf, size_t len)
{
        mix_pool_bytes(buf, len);
        if (!trust_bootloader)
                return;
        credit_init_bits(len * 8);
}

#if IS_ENABLED(CONFIG_VMGENID)
/* Notifier chain invoked at the end of add_vmfork_randomness(). */
static BLOCKING_NOTIFIER_HEAD(vmfork_chain);

/*
 * Handle a new unique VM ID, which is unique, not secret, so we
 * don't credit it, but we do immediately force a reseed after so
 * that it's used by the crng posthaste.
 *
 * @unique_vm_id/@len are mixed in via add_device_randomness(); the
 * vmfork notifier chain is called afterwards regardless of whether a
 * reseed happened.
 */
void __cold add_vmfork_randomness(const void *unique_vm_id, size_t len)
{
        add_device_randomness(unique_vm_id, len);
        /* A reseed is only forced once the crng has been initialized. */
        if (crng_ready()) {
                crng_reseed(NULL);
                pr_notice("crng reseeded due to virtual machine fork\n");
        }
        blocking_notifier_call_chain(&vmfork_chain, 0, NULL);
}
#if IS_MODULE(CONFIG_VMGENID)
EXPORT_SYMBOL_GPL(add_vmfork_randomness);
#endif

/*
 * Add @nb to the chain that add_vmfork_randomness() calls. Returns
 * whatever blocking_notifier_chain_register() returns.
 */
int __cold register_random_vmfork_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmfork_chain, nb);
}
EXPORT_SYMBOL_GPL(register_random_vmfork_notifier);

/*
 * Remove @nb from the chain that add_vmfork_randomness() calls. Returns
 * whatever blocking_notifier_chain_unregister() returns.
 */
int __cold unregister_random_vmfork_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmfork_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_random_vmfork_notifier);
#endif

/* Per-cpu accumulator that interrupt-time samples are mixed into. */
struct fast_pool {
        /* Four-word state updated by fast_mix(). */
        unsigned long pool[4];
        /* jiffies value recorded when the pool was last flushed. */
        unsigned long last;
        /*
         * Number of events accumulated since the last flush. Bit 31 is
         * used by add_interrupt_randomness() as its MIX_INFLIGHT flag.
         */
        unsigned int count;
        /* Timer whose callback is mix_interrupt_randomness(). */
        struct timer_list mix;
};

/* Timer callback that flushes a cpu's fast pool into the input pool. */
static void mix_interrupt_randomness(struct timer_list *work);

/*
 * The per-cpu fast pool. Its state starts out as the SipHash constants on
 * 64-bit builds and as the HalfSipHash constants otherwise; FASTMIX_PERM
 * is defined alongside to select the matching permutation for fast_mix().
 */
static DEFINE_PER_CPU(struct fast_pool, irq_randomness) = {
#ifdef CONFIG_64BIT
#define FASTMIX_PERM SIPHASH_PERMUTATION
        .pool = { SIPHASH_CONST_0, SIPHASH_CONST_1, SIPHASH_CONST_2, SIPHASH_CONST_3 },
#else
#define FASTMIX_PERM HSIPHASH_PERMUTATION
        .pool = { HSIPHASH_CONST_0, HSIPHASH_CONST_1, HSIPHASH_CONST_2, HSIPHASH_CONST_3 },
#endif
        .mix = __TIMER_INITIALIZER(mix_interrupt_randomness, 0)
};

/*
 * [Half]SipHash-1-x over an empty key. With the key fixed, the inputs
 * are assumed to be non-malicious, so this offers no security on its
 * own. @s is the four-word SipHash state; @v1 and @v2 form the two-word
 * input, absorbed one word per round in that order.
 */
static void fast_mix(unsigned long s[4], unsigned long v1, unsigned long v2)
{
        const unsigned long words[2] = { v1, v2 };
        unsigned int round;

        for (round = 0; round < 2; ++round) {
                s[3] ^= words[round];
                FASTMIX_PERM(s[0], s[1], s[2], s[3]);
                s[0] ^= words[round];
        }
}

#ifdef CONFIG_SMP
/*
 * This function is called when the CPU has just come online, with
 * entry CPUHP_AP_RANDOM_ONLINE, just after CPUHP_AP_WORKQUEUE_ONLINE.
 * It resets the cpu's fast pool event count and always returns 0.
 */
int __cold random_online_cpu(unsigned int cpu)
{
        /*
         * During CPU shutdown and before CPU onlining, add_interrupt_
         * randomness() may schedule mix_interrupt_randomness(), and
         * set the MIX_INFLIGHT flag. However, because the worker can
         * be scheduled on a different CPU during this period, that
         * flag will never be cleared. For that reason, we zero out
         * the flag here, which runs just after workqueues are onlined
         * for the CPU again. This also has the effect of setting the
         * irq randomness count to zero so that new accumulated irqs
         * are fresh.
         *
         * NOTE(review): in this file mix_interrupt_randomness() is armed
         * as a timer via add_timer_on() rather than queued as work, so
         * the "worker"/"workqueues" wording above looks dated -- confirm.
         */
        per_cpu_ptr(&irq_randomness, cpu)->count = 0;
        return 0;
}
#endif

/*
 * Timer callback for fast_pool.mix: flushes half of the owning cpu's
 * fast pool into the input pool and credits bits based on how many
 * events were accumulated. @work is the timer embedded in the fast_pool.
 */
static void mix_interrupt_randomness(struct timer_list *work)
{
        struct fast_pool *fast_pool = container_of(work, struct fast_pool, mix);
        /*
         * The size of the copied stack pool is explicitly 2 longs so that we
         * only ever ingest half of the siphash output each time, retaining
         * the other half as the next "key" that carries over. The entropy is
         * supposed to be sufficiently dispersed between bits so on average
         * we don't wind up "losing" some.
         */
        unsigned long pool[2];
        unsigned int count;

        /* Check to see if we're running on the wrong CPU due to hotplug. */
        local_irq_disable();
        if (fast_pool != this_cpu_ptr(&irq_randomness)) {
                local_irq_enable();
                return;
        }

        /*
         * Copy the pool to the stack so that the mixer always has a
         * consistent view, before we reenable irqs again.
         */
        memcpy(pool, fast_pool->pool, sizeof(pool));
        count = fast_pool->count;
        /* Zeroing the count also clears the MIX_INFLIGHT bit kept in it. */
        fast_pool->count = 0;
        fast_pool->last = jiffies;
        local_irq_enable();

        mix_pool_bytes(pool, sizeof(pool));
        /*
         * Credit one bit per 64 events counted in the low 16 bits of
         * count, but at least 1 bit and at most the number of bits that
         * were actually copied out.
         */
        credit_init_bits(clamp_t(unsigned int, (count & U16_MAX) / 64, 1, sizeof(pool) * 8));

        memzero_explicit(pool, sizeof(pool));
}

/*
 * Mix the timing of interrupt @irq into the current cpu's fast pool and,
 * once enough events have accumulated or enough time has passed, arm the
 * timer that flushes the fast pool into the input pool.
 */
void add_interrupt_randomness(int irq)
{
        /* Stored in the top bit of fast_pool->count while a flush is pending. */
        enum { MIX_INFLIGHT = 1U << 31 };
        unsigned long entropy = random_get_entropy();
        struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness);
        struct pt_regs *regs = get_irq_regs();
        unsigned int new_count;

        /*
         * The second input word combines the interrupted instruction
         * pointer (or our return address when no regs are available)
         * with the byte-swapped irq number.
         */
        fast_mix(fast_pool->pool, entropy,
                 (regs ? instruction_pointer(regs) : _RET_IP_) ^ swab(irq));
        new_count = ++fast_pool->count;

        /* A flush has already been requested; nothing more to do. */
        if (new_count & MIX_INFLIGHT)
                return;

        /* Flush after 1024 events or once HZ jiffies have passed since the last flush. */
        if (new_count < 1024 && !time_is_before_jiffies(fast_pool->last + HZ))
                return;

        fast_pool->count |= MIX_INFLIGHT;
        if (!timer_pending(&fast_pool->mix)) {
                fast_pool->mix.expires = jiffies;
                add_timer_on(&fast_pool->mix, raw_smp_processor_id());
        }
}
EXPORT_SYMBOL_GPL(add_interrupt_randomness);

/*
 * There is one of these per entropy source. add_timer_randomness() keeps
 * the jiffies value of the previous event and the previous first- and
 * second-order deltas here, from which it derives its entropy estimate.
 */
struct timer_rand_state {
        unsigned long last_time;
        long last_delta, last_delta2;
};

/*
 * Feed the timing of an event into the entropy "pool". @num is mixed in
 * as well and should somehow describe the kind of event that happened.
 * The per-source @state is used to estimate how many bits of entropy
 * this call contributed; that estimate is only computed while the crng
 * is not yet ready.
 */
static void add_timer_randomness(struct timer_rand_state *state, unsigned int num)
{
        unsigned long cycles = random_get_entropy(), now = jiffies, irq_state;
        long d1, d2, d3, smallest;
        unsigned int bits;

        if (!in_hardirq()) {
                spin_lock_irqsave(&input_pool.lock, irq_state);
                _mix_pool_bytes(&cycles, sizeof(cycles));
                _mix_pool_bytes(&num, sizeof(num));
                spin_unlock_irqrestore(&input_pool.lock, irq_state);
        } else {
                /*
                 * In a hard IRQ, add_interrupt_randomness() will be called
                 * sometime after, so mix into the fast pool instead.
                 */
                fast_mix(this_cpu_ptr(&irq_randomness)->pool, cycles, num);
        }

        if (crng_ready())
                return;

        /*
         * Estimate the number of bits of randomness we probably added
         * from the first, second and third-order deltas of the event
         * times, updating the saved state as we go.
         */
        d1 = now - READ_ONCE(state->last_time);
        WRITE_ONCE(state->last_time, now);

        d2 = d1 - READ_ONCE(state->last_delta);
        WRITE_ONCE(state->last_delta, d1);

        d3 = d2 - READ_ONCE(state->last_delta2);
        WRITE_ONCE(state->last_delta2, d2);

        d1 = d1 < 0 ? -d1 : d1;
        d2 = d2 < 0 ? -d2 : d2;
        d3 = d3 < 0 ? -d3 : d3;

        smallest = d1;
        if (d2 < smallest)
                smallest = d2;
        if (d3 < smallest)
                smallest = d3;

        /*
         * smallest is the minimum absolute delta. Round down by 1 bit
         * on general principles, and limit entropy estimate to 11 bits.
         */
        bits = min(fls(smallest >> 1), 11);

        if (!in_hardirq()) {
                _credit_init_bits(bits);
                return;
        }

        /*
         * In a hard IRQ, add_interrupt_randomness() runs after this and
         * credits 1 bit per every 64 interrupts. To keep its accounting
         * close to the estimate made here, add a full 64 counts per bit,
         * minus one for the count that function will add itself.
         */
        this_cpu_ptr(&irq_randomness)->count += max(1u, bits * 64) - 1;
}

/*
 * Feed an input-layer event into add_timer_randomness(). @type, @code
 * and @value are folded into the single word describing the event.
 */
void add_input_randomness(unsigned int type, unsigned int code, unsigned int value)
{
        static unsigned char last_value;
        static struct timer_rand_state input_timer_state = { INITIAL_JIFFIES };
        unsigned int num;

        /*
         * Ignore autorepeat and the like.
         * NOTE(review): last_value only keeps the low 8 bits of @value, so
         * the comparison is against a truncated copy -- confirm intended.
         */
        if (value == last_value)
                return;
        last_value = value;

        num = (type << 4) ^ code ^ (code >> 4) ^ value;
        add_timer_randomness(&input_timer_state, num);
}
EXPORT_SYMBOL_GPL(add_input_randomness);

#ifdef CONFIG_BLOCK
/*
 * Feed a block-layer event for @disk into add_timer_randomness(). Does
 * nothing when @disk is NULL or has no timer state attached.
 */
void add_disk_randomness(struct gendisk *disk)
{
        if (!disk)
                return;
        if (!disk->random)
                return;

        /* First major is 1, so we get >= 0x200 here. */
        add_timer_randomness(disk->random, 0x100 + disk_devt(disk));
}
EXPORT_SYMBOL_GPL(add_disk_randomness);

/*
 * Allocate the timer state that add_disk_randomness() uses for @disk and
 * attach it as disk->random.
 */
void __cold rand_initialize_disk(struct gendisk *disk)
{
        struct timer_rand_state *state;

        state = kzalloc(sizeof(*state), GFP_KERNEL);
        /* On allocation failure we just won't use that entropy source. */
        if (!state)
                return;

        state->last_time = INITIAL_JIFFIES;
        disk->random = state;
}
#endif

/* State shared between try_to_generate_entropy() and entropy_timer(). */
struct entropy_timer_state {
        /* Latest random_get_entropy() sample taken by the entropy loop. */
        unsigned long entropy;
        /* Timer whose callback is entropy_timer(). */
        struct timer_list timer;
        /* Number of times the timer callback has run. */
        atomic_t samples;
        /* One bit is credited every this many timer callbacks. */
        unsigned int samples_per_bit;
};

/*
 * Timer callback used by try_to_generate_entropy(). Every time it fires
 * we expect an unpredictable jump in the cycle counter. Even when the
 * timer runs on another CPU, the timer activity touches the stack of
 * the CPU that is generating entropy.
 *
 * The timer is deliberately not re-armed from here. Being scheduled
 * away is fine, since it only makes the load more complex, but the
 * timer must not keep ticking unless the entropy loop is running, so
 * re-arming always happens in that loop.
 */
static void __cold entropy_timer(struct timer_list *timer)
{
        struct entropy_timer_state *state = container_of(timer, struct entropy_timer_state, timer);
        unsigned long cycles = random_get_entropy();
        unsigned int seen;

        mix_pool_bytes(&cycles, sizeof(cycles));

        /* Credit a single bit once every samples_per_bit callbacks. */
        seen = atomic_inc_return(&state->samples);
        if (seen % state->samples_per_bit != 0)
                return;
        credit_init_bits(1);
}

/*
 * If we have an actual cycle counter, see if we can generate enough entropy
 * with timing noise.
 *
 * A trial run first measures how often consecutive cycle counter samples
 * differ, which determines how many timer callbacks are required per
 * credited bit; if that exceeds MAX_SAMPLES_PER_BIT the function gives
 * up. Otherwise it loops, arming an on-stack timer on other CPUs and
 * calling schedule(), until the crng is ready or a signal is pending.
 */
static void __cold try_to_generate_entropy(void)
{
        enum { NUM_TRIAL_SAMPLES = 8192, MAX_SAMPLES_PER_BIT = HZ / 15 };
        /* Over-allocate so the state can be aligned to SMP_CACHE_BYTES. */
        u8 stack_bytes[sizeof(struct entropy_timer_state) + SMP_CACHE_BYTES - 1];
        struct entropy_timer_state *stack = PTR_ALIGN((void *)stack_bytes, SMP_CACHE_BYTES);
        unsigned int i, num_different = 0;
        unsigned long last = random_get_entropy();
        int cpu = -1;

        /* Count how many back-to-back samples actually changed. */
        for (i = 0; i < NUM_TRIAL_SAMPLES - 1; ++i) {
                stack->entropy = random_get_entropy();
                if (stack->entropy != last)
                        ++num_different;
                last = stack->entropy;
        }
        stack->samples_per_bit = DIV_ROUND_UP(NUM_TRIAL_SAMPLES, num_different + 1);
        /* The counter changes too rarely to be useful here. */
        if (stack->samples_per_bit > MAX_SAMPLES_PER_BIT)
                return;

        atomic_set(&stack->samples, 0);
        timer_setup_on_stack(&stack->timer, entropy_timer, 0);
        while (!crng_ready() && !signal_pending(current)) {
                /*
                 * Check !timer_pending() and then ensure that any previous callback has finished
                 * executing by checking try_to_del_timer_sync(), before queueing the next one.
                 */
                if (!timer_pending(&stack->timer) && try_to_del_timer_sync(&stack->timer) >= 0) {
                        struct cpumask timer_cpus;
                        unsigned int num_cpus;

                        /*
                         * Preemption must be disabled here, both to read the current CPU number
                         * and to avoid scheduling a timer on a dead CPU.
                         */
                        preempt_disable();

                        /* Only schedule callbacks on timer CPUs that are online. */
                        cpumask_and(&timer_cpus, housekeeping_cpumask(HK_TYPE_TIMER), cpu_online_mask);
                        num_cpus = cpumask_weight(&timer_cpus);
                        /* In very bizarre case of misconfiguration, fallback to all online. */
                        if (unlikely(num_cpus == 0)) {
                                timer_cpus = *cpu_online_mask;
                                num_cpus = cpumask_weight(&timer_cpus);
                        }

                        /* Basic CPU round-robin, which avoids the current CPU. */
                        do {
                                cpu = cpumask_next(cpu, &timer_cpus);
                                /* Wrap around to the first candidate after the last one. */
                                if (cpu >= nr_cpu_ids)
                                        cpu = cpumask_first(&timer_cpus);
                        } while (cpu == smp_processor_id() && num_cpus > 1);

                        /* Expiring the timer at `jiffies` means it's the next tick. */
                        stack->timer.expires = jiffies;

                        add_timer_on(&stack->timer, cpu);

                        preempt_enable();
                }
                /* Mix in the latest sample, yield, then sample again. */
                mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));
                schedule();
                stack->entropy = random_get_entropy();
        }
        mix_pool_bytes(&stack->entropy, sizeof(stack->entropy));

        /* The timer lives on this stack frame, so it must be gone before we return. */
        timer_delete_sync(&stack->timer);
        destroy_timer_on_stack(&stack->timer);
}


/**********************************************************************
 *
 * Userspace reader/writer interfaces.
 *
 * getrandom(2) is the primary modern interface into the RNG and should
 * be used in preference to anything else.
 *
 * Reading from /dev/random has the same functionality as calling
 * getrandom(2) with flags=0. In earlier versions, however, it had
 * vastly different semantics and should therefore be avoided, to
 * prevent backwards compatibility issues.
 *
 * Reading from /dev/urandom has the same functionality as calling
 * getrandom(2) with flags=GRND_INSECURE. Because it does not block
 * waiting for the RNG to be ready, it should not be used.
 *
 * Writing to either /dev/random or /dev/urandom adds entropy to
 * the input pool but does not credit it.
 *
 * Polling on /dev/random indicates when the RNG is initialized, on
 * the read side, and when it wants new entropy, on the write side.
 *
 * Both /dev/random and /dev/urandom have the same set of ioctls for
 * adding entropy, getting the entropy count, zeroing the count, and
 * reseeding the crng.
 *
 **********************************************************************/

/*
 * getrandom(2): fill the user buffer @ubuf with up to @len random bytes.
 *
 * Returns -EINVAL for flag bits outside GRND_NONBLOCK, GRND_RANDOM and
 * GRND_INSECURE, or when GRND_INSECURE and GRND_RANDOM are combined.
 * Unless GRND_INSECURE is given, a not-yet-ready crng makes the call
 * either fail with -EAGAIN (GRND_NONBLOCK) or wait until it is ready.
 */
SYSCALL_DEFINE3(getrandom, char __user *, ubuf, size_t, len, unsigned int, flags)
{
        const unsigned int insecure_and_random = GRND_INSECURE | GRND_RANDOM;
        struct iov_iter iter;
        int ret;

        if (flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE))
                return -EINVAL;

        /*
         * Asking for insecure and blocking randomness in the same call
         * makes no sense.
         */
        if ((flags & insecure_and_random) == insecure_and_random)
                return -EINVAL;

        if (!crng_ready() && !(flags & GRND_INSECURE)) {
                if (flags & GRND_NONBLOCK)
                        return -EAGAIN;

                ret = wait_for_random_bytes();
                if (unlikely(ret))
                        return ret;
        }

        ret = import_ubuf(ITER_DEST, ubuf, len, &iter);
        if (unlikely(ret))
                return ret;

        return get_random_bytes_user(&iter);
}

/*
 * Poll handler: registers on crng_init_wait and reports the file as
 * readable once the crng is ready, and as writable until then.
 */
static __poll_t random_poll(struct file *file, poll_table *wait)
{
        poll_wait(file, &crng_init_wait, wait);

        if (crng_ready())
                return EPOLLIN | EPOLLRDNORM;
        return EPOLLOUT | EPOLLWRNORM;
}

/*
 * Mix everything that can be copied from @iter into the input pool, one
 * BLAKE2s block at a time, without crediting any of it.
 *
 * Returns the number of bytes mixed in, 0 if @iter was empty to begin
 * with, or -EFAULT if nothing at all could be copied. The loop stops
 * early on a short copy, and checks for pending signals (and offers to
 * reschedule) each time a whole page worth of data has been consumed.
 */
static ssize_t write_pool_user(struct iov_iter *iter)
{
        u8 chunk[BLAKE2S_BLOCK_SIZE];
        ssize_t total = 0;
        size_t got;

        if (unlikely(!iov_iter_count(iter)))
                return 0;

        while (1) {
                got = copy_from_iter(chunk, sizeof(chunk), iter);
                total += got;
                mix_pool_bytes(chunk, got);

                /* Stop once the iterator is drained or a copy came up short. */
                if (!iov_iter_count(iter))
                        break;
                if (got != sizeof(chunk))
                        break;

                BUILD_BUG_ON(PAGE_SIZE % sizeof(chunk) != 0);
                if (total % PAGE_SIZE != 0)
                        continue;

                if (signal_pending(current))
                        break;
                cond_resched();
        }

        memzero_explicit(chunk, sizeof(chunk));
        if (!total)
                return -EFAULT;
        return total;
}

/*
 * write_iter handler: mixes the written data into the input pool via
 * write_pool_user() and returns its result. @kiocb is not used.
 */
static ssize_t random_write_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        return write_pool_user(iter);
}

/*
 * read_iter handler that never waits for the crng to become ready. If the
 * crng is still not ready after an opportunistic attempt to initialize
 * it, the read goes ahead anyway and a warning is logged or counted as
 * missed. @kiocb is not used.
 */
static ssize_t urandom_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        /* Remaining number of warnings to print while ratelimiting is enabled. */
        static int maxwarn = 10;

        /*
         * Opportunistically attempt to initialize the RNG on platforms that
         * have fast cycle counters, but don't (for now) require it to succeed.
         */
        if (!crng_ready())
                try_to_generate_entropy();

        if (!crng_ready()) {
                /*
                 * With ratelimiting enabled and the warning budget used
                 * up, only count the warning as missed. Otherwise print
                 * it if ratelimiting is disabled or the ratelimit allows.
                 */
                if (!ratelimit_disable && maxwarn <= 0)
                        ++urandom_warning.missed;
                else if (ratelimit_disable || __ratelimit(&urandom_warning)) {
                        --maxwarn;
                        pr_notice("%s: uninitialized urandom read (%zu bytes read)\n",
                                  current->comm, iov_iter_count(iter));
                }
        }

        return get_random_bytes_user(iter);
}

/*
 * read_iter handler for random_fops. If the RNG is not ready yet, a
 * non-blocking request (IOCB_NOWAIT / IOCB_NOIO on the kiocb, or
 * O_NONBLOCK on the file) fails with -EAGAIN; any other request waits in
 * wait_for_random_bytes() and propagates its error, if any.
 */
static ssize_t random_read_iter(struct kiocb *kiocb, struct iov_iter *iter)
{
        int err;

        if (!crng_ready()) {
                if (kiocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        return -EAGAIN;
                if (kiocb->ki_filp->f_flags & O_NONBLOCK)
                        return -EAGAIN;
        }

        err = wait_for_random_bytes();
        if (err)
                return err;

        return get_random_bytes_user(iter);
}

static long random_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
        int __user *p = (int __user *)arg;
        int ent_count;

        switch (cmd) {
        case RNDGETENTCNT:
                /* Inherently racy, no point locking. */
                if (put_user(input_pool.init_bits, p))
                        return -EFAULT;
                return 0;
        case RNDADDTOENTCNT:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                credit_init_bits(ent_count);
                return 0;
        case RNDADDENTROPY: {
                struct iov_iter iter;
                ssize_t ret;
                int len;

                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (get_user(ent_count, p++))
                        return -EFAULT;
                if (ent_count < 0)
                        return -EINVAL;
                if (get_user(len, p++))
                        return -EFAULT;
                ret = import_ubuf(ITER_SOURCE, p, len, &iter);
                if (unlikely(ret))
                        return ret;
                ret = write_pool_user(&iter);
                if (unlikely(ret < 0))
                        return ret;
                /* Since we're crediting, enforce that it was all written into the pool. */
                if (unlikely(ret != len))
                        return -EFAULT;
                credit_init_bits(ent_count);
                return 0;
        }
        case RNDZAPENTCNT:
        case RNDCLEARPOOL:
                /* No longer has any effect. */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                return 0;
        case RNDRESEEDCRNG:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (!crng_ready())
                        return -ENODATA;
                crng_reseed(NULL);
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * fasync handler shared by random_fops and urandom_fops: forwards the
 * request to fasync_helper() using this file's 'fasync' list.
 */
static int random_fasync(int fd, struct file *filp, int on)
{
        int err = fasync_helper(fd, filp, on, &fasync);

        return err;
}

/*
 * File operations using random_read_iter(), i.e. reads wait for the RNG
 * to be ready (or fail with -EAGAIN for non-blocking callers). This is the
 * only one of the two tables that provides a .poll handler.
 * Presumably backs /dev/random -- confirm against the device registration.
 */
const struct file_operations random_fops = {
        .read_iter = random_read_iter,
        .write_iter = random_write_iter,
        .poll = random_poll,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = copy_splice_read,
        .splice_write = iter_file_splice_write,
};

/*
 * File operations using urandom_read_iter(), i.e. reads never wait for the
 * RNG to be ready. Shares the write, ioctl, fasync, llseek and splice
 * handlers with random_fops but has no .poll handler.
 * Presumably backs /dev/urandom -- confirm against the device registration.
 */
const struct file_operations urandom_fops = {
        .read_iter = urandom_read_iter,
        .write_iter = random_write_iter,
        .unlocked_ioctl = random_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .fasync = random_fasync,
        .llseek = noop_llseek,
        .splice_read = copy_splice_read,
        .splice_write = iter_file_splice_write,
};


/********************************************************************
 *
 * Sysctl interface.
 *
 * These are partly unused legacy knobs with dummy values to not break
 * userspace and partly still useful things. They are usually accessible
 * in /proc/sys/kernel/random/ and are as follows:
 *
 * - boot_id - a UUID representing the current boot.
 *
 * - uuid - a random UUID, different each time the file is read.
 *
 * - poolsize - the number of bits of entropy that the input pool can
 *   hold, tied to the POOL_BITS constant.
 *
 * - entropy_avail - the number of bits of entropy currently in the
 *   input pool. Always <= poolsize.
 *
 * - write_wakeup_threshold - the amount of entropy in the input pool
 *   below which write polls to /dev/random will unblock, requesting
 *   more entropy, tied to the POOL_READY_BITS constant. It is writable
 *   to avoid breaking old userspaces, but writing to it does not
 *   change any behavior of the RNG.
 *
 * - urandom_min_reseed_secs - fixed to the value CRNG_RESEED_INTERVAL.
 *   It is writable to avoid breaking old userspaces, but writing
 *   to it does not change any behavior of the RNG.
 *
 ********************************************************************/

#ifdef CONFIG_SYSCTL

#include <linux/sysctl.h>

/*
 * Backing storage for the sysctl entries in random_table below.
 * The first two are exposed through proc_do_rointvec(), so writes from
 * userspace are accepted but never change these values.
 */
/* Value shown by "urandom_min_reseed_secs": CRNG_RESEED_INTERVAL in seconds. */
static int sysctl_random_min_urandom_seed = CRNG_RESEED_INTERVAL / HZ;
/* Value shown by "write_wakeup_threshold". */
static int sysctl_random_write_wakeup_bits = POOL_READY_BITS;
/* Value shown by "poolsize". */
static int sysctl_poolsize = POOL_BITS;
/* Boot ID bytes; filled in lazily by proc_do_uuid() on first read. */
static u8 sysctl_bootid[UUID_SIZE];

/*
 * Sysctl handler serving both the "boot_id" and the "uuid" entries.
 *
 * When table->data is set it points at persistent UUID storage, which is
 * filled in once (under a spinlock) and then reused on every read. When
 * table->data is NULL a brand new UUID is generated for each read.
 * Writes are rejected with -EPERM.
 */
static int proc_do_uuid(const struct ctl_table *table, int write, void *buf,
                        size_t *lenp, loff_t *ppos)
{
        static DEFINE_SPINLOCK(bootid_spinlock);
        char text[UUID_STRING_LEN + 1];
        u8 fresh[UUID_SIZE];
        u8 *uuid;
        /* Temporary table so proc_dostring() can emit the formatted text. */
        struct ctl_table str_table = {
                .data = text,
                .maxlen = UUID_STRING_LEN
        };

        if (write)
                return -EPERM;

        uuid = table->data;
        if (uuid) {
                spin_lock(&bootid_spinlock);
                /*
                 * Byte 8 doubles as the "already generated" marker;
                 * presumably a generated UUID never has it zero -- confirm
                 * against generate_random_uuid().
                 */
                if (!uuid[8])
                        generate_random_uuid(uuid);
                spin_unlock(&bootid_spinlock);
        } else {
                uuid = fresh;
                generate_random_uuid(uuid);
        }

        snprintf(text, sizeof(text), "%pU", uuid);
        return proc_dostring(&str_table, 0, buf, lenp, ppos);
}

/*
 * Like proc_dointvec() for reads; writes are reported as successful (0)
 * without touching the underlying value.
 */
static int proc_do_rointvec(const struct ctl_table *table, int write, void *buf,
                            size_t *lenp, loff_t *ppos)
{
        if (write)
                return 0;

        return proc_dointvec(table, 0, buf, lenp, ppos);
}

/*
 * Sysctl entries registered under "kernel/random" by random_sysctls_init().
 * See the "Sysctl interface" comment earlier in this file for the meaning
 * of each knob.
 */
static const struct ctl_table random_table[] = {
        {
                /* Read-only view of sysctl_poolsize. */
                .procname        = "poolsize",
                .data                = &sysctl_poolsize,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                /* Read-only view of input_pool.init_bits. */
                .procname        = "entropy_avail",
                .data                = &input_pool.init_bits,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
        {
                /* Mode allows writes, but proc_do_rointvec() ignores them. */
                .procname        = "write_wakeup_threshold",
                .data                = &sysctl_random_write_wakeup_bits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                /* Mode allows writes, but proc_do_rointvec() ignores them. */
                .procname        = "urandom_min_reseed_secs",
                .data                = &sysctl_random_min_urandom_seed,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_do_rointvec,
        },
        {
                /* Non-NULL .data: proc_do_uuid() generates it once and reuses it. */
                .procname        = "boot_id",
                .data                = &sysctl_bootid,
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
        {
                /* No .data: proc_do_uuid() generates a new UUID on every read. */
                .procname        = "uuid",
                .mode                = 0444,
                .proc_handler        = proc_do_uuid,
        },
};

/*
 * Register random_table under "kernel/random".
 *
 * random_init() is called before sysctl_init(), so we cannot call
 * register_sysctl_init() in random_init(); the registration is done from
 * a device_initcall instead. Always returns 0.
 */
static int __init random_sysctls_init(void)
{
        register_sysctl_init("kernel/random", random_table);
        return 0;
}
device_initcall(random_sysctls_init);
#endif












































































































































































































































































   11 
























































































































































































































































































































































































































































































































































































































































































































































































































































  266 





























  106 














    1 
   97 



    9 







    9 






















   33 













    1 









   23 


   26 












    9 








































































































































































































  165 


























































    6 
















































































  204 

   66 

   67 

   66 

   67 



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/include/asm/kvm_host.h:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#ifndef __ARM64_KVM_HOST_H__
#define __ARM64_KVM_HOST_H__

#include <linux/arm-smccc.h>
#include <linux/bitmap.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <linux/kvm_types.h>
#include <linux/maple_tree.h>
#include <linux/percpu.h>
#include <linux/psci.h>
#include <asm/arch_gicv3.h>
#include <asm/barrier.h>
#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/daifflags.h>
#include <asm/fpsimd.h>
#include <asm/kvm.h>
#include <asm/kvm_asm.h>
#include <asm/vncr_mapping.h>

#define __KVM_HAVE_ARCH_INTC_INITIALIZED

#define KVM_HALT_POLL_NS_DEFAULT 500000

#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>
#include <kvm/arm_pmu.h>

/* The vCPU limit is tied to VGIC_V3_MAX_CPUS. */
#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS

/* Number of vCPU feature bits, and the mask covering bits 0..MAX_FEATURES-1. */
#define KVM_VCPU_MAX_FEATURES 9
#define KVM_VCPU_VALID_FEATURES        (BIT(KVM_VCPU_MAX_FEATURES) - 1)

/*
 * Architecture-specific vCPU requests, numbered 0..10 through
 * KVM_ARCH_REQ()/KVM_ARCH_REQ_FLAGS(). KVM_REQ_SLEEP is the only one that
 * carries extra request flags.
 */
#define KVM_REQ_SLEEP \
        KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_IRQ_PENDING                KVM_ARCH_REQ(1)
#define KVM_REQ_VCPU_RESET                KVM_ARCH_REQ(2)
#define KVM_REQ_RECORD_STEAL                KVM_ARCH_REQ(3)
#define KVM_REQ_RELOAD_GICv4                KVM_ARCH_REQ(4)
#define KVM_REQ_RELOAD_PMU                KVM_ARCH_REQ(5)
#define KVM_REQ_SUSPEND                        KVM_ARCH_REQ(6)
#define KVM_REQ_RESYNC_PMU_EL0                KVM_ARCH_REQ(7)
#define KVM_REQ_NESTED_S2_UNMAP                KVM_ARCH_REQ(8)
#define KVM_REQ_GUEST_HYP_IRQ_PENDING        KVM_ARCH_REQ(9)
#define KVM_REQ_MAP_L1_VNCR_EL2                KVM_ARCH_REQ(10)

/* Union of the two manual dirty-log capability flags. */
#define KVM_DIRTY_LOG_MANUAL_CAPS   (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
                                     KVM_DIRTY_LOG_INITIALLY_SET)

#define KVM_HAVE_MMU_RWLOCK

/*
 * Mode of operation configurable with kvm-arm.mode early param.
 * See Documentation/admin-guide/kernel-parameters.txt for more information.
 */
enum kvm_mode {
        KVM_MODE_DEFAULT,
        KVM_MODE_PROTECTED,
        KVM_MODE_NV,
        KVM_MODE_NONE,
};

/*
 * kvm_get_mode() - report the current KVM mode of operation.
 *
 * With CONFIG_KVM the function is provided elsewhere. Without it, the
 * inline stub below always reports KVM_MODE_NONE.
 */
#ifdef CONFIG_KVM
enum kvm_mode kvm_get_mode(void);
#else
static inline enum kvm_mode kvm_get_mode(void)
{
        return KVM_MODE_NONE;
}
#endif

/*
 * Declarations only; the definitions live elsewhere.
 * NOTE(review): judging by the names, the two variables hold maximum SVE
 * vector lengths (guest-visible and host) -- confirm at the definitions.
 */
extern unsigned int __ro_after_init kvm_sve_max_vl;
extern unsigned int __ro_after_init kvm_host_sve_max_vl;
int __init kvm_arm_init_sve(void);

/* vCPU-level entry points implemented outside this header. */
u32 __attribute_const__ kvm_target_cpu(void);
void kvm_reset_vcpu(struct kvm_vcpu *vcpu);
void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);

/*
 * A stack of pages linked through their own contents: @head holds the
 * physical address of the most recently pushed page, and the first
 * phys_addr_t stored in each page holds the previous @head (see
 * push_hyp_memcache() / pop_hyp_memcache()).
 */
struct kvm_hyp_memcache {
        phys_addr_t head;               /* physical address of the top page */
        unsigned long nr_pages;         /* number of pages currently cached */
        struct pkvm_mapping *mapping; /* only used from EL1 */

        /* Bit values for @flags. */
#define        HYP_MEMCACHE_ACCOUNT_STAGE2        BIT(1)
        unsigned long flags;
};

/*
 * Push the page at @p onto @mc.
 *
 * The previous head is saved in the first phys_addr_t of the page, the
 * page (translated with @to_pa) becomes the new head, and the page count
 * is bumped.
 */
static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
                                     phys_addr_t *p,
                                     phys_addr_t (*to_pa)(void *virt))
{
        phys_addr_t prev_head = mc->head;

        *p = prev_head;
        mc->head = to_pa(p);
        mc->nr_pages += 1;
}

/*
 * Pop the top page off @mc and return its virtual address (obtained with
 * @to_va from the page-aligned head), or NULL if @mc holds no pages.
 *
 * Note that @to_va is invoked before the emptiness check, so it is called
 * even when the cache is empty.
 */
static inline void *pop_hyp_memcache(struct kvm_hyp_memcache *mc,
                                     void *(*to_va)(phys_addr_t phys))
{
        phys_addr_t *page;

        page = to_va(mc->head & PAGE_MASK);
        if (mc->nr_pages == 0)
                return NULL;

        /* The page's first word holds the next entry of the stack. */
        mc->nr_pages--;
        mc->head = *page;

        return page;
}

/*
 * Refill @mc until it holds at least @min_pages pages.
 *
 * Pages come from @alloc_fn(@arg) and are pushed with @to_pa. Returns 0 on
 * success or -ENOMEM as soon as an allocation fails; pages pushed before
 * the failure stay in the cache.
 */
static inline int __topup_hyp_memcache(struct kvm_hyp_memcache *mc,
                                       unsigned long min_pages,
                                       void *(*alloc_fn)(void *arg),
                                       phys_addr_t (*to_pa)(void *virt),
                                       void *arg)
{
        phys_addr_t *page;

        while (mc->nr_pages < min_pages) {
                page = alloc_fn(arg);
                if (page == NULL)
                        return -ENOMEM;

                push_hyp_memcache(mc, page, to_pa);
        }

        return 0;
}

/*
 * Drain @mc: pop every cached page (translated with @to_va) and hand it
 * to @free_fn together with @arg.
 */
static inline void __free_hyp_memcache(struct kvm_hyp_memcache *mc,
                                       void (*free_fn)(void *virt, void *arg),
                                       void *(*to_va)(phys_addr_t phys),
                                       void *arg)
{
        while (mc->nr_pages != 0) {
                void *page = pop_hyp_memcache(mc, to_va);

                free_fn(page, arg);
        }
}

/*
 * Out-of-line memcache helpers, implemented elsewhere.
 * NOTE(review): presumably wrappers around __free_hyp_memcache() and
 * __topup_hyp_memcache() with fixed callbacks -- confirm at the definitions.
 */
void free_hyp_memcache(struct kvm_hyp_memcache *mc);
int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages);

/* Wrapper around the atomic 64-bit VMID value embedded in struct kvm_s2_mmu. */
struct kvm_vmid {
        atomic64_t id;
};

/*
 * Per-context stage-2 MMU state. One instance is embedded in struct
 * kvm_arch (the 'mmu' member); further instances are referenced through
 * kvm_arch.nested_mmus for nested (shadow) stage-2 contexts.
 */
struct kvm_s2_mmu {
        struct kvm_vmid vmid;

        /*
         * stage2 entry level table
         *
         * Two kvm_s2_mmu structures in the same VM can point to the same
         * pgd here.  This happens when running a guest using a
         * translation regime that isn't affected by its own stage-2
         * translation, such as a non-VHE hypervisor running at vEL2, or
         * for vEL1/EL0 with vHCR_EL2.VM == 0.  In that case, we use the
         * canonical stage-2 page tables.
         */
        phys_addr_t        pgd_phys;
        struct kvm_pgtable *pgt;

        /*
         * VTCR value used on the host. For a non-NV guest (or a NV
         * guest that runs in a context where its own S2 doesn't
         * apply), its T0SZ value reflects that of the IPA size.
         *
         * For a shadow S2 MMU, T0SZ reflects the PARange exposed to
         * the guest.
         */
        u64        vtcr;

        /* The last vcpu id that ran on each physical CPU */
        int __percpu *last_vcpu_ran;

#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0
        /*
         * Memory cache used to split
         * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It
         * is used to allocate stage2 page tables while splitting huge
         * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE
         * influences both the capacity of the split page cache, and
         * how often KVM reschedules. Be wary of raising CHUNK_SIZE
         * too high.
         *
         * Protected by kvm->slots_lock.
         */
        struct kvm_mmu_memory_cache split_page_cache;
        uint64_t split_page_chunk_size;

        /* Back-pointer to the owning kvm_arch (presumably; confirm at init). */
        struct kvm_arch *arch;

        /*
         * For a shadow stage-2 MMU, the virtual vttbr used by the
         * host to parse the guest S2.
         * This either contains:
         * - the virtual VTTBR programmed by the guest hypervisor with
         *   CnP cleared
         * - The value 1 (VMID=0, BADDR=0, CnP=1) if invalid
         *
         * We also cache the full VTCR which gets used for TLB invalidation,
         * taking the ARM ARM's "Any of the bits in VTCR_EL2 are permitted
         * to be cached in a TLB" to the letter.
         */
        u64        tlb_vttbr;
        u64        tlb_vtcr;

        /*
         * true when this represents a nested context where virtual
         * HCR_EL2.VM == 1
         */
        bool        nested_stage2_enabled;

        /*
         * true when this MMU needs to be unmapped before being used for a new
         * purpose.
         */
        bool        pending_unmap;

        /*
         *  0: Nobody is currently using this, check vttbr for validity
         * >0: Somebody is actively using this.
         */
        atomic_t refcnt;
};

/* Intentionally empty: no architecture-specific per-memslot data is kept here. */
struct kvm_arch_memory_slot {
};

/**
 * struct kvm_smccc_features: Descriptor of the hypercall services exposed to the guests
 *
 * @std_bmap: Bitmap of standard secure service calls
 * @std_hyp_bmap: Bitmap of standard hypervisor service calls
 * @vendor_hyp_bmap: Bitmap of vendor specific hypervisor service calls
 *                   (function numbers 0-63)
 * @vendor_hyp_bmap_2: Bitmap of vendor specific hypervisor service calls
 *                     (function numbers 64-127)
 */
struct kvm_smccc_features {
        unsigned long std_bmap;
        unsigned long std_hyp_bmap;
        unsigned long vendor_hyp_bmap; /* Function numbers 0-63 */
        unsigned long vendor_hyp_bmap_2; /* Function numbers 64-127 */
};

/* Opaque handle type used for kvm_protected_vm.handle. */
typedef unsigned int pkvm_handle_t;

/*
 * Protected-VM (pKVM) state embedded in struct kvm_arch as 'pkvm'.
 * NOTE(review): the two memcaches look like they collect pages at VM
 * teardown (general and stage-2 respectively) -- confirm with the users.
 */
struct kvm_protected_vm {
        pkvm_handle_t handle;
        struct kvm_hyp_memcache teardown_mc;
        struct kvm_hyp_memcache stage2_teardown_mc;
        bool enabled;
};

/*
 * MPIDR lookup data (see kvm_arch.mpidr_data, "MPIDR to vcpu index
 * mapping"). @mpidr_mask is the bit-selection mask consumed by
 * kvm_mpidr_index(); @cmpidr_to_idx is a flexible array of u16 entries,
 * presumably indexed by that function's result -- confirm with the users.
 */
struct kvm_mpidr_data {
        u64                        mpidr_mask;
        DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx);
};

/*
 * Compress @mpidr into a small index using @data->mpidr_mask.
 *
 * The affinity bits of @mpidr (those in MPIDR_HWID_BITMASK) that are
 * selected by the mask are gathered into the low bits of the result with
 * bitmap_gather().
 *
 * Returns the gathered value truncated to u16.
 */
static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
{
        unsigned long index = 0, mask = data->mpidr_mask;
        unsigned long aff = mpidr & MPIDR_HWID_BITMASK;

        /*
         * The gather width must cover the highest set bit of the whole
         * unsigned long mask. fls() takes an unsigned int and would
         * silently ignore mask bits above bit 31 (presumably reachable
         * through the upper affinity field -- confirm), so use fls_long().
         */
        bitmap_gather(&index, &aff, &mask, fls_long(mask));

        return index;
}

struct kvm_sysreg_masks;

/*
 * Identifiers for the fine-grained trap register groups; used to size and
 * index kvm_arch.fgu[]. Each read/write register pair shares one id (the
 * *WTR name is an alias of the matching *RTR name), and id 0 is reserved
 * as "no group".
 */
enum fgt_group_id {
        __NO_FGT_GROUP__,
        HFGRTR_GROUP,
        HFGWTR_GROUP = HFGRTR_GROUP,
        HDFGRTR_GROUP,
        HDFGWTR_GROUP = HDFGRTR_GROUP,
        HFGITR_GROUP,
        HAFGRTR_GROUP,
        HFGRTR2_GROUP,
        HFGWTR2_GROUP = HFGRTR2_GROUP,
        HDFGRTR2_GROUP,
        HDFGWTR2_GROUP = HDFGRTR2_GROUP,
        HFGITR2_GROUP,

        /* Must be last */
        __NR_FGT_GROUP_IDS__
};

/*
 * Architecture-specific, VM-wide state: the canonical stage-2 MMU, nested
 * stage-2 contexts, interrupt controller and timer data, VM configuration
 * flags, PMU and hypercall filters, emulated ID registers and pKVM state.
 */
struct kvm_arch {
        struct kvm_s2_mmu mmu;

        /*
         * Fine-Grained UNDEF, mimicking the FGT layout defined by the
         * architecture. We track them globally, as we present the
         * same feature-set to all vcpus.
         *
         * Index 0 is currently spare.
         */
        u64 fgu[__NR_FGT_GROUP_IDS__];

        /*
         * Stage 2 paging state for VMs with nested S2 using a virtual
         * VMID.
         */
        struct kvm_s2_mmu *nested_mmus;
        size_t nested_mmus_size;
        int nested_mmus_next;

        /* Interrupt controller */
        struct vgic_dist        vgic;

        /* Timers */
        struct arch_timer_vm_data timer_data;

        /* Mandated version of PSCI */
        u32 psci_version;

        /* Protects VM-scoped configuration data */
        struct mutex config_lock;

        /*
         * Bit numbers for 'flags' below.
         *
         * If we encounter a data abort without valid instruction syndrome
         * information, report this to user space.  User space can (and
         * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is
         * supported.
         */
#define KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER        0
        /* Memory Tagging Extension enabled for the guest */
#define KVM_ARCH_FLAG_MTE_ENABLED                        1
        /* At least one vCPU has run in the VM */
#define KVM_ARCH_FLAG_HAS_RAN_ONCE                        2
        /* The vCPU feature set for the VM is configured */
#define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED                3
        /* PSCI SYSTEM_SUSPEND enabled for the guest */
#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED                4
        /* VM counter offset */
#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET                        5
        /* Timer PPIs made immutable */
#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE                6
        /* Initial ID reg values loaded */
#define KVM_ARCH_FLAG_ID_REGS_INITIALIZED                7
        /* Fine-Grained UNDEF initialised */
#define KVM_ARCH_FLAG_FGU_INITIALIZED                        8
        /* SVE exposed to guest */
#define KVM_ARCH_FLAG_GUEST_HAS_SVE                        9
        /* MIDR_EL1, REVIDR_EL1, and AIDR_EL1 are writable from userspace */
#define KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS                10
        unsigned long flags;

        /* VM-wide vCPU feature set */
        DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);

        /* MPIDR to vcpu index mapping, optional */
        struct kvm_mpidr_data *mpidr_data;

        /*
         * VM-wide PMU filter, implemented as a bitmap and big enough for
         * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
         */
        unsigned long *pmu_filter;
        struct arm_pmu *arm_pmu;

        cpumask_var_t supported_cpus;

        /* Maximum number of counters for the guest */
        u8 nr_pmu_counters;

        /* Iterator for idreg debugfs */
        u8        idreg_debugfs_iter;

        /* Hypercall features firmware registers' descriptor */
        struct kvm_smccc_features smccc_feat;
        struct maple_tree smccc_filter;

        /*
         * Emulated CPU ID registers per VM
         * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it
         * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8.
         *
         * These emulated idregs are VM-wide, but accessed from the context of a vCPU.
         * Atomic access to multiple idregs are guarded by kvm_arch.config_lock.
         */
#define IDREG_IDX(id)                (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id))
#define KVM_ARM_ID_REG_NUM        (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1)
        u64 id_regs[KVM_ARM_ID_REG_NUM];

        u64 midr_el1;
        u64 revidr_el1;
        u64 aidr_el1;
        u64 ctr_el0;

        /* Masks for VNCR-backed and general EL2 sysregs */
        struct kvm_sysreg_masks        *sysreg_masks;

        /* Count the number of VNCR_EL2 currently mapped */
        atomic_t vncr_map_count;

        /*
         * For an untrusted host VM, 'pkvm.handle' is used to lookup
         * the associated pKVM instance in the hypervisor.
         */
        struct kvm_protected_vm pkvm;
};

/* Snapshot of the fault-related system registers for a vCPU. */
struct kvm_vcpu_fault_info {
        u64 esr_el2;                /* Hyp Syndrome Register */
        u64 far_el2;                /* Hyp Fault Address Register */
        u64 hpfar_el2;                /* Hyp IPA Fault Address Register */
        u64 disr_el1;                /* Deferred [SError] Status Register */
};

/*
 * VNCR() just places the VNCR_capable registers in the enum after
 * __VNCR_START__, and the value (after correction) to be an 8-byte offset
 * from the VNCR base. As we don't require the enum to be otherwise ordered,
 * we need the terrible hack below to ensure that we correctly size the
 * sys_regs array, no matter what.
 *
 * Concretely, VNCR(r) emits three enumerators: __before_r takes the next
 * sequential value, r is pinned to __VNCR_START__ + VNCR_r / 8, and
 * __after_r is the larger of (__before_r - 1) and r, so that whatever
 * follows continues from the highest value used so far.
 *
 * The __MAX__ macro has been lifted from Sean Eron Anderson's wonderful
 * treasure trove of bit hacks:
 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
 */
#define __MAX__(x,y)        ((x) ^ (((x) ^ (y)) & -((x) < (y))))
#define VNCR(r)                                                \
        __before_##r,                                        \
        r = __VNCR_START__ + ((VNCR_ ## r) / 8),        \
        __after_##r = __MAX__(__before_##r - 1, r)

/*
 * MARKER(m) names a position in the enum without consuming a value:
 * m takes the next sequential value and __after_m steps back by one, so
 * the enumerator that follows gets the same value as m.
 */
#define MARKER(m)                                \
        m, __after_##m = m - 1

/*
 * Index of each vcpu system register in the sys_regs[] array of
 * struct kvm_cpu_context (which is sized by NR_SYS_REGS).
 */
enum vcpu_sysreg {
        __INVALID_SYSREG__,   /* 0 is reserved as an invalid value */
        MPIDR_EL1,        /* MultiProcessor Affinity Register */
        CLIDR_EL1,        /* Cache Level ID Register */
        CSSELR_EL1,        /* Cache Size Selection Register */
        TPIDR_EL0,        /* Thread ID, User R/W */
        TPIDRRO_EL0,        /* Thread ID, User R/O */
        TPIDR_EL1,        /* Thread ID, Privileged */
        CNTKCTL_EL1,        /* Timer Control Register (EL1) */
        PAR_EL1,        /* Physical Address Register */
        MDCCINT_EL1,        /* Monitor Debug Comms Channel Interrupt Enable Reg */
        OSLSR_EL1,        /* OS Lock Status Register */
        DISR_EL1,        /* Deferred Interrupt Status Register */

        /* Performance Monitors Registers */
        PMCR_EL0,        /* Control Register */
        PMSELR_EL0,        /* Event Counter Selection Register */
        PMEVCNTR0_EL0,        /* Event Counter Register (0-30) */
        PMEVCNTR30_EL0 = PMEVCNTR0_EL0 + 30,
        PMCCNTR_EL0,        /* Cycle Counter Register */
        PMEVTYPER0_EL0,        /* Event Type Register (0-30) */
        PMEVTYPER30_EL0 = PMEVTYPER0_EL0 + 30,
        PMCCFILTR_EL0,        /* Cycle Count Filter Register */
        PMCNTENSET_EL0,        /* Count Enable Set Register */
        PMINTENSET_EL1,        /* Interrupt Enable Set Register */
        PMOVSSET_EL0,        /* Overflow Flag Status Set Register */
        PMUSERENR_EL0,        /* User Enable Register */

        /* Pointer Authentication Registers in a strict increasing order. */
        APIAKEYLO_EL1,
        APIAKEYHI_EL1,
        APIBKEYLO_EL1,
        APIBKEYHI_EL1,
        APDAKEYLO_EL1,
        APDAKEYHI_EL1,
        APDBKEYLO_EL1,
        APDBKEYHI_EL1,
        APGAKEYLO_EL1,
        APGAKEYHI_EL1,

        /* Memory Tagging Extension registers */
        RGSR_EL1,        /* Random Allocation Tag Seed Register */
        GCR_EL1,        /* Tag Control Register */
        TFSRE0_EL1,        /* Tag Fault Status Register (EL0) */

        POR_EL0,        /* Permission Overlay Register 0 (EL0) */

        /* FP/SIMD/SVE */
        SVCR,
        FPMR,

        /* 32bit specific registers. */
        DACR32_EL2,        /* Domain Access Control Register */
        IFSR32_EL2,        /* Instruction Fault Status Register */
        FPEXC32_EL2,        /* Floating-Point Exception Control Register */
        DBGVCR32_EL2,        /* Debug Vector Catch Register */

        /* EL2 registers */
        SCTLR_EL2,        /* System Control Register (EL2) */
        ACTLR_EL2,        /* Auxiliary Control Register (EL2) */
        CPTR_EL2,        /* Architectural Feature Trap Register (EL2) */
        HACR_EL2,        /* Hypervisor Auxiliary Control Register */
        ZCR_EL2,        /* SVE Control Register (EL2) */
        TTBR0_EL2,        /* Translation Table Base Register 0 (EL2) */
        TTBR1_EL2,        /* Translation Table Base Register 1 (EL2) */
        TCR_EL2,        /* Translation Control Register (EL2) */
        PIRE0_EL2,        /* Permission Indirection Register 0 (EL2) */
        PIR_EL2,        /* Permission Indirection Register 2 (EL2) */
        POR_EL2,        /* Permission Overlay Register 2 (EL2) */
        SPSR_EL2,        /* EL2 saved program status register */
        ELR_EL2,        /* EL2 exception link register */
        AFSR0_EL2,        /* Auxiliary Fault Status Register 0 (EL2) */
        AFSR1_EL2,        /* Auxiliary Fault Status Register 1 (EL2) */
        ESR_EL2,        /* Exception Syndrome Register (EL2) */
        FAR_EL2,        /* Fault Address Register (EL2) */
        HPFAR_EL2,        /* Hypervisor IPA Fault Address Register */
        MAIR_EL2,        /* Memory Attribute Indirection Register (EL2) */
        AMAIR_EL2,        /* Auxiliary Memory Attribute Indirection Register (EL2) */
        VBAR_EL2,        /* Vector Base Address Register (EL2) */
        RVBAR_EL2,        /* Reset Vector Base Address Register */
        CONTEXTIDR_EL2,        /* Context ID Register (EL2) */
        SP_EL2,                /* EL2 Stack Pointer */
        CNTHP_CTL_EL2,
        CNTHP_CVAL_EL2,
        CNTHV_CTL_EL2,
        CNTHV_CVAL_EL2,

        /* Anything from this can be RES0/RES1 sanitised */
        MARKER(__SANITISED_REG_START__),
        TCR2_EL2,        /* Extended Translation Control Register (EL2) */
        MDCR_EL2,        /* Monitor Debug Configuration Register (EL2) */
        CNTHCTL_EL2,        /* Counter-timer Hypervisor Control register */

        /* Any VNCR-capable reg goes after this point */
        MARKER(__VNCR_START__),

        VNCR(SCTLR_EL1),/* System Control Register */
        VNCR(ACTLR_EL1),/* Auxiliary Control Register */
        VNCR(CPACR_EL1),/* Coprocessor Access Control */
        VNCR(ZCR_EL1),        /* SVE Control */
        VNCR(TTBR0_EL1),/* Translation Table Base Register 0 */
        VNCR(TTBR1_EL1),/* Translation Table Base Register 1 */
        VNCR(TCR_EL1),        /* Translation Control Register */
        VNCR(TCR2_EL1),        /* Extended Translation Control Register */
        VNCR(ESR_EL1),        /* Exception Syndrome Register */
        VNCR(AFSR0_EL1),/* Auxiliary Fault Status Register 0 */
        VNCR(AFSR1_EL1),/* Auxiliary Fault Status Register 1 */
        VNCR(FAR_EL1),        /* Fault Address Register */
        VNCR(MAIR_EL1),        /* Memory Attribute Indirection Register */
        VNCR(VBAR_EL1),        /* Vector Base Address Register */
        VNCR(CONTEXTIDR_EL1),        /* Context ID Register */
        VNCR(AMAIR_EL1),/* Aux Memory Attribute Indirection Register */
        VNCR(MDSCR_EL1),/* Monitor Debug System Control Register */
        VNCR(ELR_EL1),        /* Exception Link Register (EL1) */
        VNCR(SP_EL1),        /* Stack Pointer (EL1) */
        VNCR(SPSR_EL1),        /* Saved Program Status Register (EL1) */
        VNCR(TFSR_EL1),        /* Tag Fault Status Register (EL1) */
        VNCR(VPIDR_EL2),/* Virtualization Processor ID Register */
        VNCR(VMPIDR_EL2),/* Virtualization Multiprocessor ID Register */
        VNCR(HCR_EL2),        /* Hypervisor Configuration Register */
        VNCR(HSTR_EL2),        /* Hypervisor System Trap Register */
        VNCR(VTTBR_EL2),/* Virtualization Translation Table Base Register */
        VNCR(VTCR_EL2),        /* Virtualization Translation Control Register */
        VNCR(TPIDR_EL2),/* EL2 Software Thread ID Register */
        VNCR(HCRX_EL2),        /* Extended Hypervisor Configuration Register */

        /* Permission Indirection Extension registers */
        VNCR(PIR_EL1),         /* Permission Indirection Register 1 (EL1) */
        VNCR(PIRE0_EL1), /* Permission Indirection Register 0 (EL1) */

        VNCR(POR_EL1),        /* Permission Overlay Register 1 (EL1) */

        VNCR(HFGRTR_EL2),
        VNCR(HFGWTR_EL2),
        VNCR(HFGITR_EL2),
        VNCR(HDFGRTR_EL2),
        VNCR(HDFGWTR_EL2),
        VNCR(HAFGRTR_EL2),
        VNCR(HFGRTR2_EL2),
        VNCR(HFGWTR2_EL2),
        VNCR(HFGITR2_EL2),
        VNCR(HDFGRTR2_EL2),
        VNCR(HDFGWTR2_EL2),

        VNCR(VNCR_EL2),

        VNCR(CNTVOFF_EL2),
        VNCR(CNTV_CVAL_EL0),
        VNCR(CNTV_CTL_EL0),
        VNCR(CNTP_CVAL_EL0),
        VNCR(CNTP_CTL_EL0),

        VNCR(ICH_LR0_EL2),
        VNCR(ICH_LR1_EL2),
        VNCR(ICH_LR2_EL2),
        VNCR(ICH_LR3_EL2),
        VNCR(ICH_LR4_EL2),
        VNCR(ICH_LR5_EL2),
        VNCR(ICH_LR6_EL2),
        VNCR(ICH_LR7_EL2),
        VNCR(ICH_LR8_EL2),
        VNCR(ICH_LR9_EL2),
        VNCR(ICH_LR10_EL2),
        VNCR(ICH_LR11_EL2),
        VNCR(ICH_LR12_EL2),
        VNCR(ICH_LR13_EL2),
        VNCR(ICH_LR14_EL2),
        VNCR(ICH_LR15_EL2),

        VNCR(ICH_AP0R0_EL2),
        VNCR(ICH_AP0R1_EL2),
        VNCR(ICH_AP0R2_EL2),
        VNCR(ICH_AP0R3_EL2),
        VNCR(ICH_AP1R0_EL2),
        VNCR(ICH_AP1R1_EL2),
        VNCR(ICH_AP1R2_EL2),
        VNCR(ICH_AP1R3_EL2),
        VNCR(ICH_HCR_EL2),
        VNCR(ICH_VMCR_EL2),

        NR_SYS_REGS        /* Nothing after this line! */
};

/*
 * RES0/RES1 masks for the sanitised registers: one entry for every enum
 * vcpu_sysreg value from __SANITISED_REG_START__ up to (but excluding)
 * NR_SYS_REGS.
 */
struct kvm_sysreg_masks {
        struct {
                u64        res0;        /* RES0 mask for the register */
                u64        res1;        /* RES1 mask for the register */
        } mask[NR_SYS_REGS - __SANITISED_REG_START__];
};

/*
 * A named set of bit masks. One instance is declared per HFGxTR/HDFGxTR/
 * HAFGRTR register (see the hfg*_masks / hdfg*_masks / hafgrtr_masks
 * externs).
 * NOTE(review): the precise meaning of mask/nmask/res0 is established by
 * the code that fills these in -- confirm there before relying on it.
 */
struct fgt_masks {
        const char        *str;        /* presumably the register name -- TODO confirm */
        u64                mask;
        u64                nmask;
        u64                res0;
};

extern struct fgt_masks hfgrtr_masks;
extern struct fgt_masks hfgwtr_masks;
extern struct fgt_masks hfgitr_masks;
extern struct fgt_masks hdfgrtr_masks;
extern struct fgt_masks hdfgwtr_masks;
extern struct fgt_masks hafgrtr_masks;
extern struct fgt_masks hfgrtr2_masks;
extern struct fgt_masks hfgwtr2_masks;
extern struct fgt_masks hfgitr2_masks;
extern struct fgt_masks hdfgrtr2_masks;
extern struct fgt_masks hdfgwtr2_masks;

extern struct fgt_masks kvm_nvhe_sym(hfgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgwtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgitr_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgwtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hafgrtr_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgrtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgwtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hfgitr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgrtr2_masks);
extern struct fgt_masks kvm_nvhe_sym(hdfgwtr2_masks);

/*
 * A CPU register context: general-purpose registers, the abt/und/irq/fiq
 * SPSRs, FP/SIMD state and the system register file.
 */
struct kvm_cpu_context {
        struct user_pt_regs regs;        /* sp = sp_el0 */

        u64        spsr_abt;
        u64        spsr_und;
        u64        spsr_irq;
        u64        spsr_fiq;

        struct user_fpsimd_state fp_regs;

        /* Indexed by enum vcpu_sysreg */
        u64 sys_regs[NR_SYS_REGS];

        struct kvm_vcpu *__hyp_running_vcpu;

        /*
         * This pointer has to be 4kB aligned.
         *
         * When it is non-NULL and the CPUs have nested virt, registers at
         * or above __VNCR_START__ are backed by this array (indexed by
         * reg - __VNCR_START__) rather than by sys_regs[].
         */
        u64 *vncr_array;
};

/*
 * Save area for SVE-related state: ZCR_EL1, FPSR/FPCR and a trailing
 * variable-length buffer for the SVE registers.
 */
struct cpu_sve_state {
        __u64 zcr_el1;

        /*
         * Ordering is important since __sve_save_state/__sve_restore_state
         * relies on it.
         */
        __u32 fpsr;
        __u32 fpcr;

        /* Must be SVE_VQ_BYTES (128 bit) aligned. */
        __u8 sve_regs[];
};

/*
 * This structure is instantiated on a per-CPU basis, and contains
 * data that is:
 *
 * - tied to a single physical CPU, and
 * - either has a lifetime that does not extend past vcpu_put()
 * - or is an invariant for the lifetime of the system
 *
 * Use host_data_ptr(field) as a way to access a pointer to such a
 * field.
 */
struct kvm_host_data {
/*
 * Flag numbers for 'flags' below.
 * NOTE(review): presumably bit indices into 'flags'; values 2 and 3 are
 * not assigned here -- confirm against the accessors.
 */
#define KVM_HOST_DATA_FLAG_HAS_SPE                        0
#define KVM_HOST_DATA_FLAG_HAS_TRBE                        1
#define KVM_HOST_DATA_FLAG_TRBE_ENABLED                        4
#define KVM_HOST_DATA_FLAG_EL1_TRACING_CONFIGURED        5
#define KVM_HOST_DATA_FLAG_VCPU_IN_HYP_CONTEXT                6
#define KVM_HOST_DATA_FLAG_L1_VNCR_MAPPED                7
        unsigned long flags;

        /* Register context of the host */
        struct kvm_cpu_context host_ctxt;

        /*
         * Hyp VA.
         * sve_state is only used in pKVM and if system_supports_sve().
         */
        struct cpu_sve_state *sve_state;

        /* Used by pKVM only. */
        u64        fpmr;

        /* Ownership of the FP regs */
        enum {
                FP_STATE_FREE,
                FP_STATE_HOST_OWNED,
                FP_STATE_GUEST_OWNED,
        } fp_owner;

        /*
         * host_debug_state contains the host registers which are
         * saved and restored during world switches.
         */
        struct {
                /* {Break,watch}point registers */
                struct kvm_guest_debug_arch regs;
                /* Statistical profiling extension */
                u64 pmscr_el1;
                /* Self-hosted trace */
                u64 trfcr_el1;
                /* Values of trap registers for the host before guest entry. */
                u64 mdcr_el2;
        } host_debug_state;

        /* Guest trace filter value */
        u64 trfcr_while_in_guest;

        /* Number of programmable event counters (PMCR_EL0.N) for this CPU */
        unsigned int nr_event_counters;

        /* Number of debug breakpoints/watchpoints for this CPU (minus 1) */
        unsigned int debug_brps;
        unsigned int debug_wrps;
};

/*
 * Description of the PSCI/SMCCC configuration used by the host: versions,
 * the v0.1 function IDs, and which v0.1 functions are implemented.
 */
struct kvm_host_psci_config {
        /* PSCI version used by host. */
        u32 version;
        /* NOTE(review): presumably the SMCCC version used by host -- confirm */
        u32 smccc_version;

        /* Function IDs used by host if version is v0.1. */
        struct psci_0_1_function_ids function_ids_0_1;

        /* Which PSCI v0.1 functions are implemented */
        bool psci_0_1_cpu_suspend_implemented;
        bool psci_0_1_cpu_on_implemented;
        bool psci_0_1_cpu_off_implemented;
        bool psci_0_1_migrate_implemented;
};

extern struct kvm_host_psci_config kvm_nvhe_sym(kvm_host_psci_config);
#define kvm_host_psci_config CHOOSE_NVHE_SYM(kvm_host_psci_config)

extern s64 kvm_nvhe_sym(hyp_physvirt_offset);
#define hyp_physvirt_offset CHOOSE_NVHE_SYM(hyp_physvirt_offset)

extern u64 kvm_nvhe_sym(hyp_cpu_logical_map)[NR_CPUS];
#define hyp_cpu_logical_map CHOOSE_NVHE_SYM(hyp_cpu_logical_map)

/*
 * Additional per-vcpu reset state (embedded in struct kvm_vcpu_arch as
 * 'reset_state').
 * NOTE(review): field meanings below are inferred from their names --
 * confirm against the reset code.
 */
struct vcpu_reset_state {
        unsigned long        pc;        /* presumably the PC to start from */
        unsigned long        r0;        /* presumably the initial value of r0/x0 */
        bool                be;        /* presumably: start in big-endian mode */
        bool                reset;        /* presumably: a reset request is pending */
};

struct vncr_tlb;

/*
 * Per-vcpu architecture state. The cflags/iflags/sflags members are the
 * flag sets named by the vcpu flag triplets and are meant to be accessed
 * through vcpu_{get,set,clear}_flag().
 */
struct kvm_vcpu_arch {
        /* Guest register context */
        struct kvm_cpu_context ctxt;

        /*
         * Guest floating point state
         *
         * The architecture has two main floating point extensions,
         * the original FPSIMD and SVE.  These have overlapping
         * register views, with the FPSIMD V registers occupying the
         * low 128 bits of the SVE Z registers.  When the core
         * floating point code saves the register state of a task it
         * records which view it saved in fp_type.
         */
        void *sve_state;
        enum fp_type fp_type;
        unsigned int sve_max_vl;

        /* Stage 2 paging state used by the hardware on next switch */
        struct kvm_s2_mmu *hw_mmu;

        /* Values of trap registers for the guest. */
        u64 hcr_el2;
        u64 hcrx_el2;
        u64 mdcr_el2;

        /* Exception Information */
        struct kvm_vcpu_fault_info fault;

        /* Configuration flags, set once and for all before the vcpu can run */
        u8 cflags;

        /* Input flags to the hypervisor code, potentially cleared after use */
        u8 iflags;

        /* State flags for kernel bookkeeping, unused by the hypervisor code */
        u8 sflags;

        /*
         * Don't run the guest (internal implementation need).
         *
         * Contrary to the flags above, this is set/cleared outside of
         * a vcpu context, and thus cannot be mixed with the flags
         * themselves (or the flag accesses need to be made atomic).
         */
        bool pause;

        /*
         * We maintain more than a single set of debug registers to support
         * debugging the guest from the host and to maintain separate host and
         * guest state during world switches. vcpu_debug_state are the debug
         * registers of the vcpu as the guest sees them.
         *
         * external_debug_state contains the debug values we want to debug the
         * guest. This is set via the KVM_SET_GUEST_DEBUG ioctl.
         */
        struct kvm_guest_debug_arch vcpu_debug_state;
        struct kvm_guest_debug_arch external_debug_state;
        u64 external_mdscr_el1;

        /* Current owner of the debug registers */
        enum {
                VCPU_DEBUG_FREE,
                VCPU_DEBUG_HOST_OWNED,
                VCPU_DEBUG_GUEST_OWNED,
        } debug_owner;

        /* VGIC state */
        struct vgic_cpu vgic_cpu;
        struct arch_timer_cpu timer_cpu;
        struct kvm_pmu pmu;

        /* vcpu power state */
        struct kvm_mp_state mp_state;
        spinlock_t mp_state_lock;

        /* Cache some mmu pages needed inside spinlock regions */
        struct kvm_mmu_memory_cache mmu_page_cache;

        /* Pages to top-up the pKVM/EL2 guest pool */
        struct kvm_hyp_memcache pkvm_memcache;

        /* Virtual SError ESR to restore when HCR_EL2.VSE is set */
        u64 vsesr_el2;

        /* Additional reset state */
        struct vcpu_reset_state        reset_state;

        /* Guest PV state */
        struct {
                u64 last_steal;
                gpa_t base;
        } steal;

        /* Per-vcpu CCSIDR override or NULL */
        u32 *ccsidr;

        /* Per-vcpu TLB for VNCR_EL2 -- NULL when !NV */
        struct vncr_tlb        *vncr_tlb;
};

/*
 * Each 'flag' is composed of a comma-separated triplet:
 *
 * - the flag-set it belongs to in the vcpu->arch structure
 * - the value for that flag
 * - the mask for that flag
 *
 * __vcpu_single_flag() builds such a triplet for a single-bit flag.
 * unpack_vcpu_flag() extracts the flag value from the triplet for
 * direct use outside of the flag accessors.
 */
#define __vcpu_single_flag(_set, _f)        _set, (_f), (_f)

#define __unpack_flag(_set, _f, _m)        _f
#define unpack_vcpu_flag(...)                __unpack_flag(__VA_ARGS__)

/*
 * Build-time sanity checks for a flag triplet: fail the build if the
 * value @f has bits outside the mask @m, or if the highest bit of @m does
 * not fit in the type of v->arch.<flagset>. Generates no runtime code
 * beyond what BUILD_BUG_ON() itself emits.
 */
#define __build_check_flag(v, flagset, f, m)                        \
        do {                                                        \
                typeof(v->arch.flagset) *_fset;                        \
                                                                \
                /* Check that the flags fit in the mask */        \
                BUILD_BUG_ON(HWEIGHT(m) != HWEIGHT((f) | (m)));        \
                /* Check that the flags fit in the type */        \
                BUILD_BUG_ON((sizeof(*_fset) * 8) <= __fls(m));        \
        } while (0)

/*
 * Read a flag: evaluates to the bits of v->arch.<flagset> selected by the
 * mask @m (read once via READ_ONCE()). The result is the masked field in
 * place, not shifted down and not normalised to 0/1.
 */
#define __vcpu_get_flag(v, flagset, f, m)                        \
        ({                                                        \
                __build_check_flag(v, flagset, f, m);                \
                                                                \
                READ_ONCE(v->arch.flagset) & (m);                \
        })

/*
 * Note that the set/clear accessors must be preempt-safe in order to
 * avoid nesting them with load/put which also manipulate flags...
 */
#ifdef __KVM_NVHE_HYPERVISOR__
/* the nVHE hypervisor is always non-preemptible */
#define __vcpu_flags_preempt_disable()
#define __vcpu_flags_preempt_enable()
#else
#define __vcpu_flags_preempt_disable()        preempt_disable()
#define __vcpu_flags_preempt_enable()        preempt_enable()
#endif

/*
 * Set a flag: OR the value @f into v->arch.<flagset>. When the mask @m
 * covers more than one bit, the whole field is cleared first so that the
 * new value replaces the old one. The read-modify-write runs between
 * __vcpu_flags_preempt_disable() and __vcpu_flags_preempt_enable().
 */
#define __vcpu_set_flag(v, flagset, f, m)                        \
        do {                                                        \
                typeof(v->arch.flagset) *fset;                        \
                                                                \
                __build_check_flag(v, flagset, f, m);                \
                                                                \
                fset = &v->arch.flagset;                        \
                __vcpu_flags_preempt_disable();                        \
                if (HWEIGHT(m) > 1)                                \
                        *fset &= ~(m);                                \
                *fset |= (f);                                        \
                __vcpu_flags_preempt_enable();                        \
        } while (0)

/*
 * Clear a flag: remove every bit of the mask @m from v->arch.<flagset>.
 * The value @f is only used by the build-time checks. The update runs
 * between __vcpu_flags_preempt_disable() and __vcpu_flags_preempt_enable().
 */
#define __vcpu_clear_flag(v, flagset, f, m)                        \
        do {                                                        \
                typeof(v->arch.flagset) *fset;                        \
                                                                \
                __build_check_flag(v, flagset, f, m);                \
                                                                \
                fset = &v->arch.flagset;                        \
                __vcpu_flags_preempt_disable();                        \
                *fset &= ~(m);                                        \
                __vcpu_flags_preempt_enable();                        \
        } while (0)

#define vcpu_get_flag(v, ...)        __vcpu_get_flag((v), __VA_ARGS__)
#define vcpu_set_flag(v, ...)        __vcpu_set_flag((v), __VA_ARGS__)
#define vcpu_clear_flag(v, ...)        __vcpu_clear_flag((v), __VA_ARGS__)

/* KVM_ARM_VCPU_INIT completed */
#define VCPU_INITIALIZED        __vcpu_single_flag(cflags, BIT(0))
/* SVE config completed */
#define VCPU_SVE_FINALIZED        __vcpu_single_flag(cflags, BIT(1))
/* pKVM VCPU setup completed */
#define VCPU_PKVM_FINALIZED        __vcpu_single_flag(cflags, BIT(2))

/* Exception pending */
#define PENDING_EXCEPTION        __vcpu_single_flag(iflags, BIT(0))
/*
 * PC increment. Overlaps with EXCEPT_MASK on purpose so that it can't
 * be set together with an exception...
 */
#define INCREMENT_PC                __vcpu_single_flag(iflags, BIT(1))
/* Target EL/MODE (not a single flag, but let's abuse the macro) */
#define EXCEPT_MASK                __vcpu_single_flag(iflags, GENMASK(3, 1))

/* Helpers to encode exceptions with minimum fuss */
#define __EXCEPT_MASK_VAL        unpack_vcpu_flag(EXCEPT_MASK)
#define __EXCEPT_SHIFT                __builtin_ctzl(__EXCEPT_MASK_VAL)
/*
 * Build the (flag-set, value, mask) triplet for exception encoding _f:
 * the value is _f shifted up to the lowest set bit of EXCEPT_MASK.
 * _f is parenthesised so that an expression argument is shifted as a
 * whole rather than being split by operator precedence.
 */
#define __vcpu_except_flags(_f)        iflags, ((_f) << __EXCEPT_SHIFT), __EXCEPT_MASK_VAL

/*
 * When PENDING_EXCEPTION is set, EXCEPT_MASK can take the following
 * values:
 *
 * For AArch32 EL1:
 */
#define EXCEPT_AA32_UND                __vcpu_except_flags(0)
#define EXCEPT_AA32_IABT        __vcpu_except_flags(1)
#define EXCEPT_AA32_DABT        __vcpu_except_flags(2)
/* For AArch64: */
#define EXCEPT_AA64_EL1_SYNC        __vcpu_except_flags(0)
#define EXCEPT_AA64_EL1_IRQ        __vcpu_except_flags(1)
#define EXCEPT_AA64_EL1_FIQ        __vcpu_except_flags(2)
#define EXCEPT_AA64_EL1_SERR        __vcpu_except_flags(3)
/* For AArch64 with NV: */
#define EXCEPT_AA64_EL2_SYNC        __vcpu_except_flags(4)
#define EXCEPT_AA64_EL2_IRQ        __vcpu_except_flags(5)
#define EXCEPT_AA64_EL2_FIQ        __vcpu_except_flags(6)
#define EXCEPT_AA64_EL2_SERR        __vcpu_except_flags(7)

/* Physical CPU not in supported_cpus */
#define ON_UNSUPPORTED_CPU        __vcpu_single_flag(sflags, BIT(0))
/* WFIT instruction trapped */
#define IN_WFIT                        __vcpu_single_flag(sflags, BIT(1))
/* vcpu system registers loaded on physical CPU */
#define SYSREGS_ON_CPU                __vcpu_single_flag(sflags, BIT(2))
/* Software step state is Active-pending for external debug */
#define HOST_SS_ACTIVE_PENDING        __vcpu_single_flag(sflags, BIT(3))
/* Software step state is Active pending for guest debug */
#define GUEST_SS_ACTIVE_PENDING __vcpu_single_flag(sflags, BIT(4))
/* PMUSERENR for the guest EL0 is on physical CPU */
#define PMUSERENR_ON_CPU        __vcpu_single_flag(sflags, BIT(5))
/* WFI instruction trapped */
#define IN_WFI                        __vcpu_single_flag(sflags, BIT(6))
/* KVM is currently emulating a nested ERET */
#define IN_NESTED_ERET                __vcpu_single_flag(sflags, BIT(7))


/* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
#define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +        \
                             sve_ffr_offset((vcpu)->arch.sve_max_vl))

#define vcpu_sve_max_vq(vcpu)        sve_vq_from_vl((vcpu)->arch.sve_max_vl)

#define vcpu_sve_zcr_elx(vcpu)                                                \
        (unlikely(is_hyp_ctxt(vcpu)) ? ZCR_EL2 : ZCR_EL1)

/*
 * Size of the SVE register state for a maximum vector length of
 * @sve_max_vl: SVE_SIG_REGS_SIZE() of the corresponding VQ. If the vector
 * length is not valid, this warns (WARN_ON) and evaluates to 0.
 *
 * Note: @sve_max_vl is expanded more than once, so pass an expression
 * without side effects.
 */
#define sve_state_size_from_vl(sve_max_vl) ({                                \
        size_t __size_ret;                                                \
        unsigned int __vq;                                                \
                                                                        \
        if (WARN_ON(!sve_vl_valid(sve_max_vl))) {                        \
                __size_ret = 0;                                                \
        } else {                                                        \
                __vq = sve_vq_from_vl(sve_max_vl);                        \
                __size_ret = SVE_SIG_REGS_SIZE(__vq);                        \
        }                                                                \
                                                                        \
        __size_ret;                                                        \
})

/* Size of the SVE register state for @vcpu, based on its sve_max_vl */
#define vcpu_sve_state_size(vcpu) sve_state_size_from_vl((vcpu)->arch.sve_max_vl)

#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
                                 KVM_GUESTDBG_USE_SW_BP | \
                                 KVM_GUESTDBG_USE_HW | \
                                 KVM_GUESTDBG_SINGLESTEP)

#define kvm_has_sve(kvm)        (system_supports_sve() &&                \
                                 test_bit(KVM_ARCH_FLAG_GUEST_HAS_SVE, &(kvm)->arch.flags))

#ifdef __KVM_NVHE_HYPERVISOR__
#define vcpu_has_sve(vcpu)        kvm_has_sve(kern_hyp_va((vcpu)->kvm))
#else
#define vcpu_has_sve(vcpu)        kvm_has_sve((vcpu)->kvm)
#endif

#ifdef CONFIG_ARM64_PTR_AUTH
#define vcpu_has_ptrauth(vcpu)                                                \
        ((cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH) ||                \
          cpus_have_final_cap(ARM64_HAS_GENERIC_AUTH)) &&                \
         (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) ||       \
          vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)))
#else
#define vcpu_has_ptrauth(vcpu)                false
#endif

#define vcpu_on_unsupported_cpu(vcpu)                                        \
        vcpu_get_flag(vcpu, ON_UNSUPPORTED_CPU)

#define vcpu_set_on_unsupported_cpu(vcpu)                                \
        vcpu_set_flag(vcpu, ON_UNSUPPORTED_CPU)

#define vcpu_clear_on_unsupported_cpu(vcpu)                                \
        vcpu_clear_flag(vcpu, ON_UNSUPPORTED_CPU)

#define vcpu_gp_regs(v)                (&(v)->arch.ctxt.regs)

/*
 * Only use __vcpu_sys_reg/ctxt_sys_reg if you know you want the
 * memory backed version of a register, and not the one most recently
 * accessed by a running VCPU.  For example, for userspace access or
 * for system registers that are never context switched, but only
 * emulated.
 *
 * Don't bother with VNCR-based accesses in the nVHE code, it has no
 * business dealing with NV.
 */
static inline u64 *___ctxt_sys_reg(const struct kvm_cpu_context *ctxt, int r)
{
#if !defined (__KVM_NVHE_HYPERVISOR__)
        if (unlikely(cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) &&
                     r >= __VNCR_START__ && ctxt->vncr_array))
                return &ctxt->vncr_array[r - __VNCR_START__];
#endif
        return (u64 *)&ctxt->sys_regs[r];
}

/*
 * Pointer to the storage of register @r in context @c. When @r is a
 * compile-time constant, the build fails if it is not below NR_SYS_REGS;
 * no range check is made for non-constant values.
 */
#define __ctxt_sys_reg(c,r)                                                \
        ({                                                                \
                BUILD_BUG_ON(__builtin_constant_p(r) &&                        \
                             (r) >= NR_SYS_REGS);                        \
                ___ctxt_sys_reg(c, r);                                        \
        })

/* lvalue for the storage of register @r in context @c */
#define ctxt_sys_reg(c,r)        (*__ctxt_sys_reg(c,r))

u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *, enum vcpu_sysreg, u64);
/*
 * lvalue for the memory-backed copy of register @r of vcpu @v.
 *
 * For a vcpu with NV and a register at or above __SANITISED_REG_START__,
 * the stored value is first rewritten in place with the result of
 * kvm_vcpu_apply_reg_masks(), so merely evaluating this macro can modify
 * the backing store.
 *
 * Note: @v and @r are expanded several times, and the expansion declares
 * locals named 'ctxt' and '__r'.
 */
#define __vcpu_sys_reg(v,r)                                                \
        (*({                                                                \
                const struct kvm_cpu_context *ctxt = &(v)->arch.ctxt;        \
                u64 *__r = __ctxt_sys_reg(ctxt, (r));                        \
                if (vcpu_has_nv((v)) && (r) >= __SANITISED_REG_START__)        \
                        *__r = kvm_vcpu_apply_reg_masks((v), (r), *__r);\
                __r;                                                        \
        }))

u64 vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg);
void vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg);

static inline bool __vcpu_read_sys_reg_from_cpu(int reg, u64 *val)
{
        u64 hw_val;

        /*
         * *** VHE ONLY ***
         *
         * The registers handled below are not saved on every guest exit;
         * they are only written back to memory on vcpu_put, so their
         * current value has to be read from the CPU.
         *
         * MPIDR_EL1 must never be added here: although KVM sets it for
         * the guest via VMPIDR_EL2, the guest cannot modify it, and it is
         * read for VCPU A from VCPU B's thread when emulating cross-VCPU
         * communication.
         *
         * Returns true with *val filled in when @reg was read from the
         * CPU, false (leaving *val untouched) otherwise.
         */
        if (!has_vhe())
                return false;

        switch (reg) {
        case SCTLR_EL1:
                hw_val = read_sysreg_s(SYS_SCTLR_EL12);
                break;
        case CPACR_EL1:
                hw_val = read_sysreg_s(SYS_CPACR_EL12);
                break;
        case TTBR0_EL1:
                hw_val = read_sysreg_s(SYS_TTBR0_EL12);
                break;
        case TTBR1_EL1:
                hw_val = read_sysreg_s(SYS_TTBR1_EL12);
                break;
        case TCR_EL1:
                hw_val = read_sysreg_s(SYS_TCR_EL12);
                break;
        case TCR2_EL1:
                hw_val = read_sysreg_s(SYS_TCR2_EL12);
                break;
        case PIR_EL1:
                hw_val = read_sysreg_s(SYS_PIR_EL12);
                break;
        case PIRE0_EL1:
                hw_val = read_sysreg_s(SYS_PIRE0_EL12);
                break;
        case POR_EL1:
                hw_val = read_sysreg_s(SYS_POR_EL12);
                break;
        case ESR_EL1:
                hw_val = read_sysreg_s(SYS_ESR_EL12);
                break;
        case AFSR0_EL1:
                hw_val = read_sysreg_s(SYS_AFSR0_EL12);
                break;
        case AFSR1_EL1:
                hw_val = read_sysreg_s(SYS_AFSR1_EL12);
                break;
        case FAR_EL1:
                hw_val = read_sysreg_s(SYS_FAR_EL12);
                break;
        case MAIR_EL1:
                hw_val = read_sysreg_s(SYS_MAIR_EL12);
                break;
        case VBAR_EL1:
                hw_val = read_sysreg_s(SYS_VBAR_EL12);
                break;
        case CONTEXTIDR_EL1:
                hw_val = read_sysreg_s(SYS_CONTEXTIDR_EL12);
                break;
        case TPIDR_EL0:
                hw_val = read_sysreg_s(SYS_TPIDR_EL0);
                break;
        case TPIDRRO_EL0:
                hw_val = read_sysreg_s(SYS_TPIDRRO_EL0);
                break;
        case TPIDR_EL1:
                hw_val = read_sysreg_s(SYS_TPIDR_EL1);
                break;
        case AMAIR_EL1:
                hw_val = read_sysreg_s(SYS_AMAIR_EL12);
                break;
        case CNTKCTL_EL1:
                hw_val = read_sysreg_s(SYS_CNTKCTL_EL12);
                break;
        case ELR_EL1:
                hw_val = read_sysreg_s(SYS_ELR_EL12);
                break;
        case SPSR_EL1:
                hw_val = read_sysreg_s(SYS_SPSR_EL12);
                break;
        case PAR_EL1:
                hw_val = read_sysreg_par();
                break;
        case DACR32_EL2:
                hw_val = read_sysreg_s(SYS_DACR32_EL2);
                break;
        case IFSR32_EL2:
                hw_val = read_sysreg_s(SYS_IFSR32_EL2);
                break;
        case DBGVCR32_EL2:
                hw_val = read_sysreg_s(SYS_DBGVCR32_EL2);
                break;
        case ZCR_EL1:
                hw_val = read_sysreg_s(SYS_ZCR_EL12);
                break;
        default:
                /* Not a register that lives on the CPU while loaded */
                return false;
        }

        *val = hw_val;
        return true;
}

/*
 * Write @val to the hardware system register that backs vcpu register @reg.
 *
 * Returns true if @reg is one of the registers listed in the switch below
 * and the write was issued.  Returns false, without writing anything, on
 * non-VHE systems or when @reg is not listed.
 */
static inline bool __vcpu_write_sys_reg_to_cpu(u64 val, int reg)
{
        /*
         * *** VHE ONLY ***
         *
         * System registers listed in the switch are not restored on every
         * entry to the guest but are only restored on vcpu_load.
         *
         * Note that MPIDR_EL1 for the guest is set by KVM via VMPIDR_EL2 but
         * should never be listed below, because the MPIDR should only be set
         * once, before running the VCPU, and never changed later.
         */
        if (!has_vhe())
                return false;

        switch (reg) {
        case SCTLR_EL1:                write_sysreg_s(val, SYS_SCTLR_EL12);        break;
        case CPACR_EL1:                write_sysreg_s(val, SYS_CPACR_EL12);        break;
        case TTBR0_EL1:                write_sysreg_s(val, SYS_TTBR0_EL12);        break;
        case TTBR1_EL1:                write_sysreg_s(val, SYS_TTBR1_EL12);        break;
        case TCR_EL1:                write_sysreg_s(val, SYS_TCR_EL12);        break;
        case TCR2_EL1:                write_sysreg_s(val, SYS_TCR2_EL12);        break;
        case PIR_EL1:                write_sysreg_s(val, SYS_PIR_EL12);        break;
        case PIRE0_EL1:                write_sysreg_s(val, SYS_PIRE0_EL12);        break;
        case POR_EL1:                write_sysreg_s(val, SYS_POR_EL12);        break;
        case ESR_EL1:                write_sysreg_s(val, SYS_ESR_EL12);        break;
        case AFSR0_EL1:                write_sysreg_s(val, SYS_AFSR0_EL12);        break;
        case AFSR1_EL1:                write_sysreg_s(val, SYS_AFSR1_EL12);        break;
        case FAR_EL1:                write_sysreg_s(val, SYS_FAR_EL12);        break;
        case MAIR_EL1:                write_sysreg_s(val, SYS_MAIR_EL12);        break;
        case VBAR_EL1:                write_sysreg_s(val, SYS_VBAR_EL12);        break;
        case CONTEXTIDR_EL1:        write_sysreg_s(val, SYS_CONTEXTIDR_EL12);break;
        /* The following registers are written through their own (non-EL12) encoding. */
        case TPIDR_EL0:                write_sysreg_s(val, SYS_TPIDR_EL0);        break;
        case TPIDRRO_EL0:        write_sysreg_s(val, SYS_TPIDRRO_EL0);        break;
        case TPIDR_EL1:                write_sysreg_s(val, SYS_TPIDR_EL1);        break;
        case AMAIR_EL1:                write_sysreg_s(val, SYS_AMAIR_EL12);        break;
        case CNTKCTL_EL1:        write_sysreg_s(val, SYS_CNTKCTL_EL12);        break;
        case ELR_EL1:                write_sysreg_s(val, SYS_ELR_EL12);        break;
        case SPSR_EL1:                write_sysreg_s(val, SYS_SPSR_EL12);        break;
        case PAR_EL1:                write_sysreg_s(val, SYS_PAR_EL1);        break;
        case DACR32_EL2:        write_sysreg_s(val, SYS_DACR32_EL2);        break;
        case IFSR32_EL2:        write_sysreg_s(val, SYS_IFSR32_EL2);        break;
        case DBGVCR32_EL2:        write_sysreg_s(val, SYS_DBGVCR32_EL2);        break;
        case ZCR_EL1:                write_sysreg_s(val, SYS_ZCR_EL12);        break;
        /* Anything else is not handled here; tell the caller so. */
        default:                return false;
        }

        return true;
}

/* Per-VM statistics: only the generic KVM block, no arm64-specific fields. */
struct kvm_vm_stat {
        struct kvm_vm_stat_generic generic;
};

/*
 * Per-vCPU statistics: the generic KVM block followed by arm64-specific
 * 64-bit counters.
 * NOTE(review): judging by their names, the counters track exits by cause
 * (HVC, WFE, WFI, MMIO handled in userspace/kernel, signals) plus a total;
 * confirm against the code that increments them.
 */
struct kvm_vcpu_stat {
        struct kvm_vcpu_stat_generic generic;
        u64 hvc_exit_stat;
        u64 wfe_exit_stat;
        u64 wfi_exit_stat;
        u64 mmio_exit_user;
        u64 mmio_exit_kernel;
        u64 signal_exits;
        u64 exits;
};

unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);

unsigned long kvm_arm_num_sys_reg_descs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);

int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events);

int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
                              struct kvm_vcpu_events *events);

void kvm_arm_halt_guest(struct kvm *kvm);
void kvm_arm_resume_guest(struct kvm *kvm);

/* Evaluates to 1 if (vcpu)->pid, read once, is non-zero/non-NULL, else 0. */
#define vcpu_has_run_once(vcpu)        (!!READ_ONCE((vcpu)->pid))

#ifndef __KVM_NVHE_HYPERVISOR__
/*
 * Call hypervisor function f through an SMCCC HVC, forwarding the extra
 * arguments.  Warns if the call does not return SMCCC_RET_SUCCESS in a0;
 * the macro evaluates to the value returned in a1.
 */
#define kvm_call_hyp_nvhe(f, ...)                                                \
        ({                                                                \
                struct arm_smccc_res res;                                \
                                                                        \
                arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(f),                \
                                  ##__VA_ARGS__, &res);                        \
                WARN_ON(res.a0 != SMCCC_RET_SUCCESS);                        \
                                                                        \
                res.a1;                                                        \
        })

/*
 * The couple of isb() below are there to guarantee the same behaviour
 * on VHE as on !VHE, where the eret to EL1 acts as a context
 * synchronization event.
 */
/* Call f directly on VHE, or via kvm_call_hyp_nvhe() otherwise; result discarded. */
#define kvm_call_hyp(f, ...)                                                \
        do {                                                                \
                if (has_vhe()) {                                        \
                        f(__VA_ARGS__);                                        \
                        isb();                                                \
                } else {                                                \
                        kvm_call_hyp_nvhe(f, ##__VA_ARGS__);                \
                }                                                        \
        } while(0)

/* Same dispatch as kvm_call_hyp(), but evaluates to the value returned by f. */
#define kvm_call_hyp_ret(f, ...)                                        \
        ({                                                                \
                typeof(f(__VA_ARGS__)) ret;                                \
                                                                        \
                if (has_vhe()) {                                        \
                        ret = f(__VA_ARGS__);                                \
                        isb();                                                \
                } else {                                                \
                        ret = kvm_call_hyp_nvhe(f, ##__VA_ARGS__);        \
                }                                                        \
                                                                        \
                ret;                                                        \
        })
#else /* __KVM_NVHE_HYPERVISOR__ */
/* With __KVM_NVHE_HYPERVISOR__ defined, all three variants are a plain call to f. */
#define kvm_call_hyp(f, ...) f(__VA_ARGS__)
#define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
#define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
#endif /* __KVM_NVHE_HYPERVISOR__ */

int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);

int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu);
int kvm_handle_cp14_32(struct kvm_vcpu *vcpu);
int kvm_handle_cp14_64(struct kvm_vcpu *vcpu);
int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
int kvm_handle_cp10_id(struct kvm_vcpu *vcpu);

void kvm_sys_regs_create_debugfs(struct kvm *kvm);
void kvm_reset_sys_regs(struct kvm_vcpu *vcpu);

int __init kvm_sys_reg_table_init(void);
struct sys_reg_desc;
int __init populate_sysreg_config(const struct sys_reg_desc *sr,
                                  unsigned int idx);
int __init populate_nv_trap_config(void);

bool lock_all_vcpus(struct kvm *kvm);
void unlock_all_vcpus(struct kvm *kvm);

void kvm_calculate_traps(struct kvm_vcpu *vcpu);

/* MMIO helpers */
void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);

int kvm_handle_mmio_return(struct kvm_vcpu *vcpu);
int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa);

/*
 * Tell whether a Performance Monitoring Interrupt (PMI, i.e. a perf event)
 * was taken in guest context.  On arm64 an event counts as "in guest"
 * whenever a vCPU is loaded, so a non-NULL @vcpu is the whole test, and
 * only when CONFIG_GUEST_PERF_EVENTS is enabled.
 */
static inline bool kvm_arch_pmi_in_guest(struct kvm_vcpu *vcpu)
{
        if (!IS_ENABLED(CONFIG_GUEST_PERF_EVENTS))
                return false;

        return !!vcpu;
}

long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu);
gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu);
void kvm_update_stolen_time(struct kvm_vcpu *vcpu);

bool kvm_arm_pvtime_supported(void);
int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);
int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr);

extern unsigned int __ro_after_init kvm_arm_vmid_bits;
int __init kvm_arm_vmid_alloc_init(void);
void __init kvm_arm_vmid_alloc_free(void);
void kvm_arm_vmid_update(struct kvm_vmid *kvm_vmid);
void kvm_arm_vmid_clear_active(void);

/*
 * Reset the stolen-time base address to INVALID_GPA, the value that
 * kvm_arm_is_pvtime_enabled() treats as "pvtime not enabled".
 */
static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch)
{
        vcpu_arch->steal.base = INVALID_GPA;
}

/* pvtime counts as enabled once steal.base holds anything but INVALID_GPA. */
static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch)
{
        if (vcpu_arch->steal.base == INVALID_GPA)
                return false;

        return true;
}

void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 syndrome);

struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);

DECLARE_KVM_HYP_PER_CPU(struct kvm_host_data, kvm_host_data);

/*
 * How we access per-CPU host data depends on where we access it from,
 * and the mode we're in:
 *
 * - VHE and nVHE hypervisor bits use their locally defined instance
 *
 * - the rest of the kernel uses either the VHE or nVHE one, depending on
 *   the mode we're running in.
 *
 *   Unless we're in protected mode, fully deprivileged, and the nVHE
 *   per-CPU stuff is exclusively accessible to the protected EL2 code.
 *   In this case, the EL1 code uses the *VHE* data as its private state
 *   (which makes sense in a way as there shouldn't be any shared state
 *   between the host and the hypervisor).
 *
 * Yes, this is all totally trivial. Shoot me now.
 *
 * host_data_ptr(f) evaluates to a pointer to field f of this CPU's
 * struct kvm_host_data instance, selected as described above.
 */
#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)
#define host_data_ptr(f)        (&this_cpu_ptr(&kvm_host_data)->f)
#else
#define host_data_ptr(f)                                                \
        (static_branch_unlikely(&kvm_protected_mode_initialized) ?        \
         &this_cpu_ptr(&kvm_host_data)->f :                                \
         &this_cpu_ptr_hyp_sym(kvm_host_data)->f)
#endif

/*
 * Test, set or clear bit KVM_HOST_DATA_FLAG_<flag> in the 'flags' field of
 * this CPU's host data (located through host_data_ptr()).
 */
#define host_data_test_flag(flag)                                        \
        (test_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags)))
#define host_data_set_flag(flag)                                        \
        set_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags))
#define host_data_clear_flag(flag)                                        \
        clear_bit(KVM_HOST_DATA_FLAG_##flag, host_data_ptr(flags))

/* True when this CPU's fp_owner says the FP registers belong to the guest */
static inline bool guest_owns_fp_regs(void)
{
        if (*host_data_ptr(fp_owner) != FP_STATE_GUEST_OWNED)
                return false;

        return true;
}

/* True when this CPU's fp_owner says the FP registers belong to the host */
static inline bool host_owns_fp_regs(void)
{
        if (*host_data_ptr(fp_owner) != FP_STATE_HOST_OWNED)
                return false;

        return true;
}

static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt)
{
        /* The host's MPIDR is immutable, so let's set it up at boot time */
        ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
}

/* Idmapped vectors are needed exactly when the ARM64_SPECTRE_V3A cap is set. */
static inline bool kvm_system_needs_idmapped_vectors(void)
{
        if (cpus_have_final_cap(ARM64_SPECTRE_V3A))
                return true;

        return false;
}

void kvm_init_host_debug_data(void);
void kvm_vcpu_load_debug(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_debug(struct kvm_vcpu *vcpu);
void kvm_debug_set_guest_ownership(struct kvm_vcpu *vcpu);
void kvm_debug_handle_oslar(struct kvm_vcpu *vcpu, u64 val);

/* 1 if the OSLK bit is set in the vcpu's OSLSR_EL1 value, else 0. */
#define kvm_vcpu_os_lock_enabled(vcpu)                \
        (!!(__vcpu_sys_reg(vcpu, OSLSR_EL1) & OSLSR_EL1_OSLK))

/*
 * Predicates on (vcpu)->arch.debug_owner: in use means anything other than
 * VCPU_DEBUG_FREE; the other two test for host or guest ownership.
 */
#define kvm_debug_regs_in_use(vcpu)                \
        ((vcpu)->arch.debug_owner != VCPU_DEBUG_FREE)
#define kvm_host_owns_debug_regs(vcpu)                \
        ((vcpu)->arch.debug_owner == VCPU_DEBUG_HOST_OWNED)
#define kvm_guest_owns_debug_regs(vcpu)                \
        ((vcpu)->arch.debug_owner == VCPU_DEBUG_GUEST_OWNED)

int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr);
int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr);
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
                               struct kvm_device_attr *attr);

int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,
                               struct kvm_arm_copy_mte_tags *copy_tags);
int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
                                    struct kvm_arm_counter_offset *offset);
int kvm_vm_ioctl_get_reg_writable_masks(struct kvm *kvm,
                                        struct reg_mask_range *range);

/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu);
void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu);

/*
 * A counter is reported as deferred only on non-VHE systems, and only for
 * events whose attributes set exclude_host.
 */
static inline bool kvm_pmu_counter_deferred(struct perf_event_attr *attr)
{
        if (has_vhe())
                return false;

        return !!attr->exclude_host;
}

/*
 * PMU / tracing hooks.  With CONFIG_KVM they are real functions defined
 * elsewhere; without it they collapse to empty inline stubs, and
 * kvm_set_pmuserenr() always returns false.
 */
#ifdef CONFIG_KVM
void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr);
void kvm_clr_pmu_events(u64 clr);
bool kvm_set_pmuserenr(u64 val);
void kvm_enable_trbe(void);
void kvm_disable_trbe(void);
void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest);
#else
static inline void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr) {}
static inline void kvm_clr_pmu_events(u64 clr) {}
static inline bool kvm_set_pmuserenr(u64 val)
{
        return false;
}
static inline void kvm_enable_trbe(void) {}
static inline void kvm_disable_trbe(void) {}
static inline void kvm_tracing_set_el1_configuration(u64 trfcr_while_in_guest) {}
#endif

void kvm_vcpu_load_vhe(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_vhe(struct kvm_vcpu *vcpu);

int __init kvm_set_ipa_limit(void);
u32 kvm_get_pa_bits(struct kvm *kvm);

#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);

#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS

#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE

/* True when is_protected_kvm_enabled() holds and the VM has arch.pkvm.enabled set. */
#define kvm_vm_is_protected(kvm)        (is_protected_kvm_enabled() && (kvm)->arch.pkvm.enabled)

/* Same test, applied to the VM that owns the given vcpu. */
#define vcpu_is_protected(vcpu)                kvm_vm_is_protected((vcpu)->kvm)

int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature);
bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu);

/* Reads the VCPU_SVE_FINALIZED flag of the given vcpu. */
#define kvm_arm_vcpu_sve_finalized(vcpu) vcpu_get_flag(vcpu, VCPU_SVE_FINALIZED)

/* MTE needs both system support and KVM_ARCH_FLAG_MTE_ENABLED set on the VM. */
#define kvm_has_mte(kvm)                                        \
        (system_supports_mte() &&                                \
         test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &(kvm)->arch.flags))

/* 32-bit EL0 needs system support and the arm64_mismatched_32bit_el0 key off. */
#define kvm_supports_32bit_el0()                                \
        (system_supports_32bit_el0() &&                                \
         !static_branch_unlikely(&arm64_mismatched_32bit_el0))

/* Tests KVM_ARCH_FLAG_HAS_RAN_ONCE in the VM's arch flags. */
#define kvm_vm_has_ran_once(kvm)                                        \
        (test_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &(kvm)->arch.flags))

/* Report whether bit @feature is set in the VM-wide vcpu_features bitmap. */
static inline bool __vcpu_has_feature(const struct kvm_arch *ka, int feature)
{
        const bool present = test_bit(feature, ka->vcpu_features);

        return present;
}

/* Feature test starting from a struct kvm pointer (k) or from a vcpu pointer (v). */
#define kvm_vcpu_has_feature(k, f)        __vcpu_has_feature(&(k)->arch, (f))
#define vcpu_has_feature(v, f)        __vcpu_has_feature(&(v)->kvm->arch, (f))

/*
 * Reads the VCPU_INITIALIZED flag of vcpu @v.  The macro must expand its own
 * argument rather than whatever variable named 'vcpu' is in the caller's scope.
 */
#define kvm_vcpu_initialized(v) vcpu_get_flag((v), VCPU_INITIALIZED)

int kvm_trng_call(struct kvm_vcpu *vcpu);
#ifdef CONFIG_KVM
extern phys_addr_t hyp_mem_base;
extern phys_addr_t hyp_mem_size;
void __init kvm_hyp_reserve(void);
#else
static inline void kvm_hyp_reserve(void) { }
#endif

void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu);
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu);

/*
 * Map a system register encoding to the VM-wide storage slot holding its
 * value.  Encodings in the range sys_reg(3, 0, 0, 1, 0) .. sys_reg(3, 0, 0, 7, 7)
 * live in the id_regs[] array; CTR_EL0, MIDR_EL1, REVIDR_EL1 and AIDR_EL1
 * have dedicated fields.  Any other encoding warns once and yields NULL.
 */
static inline u64 *__vm_id_reg(struct kvm_arch *ka, u32 reg)
{
        if (reg >= sys_reg(3, 0, 0, 1, 0) && reg <= sys_reg(3, 0, 0, 7, 7))
                return &ka->id_regs[IDREG_IDX(reg)];

        if (reg == SYS_CTR_EL0)
                return &ka->ctr_el0;

        if (reg == SYS_MIDR_EL1)
                return &ka->midr_el1;

        if (reg == SYS_REVIDR_EL1)
                return &ka->revidr_el1;

        if (reg == SYS_AIDR_EL1)
                return &ka->aidr_el1;

        WARN_ON_ONCE(1);
        return NULL;
}

/*
 * Evaluates to the VM-wide value stored for ID register @reg.
 * The pointer returned by __vm_id_reg() is dereferenced unchecked, so @reg
 * must be an encoding that __vm_id_reg() handles.
 */
#define kvm_read_vm_id_reg(kvm, reg)                                        \
        ({ u64 __val = *__vm_id_reg(&(kvm)->arch, reg); __val; })

void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val);

/* Expand the named field value (id, fld, val) to a u64, unsigned flavour. */
#define __expand_field_sign_unsigned(id, fld, val)                        \
        ((u64)SYS_FIELD_VALUE(id, fld, val))

/* Same, but sign-extended from the field's width (id##_##fld##_WIDTH). */
#define __expand_field_sign_signed(id, fld, val)                        \
        ({                                                                \
                u64 __val = SYS_FIELD_VALUE(id, fld, val);                \
                sign_extend64(__val, id##_##fld##_WIDTH - 1);                \
        })

/* Extract field fld from the VM's copy of ID register SYS_<id>, zero-extended. */
#define get_idreg_field_unsigned(kvm, id, fld)                                \
        ({                                                                \
                u64 __val = kvm_read_vm_id_reg((kvm), SYS_##id);        \
                FIELD_GET(id##_##fld##_MASK, __val);                        \
        })

/* Same extraction, then sign-extended from the field's width. */
#define get_idreg_field_signed(kvm, id, fld)                                \
        ({                                                                \
                u64 __val = get_idreg_field_unsigned(kvm, id, fld);        \
                sign_extend64(__val, id##_##fld##_WIDTH - 1);                \
        })

/* Enum fields are read exactly like unsigned ones. */
#define get_idreg_field_enum(kvm, id, fld)                                \
        get_idreg_field_unsigned(kvm, id, fld)

/* Compare the VM's field value against the named limit using operator op. */
#define kvm_cmp_feat_signed(kvm, id, fld, op, limit)                        \
        (get_idreg_field_signed((kvm), id, fld) op __expand_field_sign_signed(id, fld, limit))

#define kvm_cmp_feat_unsigned(kvm, id, fld, op, limit)                        \
        (get_idreg_field_unsigned((kvm), id, fld) op __expand_field_sign_unsigned(id, fld, limit))

/* Pick the signed or unsigned comparison based on id##_##fld##_SIGNED. */
#define kvm_cmp_feat(kvm, id, fld, op, limit)                                \
        (id##_##fld##_SIGNED ?                                                \
         kvm_cmp_feat_signed(kvm, id, fld, op, limit) :                        \
         kvm_cmp_feat_unsigned(kvm, id, fld, op, limit))

/* Feature present: field value >= limit. */
#define __kvm_has_feat(kvm, id, fld, limit)                                \
        kvm_cmp_feat(kvm, id, fld, >=, limit)

#define kvm_has_feat(kvm, ...) __kvm_has_feat(kvm, __VA_ARGS__)

/* Enum match: field value == val, always compared as unsigned. */
#define __kvm_has_feat_enum(kvm, id, fld, val)                                \
        kvm_cmp_feat_unsigned(kvm, id, fld, ==, val)

#define kvm_has_feat_enum(kvm, ...) __kvm_has_feat_enum(kvm, __VA_ARGS__)

/* Range check: min <= field value <= max (both bounds inclusive). */
#define kvm_has_feat_range(kvm, id, fld, min, max)                        \
        (kvm_cmp_feat(kvm, id, fld, >=, min) &&                                \
        kvm_cmp_feat(kvm, id, fld, <=, max))

/*
 * Check for a given level of PAuth support.
 *
 * Each of the three (APA, GPA), (API, GPI) and (APA3, GPA3) pairs counts as
 * present when the first field is at least level l and the second is at
 * least IMP.  The macro is true only when exactly one pair is present.
 */
#define kvm_has_pauth(k, l)                                                \
        ({                                                                \
                bool pa, pi, pa3;                                        \
                                                                        \
                pa  = kvm_has_feat((k), ID_AA64ISAR1_EL1, APA, l);        \
                pa &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPA, IMP);        \
                pi  = kvm_has_feat((k), ID_AA64ISAR1_EL1, API, l);        \
                pi &= kvm_has_feat((k), ID_AA64ISAR1_EL1, GPI, IMP);        \
                pa3  = kvm_has_feat((k), ID_AA64ISAR2_EL1, APA3, l);        \
                pa3 &= kvm_has_feat((k), ID_AA64ISAR2_EL1, GPA3, IMP);        \
                                                                        \
                (pa + pi + pa3) == 1;                                        \
        })

/* FPMR: needs system support and ID_AA64PFR2_EL1.FPMR >= IMP for the VM. */
#define kvm_has_fpmr(k)                                        \
        (system_supports_fpmr() &&                        \
         kvm_has_feat((k), ID_AA64PFR2_EL1, FPMR, IMP))

/* TCR2: ID_AA64MMFR3_EL1.TCRX >= IMP for the VM. */
#define kvm_has_tcr2(k)                                \
        (kvm_has_feat((k), ID_AA64MMFR3_EL1, TCRX, IMP))

/* S1PIE: ID_AA64MMFR3_EL1.S1PIE >= IMP for the VM. */
#define kvm_has_s1pie(k)                                \
        (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1PIE, IMP))

/* S1POE: ID_AA64MMFR3_EL1.S1POE >= IMP for the VM. */
#define kvm_has_s1poe(k)                                \
        (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP))

/* Unconditionally reports IRQ bypass as available on this architecture. */
static inline bool kvm_arch_has_irq_bypass(void)
{
        return true;
}

void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt);
void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1);
void check_feature_map(void);


#endif /* __ARM64_KVM_HOST_H__ */















  157 














  157 






















  156 







































   21 













































    1 


































































  169 



















  209 






























  127 





















  166 



















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
/* SPDX-License-Identifier: GPL-2.0 */
#if !defined(_TRACE_ARM_ARM64_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ARM_ARM64_KVM_H

#include <asm/kvm_emulate.h>
#include <kvm/arm_arch_timer.h>
#include <linux/tracepoint.h>

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm

/*
 * Tracepoints for entry/exit to guest
 */
/* kvm_entry: records the vcpu PC supplied by the caller. */
TRACE_EVENT(kvm_entry,
        TP_PROTO(unsigned long vcpu_pc),
        TP_ARGS(vcpu_pc),

        TP_STRUCT__entry(
                __field(        unsigned long,        vcpu_pc                )
        ),

        TP_fast_assign(
                __entry->vcpu_pc                = vcpu_pc;
        ),

        TP_printk("PC: 0x%016lx", __entry->vcpu_pc)
);

/*
 * kvm_exit: records the exception code extracted from ret, the ESR exception
 * class (kept only when ret is a trap, otherwise stored as 0) and the vcpu PC.
 */
TRACE_EVENT(kvm_exit,
        TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
        TP_ARGS(ret, esr_ec, vcpu_pc),

        TP_STRUCT__entry(
                __field(        int,                ret                )
                __field(        unsigned int,        esr_ec                )
                __field(        unsigned long,        vcpu_pc                )
        ),

        TP_fast_assign(
                __entry->ret                        = ARM_EXCEPTION_CODE(ret);
                __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
                __entry->vcpu_pc                = vcpu_pc;
        ),

        TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%016lx",
                  __print_symbolic(__entry->ret, kvm_arm_exception_type),
                  __entry->esr_ec,
                  __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
                  __entry->vcpu_pc)
);

/* kvm_guest_fault: records the vcpu PC plus the hsr, hxfar and ipa values passed in. */
TRACE_EVENT(kvm_guest_fault,
        TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
                 unsigned long hxfar,
                 unsigned long long ipa),
        TP_ARGS(vcpu_pc, hsr, hxfar, ipa),

        TP_STRUCT__entry(
                __field(        unsigned long,        vcpu_pc                )
                __field(        unsigned long,        hsr                )
                __field(        unsigned long,        hxfar                )
                __field(   unsigned long long,        ipa                )
        ),

        TP_fast_assign(
                __entry->vcpu_pc                = vcpu_pc;
                __entry->hsr                        = hsr;
                __entry->hxfar                        = hxfar;
                __entry->ipa                        = ipa;
        ),

        TP_printk("ipa %#llx, hsr %#08lx, hxfar %#08lx, pc %#016lx",
                  __entry->ipa, __entry->hsr,
                  __entry->hxfar, __entry->vcpu_pc)
);

/* kvm_access_fault: records the IPA passed in and prints it in hex. */
TRACE_EVENT(kvm_access_fault,
        TP_PROTO(unsigned long ipa),
        TP_ARGS(ipa),

        TP_STRUCT__entry(
                __field(        unsigned long,        ipa                )
        ),

        TP_fast_assign(
                __entry->ipa                = ipa;
        ),

        TP_printk("IPA: %lx", __entry->ipa)
);

/*
 * kvm_irq_line: records an interrupt injection request.  The type is printed
 * both symbolically ("CPU", "VGIC PPI", "VGIC SPI", or "UNKNOWN" for any
 * other value) and numerically, followed by vcpu index, IRQ number and level.
 */
TRACE_EVENT(kvm_irq_line,
        TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
        TP_ARGS(type, vcpu_idx, irq_num, level),

        TP_STRUCT__entry(
                __field(        unsigned int,        type                )
                __field(        int,                vcpu_idx        )
                __field(        int,                irq_num                )
                __field(        int,                level                )
        ),

        TP_fast_assign(
                __entry->type                = type;
                __entry->vcpu_idx        = vcpu_idx;
                __entry->irq_num        = irq_num;
                __entry->level                = level;
        ),

        TP_printk("Inject %s interrupt (%d), vcpu->idx: %d, num: %d, level: %d",
                  (__entry->type == KVM_ARM_IRQ_TYPE_CPU) ? "CPU" :
                  (__entry->type == KVM_ARM_IRQ_TYPE_PPI) ? "VGIC PPI" :
                  (__entry->type == KVM_ARM_IRQ_TYPE_SPI) ? "VGIC SPI" : "UNKNOWN",
                  __entry->type, __entry->vcpu_idx, __entry->irq_num, __entry->level)
);

/* kvm_mmio_emulate: records the vcpu PC, the instruction value and the CPSR passed in. */
TRACE_EVENT(kvm_mmio_emulate,
        TP_PROTO(unsigned long vcpu_pc, unsigned long instr,
                 unsigned long cpsr),
        TP_ARGS(vcpu_pc, instr, cpsr),

        TP_STRUCT__entry(
                __field(        unsigned long,        vcpu_pc                )
                __field(        unsigned long,        instr                )
                __field(        unsigned long,        cpsr                )
        ),

        TP_fast_assign(
                __entry->vcpu_pc                = vcpu_pc;
                __entry->instr                        = instr;
                __entry->cpsr                        = cpsr;
        ),

        TP_printk("Emulate MMIO at: 0x%016lx (instr: %08lx, cpsr: %08lx)",
                  __entry->vcpu_pc, __entry->instr, __entry->cpsr)
);

/* kvm_mmio_nisv: records the vcpu PC together with the esr, far and ipa values passed in. */
TRACE_EVENT(kvm_mmio_nisv,
        TP_PROTO(unsigned long vcpu_pc, unsigned long esr,
                 unsigned long far, unsigned long ipa),
        TP_ARGS(vcpu_pc, esr, far, ipa),

        TP_STRUCT__entry(
                __field(        unsigned long,        vcpu_pc                )
                __field(        unsigned long,        esr                )
                __field(        unsigned long,        far                )
                __field(        unsigned long,        ipa                )
        ),

        TP_fast_assign(
                __entry->vcpu_pc                = vcpu_pc;
                __entry->esr                        = esr;
                __entry->far                        = far;
                __entry->ipa                        = ipa;
        ),

        TP_printk("ipa %#016lx, esr %#016lx, far %#016lx, pc %#016lx",
                  __entry->ipa, __entry->esr,
                  __entry->far, __entry->vcpu_pc)
);


/* kvm_set_way_flush: records the vcpu PC and a cache flag, printed as on/off. */
TRACE_EVENT(kvm_set_way_flush,
            TP_PROTO(unsigned long vcpu_pc, bool cache),
            TP_ARGS(vcpu_pc, cache),

            TP_STRUCT__entry(
                    __field(        unsigned long,        vcpu_pc                )
                    __field(        bool,                cache                )
            ),

            TP_fast_assign(
                    __entry->vcpu_pc                = vcpu_pc;
                    __entry->cache                = cache;
            ),

            TP_printk("S/W flush at 0x%016lx (cache %s)",
                      __entry->vcpu_pc, str_on_off(__entry->cache))
);

/* kvm_toggle_cache: records the vcpu PC and the previous/current cache flags (on/off). */
TRACE_EVENT(kvm_toggle_cache,
            TP_PROTO(unsigned long vcpu_pc, bool was, bool now),
            TP_ARGS(vcpu_pc, was, now),

            TP_STRUCT__entry(
                    __field(        unsigned long,        vcpu_pc                )
                    __field(        bool,                was                )
                    __field(        bool,                now                )
            ),

            TP_fast_assign(
                    __entry->vcpu_pc                = vcpu_pc;
                    __entry->was                = was;
                    __entry->now                = now;
            ),

            TP_printk("VM op at 0x%016lx (cache was %s, now %s)",
                      __entry->vcpu_pc, str_on_off(__entry->was),
                      str_on_off(__entry->now))
);

/*
 * Tracepoints for arch_timer
 */
/*
 * kvm_timer_update_irq: records the vcpu id, the IRQ number and the new level.
 * vcpu_id (unsigned long) and irq (__u32) are unsigned fields, so they are
 * printed with unsigned conversions.
 */
TRACE_EVENT(kvm_timer_update_irq,
        TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
        TP_ARGS(vcpu_id, irq, level),

        TP_STRUCT__entry(
                __field(        unsigned long,        vcpu_id        )
                __field(        __u32,                irq        )
                __field(        int,                level        )
        ),

        TP_fast_assign(
                __entry->vcpu_id        = vcpu_id;
                __entry->irq                = irq;
                __entry->level                = level;
        ),

        TP_printk("VCPU: %lu, IRQ %u, level %d",
                  __entry->vcpu_id, __entry->irq, __entry->level)
);

/*
 * kvm_get_timer_map: records, for one vcpu, the context index of each timer
 * in the map.  direct_vtimer is always dereferenced; the other three entries
 * may be NULL and are then recorded as -1.  vcpu_id is an unsigned long and
 * is printed with an unsigned conversion.
 */
TRACE_EVENT(kvm_get_timer_map,
        TP_PROTO(unsigned long vcpu_id, struct timer_map *map),
        TP_ARGS(vcpu_id, map),

        TP_STRUCT__entry(
                __field(        unsigned long,                vcpu_id        )
                __field(        int,                        direct_vtimer        )
                __field(        int,                        direct_ptimer        )
                __field(        int,                        emul_vtimer        )
                __field(        int,                        emul_ptimer        )
        ),

        TP_fast_assign(
                __entry->vcpu_id                = vcpu_id;
                __entry->direct_vtimer                = arch_timer_ctx_index(map->direct_vtimer);
                __entry->direct_ptimer =
                        (map->direct_ptimer) ? arch_timer_ctx_index(map->direct_ptimer) : -1;
                __entry->emul_vtimer =
                        (map->emul_vtimer) ? arch_timer_ctx_index(map->emul_vtimer) : -1;
                __entry->emul_ptimer =
                        (map->emul_ptimer) ? arch_timer_ctx_index(map->emul_ptimer) : -1;
        ),

        TP_printk("VCPU: %lu, dv: %d, dp: %d, ev: %d, ep: %d",
                  __entry->vcpu_id,
                  __entry->direct_vtimer,
                  __entry->direct_ptimer,
                  __entry->emul_vtimer,
                  __entry->emul_ptimer)
);

/*
 * kvm_timer_save_state - trace the state of a timer context being saved.
 * @ctx: timer context to describe
 *
 * Records timer_get_ctl(ctx), timer_get_cval(ctx) and
 * arch_timer_ctx_index(ctx).
 *
 * NOTE(review): the format string starts with three spaces.  Presumably
 * this lines the output up with kvm_timer_restore_state, whose event name
 * is three characters longer - confirm before "cleaning it up".
 */
TRACE_EVENT(kvm_timer_save_state,
        TP_PROTO(struct arch_timer_context *ctx),
        TP_ARGS(ctx),

        TP_STRUCT__entry(
                __field(        unsigned long,                ctl                )
                __field(        unsigned long long,        cval                )
                __field(        int,                        timer_idx        )
        ),

        TP_fast_assign(
                __entry->ctl                        = timer_get_ctl(ctx);
                __entry->cval                        = timer_get_cval(ctx);
                __entry->timer_idx                = arch_timer_ctx_index(ctx);
        ),

        TP_printk("   CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
                  __entry->ctl,
                  __entry->cval,
                  __entry->timer_idx)
);

/*
 * kvm_timer_restore_state - trace the state of a timer context being
 * restored.
 * @ctx: timer context to describe
 *
 * Records the same three values as kvm_timer_save_state:
 * timer_get_ctl(ctx), timer_get_cval(ctx) and arch_timer_ctx_index(ctx).
 */
TRACE_EVENT(kvm_timer_restore_state,
        TP_PROTO(struct arch_timer_context *ctx),
        TP_ARGS(ctx),

        TP_STRUCT__entry(
                __field(        unsigned long,                ctl                )
                __field(        unsigned long long,        cval                )
                __field(        int,                        timer_idx        )
        ),

        TP_fast_assign(
                __entry->ctl                        = timer_get_ctl(ctx);
                __entry->cval                        = timer_get_cval(ctx);
                __entry->timer_idx                = arch_timer_ctx_index(ctx);
        ),

        TP_printk("CTL: %#08lx CVAL: %#16llx arch_timer_ctx_index: %d",
                  __entry->ctl,
                  __entry->cval,
                  __entry->timer_idx)
);

/*
 * kvm_timer_hrtimer_expire - trace an hrtimer expiry for a timer context.
 * @ctx: timer context to describe
 *
 * Only arch_timer_ctx_index(ctx) is recorded.
 */
TRACE_EVENT(kvm_timer_hrtimer_expire,
        TP_PROTO(struct arch_timer_context *ctx),
        TP_ARGS(ctx),

        TP_STRUCT__entry(
                __field(        int,                        timer_idx        )
        ),

        TP_fast_assign(
                __entry->timer_idx                = arch_timer_ctx_index(ctx);
        ),

        TP_printk("arch_timer_ctx_index: %d", __entry->timer_idx)
);

/*
 * kvm_timer_emulate - trace the emulation of a timer context.
 * @ctx:         timer context to describe
 * @should_fire: value supplied by the caller, recorded verbatim
 *
 * Records arch_timer_ctx_index(ctx) and @should_fire; the boolean is
 * printed as an integer (0 or 1).
 */
TRACE_EVENT(kvm_timer_emulate,
        TP_PROTO(struct arch_timer_context *ctx, bool should_fire),
        TP_ARGS(ctx, should_fire),

        TP_STRUCT__entry(
                __field(        int,                        timer_idx        )
                __field(        bool,                        should_fire        )
        ),

        TP_fast_assign(
                __entry->timer_idx                = arch_timer_ctx_index(ctx);
                __entry->should_fire                = should_fire;
        ),

        TP_printk("arch_timer_ctx_index: %d (should_fire: %d)",
                  __entry->timer_idx, __entry->should_fire)
);

/*
 * kvm_nested_eret - trace an ERET event for a vCPU.
 * @vcpu:     vCPU the event is reported for
 * @elr_el2:  ELR_EL2 value supplied by the caller
 * @spsr_el2: SPSR_EL2 value supplied by the caller
 *
 * Besides the two register values, the entry stores:
 *  - target_mode: @spsr_el2 masked with (PSR_MODE_MASK | PSR_MODE32_BIT),
 *    printed symbolically through kvm_mode_names;
 *  - hcr_el2: the vCPU's HCR_EL2 value read with __vcpu_sys_reg().
 *
 * The @vcpu pointer itself is stored in the entry but is not part of the
 * printed output.
 */
TRACE_EVENT(kvm_nested_eret,
        TP_PROTO(struct kvm_vcpu *vcpu, unsigned long elr_el2,
                 unsigned long spsr_el2),
        TP_ARGS(vcpu, elr_el2, spsr_el2),

        TP_STRUCT__entry(
                __field(struct kvm_vcpu *,        vcpu)
                __field(unsigned long,                elr_el2)
                __field(unsigned long,                spsr_el2)
                __field(unsigned long,                target_mode)
                __field(unsigned long,                hcr_el2)
        ),

        TP_fast_assign(
                __entry->vcpu = vcpu;
                __entry->elr_el2 = elr_el2;
                __entry->spsr_el2 = spsr_el2;
                __entry->target_mode = spsr_el2 & (PSR_MODE_MASK | PSR_MODE32_BIT);
                __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
        ),

        TP_printk("elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx",
                  __entry->elr_el2, __entry->spsr_el2,
                  __print_symbolic(__entry->target_mode, kvm_mode_names),
                  __entry->hcr_el2)
);

/*
 * kvm_inject_nested_exception - trace the injection of an exception into a
 * vCPU.
 * @vcpu:    vCPU the exception is injected into
 * @esr_el2: ESR_EL2 value supplied by the caller
 * @type:    exception type, printed symbolically via
 *           kvm_exception_type_names
 *
 * Values sampled from the vCPU at trace time:
 *  - spsr_el2:    *vcpu_cpsr(vcpu)
 *  - pc:          *vcpu_pc(vcpu); note that it is printed under the
 *                 "elr_el2" label
 *  - source_mode: *vcpu_cpsr(vcpu) masked with
 *                 (PSR_MODE_MASK | PSR_MODE32_BIT), printed symbolically
 *                 via kvm_mode_names
 *  - hcr_el2:     HCR_EL2 read with __vcpu_sys_reg()
 *
 * The @vcpu pointer is stored in the entry but is not printed.
 */
TRACE_EVENT(kvm_inject_nested_exception,
        TP_PROTO(struct kvm_vcpu *vcpu, u64 esr_el2, int type),
        TP_ARGS(vcpu, esr_el2, type),

        TP_STRUCT__entry(
                __field(struct kvm_vcpu *,                vcpu)
                __field(unsigned long,                        esr_el2)
                __field(int,                                type)
                __field(unsigned long,                        spsr_el2)
                __field(unsigned long,                        pc)
                __field(unsigned long,                        source_mode)
                __field(unsigned long,                        hcr_el2)
        ),

        TP_fast_assign(
                __entry->vcpu = vcpu;
                __entry->esr_el2 = esr_el2;
                __entry->type = type;
                __entry->spsr_el2 = *vcpu_cpsr(vcpu);
                __entry->pc = *vcpu_pc(vcpu);
                __entry->source_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);
                __entry->hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);
        ),

        TP_printk("%s: esr_el2 0x%lx elr_el2: 0x%lx spsr_el2: 0x%08lx (M: %s) hcr_el2: %lx",
                  __print_symbolic(__entry->type, kvm_exception_type_names),
                  __entry->esr_el2, __entry->pc, __entry->spsr_el2,
                  __print_symbolic(__entry->source_mode, kvm_mode_names),
                  __entry->hcr_el2)
);

/*
 * kvm_forward_sysreg_trap - trace a forwarded system register trap.
 * @vcpu:    vCPU that took the trap; only its PC (*vcpu_pc(vcpu)) is recorded
 * @sysreg:  encoded system register identifier
 * @is_read: true for a read access, false for a write
 *
 * Output is "<pc> <R|W> (Op0,Op1,CRn,CRm,Op2)", where the five numbers are
 * decoded from @sysreg at print time with the sys_reg_*() helpers.
 */
TRACE_EVENT(kvm_forward_sysreg_trap,
            TP_PROTO(struct kvm_vcpu *vcpu, u32 sysreg, bool is_read),
            TP_ARGS(vcpu, sysreg, is_read),

            TP_STRUCT__entry(
                __field(u64,        pc)
                __field(u32,        sysreg)
                __field(bool,        is_read)
            ),

            TP_fast_assign(
                __entry->pc = *vcpu_pc(vcpu);
                __entry->sysreg = sysreg;
                __entry->is_read = is_read;
            ),

            TP_printk("%llx %c (%d,%d,%d,%d,%d)",
                      __entry->pc,
                      __entry->is_read ? 'R' : 'W',
                      sys_reg_Op0(__entry->sysreg),
                      sys_reg_Op1(__entry->sysreg),
                      sys_reg_CRn(__entry->sysreg),
                      sys_reg_CRm(__entry->sysreg),
                      sys_reg_Op2(__entry->sysreg))
);

#endif /* _TRACE_ARM_ARM64_KVM_H */

#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace_arm

/* This part must be outside protection */
#include <trace/define_trace.h>



















































































































































































































































































































































































  189 



































































































































































































































































































































  564 








































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Filesystem access notification for Linux
 *
 *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#ifndef __LINUX_FSNOTIFY_BACKEND_H
#define __LINUX_FSNOTIFY_BACKEND_H

#ifdef __KERNEL__

#include <linux/idr.h> /* inotify uses this */
#include <linux/fs.h> /* struct inode */
#include <linux/list.h>
#include <linux/path.h> /* struct path */
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/sched/mm.h>

/*
 * IN_* from inotify.h lines up EXACTLY with FS_*, this is so we can easily
 * convert between them.  dnotify only needs conversion at watch creation
 * so no perf loss there.  fanotify isn't defined yet, so it can use the
 * holes if it needs more events.
 */
#define FS_ACCESS                0x00000001        /* File was accessed */
#define FS_MODIFY                0x00000002        /* File was modified */
#define FS_ATTRIB                0x00000004        /* Metadata changed */
#define FS_CLOSE_WRITE                0x00000008        /* Writable file was closed */
#define FS_CLOSE_NOWRITE        0x00000010        /* Unwritable file closed */
#define FS_OPEN                        0x00000020        /* File was opened */
#define FS_MOVED_FROM                0x00000040        /* File was moved from X */
#define FS_MOVED_TO                0x00000080        /* File was moved to Y */
#define FS_CREATE                0x00000100        /* Subfile was created */
#define FS_DELETE                0x00000200        /* Subfile was deleted */
#define FS_DELETE_SELF                0x00000400        /* Self was deleted */
#define FS_MOVE_SELF                0x00000800        /* Self was moved */
#define FS_OPEN_EXEC                0x00001000        /* File was opened for exec */

#define FS_UNMOUNT                0x00002000        /* inode on umount fs */
#define FS_Q_OVERFLOW                0x00004000        /* Event queue overflowed */
#define FS_ERROR                0x00008000        /* Filesystem Error (fanotify) */

/*
 * FS_IN_IGNORED overloads FS_ERROR.  It is only used internally by inotify
 * which does not support FS_ERROR.
 */
#define FS_IN_IGNORED                0x00008000        /* last inotify event here */

#define FS_OPEN_PERM                0x00010000        /* open event in a permission hook */
#define FS_ACCESS_PERM                0x00020000        /* access event in a permission hook */
#define FS_OPEN_EXEC_PERM        0x00040000        /* open/exec event in a permission hook */
/* #define FS_DIR_MODIFY        0x00080000 */        /* Deprecated (reserved) */

#define FS_PRE_ACCESS                0x00100000        /* Pre-content access hook */

#define FS_MNT_ATTACH                0x01000000        /* Mount was attached */
#define FS_MNT_DETACH                0x02000000        /* Mount was detached */
/* A move is reported with both the attach and the detach bit set */
#define FS_MNT_MOVE                (FS_MNT_ATTACH | FS_MNT_DETACH)

/*
 * Set on inode mark that cares about things that happen to its children.
 * Always set for dnotify and inotify.
 * Set on inode/sb/mount marks that care about parent/name info.
 */
#define FS_EVENT_ON_CHILD        0x08000000

#define FS_RENAME                0x10000000        /* File was renamed */
#define FS_DN_MULTISHOT                0x20000000        /* dnotify multishot */
#define FS_ISDIR                0x40000000        /* event occurred against dir */

#define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)

/*
 * Directory entry modification events - reported only to directory
 * where entry is modified and not to a watching parent.
 * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
 * when a directory entry inside a child subdir changes.
 */
#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)

/* Mount namespace events */
#define FSNOTIFY_MNT_EVENTS (FS_MNT_ATTACH | FS_MNT_DETACH)

/* Content events can be used to inspect file content */
#define FSNOTIFY_CONTENT_PERM_EVENTS (FS_OPEN_PERM | FS_OPEN_EXEC_PERM | \
                                      FS_ACCESS_PERM)
/* Pre-content events can be used to fill file content */
#define FSNOTIFY_PRE_CONTENT_EVENTS  (FS_PRE_ACCESS)

/* Every permission event: the content events plus the pre-content events */
#define ALL_FSNOTIFY_PERM_EVENTS (FSNOTIFY_CONTENT_PERM_EVENTS | \
                                  FSNOTIFY_PRE_CONTENT_EVENTS)

/*
 * This is a list of all events that may get sent to a parent that is watching
 * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory.
 */
#define FS_EVENTS_POSS_ON_CHILD   (ALL_FSNOTIFY_PERM_EVENTS | \
                                   FS_ACCESS | FS_MODIFY | FS_ATTRIB | \
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \
                                   FS_OPEN | FS_OPEN_EXEC)

/*
 * This is a list of all events that may get sent with the parent inode as the
 * @to_tell argument of fsnotify().
 * It may include events that can be sent to an inode/sb/mount mark, but cannot
 * be sent to a parent watching children.
 * It is currently identical to FS_EVENTS_POSS_ON_CHILD.
 */
#define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD)

/*
 * Events that can be reported to backends.
 * FS_IN_IGNORED and FS_ERROR share the same bit value, so listing both here
 * contributes a single bit to the mask.
 */
#define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
                             FSNOTIFY_MNT_EVENTS | \
                             FS_EVENTS_POSS_ON_CHILD | \
                             FS_DELETE_SELF | FS_MOVE_SELF | \
                             FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
                             FS_ERROR)

/* Extra flags that may be reported with event or control handling of events */
#define ALL_FSNOTIFY_FLAGS  (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT)

/* Every bit with a defined meaning: all events plus all extra flags */
#define ALL_FSNOTIFY_BITS   (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)

struct fsnotify_group;
struct fsnotify_event;
struct fsnotify_mark;
struct fsnotify_event_private_data;
struct fsnotify_fname;
struct fsnotify_iter_info;

struct mem_cgroup;

/*
 * Each group must define these ops.  The fsnotify infrastructure will call
 * these operations for each relevant group.
 *
 * handle_event - main call for a group to handle an fs event
 * @group:        group to notify
 * @mask:        event type and flags
 * @data:        object that event happened on
 * @data_type:        type of object for fsnotify_data_XXX() accessors
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 * @iter_info:        array of marks from this group that are interested in the event
 *
 * handle_inode_event - simple variant of handle_event() for groups that only
 *                have inode marks and don't have ignore mask
 * @mark:        mark to notify
 * @mask:        event type and flags
 * @inode:        inode that event happened on
 * @dir:        optional directory associated with event -
 *                if @file_name is not NULL, this is the directory that
 *                @file_name is relative to.
 *                Either @inode or @dir must be non-NULL.
 * @file_name:        optional file name associated with event
 * @cookie:        inotify rename cookie
 *
 * free_group_priv - called when a group refcnt hits 0 to clean up the private union
 * freeing_mark - called when a mark is being destroyed for some reason.  The group
 *                MUST be holding a reference on each mark and that reference must be
 *                dropped in this function.  inotify uses this function to send
 *                userspace messages that marks have been removed.
 */
struct fsnotify_ops {
        /* main event callback; receives the raw event data and its type */
        int (*handle_event)(struct fsnotify_group *group, u32 mask,
                            const void *data, int data_type, struct inode *dir,
                            const struct qstr *file_name, u32 cookie,
                            struct fsnotify_iter_info *iter_info);
        /* simpler per-mark callback that is handed the inode directly */
        int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask,
                            struct inode *inode, struct inode *dir,
                            const struct qstr *file_name, u32 cookie);
        /* called when a group refcnt hits 0 to clean up the private union */
        void (*free_group_priv)(struct fsnotify_group *group);
        /* called when a mark is being destroyed */
        void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
        /* presumably releases an event belonging to @group - confirm with callers */
        void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
        /* called on final put+free to free memory */
        void (*free_mark)(struct fsnotify_mark *mark);
};

/*
 * all of the information about the original object we want to now send to
 * a group.  If you want to carry more info from the accessing task to the
 * listener this structure is where you need to be adding fields.
 */
struct fsnotify_event {
        /* list linkage; presumably onto a group's notification_list - confirm */
        struct list_head list;
};

/*
 * fsnotify group priorities.
 * Events are sent in order from highest priority to lowest priority.
 */
enum fsnotify_group_prio {
        FSNOTIFY_PRIO_NORMAL = 0,        /* normal notifiers, no permissions */
        FSNOTIFY_PRIO_CONTENT,                /* fanotify permission events */
        FSNOTIFY_PRIO_PRE_CONTENT,        /* fanotify pre-content events */
        __FSNOTIFY_PRIO_NUM                /* number of priorities; not a valid priority itself */
};

/*
 * A group is a "thing" that wants to receive notification about filesystem
 * events.  The mask holds the subset of event types this group cares about.
 * refcnt on a group is up to the implementor and at any moment if it goes 0
 * everything will be cleaned up.
 */
struct fsnotify_group {
        const struct fsnotify_ops *ops;        /* how this group handles things */

        /*
         * How the refcnt is used is up to each group.  When the refcnt hits 0
         * fsnotify will clean up all of the resources associated with this group.
         * As an example, the dnotify group will always have a refcnt=1 and that
         * will never change.  Inotify, on the other hand, has a group per
         * inotify_init() and the refcnt will hit 0 only when that fd has been
         * closed.
         */
        refcount_t refcnt;                /* things with interest in this group */

        /* needed to send notification to userspace */
        spinlock_t notification_lock;                /* protect the notification_list */
        struct list_head notification_list;        /* list of event_holder this group needs to send to userspace */
        wait_queue_head_t notification_waitq;        /* read() on the notification file blocks on this waitq */
        unsigned int q_len;                        /* events on the queue */
        unsigned int max_events;                /* maximum events allowed on the list */
        enum fsnotify_group_prio priority;        /* priority for sending events */
        bool shutdown;                /* group is being shut down, don't queue more events */

#define FSNOTIFY_GROUP_USER        0x01 /* user allocated group */
#define FSNOTIFY_GROUP_DUPS        0x02 /* allow multiple marks per object */
        int flags;                        /* FSNOTIFY_GROUP_* bits defined just above */
        unsigned int owner_flags;        /* memalloc_nofs_save() cookie of the mark_mutex owner,
                                         * written by fsnotify_group_lock() */

        /*
         * Stores all fastpath marks associated with this group so they can be
         * cleaned up on unregister.
         */
        struct mutex mark_mutex;        /* protect marks_list */
        atomic_t user_waits;                /* Number of tasks waiting for user
                                         * response */
        struct list_head marks_list;        /* all inode marks for this group */

        struct fasync_struct *fsn_fa;    /* async notification */

        struct fsnotify_event *overflow_event;        /* Event we queue when the
                                                 * notification list is too
                                                 * full */

        struct mem_cgroup *memcg;        /* memcg to charge allocations */

        /* groups can define private fields here or use the void *private */
        union {
                void *private;
#ifdef CONFIG_INOTIFY_USER
                struct inotify_group_private_data {
                        spinlock_t        idr_lock;
                        struct idr      idr;
                        struct ucounts *ucounts;
                } inotify_data;
#endif /* CONFIG_INOTIFY_USER */
#ifdef CONFIG_FANOTIFY
                struct fanotify_group_private_data {
                        /* Hash table of events for merge */
                        struct hlist_head *merge_hash;
                        /* allows a group to block waiting for a userspace response */
                        struct list_head access_list;
                        wait_queue_head_t access_waitq;
                        int flags;           /* flags from fanotify_init() */
                        int f_flags; /* event_f_flags from fanotify_init() */
                        struct ucounts *ucounts;
                        mempool_t error_events_pool;
                } fanotify_data;
#endif /* CONFIG_FANOTIFY */
        };
};

/*
 * These helpers are used to prevent deadlock when reclaiming inodes with
 * evictable marks of the same group that is allocating a new mark.
 */
/*
 * fsnotify_group_lock - take the group's mark_mutex.
 * @group: group to lock
 *
 * After the mutex is held, the value returned by memalloc_nofs_save() is
 * remembered in group->owner_flags so that fsnotify_group_unlock() can hand
 * it back to memalloc_nofs_restore().
 */
static inline void fsnotify_group_lock(struct fsnotify_group *group)
{
        unsigned int nofs_cookie;

        mutex_lock(&group->mark_mutex);
        nofs_cookie = memalloc_nofs_save();
        group->owner_flags = nofs_cookie;
}

/*
 * fsnotify_group_unlock - release the group's mark_mutex.
 * @group: group to unlock
 *
 * The cookie kept in group->owner_flags is passed to
 * memalloc_nofs_restore() before the mutex is dropped.
 */
static inline void fsnotify_group_unlock(struct fsnotify_group *group)
{
        const unsigned int nofs_cookie = group->owner_flags;

        memalloc_nofs_restore(nofs_cookie);
        mutex_unlock(&group->mark_mutex);
}

/*
 * fsnotify_group_assert_locked - sanity-check the locking state of a group.
 * @group: group expected to be locked
 *
 * Warns (once per check) when the group's mark_mutex is not locked, and
 * when the current task does not have PF_MEMALLOC_NOFS set.
 */
static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
{
        WARN_ON_ONCE(mutex_is_locked(&group->mark_mutex) == 0);
        WARN_ON_ONCE((current->flags & PF_MEMALLOC_NOFS) == 0);
}

/* When calling fsnotify tell it if the data is a path or inode */
enum fsnotify_data_type {
        FSNOTIFY_EVENT_NONE,
        FSNOTIFY_EVENT_FILE_RANGE,        /* data is a struct file_range */
        FSNOTIFY_EVENT_PATH,                /* data is a struct path */
        FSNOTIFY_EVENT_INODE,                /* data is a struct inode */
        FSNOTIFY_EVENT_DENTRY,                /* data is a struct dentry */
        FSNOTIFY_EVENT_MNT,                /* data is a struct fsnotify_mnt */
        FSNOTIFY_EVENT_ERROR,                /* data is a struct fs_error_report */
};

/* Payload carried by FSNOTIFY_EVENT_ERROR events. */
struct fs_error_report {
        int error;                /* error value; presumably an errno - confirm with reporters */
        struct inode *inode;        /* returned by fsnotify_data_inode() for error events */
        struct super_block *sb;        /* returned by fsnotify_data_sb() for error events */
};

/* Payload carried by FSNOTIFY_EVENT_FILE_RANGE events. */
struct file_range {
        const struct path *path;        /* path of the file the range belongs to */
        loff_t pos;                        /* presumably the start offset of the range - confirm */
        size_t count;                        /* presumably the length of the range in bytes - confirm */
};

/*
 * file_range_path - accessor for the path stored in a file range.
 * @range: file range to query; must not be NULL
 *
 * Returns range->path unchanged.
 */
static inline const struct path *file_range_path(const struct file_range *range)
{
        const struct path *range_path = range->path;

        return range_path;
}

/* Payload carried by FSNOTIFY_EVENT_MNT events. */
struct fsnotify_mnt {
        const struct mnt_namespace *ns;        /* mount namespace associated with the event */
        u64 mnt_id;                        /* mount id; returned by fsnotify_data_mnt_id() */
};

/*
 * fsnotify_data_inode - resolve the inode an event payload refers to.
 * @data:      event payload; its real type is selected by @data_type
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns the payload itself for inode events, the inode behind the dentry
 * for dentry, path and file-range events, the ->inode member of the report
 * for error events, and NULL for any other data type.
 */
static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE)
                return (struct inode *)data;

        if (data_type == FSNOTIFY_EVENT_DENTRY)
                return d_inode(data);

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = data;

                return d_inode(path->dentry);
        }

        if (data_type == FSNOTIFY_EVENT_FILE_RANGE)
                return d_inode(file_range_path(data)->dentry);

        if (data_type == FSNOTIFY_EVENT_ERROR) {
                struct fs_error_report *report = (struct fs_error_report *)data;

                return report->inode;
        }

        return NULL;
}

/*
 * fsnotify_data_dentry - resolve the dentry an event payload refers to.
 * @data:      event payload; its real type is selected by @data_type
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns the payload itself for dentry events, the ->dentry of the path
 * for path and file-range events, and NULL for any other data type.
 */
static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
{
        /* The const qualifier is dropped on purpose: dget() needs non-const */
        if (data_type == FSNOTIFY_EVENT_DENTRY)
                return (struct dentry *)data;

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = data;

                return path->dentry;
        }

        if (data_type == FSNOTIFY_EVENT_FILE_RANGE)
                return file_range_path(data)->dentry;

        return NULL;
}

/*
 * fsnotify_data_path - resolve the path an event payload refers to.
 * @data:      event payload; its real type is selected by @data_type
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns the payload itself for path events, the path stored in the range
 * for file-range events, and NULL for any other data type.
 */
static inline const struct path *fsnotify_data_path(const void *data,
                                                    int data_type)
{
        if (data_type == FSNOTIFY_EVENT_PATH)
                return data;

        if (data_type == FSNOTIFY_EVENT_FILE_RANGE)
                return file_range_path(data);

        return NULL;
}

/*
 * fsnotify_data_sb - resolve the super block an event payload refers to.
 * @data:      event payload; its real type is selected by @data_type
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns ->i_sb for inode events, ->d_sb of the (path's) dentry for
 * dentry, path and file-range events, ->sb of the report for error events,
 * and NULL for any other data type.
 */
static inline struct super_block *fsnotify_data_sb(const void *data,
                                                   int data_type)
{
        if (data_type == FSNOTIFY_EVENT_INODE) {
                struct inode *inode = (struct inode *)data;

                return inode->i_sb;
        }

        if (data_type == FSNOTIFY_EVENT_DENTRY) {
                struct dentry *dentry = (struct dentry *)data;

                return dentry->d_sb;
        }

        if (data_type == FSNOTIFY_EVENT_PATH) {
                const struct path *path = data;

                return path->dentry->d_sb;
        }

        if (data_type == FSNOTIFY_EVENT_FILE_RANGE)
                return file_range_path(data)->dentry->d_sb;

        if (data_type == FSNOTIFY_EVENT_ERROR) {
                struct fs_error_report *report = (struct fs_error_report *)data;

                return report->sb;
        }

        return NULL;
}

/*
 * fsnotify_data_mnt - view an event payload as mount event data.
 * @data:      event payload
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns @data when @data_type is FSNOTIFY_EVENT_MNT, NULL otherwise.
 */
static inline const struct fsnotify_mnt *fsnotify_data_mnt(const void *data,
                                                           int data_type)
{
        if (data_type != FSNOTIFY_EVENT_MNT)
                return NULL;

        return data;
}

/*
 * fsnotify_data_mnt_id - extract the mount id from an event payload.
 * @data:      event payload
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns the ->mnt_id of the payload for mount events and 0 when the
 * payload is not mount event data.
 */
static inline u64 fsnotify_data_mnt_id(const void *data, int data_type)
{
        const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type);

        if (!mnt_data)
                return 0;

        return mnt_data->mnt_id;
}

/*
 * fsnotify_data_error_report - view an event payload as an error report.
 * @data:      event payload
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns @data (with const dropped) when @data_type is
 * FSNOTIFY_EVENT_ERROR, NULL otherwise.
 */
static inline struct fs_error_report *fsnotify_data_error_report(
                                                        const void *data,
                                                        int data_type)
{
        if (data_type != FSNOTIFY_EVENT_ERROR)
                return NULL;

        return (struct fs_error_report *)data;
}

/*
 * fsnotify_data_file_range - view an event payload as a file range.
 * @data:      event payload
 * @data_type: one of the FSNOTIFY_EVENT_* values
 *
 * Returns @data when @data_type is FSNOTIFY_EVENT_FILE_RANGE, NULL
 * otherwise.
 */
static inline const struct file_range *fsnotify_data_file_range(
                                                        const void *data,
                                                        int data_type)
{
        const struct file_range *range = data;

        if (data_type != FSNOTIFY_EVENT_FILE_RANGE)
                return NULL;

        return range;
}

/*
 * Index to merged marks iterator array that correlates to a type of watch.
 * The type of watched object can be deduced from the iterator type, but not
 * the other way around, because an event can match different watched objects
 * of the same object type.
 * For example, both parent and child are watching an object of type inode.
 */
enum fsnotify_iter_type {
        FSNOTIFY_ITER_TYPE_INODE,
        FSNOTIFY_ITER_TYPE_VFSMOUNT,
        FSNOTIFY_ITER_TYPE_SB,
        FSNOTIFY_ITER_TYPE_PARENT,
        FSNOTIFY_ITER_TYPE_INODE2,
        FSNOTIFY_ITER_TYPE_MNTNS,
        /* number of iterator types; sizes fsnotify_iter_info::marks[] */
        FSNOTIFY_ITER_TYPE_COUNT
};

/* The type of object that a mark is attached to */
enum fsnotify_obj_type {
        FSNOTIFY_OBJ_TYPE_ANY = -1,        /* wildcard; negative, so never a valid index */
        FSNOTIFY_OBJ_TYPE_INODE,
        FSNOTIFY_OBJ_TYPE_VFSMOUNT,
        FSNOTIFY_OBJ_TYPE_SB,
        FSNOTIFY_OBJ_TYPE_MNTNS,
        FSNOTIFY_OBJ_TYPE_COUNT,        /* number of real object types */
        /* shares the value of _COUNT, so fsnotify_valid_obj_type() rejects it */
        FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
};

/*
 * fsnotify_valid_obj_type - check that @obj_type names a real object type.
 * @obj_type: value to check, taken as unsigned
 *
 * Returns true when @obj_type is below FSNOTIFY_OBJ_TYPE_COUNT.  Because the
 * comparison is unsigned, negative enum values such as
 * FSNOTIFY_OBJ_TYPE_ANY are rejected as well.
 */
static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
{
        const unsigned int nr_obj_types = FSNOTIFY_OBJ_TYPE_COUNT;

        return obj_type < nr_obj_types;
}

struct fsnotify_iter_info {
        /* one mark slot per iterator type, indexed by enum fsnotify_iter_type */
        struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
        struct fsnotify_group *current_group;
        /* bit (1U << iter_type) is set for each slot that should be reported */
        unsigned int report_mask;
        /* presumably the index returned by srcu_read_lock() - confirm */
        int srcu_idx;
};

/*
 * fsnotify_iter_should_report_type - test a slot's bit in the report mask.
 * @iter_info: iterator state to query
 * @iter_type: slot index (enum fsnotify_iter_type)
 *
 * Returns true when bit @iter_type is set in iter_info->report_mask.
 */
static inline bool fsnotify_iter_should_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        return (iter_info->report_mask & type_bit) != 0;
}

/*
 * fsnotify_iter_set_report_type - mark a slot as one that should be reported.
 * @iter_info: iterator state to update
 * @iter_type: slot index (enum fsnotify_iter_type)
 *
 * Sets bit @iter_type in iter_info->report_mask; other bits are untouched.
 */
static inline void fsnotify_iter_set_report_type(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        const unsigned int type_bit = 1U << iter_type;

        iter_info->report_mask = iter_info->report_mask | type_bit;
}

/*
 * fsnotify_iter_mark - fetch the mark of a slot if it should be reported.
 * @iter_info: iterator state to query
 * @iter_type: slot index (enum fsnotify_iter_type)
 *
 * Returns iter_info->marks[@iter_type] when the slot's bit is set in the
 * report mask, NULL otherwise.  The returned mark may itself be NULL.
 */
static inline struct fsnotify_mark *fsnotify_iter_mark(
                struct fsnotify_iter_info *iter_info, int iter_type)
{
        if (!fsnotify_iter_should_report_type(iter_info, iter_type))
                return NULL;

        return iter_info->marks[iter_type];
}

/*
 * Starting at @type, advance to the first iterator type that has a reportable
 * mark and store that mark in @markp. Returns the type found, or
 * FSNOTIFY_ITER_TYPE_COUNT (or the unchanged @type if it was already past the
 * end) when none is left. @markp is only written for types that were examined.
 */
static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type,
                                     struct fsnotify_mark **markp)
{
        for (; type < FSNOTIFY_ITER_TYPE_COUNT; type++) {
                struct fsnotify_mark *found = fsnotify_iter_mark(iter, type);

                *markp = found;
                if (found)
                        return type;
        }
        return type;
}

/*
 * Generate fsnotify_iter_<name>_mark(), a typed shorthand for
 * fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_<NAME>).
 */
#define FSNOTIFY_ITER_FUNCS(name, NAME) \
static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
                struct fsnotify_iter_info *iter_info) \
{ \
        return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \
}

/* Accessors are generated for the inode, parent, vfsmount and sb types only */
FSNOTIFY_ITER_FUNCS(inode, INODE)
FSNOTIFY_ITER_FUNCS(parent, PARENT)
FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
FSNOTIFY_ITER_FUNCS(sb, SB)

/* Iterate @type over every iterator type, 0 .. FSNOTIFY_ITER_TYPE_COUNT - 1 */
#define fsnotify_foreach_iter_type(type) \
        for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
/*
 * Iterate only over iterator types that have a reportable mark in @iter:
 * each pass uses fsnotify_iter_step() to skip forward and load @mark.
 */
#define fsnotify_foreach_iter_mark_type(iter, mark, type) \
        for (type = 0; \
             type = fsnotify_iter_step(iter, type, &mark), \
             type < FSNOTIFY_ITER_TYPE_COUNT; \
             type++)

/*
 * Inode/vfsmount/sb point to this structure which tracks all marks attached to
 * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this
 * structure. We destroy this structure when there are no more marks attached
 * to it. The structure is protected by fsnotify_mark_srcu.
 */
struct fsnotify_mark_connector {
        spinlock_t lock;
        unsigned char type;        /* Type of object [lock] */
        unsigned char prio;        /* Highest priority group */
#define FSNOTIFY_CONN_FLAG_IS_WATCHED        0x01
#define FSNOTIFY_CONN_FLAG_HAS_IREF        0x02
        unsigned short flags;        /* FSNOTIFY_CONN_FLAG_* bits [lock] */
        union {
                /* Object pointer [lock] */
                void *obj;
                /*
                 * Link to the next connector on the list of connectors
                 * waiting to be freed after the srcu period expires.
                 */
                struct fsnotify_mark_connector *destroy_next;
        };
        /* Head of the list of marks (linked through fsnotify_mark::obj_list) */
        struct hlist_head list;
};

/*
 * Container for per-sb fsnotify state (sb marks and more).
 * Attached lazily on first marked object on the sb and freed when killing sb.
 */
struct fsnotify_sb_info {
        /* Connector holding the marks attached to the sb itself */
        struct fsnotify_mark_connector __rcu *sb_marks;
        /*
         * Number of inode/mount/sb objects that are being watched in this sb.
         * Note that inode objects are currently double-accounted.
         *
         * The value in watched_objects[prio] is the number of objects that are
         * watched by groups of priority >= prio, so watched_objects[0] is the
         * total number of watched objects in this sb.
         */
        atomic_long_t watched_objects[__FSNOTIFY_PRIO_NUM];
};

/*
 * Return the per-sb fsnotify state pointer read (once) from
 * sb->s_fsnotify_info. Always NULL when CONFIG_FSNOTIFY is not set.
 */
static inline struct fsnotify_sb_info *fsnotify_sb_info(struct super_block *sb)
{
#ifdef CONFIG_FSNOTIFY
        return READ_ONCE(sb->s_fsnotify_info);
#else
        return NULL;
#endif
}

static inline atomic_long_t *fsnotify_sb_watched_objects(struct super_block *sb)
{
        return &fsnotify_sb_info(sb)->watched_objects[0];
}

/*
 * A mark is simply an object attached to an in core inode which allows an
 * fsnotify listener to indicate they are either no longer interested in events
 * of a type matching mask or only interested in those events.
 *
 * These are flushed when an inode is evicted from core and may be flushed
 * when the inode is modified (as seen by fsnotify_access).  Some fsnotify
 * users (such as dnotify) will flush these when the open fd is closed and not
 * at inode eviction or modification.
 *
 * Text in brackets is showing the lock(s) protecting modifications of a
 * particular entry. obj_lock means either inode->i_lock or
 * mnt->mnt_root->d_lock depending on the mark type.
 */
struct fsnotify_mark {
        /* Mask this mark is for [mark->lock, group->mark_mutex] */
        __u32 mask;
        /*
         * We hold one for presence in g_list. Also one ref for each 'thing'
         * in kernel that found and may be using this mark.
         */
        refcount_t refcnt;
        /*
         * Group this mark is for. Set on mark creation, stable until last ref
         * is dropped.
         */
        struct fsnotify_group *group;
        /*
         * List of marks by group->marks_list. Also reused for queueing
         * mark into destroy_list when it's waiting for the end of SRCU period
         * before it can be freed. [group->mark_mutex]
         */
        struct list_head g_list;
        /* Protects inode / mnt pointers, flags, masks */
        spinlock_t lock;
        /* List of marks for inode / vfsmount [connector->lock, mark ref] */
        struct hlist_node obj_list;
        /* Head of list of marks for an object [mark ref] */
        struct fsnotify_mark_connector *connector;
        /* Events types and flags to ignore [mark->lock, group->mark_mutex] */
        __u32 ignore_mask;
        /* General fsnotify mark flags */
#define FSNOTIFY_MARK_FLAG_ALIVE                0x0001
#define FSNOTIFY_MARK_FLAG_ATTACHED                0x0002
        /* inotify mark flags */
#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK                0x0010
#define FSNOTIFY_MARK_FLAG_IN_ONESHOT                0x0020
        /* fanotify mark flags */
#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY        0x0100
#define FSNOTIFY_MARK_FLAG_NO_IREF                0x0200
#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS        0x0400
#define FSNOTIFY_MARK_FLAG_HAS_FSID                0x0800
#define FSNOTIFY_MARK_FLAG_WEAK_FSID                0x1000
        /* FSNOTIFY_MARK_FLAG_* bits defined above [mark->lock] */
        unsigned int flags;
};

#ifdef CONFIG_FSNOTIFY

/* called from the vfs helpers */

/* main fsnotify call to send events */
extern int fsnotify(__u32 mask, const void *data, int data_type,
                    struct inode *dir, const struct qstr *name,
                    struct inode *inode, u32 cookie);
extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
                           int data_type);
extern void __fsnotify_inode_delete(struct inode *inode);
extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt);
extern void fsnotify_sb_delete(struct super_block *sb);
extern void __fsnotify_mntns_delete(struct mnt_namespace *mntns);
extern void fsnotify_sb_free(struct super_block *sb);
extern u32 fsnotify_get_cookie(void);
extern void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt);

/*
 * Reduce @mask to the events that can be reported with parent/name info.
 * Returns 0 unless FS_EVENT_ON_CHILD is set, since only marks carrying that
 * flag want parent/name info in the first place.
 */
static inline __u32 fsnotify_parent_needed_mask(__u32 mask)
{
        if (mask & FS_EVENT_ON_CHILD)
                return mask & FS_EVENTS_POSS_TO_PARENT;

        return 0;
}

/*
 * Non-zero when @inode's fsnotify mask has FS_EVENT_ON_CHILD set and also
 * contains at least one event that can happen on a child.
 */
static inline int fsnotify_inode_watches_children(struct inode *inode)
{
        __u32 parent_mask = READ_ONCE(inode->i_fsnotify_mask);

        /* With FS_EVENT_ON_CHILD set, report the child-capable events */
        if (parent_mask & FS_EVENT_ON_CHILD)
                return parent_mask & FS_EVENTS_POSS_ON_CHILD;

        return 0;
}

/*
 * Update the dentry with a flag indicating the interest of its parent to receive
 * filesystem events when those events happens to this dentry->d_inode.
 */
static inline void fsnotify_update_flags(struct dentry *dentry)
{
        struct inode *parent_inode;

        assert_spin_locked(&dentry->d_lock);

        /*
         * d_lock serialises updates of PARENT_WATCHED on the dentry. Should
         * the parent's watch state change after we took d_lock, the
         * fsnotify_set_children_dentry_flags call that follows will find
         * this dentry, spin until we are done here, and then apply the new
         * state.
         */
        parent_inode = dentry->d_parent->d_inode;
        if (!fsnotify_inode_watches_children(parent_inode)) {
                dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
                return;
        }

        dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
}

/* called from fsnotify listeners, such as fanotify or dnotify */

/* create a new group */
extern struct fsnotify_group *fsnotify_alloc_group(
                                const struct fsnotify_ops *ops,
                                int flags);
/* get reference to a group */
extern void fsnotify_get_group(struct fsnotify_group *group);
/* drop reference on a group from fsnotify_alloc_group */
extern void fsnotify_put_group(struct fsnotify_group *group);
/* group destruction begins, stop queuing new events */
extern void fsnotify_group_stop_queueing(struct fsnotify_group *group);
/* destroy group */
extern void fsnotify_destroy_group(struct fsnotify_group *group);
/* fasync handler function */
extern int fsnotify_fasync(int fd, struct file *file, int on);
/* Free event from memory */
extern void fsnotify_destroy_event(struct fsnotify_group *group,
                                   struct fsnotify_event *event);
/* attach the event to the group notification queue */
extern int fsnotify_insert_event(struct fsnotify_group *group,
                                 struct fsnotify_event *event,
                                 int (*merge)(struct fsnotify_group *,
                                              struct fsnotify_event *),
                                 void (*insert)(struct fsnotify_group *,
                                                struct fsnotify_event *));

/*
 * Queue @event on @group through fsnotify_insert_event() without an insert
 * callback (NULL is passed for it); returns that function's result.
 */
static inline int fsnotify_add_event(struct fsnotify_group *group,
                                     struct fsnotify_event *event,
                                     int (*merge)(struct fsnotify_group *,
                                                  struct fsnotify_event *))
{
        int ret;

        ret = fsnotify_insert_event(group, event, merge, NULL);

        return ret;
}

/* Queue overflow event to a notification group */
static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
{
        struct fsnotify_event *overflow = group->overflow_event;

        /* No merge callback; the result of queueing is deliberately unused */
        fsnotify_add_event(group, overflow, NULL);
}

/* True when @mask has the FS_Q_OVERFLOW bit set */
static inline bool fsnotify_is_overflow_event(u32 mask)
{
        return (mask & FS_Q_OVERFLOW) != 0;
}

/*
 * Return true when @group has no events on its notification list.
 * The caller must hold group->notification_lock (asserted below).
 *
 * This is the only declaration of the function: it is defined inline here,
 * so no separate extern prototype is needed.
 */
static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
{
        assert_spin_locked(&group->notification_lock);

        return list_empty(&group->notification_list);
}

/* return, but do not dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
/* return AND dequeue the first event on the notification queue */
extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group);
/* Remove event queued in the notification list */
extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
                                         struct fsnotify_event *event);

/* functions used to manipulate the marks attached to inodes */

/*
 * Canonical "ignore mask" including event flags.
 *
 * Note the subtle semantic difference from the legacy ->ignored_mask.
 * ->ignored_mask traditionally only meant which events should be ignored,
 * while ->ignore_mask also includes flags regarding the type of objects on
 * which events should be ignored.
 */
static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
{
        __u32 ignore_mask = mark->ignore_mask;

        if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)) {
                /*
                 * Event flags stored in the ignore mask are not in effect,
                 * so apply the legacy behavior:
                 * - Always ignore events on dir
                 * - Ignore events on child if parent is watching children
                 */
                ignore_mask = (ignore_mask | FS_ISDIR) & ~FS_EVENT_ON_CHILD;
                ignore_mask |= mark->mask & FS_EVENT_ON_CHILD;
        }

        return ignore_mask;
}

/* Legacy ignored_mask - only event types to ignore */
static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
{
        __u32 events = mark->ignore_mask;

        /* Keep event type bits only; drop any flag bits */
        events &= ALL_FSNOTIFY_EVENTS;

        return events;
}

/*
 * Check if mask (or ignore mask) should be applied depending if victim is a
 * directory and whether it is reported to a watching parent.
 */
static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
                                            int iter_type)
{
        /* A directory victim needs FS_ISDIR in the mask */
        bool dir_ok = !is_dir || (mask & FS_ISDIR);
        /* An event reported to the parent needs FS_EVENT_ON_CHILD */
        bool child_ok = iter_type != FSNOTIFY_ITER_TYPE_PARENT ||
                        (mask & FS_EVENT_ON_CHILD);

        return dir_ok && child_ok;
}

/*
 * Effective ignore mask taking into account if event victim is a
 * directory and whether it is reported to a watching parent.
 */
static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
                                                   bool is_dir, int iter_type)
{
        __u32 ignore_mask = fsnotify_ignored_events(mark);
        bool needs_flags = is_dir || iter_type == FSNOTIFY_ITER_TYPE_PARENT;

        /*
         * Nothing ignored at all, or a non-dir, non-child event for which
         * the event flags need not be consulted: the plain event bits are
         * the answer (0 in the first case).
         */
        if (!ignore_mask || !needs_flags)
                return ignore_mask;

        ignore_mask = fsnotify_ignore_mask(mark);

        return fsnotify_mask_applicable(ignore_mask, is_dir, iter_type) ?
                (ignore_mask & ALL_FSNOTIFY_EVENTS) : 0;
}

/* Get mask for calculating object interest taking ignore mask into account */
static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
{
        __u32 mask = mark->mask;

        if (fsnotify_ignored_events(mark)) {
                /* FS_MODIFY interest may be needed to clear the ignore mask */
                if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
                        mask |= FS_MODIFY;

                /*
                 * A mark that ignores events on children must make the
                 * object show interest in those events, otherwise
                 * fsnotify_parent() would not notice it.
                 */
                mask |= mark->ignore_mask;
        }

        return mask;
}

/* Get mask of events for a list of marks */
extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
/* Calculate mask of events for a list of marks */
extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn);
extern void fsnotify_init_mark(struct fsnotify_mark *mark,
                               struct fsnotify_group *group);
/* Find mark belonging to given group in the list of marks */
struct fsnotify_mark *fsnotify_find_mark(void *obj, unsigned int obj_type,
                                         struct fsnotify_group *group);
/* attach the mark to the object */
int fsnotify_add_mark(struct fsnotify_mark *mark, void *obj,
                      unsigned int obj_type, int add_flags);
int fsnotify_add_mark_locked(struct fsnotify_mark *mark, void *obj,
                             unsigned int obj_type, int add_flags);

/* attach the mark to the inode */
/* fsnotify_add_mark() specialised for an inode object */
static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
                                          struct inode *inode,
                                          int add_flags)
{
        int ret;

        ret = fsnotify_add_mark(mark, inode, FSNOTIFY_OBJ_TYPE_INODE,
                                add_flags);

        return ret;
}
/* fsnotify_add_mark_locked() specialised for an inode object */
static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
                                                 struct inode *inode,
                                                 int add_flags)
{
        int ret;

        ret = fsnotify_add_mark_locked(mark, inode, FSNOTIFY_OBJ_TYPE_INODE,
                                       add_flags);

        return ret;
}

/* fsnotify_find_mark() specialised for an inode object */
static inline struct fsnotify_mark *fsnotify_find_inode_mark(
                                                struct inode *inode,
                                                struct fsnotify_group *group)
{
        struct fsnotify_mark *mark;

        mark = fsnotify_find_mark(inode, FSNOTIFY_OBJ_TYPE_INODE, group);

        return mark;
}

/* given a group and a mark, flag mark to be freed when all references are dropped */
extern void fsnotify_destroy_mark(struct fsnotify_mark *mark,
                                  struct fsnotify_group *group);
/* detach mark from inode / mount list, group list, drop inode reference */
extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
/* free mark */
extern void fsnotify_free_mark(struct fsnotify_mark *mark);
/* Wait until all marks queued for destruction are destroyed */
extern void fsnotify_wait_marks_destroyed(void);
/* Clear all of the marks of a group attached to a given object type */
extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
                                          unsigned int obj_type);
/* Clear every mark of @group that is attached to a vfsmount */
static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* Clear every mark of @group that is attached to an inode */
static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_INODE;

        fsnotify_clear_marks_by_group(group, obj_type);
}
/* Clear every mark of @group that is attached to a super block (sb) */
static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
{
        const unsigned int obj_type = FSNOTIFY_OBJ_TYPE_SB;

        fsnotify_clear_marks_by_group(group, obj_type);
}
extern void fsnotify_get_mark(struct fsnotify_mark *mark);
extern void fsnotify_put_mark(struct fsnotify_mark *mark);
extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);

/* Prepare @event for queueing by initialising its list head */
static inline void fsnotify_init_event(struct fsnotify_event *event)
{
        struct list_head *entry = &event->list;

        INIT_LIST_HEAD(entry);
}
int fsnotify_pre_content(const struct path *path, const loff_t *ppos,
                         size_t count);

#else

/*
 * No-op stand-ins used when CONFIG_FSNOTIFY is not set: int-returning stubs
 * report 0, void stubs do nothing.
 */
static inline int fsnotify_pre_content(const struct path *path,
                                       const loff_t *ppos, size_t count)
{
        return 0;
}

static inline int fsnotify(__u32 mask, const void *data, int data_type,
                           struct inode *dir, const struct qstr *name,
                           struct inode *inode, u32 cookie)
{
        return 0;
}

static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask,
                                  const void *data, int data_type)
{
        return 0;
}

static inline void __fsnotify_inode_delete(struct inode *inode)
{}

static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{}

static inline void fsnotify_sb_delete(struct super_block *sb)
{}

static inline void __fsnotify_mntns_delete(struct mnt_namespace *mntns)
{}

static inline void fsnotify_sb_free(struct super_block *sb)
{}

static inline void fsnotify_update_flags(struct dentry *dentry)
{}

static inline u32 fsnotify_get_cookie(void)
{
        return 0;
}

/*
 * NOTE(review): no matching fsnotify_unmount_inodes() declaration is visible
 * in the CONFIG_FSNOTIFY branch above - confirm this stub still has callers.
 */
static inline void fsnotify_unmount_inodes(struct super_block *sb)
{}

static inline void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt)
{}

#endif        /* CONFIG_FSNOTIFY */

#endif        /* __KERNEL__ */

#endif        /* __LINUX_FSNOTIFY_BACKEND_H */


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2016 ARM Ltd.
 */
#ifndef __ASM_CHECKSUM_H
#define __ASM_CHECKSUM_H

#include <linux/in6.h>

#define _HAVE_ARCH_IPV6_CSUM
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
                        const struct in6_addr *daddr,
                        __u32 len, __u8 proto, __wsum sum);

/*
 * Fold a 32-bit partial checksum into 16 bits and complement it: adding the
 * value to its own 16-bit rotation leaves the end-around-carry sum of both
 * halves in the upper 16 bits.
 */
static inline __sum16 csum_fold(__wsum csum)
{
        u32 folded = (__force u32)csum;
        u32 rotated = (folded >> 16) | (folded << 16);

        folded += rotated;

        return ~(__force __sum16)(folded >> 16);
}
#define csum_fold csum_fold

/*
 * Checksum an IP header of @ihl 32-bit words starting at @iph.
 *
 * The first 16 bytes are loaded in one 128-bit access and folded to 64 bits;
 * the remaining words are then added one at a time. At least one word past
 * the first four is always read, so @ihl is expected to be 5 or more.
 */
static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
{
        const u32 *word = (const u32 *)iph + 4;
        __uint128_t head = *(const __uint128_t *)iph;
        int remaining = ihl; /* signed on purpose, for the loop test */
        u64 acc;

        /* 128 -> 64: add the value to its own 64-bit rotation */
        head += ((head >> 64) | (head << 64));
        acc = head >> 64;

        remaining -= 4;
        do {
                acc += *word++;
        } while (--remaining > 0);

        /* 64 -> 32: same rotation trick, then let csum_fold() finish */
        acc += ((acc >> 32) | (acc << 32));

        return csum_fold((__force __wsum)(acc >> 32));
}
#define ip_fast_csum ip_fast_csum

extern unsigned int do_csum(const unsigned char *buff, int len);
#define do_csum do_csum

#include <asm-generic/checksum.h>

#endif        /* __ASM_CHECKSUM_H */







































































































  179 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * A hash table (hashtab) maintains associations between
 * key values and datum values.  The type of the key values
 * and the type of the datum values is arbitrary.  The
 * functions for hash computation and key comparison are
 * provided by the creator of the table.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SS_HASHTAB_H_
#define _SS_HASHTAB_H_

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>

#define HASHTAB_MAX_NODES U32_MAX

/* Key callbacks handed by value to hashtab_insert() and hashtab_search() */
struct hashtab_key_params {
        u32 (*hash)(const void *key); /* hash func */
        /* comparison func: 0 on match; the sign orders keys within a chain */
        int (*cmp)(const void *key1, const void *key2);
};

/* One (key, datum) entry in a bucket's singly linked chain */
struct hashtab_node {
        void *key;
        void *datum;
        struct hashtab_node *next; /* next entry in the same bucket, or NULL */
};

struct hashtab {
        struct hashtab_node **htable; /* hash table */
        /*
         * number of slots in hash table; lookups mask the hash with
         * (size - 1), which presumes size is a power of two
         */
        u32 size;
        u32 nel; /* number of elements in hash table */
};

/* Statistics filled in by hashtab_stat() */
struct hashtab_info {
        u32 slots_used;
        u32 max_chain_len;
        /* NOTE(review): presumably the sum of squared chain lengths - confirm */
        u64 chain2_len_sum;
};

/*
 * Initializes a new hash table with the specified characteristics.
 *
 * Returns -ENOMEM if insufficient space is available or 0 otherwise.
 */
int hashtab_init(struct hashtab *h, u32 nel_hint);

int __hashtab_insert(struct hashtab *h, struct hashtab_node **dst, void *key,
                     void *datum);

/*
 * Inserts the specified (key, datum) pair into the specified hash table.
 *
 * Returns -ENOMEM on memory allocation error,
 * -EEXIST if there is already an entry with the same key,
 * -EINVAL for general errors or
 * 0 otherwise.
 */
static inline int hashtab_insert(struct hashtab *h, void *key, void *datum,
                                 struct hashtab_key_params key_params)
{
        struct hashtab_node **link, *cur;
        u32 hvalue;

        cond_resched();

        /* Reject an uninitialised table and one that is already full */
        if (!h->size || h->nel == HASHTAB_MAX_NODES)
                return -EINVAL;

        hvalue = key_params.hash(key) & (h->size - 1);

        /*
         * Walk the chain, which is kept ordered by cmp, remembering the link
         * that the new node has to be hooked into.
         */
        for (link = &h->htable[hvalue]; (cur = *link); link = &cur->next) {
                int cmp = key_params.cmp(key, cur->key);

                if (cmp == 0)
                        return -EEXIST;
                if (cmp < 0)
                        break;
        }

        return __hashtab_insert(h, link, key, datum);
}

/*
 * Searches for the entry with the specified key in the hash table.
 *
 * Returns NULL if no entry has the specified key or
 * the datum of the entry otherwise.
 */
static inline void *hashtab_search(struct hashtab *h, const void *key,
                                   struct hashtab_key_params key_params)
{
        struct hashtab_node *cur;
        u32 hvalue;

        if (!h->size)
                return NULL;

        hvalue = key_params.hash(key) & (h->size - 1);

        /* The chain is ordered by cmp, so stop once @key sorts before cur */
        for (cur = h->htable[hvalue]; cur; cur = cur->next) {
                int cmp = key_params.cmp(key, cur->key);

                if (cmp == 0)
                        return cur->datum;
                if (cmp < 0)
                        break;
        }

        return NULL;
}

/*
 * Destroys the specified hash table.
 */
void hashtab_destroy(struct hashtab *h);

/*
 * Applies the specified apply function to (key,datum,args)
 * for each entry in the specified hash table.
 *
 * The order in which the function is applied to the entries
 * is dependent upon the internal structure of the hash table.
 *
 * If apply returns a non-zero status, then hashtab_map will cease
 * iterating through the hash table and will propagate the error
 * return to its caller.
 */
int hashtab_map(struct hashtab *h, int (*apply)(void *k, void *d, void *args),
                void *args);

int hashtab_duplicate(struct hashtab *new, const struct hashtab *orig,
                      int (*copy)(struct hashtab_node *new,
                                  const struct hashtab_node *orig, void *args),
                      int (*destroy)(void *k, void *d, void *args), void *args);

#ifdef CONFIG_SECURITY_SELINUX_DEBUG
/* Fill info with some hash table statistics */
void hashtab_stat(struct hashtab *h, struct hashtab_info *info);
#else
static inline void hashtab_stat(struct hashtab *h, struct hashtab_info *info)
{
        /* No-op without CONFIG_SECURITY_SELINUX_DEBUG; @info is untouched */
}
#endif

#endif /* _SS_HASHTAB_H_ */





































































































































































































































































































   39 
   39 






   39 





   39 


































































































































  209 
  136 

















  136 








  133 


   82 


  136 







  136 

























  209 





  209 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/tlbflush.h
 *
 * Copyright (C) 1999-2003 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_TLBFLUSH_H
#define __ASM_TLBFLUSH_H

#ifndef __ASSEMBLY__

#include <linux/bitfield.h>
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/mmu_notifier.h>
#include <asm/cputype.h>
#include <asm/mmu.h>

/*
 * Raw TLBI operations.
 *
 * Where necessary, use the __tlbi() macro to avoid asm()
 * boilerplate. Drivers and most kernel code should use the TLB
 * management routines in preference to the macro below.
 *
 * The macro can be used as __tlbi(op) or __tlbi(op, arg), depending
 * on whether a particular TLBI operation takes an argument or
 * not. The macros handles invoking the asm with or without the
 * register argument as appropriate.
 */
/* TLBI without a register operand; @arg is accepted but unused */
#define __TLBI_0(op, arg) asm (ARM64_ASM_PREAMBLE                               \
                               "tlbi " #op "\n"                                       \
                   ALTERNATIVE("nop\n                        nop",                       \
                               "dsb ish\n                tlbi " #op,               \
                               ARM64_WORKAROUND_REPEAT_TLBI,                       \
                               CONFIG_ARM64_WORKAROUND_REPEAT_TLBI)               \
                            : : )

/* TLBI taking @arg as its register operand */
#define __TLBI_1(op, arg) asm (ARM64_ASM_PREAMBLE                               \
                               "tlbi " #op ", %0\n"                               \
                   ALTERNATIVE("nop\n                        nop",                       \
                               "dsb ish\n                tlbi " #op ", %0",     \
                               ARM64_WORKAROUND_REPEAT_TLBI,                       \
                               CONFIG_ARM64_WORKAROUND_REPEAT_TLBI)               \
                            : : "r" (arg))

/* Select __TLBI_0 or __TLBI_1 through the argument-counting value @n */
#define __TLBI_N(op, arg, n, ...) __TLBI_##n(op, arg)

/*
 * __tlbi(op) passes 1 as @arg and 0 as @n, selecting __TLBI_0;
 * __tlbi(op, arg) shifts the list so that @n is 1, selecting __TLBI_1.
 */
#define __tlbi(op, ...)                __TLBI_N(op, ##__VA_ARGS__, 1, 0)

/*
 * Issue the TLBI again with USER_ASID_FLAG ORed into @arg, but only when
 * arm64_kernel_unmapped_at_el0() is true; otherwise this does nothing.
 */
#define __tlbi_user(op, arg) do {                                                \
        if (arm64_kernel_unmapped_at_el0())                                        \
                __tlbi(op, (arg) | USER_ASID_FLAG);                                \
} while (0)

/*
 * This macro creates a properly formatted VA operand for the TLBI:
 * bits [43:0] hold @addr >> 12 and bits [63:48] hold @asid.
 */
#define __TLBI_VADDR(addr, asid)                                \
        ({                                                        \
                unsigned long __ta = (addr) >> 12;                \
                __ta &= GENMASK_ULL(43, 0);                        \
                __ta |= (unsigned long)(asid) << 48;                \
                __ta;                                                \
        })

/*
 * Get translation granule of the system, which is decided by
 * PAGE_SIZE.  Used by TTL.
 *  - 4KB        : 1
 *  - 16KB        : 2
 *  - 64KB        : 3
 */
#define TLBI_TTL_TG_4K                1
#define TLBI_TTL_TG_16K                2
#define TLBI_TTL_TG_64K                3

/*
 * Map PAGE_SIZE to the matching TLBI_TTL_TG_* granule encoding; any other
 * page size yields 0.
 */
static inline unsigned long get_trans_granule(void)
{
        if (PAGE_SIZE == SZ_4K)
                return TLBI_TTL_TG_4K;
        if (PAGE_SIZE == SZ_16K)
                return TLBI_TTL_TG_16K;
        if (PAGE_SIZE == SZ_64K)
                return TLBI_TTL_TG_64K;

        return 0;
}

/*
 * Level-based TLBI operations.
 *
 * When ARMv8.4-TTL exists, TLBI operations take an additional hint for
 * the level at which the invalidation must take place. If the level is
 * wrong, no invalidation may take place. In the case where the level
 * cannot be easily determined, the value TLBI_TTL_UNKNOWN will perform
 * a non-hinted invalidation. Any provided level outside the hint range
 * will also cause fall-back to non-hinted invalidation.
 *
 * For Stage-2 invalidation, use the level values provided to that effect
 * in asm/stage2_pgtable.h.
 */
#define TLBI_TTL_MASK                GENMASK_ULL(47, 44)

#define TLBI_TTL_UNKNOWN        INT_MAX

/*
 * __tlbi_level - issue TLBI 'op' on operand 'addr' with an optional TTL hint
 *
 * @op:    TLBI operation name, passed through to __tlbi()
 * @addr:  pre-formatted TLBI operand (e.g. from __TLBI_VADDR())
 * @level: translation table level, or TLBI_TTL_UNKNOWN
 *
 * When the ARM64_HAS_ARMv8_4_TTL capability is present and 'level' is in
 * [0, 3], the TTL field (bits [47:44]) of the operand is replaced with
 * (granule << 2) | level. Otherwise the operand is used unchanged.
 *
 * 'addr' and 'level' are each evaluated exactly once, and the locals use
 * a __tlbi_ prefix so they cannot capture identifiers that appear in the
 * caller's argument expressions.
 */
#define __tlbi_level(op, addr, level) do {                                \
        u64 __tlbi_arg = (addr);                                        \
        __typeof__(level) __tlbi_lvl = (level);                                \
                                                                        \
        if (alternative_has_cap_unlikely(ARM64_HAS_ARMv8_4_TTL) &&        \
            __tlbi_lvl >= 0 && __tlbi_lvl <= 3) {                        \
                u64 __tlbi_ttl = __tlbi_lvl & 3;                        \
                __tlbi_ttl |= get_trans_granule() << 2;                        \
                __tlbi_arg &= ~TLBI_TTL_MASK;                                \
                __tlbi_arg |= FIELD_PREP(TLBI_TTL_MASK, __tlbi_ttl);        \
        }                                                                \
                                                                        \
        __tlbi(op, __tlbi_arg);                                                \
} while (0)

/*
 * Level-hinted counterpart of __tlbi_user(): when
 * arm64_kernel_unmapped_at_el0() is true, issue 'op' through __tlbi_level()
 * with USER_ASID_FLAG set in the operand; otherwise do nothing.
 *
 * 'arg' is parenthesized before being OR-ed with the flag so that any
 * operand expression (e.g. a conditional) is combined as a whole.
 */
#define __tlbi_user_level(op, arg, level) do {                                \
        if (arm64_kernel_unmapped_at_el0())                                \
                __tlbi_level(op, ((arg) | USER_ASID_FLAG), level);        \
} while (0)

/*
 * This macro creates a properly formatted VA operand for the TLB RANGE. The
 * value bit assignments are:
 *
 * +----------+------+-------+-------+-------+----------------------+
 * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
 * +----------+------+-------+-------+-------+----------------------+
 * |63      48|47  46|45   44|43   39|38   37|36                   0|
 *
 * The address range is determined by below formula: [BADDR, BADDR + (NUM + 1) *
 * 2^(5*SCALE + 1) * PAGESIZE)
 *
 * Note that the first argument, baddr, is pre-shifted; If LPA2 is in use, BADDR
 * holds addr[52:16]. Else BADDR holds page number. See for example ARM DDI
 * 0487J.a section C5.5.60 "TLBI VAE1IS, TLBI VAE1ISNXS, TLB Invalidate by VA,
 * EL1, Inner Shareable".
 *
 * The TG field is filled from get_trans_granule(). A 'ttl' outside [1, 3]
 * (including TLBI_TTL_UNKNOWN) is encoded as 0. 'ttl' is evaluated once, so
 * it may be an arbitrary expression.
 */
#define TLBIR_ASID_MASK                GENMASK_ULL(63, 48)
#define TLBIR_TG_MASK                GENMASK_ULL(47, 46)
#define TLBIR_SCALE_MASK        GENMASK_ULL(45, 44)
#define TLBIR_NUM_MASK                GENMASK_ULL(43, 39)
#define TLBIR_TTL_MASK                GENMASK_ULL(38, 37)
#define TLBIR_BADDR_MASK        GENMASK_ULL(36,  0)

#define __TLBI_VADDR_RANGE(baddr, asid, scale, num, ttl)                \
        ({                                                                \
                unsigned long __ta = 0;                                        \
                __typeof__(ttl) __lvl = (ttl);                                \
                unsigned long __ttl = (__lvl >= 1 && __lvl <= 3) ? __lvl : 0;        \
                __ta |= FIELD_PREP(TLBIR_BADDR_MASK, baddr);                \
                __ta |= FIELD_PREP(TLBIR_TTL_MASK, __ttl);                \
                __ta |= FIELD_PREP(TLBIR_NUM_MASK, num);                \
                __ta |= FIELD_PREP(TLBIR_SCALE_MASK, scale);                \
                __ta |= FIELD_PREP(TLBIR_TG_MASK, get_trans_granule());        \
                __ta |= FIELD_PREP(TLBIR_ASID_MASK, asid);                \
                __ta;                                                        \
        })

/*
 * Helpers for the TLBI RANGE feature.
 *
 * __TLBI_RANGE_PAGES() gives the number of pages covered by one range
 * operation with the given NUM and SCALE fields, i.e.
 * (num + 1) * 2^(5 * scale + 1). MAX_TLBI_RANGE_PAGES is that value for the
 * largest encodable fields (num = 31, scale = 3).
 */
#define __TLBI_RANGE_PAGES(num, scale)        \
        ((unsigned long)((num) + 1) << (1 + 5 * (scale)))
#define MAX_TLBI_RANGE_PAGES                __TLBI_RANGE_PAGES(31, 3)

/*
 * Compute the NUM field for a range operation covering up to 'pages' pages
 * at the given 'scale'. The result lies in [-1, 31]; -1 means 'pages' is
 * too small for this scale, and __flush_tlb_range_op() skips such results.
 * 'pages' is first capped at __TLBI_RANGE_PAGES(31, scale), so a larger
 * range must be covered by iterating.
 */
#define __TLBI_RANGE_NUM(pages, scale)                                        \
        ({                                                                \
                int __capped = min((pages),                                \
                                   __TLBI_RANGE_PAGES(31, (scale)));        \
                -1 + (__capped >> (1 + 5 * (scale)));                        \
        })

/*
 *        TLB Invalidation
 *        ================
 *
 *         This header file implements the low-level TLB invalidation routines
 *        (sometimes referred to as "flushing" in the kernel) for arm64.
 *
 *        Every invalidation operation uses the following template:
 *
 *        DSB ISHST        // Ensure prior page-table updates have completed
 *        TLBI ...        // Invalidate the TLB
 *        DSB ISH                // Ensure the TLB invalidation has completed
 *      if (invalidated kernel mappings)
 *                ISB        // Discard any instructions fetched from the old mapping
 *
 *
 *        The following functions form part of the "core" TLB invalidation API,
 *        as documented in Documentation/core-api/cachetlb.rst:
 *
 *        flush_tlb_all()
 *                Invalidate the entire TLB (kernel + user) on all CPUs
 *
 *        flush_tlb_mm(mm)
 *                Invalidate an entire user address space on all CPUs.
 *                The 'mm' argument identifies the ASID to invalidate.
 *
 *        flush_tlb_range(vma, start, end)
 *                Invalidate the virtual-address range '[start, end)' on all
 *                CPUs for the user address space corresponding to 'vma->mm'.
 *                Note that this operation also invalidates any walk-cache
 *                entries associated with translations for the specified address
 *                range.
 *
 *        flush_tlb_kernel_range(start, end)
 *                Same as flush_tlb_range(..., start, end), but applies to
 *                 kernel mappings rather than a particular user address space.
 *                Whilst not explicitly documented, this function is used when
 *                unmapping pages from vmalloc/io space.
 *
 *        flush_tlb_page(vma, addr)
 *                Invalidate a single user mapping for address 'addr' in the
 *                address space corresponding to 'vma->mm'.  Note that this
 *                operation only invalidates a single, last-level page-table
 *                entry and therefore does not affect any walk-caches.
 *
 *
 *        Next, we have some undocumented invalidation routines that you probably
 *        don't want to call unless you know what you're doing:
 *
 *        local_flush_tlb_all()
 *                Same as flush_tlb_all(), but only applies to the calling CPU.
 *
 *        __flush_tlb_kernel_pgtable(addr)
 *                Invalidate a single kernel mapping for address 'addr' on all
 *                CPUs, ensuring that any walk-cache entries associated with the
 *                translation are also invalidated.
 *
 *        __flush_tlb_range(vma, start, end, stride, last_level, tlb_level)
 *                Invalidate the virtual-address range '[start, end)' on all
 *                CPUs for the user address space corresponding to 'vma->mm'.
 *                The invalidation operations are issued at a granularity
 *                determined by 'stride' and only affect any walk-cache entries
 *                if 'last_level' is equal to false. tlb_level is the level at
 *                which the invalidation must take place. If the level is wrong,
 *                no invalidation may take place. In the case where the level
 *                cannot be easily determined, the value TLBI_TTL_UNKNOWN will
 *                perform a non-hinted invalidation.
 *
 *
 *        Finally, take a look at asm/tlb.h to see how tlb_flush() is implemented
 *        on top of these routines, since that is our interface to the mmu_gather
 *        API as used by munmap() and friends.
 */
/*
 * Same as flush_tlb_all(), but only applies to the calling CPU: issues the
 * vmalle1 TLBI between the nshst/nsh DSB variants, then an ISB. The sequence
 * follows the DSB / TLBI / DSB / ISB template described in the header
 * comment above; the statement order must not be changed.
 */
static inline void local_flush_tlb_all(void)
{
        dsb(nshst);
        __tlbi(vmalle1);
        dsb(nsh);
        isb();
}

/*
 * Invalidate the entire TLB (kernel + user) on all CPUs: issues the
 * vmalle1is TLBI between DSB ISHST and DSB ISH, then an ISB, following the
 * template described in the header comment above.
 */
static inline void flush_tlb_all(void)
{
        dsb(ishst);
        __tlbi(vmalle1is);
        dsb(ish);
        isb();
}

/*
 * Invalidate an entire user address space on all CPUs. 'mm' identifies the
 * ASID to invalidate. The TLBI is completed with DSB ISH before the mmu
 * notifier hook is called.
 */
static inline void flush_tlb_mm(struct mm_struct *mm)
{
        unsigned long asid;

        dsb(ishst);
        /* The operand carries only the ASID; its address field is zero. */
        asid = __TLBI_VADDR(0, ASID(mm));
        __tlbi(aside1is, asid);
        /* Issued again with USER_ASID_FLAG when the kernel is unmapped at EL0. */
        __tlbi_user(aside1is, asid);
        dsb(ish);
        /* Report the range 0 .. -1UL to the secondary-TLB notifier hook. */
        mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL);
}

/*
 * Issue the vale1is TLBI for user address 'uaddr' in 'mm' without the
 * closing DSB ISH; the caller must provide it (see flush_tlb_page()). The
 * page containing 'uaddr' is then reported to the secondary-TLB notifier
 * hook.
 */
static inline void __flush_tlb_page_nosync(struct mm_struct *mm,
                                           unsigned long uaddr)
{
        const unsigned long page_start = uaddr & PAGE_MASK;
        unsigned long operand;

        dsb(ishst);
        operand = __TLBI_VADDR(uaddr, ASID(mm));
        __tlbi(vale1is, operand);
        __tlbi_user(vale1is, operand);
        mmu_notifier_arch_invalidate_secondary_tlbs(mm, page_start,
                                                    page_start + PAGE_SIZE);
}

/*
 * VMA-based wrapper around __flush_tlb_page_nosync(): invalidates the
 * mapping for 'uaddr' in vma->vm_mm without the closing DSB.
 *
 * The callee returns void, so it is called as a plain statement; ISO C does
 * not allow 'return <expression>;' in a function returning void.
 */
static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
                                         unsigned long uaddr)
{
        __flush_tlb_page_nosync(vma->vm_mm, uaddr);
}

/*
 * Invalidate a single user mapping for address 'uaddr' in the address space
 * of 'vma' and wait for completion: the TLBI is issued through
 * flush_tlb_page_nosync(), then the DSB ISH that the nosync variant leaves
 * out is added here.
 */
static inline void flush_tlb_page(struct vm_area_struct *vma,
                                  unsigned long uaddr)
{
        flush_tlb_page_nosync(vma, uaddr);
        dsb(ish);
}

/*
 * Tell the caller whether batched (deferred) TLB flushing should be used.
 * 'mm' is not consulted.
 */
static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm)
{
        /*
         * Deferral brings no benefit on systems affected by
         * ARM64_WORKAROUND_REPEAT_TLBI: there the __tlbi()/__tlbi_user()
         * implementation issues two consecutive TLBI instructions with a
         * dsb(ish) in between, which defeats the purpose of batching
         * (saving the overall 'dsb ish' cost).
         */
        return !alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI);
}

/*
 * If mprotect/munmap/etc occurs during TLB batched flushing, we need to
 * synchronise all the TLBI issued with a DSB to avoid the race mentioned in
 * flush_tlb_batched_pending().
 *
 * Only a dsb(ish) is issued; 'mm' is not used.
 */
static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm)
{
        dsb(ish);
}

/*
 * To support TLB batched flush for multiple pages unmapping, we only send
 * the TLBI for each page in arch_tlbbatch_add_pending() and wait for the
 * completion at the end in arch_tlbbatch_flush(). Since we have already
 * issued the TLBI for each page, only a DSB is needed to synchronise its
 * effect on the other CPUs.
 *
 * This saves the time spent waiting on a DSB, compared with issuing a
 * TLBI;DSB sequence for each page.
 *
 * 'batch' is not used.
 */
static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
        dsb(ish);
}

/*
 * This is meant to avoid soft lock-ups on large TLB flushing ranges and not
 * necessarily a performance improvement.
 *
 * __flush_tlb_range_limit_excess() compares the size of a range against
 * MAX_DVM_OPS * stride when TLB range operations are not supported.
 */
#define MAX_DVM_OPS        PTRS_PER_PTE

/*
 * __flush_tlb_range_op - Perform TLBI operation upon a range
 *
 * @op:        TLBI instruction that operates on a range (has 'r' prefix)
 * @start:        The start address of the range
 * @pages:        Range as the number of pages from 'start'
 * @stride:        Flush granularity
 * @asid:        The ASID of the task (0 for IPA instructions)
 * @tlb_level:        Translation Table level hint, if known
 * @tlbi_user:        If 'true', call an additional __tlbi_user()
 *              (typically for user ASIDs). 'false' for IPA instructions
 * @lpa2:        If 'true', the lpa2 scheme is used as set out below
 *
 * When the CPU does not support TLB range operations, flush the TLB
 * entries one by one at the granularity of 'stride'. If the TLB
 * range ops are supported, then:
 *
 * 1. If FEAT_LPA2 is in use, the start address of a range operation must be
 *    64KB aligned, so flush pages one by one until the alignment is reached
 *    using the non-range operations. This step is skipped if LPA2 is not in
 *    use.
 *
 * 2. The minimum range granularity is decided by 'scale', so multiple range
 *    TLBI operations may be required. Start from scale = 3, flush the largest
 *    possible number of pages ((num+1)*2^(5*scale+1)) that fit into the
 *    requested range, then decrement scale and continue until one or zero pages
 *    are left. We must start from highest scale to ensure 64KB start alignment
 *    is maintained in the LPA2 case.
 *
 * 3. If there is 1 page remaining, flush it through non-range operations. Range
 *    operations can only span an even number of pages. We save this for last to
 *    ensure 64KB start alignment is maintained for the LPA2 case.
 *
 * Note on macro arguments: 'start' and 'pages' are copied once into local
 * cursors, but 'stride', 'asid', 'tlb_level', 'tlbi_user' and 'lpa2' are
 * expanded at every use inside the loop, so they should be side-effect free.
 * The macro also declares the locals 'num', 'scale', 'shift' and 'addr'.
 */
#define __flush_tlb_range_op(op, start, pages, stride,                        \
                                asid, tlb_level, tlbi_user, lpa2)        \
do {                                                                        \
        typeof(start) __flush_start = start;                                \
        typeof(pages) __flush_pages = pages;                                \
        int num = 0;                                                        \
        int scale = 3;                                                        \
        int shift = lpa2 ? 16 : PAGE_SHIFT;                                \
        unsigned long addr;                                                \
                                                                        \
        while (__flush_pages > 0) {                                        \
                if (!system_supports_tlb_range() ||                        \
                    __flush_pages == 1 ||                                \
                    (lpa2 && __flush_start != ALIGN(__flush_start, SZ_64K))) {        \
                        addr = __TLBI_VADDR(__flush_start, asid);        \
                        __tlbi_level(op, addr, tlb_level);                \
                        if (tlbi_user)                                        \
                                __tlbi_user_level(op, addr, tlb_level);        \
                        __flush_start += stride;                        \
                        __flush_pages -= stride >> PAGE_SHIFT;                \
                        continue;                                        \
                }                                                        \
                                                                        \
                num = __TLBI_RANGE_NUM(__flush_pages, scale);                \
                if (num >= 0) {                                                \
                        addr = __TLBI_VADDR_RANGE(__flush_start >> shift, asid, \
                                                scale, num, tlb_level);        \
                        __tlbi(r##op, addr);                                \
                        if (tlbi_user)                                        \
                                __tlbi_user(r##op, addr);                \
                        __flush_start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT; \
                        __flush_pages -= __TLBI_RANGE_PAGES(num, scale);\
                }                                                        \
                scale--;                                                \
        }                                                                \
} while (0)

/*
 * 's2' flavour of __flush_tlb_range_op(): passes asid 0 and tlbi_user
 * 'false' (the values documented there for IPA instructions) and selects
 * the LPA2 scheme with kvm_lpa2_is_enabled().
 *
 * The definition deliberately has no trailing semicolon: the caller
 * supplies it, so the expansion stays a single statement and remains safe
 * in an unbraced if/else.
 */
#define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \
        __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled())

/*
 * Decide whether the range [start, end), spanning 'pages' pages and flushed
 * at granularity 'stride', is too large to invalidate piecewise.
 *
 * Returns true when the caller should fall back to a broader invalidation,
 * false when the range can be handled by __flush_tlb_range_op().
 */
static inline bool __flush_tlb_range_limit_excess(unsigned long start,
                unsigned long end, unsigned long pages, unsigned long stride)
{
        /* More than the range-based operations can cover. */
        if (pages > MAX_TLBI_RANGE_PAGES)
                return true;

        /* With range operations, MAX_TLBI_RANGE_PAGES is the only limit. */
        if (system_supports_tlb_range())
                return false;

        /*
         * Without range operations, at most (MAX_DVM_OPS - 1) strides are
         * invalidated one by one.
         */
        return (end - start) >= (MAX_DVM_OPS * stride);
}

/*
 * Issue the TLBIs for the user range [start, end) of 'mm' without the
 * closing DSB ISH. The range is widened to 'stride' boundaries first. If it
 * exceeds the limits checked by __flush_tlb_range_limit_excess(), the whole
 * address space is flushed with flush_tlb_mm() instead. 'last_level'
 * selects the vale1is operation, otherwise vae1is is used; 'tlb_level' is
 * passed on as the level hint.
 */
static inline void __flush_tlb_range_nosync(struct mm_struct *mm,
                                     unsigned long start, unsigned long end,
                                     unsigned long stride, bool last_level,
                                     int tlb_level)
{
        unsigned long nr_pages, mm_asid;

        start = round_down(start, stride);
        end = round_up(end, stride);
        nr_pages = (end - start) >> PAGE_SHIFT;

        if (__flush_tlb_range_limit_excess(start, end, nr_pages, stride)) {
                flush_tlb_mm(mm);
                return;
        }

        dsb(ishst);
        mm_asid = ASID(mm);

        if (!last_level)
                __flush_tlb_range_op(vae1is, start, nr_pages, stride, mm_asid,
                                     tlb_level, true, lpa2_is_enabled());
        else
                __flush_tlb_range_op(vale1is, start, nr_pages, stride, mm_asid,
                                     tlb_level, true, lpa2_is_enabled());

        mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}

/*
 * Synchronous wrapper around __flush_tlb_range_nosync(): forwards the
 * arguments for vma->vm_mm, then issues dsb(ish) to wait for the
 * invalidation to complete. The arguments are described under
 * __flush_tlb_range() in the header comment above.
 */
static inline void __flush_tlb_range(struct vm_area_struct *vma,
                                     unsigned long start, unsigned long end,
                                     unsigned long stride, bool last_level,
                                     int tlb_level)
{
        __flush_tlb_range_nosync(vma->vm_mm, start, end, stride,
                                 last_level, tlb_level);
        dsb(ish);
}

/*
 * Invalidate the virtual-address range [start, end) for the user address
 * space of 'vma', at PAGE_SIZE stride, including non-leaf entries and
 * without a level hint.
 */
static inline void flush_tlb_range(struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end)
{
        /*
         * We cannot use leaf-only invalidation here, since we may be invalidating
         * table entries as part of collapsing hugepages or moving page tables.
         * Set the tlb_level to TLBI_TTL_UNKNOWN because we can not get enough
         * information here.
         */
        __flush_tlb_range(vma, start, end, PAGE_SIZE, false, TLBI_TTL_UNKNOWN);
}

/*
 * Invalidate the kernel virtual-address range [start, end) at PAGE_SIZE
 * granularity using vaale1is with ASID 0 and no level hint. Ranges beyond
 * the limits checked by __flush_tlb_range_limit_excess() are handled with
 * flush_tlb_all() instead. The piecewise path ends with DSB ISH and ISB.
 */
static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
        unsigned long nr_pages;

        start = round_down(start, PAGE_SIZE);
        end = round_up(end, PAGE_SIZE);
        nr_pages = (end - start) >> PAGE_SHIFT;

        if (__flush_tlb_range_limit_excess(start, end, nr_pages, PAGE_SIZE)) {
                flush_tlb_all();
                return;
        }

        dsb(ishst);
        __flush_tlb_range_op(vaale1is, start, nr_pages, PAGE_SIZE, 0,
                             TLBI_TTL_UNKNOWN, false, lpa2_is_enabled());
        dsb(ish);
        isb();
}

/*
 * Used to invalidate the TLB (walk caches) corresponding to intermediate page
 * table levels (pgd/pud/pmd).
 *
 * Issues a single vaae1is TLBI for 'kaddr' with ASID 0, between DSB ISHST
 * and DSB ISH, followed by an ISB.
 */
static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr)
{
        unsigned long addr = __TLBI_VADDR(kaddr, 0);

        dsb(ishst);
        __tlbi(vaae1is, addr);
        dsb(ish);
        isb();
}

/*
 * Add the range [start, end) of 'mm' to a batched flush. The TLBIs are
 * issued right away through __flush_tlb_range_nosync() with a PAGE_SIZE
 * stride, last-level-only invalidation and a level hint of 3; completion is
 * awaited later by the DSB in arch_tlbbatch_flush(). 'batch' is not used.
 */
static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
                struct mm_struct *mm, unsigned long start, unsigned long end)
{
        __flush_tlb_range_nosync(mm, start, end, PAGE_SIZE, true, 3);
}
#endif

#endif




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 





















    4 
































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ext4

#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_EXT4_H

#include <linux/writeback.h>
#include <linux/tracepoint.h>

/*
 * Forward declarations of structures referred to by the tracepoint
 * prototypes and helper macros in this header.  Only the names are
 * introduced here; the full definitions are expected to come from the
 * file that includes this header.
 */
struct ext4_allocation_context;
struct ext4_allocation_request;
struct ext4_extent;
struct ext4_prealloc_space;
struct ext4_inode_info;
struct mpage_da_data;
struct ext4_map_blocks;
struct extent_status;
struct ext4_fsmap;
struct partial_cluster;

/*
 * Given a pointer to the vfs_inode member embedded in a
 * struct ext4_inode_info, yield a pointer to the enclosing
 * struct ext4_inode_info.
 */
#define EXT4_I(inode) \
        (container_of(inode, struct ext4_inode_info, vfs_inode))

/*
 * Decode a bitmask of EXT4_MB_* allocator flags into a "|"-separated list
 * of short names via __print_flags().  Names are emitted in the order the
 * entries appear in this table.
 */
#define show_mballoc_flags(flags) __print_flags(flags, "|",        \
        { EXT4_MB_HINT_MERGE,                "HINT_MERGE" },                \
        { EXT4_MB_HINT_RESERVED,        "HINT_RESV" },                \
        { EXT4_MB_HINT_METADATA,        "HINT_MDATA" },                \
        { EXT4_MB_HINT_FIRST,                "HINT_FIRST" },                \
        { EXT4_MB_HINT_BEST,                "HINT_BEST" },                \
        { EXT4_MB_HINT_DATA,                "HINT_DATA" },                \
        { EXT4_MB_HINT_NOPREALLOC,        "HINT_NOPREALLOC" },        \
        { EXT4_MB_HINT_GROUP_ALLOC,        "HINT_GRP_ALLOC" },        \
        { EXT4_MB_HINT_GOAL_ONLY,        "HINT_GOAL_ONLY" },        \
        { EXT4_MB_HINT_TRY_GOAL,        "HINT_TRY_GOAL" },        \
        { EXT4_MB_DELALLOC_RESERVED,        "DELALLOC_RESV" },        \
        { EXT4_MB_STREAM_ALLOC,                "STREAM_ALLOC" },        \
        { EXT4_MB_USE_ROOT_BLOCKS,        "USE_ROOT_BLKS" },        \
        { EXT4_MB_USE_RESERVED,                "USE_RESV" },                \
        { EXT4_MB_STRICT_CHECK,                "STRICT_CHECK" })

/*
 * Decode a bitmask of EXT4_GET_BLOCKS_* flags (plus EXT4_EX_NOCACHE) into a
 * "|"-separated list of short names via __print_flags().
 */
#define show_map_flags(flags) __print_flags(flags, "|",                        \
        { EXT4_GET_BLOCKS_CREATE,                "CREATE" },                \
        { EXT4_GET_BLOCKS_UNWRIT_EXT,                "UNWRIT" },                \
        { EXT4_GET_BLOCKS_DELALLOC_RESERVE,        "DELALLOC" },                \
        { EXT4_GET_BLOCKS_PRE_IO,                "PRE_IO" },                \
        { EXT4_GET_BLOCKS_CONVERT,                "CONVERT" },                \
        { EXT4_GET_BLOCKS_METADATA_NOFAIL,        "METADATA_NOFAIL" },        \
        { EXT4_GET_BLOCKS_NO_NORMALIZE,                "NO_NORMALIZE" },        \
        { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN,        "CONVERT_UNWRITTEN" },  \
        { EXT4_GET_BLOCKS_ZERO,                        "ZERO" },                \
        { EXT4_GET_BLOCKS_IO_SUBMIT,                "IO_SUBMIT" },                \
        { EXT4_EX_NOCACHE,                        "EX_NOCACHE" })

/*
 * __print_flags() requires that all enum values be wrapped in the
 * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace
 * ring buffer.
 *
 * NOTE(review): the EXT4_MAP_* masks decoded by show_mflags() are presumably
 * built from these BH_* bit numbers; confirm against their definitions.
 */
TRACE_DEFINE_ENUM(BH_New);
TRACE_DEFINE_ENUM(BH_Mapped);
TRACE_DEFINE_ENUM(BH_Unwritten);
TRACE_DEFINE_ENUM(BH_Boundary);

/*
 * Decode a bitmask of EXT4_MAP_* flags into single-letter codes.  The
 * delimiter is the empty string, so the letters are printed back to back
 * (N = new, M = mapped, U = unwritten, B = boundary).
 */
#define show_mflags(flags) __print_flags(flags, "",        \
        { EXT4_MAP_NEW,                "N" },                        \
        { EXT4_MAP_MAPPED,        "M" },                        \
        { EXT4_MAP_UNWRITTEN,        "U" },                        \
        { EXT4_MAP_BOUNDARY,        "B" })

/*
 * Decode a bitmask of EXT4_FREE_BLOCKS_* flags into a "|"-separated list of
 * short names via __print_flags().
 */
#define show_free_flags(flags) __print_flags(flags, "|",        \
        { EXT4_FREE_BLOCKS_METADATA,                "METADATA" },        \
        { EXT4_FREE_BLOCKS_FORGET,                "FORGET" },        \
        { EXT4_FREE_BLOCKS_VALIDATED,                "VALIDATED" },        \
        { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE,        "NO_QUOTA" },        \
        { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\
        { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER,        "LAST_CLUSTER" })

/*
 * Export the ES_*_B enum values through TRACE_DEFINE_ENUM.
 *
 * NOTE(review): the EXTENT_STATUS_* masks decoded by show_extent_status()
 * are presumably derived from these bit numbers; confirm against their
 * definitions.
 */
TRACE_DEFINE_ENUM(ES_WRITTEN_B);
TRACE_DEFINE_ENUM(ES_UNWRITTEN_B);
TRACE_DEFINE_ENUM(ES_DELAYED_B);
TRACE_DEFINE_ENUM(ES_HOLE_B);
TRACE_DEFINE_ENUM(ES_REFERENCED_B);

/*
 * Decode a bitmask of EXTENT_STATUS_* flags into single-letter codes.  The
 * delimiter is the empty string, so the letters are printed back to back
 * (W = written, U = unwritten, D = delayed, H = hole, R = referenced).
 */
#define show_extent_status(status) __print_flags(status, "",        \
        { EXTENT_STATUS_WRITTEN,        "W" },                        \
        { EXTENT_STATUS_UNWRITTEN,        "U" },                        \
        { EXTENT_STATUS_DELAYED,        "D" },                        \
        { EXTENT_STATUS_HOLE,                "H" },                        \
        { EXTENT_STATUS_REFERENCED,        "R" })

/*
 * Decode a fallocate() mode bitmask into a "|"-separated list of flag names
 * via __print_flags().
 *
 * FALLOC_FL_INSERT_RANGE is listed so that insert-range requests are shown
 * by name; a bit that has no entry in this table is only reported as a
 * residual hex value.  The entry is appended at the end so the text produced
 * for the previously listed flag combinations is unchanged.
 */
#define show_falloc_mode(mode) __print_flags(mode, "|",                \
        { FALLOC_FL_KEEP_SIZE,                "KEEP_SIZE"},                \
        { FALLOC_FL_PUNCH_HOLE,                "PUNCH_HOLE"},                \
        { FALLOC_FL_COLLAPSE_RANGE,        "COLLAPSE_RANGE"},        \
        { FALLOC_FL_ZERO_RANGE,                "ZERO_RANGE"},                \
        { FALLOC_FL_INSERT_RANGE,        "INSERT_RANGE"})

/*
 * Export the EXT4_FC_REASON_* enum values through TRACE_DEFINE_ENUM.  Every
 * value named in show_fc_reason() is exported here, plus EXT4_FC_REASON_MAX.
 */
TRACE_DEFINE_ENUM(EXT4_FC_REASON_XATTR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_CROSS_RENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_NOMEM);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_SWAP_BOOT);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RESIZE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_RENAME_DIR);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_FALLOC_RANGE);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_INODE_JOURNAL_DATA);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_ENCRYPTED_FILENAME);
TRACE_DEFINE_ENUM(EXT4_FC_REASON_MAX);

/*
 * Translate a single EXT4_FC_REASON_* value into its symbolic name via
 * __print_symbolic().  EXT4_FC_REASON_MAX has no entry in this table.
 */
#define show_fc_reason(reason)                                                \
        __print_symbolic(reason,                                        \
                { EXT4_FC_REASON_XATTR,                "XATTR"},                \
                { EXT4_FC_REASON_CROSS_RENAME,        "CROSS_RENAME"},        \
                { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \
                { EXT4_FC_REASON_NOMEM,        "NO_MEM"},                        \
                { EXT4_FC_REASON_SWAP_BOOT,        "SWAP_BOOT"},                \
                { EXT4_FC_REASON_RESIZE,        "RESIZE"},                \
                { EXT4_FC_REASON_RENAME_DIR,        "RENAME_DIR"},                \
                { EXT4_FC_REASON_FALLOC_RANGE,        "FALLOC_RANGE"},        \
                { EXT4_FC_REASON_INODE_JOURNAL_DATA,        "INODE_JOURNAL_DATA"}, \
                { EXT4_FC_REASON_ENCRYPTED_FILENAME,        "ENCRYPTED_FILENAME"})

/*
 * Export the CR_* enum values through TRACE_DEFINE_ENUM; these are the same
 * values that show_criteria() maps to names.
 */
TRACE_DEFINE_ENUM(CR_POWER2_ALIGNED);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_FAST);
TRACE_DEFINE_ENUM(CR_BEST_AVAIL_LEN);
TRACE_DEFINE_ENUM(CR_GOAL_LEN_SLOW);
TRACE_DEFINE_ENUM(CR_ANY_FREE);

/*
 * Translate a single CR_* value into a string identical to the enumerator's
 * own name via __print_symbolic().
 */
#define show_criteria(cr)                                               \
        __print_symbolic(cr,                                            \
                         { CR_POWER2_ALIGNED, "CR_POWER2_ALIGNED" },        \
                         { CR_GOAL_LEN_FAST, "CR_GOAL_LEN_FAST" },      \
                         { CR_BEST_AVAIL_LEN, "CR_BEST_AVAIL_LEN" },    \
                         { CR_GOAL_LEN_SLOW, "CR_GOAL_LEN_SLOW" },      \
                         { CR_ANY_FREE, "CR_ANY_FREE" })

/*
 * ext4_other_inode_update_time
 *
 * Records the device, inode number, mode, uid and gid of @inode together
 * with a second inode number, @orig_ino, supplied by the caller.
 */
TRACE_EVENT(ext4_other_inode_update_time,
        TP_PROTO(struct inode *inode, ino_t orig_ino),
        TP_ARGS(inode, orig_ino),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, orig_ino)
                __field(uid_t, uid)
                __field(gid_t, gid)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->orig_ino = orig_ino;
                __entry->mode = inode->i_mode;
                __entry->uid = i_uid_read(inode);
                __entry->gid = i_gid_read(inode);
        ),

        TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->orig_ino,
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid)
);

/*
 * ext4_free_inode
 *
 * Records the device, inode number, mode, uid, gid and i_blocks value of
 * @inode.
 */
TRACE_EVENT(ext4_free_inode,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(uid_t, uid)
                __field(gid_t, gid)
                __field(__u64, blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->mode = inode->i_mode;
                __entry->blocks = inode->i_blocks;
                __entry->uid = i_uid_read(inode);
                __entry->gid = i_gid_read(inode);
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->mode,
                  __entry->uid, __entry->gid, __entry->blocks)
);

/*
 * ext4_request_inode
 *
 * Records the device and inode number of the directory @dir and the
 * requested @mode.  @mode is an int but is stored in a 16-bit field.
 */
TRACE_EVENT(ext4_request_inode,
        TP_PROTO(struct inode *dir, int mode),
        TP_ARGS(dir, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, dir)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->mode = mode;
                __entry->dir = dir->i_ino;
                __entry->dev = dir->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * ext4_allocate_inode
 *
 * Records the device and inode number of @inode, the inode number of the
 * directory @dir, and @mode (an int stored in a 16-bit field).  The device
 * is taken from @inode, not from @dir.
 */
TRACE_EVENT(ext4_allocate_inode,
        TP_PROTO(struct inode *inode, struct inode *dir, int mode),
        TP_ARGS(inode, dir, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, dir)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->mode = mode;
                __entry->dir = dir->i_ino;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->dir, __entry->mode)
);

/*
 * ext4_evict_inode
 *
 * Records the device, inode number and i_nlink value of @inode.  The link
 * count is stored in an int field and printed with %d.
 */
TRACE_EVENT(ext4_evict_inode,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, nlink)
        ),

        TP_fast_assign(
                __entry->nlink = inode->i_nlink;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu nlink %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nlink)
);

/*
 * ext4_drop_inode
 *
 * Records the device and inode number of @inode together with the integer
 * @drop value passed in by the caller.
 */
TRACE_EVENT(ext4_drop_inode,
        TP_PROTO(struct inode *inode, int drop),
        TP_ARGS(inode, drop),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, drop)
        ),

        TP_fast_assign(
                __entry->drop = drop;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu drop %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->drop)
);

/*
 * ext4_nfs_commit_metadata
 *
 * Records only the device and inode number of @inode.
 */
TRACE_EVENT(ext4_nfs_commit_metadata,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
        ),

        TP_fast_assign(
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_mark_inode_dirty
 *
 * Records the device and inode number of @inode plus the @IP value supplied
 * by the caller, which is printed with %pS under the label "caller".
 */
TRACE_EVENT(ext4_mark_inode_dirty,
        TP_PROTO(struct inode *inode, unsigned long IP),
        TP_ARGS(inode, IP),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned long, ip)
        ),

        TP_fast_assign(
                __entry->ip = IP;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, (void *)__entry->ip)
);

/*
 * ext4_begin_ordered_truncate
 *
 * Records the device and inode number of @inode and the @new_size value
 * passed in by the caller.
 */
TRACE_EVENT(ext4_begin_ordered_truncate,
        TP_PROTO(struct inode *inode, loff_t new_size),
        TP_ARGS(inode, new_size),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, new_size)
        ),

        TP_fast_assign(
                __entry->new_size = new_size;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu new_size %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->new_size)
);

/*
 * ext4__write_begin - event class
 *
 * Shared layout for the write-begin events: the device and inode number of
 * @inode, the file position @pos and the length @len.
 */
DECLARE_EVENT_CLASS(ext4__write_begin,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),
        TP_ARGS(inode, pos, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, len)
        ),

        TP_fast_assign(
                __entry->pos = pos;
                __entry->len = len;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len)
);

/* ext4_write_begin: an event of class ext4__write_begin. */
DEFINE_EVENT(ext4__write_begin, ext4_write_begin,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),
        TP_ARGS(inode, pos, len)
);

/* ext4_da_write_begin: an event of class ext4__write_begin. */
DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin,
        TP_PROTO(struct inode *inode, loff_t pos, unsigned int len),
        TP_ARGS(inode, pos, len)
);

/*
 * ext4__write_end - event class
 *
 * Shared layout for the write-end events: the device and inode number of
 * @inode, the file position @pos, the length @len and the @copied count.
 */
DECLARE_EVENT_CLASS(ext4__write_end,
        TP_PROTO(struct inode *inode, loff_t pos,
                 unsigned int len, unsigned int copied),
        TP_ARGS(inode, pos, len, copied),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, len)
                __field(unsigned int, copied)
        ),

        TP_fast_assign(
                __entry->pos = pos;
                __entry->len = len;
                __entry->copied = copied;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos, __entry->len, __entry->copied)
);

/* ext4_write_end: an event of class ext4__write_end. */
DEFINE_EVENT(ext4__write_end, ext4_write_end,
        TP_PROTO(struct inode *inode, loff_t pos,
                 unsigned int len, unsigned int copied),
        TP_ARGS(inode, pos, len, copied)
);

/* ext4_journalled_write_end: an event of class ext4__write_end. */
DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end,
        TP_PROTO(struct inode *inode, loff_t pos,
                 unsigned int len, unsigned int copied),
        TP_ARGS(inode, pos, len, copied)
);

/* ext4_da_write_end: an event of class ext4__write_end. */
DEFINE_EVENT(ext4__write_end, ext4_da_write_end,
        TP_PROTO(struct inode *inode, loff_t pos,
                 unsigned int len, unsigned int copied),
        TP_ARGS(inode, pos, len, copied)
);

/*
 * ext4_writepages
 *
 * Records the device and inode number of @inode, the writeback_index of its
 * mapping, and these fields of @wbc: nr_to_write, pages_skipped,
 * range_start, range_end, sync_mode, for_kupdate and range_cyclic.
 * for_kupdate and range_cyclic are stored in char fields.
 */
TRACE_EVENT(ext4_writepages,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc),
        TP_ARGS(inode, wbc),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(loff_t, range_start)
                __field(loff_t, range_end)
                __field(pgoff_t, writeback_index)
                __field(int, sync_mode)
                __field(char, for_kupdate)
                __field(char, range_cyclic)
        ),

        TP_fast_assign(
                /* Values taken from the inode and its mapping. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->writeback_index = inode->i_mapping->writeback_index;

                /* Values taken from the writeback control. */
                __entry->nr_to_write = wbc->nr_to_write;
                __entry->pages_skipped = wbc->pages_skipped;
                __entry->range_start = wbc->range_start;
                __entry->range_end = wbc->range_end;
                __entry->sync_mode = wbc->sync_mode;
                __entry->for_kupdate = wbc->for_kupdate;
                __entry->range_cyclic = wbc->range_cyclic;
        ),

        TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld "
                  "range_start %lld range_end %lld sync_mode %d "
                  "for_kupdate %d range_cyclic %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->nr_to_write,
                  __entry->pages_skipped, __entry->range_start,
                  __entry->range_end, __entry->sync_mode,
                  __entry->for_kupdate, __entry->range_cyclic,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4_da_write_pages
 *
 * Records the device and inode number of @inode, the @first_page index
 * passed in by the caller, and the nr_to_write and sync_mode fields of @wbc.
 */
TRACE_EVENT(ext4_da_write_pages,
        TP_PROTO(struct inode *inode, pgoff_t first_page,
                 struct writeback_control *wbc),
        TP_ARGS(inode, first_page, wbc),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(pgoff_t, first_page)
                __field(long, nr_to_write)
                __field(int, sync_mode)
        ),

        TP_fast_assign(
                __entry->first_page = first_page;
                __entry->nr_to_write = wbc->nr_to_write;
                __entry->sync_mode = wbc->sync_mode;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld "
                  "sync_mode %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->first_page,
                  __entry->nr_to_write, __entry->sync_mode)
);

/*
 * ext4_da_write_pages_extent
 *
 * Records the device and inode number of @inode and the m_lblk, m_len and
 * m_flags members of @map.  The flags are rendered with show_mflags().
 */
TRACE_EVENT(ext4_da_write_pages_extent,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map),
        TP_ARGS(inode, map),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, lblk)
                __field(__u32, len)
                __field(__u32, flags)
        ),

        TP_fast_assign(
                __entry->lblk = map->m_lblk;
                __entry->len = map->m_len;
                __entry->flags = map->m_flags;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk, __entry->len,
                  show_mflags(__entry->flags))
);

/*
 * ext4_writepages_result
 *
 * Records the device and inode number of @inode, the caller-supplied @ret
 * and @pages_written values, the pages_skipped and sync_mode fields of @wbc,
 * and the writeback_index of the inode's mapping.
 */
TRACE_EVENT(ext4_writepages_result,
        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 int ret, int pages_written),
        TP_ARGS(inode, wbc, ret, pages_written),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
                __field(int, pages_written)
                __field(long, pages_skipped)
                __field(pgoff_t, writeback_index)
                __field(int, sync_mode)
        ),

        TP_fast_assign(
                /* Caller-supplied results. */
                __entry->ret = ret;
                __entry->pages_written = pages_written;

                /* Values taken from the writeback control. */
                __entry->pages_skipped = wbc->pages_skipped;
                __entry->sync_mode = wbc->sync_mode;

                /* Values taken from the inode and its mapping. */
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->writeback_index = inode->i_mapping->writeback_index;
        ),

        TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
                  "sync_mode %d writeback_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->ret,
                  __entry->pages_written, __entry->pages_skipped,
                  __entry->sync_mode,
                  (unsigned long) __entry->writeback_index)
);

/*
 * ext4__folio_op - event class
 *
 * Shared layout for folio events: the device and inode number of @inode and
 * the index of @folio.
 */
DECLARE_EVENT_CLASS(ext4__folio_op,
        TP_PROTO(struct inode *inode, struct folio *folio),
        TP_ARGS(inode, folio),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                __entry->index = folio->index;
                __entry->ino = inode->i_ino;
                __entry->dev = inode->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu folio_index %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index)
);

/* ext4_read_folio: an event of class ext4__folio_op. */
DEFINE_EVENT(ext4__folio_op, ext4_read_folio,
        TP_PROTO(struct inode *inode, struct folio *folio),
        TP_ARGS(inode, folio)
);

/* ext4_release_folio: an event of class ext4__folio_op. */
DEFINE_EVENT(ext4__folio_op, ext4_release_folio,
        TP_PROTO(struct inode *inode, struct folio *folio),
        TP_ARGS(inode, folio)
);

/*
 * ext4_invalidate_folio_op - event class
 *
 * Shared layout for folio-invalidation events.  The device and inode number
 * are taken from the inode reached through folio->mapping->host; the folio
 * index and the caller-supplied @offset and @length are recorded with them.
 */
DECLARE_EVENT_CLASS(ext4_invalidate_folio_op,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),
        TP_ARGS(folio, offset, length),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(pgoff_t, index)
                __field(size_t, offset)
                __field(size_t, length)
        ),

        TP_fast_assign(
                __entry->offset = offset;
                __entry->length = length;
                __entry->index = folio->index;
                __entry->ino = folio->mapping->host->i_ino;
                __entry->dev = folio->mapping->host->i_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu folio_index %lu offset %zu length %zu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->index,
                  __entry->offset, __entry->length)
);

/* ext4_invalidate_folio: an event of class ext4_invalidate_folio_op. */
DEFINE_EVENT(ext4_invalidate_folio_op, ext4_invalidate_folio,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),
        TP_ARGS(folio, offset, length)
);

/*
 * ext4_journalled_invalidate_folio: an event of class
 * ext4_invalidate_folio_op.
 */
DEFINE_EVENT(ext4_invalidate_folio_op, ext4_journalled_invalidate_folio,
        TP_PROTO(struct folio *folio, size_t offset, size_t length),
        TP_ARGS(folio, offset, length)
);

/*
 * ext4_discard_blocks
 *
 * Records the device of @sb together with the caller-supplied @blk and
 * @count values.
 */
TRACE_EVENT(ext4_discard_blocks,
        TP_PROTO(struct super_block *sb, unsigned long long blk,
                 unsigned long long count),
        TP_ARGS(sb, blk, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u64, blk)
                __field(__u64, count)
        ),

        TP_fast_assign(
                __entry->blk = blk;
                __entry->count = count;
                __entry->dev = sb->s_dev;
        ),

        TP_printk("dev %d,%d blk %llu count %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blk, __entry->count)
);

/*
 * ext4__mb_new_pa - event class
 *
 * Shared layout for new-preallocation events.  The device comes from
 * ac->ac_sb and the inode number from ac->ac_inode; pa_pstart, pa_lstart and
 * pa_len are copied from @pa.  The text output prints pstart, then len, then
 * lstart.
 */
DECLARE_EVENT_CLASS(ext4__mb_new_pa,
        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),
        TP_ARGS(ac, pa),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, pa_pstart)
                __field(__u64, pa_lstart)
                __field(__u32, pa_len)
        ),

        TP_fast_assign(
                __entry->pa_pstart = pa->pa_pstart;
                __entry->pa_lstart = pa->pa_lstart;
                __entry->pa_len = pa->pa_len;
                __entry->ino = ac->ac_inode->i_ino;
                __entry->dev = ac->ac_sb->s_dev;
        ),

        TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart)
);

/* ext4_mb_new_inode_pa: tracepoint instance of the ext4__mb_new_pa class. */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa,
        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),
        TP_ARGS(ac, pa)
);

/* ext4_mb_new_group_pa: tracepoint instance of the ext4__mb_new_pa class. */
DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa,
        TP_PROTO(struct ext4_allocation_context *ac,
                 struct ext4_prealloc_space *pa),
        TP_ARGS(ac, pa)
);

/*
 * ext4_mb_release_inode_pa: records the device and inode number reached
 * through @pa->pa_inode, plus the @block and @count arguments.
 */
TRACE_EVENT(ext4_mb_release_inode_pa,

        TP_PROTO(struct ext4_prealloc_space *pa,
                 unsigned long long block,
                 unsigned int count),

        TP_ARGS(pa, block, count),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(__u32, count)
        ),

        TP_fast_assign(
                __entry->dev = pa->pa_inode->i_sb->s_dev;
                __entry->ino = pa->pa_inode->i_ino;
                __entry->block = block;
                __entry->count = count;
        ),

        TP_printk("dev %d,%d ino %lu block %llu count %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->block,
                  __entry->count)
);

/*
 * ext4_mb_release_group_pa: records the device of @sb and the physical
 * start and length stored in @pa.
 */
TRACE_EVENT(ext4_mb_release_group_pa,

        TP_PROTO(struct super_block *sb,
                 struct ext4_prealloc_space *pa),

        TP_ARGS(sb, pa),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u64, pa_pstart)
                __field(__u32, pa_len)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->pa_pstart = pa->pa_pstart;
                __entry->pa_len = pa->pa_len;
        ),

        TP_printk("dev %d,%d pstart %llu len %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->pa_pstart,
                  __entry->pa_len)
);

/*
 * ext4_discard_preallocations: records the device and inode number of
 * @inode together with the @len argument.
 */
TRACE_EVENT(ext4_discard_preallocations,

        TP_PROTO(struct inode *inode,
                 unsigned int len),

        TP_ARGS(inode, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, len)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->len = len;
        ),

        TP_printk("dev %d,%d ino %lu len: %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->len)
);

/*
 * ext4_mb_discard_preallocations: records the device of @sb and the
 * @needed argument.
 */
TRACE_EVENT(ext4_mb_discard_preallocations,

        TP_PROTO(struct super_block *sb,
                 int needed),

        TP_ARGS(sb, needed),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, needed)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->needed = needed;
        ),

        TP_printk("dev %d,%d needed %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->needed)
);

/*
 * ext4_request_blocks: snapshot of an allocation request.
 *
 * Every recorded value is copied from @ar; the device and inode number are
 * reached through @ar->inode. Flags are rendered with show_mballoc_flags().
 * The format string ends with a trailing space.
 */
TRACE_EVENT(ext4_request_blocks,

        TP_PROTO(struct ext4_allocation_request *ar),

        TP_ARGS(ar),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, len)
                __field(__u32, logical)
                __field(__u32, lleft)
                __field(__u32, lright)
                __field(__u64, goal)
                __field(__u64, pleft)
                __field(__u64, pright)
                __field(unsigned int, flags)
        ),

        TP_fast_assign(
                /* Filled in the same order as the fields are declared. */
                __entry->dev = ar->inode->i_sb->s_dev;
                __entry->ino = ar->inode->i_ino;
                __entry->len = ar->len;
                __entry->logical = ar->logical;
                __entry->lleft = ar->lleft;
                __entry->lright = ar->lright;
                __entry->goal = ar->goal;
                __entry->pleft = ar->pleft;
                __entry->pright = ar->pright;
                __entry->flags = ar->flags;
        ),

        TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu "
                  "lleft %u lright %u pleft %llu pright %llu ",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_mballoc_flags(__entry->flags),
                  __entry->len,
                  __entry->logical,
                  __entry->goal,
                  __entry->lleft,
                  __entry->lright,
                  __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_allocate_blocks: records the same request fields as copied from @ar
 * (device and inode via @ar->inode) plus the @block argument. Flags are
 * rendered with show_mballoc_flags().
 */
TRACE_EVENT(ext4_allocate_blocks,

        TP_PROTO(struct ext4_allocation_request *ar,
                 unsigned long long block),

        TP_ARGS(ar, block),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(unsigned int, len)
                __field(__u32, logical)
                __field(__u32, lleft)
                __field(__u32, lright)
                __field(__u64, goal)
                __field(__u64, pleft)
                __field(__u64, pright)
                __field(unsigned int, flags)
        ),

        TP_fast_assign(
                /* Filled in the same order as the fields are declared. */
                __entry->dev = ar->inode->i_sb->s_dev;
                __entry->ino = ar->inode->i_ino;
                __entry->block = block;
                __entry->len = ar->len;
                __entry->logical = ar->logical;
                __entry->lleft = ar->lleft;
                __entry->lright = ar->lright;
                __entry->goal = ar->goal;
                __entry->pleft = ar->pleft;
                __entry->pright = ar->pright;
                __entry->flags = ar->flags;
        ),

        TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u "
                  "goal %llu lleft %u lright %u pleft %llu pright %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_mballoc_flags(__entry->flags),
                  __entry->len,
                  __entry->block,
                  __entry->logical,
                  __entry->goal,
                  __entry->lleft,
                  __entry->lright,
                  __entry->pleft,
                  __entry->pright)
);

/*
 * ext4_free_blocks: records the device, inode number and mode of @inode
 * together with the @block, @count and @flags arguments. Flags are rendered
 * with show_free_flags(); the mode is printed in octal.
 */
TRACE_EVENT(ext4_free_blocks,

        TP_PROTO(struct inode *inode,
                 __u64 block,
                 unsigned long count,
                 int flags),

        TP_ARGS(inode, block, count, flags),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(unsigned long, count)
                __field(int, flags)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->block = block;
                __entry->count = count;
                __entry->flags = flags;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->block,
                  __entry->count,
                  show_free_flags(__entry->flags))
);

/*
 * ext4_sync_file_enter: records, for the dentry behind @file, its device,
 * its inode number and the inode number of its parent dentry, plus the
 * @datasync argument. The format string ends with a trailing space.
 */
TRACE_EVENT(ext4_sync_file_enter,

        TP_PROTO(struct file *file,
                 int datasync),

        TP_ARGS(file, datasync),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, parent)
                __field(int, datasync)
        ),

        TP_fast_assign(
                struct dentry *de = file->f_path.dentry;

                __entry->dev = de->d_sb->s_dev;
                __entry->ino = d_inode(de)->i_ino;
                __entry->parent = d_inode(de->d_parent)->i_ino;
                __entry->datasync = datasync;
        ),

        TP_printk("dev %d,%d ino %lu parent %lu datasync %d ",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long) __entry->parent,
                  __entry->datasync)
);

/*
 * ext4_sync_file_exit: records the device and inode number of @inode and
 * the @ret argument.
 */
TRACE_EVENT(ext4_sync_file_exit,

        TP_PROTO(struct inode *inode,
                 int ret),

        TP_ARGS(inode, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/* ext4_sync_fs: records the device of @sb and the @wait argument. */
TRACE_EVENT(ext4_sync_fs,

        TP_PROTO(struct super_block *sb,
                 int wait),

        TP_ARGS(sb, wait),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, wait)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->wait = wait;
        ),

        TP_printk("dev %d,%d wait %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->wait)
);

/*
 * ext4_alloc_da_blocks: records the device and inode number of @inode and
 * the i_reserved_data_blocks value read from EXT4_I(@inode).
 */
TRACE_EVENT(ext4_alloc_da_blocks,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(unsigned int, data_blocks)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->data_blocks =
                                EXT4_I(inode)->i_reserved_data_blocks;
        ),

        TP_printk("dev %d,%d ino %lu reserved_data_blocks %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->data_blocks)
);

/*
 * ext4_mballoc_alloc: snapshot of an allocation context @ac.
 *
 * Three extents are recorded as logical/start/group/len tuples:
 *   orig_*   from ac->ac_o_ex
 *   goal_*   from ac->ac_g_ex
 *   result_* from ac->ac_f_ex
 * followed by the ac_found, ac_groups_scanned, ac_buddy, ac_flags, ac_tail
 * and ac_criteria members. Each extent is printed as group/start/len@logical.
 * The final "broken" value is 1 << buddy when buddy is non-zero, else 0.
 */
TRACE_EVENT(ext4_mballoc_alloc,

        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u32, orig_logical)
                __field(int, orig_start)
                __field(__u32, orig_group)
                __field(int, orig_len)
                __field(__u32, goal_logical)
                __field(int, goal_start)
                __field(__u32, goal_group)
                __field(int, goal_len)
                __field(__u32, result_logical)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
                __field(__u16, found)
                __field(__u16, groups)
                __field(__u16, buddy)
                __field(__u16, flags)
                __field(__u16, tail)
                __field(__u8, cr)
        ),

        TP_fast_assign(
                __entry->dev = ac->ac_inode->i_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Original extent. */
                __entry->orig_logical = ac->ac_o_ex.fe_logical;
                __entry->orig_start = ac->ac_o_ex.fe_start;
                __entry->orig_group = ac->ac_o_ex.fe_group;
                __entry->orig_len = ac->ac_o_ex.fe_len;

                /* Goal extent. */
                __entry->goal_logical = ac->ac_g_ex.fe_logical;
                __entry->goal_start = ac->ac_g_ex.fe_start;
                __entry->goal_group = ac->ac_g_ex.fe_group;
                __entry->goal_len = ac->ac_g_ex.fe_len;

                /* Result extent (taken from ac_f_ex). */
                __entry->result_logical = ac->ac_f_ex.fe_logical;
                __entry->result_start = ac->ac_f_ex.fe_start;
                __entry->result_group = ac->ac_f_ex.fe_group;
                __entry->result_len = ac->ac_f_ex.fe_len;

                /* Scalar members, in field declaration order. */
                __entry->found = ac->ac_found;
                __entry->groups = ac->ac_groups_scanned;
                __entry->buddy = ac->ac_buddy;
                __entry->flags = ac->ac_flags;
                __entry->tail = ac->ac_tail;
                __entry->cr = ac->ac_criteria;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u "
                  "result %u/%d/%u@%u blks %u grps %u cr %s flags %s "
                  "tail %u broken %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group,
                  __entry->orig_start,
                  __entry->orig_len,
                  __entry->orig_logical,
                  __entry->goal_group,
                  __entry->goal_start,
                  __entry->goal_len,
                  __entry->goal_logical,
                  __entry->result_group,
                  __entry->result_start,
                  __entry->result_len,
                  __entry->result_logical,
                  __entry->found,
                  __entry->groups,
                  show_criteria(__entry->cr),
                  show_mballoc_flags(__entry->flags),
                  __entry->tail,
                  __entry->buddy ? 1 << __entry->buddy : 0)
);

/*
 * ext4_mballoc_prealloc: records two extents from the allocation context
 * @ac as logical/start/group/len tuples:
 *   orig_*   from ac->ac_o_ex
 *   result_* from ac->ac_b_ex
 * Each extent is printed as group/start/len@logical.
 */
TRACE_EVENT(ext4_mballoc_prealloc,

        TP_PROTO(struct ext4_allocation_context *ac),

        TP_ARGS(ac),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u32, orig_logical)
                __field(int, orig_start)
                __field(__u32, orig_group)
                __field(int, orig_len)
                __field(__u32, result_logical)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
        ),

        TP_fast_assign(
                __entry->dev = ac->ac_inode->i_sb->s_dev;
                __entry->ino = ac->ac_inode->i_ino;

                /* Original extent. */
                __entry->orig_logical = ac->ac_o_ex.fe_logical;
                __entry->orig_start = ac->ac_o_ex.fe_start;
                __entry->orig_group = ac->ac_o_ex.fe_group;
                __entry->orig_len = ac->ac_o_ex.fe_len;

                /* Result extent (taken from ac_b_ex). */
                __entry->result_logical = ac->ac_b_ex.fe_logical;
                __entry->result_start = ac->ac_b_ex.fe_start;
                __entry->result_group = ac->ac_b_ex.fe_group;
                __entry->result_len = ac->ac_b_ex.fe_len;
        ),

        TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->orig_group,
                  __entry->orig_start,
                  __entry->orig_len,
                  __entry->orig_logical,
                  __entry->result_group,
                  __entry->result_start,
                  __entry->result_len,
                  __entry->result_logical)
);

/*
 * ext4__mballoc: event class recording a group/start/len extent.
 *
 * @inode may be NULL, in which case the inode number is recorded as 0.
 * The format string ends with a trailing space.
 */
DECLARE_EVENT_CLASS(ext4__mballoc,

        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),

        TP_ARGS(sb, inode, group, start, len),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, result_start)
                __field(__u32, result_group)
                __field(int, result_len)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                /* Tolerate a missing inode. */
                __entry->ino = inode ? inode->i_ino : 0;
                __entry->result_start = start;
                __entry->result_group = group;
                __entry->result_len = len;
        ),

        TP_printk("dev %d,%d inode %lu extent %u/%d/%d ",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->result_group,
                  __entry->result_start,
                  __entry->result_len)
);

/* ext4_mballoc_discard: tracepoint instance of the ext4__mballoc class. */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),
        TP_ARGS(sb, inode, group, start, len)
);

/* ext4_mballoc_free: tracepoint instance of the ext4__mballoc class. */
DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free,
        TP_PROTO(struct super_block *sb, struct inode *inode,
                 ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len),
        TP_ARGS(sb, inode, group, start, len)
);

/*
 * ext4_forget: records the device, inode number and mode of @inode along
 * with the @is_metadata and @block arguments. The mode is printed in octal.
 */
TRACE_EVENT(ext4_forget,

        TP_PROTO(struct inode *inode,
                 int is_metadata,
                 __u64 block),

        TP_ARGS(inode, is_metadata, block),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, block)
                __field(int, is_metadata)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->block = block;
                __entry->is_metadata = is_metadata;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->is_metadata,
                  __entry->block)
);

/*
 * ext4_da_update_reserve_space: records the device, inode number, mode and
 * i_blocks of @inode, the i_reserved_data_blocks value read from
 * EXT4_I(@inode), and the @used_blocks and @quota_claim arguments.
 */
TRACE_EVENT(ext4_da_update_reserve_space,

        TP_PROTO(struct inode *inode,
                 int used_blocks,
                 int quota_claim),

        TP_ARGS(inode, used_blocks, quota_claim),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, used_blocks)
                __field(int, reserved_data_blocks)
                __field(int, quota_claim)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->used_blocks = used_blocks;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->quota_claim = quota_claim;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d "
                  "reserved_data_blocks %d quota_claim %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->i_blocks,
                  __entry->used_blocks,
                  __entry->reserved_data_blocks,
                  __entry->quota_claim)
);

/*
 * ext4_da_reserve_space: records the device, inode number, mode and
 * i_blocks of @inode, the @nr_resv argument (as reserve_blocks), and the
 * i_reserved_data_blocks value read from EXT4_I(@inode).
 */
TRACE_EVENT(ext4_da_reserve_space,
        TP_PROTO(struct inode *inode, int nr_resv),

        TP_ARGS(inode, nr_resv),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        __u64,        i_blocks                )
                __field(        int,        reserve_blocks                )
                __field(        int,        reserved_data_blocks        )
                __field(        __u16,  mode                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->reserve_blocks = nr_resv;
                __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode        = inode->i_mode;
        ),

        /*
         * The two literals below are concatenated by the compiler, so the
         * first must end in a space; without it the output runs the
         * reserve_blocks value straight into "reserved_data_blocks".
         */
        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu reserve_blocks %d "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode, __entry->i_blocks,
                  __entry->reserve_blocks, __entry->reserved_data_blocks)
);

/*
 * ext4_da_release_space: records the device, inode number, mode and
 * i_blocks of @inode, the @freed_blocks argument, and the
 * i_reserved_data_blocks value read from EXT4_I(@inode).
 */
TRACE_EVENT(ext4_da_release_space,

        TP_PROTO(struct inode *inode,
                 int freed_blocks),

        TP_ARGS(inode, freed_blocks),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(__u64, i_blocks)
                __field(int, freed_blocks)
                __field(int, reserved_data_blocks)
                __field(__u16, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->i_blocks = inode->i_blocks;
                __entry->freed_blocks = freed_blocks;
                __entry->reserved_data_blocks =
                                EXT4_I(inode)->i_reserved_data_blocks;
                __entry->mode = inode->i_mode;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d "
                  "reserved_data_blocks %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->mode,
                  __entry->i_blocks,
                  __entry->freed_blocks,
                  __entry->reserved_data_blocks)
);

/*
 * ext4__bitmap_load: event class recording the device of @sb and a group
 * number. @group arrives as unsigned long and is stored in a __u32 field.
 */
DECLARE_EVENT_CLASS(ext4__bitmap_load,

        TP_PROTO(struct super_block *sb,
                 unsigned long group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->group = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->group)
);

/* ext4_mb_bitmap_load: tracepoint instance of the ext4__bitmap_load class. */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),
        TP_ARGS(sb, group)
);

/*
 * ext4_mb_buddy_bitmap_load: tracepoint instance of the ext4__bitmap_load
 * class.
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),
        TP_ARGS(sb, group)
);

/*
 * ext4_load_inode_bitmap: tracepoint instance of the ext4__bitmap_load
 * class.
 */
DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap,
        TP_PROTO(struct super_block *sb,
                 unsigned long group),
        TP_ARGS(sb, group)
);

/*
 * ext4_read_block_bitmap_load: records the device of @sb, the @group number
 * (stored in a __u32 field) and the @prefetch flag, which is printed as an
 * integer.
 */
TRACE_EVENT(ext4_read_block_bitmap_load,

        TP_PROTO(struct super_block *sb,
                 unsigned long group,
                 bool prefetch),

        TP_ARGS(sb, group, prefetch),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(__u32, group)
                __field(bool, prefetch)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->group = group;
                __entry->prefetch = prefetch;
        ),

        TP_printk("dev %d,%d group %u prefetch %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  __entry->group,
                  __entry->prefetch)
);

/*
 * ext4__fallocate_mode: event class recording the device and inode number
 * of @inode plus the @offset, @len and @mode arguments. The mode is
 * rendered with show_falloc_mode().
 */
DECLARE_EVENT_CLASS(ext4__fallocate_mode,

        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),

        TP_ARGS(inode, offset, len, mode),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, offset)
                __field(loff_t, len)
                __field(int, mode)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->offset = offset;
                __entry->len = len;
                __entry->mode = mode;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset,
                  __entry->len,
                  show_falloc_mode(__entry->mode))
);

/*
 * ext4_fallocate_enter: tracepoint instance of the ext4__fallocate_mode
 * class.
 */
DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),
        TP_ARGS(inode, offset, len, mode)
);

/* ext4_punch_hole: tracepoint instance of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),
        TP_ARGS(inode, offset, len, mode)
);

/* ext4_zero_range: tracepoint instance of the ext4__fallocate_mode class. */
DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range,
        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 loff_t len,
                 int mode),
        TP_ARGS(inode, offset, len, mode)
);

/*
 * ext4_fallocate_exit: records the device and inode number of @inode,
 * @offset (stored and printed as "pos"), @max_blocks (as "blocks") and
 * the @ret argument.
 */
TRACE_EVENT(ext4_fallocate_exit,

        TP_PROTO(struct inode *inode,
                 loff_t offset,
                 unsigned int max_blocks,
                 int ret),

        TP_ARGS(inode, offset, max_blocks, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(loff_t, pos)
                __field(unsigned int, blocks)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = inode->i_sb->s_dev;
                __entry->ino = inode->i_ino;
                __entry->pos = offset;
                __entry->blocks = max_blocks;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->pos,
                  __entry->blocks,
                  __entry->ret)
);

/*
 * ext4_unlink_enter: records the device of @dentry, the inode number and
 * size of the inode behind @dentry, and the inode number of @parent.
 */
TRACE_EVENT(ext4_unlink_enter,

        TP_PROTO(struct inode *parent,
                 struct dentry *dentry),

        TP_ARGS(parent, dentry),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(ino_t, parent)
                __field(loff_t, size)
        ),

        TP_fast_assign(
                __entry->dev = dentry->d_sb->s_dev;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->parent = parent->i_ino;
                __entry->size = d_inode(dentry)->i_size;
        ),

        TP_printk("dev %d,%d ino %lu size %lld parent %lu",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->size,
                  (unsigned long) __entry->parent)
);

/*
 * ext4_unlink_exit: records the device of @dentry, the inode number of the
 * inode behind it, and the @ret argument.
 */
TRACE_EVENT(ext4_unlink_exit,

        TP_PROTO(struct dentry *dentry,
                 int ret),

        TP_ARGS(dentry, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(ino_t, ino)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->dev = dentry->d_sb->s_dev;
                __entry->ino = d_inode(dentry)->i_ino;
                __entry->ret = ret;
        ),

        TP_printk("dev %d,%d ino %lu ret %d",
                  MAJOR(__entry->dev),
                  MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->ret)
);

/*
 * ext4__truncate - event class shared by the truncate tracepoints.
 *
 * Takes a single inode and records its device, inode number and the
 * inode's i_blocks value (stored as a 64-bit quantity and printed with
 * %llu).
 */
DECLARE_EVENT_CLASS(ext4__truncate,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        __u64,                blocks                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->blocks        = inode->i_blocks;
        ),

        TP_printk("dev %d,%d ino %lu blocks %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->blocks)
);

/* ext4_truncate_enter: instance of the ext4__truncate class; takes the inode. */
DEFINE_EVENT(ext4__truncate, ext4_truncate_enter,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* ext4_truncate_exit: instance of the ext4__truncate class; takes the inode. */
DEFINE_EVENT(ext4__truncate, ext4_truncate_exit,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * ext4_ext_convert_to_initialized_enter - tracepoint taking an inode, a
 * block-mapping request and an extent.
 *
 * 'ux' is the unwritten extent.
 *
 * Records the inode's device and number, the requested range from @map
 * (m_lblk, m_len), and for 'ux' its logical start (ee_block converted with
 * le32_to_cpu()), its length as returned by ext4_ext_get_actual_len() and
 * its physical start as returned by ext4_ext_pblock().
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_enter,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux),

        TP_ARGS(inode, map, ux),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
        ),

        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u "
                  "u_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk)
);

/*
 * ext4_ext_convert_to_initialized_fastpath - tracepoint taking an inode, a
 * block-mapping request and two extents.
 *
 * 'ux' is the unwritten extent.
 * 'ix' is the initialized extent to which blocks are transferred.
 *
 * Records the inode's device and number, the requested range from @map
 * (m_lblk, m_len), and for each of 'ux' and 'ix' the logical start
 * (ee_block converted with le32_to_cpu()), the length as returned by
 * ext4_ext_get_actual_len() and the physical start as returned by
 * ext4_ext_pblock().
 */
TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map,
                 struct ext4_extent *ux, struct ext4_extent *ix),

        TP_ARGS(inode, map, ux, ix),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        m_lblk        )
                __field(        unsigned,        m_len        )
                __field(        ext4_lblk_t,        u_lblk        )
                __field(        unsigned,        u_len        )
                __field(        ext4_fsblk_t,        u_pblk        )
                __field(        ext4_lblk_t,        i_lblk        )
                __field(        unsigned,        i_len        )
                __field(        ext4_fsblk_t,        i_pblk        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->m_lblk                = map->m_lblk;
                __entry->m_len                = map->m_len;
                __entry->u_lblk                = le32_to_cpu(ux->ee_block);
                __entry->u_len                = ext4_ext_get_actual_len(ux);
                __entry->u_pblk                = ext4_ext_pblock(ux);
                __entry->i_lblk                = le32_to_cpu(ix->ee_block);
                __entry->i_len                = ext4_ext_get_actual_len(ix);
                __entry->i_pblk                = ext4_ext_pblock(ix);
        ),

        /* The last fragment carries no trailing blank: it ends the line. */
        TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u "
                  "u_lblk %u u_len %u u_pblk %llu "
                  "i_lblk %u i_len %u i_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->m_lblk, __entry->m_len,
                  __entry->u_lblk, __entry->u_len, __entry->u_pblk,
                  __entry->i_lblk, __entry->i_len, __entry->i_pblk)
);

/*
 * ext4__map_blocks_enter - event class for the *_map_blocks_enter
 * tracepoints.
 *
 * Takes an inode, a starting logical block, a length and a flags word.
 * All four are recorded as passed, together with the inode's device.
 * The flags are rendered symbolically through show_map_flags().
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned int len, unsigned int flags),

        TP_ARGS(inode, lblk, len, flags),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        flags                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len, show_map_flags(__entry->flags))
);

/* ext4_ext_map_blocks_enter: instance of the ext4__map_blocks_enter class. */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/* ext4_ind_map_blocks_enter: instance of the ext4__map_blocks_enter class. */
DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk,
                 unsigned len, unsigned flags),

        TP_ARGS(inode, lblk, len, flags)
);

/*
 * ext4__map_blocks_exit - event class for the *_map_blocks_exit
 * tracepoints.
 *
 * Takes an inode, the request flags, the map structure and an integer
 * result.  Records the inode's device and number, @flags, the map's
 * m_pblk, m_lblk, m_len and m_flags, and @ret.  The request flags are
 * rendered through show_map_flags() and the map's own flags through
 * show_mflags().
 */
DECLARE_EVENT_CLASS(ext4__map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map,
                 int ret),

        TP_ARGS(inode, flags, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        unsigned int,        flags                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        mflags                )
                __field(        int,                ret                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->flags        = flags;
                __entry->pblk        = map->m_pblk;
                __entry->lblk        = map->m_lblk;
                __entry->len        = map->m_len;
                __entry->mflags        = map->m_flags;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u "
                  "mflags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  show_map_flags(__entry->flags), __entry->lblk, __entry->pblk,
                  __entry->len, show_mflags(__entry->mflags), __entry->ret)
);

/* ext4_ext_map_blocks_exit: instance of the ext4__map_blocks_exit class. */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret)
);

/* ext4_ind_map_blocks_exit: instance of the ext4__map_blocks_exit class. */
DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit,
        TP_PROTO(struct inode *inode, unsigned flags,
                 struct ext4_map_blocks *map, int ret),

        TP_ARGS(inode, flags, map, ret)
);

/*
 * ext4_ext_load_extent - tracepoint taking an inode, a logical block and a
 * physical block.
 *
 * Records the inode's device and number plus @lblk and @pblk as passed.
 * The record stores pblk before lblk, while the printed line shows lblk
 * first.
 */
TRACE_EVENT(ext4_ext_load_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk),

        TP_ARGS(inode, lblk, pblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev    = inode->i_sb->s_dev;
                __entry->ino    = inode->i_ino;
                __entry->pblk        = pblk;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->pblk)
);

/*
 * ext4_load_inode - tracepoint taking a superblock and an inode number.
 *
 * Records sb->s_dev and @ino; no inode structure is dereferenced, only the
 * number supplied by the caller is stored.
 */
TRACE_EVENT(ext4_load_inode,
        TP_PROTO(struct super_block *sb, unsigned long ino),

        TP_ARGS(sb, ino),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                )
                __field(        ino_t,        ino                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->ino                = ino;
        ),

        /*
         * The argument is cast to unsigned long, so the conversion must be
         * %lu; a signed conversion would show large inode numbers as
         * negative values.
         */
        TP_printk("dev %d,%d ino %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino)
);

/*
 * ext4_journal_start_sb - tracepoint taking a superblock, three integer
 * counts (blocks, rsv_blocks, revoke_creds), a type and a caller address.
 *
 * Records sb->s_dev and every scalar argument as passed.  @IP is stored as
 * an unsigned long and printed with %pS after a cast to a pointer.
 */
TRACE_EVENT(ext4_journal_start_sb,
        TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks,
                 int revoke_creds, int type, unsigned long IP),

        TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, type, IP),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
                __field(        int,                rsv_blocks        )
                __field(        int,                revoke_creds        )
                __field(        int,                type                )
        ),

        TP_fast_assign(
                __entry->dev                 = sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
                __entry->rsv_blocks         = rsv_blocks;
                __entry->revoke_creds         = revoke_creds;
                __entry->type                 = type;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d,"
                  " type %d, caller %pS", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks,
                  __entry->revoke_creds, __entry->type, (void *)__entry->ip)
);

/*
 * ext4_journal_start_inode - tracepoint taking an inode, three integer
 * counts (blocks, rsv_blocks, revoke_creds), a type and a caller address.
 *
 * Same payload as ext4_journal_start_sb plus the inode number; the device
 * is taken from the inode's superblock.  Here the inode number is stored
 * as a plain unsigned long (not ino_t) and therefore printed without a
 * cast.
 */
TRACE_EVENT(ext4_journal_start_inode,
        TP_PROTO(struct inode *inode, int blocks, int rsv_blocks,
                 int revoke_creds, int type, unsigned long IP),

        TP_ARGS(inode, blocks, rsv_blocks, revoke_creds, type, IP),

        TP_STRUCT__entry(
                __field(        unsigned long,        ino                )
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
                __field(        int,                rsv_blocks        )
                __field(        int,                revoke_creds        )
                __field(        int,                type                )
        ),

        TP_fast_assign(
                __entry->dev                 = inode->i_sb->s_dev;
                __entry->ip                 = IP;
                __entry->blocks                 = blocks;
                __entry->rsv_blocks         = rsv_blocks;
                __entry->revoke_creds         = revoke_creds;
                __entry->type                 = type;
                __entry->ino                 = inode->i_ino;
        ),

        TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d,"
                  " type %d, ino %lu, caller %pS", MAJOR(__entry->dev),
                  MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks,
                  __entry->revoke_creds, __entry->type, __entry->ino,
                  (void *)__entry->ip)
);

/*
 * ext4_journal_start_reserved - tracepoint taking a superblock, a block
 * count and a caller address.
 *
 * Records sb->s_dev, @blocks and @IP.  @IP is stored as an unsigned long
 * and printed with %pS after a cast to a pointer.
 */
TRACE_EVENT(ext4_journal_start_reserved,
        TP_PROTO(struct super_block *sb, int blocks, unsigned long IP),

        TP_ARGS(sb, blocks, IP),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        unsigned long,        ip                )
                __field(        int,                blocks                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->ip                = IP;
                __entry->blocks                = blocks;
        ),

        /*
         * "name value, name value" layout, matching the other
         * ext4_journal_start_* events: the comma follows the value.
         */
        TP_printk("dev %d,%d blocks %d, caller %pS",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->blocks, (void *)__entry->ip)
);

/*
 * ext4__trim - event class for the trim tracepoints.
 *
 * Takes a superblock, a group number, and a start/length pair.  Unlike
 * most events in this file the device is not stored as a dev_t: its major
 * and minor numbers are split at record time into two int fields and
 * printed directly.  @group, @start and @len are stored as passed.
 */
DECLARE_EVENT_CLASS(ext4__trim,
        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len),

        TP_STRUCT__entry(
                __field(        int,        dev_major                )
                __field(        int,        dev_minor                )
                __field(        __u32,         group                        )
                __field(        int,        start                        )
                __field(        int,        len                        )
        ),

        TP_fast_assign(
                __entry->dev_major        = MAJOR(sb->s_dev);
                __entry->dev_minor        = MINOR(sb->s_dev);
                __entry->group                = group;
                __entry->start                = start;
                __entry->len                = len;
        ),

        TP_printk("dev %d,%d group %u, start %d, len %d",
                  __entry->dev_major, __entry->dev_minor,
                  __entry->group, __entry->start, __entry->len)
);

/* ext4_trim_extent: instance of the ext4__trim class (sb, group, start, len). */
DEFINE_EVENT(ext4__trim, ext4_trim_extent,

        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/* ext4_trim_all_free: instance of the ext4__trim class (sb, group, start, len). */
DEFINE_EVENT(ext4__trim, ext4_trim_all_free,

        TP_PROTO(struct super_block *sb,
                 ext4_group_t group,
                 ext4_grpblk_t start,
                 ext4_grpblk_t len),

        TP_ARGS(sb, group, start, len)
);

/*
 * ext4_ext_handle_unwritten_extents - tracepoint taking an inode, a map
 * structure, a flags word, an allocated count and a new block number.
 *
 * Records the inode's device and number, @flags, the map's m_lblk, m_pblk
 * and m_len, and the @allocated and @newblock arguments as passed.  The
 * flags are rendered symbolically through show_map_flags().
 */
TRACE_EVENT(ext4_ext_handle_unwritten_extents,
        TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags,
                 unsigned int allocated, ext4_fsblk_t newblock),

        TP_ARGS(inode, map, flags, allocated, newblock),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        int,                flags                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        unsigned int,        len                )
                __field(        unsigned int,        allocated        )
                __field(        ext4_fsblk_t,        newblk                )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->flags                = flags;
                __entry->lblk                = map->m_lblk;
                __entry->pblk                = map->m_pblk;
                __entry->len                = map->m_len;
                __entry->allocated        = allocated;
                __entry->newblk                = newblock;
        ),

        /* 'allocated' is unsigned int and cast as such, hence %u. */
        TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s "
                  "allocated %u newblock %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_map_flags(__entry->flags),
                  (unsigned int) __entry->allocated,
                  (unsigned long long) __entry->newblk)
);

/*
 * ext4_get_implied_cluster_alloc_exit - tracepoint taking a superblock, a
 * map structure and an integer result.
 *
 * Records sb->s_dev, the map's m_flags, m_lblk, m_pblk and m_len, and
 * @ret.  No inode is recorded.  The 'flags' field here holds the map's
 * m_flags and is rendered through show_mflags().
 */
TRACE_EVENT(ext4_get_implied_cluster_alloc_exit,
        TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret),

        TP_ARGS(sb, map, ret),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        unsigned int,        flags        )
                __field(        ext4_lblk_t,        lblk        )
                __field(        ext4_fsblk_t,        pblk        )
                __field(        unsigned int,        len        )
                __field(        int,                ret        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = map->m_flags;
                __entry->lblk        = map->m_lblk;
                __entry->pblk        = map->m_pblk;
                __entry->len        = map->m_len;
                __entry->ret        = ret;
        ),

        TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->lblk, (unsigned long long) __entry->pblk,
                  __entry->len, show_mflags(__entry->flags), __entry->ret)
);

/*
 * ext4_ext_show_extent - tracepoint taking an inode and an extent described
 * by logical block, physical block and length.
 *
 * Records the inode's device and number plus @lblk, @pblk and @len as
 * passed.  The length is a 16-bit unsigned value.
 */
TRACE_EVENT(ext4_ext_show_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                 unsigned short len),

        TP_ARGS(inode, lblk, pblk, len),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_fsblk_t,        pblk        )
                __field(        ext4_lblk_t,        lblk        )
                __field(        unsigned short,        len        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pblk        = pblk;
                __entry->lblk        = lblk;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->lblk,
                  (unsigned long long) __entry->pblk,
                  (unsigned short) __entry->len)
);

/*
 * ext4_remove_blocks - tracepoint taking an inode, an extent, a from/to
 * range and a partial-cluster descriptor.
 *
 * Records the inode's device and number, @from and @to, the extent's
 * physical start (ext4_ext_pblock()), logical start (ee_block converted
 * with le32_to_cpu()) and length (ext4_ext_get_actual_len()), and the
 * pclu, lblk and state members of @pc.
 *
 * NOTE(review): @to is declared ext4_fsblk_t in the prototype but is
 * stored in an ext4_lblk_t field and printed with %u; confirm against the
 * callers which type is intended.
 */
TRACE_EVENT(ext4_remove_blocks,
        TP_PROTO(struct inode *inode, struct ext4_extent *ex,
                 ext4_lblk_t from, ext4_fsblk_t to,
                 struct partial_cluster *pc),

        TP_ARGS(inode, ex, from, to, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        from        )
                __field(        ext4_lblk_t,        to        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->from                = from;
                __entry->to                = to;
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        /*
         * The first fragment ends with a blank so that the concatenated
         * string reads "...] from ..." rather than "...]from ...".
         */
        TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u] "
                  "from %u to %u partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (unsigned) __entry->from,
                  (unsigned) __entry->to,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_leaf - tracepoint taking an inode, a starting logical block,
 * an extent and a partial-cluster descriptor.
 *
 * Records the inode's device and number, @start, the extent's logical
 * start (ee_block converted with le32_to_cpu()), physical start
 * (ext4_ext_pblock()) and length (ext4_ext_get_actual_len()), and the
 * pclu, lblk and state members of @pc.
 */
TRACE_EVENT(ext4_ext_rm_leaf,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 struct ext4_extent *ex,
                 struct partial_cluster *pc),

        TP_ARGS(inode, start, ex, pc),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        ee_lblk        )
                __field(        ext4_fsblk_t,        ee_pblk        )
                /*
                 * Unsigned, like the ee_len field of ext4_remove_blocks and
                 * like the (unsigned short) cast used when printing, so the
                 * recorded field and the printed value agree.
                 */
                __field(        unsigned short,        ee_len        )
                __field(        ext4_fsblk_t,        pc_pclu        )
                __field(        ext4_lblk_t,        pc_lblk        )
                __field(        int,                pc_state)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->ee_lblk        = le32_to_cpu(ex->ee_block);
                __entry->ee_pblk        = ext4_ext_pblock(ex);
                __entry->ee_len                = ext4_ext_get_actual_len(ex);
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
        ),

        /*
         * The first fragment ends with a blank so that the concatenated
         * string reads "...] partial [..." rather than "...]partial [...".
         */
        TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u] "
                  "partial [pclu %lld lblk %u state %d]",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->ee_lblk,
                  (unsigned long long) __entry->ee_pblk,
                  (unsigned short) __entry->ee_len,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state)
);

/*
 * ext4_ext_rm_idx - tracepoint taking an inode and a physical block number.
 *
 * Records the inode's device and number plus @pblk, which is printed under
 * the label "index_pblk".
 */
TRACE_EVENT(ext4_ext_rm_idx,
        TP_PROTO(struct inode *inode, ext4_fsblk_t pblk),

        TP_ARGS(inode, pblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_fsblk_t,        pblk        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->pblk        = pblk;
        ),

        TP_printk("dev %d,%d ino %lu index_pblk %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned long long) __entry->pblk)
);

/*
 * ext4_ext_remove_space - tracepoint taking an inode, a start/end pair of
 * logical blocks and a depth.
 *
 * Records the inode's device and number plus @start, @end and @depth as
 * passed.  In the printed line @start appears under the label "since".
 */
TRACE_EVENT(ext4_ext_remove_space,
        TP_PROTO(struct inode *inode, ext4_lblk_t start,
                 ext4_lblk_t end, int depth),

        TP_ARGS(inode, start, end, depth),

        TP_STRUCT__entry(
                __field(        dev_t,                dev        )
                __field(        ino_t,                ino        )
                __field(        ext4_lblk_t,        start        )
                __field(        ext4_lblk_t,        end        )
                __field(        int,                depth        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->start        = start;
                __entry->end        = end;
                __entry->depth        = depth;
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->end,
                  __entry->depth)
);

/*
 * ext4_ext_remove_space_done - tracepoint taking an inode, a start/end pair
 * of logical blocks, a depth, a partial-cluster descriptor and an entry
 * count.
 *
 * Records the same data as ext4_ext_remove_space plus the pclu, lblk and
 * state members of @pc and the entry count.  @eh_entries arrives as a
 * little-endian 16-bit value and is converted with le16_to_cpu() before
 * being stored; it is printed under the label "remaining_entries".
 */
TRACE_EVENT(ext4_ext_remove_space_done,
        TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end,
                 int depth, struct partial_cluster *pc, __le16 eh_entries),

        TP_ARGS(inode, start, end, depth, pc, eh_entries),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        start                )
                __field(        ext4_lblk_t,        end                )
                __field(        int,                depth                )
                __field(        ext4_fsblk_t,        pc_pclu                )
                __field(        ext4_lblk_t,        pc_lblk                )
                __field(        int,                pc_state        )
                __field(        unsigned short,        eh_entries        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->start                = start;
                __entry->end                = end;
                __entry->depth                = depth;
                __entry->pc_pclu        = pc->pclu;
                __entry->pc_lblk        = pc->lblk;
                __entry->pc_state        = pc->state;
                __entry->eh_entries        = le16_to_cpu(eh_entries);
        ),

        TP_printk("dev %d,%d ino %lu since %u end %u depth %d "
                  "partial [pclu %lld lblk %u state %d] "
                  "remaining_entries %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  (unsigned) __entry->start,
                  (unsigned) __entry->end,
                  __entry->depth,
                  (long long) __entry->pc_pclu,
                  (unsigned int) __entry->pc_lblk,
                  (int) __entry->pc_state,
                  (unsigned short) __entry->eh_entries)
);

/*
 * ext4__es_extent - event class for tracepoints that report one
 * extent_status entry of an inode.
 *
 * Records the inode's device and number, the entry's es_lblk and es_len,
 * the physical block as returned by ext4_es_show_pblock() and the status
 * as returned by ext4_es_status().  The status is stored in a single char
 * and rendered through show_extent_status().
 */
DECLARE_EVENT_CLASS(ext4__es_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char, status        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = es->es_lblk;
                __entry->len        = es->es_len;
                __entry->pblk        = ext4_es_show_pblock(es);
                __entry->status        = ext4_es_status(es);
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status))
);

/* ext4_es_insert_extent: instance of the ext4__es_extent class (inode, es). */
DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/* ext4_es_cache_extent: instance of the ext4__es_extent class (inode, es). */
DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_remove_extent - tracepoint taking an inode and a logical
 * block/length pair.
 *
 * Records the inode's device and number plus @lblk and @len.  Although
 * both arguments are ext4_lblk_t, they are stored in loff_t fields and
 * printed with %lld.
 */
TRACE_EVENT(ext4_es_remove_extent,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len),

        TP_ARGS(inode, lblk, len),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(        loff_t,        lblk                        )
                __field(        loff_t,        len                        )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu es [%lld/%lld)",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len)
);

/*
 * ext4_es_find_extent_range_enter - tracepoint taking an inode and a
 * logical block number.
 *
 * Records the inode's device and number plus @lblk as passed.
 */
TRACE_EVENT(ext4_es_find_extent_range_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk)
);

/*
 * ext4_es_find_extent_range_exit - tracepoint taking an inode and an
 * extent_status entry.
 *
 * The prototype, recorded fields, assignments and output format are
 * exactly those of the ext4__es_extent event class declared earlier in
 * this file, so the event is defined as an instance of that class instead
 * of repeating the whole definition.
 */
DEFINE_EVENT(ext4__es_extent, ext4_es_find_extent_range_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es),

        TP_ARGS(inode, es)
);

/*
 * ext4_es_lookup_extent_enter - record a device/inode/logical-block triple.
 *
 * Same record layout and output format as ext4_es_find_extent_range_enter:
 * the inode's device and number plus the @lblk argument.
 */
TRACE_EVENT(ext4_es_lookup_extent_enter,
        TP_PROTO(struct inode *inode, ext4_lblk_t lblk),

        TP_ARGS(inode, lblk),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = lblk;
        ),

        TP_printk("dev %d,%d ino %lu lblk %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->lblk)
);

/*
 * ext4_es_lookup_extent_exit - record the extent_status @es and a @found flag.
 *
 * All fields of @es are copied into the record unconditionally.  At print
 * time, when found is zero, the physical block is shown as 0 and 0 is passed
 * to show_extent_status() instead of the stored status; lblk and len are
 * printed as stored either way.
 */
TRACE_EVENT(ext4_es_lookup_extent_exit,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 int found),

        TP_ARGS(inode, es, found),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char,                status                )
                __field(        int,                found                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->lblk        = es->es_lblk;
                __entry->len        = es->es_len;
                __entry->pblk        = ext4_es_show_pblock(es);
                __entry->status        = ext4_es_status(es);
                __entry->found        = found;
        ),

        TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino, __entry->found,
                  __entry->lblk, __entry->len,
                  __entry->found ? __entry->pblk : 0,
                  show_extent_status(__entry->found ? __entry->status : 0))
);

/*
 * ext4__es_shrink_enter - event class shared by the DEFINE_EVENT()s that
 * name it.
 *
 * Records sb->s_dev together with the @nr_to_scan and @cache_cnt arguments,
 * stored and printed as plain ints.
 */
DECLARE_EVENT_CLASS(ext4__es_shrink_enter,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        nr_to_scan                )
                __field(        int,        cache_cnt                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_to_scan        = nr_to_scan;
                __entry->cache_cnt        = cache_cnt;
        ),

        TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_to_scan, __entry->cache_cnt)
);

/* ext4_es_shrink_count: instance of the ext4__es_shrink_enter class. */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/* ext4_es_shrink_scan_enter: instance of the ext4__es_shrink_enter class. */
DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter,
        TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt),

        TP_ARGS(sb, nr_to_scan, cache_cnt)
);

/*
 * ext4_es_shrink_scan_exit - record sb->s_dev with the @nr_shrunk and
 * @cache_cnt arguments, stored and printed as plain ints.
 */
TRACE_EVENT(ext4_es_shrink_scan_exit,
        TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt),

        TP_ARGS(sb, nr_shrunk, cache_cnt),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        int,        nr_shrunk                )
                __field(        int,        cache_cnt                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_shrunk        = nr_shrunk;
                __entry->cache_cnt        = cache_cnt;
        ),

        TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nr_shrunk, __entry->cache_cnt)
);

/*
 * ext4_collapse_range - record an inode together with a byte range.
 *
 * @offset and @len are loff_t values, stored unchanged and printed as signed
 * 64-bit numbers.
 */
TRACE_EVENT(ext4_collapse_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * ext4_insert_range - record an inode together with a byte range.
 *
 * Identical record layout and output format to ext4_collapse_range: @offset
 * and @len are stored unchanged and printed as signed 64-bit numbers.
 */
TRACE_EVENT(ext4_insert_range,
        TP_PROTO(struct inode *inode, loff_t offset, loff_t len),

        TP_ARGS(inode, offset, len),

        TP_STRUCT__entry(
                __field(dev_t,        dev)
                __field(ino_t,        ino)
                __field(loff_t,        offset)
                __field(loff_t, len)
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->offset        = offset;
                __entry->len        = len;
        ),

        TP_printk("dev %d,%d ino %lu offset %lld len %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->offset, __entry->len)
);

/*
 * ext4_es_shrink - record the counters passed in for one shrink pass.
 *
 * @scan_time is divided by 1000 (div_u64) before it is stored, so the value
 * in the record is not the raw argument.
 * NOTE(review): presumably a ns -> us conversion; the caller's unit is not
 * visible here - confirm.
 */
TRACE_EVENT(ext4_es_shrink,
        TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time,
                 int nr_skipped, int retried),

        TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        int,                nr_shrunk        )
                __field(        unsigned long long, scan_time        )
                __field(        int,                nr_skipped        )
                __field(        int,                retried                )
        ),

        TP_fast_assign(
                __entry->dev                = sb->s_dev;
                __entry->nr_shrunk        = nr_shrunk;
                __entry->scan_time        = div_u64(scan_time, 1000);
                __entry->nr_skipped        = nr_skipped;
                __entry->retried        = retried;
        ),

        TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu "
                  "nr_skipped %d retried %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk,
                  __entry->scan_time, __entry->nr_skipped, __entry->retried)
);

/*
 * ext4_es_insert_delayed_extent - record the extent_status @es plus two
 * boolean flags.
 *
 * The extent fields are captured the same way as in the other es events
 * (es_lblk, es_len, ext4_es_show_pblock(), ext4_es_status()).  The two bools
 * are printed with %d after "allocated", lclu_allocated first and
 * end_allocated second.
 */
TRACE_EVENT(ext4_es_insert_delayed_extent,
        TP_PROTO(struct inode *inode, struct extent_status *es,
                 bool lclu_allocated, bool end_allocated),

        TP_ARGS(inode, es, lclu_allocated, end_allocated),

        TP_STRUCT__entry(
                __field(        dev_t,                dev                )
                __field(        ino_t,                ino                )
                __field(        ext4_lblk_t,        lblk                )
                __field(        ext4_lblk_t,        len                )
                __field(        ext4_fsblk_t,        pblk                )
                __field(        char,                status                )
                __field(        bool,                lclu_allocated        )
                __field(        bool,                end_allocated        )
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->lblk                = es->es_lblk;
                __entry->len                = es->es_len;
                __entry->pblk                = ext4_es_show_pblock(es);
                __entry->status                = ext4_es_status(es);
                __entry->lclu_allocated        = lclu_allocated;
                __entry->end_allocated        = end_allocated;
        ),

        TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s "
                  "allocated %d %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long) __entry->ino,
                  __entry->lblk, __entry->len,
                  __entry->pblk, show_extent_status(__entry->status),
                  __entry->lclu_allocated, __entry->end_allocated)
);

/* fsmap traces */

/*
 * ext4_fsmap_class - event class describing one fsmap key or mapping.
 *
 * Records the block device number of the filesystem (sb->s_bdev->bd_dev, not
 * sb->s_dev), the @keydev argument decoded with new_decode_dev(), and the
 * @agno, @bno, @len and @owner arguments as given.  @owner is a u64 but is
 * printed with %lld, i.e. as a signed value.
 *
 * The format string deliberately has no trailing "\n": the trace output code
 * terminates every event line itself, so an explicit newline here would put
 * an empty line after each event.
 */
DECLARE_EVENT_CLASS(ext4_fsmap_class,
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len,
                 u64 owner),
        TP_ARGS(sb, keydev, agno, bno, len, owner),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u32, agno)
                __field(u64, bno)
                __field(u64, len)
                __field(u64, owner)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(keydev);
                __entry->agno = agno;
                __entry->bno = bno;
                __entry->len = len;
                __entry->owner = owner;
        ),
        TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->agno,
                  __entry->bno,
                  __entry->len,
                  __entry->owner)
)
/* Stamp out one event of the class above; all share prototype and format. */
#define DEFINE_FSMAP_EVENT(name) \
DEFINE_EVENT(ext4_fsmap_class, name, \
        TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \
                 u64 owner), \
        TP_ARGS(sb, keydev, agno, bno, len, owner))
DEFINE_FSMAP_EVENT(ext4_fsmap_low_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_high_key);
DEFINE_FSMAP_EVENT(ext4_fsmap_mapping);

/*
 * ext4_getfsmap_class - event class describing one struct ext4_fsmap record.
 *
 * Records the block device number of the filesystem (sb->s_bdev->bd_dev), the
 * record's fmr_device decoded with new_decode_dev(), and its fmr_physical,
 * fmr_length, fmr_owner and fmr_flags members.  The owner is stored as u64
 * but printed with %lld, i.e. as a signed value; the flags are printed in hex.
 *
 * The format string deliberately has no trailing "\n": the trace output code
 * terminates every event line itself, so an explicit newline here would put
 * an empty line after each event.
 */
DECLARE_EVENT_CLASS(ext4_getfsmap_class,
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap),
        TP_ARGS(sb, fsmap),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(dev_t, keydev)
                __field(u64, block)
                __field(u64, len)
                __field(u64, owner)
                __field(u64, flags)
        ),
        TP_fast_assign(
                __entry->dev = sb->s_bdev->bd_dev;
                __entry->keydev = new_decode_dev(fsmap->fmr_device);
                __entry->block = fsmap->fmr_physical;
                __entry->len = fsmap->fmr_length;
                __entry->owner = fsmap->fmr_owner;
                __entry->flags = fsmap->fmr_flags;
        ),
        TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  MAJOR(__entry->keydev), MINOR(__entry->keydev),
                  __entry->block,
                  __entry->len,
                  __entry->owner,
                  __entry->flags)
)
/* Stamp out one event of the class above; all share prototype and format. */
#define DEFINE_GETFSMAP_EVENT(name) \
DEFINE_EVENT(ext4_getfsmap_class, name, \
        TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \
        TP_ARGS(sb, fsmap))
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key);
DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping);

/*
 * ext4_shutdown - record sb->s_dev and the @flags argument.
 *
 * @flags arrives as unsigned long but the record field is plain unsigned, so
 * any bits above the width of unsigned int are dropped where long is wider
 * than int.  The value is printed in decimal.
 */
TRACE_EVENT(ext4_shutdown,
        TP_PROTO(struct super_block *sb, unsigned long flags),

        TP_ARGS(sb, flags),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(     unsigned,        flags                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->flags)
);

/*
 * ext4_error - record the device plus a function name and line number.
 *
 * The record keeps the @function pointer itself, not a copy of the string;
 * it is dereferenced with %s only when the event is printed.
 * NOTE(review): this relies on the pointed-to string outliving the trace
 * record (e.g. a __func__ literal) - confirm against the callers.
 */
TRACE_EVENT(ext4_error,
        TP_PROTO(struct super_block *sb, const char *function,
                 unsigned int line),

        TP_ARGS(sb, function, line),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field( const char *,        function                )
                __field(     unsigned,        line                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->function = function;
                __entry->line        = line;
        ),

        TP_printk("dev %d,%d function %s line %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->function, __entry->line)
);

/*
 * ext4_prefetch_bitmaps - record sb->s_dev with two group numbers and an I/O
 * count.
 *
 * @group, @next and @prefetch_ios are each stored in a __u32 field; the last
 * one appears in the record and in the output under the shorter name "ios".
 */
TRACE_EVENT(ext4_prefetch_bitmaps,
            TP_PROTO(struct super_block *sb, ext4_group_t group,
                     ext4_group_t next, unsigned int prefetch_ios),

        TP_ARGS(sb, group, next, prefetch_ios),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
                __field(        __u32,        next                        )
                __field(        __u32,        ios                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
                __entry->next        = next;
                __entry->ios        = prefetch_ios;
        ),

        TP_printk("dev %d,%d group %u next %u ios %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->group, __entry->next, __entry->ios)
);

/*
 * ext4_lazy_itable_init - record sb->s_dev and the @group argument, the
 * latter stored in a __u32 field.
 */
TRACE_EVENT(ext4_lazy_itable_init,
            TP_PROTO(struct super_block *sb, ext4_group_t group),

        TP_ARGS(sb, group),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        __u32,        group                        )
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->group        = group;
        ),

        TP_printk("dev %d,%d group %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group)
);

/*
 * ext4_fc_replay_scan - record sb->s_dev with the @error and @off arguments,
 * both stored and printed as plain ints.
 */
TRACE_EVENT(ext4_fc_replay_scan,
        TP_PROTO(struct super_block *sb, int error, int off),

        TP_ARGS(sb, error, off),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, error)
                __field(int, off)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->error = error;
                __entry->off = off;
        ),

        TP_printk("dev %d,%d error %d, off %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->error, __entry->off)
);

/*
 * ext4_fc_replay - record sb->s_dev with a tag, an inode number and two
 * further ints.
 *
 * All four values are plain ints.  @priv1 and @priv2 are labelled "data1"
 * and "data2" in the printed output; the prototype does not say what they
 * mean.
 */
TRACE_EVENT(ext4_fc_replay,
        TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2),

        TP_ARGS(sb, tag, ino, priv1, priv2),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, tag)
                __field(int, ino)
                __field(int, priv1)
                __field(int, priv2)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->tag = tag;
                __entry->ino = ino;
                __entry->priv1 = priv1;
                __entry->priv2 = priv2;
        ),

        TP_printk("dev %d,%d: tag %d, ino %d, data1 %d, data2 %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tag, __entry->ino, __entry->priv1, __entry->priv2)
);

/*
 * ext4_fc_commit_start - record sb->s_dev and the @commit_tid argument,
 * printed as an unsigned number.
 */
TRACE_EVENT(ext4_fc_commit_start,
        TP_PROTO(struct super_block *sb, tid_t commit_tid),

        TP_ARGS(sb, commit_tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, tid)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->tid = commit_tid;
        ),

        TP_printk("dev %d,%d tid %u", MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->tid)
);

/*
 * ext4_fc_commit_stop - record the arguments of one commit together with a
 * snapshot of the filesystem-wide counters.
 *
 * @nblks, @reason and @commit_tid come from the caller.  num_fc,
 * num_fc_ineligible and nblks_agg are read from EXT4_SB(sb)->s_fc_stats
 * (fc_num_commits, fc_ineligible_commits, fc_numblks) when the event fires,
 * and are narrowed to int fields.
 */
TRACE_EVENT(ext4_fc_commit_stop,
            TP_PROTO(struct super_block *sb, int nblks, int reason,
                     tid_t commit_tid),

        TP_ARGS(sb, nblks, reason, commit_tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, nblks)
                __field(int, reason)
                __field(int, num_fc)
                __field(int, num_fc_ineligible)
                __field(int, nblks_agg)
                __field(tid_t, tid)
        ),

        TP_fast_assign(
                __entry->dev = sb->s_dev;
                __entry->nblks = nblks;
                __entry->reason = reason;
                __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->num_fc_ineligible =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks;
                __entry->tid = commit_tid;
        ),

        TP_printk("dev %d,%d nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d, tid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->nblks, __entry->reason, __entry->num_fc,
                  __entry->num_fc_ineligible, __entry->nblks_agg, __entry->tid)
);

/*
 * Expands to the two printk arguments for one "%s:%u" pair: the reason's
 * display name and its counter from the record's fc_ineligible_rc[] array.
 * Only meaningful inside the TP_printk() below, where __entry is in scope.
 */
#define FC_REASON_NAME_STAT(reason)                                        \
        show_fc_reason(reason),                                                \
        __entry->fc_ineligible_rc[reason]

/*
 * ext4_fc_stats - snapshot of the counters in EXT4_SB(sb)->s_fc_stats.
 *
 * Copies all EXT4_FC_REASON_MAX entries of fc_ineligible_reason_count[] plus
 * fc_num_commits, fc_ineligible_commits and fc_numblks into the record.
 *
 * The format string names ten reasons explicitly, one "%s:%u" pair per
 * FC_REASON_NAME_STAT() argument, so the two lists have to be kept the same
 * length.  The pair list ends in ", " so that the last reason counter is
 * separated from "num_commits" in the output; adjacent string literals are
 * concatenated with nothing in between.
 */
TRACE_EVENT(ext4_fc_stats,
        TP_PROTO(struct super_block *sb),

        TP_ARGS(sb),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __array(unsigned int, fc_ineligible_rc, EXT4_FC_REASON_MAX)
                __field(unsigned long, fc_commits)
                __field(unsigned long, fc_ineligible_commits)
                __field(unsigned long, fc_numblks)
        ),

        TP_fast_assign(
                int i;

                __entry->dev = sb->s_dev;
                for (i = 0; i < EXT4_FC_REASON_MAX; i++) {
                        __entry->fc_ineligible_rc[i] =
                                EXT4_SB(sb)->s_fc_stats.fc_ineligible_reason_count[i];
                }
                __entry->fc_commits = EXT4_SB(sb)->s_fc_stats.fc_num_commits;
                __entry->fc_ineligible_commits =
                        EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits;
                __entry->fc_numblks = EXT4_SB(sb)->s_fc_stats.fc_numblks;
        ),

        TP_printk("dev %d,%d fc ineligible reasons:\n"
                  "%s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, %s:%u, "
                  "num_commits:%lu, ineligible: %lu, numblks: %lu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA),
                  FC_REASON_NAME_STAT(EXT4_FC_REASON_ENCRYPTED_FILENAME),
                  __entry->fc_commits, __entry->fc_ineligible_commits,
                  __entry->fc_numblks)
);

/*
 * ext4_fc_track_dentry - event class for the create/link/unlink events
 * generated below.
 *
 * Records the inode's device and number, the tid of the handle's transaction
 * (handle->h_transaction->t_tid), the inode's i_sync_tid taken from its
 * ext4_inode_info, and @ret as "error".  @dentry is part of the prototype
 * but nothing from it is stored.
 */
DECLARE_EVENT_CLASS(ext4_fc_track_dentry,

        TP_PROTO(handle_t *handle, struct inode *inode,
                 struct dentry *dentry, int ret),

        TP_ARGS(handle, inode, dentry, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, t_tid)
                __field(ino_t, i_ino)
                __field(tid_t, i_sync_tid)
                __field(int, error)
        ),

        TP_fast_assign(
                struct ext4_inode_info *ei = EXT4_I(inode);

                __entry->dev = inode->i_sb->s_dev;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = ei->i_sync_tid;
                __entry->error = ret;
        ),

        TP_printk("dev %d,%d, t_tid %u, ino %lu, i_sync_tid %u, error %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->t_tid, __entry->i_ino, __entry->i_sync_tid,
                  __entry->error
        )
);

/* Generate event ext4_fc_track_<__type> as an instance of the class above. */
#define DEFINE_EVENT_CLASS_DENTRY(__type)                                \
DEFINE_EVENT(ext4_fc_track_dentry, ext4_fc_track_##__type,                \
        TP_PROTO(handle_t *handle, struct inode *inode,                        \
                 struct dentry *dentry, int ret),                        \
        TP_ARGS(handle, inode, dentry, ret)                                \
)

DEFINE_EVENT_CLASS_DENTRY(create);
DEFINE_EVENT_CLASS_DENTRY(link);
DEFINE_EVENT_CLASS_DENTRY(unlink);

/*
 * ext4_fc_track_inode - record an inode alongside the handle's transaction.
 *
 * Stores the inode's device and number, handle->h_transaction->t_tid, the
 * inode's i_sync_tid from its ext4_inode_info, and @ret as "error".  The
 * device is printed as "major:minor" here.
 */
TRACE_EVENT(ext4_fc_track_inode,
        TP_PROTO(handle_t *handle, struct inode *inode, int ret),

        TP_ARGS(handle, inode, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, t_tid)
                __field(ino_t, i_ino)
                __field(tid_t, i_sync_tid)
                __field(int, error)
        ),

        TP_fast_assign(
                struct ext4_inode_info *ei = EXT4_I(inode);

                __entry->dev = inode->i_sb->s_dev;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = ei->i_sync_tid;
                __entry->error = ret;
        ),

        TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->t_tid, __entry->i_ino, __entry->i_sync_tid,
                  __entry->error)
        );

/*
 * ext4_fc_track_range - like ext4_fc_track_inode, with a start/end pair.
 *
 * @start and @end are longs, stored unchanged.  Note the output order:
 * "error" is printed before "start" and "end", although the record stores
 * start and end first.
 */
TRACE_EVENT(ext4_fc_track_range,
        TP_PROTO(handle_t *handle, struct inode *inode,
                 long start, long end, int ret),

        TP_ARGS(handle, inode, start, end, ret),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(tid_t, t_tid)
                __field(ino_t, i_ino)
                __field(tid_t, i_sync_tid)
                __field(long, start)
                __field(long, end)
                __field(int, error)
        ),

        TP_fast_assign(
                struct ext4_inode_info *ei = EXT4_I(inode);

                __entry->dev = inode->i_sb->s_dev;
                __entry->t_tid = handle->h_transaction->t_tid;
                __entry->i_ino = inode->i_ino;
                __entry->i_sync_tid = ei->i_sync_tid;
                __entry->start = start;
                __entry->end = end;
                __entry->error = ret;
        ),

        TP_printk("dev %d:%d, t_tid %u, inode %lu, i_sync_tid %u, error %d, start %ld, end %ld",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->t_tid, __entry->i_ino, __entry->i_sync_tid,
                  __entry->error, __entry->start, __entry->end)
        );

/*
 * ext4_fc_cleanup - record journal state together with @full and @tid.
 *
 * The super_block is taken from journal->j_private to obtain the device
 * number; journal->j_fc_off is stored in an int field.
 */
TRACE_EVENT(ext4_fc_cleanup,
        TP_PROTO(journal_t *journal, int full, tid_t tid),

        TP_ARGS(journal, full, tid),

        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(int, j_fc_off)
                __field(int, full)
                __field(tid_t, tid)
        ),

        TP_fast_assign(
                struct super_block *sb = journal->j_private;

                __entry->dev = sb->s_dev;
                __entry->j_fc_off = journal->j_fc_off;
                __entry->full = full;
                __entry->tid = tid;
        ),

        TP_printk("dev %d,%d, j_fc_off %d, full %d, tid %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->j_fc_off, __entry->full, __entry->tid)
        );

/*
 * ext4_update_sb - record sb->s_dev with a filesystem block number and flags.
 *
 * @fsblk is stored as ext4_fsblk_t and printed with %llu; @flags is printed
 * in decimal.
 */
TRACE_EVENT(ext4_update_sb,
        TP_PROTO(struct super_block *sb, ext4_fsblk_t fsblk,
                 unsigned int flags),

        TP_ARGS(sb, fsblk, flags),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ext4_fsblk_t,        fsblk)
                __field(unsigned int,        flags)
        ),

        TP_fast_assign(
                __entry->dev        = sb->s_dev;
                __entry->fsblk        = fsblk;
                __entry->flags        = flags;
        ),

        TP_printk("dev %d,%d fsblk %llu flags %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->fsblk, __entry->flags)
);

#endif /* _TRACE_EXT4_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_HUGETLB_H
#define _ASM_GENERIC_HUGETLB_H

#include <linux/swap.h>
#include <linux/swapops.h>

/* Build the PTE for @page with protection @pgprot; same as mk_pte(). */
static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
{
        pte_t entry = mk_pte(page, pgprot);

        return entry;
}

/* Report pte_write() for @pte, widened to unsigned long. */
static inline unsigned long huge_pte_write(pte_t pte)
{
        unsigned long writable = pte_write(pte);

        return writable;
}

/* Report pte_dirty() for @pte, widened to unsigned long. */
static inline unsigned long huge_pte_dirty(pte_t pte)
{
        unsigned long dirty = pte_dirty(pte);

        return dirty;
}

/* Return @pte after passing it through pte_mkwrite_novma(). */
static inline pte_t huge_pte_mkwrite(pte_t pte)
{
        pte_t writable = pte_mkwrite_novma(pte);

        return writable;
}

#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTE_WRPROTECT is not defined:
 * return @pte after passing it through pte_wrprotect().
 */
static inline pte_t huge_pte_wrprotect(pte_t pte)
{
        pte_t readonly = pte_wrprotect(pte);

        return readonly;
}
#endif

/* Return @pte after passing it through pte_mkdirty(). */
static inline pte_t huge_pte_mkdirty(pte_t pte)
{
        pte_t dirtied = pte_mkdirty(pte);

        return dirtied;
}

/* Return the result of pte_modify() for @pte and @newprot. */
static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
{
        pte_t updated = pte_modify(pte, newprot);

        return updated;
}

#ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTE_MKUFFD_WP is not defined:
 * apply pte_mkuffd_wp() first, then huge_pte_wrprotect() on the result.
 */
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
{
        pte_t marked = pte_mkuffd_wp(pte);

        return huge_pte_wrprotect(marked);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP is not defined:
 * return @pte after passing it through pte_clear_uffd_wp().
 */
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
{
        pte_t cleared = pte_clear_uffd_wp(pte);

        return cleared;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTE_UFFD_WP is not defined:
 * report pte_uffd_wp() for @pte as an int.
 */
static inline int huge_pte_uffd_wp(pte_t pte)
{
        int wp = pte_uffd_wp(pte);

        return wp;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTE_CLEAR is not defined: forwards to
 * pte_clear() for the single entry at @ptep.  @sz is accepted but not used
 * by this version.
 */
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, unsigned long sz)
{
        pte_clear(mm, addr, ptep);
}
#endif

#ifndef __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE
/*
 * Fallback used when __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE is not defined:
 * forwards all five arguments unchanged to free_pgd_range().
 */
static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
                unsigned long addr, unsigned long end,
                unsigned long floor, unsigned long ceiling)
{
        free_pgd_range(tlb, addr, end, floor, ceiling);
}
#endif

#ifndef __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT
/*
 * Fallback used when __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT is not defined:
 * forwards to set_pte_at() for the single entry at @ptep.  @sz is accepted
 * but not used by this version.
 */
static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, pte_t pte, unsigned long sz)
{
        set_pte_at(mm, addr, ptep, pte);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR is not defined:
 * return what ptep_get_and_clear() yields for @ptep.  @sz is not used.
 */
static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned long sz)
{
        pte_t old = ptep_get_and_clear(mm, addr, ptep);

        return old;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH is not defined:
 * return what ptep_clear_flush() yields for @ptep in @vma.
 */
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep)
{
        pte_t old = ptep_clear_flush(vma, addr, ptep);

        return old;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTE_NONE
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTE_NONE is not defined:
 * report pte_none() for @pte as an int.
 */
static inline int huge_pte_none(pte_t pte)
{
        int none = pte_none(pte);

        return none;
}
#endif

/* See the comments above pte_none_mostly() for how this is meant to be used */
#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY
/*
 * Returns 1 when huge_pte_none() holds for @pte or, failing that, when
 * is_pte_marker() does; 0 otherwise.  The marker test is skipped when the
 * entry is already "none".
 */
static inline int huge_pte_none_mostly(pte_t pte)
{
        if (huge_pte_none(pte))
                return 1;

        return is_pte_marker(pte) ? 1 : 0;
}
#endif

#ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE
/*
 * Fallback used when __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE is not defined:
 * performs no checks on @file, @addr or @len and always returns 0.
 */
static inline int prepare_hugepage_range(struct file *file,
                unsigned long addr, unsigned long len)
{
        return 0;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT is not defined:
 * forwards all three arguments unchanged to ptep_set_wrprotect().
 */
static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep)
{
        ptep_set_wrprotect(mm, addr, ptep);
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS is not defined:
 * hand every argument to ptep_set_access_flags() and pass its result back.
 */
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                unsigned long addr, pte_t *ptep,
                pte_t pte, int dirty)
{
        int changed = ptep_set_access_flags(vma, addr, ptep, pte, dirty);

        return changed;
}
#endif

#ifndef __HAVE_ARCH_HUGE_PTEP_GET
/*
 * Fallback used when __HAVE_ARCH_HUGE_PTEP_GET is not defined: read the
 * entry at @ptep with ptep_get().  @mm and @addr are not used here.
 */
static inline pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        pte_t entry = ptep_get(ptep);

        return entry;
}
#endif

#ifndef __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED
static inline bool gigantic_page_runtime_supported(void)
{
        return IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE);
}
#endif /* __HAVE_ARCH_GIGANTIC_PAGE_RUNTIME_SUPPORTED */

#endif /* _ASM_GENERIC_HUGETLB_H */































    3 
    3 




    3 










    3 














































































































































































































































































































    3 


    3 
    3 










    3 





    3 















































































    3 



    3 


































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
// SPDX-License-Identifier: GPL-2.0
/*
 * Management Component Transport Protocol (MCTP) - device implementation.
 *
 * Copyright (c) 2021 Code Construct
 * Copyright (c) 2021 Google
 */

#include <linux/if_arp.h>
#include <linux/if_link.h>
#include <linux/mctp.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>

#include <net/addrconf.h>
#include <net/netlink.h>
#include <net/mctp.h>
#include <net/mctpdevice.h>
#include <net/sock.h>

/* Resume state for an address dump. It is overlaid on netlink_callback->ctx
 * by the dump functions so that it survives between successive dump calls.
 */
struct mctp_dump_cb {
        /* cursor handed to for_each_netdev_dump() */
        unsigned long ifindex;
        /* index into mdev->addrs of the next address to emit */
        size_t a_idx;
};

/* Fetch the MCTP state attached to @dev and take a reference on it.
 *
 * No locking is done here: the caller must be inside rcu_read_lock().
 * Returns the mctp_dev with its refcount raised, or NULL when the device has
 * no MCTP state or that state is already being torn down.
 */
struct mctp_dev *__mctp_dev_get(const struct net_device *dev)
{
        struct mctp_dev *mdev = rcu_dereference(dev->mctp_ptr);

        if (!mdev)
                return NULL;

        /* RCU keeps the object itself alive at this point; a refcount that
         * has already dropped to zero means a free is pending, so hand back
         * nothing rather than resurrect it.
         */
        return refcount_inc_not_zero(&mdev->refs) ? mdev : NULL;
}

/* RTNL-protected lookup of the MCTP state attached to @dev.
 *
 * No reference is taken. The result stays valid for as long as the caller
 * keeps rtnl_lock, since that excludes mctp_unregister().
 */
struct mctp_dev *mctp_dev_get_rtnl(const struct net_device *dev)
{
        struct mctp_dev *mdev = rtnl_dereference(dev->mctp_ptr);

        return mdev;
}

/* Payload size needed for one address message: the ifaddrmsg header plus two
 * single-byte attributes.
 */
static int mctp_addrinfo_size(void)
{
        int size = NLMSG_ALIGN(sizeof(struct ifaddrmsg));

        size += nla_total_size(1);      /* IFA_LOCAL */
        size += nla_total_size(1);      /* IFA_ADDRESS */

        return size;
}

/* Append one address message describing @eid on @mdev to @skb.
 *
 * @flag is placed in the netlink header flags; dump callers pass NLM_F_MULTI.
 * Returns 0 on success or -EMSGSIZE when @skb has no room, in which case the
 * partially built message is removed again.
 */
static int mctp_fill_addrinfo(struct sk_buff *skb,
                              struct mctp_dev *mdev, mctp_eid_t eid,
                              int msg_type, u32 portid, u32 seq, int flag)
{
        struct nlmsghdr *nlh;
        struct ifaddrmsg *ifa;

        nlh = nlmsg_put(skb, portid, seq, msg_type, sizeof(*ifa), flag);
        if (!nlh)
                return -EMSGSIZE;

        ifa = nlmsg_data(nlh);
        ifa->ifa_family = AF_MCTP;
        ifa->ifa_prefixlen = 0;
        ifa->ifa_flags = 0;
        ifa->ifa_scope = 0;
        ifa->ifa_index = mdev->dev->ifindex;

        /* the same EID is reported as both the local and the peer address */
        if (nla_put_u8(skb, IFA_LOCAL, eid) ||
            nla_put_u8(skb, IFA_ADDRESS, eid)) {
                nlmsg_cancel(skb, nlh);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/* Emit the addresses of @mdev into @skb, starting at the index saved in the
 * dump context. The saved index only advances past addresses that were
 * written successfully, so a later call resumes at the one that did not fit.
 * Returns 0 when all addresses were emitted, or the negative error from
 * mctp_fill_addrinfo().
 */
static int mctp_dump_dev_addrinfo(struct mctp_dev *mdev, struct sk_buff *skb,
                                  struct netlink_callback *cb)
{
        struct mctp_dump_cb *mcb = (void *)cb->ctx;
        u32 portid = NETLINK_CB(cb->skb).portid;
        u32 seq = cb->nlh->nlmsg_seq;
        int rc = 0;

        while (mcb->a_idx < mdev->num_addrs) {
                rc = mctp_fill_addrinfo(skb, mdev, mdev->addrs[mcb->a_idx],
                                        RTM_NEWADDR, portid, seq, NLM_F_MULTI);
                if (rc < 0)
                        return rc;
                mcb->a_idx++;
        }

        return rc;
}

static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct mctp_dump_cb *mcb = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;
        struct ifaddrmsg *hdr;
        struct mctp_dev *mdev;
        int ifindex, rc;

        hdr = nlmsg_data(cb->nlh);
        // filter by ifindex if requested
        ifindex = hdr->ifa_index;

        rcu_read_lock();
        for_each_netdev_dump(net, dev, mcb->ifindex) {
                if (ifindex && ifindex != dev->ifindex)
                        continue;
                mdev = __mctp_dev_get(dev);
                if (!mdev)
                        continue;
                rc = mctp_dump_dev_addrinfo(mdev, skb, cb);
                mctp_dev_put(mdev);
                if (rc < 0)
                        break;
                mcb->a_idx = 0;
        }
        rcu_read_unlock();

        return skb->len;
}

/* Broadcast an address change (@msg_type for @eid on @mdev) to the
 * RTNLGRP_MCTP_IFADDR group, using the port id and sequence number of the
 * request that caused it. If the message cannot be allocated or built, the
 * error is recorded on the group's listeners instead.
 */
static void mctp_addr_notify(struct mctp_dev *mdev, mctp_eid_t eid, int msg_type,
                             struct sk_buff *req_skb, struct nlmsghdr *req_nlh)
{
        struct net *net = dev_net(mdev->dev);
        u32 portid = NETLINK_CB(req_skb).portid;
        struct sk_buff *skb;
        int rc = -ENOBUFS;

        skb = nlmsg_new(mctp_addrinfo_size(), GFP_KERNEL);
        if (skb) {
                rc = mctp_fill_addrinfo(skb, mdev, eid, msg_type,
                                        portid, req_nlh->nlmsg_seq, 0);
                if (rc >= 0) {
                        rtnl_notify(skb, net, portid, RTNLGRP_MCTP_IFADDR,
                                    req_nlh, GFP_KERNEL);
                        return;
                }
                /* the skb was sized by mctp_addrinfo_size(), so running out
                 * of room would mean the two have drifted apart
                 */
                WARN_ON_ONCE(rc == -EMSGSIZE);
        }

        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_MCTP_IFADDR, rc);
}

/* Attribute policy for RTM_NEWADDR/RTM_DELADDR requests: both address
 * attributes carry a single byte (the EID).
 */
static const struct nla_policy ifa_mctp_policy[IFA_MAX + 1] = {
        [IFA_ADDRESS]                = { .type = NLA_U8 },
        [IFA_LOCAL]                = { .type = NLA_U8 },
};

/* RTM_NEWADDR handler: add a local EID to an MCTP interface.
 *
 * The EID is taken from IFA_LOCAL, falling back to IFA_ADDRESS. The address
 * array is replaced by a copy that is one entry longer; the replacement is
 * done under addrs_lock. Afterwards listeners are notified and a local route
 * for the EID is added.
 *
 * Returns 0 on success, the nlmsg_parse() error, -EINVAL (no address
 * attribute or non-unicast EID), -ENODEV (no such device or no MCTP state),
 * -EEXIST (EID already present) or -ENOMEM.
 */
static int mctp_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFA_MAX + 1];
        struct nlattr *attr;
        struct net_device *dev;
        struct mctp_addr *addr;
        struct mctp_dev *mdev;
        struct ifaddrmsg *ifm;
        unsigned long flags;
        u8 *grown, *stale;
        int rc;

        rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy,
                         extack);
        if (rc < 0)
                return rc;

        ifm = nlmsg_data(nlh);

        /* IFA_LOCAL takes precedence over IFA_ADDRESS */
        attr = tb[IFA_LOCAL] ? tb[IFA_LOCAL] : tb[IFA_ADDRESS];
        if (!attr)
                return -EINVAL;
        addr = nla_data(attr);

        /* find device */
        dev = __dev_get_by_index(net, ifm->ifa_index);
        if (!dev)
                return -ENODEV;

        mdev = mctp_dev_get_rtnl(dev);
        if (!mdev)
                return -ENODEV;

        if (!mctp_address_unicast(addr->s_addr))
                return -EINVAL;

        /* Prevent duplicates. Under RTNL so don't need to lock for reading */
        if (memchr(mdev->addrs, addr->s_addr, mdev->num_addrs))
                return -EEXIST;

        /* build the enlarged array outside the lock */
        grown = kmalloc(mdev->num_addrs + 1, GFP_KERNEL);
        if (!grown)
                return -ENOMEM;
        memcpy(grown, mdev->addrs, mdev->num_addrs);
        grown[mdev->num_addrs] = addr->s_addr;

        /* Lock to write */
        spin_lock_irqsave(&mdev->addrs_lock, flags);
        mdev->num_addrs++;
        stale = mdev->addrs;
        mdev->addrs = grown;
        spin_unlock_irqrestore(&mdev->addrs_lock, flags);

        kfree(stale);

        mctp_addr_notify(mdev, addr->s_addr, RTM_NEWADDR, skb, nlh);
        mctp_route_add_local(mdev, addr->s_addr);

        return 0;
}

/* RTM_DELADDR handler: remove a local EID from an MCTP interface.
 *
 * The EID is taken from IFA_LOCAL, falling back to IFA_ADDRESS. The local
 * route is removed first, then the entry is squeezed out of the address
 * array under addrs_lock and listeners are notified.
 *
 * Returns 0 on success, the nlmsg_parse() error, -EINVAL (no address
 * attribute), -ENODEV (no such device or no MCTP state), -ENOENT (EID not
 * configured) or a route removal error other than -ENOENT.
 */
static int mctp_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFA_MAX + 1];
        struct nlattr *attr;
        struct net_device *dev;
        struct mctp_addr *addr;
        struct mctp_dev *mdev;
        struct ifaddrmsg *ifm;
        unsigned long flags;
        u8 *slot;
        int rc;

        rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_mctp_policy,
                         extack);
        if (rc < 0)
                return rc;

        ifm = nlmsg_data(nlh);

        /* IFA_LOCAL takes precedence over IFA_ADDRESS */
        attr = tb[IFA_LOCAL] ? tb[IFA_LOCAL] : tb[IFA_ADDRESS];
        if (!attr)
                return -EINVAL;
        addr = nla_data(attr);

        /* find device */
        dev = __dev_get_by_index(net, ifm->ifa_index);
        if (!dev)
                return -ENODEV;

        mdev = mctp_dev_get_rtnl(dev);
        if (!mdev)
                return -ENODEV;

        slot = memchr(mdev->addrs, addr->s_addr, mdev->num_addrs);
        if (!slot)
                return -ENOENT;

        rc = mctp_route_remove_local(mdev, addr->s_addr);
        /* -ENOENT is tolerated: the route may already have been removed */
        if (rc < 0 && rc != -ENOENT)
                return rc;

        spin_lock_irqsave(&mdev->addrs_lock, flags);
        /* shift the entries after the removed one down by one */
        memmove(slot, slot + 1, mdev->num_addrs - 1 - (slot - mdev->addrs));
        mdev->num_addrs--;
        spin_unlock_irqrestore(&mdev->addrs_lock, flags);

        mctp_addr_notify(mdev, addr->s_addr, RTM_DELADDR, skb, nlh);

        return 0;
}

/* Take an additional reference on @mdev. Unlike __mctp_dev_get() this uses a
 * plain refcount_inc(), i.e. it does not test for a count of zero.
 */
void mctp_dev_hold(struct mctp_dev *mdev)
{
        refcount_inc(&mdev->refs);
}

/* Drop a reference on @mdev; a NULL @mdev is accepted and ignored.
 *
 * When the last reference goes away the address array is freed, the
 * reference on the underlying net_device is released and the mctp_dev itself
 * is freed after an RCU grace period.
 */
void mctp_dev_put(struct mctp_dev *mdev)
{
        if (!mdev)
                return;

        if (!refcount_dec_and_test(&mdev->refs))
                return;

        kfree(mdev->addrs);
        dev_put(mdev->dev);
        kfree_rcu(mdev, rcu);
}

/* Undo mctp_dev_set_key(): give the driver a chance to release its flow
 * state for @key (if it provides a release_flow op), clear key->dev and drop
 * the device reference. Does nothing when @dev is NULL.
 * Annotated as requiring key->lock to be held.
 */
void mctp_dev_release_key(struct mctp_dev *dev, struct mctp_sk_key *key)
        __must_hold(&key->lock)
{
        const struct mctp_netdev_ops *ops;

        if (!dev)
                return;

        ops = dev->ops;
        if (ops && ops->release_flow)
                ops->release_flow(dev, key);

        key->dev = NULL;
        mctp_dev_put(dev);
}

/* Bind @key to @dev: takes a reference on @dev and stores it in key->dev.
 * Annotated as requiring key->lock to be held.
 */
void mctp_dev_set_key(struct mctp_dev *dev, struct mctp_sk_key *key)
        __must_hold(&key->lock)
{
        mctp_dev_hold(dev);
        key->dev = dev;
}

/* Allocate MCTP state for @dev and attach it to the net_device.
 *
 * Must be called with RTNL held. The new mctp_dev starts with one reference
 * (owned by the net_device association) and itself holds a reference on
 * @dev.
 *
 * Returns the new mctp_dev, or ERR_PTR(-ENOMEM).
 */
static struct mctp_dev *mctp_add_dev(struct net_device *dev)
{
        struct mctp_dev *mdev;

        ASSERT_RTNL();

        mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
        if (!mdev)
                return ERR_PTR(-ENOMEM);

        spin_lock_init(&mdev->addrs_lock);

        mdev->net = mctp_default_net(dev_net(dev));

        refcount_set(&mdev->refs, 1);

        dev_hold(dev);
        mdev->dev = dev;

        /* Associate to the net_device last: once the pointer is published,
         * RCU readers may pick the mctp_dev up through __mctp_dev_get(), so
         * every field (mdev->dev in particular) must be set up before that.
         */
        rcu_assign_pointer(dev->mctp_ptr, mdev);

        return mdev;
}

/* rtnl_af_ops.fill_link_af: add the MCTP network id and physical binding of
 * @dev to @skb. @ext_filter_mask is not used.
 *
 * Returns 0, -ENODATA when @dev has no MCTP state, or -EMSGSIZE when an
 * attribute does not fit.
 */
static int mctp_fill_link_af(struct sk_buff *skb,
                             const struct net_device *dev, u32 ext_filter_mask)
{
        struct mctp_dev *mdev = mctp_dev_get_rtnl(dev);

        if (!mdev)
                return -ENODATA;

        if (nla_put_u32(skb, IFLA_MCTP_NET, mdev->net) ||
            nla_put_u8(skb, IFLA_MCTP_PHYS_BINDING, mdev->binding))
                return -EMSGSIZE;

        return 0;
}

/* rtnl_af_ops.get_link_af_size: space needed by mctp_fill_link_af() for
 * @dev, or 0 when the device has no (live) MCTP state.
 * @ext_filter_mask is not used.
 */
static size_t mctp_get_link_af_size(const struct net_device *dev,
                                    u32 ext_filter_mask)
{
        struct mctp_dev *mdev;
        unsigned int size = 0;

        /* caller holds RCU */
        mdev = __mctp_dev_get(dev);
        if (!mdev)
                return 0;

        size += nla_total_size(4);      /* IFLA_MCTP_NET */
        size += nla_total_size(1);      /* IFLA_MCTP_PHYS_BINDING */

        /* the reference was only needed to check that the state exists */
        mctp_dev_put(mdev);

        return size;
}

/* Attribute policy for the nested AF_MCTP link attributes accepted by
 * mctp_set_link_af(): only the 32-bit network id is listed.
 */
static const struct nla_policy ifla_af_mctp_policy[IFLA_MCTP_MAX + 1] = {
        [IFLA_MCTP_NET]                = { .type = NLA_U32 },
};

/* rtnl_af_ops.set_link_af: apply AF_MCTP link attributes from @attr to @dev.
 *
 * Only IFLA_MCTP_NET is acted upon; it replaces the device's network id.
 * Parse errors are reported through @extack so that the requester receives
 * the detail of what was rejected.
 *
 * Returns 0 on success (including when @dev has no MCTP state), or the
 * attribute parsing error.
 */
static int mctp_set_link_af(struct net_device *dev, const struct nlattr *attr,
                            struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_MCTP_MAX + 1];
        struct mctp_dev *mdev;
        int rc;

        rc = nla_parse_nested(tb, IFLA_MCTP_MAX, attr, ifla_af_mctp_policy,
                              extack);
        if (rc)
                return rc;

        mdev = mctp_dev_get_rtnl(dev);
        if (!mdev)
                return 0;

        /* WRITE_ONCE: the value is stored with a single marked write */
        if (tb[IFLA_MCTP_NET])
                WRITE_ONCE(mdev->net, nla_get_u32(tb[IFLA_MCTP_NET]));

        return 0;
}

/* Matches netdev types that should have MCTP handling */
static bool mctp_known(struct net_device *dev)
{
        /* only register specific types (inc. NONE for TUN devices) */
        return dev->type == ARPHRD_MCTP ||
                   dev->type == ARPHRD_LOOPBACK ||
                   dev->type == ARPHRD_NONE;
}

/* Detach and release the MCTP state of @dev, if it has any: the pointer on
 * the net_device is cleared, routes and neighbours referring to the device
 * are removed, and the reference taken at attach time is dropped.
 */
static void mctp_unregister(struct net_device *dev)
{
        struct mctp_dev *mdev = mctp_dev_get_rtnl(dev);

        if (!mdev)
                return;

        /* clear the pointer first so no new lookups find the device */
        RCU_INIT_POINTER(mdev->dev->mctp_ptr, NULL);

        mctp_route_remove_dev(mdev);
        mctp_neigh_remove_dev(mdev);

        mctp_dev_put(mdev);
}

/* Attach MCTP state to @dev on registration, unless it already has some or
 * its type is not one MCTP handles. Returns 0 or the mctp_add_dev() error.
 */
static int mctp_register(struct net_device *dev)
{
        struct mctp_dev *mdev;

        /* nothing to do if already registered, or not a type we handle */
        if (rtnl_dereference(dev->mctp_ptr) || !mctp_known(dev))
                return 0;

        mdev = mctp_add_dev(dev);

        return IS_ERR(mdev) ? PTR_ERR(mdev) : 0;
}

/* Netdevice notifier: attach MCTP state when a device registers and release
 * it when the device unregisters. Other events are ignored.
 * Returns NOTIFY_OK, or the notifier encoding of a registration error.
 */
static int mctp_dev_notify(struct notifier_block *this, unsigned long event,
                           void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_REGISTER) {
                int rc = mctp_register(dev);

                if (rc)
                        return notifier_from_errno(rc);
        } else if (event == NETDEV_UNREGISTER) {
                mctp_unregister(dev);
        }

        return NOTIFY_OK;
}

/* Attach MCTP state (with driver @ops and physical @binding) to @dev and
 * register the net_device. Expects RTNL to be held, as mctp_add_dev() and
 * register_netdevice() require.
 *
 * If registration fails, the MCTP state attached here is released again so
 * that neither the mctp_dev nor its reference on @dev is left behind.
 *
 * Returns 0 on success or a negative errno.
 */
static int mctp_register_netdevice(struct net_device *dev,
                                   const struct mctp_netdev_ops *ops,
                                   enum mctp_phys_binding binding)
{
        struct mctp_dev *mdev;
        int rc;

        mdev = mctp_add_dev(dev);
        if (IS_ERR(mdev))
                return PTR_ERR(mdev);

        mdev->ops = ops;
        mdev->binding = binding;

        rc = register_netdevice(dev);
        if (rc) {
                /* mctp_unregister() is a no-op when the state has already
                 * been detached (e.g. by a NETDEV_UNREGISTER notification
                 * during rollback), so calling it here is safe either way.
                 */
                mctp_unregister(dev);
        }

        return rc;
}

/* Exported entry point for MCTP drivers: runs mctp_register_netdevice()
 * with rtnl_lock held around it and returns its result.
 */
int mctp_register_netdev(struct net_device *dev,
                         const struct mctp_netdev_ops *ops,
                         enum mctp_phys_binding binding)
{
        int rc;

        rtnl_lock();
        rc = mctp_register_netdevice(dev, ops, binding);
        rtnl_unlock();

        return rc;
}
EXPORT_SYMBOL_GPL(mctp_register_netdev);

/* Exported counterpart of mctp_register_netdev(): a thin wrapper around
 * unregister_netdev(). The MCTP state itself is released from the
 * NETDEV_UNREGISTER case of mctp_dev_notify().
 */
void mctp_unregister_netdev(struct net_device *dev)
{
        unregister_netdev(dev);
}
EXPORT_SYMBOL_GPL(mctp_unregister_netdev);

/* Per-address-family link attribute hooks for AF_MCTP. */
static struct rtnl_af_ops mctp_af_ops = {
        .family = AF_MCTP,
        .fill_link_af = mctp_fill_link_af,
        .get_link_af_size = mctp_get_link_af_size,
        .set_link_af = mctp_set_link_af,
};

/* Netdevice notifier block; events are handled by mctp_dev_notify(). */
static struct notifier_block mctp_dev_nb = {
        .notifier_call = mctp_dev_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY,
};

/* rtnetlink handlers for PF_MCTP address messages: add and delete are
 * "doit" handlers, get is served as a dump.
 */
static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = {
        {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_NEWADDR,
         .doit = mctp_rtm_newaddr},
        {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_DELADDR,
         .doit = mctp_rtm_deladdr},
        {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_GETADDR,
         .dumpit = mctp_dump_addrinfo},
};

/* Register the device-side pieces of MCTP: the netdevice notifier, the
 * AF_MCTP link attribute ops and the rtnetlink address handlers.
 *
 * Each step is checked; on failure everything registered so far is undone in
 * reverse order. Returns 0 on success or a negative errno.
 */
int __init mctp_device_init(void)
{
        int err;

        err = register_netdevice_notifier(&mctp_dev_nb);
        if (err)
                return err;

        err = rtnl_af_register(&mctp_af_ops);
        if (err)
                goto err_notifier;

        err = rtnl_register_many(mctp_device_rtnl_msg_handlers);
        if (err)
                goto err_af;

        return 0;
err_af:
        rtnl_af_unregister(&mctp_af_ops);
err_notifier:
        unregister_netdevice_notifier(&mctp_dev_nb);
        return err;
}

/* Undo mctp_device_init(), unregistering in the reverse order of
 * registration.
 */
void __exit mctp_device_exit(void)
{
        rtnl_unregister_many(mctp_device_rtnl_msg_handlers);
        rtnl_af_unregister(&mctp_af_ops);
        unregister_netdevice_notifier(&mctp_dev_nb);
}

































    2 









    2 


































































    2 
    2 

    2 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/*
 * llc_station.c - station component of LLC
 *
 * Copyright (c) 1997 by Procom Technology, Inc.
 *                  2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <net/llc.h>
#include <net/llc_sap.h>
#include <net/llc_conn.h>
#include <net/llc_c_ac.h>
#include <net/llc_s_ac.h>
#include <net/llc_c_ev.h>
#include <net/llc_c_st.h>
#include <net/llc_s_ev.h>
#include <net/llc_s_st.h>
#include <net/llc_pdu.h>

/* Event test: is @skb an XID command PDU addressed to the NULL DSAP?
 * Returns 1 when it is, 0 otherwise.
 */
static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb)
{
        struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);

        if (!LLC_PDU_IS_CMD(pdu))               /* must be a command PDU */
                return 0;
        if (!LLC_PDU_TYPE_IS_U(pdu))            /* must be a U type PDU */
                return 0;
        if (LLC_U_PDU_CMD(pdu) != LLC_1_PDU_CMD_XID)
                return 0;

        return !pdu->dsap;                      /* NULL DSAP value */
}

/* Event test: is @skb a TEST command PDU addressed to the NULL DSAP?
 * Returns 1 when it is, 0 otherwise.
 */
static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
{
        struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);

        if (!LLC_PDU_IS_CMD(pdu))               /* must be a command PDU */
                return 0;
        if (!LLC_PDU_TYPE_IS_U(pdu))            /* must be a U type PDU */
                return 0;
        if (LLC_U_PDU_CMD(pdu) != LLC_1_PDU_CMD_TEST)
                return 0;

        return !pdu->dsap;                      /* NULL DSAP */
}

/* Station action: answer the XID command in @skb with an XID response sent
 * back to the frame's source address and SSAP.
 *
 * Returns 0 when the response was handed to dev_queue_xmit(), 1 when no
 * frame could be allocated, or the llc_mac_hdr_init() error.
 */
static int llc_station_ac_send_xid_r(struct sk_buff *skb)
{
        u8 mac_da[ETH_ALEN], dsap;
        struct sk_buff *nskb;
        int rc = 1;

        nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U,
                               sizeof(struct llc_xid_info));
        if (!nskb)
                return rc;

        /* the requester's source MAC/SSAP become our destination */
        llc_pdu_decode_sa(skb, mac_da);
        llc_pdu_decode_ssap(skb, &dsap);
        llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
        llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127);

        rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da);
        if (unlikely(rc)) {
                kfree_skb(nskb);
                return rc;
        }

        dev_queue_xmit(nskb);
        return rc;
}

static int llc_station_ac_send_test_r(struct sk_buff *skb)
{
        u8 mac_da[ETH_ALEN], dsap;
        int rc = 1;
        u32 data_size;
        struct sk_buff *nskb;

        if (skb->mac_len < ETH_HLEN)
                goto out;

        /* The test request command is type U (llc_len = 3) */
        data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
        nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);

        if (!nskb)
                goto out;
        llc_pdu_decode_sa(skb, mac_da);
        llc_pdu_decode_ssap(skb, &dsap);
        llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
        llc_pdu_init_as_test_rsp(nskb, skb);
        rc = llc_mac_hdr_init(nskb, skb->dev->dev_addr, mac_da);
        if (unlikely(rc))
                goto free;
        dev_queue_xmit(nskb);
out:
        return rc;
free:
        kfree_skb(nskb);
        goto out;
}

/**
 *        llc_station_rcv - handle a PDU received for the station component
 *        @skb: received frame.
 *
 *        Replies to NULL-DSAP XID and TEST commands; any other frame is
 *        ignored. The received frame is freed in every case.
 */
static void llc_station_rcv(struct sk_buff *skb)
{
        if (llc_stat_ev_rx_null_dsap_xid_c(skb)) {
                llc_station_ac_send_xid_r(skb);
        } else if (llc_stat_ev_rx_null_dsap_test_c(skb)) {
                llc_station_ac_send_test_r(skb);
        }

        kfree_skb(skb);
}

/* Install llc_station_rcv() as the LLC station handler. */
void __init llc_station_init(void)
{
        llc_set_station_handler(llc_station_rcv);
}

/* Remove the LLC station handler again by setting it to NULL. */
void llc_station_exit(void)
{
        llc_set_station_handler(NULL);
}








































































































































































































































 1272 





















 1312 








  520 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_ALTERNATIVE_MACROS_H
#define __ASM_ALTERNATIVE_MACROS_H

#include <linux/const.h>
#include <vdso/bits.h>

#include <asm/cpucaps.h>
#include <asm/insn-def.h>

/*
 * Binutils 2.27.0 can't handle a 'UL' suffix on constants, so for the assembly
 * macros below we must use `(1 << ARM64_CB_SHIFT)`.
 */
#define ARM64_CB_SHIFT        15
#define ARM64_CB_BIT        BIT(ARM64_CB_SHIFT)

/* Build-time guard: capability numbers must stay below the callback bit. */
#if ARM64_NCAPS >= ARM64_CB_BIT
#error "cpucaps have overflown ARM64_CB_BIT"
#endif

#ifndef __ASSEMBLY__

#include <linux/stringify.h>

/*
 * Assembly text for one .altinstructions record: two 32-bit offsets relative
 * to the record itself (original code at 661, replacement at 663), the
 * 16-bit cpucap, and the two byte lengths.
 */
#define ALTINSTR_ENTRY(cpucap)                                                      \
        " .word 661b - .\n"                                /* label           */ \
        " .word 663f - .\n"                                /* new instruction */ \
        " .hword " __stringify(cpucap) "\n"                /* cpucap          */ \
        " .byte 662b-661b\n"                                /* source len      */ \
        " .byte 664f-663f\n"                                /* replacement len */

/*
 * Same record layout as ALTINSTR_ENTRY, except that the second word is the
 * relative offset of the callback symbol @cb instead of replacement code.
 */
#define ALTINSTR_ENTRY_CB(cpucap, cb)                                              \
        " .word 661b - .\n"                                /* label           */ \
        " .word " __stringify(cb) "- .\n"                /* callback        */ \
        " .hword " __stringify(cpucap) "\n"                /* cpucap          */ \
        " .byte 662b-661b\n"                                /* source len      */ \
        " .byte 664f-663f\n"                                /* replacement len */

/*
 * alternative assembly primitive:
 *
 * If any of these .org directive fail, it means that insn1 and insn2
 * don't have the same length. This used to be written as
 *
 * .if ((664b-663b) != (662b-661b))
 *         .error "Alternatives instruction length mismatch"
 * .endif
 *
 * but most assemblers die if insn1 or insn2 have a .inst. This should
 * be fixed in a binutils release posterior to 2.25.51.0.2 (anything
 * containing commit 4e4d08cf7399b606 or c1baaddf8861).
 *
 * Alternatives with callbacks do not generate replacement instructions.
 */
/*
 * Emits @oldinstr in place (labels 661..662), a record for @cpucap in
 * .altinstructions, and @newinstr in subsection 1 (labels 663..664). The two
 * .org directives fail to assemble unless both sequences have equal length.
 * Everything, including @oldinstr, is omitted unless @cfg_enabled is 1.
 */
#define __ALTERNATIVE_CFG(oldinstr, newinstr, cpucap, cfg_enabled)        \
        ".if "__stringify(cfg_enabled)" == 1\n"                                \
        "661:\n\t"                                                        \
        oldinstr "\n"                                                        \
        "662:\n"                                                        \
        ".pushsection .altinstructions,\"a\"\n"                                \
        ALTINSTR_ENTRY(cpucap)                                                \
        ".popsection\n"                                                        \
        ".subsection 1\n"                                                \
        "663:\n\t"                                                        \
        newinstr "\n"                                                        \
        "664:\n\t"                                                        \
        ".org        . - (664b-663b) + (662b-661b)\n\t"                        \
        ".org        . - (662b-661b) + (664b-663b)\n\t"                        \
        ".previous\n"                                                        \
        ".endif\n"

/*
 * Callback flavour of __ALTERNATIVE_CFG: emits @oldinstr and a record that
 * points at @cb. Labels 663 and 664 are placed back to back, so the recorded
 * replacement length is zero and no replacement code is generated.
 */
#define __ALTERNATIVE_CFG_CB(oldinstr, cpucap, cfg_enabled, cb)        \
        ".if "__stringify(cfg_enabled)" == 1\n"                                \
        "661:\n\t"                                                        \
        oldinstr "\n"                                                        \
        "662:\n"                                                        \
        ".pushsection .altinstructions,\"a\"\n"                                \
        ALTINSTR_ENTRY_CB(cpucap, cb)                                        \
        ".popsection\n"                                                        \
        "663:\n\t"                                                        \
        "664:\n\t"                                                        \
        ".endif\n"

/*
 * C-side helper behind ALTERNATIVE(): turns the config symbol @cfg into a
 * 0/1 enable value with IS_ENABLED(); any further arguments are ignored.
 */
#define _ALTERNATIVE_CFG(oldinstr, newinstr, cpucap, cfg, ...)        \
        __ALTERNATIVE_CFG(oldinstr, newinstr, cpucap, IS_ENABLED(cfg))

/*
 * Callback-based alternative, always enabled: the callback bit is OR-ed into
 * @cpucap to mark the record as referring to @cb rather than to code.
 */
#define ALTERNATIVE_CB(oldinstr, cpucap, cb) \
        __ALTERNATIVE_CFG_CB(oldinstr, (1 << ARM64_CB_SHIFT) | (cpucap), 1, cb)
#else

#include <asm/assembler.h>

/*
 * Emit one .altinstructions record: two 32-bit offsets relative to the
 * record, the 16-bit cpucap and the two byte lengths.
 */
.macro altinstruction_entry orig_offset alt_offset cpucap orig_len alt_len
        .word \orig_offset - .
        .word \alt_offset - .
        .hword (\cpucap)
        .byte \orig_len
        .byte \alt_len
.endm

/*
 * Single-instruction alternative: \insn1 is emitted in place, \insn2 goes to
 * subsection 1 and a record for \cap links the two. The .org pair fails to
 * assemble unless both have the same length. Nothing is emitted when
 * \enable is zero.
 */
.macro alternative_insn insn1, insn2, cap, enable = 1
        .if \enable
661:        \insn1
662:        .pushsection .altinstructions, "a"
        altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f
        .popsection
        .subsection 1
663:        \insn2
664:        .org        . - (664b-663b) + (662b-661b)
        .org        . - (662b-661b) + (664b-663b)
        .previous
        .endif
.endm

/*
 * Alternative sequences
 *
 * The code for the case where the capability is not present will be
 * assembled and linked as normal. There are no restrictions on this
 * code.
 *
 * The code for the case where the capability is present will be
 * assembled into a special section to be used for dynamic patching.
 * Code for that case must:
 *
 * 1. Be exactly the same length (in bytes) as the default code
 *    sequence.
 *
 * 2. Not contain a branch target that is used outside of the
 *    alternative sequence it is defined in (branches into an
 *    alternative sequence are not fixed up).
 */

/*
 * Begin an alternative code sequence.
 *
 * The code that follows (661..662) is the default, assembled in place; the
 * record is emitted up front using forward label references. Mode 0 is
 * remembered in .Lasm_alt_mode for alternative_else/alternative_endif.
 */
.macro alternative_if_not cap
        .set .Lasm_alt_mode, 0
        .pushsection .altinstructions, "a"
        altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f
        .popsection
661:
.endm

/*
 * Begin an alternative code sequence with the replacement first: the code
 * that follows (661..662) is placed in subsection 1 and recorded as the
 * replacement, while the in-place original is expected at 663..664.
 * Mode 1 is remembered in .Lasm_alt_mode.
 */
.macro alternative_if cap
        .set .Lasm_alt_mode, 1
        .pushsection .altinstructions, "a"
        altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f
        .popsection
        .subsection 1
        .align 2        /* So GAS knows label 661 is suitably aligned */
661:
.endm

/*
 * Begin a callback-based alternative: the record carries \cap with the
 * callback bit set, the offset of \cb in place of replacement code, and a
 * replacement length of 0. Closed by alternative_cb_end.
 */
.macro alternative_cb cap, cb
        .set .Lasm_alt_mode, 0
        .pushsection .altinstructions, "a"
        altinstruction_entry 661f, \cb, (1 << ARM64_CB_SHIFT) | \cap, 662f-661f, 0
        .popsection
661:
.endm

/*
 * Provide the other half of the alternative code sequence.
 *
 * Ends the first sequence (label 662) and switches sections for the second
 * one (label 663): into subsection 1 in mode 0, back to the previous section
 * in mode 1.
 */
.macro alternative_else
662:
        .if .Lasm_alt_mode==0
        .subsection 1
        .else
        .previous
        .endif
663:
.endm

/*
 * Complete an alternative code sequence.
 *
 * The .org pair fails to assemble unless both sequences have the same
 * length. In mode 0 the assembler is then returned to the previous section.
 */
.macro alternative_endif
664:
        .org        . - (664b-663b) + (662b-661b)
        .org        . - (662b-661b) + (664b-663b)
        .if .Lasm_alt_mode==0
        .previous
        .endif
.endm

/*
 * Callback-based alternative epilogue: places label 662, which marks the end
 * of the original sequence referenced by alternative_cb.
 */
.macro alternative_cb_end
662:
.endm

/*
 * Provides a trivial alternative or default sequence consisting solely
 * of NOPs. The number of NOPs is chosen automatically to match the
 * previous case.
 */
.macro alternative_else_nop_endif
alternative_else
        /* (662b-661b) is the byte length of the sequence just closed */
        nops        (662b-661b) / AARCH64_INSN_SIZE
alternative_endif
.endm

/*
 * Assembly-side helper behind ALTERNATIVE(): maps onto alternative_insn with
 * IS_ENABLED(cfg) as the enable value; any further arguments are ignored.
 */
#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...)        \
        alternative_insn insn1, insn2, cap, IS_ENABLED(cfg)

#endif  /*  __ASSEMBLY__  */

/*
 * Usage: asm(ALTERNATIVE(oldinstr, newinstr, cpucap));
 *
 * Usage: asm(ALTERNATIVE(oldinstr, newinstr, cpucap, CONFIG_FOO));
 * N.B. If CONFIG_FOO is specified, but not selected, the whole block
 *      will be omitted, including oldinstr.
 */
/*
 * The trailing 1 is appended after the caller's arguments so that it lands
 * in the cfg position when no config symbol is given, and is swallowed by
 * the variadic tail when one is.
 */
#define ALTERNATIVE(oldinstr, newinstr, ...)   \
        _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1)

#ifndef __ASSEMBLY__

#include <linux/types.h>

/*
 * Test @cpucap through a runtime-patched branch.
 *
 * Returns false straight away when cpucap_is_possible() rules the capability
 * out. Otherwise the default instruction is a branch to l_no (return false);
 * once the alternative has been applied execution falls through and the
 * function returns true. In the BUILD_VDSO case the replacement is a literal
 * "nop"; otherwise it is left to the alt_cb_patch_nops callback (presumably
 * writing NOPs over the branch; its definition is not in this file).
 */
static __always_inline bool
alternative_has_cap_likely(const unsigned long cpucap)
{
        if (!cpucap_is_possible(cpucap))
                return false;

        asm goto(
#ifdef BUILD_VDSO
        ALTERNATIVE("b        %l[l_no]", "nop", %[cpucap])
#else
        ALTERNATIVE_CB("b        %l[l_no]", %[cpucap], alt_cb_patch_nops)
#endif
        :
        : [cpucap] "i" (cpucap)
        :
        : l_no);

        return true;
l_no:
        return false;
}

/*
 * Test @cpucap through a runtime-patched branch, with the "absent" case as
 * the straight-line path.
 *
 * Returns false straight away when cpucap_is_possible() rules the capability
 * out. Otherwise the default instruction is a nop, so the function falls
 * through and returns false; the replacement is a branch to l_yes, which
 * returns true.
 */
static __always_inline bool
alternative_has_cap_unlikely(const unsigned long cpucap)
{
        if (!cpucap_is_possible(cpucap))
                return false;

        asm goto(
        ALTERNATIVE("nop", "b        %l[l_yes]", %[cpucap])
        :
        : [cpucap] "i" (cpucap)
        :
        : l_yes);

        return false;
l_yes:
        return true;
}

#endif /* __ASSEMBLY__ */

#endif /* __ASM_ALTERNATIVE_MACROS_H */





































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Fast and scalable bitmaps.
 *
 * Copyright (C) 2016 Facebook
 * Copyright (C) 2013-2014 Jens Axboe
 */

#ifndef __LINUX_SCALE_BITMAP_H
#define __LINUX_SCALE_BITMAP_H

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/minmax.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/types.h>
#include <linux/wait.h>

struct seq_file;

/**
 * struct sbitmap_word - Word in a &struct sbitmap.
 */
struct sbitmap_word {
        /**
         * @word: word holding free bits
         *
         * The iterators in this header treat a bit as set only while it is
         * set here and not set in @cleared.
         */
        unsigned long word;

        /**
         * @cleared: word holding cleared bits
         *
         * sbitmap_deferred_clear_bit() sets bits here instead of clearing
         * them in @word. Cacheline-aligned on SMP builds.
         */
        unsigned long cleared ____cacheline_aligned_in_smp;

        /**
         * @swap_lock: serializes simultaneous updates of ->word and ->cleared
         */
        raw_spinlock_t swap_lock;
} ____cacheline_aligned_in_smp;

/**
 * struct sbitmap - Scalable bitmap.
 *
 * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This
 * trades off higher memory usage for better scalability.
 */
struct sbitmap {
        /**
         * @depth: Number of bits used in the whole bitmap.
         */
        unsigned int depth;

        /**
         * @shift: log2(number of bits used per word)
         */
        unsigned int shift;

        /**
         * @map_nr: Number of words (cachelines) being used for the bitmap.
         */
        unsigned int map_nr;

        /**
         * @round_robin: Allocate bits in strict round-robin order.
         */
        bool round_robin;

        /**
         * @map: Allocated bitmap.
         */
        struct sbitmap_word *map;

        /**
         * @alloc_hint: Cache of last successfully allocated or freed bit.
         *
         * This is per-cpu, which allows multiple users to stick to different
         * cachelines until the map is exhausted. May be NULL, in which case
         * sbitmap_put() does not record a hint.
         */
        unsigned int __percpu *alloc_hint;
};

/*
 * SBQ_WAIT_QUEUES must remain a power of two: sbq_index_inc() wraps the wait
 * queue index with a mask of (SBQ_WAIT_QUEUES - 1).
 */
#define SBQ_WAIT_QUEUES 8
/* NOTE(review): presumably the default/upper bound for wake_batch - confirm. */
#define SBQ_WAKE_BATCH 8

/**
 * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue.
 */
struct sbq_wait_state {
        /**
         * @wait: Wait queue.
         *
         * The enclosing structure is cacheline-aligned on SMP builds.
         */
        wait_queue_head_t wait;
} ____cacheline_aligned_in_smp;

/**
 * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free
 * bits.
 *
 * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to
 * avoid contention on the wait queue spinlock. This ensures that we don't hit a
 * scalability wall when we run out of free bits and have to start putting tasks
 * to sleep.
 */
struct sbitmap_queue {
        /**
         * @sb: Scalable bitmap.
         */
        struct sbitmap sb;

        /**
         * @wake_batch: Number of bits which must be freed before we wake up any
         * waiters.
         */
        unsigned int wake_batch;

        /**
         * @wake_index: Next wait queue in @ws to wake up.
         */
        atomic_t wake_index;

        /**
         * @ws: Wait queues.
         */
        struct sbq_wait_state *ws;

        /**
         * @ws_active: Count of currently active @ws wait queues.
         */
        atomic_t ws_active;

        /**
         * @min_shallow_depth: The minimum shallow depth which may be passed to
         * sbitmap_queue_get_shallow()
         */
        unsigned int min_shallow_depth;

        /**
         * @completion_cnt: Number of bits cleared passed to the
         * wakeup function.
         */
        atomic_t completion_cnt;

        /**
         * @wakeup_cnt: Number of thread wake ups issued.
         */
        atomic_t wakeup_cnt;
};

/**
 * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node.
 * @sb: Bitmap to initialize.
 * @depth: Number of bits to allocate.
 * @shift: Use 2^@shift bits per word in the bitmap; if a negative number is
 *         given, a good default is chosen.
 * @flags: Allocation flags.
 * @node: Memory node to allocate on.
 * @round_robin: If true, be stricter about allocation order; always allocate
 *               starting from the last allocated bit. This is less efficient
 *               than the default behavior (false).
 * @alloc_hint: If true, apply percpu hint for where to start searching for
 *              a free bit.
 *
 * Return: Zero on success or negative errno on failure.
 */
int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift,
                      gfp_t flags, int node, bool round_robin, bool alloc_hint);

/* sbitmap internal helper */
static inline unsigned int __map_depth(const struct sbitmap *sb, int index)
{
        /* Every word but the last one holds the full 2^shift bits. */
        if (index != sb->map_nr - 1)
                return 1U << sb->shift;

        /* The last word only holds what is left of the total depth. */
        return sb->depth - (index << sb->shift);
}

/**
 * sbitmap_free() - Free memory used by a &struct sbitmap.
 * @sb: Bitmap to free.
 */
static inline void sbitmap_free(struct sbitmap *sb)
{
        /*
         * Release both allocations and drop the stale pointers afterwards, so
         * that a repeated sbitmap_free(), or a late sbitmap_put() (which tests
         * ->alloc_hint before writing through it), cannot touch freed memory.
         */
        free_percpu(sb->alloc_hint);
        sb->alloc_hint = NULL;
        kvfree(sb->map);
        sb->map = NULL;
}

/**
 * sbitmap_resize() - Resize a &struct sbitmap.
 * @sb: Bitmap to resize.
 * @depth: New number of bits to resize to.
 *
 * Doesn't reallocate anything. It's up to the caller to ensure that the new
 * depth doesn't exceed the depth that the sb was initialized with.
 */
void sbitmap_resize(struct sbitmap *sb, unsigned int depth);

/**
 * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap.
 * @sb: Bitmap to allocate from.
 *
 * This operation provides acquire barrier semantics if it succeeds.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_get(struct sbitmap *sb);

/**
 * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap,
 * limiting the depth used from each word.
 * @sb: Bitmap to allocate from.
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 *
 * This rather specific operation allows for having multiple users with
 * different allocation limits. E.g., there can be a high-priority class that
 * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow()
 * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority
 * class can only allocate half of the total bits in the bitmap, preventing it
 * from starving out the high-priority class.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_get_shallow(struct sbitmap *sb, unsigned long shallow_depth);

/**
 * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap.
 * @sb: Bitmap to check.
 *
 * Return: true if any bit in the bitmap is set, false otherwise.
 */
bool sbitmap_any_bit_set(const struct sbitmap *sb);

/*
 * Split an absolute bit number into the index of the word that holds it
 * (SB_NR_TO_INDEX) and the bit position inside that word (SB_NR_TO_BIT).
 */
#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))

/*
 * Callback type for the sbitmap_for_each_set() iterators: receives the bitmap,
 * the number of a set bit and the caller's data pointer. Returning false stops
 * the iteration.
 */
typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *);

/**
 * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
 * @start: Where to start the iteration.
 * @sb: Bitmap to iterate over.
 * @fn: Callback. Should return true to continue or false to break early.
 * @data: Pointer to pass to callback.
 *
 * This is inline even though it's non-trivial so that the function calls to the
 * callback will hopefully get optimized away.
 */
static inline void __sbitmap_for_each_set(struct sbitmap *sb,
                                          unsigned int start,
                                          sb_for_each_fn fn, void *data)
{
        unsigned int index;
        unsigned int nr;
        /* Number of bit positions covered so far; the walk ends at sb->depth. */
        unsigned int scanned = 0;

        /* An out-of-range start falls back to the beginning of the bitmap. */
        if (start >= sb->depth)
                start = 0;
        /* Split the starting bit into a word index and a bit offset in it. */
        index = SB_NR_TO_INDEX(sb, start);
        nr = SB_NR_TO_BIT(sb, start);

        while (scanned < sb->depth) {
                unsigned long word;
                /*
                 * Bits to cover in this word: what is left of the word past
                 * nr, capped by what is left of the whole walk. The cap
                 * matters when the walk wraps back into the starting word.
                 */
                unsigned int depth = min_t(unsigned int,
                                           __map_depth(sb, index) - nr,
                                           sb->depth - scanned);

                scanned += depth;
                /* Local snapshot of the bits that are set and not cleared. */
                word = sb->map[index].word & ~sb->map[index].cleared;
                if (!word)
                        goto next;

                /*
                 * On the first iteration of the outer loop, we need to add the
                 * bit offset back to the size of the word for find_next_bit().
                 * On all other iterations, nr is zero, so this is a noop.
                 */
                depth += nr;
                while (1) {
                        nr = find_next_bit(&word, depth, nr);
                        if (nr >= depth)
                                break;
                        /* The callback gets the absolute bit number. */
                        if (!fn(sb, (index << sb->shift) + nr, data))
                                return;

                        nr++;
                }
next:
                /* Later words start at bit 0; wrap around after the last word. */
                nr = 0;
                if (++index >= sb->map_nr)
                        index = 0;
        }
}

/**
 * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap.
 * @sb: Bitmap to iterate over.
 * @fn: Callback. Should return true to continue or false to break early.
 * @data: Pointer to pass to callback.
 */
static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn,
                                        void *data)
{
        /* A full walk is simply the generic iterator started at bit 0. */
        const unsigned int first_bit = 0;

        __sbitmap_for_each_set(sb, first_bit, fn, data);
}

/* Return the address of the ->word that contains absolute bit @bitnr. */
static inline unsigned long *__sbitmap_word(struct sbitmap *sb,
                                            unsigned int bitnr)
{
        unsigned int word_idx = SB_NR_TO_INDEX(sb, bitnr);

        return &sb->map[word_idx].word;
}

/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */

/* Set bit @bitnr in the ->word that holds it, using set_bit(). */
static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);

        set_bit(SB_NR_TO_BIT(sb, bitnr), word);
}

/* Clear bit @bitnr in the ->word that holds it, using clear_bit(). */
static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);

        clear_bit(SB_NR_TO_BIT(sb, bitnr), word);
}

/*
 * This one is special, since it doesn't actually clear the bit, rather it
 * sets the corresponding bit in the ->cleared mask instead. Paired with
 * the caller doing sbitmap_deferred_clear() if a given index is full, which
 * will clear the previously freed entries in the corresponding ->word.
 */
static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned int word_idx = SB_NR_TO_INDEX(sb, bitnr);
        unsigned int bit = SB_NR_TO_BIT(sb, bitnr);

        /* Record the free in ->cleared; ->word itself is left untouched here. */
        set_bit(bit, &sb->map[word_idx].cleared);
}

/*
 * Counterpart of sbitmap_get(): marks the bit as cleared (deferred) and, when
 * applicable, records it as the per-cpu allocation hint.
 */
static inline void sbitmap_put(struct sbitmap *sb, unsigned int bitnr)
{
        sbitmap_deferred_clear_bit(sb, bitnr);

        /*
         * The hint is only recorded when hints are allocated, round-robin
         * mode is off and the bit lies inside the current depth.
         */
        if (!likely(sb->alloc_hint && !sb->round_robin && bitnr < sb->depth))
                return;

        *raw_cpu_ptr(sb->alloc_hint) = bitnr;
}

/* Test bit @bitnr in the ->word that holds it; ->cleared is not consulted. */
static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr)
{
        unsigned long *word = __sbitmap_word(sb, bitnr);

        return test_bit(SB_NR_TO_BIT(sb, bitnr), word);
}

/*
 * Pick the log2 of the number of bits to use per word for a bitmap of @depth
 * bits, starting from a full machine word.
 */
static inline int sbitmap_calculate_shift(unsigned int depth)
{
        int log2_bits = ilog2(BITS_PER_LONG);

        /*
         * With fewer than 4 bits there is nothing worth spreading out; it is
         * not going to work optimally anyway, so keep the default.
         */
        if (depth < 4)
                return log2_bits;

        /*
         * Small bitmap: shrink the number of bits per word so that the bits
         * are spread over a few cachelines, at least.
         */
        while ((4U << log2_bits) > depth)
                log2_bits--;

        return log2_bits;
}

/**
 * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file.
 * @sb: Bitmap to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The format may change at any time.
 */
void sbitmap_show(struct sbitmap *sb, struct seq_file *m);


/**
 * sbitmap_weight() - Return the number of bits in a &struct sbitmap that are
 * set and not cleared.
 * @sb: Bitmap to check.
 *
 * Return: Number of bits that are set and not cleared.
 */
unsigned int sbitmap_weight(const struct sbitmap *sb);

/**
 * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct
 * seq_file.
 * @sb: Bitmap to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The output isn't guaranteed to be internally
 * consistent.
 */
void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m);

/**
 * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific
 * memory node.
 * @sbq: Bitmap queue to initialize.
 * @depth: See sbitmap_init_node().
 * @shift: See sbitmap_init_node().
 * @round_robin: See sbitmap_init_node().
 * @flags: Allocation flags.
 * @node: Memory node to allocate on.
 *
 * Return: Zero on success or negative errno on failure.
 */
int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
                            int shift, bool round_robin, gfp_t flags, int node);

/**
 * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue.
 *
 * @sbq: Bitmap queue to free.
 */
static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
{
        struct sbq_wait_state *queues = sbq->ws;

        /* Wait queues first, then the embedded bitmap. */
        kfree(queues);
        sbitmap_free(&sbq->sb);
}

/**
 * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
 * @sbq: Bitmap queue to recalculate wake batch.
 * @users: Number of shares.
 *
 * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
 * by depth. This interface is for HCTX shared tags or queue shared tags.
 */
void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
                                            unsigned int users);

/**
 * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
 * @sbq: Bitmap queue to resize.
 * @depth: New number of bits to resize to.
 *
 * Like sbitmap_resize(), this doesn't reallocate anything. It has to do
 * some extra work on the &struct sbitmap_queue, so it's not safe to just
 * resize the underlying &struct sbitmap.
 */
void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth);

/**
 * __sbitmap_queue_get() - Try to allocate a free bit from a &struct
 * sbitmap_queue with preemption already disabled.
 * @sbq: Bitmap queue to allocate from.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int __sbitmap_queue_get(struct sbitmap_queue *sbq);

/**
 * __sbitmap_queue_get_batch() - Try to allocate a batch of free bits
 * @sbq: Bitmap queue to allocate from.
 * @nr_tags: number of tags requested
 * @offset: offset to add to returned bits
 *
 * Return: Mask of allocated tags, 0 if none are found. Each tag allocated is
 * a bit in the mask returned, and the caller must add @offset to the value to
 * get the absolute tag value.
 */
unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags,
                                        unsigned int *offset);

/**
 * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct
 * sbitmap_queue, limiting the depth used from each word, with preemption
 * already disabled.
 * @sbq: Bitmap queue to allocate from.
 * @shallow_depth: The maximum number of bits to allocate from a single word.
 * See sbitmap_get_shallow().
 *
 * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after
 * initializing @sbq.
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq,
                              unsigned int shallow_depth);

/**
 * sbitmap_queue_get() - Try to allocate a free bit from a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to allocate from.
 * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to
 *       sbitmap_queue_clear()).
 *
 * Return: Non-negative allocated bit number if successful, -1 otherwise.
 */
static inline int sbitmap_queue_get(struct sbitmap_queue *sbq,
                                    unsigned int *cpu)
{
        int bit;

        /* The allocation runs between get_cpu() and the matching put_cpu(). */
        *cpu = get_cpu();
        bit = __sbitmap_queue_get(sbq);
        put_cpu();

        return bit;
}

/**
 * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the
 * minimum shallow depth that will be used.
 * @sbq: Bitmap queue in question.
 * @min_shallow_depth: The minimum shallow depth that will be passed to
 * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow().
 *
 * sbitmap_queue_clear() batches wakeups as an optimization. The batch size
 * depends on the depth of the bitmap. Since the shallow allocation functions
 * effectively operate with a different depth, the shallow depth must be taken
 * into account when calculating the batch size. This function must be called
 * with the minimum shallow depth that will be used. Failure to do so can result
 * in missed wakeups.
 */
void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
                                     unsigned int min_shallow_depth);

/**
 * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a
 * &struct sbitmap_queue.
 * @sbq: Bitmap to free from.
 * @nr: Bit number to free.
 * @cpu: CPU the bit was allocated on.
 */
void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr,
                         unsigned int cpu);

/**
 * sbitmap_queue_clear_batch() - Free a batch of allocated bits on a
 * &struct sbitmap_queue.
 * @sbq: Bitmap to free from.
 * @offset: offset for each tag in array
 * @tags: array of tags
 * @nr_tags: number of tags in array
 */
void sbitmap_queue_clear_batch(struct sbitmap_queue *sbq, int offset,
                                int *tags, int nr_tags);

/* Advance a wait queue index by one, wrapping within SBQ_WAIT_QUEUES. */
static inline int sbq_index_inc(int index)
{
        const int wrap_mask = SBQ_WAIT_QUEUES - 1;

        return (index + 1) & wrap_mask;
}

/*
 * Try once to advance *@index to the next wait queue slot. The result of the
 * compare-and-exchange is ignored, so a concurrent update simply wins.
 */
static inline void sbq_index_atomic_inc(atomic_t *index)
{
        int seen = atomic_read(index);

        atomic_cmpxchg(index, seen, sbq_index_inc(seen));
}

/**
 * sbq_wait_ptr() - Get the next wait queue to use for a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to wait on.
 * @wait_index: A counter per "user" of @sbq.
 */
static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq,
                                                  atomic_t *wait_index)
{
        /* Pick the queue the counter currently points at ... */
        struct sbq_wait_state *picked = sbq->ws + atomic_read(wait_index);

        /* ... then move the counter on for the next call. */
        sbq_index_atomic_inc(wait_index);

        return picked;
}

/**
 * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct
 * sbitmap_queue.
 * @sbq: Bitmap queue to wake up.
 */
void sbitmap_queue_wake_all(struct sbitmap_queue *sbq);

/**
 * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue
 * on a &struct sbitmap_queue.
 * @sbq: Bitmap queue to wake up.
 * @nr: Number of bits cleared.
 */
void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr);

/**
 * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct
 * seq_file.
 * @sbq: Bitmap queue to show.
 * @m: struct seq_file to write to.
 *
 * This is intended for debugging. The format may change at any time.
 */
void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m);

/*
 * Wait entry used with the sbitmap wait helpers declared in this header; it
 * pairs a regular wait queue entry with the queue it is accounted against.
 */
struct sbq_wait {
        struct sbitmap_queue *sbq;        /* if set, sbq_wait is accounted */
        struct wait_queue_entry wait;
};

/*
 * Define a struct sbq_wait called @name: not accounted yet (.sbq is NULL),
 * with .private set to current, autoremove_wake_function as the wake
 * callback and an initialised (empty) list entry.
 */
#define DEFINE_SBQ_WAIT(name)                                                        \
        struct sbq_wait name = {                                                \
                .sbq = NULL,                                                        \
                .wait = {                                                        \
                        .private        = current,                                \
                        .func                = autoremove_wake_function,                \
                        .entry                = LIST_HEAD_INIT((name).wait.entry),        \
                }                                                                \
        }

/*
 * Wrapper around prepare_to_wait_exclusive(), which maintains some extra
 * internal state.
 */
void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq,
                                struct sbq_wait_state *ws,
                                struct sbq_wait *sbq_wait, int state);

/*
 * Must be paired with sbitmap_prepare_to_wait().
 */
void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws,
                                struct sbq_wait *sbq_wait);

/*
 * Wrapper around add_wait_queue(), which maintains some extra internal state
 */
void sbitmap_add_wait_queue(struct sbitmap_queue *sbq,
                            struct sbq_wait_state *ws,
                            struct sbq_wait *sbq_wait);

/*
 * Must be paired with sbitmap_add_wait_queue()
 */
void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait);

#endif /* __LINUX_SCALE_BITMAP_H */






























































































   16 




  610 




















  384 





  230 
  384 




    1 






    1 













  431 






  431 
















  261 








  261 


















  276 





























  173 


  173 
  173 








  173 














  155 
  156 




   18 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PAGE_REF_H
#define _LINUX_PAGE_REF_H

#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/tracepoint-defs.h>

DECLARE_TRACEPOINT(page_ref_set);
DECLARE_TRACEPOINT(page_ref_mod);
DECLARE_TRACEPOINT(page_ref_mod_and_test);
DECLARE_TRACEPOINT(page_ref_mod_and_return);
DECLARE_TRACEPOINT(page_ref_mod_unless);
DECLARE_TRACEPOINT(page_ref_freeze);
DECLARE_TRACEPOINT(page_ref_unfreeze);

#ifdef CONFIG_DEBUG_PAGE_REF

/*
 * Ideally we would want to use the trace_<tracepoint>_enabled() helper
 * functions. But due to include header file issues, that is not
 * feasible. Instead we have to open code the static key functions.
 *
 * See trace_##name##_enabled(void) in include/linux/tracepoint.h
 */
#define page_ref_tracepoint_active(t) tracepoint_enabled(t)

extern void __page_ref_set(struct page *page, int v);
extern void __page_ref_mod(struct page *page, int v);
extern void __page_ref_mod_and_test(struct page *page, int v, int ret);
extern void __page_ref_mod_and_return(struct page *page, int v, int ret);
extern void __page_ref_mod_unless(struct page *page, int v, int u);
extern void __page_ref_freeze(struct page *page, int v, int ret);
extern void __page_ref_unfreeze(struct page *page, int v);

#else

#define page_ref_tracepoint_active(t) false

/* No-op stand-ins for the tracing hooks when CONFIG_DEBUG_PAGE_REF is off. */
static inline void __page_ref_set(struct page *page, int v) { }
static inline void __page_ref_mod(struct page *page, int v) { }
static inline void __page_ref_mod_and_test(struct page *page, int v, int ret) { }
static inline void __page_ref_mod_and_return(struct page *page, int v, int ret) { }
static inline void __page_ref_mod_unless(struct page *page, int v, int u) { }
static inline void __page_ref_freeze(struct page *page, int v, int ret) { }
static inline void __page_ref_unfreeze(struct page *page, int v) { }

#endif

/* Read the value currently stored in @page's _refcount. */
static inline int page_ref_count(const struct page *page)
{
        int refs = atomic_read(&page->_refcount);

        return refs;
}

/**
 * folio_ref_count - The reference count on this folio.
 * @folio: The folio.
 *
 * The refcount is usually incremented by calls to folio_get() and
 * decremented by calls to folio_put().  Some typical users of the
 * folio refcount:
 *
 * - Each reference from a page table
 * - The page cache
 * - Filesystem private data
 * - The LRU list
 * - Pipes
 * - Direct IO which references this page in the process address space
 *
 * Return: The number of references to this folio.
 */
static inline int folio_ref_count(const struct folio *folio)
{
        /* The count is the one kept in the folio's embedded struct page. */
        const struct page *page = &folio->page;

        return page_ref_count(page);
}

/* Reference count of the folio that page_folio() returns for @page. */
static inline int page_count(const struct page *page)
{
        const struct folio *folio = page_folio(page);

        return folio_ref_count(folio);
}

/* Store @v into @page's _refcount and report it to the page_ref_set hook. */
static inline void set_page_count(struct page *page, int v)
{
        atomic_set(&page->_refcount, v);

        if (!page_ref_tracepoint_active(page_ref_set))
                return;
        __page_ref_set(page, v);
}

/* Folio flavour of set_page_count(), applied to the embedded struct page. */
static inline void folio_set_count(struct folio *folio, int v)
{
        struct page *page = &folio->page;

        set_page_count(page, v);
}

/*
 * Set up the page count before the page is freed into the page allocator for
 * the first time (boot or memory hotplug).
 */
static inline void init_page_count(struct page *page)
{
        const int initial_refs = 1;

        set_page_count(page, initial_refs);
}

/* Add @nr to @page's _refcount and report the change to the page_ref_mod hook. */
static inline void page_ref_add(struct page *page, int nr)
{
        atomic_add(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, nr);
}

/* Folio flavour of page_ref_add(), applied to the embedded struct page. */
static inline void folio_ref_add(struct folio *folio, int nr)
{
        struct page *page = &folio->page;

        page_ref_add(page, nr);
}

/*
 * Subtract @nr from @page's _refcount; the page_ref_mod hook sees the change
 * as a delta of -@nr.
 */
static inline void page_ref_sub(struct page *page, int nr)
{
        atomic_sub(nr, &page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, -nr);
}

/* Folio flavour of page_ref_sub(), applied to the embedded struct page. */
static inline void folio_ref_sub(struct folio *folio, int nr)
{
        struct page *page = &folio->page;

        page_ref_sub(page, nr);
}

/*
 * Subtract @nr from @folio's _refcount and return what atomic_sub_return()
 * reports. The page_ref_mod_and_return hook gets the delta and that value.
 */
static inline int folio_ref_sub_return(struct folio *folio, int nr)
{
        int remaining = atomic_sub_return(nr, &folio->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(&folio->page, -nr, remaining);

        return remaining;
}

/* Increment @page's _refcount; the page_ref_mod hook sees a delta of 1. */
static inline void page_ref_inc(struct page *page)
{
        atomic_inc(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, 1);
}

/* Folio flavour of page_ref_inc(), applied to the embedded struct page. */
static inline void folio_ref_inc(struct folio *folio)
{
        struct page *page = &folio->page;

        page_ref_inc(page);
}

/* Decrement @page's _refcount; the page_ref_mod hook sees a delta of -1. */
static inline void page_ref_dec(struct page *page)
{
        atomic_dec(&page->_refcount);

        if (!page_ref_tracepoint_active(page_ref_mod))
                return;
        __page_ref_mod(page, -1);
}

/* Folio flavour of page_ref_dec(), applied to the embedded struct page. */
static inline void folio_ref_dec(struct folio *folio)
{
        struct page *page = &folio->page;

        page_ref_dec(page);
}

/*
 * Subtract @nr from @page's _refcount and return the result of
 * atomic_sub_and_test(). The page_ref_mod_and_test hook gets the delta and
 * that result.
 */
static inline int page_ref_sub_and_test(struct page *page, int nr)
{
        int test_result = atomic_sub_and_test(nr, &page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -nr, test_result);

        return test_result;
}

/* Folio flavour of page_ref_sub_and_test(), applied to the embedded page. */
static inline int folio_ref_sub_and_test(struct folio *folio, int nr)
{
        struct page *page = &folio->page;

        return page_ref_sub_and_test(page, nr);
}

/*
 * Increment @page's _refcount and return what atomic_inc_return() reports.
 * The page_ref_mod_and_return hook gets a delta of 1 and that value.
 */
static inline int page_ref_inc_return(struct page *page)
{
        int refs = atomic_inc_return(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, 1, refs);

        return refs;
}

/* Folio flavour of page_ref_inc_return(), applied to the embedded page. */
static inline int folio_ref_inc_return(struct folio *folio)
{
        struct page *page = &folio->page;

        return page_ref_inc_return(page);
}

/*
 * Decrement @page's _refcount and return the result of atomic_dec_and_test().
 * The page_ref_mod_and_test hook gets a delta of -1 and that result.
 */
static inline int page_ref_dec_and_test(struct page *page)
{
        int test_result = atomic_dec_and_test(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_test))
                __page_ref_mod_and_test(page, -1, test_result);

        return test_result;
}

/* Folio flavour of page_ref_dec_and_test(), applied to the embedded page. */
static inline int folio_ref_dec_and_test(struct folio *folio)
{
        struct page *page = &folio->page;

        return page_ref_dec_and_test(page);
}

/*
 * Decrement @page's _refcount and return what atomic_dec_return() reports.
 * The page_ref_mod_and_return hook gets a delta of -1 and that value.
 */
static inline int page_ref_dec_return(struct page *page)
{
        int refs = atomic_dec_return(&page->_refcount);

        if (page_ref_tracepoint_active(page_ref_mod_and_return))
                __page_ref_mod_and_return(page, -1, refs);

        return refs;
}

/* Folio flavour of page_ref_dec_return(), applied to the embedded page. */
static inline int folio_ref_dec_return(struct folio *folio)
{
        struct page *page = &folio->page;

        return page_ref_dec_return(page);
}

/*
 * Add @nr to @page's _refcount via atomic_add_unless(..., @nr, @u), but only
 * if page_count_writable() allows it. Returns whether the add was performed;
 * the page_ref_mod_unless hook is told @nr and that outcome.
 */
static inline bool page_ref_add_unless(struct page *page, int nr, int u)
{
        bool added;

        rcu_read_lock();
        /* avoid writing to the vmemmap area being remapped */
        added = page_count_writable(page, u) &&
                atomic_add_unless(&page->_refcount, nr, u);
        rcu_read_unlock();

        if (page_ref_tracepoint_active(page_ref_mod_unless))
                __page_ref_mod_unless(page, nr, added);

        return added;
}

/* Folio flavour of page_ref_add_unless(), applied to the embedded page. */
static inline bool folio_ref_add_unless(struct folio *folio, int nr, int u)
{
        struct page *page = &folio->page;

        return page_ref_add_unless(page, nr, u);
}

/**
 * folio_try_get - Attempt to increase the refcount on a folio.
 * @folio: The folio.
 *
 * If you do not already have a reference to a folio, you can attempt to
 * get one using this function.  It may fail if, for example, the folio
 * has been freed since you found a pointer to it, or it is frozen for
 * the purposes of splitting or migration.
 *
 * Return: True if the reference count was successfully incremented.
 */
static inline bool folio_try_get(struct folio *folio)
{
        /* Take one reference, refusing when the count is currently 0. */
        const int nr = 1;
        const int unless = 0;

        return folio_ref_add_unless(folio, nr, unless);
}

/*
 * Like folio_try_get(), but attempts to add @count references at once. The
 * add is refused when the reference count is currently 0.
 */
static inline bool folio_ref_try_add(struct folio *folio, int count)
{
        const int unless = 0;

        return folio_ref_add_unless(folio, count, unless);
}

/*
 * Try to swap @page's _refcount from @count to 0 with atomic_cmpxchg().
 * Returns non-zero when the previous value was @count (the swap happened);
 * the page_ref_freeze hook is told @count and that outcome.
 */
static inline int page_ref_freeze(struct page *page, int count)
{
        int previous = atomic_cmpxchg(&page->_refcount, count, 0);
        int frozen = likely(previous == count);

        if (page_ref_tracepoint_active(page_ref_freeze))
                __page_ref_freeze(page, count, frozen);

        return frozen;
}

/* Folio flavour of page_ref_freeze(), applied to the embedded struct page. */
static inline int folio_ref_freeze(struct folio *folio, int count)
{
        struct page *page = &folio->page;

        return page_ref_freeze(page, count);
}

/*
 * Store @count into @page's _refcount with release ordering. The debug
 * checks expect the current count to be 0 and @count to be non-zero.
 */
static inline void page_ref_unfreeze(struct page *page, int count)
{
        VM_BUG_ON_PAGE(page_count(page) != 0, page);
        VM_BUG_ON(count == 0);

        atomic_set_release(&page->_refcount, count);

        if (!page_ref_tracepoint_active(page_ref_unfreeze))
                return;
        __page_ref_unfreeze(page, count);
}

/* Folio flavour of page_ref_unfreeze(), applied to the embedded struct page. */
static inline void folio_ref_unfreeze(struct folio *folio, int count)
{
        struct page *page = &folio->page;

        page_ref_unfreeze(page, count);
}
#endif





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
 * Copyright (c) 2004 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2004 Infinicon Corporation.  All rights reserved.
 * Copyright (c) 2004, 2020 Intel Corporation.  All rights reserved.
 * Copyright (c) 2004 Topspin Corporation.  All rights reserved.
 * Copyright (c) 2004 Voltaire Corporation.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2005, 2006, 2007 Cisco Systems.  All rights reserved.
 */

#ifndef IB_VERBS_H
#define IB_VERBS_H

#include <linux/ethtool.h>
#include <linux/types.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/rwsem.h>
#include <linux/workqueue.h>
#include <linux/irq_poll.h>
#include <uapi/linux/if_ether.h>
#include <net/ipv6.h>
#include <net/ip.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <linux/refcount.h>
#include <linux/if_link.h>
#include <linux/atomic.h>
#include <linux/mmu_notifier.h>
#include <linux/uaccess.h>
#include <linux/cgroup_rdma.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/dim.h>
#include <uapi/rdma/ib_user_verbs.h>
#include <rdma/rdma_counter.h>
#include <rdma/restrack.h>
#include <rdma/signature.h>
#include <uapi/rdma/rdma_user_ioctl.h>
#include <uapi/rdma/ib_user_ioctl_verbs.h>

/* Size limit for firmware version names; identical to ethtool's ETHTOOL_FWVERS_LEN. */
#define IB_FW_VERSION_NAME_MAX        ETHTOOL_FWVERS_LEN

/*
 * Forward declarations: only the struct tags are introduced here, no
 * definitions, so these types can be referred to by pointer.
 */
struct ib_umem_odp;
struct ib_uqp_object;
struct ib_usrq_object;
struct ib_uwq_object;
struct rdma_cm_id;
struct ib_port;
struct hw_stats_device_data;

/*
 * Workqueues shared through this header; the objects themselves are defined
 * in another translation unit.
 * NOTE(review): going by the names, ib_comp_wq and ib_comp_unbound_wq look
 * like completion-processing queues (the latter unbound) - confirm at the
 * definition site.
 */
extern struct workqueue_struct *ib_wq;
extern struct workqueue_struct *ib_comp_wq;
extern struct workqueue_struct *ib_comp_unbound_wq;

/* Forward declaration only; no definition is provided at this point. */
struct ib_ucq_object;

/*
 * Per-device message helpers, one per severity named in the function suffix
 * (emerg, alert, crit, err, warn, notice, info).
 *
 * Each takes the device, a printf-style format string and its variadic
 * arguments. __printf(2, 3) makes the compiler check the format (argument 2)
 * against the arguments starting at position 3; __cold marks the functions
 * as rarely executed. Only the prototypes are declared here.
 */
__printf(2, 3) __cold
void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_alert(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_crit(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_err(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_warn(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_notice(const struct ib_device *ibdev, const char *format, ...);
__printf(2, 3) __cold
void ibdev_info(const struct ib_device *ibdev, const char *format, ...);

/*
 * ibdev_dbg() - debug-level per-device message.
 *
 * When dynamic debug is available (CONFIG_DYNAMIC_DEBUG, or
 * CONFIG_DYNAMIC_DEBUG_CORE together with DYNAMIC_DEBUG_MODULE) this is a
 * macro that forwards to dynamic_ibdev_dbg(). Otherwise it is an empty inline
 * function, which still lets the compiler check the printf-style format.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define ibdev_dbg(__dev, format, args...)                       \
        dynamic_ibdev_dbg(__dev, format, ##args)
#else
__printf(2, 3) __cold
static inline
void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {}
#endif

/*
 * ibdev_level_ratelimited - rate-limited wrapper around an ibdev_<level>() call
 * @ibdev_level: logging function to invoke, e.g. ibdev_err
 * @ibdev: device handed through to @ibdev_level
 * @fmt: printf-style format string, followed by its arguments
 *
 * Every expansion declares its own static ratelimit state, initialised with
 * DEFAULT_RATELIMIT_INTERVAL and DEFAULT_RATELIMIT_BURST, and calls
 * @ibdev_level only when __ratelimit() returns non-zero. The do/while (0)
 * wrapper lets the macro be used as a single statement.
 */
#define ibdev_level_ratelimited(ibdev_level, ibdev, fmt, ...)           \
do {                                                                    \
        static DEFINE_RATELIMIT_STATE(_rs,                              \
                                      DEFAULT_RATELIMIT_INTERVAL,       \
                                      DEFAULT_RATELIMIT_BURST);         \
        if (__ratelimit(&_rs))                                          \
                ibdev_level(ibdev, fmt, ##__VA_ARGS__);                 \
} while (0)

/*
 * Rate-limited variants of the per-severity helpers. Each one expands to
 * ibdev_level_ratelimited() with the matching ibdev_<level>() function, so
 * every call site gets its own rate limit.
 */
#define ibdev_emerg_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_emerg, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_alert_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_alert, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_crit_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_crit, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_err_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_err, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_warn_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_warn, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_notice_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_notice, ibdev, fmt, ##__VA_ARGS__)
#define ibdev_info_ratelimited(ibdev, fmt, ...) \
        ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__)

/*
 * ibdev_dbg_ratelimited() - rate-limited debug-level per-device message.
 *
 * With dynamic debug available this is a macro with a per-call-site ratelimit
 * state and dynamic-debug descriptor; otherwise it is an empty inline
 * function that only keeps the printf-style format checking.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/*
 * The descriptor check comes first in the && so that __ratelimit() is not
 * consulted for a call site whose descriptor branch is off; this prevents
 * flooding the log with "callbacks suppressed" messages.
 */
#define ibdev_dbg_ratelimited(ibdev, fmt, ...)                          \
do {                                                                    \
        static DEFINE_RATELIMIT_STATE(_rs,                              \
                                      DEFAULT_RATELIMIT_INTERVAL,       \
                                      DEFAULT_RATELIMIT_BURST);         \
        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt);                 \
        if (DYNAMIC_DEBUG_BRANCH(descriptor) && __ratelimit(&_rs))      \
                __dynamic_ibdev_dbg(&descriptor, ibdev, fmt,            \
                                    ##__VA_ARGS__);                     \
} while (0)
#else
__printf(2, 3) __cold
static inline
void ibdev_dbg_ratelimited(const struct ib_device *ibdev, const char *format, ...) {}
#endif

/*
 * A 16-byte GID.  It can be accessed either as raw bytes or as two
 * big-endian 64-bit halves: the subnet prefix followed by the interface ID.
 */
union ib_gid {
        u8        raw[16];
        struct {
                __be64        subnet_prefix;
                __be64        interface_id;
        } global;
};

/* Defined elsewhere; presumably the all-zero GID - confirm at the definition. */
extern union ib_gid zgid;

/*
 * GID types.  The values mirror the IB_UVERBS_GID_TYPE_* constants;
 * IB_GID_TYPE_SIZE is kept as the last enumerator.
 */
enum ib_gid_type {
        IB_GID_TYPE_IB = IB_UVERBS_GID_TYPE_IB,
        IB_GID_TYPE_ROCE = IB_UVERBS_GID_TYPE_ROCE_V1,
        IB_GID_TYPE_ROCE_UDP_ENCAP = IB_UVERBS_GID_TYPE_ROCE_V2,
        IB_GID_TYPE_SIZE
};

/* UDP destination port number for RoCE v2, going by the macro name. */
#define ROCE_V2_UDP_DPORT      4791
/*
 * Attributes of one GID entry: the GID value and its type, together with the
 * device, port number and index it belongs to, and an RCU-annotated
 * net_device pointer.
 */
struct ib_gid_attr {
        struct net_device __rcu        *ndev;
        struct ib_device        *device;
        union ib_gid                gid;
        enum ib_gid_type        gid_type;
        u16                        index;
        u32                        port_num;
};

/* Well-known SA GUID constant: bit 57 set, OR'ed with the value 2. */
enum {
        /* set the local administered indication */
        IB_SA_WELL_KNOWN_GUID        = BIT_ULL(57) | 2,
};

/* Transport types, as returned by rdma_node_get_transport(). */
enum rdma_transport_type {
        RDMA_TRANSPORT_IB,
        RDMA_TRANSPORT_IWARP,
        RDMA_TRANSPORT_USNIC,
        RDMA_TRANSPORT_USNIC_UDP,
        RDMA_TRANSPORT_UNSPECIFIED,
};

/* Wire protocol families (IB, IB over Ethernet, iWARP, usNIC over UDP). */
enum rdma_protocol_type {
        RDMA_PROTOCOL_IB,
        RDMA_PROTOCOL_IBOE,
        RDMA_PROTOCOL_IWARP,
        RDMA_PROTOCOL_USNIC_UDP
};

/*
 * Map a node type to its transport type.  Declared __attribute_const__; the
 * implementation lives outside this header.
 */
__attribute_const__ enum rdma_transport_type
rdma_node_get_transport(unsigned int node_type);

/*
 * Network-layer flavour of a GID/packet.  Converted to and from
 * enum ib_gid_type by ib_network_to_gid_type() and
 * rdma_gid_attr_network_type().
 */
enum rdma_network_type {
        RDMA_NETWORK_IB,
        RDMA_NETWORK_ROCE_V1,
        RDMA_NETWORK_IPV4,
        RDMA_NETWORK_IPV6
};

/*
 * Translate a network type into the GID type that carries it: both IP
 * flavours map to RoCE with UDP encapsulation, RoCE v1 maps to the plain RoCE
 * GID type, and everything else is treated as native IB.
 */
static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type)
{
        switch (network_type) {
        case RDMA_NETWORK_IPV4:
        case RDMA_NETWORK_IPV6:
                return IB_GID_TYPE_ROCE_UDP_ENCAP;
        case RDMA_NETWORK_ROCE_V1:
                return IB_GID_TYPE_ROCE;
        default:
                return IB_GID_TYPE_IB;
        }
}

static inline enum rdma_network_type
rdma_gid_attr_network_type(const struct ib_gid_attr *attr)
{
        if (attr->gid_type == IB_GID_TYPE_IB)
                return RDMA_NETWORK_IB;

        if (attr->gid_type == IB_GID_TYPE_ROCE)
                return RDMA_NETWORK_ROCE_V1;

        if (ipv6_addr_v4mapped((struct in6_addr *)&attr->gid))
                return RDMA_NETWORK_IPV4;
        else
                return RDMA_NETWORK_IPV6;
}

/* Link layer underneath a port: unspecified, InfiniBand or Ethernet. */
enum rdma_link_layer {
        IB_LINK_LAYER_UNSPECIFIED,
        IB_LINK_LAYER_INFINIBAND,
        IB_LINK_LAYER_ETHERNET,
};

/*
 * Device capability flags.  Every enumerator takes the value of the
 * IB_UVERBS_DEVICE_* constant with the same suffix.  Presumably these are the
 * bits reported in ib_device_attr.device_cap_flags - confirm against users.
 */
enum ib_device_cap_flags {
        IB_DEVICE_RESIZE_MAX_WR = IB_UVERBS_DEVICE_RESIZE_MAX_WR,
        IB_DEVICE_BAD_PKEY_CNTR = IB_UVERBS_DEVICE_BAD_PKEY_CNTR,
        IB_DEVICE_BAD_QKEY_CNTR = IB_UVERBS_DEVICE_BAD_QKEY_CNTR,
        IB_DEVICE_RAW_MULTI = IB_UVERBS_DEVICE_RAW_MULTI,
        IB_DEVICE_AUTO_PATH_MIG = IB_UVERBS_DEVICE_AUTO_PATH_MIG,
        IB_DEVICE_CHANGE_PHY_PORT = IB_UVERBS_DEVICE_CHANGE_PHY_PORT,
        IB_DEVICE_UD_AV_PORT_ENFORCE = IB_UVERBS_DEVICE_UD_AV_PORT_ENFORCE,
        IB_DEVICE_CURR_QP_STATE_MOD = IB_UVERBS_DEVICE_CURR_QP_STATE_MOD,
        IB_DEVICE_SHUTDOWN_PORT = IB_UVERBS_DEVICE_SHUTDOWN_PORT,
        /* IB_DEVICE_INIT_TYPE = IB_UVERBS_DEVICE_INIT_TYPE, (not in use) */
        IB_DEVICE_PORT_ACTIVE_EVENT = IB_UVERBS_DEVICE_PORT_ACTIVE_EVENT,
        IB_DEVICE_SYS_IMAGE_GUID = IB_UVERBS_DEVICE_SYS_IMAGE_GUID,
        IB_DEVICE_RC_RNR_NAK_GEN = IB_UVERBS_DEVICE_RC_RNR_NAK_GEN,
        IB_DEVICE_SRQ_RESIZE = IB_UVERBS_DEVICE_SRQ_RESIZE,
        IB_DEVICE_N_NOTIFY_CQ = IB_UVERBS_DEVICE_N_NOTIFY_CQ,

        /* Reserved, old SEND_W_INV = 1 << 16,*/
        IB_DEVICE_MEM_WINDOW = IB_UVERBS_DEVICE_MEM_WINDOW,
        /*
         * Devices should set IB_DEVICE_UD_IP_CSUM if they support
         * insertion of UDP and TCP checksum on outgoing UD IPoIB
         * messages and can verify the validity of checksum for
         * incoming messages.  Setting this flag implies that the
         * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode.
         */
        IB_DEVICE_UD_IP_CSUM = IB_UVERBS_DEVICE_UD_IP_CSUM,
        IB_DEVICE_XRC = IB_UVERBS_DEVICE_XRC,

        /*
         * This device supports the IB "base memory management extension",
         * which includes support for fast registrations (IB_WR_REG_MR,
         * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs).  This flag should
         * also be set by any iWarp device which must support FRs to comply
         * with the iWarp verbs spec.  iWarp devices also support the
         * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the
         * stag.
         */
        IB_DEVICE_MEM_MGT_EXTENSIONS = IB_UVERBS_DEVICE_MEM_MGT_EXTENSIONS,
        IB_DEVICE_MEM_WINDOW_TYPE_2A = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2A,
        IB_DEVICE_MEM_WINDOW_TYPE_2B = IB_UVERBS_DEVICE_MEM_WINDOW_TYPE_2B,
        IB_DEVICE_RC_IP_CSUM = IB_UVERBS_DEVICE_RC_IP_CSUM,
        /* Deprecated. Please use IB_RAW_PACKET_CAP_IP_CSUM. */
        IB_DEVICE_RAW_IP_CSUM = IB_UVERBS_DEVICE_RAW_IP_CSUM,
        IB_DEVICE_MANAGED_FLOW_STEERING =
                IB_UVERBS_DEVICE_MANAGED_FLOW_STEERING,
        /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. */
        IB_DEVICE_RAW_SCATTER_FCS = IB_UVERBS_DEVICE_RAW_SCATTER_FCS,
        /* The device supports padding incoming writes to cacheline. */
        IB_DEVICE_PCI_WRITE_END_PADDING =
                IB_UVERBS_DEVICE_PCI_WRITE_END_PADDING,
        /* Placement type attributes */
        IB_DEVICE_FLUSH_GLOBAL = IB_UVERBS_DEVICE_FLUSH_GLOBAL,
        IB_DEVICE_FLUSH_PERSISTENT = IB_UVERBS_DEVICE_FLUSH_PERSISTENT,
        IB_DEVICE_ATOMIC_WRITE = IB_UVERBS_DEVICE_ATOMIC_WRITE,
};

/*
 * Capability flags with locally assigned bit positions (1 << n), unlike
 * enum ib_device_cap_flags whose values come from IB_UVERBS_DEVICE_*.
 * Presumably reported in ib_device_attr.kernel_cap_flags - confirm against
 * users.
 */
enum ib_kernel_cap_flags {
        /*
         * This device supports a per-device lkey or stag that can be
         * used without performing a memory registration for the local
         * memory.  Note that ULPs should never check this flag, but
         * instead use the local_dma_lkey flag in the ib_pd structure,
         * which will always contain a usable lkey.
         */
        IBK_LOCAL_DMA_LKEY = 1 << 0,
        /* IB_QP_CREATE_INTEGRITY_EN is supported to implement T10-PI */
        IBK_INTEGRITY_HANDOVER = 1 << 1,
        /* IB_ACCESS_ON_DEMAND is supported during reg_user_mr() */
        IBK_ON_DEMAND_PAGING = 1 << 2,
        /* IB_MR_TYPE_SG_GAPS is supported */
        IBK_SG_GAPS_REG = 1 << 3,
        /* Driver supports RDMA_NLDEV_CMD_DELLINK */
        IBK_ALLOW_USER_UNREG = 1 << 4,

        /* ipoib will use IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK */
        IBK_BLOCK_MULTICAST_LOOPBACK = 1 << 5,
        /* ipoib will use IB_QP_CREATE_IPOIB_UD_LSO for its QPs */
        IBK_UD_TSO = 1 << 6,
        /* ipoib will use the device ops:
         *   get_vf_config
         *   get_vf_guid
         *   get_vf_stats
         *   set_vf_guid
         *   set_vf_link_state
         */
        IBK_VIRTUAL_FUNCTION = 1 << 7,
        /* ipoib will use IB_QP_CREATE_NETDEV_USE for its QPs */
        IBK_RDMA_NETDEV_OPA = 1 << 8,
};

/*
 * Atomic operation capability level; used for the atomic_cap and
 * masked_atomic_cap members of struct ib_device_attr.
 */
enum ib_atomic_cap {
        IB_ATOMIC_NONE,
        IB_ATOMIC_HCA,
        IB_ATOMIC_GLOB
};

/* Bit flags; presumably the contents of ib_odp_caps.general_caps - confirm. */
enum ib_odp_general_cap_bits {
        IB_ODP_SUPPORT                = 1 << 0,
        IB_ODP_SUPPORT_IMPLICIT = 1 << 1,
};

/*
 * Per-operation bit flags; presumably the contents of the *_odp_caps words in
 * ib_odp_caps.per_transport_caps - confirm against users.
 */
enum ib_odp_transport_cap_bits {
        IB_ODP_SUPPORT_SEND        = 1 << 0,
        IB_ODP_SUPPORT_RECV        = 1 << 1,
        IB_ODP_SUPPORT_WRITE        = 1 << 2,
        IB_ODP_SUPPORT_READ        = 1 << 3,
        IB_ODP_SUPPORT_ATOMIC        = 1 << 4,
        IB_ODP_SUPPORT_SRQ_RECV        = 1 << 5,
};

/*
 * ODP capability words: one 64-bit general word plus one 32-bit word for each
 * of the RC, UC, UD and XRC transports.  Embedded in struct ib_device_attr as
 * odp_caps.
 */
struct ib_odp_caps {
        uint64_t general_caps;
        struct {
                uint32_t  rc_odp_caps;
                uint32_t  uc_odp_caps;
                uint32_t  ud_odp_caps;
                uint32_t  xrc_odp_caps;
        } per_transport_caps;
};

/*
 * RSS capabilities: a bitmap of supported QP types and limits on receive-WQ
 * indirection tables.  Embedded in struct ib_device_attr as rss_caps.
 */
struct ib_rss_caps {
        /* Corresponding bit will be set if qp type from
         * 'enum ib_qp_type' is supported, e.g.
         * supported_qpts |= 1 << IB_QPT_UD
         */
        u32 supported_qpts;
        u32 max_rwq_indirection_tables;
        u32 max_rwq_indirection_table_size;
};

/* Tag-matching capability bits, stored in ib_tm_caps.flags. */
enum ib_tm_cap_flags {
        /*  Support tag matching with rendezvous offload for RC transport */
        IB_TM_CAP_RNDV_RC = 1 << 0,
};

/*
 * Tag-matching limits and flags.  Embedded in struct ib_device_attr as
 * tm_caps.
 */
struct ib_tm_caps {
        /* Max size of RNDV header */
        u32 max_rndv_hdr_size;
        /* Max number of entries in tag matching list */
        u32 max_num_tags;
        /* From enum ib_tm_cap_flags */
        u32 flags;
        /* Max number of outstanding list operations */
        u32 max_ops;
        /* Max number of SGE in tag matching entry */
        u32 max_sge;
};

/*
 * Parameters for creating a completion queue: requested number of entries,
 * completion vector and creation flags.  The valid flag values are not
 * defined in this part of the header.
 */
struct ib_cq_init_attr {
        unsigned int        cqe;
        u32                comp_vector;
        u32                flags;
};

/* CQ attribute mask bits; only moderation is defined. */
enum ib_cq_attr_mask {
        IB_CQ_MODERATE = 1 << 0,
};

/*
 * Upper bounds for CQ moderation (count and period).  Embedded in
 * struct ib_device_attr as cq_caps.  Units of the period are not stated here.
 */
struct ib_cq_caps {
        u16     max_cq_moderation_count;
        u16     max_cq_moderation_period;
};

/*
 * Attributes for a device-memory MR (going by the name): length, offset and
 * access flags of the region.
 */
struct ib_dm_mr_attr {
        u64                length;
        u64                offset;
        u32                access_flags;
};

/*
 * Attributes for a device-memory allocation (going by the name): requested
 * length, alignment and flags.  The encoding of alignment is not shown here.
 */
struct ib_dm_alloc_attr {
        u64        length;
        u32        alignment;
        u32        flags;
};

/*
 * Device-wide attributes: identification (firmware/hardware version, vendor
 * IDs, system image GUID), capability flag words, and the maximum counts and
 * sizes the device supports for each kind of resource.  The nested *_caps
 * members hold the ODP, RSS, tag-matching and CQ capability structures
 * defined above.
 */
struct ib_device_attr {
        u64                        fw_ver;
        __be64                        sys_image_guid;
        u64                        max_mr_size;
        u64                        page_size_cap;
        u32                        vendor_id;
        u32                        vendor_part_id;
        u32                        hw_ver;
        int                        max_qp;
        int                        max_qp_wr;
        /* presumably bits from enum ib_device_cap_flags - confirm */
        u64                        device_cap_flags;
        /* presumably bits from enum ib_kernel_cap_flags - confirm */
        u64                        kernel_cap_flags;
        int                        max_send_sge;
        int                        max_recv_sge;
        int                        max_sge_rd;
        int                        max_cq;
        int                        max_cqe;
        int                        max_mr;
        int                        max_pd;
        int                        max_qp_rd_atom;
        int                        max_ee_rd_atom;
        int                        max_res_rd_atom;
        int                        max_qp_init_rd_atom;
        int                        max_ee_init_rd_atom;
        enum ib_atomic_cap        atomic_cap;
        enum ib_atomic_cap        masked_atomic_cap;
        int                        max_ee;
        int                        max_rdd;
        int                        max_mw;
        int                        max_raw_ipv6_qp;
        int                        max_raw_ethy_qp;
        int                        max_mcast_grp;
        int                        max_mcast_qp_attach;
        int                        max_total_mcast_qp_attach;
        int                        max_ah;
        int                        max_srq;
        int                        max_srq_wr;
        int                        max_srq_sge;
        unsigned int                max_fast_reg_page_list_len;
        unsigned int                max_pi_fast_reg_page_list_len;
        u16                        max_pkeys;
        u8                        local_ca_ack_delay;
        int                        sig_prot_cap;
        int                        sig_guard_cap;
        struct ib_odp_caps        odp_caps;
        uint64_t                timestamp_mask;
        uint64_t                hca_core_clock; /* in KHZ */
        struct ib_rss_caps        rss_caps;
        u32                        max_wq_type_rq;
        u32                        raw_packet_caps; /* Use ib_raw_packet_caps enum */
        struct ib_tm_caps        tm_caps;
        struct ib_cq_caps       cq_caps;
        u64                        max_dm_size;
        /* Max entries for sgl for optimized performance per READ */
        u32                        max_sgl_rd;
};

/* IB MTU codes: code N stands for an MTU of 128 << N bytes (256..4096). */
enum ib_mtu {
        IB_MTU_256  = 1,
        IB_MTU_512  = 2,
        IB_MTU_1024 = 3,
        IB_MTU_2048 = 4,
        IB_MTU_4096 = 5
};

/* OPA-only MTU codes, numbered directly after the IB ones. */
enum opa_mtu {
        OPA_MTU_8192 = 6,
        OPA_MTU_10240 = 7
};

/*
 * Convert an IB MTU code to its size in bytes.
 * Returns -1 for any value that is not one of the enum ib_mtu codes.
 */
static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)
{
        static const int mtu_bytes[] = {
                [IB_MTU_256]  = 256,
                [IB_MTU_512]  = 512,
                [IB_MTU_1024] = 1024,
                [IB_MTU_2048] = 2048,
                [IB_MTU_4096] = 4096,
        };

        if (mtu < IB_MTU_256 || mtu > IB_MTU_4096)
                return -1;

        return mtu_bytes[mtu];
}

/*
 * Convert a byte count to the largest IB MTU code that does not exceed it.
 * Anything below 512 (including zero and negative values) yields IB_MTU_256.
 */
static inline enum ib_mtu ib_mtu_int_to_enum(int mtu)
{
        if (mtu < 512)
                return IB_MTU_256;
        if (mtu < 1024)
                return IB_MTU_512;
        if (mtu < 2048)
                return IB_MTU_1024;
        if (mtu < 4096)
                return IB_MTU_2048;

        return IB_MTU_4096;
}

/*
 * Convert an OPA MTU code to bytes.  The two OPA-specific codes are handled
 * here; every other value is passed on to ib_mtu_enum_to_int().
 */
static inline int opa_mtu_enum_to_int(enum opa_mtu mtu)
{
        if (mtu == OPA_MTU_8192)
                return 8192;
        if (mtu == OPA_MTU_10240)
                return 10240;

        return ib_mtu_enum_to_int((enum ib_mtu)mtu);
}

/*
 * Convert a byte count to an OPA MTU code.  Sizes below 8192 fall back to
 * ib_mtu_int_to_enum(), whose result is cast to enum opa_mtu.
 */
static inline enum opa_mtu opa_mtu_int_to_enum(int mtu)
{
        if (mtu < 8192)
                return (enum opa_mtu)ib_mtu_int_to_enum(mtu);

        return mtu < 10240 ? OPA_MTU_8192 : OPA_MTU_10240;
}

/* Logical port states; ib_port_state_to_str() gives the printable names. */
enum ib_port_state {
        IB_PORT_NOP                = 0,
        IB_PORT_DOWN                = 1,
        IB_PORT_INIT                = 2,
        IB_PORT_ARMED                = 3,
        IB_PORT_ACTIVE                = 4,
        IB_PORT_ACTIVE_DEFER        = 5
};

/*
 * Return a printable name for a port state.  Values outside
 * enum ib_port_state produce "UNKNOWN".
 */
static inline const char *__attribute_const__
ib_port_state_to_str(enum ib_port_state state)
{
        switch (state) {
        case IB_PORT_NOP:
                return "NOP";
        case IB_PORT_DOWN:
                return "DOWN";
        case IB_PORT_INIT:
                return "INIT";
        case IB_PORT_ARMED:
                return "ARMED";
        case IB_PORT_ACTIVE:
                return "ACTIVE";
        case IB_PORT_ACTIVE_DEFER:
                return "ACTIVE_DEFER";
        default:
                return "UNKNOWN";
        }
}

/* Physical port states; numbering starts at 1. */
enum ib_port_phys_state {
        IB_PORT_PHYS_STATE_SLEEP = 1,
        IB_PORT_PHYS_STATE_POLLING = 2,
        IB_PORT_PHYS_STATE_DISABLED = 3,
        IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING = 4,
        IB_PORT_PHYS_STATE_LINK_UP = 5,
        IB_PORT_PHYS_STATE_LINK_ERROR_RECOVERY = 6,
        IB_PORT_PHYS_STATE_PHY_TEST = 7,
};

/*
 * Link width codes.  Each width has its own bit; note that the codes are not
 * in width order (2X is 16, declared between 1X and 4X).
 */
enum ib_port_width {
        IB_WIDTH_1X        = 1,
        IB_WIDTH_2X        = 16,
        IB_WIDTH_4X        = 2,
        IB_WIDTH_8X        = 4,
        IB_WIDTH_12X        = 8
};

/*
 * Convert a link width code into the number of lanes it names.
 * Returns -1 when the value is not exactly one of the enum ib_port_width
 * codes.
 */
static inline int ib_width_enum_to_int(enum ib_port_width width)
{
        int lanes;

        switch (width) {
        case IB_WIDTH_1X:
                lanes = 1;
                break;
        case IB_WIDTH_2X:
                lanes = 2;
                break;
        case IB_WIDTH_4X:
                lanes = 4;
                break;
        case IB_WIDTH_8X:
                lanes = 8;
                break;
        case IB_WIDTH_12X:
                lanes = 12;
                break;
        default:
                lanes = -1;
                break;
        }

        return lanes;
}

/*
 * Link speed codes, one bit per speed.  The largest value (256) does not fit
 * in 8 bits; struct ib_port_attr stores active_speed as a u16.
 */
enum ib_port_speed {
        IB_SPEED_SDR        = 1,
        IB_SPEED_DDR        = 2,
        IB_SPEED_QDR        = 4,
        IB_SPEED_FDR10        = 8,
        IB_SPEED_FDR        = 16,
        IB_SPEED_EDR        = 32,
        IB_SPEED_HDR        = 64,
        IB_SPEED_NDR        = 128,
        IB_SPEED_XDR        = 256,
};

/* Flag bits for rdma_stat_desc.flags. */
enum ib_stat_flag {
        IB_STAT_FLAG_OPTIONAL = 1 << 0,
};

/**
 * struct rdma_stat_desc - static description of one hardware counter
 * @name: The name of the counter
 * @flags: Flags of the counter; For example, IB_STAT_FLAG_OPTIONAL
 * @priv: Driver private information; Core code should not use
 */
struct rdma_stat_desc {
        const char *name;
        unsigned int flags;
        const void *priv;
};

/**
 * struct rdma_hw_stats - hardware counter values and their descriptors
 * @lock: Mutex to protect parallel write access to lifespan and values
 *    of counters, which are 64bits and not guaranteed to be written
 *    atomically on 32bits systems.
 * @timestamp: Used by the core code to track when the last update was
 * @lifespan: Used by the core code to determine how old the counters
 *   should be before being updated again.  Stored in jiffies, defaults
 *   to 10 milliseconds, drivers can override the default by specifying
 *   their own value during their allocation routine.
 * @descs: Array of static descriptors used for the counters
 *   in directory.
 * @is_disabled: A bitmap to indicate each counter is currently disabled
 *   or not.
 * @num_counters: How many hardware counters there are.  If @descs is
 *   shorter than this number, a kernel oops will result.  Driver authors
 *   are encouraged to leave BUILD_BUG_ON(ARRAY_SIZE(@descs) < num_counters)
 *   in their code to prevent this.
 * @value: Array of u64 counters that are accessed by the sysfs code and
 *   filled in by the drivers get_stats routine
 */
struct rdma_hw_stats {
        struct mutex        lock; /* Protect lifespan and values[] */
        unsigned long        timestamp;
        unsigned long        lifespan;
        const struct rdma_stat_desc *descs;
        unsigned long        *is_disabled;
        int                num_counters;
        u64                value[] __counted_by(num_counters);
};

/*
 * Default counter lifespan.  Presumably in milliseconds, matching the
 * "defaults to 10 milliseconds" note on rdma_hw_stats.lifespan - confirm at
 * the allocation routine.
 */
#define RDMA_HW_STATS_DEFAULT_LIFESPAN 10

/*
 * Allocate a struct rdma_hw_stats for @num_counters counters described by
 * @descs, with the given @lifespan.  Implemented outside this header; the
 * failure return value is not visible here.
 */
struct rdma_hw_stats *rdma_alloc_hw_stats_struct(
        const struct rdma_stat_desc *descs, int num_counters,
        unsigned long lifespan);

/* Release a structure obtained from rdma_alloc_hw_stats_struct(). */
void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats);

/* Define bits for the various functionality this port needs to be supported by
 * the core.
 *
 * The 32-bit word is split into three fields (management, address format,
 * protocol); the mask for each field is given in the comment heading its
 * group of bits.
 */
/* Management                           0x00000FFF */
#define RDMA_CORE_CAP_IB_MAD            0x00000001
#define RDMA_CORE_CAP_IB_SMI            0x00000002
#define RDMA_CORE_CAP_IB_CM             0x00000004
#define RDMA_CORE_CAP_IW_CM             0x00000008
#define RDMA_CORE_CAP_IB_SA             0x00000010
#define RDMA_CORE_CAP_OPA_MAD           0x00000020

/* Address format                       0x000FF000 */
#define RDMA_CORE_CAP_AF_IB             0x00001000
#define RDMA_CORE_CAP_ETH_AH            0x00002000
#define RDMA_CORE_CAP_OPA_AH            0x00004000
#define RDMA_CORE_CAP_IB_GRH_REQUIRED   0x00008000

/* Protocol                             0xFFF00000 */
#define RDMA_CORE_CAP_PROT_IB           0x00100000
#define RDMA_CORE_CAP_PROT_ROCE         0x00200000
#define RDMA_CORE_CAP_PROT_IWARP        0x00400000
#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000
#define RDMA_CORE_CAP_PROT_RAW_PACKET   0x01000000
#define RDMA_CORE_CAP_PROT_USNIC        0x02000000

/*
 * Mask of bits that each imply a GRH is required: the explicit flag plus
 * both RoCE protocol bits.
 */
#define RDMA_CORE_PORT_IB_GRH_REQUIRED (RDMA_CORE_CAP_IB_GRH_REQUIRED \
                                        | RDMA_CORE_CAP_PROT_ROCE     \
                                        | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP)

/* Ready-made capability sets for the common port types. */
#define RDMA_CORE_PORT_IBA_IB          (RDMA_CORE_CAP_PROT_IB  \
                                        | RDMA_CORE_CAP_IB_MAD \
                                        | RDMA_CORE_CAP_IB_SMI \
                                        | RDMA_CORE_CAP_IB_CM  \
                                        | RDMA_CORE_CAP_IB_SA  \
                                        | RDMA_CORE_CAP_AF_IB)
#define RDMA_CORE_PORT_IBA_ROCE        (RDMA_CORE_CAP_PROT_ROCE \
                                        | RDMA_CORE_CAP_IB_MAD  \
                                        | RDMA_CORE_CAP_IB_CM   \
                                        | RDMA_CORE_CAP_AF_IB   \
                                        | RDMA_CORE_CAP_ETH_AH)
#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP                        \
                                        (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \
                                        | RDMA_CORE_CAP_IB_MAD  \
                                        | RDMA_CORE_CAP_IB_CM   \
                                        | RDMA_CORE_CAP_AF_IB   \
                                        | RDMA_CORE_CAP_ETH_AH)
#define RDMA_CORE_PORT_IWARP           (RDMA_CORE_CAP_PROT_IWARP \
                                        | RDMA_CORE_CAP_IW_CM)
/* OPA is the full IB set plus OPA MADs. */
#define RDMA_CORE_PORT_INTEL_OPA       (RDMA_CORE_PORT_IBA_IB  \
                                        | RDMA_CORE_CAP_OPA_MAD)

#define RDMA_CORE_PORT_RAW_PACKET        (RDMA_CORE_CAP_PROT_RAW_PACKET)

#define RDMA_CORE_PORT_USNIC                (RDMA_CORE_CAP_PROT_USNIC)

/*
 * Per-port attributes: logical and physical state, MTUs, table sizes,
 * capability masks, error counters, LIDs and link width/speed.
 * state uses enum ib_port_state and the MTU members use enum ib_mtu; the
 * width/speed/phys_state members are plain integers, presumably holding
 * enum ib_port_width / ib_port_speed / ib_port_phys_state codes - confirm.
 */
struct ib_port_attr {
        u64                        subnet_prefix;
        enum ib_port_state        state;
        enum ib_mtu                max_mtu;
        enum ib_mtu                active_mtu;
        u32                     phys_mtu;
        int                        gid_tbl_len;
        unsigned int                ip_gids:1;
        /* This is the value from PortInfo CapabilityMask, defined by IBA */
        u32                        port_cap_flags;
        u32                        max_msg_sz;
        u32                        bad_pkey_cntr;
        u32                        qkey_viol_cntr;
        u16                        pkey_tbl_len;
        u32                        sm_lid;
        u32                        lid;
        u8                        lmc;
        u8                        max_vl_num;
        u8                        sm_sl;
        u8                        subnet_timeout;
        u8                        init_type_reply;
        u8                        active_width;
        u16                        active_speed;
        u8                      phys_state;
        u16                        port_cap_flags2;
};

/*
 * Mask bits naming the two modifiable device properties; each corresponds to
 * a member of struct ib_device_modify.
 */
enum ib_device_modify_flags {
        IB_DEVICE_MODIFY_SYS_IMAGE_GUID        = 1 << 0,
        IB_DEVICE_MODIFY_NODE_DESC        = 1 << 1
};

/* Size in bytes of the node description buffer. */
#define IB_DEVICE_NODE_DESC_MAX 64

/*
 * New values for a device-modify request: a system image GUID and a node
 * description of at most IB_DEVICE_NODE_DESC_MAX bytes.
 */
struct ib_device_modify {
        u64        sys_image_guid;
        char        node_desc[IB_DEVICE_NODE_DESC_MAX];
};

/* Port-modify mask bits.  Note that bit 1 (1<<1) is not assigned. */
enum ib_port_modify_flags {
        IB_PORT_SHUTDOWN                = 1,
        IB_PORT_INIT_TYPE                = (1<<2),
        IB_PORT_RESET_QKEY_CNTR                = (1<<3),
        IB_PORT_OPA_MASK_CHG                = (1<<4)
};

/*
 * Arguments for a port-modify request: capability mask bits to set, bits to
 * clear, and an init type value.
 */
struct ib_port_modify {
        u32        set_port_cap_mask;
        u32        clr_port_cap_mask;
        u8        init_type;
};

/*
 * Asynchronous event codes carried in struct ib_event.  ib_event_msg()
 * returns a printable string for a code.
 */
enum ib_event_type {
        IB_EVENT_CQ_ERR,
        IB_EVENT_QP_FATAL,
        IB_EVENT_QP_REQ_ERR,
        IB_EVENT_QP_ACCESS_ERR,
        IB_EVENT_COMM_EST,
        IB_EVENT_SQ_DRAINED,
        IB_EVENT_PATH_MIG,
        IB_EVENT_PATH_MIG_ERR,
        IB_EVENT_DEVICE_FATAL,
        IB_EVENT_PORT_ACTIVE,
        IB_EVENT_PORT_ERR,
        IB_EVENT_LID_CHANGE,
        IB_EVENT_PKEY_CHANGE,
        IB_EVENT_SM_CHANGE,
        IB_EVENT_SRQ_ERR,
        IB_EVENT_SRQ_LIMIT_REACHED,
        IB_EVENT_QP_LAST_WQE_REACHED,
        IB_EVENT_CLIENT_REREGISTER,
        IB_EVENT_GID_CHANGE,
        IB_EVENT_WQ_FATAL,
};

/* Return a string for @event; implemented outside this header. */
const char *__attribute_const__ ib_event_msg(enum ib_event_type event);

/*
 * One asynchronous event: the device it came from, the event code, and the
 * object it refers to.  Which member of the element union is meaningful
 * presumably depends on the event code (CQ, QP, SRQ, WQ or port) - confirm
 * against the event producers.
 */
struct ib_event {
        struct ib_device        *device;
        union {
                struct ib_cq        *cq;
                struct ib_qp        *qp;
                struct ib_srq        *srq;
                struct ib_wq        *wq;
                u32                port_num;
        } element;
        enum ib_event_type        event;
};

/*
 * A registered event callback.  The callback receives a pointer to this
 * handler structure and the event.  list links the handler into a list;
 * initialise with INIT_IB_EVENT_HANDLER().
 */
struct ib_event_handler {
        struct ib_device *device;
        void            (*handler)(struct ib_event_handler *, struct ib_event *);
        struct list_head  list;
};

/*
 * Initialise the ib_event_handler pointed to by _ptr: store the device and
 * callback and set up the list head.  _ptr is evaluated three times, so it
 * must not have side effects.
 */
#define INIT_IB_EVENT_HANDLER(_ptr, _device, _handler)                \
        do {                                                        \
                (_ptr)->device  = _device;                        \
                (_ptr)->handler = _handler;                        \
                INIT_LIST_HEAD(&(_ptr)->list);                        \
        } while (0)

/*
 * Global routing information: the source GID (both as a GID attribute
 * pointer and as an index), the destination GID, and the flow label, hop
 * limit and traffic class values.  Embedded in struct rdma_ah_attr as grh.
 */
struct ib_global_route {
        const struct ib_gid_attr *sgid_attr;
        union ib_gid        dgid;
        u32                flow_label;
        u8                sgid_index;
        u8                hop_limit;
        u8                traffic_class;
};

/*
 * Global route header layout with big-endian multi-byte fields: a combined
 * version/traffic-class/flow-label word, payload length, next header, hop
 * limit, then source and destination GIDs.
 */
struct ib_grh {
        __be32                version_tclass_flow;
        __be16                paylen;
        u8                next_hdr;
        u8                hop_limit;
        union ib_gid        sgid;
        union ib_gid        dgid;
};

/*
 * Network header that is either an IB GRH or, for IPv4, an IP header placed
 * after 20 reserved bytes so that it overlays the tail of the GRH-sized area.
 */
union rdma_network_hdr {
        struct ib_grh ibgrh;
        struct {
                /* The IB spec states that if it's IPv4, the header
                 * is located in the last 20 bytes of the header.
                 */
                u8                reserved[20];
                struct iphdr        roce4grh;
        };
};

/* Mask covering the low 24 bits, the width of a QP number. */
#define IB_QPN_MASK                0xFFFFFF

/* QP number used for multicast; equals the all-ones 24-bit value. */
enum {
        IB_MULTICAST_QPN = 0xffffff
};

/* LID constants, already converted to big-endian 16-bit form. */
#define IB_LID_PERMISSIVE        cpu_to_be16(0xFFFF)
#define IB_MULTICAST_LID_BASE        cpu_to_be16(0xC000)

/* Address handle flag bits; presumably stored in rdma_ah_attr.ah_flags. */
enum ib_ah_flags {
        IB_AH_GRH        = 1
};

/*
 * Static rate codes.  The numeric values are NOT ordered by speed (for
 * example 5 Gbps is 5 while 10 Gbps is 3) and value 1 is unused, so never
 * compare codes numerically; convert with ib_rate_to_mult() or
 * ib_rate_to_mbps() first.
 */
enum ib_rate {
        IB_RATE_PORT_CURRENT = 0,
        IB_RATE_2_5_GBPS = 2,
        IB_RATE_5_GBPS   = 5,
        IB_RATE_10_GBPS  = 3,
        IB_RATE_20_GBPS  = 6,
        IB_RATE_30_GBPS  = 4,
        IB_RATE_40_GBPS  = 7,
        IB_RATE_60_GBPS  = 8,
        IB_RATE_80_GBPS  = 9,
        IB_RATE_120_GBPS = 10,
        IB_RATE_14_GBPS  = 11,
        IB_RATE_56_GBPS  = 12,
        IB_RATE_112_GBPS = 13,
        IB_RATE_168_GBPS = 14,
        IB_RATE_25_GBPS  = 15,
        IB_RATE_100_GBPS = 16,
        IB_RATE_200_GBPS = 17,
        IB_RATE_300_GBPS = 18,
        IB_RATE_28_GBPS  = 19,
        IB_RATE_50_GBPS  = 20,
        IB_RATE_400_GBPS = 21,
        IB_RATE_600_GBPS = 22,
        IB_RATE_800_GBPS = 23,
};

/**
 * ib_rate_to_mult - Convert the IB rate enum to a multiple of the
 * base rate of 2.5 Gbit/sec.  For example, IB_RATE_5_GBPS will be
 * converted to 2, since 5 Gbit/sec is 2 * 2.5 Gbit/sec.
 * @rate: rate to convert.
 *
 * Implemented outside this header; the result for rates that are not an
 * exact multiple of 2.5 Gbit/sec is not visible here.
 */
__attribute_const__ int ib_rate_to_mult(enum ib_rate rate);

/**
 * ib_rate_to_mbps - Convert the IB rate enum to Mbps.
 * For example, IB_RATE_2_5_GBPS will be converted to 2500.
 * @rate: rate to convert.
 *
 * Implemented outside this header.
 */
__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate);


/**
 * enum ib_mr_type - memory region type
 * @IB_MR_TYPE_MEM_REG:       memory region that is used for
 *                            normal registration
 * @IB_MR_TYPE_SG_GAPS:       memory region that is capable of
 *                            registering arbitrary sg lists (without
 *                            the normal mr constraints - see
 *                            ib_map_mr_sg)
 * @IB_MR_TYPE_DM:            memory region that is used for device
 *                            memory registration
 * @IB_MR_TYPE_USER:          memory region that is used for the user-space
 *                            application
 * @IB_MR_TYPE_DMA:           memory region that is used for DMA operations
 *                            without address translations (VA=PA)
 * @IB_MR_TYPE_INTEGRITY:     memory region that is used for
 *                            data integrity operations
 */
enum ib_mr_type {
        IB_MR_TYPE_MEM_REG,
        IB_MR_TYPE_SG_GAPS,
        IB_MR_TYPE_DM,
        IB_MR_TYPE_USER,
        IB_MR_TYPE_DMA,
        IB_MR_TYPE_INTEGRITY,
};

/* MR status check bits; see ib_mr_status.fail_status. */
enum ib_mr_status_check {
        IB_MR_CHECK_SIG_STATUS = 1,
};

/**
 * struct ib_mr_status - Memory region status container
 *
 * @fail_status: Bitmask of MR checks status. For each
 *     failed check a corresponding status bit is set.
 * @sig_err: Additional info for IB_MR_CHECK_SIG_STATUS
 *     failure.
 */
struct ib_mr_status {
        u32                    fail_status;
        struct ib_sig_err   sig_err;
};

/**
 * mult_to_ib_rate - Convert a multiple of 2.5 Gbit/sec to an IB rate
 * enum.
 * @mult: multiple to convert.
 *
 * Implemented outside this header; the result for multiples without a
 * matching enum ib_rate value is not visible here.
 */
__attribute_const__ enum ib_rate mult_to_ib_rate(int mult);

/*
 * Parameters for creating an address handle: the AH attributes, creation
 * flags, and a net_device pointer named xmit_slave (its exact role is not
 * described in this part of the header).
 */
struct rdma_ah_init_attr {
        struct rdma_ah_attr *ah_attr;
        u32 flags;
        struct net_device *xmit_slave;
};

/*
 * Discriminator stored in rdma_ah_attr.type; the IB, ROCE and OPA values
 * correspond to the ib, roce and opa members of that structure's union.
 */
enum rdma_ah_attr_type {
        RDMA_AH_ATTR_TYPE_UNDEFINED,
        RDMA_AH_ATTR_TYPE_IB,
        RDMA_AH_ATTR_TYPE_ROCE,
        RDMA_AH_ATTR_TYPE_OPA,
};

/* IB-specific AH fields: 16-bit destination LID and source path bits. */
struct ib_ah_attr {
        u16                        dlid;
        u8                        src_path_bits;
};

/* RoCE-specific AH fields: the destination MAC address (ETH_ALEN bytes). */
struct roce_ah_attr {
        u8                        dmac[ETH_ALEN];
};

/*
 * OPA-specific AH fields: 32-bit destination LID (wider than the IB
 * variant), source path bits and a make_grd flag whose meaning is not
 * described in this part of the header.
 */
struct opa_ah_attr {
        u32                        dlid;
        u8                        src_path_bits;
        bool                        make_grd;
};

/*
 * Address handle attributes.  Fields common to all link types come first;
 * type selects which member of the trailing anonymous union is in use.
 */
struct rdma_ah_attr {
        struct ib_global_route        grh;
        u8                        sl;
        u8                        static_rate;
        u32                        port_num;
        u8                        ah_flags;
        enum rdma_ah_attr_type type;
        union {
                struct ib_ah_attr ib;
                struct roce_ah_attr roce;
                struct opa_ah_attr opa;
        };
};

/*
 * Work completion status codes, stored in ib_wc.status.  IB_WC_SUCCESS is 0;
 * ib_wc_status_msg() returns a printable string for a code.
 */
enum ib_wc_status {
        IB_WC_SUCCESS,
        IB_WC_LOC_LEN_ERR,
        IB_WC_LOC_QP_OP_ERR,
        IB_WC_LOC_EEC_OP_ERR,
        IB_WC_LOC_PROT_ERR,
        IB_WC_WR_FLUSH_ERR,
        IB_WC_MW_BIND_ERR,
        IB_WC_BAD_RESP_ERR,
        IB_WC_LOC_ACCESS_ERR,
        IB_WC_REM_INV_REQ_ERR,
        IB_WC_REM_ACCESS_ERR,
        IB_WC_REM_OP_ERR,
        IB_WC_RETRY_EXC_ERR,
        IB_WC_RNR_RETRY_EXC_ERR,
        IB_WC_LOC_RDD_VIOL_ERR,
        IB_WC_REM_INV_RD_REQ_ERR,
        IB_WC_REM_ABORT_ERR,
        IB_WC_INV_EECN_ERR,
        IB_WC_INV_EEC_STATE_ERR,
        IB_WC_FATAL_ERR,
        IB_WC_RESP_TIMEOUT_ERR,
        IB_WC_GENERAL_ERR
};

/* Return a string for @status; implemented outside this header. */
const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status);

/*
 * Work completion opcodes, stored in ib_wc.opcode.  Most values mirror
 * IB_UVERBS_WC_*.  IB_WC_REG_MR and the two masked atomics have no explicit
 * value and therefore continue numbering from IB_WC_ATOMIC_WRITE.
 */
enum ib_wc_opcode {
        IB_WC_SEND = IB_UVERBS_WC_SEND,
        IB_WC_RDMA_WRITE = IB_UVERBS_WC_RDMA_WRITE,
        IB_WC_RDMA_READ = IB_UVERBS_WC_RDMA_READ,
        IB_WC_COMP_SWAP = IB_UVERBS_WC_COMP_SWAP,
        IB_WC_FETCH_ADD = IB_UVERBS_WC_FETCH_ADD,
        IB_WC_BIND_MW = IB_UVERBS_WC_BIND_MW,
        IB_WC_LOCAL_INV = IB_UVERBS_WC_LOCAL_INV,
        IB_WC_LSO = IB_UVERBS_WC_TSO,
        IB_WC_ATOMIC_WRITE = IB_UVERBS_WC_ATOMIC_WRITE,
        IB_WC_REG_MR,
        IB_WC_MASKED_COMP_SWAP,
        IB_WC_MASKED_FETCH_ADD,
        IB_WC_FLUSH = IB_UVERBS_WC_FLUSH,
/*
 * Set value of IB_WC_RECV so consumers can test if a completion is a
 * receive by testing (opcode & IB_WC_RECV).
 */
        IB_WC_RECV                        = 1 << 7,
        IB_WC_RECV_RDMA_WITH_IMM
};

/*
 * Work completion flag bits for ib_wc.wc_flags.  Presumably each bit marks
 * the matching optional ib_wc member as valid (e.g. IB_WC_WITH_SMAC for
 * smac, IB_WC_WITH_VLAN for vlan_id) - confirm against producers.
 */
enum ib_wc_flags {
        IB_WC_GRH                = 1,
        IB_WC_WITH_IMM                = (1<<1),
        IB_WC_WITH_INVALIDATE        = (1<<2),
        IB_WC_IP_CSUM_OK        = (1<<3),
        IB_WC_WITH_SMAC                = (1<<4),
        IB_WC_WITH_VLAN                = (1<<5),
        IB_WC_WITH_NETWORK_HDR_TYPE        = (1<<6),
};

/*
 * One work completion.  The work request is identified either by a 64-bit
 * wr_id or by an ib_cqe pointer (the two share storage).  ex holds either
 * immediate data (big-endian) or an invalidated rkey, again sharing storage.
 * wc_flags takes bits from enum ib_wc_flags.
 */
struct ib_wc {
        union {
                u64                wr_id;
                struct ib_cqe        *wr_cqe;
        };
        enum ib_wc_status        status;
        enum ib_wc_opcode        opcode;
        u32                        vendor_err;
        u32                        byte_len;
        struct ib_qp               *qp;
        union {
                __be32                imm_data;
                u32                invalidate_rkey;
        } ex;
        u32                        src_qp;
        u32                        slid;
        int                        wc_flags;
        u16                        pkey_index;
        u8                        sl;
        u8                        dlid_path_bits;
        u32 port_num; /* valid only for DR SMPs on switches */
        u8                        smac[ETH_ALEN];
        u16                        vlan_id;
        u8                        network_hdr_type;
};

/*
 * CQ notification request flags.  IB_CQ_SOLICITED_MASK is the OR of the two
 * notification kinds; IB_CQ_REPORT_MISSED_EVENTS is a separate bit outside
 * that mask.
 */
enum ib_cq_notify_flags {
        IB_CQ_SOLICITED                        = 1 << 0,
        IB_CQ_NEXT_COMP                        = 1 << 1,
        IB_CQ_SOLICITED_MASK                = IB_CQ_SOLICITED | IB_CQ_NEXT_COMP,
        IB_CQ_REPORT_MISSED_EVENTS        = 1 << 2,
};

/*
 * SRQ types.  Every enumerator takes its value from the matching
 * IB_UVERBS_SRQT_* constant, which is defined outside this chunk.
 */
enum ib_srq_type {
        IB_SRQT_BASIC = IB_UVERBS_SRQT_BASIC,
        IB_SRQT_XRC = IB_UVERBS_SRQT_XRC,
        IB_SRQT_TM = IB_UVERBS_SRQT_TM,
};

/*
 * Report whether @srq_type is one of the two types this helper treats as
 * having a CQ: IB_SRQT_XRC or IB_SRQT_TM.  Every other value yields false.
 */
static inline bool ib_srq_has_cq(enum ib_srq_type srq_type)
{
        switch (srq_type) {
        case IB_SRQT_XRC:
        case IB_SRQT_TM:
                return true;
        default:
                return false;
        }
}

/*
 * SRQ attribute mask bits, one bit per enumerator.
 * NOTE(review): presumably select max_wr / srq_limit of struct ib_srq_attr
 * in a modify call - confirm against the callers.
 */
enum ib_srq_attr_mask {
        IB_SRQ_MAX_WR = (1 << 0),
        IB_SRQ_LIMIT  = (1 << 1),
};

/*
 * SRQ attributes: three 32-bit values.
 * NOTE(review): max_wr / max_sge look like counts of work requests and
 * scatter/gather entries - confirm units with the users of this struct.
 */
struct ib_srq_attr {
        u32        max_wr;
        u32        max_sge;
        u32        srq_limit;
};

/*
 * Parameters describing an SRQ to be created: an event callback with its
 * opaque context pointer, the initial attributes and the SRQ type.
 *
 * ext carries a CQ pointer plus an anonymous union in which xrc and
 * tag_matching share storage.
 * NOTE(review): the active union member is presumably chosen by srq_type
 * (XRC vs. TM) - confirm.
 */
struct ib_srq_init_attr {
        void                      (*event_handler)(struct ib_event *, void *);
        void                       *srq_context;
        struct ib_srq_attr        attr;
        enum ib_srq_type        srq_type;

        struct {
                struct ib_cq   *cq;
                union {
                        struct {
                                struct ib_xrcd *xrcd;
                        } xrc;

                        struct {
                                u32                max_num_tags;
                        } tag_matching;
                };
        } ext;
};

/*
 * QP capacity limits: send/receive work-request and SGE maxima, the
 * inline-data limit, and the rdma_rw_ctx budget described below.
 */
struct ib_qp_cap {
        u32        max_send_wr;
        u32        max_recv_wr;
        u32        max_send_sge;
        u32        max_recv_sge;
        u32        max_inline_data;

        /*
         * Maximum number of rdma_rw_ctx structures in flight at a time.
         * ib_create_qp() will calculate the right amount of needed WRs
         * and MRs based on this.
         */
        u32        max_rdma_ctxs;
};

/*
 * Signalling type selector with two values.
 * NOTE(review): names suggest "signal every WR" vs. "signal only WRs that
 * request it" - confirm.
 */
enum ib_sig_type {
        IB_SIGNAL_ALL_WR = 0,
        IB_SIGNAL_REQ_WR = 1,
};

/*
 * QP types.
 *
 * IB_QPT_SMI and IB_QPT_GSI are 0 and 1.  Enumerators assigned from
 * IB_UVERBS_QPT_* take their values from constants defined outside this
 * chunk; enumerators without an initializer continue from the preceding
 * one (so IB_QPT_MAX is IB_QPT_XRC_TGT + 1).
 */
enum ib_qp_type {
        /*
         * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
         * here (and in that order) since the MAD layer uses them as
         * indices into a 2-entry table.
         */
        IB_QPT_SMI,
        IB_QPT_GSI,

        IB_QPT_RC = IB_UVERBS_QPT_RC,
        IB_QPT_UC = IB_UVERBS_QPT_UC,
        IB_QPT_UD = IB_UVERBS_QPT_UD,
        IB_QPT_RAW_IPV6,
        IB_QPT_RAW_ETHERTYPE,
        IB_QPT_RAW_PACKET = IB_UVERBS_QPT_RAW_PACKET,
        IB_QPT_XRC_INI = IB_UVERBS_QPT_XRC_INI,
        IB_QPT_XRC_TGT = IB_UVERBS_QPT_XRC_TGT,
        IB_QPT_MAX,
        IB_QPT_DRIVER = IB_UVERBS_QPT_DRIVER,
        /* Reserve a range for qp types internal to the low level driver.
         * These qp types will not be visible at the IB core layer, so the
         * IB_QPT_MAX usages should not be affected in the core layer
         */
        IB_QPT_RESERVED1 = 0x1000,
        IB_QPT_RESERVED2,
        IB_QPT_RESERVED3,
        IB_QPT_RESERVED4,
        IB_QPT_RESERVED5,
        IB_QPT_RESERVED6,
        IB_QPT_RESERVED7,
        IB_QPT_RESERVED8,
        IB_QPT_RESERVED9,
        IB_QPT_RESERVED10,
};

/*
 * QP creation flag bits.
 *
 * Enumerators assigned from IB_UVERBS_QP_CREATE_* take their values from
 * constants defined outside this chunk; the rest are single bits spelled
 * out here.  Bits 26-31 are set aside for low level drivers.
 */
enum ib_qp_create_flags {
        IB_QP_CREATE_IPOIB_UD_LSO                = 1 << 0,
        IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK        =
                IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
        IB_QP_CREATE_CROSS_CHANNEL              = 1 << 2,
        IB_QP_CREATE_MANAGED_SEND               = 1 << 3,
        IB_QP_CREATE_MANAGED_RECV               = 1 << 4,
        IB_QP_CREATE_NETIF_QP                        = 1 << 5,
        IB_QP_CREATE_INTEGRITY_EN                = 1 << 6,
        IB_QP_CREATE_NETDEV_USE                        = 1 << 7,
        IB_QP_CREATE_SCATTER_FCS                =
                IB_UVERBS_QP_CREATE_SCATTER_FCS,
        IB_QP_CREATE_CVLAN_STRIPPING                =
                IB_UVERBS_QP_CREATE_CVLAN_STRIPPING,
        IB_QP_CREATE_SOURCE_QPN                        = 1 << 10,
        IB_QP_CREATE_PCI_WRITE_END_PADDING        =
                IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING,
        /* reserve bits 26-31 for low level drivers' internal use */
        IB_QP_CREATE_RESERVED_START                = 1 << 26,
        /*
         * Bit 31.  Shifting a signed 1 into the sign bit is undefined in C,
         * so build the bit pattern as unsigned and convert; the enumerator
         * keeps the same (negative) int value as before.
         */
        IB_QP_CREATE_RESERVED_END                = (int)(1U << 31),
};

/*
 * Note: users may not call ib_close_qp or ib_destroy_qp from the event_handler
 * callback to destroy the passed in QP.
 */

/*
 * Parameters describing a QP to be created: event callback and its opaque
 * context, the send/receive CQs, optional SRQ and XRC domain, capacity
 * limits, signalling type, QP type and creation flags.
 * NOTE(review): create_flags presumably carries enum ib_qp_create_flags
 * bits - confirm.
 */
struct ib_qp_init_attr {
        /* This callback occurs in workqueue context */
        void                  (*event_handler)(struct ib_event *, void *);

        void                       *qp_context;
        struct ib_cq               *send_cq;
        struct ib_cq               *recv_cq;
        struct ib_srq               *srq;
        struct ib_xrcd               *xrcd;     /* XRC TGT QPs only */
        struct ib_qp_cap        cap;
        enum ib_sig_type        sq_sig_type;
        enum ib_qp_type                qp_type;
        u32                        create_flags;

        /*
         * Only needed for special QP types, or when using the RW API.
         */
        u32                        port_num;
        struct ib_rwq_ind_table *rwq_ind_tbl;
        u32                        source_qpn;
};

/*
 * Parameters for opening a QP: an event callback with its opaque context
 * pointer, plus the number and type of the QP.
 * NOTE(review): presumably identifies an already existing (shared) QP by
 * qp_num - confirm against the open path.
 */
struct ib_qp_open_attr {
        void                  (*event_handler)(struct ib_event *, void *);
        void                       *qp_context;
        u32                        qp_num;
        enum ib_qp_type                qp_type;
};

/*
 * RNR timer encodings, values 0 through 31.
 * NOTE(review): the name suffix appears to spell the delay as
 * <integer>_<fraction> (e.g. 000_01 -> 0.01, 655_36 -> 655.36), presumably
 * in milliseconds - confirm against the InfiniBand specification.  Note that
 * the largest delay is encoded as 0, not 31.
 */
enum ib_rnr_timeout {
        IB_RNR_TIMER_655_36 = 0,
        IB_RNR_TIMER_000_01 = 1,
        IB_RNR_TIMER_000_02 = 2,
        IB_RNR_TIMER_000_03 = 3,
        IB_RNR_TIMER_000_04 = 4,
        IB_RNR_TIMER_000_06 = 5,
        IB_RNR_TIMER_000_08 = 6,
        IB_RNR_TIMER_000_12 = 7,
        IB_RNR_TIMER_000_16 = 8,
        IB_RNR_TIMER_000_24 = 9,
        IB_RNR_TIMER_000_32 = 10,
        IB_RNR_TIMER_000_48 = 11,
        IB_RNR_TIMER_000_64 = 12,
        IB_RNR_TIMER_000_96 = 13,
        IB_RNR_TIMER_001_28 = 14,
        IB_RNR_TIMER_001_92 = 15,
        IB_RNR_TIMER_002_56 = 16,
        IB_RNR_TIMER_003_84 = 17,
        IB_RNR_TIMER_005_12 = 18,
        IB_RNR_TIMER_007_68 = 19,
        IB_RNR_TIMER_010_24 = 20,
        IB_RNR_TIMER_015_36 = 21,
        IB_RNR_TIMER_020_48 = 22,
        IB_RNR_TIMER_030_72 = 23,
        IB_RNR_TIMER_040_96 = 24,
        IB_RNR_TIMER_061_44 = 25,
        IB_RNR_TIMER_081_92 = 26,
        IB_RNR_TIMER_122_88 = 27,
        IB_RNR_TIMER_163_84 = 28,
        IB_RNR_TIMER_245_76 = 29,
        IB_RNR_TIMER_327_68 = 30,
        IB_RNR_TIMER_491_52 = 31,
};

/*
 * QP attribute mask bits, one bit per enumerator (bits 0-25).
 *
 * IB_QP_ATTR_STANDARD_BITS is built with GENMASK(20, 0); assuming GENMASK
 * yields a contiguous mask from the low to the high bit, it covers
 * IB_QP_STATE through IB_QP_DEST_QPN and excludes the RESERVED and
 * RATE_LIMIT bits.
 */
enum ib_qp_attr_mask {
        IB_QP_STATE               = (1 << 0),
        IB_QP_CUR_STATE           = (1 << 1),
        IB_QP_EN_SQD_ASYNC_NOTIFY = (1 << 2),
        IB_QP_ACCESS_FLAGS        = (1 << 3),
        IB_QP_PKEY_INDEX          = (1 << 4),
        IB_QP_PORT                = (1 << 5),
        IB_QP_QKEY                = (1 << 6),
        IB_QP_AV                  = (1 << 7),
        IB_QP_PATH_MTU            = (1 << 8),
        IB_QP_TIMEOUT             = (1 << 9),
        IB_QP_RETRY_CNT           = (1 << 10),
        IB_QP_RNR_RETRY           = (1 << 11),
        IB_QP_RQ_PSN              = (1 << 12),
        IB_QP_MAX_QP_RD_ATOMIC    = (1 << 13),
        IB_QP_ALT_PATH            = (1 << 14),
        IB_QP_MIN_RNR_TIMER       = (1 << 15),
        IB_QP_SQ_PSN              = (1 << 16),
        IB_QP_MAX_DEST_RD_ATOMIC  = (1 << 17),
        IB_QP_PATH_MIG_STATE      = (1 << 18),
        IB_QP_CAP                 = (1 << 19),
        IB_QP_DEST_QPN            = (1 << 20),
        IB_QP_RESERVED1           = (1 << 21),
        IB_QP_RESERVED2           = (1 << 22),
        IB_QP_RESERVED3           = (1 << 23),
        IB_QP_RESERVED4           = (1 << 24),
        IB_QP_RATE_LIMIT          = (1 << 25),

        IB_QP_ATTR_STANDARD_BITS  = GENMASK(20, 0),
};

/* QP states, numbered consecutively from 0 (RESET) to 6 (ERR). */
enum ib_qp_state {
        IB_QPS_RESET = 0,
        IB_QPS_INIT  = 1,
        IB_QPS_RTR   = 2,
        IB_QPS_RTS   = 3,
        IB_QPS_SQD   = 4,
        IB_QPS_SQE   = 5,
        IB_QPS_ERR   = 6,
};

/* Path migration states, numbered consecutively from 0. */
enum ib_mig_state {
        IB_MIG_MIGRATED = 0,
        IB_MIG_REARM    = 1,
        IB_MIG_ARMED    = 2,
};

/* Memory window types; the numeric value equals the type number (1 or 2). */
enum ib_mw_type {
        IB_MW_TYPE_1 = 1,
        IB_MW_TYPE_2,
};

/*
 * QP attribute set: current/requested state, path MTU and migration state,
 * keys and PSNs, capacity limits, primary and alternate address-handle
 * attributes, and assorted per-path timers and counters.
 * NOTE(review): which members are meaningful in a given call is presumably
 * selected by an enum ib_qp_attr_mask value passed alongside - confirm.
 */
struct ib_qp_attr {
        enum ib_qp_state        qp_state;
        enum ib_qp_state        cur_qp_state;
        enum ib_mtu                path_mtu;
        enum ib_mig_state        path_mig_state;
        u32                        qkey;
        u32                        rq_psn;
        u32                        sq_psn;
        u32                        dest_qp_num;
        int                        qp_access_flags;
        struct ib_qp_cap        cap;
        struct rdma_ah_attr        ah_attr;
        struct rdma_ah_attr        alt_ah_attr;
        u16                        pkey_index;
        u16                        alt_pkey_index;
        u8                        en_sqd_async_notify;
        u8                        sq_draining;
        u8                        max_rd_atomic;
        u8                        max_dest_rd_atomic;
        u8                        min_rnr_timer;
        u32                        port_num;
        u8                        timeout;
        u8                        retry_cnt;
        u8                        rnr_retry;
        u32                        alt_port_num;
        u8                        alt_timeout;
        u32                        rate_limit;
        struct net_device        *xmit_slave;
};

/*
 * Work request opcodes, in three groups:
 *  - values taken from IB_UVERBS_WR_* constants (defined outside this
 *    chunk) and therefore shared with userspace;
 *  - kernel-only opcodes starting at 0x20;
 *  - a range starting at 0xf0 reserved for low level drivers.
 */
enum ib_wr_opcode {
        /* These are shared with userspace */
        IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE,
        IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM,
        IB_WR_SEND = IB_UVERBS_WR_SEND,
        IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM,
        IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ,
        IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP,
        IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD,
        IB_WR_BIND_MW = IB_UVERBS_WR_BIND_MW,
        IB_WR_LSO = IB_UVERBS_WR_TSO,
        IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV,
        IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV,
        IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV,
        IB_WR_MASKED_ATOMIC_CMP_AND_SWP =
                IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP,
        IB_WR_MASKED_ATOMIC_FETCH_AND_ADD =
                IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD,
        IB_WR_FLUSH = IB_UVERBS_WR_FLUSH,
        IB_WR_ATOMIC_WRITE = IB_UVERBS_WR_ATOMIC_WRITE,

        /* These are kernel only and can not be issued by userspace */
        IB_WR_REG_MR = 0x20,
        IB_WR_REG_MR_INTEGRITY,

        /* reserve values for low level drivers' internal use.
         * These values will not be used at all in the ib core layer.
         */
        IB_WR_RESERVED1 = 0xf0,
        IB_WR_RESERVED2,
        IB_WR_RESERVED3,
        IB_WR_RESERVED4,
        IB_WR_RESERVED5,
        IB_WR_RESERVED6,
        IB_WR_RESERVED7,
        IB_WR_RESERVED8,
        IB_WR_RESERVED9,
        IB_WR_RESERVED10,
};

/*
 * Send work request flag bits.  Bits 0-4 are defined here; bits 26-31 are
 * set aside for low level drivers.
 */
enum ib_send_flags {
        IB_SEND_FENCE                = (1 << 0),
        IB_SEND_SIGNALED        = (1 << 1),
        IB_SEND_SOLICITED        = (1 << 2),
        IB_SEND_INLINE                = (1 << 3),
        IB_SEND_IP_CSUM                = (1 << 4),

        /* reserve bits 26-31 for low level drivers' internal use */
        IB_SEND_RESERVED_START        = (1 << 26),
        /*
         * Bit 31.  Shifting a signed 1 into the sign bit is undefined in C,
         * so build the bit pattern as unsigned and convert; the enumerator
         * keeps the same (negative) int value as before.
         */
        IB_SEND_RESERVED_END        = (int)(1U << 31),
};

/*
 * Scatter/gather element: a 64-bit address, a 32-bit length and a 32-bit
 * lkey.
 * NOTE(review): length is presumably in bytes - confirm.
 */
struct ib_sge {
        u64        addr;
        u32        length;
        u32        lkey;
};

/*
 * Completion callback holder: a single function pointer that receives the
 * CQ and the work completion.
 * NOTE(review): presumably embedded in a caller structure and referenced
 * through the wr_cqe members of work requests - confirm.
 */
struct ib_cqe {
        void (*done)(struct ib_cq *cq, struct ib_wc *wc);
};

/*
 * Send work request.
 *
 * next links to another struct ib_send_wr, so requests can be chained.
 * wr_id and wr_cqe share storage (anonymous union), as do ex.imm_data
 * (big-endian, __be32) and ex.invalidate_rkey.
 * NOTE(review): send_flags presumably carries enum ib_send_flags bits, and
 * sg_list presumably points at num_sge entries - confirm.
 */
struct ib_send_wr {
        struct ib_send_wr      *next;
        union {
                u64                wr_id;
                struct ib_cqe        *wr_cqe;
        };
        struct ib_sge               *sg_list;
        int                        num_sge;
        enum ib_wr_opcode        opcode;
        int                        send_flags;
        union {
                __be32                imm_data;
                u32                invalidate_rkey;
        } ex;
};

/*
 * RDMA work request: a generic send WR (embedded as the member `wr`)
 * extended with a remote address and rkey.
 */
struct ib_rdma_wr {
        struct ib_send_wr        wr;
        u64                        remote_addr;
        u32                        rkey;
};

/*
 * Map a pointer to the embedded `wr` member back to its enclosing
 * struct ib_rdma_wr.  Nothing here verifies that @wr really lives inside a
 * struct ib_rdma_wr; that is the caller's responsibility.
 */
static inline const struct ib_rdma_wr *rdma_wr(const struct ib_send_wr *wr)
{
        const struct ib_rdma_wr *outer =
                container_of(wr, struct ib_rdma_wr, wr);

        return outer;
}

/*
 * Atomic work request: a generic send WR (embedded as the member `wr`)
 * extended with a remote address, 64-bit operands, their masks and an rkey.
 * NOTE(review): compare_add presumably holds the compare value for
 * compare-and-swap and the addend for fetch-and-add; the *_mask members are
 * presumably used only by the masked opcodes - confirm.
 */
struct ib_atomic_wr {
        struct ib_send_wr        wr;
        u64                        remote_addr;
        u64                        compare_add;
        u64                        swap;
        u64                        compare_add_mask;
        u64                        swap_mask;
        u32                        rkey;
};

/*
 * Map a pointer to the embedded `wr` member back to its enclosing
 * struct ib_atomic_wr.  Nothing here verifies that @wr really lives inside
 * a struct ib_atomic_wr; that is the caller's responsibility.
 */
static inline const struct ib_atomic_wr *atomic_wr(const struct ib_send_wr *wr)
{
        const struct ib_atomic_wr *outer =
                container_of(wr, struct ib_atomic_wr, wr);

        return outer;
}

/*
 * UD work request: a generic send WR (embedded as the member `wr`) extended
 * with an address handle, an optional header buffer (header/hlen), mss, and
 * the remote QP number and Q_Key.
 * NOTE(review): header/hlen/mss look like segmentation-offload parameters -
 * confirm.
 */
struct ib_ud_wr {
        struct ib_send_wr        wr;
        struct ib_ah                *ah;
        void                        *header;
        int                        hlen;
        int                        mss;
        u32                        remote_qpn;
        u32                        remote_qkey;
        u16                        pkey_index; /* valid for GSI only */
        u32                        port_num; /* valid for DR SMPs on switch only */
};

/*
 * Map a pointer to the embedded `wr` member back to its enclosing
 * struct ib_ud_wr.  Nothing here verifies that @wr really lives inside a
 * struct ib_ud_wr; that is the caller's responsibility.
 */
static inline const struct ib_ud_wr *ud_wr(const struct ib_send_wr *wr)
{
        const struct ib_ud_wr *outer = container_of(wr, struct ib_ud_wr, wr);

        return outer;
}

/*
 * Registration work request: a generic send WR (embedded as the member
 * `wr`) extended with the MR, a key and access flags.
 * NOTE(review): access presumably carries enum ib_access_flags bits -
 * confirm.
 */
struct ib_reg_wr {
        struct ib_send_wr        wr;
        struct ib_mr                *mr;
        u32                        key;
        int                        access;
};

/*
 * Map a pointer to the embedded `wr` member back to its enclosing
 * struct ib_reg_wr.  Nothing here verifies that @wr really lives inside a
 * struct ib_reg_wr; that is the caller's responsibility.
 */
static inline const struct ib_reg_wr *reg_wr(const struct ib_send_wr *wr)
{
        const struct ib_reg_wr *outer = container_of(wr, struct ib_reg_wr, wr);

        return outer;
}

/*
 * Receive work request.
 *
 * next links to another struct ib_recv_wr, so requests can be chained.
 * wr_id and wr_cqe share storage (anonymous union).
 * NOTE(review): sg_list presumably points at num_sge entries - confirm.
 */
struct ib_recv_wr {
        struct ib_recv_wr      *next;
        union {
                u64                wr_id;
                struct ib_cqe        *wr_cqe;
        };
        struct ib_sge               *sg_list;
        int                        num_sge;
};

/*
 * Memory access flags.  All individual flags take their values from
 * IB_UVERBS_ACCESS_* constants defined outside this chunk.
 *
 * IB_ACCESS_SUPPORTED is every bit up to and including the bit of
 * IB_ACCESS_FLUSH_PERSISTENT, OR-ed with IB_ACCESS_OPTIONAL.
 * NOTE(review): this relies on IB_ACCESS_FLUSH_PERSISTENT being a single
 * bit and the highest non-optional flag - confirm against the uapi header
 * when adding flags.
 */
enum ib_access_flags {
        IB_ACCESS_LOCAL_WRITE = IB_UVERBS_ACCESS_LOCAL_WRITE,
        IB_ACCESS_REMOTE_WRITE = IB_UVERBS_ACCESS_REMOTE_WRITE,
        IB_ACCESS_REMOTE_READ = IB_UVERBS_ACCESS_REMOTE_READ,
        IB_ACCESS_REMOTE_ATOMIC = IB_UVERBS_ACCESS_REMOTE_ATOMIC,
        IB_ACCESS_MW_BIND = IB_UVERBS_ACCESS_MW_BIND,
        IB_ZERO_BASED = IB_UVERBS_ACCESS_ZERO_BASED,
        IB_ACCESS_ON_DEMAND = IB_UVERBS_ACCESS_ON_DEMAND,
        IB_ACCESS_HUGETLB = IB_UVERBS_ACCESS_HUGETLB,
        IB_ACCESS_RELAXED_ORDERING = IB_UVERBS_ACCESS_RELAXED_ORDERING,
        IB_ACCESS_FLUSH_GLOBAL = IB_UVERBS_ACCESS_FLUSH_GLOBAL,
        IB_ACCESS_FLUSH_PERSISTENT = IB_UVERBS_ACCESS_FLUSH_PERSISTENT,

        IB_ACCESS_OPTIONAL = IB_UVERBS_ACCESS_OPTIONAL_RANGE,
        IB_ACCESS_SUPPORTED =
                ((IB_ACCESS_FLUSH_PERSISTENT << 1) - 1) | IB_ACCESS_OPTIONAL,
};

/*
 * XXX: these are apparently used for ->rereg_user_mr, no idea why they
 * are hidden here instead of a uapi header!
 */
/*
 * MR re-registration flag bits.  IB_MR_REREG_SUPPORTED is the mask of every
 * bit up to and including IB_MR_REREG_ACCESS (i.e. the three flags above
 * it).
 */
enum ib_mr_rereg_flags {
        IB_MR_REREG_TRANS     = (1 << 0),
        IB_MR_REREG_PD        = (1 << 1),
        IB_MR_REREG_ACCESS    = (1 << 2),
        IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1),
};

struct ib_umem;

/* Why a uobject is being removed; values run consecutively from 0. */
enum rdma_remove_reason {
        /*
         * Userspace asked for the uobject to be deleted, or this is the
         * first cleanup attempt to remove it.  The call may fail.
         */
        RDMA_REMOVE_DESTROY = 0,
        /* Context deletion. This call should delete the actual object itself */
        RDMA_REMOVE_CLOSE = 1,
        /* Driver is being hot-unplugged. This call should delete the actual object itself */
        RDMA_REMOVE_DRIVER_REMOVE = 2,
        /* uobj is being cleaned up before it was committed */
        RDMA_REMOVE_ABORT = 3,
        /* The driver failed to destroy the uobject and is being disconnected */
        RDMA_REMOVE_DRIVER_FAILURE = 4,
};

/*
 * RDMA cgroup bookkeeping handle.  The cg member exists only when
 * CONFIG_CGROUP_RDMA is defined; otherwise the struct has no members.
 */
struct ib_rdmacg_object {
#ifdef CONFIG_CGROUP_RDMA
        struct rdma_cgroup        *cg;                /* owner rdma cgroup */
#endif
};

/*
 * User context: links a device with the uverbs file that owns it, plus the
 * cgroup handle and an enabled-capabilities word.  The trailing members are
 * core-internal (see the comment below).
 */
struct ib_ucontext {
        struct ib_device       *device;
        struct ib_uverbs_file  *ufile;

        struct ib_rdmacg_object        cg_obj;
        u64 enabled_caps;
        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry res;
        struct xarray mmap_xa;
};

/*
 * Base bookkeeping record for an object exposed to userspace: the handle
 * userspace supplied, owning file/context, the wrapped kernel object, list
 * linkage, an id, and reference/usage counters.
 */
struct ib_uobject {
        u64                        user_handle;        /* handle given to us by userspace */
        /* ufile & ucontext owning this object */
        struct ib_uverbs_file  *ufile;
        /* FIXME, save memory: ufile->context == context */
        struct ib_ucontext     *context;        /* associated user context */
        void                       *object;                /* containing object */
        struct list_head        list;                /* link to context's list */
        struct ib_rdmacg_object        cg_obj;                /* rdmacg object */
        int                        id;                /* index into kernel idr */
        struct kref                ref;
        atomic_t                usecnt;                /* protects exclusive access */
        struct rcu_head                rcu;                /* kfree_rcu() overhead */

        const struct uverbs_api_object *uapi_object;
};

/*
 * Pair of userspace buffers: a read-only input buffer and an output buffer
 * (both __user pointers), each with its own size_t length.
 */
struct ib_udata {
        const void __user *inbuf;
        void __user *outbuf;
        size_t       inlen;
        size_t       outlen;
};

/*
 * Protection domain: owning device and uobject, a local DMA lkey, flags, a
 * usage counter, and an rkey whose name marks it as unsafe/global.  The
 * trailing members are core-internal (see the comment below).
 */
struct ib_pd {
        u32                        local_dma_lkey;
        u32                        flags;
        struct ib_device       *device;
        struct ib_uobject      *uobject;
        atomic_t                  usecnt; /* count all resources */

        u32                        unsafe_global_rkey;

        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct ib_mr               *__internal_mr;
        struct rdma_restrack_entry res;
};

/*
 * XRC domain: owning device, a usage counter, an inode, and an xarray of
 * target QPs.
 * NOTE(review): tgt_qps_rwsem presumably guards tgt_qps - confirm.
 */
struct ib_xrcd {
        struct ib_device       *device;
        atomic_t                usecnt; /* count all exposed resources */
        struct inode               *inode;
        struct rw_semaphore        tgt_qps_rwsem;
        struct xarray                tgt_qps;
};

/*
 * Address handle: owning device, protection domain and uobject, a pointer
 * to a const GID attribute, and the address-handle attribute type.
 */
struct ib_ah {
        struct ib_device        *device;
        struct ib_pd                *pd;
        struct ib_uobject        *uobject;
        const struct ib_gid_attr *sgid_attr;
        enum rdma_ah_attr_type        type;
};

/*
 * Completion callback type: receives the CQ and an opaque cq_context
 * pointer, returns nothing.
 */
typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);

/*
 * CQ polling contexts.  IB_POLL_LAST_POOL_TYPE aliases
 * IB_POLL_UNBOUND_WORKQUEUE; IB_POLL_DIRECT follows it numerically.
 */
enum ib_poll_context {
        /* poll from softirq context */
        IB_POLL_SOFTIRQ = 0,
        /* poll from workqueue */
        IB_POLL_WORKQUEUE = 1,
        /* poll from unbound workqueue */
        IB_POLL_UNBOUND_WORKQUEUE = 2,
        IB_POLL_LAST_POOL_TYPE = IB_POLL_UNBOUND_WORKQUEUE,

        /* caller context, no hw completions */
        IB_POLL_DIRECT = 3,
};

/*
 * Completion queue.
 *
 * Holds the completion and event callbacks with their opaque context, the
 * queue size fields, a usage counter, the polling context and a work
 * completion array pointer.  iop and work share storage (anonymous union).
 * NOTE(review): which union member is active is presumably determined by
 * poll_ctx - confirm.
 */
struct ib_cq {
        struct ib_device       *device;
        struct ib_ucq_object   *uobject;
        ib_comp_handler           comp_handler;
        void                  (*event_handler)(struct ib_event *, void *);
        void                   *cq_context;
        int                       cqe;
        unsigned int                cqe_used;
        atomic_t                  usecnt; /* count number of work queues */
        enum ib_poll_context        poll_ctx;
        struct ib_wc                *wc;
        struct list_head        pool_entry;
        union {
                struct irq_poll                iop;
                struct work_struct        work;
        };
        struct workqueue_struct *comp_wq;
        struct dim *dim;

        /* updated only by trace points */
        ktime_t timestamp;
        /* two single-bit flags packed into one u8 */
        u8 interrupt:1;
        u8 shared:1;
        unsigned int comp_vector;

        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry res;
};

/*
 * Shared receive queue: owning device, PD and uobject, an event callback
 * with its opaque context, the SRQ type and a usage counter.
 *
 * ext carries a CQ pointer and, inside a single-member anonymous union, the
 * XRC domain and SRQ number.
 * NOTE(review): ext is presumably only meaningful for SRQ types that have a
 * CQ - confirm.
 */
struct ib_srq {
        struct ib_device       *device;
        struct ib_pd               *pd;
        struct ib_usrq_object  *uobject;
        void                      (*event_handler)(struct ib_event *, void *);
        void                       *srq_context;
        enum ib_srq_type        srq_type;
        atomic_t                usecnt;

        struct {
                struct ib_cq   *cq;
                union {
                        struct {
                                struct ib_xrcd *xrcd;
                                u32                srq_num;
                        } xrc;
                };
        } ext;

        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry res;
};

/*
 * Raw packet capability flags.  Every enumerator takes its value from the
 * matching IB_UVERBS_RAW_PACKET_CAP_* constant, defined outside this chunk.
 */
enum ib_raw_packet_caps {
        /*
         * Strip cvlan from incoming packet and report it in the matching work
         * completion is supported.
         */
        IB_RAW_PACKET_CAP_CVLAN_STRIPPING =
                IB_UVERBS_RAW_PACKET_CAP_CVLAN_STRIPPING,
        /*
         * Scatter FCS field of an incoming packet to host memory is supported.
         */
        IB_RAW_PACKET_CAP_SCATTER_FCS = IB_UVERBS_RAW_PACKET_CAP_SCATTER_FCS,
        /* Checksum offloads are supported (for both send and receive). */
        IB_RAW_PACKET_CAP_IP_CSUM = IB_UVERBS_RAW_PACKET_CAP_IP_CSUM,
        /*
         * When a packet is received for an RQ with no receive WQEs, the
         * packet processing is delayed.
         */
        IB_RAW_PACKET_CAP_DELAY_DROP = IB_UVERBS_RAW_PACKET_CAP_DELAY_DROP,
};

/*
 * Work queue types.  Only one type is defined; its value comes from
 * IB_UVERBS_WQT_RQ, which is defined outside this chunk.
 */
enum ib_wq_type {
        IB_WQT_RQ = IB_UVERBS_WQT_RQ,
};

/* Work queue states, numbered consecutively from 0. */
enum ib_wq_state {
        IB_WQS_RESET = 0,
        IB_WQS_RDY   = 1,
        IB_WQS_ERR   = 2,
};

/*
 * Work queue: owning device and uobject, an event callback with its opaque
 * context, the PD and CQ it is attached to, its number, state, type and a
 * usage counter.
 */
struct ib_wq {
        struct ib_device       *device;
        struct ib_uwq_object   *uobject;
        void                    *wq_context;
        void                    (*event_handler)(struct ib_event *, void *);
        struct ib_pd               *pd;
        struct ib_cq               *cq;
        u32                wq_num;
        enum ib_wq_state       state;
        enum ib_wq_type        wq_type;
        atomic_t                usecnt;
};

/*
 * Work queue flags.  Every enumerator takes its value from the matching
 * IB_UVERBS_WQ_FLAGS_* constant, defined outside this chunk.
 */
enum ib_wq_flags {
        IB_WQ_FLAGS_CVLAN_STRIPPING        = IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING,
        IB_WQ_FLAGS_SCATTER_FCS                = IB_UVERBS_WQ_FLAGS_SCATTER_FCS,
        IB_WQ_FLAGS_DELAY_DROP                = IB_UVERBS_WQ_FLAGS_DELAY_DROP,
        IB_WQ_FLAGS_PCI_WRITE_END_PADDING =
                                IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING,
};

/*
 * Parameters describing a work queue to be created: opaque context, type,
 * WR and SGE maxima, the CQ to attach, an event callback and creation flags.
 */
struct ib_wq_init_attr {
        void                       *wq_context;
        enum ib_wq_type        wq_type;
        u32                max_wr;
        u32                max_sge;
        struct        ib_cq               *cq;
        void                    (*event_handler)(struct ib_event *, void *);
        u32                create_flags; /* Use enum ib_wq_flags */
};

/*
 * Work queue attribute mask bits, one bit per enumerator.
 * NOTE(review): presumably select which struct ib_wq_attr members a modify
 * call applies - confirm against the callers.
 */
enum ib_wq_attr_mask {
        IB_WQ_STATE     = (1 << 0),
        IB_WQ_CUR_STATE = (1 << 1),
        IB_WQ_FLAGS     = (1 << 2),
};

/*
 * Work queue attributes: a target state, a current state, and a flags word
 * with an accompanying mask.
 * NOTE(review): flags_mask presumably selects which bits of flags are to be
 * changed - confirm.
 */
struct ib_wq_attr {
        enum        ib_wq_state        wq_state;
        enum        ib_wq_state        curr_wq_state;
        u32                        flags; /* Use enum ib_wq_flags */
        u32                        flags_mask; /* Use enum ib_wq_flags */
};

/*
 * Receive WQ indirection table: owning device and uobject, a usage counter,
 * the table number, and an array of work queue pointers.
 * NOTE(review): log_ind_tbl_size is presumably log2 of the number of
 * entries in ind_tbl - confirm.
 */
struct ib_rwq_ind_table {
        struct ib_device        *device;
        struct ib_uobject      *uobject;
        atomic_t                usecnt;
        u32                ind_tbl_num;
        u32                log_ind_tbl_size;
        struct ib_wq        **ind_tbl;
};

/*
 * Parameters describing an indirection table to be created.
 * NOTE(review): log_ind_tbl_size is presumably log2 of the number of
 * entries in ind_tbl - confirm.
 */
struct ib_rwq_ind_table_init_attr {
        u32                log_ind_tbl_size;
        /* Each entry is a pointer to Receive Work Queue */
        struct ib_wq        **ind_tbl;
};

/* State of a port/pkey setting; values run consecutively from 0. */
enum port_pkey_state {
        IB_PORT_PKEY_NOT_VALID = 0,
        IB_PORT_PKEY_VALID,
        IB_PORT_PKEY_LISTED,
};

struct ib_qp_security;

/*
 * One port/pkey setting: its state, the pkey index and port number, two
 * list linkages, and a back pointer to the owning QP security record.
 */
struct ib_port_pkey {
        enum port_pkey_state        state;
        u16                        pkey_index;
        u32                        port_num;
        struct list_head        qp_list;
        struct list_head        to_error_list;
        struct ib_qp_security  *sec;
};

/*
 * Pair of port/pkey settings, named main and alt.
 * NOTE(review): presumably the primary and alternate path - confirm.
 */
struct ib_ports_pkeys {
        struct ib_port_pkey        main;
        struct ib_port_pkey        alt;
};

/*
 * Security state attached to a QP: the QP and device, the port/pkey
 * settings with the mutex that must be held while changing them, the list
 * of shared QP handles, an opaque security pointer, and bookkeeping for
 * teardown and error handling.
 */
struct ib_qp_security {
        struct ib_qp               *qp;
        struct ib_device       *dev;
        /* Hold this mutex when changing port and pkey settings. */
        struct mutex                mutex;
        struct ib_ports_pkeys  *ports_pkeys;
        /* A list of all open shared QP handles.  Required to enforce security
         * properly for all users of a shared QP.
         */
        struct list_head        shared_qp_list;
        void                   *security;
        bool                        destroying;
        atomic_t                error_list_count;
        struct completion        error_complete;
        int                        error_comps_pending;
};

/*
 * Queue pair.
 *
 * @max_write_sge: Maximum SGE elements per RDMA WRITE request.
 * @max_read_sge:  Maximum SGE elements per RDMA READ request.
 *
 * NOTE(review): mr_lock presumably guards mrs_used and the rdma_mrs /
 * sig_mrs lists - confirm against the users of those members.
 */
struct ib_qp {
        struct ib_device       *device;
        struct ib_pd               *pd;
        struct ib_cq               *send_cq;
        struct ib_cq               *recv_cq;
        spinlock_t                mr_lock;
        int                        mrs_used;
        struct list_head        rdma_mrs;
        struct list_head        sig_mrs;
        struct ib_srq               *srq;
        struct completion        srq_completion;
        struct ib_xrcd               *xrcd; /* XRC TGT QPs only */
        struct list_head        xrcd_list;

        /* count times opened, mcast attaches, flow attaches */
        atomic_t                usecnt;
        struct list_head        open_list;
        struct ib_qp           *real_qp;
        struct ib_uqp_object   *uobject;
        void                  (*event_handler)(struct ib_event *, void *);
        void                  (*registered_event_handler)(struct ib_event *, void *);
        void                       *qp_context;
        /* sgid_attrs associated with the AV's */
        const struct ib_gid_attr *av_sgid_attr;
        const struct ib_gid_attr *alt_path_sgid_attr;
        u32                        qp_num;
        u32                        max_write_sge;
        u32                        max_read_sge;
        enum ib_qp_type                qp_type;
        struct ib_rwq_ind_table *rwq_ind_tbl;
        struct ib_qp_security  *qp_sec;
        u32                        port;

        bool                        integrity_en;
        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry     res;

        /* The counter the qp is bound to */
        struct rdma_counter    *counter;
};

/*
 * Device memory object: owning device and uobject, a 32-bit length, flags
 * and a usage counter.
 * NOTE(review): length is presumably in bytes - confirm.
 */
struct ib_dm {
        struct ib_device  *device;
        u32                   length;
        u32                   flags;
        struct ib_uobject *uobject;
        atomic_t           usecnt;
};

/*
 * Memory region: owning device and PD, local and remote keys, the I/O
 * virtual address and length, page size, MR type and an invalidation flag.
 *
 * uobject and qp_entry share storage (anonymous union); per the inline
 * tags, uobject is for user MRs and qp_entry for FR MRs.
 */
struct ib_mr {
        struct ib_device  *device;
        struct ib_pd          *pd;
        u32                   lkey;
        u32                   rkey;
        u64                   iova;
        u64                   length;
        unsigned int           page_size;
        enum ib_mr_type           type;
        bool                   need_inval;
        union {
                struct ib_uobject        *uobject;        /* user */
                struct list_head        qp_entry;        /* FR */
        };

        struct ib_dm      *dm;
        struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */
        /*
         * Implementation details of the RDMA core, don't use in drivers:
         */
        struct rdma_restrack_entry res;
};

/*
 * Memory window: owning device, PD and uobject, its rkey, and the window
 * type (enum ib_mw_type).
 */
struct ib_mw {
        struct ib_device        *device;
        struct ib_pd                *pd;
        struct ib_uobject        *uobject;
        u32                        rkey;
        enum ib_mw_type         type;
};

/* Supported steering options (values 0x0 through 0x3). */
enum ib_flow_attr_type {
        /* Steer according to the rule specifications. */
        IB_FLOW_ATTR_NORMAL      = 0x0,
        /*
         * Default unicast and multicast rule: receive all Eth traffic
         * which isn't steered to any QP.
         */
        IB_FLOW_ATTR_ALL_DEFAULT = 0x1,
        /*
         * Default multicast rule: receive all Eth multicast traffic
         * which isn't steered to any QP.
         */
        IB_FLOW_ATTR_MC_DEFAULT  = 0x2,
        /* Sniffer rule: receive all port traffic. */
        IB_FLOW_ATTR_SNIFFER     = 0x3,
};

/*
 * Supported steering header types.
 *
 * Header specs are grouped by the upper nibble of the low byte (0x2x, 0x3x,
 * 0x4x, ...), which is the nibble IB_FLOW_SPEC_LAYER_MASK selects.
 * IB_FLOW_SPEC_INNER is a separate bit (0x100), and action specs start at
 * 0x1000.
 */
enum ib_flow_spec_type {
        /* L2 headers */
        IB_FLOW_SPEC_ETH           = 0x20,
        IB_FLOW_SPEC_IB            = 0x22,
        /* L3 headers */
        IB_FLOW_SPEC_IPV4          = 0x30,
        IB_FLOW_SPEC_IPV6          = 0x31,
        IB_FLOW_SPEC_ESP           = 0x34,
        /* L4 headers */
        IB_FLOW_SPEC_TCP           = 0x40,
        IB_FLOW_SPEC_UDP           = 0x41,
        IB_FLOW_SPEC_VXLAN_TUNNEL  = 0x50,
        IB_FLOW_SPEC_GRE           = 0x51,
        IB_FLOW_SPEC_MPLS          = 0x60,
        IB_FLOW_SPEC_INNER         = 0x100,
        /* Actions */
        IB_FLOW_SPEC_ACTION_TAG    = 0x1000,
        IB_FLOW_SPEC_ACTION_DROP   = 0x1001,
        IB_FLOW_SPEC_ACTION_HANDLE = 0x1002,
        IB_FLOW_SPEC_ACTION_COUNT  = 0x1003,
};
/* Selects bits 4-7 of a spec type value. */
#define IB_FLOW_SPEC_LAYER_MASK        0xF0
/* NOTE(review): presumably the maximum number of layers in one flow - confirm */
#define IB_FLOW_SPEC_SUPPORT_LAYERS 10

/* Flow attribute flag bits.  Bit 0 is not used by this enum. */
enum ib_flow_flags {
        /* Continue match, no steal */
        IB_FLOW_ATTR_FLAGS_DONT_TRAP = (1UL << 1),
        /* Egress flow */
        IB_FLOW_ATTR_FLAGS_EGRESS    = (1UL << 2),
        /* Must be last */
        IB_FLOW_ATTR_FLAGS_RESERVED  = (1UL << 3)
};

/*
 * Ethernet match fields: 6-byte destination and source MAC addresses, and
 * big-endian (__be16) ether type and VLAN tag.
 */
struct ib_flow_eth_filter {
        u8        dst_mac[6];
        u8        src_mac[6];
        __be16        ether_type;
        __be16        vlan_tag;
};

/*
 * Ethernet flow spec: a type/size header followed by a value filter and a
 * mask filter of the same layout.
 * NOTE(review): mask presumably selects which bits of val must match, and
 * size presumably is the byte size of this spec - confirm.
 */
struct ib_flow_spec_eth {
        u32                          type;
        u16                          size;
        struct ib_flow_eth_filter val;
        struct ib_flow_eth_filter mask;
};

/* IB match fields: a big-endian (__be16) dlid and an 8-bit sl. */
struct ib_flow_ib_filter {
        __be16 dlid;
        __u8   sl;
};

/*
 * IB flow spec: a type/size header followed by a value filter and a mask
 * filter of the same layout.
 * NOTE(review): mask presumably selects which bits of val must match, and
 * size presumably is the byte size of this spec - confirm.
 */
struct ib_flow_spec_ib {
        u32                         type;
        u16                         size;
        struct ib_flow_ib_filter val;
        struct ib_flow_ib_filter mask;
};

/* IPv4 header flags */
enum ib_ipv4_flags {
        /* Don't enable packet fragmentation */
        IB_IPV4_DONT_FRAG = 0x2,
        /*
         * All fragments of a fragmented packet except the last have this
         * flag set.
         */
        IB_IPV4_MORE_FRAG = 0x4
};

/*
 * IPv4 match fields: big-endian (__be32) source and destination addresses,
 * followed by one byte each for proto, tos, ttl and flags.
 * NOTE(review): flags presumably carries enum ib_ipv4_flags bits - confirm.
 */
struct ib_flow_ipv4_filter {
        __be32        src_ip;
        __be32        dst_ip;
        u8        proto;
        u8        tos;
        u8        ttl;
        u8        flags;
};

/*
 * IPv4 flow spec: a type/size header followed by a value filter and a mask
 * filter of the same layout.
 * NOTE(review): mask presumably selects which bits of val must match, and
 * size presumably is the byte size of this spec - confirm.
 */
struct ib_flow_spec_ipv4 {
        u32                           type;
        u16                           size;
        struct ib_flow_ipv4_filter val;
        struct ib_flow_ipv4_filter mask;
};

/*
 * IPv6 match fields: 16-byte source and destination addresses, a
 * big-endian (__be32) flow label, and one byte each for next header,
 * traffic class and hop limit.  Declared __packed.
 */
struct ib_flow_ipv6_filter {
        u8        src_ip[16];
        u8        dst_ip[16];
        __be32        flow_label;
        u8        next_hdr;
        u8        traffic_class;
        u8        hop_limit;
} __packed;

/*
 * IPv6 flow spec: a type/size header followed by a value filter and a mask
 * filter of the same layout.
 * NOTE(review): mask presumably selects which bits of val must match, and
 * size presumably is the byte size of this spec - confirm.
 */
struct ib_flow_spec_ipv6 {
        u32                           type;
        u16                           size;
        struct ib_flow_ipv6_filter val;
        struct ib_flow_ipv6_filter mask;
};

/* TCP/UDP match fields: big-endian (__be16) destination and source ports. */
struct ib_flow_tcp_udp_filter {
        __be16        dst_port;
        __be16        src_port;
};

/*
 * TCP/UDP flow spec: a type/size header followed by a value filter and a
 * mask filter of the same layout.
 * NOTE(review): mask presumably selects which bits of val must match, and
 * size presumably is the byte size of this spec - confirm.
 */
struct ib_flow_spec_tcp_udp {
        u32                              type;
        u16                              size;
        struct ib_flow_tcp_udp_filter val;
        struct ib_flow_tcp_udp_filter mask;
};

/* Tunnel match field: a single big-endian (__be32) tunnel id. */
struct ib_flow_tunnel_filter {
        __be32        tunnel_id;
};

/*
 * ib_flow_spec_tunnel describes the VXLAN tunnel; the tunnel_id from val
 * holds the VNI value.
 *
 * Layout: a type/size header followed by a value filter and a mask filter.
 * NOTE(review): mask presumably selects which bits of val must match, and
 * size presumably is the byte size of this spec - confirm.
 */
struct ib_flow_spec_tunnel {
        u32                              type;
        u16                              size;
        struct ib_flow_tunnel_filter  val;
        struct ib_flow_tunnel_filter  mask;
};

/* Filter fields used by struct ib_flow_spec_esp; both are big-endian. */
struct ib_flow_esp_filter {
        __be32 spi;
        __be32 seq;
};

/*
 * Flow spec wrapping a struct ib_flow_esp_filter val/mask pair, preceded
 * by the type/size header shared with union ib_flow_spec.
 */
struct ib_flow_spec_esp {
        u32 type;
        u16 size;
        struct ib_flow_esp_filter val;
        struct ib_flow_esp_filter mask;
};

/* Filter fields used by struct ib_flow_spec_gre; all are big-endian. */
struct ib_flow_gre_filter {
        __be16 c_ks_res0_ver;
        __be16 protocol;
        __be32 key;
};

/*
 * Flow spec wrapping a struct ib_flow_gre_filter val/mask pair, preceded
 * by the type/size header shared with union ib_flow_spec.
 */
struct ib_flow_spec_gre {
        u32 type;
        u16 size;
        struct ib_flow_gre_filter val;
        struct ib_flow_gre_filter mask;
};

/* Filter field used by struct ib_flow_spec_mpls (as both val and mask). */
struct ib_flow_mpls_filter {
        __be32 tag; /* big-endian */
};

/*
 * Flow spec wrapping a struct ib_flow_mpls_filter val/mask pair, preceded
 * by the type/size header shared with union ib_flow_spec.
 */
struct ib_flow_spec_mpls {
        u32 type;
        u16 size;
        struct ib_flow_mpls_filter val;
        struct ib_flow_mpls_filter mask;
};

/*
 * Action spec carrying a tag id.  Unlike the match specs, the header's
 * type member is declared as enum ib_flow_spec_type rather than u32.
 */
struct ib_flow_spec_action_tag {
        enum ib_flow_spec_type type;
        u16 size;
        u32 tag_id;
};

/* Action spec with no payload beyond the type/size header. */
struct ib_flow_spec_action_drop {
        enum ib_flow_spec_type type;
        u16 size;
};

/* Action spec that references a struct ib_flow_action through @act. */
struct ib_flow_spec_action_handle {
        enum ib_flow_spec_type type;
        u16 size;
        struct ib_flow_action *act;
};

/* Counter kinds; values are consecutive starting from 0. */
enum ib_counters_description {
        IB_COUNTER_PACKETS = 0,
        IB_COUNTER_BYTES = 1,
};

/* Action spec that references a struct ib_counters object via @counters. */
struct ib_flow_spec_action_count {
        enum ib_flow_spec_type  type;
        u16                     size;
        struct ib_counters      *counters;
};

/*
 * Any one flow specification.  The anonymous struct exposes the type/size
 * header that each member struct also starts with, so the header can be
 * read without knowing which member is in use.
 */
union ib_flow_spec {
        /* Common header */
        struct {
                u32 type;
                u16 size;
        };
        /* Match specs */
        struct ib_flow_spec_eth eth;
        struct ib_flow_spec_ib ib;
        struct ib_flow_spec_ipv4 ipv4;
        struct ib_flow_spec_tcp_udp tcp_udp;
        struct ib_flow_spec_ipv6 ipv6;
        struct ib_flow_spec_tunnel tunnel;
        struct ib_flow_spec_esp esp;
        struct ib_flow_spec_gre gre;
        struct ib_flow_spec_mpls mpls;
        /* Action specs */
        struct ib_flow_spec_action_tag flow_tag;
        struct ib_flow_spec_action_drop drop;
        struct ib_flow_spec_action_handle action;
        struct ib_flow_spec_action_count flow_count;
};

/*
 * Flow attributes followed by a flexible array of flow specs.
 * NOTE(review): num_of_specs presumably gives the number of entries in
 * flows[] - confirm against the code that builds this struct.
 */
struct ib_flow_attr {
        enum ib_flow_attr_type type;
        u16 size;
        u16 priority;
        u32 flags;
        u8 num_of_specs;
        u32 port;
        union ib_flow_spec flows[]; /* flexible array member, must be last */
};

/* A flow object: back-pointers to its QP, device and user object. */
struct ib_flow {
        struct ib_qp *qp;
        struct ib_device *device;
        struct ib_uobject *uobject;
};

/* Kind of a struct ib_flow_action. */
enum ib_flow_action_type {
        IB_FLOW_ACTION_UNSPECIFIED = 0,
        IB_FLOW_ACTION_ESP = 1,
};

/*
 * ESP keying material: @protocol is paired with a union that currently
 * has a single member, aes_gcm.
 */
struct ib_flow_action_attrs_esp_keymats {
        enum ib_uverbs_flow_action_esp_keymat protocol;
        union {
                struct ib_uverbs_flow_action_esp_keymat_aes_gcm aes_gcm;
        } keymat;
};

/*
 * ESP replay configuration: @protocol is paired with a union that
 * currently has a single member, bmp.
 */
struct ib_flow_action_attrs_esp_replays {
        enum ib_uverbs_flow_action_esp_replay protocol;
        union {
                struct ib_uverbs_flow_action_esp_replay_bmp bmp;
        } replay;
};

enum ib_flow_action_attrs_esp_flags {
        /*
         * User-space flags come first and are taken directly from
         * enum ib_uverbs_flow_action_esp_flags.  Sharing one set of flag
         * values between user-space and the kernel spares an unnecessary
         * translation step.
         */

        /* Kernel-only flags */
        IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED = 1ULL << 32,
        IB_FLOW_ACTION_ESP_FLAGS_MOD_ESP_ATTRS = 1ULL << 33,
};

/* Singly linked list node holding one union ib_flow_spec by value. */
struct ib_flow_spec_list {
        struct ib_flow_spec_list *next;
        union ib_flow_spec spec;
};

/* Attributes describing an ESP flow action. */
struct ib_flow_action_attrs_esp {
        struct ib_flow_action_attrs_esp_keymats *keymat;
        struct ib_flow_action_attrs_esp_replays *replay;
        struct ib_flow_spec_list *encap;
        /*
         * Only meaningful when IB_FLOW_ACTION_ESP_FLAGS_ESN_TRIGGERED is
         * set; 0 is a valid value.
         */
        u32 esn;
        u32 spi;
        u32 seq;
        u32 tfc_pad;
        /* Bits from enum ib_flow_action_attrs_esp_flags */
        u64 flags;
        u64 hard_limit_pkts;
};

/* A flow action object, with its owning device/uobject and a use count. */
struct ib_flow_action {
        struct ib_device *device;
        struct ib_uobject *uobject;
        enum ib_flow_action_type type;
        atomic_t usecnt;
};

struct ib_mad;

/* Flag bits for MAD processing; IGNORE_ALL is the OR of the other two. */
enum ib_process_mad_flags {
        IB_MAD_IGNORE_MKEY = 1 << 0,
        IB_MAD_IGNORE_BKEY = 1 << 1,
        IB_MAD_IGNORE_ALL = IB_MAD_IGNORE_MKEY | IB_MAD_IGNORE_BKEY,
};

/* Result bits for MAD processing. */
enum ib_mad_result {
        /* No bits set (!SUCCESS is the important flag) */
        IB_MAD_RESULT_FAILURE = 0x0,
        /* MAD was successfully processed */
        IB_MAD_RESULT_SUCCESS = 0x1,
        /* Reply packet needs to be sent */
        IB_MAD_RESULT_REPLY = 0x2,
        /* Packet consumed: stop processing */
        IB_MAD_RESULT_CONSUMED = 0x4,
};

/* Cached per-port attributes (embedded in struct ib_port_data). */
struct ib_port_cache {
        u64 subnet_prefix;
        struct ib_pkey_cache *pkey;
        struct ib_gid_table *gid;
        u8 lmc;
        enum ib_port_state port_state;
        enum ib_port_state last_port_state;
};

/*
 * Per-port data filled in through ib_device_ops.get_port_immutable()
 * and embedded in struct ib_port_data.
 */
struct ib_port_immutable {
        int pkey_tbl_len;
        int gid_tbl_len;
        u32 core_cap_flags;
        u32 max_mad_size;
};

/*
 * Per-port state; struct ib_device.port_data is an array of these
 * indexed by port number.
 */
struct ib_port_data {
        struct ib_device *ib_dev; /* owning device */

        struct ib_port_immutable immutable;

        /* NOTE(review): lock names suggest what they guard - confirm */
        spinlock_t pkey_list_lock;
        spinlock_t netdev_lock;

        struct list_head pkey_list;

        struct ib_port_cache cache;

        struct net_device __rcu *netdev; /* RCU-annotated pointer */
        netdevice_tracker netdev_tracker;
        struct hlist_node ndev_hash_link;
        struct rdma_port_counter port_counter;
        struct ib_port *sysfs;
};

/* rdma netdev type: selects the protocol type */
enum rdma_netdev_t {
        RDMA_NETDEV_OPA_VNIC = 0,
        RDMA_NETDEV_IPOIB = 1,
};

/**
 * struct rdma_netdev - rdma netdev
 *
 * Used where interfacing with the network stack is required.
 */
struct rdma_netdev {
        void *clnt_priv;
        struct ib_device *hca;
        u32 port_num;
        int mtu;

        /*
         * A cleanup function must be specified.
         * FIXME: only OPA_VNIC uses this, and that usage should be
         * removed too.
         */
        void (*free_rdma_netdev)(struct net_device *netdev);

        /* Control functions */
        void (*set_id)(struct net_device *netdev, int id);

        /* Send a packet */
        int (*send)(struct net_device *dev, struct sk_buff *skb,
                    struct ib_ah *address, u32 dqpn);

        /* Multicast attach/detach */
        int (*attach_mcast)(struct net_device *dev, struct ib_device *hca,
                            union ib_gid *gid, u16 mlid,
                            int set_qkey, u32 qkey);
        int (*detach_mcast)(struct net_device *dev, struct ib_device *hca,
                            union ib_gid *gid, u16 mlid);

        /* Transmit timeout */
        void (*tx_timeout)(struct net_device *dev, unsigned int txqueue);
};

/*
 * Allocation parameters filled in by
 * ib_device_ops.rdma_netdev_get_params().
 */
struct rdma_netdev_alloc_params {
        size_t sizeof_priv;
        unsigned int txqs;
        unsigned int rxqs;
        void *param; /* same type as initialize_rdma_netdev()'s last argument */

        int (*initialize_rdma_netdev)(struct ib_device *device, u32 port_num,
                                      struct net_device *netdev, void *param);
};

/* Set of 64-bit atomic ODP event counters. */
struct ib_odp_counters {
        atomic64_t      faults;
        atomic64_t      faults_handled;
        atomic64_t      invalidations;
        atomic64_t      invalidations_handled;
        atomic64_t      prefetch;
};

/* A counters object with its owning device/uobject. */
struct ib_counters {
        struct ib_device *device;
        struct ib_uobject *uobject;
        atomic_t usecnt; /* number of objects attached */
};

/* Argument block passed to ib_device_ops.read_counters(). */
struct ib_counters_read_attr {
        u64 *counters_buff;
        u32 ncounters;
        u32 flags; /* use enum ib_read_counters_flags */
};

struct uverbs_attr_bundle;
struct iw_cm_id;
struct iw_cm_conn_param;

/*
 * Designated-initializer fragment that sets .size_<ib_struct> to
 * sizeof(struct drv_struct).
 *
 * The two BUILD_BUG_ON_ZERO() terms add nothing to the size when they
 * pass; they are there to require (presumably at build time - see the
 * BUILD_BUG_ON_ZERO definition) that @member sits at offset 0 of
 * struct drv_struct and has type struct ib_struct.
 */
#define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member)                      \
        .size_##ib_struct =                                                    \
                (sizeof(struct drv_struct) +                                   \
                 BUILD_BUG_ON_ZERO(offsetof(struct drv_struct, member)) +      \
                 BUILD_BUG_ON_ZERO(                                            \
                         !__same_type(((struct drv_struct *)NULL)->member,     \
                                      struct ib_struct)))

/*
 * Allocate a driver object for @ib_type via rdma_zalloc_obj(), using the
 * size stored in @ib_dev->ops.size_<ib_type>, and cast the result to
 * struct ib_type *.  The final rdma_zalloc_obj() argument is false here
 * (the _numa variant passes true).
 *
 * @ib_dev and @gfp are parenthesised so that arbitrary expressions
 * (e.g. "*devp" or "devs + i") expand correctly.  @ib_dev is still
 * evaluated twice, so it must not have side effects.
 */
#define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp)                          \
        ((struct ib_type *)rdma_zalloc_obj((ib_dev),                           \
                                           (ib_dev)->ops.size_##ib_type,       \
                                           (gfp), false))

/*
 * Same as rdma_zalloc_drv_obj_gfp() with GFP_KERNEL, but passes true as
 * the final rdma_zalloc_obj() argument (presumably requesting a
 * NUMA-aware allocation, per the macro name - confirm in
 * rdma_zalloc_obj()).
 *
 * @ib_dev is parenthesised so that arbitrary expressions expand
 * correctly; it is evaluated twice and must not have side effects.
 */
#define rdma_zalloc_drv_obj_numa(ib_dev, ib_type)                              \
        ((struct ib_type *)rdma_zalloc_obj((ib_dev),                           \
                                           (ib_dev)->ops.size_##ib_type,       \
                                           GFP_KERNEL, true))

/* Shorthand for rdma_zalloc_drv_obj_gfp() with GFP_KERNEL as the gfp flags. */
#define rdma_zalloc_drv_obj(ib_dev, ib_type)                                   \
        rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL)

/*
 * Declares the size_<ib_struct> member (a size_t) inside
 * struct ib_device_ops.  The member is set by INIT_RDMA_OBJ_SIZE() and
 * read by the rdma_zalloc_drv_obj*() macros.
 */
#define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct

/*
 * One user mmap entry.  start_pgoff is in units of pages:
 * rdma_user_mmap_get_offset() shifts it by PAGE_SHIFT to get bytes.
 */
struct rdma_user_mmap_entry {
        struct kref             ref;
        struct ib_ucontext      *ucontext;
        unsigned long           start_pgoff;
        size_t                  npages;
        bool                    driver_removed;
};

/* Return the offset (in bytes) the user should pass to libc's mmap() */
static inline u64
rdma_user_mmap_get_offset(const struct rdma_user_mmap_entry *entry)
{
        return (u64)entry->start_pgoff << PAGE_SHIFT;
}

/**
 * struct ib_device_ops - InfiniBand device operations
 * This structure defines all the InfiniBand device operations. Providers
 * need to define the operations they support; the others are left NULL.
 */
struct ib_device_ops {
        struct module *owner;
        enum rdma_driver_id driver_id;
        u32 uverbs_abi_ver;
        unsigned int uverbs_no_driver_id_binding:1;

        /*
         * NOTE: New drivers should not make use of device_group; instead, new
         * device parameters should be exposed via netlink commands. This
         * mechanism exists only for existing drivers.
         */
        const struct attribute_group *device_group;
        const struct attribute_group **port_groups;

        int (*post_send)(struct ib_qp *qp, const struct ib_send_wr *send_wr,
                         const struct ib_send_wr **bad_send_wr);
        int (*post_recv)(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
                         const struct ib_recv_wr **bad_recv_wr);
        void (*drain_rq)(struct ib_qp *qp);
        void (*drain_sq)(struct ib_qp *qp);
        int (*poll_cq)(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
        int (*peek_cq)(struct ib_cq *cq, int wc_cnt);
        int (*req_notify_cq)(struct ib_cq *cq, enum ib_cq_notify_flags flags);
        int (*post_srq_recv)(struct ib_srq *srq,
                             const struct ib_recv_wr *recv_wr,
                             const struct ib_recv_wr **bad_recv_wr);
        int (*process_mad)(struct ib_device *device, int process_mad_flags,
                           u32 port_num, const struct ib_wc *in_wc,
                           const struct ib_grh *in_grh,
                           const struct ib_mad *in_mad, struct ib_mad *out_mad,
                           size_t *out_mad_size, u16 *out_mad_pkey_index);
        int (*query_device)(struct ib_device *device,
                            struct ib_device_attr *device_attr,
                            struct ib_udata *udata);
        int (*modify_device)(struct ib_device *device, int device_modify_mask,
                             struct ib_device_modify *device_modify);
        void (*get_dev_fw_str)(struct ib_device *device, char *str);
        const struct cpumask *(*get_vector_affinity)(struct ib_device *ibdev,
                                                     int comp_vector);
        int (*query_port)(struct ib_device *device, u32 port_num,
                          struct ib_port_attr *port_attr);
        int (*modify_port)(struct ib_device *device, u32 port_num,
                           int port_modify_mask,
                           struct ib_port_modify *port_modify);
        /**
         * The following mandatory functions are used only at device
         * registration.  Keep functions such as these at the end of this
         * structure to avoid cache line misses when accessing struct ib_device
         * in fast paths.
         */
        int (*get_port_immutable)(struct ib_device *device, u32 port_num,
                                  struct ib_port_immutable *immutable);
        enum rdma_link_layer (*get_link_layer)(struct ib_device *device,
                                               u32 port_num);
        /**
         * When calling get_netdev, the HW vendor's driver should return the
         * net device of device @device at port @port_num or NULL if such
         * a net device doesn't exist. The vendor driver should call dev_hold
         * on this net device. The HW vendor's device driver must guarantee
         * that this function returns NULL before the net device has finished
         * NETDEV_UNREGISTER state.
         */
        struct net_device *(*get_netdev)(struct ib_device *device,
                                         u32 port_num);
        /**
         * rdma netdev operation
         *
         * A driver implementing alloc_rdma_netdev or rdma_netdev_get_params
         * must return -EOPNOTSUPP if it doesn't support the specified type.
         */
        struct net_device *(*alloc_rdma_netdev)(
                struct ib_device *device, u32 port_num, enum rdma_netdev_t type,
                const char *name, unsigned char name_assign_type,
                void (*setup)(struct net_device *));

        int (*rdma_netdev_get_params)(struct ib_device *device, u32 port_num,
                                      enum rdma_netdev_t type,
                                      struct rdma_netdev_alloc_params *params);
        /**
         * query_gid should return the GID value for @device when the
         * @port_num link layer is either IB or iWarp. It is a no-op if the
         * @port_num port is RoCE link layer.
         */
        int (*query_gid)(struct ib_device *device, u32 port_num, int index,
                         union ib_gid *gid);
        /**
         * When calling add_gid, the HW vendor's driver should add the gid
         * of device of port at gid index available at @attr. Meta-info of
         * that gid (for example, the network device related to this gid) is
         * available at @attr. @context allows the HW vendor driver to store
         * extra information together with a GID entry. The HW vendor driver may
         * allocate memory to contain this information and store it in @context
         * when a new GID entry is written to. Params are consistent until the
         * next call of add_gid or delete_gid. The function should return 0 on
         * success or error otherwise. The function could be called
         * concurrently for different ports. This function is only called when
         * roce_gid_table is used.
         */
        int (*add_gid)(const struct ib_gid_attr *attr, void **context);
        /**
         * When calling del_gid, the HW vendor's driver should delete the
         * gid of device @device at gid index gid_index of port port_num
         * available in @attr.
         * Upon the deletion of a GID entry, the HW vendor must free any
         * allocated memory. The caller will clear @context afterwards.
         * This function is only called when roce_gid_table is used.
         */
        int (*del_gid)(const struct ib_gid_attr *attr, void **context);
        int (*query_pkey)(struct ib_device *device, u32 port_num, u16 index,
                          u16 *pkey);
        int (*alloc_ucontext)(struct ib_ucontext *context,
                              struct ib_udata *udata);
        void (*dealloc_ucontext)(struct ib_ucontext *context);
        int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
        /**
         * This will be called once the refcount of an entry in mmap_xa
         * reaches zero. The type of the memory that was mapped may differ
         * between entries and is opaque to the rdma_user_mmap interface.
         * Freeing it therefore has to be implemented by the driver in
         * mmap_free.
         */
        void (*mmap_free)(struct rdma_user_mmap_entry *entry);
        void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
        int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
        int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
        int (*create_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr,
                         struct ib_udata *udata);
        int (*create_user_ah)(struct ib_ah *ah, struct rdma_ah_init_attr *attr,
                              struct ib_udata *udata);
        int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
        int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
        int (*destroy_ah)(struct ib_ah *ah, u32 flags);
        int (*create_srq)(struct ib_srq *srq,
                          struct ib_srq_init_attr *srq_init_attr,
                          struct ib_udata *udata);
        int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
                          enum ib_srq_attr_mask srq_attr_mask,
                          struct ib_udata *udata);
        int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
        int (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata);
        int (*create_qp)(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr,
                         struct ib_udata *udata);
        int (*modify_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                         int qp_attr_mask, struct ib_udata *udata);
        int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                        int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
        int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
        int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr,
                         struct uverbs_attr_bundle *attrs);
        int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
        int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
        int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
        struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
        struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
                                     u64 virt_addr, int mr_access_flags,
                                     struct ib_udata *udata);
        struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset,
                                            u64 length, u64 virt_addr, int fd,
                                            int mr_access_flags,
                                            struct uverbs_attr_bundle *attrs);
        struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start,
                                       u64 length, u64 virt_addr,
                                       int mr_access_flags, struct ib_pd *pd,
                                       struct ib_udata *udata);
        int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
        struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
                                  u32 max_num_sg);
        struct ib_mr *(*alloc_mr_integrity)(struct ib_pd *pd,
                                            u32 max_num_data_sg,
                                            u32 max_num_meta_sg);
        int (*advise_mr)(struct ib_pd *pd,
                         enum ib_uverbs_advise_mr_advice advice, u32 flags,
                         struct ib_sge *sg_list, u32 num_sge,
                         struct uverbs_attr_bundle *attrs);

        /*
         * Kernel users should universally support relaxed ordering (RO), as
         * they are designed to read data only after observing the CQE and use
         * the DMA API correctly.
         *
         * Some drivers implicitly enable RO if the platform supports it.
         */
        int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
                         unsigned int *sg_offset);
        int (*check_mr_status)(struct ib_mr *mr, u32 check_mask,
                               struct ib_mr_status *mr_status);
        int (*alloc_mw)(struct ib_mw *mw, struct ib_udata *udata);
        int (*dealloc_mw)(struct ib_mw *mw);
        int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
        int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
        int (*alloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
        int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
        struct ib_flow *(*create_flow)(struct ib_qp *qp,
                                       struct ib_flow_attr *flow_attr,
                                       struct ib_udata *udata);
        int (*destroy_flow)(struct ib_flow *flow_id);
        int (*destroy_flow_action)(struct ib_flow_action *action);
        int (*set_vf_link_state)(struct ib_device *device, int vf, u32 port,
                                 int state);
        int (*get_vf_config)(struct ib_device *device, int vf, u32 port,
                             struct ifla_vf_info *ivf);
        int (*get_vf_stats)(struct ib_device *device, int vf, u32 port,
                            struct ifla_vf_stats *stats);
        int (*get_vf_guid)(struct ib_device *device, int vf, u32 port,
                            struct ifla_vf_guid *node_guid,
                            struct ifla_vf_guid *port_guid);
        int (*set_vf_guid)(struct ib_device *device, int vf, u32 port, u64 guid,
                           int type);
        struct ib_wq *(*create_wq)(struct ib_pd *pd,
                                   struct ib_wq_init_attr *init_attr,
                                   struct ib_udata *udata);
        int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
        int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr,
                         u32 wq_attr_mask, struct ib_udata *udata);
        int (*create_rwq_ind_table)(struct ib_rwq_ind_table *ib_rwq_ind_table,
                                    struct ib_rwq_ind_table_init_attr *init_attr,
                                    struct ib_udata *udata);
        int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table);
        struct ib_dm *(*alloc_dm)(struct ib_device *device,
                                  struct ib_ucontext *context,
                                  struct ib_dm_alloc_attr *attr,
                                  struct uverbs_attr_bundle *attrs);
        int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs);
        struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm,
                                   struct ib_dm_mr_attr *attr,
                                   struct uverbs_attr_bundle *attrs);
        int (*create_counters)(struct ib_counters *counters,
                               struct uverbs_attr_bundle *attrs);
        int (*destroy_counters)(struct ib_counters *counters);
        int (*read_counters)(struct ib_counters *counters,
                             struct ib_counters_read_attr *counters_read_attr,
                             struct uverbs_attr_bundle *attrs);
        int (*map_mr_sg_pi)(struct ib_mr *mr, struct scatterlist *data_sg,
                            int data_sg_nents, unsigned int *data_sg_offset,
                            struct scatterlist *meta_sg, int meta_sg_nents,
                            unsigned int *meta_sg_offset);

        /**
         * alloc_hw_[device,port]_stats - Allocate a struct rdma_hw_stats and
         *   fill in the driver initialized data.  The struct is kfree()'ed by
         *   the sysfs core when the device is removed.  A lifespan of -1 in the
         *   return struct tells the core to set a default lifespan.
         */
        struct rdma_hw_stats *(*alloc_hw_device_stats)(struct ib_device *device);
        struct rdma_hw_stats *(*alloc_hw_port_stats)(struct ib_device *device,
                                                     u32 port_num);
        /**
         * get_hw_stats - Fill in the counter value(s) in the stats struct.
         * @index - The index in the value array we wish to have updated, or
         *   num_counters if we want all stats updated
         * Return codes -
         *   < 0 - Error, no counters updated
         *   index - Updated the single counter pointed to by index
         *   num_counters - Updated all counters (will reset the timestamp
         *     and prevent further calls for lifespan milliseconds)
         * Drivers are allowed to update all counters in lieu of just the
         *   one given in index at their option
         */
        int (*get_hw_stats)(struct ib_device *device,
                            struct rdma_hw_stats *stats, u32 port, int index);

        /**
         * modify_hw_stat - Modify the counter configuration
         * @enable: true/false when enable/disable a counter
         * Return codes - 0 on success or error code otherwise.
         */
        int (*modify_hw_stat)(struct ib_device *device, u32 port,
                              unsigned int counter_index, bool enable);
        /**
         * Allows rdma drivers to add their own restrack attributes.
         */
        int (*fill_res_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);
        int (*fill_res_mr_entry_raw)(struct sk_buff *msg, struct ib_mr *ibmr);
        int (*fill_res_cq_entry)(struct sk_buff *msg, struct ib_cq *ibcq);
        int (*fill_res_cq_entry_raw)(struct sk_buff *msg, struct ib_cq *ibcq);
        int (*fill_res_qp_entry)(struct sk_buff *msg, struct ib_qp *ibqp);
        int (*fill_res_qp_entry_raw)(struct sk_buff *msg, struct ib_qp *ibqp);
        int (*fill_res_cm_id_entry)(struct sk_buff *msg, struct rdma_cm_id *id);
        int (*fill_res_srq_entry)(struct sk_buff *msg, struct ib_srq *ib_srq);
        int (*fill_res_srq_entry_raw)(struct sk_buff *msg, struct ib_srq *ib_srq);

        /* Device lifecycle callbacks */
        /*
         * Called after the device becomes registered, before clients are
         * attached
         */
        int (*enable_driver)(struct ib_device *dev);
        /*
         * This is called as part of ib_dealloc_device().
         */
        void (*dealloc_driver)(struct ib_device *dev);

        /* iWarp CM callbacks */
        void (*iw_add_ref)(struct ib_qp *qp);
        void (*iw_rem_ref)(struct ib_qp *qp);
        struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn);
        int (*iw_connect)(struct iw_cm_id *cm_id,
                          struct iw_cm_conn_param *conn_param);
        int (*iw_accept)(struct iw_cm_id *cm_id,
                         struct iw_cm_conn_param *conn_param);
        int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata,
                         u8 pdata_len);
        int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
        int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
        /**
         * counter_bind_qp - Bind a QP to a counter.
         * @counter - The counter to be bound. If counter->id is zero then
         *   the driver needs to allocate a new counter and set counter->id
         */
        int (*counter_bind_qp)(struct rdma_counter *counter, struct ib_qp *qp,
                               u32 port);
        /**
         * counter_unbind_qp - Unbind the qp from the dynamically-allocated
         *   counter and bind it onto the default one
         */
        int (*counter_unbind_qp)(struct ib_qp *qp, u32 port);
        /**
         * counter_dealloc - De-allocate the hw counter
         */
        int (*counter_dealloc)(struct rdma_counter *counter);
        /**
         * counter_alloc_stats - Allocate a struct rdma_hw_stats and fill in
         * the driver initialized data.
         */
        struct rdma_hw_stats *(*counter_alloc_stats)(
                struct rdma_counter *counter);
        /**
         * counter_update_stats - Query the stats value of this counter
         */
        int (*counter_update_stats)(struct rdma_counter *counter);

        /**
         * counter_init - Initialize the driver specific rdma counter struct.
         */
        void (*counter_init)(struct rdma_counter *counter);

        /**
         * Allows rdma drivers to add their own restrack attributes
         * dumped via 'rdma stat' iproute2 command.
         */
        int (*fill_stat_mr_entry)(struct sk_buff *msg, struct ib_mr *ibmr);

        /* query driver for its ucontext properties */
        int (*query_ucontext)(struct ib_ucontext *context,
                              struct uverbs_attr_bundle *attrs);

        /*
         * Provide NUMA node. This API exists for rdmavt/hfi1 only.
         * Everyone else relies on Linux memory management model.
         */
        int (*get_numa_node)(struct ib_device *dev);

        /**
         * add_sub_dev - Add a sub IB device
         */
        struct ib_device *(*add_sub_dev)(struct ib_device *parent,
                                         enum rdma_nl_dev_type type,
                                         const char *name);

        /**
         * del_sub_dev - Delete a sub IB device
         */
        void (*del_sub_dev)(struct ib_device *sub_dev);

        /**
         * ufile_hw_cleanup - Attempt to clean up the HW resources of the
         * uobjects inside the ufile.
         */
        void (*ufile_hw_cleanup)(struct ib_uverbs_file *ufile);

        /**
         * report_port_event - Drivers need to implement this if they have
         * some private stuff to handle when link status changes.
         */
        void (*report_port_event)(struct ib_device *ibdev,
                                  struct net_device *ndev, unsigned long event);

        /*
         * Per-object-type allocation sizes (size_<type> members). They are
         * set with INIT_RDMA_OBJ_SIZE() and read by the
         * rdma_zalloc_drv_obj*() macros.
         */
        DECLARE_RDMA_OBJ_SIZE(ib_ah);
        DECLARE_RDMA_OBJ_SIZE(ib_counters);
        DECLARE_RDMA_OBJ_SIZE(ib_cq);
        DECLARE_RDMA_OBJ_SIZE(ib_mw);
        DECLARE_RDMA_OBJ_SIZE(ib_pd);
        DECLARE_RDMA_OBJ_SIZE(ib_qp);
        DECLARE_RDMA_OBJ_SIZE(ib_rwq_ind_table);
        DECLARE_RDMA_OBJ_SIZE(ib_srq);
        DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
        DECLARE_RDMA_OBJ_SIZE(ib_xrcd);
        DECLARE_RDMA_OBJ_SIZE(rdma_counter);
};

struct ib_core_device {
        /*
         * dev must stay the first member for as long as struct ib_device
         * keeps a union of struct device and struct ib_core_device.
         */
        struct device dev;
        possible_net_t rdma_net;
        struct kobject *ports_kobj;
        struct list_head port_list;
        /* Back-pointer to the owning ib_device */
        struct ib_device *owner;
};

struct rdma_restrack_root;
/*
 * struct ib_device - one RDMA device as seen by the IB core.
 *
 * Holds the driver operations table (@ops), per-port data (@port_data),
 * the embedded driver-model device (@dev / @coredev), client bookkeeping
 * and the sub-device hierarchy. The comments on the individual members
 * below state which lock protects what.
 */
struct ib_device {
        /* Do not access @dma_device directly from ULP nor from HW drivers. */
        struct device                *dma_device;
        /* Driver callbacks; rdma_zalloc_obj() consults ops.get_numa_node */
        struct ib_device_ops             ops;
        char                          name[IB_DEVICE_NAME_MAX];
        struct rcu_head rcu_head;

        struct list_head              event_handler_list;
        /* Protects event_handler_list */
        struct rw_semaphore event_handler_rwsem;

        /* Protects QP's event_handler calls and open_qp list */
        spinlock_t qp_open_list_lock;

        struct rw_semaphore              client_data_rwsem;
        /* Indexed by ib_client.client_id, see ib_get_client_data() */
        struct xarray                 client_data;
        struct mutex                  unregistration_lock;

        /* Synchronize GID, Pkey cache entries, subnet prefix, LMC */
        rwlock_t cache_lock;
        /**
         * port_data is indexed by port number
         */
        struct ib_port_data *port_data;

        int                              num_comp_vectors;

        /*
         * @dev and @coredev alias each other; struct ib_core_device keeps
         * its struct device as the first member to make this overlay valid.
         */
        union {
                struct device                dev;
                struct ib_core_device        coredev;
        };

        /* First group is for device attributes,
         * Second group is for driver provided attributes (optional).
         * Third group is for the hw_stats
         * It is a NULL terminated array.
         */
        const struct attribute_group        *groups[4];
        u8                                hw_stats_attr_index;

        u64                             uverbs_cmd_mask;

        char                             node_desc[IB_DEVICE_NODE_DESC_MAX];
        __be64                             node_guid;
        u32                             local_dma_lkey;
        /* Read by rdma_cap_ib_switch(); switches use port 0 only */
        u16                          is_switch:1;
        /* Indicates kernel verbs support, should not be used in drivers */
        u16                          kverbs_provider:1;
        /* CQ adaptive moderation (RDMA DIM) */
        u16                          use_cq_dim:1;
        u8                           node_type;
        /* Last valid port number for non-switch devices, see rdma_end_port() */
        u32                             phys_port_cnt;
        struct ib_device_attr        attrs;
        struct hw_stats_device_data *hw_stats_data;

#ifdef CONFIG_CGROUP_RDMA
        struct rdmacg_device         cg_device;
#endif

        u32                          index;

        /* NOTE(review): presumably protects cq_pools - confirm */
        spinlock_t                   cq_pools_lock;
        struct list_head             cq_pools[IB_POLL_LAST_POOL_TYPE + 1];

        struct rdma_restrack_root *res;

        const struct uapi_definition   *driver_def;

        /*
         * Positive refcount indicates that the device is currently
         * registered and cannot be unregistered.
         */
        refcount_t refcount;
        struct completion unreg_completion;
        struct work_struct unregistration_work;

        const struct rdma_link_ops *link_ops;

        /* Protects compat_devs xarray modifications */
        struct mutex compat_devs_mutex;
        /* Maintains compat devices for each net namespace */
        struct xarray compat_devs;

        /* Used by iWarp CM */
        char iw_ifname[IFNAMSIZ];
        u32 iw_driver_flags;
        u32 lag_flags;

        /* A parent device has a list of sub-devices */
        struct mutex subdev_lock;
        struct list_head subdev_list_head;

        /* A sub device has a type and a parent */
        enum rdma_nl_dev_type type;
        struct ib_device *parent;
        struct list_head subdev_list;

        enum rdma_nl_name_assign_type name_assign_type;
};

/*
 * Allocate a zeroed object of @size bytes on behalf of @dev.
 *
 * When the caller asks for NUMA awareness and the driver implements
 * ops.get_numa_node, the memory comes from the node the driver reports;
 * in every other case a plain kzalloc() is used.
 */
static inline void *rdma_zalloc_obj(struct ib_device *dev, size_t size,
                                    gfp_t gfp, bool is_numa_aware)
{
        /* No NUMA request or no driver hook: default allocation. */
        if (!is_numa_aware || !dev->ops.get_numa_node)
                return kzalloc(size, gfp);

        return kzalloc_node(size, gfp, dev->ops.get_numa_node(dev));
}

struct ib_client_nl_info;
/*
 * struct ib_client - a consumer of IB devices, registered with
 * ib_register_client() and removed with ib_unregister_client().
 *
 * The callbacks receive the ib_device and, where a client_data parameter is
 * present, the per-device context stored via ib_set_client_data().
 */
struct ib_client {
        const char *name;
        /* NOTE(review): presumably called for each device the client is attached to - confirm */
        int (*add)(struct ib_device *ibdev);
        /* After remove() returns, ib_get_client_data() must no longer be called */
        void (*remove)(struct ib_device *, void *client_data);
        void (*rename)(struct ib_device *dev, void *client_data);
        int (*get_nl_info)(struct ib_device *ibdev, void *client_data,
                           struct ib_client_nl_info *res);
        int (*get_global_nl_info)(struct ib_client_nl_info *res);

        /* Returns the net_dev belonging to this ib_client and matching the
         * given parameters.
         * @dev:         An RDMA device that the net_dev use for communication.
         * @port:         A physical port number on the RDMA device.
         * @pkey:         P_Key that the net_dev uses if applicable.
         * @gid:         A GID that the net_dev uses to communicate.
         * @addr:         An IP address the net_dev is configured with.
         * @client_data: The device's client data set by ib_set_client_data().
         *
         * An ib_client that implements a net_dev on top of RDMA devices
         * (such as IP over IB) should implement this callback, allowing the
         * rdma_cm module to find the right net_dev for a given request.
         *
         * The caller is responsible for calling dev_put on the returned
         * netdev. */
        struct net_device *(*get_net_dev_by_params)(
                        struct ib_device *dev,
                        u32 port,
                        u16 pkey,
                        const union ib_gid *gid,
                        const struct sockaddr *addr,
                        void *client_data);

        refcount_t uses;
        struct completion uses_zero;
        /* Index of this client in ib_device.client_data, see ib_get_client_data() */
        u32 client_id;

        /* kverbs are not required by the client */
        u8 no_kverbs_req:1;
};

/*
 * IB block DMA iterator
 *
 * Iterates the DMA-mapped SGL in contiguous memory blocks aligned
 * to a HW supported page size.
 *
 * Set up by __rdma_block_iter_start() and advanced by
 * __rdma_block_iter_next(); normally driven through rdma_for_each_block().
 * All members are internal state and should not be touched by users.
 */
struct ib_block_iter {
        /* internal states */
        struct scatterlist *__sg;        /* sg holding the current aligned block */
        dma_addr_t __dma_addr;                /* unaligned DMA address of this block */
        size_t __sg_numblocks;                /* ib_umem_num_dma_blocks() */
        unsigned int __sg_nents;        /* number of SG entries */
        unsigned int __sg_advance;        /* number of bytes to advance in sg in next step */
        unsigned int __pg_bit;                /* alignment of current block; block size is BIT_ULL(__pg_bit) */
};

struct ib_device *_ib_alloc_device(size_t size);
/*
 * ib_alloc_device - allocate a driver structure that embeds a struct ib_device
 * @drv_struct: tag of the driver's structure (used as "struct drv_struct")
 * @member: name of the struct ib_device member inside @drv_struct
 *
 * Calls _ib_alloc_device() with sizeof(struct drv_struct) and converts the
 * returned ib_device pointer back to the driver structure with
 * container_of(). The BUILD_BUG_ON_ZERO(offsetof(...)) term breaks the build
 * unless @member sits at offset 0 of @drv_struct.
 */
#define ib_alloc_device(drv_struct, member)                                    \
        container_of(_ib_alloc_device(sizeof(struct drv_struct) +              \
                                      BUILD_BUG_ON_ZERO(offsetof(              \
                                              struct drv_struct, member))),    \
                     struct drv_struct, member)

/*
 * Device and client life-cycle entry points. Only the prototypes are visible
 * here; see the implementations for the exact semantics.
 */
void ib_dealloc_device(struct ib_device *device);

/* NOTE(review): presumably fills @str with the firmware version string - confirm buffer size expectations */
void ib_get_device_fw_str(struct ib_device *device, char *str);

int ib_register_device(struct ib_device *device, const char *name,
                       struct device *dma_device);
void ib_unregister_device(struct ib_device *device);
void ib_unregister_driver(enum rdma_driver_id driver_id);
void ib_unregister_device_and_put(struct ib_device *device);
void ib_unregister_device_queued(struct ib_device *ib_dev);

int ib_register_client   (struct ib_client *client);
void ib_unregister_client(struct ib_client *client);

/*
 * Low-level block iterator primitives; rdma_for_each_block() is the intended
 * way to use them.
 */
void __rdma_block_iter_start(struct ib_block_iter *biter,
                             struct scatterlist *sglist,
                             unsigned int nents,
                             unsigned long pgsz);
bool __rdma_block_iter_next(struct ib_block_iter *biter);

/**
 * rdma_block_iter_dma_address - get the aligned dma address of the current
 * block held by the block iterator.
 * @biter: block iterator holding the memory block
 */
static inline dma_addr_t
rdma_block_iter_dma_address(struct ib_block_iter *biter)
{
        return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1);
}

/**
 * rdma_for_each_block - iterate over contiguous memory blocks of the sg list
 * @sglist: sglist to iterate over
 * @biter: block iterator holding the memory block
 * @nents: maximum number of sg entries to iterate over
 * @pgsz: best HW supported page size to use
 *
 * Expands to a for loop that calls __rdma_block_iter_start() once and then
 * runs the body for as long as __rdma_block_iter_next() returns true.
 *
 * Callers may use rdma_block_iter_dma_address() to get each
 * block's aligned DMA address.
 */
#define rdma_for_each_block(sglist, biter, nents, pgsz)                \
        for (__rdma_block_iter_start(biter, sglist, nents,        \
                                     pgsz);                        \
             __rdma_block_iter_next(biter);)

/**
 * ib_get_client_data - Look up a client's per-device context
 * @device: Device to get context for
 * @client: Client to get context for
 *
 * Returns the context previously stored with ib_set_client_data(). The
 * lookup is keyed by the client's client_id in the device's client_data
 * xarray. It is only valid while the client is registered to the device;
 * once the ib_client remove() callback has returned this must not be
 * called any more.
 */
static inline void *ib_get_client_data(struct ib_device *device,
                                       struct ib_client *client)
{
        struct xarray *ctx_store = &device->client_data;

        return xa_load(ctx_store, client->client_id);
}
/* Store the per-device context that ib_get_client_data() later returns. */
void  ib_set_client_data(struct ib_device *device, struct ib_client *client,
                         void *data);
/* NOTE(review): presumably copies the callbacks from @ops into device->ops - confirm */
void ib_set_device_ops(struct ib_device *device,
                       const struct ib_device_ops *ops);

/*
 * User mmap helpers. Only prototypes are visible here; see the
 * implementations for the exact semantics.
 */
int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
                      unsigned long pfn, unsigned long size, pgprot_t prot,
                      struct rdma_user_mmap_entry *entry);
int rdma_user_mmap_entry_insert(struct ib_ucontext *ucontext,
                                struct rdma_user_mmap_entry *entry,
                                size_t length);
/* rdma_user_mmap_entry_insert_exact() calls this with min_pgoff == max_pgoff */
int rdma_user_mmap_entry_insert_range(struct ib_ucontext *ucontext,
                                      struct rdma_user_mmap_entry *entry,
                                      size_t length, u32 min_pgoff,
                                      u32 max_pgoff);

/*
 * rdma_user_mmap_disassociate() is only provided when
 * CONFIG_INFINIBAND_USER_ACCESS is enabled; otherwise it compiles to an
 * empty inline stub so callers need no #ifdef of their own.
 */
#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
void rdma_user_mmap_disassociate(struct ib_device *device);
#else
static inline void rdma_user_mmap_disassociate(struct ib_device *device)
{
}
#endif

/*
 * Insert @entry at exactly page offset @pgoff.
 *
 * Thin wrapper around rdma_user_mmap_entry_insert_range() with a range that
 * contains the single offset @pgoff; the result of that call is returned
 * unchanged.
 */
static inline int
rdma_user_mmap_entry_insert_exact(struct ib_ucontext *ucontext,
                                  struct rdma_user_mmap_entry *entry,
                                  size_t length, u32 pgoff)
{
        const u32 first_pgoff = pgoff;
        const u32 last_pgoff = pgoff;

        return rdma_user_mmap_entry_insert_range(ucontext, entry, length,
                                                 first_pgoff, last_pgoff);
}

/*
 * Lookup and release of user mmap entries. Only prototypes are visible here.
 * NOTE(review): the get/put naming suggests reference counting - confirm
 * against the implementation.
 */
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get_pgoff(struct ib_ucontext *ucontext,
                               unsigned long pgoff);
struct rdma_user_mmap_entry *
rdma_user_mmap_entry_get(struct ib_ucontext *ucontext,
                         struct vm_area_struct *vma);
void rdma_user_mmap_entry_put(struct rdma_user_mmap_entry *entry);

void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry);

/*
 * Copy @len bytes from the user input buffer of @udata into @dest.
 *
 * Returns 0 on success, or -EFAULT if copy_from_user() reports that any
 * bytes could not be copied. @len is not checked against the size of the
 * user buffer here.
 */
static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
{
        if (copy_from_user(dest, udata->inbuf, len))
                return -EFAULT;

        return 0;
}

/*
 * Copy @len bytes from @src into the user output buffer of @udata.
 *
 * Returns 0 on success, or -EFAULT if copy_to_user() reports that any
 * bytes could not be copied. @len is not checked against the size of the
 * user buffer here.
 */
static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len)
{
        if (copy_to_user(udata->outbuf, src, len))
                return -EFAULT;

        return 0;
}

/*
 * Check whether a user-space buffer contains only zero bytes.
 *
 * @p:   user pointer to the start of the buffer
 * @len: number of bytes to examine
 *
 * The buffer is duplicated into kernel memory first, so @len is capped at
 * USHRT_MAX. Returns true only when every byte is zero; returns false when
 * the buffer is too long, cannot be copied, or holds a non-zero byte.
 */
static inline bool ib_is_buffer_cleared(const void __user *p,
                                        size_t len)
{
        u8 *kcopy;
        bool all_zero;

        /* Refuse oversized requests instead of duplicating them. */
        if (len > USHRT_MAX)
                return false;

        kcopy = memdup_user(p, len);
        if (IS_ERR(kcopy))
                return false;

        /* memchr_inv() returns NULL when no byte differs from 0. */
        all_zero = !memchr_inv(kcopy, 0, len);

        kfree(kcopy);
        return all_zero;
}

/*
 * Check whether @len bytes of the user input buffer of @udata, starting
 * @offset bytes in, are all zero. Delegates to ib_is_buffer_cleared() and
 * therefore shares its limits and failure behaviour.
 */
static inline bool ib_is_udata_cleared(struct ib_udata *udata,
                                       size_t offset,
                                       size_t len)
{
        const void __user *region = udata->inbuf + offset;

        return ib_is_buffer_cleared(region, len);
}

/**
 * ib_modify_qp_is_ok - Check that the supplied attribute mask
 * contains all required attributes and no attributes not allowed for
 * the given QP state transition.
 * @cur_state: Current QP state
 * @next_state: Next QP state
 * @type: QP type
 * @mask: Mask of supplied QP attributes
 *
 * This function is a helper function that a low-level driver's
 * modify_qp method can use to validate the consumer's input.  It
 * checks that cur_state and next_state are valid QP states, that a
 * transition from cur_state to next_state is allowed by the IB spec,
 * and that the attribute mask supplied is allowed for the transition.
 *
 * Return: true if the transition and mask pass those checks.
 */
bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
                        enum ib_qp_type type, enum ib_qp_attr_mask mask);

/*
 * Event handler registration and dispatch. Only prototypes are visible here.
 * NOTE(review): handlers presumably live on ib_device.event_handler_list -
 * confirm against the implementation.
 */
void ib_register_event_handler(struct ib_event_handler *event_handler);
void ib_unregister_event_handler(struct ib_event_handler *event_handler);
void ib_dispatch_event(const struct ib_event *event);

/* Query the attributes of port @port_num of @device into @port_attr. */
int ib_query_port(struct ib_device *device,
                  u32 port_num, struct ib_port_attr *port_attr);

/* Report the link layer of port @port_num of @device. */
enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device,
                                               u32 port_num);

/**
 * rdma_cap_ib_switch - Tell whether a device is an IB switch
 * @device: Device to check
 *
 * The answer comes from the is_switch bit of struct ib_device, which the
 * device driver is responsible for setting at init time.
 *
 * Return: true if the device is IB switch.
 */
static inline bool rdma_cap_ib_switch(const struct ib_device *device)
{
        return device->is_switch != 0;
}

/**
 * rdma_start_port - First valid port number of a device
 * @device: Device to be checked
 *
 * Return: 0 for an IB switch, 1 for every other device.
 */
static inline u32 rdma_start_port(const struct ib_device *device)
{
        if (rdma_cap_ib_switch(device))
                return 0;

        return 1;
}

/**
 * rdma_for_each_port - Iterate over all valid port numbers of the IB device
 * @device: The struct ib_device * to iterate over
 * @iter: The u32 variable to store the port number
 *
 * Runs from rdma_start_port(@device) to rdma_end_port(@device) inclusive.
 * The BUILD_BUG_ON_ZERO(!__same_type(u32, iter)) term breaks the build when
 * @iter is not exactly a u32.
 */
#define rdma_for_each_port(device, iter)                                       \
        for (iter = rdma_start_port(device +                                       \
                                    BUILD_BUG_ON_ZERO(!__same_type(u32,               \
                                                                   iter)));    \
             iter <= rdma_end_port(device); iter++)

/**
 * rdma_end_port - Last valid port number of a device
 * @device: Device to be checked
 *
 * Return: 0 for an IB switch, otherwise the device's phys_port_cnt.
 */
static inline u32 rdma_end_port(const struct ib_device *device)
{
        if (rdma_cap_ib_switch(device))
                return 0;

        return device->phys_port_cnt;
}

/*
 * Check that @port lies within the device's valid port range.
 *
 * Returns 1 when rdma_start_port(device) <= @port <= rdma_end_port(device),
 * 0 otherwise.
 */
static inline int rdma_is_port_valid(const struct ib_device *device,
                                     unsigned int port)
{
        if (port < rdma_start_port(device))
                return 0;

        return port <= rdma_end_port(device);
}

/*
 * Return true when RDMA_CORE_PORT_IB_GRH_REQUIRED is set in the immutable
 * core capability flags of port @port_num. @port_num is used as an index
 * into port_data without range checking.
 */
static inline bool rdma_is_grh_required(const struct ib_device *device,
                                        u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_PORT_IB_GRH_REQUIRED) != 0;
}

/*
 * Return true when RDMA_CORE_CAP_PROT_IB is set in the immutable core
 * capability flags of port @port_num. @port_num is used as an index into
 * port_data without range checking.
 */
static inline bool rdma_protocol_ib(const struct ib_device *device,
                                    u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_PROT_IB) != 0;
}

/*
 * Return true when port @port_num has either RoCE protocol flag set:
 * RDMA_CORE_CAP_PROT_ROCE or RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP.
 * @port_num is used as an index into port_data without range checking.
 */
static inline bool rdma_protocol_roce(const struct ib_device *device,
                                      u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                (RDMA_CORE_CAP_PROT_ROCE |
                 RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP)) != 0;
}

/*
 * Return true when RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP is set in the immutable
 * core capability flags of port @port_num. @port_num is used as an index
 * into port_data without range checking.
 */
static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device,
                                                u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP) != 0;
}

/*
 * Return true when RDMA_CORE_CAP_PROT_ROCE is set in the immutable core
 * capability flags of port @port_num. @port_num is used as an index into
 * port_data without range checking.
 */
static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device,
                                                u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_PROT_ROCE) != 0;
}

/*
 * Return true when RDMA_CORE_CAP_PROT_IWARP is set in the immutable core
 * capability flags of port @port_num. @port_num is used as an index into
 * port_data without range checking.
 */
static inline bool rdma_protocol_iwarp(const struct ib_device *device,
                                       u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_PROT_IWARP) != 0;
}

/*
 * Return true when port @port_num speaks InfiniBand or any RoCE flavour,
 * as reported by rdma_protocol_ib() and rdma_protocol_roce().
 */
static inline bool rdma_ib_or_roce(const struct ib_device *device,
                                   u32 port_num)
{
        if (rdma_protocol_ib(device, port_num))
                return true;

        return rdma_protocol_roce(device, port_num);
}

/*
 * Return true when RDMA_CORE_CAP_PROT_RAW_PACKET is set in the immutable
 * core capability flags of port @port_num. @port_num is used as an index
 * into port_data without range checking.
 */
static inline bool rdma_protocol_raw_packet(const struct ib_device *device,
                                            u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_PROT_RAW_PACKET) != 0;
}

/*
 * Return true when RDMA_CORE_CAP_PROT_USNIC is set in the immutable core
 * capability flags of port @port_num. @port_num is used as an index into
 * port_data without range checking.
 */
static inline bool rdma_protocol_usnic(const struct ib_device *device,
                                       u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_PROT_USNIC) != 0;
}

/**
 * rdma_cap_ib_mad - Does the port support InfiniBand Management Datagrams?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * Management Datagrams (MAD) are a mandatory part of the InfiniBand
 * specification, so every InfiniBand device supports them.  OPA interfaces
 * support a slightly extended version as well.
 *
 * Return: true if the port supports sending/receiving of MAD packets, i.e.
 * RDMA_CORE_CAP_IB_MAD is set in the port's immutable core_cap_flags.
 */
static inline bool rdma_cap_ib_mad(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_IB_MAD) != 0;
}

/**
 * rdma_cap_opa_mad - Check if the port of device provides support for OPA
 * Management Datagrams.
 * @device: Device to check (only read, hence const like the other
 *          rdma_cap_*() helpers)
 * @port_num: Port number to check; used as an index into port_data without
 *            range checking
 *
 * Intel OmniPath devices extend and/or replace the InfiniBand Management
 * datagrams with their own versions.  These OPA MADs share many but not all of
 * the characteristics of InfiniBand MADs.
 *
 * OPA MADs differ in the following ways:
 *
 *    1) MADs are variable size up to 2K
 *       IBTA defined MADs remain fixed at 256 bytes
 *    2) OPA SMPs must carry valid PKeys
 *    3) OPA SMP packets are a different format
 *
 * Return: true if the port supports OPA MAD packet formats, i.e.
 * RDMA_CORE_CAP_OPA_MAD is set in the port's immutable core_cap_flags.
 */
static inline bool rdma_cap_opa_mad(const struct ib_device *device,
                                    u32 port_num)
{
        return device->port_data[port_num].immutable.core_cap_flags &
                RDMA_CORE_CAP_OPA_MAD;
}

/**
 * rdma_cap_ib_smi - Does the port provide an InfiniBand Subnet Management
 * Agent (SMA) on the Subnet Management Interface (SMI)?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * Every InfiniBand node must provide a Subnet Management Agent that the
 * subnet manager (SM) can reach.  Before the SM has fully configured the
 * fabric, the SMA is reached through a well known interface, the Subnet
 * Management Interface (SMI).  The SMI uses directed route packets, which
 * solves the chicken and egg problem of the SM having to learn what is on
 * the fabric before it can configure it, while needing a configured fabric
 * to send packets to the devices on it: directed route packets reach their
 * destination without the fabric being fully configured.  The SMI is the
 * only method allowed to send directed route packets on an InfiniBand
 * fabric.
 *
 * Return: true if the port provides an SMI, i.e. RDMA_CORE_CAP_IB_SMI is set
 * in the port's immutable core_cap_flags.
 */
static inline bool rdma_cap_ib_smi(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_IB_SMI) != 0;
}

/**
 * rdma_cap_ib_cm - Does the port have the InfiniBand Communication Manager
 * capability?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * The InfiniBand Communication Manager is one of many pre-defined General
 * Service Agents (GSA) reached through the General Service Interface (GSI).
 * Its role is to help establish connections between nodes and to handle
 * other management tasks for established connections.
 *
 * Return: true if the port supports an IB CM (RDMA_CORE_CAP_IB_CM is set in
 * the port's immutable core_cap_flags).  This does not guarantee that a CM
 * is actually running.
 */
static inline bool rdma_cap_ib_cm(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_IB_CM) != 0;
}

/**
 * rdma_cap_iw_cm - Does the port have the iWARP Communication Manager
 * capability?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * Counterpart of rdma_cap_ib_cm() for iWARP connections, which use a
 * different management protocol than InfiniBand.
 *
 * Return: true if the port supports an iWARP CM (RDMA_CORE_CAP_IW_CM is set
 * in the port's immutable core_cap_flags).  This does not guarantee that a
 * CM is actually running.
 */
static inline bool rdma_cap_iw_cm(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_IW_CM) != 0;
}

/**
 * rdma_cap_ib_sa - Does the port have the InfiniBand Subnet Administration
 * capability?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * An InfiniBand Subnet Administration (SA) service is a pre-defined General
 * Service Agent (GSA) provided by the Subnet Manager (SM).  On InfiniBand
 * fabrics, devices should resolve routes to other hosts by querying the SA
 * for the proper route.
 *
 * Return: true if the port should act as a client to the fabric Subnet
 * Administration interface (RDMA_CORE_CAP_IB_SA is set in the port's
 * immutable core_cap_flags).  This does not imply that the SA service is
 * running locally.
 */
static inline bool rdma_cap_ib_sa(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_IB_SA) != 0;
}

/**
 * rdma_cap_ib_mcast - Does the port have the InfiniBand Multicast
 * capability?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * InfiniBand multicast registration is more involved than ordinary IPv4 or
 * IPv6 multicast registration.  A Host Channel Adapter has to register with
 * the Subnet Manager when it wants to join a multicast group, and it should
 * do so only once no matter how many queue pairs it subscribes to the
 * group.  It should leave the group only after all queue pairs attached to
 * the group have been detached.
 *
 * The answer is taken directly from rdma_cap_ib_sa().
 *
 * Return: true if the port must undertake the additional administrative
 * overhead of registering/unregistering with the SM and tracking of the
 * total number of queue pairs attached to the multicast group.
 */
static inline bool rdma_cap_ib_mcast(const struct ib_device *device,
                                     u32 port_num)
{
        bool has_sa = rdma_cap_ib_sa(device, port_num);

        return has_sa;
}

/**
 * rdma_cap_af_ib - Does the port have the Native InfiniBand Address
 * capability?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * InfiniBand addressing builds a default GID from a port's GUID and the
 * Subnet Prefix.  RoCE uses a different mechanism, but it too derives a GID
 * through a prescribed mechanism and port specific data.
 *
 * Return: true if the port uses a GID address to identify devices on the
 * network (RDMA_CORE_CAP_AF_IB is set in the port's immutable
 * core_cap_flags).
 */
static inline bool rdma_cap_af_ib(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_AF_IB) != 0;
}

/**
 * rdma_cap_eth_ah - Does the port have the Ethernet Address Handle
 * capability?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * RoCE is InfiniBand over Ethernet and uses a well defined technique to
 * fabricate GIDs from the Ethernet/IP addresses native to the port.
 * Packet headers are normally generated by the sending host adapter, but
 * for connectionless datagrams the proper headers for the fabric in use
 * have to be injected manually.
 *
 * Return: true if we are running as a RoCE port and must force the
 * addition of a Global Route Header built from our Ethernet Address
 * Handle into our header list for connectionless packets
 * (RDMA_CORE_CAP_ETH_AH is set in the port's immutable core_cap_flags).
 */
static inline bool rdma_cap_eth_ah(const struct ib_device *device, u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return (pdata->immutable.core_cap_flags &
                RDMA_CORE_CAP_ETH_AH) != 0;
}

/**
 * rdma_cap_opa_ah - Check if the port of device supports
 * OPA Address handles
 * @device: Device to check (only read, hence const like the other
 *          rdma_cap_*() helpers)
 * @port_num: Port number to check; used as an index into port_data without
 *            range checking
 *
 * Return: true if we are running on an OPA device which supports
 * the extended OPA addressing, i.e. every bit of RDMA_CORE_CAP_OPA_AH is
 * set in the port's immutable core_cap_flags.
 */
static inline bool rdma_cap_opa_ah(const struct ib_device *device,
                                   u32 port_num)
{
        /* Compare against the full mask so that all of its bits are required. */
        return (device->port_data[port_num].immutable.core_cap_flags &
                RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH;
}

/**
 * rdma_max_mad_size - Maximum MAD size required by an RDMA port
 * @device: Device
 * @port_num: Port number
 *
 * The size covers the MAD headers and the MAD payload and nothing else.
 * The value is read from the port's immutable data.
 *
 * Return: the max MAD size required by the port.  Will return 0 if the port
 * does not support MADs.
 */
static inline size_t rdma_max_mad_size(const struct ib_device *device,
                                       u32 port_num)
{
        const struct ib_port_data *pdata = &device->port_data[port_num];

        return pdata->immutable.max_mad_size;
}

/**
 * rdma_cap_roce_gid_table - Does the port use the RoCE GID table?
 * @device: Device to check
 * @port_num: Port number to check
 *
 * The RoCE GID table mechanism manages the various GIDs of a device.  A
 * port qualifies when it speaks RoCE and the driver provides both the
 * add_gid and del_gid operations.
 *
 * NOTE: if allocating the port's GID table has failed, this call will still
 * return true, but any RoCE GID table API will fail.
 *
 * Return: true if the port uses RoCE GID table mechanism in order to manage
 * its GIDs.
 */
static inline bool rdma_cap_roce_gid_table(const struct ib_device *device,
                                           u32 port_num)
{
        if (!rdma_protocol_roce(device, port_num))
                return false;

        return device->ops.add_gid && device->ops.del_gid;
}

/*
 * Check if the device supports READ W/ INVALIDATE.
 *
 * @dev: device to check (only read, hence const like the other rdma_cap_*()
 *       helpers)
 * @port_num: port number to check
 *
 * Returns exactly what rdma_protocol_iwarp() reports for the port.
 */
static inline bool rdma_cap_read_inv(const struct ib_device *dev, u32 port_num)
{
        /*
         * iWarp drivers must support READ W/ INVALIDATE.  No other protocol
         * has support for it yet.
         */
        return rdma_protocol_iwarp(dev, port_num);
}

/**
 * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not.
 * @device: Device (only read, hence const like the other capability
 *          helpers)
 * @port_num: 1 based Port number; used as an index into port_data without
 *            range checking
 *
 * Return: true if port is an Intel OPA port, false if not.  The port counts
 * as OPA only when every bit of RDMA_CORE_PORT_INTEL_OPA is set in its
 * immutable core_cap_flags.
 */
static inline bool rdma_core_cap_opa_port(const struct ib_device *device,
                                          u32 port_num)
{
        /* Compare against the full mask so that all of its bits are required. */
        return (device->port_data[port_num].immutable.core_cap_flags &
                RDMA_CORE_PORT_INTEL_OPA) == RDMA_CORE_PORT_INTEL_OPA;
}

/**
 * rdma_mtu_enum_to_int - Convert an MTU enum value of a port to an integer.
 * @device: Device
 * @port: Port number
 * @mtu: enum value of MTU
 *
 * OPA ports are converted with opa_mtu_enum_to_int(), all other ports with
 * ib_mtu_enum_to_int().
 *
 * Return: the MTU size supported by the port as an integer value. Will
 * return -1 if enum value of mtu is not supported.
 */
static inline int rdma_mtu_enum_to_int(struct ib_device *device, u32 port,
                                       int mtu)
{
        if (!rdma_core_cap_opa_port(device, port))
                return ib_mtu_enum_to_int((enum ib_mtu)mtu);

        return opa_mtu_enum_to_int((enum opa_mtu)mtu);
}

/**
 * rdma_mtu_from_attr - Read the MTU of a port from its port attribute.
 * @device: Device
 * @port: Port number
 * @attr: port attribute
 *
 * OPA ports report attr->phys_mtu directly; for all other ports
 * attr->max_mtu is converted with ib_mtu_enum_to_int().
 *
 * Return: the MTU size supported by the port as an integer value.
 */
static inline int rdma_mtu_from_attr(struct ib_device *device, u32 port,
                                     struct ib_port_attr *attr)
{
        if (!rdma_core_cap_opa_port(device, port))
                return ib_mtu_enum_to_int(attr->max_mtu);

        return attr->phys_mtu;
}

/*
 * Virtual function (VF) configuration accessors, addressed by device, VF
 * index and port. Only prototypes are visible here; see the implementations
 * for the exact semantics and error codes.
 */
int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port,
                         int state);
int ib_get_vf_config(struct ib_device *device, int vf, u32 port,
                     struct ifla_vf_info *info);
int ib_get_vf_stats(struct ib_device *device, int vf, u32 port,
                    struct ifla_vf_stats *stats);
int ib_get_vf_guid(struct ib_device *device, int vf, u32 port,
                    struct ifla_vf_guid *node_guid,
                    struct ifla_vf_guid *port_guid);
int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid,
                   int type);

/* Read the P_Key at @index of port @port_num into @pkey. */
int ib_query_pkey(struct ib_device *device,
                  u32 port_num, u16 index, u16 *pkey);

/* Apply the fields selected by @device_modify_mask from @device_modify. */
int ib_modify_device(struct ib_device *device,
                     int device_modify_mask,
                     struct ib_device_modify *device_modify);

/* Apply the fields selected by @port_modify_mask from @port_modify to @port_num. */
int ib_modify_port(struct ib_device *device,
                   u32 port_num, int port_modify_mask,
                   struct ib_port_modify *port_modify);

/* Search for @gid; @port_num and @index are output parameters. */
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
                u32 *port_num, u16 *index);

/* Search port @port_num for @pkey; @index is an output parameter. */
int ib_find_pkey(struct ib_device *device,
                 u32 port_num, u16 pkey, u16 *index);

/* Flags accepted by ib_alloc_pd() / __ib_alloc_pd(). */
enum ib_pd_flags {
        /*
         * Create a memory registration for all memory in the system and place
         * the rkey for it into pd->unsafe_global_rkey.  This can be used by
         * ULPs to avoid the overhead of dynamic MRs.
         *
         * This flag is generally considered unsafe and must only be used in
         * extremely trusted environments.  Every use of it will log a warning
         * in the kernel log.
         */
        IB_PD_UNSAFE_GLOBAL_RKEY        = 0x01,
};

/*
 * Backend of ib_alloc_pd(); @caller is the name of the requesting module,
 * which the ib_alloc_pd() macro fills in with KBUILD_MODNAME.
 */
struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
                const char *caller);

/**
 * ib_alloc_pd - Allocates an unused protection domain.
 * @device: The device on which to allocate the protection domain.
 * @flags: protection domain flags (see enum ib_pd_flags)
 *
 * A protection domain object provides an association between QPs, shared
 * receive queues, address handles, memory regions, and memory windows.
 *
 * Every PD has a local_dma_lkey which can be used as the lkey value for local
 * memory operations.
 */
#define ib_alloc_pd(device, flags) \
        __ib_alloc_pd((device), (flags), KBUILD_MODNAME)

/*
 * Deallocate a PD; ib_dealloc_pd() calls this with a NULL @udata for kernel
 * PDs.
 */
int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata);

/**
 * ib_dealloc_pd - Deallocate kernel PD
 * @pd: The protection domain
 *
 * Calls ib_dealloc_pd_user() without udata.  A failure is not propagated to
 * the caller; it only triggers a one-time kernel warning.
 *
 * NOTE: for user PD use ib_dealloc_pd_user with valid udata!
 */
static inline void ib_dealloc_pd(struct ib_pd *pd)
{
        int ret;

        ret = ib_dealloc_pd_user(pd, NULL);
        WARN_ONCE(ret, "Destroy of kernel PD shouldn't fail");
}

/* Flags for the @flags argument of rdma_create_ah(). */
enum rdma_create_ah_flags {
        /* In a sleepable context */
        RDMA_CREATE_AH_SLEEPABLE = BIT(0),
};

/**
 * rdma_create_ah - Creates an address handle for the given address vector.
 * @pd: The protection domain associated with the address handle.
 * @ah_attr: The attributes of the address vector.
 * @flags: Create address handle flags (see enum rdma_create_ah_flags).
 *
 * The address handle is used to reference a local or global destination
 * in all UD QP post sends.
 */
struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
                             u32 flags);

/**
 * rdma_create_user_ah - Creates an address handle for the given address vector.
 * It resolves destination mac address for ah attribute of RoCE type.
 * @pd: The protection domain associated with the address handle.
 * @ah_attr: The attributes of the address vector.
 * @udata: pointer to user's input output buffer information need by
 *         provider driver.
 *
 * It returns 0 on success and returns appropriate error code on error.
 * The address handle is used to reference a local or global destination
 * in all UD QP post sends.
 */
struct ib_ah *rdma_create_user_ah(struct ib_pd *pd,
                                  struct rdma_ah_attr *ah_attr,
                                  struct ib_udata *udata);
/**
 * ib_get_gids_from_rdma_hdr - Get sgid and dgid from a GRH or IPv4
 *   header.
 * @hdr: the L3 header to parse
 * @net_type: type of header to parse
 * @sgid: place to store source gid
 * @dgid: place to store destination gid
 */
int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr,
                              enum rdma_network_type net_type,
                              union ib_gid *sgid, union ib_gid *dgid);

/**
 * ib_get_rdma_header_version - Get the header version
 * @hdr: the L3 header to parse
 */
int ib_get_rdma_header_version(const union rdma_network_hdr *hdr);

/**
 * ib_init_ah_attr_from_wc - Initializes address handle attributes from a
 *   work completion.
 * @device: Device on which the received message arrived.
 * @port_num: Port on which the received message arrived.
 * @wc: Work completion associated with the received message.
 * @grh: References the received global route header.  This parameter is
 *   ignored unless the work completion indicates that the GRH is valid.
 * @ah_attr: Returned attributes that can be used when creating an address
 *   handle for replying to the message.
 * When ib_init_ah_attr_from_wc() returns success,
 * (a) for IB link layer it optionally contains a reference to SGID attribute
 * when GRH is present.
 * (b) for RoCE link layer it contains a reference to SGID attribute.
 * User must invoke rdma_cleanup_ah_attr_gid_attr() to release reference to SGID
 * attributes which are initialized using ib_init_ah_attr_from_wc().
 *
 */
int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num,
                            const struct ib_wc *wc, const struct ib_grh *grh,
                            struct rdma_ah_attr *ah_attr);

/**
 * ib_create_ah_from_wc - Creates an address handle associated with the
 *   sender of the specified work completion.
 * @pd: The protection domain associated with the address handle.
 * @wc: Work completion information associated with a received message.
 * @grh: References the received global route header.  This parameter is
 *   ignored unless the work completion indicates that the GRH is valid.
 * @port_num: The outbound port number to associate with the address.
 *
 * The address handle is used to reference a local or global destination
 * in all UD QP post sends.
 */
struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc,
                                   const struct ib_grh *grh, u32 port_num);

/**
 * rdma_modify_ah - Modifies the address vector associated with an address
 *   handle.
 * @ah: The address handle to modify.
 * @ah_attr: The new address vector attributes to associate with the
 *   address handle.
 */
int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);

/**
 * rdma_query_ah - Queries the address vector associated with an address
 *   handle.
 * @ah: The address handle to query.
 * @ah_attr: The address vector attributes associated with the address
 *   handle.
 */
int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);

/**
 * enum rdma_destroy_ah_flags - flag bits for the @flags argument of
 *   rdma_destroy_ah_user() and rdma_destroy_ah()
 * @RDMA_DESTROY_AH_SLEEPABLE: the caller is in a sleepable context
 */
enum rdma_destroy_ah_flags {
        RDMA_DESTROY_AH_SLEEPABLE = BIT(0),
};

/**
 * rdma_destroy_ah_user - Destroys an address handle.
 * @ah: The address handle to destroy.
 * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags).
 * @udata: Valid user data or NULL for kernel objects
 */
int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata);

/**
 * rdma_destroy_ah - Destroys a kernel address handle.
 * @ah: The address handle to destroy.
 * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags).
 *
 * Calls rdma_destroy_ah_user() with a NULL udata and emits a one-time
 * warning if that call returns a non-zero value.
 *
 * NOTE: for user ah use rdma_destroy_ah_user with valid udata!
 */
static inline void rdma_destroy_ah(struct ib_ah *ah, u32 flags)
{
        int err;

        err = rdma_destroy_ah_user(ah, flags, NULL);
        WARN_ONCE(err, "Destroy of kernel AH shouldn't fail");
}

struct ib_srq *ib_create_srq_user(struct ib_pd *pd,
                                  struct ib_srq_init_attr *srq_init_attr,
                                  struct ib_usrq_object *uobject,
                                  struct ib_udata *udata);
/**
 * ib_create_srq - Creates a kernel SRQ on the given protection domain.
 * @pd: The protection domain the SRQ is created on.
 * @srq_init_attr: Initial attributes, passed on to ib_create_srq_user().
 *
 * Returns ERR_PTR(-EOPNOTSUPP) when the device of @pd has no create_srq
 * operation. Otherwise returns the result of ib_create_srq_user() called
 * with a NULL uobject and a NULL udata.
 */
static inline struct ib_srq *
ib_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr)
{
        if (pd->device->ops.create_srq)
                return ib_create_srq_user(pd, srq_init_attr, NULL, NULL);

        return ERR_PTR(-EOPNOTSUPP);
}

/**
 * ib_modify_srq - Modifies the attributes for the specified SRQ.
 * @srq: The SRQ to modify.
 * @srq_attr: On input, specifies the SRQ attributes to modify.  On output,
 *   the current values of selected SRQ attributes are returned.
 * @srq_attr_mask: A bit-mask used to specify which attributes of the SRQ
 *   are being modified.
 *
 * The mask may contain IB_SRQ_MAX_WR to resize the SRQ and/or
 * IB_SRQ_LIMIT to set the SRQ's limit and request notification when
 * the number of receives queued drops below the limit.
 */
int ib_modify_srq(struct ib_srq *srq,
                  struct ib_srq_attr *srq_attr,
                  enum ib_srq_attr_mask srq_attr_mask);

/**
 * ib_query_srq - Returns the attribute list and current values for the
 *   specified SRQ.
 * @srq: The SRQ to query.
 * @srq_attr: The attributes of the specified SRQ.
 */
int ib_query_srq(struct ib_srq *srq,
                 struct ib_srq_attr *srq_attr);

/**
 * ib_destroy_srq_user - Destroys the specified SRQ.
 * @srq: The SRQ to destroy.
 * @udata: Valid user data or NULL for kernel objects
 */
int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata);

/**
 * ib_destroy_srq - Destroys the specified kernel SRQ.
 * @srq: The SRQ to destroy.
 *
 * Calls ib_destroy_srq_user() with a NULL udata and emits a one-time warning
 * if that call returns a non-zero value.
 *
 * NOTE: for user srq use ib_destroy_srq_user with valid udata!
 */
static inline void ib_destroy_srq(struct ib_srq *srq)
{
        int err;

        err = ib_destroy_srq_user(srq, NULL);
        WARN_ONCE(err, "Destroy of kernel SRQ shouldn't fail");
}

/**
 * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
 * @srq: The SRQ to post the work request on.
 * @recv_wr: A list of work requests to post on the receive queue.
 * @bad_recv_wr: On an immediate failure, this parameter will reference
 *   the work request that failed to be posted on the SRQ.  May be NULL, in
 *   which case a local placeholder is passed to the device operation.
 */
static inline int ib_post_srq_recv(struct ib_srq *srq,
                                   const struct ib_recv_wr *recv_wr,
                                   const struct ib_recv_wr **bad_recv_wr)
{
        const struct ib_recv_wr *unused;

        /* The device operation always receives a non-NULL pointer */
        if (!bad_recv_wr)
                bad_recv_wr = &unused;

        return srq->device->ops.post_srq_recv(srq, recv_wr, bad_recv_wr);
}

struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd,
                                  struct ib_qp_init_attr *qp_init_attr,
                                  const char *caller);
/**
 * ib_create_qp - Creates a kernel QP associated with the specific protection
 * domain.
 * @pd: The protection domain associated with the QP.
 * @init_attr: A list of initial attributes required to create the
 *   QP.  If QP creation succeeds, then the attributes are updated to
 *   the actual capabilities of the created QP.
 *
 * Forwards to ib_create_qp_kernel(), passing KBUILD_MODNAME as the caller.
 */
static inline struct ib_qp *ib_create_qp(struct ib_pd *pd,
                                         struct ib_qp_init_attr *init_attr)
{
        struct ib_qp *qp;

        qp = ib_create_qp_kernel(pd, init_attr, KBUILD_MODNAME);
        return qp;
}

/**
 * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
 * @qp: The QP to modify.
 * @attr: On input, specifies the QP attributes to modify.  On output,
 *   the current values of selected QP attributes are returned.
 * @attr_mask: A bit-mask used to specify which attributes of the QP
 *   are being modified.
 * @udata: pointer to user's input output buffer information
 *
 * It returns 0 on success and returns appropriate error code on error.
 */
int ib_modify_qp_with_udata(struct ib_qp *qp,
                            struct ib_qp_attr *attr,
                            int attr_mask,
                            struct ib_udata *udata);

/**
 * ib_modify_qp - Modifies the attributes for the specified QP and then
 *   transitions the QP to the given state.
 * @qp: The QP to modify.
 * @qp_attr: On input, specifies the QP attributes to modify.  On output,
 *   the current values of selected QP attributes are returned.
 * @qp_attr_mask: A bit-mask used to specify which attributes of the QP
 *   are being modified.
 */
int ib_modify_qp(struct ib_qp *qp,
                 struct ib_qp_attr *qp_attr,
                 int qp_attr_mask);

/**
 * ib_query_qp - Returns the attribute list and current values for the
 *   specified QP.
 * @qp: The QP to query.
 * @qp_attr: The attributes of the specified QP.
 * @qp_attr_mask: A bit-mask used to select specific attributes to query.
 * @qp_init_attr: Additional attributes of the selected QP.
 *
 * The qp_attr_mask may be used to limit the query to gathering only the
 * selected attributes.
 */
int ib_query_qp(struct ib_qp *qp,
                struct ib_qp_attr *qp_attr,
                int qp_attr_mask,
                struct ib_qp_init_attr *qp_init_attr);

/**
 * ib_destroy_qp_user - Destroys the specified QP.
 * @qp: The QP to destroy.
 * @udata: Valid udata or NULL for kernel objects
 */
int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata);

/**
 * ib_destroy_qp - Destroys the specified kernel QP.
 * @qp: The QP to destroy.
 *
 * Returns the value returned by ib_destroy_qp_user() when called with a
 * NULL udata.
 *
 * NOTE: for user qp use ib_destroy_qp_user with valid udata!
 */
static inline int ib_destroy_qp(struct ib_qp *qp)
{
        int ret;

        ret = ib_destroy_qp_user(qp, NULL);
        return ret;
}

/**
 * ib_open_qp - Obtain a reference to an existing sharable QP.
 * @xrcd: XRC domain
 * @qp_open_attr: Attributes identifying the QP to open.
 *
 * Returns a reference to a sharable QP.
 */
struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
                         struct ib_qp_open_attr *qp_open_attr);

/**
 * ib_close_qp - Release an external reference to a QP.
 * @qp: The QP handle to release
 *
 * The opened QP handle is released by the caller.  The underlying
 * shared QP is not destroyed until all internal references are released.
 */
int ib_close_qp(struct ib_qp *qp);

/**
 * ib_post_send - Posts a list of work requests to the send queue of
 *   the specified QP.
 * @qp: The QP to post the work request on.
 * @send_wr: A list of work requests to post on the send queue.
 * @bad_send_wr: On an immediate failure, this parameter will reference
 *   the work request that failed to be posted on the QP.  May be NULL, in
 *   which case a local placeholder is passed to the device operation.
 *
 * While IBA Vol. 1 section 11.4.1.1 specifies that if an immediate
 * error is returned, the QP state shall not be affected,
 * ib_post_send() will return an immediate error after queueing any
 * earlier work requests in the list.
 */
static inline int ib_post_send(struct ib_qp *qp,
                               const struct ib_send_wr *send_wr,
                               const struct ib_send_wr **bad_send_wr)
{
        const struct ib_send_wr *unused;

        /* The device operation always receives a non-NULL pointer */
        if (!bad_send_wr)
                bad_send_wr = &unused;

        return qp->device->ops.post_send(qp, send_wr, bad_send_wr);
}

/**
 * ib_post_recv - Posts a list of work requests to the receive queue of
 *   the specified QP.
 * @qp: The QP to post the work request on.
 * @recv_wr: A list of work requests to post on the receive queue.
 * @bad_recv_wr: On an immediate failure, this parameter will reference
 *   the work request that failed to be posted on the QP.  May be NULL, in
 *   which case a local placeholder is passed to the device operation.
 */
static inline int ib_post_recv(struct ib_qp *qp,
                               const struct ib_recv_wr *recv_wr,
                               const struct ib_recv_wr **bad_recv_wr)
{
        const struct ib_recv_wr *unused;

        /* The device operation always receives a non-NULL pointer */
        if (!bad_recv_wr)
                bad_recv_wr = &unused;

        return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr);
}

struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, int nr_cqe,
                            int comp_vector, enum ib_poll_context poll_ctx,
                            const char *caller);
/*
 * ib_alloc_cq() forwards all of its arguments to __ib_alloc_cq() and adds
 * KBUILD_MODNAME as the caller string.
 */
static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
                                        int nr_cqe, int comp_vector,
                                        enum ib_poll_context poll_ctx)
{
        struct ib_cq *cq;

        cq = __ib_alloc_cq(dev, private, nr_cqe, comp_vector, poll_ctx,
                           KBUILD_MODNAME);
        return cq;
}

struct ib_cq *__ib_alloc_cq_any(struct ib_device *dev, void *private,
                                int nr_cqe, enum ib_poll_context poll_ctx,
                                const char *caller);

/**
 * ib_alloc_cq_any - Allocate kernel CQ
 * @dev: The IB device
 * @private: Private data attached to the CQE
 * @nr_cqe: Number of CQEs in the CQ
 * @poll_ctx: Context used for polling the CQ
 *
 * Forwards to __ib_alloc_cq_any(), passing KBUILD_MODNAME as the caller.
 */
static inline struct ib_cq *ib_alloc_cq_any(struct ib_device *dev,
                                            void *private, int nr_cqe,
                                            enum ib_poll_context poll_ctx)
{
        struct ib_cq *cq;

        cq = __ib_alloc_cq_any(dev, private, nr_cqe, poll_ctx, KBUILD_MODNAME);
        return cq;
}

void ib_free_cq(struct ib_cq *cq);
int ib_process_cq_direct(struct ib_cq *cq, int budget);

/**
 * ib_create_cq - Creates a CQ on the specified device.
 * @device: The device on which to create the CQ.
 * @comp_handler: A user-specified callback that is invoked when a
 *   completion event occurs on the CQ.
 * @event_handler: A user-specified callback that is invoked when an
 *   asynchronous event not associated with a completion occurs on the CQ.
 * @cq_context: Context associated with the CQ returned to the user via
 *   the associated completion and event handlers.
 * @cq_attr: The attributes the CQ should be created upon.
 *
 * Users can examine the cq structure to determine the actual CQ size.
 */
struct ib_cq *__ib_create_cq(struct ib_device *device,
                             ib_comp_handler comp_handler,
                             void (*event_handler)(struct ib_event *, void *),
                             void *cq_context,
                             const struct ib_cq_init_attr *cq_attr,
                             const char *caller);
/* Supplies KBUILD_MODNAME as the caller argument of __ib_create_cq() */
#define ib_create_cq(device, cmp_hndlr, evt_hndlr, cq_ctxt, cq_attr) \
        __ib_create_cq((device), (cmp_hndlr), (evt_hndlr), (cq_ctxt), \
                       (cq_attr), KBUILD_MODNAME)

/**
 * ib_resize_cq - Modifies the capacity of the CQ.
 * @cq: The CQ to resize.
 * @cqe: The minimum size of the CQ.
 *
 * Users can examine the cq structure to determine the actual CQ size.
 */
int ib_resize_cq(struct ib_cq *cq, int cqe);

/**
 * rdma_set_cq_moderation - Modifies moderation params of the CQ
 * @cq: The CQ to modify.
 * @cq_count: number of CQEs that will trigger an event
 * @cq_period: max period of time in usec before triggering an event
 *
 */
int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period);

/**
 * ib_destroy_cq_user - Destroys the specified CQ.
 * @cq: The CQ to destroy.
 * @udata: Valid user data or NULL for kernel objects
 */
int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata);

/**
 * ib_destroy_cq - Destroys the specified kernel CQ.
 * @cq: The CQ to destroy.
 *
 * Calls ib_destroy_cq_user() with a NULL udata and emits a one-time warning
 * if that call returns a non-zero value.
 *
 * NOTE: for user cq use ib_destroy_cq_user with valid udata!
 */
static inline void ib_destroy_cq(struct ib_cq *cq)
{
        int err;

        err = ib_destroy_cq_user(cq, NULL);
        WARN_ONCE(err, "Destroy of kernel CQ shouldn't fail");
}

/**
 * ib_poll_cq - poll a CQ for completion(s)
 * @cq:the CQ being polled
 * @num_entries:maximum number of completions to return
 * @wc:array of at least @num_entries &struct ib_wc where completions
 *   will be returned
 *
 * Poll a CQ for (possibly multiple) completions.  If the return value
 * is < 0, an error occurred.  If the return value is >= 0, it is the
 * number of completions returned.  If the return value is
 * non-negative and < num_entries, then the CQ was emptied.
 */
static inline int ib_poll_cq(struct ib_cq *cq, int num_entries,
                             struct ib_wc *wc)
{
        return cq->device->ops.poll_cq(cq, num_entries, wc);
}

/**
 * ib_req_notify_cq - Request completion notification on a CQ.
 * @cq: The CQ to generate an event for.
 * @flags:
 *   Must contain exactly one of %IB_CQ_SOLICITED or %IB_CQ_NEXT_COMP
 *   to request an event on the next solicited event or next work
 *   completion at any type, respectively. %IB_CQ_REPORT_MISSED_EVENTS
 *   may also be |ed in to request a hint about missed events, as
 *   described below.
 *
 * Return Value:
 *    < 0 means an error occurred while requesting notification
 *   == 0 means notification was requested successfully, and if
 *        IB_CQ_REPORT_MISSED_EVENTS was passed in, then no events
 *        were missed and it is safe to wait for another event.  In
 *        this case is it guaranteed that any work completions added
 *        to the CQ since the last CQ poll will trigger a completion
 *        notification event.
 *    > 0 is only returned if IB_CQ_REPORT_MISSED_EVENTS was passed
 *        in.  It means that the consumer must poll the CQ again to
 *        make sure it is empty to avoid missing an event because of a
 *        race between requesting notification and an entry being
 *        added to the CQ.  This return value means it is possible
 *        (but not guaranteed) that a work completion has been added
 *        to the CQ since the last poll without triggering a
 *        completion notification event.
 */
static inline int ib_req_notify_cq(struct ib_cq *cq,
                                   enum ib_cq_notify_flags flags)
{
        return cq->device->ops.req_notify_cq(cq, flags);
}

struct ib_cq *ib_cq_pool_get(struct ib_device *dev, unsigned int nr_cqe,
                             int comp_vector_hint,
                             enum ib_poll_context poll_ctx);

void ib_cq_pool_put(struct ib_cq *cq, unsigned int nr_cqe);

/*
 * Drivers that don't need a DMA mapping at the RDMA layer, set dma_device to
 * NULL. This causes the ib_dma* helpers to just stash the kernel virtual
 * address into the dma address.
 */
static inline bool ib_uses_virt_dma(struct ib_device *dev)
{
        return IS_ENABLED(CONFIG_INFINIBAND_VIRT_DMA) && !dev->dma_device;
}

/*
 * Check if a IB device's underlying DMA mapping supports P2PDMA transfers.
 * Devices for which ib_uses_virt_dma() is true never do; for all others the
 * answer comes from dma_pci_p2pdma_supported() on the dma_device.
 */
static inline bool ib_dma_pci_p2p_dma_supported(struct ib_device *dev)
{
        if (!ib_uses_virt_dma(dev))
                return dma_pci_p2pdma_supported(dev->dma_device);

        return false;
}

/**
 * ib_virt_dma_to_ptr - Convert a dma_addr to a kernel pointer
 * @dma_addr: The DMA address
 *
 * Used by ib_uses_virt_dma() devices to get back to the kernel pointer after
 * going through the dma_addr marshalling.
 */
static inline void *ib_virt_dma_to_ptr(u64 dma_addr)
{
        /* virt_dma mode maps the kvs's directly into the dma addr */
        uintptr_t kva = (uintptr_t)dma_addr;

        return (void *)kva;
}

/**
 * ib_virt_dma_to_page - Convert a dma_addr to a struct page
 * @dma_addr: The DMA address
 *
 * Used by ib_uses_virt_dma() device to get back to the struct page after going
 * through the dma_addr marshalling.
 */
static inline struct page *ib_virt_dma_to_page(u64 dma_addr)
{
        void *kva = ib_virt_dma_to_ptr(dma_addr);

        return virt_to_page(kva);
}

/**
 * ib_dma_mapping_error - check a DMA addr for error
 * @dev: The device for which the dma_addr was created
 * @dma_addr: The DMA address to check
 *
 * Always returns 0 for ib_uses_virt_dma() devices; otherwise returns the
 * result of dma_mapping_error() on the device's dma_device.
 */
static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
{
        if (!ib_uses_virt_dma(dev))
                return dma_mapping_error(dev->dma_device, dma_addr);

        return 0;
}

/**
 * ib_dma_map_single - Map a kernel virtual address to DMA address
 * @dev: The device for which the dma_addr is to be created
 * @cpu_addr: The kernel virtual address
 * @size: The size of the region in bytes
 * @direction: The direction of the DMA
 *
 * For ib_uses_virt_dma() devices the kernel virtual address itself is
 * returned; otherwise the mapping is done by dma_map_single().
 */
static inline u64 ib_dma_map_single(struct ib_device *dev,
                                    void *cpu_addr, size_t size,
                                    enum dma_data_direction direction)
{
        if (!ib_uses_virt_dma(dev))
                return dma_map_single(dev->dma_device, cpu_addr, size,
                                      direction);

        return (uintptr_t)cpu_addr;
}

/**
 * ib_dma_unmap_single - Destroy a mapping created by ib_dma_map_single()
 * @dev: The device for which the DMA address was created
 * @addr: The DMA address
 * @size: The size of the region in bytes
 * @direction: The direction of the DMA
 *
 * Does nothing for ib_uses_virt_dma() devices.
 */
static inline void ib_dma_unmap_single(struct ib_device *dev,
                                       u64 addr, size_t size,
                                       enum dma_data_direction direction)
{
        if (ib_uses_virt_dma(dev))
                return;

        dma_unmap_single(dev->dma_device, addr, size, direction);
}

/**
 * ib_dma_map_page - Map a physical page to DMA address
 * @dev: The device for which the dma_addr is to be created
 * @page: The page to be mapped
 * @offset: The offset within the page
 * @size: The size of the region in bytes
 * @direction: The direction of the DMA
 *
 * For ib_uses_virt_dma() devices the returned value is page_address() of
 * @page plus @offset; otherwise the mapping is done by dma_map_page().
 */
static inline u64 ib_dma_map_page(struct ib_device *dev,
                                  struct page *page,
                                  unsigned long offset,
                                  size_t size,
                                  enum dma_data_direction direction)
{
        if (!ib_uses_virt_dma(dev))
                return dma_map_page(dev->dma_device, page, offset, size,
                                    direction);

        return (uintptr_t)(page_address(page) + offset);
}

/**
 * ib_dma_unmap_page - Destroy a mapping created by ib_dma_map_page()
 * @dev: The device for which the DMA address was created
 * @addr: The DMA address
 * @size: The size of the region in bytes
 * @direction: The direction of the DMA
 *
 * Does nothing for ib_uses_virt_dma() devices.
 */
static inline void ib_dma_unmap_page(struct ib_device *dev,
                                     u64 addr, size_t size,
                                     enum dma_data_direction direction)
{
        if (ib_uses_virt_dma(dev))
                return;

        dma_unmap_page(dev->dma_device, addr, size, direction);
}

int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents);
/*
 * Map a scatter/gather list with DMA attributes.  ib_uses_virt_dma() devices
 * go through ib_dma_virt_map_sg() (which receives neither @direction nor
 * @dma_attrs); all others go through dma_map_sg_attrs() on the dma_device.
 */
static inline int ib_dma_map_sg_attrs(struct ib_device *dev,
                                      struct scatterlist *sg, int nents,
                                      enum dma_data_direction direction,
                                      unsigned long dma_attrs)
{
        if (!ib_uses_virt_dma(dev))
                return dma_map_sg_attrs(dev->dma_device, sg, nents, direction,
                                        dma_attrs);

        return ib_dma_virt_map_sg(dev, sg, nents);
}

/*
 * Unmap a scatter/gather list with DMA attributes.  Does nothing for
 * ib_uses_virt_dma() devices.
 */
static inline void ib_dma_unmap_sg_attrs(struct ib_device *dev,
                                         struct scatterlist *sg, int nents,
                                         enum dma_data_direction direction,
                                         unsigned long dma_attrs)
{
        if (ib_uses_virt_dma(dev))
                return;

        dma_unmap_sg_attrs(dev->dma_device, sg, nents, direction, dma_attrs);
}

/**
 * ib_dma_map_sgtable_attrs - Map a scatter/gather table to DMA addresses
 * @dev: The device for which the DMA addresses are to be created
 * @sgt: The sg_table object describing the buffer
 * @direction: The direction of the DMA
 * @dma_attrs: Optional DMA attributes for the map operation
 *
 * For ib_uses_virt_dma() devices the table's sgl is mapped with
 * ib_dma_virt_map_sg() over orig_nents entries; a result of zero is reported
 * as -EIO, otherwise the result is stored in @sgt->nents and 0 is returned.
 * All other devices return the result of dma_map_sgtable().
 */
static inline int ib_dma_map_sgtable_attrs(struct ib_device *dev,
                                           struct sg_table *sgt,
                                           enum dma_data_direction direction,
                                           unsigned long dma_attrs)
{
        int mapped;

        if (!ib_uses_virt_dma(dev))
                return dma_map_sgtable(dev->dma_device, sgt, direction,
                                       dma_attrs);

        mapped = ib_dma_virt_map_sg(dev, sgt->sgl, sgt->orig_nents);
        if (!mapped)
                return -EIO;

        sgt->nents = mapped;
        return 0;
}

/*
 * Unmap a scatter/gather table with DMA attributes.  Does nothing for
 * ib_uses_virt_dma() devices.
 */
static inline void ib_dma_unmap_sgtable_attrs(struct ib_device *dev,
                                              struct sg_table *sgt,
                                              enum dma_data_direction direction,
                                              unsigned long dma_attrs)
{
        if (ib_uses_virt_dma(dev))
                return;

        dma_unmap_sgtable(dev->dma_device, sgt, direction, dma_attrs);
}

/**
 * ib_dma_map_sg - Map a scatter/gather list to DMA addresses
 * @dev: The device for which the DMA addresses are to be created
 * @sg: The array of scatter/gather entries
 * @nents: The number of scatter/gather entries
 * @direction: The direction of the DMA
 *
 * Same as ib_dma_map_sg_attrs() with no DMA attributes set.
 */
static inline int ib_dma_map_sg(struct ib_device *dev,
                                struct scatterlist *sg, int nents,
                                enum dma_data_direction direction)
{
        int mapped;

        mapped = ib_dma_map_sg_attrs(dev, sg, nents, direction, 0);
        return mapped;
}

/**
 * ib_dma_unmap_sg - Unmap a scatter/gather list of DMA addresses
 * @dev: The device for which the DMA addresses were created
 * @sg: The array of scatter/gather entries
 * @nents: The number of scatter/gather entries
 * @direction: The direction of the DMA
 *
 * Same as ib_dma_unmap_sg_attrs() with no DMA attributes set.
 */
static inline void ib_dma_unmap_sg(struct ib_device *dev,
                                   struct scatterlist *sg, int nents,
                                   enum dma_data_direction direction)
{
        const unsigned long no_attrs = 0;

        ib_dma_unmap_sg_attrs(dev, sg, nents, direction, no_attrs);
}

/**
 * ib_dma_max_seg_size - Return the size limit of a single DMA transfer
 * @dev: The device to query
 *
 * The returned value represents a size in bytes.  It is UINT_MAX for
 * ib_uses_virt_dma() devices and the result of dma_get_max_seg_size()
 * otherwise.
 */
static inline unsigned int ib_dma_max_seg_size(struct ib_device *dev)
{
        if (!ib_uses_virt_dma(dev))
                return dma_get_max_seg_size(dev->dma_device);

        return UINT_MAX;
}

/**
 * ib_dma_sync_single_for_cpu - Prepare DMA region to be accessed by CPU
 * @dev: The device for which the DMA address was created
 * @addr: The DMA address
 * @size: The size of the region in bytes
 * @dir: The direction of the DMA
 *
 * Does nothing for ib_uses_virt_dma() devices.
 */
static inline void ib_dma_sync_single_for_cpu(struct ib_device *dev,
                                              u64 addr,
                                              size_t size,
                                              enum dma_data_direction dir)
{
        if (ib_uses_virt_dma(dev))
                return;

        dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
}

/**
 * ib_dma_sync_single_for_device - Prepare DMA region to be accessed by device
 * @dev: The device for which the DMA address was created
 * @addr: The DMA address
 * @size: The size of the region in bytes
 * @dir: The direction of the DMA
 *
 * Does nothing for ib_uses_virt_dma() devices.
 */
static inline void ib_dma_sync_single_for_device(struct ib_device *dev,
                                                 u64 addr,
                                                 size_t size,
                                                 enum dma_data_direction dir)
{
        if (ib_uses_virt_dma(dev))
                return;

        dma_sync_single_for_device(dev->dma_device, addr, size, dir);
}

/* ib_reg_user_mr - register a memory region for virtual addresses from kernel
 * space. This function should be called when 'current' is the owning MM.
 */
struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                             u64 virt_addr, int mr_access_flags);

/* ib_advise_mr - give advice about an address range in a memory region */
int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
                 u32 flags, struct ib_sge *sg_list, u32 num_sge);
/**
 * ib_dereg_mr_user - Deregisters a memory region and removes it from the
 *   HCA translation table.
 * @mr: The memory region to deregister.
 * @udata: Valid user data or NULL for kernel object
 *
 * This function can fail, if the memory region has memory windows bound to it.
 */
int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata);

/**
 * ib_dereg_mr - Deregisters a kernel memory region and removes it from the
 *   HCA translation table.
 * @mr: The memory region to deregister.
 *
 * This function can fail, if the memory region has memory windows bound to it.
 * The return value is that of ib_dereg_mr_user() called with a NULL udata.
 *
 * NOTE: for user mr use ib_dereg_mr_user with valid udata!
 */
static inline int ib_dereg_mr(struct ib_mr *mr)
{
        int ret;

        ret = ib_dereg_mr_user(mr, NULL);
        return ret;
}

struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
                          u32 max_num_sg);

struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd,
                                    u32 max_num_data_sg,
                                    u32 max_num_meta_sg);

/**
 * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
 *   R_Key and L_Key.
 * @mr: struct ib_mr pointer to be updated.
 * @newkey: new key to be used.
 *
 * The low 8 bits of both @mr->lkey and @mr->rkey are replaced by @newkey;
 * the upper 24 bits are left unchanged.
 */
static inline void ib_update_fast_reg_key(struct ib_mr *mr, u8 newkey)
{
        const u32 upper_bits = 0xffffff00;

        mr->lkey = (mr->lkey & upper_bits) | newkey;
        mr->rkey = (mr->rkey & upper_bits) | newkey;
}

/**
 * ib_inc_rkey - increments the key portion of the given rkey. Can be used
 * for calculating a new rkey for type 2 memory windows.
 * @rkey: the rkey to increment.
 *
 * Only the low 8 bits are incremented (wrapping from 0xff to 0x00); the
 * upper 24 bits of @rkey are returned unchanged.
 */
static inline u32 ib_inc_rkey(u32 rkey)
{
        const u32 key_mask = 0x000000ff;
        u32 upper = rkey & ~key_mask;
        u32 next_key = (rkey + 1) & key_mask;

        return next_key | upper;
}

/**
 * ib_attach_mcast - Attaches the specified QP to a multicast group.
 * @qp: QP to attach to the multicast group.  The QP must be type
 *   IB_QPT_UD.
 * @gid: Multicast group GID.
 * @lid: Multicast group LID in host byte order.
 *
 * In order to send and receive multicast packets, subnet
 * administration must have created the multicast group and configured
 * the fabric appropriately.  The port associated with the specified
 * QP must also be a member of the multicast group.
 */
int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

/**
 * ib_detach_mcast - Detaches the specified QP from a multicast group.
 * @qp: QP to detach from the multicast group.
 * @gid: Multicast group GID.
 * @lid: Multicast group LID in host byte order.
 */
int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);

struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device,
                                   struct inode *inode, struct ib_udata *udata);
int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata);

/*
 * Validate a set of MR access flags against a device.
 *
 * Returns -EINVAL when remote write or remote atomic is requested without
 * local write, or when a flag outside IB_ACCESS_SUPPORTED is set.  Returns
 * -EOPNOTSUPP when on-demand paging or one of the flush types is requested
 * but the matching device capability bit is not set.  Returns 0 otherwise.
 */
static inline int ib_check_mr_access(struct ib_device *ib_dev,
                                     unsigned int flags)
{
        u64 device_cap = ib_dev->attrs.device_cap_flags;

        /*
         * Local write permission is required if remote write or
         * remote atomic permission is also requested.
         */
        if ((flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE)) &&
            !(flags & IB_ACCESS_LOCAL_WRITE))
                return -EINVAL;

        if (flags & ~IB_ACCESS_SUPPORTED)
                return -EINVAL;

        if ((flags & IB_ACCESS_ON_DEMAND) &&
            !(ib_dev->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING))
                return -EOPNOTSUPP;

        /* Each requested flush type needs its own device capability bit */
        if ((flags & IB_ACCESS_FLUSH_GLOBAL) &&
            !(device_cap & IB_DEVICE_FLUSH_GLOBAL))
                return -EOPNOTSUPP;

        if ((flags & IB_ACCESS_FLUSH_PERSISTENT) &&
            !(device_cap & IB_DEVICE_FLUSH_PERSISTENT))
                return -EOPNOTSUPP;

        return 0;
}

/*
 * Report whether @access_flags contains any flag that needs writable memory
 * backing the MR.
 */
static inline bool ib_access_writable(int access_flags)
{
        /*
         * We have writable memory backing the MR if any of the following
         * access flags are set.  "Local write" and "remote write" obviously
         * require write access.  "Remote atomic" can do things like fetch and
         * add, which will modify memory, and "MW bind" can change permissions
         * by binding a window.
         */
        const int writable_mask = IB_ACCESS_LOCAL_WRITE |
                                  IB_ACCESS_REMOTE_WRITE |
                                  IB_ACCESS_REMOTE_ATOMIC |
                                  IB_ACCESS_MW_BIND;

        return (access_flags & writable_mask) != 0;
}

/**
 * ib_check_mr_status: lightweight check of MR status.
 *     This routine may provide status checks on a selected
 *     ib_mr. first use is for signature status check.
 *
 * @mr: A memory region.
 * @check_mask: Bitmask of which checks to perform from
 *     ib_mr_status_check enumeration.
 * @mr_status: The container of relevant status checks.
 *     failed checks will be indicated in the status bitmask
 *     and the relevant info shall be in the error item.
 */
int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
                       struct ib_mr_status *mr_status);

/**
 * ib_device_try_get - Hold a registration lock
 * @dev: The device to lock
 *
 * A device under an active registration lock cannot become unregistered. It
 * is only possible to obtain a registration lock on a device that is fully
 * registered, otherwise this function returns false.
 *
 * The registration lock is only necessary for actions which require the
 * device to still be registered. Uses that only require the device pointer to
 * be valid should use get_device(&ibdev->dev) to hold the memory.
 *
 * Return: the result of refcount_inc_not_zero() on @dev->refcount.
 */
static inline bool ib_device_try_get(struct ib_device *dev)
{
        return refcount_inc_not_zero(&dev->refcount);
}

void ib_device_put(struct ib_device *device);
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
                                          enum rdma_driver_id driver_id);
struct ib_device *ib_device_get_by_name(const char *name,
                                        enum rdma_driver_id driver_id);
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port,
                                            u16 pkey, const union ib_gid *gid,
                                            const struct sockaddr *addr);
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
                         unsigned int port);
struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
                                        u32 port);
int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev,
                         u32 *port);

/*
 * Derive an IB port state from a net_device: the port is reported as
 * IB_PORT_ACTIVE only when the netdev is both running and has carrier,
 * otherwise as IB_PORT_DOWN.
 */
static inline enum ib_port_state ib_get_curr_port_state(struct net_device *net_dev)
{
        if (!netif_running(net_dev))
                return IB_PORT_DOWN;

        /* Carrier is only consulted for a running device. */
        if (!netif_carrier_ok(net_dev))
                return IB_PORT_DOWN;

        return IB_PORT_ACTIVE;
}

void ib_dispatch_port_state_event(struct ib_device *ibdev,
                                  struct net_device *ndev);
struct ib_wq *ib_create_wq(struct ib_pd *pd,
                           struct ib_wq_init_attr *init_attr);
int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata);

int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
                 unsigned int *sg_offset, unsigned int page_size);
int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg,
                    int data_sg_nents, unsigned int *data_sg_offset,
                    struct scatterlist *meta_sg, int meta_sg_nents,
                    unsigned int *meta_sg_offset, unsigned int page_size);

/*
 * Map a scatterlist into @mr via ib_map_mr_sg() and then force the MR's
 * iova to 0.  The iova is reset regardless of the value ib_map_mr_sg()
 * returned; that value is passed back to the caller unchanged.
 */
static inline int
ib_map_mr_sg_zbva(struct ib_mr *mr, struct scatterlist *sg, int sg_nents,
                  unsigned int *sg_offset, unsigned int page_size)
{
        int mapped = ib_map_mr_sg(mr, sg, sg_nents, sg_offset, page_size);

        mr->iova = 0;
        return mapped;
}

int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents,
                unsigned int *sg_offset, int (*set_page)(struct ib_mr *, u64));

void ib_drain_rq(struct ib_qp *qp);
void ib_drain_sq(struct ib_qp *qp);
void ib_drain_qp(struct ib_qp *qp);

int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed,
                     u8 *width);

/*
 * Return a pointer to the destination MAC stored in a RoCE-type AH
 * attribute, or NULL when @attr is of any other type.
 */
static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr)
{
        if (attr->type != RDMA_AH_ATTR_TYPE_ROCE)
                return NULL;

        return attr->roce.dmac;
}

/*
 * Store @dlid in the type-specific part of @attr.  IB attributes keep the
 * value truncated to 16 bits, OPA attributes keep the full 32 bits, and any
 * other attribute type is left untouched.
 */
static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid)
{
        switch (attr->type) {
        case RDMA_AH_ATTR_TYPE_IB:
                attr->ib.dlid = (u16)dlid;
                break;
        case RDMA_AH_ATTR_TYPE_OPA:
                attr->opa.dlid = dlid;
                break;
        default:
                break;
        }
}

/*
 * Read the destination LID from the type-specific part of @attr.
 * Returns 0 for attribute types other than IB and OPA.
 */
static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr)
{
        switch (attr->type) {
        case RDMA_AH_ATTR_TYPE_IB:
                return attr->ib.dlid;
        case RDMA_AH_ATTR_TYPE_OPA:
                return attr->opa.dlid;
        default:
                return 0;
        }
}

/* Store @sl in the sl field of @attr. */
static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl)
{
        attr->sl = sl;
}

/* Return the sl field of @attr. */
static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr)
{
        return attr->sl;
}

/*
 * Store @src_path_bits in the IB or OPA specific part of @attr, depending
 * on the attribute type.  Other attribute types are left untouched.
 */
static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr,
                                         u8 src_path_bits)
{
        switch (attr->type) {
        case RDMA_AH_ATTR_TYPE_IB:
                attr->ib.src_path_bits = src_path_bits;
                break;
        case RDMA_AH_ATTR_TYPE_OPA:
                attr->opa.src_path_bits = src_path_bits;
                break;
        default:
                break;
        }
}

/*
 * Read the source path bits from the IB or OPA specific part of @attr.
 * Returns 0 for any other attribute type.
 */
static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr)
{
        switch (attr->type) {
        case RDMA_AH_ATTR_TYPE_IB:
                return attr->ib.src_path_bits;
        case RDMA_AH_ATTR_TYPE_OPA:
                return attr->opa.src_path_bits;
        default:
                return 0;
        }
}

/*
 * Record @make_grd in an OPA-type AH attribute.  For every other attribute
 * type this is a no-op.
 */
static inline void rdma_ah_set_make_grd(struct rdma_ah_attr *attr,
                                        bool make_grd)
{
        if (attr->type != RDMA_AH_ATTR_TYPE_OPA)
                return;

        attr->opa.make_grd = make_grd;
}

/*
 * Return the make_grd setting of an OPA-type AH attribute, or false when
 * @attr is of any other type.
 */
static inline bool rdma_ah_get_make_grd(const struct rdma_ah_attr *attr)
{
        if (attr->type != RDMA_AH_ATTR_TYPE_OPA)
                return false;

        return attr->opa.make_grd;
}

/* Store @port_num in the port_num field of @attr. */
static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u32 port_num)
{
        attr->port_num = port_num;
}

/* Return the port_num field of @attr. */
static inline u32 rdma_ah_get_port_num(const struct rdma_ah_attr *attr)
{
        return attr->port_num;
}

/* Store @static_rate in the static_rate field of @attr. */
static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr,
                                           u8 static_rate)
{
        attr->static_rate = static_rate;
}

/* Return the static_rate field of @attr. */
static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr)
{
        return attr->static_rate;
}

/*
 * Overwrite the ah_flags field of @attr with @flag.  The previous value is
 * replaced, not OR-ed into.
 */
static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr,
                                        enum ib_ah_flags flag)
{
        attr->ah_flags = flag;
}

/* Return the ah_flags field of @attr. */
static inline enum ib_ah_flags
                rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr)
{
        return attr->ah_flags;
}

/* Return a read-only pointer to the grh member embedded in @attr. */
static inline const struct ib_global_route
                *rdma_ah_read_grh(const struct rdma_ah_attr *attr)
{
        return &attr->grh;
}

/*
 * Return a writable pointer to the grh member embedded in @attr, for
 * callers that need to retrieve and modify the grh.
 */
static inline struct ib_global_route
                *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr)
{
        return &attr->grh;
}

static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid)
{
        struct ib_global_route *grh = rdma_ah_retrieve_grh(attr);

        memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid));
}

/* Set the subnet prefix of the destination GID in @attr's grh. */
static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr,
                                             __be64 prefix)
{
        attr->grh.dgid.global.subnet_prefix = prefix;
}

/* Set the interface id of the destination GID in @attr's grh. */
static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr,
                                            __be64 if_id)
{
        attr->grh.dgid.global.interface_id = if_id;
}

/*
 * Fill in the global route of @attr and set attr->ah_flags to IB_AH_GRH.
 *
 * @dgid may be NULL, in which case the destination GID already stored in
 * @attr is left as it is.  The sgid_attr pointer of the grh is always
 * reset to NULL.
 */
static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr,
                                   union ib_gid *dgid, u32 flow_label,
                                   u8 sgid_index, u8 hop_limit,
                                   u8 traffic_class)
{
        struct ib_global_route *route = &attr->grh;

        attr->ah_flags = IB_AH_GRH;

        if (dgid)
                route->dgid = *dgid;

        route->flow_label = flow_label;
        route->sgid_index = sgid_index;
        route->hop_limit = hop_limit;
        route->traffic_class = traffic_class;
        route->sgid_attr = NULL;
}

void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr);
void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid,
                             u32 flow_label, u8 hop_limit, u8 traffic_class,
                             const struct ib_gid_attr *sgid_attr);
void rdma_copy_ah_attr(struct rdma_ah_attr *dest,
                       const struct rdma_ah_attr *src);
void rdma_replace_ah_attr(struct rdma_ah_attr *old,
                          const struct rdma_ah_attr *new);
void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src);

/**
 * rdma_ah_find_type - Return address handle type.
 *
 * @dev: Device to be checked
 * @port_num: Port number
 *
 * RoCE ports yield RDMA_AH_ATTR_TYPE_ROCE.  IB ports yield
 * RDMA_AH_ATTR_TYPE_OPA when rdma_cap_opa_ah() is true and
 * RDMA_AH_ATTR_TYPE_IB otherwise.  A device whose type is
 * RDMA_DEVICE_TYPE_SMI also yields RDMA_AH_ATTR_TYPE_IB.  Everything else
 * is RDMA_AH_ATTR_TYPE_UNDEFINED.
 */
static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev,
                                                       u32 port_num)
{
        if (rdma_protocol_roce(dev, port_num))
                return RDMA_AH_ATTR_TYPE_ROCE;

        if (rdma_protocol_ib(dev, port_num))
                return rdma_cap_opa_ah(dev, port_num) ?
                        RDMA_AH_ATTR_TYPE_OPA : RDMA_AH_ATTR_TYPE_IB;

        return dev->type == RDMA_DEVICE_TYPE_SMI ?
                RDMA_AH_ATTR_TYPE_IB : RDMA_AH_ATTR_TYPE_UNDEFINED;
}

/**
 * ib_lid_cpu16 - Return lid in 16bit CPU encoding.
 *
 * @lid: A 32bit LID
 *
 * In the current implementation the only way to get a 32bit lid is from
 * other sources for OPA.  For IB, lids are always 16 bits, so the value is
 * cast accordingly.  A one-time warning is raised if any of the upper
 * 16 bits of @lid are set.
 */
static inline u16 ib_lid_cpu16(u32 lid)
{
        u16 lid16 = (u16)lid;

        WARN_ON_ONCE(lid & 0xFFFF0000);
        return lid16;
}

/**
 * ib_lid_be16 - Return lid in 16bit BE encoding.
 *
 * @lid: A 32bit LID
 *
 * The low 16 bits of @lid are converted with cpu_to_be16().  A one-time
 * warning is raised if any of the upper 16 bits of @lid are set.
 */
static inline __be16 ib_lid_be16(u32 lid)
{
        __be16 be_lid = cpu_to_be16((u16)lid);

        WARN_ON_ONCE(lid & 0xFFFF0000);
        return be_lid;
}

/**
 * ib_get_vector_affinity - Get the affinity mappings of a given completion
 *   vector
 * @device:         the rdma device
 * @comp_vector:    index of completion vector
 *
 * Returns NULL if @comp_vector is outside [0, device->num_comp_vectors) or
 * if the device driver does not implement get_vector_affinity.  Otherwise
 * returns whatever the driver's get_vector_affinity op returns for the
 * completion vector.
 */
static inline const struct cpumask *
ib_get_vector_affinity(struct ib_device *device, int comp_vector)
{
        if (comp_vector < 0 || comp_vector >= device->num_comp_vectors)
                return NULL;

        if (!device->ops.get_vector_affinity)
                return NULL;

        return device->ops.get_vector_affinity(device, comp_vector);
}

/**
 * rdma_roce_rescan_device - Rescan all of the network devices in the system
 * and add their gids, as needed, to the relevant RoCE devices.
 *
 * @ibdev:          the rdma device
 */
void rdma_roce_rescan_device(struct ib_device *ibdev);
void rdma_roce_rescan_port(struct ib_device *ib_dev, u32 port);
void roce_del_all_netdev_gids(struct ib_device *ib_dev,
                              u32 port, struct net_device *ndev);

struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile);

#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs);
#else
/*
 * Stub used when CONFIG_INFINIBAND_USER_ACCESS is not enabled: does nothing
 * and returns 0.
 */
static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs)
{
        return 0;
}
#endif

struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num,
                                     enum rdma_netdev_t type, const char *name,
                                     unsigned char name_assign_type,
                                     void (*setup)(struct net_device *));

int rdma_init_netdev(struct ib_device *device, u32 port_num,
                     enum rdma_netdev_t type, const char *name,
                     unsigned char name_assign_type,
                     void (*setup)(struct net_device *),
                     struct net_device *netdev);

/**
 * rdma_device_to_ibdev - Get ib_device pointer from device pointer
 *
 * @device:        device pointer for which ib_device pointer to retrieve
 *
 * @device must be the dev member of a struct ib_core_device; the owner of
 * that ib_core_device is returned.
 */
static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
{
        return container_of(device, struct ib_core_device, dev)->owner;
}

/**
 * ibdev_to_node - return the NUMA node for a given ib_device
 * @ibdev:      device to get the NUMA node for.
 *
 * The node is taken from the parent of @ibdev's struct device.  When there
 * is no parent, NUMA_NO_NODE is returned.
 */
static inline int ibdev_to_node(struct ib_device *ibdev)
{
        struct device *parent = ibdev->dev.parent;

        return parent ? dev_to_node(parent) : NUMA_NO_NODE;
}

/**
 * rdma_device_to_drv_device - Helper macro to reach back to driver's
 *                               ib_device holder structure from device pointer.
 * @dev:            device pointer, passed to rdma_device_to_ibdev()
 * @drv_dev_struct: type of the driver structure that embeds the ib_device
 * @ibdev_member:   name of the ib_device member inside @drv_dev_struct
 *
 * NOTE: New drivers should not make use of this API; This API is only for
 * existing drivers who have exposed sysfs entries using
 * ops->device_group.
 */
#define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member)           \
        container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member)

bool rdma_dev_access_netns(const struct ib_device *device,
                           const struct net *net);

#define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000)
#define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF)
#define IB_GRH_FLOWLABEL_MASK (0x000FFFFF)

/**
 * rdma_flow_label_to_udp_sport - generate a RoCE v2 UDP src port value based
 *                               on the flow_label
 * @fl: flow label; only its low 20 bits are used
 *
 * The upper 6 bits of the 20 bit flow label are XOR-folded into its lower
 * 14 bits, and the result is OR-ed with IB_ROCE_UDP_ENCAP_VALID_PORT_MIN to
 * form the UDP source port.  All RoCE V2 drivers should use this same
 * convention.
 */
static inline u16 rdma_flow_label_to_udp_sport(u32 fl)
{
        u32 low14 = fl & 0x03fff;
        u32 high6 = (fl & 0xFC000) >> 14;

        return (u16)((low14 ^ high6) | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN);
}

/**
 * rdma_calc_flow_label - generate a RDMA symmetric flow label value based on
 *                        local and remote qpn values
 * @lqpn: local qp number
 * @rqpn: remote qp number
 *
 * The two qpns are multiplied into a 64 bit product, the product is
 * XOR-folded onto itself (shifted by 20 and then by 40 bits) and the result
 * is masked with IB_GRH_FLOWLABEL_MASK to give a 20 bit value.
 *
 * Because the product does not depend on the order of its operands, the
 * flow label is symmetric: both the requester and the responder calculate
 * the same value for a given connection.
 *
 * This helper function should be used by a driver when the upper layer
 * provides a zero flow_label value. This is to improve entropy of RDMA
 * traffic in the network.
 */
static inline u32 rdma_calc_flow_label(u32 lqpn, u32 rqpn)
{
        u64 folded = (u64)lqpn * rqpn;

        folded ^= folded >> 20;
        folded ^= folded >> 40;

        return (u32)(folded & IB_GRH_FLOWLABEL_MASK);
}

/**
 * rdma_get_udp_sport - Calculate the UDP source port based on the flow
 *                      label. If the flow label is zero, it is first
 *                      calculated from lqpn/rqpn.
 *
 * @fl:                 flow label from GRH
 * @lqpn:               local qp number
 * @rqpn:               remote qp number
 *
 * Return: the port computed by rdma_flow_label_to_udp_sport().
 */
static inline u16 rdma_get_udp_sport(u32 fl, u32 lqpn, u32 rqpn)
{
        u32 label = fl ? fl : rdma_calc_flow_label(lqpn, rqpn);

        return rdma_flow_label_to_udp_sport(label);
}

const struct ib_port_immutable*
ib_port_immutable_read(struct ib_device *dev, unsigned int port);

/** ib_add_sub_device - Add a sub IB device on an existing one
 *
 * @parent: The IB device that needs to add a sub device
 * @type: The type of the new sub device
 * @name: The name of the new sub device
 *
 *
 * Return 0 on success, an error code otherwise
 */
int ib_add_sub_device(struct ib_device *parent,
                      enum rdma_nl_dev_type type,
                      const char *name);


/** ib_del_sub_device_and_put - Delete an IB sub device while holding a 'get'
 *
 * @sub: The sub device that is going to be deleted
 *
 * Return 0 on success, an error code otherwise
 */
int ib_del_sub_device_and_put(struct ib_device *sub);

/* Set @ibdev's name_assign_type to RDMA_NAME_ASSIGN_TYPE_USER. */
static inline void ib_mark_name_assigned_by_user(struct ib_device *ibdev)
{
        ibdev->name_assign_type = RDMA_NAME_ASSIGN_TYPE_USER;
}

#endif /* IB_VERBS_H */



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_CLOCK_H
#define _LINUX_SCHED_CLOCK_H

#include <linux/smp.h>

/*
 * Do not use outside of architecture code which knows its limitations.
 *
 * sched_clock() makes no promise of monotonicity or bounded drift between
 * CPUs; using it (which you should not) requires disabling IRQs.
 *
 * Please use one of the three interfaces below.
 */
extern u64 sched_clock(void);

#if defined(CONFIG_ARCH_WANTS_NO_INSTR) || defined(CONFIG_GENERIC_SCHED_CLOCK)
extern u64 sched_clock_noinstr(void);
#else
/*
 * Fallback used when neither CONFIG_ARCH_WANTS_NO_INSTR nor
 * CONFIG_GENERIC_SCHED_CLOCK is defined: simply forwards to sched_clock().
 */
static __always_inline u64 sched_clock_noinstr(void)
{
        return sched_clock();
}
#endif

/*
 * See the comment in kernel/sched/clock.c
 */
extern u64 running_clock(void);
extern u64 sched_clock_cpu(int cpu);


extern void sched_clock_init(void);

#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/* No-op when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set. */
static inline void sched_clock_tick(void)
{
}

/* No-op when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set. */
static inline void clear_sched_clock_stable(void)
{
}

/* No-op when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set. */
static inline void sched_clock_idle_sleep_event(void)
{
}

/* No-op when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set. */
static inline void sched_clock_idle_wakeup_event(void)
{
}

/*
 * Variant used when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set: the @cpu
 * argument is ignored and sched_clock() is returned directly.
 */
static inline u64 cpu_clock(int cpu)
{
        return sched_clock();
}

/*
 * Variant used when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set: forwards
 * to sched_clock_noinstr().
 */
static __always_inline u64 local_clock_noinstr(void)
{
        return sched_clock_noinstr();
}

/*
 * Variant used when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is not set: forwards
 * to sched_clock().
 */
static __always_inline u64 local_clock(void)
{
        return sched_clock();
}
#else
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);

/*
 * When sched_clock_stable(), __sched_clock_offset provides the offset
 * between local_clock() and sched_clock().
 */
extern u64 __sched_clock_offset;

extern void sched_clock_tick(void);
extern void sched_clock_tick_stable(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(void);

/*
 * As outlined in clock.c, provides a fast, high resolution, nanosecond
 * time source that is monotonic per cpu argument and has bounded drift
 * between cpus.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 *
 * This variant is used when CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set and
 * forwards to sched_clock_cpu(@cpu).
 */
static inline u64 cpu_clock(int cpu)
{
        return sched_clock_cpu(cpu);
}

extern u64 local_clock_noinstr(void);
extern u64 local_clock(void);

#endif

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
 * The reason for this explicit opt-in is not to have perf penalty with
 * slow sched_clocks.
 */
extern void enable_sched_clock_irqtime(void);
extern void disable_sched_clock_irqtime(void);
#else
/* No-op when CONFIG_IRQ_TIME_ACCOUNTING is not defined. */
static inline void enable_sched_clock_irqtime(void) {}
/* No-op when CONFIG_IRQ_TIME_ACCOUNTING is not defined. */
static inline void disable_sched_clock_irqtime(void) {}
#endif

#endif /* _LINUX_SCHED_CLOCK_H */
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Mutexes: blocking mutual exclusion locks
 *
 * started by Ingo Molnar:
 *
 *  Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * This file contains the main data structure and API definitions.
 */
#ifndef __LINUX_MUTEX_H
#define __LINUX_MUTEX_H

#include <asm/current.h>
#include <linux/list.h>
#include <linux/spinlock_types.h>
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
#include <linux/osq_lock.h>
#include <linux/debug_locks.h>
#include <linux/cleanup.h>
#include <linux/mutex_types.h>

struct device;

/*
 * Initializer fragment for a mutex's dep_map member.  With
 * CONFIG_DEBUG_LOCK_ALLOC it expands to a leading comma followed by a
 * designated initializer that names the map after @lockname and sets
 * wait_type_inner to LD_WAIT_SLEEP; otherwise it expands to nothing.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)                        \
                , .dep_map = {                                        \
                        .name = #lockname,                        \
                        .wait_type_inner = LD_WAIT_SLEEP,        \
                }
#else
# define __DEP_MAP_MUTEX_INITIALIZER(lockname)
#endif

#ifdef CONFIG_DEBUG_MUTEXES

/*
 * CONFIG_DEBUG_MUTEXES variant: initializer fragment (with leading comma)
 * that points the mutex's magic member at the mutex itself.
 */
# define __DEBUG_MUTEX_INITIALIZER(lockname)                                \
        , .magic = &lockname

extern void mutex_destroy(struct mutex *lock);

#else

# define __DEBUG_MUTEX_INITIALIZER(lockname)

/* Without CONFIG_DEBUG_MUTEXES, destroying a mutex is a no-op. */
static inline void mutex_destroy(struct mutex *lock) {}

#endif

/**
 * mutex_init - initialize the mutex
 * @mutex: the mutex to be initialized
 *
 * Initialize the mutex to unlocked state.
 *
 * It is not allowed to initialize an already locked mutex.
 *
 * Each expansion of this macro declares its own static struct
 * lock_class_key and passes it, together with the stringified @mutex
 * expression as the name, to __mutex_init().
 */
#define mutex_init(mutex)                                                \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __mutex_init((mutex), #mutex, &__key);                                \
} while (0)

/**
 * mutex_init_with_key - initialize a mutex with a given lockdep key
 * @mutex: the mutex to be initialized
 * @key: the lockdep key to be associated with the mutex
 *
 * Initialize the mutex to the unlocked state.
 *
 * It is not allowed to initialize an already locked mutex.
 *
 * Expands to __mutex_init() with the stringified @mutex expression as the
 * name and the caller-supplied @key.
 */
#define mutex_init_with_key(mutex, key) __mutex_init((mutex), #mutex, (key))

#ifndef CONFIG_PREEMPT_RT
/*
 * Static initializer for a struct mutex named @lockname (non PREEMPT_RT
 * variant): owner is initialized to 0, wait_lock to an unlocked raw
 * spinlock, wait_list to an empty list head, followed by the optional
 * debug and lockdep fragments.
 */
#define __MUTEX_INITIALIZER(lockname) \
                { .owner = ATOMIC_LONG_INIT(0) \
                , .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(lockname.wait_lock) \
                , .wait_list = LIST_HEAD_INIT(lockname.wait_list) \
                __DEBUG_MUTEX_INITIALIZER(lockname) \
                __DEP_MAP_MUTEX_INITIALIZER(lockname) }

/* Define a struct mutex named @mutexname, statically initialized. */
#define DEFINE_MUTEX(mutexname) \
        struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)

extern void __mutex_init(struct mutex *lock, const char *name,
                         struct lock_class_key *key);

/**
 * mutex_is_locked - is the mutex locked
 * @lock: the mutex to be queried
 *
 * Returns true if the mutex is locked, false if unlocked.
 */
extern bool mutex_is_locked(struct mutex *lock);

#else /* !CONFIG_PREEMPT_RT */
/*
 * Preempt-RT variant based on rtmutexes.
 */

/*
 * Static initializer for a struct mutex named @mutexname (PREEMPT_RT
 * variant): initializes the embedded rtmutex, followed by the optional
 * lockdep fragment.
 */
#define __MUTEX_INITIALIZER(mutexname)                                        \
{                                                                        \
        .rtmutex = __RT_MUTEX_BASE_INITIALIZER(mutexname.rtmutex)        \
        __DEP_MAP_MUTEX_INITIALIZER(mutexname)                                \
}

/* Define a struct mutex named @mutexname, statically initialized. */
#define DEFINE_MUTEX(mutexname)                                                \
        struct mutex mutexname = __MUTEX_INITIALIZER(mutexname)

extern void __mutex_rt_init(struct mutex *lock, const char *name,
                            struct lock_class_key *key);

/* PREEMPT_RT variant: query the lock state of the embedded rtmutex of @l. */
#define mutex_is_locked(l)        rt_mutex_base_is_locked(&(l)->rtmutex)

/*
 * PREEMPT_RT variant of __mutex_init(): initializes the embedded rtmutex
 * first and then calls __mutex_rt_init() with @name and @key.
 */
#define __mutex_init(mutex, name, key)                        \
do {                                                        \
        rt_mutex_base_init(&(mutex)->rtmutex);                \
        __mutex_rt_init((mutex), name, key);                \
} while (0)

#endif /* CONFIG_PREEMPT_RT */

#ifdef CONFIG_DEBUG_MUTEXES

int __devm_mutex_init(struct device *dev, struct mutex *lock);

#else

/*
 * Stub used when CONFIG_DEBUG_MUTEXES is off: does nothing and always
 * returns 0.
 */
static inline int __devm_mutex_init(struct device *dev, struct mutex *lock)
{
        /*
         * When CONFIG_DEBUG_MUTEXES is off, mutex_destroy() is just a nop,
         * so there is no real need to register it in the devm subsystem.
         */
        return 0;
}

#endif

/*
 * Initialize @mutex with mutex_init() and then call __devm_mutex_init() for
 * @dev.  The macro evaluates to the return value of __devm_mutex_init().
 * @mutex is evaluated only once, through the local copy mutex_.
 */
#define devm_mutex_init(dev, mutex)                        \
({                                                        \
        typeof(mutex) mutex_ = (mutex);                        \
                                                        \
        mutex_init(mutex_);                                \
        __devm_mutex_init(dev, mutex_);                        \
})

/*
 * See kernel/locking/mutex.c for detailed documentation of these APIs.
 * Also see Documentation/locking/mutex-design.rst.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void mutex_lock_nested(struct mutex *lock, unsigned int subclass);
extern void _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock);

extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock,
                                        unsigned int subclass);
extern int __must_check mutex_lock_killable_nested(struct mutex *lock,
                                        unsigned int subclass);
extern void mutex_lock_io_nested(struct mutex *lock, unsigned int subclass);

/*
 * With CONFIG_DEBUG_LOCK_ALLOC the plain lock operations map onto their
 * _nested counterparts with subclass 0.
 */
#define mutex_lock(lock) mutex_lock_nested(lock, 0)
#define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0)
#define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0)
#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0)

/*
 * Lock @lock, passing the dep_map of @nest_lock to _mutex_lock_nest_lock().
 * The typecheck() verifies at build time that @nest_lock has a dep_map
 * member of type struct lockdep_map.
 */
#define mutex_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map);                \
} while (0)

#else
extern void mutex_lock(struct mutex *lock);
extern int __must_check mutex_lock_interruptible(struct mutex *lock);
extern int __must_check mutex_lock_killable(struct mutex *lock);
extern void mutex_lock_io(struct mutex *lock);

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC the nested and nest_lock variants drop
 * their extra argument and map onto the plain lock operations.
 */
# define mutex_lock_nested(lock, subclass) mutex_lock(lock)
# define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
# define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock)
# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock)
# define mutex_lock_io_nested(lock, subclass) mutex_lock_io(lock)
#endif

/*
 * NOTE: mutex_trylock() follows the spin_trylock() convention,
 *       not the down_trylock() convention!
 *
 * Returns 1 if the mutex has been acquired successfully, and 0 on contention.
 */
extern int mutex_trylock(struct mutex *lock);
extern void mutex_unlock(struct mutex *lock);

extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);

/*
 * Guard definitions for struct mutex (see <linux/cleanup.h>): the base
 * guard pairs mutex_lock() with mutex_unlock(); the _try and _intr
 * conditional variants are built on mutex_trylock() and
 * mutex_lock_interruptible() == 0 respectively.
 */
DEFINE_GUARD(mutex, struct mutex *, mutex_lock(_T), mutex_unlock(_T))
DEFINE_GUARD_COND(mutex, _try, mutex_trylock(_T))
DEFINE_GUARD_COND(mutex, _intr, mutex_lock_interruptible(_T) == 0)

extern unsigned long mutex_get_owner(struct mutex *lock);

#endif /* __LINUX_MUTEX_H */





























































  120 






  168 




  167 

  167 











  168 




  168 

  167 











  178 




  178 


  178 













  129 




  128 


  128 



















   71 








  208 
  209 












  209 










  209 





















































    2 
















    2 








































































































  166 











  105 














  105 

    3 


  166 



    2 





    2 
    2 






   79 







   78 
    1 
   79 




  169 







  169 









  168 



  169 


  110 



   60 





























  166 





  166 



  126 







  127 




  127 







  127 























  127 






  127 







  127 





  127 



  127 








































  164 







  164 




  166 





  166 




  166 








  166 
  165 









  166 



  166 






  166 















  109 
   59 



  165 













































































































  166 
















  166 






  165 











  166 




























  165 



  166 









  165 


  166 

  166 








  166 
  166 

  164 


  166 



  103 











    2 
  104 



  127 










  127 










  127 

  127 


  127 















































  106 


    2 


  102 






  106 






  105 



   72 











   72 







   72 






   69 
    3 



    2 


    1 



   72 

   72 






























   72 






    1 
   71 
   71 







  246 



















    5 




    1 






    1 



    1 










    1 



    1 




















    2 






    4 


    1 


    1 



    1 

    1 
























    2 







    1 




































































































































































































































































































































   12 












   58 





   58 









   58 


   58 




























  165 




    1 



  108 






   58 




















   58 















    4 



    1 


    3 


    1 




    1 





    1 






























    3 




    1 

















    2 



    1 













    4 


    1 




    1 
    2 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 ARM Ltd.
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <linux/cpu.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/uaccess.h>

#include <clocksource/arm_arch_timer.h>
#include <asm/arch_timer.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_nested.h>

#include <kvm/arm_vgic.h>
#include <kvm/arm_arch_timer.h>

#include "trace.h"

/*
 * Counter handle: kvm_phys_timer_read() reads the raw counter through
 * timecounter->cc, and the cycle->ns conversions use the same object.
 * NOTE(review): where this pointer is initialised is not visible here.
 */
static struct timecounter *timecounter;
/* Host IRQ that kvm_arch_timer_handler() treats as the virtual timer's. */
static unsigned int host_vtimer_irq;
/* Presumably the host IRQ of the physical timer -- confirm at the init site. */
static unsigned int host_ptimer_irq;
/* Flags handed to enable_percpu_irq() when host_vtimer_irq is re-enabled. */
static u32 host_vtimer_irq_flags;
/* Presumably the ptimer counterpart of host_vtimer_irq_flags -- confirm. */
static u32 host_ptimer_irq_flags;

/*
 * has_gic_active_state: when this key is off and the irqchip lives in
 * userspace, the interrupt handler disables host_vtimer_irq.
 * broken_cntvoff_key: non-static key; presumably what has_broken_cntvoff()
 * tests -- confirm against the header.
 */
static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
DEFINE_STATIC_KEY_FALSE(broken_cntvoff_key);

/* Per-timer default interrupt numbers, indexed by the TIMER_* enumerators. */
static const u8 default_ppi[] = {
        [TIMER_PTIMER]  = 30,
        [TIMER_VTIMER]  = 27,
        [TIMER_HPTIMER] = 26,
        [TIMER_HVTIMER] = 28,
};

/* Forward declarations of static helpers that are used before they are defined. */
static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx);
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
                                 struct arch_timer_context *timer_ctx);
static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx);
static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
                                struct arch_timer_context *timer,
                                enum kvm_arch_timer_regs treg,
                                u64 val);
static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
                              struct arch_timer_context *timer,
                              enum kvm_arch_timer_regs treg);
static bool kvm_arch_timer_get_input_level(int vintid);

/* irq_ops passed to kvm_vgic_map_phys_irq() when a timer IRQ is mapped. */
static struct irq_ops arch_timer_irq_ops = {
        .get_input_level = kvm_arch_timer_get_input_level,
};

/*
 * Number of timer contexts to walk for @vcpu: NR_KVM_TIMERS when
 * vcpu_has_nv() is true, NR_KVM_EL0_TIMERS otherwise.
 */
static int nr_timers(struct kvm_vcpu *vcpu)
{
        return vcpu_has_nv(vcpu) ? NR_KVM_TIMERS : NR_KVM_EL0_TIMERS;
}

/*
 * Return the control register value stored in the vcpu's sysreg state for
 * the timer described by @ctxt. An unknown timer index triggers a WARN and
 * yields 0.
 */
u32 timer_get_ctl(struct arch_timer_context *ctxt)
{
        /* Keep a dedicated local: the sysreg accessor is a macro. */
        struct kvm_vcpu *vcpu = ctxt->vcpu;
        u32 ctl = 0;

        switch (arch_timer_ctx_index(ctxt)) {
        case TIMER_VTIMER:
                ctl = __vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
                break;
        case TIMER_PTIMER:
                ctl = __vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
                break;
        case TIMER_HVTIMER:
                ctl = __vcpu_sys_reg(vcpu, CNTHV_CTL_EL2);
                break;
        case TIMER_HPTIMER:
                ctl = __vcpu_sys_reg(vcpu, CNTHP_CTL_EL2);
                break;
        default:
                WARN_ON(1);
                break;
        }

        return ctl;
}

/*
 * Return the compare value stored in the vcpu's sysreg state for the timer
 * described by @ctxt. An unknown timer index triggers a WARN and yields 0.
 */
u64 timer_get_cval(struct arch_timer_context *ctxt)
{
        /* Keep a dedicated local: the sysreg accessor is a macro. */
        struct kvm_vcpu *vcpu = ctxt->vcpu;
        u64 cval = 0;

        switch (arch_timer_ctx_index(ctxt)) {
        case TIMER_VTIMER:
                cval = __vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
                break;
        case TIMER_PTIMER:
                cval = __vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
                break;
        case TIMER_HVTIMER:
                cval = __vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2);
                break;
        case TIMER_HPTIMER:
                cval = __vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);
                break;
        default:
                WARN_ON(1);
                break;
        }

        return cval;
}

/*
 * Store @ctl as the control register value of the timer described by
 * @ctxt in the vcpu's sysreg state. An unknown timer index triggers a
 * WARN and nothing is written.
 */
static void timer_set_ctl(struct arch_timer_context *ctxt, u32 ctl)
{
        /* Keep a dedicated local: the sysreg accessor is a macro. */
        struct kvm_vcpu *vcpu = ctxt->vcpu;
        u64 *slot;

        switch (arch_timer_ctx_index(ctxt)) {
        case TIMER_VTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTV_CTL_EL0);
                break;
        case TIMER_PTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTP_CTL_EL0);
                break;
        case TIMER_HVTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTHV_CTL_EL2);
                break;
        case TIMER_HPTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTHP_CTL_EL2);
                break;
        default:
                WARN_ON(1);
                return;
        }

        *slot = ctl;
}

/*
 * Store @cval as the compare value of the timer described by @ctxt in the
 * vcpu's sysreg state. An unknown timer index triggers a WARN and nothing
 * is written.
 */
static void timer_set_cval(struct arch_timer_context *ctxt, u64 cval)
{
        /* Keep a dedicated local: the sysreg accessor is a macro. */
        struct kvm_vcpu *vcpu = ctxt->vcpu;
        u64 *slot;

        switch (arch_timer_ctx_index(ctxt)) {
        case TIMER_VTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTV_CVAL_EL0);
                break;
        case TIMER_PTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTP_CVAL_EL0);
                break;
        case TIMER_HVTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTHV_CVAL_EL2);
                break;
        case TIMER_HPTIMER:
                slot = &__vcpu_sys_reg(vcpu, CNTHP_CVAL_EL2);
                break;
        default:
                WARN_ON(1);
                return;
        }

        *slot = cval;
}

/*
 * Publish @offset through the context's vm_offset pointer. When the
 * context has no such pointer, nothing is stored and a non-zero @offset
 * triggers a WARN naming the timer index.
 */
static void timer_set_offset(struct arch_timer_context *ctxt, u64 offset)
{
        if (ctxt->offset.vm_offset) {
                WRITE_ONCE(*ctxt->offset.vm_offset, offset);
                return;
        }

        WARN(offset, "timer %ld\n", arch_timer_ctx_index(ctxt));
}

/*
 * Return the current counter value by invoking the read() callback of the
 * cyclecounter attached to the file-level timecounter.
 */
u64 kvm_phys_timer_read(void)
{
        return timecounter->cc->read(timecounter->cc);
}

/*
 * Fill @map with the direct/emulated role of each timer context of @vcpu:
 *
 *  - NV, hyp context:     hvtimer/hptimer direct, vtimer/ptimer emulated
 *  - NV, non-hyp context: vtimer/ptimer direct, hvtimer/hptimer emulated
 *  - no NV, VHE host:     vtimer/ptimer direct, nothing emulated
 *  - no NV, non-VHE host: vtimer direct, ptimer emulated
 *
 * The resulting map is reported through the kvm_get_timer_map tracepoint.
 */
void get_timer_map(struct kvm_vcpu *vcpu, struct timer_map *map)
{
        if (!vcpu_has_nv(vcpu)) {
                /* The vtimer is direct and never emulated without NV. */
                map->direct_vtimer = vcpu_vtimer(vcpu);
                map->emul_vtimer = NULL;

                if (has_vhe()) {
                        map->direct_ptimer = vcpu_ptimer(vcpu);
                        map->emul_ptimer = NULL;
                } else {
                        map->direct_ptimer = NULL;
                        map->emul_ptimer = vcpu_ptimer(vcpu);
                }
        } else if (is_hyp_ctxt(vcpu)) {
                map->direct_vtimer = vcpu_hvtimer(vcpu);
                map->direct_ptimer = vcpu_hptimer(vcpu);
                map->emul_vtimer = vcpu_vtimer(vcpu);
                map->emul_ptimer = vcpu_ptimer(vcpu);
        } else {
                map->direct_vtimer = vcpu_vtimer(vcpu);
                map->direct_ptimer = vcpu_ptimer(vcpu);
                map->emul_vtimer = vcpu_hvtimer(vcpu);
                map->emul_ptimer = vcpu_hptimer(vcpu);
        }

        trace_kvm_get_timer_map(vcpu->vcpu_id, map);
}

/*
 * True when @kvm does not use an in-kernel irqchip. The result is hinted
 * to the compiler as the unlikely case.
 */
static inline bool userspace_irqchip(struct kvm *kvm)
{
        return unlikely(!irqchip_in_kernel(kvm));
}

/*
 * Arm @hrt to expire @ns nanoseconds after the current ktime_get()
 * reading. The expiry is passed as an absolute time with
 * HRTIMER_MODE_ABS_HARD.
 */
static void soft_timer_start(struct hrtimer *hrt, u64 ns)
{
        hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns),
                      HRTIMER_MODE_ABS_HARD);
}

/* Cancel @hrt via hrtimer_cancel(); its return value is not used. */
static void soft_timer_cancel(struct hrtimer *hrt)
{
        hrtimer_cancel(hrt);
}

/*
 * Host interrupt handler for the guest timers. @dev_id points at a slot
 * holding the vcpu pointer. The directly-mapped timer matching @irq has
 * its interrupt line raised if it should fire. Always returns IRQ_HANDLED.
 */
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
{
        struct kvm_vcpu **slot = dev_id;
        struct kvm_vcpu *vcpu = *slot;
        struct arch_timer_context *ctx;
        struct timer_map map;

        /*
         * A timer interrupt can still show up after vcpu_put() has set the
         * CPU's vcpu pointer to NULL: timer_save_state() disabled the
         * timer, but the hardware interrupt signal may not have been
         * retired from the interrupt controller yet.
         */
        if (!vcpu)
                return IRQ_HANDLED;

        get_timer_map(vcpu, &map);

        ctx = (irq == host_vtimer_irq) ? map.direct_vtimer : map.direct_ptimer;

        if (kvm_timer_should_fire(ctx))
                kvm_timer_update_irq(vcpu, true, ctx);

        /* Userspace irqchip without GIC active state: mask the host vtimer IRQ. */
        if (userspace_irqchip(vcpu->kvm) &&
            !static_branch_unlikely(&has_gic_active_state))
                disable_percpu_irq(host_vtimer_irq);

        return IRQ_HANDLED;
}

/*
 * Return how far in the future counter value @val lies for @timer_ctx,
 * converted to nanoseconds with cyclecounter_cyc2ns(). The timer's view of
 * "now" is the physical counter minus the context's offset. Returns 0 when
 * @val has already been reached.
 */
static u64 kvm_counter_compute_delta(struct arch_timer_context *timer_ctx,
                                     u64 val)
{
        u64 now = kvm_phys_timer_read() - timer_get_offset(timer_ctx);

        if (now >= val)
                return 0;

        return cyclecounter_cyc2ns(timecounter->cc,
                                   val - now,
                                   timecounter->mask,
                                   &timer_ctx->ns_frac);
}

/*
 * Nanoseconds until @timer_ctx's saved compare value is reached, or 0 if
 * it has already been reached (see kvm_counter_compute_delta()).
 */
static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
{
        return kvm_counter_compute_delta(timer_ctx, timer_get_cval(timer_ctx));
}

static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx)
{
        WARN_ON(timer_ctx && timer_ctx->loaded);
        return timer_ctx &&
                ((timer_get_ctl(timer_ctx) &
                  (ARCH_TIMER_CTRL_IT_MASK | ARCH_TIMER_CTRL_ENABLE)) == ARCH_TIMER_CTRL_ENABLE);
}

static bool vcpu_has_wfit_active(struct kvm_vcpu *vcpu)
{
        return (cpus_have_final_cap(ARM64_HAS_WFXT) &&
                vcpu_get_flag(vcpu, IN_WFIT));
}

/*
 * Nanoseconds until the counter value held in the guest register selected
 * by kvm_vcpu_sys_get_rt() is reached. The value is measured against the
 * hvtimer context when @vcpu is in hyp context, the vtimer otherwise.
 */
static u64 wfit_delay_ns(struct kvm_vcpu *vcpu)
{
        u64 deadline = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));
        struct arch_timer_context *ctx;

        if (is_hyp_ctxt(vcpu))
                ctx = vcpu_hvtimer(vcpu);
        else
                ctx = vcpu_vtimer(vcpu);

        return kvm_counter_compute_delta(ctx, deadline);
}

/*
 * Compute the shortest time, in ns, before any of the guest timers of
 * @vcpu expires; an active WFIT deadline is considered as well.
 * Returns 0 when no timer can fire.
 */
static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu)
{
        u64 earliest = ULLONG_MAX;
        int count = nr_timers(vcpu);
        int i;

        for (i = 0; i < count; i++) {
                struct arch_timer_context *ctx = &vcpu->arch.timer_cpu.timers[i];
                u64 delta;

                WARN(ctx->loaded, "timer %d loaded\n", i);
                if (!kvm_timer_irq_can_fire(ctx))
                        continue;

                delta = kvm_timer_compute_delta(ctx);
                if (delta < earliest)
                        earliest = delta;
        }

        if (vcpu_has_wfit_active(vcpu)) {
                u64 delta = wfit_delay_ns(vcpu);

                if (delta < earliest)
                        earliest = delta;
        }

        /* Nothing lowered the sentinel: no timer can fire, report 0. */
        return earliest == ULLONG_MAX ? 0 : earliest;
}

/*
 * hrtimer callback for the per-vcpu background timer. Re-arms the timer
 * if the guest's earliest deadline still lies in the future, otherwise
 * wakes the vcpu up.
 */
static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
{
        struct arch_timer_cpu *timer = container_of(hrt, struct arch_timer_cpu,
                                                    bg_timer);
        struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
                                             arch.timer_cpu);
        u64 remaining;

        /*
         * Make sure the timer has really expired from the guest's point
         * of view: NTP on the host may have made it expire early. If
         * there is time left, go back to sleep for that long.
         */
        remaining = kvm_timer_earliest_exp(vcpu);
        if (unlikely(remaining)) {
                hrtimer_forward_now(hrt, ns_to_ktime(remaining));
                return HRTIMER_RESTART;
        }

        kvm_vcpu_wake_up(vcpu);

        return HRTIMER_NORESTART;
}

/*
 * hrtimer callback for a per-context soft timer. Re-arms the timer if the
 * context's deadline still lies in the future, otherwise raises the
 * timer's interrupt line.
 */
static enum hrtimer_restart kvm_hrtimer_expire(struct hrtimer *hrt)
{
        struct arch_timer_context *ctx = container_of(hrt,
                                                      struct arch_timer_context,
                                                      hrtimer);
        struct kvm_vcpu *vcpu = ctx->vcpu;
        u64 remaining;

        trace_kvm_timer_hrtimer_expire(ctx);

        /*
         * Make sure the timer has really expired from the guest's point
         * of view: NTP on the host may have made it expire early. If it
         * is not due yet, push the expiry out by the remaining time.
         */
        remaining = kvm_timer_compute_delta(ctx);
        if (unlikely(remaining)) {
                hrtimer_forward_now(hrt, ns_to_ktime(remaining));
                return HRTIMER_RESTART;
        }

        kvm_timer_update_irq(vcpu, true, ctx);

        return HRTIMER_NORESTART;
}

static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx)
{
        enum kvm_arch_timers index;
        u64 cval, now;

        if (!timer_ctx)
                return false;

        index = arch_timer_ctx_index(timer_ctx);

        if (timer_ctx->loaded) {
                u32 cnt_ctl = 0;

                switch (index) {
                case TIMER_VTIMER:
                case TIMER_HVTIMER:
                        cnt_ctl = read_sysreg_el0(SYS_CNTV_CTL);
                        break;
                case TIMER_PTIMER:
                case TIMER_HPTIMER:
                        cnt_ctl = read_sysreg_el0(SYS_CNTP_CTL);
                        break;
                case NR_KVM_TIMERS:
                        /* GCC is braindead */
                        cnt_ctl = 0;
                        break;
                }

                return  (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) &&
                        (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) &&
                       !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK);
        }

        if (!kvm_timer_irq_can_fire(timer_ctx))
                return false;

        cval = timer_get_cval(timer_ctx);
        now = kvm_phys_timer_read() - timer_get_offset(timer_ctx);

        return cval <= now;
}

/*
 * Return 1 when @vcpu has an active WFIT whose deadline has already been
 * reached (no delay left), 0 otherwise.
 */
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
        if (!vcpu_has_wfit_active(vcpu))
                return 0;

        return !wfit_delay_ns(vcpu);
}

/*
 * Mirror the output level of the EL1 virtual and physical timers into the
 * device_irq_level bitmap of the vcpu's kvm_run sync registers.
 */
void kvm_timer_update_run(struct kvm_vcpu *vcpu)
{
        struct kvm_sync_regs *sregs = &vcpu->run->s.regs;

        /* Start with both timer bits cleared... */
        sregs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER |
                                     KVM_ARM_DEV_EL1_PTIMER);

        /* ...then set the bit of each timer that should fire. */
        if (kvm_timer_should_fire(vcpu_vtimer(vcpu)))
                sregs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER;

        if (kvm_timer_should_fire(vcpu_ptimer(vcpu)))
                sregs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER;
}

/*
 * Reflect @level into the interrupt status bit of @ctx's saved control
 * value. This is only done for the vtimer/ptimer contexts while the vcpu
 * is in hyp context.
 */
static void kvm_timer_update_status(struct arch_timer_context *ctx, bool level)
{
        struct kvm_vcpu *vcpu = ctx->vcpu;
        unsigned long ctl;

        if (!is_hyp_ctxt(vcpu))
                return;

        if (ctx != vcpu_vtimer(vcpu) && ctx != vcpu_ptimer(vcpu))
                return;

        /*
         * Work around NV2 brokenness by publishing the interrupt status
         * bit ourselves. Emulation quality remains poor (guest writes
         * have no effect until the next exit).
         *
         * But hey, it's fast, right?
         */
        ctl = timer_get_ctl(ctx);
        __assign_bit(__ffs(ARCH_TIMER_CTRL_IT_STAT), &ctl, level);
        timer_set_ctl(ctx, ctl);
}

/*
 * Set the interrupt output of @timer_ctx to @new_level: update the
 * published status bit, record the level, emit the tracepoint and, when
 * the irqchip is in the kernel, inject the level into the vgic.
 */
static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level,
                                 struct arch_timer_context *timer_ctx)
{
        kvm_timer_update_status(timer_ctx, new_level);

        timer_ctx->irq.level = new_level;
        trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_irq(timer_ctx),
                                   timer_ctx->irq.level);

        /* Nothing to inject when the irqchip lives in userspace. */
        if (!userspace_irqchip(vcpu->kvm))
                kvm_vgic_inject_irq(vcpu->kvm, vcpu,
                                    timer_irq(timer_ctx),
                                    timer_ctx->irq.level,
                                    timer_ctx);
}

/*
 * Drive a fully emulated timer (this is only called for those): sync its
 * interrupt line with the saved state, and arm a soft timer for the
 * deadline if the timer is able to fire but has not done so yet.
 */
static void timer_emulate(struct arch_timer_context *ctx)
{
        bool fire = kvm_timer_should_fire(ctx);

        trace_kvm_timer_emulate(ctx, fire);

        if (ctx->irq.level != fire)
                kvm_timer_update_irq(ctx->vcpu, fire, ctx);

        kvm_timer_update_status(ctx, fire);

        /*
         * A soft timer is only needed for a deadline in the future: not
         * when the timer already fires, and not when it cannot fire at all.
         */
        if (!fire && kvm_timer_irq_can_fire(ctx))
                soft_timer_start(&ctx->hrtimer, kvm_timer_compute_delta(ctx));
}

/*
 * Set the virtual counter offset to @cntvoff by invoking
 * __kvm_timer_set_cntvoff through kvm_call_hyp().
 */
static void set_cntvoff(u64 cntvoff)
{
        kvm_call_hyp(__kvm_timer_set_cntvoff, cntvoff);
}

/*
 * Write @cntpoff to SYS_CNTPOFF_EL2. This is a no-op when has_cntpoff()
 * reports that the register is not available.
 */
static void set_cntpoff(u64 cntpoff)
{
        if (!has_cntpoff())
                return;

        write_sysreg_s(cntpoff, SYS_CNTPOFF_EL2);
}

/*
 * Save the hardware state of a loaded timer context into the vcpu and
 * disable the hardware timer.
 *
 * Does nothing if the vcpu's timer is not enabled or if @ctx is not
 * currently loaded. The whole sequence runs with local interrupts
 * disabled. On completion ctx->loaded is false.
 */
static void timer_save_state(struct arch_timer_context *ctx)
{
        struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
        enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
        unsigned long flags;

        if (!timer->enabled)
                return;

        local_irq_save(flags);

        if (!ctx->loaded)
                goto out;

        switch (index) {
                u64 cval;

        case TIMER_VTIMER:
        case TIMER_HVTIMER:
                timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTV_CTL));
                cval = read_sysreg_el0(SYS_CNTV_CVAL);

                /*
                 * With a broken CNTVOFF the offset is folded into CVAL at
                 * restore time (see timer_restore_state()); undo that here.
                 */
                if (has_broken_cntvoff())
                        cval -= timer_get_offset(ctx);

                timer_set_cval(ctx, cval);

                /* Disable the timer */
                write_sysreg_el0(0, SYS_CNTV_CTL);
                isb();

                /*
                 * The kernel may decide to run userspace after
                 * calling vcpu_put, so we reset cntvoff to 0 to
                 * ensure a consistent read between user accesses to
                 * the virtual counter and kernel access to the
                 * physical counter of non-VHE case.
                 *
                 * For VHE, the virtual counter uses a fixed virtual
                 * offset of zero, so no need to zero CNTVOFF_EL2
                 * register, but this is actually useful when switching
                 * between EL1/vEL2 with NV.
                 *
                 * Do it unconditionally, as this is either unavoidable
                 * or dirt cheap.
                 */
                set_cntvoff(0);
                break;
        case TIMER_PTIMER:
        case TIMER_HPTIMER:
                timer_set_ctl(ctx, read_sysreg_el0(SYS_CNTP_CTL));
                cval = read_sysreg_el0(SYS_CNTP_CVAL);

                /* The restore path always adds the offset to CVAL; remove it. */
                cval -= timer_get_offset(ctx);

                timer_set_cval(ctx, cval);

                /* Disable the timer */
                write_sysreg_el0(0, SYS_CNTP_CTL);
                isb();

                set_cntpoff(0);
                break;
        case NR_KVM_TIMERS:
                /* Not a valid timer index. */
                BUG();
        }

        trace_kvm_timer_save_state(ctx);

        ctx->loaded = false;
out:
        local_irq_restore(flags);
}

/*
 * Schedule the background timer before calling kvm_vcpu_halt, so that this
 * thread is removed from its waitqueue and made runnable when there's a timer
 * interrupt to handle.
 */
static void kvm_timer_blocking(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);
        struct timer_map map;

        get_timer_map(vcpu, &map);

        /*
         * If no timers are capable of raising interrupts (disabled or
         * masked), then there's no more work for us to do.
         */
        if (!kvm_timer_irq_can_fire(map.direct_vtimer) &&
            !kvm_timer_irq_can_fire(map.direct_ptimer) &&
            !kvm_timer_irq_can_fire(map.emul_vtimer) &&
            !kvm_timer_irq_can_fire(map.emul_ptimer) &&
            !vcpu_has_wfit_active(vcpu))
                return;

        /*
         * At least one guest time will expire. Schedule a background timer.
         * Set the earliest expiration time among the guest timers.
         */
        soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu));
}

static void kvm_timer_unblocking(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);

        soft_timer_cancel(&timer->bg_timer);
}

/*
 * Load the saved state of a timer context into the hardware timer.
 *
 * Does nothing if the vcpu's timer is not enabled or if @ctx is already
 * loaded. The whole sequence runs with local interrupts disabled. On
 * completion ctx->loaded is true.
 */
static void timer_restore_state(struct arch_timer_context *ctx)
{
        struct arch_timer_cpu *timer = vcpu_timer(ctx->vcpu);
        enum kvm_arch_timers index = arch_timer_ctx_index(ctx);
        unsigned long flags;

        if (!timer->enabled)
                return;

        local_irq_save(flags);

        if (ctx->loaded)
                goto out;

        switch (index) {
                u64 cval, offset;

        case TIMER_VTIMER:
        case TIMER_HVTIMER:
                cval = timer_get_cval(ctx);
                offset = timer_get_offset(ctx);
                /*
                 * With a broken CNTVOFF, keep the hardware offset at 0 and
                 * fold the offset into the compare value instead.
                 */
                if (has_broken_cntvoff()) {
                        set_cntvoff(0);
                        cval += offset;
                } else {
                        set_cntvoff(offset);
                }
                /* Program the compare value before writing the control register. */
                write_sysreg_el0(cval, SYS_CNTV_CVAL);
                isb();
                write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTV_CTL);
                break;
        case TIMER_PTIMER:
        case TIMER_HPTIMER:
                cval = timer_get_cval(ctx);
                offset = timer_get_offset(ctx);
                set_cntpoff(offset);
                /* The offset is always added to the physical compare value. */
                cval += offset;
                write_sysreg_el0(cval, SYS_CNTP_CVAL);
                isb();
                write_sysreg_el0(timer_get_ctl(ctx), SYS_CNTP_CTL);
                break;
        case NR_KVM_TIMERS:
                /* Not a valid timer index. */
                BUG();
        }

        trace_kvm_timer_restore_state(ctx);

        ctx->loaded = true;
out:
        local_irq_restore(flags);
}

/*
 * Set the IRQCHIP_STATE_ACTIVE state of @ctx's host timer interrupt to
 * @active. WARNs if the irqchip reports an error.
 */
static inline void set_timer_irq_phys_active(struct arch_timer_context *ctx, bool active)
{
        int err = irq_set_irqchip_state(ctx->host_timer_irq,
                                        IRQCHIP_STATE_ACTIVE, active);

        WARN_ON(err);
}

static void kvm_timer_vcpu_load_gic(struct arch_timer_context *ctx)
{
        struct kvm_vcpu *vcpu = ctx->vcpu;
        bool phys_active = false;

        /*
         * Update the timer output so that it is likely to match the
         * state we're about to restore. If the timer expires between
         * this point and the register restoration, we'll take the
         * interrupt anyway.
         */
        kvm_timer_update_irq(ctx->vcpu, kvm_timer_should_fire(ctx), ctx);

        if (irqchip_in_kernel(vcpu->kvm))
                phys_active = kvm_vgic_map_is_active(vcpu, timer_irq(ctx));

        phys_active |= ctx->irq.level;

        set_timer_irq_phys_active(ctx, phys_active);
}

/*
 * Load-time handling of the vtimer interrupt: refresh the vtimer's output
 * level, then enable or disable the host vtimer interrupt to match it.
 */
static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu)
{
        struct arch_timer_context *vt = vcpu_vtimer(vcpu);

        /*
         * Bring the timer output in line with the state that is about to
         * be restored. Should the timer expire between now and the
         * register restoration, the interrupt will be taken anyway.
         */
        kvm_timer_update_irq(vcpu, kvm_timer_should_fire(vt), vt);

        /*
         * With a userspace irqchip and a host interrupt controller that
         * has no active state, the guest would exit continuously while
         * the virtual level is high. Mask the physical interrupt by
         * disabling it on the host interrupt controller in that case so
         * that the guest can make forward progress. Once the output
         * level is seen de-asserted, unmask it again so that the guest
         * exits when the timer fires.
         */
        if (!vt->irq.level)
                enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
        else
                disable_percpu_irq(host_vtimer_irq);
}

/*
 * Accumulate _bit into one of two masks: into _set when _pred is true,
 * into _clr when it is false.
 */
#define assign_clear_set_bit(_pred, _bit, _clr, _set) \
        do { \
                if (!(_pred)) \
                        (_clr) |= (_bit); \
                else \
                        (_set) |= (_bit); \
        } while (0)

/*
 * Re-target the vgic physical IRQ mappings after the direct/emulated
 * roles in @map have changed: if the direct vtimer's interrupt has no
 * mapping, unmap the emulated timers' interrupts and map the direct
 * timers' ones. Does nothing without an in-kernel irqchip.
 */
static void kvm_timer_vcpu_load_nested_switch(struct kvm_vcpu *vcpu,
                                              struct timer_map *map)
{
        int err;

        if (!irqchip_in_kernel(vcpu->kvm))
                return;

        /*
         * The vtimer irq is only ever unmapped on a VHE system running
         * nested virtualization, in which case emul_vtimer, emul_ptimer,
         * direct_vtimer and direct_ptimer are all valid.
         *
         * As this is called from kvm_timer_vcpu_load(), a change between
         * vEL2 and vEL1/0 has just happened and the timer_map already
         * reflects it, hence the emul/direct mappings are switched below.
         */
        if (kvm_vgic_get_map(vcpu, timer_irq(map->direct_vtimer)) >= 0)
                return;

        kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_vtimer));
        kvm_vgic_unmap_phys_irq(vcpu, timer_irq(map->emul_ptimer));

        err = kvm_vgic_map_phys_irq(vcpu,
                                    map->direct_vtimer->host_timer_irq,
                                    timer_irq(map->direct_vtimer),
                                    &arch_timer_irq_ops);
        WARN_ON_ONCE(err);

        err = kvm_vgic_map_phys_irq(vcpu,
                                    map->direct_ptimer->host_timer_irq,
                                    timer_irq(map->direct_ptimer),
                                    &arch_timer_irq_ops);
        WARN_ON_ONCE(err);
}

/*
 * Compute which timer/counter accesses must trap for @vcpu and program the
 * result into CNTHCTL_EL2 with a single clear/set update.
 *
 * Each local boolean records one trap requirement: tpt/tpc for the
 * EL1PCEN/EL1PCTEN bits, tvt/tvc for EL1TVT/EL1TVCT, and tvt02/tpt02 for
 * EL1NVVCT/EL1NVPCT. Requirements are only ever added as the function
 * progresses. Returns immediately on non-VHE hosts.
 */
static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
{
        bool tvt, tpt, tvc, tpc, tvt02, tpt02;
        u64 clr, set;

        /*
         * No trapping gets configured here with nVHE. See
         * __timer_enable_traps(), which is where the stuff happens.
         */
        if (!has_vhe())
                return;

        /*
         * Our default policy is not to trap anything. As we progress
         * within this function, reality kicks in and we start adding
         * traps based on emulation requirements.
         */
        tvt = tpt = tvc = tpc = false;
        tvt02 = tpt02 = false;

        /*
         * NV2 badly breaks the timer semantics by redirecting accesses to
         * the EL1 timer state to memory, so let's call ECV to the rescue if
         * available: we trap all CNT{P,V}_{CTL,CVAL,TVAL}_EL0 accesses.
         *
         * The treatment slightly varies depending on whether we run a nVHE
         * or VHE guest: nVHE will use the _EL0 registers directly, while VHE
         * will use the _EL02 accessors. This translates into different trap
         * bits.
         *
         * None of the trapping is required when running in non-HYP context,
         * unless required by the L1 hypervisor settings once we advertise
         * ECV+NV in the guest, or that we need trapping for other reasons.
         */
        if (cpus_have_final_cap(ARM64_HAS_ECV) && is_hyp_ctxt(vcpu)) {
                if (vcpu_el2_e2h_is_set(vcpu))
                        tvt02 = tpt02 = true;
                else
                        tvt = tpt = true;
        }

        /*
         * We have two possibilities to deal with a physical offset:
         *
         * - Either we have CNTPOFF (yay!) or the offset is 0:
         *   we let the guest freely access the HW
         *
         * - or neither of these conditions applies:
         *   we trap accesses to the HW, but still use it
         *   after correcting the physical offset
         */
        if (!has_cntpoff() && timer_get_offset(map->direct_ptimer))
                tpt = tpc = true;

        /*
         * For the poor sods that could not correctly subtract one value
         * from another, trap the full virtual timer and counter.
         */
        if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer))
                tvt = tvc = true;

        /*
         * Apply the enable bits that the guest hypervisor has requested for
         * its own guest. We can only add traps that wouldn't have been set
         * above.
         * Implementation choices: we do not support NV when E2H=0 in the
         * guest, and we don't support configuration where E2H is writable
         * by the guest (either FEAT_VHE or FEAT_E2H0 is implemented, but
         * not both). This simplifies the handling of the EL1NV* bits.
         */
        if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
                u64 val = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);

                /* Use the VHE format for mental sanity */
                if (!vcpu_el2_e2h_is_set(vcpu))
                        val = (val & (CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN)) << 10;

                /* A cleared enable bit in the guest's view adds a trap. */
                tpt |= !(val & (CNTHCTL_EL1PCEN << 10));
                tpc |= !(val & (CNTHCTL_EL1PCTEN << 10));

                tpt02 |= (val & CNTHCTL_EL1NVPCT);
                tvt02 |= (val & CNTHCTL_EL1NVVCT);
        }

        /*
         * Now that we have collected our requirements, compute the
         * trap and enable bits.
         */
        set = 0;
        clr = 0;

        /*
         * The first two calls pass (set, clr) where the remaining ones pass
         * (clr, set): for EL1PCEN/EL1PCTEN a true predicate lands the bit in
         * 'clr' and a false one in 'set', the opposite of the trap bits below.
         */
        assign_clear_set_bit(tpt, CNTHCTL_EL1PCEN << 10, set, clr);
        assign_clear_set_bit(tpc, CNTHCTL_EL1PCTEN << 10, set, clr);
        assign_clear_set_bit(tvt, CNTHCTL_EL1TVT, clr, set);
        assign_clear_set_bit(tvc, CNTHCTL_EL1TVCT, clr, set);
        assign_clear_set_bit(tvt02, CNTHCTL_EL1NVVCT, clr, set);
        assign_clear_set_bit(tpt02, CNTHCTL_EL1NVPCT, clr, set);

        /* This only happens on VHE, so use the CNTHCTL_EL2 accessor. */
        sysreg_clear_set(cnthctl_el2, clr, set);
}

void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);
        struct timer_map map;

        if (unlikely(!timer->enabled))
                return;

        get_timer_map(vcpu, &map);

        if (static_branch_likely(&has_gic_active_state)) {
                if (vcpu_has_nv(vcpu))
                        kvm_timer_vcpu_load_nested_switch(vcpu, &map);

                kvm_timer_vcpu_load_gic(map.direct_vtimer);
                if (map.direct_ptimer)
                        kvm_timer_vcpu_load_gic(map.direct_ptimer);
        } else {
                kvm_timer_vcpu_load_nogic(vcpu);
        }

        kvm_timer_unblocking(vcpu);

        timer_restore_state(map.direct_vtimer);
        if (map.direct_ptimer)
                timer_restore_state(map.direct_ptimer);
        if (map.emul_vtimer)
                timer_emulate(map.emul_vtimer);
        if (map.emul_ptimer)
                timer_emulate(map.emul_ptimer);

        timer_set_traps(vcpu, &map);
}

bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu)
{
        struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
        struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
        struct kvm_sync_regs *sregs = &vcpu->run->s.regs;
        bool vlevel, plevel;

        if (likely(irqchip_in_kernel(vcpu->kvm)))
                return false;

        vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER;
        plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER;

        return kvm_timer_should_fire(vtimer) != vlevel ||
               kvm_timer_should_fire(ptimer) != plevel;
}

void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);
        struct timer_map map;

        if (unlikely(!timer->enabled))
                return;

        get_timer_map(vcpu, &map);

        timer_save_state(map.direct_vtimer);
        if (map.direct_ptimer)
                timer_save_state(map.direct_ptimer);

        /*
         * Cancel soft timer emulation, because the only case where we
         * need it after a vcpu_put is in the context of a sleeping VCPU, and
         * in that case we already factor in the deadline for the physical
         * timer when scheduling the bg_timer.
         *
         * In any case, we re-schedule the hrtimer for the physical timer when
         * coming back to the VCPU thread in kvm_timer_vcpu_load().
         */
        if (map.emul_vtimer)
                soft_timer_cancel(&map.emul_vtimer->hrtimer);
        if (map.emul_ptimer)
                soft_timer_cancel(&map.emul_ptimer->hrtimer);

        if (kvm_vcpu_is_blocking(vcpu))
                kvm_timer_blocking(vcpu);
}

/*
 * Re-evaluate both emulated timers of @vcpu when the host lacks the
 * ARM64_HAS_ECV capability; does nothing otherwise.
 */
void kvm_timer_sync_nested(struct kvm_vcpu *vcpu)
{
        struct timer_map map;

        /*
         * When NV2 is on, guest hypervisors have their EL1 timer register
         * accesses redirected to the VNCR page. Any guest action taken on
         * the timer is postponed until the next exit, leading to a very
         * poor quality of emulation.
         *
         * This is an unmitigated disaster, only papered over by FEAT_ECV,
         * which allows trapping of the timer registers even with NV2.
         * Even so, this is still worse than FEAT_NV on its own. Meh.
         */
        if (cpus_have_final_cap(ARM64_HAS_ECV))
                return;

        /*
         * For a VHE guest hypervisor, the EL2 state is directly
         * stored in the host EL1 timers, while the emulated EL1
         * state is stored in the VNCR page. The latter could have
         * been updated behind our back, and we must reset the
         * emulation of the timers.
         *
         * A non-VHE guest hypervisor doesn't have any direct access
         * to its timers: the EL2 registers trap despite being
         * notionally direct (we use the EL1 HW, as for VHE), while
         * the EL1 registers access memory.
         *
         * In both cases, process the emulated timers on each guest
         * exit. Boo.
         */
        get_timer_map(vcpu, &map);

        /* Stop both backing hrtimers first, then re-run the emulation. */
        soft_timer_cancel(&map.emul_vtimer->hrtimer);
        soft_timer_cancel(&map.emul_ptimer->hrtimer);

        timer_emulate(map.emul_vtimer);
        timer_emulate(map.emul_ptimer);
}

/*
 * Userspace irqchip only: once the guest has de-asserted the virtual timer,
 * lower the line and unmask the timer interrupt on the host interrupt
 * controller again so that future timer signals are seen.
 */
static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu)
{
        struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);

        /* Still asserted by the guest: leave everything as it is. */
        if (kvm_timer_should_fire(vtimer))
                return;

        kvm_timer_update_irq(vcpu, false, vtimer);

        if (!static_branch_likely(&has_gic_active_state)) {
                enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);
                return;
        }

        set_timer_irq_phys_active(vtimer, false);
}

void kvm_timer_sync_user(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);

        if (unlikely(!timer->enabled))
                return;

        if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
                unmask_vtimer_irq_user(vcpu);
}

void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);
        struct timer_map map;

        get_timer_map(vcpu, &map);

        /*
         * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
         * and to 0 for ARMv7.  We provide an implementation that always
         * resets the timer to be disabled and unmasked and is compliant with
         * the ARMv7 architecture.
         */
        for (int i = 0; i < nr_timers(vcpu); i++)
                timer_set_ctl(vcpu_get_timer(vcpu, i), 0);

        /*
         * A vcpu running at EL2 is in charge of the offset applied to
         * the virtual timer, so use the physical VM offset, and point
         * the vcpu offset to CNTVOFF_EL2.
         */
        if (vcpu_has_nv(vcpu)) {
                struct arch_timer_offset *offs = &vcpu_vtimer(vcpu)->offset;

                offs->vcpu_offset = &__vcpu_sys_reg(vcpu, CNTVOFF_EL2);
                offs->vm_offset = &vcpu->kvm->arch.timer_data.poffset;
        }

        if (timer->enabled) {
                for (int i = 0; i < nr_timers(vcpu); i++)
                        kvm_timer_update_irq(vcpu, false,
                                             vcpu_get_timer(vcpu, i));

                if (irqchip_in_kernel(vcpu->kvm)) {
                        kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_vtimer));
                        if (map.direct_ptimer)
                                kvm_vgic_reset_mapped_irq(vcpu, timer_irq(map.direct_ptimer));
                }
        }

        if (map.emul_vtimer)
                soft_timer_cancel(&map.emul_vtimer->hrtimer);
        if (map.emul_ptimer)
                soft_timer_cancel(&map.emul_ptimer->hrtimer);
}

/*
 * One-time setup of the timer context @timerid of @vcpu: back-pointer to
 * the vcpu, VM-wide offset, expiry hrtimer and the host interrupt that
 * backs this timer.
 */
static void timer_context_init(struct kvm_vcpu *vcpu, int timerid)
{
        struct arch_timer_context *ctxt = vcpu_get_timer(vcpu, timerid);
        struct kvm *kvm = vcpu->kvm;

        ctxt->vcpu = vcpu;

        /* Only the EL1 virtual timer uses the virtual offset. */
        ctxt->offset.vm_offset = (timerid == TIMER_VTIMER) ?
                                 &kvm->arch.timer_data.voffset :
                                 &kvm->arch.timer_data.poffset;

        hrtimer_setup(&ctxt->hrtimer, kvm_hrtimer_expire, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

        switch (timerid) {
        case TIMER_VTIMER:
        case TIMER_HVTIMER:
                ctxt->host_timer_irq = host_vtimer_irq;
                break;
        case TIMER_PTIMER:
        case TIMER_HPTIMER:
                ctxt->host_timer_irq = host_ptimer_irq;
                break;
        default:
                /* Any other id leaves host_timer_irq untouched. */
                break;
        }
}

void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);

        for (int i = 0; i < NR_KVM_TIMERS; i++)
                timer_context_init(vcpu, i);

        /* Synchronize offsets across timers of a VM if not already provided */
        if (!test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &vcpu->kvm->arch.flags)) {
                timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read());
                timer_set_offset(vcpu_ptimer(vcpu), 0);
        }

        hrtimer_setup(&timer->bg_timer, kvm_bg_timer_expire, CLOCK_MONOTONIC,
                      HRTIMER_MODE_ABS_HARD);
}

/* Seed the per-VM timer PPI table of @kvm from the default_ppi[] table. */
void kvm_timer_init_vm(struct kvm *kvm)
{
        int id = 0;

        while (id < NR_KVM_TIMERS) {
                kvm->arch.timer_data.ppi[id] = default_ppi[id];
                id++;
        }
}

/*
 * Enable the host per-CPU timer interrupts: always the virtual one, and
 * the physical one when host_ptimer_irq is non-zero.
 */
void kvm_timer_cpu_up(void)
{
        enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags);

        if (!host_ptimer_irq)
                return;

        enable_percpu_irq(host_ptimer_irq, host_ptimer_irq_flags);
}

/*
 * Disable the host per-CPU timer interrupts: always the virtual one, and
 * the physical one when host_ptimer_irq is non-zero.
 */
void kvm_timer_cpu_down(void)
{
        disable_percpu_irq(host_vtimer_irq);

        if (!host_ptimer_irq)
                return;

        disable_percpu_irq(host_ptimer_irq);
}

/*
 * Write @value to the timer register identified by @regid.
 *
 * CTL and CVAL writes go through kvm_arm_timer_write(). A CNT write is
 * turned into a new timer offset (current physical count minus @value) and
 * is silently ignored when the VM has KVM_ARCH_FLAG_VM_COUNTER_OFFSET set.
 *
 * Returns 0 on success, -1 for an unknown @regid.
 */
int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
{
        switch (regid) {
        /* Virtual timer */
        case KVM_REG_ARM_TIMER_CTL:
                kvm_arm_timer_write(vcpu, vcpu_vtimer(vcpu), TIMER_REG_CTL, value);
                return 0;
        case KVM_REG_ARM_TIMER_CNT:
                if (test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
                             &vcpu->kvm->arch.flags))
                        return 0;
                timer_set_offset(vcpu_vtimer(vcpu), kvm_phys_timer_read() - value);
                return 0;
        case KVM_REG_ARM_TIMER_CVAL:
                kvm_arm_timer_write(vcpu, vcpu_vtimer(vcpu), TIMER_REG_CVAL, value);
                return 0;

        /* Physical timer */
        case KVM_REG_ARM_PTIMER_CTL:
                kvm_arm_timer_write(vcpu, vcpu_ptimer(vcpu), TIMER_REG_CTL, value);
                return 0;
        case KVM_REG_ARM_PTIMER_CNT:
                if (test_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET,
                             &vcpu->kvm->arch.flags))
                        return 0;
                timer_set_offset(vcpu_ptimer(vcpu), kvm_phys_timer_read() - value);
                return 0;
        case KVM_REG_ARM_PTIMER_CVAL:
                kvm_arm_timer_write(vcpu, vcpu_ptimer(vcpu), TIMER_REG_CVAL, value);
                return 0;

        default:
                return -1;
        }
}

/*
 * Return the control value of @timer, with the ISTATUS bit added when
 * kvm_timer_compute_delta() reports zero for it.
 */
static u64 read_timer_ctl(struct arch_timer_context *timer)
{
        u32 ctl_val = timer_get_ctl(timer);

        /*
         * Set ISTATUS bit if it's expired.
         * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
         * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
         * regardless of ENABLE bit for our implementation convenience.
         */
        if (kvm_timer_compute_delta(timer) == 0)
                ctl_val |= ARCH_TIMER_CTRL_IT_STAT;

        return ctl_val;
}

/*
 * Read the timer register identified by @regid through
 * kvm_arm_timer_read(). Returns (u64)-1 for an unknown @regid.
 */
u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid)
{
        struct arch_timer_context *ctx;
        enum kvm_arch_timer_regs treg;

        /* Resolve @regid into a (timer, register) pair... */
        switch (regid) {
        case KVM_REG_ARM_TIMER_CTL:
                ctx = vcpu_vtimer(vcpu);
                treg = TIMER_REG_CTL;
                break;
        case KVM_REG_ARM_TIMER_CNT:
                ctx = vcpu_vtimer(vcpu);
                treg = TIMER_REG_CNT;
                break;
        case KVM_REG_ARM_TIMER_CVAL:
                ctx = vcpu_vtimer(vcpu);
                treg = TIMER_REG_CVAL;
                break;
        case KVM_REG_ARM_PTIMER_CTL:
                ctx = vcpu_ptimer(vcpu);
                treg = TIMER_REG_CTL;
                break;
        case KVM_REG_ARM_PTIMER_CNT:
                ctx = vcpu_ptimer(vcpu);
                treg = TIMER_REG_CNT;
                break;
        case KVM_REG_ARM_PTIMER_CVAL:
                ctx = vcpu_ptimer(vcpu);
                treg = TIMER_REG_CVAL;
                break;
        default:
                return (u64)-1;
        }

        /* ...and perform the one read. */
        return kvm_arm_timer_read(vcpu, ctx, treg);
}

/*
 * Compute the value of register @treg of @timer from its saved state.
 *
 * TVAL is the low 32 bits of (cval - physical count + offset), CNT is the
 * physical count minus the offset, and VOFF reads through the vcpu_offset
 * pointer. Any other @treg value is a BUG().
 */
static u64 kvm_arm_timer_read(struct kvm_vcpu *vcpu,
                              struct arch_timer_context *timer,
                              enum kvm_arch_timer_regs treg)
{
        switch (treg) {
        case TIMER_REG_TVAL: {
                u64 remaining = timer_get_cval(timer) - kvm_phys_timer_read() + timer_get_offset(timer);

                return lower_32_bits(remaining);
        }

        case TIMER_REG_CTL:
                return read_timer_ctl(timer);

        case TIMER_REG_CVAL:
                return timer_get_cval(timer);

        case TIMER_REG_CNT:
                return kvm_phys_timer_read() - timer_get_offset(timer);

        case TIMER_REG_VOFF:
                return *timer->offset.vcpu_offset;

        default:
                BUG();
        }
}

u64 kvm_arm_timer_read_sysreg(struct kvm_vcpu *vcpu,
                              enum kvm_arch_timers tmr,
                              enum kvm_arch_timer_regs treg)
{
        struct arch_timer_context *timer;
        struct timer_map map;
        u64 val;

        get_timer_map(vcpu, &map);
        timer = vcpu_get_timer(vcpu, tmr);

        if (timer == map.emul_vtimer || timer == map.emul_ptimer)
                return kvm_arm_timer_read(vcpu, timer, treg);

        preempt_disable();
        timer_save_state(timer);

        val = kvm_arm_timer_read(vcpu, timer, treg);

        timer_restore_state(timer);
        preempt_enable();

        return val;
}

/*
 * Update the saved state of @timer for a write of @val to register @treg.
 *
 * TVAL sets cval to (physical count - offset) plus @val taken as a signed
 * 32-bit quantity. CTL is stored with the ISTATUS bit masked out. VOFF
 * writes through the vcpu_offset pointer. Any other @treg is a BUG().
 */
static void kvm_arm_timer_write(struct kvm_vcpu *vcpu,
                                struct arch_timer_context *timer,
                                enum kvm_arch_timer_regs treg,
                                u64 val)
{
        switch (treg) {
        case TIMER_REG_TVAL: {
                u64 guest_now = kvm_phys_timer_read() - timer_get_offset(timer);

                timer_set_cval(timer, guest_now + (s32)val);
                break;
        }

        case TIMER_REG_CTL:
                timer_set_ctl(timer, val & ~ARCH_TIMER_CTRL_IT_STAT);
                break;

        case TIMER_REG_CVAL:
                timer_set_cval(timer, val);
                break;

        case TIMER_REG_VOFF:
                *timer->offset.vcpu_offset = val;
                break;

        default:
                BUG();
        }
}

/*
 * Write @val to register @treg of timer @tmr for @vcpu.
 *
 * For an emulated timer the backing hrtimer is cancelled, the state
 * updated, and the emulation re-run. Any other timer is saved with
 * preemption disabled, updated, and restored.
 */
void kvm_arm_timer_write_sysreg(struct kvm_vcpu *vcpu,
                                enum kvm_arch_timers tmr,
                                enum kvm_arch_timer_regs treg,
                                u64 val)
{
        struct arch_timer_context *ctx;
        struct timer_map map;

        get_timer_map(vcpu, &map);
        ctx = vcpu_get_timer(vcpu, tmr);

        if (ctx != map.emul_vtimer && ctx != map.emul_ptimer) {
                preempt_disable();
                timer_save_state(ctx);
                kvm_arm_timer_write(vcpu, ctx, treg, val);
                timer_restore_state(ctx);
                preempt_enable();
                return;
        }

        soft_timer_cancel(&ctx->hrtimer);
        kvm_arm_timer_write(vcpu, ctx, treg, val);
        timer_emulate(ctx);
}

/*
 * irq_chip callback: mark @d as forwarded to a vcpu when @vcpu is non-NULL,
 * clear the mark otherwise. Always returns 0.
 */
static int timer_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu)
{
        if (!vcpu) {
                irqd_clr_forwarded_to_vcpu(d);
                return 0;
        }

        irqd_set_forwarded_to_vcpu(d);
        return 0;
}

/*
 * irq_chip callback. Setting the ACTIVE state of an interrupt forwarded to
 * a vcpu is turned into masking (@val true) or unmasking (@val false) at
 * the parent; every other request is passed to the parent unchanged.
 */
static int timer_irq_set_irqchip_state(struct irq_data *d,
                                       enum irqchip_irq_state which, bool val)
{
        bool fwd_active = (which == IRQCHIP_STATE_ACTIVE) &&
                          irqd_is_forwarded_to_vcpu(d);

        if (!fwd_active)
                return irq_chip_set_parent_state(d, which, val);

        if (!val)
                irq_chip_unmask_parent(d);
        else
                irq_chip_mask_parent(d);

        return 0;
}

/* irq_chip callback: EOI at the parent, unless @d is forwarded to a vcpu. */
static void timer_irq_eoi(struct irq_data *d)
{
        if (irqd_is_forwarded_to_vcpu(d))
                return;

        irq_chip_eoi_parent(d);
}

/* irq_chip callback: call the parent chip's irq_ack hook if it has one. */
static void timer_irq_ack(struct irq_data *d)
{
        struct irq_data *parent = d->parent_data;

        if (!parent->chip->irq_ack)
                return;

        parent->chip->irq_ack(parent);
}

/*
 * irq_chip used for the interrupts that kvm_irq_init() pushes into the
 * "kvm-timer" domain.
 */
static struct irq_chip timer_chip = {
        .name                   = "KVM",

        /* Handled by the generic irq_chip_*_parent helpers. */
        .irq_mask               = irq_chip_mask_parent,
        .irq_unmask             = irq_chip_unmask_parent,
        .irq_set_type           = irq_chip_set_type_parent,

        /* Callbacks implemented in this file. */
        .irq_ack                = timer_irq_ack,
        .irq_eoi                = timer_irq_eoi,
        .irq_set_vcpu_affinity  = timer_irq_set_vcpu_affinity,
        .irq_set_irqchip_state  = timer_irq_set_irqchip_state,
};

static int timer_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
                                  unsigned int nr_irqs, void *arg)
{
        irq_hw_number_t hwirq = (uintptr_t)arg;

        return irq_domain_set_hwirq_and_chip(domain, virq, hwirq,
                                             &timer_chip, NULL);
}

/* irq_domain .free callback: intentionally empty, it performs no work. */
static void timer_irq_domain_free(struct irq_domain *domain, unsigned int virq,
                                  unsigned int nr_irqs)
{
}

/* Operations of the "kvm-timer" irq domain created in kvm_irq_init(). */
static const struct irq_domain_ops timer_domain_ops = {
        .free   = timer_irq_domain_free,
        .alloc  = timer_irq_domain_alloc,
};

/*
 * Store the trigger type of @virq in *@flags. Anything other than level
 * high or level low is reported and replaced by level low.
 */
static void kvm_irq_fixup_flags(unsigned int virq, u32 *flags)
{
        u32 trigger = irq_get_trigger_type(virq);
        bool is_level = (trigger == IRQF_TRIGGER_HIGH) ||
                        (trigger == IRQF_TRIGGER_LOW);

        if (!is_level) {
                kvm_err("Invalid trigger for timer IRQ%d, assuming level low\n",
                        virq);
                trigger = IRQF_TRIGGER_LOW;
        }

        *flags = trigger;
}

/*
 * Record the host timer interrupts described by @info and their trigger
 * flags. When the vgic reports no_hw_deactivation, additionally create a
 * "kvm-timer" hierarchical irq domain and push the timer interrupts into
 * it.
 *
 * Returns 0 on success, -ENODEV if the virtual timer IRQ is invalid, or
 * -ENOMEM if the fwnode or the domain cannot be allocated.
 */
static int kvm_irq_init(struct arch_timer_kvm_info *info)
{
        struct irq_domain *domain = NULL;

        /* The virtual timer interrupt is mandatory. */
        if (info->virtual_irq <= 0) {
                kvm_err("kvm_arch_timer: invalid virtual timer IRQ: %d\n",
                        info->virtual_irq);
                return -ENODEV;
        }

        host_vtimer_irq = info->virtual_irq;
        kvm_irq_fixup_flags(host_vtimer_irq, &host_vtimer_irq_flags);

        if (kvm_vgic_global_state.no_hw_deactivation) {
                struct fwnode_handle *fwnode;
                struct irq_data *data;

                fwnode = irq_domain_alloc_named_fwnode("kvm-timer");
                if (!fwnode)
                        return -ENOMEM;

                /* Assume both vtimer and ptimer in the same parent */
                /* NOTE(review): 'data' is dereferenced without a NULL check. */
                data = irq_get_irq_data(host_vtimer_irq);
                domain = irq_domain_create_hierarchy(data->domain, 0,
                                                     NR_KVM_TIMERS, fwnode,
                                                     &timer_domain_ops, NULL);
                if (!domain) {
                        /* Don't leak the fwnode allocated above. */
                        irq_domain_free_fwnode(fwnode);
                        return -ENOMEM;
                }

                arch_timer_irq_ops.flags |= VGIC_IRQ_SW_RESAMPLE;
                /* A failed push is only warned about; init carries on. */
                WARN_ON(irq_domain_push_irq(domain, host_vtimer_irq,
                                            (void *)TIMER_VTIMER));
        }

        /* The physical timer interrupt is optional at this level. */
        if (info->physical_irq > 0) {
                host_ptimer_irq = info->physical_irq;
                kvm_irq_fixup_flags(host_ptimer_irq, &host_ptimer_irq_flags);

                /* 'domain' is only non-NULL if it was created above. */
                if (domain)
                        WARN_ON(irq_domain_push_irq(domain, host_ptimer_irq,
                                                    (void *)TIMER_PTIMER));
        }

        return 0;
}

/*
 * Enable broken_cntvoff_key (and log it) when every condition for the
 * CNTVOFF_EL2 workaround holds; otherwise leave everything untouched.
 */
static void kvm_timer_handle_errata(void)
{
        u64 mmfr0, mmfr1, mmfr4;

        /*
         * CNTVOFF_EL2 is broken on some implementations. For those, we trap
         * all virtual timer/counter accesses, requiring FEAT_ECV.
         *
         * However, a hypervisor supporting nesting is likely to mitigate the
         * erratum at L0, and not require other levels to mitigate it (which
         * would otherwise be a terrible performance sink due to trap
         * amplification).
         *
         * Given that the affected HW implements both FEAT_VHE and FEAT_E2H0,
         * and that NV is likely not to (because of limitations of the
         * architecture), only enable the workaround when FEAT_VHE and
         * FEAT_E2H0 are both detected. Time will tell if this actually holds.
         */
        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
        mmfr4 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR4_EL1);

        /* The checks run in this order; the first failure ends the function. */
        if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, VH, mmfr1))
                return;
        if (SYS_FIELD_GET(ID_AA64MMFR4_EL1, E2H0, mmfr4))
                return;
        if (!SYS_FIELD_GET(ID_AA64MMFR0_EL1, ECV, mmfr0))
                return;
        if (!has_vhe() && !has_hvhe())
                return;
        if (!cpus_have_final_cap(ARM64_WORKAROUND_QCOM_ORYON_CNTVOFF))
                return;

        static_branch_enable(&broken_cntvoff_key);
        kvm_info("Broken CNTVOFF_EL2, trapping virtual timer\n");
}

/*
 * Boot-time setup of the KVM timer support: fetch the arch timer info,
 * initialise the host timer interrupts, request them as per-CPU interrupts
 * and, when @has_gic is true, forward them to vcpus.
 *
 * Returns 0 on success or a negative error code. On failure, every per-CPU
 * interrupt requested so far is freed again before returning.
 */
int __init kvm_timer_hyp_init(bool has_gic)
{
        struct arch_timer_kvm_info *info;
        int err;

        info = arch_timer_get_kvm_info();
        timecounter = &info->timecounter;

        if (!timecounter->cc) {
                kvm_err("kvm_arch_timer: uninitialized timecounter\n");
                return -ENODEV;
        }

        err = kvm_irq_init(info);
        if (err)
                return err;

        /* First, do the virtual EL1 timer irq */

        err = request_percpu_irq(host_vtimer_irq, kvm_arch_timer_handler,
                                 "kvm guest vtimer", kvm_get_running_vcpus());
        if (err) {
                kvm_err("kvm_arch_timer: can't request vtimer interrupt %d (%d)\n",
                        host_vtimer_irq, err);
                return err;
        }

        if (has_gic) {
                err = irq_set_vcpu_affinity(host_vtimer_irq,
                                            kvm_get_running_vcpus());
                if (err) {
                        kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
                        goto out_free_vtimer_irq;
                }

                /* Only flipped once the vtimer has been forwarded successfully. */
                static_branch_enable(&has_gic_active_state);
        }

        kvm_debug("virtual timer IRQ%d\n", host_vtimer_irq);

        /* Now let's do the physical EL1 timer irq */

        if (info->physical_irq > 0) {
                err = request_percpu_irq(host_ptimer_irq, kvm_arch_timer_handler,
                                         "kvm guest ptimer", kvm_get_running_vcpus());
                if (err) {
                        kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n",
                                host_ptimer_irq, err);
                        goto out_free_vtimer_irq;
                }

                if (has_gic) {
                        err = irq_set_vcpu_affinity(host_ptimer_irq,
                                                    kvm_get_running_vcpus());
                        if (err) {
                                kvm_err("kvm_arch_timer: error setting vcpu affinity\n");
                                goto out_free_ptimer_irq;
                        }
                }

                kvm_debug("physical timer IRQ%d\n", host_ptimer_irq);
        } else if (has_vhe()) {
                /* A missing physical timer interrupt is fatal on VHE. */
                kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n",
                        info->physical_irq);
                err = -ENODEV;
                goto out_free_vtimer_irq;
        }

        kvm_timer_handle_errata();
        return 0;

        /* Unwind in reverse order of acquisition; labels fall through. */
out_free_ptimer_irq:
        if (info->physical_irq > 0)
                free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus());
out_free_vtimer_irq:
        free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus());
        return err;
}

void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);

        soft_timer_cancel(&timer->bg_timer);
}

static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
{
        u32 ppis = 0;
        bool valid;

        mutex_lock(&vcpu->kvm->arch.config_lock);

        for (int i = 0; i < nr_timers(vcpu); i++) {
                struct arch_timer_context *ctx;
                int irq;

                ctx = vcpu_get_timer(vcpu, i);
                irq = timer_irq(ctx);
                if (kvm_vgic_set_owner(vcpu, irq, ctx))
                        break;

                /*
                 * We know by construction that we only have PPIs, so
                 * all values are less than 32.
                 */
                ppis |= BIT(irq);
        }

        valid = hweight32(ppis) == nr_timers(vcpu);

        if (valid)
                set_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE, &vcpu->kvm->arch.flags);

        mutex_unlock(&vcpu->kvm->arch.config_lock);

        return valid;
}

/*
 * Report the current output level of the running vcpu's timer whose
 * interrupt is @vintid. Warns and returns false when there is no running
 * vcpu or no timer uses @vintid.
 */
static bool kvm_arch_timer_get_input_level(int vintid)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
        int idx;

        if (WARN(!vcpu, "No vcpu context!\n"))
                return false;

        for (idx = 0; idx < nr_timers(vcpu); idx++) {
                struct arch_timer_context *ctx = vcpu_get_timer(vcpu, idx);

                if (timer_irq(ctx) != vintid)
                        continue;

                return kvm_timer_should_fire(ctx);
        }

        /* A timer IRQ has fired, but no matching timer was found? */
        WARN_RATELIMIT(1, "timer INTID%d unknown\n", vintid);

        return false;
}

/*
 * Enable the timer of @vcpu. With an in-kernel irqchip the timer interrupts
 * are validated and the direct timers' interrupts are mapped to their host
 * interrupts first.
 *
 * Returns 0 on success (also when already enabled), -EINVAL when the timer
 * interrupts are misconfigured, or the error from kvm_vgic_map_phys_irq().
 */
int kvm_timer_enable(struct kvm_vcpu *vcpu)
{
        struct arch_timer_cpu *timer = vcpu_timer(vcpu);
        struct timer_map map;
        int ret;

        if (timer->enabled)
                return 0;

        /* Without a VGIC we do not map virtual IRQs to physical IRQs */
        if (irqchip_in_kernel(vcpu->kvm)) {
                /*
                 * At this stage, we have the guarantee that the vgic is both
                 * available and initialized.
                 */
                if (!timer_irqs_are_valid(vcpu)) {
                        kvm_debug("incorrectly configured timer irqs\n");
                        return -EINVAL;
                }

                get_timer_map(vcpu, &map);

                ret = kvm_vgic_map_phys_irq(vcpu,
                                            map.direct_vtimer->host_timer_irq,
                                            timer_irq(map.direct_vtimer),
                                            &arch_timer_irq_ops);
                if (ret)
                        return ret;

                /* The direct physical timer may be absent. */
                if (map.direct_ptimer) {
                        ret = kvm_vgic_map_phys_irq(vcpu,
                                                    map.direct_ptimer->host_timer_irq,
                                                    timer_irq(map.direct_ptimer),
                                                    &arch_timer_irq_ops);
                        if (ret)
                                return ret;
                }
        }

        timer->enabled = 1;
        return 0;
}

/* If we have CNTPOFF, permanently set ECV to enable it */
void kvm_timer_init_vhe(void)
{
        if (!cpus_have_final_cap(ARM64_HAS_ECV_CNTPOFF))
                return;

        /* Nothing is cleared; only CNTHCTL_ECV is set in CNTHCTL_EL2. */
        sysreg_clear_set(cnthctl_el2, 0, CNTHCTL_ECV);
}

int kvm_arm_timer_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        int __user *uaddr = (int __user *)(long)attr->addr;
        int irq, idx, ret = 0;

        if (!irqchip_in_kernel(vcpu->kvm))
                return -EINVAL;

        if (get_user(irq, uaddr))
                return -EFAULT;

        if (!(irq_is_ppi(irq)))
                return -EINVAL;

        mutex_lock(&vcpu->kvm->arch.config_lock);

        if (test_bit(KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE,
                     &vcpu->kvm->arch.flags)) {
                ret = -EBUSY;
                goto out;
        }

        switch (attr->attr) {
        case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
                idx = TIMER_VTIMER;
                break;
        case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
                idx = TIMER_PTIMER;
                break;
        case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
                idx = TIMER_HVTIMER;
                break;
        case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
                idx = TIMER_HPTIMER;
                break;
        default:
                ret = -ENXIO;
                goto out;
        }

        /*
         * We cannot validate the IRQ unicity before we run, so take it at
         * face value. The verdict will be given on first vcpu run, for each
         * vcpu. Yes this is late. Blame it on the stupid API.
         */
        vcpu->kvm->arch.timer_data.ppi[idx] = irq;

out:
        mutex_unlock(&vcpu->kvm->arch.config_lock);
        return ret;
}

/*
 * Report to userspace the interrupt number of the timer selected by
 * @attr->attr, writing it as an int to the user address in @attr->addr.
 *
 * Returns -ENXIO if the attribute does not name a known timer, otherwise
 * the result of put_user().
 */
int kvm_arm_timer_get_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        int __user *uaddr = (int __user *)(long)attr->addr;
        struct arch_timer_context *ctx;
        int irq;

        /* Pick the timer context matching the requested attribute */
        switch (attr->attr) {
        case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
                ctx = vcpu_vtimer(vcpu);
                break;
        case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
                ctx = vcpu_ptimer(vcpu);
                break;
        case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
                ctx = vcpu_hvtimer(vcpu);
                break;
        case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
                ctx = vcpu_hptimer(vcpu);
                break;
        default:
                return -ENXIO;
        }

        irq = timer_irq(ctx);

        return put_user(irq, uaddr);
}

/*
 * Tell whether @attr->attr is one of the timer IRQ attributes handled by the
 * set/get helpers: returns 0 if it is, -ENXIO otherwise. @vcpu is not used.
 */
int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        switch (attr->attr) {
        case KVM_ARM_VCPU_TIMER_IRQ_VTIMER:
        case KVM_ARM_VCPU_TIMER_IRQ_PTIMER:
        case KVM_ARM_VCPU_TIMER_IRQ_HVTIMER:
        case KVM_ARM_VCPU_TIMER_IRQ_HPTIMER:
                return 0;
        default:
                return -ENXIO;
        }
}

/*
 * Apply a userspace-provided counter offset to the whole VM.
 *
 * With kvm->lock held and all vcpus locked, KVM_ARCH_FLAG_VM_COUNTER_OFFSET
 * is set and @offset->counter_offset is stored as both the virtual and the
 * physical offset.
 *
 * Returns 0 on success, -EINVAL if @offset->reserved is non-zero, or -EBUSY
 * if the vcpus could not all be locked.
 */
int kvm_vm_ioctl_set_counter_offset(struct kvm *kvm,
                                    struct kvm_arm_counter_offset *offset)
{
        int ret = 0;

        if (offset->reserved)
                return -EINVAL;

        mutex_lock(&kvm->lock);

        if (!lock_all_vcpus(kvm)) {
                ret = -EBUSY;
                goto unlock;
        }

        set_bit(KVM_ARCH_FLAG_VM_COUNTER_OFFSET, &kvm->arch.flags);

        /*
         * If userspace decides to set the offset using this API rather
         * than merely restoring the counter values, the offset applies to
         * both the virtual and physical views.
         */
        kvm->arch.timer_data.voffset = offset->counter_offset;
        kvm->arch.timer_data.poffset = offset->counter_offset;

        unlock_all_vcpus(kvm);

unlock:
        mutex_unlock(&kvm->lock);

        return ret;
}















































































   18 



   18 


   18 











   18 








    1 
    1 









    1 



























  248 







  124 
  128 























































































































































































































































































































































































   18 



   18 








   18 
























































    5 
























  120 








































































































































































































    5 











  253 















  253 
  180 

  248 









    5 















  238 






   26 







  248 

  248 









  248 


















  247 










  248 


  180 
  181 

















   77 











  252 







































   49 











  271 









  279 






  278 







































  279 


  279 






























































































































   45 






  238 














































  283 











  282 









  172 



  171 
    1 













  186 
   56 












  279 





































































































  282 







  283 








  283 































    6 
  279 







  279 





  279 
  253 
  269 


































  128 













  124 


















  124 










  247 

  281 































































































































































  129 

































  129 




    7 


  124 







    7 

































































  128 







  129 



























  203 










  203 









   50 











  158 













































































  203 








  203 






  203 

    2 




















   95 





  128 



  203 




















































































































































































   22 





   23 


    4 


   21 
   22 

   21 
   17 































































































































































































































































































































































































































































































  129 






  129 

























































   18 






































   18 












    1 





   16 





    2 
















































  125 










  112 


   34 














   18 







   18 




    1 




   17 

















   17 


   17 




  127 

  127 





























































































































































































































































   11 











  127 


   17 






















  130 














  130 







  130 


















  130 





  130 

































  130 















  129 
  130 
  130 





  130 












  130 











  129 


  130 









  130 

  130 







































  128 











  128 


























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>

#include <linux/mm.h>
#include <linux/memfd.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>

#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>

#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

/*
 * State bundle with a device pagemap pointer and a page mask.
 * NOTE(review): the users of this struct lie outside the visible part of the
 * file; the field notes below are inferred from names and types only and
 * should be confirmed against those users.
 */
struct follow_page_context {
        /* presumably a device pagemap reference kept across lookups - TODO confirm */
        struct dev_pagemap *pgmap;
        /* presumably a mask reflecting the size of the mapping found - TODO confirm */
        unsigned int page_mask;
};

/*
 * Debug-only consistency check on an array of pinned pages: with
 * CONFIG_DEBUG_VM enabled, trigger VM_BUG_ON_PAGE() for any anonymous page
 * that is not marked exclusive. NULL entries, the zero page and
 * non-anonymous folios are skipped. Does nothing without CONFIG_DEBUG_VM.
 */
static inline void sanity_check_pinned_pages(struct page **pages,
                                             unsigned long npages)
{
        unsigned long i;

        if (!IS_ENABLED(CONFIG_DEBUG_VM))
                return;

        /*
         * Anonymous pages are only pinned while they are exclusive. Once
         * pinned they can no longer become possibly shared, so
         * PageAnonExclusive() stays set until the page is freed.
         *
         * Ideally we would verify that each pinned anonymous page is still
         * mapped exclusively. For anon THP we do not know how the page
         * is/was mapped when it got pinned, but either the given page
         * (PTE-mapped THP) or the head page (PMD-mapped THP) should be
         * PageAnonExclusive(). If neither is, something is certainly wrong.
         */
        for (i = 0; i < npages; i++) {
                struct page *page = pages[i];
                struct folio *folio;

                if (!page)
                        continue;

                folio = page_folio(page);

                if (is_zero_page(page))
                        continue;
                if (!folio_test_anon(folio))
                        continue;

                if (folio_test_large(folio) && !folio_test_hugetlb(folio)) {
                        /* Either a PTE-mapped or a PMD-mapped THP. */
                        VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
                                       !PageAnonExclusive(page), page);
                } else {
                        /* Small or hugetlb folio: check the folio's first page. */
                        VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
                }
        }
}

/*
 * Look up the folio that @page belongs to and try to add @refs references
 * to it.
 *
 * Returns the folio with its refcount raised by @refs, or NULL if the
 * refcount was negative (which also warns once) or folio_ref_try_add()
 * failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
        for (;;) {
                struct folio *folio = page_folio(page);

                if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
                        return NULL;
                if (unlikely(!folio_ref_try_add(folio, refs)))
                        return NULL;

                /*
                 * We now hold a stable reference to the folio. However, the
                 * folio may have been split between the page_folio() lookup
                 * and the refcount increment, leaving us with a reference
                 * on a folio that no longer has anything to do with the
                 * page we were given. With the folio stable, recheck that
                 * the page still belongs to it.
                 */
                if (likely(page_folio(page) == folio))
                        return folio;

                /* Lost the race: drop the stale references and start over. */
                folio_put_refs(folio, refs);
        }
}

/*
 * Release @refs references on @folio that were taken with the FOLL_* mode
 * given in @flags.
 *
 * Without FOLL_PIN this is a plain folio_put_refs(). With FOLL_PIN the zero
 * folio is left untouched; for any other folio the NR_FOLL_PIN_RELEASED
 * counter is updated, and then either the pincount is decremented or, for
 * folios without a pincount, the number of references dropped is scaled by
 * GUP_PIN_COUNTING_BIAS.
 */
static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
        int put = refs;

        if (flags & FOLL_PIN) {
                if (is_zero_folio(folio))
                        return;

                node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);

                if (!folio_has_pincount(folio))
                        put = refs * GUP_PIN_COUNTING_BIAS;
                else
                        atomic_sub(refs, &folio->_pincount);
        }

        folio_put_refs(folio, put);
}

/**
 * try_grab_folio() - add a folio's refcount by a flag-dependent amount
 * @folio:    pointer to folio to be grabbed
 * @refs:     the value to (effectively) add to the folio's refcount
 * @flags:    gup flags: these are the FOLL_* flag values
 *
 * In this file, "grab" means: look at @flags to decide whether FOLL_PIN or
 * FOLL_GET behavior applies when incrementing the folio's refcount. Callers
 * may set either FOLL_PIN or FOLL_GET (or neither), but not both at the same
 * time. When neither is set, nothing is done.
 *
 * For FOLL_PIN, the zero folio is never pinned. Any other folio has its
 * normal refcount raised as well, either by @refs alongside the pincount or,
 * for folios without a pincount, by @refs * GUP_PIN_COUNTING_BIAS.
 *
 * It is called when we have a stable reference for the folio, typically in
 * GUP slow path.
 *
 * Return: 0 for success, or if no action was required. A negative error code
 * for failure:
 *
 *   -ENOMEM        the folio's refcount was not positive, so it could not
 *                  be grabbed.
 *   -EREMOTEIO     the folio is a PCI P2PDMA page and FOLL_PCI_P2PDMA was
 *                  not set in @flags.
 */
int __must_check try_grab_folio(struct folio *folio, int refs,
                                unsigned int flags)
{
        if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
                return -ENOMEM;

        if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(&folio->page)))
                return -EREMOTEIO;

        if (flags & FOLL_GET) {
                folio_ref_add(folio, refs);
                return 0;
        }

        if (!(flags & FOLL_PIN))
                return 0;

        /*
         * Don't take a pin on the zero page - it's not going anywhere
         * and it is used in a *lot* of places.
         */
        if (is_zero_folio(folio))
                return 0;

        /*
         * Increment the normal page refcount field at least once,
         * so that the page really is pinned.
         */
        if (folio_has_pincount(folio)) {
                folio_ref_add(folio, refs);
                atomic_add(refs, &folio->_pincount);
        } else {
                folio_ref_add(folio, refs * GUP_PIN_COUNTING_BIAS);
        }

        node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

        return 0;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page:            pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
        struct folio *folio;

        /* Run the pinned-page sanity check on this single page first. */
        sanity_check_pinned_pages(&page, 1);

        /* The pin is released on the folio that contains @page. */
        folio = page_folio(page);
        gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);

/**
 * unpin_folio() - release a dma-pinned folio
 * @folio:         pointer to folio to be released
 *
 * Folios that were pinned via memfd_pin_folios() or other similar routines
 * must be released either using unpin_folio() or unpin_folios().
 */
void unpin_folio(struct folio *folio)
{
        /* Drop exactly one FOLL_PIN reference from @folio. */
        gup_put_folio(folio, 1, FOLL_PIN);
}
EXPORT_SYMBOL_GPL(unpin_folio);

/**
 * folio_add_pin - Try to get an additional pin on a pinned folio
 * @folio: The folio to be pinned
 *
 * Get an additional pin on a folio we already have a pin on.  Makes no change
 * if the folio is a zero_page.
 */
void folio_add_pin(struct folio *folio)
{
        /* The zero page is never pinned, so there is nothing to add to. */
        if (is_zero_folio(folio))
                return;

        /*
         * As in try_grab_folio(), the normal refcount must *also* go up so
         * that the folio really is pinned.
         *
         * Without a dedicated pincount, a pin is accounted as
         * GUP_PIN_COUNTING_BIAS references; warn if the refcount does not
         * already show at least one such pin.
         */
        if (!folio_has_pincount(folio)) {
                WARN_ON_ONCE(folio_ref_count(folio) < GUP_PIN_COUNTING_BIAS);
                folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
                return;
        }

        /* Dedicated pincount: warn if the folio is not pinned yet. */
        WARN_ON_ONCE(atomic_read(&folio->_pincount) < 1);
        folio_ref_inc(folio);
        atomic_inc(&folio->_pincount);
}

static inline struct folio *gup_folio_range_next(struct page *start,
                unsigned long npages, unsigned long i, unsigned int *ntails)
{
        struct page *cur = nth_page(start, i);
        struct folio *folio = page_folio(cur);

        /* A small folio covers exactly one page of the range. */
        if (!folio_test_large(folio)) {
                *ntails = 1;
                return folio;
        }

        /*
         * Large folio: take every page from @cur to the end of the folio,
         * but never more than what is left of the range.
         */
        *ntails = min_t(unsigned int, npages - i,
                        folio_nr_pages(folio) - folio_page_idx(folio, cur));
        return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
                unsigned long npages, unsigned long i, unsigned int *ntails)
{
        struct folio *folio = page_folio(list[i]);
        unsigned int end = i + 1;

        /* Extend the run while the following entries map to the same folio. */
        while (end < npages && page_folio(list[end]) == folio)
                end++;

        /* Number of consecutive entries, starting at @i, that share @folio. */
        *ntails = end - i;
        return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages:  array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
                                 bool make_dirty)
{
        unsigned long pos;
        unsigned int run;
        struct folio *folio;

        /* Nothing to dirty: the plain release path does all the work. */
        if (!make_dirty) {
                unpin_user_pages(pages, npages);
                return;
        }

        sanity_check_pinned_pages(pages, npages);

        pos = 0;
        while (pos < npages) {
                /* Handle each run of entries sharing one folio in one step. */
                folio = gup_folio_next(pages, npages, pos, &run);

                /*
                 * The dirty test below is racy against
                 * clear_page_dirty_for_io(), and that is fine:
                 *
                 * - If the folio is seen dirty, possibly because
                 *   clear_page_dirty_for_io() ran folio_mkclean() and
                 *   set_page_dirty() followed, nothing is done here.  The
                 *   folio is about to be written back anyway, once
                 *   clear_page_dirty_for_io() reaches TestClearPageDirty(),
                 *   which is all that dirtying it was meant to achieve.
                 *
                 * - If the folio is seen clean, it is dirtied here and stays
                 *   dirty despite being written back, so it is merely
                 *   written back once more in the next writeback cycle.
                 *   This is harmless.
                 */
                if (!folio_test_dirty(folio)) {
                        folio_lock(folio);
                        folio_mark_dirty(folio);
                        folio_unlock(folio);
                }

                gup_put_folio(folio, run, FOLL_PIN);
                pos += run;
        }
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page:  the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * For the page range [page .. page+npages), make that range (or its head
 * pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
                                      bool make_dirty)
{
        unsigned long pos = 0;

        while (pos < npages) {
                struct folio *folio;
                unsigned int run;

                /* @run is how many pages of the range this folio covers. */
                folio = gup_folio_range_next(page, npages, pos, &run);

                /* Dirty under the folio lock, and only if still clean. */
                if (make_dirty) {
                        if (!folio_test_dirty(folio)) {
                                folio_lock(folio);
                                folio_mark_dirty(folio);
                                folio_unlock(folio);
                        }
                }

                gup_put_folio(folio, run, FOLL_PIN);
                pos += run;
        }
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);

static void gup_fast_unpin_user_pages(struct page **pages, unsigned long npages)
{
        unsigned long pos = 0;

        /*
         * Deliberately no sanity checks here: a race with fork() may have
         * left some anonymous pages shared by now -- which is exactly why
         * they are being unpinned.
         */
        while (pos < npages) {
                unsigned int run;
                struct folio *folio = gup_folio_next(pages, npages, pos, &run);

                gup_put_folio(folio, run, FOLL_PIN);
                pos += run;
        }
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages:  array of pages to be released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
        unsigned long pos;
        unsigned int run;
        struct folio *folio;

        /*
         * A count in the error-value range most likely means that gup/pup
         * returned a hard -ERRNO which the caller erroneously passed on.  If
         * this fires the system *might* be leaking pages (by leaving them
         * pinned), but probably is not.
         */
        if (WARN_ON(IS_ERR_VALUE(npages)))
                return;

        sanity_check_pinned_pages(pages, npages);

        for (pos = 0; pos < npages; pos += run) {
                if (pages[pos]) {
                        /* Release a whole run of same-folio entries at once. */
                        folio = gup_folio_next(pages, npages, pos, &run);
                        gup_put_folio(folio, run, FOLL_PIN);
                } else {
                        /* Empty slot: step over it. */
                        run = 1;
                }
        }
}
EXPORT_SYMBOL(unpin_user_pages);

/**
 * unpin_user_folio() - release pages of a folio
 * @folio:  pointer to folio to be released
 * @npages: number of pages of same folio
 *
 * Release npages of the folio
 */
void unpin_user_folio(struct folio *folio, unsigned long npages)
{
        /* Drop @npages FOLL_PIN references from @folio in a single call. */
        gup_put_folio(folio, npages, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_folio);

/**
 * unpin_folios() - release an array of gup-pinned folios.
 * @folios:  array of folios to be released.
 * @nfolios: number of folios in the @folios array.
 *
 * For each folio in the @folios array, release the folio using gup_put_folio.
 *
 * Please see the unpin_folio() documentation for details.
 */
void unpin_folios(struct folio **folios, unsigned long nfolios)
{
        unsigned long start, end;

        /*
         * A count in the error-value range most likely means that gup/pup
         * returned a hard -ERRNO which the caller erroneously passed on.  If
         * this fires the system *might* be leaking folios (by leaving them
         * pinned), but probably is not.
         */
        if (WARN_ON(IS_ERR_VALUE(nfolios)))
                return;

        for (start = 0; start < nfolios; start = end) {
                /* Find the end of the run of identical entries. */
                end = start + 1;
                while (end < nfolios && folios[end] == folios[start])
                        end++;

                /* One call drops all pins of the run; NULL runs are skipped. */
                if (folios[start])
                        gup_put_folio(folios[start], end - start, FOLL_PIN);
        }
}
EXPORT_SYMBOL_GPL(unpin_folios);

/*
 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 * lifecycle.  Avoid setting the bit unless necessary, or it might cause write
 * cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
        /* Already set: skip the write so the flag word is not dirtied again. */
        if (test_bit(MMF_HAS_PINNED, mm_flags))
                return;

        set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU

#ifdef CONFIG_HAVE_GUP_FAST
static int record_subpages(struct page *page, unsigned long sz,
                           unsigned long addr, unsigned long end,
                           struct page **pages)
{
        /*
         * Page index of @addr inside the @sz-sized region that starts at
         * @page (the mask presumes @sz is a power of two -- TODO confirm
         * against callers).
         */
        struct page *first = nth_page(page, (addr & (sz - 1)) >> PAGE_SHIFT);
        int nr = 0;

        /* Store one entry per PAGE_SIZE step until @addr reaches @end. */
        while (addr != end) {
                pages[nr] = nth_page(first, nr);
                nr++;
                addr += PAGE_SIZE;
        }

        /* Number of entries written to @pages. */
        return nr;
}

/**
 * try_grab_folio_fast() - Attempt to get or pin a folio in fast path.
 * @page:  pointer to page to be grabbed
 * @refs:  the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount."
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 *
 * It uses add ref unless zero to elevate the folio refcount and must be called
 * in fast path only.
 */
static struct folio *try_grab_folio_fast(struct page *page, int refs,
                                         unsigned int flags)
{
        struct folio *folio;

        /* Raise warn if it is not called in fast GUP */
        VM_WARN_ON_ONCE(!irqs_disabled());

        /* Calling this without FOLL_GET or FOLL_PIN is treated as a bug. */
        if (WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == 0))
                return NULL;

        /* PCI P2PDMA pages are refused unless FOLL_PCI_P2PDMA was passed. */
        if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
                return NULL;

        /* FOLL_GET: whatever try_get_folio() yields is the final result. */
        if (flags & FOLL_GET)
                return try_get_folio(page, refs);

        /* FOLL_PIN is set */

        /*
         * Don't take a pin on the zero page - it's not going anywhere
         * and it is used in a *lot* of places.
         */
        if (is_zero_page(page))
                return page_folio(page);

        /* Take the plain references first; the pin accounting follows. */
        folio = try_get_folio(page, refs);
        if (!folio)
                return NULL;

        /*
         * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
         * right zone, so fail and let the caller fall back to the slow
         * path.  The references taken above are dropped again first.
         */
        if (unlikely((flags & FOLL_LONGTERM) &&
                     !folio_is_longterm_pinnable(folio))) {
                folio_put_refs(folio, refs);
                return NULL;
        }

        /*
         * When pinning a large folio, use an exact count to track it.
         *
         * However, be sure to *also* increment the normal folio
         * refcount field at least once, so that the folio really
         * is pinned.  That's why the refcount from the earlier
         * try_get_folio() is left intact.
         *
         * Without a dedicated pincount, the @refs references already held
         * are topped up by refs * (GUP_PIN_COUNTING_BIAS - 1), for a total
         * of refs * GUP_PIN_COUNTING_BIAS.
         */
        if (folio_has_pincount(folio))
                atomic_add(refs, &folio->_pincount);
        else
                folio_ref_add(folio,
                                refs * (GUP_PIN_COUNTING_BIAS - 1));
        /*
         * Adjust the pincount before re-checking the PTE for changes.
         * This is essentially a smp_mb() and is paired with a memory
         * barrier in folio_try_share_anon_rmap_*().
         */
        smp_mb__after_atomic();

        /* Account the newly acquired pins in the node statistics. */
        node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

        return folio;
}
#endif        /* CONFIG_HAVE_GUP_FAST */

/* Common code for can_follow_write_* */
static inline bool can_follow_write_common(struct page *page,
                struct vm_area_struct *vma, unsigned int flags)
{
        /* Only FOLL_FORCE can override a missing write permission. */
        if (!(flags & FOLL_FORCE))
                return false;

        /*
         * FOLL_FORCE has no effect on:
         *  - shared mappings,
         *  - read-only private mappings (VM_MAYWRITE clear),
         *  - already writable mappings that just need to take a write fault.
         */
        if ((vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) ||
            !(vma->vm_flags & VM_MAYWRITE) ||
            (vma->vm_flags & VM_WRITE))
                return false;

        /*
         * See can_change_pte_writable(): COW was broken and the page could be
         * mapped writable only if it is an exclusive anonymous page.
         */
        if (!page || !PageAnon(page))
                return false;

        return PageAnonExclusive(page);
}

static struct page *no_page_table(struct vm_area_struct *vma,
                                  unsigned int flags, unsigned long address)
{
        /* Outside of core dumps a missing entry is simply reported as NULL. */
        if (!(flags & FOLL_DUMP))
                return NULL;

        /*
         * When core dumping, we don't want to allocate unnecessary pages or
         * page tables.  An error (rather than NULL) makes the caller skip
         * handle_mm_fault(), and get_dump_page() then returns NULL to leave a
         * hole in the dump.  That shortcut is only valid where the hole would
         * surely be zero-filled had handle_mm_fault() handled it.
         */
        if (!is_vm_hugetlb_page(vma)) {
                if (vma_is_anonymous(vma) || !vma->vm_ops->fault)
                        return ERR_PTR(-EFAULT);
                return NULL;
        }

        /* hugetlb: a hole is only safe when nothing is in the page cache. */
        if (!hugetlbfs_pagecache_present(hstate_vma(vma), vma, address))
                return ERR_PTR(-EFAULT);

        return NULL;
}

#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
/* FOLL_FORCE can write to even unwritable PUDs in COW mappings. */
static inline bool can_follow_write_pud(pud_t pud, struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned int flags)
{
        /*
         * A writable pud allows the write outright; otherwise the shared
         * FOLL_FORCE rules decide.
         */
        return pud_write(pud) || can_follow_write_common(page, vma, flags);
}

static struct page *follow_huge_pud(struct vm_area_struct *vma,
                                    unsigned long addr, pud_t *pudp,
                                    int flags, struct follow_page_context *ctx)
{
        struct mm_struct *mm = vma->vm_mm;
        pud_t pud = *pudp;
        unsigned long pfn = pud_pfn(pud);
        struct page *page;
        int ret;

        /* The caller must hold the PUD lock. */
        assert_spin_locked(pud_lockptr(mm, pudp));

        if (!pud_present(pud))
                return NULL;

        /* Writes need a writable pud or a FOLL_FORCE override. */
        if (flags & FOLL_WRITE) {
                if (!can_follow_write_pud(pud, pfn_to_page(pfn), vma, flags))
                        return NULL;
        }

        /* Offset the pfn by the page index of @addr within the PUD range. */
        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;

        if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
            pud_devmap(pud)) {
                /*
                 * Device mapped pages can only be returned if the caller
                 * will manage the page reference count, so at least one of
                 * FOLL_GET | FOLL_PIN must be set.
                 */
                if (!(flags & (FOLL_GET | FOLL_PIN)))
                        return ERR_PTR(-EEXIST);

                if (flags & FOLL_TOUCH)
                        touch_pud(vma, addr, pudp, flags & FOLL_WRITE);

                ctx->pgmap = get_dev_pagemap(pfn, ctx->pgmap);
                if (!ctx->pgmap)
                        return ERR_PTR(-EFAULT);
        }

        page = pfn_to_page(pfn);

        /* A non-devmap, read-only mapping may have to be unshared first. */
        if (!pud_devmap(pud) && !pud_write(pud) &&
            gup_must_unshare(vma, flags, page))
                return ERR_PTR(-EMLINK);

        ret = try_grab_folio(page_folio(page), 1, flags);
        if (ret)
                return ERR_PTR(ret);

        /* Only a successful lookup reports the PUD-sized page mask. */
        ctx->page_mask = HPAGE_PUD_NR - 1;
        return page;
}

/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */
static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned int flags)
{
        /* A writable pmd allows the write outright. */
        if (pmd_write(pmd))
                return true;

        /*
         * Otherwise the shared FOLL_FORCE rules must allow it, and no write
         * fault may be required for other reasons (soft-dirty tracking or
         * userfaultfd write protection).
         */
        return can_follow_write_common(page, vma, flags) &&
               !pmd_needs_soft_dirty_wp(vma, pmd) &&
               !userfaultfd_huge_pmd_wp(vma, pmd);
}

/*
 * Resolve @addr inside the huge mapping described by @pmd.  The caller must
 * hold the PMD lock (asserted below).
 *
 * Returns the subpage backing @addr, with ctx->page_mask set for a PMD-sized
 * mapping; NULL when the entry cannot be followed with these @flags; or an
 * ERR_PTR(): -EFAULT when dumping the huge zero page, -EMLINK when the page
 * must be unshared first, or the error reported by try_grab_folio().
 */
static struct page *follow_huge_pmd(struct vm_area_struct *vma,
                                    unsigned long addr, pmd_t *pmd,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t pmdval = *pmd;
        struct page *page;
        int ret;

        assert_spin_locked(pmd_lockptr(mm, pmd));

        page = pmd_page(pmdval);
        /* Writes need a writable pmd or a FOLL_FORCE override. */
        if ((flags & FOLL_WRITE) &&
            !can_follow_write_pmd(pmdval, page, vma, flags))
                return NULL;

        /* Avoid dumping huge zero page */
        if ((flags & FOLL_DUMP) && is_huge_zero_pmd(pmdval))
                return ERR_PTR(-EFAULT);

        if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
                return NULL;

        /* A read-only mapping may have to be unshared before it is used. */
        if (!pmd_write(pmdval) && gup_must_unshare(vma, flags, page))
                return ERR_PTR(-EMLINK);

        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
                        !PageAnonExclusive(page), page);

        /* The reference or pin is taken on the folio containing the page. */
        ret = try_grab_folio(page_folio(page), 1, flags);
        if (ret)
                return ERR_PTR(ret);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(pmdval) && (flags & FOLL_TOUCH))
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
#endif        /* CONFIG_TRANSPARENT_HUGEPAGE */

        /* Step from the page given by pmd_page() to the one backing @addr. */
        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
        ctx->page_mask = HPAGE_PMD_NR - 1;

        return page;
}

#else  /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */
static struct page *follow_huge_pud(struct vm_area_struct *vma,
                                    unsigned long addr, pud_t *pudp,
                                    int flags, struct follow_page_context *ctx)
{
        /* Stub for !CONFIG_PGTABLE_HAS_HUGE_LEAVES: never yields a page. */
        return NULL;
}

static struct page *follow_huge_pmd(struct vm_area_struct *vma,
                                    unsigned long addr, pmd_t *pmd,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        /* Stub for !CONFIG_PGTABLE_HAS_HUGE_LEAVES: never yields a page. */
        return NULL;
}
#endif        /* CONFIG_PGTABLE_HAS_HUGE_LEAVES */

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
                pte_t *pte, unsigned int flags)
{
        pte_t old, updated;

        /*
         * In every case the result is -EEXIST: a proper page table entry
         * exists, but there is no corresponding struct page.
         */
        if (!(flags & FOLL_TOUCH))
                return -EEXIST;

        /* FOLL_TOUCH: mark the entry young, and dirty as well for writes. */
        old = ptep_get(pte);
        updated = old;
        if (flags & FOLL_WRITE)
                updated = pte_mkdirty(updated);
        updated = pte_mkyoung(updated);

        /* Only write the entry back if any bit actually changed. */
        if (!pte_same(old, updated)) {
                set_pte_at(vma->vm_mm, address, pte, updated);
                update_mmu_cache(vma, address, pte);
        }

        return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
                                        struct vm_area_struct *vma,
                                        unsigned int flags)
{
        /* A writable pte allows the write outright. */
        if (pte_write(pte))
                return true;

        /*
         * Otherwise the shared FOLL_FORCE rules must allow it, and no write
         * fault may be required for other reasons (soft-dirty tracking or
         * userfaultfd write protection).
         */
        return can_follow_write_common(page, vma, flags) &&
               !pte_needs_soft_dirty_wp(vma, pte) &&
               !userfaultfd_pte_wp(vma, pte);
}

/*
 * Look up the page mapped at @address through the PTE table under @pmd and,
 * when FOLL_GET or FOLL_PIN is set, take a reference or pin on its folio.
 *
 * The PTE is examined under its page table lock, which is dropped again on
 * every path before returning.
 *
 * Returns the page on success; NULL or the result of no_page_table() when
 * nothing usable is mapped; otherwise an ERR_PTR(): -EINVAL if both FOLL_GET
 * and FOLL_PIN are set, -EFAULT for special pages under FOLL_DUMP, -EMLINK
 * when the page must be unshared first, the follow_pfn_pte() result for an
 * entry without a usable struct page, or the error reported by
 * try_grab_folio() or arch_make_folio_accessible().
 */
static struct page *follow_page_pte(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmd, unsigned int flags,
                struct dev_pagemap **pgmap)
{
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio;
        struct page *page;
        spinlock_t *ptl;
        pte_t *ptep, pte;
        int ret;

        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
                         (FOLL_PIN | FOLL_GET)))
                return ERR_PTR(-EINVAL);

        ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
        /* No PTE table could be mapped and locked under @pmd. */
        if (!ptep)
                return no_page_table(vma, flags, address);
        pte = ptep_get(ptep);
        if (!pte_present(pte))
                goto no_page;
        if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
                goto no_page;

        page = vm_normal_page(vma, address, pte);

        /*
         * We only care about anon pages in can_follow_write_pte() and don't
         * have to worry about pte_devmap() because they are never anon.
         */
        if ((flags & FOLL_WRITE) &&
            !can_follow_write_pte(pte, page, vma, flags)) {
                page = NULL;
                goto out;
        }

        if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
                /*
                 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
                 * case since they are only valid while holding the pgmap
                 * reference.
                 */
                *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
                if (*pgmap)
                        page = pte_page(pte);
                else
                        goto no_page;
        } else if (unlikely(!page)) {
                if (flags & FOLL_DUMP) {
                        /* Avoid special (like zero) pages in core dumps */
                        page = ERR_PTR(-EFAULT);
                        goto out;
                }

                /* The zero page is usable; anything else has no struct page. */
                if (is_zero_pfn(pte_pfn(pte))) {
                        page = pte_page(pte);
                } else {
                        ret = follow_pfn_pte(vma, address, ptep, flags);
                        page = ERR_PTR(ret);
                        goto out;
                }
        }
        /* Every path that reaches this point has set @page to a real page. */
        folio = page_folio(page);

        /* A read-only mapping may have to be unshared before it is used. */
        if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
                page = ERR_PTR(-EMLINK);
                goto out;
        }

        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
                       !PageAnonExclusive(page), page);

        /* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
        ret = try_grab_folio(folio, 1, flags);
        if (unlikely(ret)) {
                page = ERR_PTR(ret);
                goto out;
        }

        /*
         * We need to make the page accessible if and only if we are going
         * to access its content (the FOLL_PIN case).  Please see
         * Documentation/core-api/pin_user_pages.rst for details.
         */
        if (flags & FOLL_PIN) {
                ret = arch_make_folio_accessible(folio);
                if (ret) {
                        /* Undo the pin taken by try_grab_folio() above. */
                        unpin_user_page(page);
                        page = ERR_PTR(ret);
                        goto out;
                }
        }
        if (flags & FOLL_TOUCH) {
                if ((flags & FOLL_WRITE) &&
                    !pte_dirty(pte) && !folio_test_dirty(folio))
                        folio_mark_dirty(folio);
                /*
                 * pte_mkyoung() would be more correct here, but atomic care
                 * is needed to avoid losing the dirty bit: it is easier to use
                 * folio_mark_accessed().
                 */
                folio_mark_accessed(folio);
        }
out:
        pte_unmap_unlock(ptep, ptl);
        return page;
no_page:
        pte_unmap_unlock(ptep, ptl);
        /*
         * A non-empty but unusable entry yields NULL; an empty one is handled
         * like a missing page table.
         */
        if (!pte_none(pte))
                return NULL;
        return no_page_table(vma, flags, address);
}

/*
 * Continue the lookup of @address at the PMD level below @pudp.
 *
 * Non-leaf entries are handed to follow_page_pte(); devmap and other leaf
 * entries are followed under the PMD lock.  Returns whatever the chosen
 * helper returns, the result of no_page_table() when the entry is missing or
 * may not be followed, or ERR_PTR(-ENOMEM) if pte_alloc() fails after a
 * FOLL_SPLIT_PMD split.
 */
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
                                    unsigned long address, pud_t *pudp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        pmd_t *pmd, pmdval;
        spinlock_t *ptl;
        struct page *page;
        struct mm_struct *mm = vma->vm_mm;

        pmd = pmd_offset(pudp, address);
        /*
         * Sample the entry without taking the lock; it is read again under
         * the PMD lock below before a leaf entry is actually followed.
         */
        pmdval = pmdp_get_lockless(pmd);
        if (pmd_none(pmdval))
                return no_page_table(vma, flags, address);
        if (!pmd_present(pmdval))
                return no_page_table(vma, flags, address);
        if (pmd_devmap(pmdval)) {
                ptl = pmd_lock(mm, pmd);
                page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
                spin_unlock(ptl);
                if (page)
                        return page;
                return no_page_table(vma, flags, address);
        }
        /* Common case: the entry points to a PTE table. */
        if (likely(!pmd_leaf(pmdval)))
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

        if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
                return no_page_table(vma, flags, address);

        /* Re-read and re-check the entry now that the PMD lock is held. */
        ptl = pmd_lock(mm, pmd);
        pmdval = *pmd;
        if (unlikely(!pmd_present(pmdval))) {
                spin_unlock(ptl);
                return no_page_table(vma, flags, address);
        }
        if (unlikely(!pmd_leaf(pmdval))) {
                spin_unlock(ptl);
                return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
                /* The split is performed with the PMD lock released. */
                spin_unlock(ptl);
                split_huge_pmd(vma, pmd, address);
                /* If pmd was left empty, stuff a page table in there quickly */
                return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
                        follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
        }
        page = follow_huge_pmd(vma, address, pmd, flags, ctx);
        spin_unlock(ptl);
        return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
                                    unsigned long address, p4d_t *p4dp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        struct mm_struct *mm = vma->vm_mm;
        pud_t *pudp = pud_offset(p4dp, address);
        pud_t pud = READ_ONCE(*pudp);
        struct page *page;
        spinlock_t *ptl;

        if (!pud_present(pud))
                return no_page_table(vma, flags, address);

        /* Not a leaf: descend to the PMD level unless the entry is bad. */
        if (!pud_leaf(pud)) {
                if (unlikely(pud_bad(pud)))
                        return no_page_table(vma, flags, address);
                return follow_pmd_mask(vma, address, pudp, flags, ctx);
        }

        /* Leaf entry: follow it with the PUD lock held. */
        ptl = pud_lock(mm, pudp);
        page = follow_huge_pud(vma, address, pudp, flags, ctx);
        spin_unlock(ptl);

        if (!page)
                return no_page_table(vma, flags, address);
        return page;
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
                                    unsigned long address, pgd_t *pgdp,
                                    unsigned int flags,
                                    struct follow_page_context *ctx)
{
        p4d_t *p4dp = p4d_offset(pgdp, address);
        p4d_t p4d = READ_ONCE(*p4dp);

        /* No leaf handling exists at this level; enforce that at build time. */
        BUILD_BUG_ON(p4d_leaf(p4d));

        if (!p4d_present(p4d))
                return no_page_table(vma, flags, address);
        if (p4d_bad(p4d))
                return no_page_table(vma, flags, address);

        return follow_pud_mask(vma, address, p4dp, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
                              unsigned long address, unsigned int flags,
                              struct follow_page_context *ctx)
{
        struct page *page;
        pgd_t *pgd;

        vma_pgtable_walk_begin(vma);

        /* Default mask; the huge-page helpers overwrite it on success. */
        ctx->page_mask = 0;
        pgd = pgd_offset(vma->vm_mm, address);

        /* Descend only through a populated, sane top-level entry. */
        if (!pgd_none(*pgd) && !unlikely(pgd_bad(*pgd)))
                page = follow_p4d_mask(vma, address, pgd, flags, ctx);
        else
                page = no_page_table(vma, flags, address);

        vma_pgtable_walk_end(vma);

        return page;
}

/*
 * Resolve @address inside the gate area by walking the page tables by hand.
 *
 * On success 0 is returned and *@vma is set to the gate VMA.  When @page is
 * non-NULL, *@page receives the mapped page and a reference is taken on its
 * folio via try_grab_folio(); when @page is NULL only *@vma is filled in.
 *
 * Returns -EFAULT for write requests, for any missing page-table level, for
 * an empty PTE, or when no usable page backs the PTE.  An error from
 * try_grab_folio() is passed through unchanged.
 */
static int get_gate_page(struct mm_struct *mm, unsigned long address,
                unsigned int gup_flags, struct vm_area_struct **vma,
                struct page **page)
{
        pgd_t *pgdp;
        p4d_t *p4dp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep;
        pte_t pteval;
        int err;

        /* user gate pages are read-only */
        if (gup_flags & FOLL_WRITE)
                return -EFAULT;

        /* Addresses above TASK_SIZE are looked up in the kernel tables. */
        pgdp = (address > TASK_SIZE) ? pgd_offset_k(address)
                                     : pgd_offset_gate(mm, address);
        if (pgd_none(*pgdp))
                return -EFAULT;

        p4dp = p4d_offset(pgdp, address);
        if (p4d_none(*p4dp))
                return -EFAULT;

        pudp = pud_offset(p4dp, address);
        if (pud_none(*pudp))
                return -EFAULT;

        pmdp = pmd_offset(pudp, address);
        if (!pmd_present(*pmdp))
                return -EFAULT;

        ptep = pte_offset_map(pmdp, address);
        if (!ptep)
                return -EFAULT;

        /* From here on the PTE is mapped: every exit goes through unmap. */
        err = -EFAULT;
        pteval = ptep_get(ptep);
        if (pte_none(pteval))
                goto unmap;

        *vma = get_gate_vma(mm);
        if (page) {
                *page = vm_normal_page(*vma, address, pteval);
                if (!*page) {
                        /*
                         * No normal page: only the zero page is acceptable,
                         * and not even that for FOLL_DUMP.
                         */
                        if ((gup_flags & FOLL_DUMP) ||
                            !is_zero_pfn(pte_pfn(pteval)))
                                goto unmap;
                        *page = pte_page(pteval);
                }
                err = try_grab_folio(page_folio(*page), 1, gup_flags);
                if (unlikely(err))
                        goto unmap;
        }
        err = 0;
unmap:
        pte_unmap(ptep);
        return err;
}

/*
 * mmap_lock must be held on entry.  If @flags has FOLL_UNLOCKABLE but not
 * FOLL_NOWAIT, the mmap_lock may be released.  If it is, *@locked will be set
 * to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
                unsigned long address, unsigned int flags, bool unshare,
                int *locked)
{
        unsigned int fault_flags;
        vm_fault_t fault;

        /* The caller asked us never to fault anything in. */
        if (flags & FOLL_NOFAULT)
                return -EFAULT;

        /* Translate the FOLL_* request into FAULT_FLAG_* bits. */
        fault_flags = (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0;
        if (flags & FOLL_REMOTE)
                fault_flags |= FAULT_FLAG_REMOTE;
        /* FAULT_FLAG_TRIED may be combined with FAULT_FLAG_ALLOW_RETRY. */
        if (flags & FOLL_TRIED)
                fault_flags |= FAULT_FLAG_TRIED;
        if (flags & FOLL_NOWAIT)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
        if (flags & FOLL_UNLOCKABLE) {
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
                /*
                 * Interruptible faults are strictly opt-in through
                 * FOLL_INTERRUPTIBLE: not every GUP caller can cope with
                 * an early exit caused by a non-fatal signal.
                 */
                if (flags & FOLL_INTERRUPTIBLE)
                        fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
        }
        if (unshare) {
                /* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
                VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
                fault_flags |= FAULT_FLAG_UNSHARE;
        }

        fault = handle_mm_fault(vma, address, fault_flags, NULL);

        if (fault & VM_FAULT_COMPLETED) {
                /*
                 * A FAULT_FLAG_RETRY_NOWAIT fault is never expected to drop
                 * the mmap lock inside the fault handler; sanity check it.
                 */
                WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
                *locked = 0;

                /*
                 * Handled like VM_FAULT_RETRY, but -EBUSY would misdescribe
                 * what happened: the fault fully completed and the mmap
                 * lock is gone.  -EAGAIN says "take the mmap lock again".
                 */
                return -EAGAIN;
        }

        if (fault & VM_FAULT_ERROR) {
                int err = vm_fault_to_errno(fault, flags);

                /* An error bit that maps to no errno is a kernel bug. */
                if (!err)
                        BUG();
                return err;
        }

        if (!(fault & VM_FAULT_RETRY))
                return 0;

        /* With RETRY_NOWAIT the lock is still held; otherwise it is gone. */
        if (!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
                *locked = 0;
        return -EBUSY;
}

/*
 * Writing to file-backed mappings which require folio dirty tracking using GUP
 * is a fundamentally broken operation, as kernel write access to GUP mappings
 * do not adhere to the semantics expected by a file system.
 *
 * Consider the following scenario:-
 *
 * 1. A folio is written to via GUP which write-faults the memory, notifying
 *    the file system and dirtying the folio.
 * 2. Later, writeback is triggered, resulting in the folio being cleaned and
 *    the PTE being marked read-only.
 * 3. The GUP caller writes to the folio, as it is mapped read/write via the
 *    direct mapping.
 * 4. The GUP caller, now done with the page, unpins it and sets it dirty
 *    (though it does not have to).
 *
 * This results in both data being written to a folio without writenotify, and
 * the folio being dirtied unexpectedly (if the caller decides to do so).
 */
static bool writable_file_mapping_allowed(struct vm_area_struct *vma,
                                          unsigned long gup_flags)
{
        const unsigned long longterm_pin = FOLL_PIN | FOLL_LONGTERM;

        /*
         * Only the combination of pinning *and* long-term is rejected; it is
         * the most egregious case.  Anything else is allowed outright, and a
         * long-term pin is still fine when the VMA needs no dirty tracking,
         * since no problematic write can occur then either.
         */
        return (gup_flags & longterm_pin) != longterm_pin ||
               !vma_needs_dirty_tracking(vma);
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
        vm_flags_t vmflags = vma->vm_flags;
        int want_write = (gup_flags & FOLL_WRITE);
        int remote = (gup_flags & FOLL_REMOTE);
        bool is_anon = vma_is_anonymous(vma);

        /* Mapping types that GUP refuses outright. */
        if (vmflags & (VM_IO | VM_PFNMAP))
                return -EFAULT;

        /* FOLL_ANON callers only accept anonymous VMAs. */
        if ((gup_flags & FOLL_ANON) && !is_anon)
                return -EFAULT;

        /* Unsupported flag/VMA combinations. */
        if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
                return -EOPNOTSUPP;

        if ((gup_flags & FOLL_SPLIT_PMD) && is_vm_hugetlb_page(vma))
                return -EOPNOTSUPP;

        if (vma_is_secretmem(vma))
                return -EFAULT;

        if (want_write) {
                if (!is_anon && !writable_file_mapping_allowed(vma, gup_flags))
                        return -EFAULT;

                if (!(vmflags & VM_WRITE) || (vmflags & VM_SHADOW_STACK)) {
                        /*
                         * Writing here needs FOLL_FORCE, and even then only
                         * COW mappings qualify.  The write,force case used
                         * to do COW in a VM_MAYWRITE VM_SHARED !VM_WRITE
                         * vma so that ptrace could set a breakpoint in a
                         * read-only mapping of an executable without
                         * corrupting the file (yet only when that file had
                         * been opened for writing!).  Anon pages in shared
                         * mappings are surprising, so that is now rejected.
                         */
                        if (!(gup_flags & FOLL_FORCE) ||
                            !is_cow_mapping(vmflags))
                                return -EFAULT;
                }
        } else if (!(vmflags & VM_READ)) {
                /*
                 * Reading an unreadable VMA needs FOLL_FORCE plus
                 * VM_MAYREAD.  (Is there actually any vma we can reach here
                 * which does not have VM_MAYREAD set?)
                 */
                if (!(gup_flags & FOLL_FORCE) || !(vmflags & VM_MAYREAD))
                        return -EFAULT;
        }

        /*
         * gups are always data accesses, not instruction
         * fetches, so execute=false here
         */
        return arch_vma_access_permitted(vma, want_write, false, remote) ?
                0 : -EFAULT;
}

/*
 * This is "vma_lookup()", but with a warning if we would have
 * historically expanded the stack in the GUP code.
 */
static struct vm_area_struct *gup_vma_lookup(struct mm_struct *mm,
         unsigned long addr)
{
#ifdef CONFIG_STACK_GROWSUP
        return vma_lookup(mm, addr);
#else
        static volatile unsigned long warn_deadline;
        unsigned long stamp, deadline;
        struct vm_area_struct *vma = find_vma(mm, addr);

        /* Nothing found, or the VMA really contains addr: done. */
        if (!vma || addr >= vma->vm_start)
                return vma;

        /*
         * addr lies below the VMA that was found.  Only warn for half-way
         * relevant accesses: a grows-down VMA and a gap of at most 64KiB.
         */
        if (!(vma->vm_flags & VM_GROWSDOWN) || vma->vm_start - addr > 65536)
                return NULL;

        /* Let's not warn more than once an hour.. */
        stamp = jiffies;
        deadline = warn_deadline;
        if (deadline && time_before(stamp, deadline))
                return NULL;
        warn_deadline = stamp + 60*60*HZ;

        /* Let people know things may have changed. */
        pr_warn("GUP no longer grows the stack in %s (%d): %lx-%lx (%lx)\n",
                current->comm, task_pid_nr(current),
                vma->vm_start, vma->vm_end, addr);
        dump_stack();
        return NULL;
#endif
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying pin behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @locked:     whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held.  It may be released.  See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If FOLL_UNLOCKABLE is set without FOLL_NOWAIT then the mmap_lock may
 * be released. If this happens *@locked will be set to 0 on return.
 *
 * A caller using such a combination of @gup_flags must therefore hold the
 * mmap_lock for reading only, and recognize when it's been released. Otherwise,
 * it must be held for either reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                int *locked)
{
        /*
         * i counts the pages handled so far (and indexes @pages); ret holds
         * the status that is reported only when i is still 0 at exit.
         */
        long ret = 0, i = 0;
        struct vm_area_struct *vma = NULL;
        struct follow_page_context ctx = { NULL };

        if (!nr_pages)
                return 0;

        start = untagged_addr_remote(mm, start);

        /* @pages must be supplied exactly when FOLL_GET or FOLL_PIN is set. */
        VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

        do {
                struct page *page;
                unsigned int page_increm;

                /* first iteration or cross vma bound */
                if (!vma || start >= vma->vm_end) {
                        /*
                         * MADV_POPULATE_(READ|WRITE) wants to handle VMA
                         * lookups+error reporting differently.
                         */
                        if (gup_flags & FOLL_MADV_POPULATE) {
                                /* No VMA at start is reported as -ENOMEM. */
                                vma = vma_lookup(mm, start);
                                if (!vma) {
                                        ret = -ENOMEM;
                                        goto out;
                                }
                                /* Any flag-check failure becomes -EINVAL. */
                                if (check_vma_flags(vma, gup_flags)) {
                                        ret = -EINVAL;
                                        goto out;
                                }
                                goto retry;
                        }
                        vma = gup_vma_lookup(mm, start);
                        /*
                         * No VMA, but the address is in the gate area: let
                         * get_gate_page() resolve it (it also sets vma) and
                         * skip the regular page-table walk below.
                         */
                        if (!vma && in_gate_area(mm, start)) {
                                ret = get_gate_page(mm, start & PAGE_MASK,
                                                gup_flags, &vma,
                                                pages ? &page : NULL);
                                if (ret)
                                        goto out;
                                ctx.page_mask = 0;
                                goto next_page;
                        }

                        if (!vma) {
                                ret = -EFAULT;
                                goto out;
                        }
                        ret = check_vma_flags(vma, gup_flags);
                        if (ret)
                                goto out;
                }
retry:
                /*
                 * If we have a pending SIGKILL, don't keep faulting pages and
                 * potentially allocating memory.
                 */
                if (fatal_signal_pending(current)) {
                        ret = -EINTR;
                        goto out;
                }
                cond_resched();

                page = follow_page_mask(vma, start, gup_flags, &ctx);
                /*
                 * NULL means nothing is mapped yet; -EMLINK means the page
                 * must be unshared first.  Both are resolved by faulting.
                 */
                if (!page || PTR_ERR(page) == -EMLINK) {
                        ret = faultin_page(vma, start, gup_flags,
                                           PTR_ERR(page) == -EMLINK, locked);
                        switch (ret) {
                        case 0:
                                /* Fault handled: look the page up again. */
                                goto retry;
                        case -EBUSY:
                        case -EAGAIN:
                                /*
                                 * Not reported as an error: the caller sees
                                 * the pages handled so far, possibly 0.
                                 */
                                ret = 0;
                                fallthrough;
                        case -EFAULT:
                        case -ENOMEM:
                        case -EHWPOISON:
                                goto out;
                        }
                        /* Any other faultin_page() result is unexpected. */
                        BUG();
                } else if (PTR_ERR(page) == -EEXIST) {
                        /*
                         * Proper page table entry exists, but no corresponding
                         * struct page. If the caller expects **pages to be
                         * filled in, bail out now, because that can't be done
                         * for this page.
                         */
                        if (pages) {
                                ret = PTR_ERR(page);
                                goto out;
                        }
                } else if (IS_ERR(page)) {
                        ret = PTR_ERR(page);
                        goto out;
                }
next_page:
                /*
                 * Number of base pages from start up to the end of the unit
                 * described by ctx.page_mask (1 when the mask is 0), capped
                 * to what the caller still wants.  This presumes page_mask
                 * has the form 2^k - 1 -- confirm against the follow_*
                 * helpers that set it.
                 */
                page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
                if (page_increm > nr_pages)
                        page_increm = nr_pages;

                if (pages) {
                        struct page *subpage;
                        unsigned int j;

                        /*
                         * This must be a large folio (and doesn't need to
                         * be the whole folio; it can be part of it), do
                         * the refcount work for all the subpages too.
                         *
                         * NOTE: here the page may not be the head page
                         * e.g. when start addr is not thp-size aligned.
                         * try_grab_folio() should have taken care of tail
                         * pages.
                         */
                        if (page_increm > 1) {
                                struct folio *folio = page_folio(page);

                                /*
                                 * Since we already hold refcount on the
                                 * large folio, this should never fail.
                                 */
                                if (try_grab_folio(folio, page_increm - 1,
                                                   gup_flags)) {
                                        /*
                                         * Release the 1st page ref if the
                                         * folio is problematic, fail hard.
                                         */
                                        gup_put_folio(folio, 1, gup_flags);
                                        ret = -EFAULT;
                                        goto out;
                                }
                        }

                        /* Publish each subpage and flush it for the caller. */
                        for (j = 0; j < page_increm; j++) {
                                subpage = nth_page(page, j);
                                pages[i + j] = subpage;
                                flush_anon_page(vma, subpage, start + j * PAGE_SIZE);
                                flush_dcache_page(subpage);
                        }
                }

                /* Advance the cursor past everything handled in this pass. */
                i += page_increm;
                start += page_increm * PAGE_SIZE;
                nr_pages -= page_increm;
        } while (nr_pages);
out:
        /* Drop the dev_pagemap reference cached in ctx, if any. */
        if (ctx.pgmap)
                put_dev_pagemap(ctx.pgmap);
        /* A partial count takes precedence over a pending error in ret. */
        return i ? i : ret;
}

static bool vma_permits_fault(struct vm_area_struct *vma,
                              unsigned int fault_flags)
{
        bool write = !!(fault_flags & FAULT_FLAG_WRITE);
        bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
        /* The single VM_* permission bit this kind of fault requires. */
        vm_flags_t required = write ? VM_WRITE : VM_READ;

        /*
         * Besides the plain read/write permission, the architecture may
         * have its own hardware protection mechanism that can deny access.
         * gup always represents data access, not instruction fetches, so
         * execute=false here.
         */
        return (vma->vm_flags & required) &&
               arch_vma_access_permitted(vma, write, false, foreign);
}

/**
 * fixup_user_fault() - manually resolve a user page fault
 * @mm:                mm_struct of target mm
 * @address:        user address
 * @fault_flags:flags to pass down to handle_mm_fault()
 * @unlocked:        did we unlock the mmap_lock while retrying, maybe NULL if caller
 *                does not allow retry. If NULL, the caller must guarantee
 *                that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
 *
 * This is meant to be called in the specific scenario where for locking reasons
 * we try to access user memory in atomic context (within a pagefault_disable()
 * section), this returns -EFAULT, and we want to resolve the user fault before
 * trying again.
 *
 * Typically this is meant to be used by the futex code.
 *
 * The main difference with get_user_pages() is that this function will
 * unconditionally call handle_mm_fault() which will in turn perform all the
 * necessary SW fixup of the dirty and young bits in the PTE, while
 * get_user_pages() only guarantees to update these in the struct page.
 *
 * This is important for some architectures where those bits also gate the
 * access permission to the page because they are maintained in software.  On
 * such architectures, gup() will not be enough to make a subsequent access
 * succeed.
 *
 * This function never returns with the mmap_lock unlocked, so its semantics
 * with respect to @mm->mmap_lock differ from those of filemap_fault().
 */
int fixup_user_fault(struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags,
                     bool *unlocked)
{
        struct vm_area_struct *vma;
        vm_fault_t fault;

        address = untagged_addr_remote(mm, address);

        /* A caller that passes @unlocked accepts retries and fatal signals. */
        if (unlocked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

        /* Each pass redoes the VMA lookup and checks, then faults once. */
        for (;;) {
                vma = gup_vma_lookup(mm, address);
                if (!vma)
                        return -EFAULT;

                if (!vma_permits_fault(vma, fault_flags))
                        return -EFAULT;

                if ((fault_flags & FAULT_FLAG_KILLABLE) &&
                    fatal_signal_pending(current))
                        return -EINTR;

                fault = handle_mm_fault(vma, address, fault_flags, NULL);

                if (fault & VM_FAULT_COMPLETED) {
                        /*
                         * NOTE: it's a pity that the lock has to be retaken
                         * here just to pair with the unlock() in the
                         * callers.  Ideally the callers could be told that
                         * they do not need to unlock.
                         */
                        mmap_read_lock(mm);
                        *unlocked = true;
                        return 0;
                }

                if (fault & VM_FAULT_ERROR) {
                        int err = vm_fault_to_errno(fault, 0);

                        /* An error bit that maps to no errno is a bug. */
                        if (!err)
                                BUG();
                        return err;
                }

                if (!(fault & VM_FAULT_RETRY))
                        return 0;

                /* Retake the lock, record the drop, and go around again. */
                mmap_read_lock(mm);
                *unlocked = true;
                fault_flags |= FAULT_FLAG_TRIED;
        }
}
EXPORT_SYMBOL_GPL(fixup_user_fault);

/*
 * GUP always responds to fatal signals.  When FOLL_INTERRUPTIBLE is
 * specified, it'll also respond to generic signals.  The caller of GUP
 * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption.
 */
static bool gup_signal_pending(unsigned int flags)
{
        /*
         * A fatal signal always counts; any other pending signal counts
         * only when the caller opted in with FOLL_INTERRUPTIBLE.
         */
        return fatal_signal_pending(current) ||
               ((flags & FOLL_INTERRUPTIBLE) && signal_pending(current));
}

/*
 * Locking: (*locked == 1) means that the mmap_lock has already been acquired by
 * the caller. This function may drop the mmap_lock. If it does so, then it will
 * set (*locked = 0).
 *
 * (*locked == 0) means that the caller expects this function to acquire and
 * drop the mmap_lock. Therefore, the value of *locked will still be zero when
 * the function returns, even though it may have changed temporarily during
 * function execution.
 *
 * Please note that this function, unlike __get_user_pages(), will not return 0
 * for nr_pages > 0, unless FOLL_NOWAIT is used.
 */
static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
                                                unsigned long start,
                                                unsigned long nr_pages,
                                                struct page **pages,
                                                int *locked,
                                                unsigned int flags)
{
        /*
         * ret is the result of the most recent call; pages_done is the
         * running total (or the first error when nothing was done yet).
         * must_unlock records that this function has to release the
         * mmap_lock before returning.
         */
        long ret, pages_done;
        bool must_unlock = false;

        if (!nr_pages)
                return 0;

        /*
         * The internal caller expects GUP to manage the lock internally and the
         * lock must be released when this returns.
         */
        if (!*locked) {
                /* A failed killable lock attempt is reported as -EAGAIN. */
                if (mmap_read_lock_killable(mm))
                        return -EAGAIN;
                must_unlock = true;
                *locked = 1;
        }
        else
                mmap_assert_locked(mm);

        if (flags & FOLL_PIN)
                mm_set_has_pinned_flag(&mm->flags);

        /*
         * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
         * is to set FOLL_GET if the caller wants pages[] filled in (but has
         * carelessly failed to specify FOLL_GET), so keep doing that, but only
         * for FOLL_GET, not for the newer FOLL_PIN.
         *
         * FOLL_PIN always expects pages to be non-null, but no need to assert
         * that here, as any failures will be obvious enough.
         */
        if (pages && !(flags & FOLL_PIN))
                flags |= FOLL_GET;

        pages_done = 0;
        for (;;) {
                /* Bulk attempt over everything that is still outstanding. */
                ret = __get_user_pages(mm, start, nr_pages, flags, pages,
                                       locked);
                if (!(flags & FOLL_UNLOCKABLE)) {
                        /* VM_FAULT_RETRY couldn't trigger, bypass */
                        pages_done = ret;
                        break;
                }

                /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
                if (!*locked) {
                        BUG_ON(ret < 0);
                        BUG_ON(ret >= nr_pages);
                }

                /* Account for whatever the bulk attempt managed to handle. */
                if (ret > 0) {
                        nr_pages -= ret;
                        pages_done += ret;
                        if (!nr_pages)
                                break;
                }
                if (*locked) {
                        /*
                         * VM_FAULT_RETRY didn't trigger or it was a
                         * FOLL_NOWAIT.
                         */
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }
                /*
                 * VM_FAULT_RETRY triggered, so seek to the faulting offset.
                 * For the prefault case (!pages) we only update counts.
                 */
                if (likely(pages))
                        pages += ret;
                start += ret << PAGE_SHIFT;

                /* The lock was temporarily dropped, so we must unlock later */
                must_unlock = true;

retry:
                /*
                 * Repeat on the address that fired VM_FAULT_RETRY
                 * with both FAULT_FLAG_ALLOW_RETRY and
                 * FAULT_FLAG_TRIED.  Note that GUP can be interrupted
                 * by fatal signals or even common signals, depending on
                 * the caller's request. So we need to check it before we
                 * start trying again otherwise it can loop forever.
                 */
                if (gup_signal_pending(flags)) {
                        if (!pages_done)
                                pages_done = -EINTR;
                        break;
                }

                /* Retake the lock that the fault handler dropped. */
                ret = mmap_read_lock_killable(mm);
                if (ret) {
                        BUG_ON(ret > 0);
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }

                /* Retry exactly one page, marked as already tried. */
                *locked = 1;
                ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
                                       pages, locked);
                if (!*locked) {
                        /* Continue to retry until we succeeded */
                        BUG_ON(ret != 0);
                        goto retry;
                }
                /* Lock still held but the page was not obtained: stop. */
                if (ret != 1) {
                        BUG_ON(ret > 1);
                        if (!pages_done)
                                pages_done = ret;
                        break;
                }
                /* Single page done: step past it and resume bulk mode. */
                nr_pages--;
                pages_done++;
                if (!nr_pages)
                        break;
                if (likely(pages))
                        pages++;
                start += PAGE_SIZE;
        }
        if (must_unlock && *locked) {
                /*
                 * We either temporarily dropped the lock, or the caller
                 * requested that we both acquire and drop the lock. Either way,
                 * we must now unlock, and notify the caller of that state.
                 */
                mmap_read_unlock(mm);
                *locked = 0;
        }

        /*
         * Failing to pin anything implies something has gone wrong (except when
         * FOLL_NOWAIT is specified).
         */
        if (WARN_ON_ONCE(pages_done == 0 && !(flags & FOLL_NOWAIT)))
                return -EFAULT;

        return pages_done;
}

/**
 * populate_vma_page_range() -  populate a range of pages in the vma.
 * @vma:   target vma
 * @start: start address
 * @end:   end address
 * @locked: whether the mmap_lock is still held
 *
 * This takes care of mlocking the pages too if VM_LOCKED is set.
 *
 * Return either number of pages pinned in the vma, or a negative error
 * code on error.
 *
 * vma->vm_mm->mmap_lock must be held.
 *
 * If @locked is NULL, it may be held for read or write and will
 * be unperturbed.
 *
 * If @locked is non-NULL, it must be held for read only and may be
 * released.  If it's released, *@locked will be set to 0.
 */
long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long nr_pages = (end - start) / PAGE_SIZE;
        int local_locked = 1;
        int gup_flags = FOLL_TOUCH;
        long nr_done;

        /* The range must be page aligned and lie entirely within @vma. */
        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));
        VM_BUG_ON_VMA(start < vma->vm_start, vma);
        VM_BUG_ON_VMA(end   > vma->vm_end, vma);
        mmap_assert_locked(mm);

        /*
         * Rightly or wrongly, the VM_LOCKONFAULT case has never used
         * faultin_page() to break COW, so there is nothing to do: report
         * the whole range as handled.
         */
        if (vma->vm_flags & VM_LOCKONFAULT)
                return nr_pages;

        /* ... similarly, we've never faulted in PROT_NONE pages */
        if (!vma_is_accessible(vma))
                return -EFAULT;

        /*
         * Writable private mappings are touched with a write fault so that
         * COW gets broken.  Shared mappings don't COW and we would not want
         * to dirty them for nothing, so they - like everything else - get a
         * read fault, with FOLL_FORCE in case the mapping is not readable
         * (ie write-only or executable).
         */
        gup_flags |= ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) ?
                        FOLL_WRITE : FOLL_FORCE;

        /* Without a caller-provided @locked the lock may not be dropped. */
        if (locked)
                gup_flags |= FOLL_UNLOCKABLE;
        else
                locked = &local_locked;

        /*
         * We made sure addr is within a VMA, so the following will
         * not result in a stack expansion that recurses back here.
         */
        nr_done = __get_user_pages(mm, start, nr_pages, gup_flags,
                                   NULL, locked);
        lru_add_drain();
        return nr_done;
}

/*
 * faultin_page_range() - populate (prefault) page tables inside the
 *                          given range readable/writable
 *
 * This takes care of mlocking the pages, too, if VM_LOCKED is set.
 *
 * @mm: the mm to populate page tables in
 * @start: start address
 * @end: end address
 * @write: whether to prefault readable or writable
 * @locked: whether the mmap_lock is still held
 *
 * Returns either number of processed pages in the MM, or a negative error
 * code on error (see __get_user_pages()). Note that this function reports
 * errors related to VMAs, such as incompatible mappings, as expected by
 * MADV_POPULATE_(READ|WRITE).
 *
 * The range must be page-aligned.
 *
 * mm->mmap_lock must be held. If it's released, *@locked will be set to 0.
 */
long faultin_page_range(struct mm_struct *mm, unsigned long start,
                        unsigned long end, bool write, int *locked)
{
        unsigned long nr_pages = (end - start) / PAGE_SIZE;
        /*
         * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
         *               the page dirty with FOLL_WRITE -- which doesn't make a
         *               difference with !FOLL_FORCE, because the page is writable
         *               in the page table.
         * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
         *                  a poisoned page.
         * !FOLL_FORCE: Require proper access permissions.
         */
        int gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_UNLOCKABLE |
                        FOLL_MADV_POPULATE;
        long nr_done;

        VM_BUG_ON(!PAGE_ALIGNED(start));
        VM_BUG_ON(!PAGE_ALIGNED(end));
        mmap_assert_locked(mm);

        /* Prefault writable rather than merely readable when asked to. */
        if (write)
                gup_flags |= FOLL_WRITE;

        /* No pages[] array is passed: this only populates page tables. */
        nr_done = __get_user_pages_locked(mm, start, nr_pages, NULL, locked,
                                          gup_flags);
        lru_add_drain();
        return nr_done;
}

/*
 * __mm_populate - populate and/or mlock pages within a range of address space.
 *
 * Implements mlock() and the MAP_POPULATE / MAP_LOCKED mmap flags. The VMAs
 * must already carry the desired vm_flags, and mmap_lock must not be held on
 * entry: it is taken here for reading and dropped before returning.
 *
 * Returns 0, or a negative error code when populating a VMA fails and
 * @ignore_errors is zero.
 */
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        unsigned long end, cur, next;
        int locked = 0;
        long ret = 0;

        end = start + len;

        for (cur = start; cur < end; cur = next) {
                /*
                 * Pages for [cur; end) still need faulting in. Look up the
                 * first VMA touching that range: from scratch whenever the
                 * lock is not held, otherwise only once the current VMA has
                 * been fully consumed.
                 */
                if (locked) {
                        if (cur >= vma->vm_end)
                                vma = find_vma_intersection(mm, vma->vm_end,
                                                            end);
                } else {
                        locked = 1;
                        mmap_read_lock(mm);
                        vma = find_vma_intersection(mm, cur, end);
                }

                if (!vma)
                        break;

                /*
                 * Clamp [cur; next) to the part of the requested range that
                 * lies inside this VMA, skipping undesirable VMA types.
                 */
                next = min(end, vma->vm_end);
                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                        continue;
                if (cur < vma->vm_start)
                        cur = vma->vm_start;

                /*
                 * Fault the pages in. populate_vma_page_range() double
                 * checks the vma flags, so it won't mlock pages if the vma
                 * was already munlocked.
                 */
                ret = populate_vma_page_range(vma, cur, next, &locked);
                if (ret >= 0) {
                        /* Resume right after the pages actually handled. */
                        next = cur + ret * PAGE_SIZE;
                        ret = 0;
                        continue;
                }

                if (!ignore_errors)
                        break;

                /* Error ignored: carry on with the next VMA. */
                ret = 0;
        }

        if (locked)
                mmap_read_unlock(mm);
        return ret;        /* 0 or negative error code */
}
#else /* CONFIG_MMU */
/*
 * !CONFIG_MMU flavour of __get_user_pages_locked().
 *
 * Walks @nr_pages page-sized steps from @start, stopping at the first address
 * that has no VMA, sits in a VM_IO/VM_PFNMAP VMA, or lacks the permissions
 * implied by @foll_flags. When @pages is non-NULL each visited page is stored
 * there and a reference is taken on it.
 *
 * Returns the number of pages processed, 0 if @nr_pages is 0, -EAGAIN if the
 * mmap lock could not be taken, or -EFAULT if not a single page was processed.
 */
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
                unsigned long nr_pages, struct page **pages,
                int *locked, unsigned int foll_flags)
{
        unsigned long wanted;
        bool took_lock = false;
        long done;

        if (nr_pages == 0)
                return 0;

        /*
         * If the lock is not held on entry, the internal caller expects GUP
         * to manage it and to have released it again by the time we return.
         */
        if (!*locked) {
                if (mmap_read_lock_killable(mm))
                        return -EAGAIN;
                took_lock = true;
                *locked = 1;
        }

        /*
         * Work out the read or write permission bits that are required.
         * With FOLL_FORCE only the "MAY" variants are demanded.
         */
        if (foll_flags & FOLL_WRITE)
                wanted = VM_WRITE | VM_MAYWRITE;
        else
                wanted = VM_READ | VM_MAYREAD;

        if (foll_flags & FOLL_FORCE)
                wanted &= VM_MAYREAD | VM_MAYWRITE;
        else
                wanted &= VM_READ | VM_WRITE;

        for (done = 0; done < nr_pages; done++) {
                struct vm_area_struct *vma = find_vma(mm, start);

                if (!vma)
                        break;

                /* protect what we can, including chardevs */
                if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                        break;
                if (!(wanted & vma->vm_flags))
                        break;

                if (pages) {
                        struct page *page = virt_to_page((void *)start);

                        pages[done] = page;
                        if (page)
                                get_page(page);
                }

                start = (start + PAGE_SIZE) & PAGE_MASK;
        }

        if (took_lock && *locked) {
                mmap_read_unlock(mm);
                *locked = 0;
        }

        if (!done)
                return -EFAULT;
        return done;
}
#endif /* !CONFIG_MMU */

/**
 * fault_in_writeable - fault in userspace address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Stores a zero byte at @uaddr and then at the start of each following page
 * of the range, so the probed bytes are overwritten.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
        char __user *const base = uaddr;
        char __user *limit;
        size_t done;

        if (unlikely(!size))
                return 0;
        if (!user_write_access_begin(uaddr, size))
                return size;

        /* Probe the unaligned head, then move on to page boundaries. */
        if (!PAGE_ALIGNED(uaddr)) {
                unsafe_put_user(0, uaddr, out);
                uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
        }

        limit = (char __user *)PAGE_ALIGN((unsigned long)base + size);
        if (unlikely(limit < base))
                limit = NULL;        /* the aligned end wrapped around */

        for (; uaddr != limit; uaddr += PAGE_SIZE) {
                unsafe_put_user(0, uaddr, out);
        }

out:
        user_write_access_end();

        /* uaddr may have been rounded up past the requested range. */
        done = uaddr - base;
        return size > done ? size - done : 0;
}
EXPORT_SYMBOL(fault_in_writeable);

/**
 * fault_in_subpage_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Fault in a user address range for writing and additionally check
 * permissions at sub-page granularity (e.g. arm64 MTE). Intended for callers
 * that cannot guarantee forward progress of a copy_to_user() loop.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
{
        size_t usable;

        /*
         * Fault in at page granularity first so that page table permissions
         * get checked; the arch-specific probe_subpage_writeable() functions
         * may not check for this.
         */
        usable = size - fault_in_writeable(uaddr, size);
        if (!usable)
                return size;

        /* Trim whatever the sub-page probe could not confirm. */
        usable -= probe_subpage_writeable(uaddr, usable);
        return size - usable;
}
EXPORT_SYMBOL(fault_in_subpage_writeable);

/**
 * fault_in_safe_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: length of address range
 *
 * Faults in an address range for writing. Mostly useful when it is already
 * known that some or all of the pages in the range aren't in memory.
 *
 * Unlike fault_in_writeable(), this function is non-destructive.
 *
 * The pages that get faulted in are neither pinned nor otherwise held, so
 * there is no guarantee that they stay in memory for any length of time.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 */
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
        const unsigned long base = (unsigned long)uaddr;
        unsigned long cursor = base, limit;
        struct mm_struct *mm = current->mm;
        bool unlocked = false;
        unsigned long done;

        if (unlikely(!size))
                return 0;

        limit = PAGE_ALIGN(base + size);
        if (limit < base)
                limit = 0;        /* the aligned end wrapped around */

        mmap_read_lock(mm);
        for (;;) {
                if (fixup_user_fault(mm, cursor, FAULT_FLAG_WRITE, &unlocked))
                        break;
                /* Step to the start of the next page. */
                cursor = (cursor + PAGE_SIZE) & PAGE_MASK;
                if (cursor == limit)
                        break;
        }
        mmap_read_unlock(mm);

        /* cursor may have been rounded up past the requested range. */
        done = cursor - base;
        return size > done ? size - done : 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);

/**
 * fault_in_readable - fault in userspace address range for reading
 * @uaddr: start of user address range
 * @size: size of user address range
 *
 * Reads one byte at @uaddr and then at the start of each following page of
 * the range; the values read are discarded.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
        const char __user *const base = uaddr;
        const char __user *limit;
        volatile char c;
        size_t done;

        if (unlikely(!size))
                return 0;
        if (!user_read_access_begin(uaddr, size))
                return size;

        /* Probe the unaligned head, then move on to page boundaries. */
        if (!PAGE_ALIGNED(uaddr)) {
                unsafe_get_user(c, uaddr, out);
                uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
        }

        limit = (const char __user *)PAGE_ALIGN((unsigned long)base + size);
        if (unlikely(limit < base))
                limit = NULL;        /* the aligned end wrapped around */

        for (; uaddr != limit; uaddr += PAGE_SIZE) {
                unsafe_get_user(c, uaddr, out);
        }

out:
        user_read_access_end();
        (void)c;

        /* uaddr may have been rounded up past the requested range. */
        done = uaddr - base;
        return size > done ? size - done : 0;
}
EXPORT_SYMBOL(fault_in_readable);

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 * @locked: a pointer to an int denoting whether the mmap sem is held
 *
 * Returns struct page pointer of user page pinned for dump,
 * to be freed afterwards by put_page().
 *
 * Returns NULL on any kind of failure - a hole must then be inserted into
 * the corefile, to preserve alignment with its headers; and also returns
 * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
 * allowing a hole to be left in the corefile to save disk space.
 *
 * Called without mmap_lock (takes and releases the mmap_lock by itself).
 */
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr, int *locked)
{
        struct page *page;
        int nr;

        /* Ask for exactly one page, with a reference taken (FOLL_GET). */
        nr = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
                                     FOLL_FORCE | FOLL_DUMP | FOLL_GET);
        if (nr != 1)
                return NULL;

        return page;
}
#endif /* CONFIG_ELF_CORE */

#ifdef CONFIG_MIGRATION

/*
 * An array of either pages or folios ("pofs"). Although it may seem tempting to
 * avoid this complication, by simply interpreting a list of folios as a list of
 * pages, that approach won't work in the longer term, because eventually the
 * layouts of struct page and struct folio will become completely different.
 * Furthermore, this pof approach avoids excessive page_folio() calls.
 */
struct pages_or_folios {
        /* Three views of the same array; @has_folios selects the typed one. */
        union {
                struct page **pages;        /* read when !has_folios */
                struct folio **folios;        /* read when has_folios */
                void **entries;                /* untyped view, used to NULL a slot */
        };
        bool has_folios;        /* true: array holds folios; false: pages */
        long nr_entries;        /* number of slots in the array */
};

/* Fetch slot @i of @pofs as a folio, converting from a page when needed. */
static struct folio *pofs_get_folio(struct pages_or_folios *pofs, long i)
{
        if (!pofs->has_folios)
                return page_folio(pofs->pages[i]);

        return pofs->folios[i];
}

/* Reset slot @i of @pofs to NULL through the untyped view of the array. */
static void pofs_clear_entry(struct pages_or_folios *pofs, long i)
{
        void **slot = pofs->entries + i;

        *slot = NULL;
}

/* Unpin the whole array, using the unpin routine matching its element type. */
static void pofs_unpin(struct pages_or_folios *pofs)
{
        if (!pofs->has_folios) {
                unpin_user_pages(pofs->pages, pofs->nr_entries);
                return;
        }

        unpin_folios(pofs->folios, pofs->nr_entries);
}

/*
 * Walk @pofs and identify every folio that must not stay long-term pinned.
 *
 * Folios that could be isolated (hugetlb or LRU) are queued on
 * @movable_folio_list. Device coherent folios are counted but not queued;
 * migrate_longterm_unpinnable_folios() migrates them on its own. A folio that
 * could not be isolated is counted as well, so that the caller still unpins
 * and retries instead of silently leaving it long-term pinned.
 *
 * Consecutive entries referring to the same folio are only looked at once.
 *
 * Returns the number of collected folios. Return value is always >= 0.
 */
static unsigned long collect_longterm_unpinnable_folios(
                struct list_head *movable_folio_list,
                struct pages_or_folios *pofs)
{
        unsigned long i, collected = 0;
        struct folio *prev_folio = NULL;
        bool drain_allow = true;

        for (i = 0; i < pofs->nr_entries; i++) {
                struct folio *folio = pofs_get_folio(pofs, i);

                if (folio == prev_folio)
                        continue;
                prev_folio = folio;

                if (folio_is_longterm_pinnable(folio))
                        continue;

                /*
                 * Count before any of the skips below: the decision whether
                 * migration is needed must not depend on whether the folio
                 * ended up on the list.
                 */
                collected++;

                if (folio_is_device_coherent(folio))
                        continue;

                if (folio_test_hugetlb(folio)) {
                        folio_isolate_hugetlb(folio, movable_folio_list);
                        continue;
                }

                /* Drain the LRU caches at most once per call. */
                if (!folio_test_lru(folio) && drain_allow) {
                        lru_add_drain_all();
                        drain_allow = false;
                }

                if (!folio_isolate_lru(folio))
                        continue;

                list_add_tail(&folio->lru, movable_folio_list);
                node_stat_mod_folio(folio,
                                    NR_ISOLATED_ANON + folio_is_file_lru(folio),
                                    folio_nr_pages(folio));
        }

        return collected;
}

/*
 * Unpins all folios and migrates device coherent folios and movable_folio_list.
 * Returns -EAGAIN if all folios were successfully migrated or -errno for
 * failure (or partial success).
 */
static int
migrate_longterm_unpinnable_folios(struct list_head *movable_folio_list,
                                   struct pages_or_folios *pofs)
{
        int ret;
        unsigned long i;

        for (i = 0; i < pofs->nr_entries; i++) {
                struct folio *folio = pofs_get_folio(pofs, i);

                if (folio_is_device_coherent(folio)) {
                        /*
                         * Migration will fail if the folio is pinned, so
                         * convert the pin on the source folio to a normal
                         * reference.
                         */
                        pofs_clear_entry(pofs, i);
                        folio_get(folio);
                        gup_put_folio(folio, 1, FOLL_PIN);

                        if (migrate_device_coherent_folio(folio)) {
                                ret = -EBUSY;
                                goto err;
                        }

                        continue;
                }

                /*
                 * We can't migrate folios with unexpected references, so drop
                 * the reference obtained by __get_user_pages_locked().
                 * Migrating folios have been added to movable_folio_list after
                 * calling folio_isolate_lru() which takes a reference so the
                 * folio won't be freed if it's migrating.
                 */
                unpin_folio(folio);
                pofs_clear_entry(pofs, i);
        }

        if (!list_empty(movable_folio_list)) {
                struct migration_target_control mtc = {
                        .nid = NUMA_NO_NODE,
                        .gfp_mask = GFP_USER | __GFP_NOWARN,
                        .reason = MR_LONGTERM_PIN,
                };

                if (migrate_pages(movable_folio_list, alloc_migration_target,
                                  NULL, (unsigned long)&mtc, MIGRATE_SYNC,
                                  MR_LONGTERM_PIN, NULL)) {
                        ret = -ENOMEM;
                        goto err;
                }
        }

        putback_movable_pages(movable_folio_list);

        return -EAGAIN;

err:
        /* Entries already handled above were cleared to NULL. */
        pofs_unpin(pofs);
        putback_movable_pages(movable_folio_list);

        return ret;
}

/*
 * Common implementation behind check_and_migrate_movable_pages() and
 * check_and_migrate_movable_folios().
 *
 * Returns 0 with all pins left in place when no entry needs to move.
 * Otherwise returns the result of migrate_longterm_unpinnable_folios().
 */
static long
check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
{
        LIST_HEAD(movable_folio_list);
        unsigned long collected;

        collected = collect_longterm_unpinnable_folios(&movable_folio_list,
                                                       pofs);
        /*
         * Decide on the count, not on list_empty(): device coherent folios
         * and folios that failed isolation are never put on the list, yet
         * they must not remain long-term pinned.
         */
        if (!collected)
                return 0;

        return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
}

/*
 * Check whether all folios are *allowed* to be pinned indefinitely (long term).
 * Rather confusingly, all folios in the range are required to be pinned via
 * FOLL_PIN, before calling this routine.
 *
 * Return values:
 *
 * 0: if everything is OK and all folios in the range are allowed to be pinned,
 * then this routine leaves all folios pinned and returns zero for success.
 *
 * -EAGAIN: if any folios in the range are not allowed to be pinned, then this
 * routine will migrate those folios away, unpin all the folios in the range. If
 * migration of the entire set of folios succeeds, then -EAGAIN is returned. The
 * caller should re-pin the entire range with FOLL_PIN and then call this
 * routine again.
 *
 * -ENOMEM, or any other -errno: if an error *other* than -EAGAIN occurs, this
 * indicates a migration failure. The caller should give up, and propagate the
 * error back up the call stack. The caller does not need to unpin any folios in
 * that case, because this routine will do the unpinning.
 */
static long check_and_migrate_movable_folios(unsigned long nr_folios,
                                             struct folio **folios)
{
        struct pages_or_folios pofs;

        /* Describe the caller's array as a folio-typed pofs. */
        pofs.folios = folios;
        pofs.has_folios = true;
        pofs.nr_entries = nr_folios;

        return check_and_migrate_movable_pages_or_folios(&pofs);
}

/*
 * Page-array counterpart of check_and_migrate_movable_folios(); return values
 * and behavior are the same.
 */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
                                            struct page **pages)
{
        struct pages_or_folios pofs;

        /* Describe the caller's array as a page-typed pofs. */
        pofs.pages = pages;
        pofs.has_folios = false;
        pofs.nr_entries = nr_pages;

        return check_and_migrate_movable_pages_or_folios(&pofs);
}
#else
/*
 * !CONFIG_MIGRATION stub: performs no check and always returns 0, leaving the
 * @pages array untouched.
 */
static long check_and_migrate_movable_pages(unsigned long nr_pages,
                                            struct page **pages)
{
        return 0;
}

/*
 * !CONFIG_MIGRATION stub: performs no check and always returns 0, leaving the
 * @folios array untouched.
 */
static long check_and_migrate_movable_folios(unsigned long nr_folios,
                                             struct folio **folios)
{
        return 0;
}
#endif /* CONFIG_MIGRATION */

/*
 * __gup_longterm_locked() wraps __get_user_pages_locked() and adds the
 * FOLL_LONGTERM handling: after pinning, the pages are run through
 * check_and_migrate_movable_pages(), and the pin is repeated for as long as
 * that check answers -EAGAIN.
 *
 * Without FOLL_LONGTERM this is a plain call to __get_user_pages_locked().
 */
static long __gup_longterm_locked(struct mm_struct *mm,
                                  unsigned long start,
                                  unsigned long nr_pages,
                                  struct page **pages,
                                  int *locked,
                                  unsigned int gup_flags)
{
        long rc, nr_pinned_pages;
        unsigned int flags;

        if (!(gup_flags & FOLL_LONGTERM))
                return __get_user_pages_locked(mm, start, nr_pages, pages,
                                               locked, gup_flags);

        flags = memalloc_pin_save();
        for (;;) {
                nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
                                                          pages, locked,
                                                          gup_flags);
                if (nr_pinned_pages <= 0) {
                        /* Nothing pinned: hand back 0 or the error as is. */
                        rc = nr_pinned_pages;
                        break;
                }

                /* FOLL_LONGTERM implies FOLL_PIN */
                rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
                if (rc != -EAGAIN)
                        break;
        }
        memalloc_pin_restore(flags);

        if (rc)
                return rc;
        return nr_pinned_pages;
}

/*
 * Validate the flags handed to the exported gup/pup interface and merge in
 * the flags (@to_set) that the calling wrapper requires.
 *
 * On success the combined flags are written back through @gup_flags_p and
 * true is returned. On any violation a one-time warning is raised, false is
 * returned and *@gup_flags_p is left unmodified.
 */
static bool is_valid_gup_args(struct page **pages, int *locked,
                              unsigned int *gup_flags_p, unsigned int to_set)
{
        const unsigned int get_and_pin = FOLL_GET | FOLL_PIN;
        unsigned int flags = *gup_flags_p;

        /*
         * Flags that external callers of the gup interfaces may not pass:
         * - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
         * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
         * - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
         */
        if (WARN_ON_ONCE(flags & INTERNAL_GUP_FLAGS))
                return false;

        flags |= to_set;

        if (locked) {
                /* At the external interface locked must be set */
                if (WARN_ON_ONCE(*locked != 1))
                        return false;

                flags |= FOLL_UNLOCKABLE;
        }

        /* FOLL_GET and FOLL_PIN are mutually exclusive. */
        if (WARN_ON_ONCE((flags & get_and_pin) == get_and_pin))
                return false;

        /* LONGTERM can only be specified when pinning */
        if (WARN_ON_ONCE((flags & FOLL_LONGTERM) && !(flags & FOLL_PIN)))
                return false;

        /* Pages input must be given if using GET/PIN */
        if (WARN_ON_ONCE(!pages && (flags & get_and_pin)))
                return false;

        /* We want to allow the pgmap to be hot-unplugged at all times */
        if (WARN_ON_ONCE((flags & FOLL_PCI_P2PDMA) &&
                         (flags & FOLL_LONGTERM)))
                return false;

        *gup_flags_p = flags;
        return true;
}

#ifdef CONFIG_MMU
/**
 * get_user_pages_remote() - pin user pages in memory
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long. Or NULL, if caller
 *                only intends to ensure the pages are faulted in.
 * @locked:        pointer to lock flag indicating whether lock is held and
 *                subsequently whether VM_FAULT_RETRY functionality can be
 *                utilised. Lock must initially be held.
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * Must be called with mmap_lock held for read or write.
 *
 * get_user_pages_remote walks a process's page tables and takes a reference
 * to each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * get_user_pages_remote returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re-faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
 * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
 * be called after the page is finished with, and before put_page is called.
 *
 * get_user_pages_remote is typically used for fewer-copy IO operations,
 * to get a handle on the memory by some means other than accesses
 * via the user virtual addresses. The pages may be submitted for
 * DMA to devices or accessed via their kernel linear mapping (via the
 * kmap APIs). Care should be taken to use the correct cache flushing APIs.
 *
 * See also get_user_pages_fast, for performance critical applications.
 *
 * get_user_pages_remote should be phased out in favor of
 * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
 * should use get_user_pages_remote because it cannot pass
 * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
 */
long get_user_pages_remote(struct mm_struct *mm,
                unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages,
                int *locked)
{
        const unsigned int required = FOLL_TOUCH | FOLL_REMOTE;
        int local_locked = 1;

        /* Validate against the caller's @locked, which may be NULL. */
        if (!is_valid_gup_args(pages, locked, &gup_flags, required))
                return -EINVAL;

        /* Callers passing no @locked get a private flag preset to 1. */
        if (!locked)
                locked = &local_locked;

        return __get_user_pages_locked(mm, start, nr_pages, pages, locked,
                                       gup_flags);
}
EXPORT_SYMBOL(get_user_pages_remote);

#else /* CONFIG_MMU */
/* !CONFIG_MMU stub: ignores all arguments and always returns 0. */
long get_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        return 0;
}
#endif /* !CONFIG_MMU */

/**
 * get_user_pages() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying lookup behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long. Or NULL, if caller
 *              only intends to ensure the pages are faulted in.
 *
 * Same as get_user_pages_remote(), but with a less flexible calling
 * convention: the mm operated on is always that of the current task and no
 * locked parameter can be passed. FOLL_REMOTE is not set here.
 *
 * Returns -EINVAL if the arguments fail validation, otherwise the result of
 * __get_user_pages_locked().
 */
long get_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages)
{
        int locked = 1;

        if (is_valid_gup_args(pages, NULL, &gup_flags, FOLL_TOUCH))
                return __get_user_pages_locked(current->mm, start, nr_pages,
                                               pages, &locked, gup_flags);

        return -EINVAL;
}
EXPORT_SYMBOL(get_user_pages);

/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *      mmap_read_lock(current->mm);
 *      get_user_pages(start, nr_pages, gup_flags, pages);
 *      mmap_read_unlock(current->mm);
 *
 *  with:
 *
 *      get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        const unsigned int required = FOLL_TOUCH | FOLL_UNLOCKABLE;
        /* Starts at 0: the callee is told the lock is not held on entry. */
        int locked = 0;

        if (is_valid_gup_args(pages, NULL, &gup_flags, required))
                return __get_user_pages_locked(current->mm, start, nr_pages,
                                               pages, &locked, gup_flags);

        return -EINVAL;
}
EXPORT_SYMBOL(get_user_pages_unlocked);

/*
 * GUP-fast
 *
 * get_user_pages_fast attempts to pin user pages by walking the page
 * tables directly and avoids taking locks. Thus the walker needs to be
 * protected from page table pages being freed from under it, and should
 * block any THP splits.
 *
 * One way to achieve this is to have the walker disable interrupts, and
 * rely on IPIs from the TLB flushing code blocking before the page table
 * pages are freed. This is unsuitable for architectures that do not need
 * to broadcast an IPI when invalidating TLBs.
 *
 * Another way to achieve this is to batch up page table containing pages
 * belonging to more than one mm_user, then rcu_sched a callback to free those
 * pages. Disabling interrupts will allow the gup_fast() walker to both block
 * the rcu_sched callback, and an IPI that we broadcast for splitting THPs
 * (which is a relatively rare event). The code below adopts this strategy.
 *
 * Before activating this code, please be aware that the following assumptions
 * are currently made:
 *
 *  *) Either MMU_GATHER_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to
 *  free pages containing page tables or TLB flushing requires IPI broadcast.
 *
 *  *) ptes can be read atomically by the architecture.
 *
 *  *) valid user addresses are below TASK_MAX_SIZE
 *
 * The last two assumptions can be relaxed by the addition of helper functions.
 *
 * This code is based heavily on the PowerPC implementation by Nick Piggin.
 */
#ifdef CONFIG_HAVE_GUP_FAST
/*
 * Decide, on the GUP-fast path, whether GUP may operate on a given folio.
 *
 * The caller is assumed to have pinned the folio, the lowest page table level
 * is assumed to still point at it, and interrupts must be disabled.
 *
 * All secretmem folios must be rejected by GUP-fast.
 *
 * Writing to pinned, file-backed, dirty-tracked folios is inherently
 * problematic (see the comment describing writable_file_mapping_allowed()),
 * so the most egregious case - a long-term mapping doing so - is avoided.
 *
 * Because no VMA is available on the fast path this check cannot be as
 * thorough as that function: known good cases are whitelisted, and anything
 * doubtful is sent to the slow path by returning false.
 */
static bool gup_fast_folio_allowed(struct folio *folio, unsigned int flags)
{
        const unsigned int longterm_write_pin =
                FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE;
        struct address_space *mapping;
        unsigned long mapping_flags;
        bool reject_file_backed;
        bool check_secretmem;

        /*
         * Without a pin no problematic write can occur. A long term pin is
         * the most egregious case, so that is the one being disallowed.
         */
        reject_file_backed = (flags & longterm_write_pin) == longterm_write_pin;

        /*
         * A folio reference is held, so folio fields are safe to access.
         * secretmem folios are always order-0 folios.
         */
        check_secretmem = IS_ENABLED(CONFIG_SECRETMEM) &&
                          !folio_test_large(folio);

        /* Neither concern applies: nothing further to inspect. */
        if (!(reject_file_backed || check_secretmem))
                return true;

        if (WARN_ON_ONCE(folio_test_slab(folio)))
                return false;

        /* hugetlb neither requires dirty-tracking nor can be secretmem. */
        if (folio_test_hugetlb(folio))
                return true;

        /*
         * GUP-fast disables IRQs, and with IRQs disabled RCU grace periods
         * cannot proceed, so nothing performed under RCU can proceed either.
         *
         * inodes, and with them their mappings, are freed under RCU. The
         * mapping therefore cannot be freed beneath us and is safe to
         * dereference.
         */
        lockdep_assert_irqs_disabled();

        /*
         * Operations that _alter_ the mapping may still run, so read it once
         * and only once.
         */
        mapping = READ_ONCE(folio->mapping);

        /*
         * The mapping may have been truncated; either way its safety cannot
         * be determined here, so let the slow path decide how to proceed.
         */
        if (!mapping)
                return false;

        /* Anonymous folios pose no problem. */
        mapping_flags = (unsigned long)mapping & PAGE_MAPPING_FLAGS;
        if (mapping_flags)
                return (mapping_flags & PAGE_MAPPING_ANON) != 0;

        /*
         * From here on the mapping is known to be non-null and to point to
         * an address_space object.
         */
        if (check_secretmem && secretmem_mapping(mapping))
                return false;

        if (!reject_file_backed)
                return true;

        /* The only remaining allowed file system is shmem. */
        return shmem_mapping(mapping);
}

/*
 * Roll *@nr back to @nr_start: for each entry of @pages recorded since then,
 * newest first, clear the folio's referenced flag and drop the reference that
 * was taken with @flags.
 */
static void __maybe_unused gup_fast_undo_dev_pagemap(int *nr, int nr_start,
                unsigned int flags, struct page **pages)
{
        while (*nr != nr_start) {
                struct folio *folio;

                (*nr)--;
                folio = page_folio(pages[*nr]);

                folio_clear_referenced(folio);
                gup_put_folio(folio, 1, flags);
        }
}

#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
/*
 * GUP-fast relies on pte change detection to avoid concurrent pgtable
 * operations.
 *
 * To pin the page, GUP-fast needs to do below in order:
 * (1) pin the page (by prefetching pte), then (2) check pte not changed.
 *
 * For the rest of pgtable operations where pgtable updates can be racy
 * with GUP-fast, we need to do (1) clear pte, then (2) check whether page
 * is pinned.
 *
 * Above will work for all pte-level operations, including THP split.
 *
 * For THP collapse, it's a bit more complicated because GUP-fast may be
 * walking a pgtable page that is being freed (pte is still valid but pmd
 * can be cleared already).  To avoid race in such condition, we need to
 * also check pmd here to make sure pmd doesn't change (corresponds to
 * pmdp_collapse_flush() in the THP collapse code path).
 */
/*
 * Lockless walk of the ptes covering [@addr, @end) below @pmd.
 *
 * @pmd:   value of the pmd entry sampled by the caller
 * @pmdp:  location of that pmd entry, re-read to detect concurrent changes
 * @flags: FOLL_* flags of the request
 * @pages: output array; pages are stored starting at index *@nr
 * @nr:    number of entries already recorded in @pages, advanced per page
 *
 * Returns 1 if every pte in the range was recorded, 0 otherwise.  On a 0
 * return the entries recorded before the failing pte stay in @pages/@nr; the
 * caller stops the fast walk there.
 */
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct dev_pagemap *pgmap = NULL;
        int nr_start = *nr, ret = 0;
        pte_t *ptep, *ptem;

        /* ptem remembers the mapping so it can be unmapped on exit. */
        ptem = ptep = pte_offset_map(&pmd, addr);
        if (!ptep)
                return 0;
        do {
                pte_t pte = ptep_get_lockless(ptep);
                struct page *page;
                struct folio *folio;

                /*
                 * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
                 * pte_access_permitted() better should reject these pages
                 * either way: otherwise, GUP-fast might succeed in
                 * cases where ordinary GUP would fail due to VMA access
                 * permissions.
                 */
                if (pte_protnone(pte))
                        goto pte_unmap;

                if (!pte_access_permitted(pte, flags & FOLL_WRITE))
                        goto pte_unmap;

                if (pte_devmap(pte)) {
                        if (unlikely(flags & FOLL_LONGTERM))
                                goto pte_unmap;

                        pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
                        if (unlikely(!pgmap)) {
                                gup_fast_undo_dev_pagemap(nr, nr_start, flags, pages);
                                goto pte_unmap;
                        }
                } else if (pte_special(pte))
                        goto pte_unmap;

                VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
                page = pte_page(pte);

                folio = try_grab_folio_fast(page, 1, flags);
                if (!folio)
                        goto pte_unmap;

                /*
                 * Now that a reference is held, make sure neither the pmd
                 * nor the pte changed since they were sampled.
                 */
                if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
                    unlikely(pte_val(pte) != pte_val(ptep_get(ptep)))) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }

                if (!gup_fast_folio_allowed(folio, flags)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }

                if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }

                /*
                 * We need to make the page accessible if and only if we are
                 * going to access its content (the FOLL_PIN case).  Please
                 * see Documentation/core-api/pin_user_pages.rst for
                 * details.
                 *
                 * The error code is deliberately not stored in ret: callers
                 * only test the result for zero, so a non-zero errno would be
                 * taken for success and the walk would continue past a page
                 * that was never recorded.
                 */
                if ((flags & FOLL_PIN) && arch_make_folio_accessible(folio)) {
                        gup_put_folio(folio, 1, flags);
                        goto pte_unmap;
                }
                folio_set_referenced(folio);
                pages[*nr] = page;
                (*nr)++;
        } while (ptep++, addr += PAGE_SIZE, addr != end);

        ret = 1;

pte_unmap:
        if (pgmap)
                put_dev_pagemap(pgmap);
        pte_unmap(ptem);
        return ret;
}
#else

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_fast_pmd_leaf even if we can't operate on ptes.
 */
static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        /*
         * Built without CONFIG_ARCH_HAS_PTE_SPECIAL: no pte is ever handled
         * here.  Returning 0 makes the pmd-level walker give up on this range.
         */
        return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */

#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
/*
 * Record the device pages backing [@addr, @end), starting at @pfn.
 *
 * Each page is looked up in its dev_pagemap and grabbed individually.  If
 * either step fails, everything recorded by this call is rolled back.
 *
 * Returns non-zero only when the walk reached @end.
 */
static int gup_fast_devmap_leaf(unsigned long pfn, unsigned long addr,
        unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
        struct dev_pagemap *pgmap = NULL;
        const int first_nr = *nr;

        do {
                struct page *page = pfn_to_page(pfn);
                struct folio *folio;

                pgmap = get_dev_pagemap(pfn, pgmap);
                if (unlikely(!pgmap))
                        goto undo;

                folio = try_grab_folio_fast(page, 1, flags);
                if (!folio)
                        goto undo;

                folio_set_referenced(folio);
                pages[(*nr)++] = page;
                pfn++;
                addr += PAGE_SIZE;
        } while (addr != end);
        goto out;

undo:
        /* Drop whatever this call managed to record before failing. */
        gup_fast_undo_dev_pagemap(nr, first_nr, flags, pages);
out:
        put_dev_pagemap(pgmap);
        return addr == end;
}

/*
 * Record the device pages mapped by the pmd leaf @orig for [@addr, @end).
 *
 * Returns 1 on success.  Returns 0 if the pages could not be recorded, or if
 * *@pmdp no longer matches @orig afterwards, in which case the entries added
 * by this call are rolled back.
 */
static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        const int nr_before = *nr;
        unsigned long first_pfn;

        /* pfn of the page that backs @addr inside the pmd-sized mapping. */
        first_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
        if (!gup_fast_devmap_leaf(first_pfn, addr, end, flags, pages, nr))
                return 0;

        /* The entry changed under us: undo what was just recorded. */
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
                gup_fast_undo_dev_pagemap(nr, nr_before, flags, pages);
                return 0;
        }

        return 1;
}

/*
 * Record the device pages mapped by the pud leaf @orig for [@addr, @end).
 *
 * Returns 1 on success.  Returns 0 if the pages could not be recorded, or if
 * *@pudp no longer matches @orig afterwards, in which case the entries added
 * by this call are rolled back.
 */
static int gup_fast_devmap_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        const int nr_before = *nr;
        unsigned long first_pfn;

        /* pfn of the page that backs @addr inside the pud-sized mapping. */
        first_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
        if (!gup_fast_devmap_leaf(first_pfn, addr, end, flags, pages, nr))
                return 0;

        /* The entry changed under us: undo what was just recorded. */
        if (unlikely(pud_val(orig) != pud_val(*pudp))) {
                gup_fast_undo_dev_pagemap(nr, nr_before, flags, pages);
                return 0;
        }

        return 1;
}
#else
/*
 * Stub used when CONFIG_ARCH_HAS_PTE_DEVMAP and CONFIG_TRANSPARENT_HUGEPAGE
 * are not both enabled.  It is not meant to be reachable: BUILD_BUG() is
 * there to fail the build if a call to it survives in the generated code.
 */
static int gup_fast_devmap_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        BUILD_BUG();
        return 0;
}

/*
 * Stub used when CONFIG_ARCH_HAS_PTE_DEVMAP and CONFIG_TRANSPARENT_HUGEPAGE
 * are not both enabled.  It is not meant to be reachable: BUILD_BUG() is
 * there to fail the build if a call to it survives in the generated code.
 */
static int gup_fast_devmap_pud_leaf(pud_t pud, pud_t *pudp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        BUILD_BUG();
        return 0;
}
#endif

/*
 * Record the pages mapped by the pmd leaf @orig for [@addr, @end).
 *
 * The subpages are written to @pages first, then the folio references are
 * taken, and only then is *@pmdp compared against @orig.  Any check failing
 * after the grab drops the references again.
 *
 * Returns 1 and advances *@nr on success, 0 otherwise.
 */
static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct folio *folio;
        struct page *leaf_page;
        int nr_refs;

        if (!pmd_access_permitted(orig, flags & FOLL_WRITE) ||
            pmd_special(orig))
                return 0;

        if (pmd_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
                return gup_fast_devmap_pmd_leaf(orig, pmdp, addr, end, flags,
                                                pages, nr);
        }

        leaf_page = pmd_page(orig);
        nr_refs = record_subpages(leaf_page, PMD_SIZE, addr, end, pages + *nr);

        folio = try_grab_folio_fast(leaf_page, nr_refs, flags);
        if (!folio)
                return 0;

        /* The entry must still be the one that was sampled. */
        if (unlikely(pmd_val(orig) != pmd_val(*pmdp)))
                goto drop_refs;

        if (!gup_fast_folio_allowed(folio, flags))
                goto drop_refs;

        if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
                goto drop_refs;

        *nr += nr_refs;
        folio_set_referenced(folio);
        return 1;

drop_refs:
        gup_put_folio(folio, nr_refs, flags);
        return 0;
}

/*
 * Record the pages mapped by the pud leaf @orig for [@addr, @end).
 *
 * The subpages are written to @pages first, then the folio references are
 * taken, and only then is *@pudp compared against @orig.  Any check failing
 * after the grab drops the references again.
 *
 * Returns 1 and advances *@nr on success, 0 otherwise.
 */
static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct folio *folio;
        struct page *leaf_page;
        int nr_refs;

        if (!pud_access_permitted(orig, flags & FOLL_WRITE) ||
            pud_special(orig))
                return 0;

        if (pud_devmap(orig)) {
                if (unlikely(flags & FOLL_LONGTERM))
                        return 0;
                return gup_fast_devmap_pud_leaf(orig, pudp, addr, end, flags,
                                                pages, nr);
        }

        leaf_page = pud_page(orig);
        nr_refs = record_subpages(leaf_page, PUD_SIZE, addr, end, pages + *nr);

        folio = try_grab_folio_fast(leaf_page, nr_refs, flags);
        if (!folio)
                return 0;

        /* The entry must still be the one that was sampled. */
        if (unlikely(pud_val(orig) != pud_val(*pudp)))
                goto drop_refs;

        if (!gup_fast_folio_allowed(folio, flags))
                goto drop_refs;

        if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
                goto drop_refs;

        *nr += nr_refs;
        folio_set_referenced(folio);
        return 1;

drop_refs:
        gup_put_folio(folio, nr_refs, flags);
        return 0;
}

/*
 * Record the pages mapped by the pgd leaf @orig for [@addr, @end).
 *
 * The subpages are written to @pages first, then the folio references are
 * taken, and only then is *@pgdp compared against @orig.  Any check failing
 * after the grab drops the references again.
 *
 * Returns 1 and advances *@nr on success, 0 otherwise.
 */
static int gup_fast_pgd_leaf(pgd_t orig, pgd_t *pgdp, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        struct folio *folio;
        struct page *leaf_page;
        int nr_refs;

        if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
                return 0;

        BUILD_BUG_ON(pgd_devmap(orig));

        leaf_page = pgd_page(orig);
        nr_refs = record_subpages(leaf_page, PGDIR_SIZE, addr, end,
                                  pages + *nr);

        folio = try_grab_folio_fast(leaf_page, nr_refs, flags);
        if (!folio)
                return 0;

        /* The entry must still be the one that was sampled. */
        if (unlikely(pgd_val(orig) != pgd_val(*pgdp)))
                goto drop_refs;

        if (!pgd_write(orig) && gup_must_unshare(NULL, flags, &folio->page))
                goto drop_refs;

        if (!gup_fast_folio_allowed(folio, flags))
                goto drop_refs;

        *nr += nr_refs;
        folio_set_referenced(folio);
        return 1;

drop_refs:
        gup_put_folio(folio, nr_refs, flags);
        return 0;
}

/*
 * Walk the pmd entries covering [@addr, @end) below @pud without locks.
 *
 * Leaf entries go to gup_fast_pmd_leaf(), everything else to
 * gup_fast_pte_range().  Returns 1 if the whole range was handled, 0 as soon
 * as any entry cannot be handled.
 */
static int gup_fast_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        pmd_t *pmdp = pmd_offset_lockless(pudp, pud, addr);
        unsigned long next;

        for (;; pmdp++, addr = next) {
                pmd_t pmd = pmdp_get_lockless(pmdp);
                int ok;

                next = pmd_addr_end(addr, end);
                if (!pmd_present(pmd))
                        return 0;

                if (unlikely(pmd_leaf(pmd))) {
                        /* See gup_fast_pte_range() */
                        ok = !pmd_protnone(pmd) &&
                             gup_fast_pmd_leaf(pmd, pmdp, addr, next, flags,
                                               pages, nr);
                } else {
                        ok = gup_fast_pte_range(pmd, pmdp, addr, next, flags,
                                                pages, nr);
                }
                if (!ok)
                        return 0;

                if (next == end)
                        break;
        }

        return 1;
}

/*
 * Walk the pud entries covering [@addr, @end) below @p4d without locks.
 *
 * Leaf entries go to gup_fast_pud_leaf(), everything else descends into
 * gup_fast_pmd_range().  Returns 1 if the whole range was handled, 0 as soon
 * as any entry cannot be handled.
 */
static int gup_fast_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        pud_t *pudp = pud_offset_lockless(p4dp, p4d, addr);
        unsigned long next;

        for (;; pudp++, addr = next) {
                pud_t pud = READ_ONCE(*pudp);
                int ok;

                next = pud_addr_end(addr, end);
                if (unlikely(!pud_present(pud)))
                        return 0;

                if (unlikely(pud_leaf(pud)))
                        ok = gup_fast_pud_leaf(pud, pudp, addr, next, flags,
                                               pages, nr);
                else
                        ok = gup_fast_pmd_range(pudp, pud, addr, next, flags,
                                                pages, nr);
                if (!ok)
                        return 0;

                if (next == end)
                        break;
        }

        return 1;
}

/*
 * Walk the p4d entries covering [@addr, @end) below @pgd without locks.
 *
 * Every present entry descends into gup_fast_pud_range(); p4d leaf entries
 * are ruled out at build time.  Returns 1 if the whole range was handled, 0
 * as soon as any entry cannot be handled.
 */
static int gup_fast_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
                unsigned long end, unsigned int flags, struct page **pages,
                int *nr)
{
        p4d_t *p4dp = p4d_offset_lockless(pgdp, pgd, addr);
        unsigned long next;

        for (;; p4dp++, addr = next) {
                p4d_t p4d = READ_ONCE(*p4dp);

                next = p4d_addr_end(addr, end);
                if (!p4d_present(p4d))
                        return 0;

                BUILD_BUG_ON(p4d_leaf(p4d));

                if (!gup_fast_pud_range(p4dp, p4d, addr, next, flags,
                                        pages, nr))
                        return 0;

                if (next == end)
                        break;
        }

        return 1;
}

/*
 * Top level of the lockless walk over current->mm for [@addr, @end).
 *
 * There is no return value: progress is reported solely through *@nr, and the
 * walk simply stops at the first entry that cannot be handled.
 */
static void gup_fast_pgd_range(unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
        pgd_t *pgdp = pgd_offset(current->mm, addr);
        unsigned long next;

        for (;; pgdp++, addr = next) {
                pgd_t pgd = READ_ONCE(*pgdp);
                int ok;

                next = pgd_addr_end(addr, end);
                if (pgd_none(pgd))
                        return;

                if (unlikely(pgd_leaf(pgd)))
                        ok = gup_fast_pgd_leaf(pgd, pgdp, addr, next, flags,
                                               pages, nr);
                else
                        ok = gup_fast_p4d_range(pgdp, pgd, addr, next, flags,
                                                pages, nr);
                if (!ok)
                        return;

                if (next == end)
                        return;
        }
}
#else
/*
 * Fallback for the #else branch of CONFIG_HAVE_GUP_FAST: nothing is walked
 * and *@nr is left untouched.
 */
static inline void gup_fast_pgd_range(unsigned long addr, unsigned long end,
                unsigned int flags, struct page **pages, int *nr)
{
}
#endif /* CONFIG_HAVE_GUP_FAST */

#ifndef gup_fast_permitted
/*
 * Check if it's allowed to use get_user_pages_fast_only() for the range, or
 * we need to fall back to the slow version.
 *
 * This is the default, used only when no gup_fast_permitted macro has been
 * defined before this point; it permits every range.
 */
static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
        return true;
}
#endif

/*
 * Lockless attempt to grab the pages backing [@start, @end) of current->mm.
 *
 * Returns the number of leading pages stored in @pages, which may be zero.
 * For FOLL_PIN requests the result is discarded (everything is unpinned and 0
 * is returned) if mm->write_protect_seq changed while walking.
 */
static unsigned long gup_fast(unsigned long start, unsigned long end,
                unsigned int gup_flags, struct page **pages)
{
        const unsigned int pinning = gup_flags & FOLL_PIN;
        unsigned long irq_flags;
        int nr_pinned = 0;
        unsigned seq;

        if (!IS_ENABLED(CONFIG_HAVE_GUP_FAST) ||
            !gup_fast_permitted(start, end))
                return 0;

        /* seq is only sampled, and only consulted later, when pinning. */
        if (pinning &&
            !raw_seqcount_try_begin(&current->mm->write_protect_seq, seq))
                return 0;

        /*
         * Interrupts stay off for the duration of the walk; the save/restore
         * form keeps this routine usable from any context.
         *
         * With interrupts disabled, page table pages cannot be freed from
         * under us. See the struct mmu_table_batch comments in
         * include/asm-generic/tlb.h for more details.
         *
         * rcu_read_lock() is not used instead because the IPIs sent by THP
         * splitting need to be blocked as well.
         */
        local_irq_save(irq_flags);
        gup_fast_pgd_range(start, end, gup_flags, pages, &nr_pinned);
        local_irq_restore(irq_flags);

        if (!pinning)
                return nr_pinned;

        /*
         * When pinning pages for DMA there could be a concurrent write protect
         * from fork() via copy_page_range(), in this case always fail GUP-fast.
         */
        if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
                gup_fast_unpin_user_pages(pages, nr_pinned);
                return 0;
        }

        sanity_check_pinned_pages(pages, nr_pinned);
        return nr_pinned;
}

/*
 * Common back end of the *_fast() entry points.
 *
 * @start:     starting user address (rounded down to a page boundary here)
 * @nr_pages:  number of pages requested
 * @gup_flags: FOLL_* flags; anything outside the accepted set is rejected
 * @pages:     output array, at least @nr_pages entries long
 *
 * Tries the lockless walk first.  Unless FOLL_FAST_ONLY is set, whatever the
 * fast walk could not provide is then requested from __gup_longterm_locked().
 *
 * Returns the number of pages stored in @pages (0 when @nr_pages is 0), or a
 * negative errno if nothing was obtained because of an error.
 */
static int gup_fast_fallback(unsigned long start, unsigned long nr_pages,
                unsigned int gup_flags, struct page **pages)
{
        unsigned long len, end;
        unsigned long nr_pinned;
        int locked = 0;
        int ret;

        if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
                                       FOLL_FORCE | FOLL_PIN | FOLL_GET |
                                       FOLL_FAST_ONLY | FOLL_NOFAULT |
                                       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
                return -EINVAL;

        if (gup_flags & FOLL_PIN)
                mm_set_has_pinned_flag(&current->mm->flags);

        if (!(gup_flags & FOLL_FAST_ONLY))
                might_lock_read(&current->mm->mmap_lock);

        /*
         * Nothing to do for an empty request.  This must be caught here: the
         * lockless walkers are do-while loops that only terminate once the
         * address reaches the end of the range, so they must never be entered
         * with start == end.
         */
        if (!nr_pages)
                return 0;

        start = untagged_addr(start) & PAGE_MASK;
        len = nr_pages << PAGE_SHIFT;
        if (check_add_overflow(start, len, &end))
                return -EOVERFLOW;
        if (end > TASK_SIZE_MAX)
                return -EFAULT;

        nr_pinned = gup_fast(start, end, gup_flags, pages);
        if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
                return nr_pinned;

        /* Slow path: try to get the remaining pages with get_user_pages */
        start += nr_pinned << PAGE_SHIFT;
        pages += nr_pinned;
        ret = __gup_longterm_locked(current->mm, start, nr_pages - nr_pinned,
                                    pages, &locked,
                                    gup_flags | FOLL_TOUCH | FOLL_UNLOCKABLE);
        if (ret < 0) {
                /*
                 * The caller has to unpin the pages we already pinned so
                 * returning -errno is not an option
                 */
                if (nr_pinned)
                        return nr_pinned;
                return ret;
        }
        return ret + nr_pinned;
}

/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
                             unsigned int gup_flags, struct page **pages)
{
        /*
         * Two flags are forced on here:
         *
         * FOLL_GET, because inside mm/gup.c every gup fast variant is a
         * "pin with a +1 page refcount" request.
         *
         * FOLL_FAST_ONLY, because the API contract of this routine is that it
         * never falls back to regular ("slow") GUP.
         */
        if (is_valid_gup_args(pages, NULL, &gup_flags,
                              FOLL_GET | FOLL_FAST_ONLY))
                return gup_fast_fallback(start, nr_pages, gup_flags, pages);

        return -EINVAL;
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);

/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        /*
         * FOLL_GET is forced on whether or not the caller passed it: inside
         * mm/gup.c every gup fast variant is a "pin with a +1 page refcount"
         * request.
         */
        if (is_valid_gup_args(pages, NULL, &gup_flags, FOLL_GET))
                return gup_fast_fallback(start, nr_pages, gup_flags, pages);

        return -EINVAL;
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);

/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page() will not remove pins from it.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
                        unsigned int gup_flags, struct page **pages)
{
        /* FOLL_PIN is what distinguishes this from get_user_pages_fast(). */
        if (is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
                return gup_fast_fallback(start, nr_pages, gup_flags, pages);

        return -EINVAL;
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);

/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:                mm_struct of target mm
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long.
 * @locked:        pointer to lock flag indicating whether lock is held and
 *                subsequently whether VM_FAULT_RETRY functionality can be
 *                utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_remote(struct mm_struct *mm,
                           unsigned long start, unsigned long nr_pages,
                           unsigned int gup_flags, struct page **pages,
                           int *locked)
{
        int local_locked = 1;

        /* Invalid arguments are reported as "no pages pinned". */
        if (!is_valid_gup_args(pages, locked, &gup_flags,
                               FOLL_PIN | FOLL_TOUCH | FOLL_REMOTE))
                return 0;

        /* Callers that pass no lock flag get a private one, preset to 1. */
        if (!locked)
                locked = &local_locked;

        return __gup_longterm_locked(mm, start, nr_pages, pages, locked,
                                     gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_remote);

/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:        starting user address
 * @nr_pages:        number of pages from start to pin
 * @gup_flags:        flags modifying lookup behaviour
 * @pages:        array that receives pointers to the pages pinned.
 *                Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
                    unsigned int gup_flags, struct page **pages)
{
        int locked = 1;

        /* Valid arguments: pin in current->mm with the lock flag preset to 1. */
        if (is_valid_gup_args(pages, NULL, &gup_flags, FOLL_PIN))
                return __gup_longterm_locked(current->mm, start, nr_pages,
                                             pages, &locked, gup_flags);

        /* Invalid arguments are reported as "no pages pinned". */
        return 0;
}
EXPORT_SYMBOL(pin_user_pages);

/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 *
 * Note that if a zero_page is amongst the returned pages, it will not have
 * pins in it and unpin_user_page*() will not remove pins from it.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
                             struct page **pages, unsigned int gup_flags)
{
        int locked = 0;

        /* Valid arguments: pin in current->mm with the lock flag preset to 0. */
        if (is_valid_gup_args(pages, NULL, &gup_flags,
                              FOLL_PIN | FOLL_TOUCH | FOLL_UNLOCKABLE))
                return __gup_longterm_locked(current->mm, start, nr_pages,
                                             pages, &locked, gup_flags);

        /* Invalid arguments are reported as "no pages pinned". */
        return 0;
}
EXPORT_SYMBOL(pin_user_pages_unlocked);

/**
 * memfd_pin_folios() - pin folios associated with a memfd
 * @memfd:      the memfd whose folios are to be pinned
 * @start:      the first memfd offset
 * @end:        the last memfd offset (inclusive)
 * @folios:     array that receives pointers to the folios pinned
 * @max_folios: maximum number of entries in @folios
 * @offset:     the offset into the first folio
 *
 * Attempt to pin folios associated with a memfd in the contiguous range
 * [start, end]. Given that a memfd is either backed by shmem or hugetlb,
 * the folios can either be found in the page cache or need to be allocated
 * if necessary. Once the folios are located, they are all pinned via
 * FOLL_PIN and @offset is populated with the offset into the first folio.
 * And, eventually, these pinned folios must be released either using
 * unpin_folios() or unpin_folio().
 *
 * It must be noted that the folios may be pinned for an indefinite amount
 * of time. And, in most cases, the duration of time they may stay pinned
 * would be controlled by the userspace. This behavior is effectively the
 * same as using FOLL_LONGTERM with other GUP APIs.
 *
 * Returns number of folios pinned, which could be less than @max_folios
 * as it depends on the folio sizes that cover the range [start, end].
 * If no folios were pinned, it returns -errno.
 */
long memfd_pin_folios(struct file *memfd, loff_t start, loff_t end,
                      struct folio **folios, unsigned int max_folios,
                      pgoff_t *offset)
{
        unsigned int flags, nr_folios, nr_found;
        unsigned int i, pgshift = PAGE_SHIFT;
        pgoff_t start_idx, end_idx, next_idx;
        struct folio *folio = NULL;
        struct folio_batch fbatch;
        struct hstate *h;
        long ret = -EINVAL;

        /* Reject a negative or inverted range and a zero-length output array. */
        if (start < 0 || start > end || !max_folios)
                return -EINVAL;

        if (!memfd)
                return -EINVAL;

        /* Only shmem and hugetlb backed files are accepted. */
        if (!shmem_file(memfd) && !is_file_hugepages(memfd))
                return -EINVAL;

        /* @end is inclusive, so it has to lie strictly inside the file. */
        if (end >= i_size_read(file_inode(memfd)))
                return -EINVAL;

        /* hugetlb files are indexed using the huge page shift of their hstate. */
        if (is_file_hugepages(memfd)) {
                h = hstate_file(memfd);
                pgshift = huge_page_shift(h);
        }

        flags = memalloc_pin_save();
        /*
         * The whole lookup is redone from scratch for as long as
         * check_and_migrate_movable_folios() answers -EAGAIN.
         */
        do {
                nr_folios = 0;
                start_idx = start >> pgshift;
                end_idx = end >> pgshift;
                /* Scale the hugetlb indices by the huge page order. */
                if (is_file_hugepages(memfd)) {
                        start_idx <<= huge_page_order(h);
                        end_idx <<= huge_page_order(h);
                }

                folio_batch_init(&fbatch);
                while (start_idx <= end_idx && nr_folios < max_folios) {
                        /*
                         * In most cases, we should be able to find the folios
                         * in the page cache. If we cannot find them for some
                         * reason, we try to allocate them and add them to the
                         * page cache.
                         */
                        nr_found = filemap_get_folios_contig(memfd->f_mapping,
                                                             &start_idx,
                                                             end_idx,
                                                             &fbatch);
                        /*
                         * folio is only non-NULL here when the previous pass
                         * obtained one from memfd_alloc_folio() below; put it
                         * now that the lookup has been repeated.
                         */
                        if (folio) {
                                folio_put(folio);
                                folio = NULL;
                        }

                        next_idx = 0;
                        for (i = 0; i < nr_found; i++) {
                                /*
                                 * As there can be multiple entries for a
                                 * given folio in the batch returned by
                                 * filemap_get_folios_contig(), the below
                                 * check is to ensure that we pin and return a
                                 * unique set of folios between start and end.
                                 */
                                if (next_idx &&
                                    next_idx != folio_index(fbatch.folios[i]))
                                        continue;

                                folio = page_folio(&fbatch.folios[i]->page);

                                /* A non-zero result means the pin was not taken. */
                                if (try_grab_folio(folio, 1, FOLL_PIN)) {
                                        folio_batch_release(&fbatch);
                                        ret = -EINVAL;
                                        goto err;
                                }

                                /* @offset is reported for the first folio only. */
                                if (nr_folios == 0)
                                        *offset = offset_in_folio(folio, start);

                                folios[nr_folios] = folio;
                                next_idx = folio_next_index(folio);
                                if (++nr_folios == max_folios)
                                        break;
                        }

                        /*
                         * Forget the batch folio so that the folio_put() at
                         * the top of the loop never applies to a pinned entry.
                         */
                        folio = NULL;
                        folio_batch_release(&fbatch);
                        if (!nr_found) {
                                folio = memfd_alloc_folio(memfd, start_idx);
                                if (IS_ERR(folio)) {
                                        ret = PTR_ERR(folio);
                                        /*
                                         * -EEXIST is not fatal: go around and
                                         * look the index up again (presumably
                                         * someone else added the folio in the
                                         * meantime - confirm against
                                         * memfd_alloc_folio()).
                                         */
                                        if (ret != -EEXIST)
                                                goto err;
                                        folio = NULL;
                                }
                        }
                }

                ret = check_and_migrate_movable_folios(nr_folios, folios);
        } while (ret == -EAGAIN);

        memalloc_pin_restore(flags);
        /* Success is reported as the number of folios stored in @folios. */
        return ret ? ret : nr_folios;
err:
        memalloc_pin_restore(flags);
        /* Unpin the nr_folios entries recorded before the failure. */
        unpin_folios(folios, nr_folios);

        return ret;
}
EXPORT_SYMBOL_GPL(memfd_pin_folios);

/**
 * folio_add_pins() - take extra pins on a folio that is already pinned
 * @folio: the folio that should receive the additional pins
 * @pins: how many additional pins to take
 *
 * Attempts to stack further pins on top of an existing one. The new pins
 * inherit the semantics of the pin that is already held (e.g., FOLL_WRITE);
 * those semantics cannot be altered here.
 *
 * Typical use: a large folio was pinned through memfd_pin_folios(), and the
 * caller later wants to drop the pin piecewise (e.g., page by page), for
 * instance via unpin_user_page_range_dirty_lock().
 *
 * Do not use this interface to take the first pin on a folio.
 *
 * Return: whatever try_grab_folio() reports for the FOLL_PIN request.
 */
int folio_add_pins(struct folio *folio, unsigned int pins)
{
        int ret;

        /* A folio without an existing pin indicates a misuse of this helper. */
        VM_WARN_ON_ONCE(!folio_maybe_dma_pinned(folio));

        ret = try_grab_folio(folio, pins, FOLL_PIN);
        return ret;
}
EXPORT_SYMBOL_GPL(folio_add_pins);























































    7 
















    7 




    7 



    7 









    7 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * An extensible bitmap is a bitmap that supports an
 * arbitrary number of bits.  Extensible bitmaps are
 * used to represent sets of values, such as types,
 * roles, categories, and classes.
 *
 * Each extensible bitmap is implemented as a linked
 * list of bitmap nodes, where each bitmap node has
 * an explicitly specified starting bit position within
 * the total bitmap.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

#ifndef _SS_EBITMAP_H_
#define _SS_EBITMAP_H_

#include <net/netlabel.h>

/*
 * Size budget for one bitmap node, in the same units as sizeof():
 * 64 when CONFIG_64BIT is defined, 32 otherwise.
 */
#ifdef CONFIG_64BIT
#define EBITMAP_NODE_SIZE 64
#else
#define EBITMAP_NODE_SIZE 32
#endif

/*
 * Number of unsigned long map words per node: what remains of
 * EBITMAP_NODE_SIZE after subtracting one pointer and one u32,
 * divided by the size of an unsigned long.
 */
#define EBITMAP_UNIT_NUMS                                     \
        ((EBITMAP_NODE_SIZE - sizeof(void *) - sizeof(u32)) / \
         sizeof(unsigned long))
/* Number of bits held by a single map word. */
#define EBITMAP_UNIT_SIZE BITS_PER_LONG
/* Number of bits covered by one node (all of its map words together). */
#define EBITMAP_SIZE          (EBITMAP_UNIT_NUMS * EBITMAP_UNIT_SIZE)
/* A single set bit of type unsigned long, shifted to build per-bit masks. */
#define EBITMAP_BIT          1UL
/*
 * Shift x right by EBITMAP_UNIT_SIZE bits in total, performed as two
 * shifts of half that amount each.
 * NOTE(review): presumably split so that no single shift equals the full
 * width of x's type -- confirm against the users of this macro.
 */
#define EBITMAP_SHIFT_UNIT_SIZE(x) \
        (((x) >> EBITMAP_UNIT_SIZE / 2) >> EBITMAP_UNIT_SIZE / 2)

/* One element of the linked list that makes up an extensible bitmap. */
struct ebitmap_node {
        /* following node in the list; the iterators stop at NULL */
        struct ebitmap_node *next;
        /* bit storage for this node, EBITMAP_SIZE bits in total */
        unsigned long maps[EBITMAP_UNIT_NUMS];
        /* position within the total bitmap of the first bit stored in maps */
        u32 startbit;
};

/* An extensible bitmap: the head of the node list plus its recorded extent. */
struct ebitmap {
        struct ebitmap_node *node; /* first node in the bitmap */
        u32 highbit; /* highest position in the total bitmap */
};

/*
 * Length of bitmap e, read straight from its highbit field. The positive-bit
 * iterators below return this value to signal that no further bit is set.
 */
#define ebitmap_length(e) ((e)->highbit)

static inline u32 ebitmap_start_positive(const struct ebitmap *e,
                                         struct ebitmap_node **n)
{
        u32 ofs;

        for (*n = e->node; *n; *n = (*n)->next) {
                ofs = find_first_bit((*n)->maps, EBITMAP_SIZE);
                if (ofs < EBITMAP_SIZE)
                        return (*n)->startbit + ofs;
        }
        return ebitmap_length(e);
}

/* Reset bitmap e to the all-zero state (no nodes, highbit of 0). */
static inline void ebitmap_init(struct ebitmap *e)
{
        memset(e, 0, sizeof(struct ebitmap));
}

/*
 * Locate the next set bit of bitmap e after position bit.
 *
 * *n must be non-NULL on entry (it is dereferenced unconditionally); the
 * search resumes in that node just past bit and then continues through the
 * following nodes. On success *n points at the node holding the bit found
 * and its position in the total bitmap is returned. When nothing further is
 * set, *n ends up NULL and ebitmap_length(e) is returned.
 */
static inline u32 ebitmap_next_positive(const struct ebitmap *e,
                                        struct ebitmap_node **n, u32 bit)
{
        struct ebitmap_node *cur = *n;
        u32 pos;

        /* Remainder of the current node first; *n stays as it is on a hit. */
        pos = find_next_bit(cur->maps, EBITMAP_SIZE, bit - cur->startbit + 1);
        if (pos < EBITMAP_SIZE)
                return cur->startbit + pos;

        /* Then every later node, each scanned from its first bit. */
        for (cur = cur->next; cur; cur = cur->next) {
                pos = find_first_bit(cur->maps, EBITMAP_SIZE);
                if (pos < EBITMAP_SIZE) {
                        *n = cur;
                        return cur->startbit + pos;
                }
        }

        *n = NULL;
        return ebitmap_length(e);
}

/*
 * Index into node->maps[] of the word that holds bit, where bit is a
 * position in the total bitmap (node->startbit is subtracted first).
 */
#define EBITMAP_NODE_INDEX(node, bit) \
        (((bit) - (node)->startbit) / EBITMAP_UNIT_SIZE)
/* Offset of bit inside that word, counted from the word's bit 0. */
#define EBITMAP_NODE_OFFSET(node, bit) \
        (((bit) - (node)->startbit) % EBITMAP_UNIT_SIZE)

/*
 * Test one bit inside node n. bit is a position in the total bitmap and
 * must fall within this node; a word index past the end of n->maps trips
 * BUG_ON(). Returns 1 if the bit is set, 0 otherwise.
 */
static inline int ebitmap_node_get_bit(const struct ebitmap_node *n, u32 bit)
{
        u32 word = EBITMAP_NODE_INDEX(n, bit);
        u32 shift = EBITMAP_NODE_OFFSET(n, bit);

        BUG_ON(word >= EBITMAP_UNIT_NUMS);
        return (n->maps[word] & (EBITMAP_BIT << shift)) ? 1 : 0;
}

/*
 * Set one bit inside node n. bit is a position in the total bitmap and
 * must fall within this node; a word index past the end of n->maps trips
 * BUG_ON().
 */
static inline void ebitmap_node_set_bit(struct ebitmap_node *n, u32 bit)
{
        u32 word = EBITMAP_NODE_INDEX(n, bit);
        u32 shift = EBITMAP_NODE_OFFSET(n, bit);
        unsigned long mask;

        BUG_ON(word >= EBITMAP_UNIT_NUMS);
        mask = EBITMAP_BIT << shift;
        n->maps[word] = n->maps[word] | mask;
}

/*
 * Clear one bit inside node n. bit is a position in the total bitmap and
 * must fall within this node; a word index past the end of n->maps trips
 * BUG_ON().
 */
static inline void ebitmap_node_clr_bit(struct ebitmap_node *n, u32 bit)
{
        u32 word = EBITMAP_NODE_INDEX(n, bit);
        u32 shift = EBITMAP_NODE_OFFSET(n, bit);
        unsigned long mask;

        BUG_ON(word >= EBITMAP_UNIT_NUMS);
        mask = EBITMAP_BIT << shift;
        n->maps[word] = n->maps[word] & ~mask;
}

/*
 * Loop header that visits every set bit of bitmap e.
 *
 * e:   pointer to the bitmap to walk
 * n:   struct ebitmap_node * cursor; its address is handed to the helpers,
 *      which update it as the walk moves from node to node
 * bit: lvalue that receives the position of each set bit in turn
 *
 * The loop stops once bit is no longer below ebitmap_length(e), which is
 * the value the helpers return when no further bit is set. All three
 * arguments are expanded more than once, so they should be free of side
 * effects.
 */
#define ebitmap_for_each_positive_bit(e, n, bit)      \
        for ((bit) = ebitmap_start_positive(e, &(n)); \
             (bit) < ebitmap_length(e);               \
             (bit) = ebitmap_next_positive(e, &(n), bit))

/*
 * Whole-bitmap operations. Only the prototypes live in this header; the
 * definitions are provided elsewhere.
 */
bool ebitmap_equal(const struct ebitmap *e1, const struct ebitmap *e2);
int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src);
int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1,
                const struct ebitmap *e2);
int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2,
                     u32 last_e2bit);
int ebitmap_get_bit(const struct ebitmap *e, u32 bit);
int ebitmap_set_bit(struct ebitmap *e, u32 bit, int value);
void ebitmap_destroy(struct ebitmap *e);
/* Forward declaration: only pointers to struct policy_file are used below. */
struct policy_file;
int ebitmap_read(struct ebitmap *e, struct policy_file *fp);
int ebitmap_write(const struct ebitmap *e, struct policy_file *fp);
u32 ebitmap_hash(const struct ebitmap *e, u32 hash);

/*
 * Conversion between an ebitmap and a NetLabel category map.
 *
 * With CONFIG_NETLABEL the two functions are only declared here. Without
 * it, inline stubs take their place: they ignore their arguments and
 * always return -ENOMEM.
 */
#ifdef CONFIG_NETLABEL
int ebitmap_netlbl_export(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap **catmap);
int ebitmap_netlbl_import(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap *catmap);
#else
static inline int ebitmap_netlbl_export(struct ebitmap *ebmap,
                                        struct netlbl_lsm_catmap **catmap)
{
        return -ENOMEM;
}
static inline int ebitmap_netlbl_import(struct ebitmap *ebmap,
                                        struct netlbl_lsm_catmap *catmap)
{
        return -ENOMEM;
}
#endif

#endif /* _SS_EBITMAP_H_ */
















































































  346 


  356 
















































































































































































































































































   31 
   30 
   31 






















   31 

   31 
   31 





















   31 
   31 








   31 









   31 
   30 















   31 









   31 

   31 

   31 




   31 


































  307 





  306 






  307 

  248 











  307 




  307 

    4 










  304 




  307 












  307 




  307 



















  307 

  307 
  306 




















  306 






  307 






































































































  326 










  325 









   25 












   26 

   25 



   26 


   26 
























   26 

   25 





































































































































































   31 


























  307 


  307 




















   31 



   31 

   30 




































































































  326 




















  326 

  326 







  324 


































  301 
  244 
























  326 







  326 

















   80 




  106 






   31 





   80 
































































  302 









  326 



  326 

  326 

















































































































































































  307 



  307 







  307 












  306 


  306 







  305 



































































































   80 









  301 































































   80 
  302 

  301 

























  307 































  307 



  307 














































  306 


  280 


  227 























































































  307 
































  306 
























  307 




  307 











  307 
  307 




  307 














  306 















  307 























































































































































































































































































































   30 


   31 






   30 







   31 





   31 


   31 






















































   31 












   31 

   31 



























































































































































































































































































































































































































































































































































































































































































  306 









  306 



























  306 









































































   31 














   31 











































































































   37 












   12 









   31 

   26 




   30 








   31 
































































































































































   69 

















  306 



















  306 
  247 



  248 




































  307 


















  307 





















  307 



  307 
    1 


  306 































  307 





  306 





  307 
















































  307 









































































  307 












  307 






























































  306 


























































    1 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/io.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/uio.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
#include <linux/page_owner.h>

#define CREATE_TRACE_POINTS
#include <trace/events/vmalloc.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
/*
 * Largest page shift that vmap_page_range() hands to vmap_range_noflush()
 * as max_page_shift when building ioremap mappings. The default of
 * BITS_PER_LONG - 1 is meant to place no practical limit on the mapping
 * level.
 */
static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;

/*
 * Handler for the "nohugeiomap" early parameter: caps ioremap mappings at
 * PAGE_SHIFT. @str is not examined. Always returns 0.
 */
static int __init set_nohugeiomap(char *str)
{
        ioremap_max_page_shift = PAGE_SHIFT;
        return 0;
}
early_param("nohugeiomap", set_nohugeiomap);
#else /* CONFIG_HAVE_ARCH_HUGE_VMAP */
/* Without CONFIG_HAVE_ARCH_HUGE_VMAP the limit is fixed at PAGE_SHIFT. */
static const unsigned int ioremap_max_page_shift = PAGE_SHIFT;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
/*
 * Boot-time switch, true by default, cleared by "nohugevmalloc".
 * NOTE(review): the readers of this flag are outside this part of the file;
 * presumably it gates huge-page backed vmalloc allocations.
 */
static bool __ro_after_init vmap_allow_huge = true;

/*
 * Handler for the "nohugevmalloc" early parameter: clears vmap_allow_huge.
 * @str is not examined. Always returns 0.
 */
static int __init set_nohugevmalloc(char *str)
{
        vmap_allow_huge = false;
        return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
/* Without CONFIG_HAVE_ARCH_HUGE_VMALLOC the switch is a constant false. */
static const bool vmap_allow_huge = false;
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(x);

        return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

/*
 * Per-CPU pairing of a lock-less list with a work item.
 * NOTE(review): the producers and the work handler are outside this part of
 * the file; presumably this carries deferred vfree() requests.
 */
struct vfree_deferred {
        struct llist_head list;         /* lock-less list head */
        struct work_struct wq;          /* work item associated with @list */
};
/* One instance per CPU. */
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

/*** Page table manipulation functions ***/
/*
 * Map the physically contiguous range starting at @phys_addr to the kernel
 * virtual range [@addr, @end) at PTE level, below @pmd.
 *
 * The PTE table is allocated through pte_alloc_kernel_track() if needed.
 * Finding an entry that is already populated is treated as fatal (BUG()).
 * On success PGTBL_PTE_MODIFIED is added to *@mask.
 *
 * Returns 0 on success, -ENOMEM if the PTE table cannot be allocated.
 */
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        pte_t *pte;
        u64 pfn;
        struct page *page;
        /* Stride of one loop step; only changes when a huge PTE is installed. */
        unsigned long size = PAGE_SIZE;

        pfn = phys_addr >> PAGE_SHIFT;
        pte = pte_alloc_kernel_track(pmd, addr, mask);
        if (!pte)
                return -ENOMEM;
        do {
                /* Never overwrite a live entry: dump the page if we can, then die. */
                if (unlikely(!pte_none(ptep_get(pte)))) {
                        if (pfn_valid(pfn)) {
                                page = pfn_to_page(pfn);
                                dump_page(page, "remapping already mapped page");
                        }
                        BUG();
                }

#ifdef CONFIG_HUGETLB_PAGE
                /* Let the architecture pick a larger-than-PAGE_SIZE PTE mapping. */
                size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
                if (size != PAGE_SIZE) {
                        pte_t entry = pfn_pte(pfn, prot);

                        entry = arch_make_huge_pte(entry, ilog2(size), 0);
                        set_huge_pte_at(&init_mm, addr, pte, entry, size);
                        pfn += PFN_DOWN(size);
                        /* 'continue' still runs the loop step below, using @size. */
                        continue;
                }
#endif
                set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
                pfn++;
        } while (pte += PFN_DOWN(size), addr += size, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Attempt to map [@addr, @end) with one huge PMD entry.
 *
 * Returns 0 when a huge mapping is not possible; otherwise returns whatever
 * pmd_set_huge() reports (the caller treats non-zero as "mapped").
 */
static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Both the caller's limit and the architecture have to allow it. */
        if (max_page_shift < PMD_SHIFT || !arch_vmap_pmd_supported(prot))
                return 0;

        /* Exactly one PMD_SIZE span, aligned virtually and physically. */
        if ((end - addr) != PMD_SIZE ||
            !IS_ALIGNED(addr, PMD_SIZE) ||
            !IS_ALIGNED(phys_addr, PMD_SIZE))
                return 0;

        /* A present entry is only reused if pmd_free_pte_page() succeeds. */
        if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
                return 0;

        return pmd_set_huge(pmd, phys_addr, prot);
}

/*
 * Map [@addr, @end) to @phys_addr at PMD level, below @pud.
 *
 * Each PMD-sized step is mapped with a huge PMD when vmap_try_huge_pmd()
 * accepts it, and through vmap_pte_range() otherwise.
 *
 * Returns 0 on success, -ENOMEM if the PMD table cannot be allocated or the
 * PTE level reports a failure.
 */
static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long chunk_end;
        pmd_t *pmdp = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!pmdp)
                return -ENOMEM;

        do {
                chunk_end = pmd_addr_end(addr, end);

                if (vmap_try_huge_pmd(pmdp, addr, chunk_end, phys_addr, prot,
                                      max_page_shift))
                        *mask |= PGTBL_PMD_MODIFIED;
                else if (vmap_pte_range(pmdp, addr, chunk_end, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;

                /* Advance the entry, the physical cursor and the virtual cursor. */
                pmdp++;
                phys_addr += chunk_end - addr;
                addr = chunk_end;
        } while (addr != end);

        return 0;
}

/*
 * Attempt to map [@addr, @end) with one huge PUD entry.
 *
 * Returns 0 when a huge mapping is not possible; otherwise returns whatever
 * pud_set_huge() reports (the caller treats non-zero as "mapped").
 */
static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Both the caller's limit and the architecture have to allow it. */
        if (max_page_shift < PUD_SHIFT || !arch_vmap_pud_supported(prot))
                return 0;

        /* Exactly one PUD_SIZE span, aligned virtually and physically. */
        if ((end - addr) != PUD_SIZE ||
            !IS_ALIGNED(addr, PUD_SIZE) ||
            !IS_ALIGNED(phys_addr, PUD_SIZE))
                return 0;

        /* A present entry is only reused if pud_free_pmd_page() succeeds. */
        if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
                return 0;

        return pud_set_huge(pud, phys_addr, prot);
}

/*
 * Map [@addr, @end) to @phys_addr at PUD level, below @p4d.
 *
 * Each PUD-sized step is mapped with a huge PUD when vmap_try_huge_pud()
 * accepts it, and through vmap_pmd_range() otherwise.
 *
 * Returns 0 on success, -ENOMEM if the PUD table cannot be allocated or the
 * PMD level reports a failure.
 */
static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long chunk_end;
        pud_t *pudp = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!pudp)
                return -ENOMEM;

        do {
                chunk_end = pud_addr_end(addr, end);

                if (vmap_try_huge_pud(pudp, addr, chunk_end, phys_addr, prot,
                                      max_page_shift))
                        *mask |= PGTBL_PUD_MODIFIED;
                else if (vmap_pmd_range(pudp, addr, chunk_end, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;

                /* Advance the entry, the physical cursor and the virtual cursor. */
                pudp++;
                phys_addr += chunk_end - addr;
                addr = chunk_end;
        } while (addr != end);

        return 0;
}

/*
 * Attempt to map [@addr, @end) with one huge P4D entry.
 *
 * Returns 0 when a huge mapping is not possible; otherwise returns whatever
 * p4d_set_huge() reports (the caller treats non-zero as "mapped").
 */
static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        /* Both the caller's limit and the architecture have to allow it. */
        if (max_page_shift < P4D_SHIFT || !arch_vmap_p4d_supported(prot))
                return 0;

        /* Exactly one P4D_SIZE span, aligned virtually and physically. */
        if ((end - addr) != P4D_SIZE ||
            !IS_ALIGNED(addr, P4D_SIZE) ||
            !IS_ALIGNED(phys_addr, P4D_SIZE))
                return 0;

        /* A present entry is only reused if p4d_free_pud_page() succeeds. */
        if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
                return 0;

        return p4d_set_huge(p4d, phys_addr, prot);
}

/*
 * Map [@addr, @end) to @phys_addr at P4D level, below @pgd.
 *
 * Each P4D-sized step is mapped with a huge P4D when vmap_try_huge_p4d()
 * accepts it, and through vmap_pud_range() otherwise.
 *
 * Returns 0 on success, -ENOMEM if the P4D table cannot be allocated or the
 * PUD level reports a failure.
 */
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
        unsigned long chunk_end;
        p4d_t *p4dp = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!p4dp)
                return -ENOMEM;

        do {
                chunk_end = p4d_addr_end(addr, end);

                if (vmap_try_huge_p4d(p4dp, addr, chunk_end, phys_addr, prot,
                                      max_page_shift))
                        *mask |= PGTBL_P4D_MODIFIED;
                else if (vmap_pud_range(p4dp, addr, chunk_end, phys_addr, prot,
                                        max_page_shift, mask))
                        return -ENOMEM;

                /* Advance the entry, the physical cursor and the virtual cursor. */
                p4dp++;
                phys_addr += chunk_end - addr;
                addr = chunk_end;
        } while (addr != end);

        return 0;
}

/*
 * Map [@addr, @end) to the physical range starting at @phys_addr in the
 * kernel page tables, using mappings no larger than @max_page_shift.
 * No cache flushing is done here.
 *
 * May sleep. @addr must be below @end (BUG otherwise). If any level covered
 * by ARCH_PAGE_TABLE_SYNC_MASK was touched, arch_sync_kernel_mappings() is
 * invoked for the whole range, also after a failure.
 *
 * Returns 0 on success or the error reported by vmap_p4d_range().
 */
static int vmap_range_noflush(unsigned long addr, unsigned long end,
                        phys_addr_t phys_addr, pgprot_t prot,
                        unsigned int max_page_shift)
{
        pgtbl_mod_mask mask = 0;
        unsigned long start;
        unsigned long chunk_end;
        pgd_t *pgdp;
        int err;

        might_sleep();
        BUG_ON(addr >= end);

        /* Remember the original start for the sync call below. */
        start = addr;

        pgdp = pgd_offset_k(addr);
        do {
                chunk_end = pgd_addr_end(addr, end);
                err = vmap_p4d_range(pgdp, addr, chunk_end, phys_addr, prot,
                                     max_page_shift, &mask);
                if (err)
                        break;

                pgdp++;
                phys_addr += chunk_end - addr;
                addr = chunk_end;
        } while (addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * Map [@addr, @end) to @phys_addr with @prot made non-executable through
 * pgprot_nx(), limited to ioremap_max_page_shift sized mappings.
 *
 * flush_cache_vmap() runs whether or not the mapping succeeded. KMSAN is
 * only notified after a successful mapping.
 *
 * Returns 0 on success, otherwise the first error encountered.
 */
int vmap_page_range(unsigned long addr, unsigned long end,
                    phys_addr_t phys_addr, pgprot_t prot)
{
        int err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot),
                                     ioremap_max_page_shift);

        flush_cache_vmap(addr, end);
        if (err)
                return err;

        return kmsan_ioremap_page_range(addr, end, phys_addr, prot,
                                        ioremap_max_page_shift);
}

/*
 * Map [@addr, @end) to @phys_addr, after checking that the range is exactly
 * one vm_area that carries VM_IOREMAP.
 *
 * Returns -EINVAL (with a one-time warning) if no VM_IOREMAP area is found at
 * @addr, -ERANGE (with a one-time warning) if [@addr, @end) does not match
 * the area exactly, otherwise the result of vmap_page_range().
 */
int ioremap_page_range(unsigned long addr, unsigned long end,
                phys_addr_t phys_addr, pgprot_t prot)
{
        struct vm_struct *area = find_vm_area((void *)addr);
        unsigned long area_start, area_end;

        if (!area || !(area->flags & VM_IOREMAP)) {
                WARN_ONCE(1, "vm_area at addr %lx is not marked as VM_IOREMAP\n", addr);
                return -EINVAL;
        }

        /* Bounds of the area as returned by get_vm_area_size(). */
        area_start = (unsigned long)area->addr;
        area_end = area_start + get_vm_area_size(area);

        if (addr != area_start || end != area_end) {
                WARN_ONCE(1, "ioremap request [%lx,%lx) doesn't match vm_area [%lx, %lx)\n",
                          addr, end, area_start, area_end);
                return -ERANGE;
        }

        return vmap_page_range(addr, end, phys_addr, prot);
}

/*
 * Clear every PTE under @pmd that covers [@addr, @end), one PAGE_SIZE step
 * at a time, and add PGTBL_PTE_MODIFIED to *@mask.
 */
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pte_t *pte;

        pte = pte_offset_kernel(pmd, addr);
        do {
                pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
                /* The old entry is expected to be either empty or present. */
                WARN_ON(!pte_none(ptent) && !pte_present(ptent));
        } while (pte++, addr += PAGE_SIZE, addr != end);
        *mask |= PGTBL_PTE_MODIFIED;
}

/*
 * Unmap [@addr, @end) at PMD level, below @pud.
 *
 * A huge PMD is cleared directly; otherwise a populated entry is descended
 * into with vunmap_pte_range(). PGTBL_PMD_MODIFIED is added to *@mask when a
 * huge entry was cleared or the entry is bad.
 */
static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pmd_t *pmdp = pmd_offset(pud, addr);
        unsigned long chunk_end;

        do {
                int huge_cleared;

                chunk_end = pmd_addr_end(addr, end);

                huge_cleared = pmd_clear_huge(pmdp);
                if (huge_cleared || pmd_bad(*pmdp))
                        *mask |= PGTBL_PMD_MODIFIED;

                /* Only descend when nothing was cleared and the entry is usable. */
                if (!huge_cleared && !pmd_none_or_clear_bad(pmdp)) {
                        vunmap_pte_range(pmdp, addr, chunk_end, mask);

                        cond_resched();
                }

                pmdp++;
                addr = chunk_end;
        } while (addr != end);
}

/*
 * Unmap [@addr, @end) at PUD level, below @p4d.
 *
 * A huge PUD is cleared directly; otherwise a populated entry is descended
 * into with vunmap_pmd_range(). PGTBL_PUD_MODIFIED is added to *@mask when a
 * huge entry was cleared or the entry is bad.
 */
static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        pud_t *pudp = pud_offset(p4d, addr);
        unsigned long chunk_end;

        do {
                int huge_cleared;

                chunk_end = pud_addr_end(addr, end);

                huge_cleared = pud_clear_huge(pudp);
                if (huge_cleared || pud_bad(*pudp))
                        *mask |= PGTBL_PUD_MODIFIED;

                /* Only descend when nothing was cleared and the entry is usable. */
                if (!huge_cleared && !pud_none_or_clear_bad(pudp))
                        vunmap_pmd_range(pudp, addr, chunk_end, mask);

                pudp++;
                addr = chunk_end;
        } while (addr != end);
}

/*
 * Unmap [@addr, @end) at P4D level, below @pgd.
 *
 * p4d_clear_huge() is called on every entry and its result is not used.
 * PGTBL_P4D_MODIFIED is added to *@mask when the entry is bad. Usable
 * entries are descended into with vunmap_pud_range().
 */
static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
                             pgtbl_mod_mask *mask)
{
        p4d_t *p4dp = p4d_offset(pgd, addr);
        unsigned long chunk_end;

        do {
                chunk_end = p4d_addr_end(addr, end);

                p4d_clear_huge(p4dp);
                if (p4d_bad(*p4dp))
                        *mask |= PGTBL_P4D_MODIFIED;

                if (!p4d_none_or_clear_bad(p4dp))
                        vunmap_pud_range(p4dp, addr, chunk_end, mask);

                p4dp++;
                addr = chunk_end;
        } while (addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void __vunmap_range_noflush(unsigned long start, unsigned long end)
{
        pgtbl_mod_mask mask = 0;
        unsigned long cursor = start;
        unsigned long chunk_end;
        pgd_t *pgdp;

        BUG_ON(cursor >= end);

        pgdp = pgd_offset_k(cursor);
        do {
                chunk_end = pgd_addr_end(cursor, end);

                /* Record a bad top-level entry before it gets cleared below. */
                if (pgd_bad(*pgdp))
                        mask |= PGTBL_PGD_MODIFIED;

                if (!pgd_none_or_clear_bad(pgdp))
                        vunmap_p4d_range(pgdp, cursor, chunk_end, &mask);

                pgdp++;
                cursor = chunk_end;
        } while (cursor != end);

        /* Let the architecture propagate changes at levels it cares about. */
        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);
}

/*
 * Unmap [@start, @end) without flushing caches or TLBs: first calls the
 * KMSAN hook for the range, then clears the kernel page-table entries via
 * __vunmap_range_noflush().
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
        kmsan_vunmap_range_noflush(start, end);
        __vunmap_range_noflush(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
        /* Order: cache flush while still mapped, unmap, then TLB flush. */
        flush_cache_vunmap(addr, end);
        vunmap_range_noflush(addr, end);
        flush_tlb_kernel_range(addr, end);
}

/*
 * Install one PTE per page for [@addr, @end) under @pmd, taking the pages
 * from @pages starting at index *@nr.
 *
 * *@nr is a running index shared with the higher levels so that they know
 * how far into @pages the walk has progressed; it is advanced once for every
 * PTE written.
 *
 * Returns 0 on success and adds PGTBL_PTE_MODIFIED to *@mask. Returns
 * -ENOMEM if the PTE table cannot be allocated or a page pointer is NULL,
 * -EBUSY if a PTE is already populated, -EINVAL if a page's pfn is invalid.
 * The last three cases also emit a warning.
 */
static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        pte_t *ptep = pte_alloc_kernel_track(pmd, addr, mask);

        if (!ptep)
                return -ENOMEM;

        do {
                struct page *pg = pages[*nr];

                if (WARN_ON(!pte_none(ptep_get(ptep))))
                        return -EBUSY;
                if (WARN_ON(!pg))
                        return -ENOMEM;
                if (WARN_ON(!pfn_valid(page_to_pfn(pg))))
                        return -EINVAL;

                set_pte_at(&init_mm, addr, ptep, mk_pte(pg, prot));
                (*nr)++;

                ptep++;
                addr += PAGE_SIZE;
        } while (addr != end);

        *mask |= PGTBL_PTE_MODIFIED;
        return 0;
}

/*
 * Map pages for [@addr, @end) at PMD level, below @pud, delegating each
 * PMD-sized step to vmap_pages_pte_range().
 *
 * Returns 0 on success. Any failure, including every error code reported by
 * the PTE level, is returned as -ENOMEM.
 */
static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        unsigned long chunk_end;
        pmd_t *pmdp = pmd_alloc_track(&init_mm, pud, addr, mask);

        if (!pmdp)
                return -ENOMEM;

        do {
                chunk_end = pmd_addr_end(addr, end);
                if (vmap_pages_pte_range(pmdp, addr, chunk_end, prot, pages,
                                         nr, mask))
                        return -ENOMEM;

                pmdp++;
                addr = chunk_end;
        } while (addr != end);

        return 0;
}

/*
 * Map pages for [@addr, @end) at PUD level, below @p4d, delegating each
 * PUD-sized step to vmap_pages_pmd_range().
 *
 * Returns 0 on success, -ENOMEM on any failure.
 */
static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        unsigned long chunk_end;
        pud_t *pudp = pud_alloc_track(&init_mm, p4d, addr, mask);

        if (!pudp)
                return -ENOMEM;

        do {
                chunk_end = pud_addr_end(addr, end);
                if (vmap_pages_pmd_range(pudp, addr, chunk_end, prot, pages,
                                         nr, mask))
                        return -ENOMEM;

                pudp++;
                addr = chunk_end;
        } while (addr != end);

        return 0;
}

/*
 * Map pages for [@addr, @end) at P4D level, below @pgd, delegating each
 * P4D-sized step to vmap_pages_pud_range().
 *
 * Returns 0 on success, -ENOMEM on any failure.
 */
static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
                unsigned long end, pgprot_t prot, struct page **pages, int *nr,
                pgtbl_mod_mask *mask)
{
        unsigned long chunk_end;
        p4d_t *p4dp = p4d_alloc_track(&init_mm, pgd, addr, mask);

        if (!p4dp)
                return -ENOMEM;

        do {
                chunk_end = p4d_addr_end(addr, end);
                if (vmap_pages_pud_range(p4dp, addr, chunk_end, prot, pages,
                                         nr, mask))
                        return -ENOMEM;

                p4dp++;
                addr = chunk_end;
        } while (addr != end);

        return 0;
}

/*
 * Map @pages, one PAGE_SIZE page per PTE, to [@addr, @end) in the kernel
 * page tables. No cache flushing is done here.
 *
 * @addr must be below @end (BUG otherwise). If any level covered by
 * ARCH_PAGE_TABLE_SYNC_MASK was touched, arch_sync_kernel_mappings() is
 * invoked for the whole range, also after a failure.
 *
 * Returns 0 on success or the error reported by vmap_pages_p4d_range().
 */
static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages)
{
        const unsigned long start = addr;
        pgtbl_mod_mask mask = 0;
        unsigned long chunk_end;
        pgd_t *pgdp;
        int page_idx = 0;       /* running index into @pages */
        int err = 0;

        BUG_ON(addr >= end);

        pgdp = pgd_offset_k(addr);
        do {
                chunk_end = pgd_addr_end(addr, end);

                if (pgd_bad(*pgdp))
                        mask |= PGTBL_PGD_MODIFIED;

                err = vmap_pages_p4d_range(pgdp, addr, chunk_end, prot, pages,
                                           &page_idx, &mask);
                if (err)
                        break;

                pgdp++;
                addr = chunk_end;
        } while (addr != end);

        if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
                arch_sync_kernel_mappings(start, end);

        return err;
}

/*
 * __vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches and does not call the KMSAN hook.
 *
 * @addr/@end:  kernel virtual range to populate (@end is non-inclusive)
 * @prot:       page protection for the new mappings
 * @pages:      array of PAGE_SIZE pages backing the range
 * @page_shift: shift of the mapping granule; must be >= PAGE_SHIFT
 *
 * When CONFIG_HAVE_ARCH_HUGE_VMALLOC is disabled, or @page_shift is not
 * larger than PAGE_SHIFT, every page is mapped individually. Otherwise the
 * range is mapped in (1 << @page_shift)-sized steps, each one starting at
 * the physical address of the first page of that step.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 *
 * Returns 0 on success, -errno on failure.
 */
int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

        WARN_ON(page_shift < PAGE_SHIFT);

        /*
         * "<=" rather than "==": a @page_shift below PAGE_SHIFT (already
         * warned about above) must not reach the loop below, where
         * "page_shift - PAGE_SHIFT" would wrap around and be used as a shift
         * count. Fall back to base pages instead, which is what a kernel
         * without CONFIG_HAVE_ARCH_HUGE_VMALLOC does for the same input.
         */
        if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
                        page_shift <= PAGE_SHIFT)
                return vmap_small_pages_range_noflush(addr, end, prot, pages);

        /* i counts PAGE_SIZE pages; each step consumes one granule of them. */
        for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
                int err;

                err = vmap_range_noflush(addr, addr + (1UL << page_shift),
                                        page_to_phys(pages[i]), prot,
                                        page_shift);
                if (err)
                        return err;

                addr += 1UL << page_shift;
        }

        return 0;
}

/*
 * Map @pages to [@addr, @end) without flushing caches. The KMSAN hook runs
 * first; the real mapping is only attempted if the hook returned 0.
 *
 * Returns 0 on success, otherwise the first error encountered.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        int ret;

        ret = kmsan_vmap_pages_range_noflush(addr, end, prot, pages,
                                             page_shift);
        if (!ret)
                ret = __vmap_pages_range_noflush(addr, end, prot, pages,
                                                 page_shift);

        return ret;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int vmap_pages_range(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        int err;

        err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
        /* The cache flush is issued whether or not the mapping succeeded. */
        flush_cache_vmap(addr, end);
        return err;
}

/*
 * Validate that [@start, @end) may be mapped or unmapped inside the sparse
 * vm_area @area. May sleep.
 *
 * Returns 0 if the request is acceptable, -EINVAL (with a one-time warning)
 * if @area has VM_FLUSH_RESET_PERMS or VM_NO_GUARD set or lacks VM_SPARSE,
 * -E2BIG if the range spans more pages than totalram_pages(), and -ERANGE if
 * the range is not fully contained in @area.
 */
static int check_sparse_vm_area(struct vm_struct *area, unsigned long start,
                                unsigned long end)
{
        unsigned long area_start, area_end;

        might_sleep();

        /* Each flag check keeps its own warn-once site. */
        if (WARN_ON_ONCE(area->flags & VM_FLUSH_RESET_PERMS))
                return -EINVAL;
        if (WARN_ON_ONCE(area->flags & VM_NO_GUARD))
                return -EINVAL;
        if (WARN_ON_ONCE(!(area->flags & VM_SPARSE)))
                return -EINVAL;

        if (PFN_DOWN(end - start) > totalram_pages())
                return -E2BIG;

        area_start = (unsigned long)area->addr;
        area_end = area_start + get_vm_area_size(area);
        if (start < area_start || end > area_end)
                return -ERANGE;

        return 0;
}

/**
 * vm_area_map_pages - map pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 * @pages: pages to map (always PAGE_SIZE pages)
 */
int vm_area_map_pages(struct vm_struct *area, unsigned long start,
                      unsigned long end, struct page **pages)
{
        /* Refuse anything check_sparse_vm_area() does not accept. */
        const int err = check_sparse_vm_area(area, start, end);

        return err ? err :
               vmap_pages_range(start, end, PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * vm_area_unmap_pages - unmap pages inside given sparse vm_area
 * @area: vm_area
 * @start: start address inside vm_area
 * @end: end address inside vm_area
 */
void vm_area_unmap_pages(struct vm_struct *area, unsigned long start,
                         unsigned long end)
{
        /* A request rejected by check_sparse_vm_area() is silently ignored. */
        if (!check_sparse_vm_area(area, start, end))
                vunmap_range(start, end);
}

/*
 * Return non-zero if @x is a vmalloc address or, on configurations that
 * define a separate module area, lies in [MODULES_VADDR, MODULES_END).
 */
int is_vmalloc_or_module_addr(const void *x)
{
#if defined(CONFIG_EXECMEM) && defined(MODULES_VADDR)
        /*
         * ARM, x86-64 and sparc64 keep modules in a dedicated region and
         * only fall back on vmalloc() if that fails; the remaining
         * architectures simply place them in the vmalloc space.
         */
        const unsigned long untagged = (unsigned long)kasan_reset_tag(x);

        if (MODULES_VADDR <= untagged && untagged < MODULES_END)
                return 1;
#endif
        return is_vmalloc_addr(x);
}
EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
        const unsigned long va = (unsigned long) vmalloc_addr;
        pgd_t *pgdp = pgd_offset_k(va);
        p4d_t *p4dp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t entry;

        /*
         * XXX this check may need revisiting if VIRTUAL_BUG_ON is ever added
         * for architectures that do not vmalloc module space
         */
        VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

        /* PGD: nothing mapped, or an entry this walker cannot handle. */
        if (pgd_none(*pgdp))
                return NULL;
        if (WARN_ON_ONCE(pgd_leaf(*pgdp)))
                return NULL; /* XXX: no allowance for huge pgd */
        if (WARN_ON_ONCE(pgd_bad(*pgdp)))
                return NULL;

        /* P4D: a leaf entry resolves to the base page inside the huge one. */
        p4dp = p4d_offset(pgdp, va);
        if (p4d_none(*p4dp))
                return NULL;
        if (p4d_leaf(*p4dp))
                return p4d_page(*p4dp) + ((va & ~P4D_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(p4d_bad(*p4dp)))
                return NULL;

        /* PUD: same pattern. */
        pudp = pud_offset(p4dp, va);
        if (pud_none(*pudp))
                return NULL;
        if (pud_leaf(*pudp))
                return pud_page(*pudp) + ((va & ~PUD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pud_bad(*pudp)))
                return NULL;

        /* PMD: same pattern. */
        pmdp = pmd_offset(pudp, va);
        if (pmd_none(*pmdp))
                return NULL;
        if (pmd_leaf(*pmdp))
                return pmd_page(*pmdp) + ((va & ~PMD_MASK) >> PAGE_SHIFT);
        if (WARN_ON_ONCE(pmd_bad(*pmdp)))
                return NULL;

        /* PTE: only a present entry yields a page. */
        entry = ptep_get(pte_offset_kernel(pmdp, va));
        if (!pte_present(entry))
                return NULL;

        return pte_page(entry);
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
        /*
         * NOTE(review): vmalloc_to_page() returns NULL for an unmapped
         * address, and that NULL is passed straight to page_to_pfn() here,
         * so callers are expected to pass only mapped addresses.
         */
        return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

/*
 * Compile-time debug switches, both off by default.
 * NOTE(review): the code they guard is outside this part of the file;
 * judging by the names they enable self-checks of the augmented tree.
 */
#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0


/*
 * NOTE(review): presumably serializes access to free_vmap_area_root and
 * free_vmap_area_list below - confirm against the users.
 */
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* NOTE(review): presumably set once vmap initialization has completed. */
static bool vmap_initialized __read_mostly;

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used in pair with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains a maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find a lowest match of free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

/*
 * This structure defines a single, solid model where a list and
 * rb-tree are part of one entity protected by the lock. Nodes are
 * sorted in ascending order, thus for O(1) access to left/right
 * neighbors a list is used as well as for sequential traversal.
 */
struct rb_list {
        /* Address-sorted tree used for lookups. */
        struct rb_root root;
        /* List of the same objects, for neighbor access and traversal. */
        struct list_head head;
        /* Protects both @root and @head. */
        spinlock_t lock;
};

/*
 * A fast size-segregated storage for VAs of up to MAX_VA_SIZE_PAGES
 * pages (1M assuming 4K pages). Each pool is a list of ready-to-go
 * VAs of one particular size. The index in the pool-array is the
 * number of pages minus one, see size_to_va_pool().
 */
#define MAX_VA_SIZE_PAGES 256

struct vmap_pool {
        /* VAs parked in this pool, linked through vmap_area::list. */
        struct list_head head;
        /* Number of VAs on @head, see node_pool_add_va(). */
        unsigned long len;
};

/*
 * An effective vmap-node logic. Users make use of nodes instead
 * of a global heap. It allows to balance an access and mitigate
 * contention.
 *
 * "single" is the one statically allocated node that vmap_nodes
 * initially points to.
 */
static struct vmap_node {
        /* Simple size segregated storage, indexed by size_to_va_pool(). */
        struct vmap_pool pool[MAX_VA_SIZE_PAGES];
        /* Taken around pool list updates, see node_pool_add_va(). */
        spinlock_t pool_lock;
        /* NOTE(review): users are outside this section - confirm meaning. */
        bool skip_populate;

        /* Bookkeeping data of this node. */
        /* Areas in use; free_vmap_area() unlinks from it. */
        struct rb_list busy;
        /* NOTE(review): presumably lazily freed areas - confirm against users. */
        struct rb_list lazy;

        /*
         * Ready-to-free areas.
         */
        struct list_head purge_list;
        struct work_struct purge_work;
        unsigned long nr_purged;
} single;

/*
 * Initial setup consists of one single node, i.e. a balancing
 * is fully disabled. Later on, after vmap is initialized these
 * parameters are updated based on a system capacity.
 */
/* Array of nodes, indexed by node id. */
static struct vmap_node *vmap_nodes = &single;
/* Number of entries in vmap_nodes. */
static __read_mostly unsigned int nr_vmap_nodes = 1;
/* Divisor applied to an address before it is mapped to a node id. */
static __read_mostly unsigned int vmap_zone_size = 1;

/*
 * Map an address to a node id: the address is split into zones of
 * vmap_zone_size and zones are spread over nodes round-robin.
 */
static inline unsigned int
addr_to_node_id(unsigned long addr)
{
        unsigned long zone = addr / vmap_zone_size;

        return zone % nr_vmap_nodes;
}

/* Return the vmap_node that addr_to_node_id() selects for @addr. */
static inline struct vmap_node *
addr_to_node(unsigned long addr)
{
        unsigned int id = addr_to_node_id(addr);

        return vmap_nodes + id;
}

/*
 * Return the vmap_node for @id. The id is wrapped by nr_vmap_nodes,
 * so any value yields a valid array entry.
 */
static inline struct vmap_node *
id_to_node(unsigned int id)
{
        unsigned int idx = id % nr_vmap_nodes;

        return vmap_nodes + idx;
}

/*
 * The value 0 is reserved to mean "no node", so a node-id is
 * stored incremented by one and is therefore always greater
 * than 0. Only ids within [0:nr_vmap_nodes - 1] can be encoded.
 * For any other id a warning is emitted once and 0 is returned.
 */
static unsigned int
encode_vn_id(unsigned int node_id)
{
        /* Out of range: warn and report "no node". */
        if (node_id >= nr_vmap_nodes) {
                WARN_ONCE(1, "Encode wrong node id (%u)\n", node_id);
                return 0;
        }

        /* The id + 1 is placed above the low BITS_PER_BYTE bits. */
        return (node_id + 1) << BITS_PER_BYTE;
}

/*
 * Returns a decoded node-id, the valid range is within
 * [0:nr_vmap_nodes-1] values. Otherwise nr_vmap_nodes is
 * returned if extracted data is wrong.
 *
 * An encoded value of zero ("no node") decodes to UINT_MAX
 * because of the unsigned "- 1". That case is rejected
 * silently, any other out-of-range value triggers a warning.
 */
static unsigned int
decode_vn_id(unsigned int val)
{
        unsigned int node_id = (val >> BITS_PER_BYTE) - 1;

        if (node_id < nr_vmap_nodes)
                return node_id;

        /* If it was _not_ zero, warn. node_id is unsigned, print it as such. */
        WARN_ONCE(node_id != UINT_MAX,
                "Decode wrong node id (%u)\n", node_id);

        return nr_vmap_nodes;
}

/* A node id is valid when it indexes an existing entry of vmap_nodes. */
static bool
is_vn_id_valid(unsigned int node_id)
{
        return node_id < nr_vmap_nodes;
}

/* Size in bytes of the [va_start:va_end) range described by @va. */
static __always_inline unsigned long
va_size(struct vmap_area *va)
{
        unsigned long start = va->va_start;
        unsigned long end = va->va_end;

        return end - start;
}

/*
 * Return the subtree_max_size stored in the vmap_area that embeds
 * @node, or 0 when @node is NULL (i.e. there is no such child).
 */
static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
        if (!node)
                return 0;

        return rb_entry(node, struct vmap_area, rb_node)->subtree_max_size;
}

/*
 * Generate the free_vmap_area_rb_augment_cb callbacks that maintain
 * vmap_area::subtree_max_size, with va_size() as the per-node value.
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
        struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

/* Forward declarations; the definitions are outside this section. */
static void reclaim_and_purge_vmap_areas(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static void drain_vmap_area_work(struct work_struct *work);
static DECLARE_WORK(drain_vmap_work, drain_vmap_area_work);

/* Counter reported by vmalloc_nr_pages(). */
static atomic_long_t nr_vmalloc_pages;

/* Return the current value of the nr_vmalloc_pages counter. */
unsigned long vmalloc_nr_pages(void)
{
        long nr = atomic_long_read(&nr_vmalloc_pages);

        return nr;
}

/*
 * Look up the VA in @root whose [va_start:va_end) range contains
 * @addr. The KASAN tag is stripped from @addr before comparing.
 * Returns NULL if no VA covers the address.
 */
static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
        struct rb_node *cur;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        for (cur = root->rb_node; cur; ) {
                struct vmap_area *area;

                area = rb_entry(cur, struct vmap_area, rb_node);
                if (addr >= area->va_start && addr < area->va_end)
                        return area;

                /* Not inside this VA: descend towards the address. */
                cur = (addr < area->va_start) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * Find the lowest VA in @root that ends above @addr, i.e. the
 * first one which satisfies addr < va_end. NULL if there is none.
 */
static struct vmap_area *
__find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
{
        struct vmap_area *best = NULL;
        struct rb_node *cur = root->rb_node;

        addr = (unsigned long)kasan_reset_tag((void *)addr);

        while (cur) {
                struct vmap_area *area;

                area = rb_entry(cur, struct vmap_area, rb_node);

                /* Ends at or below addr: only the right side can match. */
                if (area->va_end <= addr) {
                        cur = cur->rb_right;
                        continue;
                }

                /* A candidate. It is final if it also covers addr. */
                best = area;
                if (area->va_start <= addr)
                        break;

                cur = cur->rb_left;
        }

        return best;
}

/*
 * Scan the busy trees of all nodes for the VA with the lowest start
 * address that satisfies addr < va_end and return the node where it
 * resides. On success the VA is stored in @va and the busy lock of
 * the returned node is held; the caller must unlock it when the VA
 * is no longer accessed.
 *
 * Returns NULL if nothing found.
 */
static struct vmap_node *
find_vmap_area_exceed_addr_lock(unsigned long addr, struct vmap_area **va)
{
        unsigned long lowest;
        struct vmap_node *vn;
        unsigned int i;

        for (;;) {
                lowest = 0;

                /* Pass one: find the lowest candidate start over all nodes. */
                for (i = 0; i < nr_vmap_nodes; i++) {
                        vn = &vmap_nodes[i];

                        spin_lock(&vn->busy.lock);
                        *va = __find_vmap_area_exceed_addr(addr, &vn->busy.root);

                        if (*va && (!lowest || (*va)->va_start < lowest))
                                lowest = (*va)->va_start;
                        spin_unlock(&vn->busy.lock);
                }

                if (!lowest)
                        return NULL;

                /*
                 * Pass two: re-lookup the candidate under its node lock.
                 * It might have gone away in between because the locks
                 * were dropped. That is rare; in this case search again.
                 */
                vn = addr_to_node(lowest);

                spin_lock(&vn->busy.lock);
                *va = __find_vmap_area(lowest, &vn->busy.root);
                if (*va)
                        return vn;

                spin_unlock(&vn->busy.lock);
        }
}

/*
 * Find the place where @va has to be attached. On success the
 * address of the link (the parent's rb_left or rb_right slot, or
 * the root slot of an empty tree) is returned and the parent node
 * is stored in @parent (NULL for an empty tree).
 *
 * The walk starts at @root if it is given, otherwise at @from.
 *
 * NULL is returned if @va overlaps with a VA met during the walk.
 * In that case a warning is emitted; inserting the conflicting
 * range has to be declined and is considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
        struct rb_root *root, struct rb_node *from,
        struct rb_node **parent)
{
        struct vmap_area *tmp_va;
        struct rb_node **link;

        if (root) {
                link = &root->rb_node;
                /* Empty tree: the new node goes straight into the root slot. */
                if (unlikely(!*link)) {
                        *parent = NULL;
                        return link;
                }
        } else {
                /* Start at the given node; "link" points to the local copy. */
                link = &from;
        }

        /*
         * Go to the bottom of the tree. When we hit the last point
         * we end up with the parent rb_node and the correct direction,
         * named "link" here, where the new va->rb_node will be attached.
         */
        do {
                tmp_va = rb_entry(*link, struct vmap_area, rb_node);

                /*
                 * During the traversal we also do some sanity check.
                 * Emit a WARN() and bail out if there are sides
                 * (left/right) or full overlaps.
                 */
                if (va->va_end <= tmp_va->va_start)
                        link = &(*link)->rb_left;
                else if (va->va_start >= tmp_va->va_end)
                        link = &(*link)->rb_right;
                else {
                        WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
                                va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

                        return NULL;
                }
        } while (*link);

        /* tmp_va is the last node visited, i.e. the future parent. */
        *parent = &tmp_va->rb_node;
        return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
        struct list_head *list;

        if (unlikely(!parent))
                /*
                 * The red-black tree where we try to find VA neighbors
                 * before merging or inserting is empty, i.e. it means
                 * there is no free vmap space. Normally it does not
                 * happen but we handle this case anyway.
                 */
                return NULL;

        list = &rb_entry(parent, struct vmap_area, rb_node)->list;
        return (&parent->rb_right == link ? list->next : list);
}

/*
 * Attach @va to the tree at @link under @parent and add it to the
 * address-sorted list. @parent and @link are the values produced by
 * find_va_links(). @head is the list head; it is only used as the
 * insertion point when the tree is empty (no parent). With @augment
 * the augmented insert is used.
 */
static __always_inline void
__link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head, bool augment)
{
        /*
         * VA is still not in the list, but we can
         * identify its future previous list_head node:
         * the parent itself when attaching to its right,
         * the parent's predecessor when attaching to its left.
         */
        if (likely(parent)) {
                head = &rb_entry(parent, struct vmap_area, rb_node)->list;
                if (&parent->rb_right != link)
                        head = head->prev;
        }

        /* Insert to the rb-tree */
        rb_link_node(&va->rb_node, parent, link);
        if (augment) {
                /*
                 * Some explanation here. Just perform simple insertion
                 * to the tree. We do not set va->subtree_max_size to
                 * its current size before calling rb_insert_augmented().
                 * It is because we populate the tree from the bottom
                 * to parent levels when the node _is_ in the tree.
                 *
                 * Therefore we set subtree_max_size to zero after insertion,
                 * to let augment_tree_propagate_from() put everything in
                 * the correct order later on.
                 */
                rb_insert_augmented(&va->rb_node,
                        root, &free_vmap_area_rb_augment_cb);
                va->subtree_max_size = 0;
        } else {
                rb_insert_color(&va->rb_node, root);
        }

        /* Address-sort this list */
        list_add(&va->list, head);
}

/* Link @va into a plain (non-augmented) tree and its list, see __link_va(). */
static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        __link_va(va, root, parent, link, head, false);
}

/* Link @va into an augmented tree and its list, see __link_va(). */
static __always_inline void
link_va_augment(struct vmap_area *va, struct rb_root *root,
        struct rb_node *parent, struct rb_node **link,
        struct list_head *head)
{
        __link_va(va, root, parent, link, head, true);
}

/*
 * Remove @va from the tree @root and from its list. Warns and does
 * nothing if the node is not linked into a tree. With @augment the
 * augmented erase is used. Afterwards the rb_node is marked empty
 * and the list entry is re-initialized.
 */
static __always_inline void
__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
        if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
                return;

        if (!augment)
                rb_erase(&va->rb_node, root);
        else
                rb_erase_augmented(&va->rb_node,
                        root, &free_vmap_area_rb_augment_cb);

        list_del_init(&va->list);
        RB_CLEAR_NODE(&va->rb_node);
}

/* Unlink @va from a plain (non-augmented) tree and its list, see __unlink_va(). */
static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
        __unlink_va(va, root, false);
}

/* Unlink @va from an augmented tree and its list, see __unlink_va(). */
static __always_inline void
unlink_va_augment(struct vmap_area *va, struct rb_root *root)
{
        __unlink_va(va, root, true);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
 * Recompute what subtree_max_size of @va should be: the biggest of
 * its own size and the values stored in its left and right children.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
        unsigned long left = get_subtree_max_size(va->rb_node.rb_left);
        unsigned long right = get_subtree_max_size(va->rb_node.rb_right);

        return max3(va_size(va), left, right);
}

/*
 * Debug helper: walk the free list and report every VA whose stored
 * subtree_max_size differs from the freshly computed value.
 */
static void
augment_tree_propagate_check(void)
{
        struct vmap_area *va;

        list_for_each_entry(va, &free_vmap_area_list, list) {
                if (compute_subtree_max_size(va) == va->subtree_max_size)
                        continue;

                pr_emerg("tree is corrupted: %lu, %lu\n",
                        va_size(va), va->subtree_max_size);
        }
}
#endif

/*
 * This function populates subtree_max_size from bottom to upper
 * levels starting from VA point. The propagation must be done
 * when VA size is modified by changing its va_start/va_end. Or
 * in case of newly inserting of VA to the tree.
 *
 * It means that augment_tree_propagate_from() must be called:
 * - After VA has been inserted to the tree(free path);
 * - After VA has been shrunk(allocation path);
 * - After VA has been increased(merging path).
 *
 * Please note that, it does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the time up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1
 * only its subtree_max_size is updated, and set to 1. If we shrink
 * the node 8 to 6, then its subtree_max_size is set to 6 and parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
        /*
         * Populate the tree from bottom towards the root until
         * the calculated maximum available size of checked node
         * is equal to its current one.
         */
        free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
        /* Debug only: verify the whole free list after the update. */
        augment_tree_propagate_check();
#endif
}

/*
 * Insert @va into the plain tree @root and the list @head. Nothing
 * is inserted if find_va_links() reports an overlap.
 */
static void
insert_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link = find_va_links(va, root, NULL, &parent);

        if (!link)
                return;

        link_va(va, root, parent, link, head);
}

/*
 * Insert @va into the augmented tree @root and the list @head, then
 * propagate subtree_max_size upwards. If @from is given the search
 * for the insertion point starts at that node instead of the root.
 * Nothing is inserted if find_va_links() reports an overlap.
 */
static void
insert_vmap_area_augment(struct vmap_area *va,
        struct rb_node *from, struct rb_root *root,
        struct list_head *head)
{
        struct rb_node *parent;
        struct rb_node **link;

        link = from ? find_va_links(va, NULL, from, &parent) :
                      find_va_links(va, root, NULL, &parent);
        if (!link)
                return;

        link_va_augment(va, root, parent, link, head);
        augment_tree_propagate_from(va);
}

/*
 * Merge de-allocated chunk of VA memory with previous
 * and next free blocks. If coalescing is not possible a
 * new free area is inserted. If VA has been merged, its
 * vmap_area object is freed and the surviving neighbor
 * is returned instead.
 *
 * Please note, it can return NULL in case of overlapping
 * ranges, followed by a WARN() report. Although that is
 * buggy behaviour, the system can stay alive and keep
 * going.
 */
static __always_inline struct vmap_area *
__merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head, bool augment)
{
        struct vmap_area *sibling;
        struct list_head *next;
        struct rb_node **link;
        struct rb_node *parent;
        bool merged = false;

        /*
         * Find a place in the tree where VA potentially will be
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
        if (!link)
                return NULL;

        /*
         * Get next node of VA to check if merging can be done.
         * NULL means the tree is empty, so there is nothing to
         * merge with.
         */
        next = get_va_next_sibling(parent, link);
        if (unlikely(next == NULL))
                goto insert;

        /*
         * start            end
         * |                |
         * |<------VA------>|<-----Next----->|
         *                  |                |
         *                  start            end
         */
        if (next != head) {
                sibling = list_entry(next, struct vmap_area, list);
                if (sibling->va_start == va->va_end) {
                        sibling->va_start = va->va_start;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

        /*
         * start            end
         * |                |
         * |<-----Prev----->|<------VA------>|
         *                  |                |
         *                  start            end
         */
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
                        /*
                         * If both neighbors are coalesced, it is important
                         * to unlink the "next" node first, followed by merging
                         * with "previous" one. Otherwise the tree might not be
                         * fully populated if a sibling's augmented value is
                         * "normalized" because of rotation operations.
                         */
                        if (merged)
                                __unlink_va(va, root, augment);

                        sibling->va_end = va->va_end;

                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);

                        /* Point to the new merged area. */
                        va = sibling;
                        merged = true;
                }
        }

insert:
        /* No neighbor absorbed the range: link VA as a new free area. */
        if (!merged)
                __link_va(va, root, parent, link, head, augment);

        return va;
}

/*
 * Merge or insert @va using a plain (non-augmented) tree, see
 * __merge_or_add_vmap_area(). Returns NULL on overlap.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        return __merge_or_add_vmap_area(va, root, head, false);
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
        struct rb_root *root, struct list_head *head)
{
        va = __merge_or_add_vmap_area(va, root, head, true);
        if (va)
                augment_tree_propagate_from(va);

        return va;
}

/*
 * Check whether a block of @size bytes, aligned to @align and not
 * starting below @vstart, fits into the free area @va.
 */
static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        unsigned long lowest, nva_start_addr, nva_end_addr;

        /* The candidate starts at the higher of va_start and vstart. */
        lowest = (va->va_start > vstart) ? va->va_start : vstart;
        nva_start_addr = ALIGN(lowest, align);
        nva_end_addr = nva_start_addr + size;

        /* Can be overflowed due to big size or alignment. */
        if (nva_end_addr < nva_start_addr || nva_start_addr < vstart)
                return false;

        return nva_end_addr <= va->va_end;
}

/*
 * Find the first free block (lowest start address) in the tree
 * that satisfies the request described by the passed parameters.
 * Please note, when @adjust_search_size is set, the search length
 * is increased by align - 1 to account for worst case alignment
 * overhead.
 *
 * Returns the matching VA or NULL if there is none.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(struct rb_root *root, unsigned long size,
        unsigned long align, unsigned long vstart, bool adjust_search_size)
{
        struct vmap_area *va;
        struct rb_node *node;
        unsigned long length;

        /* Start from the root. */
        node = root->rb_node;

        /* Adjust the search size for alignment overhead. */
        length = adjust_search_size ? size + align - 1 : size;

        while (node) {
                va = rb_entry(node, struct vmap_area, rb_node);

                /*
                 * Prefer the left sub-tree (lower addresses) as long as
                 * it holds a big enough block and may lie above vstart.
                 */
                if (get_subtree_max_size(node->rb_left) >= length &&
                                vstart < va->va_start) {
                        node = node->rb_left;
                } else {
                        if (is_within_this_va(va, size, align, vstart))
                                return va;

                        /*
                         * Does not make sense to go deeper towards the right
                         * sub-tree if it does not have a free block that is
                         * equal or bigger to the requested search length.
                         */
                        if (get_subtree_max_size(node->rb_right) >= length) {
                                node = node->rb_right;
                                continue;
                        }

                        /*
                         * OK. We roll back and find the first right sub-tree,
                         * that will satisfy the search criteria. It can happen
                         * due to "vstart" restriction or an alignment overhead
                         * that is bigger than PAGE_SIZE.
                         */
                        while ((node = rb_parent(node))) {
                                va = rb_entry(node, struct vmap_area, rb_node);
                                if (is_within_this_va(va, size, align, vstart))
                                        return va;

                                if (get_subtree_max_size(node->rb_right) >= length &&
                                                vstart <= va->va_start) {
                                        /*
                                         * Shift the vstart forward. Please note, we update it with
                                         * parent's start address adding "1" because we do not want
                                         * to enter same sub-tree after it has already been checked
                                         * and no suitable free block found there.
                                         */
                                        vstart = va->va_start + 1;
                                        node = node->rb_right;
                                        break;
                                }
                        }
                }
        }

        return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

/*
 * Debug helper: return the first VA on the list @head that passes
 * is_within_this_va() for the request, NULL if none does.
 */
static struct vmap_area *
find_vmap_lowest_linear_match(struct list_head *head, unsigned long size,
        unsigned long align, unsigned long vstart)
{
        struct vmap_area *va;

        list_for_each_entry(va, head, list) {
                if (is_within_this_va(va, size, align, vstart))
                        return va;
        }

        return NULL;
}

/*
 * Debug helper: pick a random vstart above VMALLOC_START and compare
 * the result of the tree search with the result of the linear list
 * search. A mismatch is reported with pr_emerg().
 */
static void
find_vmap_lowest_match_check(struct rb_root *root, struct list_head *head,
                             unsigned long size, unsigned long align)
{
        struct vmap_area *tree_va, *list_va;
        unsigned long vstart;
        unsigned int rnd;

        get_random_bytes(&rnd, sizeof(rnd));
        vstart = VMALLOC_START + rnd;

        tree_va = find_vmap_lowest_match(root, size, align, vstart, false);
        list_va = find_vmap_lowest_linear_match(head, size, align, vstart);

        if (tree_va == list_va)
                return;

        pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
                tree_va, list_va, vstart);
}
#endif

/*
 * How a requested range sits inside a free area, as determined by
 * classify_va_fit_type(). NOTHING_FIT means the range is not fully
 * inside the area.
 */
enum fit_type {
        NOTHING_FIT = 0,
        FL_FIT_TYPE = 1,        /* full fit */
        LE_FIT_TYPE = 2,        /* left edge fit */
        RE_FIT_TYPE = 3,        /* right edge fit */
        NE_FIT_TYPE = 4                /* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
        unsigned long nva_start_addr, unsigned long size)
{
        enum fit_type type;

        /* Check if it is within VA. */
        if (nva_start_addr < va->va_start ||
                        nva_start_addr + size > va->va_end)
                return NOTHING_FIT;

        /* Now classify. */
        if (va->va_start == nva_start_addr) {
                if (va->va_end == nva_start_addr + size)
                        type = FL_FIT_TYPE;
                else
                        type = LE_FIT_TYPE;
        } else if (va->va_end == nva_start_addr + size) {
                type = RE_FIT_TYPE;
        } else {
                type = NE_FIT_TYPE;
        }

        return type;
}

/*
 * Carve the range [nva_start_addr : nva_start_addr + size) out of
 * the free area @va, which lives in the augmented tree @root and
 * the list @head. Depending on the fit type @va is removed, shrunk,
 * or split in two.
 *
 * Returns 0 on success. Returns -1 if the range does not fit into
 * @va or if the extra object needed for a split cannot be allocated.
 */
static __always_inline int
va_clip(struct rb_root *root, struct list_head *head,
                struct vmap_area *va, unsigned long nva_start_addr,
                unsigned long size)
{
        struct vmap_area *lva = NULL;
        enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);

        if (type == FL_FIT_TYPE) {
                /*
                 * No need to split VA, it fully fits.
                 *
                 * |               |
                 * V      NVA      V
                 * |---------------|
                 */
                unlink_va_augment(va, root);
                kmem_cache_free(vmap_area_cachep, va);
        } else if (type == LE_FIT_TYPE) {
                /*
                 * Split left edge of fit VA.
                 *
                 * |       |
                 * V  NVA  V   R
                 * |-------|-------|
                 */
                va->va_start += size;
        } else if (type == RE_FIT_TYPE) {
                /*
                 * Split right edge of fit VA.
                 *
                 *         |       |
                 *     L   V  NVA  V
                 * |-------|-------|
                 */
                va->va_end = nva_start_addr;
        } else if (type == NE_FIT_TYPE) {
                /*
                 * Split no edge of fit VA.
                 *
                 *     |       |
                 *   L V  NVA  V R
                 * |---|-------|---|
                 */
                /* Take the preloaded per-CPU object, if there is one. */
                lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
                if (unlikely(!lva)) {
                        /*
                         * For percpu allocator we do not do any pre-allocation
                         * and leave it as it is. The reason is it most likely
                         * never ends up with NE_FIT_TYPE splitting. In case of
                         * percpu allocations offsets and sizes are aligned to
                         * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
                         * are its main fitting cases.
                         *
                         * There are a few exceptions though, as an example it is
                         * a first allocation (early boot up) when we have "one"
                         * big free space that has to be split.
                         *
                         * Also we can hit this path in case of regular "vmap"
                         * allocations, if "this" current CPU was not preloaded.
                         * See the comment in alloc_vmap_area() why. If so, then
                         * GFP_NOWAIT is used instead to get an extra object for
                         * split purpose. That is rare and most time does not
                         * occur.
                         *
                         * What happens if an allocation fails. Basically,
                         * an "overflow" path is triggered to purge lazily freed
                         * areas to free some memory, then, the "retry" path is
                         * triggered to repeat one more time. See more details
                         * in alloc_vmap_area() function.
                         */
                        lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!lva)
                                return -1;
                }

                /*
                 * Build the remainder, i.e. the left part "L".
                 */
                lva->va_start = va->va_start;
                lva->va_end = nva_start_addr;

                /*
                 * Shrink this VA to remaining size, i.e. the right part "R".
                 */
                va->va_start = nva_start_addr + size;
        } else {
                /* NOTHING_FIT: the range is not within this VA. */
                return -1;
        }

        if (type != FL_FIT_TYPE) {
                /* The size of VA has changed, update the augmented values. */
                augment_tree_propagate_from(va);

                if (lva)        /* type == NE_FIT_TYPE */
                        insert_vmap_area_augment(lva, &va->rb_node, root, head);
        }

        return 0;
}

/*
 * Allocate @size bytes aligned to @align from the free area @va,
 * not below @vstart and not beyond @vend. Returns the start address
 * of the allocated range, or @vend to indicate failure.
 */
static unsigned long
va_alloc(struct vmap_area *va,
                struct rb_root *root, struct list_head *head,
                unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend)
{
        unsigned long lowest, nva_start_addr;
        int ret;

        /* The allocation starts at the higher of va_start and vstart. */
        lowest = (va->va_start > vstart) ? va->va_start : vstart;
        nva_start_addr = ALIGN(lowest, align);

        /* Check the "vend" restriction. */
        if (nva_start_addr + size > vend)
                return vend;

        /* Update the free vmap_area. */
        ret = va_clip(root, head, va, nva_start_addr, size);
        if (WARN_ON_ONCE(ret))
                return vend;

        return nva_start_addr;
}

/*
 * Find the lowest suitable free area and allocate from it.
 * Returns the start address of the newly allocated area on success.
 * Otherwise @vend is returned, which indicates failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(struct rb_root *root, struct list_head *head,
        unsigned long size, unsigned long align,
        unsigned long vstart, unsigned long vend)
{
        unsigned long nva_start_addr;
        struct vmap_area *va;
        bool adjust_search_size;

        /*
         * The search size is adjusted only for align > PAGE_SIZE.
         * With a smaller alignment it does not make any sense, all
         * blocks (their start addresses) are at least PAGE_SIZE
         * aligned anyway.
         *
         * It is also not adjusted for a short range where the
         * requested size corresponds exactly to the [vstart:vend]
         * interval. With an adjusted search length such an
         * allocation would not succeed.
         */
        adjust_search_size = align > PAGE_SIZE && (vend - vstart) != size;

        va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
        if (unlikely(!va))
                return vend;

        /* On failure va_alloc() hands back vend, which is passed through. */
        nva_start_addr = va_alloc(va, root, head, size, align, vstart, vend);

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
        if (nva_start_addr != vend)
                find_vmap_lowest_match_check(root, head, size, align);
#endif

        return nva_start_addr;
}

/*
 * Free a region of KVA allocated by alloc_vmap_area: the area is
 * taken off the busy tree/list of its node and handed back to the
 * global free tree/list.
 */
static void free_vmap_area(struct vmap_area *va)
{
        struct vmap_node *owner;

        /* Detach from the busy tree/list of the node owning va_start. */
        owner = addr_to_node(va->va_start);

        spin_lock(&owner->busy.lock);
        unlink_va(va, &owner->busy.root);
        spin_unlock(&owner->busy.lock);

        /* Give the range back: insert or merge into the free tree/list. */
        spin_lock(&free_vmap_area_lock);
        merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
        spin_unlock(&free_vmap_area_lock);
}

/*
 * Take @lock, making sure beforehand that this CPU has a spare vmap_area
 * object stashed in ne_fit_preload_node. The spare is consumed when the
 * fit type of a free area is NE_FIT_TYPE.
 *
 * The spare is allocated before the lock is taken, i.e. in non-atomic
 * context, so that more permissive allocation masks can be used. That
 * makes the allocation more stable under low memory and high memory
 * pressure.
 */
static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
        struct vmap_area *spare = NULL;
        struct vmap_area *expected = NULL;

        if (!this_cpu_read(ne_fit_preload_node))
                spare = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

        spin_lock(lock);

        if (!spare)
                return;

        /*
         * Install the spare only if the per-CPU slot is still empty;
         * otherwise the object is not needed and goes back to the cache.
         */
        if (!__this_cpu_try_cmpxchg(ne_fit_preload_node, &expected, spare))
                kmem_cache_free(vmap_area_cachep, spare);
}

/*
 * Map an area size to the pool of @vn that caches areas of that size.
 * Pool index N serves sizes in (N * PAGE_SIZE, (N + 1) * PAGE_SIZE].
 *
 * Returns NULL when no pool covers @size, i.e. it is larger than
 * MAX_VA_SIZE_PAGES pages (a zero size wraps around and also lands here).
 */
static struct vmap_pool *
size_to_va_pool(struct vmap_node *vn, unsigned long size)
{
        /*
         * Keep the index as wide as @size. Narrowing it to 32 bits
         * would let a huge size alias a valid pool slot on 64-bit.
         */
        unsigned long idx = (size - 1) / PAGE_SIZE;

        if (idx < MAX_VA_SIZE_PAGES)
                return &vn->pool[idx];

        return NULL;
}

static bool
node_pool_add_va(struct vmap_node *n, struct vmap_area *va)
{
        struct vmap_pool *vp;

        vp = size_to_va_pool(n, va_size(va));
        if (!vp)
                return false;

        spin_lock(&n->pool_lock);
        list_add(&va->list, &vp->head);
        WRITE_ONCE(vp->len, vp->len + 1);
        spin_unlock(&n->pool_lock);

        return true;
}

/*
 * Try to take a cached area of @size bytes out of the pools of @vn.
 *
 * Only the first entry of the matching pool is considered. If it is not
 * aligned to @align it is rotated to the tail of the pool and NULL is
 * returned. NULL is also returned if the pool is empty or the entry fails
 * the sanity checks against @size and [@vstart:@vend].
 */
static struct vmap_area *
node_pool_del_va(struct vmap_node *vn, unsigned long size,
                unsigned long align, unsigned long vstart,
                unsigned long vend)
{
        struct vmap_pool *vp = size_to_va_pool(vn, size);
        struct vmap_area *first, *found = NULL;
        int err;

        /* Unlocked peek; emptiness is checked again under the lock. */
        if (!vp || list_empty(&vp->head))
                return NULL;

        spin_lock(&vn->pool_lock);
        if (list_empty(&vp->head))
                goto unlock;

        first = list_first_entry(&vp->head, struct vmap_area, list);
        if (!IS_ALIGNED(first->va_start, align)) {
                list_move_tail(&first->list, &vp->head);
                goto unlock;
        }

        /*
         * Sanity check the candidate and emit a warning if any of
         * the checks below detects an error.
         */
        err = (va_size(first) != size) |
                (first->va_start < vstart) |
                (first->va_end > vend);
        if (WARN_ON_ONCE(err))
                goto unlock;

        list_del_init(&first->list);
        WRITE_ONCE(vp->len, vp->len - 1);
        found = first;

unlock:
        spin_unlock(&vn->pool_lock);

        return found;
}

/*
 * Try to satisfy a request from the pool of the node picked for the
 * current CPU.
 *
 * On return *@addr is the start of the found area, or @vend if nothing
 * was found. *@vn_id is the encoded id of the node that was tried, or 0
 * when the pools were bypassed.
 */
static struct vmap_area *
node_alloc(unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend,
                unsigned long *addr, unsigned int *vn_id)
{
        struct vmap_area *va;
        unsigned int id;

        *vn_id = 0;
        *addr = vend;

        /*
         * Only requests for the whole vmalloc range use the pools, and
         * only when there is more than one node. Everything else falls
         * back to the global heap.
         */
        if (vstart != VMALLOC_START || vend != VMALLOC_END ||
                        nr_vmap_nodes == 1)
                return NULL;

        id = raw_smp_processor_id() % nr_vmap_nodes;
        va = node_pool_del_va(id_to_node(id), size, align, vstart, vend);
        *vn_id = encode_vn_id(id);

        if (va)
                *addr = va->va_start;

        return va;
}

/*
 * Fill in @vm from the range covered by @va, record @flags and @caller,
 * and point @va back at @vm.
 */
static inline void setup_vmalloc_vm(struct vm_struct *vm,
        struct vmap_area *va, unsigned long flags, const void *caller)
{
        /* Describe the range ... */
        vm->addr = (void *)va->va_start;
        vm->size = va_size(va);

        /* ... record who asked for it and how ... */
        vm->flags = flags;
        vm->caller = caller;

        /* ... and bind the two objects together. */
        va->vm = vm;
}

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend. If vm is passed in, the two will also be bound.
 *
 * @size must be a non-zero multiple of PAGE_SIZE and @align a power of two.
 * May sleep.
 *
 * Returns the new vmap_area, already inserted into the busy tree of the
 * node its start address maps to, or an ERR_PTR():
 *   -EINVAL  bad @size or @align;
 *   -EBUSY   vmap is not initialized yet, or no address space was found
 *            even after purging and calling the purge notifiers;
 *   -ENOMEM  a vmap_area object could not be allocated;
 *   or the error returned by kasan_populate_vmalloc().
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long align,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask,
                                unsigned long va_flags, struct vm_struct *vm)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        unsigned long freed;
        unsigned long addr;
        unsigned int vn_id;
        int purged = 0;
        int ret;

        if (unlikely(!size || offset_in_page(size) || !is_power_of_2(align)))
                return ERR_PTR(-EINVAL);

        if (unlikely(!vmap_initialized))
                return ERR_PTR(-EBUSY);

        might_sleep();

        /*
         * If a VA is obtained from a global heap(if it fails here)
         * it is anyway marked with this "vn_id" so it is returned
         * to this pool's node later. Such way gives a possibility
         * to populate pools based on users demand.
         *
         * On success a ready to go VA is returned.
         */
        va = node_alloc(size, align, vstart, vend, &addr, &vn_id);
        if (!va) {
                /* Pool miss: get a fresh object; addr is still vend here. */
                gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

                va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
                if (unlikely(!va))
                        return ERR_PTR(-ENOMEM);

                /*
                 * Only scan the relevant parts containing pointers to other objects
                 * to avoid false negatives.
                 */
                kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);
        }

retry:
        /* addr == vend means no address has been obtained so far. */
        if (addr == vend) {
                preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
                addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
                        size, align, vstart, vend);
                spin_unlock(&free_vmap_area_lock);
        }

        trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend);

        /*
         * If an allocation fails, the "vend" address is
         * returned. Therefore trigger the overflow path.
         */
        if (unlikely(addr == vend))
                goto overflow;

        va->va_start = addr;
        va->va_end = addr + size;
        va->vm = NULL;
        /* Remember the node id so the area can go back to that node's pool. */
        va->flags = (va_flags | vn_id);

        if (vm) {
                vm->addr = (void *)va->va_start;
                vm->size = va_size(va);
                va->vm = vm;
        }

        vn = addr_to_node(va->va_start);

        spin_lock(&vn->busy.lock);
        insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        spin_unlock(&vn->busy.lock);

        BUG_ON(!IS_ALIGNED(va->va_start, align));
        BUG_ON(va->va_start < vstart);
        BUG_ON(va->va_end > vend);

        ret = kasan_populate_vmalloc(addr, size);
        if (ret) {
                /* Undo the insertion and give the range back. */
                free_vmap_area(va);
                return ERR_PTR(ret);
        }

        return va;

overflow:
        /* First failure: purge lazily freed areas and try once more. */
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = 1;
                goto retry;
        }

        /*
         * Still no room: ask the registered notifiers to release some
         * space. If any of them reports progress through "freed", start
         * over with a new purge attempt allowed.
         */
        freed = 0;
        blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);

        if (freed > 0) {
                purged = 0;
                goto retry;
        }

        if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
                pr_warn("vmalloc_node_range for size %lu failed: Address range restricted to %#lx - %#lx\n",
                                size, vstart, vend);

        kmem_cache_free(vmap_area_cachep, va);
        return ERR_PTR(-EBUSY);
}

/*
 * Add @nb to vmap_notify_list, the blocking notifier chain that
 * alloc_vmap_area() calls when it cannot find address space even after
 * purging. The chain is invoked with action 0 and a pointer to an
 * unsigned long counter; a non-zero counter afterwards makes the
 * allocation retry.
 *
 * Returns the result of blocking_notifier_chain_register().
 */
int register_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

/*
 * Remove @nb from the vmap_notify_list notifier chain.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
        /* 32MB worth of pages for every step of the log scale. */
        const unsigned long pages_per_step = 32UL * 1024 * 1024 / PAGE_SIZE;
        unsigned int steps = fls(num_online_cpus());

        return steps * pages_per_step;
}

/*
 * Number of pages held in lazily freed areas that have not been purged yet.
 * Raised in free_vmap_area_noflush() and lowered in purge_vmap_node().
 */
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging.  There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make the pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);
/*
 * Mask of vmap nodes (bit index == node index) that have lazy areas to
 * purge. Rebuilt on every __purge_vmap_area_lazy() call.
 */
static cpumask_t purge_nodes;

/*
 * Return every area on @head to the global free tree/list, under
 * free_vmap_area_lock. Nothing is done, and the lock is not taken,
 * for an empty list.
 */
static void
reclaim_list_global(struct list_head *head)
{
        struct vmap_area *va, *next;

        if (!list_empty(head)) {
                spin_lock(&free_vmap_area_lock);
                list_for_each_entry_safe(va, next, head, list)
                        merge_or_add_vmap_area_augment(va,
                                &free_vmap_area_root, &free_vmap_area_list);
                spin_unlock(&free_vmap_area_lock);
        }
}

/*
 * Shrink the per-size pools of @vn and return the removed areas to the
 * global free space.
 *
 * With @full_decay every pool is emptied. Otherwise each pool gives up
 * about 25% of its entries (pool length >> 2), which means that pools
 * holding fewer than four entries are left untouched.
 */
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
        LIST_HEAD(decay_list);
        struct rb_root decay_root = RB_ROOT;
        struct vmap_area *va, *nva;
        unsigned long n_decay;
        int i;

        for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
                LIST_HEAD(tmp_list);

                if (list_empty(&vn->pool[i].head))
                        continue;

                /* Detach the pool, so no-one can access it. */
                spin_lock(&vn->pool_lock);
                list_replace_init(&vn->pool[i].head, &tmp_list);
                spin_unlock(&vn->pool_lock);

                if (full_decay)
                        WRITE_ONCE(vn->pool[i].len, 0);

                /* Decay a pool by ~25% out of left objects. */
                n_decay = vn->pool[i].len >> 2;

                list_for_each_entry_safe(va, nva, &tmp_list, list) {
                        /*
                         * Check the budget before taking an entry. A
                         * budget of zero (short pool) must remove
                         * nothing; decrementing first would wrap the
                         * counter and drain the whole pool.
                         */
                        if (!full_decay && !n_decay)
                                break;

                        list_del_init(&va->list);
                        merge_or_add_vmap_area(va, &decay_root, &decay_list);

                        if (!full_decay) {
                                WRITE_ONCE(vn->pool[i].len, vn->pool[i].len - 1);
                                n_decay--;
                        }
                }

                /*
                 * Attach the pool back if it has been partly decayed.
                 * Please note, it is supposed that nobody(other contexts)
                 * can populate the pool therefore a simple list replace
                 * operation takes place here.
                 */
                if (!full_decay && !list_empty(&tmp_list)) {
                        spin_lock(&vn->pool_lock);
                        list_replace_init(&tmp_list, &vn->pool[i].head);
                        spin_unlock(&vn->pool_lock);
                }
        }

        reclaim_list_global(&decay_list);
}

/*
 * Release the KASAN state of every area on the purge list of @vn, one
 * area at a time, and finish with a single KASAN_VMALLOC_TLB_FLUSH call
 * spanning the first entry's start to the last entry's end.
 *
 * The purge list must not be empty.
 */
static void
kasan_release_vmalloc_node(struct vmap_node *vn)
{
        struct list_head *purge = &vn->purge_list;
        unsigned long start, end;
        struct vmap_area *va;

        start = list_first_entry(purge, struct vmap_area, list)->va_start;
        end = list_last_entry(purge, struct vmap_area, list)->va_end;

        list_for_each_entry(va, purge, list) {
                if (!is_vmalloc_or_module_addr((void *) va->va_start))
                        continue;

                kasan_release_vmalloc(va->va_start, va->va_end,
                        va->va_start, va->va_end,
                        KASAN_VMALLOC_PAGE_RANGE);
        }

        kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
}

static void purge_vmap_node(struct work_struct *work)
{
        struct vmap_node *vn = container_of(work,
                struct vmap_node, purge_work);
        unsigned long nr_purged_pages = 0;
        struct vmap_area *va, *n_va;
        LIST_HEAD(local_list);

        if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
                kasan_release_vmalloc_node(vn);

        vn->nr_purged = 0;

        list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
                unsigned long nr = va_size(va) >> PAGE_SHIFT;
                unsigned int vn_id = decode_vn_id(va->flags);

                list_del_init(&va->list);

                nr_purged_pages += nr;
                vn->nr_purged++;

                if (is_vn_id_valid(vn_id) && !vn->skip_populate)
                        if (node_pool_add_va(vn, va))
                                continue;

                /* Go back to global. */
                list_add(&va->list, &local_list);
        }

        atomic_long_sub(nr_purged_pages, &vmap_lazy_nr);

        reclaim_list_global(&local_list);
}

/*
 * Purges all lazily-freed vmap areas.
 *
 * @start, @end:     initial bounds of the kernel range to TLB flush. They
 *                   are widened to cover the lazy areas of every node
 *                   that has some.
 * @full_pool_decay: passed to decay_va_pool_node(); when true it also
 *                   stops purged areas from being put back into the
 *                   node pools.
 *
 * The caller must hold vmap_purge_lock.
 *
 * Returns true if at least one area was purged.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end,
                bool full_pool_decay)
{
        unsigned long nr_purged_areas = 0;
        unsigned int nr_purge_helpers;
        unsigned int nr_purge_nodes;
        struct vmap_node *vn;
        int i;

        lockdep_assert_held(&vmap_purge_lock);

        /*
         * Use cpumask to mark which node has to be processed.
         */
        purge_nodes = CPU_MASK_NONE;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                INIT_LIST_HEAD(&vn->purge_list);
                vn->skip_populate = full_pool_decay;
                decay_va_pool_node(vn, full_pool_decay);

                if (RB_EMPTY_ROOT(&vn->lazy.root))
                        continue;

                /* Move the whole lazy set of this node to its purge list. */
                spin_lock(&vn->lazy.lock);
                WRITE_ONCE(vn->lazy.root.rb_node, NULL);
                list_replace_init(&vn->lazy.head, &vn->purge_list);
                spin_unlock(&vn->lazy.lock);

                /*
                 * Widen the flush range using the first and the last
                 * entry (presumably the list is kept in address order).
                 */
                start = min(start, list_first_entry(&vn->purge_list,
                        struct vmap_area, list)->va_start);

                end = max(end, list_last_entry(&vn->purge_list,
                        struct vmap_area, list)->va_end);

                cpumask_set_cpu(i, &purge_nodes);
        }

        nr_purge_nodes = cpumask_weight(&purge_nodes);
        if (nr_purge_nodes > 0) {
                flush_tlb_kernel_range(start, end);

                /* One extra worker is per a lazy_max_pages() full set minus one. */
                nr_purge_helpers = atomic_long_read(&vmap_lazy_nr) / lazy_max_pages();
                nr_purge_helpers = clamp(nr_purge_helpers, 1U, nr_purge_nodes) - 1;

                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (nr_purge_helpers > 0) {
                                /* Offload this node to a worker. */
                                INIT_WORK(&vn->purge_work, purge_vmap_node);

                                if (cpumask_test_cpu(i, cpu_online_mask))
                                        schedule_work_on(i, &vn->purge_work);
                                else
                                        schedule_work(&vn->purge_work);

                                nr_purge_helpers--;
                        } else {
                                /*
                                 * Purge in this context. A NULL func tells
                                 * the loop below that there is no work
                                 * item to wait for.
                                 */
                                vn->purge_work.func = NULL;
                                purge_vmap_node(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }

                /* Wait for the helpers and collect their counters. */
                for_each_cpu(i, &purge_nodes) {
                        vn = &vmap_nodes[i];

                        if (vn->purge_work.func) {
                                flush_work(&vn->purge_work);
                                nr_purged_areas += vn->nr_purged;
                        }
                }
        }

        trace_purge_vmap_area_lazy(start, end, nr_purged_areas);
        return nr_purged_areas > 0;
}

/*
 * Reclaim vmap areas: purge the fragmented per-CPU vmap blocks, then purge
 * all lazily freed areas with a full decay of the node pools. Both steps
 * run under vmap_purge_lock.
 */
static void reclaim_and_purge_vmap_areas(void)

{
        mutex_lock(&vmap_purge_lock);
        purge_fragmented_blocks_allcpus();
        __purge_vmap_area_lazy(ULONG_MAX, 0, true);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Work handler (presumably the one behind drain_vmap_work) that purges the
 * lazily freed areas under vmap_purge_lock. Unlike
 * reclaim_and_purge_vmap_areas() it requests only a partial decay of the
 * node pools and leaves the per-CPU vmap blocks alone.
 */
static void drain_vmap_area_work(struct work_struct *work)
{
        mutex_lock(&vmap_purge_lock);
        __purge_vmap_area_lazy(ULONG_MAX, 0, false);
        mutex_unlock(&vmap_purge_lock);
}

/*
 * Queue a vmap area for lazy freeing. The caller guarantees that the area
 * has already been unmapped and unlinked, and that flush_cache_vunmap()
 * has been called for the correct range.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
        unsigned int vn_id = decode_vn_id(va->flags);
        unsigned long va_start = va->va_start;
        unsigned long nr_lazy_max = lazy_max_pages();
        unsigned long nr_lazy;
        struct vmap_node *vn;

        if (WARN_ON_ONCE(!list_empty(&va->list)))
                return;

        nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
                                         &vmap_lazy_nr);

        /*
         * An area that was requested by a certain node goes back to
         * that node, so it can end up in its pool for later reuse.
         * Otherwise the node is derived from the address.
         */
        if (is_vn_id_valid(vn_id))
                vn = id_to_node(vn_id);
        else
                vn = addr_to_node(va->va_start);

        spin_lock(&vn->lazy.lock);
        insert_vmap_area(va, &vn->lazy.root, &vn->lazy.head);
        spin_unlock(&vn->lazy.lock);

        /* From here on va may be freed at any time: use the saved start. */
        trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max);

        if (unlikely(nr_lazy > nr_lazy_max))
                schedule_work(&drain_vmap_work);
}

/*
 * Unmap a vmap area and queue it for lazy freeing.
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
        unsigned long start = va->va_start;
        unsigned long end = va->va_end;

        flush_cache_vunmap(start, end);
        vunmap_range_noflush(start, end);

        /* With debug_pagealloc enabled the TLB is flushed right away. */
        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(start, end);

        free_vmap_area_noflush(va);
}

/*
 * Look up the busy vmap area that contains @addr.
 *
 * Returns NULL if there is none or if vmap is not initialized yet.
 */
struct vmap_area *find_vmap_area(unsigned long addr)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int first, id;

        if (unlikely(!vmap_initialized))
                return NULL;

        /*
         * addr_to_node_id(addr) converts an address to the index of the
         * node where a VA is located. If a VA spans several zones and the
         * passed addr is not the same as va->va_start, which is not
         * common, extra nodes may have to be scanned. See an example:
         *
         *      <----va---->
         * -|-----|-----|-----|-----|-
         *     1     2     0     1
         *
         * The VA resides in node 1 whereas it spans 1, 2 and 0. If the
         * passed addr is within the 2 or 0 zones, extra work is needed.
         * So start at the node addr maps to and walk all nodes once.
         */
        first = addr_to_node_id(addr);
        id = first;

        do {
                vn = &vmap_nodes[id];

                spin_lock(&vn->busy.lock);
                va = __find_vmap_area(addr, &vn->busy.root);
                spin_unlock(&vn->busy.lock);

                if (va)
                        return va;

                id = (id + 1) % nr_vmap_nodes;
        } while (id != first);

        return NULL;
}

/*
 * Look up the busy vmap area that contains @addr and, if found, unlink it
 * from the busy tree of its node under the same lock hold.
 *
 * Returns the unlinked area, or NULL if no node holds a match.
 */
static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int first, id;

        /*
         * All nodes are walked once, starting at the one addr maps to.
         * Check the comment in find_vmap_area() about the loop.
         */
        first = addr_to_node_id(addr);
        id = first;

        do {
                vn = &vmap_nodes[id];

                spin_lock(&vn->busy.lock);
                va = __find_vmap_area(addr, &vn->busy.root);
                if (va)
                        unlink_va(va, &vn->busy.root);
                spin_unlock(&vn->busy.lock);

                if (va)
                        return va;

                id = (id + 1) % nr_vmap_nodes;
        } while (id != first);

        return NULL;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE                (VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea): 128MB on 32-bit, 128GB otherwise.
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE                (128UL*1024*1024)
#else
#define VMALLOC_SPACE                (128UL*1024*1024*1024)
#endif

/*
 * VMAP_MAX_ALLOC      - largest request, in pages, served from a vmap block.
 * VMAP_BBMAP_BITS_MAX - upper bound on the number of pages per block.
 * VMAP_BBMAP_BITS_MIN - lower bound on the number of pages per block.
 * VMAP_BBMAP_BITS     - pages per block: the guessed vmalloc space divided
 *                       among the CPUs (NR_CPUS rounded up to a power of
 *                       two) and by 16, clamped to the bounds above.
 * VMAP_BLOCK_SIZE     - size of one vmap block in bytes.
 */
#define VMALLOC_PAGES                (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC                BITS_PER_LONG        /* 256K with 4K pages on 64-bit */
#define VMAP_BBMAP_BITS_MAX        1024        /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN        (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)                ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)                ((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS                \
                VMAP_MIN(VMAP_BBMAP_BITS_MAX,        \
                VMAP_MAX(VMAP_BBMAP_BITS_MIN,        \
                        VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE                (VMAP_BBMAP_BITS * PAGE_SIZE)

/*
 * Purge threshold to prevent overeager purging of fragmented blocks for
 * regular operations: Purge if vb->free is less than 1/4 of the capacity.
 */
#define VMAP_PURGE_THRESHOLD        (VMAP_BBMAP_BITS / 4)

/* Flags kept in the low bits of vmap_area::flags. */
#define VMAP_RAM                0x1 /* indicates a vm_map_ram area */
#define VMAP_BLOCK                0x2 /* marks the vmap_block sub-type */
#define VMAP_FLAGS_MASK                0x3 /* VMAP_RAM | VMAP_BLOCK */

/*
 * Per-CPU state of the vmap block allocator. Besides queueing blocks for
 * allocation, each instance also serves as one bucket of the
 * address -> vmap_block hash, see addr_to_vb_xa().
 */
struct vmap_block_queue {
        /* Taken around additions to and removals from @free. */
        spinlock_t lock;
        /* Blocks to allocate from, linked through vmap_block::free_list. */
        struct list_head free;

        /*
         * Blocks hashed to this bucket, indexed by addr_to_vb_idx().
         *
         * An xarray requires extra memory to be allocated
         * dynamically. If that is an issue, an rb-tree can be
         * used instead.
         */
        struct xarray vmap_blocks;
};

/*
 * One vmap block: a VMAP_BLOCK_SIZE chunk of KVA that is handed out in
 * power-of-two runs of pages by vb_alloc() and returned by vb_free().
 */
struct vmap_block {
        /* Held while the accounting fields and bitmap below are updated. */
        spinlock_t lock;
        /* The vmap area backing this block. */
        struct vmap_area *va;
        /*
         * Page counters: @free is what has not been handed out yet,
         * @dirty is what has been returned through vb_free().
         */
        unsigned long free, dirty;
        /* One bit per page: set on allocation, cleared by vb_free(). */
        DECLARE_BITMAP(used_map, VMAP_BBMAP_BITS);
        /* Page offsets bounding the dirty range that is not TLB flushed yet. */
        unsigned long dirty_min, dirty_max; /*< dirty range */
        /* Link in vmap_block_queue::free. */
        struct list_head free_list;
        /* Used by free_vmap_block() to free the block via kfree_rcu(). */
        struct rcu_head rcu_head;
        /* Link in a temporary list of blocks that are being purged. */
        struct list_head purge;
        /* CPU whose vmap_block_queue the block was put on at creation. */
        unsigned int cpu;
};

/*
 * Queue of free and dirty vmap blocks, for allocation and flushing purposes.
 * The same per-CPU array doubles as the vmap_block hash, see addr_to_vb_xa().
 */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * For fast access to the "vmap_block" associated with a specific
 * address, we use a hash.
 *
 * The per-cpu vmap_block_queue is used in two ways: to serialize
 * access to the free block chains among CPUs (alloc path), and as
 * a vmap_block hash (alloc/free paths). In other words we overload
 * it, since we already have the per-cpu array, which can be used as
 * a hash table. When it is used as a hash, the 'cpu' passed to
 * per_cpu() is not actually a CPU but rather a hash index.
 *
 * The hash function is addr_to_vb_xa(), which hashes any address
 * to the specific index (in the hash) it belongs to. It then uses
 * the per_cpu() macro to access the array with the generated index.
 *
 * An example:
 *
 *  CPU_1  CPU_2  CPU_0
 *    |      |      |
 *    V      V      V
 * 0     10     20     30     40     50     60
 * |------|------|------|------|------|------|...<vmap address space>
 *   CPU0   CPU1   CPU2   CPU0   CPU1   CPU2
 *
 * - CPU_1 invokes vm_unmap_ram(6); 6 belongs to the CPU0 zone, thus
 *   it accesses: CPU0/INDEX0 -> vmap_blocks -> xa_lock;
 *
 * - CPU_2 invokes vm_unmap_ram(11); 11 belongs to the CPU1 zone, thus
 *   it accesses: CPU1/INDEX1 -> vmap_blocks -> xa_lock;
 *
 * - CPU_0 invokes vm_unmap_ram(20); 20 belongs to the CPU2 zone, thus
 *   it accesses: CPU2/INDEX2 -> vmap_blocks -> xa_lock.
 *
 * This technique almost always avoids lock contention on insert/remove,
 * however xarray spinlocks protect against any contention that remains.
 */
static struct xarray *
addr_to_vb_xa(unsigned long addr)
{
        int slot = (addr / VMAP_BLOCK_SIZE) % nr_cpu_ids;

        /*
         * The slot may name a CPU that is not possible; move on to the
         * next possible one in that case. Please note, nr_cpu_ids
         * points on a highest set possible bit, so cpumask_next() is
         * never invoked for the last slot, which is nr_cpu_ids - 1.
         */
        if (!cpu_possible(slot))
                slot = cpumask_next(slot, cpu_possible_mask);

        return &per_cpu_ptr(&vmap_block_queue, slot)->vmap_blocks;
}

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */

/*
 * Convert an address to the index of the vmap block containing it, counted
 * from VMALLOC_START rounded down to a VMAP_BLOCK_SIZE boundary.
 */
static unsigned long addr_to_vb_idx(unsigned long addr)
{
        unsigned long base = VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);

        return (addr - base) / VMAP_BLOCK_SIZE;
}

/*
 * Return the address that lies @pages_off pages past @va_start. The result
 * must fall into the same vmap block as @va_start.
 */
static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
{
        unsigned long vaddr = va_start + (pages_off << PAGE_SHIFT);

        BUG_ON(addr_to_vb_idx(vaddr) != addr_to_vb_idx(va_start));

        return (void *)vaddr;
}

/**
 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it
 * @order:    2^order pages are occupied in the newly allocated block; that
 *            number must be smaller than VMAP_BBMAP_BITS
 * @gfp_mask: flags for the page level allocator
 *
 * The occupied pages are the first ones of the block. The block is added
 * to the xarray selected by its address and to the free list of the
 * current CPU's vmap_block_queue.
 *
 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
 */
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        struct xarray *xa;
        unsigned long vb_idx;
        int node, err;
        void *vaddr;

        node = numa_node_id();

        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        /* The backing area is block sized and block aligned. */
        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask,
                                        VMAP_RAM|VMAP_BLOCK, NULL);
        if (IS_ERR(va)) {
                kfree(vb);
                return ERR_CAST(va);
        }

        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
        /* At least something should be left free */
        BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
        bitmap_zero(vb->used_map, VMAP_BBMAP_BITS);
        vb->free = VMAP_BBMAP_BITS - (1UL << order);
        vb->dirty = 0;
        /* min > max encodes an empty dirty range. */
        vb->dirty_min = VMAP_BBMAP_BITS;
        vb->dirty_max = 0;
        /* The caller's 2^order pages sit at the very start of the block. */
        bitmap_set(vb->used_map, 0, (1UL << order));
        INIT_LIST_HEAD(&vb->free_list);
        vb->cpu = raw_smp_processor_id();

        /* The block is fully set up; make it findable by address. */
        xa = addr_to_vb_xa(va->va_start);
        vb_idx = addr_to_vb_idx(va->va_start);
        err = xa_insert(xa, vb_idx, vb, gfp_mask);
        if (err) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }
        /*
         * list_add_tail_rcu could happened in another core
         * rather than vb->cpu due to task migration, which
         * is safe as list_add_tail_rcu will ensure the list's
         * integrity together with list_for_each_rcu from read
         * side.
         */
        vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
        spin_lock(&vbq->lock);
        list_add_tail_rcu(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);

        return vaddr;
}

static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_node *vn;
        struct vmap_block *tmp;
        struct xarray *xa;

        xa = addr_to_vb_xa(vb->va->va_start);
        tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);

        vn = addr_to_node(vb->va->va_start);
        spin_lock(&vn->busy.lock);
        unlink_va(vb->va, &vn->busy.root);
        spin_unlock(&vn->busy.lock);

        free_vmap_area_noflush(vb->va);
        kfree_rcu(vb, rcu_head);
}

/*
 * Move @vb to @purge_list if it qualifies for purging. The visible caller
 * holds vb->lock.
 *
 * A block qualifies when all of its pages are either free or dirty, it is
 * not entirely dirty, and either @force_purge is set or fewer than
 * VMAP_PURGE_THRESHOLD pages are free.
 *
 * Returns true if the block was queued on @purge_list.
 */
static bool purge_fragmented_block(struct vmap_block *vb,
                struct list_head *purge_list, bool force_purge)
{
        struct vmap_block_queue *vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);

        if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
            vb->dirty == VMAP_BBMAP_BITS)
                return false;

        /* Don't overeagerly purge usable blocks unless requested */
        if (!force_purge && vb->free >= VMAP_PURGE_THRESHOLD)
                return false;

        /* No free pages: further allocations skip this block. */
        WRITE_ONCE(vb->free, 0);
        /* Fully dirty: the block will not be picked for purging again. */
        WRITE_ONCE(vb->dirty, VMAP_BBMAP_BITS);
        vb->dirty_min = 0;
        vb->dirty_max = VMAP_BBMAP_BITS;

        spin_lock(&vbq->lock);
        list_del_rcu(&vb->free_list);
        spin_unlock(&vbq->lock);

        list_add_tail(&vb->purge, purge_list);

        return true;
}

static void free_purged_blocks(struct list_head *purge_list)
{
        struct vmap_block *vb, *n_vb;

        list_for_each_entry_safe(vb, n_vb, purge_list, purge) {
                list_del(&vb->purge);
                free_vmap_block(vb);
        }
}

/*
 * Force-purge the purgeable blocks on the free list of @cpu's queue. The
 * blocks are collected under RCU and freed after the walk.
 */
static void purge_fragmented_blocks(int cpu)
{
        struct vmap_block_queue *vbq = per_cpu_ptr(&vmap_block_queue, cpu);
        struct vmap_block *vb;
        LIST_HEAD(purge);

        rcu_read_lock();
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long nr_free = READ_ONCE(vb->free);
                unsigned long nr_dirty = READ_ONCE(vb->dirty);

                /*
                 * Lockless pre-check: skip blocks that still have pages
                 * which are neither free nor dirty, and blocks that are
                 * entirely dirty. It is repeated under vb->lock.
                 */
                if (nr_free + nr_dirty != VMAP_BBMAP_BITS ||
                    nr_dirty == VMAP_BBMAP_BITS)
                        continue;

                spin_lock(&vb->lock);
                purge_fragmented_block(vb, &purge, true);
                spin_unlock(&vb->lock);
        }
        rcu_read_unlock();

        free_purged_blocks(&purge);
}

/*
 * Run purge_fragmented_blocks() on the vmap block queue of every
 * possible CPU.
 */
static void purge_fragmented_blocks_allcpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                purge_fragmented_blocks(cpu);
}

/*
 * Allocate @size bytes of KVA from a per-CPU vmap block. @size must be a
 * non-zero multiple of PAGE_SIZE, at most VMAP_MAX_ALLOC pages, and is
 * rounded up to 2^order pages.
 *
 * The free list of the current CPU's queue is searched first. A new block
 * is allocated if no block on it has enough room.
 *
 * Returns the virtual address or an ERR_PTR().
 */
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        unsigned long nr_pages;
        unsigned int order;
        void *vaddr = NULL;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        if (WARN_ON(size == 0)) {
                /*
                 * A zero sized allocation is not what the caller wants,
                 * and get_order(0) returns a funny result. Just warn
                 * and terminate early.
                 */
                return ERR_PTR(-EINVAL);
        }
        order = get_order(size);
        nr_pages = 1UL << order;

        rcu_read_lock();
        vbq = raw_cpu_ptr(&vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                unsigned long pages_off;

                /* Cheap unlocked test first ... */
                if (READ_ONCE(vb->free) < nr_pages)
                        continue;

                /* ... which is repeated once the block is locked. */
                spin_lock(&vb->lock);
                if (vb->free >= nr_pages) {
                        /* Take the pages from the front of the free tail. */
                        pages_off = VMAP_BBMAP_BITS - vb->free;
                        vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
                        WRITE_ONCE(vb->free, vb->free - nr_pages);
                        bitmap_set(vb->used_map, pages_off, nr_pages);

                        /* An exhausted block leaves the free list. */
                        if (vb->free == 0) {
                                spin_lock(&vbq->lock);
                                list_del_rcu(&vb->free_list);
                                spin_unlock(&vbq->lock);
                        }

                        spin_unlock(&vb->lock);
                        break;
                }
                spin_unlock(&vb->lock);
        }

        rcu_read_unlock();

        /* Allocate new block if nothing was found */
        if (!vaddr)
                vaddr = new_vmap_block(order, gfp_mask);

        return vaddr;
}

/*
 * vb_free - give a range back to the vmap block it lives in
 * @addr: start address of the range
 * @size: size in bytes. Must not have a sub-page remainder and must not
 *        exceed PAGE_SIZE * VMAP_MAX_ALLOC (both are BUG_ON conditions).
 *
 * Clears (1 << get_order(size)) bits in the block's used_map, unmaps the
 * range without flushing the TLB (an immediate flush_tlb_kernel_range() is
 * issued only if debug_pagealloc_enabled_static() is true) and accounts the
 * pages as dirty. When all VMAP_BBMAP_BITS pages of the block are dirty the
 * block is passed to free_vmap_block().
 */
static void vb_free(unsigned long addr, unsigned long size)
{
        unsigned long offset;
        unsigned int order;
        struct vmap_block *vb;
        struct xarray *xa;

        BUG_ON(offset_in_page(size));
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        flush_cache_vunmap(addr, addr + size);

        order = get_order(size);
        /* Page index of @addr inside its VMAP_BLOCK_SIZE-sized block. */
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;

        /*
         * NOTE(review): the result of xa_load() is dereferenced without a
         * NULL check, so @addr has to belong to a block that is still
         * present in the xarray.
         */
        xa = addr_to_vb_xa(addr);
        vb = xa_load(xa, addr_to_vb_idx(addr));

        spin_lock(&vb->lock);
        bitmap_clear(vb->used_map, offset, (1UL << order));
        spin_unlock(&vb->lock);

        /* The unmap itself runs with vb->lock dropped. */
        vunmap_range_noflush(addr, addr + size);

        if (debug_pagealloc_enabled_static())
                flush_tlb_kernel_range(addr, addr + size);

        spin_lock(&vb->lock);

        /* Expand the not yet TLB flushed dirty range */
        vb->dirty_min = min(vb->dirty_min, offset);
        vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));

        WRITE_ONCE(vb->dirty, vb->dirty + (1UL << order));
        if (vb->dirty == VMAP_BBMAP_BITS) {
                /* A fully dirty block must not have free pages left. */
                BUG_ON(vb->free);
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
                spin_unlock(&vb->lock);
}

/*
 * _vm_unmap_aliases - collect dirty vmap block ranges and flush them
 * @start: lower bound of a range the caller already wants flushed
 * @end:   upper bound of that range
 * @flush: non-zero if the caller's range has to be flushed even when no
 *         dirty vmap block is found
 *
 * Does nothing until vmap_initialized is set. Otherwise, with
 * vmap_purge_lock held, every block in every possible CPU's vmap_blocks
 * xarray is visited under its own lock: the block is either moved to a local
 * purge list by purge_fragmented_block(), or, if it has a pending dirty
 * range and is not completely dirty, [@start, @end) is widened to cover that
 * range and the block's dirty range is reset. Purged blocks are then handed
 * to free_purged_blocks().
 *
 * Finally __purge_vmap_area_lazy() is called for the accumulated range; an
 * explicit flush_tlb_kernel_range() is issued only when that call returns
 * false and a flush was requested or became necessary.
 */
static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
        LIST_HEAD(purge_list);
        int cpu;

        if (unlikely(!vmap_initialized))
                return;

        mutex_lock(&vmap_purge_lock);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *vb;
                unsigned long idx;

                rcu_read_lock();
                xa_for_each(&vbq->vmap_blocks, idx, vb) {
                        spin_lock(&vb->lock);

                        /*
                         * Try to purge a fragmented block first. If it's
                         * not purgeable, check whether there is dirty
                         * space to be flushed.
                         */
                        if (!purge_fragmented_block(vb, &purge_list, false) &&
                            vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
                                unsigned long va_start = vb->va->va_start;
                                unsigned long s, e;

                                /* Convert dirty page indices to addresses. */
                                s = va_start + (vb->dirty_min << PAGE_SHIFT);
                                e = va_start + (vb->dirty_max << PAGE_SHIFT);

                                start = min(s, start);
                                end   = max(e, end);

                                /* Prevent that this is flushed again */
                                vb->dirty_min = VMAP_BBMAP_BITS;
                                vb->dirty_max = 0;

                                flush = 1;
                        }
                        spin_unlock(&vb->lock);
                }
                rcu_read_unlock();
        }
        free_purged_blocks(&purge_list);

        /*
         * NOTE(review): presumably __purge_vmap_area_lazy() has already
         * flushed the range when it returns true - confirm.
         */
        if (!__purge_vmap_area_lazy(start, end, false) && flush)
                flush_tlb_kernel_range(start, end);
        mutex_unlock(&vmap_purge_lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
        /*
         * Start with an empty range (start above end) and no flush
         * requested; _vm_unmap_aliases() widens the range and turns the
         * flush on by itself when it finds dirty vmap blocks.
         */
        _vm_unmap_aliases(ULONG_MAX, 0, 0);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);

/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
        unsigned long addr = (unsigned long)kasan_reset_tag(mem);
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        struct vmap_area *va;

        might_sleep();

        /* The address must be a page-aligned vmalloc-range address. */
        BUG_ON(!addr);
        BUG_ON(addr < VMALLOC_START);
        BUG_ON(addr > VMALLOC_END);
        BUG_ON(!PAGE_ALIGNED(addr));

        kasan_poison_vmalloc(mem, size);

        if (likely(count <= VMAP_MAX_ALLOC)) {
                /* Small mappings are released through the vmap block path. */
                debug_check_no_locks_freed(mem, size);
                vb_free(addr, size);
        } else {
                /* Larger mappings have a vmap_area of their own. */
                va = find_unlink_vmap_area(addr);
                if (!WARN_ON_ONCE(!va)) {
                        debug_check_no_locks_freed((void *)va->va_start,
                                                   va_size(va));
                        free_unmap_vmap_area(va);
                }
        }
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 *
 * For up to VMAP_MAX_ALLOC pages this function can be faster than vmap().
 * However, mixing long-lived and short-lived objects with vm_map_ram() can
 * consume a lot of address space through fragmentation (especially on a
 * 32-bit machine) and may eventually lead to allocation failures.  Please
 * use this function for short-lived objects only.
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
        unsigned long size = (unsigned long)count << PAGE_SHIFT;
        unsigned long addr;
        void *mem;
        int err;

        /* Step 1: obtain address space, from a vmap block if small enough. */
        if (likely(count <= VMAP_MAX_ALLOC)) {
                mem = vb_alloc(size, GFP_KERNEL);
                if (IS_ERR(mem))
                        return NULL;

                addr = (unsigned long)mem;
        } else {
                struct vmap_area *va = alloc_vmap_area(size, PAGE_SIZE,
                                                       VMALLOC_START,
                                                       VMALLOC_END,
                                                       node, GFP_KERNEL,
                                                       VMAP_RAM, NULL);

                if (IS_ERR(va))
                        return NULL;

                addr = va->va_start;
                mem = (void *)addr;
        }

        /* Step 2: map the pages; undo the reservation if that fails. */
        err = vmap_pages_range(addr, addr + size, PAGE_KERNEL,
                               pages, PAGE_SHIFT);
        if (err < 0) {
                vm_unmap_ram(mem, count);
                return NULL;
        }

        /*
         * Step 3: the pages are mapped now, so mark them as accessible.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        return kasan_unpoison_vmalloc(mem, size, KASAN_VMALLOC_PROT_NORMAL);
}
EXPORT_SYMBOL(vm_map_ram);

/*
 * Singly linked list (through vm_struct::next) of the areas registered by
 * vm_area_add_early() and vm_area_register_early(). Both helpers insert in
 * ascending address order and may only be used while vmap_initialized is
 * still false.
 */
static struct vm_struct *vmlist __initdata;

/*
 * Return the page order stored in @vm. The field is only read when
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC is enabled; otherwise the order is always 0.
 */
static inline unsigned int vm_area_page_order(struct vm_struct *vm)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        return vm->page_order;
#else
        return 0;
#endif
}

/*
 * Non-static wrapper around vm_area_page_order(): returns the page order
 * of @vm (always 0 without CONFIG_HAVE_ARCH_HUGE_VMALLOC).
 */
unsigned int get_vm_area_page_order(struct vm_struct *vm)
{
        return vm_area_page_order(vm);
}

/*
 * Record @order as the page order of @vm. Without
 * CONFIG_HAVE_ARCH_HUGE_VMALLOC nothing is stored and any order other
 * than 0 is a BUG.
 */
static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
{
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
        vm->page_order = order;
#else
        BUG_ON(order != 0);
#endif
}

/**
 * vm_area_add_early - add vmap area early during boot
 * @vm: vm_struct to add
 *
 * This function is used to add fixed kernel vm area to vmlist before
 * vmalloc_init() is called.  @vm->addr, @vm->size, and @vm->flags
 * should contain proper values and the other fields should be zero.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_add_early(struct vm_struct *vm)
{
        struct vm_struct **link = &vmlist;
        struct vm_struct *cur;

        BUG_ON(vmap_initialized);

        /*
         * Walk the address-sorted list until the first entry that does not
         * start below @vm. Any overlap with a neighbour is a BUG.
         */
        while ((cur = *link) != NULL) {
                if (cur->addr >= vm->addr) {
                        BUG_ON(cur->addr < vm->addr + vm->size);
                        break;
                }

                BUG_ON(cur->addr + cur->size > vm->addr);
                link = &cur->next;
        }

        /* Splice @vm in front of whatever *link currently points to. */
        vm->next = *link;
        *link = vm;
}

/**
 * vm_area_register_early - register vmap area early during boot
 * @vm: vm_struct to register
 * @align: requested alignment
 *
 * This function is used to register kernel vm area before
 * vmalloc_init() is called.  @vm->size and @vm->flags should contain
 * proper values on entry and other fields should be zero.  On return,
 * vm->addr contains the allocated address.
 *
 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
 */
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
        unsigned long candidate = ALIGN(VMALLOC_START, align);
        struct vm_struct **link = &vmlist;
        struct vm_struct *entry;

        BUG_ON(vmap_initialized);

        /*
         * First fit: stop at the first entry that leaves at least vm->size
         * bytes between the candidate address and its start; otherwise move
         * the candidate to the aligned end of that entry.
         */
        while ((entry = *link) != NULL) {
                unsigned long entry_start = (unsigned long)entry->addr;

                if (entry_start - candidate >= vm->size)
                        break;

                candidate = ALIGN(entry_start + entry->size, align);
                link = &entry->next;
        }

        /* The chosen range must still end at or below VMALLOC_END. */
        BUG_ON(candidate > VMALLOC_END - vm->size);

        vm->addr = (void *)candidate;
        vm->next = *link;
        *link = vm;
        kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

/*
 * Clear VM_UNINITIALIZED in @vm->flags. A write barrier is issued first so
 * that earlier stores to @vm are ordered before the flag update.
 */
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
        /*
         * Before removing VM_UNINITIALIZED,
         * we should make sure that vm has proper values.
         * Pair with smp_rmb() in show_numa_info().
         */
        smp_wmb();
        vm->flags &= ~VM_UNINITIALIZED;
}

/*
 * Allocate a vm_struct and reserve address space for it in [@start, @end).
 *
 * @size is rounded up to a multiple of (1 << @shift); unless VM_NO_GUARD is
 * set in @flags one extra page is added to the reservation. For VM_IOREMAP
 * the alignment is derived from the size, clamped between PAGE_SHIFT and
 * IOREMAP_MAX_ORDER. Must not be called from interrupt context.
 *
 * Returns the new area, or NULL if the rounded size is zero or either
 * allocation fails.
 */
struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long align, unsigned long shift, unsigned long flags,
                unsigned long start, unsigned long end, int node,
                gfp_t gfp_mask, const void *caller)
{
        /* Size as passed in, before any rounding or guard page. */
        const unsigned long caller_size = size;
        struct vm_struct *vm;
        struct vmap_area *va;

        BUG_ON(in_interrupt());

        size = ALIGN(size, 1ul << shift);
        if (unlikely(!size))
                return NULL;

        if (flags & VM_IOREMAP) {
                int order = clamp_t(int, get_count_order_long(size),
                                    PAGE_SHIFT, IOREMAP_MAX_ORDER);

                align = 1ul << order;
        }

        vm = kzalloc_node(sizeof(*vm), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vm))
                return NULL;

        vm->flags = flags;
        vm->caller = caller;

        if (!(flags & VM_NO_GUARD))
                size += PAGE_SIZE;

        va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, vm);
        if (IS_ERR(va)) {
                kfree(vm);
                return NULL;
        }

        /*
         * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
         * best-effort approach, as they can be mapped outside of vmalloc code.
         * For VM_ALLOC mappings, the pages are marked as accessible after
         * getting mapped in __vmalloc_node_range().
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        if (!(flags & VM_ALLOC))
                vm->addr = kasan_unpoison_vmalloc(vm->addr, caller_size,
                                                  KASAN_VMALLOC_PROT_NORMAL);

        return vm;
}

/*
 * Reserve a vm area of @size bytes inside [@start, @end) on behalf of
 * @caller. Calls __get_vm_area_node() with alignment 1, PAGE_SHIFT
 * granularity, NUMA_NO_NODE and GFP_KERNEL.
 *
 * Returns the area descriptor or NULL on failure.
 */
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       const void *caller)
{
        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size:         size of the area
 * @flags:         %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search for an area of @size in the kernel virtual mapping area
 * (VMALLOC_START to VMALLOC_END) and reserve it for our purposes.  The
 * return address of the immediate caller is recorded as the area's caller.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL,
                                  __builtin_return_address(0));
}

/*
 * Same as get_vm_area(), except that the caller address stored in the area
 * is supplied explicitly through @caller.
 *
 * Returns the area descriptor or NULL on failure.
 */
struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                const void *caller)
{
        return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
                                  VMALLOC_START, VMALLOC_END,
                                  NUMA_NO_NODE, GFP_KERNEL, caller);
}

/**
 * find_vm_area - find a continuous kernel virtual area
 * @addr:          base address
 *
 * Search for the kernel VM area starting at @addr, and return it.
 * It is up to the caller to do all required locking to keep the returned
 * pointer valid.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *find_vm_area(const void *addr)
{
        struct vmap_area *va = find_vmap_area((unsigned long)addr);

        /* No vmap_area for @addr means there is no vm_struct either. */
        return va ? va->vm : NULL;
}

/**
 * remove_vm_area - find and remove a continuous kernel virtual area
 * @addr:            base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 *
 * Return: the area descriptor on success or %NULL on failure.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
        struct vm_struct *area;
        struct vmap_area *va;

        might_sleep();

        if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
                        addr))
                return NULL;

        /* Look the address up and unlink its vmap_area in one step. */
        va = find_unlink_vmap_area((unsigned long)addr);
        if (!va)
                return NULL;

        area = va->vm;
        if (!area)
                return NULL;

        /* Debug and KASAN bookkeeping happens before the area is unmapped. */
        debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
        debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
        kasan_free_module_shadow(area);
        kasan_poison_vmalloc(area->addr, get_vm_area_size(area));

        free_unmap_vmap_area(va);

        return area;
}

/*
 * Call @set_direct_map on every page of @area for which page_address()
 * returns a non-NULL address.
 */
static inline void set_area_direct_map(const struct vm_struct *area,
                                       int (*set_direct_map)(struct page *page))
{
        int i;

        /* HUGE_VMALLOC passes small pages to set_direct_map */
        for (i = 0; i < area->nr_pages; i++) {
                if (!page_address(area->pages[i]))
                        continue;

                set_direct_map(area->pages[i]);
        }
}

/*
 * Flush the vm mapping of @area and put the direct map entries of its pages
 * back to their default state.
 */
static void vm_reset_perms(struct vm_struct *area)
{
        unsigned int page_order = vm_area_page_order(area);
        /* Bytes covered by one allocation unit of the area. */
        unsigned long chunk = PAGE_SIZE << page_order;
        unsigned long lo = ULONG_MAX, hi = 0;
        int need_flush = 0;
        int i;

        /*
         * Compute the span of direct-map addresses backing the area so that
         * the flush done by _vm_unmap_aliases() covers the direct map too.
         */
        for (i = 0; i < area->nr_pages; i += 1U << page_order) {
                unsigned long addr = (unsigned long)page_address(area->pages[i]);

                if (!addr)
                        continue;

                lo = min(addr, lo);
                hi = max(addr + chunk, hi);
                need_flush = 1;
        }

        /*
         * Make the direct map entries invalid first so that nothing can be
         * cached by accesses after the TLB flush, then flush, and finally
         * restore the default direct map permissions.
         */
        set_area_direct_map(area, set_direct_map_invalid_noflush);
        _vm_unmap_aliases(lo, hi, need_flush);
        set_area_direct_map(area, set_direct_map_default_noflush);
}

/*
 * Work handler: detach everything queued on the vfree_deferred list that
 * embeds @w and vfree() each entry.
 */
static void delayed_vfree_work(struct work_struct *w)
{
        struct vfree_deferred *deferred = container_of(w, struct vfree_deferred, wq);
        struct llist_node *pending = llist_del_all(&deferred->list);
        struct llist_node *node, *next;

        /* The _safe variant reads the next pointer before vfree() runs. */
        llist_for_each_safe(node, next, pending)
                vfree(node);
}

/**
 * vfree_atomic - release memory allocated by vmalloc()
 * @addr:          memory base address
 *
 * This one is just like vfree() but can be called in any atomic context
 * except NMIs.
 */
void vfree_atomic(const void *addr)
{
        struct vfree_deferred *deferred = raw_cpu_ptr(&vfree_deferred);

        BUG_ON(in_nmi());
        kmemleak_free(addr);

        if (!addr)
                return;

        /*
         * Use raw_cpu_ptr() because this can be called from preemptible
         * context. Preemption is absolutely fine here, because the llist_add()
         * implementation is lockless, so it works even if we are adding to
         * another cpu's list. schedule_work() should be fine with this too.
         *
         * The work item is scheduled only when llist_add() returns true.
         */
        if (llist_add((struct llist_node *)addr, &deferred->list))
                schedule_work(&deferred->wq);
}

/**
 * vfree - Release memory allocated by vmalloc()
 * @addr:  Memory base address
 *
 * Free the virtually continuous memory area starting at @addr, as obtained
 * from one of the vmalloc() family of APIs.  This will usually also free the
 * physical memory underlying the virtual allocation, but that memory is
 * reference counted, so it will not be freed until the last user goes away.
 *
 * If @addr is NULL, no operation is performed.
 *
 * Context:
 * May sleep if called *not* from interrupt context.
 * Must not be called in NMI context (strictly speaking, it could be
 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 * conventions for vfree() arch-dependent would be a really bad idea).
 */
void vfree(const void *addr)
{
        struct vm_struct *vm;
        int i;

        /* In interrupt context the free is deferred via vfree_atomic(). */
        if (unlikely(in_interrupt())) {
                vfree_atomic(addr);
                return;
        }

        BUG_ON(in_nmi());
        kmemleak_free(addr);
        might_sleep();

        if (!addr)
                return;

        /* Unlink the area; from here on this function owns @vm. */
        vm = remove_vm_area(addr);
        if (unlikely(!vm)) {
                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS))
                vm_reset_perms(vm);
        /* Drop one reference on every page recorded in the area. */
        for (i = 0; i < vm->nr_pages; i++) {
                struct page *page = vm->pages[i];

                BUG_ON(!page);
                /* MEMCG_VMALLOC is not adjusted for VM_MAP_PUT_PAGES areas. */
                if (!(vm->flags & VM_MAP_PUT_PAGES))
                        mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
                /*
                 * High-order allocs for huge vmallocs are split, so
                 * can be freed as an array of order-0 allocations
                 */
                __free_page(page);
                cond_resched();
        }
        /* The global counter is likewise left alone for VM_MAP_PUT_PAGES. */
        if (!(vm->flags & VM_MAP_PUT_PAGES))
                atomic_long_sub(vm->nr_pages, &nr_vmalloc_pages);
        /* Finally release the page pointer array and the descriptor. */
        kvfree(vm->pages);
        kfree(vm);
}
EXPORT_SYMBOL(vfree);

/**
 * vunmap - release virtual mapping obtained by vmap()
 * @addr:   memory base address
 *
 * Free the virtually contiguous memory area starting at @addr,
 * which was created from the page array passed to vmap().
 *
 * Must not be called in interrupt context.
 */
void vunmap(const void *addr)
{
        struct vm_struct *vm;

        BUG_ON(in_interrupt());
        might_sleep();

        /* A NULL address is silently ignored. */
        if (addr) {
                vm = remove_vm_area(addr);
                if (unlikely(!vm))
                        WARN(1, KERN_ERR "Trying to vunmap() nonexistent vm area (%p)\n",
                                        addr);
                else
                        kfree(vm);
        }
}
EXPORT_SYMBOL(vunmap);

/**
 * vmap - map an array of pages into virtually contiguous space
 * @pages: array of page pointers
 * @count: number of pages to map
 * @flags: vm_area->flags
 * @prot: page protection for the mapping
 *
 * Maps @count pages from @pages into contiguous kernel virtual space.
 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
 * (which must be kmalloc or vmalloc memory) and one reference per pages in it
 * are transferred from the caller to vmap(), and will be freed / dropped when
 * vfree() is called on the return value.
 *
 * Return: the address of the area or %NULL on failure
 */
void *vmap(struct page **pages, unsigned int count,
           unsigned long flags, pgprot_t prot)
{
        /* Size of the mapping in bytes. */
        const unsigned long nbytes = (unsigned long)count << PAGE_SHIFT;
        struct vm_struct *vm;
        unsigned long va_start;
        int err;

        might_sleep();

        if (WARN_ON_ONCE(flags & VM_FLUSH_RESET_PERMS))
                return NULL;

        /*
         * Your top guard is someone else's bottom guard. Not having a top
         * guard compromises someone else's mappings too.
         */
        if (WARN_ON_ONCE(flags & VM_NO_GUARD))
                flags &= ~VM_NO_GUARD;

        /* More pages than the system has RAM cannot be a sane request. */
        if (count > totalram_pages())
                return NULL;

        vm = get_vm_area_caller(nbytes, flags, __builtin_return_address(0));
        if (!vm)
                return NULL;

        va_start = (unsigned long)vm->addr;
        err = vmap_pages_range(va_start, va_start + nbytes, pgprot_nx(prot),
                               pages, PAGE_SHIFT);
        if (err < 0) {
                vunmap(vm->addr);
                return NULL;
        }

        /* With VM_MAP_PUT_PAGES the area takes over the page array. */
        if (flags & VM_MAP_PUT_PAGES) {
                vm->pages = pages;
                vm->nr_pages = count;
        }

        return vm->addr;
}
EXPORT_SYMBOL(vmap);

#ifdef CONFIG_VMAP_PFN
/* State carried through apply_to_page_range() by vmap_pfn(). */
struct vmap_pfn_data {
        /* Array of PFNs to map; indexed by @idx. */
        unsigned long        *pfns;
        /* Protection used for every PTE that is installed. */
        pgprot_t        prot;
        /* Index of the next PFN to map; advanced by vmap_pfn_apply(). */
        unsigned int        idx;
};

/*
 * Per-PTE callback used by vmap_pfn(): install a special PTE for the next
 * PFN in the vmap_pfn_data passed via @private and advance its index.
 *
 * Returns 0 on success, or -EINVAL (with a one-time warning) if pfn_valid()
 * is true for the PFN; nothing is installed in that case.
 */
static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
{
        struct vmap_pfn_data *ctx = private;
        unsigned long next_pfn = ctx->pfns[ctx->idx];
        pte_t entry;

        if (WARN_ON_ONCE(pfn_valid(next_pfn)))
                return -EINVAL;

        entry = pfn_pte(next_pfn, ctx->prot);
        entry = pte_mkspecial(entry);
        set_pte_at(&init_mm, addr, pte, entry);

        ctx->idx++;

        return 0;
}

/**
 * vmap_pfn - map an array of PFNs into virtually contiguous space
 * @pfns: array of PFNs
 * @count: number of pages to map
 * @prot: page protection for the mapping
 *
 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
 * the start address of the mapping.
 */
void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
{
        struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
        /* Length of the mapping in bytes. */
        unsigned long size = count * PAGE_SIZE;
        unsigned long va_start;
        struct vm_struct *area;

        area = get_vm_area_caller(size, VM_IOREMAP,
                        __builtin_return_address(0));
        if (!area)
                return NULL;

        va_start = (unsigned long)area->addr;

        /* Install one PTE per page; release the area again on any error. */
        if (apply_to_page_range(&init_mm, va_start, size,
                        vmap_pfn_apply, &data)) {
                free_vm_area(area);
                return NULL;
        }

        flush_cache_vmap(va_start, va_start + size);

        return area->addr;
}
EXPORT_SYMBOL_GPL(vmap_pfn);
#endif /* CONFIG_VMAP_PFN */

/*
 * vm_area_alloc_pages - fill @pages with pages for a vmalloc area
 * @gfp: allocation flags passed to the page allocators
 * @nid: node to allocate from, or NUMA_NO_NODE
 * @order: order of each individual allocation
 * @nr_pages: number of PAGE_SIZE entries wanted in @pages
 * @pages: output array; entries are filled from index 0 upwards
 *
 * For order-0 requests the bulk allocator is tried first in batches of at
 * most 100 pages. Whatever is still missing afterwards, and every
 * high-order request, goes through the one-allocation-at-a-time loop, which
 * stops on allocation failure or, unless __GFP_NOFAIL is set, when a fatal
 * signal is pending.
 *
 * NOTE(review): the high-order loop always stores (1 << @order) entries per
 * iteration, so it presumably relies on @nr_pages being a multiple of
 * (1 << @order) - confirm against the caller.
 *
 * Return: the number of entries of @pages that were filled in; this can be
 * less than @nr_pages.
 */
static inline unsigned int
vm_area_alloc_pages(gfp_t gfp, int nid,
                unsigned int order, unsigned int nr_pages, struct page **pages)
{
        unsigned int nr_allocated = 0;
        struct page *page;
        int i;

        /*
         * For order-0 pages we make use of the bulk allocator. If
         * the page array ends up only partly populated (or not at
         * all) because of failures, fall back to the single page
         * allocator, which is more permissive.
         */
        if (!order) {
                while (nr_allocated < nr_pages) {
                        unsigned int nr, nr_pages_request;

                        /*
                         * A maximum allowed request is hard-coded and is 100
                         * pages per call. That is done in order to prevent a
                         * long preemption off scenario in the bulk-allocator
                         * so the range is [1:100].
                         */
                        nr_pages_request = min(100U, nr_pages - nr_allocated);

                        /*
                         * Memory allocation should honour the mempolicy: we
                         * must not simply use the nearest node when
                         * nid == NUMA_NO_NODE, otherwise memory may be
                         * allocated on only one node although the mempolicy
                         * wants it to be interleaved.
                         */
                        if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
                                nr = alloc_pages_bulk_mempolicy_noprof(gfp,
                                                        nr_pages_request,
                                                        pages + nr_allocated);
                        else
                                nr = alloc_pages_bulk_node_noprof(gfp, nid,
                                                        nr_pages_request,
                                                        pages + nr_allocated);

                        nr_allocated += nr;
                        cond_resched();

                        /*
                         * If zero or pages were obtained partly,
                         * fallback to a single page allocator.
                         */
                        if (nr != nr_pages_request)
                                break;
                }
        }

        /* High-order pages or fallback path if "bulk" fails. */
        while (nr_allocated < nr_pages) {
                if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
                        break;

                if (nid == NUMA_NO_NODE)
                        page = alloc_pages_noprof(gfp, order);
                else
                        page = alloc_pages_node_noprof(nid, gfp, order);

                if (unlikely(!page))
                        break;

                /*
                 * High-order allocations must be able to be treated as
                 * independent small pages by callers (as they can with
                 * small-page vmallocs). Some drivers do their own refcounting
                 * on vmalloc_to_page() pages, some use page->mapping,
                 * page->lru, etc.
                 */
                if (order)
                        split_page(page, order);

                /*
                 * Careful, we allocate and map page-order pages, but
                 * tracking is done per PAGE_SIZE page so as to keep the
                 * vm_struct APIs independent of the physical/mapped size.
                 */
                for (i = 0; i < (1U << order); i++)
                        pages[nr_allocated + i] = page + i;

                cond_resched();
                nr_allocated += 1U << order;
        }

        return nr_allocated;
}

/*
 * __vmalloc_area_node - allocate and map the pages backing a vm area
 * @area: area whose address range is to be populated
 * @gfp_mask: allocation flags for the backing pages
 * @prot: page protection used for the mapping
 * @page_shift: shift of the mapping granularity; page_shift - PAGE_SHIFT
 *              becomes the area's page order
 * @node: node to allocate from, or NUMA_NO_NODE
 *
 * Allocates the page pointer array, fills it via vm_area_alloc_pages() and
 * maps the pages with vmap_pages_range(). With __GFP_NOFAIL the mapping step
 * is retried until it succeeds.
 *
 * On failure @area is consumed: it is passed to free_vm_area() if the page
 * array cannot be allocated, and to vfree() (through area->addr) on any
 * later error.
 *
 * Return: area->addr on success, NULL on failure.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                                 pgprot_t prot, unsigned int page_shift,
                                 int node)
{
        /* Flags for the page pointer array itself; it is zero-filled. */
        const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        bool nofail = gfp_mask & __GFP_NOFAIL;
        unsigned long addr = (unsigned long)area->addr;
        unsigned long size = get_vm_area_size(area);
        unsigned long array_size;
        unsigned int nr_small_pages = size >> PAGE_SHIFT;
        unsigned int page_order;
        unsigned int flags;
        int ret;

        array_size = (unsigned long)nr_small_pages * sizeof(struct page *);

        /* Highmem pages are allowed unless a DMA zone was asked for. */
        if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
                gfp_mask |= __GFP_HIGHMEM;

        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE) {
                area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node,
                                        area->caller);
        } else {
                area->pages = kmalloc_node_noprof(array_size, nested_gfp, node);
        }

        if (!area->pages) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to allocated page array size %lu",
                        nr_small_pages * PAGE_SIZE, array_size);
                free_vm_area(area);
                return NULL;
        }

        set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
        page_order = vm_area_page_order(area);

        /*
         * High-order nofail allocations are really expensive and
         * potentially dangerous (pre-mature OOM, disruptive reclaim
         * and compaction etc.), so __GFP_NOFAIL is dropped for them.
         *
         * Please note, the __vmalloc_node_range_noprof() falls-back
         * to order-0 pages if high-order attempt is unsuccessful.
         */
        area->nr_pages = vm_area_alloc_pages((page_order ?
                gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN,
                node, page_order, nr_small_pages, area->pages);

        /* Account what was obtained, even if the request fails later. */
        atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
        if (gfp_mask & __GFP_ACCOUNT) {
                int i;

                for (i = 0; i < area->nr_pages; i++)
                        mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1);
        }

        /*
         * If not enough pages were obtained to accomplish an
         * allocation request, free them via vfree() if any.
         */
        if (area->nr_pages != nr_small_pages) {
                /*
                 * vm_area_alloc_pages() can fail due to insufficient memory but
                 * also:-
                 *
                 * - a pending fatal signal
                 * - insufficient huge page-order pages
                 *
                 * Since we always retry allocations at order-0 in the huge page
                 * case a warning for either is spurious.
                 *
                 * Note that the size printed below is derived from the
                 * number of pages actually obtained (area->nr_pages).
                 */
                if (!fatal_signal_pending(current) && page_order == 0)
                        warn_alloc(gfp_mask, NULL,
                                "vmalloc error: size %lu, failed to allocate pages",
                                area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        /*
         * page tables allocations ignore external gfp mask, enforce it
         * by the scope API
         *
         * 'flags' is only assigned in the two branches below and only read
         * by the matching restore calls further down.
         */
        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                flags = memalloc_nofs_save();
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                flags = memalloc_noio_save();

        /* With __GFP_NOFAIL keep retrying the mapping, sleeping in between. */
        do {
                ret = vmap_pages_range(addr, addr + size, prot, area->pages,
                        page_shift);
                if (nofail && (ret < 0))
                        schedule_timeout_uninterruptible(1);
        } while (nofail && (ret < 0));

        if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
                memalloc_nofs_restore(flags);
        else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
                memalloc_noio_restore(flags);

        if (ret < 0) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, failed to map pages",
                        area->nr_pages * PAGE_SIZE);
                goto fail;
        }

        return area->addr;

fail:
        /* vfree() releases the obtained pages, the page array and the area. */
        vfree(area->addr);
        return NULL;
}

/**
 * __vmalloc_node_range - allocate virtually contiguous memory
 * @size:     allocation size
 * @align:    desired alignment
 * @start:    vm area range start
 * @end:      vm area range end
 * @gfp_mask: flags for the page level allocator
 * @prot:     protection mask for the allocated pages
 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
 * @node:     node to use for allocation or NUMA_NO_NODE
 * @caller:   caller's return address
 *
 * Allocate enough pages to cover @size from the page level
 * allocator with @gfp_mask flags. Please note that the full set of gfp
 * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
 * supported.
 * Zone modifiers are not supported. From the reclaim modifiers
 * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
 * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
 * __GFP_RETRY_MAYFAIL are not supported).
 *
 * __GFP_NOWARN can be used to suppress failure messages.
 *
 * Map them into contiguous kernel virtual space, using a pagetable
 * protection of @prot.
 *
 * When huge mappings are enabled and @vm_flags contains
 * %VM_ALLOW_HUGE_VMAP, a mapping shift larger than PAGE_SHIFT is tried
 * first; if any step of that attempt fails the whole allocation is retried
 * once with PAGE_SHIFT and the caller's original @align.
 *
 * A zero @size triggers a one-time warning and fails. A @size that covers
 * more pages than totalram_pages() fails without attempting an allocation.
 *
 * Return: the address of the area or %NULL on failure
 */
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
                        unsigned long start, unsigned long end, gfp_t gfp_mask,
                        pgprot_t prot, unsigned long vm_flags, int node,
                        const void *caller)
{
        struct vm_struct *area;
        void *ret;
        kasan_vmalloc_flags_t kasan_flags = KASAN_VMALLOC_NONE;
        /* Remembered so the small-page fallback can undo the huge alignment. */
        unsigned long original_align = align;
        unsigned int shift = PAGE_SHIFT;

        if (WARN_ON_ONCE(!size))
                return NULL;

        /* A request larger than all of RAM can never be satisfied. */
        if ((size >> PAGE_SHIFT) > totalram_pages()) {
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, exceeds total pages",
                        size);
                return NULL;
        }

        if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) {
                /*
                 * Try huge pages. Only try for PAGE_KERNEL allocations,
                 * others like modules don't yet expect huge pages in
                 * their allocations due to apply_to_page_range not
                 * supporting them.
                 */

                if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE)
                        shift = PMD_SHIFT;
                else
                        shift = arch_vmap_pte_supported_shift(size);

                /* The area must be aligned to at least the mapping size. */
                align = max(original_align, 1UL << shift);
        }

        /*
         * Re-entered both by the __GFP_NOFAIL retry below and by the
         * small-page fallback at the "fail" label. Note that prot,
         * gfp_mask and kasan_flags keep any modifications made during an
         * earlier pass.
         */
again:
        area = __get_vm_area_node(size, align, shift, VM_ALLOC |
                                  VM_UNINITIALIZED | vm_flags, start, end, node,
                                  gfp_mask, caller);
        if (!area) {
                bool nofail = gfp_mask & __GFP_NOFAIL;
                warn_alloc(gfp_mask, NULL,
                        "vmalloc error: size %lu, vm_struct allocation failed%s",
                        size, (nofail) ? ". Retrying." : "");
                if (nofail) {
                        /*
                         * __GFP_NOFAIL: sleep briefly and retry with the
                         * same shift/align; this path never falls through
                         * to the "fail" label.
                         */
                        schedule_timeout_uninterruptible(1);
                        goto again;
                }
                goto fail;
        }

        /*
         * Prepare arguments for __vmalloc_area_node() and
         * kasan_unpoison_vmalloc().
         */
        if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) {
                if (kasan_hw_tags_enabled()) {
                        /*
                         * Modify protection bits to allow tagging.
                         * This must be done before mapping.
                         */
                        prot = arch_vmap_pgprot_tagged(prot);

                        /*
                         * Skip page_alloc poisoning and zeroing for physical
                         * pages backing VM_ALLOC mapping. Memory is instead
                         * poisoned and zeroed by kasan_unpoison_vmalloc().
                         */
                        gfp_mask |= __GFP_SKIP_KASAN | __GFP_SKIP_ZERO;
                }

                /* Take note that the mapping is PAGE_KERNEL. */
                kasan_flags |= KASAN_VMALLOC_PROT_NORMAL;
        }

        /*
         * Allocate physical pages and map them into vmalloc space.
         * NOTE(review): on failure "area" is not released here; presumably
         * __vmalloc_area_node() frees it itself before returning NULL -
         * confirm against its definition.
         */
        ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
        if (!ret)
                goto fail;

        /*
         * Mark the pages as accessible, now that they are mapped.
         * The condition for setting KASAN_VMALLOC_INIT should complement the
         * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
         * to make sure that memory is initialized under the same conditions.
         * Tag-based KASAN modes only assign tags to normal non-executable
         * allocations, see __kasan_unpoison_vmalloc().
         */
        kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
        if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
            (gfp_mask & __GFP_SKIP_ZERO))
                kasan_flags |= KASAN_VMALLOC_INIT;
        /* KASAN_VMALLOC_PROT_NORMAL already set if required. */
        area->addr = kasan_unpoison_vmalloc(area->addr, size, kasan_flags);

        /*
         * In this function, newly allocated vm_struct has VM_UNINITIALIZED
         * flag. It means that vm_struct is not fully initialized.
         * Now, it is fully initialized, so remove this flag here.
         */
        clear_vm_uninitialized_flag(area);

        /* Callers passing VM_DEFER_KMEMLEAK opt out of registration here. */
        if (!(vm_flags & VM_DEFER_KMEMLEAK))
                kmemleak_vmalloc(area, PAGE_ALIGN(size), gfp_mask);

        return area->addr;

fail:
        /*
         * A huge-mapping attempt failed: drop back to base pages and the
         * caller's alignment, then retry once. With shift already at
         * PAGE_SHIFT there is nothing left to try.
         */
        if (shift > PAGE_SHIFT) {
                shift = PAGE_SHIFT;
                align = original_align;
                goto again;
        }

        return NULL;
}

/**
 * __vmalloc_node - allocate virtually contiguous memory
 * @size:     allocation size
 * @align:    desired alignment
 * @gfp_mask: flags for the page level allocator
 * @node:     node to use for allocation or NUMA_NO_NODE
 * @caller:   caller's return address
 *
 * Convenience front end for __vmalloc_node_range(): the pages are taken
 * from the page level allocator with @gfp_mask and mapped with PAGE_KERNEL
 * protection somewhere in [VMALLOC_START, VMALLOC_END), without any
 * additional vm area flags.
 *
 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
 * and __GFP_NOFAIL are not supported
 *
 * Any use of gfp flags outside of GFP_KERNEL should be consulted
 * with mm people.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *__vmalloc_node_noprof(unsigned long size, unsigned long align,
                            gfp_t gfp_mask, int node, const void *caller)
{
        /* No extra vm area flags for the generic vmalloc path. */
        const unsigned long extra_vm_flags = 0;

        return __vmalloc_node_range_noprof(size, align,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL,
                                           extra_vm_flags, node, caller);
}
/*
 * This is only for performance analysis of vmalloc and stress purpose.
 * It is required by vmalloc test module, therefore do not use it other
 * than that.
 */
#ifdef CONFIG_TEST_VMALLOC_MODULE
EXPORT_SYMBOL_GPL(__vmalloc_node_noprof);
#endif

/*
 * vmalloc with caller-chosen gfp flags: byte alignment, no NUMA node
 * preference, attributed to whoever called this function.
 */
void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(__vmalloc_noprof);

/**
 * vmalloc - allocate virtually contiguous memory
 * @size: allocation size
 *
 * Obtain enough pages from the page level allocator (GFP_KERNEL, no node
 * preference) to cover @size and map them into contiguous kernel virtual
 * space.
 *
 * Use __vmalloc() instead when tight control over the page level
 * allocator and protection flags is needed.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL(vmalloc_noprof);

/**
 * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
 * @size:     allocation size
 * @gfp_mask: flags for the page level allocator
 *
 * Same as a PAGE_KERNEL vmalloc over [VMALLOC_START, VMALLOC_END), except
 * that %VM_ALLOW_HUGE_VMAP is passed so that huge pages may be used for
 * the memory when @size is greater than or equal to PMD_SIZE.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_range_noprof(size, 1,
                                           VMALLOC_START, VMALLOC_END,
                                           gfp_mask, PAGE_KERNEL,
                                           VM_ALLOW_HUGE_VMAP,
                                           NUMA_NO_NODE, caller);
}
EXPORT_SYMBOL_GPL(vmalloc_huge_noprof);

/**
 * vzalloc - allocate virtually contiguous memory with zero fill
 * @size:    allocation size
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * For tight control over page level allocator and protection flags
 * use __vmalloc() instead.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_noprof(unsigned long size)
{
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_noprof);

/**
 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
 * @size: allocation size
 *
 * The resulting memory area is zeroed so it can be mapped to userspace
 * without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_user_noprof(unsigned long size)
{
        return __vmalloc_node_range_noprof(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
                                    GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
                                    VM_USERMAP, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user_noprof);

/**
 * vmalloc_node - allocate memory on a specific node
 * @size: allocation size
 * @node: numa node
 *
 * Obtain enough pages from the page level allocator (GFP_KERNEL, with
 * @node passed down as the node to allocate on) to cover @size and map
 * them into contiguous kernel virtual space.
 *
 * Use __vmalloc() instead when tight control over the page level
 * allocator and protection flags is needed.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_node_noprof(unsigned long size, int node)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, caller);
}
EXPORT_SYMBOL(vmalloc_node_noprof);

/**
 * vzalloc_node - allocate memory on a specific node with zero fill
 * @size:        allocation size
 * @node:        numa node
 *
 * Allocate enough pages to cover @size from the page level
 * allocator and map them into contiguous kernel virtual space.
 * The memory allocated is set to zero.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vzalloc_node_noprof(unsigned long size, int node)
{
        return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node,
                                __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node_noprof);

/**
 * vrealloc - reallocate virtually contiguous memory; contents remain unchanged
 * @p: object to reallocate memory for
 * @size: the size to reallocate
 * @flags: the flags for the page level allocator
 *
 * A %NULL @p makes vrealloc() behave exactly like vmalloc(). A zero @size
 * frees @p (if any) and yields %NULL.
 *
 * Callers that want __GFP_ZERO semantics must pass __GFP_ZERO on the
 * initial allocation and on every later call to this API for the same
 * memory allocation; otherwise __GFP_ZERO may not be fully honored.
 *
 * The contents of the object are preserved up to the lesser of the new
 * and old sizes. If @size fits into the existing vm area the same pointer
 * is returned; otherwise a new area is allocated, the old contents are
 * copied over and the old area is freed.
 *
 * This function must not be called concurrently with itself or vfree() for
 * the same memory allocation.
 *
 * Return: pointer to the allocated memory; %NULL if @size is zero or in case of
 *         failure
 */
void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
        size_t area_size = 0;
        size_t spare;
        void *grown;

        /* Zero-sized request: plain free. */
        if (size == 0) {
                vfree(p);
                return NULL;
        }

        if (p != NULL) {
                struct vm_struct *vm = find_vm_area(p);

                if (unlikely(vm == NULL)) {
                        WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
                        return NULL;
                }

                area_size = get_vm_area_size(vm);
        }

        if (size > area_size) {
                /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
                grown = __vmalloc_noprof(size, flags);
                if (grown == NULL)
                        return NULL;

                if (p != NULL) {
                        memcpy(grown, p, area_size);
                        vfree(p);
                }

                return grown;
        }

        /*
         * The request fits into the existing area, so reuse it.
         *
         * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
         * would be a good heuristic for when to shrink the vm_area?
         */
        spare = area_size - size;

        /* Zero out spare memory. */
        if (want_init_on_alloc(flags))
                memset((void *)p + size, 0, spare);

        /* Re-annotate for KASAN: the tail is off limits, the head is valid. */
        kasan_poison_vmalloc(p + size, spare);
        kasan_unpoison_vmalloc(p, size, KASAN_VMALLOC_PROT_NORMAL);

        return (void *)p;
}

/*
 * GFP mask used by the vmalloc_32*() helpers.
 *
 * 64b systems should always have either DMA or DMA32 zones; GFP_DMA is only
 * picked on a 64b system that has ZONE_DMA but no ZONE_DMA32. For everything
 * else GFP_DMA32 should do the right thing (on non-64b systems it uses the
 * normal zone).
 */
#if defined(CONFIG_64BIT) && !defined(CONFIG_ZONE_DMA32) && \
    defined(CONFIG_ZONE_DMA)
#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
#else
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#endif

/**
 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
 * @size: allocation size
 *
 * Obtain enough 32bit PA addressable pages (GFP_VMALLOC32) from the page
 * level allocator to cover @size and map them into contiguous kernel
 * virtual space.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_noprof(unsigned long size)
{
        const void *caller = __builtin_return_address(0);

        return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
                                     caller);
}
EXPORT_SYMBOL(vmalloc_32_noprof);

/**
 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
 * @size:             allocation size
 *
 * The resulting memory area is 32bit addressable and zeroed so it can be
 * mapped to userspace without leaking data.
 *
 * Return: pointer to the allocated memory or %NULL on error
 */
void *vmalloc_32_user_noprof(unsigned long size)
{
        return __vmalloc_node_range_noprof(size, SHMLBA,  VMALLOC_START, VMALLOC_END,
                                    GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
                                    VM_USERMAP, NUMA_NO_NODE,
                                    __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user_noprof);

/*
 * Write @count zero bytes into @iter, at most one page at a time, by
 * copying from the zero page with the non-faulting page copy helper.
 * Stops at the first short copy.
 *
 * Returns the number of zeroed bytes.
 */
static size_t zero_iter(struct iov_iter *iter, size_t count)
{
        size_t zeroed = 0;

        while (zeroed < count) {
                size_t chunk = min_t(size_t, count - zeroed, PAGE_SIZE);
                size_t done;

                done = copy_page_to_iter_nofault(ZERO_PAGE(0), 0, chunk, iter);
                zeroed += done;

                /* The iterator took less than offered: give up. */
                if (done < chunk)
                        break;
        }

        return zeroed;
}

/*
 * Copy @count bytes starting at vmalloc address @addr into @iter, one page
 * (or page fragment) at a time. Pages that are not present are reported as
 * zeroes. Stops at the first short copy.
 *
 * Returns the number of copied bytes.
 */
static size_t aligned_vread_iter(struct iov_iter *iter,
                                 const char *addr, size_t count)
{
        size_t left;

        for (left = count; left > 0; ) {
                unsigned long in_page = offset_in_page(addr);
                unsigned long span = PAGE_SIZE - in_page;
                struct page *pg;
                size_t done;

                /* Never go past the end of the request. */
                if (span > left)
                        span = left;

                pg = vmalloc_to_page(addr);
                /*
                 * To do safe access to this _mapped_ area, we need lock. But
                 * adding lock here means that we need to add overhead of
                 * vmalloc()/vfree() calls for this _debug_ interface, rarely
                 * used. Instead of that, we'll use a local mapping via
                 * copy_page_to_iter_nofault() and accept a small overhead in
                 * this access function.
                 */
                if (!pg)
                        done = zero_iter(iter, span);
                else
                        done = copy_page_to_iter_nofault(pg, in_page, span,
                                                         iter);

                addr += done;
                left -= done;

                if (done != span)
                        break;
        }

        return count - left;
}

/*
 * Read from a vm_map_ram region of memory.
 *
 * @iter:  destination iterator
 * @addr:  address inside the vm_map_ram area to start reading at
 * @count: number of bytes to read
 * @flags: vmap flags of the enclosing vmap_area; VMAP_BLOCK selects the
 *         per-block path below
 *
 * Without VMAP_BLOCK the range is read directly. With VMAP_BLOCK the used
 * regions recorded in the block's used_map are copied out and everything
 * else is zero-filled. The copy is done with vb->lock held; the lock is
 * dropped before returning on every path.
 *
 * Returns the number of copied bytes (including zero-filled ones), which is
 * less than @count if the iterator could not take everything.
 */
static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
                                  size_t count, unsigned long flags)
{
        char *start;
        struct vmap_block *vb;
        struct xarray *xa;
        unsigned long offset;
        unsigned int rs, re;
        size_t remains, n;

        /*
         * If it's area created by vm_map_ram() interface directly, but
         * not further subdividing and delegating management to vmap_block,
         * handle it here.
         */
        if (!(flags & VMAP_BLOCK))
                return aligned_vread_iter(iter, addr, count);

        remains = count;

        /*
         * Area is split into regions and tracked with vmap_block, read out
         * each region and zero fill the hole between regions.
         */
        xa = addr_to_vb_xa((unsigned long) addr);
        vb = xa_load(xa, addr_to_vb_idx((unsigned long)addr));
        /* No block registered for this address: report it all as zeroes. */
        if (!vb)
                goto finished_zero;

        spin_lock(&vb->lock);
        /* Nothing in use inside the block: drop the lock and zero-fill. */
        if (bitmap_empty(vb->used_map, VMAP_BBMAP_BITS)) {
                spin_unlock(&vb->lock);
                goto finished_zero;
        }

        for_each_set_bitrange(rs, re, vb->used_map, VMAP_BBMAP_BITS) {
                size_t copied;

                if (remains == 0)
                        goto finished;

                /* Virtual address of the first page of this used range. */
                start = vmap_block_vaddr(vb->va->va_start, rs);

                /* Zero-fill the hole in front of this used range. */
                if (addr < start) {
                        size_t to_zero = min_t(size_t, start - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /*
                 * It could start reading from the middle of used region.
                 *
                 * NOTE(review): only the offset within a page is taken into
                 * account here, and if for_each_set_bitrange() yields an
                 * exclusive end index then (re - rs + 1) covers one page
                 * more than the range - confirm both are intended.
                 */
                offset = offset_in_page(addr);
                n = ((re - rs + 1) << PAGE_SHIFT) - offset;
                if (n > remains)
                        n = remains;

                copied = aligned_vread_iter(iter, start + offset, n);

                addr += copied;
                remains -= copied;

                if (copied != n)
                        goto finished;
        }

        spin_unlock(&vb->lock);

        /* Reached with vb->lock NOT held. */
finished_zero:
        /* zero-fill the left dirty or free regions */
        return count - remains + zero_iter(iter, remains);
        /* Reached with vb->lock held. */
finished:
        /* We couldn't copy/zero everything */
        spin_unlock(&vb->lock);
        return count - remains;
}

/**
 * vread_iter() - read vmalloc area in a safe way to an iterator.
 * @iter:         the iterator to which data should be written.
 * @addr:         vm address.
 * @count:        number of bytes to be read.
 *
 * This function walks the vmap areas found at or after @addr and copies
 * the parts of [addr...addr+count) that they cover into @iter. Memory
 * holes before, between and after those areas are zero-filled in @iter.
 * Areas flagged VM_IOREMAP or VM_SPARSE are treated as memory holes: no
 * copy is done and zeroes are written instead. Areas that are still
 * VM_UNINITIALIZED, or that have neither a vm_struct nor vmap flags, are
 * skipped.
 *
 * @count is clamped so that addr+count does not wrap around the address
 * space.
 *
 * If [addr...addr+count) does not intersect any vmap area, the whole range
 * is zero-filled in @iter.
 *
 * Note: In usual ops, vread() is never necessary because the caller
 * should know vmalloc() area is valid and can use memcpy().
 * This is for routines which have to access vmalloc area without
 * any information, as /proc/kcore.
 *
 * Return: number of bytes written to @iter, counting both copied and
 * zero-filled bytes. This is less than the (clamped) @count when @iter
 * could not take everything.
 */
long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *vm;
        char *vaddr;
        size_t n, size, flags, remains;
        unsigned long next;

        addr = kasan_reset_tag(addr);

        /* Don't allow overflow */
        if ((unsigned long) addr + count < count)
                count = -(unsigned long) addr;

        remains = count;

        /*
         * On success vn->busy.lock is held (it is released below on every
         * path that leaves with vn set).
         */
        vn = find_vmap_area_exceed_addr_lock((unsigned long) addr, &va);
        if (!vn)
                goto finished_zero;

        /* no intersects with alive vmap_area */
        if ((unsigned long)addr + remains <= va->va_start)
                goto finished_zero;

        do {
                size_t copied;

                if (remains == 0)
                        goto finished;

                vm = va->vm;
                flags = va->flags & VMAP_FLAGS_MASK;
                /*
                 * VMAP_BLOCK indicates a sub-type of vm_map_ram area, need
                 * be set together with VMAP_RAM.
                 */
                WARN_ON(flags == VMAP_BLOCK);

                /* Neither a vmalloc-style area nor a vm_map_ram one. */
                if (!vm && !flags)
                        goto next_va;

                /* Area is still being set up; do not look at it. */
                if (vm && (vm->flags & VM_UNINITIALIZED))
                        goto next_va;

                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                vaddr = (char *) va->va_start;
                size = vm ? get_vm_area_size(vm) : va_size(va);

                /* The read position is already past this area. */
                if (addr >= vaddr + size)
                        goto next_va;

                /* Zero-fill the hole in front of this area. */
                if (addr < vaddr) {
                        size_t to_zero = min_t(size_t, vaddr - addr, remains);
                        size_t zeroed = zero_iter(iter, to_zero);

                        addr += zeroed;
                        remains -= zeroed;

                        if (remains == 0 || zeroed != to_zero)
                                goto finished;
                }

                /* Bytes of this area from addr onwards, capped to the request. */
                n = vaddr + size - addr;
                if (n > remains)
                        n = remains;

                if (flags & VMAP_RAM)
                        copied = vmap_ram_vread_iter(iter, addr, n, flags);
                else if (!(vm && (vm->flags & (VM_IOREMAP | VM_SPARSE))))
                        copied = aligned_vread_iter(iter, addr, n);
                else /* IOREMAP | SPARSE area is treated as memory hole */
                        copied = zero_iter(iter, n);

                addr += copied;
                remains -= copied;

                if (copied != n)
                        goto finished;

        next_va:
                /*
                 * Remember where to continue, then drop this node's lock
                 * before looking up (and locking) the next area.
                 */
                next = va->va_end;
                spin_unlock(&vn->busy.lock);
        } while ((vn = find_vmap_area_exceed_addr_lock(next, &va)));

finished_zero:
        if (vn)
                spin_unlock(&vn->busy.lock);

        /* zero-fill memory holes */
        return count - remains + zero_iter(iter, remains);
finished:
        /* Nothing remains, or We couldn't copy/zero everything. */
        if (vn)
                spin_unlock(&vn->busy.lock);

        return count - remains;
}

/**
 * remap_vmalloc_range_partial - map vmalloc pages to userspace
 * @vma:   vma to cover
 * @uaddr: target user address to start at
 * @kaddr: virtual address of vmalloc kernel memory
 * @pgoff: offset from @kaddr to start at, in pages
 * @size:  size of map area, in bytes (rounded up to a page multiple)
 *
 * Returns: 0 for success, -Exxx on failure
 *
 * This function checks that @kaddr is a valid vmalloc'ed area flagged
 * %VM_USERMAP or %VM_DMA_COHERENT, and that it is big enough to cover
 * @size bytes starting @pgoff pages into it. It returns -EINVAL if those
 * criteria are not met, if @uaddr or @kaddr is not page aligned, or if
 * the page-aligned @size is zero. Errors from vm_insert_page() are passed
 * through; pages inserted before such an error are left in place.
 *
 * On success the vma is additionally flagged VM_DONTEXPAND | VM_DONTDUMP.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
                                void *kaddr, unsigned long pgoff,
                                unsigned long size)
{
        struct vm_struct *area;
        unsigned long off;
        unsigned long end_index;

        /* Convert the page offset to bytes, refusing values that overflow. */
        if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
                return -EINVAL;

        size = PAGE_ALIGN(size);

        /*
         * Nothing to map: either @size was zero or rounding it up wrapped
         * to zero. The insertion loop below always runs at least once and
         * counts @size down by PAGE_SIZE, so a zero here would underflow
         * and walk far beyond the range validated against the area.
         */
        if (!size)
                return -EINVAL;

        if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
                return -EINVAL;

        area = find_vm_area(kaddr);
        if (!area)
                return -EINVAL;

        /* Only areas explicitly marked as mappable to userspace qualify. */
        if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
                return -EINVAL;

        /* [off, off + size) must lie within the area. */
        if (check_add_overflow(size, off, &end_index) ||
            end_index > get_vm_area_size(area))
                return -EINVAL;
        kaddr += off;

        /* Insert the backing pages one by one; size is non-zero here. */
        do {
                struct page *page = vmalloc_to_page(kaddr);
                int ret;

                ret = vm_insert_page(vma, uaddr, page);
                if (ret)
                        return ret;

                uaddr += PAGE_SIZE;
                kaddr += PAGE_SIZE;
                size -= PAGE_SIZE;
        } while (size > 0);

        vm_flags_set(vma, VM_DONTEXPAND | VM_DONTDUMP);

        return 0;
}

/**
 * remap_vmalloc_range - map vmalloc pages to userspace
 * @vma:   vma to cover (map full range of vma)
 * @addr:  vmalloc memory
 * @pgoff: number of pages into addr before first page to map
 *
 * Returns: 0 for success, -Exxx on failure
 *
 * Maps the vmalloc'ed area at @addr over the entire @vma, i.e. from
 * vma->vm_start for vma->vm_end - vma->vm_start bytes. Fails if @addr is
 * not a valid vmalloc'ed area or if it is not big enough to cover the vma.
 *
 * Similar to remap_pfn_range() (see mm/memory.c)
 */
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
                                                unsigned long pgoff)
{
        const unsigned long user_start = vma->vm_start;
        const unsigned long map_len = vma->vm_end - vma->vm_start;

        return remap_vmalloc_range_partial(vma, user_start, addr, pgoff,
                                           map_len);
}
EXPORT_SYMBOL(remap_vmalloc_range);

/*
 * Remove the vm area that starts at area->addr and free @area itself.
 * It is a BUG if the area removed for that address is not @area.
 */
void free_vm_area(struct vm_struct *area)
{
        struct vm_struct *removed = remove_vm_area(area->addr);

        BUG_ON(removed != area);
        kfree(area);
}
EXPORT_SYMBOL_GPL(free_vm_area);

#ifdef CONFIG_SMP
/* Map an rb-tree node to its enclosing vmap_area; NULL stays NULL. */
static struct vmap_area *node_to_va(struct rb_node *n)
{
        if (!n)
                return NULL;

        return rb_entry(n, struct vmap_area, rb_node);
}

/**
 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
 * @addr: target address
 *
 * Searches the free vmap area tree.
 *
 * Returns: the vmap_area with va_start <= @addr <= va_end if there is
 *   one. Otherwise the last area visited on the search path whose
 *   va_start is <= @addr (i.e. an area lying entirely below @addr), or
 *   NULL if there are no areas before @addr.
 */
static struct vmap_area *
pvm_find_va_enclose_addr(unsigned long addr)
{
        struct rb_node *node = free_vmap_area_root.rb_node;
        struct vmap_area *best = NULL;

        while (node) {
                struct vmap_area *cur;

                cur = rb_entry(node, struct vmap_area, rb_node);

                /* Starts above addr: only the left subtree can qualify. */
                if (cur->va_start > addr) {
                        node = node->rb_left;
                        continue;
                }

                /* Starts at or below addr: best candidate so far. */
                best = cur;
                if (cur->va_end >= addr)
                        return best;

                node = node->rb_right;
        }

        return best;
}

/**
 * pvm_determine_end_from_reverse - find the highest aligned address
 * of free block below VMALLOC_END
 * @va:
 *   in - the VA we start the search(reverse order);
 *   out - the VA with the highest aligned end address.
 * @align: alignment for required highest address
 *
 * Walks the free vmap area list backwards starting at *@va, advancing *@va
 * as it goes, until it finds an area whose end, rounded down to @align and
 * capped at the aligned VMALLOC_END, still lies above the area's start.
 *
 * Returns: determined end address within vmap_area, or 0 if *@va is NULL
 *   or no area qualifies
 */
static unsigned long
pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
        unsigned long align_mask = ~(align - 1);
        unsigned long top = VMALLOC_END & align_mask;
        unsigned long candidate;

        if (unlikely(!*va))
                return 0;

        list_for_each_entry_from_reverse((*va), &free_vmap_area_list, list) {
                candidate = min((*va)->va_end & align_mask, top);

                /* Usable only if some room is left below the aligned end. */
                if (candidate > (*va)->va_start)
                        return candidate;
        }

        return 0;
}

/**
 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
 * @offsets: array containing offset of each area
 * @sizes: array containing size of each area
 * @nr_vms: the number of areas to allocate
 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
 *
 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
 *            vm_structs on success, %NULL on failure
 *
 * Percpu allocator wants to use congruent vm areas so that it can
 * maintain the offsets among percpu areas.  This function allocates
 * congruent vmalloc areas for it with GFP_KERNEL.  These areas tend to
 * be scattered pretty far, distance between two areas easily going up
 * to gigabytes.  To avoid interacting with regular vmallocs, these
 * areas are allocated from top.
 *
 * Despite its complicated look, this allocator is rather simple. It
 * does everything top-down and scans free blocks from the end looking
 * for matching base. While scanning, if any of the areas do not fit the
 * base address is pulled down to fit the area. Scanning is repeated till
 * all the areas fit and then all necessary data structures are inserted
 * and the result is returned.
 *
 * @align must be a power of two with no in-page offset, every entry of
 * @offsets and @sizes must be aligned to it, and the areas must not
 * overlap each other; any violation triggers BUG_ON().
 *
 * If no fitting base is found (or clipping the free space fails), the
 * search is retried exactly once after reclaim_and_purge_vmap_areas().
 * The free space is scanned and clipped under free_vmap_area_lock.
 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
                                     const size_t *sizes, int nr_vms,
                                     size_t align)
{
        /* Usable vmalloc window, shrunk inwards to @align boundaries. */
        const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
        const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
        struct vmap_area **vas, *va;
        struct vm_struct **vms;
        /*
         * area:      index of the area currently being fitted
         * last_area: index of the area with the highest offset
         * term_area: index at which the backwards cyclic walk terminates
         */
        int area, area2, last_area, term_area;
        unsigned long base, start, size, end, last_end, orig_start, orig_end;
        /* Set once the one-shot purge-and-retry has been used. */
        bool purged = false;

        /* verify parameters and allocate data structures */
        BUG_ON(offset_in_page(align) || !is_power_of_2(align));
        for (last_area = 0, area = 0; area < nr_vms; area++) {
                start = offsets[area];
                end = start + sizes[area];

                /* is everything aligned properly? */
                BUG_ON(!IS_ALIGNED(offsets[area], align));
                BUG_ON(!IS_ALIGNED(sizes[area], align));

                /* detect the area with the highest address */
                if (start > offsets[last_area])
                        last_area = area;

                /* no two requested areas may overlap each other */
                for (area2 = area + 1; area2 < nr_vms; area2++) {
                        unsigned long start2 = offsets[area2];
                        unsigned long end2 = start2 + sizes[area2];

                        BUG_ON(start2 < end && start < end2);
                }
        }
        /* End offset of the highest area, i.e. the total span required. */
        last_end = offsets[last_area] + sizes[last_area];

        /* The whole span must fit into the aligned vmalloc window. */
        if (vmalloc_end - vmalloc_start < last_end) {
                WARN_ON(true);
                return NULL;
        }

        /* Both arrays are zeroed, so unfilled slots read as NULL below. */
        vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
        vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
        if (!vas || !vms)
                goto err_free2;

        /* Pre-allocate one vmap_area and one vm_struct per requested area. */
        for (area = 0; area < nr_vms; area++) {
                vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
                vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
                if (!vas[area] || !vms[area])
                        goto err_free;
        }
retry:
        spin_lock(&free_vmap_area_lock);

        /* start scanning - we scan from the top, begin with the last area */
        area = term_area = last_area;
        start = offsets[area];
        end = start + sizes[area];

        /*
         * Initial candidate: place the end of the highest area at the
         * highest aligned end address found walking the free list downwards.
         * The helper returns 0 when nothing is found, in which case the
         * subtraction wraps; the loop below accounts for that.
         */
        va = pvm_find_va_enclose_addr(vmalloc_end);
        base = pvm_determine_end_from_reverse(&va, align) - end;

        while (true) {
                /*
                 * base might have underflowed, add last_end before
                 * comparing.
                 */
                if (base + last_end < vmalloc_start + last_end)
                        goto overflow;

                /*
                 * Fitting base has not been found.
                 */
                if (va == NULL)
                        goto overflow;

                /*
                 * If required width exceeds current VA block, move
                 * base downwards and then recheck.
                 */
                if (base + end > va->va_end) {
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        /* base changed: every other area must be re-verified */
                        term_area = area;
                        continue;
                }

                /*
                 * If this VA does not fit, move base downwards and recheck.
                 */
                if (base + start < va->va_start) {
                        /* step to the preceding node of the free tree */
                        va = node_to_va(rb_prev(&va->rb_node));
                        base = pvm_determine_end_from_reverse(&va, align) - end;
                        /* base changed: every other area must be re-verified */
                        term_area = area;
                        continue;
                }

                /*
                 * This area fits, move on to the previous one.  If
                 * the previous one is the terminal one, we're done.
                 */
                /* cyclic decrement of the index: 0 wraps to nr_vms - 1 */
                area = (area + nr_vms - 1) % nr_vms;
                if (area == term_area)
                        break;

                start = offsets[area];
                end = start + sizes[area];
                va = pvm_find_va_enclose_addr(base + end);
        }

        /* we've found a fitting base, insert all va's */
        for (area = 0; area < nr_vms; area++) {
                int ret;

                start = base + offsets[area];
                size = sizes[area];

                va = pvm_find_va_enclose_addr(start);
                if (WARN_ON_ONCE(va == NULL))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                /* carve [start, start + size) out of the free VA */
                ret = va_clip(&free_vmap_area_root,
                        &free_vmap_area_list, va, start, size);
                if (WARN_ON_ONCE(unlikely(ret)))
                        /* It is a BUG(), but trigger recovery instead. */
                        goto recovery;

                /* Allocated area. */
                va = vas[area];
                va->va_start = start;
                va->va_end = start + size;
        }

        spin_unlock(&free_vmap_area_lock);

        /* populate the kasan shadow space */
        for (area = 0; area < nr_vms; area++) {
                if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
                        goto err_free_shadow;
        }

        /* insert all vm's */
        for (area = 0; area < nr_vms; area++) {
                /* each area goes into the busy tree of the node its start maps to */
                struct vmap_node *vn = addr_to_node(vas[area]->va_start);

                spin_lock(&vn->busy.lock);
                insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
                setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
                                 pcpu_get_vm_areas);
                spin_unlock(&vn->busy.lock);
        }

        /*
         * Mark allocated areas as accessible. Do it now as a best-effort
         * approach, as they can be mapped outside of vmalloc code.
         * With hardware tag-based KASAN, marking is skipped for
         * non-VM_ALLOC mappings, see __kasan_unpoison_vmalloc().
         */
        for (area = 0; area < nr_vms; area++)
                vms[area]->addr = kasan_unpoison_vmalloc(vms[area]->addr,
                                vms[area]->size, KASAN_VMALLOC_PROT_NORMAL);

        /* Only the pointer array is freed; the vmap_areas stay in the busy trees. */
        kfree(vas);
        return vms;

recovery:
        /*
         * Remove previously allocated areas. There is no
         * need in removing these areas from the busy tree,
         * because they are inserted only on the final step
         * and when pcpu_get_vm_areas() is success.
         */
        /*
         * 'area' is the index that failed, so only indices below it were
         * clipped and need to be handed back to the free space.
         */
        while (area--) {
                /* remember the range before the vmap_area is handed over */
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end,
                                KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
                /* slot no longer owned here; refilled before a retry */
                vas[area] = NULL;
        }

overflow:
        spin_unlock(&free_vmap_area_lock);
        if (!purged) {
                reclaim_and_purge_vmap_areas();
                purged = true;

                /* Before "retry", check if we recover. */
                for (area = 0; area < nr_vms; area++) {
                        /* only slots cleared by the recovery path need a new object */
                        if (vas[area])
                                continue;

                        vas[area] = kmem_cache_zalloc(
                                vmap_area_cachep, GFP_KERNEL);
                        if (!vas[area])
                                goto err_free;
                }

                goto retry;
        }

        /* second failure: fall through and release everything */
err_free:
        for (area = 0; area < nr_vms; area++) {
                if (vas[area])
                        kmem_cache_free(vmap_area_cachep, vas[area]);

                kfree(vms[area]);
        }
err_free2:
        kfree(vas);
        kfree(vms);
        return NULL;

err_free_shadow:
        /* reached without the lock held: it was dropped before shadow population */
        spin_lock(&free_vmap_area_lock);
        /*
         * We release all the vmalloc shadows, even the ones for regions that
         * hadn't been successfully added. This relies on kasan_release_vmalloc
         * being able to tolerate this case.
         */
        for (area = 0; area < nr_vms; area++) {
                orig_start = vas[area]->va_start;
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
                                &free_vmap_area_list);
                if (va)
                        kasan_release_vmalloc(orig_start, orig_end,
                                va->va_start, va->va_end,
                                KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
                vas[area] = NULL;
                kfree(vms[area]);
        }
        spin_unlock(&free_vmap_area_lock);
        kfree(vas);
        kfree(vms);
        return NULL;
}

/**
 * pcpu_free_vm_areas - release vmalloc areas obtained for the percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: number of entries in @vms
 *
 * Each vm_struct is released with free_vm_area(), after which the
 * pointer array itself is kfree()'d.
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
        int area = 0;

        /* Release the areas in index order; the array goes last. */
        while (area < nr_vms) {
                free_vm_area(vms[area]);
                area++;
        }

        kfree(vms);
}
#endif        /* CONFIG_SMP */

#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
        const void *caller;
        struct vm_struct *vm;
        struct vmap_area *va;
        struct vmap_node *vn;
        unsigned long addr;
        unsigned int nr_pages;

        addr = PAGE_ALIGN((unsigned long) object);
        vn = addr_to_node(addr);

        if (!spin_trylock(&vn->busy.lock))
                return false;

        va = __find_vmap_area(addr, &vn->busy.root);
        if (!va || !va->vm) {
                spin_unlock(&vn->busy.lock);
                return false;
        }

        vm = va->vm;
        addr = (unsigned long) vm->addr;
        caller = vm->caller;
        nr_pages = vm->nr_pages;
        spin_unlock(&vn->busy.lock);

        pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
                nr_pages, addr, caller);

        return true;
}
#endif

#ifdef CONFIG_PROC_FS
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
        if (IS_ENABLED(CONFIG_NUMA)) {
                unsigned int nr, *counters = m->private;
                unsigned int step = 1U << vm_area_page_order(v);

                if (!counters)
                        return;

                if (v->flags & VM_UNINITIALIZED)
                        return;
                /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
                smp_rmb();

                memset(counters, 0, nr_node_ids * sizeof(unsigned int));

                for (nr = 0; nr < v->nr_pages; nr += step)
                        counters[page_to_nid(v->pages[nr])] += step;
                for_each_node_state(nr, N_HIGH_MEMORY)
                        if (counters[nr])
                                seq_printf(m, " N%u=%u", nr, counters[nr]);
        }
}

static void show_purge_info(struct seq_file *m)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        int i;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                spin_lock(&vn->lazy.lock);
                list_for_each_entry(va, &vn->lazy.head, list) {
                        seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
                                (void *)va->va_start, (void *)va->va_end,
                                va_size(va));
                }
                spin_unlock(&vn->lazy.lock);
        }
}

static int vmalloc_info_show(struct seq_file *m, void *p)
{
        struct vmap_node *vn;
        struct vmap_area *va;
        struct vm_struct *v;
        int i;

        for (i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                spin_lock(&vn->busy.lock);
                list_for_each_entry(va, &vn->busy.head, list) {
                        if (!va->vm) {
                                if (va->flags & VMAP_RAM)
                                        seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
                                                (void *)va->va_start, (void *)va->va_end,
                                                va_size(va));

                                continue;
                        }

                        v = va->vm;

                        seq_printf(m, "0x%pK-0x%pK %7ld",
                                v->addr, v->addr + v->size, v->size);

                        if (v->caller)
                                seq_printf(m, " %pS", v->caller);

                        if (v->nr_pages)
                                seq_printf(m, " pages=%d", v->nr_pages);

                        if (v->phys_addr)
                                seq_printf(m, " phys=%pa", &v->phys_addr);

                        if (v->flags & VM_IOREMAP)
                                seq_puts(m, " ioremap");

                        if (v->flags & VM_SPARSE)
                                seq_puts(m, " sparse");

                        if (v->flags & VM_ALLOC)
                                seq_puts(m, " vmalloc");

                        if (v->flags & VM_MAP)
                                seq_puts(m, " vmap");

                        if (v->flags & VM_USERMAP)
                                seq_puts(m, " user");

                        if (v->flags & VM_DMA_COHERENT)
                                seq_puts(m, " dma-coherent");

                        if (is_vmalloc_addr(v->pages))
                                seq_puts(m, " vpages");

                        show_numa_info(m, v);
                        seq_putc(m, '\n');
                }
                spin_unlock(&vn->busy.lock);
        }

        /*
         * As a final step, dump "unpurged" areas.
         */
        show_purge_info(m);
        return 0;
}

/*
 * Create the "vmallocinfo" proc entry (mode 0400), served by
 * vmalloc_info_show().
 *
 * With CONFIG_NUMA, an array of nr_node_ids counters is allocated once
 * and passed as the entry's private data; show_numa_info() uses it as
 * scratch space. A failed allocation is tolerated: the entry is still
 * created with NULL private data, and show_numa_info() returns early
 * when it sees no counter buffer.
 *
 * Always returns 0.
 */
static int __init proc_vmalloc_init(void)
{
        void *priv_data = NULL;

        /* kmalloc_array() checks the count * size product for overflow. */
        if (IS_ENABLED(CONFIG_NUMA))
                priv_data = kmalloc_array(nr_node_ids, sizeof(unsigned int),
                                          GFP_KERNEL);

        proc_create_single_data("vmallocinfo",
                0400, NULL, vmalloc_info_show, priv_data);

        return 0;
}
module_init(proc_vmalloc_init);

#endif

/*
 * Build the initial free space from the early vmlist: every gap between
 * consecutive busy areas, plus the tail after the last one, becomes a
 * vmap_area in the free tree/list.
 *
 *     B     F     B     B     B     F
 * -|-----|.....|-----|-----|-----|.....|-
 *  |           The KVA space           |
 *  |<--------------------------------->|
 *
 * The covered range starts at address 1 and ends at ULONG_MAX. A failed
 * GFP_NOWAIT allocation only warns (once); that gap is then left out.
 */
static void __init vmap_init_free_space(void)
{
        const unsigned long kva_last = ULONG_MAX;
        unsigned long gap_start = 1;
        struct vmap_area *gap;
        struct vm_struct *vm;

        for (vm = vmlist; vm; vm = vm->next) {
                unsigned long busy_start = (unsigned long) vm->addr;

                /* Non-empty gap in front of this busy area? */
                if (busy_start != gap_start) {
                        gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                        if (!WARN_ON_ONCE(!gap)) {
                                gap->va_start = gap_start;
                                gap->va_end = busy_start;

                                insert_vmap_area_augment(gap, NULL,
                                        &free_vmap_area_root,
                                        &free_vmap_area_list);
                        }
                }

                /* The next gap can only begin past this busy area. */
                gap_start = busy_start + vm->size;
        }

        /* Whatever remains above the last busy area is free as well. */
        if (kva_last != gap_start) {
                gap = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (!WARN_ON_ONCE(!gap)) {
                        gap->va_start = gap_start;
                        gap->va_end = kva_last;

                        insert_vmap_area_augment(gap, NULL,
                                &free_vmap_area_root,
                                &free_vmap_area_list);
                }
        }
}

/*
 * Size the vmap node array (64-bit builds only) and initialise the
 * trees, lists, locks and pools of every node.
 */
static void vmap_init_nodes(void)
{
        struct vmap_node *node;
        int nr, slot;

#if BITS_PER_LONG == 64
        /*
         * The node count follows the number of possible CPUs, clamped to
         * the range [1, 128]; systems at or below the upper bound thus
         * get a scale factor of 1.
         *
         * NUMA note: on bigger machines, e.g. multi-socket NUMA with
         * thousands of cores in total, a "sub-numa-clustering" should be
         * added. A NUMA domain would then be treated as a single entity
         * with dedicated sub-nodes, each describing one group of cores,
         * and per-domain purging and balancing would be needed too.
         */
        nr = clamp_t(unsigned int, num_possible_cpus(), 1, 128);

        if (nr > 1) {
                node = kmalloc_array(nr, sizeof(*node), GFP_NOWAIT | __GFP_NOWARN);
                if (!node) {
                        /* The previous node setup stays in effect. */
                        pr_err("Failed to allocate an array. Disable a node layer\n");
                } else {
                        /* Node partition is 16 pages. */
                        vmap_zone_size = (1 << 4) * PAGE_SIZE;
                        nr_vmap_nodes = nr;
                        vmap_nodes = node;
                }
        }
#endif

        for (nr = 0; nr < nr_vmap_nodes; nr++) {
                node = &vmap_nodes[nr];

                /* Busy areas. */
                node->busy.root = RB_ROOT;
                INIT_LIST_HEAD(&node->busy.head);
                spin_lock_init(&node->busy.lock);

                /* Lazily freed areas. */
                node->lazy.root = RB_ROOT;
                INIT_LIST_HEAD(&node->lazy.head);
                spin_lock_init(&node->lazy.lock);

                /* Pools: MAX_VA_SIZE_PAGES entries, each starting out empty. */
                for (slot = 0; slot < MAX_VA_SIZE_PAGES; slot++) {
                        INIT_LIST_HEAD(&node->pool[slot].head);
                        WRITE_ONCE(node->pool[slot].len, 0);
                }

                spin_lock_init(&node->pool_lock);
        }
}

static unsigned long
vmap_node_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long count;
        struct vmap_node *vn;
        int i, j;

        for (count = 0, i = 0; i < nr_vmap_nodes; i++) {
                vn = &vmap_nodes[i];

                for (j = 0; j < MAX_VA_SIZE_PAGES; j++)
                        count += READ_ONCE(vn->pool[j].len);
        }

        return count ? count : SHRINK_EMPTY;
}

/*
 * Shrinker "scan" callback: run decay_va_pool_node() on every vmap node
 * with its second argument set to true, then return SHRINK_STOP.
 */
static unsigned long
vmap_node_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        int node = 0;

        while (node < nr_vmap_nodes) {
                decay_va_pool_node(&vmap_nodes[node], true);
                node++;
        }

        return SHRINK_STOP;
}

/*
 * Boot-time setup of the vmalloc subsystem: create the vmap_area cache,
 * initialise the per-CPU state and the vmap nodes, import the early
 * vmlist into the busy trees, build the free space and register the
 * "vmap-node" shrinker.
 */
void __init vmalloc_init(void)
{
        struct shrinker *shrinker;
        struct vm_struct *early;
        struct vmap_area *va;
        int cpu;

        /* Cache for vmap_area objects, created with SLAB_PANIC. */
        vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vfree_deferred *deferred = &per_cpu(vfree_deferred, cpu);

                /* Per-CPU vmap block queue. */
                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
                xa_init(&vbq->vmap_blocks);

                /* Per-CPU deferred vfree list and its work item. */
                init_llist_head(&deferred->list);
                INIT_WORK(&deferred->wq, delayed_vfree_work);
        }

        /* The nodes must exist before vmlist entries are inserted into them. */
        vmap_init_nodes();

        /* Turn each existing vmlist entry into a busy vmap_area. */
        for (early = vmlist; early; early = early->next) {
                struct vmap_node *vn;

                va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
                if (WARN_ON_ONCE(!va))
                        continue;

                va->va_start = (unsigned long)early->addr;
                va->va_end = va->va_start + early->size;
                va->vm = early;

                vn = addr_to_node(va->va_start);
                insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
        }

        /* With the busy areas known, the free space can be derived. */
        vmap_init_free_space();
        vmap_initialized = true;

        shrinker = shrinker_alloc(0, "vmap-node");
        if (!shrinker) {
                pr_err("Failed to allocate vmap-node shrinker!\n");
                return;
        }

        shrinker->count_objects = vmap_node_shrink_count;
        shrinker->scan_objects = vmap_node_shrink_scan;
        shrinker_register(shrinker);
}














































































































































































































































































































































































































































































  437 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kmem

#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KMEM_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <trace/events/mmflags.h>

/*
 * kmem_cache_alloc - trace event for an allocation from a kmem_cache.
 *
 * Records the call site, the returned pointer, the gfp flags and the
 * node. bytes_req is taken from the cache's object_size and bytes_alloc
 * from its size. "accounted" is evaluated when the event is recorded:
 * it is false unless CONFIG_MEMCG is enabled, in which case it is true
 * when __GFP_ACCOUNT is set in @gfp_flags or SLAB_ACCOUNT is set in the
 * cache flags.
 */
TRACE_EVENT(kmem_cache_alloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 struct kmem_cache *s,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, s, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
                __field(        bool,                accounted        )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = s->object_size;
                __entry->bytes_alloc        = s->size;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
                __entry->accounted        = IS_ENABLED(CONFIG_MEMCG) ?
                                          ((gfp_flags & __GFP_ACCOUNT) ||
                                          (s->flags & SLAB_ACCOUNT)) : false;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                __entry->accounted ? "true" : "false")
);

/*
 * kmalloc - trace event for a kmalloc-style allocation.
 *
 * Records the call site, the returned pointer, the requested and the
 * allocated byte counts, the gfp flags and the node, all as passed in.
 * Unlike kmem_cache_alloc, no "accounted" field is stored: the printed
 * value is derived at print time from the recorded gfp flags
 * (__GFP_ACCOUNT) and CONFIG_MEMCG.
 */
TRACE_EVENT(kmalloc,

        TP_PROTO(unsigned long call_site,
                 const void *ptr,
                 size_t bytes_req,
                 size_t bytes_alloc,
                 gfp_t gfp_flags,
                 int node),

        TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __field(        size_t,                bytes_req        )
                __field(        size_t,                bytes_alloc        )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                node                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __entry->bytes_req        = bytes_req;
                __entry->bytes_alloc        = bytes_alloc;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->node                = node;
        ),

        TP_printk("call_site=%pS ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d accounted=%s",
                (void *)__entry->call_site,
                __entry->ptr,
                __entry->bytes_req,
                __entry->bytes_alloc,
                show_gfp_flags(__entry->gfp_flags),
                __entry->node,
                (IS_ENABLED(CONFIG_MEMCG) &&
                 (__entry->gfp_flags & (__force unsigned long)__GFP_ACCOUNT)) ? "true" : "false")
);

/*
 * kfree - trace event for a kfree-style release.
 *
 * Records only the call site and the pointer being freed.
 */
TRACE_EVENT(kfree,

        TP_PROTO(unsigned long call_site, const void *ptr),

        TP_ARGS(call_site, ptr),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
        ),

        TP_printk("call_site=%pS ptr=%p",
                  (void *)__entry->call_site, __entry->ptr)
);

/*
 * kmem_cache_free - trace event for returning an object to a kmem_cache.
 *
 * Records the call site, the pointer being freed and the cache name
 * (s->name), which is stored in the event as a string field.
 */
TRACE_EVENT(kmem_cache_free,

        TP_PROTO(unsigned long call_site, const void *ptr, const struct kmem_cache *s),

        TP_ARGS(call_site, ptr, s),

        TP_STRUCT__entry(
                __field(        unsigned long,        call_site        )
                __field(        const void *,        ptr                )
                __string(        name,                s->name                )
        ),

        TP_fast_assign(
                __entry->call_site        = call_site;
                __entry->ptr                = ptr;
                __assign_str(name);
        ),

        TP_printk("call_site=%pS ptr=%p name=%s",
                  (void *)__entry->call_site, __entry->ptr, __get_str(name))
);

/*
 * mm_page_free - trace event for freeing a page of a given order.
 *
 * The page is recorded by its pfn; the struct page pointer shown in the
 * output is recomputed from that pfn when the event is printed. @page is
 * converted unconditionally, so it is not expected to be NULL here.
 */
TRACE_EVENT(mm_page_free,

        TP_PROTO(struct page *page, unsigned int order),

        TP_ARGS(page, order),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
                __entry->order                = order;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn,
                        __entry->order)
);

/*
 * mm_page_free_batched - trace event for a page freed as part of a batch.
 *
 * Only the pfn is recorded. The printed "order=0" is a literal in the
 * format string, not a recorded field.
 */
TRACE_EVENT(mm_page_free_batched,

        TP_PROTO(struct page *page),

        TP_ARGS(page),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
        ),

        TP_fast_assign(
                __entry->pfn                = page_to_pfn(page);
        ),

        TP_printk("page=%p pfn=0x%lx order=0",
                        pfn_to_page(__entry->pfn),
                        __entry->pfn)
);

/*
 * mm_page_alloc - trace event for a page allocation attempt.
 *
 * Records the pfn, order, gfp flags and migratetype. A NULL @page is
 * accepted: it is stored as pfn -1UL and printed as a NULL page pointer
 * with pfn 0.
 */
TRACE_EVENT(mm_page_alloc,

        TP_PROTO(struct page *page, unsigned int order,
                        gfp_t gfp_flags, int migratetype),

        TP_ARGS(page, order, gfp_flags, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        unsigned long,        gfp_flags        )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->gfp_flags        = (__force unsigned long)gfp_flags;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d gfp_flags=%s",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                show_gfp_flags(__entry->gfp_flags))
);

/*
 * mm_page - event class shared by page events that carry a pfn, an
 * order, a migratetype and a percpu_refill value.
 *
 * A NULL @page is accepted: it is stored as pfn -1UL and printed as a
 * NULL page pointer with pfn 0.
 */
DECLARE_EVENT_CLASS(mm_page,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
                __field(        int,                percpu_refill        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
                __entry->percpu_refill        = percpu_refill;
        ),

        TP_printk("page=%p pfn=0x%lx order=%u migratetype=%d percpu_refill=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order,
                __entry->migratetype,
                __entry->percpu_refill)
);

/*
 * mm_page_alloc_zone_locked - event instance of the mm_page class; it
 * uses the class's prototype, fields and output format unchanged.
 */
DEFINE_EVENT(mm_page, mm_page_alloc_zone_locked,

        TP_PROTO(struct page *page, unsigned int order, int migratetype,
                 int percpu_refill),

        TP_ARGS(page, order, migratetype, percpu_refill)
);

/*
 * mm_page_pcpu_drain - trace event for a page drained from a per-cpu list.
 *
 * Records the pfn, order and migratetype. A NULL @page is accepted: it
 * is stored as pfn -1UL. The print side checks for that sentinel and
 * shows a NULL page pointer with pfn 0, rather than handing -1UL to
 * pfn_to_page(); this matches the mm_page_alloc event and the mm_page
 * event class. Recorded fields are unchanged.
 */
TRACE_EVENT(mm_page_pcpu_drain,

        TP_PROTO(struct page *page, unsigned int order, int migratetype),

        TP_ARGS(page, order, migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                )
                __field(        unsigned int,        order                )
                __field(        int,                migratetype        )
        ),

        TP_fast_assign(
                __entry->pfn                = page ? page_to_pfn(page) : -1UL;
                __entry->order                = order;
                __entry->migratetype        = migratetype;
        ),

        TP_printk("page=%p pfn=0x%lx order=%d migratetype=%d",
                __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL,
                __entry->pfn != -1UL ? __entry->pfn : 0,
                __entry->order, __entry->migratetype)
);

/*
 * mm_page_alloc_extfrag - records the requested and fallback order and
 * migratetype for a page, plus its PFN.
 *
 * change_ownership is computed when the event fires: it is 1 when
 * @alloc_migratetype equals get_pageblock_migratetype(@page) at that moment.
 * The printed "fragmenting" value is derived at print time as
 * fallback_order < pageblock_order.
 *
 * Unlike the mm_page class, @page is dereferenced unconditionally here.
 */
TRACE_EVENT(mm_page_alloc_extfrag,

        TP_PROTO(struct page *page,
                int alloc_order, int fallback_order,
                int alloc_migratetype, int fallback_migratetype),

        TP_ARGS(page,
                alloc_order, fallback_order,
                alloc_migratetype, fallback_migratetype),

        TP_STRUCT__entry(
                __field(        unsigned long,        pfn                        )
                __field(        int,                alloc_order                )
                __field(        int,                fallback_order                )
                __field(        int,                alloc_migratetype        )
                __field(        int,                fallback_migratetype        )
                __field(        int,                change_ownership        )
        ),

        TP_fast_assign(
                __entry->pfn                        = page_to_pfn(page);
                __entry->alloc_order                = alloc_order;
                __entry->fallback_order                = fallback_order;
                __entry->alloc_migratetype        = alloc_migratetype;
                __entry->fallback_migratetype        = fallback_migratetype;
                __entry->change_ownership        = (alloc_migratetype ==
                                        get_pageblock_migratetype(page));
        ),

        TP_printk("page=%p pfn=0x%lx alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d",
                pfn_to_page(__entry->pfn),
                __entry->pfn,
                __entry->alloc_order,
                __entry->fallback_order,
                pageblock_order,
                __entry->alloc_migratetype,
                __entry->fallback_migratetype,
                __entry->fallback_order < pageblock_order,
                __entry->change_ownership)
);

/*
 * mm_alloc_contig_migrate_range_info - records a [start, end] pair, the
 * migratetype, and the nr_migrated / nr_reclaimed / nr_mapped counters
 * exactly as passed by the caller.
 *
 * Note that the printed field order (start, end, migratetype, counters)
 * differs from the argument order, where migratetype comes last.
 */
TRACE_EVENT(mm_alloc_contig_migrate_range_info,

        TP_PROTO(unsigned long start,
                 unsigned long end,
                 unsigned long nr_migrated,
                 unsigned long nr_reclaimed,
                 unsigned long nr_mapped,
                 int migratetype),

        TP_ARGS(start, end, nr_migrated, nr_reclaimed, nr_mapped, migratetype),

        TP_STRUCT__entry(
                __field(unsigned long, start)
                __field(unsigned long, end)
                __field(unsigned long, nr_migrated)
                __field(unsigned long, nr_reclaimed)
                __field(unsigned long, nr_mapped)
                __field(int, migratetype)
        ),

        TP_fast_assign(
                __entry->start = start;
                __entry->end = end;
                __entry->nr_migrated = nr_migrated;
                __entry->nr_reclaimed = nr_reclaimed;
                __entry->nr_mapped = nr_mapped;
                __entry->migratetype = migratetype;
        ),

        TP_printk("start=0x%lx end=0x%lx migratetype=%d nr_migrated=%lu nr_reclaimed=%lu nr_mapped=%lu",
                  __entry->start,
                  __entry->end,
                  __entry->migratetype,
                  __entry->nr_migrated,
                  __entry->nr_reclaimed,
                  __entry->nr_mapped)
);

/*
 * mm_setup_per_zone_wmarks - snapshots a zone's watermarks.
 *
 * Records the node id (from zone->zone_pgdat), a copy of zone->name, and the
 * WMARK_MIN, WMARK_LOW, WMARK_HIGH and WMARK_PROMO entries of
 * zone->_watermark[] as they are when the event fires.
 */
TRACE_EVENT(mm_setup_per_zone_wmarks,

        TP_PROTO(struct zone *zone),

        TP_ARGS(zone),

        TP_STRUCT__entry(
                __field(int, node_id)
                __string(name, zone->name)
                __field(unsigned long, watermark_min)
                __field(unsigned long, watermark_low)
                __field(unsigned long, watermark_high)
                __field(unsigned long, watermark_promo)
        ),

        TP_fast_assign(
                __entry->node_id = zone->zone_pgdat->node_id;
                __assign_str(name);
                __entry->watermark_min = zone->_watermark[WMARK_MIN];
                __entry->watermark_low = zone->_watermark[WMARK_LOW];
                __entry->watermark_high = zone->_watermark[WMARK_HIGH];
                __entry->watermark_promo = zone->_watermark[WMARK_PROMO];
        ),

        TP_printk("node_id=%d zone name=%s watermark min=%lu low=%lu high=%lu promo=%lu",
                  __entry->node_id,
                  __get_str(name),
                  __entry->watermark_min,
                  __entry->watermark_low,
                  __entry->watermark_high,
                  __entry->watermark_promo)
);

/*
 * mm_setup_per_zone_lowmem_reserve - records a lowmem reserve value for a
 * (zone, upper_zone) pair.
 *
 * The node id is taken from @zone (not @upper_zone); both zone names are
 * copied into the record, and @lowmem_reserve is printed as
 * "lowmem_reserve_pages".
 */
TRACE_EVENT(mm_setup_per_zone_lowmem_reserve,

        TP_PROTO(struct zone *zone, struct zone *upper_zone, long lowmem_reserve),

        TP_ARGS(zone, upper_zone, lowmem_reserve),

        TP_STRUCT__entry(
                __field(int, node_id)
                __string(name, zone->name)
                __string(upper_name, upper_zone->name)
                __field(long, lowmem_reserve)
        ),

        TP_fast_assign(
                __entry->node_id = zone->zone_pgdat->node_id;
                __assign_str(name);
                __assign_str(upper_name);
                __entry->lowmem_reserve = lowmem_reserve;
        ),

        TP_printk("node_id=%d zone name=%s upper_zone name=%s lowmem_reserve_pages=%ld",
                  __entry->node_id,
                  __get_str(name),
                  __get_str(upper_name),
                  __entry->lowmem_reserve)
);

/*
 * mm_calculate_totalreserve_pages - records the single totalreserve_pages
 * value supplied by the caller and prints it as an unsigned decimal.
 */
TRACE_EVENT(mm_calculate_totalreserve_pages,

        TP_PROTO(unsigned long totalreserve_pages),

        TP_ARGS(totalreserve_pages),

        TP_STRUCT__entry(
                __field(unsigned long, totalreserve_pages)
        ),

        TP_fast_assign(
                __entry->totalreserve_pages = totalreserve_pages;
        ),

        TP_printk("totalreserve_pages=%lu", __entry->totalreserve_pages)
);


/*
 * Required for uniquely and securely identifying mm in rss_stat tracepoint.
 */
#ifndef __PTR_TO_HASHVAL
static unsigned int __maybe_unused mm_ptr_to_hash(const void *ptr)
{
        unsigned long hashed;

        /* A non-zero status from ptr_to_hashval() is reported as id 0. */
        if (ptr_to_hashval(ptr, &hashed) != 0)
                return 0;

        /* The hashed value is only 32 bits wide, so narrowing is fine. */
        return (unsigned int)hashed;
}
#define __PTR_TO_HASHVAL
#endif

/*
 * List of the rss counter kinds.  The list is written once in terms of
 * EM()/EMe() (EMe marks the last entry) and expanded twice below with
 * different definitions of those two macros.
 */
#define TRACE_MM_PAGES                \
        EM(MM_FILEPAGES)        \
        EM(MM_ANONPAGES)        \
        EM(MM_SWAPENTS)                \
        EMe(MM_SHMEMPAGES)

#undef EM
#undef EMe

/* First expansion: emit one TRACE_DEFINE_ENUM() per list entry. */
#define EM(a)        TRACE_DEFINE_ENUM(a);
#define EMe(a)        TRACE_DEFINE_ENUM(a);

TRACE_MM_PAGES

#undef EM
#undef EMe

/*
 * Second definition: each entry becomes a { value, "name" } pair, comma
 * separated except after the last one.  Left defined so that TRACE_MM_PAGES
 * can be used as a __print_symbolic() table further down.
 */
#define EM(a)        { a, #a },
#define EMe(a)        { a, #a }

/*
 * rss_stat - records one rss counter of an mm.
 *
 * mm_id is the value returned by mm_ptr_to_hash(@mm) rather than the raw
 * pointer; curr is 1 when @mm is current->mm.  size is
 * percpu_counter_sum_positive() of mm->rss_stat[@member] shifted left by
 * PAGE_SHIFT, and is printed with a "B" suffix.  @member is printed
 * symbolically through the TRACE_MM_PAGES table.
 */
TRACE_EVENT(rss_stat,

        TP_PROTO(struct mm_struct *mm,
                int member),

        TP_ARGS(mm, member),

        TP_STRUCT__entry(
                __field(unsigned int, mm_id)
                __field(unsigned int, curr)
                __field(int, member)
                __field(long, size)
        ),

        TP_fast_assign(
                __entry->mm_id = mm_ptr_to_hash(mm);
                __entry->curr = !!(current->mm == mm);
                __entry->member = member;
                __entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
                                                            << PAGE_SHIFT);
        ),

        TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
                __entry->mm_id,
                __entry->curr,
                __print_symbolic(__entry->member, TRACE_MM_PAGES),
                __entry->size)
        );
#endif /* _TRACE_KMEM_H */

/* This part must be outside protection */
#include <trace/define_trace.h>






























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __SHMEM_FS_H
#define __SHMEM_FS_H

#include <linux/file.h>
#include <linux/swap.h>
#include <linux/mempolicy.h>
#include <linux/pagemap.h>
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
#include <linux/userfaultfd_k.h>

/* inode in-kernel data */

#ifdef CONFIG_TMPFS_QUOTA
#define SHMEM_MAXQUOTAS 2
#endif

/*
 * shmem-private inode data.  The VFS inode is embedded as the last member
 * (vfs_inode); SHMEM_I() recovers this structure from a pointer to it.
 */
struct shmem_inode_info {
        spinlock_t                lock;
        unsigned int                seals;                /* shmem seals */
        unsigned long                flags;
        unsigned long                alloced;        /* data pages alloced to file */
        unsigned long                swapped;        /* subtotal assigned to swap */
        /* dir_offsets shares its storage with the two list heads below */
        union {
            struct offset_ctx        dir_offsets;        /* stable directory offsets */
            struct {
                struct list_head shrinklist;        /* shrinkable hpage inodes */
                struct list_head swaplist;        /* chain of maybes on swap */
            };
        };
        struct timespec64        i_crtime;        /* file creation time */
        struct shared_policy        policy;                /* NUMA memory alloc policy */
        struct simple_xattrs        xattrs;                /* list of xattrs */
        pgoff_t                        fallocend;        /* highest fallocate endindex */
        unsigned int                fsflags;        /* for FS_IOC_[SG]ETFLAGS */
        atomic_t                stop_eviction;        /* hold when working on inode */
#ifdef CONFIG_TMPFS_QUOTA
        /* note: sized by MAXQUOTAS, not by SHMEM_MAXQUOTAS */
        struct dquot __rcu        *i_dquot[MAXQUOTAS];
#endif
        struct inode                vfs_inode;
};

/*
 * Inode flag masks used by shmem.  FS_CASEFOLD_FL is part of all three:
 * it is added to the generic user-visible set, and is listed explicitly in
 * both the user-modifiable and the inherited sets.
 */
#define SHMEM_FL_USER_VISIBLE                (FS_FL_USER_VISIBLE | FS_CASEFOLD_FL)
#define SHMEM_FL_USER_MODIFIABLE \
        (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL)
#define SHMEM_FL_INHERITED                (FS_NODUMP_FL | FS_NOATIME_FL | FS_CASEFOLD_FL)

/*
 * Default quota hard limits: one block limit and one inode limit each for
 * user quotas and for group quotas.
 */
struct shmem_quota_limits {
        qsize_t usrquota_bhardlimit; /* Default user quota block hard limit */
        qsize_t usrquota_ihardlimit; /* Default user quota inode hard limit */
        qsize_t grpquota_bhardlimit; /* Default group quota block hard limit */
        qsize_t grpquota_ihardlimit; /* Default group quota inode hard limit */
};

/*
 * shmem superblock-level data: block/inode limits and usage, mount
 * defaults for the root directory, inode-number allocation state, the
 * shrinklist of inodes, and the default quota limits.
 */
struct shmem_sb_info {
        unsigned long max_blocks;   /* How many blocks are allowed */
        struct percpu_counter used_blocks;  /* How many are allocated */
        unsigned long max_inodes;   /* How many inodes are allowed */
        unsigned long free_ispace;  /* How much ispace left for allocation */
        raw_spinlock_t stat_lock;   /* Serialize shmem_sb_info changes */
        umode_t mode;                    /* Mount mode for root directory */
        unsigned char huge;            /* Whether to try for hugepages */
        kuid_t uid;                    /* Mount uid for root directory */
        kgid_t gid;                    /* Mount gid for root directory */
        bool full_inums;            /* If i_ino should be uint or ino_t */
        bool noswap;                    /* ignores VM reclaim / swap requests */
        ino_t next_ino;                    /* The next per-sb inode number to use */
        ino_t __percpu *ino_batch;  /* The next per-cpu inode number to use */
        struct mempolicy *mpol;     /* default memory policy for mappings */
        spinlock_t shrinklist_lock;   /* Protects shrinklist */
        struct list_head shrinklist;  /* List of shrinkable inodes */
        unsigned long shrinklist_len; /* Length of shrinklist */
        struct shmem_quota_limits qlimits; /* Default quota limits */
};

/* Map a VFS inode back to the shmem_inode_info that embeds it. */
static inline struct shmem_inode_info *SHMEM_I(struct inode *inode)
{
        struct shmem_inode_info *info;

        info = container_of(inode, struct shmem_inode_info, vfs_inode);
        return info;
}

/*
 * Functions in mm/shmem.c called directly from elsewhere:
 */
extern const struct fs_parameter_spec shmem_fs_parameters[];
extern void shmem_init(void);
extern int shmem_init_fs_context(struct fs_context *fc);
extern struct file *shmem_file_setup(const char *name,
                                        loff_t size, unsigned long flags);
extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
                                            unsigned long flags);
extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
                const char *name, loff_t size, unsigned long flags);
extern int shmem_zero_setup(struct vm_area_struct *);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
/*
 * shmem_mapping() is a real function only when CONFIG_SHMEM is set;
 * otherwise this inline stub reports false for every mapping.
 */
#ifdef CONFIG_SHMEM
bool shmem_mapping(struct address_space *mapping);
#else
static inline bool shmem_mapping(struct address_space *mapping)
{
        return false;
}
#endif /* CONFIG_SHMEM */
extern void shmem_unlock_mapping(struct address_space *mapping);
extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                        pgoff_t index, gfp_t gfp_mask);
extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end);
int shmem_unuse(unsigned int type);

/*
 * Huge-page helpers.  Without CONFIG_TRANSPARENT_HUGEPAGE they collapse to
 * stubs: shmem_allowable_huge_orders() returns 0 and
 * shmem_hpage_pmd_enabled() returns false.
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long shmem_allowable_huge_orders(struct inode *inode,
                                struct vm_area_struct *vma, pgoff_t index,
                                loff_t write_end, bool shmem_huge_force);
bool shmem_hpage_pmd_enabled(void);
#else
static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
                                struct vm_area_struct *vma, pgoff_t index,
                                loff_t write_end, bool shmem_huge_force)
{
        return 0;
}

static inline bool shmem_hpage_pmd_enabled(void)
{
        return false;
}
#endif

/* Without CONFIG_SHMEM, shmem_swap_usage() is a stub that returns 0. */
#ifdef CONFIG_SHMEM
extern unsigned long shmem_swap_usage(struct vm_area_struct *vma);
#else
static inline unsigned long shmem_swap_usage(struct vm_area_struct *vma)
{
        return 0;
}
#endif
extern unsigned long shmem_partial_swap_usage(struct address_space *mapping,
                                                pgoff_t start, pgoff_t end);

/*
 * Flag allocation requirements to shmem_get_folio(), which takes one of
 * these values as its sgp argument.
 */
enum sgp_type {
        SGP_READ,        /* don't exceed i_size, don't allocate page */
        SGP_NOALLOC,        /* similar, but fail on hole or use fallocated page */
        SGP_CACHE,        /* don't exceed i_size, may allocate page */
        SGP_WRITE,        /* may exceed i_size, may allocate !Uptodate page */
        SGP_FALLOC,        /* like SGP_WRITE, but make existing page Uptodate */
};

int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
                struct folio **foliop, enum sgp_type sgp);
struct folio *shmem_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp);

/*
 * Convenience form of shmem_read_folio_gfp() that uses the mapping's own
 * gfp mask, as returned by mapping_gfp_mask().
 */
static inline struct folio *shmem_read_folio(struct address_space *mapping,
                pgoff_t index)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return shmem_read_folio_gfp(mapping, index, gfp);
}

/*
 * Convenience form of shmem_read_mapping_page_gfp() that uses the mapping's
 * own gfp mask, as returned by mapping_gfp_mask().
 */
static inline struct page *shmem_read_mapping_page(
                                struct address_space *mapping, pgoff_t index)
{
        gfp_t gfp_mask = mapping_gfp_mask(mapping);

        return shmem_read_mapping_page_gfp(mapping, index, gfp_mask);
}

/*
 * Report whether @file is backed by a shmem mapping.  Always false when
 * CONFIG_SHMEM is disabled, when @file is NULL, or when it has no f_mapping;
 * otherwise the answer comes from shmem_mapping().
 */
static inline bool shmem_file(struct file *file)
{
        return IS_ENABLED(CONFIG_SHMEM) &&
               file && file->f_mapping &&
               shmem_mapping(file->f_mapping);
}

/*
 * fallocate(FALLOC_FL_KEEP_SIZE) can leave pages beyond what i_size calls
 * EOF; fallocate has committed to reserving them, so split_huge_page() must
 * not delete them.  Only a single "fallocend" is kept per inode, which errs
 * on the side of keeping a reservation when in doubt: in plenty of cases it
 * preserves pages that were never reserved.
 *
 * Returns the larger of @eof and the inode's recorded fallocend.
 */
static inline pgoff_t shmem_fallocend(struct inode *inode, pgoff_t eof)
{
        pgoff_t reserved_end = SHMEM_I(inode)->fallocend;

        return max(eof, reserved_end);
}

extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);

/*
 * shmem_mfill_atomic_pte() only exists under CONFIG_USERFAULTFD.  With
 * CONFIG_SHMEM it is a real function; without it, the name is a macro that
 * executes BUG() and evaluates to 0, so any call site still compiles.
 */
#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
                                  struct vm_area_struct *dst_vma,
                                  unsigned long dst_addr,
                                  unsigned long src_addr,
                                  uffd_flags_t flags,
                                  struct folio **foliop);
#else /* !CONFIG_SHMEM */
#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
                               src_addr, flags, foliop) ({ BUG(); 0; })
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */

/*
 * Used space is stored as an unsigned 64-bit value in bytes, but the
 * quota core supports only signed 64-bit values, so the largest signed
 * 64-bit value is used as the limit.  The inode limit uses the same value.
 */
#define SHMEM_QUOTA_MAX_SPC_LIMIT 0x7fffffffffffffffLL /* 2^63-1 */
#define SHMEM_QUOTA_MAX_INO_LIMIT 0x7fffffffffffffffLL

#ifdef CONFIG_TMPFS_QUOTA
extern const struct dquot_operations shmem_quota_operations;
extern struct quota_format_type shmem_quota_format;
#endif /* CONFIG_TMPFS_QUOTA */

#endif

















































































































































































































































































































































































































































































































































































































































   23 






   13 

   11 








   24 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  NET  is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Ethernet handlers.
 *
 * Version:        @(#)eth.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *
 *                Relocated to include/linux where it belongs by Alan Cox
 *                                                        <gw4pts@gw4pts.ampr.org>
 */
#ifndef _LINUX_ETHERDEVICE_H
#define _LINUX_ETHERDEVICE_H

#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <linux/unaligned.h>
#include <asm/bitsperlong.h>

#ifdef __KERNEL__
struct device;
struct fwnode_handle;

int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr);
int platform_get_ethdev_address(struct device *dev, struct net_device *netdev);
unsigned char *arch_get_platform_mac_address(void);
int nvmem_get_mac_address(struct device *dev, void *addrbuf);
int device_get_mac_address(struct device *dev, char *addr);
int device_get_ethdev_address(struct device *dev, struct net_device *netdev);
int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr);

u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len);
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev);
extern const struct header_ops eth_header_ops;

int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
               const void *daddr, const void *saddr, unsigned len);
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh,
                     __be16 type);
void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev,
                             const unsigned char *haddr);
__be16 eth_header_parse_protocol(const struct sk_buff *skb);
int eth_prepare_mac_addr_change(struct net_device *dev, void *p);
void eth_commit_mac_addr_change(struct net_device *dev, void *p);
int eth_mac_addr(struct net_device *dev, void *p);
int eth_validate_addr(struct net_device *dev);

struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
                                            unsigned int rxqs);
/* alloc_etherdev(): one TX and one RX queue (alloc_etherdev_mq with count 1) */
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
/* alloc_etherdev_mq(): the same @count is used for both TX and RX queues */
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)

struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
                                           unsigned int txqs,
                                           unsigned int rxqs);
/* devm_alloc_etherdev(): devm_alloc_etherdev_mqs() with one TX and one RX queue */
#define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1)

struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb);
int eth_gro_complete(struct sk_buff *skb, int nhoff);

/*
 * Reserved Ethernet Addresses per IEEE 802.1Q.
 *
 * The 2-byte alignment allows the array to be read as 16-bit words, which
 * is how is_link_local_ether_addr() accesses it.
 */
static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 };
/* eth_stp_addr is simply another name for the reserved base address */
#define eth_stp_addr eth_reserved_addr_base

/* Base address 01:00:5e:00:00:00 (IPv4 multicast, per the array's name) */
static const u8 eth_ipv4_mcast_addr_base[ETH_ALEN] __aligned(2) =
{ 0x01, 0x00, 0x5e, 0x00, 0x00, 0x00 };

/* Base address 33:33:00:00:00:00 (IPv6 multicast, per the array's name) */
static const u8 eth_ipv6_mcast_addr_base[ETH_ALEN] __aligned(2) =
{ 0x33, 0x33, 0x00, 0x00, 0x00, 0x00 };

/**
 * is_link_local_ether_addr - Determine if given Ethernet address is link-local
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * The address is compared against eth_reserved_addr_base with the low four
 * bits of the last octet ignored.
 *
 * Return: true if address is link local reserved addr (01:80:c2:00:00:0X) per
 * IEEE 802.1Q 8.6.3 Frame filtering.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_link_local_ether_addr(const u8 *addr)
{
        __be16 *a = (__be16 *)addr;
        /* the reserved base address viewed as three 16-bit words */
        static const __be16 *b = (const __be16 *)eth_reserved_addr_base;
        /* keeps everything in the last word except the low nibble (the "X") */
        static const __be16 m = cpu_to_be16(0xfff0);

#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* first four octets in one 32-bit compare, then the masked last word */
        return (((*(const u32 *)addr) ^ (*(const u32 *)b)) |
                (__force int)((a[2] ^ b[2]) & m)) == 0;
#else
        /* word-by-word compare; only the last word is masked */
        return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0;
#endif
}

/**
 * is_zero_ether_addr - Determine if given Ethernet address is all zeros.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Return: true if the address is all zeroes.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_zero_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* one 32-bit load for octets 0-3, one 16-bit load for octets 4-5 */
        const u32 head = *(const u32 *)addr;
        const u16 tail = *(const u16 *)(addr + 4);

        return (head | tail) == 0;
#else
        /* three 16-bit loads OR-ed together */
        const u16 *words = (const u16 *)addr;

        return (words[0] | words[1] | words[2]) == 0;
#endif
}

/**
 * is_multicast_ether_addr - Determine if the Ethernet address is a multicast.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Tests bit 0x01 of the first octet.  The octet is obtained through a
 * 32-bit load (efficient unaligned access) or a 16-bit load (otherwise).
 *
 * Return: true if the address is a multicast address.
 * By definition the broadcast address is also a multicast address.
 */
static inline bool is_multicast_ether_addr(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 first_word = *(const u32 *)addr;
#else
        u16 first_word = *(const u16 *)addr;
#endif

#ifdef __BIG_ENDIAN
        /* the first octet sits in the most significant byte of the word */
        return (first_word >> ((sizeof(first_word) * 8) - 8)) & 0x01;
#else
        /* the first octet sits in the least significant byte of the word */
        return first_word & 0x01;
#endif
}

/**
 * is_multicast_ether_addr_64bits - multicast test using a 64-bit load
 * @addr: Pointer to the Ethernet address
 *
 * On 64-bit builds with efficient unaligned access this loads eight bytes
 * starting at @addr, so eight bytes must be readable there.  In all other
 * configurations it simply calls is_multicast_ether_addr().
 *
 * Return: true if bit 0x01 of the first octet is set.
 */
static inline bool is_multicast_ether_addr_64bits(const u8 *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 word = *(const u64 *)addr;

#ifdef __BIG_ENDIAN
        /* first octet is the most significant byte of the 64-bit word */
        return (word >> 56) & 0x01;
#else
        return word & 0x01;
#endif
#else
        return is_multicast_ether_addr(addr);
#endif
}

/**
 * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802).
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Tests bit 0x02 of the first octet.
 *
 * Return: true if the address is a local address.
 */
static inline bool is_local_ether_addr(const u8 *addr)
{
        return (addr[0] & 0x02) != 0;
}

/**
 * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * The three 16-bit words of the address are AND-ed together; the result is
 * 0xffff only when every bit of the address is set.
 *
 * Return: true if the address is the broadcast address.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_broadcast_ether_addr(const u8 *addr)
{
        const u16 *words = (const u16 *)addr;

        return (words[0] & words[1] & words[2]) == 0xffff;
}

/**
 * is_unicast_ether_addr - Determine if the Ethernet address is unicast
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Unicast is defined here as "not multicast".
 *
 * Return: true if the address is a unicast address.
 */
static inline bool is_unicast_ether_addr(const u8 *addr)
{
        if (is_multicast_ether_addr(addr))
                return false;

        return true;
}

/**
 * is_valid_ether_addr - Determine if the given Ethernet address is valid
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not
 * a multicast address, and is not FF:FF:FF:FF:FF:FF.
 *
 * Return: true if the address is valid.
 *
 * Please note: addr must be aligned to u16.
 */
static inline bool is_valid_ether_addr(const u8 *addr)
{
        /* Rejecting multicast also rejects FF:FF:FF:FF:FF:FF, which is a
         * multicast address, so broadcast needs no separate test. */
        if (is_multicast_ether_addr(addr))
                return false;

        return !is_zero_ether_addr(addr);
}

/**
 * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol
 * @proto: Ethertype/length value to be tested
 *
 * Check that the value from the Ethertype/length field is a valid Ethertype.
 * The comparison is done on the raw (network byte order) representation
 * against htons(ETH_P_802_3_MIN).
 *
 * Return: true if the value is an 802.3 supported Ethertype.
 */
static inline bool eth_proto_is_802_3(__be16 proto)
{
#ifndef __BIG_ENDIAN
        /* if CPU is little endian mask off bits representing LSB */
        proto &= htons(0xFF00);
#endif
        /* cast both to u16 and compare since LSB can be ignored */
        return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN);
}

/**
 * eth_random_addr - Generate software assigned random Ethernet address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Fill @addr with ETH_ALEN random bytes, then adjust the first octet so the
 * result is not multicast and has the local assigned bit set.
 */
static inline void eth_random_addr(u8 *addr)
{
        get_random_bytes(addr, ETH_ALEN);

        /* clear the multicast bit (0x01), set the local assignment bit
         * (0x02, IEEE802) */
        addr[0] = (addr[0] & 0xfe) | 0x02;
}

/**
 * eth_broadcast_addr - Assign broadcast address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Assign the broadcast address to the given address array, i.e. set all
 * ETH_ALEN bytes of @addr to 0xff.
 */
static inline void eth_broadcast_addr(u8 *addr)
{
        memset(addr, 0xff, ETH_ALEN);
}

/**
 * eth_zero_addr - Assign zero address
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Assign the zero address to the given address array, i.e. set all
 * ETH_ALEN bytes of @addr to 0x00.
 */
static inline void eth_zero_addr(u8 *addr)
{
        memset(addr, 0x00, ETH_ALEN);
}

/**
 * eth_hw_addr_random - Generate software assigned random Ethernet and
 * set device flag
 * @dev: pointer to net_device structure
 *
 * Generate a random Ethernet address (MAC) with eth_random_addr(), install
 * it on @dev, and set addr_assign_type to NET_ADDR_RANDOM so the state can
 * be read by sysfs and be used by userspace.
 */
static inline void eth_hw_addr_random(struct net_device *dev)
{
        u8 random_mac[ETH_ALEN];

        eth_random_addr(random_mac);
        __dev_addr_set(dev, random_mac, ETH_ALEN);

        /* record that this address was generated, not read from hardware */
        dev->addr_assign_type = NET_ADDR_RANDOM;
}

/**
 * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr
 * @ha: pointer to hardware address
 *
 * Calculate CRC from a hardware address as basis for filter hashes.
 *
 * Return: the result of ether_crc() over the ETH_ALEN bytes of @ha->addr.
 */
static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha)
{
        return ether_crc(ETH_ALEN, ha->addr);
}

/**
 * ether_addr_copy - Copy an Ethernet address
 * @dst: Pointer to a six-byte array Ethernet address destination
 * @src: Pointer to a six-byte array Ethernet address source
 *
 * Copies the six bytes as one 32-bit plus one 16-bit store when unaligned
 * access is efficient, and as three 16-bit stores otherwise.
 *
 * Please note: dst & src must both be aligned to u16.
 */
static inline void ether_addr_copy(u8 *dst, const u8 *src)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        u32 *dst_head = (u32 *)dst;
        u16 *dst_tail = (u16 *)(dst + 4);

        *dst_head = *(const u32 *)src;
        *dst_tail = *(const u16 *)(src + 4);
#else
        const u16 *from = (const u16 *)src;
        u16 *to = (u16 *)dst;

        to[0] = from[0];
        to[1] = from[1];
        to[2] = from[2];
#endif
}

/**
 * eth_hw_addr_set - Assign Ethernet address to a net_device
 * @dev: pointer to net_device structure
 * @addr: address to assign
 *
 * Assign given address to the net_device, addr_assign_type is not changed.
 * This is a thin wrapper that calls __dev_addr_set() with a length of
 * ETH_ALEN.
 */
static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr)
{
        __dev_addr_set(dev, addr, ETH_ALEN);
}

/**
 * eth_hw_addr_inherit - Copy dev_addr from another net_device
 * @dst: pointer to net_device to copy dev_addr to
 * @src: pointer to net_device to copy dev_addr from
 *
 * Copy the Ethernet address from one net_device to another along with
 * the address attributes (addr_assign_type).
 */
static inline void eth_hw_addr_inherit(struct net_device *dst,
                                       struct net_device *src)
{
        dst->addr_assign_type = src->addr_assign_type;
        eth_hw_addr_set(dst, src->dev_addr);
}

/**
 * ether_addr_equal - Compare two Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer other six-byte array containing the Ethernet address
 *
 * The addresses are XORed word by word and the results ORed together, so
 * the comparison needs no per-byte branching.
 *
 * Please note: addr1 & addr2 must both be aligned to u16.
 *
 * Return: true if both addresses are equal, false otherwise.
 */
static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        /* Bytes 0..3 as one 32-bit word, bytes 4..5 as one 16-bit word. */
        u32 head_diff = *(const u32 *)addr1 ^ *(const u32 *)addr2;
        u32 tail_diff = *(const u16 *)(addr1 + 4) ^ *(const u16 *)(addr2 + 4);

        return !(head_diff | tail_diff);
#else
        const u16 *lhs = (const u16 *)addr1;
        const u16 *rhs = (const u16 *)addr2;
        unsigned int diff = 0;

        /* Any differing bit in any of the three 16-bit words sets diff. */
        diff |= lhs[0] ^ rhs[0];
        diff |= lhs[1] ^ rhs[1];
        diff |= lhs[2] ^ rhs[2];

        return diff == 0;
#endif
}

/**
 * ether_addr_equal_64bits - Compare two Ethernet addresses
 * @addr1: Pointer to an array of 8 bytes
 * @addr2: Pointer to another array of 8 bytes
 *
 * Compare two Ethernet addresses, returns true if equal, false otherwise.
 *
 * The function doesn't need any conditional branches and possibly uses
 * word memory accesses on CPU allowing cheap unaligned memory reads.
 * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 }
 *
 * Please note that the alignment of addr1 & addr2 is only guaranteed to be
 * 16 bits.
 */
static inline bool ether_addr_equal_64bits(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const u64 lhs = *(const u64 *)addr1;
        const u64 rhs = *(const u64 *)addr2;
        u64 diff = lhs ^ rhs;

        /* Shift the two padding bytes out; which end they sit at is endian-dependent. */
#ifdef __BIG_ENDIAN
        diff >>= 16;
#else
        diff <<= 16;
#endif
        return diff == 0;
#else
        return ether_addr_equal(addr1, addr2);
#endif
}

/**
 * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses
 * @addr1: Pointer to a six-byte array containing the Ethernet address
 * @addr2: Pointer other six-byte array containing the Ethernet address
 *
 * When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined this defers to
 * ether_addr_equal(); otherwise it falls back to memcmp() over ETH_ALEN
 * bytes.
 *
 * Please note: Use only when any Ethernet address may not be u16 aligned.
 *
 * Return: true if both addresses are equal, false otherwise.
 */
static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
        return ether_addr_equal(addr1, addr2);
#else
        return !memcmp(addr1, addr2, ETH_ALEN);
#endif
}

/**
 * ether_addr_equal_masked - Compare two Ethernet addresses with a mask
 * @addr1: Pointer to a six-byte array containing the 1st Ethernet address
 * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address
 * @mask: Pointer to a six-byte array containing the Ethernet address bitmask
 *
 * Only bits that are set in @mask take part in the comparison; bits that
 * are clear in @mask are ignored. Using a mask with all bits set is a
 * slower ether_addr_equal.
 *
 * Return: true if every masked bit is equal in both addresses, false
 * as soon as one masked bit differs.
 */
static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
                                           const u8 *mask)
{
        unsigned int idx = 0;

        while (idx < ETH_ALEN) {
                u8 diff = addr1[idx] ^ addr2[idx];

                /* A differing bit only counts when the mask selects it. */
                if (diff & mask[idx])
                        return false;
                idx++;
        }

        return true;
}

/*
 * Test @addr against eth_ipv4_mcast_addr_base (defined outside this block).
 * The mask selects the first three octets plus the top bit of the fourth,
 * i.e. the leading 25 bits; the remaining 23 bits are ignored.
 */
static inline bool ether_addr_is_ipv4_mcast(const u8 *addr)
{
        u8 prefix_mask[ETH_ALEN] = { 0xff, 0xff, 0xff, 0x80, 0x00, 0x00 };
        bool matches;

        matches = ether_addr_equal_masked(addr, eth_ipv4_mcast_addr_base,
                                          prefix_mask);
        return matches;
}

/*
 * Test @addr against eth_ipv6_mcast_addr_base (defined outside this block).
 * The mask selects only the first two octets; the remaining four octets
 * are ignored.
 */
static inline bool ether_addr_is_ipv6_mcast(const u8 *addr)
{
        u8 prefix_mask[ETH_ALEN] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 };
        bool matches;

        matches = ether_addr_equal_masked(addr, eth_ipv6_mcast_addr_base,
                                          prefix_mask);
        return matches;
}

/*
 * True when @addr passes either the IPv4 or the IPv6 multicast prefix test.
 * The IPv6 test is only evaluated when the IPv4 test fails.
 */
static inline bool ether_addr_is_ip_mcast(const u8 *addr)
{
        if (ether_addr_is_ipv4_mcast(addr))
                return true;

        return ether_addr_is_ipv6_mcast(addr);
}

/**
 * ether_addr_to_u64 - Convert an Ethernet address into a u64 value.
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * The first octet of @addr ends up in the most significant of the
 * ETH_ALEN bytes used; the last octet ends up in the least significant
 * byte.
 *
 * Return: a u64 value of the address
 */
static inline u64 ether_addr_to_u64(const u8 *addr)
{
        const u8 *octet = addr;
        const u8 *end = addr + ETH_ALEN;
        u64 value = 0;

        /* Shift in one octet at a time, most significant first. */
        while (octet < end)
                value = (value << 8) | *octet++;

        return value;
}

/**
 * u64_to_ether_addr - Convert a u64 to an Ethernet address.
 * @u: u64 to convert to an Ethernet MAC address
 * @addr: Pointer to a six-byte array to contain the Ethernet address
 *
 * Only the low ETH_ALEN bytes of @u are used; the least significant byte
 * is stored in the last octet of @addr.
 */
static inline void u64_to_ether_addr(u64 u, u8 *addr)
{
        u8 *octet = addr + ETH_ALEN;

        /* Fill from the last octet backwards, peeling off the low byte. */
        while (octet != addr) {
                *--octet = u & 0xff;
                u >>= 8;
        }
}

/**
 * eth_addr_dec - Decrement the given MAC address
 * @addr: Pointer to a six-byte array containing Ethernet address to decrement
 *
 * The address is converted to a u64, decremented by one and written back
 * to @addr in place.
 */
static inline void eth_addr_dec(u8 *addr)
{
        u64_to_ether_addr(ether_addr_to_u64(addr) - 1, addr);
}

/**
 * eth_addr_inc() - Increment the given MAC address.
 * @addr: Pointer to a six-byte array containing Ethernet address to increment.
 *
 * The address is converted to a u64, incremented by one and written back
 * to @addr in place.
 */
static inline void eth_addr_inc(u8 *addr)
{
        u64_to_ether_addr(ether_addr_to_u64(addr) + 1, addr);
}

/**
 * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address.
 * @addr: Pointer to a six-byte array containing Ethernet address to modify.
 * @offset: Offset to add; a negative value subtracts.
 *
 * The address is converted to a u64, @offset is added and the result is
 * written back to @addr in place.
 */
static inline void eth_addr_add(u8 *addr, long offset)
{
        u64 base = ether_addr_to_u64(addr);

        u64_to_ether_addr(base + offset, addr);
}

/**
 * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
 * @dev: Pointer to a device structure
 * @addr: Pointer to a six-byte array containing the Ethernet address
 *
 * Compare passed address with all addresses of the device. Return true if the
 * address is one of the device addresses.
 *
 * The walk over the device addresses runs inside an rcu_read_lock() /
 * rcu_read_unlock() pair.
 *
 * Note that this function calls ether_addr_equal_64bits() so take care of
 * the right padding.
 */
static inline bool is_etherdev_addr(const struct net_device *dev,
                                    const u8 addr[6 + 2])
{
        struct netdev_hw_addr *ha;
        bool found = false;

        rcu_read_lock();
        for_each_dev_addr(dev, ha) {
                /* Stop at the first matching entry. */
                if (ether_addr_equal_64bits(addr, ha->addr)) {
                        found = true;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}
#endif        /* __KERNEL__ */

/**
 * compare_ether_header - Compare two Ethernet headers
 * @a: Pointer to Ethernet header
 * @b: Pointer to Ethernet header
 *
 * Compare two Ethernet headers, returns 0 if equal.
 * This assumes that the network header (i.e., IP header) is 4-byte
 * aligned OR the platform can handle unaligned access.  This is the
 * case for all packets coming into netif_receive_skb or similar
 * entry points.
 */

static inline unsigned long compare_ether_header(const void *a, const void *b)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        unsigned long fold;

        /*
         * We want to compare 14 bytes:
         *  [a0 ... a13] ^ [b0 ... b13]
         * Use two long XOR, ORed together, with an overlap of two bytes.
         *  [a0  a1  a2  a3  a4  a5  a6  a7 ] ^ [b0  b1  b2  b3  b4  b5  b6  b7 ] |
         *  [a6  a7  a8  a9  a10 a11 a12 a13] ^ [b6  b7  b8  b9  b10 b11 b12 b13]
         * This means the [a6 a7] ^ [b6 b7] part is done two times.
        */
        fold = *(unsigned long *)a ^ *(unsigned long *)b;
        fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6);
        return fold;
#else
        u32 *a32 = (u32 *)((u8 *)a + 2);
        u32 *b32 = (u32 *)((u8 *)b + 2);

        return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) |
               (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]);
#endif
}

/**
 * eth_hw_addr_gen - Generate and assign Ethernet address to a port
 * @dev: pointer to port's net_device structure
 * @base_addr: base Ethernet address
 * @id: offset to add to the base address
 *
 * Compute @base_addr + @id as a MAC address and assign it to @dev.
 * Commonly used by switch drivers which need to compute addresses for
 * all their ports. addr_assign_type is not changed.
 */
static inline void eth_hw_addr_gen(struct net_device *dev, const u8 *base_addr,
                                   unsigned int id)
{
        u8 port_addr[ETH_ALEN];
        u64 base = ether_addr_to_u64(base_addr);

        u64_to_ether_addr(base + id, port_addr);
        eth_hw_addr_set(dev, port_addr);
}

/**
 * eth_skb_pkt_type - Assign packet type if destination address does not match
 * @skb: Assigned a packet type if address does not match @dev address
 * @dev: Network device used to compare packet address against
 *
 * If the destination MAC address of the packet does not match the network
 * device address, assign an appropriate packet type:
 * PACKET_BROADCAST when the destination is a multicast address equal to
 * @dev->broadcast, PACKET_MULTICAST for any other multicast address and
 * PACKET_OTHERHOST otherwise. When the destination matches the device
 * address, skb->pkt_type is left untouched.
 */
static inline void eth_skb_pkt_type(struct sk_buff *skb,
                                    const struct net_device *dev)
{
        const struct ethhdr *eth = eth_hdr(skb);

        if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) {
                if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest)))
                        skb->pkt_type =
                                ether_addr_equal_64bits(eth->h_dest,
                                                        dev->broadcast) ?
                                PACKET_BROADCAST : PACKET_MULTICAST;
                else
                        skb->pkt_type = PACKET_OTHERHOST;
        }
}

/*
 * Remember where skb->data currently points, pull ETH_HLEN bytes off the
 * buffer with skb_pull_inline(), and return the remembered position as an
 * Ethernet header pointer.
 */
static inline struct ethhdr *eth_skb_pull_mac(struct sk_buff *skb)
{
        void *mac_start = skb->data;

        skb_pull_inline(skb, ETH_HLEN);

        return (struct ethhdr *)mac_start;
}

/**
 * eth_skb_pad - Pad buffer to minimum number of octets for Ethernet frame
 * @skb: Buffer to pad
 *
 * An Ethernet frame should have a minimum size of 60 bytes.  This function
 * takes short frames and pads them with zeros up to the 60 byte limit.
 * The work is delegated to skb_put_padto() with ETH_ZLEN as the target
 * length.
 *
 * Return: whatever skb_put_padto() returns.
 */
static inline int eth_skb_pad(struct sk_buff *skb)
{
        return skb_put_padto(skb, ETH_ZLEN);
}

#endif        /* _LINUX_ETHERDEVICE_H */




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_NF_TABLES_H
#define _NET_NF_TABLES_H

#include <linux/unaligned.h>
#include <linux/list.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/u64_stats_sync.h>
#include <linux/rhashtable.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netlink.h>
#include <net/flow_offload.h>
#include <net/netns/generic.h>

/* One more than NF_INET_INGRESS, the highest hook number used here. */
#define NFT_MAX_HOOKS        (NF_INET_INGRESS + 1)

/* Forward declaration; only pointers to struct module are needed here. */
struct module;

/* NOTE(review): presumably the maximum chain jump nesting depth; confirm. */
#define NFT_JUMP_STACK_SIZE        16

/*
 * Single-bit flag values.
 * NOTE(review): presumably stored in struct nft_pktinfo's flags field;
 * confirm against the users of these constants.
 */
enum {
        NFT_PKTINFO_L4PROTO        = (1 << 0),
        NFT_PKTINFO_INNER        = (1 << 1),
        NFT_PKTINFO_INNER_FULL        = (1 << 2),
};

/**
 *        struct nft_pktinfo - packet information passed to nf_tables helpers
 *
 *        @skb: the packet buffer, set by nft_set_pktinfo()
 *        @state: netfilter hook state, set by nft_set_pktinfo(); the
 *                nft_sk(), nft_net(), nft_hook(), nft_pf(), nft_in() and
 *                nft_out() accessors read from it
 *        @flags: cleared by nft_set_pktinfo_unspec(); presumably holds
 *                NFT_PKTINFO_* bits (to be confirmed)
 *        @tprot: cleared by nft_set_pktinfo_unspec(); presumably the
 *                transport protocol number (to be confirmed)
 *        @fragoff: cleared by nft_set_pktinfo_unspec(); presumably the
 *                  fragment offset (to be confirmed)
 *        @thoff: value returned by nft_thoff(); cleared by
 *                nft_set_pktinfo_unspec(); presumably the transport header
 *                offset (to be confirmed)
 *        @inneroff: not touched by the helpers in this part of the header;
 *                   presumably an inner header offset (to be confirmed)
 */
struct nft_pktinfo {
        struct sk_buff                        *skb;
        const struct nf_hook_state        *state;
        u8                                flags;
        u8                                tprot;
        u16                                fragoff;
        u16                                thoff;
        u16                                inneroff;
};

static inline struct sock *nft_sk(const struct nft_pktinfo *pkt)
{
        return pkt->state->sk;
}

/* The thoff field of @pkt, widened to unsigned int. */
static inline unsigned int nft_thoff(const struct nft_pktinfo *pkt)
{
        unsigned int offset = pkt->thoff;

        return offset;
}

static inline struct net *nft_net(const struct nft_pktinfo *pkt)
{
        return pkt->state->net;
}

static inline unsigned int nft_hook(const struct nft_pktinfo *pkt)
{
        return pkt->state->hook;
}

static inline u8 nft_pf(const struct nft_pktinfo *pkt)
{
        return pkt->state->pf;
}

static inline const struct net_device *nft_in(const struct nft_pktinfo *pkt)
{
        return pkt->state->in;
}

static inline const struct net_device *nft_out(const struct nft_pktinfo *pkt)
{
        return pkt->state->out;
}

/*
 * Attach @skb and @state to @pkt. Only these two pointers are written;
 * every other field of @pkt is left as it was.
 */
static inline void nft_set_pktinfo(struct nft_pktinfo *pkt,
                                   struct sk_buff *skb,
                                   const struct nf_hook_state *state)
{
        pkt->state = state;
        pkt->skb = skb;
}

/*
 * Reset the flags, tprot, fragoff and thoff fields of @pkt to zero.
 * The skb, state and inneroff fields are not modified.
 */
static inline void nft_set_pktinfo_unspec(struct nft_pktinfo *pkt)
{
        pkt->flags = pkt->tprot = 0;
        pkt->fragoff = pkt->thoff = 0;
}

/**
 *        struct nft_verdict - nf_tables verdict
 *
 *        @code: nf_tables/netfilter verdict code
 *        @chain: destination chain for NFT_JUMP/NFT_GOTO
 */
struct nft_verdict {
        u32                                code;
        struct nft_chain                *chain;
};

/**
 *        struct nft_data - data value or verdict
 *
 *        @data: four 32-bit words of data
 *        @verdict: verdict; shares storage with @data through the
 *                  anonymous union
 *
 *        The structure is aligned to the alignment of u64.
 */
struct nft_data {
        union {
                u32                        data[4];
                struct nft_verdict        verdict;
        };
} __attribute__((aligned(__alignof__(u64))));

/* Number of 32-bit entries in struct nft_regs' data array. */
#define NFT_REG32_NUM                20

/**
 *        struct nft_regs - nf_tables register set
 *
 *        @data: data registers, NFT_REG32_NUM words of 32 bits each
 *        @verdict: verdict register
 *
 *        @data and @verdict share storage through the anonymous union:
 *        the first four data registers alias to the verdict register.
 */
struct nft_regs {
        union {
                u32                        data[NFT_REG32_NUM];
                struct nft_verdict        verdict;
        };
};

/*
 * Per-register bookkeeping with one entry for each of the NFT_REG32_NUM
 * 32-bit registers, plus two expression pointers (cur, last).
 * NOTE(review): presumably used to track which expression last loaded
 * each register so redundant loads can be detected; the users are not
 * visible in this part of the header, so confirm before relying on it.
 */
struct nft_regs_track {
        struct {
                const struct nft_expr                *selector;
                const struct nft_expr                *bitwise;
                u8                                num_reg;
        } regs[NFT_REG32_NUM];

        const struct nft_expr                        *cur;
        const struct nft_expr                        *last;
};

/* Store/load a u8, u16 or u64 integer to/from the u32 data register.
 *
 * Note, when using concatenations, register allocation happens at 32-bit
 * level. So for store instructions, pad the rest of the register with zeroes
 * to avoid garbage values.
 */

/*
 * Store @val in the first byte (lowest address) of the 32-bit register
 * @dreg; the whole register is zeroed first so the other bytes are 0.
 */
static inline void nft_reg_store8(u32 *dreg, u8 val)
{
        u8 *first_byte = (u8 *)dreg;

        *dreg = 0;
        *first_byte = val;
}

/* Read back the first byte (lowest address) of the 32-bit register @sreg. */
static inline u8 nft_reg_load8(const u32 *sreg)
{
        const u8 *first_byte = (const u8 *)sreg;

        return *first_byte;
}

/*
 * Store @val in the first two bytes (lowest addresses) of the 32-bit
 * register @dreg; the whole register is zeroed first so the other bytes
 * are 0.
 */
static inline void nft_reg_store16(u32 *dreg, u16 val)
{
        u16 *first_half = (u16 *)dreg;

        *dreg = 0;
        *first_half = val;
}

/*
 * Big-endian typed variant of nft_reg_store16(): the value is stored
 * as-is, only the endianness annotation is dropped with a __force cast.
 */
static inline void nft_reg_store_be16(u32 *dreg, __be16 val)
{
        __u16 raw = (__force __u16)val;

        nft_reg_store16(dreg, raw);
}

/* Read back the first two bytes (lowest addresses) of the register @sreg. */
static inline u16 nft_reg_load16(const u32 *sreg)
{
        const u16 *first_half = (const u16 *)sreg;

        return *first_half;
}

static inline __be16 nft_reg_load_be16(const u32 *sreg)
{
        return (__force __be16)nft_reg_load16(sreg);
}

/*
 * Read the whole 32-bit register @sreg, reinterpreted as a __be32. No byte
 * swapping takes place; the __force cast only changes the annotated type.
 */
static inline __be32 nft_reg_load_be32(const u32 *sreg)
{
        return *(__force __be32 *)sreg;
}

/*
 * Store the 64-bit value @val at @dreg using put_unaligned(), so @dreg is
 * not required to be 8-byte aligned.
 */
static inline void nft_reg_store64(u64 *dreg, u64 val)
{
        put_unaligned(val, dreg);
}

/*
 * Load a 64-bit value starting at the 32-bit register @sreg using
 * get_unaligned(), so @sreg is not required to be 8-byte aligned.
 */
static inline u64 nft_reg_load64(const u32 *sreg)
{
        return get_unaligned((u64 *)sreg);
}

/*
 * Copy @len bytes from @src into the registers starting at @dst.
 *
 * When @len is not a multiple of NFT_REG32_SIZE, the last register that is
 * only partially covered is zeroed before the copy, so the bytes the copy
 * does not reach hold zero instead of stale data.
 */
static inline void nft_data_copy(u32 *dst, const struct nft_data *src,
                                 unsigned int len)
{
        unsigned int partial = len % NFT_REG32_SIZE;

        if (partial != 0)
                dst[len / NFT_REG32_SIZE] = 0;

        memcpy(dst, src, len);
}

/**
 *        struct nft_ctx - nf_tables rule/set context
 *
 *        @net: net namespace
 *        @table: the table the chain is contained in
 *        @chain: the chain the rule is contained in
 *        @nla: netlink attributes
 *        @portid: netlink portID of the original message
 *        @seq: netlink sequence number
 *        @flags: modifiers to new request
 *        @family: protocol family
 *        @level: depth of the chains
 *        @report: notify via unicast netlink message
 *        @reg_inited: bitmap of initialised registers, one bit for each of
 *                     the NFT_REG32_NUM 32-bit registers
 */
struct nft_ctx {
        struct net                        *net;
        struct nft_table                *table;
        struct nft_chain                *chain;
        const struct nlattr * const         *nla;
        u32                                portid;
        u32                                seq;
        u16                                flags;
        u8                                family;
        u8                                level;
        bool                                report;
        DECLARE_BITMAP(reg_inited, NFT_REG32_NUM);
};

/*
 * Bit flags.
 * NOTE(review): presumably stored in struct nft_data_desc's flags field;
 * NFT_DATA_DESC_SETELEM presumably marks data belonging to a set element.
 * Confirm against nft_data_init().
 */
enum nft_data_desc_flags {
        NFT_DATA_DESC_SETELEM        = (1 << 0),
};

/*
 * Descriptor passed to nft_data_init() alongside a struct nft_data.
 * NOTE(review): field meanings are inferred from their names (type: kind
 * of data, size: buffer size, len: data length, flags: NFT_DATA_DESC_*);
 * confirm against the nft_data_init() implementation.
 */
struct nft_data_desc {
        enum nft_data_types                type;
        unsigned int                        size;
        unsigned int                        len;
        unsigned int                        flags;
};

/*
 * struct nft_data helpers. Only the prototypes appear here; the bodies are
 * not part of this section of the header.
 * NOTE(review): judging by the names, these initialise data from a netlink
 * attribute, take/drop references, and dump data into an skb; confirm
 * against the implementations.
 */
int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
                  struct nft_data_desc *desc, const struct nlattr *nla);
void nft_data_hold(const struct nft_data *data, enum nft_data_types type);
void nft_data_release(const struct nft_data *data, enum nft_data_types type);
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
                  enum nft_data_types type, unsigned int len);

/*
 * Map a destination register to the kind of data it holds: the verdict
 * register holds NFT_DATA_VERDICT, every other register NFT_DATA_VALUE.
 */
static inline enum nft_data_types nft_dreg_to_type(enum nft_registers reg)
{
        if (reg == NFT_REG_VERDICT)
                return NFT_DATA_VERDICT;

        return NFT_DATA_VALUE;
}

/*
 * Map a data type to a register: verdict data goes to NFT_REG_VERDICT,
 * anything else to NFT_REG_1 scaled by NFT_REG_SIZE / NFT_REG32_SIZE.
 * NOTE(review): the scaling presumably converts the register number into
 * the equivalent 32-bit register index; confirm against the uapi header.
 */
static inline enum nft_registers nft_type_to_reg(enum nft_data_types type)
{
        if (type == NFT_DATA_VERDICT)
                return NFT_REG_VERDICT;

        return NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE;
}

/*
 * Netlink attribute and register parsing helpers. Only the prototypes
 * appear here; the bodies are not part of this section of the header.
 * NOTE(review): judging by the names, these parse a bounded u32, dump a
 * register number, and validate source/destination registers named by a
 * netlink attribute; confirm against the implementations.
 */
int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest);
int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg);

int nft_parse_register_load(const struct nft_ctx *ctx,
                            const struct nlattr *attr, u8 *sreg, u32 len);
int nft_parse_register_store(const struct nft_ctx *ctx,
                             const struct nlattr *attr, u8 *dreg,
                             const struct nft_data *data,
                             enum nft_data_types type, unsigned int len);

/**
 *        struct nft_userdata - user defined data associated with an object
 *
 *        @len: length of the data, minus one (see below)
 *        @data: content, stored as a flexible array member
 *
 *        The presence of user data is indicated in an object specific fashion,
 *        so a length of zero can't occur and the value "len" indicates data
 *        of length len + 1.
 */
struct nft_userdata {
        u8                        len;
        unsigned char                data[];
};

/*
 * Placeholder structure for the opaque set element backend representation.
 * It has no members; pointers to it are converted with nft_elem_priv_cast().
 */
struct nft_elem_priv { };

/**
 *        struct nft_set_elem - generic representation of set elements
 *
 *        @key: element key
 *        @key_end: closing element key
 *        @data: element data
 *        @priv: element private data and extensions
 *
 *        @key, @key_end and @data are each a union of a raw u32 buffer of
 *        NFT_DATA_VALUE_MAXLEN bytes and a struct nft_data.
 */
struct nft_set_elem {
        union {
                u32                buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
                struct nft_data        val;
        } key;
        union {
                u32                buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
                struct nft_data        val;
        } key_end;
        union {
                u32                buf[NFT_DATA_VALUE_MAXLEN / sizeof(u32)];
                struct nft_data val;
        } data;
        struct nft_elem_priv        *priv;
};

/*
 * Convert an opaque element pointer to an untyped pointer so the caller
 * can assign it to its own element type. Note that the cast also drops the
 * const qualifier of @priv.
 */
static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
{
        return (void *)priv;
}


/**
 * enum nft_iter_type - nftables set iterator type
 *
 * @NFT_ITER_UNSPEC: unspecified, to catch errors; this is the zero value
 * @NFT_ITER_READ: read-only iteration over set elements
 * @NFT_ITER_UPDATE: iteration under mutex to update set element state
 */
enum nft_iter_type {
        NFT_ITER_UNSPEC,
        NFT_ITER_READ,
        NFT_ITER_UPDATE,
};

struct nft_set;
/*
 * State for iterating over the elements of a set.
 *
 * type is an enum nft_iter_type stored in an 8-bit bitfield; fn is the
 * callback, which receives the context, the set, this iterator and one
 * element.
 * NOTE(review): the remaining fields are inferred from their names --
 * genmask: generation mask, count: elements seen so far, skip: elements
 * to skip, err: error recorded for the walk. Confirm against the set
 * backends' walk implementations.
 */
struct nft_set_iter {
        u8                genmask;
        enum nft_iter_type type:8;
        unsigned int        count;
        unsigned int        skip;
        int                err;
        int                (*fn)(const struct nft_ctx *ctx,
                              struct nft_set *set,
                              const struct nft_set_iter *iter,
                              struct nft_elem_priv *elem_priv);
};

/**
 *        struct nft_set_desc - description of set elements
 *
 *        @ktype: key type
 *        @klen: key length
 *        @dtype: data type
 *        @dlen: data length
 *        @objtype: object type
 *        @size: number of set elements
 *        @policy: set policy
 *        @gc_int: garbage collector interval
 *        @timeout: element timeout
 *        @field_len: length of each field in concatenation, bytes
 *        @field_count: number of concatenated fields in element
 *        @expr: set must support expressions
 */
struct nft_set_desc {
        u32                        ktype;
        unsigned int                klen;
        u32                        dtype;
        unsigned int                dlen;
        u32                        objtype;
        unsigned int                size;
        u32                        policy;
        u32                        gc_int;
        u64                        timeout;
        u8                        field_len[NFT_REG32_COUNT];
        u8                        field_count;
        bool                        expr;
};

/**
 *        enum nft_set_class - performance class
 *
 *        @NFT_SET_CLASS_O_1: constant cost, O(1)
 *        @NFT_SET_CLASS_O_LOG_N: logarithmic cost, O(log N)
 *        @NFT_SET_CLASS_O_N: linear cost, O(N)
 *
 *        The numeric values grow with the cost of the class.
 */
enum nft_set_class {
        NFT_SET_CLASS_O_1     = 0,
        NFT_SET_CLASS_O_LOG_N = 1,
        NFT_SET_CLASS_O_N     = 2,
};

/**
 *        struct nft_set_estimate - estimation of memory and performance
 *                                  characteristics
 *
 *        @size: required memory
 *        @lookup: lookup performance class (an enum nft_set_class value)
 *        @space: memory class (an enum nft_set_class value)
 *
 *        Filled in through the ->estimate() member of struct nft_set_ops,
 *        which receives a pointer to this structure.
 */
struct nft_set_estimate {
        u64                        size;
        enum nft_set_class        lookup;
        enum nft_set_class        space;
};

/*
 * NFT_EXPR_MAXATTR: NOTE(review): presumably the largest netlink attribute
 * count an expression type may declare (cf. nft_expr_type.maxattr) - confirm.
 */
#define NFT_EXPR_MAXATTR                16
/*
 * NFT_EXPR_SIZE(size): size of a struct nft_expr header plus @size bytes of
 * private data, with @size passed through ALIGN() to the alignment of
 * struct nft_expr.
 */
#define NFT_EXPR_SIZE(size)                (sizeof(struct nft_expr) + \
                                         ALIGN(size, __alignof__(struct nft_expr)))

/**
 *        struct nft_expr - nf_tables expression
 *
 *        @ops: expression ops
 *        @data: expression private data; a flexible array member placed
 *               directly after @ops and aligned like a u64
 *
 *        Use nft_expr_priv() to obtain a pointer to @data.
 */
struct nft_expr {
        const struct nft_expr_ops        *ops;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(u64))));
};

/*
 * nft_expr_priv - return the private data area of @expr, i.e. the address of
 * its trailing data[] member, with the const qualifier removed.
 */
static inline void *nft_expr_priv(const struct nft_expr *expr)
{
        const unsigned char *priv = expr->data;

        return (void *)priv;
}

struct nft_expr_info;

/*
 * Expression helpers; only the declarations live in this part of the header.
 * NOTE(review): the summaries below are read off the names and signatures -
 * confirm against the definitions.
 */

/* Parse the expression carried by @nla into @info; returns an int status. */
int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
                         struct nft_expr_info *info);
/* Clone @src into @dst, allocating with @gfp; returns an int status. */
int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp);
/* Destroy @expr in the context @ctx. */
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
/* Dump @expr into @skb under attribute @attr; @reset is passed through. */
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
                  const struct nft_expr *expr, bool reset);
/* Has the same signature as the ->reduce() member of struct nft_expr_ops. */
bool nft_expr_reduce_bitwise(struct nft_regs_track *track,
                             const struct nft_expr *expr);

struct nft_set_ext;

/**
 *        struct nft_set_ops - nf_tables set operations
 *
 *        @lookup: look up an element within the set
 *        @update: update an element if it exists, add it if it doesn't exist
 *        @delete: delete an element
 *        @insert: insert new element into set
 *        @activate: activate new element in the next generation
 *        @deactivate: lookup for element and deactivate it in the next generation
 *        @flush: deactivate element in the next generation
 *        @remove: remove element from set
 *        @walk: iterate over all set elements
 *        @get: get set elements
 *        @ksize: kernel set size
 *        @usize: userspace set size
 *        @adjust_maxsize: delta to adjust maximum set size
 *        @commit: commit set elements
 *        @abort: abort set elements
 *        @privsize: function to return size of set private data
 *        @estimate: estimate the required memory size and the lookup complexity class
 *        @init: initialize private data of new set instance
 *        @destroy: destroy private data of set instance
 *        @gc_init: initialize garbage collection
 *        @elemsize: element private size; the element's extension area starts
 *                   this many bytes after the private pointer
 *                   (see nft_set_elem_ext())
 *
 *        Operations lookup, update and delete have simpler interfaces, are faster
 *        and are currently only used in the packet path. All the rest are slower,
 *        control plane functions.
 */
struct nft_set_ops {
        bool                                (*lookup)(const struct net *net,
                                                  const struct nft_set *set,
                                                  const u32 *key,
                                                  const struct nft_set_ext **ext);
        bool                                (*update)(struct nft_set *set,
                                                  const u32 *key,
                                                  struct nft_elem_priv *
                                                        (*new)(struct nft_set *,
                                                               const struct nft_expr *,
                                                               struct nft_regs *),
                                                  const struct nft_expr *expr,
                                                  struct nft_regs *regs,
                                                  const struct nft_set_ext **ext);
        bool                                (*delete)(const struct nft_set *set,
                                                  const u32 *key);

        int                                (*insert)(const struct net *net,
                                                  const struct nft_set *set,
                                                  const struct nft_set_elem *elem,
                                                  struct nft_elem_priv **priv);
        void                                (*activate)(const struct net *net,
                                                    const struct nft_set *set,
                                                    struct nft_elem_priv *elem_priv);
        struct nft_elem_priv *                (*deactivate)(const struct net *net,
                                                      const struct nft_set *set,
                                                      const struct nft_set_elem *elem);
        void                                (*flush)(const struct net *net,
                                                 const struct nft_set *set,
                                                 struct nft_elem_priv *priv);
        void                                (*remove)(const struct net *net,
                                                  const struct nft_set *set,
                                                  struct nft_elem_priv *elem_priv);
        void                                (*walk)(const struct nft_ctx *ctx,
                                                struct nft_set *set,
                                                struct nft_set_iter *iter);
        struct nft_elem_priv *                (*get)(const struct net *net,
                                               const struct nft_set *set,
                                               const struct nft_set_elem *elem,
                                               unsigned int flags);
        u32                                (*ksize)(u32 size);
        u32                                (*usize)(u32 size);
        u32                                (*adjust_maxsize)(const struct nft_set *set);
        void                                (*commit)(struct nft_set *set);
        void                                (*abort)(const struct nft_set *set);
        u64                                (*privsize)(const struct nlattr * const nla[],
                                                    const struct nft_set_desc *desc);
        bool                                (*estimate)(const struct nft_set_desc *desc,
                                                    u32 features,
                                                    struct nft_set_estimate *est);
        int                                (*init)(const struct nft_set *set,
                                                const struct nft_set_desc *desc,
                                                const struct nlattr * const nla[]);
        void                                (*destroy)(const struct nft_ctx *ctx,
                                                   const struct nft_set *set);
        void                                (*gc_init)(const struct nft_set *set);

        unsigned int                        elemsize;
};

/**
 *      struct nft_set_type - nf_tables set type
 *
 *      @ops: set ops for this type, embedded by value
 *      @features: features supported by the implementation
 */
struct nft_set_type {
        const struct nft_set_ops        ops;
        u32                                features;
};
/*
 * to_set_type - map a pointer to the embedded @ops member back to the
 * struct nft_set_type that contains it.
 */
#define to_set_type(o) container_of(o, struct nft_set_type, ops)

/*
 * struct nft_set_elem_expr - expressions stored with a set element
 *
 * @size: number of bytes of @data in use; nft_setelem_expr_foreach() walks
 *        @data until its running offset reaches this value
 * @data: expressions laid out back to back, aligned like struct nft_expr
 */
struct nft_set_elem_expr {
        u8                                size;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
};

/*
 * nft_setelem_expr_at - expression located @__offset bytes into the data area
 * of @__elem_expr (a struct nft_set_elem_expr pointer).
 *
 * Both arguments are parenthesised so that any pointer or integer expression
 * may be passed, not just a plain identifier.
 */
#define nft_setelem_expr_at(__elem_expr, __offset)                        \
        ((struct nft_expr *)&(__elem_expr)->data[(__offset)])

/*
 * nft_setelem_expr_foreach - walk the expressions packed into @__elem_expr
 *
 * @__expr: struct nft_expr pointer used as the loop cursor
 * @__elem_expr: struct nft_set_elem_expr pointer to walk
 * @__size: integer lvalue holding the running byte offset of @__expr
 *
 * Iteration starts at offset 0 and advances by the current expression's
 * ops->size; it stops once the offset is no longer below @__elem_expr->size.
 */
#define nft_setelem_expr_foreach(__expr, __elem_expr, __size)                \
        for ((__expr) = nft_setelem_expr_at(__elem_expr, 0), (__size) = 0;        \
             (__size) < (__elem_expr)->size;                                \
             (__size) += (__expr)->ops->size,                                \
             (__expr) = ((void *)(__expr)) + (__expr)->ops->size)

/* Number of slots in the exprs[] array of struct nft_set. */
#define NFT_SET_EXPR_MAX        2

/**
 *        struct nft_set - nf_tables set instance
 *
 *        @list: table set list node
 *        @bindings: list of set bindings
 *        @refs: internal refcounting for async set destruction
 *        @table: table this set belongs to
 *        @net: netnamespace this set belongs to
 *        @name: name of the set
 *        @handle: unique handle of the set
 *        @ktype: key type (numeric type defined by userspace, not used in the kernel)
 *        @dtype: data type (verdict or numeric type defined by userspace)
 *        @objtype: object type (see NFT_OBJECT_* definitions)
 *        @size: maximum set size
 *        @field_len: length of each field in concatenation, bytes
 *        @field_count: number of concatenated fields in element
 *        @use: number of rules references to this set
 *        @nelems: number of elements
 *        @ndeact: number of deactivated elements queued for removal
 *        @timeout: default timeout value in jiffies
 *        @gc_int: garbage collection interval in msecs
 *        @policy: set parameterization (see enum nft_set_policies)
 *        @udlen: user data length
 *        @udata: user data
 *        @pending_update: list of pending update set element
 *        @ops: set ops; cacheline aligned, first of the runtime members
 *        @flags: set flags (13-bit field)
 *        @dead: set will be freed, never cleared (1-bit field)
 *        @genmask: generation mask (2-bit field)
 *        @klen: key length
 *        @dlen: data length
 *        @num_exprs: numbers of exprs
 *        @exprs: stateful expression (up to NFT_SET_EXPR_MAX entries)
 *        @catchall_list: list of catch-all set element
 *        @data: private set data, aligned like a u64 (see nft_set_priv())
 */
struct nft_set {
        struct list_head                list;
        struct list_head                bindings;
        refcount_t                        refs;
        struct nft_table                *table;
        possible_net_t                        net;
        char                                *name;
        u64                                handle;
        u32                                ktype;
        u32                                dtype;
        u32                                objtype;
        u32                                size;
        u8                                field_len[NFT_REG32_COUNT];
        u8                                field_count;
        u32                                use;
        atomic_t                        nelems;
        u32                                ndeact;
        u64                                timeout;
        u32                                gc_int;
        u16                                policy;
        u16                                udlen;
        unsigned char                        *udata;
        struct list_head                pending_update;
        /* runtime data below here */
        const struct nft_set_ops        *ops ____cacheline_aligned;
        u16                                flags:13,
                                        dead:1,
                                        genmask:2;
        u8                                klen;
        u8                                dlen;
        u8                                num_exprs;
        struct nft_expr                        *exprs[NFT_SET_EXPR_MAX];
        struct list_head                catchall_list;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(u64))));
};

/* nft_set_is_anonymous - true when @set has NFT_SET_ANONYMOUS in its flags. */
static inline bool nft_set_is_anonymous(const struct nft_set *set)
{
        return (set->flags & NFT_SET_ANONYMOUS) != 0;
}

/*
 * nft_set_priv - return the private data area of @set, i.e. the address of
 * its trailing data[] member, with the const qualifier removed.
 */
static inline void *nft_set_priv(const struct nft_set *set)
{
        const unsigned char *priv = set->data;

        return (void *)priv;
}

/*
 * nft_set_datatype - reduce the dtype of @set to one of two data types:
 * NFT_DATA_VERDICT when the set's dtype is exactly that, NFT_DATA_VALUE for
 * every other dtype.
 */
static inline enum nft_data_types nft_set_datatype(const struct nft_set *set)
{
        if (set->dtype == NFT_DATA_VERDICT)
                return NFT_DATA_VERDICT;

        return NFT_DATA_VALUE;
}

/*
 * nft_set_gc_is_pending - true when the reference count of @s is anything
 * other than one.
 */
static inline bool nft_set_gc_is_pending(const struct nft_set *s)
{
        if (refcount_read(&s->refs) == 1)
                return false;

        return true;
}

/*
 * nft_set_container_of - inverse of nft_set_priv(): given a pointer to a
 * set's private data area, step back by the offset of the data member to
 * reach the enclosing struct nft_set.
 */
static inline struct nft_set *nft_set_container_of(const void *priv)
{
        const size_t data_off = offsetof(struct nft_set, data);

        return (void *)priv - data_off;
}

/*
 * Look up a set of @table in @net. Both a set-name attribute and a set-id
 * attribute are accepted, together with a generation mask.
 * NOTE(review): how name and id are prioritised is not visible here - see the
 * definition.
 */
struct nft_set *nft_set_lookup_global(const struct net *net,
                                      const struct nft_table *table,
                                      const struct nlattr *nla_set_name,
                                      const struct nlattr *nla_set_id,
                                      u8 genmask);

/*
 * Returns a set extension pointer for @set in @net.
 * NOTE(review): presumably the extension of the set's catch-all element
 * (cf. nft_set.catchall_list) - confirm against the definition.
 */
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
                                            const struct nft_set *set);

/*
 * nft_set_gc_interval - garbage collection interval of @set in jiffies
 *
 * The interval is read once from set->gc_int and converted with
 * msecs_to_jiffies(); an interval of zero yields HZ instead.
 */
static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
{
        const u32 msecs = READ_ONCE(set->gc_int);

        if (!msecs)
                return HZ;

        return msecs_to_jiffies(msecs);
}

/**
 *        struct nft_set_binding - nf_tables set binding
 *
 *        @list: set bindings list node (struct nft_set keeps a list of
 *               bindings in its bindings member)
 *        @chain: chain containing the rule bound to the set
 *        @flags: set action flags
 *
 *        A set binding contains all information necessary for validation
 *        of new elements added to a bound set.
 */
struct nft_set_binding {
        struct list_head                list;
        const struct nft_chain                *chain;
        u32                                flags;
};

enum nft_trans_phase;
/*
 * Set life-cycle helpers; only the declarations are visible here.
 * NOTE(review): the one-line summaries are read off the names and
 * signatures - confirm against the definitions.
 */

/* Activate @set in the context @ctx. */
void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set);
/* Deactivate @set for @binding; @phase is the transaction phase. */
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
                              struct nft_set_binding *binding,
                              enum nft_trans_phase phase);
/* Bind @set using @binding; returns an int status. */
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
                       struct nft_set_binding *binding);
/* Destroy @set in the context @ctx. */
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set);

/**
 *        enum nft_set_extensions - set extension type IDs
 *
 *        @NFT_SET_EXT_KEY: element key
 *        @NFT_SET_EXT_KEY_END: upper bound element key, for ranges
 *        @NFT_SET_EXT_DATA: mapping data
 *        @NFT_SET_EXT_FLAGS: element flags
 *        @NFT_SET_EXT_TIMEOUT: element timeout
 *        @NFT_SET_EXT_USERDATA: user data associated with the element
 *        @NFT_SET_EXT_EXPRESSIONS: expressions associated with the element
 *        @NFT_SET_EXT_OBJREF: stateful object reference associated with element
 *        @NFT_SET_EXT_NUM: number of extension types
 *
 *        The IDs index the per-extension offset arrays, so they are
 *        consecutive and start at zero.
 */
enum nft_set_extensions {
        NFT_SET_EXT_KEY         = 0,
        NFT_SET_EXT_KEY_END     = 1,
        NFT_SET_EXT_DATA        = 2,
        NFT_SET_EXT_FLAGS       = 3,
        NFT_SET_EXT_TIMEOUT     = 4,
        NFT_SET_EXT_USERDATA    = 5,
        NFT_SET_EXT_EXPRESSIONS = 6,
        NFT_SET_EXT_OBJREF      = 7,
        NFT_SET_EXT_NUM         = 8
};

/**
 *        struct nft_set_ext_type - set extension type
 *
 *        @len: fixed part length of the extension
 *        @align: alignment requirements of the extension
 */
struct nft_set_ext_type {
        u8        len;
        u8        align;
};

/*
 * Table of extension types, defined elsewhere; nft_set_ext_add_length()
 * indexes it by extension ID.
 */
extern const struct nft_set_ext_type nft_set_ext_types[];

/**
 *        struct nft_set_ext_tmpl - set extension template
 *
 *        @len: length of extension area; nft_set_ext_prepare() starts it at
 *              sizeof(struct nft_set_ext) and nft_set_ext_add_length() grows it
 *        @offset: offsets of individual extension types, indexed by extension ID
 *        @ext_len: length of the expected extension (used to sanity check),
 *                  indexed by extension ID
 */
struct nft_set_ext_tmpl {
        u16        len;
        u8        offset[NFT_SET_EXT_NUM];
        u8        ext_len[NFT_SET_EXT_NUM];
};

/**
 *        struct nft_set_ext - set extensions
 *
 *        @genmask: generation mask, but also flags (see NFT_SET_ELEM_DEAD_BIT)
 *        @offset: offsets of individual extension types, in bytes from the
 *                 start of this structure; an offset of zero means the
 *                 extension is not present (see __nft_set_ext_exists())
 *        @data: beginning of extension data
 *
 *        This structure must be aligned to word size, otherwise atomic bitops
 *        on genmask field can cause alignment failure on some archs.
 */
struct nft_set_ext {
        u8        genmask;
        u8        offset[NFT_SET_EXT_NUM];
        char        data[];
} __aligned(BITS_PER_LONG / 8);

/*
 * nft_set_ext_prepare - reset an extension template
 *
 * Every offset and expected length is cleared, then the template length is
 * set to the size of the struct nft_set_ext header.
 */
static inline void nft_set_ext_prepare(struct nft_set_ext_tmpl *tmpl)
{
        memset(tmpl, 0, sizeof(struct nft_set_ext_tmpl));

        tmpl->len = sizeof(struct nft_set_ext);
}

/*
 * nft_set_ext_add_length - reserve room for extension @id in @tmpl
 *
 * @tmpl: template being built
 * @id: extension ID, used as index into nft_set_ext_types[]
 * @len: variable length to add on top of the type's fixed length
 *
 * The template length is first aligned as the extension type requires and
 * becomes the extension's offset. Offsets are stored in a u8, so an aligned
 * length above U8_MAX is rejected.
 *
 * Returns 0 on success, -EINVAL when the aligned length exceeds U8_MAX.
 */
static inline int nft_set_ext_add_length(struct nft_set_ext_tmpl *tmpl, u8 id,
                                         unsigned int len)
{
        const u8 align = nft_set_ext_types[id].align;
        const unsigned int ext_len = nft_set_ext_types[id].len + len;

        tmpl->len = ALIGN(tmpl->len, align);
        if (tmpl->len > U8_MAX)
                return -EINVAL;

        tmpl->offset[id] = tmpl->len;
        tmpl->ext_len[id] = ext_len;
        tmpl->len += tmpl->ext_len[id];

        return 0;
}

/*
 * nft_set_ext_add - reserve room for extension @id using only the fixed
 * length of its type; returns what nft_set_ext_add_length() returns.
 */
static inline int nft_set_ext_add(struct nft_set_ext_tmpl *tmpl, u8 id)
{
        const unsigned int no_extra_len = 0;

        return nft_set_ext_add_length(tmpl, id, no_extra_len);
}

/*
 * nft_set_ext_init - copy the extension offsets recorded in @tmpl into @ext.
 * Only the offset array is written.
 */
static inline void nft_set_ext_init(struct nft_set_ext *ext,
                                    const struct nft_set_ext_tmpl *tmpl)
{
        const size_t nbytes = sizeof(ext->offset);

        memcpy(ext->offset, tmpl->offset, nbytes);
}

/*
 * __nft_set_ext_exists - true when extension @id has a non-zero offset in
 * @ext. @ext is dereferenced unconditionally and must not be NULL.
 */
static inline bool __nft_set_ext_exists(const struct nft_set_ext *ext, u8 id)
{
        return ext->offset[id] != 0;
}

/*
 * nft_set_ext_exists - NULL-tolerant variant of __nft_set_ext_exists():
 * a NULL @ext has no extensions.
 */
static inline bool nft_set_ext_exists(const struct nft_set_ext *ext, u8 id)
{
        if (!ext)
                return false;

        return __nft_set_ext_exists(ext, id);
}

/*
 * nft_set_ext - address of extension @id inside @ext
 *
 * The recorded offset is added to the address of @ext itself. No check is
 * made that the extension is present; an absent one (offset 0) yields @ext.
 */
static inline void *nft_set_ext(const struct nft_set_ext *ext, u8 id)
{
        const u8 off = ext->offset[id];

        return (void *)ext + off;
}

/* nft_set_ext_key - the NFT_SET_EXT_KEY extension of @ext as nft_data. */
static inline struct nft_data *nft_set_ext_key(const struct nft_set_ext *ext)
{
        struct nft_data *key = nft_set_ext(ext, NFT_SET_EXT_KEY);

        return key;
}

/* nft_set_ext_key_end - the NFT_SET_EXT_KEY_END extension of @ext as nft_data. */
static inline struct nft_data *nft_set_ext_key_end(const struct nft_set_ext *ext)
{
        struct nft_data *key_end = nft_set_ext(ext, NFT_SET_EXT_KEY_END);

        return key_end;
}

/* nft_set_ext_data - the NFT_SET_EXT_DATA extension of @ext as nft_data. */
static inline struct nft_data *nft_set_ext_data(const struct nft_set_ext *ext)
{
        struct nft_data *data = nft_set_ext(ext, NFT_SET_EXT_DATA);

        return data;
}

/* nft_set_ext_flags - the NFT_SET_EXT_FLAGS extension of @ext as a u8 pointer. */
static inline u8 *nft_set_ext_flags(const struct nft_set_ext *ext)
{
        u8 *flags = nft_set_ext(ext, NFT_SET_EXT_FLAGS);

        return flags;
}

/*
 * struct nft_timeout - payload of the NFT_SET_EXT_TIMEOUT extension
 *
 * @timeout: a value of zero makes __nft_set_elem_expired() report the
 *           element as not expired
 * @expiration: point in time compared against the current timestamp with
 *              time_after_eq64() to decide whether the element has expired
 */
struct nft_timeout {
        u64        timeout;
        u64        expiration;
};

/* nft_set_ext_timeout - the NFT_SET_EXT_TIMEOUT extension of @ext. */
static inline struct nft_timeout *nft_set_ext_timeout(const struct nft_set_ext *ext)
{
        struct nft_timeout *timeout = nft_set_ext(ext, NFT_SET_EXT_TIMEOUT);

        return timeout;
}

/* nft_set_ext_userdata - the NFT_SET_EXT_USERDATA extension of @ext. */
static inline struct nft_userdata *nft_set_ext_userdata(const struct nft_set_ext *ext)
{
        struct nft_userdata *udata = nft_set_ext(ext, NFT_SET_EXT_USERDATA);

        return udata;
}

/* nft_set_ext_expr - the NFT_SET_EXT_EXPRESSIONS extension of @ext. */
static inline struct nft_set_elem_expr *nft_set_ext_expr(const struct nft_set_ext *ext)
{
        struct nft_set_elem_expr *elem_expr = nft_set_ext(ext, NFT_SET_EXT_EXPRESSIONS);

        return elem_expr;
}

/*
 * __nft_set_elem_expired - check an element's timeout against @tstamp
 *
 * @ext: element extensions (may be NULL)
 * @tstamp: timestamp to compare the expiration with
 *
 * An element without a timeout extension, or whose timeout value reads as
 * zero, never expires. Otherwise it has expired once @tstamp is at or after
 * the stored expiration. Both fields are read with READ_ONCE().
 */
static inline bool __nft_set_elem_expired(const struct nft_set_ext *ext,
                                          u64 tstamp)
{
        if (!nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT))
                return false;

        if (READ_ONCE(nft_set_ext_timeout(ext)->timeout) == 0)
                return false;

        return time_after_eq64(tstamp, READ_ONCE(nft_set_ext_timeout(ext)->expiration));
}

static inline bool nft_set_elem_expired(const struct nft_set_ext *ext)
{
        return __nft_set_elem_expired(ext, get_jiffies_64());
}

/*
 * nft_set_elem_ext - extension area of a set element
 *
 * The extensions start set->ops->elemsize bytes after the element private
 * pointer.
 */
static inline struct nft_set_ext *nft_set_elem_ext(const struct nft_set *set,
                                                   const struct nft_elem_priv *elem_priv)
{
        const unsigned int priv_size = set->ops->elemsize;

        return (void *)elem_priv + priv_size;
}

/*
 * nft_set_ext_obj - the NFT_SET_EXT_OBJREF extension of @ext, viewed as the
 * location of an object pointer.
 */
static inline struct nft_object **nft_set_ext_obj(const struct nft_set_ext *ext)
{
        struct nft_object **objp = nft_set_ext(ext, NFT_SET_EXT_OBJREF);

        return objp;
}

/*
 * Set element helpers; only the declarations are visible here.
 * NOTE(review): the summaries are read off the names and signatures -
 * confirm against the definitions.
 */

/* Build an expression for @set from the netlink attribute @attr. */
struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
                                         const struct nft_set *set,
                                         const struct nlattr *attr);

/*
 * Create an element of @set laid out according to @tmpl from the given key,
 * end key, data, timeout and expiration, allocating with @gfp.
 */
struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
                                        const struct nft_set_ext_tmpl *tmpl,
                                        const u32 *key, const u32 *key_end,
                                        const u32 *data,
                                        u64 timeout, u64 expiration, gfp_t gfp);
/* Clone expressions of @set into @expr_array; returns an int status. */
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
                            struct nft_expr *expr_array[]);
/* Destroy an element of @set; @destroy_expr is passed through. */
void nft_set_elem_destroy(const struct nft_set *set,
                          const struct nft_elem_priv *elem_priv,
                          bool destroy_expr);
/* Context-aware variant taking a struct nft_ctx. */
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
                                const struct nft_set *set,
                                const struct nft_elem_priv *elem_priv);

struct nft_expr_ops;
/**
 *        struct nft_expr_type - nf_tables expression type
 *
 *        @select_ops: function to select nft_expr_ops; receives the context
 *                     and the parsed netlink attribute table
 *        @release_ops: release nft_expr_ops
 *        @ops: default ops, used when no select_ops function is present
 *        @inner_ops: inner ops, used for inner packet operation
 *        @list: used internally
 *        @name: Identifier
 *        @owner: module reference
 *        @policy: netlink attribute policy
 *        @maxattr: highest netlink attribute number
 *        @family: address family for AF-specific types
 *        @flags: expression type flags
 */
struct nft_expr_type {
        const struct nft_expr_ops        *(*select_ops)(const struct nft_ctx *,
                                                       const struct nlattr * const tb[]);
        void                                (*release_ops)(const struct nft_expr_ops *ops);
        const struct nft_expr_ops        *ops;
        const struct nft_expr_ops        *inner_ops;
        struct list_head                list;
        const char                        *name;
        struct module                        *owner;
        const struct nla_policy                *policy;
        unsigned int                        maxattr;
        u8                                family;
        u8                                flags;
};

/*
 * Single-bit flag values.
 * NOTE(review): presumably stored in nft_expr_type.flags - confirm at users.
 */
#define NFT_EXPR_STATEFUL                0x1
#define NFT_EXPR_GC                        0x2

/*
 * enum nft_trans_phase - transaction phase handed to deactivation callbacks
 * (see the ->deactivate() member of struct nft_expr_ops and
 * nf_tables_deactivate_set()).
 */
enum nft_trans_phase {
        NFT_TRANS_PREPARE       = 0,
        NFT_TRANS_PREPARE_ERROR = 1,
        NFT_TRANS_ABORT         = 2,
        NFT_TRANS_COMMIT        = 3,
        NFT_TRANS_RELEASE       = 4
};

struct nft_flow_rule;
struct nft_offload_ctx;

/**
 *        struct nft_expr_ops - nf_tables expression operations
 *
 *        @eval: Expression evaluation function
 *        @clone: Expression clone function
 *        @size: full expression size, including private data size; used as
 *               the stride when stepping from one expression to the next
 *               (see nft_expr_next())
 *        @init: initialization function
 *        @activate: activate expression in the next generation
 *        @deactivate: deactivate expression in next generation
 *        @destroy: destruction function, called after synchronize_rcu
 *        @destroy_clone: destruction clone function
 *        @dump: function to dump parameters
 *        @validate: validate expression, called during loop detection
 *        @reduce: reduce expression
 *        @gc: garbage collection expression
 *        @offload: hardware offload expression
 *        @offload_action: function to report true/false to allocate one slot or not in the flow
 *                         offload array
 *        @offload_stats: function to synchronize hardware stats via updating the counter expression
 *        @type: expression type
 *        @data: extra data to attach to this expression operation
 */
struct nft_expr_ops {
        void                                (*eval)(const struct nft_expr *expr,
                                                struct nft_regs *regs,
                                                const struct nft_pktinfo *pkt);
        int                                (*clone)(struct nft_expr *dst,
                                                 const struct nft_expr *src, gfp_t gfp);
        unsigned int                        size;

        int                                (*init)(const struct nft_ctx *ctx,
                                                const struct nft_expr *expr,
                                                const struct nlattr * const tb[]);
        void                                (*activate)(const struct nft_ctx *ctx,
                                                    const struct nft_expr *expr);
        void                                (*deactivate)(const struct nft_ctx *ctx,
                                                      const struct nft_expr *expr,
                                                      enum nft_trans_phase phase);
        void                                (*destroy)(const struct nft_ctx *ctx,
                                                   const struct nft_expr *expr);
        void                                (*destroy_clone)(const struct nft_ctx *ctx,
                                                         const struct nft_expr *expr);
        int                                (*dump)(struct sk_buff *skb,
                                                const struct nft_expr *expr,
                                                bool reset);
        int                                (*validate)(const struct nft_ctx *ctx,
                                                    const struct nft_expr *expr);
        bool                                (*reduce)(struct nft_regs_track *track,
                                                  const struct nft_expr *expr);
        bool                                (*gc)(struct net *net,
                                              const struct nft_expr *expr);
        int                                (*offload)(struct nft_offload_ctx *ctx,
                                                   struct nft_flow_rule *flow,
                                                   const struct nft_expr *expr);
        bool                                (*offload_action)(const struct nft_expr *expr);
        void                                (*offload_stats)(struct nft_expr *expr,
                                                         const struct flow_stats *stats);
        const struct nft_expr_type        *type;
        void                                *data;
};

/**
 *        struct nft_rule - nf_tables rule
 *
 *        @list: used internally
 *        @handle: rule handle (42-bit field)
 *        @genmask: generation mask (2-bit field)
 *        @dlen: length of expression data (12-bit field)
 *        @udata: user data is appended to the rule (1-bit field);
 *                nft_userdata() points right behind the expression data
 *        @data: expression data, aligned like struct nft_expr
 *
 *        @handle, @genmask, @dlen and @udata are bit-fields of one u64.
 */
struct nft_rule {
        struct list_head                list;
        u64                                handle:42,
                                        genmask:2,
                                        dlen:12,
                                        udata:1;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
};

/* nft_expr_first - first expression of @rule: the start of its data area. */
static inline struct nft_expr *nft_expr_first(const struct nft_rule *rule)
{
        return (struct nft_expr *)rule->data;
}

/*
 * nft_expr_next - expression following @expr, found by stepping over the
 * full size recorded in expr->ops->size.
 */
static inline struct nft_expr *nft_expr_next(const struct nft_expr *expr)
{
        const unsigned int step = expr->ops->size;

        return ((void *)expr) + step;
}

/*
 * nft_expr_last - end marker of the expressions of @rule: the address dlen
 * bytes into the data area, i.e. one past the expression data.
 */
static inline struct nft_expr *nft_expr_last(const struct nft_rule *rule)
{
        return (struct nft_expr *)(rule->data + rule->dlen);
}

/*
 * nft_expr_more - true while @expr is a usable expression of @rule: it is
 * not the end marker returned by nft_expr_last() and its ops pointer is set.
 * The ops pointer is only read when @expr is not the end marker.
 */
static inline bool nft_expr_more(const struct nft_rule *rule,
                                 const struct nft_expr *expr)
{
        if (expr == nft_expr_last(rule))
                return false;

        return !!expr->ops;
}

/*
 * nft_userdata - user data location of @rule: the address right behind the
 * dlen bytes of expression data. Whether user data is actually present is
 * not checked here.
 */
static inline struct nft_userdata *nft_userdata(const struct nft_rule *rule)
{
        return (void *)(rule->data + rule->dlen);
}

/*
 * Rule helpers; only the declarations are visible here.
 * NOTE(review): the summaries are read off the names and signatures -
 * confirm against the definitions.
 */

/* Activate the expressions of @rule. */
void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule);
/* Deactivate the expressions of @rule for transaction phase @phase. */
void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
                              enum nft_trans_phase phase);
/* Destroy @rule. */
void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule);

/*
 * nft_set_elem_update_expr - evaluate the expressions attached to an element
 *
 * @ext: element extensions (dereferenced unconditionally)
 * @regs: registers passed to every expression
 * @pkt: packet info passed to every expression
 *
 * Does nothing when the element carries no expression extension. Otherwise
 * each expression is evaluated in storage order, and evaluation stops early
 * as soon as one of them leaves NFT_BREAK in the verdict register.
 */
static inline void nft_set_elem_update_expr(const struct nft_set_ext *ext,
                                            struct nft_regs *regs,
                                            const struct nft_pktinfo *pkt)
{
        struct nft_set_elem_expr *stored;
        struct nft_expr *cur;
        u32 offset;

        if (!__nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
                return;

        stored = nft_set_ext_expr(ext);
        nft_setelem_expr_foreach(cur, stored, offset) {
                cur->ops->eval(cur, regs, pkt);
                if (regs->verdict.code == NFT_BREAK)
                        break;
        }
}

/*
 * nft_rule_for_each_expr - iterate over every expression of @rule
 *
 * @expr: struct nft_expr pointer used as the loop cursor
 * @last: struct nft_expr pointer that caches nft_expr_last(@rule)
 * @rule: rule whose expressions are walked
 *
 * The last pointer isn't really necessary, but the compiler isn't able to
 * determine that the result of nft_expr_last() is always the same since it
 * can't assume that the dlen value wasn't changed within calls in the loop.
 */
#define nft_rule_for_each_expr(expr, last, rule) \
        for ((expr) = nft_expr_first(rule), (last) = nft_expr_last(rule); \
             (expr) != (last); \
             (expr) = nft_expr_next(expr))

/*
 * Sentinel equal to U8_MAX.
 * NOTE(review): presumably marks a chain policy that has not been set -
 * confirm at the users.
 */
#define NFT_CHAIN_POLICY_UNSET                U8_MAX

/*
 * struct nft_rule_dp - rule record inside a rule blob
 *
 * @is_last: 1-bit flag (NOTE(review): presumably set on the end-of-blob
 *           marker, see struct nft_rule_dp_last - confirm)
 * @dlen: length of @data in bytes (12-bit field); nft_rule_next() skips
 *        the header plus this many bytes to reach the following record
 * @handle: rule handle, for tracing (42-bit field)
 * @data: trailing data, aligned like struct nft_expr
 */
struct nft_rule_dp {
        u64                                is_last:1,
                                        dlen:12,
                                        handle:42;        /* for tracing */
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_expr))));
};

/*
 * struct nft_rule_dp_last - trailer record that terminates a rule blob; it
 * starts with a struct nft_rule_dp and carries what is needed to free the
 * blob through call_rcu and to trace the owning chain.
 */
struct nft_rule_dp_last {
        struct nft_rule_dp end;                /* end of nft_rule_blob marker */
        struct rcu_head h;                /* call_rcu head */
        struct nft_rule_blob *blob;        /* ptr to free via call_rcu */
        const struct nft_chain *chain;        /* for nftables tracing */
};

/*
 * nft_rule_next - record following @rule: the address just past the rule
 * header and its dlen bytes of data.
 */
static inline const struct nft_rule_dp *nft_rule_next(const struct nft_rule_dp *rule)
{
        void *body = (void *)rule + sizeof(*rule);

        return body + rule->dlen;
}

/*
 * struct nft_rule_blob - container of rule records
 *
 * @size: size value of the blob (NOTE(review): presumably the byte length
 *        of @data - confirm against the allocation site)
 * @data: trailing storage aligned like struct nft_rule_dp
 */
struct nft_rule_blob {
        unsigned long                        size;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(struct nft_rule_dp))));
};

/**
 *        struct nft_chain - nf_tables chain
 *
 *        @blob_gen_0: rule blob pointer to the current generation (RCU pointer)
 *        @blob_gen_1: rule blob pointer to the future generation (RCU pointer)
 *        @rules: list of rules in the chain
 *        @list: used internally
 *        @rhlhead: used internally
 *        @table: table that this chain belongs to
 *        @handle: chain handle
 *        @use: number of jump references to this chain
 *        @flags: bitmask of enum NFTA_CHAIN_FLAGS (5-bit field)
 *        @bound: bind or not (1-bit field)
 *        @genmask: generation mask (2-bit field)
 *        @name: name of the chain
 *        @udlen: user data length
 *        @udata: user data in the chain
 *        @blob_next: rule blob pointer to the next in the chain; only used
 *                    during the control plane commit phase
 *
 *        @flags, @bound and @genmask are bit-fields of a single u8.
 */
struct nft_chain {
        struct nft_rule_blob                __rcu *blob_gen_0;
        struct nft_rule_blob                __rcu *blob_gen_1;
        struct list_head                rules;
        struct list_head                list;
        struct rhlist_head                rhlhead;
        struct nft_table                *table;
        u64                                handle;
        u32                                use;
        u8                                flags:5,
                                        bound:1,
                                        genmask:2;
        char                                *name;
        u16                                udlen;
        u8                                *udata;

        /* Only used during control plane commit phase: */
        struct nft_rule_blob                *blob_next;
};

/*
 * Chain / set element validation and chain (un)binding entry points.
 * Prototypes only; the bodies are not part of this section.
 */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain);
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
                         const struct nft_set_iter *iter,
                         struct nft_elem_priv *elem_priv);
int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set);
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);
void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain);

/*
 * Numeric chain type identifiers. NFT_CHAIN_T_MAX is one past the last
 * real type and is not a type itself.
 */
enum nft_chain_types {
        NFT_CHAIN_T_DEFAULT        = 0,
        NFT_CHAIN_T_ROUTE        = 1,
        NFT_CHAIN_T_NAT                = 2,
        NFT_CHAIN_T_MAX                = 3,
};

/**
 *        struct nft_chain_type - nf_tables chain type info
 *
 *        @name: name of the type
 *        @type: numeric identifier (enum nft_chain_types)
 *        @family: address family
 *        @owner: module owner
 *        @hook_mask: mask of valid hooks
 *        @hooks: array of hook functions, NFT_MAX_HOOKS entries
 *        @ops_register: base chain register function
 *        @ops_unregister: base chain unregister function
 */
struct nft_chain_type {
        const char                        *name;
        enum nft_chain_types                type;
        int                                family;
        struct module                        *owner;
        unsigned int                        hook_mask;
        nf_hookfn                        *hooks[NFT_MAX_HOOKS];
        int                                (*ops_register)(struct net *net, const struct nf_hook_ops *ops);
        void                                (*ops_unregister)(struct net *net, const struct nf_hook_ops *ops);
};

/*
 * Checks of a chain against an expected chain type or a set of hook
 * flags. Prototypes only; see the definitions for the return codes.
 */
int nft_chain_validate_dependency(const struct nft_chain *chain,
                                  enum nft_chain_types type);
int nft_chain_validate_hooks(const struct nft_chain *chain,
                             unsigned int hook_flags);

/* True when @chain has the NFT_CHAIN_BINDING flag set. */
static inline bool nft_chain_binding(const struct nft_chain *chain)
{
        return (chain->flags & NFT_CHAIN_BINDING) != 0;
}

static inline bool nft_chain_is_bound(struct nft_chain *chain)
{
        return (chain->flags & NFT_CHAIN_BINDING) && chain->bound;
}

/* Chain add / delete / destroy; prototypes only, bodies are not in this section. */
int nft_chain_add(struct nft_table *table, struct nft_chain *chain);
void nft_chain_del(struct nft_chain *chain);
void nf_tables_chain_destroy(struct nft_chain *chain);

/**
 *        struct nft_stats - byte and packet counters
 *
 *        @bytes: byte counter
 *        @pkts: packet counter
 *        @syncp: u64_stats synchronisation state kept next to the counters
 *
 *        struct nft_base_chain and struct nft_trans_chain hold __percpu
 *        pointers to this structure.
 */
struct nft_stats {
        u64                        bytes;
        u64                        pkts;
        struct u64_stats_sync        syncp;
};

/**
 *        struct nft_hook - one hook entry kept on a hook list
 *
 *        @list: list node
 *        @ops: netfilter hook ops of this entry
 *        @rcu: rcu head (presumably for deferred release -- confirm)
 *        @ifname: interface name buffer, IFNAMSIZ bytes
 *        @ifnamelen: length value stored for @ifname
 */
struct nft_hook {
        struct list_head        list;
        struct nf_hook_ops        ops;
        struct rcu_head                rcu;
        char                        ifname[IFNAMSIZ];
        u8                        ifnamelen;
};

/**
 *        struct nft_base_chain - nf_tables base chain
 *
 *        @ops: netfilter hook ops
 *        @hook_list: list of netfilter hooks (for NFPROTO_NETDEV family)
 *        @type: chain type
 *        @policy: default policy
 *        @flags: indicates whether the base chain is disabled or not
 *        @stats: per-cpu chain stats
 *        @chain: the embedded chain; nft_base_chain() maps a pointer to
 *                this member back to the enclosing structure
 *        @flow_block: flow block (for hardware offload)
 */
struct nft_base_chain {
        struct nf_hook_ops                ops;
        struct list_head                hook_list;
        const struct nft_chain_type        *type;
        u8                                policy;
        u8                                flags;
        struct nft_stats __percpu        *stats;
        struct nft_chain                chain;
        struct flow_block                flow_block;
};

/*
 * Map @chain, which must be the chain member embedded in a
 * struct nft_base_chain, back to that enclosing base chain.
 */
static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chain)
{
        struct nft_base_chain *base;

        base = container_of(chain, struct nft_base_chain, chain);

        return base;
}

/* True when @chain has the NFT_CHAIN_BASE flag set. */
static inline bool nft_is_base_chain(const struct nft_chain *chain)
{
        return (chain->flags & NFT_CHAIN_BASE) != 0;
}

/*
 * Prototype only. Judging by the name this evaluates a chain for @pkt;
 * see the definition for the meaning of @priv and of the return value.
 */
unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv);

/*
 * Bump the reference counter at @use unless it already sits at UINT_MAX.
 *
 * Returns true when the counter was incremented, false when it was left
 * untouched because it is saturated.
 */
static inline bool nft_use_inc(u32 *use)
{
        const bool has_room = *use != UINT_MAX;

        if (has_room)
                *use += 1;

        return has_room;
}

/*
 * Drop one reference from the counter at @use. The decrement is always
 * performed; a one-time warning is raised if the counter was already 0.
 */
static inline void nft_use_dec(u32 *use)
{
        const u32 before = *use;

        *use = before - 1;
        WARN_ON_ONCE(before == 0);
}

/*
 * Error and abort path helper: put the use counter back to its earlier
 * value by re-incrementing it, warning once if nft_use_inc() refuses.
 */
static inline void nft_use_inc_restore(u32 *use)
{
        bool restored = nft_use_inc(use);

        WARN_ON_ONCE(!restored);
}

/* The matching "undo an increment" helper is plain nft_use_dec(). */
#define nft_use_dec_restore        nft_use_dec

/**
 *        struct nft_table - nf_tables table
 *
 *        @list: used internally
 *        @chains_ht: chains in the table
 *        @chains: same, for stable walks
 *        @sets: sets in the table
 *        @objects: stateful objects in the table
 *        @flowtables: flow tables in the table
 *        @hgenerator: handle generator state
 *        @handle: table handle
 *        @use: number of chain references to this table
 *        @family: address family (6-bit field)
 *        @flags: table flag (see enum nft_table_flags) (8-bit field)
 *        @genmask: generation mask (2-bit field)
 *        @nlpid: netlink port ID
 *        @name: name of the table
 *        @udlen: length of the user data
 *        @udata: user data
 *        @validate_state: internal, set when transaction adds jumps
 *
 *        @family, @flags and @genmask are bitfields packed into a single u16.
 */
struct nft_table {
        struct list_head                list;
        struct rhltable                        chains_ht;
        struct list_head                chains;
        struct list_head                sets;
        struct list_head                objects;
        struct list_head                flowtables;
        u64                                hgenerator;
        u64                                handle;
        u32                                use;
        u16                                family:6,
                                        flags:8,
                                        genmask:2;
        u32                                nlpid;
        char                                *name;
        u16                                udlen;
        u8                                *udata;
        u8                                validate_state;
};

/* True when @table has the NFT_TABLE_F_OWNER flag set. */
static inline bool nft_table_has_owner(const struct nft_table *table)
{
        return (table->flags & NFT_TABLE_F_OWNER) != 0;
}

/*
 * True when, of the OWNER and PERSIST table flags, exactly
 * NFT_TABLE_F_PERSIST is set: a persistent table without an owner.
 */
static inline bool nft_table_is_orphan(const struct nft_table *table)
{
        const unsigned int wanted = NFT_TABLE_F_PERSIST;
        const unsigned int mask = NFT_TABLE_F_OWNER | NFT_TABLE_F_PERSIST;

        return (table->flags & mask) == wanted;
}

/*
 * True for the NFPROTO_NETDEV family, and for NFPROTO_INET when the hook
 * is NF_INET_INGRESS.
 */
static inline bool nft_base_chain_netdev(int family, u32 hooknum)
{
        if (family == NFPROTO_NETDEV)
                return true;

        return family == NFPROTO_INET && hooknum == NF_INET_INGRESS;
}

/* Registration of chain types; prototypes only. */
void nft_register_chain_type(const struct nft_chain_type *);
void nft_unregister_chain_type(const struct nft_chain_type *);

/* Registration of expression types; nft_register_expr() can fail (int return). */
int nft_register_expr(struct nft_expr_type *);
void nft_unregister_expr(struct nft_expr_type *);

/* Dumps verdict @v into @skb; @type is passed through to the definition. */
int nft_verdict_dump(struct sk_buff *skb, int type,
                     const struct nft_verdict *v);

/**
 *        struct nft_object_hash_key - key to lookup nft_object
 *
 *        @name: name of the stateful object to look up
 *        @table: table the object belongs to
 *
 *        Embedded in struct nft_object as its @key member.
 */
struct nft_object_hash_key {
        const char                      *name;
        const struct nft_table          *table;
};

/**
 *        struct nft_object - nf_tables stateful object
 *
 *        @list: table stateful object list node
 *        @rhlhead: nft_objname_ht node
 *        @key: keys that identify this object
 *        @genmask: generation mask (2-bit field)
 *        @use: number of references to this stateful object
 *        @handle: unique object handle
 *        @udlen: length of user data
 *        @udata: user data
 *        @ops: object operations; starts a new cache line
 *              (____cacheline_aligned)
 *        @data: object data, layout depends on type; flexible array
 *               aligned like u64, returned by nft_obj_data()
 */
struct nft_object {
        struct list_head                list;
        struct rhlist_head                rhlhead;
        struct nft_object_hash_key        key;
        u32                                genmask:2;
        u32                                use;
        u64                                handle;
        u16                                udlen;
        u8                                *udata;
        /* runtime data below here */
        const struct nft_object_ops        *ops ____cacheline_aligned;
        unsigned char                        data[]
                __attribute__((aligned(__alignof__(u64))));
};

/*
 * Return the start of @obj's trailing data area as an untyped, writable
 * pointer (the const qualifier of @obj is dropped on purpose).
 */
static inline void *nft_obj_data(const struct nft_object *obj)
{
        const unsigned char *payload = obj->data;

        return (void *)payload;
}

/*
 * The object pointer stored at the start of @expr's private area.
 * The expansion is an lvalue, so it can be read or assigned. It is fully
 * parenthesized so that postfix operators bind to the dereferenced
 * pointer, e.g. nft_expr_obj(expr)->use.
 */
#define nft_expr_obj(expr)        (*((struct nft_object **)nft_expr_priv(expr)))

/*
 * Stateful object lookup, keyed by table, netlink attribute, object type
 * and genmask. Prototype only; see the definition for how a failed
 * lookup is reported.
 */
struct nft_object *nft_obj_lookup(const struct net *net,
                                  const struct nft_table *table,
                                  const struct nlattr *nla, u32 objtype,
                                  u8 genmask);

/* Notification for @obj; prototype only. */
void nft_obj_notify(struct net *net, const struct nft_table *table,
                    struct nft_object *obj, u32 portid, u32 seq,
                    int event, u16 flags, int family, int report, gfp_t gfp);

/**
 *        struct nft_object_type - stateful object type
 *
 *        @select_ops: function to select nft_object_ops from the context
 *                     and the parsed netlink attributes
 *        @ops: default ops, used when no select_ops function is present
 *        @list: list node in list of object types
 *        @type: stateful object numeric type
 *        @maxattr: maximum netlink attribute
 *        @family: address family for AF-specific object types
 *        @owner: module owner
 *        @policy: netlink attribute policy
 */
struct nft_object_type {
        const struct nft_object_ops        *(*select_ops)(const struct nft_ctx *,
                                                       const struct nlattr * const tb[]);
        const struct nft_object_ops        *ops;
        struct list_head                list;
        u32                                type;
        unsigned int                    maxattr;
        u8                                family;
        struct module                        *owner;
        const struct nla_policy                *policy;
};

/**
 *        struct nft_object_ops - stateful object operations
 *
 *        @eval: stateful object evaluation function; receives the object,
 *               the register set and the packet info
 *        @size: stateful object size
 *        @init: initialize object from netlink attributes
 *        @destroy: release existing stateful object
 *        @dump: netlink dump stateful object; @reset is forwarded to the
 *               implementation
 *        @update: update stateful object from @newobj
 *        @type: pointer to object type
 */
struct nft_object_ops {
        void                                (*eval)(struct nft_object *obj,
                                                struct nft_regs *regs,
                                                const struct nft_pktinfo *pkt);
        unsigned int                        size;
        int                                (*init)(const struct nft_ctx *ctx,
                                                const struct nlattr *const tb[],
                                                struct nft_object *obj);
        void                                (*destroy)(const struct nft_ctx *ctx,
                                                   struct nft_object *obj);
        int                                (*dump)(struct sk_buff *skb,
                                                struct nft_object *obj,
                                                bool reset);
        void                                (*update)(struct nft_object *obj,
                                                  struct nft_object *newobj);
        const struct nft_object_type        *type;
};

/* Registration of stateful object types; nft_register_obj() can fail (int return). */
int nft_register_obj(struct nft_object_type *obj_type);
void nft_unregister_obj(struct nft_object_type *obj_type);

/* Upper bound of 256 net devices (presumably per hook list -- confirm at the users). */
#define NFT_NETDEVICE_MAX        256

/**
 *        struct nft_flowtable - nf_tables flow table
 *
 *        @list: flow table list node in table list
 *        @table: the table the flow table is contained in
 *        @name: name of this flow table
 *        @hooknum: hook number
 *        @ops_len: number of hooks in array
 *        @genmask: generation mask (2-bit field)
 *        @use: number of references to this flow table
 *        @handle: unique object handle
 *        @hook_list: hook list for hooks per net_device in flowtables;
 *                    starts a new cache line (____cacheline_aligned)
 *        @data: rhashtable and garbage collector
 */
struct nft_flowtable {
        struct list_head                list;
        struct nft_table                *table;
        char                                *name;
        int                                hooknum;
        int                                ops_len;
        u32                                genmask:2;
        u32                                use;
        u64                                handle;
        /* runtime data below here */
        struct list_head                hook_list ____cacheline_aligned;
        struct nf_flowtable                data;
};

/*
 * Flow table lookup by table, netlink attribute and genmask. Prototype
 * only; see the definition for how a failed lookup is reported.
 */
struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
                                           const struct nft_table *table,
                                           const struct nlattr *nla,
                                           u8 genmask);

/* Flow table deactivation for the given transaction phase; prototype only. */
void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
                                    struct nft_flowtable *flowtable,
                                    enum nft_trans_phase phase);

/* Registration of flow table types; prototypes only. */
void nft_register_flowtable_type(struct nf_flowtable_type *type);
void nft_unregister_flowtable_type(struct nf_flowtable_type *type);

/**
 *        struct nft_traceinfo - nft tracing information and state
 *
 *        @trace: other struct members are initialised
 *        @nf_trace: copy of skb->nf_trace before rule evaluation
 *        @packet_dumped: packet headers sent in a previous traceinfo message
 *        @type: event type (enum nft_trace_types), stored in an 8-bit field
 *        @skbid: hash of skb to be used as trace id
 *        @basechain: base chain currently processed
 */
struct nft_traceinfo {
        bool                                trace;
        bool                                nf_trace;
        bool                                packet_dumped;
        enum nft_trace_types                type:8;
        u32                                skbid;
        const struct nft_base_chain        *basechain;
};

/*
 * Trace setup; prototype only. Note that @basechain is typed as a plain
 * struct nft_chain pointer here, not struct nft_base_chain.
 */
void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
                    const struct nft_chain *basechain);

/* Trace notification for @rule and @verdict using the state in @info; prototype only. */
void nft_trace_notify(const struct nft_pktinfo *pkt,
                      const struct nft_verdict *verdict,
                      const struct nft_rule_dp *rule,
                      struct nft_traceinfo *info);

#define MODULE_ALIAS_NFT_CHAIN(family, name) \
        MODULE_ALIAS("nft-chain-" __stringify(family) "-" name)

#define MODULE_ALIAS_NFT_AF_EXPR(family, name) \
        MODULE_ALIAS("nft-expr-" __stringify(family) "-" name)

#define MODULE_ALIAS_NFT_EXPR(name) \
        MODULE_ALIAS("nft-expr-" name)

#define MODULE_ALIAS_NFT_OBJ(type) \
        MODULE_ALIAS("nft-obj-" __stringify(type))

#if IS_ENABLED(CONFIG_NF_TABLES)

/*
 * Generations
 *
 * The gencursor selects one of two generations: the one that is active
 * right now and the upcoming one. Every object carries a 2-bit genmask;
 * a bit that is SET marks the object as INACTIVE in the generation that
 * bit stands for.
 *
 * - A new object starts inactive in the current generation and active in
 *   the next one.
 * - Committing the ruleset clears the mask, making the object active in
 *   all generations.
 * - Removing an object marks it inactive in the next generation; the
 *   object itself is removed after the ruleset has been committed.
 *
 * nft_gencursor_next() yields the cursor of the upcoming generation:
 * 1 when the current cursor is 0, otherwise 0.
 */
static inline unsigned int nft_gencursor_next(const struct net *net)
{
        if (net->nft.gencursor == 0)
                return 1;

        return 0;
}

/* Genmask bit that stands for the upcoming generation. */
static inline u8 nft_genmask_next(const struct net *net)
{
        const unsigned int next = nft_gencursor_next(net);

        return 1 << next;
}

/* Genmask bit that stands for the currently active generation. */
static inline u8 nft_genmask_cur(const struct net *net)
{
        /* READ_ONCE() so the cursor is fetched exactly once (no refetch). */
        const unsigned int cur = READ_ONCE(net->nft.gencursor);

        return 1 << cur;
}

/* Genmask with both generation bits set. */
#define NFT_GENMASK_ANY                ((1 << 0) | (1 << 1))

/*
 * Generic transaction helpers
 *
 * All of these work on any object that has a ->genmask member. A set
 * genmask bit means "inactive in that generation". Every expansion is
 * fully parenthesized so the helpers compose safely inside larger
 * expressions and with compound arguments.
 */

/* Check if this object is currently active. */
#define nft_is_active(__net, __obj)                                \
        (((__obj)->genmask & nft_genmask_cur(__net)) == 0)

/* Check if this object is active in the next generation. */
#define nft_is_active_next(__net, __obj)                        \
        (((__obj)->genmask & nft_genmask_next(__net)) == 0)

/*
 * This object becomes active in the next generation: only the
 * current-generation bit is left set, i.e. inactive now, active next.
 */
#define nft_activate_next(__net, __obj)                                \
        ((__obj)->genmask = nft_genmask_cur(__net))

/*
 * This object becomes inactive in the next generation: only the
 * next-generation bit is left set.
 */
#define nft_deactivate_next(__net, __obj)                        \
        ((__obj)->genmask = nft_genmask_next(__net))

/* After committing the ruleset, clear the stale generation bit. */
#define nft_clear(__net, __obj)                                        \
        ((__obj)->genmask &= ~nft_genmask_next(__net))

/*
 * Check if this object is active with respect to a caller-supplied
 * genmask. @__genmask is parenthesized so that an argument such as
 * "a | b" is applied as a whole instead of being split by the
 * precedence of '&' over '|'.
 */
#define nft_active_genmask(__obj, __genmask)                        \
        (!((__obj)->genmask & (__genmask)))

/*
 * Set element transaction helpers
 */

/*
 * True when none of the bits in @genmask are set in the element's own
 * genmask, i.e. the element is active for the queried generation(s).
 */
static inline bool nft_set_elem_active(const struct nft_set_ext *ext,
                                       u8 genmask)
{
        return (ext->genmask & genmask) == 0;
}

/*
 * Toggle the element's next-generation genmask bit, flipping whether it
 * is active in the next generation. @set is not used by this helper.
 */
static inline void nft_set_elem_change_active(const struct net *net,
                                              const struct nft_set *set,
                                              struct nft_set_ext *ext)
{
        ext->genmask = ext->genmask ^ nft_genmask_next(net);
}

#endif /* IS_ENABLED(CONFIG_NF_TABLES) */

/* Value mask of the "dead" flag: bit 2. */
#define NFT_SET_ELEM_DEAD_MASK        (1 << 2)

/*
 * Index of the same flag for set_bit()/test_bit(), which
 * nft_set_elem_dead() and nft_set_elem_is_dead() apply to the first
 * unsigned long of struct nft_set_ext (genmask sits at offset 0).
 * With little-endian bitfields this is simply bit 2. With big-endian
 * bitfields it is bit 2 of the top byte of that long (presumably the
 * lowest-addressed byte on such systems -- confirm). Building with
 * neither layout defined is a hard error.
 */
#if defined(__LITTLE_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_DEAD_BIT        2
#elif defined(__BIG_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_DEAD_BIT        (BITS_PER_LONG - BITS_PER_BYTE + 2)
#else
#error
#endif

/*
 * Mark a set element as dead by setting NFT_SET_ELEM_DEAD_BIT with
 * set_bit() in the first unsigned long of @ext. The build-time check
 * guarantees genmask is the very first member, which the bit index
 * relies on.
 */
static inline void nft_set_elem_dead(struct nft_set_ext *ext)
{
        unsigned long *first_word;

        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);

        first_word = (unsigned long *)ext;
        set_bit(NFT_SET_ELEM_DEAD_BIT, first_word);
}

/*
 * Test whether a set element has been marked dead.
 *
 * Reads NFT_SET_ELEM_DEAD_BIT in the first unsigned long of @ext with
 * test_bit(); the build-time check guarantees genmask is the very first
 * member, which the bit index relies on. The element is only read, so
 * the const qualifier of @ext is kept on the word pointer instead of
 * being cast away.
 *
 * Returns non-zero when the dead bit is set, 0 otherwise.
 */
static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
{
        const unsigned long *word = (const unsigned long *)ext;

        BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
        return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
}

/**
 * struct nft_trans - nf_tables object update in transaction
 *
 * @list: used internally
 * @net: struct net
 * @table: struct nft_table the object resides in
 * @msg_type: message type
 * @seq: netlink sequence number
 * @flags: modifiers to new request
 * @report: notify via unicast netlink message (1-bit field)
 * @put_net: net needs to be put (1-bit field)
 *
 * This is the information common to all objects in the transaction,
 * this must always be the first member of derived sub-types.
 * nft_ctx_update() copies @net, @table, @report, @flags and @seq from
 * here into a struct nft_ctx.
 */
struct nft_trans {
        struct list_head                list;
        struct net                        *net;
        struct nft_table                *table;
        int                                msg_type;
        u32                                seq;
        u16                                flags;
        u8                                report:1;
        u8                                put_net:1;
};

/**
 * struct nft_trans_binding - nf_tables object with binding support in transaction
 * @nft_trans:    base structure, MUST be first member
 * @binding_list: list of objects with possible bindings
 *
 * This is the base type used by objects that can be bound to a chain.
 * In this header it is embedded as the first member of
 * struct nft_trans_set and struct nft_trans_chain.
 */
struct nft_trans_binding {
        struct nft_trans nft_trans;
        struct list_head binding_list;
};

/**
 * struct nft_trans_rule - rule update in transaction
 * @nft_trans: base structure, first member
 * @rule:      rule this transaction entry refers to
 * @chain:     chain for the rule; nft_ctx_update() uses it as the
 *             context chain for rule messages
 * @flow:      flow rule pointer (presumably offload state -- confirm)
 * @rule_id:   rule id value
 * @bound:     bound flag
 */
struct nft_trans_rule {
        struct nft_trans                nft_trans;
        struct nft_rule                        *rule;
        struct nft_chain                *chain;
        struct nft_flow_rule                *flow;
        u32                                rule_id;
        bool                                bound;
};

/*
 * Accessors: each takes a struct nft_trans pointer that is the
 * nft_trans member of a struct nft_trans_rule and expands to an lvalue
 * for the named field of the enclosing structure.
 */
#define nft_trans_container_rule(trans)                        \
        container_of(trans, struct nft_trans_rule, nft_trans)
#define nft_trans_rule(trans)                                \
        nft_trans_container_rule(trans)->rule
#define nft_trans_flow_rule(trans)                        \
        nft_trans_container_rule(trans)->flow
#define nft_trans_rule_id(trans)                        \
        nft_trans_container_rule(trans)->rule_id
#define nft_trans_rule_bound(trans)                        \
        nft_trans_container_rule(trans)->bound
#define nft_trans_rule_chain(trans)        \
        nft_trans_container_rule(trans)->chain

/**
 * struct nft_trans_set - set update in transaction
 * @nft_trans_binding: base structure with binding support, first member
 * @list_trans_newset: list node (presumably for the per-net list of new
 *                     sets -- confirm)
 * @set:               set this transaction entry refers to
 * @set_id:            set id value
 * @gc_int:            gc interval value
 * @timeout:           timeout value
 * @update:            update flag
 * @bound:             bound flag
 * @size:              size value
 */
struct nft_trans_set {
        struct nft_trans_binding        nft_trans_binding;
        struct list_head                list_trans_newset;
        struct nft_set                        *set;
        u32                                set_id;
        u32                                gc_int;
        u64                                timeout;
        bool                                update;
        bool                                bound;
        u32                                size;
};

/*
 * Accessors: each takes a struct nft_trans pointer that is the
 * nft_trans_binding.nft_trans member of a struct nft_trans_set and
 * expands to an lvalue for the named field.
 */
#define nft_trans_container_set(t)        \
        container_of(t, struct nft_trans_set, nft_trans_binding.nft_trans)
#define nft_trans_set(trans)                                \
        nft_trans_container_set(trans)->set
#define nft_trans_set_id(trans)                                \
        nft_trans_container_set(trans)->set_id
#define nft_trans_set_bound(trans)                        \
        nft_trans_container_set(trans)->bound
#define nft_trans_set_update(trans)                        \
        nft_trans_container_set(trans)->update
#define nft_trans_set_timeout(trans)                        \
        nft_trans_container_set(trans)->timeout
#define nft_trans_set_gc_int(trans)                        \
        nft_trans_container_set(trans)->gc_int
#define nft_trans_set_size(trans)                        \
        nft_trans_container_set(trans)->size

/**
 * struct nft_trans_chain - chain update in transaction
 * @nft_trans_binding: base structure with binding support, first member
 * @chain:             chain this transaction entry refers to;
 *                     nft_ctx_update() uses it as the context chain for
 *                     chain messages
 * @name:              chain name pointer
 * @stats:             per-cpu stats pointer
 * @policy:            policy value
 * @update:            update flag
 * @bound:             bound flag
 * @chain_id:          chain id value
 * @basechain:         base chain pointer
 * @hook_list:         list of hooks
 */
struct nft_trans_chain {
        struct nft_trans_binding        nft_trans_binding;
        struct nft_chain                *chain;
        char                                *name;
        struct nft_stats __percpu        *stats;
        u8                                policy;
        bool                                update;
        bool                                bound;
        u32                                chain_id;
        struct nft_base_chain                *basechain;
        struct list_head                hook_list;
};

/*
 * Accessors: each takes a struct nft_trans pointer that is the
 * nft_trans_binding.nft_trans member of a struct nft_trans_chain and
 * expands to an lvalue for the named field.
 */
#define nft_trans_container_chain(t)        \
        container_of(t, struct nft_trans_chain, nft_trans_binding.nft_trans)
#define nft_trans_chain(trans)                                \
        nft_trans_container_chain(trans)->chain
#define nft_trans_chain_update(trans)                        \
        nft_trans_container_chain(trans)->update
#define nft_trans_chain_name(trans)                        \
        nft_trans_container_chain(trans)->name
#define nft_trans_chain_stats(trans)                        \
        nft_trans_container_chain(trans)->stats
#define nft_trans_chain_policy(trans)                        \
        nft_trans_container_chain(trans)->policy
#define nft_trans_chain_bound(trans)                        \
        nft_trans_container_chain(trans)->bound
#define nft_trans_chain_id(trans)                        \
        nft_trans_container_chain(trans)->chain_id
#define nft_trans_basechain(trans)                        \
        nft_trans_container_chain(trans)->basechain
#define nft_trans_chain_hooks(trans)                        \
        nft_trans_container_chain(trans)->hook_list

/**
 * struct nft_trans_table - table update in transaction
 * @nft_trans: base structure, first member
 * @update:    update flag
 */
struct nft_trans_table {
        struct nft_trans                nft_trans;
        bool                                update;
};

/* Accessors from the embedded struct nft_trans to the enclosing entry. */
#define nft_trans_container_table(trans)                \
        container_of(trans, struct nft_trans_table, nft_trans)
#define nft_trans_table_update(trans)                        \
        nft_trans_container_table(trans)->update

/*
 * Single-bit flags for element updates (timeout / expiration, going by
 * the names); presumably carried in struct nft_elem_update::flags --
 * confirm against the users.
 */
enum nft_trans_elem_flags {
        NFT_TRANS_UPD_TIMEOUT                = 0x1,
        NFT_TRANS_UPD_EXPIRATION        = 0x2,
};

/**
 * struct nft_elem_update - pending update values for one set element
 * @timeout:    timeout value
 * @expiration: expiration value
 * @flags:      flag byte (presumably enum nft_trans_elem_flags bits,
 *              telling which of the values are meaningful -- confirm)
 */
struct nft_elem_update {
        u64                                timeout;
        u64                                expiration;
        u8                                flags;
};

/**
 * struct nft_trans_one_elem - one element slot of struct nft_trans_elem
 * @priv:   element private data pointer
 * @update: pointer to the update values for this element
 */
struct nft_trans_one_elem {
        struct nft_elem_priv                *priv;
        struct nft_elem_update                *update;
};

/**
 * struct nft_trans_elem - set element update(s) in transaction
 * @nft_trans: base structure, first member
 * @set:       set the elements belong to
 * @bound:     bound flag
 * @nelems:    number of entries in @elems
 * @elems:     flexible array of element slots, bounds-annotated with
 *             __counted_by(nelems)
 */
struct nft_trans_elem {
        struct nft_trans                nft_trans;
        struct nft_set                        *set;
        bool                                bound;
        unsigned int                        nelems;
        struct nft_trans_one_elem        elems[] __counted_by(nelems);
};

/* Accessors from the embedded struct nft_trans to the enclosing entry. */
#define nft_trans_container_elem(t)                        \
        container_of(t, struct nft_trans_elem, nft_trans)
#define nft_trans_elem_set(trans)                        \
        nft_trans_container_elem(trans)->set
#define nft_trans_elem_set_bound(trans)                        \
        nft_trans_container_elem(trans)->bound

/**
 * struct nft_trans_obj - stateful object update in transaction
 * @nft_trans: base structure, first member
 * @obj:       object this transaction entry refers to
 * @newobj:    second object pointer (presumably the replacement used
 *             when @update is set -- confirm)
 * @update:    update flag
 */
struct nft_trans_obj {
        struct nft_trans                nft_trans;
        struct nft_object                *obj;
        struct nft_object                *newobj;
        bool                                update;
};

/* Accessors from the embedded struct nft_trans to the enclosing entry. */
#define nft_trans_container_obj(t)                        \
        container_of(t, struct nft_trans_obj, nft_trans)
#define nft_trans_obj(trans)                                \
        nft_trans_container_obj(trans)->obj
#define nft_trans_obj_newobj(trans)                        \
        nft_trans_container_obj(trans)->newobj
#define nft_trans_obj_update(trans)                        \
        nft_trans_container_obj(trans)->update

/**
 * struct nft_trans_flowtable - flow table update in transaction
 * @nft_trans: base structure, first member
 * @flowtable: flow table this transaction entry refers to
 * @hook_list: list of hooks
 * @flags:     flags value
 * @update:    update flag
 */
struct nft_trans_flowtable {
        struct nft_trans                nft_trans;
        struct nft_flowtable                *flowtable;
        struct list_head                hook_list;
        u32                                flags;
        bool                                update;
};

/* Accessors from the embedded struct nft_trans to the enclosing entry. */
#define nft_trans_container_flowtable(t)                \
        container_of(t, struct nft_trans_flowtable, nft_trans)
#define nft_trans_flowtable(trans)                        \
        nft_trans_container_flowtable(trans)->flowtable
#define nft_trans_flowtable_update(trans)                \
        nft_trans_container_flowtable(trans)->update
#define nft_trans_flowtable_hooks(trans)                \
        nft_trans_container_flowtable(trans)->hook_list
#define nft_trans_flowtable_flags(trans)                \
        nft_trans_container_flowtable(trans)->flags

/* Capacity of the priv[] array in struct nft_trans_gc. */
#define NFT_TRANS_GC_BATCHCOUNT        256

/**
 * struct nft_trans_gc - batch of set elements queued for garbage collection
 * @list:  list node
 * @net:   struct net
 * @set:   set the elements belong to
 * @seq:   sequence number value (presumably the gc sequence the batch
 *         was created under -- confirm)
 * @count: counter (presumably the number of used @priv slots -- confirm)
 * @priv:  element private data pointers, NFT_TRANS_GC_BATCHCOUNT slots
 * @rcu:   rcu head
 */
struct nft_trans_gc {
        struct list_head        list;
        struct net                *net;
        struct nft_set                *set;
        u32                        seq;
        u16                        count;
        struct nft_elem_priv        *priv[NFT_TRANS_GC_BATCHCOUNT];
        struct rcu_head                rcu;
};

static inline void nft_ctx_update(struct nft_ctx *ctx,
                                  const struct nft_trans *trans)
{
        switch (trans->msg_type) {
        case NFT_MSG_NEWRULE:
        case NFT_MSG_DELRULE:
        case NFT_MSG_DESTROYRULE:
                ctx->chain = nft_trans_rule_chain(trans);
                break;
        case NFT_MSG_NEWCHAIN:
        case NFT_MSG_DELCHAIN:
        case NFT_MSG_DESTROYCHAIN:
                ctx->chain = nft_trans_chain(trans);
                break;
        default:
                ctx->chain = NULL;
                break;
        }

        ctx->net = trans->net;
        ctx->table = trans->table;
        ctx->family = trans->table->family;
        ctx->report = trans->report;
        ctx->flags = trans->flags;
        ctx->seq = trans->seq;
}

/*
 * Garbage collection transaction API (struct nft_trans_gc).
 * Prototypes only; the bodies are not part of this section.
 */

/* Allocation and destruction of a gc batch. */
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
                                        unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_destroy(struct nft_trans_gc *trans);

/* Asynchronous queueing variants; these take the gc sequence. */
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
                                              unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);

/* Synchronous queueing variants. */
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);

/* Adds one element private pointer to the batch. */
void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);

/* Catch-all element handling, async and sync flavours. */
struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
                                                 unsigned int gc_seq);
struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc);

/* Deactivates the data of one set element. */
void nft_setelem_data_deactivate(const struct net *net,
                                 const struct nft_set *set,
                                 struct nft_elem_priv *elem_priv);

/* Init/exit of the filter chain type; init is __init and can fail (int return). */
int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void);

/* Init/exit of the route chain type; init is __init and returns nothing. */
void __init nft_chain_route_init(void);
void nft_chain_route_fini(void);

/* Flushes the transaction destroy work (see nftables_pernet.destroy_work). */
void nf_tables_trans_destroy_flush_work(struct net *net);

/* Conversions between a millisecond netlink value and 64-bit jiffies. */
int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result);
__be64 nf_jiffies64_to_msecs(u64 input);

#ifdef CONFIG_MODULES
/* printf-style module request; format string is argument 2. */
__printf(2, 3) int nft_request_module(struct net *net, const char *fmt, ...);
#else
/* Without CONFIG_MODULES nothing can be requested: always -ENOENT. */
static inline int nft_request_module(struct net *net, const char *fmt, ...)
{
        return -ENOENT;
}
#endif

/**
 * struct nftables_pernet - per network namespace nf_tables state
 * @tables:          list of tables
 * @commit_list:     commit list
 * @destroy_list:    destroy list
 * @commit_set_list: commit list for sets
 * @binding_list:    list of objects with bindings
 * @module_list:     module list
 * @notify_list:     notification list
 * @commit_mutex:    commit mutex
 * @table_handle:    table handle value (presumably the generator for
 *                   new table handles -- confirm)
 * @tstamp:          timestamp, returned by nft_net_tstamp()
 * @base_seq:        base sequence number
 * @gc_seq:          garbage collection sequence number
 * @validate_state:  validation state
 * @destroy_work:    work item for destruction
 *
 * Obtained for a given struct net through nft_pernet().
 */
struct nftables_pernet {
        struct list_head        tables;
        struct list_head        commit_list;
        struct list_head        destroy_list;
        struct list_head        commit_set_list;
        struct list_head        binding_list;
        struct list_head        module_list;
        struct list_head        notify_list;
        struct mutex                commit_mutex;
        u64                        table_handle;
        u64                        tstamp;
        unsigned int                base_seq;
        unsigned int                gc_seq;
        u8                        validate_state;
        struct work_struct        destroy_work;
};

/* net_generic() id under which the nf_tables per-net state is looked up. */
extern unsigned int nf_tables_net_id;

/* Return the nf_tables per-net state that belongs to @net. */
static inline struct nftables_pernet *nft_pernet(const struct net *net)
{
        struct nftables_pernet *pernet = net_generic(net, nf_tables_net_id);

        return pernet;
}

static inline u64 nft_net_tstamp(const struct net *net)
{
        return nft_pernet(net)->tstamp;
}

/*
 * Sentinel for an expression's ->reduce pointer: the integer 1 cast to a
 * pointer, tested by nft_reduce_is_readonly(). The cast is wrapped in
 * parentheses so the macro behaves as a single operand in any expression.
 */
#define __NFT_REDUCE_READONLY        1UL
#define NFT_REDUCE_READONLY        ((void *)__NFT_REDUCE_READONLY)

/* True when @expr's ops use the NFT_REDUCE_READONLY sentinel as ->reduce. */
static inline bool nft_reduce_is_readonly(const struct nft_expr *expr)
{
        if (expr->ops->reduce == NFT_REDUCE_READONLY)
                return true;

        return false;
}

/*
 * Register tracking helpers operating on struct nft_regs_track, indexed
 * by destination register @dreg. Prototypes only; the bodies are not
 * part of this section.
 */
void nft_reg_track_update(struct nft_regs_track *track,
                          const struct nft_expr *expr, u8 dreg, u8 len);
void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len);
void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg);

/*
 * True when register @dreg currently has a tracked selector, that
 * selector shares its ops with @expr, and the slot's num_reg is 0.
 */
static inline bool nft_reg_track_cmp(struct nft_regs_track *track,
                                     const struct nft_expr *expr, u8 dreg)
{
        if (!track->regs[dreg].selector)
                return false;

        if (track->regs[dreg].selector->ops != expr->ops)
                return false;

        return track->regs[dreg].num_reg == 0;
}

#endif /* _NET_NF_TABLES_H */




























































































































































    4 



















    4 







































































































































































    4 






    4 










    4 






































































































































































































































    4 





























    4 














    4 

















































    4 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/plist.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/freezer.h>

#include "futex.h"

/*
 * READ this before attempting to hack on futexes!
 *
 * Basic futex operation and ordering guarantees
 * =============================================
 *
 * The waiter reads the futex value in user space and calls
 * futex_wait(). This function computes the hash bucket and acquires
 * the hash bucket lock. After that it reads the futex user space value
 * again and verifies that the data has not changed. If it has not changed
 * it enqueues itself into the hash bucket, releases the hash bucket lock
 * and schedules.
 *
 * The waker side modifies the user space value of the futex and calls
 * futex_wake(). This function computes the hash bucket and acquires the
 * hash bucket lock. Then it looks for waiters on that futex in the hash
 * bucket and wakes them.
 *
 * In futex wake up scenarios where no tasks are blocked on a futex, taking
 * the hb spinlock can be avoided and simply return. In order for this
 * optimization to work, ordering guarantees must exist so that the waiter
 * being added to the list is acknowledged when the list is concurrently being
 * checked by the waker, avoiding scenarios like the following:
 *
 * CPU 0                               CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *   uval = *futex;
 *                                     *futex = newval;
 *                                     sys_futex(WAKE, futex);
 *                                       futex_wake(futex);
 *                                       if (queue_empty())
 *                                         return;
 *   if (uval == val)
 *      lock(hash_bucket(futex));
 *      queue();
 *     unlock(hash_bucket(futex));
 *     schedule();
 *
 * This would cause the waiter on CPU 0 to wait forever because it
 * missed the transition of the user space value from val to newval
 * and the waker did not find the waiter in the hash bucket queue.
 *
 * The correct serialization ensures that a waiter either observes
 * the changed user space value before blocking or is woken by a
 * concurrent waker:
 *
 * CPU 0                                 CPU 1
 * val = *futex;
 * sys_futex(WAIT, futex, val);
 *   futex_wait(futex, val);
 *
 *   waiters++; (a)
 *   smp_mb(); (A) <-- paired with -.
 *                                  |
 *   lock(hash_bucket(futex));      |
 *                                  |
 *   uval = *futex;                 |
 *                                  |        *futex = newval;
 *                                  |        sys_futex(WAKE, futex);
 *                                  |          futex_wake(futex);
 *                                  |
 *                                  `--------> smp_mb(); (B)
 *   if (uval == val)
 *     queue();
 *     unlock(hash_bucket(futex));
 *     schedule();                         if (waiters)
 *                                           lock(hash_bucket(futex));
 *   else                                    wake_waiters(futex);
 *     waiters--; (b)                        unlock(hash_bucket(futex));
 *
 * Where (A) orders the waiters increment and the futex value read through
 * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write
 * to futex and the waiters read (see futex_hb_waiters_pending()).
 *
 * This yields the following case (where X:=waiters, Y:=futex):
 *
 *        X = Y = 0
 *
 *        w[X]=1                w[Y]=1
 *        MB                MB
 *        r[Y]=y                r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible; which translates back into
 * the guarantee that we cannot both miss the futex variable change and the
 * enqueue.
 *
 * Note that a new waiter is accounted for in (a) even when it is possible that
 * the wait call can return error, in which case we backtrack from it in (b).
 * Refer to the comment in futex_q_lock().
 *
 * Similarly, in order to account for waiters being requeued on another
 * address we always increment the waiters for the destination bucket before
 * acquiring the lock. It then decrements them again after releasing it -
 * the code that actually moves the futex(es) between hash buckets (requeue_futex)
 * will do the additional required waiter count housekeeping. This is done for
 * double_lock_hb() and double_unlock_hb(), respectively.
 */

/*
 * Unqueue @q and publish the wakeup by clearing q->lock_ptr.
 *
 * Returns false (after a WARN) without touching @q when it carries PI state
 * or an rt_waiter; returns true once @q has been unqueued.
 */
bool __futex_wake_mark(struct futex_q *q)
{
        if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
                return false;

        __futex_unqueue(q);
        /*
         * The waiting task can free the futex_q as soon as q->lock_ptr = NULL
         * is written, without taking any locks. This is possible in the event
         * of a spurious wakeup, for example. A memory barrier is required here
         * to prevent the following store to lock_ptr from getting ahead of the
         * plist_del in __futex_unqueue().
         */
        smp_store_release(&q->lock_ptr, NULL);

        return true;
}

/*
 * Mark @q as woken and add its task to @wake_q.
 *
 * Must be called with the hash bucket lock held. The futex_q must not be
 * accessed afterwards, and the caller still has to run wake_up_q() later
 * for the wakeup to actually happen.
 */
void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q)
{
        struct task_struct *task = q->task;

        /* Take the task reference before @q gets unqueued. */
        get_task_struct(task);

        if (__futex_wake_mark(q)) {
                /*
                 * Defer the wakeup itself until after the hb->lock has
                 * been released.
                 */
                wake_q_add_safe(wake_q, task);
                return;
        }

        /* Nothing was unqueued, so give the reference back. */
        put_task_struct(task);
}

/*
 * Wake waiters queued on the futex at @uaddr whose bitset shares at least one
 * bit with @bitset. The scan stops once a wakeup brings the count to @nr_wake
 * or more.
 *
 * Returns the number of waiters marked for wakeup, -EINVAL for an empty
 * @bitset or when a matching waiter carries PI state or an rt_waiter, or the
 * error reported by get_futex_key().
 */
int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
{
        union futex_key key = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb;
        struct futex_q *cur, *tmp;
        DEFINE_WAKE_Q(wake_q);
        int ret;

        if (!bitset)
                return -EINVAL;

        ret = get_futex_key(uaddr, flags, &key, FUTEX_READ);
        if (unlikely(ret != 0))
                return ret;

        if ((flags & FLAGS_STRICT) && !nr_wake)
                return 0;

        hb = futex_hash(&key);

        /* Skip the bucket lock entirely when nobody is waiting. */
        if (!futex_hb_waiters_pending(hb))
                return ret;

        spin_lock(&hb->lock);

        plist_for_each_entry_safe(cur, tmp, &hb->chain, list) {
                if (!futex_match(&cur->key, &key))
                        continue;

                if (cur->pi_state || cur->rt_waiter) {
                        ret = -EINVAL;
                        break;
                }

                /* Only wake waiters that share a bit with the waker's bitset. */
                if (!(cur->bitset & bitset))
                        continue;

                cur->wake(&wake_q, cur);
                if (++ret >= nr_wake)
                        break;
        }

        spin_unlock(&hb->lock);
        wake_up_q(&wake_q);
        return ret;
}

/*
 * Decode @encoded_op, apply the atomic operation to the user word at @uaddr
 * and evaluate the encoded comparison against the word's previous value.
 *
 * Fields of @encoded_op as decoded below:
 *   0x70000000  operation, handed to arch_futex_atomic_op_inuser()
 *   0x0f000000  comparison (FUTEX_OP_CMP_*)
 *   0x00fff000  oparg, sign-extended from 12 bits
 *   0x00000fff  cmparg, sign-extended from 12 bits
 * When the FUTEX_OP_OPARG_SHIFT flag (shifted left by 28) is set, the operand
 * becomes 1 << oparg instead of oparg.
 *
 * Returns 1 or 0 for the outcome of the comparison, the non-zero result of
 * arch_futex_atomic_op_inuser() when the operation failed, or -ENOSYS for an
 * unknown comparison.
 */
static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
{
        unsigned int op = (encoded_op & 0x70000000) >> 28;
        unsigned int cmp = (encoded_op & 0x0f000000) >> 24;
        int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11);
        int cmparg = sign_extend32(encoded_op & 0x00000fff, 11);
        int oldval, ret;

        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
                if (oparg < 0 || oparg > 31) {
                        /*
                         * kill this print and return -EINVAL when userspace
                         * is sane again
                         */
                        pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
                                            current->comm, oparg);
                        oparg &= 31;
                }
                /*
                 * Shift in unsigned arithmetic: oparg may be 31, and shifting
                 * a signed 1 into the sign bit is undefined behaviour in
                 * ISO C. The resulting bit pattern is unchanged.
                 */
                oparg = (int)(1U << oparg);
        }

        /* The user access is attempted with page faults disabled. */
        pagefault_disable();
        ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
        pagefault_enable();
        if (ret)
                return ret;

        switch (cmp) {
        case FUTEX_OP_CMP_EQ:
                return oldval == cmparg;
        case FUTEX_OP_CMP_NE:
                return oldval != cmparg;
        case FUTEX_OP_CMP_LT:
                return oldval < cmparg;
        case FUTEX_OP_CMP_GE:
                return oldval >= cmparg;
        case FUTEX_OP_CMP_LE:
                return oldval <= cmparg;
        case FUTEX_OP_CMP_GT:
                return oldval > cmparg;
        default:
                return -ENOSYS;
        }
}

/*
 * Apply the operation encoded in @op to the futex word at @uaddr2, then wake
 * waiters queued on @uaddr1 and, when the comparison encoded in @op evaluated
 * true against the old value of *@uaddr2, waiters queued on @uaddr2 as well.
 *
 * Both hash buckets are locked across the operation and the wakeup scan.
 *
 * Returns the total number of waiters marked for wakeup, or a negative error.
 */
int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
                  int nr_wake, int nr_wake2, int op)
{
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
        struct futex_hash_bucket *hb1, *hb2;
        struct futex_q *this, *next;
        int ret, op_ret;
        DEFINE_WAKE_Q(wake_q);

retry:
        ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ);
        if (unlikely(ret != 0))
                return ret;
        ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE);
        if (unlikely(ret != 0))
                return ret;

        hb1 = futex_hash(&key1);
        hb2 = futex_hash(&key2);

retry_private:
        double_lock_hb(hb1, hb2);
        op_ret = futex_atomic_op_inuser(op, uaddr2);
        if (unlikely(op_ret < 0)) {
                /* Drop both bucket locks before dealing with the failure. */
                double_unlock_hb(hb1, hb2);

                if (!IS_ENABLED(CONFIG_MMU) ||
                    unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) {
                        /*
                         * we don't get EFAULT from MMU faults if we don't have
                         * an MMU, but we might get them from range checking
                         */
                        ret = op_ret;
                        return ret;
                }

                /*
                 * futex_atomic_op_inuser() runs the operation with page
                 * faults disabled; fault the page in here, unlocked, before
                 * trying again.
                 */
                if (op_ret == -EFAULT) {
                        ret = fault_in_user_writeable(uaddr2);
                        if (ret)
                                return ret;
                }

                cond_resched();
                /*
                 * Non-shared futexes reuse the keys computed above; shared
                 * ones go back and look their keys up again.
                 */
                if (!(flags & FLAGS_SHARED))
                        goto retry_private;
                goto retry;
        }

        /* ret is 0 at this point and counts the waiters woken on key1. */
        plist_for_each_entry_safe(this, next, &hb1->chain, list) {
                if (futex_match (&this->key, &key1)) {
                        if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
                                goto out_unlock;
                        }
                        this->wake(&wake_q, this);
                        if (++ret >= nr_wake)
                                break;
                }
        }

        /* The encoded comparison was true: wake waiters on key2 as well. */
        if (op_ret > 0) {
                /* Reuse op_ret as the counter for the second futex. */
                op_ret = 0;
                plist_for_each_entry_safe(this, next, &hb2->chain, list) {
                        if (futex_match (&this->key, &key2)) {
                                if (this->pi_state || this->rt_waiter) {
                                        ret = -EINVAL;
                                        goto out_unlock;
                                }
                                this->wake(&wake_q, this);
                                if (++op_ret >= nr_wake2)
                                        break;
                        }
                }
                ret += op_ret;
        }

out_unlock:
        double_unlock_hb(hb1, hb2);
        /* Tasks already placed on wake_q are woken even on the -EINVAL path. */
        wake_up_q(&wake_q);
        return ret;
}

static long futex_wait_restart(struct restart_block *restart);

/**
 * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal
 * @hb:                the futex hash bucket, must be locked by the caller
 * @q:                the futex_q to queue up on
 * @timeout:        the prepared hrtimer_sleeper, or null for no timeout
 *
 * Sets the task state, queues @q on @hb, arms @timeout if one was given and
 * calls schedule() unless @q has already been removed from the hash list or
 * the timer has already expired. The task state is TASK_RUNNING on return.
 */
void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
                            struct hrtimer_sleeper *timeout)
{
        /*
         * The task state is guaranteed to be set before another task can
         * wake it. set_current_state() is implemented using smp_store_mb() and
         * futex_queue() calls spin_unlock() upon completion, both serializing
         * access to the hash list and forcing another memory barrier.
         */
        set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
        futex_queue(q, hb, current);

        /* Arm the timer */
        if (timeout)
                hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS);

        /*
         * If we have been removed from the hash list, then another task
         * has tried to wake us, and we can skip the call to schedule().
         */
        if (likely(!plist_node_empty(&q->list))) {
                /*
                 * If the timer has already expired, current will already be
                 * flagged for rescheduling. Only call schedule if there
                 * is no timeout, or if it has yet to expire.
                 */
                if (!timeout || timeout->task)
                        schedule();
        }
        /* Whatever happened above, leave in the running state. */
        __set_current_state(TASK_RUNNING);
}

/**
 * futex_unqueue_multiple - Remove various futexes from their hash bucket
 * @v:     The list of futexes to unqueue
 * @count: Number of futexes in the list
 *
 * Calls futex_unqueue() on each of the first @count entries of @v and
 * remembers the highest index for which it returned 0. This can't fail.
 *
 * Return:
 *  - >=0 - Index of the last futex that was awoken;
 *  - -1  - No futex was awoken
 */
int futex_unqueue_multiple(struct futex_vector *v, int count)
{
        int last_woken = -1;
        int idx = 0;

        while (idx < count) {
                if (futex_unqueue(&v[idx].q) == 0)
                        last_woken = idx;
                idx++;
        }

        return last_woken;
}

/**
 * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes
 * @vs:                The futex list to wait on
 * @count:        The size of the list
 * @woken:        Index of the last woken futex, if any. Used to notify the
 *                caller that it can return this index to userspace (return parameter)
 *
 * Prepare multiple futexes in a single step and enqueue them. This may fail if
 * the futex list is invalid or if any futex was already awoken. On success the
 * task is ready to interruptible sleep.
 *
 * Return:
 *  -  1 - One of the futexes was woken by another thread
 *  -  0 - Success
 *  - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL
 */
int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken)
{
        struct futex_hash_bucket *hb;
        bool retry = false;
        int ret, i;
        u32 uval;

        /*
         * Enqueuing multiple futexes is tricky, because we need to enqueue
         * each futex on the list before dealing with the next one to avoid
         * deadlocking on the hash bucket. But, before enqueuing, we need to
         * make sure that current->state is TASK_INTERRUPTIBLE, so we don't
         * lose any wake events, which cannot be done before the get_futex_key
         * of the next key, because it calls get_user_pages, which can sleep.
         * Thus, we fetch the list of futexes keys in two steps, by first
         * pinning all the memory keys in the futex key, and only then we read
         * each key and queue the corresponding futex.
         *
         * Private futexes don't need to recalculate the hash on retry, so
         * skip get_futex_key() for them when retrying.
         */
retry:
        for (i = 0; i < count; i++) {
                if (!(vs[i].w.flags & FLAGS_SHARED) && retry)
                        continue;

                ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr),
                                    vs[i].w.flags,
                                    &vs[i].q.key, FUTEX_READ);

                if (unlikely(ret))
                        return ret;
        }

        set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);

        for (i = 0; i < count; i++) {
                u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr;
                struct futex_q *q = &vs[i].q;
                u32 val = vs[i].w.val;

                hb = futex_q_lock(q);
                ret = futex_get_value_locked(&uval, uaddr);

                if (!ret && uval == val) {
                        /*
                         * The bucket lock can't be held while dealing with the
                         * next futex. Queue each futex at this moment so hb can
                         * be unlocked.
                         */
                        futex_queue(q, hb, current);
                        continue;
                }

                /*
                 * Either the read failed or the value did not match: undo
                 * the setup done so far before deciding what to return.
                 */
                futex_q_unlock(hb);
                __set_current_state(TASK_RUNNING);

                /*
                 * Even if something went wrong, if we find out that a futex
                 * was woken, we don't return error and return this index to
                 * userspace
                 */
                *woken = futex_unqueue_multiple(vs, i);
                if (*woken >= 0)
                        return 1;

                if (ret) {
                        /*
                         * If we need to handle a page fault, we need to do so
                         * without any lock and any enqueued futex (otherwise
                         * we could lose some wakeup). So we do it here, after
                         * undoing all the work done so far. In success, we
                         * retry all the work.
                         */
                        if (get_user(uval, uaddr))
                                return -EFAULT;

                        retry = true;
                        goto retry;
                }

                /*
                 * ret is 0 here and the queueing branch above was not taken,
                 * so the value read differs from the expected one.
                 */
                if (uval != val)
                        return -EWOULDBLOCK;
        }

        return 0;
}

/**
 * futex_sleep_multiple - Check sleeping conditions and sleep
 * @vs:    List of futexes to wait for
 * @count: Length of vs
 * @to:    Timeout
 *
 * Calls schedule() only when the timeout (if any) has not expired yet and
 * every futex in the list still has its lock_ptr set, i.e. none of them has
 * been woken up.
 */
static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count,
                                 struct hrtimer_sleeper *to)
{
        unsigned int idx;

        /* A cleared ->task means the timer has already expired. */
        if (to && !to->task)
                return;

        for (idx = 0; idx < count; idx++) {
                if (!READ_ONCE(vs[idx].q.lock_ptr))
                        return;
        }

        schedule();
}

/**
 * futex_wait_multiple - Prepare to wait on and enqueue several futexes
 * @vs:    The list of futexes to wait on
 * @count: The number of objects
 * @to:    Timeout before giving up and returning to userspace
 *
 * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function
 * sleeps on a group of futexes and returns on the first futex that is
 * woken, or after the timeout has elapsed.
 *
 * Return:
 *  - >=0 - Hint to the futex that was awoken
 *  - <0  - On error
 */
int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
                        struct hrtimer_sleeper *to)
{
        int woken = 0;
        int ret;

        if (to)
                hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);

        for (;;) {
                ret = futex_wait_multiple_setup(vs, count, &woken);
                /* A futex was woken during setup: report its index. */
                if (ret > 0)
                        return woken;
                if (ret < 0)
                        return ret;

                futex_sleep_multiple(vs, count, to);

                __set_current_state(TASK_RUNNING);

                ret = futex_unqueue_multiple(vs, count);
                if (ret >= 0)
                        return ret;

                if (to && !to->task)
                        return -ETIMEDOUT;

                if (signal_pending(current))
                        return -ERESTARTSYS;

                /* Anything else is a spurious wakeup: set up and sleep again. */
        }
}

/**
 * futex_wait_setup() - Prepare to wait on a futex
 * @uaddr:        the futex userspace address
 * @val:        the expected value
 * @flags:        futex flags (FLAGS_SHARED, etc.)
 * @q:                the associated futex_q
 * @hb:                storage for hash_bucket pointer to be returned to caller
 *
 * Setup the futex_q and locate the hash_bucket.  Get the futex value and
 * compare it with the expected value.  Handle atomic faults internally.
 * Return with the hb lock held on success, and unlocked on failure.
 *
 * Return:
 *  -  0 - uaddr contains val and hb has been locked;
 *  - <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
 */
int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                     struct futex_q *q, struct futex_hash_bucket **hb)
{
        u32 uval;
        int ret;

        /*
         * Access the page AFTER the hash-bucket is locked.
         * Order is important:
         *
         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
         *
         * The basic logical guarantee of a futex is that it blocks ONLY
         * if cond(var) is known to be true at the time of blocking, for
         * any cond.  If we locked the hash-bucket after testing *uaddr, that
         * would open a race condition where we could block indefinitely with
         * cond(var) false, which would violate the guarantee.
         *
         * On the other hand, we insert q and release the hash-bucket only
         * after testing *uaddr.  This guarantees that futex_wait() will NOT
         * absorb a wakeup if *uaddr does not match the desired values
         * while the syscall executes.
         */
retry:
        ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ);
        if (unlikely(ret != 0))
                return ret;

retry_private:
        *hb = futex_q_lock(q);

        ret = futex_get_value_locked(&uval, uaddr);

        if (ret) {
                /*
                 * The read under the bucket lock failed: drop the lock, read
                 * the word again with get_user() and start over if that
                 * works.
                 */
                futex_q_unlock(*hb);

                ret = get_user(uval, uaddr);
                if (ret)
                        return ret;

                /* Non-shared futexes reuse the key computed above. */
                if (!(flags & FLAGS_SHARED))
                        goto retry_private;

                goto retry;
        }

        if (uval != val) {
                futex_q_unlock(*hb);
                ret = -EWOULDBLOCK;
        }

        /* ret is 0 here only when the value matched; hb is then still locked. */
        return ret;
}

/*
 * Wait on the futex at @uaddr as long as it contains @val.
 *
 * Returns 0 once woken, -EINVAL for an empty @bitset, -ETIMEDOUT when @to has
 * expired, -ERESTARTSYS when a signal is pending, or the error reported by
 * futex_wait_setup(). Spurious wakeups restart the wait.
 */
int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
                 struct hrtimer_sleeper *to, u32 bitset)
{
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
        int ret;

        if (!bitset)
                return -EINVAL;

        q.bitset = bitset;

        for (;;) {
                /*
                 * Prepare to wait on uaddr. On success, hb->lock is held
                 * and q is initialized.
                 */
                ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
                if (ret)
                        return ret;

                /* Queue up and wait for a wakeup, a timeout or a signal. */
                futex_wait_queue(hb, &q, to);

                /* Woken and already unqueued by the waker: success. */
                if (!futex_unqueue(&q))
                        return 0;

                if (to && !to->task)
                        return -ETIMEDOUT;

                /*
                 * A pending signal is the expected reason for getting here;
                 * without one this was a spurious wakeup, so wait again.
                 */
                if (signal_pending(current))
                        return -ERESTARTSYS;
        }
}

/*
 * Wait on a futex with an optional absolute timeout.
 *
 * Sets up the timer for @abs_time, runs __futex_wait() and tears the timer
 * down again. When no timer was set up the result is returned as is. With a
 * timer, an -ERESTARTSYS result is turned into a restart block that re-enters
 * the wait through futex_wait_restart().
 */
int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset)
{
        struct hrtimer_sleeper sleeper, *to;
        struct restart_block *rb;
        int ret;

        to = futex_setup_timer(abs_time, &sleeper, flags,
                               current->timer_slack_ns);

        ret = __futex_wait(uaddr, flags, val, to, bitset);

        /* Without a timer there is nothing to tear down. */
        if (!to)
                return ret;

        hrtimer_cancel(&to->timer);
        destroy_hrtimer_on_stack(&to->timer);

        if (ret != -ERESTARTSYS)
                return ret;

        /* Record the arguments so the wait can be restarted. */
        rb = &current->restart_block;
        rb->futex.uaddr = uaddr;
        rb->futex.val = val;
        rb->futex.time = *abs_time;
        rb->futex.bitset = bitset;
        rb->futex.flags = flags | FLAGS_HAS_TIMEOUT;

        return set_restart_fn(rb, futex_wait_restart);
}

/*
 * Restart handler installed by futex_wait(): re-issue the wait with the
 * arguments saved in the restart block.
 */
static long futex_wait_restart(struct restart_block *restart)
{
        u32 __user *uaddr = restart->futex.uaddr;
        ktime_t deadline;
        ktime_t *abs_time = NULL;

        /* Only pass a timeout along when one was recorded. */
        if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
                deadline = restart->futex.time;
                abs_time = &deadline;
        }

        /* Reset the handler; futex_wait() sets it again if it has to. */
        restart->fn = do_no_restart_syscall;

        return (long)futex_wait(uaddr, restart->futex.flags,
                                restart->futex.val, abs_time,
                                restart->futex.bitset);
}




























  227 
























  252 
  253 

   84 
  227 

  229 
  227 










  108 
  141 
  119 




  161 
  161 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/flush.c
 *
 * Copyright (C) 1995-2002 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/export.h>
#include <linux/mm.h>
#include <linux/libnvdimm.h>
#include <linux/pagemap.h>

#include <asm/cacheflush.h>
#include <asm/cache.h>
#include <asm/tlbflush.h>

/*
 * Synchronise the instruction cache with the data cache for [start, end).
 * With an aliasing I-cache the D-cache range is cleaned and the whole I-cache
 * invalidated; otherwise the range is cleaned and invalidated in one call.
 */
void sync_icache_aliases(unsigned long start, unsigned long end)
{
        if (!icache_is_aliasing()) {
                /*
                 * Don't issue kick_all_cpus_sync() after I-cache invalidation
                 * for user mappings.
                 */
                caches_clean_inval_pou(start, end);
                return;
        }

        dcache_clean_pou(start, end);
        icache_inval_all_pou();
}

/* Sync the I-cache over [start, end) when @vma is mapped executable. */
static void flush_ptrace_access(struct vm_area_struct *vma, unsigned long start,
                                unsigned long end)
{
        if (!(vma->vm_flags & VM_EXEC))
                return;

        sync_icache_aliases(start, end);
}

/*
 * Copy data into a page which is mapped into a different process's address
 * space.  Really, we want to allow our "user space" model to handle this.
 *
 * Copies @len bytes from @src to @dst and then lets flush_ptrace_access()
 * sync the I-cache over the written range if @vma is executable.  @page and
 * @uaddr are not used here.
 */
void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
                       unsigned long uaddr, void *dst, const void *src,
                       unsigned long len)
{
        unsigned long start = (unsigned long)dst;

        memcpy(dst, src, len);
        flush_ptrace_access(vma, start, start + len);
}

/*
 * Sync the I-cache for the folio behind @pte unless the folio is already
 * flagged PG_dcache_clean, then set that flag.
 */
void __sync_icache_dcache(pte_t pte)
{
        struct folio *folio = page_folio(pte_page(pte));
        unsigned long start;

        if (test_bit(PG_dcache_clean, &folio->flags))
                return;

        start = (unsigned long)folio_address(folio);
        sync_icache_aliases(start, start + folio_size(folio));
        set_bit(PG_dcache_clean, &folio->flags);
}
EXPORT_SYMBOL_GPL(__sync_icache_dcache);

/*
 * Called when the kernel has modified a page.  Clearing PG_dcache_clean marks
 * the folio as dirty so that it is flushed later when mapped in user space
 * (if executable, see __sync_icache_dcache).
 */
void flush_dcache_folio(struct folio *folio)
{
        /* Nothing to do when the flag is not set in the first place. */
        if (!test_bit(PG_dcache_clean, &folio->flags))
                return;

        clear_bit(PG_dcache_clean, &folio->flags);
}
EXPORT_SYMBOL(flush_dcache_folio);

/* Page-based wrapper: forwards to flush_dcache_folio() for the page's folio. */
void flush_dcache_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        flush_dcache_folio(folio);
}
EXPORT_SYMBOL(flush_dcache_page);

/*
 * Additional functions defined in assembly.
 */
EXPORT_SYMBOL(caches_clean_inval_pou);

#ifdef CONFIG_ARCH_HAS_PMEM_API
/* Clean the data cache over the @size bytes starting at @addr. */
void arch_wb_cache_pmem(void *addr, size_t size)
{
        unsigned long start = (unsigned long)addr;

        /* Ensure order against any prior non-cacheable writes */
        dmb(osh);
        dcache_clean_pop(start, start + size);
}
EXPORT_SYMBOL_GPL(arch_wb_cache_pmem);

/* Invalidate the data cache over the @size bytes starting at @addr. */
void arch_invalidate_pmem(void *addr, size_t size)
{
        unsigned long start = (unsigned long)addr;

        dcache_inval_poc(start, start + size);
}
EXPORT_SYMBOL_GPL(arch_invalidate_pmem);
#endif






















  127 




















    1 






















    5 


















































































































































  103 








































































































































































    1 

























































    1 



































  206 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
/* SPDX-License-Identifier: GPL-2.0 */
#if !defined(_TRACE_KVM_MAIN_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_MAIN_H

#include <linux/tracepoint.h>

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm

#define ERSN(x) { KVM_EXIT_##x, "KVM_EXIT_" #x }

#define kvm_trace_exit_reason                                                \
        ERSN(UNKNOWN), ERSN(EXCEPTION), ERSN(IO), ERSN(HYPERCALL),        \
        ERSN(DEBUG), ERSN(HLT), ERSN(MMIO), ERSN(IRQ_WINDOW_OPEN),        \
        ERSN(SHUTDOWN), ERSN(FAIL_ENTRY), ERSN(INTR), ERSN(SET_TPR),        \
        ERSN(TPR_ACCESS), ERSN(S390_SIEIC), ERSN(S390_RESET), ERSN(DCR),\
        ERSN(NMI), ERSN(INTERNAL_ERROR), ERSN(OSI), ERSN(PAPR_HCALL),        \
        ERSN(S390_UCONTROL), ERSN(WATCHDOG), ERSN(S390_TSCH), ERSN(EPR),\
        ERSN(SYSTEM_EVENT), ERSN(S390_STSI), ERSN(IOAPIC_EOI),          \
        ERSN(HYPERV), ERSN(ARM_NISV), ERSN(X86_RDMSR), ERSN(X86_WRMSR)

/*
 * kvm_userspace_exit: records an exit reason together with an errno value.
 *
 * Output: when errno is negative the reason string is "restart" (errno is
 * -EINTR) or "error" (any other negative value) and the number in
 * parentheses is the positive errno. Otherwise the reason is printed
 * symbolically via kvm_trace_exit_reason followed by its numeric value.
 */
TRACE_EVENT(kvm_userspace_exit,
            TP_PROTO(__u32 reason, int errno),
            TP_ARGS(reason, errno),

        TP_STRUCT__entry(
                __field(        __u32,                reason                )
                __field(        int,                errno                )
        ),

        TP_fast_assign(
                __entry->reason                = reason;
                __entry->errno                = errno;
        ),

        TP_printk("reason %s (%d)",
                  __entry->errno < 0 ?
                  (__entry->errno == -EINTR ? "restart" : "error") :
                  __print_symbolic(__entry->reason, kvm_trace_exit_reason),
                  __entry->errno < 0 ? -__entry->errno : __entry->reason)
);

/*
 * kvm_vcpu_wakeup: records a duration in nanoseconds plus two flags.
 *
 * Output: "wait" or "poll" depending on @waited, the time in ns, and
 * whether polling is reported as "valid" or "invalid" per @valid.
 *
 * NOTE(review): ns is stored as __u64 but printed with %lld (signed).
 */
TRACE_EVENT(kvm_vcpu_wakeup,
            TP_PROTO(__u64 ns, bool waited, bool valid),
            TP_ARGS(ns, waited, valid),

        TP_STRUCT__entry(
                __field(        __u64,                ns                )
                __field(        bool,                waited                )
                __field(        bool,                valid                )
        ),

        TP_fast_assign(
                __entry->ns                = ns;
                __entry->waited                = waited;
                __entry->valid                = valid;
        ),

        TP_printk("%s time %lld ns, polling %s",
                  __entry->waited ? "wait" : "poll",
                  __entry->ns,
                  __entry->valid ? "valid" : "invalid")
);

#if defined(CONFIG_HAVE_KVM_IRQCHIP)
/*
 * kvm_set_irq: records the GSI number, the level and the interrupt source
 * id passed by the caller, and prints all three numerically.
 */
TRACE_EVENT(kvm_set_irq,
        TP_PROTO(unsigned int gsi, int level, int irq_source_id),
        TP_ARGS(gsi, level, irq_source_id),

        TP_STRUCT__entry(
                __field(        unsigned int,        gsi                )
                __field(        int,                level                )
                __field(        int,                irq_source_id        )
        ),

        TP_fast_assign(
                __entry->gsi                = gsi;
                __entry->level                = level;
                __entry->irq_source_id        = irq_source_id;
        ),

        TP_printk("gsi %u level %d source %d",
                  __entry->gsi, __entry->level, __entry->irq_source_id)
);
#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */

#if defined(__KVM_HAVE_IOAPIC)
/*
 * Names for the eight values of the 3-bit delivery-mode field. The IOAPIC
 * and MSI events in this file extract that field as bits 8-10 of the traced
 * value and pass this table to __print_symbolic().
 */
#define kvm_deliver_mode                \
        {0x0, "Fixed"},                        \
        {0x1, "LowPrio"},                \
        {0x2, "SMI"},                        \
        {0x3, "Res3"},                        \
        {0x4, "NMI"},                        \
        {0x5, "INIT"},                        \
        {0x6, "SIPI"},                        \
        {0x7, "ExtINT"}

/*
 * kvm_ioapic_set_irq: records a 64-bit entry value @e, a pin number and a
 * "coalesced" flag.
 *
 * The printk decodes @e as follows:
 *   bits 56-63  destination ("dst")
 *   bits  0-7   vector ("vec")
 *   bits  8-10  delivery mode, named via kvm_deliver_mode
 *   bit  11     "logical" if set, else "physical"
 *   bit  15     "level" if set, else "edge"
 *   bit  16     appends "|masked" if set
 * " (coalesced)" is appended when @coalesced is true.
 *
 * NOTE(review): @e is presumably an IOAPIC redirection table entry; confirm
 * against the caller. pin is stored as int but printed with %u.
 */
TRACE_EVENT(kvm_ioapic_set_irq,
            TP_PROTO(__u64 e, int pin, bool coalesced),
            TP_ARGS(e, pin, coalesced),

        TP_STRUCT__entry(
                __field(        __u64,                e                )
                __field(        int,                pin                )
                __field(        bool,                coalesced        )
        ),

        TP_fast_assign(
                __entry->e                = e;
                __entry->pin                = pin;
                __entry->coalesced        = coalesced;
        ),

        TP_printk("pin %u dst %x vec %u (%s|%s|%s%s)%s",
                  __entry->pin, (u8)(__entry->e >> 56), (u8)__entry->e,
                  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->e & (1<<11)) ? "logical" : "physical",
                  (__entry->e & (1<<15)) ? "level" : "edge",
                  (__entry->e & (1<<16)) ? "|masked" : "",
                  __entry->coalesced ? " (coalesced)" : "")
);

/*
 * kvm_ioapic_delayed_eoi_inj: records a single 64-bit entry value @e and
 * prints it with the same bit decoding as kvm_ioapic_set_irq: destination
 * (bits 56-63), vector (bits 0-7), delivery mode (bits 8-10),
 * logical/physical (bit 11), level/edge (bit 15) and "|masked" (bit 16).
 */
TRACE_EVENT(kvm_ioapic_delayed_eoi_inj,
            TP_PROTO(__u64 e),
            TP_ARGS(e),

        TP_STRUCT__entry(
                __field(        __u64,                e                )
        ),

        TP_fast_assign(
                __entry->e                = e;
        ),

        TP_printk("dst %x vec %u (%s|%s|%s%s)",
                  (u8)(__entry->e >> 56), (u8)__entry->e,
                  __print_symbolic((__entry->e >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->e & (1<<11)) ? "logical" : "physical",
                  (__entry->e & (1<<15)) ? "level" : "edge",
                  (__entry->e & (1<<16)) ? "|masked" : "")
);

/*
 * kvm_msi_set_irq: records the 64-bit MSI address and data values.
 *
 * The printk decodes them as follows:
 *   dst   address bits 12-19 in the low byte, OR'ed with address bits
 *         40-63 placed at bits 8-31 of the printed value
 *   vec   data bits 0-7
 *   mode  data bits 8-10, named via kvm_deliver_mode
 *   address bit 2   "logical" if set, else "physical"
 *   data bit 15     "level" if set, else "edge"
 *   address bit 3   appends "|rh" if set
 */
TRACE_EVENT(kvm_msi_set_irq,
            TP_PROTO(__u64 address, __u64 data),
            TP_ARGS(address, data),

        TP_STRUCT__entry(
                __field(        __u64,                address                )
                __field(        __u64,                data                )
        ),

        TP_fast_assign(
                __entry->address        = address;
                __entry->data                = data;
        ),

        TP_printk("dst %llx vec %u (%s|%s|%s%s)",
                  (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
                  (u8)__entry->data,
                  __print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
                  (__entry->address & (1<<2)) ? "logical" : "physical",
                  (__entry->data & (1<<15)) ? "level" : "edge",
                  (__entry->address & (1<<3)) ? "|rh" : "")
);

/*
 * Names for the KVM_IRQCHIP_* identifiers. Only defined when
 * __KVM_HAVE_IOAPIC is set; kvm_ack_irq tests for this macro to decide
 * whether to print the irqchip symbolically or as a number.
 */
#define kvm_irqchips                                                \
        {KVM_IRQCHIP_PIC_MASTER,        "PIC master"},                \
        {KVM_IRQCHIP_PIC_SLAVE,                "PIC slave"},                \
        {KVM_IRQCHIP_IOAPIC,                "IOAPIC"}

#endif /* defined(__KVM_HAVE_IOAPIC) */

#if defined(CONFIG_HAVE_KVM_IRQCHIP)

/*
 * Format string and argument list for kvm_ack_irq. When the kvm_irqchips
 * name table exists the irqchip is printed by name (%s); otherwise it is
 * printed as a plain number (%d).
 */
#ifdef kvm_irqchips
#define kvm_ack_irq_string "irqchip %s pin %u"
#define kvm_ack_irq_parm  __print_symbolic(__entry->irqchip, kvm_irqchips), __entry->pin
#else
#define kvm_ack_irq_string "irqchip %d pin %u"
#define kvm_ack_irq_parm  __entry->irqchip, __entry->pin
#endif

/*
 * kvm_ack_irq: records the irqchip identifier and pin number passed by the
 * caller; output format is selected by the macros above.
 */
TRACE_EVENT(kvm_ack_irq,
        TP_PROTO(unsigned int irqchip, unsigned int pin),
        TP_ARGS(irqchip, pin),

        TP_STRUCT__entry(
                __field(        unsigned int,        irqchip                )
                __field(        unsigned int,        pin                )
        ),

        TP_fast_assign(
                __entry->irqchip        = irqchip;
                __entry->pin                = pin;
        ),

        TP_printk(kvm_ack_irq_string, kvm_ack_irq_parm)
);

#endif /* defined(CONFIG_HAVE_KVM_IRQCHIP) */



/* Access types accepted as the @type argument of the kvm_mmio event. */
#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
#define KVM_TRACE_MMIO_READ 1
#define KVM_TRACE_MMIO_WRITE 2

/* Names printed for the access types above. */
#define kvm_trace_symbol_mmio \
        { KVM_TRACE_MMIO_READ_UNSATISFIED, "unsatisfied-read" }, \
        { KVM_TRACE_MMIO_READ, "read" }, \
        { KVM_TRACE_MMIO_WRITE, "write" }

/*
 * kvm_mmio: records an access type, length, guest physical address and
 * value.
 *
 * @val is a pointer and may be NULL. The recorded value starts as zero;
 * when @val is non-NULL, at most sizeof(u64) bytes (or @len bytes if that
 * is smaller) are copied from it, so longer accesses are truncated and
 * shorter ones are zero-padded in the remaining bytes.
 */
TRACE_EVENT(kvm_mmio,
        TP_PROTO(int type, int len, u64 gpa, void *val),
        TP_ARGS(type, len, gpa, val),

        TP_STRUCT__entry(
                __field(        u32,        type                )
                __field(        u32,        len                )
                __field(        u64,        gpa                )
                __field(        u64,        val                )
        ),

        TP_fast_assign(
                __entry->type                = type;
                __entry->len                = len;
                __entry->gpa                = gpa;
                __entry->val                = 0;
                if (val)
                        memcpy(&__entry->val, val,
                               min_t(u32, sizeof(__entry->val), len));
        ),

        TP_printk("mmio %s len %u gpa 0x%llx val 0x%llx",
                  __print_symbolic(__entry->type, kvm_trace_symbol_mmio),
                  __entry->len, __entry->gpa, __entry->val)
);

/* Access types accepted as the @type argument of the kvm_iocsr event. */
#define KVM_TRACE_IOCSR_READ_UNSATISFIED 0
#define KVM_TRACE_IOCSR_READ 1
#define KVM_TRACE_IOCSR_WRITE 2

/* Names printed for the access types above. */
#define kvm_trace_symbol_iocsr \
        { KVM_TRACE_IOCSR_READ_UNSATISFIED, "unsatisfied-read" }, \
        { KVM_TRACE_IOCSR_READ, "read" }, \
        { KVM_TRACE_IOCSR_WRITE, "write" }

/*
 * kvm_iocsr: same layout and value handling as kvm_mmio, printed with an
 * "iocsr" prefix.
 *
 * @val is a pointer and may be NULL. The recorded value starts as zero;
 * when @val is non-NULL, at most sizeof(u64) bytes (or @len bytes if that
 * is smaller) are copied from it.
 */
TRACE_EVENT(kvm_iocsr,
        TP_PROTO(int type, int len, u64 gpa, void *val),
        TP_ARGS(type, len, gpa, val),

        TP_STRUCT__entry(
                __field(        u32,        type        )
                __field(        u32,        len        )
                __field(        u64,        gpa        )
                __field(        u64,        val        )
        ),

        TP_fast_assign(
                __entry->type                = type;
                __entry->len                = len;
                __entry->gpa                = gpa;
                __entry->val                = 0;
                if (val)
                        memcpy(&__entry->val, val,
                               min_t(u32, sizeof(__entry->val), len));
        ),

        TP_printk("iocsr %s len %u gpa 0x%llx val 0x%llx",
                  __print_symbolic(__entry->type, kvm_trace_symbol_iocsr),
                  __entry->len, __entry->gpa, __entry->val)
);

/* Names for the @load argument of kvm_fpu: 0 is "unload", 1 is "load". */
#define kvm_fpu_load_symbol        \
        {0, "unload"},                \
        {1, "load"}

/*
 * kvm_fpu: records a single load/unload indicator and prints it by name
 * using kvm_fpu_load_symbol.
 */
TRACE_EVENT(kvm_fpu,
        TP_PROTO(int load),
        TP_ARGS(load),

        TP_STRUCT__entry(
                __field(        u32,                load                )
        ),

        TP_fast_assign(
                __entry->load                = load;
        ),

        TP_printk("%s", __print_symbolic(__entry->load, kvm_fpu_load_symbol))
);

#ifdef CONFIG_KVM_ASYNC_PF
/*
 * Event class shared by kvm_try_async_get_page and
 * kvm_async_pf_repeated_fault: records a gva and a gfn (both 64-bit) and
 * prints them in hexadecimal.
 */
DECLARE_EVENT_CLASS(kvm_async_get_page_class,

        TP_PROTO(u64 gva, u64 gfn),

        TP_ARGS(gva, gfn),

        TP_STRUCT__entry(
                __field(__u64, gva)
                __field(u64, gfn)
        ),

        TP_fast_assign(
                __entry->gva = gva;
                __entry->gfn = gfn;
        ),

        TP_printk("gva = %#llx, gfn = %#llx", __entry->gva, __entry->gfn)
);

/* Instance of kvm_async_get_page_class; takes (gva, gfn). */
DEFINE_EVENT(kvm_async_get_page_class, kvm_try_async_get_page,

        TP_PROTO(u64 gva, u64 gfn),

        TP_ARGS(gva, gfn)
);

/* Instance of kvm_async_get_page_class; takes (gva, gfn). */
DEFINE_EVENT(kvm_async_get_page_class, kvm_async_pf_repeated_fault,

        TP_PROTO(u64 gva, u64 gfn),

        TP_ARGS(gva, gfn)
);

/*
 * Event class shared by kvm_async_pf_not_present and kvm_async_pf_ready:
 * records a 64-bit token and a gva and prints both in hexadecimal.
 */
DECLARE_EVENT_CLASS(kvm_async_pf_nopresent_ready,

        TP_PROTO(u64 token, u64 gva),

        TP_ARGS(token, gva),

        TP_STRUCT__entry(
                __field(__u64, token)
                __field(__u64, gva)
        ),

        TP_fast_assign(
                __entry->token = token;
                __entry->gva = gva;
        ),

        TP_printk("token %#llx gva %#llx", __entry->token, __entry->gva)

);

/* Instance of kvm_async_pf_nopresent_ready; takes (token, gva). */
DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_not_present,

        TP_PROTO(u64 token, u64 gva),

        TP_ARGS(token, gva)
);

/* Instance of kvm_async_pf_nopresent_ready; takes (token, gva). */
DEFINE_EVENT(kvm_async_pf_nopresent_ready, kvm_async_pf_ready,

        TP_PROTO(u64 token, u64 gva),

        TP_ARGS(token, gva)
);

/*
 * kvm_async_pf_completed: records an address (unsigned long) and a gva
 * (u64). Note that the output prints the gva first, then the address.
 */
TRACE_EVENT(
        kvm_async_pf_completed,
        TP_PROTO(unsigned long address, u64 gva),
        TP_ARGS(address, gva),

        TP_STRUCT__entry(
                __field(unsigned long, address)
                __field(u64, gva)
                ),

        TP_fast_assign(
                __entry->address = address;
                __entry->gva = gva;
                ),

        TP_printk("gva %#llx address %#lx",  __entry->gva,
                  __entry->address)
);

#endif

/*
 * kvm_halt_poll_ns: records a change of halt_poll_ns for one vCPU.
 *
 * @grow selects the word printed between the values ("grow" or "shrink").
 * Output: "vcpu <id>: halt_poll_ns <new> (<grow|shrink> <old>)".
 */
TRACE_EVENT(kvm_halt_poll_ns,
        TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new,
                 unsigned int old),
        TP_ARGS(grow, vcpu_id, new, old),

        TP_STRUCT__entry(
                __field(bool, grow)
                __field(unsigned int, vcpu_id)
                __field(unsigned int, new)
                __field(unsigned int, old)
        ),

        TP_fast_assign(
                __entry->grow           = grow;
                __entry->vcpu_id        = vcpu_id;
                __entry->new            = new;
                __entry->old            = old;
        ),

        TP_printk("vcpu %u: halt_poll_ns %u (%s %u)",
                        __entry->vcpu_id,
                        __entry->new,
                        __entry->grow ? "grow" : "shrink",
                        __entry->old)
);

/* Convenience wrappers that fix the @grow argument of kvm_halt_poll_ns. */
#define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \
        trace_kvm_halt_poll_ns(true, vcpu_id, new, old)
#define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
        trace_kvm_halt_poll_ns(false, vcpu_id, new, old)

/*
 * kvm_dirty_ring_push: snapshots the ring's index, dirty_index and
 * reset_index fields together with the slot and offset passed by the
 * caller. The "used" figure in the output is computed at print time as
 * dirty_index - reset_index.
 */
TRACE_EVENT(kvm_dirty_ring_push,
        TP_PROTO(struct kvm_dirty_ring *ring, u32 slot, u64 offset),
        TP_ARGS(ring, slot, offset),

        TP_STRUCT__entry(
                __field(int, index)
                __field(u32, dirty_index)
                __field(u32, reset_index)
                __field(u32, slot)
                __field(u64, offset)
        ),

        TP_fast_assign(
                __entry->index          = ring->index;
                __entry->dirty_index    = ring->dirty_index;
                __entry->reset_index    = ring->reset_index;
                __entry->slot           = slot;
                __entry->offset         = offset;
        ),

        TP_printk("ring %d: dirty 0x%x reset 0x%x "
                  "slot %u offset 0x%llx (used %u)",
                  __entry->index, __entry->dirty_index,
                  __entry->reset_index,  __entry->slot, __entry->offset,
                  __entry->dirty_index - __entry->reset_index)
);

/*
 * kvm_dirty_ring_reset: snapshots the ring's index, dirty_index and
 * reset_index fields. The "used" figure in the output is computed at print
 * time as dirty_index - reset_index.
 */
TRACE_EVENT(kvm_dirty_ring_reset,
        TP_PROTO(struct kvm_dirty_ring *ring),
        TP_ARGS(ring),

        TP_STRUCT__entry(
                __field(int, index)
                __field(u32, dirty_index)
                __field(u32, reset_index)
        ),

        TP_fast_assign(
                __entry->index          = ring->index;
                __entry->dirty_index    = ring->dirty_index;
                __entry->reset_index    = ring->reset_index;
        ),

        TP_printk("ring %d: dirty 0x%x reset 0x%x (used %u)",
                  __entry->index, __entry->dirty_index, __entry->reset_index,
                  __entry->dirty_index - __entry->reset_index)
);

/* kvm_dirty_ring_exit: records and prints the vcpu_id of the given vCPU. */
TRACE_EVENT(kvm_dirty_ring_exit,
        TP_PROTO(struct kvm_vcpu *vcpu),
        TP_ARGS(vcpu),

        TP_STRUCT__entry(
            __field(int, vcpu_id)
        ),

        TP_fast_assign(
            __entry->vcpu_id = vcpu->vcpu_id;
        ),

        TP_printk("vcpu %d", __entry->vcpu_id)
);

/*
 * kvm_unmap_hva_range: records the start and end addresses of a range and
 * prints them as zero-padded hexadecimal under the "mmu notifier unmap
 * range" label.
 */
TRACE_EVENT(kvm_unmap_hva_range,
        TP_PROTO(unsigned long start, unsigned long end),
        TP_ARGS(start, end),

        TP_STRUCT__entry(
                __field(        unsigned long,        start                )
                __field(        unsigned long,        end                )
        ),

        TP_fast_assign(
                __entry->start                = start;
                __entry->end                = end;
        ),

        TP_printk("mmu notifier unmap range: %#016lx -- %#016lx",
                  __entry->start, __entry->end)
);

/*
 * kvm_age_hva: records the start and end addresses of a range and prints
 * them as zero-padded hexadecimal under the "mmu notifier age hva" label.
 */
TRACE_EVENT(kvm_age_hva,
        TP_PROTO(unsigned long start, unsigned long end),
        TP_ARGS(start, end),

        TP_STRUCT__entry(
                __field(        unsigned long,        start                )
                __field(        unsigned long,        end                )
        ),

        TP_fast_assign(
                __entry->start                = start;
                __entry->end                = end;
        ),

        TP_printk("mmu notifier age hva: %#016lx -- %#016lx",
                  __entry->start, __entry->end)
);

/*
 * kvm_test_age_hva: records a single address and prints it as zero-padded
 * hexadecimal under the "mmu notifier test age hva" label.
 */
TRACE_EVENT(kvm_test_age_hva,
        TP_PROTO(unsigned long hva),
        TP_ARGS(hva),

        TP_STRUCT__entry(
                __field(        unsigned long,        hva                )
        ),

        TP_fast_assign(
                __entry->hva                = hva;
        ),

        TP_printk("mmu notifier test age hva: %#016lx", __entry->hva)
);

#endif /* _TRACE_KVM_MAIN_H */

/* This part must be outside protection */
#include <trace/define_trace.h>











































































































































































































































































































































































































































































































































































   20 








   21 

    6 

   14 



   13 



   14 










    1 


   13 

   13 















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
// SPDX-License-Identifier: GPL-2.0
/*
 *        SUCS NET3:
 *
 *        Generic datagram handling routines. These are generic for all
 *        protocols. Possibly a generic IP version on top of these would
 *        make sense. Not tonight however 8-).
 *        This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and
 *        NetROM layer all have identical poll code and mostly
 *        identical recvmsg() code. So we share it here. The poll was
 *        shared before but buried in udp.c so I moved it.
 *
 *        Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>. (datagram_poll() from old
 *                                                     udp.c code)
 *
 *        Fixes:
 *                Alan Cox        :        NULL return from skb_peek_copy()
 *                                        understood
 *                Alan Cox        :        Rewrote skb_read_datagram to avoid the
 *                                        skb_peek_copy stuff.
 *                Alan Cox        :        Added support for SOCK_SEQPACKET.
 *                                        IPX can no longer use the SO_TYPE hack
 *                                        but AX.25 now works right, and SPX is
 *                                        feasible.
 *                Alan Cox        :        Fixed write poll of non IP protocol
 *                                        crash.
 *                Florian  La Roche:        Changed for my new skbuff handling.
 *                Darryl Miles        :        Fixed non-blocking SOCK_SEQPACKET.
 *                Linus Torvalds        :        BSD semantic fixes.
 *                Alan Cox        :        Datagram iovec handling
 *                Darryl Miles        :        Fixed non-blocking SOCK_STREAM.
 *                Alan Cox        :        POSIXisms
 *                Pete Wyckoff    :       Unconnected accept() fix.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/iov_iter.h>
#include <linux/indirect_call_wrapper.h>

#include <net/protocol.h>
#include <linux/skbuff.h>

#include <net/checksum.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include <crypto/hash.h>

/*
 *        Returns non-zero when the socket type is connection oriented,
 *        i.e. SOCK_SEQPACKET or SOCK_STREAM, and zero otherwise.
 */
static inline int connection_based(struct sock *sk)
{
        switch (sk->sk_type) {
        case SOCK_SEQPACKET:
        case SOCK_STREAM:
                return 1;
        default:
                return 0;
        }
}

/*
 * Wake callback for datagram receivers. A wakeup that carries a poll key is
 * only forwarded to autoremove_wake_function() when the key includes EPOLLIN
 * or EPOLLERR; a wakeup without a key is always forwarded.
 */
static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, int sync,
                                  void *key)
{
        /* No key means no event mask to filter on. */
        if (!key)
                return autoremove_wake_function(wait, mode, sync, key);

        /* Event is not interesting for a receiver: stay asleep. */
        if (!(key_to_poll(key) & (EPOLLIN | EPOLLERR)))
                return 0;

        return autoremove_wake_function(wait, mode, sync, key);
}
/*
 * Wait for the last received packet to be different from skb
 *
 * @sk:      socket whose wait queue (sk_sleep(sk)) is slept on
 * @queue:   receive queue; its tail (queue->prev) is compared with @skb
 * @err:     written on the error and shutdown paths only
 * @timeo_p: in/out timeout; replaced by the value schedule_timeout() returns
 * @skb:     the queue tail the caller last observed
 *
 * Return values:
 *   0   the tail already differs from @skb, or the task slept in
 *       schedule_timeout(); *err is left untouched
 *   1   the socket has RCV_SHUTDOWN set; *err is set to 0
 *   otherwise the error value, which is also stored in *err: the pending
 *       socket error from sock_error(), -ENOTCONN for a connection-based
 *       socket that is neither established nor listening, or the value of
 *       sock_intr_errno() when a signal is pending
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb)
{
        int error;
        DEFINE_WAIT_FUNC(wait, receiver_wake_function);

        /*
         * Queue ourselves before running the checks below; presumably so a
         * wakeup that arrives between a check and schedule_timeout() is not
         * lost.
         */
        prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);

        /* Socket errors? */
        error = sock_error(sk);
        if (error)
                goto out_err;

        /* The tail changed: return with error == 0 without sleeping. */
        if (READ_ONCE(queue->prev) != skb)
                goto out;

        /* Socket shut down? */
        if (sk->sk_shutdown & RCV_SHUTDOWN)
                goto out_noerr;

        /* Sequenced packets can come disconnected.
         * If so we report the problem
         */
        error = -ENOTCONN;
        if (connection_based(sk) &&
            !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
                goto out_err;

        /* handle signals */
        if (signal_pending(current))
                goto interrupted;

        error = 0;
        *timeo_p = schedule_timeout(*timeo_p);
out:
        /* Common exit: every path leaves the wait queue here. */
        finish_wait(sk_sleep(sk), &wait);
        return error;
interrupted:
        error = sock_intr_errno(*timeo_p);
out_err:
        *err = error;
        goto out;
out_noerr:
        *err = 0;
        error = 1;
        goto out;
}
EXPORT_SYMBOL(__skb_wait_for_more_packets);

/*
 * Mark @skb as peeked and return the skb that now sits in the queue.
 *
 * An skb that is already marked is returned unchanged. A shared skb is not
 * modified in place: it is cloned, the clone takes over the original's list
 * position, the original is released with consume_skb(), and the clone is
 * marked and returned. Returns ERR_PTR(-ENOMEM) if the clone cannot be
 * allocated.
 */
static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
        if (skb->peeked)
                return skb;

        /* We have to unshare an skb before modifying it. */
        if (skb_shared(skb)) {
                struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

                if (!clone)
                        return ERR_PTR(-ENOMEM);

                /* Splice the clone into the slot held by the original. */
                skb->prev->next = clone;
                skb->next->prev = clone;
                clone->prev = skb->prev;
                clone->next = skb->next;

                consume_skb(skb);
                skb = clone;
        }

        skb->peeked = 1;
        return skb;
}

/*
 * Take the next skb from @queue, or peek at it when MSG_PEEK is set.
 *
 * *last is set to the queue tail before the walk starts.
 *
 * Without MSG_PEEK the first skb is unlinked and returned, and *off is set
 * to 0.
 *
 * With MSG_PEEK the skb stays queued and its user count is incremented.
 * If *off is non-negative it is a byte offset to peek from: skbs that lie
 * entirely before the offset are skipped (a zero-length skb is skipped only
 * when the remaining offset is non-zero or it was already peeked), and *off
 * is updated to the offset within the returned skb. A zero-length skb that
 * is returned is marked peeked first; if that fails, *err receives the
 * error and NULL is returned.
 *
 * Returns NULL with *off and *err untouched when no skb qualifies.
 */
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
                                          struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last)
{
        struct sk_buff *cur;
        bool use_offset = false;
        int skip = 0;

        if (unlikely((flags & MSG_PEEK) && *off >= 0)) {
                use_offset = true;
                skip = *off;
        }

        *last = queue->prev;
        skb_queue_walk(queue, cur) {
                /* Plain receive: hand over the head of the queue. */
                if (!(flags & MSG_PEEK)) {
                        __skb_unlink(cur, queue);
                        *off = skip;
                        return cur;
                }

                /* This skb ends before the requested peek offset. */
                if (use_offset && skip >= cur->len &&
                    (skip || cur->peeked)) {
                        skip -= cur->len;
                        continue;
                }

                if (!cur->len) {
                        cur = skb_set_peeked(cur);
                        if (IS_ERR(cur)) {
                                *err = PTR_ERR(cur);
                                return NULL;
                        }
                }
                refcount_inc(&cur->users);
                *off = skip;
                return cur;
        }
        return NULL;
}

/**
 *        __skb_try_recv_datagram - Receive a datagram skbuff
 *        @sk: socket
 *        @queue: socket queue from which to receive
 *        @flags: MSG\_ flags
 *        @off: an offset in bytes to peek skb from. Returns an offset
 *              within an skb where data actually starts
 *        @err: error code returned
 *        @last: set to last peeked message to inform the wait function
 *               what to look for when peeking
 *
 *        Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *        and possible races. This replaces identical code in packet, raw and
 *        udp, as well as the IPX AX.25 and Appletalk. It also finally fixes
 *        the long standing peek and read race for datagram sockets. If you
 *        alter this routine remember it must be re-entrant.
 *
 *        Returns the skb on success. Returns NULL with @err set to the
 *        pending socket error, to the error reported while taking the skb
 *        from the queue, or to -EAGAIN if no data was available. This
 *        function never sleeps waiting for data; when busy polling is
 *        possible for @sk it busy-loops and retries for as long as the
 *        queue tail keeps changing.
 *
 *        NOTE(review): this comment used to state that the socket is locked
 *        when an skb is returned. The code below only takes @queue->lock
 *        around the dequeue attempt and drops it before returning, which
 *        matches the historical note that follows.
 *
 *        * It does not lock socket since today. This function is
 *        * free of race conditions. This measure should/can improve
 *        * significantly datagram socket latencies at high loads,
 *        * when data copying to user space takes lots of time.
 *        * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *        *  8) Great win.)
 *        *                                            --ANK (980729)
 *
 *        The order of the tests when we find no data waiting are specified
 *        quite explicitly by POSIX 1003.1g, don't change them without having
 *        the standard around please.
 */
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last)
{
        struct sk_buff *skb;
        unsigned long cpu_flags;
        /*
         * Caller is allowed not to check sk->sk_err before skb_recv_datagram()
         */
        int error = sock_error(sk);

        if (error)
                goto no_packet;

        do {
                /* Again only user level code calls this function, so nothing
                 * interrupt level will suddenly eat the receive_queue.
                 *
                 * Look at current nfs client by the way...
                 * However, this function was correct in any case. 8)
                 */
                spin_lock_irqsave(&queue->lock, cpu_flags);
                skb = __skb_try_recv_from_queue(sk, queue, flags, off, &error,
                                                last);
                spin_unlock_irqrestore(&queue->lock, cpu_flags);
                /* error is still 0 here unless the dequeue attempt set it. */
                if (error)
                        goto no_packet;
                if (skb)
                        return skb;

                if (!sk_can_busy_loop(sk))
                        break;

                sk_busy_loop(sk, flags & MSG_DONTWAIT);
                /* Retry only if the queue tail moved while busy polling. */
        } while (READ_ONCE(queue->prev) != *last);

        error = -EAGAIN;

no_packet:
        *err = error;
        return NULL;
}
EXPORT_SYMBOL(__skb_try_recv_datagram);

/*
 * Receive a datagram from @sk_queue, waiting if necessary.
 *
 * Repeats __skb_try_recv_datagram() until it yields an skb, fails with
 * something other than -EAGAIN, the receive timeout (sock_rcvtimeo(), which
 * is given the MSG_DONTWAIT bit of @flags) is exhausted, or
 * __skb_wait_for_more_packets() returns non-zero.
 *
 * Returns the skb, or NULL with *@err as left by the last helper called.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err)
{
        struct sk_buff *last;
        long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

        for (;;) {
                struct sk_buff *skb;

                skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
                                              &last);
                if (skb)
                        return skb;

                /* Anything but "no data yet" is final. */
                if (*err != -EAGAIN)
                        return NULL;

                /* No time left to wait. */
                if (!timeo)
                        return NULL;

                if (__skb_wait_for_more_packets(sk, sk_queue, err,
                                                &timeo, last))
                        return NULL;
        }
}
EXPORT_SYMBOL(__skb_recv_datagram);

/*
 * Convenience wrapper around __skb_recv_datagram() that reads from the
 * socket's own sk_receive_queue with a peek offset of zero.
 */
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
                                  int *err)
{
        struct sk_buff_head *rxq = &sk->sk_receive_queue;
        int peek_off = 0;

        return __skb_recv_datagram(sk, rxq, flags, &peek_off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);

/*
 * Release a datagram skb by passing it to consume_skb().  @sk is not used
 * by this implementation.
 */
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
        consume_skb(skb);
}
EXPORT_SYMBOL(skb_free_datagram);

/*
 * Account a dropped datagram and, for a peeked skb, take it off @sk_queue.
 *
 * When MSG_PEEK is set in @flags and @skb is still linked (skb->next is
 * non-NULL), the skb is unlinked under @sk_queue->lock (BH disabled), one
 * reference on skb->users is dropped and the optional @destructor is called.
 * sk->sk_drops is incremented in every case.
 *
 * Returns 0, or -ENOENT when MSG_PEEK was set but @skb was no longer queued.
 */
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb))
{
        int ret = 0;

        if (flags & MSG_PEEK) {
                spin_lock_bh(&sk_queue->lock);
                if (!skb->next) {
                        /* Somebody else already removed it from the queue. */
                        ret = -ENOENT;
                } else {
                        __skb_unlink(skb, sk_queue);
                        refcount_dec(&skb->users);
                        if (destructor)
                                destructor(sk, skb);
                }
                spin_unlock_bh(&sk_queue->lock);
        }

        atomic_inc(&sk->sk_drops);
        return ret;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);

/**
 *        skb_kill_datagram - Free a datagram skbuff forcibly
 *        @sk: socket
 *        @skb: datagram skbuff
 *        @flags: MSG\_ flags
 *
 *        Frees a datagram skbuff that was received by skb_recv_datagram.
 *        The flags argument must match the one used for
 *        skb_recv_datagram.
 *
 *        With MSG_PEEK set, a packet that is still on the socket's
 *        receive queue is taken off that queue before it is freed.
 *
 *        Only BH is disabled while the sk_receive_queue lock is held, so
 *        this must not be used where that lock is acquired in an IRQ
 *        context.
 *
 *        Returns the result of __sk_queue_drop_skb(): 0 if the packet was
 *        removed by us (or MSG_PEEK was not set), -ENOENT if MSG_PEEK was
 *        set and the packet was no longer queued.
 */

int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
{
        struct sk_buff_head *rxq = &sk->sk_receive_queue;
        int rc;

        rc = __sk_queue_drop_skb(sk, rxq, skb, flags, NULL);
        kfree_skb(skb);

        return rc;
}
EXPORT_SYMBOL(skb_kill_datagram);

/*
 * Forward declaration: simple_copy_to_iter() is defined further down in this
 * file but is named in the INDIRECT_CALL_1() invocations inside
 * __skb_datagram_iter().
 */
INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
                                                size_t bytes,
                                                void *data __always_unused,
                                                struct iov_iter *i));

/*
 * Walk @skb from byte @offset and pass up to @len bytes to @cb, which writes
 * them to @to and returns how many bytes it handled.  @data is forwarded to
 * @cb untouched.  The linear head is processed first, then the page
 * fragments, then (recursively) every skb on the fragment list.
 *
 * Returns 0 on success.  On failure @to is rewound by the number of bytes
 * this invocation advanced it and -EFAULT is returned.  A callback that
 * handles fewer bytes than requested counts as a failure when @fault_short
 * is set or when @to still has space left; otherwise it is reported as
 * success.  Unreadable fragments (skb_frags_readable() false) take that same
 * short-copy path once the head has been processed.
 */
static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
                               struct iov_iter *to, int len, bool fault_short,
                               size_t (*cb)(const void *, size_t, void *,
                                            struct iov_iter *), void *data)
{
        /* [start, end) is the skb byte range of the piece being examined;
         * the linear head covers [0, headlen).
         */
        int start = skb_headlen(skb);
        int i, copy = start - offset, start_off = offset, n;
        struct sk_buff *frag_iter;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
                                    skb->data + offset, copy, data, to);
                /* Advance by what was actually handled so that the revert
                 * at "fault" undoes exactly that much.
                 */
                offset += n;
                if (n != copy)
                        goto short_copy;
                if ((len -= copy) == 0)
                        return 0;
        }

        if (!skb_frags_readable(skb))
                goto short_copy;

        /* Copy paged appendix. Hmm... why does this look so complicated? */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                /* Only fragments that reach past @offset contribute. */
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* n accumulates what @cb handled across all pages
                         * of this fragment.
                         */
                        n = 0;
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_local_page(p);
                                n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
                                        vaddr + p_off, p_len, data, to);
                                kunmap_local(vaddr);
                        }

                        offset += n;
                        if (n != copy)
                                goto short_copy;
                        if (!(len -= copy))
                                return 0;
                }
                start = end;
        }

        /* Fragment list: each entry is itself an skb, handled recursively
         * with an offset relative to that entry.
         */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        /* On failure the nested call has already rewound
                         * its own part; offset is not advanced for it.
                         */
                        if (__skb_datagram_iter(frag_iter, offset - start,
                                                to, copy, fault_short, cb, data))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

        /* This is not really a user copy fault, but rather someone
         * gave us a bogus length on the skb.  We should probably
         * print a warning here as it may indicate a kernel bug.
         */

fault:
        /* Undo everything this invocation pushed into @to. */
        iov_iter_revert(to, offset - start_off);
        return -EFAULT;

short_copy:
        /* A short copy is tolerated only if the caller allows it and the
         * destination is completely full.
         */
        if (fault_short || iov_iter_count(to))
                goto fault;

        return 0;
}

/*
 * Copy callback for __skb_datagram_iter() that also feeds the copied bytes
 * into the ahash request passed as @hashp.  Only the bytes copy_to_iter()
 * reports as copied are hashed; the result of crypto_ahash_update() is not
 * checked.  Without CONFIG_CRYPTO_HASH nothing is copied and 0 is returned.
 */
static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp,
                                    struct iov_iter *i)
{
#ifdef CONFIG_CRYPTO_HASH
        struct ahash_request *req = hashp;
        struct scatterlist sgl;
        size_t done = copy_to_iter(addr, bytes, i);

        sg_init_one(&sgl, addr, done);
        ahash_request_set_crypt(req, &sgl, NULL, done);
        crypto_ahash_update(req);

        return done;
#else
        return 0;
#endif
}

/**
 *        skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator
 *          and update a hash.
 *        @skb: buffer to copy
 *        @offset: offset in the buffer to start copying from
 *        @to: iovec iterator to copy to
 *        @len: amount of data to copy from buffer to iovec
 *      @hash: hash request to update
 */
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len,
                           struct ahash_request *hash)
{
        return __skb_datagram_iter(skb, offset, to, len, true,
                        hash_and_copy_to_iter, hash);
}
EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter);

/*
 * Plain copy callback for __skb_datagram_iter(): forwards to copy_to_iter()
 * and returns its result.  @data is unused.
 */
static size_t simple_copy_to_iter(const void *addr, size_t bytes,
                void *data __always_unused, struct iov_iter *i)
{
        return copy_to_iter(addr, bytes, i);
}

/**
 *        skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
 *        @skb: buffer to copy
 *        @offset: offset in the buffer to start copying from
 *        @to: iovec iterator to copy to
 *        @len: amount of data to copy from buffer to iovec
 *
 *        Emits the skb_copy_datagram_iovec trace event, then copies with
 *        simple_copy_to_iter(). A short copy is a fault only if @to still
 *        has space left. Returns 0 or -EFAULT.
 */
int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len)
{
        int rc;

        trace_skb_copy_datagram_iovec(skb, len);

        rc = __skb_datagram_iter(skb, offset, to, len, false,
                                 simple_copy_to_iter, NULL);
        return rc;
}
EXPORT_SYMBOL(skb_copy_datagram_iter);

/**
 *        skb_copy_datagram_from_iter - Copy a datagram from an iov_iter.
 *        @skb: destination buffer
 *        @offset: offset in the buffer to start copying to
 *        @from: the copy source
 *        @len: amount of data to copy to buffer from iovec
 *
 *        Fills the linear head first, then the page fragments, then
 *        (recursively) each skb on the fragment list. Unlike
 *        __skb_datagram_iter(), @from is not rewound when the copy fails.
 *
 *        Returns 0 or -EFAULT. -EFAULT is also returned when @len exceeds
 *        the space available in @skb from @offset onwards.
 */
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from,
                                 int len)
{
        /* [start, end) is the skb byte range of the piece being examined;
         * the linear head covers [0, headlen).
         */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                if (copy_from_iter(skb->data + offset, copy, from) != copy)
                        goto fault;
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
        }

        /* Copy paged appendix. Hmm... why does this look so complicated? */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                /* Only fragments that reach past @offset receive data. */
                if ((copy = end - offset) > 0) {
                        size_t copied;

                        if (copy > len)
                                copy = len;
                        copied = copy_page_from_iter(skb_frag_page(frag),
                                          skb_frag_off(frag) + offset - start,
                                          copy, from);
                        if (copied != copy)
                                goto fault;

                        if (!(len -= copy))
                                return 0;
                        offset += copy;
                }
                start = end;
        }

        /* Fragment list: recurse with an offset relative to each entry. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_datagram_from_iter(frag_iter,
                                                        offset - start,
                                                        from, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

        /* Falling through with len != 0 means @skb ran out of room. */
fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);

/*
 * Append up to @length bytes from @from to @skb as page fragments, using the
 * pages returned by iov_iter_get_pages2() instead of copying the data.
 *
 * skb->len, skb->data_len and skb->truesize are grown for every batch of
 * pages obtained.  A chunk that directly continues the previous fragment on
 * the same head page is merged into that fragment, and the page reference
 * that became redundant is dropped again (batched per head page).
 *
 * Returns 0 on success (also when @from runs out before @length is reached),
 * -EFAULT if the skb's fragments are not readable or no pages could be
 * obtained, or -EMSGSIZE once all MAX_SKB_FRAGS fragment slots are in use.
 * The accounting already done for earlier batches is not undone on error.
 */
int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
                                struct iov_iter *from, size_t length)
{
        /* Index of the next free fragment slot. */
        int frag = skb_shinfo(skb)->nr_frags;

        if (!skb_frags_readable(skb))
                return -EFAULT;

        while (length && iov_iter_count(from)) {
                struct page *head, *last_head = NULL;
                struct page *pages[MAX_SKB_FRAGS];
                int refs, order, n = 0;
                size_t start;
                ssize_t copied;

                if (frag == MAX_SKB_FRAGS)
                        return -EMSGSIZE;

                /* Ask for at most as many pages as there are free slots;
                 * start receives the offset of the data in the first page.
                 */
                copied = iov_iter_get_pages2(from, pages, length,
                                            MAX_SKB_FRAGS - frag, &start);
                if (copied < 0)
                        return -EFAULT;

                length -= copied;

                skb->data_len += copied;
                skb->len += copied;
                skb->truesize += PAGE_ALIGN(copied + start);

                head = compound_head(pages[n]);
                order = compound_order(head);

                /* One iteration per page; only the first page can start at
                 * a non-zero offset, hence start = 0 afterwards.
                 */
                for (refs = 0; copied != 0; start = 0) {
                        int size = min_t(int, copied, PAGE_SIZE - start);

                        /* Page lies beyond the current compound page:
                         * look up its own head and order.
                         */
                        if (pages[n] - head > (1UL << order) - 1) {
                                head = compound_head(pages[n]);
                                order = compound_order(head);
                        }

                        /* Make start an offset relative to the head page. */
                        start += (pages[n] - head) << PAGE_SHIFT;
                        copied -= size;
                        n++;
                        if (frag) {
                                skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];

                                if (head == skb_frag_page(last) &&
                                    start == skb_frag_off(last) + skb_frag_size(last)) {
                                        skb_frag_size_add(last, size);
                                        /* We combined this page, we need to release
                                         * a reference. Since compound pages refcount
                                         * is shared among many pages, batch the refcount
                                         * adjustments to limit false sharing.
                                         */
                                        last_head = head;
                                        refs++;
                                        continue;
                                }
                        }
                        /* Starting a new fragment: flush the references
                         * batched for the previous head page first.
                         */
                        if (refs) {
                                page_ref_sub(last_head, refs);
                                refs = 0;
                        }
                        skb_fill_page_desc_noacc(skb, frag++, head, start, size);
                }
                /* Flush whatever is still batched for this round. */
                if (refs)
                        page_ref_sub(last_head, refs);
        }
        return 0;
}

/*
 * Attach data from @from to @skb without copying and charge the resulting
 * growth of skb->truesize to the socket.
 *
 * The fill step is msg->sg_from_iter() when @msg is non-NULL and has both
 * msg_ubuf and sg_from_iter set, and zerocopy_fill_skb_from_iter()
 * otherwise.  The truesize delta is accounted even if the fill step
 * failed: for a SOCK_STREAM @sk it goes to the queued write memory (and is
 * charged via sk_mem_charge() unless skb_zcopy_pure() is true); in every
 * other case, including @sk == NULL, it is added to skb->sk->sk_wmem_alloc.
 *
 * Returns the result of the fill step.
 */
int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                            struct sk_buff *skb, struct iov_iter *from,
                            size_t length)
{
        const unsigned long truesize_before = skb->truesize;
        unsigned long delta;
        int ret;

        if (!msg || !msg->msg_ubuf || !msg->sg_from_iter)
                ret = zerocopy_fill_skb_from_iter(skb, from, length);
        else
                ret = msg->sg_from_iter(skb, from, length);

        delta = skb->truesize - truesize_before;

        if (!sk || sk->sk_type != SOCK_STREAM) {
                refcount_add(delta, &skb->sk->sk_wmem_alloc);
                return ret;
        }

        sk_wmem_queued_add(sk, delta);
        if (!skb_zcopy_pure(skb))
                sk_mem_charge(sk, delta);

        return ret;
}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);

/**
 *        zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 *        @skb: buffer to copy
 *        @from: the source to copy from
 *
 *        First copies as many bytes as fit into the linear head of @skb
 *        (or as many as @from holds, if fewer), then pins the userspace
 *        pages for the remainder and builds frags through them.
 *
 *        Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
        int head_bytes = min_t(int, skb_headlen(skb), iov_iter_count(from));
        int rc;

        /* Linear part is copied, never mapped. */
        rc = skb_copy_datagram_from_iter(skb, 0, from, head_bytes);
        if (rc)
                return -EFAULT;

        /* No msghdr and no socket; ~0U lets the iterator bound the length. */
        return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);

/*
 * User-space step for csum_and_copy_to_iter(): copies @len bytes from
 * @from + @progress to @iter_to with csum_and_copy_to_user() and folds the
 * partial checksum into the __wsum that @priv2 points to, at block offset
 * @progress.
 *
 * Returns 0 when the partial checksum is non-zero and @len when it is zero.
 * NOTE(review): presumably the return value is the number of bytes left
 * uncopied and a zero checksum signals a fault - confirm against
 * csum_and_copy_to_user().
 */
static __always_inline
size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
                              size_t len, void *from, void *priv2)
{
        __wsum *running = priv2;
        __wsum piece = csum_and_copy_to_user(from + progress, iter_to, len);

        *running = csum_block_add(*running, piece, progress);
        if (piece)
                return 0;
        return len;
}

/*
 * Kernel-memory step for csum_and_copy_to_iter(): copies @len bytes from
 * @from + @progress to @iter_to with csum_partial_copy_nocheck() and folds
 * the partial checksum into the __wsum that @priv2 points to, at block
 * offset @progress.  Always returns 0.
 */
static __always_inline
size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
                           size_t len, void *from, void *priv2)
{
        __wsum *running = priv2;
        __wsum piece;

        piece = csum_partial_copy_nocheck(from + progress, iter_to, len);
        *running = csum_block_add(*running, piece, progress);

        return 0;
}

/*
 * State carried across successive csum_and_copy_to_iter() calls for one
 * datagram.
 */
struct csum_state {
        __wsum csum;        /* checksum accumulated so far */
        size_t off;         /* bytes processed so far; used as block offset */
};

/*
 * Copy callback for __skb_datagram_iter() that checksums the data while
 * copying it.  @_csstate points to a struct csum_state that is updated with
 * the bytes handled by this call.
 *
 * Warns once and returns 0 if @i is a data source.  For a discard iterator
 * nothing is copied; the checksum is computed directly over @addr and
 * @bytes is returned.  Otherwise returns the byte count reported by
 * iterate_and_advance2().
 */
static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
                                    struct iov_iter *i)
{
        struct csum_state *st = _csstate;
        __wsum acc;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        if (unlikely(iov_iter_is_discard(i))) {
                // can't use csum_memcpy() for that one - data is not copied
                __wsum piece = csum_partial(addr, bytes, 0);

                st->csum = csum_block_add(st->csum, piece, st->off);
                st->off += bytes;
                return bytes;
        }

        /* Shift the running sum by the current offset before the copy and
         * again afterwards, still using the offset from before this call.
         */
        acc = csum_shift(st->csum, st->off);

        bytes = iterate_and_advance2(i, bytes, (void *)addr, &acc,
                                     copy_to_user_iter_csum,
                                     memcpy_to_iter_csum);
        st->csum = csum_shift(acc, st->off);
        st->off += bytes;

        return bytes;
}

/**
 *        skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
 *          and update a checksum.
 *        @skb: buffer to copy
 *        @offset: offset in the buffer to start copying from
 *        @to: iovec iterator to copy to
 *        @len: amount of data to copy from buffer to iovec
 *        @csump: checksum pointer
 *
 *        *@csump seeds the checksum and receives the updated value on
 *        success only. Short copies are treated as faults. Returns 0 or
 *        the error from __skb_datagram_iter().
 */
static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
                                      struct iov_iter *to, int len,
                                      __wsum *csump)
{
        struct csum_state st = { .csum = *csump };
        int rc;

        rc = __skb_datagram_iter(skb, offset, to, len, true,
                                 csum_and_copy_to_iter, &st);
        if (!rc)
                *csump = st.csum;

        return rc;
}

/**
 *        skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
 *        @skb: skbuff
 *        @hlen: number of leading bytes of @skb that are included in the
 *               checksum but not copied; copying starts at this offset
 *        @msg: destination
 *
 *        Caller _must_ check that skb will fit to this iovec.
 *
 *        If @msg has less room than the payload, the checksum is verified
 *        first and the data is then copied without checksumming. Otherwise
 *        the data is checksummed while it is copied and @msg is rewound
 *        when the checksum turns out to be bad.
 *
 *        Returns: 0       - success.
 *                 -EINVAL - checksum failure.
 *                 -EFAULT - fault during copy.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
                                   int hlen, struct msghdr *msg)
{
        int chunk = skb->len - hlen;
        __wsum csum;

        /* Nothing beyond the header: nothing to copy or verify. */
        if (!chunk)
                return 0;

        if (msg_data_left(msg) < chunk) {
                if (__skb_checksum_complete(skb))
                        return -EINVAL;
                if (skb_copy_datagram_msg(skb, hlen, msg, chunk))
                        return -EFAULT;
                return 0;
        }

        /* Seed with the checksum of the uncopied header bytes. */
        csum = csum_partial(skb->data, hlen, skb->csum);
        if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter,
                                       chunk, &csum))
                return -EFAULT;

        if (csum_fold(csum)) {
                /* Bad checksum: take back what was handed to the caller. */
                iov_iter_revert(&msg->msg_iter, chunk);
                return -EINVAL;
        }

        if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(NULL, skb);

        return 0;
}
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);

/**
 *         datagram_poll - generic datagram poll
 *        @file: file struct
 *        @sock: socket
 *        @wait: poll table
 *
 *        Datagram poll: Again totally generic. This also handles
 *        sequenced packet sockets providing the socket receive queue
 *        is only ever holding data ready to receive.
 *
 *        Returns the mask of events currently pending on the socket. When
 *        the socket is not writable, SOCKWQ_ASYNC_NOSPACE is set on it.
 *
 *        Note: when you *don't* use this routine for this protocol,
 *        and you use a different write policy from sock_writeable()
 *        then please supply your own write_space callback.
 */
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           poll_table *wait)
{
        struct sock *sk = sock->sk;
        __poll_t mask = 0;
        u8 shutdown;

        sock_poll_wait(file, sock, wait);

        /* exceptional events? */
        if (READ_ONCE(sk->sk_err) ||
            !skb_queue_empty_lockless(&sk->sk_error_queue)) {
                mask |= EPOLLERR;
                if (sock_flag(sk, SOCK_SELECT_ERR_QUEUE))
                        mask |= EPOLLPRI;
        }

        shutdown = READ_ONCE(sk->sk_shutdown);
        if (shutdown & RCV_SHUTDOWN)
                mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
        if (shutdown == SHUTDOWN_MASK)
                mask |= EPOLLHUP;

        /* readable? */
        if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
                mask |= EPOLLIN | EPOLLRDNORM;

        /* Connection-based need to check for termination and startup */
        if (connection_based(sk)) {
                switch (READ_ONCE(sk->sk_state)) {
                case TCP_CLOSE:
                        mask |= EPOLLHUP;
                        break;
                case TCP_SYN_SENT:
                        /* connection hasn't started yet: not writable */
                        return mask;
                default:
                        break;
                }
        }

        /* writable? */
        if (!sock_writeable(sk)) {
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                return mask;
        }

        return mask | EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
}
EXPORT_SYMBOL(datagram_poll);


















































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * fscrypt_private.h
 *
 * Copyright (C) 2015, Google, Inc.
 *
 * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar.
 * Heavily modified since then.
 */

#ifndef _FSCRYPT_PRIVATE_H
#define _FSCRYPT_PRIVATE_H

#include <linux/fscrypt.h>
#include <linux/siphash.h>
#include <crypto/hash.h>
#include <linux/blk-crypto.h>

#define CONST_STRLEN(str)        (sizeof(str) - 1)

#define FSCRYPT_FILE_NONCE_SIZE        16

/*
 * Minimum size of an fscrypt master key.  Note: a longer key will be required
 * if ciphers with a 256-bit security strength are used.  This is just the
 * absolute minimum, which applies when only 128-bit encryption is used.
 */
#define FSCRYPT_MIN_KEY_SIZE        16

#define FSCRYPT_CONTEXT_V1        1
#define FSCRYPT_CONTEXT_V2        2

/* Keep this in sync with include/uapi/linux/fscrypt.h */
#define FSCRYPT_MODE_MAX        FSCRYPT_MODE_AES_256_HCTR2

/*
 * Version 1 layout of the encryption context.  fscrypt_context_size() pins
 * sizeof() of this struct to 28 bytes at build time.
 */
struct fscrypt_context_v1 {
        u8 version; /* FSCRYPT_CONTEXT_V1 */
        u8 contents_encryption_mode;
        u8 filenames_encryption_mode;
        u8 flags;
        u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
        u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};

/*
 * Version 2 layout of the encryption context.  fscrypt_context_size() pins
 * sizeof() of this struct to 40 bytes at build time.
 */
struct fscrypt_context_v2 {
        u8 version; /* FSCRYPT_CONTEXT_V2 */
        u8 contents_encryption_mode;
        u8 filenames_encryption_mode;
        u8 flags;
        u8 log2_data_unit_size;
        u8 __reserved[3];
        u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE];
        u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
};

/*
 * fscrypt_context - the encryption context of an inode
 *
 * This is the on-disk equivalent of an fscrypt_policy, stored alongside each
 * encrypted file usually in a hidden extended attribute.  It contains the
 * fields from the fscrypt_policy, in order to identify the encryption algorithm
 * and key with which the file is encrypted.  It also contains a nonce that was
 * randomly generated by fscrypt itself; this is used as KDF input or as a tweak
 * to cause different files to be encrypted differently.
 *
 * Both versioned layouts begin with a one-byte version field, so ->version
 * can be read to decide which of ->v1 and ->v2 applies.
 */
union fscrypt_context {
        u8 version;
        struct fscrypt_context_v1 v1;
        struct fscrypt_context_v2 v2;
};

/*
 * Map the context's version number to the size its layout must have.  An
 * unrecognized version yields 0.  The expected sizes (28 bytes for v1,
 * 40 bytes for v2) are enforced at build time.
 */
static inline int fscrypt_context_size(const union fscrypt_context *ctx)
{
        if (ctx->version == FSCRYPT_CONTEXT_V1) {
                BUILD_BUG_ON(sizeof(ctx->v1) != 28);
                return sizeof(ctx->v1);
        }
        if (ctx->version == FSCRYPT_CONTEXT_V2) {
                BUILD_BUG_ON(sizeof(ctx->v2) != 40);
                return sizeof(ctx->v2);
        }
        return 0;
}

/* Check whether an fscrypt_context has a recognized version number and size */
static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx,
                                            int ctx_size)
{
        return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx);
}

/*
 * Return a pointer to the nonce inside an already validated context.  An
 * unrecognized version triggers a one-time warning and yields NULL.
 */
static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx)
{
        if (ctx->version == FSCRYPT_CONTEXT_V1)
                return ctx->v1.nonce;
        if (ctx->version == FSCRYPT_CONTEXT_V2)
                return ctx->v2.nonce;

        WARN_ON_ONCE(1);
        return NULL;
}

/*
 * An encryption policy of either supported version.  The accessors below
 * read ->version to decide whether ->v1 or ->v2 applies.
 * NOTE(review): presumably ->version aliases the leading version byte of
 * both policy structs - they are not defined in this file, confirm there.
 */
union fscrypt_policy {
        u8 version;
        struct fscrypt_policy_v1 v1;
        struct fscrypt_policy_v2 v2;
};

/*
 * Map the policy's version number to the size of the matching policy struct.
 * An unrecognized version yields 0.
 */
static inline int fscrypt_policy_size(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return sizeof(policy->v1);
        if (policy->version == FSCRYPT_POLICY_V2)
                return sizeof(policy->v2);
        return 0;
}

/*
 * Contents encryption mode of a valid encryption policy.  BUG()s on an
 * unrecognized policy version.
 */
static inline u8
fscrypt_policy_contents_mode(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return policy->v1.contents_encryption_mode;
        if (policy->version == FSCRYPT_POLICY_V2)
                return policy->v2.contents_encryption_mode;
        BUG();
}

/*
 * Filenames encryption mode of a valid encryption policy.  BUG()s on an
 * unrecognized policy version.
 */
static inline u8
fscrypt_policy_fnames_mode(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return policy->v1.filenames_encryption_mode;
        if (policy->version == FSCRYPT_POLICY_V2)
                return policy->v2.filenames_encryption_mode;
        BUG();
}

/*
 * Flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy.  BUG()s on an
 * unrecognized policy version.
 */
static inline u8
fscrypt_policy_flags(const union fscrypt_policy *policy)
{
        if (policy->version == FSCRYPT_POLICY_V1)
                return policy->v1.flags;
        if (policy->version == FSCRYPT_POLICY_V2)
                return policy->v2.flags;
        BUG();
}

/*
 * log2 of the data unit size for a v2 policy: the policy's own
 * log2_data_unit_size when it is nonzero, otherwise the inode's i_blkbits.
 */
static inline int
fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy,
                          const struct inode *inode)
{
        if (policy->log2_data_unit_size)
                return policy->log2_data_unit_size;

        /* Zero means "not specified": fall back to the inode's block bits. */
        return inode->i_blkbits;
}

static inline int
fscrypt_policy_du_bits(const union fscrypt_policy *policy,
                       const struct inode *inode)
{
        switch (policy->version) {
        case FSCRYPT_POLICY_V1:
                return inode->i_blkbits;
        case FSCRYPT_POLICY_V2:
                return fscrypt_policy_v2_du_bits(&policy->v2, inode);
        }
        BUG();
}

/*
 * For encrypted symlinks, the ciphertext length is stored at the beginning
 * of the string in little-endian format.
 *
 * The struct is __packed and ends in a flexible array member, so its size
 * covers only the length prefix.
 */
struct fscrypt_symlink_data {
        __le16 len;             /* ciphertext length, little-endian */
        char encrypted_path[];  /* presumably 'len' bytes of ciphertext -- confirm */
} __packed;

/**
 * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption
 * @tfm: crypto API transform object
 * @blk_key: key for blk-crypto
 *
 * Normally only one of the fields will be non-NULL.
 *
 * @blk_key only exists when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is enabled.
 * fscrypt_is_key_prepared() checks whichever field applies to the file.
 */
struct fscrypt_prepared_key {
        struct crypto_skcipher *tfm;
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
        struct blk_crypto_key *blk_key;
#endif
};

/*
 * fscrypt_inode_info - the "encryption key" for an inode
 *
 * When an encrypted file's key is made available, an instance of this struct is
 * allocated and stored in ->i_crypt_info.  Once created, it remains until the
 * inode is evicted.
 *
 * Besides the prepared key itself, this caches per-inode values derived from
 * the policy (data unit size, mode) and links the inode to its master key.
 */
struct fscrypt_inode_info {

        /* The key in a form prepared for actual encryption/decryption */
        struct fscrypt_prepared_key ci_enc_key;

        /* True if ci_enc_key should be freed when this struct is freed */
        u8 ci_owns_key : 1;

#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
        /*
         * True if this inode will use inline encryption (blk-crypto) instead of
         * the traditional filesystem-layer encryption.
         *
         * Read through fscrypt_using_inline_encryption(); the field does not
         * exist when CONFIG_FS_ENCRYPTION_INLINE_CRYPT is disabled.
         */
        u8 ci_inlinecrypt : 1;
#endif

        /* True if ci_dirhash_key is initialized */
        u8 ci_dirhash_key_initialized : 1;

        /*
         * log2 of the data unit size (granularity of contents encryption) of
         * this file.  This is computable from ci_policy and ci_inode but is
         * cached here for efficiency.  Only used for regular files.
         */
        u8 ci_data_unit_bits;

        /* Cached value: log2 of number of data units per FS block */
        u8 ci_data_units_per_block_bits;

        /* Hashed inode number.  Only set for IV_INO_LBLK_32 */
        u32 ci_hashed_ino;

        /*
         * Encryption mode used for this inode.  It corresponds to either the
         * contents or filenames encryption mode, depending on the inode type.
         */
        struct fscrypt_mode *ci_mode;

        /* Back-pointer to the inode */
        struct inode *ci_inode;

        /*
         * The master key with which this inode was unlocked (decrypted).  This
         * will be NULL if the master key was found in a process-subscribed
         * keyring rather than in the filesystem-level keyring.
         */
        struct fscrypt_master_key *ci_master_key;

        /*
         * Link in list of inodes that were unlocked with the master key.
         * Only used when ->ci_master_key is set.
         */
        struct list_head ci_master_key_link;

        /*
         * If non-NULL, then encryption is done using the master key directly
         * and ci_enc_key will equal ci_direct_key->dk_key.
         */
        struct fscrypt_direct_key *ci_direct_key;

        /*
         * This inode's hash key for filenames.  This is a 128-bit SipHash-2-4
         * key.  This is only set for directories that use a keyed dirhash over
         * the plaintext filenames -- currently just casefolded directories.
         * Validity is tracked by ci_dirhash_key_initialized.
         */
        siphash_key_t ci_dirhash_key;

        /* The encryption policy used by this inode */
        union fscrypt_policy ci_policy;

        /* This inode's nonce, copied from the fscrypt_context */
        u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE];
};

/*
 * Direction of a crypto operation; passed as the 'rw' argument of
 * fscrypt_crypt_data_unit().
 */
typedef enum {
        FS_DECRYPT = 0,
        FS_ENCRYPT,
} fscrypt_direction_t;

/* crypto.c */
extern struct kmem_cache *fscrypt_inode_info_cachep;
int fscrypt_initialize(struct super_block *sb);
/*
 * Encrypt or decrypt (per @rw) one data unit identified by @index.
 * NOTE(review): @len/@offs presumably describe the byte range within the
 * pages -- confirm against the definition in crypto.c.
 */
int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci,
                            fscrypt_direction_t rw, u64 index,
                            struct page *src_page, struct page *dest_page,
                            unsigned int len, unsigned int offs,
                            gfp_t gfp_flags);
struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags);

/*
 * printf-style logging helper: @fmt is argument 3 and the variadic arguments
 * start at 4, as declared by __printf(3, 4).
 */
void __printf(3, 4) __cold
fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...);

/* Convenience wrappers that call fscrypt_msg() with a fixed log level. */
#define fscrypt_warn(inode, fmt, ...)                \
        fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__)
#define fscrypt_err(inode, fmt, ...)                \
        fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)

/* Size in bytes of the largest IV representable by union fscrypt_iv */
#define FSCRYPT_MAX_IV_SIZE        32

/*
 * An IV, viewable either as structured fields, as raw bytes, or as an array of
 * little-endian 64-bit words.  All three views overlay the same
 * FSCRYPT_MAX_IV_SIZE bytes.
 */
union fscrypt_iv {
        struct {
                /* zero-based index of data unit within the file */
                __le64 index;

                /* per-file nonce; only set in DIRECT_KEY mode */
                u8 nonce[FSCRYPT_FILE_NONCE_SIZE];
        };
        /* byte-wise view of the whole IV */
        u8 raw[FSCRYPT_MAX_IV_SIZE];
        /* 64-bit word view; presumably the blk-crypto DUN -- TODO confirm */
        __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)];
};

/* Fill in @iv for data unit @index of the file described by @ci. */
void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index,
                         const struct fscrypt_inode_info *ci);

/*
 * Number of bits needed for the largest file data unit index possible on
 * filesystem @sb, given a log2 data unit size of @du_bits.
 */
static inline int
fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits)
{
        /* Bit width of (s_maxbytes - 1), as computed by fls64() */
        const int max_offset_bits = fls64(sb->s_maxbytes - 1);

        return max_offset_bits - du_bits;
}

/* fname.c */
/*
 * NOTE(review): judging by the parameter names, this computes the encrypted
 * length for a name of @orig_len bytes, bounded by @max_len, and stores it in
 * @encrypted_len_ret -- confirm the meaning of the bool result in fname.c.
 */
bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy,
                                    u32 orig_len, u32 max_len,
                                    u32 *encrypted_len_ret);

/* hkdf.c */
/* HKDF state: wraps the HMAC transform used for key derivation. */
struct fscrypt_hkdf {
        struct crypto_shash *hmac_tfm;
};

/* Set up @hkdf from a master key of @master_key_size bytes. */
int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key,
                      unsigned int master_key_size);

/*
 * The list of contexts in which fscrypt uses HKDF.  These values are used as
 * the first byte of the HKDF application-specific info string to guarantee that
 * info strings are never repeated between contexts.  This ensures that all HKDF
 * outputs are unique and cryptographically isolated, i.e. knowledge of one
 * output doesn't reveal another.
 *
 * The trailing comment on each line gives the rest of the info string that is
 * used together with that context byte.
 */
#define HKDF_CONTEXT_KEY_IDENTIFIER        1 /* info=<empty>                */
#define HKDF_CONTEXT_PER_FILE_ENC_KEY        2 /* info=file_nonce                */
#define HKDF_CONTEXT_DIRECT_KEY                3 /* info=mode_num                */
#define HKDF_CONTEXT_IV_INO_LBLK_64_KEY        4 /* info=mode_num||fs_uuid        */
#define HKDF_CONTEXT_DIRHASH_KEY        5 /* info=file_nonce                */
#define HKDF_CONTEXT_IV_INO_LBLK_32_KEY        6 /* info=mode_num||fs_uuid        */
#define HKDF_CONTEXT_INODE_HASH_KEY        7 /* info=<empty>                */

/*
 * Derive @okmlen bytes of output keying material into @okm, using one of the
 * HKDF_CONTEXT_* values as @context and @info/@infolen as the remaining info.
 */
int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context,
                        const u8 *info, unsigned int infolen,
                        u8 *okm, unsigned int okmlen);

/* Counterpart of fscrypt_init_hkdf(). */
void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf);

/* inline_crypt.c */
#ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
/*
 * Out-of-line implementation, declared only when inline crypto support is
 * built.  The #else branch below provides a stub that returns 0.
 */
int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci);

/* Report whether @ci has its ci_inlinecrypt flag set. */
static inline bool
fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
        return ci->ci_inlinecrypt != 0;
}

/*
 * Out-of-line blk-crypto key setup/teardown, declared only when
 * CONFIG_FS_ENCRYPTION_INLINE_CRYPT is enabled.  The #else branch below
 * provides stubs with the same signatures.
 */
int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
                                     const u8 *raw_key,
                                     const struct fscrypt_inode_info *ci);

void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
                                      struct fscrypt_prepared_key *prep_key);

/*
 * Tell whether @prep_key already holds the object this file needs: the
 * blk-crypto key if the file uses inline encryption, the crypto transform
 * otherwise.
 */
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
                        const struct fscrypt_inode_info *ci)
{
        const bool inline_crypt = fscrypt_using_inline_encryption(ci);

        /*
         * Both loads below are smp_load_acquire()'s that pair with the
         * smp_store_release()'s in fscrypt_prepare_inline_crypt_key() and
         * fscrypt_prepare_key().  When this prep_key is a per-mode encryption
         * key, another task may publish blk_key or tfm concurrently with a
         * RELEASE barrier, so an ACQUIRE load is required to safely observe
         * the memory that task published.
         */
        if (!inline_crypt)
                return smp_load_acquire(&prep_key->tfm) != NULL;

        return smp_load_acquire(&prep_key->blk_key) != NULL;
}

#else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/* Stub for builds without inline crypto: always succeeds, @ci is untouched. */
static inline int
fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci)
{
        return 0;
}

/* Stub for builds without inline crypto: no inode ever uses it. */
static inline bool
fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci)
{
        /* @ci is not examined in this configuration. */
        return false;
}

/*
 * Stub for builds without inline crypto.  Reaching it is unexpected, so it
 * warns once and reports -EOPNOTSUPP.
 */
static inline int
fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
                                 const u8 *raw_key,
                                 const struct fscrypt_inode_info *ci)
{
        const int err = -EOPNOTSUPP;

        WARN_ON_ONCE(1);
        return err;
}

/* Stub for builds without inline crypto. */
static inline void
fscrypt_destroy_inline_crypt_key(struct super_block *sb,
                                 struct fscrypt_prepared_key *prep_key)
{
        /* Intentionally empty: this configuration has no blk-crypto key. */
}

/*
 * Variant for builds without inline crypto: only the crypto transform can be
 * prepared, so only ->tfm is checked.  @ci is unused.
 */
static inline bool
fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key,
                        const struct fscrypt_inode_info *ci)
{
        /* ACQUIRE load, same as the inline-crypt variant of this helper. */
        struct crypto_skcipher *tfm = smp_load_acquire(&prep_key->tfm);

        return tfm != NULL;
}
#endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */

/* keyring.c */

/*
 * fscrypt_master_key_secret - secret key material of an in-use master key
 *
 * Embedded in struct fscrypt_master_key as ->mk_secret.
 */
struct fscrypt_master_key_secret {

        /*
         * For v2 policy keys: HKDF context keyed by this master key.
         * For v1 policy keys: not set (hkdf.hmac_tfm == NULL).
         */
        struct fscrypt_hkdf        hkdf;

        /*
         * Size of the raw key in bytes.  This remains set even if ->raw was
         * zeroized due to no longer being needed.  I.e. we still remember the
         * size of the key even if we don't need to remember the key itself.
         */
        u32                        size;

        /* For v1 policy keys: the raw key.  Wiped for v2 policy keys. */
        u8                        raw[FSCRYPT_MAX_KEY_SIZE];

} __randomize_layout;

/*
 * fscrypt_master_key - an in-use master key
 *
 * This represents a master encryption key which has been added to the
 * filesystem.  There are three high-level states that a key can be in:
 *
 * FSCRYPT_KEY_STATUS_PRESENT
 *        Key is fully usable; it can be used to unlock inodes that are encrypted
 *        with it (this includes being able to create new inodes).  ->mk_present
 *        indicates whether the key is in this state.  ->mk_secret exists, the key
 *        is in the keyring, and ->mk_active_refs > 0 due to ->mk_present.
 *
 * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED
 *        Removal of this key has been initiated, but some inodes that were
 *        unlocked with it are still in-use.  Like ABSENT, ->mk_secret is wiped,
 *        and the key can no longer be used to unlock inodes.  Unlike ABSENT, the
 *        key is still in the keyring; ->mk_decrypted_inodes is nonempty; and
 *        ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes.
 *
 *        This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty,
 *        or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key.
 *
 * FSCRYPT_KEY_STATUS_ABSENT
 *        Key is fully removed.  The key is no longer in the keyring,
 *        ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is
 *        wiped, and the key can no longer be used to unlock inodes.
 */
struct fscrypt_master_key {

        /*
         * Link in ->s_master_keys->key_hashtable.
         * Only valid if ->mk_active_refs > 0.
         */
        struct hlist_node                        mk_node;

        /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */
        struct rw_semaphore                        mk_sem;

        /*
         * Active and structural reference counts.  An active ref guarantees
         * that the struct continues to exist, continues to be in the keyring
         * ->s_master_keys, and that any embedded subkeys (e.g.
         * ->mk_direct_keys) that have been prepared continue to exist.
         * A structural ref only guarantees that the struct continues to exist.
         *
         * There is one active ref associated with ->mk_present being true, and
         * one active ref for each inode in ->mk_decrypted_inodes.
         *
         * There is one structural ref associated with the active refcount being
         * nonzero.  Finding a key in the keyring also takes a structural ref,
         * which is then held temporarily while the key is operated on.
         */
        refcount_t                                mk_active_refs;
        refcount_t                                mk_struct_refs;

        /* NOTE(review): presumably used to free the struct via RCU -- confirm */
        struct rcu_head                                mk_rcu_head;

        /*
         * The secret key material.  Wiped as soon as it is no longer needed;
         * for details, see the fscrypt_master_key struct comment.
         *
         * Locking: protected by ->mk_sem.
         */
        struct fscrypt_master_key_secret        mk_secret;

        /*
         * For v1 policy keys: an arbitrary key descriptor which was assigned by
         * userspace (->descriptor).
         *
         * For v2 policy keys: a cryptographic hash of this key (->identifier).
         */
        struct fscrypt_key_specifier                mk_spec;

        /*
         * Keyring which contains a key of type 'key_type_fscrypt_user' for each
         * user who has added this key.  Normally each key will be added by just
         * one user, but it's possible that multiple users share a key, and in
         * that case we need to keep track of those users so that one user can't
         * remove the key before the others want it removed too.
         *
         * This is NULL for v1 policy keys; those can only be added by root.
         *
         * Locking: protected by ->mk_sem.  (We don't just rely on the keyrings
         * subsystem semaphore ->mk_users->sem, as we need support for atomic
         * search+insert along with proper synchronization with other fields.)
         */
        struct key                *mk_users;

        /*
         * List of inodes that were unlocked using this key.  This allows the
         * inodes to be evicted efficiently if the key is removed.
         *
         * NOTE(review): ->mk_decrypted_inodes_lock presumably guards this
         * list -- confirm against keyring.c / keysetup.c.
         */
        struct list_head        mk_decrypted_inodes;
        spinlock_t                mk_decrypted_inodes_lock;

        /*
         * Per-mode encryption keys for the various types of encryption policies
         * that use them.  Allocated and derived on-demand.
         *
         * Each array has FSCRYPT_MODE_MAX + 1 slots; presumably indexed by
         * encryption mode number -- confirm.
         */
        struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1];
        struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1];
        struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1];

        /* Hash key for inode numbers.  Initialized only when needed. */
        siphash_key_t                mk_ino_hash_key;
        bool                        mk_ino_hash_key_initialized;

        /*
         * Whether this key is in the "present" state, i.e. fully usable.  For
         * details, see the fscrypt_master_key struct comment.
         *
         * Locking: protected by ->mk_sem, but can be read locklessly using
         * READ_ONCE().  Writers must use WRITE_ONCE() when concurrent readers
         * are possible.
         */
        bool                        mk_present;

} __randomize_layout;

/*
 * Human-readable name of a key specifier's type: "descriptor", "identifier",
 * or "[unknown]" for any other value of spec->type.
 */
static inline const char *master_key_spec_type(
                                const struct fscrypt_key_specifier *spec)
{
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR)
                return "descriptor";
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
                return "identifier";

        return "[unknown]";
}

/*
 * Length in bytes of the value carried by a key specifier, chosen by
 * spec->type.  An unrecognized type yields 0.
 */
static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec)
{
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR)
                return FSCRYPT_KEY_DESCRIPTOR_SIZE;
        if (spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER)
                return FSCRYPT_KEY_IDENTIFIER_SIZE;

        /* Unknown specifier type */
        return 0;
}

/*
 * Master key reference handling.  See the mk_active_refs / mk_struct_refs
 * comment in struct fscrypt_master_key for the two kinds of reference.
 * NOTE(review): fscrypt_put_master_key() presumably drops a structural ref and
 * the _activeref variant an active ref -- confirm in keyring.c.
 */
void fscrypt_put_master_key(struct fscrypt_master_key *mk);

void fscrypt_put_master_key_activeref(struct super_block *sb,
                                      struct fscrypt_master_key *mk);

/* Look up the master key matching @mk_spec on filesystem @sb. */
struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
                        const struct fscrypt_key_specifier *mk_spec);

/* Test-dummy key helpers. */
int fscrypt_get_test_dummy_key_identifier(
                          u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);

int fscrypt_add_test_dummy_key(struct super_block *sb,
                               struct fscrypt_key_specifier *key_spec);

int fscrypt_verify_key_added(struct super_block *sb,
                             const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]);

/* Init-time (__init) setup for the keyring code. */
int __init fscrypt_init_keyring(void);

/* keysetup.c */

/*
 * Description of one encryption mode.  Instances live in the fscrypt_modes[]
 * array and are referenced from fscrypt_inode_info::ci_mode.
 */
struct fscrypt_mode {
        const char *friendly_name;
        const char *cipher_str;
        int keysize;                /* key size in bytes */
        int security_strength;        /* security strength in bytes */
        int ivsize;                /* IV size in bytes */
        /* NOTE(review): presumably "already logged" flags -- confirm usage */
        int logged_cryptoapi_impl;
        int logged_blk_crypto_native;
        int logged_blk_crypto_fallback;
        enum blk_crypto_mode_num blk_crypto_mode;
};

/* Table of struct fscrypt_mode entries; defined outside this header. */
extern struct fscrypt_mode fscrypt_modes[];

/*
 * Prepare @prep_key from @raw_key for the file described by @ci.  The store
 * that publishes the result pairs with fscrypt_is_key_prepared().
 */
int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key,
                        const u8 *raw_key, const struct fscrypt_inode_info *ci);

void fscrypt_destroy_prepared_key(struct super_block *sb,
                                  struct fscrypt_prepared_key *prep_key);

int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci,
                                 const u8 *raw_key);

int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci,
                               const struct fscrypt_master_key *mk);

void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci,
                               const struct fscrypt_master_key *mk);

/* Called by fscrypt_require_key() with allow_unsupported == false. */
int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported);

/**
 * fscrypt_require_key() - require an inode's encryption key
 * @inode: the inode we need the key for
 *
 * For an unencrypted inode this is a no-op that succeeds.  For an encrypted
 * inode, the encryption key is set up if that has not happened yet, and the
 * key is then required to be present.
 *
 * No locks are needed, and the key will live as long as the struct inode --- so
 * it won't go away from under you.
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 * if a problem occurred while setting up the encryption key.
 */
static inline int fscrypt_require_key(struct inode *inode)
{
        int err;

        /* Nothing to require for unencrypted inodes. */
        if (!IS_ENCRYPTED(inode))
                return 0;

        err = fscrypt_get_encryption_info(inode, false);
        if (err)
                return err;

        /* Setup succeeded, but the key itself may still be unavailable. */
        return fscrypt_has_encryption_key(inode) ? 0 : -ENOKEY;
}

/* keysetup_v1.c */

/* Release a reference to a direct key (see fscrypt_inode_info::ci_direct_key). */
void fscrypt_put_direct_key(struct fscrypt_direct_key *dk);

/* Set up @ci's file key for a v1 policy from the given raw master key. */
int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci,
                              const u8 *raw_master_key);

/*
 * Variant that takes no raw key argument.
 * NOTE(review): by its name it looks the key up in process-subscribed
 * keyrings -- confirm in keysetup_v1.c.
 */
int fscrypt_setup_v1_file_key_via_subscribed_keyrings(
                                struct fscrypt_inode_info *ci);

/* policy.c */

/* Policy comparison and conversion helpers implemented in policy.c. */
bool fscrypt_policies_equal(const union fscrypt_policy *policy1,
                            const union fscrypt_policy *policy2);
int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy,
                               struct fscrypt_key_specifier *key_spec);
const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb);
bool fscrypt_supported_policy(const union fscrypt_policy *policy_u,
                              const struct inode *inode);
/* Fill @policy_u from an on-disk context @ctx_u of @ctx_size bytes. */
int fscrypt_policy_from_context(union fscrypt_policy *policy_u,
                                const union fscrypt_context *ctx_u,
                                int ctx_size);
const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir);

#endif /* _FSCRYPT_PRIVATE_H */



















































































    9 






























































  320 












  319 





























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMAN_H
#define _LINUX_MMAN_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/percpu_counter.h>

#include <linux/atomic.h>
#include <uapi/linux/mman.h>

/*
 * Arrange for legacy / undefined architecture specific flags to be
 * ignored by mmap handling code.
 *
 * Any of these flags that is still undefined at this point is defined as 0,
 * so OR-ing it into a mask (e.g. LEGACY_MAP_MASK) or testing for it has no
 * effect.
 */
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_ABOVE4G
#define MAP_ABOVE4G 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif

/*
 * The historical set of flags that all mmap implementations implicitly
 * support when a ->mmap_validate() op is not provided in file_operations.
 *
 * MAP_EXECUTABLE and MAP_DENYWRITE are completely ignored throughout the
 * kernel.
 *
 * Note that MAP_SYNC is not part of this mask.
 */
#define LEGACY_MAP_MASK (MAP_SHARED \
                | MAP_PRIVATE \
                | MAP_FIXED \
                | MAP_ANONYMOUS \
                | MAP_DENYWRITE \
                | MAP_EXECUTABLE \
                | MAP_UNINITIALIZED \
                | MAP_GROWSDOWN \
                | MAP_LOCKED \
                | MAP_NORESERVE \
                | MAP_POPULATE \
                | MAP_NONBLOCK \
                | MAP_STACK \
                | MAP_HUGETLB \
                | MAP_32BIT \
                | MAP_ABOVE4G \
                | MAP_HUGE_2MB \
                | MAP_HUGE_1GB)

/* Overcommit accounting state; defined outside this header. */
extern int sysctl_overcommit_memory;
extern struct percpu_counter vm_committed_as;

/*
 * On SMP builds the batch size used for vm_committed_as is a variable that
 * mm_compute_batch() is declared to work with; on non-SMP builds the batch
 * is the constant 0 and mm_compute_batch() is an empty inline.
 */
#ifdef CONFIG_SMP
extern s32 vm_committed_as_batch;
extern void mm_compute_batch(int overcommit_policy);
#else
#define vm_committed_as_batch 0
static inline void mm_compute_batch(int overcommit_policy)
{
}
#endif

unsigned long vm_memory_committed(void);

/*
 * Add @pages (which may be negative) to the vm_committed_as per-CPU counter,
 * using vm_committed_as_batch as the batch size.
 */
static inline void vm_acct_memory(long pages)
{
        percpu_counter_add_batch(&vm_committed_as,
                                 pages,
                                 vm_committed_as_batch);
}

static inline void vm_unacct_memory(long pages)
{
        vm_acct_memory(-pages);
}

/*
 * Allow architectures to handle additional protection and flag bits. The
 * overriding macros must be defined in the arch-specific asm/mman.h file.
 *
 * The fallbacks below evaluate to 0, i.e. they contribute no extra bits to
 * calc_vm_prot_bits() and calc_vm_flag_bits().
 */

#ifndef arch_calc_vm_prot_bits
#define arch_calc_vm_prot_bits(prot, pkey) 0
#endif

#ifndef arch_calc_vm_flag_bits
#define arch_calc_vm_flag_bits(file, flags) 0
#endif

#ifndef arch_validate_prot
/*
 * Generic validation of mprotect() protection bits.  PROT_GROWSDOWN and
 * PROT_GROWSUP have already been masked out by the time this is called.
 * @addr is not used by this generic version.
 *
 * Returns true if the prot flags are valid, i.e. contain nothing beyond
 * PROT_READ, PROT_WRITE, PROT_EXEC and PROT_SEM.
 */
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
{
        const unsigned long allowed = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;

        return !(prot & ~allowed);
}
#define arch_validate_prot arch_validate_prot
#endif

#ifndef arch_validate_flags
/*
 * Generic validation of VM_* flags, called from mmap() and mprotect() with
 * the updated vma->vm_flags.
 *
 * Returns true if the VM_* flags are valid.
 */
static inline bool arch_validate_flags(unsigned long flags)
{
        /* The generic version places no restriction on @flags. */
        return true;
}
#define arch_validate_flags arch_validate_flags
#endif

/*
 * Translate one flag bit into another without branching on the value of x.
 * The result equals:
 *      (x & bit1) ? bit2 : 0
 * computed by scaling the masked bit up (multiply) or down (divide) to the
 * position of bit2.  If either bit is 0 the result is 0.
 * ("bit1" and "bit2" must be single bits)
 */
#define _calc_vm_trans(x, bit1, bit2) \
  (((bit1) && (bit2)) \
   ? (((bit1) > (bit2)) \
      ? ((x) & (bit1)) / ((bit1) / (bit2)) \
      : ((x) & (bit1)) * ((bit2) / (bit1))) \
   : 0)

/*
 * Build the internal "vm_flags" bits that correspond to an mmap "prot"
 * argument: PROT_READ/WRITE/EXEC map to VM_READ/WRITE/EXEC, and the
 * architecture may add more via arch_calc_vm_prot_bits().
 */
static inline unsigned long
calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
{
        unsigned long vm_flags;

        vm_flags  = _calc_vm_trans(prot, PROT_READ,  VM_READ);
        vm_flags |= _calc_vm_trans(prot, PROT_WRITE, VM_WRITE);
        vm_flags |= _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC);
        vm_flags |= arch_calc_vm_prot_bits(prot, pkey);

        return vm_flags;
}

/*
 * Build the internal "vm_flags" bits that correspond to an mmap "flags"
 * argument.  MAP_GROWSDOWN, MAP_LOCKED and MAP_SYNC map to their VM_*
 * namesakes, MAP_STACK maps to VM_NOHUGEPAGE, and the architecture may add
 * more via arch_calc_vm_flag_bits().
 */
static inline unsigned long
calc_vm_flag_bits(struct file *file, unsigned long flags)
{
        unsigned long vm_flags;

        vm_flags  = _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN);
        vm_flags |= _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED);
        vm_flags |= _calc_vm_trans(flags, MAP_SYNC, VM_SYNC);
        vm_flags |= _calc_vm_trans(flags, MAP_STACK, VM_NOHUGEPAGE);
        vm_flags |= arch_calc_vm_flag_bits(file, flags);

        return vm_flags;
}

/* Defined outside this header; NOTE(review): presumably the overcommit limit in pages -- confirm. */
unsigned long vm_commit_limit(void);

#ifndef arch_memory_deny_write_exec_supported
/*
 * Generic fallback used when the architecture does not provide its own
 * arch_memory_deny_write_exec_supported(): always reports support.
 */
static inline bool arch_memory_deny_write_exec_supported(void)
{
        /* No architecture-specific restriction in the generic version. */
        return true;
}
#define arch_memory_deny_write_exec_supported arch_memory_deny_write_exec_supported
#endif

/*
 * Decide whether a VMA flag change must be refused under the
 * Memory-Deny-Write-Execute (MDWE) policy, i.e. whether it would create a
 * writable executable mapping or let a mapping gain executable permission.
 *
 * Refused:
 *
 *        a)        mmap(PROT_WRITE | PROT_EXEC)
 *
 *        b)        mmap(PROT_WRITE)
 *                mprotect(PROT_EXEC)
 *
 *        c)        mmap(PROT_WRITE)
 *                mprotect(PROT_READ)
 *                mprotect(PROT_EXEC)
 *
 * Still permitted:
 *
 *        d)        mmap(PROT_READ | PROT_EXEC)
 *                mmap(PROT_READ | PROT_EXEC | PROT_BTI)
 *
 * The check only applies when the current process has the MDWE protection
 * mask set (MMF_HAS_MDWE in current->mm->flags).
 *
 * @old holds the VMA flags the VMA originally possessed, @new the flags that
 * are proposed.
 *
 * Return: false if proposed change is OK, true if not ok and should be denied.
 */
static inline bool map_deny_write_exec(unsigned long old, unsigned long new)
{
        /* Without MDWE there is nothing to deny. */
        if (!test_bit(MMF_HAS_MDWE, &current->mm->flags))
                return false;

        /* A non-executable result is always acceptable. */
        if (!(new & VM_EXEC))
                return false;

        /*
         * The result is executable: deny it if it is also writable, or if the
         * VMA was not executable before.
         */
        return (new & VM_WRITE) || !(old & VM_EXEC);
}

#endif /* _LINUX_MMAN_H */






















































































































    4 




















    4 
    4 



















































































    4 


















    4 












    4 


























































































































































































































































    4 












    4 
















    4 









































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Fast Userspace Mutexes (which I call "Futexes!").
 *  (C) Rusty Russell, IBM 2002
 *
 *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 *  Removed page pinning, fix privately mapped COW pages and other cleanups
 *  (C) Copyright 2003, 2004 Jamie Lokier
 *
 *  Robust futex support started by Ingo Molnar
 *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 *  PI-futex support started by Ingo Molnar and Thomas Gleixner
 *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
 *
 *  PRIVATE futexes by Eric Dumazet
 *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
 *
 *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
 *  Copyright (C) IBM Corporation, 2009
 *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 *  enough at me, Linus for the original (flawed) idea, Matthew
 *  Kirkwood for proof-of-concept implementation.
 *
 *  "The futexes are also cursed."
 *  "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/plist.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
        /* Base of the hash bucket array; indexed in futex_hash(). */
        struct futex_hash_bucket *queues;
        /* Mask ANDed with the key hash in futex_hash() to select a bucket. */
        unsigned long            hashmask;
} __futex_data __read_mostly __aligned(2*sizeof(long));
/* Shorthand accessors for the two members of __futex_data. */
#define futex_queues   (__futex_data.queues)
#define futex_hashmask (__futex_data.hashmask)


/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
        /* Fault-injection attributes consulted by should_fail_futex(). */
        struct fault_attr attr;

        /* When true, should_fail_futex() never fails non-shared futexes. */
        bool ignore_private;
} fail_futex = {
        .attr = FAULT_ATTR_INITIALIZER,
        .ignore_private = false,
};

/*
 * Handler for the "fail_futex=" boot parameter: passes the option string
 * to setup_fault_attr() so that it can configure fail_futex.attr.
 */
static int __init setup_fail_futex(char *str)
{
        struct fault_attr *attr = &fail_futex.attr;

        return setup_fault_attr(attr, str);
}
__setup("fail_futex=", setup_fail_futex);

/*
 * Decide whether a futex operation should be failed artificially.
 * @fshared:        true when the operation targets a shared futex
 *
 * Non-shared futexes are exempt while fail_futex.ignore_private is set;
 * in every other case the decision is left to should_fail().
 */
bool should_fail_futex(bool fshared)
{
        bool exempt = !fshared && fail_futex.ignore_private;

        return exempt ? false : should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Create the "fail_futex" fault-injection debugfs directory and add the
 * "ignore-private" boolean knob to it.
 *
 * Return: 0 on success, otherwise the error encoded in the pointer
 * returned by fault_create_debugfs_attr().
 */
static int __init fail_futex_debugfs(void)
{
        struct dentry *dir = fault_create_debugfs_attr("fail_futex", NULL,
                                                       &fail_futex.attr);

        if (IS_ERR(dir))
                return PTR_ERR(dir);

        /* Regular file, readable and writable by its owner only */
        debugfs_create_bool("ignore-private", S_IFREG | S_IRUSR | S_IWUSR,
                            dir, &fail_futex.ignore_private);
        return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */

/**
 * futex_hash - Return the hash bucket in the global hash
 * @key:        Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
        u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
                          key->both.offset);

        return &futex_queues[hash & futex_hashmask];
}


/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:        ptr to the given timeout value
 * @timeout:        the hrtimer_sleeper structure to be set up
 * @flags:        futex flags
 * @range_ns:        optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *           value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
                  int flags, u64 range_ns)
{
        int clock_id;

        /* No timeout requested: nothing to set up */
        if (!time)
                return NULL;

        /* FLAGS_CLOCKRT selects the realtime clock, otherwise monotonic */
        if (flags & FLAGS_CLOCKRT)
                clock_id = CLOCK_REALTIME;
        else
                clock_id = CLOCK_MONOTONIC;

        hrtimer_setup_sleeper_on_stack(timeout, clock_id, HRTIMER_MODE_ABS);

        /*
         * With range_ns == 0 this is effectively the same as calling
         * hrtimer_set_expires().
         */
        hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

        return timeout;
}

/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that false-negatives
 * are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
        static atomic64_t i_seq;
        u64 seq, expected;

        /* Fast path: the inode has been numbered already */
        seq = atomic64_read(&inode->i_sequence);
        if (likely(seq))
                return seq;

        /* Draw a fresh number; zero means "unset", so warn and draw again */
        do {
                seq = atomic64_inc_return(&i_seq);
        } while (WARN_ON_ONCE(!seq));

        /*
         * Install it only if the inode is still unnumbered. If another
         * caller got there first, @expected is updated to the value that
         * is installed and that value is returned instead.
         */
        expected = 0;
        if (atomic64_try_cmpxchg_relaxed(&inode->i_sequence, &expected, seq))
                return seq;

        return expected;
}

/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:        virtual address of the futex
 * @flags:        FLAGS_*
 * @key:        address where result is stored.
 * @rw:                mapping needs to be read/write (values: FUTEX_READ,
 *              FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
                  enum futex_access rw)
{
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
        struct page *page;
        struct folio *folio;
        struct address_space *mapping;
        int err, ro = 0;
        bool fshared;

        fshared = flags & FLAGS_SHARED;

        /*
         * The futex address must be "naturally" aligned.
         */
        key->both.offset = address % PAGE_SIZE;
        if (unlikely((address % sizeof(u32)) != 0))
                return -EINVAL;
        /* From here on @address is the page-aligned base of the futex page. */
        address -= key->both.offset;

        if (unlikely(!access_ok(uaddr, sizeof(u32))))
                return -EFAULT;

        /* Fault injection hook (see should_fail_futex()). */
        if (unlikely(should_fail_futex(fshared)))
                return -EFAULT;

        /*
         * PROCESS_PRIVATE futexes are fast.
         * As the mm cannot disappear under us and the 'key' only needs
         * virtual address, we don't even have to find the underlying vma.
         * Note : We do have to check 'uaddr' is a valid user address,
         *        but access_ok() should be faster than find_vma()
         */
        if (!fshared) {
                /*
                 * On no-MMU, shared futexes are treated as private, therefore
                 * we must not include the current process in the key. Since
                 * there is only one address space, the address is a unique key
                 * on its own.
                 */
                if (IS_ENABLED(CONFIG_MMU))
                        key->private.mm = mm;
                else
                        key->private.mm = NULL;

                key->private.address = address;
                return 0;
        }

        /*
         * Shared futex: look up the backing page. Every "goto again" below
         * first drops the folio reference and then restarts from here.
         */
again:
        /* Ignore any VERIFY_READ mapping (futex common case) */
        if (unlikely(should_fail_futex(true)))
                return -EFAULT;

        err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
        /*
         * If write access is not required (eg. FUTEX_WAIT), try
         * and get read-only access.
         */
        if (err == -EFAULT && rw == FUTEX_READ) {
                err = get_user_pages_fast(address, 1, 0, &page);
                /* Remember that only read access was obtained. */
                ro = 1;
        }
        if (err < 0)
                return err;
        else
                err = 0;

        /*
         * The treatment of mapping from this point on is critical. The folio
         * lock protects many things but in this context the folio lock
         * stabilizes mapping, prevents inode freeing in the shared
         * file-backed region case and guards against movement to swap cache.
         *
         * Strictly speaking the folio lock is not needed in all cases being
         * considered here and folio lock forces unnecessary serialization.
         * From this point on, mapping will be re-verified if necessary and
         * folio lock will be acquired only if it is unavoidable
         *
         * Mapping checks require the folio so it is looked up now. For
         * anonymous pages, it does not matter if the folio is split
         * in the future as the key is based on the address. For
         * filesystem-backed pages, the precise page is required as the
         * index of the page determines the key.
         */
        folio = page_folio(page);
        mapping = READ_ONCE(folio->mapping);

        /*
         * If folio->mapping is NULL, then it cannot be an anonymous
         * page; but it might be the ZERO_PAGE or in the gate area or
         * in a special mapping (all cases which we are happy to fail);
         * or it may have been a good file page when get_user_pages_fast
         * found it, but truncated or holepunched or subjected to
         * invalidate_complete_page2 before we got the folio lock (also
         * cases which we are happy to fail).  And we hold a reference,
         * so refcount care in invalidate_inode_page's remove_mapping
         * prevents drop_caches from setting mapping to NULL beneath us.
         *
         * The case we do have to guard against is when memory pressure made
         * shmem_writepage move it from filecache to swapcache beneath us:
         * an unlikely race, but we do need to retry for folio->mapping.
         */
        if (unlikely(!mapping)) {
                int shmem_swizzled;

                /*
                 * Folio lock is required to identify which special case above
                 * applies. If this is really a shmem page then the folio lock
                 * will prevent unexpected transitions.
                 */
                folio_lock(folio);
                shmem_swizzled = folio_test_swapcache(folio) || folio->mapping;
                folio_unlock(folio);
                folio_put(folio);

                if (shmem_swizzled)
                        goto again;

                return -EFAULT;
        }

        /*
         * Private mappings are handled in a simple way.
         *
         * If the futex key is stored in anonymous memory, then the associated
         * object is the mm which is implicitly pinned by the calling process.
         *
         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
         * it's a read-only handle, it's expected that futexes attach to
         * the object not the particular process.
         */
        if (folio_test_anon(folio)) {
                /*
                 * A RO anonymous page will never change and thus doesn't make
                 * sense for futex operations.
                 */
                if (unlikely(should_fail_futex(true)) || ro) {
                        err = -EFAULT;
                        goto out;
                }

                key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
                key->private.mm = mm;
                key->private.address = address;

        } else {
                struct inode *inode;

                /*
                 * The associated futex object in this case is the inode and
                 * the folio->mapping must be traversed. Ordinarily this should
                 * be stabilised under folio lock but it's not strictly
                 * necessary in this case as we just want to pin the inode, not
                 * update i_pages or anything like that.
                 *
                 * The RCU read lock is taken as the inode is finally freed
                 * under RCU. If the mapping still matches expectations then the
                 * mapping->host can be safely accessed as being a valid inode.
                 */
                rcu_read_lock();

                /* The mapping changed since it was sampled above: start over. */
                if (READ_ONCE(folio->mapping) != mapping) {
                        rcu_read_unlock();
                        folio_put(folio);

                        goto again;
                }

                /* No host inode behind the mapping: start over. */
                inode = READ_ONCE(mapping->host);
                if (!inode) {
                        rcu_read_unlock();
                        folio_put(folio);

                        goto again;
                }

                key->both.offset |= FUT_OFF_INODE; /* inode-based key */
                key->shared.i_seq = get_inode_sequence_number(inode);
                key->shared.pgoff = page_pgoff(folio, page);
                rcu_read_unlock();
        }

out:
        /* Drop the folio reference held since get_user_pages_fast(). */
        folio_put(folio);
        return err;
}

/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:        pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
        int ret;

        /* Resolve a write fault on @uaddr while holding the mmap read lock */
        mmap_read_lock(mm);
        ret = fixup_user_fault(mm, address, FAULT_FLAG_WRITE, NULL);
        mmap_read_unlock(mm);

        /* Only negative results are passed on; anything else is success */
        if (ret < 0)
                return ret;

        return 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:                the hash bucket the futex_q's reside in
 * @key:        the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
        struct futex_q *waiter;

        /* The first entry on the bucket's plist whose key matches wins */
        plist_for_each_entry(waiter, &hb->chain, list) {
                if (!futex_match(&waiter->key, key))
                        continue;

                return waiter;
        }

        return NULL;
}

/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret: owner's current futex lock status
 * @exiting:        Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
        struct mutex *exit_mutex;

        /* Anything but -EBUSY must not come with an exiting task */
        if (ret != -EBUSY) {
                WARN_ON_ONCE(exiting);
                return;
        }

        /* -EBUSY without an exiting task is a caller bug */
        if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
                return;

        exit_mutex = &exiting->futex_exit_mutex;

        /*
         * Acquire and immediately release the mutex: the only purpose is
         * to block until it can be taken. Checking the task's futex state
         * here would be pointless. A waiter that arrived while the task
         * was in exec()->exec_futex_release() may observe any
         * FUTEX_STATE_* value once it owns the mutex. That is highly
         * unlikely and harmless; it merely costs one more round through
         * the futex maze.
         */
        mutex_lock(exit_mutex);
        mutex_unlock(exit_mutex);

        /* Drop the reference the caller was holding on @exiting */
        put_task_struct(exiting);
}

/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:        The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
        struct futex_hash_bucket *bucket;

        /* Warn and bail out if there is no lock or @q is not on a list */
        if (WARN_ON_SMP(!q->lock_ptr))
                return;
        if (WARN_ON(plist_node_empty(&q->list)))
                return;

        lockdep_assert_held(q->lock_ptr);

        /* lock_ptr points at the lock embedded in the owning bucket */
        bucket = container_of(q->lock_ptr, struct futex_hash_bucket, lock);

        plist_del(&q->list, &bucket->chain);
        futex_hb_waiters_dec(bucket);
}

/*
 * futex_q_lock() - Look up and lock the hash bucket for a futex_q
 * @q:        The futex_q; the key must be already stored in q->key.
 *
 * Bumps the bucket's waiter count, records the bucket lock in q->lock_ptr
 * and acquires that lock.
 *
 * Return: the hash bucket for q->key, with hb->lock held.
 */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
        __acquires(&hb->lock)
{
        struct futex_hash_bucket *hb;

        hb = futex_hash(&q->key);

        /*
         * Increment the counter before taking the lock so that
         * a potential waker won't miss a to-be-slept task that is
         * waiting for the spinlock. This is safe as all futex_q_lock()
         * users end up calling futex_queue(). Similarly, for housekeeping,
         * decrement the counter at futex_q_unlock() when some error has
         * occurred and we don't end up adding the task to the list.
         */
        futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

        /* Publish which bucket lock protects @q before acquiring it. */
        q->lock_ptr = &hb->lock;

        spin_lock(&hb->lock);
        return hb;
}

/*
 * futex_q_unlock() - Release a bucket locked by futex_q_lock()
 * @hb:        The hash bucket whose lock is held
 *
 * Drops hb->lock and then decrements the bucket's waiter count, undoing
 * the increment done in futex_q_lock().
 */
void futex_q_unlock(struct futex_hash_bucket *hb)
        __releases(&hb->lock)
{
        spin_unlock(&hb->lock);
        futex_hb_waiters_dec(hb);
}

/*
 * __futex_queue() - Add a futex_q to a hash bucket's priority list
 * @q:                The futex_q to enqueue
 * @hb:                The destination hash bucket
 * @task:        Task stored in q->task
 */
void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
                   struct task_struct *task)
{
        /*
         * Real-time threads (normal_prio below MAX_RT_PRIO) are registered
         * with their own priority; every other thread is registered with
         * MAX_RT_PRIO. As a result RT threads are woken first, in priority
         * order, and the remaining threads are woken last, in FIFO order.
         */
        int prio = min(current->normal_prio, MAX_RT_PRIO);

        plist_node_init(&q->list, prio);
        plist_add(&q->list, &hb->chain);
        q->task = task;
}

/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:        The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we unqueued it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
        spinlock_t *lock_ptr;
        int ret = 0;

        /* In the common case we don't take the spinlock, which is nice. */
retry:
        /*
         * q->lock_ptr can change between this read and the following spin_lock.
         * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
         * optimizing lock_ptr out of the logic below.
         */
        lock_ptr = READ_ONCE(q->lock_ptr);
        /* A NULL lock_ptr takes the "already removed" path and returns 0. */
        if (lock_ptr != NULL) {
                spin_lock(lock_ptr);
                /*
                 * q->lock_ptr can change between reading it and
                 * spin_lock(), causing us to take the wrong lock.  This
                 * corrects the race condition.
                 *
                 * Reasoning goes like this: if we have the wrong lock,
                 * q->lock_ptr must have changed (maybe several times)
                 * between reading it and the spin_lock().  It can
                 * change again after the spin_lock() but only if it was
                 * already changed before the spin_lock().  It cannot,
                 * however, change back to the original value.  Therefore
                 * we can detect whether we acquired the correct lock.
                 */
                if (unlikely(lock_ptr != q->lock_ptr)) {
                        spin_unlock(lock_ptr);
                        goto retry;
                }
                __futex_unqueue(q);

                /* A futex_q with pi_state attached must not reach this path. */
                BUG_ON(q->pi_state);

                spin_unlock(lock_ptr);
                ret = 1;
        }

        return ret;
}

/*
 * PI futexes can not be requeued, so they have to take themselves off the
 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry.
 *
 * The pi_state attached to @q is released and q->pi_state is cleared.
 */
void futex_unqueue_pi(struct futex_q *q)
{
        struct futex_pi_state *pi_state;

        /*
         * The dequeue has to be conditional. When the lock was not
         * acquired (timeout or signal), the rt_waiter is removed before
         * the futex_q. An unlocker that observes this after dropping the
         * rtmutex wait lock, and before acquiring the hash bucket lock,
         * dequeues the futex_q itself to keep the state consistent vs.
         * userspace.
         */
        if (!plist_node_empty(&q->list))
                __futex_unqueue(q);

        pi_state = q->pi_state;
        BUG_ON(!pi_state);

        put_pi_state(pi_state);
        q->pi_state = NULL;
}

/*
 * Constants for the pending_op argument of handle_futex_death():
 *  - HANDLE_DEATH_PENDING: the entry was read from list_op_pending
 *  - HANDLE_DEATH_LIST:    the entry was found while walking the robust list
 */
#define HANDLE_DEATH_PENDING        true
#define HANDLE_DEATH_LIST        false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so.
 *
 * @uaddr:        user space address of the futex word
 * @curr:        the dying task
 * @pi:                true if the entry is flagged as a PI futex
 * @pending_op:        HANDLE_DEATH_PENDING or HANDLE_DEATH_LIST
 *
 * Return: 0 when the entry was handled or did not need handling, -1 when
 * @uaddr is misaligned or cannot be accessed, or the unexpected error
 * returned by futex_cmpxchg_value_locked().
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
                              bool pi, bool pending_op)
{
        u32 uval, nval, mval;
        pid_t owner;
        int err;

        /* Futex address must be 32bit aligned */
        if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
                return -1;

retry:
        if (get_user(uval, uaddr))
                return -1;

        /*
         * Special case for regular (non PI) futexes. The unlock path in
         * user space has two race scenarios:
         *
         * 1. The unlock path releases the user space futex value and
         *    before it can execute the futex() syscall to wake up
         *    waiters it is killed.
         *
         * 2. A woken up waiter is killed before it can acquire the
         *    futex in user space.
         *
         * In the second case, the wake up notification could be generated
         * by the unlock path in user space after setting the futex value
         * to zero or by the kernel after setting the OWNER_DIED bit below.
         *
         * In both cases the TID validation below prevents a wakeup of
         * potential waiters which can cause these waiters to block
         * forever.
         *
         * In both cases the following conditions are met:
         *
         *        1) task->robust_list->list_op_pending != NULL
         *           @pending_op == true
         *        2) The owner part of user space futex value == 0
         *        3) Regular futex: @pi == false
         *
         * If these conditions are met, it is safe to attempt waking up a
         * potential waiter without touching the user space futex value and
         * trying to set the OWNER_DIED bit. If the futex value is zero,
         * the rest of the user space mutex state is consistent, so a woken
         * waiter will just take over the uncontended futex. Setting the
         * OWNER_DIED bit would create inconsistent state and malfunction
         * of the user space owner died handling. Otherwise, the OWNER_DIED
         * bit is already set, and the woken waiter is expected to deal with
         * this.
         */
        owner = uval & FUTEX_TID_MASK;

        if (pending_op && !pi && !owner) {
                futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
                           FUTEX_BITSET_MATCH_ANY);
                return 0;
        }

        /* Not owned by the dying task: nothing to do for this entry. */
        if (owner != task_pid_vnr(curr))
                return 0;

        /*
         * Ok, this dying thread is truly holding a futex
         * of interest. Set the OWNER_DIED bit atomically
         * via cmpxchg, and if the value had FUTEX_WAITERS
         * set, wake up a waiter (if any). (We have to do a
         * futex_wake() even if OWNER_DIED is already set -
         * to handle the rare but possible case of recursive
         * thread-death.) The rest of the cleanup is done in
         * userspace.
         */
        mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

        /*
         * We are not holding a lock here, but we want to have
         * the pagefault_disable/enable() protection because
         * we want to handle the fault gracefully. If the
         * access fails we try to fault in the futex with R/W
         * verification via get_user_pages. get_user() above
         * does not guarantee R/W access. If that fails we
         * give up and leave the futex locked.
         */
        if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
                switch (err) {
                case -EFAULT:
                        if (fault_in_user_writeable(uaddr))
                                return -1;
                        goto retry;

                case -EAGAIN:
                        cond_resched();
                        goto retry;

                default:
                        WARN_ON_ONCE(1);
                        return err;
                }
        }

        /* The futex word changed under us: re-read it and start over. */
        if (nval != uval)
                goto retry;

        /*
         * Wake robust non-PI futexes here. The wakeup of
         * PI futexes happens in exit_pi_state():
         */
        if (!pi && (uval & FUTEX_WAITERS)) {
                futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
                           FUTEX_BITSET_MATCH_ANY);
        }

        return 0;
}

/*
 * Read one robust-list pointer from user space.
 * @entry:        receives the pointer with bit 0 cleared
 * @head:        user address to read the pointer from
 * @pi:                receives bit 0 of the raw value, which flags a PI futex
 *
 * Return: 0 on success, -EFAULT if the user memory cannot be read.
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
                                     struct robust_list __user * __user *head,
                                     unsigned int *pi)
{
        unsigned long raw;

        if (get_user(raw, (unsigned long __user *)head))
                return -EFAULT;

        /* Split the raw word into the pointer proper and the PI flag */
        *entry = (void __user *)(raw & ~1UL);
        *pi = raw & 1;

        return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
        struct robust_list_head __user *head = curr->robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
        unsigned int next_pi;
        unsigned long futex_offset;
        int rc;

        /*
         * Fetch the list head (which was registered earlier, via
         * sys_set_robust_list()):
         */
        if (fetch_robust_entry(&entry, &head->list.next, &pi))
                return;
        /*
         * Fetch the relative futex offset:
         */
        if (get_user(futex_offset, &head->futex_offset))
                return;
        /*
         * Fetch any possibly pending lock-add first, and handle it
         * if it exists:
         */
        if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
                return;

        next_entry = NULL;        /* avoid warning with gcc */
        while (entry != &head->list) {
                /*
                 * Fetch the next entry in the list before calling
                 * handle_futex_death:
                 */
                rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
                /*
                 * A pending lock might already be on the list, so
                 * don't process it twice:
                 */
                if (entry != pending) {
                        if (handle_futex_death((void __user *)entry + futex_offset,
                                                curr, pi, HANDLE_DEATH_LIST))
                                return;
                }
                if (rc)
                        return;
                entry = next_entry;
                pi = next_pi;
                /*
                 * Avoid excessively long or circular lists:
                 */
                if (!--limit)
                        break;

                cond_resched();
        }

        if (pending) {
                handle_futex_death((void __user *)pending + futex_offset,
                                   curr, pip, HANDLE_DEATH_PENDING);
        }
}

#ifdef CONFIG_COMPAT
/*
 * Compute the user address of the futex word that belongs to a compat
 * robust-list entry: the compat representation of @entry plus
 * @futex_offset, converted back into a user pointer.
 */
static void __user *futex_uaddr(struct robust_list __user *entry,
                                compat_long_t futex_offset)
{
        compat_uptr_t target = ptr_to_compat(entry);

        target += futex_offset;
        return compat_ptr(target);
}

/*
 * Read one compat robust-list pointer from user space.
 * @uentry:        receives the raw compat value
 * @entry:        receives the pointer with bit 0 cleared
 * @head:        user address to read the value from
 * @pi:                receives bit 0 of the raw value, which flags a PI futex
 *
 * Return: 0 on success, -EFAULT if the user memory cannot be read.
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
                   compat_uptr_t __user *head, unsigned int *pi)
{
        compat_uptr_t raw;

        if (get_user(*uentry, head))
                return -EFAULT;

        /* Split the raw value into the pointer proper and the PI flag */
        raw = *uentry;
        *entry = compat_ptr(raw & ~1);
        *pi = (unsigned int)raw & 1;

        return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
        struct compat_robust_list_head __user *head = curr->compat_robust_list;
        struct robust_list __user *entry, *next_entry, *pending;
        unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
        unsigned int next_pi;
        compat_uptr_t uentry, next_uentry, upending;
        compat_long_t futex_offset;
        int rc;

        /*
         * Fetch the list head (which was registered earlier, via
         * sys_set_robust_list()):
         */
        if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
                return;
        /*
         * Fetch the relative futex offset:
         */
        if (get_user(futex_offset, &head->futex_offset))
                return;
        /*
         * Fetch any possibly pending lock-add first, and handle it
         * if it exists:
         */
        if (compat_fetch_robust_entry(&upending, &pending,
                               &head->list_op_pending, &pip))
                return;

        next_entry = NULL;        /* avoid warning with gcc */
        while (entry != (struct robust_list __user *) &head->list) {
                /*
                 * Fetch the next entry in the list before calling
                 * handle_futex_death:
                 */
                rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
                        (compat_uptr_t __user *)&entry->next, &next_pi);
                /*
                 * A pending lock might already be on the list, so
                 * dont process it twice:
                 */
                if (entry != pending) {
                        void __user *uaddr = futex_uaddr(entry, futex_offset);

                        if (handle_futex_death(uaddr, curr, pi,
                                               HANDLE_DEATH_LIST))
                                return;
                }
                if (rc)
                        return;
                uentry = next_uentry;
                entry = next_entry;
                pi = next_pi;
                /*
                 * Avoid excessively long or circular lists:
                 */
                if (!--limit)
                        break;

                cond_resched();
        }
        if (pending) {
                void __user *uaddr = futex_uaddr(pending, futex_offset);

                handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
        }
}
#endif

#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 *
 * Drains curr->pi_state_list. Each pi_state found at the head of the list
 * is pinned with a reference, unlinked from the list, has its owner
 * cleared, has its rt_mutex released via rt_mutex_futex_unlock() and is
 * then unpinned again.
 *
 * Loop invariant: curr->pi_lock is held with interrupts disabled whenever
 * the loop condition is evaluated.
 */
static void exit_pi_state_list(struct task_struct *curr)
{
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
        struct futex_hash_bucket *hb;
        union futex_key key = FUTEX_KEY_INIT;

        /*
         * We are a ZOMBIE and nobody can enqueue itself on
         * pi_state_list anymore, but we have to be careful
         * versus waiters unqueueing themselves:
         */
        raw_spin_lock_irq(&curr->pi_lock);
        while (!list_empty(head)) {
                /* Always work on the first entry; it is unlinked below. */
                next = head->next;
                pi_state = list_entry(next, struct futex_pi_state, list);
                /* Copy the key so the hash lookup works on a local value. */
                key = pi_state->key;
                hb = futex_hash(&key);

                /*
                 * We can race against put_pi_state() removing itself from the
                 * list (a waiter going away). put_pi_state() will first
                 * decrement the reference count and then modify the list, so
                 * its possible to see the list entry but fail this reference
                 * acquire.
                 *
                 * In that case; drop the locks to let put_pi_state() make
                 * progress and retry the loop.
                 */
                if (!refcount_inc_not_zero(&pi_state->refcount)) {
                        raw_spin_unlock_irq(&curr->pi_lock);
                        cpu_relax();
                        raw_spin_lock_irq(&curr->pi_lock);
                        continue;
                }
                /*
                 * pi_lock has to be dropped here because hb->lock and
                 * the wait_lock are acquired before it below.
                 */
                raw_spin_unlock_irq(&curr->pi_lock);

                /* Acquisition order: hb->lock, wait_lock, curr->pi_lock. */
                spin_lock(&hb->lock);
                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
                raw_spin_lock(&curr->pi_lock);
                /*
                 * We dropped the pi-lock, so re-check whether this
                 * task still owns the PI-state:
                 */
                if (head->next != next) {
                        /* retain curr->pi_lock for the loop invariant */
                        /*
                         * Plain (non-irq) unlock: interrupts stay disabled
                         * because curr->pi_lock remains held.
                         */
                        raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
                        spin_unlock(&hb->lock);
                        put_pi_state(pi_state);
                        continue;
                }

                WARN_ON(pi_state->owner != curr);
                WARN_ON(list_empty(&pi_state->list));
                list_del_init(&pi_state->list);
                pi_state->owner = NULL;

                /* Release in reverse order; irqs are re-enabled with wait_lock. */
                raw_spin_unlock(&curr->pi_lock);
                raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                spin_unlock(&hb->lock);

                rt_mutex_futex_unlock(&pi_state->pi_mutex);
                /* Drop the reference taken by refcount_inc_not_zero() above. */
                put_pi_state(pi_state);

                /* Re-establish the loop invariant. */
                raw_spin_lock_irq(&curr->pi_lock);
        }
        raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr)
{
        /* No PI state exists when CONFIG_FUTEX_PI is not set. */
}
#endif

/*
 * Release the futex state still attached to @tsk: the robust list, the
 * compat robust list (CONFIG_COMPAT only) and any PI state queued on
 * tsk->pi_state_list. Each robust list pointer is reset to NULL once its
 * list has been processed.
 */
static void futex_cleanup(struct task_struct *tsk)
{
        struct list_head *pi_states = &tsk->pi_state_list;

        /* Having a robust list at this point is the unlikely case. */
        if (unlikely(tsk->robust_list)) {
                exit_robust_list(tsk);
                tsk->robust_list = NULL;
        }

#ifdef CONFIG_COMPAT
        /* Same treatment for the compat robust list. */
        if (unlikely(tsk->compat_robust_list)) {
                compat_exit_robust_list(tsk);
                tsk->compat_robust_list = NULL;
        }
#endif

        /* Only walk the PI state list when something is queued on it. */
        if (unlikely(!list_empty(pi_states)))
                exit_pi_state_list(tsk);
}

/**
 * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
 * @tsk:        task to set the state on
 *
 * The state is written without taking a lock. Futex waiters which see an
 * exiting task keep looping until the cleanup is finished, so at worst a
 * waiter spins in its wait loop until this store becomes visible to it.
 *
 * Called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort only: the futex exit code may or may not have run
 * already. When OWNER_DIED has been set on a futex the waiter can take it
 * over; otherwise the problem is handed back to user space. If the futex
 * exit code has not run yet, an already queued waiter may block forever,
 * and nothing can be done about that here.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
        const int exit_in_progress = (tsk->futex_state == FUTEX_STATE_EXITING);

        /* FUTEX_STATE_EXITING means futex_exit_mutex is held: release it */
        if (exit_in_progress)
                mutex_unlock(&tsk->futex_exit_mutex);

        tsk->futex_state = FUTEX_STATE_DEAD;
}

/*
 * Enter the futex cleanup phase for @tsk: take tsk->futex_exit_mutex and
 * then switch tsk->futex_state to FUTEX_STATE_EXITING under tsk->pi_lock.
 *
 * Returns with tsk->futex_exit_mutex held; futex_cleanup_end() releases it.
 */
static void futex_cleanup_begin(struct task_struct *tsk)
{
        /*
         * Prevent various race issues against a concurrent incoming waiter
         * including live locks by forcing the waiter to block on
         * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
         * attach_to_pi_owner().
         */
        mutex_lock(&tsk->futex_exit_mutex);

        /*
         * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
         *
         * This ensures that all subsequent checks of tsk->futex_state in
         * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
         * tsk->pi_lock held.
         *
         * It guarantees also that a pi_state which was queued right before
         * the state change under tsk->pi_lock by a concurrent waiter must
         * be observed in exit_pi_state_list().
         */
        raw_spin_lock_irq(&tsk->pi_lock);
        tsk->futex_state = FUTEX_STATE_EXITING;
        raw_spin_unlock_irq(&tsk->pi_lock);
}

/*
 * Leave the futex cleanup phase for @tsk: publish the final futex state
 * @state and release tsk->futex_exit_mutex, which futex_cleanup_begin()
 * acquired.
 */
static void futex_cleanup_end(struct task_struct *tsk, int state)
{
        /*
         * Lockless store. The only side effect is that an observer might
         * take another loop until it becomes visible.
         */
        tsk->futex_state = state;
        /*
         * Drop the exit protection. This unblocks waiters which observed
         * FUTEX_STATE_EXITING to reevaluate the state.
         */
        mutex_unlock(&tsk->futex_exit_mutex);
}

/*
 * Futex cleanup for exec(): runs the begin/cleanup/end sequence on @tsk
 * and leaves the task in FUTEX_STATE_OK afterwards.
 */
void futex_exec_release(struct task_struct *tsk)
{
        /*
         * The state handling is done for consistency, but in the case of
         * exec() there is no way to prevent further damage as the PID stays
         * the same. But for the unlikely and arguably buggy case that a
         * futex is held on exec(), this provides at least as much state
         * consistency protection which is possible.
         */
        futex_cleanup_begin(tsk);
        futex_cleanup(tsk);
        /*
         * Reset the state to FUTEX_STATE_OK. The task is alive and about
         * exec a new binary.
         */
        futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

/*
 * Futex cleanup for an exiting task: runs the begin/cleanup/end sequence
 * on @tsk and leaves the task in FUTEX_STATE_DEAD.
 */
void futex_exit_release(struct task_struct *tsk)
{
        futex_cleanup_begin(tsk);
        futex_cleanup(tsk);
        futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}

/*
 * Boot-time setup of the global futex hash table.
 *
 * Requests 16 buckets with CONFIG_BASE_SMALL, otherwise 256 buckets per
 * possible CPU rounded up to a power of two. The bucket count actually
 * used is derived from the shift reported by alloc_large_system_hash().
 * Every bucket gets its waiter count, priority list and spinlock
 * initialised, and futex_hashmask is set to the bucket count minus one.
 *
 * Always returns 0.
 */
static int __init futex_init(void)
{
        unsigned int futex_shift;
        unsigned long nr_buckets;
        unsigned long idx;

#ifdef CONFIG_BASE_SMALL
        nr_buckets = 16;
#else
        nr_buckets = roundup_pow_of_two(256 * num_possible_cpus());
#endif

        futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
                                               nr_buckets, 0, 0,
                                               &futex_shift, NULL,
                                               nr_buckets, nr_buckets);

        /* From here on use the size the allocator reported back. */
        nr_buckets = 1UL << futex_shift;

        idx = 0;
        while (idx < nr_buckets) {
                atomic_set(&futex_queues[idx].waiters, 0);
                plist_head_init(&futex_queues[idx].chain);
                spin_lock_init(&futex_queues[idx].lock);
                idx++;
        }

        futex_hashmask = nr_buckets - 1;

        return 0;
}
core_initcall(futex_init);

























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
#include <linux/fs_pin.h>

/* Defined elsewhere; mnt_notify_add() queues mounts here via mount::to_notify. */
extern struct list_head notify_list;

/* Flag word type (sparse __bitwise) stored in mnt_namespace::mntns_flags. */
typedef __u32 __bitwise mntns_flags_t;

/* Bit 0 of mntns_flags_t. */
#define MNTNS_PROPAGATING        ((__force mntns_flags_t)(1 << 0))

/*
 * State of one mount namespace.
 *
 * The mounts that belong to the namespace live in the @mounts rbtree. The
 * leftmost and rightmost nodes of that tree are cached in @mnt_first_node
 * and @mnt_last_node, and move_from_ns() keeps both up to date when a
 * mount is removed. get_mnt_ns() takes a reference on @ns.count, and
 * is_anon_ns() treats @seq == 0 as the marker of an anonymous namespace.
 */
struct mnt_namespace {
        struct ns_common        ns;
        struct mount *        root;
        struct {
                struct rb_root        mounts;                 /* Protected by namespace_sem */
                struct rb_node        *mnt_last_node;         /* last (rightmost) mount in the rbtree */
                struct rb_node        *mnt_first_node; /* first (leftmost) mount in the rbtree */
        };
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        u64                        seq;        /* Sequence number to prevent loops */
        /* @poll and @mnt_ns_rcu share storage, so only one is valid at a time */
        union {
                wait_queue_head_t        poll;
                struct rcu_head                mnt_ns_rcu;
        };
        u64                        seq_origin; /* Sequence number of origin mount namespace */
        u64 event;
#ifdef CONFIG_FSNOTIFY
        __u32                        n_fsnotify_mask;
        /* Tested for non-NULL by mnt_notify_add() to detect watches */
        struct fsnotify_mark_connector __rcu *n_fsnotify_marks;
#endif
        unsigned int                nr_mounts; /* # of mounts in the namespace */
        unsigned int                pending_mounts;
        struct rb_node                mnt_ns_tree_node; /* node in the mnt_ns_tree */
        struct list_head        mnt_ns_list; /* entry in the sequential list of mount namespaces */
        refcount_t                passive; /* number of references not pinning @mounts */
        mntns_flags_t                mntns_flags;
} __randomize_layout;

/*
 * Per-CPU counters of a mount; struct mount points to them through
 * mnt_pcp on CONFIG_SMP builds and embeds equivalent plain ints otherwise.
 */
struct mnt_pcp {
        int mnt_count;
        int mnt_writers;
};

/*
 * Describes a dentry that is used as a mount point; struct mount refers
 * to one through mnt_mp.
 * NOTE(review): field roles below are inferred from names and types only;
 * confirm against the users in fs/namespace.c.
 */
struct mountpoint {
        struct hlist_node m_hash;       /* presumably a hash chain link */
        struct dentry *m_dentry;        /* the dentry acting as mount point */
        struct hlist_head m_list;       /* presumably mounts using this mount point */
        int m_count;                    /* presumably a usage count */
};

/*
 * Internal representation of a mount. The public struct vfsmount is
 * embedded as @mnt and real_mount() converts a vfsmount pointer back to
 * the containing struct mount.
 */
struct mount {
        struct hlist_node mnt_hash;
        struct mount *mnt_parent;       /* the mount itself if it has no parent, see mnt_has_parent() */
        struct dentry *mnt_mountpoint;
        struct vfsmount mnt;
        /* These three share storage, so only one is in use at any time */
        union {
                struct rb_node mnt_node; /* node in the ns->mounts rbtree */
                struct rcu_head mnt_rcu;
                struct llist_node mnt_llist;
        };
#ifdef CONFIG_SMP
        struct mnt_pcp __percpu *mnt_pcp;
#else
        int mnt_count;
        int mnt_writers;
#endif
        struct list_head mnt_mounts;        /* list of children, anchored here */
        struct list_head mnt_child;        /* and going through their mnt_child */
        struct list_head mnt_instance;        /* mount instance on sb->s_mounts */
        const char *mnt_devname;        /* Name of device e.g. /dev/dsk/hda1 */
        struct list_head mnt_list;      /* move_from_ns() queues the mount on its dt_list through this */
        struct list_head mnt_expire;        /* link in fs-specific expiry list */
        struct list_head mnt_share;        /* circular list of shared mounts */
        struct list_head mnt_slave_list;/* list of slave mounts */
        struct list_head mnt_slave;        /* slave list entry */
        struct mount *mnt_master;        /* slave is on master->mnt_slave_list */
        struct mnt_namespace *mnt_ns;        /* containing namespace */
        struct mountpoint *mnt_mp;        /* where is it mounted */
        union {
                struct hlist_node mnt_mp_list;        /* list mounts with the same mountpoint */
                struct hlist_node mnt_umount;
        };
        struct list_head mnt_umounting; /* list entry for umount propagation */
#ifdef CONFIG_FSNOTIFY
        struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
        __u32 mnt_fsnotify_mask;
        struct list_head to_notify;        /* need to queue notification */
        struct mnt_namespace *prev_ns;        /* previous namespace (NULL if none) */
#endif
        int mnt_id;                        /* mount identifier, reused */
        u64 mnt_id_unique;                /* mount ID unique until reboot */
        int mnt_group_id;                /* peer group identifier */
        int mnt_expiry_mark;                /* true if marked for expiry */
        struct hlist_head mnt_pins;
        struct hlist_head mnt_stuck_children;
} __randomize_layout;

/*
 * Error-pointer sentinel for mount::mnt_ns; is_mounted() rejects it via
 * IS_ERR_OR_NULL().
 */
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */

/* Map an embedded struct vfsmount back to the struct mount that contains it. */
static inline struct mount *real_mount(struct vfsmount *mnt)
{
        struct mount *outer = container_of(mnt, struct mount, mnt);

        return outer;
}

/* Non-zero unless @mnt is its own parent. */
static inline int mnt_has_parent(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;

        return parent != mnt;
}

/*
 * Non-zero when @mnt belongs to a namespace, i.e. its mnt_ns is neither
 * NULL (detached) nor an error pointer (internal).
 */
static inline int is_mounted(struct vfsmount *mnt)
{
        struct mnt_namespace *ns = real_mount(mnt)->mnt_ns;

        return !IS_ERR_OR_NULL(ns);
}

/*
 * Defined outside this header. __path_is_mountpoint() treats a NULL
 * return as "nothing mounted at this (vfsmount, dentry) pair".
 */
extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *);

/* Defined outside this header. NOTE(review): semantics not visible here. */
extern int __legitimize_mnt(struct vfsmount *, unsigned);

/*
 * True when __lookup_mnt() finds a mount for @path and that mount does not
 * have MNT_SYNC_UMOUNT set.
 */
static inline bool __path_is_mountpoint(const struct path *path)
{
        struct mount *child = __lookup_mnt(path->mnt, path->dentry);

        if (!child)
                return false;

        return likely(!(child->mnt.mnt_flags & MNT_SYNC_UMOUNT));
}

extern void __detach_mounts(struct dentry *dentry);

/* Call __detach_mounts() only for dentries that d_mountpoint() reports as mount points. */
static inline void detach_mounts(struct dentry *dentry)
{
        if (d_mountpoint(dentry))
                __detach_mounts(dentry);
}

static inline void get_mnt_ns(struct mnt_namespace *ns)
{
        refcount_inc(&ns->ns.count);
}

/* Global seqlock, defined outside this header. */
extern seqlock_t mount_lock;

/*
 * Bundles a namespace, a root path and a per-mount show callback.
 * NOTE(review): presumably the seq_file private data used together with
 * mounts_op below; confirm against the /proc mount file code.
 */
struct proc_mounts {
        struct mnt_namespace *ns;
        struct path root;
        int (*show)(struct seq_file *, struct vfsmount *);
};

/* seq_file operations, defined outside this header. */
extern const struct seq_operations mounts_op;

extern bool __is_local_mountpoint(struct dentry *dentry);

/*
 * False for anything d_mountpoint() rejects; otherwise the answer of
 * __is_local_mountpoint(), which is only consulted in that case.
 */
static inline bool is_local_mountpoint(struct dentry *dentry)
{
        return d_mountpoint(dentry) && __is_local_mountpoint(dentry);
}

/* A namespace whose sequence number is zero is anonymous. */
static inline bool is_anon_ns(struct mnt_namespace *ns)
{
        return !ns->seq;
}

/* True while @mnt's rbtree node is not marked empty, i.e. it is linked. */
static inline bool mnt_ns_attached(const struct mount *mnt)
{
        const struct rb_node *node = &mnt->mnt_node;

        return !RB_EMPTY_NODE(node);
}

/* True when the namespace's rbtree of mounts has no nodes. */
static inline bool mnt_ns_empty(const struct mnt_namespace *ns)
{
        const struct rb_root *tree = &ns->mounts;

        return RB_EMPTY_ROOT(tree);
}

/*
 * Unlink @mnt from the rbtree of its namespace and append it to @dt_list
 * through mnt->mnt_list. Warns if @mnt is not attached to a namespace
 * tree. The namespace's cached first/last node pointers are moved to the
 * neighbouring node before the erase when they refer to @mnt.
 */
static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
{
        struct mnt_namespace *ns = mnt->mnt_ns;
        struct rb_node *node = &mnt->mnt_node;

        WARN_ON(!mnt_ns_attached(mnt));

        /* Neighbours must be looked up while the node is still linked. */
        if (node == ns->mnt_last_node)
                ns->mnt_last_node = rb_prev(node);
        if (node == ns->mnt_first_node)
                ns->mnt_first_node = rb_next(node);

        rb_erase(node, &ns->mounts);
        /* Mark the node empty so mnt_ns_attached() reports false from now on. */
        RB_CLEAR_NODE(node);

        list_add_tail(&mnt->mnt_list, dt_list);
}

/* Defined outside this header. NOTE(review): semantics not visible here. */
bool has_locked_children(struct mount *mnt, struct dentry *dentry);
/*
 * Defined outside this header. NOTE(review): presumably returns the
 * neighbour of @mnt_ns in the sequential namespace list, the previous one
 * when @previous is true; confirm against the definition.
 */
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mnt_ns,
                                            bool previous);

/* Map an embedded struct ns_common back to its struct mnt_namespace. */
static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns;

        mnt_ns = container_of(ns, struct mnt_namespace, ns);
        return mnt_ns;
}

#ifdef CONFIG_FSNOTIFY
/*
 * Queue @m on notify_list if its current or its previous namespace has
 * fsnotify marks. With no marks on either, nothing is queued and prev_ns
 * is simply brought in line with the current namespace.
 */
static inline void mnt_notify_add(struct mount *m)
{
        struct mnt_namespace *cur = m->mnt_ns;
        struct mnt_namespace *prev = m->prev_ns;

        if ((cur && cur->n_fsnotify_marks) ||
            (prev && prev->n_fsnotify_marks)) {
                list_add_tail(&m->to_notify, &notify_list);
                return;
        }

        /* Fast path: no watches anywhere. */
        m->prev_ns = cur;
}
#else
/* Without CONFIG_FSNOTIFY there is nothing to notify. */
static inline void mnt_notify_add(struct mount *m)
{
}
#endif

/*
 * Defined outside this header.
 * NOTE(review): presumably resolves @dentry to the mount namespace it
 * represents; return value for other dentries is not visible here.
 */
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry);






















































































































































































































































































































































































































   34 
























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ARMv8 single-step debug support and mdscr context switching.
 *
 * Copyright (C) 2012 ARM Limited
 *
 * Author: Will Deacon <will.deacon@arm.com>
 */

#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/ptrace.h>
#include <linux/kprobes.h>
#include <linux/stat.h>
#include <linux/uaccess.h>
#include <linux/sched/task_stack.h>

#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/system_misc.h>
#include <asm/traps.h>

/*
 * Determine debug architecture: the unsigned DebugVer field extracted from
 * the sanitised ID_AA64DFR0_EL1 value.
 */
u8 debug_monitors_arch(void)
{
        unsigned int debug_ver;

        debug_ver = cpuid_feature_extract_unsigned_field(
                        read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1),
                        ID_AA64DFR0_EL1_DebugVer_SHIFT);

        return debug_ver;
}

/*
 * MDSCR access routines.
 */
/*
 * Write @mdscr to MDSCR_EL1. The DAIF state is saved before the write and
 * restored afterwards.
 */
static void mdscr_write(u32 mdscr)
{
        unsigned long daif = local_daif_save();

        write_sysreg(mdscr, mdscr_el1);
        local_daif_restore(daif);
}
NOKPROBE_SYMBOL(mdscr_write);

/* Return the current MDSCR_EL1 value as a u32. */
static u32 mdscr_read(void)
{
        u32 val = read_sysreg(mdscr_el1);

        return val;
}
NOKPROBE_SYMBOL(mdscr_read);

/*
 * Allow root to disable self-hosted debug from userspace.
 * This is useful if you want to connect an external JTAG debugger.
 *
 * enable_debug_monitors() only sets MDSCR bits while this flag is true.
 */
static bool debug_enabled = true;

/*
 * Expose debug_enabled as the debugfs boolean "debug_enabled" (mode 0644)
 * with a NULL parent. Always returns 0.
 */
static int create_debug_debugfs_entry(void)
{
        debugfs_create_bool("debug_enabled", 0644, NULL, &debug_enabled);
        return 0;
}
fs_initcall(create_debug_debugfs_entry);

/*
 * Handler for the "nodebugmon" early parameter: clears debug_enabled.
 * @buf is ignored. Always returns 0.
 */
static int __init early_debug_disable(char *buf)
{
        debug_enabled = false;
        return 0;
}

early_param("nodebugmon", early_debug_disable);

/*
 * Keep track of debug users on each core.
 * The ref counts are per-cpu ints, updated with this_cpu_*() operations
 * by enable_debug_monitors() and disable_debug_monitors().
 *
 * mde_ref_count counts every user; kde_ref_count counts only the users
 * that asked for DBG_ACTIVE_EL1.
 */
static DEFINE_PER_CPU(int, mde_ref_count);
static DEFINE_PER_CPU(int, kde_ref_count);

/*
 * Register one more debug user on this CPU.
 *
 * DBG_MDSCR_MDE is requested when the first user of any kind arrives and
 * DBG_MDSCR_KDE when the first DBG_ACTIVE_EL1 user arrives. The requested
 * bits are only written to MDSCR when debug_enabled is true; the reference
 * counts are updated either way. Warns when called preemptible.
 */
void enable_debug_monitors(enum dbg_active_el el)
{
        u32 set_bits = 0;

        WARN_ON(preemptible());

        if (this_cpu_inc_return(mde_ref_count) == 1)
                set_bits |= DBG_MDSCR_MDE;

        if (el == DBG_ACTIVE_EL1 && this_cpu_inc_return(kde_ref_count) == 1)
                set_bits |= DBG_MDSCR_KDE;

        if (!set_bits || !debug_enabled)
                return;

        mdscr_write(mdscr_read() | set_bits);
}
NOKPROBE_SYMBOL(enable_debug_monitors);

/*
 * Drop one debug user on this CPU.
 *
 * DBG_MDSCR_MDE is cleared when the last user of any kind goes away and
 * DBG_MDSCR_KDE when the last DBG_ACTIVE_EL1 user goes away. The two
 * conditions are independent: KDE is cleared even while other users keep
 * MDE set. Warns when called preemptible.
 */
void disable_debug_monitors(enum dbg_active_el el)
{
        /*
         * Collect the bits to clear rather than an AND-mask. Starting from
         * an AND-mask of 0 would make "only KDE needs clearing"
         * indistinguishable from "nothing to do".
         */
        u32 clear = 0;

        WARN_ON(preemptible());

        if (this_cpu_dec_return(mde_ref_count) == 0)
                clear |= DBG_MDSCR_MDE;

        if (el == DBG_ACTIVE_EL1 &&
            this_cpu_dec_return(kde_ref_count) == 0)
                clear |= DBG_MDSCR_KDE;

        if (clear)
                mdscr_write(mdscr_read() & ~clear);
}
NOKPROBE_SYMBOL(disable_debug_monitors);

/*
 * OS lock clearing.
 *
 * Writes 0 to OSDLR_EL1 and then to OSLAR_EL1, followed by an ISB.
 * Installed by debug_monitors_init() as the CPU hotplug startup callback.
 * @cpu is unused. Always returns 0.
 */
static int clear_os_lock(unsigned int cpu)
{
        write_sysreg(0, osdlr_el1);
        write_sysreg(0, oslar_el1);
        isb();
        return 0;
}

/*
 * Register clear_os_lock() as the startup callback of the
 * CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING hotplug state (no teardown
 * callback). Returns whatever cpuhp_setup_state() returns.
 */
static int __init debug_monitors_init(void)
{
        int ret;

        ret = cpuhp_setup_state(CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING,
                                "arm64/debug_monitors:starting",
                                clear_os_lock, NULL);
        return ret;
}
postcore_initcall(debug_monitors_init);

/*
 * Single step API and exception handling.
 */
/* Set the DBG_SPSR_SS bit in the saved pstate of @regs. */
static void set_user_regs_spsr_ss(struct user_pt_regs *regs)
{
        regs->pstate = regs->pstate | DBG_SPSR_SS;
}
NOKPROBE_SYMBOL(set_user_regs_spsr_ss);

/* Clear the DBG_SPSR_SS bit in the saved pstate of @regs. */
static void clear_user_regs_spsr_ss(struct user_pt_regs *regs)
{
        regs->pstate = regs->pstate & ~DBG_SPSR_SS;
}
NOKPROBE_SYMBOL(clear_user_regs_spsr_ss);

/* struct pt_regs flavours: forward to the user_pt_regs helpers via r->user_regs. */
#define set_regs_spsr_ss(r)        set_user_regs_spsr_ss(&(r)->user_regs)
#define clear_regs_spsr_ss(r)        clear_user_regs_spsr_ss(&(r)->user_regs)

/* Serialises additions to and removals from the debug hook lists. */
static DEFINE_SPINLOCK(debug_hook_lock);
/* Single-step hooks; call_step_hook() picks one list based on user_mode(). */
static LIST_HEAD(user_step_hook);
static LIST_HEAD(kernel_step_hook);

/* Add @node to the RCU-traversed hook list @list under debug_hook_lock. */
static void register_debug_hook(struct list_head *node, struct list_head *list)
{
        spin_lock(&debug_hook_lock);
        list_add_rcu(node, list);
        spin_unlock(&debug_hook_lock);

}

/*
 * Remove @node from its hook list under debug_hook_lock, then call
 * synchronize_rcu() before returning.
 */
static void unregister_debug_hook(struct list_head *node)
{
        spin_lock(&debug_hook_lock);
        list_del_rcu(node);
        spin_unlock(&debug_hook_lock);
        /* Runs after the lock is dropped, not while holding it. */
        synchronize_rcu();
}

/* Add @hook to the list of single-step hooks used for user mode. */
void register_user_step_hook(struct step_hook *hook)
{
        struct list_head *entry = &hook->node;

        register_debug_hook(entry, &user_step_hook);
}

/* Remove @hook from the hook list it was registered on. */
void unregister_user_step_hook(struct step_hook *hook)
{
        struct list_head *entry = &hook->node;

        unregister_debug_hook(entry);
}

/* Add @hook to the list of single-step hooks used for kernel mode. */
void register_kernel_step_hook(struct step_hook *hook)
{
        struct list_head *entry = &hook->node;

        register_debug_hook(entry, &kernel_step_hook);
}

/* Remove @hook from the hook list it was registered on. */
void unregister_kernel_step_hook(struct step_hook *hook)
{
        struct list_head *entry = &hook->node;

        unregister_debug_hook(entry);
}

/*
 * Run the registered single-step hooks for the mode @regs came from.
 *
 * A single-step exception carries no syndrome information that would
 * identify the right handler, so the hooks are tried one after another
 * until one of them reports DBG_HOOK_HANDLED.
 *
 * Returns DBG_HOOK_ERROR when no hook is registered, otherwise the result
 * of the last hook that ran.
 */
static int call_step_hook(struct pt_regs *regs, unsigned long esr)
{
        struct list_head *hooks;
        struct step_hook *hook;
        int retval = DBG_HOOK_ERROR;

        if (user_mode(regs))
                hooks = &user_step_hook;
        else
                hooks = &kernel_step_hook;

        /*
         * The single-step exception disables interrupts, so this function
         * is never preemptible and walking the RCU list is safe.
         */
        list_for_each_entry_rcu(hook, hooks, node) {
                retval = hook->fn(regs, esr);
                if (retval == DBG_HOOK_HANDLED)
                        return retval;
        }

        return retval;
}
NOKPROBE_SYMBOL(call_step_hook);

/*
 * Deliver SIGTRAP with @si_code to the current task, reporting the
 * instruction pointer of its saved user registers as the fault address.
 *
 * Warns and does nothing if the saved registers are not from user mode.
 */
static void send_user_sigtrap(int si_code)
{
        struct pt_regs *regs = current_pt_regs();

        if (WARN_ON(!user_mode(regs)))
                return;

        /* Re-enable interrupts only if the interrupted context had them on. */
        if (interrupts_enabled(regs))
                local_irq_enable();

        arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs),
                              "User debug trap");
}

/*
 * Debug fault handler for single-step exceptions (hooked up in
 * debug_traps_init()). The first argument is unused. Always returns 0.
 *
 * The exception is consumed, in this order, by: the hw_breakpoint code
 * stepping over a suspended breakpoint, a registered step hook, a SIGTRAP
 * to the user task, or a warning about an unexpected EL1 step.
 */
static int single_step_handler(unsigned long unused, unsigned long esr,
                               struct pt_regs *regs)
{
        /*
         * If we are stepping a pending breakpoint, call the hw_breakpoint
         * handler first.
         */
        if (!reinstall_suspended_bps(regs))
                return 0;

        /* A registered hook claimed the step: nothing more to do. */
        if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED)
                return 0;

        if (user_mode(regs)) {
                send_user_sigtrap(TRAP_TRACE);

                /*
                 * ptrace will disable single step unless explicitly
                 * asked to re-enable it. For other clients, it makes
                 * sense to leave it enabled (i.e. rewind the controls
                 * to the active-not-pending state).
                 */
                user_rewind_single_step(current);
        } else {
                pr_warn("Unexpected kernel single-step exception at EL1\n");
                /*
                 * Re-enable stepping since we know that we will be
                 * returning to regs.
                 */
                set_regs_spsr_ss(regs);
        }

        return 0;
}
NOKPROBE_SYMBOL(single_step_handler);

/* BRK hooks; call_break_hook() picks one list based on user_mode(). */
static LIST_HEAD(user_break_hook);
static LIST_HEAD(kernel_break_hook);

/* Add @hook to the list of BRK hooks used for user mode. */
void register_user_break_hook(struct break_hook *hook)
{
        struct list_head *entry = &hook->node;

        register_debug_hook(entry, &user_break_hook);
}

/* Remove @hook from the hook list it was registered on. */
void unregister_user_break_hook(struct break_hook *hook)
{
        struct list_head *entry = &hook->node;

        unregister_debug_hook(entry);
}

/* Add @hook to the list of BRK hooks used for kernel mode. */
void register_kernel_break_hook(struct break_hook *hook)
{
        struct list_head *entry = &hook->node;

        register_debug_hook(entry, &kernel_break_hook);
}

/* Remove @hook from the hook list it was registered on. */
void unregister_kernel_break_hook(struct break_hook *hook)
{
        struct list_head *entry = &hook->node;

        unregister_debug_hook(entry);
}

/*
 * Dispatch a BRK exception to the first registered hook whose immediate
 * matches the BRK comment field of @esr once the hook's mask bits have
 * been ignored.
 *
 * Returns that hook's result, or DBG_HOOK_ERROR when no hook matches.
 */
static int call_break_hook(struct pt_regs *regs, unsigned long esr)
{
        struct list_head *hooks;
        struct break_hook *hook;

        if (user_mode(regs))
                hooks = &user_break_hook;
        else
                hooks = &kernel_break_hook;

        /*
         * The brk exception disables interrupts, so this function is never
         * preemptible and walking the RCU list is safe.
         */
        list_for_each_entry_rcu(hook, hooks, node) {
                if ((esr_brk_comment(esr) & ~hook->mask) != hook->imm)
                        continue;

                return hook->fn(regs, esr);
        }

        return DBG_HOOK_ERROR;
}
NOKPROBE_SYMBOL(call_break_hook);

/*
 * Debug fault handler for BRK exceptions (hooked up in debug_traps_init()).
 * The first argument is unused.
 *
 * Returns 0 when a break hook handled the exception or a SIGTRAP was sent
 * to the user task, and -EFAULT for an unclaimed BRK taken from the kernel.
 */
static int brk_handler(unsigned long unused, unsigned long esr,
                       struct pt_regs *regs)
{
        if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED)
                return 0;

        if (!user_mode(regs)) {
                pr_warn("Unexpected kernel BRK exception at EL1\n");
                return -EFAULT;
        }

        send_user_sigtrap(TRAP_BRKPT);
        return 0;
}
NOKPROBE_SYMBOL(brk_handler);

/*
 * Check whether the instruction at the user PC of a compat task is one of
 * the AArch32 breakpoint encodings and, if so, send SIGTRAP/TRAP_BRKPT.
 *
 * Returns 0 when the signal was sent, -EFAULT when @regs is not from
 * compat user mode or the instruction is not a recognised breakpoint.
 */
int aarch32_break_handler(struct pt_regs *regs)
{
        u32 arm_instr;
        u16 thumb_instr;
        bool bp = false;
        void __user *pc = (void __user *)instruction_pointer(regs);

        if (!compat_user_mode(regs))
                return -EFAULT;

        /*
         * NOTE(review): the get_user() results below are not checked. This
         * presumably relies on get_user() zeroing the destination on a
         * fault, so that the comparisons fail; confirm.
         */
        if (compat_thumb_mode(regs)) {
                /* get 16-bit Thumb instruction */
                __le16 instr;
                get_user(instr, (__le16 __user *)pc);
                thumb_instr = le16_to_cpu(instr);
                if (thumb_instr == AARCH32_BREAK_THUMB2_LO) {
                        /* get second half of 32-bit Thumb-2 instruction */
                        get_user(instr, (__le16 __user *)(pc + 2));
                        thumb_instr = le16_to_cpu(instr);
                        bp = thumb_instr == AARCH32_BREAK_THUMB2_HI;
                } else {
                        bp = thumb_instr == AARCH32_BREAK_THUMB;
                }
        } else {
                /* 32-bit ARM instruction */
                __le32 instr;
                get_user(instr, (__le32 __user *)pc);
                arm_instr = le32_to_cpu(instr);
                /* The top four bits are ignored for the comparison. */
                bp = (arm_instr & ~0xf0000000) == AARCH32_BREAK_ARM;
        }

        if (!bp)
                return -EFAULT;

        send_user_sigtrap(TRAP_BRKPT);
        return 0;
}
NOKPROBE_SYMBOL(aarch32_break_handler);

/*
 * Install the debug fault handlers: single_step_handler() for
 * DBG_ESR_EVT_HWSS and brk_handler() for DBG_ESR_EVT_BRK, both registered
 * with SIGTRAP and the matching TRAP_* code.
 */
void __init debug_traps_init(void)
{
        hook_debug_fault_code(DBG_ESR_EVT_HWSS, single_step_handler, SIGTRAP,
                              TRAP_TRACE, "single-step handler");
        hook_debug_fault_code(DBG_ESR_EVT_BRK, brk_handler, SIGTRAP,
                              TRAP_BRKPT, "BRK handler");
}

/*
 * Re-enable single step for syscall restarting.
 *
 * Only acts on tasks that have TIF_SINGLESTEP set: for those, SPSR.SS is
 * set to 1 so that the task does not return to the active-pending state.
 */
void user_rewind_single_step(struct task_struct *task)
{
        if (!test_tsk_thread_flag(task, TIF_SINGLESTEP))
                return;

        set_regs_spsr_ss(task_pt_regs(task));
}
NOKPROBE_SYMBOL(user_rewind_single_step);

/* Clear SPSR.SS in @task's saved registers if TIF_SINGLESTEP is set. */
void user_fastforward_single_step(struct task_struct *task)
{
        if (!test_tsk_thread_flag(task, TIF_SINGLESTEP))
                return;

        clear_regs_spsr_ss(task_pt_regs(task));
}

/*
 * Make the SS bit in @regs mirror @task's TIF_SINGLESTEP flag: set when
 * the flag is set, cleared otherwise.
 */
void user_regs_reset_single_step(struct user_pt_regs *regs,
                                 struct task_struct *task)
{
        if (!test_tsk_thread_flag(task, TIF_SINGLESTEP)) {
                clear_user_regs_spsr_ss(regs);
                return;
        }

        set_user_regs_spsr_ss(regs);
}

/* Kernel API */
/*
 * Start single-stepping the kernel context described by @regs: set
 * SPSR.SS in @regs, set DBG_MDSCR_SS in MDSCR and take an EL1 debug
 * monitor reference. Warns if interrupts are not disabled.
 */
void kernel_enable_single_step(struct pt_regs *regs)
{
        WARN_ON(!irqs_disabled());
        set_regs_spsr_ss(regs);
        mdscr_write(mdscr_read() | DBG_MDSCR_SS);
        enable_debug_monitors(DBG_ACTIVE_EL1);
}
NOKPROBE_SYMBOL(kernel_enable_single_step);

/*
 * Stop kernel single-stepping: clear DBG_MDSCR_SS in MDSCR and drop the
 * EL1 debug monitor reference. Warns if interrupts are not disabled.
 */
void kernel_disable_single_step(void)
{
        WARN_ON(!irqs_disabled());
        mdscr_write(mdscr_read() & ~DBG_MDSCR_SS);
        disable_debug_monitors(DBG_ACTIVE_EL1);
}
NOKPROBE_SYMBOL(kernel_disable_single_step);

/*
 * Non-zero while DBG_MDSCR_SS is set in MDSCR. The value returned is the
 * masked bit itself, not a normalised 0/1. Warns if interrupts are not
 * disabled.
 */
int kernel_active_single_step(void)
{
        int ss_bit;

        WARN_ON(!irqs_disabled());

        ss_bit = mdscr_read() & DBG_MDSCR_SS;
        return ss_bit;
}
NOKPROBE_SYMBOL(kernel_active_single_step);

void kernel_rewind_single_step(struct pt_regs *regs)
{
        set_regs_spsr_ss(regs);
}

void kernel_fastforward_single_step(struct pt_regs *regs)
{
        clear_regs_spsr_ss(regs);
}

/* ptrace API */
/*
 * Mark @task as single-stepping. SPSR.SS in its saved registers is only
 * set when TIF_SINGLESTEP was not already set.
 */
void user_enable_single_step(struct task_struct *task)
{
        struct thread_info *ti = task_thread_info(task);

        if (test_and_set_ti_thread_flag(ti, TIF_SINGLESTEP))
                return;

        set_regs_spsr_ss(task_pt_regs(task));
}
NOKPROBE_SYMBOL(user_enable_single_step);

/* Clear TIF_SINGLESTEP on @task; the saved registers are left untouched. */
void user_disable_single_step(struct task_struct *task)
{
        struct thread_info *ti = task_thread_info(task);

        clear_ti_thread_flag(ti, TIF_SINGLESTEP);
}
NOKPROBE_SYMBOL(user_disable_single_step);












































































































































  283 


























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corporation, 2021
 *
 * Author: Mike Rapoport <rppt@linux.ibm.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/swap.h>
#include <linux/mount.h>
#include <linux/memfd.h>
#include <linux/bitops.h>
#include <linux/printk.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/pseudo_fs.h>
#include <linux/secretmem.h>
#include <linux/set_memory.h>
#include <linux/sched/signal.h>

#include <uapi/linux/magic.h>

#include <asm/tlbflush.h>

#include "internal.h"

#undef pr_fmt
#define pr_fmt(fmt) "secretmem: " fmt

/*
 * Mode and flag masks used to validate the memfd_secret() system call
 * argument. No mode bits are currently defined (the mode mask is 0), so
 * the flags mask is empty as well; memfd_secret() rejects any bit that is
 * neither in SECRETMEM_FLAGS_MASK nor O_CLOEXEC.
 */
#define SECRETMEM_MODE_MASK        (0x0)
#define SECRETMEM_FLAGS_MASK        SECRETMEM_MODE_MASK

/*
 * Global on/off switch for secretmem, enabled by default and read-only
 * after init. It is exposed as the "enable" parameter (mode 0400), so the
 * parameter description must be keyed on that same name, not on the name
 * of the backing variable.
 */
static bool secretmem_enable __ro_after_init = 1;
module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(enable,
                 "Enable secretmem and memfd_secret(2) system call");

/*
 * Count of live secretmem files: incremented in memfd_secret() when a file
 * is installed and decremented in secretmem_release().
 */
static atomic_t secretmem_users;

/* Report whether at least one secretmem file is currently accounted for. */
bool secretmem_active(void)
{
        return atomic_read(&secretmem_users) != 0;
}

/*
 * Page-fault handler for secretmem mappings.
 *
 * Faults beyond the inode size are rejected with -EINVAL. Otherwise the
 * page for vmf->pgoff is looked up in the file's page cache; if it is not
 * present, a zeroed order-0 folio is allocated, removed from the kernel
 * direct map, inserted into the page cache, and the kernel TLB range that
 * covered it is flushed.
 *
 * The whole lookup/insert sequence runs under the shared invalidate lock
 * of the mapping.
 *
 * Return: VM_FAULT_LOCKED with vmf->page set on success, VM_FAULT_OOM if
 * the folio allocation fails, or the vmf_error() translation of any other
 * failure.
 */
static vm_fault_t secretmem_fault(struct vm_fault *vmf)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct inode *inode = file_inode(vmf->vma->vm_file);
        pgoff_t offset = vmf->pgoff;
        gfp_t gfp = vmf->gfp_mask;
        unsigned long addr;
        struct page *page;
        struct folio *folio;
        vm_fault_t ret;
        int err;

        if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
                return vmf_error(-EINVAL);

        filemap_invalidate_lock_shared(mapping);

retry:
        page = find_lock_page(mapping, offset);
        if (!page) {
                folio = folio_alloc(gfp | __GFP_ZERO, 0);
                if (!folio) {
                        ret = VM_FAULT_OOM;
                        goto out;
                }

                page = &folio->page;
                err = set_direct_map_invalid_noflush(page);
                if (err) {
                        folio_put(folio);
                        ret = vmf_error(err);
                        goto out;
                }

                __folio_mark_uptodate(folio);
                err = filemap_add_folio(mapping, folio, offset, gfp);
                if (unlikely(err)) {
                        /*
                         * Restore the direct map entry BEFORE dropping our
                         * reference. Once folio_put() frees the folio the
                         * page may be handed out again immediately, and a
                         * new owner accessing it through a direct map that
                         * is still invalid would take a kernel fault.
                         *
                         * If a split of large page was required, it
                         * already happened when we marked the page invalid
                         * which guarantees that this call won't fail
                         */
                        set_direct_map_default_noflush(page);
                        folio_put(folio);

                        /* Somebody else inserted a page first: use theirs. */
                        if (err == -EEXIST)
                                goto retry;

                        ret = vmf_error(err);
                        goto out;
                }

                /* Drop stale kernel TLB entries for the now-unmapped page. */
                addr = (unsigned long)page_address(page);
                flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
        }

        vmf->page = page;
        ret = VM_FAULT_LOCKED;

out:
        filemap_invalidate_unlock_shared(mapping);
        return ret;
}

/*
 * VMA operations installed by secretmem_mmap(); the address of this table
 * is also what vma_is_secretmem() compares against.
 */
static const struct vm_operations_struct secretmem_vm_ops = {
        .fault = secretmem_fault,
};

/*
 * ->release() for secretmem files: drops the secretmem_users count that
 * memfd_secret() took when it installed the file. Always returns 0.
 */
static int secretmem_release(struct inode *inode, struct file *file)
{
        atomic_dec(&secretmem_users);
        return 0;
}

/*
 * ->mmap() for secretmem files.
 *
 * Only shared mappings are accepted (-EINVAL otherwise). The mapping is
 * always treated as locked, so it must pass the mlock_future_ok() check
 * for its full length (-EAGAIN otherwise). On success the VMA is marked
 * VM_LOCKED | VM_DONTDUMP and wired to secretmem_vm_ops.
 */
static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (!(vma->vm_flags & (VM_SHARED | VM_MAYSHARE)))
                return -EINVAL;

        if (!mlock_future_ok(vma->vm_mm, vma->vm_flags | VM_LOCKED,
                             vma->vm_end - vma->vm_start))
                return -EAGAIN;

        vm_flags_set(vma, VM_LOCKED | VM_DONTDUMP);
        vma->vm_ops = &secretmem_vm_ops;

        return 0;
}

/*
 * A VMA is a secretmem mapping exactly when its vm_ops point at
 * secretmem_vm_ops, which secretmem_mmap() installs.
 */
bool vma_is_secretmem(struct vm_area_struct *vma)
{
        return vma->vm_ops == &secretmem_vm_ops;
}

/*
 * File operations for secretmem files; only release and mmap are
 * provided here.
 */
static const struct file_operations secretmem_fops = {
        .release        = secretmem_release,
        .mmap                = secretmem_mmap,
};

/* ->migrate_folio(): migration is always refused with -EBUSY. */
static int secretmem_migrate_folio(struct address_space *mapping,
                struct folio *dst, struct folio *src, enum migrate_mode mode)
{
        return -EBUSY;
}

/*
 * ->free_folio(): undo what secretmem_fault() did to the page. The direct
 * map entry is restored first and then the whole folio is zeroed.
 * NOTE(review): the zeroing presumably relies on the direct map entry
 * being valid again, so the order of the two calls matters - confirm
 * before reordering.
 */
static void secretmem_free_folio(struct folio *folio)
{
        set_direct_map_default_noflush(&folio->page);
        folio_zero_segment(folio, 0, folio_size(folio));
}

/*
 * Address-space operations for secretmem inodes; installed on the inode's
 * mapping by secretmem_file_create().
 */
const struct address_space_operations secretmem_aops = {
        .dirty_folio        = noop_dirty_folio,
        .free_folio        = secretmem_free_folio,
        .migrate_folio        = secretmem_migrate_folio,
};

/*
 * ->setattr() for secretmem inodes.
 *
 * A size change is only permitted while the inode size is still zero;
 * once a non-zero size has been set, further ATTR_SIZE requests fail with
 * -EINVAL. Everything else is passed to simple_setattr(). The check and
 * the update are done under the mapping's invalidate lock.
 */
static int secretmem_setattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        struct address_space *mapping = inode->i_mapping;
        unsigned int requested = iattr->ia_valid;
        int ret;

        filemap_invalidate_lock(mapping);

        if (!(requested & ATTR_SIZE) || !inode->i_size)
                ret = simple_setattr(idmap, dentry, iattr);
        else
                ret = -EINVAL;

        filemap_invalidate_unlock(mapping);

        return ret;
}

/* Inode operations for secretmem inodes; only setattr is overridden. */
static const struct inode_operations secretmem_iops = {
        .setattr = secretmem_setattr,
};

/*
 * Internal mount of the secretmem pseudo filesystem. Assigned by
 * secretmem_init() and used by secretmem_file_create() for inode and file
 * allocation.
 */
static struct vfsmount *secretmem_mnt;

static struct file *secretmem_file_create(unsigned long flags)
{
        struct file *file;
        struct inode *inode;
        const char *anon_name = "[secretmem]";
        int err;

        inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        err = security_inode_init_security_anon(inode, &QSTR(anon_name), NULL);
        if (err) {
                file = ERR_PTR(err);
                goto err_free_inode;
        }

        file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
                                 O_RDWR, &secretmem_fops);
        if (IS_ERR(file))
                goto err_free_inode;

        mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
        mapping_set_unevictable(inode->i_mapping);

        inode->i_op = &secretmem_iops;
        inode->i_mapping->a_ops = &secretmem_aops;

        /* pretend we are a normal file with zero size */
        inode->i_mode |= S_IFREG;
        inode->i_size = 0;

        return file;

err_free_inode:
        iput(inode);
        return file;
}

/*
 * memfd_secret(2): create a secretmem file and return a descriptor for it.
 *
 * Fails with -ENOSYS when secretmem is disabled or the direct map cannot
 * be modified, -EINVAL for unknown flag bits (only O_CLOEXEC is accepted
 * on top of SECRETMEM_FLAGS_MASK), and -ENFILE if the user counter has
 * gone negative. Otherwise any error from descriptor or file allocation is
 * returned as is.
 */
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
        struct file *file;
        int fd;

        /* make sure local flags do not conflict with global fcntl.h */
        BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);

        if (!secretmem_enable || !can_set_direct_map())
                return -ENOSYS;

        if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
                return -EINVAL;
        if (atomic_read(&secretmem_users) < 0)
                return -ENFILE;

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        file = secretmem_file_create(flags);
        if (IS_ERR(file)) {
                /* Give the reserved descriptor back before bailing out. */
                put_unused_fd(fd);
                return PTR_ERR(file);
        }

        file->f_flags |= O_LARGEFILE;

        /* Account the file before it becomes visible through the fd. */
        atomic_inc(&secretmem_users);
        fd_install(fd, file);
        return fd;
}

/*
 * ->init_fs_context() for the secretmem pseudo filesystem: sets up a
 * pseudo fs context tagged with SECRETMEM_MAGIC. A NULL result from
 * init_pseudo() is reported as -ENOMEM.
 */
static int secretmem_init_fs_context(struct fs_context *fc)
{
        return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
}

/* Filesystem type mounted internally by secretmem_init(). */
static struct file_system_type secretmem_fs = {
        .name                = "secretmem",
        .init_fs_context = secretmem_init_fs_context,
        .kill_sb        = kill_anon_super,
};

/*
 * Boot-time setup: mount the secretmem pseudo filesystem and forbid
 * executable mappings on it. Nothing is done (and 0 is returned) when
 * secretmem is disabled or the direct map cannot be modified.
 */
static int __init secretmem_init(void)
{
        if (!secretmem_enable)
                return 0;
        if (!can_set_direct_map())
                return 0;

        secretmem_mnt = kern_mount(&secretmem_fs);
        if (IS_ERR(secretmem_mnt))
                return PTR_ERR(secretmem_mnt);

        /* prevent secretmem mappings from ever getting PROT_EXEC */
        secretmem_mnt->mnt_flags |= MNT_NOEXEC;

        return 0;
}
fs_initcall(secretmem_init);







































   10 
   10 




   10 












   20 

















   20 

















   20 




































    1 



    1 


    1 







    1 

















    1 





    1 
















    2 



    2 







    1 




    1 

































































































































































































































  156 

















  157 












  122 





  156 
























  157 














  157 







































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/handle_exit.c:
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/ubsan.h>

#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/debug-monitors.h>
#include <asm/stacktrace/nvhe.h>
#include <asm/traps.h>

#include <kvm/arm_hypercalls.h>

#define CREATE_TRACE_POINTS
#include "trace_handle_exit.h"

/*
 * Signature shared by all per-exception-class exit handlers stored in
 * arm_exit_handlers[]; the return value is passed back unchanged by
 * handle_trap_exceptions().
 */
typedef int (*exit_handle_fn)(struct kvm_vcpu *);

/*
 * Decide what to do with an SError taken while running the guest: a RAS
 * SError that is not fatal is dropped, anything else results in a virtual
 * abort being injected into the vcpu.
 */
static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u64 esr)
{
        if (arm64_is_ras_serror(esr) && !arm64_is_fatal_ras_serror(NULL, esr))
                return;

        kvm_inject_vabt(vcpu);
}

/*
 * Handle a trapped HVC: trace it, bump the exit statistic, then either
 * hand it to the SMCCC handler (no nested virt) or deal with it on behalf
 * of the virtual EL2.
 */
static int handle_hvc(struct kvm_vcpu *vcpu)
{
        trace_kvm_hvc_arm64(*vcpu_pc(vcpu), vcpu_get_reg(vcpu, 0),
                            kvm_vcpu_hvc_get_imm(vcpu));
        vcpu->stat.hvc_exit_stat++;

        if (!vcpu_has_nv(vcpu))
                return kvm_smccc_call_handler(vcpu);

        /*
         * The guest has EL2: forward the hvc to the virtual EL2, unless
         * the guest's HCR_EL2.HCD is set, in which case it gets an UNDEF.
         */
        if (vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_HCD)
                kvm_inject_undefined(vcpu);
        else
                kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

        return 1;
}

/*
 * Handle a trapped SMC. Unless the trap is forwarded to the virtual EL2,
 * the PC is advanced and the call is either rejected (non-zero immediate)
 * or passed to the SMCCC handler.
 */
static int handle_smc(struct kvm_vcpu *vcpu)
{
        /*
         * The guest asked for this trapped smc instruction to be
         * forwarded to its virtual EL2: nothing else to do here.
         */
        if (forward_smc_trap(vcpu))
                return 1;

        /*
         * "If an SMC instruction executed at Non-secure EL1 is
         * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a
         * Trap exception, not a Secure Monitor Call exception [...]"
         *
         * The PC therefore has to be advanced by hand, or the guest would
         * return to the same instruction. Doing so before a potential
         * exit to userspace also keeps SMCs and HVCs looking the same.
         */
        kvm_incr_pc(vcpu);

        /*
         * A zero immediate most likely means an SMCCC call.
         *
         * Note that on ARMv8.3, even if EL3 is not implemented, SMC executed
         * at Non-secure EL1 is trapped to EL2 if HCR_EL2.TSC==1, rather than
         * being treated as UNDEFINED.
         */
        if (!kvm_vcpu_hvc_get_imm(vcpu))
                return kvm_smccc_call_handler(vcpu);

        /*
         * SMCs with a nonzero immediate are reserved according to DEN0028E 2.9
         * "SMC and HVC immediate value".
         */
        vcpu_set_reg(vcpu, 0, ~0UL);
        return 1;
}

/*
 * Reached either because the system does not support FP/ASIMD, or because
 * nested virtualization is in use and the guest hypervisor traps FP/ASIMD
 * accesses made by its own guest.
 *
 * All other handling of guest vs. host FP/ASIMD register state is done in
 * fixup_guest_exit().
 */
static int kvm_handle_fpasimd(struct kvm_vcpu *vcpu)
{
        if (!guest_hyp_fpsimd_traps_enabled(vcpu)) {
                /* This is the case when the system doesn't support FP/ASIMD. */
                kvm_inject_undefined(vcpu);
                return 1;
        }

        return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
}

/**
 * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event
 *                    instruction executed by a guest
 *
 * @vcpu:        the vcpu pointer
 *
 * WFE[T]: Yield the CPU and come back to this vcpu when the scheduler
 * decides to.
 * WFI: Simply call kvm_vcpu_wfi(), which blocks the vcpu until there is
 * something for it to do.
 * WFIT: Same as WFI, but the vcpu is additionally flagged IN_WFIT before
 * blocking.
 *
 * WF{I,E}T can immediately return if the deadline has already expired.
 * If the guest hypervisor traps WFx, the exception is forwarded to it
 * instead and none of the above applies.
 */
static int kvm_handle_wfx(struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);
        bool is_wfe = !!(esr & ESR_ELx_WFx_ISS_WFE);
        bool timed = !!(esr & ESR_ELx_WFx_ISS_WFxT);

        if (guest_hyp_wfx_traps_enabled(vcpu))
                return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

        trace_kvm_wfx_arm64(*vcpu_pc(vcpu), is_wfe);
        if (is_wfe)
                vcpu->stat.wfe_exit_stat++;
        else
                vcpu->stat.wfi_exit_stat++;

        if (timed && !(esr & ESR_ELx_WFx_ISS_RV)) {
                /* Treat WFxT as WFx if RN is invalid */
                timed = false;
        } else if (timed) {
                u64 deadline, now;

                now = kvm_arm_timer_get_reg(vcpu, KVM_REG_ARM_TIMER_CNT);
                deadline = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));

                /* Deadline already reached: only skip the instruction. */
                if (now >= deadline)
                        goto skip_insn;
        }

        if (is_wfe) {
                kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu));
        } else {
                if (timed)
                        vcpu_set_flag(vcpu, IN_WFIT);

                kvm_vcpu_wfi(vcpu);
        }

skip_insn:
        kvm_incr_pc(vcpu);

        return 1;
}

/**
 * kvm_handle_guest_debug - handle a debug exception instruction
 *
 * @vcpu:        the vcpu pointer
 *
 * All debug exceptions go through this one handler. When the host is not
 * debugging the guest and the exception can be forwarded (nested case),
 * it is forwarded and the guest resumes. Otherwise the exit is reported
 * to userspace as KVM_EXIT_DEBUG together with the full syndrome; if both
 * the guest and host are using the same debug facilities it will be up to
 * userspace to re-inject the correct exception for guest delivery.
 *
 * @return: 1 if the exception was forwarded, otherwise 0 (while setting
 * vcpu->run->exit_reason)
 */
static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);
        struct kvm_run *run = vcpu->run;
        u64 ec;

        if (!vcpu->guest_debug && forward_debug_exception(vcpu))
                return 1;

        run->exit_reason = KVM_EXIT_DEBUG;
        run->debug.arch.hsr = lower_32_bits(esr);
        run->debug.arch.hsr_high = upper_32_bits(esr);
        run->flags = KVM_DEBUG_ARCH_HSR_HIGH_VALID;

        ec = ESR_ELx_EC(esr);
        if (ec == ESR_ELx_EC_WATCHPT_LOW) {
                /* Watchpoints also report the faulting address. */
                run->debug.arch.far = vcpu->arch.fault.far_el2;
        } else if (ec == ESR_ELx_EC_SOFTSTP_LOW) {
                *vcpu_cpsr(vcpu) |= DBG_SPSR_SS;
        }

        return 0;
}

/*
 * Default handler for exception classes without a dedicated entry in
 * arm_exit_handlers[]: log the syndrome and its class string, inject an
 * UNDEF into the guest, and resume it (returns 1).
 */
static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);

        kvm_pr_unimpl("Unknown exception class: esr: %#016llx -- %s\n",
                      esr, esr_get_class_string(esr));

        kvm_inject_undefined(vcpu);
        return 1;
}

/*
 * Guest access to SVE registers should be routed to this handler only
 * when the system doesn't support SVE, or when the guest hypervisor has
 * SVE traps enabled, in which case the exception is forwarded to it.
 */
static int handle_sve(struct kvm_vcpu *vcpu)
{
        if (!guest_hyp_sve_traps_enabled(vcpu)) {
                kvm_inject_undefined(vcpu);
                return 1;
        }

        return kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
}

/*
 * A trapping ptrauth instruction can legitimately get here in two ways:
 *
 * - The guest used a ptrauth instruction (which the guest EL1 did not
 *   turn into a NOP) although ptrauth was not enabled for it. It gets an
 *   UNDEF, as it isn't supposed to use ptrauth without being told it
 *   could.
 *
 * - An L2 NV guest is running while L1 has left HCR_EL2.API==0; the
 *   exception is reinjected into L1.
 *
 * Anything else is an emulation bug (hence the WARN_ON + UNDEF).
 */
static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu)
{
        if (vcpu_has_ptrauth(vcpu)) {
                if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
                        kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
                        return 1;
                }

                /* Really shouldn't be here! */
                WARN_ON_ONCE(1);
        }

        kvm_inject_undefined(vcpu);
        return 1;
}

static int kvm_handle_eret(struct kvm_vcpu *vcpu)
{
        if (esr_iss_is_eretax(kvm_vcpu_get_esr(vcpu)) &&
            !vcpu_has_ptrauth(vcpu))
                return kvm_handle_ptrauth(vcpu);

        /*
         * If we got here, two possibilities:
         *
         * - the guest is in EL2, and we need to fully emulate ERET
         *
         * - the guest is in EL1, and we need to reinject the
         *   exception into the L1 hypervisor.
         *
         * If KVM ever traps ERET for its own use, we'll have to
         * revisit this.
         */
        if (is_hyp_ctxt(vcpu))
                kvm_emulate_nested_eret(vcpu);
        else
                kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));

        return 1;
}

/*
 * Handle a trapped SVC by reinjecting it as a nested synchronous
 * exception; the guest is always resumed (returns 1).
 */
static int handle_svc(struct kvm_vcpu *vcpu)
{
        /*
         * So far, SVC traps only for NV via HFGITR_EL2. A SVC from a
         * 32bit guest would be caught by vcpu_mode_is_bad_32bit(), so
         * we should only have to deal with a 64 bit exception.
         */
        kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
        return 1;
}

/*
 * GCS exceptions are not expected: the guest always gets an UNDEF, and a
 * one-time warning is emitted if the VM actually advertises GCS.
 */
static int kvm_handle_gcs(struct kvm_vcpu *vcpu)
{
        /* We don't expect GCS, so treat it with contempt */
        WARN_ON_ONCE(kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, GCS, IMP));

        kvm_inject_undefined(vcpu);
        return 1;
}

/*
 * Handle exceptions reported with EC "other", keyed on the ISS value.
 *
 * For each known ISS, "allowed" records whether the VM has the matching
 * feature and "fwd" whether an L2 guest's access must be forwarded to its
 * (L1) hypervisor. The exception is reinjected as a nested sync exception
 * only when both are true; in every other case the guest gets an UNDEF.
 * Always returns 1 (resume the guest).
 */
static int handle_other(struct kvm_vcpu *vcpu)
{
        /* True when running the guest of a nested hypervisor. */
        bool is_l2 = vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu);
        u64 hcrx = __vcpu_sys_reg(vcpu, HCRX_EL2);
        u64 esr = kvm_vcpu_get_esr(vcpu);
        u64 iss = ESR_ELx_ISS(esr);
        struct kvm *kvm = vcpu->kvm;
        bool allowed, fwd = false;

        /*
         * We only trap for two reasons:
         *
         * - the feature is disabled, and the only outcome is to
         *   generate an UNDEF.
         *
         * - the feature is enabled, but a NV guest wants to trap the
         *   feature used by its L2 guest. We forward the exception in
         *   this case.
         *
         * What we don't expect is to end up here if the guest is
         * expected to be able to directly use the feature, hence the
         * WARN_ON below.
         */
        switch (iss) {
        case ESR_ELx_ISS_OTHER_ST64BV:
                allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_V);
                if (is_l2)
                        fwd = !(hcrx & HCRX_EL2_EnASR);
                break;
        case ESR_ELx_ISS_OTHER_ST64BV0:
                allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA);
                if (is_l2)
                        fwd = !(hcrx & HCRX_EL2_EnAS0);
                break;
        case ESR_ELx_ISS_OTHER_LDST64B:
                allowed = kvm_has_feat(kvm, ID_AA64ISAR1_EL1, LS64, LS64);
                if (is_l2)
                        fwd = !(hcrx & HCRX_EL2_EnALS);
                break;
        case ESR_ELx_ISS_OTHER_TSBCSYNC:
                allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1);
                if (is_l2)
                        fwd = (__vcpu_sys_reg(vcpu, HFGITR2_EL2) & HFGITR2_EL2_TSBCSYNC);
                break;
        case ESR_ELx_ISS_OTHER_PSBCSYNC:
                allowed = kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P5);
                if (is_l2)
                        fwd = (__vcpu_sys_reg(vcpu, HFGITR_EL2) & HFGITR_EL2_PSBCSYNC);
                break;
        default:
                /* Clearly, we're missing something. */
                WARN_ON_ONCE(1);
                allowed = false;
        }

        /* Feature usable by the guest yet nothing to forward: unexpected. */
        WARN_ON_ONCE(allowed && !fwd);

        if (allowed && fwd)
                kvm_inject_nested_sync(vcpu, esr);
        else
                kvm_inject_undefined(vcpu);

        return 1;
}

/*
 * Exit handler dispatch table, indexed by ESR exception class (see
 * kvm_get_exit_handler()). The leading range initializer makes
 * kvm_handle_unknown_ec the default for every class; the entries that
 * follow override it, so the range entry must stay first.
 */
static exit_handle_fn arm_exit_handlers[] = {
        [0 ... ESR_ELx_EC_MAX]        = kvm_handle_unknown_ec,
        [ESR_ELx_EC_WFx]        = kvm_handle_wfx,
        [ESR_ELx_EC_CP15_32]        = kvm_handle_cp15_32,
        [ESR_ELx_EC_CP15_64]        = kvm_handle_cp15_64,
        [ESR_ELx_EC_CP14_MR]        = kvm_handle_cp14_32,
        [ESR_ELx_EC_CP14_LS]        = kvm_handle_cp14_load_store,
        [ESR_ELx_EC_CP10_ID]        = kvm_handle_cp10_id,
        [ESR_ELx_EC_CP14_64]        = kvm_handle_cp14_64,
        [ESR_ELx_EC_OTHER]        = handle_other,
        [ESR_ELx_EC_HVC32]        = handle_hvc,
        [ESR_ELx_EC_SMC32]        = handle_smc,
        [ESR_ELx_EC_HVC64]        = handle_hvc,
        [ESR_ELx_EC_SMC64]        = handle_smc,
        [ESR_ELx_EC_SVC64]        = handle_svc,
        [ESR_ELx_EC_SYS64]        = kvm_handle_sys_reg,
        [ESR_ELx_EC_SVE]        = handle_sve,
        [ESR_ELx_EC_ERET]        = kvm_handle_eret,
        [ESR_ELx_EC_IABT_LOW]        = kvm_handle_guest_abort,
        [ESR_ELx_EC_DABT_LOW]        = kvm_handle_guest_abort,
        [ESR_ELx_EC_DABT_CUR]        = kvm_handle_vncr_abort,
        [ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
        [ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
        [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
        [ESR_ELx_EC_BKPT32]        = kvm_handle_guest_debug,
        [ESR_ELx_EC_BRK64]        = kvm_handle_guest_debug,
        [ESR_ELx_EC_FP_ASIMD]        = kvm_handle_fpasimd,
        [ESR_ELx_EC_PAC]        = kvm_handle_ptrauth,
        [ESR_ELx_EC_GCS]        = kvm_handle_gcs,
};

/*
 * Look up the exit handler for the vcpu's current exception syndrome,
 * using the ESR exception class as index into arm_exit_handlers[].
 */
static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
{
        const u8 ec = ESR_ELx_EC(kvm_vcpu_get_esr(vcpu));

        return arm_exit_handlers[ec];
}

/*
 * We may be single-stepping an emulated instruction. If the emulation
 * has been completed in the kernel, we can return to userspace with a
 * KVM_EXIT_DEBUG, otherwise userspace needs to complete its
 * emulation first.
 */
static int handle_trap_exceptions(struct kvm_vcpu *vcpu)
{
        int handled;

        /*
         * See ARM ARM B1.14.1: "Hyp traps on instructions
         * that fail their condition code check"
         */
        if (!kvm_condition_valid(vcpu)) {
                kvm_incr_pc(vcpu);
                handled = 1;
        } else {
                exit_handle_fn exit_handler;

                exit_handler = kvm_get_exit_handler(vcpu);
                handled = exit_handler(vcpu);
        }

        return handled;
}

/*
 * Top-level exit dispatcher.
 *
 * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on
 * proper exit to userspace.
 */
int handle_exit(struct kvm_vcpu *vcpu, int exception_index)
{
        struct kvm_run *run = vcpu->run;

        /*
         * The SError is handled by handle_exit_early(). If the guest
         * survives it will re-execute the original instruction.
         */
        if (ARM_SERROR_PENDING(exception_index))
                return 1;

        exception_index = ARM_EXCEPTION_CODE(exception_index);

        switch (exception_index) {
        case ARM_EXCEPTION_IRQ:
        case ARM_EXCEPTION_EL1_SERROR:
                /* Nothing left to do here: go straight back to the guest. */
                return 1;
        case ARM_EXCEPTION_TRAP:
                return handle_trap_exceptions(vcpu);
        case ARM_EXCEPTION_HYP_GONE:
                /*
                 * EL2 has been reset to the hyp-stub. This happens when a guest
                 * is pre-emptied by kvm_reboot()'s shutdown call.
                 */
                run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                return 0;
        case ARM_EXCEPTION_IL:
                /*
                 * We attempted an illegal exception return.  Guest state must
                 * have been corrupted somehow.  Give up.
                 */
                run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                return -EINVAL;
        default:
                break;
        }

        kvm_pr_unimpl("Unsupported exception type: %d",
                      exception_index);
        run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
        return 0;
}

/*
 * For exit types that need handling before we can be preempted.
 *
 * Only SErrors are dealt with here: either one that is pending alongside
 * the exit, or an exit whose exception code is itself an EL1 SError.
 */
void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index)
{
        if (!ARM_SERROR_PENDING(exception_index)) {
                if (ARM_EXCEPTION_CODE(exception_index) ==
                    ARM_EXCEPTION_EL1_SERROR)
                        kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu));
                return;
        }

        /* Without the RAS extension there is no syndrome to inspect. */
        if (!this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) {
                kvm_inject_vabt(vcpu);
                return;
        }

        kvm_handle_guest_serror(vcpu, disr_to_esr(kvm_vcpu_get_disr(vcpu)));
}

/*
 * Log an nVHE hyp failure of kind @name at @panic_addr. The address is
 * printed raw and, with kaslr_offset() added back, through %pB.
 */
static void print_nvhe_hyp_panic(const char *name, u64 panic_addr)
{
        kvm_err("nVHE hyp %s at: [<%016llx>] %pB!\n", name, panic_addr,
                (void *)(panic_addr + kaslr_offset()));
}

/*
 * Report a CFI failure in the nVHE hypervisor, and point out that
 * CONFIG_CFI_PERMISSIVE has no effect for hyp failures when it is set.
 */
static void kvm_nvhe_report_cfi_failure(u64 panic_addr)
{
        print_nvhe_hyp_panic("CFI failure", panic_addr);

        if (IS_ENABLED(CONFIG_CFI_PERMISSIVE))
                kvm_err(" (CONFIG_CFI_PERMISSIVE ignored for hyp failures)\n");
}

/*
 * Host-side handler for a panic in the nVHE hypervisor. Never returns.
 *
 * Classifies the failure from @esr and @spsr (invalid host exception,
 * BUG(), CFI failure, UBSAN trap, or a generic panic), logs it, dumps the
 * hypervisor backtrace and the hyp offset, and finally panics the kernel
 * with all the register values it was given.
 */
void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr,
                                              u64 elr_virt, u64 elr_phys,
                                              u64 par, uintptr_t vcpu,
                                              u64 far, u64 hpfar) {
        u64 elr_in_kimg = __phys_to_kimg(elr_phys);
        /* Offset that turns a hyp VA into the un-randomized kernel address. */
        u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt;
        u64 mode = spsr & PSR_MODE_MASK;
        u64 panic_addr = elr_virt + hyp_offset;

        if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) {
                /* The exception was not taken from EL2 itself. */
                kvm_err("Invalid host exception to nVHE hyp!\n");
        } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
                   esr_brk_comment(esr) == BUG_BRK_IMM) {
                const char *file = NULL;
                unsigned int line = 0;

                /* All hyp bugs, including warnings, are treated as fatal. */
                if (!is_protected_kvm_enabled() ||
                    IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) {
                        struct bug_entry *bug = find_bug(elr_in_kimg);

                        if (bug)
                                bug_get_file_line(bug, &file, &line);
                }

                /* Prefer file:line when the bug table lookup succeeded. */
                if (file)
                        kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line);
                else
                        print_nvhe_hyp_panic("BUG", panic_addr);
        } else if (IS_ENABLED(CONFIG_CFI_CLANG) && esr_is_cfi_brk(esr)) {
                kvm_nvhe_report_cfi_failure(panic_addr);
        } else if (IS_ENABLED(CONFIG_UBSAN_KVM_EL2) &&
                   ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 &&
                   esr_is_ubsan_brk(esr)) {
                print_nvhe_hyp_panic(report_ubsan_failure(esr & UBSAN_BRK_MASK),
                                     panic_addr);
        } else {
                print_nvhe_hyp_panic("panic", panic_addr);
        }

        /* Dump the nVHE hypervisor backtrace */
        kvm_nvhe_dump_backtrace(hyp_offset);

        /*
         * Hyp has panicked and we're going to handle that by panicking the
         * kernel. The kernel offset will be revealed in the panic so we're
         * also safe to reveal the hyp offset as a debugging aid for translating
         * hyp VAs to vmlinux addresses.
         */
        kvm_err("Hyp Offset: 0x%llx\n", hyp_offset);

        panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%016llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n",
              spsr, elr_virt, esr, far, hpfar, par, vcpu);
}




























































   24 

































   24 






















   24 






































   24 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VIRTIO_NET_H
#define _LINUX_VIRTIO_NET_H

#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
#include <uapi/linux/tcp.h>
#include <uapi/linux/virtio_net.h>

/*
 * Tell whether an L3 protocol value is compatible with a virtio GSO type.
 *
 * @protocol is compared against cpu_to_be16(ETH_P_IP) / cpu_to_be16(ETH_P_IPV6).
 * The VIRTIO_NET_HDR_GSO_ECN bit of @gso_type is ignored.
 *
 * TCPv4 requires IPv4, TCPv6 requires IPv6, and both UDP GSO flavours accept
 * either. Every other GSO type yields false.
 */
static inline bool virtio_net_hdr_match_proto(__be16 protocol, __u8 gso_type)
{
        const bool is_ipv4 = protocol == cpu_to_be16(ETH_P_IP);
        const bool is_ipv6 = protocol == cpu_to_be16(ETH_P_IPV6);
        const int base_type = gso_type & ~VIRTIO_NET_HDR_GSO_ECN;

        if (base_type == VIRTIO_NET_HDR_GSO_TCPV4)
                return is_ipv4;

        if (base_type == VIRTIO_NET_HDR_GSO_TCPV6)
                return is_ipv6;

        if (base_type == VIRTIO_NET_HDR_GSO_UDP ||
            base_type == VIRTIO_NET_HDR_GSO_UDP_L4)
                return is_ipv4 || is_ipv6;

        return false;
}

/*
 * Fill in skb->protocol from the GSO type in a virtio-net header.
 *
 * Nothing is done when skb->protocol is already non-zero. Otherwise the GSO
 * type (with the ECN bit masked off) selects the protocol: TCPv6 maps to
 * ETH_P_IPV6; TCPv4 and both UDP flavours map to ETH_P_IP.
 *
 * Returns 0 when the protocol was already set or has been set here, and
 * -EINVAL when the GSO type is none of the above.
 */
static inline int virtio_net_hdr_set_proto(struct sk_buff *skb,
                                           const struct virtio_net_hdr *hdr)
{
        int base_type;

        /* An existing protocol is never overridden. */
        if (skb->protocol)
                return 0;

        base_type = hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN;

        if (base_type == VIRTIO_NET_HDR_GSO_TCPV6) {
                skb->protocol = cpu_to_be16(ETH_P_IPV6);
                return 0;
        }

        if (base_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
            base_type == VIRTIO_NET_HDR_GSO_UDP ||
            base_type == VIRTIO_NET_HDR_GSO_UDP_L4) {
                skb->protocol = cpu_to_be16(ETH_P_IP);
                return 0;
        }

        return -EINVAL;
}

/*
 * Validate a virtio-net header that accompanies a packet and apply its
 * checksum-offload and GSO metadata to @skb.
 *
 * @skb:           packet described by @hdr
 * @hdr:           virtio-net header; its 16-bit fields are decoded with
 *                 __virtio16_to_cpu() using @little_endian
 * @little_endian: endianness selector handed to __virtio16_to_cpu()
 *
 * The function works in three stages:
 *   1. translate hdr->gso_type into an SKB_GSO_* type plus the expected
 *      IP protocol and minimal transport header length;
 *   2. locate the transport header, either from the checksum offsets
 *      (VIRTIO_NET_HDR_F_NEEDS_CSUM) or by flow dissection, and make sure
 *      the headers can be pulled;
 *   3. run per-type sanity checks and, if the packet is larger than one
 *      segment, store gso_size/gso_type in the skb's shared info.
 *
 * Returns 0 on success and -EINVAL as soon as any check fails. On failure
 * @skb may already have been partially updated (the mac header is reset and
 * skb->protocol may have been filled in before later checks run).
 */
static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
                                        const struct virtio_net_hdr *hdr,
                                        bool little_endian)
{
        /* Smallest acceptable transport offset; raised for IPv6 below. */
        unsigned int nh_min_len = sizeof(struct iphdr);
        /* SKB_GSO_* flags; stays 0 when the header requests no GSO. */
        unsigned int gso_type = 0;
        /* Minimal transport header length for the selected GSO type. */
        unsigned int thlen = 0;
        /* Offset of the end of the (minimal) transport header. */
        unsigned int p_off = 0;
        /* Only assigned, and only read, when a GSO type is present. */
        unsigned int ip_proto;

        /* Stage 1: map the virtio GSO type (ECN bit ignored) to skb terms. */
        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                switch (hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
                case VIRTIO_NET_HDR_GSO_TCPV4:
                        gso_type = SKB_GSO_TCPV4;
                        ip_proto = IPPROTO_TCP;
                        thlen = sizeof(struct tcphdr);
                        break;
                case VIRTIO_NET_HDR_GSO_TCPV6:
                        gso_type = SKB_GSO_TCPV6;
                        ip_proto = IPPROTO_TCP;
                        thlen = sizeof(struct tcphdr);
                        nh_min_len = sizeof(struct ipv6hdr);
                        break;
                case VIRTIO_NET_HDR_GSO_UDP:
                        gso_type = SKB_GSO_UDP;
                        ip_proto = IPPROTO_UDP;
                        thlen = sizeof(struct udphdr);
                        break;
                case VIRTIO_NET_HDR_GSO_UDP_L4:
                        gso_type = SKB_GSO_UDP_L4;
                        ip_proto = IPPROTO_UDP;
                        thlen = sizeof(struct udphdr);
                        break;
                default:
                        /* Unknown GSO type. */
                        return -EINVAL;
                }

                if (hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
                        gso_type |= SKB_GSO_TCP_ECN;

                /* A GSO request without a segment size is invalid. */
                if (hdr->gso_size == 0)
                        return -EINVAL;
        }

        skb_reset_mac_header(skb);

        /* Stage 2: find the transport header and pull the headers. */
        if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start);
                u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset);
                /*
                 * Bytes required from the start of the packet: up to the end
                 * of the checksum field, or up to the end of the minimal
                 * transport header, whichever reaches further.
                 */
                u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16));

                if (!pskb_may_pull(skb, needed))
                        return -EINVAL;

                if (!skb_partial_csum_set(skb, start, off))
                        return -EINVAL;
                /* The transport header must leave room for the IP header. */
                if (skb_transport_offset(skb) < nh_min_len)
                        return -EINVAL;

                /* From here on nh_min_len is the actual transport offset. */
                nh_min_len = skb_transport_offset(skb);
                p_off = nh_min_len + thlen;
                if (!pskb_may_pull(skb, p_off))
                        return -EINVAL;
        } else {
                /*
                 * GSO packets without NEEDS_CSUM do not set the transport
                 * offset. Probe the packet and drop it if it does not match
                 * one of the GSO types accepted above.
                 */
                if (gso_type && skb->network_header) {
                        struct flow_keys_basic keys;

                        if (!skb->protocol) {
                                __be16 protocol = dev_parse_header_protocol(skb);

                                /*
                                 * No protocol from the device header: derive
                                 * it from the GSO type. Otherwise it must be
                                 * compatible with the GSO type.
                                 */
                                if (!protocol)
                                        virtio_net_hdr_set_proto(skb, hdr);
                                else if (!virtio_net_hdr_match_proto(protocol, hdr->gso_type))
                                        return -EINVAL;
                                else
                                        skb->protocol = protocol;
                        }
retry:
                        if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                                              NULL, 0, 0, 0,
                                                              0)) {
                                /* UFO does not specify ipv4 or 6: try both */
                                if (gso_type & SKB_GSO_UDP &&
                                    skb->protocol == htons(ETH_P_IP)) {
                                        skb->protocol = htons(ETH_P_IPV6);
                                        goto retry;
                                }
                                return -EINVAL;
                        }

                        /*
                         * The dissected transport header must be pullable and
                         * carry the protocol implied by the GSO type.
                         */
                        p_off = keys.control.thoff + thlen;
                        if (!pskb_may_pull(skb, p_off) ||
                            keys.basic.ip_proto != ip_proto)
                                return -EINVAL;

                        skb_set_transport_header(skb, keys.control.thoff);
                } else if (gso_type) {
                        /*
                         * No network header offset to dissect from: only
                         * require the minimal IP + transport header bytes.
                         */
                        p_off = nh_min_len + thlen;
                        if (!pskb_may_pull(skb, p_off))
                                return -EINVAL;
                }
        }

        /* Stage 3: per-type sanity checks and GSO metadata. */
        if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
                u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
                /* Header length used for the "more than one segment" test. */
                unsigned int nh_off = p_off;
                struct skb_shared_info *shinfo = skb_shinfo(skb);

                switch (gso_type & ~SKB_GSO_TCP_ECN) {
                case SKB_GSO_UDP:
                        /* UFO may not include transport header in gso_size. */
                        nh_off -= thlen;
                        break;
                case SKB_GSO_UDP_L4:
                        /* UDP_L4 is only accepted with checksum offload... */
                        if (!(hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM))
                                return -EINVAL;
                        /* ...with the checksum located in the UDP header... */
                        if (skb->csum_offset != offsetof(struct udphdr, check))
                                return -EINVAL;
                        /* ...and a payload of at most UDP_MAX_SEGMENTS segments. */
                        if (skb->len - p_off > gso_size * UDP_MAX_SEGMENTS)
                                return -EINVAL;
                        /* The switch masked ECN out: reject UDP_L4 with ECN. */
                        if (gso_type != SKB_GSO_UDP_L4)
                                return -EINVAL;
                        break;
                case SKB_GSO_TCPV4:
                case SKB_GSO_TCPV6:
                        /* With partial checksum, it must point into the TCP header. */
                        if (skb->ip_summed == CHECKSUM_PARTIAL &&
                            skb->csum_offset != offsetof(struct tcphdr, check))
                                return -EINVAL;
                        break;
                }

                /* Kernel has a special handling for GSO_BY_FRAGS. */
                if (gso_size == GSO_BY_FRAGS)
                        return -EINVAL;

                /* Too small packets are not really GSO ones. */
                if (skb->len - nh_off > gso_size) {
                        shinfo->gso_size = gso_size;
                        shinfo->gso_type = gso_type;

                        /* Header must be checked, and gso_segs computed. */
                        shinfo->gso_type |= SKB_GSO_DODGY;
                        shinfo->gso_segs = 0;
                }
        }

        return 0;
}

/*
 * Build a virtio-net header describing @skb.
 *
 * @skb:            packet to describe
 * @hdr:            header to fill; it is zeroed first
 * @little_endian:  endianness selector handed to __cpu_to_virtio16()
 * @has_data_valid: when true, CHECKSUM_UNNECESSARY is reported to the peer
 *                  as VIRTIO_NET_HDR_F_DATA_VALID
 * @vlan_hlen:      number of bytes added to the checksum start offset
 *
 * Returns 0 on success. Returns -EINVAL for a GSO skb whose gso_type is
 * none of TCPv4, TCPv6 or UDP_L4; in that case hdr_len and gso_size have
 * already been written to @hdr.
 */
static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
                                          struct virtio_net_hdr *hdr,
                                          bool little_endian,
                                          bool has_data_valid,
                                          int vlan_hlen)
{
        /* Start from an all-zero header so no stale bytes leak out. */
        memset(hdr, 0, sizeof(*hdr));

        if (!skb_is_gso(skb)) {
                hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
        } else {
                struct skb_shared_info *shinfo = skb_shinfo(skb);
                unsigned int skb_gso = shinfo->gso_type;

                /* hdr_len is only a hint as to how much should be linear. */
                hdr->hdr_len = __cpu_to_virtio16(little_endian,
                                                 skb_headlen(skb));
                hdr->gso_size = __cpu_to_virtio16(little_endian,
                                                  shinfo->gso_size);

                /* First matching flag wins: TCPv4, then TCPv6, then UDP_L4. */
                if (skb_gso & SKB_GSO_TCPV4)
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
                else if (skb_gso & SKB_GSO_TCPV6)
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
                else if (skb_gso & SKB_GSO_UDP_L4)
                        hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP_L4;
                else
                        return -EINVAL;

                if (skb_gso & SKB_GSO_TCP_ECN)
                        hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL) {
                int csum_start = skb_checksum_start_offset(skb) + vlan_hlen;

                hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
                hdr->csum_start = __cpu_to_virtio16(little_endian, csum_start);
                hdr->csum_offset = __cpu_to_virtio16(little_endian,
                                                     skb->csum_offset);
                return 0;
        }

        /* Otherwise the flags stay zero unless the data is known valid. */
        if (has_data_valid && skb->ip_summed == CHECKSUM_UNNECESSARY)
                hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;

        return 0;
}

#endif /* _LINUX_VIRTIO_NET_H */

























































































































































































































































































































































































































































































































































































































    4 
















    4 



















    4 











    4 






























    4 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 



    4 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ext4/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *        (jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000
 */

#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/time.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/dax.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/iomap.h>
#include <linux/iversion.h>

#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"
#include "truncate.h"

#include <trace/events/ext4.h>

/*
 * Forward declaration of a file-local helper so that it can be referenced
 * before its definition.
 */
static void ext4_journalled_zero_new_buffers(handle_t *handle,
                                            struct inode *inode,
                                            struct folio *folio,
                                            unsigned from, unsigned to);

static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
                              struct ext4_inode_info *ei)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        __u32 csum;
        __u16 dummy_csum = 0;
        int offset = offsetof(struct ext4_inode, i_checksum_lo);
        unsigned int csum_size = sizeof(dummy_csum);

        csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
        offset += csum_size;
        csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                           EXT4_GOOD_OLD_INODE_SIZE - offset);

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                offset = offsetof(struct ext4_inode, i_checksum_hi);
                csum = ext4_chksum(sbi, csum, (__u8 *)raw +
                                   EXT4_GOOD_OLD_INODE_SIZE,
                                   offset - EXT4_GOOD_OLD_INODE_SIZE);
                if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
                        csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
                                           csum_size);
                        offset += csum_size;
                }
                csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
                                   EXT4_INODE_SIZE(inode->i_sb) - offset);
        }

        return csum;
}

/*
 * Check the checksum stored in the on-disk inode @raw.
 *
 * Returns 1 if the stored checksum matches the computed one, or if no check
 * applies (the filesystem's creator OS is not Linux, or the metadata_csum
 * feature is off).  Returns 0 on mismatch.
 *
 * Only the low 16 bits are compared unless the inode is larger than
 * EXT4_GOOD_OLD_INODE_SIZE and i_checksum_hi fits in it.
 */
static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
                                  struct ext4_inode_info *ei)
{
        __u32 stored, computed;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX))
                return 1;
        if (!ext4_has_feature_metadata_csum(inode->i_sb))
                return 1;

        stored = le16_to_cpu(raw->i_checksum_lo);
        computed = ext4_inode_csum(inode, raw, ei);

        if (EXT4_INODE_SIZE(inode->i_sb) <= EXT4_GOOD_OLD_INODE_SIZE ||
            !EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
                /* No high half on disk: compare the low 16 bits only. */
                computed &= 0xFFFF;
        } else {
                stored |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
        }

        return stored == computed;
}

/*
 * Compute the checksum of the on-disk inode @raw and store it in @raw.
 *
 * Does nothing when the filesystem's creator OS is not Linux or the
 * metadata_csum feature is off.  The low 16 bits always go to i_checksum_lo;
 * the high 16 bits are written to i_checksum_hi only when the inode is
 * larger than EXT4_GOOD_OLD_INODE_SIZE and that field fits in it.
 */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei)
{
        __u32 crc;

        if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
            cpu_to_le32(EXT4_OS_LINUX))
                return;
        if (!ext4_has_feature_metadata_csum(inode->i_sb))
                return;

        crc = ext4_inode_csum(inode, raw, ei);
        raw->i_checksum_lo = cpu_to_le16(crc & 0xFFFF);

        if (EXT4_INODE_SIZE(inode->i_sb) <= EXT4_GOOD_OLD_INODE_SIZE)
                return;
        if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
                raw->i_checksum_hi = cpu_to_le16(crc >> 16);
}

/*
 * Emit the truncate tracepoint and, if the inode has a jinode attached,
 * hand the truncate to jbd2_journal_begin_ordered_truncate().
 *
 * Returns 0 when there is no jinode, otherwise whatever jbd2 returns.
 */
static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
{
        trace_ext4_begin_ordered_truncate(inode, new_size);

        /*
         * A jinode only exists once the file has been opened for writing.
         * Without one there are no outstanding writes to flush, so only
         * call into jbd2 when it is present.
         */
        if (EXT4_I(inode)->jinode)
                return jbd2_journal_begin_ordered_truncate(
                                EXT4_JOURNAL(inode),
                                EXT4_I(inode)->jinode,
                                new_size);

        return 0;
}

/*
 * Forward declaration of a file-local helper so that it can be referenced
 * before its definition.
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents);

/*
 * Test whether an inode is a fast symlink.
 * A fast symlink has its symlink data stored in ext4_inode_info->i_data.
 *
 * Returns non-zero for a fast symlink and 0 otherwise.  Two different tests
 * are used depending on whether EXT4_EA_INODE_FL is set in the inode flags:
 *  - flag set:   a symlink with a non-zero i_size below EXT4_N_BLOCKS * 4;
 *  - flag clear: a symlink without inline data whose i_blocks is fully
 *                accounted for by its xattr block (if it has one).
 */
int ext4_inode_is_fast_symlink(struct inode *inode)
{
        int ea_blocks = 0;

        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                return S_ISLNK(inode->i_mode) && inode->i_size &&
                       (inode->i_size < EXT4_N_BLOCKS * 4);

        if (ext4_has_inline_data(inode))
                return 0;

        /* An xattr block, if present, takes one cluster of 512-byte units. */
        if (EXT4_I(inode)->i_file_acl)
                ea_blocks = EXT4_CLUSTER_SIZE(inode->i_sb) >> 9;

        return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
}

/*
 * Called at the last iput() if i_nlink is zero.
 *
 * Tears down @inode.  An inode that still has links (i_nlink != 0) only has
 * its page cache truncated and its in-core state cleared; a bad inode only
 * has its in-core state cleared.  Otherwise the inode is deleted: inside a
 * journal handle its size is set to 0, its blocks are truncated (if it has
 * any), its xattr references are removed, its orphan record is deleted,
 * i_dtime is set to the current time and the inode is freed with
 * ext4_free_inode() (or merely cleared in-core if the final
 * ext4_mark_inode_dirty() fails).
 *
 * Any earlier failure on the deletion path (starting the handle, marking the
 * inode dirty, truncating, deleting xattrs) ends up on the no_delete path,
 * which only clears the in-core inode.
 */
void ext4_evict_inode(struct inode *inode)
{
        handle_t *handle;
        int err;
        /*
         * Credits for final inode cleanup and freeing:
         * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
         * (xattr block freeing), bitmap, group descriptor (inode freeing)
         */
        int extra_credits = 6;
        struct ext4_xattr_inode_array *ea_inode_array = NULL;
        /* Set when we took sb_start_intwrite() and must release it again. */
        bool freeze_protected = false;

        trace_ext4_evict_inode(inode);

        dax_break_layout_final(inode);

        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                ext4_evict_ea_inode(inode);
        /* Still linked: drop the page cache but keep the on-disk inode. */
        if (inode->i_nlink) {
                truncate_inode_pages_final(&inode->i_data);

                goto no_delete;
        }

        if (is_bad_inode(inode))
                goto no_delete;
        dquot_initialize(inode);

        if (ext4_should_order_data(inode))
                ext4_begin_ordered_truncate(inode, 0);
        truncate_inode_pages_final(&inode->i_data);

        /*
         * For inodes with journalled data, transaction commit could have
         * dirtied the inode. And for inodes with dioread_nolock, unwritten
         * extents converting worker could merge extents and also have dirtied
         * the inode. Flush worker is ignoring it because of I_FREEING flag but
         * we still need to remove the inode from the writeback lists.
         */
        if (!list_empty_careful(&inode->i_io_list))
                inode_io_list_del(inode);

        /*
         * Protect us against freezing - iput() caller didn't have to have any
         * protection against it. When we are in a running transaction though,
         * we are already protected against freezing and we cannot grab further
         * protection due to lock ordering constraints.
         */
        if (!ext4_journal_current_handle()) {
                sb_start_intwrite(inode->i_sb);
                freeze_protected = true;
        }

        if (!IS_NOQUOTA(inode))
                extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);

        /*
         * Block bitmap, group descriptor, and inode are accounted in both
         * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
         */
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
                         ext4_blocks_for_truncate(inode) + extra_credits - 3);
        if (IS_ERR(handle)) {
                ext4_std_error(inode->i_sb, PTR_ERR(handle));
                /*
                 * If we're going to skip the normal cleanup, we still need to
                 * make sure that the in-core orphan linked list is properly
                 * cleaned up.
                 */
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                goto no_delete;
        }

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

        /*
         * Set inode->i_size to 0 before calling ext4_truncate(). We need
         * special handling of symlinks here because i_size is used to
         * determine whether ext4_inode_info->i_data contains symlink data or
         * block mappings. Setting i_size to 0 will remove its fast symlink
         * status. Erase i_data so that it becomes a valid empty block map.
         */
        if (ext4_inode_is_fast_symlink(inode))
                memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data));
        inode->i_size = 0;
        err = ext4_mark_inode_dirty(handle, inode);
        if (err) {
                ext4_warning(inode->i_sb,
                             "couldn't mark inode dirty (err %d)", err);
                goto stop_handle;
        }
        /* Nothing to truncate if the inode owns no blocks. */
        if (inode->i_blocks) {
                err = ext4_truncate(inode);
                if (err) {
                        ext4_error_err(inode->i_sb, -err,
                                       "couldn't truncate inode %lu (err %d)",
                                       inode->i_ino, err);
                        goto stop_handle;
                }
        }

        /* Remove xattr references. */
        err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array,
                                      extra_credits);
        if (err) {
                ext4_warning(inode->i_sb, "xattr delete (err %d)", err);
                /*
                 * Shared error exit.  The label sits inside this if-body and
                 * is also the target of the mark-dirty and truncate failures
                 * above: stop the handle, clean up the in-core orphan list,
                 * drop freeze protection and fall back to no_delete.
                 */
stop_handle:
                ext4_journal_stop(handle);
                ext4_orphan_del(NULL, inode);
                if (freeze_protected)
                        sb_end_intwrite(inode->i_sb);
                ext4_xattr_inode_array_free(ea_inode_array);
                goto no_delete;
        }

        /*
         * Kill off the orphan record which ext4_truncate created.
         * AKPM: I think this can be inside the above `if'.
         * Note that ext4_orphan_del() has to be able to cope with the
         * deletion of a non-existent orphan - this is because we don't
         * know if ext4_truncate() actually created an orphan record.
         * (Well, we could do this if we need to, but heck - it works)
         */
        ext4_orphan_del(handle, inode);
        EXT4_I(inode)->i_dtime        = (__u32)ktime_get_real_seconds();

        /*
         * One subtle ordering requirement: if anything has gone wrong
         * (transaction abort, IO errors, whatever), then we can still
         * do these next steps (the fs will already have been marked as
         * having errors), but we can't free the inode if the mark_dirty
         * fails.
         */
        if (ext4_mark_inode_dirty(handle, inode))
                /* If that failed, just do the required in-core inode clear. */
                ext4_clear_inode(inode);
        else
                ext4_free_inode(handle, inode);
        ext4_journal_stop(handle);
        if (freeze_protected)
                sb_end_intwrite(inode->i_sb);
        ext4_xattr_inode_array_free(ea_inode_array);
        return;
no_delete:
        /*
         * Check whether something else accidentally dirtied the inode being
         * evicted, which could cause inode use-after-free issues later.
         */
        WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list));

        /*
         * If the inode is still on a fast-commit list, mark the filesystem
         * ineligible for fast commit.
         */
        if (!list_empty(&EXT4_I(inode)->i_fc_list))
                ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL);
        ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
}

#ifdef CONFIG_QUOTA
/*
 * Return a pointer to the i_reserved_quota field of the ext4 in-core inode
 * that belongs to @inode.  Only built when CONFIG_QUOTA is enabled.
 */
qsize_t *ext4_get_reserved_space(struct inode *inode)
{
        qsize_t *reserved = &EXT4_I(inode)->i_reserved_quota;

        return reserved;
}
#endif

/*
 * Account for @used units of the inode's delayed-allocation reservation
 * having been consumed.
 *
 * Called with i_data_sem down, which is important since we can call
 * ext4_discard_preallocations() from here.
 *
 * @inode:       inode whose reservation is updated
 * @used:        amount of reservation consumed; clamped (with a warning) to
 *               what is actually reserved, and converted with EXT4_C2B()
 *               before being passed to the quota calls
 * @quota_claim: non-zero to claim the quota for the blocks, zero to release
 *               the quota reservation instead
 */
void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used, quota_claim);

        /* Never give back more than is actually reserved. */
        if (unlikely(used > ei->i_reserved_data_blocks)) {
                ext4_warning(inode->i_sb, "%s: ino %lu, used %d "
                         "with only %d reserved data blocks",
                         __func__, inode->i_ino, used,
                         ei->i_reserved_data_blocks);
                WARN_ON(1);
                used = ei->i_reserved_data_blocks;
        }

        /* Per-inode reservation and the filesystem-wide dirty counter. */
        ei->i_reserved_data_blocks -= used;
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, used);

        spin_unlock(&ei->i_block_reservation_lock);

        /* Quota accounting for the data blocks. */
        if (!quota_claim) {
                /*
                 * We did fallocate with an offset that is already delayed
                 * allocated. So on delayed allocated writeback we should
                 * not re-claim the quota for fallocated blocks.
                 */
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, used));
        } else {
                dquot_claim_block(inode, EXT4_C2B(sbi, used));
        }

        /*
         * Preallocations are only discarded once every pending block
         * allocation is done and nobody has the inode open for writing.
         */
        if (ei->i_reserved_data_blocks != 0 ||
            inode_is_open_for_write(inode))
                return;

        ext4_discard_preallocations(inode);
}

/*
 * Validate the physical block range described by @map for @inode.
 *
 * The journal inode is never checked.  For any other inode, a range that
 * ext4_inode_block_valid() rejects is reported through ext4_error_inode()
 * using the caller-supplied @func and @line.
 *
 * Returns 0 if the range is acceptable (or the inode is the journal inode),
 * -EFSCORRUPTED otherwise.
 */
static int __check_block_validity(struct inode *inode, const char *func,
                                unsigned int line,
                                struct ext4_map_blocks *map)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

        if (journal && inode == journal->j_inode)
                return 0;

        if (ext4_inode_block_valid(inode, map->m_pblk, map->m_len))
                return 0;

        ext4_error_inode(inode, func, line, map->m_pblk,
                         "lblock %lu mapped to illegal pblock %llu "
                         "(length %d)", (unsigned long) map->m_lblk,
                         map->m_pblk, map->m_len);
        return -EFSCORRUPTED;
}

/*
 * Zero out @len blocks starting at physical block @pblk.
 *
 * Encrypted regular files go through fscrypt_zeroout_range(), which also
 * receives the logical block @lblk; everything else uses sb_issue_zeroout().
 *
 * Returns the fscrypt result unchanged on the encrypted path.  On the other
 * path a positive result from sb_issue_zeroout() is turned into 0; zero and
 * negative results are returned as they are.
 */
int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
                       ext4_lblk_t len)
{
        int ret;

        if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode))
                return fscrypt_zeroout_range(inode, lblk, pblk, len);

        ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);

        return ret > 0 ? 0 : ret;
}

/*
 * Convenience wrapper around __check_block_validity() that passes the
 * calling function's name (__func__) and line number (__LINE__) so the
 * error report identifies the call site.
 */
#define check_block_validity(inode, map)        \
        __check_block_validity((inode), __func__, __LINE__, (map))

#ifdef ES_AGGRESSIVE_TEST
/*
 * Debug helper, only built with ES_AGGRESSIVE_TEST: look the mapping up
 * again directly (extent or indirect path) into @map and compare it with
 * @es_map, the mapping obtained from the extent status tree.  A mismatch is
 * only reported with printk(); nothing is returned to the caller.
 *
 * @flags is not used for the lookup, it is only printed in the report.
 */
static void ext4_map_blocks_es_recheck(handle_t *handle,
                                       struct inode *inode,
                                       struct ext4_map_blocks *es_map,
                                       struct ext4_map_blocks *map,
                                       int flags)
{
        int retval;

        map->m_flags = 0;

        /*
         * The two results can legitimately differ because of a race, e.g.
         * xfstests #223 with dioread_nolock enabled: the extent status tree
         * lookup is done without holding i_data_sem, so an unwritten extent
         * may have been converted in the meantime.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        retval = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
                        ext4_ext_map_blocks(handle, inode, map, 0) :
                        ext4_ind_map_blocks(handle, inode, map, 0);
        up_read(&EXT4_I(inode)->i_data_sem);

        /*
         * m_len is deliberately not compared: extents are collapsed in the
         * status tree, so the lengths need not be equal.
         */
        if (es_map->m_lblk == map->m_lblk &&
            es_map->m_flags == map->m_flags &&
            es_map->m_pblk == map->m_pblk)
                return;

        printk("ES cache assertion failed for inode: %lu "
               "es_cached ex [%d/%d/%llu/%x] != "
               "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
               inode->i_ino, es_map->m_lblk, es_map->m_len,
               es_map->m_pblk, es_map->m_flags, map->m_lblk,
               map->m_len, map->m_pblk, map->m_flags,
               retval, flags);
}
#endif /* ES_AGGRESSIVE_TEST */

/*
 * Pure lookup helper for ext4_map_blocks(): ask the extent or indirect
 * mapping code (depending on EXT4_INODE_EXTENTS) for @map without requesting
 * allocation, and on a positive result record the mapping in the extent
 * status tree.  ext4_map_blocks() calls this with i_data_sem held for
 * reading.
 *
 * Returns whatever the underlying map function returned: a value <= 0 is
 * passed straight back, a positive value is the number of blocks found.
 */
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
                                 struct ext4_map_blocks *map)
{
        unsigned int es_status = EXTENT_STATUS_WRITTEN;
        int nr_found;

        nr_found = ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ?
                        ext4_ext_map_blocks(handle, inode, map, 0) :
                        ext4_ind_map_blocks(handle, inode, map, 0);
        if (nr_found <= 0)
                return nr_found;

        /* The returned count is expected to agree with map->m_len. */
        if (unlikely(nr_found != map->m_len)) {
                ext4_warning(inode->i_sb,
                             "ES len assertion failed for inode "
                             "%lu: retval %d != map->m_len %d",
                             inode->i_ino, nr_found, map->m_len);
                WARN_ON(1);
        }

        if (map->m_flags & EXT4_MAP_UNWRITTEN)
                es_status = EXTENT_STATUS_UNWRITTEN;

        ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
                              map->m_pblk, es_status, false);
        return nr_found;
}

/*
 * Allocation helper for ext4_map_blocks(), which calls it with i_data_sem
 * held for writing.  Invokes the extent or indirect mapping code with the
 * caller's @flags, optionally zeroes freshly allocated blocks, and records
 * the result in the extent status tree.
 *
 * Returns the (positive) number of blocks mapped, a value <= 0 straight from
 * the underlying map function, or the error from ext4_issue_zeroout().
 */
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
                                  struct ext4_map_blocks *map, int flags)
{
        struct extent_status cached;
        unsigned int es_status = EXTENT_STATUS_WRITTEN;
        int nr_blocks;

        /*
         * A delayed mapping means blocks and quota were already checked
         * when the data was copied into the page cache; tell the allocator
         * via the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
         */
        if (map->m_flags & EXT4_MAP_DELAYED)
                flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;

        /* The mapping flags are set afresh by the allocation below. */
        map->m_flags &= ~EXT4_MAP_FLAGS;

        /*
         * The inode type must be re-tested here because a migrate could
         * have changed it in between.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                nr_blocks = ext4_ind_map_blocks(handle, inode, map, flags);

                /*
                 * Newly allocated blocks change the format of i_data, so
                 * force a concurrent migrate to fail by clearing its flag.
                 */
                if (nr_blocks > 0 && map->m_flags & EXT4_MAP_NEW)
                        ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
        } else {
                nr_blocks = ext4_ext_map_blocks(handle, inode, map, flags);
        }
        if (nr_blocks <= 0)
                return nr_blocks;

        if (unlikely(nr_blocks != map->m_len)) {
                ext4_warning(inode->i_sb,
                             "ES len assertion failed for inode %lu: "
                             "retval %d != map->m_len %d",
                             inode->i_ino, nr_blocks, map->m_len);
                WARN_ON(1);
        }

        /*
         * Zero the blocks before they become visible through the extent
         * status tree; otherwise someone could look them up there and use
         * them before they are really zeroed.  Metadata also has to be
         * unmapped before zeroing, as otherwise writeback can overwrite
         * the zeros with stale data from the block device.
         */
        if (flags & EXT4_GET_BLOCKS_ZERO &&
            map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) {
                int zero_err = ext4_issue_zeroout(inode, map->m_lblk,
                                                  map->m_pblk, map->m_len);

                if (zero_err)
                        return zero_err;
        }

        /*
         * If the extent has been zeroed out (the cached entry is already
         * written), the extent status tree needs no update.
         */
        if (flags & EXT4_GET_BLOCKS_PRE_IO &&
            ext4_es_lookup_extent(inode, map->m_lblk, NULL, &cached) &&
            ext4_es_is_written(&cached))
                return nr_blocks;

        if (map->m_flags & EXT4_MAP_UNWRITTEN)
                es_status = EXTENT_STATUS_UNWRITTEN;

        ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
                              es_status,
                              flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);

        return nr_blocks;
}

/*
 * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
 * and store the allocated blocks in the result buffer head and mark it
 * mapped.
 *
 * If file type is extents based, it will call ext4_ext_map_blocks(),
 * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
 * based files
 *
 * On success, it returns the number of blocks being mapped or allocated.
 * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are
 * pre-allocated and unwritten, the resulting @map is marked as unwritten.
 * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
 *
 * It returns 0 if plain look up failed (blocks have not been allocated), in
 * that case, @map is returned as unmapped but we still do fill map->m_len to
 * indicate the length of a hole starting at map->m_lblk.
 *
 * It returns the error in case of allocation failure.
 *
 * Further details visible in the body:
 *  - map->m_len is clamped to INT_MAX because the return type is int.
 *  - -EFSCORRUPTED is returned when map->m_lblk >= EXT_MAX_BLOCKS.
 *  - With EXT4_GET_BLOCKS_CACHED_NOWAIT only the extent status tree is
 *    consulted: a hit returns the cached result immediately, a miss
 *    returns 0.
 *  - The extent status tree lookup is skipped while EXT4_FC_REPLAY is set
 *    in the superblock's s_mount_state.
 */
int ext4_map_blocks(handle_t *handle, struct inode *inode,
                    struct ext4_map_blocks *map, int flags)
{
        struct extent_status es;
        int retval;
        int ret = 0;
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        /* Keep a copy of the request so the cached answer can be rechecked. */
        memcpy(&orig_map, map, sizeof(*map));
#endif

        map->m_flags = 0;
        ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n",
                  flags, map->m_len, (unsigned long) map->m_lblk);

        /*
         * ext4_map_blocks returns an int, and m_len is an unsigned int
         */
        if (unlikely(map->m_len > INT_MAX))
                map->m_len = INT_MAX;

        /* We can handle the block number less than EXT_MAX_BLOCKS */
        if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
                return -EFSCORRUPTED;

        /* Lookup extent status tree firstly */
        if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
            ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
                        /*
                         * Translate the cached extent: the physical block is
                         * offset by how far m_lblk lies into the extent, and
                         * the length is what remains of the extent, capped
                         * at the requested m_len.
                         */
                        map->m_pblk = ext4_es_pblock(&es) +
                                        map->m_lblk - es.es_lblk;
                        map->m_flags |= ext4_es_is_written(&es) ?
                                        EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
                        /*
                         * Nothing is mapped: report the (capped) length of
                         * the delayed/hole range in m_len but return 0.
                         */
                        map->m_pblk = 0;
                        map->m_flags |= ext4_es_is_delayed(&es) ?
                                        EXT4_MAP_DELAYED : 0;
                        retval = es.es_len - (map->m_lblk - es.es_lblk);
                        if (retval > map->m_len)
                                retval = map->m_len;
                        map->m_len = retval;
                        retval = 0;
                } else {
                        /* An entry that is none of the four known states. */
                        BUG();
                }

                if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
                        return retval;
#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(handle, inode, map,
                                           &orig_map, flags);
#endif
                goto found;
        }
        /*
         * In the query cache no-wait mode, nothing we can do more if we
         * cannot find extent in the cache.
         */
        if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT)
                return 0;

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        retval = ext4_map_query_blocks(handle, inode, map);
        up_read((&EXT4_I(inode)->i_data_sem));

found:
        /* Validate the physical range of anything reported as mapped. */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;
        }

        /* If it is only a block(s) look up */
        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;

        /*
         * Returns if the blocks have already allocated
         *
         * Note that if blocks have been preallocated
         * ext4_ext_map_blocks() returns with buffer head unmapped
         */
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                /*
                 * If we need to convert extent to unwritten
                 * we continue and do the actual work in
                 * ext4_ext_map_blocks()
                 */
                if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
                        return retval;

        /*
         * New blocks allocate and/or writing to unwritten extent
         * will possibly result in updating i_data, so we take
         * the write lock of i_data_sem, and call get_block()
         * with create == 1 flag.
         */
        down_write(&EXT4_I(inode)->i_data_sem);
        retval = ext4_map_create_blocks(handle, inode, map, flags);
        up_write((&EXT4_I(inode)->i_data_sem));
        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
                ret = check_block_validity(inode, map);
                if (ret != 0)
                        return ret;

                /*
                 * Inodes with freshly allocated blocks where contents will be
                 * visible after transaction commit must be on transaction's
                 * ordered data list.
                 */
                if (map->m_flags & EXT4_MAP_NEW &&
                    !(map->m_flags & EXT4_MAP_UNWRITTEN) &&
                    !(flags & EXT4_GET_BLOCKS_ZERO) &&
                    !ext4_is_quota_file(inode) &&
                    ext4_should_order_data(inode)) {
                        /* Byte range covered by the new mapping. */
                        loff_t start_byte =
                                (loff_t)map->m_lblk << inode->i_blkbits;
                        loff_t length = (loff_t)map->m_len << inode->i_blkbits;

                        if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
                                ret = ext4_jbd2_inode_add_wait(handle, inode,
                                                start_byte, length);
                        else
                                ret = ext4_jbd2_inode_add_write(handle, inode,
                                                start_byte, length);
                        if (ret)
                                return ret;
                }
        }
        /*
         * Report the affected logical block range to ext4_fc_track_range()
         * for any result that is mapped or unwritten.
         */
        if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN ||
                                map->m_flags & EXT4_MAP_MAPPED))
                ext4_fc_track_range(handle, inode, map->m_lblk,
                                        map->m_lblk + map->m_len - 1);
        if (retval < 0)
                ext_debug(inode, "failed with err %d\n", retval);
        return retval;
}

/*
 * Replace the EXT4_MAP_FLAGS bits of bh->b_state with those found in @flags
 * (all other bits of @flags are ignored).  A buffer head that is attached to
 * a folio may have its b_state modified concurrently by someone else, so it
 * is updated with a compare-and-exchange loop; a detached (dummy) buffer
 * head is simply written.
 */
static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags)
{
        unsigned long seen;
        unsigned long wanted;

        flags &= EXT4_MAP_FLAGS;

        /* Dummy buffer_head?  Nobody else can see it, a plain store will do. */
        if (!bh->b_folio) {
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags;
                return;
        }

        /*
         * Someone else may be modifying b_state, so be careful.  This is
         * ugly, but it can go away once bh is no longer used as a container
         * for passing mapping information to / from get_block functions.
         */
        seen = READ_ONCE(bh->b_state);
        for (;;) {
                wanted = (seen & ~EXT4_MAP_FLAGS) | flags;
                /* On failure try_cmpxchg() has refreshed 'seen'; go again. */
                if (unlikely(!try_cmpxchg(&bh->b_state, &seen, wanted)))
                        continue;
                break;
        }
}

/*
 * Common get_block implementation: map the range starting at @iblock whose
 * size is given by bh->b_size, using ext4_map_blocks() with @flags and the
 * current journal handle.
 *
 * Returns -ERANGE for inodes with inline data, a negative error from
 * ext4_map_blocks(), or 0.  On 0, bh->b_size is set to the byte length of
 * the mapped extent or of the hole that was found; for a mapped extent the
 * buffer head is additionally mapped and its EXT4_MAP_FLAGS state updated.
 */
static int _ext4_get_block(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int flags)
{
        struct ext4_map_blocks map;
        int nr_mapped;

        if (ext4_has_inline_data(inode))
                return -ERANGE;

        map.m_lblk = iblock;
        map.m_len = bh->b_size >> inode->i_blkbits;

        nr_mapped = ext4_map_blocks(ext4_journal_current_handle(), inode,
                                    &map, flags);
        if (nr_mapped < 0)
                return nr_mapped;

        if (nr_mapped > 0) {
                map_bh(bh, inode->i_sb, map.m_pblk);
                ext4_update_bh_state(bh, map.m_flags);
        }

        /* Mapped extent and hole alike: report the length through b_size. */
        bh->b_size = inode->i_sb->s_blocksize * map.m_len;
        return 0;
}

/*
 * get_block callback: a non-zero @create requests allocation by passing
 * EXT4_GET_BLOCKS_CREATE to _ext4_get_block(), otherwise it is a plain
 * lookup.
 */
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh, int create)
{
        int map_flags = 0;

        if (create)
                map_flags = EXT4_GET_BLOCKS_CREATE;

        return _ext4_get_block(inode, iblock, bh, map_flags);
}

/*
 * get_block callback used when preparing a buffered write that must create
 * an unwritten extent if the blocks have not been allocated yet.  The extent
 * is converted to written once the IO has completed.
 *
 * @create is only logged; the mapping is always requested with
 * EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT.
 */
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create)
{
        int err;

        ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n",
                   inode->i_ino, create);

        err = _ext4_get_block(inode, iblock, bh_result,
                              EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
        if (err)
                return err;

        /*
         * An unwritten buffer is also flagged as new so that partial writes
         * zero it out correctly; otherwise stale data could be exposed.
         */
        if (buffer_unwritten(bh_result))
                set_buffer_new(bh_result);

        return 0;
}

/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096

/*
 * Map logical block @block of @inode and return a buffer head for it.
 *
 * `handle' can be NULL if create is zero (or during fast-commit replay).
 *
 * Returns:
 *  - NULL when the block is not mapped and creation was not requested,
 *  - ERR_PTR(-ENOSPC) when creation was requested but nothing was mapped,
 *  - ERR_PTR(err) for a mapping or journalling error, ERR_PTR(-ENOMEM) when
 *    no buffer head could be obtained,
 *  - with EXT4_GET_BLOCKS_CACHED_NOWAIT, whatever sb_find_get_block() finds,
 *  - otherwise the buffer head; for a newly allocated block it has been
 *    zero-filled (if it was not uptodate), marked uptodate and dirtied as
 *    metadata under @handle.
 */
struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int map_flags)
{
        struct ext4_map_blocks map;
        struct buffer_head *bh;
        int create = map_flags & EXT4_GET_BLOCKS_CREATE;
        bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT;
        int rc;

        ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                    || handle != NULL || create == 0);
        ASSERT(create == 0 || !nowait);

        map.m_lblk = block;
        map.m_len = 1;
        rc = ext4_map_blocks(handle, inode, &map, map_flags);

        if (rc == 0) {
                if (create)
                        return ERR_PTR(-ENOSPC);
                return NULL;
        }
        if (rc < 0)
                return ERR_PTR(rc);

        if (nowait)
                return sb_find_get_block(inode->i_sb, map.m_pblk);

        /*
         * The bh may pick up extra references (journal_head and the like),
         * and a lingering journal_head can make page migration fail, so
         * avoid __GFP_MOVABLE for this allocation.
         */
        bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk,
                                inode->i_sb->s_blocksize);
        if (unlikely(!bh))
                return ERR_PTR(-ENOMEM);

        if (!(map.m_flags & EXT4_MAP_NEW)) {
                BUFFER_TRACE(bh, "not a new buffer");
                return bh;
        }

        ASSERT(create != 0);
        ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                    || (handle != NULL));

        /*
         * Data is not always journalled any more, so whether a new buffer
         * should always be journalled as metadata deserves some thought.
         * Regular file writes go through ext4_get_block instead, so for
         * now this is not a problem.
         */
        lock_buffer(bh);
        BUFFER_TRACE(bh, "call get_create_access");
        rc = ext4_journal_get_create_access(handle, inode->i_sb, bh,
                                            EXT4_JTR_NONE);
        if (unlikely(rc)) {
                unlock_buffer(bh);
                goto release;
        }
        if (!buffer_uptodate(bh)) {
                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        rc = ext4_handle_dirty_metadata(handle, inode, bh);
        if (unlikely(rc))
                goto release;

        return bh;

release:
        brelse(bh);
        return ERR_PTR(rc);
}

/*
 * Like ext4_getblk(), but additionally make sure the returned buffer is
 * uptodate by reading it (REQ_META | REQ_PRIO, waiting for completion) when
 * necessary.
 *
 * Returns the buffer head, NULL or an ERR_PTR exactly as ext4_getblk() does,
 * or ERR_PTR(error) with the buffer reference dropped if the read fails.
 */
struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
                               ext4_lblk_t block, int map_flags)
{
        struct buffer_head *bh = ext4_getblk(handle, inode, block, map_flags);
        int read_err;

        /* Errors, holes and already-uptodate buffers go back untouched. */
        if (IS_ERR(bh) || !bh || ext4_buffer_uptodate(bh))
                return bh;

        read_err = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true);
        if (!read_err)
                return bh;

        put_bh(bh);
        return ERR_PTR(read_err);
}

/*
 * Read a contiguous batch of blocks.
 *
 * Looks up @bh_count consecutive logical blocks starting at @block into
 * @bhs (lookup only, no handle, no creation) and issues reads for those not
 * yet uptodate.  A NULL entry in @bhs denotes a hole.
 *
 * With @wait false the function returns 0 right after submitting the reads.
 * With @wait true it waits for every buffer and returns -EIO if any of them
 * did not become uptodate.  On failure the buffers obtained so far are
 * released and their slots in @bhs set to NULL.
 */
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs)
{
        int err;
        int n;

        /* Phase 1: obtain a buffer head for every block of the batch. */
        for (n = 0; n < bh_count; n++) {
                struct buffer_head *bh;

                bh = ext4_getblk(NULL, inode, block + n, 0 /* map_flags */);
                bhs[n] = bh;
                if (!IS_ERR(bh))
                        continue;

                err = PTR_ERR(bh);
                /* Only the entries before the failing one hold references. */
                bh_count = n;
                goto release_all;
        }

        /* Phase 2: kick off reads; NULL entries are valid (holes). */
        for (n = 0; n < bh_count; n++) {
                if (!bhs[n] || ext4_buffer_uptodate(bhs[n]))
                        continue;
                ext4_read_bh_lock(bhs[n], REQ_META | REQ_PRIO, false);
        }

        if (!wait)
                return 0;

        /* Phase 3: wait for all of them ... */
        for (n = 0; n < bh_count; n++) {
                if (bhs[n])
                        wait_on_buffer(bhs[n]);
        }

        /* ... and only then check the outcome. */
        for (n = 0; n < bh_count; n++) {
                if (!bhs[n] || buffer_uptodate(bhs[n]))
                        continue;
                err = -EIO;
                goto release_all;
        }
        return 0;

release_all:
        for (n = 0; n < bh_count; n++) {
                brelse(bhs[n]);
                bhs[n] = NULL;
        }
        return err;
}

/*
 * Walk the circular buffer list starting at @head and call @fn on every
 * buffer that overlaps the byte range [@from, @to).  Offsets are counted
 * from the first buffer, each buffer covering head->b_size bytes.
 *
 * For buffers outside the range, *@partial (if @partial is non-NULL) is set
 * to 1 when such a buffer is not uptodate.
 *
 * The walk stops at the first non-zero return from @fn, which is also the
 * return value; 0 is returned if every call succeeded.
 */
int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh))
{
        unsigned blocksize = head->b_size;
        unsigned block_start = 0;
        unsigned block_end;
        struct buffer_head *bh = head;
        struct buffer_head *next;
        int ret = 0;

        do {
                /* Fetch the successor before @fn gets to touch the buffer. */
                next = bh->b_this_page;
                block_end = block_start + blocksize;

                if (block_end <= from || block_start >= to) {
                        if (partial && !buffer_uptodate(bh))
                                *partial = 1;
                } else {
                        ret = fn(handle, inode, bh);
                }

                block_start = block_end;
                bh = next;
        } while (ret == 0 && (bh != head || !block_start));

        return ret;
}

/*
 * Helper for dirtying journalled data.  Besides dirtying the buffer as
 * metadata under @handle, the folio it belongs to is marked dirty too, so
 * that the writeback code knows this page (and inode) contains dirty data;
 * ext4_writepages() then commits the appropriate transaction to make the
 * data stable.
 *
 * Returns the result of ext4_handle_dirty_metadata().
 */
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
        int err;

        folio_mark_dirty(bh->b_folio);
        err = ext4_handle_dirty_metadata(handle, NULL, bh);

        return err;
}

/*
 * Get journal write access for @bh.  Buffers that are unmapped or already
 * freed are skipped and reported as success (0); otherwise the result of
 * ext4_journal_get_write_access() is returned.
 */
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
                                struct buffer_head *bh)
{
        if (buffer_mapped(bh) && !buffer_freed(bh)) {
                BUFFER_TRACE(bh, "get write access");
                return ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                                    EXT4_JTR_NONE);
        }

        return 0;
}

/*
 * Prepare the buffers of a locked @folio for a write of @len bytes at file
 * position @pos.
 *
 * @handle:    journal handle, used when the inode journals its data
 * @folio:     locked folio that will receive the data
 * @pos, @len: byte range of the write; the in-page range [from, to) derived
 *             from them must lie within PAGE_SIZE (enforced with BUG_ON)
 * @get_block: callback used (with create == 1) to map unmapped buffers
 *
 * Buffer heads are created if the folio has none.  Each buffer overlapping
 * the write range is mapped if necessary; newly allocated buffers have the
 * parts outside the write range zeroed, while existing buffers that are only
 * partially overwritten and not uptodate are read in.
 *
 * Returns 0 on success, otherwise the error from @get_block, -EIO if a read
 * did not complete successfully, or the error from decryption.  On a
 * mapping/read error the new buffers within the range are zeroed.
 */
int ext4_block_write_begin(handle_t *handle, struct folio *folio,
                           loff_t pos, unsigned len,
                           get_block_t *get_block)
{
        unsigned from = pos & (PAGE_SIZE - 1);
        unsigned to = from + len;
        struct inode *inode = folio->mapping->host;
        unsigned block_start, block_end;
        sector_t block;
        int err = 0;
        unsigned blocksize = inode->i_sb->s_blocksize;
        unsigned bbits;
        /*
         * NOTE(review): wait[] has room for two buffers; presumably only the
         * first and the last block of the range can be partially covered -
         * confirm.
         */
        struct buffer_head *bh, *head, *wait[2];
        int nr_wait = 0;
        int i;
        bool should_journal_data = ext4_should_journal_data(inode);

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(from > PAGE_SIZE);
        BUG_ON(to > PAGE_SIZE);
        BUG_ON(from > to);

        head = folio_buffers(folio);
        if (!head)
                head = create_empty_buffers(folio, blocksize, 0);
        bbits = ilog2(blocksize);
        /* Logical block number of the first buffer in the folio. */
        block = (sector_t)folio->index << (PAGE_SHIFT - bbits);

        /* One pass around the circular buffer list of the folio. */
        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start = block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Buffer entirely outside the range being written. */
                if (block_end <= from || block_start >= to) {
                        if (folio_test_uptodate(folio)) {
                                set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                break;
                        if (buffer_new(bh)) {
                                /*
                                 * We may be zeroing partial buffers or all new
                                 * buffers in case of failure. Prepare JBD2 for
                                 * that.
                                 */
                                if (should_journal_data)
                                        do_journal_get_write_access(handle,
                                                                    inode, bh);
                                if (folio_test_uptodate(folio)) {
                                        /*
                                         * Unlike __block_write_begin() we leave
                                         * dirtying of new uptodate buffers to
                                         * ->write_end() time or
                                         * folio_zero_new_buffers().
                                         */
                                        set_buffer_uptodate(bh);
                                        continue;
                                }
                                /*
                                 * Zero the parts of the new block that the
                                 * write will not cover.
                                 */
                                if (block_end > to || block_start < from)
                                        folio_zero_segments(folio, to,
                                                            block_end,
                                                            block_start, from);
                                continue;
                        }
                }
                if (folio_test_uptodate(folio)) {
                        set_buffer_uptodate(bh);
                        continue;
                }
                /*
                 * A partially overwritten buffer whose current contents are
                 * needed: start a read and remember to wait for it.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                    (block_start < from || block_end > to)) {
                        ext4_read_bh_lock(bh, 0, false);
                        wait[nr_wait++] = bh;
                }
        }
        /*
         * If we issued read requests, let them complete.
         */
        for (i = 0; i < nr_wait; i++) {
                wait_on_buffer(wait[i]);
                if (!buffer_uptodate(wait[i]))
                        err = -EIO;
        }
        if (unlikely(err)) {
                /* Failure: zero the new buffers within the write range. */
                if (should_journal_data)
                        ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                         from, to);
                else
                        folio_zero_new_buffers(folio, from, to);
        } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                /* Decrypt in place every block that was just read. */
                for (i = 0; i < nr_wait; i++) {
                        int err2;

                        err2 = fscrypt_decrypt_pagecache_blocks(folio,
                                                blocksize, bh_offset(wait[i]));
                        if (err2) {
                                clear_buffer_uptodate(wait[i]);
                                err = err2;
                        }
                }
        }

        return err;
}

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext4_get_block()
 * and the ext4_write_end().  So doing the jbd2_journal_start at the start of
 * ext4_write_begin() is the right place.
 *
 * @file:    not used in this function
 * @mapping: address space being written; mapping->host is the inode
 * @pos:     file position of the write
 * @len:     number of bytes to be written
 * @foliop:  on success, receives the locked folio prepared for the write
 * @fsdata:  not used in this function
 *
 * Returns 0 on success, with the journal handle started here left running,
 * or a negative error.  On error the handle is stopped, the folio is
 * released and, if the write would have extended the file, the blocks
 * instantiated beyond i_size are truncated away again.  An -ENOSPC failure
 * is retried for as long as ext4_should_retry_alloc() allows.
 */
static int ext4_write_begin(struct file *file, struct address_space *mapping,
                            loff_t pos, unsigned len,
                            struct folio **foliop, void **fsdata)
{
        struct inode *inode = mapping->host;
        int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct folio *folio;
        pgoff_t index;
        unsigned from, to;

        ret = ext4_emergency_state(inode->i_sb);
        if (unlikely(ret))
                return ret;

        trace_ext4_write_begin(inode, pos, len);
        /*
         * Reserve one block more for addition to orphan list in case
         * we allocate blocks but write fails for some reason
         */
        needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_SHIFT;
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        /*
         * Inline data path: a return of 1 ends the function successfully
         * here (presumably the helper has then set up *foliop itself -
         * confirm against ext4_try_to_write_inline_data()); 0 falls through
         * to the regular block-based path.
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
                                                    foliop);
                if (ret < 0)
                        return ret;
                if (ret == 1)
                        return 0;
        }

        /*
         * __filemap_get_folio() can take a long time if the
         * system is thrashing due to memory pressure, or if the folio
         * is being written back.  So grab it first before we start
         * the transaction handle.  This also allows us to allocate
         * the folio (if needed) without using GFP_NOFS.
         */
retry_grab:
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);
        /*
         * The same as page allocation, we prealloc buffer heads before
         * starting the handle.
         */
        if (!folio_buffers(folio))
                create_empty_buffers(folio, inode->i_sb->s_blocksize, 0);

        /* Drop the folio lock (the reference is kept) across handle start. */
        folio_unlock(folio);

retry_journal:
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
        if (IS_ERR(handle)) {
                folio_put(folio);
                return PTR_ERR(handle);
        }

        folio_lock(folio);
        if (folio->mapping != mapping) {
                /* The folio got truncated from under us */
                folio_unlock(folio);
                folio_put(folio);
                ext4_journal_stop(handle);
                goto retry_grab;
        }
        /* In case writeback began while the folio was unlocked */
        folio_wait_stable(folio);

        /*
         * With dioread_nolock the blocks are instantiated as unwritten
         * extents; otherwise they are allocated normally.
         */
        if (ext4_should_dioread_nolock(inode))
                ret = ext4_block_write_begin(handle, folio, pos, len,
                                             ext4_get_block_unwritten);
        else
                ret = ext4_block_write_begin(handle, folio, pos, len,
                                             ext4_get_block);
        /* data=journal: get write access to every buffer in the range. */
        if (!ret && ext4_should_journal_data(inode)) {
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio), from, to,
                                             NULL, do_journal_get_write_access);
        }

        if (ret) {
                bool extended = (pos + len > inode->i_size) &&
                                !ext4_verity_in_progress(inode);

                folio_unlock(folio);
                /*
                 * ext4_block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_rwsem.
                 *
                 * Add inode to orphan list in case we crash before
                 * truncate finishes
                 */
                if (extended && ext4_can_truncate(inode))
                        ext4_orphan_add(handle, inode);

                ext4_journal_stop(handle);
                if (extended) {
                        ext4_truncate_failed_write(inode);
                        /*
                         * If truncate failed early the inode might
                         * still be on the orphan list; we need to
                         * make sure the inode is removed from the
                         * orphan list in that case.
                         */
                        if (inode->i_nlink)
                                ext4_orphan_del(NULL, inode);
                }

                /*
                 * The folio reference is still held here, so a retry only
                 * needs a new handle and the folio lock.
                 */
                if (ret == -ENOSPC &&
                    ext4_should_retry_alloc(inode->i_sb, &retries))
                        goto retry_journal;
                folio_put(folio);
                return ret;
        }
        *foliop = folio;
        return ret;
}

/*
 * Per-buffer callback for write_end() in data=journal mode.
 *
 * Unmapped or freed buffers are skipped (returning 0).  Any other buffer is
 * marked uptodate and dirtied as journalled data, after which its meta and
 * prio flags are cleared.  Returns the result of the dirtying step.
 */
static int write_end_fn(handle_t *handle, struct inode *inode,
                        struct buffer_head *bh)
{
        int err = 0;

        if (buffer_mapped(bh) && !buffer_freed(bh)) {
                set_buffer_uptodate(bh);
                err = ext4_dirty_journalled_data(handle, bh);
                clear_buffer_meta(bh);
                clear_buffer_prio(bh);
        }

        return err;
}

/*
 * ext4_write_end - finish a buffered write on the current journal handle
 *
 * Inodes with inline data (and EXT4_STATE_MAY_INLINE_DATA set) are handed
 * off to ext4_write_inline_data_end().  Otherwise the copied bytes are
 * committed with block_write_end(), i_size is updated (unless fs-verity is
 * being enabled on the inode), the folio is unlocked and released, and the
 * current journal handle is stopped.  If pos + len still lies beyond i_size
 * afterwards, blocks instantiated past i_size are trimmed again with
 * ext4_truncate_failed_write().
 *
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext4 never places buffers on inode->i_mapping->i_private_list.  metadata
 * buffers are managed internally.
 *
 * Returns the number of bytes copied, or the first non-zero error reported
 * by ext4_mark_inode_dirty() or ext4_journal_stop().
 */
static int ext4_write_end(struct file *file,
                          struct address_space *mapping,
                          loff_t pos, unsigned len, unsigned copied,
                          struct folio *folio, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        /* i_size as it was before this write possibly extends it */
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int i_size_changed = 0;
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_write_end(inode, pos, len, copied);

        if (ext4_has_inline_data(inode) &&
            ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);
        /*
         * it's important to update i_size while still holding folio lock:
         * page writeout could otherwise come in and zero beyond i_size.
         *
         * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree
         * blocks are being written past EOF, so skip the i_size update.
         */
        if (!verity)
                i_size_changed = ext4_update_inode_size(inode, pos + copied);
        folio_unlock(folio);
        folio_put(folio);

        /*
         * The write started beyond the old EOF: tell the page cache that
         * i_size was extended and zero the partial blocks in
         * [old_size, pos).
         */
        if (old_size < pos && !verity) {
                pagecache_isize_extended(inode, old_size, pos);
                ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
        }
        /*
         * Don't mark the inode dirty under folio lock. First, it unnecessarily
         * makes the holding time of folio lock longer. Second, it forces lock
         * ordering of folio lock and transaction start for journaling
         * filesystems.
         */
        if (i_size_changed)
                ret = ext4_mark_inode_dirty(handle, inode);

        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /*
                 * More blocks may have been allocated than were copied
                 * into, leaving blocks beyond inode->i_size.  Put the
                 * inode on the orphan list before stopping the handle;
                 * the truncate itself happens below.
                 */
                ext4_orphan_add(handle, inode);

        ret2 = ext4_journal_stop(handle);
        /* Keep the earlier error, if any, in preference to ret2. */
        if (!ret)
                ret = ret2;

        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * Private counterpart of folio_zero_new_buffers() for data=journalled mode.
 *
 * Walks every buffer of @folio; each buffer that is flagged new and overlaps
 * the byte range [@from, @to) has the overlapping part zeroed (only when the
 * folio is not uptodate), loses its new flag, and is passed to
 * write_end_fn() - which goes through ext4_dirty_journalled_data() rather
 * than marking the buffer dirty directly.
 */
static void ext4_journalled_zero_new_buffers(handle_t *handle,
                                            struct inode *inode,
                                            struct folio *folio,
                                            unsigned from, unsigned to)
{
        struct buffer_head *first = folio_buffers(folio);
        struct buffer_head *cur = first;
        unsigned int bh_start = 0;

        do {
                unsigned int bh_end = bh_start + cur->b_size;

                if (buffer_new(cur) && bh_end > from && bh_start < to) {
                        if (!folio_test_uptodate(folio)) {
                                unsigned int zero_from = max(from, bh_start);
                                unsigned int zero_len = min(to, bh_end) -
                                                        zero_from;

                                folio_zero_range(folio, zero_from, zero_len);
                        }
                        clear_buffer_new(cur);
                        write_end_fn(handle, inode, cur);
                }

                /* Advance to the next buffer in the folio's ring. */
                bh_start = bh_end;
                cur = cur->b_this_page;
        } while (cur != first);
}

/*
 * ext4_journalled_write_end - finish a buffered write in data=journal mode
 *
 * Requires a valid current journal handle (BUG otherwise).  Inodes with
 * inline data are handed off to ext4_write_inline_data_end().  Otherwise
 * the written buffers are run through write_end_fn(), i_size is updated
 * (unless fs-verity is being enabled on the inode), the folio is unlocked
 * and released, and the handle is stopped.  If pos + len still lies beyond
 * i_size afterwards, blocks instantiated past i_size are trimmed again with
 * ext4_truncate_failed_write().
 *
 * Returns the number of bytes accepted (0 for a short copy into a folio that
 * is not uptodate), or the first non-zero error encountered.
 */
static int ext4_journalled_write_end(struct file *file,
                                     struct address_space *mapping,
                                     loff_t pos, unsigned len, unsigned copied,
                                     struct folio *folio, void *fsdata)
{
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        /* i_size as it was before this write possibly extends it */
        loff_t old_size = inode->i_size;
        int ret = 0, ret2;
        int partial = 0;
        unsigned from, to;
        int size_changed = 0;
        bool verity = ext4_verity_in_progress(inode);

        trace_ext4_journalled_write_end(inode, pos, len, copied);
        /* Byte range of the write, relative to the start of the page. */
        from = pos & (PAGE_SIZE - 1);
        to = from + len;

        BUG_ON(!ext4_handle_valid(handle));

        if (ext4_has_inline_data(inode))
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
                /*
                 * Short copy into a folio that is not uptodate: accept
                 * nothing and zero the new buffers over the whole range.
                 */
                copied = 0;
                ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                 from, to);
        } else {
                /* Short copy: zero new buffers past the copied bytes. */
                if (unlikely(copied < len))
                        ext4_journalled_zero_new_buffers(handle, inode, folio,
                                                         from + copied, to);
                ret = ext4_walk_page_buffers(handle, inode,
                                             folio_buffers(folio),
                                             from, from + copied, &partial,
                                             write_end_fn);
                if (!partial)
                        folio_mark_uptodate(folio);
        }
        /* As in ext4_write_end(): update i_size before the folio is unlocked. */
        if (!verity)
                size_changed = ext4_update_inode_size(inode, pos + copied);
        /* Record the tid of the handle's transaction in i_datasync_tid. */
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
        folio_unlock(folio);
        folio_put(folio);

        /*
         * The write started beyond the old EOF: tell the page cache that
         * i_size was extended and zero the partial blocks in
         * [old_size, pos).
         */
        if (old_size < pos && !verity) {
                pagecache_isize_extended(inode, old_size, pos);
                ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
        }

        if (size_changed) {
                ret2 = ext4_mark_inode_dirty(handle, inode);
                if (!ret)
                        ret = ret2;
        }

        if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode))
                /*
                 * More blocks may have been allocated than were copied
                 * into, leaving blocks beyond inode->i_size.  Put the
                 * inode on the orphan list before stopping the handle;
                 * the truncate itself happens below.
                 */
                ext4_orphan_add(handle, inode);

        ret2 = ext4_journal_stop(handle);
        /* Keep the earlier error, if any, in preference to ret2. */
        if (!ret)
                ret = ret2;
        if (pos + len > inode->i_size && !verity) {
                ext4_truncate_failed_write(inode);
                /*
                 * If truncate failed early the inode might still be
                 * on the orphan list; we need to make sure the inode
                 * is removed from the orphan list in that case.
                 */
                if (inode->i_nlink)
                        ext4_orphan_del(NULL, inode);
        }

        return ret ? ret : copied;
}

/*
 * Reserve space for 'nr_resv' clusters.
 *
 * Quota is reserved first, then free clusters are claimed under
 * i_block_reservation_lock.  If the claim fails the quota reservation is
 * rolled back and -ENOSPC is returned.  Returns 0 on success, or the error
 * from dquot_reserve_block().
 */
static int ext4_da_reserve_space(struct inode *inode, int nr_resv)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        int err;

        /*
         * Only data is reserved here.  Metadata quota is charged at
         * writeout time, which avoids over-estimating metadata at the
         * cost of possibly going over by a small amount in the end.
         */
        err = dquot_reserve_block(inode, EXT4_C2B(sbi, nr_resv));
        if (err)
                return err;

        spin_lock(&ei->i_block_reservation_lock);
        err = ext4_claim_free_clusters(sbi, nr_resv, 0);
        if (!err) {
                ei->i_reserved_data_blocks += nr_resv;
                trace_ext4_da_reserve_space(inode, nr_resv);
        }
        spin_unlock(&ei->i_block_reservation_lock);

        if (err) {
                /* Could not claim the clusters: undo the quota reservation. */
                dquot_release_reservation_block(inode, EXT4_C2B(sbi, nr_resv));
                return -ENOSPC;
        }

        return 0;
}

/*
 * Give back a delayed-allocation reservation of 'to_free' clusters.
 *
 * Under i_block_reservation_lock the inode's reserved count and the
 * filesystem-wide dirty cluster counter are both reduced; the matching
 * quota reservation is dropped after the lock is released.  A request
 * larger than what the inode has reserved is reported (warning plus
 * WARN_ON) and clamped to the reserved amount.
 */
void ext4_da_release_space(struct inode *inode, int to_free)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        if (to_free == 0)
                return;         /* Nothing to release */

        spin_lock(&ei->i_block_reservation_lock);

        trace_ext4_da_release_space(inode, to_free);
        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
                 * Fewer blocks are reserved than we were asked to free,
                 * so the counter got out of sync somewhere.  Complain
                 * and release only what is actually reserved.
                 */
                ext4_warning(inode->i_sb, "ext4_da_release_space: "
                         "ino %lu, to_free %d with only %d reserved "
                         "data blocks", inode->i_ino, to_free,
                         ei->i_reserved_data_blocks);
                WARN_ON(1);
                to_free = ei->i_reserved_data_blocks;
        }
        ei->i_reserved_data_blocks -= to_free;

        /* Keep the fs-wide dirty cluster counter in step. */
        percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free);

        spin_unlock(&ei->i_block_reservation_lock);

        dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free));
}

/*
 * Delayed allocation stuff
 */

/*
 * State shared by the writepages helpers below (mpage_*): the range of
 * pages being processed, the extent of blocks collected for mapping, and
 * the IO submission context.
 */
struct mpage_da_data {
        /* These are input fields for ext4_do_writepages() */
        struct inode *inode;
        struct writeback_control *wbc;
        unsigned int can_map:1;        /* Can writepages call map blocks? */

        /* These are internal state of ext4_do_writepages() */
        pgoff_t first_page;        /* The first page to write */
        pgoff_t next_page;        /* Current page to examine */
        pgoff_t last_page;        /* Last page to examine */
        /*
         * Extent to map - this can be after first_page because that can be
         * fully mapped. We somewhat abuse m_flags to store whether the extent
         * is delalloc or unwritten.
         */
        struct ext4_map_blocks map;
        struct ext4_io_submit io_submit;        /* IO submission data */
        /*
         * When clear, mpage_add_bh_to_extent() refuses to start a new extent
         * to map ("we cannot map unless handle is started").
         */
        unsigned int do_map:1;
        /*
         * Set by mpage_process_page_bufs() once the scan reaches the block
         * at or past EOF; cleared again by mpage_release_unused_pages() when
         * it has pages to release.
         */
        unsigned int scanned_until_end:1;
        /*
         * NOTE(review): not referenced in this part of the file; presumably
         * records that more data was journalled during writeback - confirm
         * against ext4_do_writepages().
         */
        unsigned int journalled_more_data:1;
};

/*
 * Unlock the folios in [mpd->first_page, mpd->next_page) that were locked
 * but not submitted.
 *
 * When @invalidate is true the corresponding range is first removed from
 * the extent status tree, and each folio additionally has its buffers
 * invalidated and its uptodate flag cleared before being unlocked.
 * Folios that are not entirely inside the range are skipped.
 */
static void mpage_release_unused_pages(struct mpage_da_data *mpd,
                                       bool invalidate)
{
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        struct folio_batch fbatch;
        pgoff_t cur, last_idx;
        unsigned int found, k;

        /* This is necessary when next_page == 0. */
        if (mpd->next_page <= mpd->first_page)
                return;

        mpd->scanned_until_end = 0;
        cur = mpd->first_page;
        last_idx = mpd->next_page - 1;

        if (invalidate) {
                ext4_lblk_t first_blk, last_blk;

                first_blk = cur << (PAGE_SHIFT - inode->i_blkbits);
                last_blk = last_idx << (PAGE_SHIFT - inode->i_blkbits);

                /*
                 * i_data_sem is held for write so that we do not race
                 * with the extent status tree scans made by
                 * ext4_insert_delayed_block().
                 */
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_es_remove_extent(inode, first_blk,
                                      last_blk - first_blk + 1);
                up_write(&EXT4_I(inode)->i_data_sem);
        }

        folio_batch_init(&fbatch);
        while (cur <= last_idx) {
                found = filemap_get_folios(mapping, &cur, last_idx, &fbatch);
                if (!found)
                        break;

                for (k = 0; k < found; k++) {
                        struct folio *folio = fbatch.folios[k];

                        /* Only touch folios lying fully within the range. */
                        if (folio->index < mpd->first_page ||
                            folio_next_index(folio) - 1 > last_idx)
                                continue;

                        BUG_ON(!folio_test_locked(folio));
                        BUG_ON(folio_test_writeback(folio));
                        if (invalidate) {
                                if (folio_mapped(folio))
                                        folio_clear_dirty_for_io(folio);
                                block_invalidate_folio(folio, 0,
                                                folio_size(folio));
                                folio_clear_uptodate(folio);
                        }
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
        }
}

/*
 * Dump the filesystem's free/dirty cluster accounting (converted to blocks)
 * and this inode's delalloc reservation to the kernel log at KERN_CRIT.
 *
 * Every value printed with %lld is explicitly cast to long long so the
 * argument type always matches the format specifier.
 */
static void ext4_print_free_blocks(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
                 (long long) EXT4_C2B(sbi, ext4_count_free_clusters(sb)));
        ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
        ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
                 (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_freeclusters_counter)));
        ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
                 (long long) EXT4_C2B(sbi,
                        percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
        ext4_msg(sb, KERN_CRIT, "Block reservation details");
        ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
                 ei->i_reserved_data_blocks);
}

/*
 * Report whether the cluster containing lblk is already allocated or
 * carries a delalloc reservation.
 *
 * Returns:
 *   0   - neither,
 *   1   - the cluster has a delalloc reservation,
 *   2   - the cluster has already been allocated,
 *   < 0 - error code from ext4_clu_mapped().
 */
static int ext4_clu_alloc_state(struct inode *inode, ext4_lblk_t lblk)
{
        int mapped;

        /* A delayed extent in the cluster means it is already reserved. */
        if (ext4_es_scan_clu(inode, &ext4_es_is_delayed, lblk))
                return 1;

        /* A mapped extent in the extent status tree: already allocated. */
        if (ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk))
                return 2;

        /* No match in the extent status tree; ask ext4_clu_mapped(). */
        mapped = ext4_clu_mapped(inode, EXT4_B2C(EXT4_SB(inode->i_sb), lblk));
        if (mapped < 0)
                return mapped;

        return mapped ? 2 : 0;
}

/*
 * ext4_insert_delayed_blocks - add a range of delayed blocks to the extents
 *                              status tree, reserving clusters/blocks or
 *                              noting already-allocated edge clusters as
 *                              needed
 *
 * @inode - file containing the newly added blocks
 * @lblk - first logical block to be added
 * @len - number of blocks to be added
 *
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_insert_delayed_blocks(struct inode *inode, ext4_lblk_t lblk,
                                      ext4_lblk_t len)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t end = lblk + len - 1;
        bool first_clu_alloc = false;
        bool last_clu_alloc = false;
        int err;

        if (sbi->s_cluster_ratio == 1) {
                /* Not bigalloc: reserve one unit per block. */
                err = ext4_da_reserve_space(inode, len);
                if (err)        /* ENOSPC */
                        return err;
        } else {
                /*
                 * bigalloc: a cluster at either edge of the range that is
                 * shared with a delayed, written, or unwritten extent has
                 * already been accounted for and must not be reserved
                 * again.  If it is shared with a written or unwritten
                 * extent, that fact is passed on to the extent status tree
                 * so a pending reservation can be made.  Written and
                 * unwritten extents can be purged from the extents status
                 * tree under memory pressure, which is why
                 * ext4_clu_alloc_state() also consults the extent tree.
                 */
                ext4_lblk_t first_clu = EXT4_B2C(sbi, lblk);
                ext4_lblk_t last_clu = EXT4_B2C(sbi, end);
                ext4_lblk_t nr_clu = last_clu - first_clu + 1;
                int state;

                state = ext4_clu_alloc_state(inode, lblk);
                if (state < 0)
                        return state;
                if (state > 0) {
                        nr_clu--;
                        first_clu_alloc = (state == 2);
                }

                /* Check the last cluster only if it is a different one. */
                if (first_clu != last_clu) {
                        state = ext4_clu_alloc_state(inode, end);
                        if (state < 0)
                                return state;
                        if (state > 0) {
                                nr_clu--;
                                last_clu_alloc = (state == 2);
                        }
                }

                if (nr_clu) {
                        err = ext4_da_reserve_space(inode, nr_clu);
                        if (err)        /* ENOSPC */
                                return err;
                }
        }

        ext4_es_insert_delayed_extent(inode, lblk, len, first_clu_alloc,
                                      last_clu_alloc);
        return 0;
}

/*
 * Look up the requested blocks and fill in the delalloc extent map.
 *
 * The lookup proceeds in three stages:
 *   1. search the extent status tree without holding i_data_sem;
 *   2. query the on-disk extent mapping with i_data_sem held for read;
 *   3. take i_data_sem for write, repeat the lookup and, if there is
 *      still no extent, insert a delalloc extent entry.
 *
 * The resulting extent is passed out through @map: m_len may be trimmed,
 * m_flags is set, and m_pblk is set when the extent has a physical block.
 * Returns 0 on success, negative error code on failure.
 */
static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
{
        struct extent_status es;
        int retval;
#ifdef ES_AGGRESSIVE_TEST
        struct ext4_map_blocks orig_map;

        memcpy(&orig_map, map, sizeof(*map));
#endif

        map->m_flags = 0;
        ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
                  (unsigned long) map->m_lblk);

        /* Lookup extent status tree firstly */
        if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                /* Trim the request to what is left of the cached extent. */
                map->m_len = min_t(unsigned int, map->m_len,
                                   es.es_len - (map->m_lblk - es.es_lblk));

                /* A cached hole: go straight to inserting a delayed extent. */
                if (ext4_es_is_hole(&es))
                        goto add_delayed;

                /*
                 * Also entered by goto from the add_delayed path below when
                 * the second lookup finds a non-hole extent; 'es' is valid
                 * and i_data_sem has been dropped at that point.
                 */
found:
                /*
                 * Delayed extent could be allocated by fallocate.
                 * So we need to check it.
                 */
                if (ext4_es_is_delayed(&es)) {
                        map->m_flags |= EXT4_MAP_DELAYED;
                        return 0;
                }

                map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk;
                if (ext4_es_is_written(&es))
                        map->m_flags |= EXT4_MAP_MAPPED;
                else if (ext4_es_is_unwritten(&es))
                        map->m_flags |= EXT4_MAP_UNWRITTEN;
                else
                        BUG();

#ifdef ES_AGGRESSIVE_TEST
                ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0);
#endif
                return 0;
        }

        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read(&EXT4_I(inode)->i_data_sem);
        if (ext4_has_inline_data(inode))
                retval = 0;
        else
                retval = ext4_map_query_blocks(NULL, inode, map);
        up_read(&EXT4_I(inode)->i_data_sem);
        /* Non-zero: either an error (< 0) or a mapping was found (> 0). */
        if (retval)
                return retval < 0 ? retval : 0;

add_delayed:
        down_write(&EXT4_I(inode)->i_data_sem);
        /*
         * Page fault path (ext4_page_mkwrite does not take i_rwsem)
         * and fallocate path (no folio lock) can race. Make sure we
         * lookup the extent status tree here again while i_data_sem
         * is held in write mode, before inserting a new da entry in
         * the extent status tree.
         */
        if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
                map->m_len = min_t(unsigned int, map->m_len,
                                   es.es_len - (map->m_lblk - es.es_lblk));

                /* Something other than a hole showed up: use it instead. */
                if (!ext4_es_is_hole(&es)) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto found;
                }
        } else if (!ext4_has_inline_data(inode)) {
                /* Nothing cached: repeat the mapping query under the lock. */
                retval = ext4_map_query_blocks(NULL, inode, map);
                if (retval) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        return retval < 0 ? retval : 0;
                }
        }

        /* Still nothing there: record the range as delayed. */
        map->m_flags |= EXT4_MAP_DELAYED;
        retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len);
        up_write(&EXT4_I(inode)->i_data_sem);

        return retval;
}

/*
 * get_block_t callback used by ext4_da_write_begin().  For the single block
 * at @iblock it either reports an existing mapping or sets up a delayed
 * allocation for it via ext4_da_map_blocks().
 *
 * @create must be non-zero and @bh must be exactly one filesystem block in
 * size (both enforced with BUG_ON).
 *
 * A delayed buffer_head is mapped to an invalid block number - one that is
 * not below the filesystem's block count - and gets BH_New and BH_Delay.
 *
 * An unwritten buffer_head is mapped to the physical block of the unwritten
 * extent and additionally gets BH_New and BH_Mapped.
 *
 * Returns 0 on success, negative error code from ext4_da_map_blocks() on
 * failure.
 */
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create)
{
        sector_t invalid_block = ~((sector_t) 0xffff);
        struct ext4_map_blocks map;
        int err;

        BUG_ON(create == 0);
        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);

        /* Make sure the marker is not a block number the fs could use. */
        if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es))
                invalid_block = ~0;

        map.m_lblk = iblock;
        map.m_len = 1;

        /*
         * First find out whether the block is allocated already.
         * Preallocated blocks are unmapped but should be treated the
         * same as allocated blocks.
         */
        err = ext4_da_map_blocks(inode, &map);
        if (err < 0)
                return err;

        if (!(map.m_flags & EXT4_MAP_DELAYED)) {
                map_bh(bh, inode->i_sb, map.m_pblk);
                ext4_update_bh_state(bh, map.m_flags);

                if (buffer_unwritten(bh)) {
                        /*
                         * A delayed write to an unwritten bh is marked
                         * new and mapped.  Mapped keeps us from calling
                         * get_block again for writes to the same offset;
                         * new makes a partial write zero out properly.
                         */
                        set_buffer_new(bh);
                        set_buffer_mapped(bh);
                }
        } else {
                map_bh(bh, inode->i_sb, invalid_block);
                set_buffer_new(bh);
                set_buffer_delay(bh);
        }

        return 0;
}

/*
 * Account @folio as handled: move mpd->first_page past all of its pages,
 * then drop the folio lock.
 */
static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio)
{
        mpd->first_page = mpd->first_page + folio_nr_pages(folio);
        folio_unlock(folio);
}

/*
 * Clear @folio's dirty state and hand it to ext4_bio_write_folio().
 *
 * @folio must be the folio at mpd->first_page.  For a folio reaching past
 * i_size (and no fs-verity enable in progress) only the part up to i_size
 * is written.  On success wbc->nr_to_write is decremented.
 * Returns 0 or the error from ext4_bio_write_folio().
 */
static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
{
        struct inode *inode = mpd->inode;
        loff_t size;
        size_t len;
        int err;

        BUG_ON(folio->index != mpd->first_page);
        folio_clear_dirty_for_io(folio);
        /*
         * Careful: nothing protects the writeback path against i_size
         * changes, and the page may be writeably mapped into page tables,
         * so an application can grow i_size and write through mmap while
         * writeback runs.  folio_clear_dirty_for_io() write-protects the
         * page in page tables, and it cannot be written again until the
         * folio lock is released.  Hence i_size may only be sampled for
         * ext4_bio_write_folio() - which zeroes the tail of the written
         * page - after folio_clear_dirty_for_io().  The barrier provided by
         * folio_test_clear_dirty() inside folio_clear_dirty_for_io() makes
         * sure i_size is really sampled after page tables are updated.
         */
        size = i_size_read(inode);
        len = folio_size(folio);
        if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode))
                len = size & (len - 1);

        err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
        if (err)
                return err;

        mpd->wbc->nr_to_write--;
        return 0;
}

/*
 * Buffer state bits (unwritten, delay) that mpage_add_bh_to_extent() stores
 * in the m_flags of the extent being collected; a buffer is only merged
 * into that extent when these bits match.
 */
#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay))

/*
 * mballoc gives us at most this number of blocks...
 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
 * The rest of mballoc seems to handle chunks up to full group size.
 *
 * mpage_add_bh_to_extent() stops growing an extent once it has reached
 * this many blocks.
 */
#define MAX_WRITEPAGES_EXTENT_LEN 2048

/*
 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
 *
 * @mpd - extent of blocks
 * @lblk - logical number of the block in the file
 * @bh - buffer head we want to add to the extent
 *
 * Collects contiguous blocks that are in the same state.  A buffer needs
 * mapping for writeback when it is dirty, mapped, and either delayed or
 * unwritten.
 *
 * Returns true when
 *   - the buffer needs no mapping and no extent has been started yet (the
 *     caller can write the buffer right away), or
 *   - the buffer was added to the extent (starting it or extending it).
 * Returns false when the buffer could not be added.
 */
static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
                                   struct buffer_head *bh)
{
        struct ext4_map_blocks *map = &mpd->map;
        bool needs_mapping = buffer_dirty(bh) && buffer_mapped(bh) &&
                             (buffer_delay(bh) || buffer_unwritten(bh));
        bool extent_empty = (map->m_len == 0);

        /*
         * Nothing to map for this buffer: fine as long as no extent is
         * pending, otherwise the extent ends here.
         */
        if (!needs_mapping)
                return extent_empty;

        if (extent_empty) {
                /* We cannot map unless handle is started... */
                if (!mpd->do_map)
                        return false;

                /* Start a new extent with this block. */
                map->m_lblk = lblk;
                map->m_len = 1;
                map->m_flags = bh->b_state & BH_FLAGS;
                return true;
        }

        /* Don't go larger than mballoc is willing to allocate */
        if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
                return false;

        /* Only a directly following block in the same state can be merged. */
        if (lblk != map->m_lblk + map->m_len ||
            (bh->b_state & BH_FLAGS) != map->m_flags)
                return false;

        map->m_len++;
        return true;
}

/*
 * mpage_process_page_bufs - submit page buffers for IO or add them to extent
 *
 * @mpd - extent of blocks for mapping
 * @head - the first buffer in the page
 * @bh - buffer we should start processing from
 * @lblk - logical number of the block in the file corresponding to @bh
 *
 * Walks the page's buffers from @bh up to (but not including) @head.  If all
 * of them were mapped and no extent of buffers to map has been accumulated,
 * the page is submitted for IO; otherwise the buffers are added to the
 * extent to map.
 *
 * Returns 1 if the caller can continue with the next page, 0 if it should
 * stop adding buffers because the extent cannot be extended any further (or
 * the end of file was reached), and a value < 0 on IO submission error.
 */
static int mpage_process_page_bufs(struct mpage_da_data *mpd,
                                   struct buffer_head *head,
                                   struct buffer_head *bh,
                                   ext4_lblk_t lblk)
{
        struct inode *inode = mpd->inode;
        int ret;
        /* First block at or beyond i_size (i_size rounded up to blocks). */
        ext4_lblk_t eof_blk = (i_size_read(inode) + i_blocksize(inode) - 1)
                                                        >> inode->i_blkbits;

        /* While fs-verity is being enabled, no EOF limit is applied. */
        if (ext4_verity_in_progress(inode))
                eof_blk = EXT_MAX_BLOCKS;

        for (;;) {
                BUG_ON(buffer_locked(bh));

                if (lblk >= eof_blk ||
                    !mpage_add_bh_to_extent(mpd, lblk, bh)) {
                        /*
                         * Stop here if an extent to map has been found,
                         * or if the buffer needs mapping and the handle
                         * is not started.
                         */
                        if (mpd->map.m_len || !mpd->do_map)
                                return 0;
                        /* Everything mapped so far and we hit EOF */
                        break;
                }

                lblk++;
                bh = bh->b_this_page;
                if (bh == head)
                        break;
        }

        /* So far everything mapped? Submit the page for IO. */
        if (mpd->map.m_len == 0) {
                struct folio *folio = head->b_folio;

                ret = mpage_submit_folio(mpd, folio);
                if (ret < 0)
                        return ret;
                mpage_folio_done(mpd, folio);
        }

        if (lblk >= eof_blk) {
                mpd->scanned_until_end = 1;
                return 0;
        }
        return 1;
}

/*
 * mpage_process_folio - update folio buffers corresponding to changed extent
 *                         and may submit fully mapped page for IO
 * @mpd: description of extent to map, on return next extent to map
 * @folio: Contains these buffers.
 * @m_lblk: in: logical block number paired with the first buffer of @folio;
 *          out: logical block number at which the walk stopped.
 * @m_pblk: in: physical block to hand to the next delayed buffer;
 *          out: advanced by one for every delayed buffer that got a block.
 * @map_bh: determines on return whether this page requires any further
 *                  mapping or not.
 *
 * Scan given folio buffers corresponding to changed extent and update buffer
 * state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits.
 * If the given folio is not fully mapped, we update @mpd to the next extent in
 * the given folio that needs mapping & return @map_bh as true.
 *
 * Return: 0 on success (a positive result of mpage_process_page_bufs() is
 * folded into 0), or a negative errno coming from mpage_process_page_bufs()
 * or ext4_alloc_io_end_vec(). When ext4_alloc_io_end_vec() fails, @map_bh is
 * left untouched.
 */
static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio,
                              ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk,
                              bool *map_bh)
{
        struct buffer_head *head, *bh;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        ext4_lblk_t lblk = *m_lblk;
        ext4_fsblk_t pblock = *m_pblk;
        int err = 0;
        int blkbits = mpd->inode->i_blkbits;
        /* Bytes of this folio accounted to the current io_end_vec so far. */
        ssize_t io_end_size = 0;
        struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end);

        bh = head = folio_buffers(folio);
        do {
                /*
                 * Buffer before the start of the mapped extent: skip it.
                 * 'continue' in a do-while jumps to the loop condition, so
                 * lblk and bh are still advanced.
                 */
                if (lblk < mpd->map.m_lblk)
                        continue;
                if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
                        /*
                         * Buffer after end of mapped extent.
                         * Find next buffer in the folio to map.
                         */
                        mpd->map.m_len = 0;
                        mpd->map.m_flags = 0;
                        /* Flush the byte count gathered for the old extent. */
                        io_end_vec->size += io_end_size;

                        err = mpage_process_page_bufs(mpd, head, bh, lblk);
                        /* Positive return is not an error for our caller. */
                        if (err > 0)
                                err = 0;
                        /*
                         * The next extent to map does not start right at
                         * lblk, so it gets its own io_end_vec starting at the
                         * byte offset of the new extent.
                         */
                        if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) {
                                io_end_vec = ext4_alloc_io_end_vec(io_end);
                                if (IS_ERR(io_end_vec)) {
                                        err = PTR_ERR(io_end_vec);
                                        goto out;
                                }
                                io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits;
                        }
                        *map_bh = true;
                        goto out;
                }
                /* Delayed buffer: assign the next physical block to it. */
                if (buffer_delay(bh)) {
                        clear_buffer_delay(bh);
                        bh->b_blocknr = pblock++;
                }
                clear_buffer_unwritten(bh);
                /* One more block of this folio lies inside the extent. */
                io_end_size += (1 << blkbits);
        } while (lblk++, (bh = bh->b_this_page) != head);

        /* Walked all buffers of the folio without leaving the extent. */
        io_end_vec->size += io_end_size;
        *map_bh = false;
out:
        /* Hand the updated cursors back to the caller on every exit path. */
        *m_lblk = lblk;
        *m_pblk = pblock;
        return err;
}

/*
 * mpage_map_buffers - update buffers corresponding to changed extent and
 *                       submit fully mapped pages for IO
 *
 * @mpd - description of extent to map, on return next extent to map
 *
 * Scan buffers corresponding to changed extent (we expect corresponding pages
 * to be already locked) and update buffer state according to new extent state.
 * We map delalloc buffers to their physical location, clear unwritten bits,
 * and mark buffers as uninit when we perform writes to unwritten extents
 * and do extent conversion after IO is finished. If the last page is not fully
 * mapped, we update @map to the next extent in the last page that needs
 * mapping. Otherwise we submit the page for IO.
 */
static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
{
        struct folio_batch fbatch;
        unsigned nr, i;
        struct inode *inode = mpd->inode;
        int bpp_bits = PAGE_SHIFT - inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
        ext4_fsblk_t pblock;
        int err;
        bool map_bh = false;

        start = mpd->map.m_lblk >> bpp_bits;
        end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
        lblk = start << bpp_bits;
        pblock = mpd->map.m_pblk;

        folio_batch_init(&fbatch);
        while (start <= end) {
                nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch);
                if (nr == 0)
                        break;
                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        err = mpage_process_folio(mpd, folio, &lblk, &pblock,
                                                 &map_bh);
                        /*
                         * If map_bh is true, means page may require further bh
                         * mapping, or maybe the page was submitted for IO.
                         * So we return to call further extent mapping.
                         */
                        if (err < 0 || map_bh)
                                goto out;
                        /* Page fully mapped - let IO run! */
                        err = mpage_submit_folio(mpd, folio);
                        if (err < 0)
                                goto out;
                        mpage_folio_done(mpd, folio);
                }
                folio_batch_release(&fbatch);
        }
        /* Extent fully mapped and matches with page boundary. We are done. */
        mpd->map.m_len = 0;
        mpd->map.m_flags = 0;
        return 0;
out:
        folio_batch_release(&fbatch);
        return err;
}

/*
 * mpage_map_one_extent - map the extent described by mpd->map
 *
 * @handle - handle for journal operations
 * @mpd - writeback state; mpd->map is passed to ext4_map_blocks()
 *
 * Returns 0 on success or the negative value returned by ext4_map_blocks().
 */
static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
{
        struct inode *inode = mpd->inode;
        struct ext4_map_blocks *map = &mpd->map;
        ext4_io_end_t *io_end;
        int use_dioread_nolock;
        int flags;
        int ret;

        trace_ext4_da_write_pages_extent(inode, map);

        /*
         * ext4_map_blocks() is asked either to allocate the delayed
         * allocation blocks or to convert an unwritten extent to initialized
         * (we have written into one or more preallocated blocks). More
         * metadata blocks than previously reserved may be needed, but we are
         * in writeback and a failure here might result in data loss, so let
         * the allocator use reserved blocks for metadata if possible.
         */
        flags = EXT4_GET_BLOCKS_CREATE |
                EXT4_GET_BLOCKS_METADATA_NOFAIL |
                EXT4_GET_BLOCKS_IO_SUBMIT;

        use_dioread_nolock = ext4_should_dioread_nolock(inode);
        if (use_dioread_nolock)
                flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;

        ret = ext4_map_blocks(handle, inode, map, flags);
        if (ret < 0)
                return ret;

        if (use_dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) {
                io_end = mpd->io_submit.io_end;
                /*
                 * Move the reserved handle over to the io_end, unless the
                 * io_end already has one or the handle is not valid.
                 */
                if (!io_end->handle && ext4_handle_valid(handle)) {
                        io_end->handle = handle->h_rsv_handle;
                        handle->h_rsv_handle = NULL;
                }
                ext4_set_io_unwritten_flag(io_end);
        }

        BUG_ON(map->m_len == 0);
        return 0;
}

/*
 * mpage_map_and_submit_extent - map extent starting at mpd->map.m_lblk of
 *                                 length mpd->map.m_len and submit pages
 *                                 underlying it for IO
 *
 * @handle - handle for journal operations
 * @mpd - extent to map
 * @give_up_on_write - we set this to true iff there is a fatal error and there
 *                     is no hope of writing the data. The caller should discard
 *                     dirty pages to avoid infinite loops.
 *
 * The function maps extent starting at mpd->map.m_lblk of length
 * mpd->map.m_len. If it is
 * delayed, blocks are allocated, if it is unwritten, we may need to convert
 * them to initialized or split the described range from larger unwritten
 * extent. Note that we need not map all the described range since allocation
 * can return less blocks or the range is covered by more unwritten extents. We
 * cannot map more because we are limited by reserved transaction credits. On
 * the other hand we always make sure that the last touched page is fully
 * mapped so that it can be written out (and thus forward progress is
 * guaranteed). After mapping we submit all mapped pages for IO.
 *
 * Returns 0 on success or a negative errno. A mapping failure that is
 * considered transient (-ENOMEM, or -ENOSPC while free clusters remain) is
 * returned without setting @give_up_on_write; if at least one extent had been
 * mapped before, i_disksize is still updated first. If everything else
 * succeeded, a failure of ext4_mark_inode_dirty() is returned.
 */
static int mpage_map_and_submit_extent(handle_t *handle,
                                       struct mpage_da_data *mpd,
                                       bool *give_up_on_write)
{
        struct inode *inode = mpd->inode;
        struct ext4_map_blocks *map = &mpd->map;
        int err;
        loff_t disksize;
        /* Set once at least one extent has been mapped successfully. */
        int progress = 0;
        ext4_io_end_t *io_end = mpd->io_submit.io_end;
        struct ext4_io_end_vec *io_end_vec;

        /* The io_end_vec starts at the byte offset of the extent to map. */
        io_end_vec = ext4_alloc_io_end_vec(io_end);
        if (IS_ERR(io_end_vec))
                return PTR_ERR(io_end_vec);
        io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits;
        do {
                err = mpage_map_one_extent(handle, mpd);
                if (err < 0) {
                        struct super_block *sb = inode->i_sb;

                        /* No error message in emergency state, just give up. */
                        if (ext4_emergency_state(sb))
                                goto invalidate_dirty_pages;
                        /*
                         * Let the upper layers retry transient errors.
                         * In the case of ENOSPC, if ext4_count_free_clusters()
                         * is non-zero, a commit should free up blocks.
                         */
                        if ((err == -ENOMEM) ||
                            (err == -ENOSPC && ext4_count_free_clusters(sb))) {
                                if (progress)
                                        goto update_disksize;
                                return err;
                        }
                        ext4_msg(sb, KERN_CRIT,
                                 "Delayed block allocation failed for "
                                 "inode %lu at logical offset %llu with"
                                 " max blocks %u with error %d",
                                 inode->i_ino,
                                 (unsigned long long)map->m_lblk,
                                 (unsigned)map->m_len, -err);
                        ext4_msg(sb, KERN_CRIT,
                                 "This should not happen!! Data will "
                                 "be lost\n");
                        if (err == -ENOSPC)
                                ext4_print_free_blocks(inode);
                invalidate_dirty_pages:
                        *give_up_on_write = true;
                        return err;
                }
                progress = 1;
                /*
                 * Update buffer state, submit mapped pages, and get us new
                 * extent to map
                 */
                err = mpage_map_and_submit_buffers(mpd);
                if (err < 0)
                        goto update_disksize;
        /* mpage_map_and_submit_buffers() leaves m_len != 0 if more is to map */
        } while (map->m_len);

update_disksize:
        /*
         * Update on-disk size after IO is submitted.  Races with
         * truncate are avoided by checking i_size under i_data_sem.
         */
        /*
         * NOTE(review): this presumes mpd->first_page has been moved past the
         * submitted pages (presumably by mpage_folio_done(), which is not
         * visible here) - confirm.
         */
        disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
        /* Unlocked peek first; the decisive comparison is redone under lock. */
        if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
                int err2;
                loff_t i_size;

                down_write(&EXT4_I(inode)->i_data_sem);
                i_size = i_size_read(inode);
                /* Never let i_disksize grow beyond the in-memory i_size. */
                if (disksize > i_size)
                        disksize = i_size;
                if (disksize > EXT4_I(inode)->i_disksize)
                        EXT4_I(inode)->i_disksize = disksize;
                up_write(&EXT4_I(inode)->i_data_sem);
                err2 = ext4_mark_inode_dirty(handle, inode);
                if (err2) {
                        ext4_error_err(inode->i_sb, -err2,
                                       "Failed to mark inode %lu dirty",
                                       inode->i_ino);
                }
                /* An earlier error takes precedence over err2. */
                if (!err)
                        err = err2;
        }
        return err;
}

/*
 * Number of credits to reserve for a single writepages iteration; called
 * from ext4_do_writepages(). One iteration maps an extent of at most
 * MAX_WRITEPAGES_EXTENT_LEN blocks and afterwards finishes mapping the last
 * partial page, i.e. at most MAX_WRITEPAGES_EXTENT_LEN + bpp - 1 blocks in
 * bpp different extents, where bpp is ext4_journal_blocks_per_page().
 */
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
        const int blocks_per_page = ext4_journal_blocks_per_page(inode);
        const int max_blocks = MAX_WRITEPAGES_EXTENT_LEN + blocks_per_page - 1;

        return ext4_meta_trans_blocks(inode, max_blocks, blocks_per_page);
}

/*
 * ext4_journal_folio_buffers - journal the first @len bytes of @folio
 *
 * Runs do_journal_get_write_access() and then write_end_fn() over the buffers
 * in [0, @len) of the folio, adds the byte range to the inode's jbd2 write
 * range and records the tid of the handle's transaction in i_datasync_tid.
 * All steps are always performed; the first non-zero result among them (in
 * that order) is returned, 0 if all of them succeeded.
 */
static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
                                     size_t len)
{
        struct inode *inode = folio->mapping->host;
        struct buffer_head *bufs = folio_buffers(folio);
        int access_err, write_end_err, jinode_err;

        access_err = ext4_walk_page_buffers(handle, inode, bufs, 0, len,
                                            NULL, do_journal_get_write_access);
        write_end_err = ext4_walk_page_buffers(handle, inode, bufs, 0, len,
                                               NULL, write_end_fn);
        jinode_err = ext4_jbd2_inode_add_write(handle, inode,
                                               folio_pos(folio), len);
        EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;

        /* Report the earliest failure. */
        if (access_err)
                return access_err;
        if (write_end_err)
                return write_end_err;
        return jinode_err;
}

/*
 * mpage_journal_page_buffers - journal a folio with pending delayed dirtying
 *
 * Clears the checked flag of @folio, charges one unit to wbc->nr_to_write and
 * journals the folio's buffers. If the folio reaches beyond i_size (and
 * verity is not in progress), only the part up to i_size is journalled.
 *
 * Returns the result of ext4_journal_folio_buffers().
 */
static int mpage_journal_page_buffers(handle_t *handle,
                                      struct mpage_da_data *mpd,
                                      struct folio *folio)
{
        struct inode *inode = mpd->inode;
        const loff_t isize = i_size_read(inode);
        const size_t fsize = folio_size(folio);
        size_t journal_len = fsize;

        folio_clear_checked(folio);
        mpd->wbc->nr_to_write -= 1;

        if (folio_pos(folio) + fsize > isize) {
                /* Folio straddles EOF: keep only the offset of i_size in it. */
                if (!ext4_verity_in_progress(inode))
                        journal_len = isize & (fsize - 1);
        }

        return ext4_journal_folio_buffers(handle, folio, journal_len);
}

/*
 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
 *                                  needing mapping, submit mapped pages
 *
 * @mpd - where to look for pages
 *
 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
 * IO immediately. If we cannot map blocks, we submit just already mapped
 * buffers in the page for IO and keep page dirty. When we can map blocks and
 * we find a page which isn't mapped we start accumulating extent of buffers
 * underlying these pages that needs mapping (formed by either delayed or
 * unwritten buffers). We also lock the pages containing these buffers. The
 * extent found is returned in @mpd structure (starting at mpd->map.m_lblk with
 * length mpd->map.m_len blocks).
 *
 * Note that this function can attach bios to one io_end structure which are
 * neither logically nor physically contiguous. Although it may seem as an
 * unnecessary complication, it is actually inevitable in blocksize < pagesize
 * case as we need to track IO to all buffers underlying a page in one io_end.
 *
 * For inodes with journalled data a journal handle is started for the
 * duration of the scan and stopped again on every exit path.
 * mpd->scanned_until_end is set when the scan ran out of tagged folios in
 * [mpd->first_page, mpd->last_page].
 *
 * Returns 0 when the scan stopped without error (whether or not an extent was
 * accumulated), negative errno on failure.
 */
static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
{
        struct address_space *mapping = mpd->inode->i_mapping;
        struct folio_batch fbatch;
        unsigned int nr_folios;
        pgoff_t index = mpd->first_page;
        pgoff_t end = mpd->last_page;
        xa_mark_t tag;
        int i, err = 0;
        int blkbits = mpd->inode->i_blkbits;
        ext4_lblk_t lblk;
        struct buffer_head *head;
        handle_t *handle = NULL;
        int bpp = ext4_journal_blocks_per_page(mpd->inode);

        /*
         * Same condition under which ext4_do_writepages() calls
         * tag_pages_for_writeback(); otherwise look at all dirty folios.
         */
        if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
                tag = PAGECACHE_TAG_TOWRITE;
        else
                tag = PAGECACHE_TAG_DIRTY;

        /* Start with an empty extent. */
        mpd->map.m_len = 0;
        mpd->next_page = index;
        if (ext4_should_journal_data(mpd->inode)) {
                handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE,
                                            bpp);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);
        }
        folio_batch_init(&fbatch);
        while (index <= end) {
                nr_folios = filemap_get_folios_tag(mapping, &index, end,
                                tag, &fbatch);
                if (nr_folios == 0)
                        break;

                for (i = 0; i < nr_folios; i++) {
                        struct folio *folio = fbatch.folios[i];

                        /*
                         * Accumulated enough dirty pages? This doesn't apply
                         * to WB_SYNC_ALL mode. For integrity sync we have to
                         * keep going because someone may be concurrently
                         * dirtying pages, and we might have synced a lot of
                         * newly appeared dirty pages, but have not synced all
                         * of the old dirty pages.
                         */
                        if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
                            mpd->wbc->nr_to_write <=
                            mpd->map.m_len >> (PAGE_SHIFT - blkbits))
                                goto out;

                        /* If we can't merge this page, we are done. */
                        if (mpd->map.m_len > 0 && mpd->next_page != folio->index)
                                goto out;

                        /*
                         * NOTE(review): only negative values are treated as
                         * failure here. If ext4_journal_ensure_credits() can
                         * return a positive value, err stays positive when
                         * the folio below is skipped and could be returned
                         * via 'out' - confirm against its definition.
                         */
                        if (handle) {
                                err = ext4_journal_ensure_credits(handle, bpp,
                                                                  0);
                                if (err < 0)
                                        goto out;
                        }

                        folio_lock(folio);
                        /*
                         * If the page is no longer dirty, or its mapping no
                         * longer corresponds to inode we are writing (which
                         * means it has been truncated or invalidated), or the
                         * page is already under writeback and we are not doing
                         * a data integrity writeback, skip the page
                         */
                        if (!folio_test_dirty(folio) ||
                            (folio_test_writeback(folio) &&
                             (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
                            unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }

                        folio_wait_writeback(folio);
                        BUG_ON(folio_test_writeback(folio));

                        /*
                         * Should never happen but for buggy code in
                         * other subsystems that call
                         * set_page_dirty() without properly warning
                         * the file system first.  See [1] for more
                         * information.
                         *
                         * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
                         */
                        if (!folio_buffers(folio)) {
                                ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index);
                                folio_clear_dirty(folio);
                                folio_unlock(folio);
                                continue;
                        }

                        /*
                         * No extent accumulated yet: this folio becomes the
                         * first page of the range being processed.
                         */
                        if (mpd->map.m_len == 0)
                                mpd->first_page = folio->index;
                        mpd->next_page = folio_next_index(folio);
                        /*
                         * Writeout when we cannot modify metadata is simple.
                         * Just submit the page. For data=journal mode we
                         * first handle writeout of the page for checkpoint and
                         * only after that handle delayed page dirtying. This
                         * makes sure current data is checkpointed to the final
                         * location before possibly journalling it again which
                         * is desirable when the page is frequently dirtied
                         * through a pin.
                         */
                        if (!mpd->can_map) {
                                err = mpage_submit_folio(mpd, folio);
                                if (err < 0)
                                        goto out;
                                /* Pending dirtying of journalled data? */
                                if (folio_test_checked(folio)) {
                                        err = mpage_journal_page_buffers(handle,
                                                mpd, folio);
                                        if (err < 0)
                                                goto out;
                                        mpd->journalled_more_data = 1;
                                }
                                mpage_folio_done(mpd, folio);
                        } else {
                                /* Add all dirty buffers to mpd */
                                lblk = ((ext4_lblk_t)folio->index) <<
                                        (PAGE_SHIFT - blkbits);
                                head = folio_buffers(folio);
                                err = mpage_process_page_bufs(mpd, head, head,
                                                lblk);
                                /*
                                 * A positive return means the scan should
                                 * continue with the next folio; 0 or an error
                                 * ends it here.
                                 */
                                if (err <= 0)
                                        goto out;
                                err = 0;
                        }
                }
                folio_batch_release(&fbatch);
                cond_resched();
        }
        /* Ran out of tagged folios in the requested range. */
        mpd->scanned_until_end = 1;
        if (handle)
                ext4_journal_stop(handle);
        return 0;
out:
        folio_batch_release(&fbatch);
        if (handle)
                ext4_journal_stop(handle);
        return err;
}

/*
 * ext4_do_writepages - write back dirty pages of mpd->inode
 *
 * @mpd - writeback state; the callers in this file fill in ->inode, ->wbc and
 *        ->can_map before calling
 *
 * The page range comes from mpd->wbc: either cyclic, starting at
 * mapping->writeback_index and wrapping around once, or
 * [range_start, range_end]. Each round first calls
 * mpage_prepare_extent_to_map() with do_map = 0 and no transaction started.
 * Then, until the range has been scanned to its end or wbc->nr_to_write is
 * used up, it starts a transaction, collects one extent to map and maps and
 * submits it, using a new io_end for each extent.
 *
 * Returns 0 on success, otherwise the first error that stopped the writeback.
 * -ENOSPC from one iteration is retried after forcing a journal commit when
 * the filesystem has a journal.
 */
static int ext4_do_writepages(struct mpage_da_data *mpd)
{
        struct writeback_control *wbc = mpd->wbc;
        pgoff_t        writeback_index = 0;
        /* Remember the initial budget; used only for the result tracepoint. */
        long nr_to_write = wbc->nr_to_write;
        int range_whole = 0;
        /* Cleared only for a cyclic writeback that starts past index 0. */
        int cycled = 1;
        handle_t *handle = NULL;
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
        int needed_blocks, rsv_blocks = 0, ret = 0;
        struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
        struct blk_plug plug;
        bool give_up_on_write = false;

        trace_ext4_writepages(inode, wbc);

        /*
         * No pages to write? This is mainly a kludge to avoid starting
         * a transaction for special inodes like journal inode on last iput()
         * because that could violate lock ordering on umount
         */
        if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                goto out_writepages;

        /*
         * If the filesystem has aborted, it is read-only, so return
         * right away instead of dumping stack traces later on that
         * will obscure the real source of the problem.  We test
         * fs shutdown state instead of sb->s_flag's SB_RDONLY because
         * the latter could be true if the filesystem is mounted
         * read-only, and in that case, ext4_writepages should
         * *never* be called, so if that ever happens, we would want
         * the stack trace.
         */
        ret = ext4_emergency_state(mapping->host->i_sb);
        if (unlikely(ret))
                goto out_writepages;

        /*
         * If we have inline data and arrive here, it means that
         * we will soon create the block for the 1st page, so
         * we'd better clear the inline data here.
         */
        if (ext4_has_inline_data(inode)) {
                /* Just inode will be modified... */
                handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        goto out_writepages;
                }
                BUG_ON(ext4_test_inode_state(inode,
                                EXT4_STATE_MAY_INLINE_DATA));
                ext4_destroy_inline_data(handle, inode);
                ext4_journal_stop(handle);
        }

        /*
         * data=journal mode does not do delalloc so we just need to writeout /
         * journal already mapped buffers. On the other hand we need to commit
         * transaction to make data stable. We expect all the data to be
         * already in the journal (the only exception are DMA pinned pages
         * dirtied behind our back) so we commit transaction here and run the
         * writeback loop to checkpoint them. The checkpointing is not actually
         * necessary to make data persistent *but* quite a few places (extent
         * shifting operations, fsverity, ...) depend on being able to drop
         * pagecache pages after calling filemap_write_and_wait() and for that
         * checkpointing needs to happen.
         */
        if (ext4_should_journal_data(inode)) {
                mpd->can_map = 0;
                if (wbc->sync_mode == WB_SYNC_ALL)
                        ext4_fc_commit(sbi->s_journal,
                                       EXT4_I(inode)->i_datasync_tid);
        }
        mpd->journalled_more_data = 0;

        if (ext4_should_dioread_nolock(inode)) {
                /*
                 * We may need to convert up to one extent per block in
                 * the page and we may dirty the inode.
                 */
                rsv_blocks = 1 + ext4_chunk_trans_blocks(inode,
                                                PAGE_SIZE >> inode->i_blkbits);
        }

        if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
                range_whole = 1;

        if (wbc->range_cyclic) {
                /* Resume where the previous cyclic writeback stopped. */
                writeback_index = mapping->writeback_index;
                if (writeback_index)
                        cycled = 0;
                mpd->first_page = writeback_index;
                mpd->last_page = -1;
        } else {
                mpd->first_page = wbc->range_start >> PAGE_SHIFT;
                mpd->last_page = wbc->range_end >> PAGE_SHIFT;
        }

        ext4_io_submit_init(&mpd->io_submit, wbc);
retry:
        if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                tag_pages_for_writeback(mapping, mpd->first_page,
                                        mpd->last_page);
        blk_start_plug(&plug);

        /*
         * First writeback pages that don't need mapping - we can avoid
         * starting a transaction unnecessarily and also avoid being blocked
         * in the block layer on device congestion while having transaction
         * started.
         */
        mpd->do_map = 0;
        mpd->scanned_until_end = 0;
        mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
        if (!mpd->io_submit.io_end) {
                ret = -ENOMEM;
                goto unplug;
        }
        ret = mpage_prepare_extent_to_map(mpd);
        /* Unlock pages we didn't use */
        mpage_release_unused_pages(mpd, false);
        /* Submit prepared bio */
        ext4_io_submit(&mpd->io_submit);
        ext4_put_io_end_defer(mpd->io_submit.io_end);
        mpd->io_submit.io_end = NULL;
        if (ret < 0)
                goto unplug;

        /* Second phase: one transaction and one io_end per mapped extent. */
        while (!mpd->scanned_until_end && wbc->nr_to_write > 0) {
                /* For each extent of pages we use new io_end */
                mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
                if (!mpd->io_submit.io_end) {
                        ret = -ENOMEM;
                        break;
                }

                WARN_ON_ONCE(!mpd->can_map);
                /*
                 * We have two constraints: We find one extent to map and we
                 * must always write out whole page (makes a difference when
                 * blocksize < pagesize) so that we don't block on IO when we
                 * try to write out the rest of the page. Journalled mode is
                 * not supported by delalloc.
                 */
                BUG_ON(ext4_should_journal_data(inode));
                needed_blocks = ext4_da_writepages_trans_blocks(inode);

                /* start a new transaction */
                handle = ext4_journal_start_with_reserve(inode,
                                EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        /* Release allocated io_end */
                        ext4_put_io_end(mpd->io_submit.io_end);
                        mpd->io_submit.io_end = NULL;
                        break;
                }
                mpd->do_map = 1;

                trace_ext4_da_write_pages(inode, mpd->first_page, wbc);
                ret = mpage_prepare_extent_to_map(mpd);
                /* Map and submit only if an extent was actually collected. */
                if (!ret && mpd->map.m_len)
                        ret = mpage_map_and_submit_extent(handle, mpd,
                                        &give_up_on_write);
                /*
                 * Caution: If the handle is synchronous,
                 * ext4_journal_stop() can wait for transaction commit
                 * to finish which may depend on writeback of pages to
                 * complete or on page lock to be released.  In that
                 * case, we have to wait until after we have
                 * submitted all the IO, released page locks we hold,
                 * and dropped io_end reference (for extent conversion
                 * to be able to complete) before stopping the handle.
                 */
                if (!ext4_handle_valid(handle) || handle->h_sync == 0) {
                        ext4_journal_stop(handle);
                        handle = NULL;
                        mpd->do_map = 0;
                }
                /* Unlock pages we didn't use */
                mpage_release_unused_pages(mpd, give_up_on_write);
                /* Submit prepared bio */
                ext4_io_submit(&mpd->io_submit);

                /*
                 * Drop our io_end reference we got from init. We have
                 * to be careful and use deferred io_end finishing if
                 * we are still holding the transaction as we can
                 * release the last reference to io_end which may end
                 * up doing unwritten extent conversion.
                 */
                if (handle) {
                        ext4_put_io_end_defer(mpd->io_submit.io_end);
                        ext4_journal_stop(handle);
                } else
                        ext4_put_io_end(mpd->io_submit.io_end);
                mpd->io_submit.io_end = NULL;

                if (ret == -ENOSPC && sbi->s_journal) {
                        /*
                         * Commit the transaction which would
                         * free blocks released in the transaction
                         * and try again
                         */
                        jbd2_journal_force_commit_nested(sbi->s_journal);
                        ret = 0;
                        continue;
                }
                /* Fatal error - ENOMEM, EIO... */
                if (ret)
                        break;
        }
unplug:
        blk_finish_plug(&plug);
        /*
         * Cyclic writeback that started in the middle of the file and still
         * has budget left: wrap around once and write [0, writeback_index - 1].
         */
        if (!ret && !cycled && wbc->nr_to_write > 0) {
                cycled = 1;
                mpd->last_page = writeback_index - 1;
                mpd->first_page = 0;
                goto retry;
        }

        /* Update index */
        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
                /*
                 * Set the writeback_index so that range_cyclic
                 * mode will write it back later
                 */
                mapping->writeback_index = mpd->first_page;

out_writepages:
        /* Last argument: how much of the nr_to_write budget was consumed. */
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        return ret;
}

static int ext4_writepages(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        struct super_block *sb = mapping->host->i_sb;
        struct mpage_da_data mpd = {
                .inode = mapping->host,
                .wbc = wbc,
                .can_map = 1,
        };
        int ret;
        int alloc_ctx;

        ret = ext4_emergency_state(sb);
        if (unlikely(ret))
                return ret;

        alloc_ctx = ext4_writepages_down_read(sb);
        ret = ext4_do_writepages(&mpd);
        /*
         * For data=journal writeback we could have come across pages marked
         * for delayed dirtying (PageChecked) which were just added to the
         * running transaction. Try once more to get them to stable storage.
         */
        if (!ret && mpd.journalled_more_data)
                ret = ext4_do_writepages(&mpd);
        ext4_writepages_up_read(sb, alloc_ctx);

        return ret;
}

int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
                .range_start = jinode->i_dirty_start,
                .range_end = jinode->i_dirty_end,
        };
        struct mpage_da_data mpd = {
                .inode = jinode->i_vfs_inode,
                .wbc = &wbc,
                .can_map = 0,
        };
        return ext4_do_writepages(&mpd);
}

static int ext4_dax_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
{
        int ret;
        long nr_to_write = wbc->nr_to_write;
        struct inode *inode = mapping->host;
        int alloc_ctx;

        ret = ext4_emergency_state(inode->i_sb);
        if (unlikely(ret))
                return ret;

        alloc_ctx = ext4_writepages_down_read(inode->i_sb);
        trace_ext4_writepages(inode, wbc);

        ret = dax_writeback_mapping_range(mapping,
                                          EXT4_SB(inode->i_sb)->s_daxdev, wbc);
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
        ext4_writepages_up_read(inode->i_sb, alloc_ctx);
        return ret;
}

/*
 * Decide whether a write should bypass delayed allocation because the
 * filesystem is running low on free clusters.
 *
 * Free-space accounting through percpu counters can be slightly off, since
 * up to percpu_counter_batch may accumulate on each CPU before the global
 * counter is updated.  Delalloc needs accurate free-block accounting, so we
 * switch to non-delalloc once we get close to that error range.
 *
 * Returns 1 to fall back to non-delalloc, 0 to keep using delalloc.
 */
static int ext4_nonda_switch(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        s64 nr_free, nr_dirty;

        nr_free = percpu_counter_read_positive(&sbi->s_freeclusters_counter);
        nr_dirty = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter);

        /* Start pushing delalloc out once half of the free blocks are dirty. */
        if (nr_dirty && nr_free < 2 * nr_dirty)
                try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);

        /* Free count is below 150% of the dirty count. */
        if (2 * nr_free < 3 * nr_dirty)
                return 1;

        /* Free count, after covering the dirty clusters, is under the watermark. */
        if (nr_free < (nr_dirty + EXT4_FREECLUSTERS_WATERMARK))
                return 1;

        return 0;
}

/*
 * ->write_begin for the delayed-allocation address-space operations.
 *
 * *@fsdata records which path was taken so that ext4_da_write_end() can
 * mirror it: FALL_BACK_TO_NONDELALLOC when the write is handed over to
 * ext4_write_begin(), 0 otherwise (ext4_generic_write_inline_data() also
 * receives @fsdata and may update it).
 *
 * On success returns 0 (or the non-negative value from
 * ext4_block_write_begin()) with *@foliop set to the locked folio, except on
 * the inline-data path where ext4_generic_write_inline_data() is responsible
 * for *@foliop.  On failure returns a negative errno.
 */
static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len,
                               struct folio **foliop, void **fsdata)
{
        struct inode *inode = mapping->host;
        struct super_block *sb = inode->i_sb;
        struct folio *folio;
        pgoff_t index;
        int err, retries = 0;

        err = ext4_emergency_state(sb);
        if (unlikely(err))
                return err;

        index = pos >> PAGE_SHIFT;

        if (ext4_nonda_switch(sb) || ext4_verity_in_progress(inode)) {
                *fsdata = (void *)FALL_BACK_TO_NONDELALLOC;
                return ext4_write_begin(file, mapping, pos,
                                        len, foliop, fsdata);
        }
        *fsdata = (void *)0;
        trace_ext4_da_write_begin(inode, pos, len);

        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
                err = ext4_generic_write_inline_data(mapping, inode, pos, len,
                                                     foliop, fsdata, true);
                if (err < 0)
                        return err;
                /* A return of 1 means the inline path took care of the write. */
                if (err == 1)
                        return 0;
        }

        for (;;) {
                folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                                mapping_gfp_mask(mapping));
                if (IS_ERR(folio))
                        return PTR_ERR(folio);

                err = ext4_block_write_begin(NULL, folio, pos, len,
                                             ext4_da_get_block_prep);
                if (err >= 0)
                        break;

                folio_unlock(folio);
                folio_put(folio);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size; trim them off again.  i_size_read() is
                 * not needed because we hold the inode lock.
                 */
                if (pos + len > inode->i_size)
                        ext4_truncate_failed_write(inode);

                /* Only -ENOSPC is worth another attempt, and only a few times. */
                if (err != -ENOSPC || !ext4_should_retry_alloc(sb, &retries))
                        return err;
        }

        *foliop = folio;
        return err;
}

/*
 * Tell whether i_disksize should be updated for a write that reaches the end
 * of the file without requiring block allocation.
 *
 * @offset is a byte offset inside @folio; the buffer head covering it is
 * located by walking the folio's buffer ring.  Returns 1 when that buffer is
 * mapped and is neither delayed nor unwritten, 0 otherwise.
 */
static int ext4_da_should_update_i_disksize(struct folio *folio,
                                            unsigned long offset)
{
        struct inode *inode = folio->mapping->host;
        struct buffer_head *bh = folio_buffers(folio);
        unsigned int steps = offset >> inode->i_blkbits;

        /* Advance to the buffer head that contains @offset. */
        while (steps--)
                bh = bh->b_this_page;

        return buffer_mapped(bh) && !buffer_delay(bh) &&
               !buffer_unwritten(bh);
}

/*
 * Finish a delayed-allocation buffered write on @folio.
 *
 * Commits the copied bytes through block_write_end(), extends i_size (and,
 * when ext4_da_should_update_i_disksize() agrees, i_disksize) if the write
 * ended past the old i_size, then unlocks and releases the folio.  A journal
 * handle is started only when i_disksize was changed here or when the write
 * began beyond the old i_size; in the latter case the range between the old
 * i_size and @pos is passed to ext4_zero_partial_blocks().
 *
 * The folio is unlocked and released on every path, including the error one.
 *
 * Returns the byte count reported by block_write_end(), -EIO if the folio
 * has no buffers attached, or the error from ext4_journal_start().
 */
static int ext4_da_do_write_end(struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio)
{
        struct inode *inode = mapping->host;
        /* Sampled before any update so the extension checks below see it. */
        loff_t old_size = inode->i_size;
        bool disksize_changed = false;
        loff_t new_i_size, zero_len = 0;
        handle_t *handle;

        if (unlikely(!folio_buffers(folio))) {
                folio_unlock(folio);
                folio_put(folio);
                return -EIO;
        }
        /*
         * block_write_end() will mark the inode as dirty with the
         * I_DIRTY_PAGES flag, which is all that's needed to trigger page
         * writeback.
         */
        copied = block_write_end(NULL, mapping, pos, len, copied,
                        folio, NULL);
        new_i_size = pos + copied;

        /*
         * It's important to update i_size while still holding the folio lock,
         * because folio writeout could otherwise come in and zero beyond
         * i_size.
         *
         * Since we are holding the inode lock, we are sure i_disksize <=
         * i_size. We also know that if i_disksize < i_size, there are
         * delalloc writes pending in the range up to i_size. If the end of
         * the current write is <= i_size, there's no need to touch
         * i_disksize since writeback will push i_disksize up to i_size
         * eventually. If the end of the current write is > i_size and
         * inside an allocated block which ext4_da_should_update_i_disksize()
         * checked, we need to update i_disksize here because certain
         * ext4_writepages() paths that do not allocate blocks also do not
         * update i_disksize.
         */
        if (new_i_size > inode->i_size) {
                unsigned long end;

                i_size_write(inode, new_i_size);
                /* Offset within the page of the last byte written. */
                end = (new_i_size - 1) & (PAGE_SIZE - 1);
                if (copied && ext4_da_should_update_i_disksize(folio, end)) {
                        ext4_update_i_disksize(inode, new_i_size);
                        disksize_changed = true;
                }
        }

        folio_unlock(folio);
        folio_put(folio);

        /* The write started beyond the old EOF: a gap was created. */
        if (pos > old_size) {
                pagecache_isize_extended(inode, old_size, pos);
                zero_len = pos - old_size;
        }

        /* Nothing to journal: neither i_disksize nor a gap changed. */
        if (!disksize_changed && !zero_len)
                return copied;

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        /*
         * NOTE(review): the return values of ext4_zero_partial_blocks() and
         * ext4_mark_inode_dirty() are not checked here.
         */
        if (zero_len)
                ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
        ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

        return copied;
}

/*
 * ->write_end for the delayed-allocation address-space operations.
 *
 * @fsdata carries the mode chosen at write_begin time and selects one of
 * three completions: ext4_write_end() for FALL_BACK_TO_NONDELALLOC,
 * ext4_write_inline_data_end() when the inode still holds inline data (and
 * the mode is not CONVERT_INLINE_DATA), or ext4_da_do_write_end() otherwise.
 *
 * Returns whatever the selected completion routine returns.
 */
static int ext4_da_write_end(struct file *file,
                             struct address_space *mapping,
                             loff_t pos, unsigned len, unsigned copied,
                             struct folio *folio, void *fsdata)
{
        struct inode *inode = mapping->host;
        int mode = (int)(unsigned long)fsdata;
        bool finish_inline;

        if (mode == FALL_BACK_TO_NONDELALLOC)
                return ext4_write_end(file, mapping, pos,
                                      len, copied, folio, fsdata);

        trace_ext4_da_write_end(inode, pos, len, copied);

        finish_inline = mode != CONVERT_INLINE_DATA &&
                        ext4_test_inode_state(inode,
                                              EXT4_STATE_MAY_INLINE_DATA) &&
                        ext4_has_inline_data(inode);
        if (finish_inline)
                return ext4_write_inline_data_end(inode, pos, len, copied,
                                                  folio);

        /* A short copy into a folio that is not uptodate counts as nothing. */
        if (unlikely(copied < len) && !folio_test_uptodate(folio))
                copied = 0;

        return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}

/*
 * Force all delayed allocation blocks to be allocated for a given inode.
 *
 * Returns 0 when the inode has no reserved data blocks, otherwise the result
 * of filemap_flush().
 */
int ext4_alloc_da_blocks(struct inode *inode)
{
        trace_ext4_alloc_da_blocks(inode);

        if (EXT4_I(inode)->i_reserved_data_blocks == 0)
                return 0;

        /*
         * Keep it simple for now.  filemap_flush() maps the blocks but also
         * kicks off writing of the data, which is not strictly necessary
         * (and, for laptop_mode users, not even desirable); it does not wait
         * for that I/O to complete.
         *
         * Doing only the allocation would mean replicating the code paths
         * in:
         *
         * ext4_writepages() ->
         *    write_cache_pages() ---> (via passed in callback function)
         *        __mpage_da_writepage() -->
         *           mpage_add_bh_to_extent()
         *           mpage_da_map_blocks()
         *
         * The obstacle is that write_cache_pages(), in
         * mm/page-writeback.c, marks pages clean in preparation for I/O,
         * which is wrong when no I/O is planned.  Calling it and then
         * redirtying every page with redirty_page_for_writepage() would be
         * ugly in the extreme.  The proper alternative is a simplified copy
         * of the functions above that only collects contiguous logical
         * block extents, calls the multi-block allocator, and updates the
         * buffer heads with the resulting allocations.
         *
         * Until then, cheat and call filemap_flush().
         */
        return filemap_flush(inode->i_mapping);
}

/*
 * bmap() is special.  Applications such as lilo, and the swapper, use it to
 * find the on-disk block backing a specific piece of data.
 *
 * That is dangerous when the block in question is still in the journal.  If
 * somebody creates a swapfile on an ext4 data-journaling filesystem and
 * enables swap, the data swapped out to it may suddenly be overwritten by
 * the original zeros that were written to the journal earlier and are still
 * awaiting writeback in the kernel's buffer cache.
 *
 * So for any bmap call on a modified, data-journaled file we take extra
 * steps to flush blocks that might still be in the cache.
 *
 * Returns the value from iomap_bmap(), or 0 for an inode with inline data.
 */
static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
{
        struct inode *inode = mapping->host;
        sector_t phys = 0;

        inode_lock_shared(inode);

        /* An inline file can reach us through the FIBMAP ioctl: report 0. */
        if (!ext4_has_inline_data(inode)) {
                bool sync_first =
                        mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
                        (test_opt(inode->i_sb, DELALLOC) ||
                         ext4_should_journal_data(inode));

                /*
                 * With delalloc or journalled data, sync the file first so
                 * that its blocks are allocated and the data is in place for
                 * the user to see.
                 */
                if (sync_first)
                        filemap_write_and_wait(mapping);

                phys = iomap_bmap(mapping, block, &ext4_iomap_ops);
        }

        inode_unlock_shared(inode);
        return phys;
}

/*
 * ->read_folio: fill @folio with file data.
 *
 * Inodes with inline data are first offered to ext4_readpage_inline(); if
 * that answers -EAGAIN, or the inode has no inline data at all, the read is
 * performed by ext4_mpage_readpages().
 */
static int ext4_read_folio(struct file *file, struct folio *folio)
{
        struct inode *inode = folio->mapping->host;
        int err;

        trace_ext4_read_folio(inode, folio);

        if (!ext4_has_inline_data(inode))
                return ext4_mpage_readpages(inode, NULL, folio);

        err = ext4_readpage_inline(inode, folio);
        if (err != -EAGAIN)
                return err;

        return ext4_mpage_readpages(inode, NULL, folio);
}

/*
 * ->readahead: hand the request in @rac to ext4_mpage_readpages(), unless
 * the file has inline data, in which case there is nothing to read ahead.
 */
static void ext4_readahead(struct readahead_control *rac)
{
        struct inode *inode = rac->mapping->host;

        if (!ext4_has_inline_data(inode))
                ext4_mpage_readpages(inode, rac, NULL);
}

/*
 * ->invalidate_folio for the address-space operations that do not journal
 * data: warn if the folio's first buffer is nevertheless attached to jbd,
 * then defer to block_invalidate_folio().
 */
static void ext4_invalidate_folio(struct folio *folio, size_t offset,
                                size_t length)
{
        struct buffer_head *head;

        trace_ext4_invalidate_folio(folio, offset, length);

        /* No journalling happens on data buffers when this function is used */
        head = folio_buffers(folio);
        WARN_ON(head && buffer_jbd(head));

        block_invalidate_folio(folio, offset, length);
}

/*
 * Invalidate @length bytes starting at @offset in a folio of a file whose
 * data is journalled.
 *
 * Returns the result of jbd2_journal_invalidate_folio().
 */
static int __ext4_journalled_invalidate_folio(struct folio *folio,
                                            size_t offset, size_t length)
{
        struct inode *inode = folio->mapping->host;
        journal_t *journal = EXT4_JOURNAL(inode);
        bool whole_folio;

        trace_ext4_journalled_invalidate_folio(folio, offset, length);

        /* On a full truncate simply forget about the pending dirtying. */
        whole_folio = (offset == 0 && length == folio_size(folio));
        if (whole_folio)
                folio_clear_checked(folio);

        return jbd2_journal_invalidate_folio(journal, folio, offset, length);
}

/*
 * ->invalidate_folio wrapper for the journalled aops: the callback returns
 * void, so a failure of __ext4_journalled_invalidate_folio() can only be
 * reported with a warning.
 */
static void ext4_journalled_invalidate_folio(struct folio *folio,
                                           size_t offset,
                                           size_t length)
{
        int err = __ext4_journalled_invalidate_folio(folio, offset, length);

        WARN_ON(err < 0);
}

/*
 * ->release_folio: try to drop the buffers attached to @folio.
 *
 * Returns false without trying when the folio is marked checked; otherwise
 * returns the result of jbd2_journal_try_to_free_buffers() when the inode
 * has a journal, or of try_to_free_buffers() when it has none.  @wait is
 * not consulted here.
 */
static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
        struct inode *inode = folio->mapping->host;
        journal_t *journal = EXT4_JOURNAL(inode);

        trace_ext4_release_folio(inode, folio);

        /* Page has dirty journalled data -> cannot release */
        if (folio_test_checked(folio))
                return false;

        if (!journal)
                return try_to_free_buffers(folio);

        return jbd2_journal_try_to_free_buffers(journal, folio);
}

static bool ext4_inode_datasync_dirty(struct inode *inode)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;

        if (journal) {
                if (jbd2_transaction_committed(journal,
                        EXT4_I(inode)->i_datasync_tid))
                        return false;
                if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT))
                        return !list_empty(&EXT4_I(inode)->i_fc_list);
                return true;
        }

        /* Any metadata buffers to write? */
        if (!list_empty(&inode->i_mapping->i_private_list))
                return true;
        return inode->i_state & I_DIRTY_DATASYNC;
}

/*
 * Fill @iomap from the block mapping in @map.
 *
 * @offset and @length describe the byte range that was requested and are
 * only used to decide IOMAP_F_DIRTY; the iomap's own offset and length come
 * from @map.  @flags are the IOMAP_* flags of the operation.
 */
static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
                           struct ext4_map_blocks *map, loff_t offset,
                           loff_t length, unsigned int flags)
{
        const u8 blkbits = inode->i_blkbits;
        struct super_block *sb = inode->i_sb;

        iomap->flags = 0;

        /*
         * Writes that span EOF might trigger an I/O size update on completion,
         * so treat them as dirty for the purpose of O_DSYNC even when no other
         * metadata change is being made or is pending.
         */
        if (ext4_inode_datasync_dirty(inode) ||
            offset + length > i_size_read(inode))
                iomap->flags |= IOMAP_F_DIRTY;

        if (map->m_flags & EXT4_MAP_NEW)
                iomap->flags |= IOMAP_F_NEW;

        /* HW-offload atomics are always used */
        if (flags & IOMAP_ATOMIC)
                iomap->flags |= IOMAP_F_ATOMIC_BIO;

        if (flags & IOMAP_DAX)
                iomap->dax_dev = EXT4_SB(sb)->s_daxdev;
        else
                iomap->bdev = sb->s_bdev;

        iomap->offset = (u64) map->m_lblk << blkbits;
        iomap->length = (u64) map->m_len << blkbits;

        if ((map->m_flags & EXT4_MAP_MAPPED) &&
            !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                iomap->flags |= IOMAP_F_MERGED;

        if (map->m_flags & (EXT4_MAP_UNWRITTEN | EXT4_MAP_MAPPED)) {
                /*
                 * The flags passed to ext4_map_blocks() for direct I/O writes
                 * can leave both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN set
                 * in m_flags.  The ->end_io() handler relies on iomap->type
                 * to convert allocated unwritten extents into written ones,
                 * so EXT4_MAP_UNWRITTEN has to take precedence.
                 */
                if (map->m_flags & EXT4_MAP_UNWRITTEN)
                        iomap->type = IOMAP_UNWRITTEN;
                else
                        iomap->type = IOMAP_MAPPED;

                iomap->addr = (u64) map->m_pblk << blkbits;
                if (flags & IOMAP_DAX)
                        iomap->addr += EXT4_SB(sb)->s_dax_part_off;
                return;
        }

        /* No physical block behind this range. */
        if (map->m_flags & EXT4_MAP_DELAYED)
                iomap->type = IOMAP_DELALLOC;
        else
                iomap->type = IOMAP_HOLE;
        iomap->addr = IOMAP_NULL_ADDR;
}

/*
 * Map, allocating if necessary, the blocks described by @map for an
 * IOMAP_WRITE operation.  @flags are the IOMAP_* flags of that operation.
 *
 * @map->m_len is clamped to DIO_MAX_BLOCKS before anything else.  Each
 * attempt runs under its own journal handle, and an -ENOSPC result is
 * retried for as long as ext4_should_retry_alloc() allows.
 *
 * Returns the ext4_map_blocks() result, -ENOTBLK when no allocation mode
 * applied and nothing was mapped, or a negative errno.
 */
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
                            unsigned int flags)
{
        const u8 blkbits = inode->i_blkbits;
        /* get_flags is deliberately not reset between retry attempts. */
        int credits, get_flags = 0, tries = 0, err;
        handle_t *handle;

        /*
         * Trim the mapping request to the maximum value that we can map at
         * once for direct I/O.
         */
        if (map->m_len > DIO_MAX_BLOCKS)
                map->m_len = DIO_MAX_BLOCKS;
        credits = ext4_chunk_trans_blocks(inode, map->m_len);

        do {
                /*
                 * Either we allocate blocks and do not get an unwritten
                 * extent, in which case enough credits are reserved; or the
                 * blocks are already allocated and unwritten, and the extent
                 * conversion fits into the same credits.
                 */
                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                            credits);
                if (IS_ERR(handle))
                        return PTR_ERR(handle);

                /*
                 * DAX and direct I/O are the only two operations that are
                 * currently supported with IOMAP_WRITE.
                 */
                WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT)));

                /*
                 * For the second test below i_size is used rather than
                 * i_disksize: delalloc writeback can complete at any point
                 * during the I/O and push i_disksize out to i_size, possibly
                 * beyond where direct I/O is happening, which would expose
                 * allocated blocks to direct I/O reads.
                 */
                if (flags & IOMAP_DAX)
                        get_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
                else if (((loff_t)map->m_lblk << blkbits) >=
                         i_size_read(inode))
                        get_flags = EXT4_GET_BLOCKS_CREATE;
                else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                        get_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;

                err = ext4_map_blocks(handle, inode, map, get_flags);

                /*
                 * Holes in indirect tree based inodes cannot be filled, as
                 * that could expose stale data after a crash.  Use the magic
                 * error code to fall back to buffered I/O.
                 */
                if (!get_flags && !err)
                        err = -ENOTBLK;

                ext4_journal_stop(handle);
        } while (err == -ENOSPC &&
                 ext4_should_retry_alloc(inode->i_sb, &tries));

        return err;
}


/*
 * ->iomap_begin for ext4_iomap_ops: describe the blocks backing
 * [@offset, @offset + @length) in @iomap, allocating for IOMAP_WRITE.
 *
 * Returns 0 on success, -EINVAL when @offset lies beyond
 * EXT4_MAX_LOGICAL_BLOCK, -ERANGE (with a one-time warning) for an inode
 * with inline data, or the negative errno from the mapping helpers.
 * @srcmap is not used.
 */
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
                unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
        const u8 blkbits = inode->i_blkbits;
        struct ext4_map_blocks map;
        bool already_mapped = false;
        int err;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
                return -ERANGE;

        /* First logical block and block count, capped at the maximum block. */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        if (!(flags & IOMAP_WRITE)) {
                err = ext4_map_blocks(NULL, inode, &map, 0);
        } else {
                /*
                 * If the blocks are already allocated there is no need to
                 * start a journal transaction; the mapping can be returned
                 * directly.  This can boost performance, especially for
                 * multi-threaded overwrite requests.
                 */
                if (offset + length <= i_size_read(inode)) {
                        err = ext4_map_blocks(NULL, inode, &map, 0);
                        already_mapped = err > 0 &&
                                         (map.m_flags & EXT4_MAP_MAPPED);
                }
                if (!already_mapped)
                        err = ext4_iomap_alloc(inode, &map, flags);
        }

        if (err < 0)
                return err;

        /*
         * When inline encryption is enabled, I/O to an encrypted file
         * sometimes has to be broken up to guarantee DUN contiguity.  Handle
         * that by limiting the length of the mapping returned.
         */
        map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);

        ext4_set_iomap(inode, iomap, &map, offset, length, flags);

        return 0;
}

/*
 * ->iomap_begin for ext4_iomap_overwrite_ops.
 *
 * No blocks need to be allocated even though this is a write, so
 * IOMAP_WRITE is masked out and the request is treated as a read; that
 * saves the overhead of starting a transaction.  A successful mapping whose
 * type is not IOMAP_MAPPED triggers a one-time warning.
 *
 * Returns the result of ext4_iomap_begin().
 */
static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
                loff_t length, unsigned flags, struct iomap *iomap,
                struct iomap *srcmap)
{
        int err;

        err = ext4_iomap_begin(inode, offset, length, flags & ~IOMAP_WRITE,
                               iomap, srcmap);
        WARN_ON_ONCE(!err && iomap->type != IOMAP_MAPPED);

        return err;
}

/*
 * Decide whether a finished iomap operation should fall back to buffered
 * I/O.  True only for a direct-I/O write (both IOMAP_WRITE and IOMAP_DIRECT
 * set) that is not atomic and for which @written is zero.
 */
static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
{
        const unsigned dio_write = IOMAP_WRITE | IOMAP_DIRECT;
        bool is_dio_write = (flags & dio_write) == dio_write;

        /* must be a directio to fall back to buffered */
        if (!is_dio_write)
                return false;

        /* atomic writes are all-or-nothing */
        if (flags & IOMAP_ATOMIC)
                return false;

        /* can only try again if we wrote nothing */
        return !written;
}

/*
 * ->iomap_end shared by ext4_iomap_ops and ext4_iomap_overwrite_ops.
 *
 * If an error occurred while writing the data out to the allocated blocks
 * of a non-atomic direct write, return the magic error code -ENOTBLK so
 * that we fall back to buffered I/O and attempt to complete the remainder
 * of the I/O.  Any blocks allocated in preparation for the direct I/O are
 * then reused by the buffered write.  Atomic writes never fall back to
 * buffered I/O.
 *
 * Returns -ENOTBLK to request the fallback, 0 otherwise.
 */
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
                          ssize_t written, unsigned flags, struct iomap *iomap)
{
        return ext4_want_directio_fallback(flags, written) ? -ENOTBLK : 0;
}

/*
 * iomap operations whose ->iomap_begin (ext4_iomap_begin) may allocate
 * blocks for IOMAP_WRITE requests.  Also passed to iomap_bmap() by
 * ext4_bmap().
 */
const struct iomap_ops ext4_iomap_ops = {
        .iomap_begin                = ext4_iomap_begin,
        .iomap_end                = ext4_iomap_end,
};

/*
 * iomap operations for overwrites: ->iomap_begin (ext4_iomap_overwrite_begin)
 * clears IOMAP_WRITE before mapping, so no blocks are allocated.  Shares
 * ->iomap_end with ext4_iomap_ops.
 */
const struct iomap_ops ext4_iomap_overwrite_ops = {
        .iomap_begin                = ext4_iomap_overwrite_begin,
        .iomap_end                = ext4_iomap_end,
};

/*
 * ->iomap_begin for ext4_iomap_report_ops: describe the existing mapping of
 * [@offset, @offset + @length) in @iomap without allocating anything.
 *
 * Returns 0 on success, -EINVAL when @offset lies beyond
 * EXT4_MAX_LOGICAL_BLOCK, -ENOENT when @offset is at or past the end of the
 * inline data, or a negative errno from ext4_inline_data_iomap() or
 * ext4_map_blocks().  @srcmap is not used.
 */
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
                                   loff_t length, unsigned int flags,
                                   struct iomap *iomap, struct iomap *srcmap)
{
        const u8 blkbits = inode->i_blkbits;
        struct ext4_map_blocks map;
        int err;

        if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
                return -EINVAL;

        if (ext4_has_inline_data(inode)) {
                err = ext4_inline_data_iomap(inode, iomap);
                /* -EAGAIN means: carry on with the block-mapping path below. */
                if (err != -EAGAIN) {
                        if (err == 0 && offset >= iomap->length)
                                err = -ENOENT;
                        return err;
                }
        }

        /* First logical block and block count, capped at the maximum block. */
        map.m_lblk = offset >> blkbits;
        map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
                          EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;

        /*
         * Fiemap callers may ask about offsets beyond s_bitmap_maxbytes.
         * Handle that here instead of querying ext4_map_blocks(), which
         * would warn about it and return -EIO.
         */
        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            offset >= EXT4_SB(inode->i_sb)->s_bitmap_maxbytes) {
                map.m_flags = 0;
        } else {
                err = ext4_map_blocks(NULL, inode, &map, 0);
                if (err < 0)
                        return err;
        }

        ext4_set_iomap(inode, iomap, &map, offset, length, flags);

        return 0;
}

/*
 * Reporting-only iomap operations: just ->iomap_begin, no ->iomap_end.
 * Used by ext4_iomap_swap_activate().
 */
const struct iomap_ops ext4_iomap_report_ops = {
        .iomap_begin = ext4_iomap_begin_report,
};

/*
 * In data=journal mode a folio should only become dirty after it was
 * writeably mapped, and by then ext4_page_mkwrite() has already attached it
 * to the transaction and marked it jbddirty.  Transaction commit
 * write-protects the page mappings again, so normally nothing is left to do
 * here.
 *
 * The exception is a folio that someone kept pinned and dirtied through
 * that pin (e.g. by doing direct IO to it).  The buffers would then have to
 * be attached to the transaction here, which lock ordering forbids.
 * Dirtying the folio while leaving its buffers clean does not work either,
 * because the buffers' dirty state is "definitive", and setting the buffers
 * dirty or jbddirty would make all the journalling code explode.
 *
 * So the folio is marked "pending dirty" instead, and the next
 * ext4_writepages() call attaches the buffers to the transaction
 * appropriately.
 */
static bool ext4_journalled_dirty_folio(struct address_space *mapping,
                struct folio *folio)
{
        bool maybe_pinned;

        WARN_ON_ONCE(!folio_buffers(folio));

        maybe_pinned = folio_maybe_dma_pinned(folio);
        if (maybe_pinned)
                folio_set_checked(folio);

        return filemap_dirty_folio(mapping, folio);
}

/*
 * ->dirty_folio for the non-journalled-data aops.  Warns once if the folio
 * is neither locked nor already dirty, or if it has no buffers, and then
 * defers to block_dirty_folio().
 */
static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        WARN_ON_ONCE(!(folio_test_locked(folio) || folio_test_dirty(folio)));
        WARN_ON_ONCE(!folio_buffers(folio));

        return block_dirty_folio(mapping, folio);
}

/*
 * ->swap_activate: let iomap_swapfile_activate() walk the file's extents
 * through the reporting-only iomap operations.
 */
static int ext4_iomap_swap_activate(struct swap_info_struct *sis,
                                    struct file *file, sector_t *span)
{
        const struct iomap_ops *report_ops = &ext4_iomap_report_ops;

        return iomap_swapfile_activate(sis, file, span, report_ops);
}

/*
 * Address-space operations installed by ext4_set_aops() for inodes in
 * ordered or writeback data mode that are not DAX and whose filesystem is
 * not mounted with the DELALLOC option.
 */
static const struct address_space_operations ext4_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_write_end,
        .dirty_folio                = ext4_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address-space operations installed by ext4_set_aops() for inodes whose
 * journal mode is EXT4_INODE_JOURNAL_DATA_MODE.  Differs from ext4_aops in
 * ->write_end, ->dirty_folio, ->invalidate_folio and ->migrate_folio.
 */
static const struct address_space_operations ext4_journalled_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_write_begin,
        .write_end                = ext4_journalled_write_end,
        .dirty_folio                = ext4_journalled_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_journalled_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio_norefs,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address-space operations installed by ext4_set_aops() for non-DAX inodes
 * in ordered or writeback data mode when the filesystem is mounted with the
 * DELALLOC option.  Differs from ext4_aops in ->write_begin and ->write_end.
 */
static const struct address_space_operations ext4_da_aops = {
        .read_folio                = ext4_read_folio,
        .readahead                = ext4_readahead,
        .writepages                = ext4_writepages,
        .write_begin                = ext4_da_write_begin,
        .write_end                = ext4_da_write_end,
        .dirty_folio                = ext4_dirty_folio,
        .bmap                        = ext4_bmap,
        .invalidate_folio        = ext4_invalidate_folio,
        .release_folio                = ext4_release_folio,
        .migrate_folio                = buffer_migrate_folio,
        .is_partially_uptodate  = block_is_partially_uptodate,
        .error_remove_folio        = generic_error_remove_folio,
        .swap_activate                = ext4_iomap_swap_activate,
};

/*
 * Address space operations that ext4_set_aops() installs on DAX inodes in
 * ordered or writeback mode.  Only the hooks listed here are provided.
 */
static const struct address_space_operations ext4_dax_aops = {
        .bmap                   = ext4_bmap,
        .dirty_folio            = noop_dirty_folio,
        .writepages             = ext4_dax_writepages,
        .swap_activate          = ext4_iomap_swap_activate,
};

/*
 * Pick the address_space_operations table for @inode.
 *
 * Inodes in data-journalling mode always get ext4_journalled_aops.  For
 * ordered and writeback mode the table is chosen, in this order, by:
 * IS_DAX() -> ext4_dax_aops, the DELALLOC mount option -> ext4_da_aops,
 * otherwise ext4_aops.  Any other journal mode value is a BUG().
 */
void ext4_set_aops(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        if (mode == EXT4_INODE_JOURNAL_DATA_MODE) {
                inode->i_mapping->a_ops = &ext4_journalled_aops;
                return;
        }

        /* Only ordered and writeback mode may reach the selection below. */
        if (mode != EXT4_INODE_ORDERED_DATA_MODE &&
            mode != EXT4_INODE_WRITEBACK_DATA_MODE)
                BUG();

        if (IS_DAX(inode)) {
                inode->i_mapping->a_ops = &ext4_dax_aops;
                return;
        }
        if (test_opt(inode->i_sb, DELALLOC)) {
                inode->i_mapping->a_ops = &ext4_da_aops;
                return;
        }
        inode->i_mapping->a_ops = &ext4_aops;
}

/*
 * Zero @length bytes of page cache starting at file offset @from, operating
 * on the single buffer_head of the folio that contains @from.
 *
 * Here we can't skip an unwritten buffer even though it usually reads zero
 * because it might have data in pagecache (eg, if called from ext4_zero_range,
 * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
 * racing writeback can come later and flush the stale pagecache to disk.
 *
 * The folio is looked up (created if missing) and locked here, and it is
 * always unlocked and put again before returning.  Nothing is zeroed and 0 is
 * returned when the buffer is marked freed, or when it is still unmapped
 * after ext4_get_block() (treated as a hole).
 *
 * After zeroing, the buffer is passed to ext4_dirty_journalled_data() for
 * data-journalled inodes; otherwise it is marked dirty and, for ordered-data
 * inodes, the range is registered with ext4_jbd2_inode_add_write().
 *
 * Returns 0 on success, otherwise the error from the folio lookup, the block
 * read, the decryption or the journalling helpers.
 */
static int __ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        ext4_fsblk_t index = from >> PAGE_SHIFT;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize, pos;
        ext4_lblk_t iblock;
        struct inode *inode = mapping->host;
        struct buffer_head *bh;
        struct folio *folio;
        int err = 0;

        /* Allocation is constrained with ~__GFP_FS. */
        folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT,
                                    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
                                    mapping_gfp_constraint(mapping, ~__GFP_FS));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        blocksize = inode->i_sb->s_blocksize;

        /* Logical block number of the first block covered by page @index */
        iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);

        bh = folio_buffers(folio);
        if (!bh)
                bh = create_empty_buffers(folio, blocksize, 0);

        /* Find the buffer that contains "offset" */
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }
        if (buffer_freed(bh)) {
                BUFFER_TRACE(bh, "freed: skip");
                goto unlock;
        }
        if (!buffer_mapped(bh)) {
                BUFFER_TRACE(bh, "unmapped");
                /*
                 * The return value is not checked; success is judged solely
                 * by whether the buffer ends up mapped.
                 */
                ext4_get_block(inode, iblock, bh, 0);
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh)) {
                        BUFFER_TRACE(bh, "still unmapped");
                        goto unlock;
                }
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (folio_test_uptodate(folio))
                set_buffer_uptodate(bh);

        if (!buffer_uptodate(bh)) {
                err = ext4_read_bh_lock(bh, 0, true);
                if (err)
                        goto unlock;
                if (fscrypt_inode_uses_fs_layer_crypto(inode)) {
                        /* We expect the key to be set. */
                        BUG_ON(!fscrypt_has_encryption_key(inode));
                        err = fscrypt_decrypt_pagecache_blocks(folio,
                                                               blocksize,
                                                               bh_offset(bh));
                        if (err) {
                                /* Don't leave undecrypted data marked valid */
                                clear_buffer_uptodate(bh);
                                goto unlock;
                        }
                }
        }
        /* Journalled data: obtain write access before modifying the buffer */
        if (ext4_should_journal_data(inode)) {
                BUFFER_TRACE(bh, "get write access");
                err = ext4_journal_get_write_access(handle, inode->i_sb, bh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto unlock;
        }
        folio_zero_range(folio, offset, length);
        BUFFER_TRACE(bh, "zeroed end of block");

        if (ext4_should_journal_data(inode)) {
                err = ext4_dirty_journalled_data(handle, bh);
        } else {
                err = 0;
                mark_buffer_dirty(bh);
                if (ext4_should_order_data(inode))
                        err = ext4_jbd2_inode_add_write(handle, inode, from,
                                        length);
        }

unlock:
        /* Common exit: drop the folio lock and the lookup reference */
        folio_unlock(folio);
        folio_put(folio);
        return err;
}

/*
 * ext4_block_zero_page_range() zeroes 'length' bytes of a mapping starting
 * at file offset 'from'.  The range to be zeroed must be contained within
 * one block: a 'length' that is negative or that would run past the end of
 * the block holding 'from' is replaced by the distance from 'from' to the
 * end of that block.
 *
 * DAX inodes are handled by dax_zero_range(); everything else goes through
 * __ext4_block_zero_page_range().  Returns whatever the chosen helper
 * returns.
 */
static int ext4_block_zero_page_range(handle_t *handle,
                struct address_space *mapping, loff_t from, loff_t length)
{
        struct inode *inode = mapping->host;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize = inode->i_sb->s_blocksize;
        /* Bytes between 'from' and the end of its block */
        unsigned max = blocksize - (offset & (blocksize - 1));

        if (length < 0 || length > max)
                length = max;

        if (!IS_DAX(inode))
                return __ext4_block_zero_page_range(handle, mapping, from,
                                                    length);

        return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops);
}

/*
 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
 * up to the end of the block which corresponds to `from'.
 * This is required during truncate. We need to physically zero the tail end
 * of that block so it doesn't yield old data if the file is later grown.
 *
 * Returns 0 without doing anything for an encrypted inode whose key is not
 * available; otherwise returns the result of ext4_block_zero_page_range().
 */
static int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from)
{
        struct inode *inode = mapping->host;
        unsigned offset = from & (PAGE_SIZE-1);
        unsigned blocksize;
        unsigned length;

        /* If we are processing an encrypted inode during orphan list handling */
        if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode))
                return 0;

        blocksize = inode->i_sb->s_blocksize;
        /* Distance from `from' to the end of its block */
        length = blocksize - (offset & (blocksize - 1));

        return ext4_block_zero_page_range(handle, mapping, from, length);
}

/*
 * Zero the partially covered blocks at the edges of the byte range
 * [@lstart, @lstart + @length).  Blocks that are fully covered by the range
 * are not touched here.
 *
 * Returns 0 on success or the first error from ext4_block_zero_page_range().
 */
int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t length)
{
        struct super_block *sb = inode->i_sb;
        struct address_space *mapping = inode->i_mapping;
        loff_t byte_end = (lstart + length - 1);
        /* Offsets of the first and of the last byte within their blocks */
        unsigned head_off = lstart & (sb->s_blocksize - 1);
        unsigned tail_off = byte_end & (sb->s_blocksize - 1);
        ext4_fsblk_t first_blk = lstart >> sb->s_blocksize_bits;
        ext4_fsblk_t last_blk = byte_end >> sb->s_blocksize_bits;
        int err;

        /* The whole range sits inside one block and doesn't fill it */
        if (first_blk == last_blk &&
            (head_off || (tail_off != sb->s_blocksize - 1)))
                return ext4_block_zero_page_range(handle, mapping,
                                                  lstart, length);

        /* Range starts mid-block: zero from lstart to the end of that block */
        if (head_off) {
                err = ext4_block_zero_page_range(handle, mapping,
                                                 lstart, sb->s_blocksize);
                if (err)
                        return err;
        }

        /* Range ends exactly on a block boundary: no tail to zero */
        if (tail_off == sb->s_blocksize - 1)
                return 0;

        /* Range ends mid-block: zero from the block start through byte_end */
        return ext4_block_zero_page_range(handle, mapping,
                                          byte_end - tail_off,
                                          tail_off + 1);
}

/*
 * Report whether @inode is of a type that may be truncated.
 *
 * Returns 1 for regular files and directories, and for symlinks for which
 * ext4_inode_is_fast_symlink() is false; returns 0 for everything else.
 */
int ext4_can_truncate(struct inode *inode)
{
        if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
                return 1;
        if (!S_ISLNK(inode->i_mode))
                return 0;
        return !ext4_inode_is_fast_symlink(inode);
}

/*
 * We have to make sure i_disksize gets properly updated before we truncate
 * page cache due to hole punching or zero range. Otherwise i_disksize update
 * can get lost as it may have been postponed to submission of writeback but
 * that will never happen after we truncate page cache.
 *
 * Nothing is done unless i_size lies inside [@offset, @offset + @len] and
 * i_disksize is still smaller than i_size.  Warns if the inode is not
 * locked.  Returns 0, the error from starting the journal handle, or the
 * result of ext4_mark_inode_dirty().
 */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len)
{
        loff_t size = i_size_read(inode);
        handle_t *handle;
        int ret;

        WARN_ON(!inode_is_locked(inode));

        /* The range does not contain i_size */
        if (size < offset || size > offset + len)
                return 0;

        /* The on-disk size is already up to date */
        if (size <= EXT4_I(inode)->i_disksize)
                return 0;

        handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        ext4_update_i_disksize(inode, size);
        ret = ext4_mark_inode_dirty(handle, inode);
        ext4_journal_stop(handle);

        return ret;
}

/*
 * Helper for ext4_truncate_page_cache_block_range().
 *
 * If [@start, @end) contains at least one complete block, look up and lock
 * the cached folio at page index @start >> PAGE_SHIFT, call folio_mkclean()
 * on it and, when that returns nonzero, mark the folio dirty again.  A folio
 * that cannot be looked up is silently skipped.
 */
static inline void ext4_truncate_folio(struct inode *inode,
                                       loff_t start, loff_t end)
{
        unsigned long blocksize = i_blocksize(inode);
        loff_t first_full = round_up(start, blocksize);
        loff_t last_full = round_down(end, blocksize);
        struct folio *folio;

        /* Nothing to be done if no complete block needs to be truncated. */
        if (first_full >= last_full)
                return;

        folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT);
        if (!IS_ERR(folio)) {
                if (folio_mkclean(folio))
                        folio_mark_dirty(folio);
                folio_unlock(folio);
                folio_put(folio);
        }
}

/*
 * Drop the page cache for the byte range [@start, @end) of @inode, doing
 * the preparation that the inode's data mode requires first.
 *
 * Returns 0 on success, or the error from filemap_write_and_wait_range()
 * for data-journalled inodes (in which case nothing is truncated).
 */
int ext4_truncate_page_cache_block_range(struct inode *inode,
                                         loff_t start, loff_t end)
{
        unsigned long blocksize = i_blocksize(inode);
        int ret;

        if (ext4_should_journal_data(inode)) {
                /*
                 * For journalled data we need to write (and checkpoint)
                 * pages before discarding page cache to avoid inconsistent
                 * data on disk in case of crash before freeing or unwritten
                 * converting trans is committed.
                 */
                ret = filemap_write_and_wait_range(inode->i_mapping, start,
                                                   end - 1);
                if (ret)
                        return ret;
        } else if (!IS_ALIGNED(start | end, PAGE_SIZE) &&
                   blocksize < PAGE_SIZE && start < inode->i_size) {
                /*
                 * If the block size is less than the page size, the file's
                 * mapped blocks within one page could be freed or converted
                 * to unwritten.  So it's necessary to remove writable
                 * userspace mappings, and then ext4_page_mkwrite() can be
                 * called during subsequent write access to these partial
                 * folios.
                 */
                loff_t page_boundary = round_up(start, PAGE_SIZE);

                /* Partial folio at the head of the range ... */
                ext4_truncate_folio(inode, start, min(page_boundary, end));
                /* ... and at its tail, when the range crosses a page. */
                if (end > page_boundary)
                        ext4_truncate_folio(inode,
                                            round_down(end, PAGE_SIZE), end);
        }

        truncate_pagecache_range(inode, start, end - 1);
        return 0;
}

/*
 * Callback handed to dax_break_layout_inode() by ext4_break_layouts().
 *
 * Drops the mapping's invalidate lock, calls schedule(), and takes the lock
 * again before returning, so the caller holds the lock on return just as it
 * did on entry.
 */
static void ext4_wait_dax_page(struct inode *inode)
{
        filemap_invalidate_unlock(inode->i_mapping);
        schedule();
        filemap_invalidate_lock(inode->i_mapping);
}

/*
 * Break DAX layouts of @inode via dax_break_layout_inode(), using
 * ext4_wait_dax_page() as the wait callback.
 *
 * The mapping's invalidate_lock must be held by the caller; if it is not,
 * a one-time warning is issued and -EINVAL is returned.  Otherwise returns
 * the result of dax_break_layout_inode().
 */
int ext4_break_layouts(struct inode *inode)
{
        if (!WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
                return dax_break_layout_inode(inode, ext4_wait_dax_page);

        return -EINVAL;
}

/*
 * ext4_punch_hole: punches a hole in a file by releasing the blocks
 * associated with the given offset and length
 *
 * @file:   File whose inode the hole is punched in
 * @offset: The offset where the hole will begin
 * @length: The length of the hole
 *
 * The caller is expected to hold the inode lock (a one-time warning is
 * issued otherwise).
 *
 * Returns: 0 on success or negative on failure
 */

int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
        struct inode *inode = file_inode(file);
        struct super_block *sb = inode->i_sb;
        ext4_lblk_t start_lblk, end_lblk;
        /* Upper bound for the end of the hole: one block below s_bitmap_maxbytes */
        loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
        loff_t end = offset + length;
        handle_t *handle;
        unsigned int credits;
        int ret;

        trace_ext4_punch_hole(inode, offset, length, 0);
        WARN_ON_ONCE(!inode_is_locked(inode));

        /* No need to punch hole beyond i_size */
        if (offset >= inode->i_size)
                return 0;

        /*
         * If the hole extends beyond i_size, set the hole to end after
         * the page that contains i_size, and also make sure the hole does
         * not extend past max_end.
         */
        if (end > inode->i_size)
                end = round_up(inode->i_size, PAGE_SIZE);
        if (end > max_end)
                end = max_end;
        length = end - offset;

        /*
         * Attach jinode to inode for jbd2 if we do any zeroing of partial
         * block.
         */
        if (!IS_ALIGNED(offset | end, sb->s_blocksize)) {
                ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        return ret;
        }


        ret = ext4_update_disksize_before_punch(inode, offset, length);
        if (ret)
                return ret;

        /* Now release the pages and zero block aligned part of pages */
        ret = ext4_truncate_page_cache_block_range(inode, offset, end);
        if (ret)
                return ret;

        /* Journal credits depend on whether the inode is extent-mapped */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
                ext4_std_error(sb, ret);
                return ret;
        }

        /* Zero the partially covered blocks at both edges of the hole */
        ret = ext4_zero_partial_blocks(handle, inode, offset, length);
        if (ret)
                goto out_handle;

        /* If there are blocks to remove, do it */
        start_lblk = EXT4_B_TO_LBLK(inode, offset);
        end_lblk = end >> inode->i_blkbits;

        if (end_lblk > start_lblk) {
                ext4_lblk_t hole_len = end_lblk - start_lblk;

                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);

                ext4_es_remove_extent(inode, start_lblk, hole_len);

                /*
                 * NOTE(review): the extent variant is passed end_lblk - 1
                 * while the indirect variant is passed end_lblk; presumably
                 * one takes an inclusive and the other an exclusive end -
                 * confirm against the callees.
                 */
                if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                        ret = ext4_ext_remove_space(inode, start_lblk,
                                                    end_lblk - 1);
                else
                        ret = ext4_ind_remove_space(handle, inode, start_lblk,
                                                    end_lblk);
                if (ret) {
                        up_write(&EXT4_I(inode)->i_data_sem);
                        goto out_handle;
                }

                /* Record the removed range as a hole in the extent status tree */
                ext4_es_insert_extent(inode, start_lblk, hole_len, ~0,
                                      EXTENT_STATUS_HOLE, 0);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        ext4_fc_track_range(handle, inode, start_lblk, end_lblk);

        ret = ext4_mark_inode_dirty(handle, inode);
        if (unlikely(ret))
                goto out_handle;

        ext4_update_inode_fsync_trans(handle, inode, 1);
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
out_handle:
        /* Every path that started the handle stops it here */
        ext4_journal_stop(handle);
        return ret;
}

/*
 * Make sure @inode has a jbd2_inode attached.
 *
 * Does nothing when one is already attached or when the filesystem has no
 * journal.  Otherwise a jbd2_inode is allocated before taking i_lock and is
 * installed under the lock only if no other attach won the race in the
 * meantime; an allocation that turns out to be unneeded is freed.
 *
 * Returns 0 on success (including the already-attached cases) or -ENOMEM
 * when the allocation failed and no jinode was attached by someone else.
 */
int ext4_inode_attach_jinode(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct jbd2_inode *jinode;

        if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal)
                return 0;

        jinode = jbd2_alloc_inode(GFP_KERNEL);

        spin_lock(&inode->i_lock);
        if (ei->jinode) {
                /* Somebody else attached one first: discard ours, if any */
                spin_unlock(&inode->i_lock);
                if (jinode != NULL)
                        jbd2_free_inode(jinode);
                return 0;
        }
        if (!jinode) {
                spin_unlock(&inode->i_lock);
                return -ENOMEM;
        }
        ei->jinode = jinode;
        jbd2_journal_init_jbd_inode(ei->jinode, inode);
        spin_unlock(&inode->i_lock);

        return 0;
}

/*
 * ext4_truncate()
 *
 * We block out ext4_get_block() block instantiations across the entire
 * transaction, and VFS/VM ensures that ext4_truncate() cannot run
 * simultaneously on behalf of the same inode.
 *
 * As we work through the truncate and commit bits of it to the journal there
 * is one core, guiding principle: the file's tree must always be consistent on
 * disk.  We must be able to restart the truncate after a crash.
 *
 * The file's tree may be transiently inconsistent in memory (although it
 * probably isn't), but whenever we close off and commit a journal transaction,
 * the contents of (the filesystem + the journal) must be consistent and
 * restartable.  It's pretty simple, really: bottom up, right to left (although
 * left-to-right works OK too).
 *
 * Note that at recovery time, journal replay occurs *before* the restart of
 * truncate against the orphan inode list.
 *
 * The committed inode has the new, desired i_size (which is the same as
 * i_disksize in this case).  After a crash, ext4_orphan_cleanup() will see
 * that this inode's truncate did not complete and it will again call
 * ext4_truncate() to have another go.  So there will be instantiated blocks
 * to the right of the truncation point in a crashed ext4 filesystem.  But
 * that's fine - as long as they are linked from the inode, the post-crash
 * ext4_truncate() run will find them and release them.
 *
 * Returns 0 on success (including when the inode type cannot be truncated or
 * the inline-data path handled everything), otherwise the first error seen.
 */
int ext4_truncate(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int credits;
        int err = 0, err2;
        handle_t *handle;
        struct address_space *mapping = inode->i_mapping;

        /*
         * There is a possibility that we're either freeing the inode
         * or it's a completely new inode. In those cases we might not
         * have i_rwsem locked because it's not necessary.
         */
        if (!(inode->i_state & (I_NEW|I_FREEING)))
                WARN_ON(!inode_is_locked(inode));
        trace_ext4_truncate_enter(inode);

        if (!ext4_can_truncate(inode))
                goto out_trace;

        /* Truncating to zero size, unless the NO_AUTO_DA_ALLOC option is set */
        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);

        if (ext4_has_inline_data(inode)) {
                int has_inline = 1;

                /* Done if this fails or the inode still has inline data */
                err = ext4_inline_data_truncate(inode, &has_inline);
                if (err || has_inline)
                        goto out_trace;
        }

        /* If we zero-out tail of the page, we have to create jinode for jbd2 */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
                err = ext4_inode_attach_jinode(inode);
                if (err)
                        goto out_trace;
        }

        /* Journal credits depend on whether the inode is extent-mapped */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
                credits = ext4_blocks_for_truncate(inode);

        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                err = PTR_ERR(handle);
                goto out_trace;
        }

        /*
         * i_size is not block aligned: zero the tail of its block.
         * NOTE(review): the return value is not checked here - confirm
         * that ignoring a failure is intended.
         */
        if (inode->i_size & (inode->i_sb->s_blocksize - 1))
                ext4_block_truncate_page(handle, mapping, inode->i_size);

        /*
         * We add the inode to the orphan list, so that if this
         * truncate spans multiple transactions, and we crash, we will
         * resume the truncate when the filesystem recovers.  It also
         * marks the inode dirty, to catch the new size.
         *
         * Implication: the file must always be in a sane, consistent
         * truncatable state while each transaction commits.
         */
        err = ext4_orphan_add(handle, inode);
        if (err)
                goto out_stop;

        /* Same semaphore as &ei->i_data_sem released below */
        down_write(&EXT4_I(inode)->i_data_sem);

        ext4_discard_preallocations(inode);

        /* Only the extent variant reports a status into err */
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                err = ext4_ext_truncate(handle, inode);
        else
                ext4_ind_truncate(handle, inode);

        up_write(&ei->i_data_sem);
        if (err)
                goto out_stop;

        if (IS_SYNC(inode))
                ext4_handle_sync(handle);

out_stop:
        /*
         * If this was a simple ftruncate() and the file will remain alive,
         * then we need to clear up the orphan record which we created above.
         * However, if this was a real unlink then we were called by
         * ext4_evict_inode(), and we allow that function to clean up the
         * orphan info for us.
         */
        if (inode->i_nlink)
                ext4_orphan_del(handle, inode);

        /* Timestamps are updated and the inode dirtied on error paths too */
        inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
        err2 = ext4_mark_inode_dirty(handle, inode);
        /* Keep the first error; only report err2 if nothing failed earlier */
        if (unlikely(err2 && !err))
                err = err2;
        ext4_journal_stop(handle);

out_trace:
        trace_ext4_truncate_exit(inode);
        return err;
}

/*
 * Read the i_version of @inode without modifying it.  Inodes flagged with
 * EXT4_EA_INODE_FL are read through inode_peek_iversion_raw(), all others
 * through inode_peek_iversion().
 */
static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
{
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
                return inode_peek_iversion_raw(inode);

        return inode_peek_iversion(inode);
}

/*
 * Store the in-memory i_blocks of @ei's inode into @raw_inode and keep the
 * EXT4_INODE_HUGE_FILE inode flag in sync with the representation used.
 *
 * Returns 0 on success, or -EFSCORRUPTED when i_blocks does not fit in 32
 * bits but the filesystem lacks the huge_file feature.
 */
static int ext4_inode_blocks_set(struct ext4_inode *raw_inode,
                                 struct ext4_inode_info *ei)
{
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
        u64 i_blocks = READ_ONCE(inode->i_blocks);

        if (i_blocks > ~0U) {
                /*
                 * This should never happen since sb->s_maxbytes should not
                 * have allowed this, sb->s_maxbytes was set according to the
                 * huge_file feature in ext4_fill_super().
                 */
                if (!ext4_has_feature_huge_file(sb))
                        return -EFSCORRUPTED;

                if (i_blocks > 0xffffffffffffULL) {
                        ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                        /* i_block is stored in file system block size */
                        i_blocks = i_blocks >> (inode->i_blkbits - 9);
                        raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                        raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
                } else {
                        /*
                         * i_blocks can be represented in a 48 bit variable
                         * as multiple of 512 bytes
                         */
                        raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
                        raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
                        ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
                }
                return 0;
        }

        /*
         * i_blocks can be represented in a 32 bit variable
         * as multiple of 512 bytes
         */
        raw_inode->i_blocks_lo   = cpu_to_le32(i_blocks);
        raw_inode->i_blocks_high = 0;
        ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
        return 0;
}

/*
 * Fill the on-disk inode @raw_inode from the in-memory @inode and finish by
 * setting its checksum.
 *
 * An error from ext4_inode_blocks_set() does not stop the remaining fields
 * from being filled in; it is remembered and returned at the end.
 *
 * Returns 0 on success, the error from ext4_inode_blocks_set(), or
 * -EFSCORRUPTED when the inode has a non-default project id but the
 * filesystem lacks the project feature.
 */
static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;
        int block;
        int err;

        err = ext4_inode_blocks_set(raw_inode, ei);

        raw_inode->i_mode = cpu_to_le16(inode->i_mode);
        i_uid = i_uid_read(inode);
        i_gid = i_gid_read(inode);
        i_projid = from_kprojid(&init_user_ns, ei->i_projid);
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
                /*
                 * Fix up interoperability with old kernels. Otherwise,
                 * old inodes get re-used with the upper 16 bits of the
                 * uid/gid intact.
                 */
                if (ei->i_dtime && list_empty(&ei->i_orphan)) {
                        raw_inode->i_uid_high = 0;
                        raw_inode->i_gid_high = 0;
                } else {
                        raw_inode->i_uid_high =
                                cpu_to_le16(high_16_bits(i_uid));
                        raw_inode->i_gid_high =
                                cpu_to_le16(high_16_bits(i_gid));
                }
        } else {
                /* NO_UID32: only the low halves are stored, via fs_high2low*() */
                raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid));
                raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid));
                raw_inode->i_uid_high = 0;
                raw_inode->i_gid_high = 0;
        }
        raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);

        EXT4_INODE_SET_CTIME(inode, raw_inode);
        EXT4_INODE_SET_MTIME(inode, raw_inode);
        EXT4_INODE_SET_ATIME(inode, raw_inode);
        EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode);

        raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
        raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF);
        /* The high ACL half is only written when HURD_COMPAT is not set */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT)))
                raw_inode->i_file_acl_high =
                        cpu_to_le16(ei->i_file_acl >> 32);
        raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
        /* The size written to disk is i_disksize, not i_size */
        ext4_isize_set(raw_inode, ei->i_disksize);

        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
        if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
                /* Device nodes keep their device number in i_block[0..1] */
                if (old_valid_dev(inode->i_rdev)) {
                        raw_inode->i_block[0] =
                                cpu_to_le32(old_encode_dev(inode->i_rdev));
                        raw_inode->i_block[1] = 0;
                } else {
                        raw_inode->i_block[0] = 0;
                        raw_inode->i_block[1] =
                                cpu_to_le32(new_encode_dev(inode->i_rdev));
                        raw_inode->i_block[2] = 0;
                }
        } else if (!ext4_has_inline_data(inode)) {
                /* i_data[] is copied as is, without byte-order conversion */
                for (block = 0; block < EXT4_N_BLOCKS; block++)
                        raw_inode->i_block[block] = ei->i_data[block];
        }

        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = ext4_inode_peek_iversion(inode);

                /* Low 32 bits always; high 32 bits only if the field fits */
                raw_inode->i_disk_version = cpu_to_le32(ivers);
                if (ei->i_extra_isize) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                raw_inode->i_version_hi =
                                        cpu_to_le32(ivers >> 32);
                        raw_inode->i_extra_isize =
                                cpu_to_le16(ei->i_extra_isize);
                }
        }

        /* Keep an earlier error; otherwise flag the inconsistent project id */
        if (i_projid != EXT4_DEF_PROJID &&
            !ext4_has_feature_project(inode->i_sb))
                err = err ?: -EFSCORRUPTED;

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                raw_inode->i_projid = cpu_to_le32(i_projid);

        /* Checksum last, after all other fields have been written */
        ext4_inode_csum_set(inode, raw_inode, ei);
        return err;
}

/*
 * ext4_get_inode_loc returns with an extra refcount against the inode's
 * underlying buffer_head on success. If we pass 'inode' and it does not
 * have in-inode xattr, we have all inode data in memory that is needed
 * to recreate the on-disk version of this inode.
 *
 * @sb:        superblock of the filesystem
 * @ino:       inode number to locate
 * @inode:     in-memory inode, or NULL; when NULL the "skip the read"
 *             shortcut below is never taken
 * @iloc:      filled with block_group, offset and, on success, bh
 * @ret_block: if non-NULL, receives the block number when reading the inode
 *             table block fails
 *
 * Returns 0 on success with iloc->bh set.  On failure iloc->bh is NULL and
 * the return value is -EFSCORRUPTED (inode number out of range or invalid
 * inode table block), -EIO (no group descriptor, or read failure) or
 * -ENOMEM (sb_getblk() failed).
 *
 * NOTE(review): the -EIO returned for a missing group descriptor does not
 * set *ret_block, so callers that report *ret_block on -EIO will see their
 * own initial value on that path.
 */
static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino,
                                struct inode *inode, struct ext4_iloc *iloc,
                                ext4_fsblk_t *ret_block)
{
        struct ext4_group_desc        *gdp;
        struct buffer_head        *bh;
        ext4_fsblk_t                block;
        struct blk_plug                plug;
        int                        inodes_per_block, inode_offset;

        iloc->bh = NULL;
        if (ino < EXT4_ROOT_INO ||
            ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))
                return -EFSCORRUPTED;

        iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
        gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
        if (!gdp)
                return -EIO;

        /*
         * Figure out the offset within the block group inode table
         */
        inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        inode_offset = ((ino - 1) %
                        EXT4_INODES_PER_GROUP(sb));
        /* Byte offset of this inode within its inode table block */
        iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);

        block = ext4_inode_table(sb, gdp);
        if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
            (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
                ext4_error(sb, "Invalid inode table block %llu in "
                           "block_group %u", block, iloc->block_group);
                return -EFSCORRUPTED;
        }
        /* Advance to the inode table block that holds this inode */
        block += (inode_offset / inodes_per_block);

        bh = sb_getblk(sb, block);
        if (unlikely(!bh))
                return -ENOMEM;
        if (ext4_buffer_uptodate(bh))
                goto has_buffer;

        lock_buffer(bh);
        if (ext4_buffer_uptodate(bh)) {
                /* Someone brought it uptodate while we waited */
                unlock_buffer(bh);
                goto has_buffer;
        }

        /*
         * If we have all information of the inode in memory and this
         * is the only valid inode in the block, we need not read the
         * block.
         */
        if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) {
                struct buffer_head *bitmap_bh;
                int i, start;

                /* Index of the first inode sharing this inode table block */
                start = inode_offset & ~(inodes_per_block - 1);

                /* Is the inode bitmap in cache? */
                bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp));
                if (unlikely(!bitmap_bh))
                        goto make_io;

                /*
                 * If the inode bitmap isn't in cache then the
                 * optimisation may end up performing two reads instead
                 * of one, so skip it.
                 */
                if (!buffer_uptodate(bitmap_bh)) {
                        brelse(bitmap_bh);
                        goto make_io;
                }
                /* Look for any other in-use inode in the same block */
                for (i = start; i < start + inodes_per_block; i++) {
                        if (i == inode_offset)
                                continue;
                        if (ext4_test_bit(i, bitmap_bh->b_data))
                                break;
                }
                brelse(bitmap_bh);
                /* The loop ran to completion: no other inode is in use */
                if (i == start + inodes_per_block) {
                        struct ext4_inode *raw_inode =
                                (struct ext4_inode *) (bh->b_data + iloc->offset);

                        /* all other inodes are free, so skip I/O */
                        memset(bh->b_data, 0, bh->b_size);
                        /* The return value of ext4_fill_raw_inode() is not checked */
                        if (!ext4_test_inode_state(inode, EXT4_STATE_NEW))
                                ext4_fill_raw_inode(inode, raw_inode);
                        set_buffer_uptodate(bh);
                        unlock_buffer(bh);
                        goto has_buffer;
                }
        }

make_io:
        /*
         * If we need to do any I/O, try to pre-readahead extra
         * blocks from the inode table.
         */
        blk_start_plug(&plug);
        if (EXT4_SB(sb)->s_inode_readahead_blks) {
                ext4_fsblk_t b, end, table;
                unsigned num;
                __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks;

                table = ext4_inode_table(sb, gdp);
                /* s_inode_readahead_blks is always a power of 2 */
                b = block & ~((ext4_fsblk_t) ra_blks - 1);
                /* Don't start before the beginning of the inode table */
                if (table > b)
                        b = table;
                end = b + ra_blks;
                /* Limit readahead to the part of the table counted as in use */
                num = EXT4_INODES_PER_GROUP(sb);
                if (ext4_has_group_desc_csum(sb))
                        num -= ext4_itable_unused_count(sb, gdp);
                table += num / inodes_per_block;
                if (end > table)
                        end = table;
                /* Note: the block at 'end' itself is included */
                while (b <= end)
                        ext4_sb_breadahead_unmovable(sb, b++);
        }

        /*
         * There are other valid inodes in the buffer, this inode
         * has in-inode xattrs, or we don't have this inode in memory.
         * Read the block from disk.
         */
        trace_ext4_load_inode(sb, ino);
        ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL,
                            ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO));
        blk_finish_plug(&plug);
        wait_on_buffer(bh);
        if (!buffer_uptodate(bh)) {
                /* Read failed: report the block and drop our reference */
                if (ret_block)
                        *ret_block = block;
                brelse(bh);
                return -EIO;
        }
has_buffer:
        iloc->bh = bh;
        return 0;
}

/*
 * Look up the inode-table location of @inode, passing NULL as the in-memory
 * inode argument of __ext4_get_inode_loc().
 *
 * A result of -EIO is additionally reported through ext4_error_inode_block()
 * against the block number handed back by the lookup.  The return value is
 * whatever __ext4_get_inode_loc() returned.
 */
static int __ext4_get_inode_loc_noinmem(struct inode *inode,
                                        struct ext4_iloc *iloc)
{
        ext4_fsblk_t bad_blk = 0;
        int err;

        err = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc,
                                   &bad_blk);
        if (err != -EIO)
                return err;

        ext4_error_inode_block(inode, bad_blk, EIO,
                               "unable to read itable block");
        return err;
}

/*
 * Look up the inode-table location of @inode, handing the in-memory inode
 * itself to __ext4_get_inode_loc().
 *
 * A result of -EIO is additionally reported through ext4_error_inode_block()
 * against the block number handed back by the lookup.  The return value is
 * whatever __ext4_get_inode_loc() returned.
 */
int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc)
{
        ext4_fsblk_t bad_blk = 0;
        int err;

        err = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc,
                                   &bad_blk);
        if (err != -EIO)
                return err;

        ext4_error_inode_block(inode, bad_blk, EIO,
                               "unable to read itable block");
        return err;
}


/*
 * Look up the inode-table location of inode number @ino on @sb.
 *
 * Neither an in-memory inode nor an error-block pointer is supplied to
 * __ext4_get_inode_loc(), and no error is logged here; the lookup's return
 * value is passed straight back to the caller.
 */
int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc)
{
        int err;

        err = __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL);
        return err;
}

/*
 * Decide whether S_DAX should be set for @inode.
 *
 * DAX is refused when the DAX_NEVER mount option is set, the inode is not a
 * regular file, its data is journalled, it has inline data, it carries the
 * ENCRYPT or VERITY inode flag, or EXT4_FLAGS_BDEV_IS_DAX is not set in the
 * superblock info.  Otherwise DAX is enabled if the DAX_ALWAYS mount option
 * is set, and else only if the inode itself has EXT4_INODE_DAX.
 */
static bool ext4_should_enable_dax(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

        /* Any one of these vetoes DAX; they are evaluated in this order. */
        if (test_opt2(inode->i_sb, DAX_NEVER) ||
            !S_ISREG(inode->i_mode) ||
            ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode) ||
            ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT) ||
            ext4_test_inode_flag(inode, EXT4_INODE_VERITY) ||
            !test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags))
                return false;

        /* The mount option forces DAX on; otherwise it is per inode. */
        if (test_opt(inode->i_sb, DAX_ALWAYS))
                return true;
        return ext4_test_inode_flag(inode, EXT4_INODE_DAX);
}

/*
 * Translate the ext4 flags held in EXT4_I(inode)->i_flags into VFS S_* flags
 * and install them with inode_set_flags().
 *
 * @init: when true, the DAX decision is (re)evaluated through
 *        ext4_should_enable_dax(); a warning is emitted once if the inode
 *        already has DAX enabled in that case.
 */
void ext4_set_inode_flags(struct inode *inode, bool init)
{
        const unsigned int vfs_mask =
                S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
                S_ENCRYPTED|S_CASEFOLD|S_VERITY;
        unsigned int ext4_fl = EXT4_I(inode)->i_flags;
        unsigned int vfs_fl;

        WARN_ON_ONCE(IS_DAX(inode) && init);

        vfs_fl  = (ext4_fl & EXT4_SYNC_FL) ? S_SYNC : 0;
        vfs_fl |= (ext4_fl & EXT4_APPEND_FL) ? S_APPEND : 0;
        vfs_fl |= (ext4_fl & EXT4_IMMUTABLE_FL) ? S_IMMUTABLE : 0;
        vfs_fl |= (ext4_fl & EXT4_NOATIME_FL) ? S_NOATIME : 0;
        vfs_fl |= (ext4_fl & EXT4_DIRSYNC_FL) ? S_DIRSYNC : 0;

        /*
         * S_DAX is part of the mask given to inode_set_flags(), so an
         * already-set S_DAX has to be carried over explicitly.
         */
        vfs_fl |= inode->i_flags & S_DAX;
        if (init && ext4_should_enable_dax(inode))
                vfs_fl |= S_DAX;

        vfs_fl |= (ext4_fl & EXT4_ENCRYPT_FL) ? S_ENCRYPTED : 0;
        vfs_fl |= (ext4_fl & EXT4_CASEFOLD_FL) ? S_CASEFOLD : 0;
        vfs_fl |= (ext4_fl & EXT4_VERITY_FL) ? S_VERITY : 0;

        inode_set_flags(inode, vfs_fl, vfs_mask);
}

/*
 * Decode the block count stored in @raw_inode.
 *
 * Without the huge_file feature only the low 32 bits are used.  With it, the
 * 16-bit high part and the 32-bit low part form one 48-bit value; if the
 * inode also has EXT4_INODE_HUGE_FILE set, that value is shifted left by
 * (i_blkbits - 9) before being returned.
 */
static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
                                  struct ext4_inode_info *ei)
{
        struct inode *inode = &(ei->vfs_inode);
        struct super_block *sb = inode->i_sb;
        blkcnt_t nblocks;

        if (!ext4_has_feature_huge_file(sb))
                return le32_to_cpu(raw_inode->i_blocks_lo);

        /* combined 48 bit field */
        nblocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
                                le32_to_cpu(raw_inode->i_blocks_lo);

        /* on-disk count is in filesystem-block units for huge files */
        if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE))
                nblocks <<= inode->i_blkbits - 9;

        return nblocks;
}

/*
 * Inspect the area that follows the fixed and extra inode fields of
 * @raw_inode for in-inode extended attributes.
 *
 * If the inode has xattr space and the xattr magic is found there, the
 * xattr area is checked with xattr_check_inode(), EXT4_STATE_XATTR is set,
 * and inline data is looked up; EXT4_STATE_MAY_INLINE_DATA is set when the
 * inode turns out to have inline data.  Otherwise i_inline_off is cleared.
 *
 * Returns 0 or the error from xattr_check_inode() /
 * ext4_find_inline_data_nolock().
 */
static inline int ext4_iget_extra_inode(struct inode *inode,
                                         struct ext4_inode *raw_inode,
                                         struct ext4_inode_info *ei)
{
        __le32 *magic = (void *)raw_inode +
                        EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
        int err;

        /* magic is only dereferenced once xattr space is known to exist */
        if (!EXT4_INODE_HAS_XATTR_SPACE(inode) ||
            *magic != cpu_to_le32(EXT4_XATTR_MAGIC)) {
                EXT4_I(inode)->i_inline_off = 0;
                return 0;
        }

        err = xattr_check_inode(inode, IHDR(inode, raw_inode),
                                ITAIL(inode, raw_inode));
        if (err)
                return err;

        ext4_set_inode_state(inode, EXT4_STATE_XATTR);

        err = ext4_find_inline_data_nolock(inode);
        if (err)
                return err;

        if (ext4_has_inline_data(inode))
                ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
        return 0;
}

/*
 * Store the project id of @inode in @projid.
 *
 * Returns 0 on success, or -EOPNOTSUPP (leaving @projid untouched) when the
 * filesystem does not have the project feature.
 */
int ext4_get_projid(struct inode *inode, kprojid_t *projid)
{
        if (ext4_has_feature_project(inode->i_sb)) {
                *projid = EXT4_I(inode)->i_projid;
                return 0;
        }

        return -EOPNOTSUPP;
}

/*
 * ext4 manages i_version itself for ea inodes: it keeps the lower 32 bits of
 * the refcount there.  Inodes with EXT4_EA_INODE_FL therefore get the raw
 * value stored; all others go through inode_set_iversion_queried().
 */
static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val)
{
        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                inode_set_iversion_raw(inode, val);
                return;
        }

        inode_set_iversion_queried(inode, val);
}

/*
 * Check that an inode obtained by iget matches what the caller asked for.
 *
 * @inode:    inode to validate
 * @flags:    EXT4_IGET_* flags of the lookup; EXT4_IGET_EA_INODE,
 *            EXT4_IGET_HANDLE and EXT4_IGET_BAD are examined
 * @function: caller's function name, forwarded to ext4_error_inode()
 * @line:     caller's line number, forwarded to ext4_error_inode()
 *
 * Returns 0 if the inode is acceptable, -ESTALE (without logging an error)
 * when a handle-based lookup hits an inode carrying EXT4_EA_INODE_FL, and
 * -EFSCORRUPTED after reporting through ext4_error_inode() otherwise.
 */
static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
                            const char *function, unsigned int line)
{
        const char *err_str;

        if (flags & EXT4_IGET_EA_INODE) {
                /* Caller wants an ea_inode: the flag must be present ... */
                if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                        err_str = "missing EA_INODE flag";
                        goto error;
                }
                /* ... and it must not carry extended attributes itself. */
                if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
                    EXT4_I(inode)->i_file_acl) {
                        err_str = "ea_inode with extended attributes";
                        goto error;
                }
        } else {
                if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
                        /*
                         * open_by_handle_at() could provide an old inode number
                         * that has since been reused for an ea_inode; this does
                         * not indicate filesystem corruption
                         */
                        if (flags & EXT4_IGET_HANDLE)
                                return -ESTALE;
                        err_str = "unexpected EA_INODE flag";
                        goto error;
                }
        }
        if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) {
                err_str = "unexpected bad inode w/o EXT4_IGET_BAD";
                goto error;
        }
        return 0;

error:
        /*
         * Pass the message as an argument to a constant format rather than
         * as the format itself, so its text is never parsed for conversions.
         */
        ext4_error_inode(inode, function, line, 0, "%s", err_str);
        return -EFSCORRUPTED;
}

/*
 * __ext4_iget() - obtain the in-core inode for inode number @ino on @sb.
 *
 * @sb:       superblock the inode belongs to
 * @ino:      inode number to look up
 * @flags:    EXT4_IGET_* modifiers; EXT4_IGET_SPECIAL and EXT4_IGET_HANDLE
 *            are tested here, the rest are evaluated by check_igot_inode()
 * @function: caller's function name, forwarded to the error helpers
 * @line:     caller's line number, forwarded to the error helpers
 *
 * The inode number is range-checked first, then iget_locked() is called.  An
 * inode that comes back without I_NEW is only run through
 * check_igot_inode().  A new inode is filled in from the raw on-disk inode:
 * extra_isize, checksum, ownership, link count, flags, sizes, block map,
 * timestamps and version are read and validated, the inode/file operations
 * are selected from i_mode, and the inode is unlocked.
 *
 * Returns the inode on success, or an ERR_PTR() carrying -ENOMEM, -ESTALE,
 * -EFSCORRUPTED, -EFSBADCRC or an error returned by one of the helpers.  When
 * a new inode fails validation the buffer reference is dropped and
 * iget_failed() is called on it.
 */
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                          ext4_iget_flags flags, const char *function,
                          unsigned int line)
{
        struct ext4_iloc iloc;
        struct ext4_inode *raw_inode;
        struct ext4_inode_info *ei;
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct inode *inode;
        journal_t *journal = EXT4_SB(sb)->s_journal;
        long ret;
        loff_t size;
        int block;
        uid_t i_uid;
        gid_t i_gid;
        projid_t i_projid;

        /*
         * Reject inode numbers that are out of range, and - unless the
         * caller passed EXT4_IGET_SPECIAL - the reserved inodes below
         * EXT4_FIRST_INO (other than the root inode) as well as the quota
         * and orphan-file inodes named in the superblock.  For a handle
         * lookup this is merely a stale handle, not corruption.
         */
        if ((!(flags & EXT4_IGET_SPECIAL) &&
             ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
              ino == le32_to_cpu(es->s_usr_quota_inum) ||
              ino == le32_to_cpu(es->s_grp_quota_inum) ||
              ino == le32_to_cpu(es->s_prj_quota_inum) ||
              ino == le32_to_cpu(es->s_orphan_file_inum))) ||
            (ino < EXT4_ROOT_INO) ||
            (ino > le32_to_cpu(es->s_inodes_count))) {
                if (flags & EXT4_IGET_HANDLE)
                        return ERR_PTR(-ESTALE);
                __ext4_error(sb, function, line, false, EFSCORRUPTED, 0,
                             "inode #%lu: comm %s: iget: illegal inode #",
                             ino, current->comm);
                return ERR_PTR(-EFSCORRUPTED);
        }

        inode = iget_locked(sb, ino);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        /* Not a new inode: only check it against the caller's @flags. */
        if (!(inode->i_state & I_NEW)) {
                ret = check_igot_inode(inode, flags, function, line);
                if (ret) {
                        iput(inode);
                        return ERR_PTR(ret);
                }
                return inode;
        }

        ei = EXT4_I(inode);
        /* bad_inode does brelse(iloc.bh) even if the lookup below fails */
        iloc.bh = NULL;

        ret = __ext4_get_inode_loc_noinmem(inode, &iloc);
        if (ret < 0)
                goto bad_inode;
        raw_inode = ext4_raw_inode(&iloc);

        /*
         * A handle lookup that lands on a slot with neither links nor a
         * mode is reported as stale rather than as corruption.
         */
        if ((flags & EXT4_IGET_HANDLE) &&
            (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
                ret = -ESTALE;
                goto bad_inode;
        }

        /*
         * i_extra_isize must keep the inode within EXT4_INODE_SIZE and be a
         * multiple of 4.
         */
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
                if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
                        EXT4_INODE_SIZE(inode->i_sb) ||
                    (ei->i_extra_isize & 3)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: bad extra_isize %u "
                                         "(inode size %u)",
                                         ei->i_extra_isize,
                                         EXT4_INODE_SIZE(inode->i_sb));
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
        } else
                ei->i_extra_isize = 0;

        /* Precompute checksum seed for inode metadata */
        if (ext4_has_feature_metadata_csum(sb)) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                __u32 csum;
                __le32 inum = cpu_to_le32(inode->i_ino);
                __le32 gen = raw_inode->i_generation;
                csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
                                   sizeof(inum));
                ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
                                              sizeof(gen));
        }

        /*
         * A checksum mismatch (or a simulated one) is fatal unless the
         * mount state has EXT4_FC_REPLAY set.
         */
        if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
            ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) &&
             (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) {
                ext4_error_inode_err(inode, function, line, 0,
                                EFSBADCRC, "iget: checksum invalid");
                ret = -EFSBADCRC;
                goto bad_inode;
        }

        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
        /* The on-disk project id is used only if the field fits in the inode. */
        if (ext4_has_feature_project(sb) &&
            EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
            EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
                i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
        else
                i_projid = EXT4_DEF_PROJID;

        /* Add the high 16 bits of uid/gid unless mounted with NO_UID32. */
        if (!(test_opt(inode->i_sb, NO_UID32))) {
                i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
                i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
        }
        i_uid_write(inode, i_uid);
        i_gid_write(inode, i_gid);
        ei->i_projid = make_kprojid(&init_user_ns, i_projid);
        set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));

        ext4_clear_state_flags(ei);        /* Only relevant on 32-bit archs */
        ei->i_inline_off = 0;
        ei->i_dir_start_lookup = 0;
        ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
        /* We now have enough fields to check if the inode was active or not.
         * This is needed because nfsd might try to access dead inodes;
         * the test is the same one that e2fsck uses.
         * NeilBrown 1999oct15
         */
        if (inode->i_nlink == 0) {
                if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
                     !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
                    ino != EXT4_BOOT_LOADER_INO) {
                        /* this inode is deleted or unallocated */
                        if (flags & EXT4_IGET_SPECIAL) {
                                ext4_error_inode(inode, function, line, 0,
                                                 "iget: special inode unallocated");
                                ret = -EFSCORRUPTED;
                        } else
                                ret = -ESTALE;
                        goto bad_inode;
                }
                /* The only unlinked inodes we let through here have
                 * valid i_mode and are being read by the orphan
                 * recovery code: that's fine, we're about to complete
                 * the process of deleting those.
                 * OR it is the EXT4_BOOT_LOADER_INO which is
                 * not initialized on a new filesystem. */
        }
        ei->i_flags = le32_to_cpu(raw_inode->i_flags);
        ext4_set_inode_flags(inode, true);
        inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
        ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
        if (ext4_has_feature_64bit(sb))
                ei->i_file_acl |=
                        ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
        inode->i_size = ext4_isize(sb, raw_inode);
        /* A negative size can only come from a corrupted on-disk value. */
        if ((size = i_size_read(inode)) < 0) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad i_size value: %lld", size);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        /*
         * If dir_index is not enabled but there's dir with INDEX flag set,
         * we'd normally treat htree data as empty space. But with metadata
         * checksumming that corrupts checksums so forbid that.
         */
        if (!ext4_has_feature_dir_index(sb) &&
            ext4_has_feature_metadata_csum(sb) &&
            ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
                ext4_error_inode(inode, function, line, 0,
                         "iget: Dir with htree data on filesystem without dir_index feature.");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
        ei->i_reserved_quota = 0;
#endif
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
        ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
         */
        for (block = 0; block < EXT4_N_BLOCKS; block++)
                ei->i_data[block] = raw_inode->i_block[block];
        INIT_LIST_HEAD(&ei->i_orphan);
        ext4_fc_init_inode(&ei->vfs_inode);

        /*
         * Set transaction id's of transactions that have to be committed
         * to finish f[data]sync. We set them to currently running transaction
         * as we cannot be sure that the inode or some of its metadata isn't
         * part of the transaction - the inode could have been reclaimed and
         * now it is reread from disk.
         */
        if (journal) {
                transaction_t *transaction;
                tid_t tid;

                read_lock(&journal->j_state_lock);
                /* Prefer the running transaction, then the committing one. */
                if (journal->j_running_transaction)
                        transaction = journal->j_running_transaction;
                else
                        transaction = journal->j_committing_transaction;
                if (transaction)
                        tid = transaction->t_tid;
                else
                        tid = journal->j_commit_sequence;
                read_unlock(&journal->j_state_lock);
                ei->i_sync_tid = tid;
                ei->i_datasync_tid = tid;
        }

        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                if (ei->i_extra_isize == 0) {
                        /* The extra space is currently unused. Use it. */
                        BUILD_BUG_ON(sizeof(struct ext4_inode) & 3);
                        ei->i_extra_isize = sizeof(struct ext4_inode) -
                                            EXT4_GOOD_OLD_INODE_SIZE;
                } else {
                        /* Look for in-inode xattrs / inline data. */
                        ret = ext4_iget_extra_inode(inode, raw_inode, ei);
                        if (ret)
                                goto bad_inode;
                }
        }

        EXT4_INODE_GET_CTIME(inode, raw_inode);
        EXT4_INODE_GET_ATIME(inode, raw_inode);
        EXT4_INODE_GET_MTIME(inode, raw_inode);
        EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode);

        /* The version is not loaded at all when HURD_COMPAT is set. */
        if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) {
                u64 ivers = le32_to_cpu(raw_inode->i_disk_version);

                if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
                        if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi))
                                ivers |=
                    (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
                }
                ext4_inode_set_iversion_queried(inode, ivers);
        }

        ret = 0;
        if (ei->i_file_acl &&
            !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) {
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bad extended attribute block %llu",
                                 ei->i_file_acl);
                ret = -EFSCORRUPTED;
                goto bad_inode;
        } else if (!ext4_has_inline_data(inode)) {
                /* validate the block references in the inode */
                if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) &&
                        (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                        (S_ISLNK(inode->i_mode) &&
                        !ext4_inode_is_fast_symlink(inode)))) {
                        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                                ret = ext4_ext_check_inode(inode);
                        else
                                ret = ext4_ind_check_inode(inode);
                }
        }
        if (ret)
                goto bad_inode;

        /* Select the inode and file operations according to the file type. */
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
                ext4_set_aops(inode);
        } else if (S_ISDIR(inode->i_mode)) {
                inode->i_op = &ext4_dir_inode_operations;
                inode->i_fop = &ext4_dir_operations;
        } else if (S_ISLNK(inode->i_mode)) {
                /* VFS does not allow setting these so must be corruption */
                if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) {
                        ext4_error_inode(inode, function, line, 0,
                                         "iget: immutable or append flags "
                                         "not allowed on symlinks");
                        ret = -EFSCORRUPTED;
                        goto bad_inode;
                }
                if (IS_ENCRYPTED(inode)) {
                        inode->i_op = &ext4_encrypted_symlink_inode_operations;
                } else if (ext4_inode_is_fast_symlink(inode)) {
                        inode->i_op = &ext4_fast_symlink_inode_operations;
                        /*
                         * The target lives in i_data: it must be non-empty,
                         * leave room for a terminator, and have no NUL
                         * before i_size.
                         */
                        if (inode->i_size == 0 ||
                            inode->i_size >= sizeof(ei->i_data) ||
                            strnlen((char *)ei->i_data, inode->i_size + 1) !=
                                                                inode->i_size) {
                                ext4_error_inode(inode, function, line, 0,
                                        "invalid fast symlink length %llu",
                                         (unsigned long long)inode->i_size);
                                ret = -EFSCORRUPTED;
                                goto bad_inode;
                        }
                        inode_set_cached_link(inode, (char *)ei->i_data,
                                              inode->i_size);
                } else {
                        inode->i_op = &ext4_symlink_inode_operations;
                }
        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
              S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                inode->i_op = &ext4_special_inode_operations;
                /* i_block[0] holds an old-style dev number, i_block[1] a new-style one. */
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
                           old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else if (ino == EXT4_BOOT_LOADER_INO) {
                /* Boot loader inode with no recognised file type. */
                make_bad_inode(inode);
        } else {
                ret = -EFSCORRUPTED;
                ext4_error_inode(inode, function, line, 0,
                                 "iget: bogus i_mode (%o)", inode->i_mode);
                goto bad_inode;
        }
        if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) {
                ext4_error_inode(inode, function, line, 0,
                                 "casefold flag without casefold feature");
                ret = -EFSCORRUPTED;
                goto bad_inode;
        }
        ret = check_igot_inode(inode, flags, function, line);
        /*
         * -ESTALE here means there is nothing inherently wrong with the inode,
         * it's just not an inode we can return for an fhandle lookup.
         */
        if (ret == -ESTALE) {
                brelse(iloc.bh);
                unlock_new_inode(inode);
                iput(inode);
                return ERR_PTR(-ESTALE);
        }
        if (ret)
                goto bad_inode;
        brelse(iloc.bh);

        unlock_new_inode(inode);
        return inode;

bad_inode:
        /* Common failure exit for a new inode: drop the buffer, fail the iget. */
        brelse(iloc.bh);
        iget_failed(inode);
        return ERR_PTR(ret);
}

/*
 * If inode number @ino is found by find_inode_by_ino_rcu() and is dirty only
 * because of timestamps, clear I_DIRTY_TIME on it and copy its ctime, mtime
 * and atime into @raw_inode, refreshing the raw inode checksum.
 *
 * @orig_ino is the inode whose write-back triggered this; it is only passed
 * to the tracepoint.
 */
static void __ext4_update_other_inode_time(struct super_block *sb,
                                           unsigned long orig_ino,
                                           unsigned long ino,
                                           struct ext4_inode *raw_inode)
{
        struct ext4_inode_info *ei;
        struct inode *inode;

        inode = find_inode_by_ino_rcu(sb, ino);
        /* Unlocked pre-check; repeated below under i_lock. */
        if (!inode || !inode_is_dirtytime_only(inode))
                return;

        spin_lock(&inode->i_lock);
        if (!inode_is_dirtytime_only(inode)) {
                spin_unlock(&inode->i_lock);
                return;
        }
        inode->i_state &= ~I_DIRTY_TIME;
        spin_unlock(&inode->i_lock);

        ei = EXT4_I(inode);
        spin_lock(&ei->i_raw_lock);
        EXT4_INODE_SET_CTIME(inode, raw_inode);
        EXT4_INODE_SET_MTIME(inode, raw_inode);
        EXT4_INODE_SET_ATIME(inode, raw_inode);
        ext4_inode_csum_set(inode, raw_inode, ei);
        spin_unlock(&ei->i_raw_lock);

        trace_ext4_other_inode_update_time(inode, orig_ino);
}

/*
 * Opportunistically update the time fields of the other inodes that share
 * an inode table block with @orig_ino.
 *
 * @buf points at the start of that block's data; every slot except the one
 * belonging to @orig_ino is handed to __ext4_update_other_inode_time() under
 * rcu_read_lock().
 */
static void ext4_update_other_inodes_time(struct super_block *sb,
                                          unsigned long orig_ino, char *buf)
{
        int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
        int inode_size = EXT4_INODE_SIZE(sb);
        unsigned long first_ino;
        int slot;

        /*
         * Inode numbers are one-based, so the first inode of a block
         * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1).
         */
        first_ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;

        rcu_read_lock();
        for (slot = 0; slot < inodes_per_block; slot++) {
                unsigned long ino = first_ino + slot;

                if (ino != orig_ino)
                        __ext4_update_other_inode_time(sb, orig_ino, ino,
                                (struct ext4_inode *)(buf + slot * inode_size));
        }
        rcu_read_unlock();
}

/*
 * Post the struct inode info into an on-disk inode location in the
 * buffer-cache.  This gobbles the caller's reference to the
 * buffer_head in the inode location struct.
 *
 * The caller must have write access to iloc->bh.
 *
 * @handle: journal handle the metadata update is filed under
 * @inode:  in-memory inode to write out
 * @iloc:   location of the raw inode; iloc->bh is released on every path
 *
 * Returns 0 on success, otherwise the error from ext4_fill_raw_inode(),
 * ext4_handle_dirty_metadata() or ext4_journal_get_write_access().
 */
static int ext4_do_update_inode(handle_t *handle,
                                struct inode *inode,
                                struct ext4_iloc *iloc)
{
        struct ext4_inode *raw_inode = ext4_raw_inode(iloc);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct buffer_head *bh = iloc->bh;
        struct super_block *sb = inode->i_sb;
        int err;
        int need_datasync = 0, set_large_file = 0;

        spin_lock(&ei->i_raw_lock);

        /*
         * For fields not tracked in the in-memory inode, initialise them
         * to zero for new inodes.
         */
        if (ext4_test_inode_state(inode, EXT4_STATE_NEW))
                memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);

        /* A size that differs from the one in the raw inode needs a datasync. */
        if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode))
                need_datasync = 1;
        /*
         * A size above 0x7fffffff requires the large_file feature; remember
         * to set it if it is missing or the filesystem is still at
         * EXT4_GOOD_OLD_REV.
         */
        if (ei->i_disksize > 0x7fffffffULL) {
                if (!ext4_has_feature_large_file(sb) ||
                    EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV))
                        set_large_file = 1;
        }

        err = ext4_fill_raw_inode(inode, raw_inode);
        spin_unlock(&ei->i_raw_lock);
        if (err) {
                /* Reported here, so skip ext4_std_error() at out_error. */
                EXT4_ERROR_INODE(inode, "corrupted inode contents");
                goto out_brelse;
        }

        /* With lazytime, piggy-back timestamp updates of block neighbours. */
        if (inode->i_sb->s_flags & SB_LAZYTIME)
                ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
                                              bh->b_data);

        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (err)
                goto out_error;
        /* The raw inode now carries the in-memory state; no longer "new". */
        ext4_clear_inode_state(inode, EXT4_STATE_NEW);
        if (set_large_file) {
                BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
                err = ext4_journal_get_write_access(handle, sb,
                                                    EXT4_SB(sb)->s_sbh,
                                                    EXT4_JTR_NONE);
                if (err)
                        goto out_error;
                /* Update the feature flag and checksum under the buffer lock. */
                lock_buffer(EXT4_SB(sb)->s_sbh);
                ext4_set_feature_large_file(sb);
                ext4_superblock_csum_set(sb);
                unlock_buffer(EXT4_SB(sb)->s_sbh);
                ext4_handle_sync(handle);
                err = ext4_handle_dirty_metadata(handle, NULL,
                                                 EXT4_SB(sb)->s_sbh);
        }
        ext4_update_inode_fsync_trans(handle, inode, need_datasync);
out_error:
        /* Also reached on success, with err == 0. */
        ext4_std_error(inode->i_sb, err);
out_brelse:
        brelse(bh);
        return err;
}

/*
 * ext4_write_inode()
 *
 * We are called from a few places:
 *
 * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files.
 *   Here, there will be no transaction running. We wait for any running
 *   transaction to commit.
 *
 * - Within flush work (sys_sync(), kupdate and such).
 *   We wait on commit, if told to.
 *
 * - Within iput_final() -> write_inode_now()
 *   We wait on commit, if told to.
 *
 * In all cases it is actually safe for us to return without doing anything,
 * because the inode has been copied into a raw inode buffer in
 * ext4_mark_inode_dirty().  This is a correctness thing for WB_SYNC_ALL
 * writeback.
 *
 * Note that we are absolutely dependent upon all inode dirtiers doing the
 * right thing: they *must* call mark_inode_dirty() after dirtying info in
 * which we are interested.
 *
 * It would be a bug for them to not do this.  The code:
 *
 *        mark_inode_dirty(inode)
 *        stuff();
 *        inode->i_size = expr;
 *
 * is in error because write_inode() could occur while `stuff()' is running,
 * and the new i_size will be lost.  Plus the inode will no longer be on the
 * superblock's dirty inode list.
 *
 * Returns 0 when nothing had to be done or the write-out succeeded, the
 * error from ext4_emergency_state(), ext4_fc_commit() or the inode location
 * lookup, or -EIO for a recursive call or a failed buffer write.
 */
int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct ext4_iloc iloc;
        int err;

        if (WARN_ON_ONCE(current->flags & PF_MEMALLOC))
                return 0;

        err = ext4_emergency_state(inode->i_sb);
        if (unlikely(err))
                return err;

        if (EXT4_SB(inode->i_sb)->s_journal) {
                if (ext4_journal_current_handle()) {
                        ext4_debug("called recursively, non-PF_MEMALLOC!\n");
                        dump_stack();
                        return -EIO;
                }

                /*
                 * No need to force transaction in WB_SYNC_NONE mode. Also
                 * ext4_sync_fs() will force the commit after everything is
                 * written.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync)
                        return 0;

                return ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal,
                                      EXT4_I(inode)->i_sync_tid);
        }

        /* No journal: work on the inode table buffer directly. */
        err = __ext4_get_inode_loc_noinmem(inode, &iloc);
        if (err)
                return err;

        /*
         * sync(2) will flush the whole buffer cache. No need to do
         * it here separately for each inode.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
                sync_dirty_buffer(iloc.bh);

        if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
                ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO,
                                       "IO error syncing inode");
                err = -EIO;
        }
        brelse(iloc.bh);
        return err;
}

/*
 * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate
 * buffers that are attached to a folio straddling i_size and are undergoing
 * commit. In that case we have to wait for commit to finish and try again.
 */
static void ext4_wait_for_tail_page_commit(struct inode *inode)
{
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        unsigned offset = inode->i_size & (PAGE_SIZE - 1);

        /*
         * A fully truncated folio (offset == 0) needs no waiting, and we
         * must not even try: __ext4_journalled_invalidate_folio() may strip
         * all buffers from the folio while leaving it dirty, which can
         * confuse e.g. a concurrent ext4_writepages() that finds a dirty
         * folio without buffers.
         */
        if (!offset)
                return;
        /*
         * Nothing to wait for either when every buffer of the folio stays
         * valid; this is the common blocksize == PAGESIZE case.
         */
        if (offset > (PAGE_SIZE - i_blocksize(inode)))
                return;

        for (;;) {
                struct folio *folio;
                tid_t commit_tid = 0;
                bool must_wait = false;
                int ret;

                folio = filemap_lock_folio(inode->i_mapping,
                                           inode->i_size >> PAGE_SHIFT);
                if (IS_ERR(folio))
                        break;

                /* Try to invalidate everything past the new EOF. */
                ret = __ext4_journalled_invalidate_folio(folio, offset,
                                                folio_size(folio) - offset);
                folio_unlock(folio);
                folio_put(folio);

                /* Anything other than -EBUSY ends the retry loop. */
                if (ret != -EBUSY)
                        break;

                /*
                 * Sample the committing transaction's tid under
                 * j_state_lock, then wait for it outside the lock before
                 * trying the invalidation again.
                 */
                read_lock(&journal->j_state_lock);
                if (journal->j_committing_transaction) {
                        commit_tid = journal->j_committing_transaction->t_tid;
                        must_wait = true;
                }
                read_unlock(&journal->j_state_lock);

                if (must_wait)
                        jbd2_log_wait_commit(journal, commit_tid);
        }
}

/*
 * ext4_setattr()
 *
 * Called from notify_change.
 *
 * We want to trap VFS attempts to truncate the file as soon as
 * possible.  In particular, we want to make sure that when the VFS
 * shrinks i_size, we put the inode on the orphan list and modify
 * i_disksize immediately, so that during the subsequent flushing of
 * dirty pages and freeing of disk blocks, we can guarantee that any
 * commit will leave the blocks being flushed in an unused state on
 * disk.  (On recovery, the inode will get truncated and the blocks will
 * be freed, so we have a strong guarantee that no future commit will
 * leave these blocks visible to the user.)
 *
 * Another thing we have to assure is that if we are in ordered mode
 * and inode is still attached to the committing transaction, we must
 * start writeout of all the dirty pages which are being truncated.
 * This way we are sure that all the data written in the previous
 * transaction are already on disk (truncate waits for pages under
 * writeback).
 *
 * Called with inode->i_rwsem down.
 */
int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                 struct iattr *attr)
{
        struct inode *inode = d_inode(dentry);
        /*
         * error holds the first failure to report; rc holds a secondary
         * status that is returned only when error is still 0 at the end.
         */
        int error, rc = 0;
        /* Set once ext4_orphan_add() has been attempted for a shrink. */
        int orphan = 0;
        const unsigned int ia_valid = attr->ia_valid;
        /* Cleared below when ATTR_SIZE requests the size we already have. */
        bool inc_ivers = true;

        /* Propagate whatever error ext4_emergency_state() reports. */
        error = ext4_emergency_state(inode->i_sb);
        if (unlikely(error))
                return error;

        /* Immutable inodes reject every attribute change. */
        if (unlikely(IS_IMMUTABLE(inode)))
                return -EPERM;

        /*
         * Append-only inodes reject mode, owner, group and explicit
         * timestamp changes.
         */
        if (unlikely(IS_APPEND(inode) &&
                     (ia_valid & (ATTR_MODE | ATTR_UID |
                                  ATTR_GID | ATTR_TIMES_SET))))
                return -EPERM;

        /*
         * Generic, fscrypt and fsverity pre-checks. Failures here return
         * directly and therefore bypass the ext4_std_error() call at
         * err_out.
         */
        error = setattr_prepare(idmap, dentry, attr);
        if (error)
                return error;

        error = fscrypt_prepare_setattr(dentry, attr);
        if (error)
                return error;

        error = fsverity_prepare_setattr(dentry, attr);
        if (error)
                return error;

        if (is_quota_modification(idmap, inode, attr)) {
                error = dquot_initialize(inode);
                if (error)
                        return error;
        }

        /*
         * Ownership change: transfer the quota and update the in-core
         * uid/gid under a single journal handle.
         */
        if (i_uid_needs_update(idmap, attr, inode) ||
            i_gid_needs_update(idmap, attr, inode)) {
                handle_t *handle;

                /* (user+group)*(old+new) structure, inode write (sb,
                 * inode block, ? - but truncate inode update has it) */
                handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
                        (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) +
                         EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3);
                if (IS_ERR(handle)) {
                        error = PTR_ERR(handle);
                        goto err_out;
                }

                /* dquot_transfer() calls back ext4_get_inode_usage() which
                 * counts xattr inode references.
                 */
                down_read(&EXT4_I(inode)->xattr_sem);
                error = dquot_transfer(idmap, inode, attr);
                up_read(&EXT4_I(inode)->xattr_sem);

                if (error) {
                        ext4_journal_stop(handle);
                        return error;
                }
                /* Update corresponding info in inode so that everything is in
                 * one transaction */
                i_uid_update(idmap, attr, inode);
                i_gid_update(idmap, attr, inode);
                error = ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
                if (unlikely(error)) {
                        return error;
                }
        }

        if (attr->ia_valid & ATTR_SIZE) {
                handle_t *handle;
                /* Size before this call; i_size itself may change below. */
                loff_t oldsize = inode->i_size;
                /* Saved so i_disksize can be rolled back on failure. */
                loff_t old_disksize;
                int shrink = (attr->ia_size < inode->i_size);

                /*
                 * Inodes without the EXTENTS flag are limited to
                 * s_bitmap_maxbytes.
                 */
                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
                                return -EFBIG;
                        }
                }
                /* Only regular files may have their size changed here. */
                if (!S_ISREG(inode->i_mode)) {
                        return -EINVAL;
                }

                /* Same size requested: do not bump i_version at the end. */
                if (attr->ia_size == inode->i_size)
                        inc_ivers = false;

                if (shrink) {
                        if (ext4_should_order_data(inode)) {
                                error = ext4_begin_ordered_truncate(inode,
                                                            attr->ia_size);
                                if (error)
                                        goto err_out;
                        }
                        /*
                         * Blocks are going to be removed from the inode. Wait
                         * for dio in flight.
                         */
                        inode_dio_wait(inode);
                }

                /*
                 * Held until out_mmap_sem, except on the
                 * ext4_break_layouts() failure path which drops it itself.
                 */
                filemap_invalidate_lock(inode->i_mapping);

                /*
                 * error is 0 at this point, so on failure err_out ends up
                 * returning rc.
                 */
                rc = ext4_break_layouts(inode);
                if (rc) {
                        filemap_invalidate_unlock(inode->i_mapping);
                        goto err_out;
                }

                if (attr->ia_size != inode->i_size) {
                        /* attach jbd2 jinode for EOF folio tail zeroing */
                        if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
                            oldsize & (inode->i_sb->s_blocksize - 1)) {
                                error = ext4_inode_attach_jinode(inode);
                                if (error)
                                        goto out_mmap_sem;
                        }

                        handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
                        if (IS_ERR(handle)) {
                                error = PTR_ERR(handle);
                                goto out_mmap_sem;
                        }
                        /*
                         * A failure of ext4_orphan_add() is kept in error;
                         * the i_disksize update below is still attempted but
                         * is rolled back because error is non-zero.
                         */
                        if (ext4_handle_valid(handle) && shrink) {
                                error = ext4_orphan_add(handle, inode);
                                orphan = 1;
                        }
                        /*
                         * Update c/mtime and tail zero the EOF folio on
                         * truncate up. ext4_truncate() handles the shrink case
                         * below.
                         */
                        if (!shrink) {
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));
                                if (oldsize & (inode->i_sb->s_blocksize - 1))
                                        ext4_block_truncate_page(handle,
                                                        inode->i_mapping, oldsize);
                        }

                        /*
                         * Fast-commit range tracking. Shrink: from the block
                         * holding the last byte of the new size up to
                         * EXT_MAX_BLOCKS - 1. Grow: from the block holding
                         * the last byte of the old size to the block holding
                         * the last byte of the new size.
                         */
                        if (shrink)
                                ext4_fc_track_range(handle, inode,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits,
                                        EXT_MAX_BLOCKS - 1);
                        else
                                ext4_fc_track_range(
                                        handle, inode,
                                        (oldsize > 0 ? oldsize - 1 : oldsize) >>
                                        inode->i_sb->s_blocksize_bits,
                                        (attr->ia_size > 0 ? attr->ia_size - 1 : 0) >>
                                        inode->i_sb->s_blocksize_bits);

                        down_write(&EXT4_I(inode)->i_data_sem);
                        old_disksize = EXT4_I(inode)->i_disksize;
                        EXT4_I(inode)->i_disksize = attr->ia_size;
                        rc = ext4_mark_inode_dirty(handle, inode);
                        /* Keep an earlier orphan-add error if there was one. */
                        if (!error)
                                error = rc;
                        /*
                         * We have to update i_size under i_data_sem together
                         * with i_disksize to avoid races with writeback code
                         * running ext4_wb_update_i_disksize().
                         */
                        if (!error)
                                i_size_write(inode, attr->ia_size);
                        else
                                EXT4_I(inode)->i_disksize = old_disksize;
                        up_write(&EXT4_I(inode)->i_data_sem);
                        ext4_journal_stop(handle);
                        if (error)
                                goto out_mmap_sem;
                        if (!shrink) {
                                pagecache_isize_extended(inode, oldsize,
                                                         inode->i_size);
                        } else if (ext4_should_journal_data(inode)) {
                                ext4_wait_for_tail_page_commit(inode);
                        }
                }

                /*
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
                 */
                truncate_pagecache(inode, inode->i_size);
                /*
                 * Call ext4_truncate() even if i_size didn't change to
                 * truncate possible preallocated blocks.
                 */
                if (attr->ia_size <= oldsize) {
                        rc = ext4_truncate(inode);
                        if (rc)
                                error = rc;
                }
out_mmap_sem:
                /* Drop the invalidate lock taken before ext4_break_layouts(). */
                filemap_invalidate_unlock(inode->i_mapping);
        }

        /* Only on full success are the remaining attributes copied over. */
        if (!error) {
                if (inc_ivers)
                        inode_inc_iversion(inode);
                setattr_copy(idmap, inode, attr);
                mark_inode_dirty(inode);
        }

        /*
         * If the call to ext4_truncate failed to get a transaction handle at
         * all, we need to clean up the in-core orphan list manually.
         */
        if (orphan && inode->i_nlink)
                ext4_orphan_del(NULL, inode);

        /* A mode change also updates the ACL; its status lands in rc. */
        if (!error && (ia_valid & ATTR_MODE))
                rc = posix_acl_chmod(idmap, dentry, inode->i_mode);

err_out:
        /* Report a primary error; otherwise fall back to the status in rc. */
        if  (error)
                ext4_std_error(inode->i_sb, error);
        if (!error)
                error = rc;
        return error;
}

/*
 * Return the direct I/O alignment for @inode: 0 when one of the checks below
 * rules the file out, 1 when the iomap defaults apply, and the inode's block
 * size for encrypted files on which fscrypt supports DIO.
 */
u32 ext4_dio_alignment(struct inode *inode)
{
        /* verity, data-journalled and inline-data files all yield 0 */
        if (fsverity_active(inode) ||
            ext4_should_journal_data(inode) ||
            ext4_has_inline_data(inode))
                return 0;

        /* unencrypted files use the iomap defaults */
        if (!IS_ENCRYPTED(inode))
                return 1;

        return fscrypt_dio_supported(inode) ? i_blocksize(inode) : 0;
}

/*
 * Fill @stat for the inode behind @path: the optional statx fields selected
 * by @request_mask (birth time, DIO alignment, atomic write limits), the
 * user-visible inode flags as statx attributes, and finally the generic
 * attributes. Always returns 0.
 */
int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
                 struct kstat *stat, u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_inode *raw_inode;
        unsigned int flags;

        /* Creation time is reported only if i_crtime fits in the inode. */
        if (request_mask & STATX_BTIME) {
                if (EXT4_FITS_IN_INODE(raw_inode, ei, i_crtime)) {
                        stat->result_mask |= STATX_BTIME;
                        stat->btime.tv_sec = ei->i_crtime.tv_sec;
                        stat->btime.tv_nsec = ei->i_crtime.tv_nsec;
                }
        }

        /*
         * DIO alignment restrictions are returned only on request: for
         * encrypted files they may be expensive to obtain if the file was
         * not opened recently.
         */
        if ((request_mask & STATX_DIOALIGN) && S_ISREG(inode->i_mode)) {
                u32 dio_align = ext4_dio_alignment(inode);

                stat->result_mask |= STATX_DIOALIGN;
                if (dio_align != 1) {
                        stat->dio_mem_align = dio_align;
                        stat->dio_offset_align = dio_align;
                } else {
                        struct block_device *bdev = inode->i_sb->s_bdev;

                        /* iomap defaults */
                        stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
                        stat->dio_offset_align = bdev_logical_block_size(bdev);
                }
        }

        /* Atomic write unit limits; both are 0 if the inode cannot do them. */
        if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                unsigned int awu_min, awu_max;

                if (ext4_inode_can_atomic_write(inode)) {
                        awu_min = sbi->s_awu_min;
                        awu_max = sbi->s_awu_max;
                } else {
                        awu_min = 0;
                        awu_max = 0;
                }

                generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
        }

        /* Translate the user-visible inode flags into statx attributes. */
        flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
        stat->attributes |= (flags & EXT4_APPEND_FL) ?
                                STATX_ATTR_APPEND : 0;
        stat->attributes |= (flags & EXT4_COMPR_FL) ?
                                STATX_ATTR_COMPRESSED : 0;
        stat->attributes |= (flags & EXT4_ENCRYPT_FL) ?
                                STATX_ATTR_ENCRYPTED : 0;
        stat->attributes |= (flags & EXT4_IMMUTABLE_FL) ?
                                STATX_ATTR_IMMUTABLE : 0;
        stat->attributes |= (flags & EXT4_NODUMP_FL) ?
                                STATX_ATTR_NODUMP : 0;
        stat->attributes |= (flags & EXT4_VERITY_FL) ?
                                STATX_ATTR_VERITY : 0;

        stat->attributes_mask |= (STATX_ATTR_APPEND |
                                  STATX_ATTR_COMPRESSED |
                                  STATX_ATTR_ENCRYPTED |
                                  STATX_ATTR_IMMUTABLE |
                                  STATX_ATTR_NODUMP |
                                  STATX_ATTR_VERITY);

        generic_fillattr(idmap, request_mask, inode, stat);
        return 0;
}

/*
 * getattr for files: ext4_getattr() plus adjustments to stat->blocks for
 * inline data and for delayed-allocation reservations. Always returns 0.
 */
int ext4_file_getattr(struct mnt_idmap *idmap,
                      const struct path *path, struct kstat *stat,
                      u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        /* shift converting filesystem blocks into 512-byte sectors */
        int sector_shift = inode->i_sb->s_blocksize_bits - 9;
        u64 delalloc_blocks;

        ext4_getattr(idmap, path, stat, request_mask, query_flags);

        /*
         * An inode with inline data normally has no data blocks allocated
         * (it may have an external xattr block). Report the size rounded up
         * to whole sectors so tools like tar or rsync do not conclude that
         * the file is completely sparse.
         */
        if (unlikely(ext4_has_inline_data(inode)))
                stat->blocks += (stat->size + 511) >> 9;

        /*
         * i_blocks is only updated together with the real allocation;
         * otherwise a crash before delayed blocks are allocated would leave
         * it inconsistent with the on-disk blocks. To avoid confusing
         * users, stat additionally reports the blocks reserved for delayed
         * allocation.
         */
        delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
                                   EXT4_I(inode)->i_reserved_data_blocks);
        stat->blocks += delalloc_blocks << sector_shift;
        return 0;
}

/*
 * Number of index blocks to account for: inodes with the EXTENTS flag use
 * the extent helper (driven by @pextents), all others the indirect-block
 * helper (driven by @lblocks).
 */
static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
                                   int pextents)
{
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_index_trans_blocks(inode, pextents);

        return ext4_ind_trans_blocks(inode, lblocks);
}

/*
 * Account for index blocks, block group bitmaps and block group
 * descriptor blocks if we modify data blocks and index blocks. In the
 * worst case, the index blocks are spread over different block groups.
 *
 * If data blocks are discontiguous, they may be spread over different
 * block groups too. If they are contiguous, with flexbg, they could
 * still cross a block group boundary.
 *
 * Also account for superblock, inode, quota and xattr blocks
 */
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
                                  int pextents)
{
        ext4_group_t ngroups = ext4_get_groups_count(inode->i_sb);
        ext4_group_t groups;
        int idxblocks, gdpblocks, ret;

        /*
         * Index blocks touched when mapping @lblocks logical blocks onto
         * @pextents physical extents.
         */
        idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);

        /*
         * Every index block and every extent may live in a different
         * group. The bitmap count is capped by the number of groups; the
         * descriptor block count is replaced by s_gdb_count when the
         * (already capped) group count exceeds it.
         */
        groups = idxblocks + pextents;
        gdpblocks = groups;
        if (groups > ngroups)
                groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;

        /* index blocks + bitmaps + group descriptor blocks */
        ret = idxblocks;
        ret += groups + gdpblocks;

        /* plus super block, inode, quota and xattr blocks */
        ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);

        return ret;
}

/*
 * Calculate the total number of credits to reserve so that
 * the modification of a single page fits into a single transaction,
 * which may include multiple chunks of block allocations.
 *
 * This could be called via ext4_write_begin().
 *
 * We need to consider the worst case, in which every extent
 * holds just one new block.
 */
int ext4_writepage_trans_blocks(struct inode *inode)
{
        int bpp = ext4_journal_blocks_per_page(inode);
        /* metadata credits for bpp blocks mapped as bpp separate extents */
        int credits = ext4_meta_trans_blocks(inode, bpp, bpp);

        if (!ext4_should_journal_data(inode))
                return credits;

        /* journalled data mode: the data blocks need credits as well */
        return credits + bpp;
}

/*
 * Calculate the journal credits for a chunk of data modification.
 *
 * This is called from DIO, fallocate or whoever calling
 * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks.
 *
 * Journal buffers for data blocks are not included here, as DIO
 * and fallocate do not need to journal data buffers.
 */
int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
{
        /* the chunk is contiguous, so it counts as one physical extent */
        int credits = ext4_meta_trans_blocks(inode, nrblocks, 1);

        return credits;
}

/*
 * The caller must have previously called ext4_reserve_inode_write().
 * Given this, we know that the caller already has write access to iloc->bh.
 */
int ext4_mark_iloc_dirty(handle_t *handle,
                         struct inode *inode, struct ext4_iloc *iloc)
{
        int err;

        /* On this failure only the final put_bh() below is performed. */
        err = ext4_emergency_state(inode->i_sb);
        if (unlikely(err))
                goto out_put;

        ext4_fc_track_inode(handle, inode);

        /* ext4_do_update_inode() consumes one bh->b_count */
        get_bh(iloc->bh);

        /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */
        err = ext4_do_update_inode(handle, inode, iloc);

out_put:
        put_bh(iloc->bh);
        return err;
}

/*
 * On success, We end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */

int
ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                         struct ext4_iloc *iloc)
{
        int err = ext4_emergency_state(inode->i_sb);

        /* This early failure is returned without ext4_std_error(). */
        if (unlikely(err))
                return err;

        err = ext4_get_inode_loc(inode, iloc);
        if (err)
                goto out;

        BUFFER_TRACE(iloc->bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, inode->i_sb,
                                            iloc->bh, EXT4_JTR_NONE);
        if (!err)
                goto out;

        /* No write access: release the buffer and clear the pointer. */
        brelse(iloc->bh);
        iloc->bh = NULL;
out:
        ext4_std_error(inode->i_sb, err);
        return err;
}

/*
 * Grow the in-inode extra area of @inode to @new_extra_isize bytes, working
 * on the raw inode found through @iloc. Returns 0 or a negative errno;
 * *@no_expand is set to 1 when the expansion with xattrs present fails.
 */
static int __ext4_expand_extra_isize(struct inode *inode,
                                     unsigned int new_extra_isize,
                                     struct ext4_iloc *iloc,
                                     handle_t *handle, int *no_expand)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
        struct ext4_xattr_ibody_header *header;
        struct ext4_inode *raw_inode;
        int error;

        /* Already validated at iget time; re-checked here defensively. */
        if ((ei->i_extra_isize & 3) ||
            (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size)) {
                EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
                                 ei->i_extra_isize,
                                 EXT4_INODE_SIZE(inode->i_sb));
                return -EFSCORRUPTED;
        }

        /* Should never happen: shrinking, too small, or beyond the inode. */
        if (new_extra_isize < ei->i_extra_isize ||
            new_extra_isize < 4 ||
            new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE)
                return -EINVAL;

        raw_inode = ext4_raw_inode(iloc);
        header = IHDR(inode, raw_inode);

        if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) &&
            header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
                /*
                 * An external xattr block may have to be allocated, which
                 * needs initialized quotas. We may be called with various
                 * locks held and cannot afford to initialize quotas here,
                 * so just bail.
                 */
                if (dquot_initialize_needed(inode))
                        return -EAGAIN;

                /* try to expand with EAs present */
                error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
                                                   raw_inode, handle);
                /* Inode size expansion failed; don't try again */
                if (error)
                        *no_expand = 1;

                return error;
        }

        /* No extended attributes present: zero the newly covered bytes. */
        memset((void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE +
               ei->i_extra_isize, 0,
               new_extra_isize - ei->i_extra_isize);
        ei->i_extra_isize = new_extra_isize;
        return 0;
}

/*
 * Expand an inode by new_extra_isize bytes.
 * Returns 0 on success or negative error number on failure.
 */
static int ext4_try_to_expand_extra_isize(struct inode *inode,
                                          unsigned int new_extra_isize,
                                          struct ext4_iloc iloc,
                                          handle_t *handle)
{
        int no_expand;
        int ret;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND))
                return -EOVERFLOW;

        /*
         * Without a journal the expansion can be attempted right away.
         * With a journal, extra buffer credits are needed first because
         * the EA block may be written with this same handle. Failing to
         * extend the handle only costs a minor loss of functionality for
         * this inode; if that is felt to be critical, e2fsck should be run
         * to force a large enough s_min_extra_isize.
         */
        if (ext4_journal_extend(handle,
                                EXT4_DATA_TRANS_BLOCKS(inode->i_sb), 0))
                return -ENOSPC;

        /* Only a trylock is used; report -EBUSY if it is not obtained. */
        if (!ext4_write_trylock_xattr(inode, &no_expand))
                return -EBUSY;

        ret = __ext4_expand_extra_isize(inode, new_extra_isize, &iloc,
                                        handle, &no_expand);
        ext4_write_unlock_xattr(inode, &no_expand);

        return ret;
}

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc)
{
        handle_t *handle;
        int no_expand;
        int error, rc;

        if (ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) {
                brelse(iloc->bh);
                return -EOVERFLOW;
        }

        handle = ext4_journal_start(inode, EXT4_HT_INODE,
                                    EXT4_DATA_TRANS_BLOCKS(inode->i_sb));
        if (IS_ERR(handle)) {
                error = PTR_ERR(handle);
                brelse(iloc->bh);
                return error;
        }

        ext4_write_lock_xattr(inode, &no_expand);

        BUFFER_TRACE(iloc->bh, "get_write_access");
        error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh,
                                              EXT4_JTR_NONE);
        if (error) {
                brelse(iloc->bh);
                goto out_unlock;
        }

        error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
                                          handle, &no_expand);

        rc = ext4_mark_iloc_dirty(handle, inode, iloc);
        if (!error)
                error = rc;

out_unlock:
        ext4_write_unlock_xattr(inode, &no_expand);
        ext4_journal_stop(handle);
        return error;
}

/*
 * What we do here is to mark the in-core inode as clean with respect to inode
 * dirtiness (it may still be data-dirty).
 * This means that the in-core inode may be reaped by prune_icache
 * without having to perform any I/O.  This is a very good thing,
 * because *any* task may call prune_icache - even ones which
 * have a transaction open against a different journal.
 *
 * Is this cheating?  Not really.  Sure, we haven't written the
 * inode out, but prune_icache isn't a user-visible syncing function.
 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
 * we start and wait on commits.
 */
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_iloc iloc;
        int err;

        might_sleep();
        trace_ext4_mark_inode_dirty(inode, _RET_IP_);

        err = ext4_reserve_inode_write(handle, inode, &iloc);
        if (!err) {
                /*
                 * Try to grow the extra inode area to the wanted size; the
                 * result of that attempt is deliberately not checked.
                 */
                if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize)
                        ext4_try_to_expand_extra_isize(inode,
                                                       sbi->s_want_extra_isize,
                                                       iloc, handle);

                err = ext4_mark_iloc_dirty(handle, inode, &iloc);
        }

        /* Report the failure against the caller's function and line. */
        if (unlikely(err))
                ext4_error_inode_err(inode, func, line, 0, err,
                                        "mark_inode_dirty error");
        return err;
}

/*
 * ext4_dirty_inode() is called from __mark_inode_dirty()
 *
 * We're really interested in the case where a file is being extended.
 * i_size has been changed by generic_commit_write() and we thus need
 * to include the updated inode in the current transaction.
 *
 * Also, dquot_alloc_block() will always dirty the inode when blocks
 * are allocated to the file.
 *
 * If the inode is marked synchronous, we don't honour that here - doing
 * so would cause a commit on atime updates, which we don't bother doing.
 * We handle synchronous inodes at the highest possible level.
 */
void ext4_dirty_inode(struct inode *inode, int flags)
{
        handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);

        /* Nothing is done if no handle could be started. */
        if (!IS_ERR(handle)) {
                ext4_mark_inode_dirty(handle, inode);
                ext4_journal_stop(handle);
        }
}

/*
 * Set (@val != 0) or clear (@val == 0) the EXT4_INODE_JOURNAL_DATA flag of
 * @inode, switch its address space operations to match, and mark the inode
 * dirty in a synchronous handle.
 *
 * Returns 0 when the inode has no journal or on success, -EROFS when the
 * journal is aborted, or the error from flushing data, flushing the journal,
 * starting the handle or marking the inode dirty.
 */
int ext4_change_inode_journal_flag(struct inode *inode, int val)
{
        journal_t *journal;
        handle_t *handle;
        int err;
        int alloc_ctx;

        /*
         * We have to be very careful here: changing a data block's
         * journaling status dynamically is dangerous.  If we write a
         * data block to the journal, change the status and then delete
         * that block, we risk forgetting to revoke the old log record
         * from the journal and so a subsequent replay can corrupt data.
         * So, first we make sure that the journal is empty and that
         * nobody is changing anything.
         */

        journal = EXT4_JOURNAL(inode);
        if (!journal)
                return 0;
        if (is_journal_aborted(journal))
                return -EROFS;

        /* Wait for all existing dio workers */
        inode_dio_wait(inode);

        /*
         * Before flushing the journal and switching inode's aops, we have
         * to flush all dirty data the inode has. There can be outstanding
         * delayed allocations, there can be unwritten extents created by
         * fallocate or buffered writes in dioread_nolock mode covered by
         * dirty data which can be converted only after flushing the dirty
         * data (and journalled aops don't know how to handle these cases).
         */
        if (val) {
                /*
                 * The invalidate lock is taken only when enabling data
                 * journalling; it is dropped again after the journal
                 * updates are unlocked below.
                 */
                filemap_invalidate_lock(inode->i_mapping);
                err = filemap_write_and_wait(inode->i_mapping);
                if (err < 0) {
                        filemap_invalidate_unlock(inode->i_mapping);
                        return err;
                }
        }

        /* alloc_ctx is handed back to ext4_writepages_up_write() later. */
        alloc_ctx = ext4_writepages_down_write(inode->i_sb);
        jbd2_journal_lock_updates(journal);

        /*
         * OK, there are no updates running now, and all cached data is
         * synced to disk.  We are now in a completely consistent state
         * which doesn't have anything in the journal, and we know that
         * no filesystem updates are running, so it is safe to modify
         * the inode's in-core data-journaling state flag now.
         */

        if (val)
                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        else {
                /*
                 * Clearing the flag: flush the journal first. This branch
                 * runs only with val == 0, so no invalidate lock is held
                 * and the error path only undoes the two locks above.
                 */
                err = jbd2_journal_flush(journal, 0);
                if (err < 0) {
                        jbd2_journal_unlock_updates(journal);
                        ext4_writepages_up_write(inode->i_sb, alloc_ctx);
                        return err;
                }
                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        }
        /* Pick the aops matching the new flag while updates are locked. */
        ext4_set_aops(inode);

        /* Release in reverse order of acquisition. */
        jbd2_journal_unlock_updates(journal);
        ext4_writepages_up_write(inode->i_sb, alloc_ctx);

        if (val)
                filemap_invalidate_unlock(inode->i_mapping);

        /* Finally we can mark the inode as dirty. */

        handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
        if (IS_ERR(handle))
                return PTR_ERR(handle);

        /* The flag change is recorded as a fast-commit ineligible reason. */
        ext4_fc_mark_ineligible(inode->i_sb,
                EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
        err = ext4_mark_inode_dirty(handle, inode);
        ext4_handle_sync(handle);
        ext4_journal_stop(handle);
        ext4_std_error(inode->i_sb, err);

        return err;
}

/*
 * Buffer-walk callback used by ext4_page_mkwrite(): reports whether @bh has
 * no mapping.  @handle and @inode are not used here.
 *
 * Returns 1 when the buffer is unmapped, 0 when it is mapped.
 */
static int ext4_bh_unmapped(handle_t *handle, struct inode *inode,
                            struct buffer_head *bh)
{
        if (buffer_mapped(bh))
                return 0;
        return 1;
}

/*
 * ext4_page_mkwrite - prepare a folio of a file mapping for being written to.
 *
 * @vmf: fault description; the folio is taken from vmf->page and the file
 *       from vmf->vma->vm_file.
 *
 * Makes sure the buffers covering the in-file part of the folio are mapped,
 * using one of three paths:
 *   - delalloc: block_page_mkwrite() with ext4_da_get_block_prep, no handle;
 *   - ordinary: block_page_mkwrite() under a journal handle, skipped
 *     entirely when all buffers are already mapped;
 *   - data journalling: ext4_block_write_begin() followed by
 *     ext4_journal_folio_buffers() under a journal handle.
 * Allocation is retried on -ENOSPC for as long as ext4_should_retry_alloc()
 * agrees.
 *
 * Returns VM_FAULT_SIGBUS for an immutable inode, when the journal handle
 * cannot be started or when ext4_journal_folio_buffers() fails;
 * VM_FAULT_NOPAGE when the folio no longer belongs to the mapping or starts
 * beyond i_size; VM_FAULT_LOCKED when every buffer was already mapped; and
 * vmf_fs_error(err) in all remaining cases.
 */
vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio = page_folio(vmf->page);
        loff_t size;
        unsigned long len;
        int err;
        vm_fault_t ret;
        struct file *file = vma->vm_file;
        struct inode *inode = file_inode(file);
        struct address_space *mapping = inode->i_mapping;
        handle_t *handle;
        get_block_t *get_block;
        int retries = 0;

        /* Checked before anything is acquired, so a plain return is enough. */
        if (unlikely(IS_IMMUTABLE(inode)))
                return VM_FAULT_SIGBUS;

        /* Paired with sb_end_pagefault() at the "out" label. */
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);

        /* Held shared until the "out" label. */
        filemap_invalidate_lock_shared(mapping);

        err = ext4_convert_inline_data(inode);
        if (err)
                goto out_ret;

        /*
         * On data journalling we skip straight to the transaction handle:
         * there's no delalloc; page truncated will be checked later; the
         * early return w/ all buffers mapped (calculates size/len) can't
         * be used; and there's no dioread_nolock, so only ext4_get_block.
         */
        if (ext4_should_journal_data(inode))
                goto retry_alloc;

        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_nonda_switch(inode->i_sb)) {
                /* No journal handle here; just retry while ENOSPC is retryable. */
                do {
                        err = block_page_mkwrite(vma, vmf,
                                                   ext4_da_get_block_prep);
                } while (err == -ENOSPC &&
                       ext4_should_retry_alloc(inode->i_sb, &retries));
                goto out_ret;
        }

        folio_lock(folio);
        size = i_size_read(inode);
        /* Page got truncated from under us? */
        if (folio->mapping != mapping || folio_pos(folio) > size) {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* Only the part of the folio that lies below i_size is examined. */
        len = folio_size(folio);
        if (folio_pos(folio) + len > size)
                len = size - folio_pos(folio);
        /*
         * Return if we have all the buffers mapped. This avoids the need to do
         * journal_start/journal_stop which can block and take a long time
         *
         * This cannot be done for data journalling, as we have to add the
         * inode to the transaction's list to writeprotect pages on commit.
         */
        if (folio_buffers(folio)) {
                /* A zero result means ext4_bh_unmapped() found no unmapped buffer. */
                if (!ext4_walk_page_buffers(NULL, inode, folio_buffers(folio),
                                            0, len, NULL,
                                            ext4_bh_unmapped)) {
                        /* Wait so that we don't change page under IO */
                        folio_wait_stable(folio);
                        /* The folio is intentionally left locked on this exit. */
                        ret = VM_FAULT_LOCKED;
                        goto out;
                }
        }
        folio_unlock(folio);
        /* OK, we need to fill the hole... */
        if (ext4_should_dioread_nolock(inode))
                get_block = ext4_get_block_unwritten;
        else
                get_block = ext4_get_block;
retry_alloc:
        /* A fresh handle is started on every (re)try and stopped below. */
        handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
                                    ext4_writepage_trans_blocks(inode));
        if (IS_ERR(handle)) {
                ret = VM_FAULT_SIGBUS;
                goto out;
        }
        /*
         * Data journalling can't use block_page_mkwrite() because it
         * will set_buffer_dirty() before do_journal_get_write_access()
         * thus might hit warning messages for dirty metadata buffers.
         */
        if (!ext4_should_journal_data(inode)) {
                err = block_page_mkwrite(vma, vmf, get_block);
        } else {
                folio_lock(folio);
                size = i_size_read(inode);
                /* Page got truncated from under us? */
                if (folio->mapping != mapping || folio_pos(folio) > size) {
                        ret = VM_FAULT_NOPAGE;
                        goto out_error;
                }

                len = folio_size(folio);
                if (folio_pos(folio) + len > size)
                        len = size - folio_pos(folio);

                err = ext4_block_write_begin(handle, folio, 0, len,
                                             ext4_get_block);
                if (!err) {
                        /* ret is only used if journalling the buffers fails. */
                        ret = VM_FAULT_SIGBUS;
                        if (ext4_journal_folio_buffers(handle, folio, len))
                                goto out_error;
                        /*
                         * Success: the folio stays locked here.
                         * NOTE(review): presumably vmf_fs_error(0) yields
                         * VM_FAULT_LOCKED to match - confirm.
                         */
                } else {
                        /* Mapping failed: drop the folio lock before a possible retry. */
                        folio_unlock(folio);
                }
        }
        ext4_journal_stop(handle);
        if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry_alloc;
out_ret:
        /* Convert the errno (0 included) into a vm_fault_t code. */
        ret = vmf_fs_error(err);
out:
        filemap_invalidate_unlock_shared(mapping);
        sb_end_pagefault(inode->i_sb);
        return ret;
out_error:
        /*
         * Data-journalling failure exit: the folio is still locked and the
         * handle still running, so release both; ret was set by the caller.
         */
        folio_unlock(folio);
        ext4_journal_stop(handle);
        goto out;
}































































































































































































































































































































































































































































































































































































































































































































































































    1 














    1 




















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/condition.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/* List of "struct tomoyo_condition". */
LIST_HEAD(tomoyo_condition_list);

/**
 * tomoyo_argv - Check argv[] in "struct linux_binprm".
 *
 * @index:   Position of @arg_ptr in the argument vector being scanned.
 * @arg_ptr: String found at argv[@index].
 * @argc:    Number of elements in @argv.
 * @argv:    Array of "struct tomoyo_argv" conditions.
 * @checked: One flag per condition; set to 1 for every condition whose
 *           index equals @index.
 *
 * Returns true if every condition registered for @index is satisfied
 * (or none is registered), false otherwise.
 */
static bool tomoyo_argv(const unsigned int index, const char *arg_ptr,
                        const int argc, const struct tomoyo_argv *argv,
                        u8 *checked)
{
        struct tomoyo_path_info arg;
        int n;

        arg.name = arg_ptr;
        for (n = 0; n < argc; n++) {
                const struct tomoyo_argv *cond = &argv[n];
                bool matched;

                if (cond->index != index)
                        continue;
                checked[n] = 1;
                tomoyo_fill_path_info(&arg);
                matched = tomoyo_path_matches_pattern(&arg, cond->value);
                /* "!=" conditions fail on a match, "=" conditions on a miss. */
                if (cond->is_not ? matched : !matched)
                        return false;
        }
        return true;
}

/**
 * tomoyo_envp - Check envp[] in "struct linux_binprm".
 *
 * @env_name:  Name of the environment variable being examined.
 * @env_value: Value of that environment variable.
 * @envc:      Number of elements in @envp.
 * @envp:      Array of "struct tomoyo_envp" conditions.
 * @checked:   One flag per condition; set to 1 for every condition whose
 *             name pattern matches @env_name.
 *
 * Returns true if every condition matching @env_name is satisfied
 * (or none matches), false otherwise.
 */
static bool tomoyo_envp(const char *env_name, const char *env_value,
                        const int envc, const struct tomoyo_envp *envp,
                        u8 *checked)
{
        struct tomoyo_path_info name;
        struct tomoyo_path_info value;
        int n;

        name.name = env_name;
        tomoyo_fill_path_info(&name);
        value.name = env_value;
        tomoyo_fill_path_info(&value);
        for (n = 0; n < envc; n++) {
                const struct tomoyo_envp *cond = &envp[n];
                bool ok;

                if (!tomoyo_path_matches_pattern(&name, cond->name))
                        continue;
                checked[n] = 1;
                if (!cond->value) {
                        /*
                         * The condition has no value but the variable is
                         * defined, so only the negated form is satisfied.
                         */
                        ok = cond->is_not;
                } else {
                        ok = tomoyo_path_matches_pattern(&value, cond->value);
                        if (cond->is_not)
                                ok = !ok;
                }
                if (!ok)
                        return false;
        }
        return true;
}

/**
 * tomoyo_scan_bprm - Scan "struct linux_binprm".
 *
 * @ee:   Pointer to "struct tomoyo_execve".
 * @argc: Length of @argv.
 * @argv: Pointer to "struct tomoyo_argv".
 * @envc: Length of @envp.
 * @envp: Pointer to "struct tomoyo_envp".
 *
 * Reads the NUL-terminated strings starting at @ee->bprm->p one page at a
 * time via tomoyo_dump_page(), re-encodes each string into @ee->tmp and
 * checks it: the first bprm->argc strings against @argv using tomoyo_argv(),
 * the following bprm->envc strings against @envp using tomoyo_envp().
 * Afterwards, conditions that never got a chance to be checked are evaluated.
 *
 * Returns true on success, false otherwise (which includes failure to
 * allocate the "checked" array and failure of tomoyo_dump_page()).
 */
static bool tomoyo_scan_bprm(struct tomoyo_execve *ee,
                             const u16 argc, const struct tomoyo_argv *argv,
                             const u16 envc, const struct tomoyo_envp *envp)
{
        struct linux_binprm *bprm = ee->bprm;
        struct tomoyo_page_dump *dump = &ee->dump;
        char *arg_ptr = ee->tmp;
        int arg_len = 0;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        bool result = true;
        u8 local_checked[32];
        u8 *checked;

        /*
         * checked[0..argc-1] belong to @argv, checked[argc..argc+envc-1] to
         * @envp. The on-stack array is used when it is large enough.
         */
        if (argc + envc <= sizeof(local_checked)) {
                checked = local_checked;
                memset(local_checked, 0, sizeof(local_checked));
        } else {
                checked = kzalloc(argc + envc, GFP_NOFS);
                if (!checked)
                        return false;
        }
        while (argv_count || envp_count) {
                if (!tomoyo_dump_page(bprm, pos, dump)) {
                        result = false;
                        goto out;
                }
                /* Advance to the start of the next page for the next dump. */
                pos += PAGE_SIZE - offset;
                while (offset < PAGE_SIZE) {
                        /* Read. */
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];

                        /*
                         * Append c to the current string: a backslash is
                         * doubled, bytes above ' ' and below 127 are copied
                         * verbatim, anything else becomes a \ooo octal
                         * escape. Once arg_len reaches
                         * TOMOYO_EXEC_TMPSIZE - 10 further bytes are dropped.
                         */
                        if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) {
                                if (c == '\\') {
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = '\\';
                                } else if (c > ' ' && c < 127) {
                                        arg_ptr[arg_len++] = c;
                                } else {
                                        arg_ptr[arg_len++] = '\\';
                                        arg_ptr[arg_len++] = (c >> 6) + '0';
                                        arg_ptr[arg_len++] =
                                                ((c >> 3) & 7) + '0';
                                        arg_ptr[arg_len++] = (c & 7) + '0';
                                }
                        } else {
                                arg_ptr[arg_len] = '\0';
                        }
                        /* Keep reading until the string's terminating NUL. */
                        if (c)
                                continue;
                        /* Check. */
                        if (argv_count) {
                                /* bprm->argc - argv_count is this argument's index. */
                                if (!tomoyo_argv(bprm->argc - argv_count,
                                                 arg_ptr, argc, argv,
                                                 checked)) {
                                        result = false;
                                        break;
                                }
                                argv_count--;
                        } else if (envp_count) {
                                char *cp = strchr(arg_ptr, '=');

                                /* Strings without '=' are counted but not checked. */
                                if (cp) {
                                        /* Split into name and value in place. */
                                        *cp = '\0';
                                        if (!tomoyo_envp(arg_ptr, cp + 1,
                                                         envc, envp,
                                                         checked + argc)) {
                                                result = false;
                                                break;
                                        }
                                }
                                envp_count--;
                        } else {
                                break;
                        }
                        /* Start collecting the next string. */
                        arg_len = 0;
                }
                offset = 0;
                if (!result)
                        break;
        }
out:
        if (result) {
                int i;

                /* Check not-yet-checked entries. */
                for (i = 0; i < argc; i++) {
                        if (checked[i])
                                continue;
                        /*
                         * Return true only if all unchecked indexes in
                         * bprm->argv[] are not matched.
                         */
                        if (argv[i].is_not)
                                continue;
                        result = false;
                        break;
                }
                for (i = 0; i < envc; envp++, i++) {
                        if (checked[argc + i])
                                continue;
                        /*
                         * Return true only if all unchecked environ variables
                         * in bprm->envp[] are either undefined or not matched.
                         */
                        if ((!envp->value && !envp->is_not) ||
                            (envp->value && envp->is_not))
                                continue;
                        result = false;
                        break;
                }
        }
        /* Only the kzalloc()ed array needs to be released. */
        if (checked != local_checked)
                kfree(checked);
        return result;
}

/**
 * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition".
 *
 * @file:  Pointer to "struct file". May be NULL.
 * @ptr:   Pointer to "struct tomoyo_name_union" to compare against.
 * @match: True if "exec.realpath=", false if "exec.realpath!=".
 *
 * Returns true if the comparison result agrees with @match, false otherwise.
 * Also returns false when @file is NULL or its pathname cannot be obtained.
 */
static bool tomoyo_scan_exec_realpath(struct file *file,
                                      const struct tomoyo_name_union *ptr,
                                      const bool match)
{
        struct tomoyo_path_info exe;
        bool same;

        exe.name = file ? tomoyo_realpath_from_path(&file->f_path) : NULL;
        if (!exe.name)
                return false;
        tomoyo_fill_path_info(&exe);
        same = tomoyo_compare_name_union(&exe, ptr);
        /* The pathname buffer is released here once the comparison is done. */
        kfree(exe.name);
        return match ? same : !same;
}

/**
 * tomoyo_get_dqword - tomoyo_get_name() for a quoted string.
 *
 * @start: String to save. Must be enclosed in double quotes; the closing
 *         quote is overwritten with '\0' when both quotes are present.
 *
 * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise.
 */
static const struct tomoyo_path_info *tomoyo_get_dqword(char *start)
{
        const size_t len = strlen(start);
        char *word;

        /* At least the opening and the closing quote must be present. */
        if (len < 2 || start[0] != '"' || start[len - 1] != '"')
                return NULL;
        start[len - 1] = '\0';
        word = start + 1;
        /* An empty quoted word is accepted without validation. */
        if (*word && !tomoyo_correct_word(word))
                return NULL;
        return tomoyo_get_name(word);
}

/**
 * tomoyo_parse_name_union_quoted - Parse a quoted word.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @ptr:   Pointer to "struct tomoyo_name_union".
 *
 * A word starting with '@' is handed to tomoyo_parse_name_union(); anything
 * else is parsed as a double-quoted word by tomoyo_get_dqword().
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param,
                                           struct tomoyo_name_union *ptr)
{
        if (param->data[0] == '@')
                return tomoyo_parse_name_union(param, ptr);
        ptr->filename = tomoyo_get_dqword(param->data);
        if (!ptr->filename)
                return false;
        return true;
}

/**
 * tomoyo_parse_argv - Parse an argv[] condition part.
 *
 * @left:  Lefthand value; expected to be a decimal index followed by "]".
 * @right: Righthand value; expected to be a double-quoted word.
 * @argv:  Pointer to "struct tomoyo_argv" that receives index and value.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_argv(char *left, char *right,
                              struct tomoyo_argv *argv)
{
        /* tomoyo_parse_ulong() advances @left past the number it consumed. */
        if (tomoyo_parse_ulong(&argv->index, &left) !=
            TOMOYO_VALUE_TYPE_DECIMAL)
                return false;
        /* Exactly one ']' must follow, with nothing after it. */
        if (left[0] != ']' || left[1] != '\0')
                return false;
        argv->value = tomoyo_get_dqword(right);
        if (!argv->value)
                return false;
        return true;
}

/**
 * tomoyo_parse_envp - Parse an envp[] condition part.
 *
 * @left:  Lefthand value; the variable name followed by the two characters
 *         '"' and ']'. The '"' is overwritten with '\0' during parsing.
 * @right: Righthand value; either the literal "NULL" or a double-quoted word.
 * @envp:  Pointer to "struct tomoyo_envp" that receives name and value.
 *
 * On success @envp->name holds the name and @envp->value holds the value
 * (NULL when @right is "NULL"). On failure @envp is left untouched and no
 * name reference is kept.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_parse_envp(char *left, char *right,
                              struct tomoyo_envp *envp)
{
        const struct tomoyo_path_info *name;
        const struct tomoyo_path_info *value;
        const size_t len = strlen(left);
        char *cp;

        /*
         * The terminating '"' and ']' must both lie inside @left. Without
         * this length check an empty or one-character @left made the code
         * below read (and for "]" also overwrite) the byte in front of
         * @left.
         */
        if (len < 2)
                goto out;
        cp = left + len - 1;
        if (*cp-- != ']' || *cp != '"')
                goto out;
        /* Cut off the closing quote so that @left is just the name. */
        *cp = '\0';
        if (!tomoyo_correct_word(left))
                goto out;
        name = tomoyo_get_name(left);
        if (!name)
                goto out;
        if (!strcmp(right, "NULL")) {
                value = NULL;
        } else {
                value = tomoyo_get_dqword(right);
                if (!value) {
                        /* Drop the name reference obtained above. */
                        tomoyo_put_name(name);
                        goto out;
                }
        }
        envp->name = name;
        envp->value = value;
        return true;
out:
        return false;
}

/**
 * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry.
 *
 * @a: Pointer to "struct tomoyo_condition".
 * @b: Pointer to "struct tomoyo_condition".
 *
 * Compares the scalar header fields first and then the a->size - sizeof(*a)
 * bytes stored directly behind each structure.
 *
 * Returns true if @a == @b, false otherwise.
 */
static inline bool tomoyo_same_condition(const struct tomoyo_condition *a,
                                         const struct tomoyo_condition *b)
{
        if (a->size != b->size || a->condc != b->condc)
                return false;
        if (a->numbers_count != b->numbers_count ||
            a->names_count != b->names_count)
                return false;
        if (a->argc != b->argc || a->envc != b->envc)
                return false;
        if (a->grant_log != b->grant_log || a->transit != b->transit)
                return false;
        /* Sizes are known to be equal here, so a->size covers both. */
        return memcmp(a + 1, b + 1, a->size - sizeof(*a)) == 0;
}

/**
 * tomoyo_condition_type - Get condition type.
 *
 * @word: Keyword string.
 *
 * Looks @word up in tomoyo_condition_keyword[] by exact string comparison.
 *
 * Returns one of values in "enum tomoyo_conditions_index" on success,
 * TOMOYO_MAX_CONDITION_KEYWORD otherwise.
 */
static u8 tomoyo_condition_type(const char *word)
{
        u8 type = 0;

        while (type < TOMOYO_MAX_CONDITION_KEYWORD &&
               strcmp(word, tomoyo_condition_keyword[type]))
                type++;
        return type;
}

/*
 * Define this to enable debug mode: dprintk() then expands to printk(),
 * otherwise it expands to an empty statement and its arguments are dropped.
 */
/* #define DEBUG_CONDITION */

#ifdef DEBUG_CONDITION
#define dprintk printk
#else
#define dprintk(...) do { } while (0)
#endif

/**
 * tomoyo_commit_condition - Commit "struct tomoyo_condition".
 *
 * @entry: Pointer to "struct tomoyo_condition".
 *
 * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise.
 *
 * If an identical entry is already on tomoyo_condition_list (and is not
 * being garbage collected), its user count is incremented, @entry is
 * destroyed and the existing entry is returned. Otherwise @entry itself is
 * added to the list and returned. NULL is returned, and @entry destroyed,
 * when taking tomoyo_policy_lock is interrupted or when @entry is not
 * duplicated but memory quota for policy has exceeded.
 */
static struct tomoyo_condition *tomoyo_commit_condition
(struct tomoyo_condition *entry)
{
        struct tomoyo_condition *cur;
        struct tomoyo_condition *committed = NULL;
        bool discard_entry = true;

        if (mutex_lock_interruptible(&tomoyo_policy_lock)) {
                dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__);
                goto discard;
        }
        list_for_each_entry(cur, &tomoyo_condition_list, head.list) {
                if (tomoyo_same_condition(cur, entry) &&
                    atomic_read(&cur->head.users) != TOMOYO_GC_IN_PROGRESS) {
                        /* Same entry found. Share this entry. */
                        atomic_inc(&cur->head.users);
                        committed = cur;
                        break;
                }
        }
        if (!committed && tomoyo_memory_ok(entry)) {
                /* No duplicate: @entry itself becomes the list member. */
                atomic_set(&entry->head.users, 1);
                list_add(&entry->head.list, &tomoyo_condition_list);
                committed = entry;
                discard_entry = false;
        }
        mutex_unlock(&tomoyo_policy_lock);
discard:
        if (discard_entry) {
                tomoyo_del_condition(&entry->head.list);
                kfree(entry);
        }
        return committed;
}

/**
 * tomoyo_get_transit_preference - Parse domain transition preference for execve().
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 * @e:     Pointer to "struct tomoyo_condition".
 *
 * If @param->data starts with a domainname ('<'), a correct path or one of
 * the keywords "keep", "initialize", "reset", "child" and "parent", that
 * first token is stored in @e->transit and consumed from @param.
 *
 * Returns the condition string part.
 */
static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param,
                                           struct tomoyo_condition *e)
{
        char * const head = param->data;

        if (*head == '<') {
                e->transit = tomoyo_get_domainname(param);
        } else {
                char *sep = strchr(head, ' ');
                bool is_preference;

                /* Temporarily terminate the first token to examine it. */
                if (sep)
                        *sep = '\0';
                is_preference = tomoyo_correct_path(head) ||
                        !strcmp(head, "keep") ||
                        !strcmp(head, "initialize") ||
                        !strcmp(head, "reset") ||
                        !strcmp(head, "child") ||
                        !strcmp(head, "parent");
                if (sep)
                        *sep = ' ';
                /* No preference given: the whole string is the condition. */
                if (!is_preference)
                        return head;
                e->transit = tomoyo_get_name(tomoyo_read_token(param));
        }
        /*
         * Return a bad read-only condition string that will let
         * tomoyo_get_condition() return NULL.
         */
        if (!e->transit)
                return "/";
        return param->data;
}

/**
 * tomoyo_get_condition - Parse condition part.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * The condition string is scanned twice. The first pass ("dry run", while
 * "entry" is still NULL) only counts how many elements of each kind are
 * needed and accumulates the counts in the on-stack "e". After that, one
 * buffer big enough for the header and all trailing arrays is allocated,
 * the string (which the scan cut into pieces with '\0') is restored, and the
 * scan is repeated via the "rerun" label. The second pass fills the arrays
 * and counts the same fields in "e" back down, so that all of them must be
 * zero again when the pass finishes.
 *
 * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise.
 */
struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param)
{
        /* All of these stay NULL during the dry run; that is how the */
        /* loop below tells the counting pass from the filling pass.  */
        struct tomoyo_condition *entry = NULL;
        struct tomoyo_condition_element *condp = NULL;
        struct tomoyo_number_union *numbers_p = NULL;
        struct tomoyo_name_union *names_p = NULL;
        struct tomoyo_argv *argv = NULL;
        struct tomoyo_envp *envp = NULL;
        struct tomoyo_condition e = { };
        /*
         * May be the read-only string "/" on failure. That string contains
         * neither ' ' nor '=', so the loop below bails out to "out" before
         * it would write to it.
         */
        char * const start_of_string =
                tomoyo_get_transit_preference(param, &e);
        /* Remembered up front because the scan inserts '\0' bytes. */
        char * const end_of_string = start_of_string + strlen(start_of_string);
        char *pos;

rerun:
        pos = start_of_string;
        while (1) {
                /* (u8)-1 until a keyword or union type is determined. */
                u8 left = -1;
                u8 right = -1;
                char *left_word = pos;
                char *cp;
                char *right_word;
                bool is_not;

                if (!*left_word)
                        break;
                /*
                 * Since left-hand condition does not allow use of "path_group"
                 * or "number_group" and environment variable's names do not
                 * accept '=', it is guaranteed that the original line consists
                 * of one or more repetition of $left$operator$right blocks
                 * where "$left is free from '=' and ' '" and "$operator is
                 * either '=' or '!='" and "$right is free from ' '".
                 * Therefore, we can reconstruct the original line at the end
                 * of dry run even if we overwrite $operator with '\0'.
                 */
                cp = strchr(pos, ' ');
                if (cp) {
                        *cp = '\0'; /* Will restore later. */
                        pos = cp + 1;
                } else {
                        /* Last block: make the next iteration terminate. */
                        pos = "";
                }
                right_word = strchr(left_word, '=');
                /* Reject a block without '=' or with an empty left side. */
                if (!right_word || right_word == left_word)
                        goto out;
                is_not = *(right_word - 1) == '!';
                /*
                 * Terminate $left at the operator ('!' of "!=" or the '='
                 * itself) and step right_word onto the first byte of $right.
                 * A plain '=' directly followed by another '=' is rejected.
                 */
                if (is_not)
                        *(right_word++ - 1) = '\0'; /* Will restore later. */
                else if (*(right_word + 1) != '=')
                        *right_word++ = '\0'; /* Will restore later. */
                else
                        goto out;
                dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word,
                        is_not ? "!" : "", right_word);
                /*
                 * "grant_log" is a setting stored in the header rather than
                 * a condition element: it is not counted, and it is only
                 * evaluated on the second pass. It may be given once (the
                 * value must still be TOMOYO_GRANTLOG_AUTO) and only with '='.
                 */
                if (!strcmp(left_word, "grant_log")) {
                        if (entry) {
                                if (is_not ||
                                    entry->grant_log != TOMOYO_GRANTLOG_AUTO)
                                        goto out;
                                else if (!strcmp(right_word, "yes"))
                                        entry->grant_log = TOMOYO_GRANTLOG_YES;
                                else if (!strcmp(right_word, "no"))
                                        entry->grant_log = TOMOYO_GRANTLOG_NO;
                                else
                                        goto out;
                        }
                        continue;
                }
                /* exec.argv[...]: count on dry run, parse on second pass. */
                if (!strncmp(left_word, "exec.argv[", 10)) {
                        if (!argv) {
                                e.argc++;
                                e.condc++;
                        } else {
                                e.argc--;
                                e.condc--;
                                left = TOMOYO_ARGV_ENTRY;
                                argv->is_not = is_not;
                                if (!tomoyo_parse_argv(left_word + 10,
                                                       right_word, argv++))
                                        goto out;
                        }
                        goto store_value;
                }
                /* exec.envp["...: handled the same way as exec.argv[. */
                if (!strncmp(left_word, "exec.envp[\"", 11)) {
                        if (!envp) {
                                e.envc++;
                                e.condc++;
                        } else {
                                e.envc--;
                                e.condc--;
                                left = TOMOYO_ENVP_ENTRY;
                                envp->is_not = is_not;
                                if (!tomoyo_parse_envp(left_word + 11,
                                                       right_word, envp++))
                                        goto out;
                        }
                        goto store_value;
                }
                left = tomoyo_condition_type(left_word);
                dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word,
                        left);
                /*
                 * TOMOYO_MAX_CONDITION_KEYWORD means $left is not a known
                 * keyword, so it is taken as a number union. A leading '@'
                 * is refused on the left-hand side.
                 */
                if (left == TOMOYO_MAX_CONDITION_KEYWORD) {
                        if (!numbers_p) {
                                e.numbers_count++;
                        } else {
                                e.numbers_count--;
                                left = TOMOYO_NUMBER_UNION;
                                param->data = left_word;
                                if (*left_word == '@' ||
                                    !tomoyo_parse_number_union(param,
                                                               numbers_p++))
                                        goto out;
                        }
                }
                /* Every block reaching this point is one condition element. */
                if (!condp)
                        e.condc++;
                else
                        e.condc--;
                /* These two keywords take a name union on the right side. */
                if (left == TOMOYO_EXEC_REALPATH ||
                    left == TOMOYO_SYMLINK_TARGET) {
                        if (!names_p) {
                                e.names_count++;
                        } else {
                                e.names_count--;
                                right = TOMOYO_NAME_UNION;
                                param->data = right_word;
                                if (!tomoyo_parse_name_union_quoted(param,
                                                                    names_p++))
                                        goto out;
                        }
                        goto store_value;
                }
                /* Otherwise $right is a keyword or, failing that, a number union. */
                right = tomoyo_condition_type(right_word);
                if (right == TOMOYO_MAX_CONDITION_KEYWORD) {
                        if (!numbers_p) {
                                e.numbers_count++;
                        } else {
                                e.numbers_count--;
                                right = TOMOYO_NUMBER_UNION;
                                param->data = right_word;
                                if (!tomoyo_parse_number_union(param,
                                                               numbers_p++))
                                        goto out;
                        }
                }
store_value:
                /* Nothing to store during the dry run. */
                if (!condp) {
                        dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n",
                                __LINE__, left, right, !is_not);
                        continue;
                }
                condp->left = left;
                condp->right = right;
                condp->equals = !is_not;
                dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n",
                        __LINE__, condp->left, condp->right,
                        condp->equals);
                condp++;
        }
        dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n",
                __LINE__, e.condc, e.numbers_count, e.names_count, e.argc,
                e.envc);
        /* End of the second pass: every counter must be back at zero. */
        if (entry) {
                BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc |
                       e.condc);
                return tomoyo_commit_condition(entry);
        }
        /* End of the dry run: size the header plus all trailing arrays. */
        e.size = sizeof(*entry)
                + e.condc * sizeof(struct tomoyo_condition_element)
                + e.numbers_count * sizeof(struct tomoyo_number_union)
                + e.names_count * sizeof(struct tomoyo_name_union)
                + e.argc * sizeof(struct tomoyo_argv)
                + e.envc * sizeof(struct tomoyo_envp);
        entry = kzalloc(e.size, GFP_NOFS);
        if (!entry)
                goto out2;
        *entry = e;
        /*
         * The transit pointer now lives in *entry; clear the local copy so
         * that the tomoyo_put_name() at "out2" is not applied to it as well.
         */
        e.transit = NULL;
        /* Lay the arrays out back to back right behind the header. */
        condp = (struct tomoyo_condition_element *) (entry + 1);
        numbers_p = (struct tomoyo_number_union *) (condp + e.condc);
        names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count);
        argv = (struct tomoyo_argv *) (names_p + e.names_count);
        envp = (struct tomoyo_envp *) (argv + e.argc);
        {
                /*
                 * Undo the cuts made by the dry run. '\0' bytes alternate
                 * between an operator position and a block-separating space,
                 * starting with an operator. An operator '\0' directly
                 * followed by '=' was the '!' of "!="; otherwise it was '='.
                 */
                bool flag = false;

                for (pos = start_of_string; pos < end_of_string; pos++) {
                        if (*pos)
                                continue;
                        if (flag) /* Restore " ". */
                                *pos = ' ';
                        else if (*(pos + 1) == '=') /* Restore "!=". */
                                *pos = '!';
                        else /* Restore "=". */
                                *pos = '=';
                        flag = !flag;
                }
        }
        goto rerun;
out:
        dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__);
        /* entry is non-NULL only if the failure happened on the second pass. */
        if (entry) {
                tomoyo_del_condition(&entry->head.list);
                kfree(entry);
        }
out2:
        /*
         * NOTE(review): e.transit is NULL here whenever entry was allocated,
         * so tomoyo_put_name() presumably accepts NULL -- confirm.
         */
        tomoyo_put_name(e.transit);
        return NULL;
}

/**
 * tomoyo_get_attributes - Revalidate "struct inode".
 *
 * @obj: Pointer to "struct tomoyo_obj_info".
 *
 * Walks all TOMOYO_MAX_PATH_STAT slots and, for every slot whose dentry has
 * a backing inode, copies that inode's attributes into @obj->stat[] and
 * marks the slot in @obj->stat_valid[].
 *
 * Returns nothing.
 */
void tomoyo_get_attributes(struct tomoyo_obj_info *obj)
{
        struct dentry *cur = NULL;
        u8 slot;

        for (slot = 0; slot < TOMOYO_MAX_PATH_STAT; slot++) {
                const bool own_path = slot == TOMOYO_PATH1 ||
                                      slot == TOMOYO_PATH2;
                struct inode *node;

                if (slot == TOMOYO_PATH1)
                        cur = obj->path1.dentry;
                else if (slot == TOMOYO_PATH2)
                        cur = obj->path2.dentry;
                /*
                 * For the remaining slots "cur" is whatever the previous
                 * iteration left behind; a missing dentry skips the slot.
                 */
                if (!cur)
                        continue;
                if (!own_path)
                        cur = dget_parent(cur);
                node = d_backing_inode(cur);
                if (node) {
                        struct tomoyo_mini_stat *out = &obj->stat[slot];

                        out->uid  = node->i_uid;
                        out->gid  = node->i_gid;
                        out->ino  = node->i_ino;
                        out->mode = node->i_mode;
                        out->dev  = node->i_sb->s_dev;
                        out->rdev = node->i_rdev;
                        obj->stat_valid[slot] = true;
                }
                /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */
                if (slot & 1)
                        dput(cur);
        }
}

/**
 * tomoyo_condition - Check condition part.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @cond: Pointer to "struct tomoyo_condition". Maybe NULL.
 *
 * Evaluates every condition element stored behind @cond in order. The
 * first element that does not hold makes the whole check fail. Elements
 * that refer to argv[]/envp[] are skipped in the main loop and are checked
 * together at the end via tomoyo_scan_bprm().
 *
 * Returns true on success, false otherwise. A NULL @cond always succeeds.
 *
 * Caller holds tomoyo_read_lock().
 */
bool tomoyo_condition(struct tomoyo_request_info *r,
                      const struct tomoyo_condition *cond)
{
        u32 i;
        /* Value ranges of the two operands: index 0 is left, 1 is right. */
        unsigned long min_v[2] = { 0, 0 };
        unsigned long max_v[2] = { 0, 0 };
        const struct tomoyo_condition_element *condp;
        const struct tomoyo_number_union *numbers_p;
        const struct tomoyo_name_union *names_p;
        const struct tomoyo_argv *argv;
        const struct tomoyo_envp *envp;
        struct tomoyo_obj_info *obj;
        u16 condc;
        u16 argc;
        u16 envc;
        struct linux_binprm *bprm = NULL;

        if (!cond)
                return true;
        condc = cond->condc;
        argc = cond->argc;
        envc = cond->envc;
        obj = r->obj;
        if (r->ee)
                bprm = r->ee->bprm;
        /* argv[]/envp[] conditions cannot hold without a bprm. */
        if (!bprm && (argc || envc))
                return false;
        /*
         * The arrays follow the header back to back: elements, number
         * unions, name unions, argv entries, envp entries. The cursors
         * below are advanced as elements consume entries.
         */
        condp = (struct tomoyo_condition_element *) (cond + 1);
        numbers_p = (const struct tomoyo_number_union *) (condp + condc);
        names_p = (const struct tomoyo_name_union *)
                (numbers_p + cond->numbers_count);
        argv = (const struct tomoyo_argv *) (names_p + cond->names_count);
        envp = (const struct tomoyo_envp *) (argv + argc);
        for (i = 0; i < condc; i++) {
                /* true for "=", false for "!=". */
                const bool match = condp->equals;
                const u8 left = condp->left;
                const u8 right = condp->right;
                bool is_bitop[2] = { false, false };
                u8 j;

                condp++;
                /* Check argv[] and envp[] later. */
                if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY)
                        continue;
                /* Check string expressions. */
                if (right == TOMOYO_NAME_UNION) {
                        /* Each such element consumes one name union. */
                        const struct tomoyo_name_union *ptr = names_p++;
                        struct tomoyo_path_info *symlink;
                        struct tomoyo_execve *ee;
                        struct file *file;

                        switch (left) {
                        case TOMOYO_SYMLINK_TARGET:
                                symlink = obj ? obj->symlink_target : NULL;
                                /*
                                 * Fails without a symlink target, or when
                                 * the comparison result is the opposite of
                                 * what the operator asks for.
                                 */
                                if (!symlink ||
                                    !tomoyo_compare_name_union(symlink, ptr)
                                    == match)
                                        goto out;
                                break;
                        case TOMOYO_EXEC_REALPATH:
                                ee = r->ee;
                                file = ee ? ee->bprm->file : NULL;
                                if (!tomoyo_scan_exec_realpath(file, ptr,
                                                               match))
                                        goto out;
                                break;
                        }
                        continue;
                }
                /* Check numeric or bit-op expressions. */
                /* j == 0 resolves the left operand, j == 1 the right one. */
                for (j = 0; j < 2; j++) {
                        const u8 index = j ? right : left;
                        unsigned long value = 0;

                        switch (index) {
                        /* Credentials of the current task. */
                        case TOMOYO_TASK_UID:
                                value = from_kuid(&init_user_ns, current_uid());
                                break;
                        case TOMOYO_TASK_EUID:
                                value = from_kuid(&init_user_ns, current_euid());
                                break;
                        case TOMOYO_TASK_SUID:
                                value = from_kuid(&init_user_ns, current_suid());
                                break;
                        case TOMOYO_TASK_FSUID:
                                value = from_kuid(&init_user_ns, current_fsuid());
                                break;
                        case TOMOYO_TASK_GID:
                                value = from_kgid(&init_user_ns, current_gid());
                                break;
                        case TOMOYO_TASK_EGID:
                                value = from_kgid(&init_user_ns, current_egid());
                                break;
                        case TOMOYO_TASK_SGID:
                                value = from_kgid(&init_user_ns, current_sgid());
                                break;
                        case TOMOYO_TASK_FSGID:
                                value = from_kgid(&init_user_ns, current_fsgid());
                                break;
                        case TOMOYO_TASK_PID:
                                value = tomoyo_sys_getpid();
                                break;
                        case TOMOYO_TASK_PPID:
                                value = tomoyo_sys_getppid();
                                break;
                        /* Keywords that stand for a fixed file type constant. */
                        case TOMOYO_TYPE_IS_SOCKET:
                                value = S_IFSOCK;
                                break;
                        case TOMOYO_TYPE_IS_SYMLINK:
                                value = S_IFLNK;
                                break;
                        case TOMOYO_TYPE_IS_FILE:
                                value = S_IFREG;
                                break;
                        case TOMOYO_TYPE_IS_BLOCK_DEV:
                                value = S_IFBLK;
                                break;
                        case TOMOYO_TYPE_IS_DIRECTORY:
                                value = S_IFDIR;
                                break;
                        case TOMOYO_TYPE_IS_CHAR_DEV:
                                value = S_IFCHR;
                                break;
                        case TOMOYO_TYPE_IS_FIFO:
                                value = S_IFIFO;
                                break;
                        /*
                         * Keywords that stand for a single mode bit; they
                         * are flagged as bit operations further below.
                         */
                        case TOMOYO_MODE_SETUID:
                                value = S_ISUID;
                                break;
                        case TOMOYO_MODE_SETGID:
                                value = S_ISGID;
                                break;
                        case TOMOYO_MODE_STICKY:
                                value = S_ISVTX;
                                break;
                        case TOMOYO_MODE_OWNER_READ:
                                value = 0400;
                                break;
                        case TOMOYO_MODE_OWNER_WRITE:
                                value = 0200;
                                break;
                        case TOMOYO_MODE_OWNER_EXECUTE:
                                value = 0100;
                                break;
                        case TOMOYO_MODE_GROUP_READ:
                                value = 0040;
                                break;
                        case TOMOYO_MODE_GROUP_WRITE:
                                value = 0020;
                                break;
                        case TOMOYO_MODE_GROUP_EXECUTE:
                                value = 0010;
                                break;
                        case TOMOYO_MODE_OTHERS_READ:
                                value = 0004;
                                break;
                        case TOMOYO_MODE_OTHERS_WRITE:
                                value = 0002;
                                break;
                        case TOMOYO_MODE_OTHERS_EXECUTE:
                                value = 0001;
                                break;
                        /* Argument/environment counts need a bprm. */
                        case TOMOYO_EXEC_ARGC:
                                if (!bprm)
                                        goto out;
                                value = bprm->argc;
                                break;
                        case TOMOYO_EXEC_ENVC:
                                if (!bprm)
                                        goto out;
                                value = bprm->envc;
                                break;
                        case TOMOYO_NUMBER_UNION:
                                /* Fetch values later. */
                                break;
                        default:
                                /*
                                 * Anything else must be a path attribute
                                 * keyword, which needs obj and its cached
                                 * stat data (fetched once per obj).
                                 */
                                if (!obj)
                                        goto out;
                                if (!obj->validate_done) {
                                        tomoyo_get_attributes(obj);
                                        obj->validate_done = true;
                                }
                                {
                                        u8 stat_index;
                                        struct tomoyo_mini_stat *stat;

                                        /* Pick the stat slot the keyword refers to. */
                                        switch (index) {
                                        case TOMOYO_PATH1_UID:
                                        case TOMOYO_PATH1_GID:
                                        case TOMOYO_PATH1_INO:
                                        case TOMOYO_PATH1_MAJOR:
                                        case TOMOYO_PATH1_MINOR:
                                        case TOMOYO_PATH1_TYPE:
                                        case TOMOYO_PATH1_DEV_MAJOR:
                                        case TOMOYO_PATH1_DEV_MINOR:
                                        case TOMOYO_PATH1_PERM:
                                                stat_index = TOMOYO_PATH1;
                                                break;
                                        case TOMOYO_PATH2_UID:
                                        case TOMOYO_PATH2_GID:
                                        case TOMOYO_PATH2_INO:
                                        case TOMOYO_PATH2_MAJOR:
                                        case TOMOYO_PATH2_MINOR:
                                        case TOMOYO_PATH2_TYPE:
                                        case TOMOYO_PATH2_DEV_MAJOR:
                                        case TOMOYO_PATH2_DEV_MINOR:
                                        case TOMOYO_PATH2_PERM:
                                                stat_index = TOMOYO_PATH2;
                                                break;
                                        case TOMOYO_PATH1_PARENT_UID:
                                        case TOMOYO_PATH1_PARENT_GID:
                                        case TOMOYO_PATH1_PARENT_INO:
                                        case TOMOYO_PATH1_PARENT_PERM:
                                                stat_index =
                                                        TOMOYO_PATH1_PARENT;
                                                break;
                                        case TOMOYO_PATH2_PARENT_UID:
                                        case TOMOYO_PATH2_PARENT_GID:
                                        case TOMOYO_PATH2_PARENT_INO:
                                        case TOMOYO_PATH2_PARENT_PERM:
                                                stat_index =
                                                        TOMOYO_PATH2_PARENT;
                                                break;
                                        default:
                                                /* Unknown keyword value. */
                                                goto out;
                                        }
                                        /* No attributes were recorded for that slot. */
                                        if (!obj->stat_valid[stat_index])
                                                goto out;
                                        stat = &obj->stat[stat_index];
                                        /* Pick the attribute the keyword refers to. */
                                        switch (index) {
                                        case TOMOYO_PATH1_UID:
                                        case TOMOYO_PATH2_UID:
                                        case TOMOYO_PATH1_PARENT_UID:
                                        case TOMOYO_PATH2_PARENT_UID:
                                                value = from_kuid(&init_user_ns, stat->uid);
                                                break;
                                        case TOMOYO_PATH1_GID:
                                        case TOMOYO_PATH2_GID:
                                        case TOMOYO_PATH1_PARENT_GID:
                                        case TOMOYO_PATH2_PARENT_GID:
                                                value = from_kgid(&init_user_ns, stat->gid);
                                                break;
                                        case TOMOYO_PATH1_INO:
                                        case TOMOYO_PATH2_INO:
                                        case TOMOYO_PATH1_PARENT_INO:
                                        case TOMOYO_PATH2_PARENT_INO:
                                                value = stat->ino;
                                                break;
                                        case TOMOYO_PATH1_MAJOR:
                                        case TOMOYO_PATH2_MAJOR:
                                                value = MAJOR(stat->dev);
                                                break;
                                        case TOMOYO_PATH1_MINOR:
                                        case TOMOYO_PATH2_MINOR:
                                                value = MINOR(stat->dev);
                                                break;
                                        case TOMOYO_PATH1_TYPE:
                                        case TOMOYO_PATH2_TYPE:
                                                value = stat->mode & S_IFMT;
                                                break;
                                        case TOMOYO_PATH1_DEV_MAJOR:
                                        case TOMOYO_PATH2_DEV_MAJOR:
                                                value = MAJOR(stat->rdev);
                                                break;
                                        case TOMOYO_PATH1_DEV_MINOR:
                                        case TOMOYO_PATH2_DEV_MINOR:
                                                value = MINOR(stat->rdev);
                                                break;
                                        case TOMOYO_PATH1_PERM:
                                        case TOMOYO_PATH2_PERM:
                                        case TOMOYO_PATH1_PARENT_PERM:
                                        case TOMOYO_PATH2_PARENT_PERM:
                                                value = stat->mode & S_IALLUGO;
                                                break;
                                        }
                                }
                                break;
                        }
                        /* A single value is the range [value, value]. */
                        max_v[j] = value;
                        min_v[j] = value;
                        /* Remember whether this operand is a mode bit keyword. */
                        switch (index) {
                        case TOMOYO_MODE_SETUID:
                        case TOMOYO_MODE_SETGID:
                        case TOMOYO_MODE_STICKY:
                        case TOMOYO_MODE_OWNER_READ:
                        case TOMOYO_MODE_OWNER_WRITE:
                        case TOMOYO_MODE_OWNER_EXECUTE:
                        case TOMOYO_MODE_GROUP_READ:
                        case TOMOYO_MODE_GROUP_WRITE:
                        case TOMOYO_MODE_GROUP_EXECUTE:
                        case TOMOYO_MODE_OTHERS_READ:
                        case TOMOYO_MODE_OTHERS_WRITE:
                        case TOMOYO_MODE_OTHERS_EXECUTE:
                                is_bitop[j] = true;
                        }
                }
                /*
                 * Number unions are consumed in element order, left operand
                 * before right operand.
                 */
                if (left == TOMOYO_NUMBER_UNION) {
                        /* Fetch values now. */
                        const struct tomoyo_number_union *ptr = numbers_p++;

                        min_v[0] = ptr->values[0];
                        max_v[0] = ptr->values[1];
                }
                if (right == TOMOYO_NUMBER_UNION) {
                        /* Fetch values now. */
                        const struct tomoyo_number_union *ptr = numbers_p++;

                        /*
                         * With a group, the left range is tested against the
                         * group; otherwise the two ranges are tested for
                         * overlap. Either way the outcome has to agree with
                         * the operator.
                         */
                        if (ptr->group) {
                                if (tomoyo_number_matches_group(min_v[0],
                                                                max_v[0],
                                                                ptr->group)
                                    == match)
                                        continue;
                        } else {
                                if ((min_v[0] <= ptr->values[1] &&
                                     max_v[0] >= ptr->values[0]) == match)
                                        continue;
                        }
                        goto out;
                }
                /*
                 * Bit operation is valid only when counterpart value
                 * represents permission.
                 */
                if (is_bitop[0] && is_bitop[1]) {
                        goto out;
                } else if (is_bitop[0]) {
                        switch (right) {
                        case TOMOYO_PATH1_PERM:
                        case TOMOYO_PATH1_PARENT_PERM:
                        case TOMOYO_PATH2_PERM:
                        case TOMOYO_PATH2_PARENT_PERM:
                                /* "=" wants the bit set, "!=" wants it clear. */
                                if (!(max_v[0] & max_v[1]) == !match)
                                        continue;
                        }
                        goto out;
                } else if (is_bitop[1]) {
                        switch (left) {
                        case TOMOYO_PATH1_PERM:
                        case TOMOYO_PATH1_PARENT_PERM:
                        case TOMOYO_PATH2_PERM:
                        case TOMOYO_PATH2_PARENT_PERM:
                                /* "=" wants the bit set, "!=" wants it clear. */
                                if (!(max_v[0] & max_v[1]) == !match)
                                        continue;
                        }
                        goto out;
                }
                /* Normal value range comparison. */
                if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match)
                        continue;
                /* Shared failure exit for every "goto out" above. */
out:
                return false;
        }
        /* Check argv[] and envp[] now. */
        if (r->ee && (argc || envc))
                return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp);
        return true;
}







































































































































































































































  152 










  719 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  285 
    2 






















  286 

  286 
  284 






























































































  246 







  247 





















































































































  710 





  709 



















































   35 




   35 
















































































































  692 










  695 


































  694 














  317 






  317 



















  255 







  256 

  256 





  248 







  637 




















  609 
  608 


  610 





    4 
    4 





















































































































































































  307 





  307 













  306 






  255 
  256 














































































































































  107 



  107 


































































































































































































































































  122 





  122 



















































   50 



   50 



















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Simple NUMA memory policy for the Linux kernel.
 *
 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
 * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
 *
 * NUMA policy allows the user to give hints on which node(s) memory should
 * be allocated.
 *
 * Support six policies per VMA and per process:
 *
 * The VMA policy has priority over the process policy for a page fault.
 *
 * interleave     Allocate memory interleaved over a set of nodes,
 *                with normal fallback if it fails.
 *                For VMA based allocations this interleaves based on the
 *                offset into the backing object or offset into the mapping
 *                for anonymous memory. For process policy a process counter
 *                is used.
 *
 * weighted interleave
 *                Allocate memory interleaved over a set of nodes based on
 *                a set of weights (per-node), with normal fallback if it
 *                fails.  Otherwise operates the same as interleave.
 *                Example: nodeset(0,1) & weights (2,1) - 2 pages allocated
 *                on node 0 for every 1 page allocated on node 1.
 *
 * bind           Only allocate memory on a specific set of nodes,
 *                no fallback.
 *                FIXME: memory is allocated starting with the first node
 *                to the last. It would be better if bind would truly restrict
 *                the allocation to memory nodes instead
 *
 * preferred      Try a specific node first before normal fallback.
 *                As a special case NUMA_NO_NODE here means do the allocation
 *                on the local CPU. This is normally identical to default,
 *                but useful to set in a VMA when you have a non default
 *                process policy.
 *
 * preferred many Try a set of nodes first before normal fallback. This is
 *                similar to preferred without the special case.
 *
 * default        Allocate on the local node first, or when on a VMA
 *                use the process policy. This is what Linux always did
 *                in a NUMA aware kernel and still does by, ahem, default.
 *
 * The process policy is applied for most non interrupt memory allocations
 * in that process' context. Interrupts ignore the policies and always
 * try to allocate on the local CPU. The VMA policy is only applied for memory
 * allocations for a VMA in the VM.
 *
 * Currently there are a few corner cases in swapping where the policy
 * is not applied, but the majority should be handled. When process policy
 * is used it is not remembered over swap outs/swap ins.
 *
 * Only the highest zone in the zone hierarchy gets policied. Allocations
 * requesting a lower zone just use default policy. This implies that
 * on systems with highmem, kernel lowmem allocations don't get policied.
 * Same with GFP_DMA allocations.
 *
 * For shmem/tmpfs shared memory the policy is shared between
 * all users and remembered even when nobody has memory mapped.
 */

/* Notebook:
   fix mmap readahead to honour policy and enable policy for any page cache
   object
   statistics for bigpages
   global policy for page cache? currently it uses process policy. Requires
   first item above.
   handle mremap for shared memory (currently ignored for the policy)
   grows down?
   make bind policy root only? It can trigger oom much faster and the
   kernel is not always grateful with that.
*/

/*
 * Message prefix: expands a format string to KBUILD_MODNAME ": " followed by
 * the format itself. Defined ahead of the includes below, following the usual
 * kernel convention so that the pr_*() helpers pick it up.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mempolicy.h>
#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
#include <linux/ptrace.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>

#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>

#include "internal.h"

/*
 * Internal flags, built from successive bits starting at MPOL_MF_INTERNAL.
 */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)        /* Skip checks for contiguous vmas */
#define MPOL_MF_INVERT       (MPOL_MF_INTERNAL << 1)        /* Invert check for nodemask */
#define MPOL_MF_WRLOCK       (MPOL_MF_INTERNAL << 2)        /* Write-lock walked vmas */

/* Slab cache that struct mempolicy objects are allocated from and freed to. */
static struct kmem_cache *policy_cache;
/* NOTE(review): not used in this part of the file; presumably the cache for shared-policy nodes - confirm. */
static struct kmem_cache *sn_cache;

/*
 * Highest zone. A specific allocation for a zone below that is not
 * policied.
 */
enum zone_type policy_zone = 0;

/*
 * run-time system-wide default policy => local allocation
 *
 * get_task_policy() returns this when the task has no policy of its own and
 * no initialised per-node policy is available.
 */
static struct mempolicy default_policy = {
        .refcnt = ATOMIC_INIT(1), /* never free it */
        .mode = MPOL_LOCAL,
};

/*
 * Per-node fallback policies, indexed by node id. get_task_policy() only
 * hands out an entry whose ->mode is non-zero, i.e. one that has been
 * initialised.
 */
static struct mempolicy preferred_node_policy[MAX_NUMNODES];

/*
 * iw_table is the sysfs-set interleave weight table, indexed by node id.
 * A value of 0 denotes that the system-default value should be used. A NULL
 * iw_table also denotes that system-default values should be used. Until the
 * system-default table is implemented, the system-default is always 1.
 *
 * iw_table is RCU protected: readers must dereference it inside an RCU
 * read-side critical section (see get_il_weight()).
 */
static u8 __rcu *iw_table;
/* NOTE(review): presumably serialises updates of iw_table; the writers are outside this part of the file - confirm. */
static DEFINE_MUTEX(iw_table_lock);

static u8 get_il_weight(int node)
{
        u8 *table;
        u8 weight;

        rcu_read_lock();
        table = rcu_dereference(iw_table);
        /* if no iw_table, use system default */
        weight = table ? table[node] : 1;
        /* if value in iw_table is 0, use system default */
        weight = weight ? weight : 1;
        rcu_read_unlock();
        return weight;
}

/**
 * numa_nearest_node - Find nearest node by state
 * @node: Node id to start the search
 * @state: State to filter the search
 *
 * If @node is not in @state, look up the node in @state that has the
 * smallest distance from @node. On equal distances the node visited first
 * by for_each_node_state() wins.
 *
 * Return: -EINVAL if @state is not a valid node state; @node itself if it is
 * NUMA_NO_NODE or already in @state; otherwise the closest node in @state by
 * distance (or @node when no node is in @state).
 */
int numa_nearest_node(int node, unsigned int state)
{
        int candidate, d;
        int best_node = node;
        int best_dist = INT_MAX;

        if (state >= NR_NODE_STATES)
                return -EINVAL;

        if (node == NUMA_NO_NODE)
                return node;
        if (node_state(node, state))
                return node;

        for_each_node_state(candidate, state) {
                d = node_distance(node, candidate);
                if (d < best_dist) {
                        best_node = candidate;
                        best_dist = d;
                }
        }

        return best_node;
}
EXPORT_SYMBOL_GPL(numa_nearest_node);

/**
 * nearest_node_nodemask - Find the node in @mask at the nearest distance
 *                           from @node.
 *
 * @node: a valid node ID to start the search from.
 * @mask: a pointer to a nodemask representing the allowed nodes.
 *
 * Walks every node set in @mask, computes its distance from @node and keeps
 * the one with the smallest distance (the first one visited on a tie).
 *
 * Note that @node must be a valid node ID usable with node_distance(),
 * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
 * or unexpected behavior.
 *
 * Return: the node ID in @mask closest to @node, or MAX_NUMNODES if no node
 * is found.
 */
int nearest_node_nodemask(int node, nodemask_t *mask)
{
        int closest = MAX_NUMNODES;
        int closest_dist = INT_MAX;
        int candidate;

        for_each_node_mask(candidate, *mask) {
                int d = node_distance(node, candidate);

                if (d < closest_dist) {
                        closest = candidate;
                        closest_dist = d;
                }
        }

        return closest;
}
EXPORT_SYMBOL_GPL(nearest_node_nodemask);

/*
 * get_task_policy - memory policy to apply for task @p
 *
 * Returns the task's own policy when it has one. Otherwise returns the
 * per-node policy of the node numa_node_id() reports, provided that entry has
 * been initialised, and finally falls back to the system-wide default policy.
 * No reference is taken on the returned policy here.
 */
struct mempolicy *get_task_policy(struct task_struct *p)
{
        struct mempolicy *chosen = p->mempolicy;
        int nid;

        if (chosen)
                return chosen;

        nid = numa_node_id();
        if (nid == NUMA_NO_NODE)
                return &default_policy;

        /* preferred_node_policy is not initialised early in boot */
        chosen = &preferred_node_policy[nid];
        if (!chosen->mode)
                return &default_policy;

        return chosen;
}

/*
 * Per-mode policy callbacks; mpol_ops[] is indexed by policy mode.
 *
 *   create - store the nodemask handed over by mpol_set_nodemask() in a
 *            freshly allocated policy; returns 0 or a negative errno.
 *   rebind - adjust an existing policy to a new set of allowed nodes
 *            (called from mpol_rebind_policy()).
 *
 * This is only the forward declaration; the table itself is filled in
 * further down, once the callbacks have been defined.
 */
static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
} mpol_ops[MPOL_MAX];

/*
 * Non-zero when @pol has any of the MPOL_MODE_FLAGS bits set. Callers in this
 * file use the result to decide whether the policy keeps the user-supplied
 * nodemask (w.user_nodemask) or the cpuset's allowed nodes
 * (w.cpuset_mems_allowed).
 */
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
        const int mode_flags = pol->flags & MPOL_MODE_FLAGS;

        return mode_flags;
}

/*
 * Compute @ret as @orig interpreted relative to @rel: @orig is first folded
 * down to nodes_weight(@rel) bits, and the folded mask is then mapped onto
 * the nodes that are set in @rel.
 */
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t folded;

        nodes_fold(folded, *orig, nodes_weight(*rel));
        nodes_onto(*ret, folded, *rel);
}

/*
 * ->create() callback for modes that use the whole nodemask: copy @nodes into
 * the policy. Returns 0 on success, -EINVAL if @nodes is empty (the policy is
 * left untouched in that case).
 */
static int mpol_new_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        if (!nodes_empty(*nodes)) {
                pol->nodes = *nodes;
                return 0;
        }

        return -EINVAL;
}

/*
 * ->create() callback for MPOL_PREFERRED: the policy's nodemask is reduced to
 * the single first node of @nodes. Returns 0 on success, -EINVAL if @nodes is
 * empty (the policy is left untouched in that case).
 */
static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
        int rc = -EINVAL;

        if (!nodes_empty(*nodes)) {
                /* keep only the first requested node */
                nodes_clear(pol->nodes);
                node_set(first_node(*nodes), pol->nodes);
                rc = 0;
        }

        return rc;
}

/*
 * mpol_set_nodemask - set up the nodemask, if any, of a new policy
 *
 * Called after mpol_new(), which has already validated the nodes parameter
 * with respect to the policy mode and flags.
 *
 * @nsc provides scratch space: mask1 receives the currently allowed nodes
 * that have memory, mask2 the user's @nodes contextualised against mask1
 * (remapped relative to it for MPOL_F_RELATIVE_NODES, intersected with it
 * otherwise). mask2 is what the mode's ->create() callback gets to see.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_lock for write.
 *
 * Returns 0 for a NULL or MPOL_LOCAL policy, otherwise whatever ->create()
 * returns.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        /*
         * Neither the default policy (pol == NULL) nor a local policy is
         * subject to remapping, and neither needs a constructor.
         */
        if (!pol)
                return 0;
        if (pol->mode == MPOL_LOCAL)
                return 0;

        /* Restrict to allowed nodes that actually have memory (N_MEMORY) */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);

        if (!(pol->flags & MPOL_F_RELATIVE_NODES))
                nodes_and(nsc->mask2, *nodes, nsc->mask1);
        else
                mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);

        /* Remember either the cpuset's allowed nodes or the user's mask */
        if (!mpol_store_user_nodemask(pol))
                pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed;
        else
                pol->w.user_nodemask = *nodes;

        return mpol_ops[pol->mode].create(pol, &nsc->mask2);
}

/*
 * mpol_new - validate mode/flags/nodes and allocate a policy object
 *
 * This function just creates a new policy, does some checks and simple
 * initialization; the nodemask is not stored here. You must invoke
 * mpol_set_nodemask() to set nodes.
 *
 * Returns NULL for MPOL_DEFAULT with a NULL or empty nodemask, a new policy
 * holding one reference on success, or ERR_PTR(-EINVAL) / ERR_PTR(-ENOMEM)
 * on failure.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        struct mempolicy *new_pol;
        bool no_nodes;

        if (mode == MPOL_DEFAULT) {
                /* the default policy is represented by NULL and takes no nodes */
                if (!nodes || nodes_empty(*nodes))
                        return NULL;
                return ERR_PTR(-EINVAL);
        }
        VM_BUG_ON(!nodes);

        no_nodes = nodes_empty(*nodes);

        /*
         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
         * MPOL_LOCAL takes neither nodes nor those flags.
         * All other modes require a valid pointer to a non-empty nodemask.
         */
        switch (mode) {
        case MPOL_PREFERRED:
                if (!no_nodes)
                        break;
                if (flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES))
                        return ERR_PTR(-EINVAL);
                /* preferred with no nodes means local allocation */
                mode = MPOL_LOCAL;
                break;
        case MPOL_LOCAL:
                if (!no_nodes ||
                    (flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)))
                        return ERR_PTR(-EINVAL);
                break;
        default:
                if (no_nodes)
                        return ERR_PTR(-EINVAL);
                break;
        }

        new_pol = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!new_pol)
                return ERR_PTR(-ENOMEM);

        new_pol->mode = mode;
        new_pol->flags = flags;
        new_pol->home_node = NUMA_NO_NODE;
        atomic_set(&new_pol->refcnt, 1);

        return new_pol;
}

/*
 * Slow path of a mpol destructor: drop one reference on @pol and hand the
 * object back to policy_cache once the count reaches zero.
 */
void __mpol_put(struct mempolicy *pol)
{
        if (atomic_dec_and_test(&pol->refcnt))
                kmem_cache_free(policy_cache, pol);
}

/*
 * ->rebind() callback for modes that have nothing to rebind (used for
 * MPOL_DEFAULT and MPOL_LOCAL in mpol_ops[]): intentionally a no-op.
 */
static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
{
}

/*
 * ->rebind() callback for nodemask based modes: recompute pol->nodes for the
 * new set of allowed @nodes.
 *
 *  - MPOL_F_STATIC_NODES:   intersect the user's nodemask with @nodes
 *  - MPOL_F_RELATIVE_NODES: re-apply the user's nodemask relative to @nodes
 *  - otherwise:             remap the current nodes from the previously
 *                           recorded allowed set onto @nodes, and record
 *                           @nodes as the new allowed set
 *
 * Should the result come out empty, the policy falls back to all of @nodes.
 */
static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
{
        nodemask_t rebound;

        if (pol->flags & MPOL_F_STATIC_NODES) {
                nodes_and(rebound, pol->w.user_nodemask, *nodes);
        } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
                mpol_relative_nodemask(&rebound, &pol->w.user_nodemask, nodes);
        } else {
                nodes_remap(rebound, pol->nodes, pol->w.cpuset_mems_allowed,
                            *nodes);
                pol->w.cpuset_mems_allowed = *nodes;
        }

        if (nodes_empty(rebound))
                pol->nodes = *nodes;
        else
                pol->nodes = rebound;
}

/*
 * ->rebind() callback used for MPOL_PREFERRED and MPOL_PREFERRED_MANY in
 * mpol_ops[]: only records @nodes as the new allowed set in
 * pol->w.cpuset_mems_allowed; pol->nodes itself is left unchanged.
 */
static void mpol_rebind_preferred(struct mempolicy *pol,
                                                const nodemask_t *nodes)
{
        pol->w.cpuset_mems_allowed = *nodes;
}

/*
 * mpol_rebind_policy - Migrate a policy to a different set of nodes
 *
 * Nothing is done for a NULL or MPOL_LOCAL policy, nor for a policy that does
 * not store a user nodemask and whose recorded allowed nodes already equal
 * @newmask. Everything else is passed to the mode's ->rebind() callback.
 *
 * Per-vma policies are protected by mmap_lock. Allocations using per-task
 * policies are protected by task->mems_allowed_seq to prevent a premature
 * OOM/allocation failure due to parallel nodemask modification.
 */
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
        if (!pol)
                return;
        if (pol->mode == MPOL_LOCAL)
                return;

        /* rebind unless the allowed nodes are known to be unchanged */
        if (mpol_store_user_nodemask(pol) ||
            !nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
                mpol_ops[pol->mode].rebind(pol, newmask);
}

/*
 * Rebind @tsk's task mempolicy (tsk->mempolicy) to the nodes in @new.
 *
 * Thin wrapper around mpol_rebind_policy() that only needs the task
 * pointer; mpol_rebind_policy() itself ignores a NULL policy.
 *
 * Called with task's alloc_lock held.
 */
void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
{
        mpol_rebind_policy(tsk->mempolicy, new);
}

/*
 * Rebind the policy of every vma in @mm to the nodemask @new.
 *
 * Call holding a reference to mm.  mm->mmap_lock is taken for writing
 * for the duration of the walk, and each vma is write-locked with
 * vma_start_write() before its policy is touched.
 */
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
        VMA_ITERATOR(iter, mm, 0);
        struct vm_area_struct *cur;

        mmap_write_lock(mm);

        for_each_vma(iter, cur) {
                vma_start_write(cur);
                mpol_rebind_policy(cur->vm_policy, new);
        }

        mmap_write_unlock(mm);
}

/*
 * Per-mode operations, indexed by policy mode.  mpol_rebind_policy()
 * dispatches through the .rebind member.  MPOL_DEFAULT and MPOL_LOCAL
 * have no .create hook and share the no-op mpol_rebind_default().
 */
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_LOCAL] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_PREFERRED_MANY] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_WEIGHTED_INTERLEAVE] = {
                .create = mpol_new_nodemask,
                .rebind = mpol_rebind_nodemask,
        },
};

/*
 * Forward declarations.  migrate_folio_add() is defined further down (in
 * a CONFIG_MIGRATION and a !CONFIG_MIGRATION variant) but is already
 * needed by the folio-queueing page walkers below.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags);
/* Needed by alloc_migration_target_by_mpol() below. */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                pgoff_t ilx, int *nid);

/*
 * True when @flags asks for MPOL_MF_STRICT without either of the MOVE
 * flags.  In that combination do_mbind() is allowed to fail immediately
 * with -EIO as soon as any misplaced page is found.
 */
static bool strictly_unmovable(unsigned long flags)
{
        const unsigned long relevant = MPOL_MF_STRICT | MPOL_MF_MOVE |
                                       MPOL_MF_MOVE_ALL;

        return (flags & relevant) == MPOL_MF_STRICT;
}

/*
 * Private argument for alloc_migration_target_by_mpol(): the policy to
 * allocate with, and a base interleave index to which that function adds
 * the source folio's (index >> order).
 */
struct migration_mpol {                /* for alloc_migration_target_by_mpol() */
        struct mempolicy *pol;
        pgoff_t ilx;
};

/*
 * State shared between queue_pages_range() and its page-walk callbacks;
 * the callbacks reach it through walk->private.
 */
struct queue_pages {
        struct list_head *pagelist;        /* folios isolated for migration */
        unsigned long flags;                /* MPOL_MF_* flags for this walk */
        nodemask_t *nmask;                /* nodes tested by queue_folio_required() */
        unsigned long start;                /* start of the requested range */
        unsigned long end;                /* end of the requested range */
        struct vm_area_struct *first;        /* first vma visited, NULL until then */
        struct folio *large;                /* note last large folio encountered */
        long nr_failed;                        /* could not be isolated at this time */
};

/*
 * Decide whether @folio is of interest to the current walk.
 *
 * Normally that means the folio's node is set in qp->nmask.  With
 * MPOL_MF_INVERT in qp->flags the sense is flipped: the folio is of
 * interest when its node is NOT in qp->nmask.
 */
static inline bool queue_folio_required(struct folio *folio,
                                        struct queue_pages *qp)
{
        bool in_mask = node_isset(folio_nid(folio), *qp->nmask);
        bool invert = !!(qp->flags & MPOL_MF_INVERT);

        return in_mask != invert;
}

/*
 * Handle one huge pmd on behalf of queue_folios_pte_range(), which calls
 * this with the pmd lock held.
 *
 * A pmd migration entry is counted as a failure.  The huge zero folio is
 * skipped, setting walk->action to ACTION_CONTINUE.  A folio that
 * queue_folio_required() accepts is passed to migrate_folio_add() when
 * moving was requested and the vma is migratable; in every other case
 * qp->nr_failed is bumped.
 */
static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
{
        struct queue_pages *qp = walk->private;
        struct folio *folio;
        bool handled;

        if (unlikely(is_pmd_migration_entry(*pmd))) {
                qp->nr_failed++;
                return;
        }

        folio = pmd_folio(*pmd);
        if (is_huge_zero_folio(folio)) {
                walk->action = ACTION_CONTINUE;
                return;
        }

        if (!queue_folio_required(folio, qp))
                return;

        /* Evaluation order matters: only try to isolate when allowed to. */
        handled = (qp->flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
                  vma_migratable(walk->vma) &&
                  migrate_folio_add(folio, qp->pagelist, qp->flags);
        if (!handled)
                qp->nr_failed++;
}

/*
 * Scan through folios, checking if they satisfy the required conditions,
 * moving them from LRU to local pagelist for migration if they do (or not).
 *
 * Installed as the .pmd_entry callback of the queue_pages walk ops; the
 * walk state is the struct queue_pages found in walk->private.
 *
 * queue_folios_pte_range() has two possible return values:
 * 0 - continue walking to scan for more, even if an existing folio on the
 *     wrong node could not be isolated and queued for migration.
 * -EIO - only MPOL_MF_STRICT was specified, without MPOL_MF_MOVE or ..._ALL,
 *        and an existing folio was on a node that does not follow the policy.
 */
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
                        unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        struct folio *folio;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        pte_t *pte, *mapped_pte;
        pte_t ptent;
        spinlock_t *ptl;

        /*
         * If pmd_trans_huge_lock() hands back a lock, the pmd is handled
         * as a whole by queue_folios_pmd() and the pte loop is skipped.
         */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                queue_folios_pmd(pmd, walk);
                spin_unlock(ptl);
                goto out;
        }

        mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
        if (!pte) {
                /* Could not map the pte table: request ACTION_AGAIN. */
                walk->action = ACTION_AGAIN;
                return 0;
        }
        for (; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = ptep_get(pte);
                if (pte_none(ptent))
                        continue;
                if (!pte_present(ptent)) {
                        /* A migration entry is counted as a failure. */
                        if (is_migration_entry(pte_to_swp_entry(ptent)))
                                qp->nr_failed++;
                        continue;
                }
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;
                /*
                 * vm_normal_folio() filters out zero pages, but there might
                 * still be reserved folios to skip, perhaps in a VDSO.
                 */
                if (folio_test_reserved(folio))
                        continue;
                if (!queue_folio_required(folio, qp))
                        continue;
                if (folio_test_large(folio)) {
                        /*
                         * A large folio can only be isolated from LRU once,
                         * but may be mapped by many PTEs (and Copy-On-Write may
                         * intersperse PTEs of other, order 0, folios).  This is
                         * a common case, so don't mistake it for failure (but
                         * there can be other cases of multi-mapped pages which
                         * this quick check does not help to filter out - and a
                         * search of the pagelist might grow to be prohibitive).
                         *
                         * migrate_pages(&pagelist) returns nr_failed folios, so
                         * check "large" now so that queue_pages_range() returns
                         * a comparable nr_failed folios.  This does imply that
                         * if folio could not be isolated for some racy reason
                         * at its first PTE, later PTEs will not give it another
                         * chance of isolation; but keeps the accounting simple.
                         */
                        if (folio == qp->large)
                                continue;
                        qp->large = folio;
                }
                /*
                 * Count a failure when moving was not requested, the vma is
                 * not migratable, or migrate_folio_add() reports failure.
                 * The || chain short-circuits, so isolation is only
                 * attempted when the first two checks pass.
                 */
                if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
                    !vma_migratable(vma) ||
                    !migrate_folio_add(folio, qp->pagelist, flags)) {
                        qp->nr_failed++;
                        /* STRICT without MOVE: first failure ends the scan. */
                        if (strictly_unmovable(flags))
                                break;
                }
        }
        pte_unmap_unlock(mapped_pte, ptl);
        cond_resched();
out:
        /* Reached from both the huge-pmd path and the pte loop. */
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
        return 0;
}

/*
 * .hugetlb_entry callback of the queue_pages walk ops: examine one
 * hugetlb entry and, if its folio is of interest and may be moved, try
 * to isolate it onto qp->pagelist.
 *
 * Returns -EIO when a failure has been counted and only MPOL_MF_STRICT
 * (no MOVE flag) was requested, otherwise 0.  Without CONFIG_HUGETLB_PAGE
 * the whole body is compiled out and 0 is returned.
 */
static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
                               unsigned long addr, unsigned long end,
                               struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;
        struct folio *folio;
        spinlock_t *ptl;
        pte_t entry;

        ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
        entry = huge_ptep_get(walk->mm, addr, pte);
        if (!pte_present(entry)) {
                /* A hugetlb migration entry is counted as a failure. */
                if (unlikely(is_hugetlb_entry_migration(entry)))
                        qp->nr_failed++;
                goto unlock;
        }
        folio = pfn_folio(pte_pfn(entry));
        if (!queue_folio_required(folio, qp))
                goto unlock;
        /* Misplaced, but moving was not requested or is not possible. */
        if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
            !vma_migratable(walk->vma)) {
                qp->nr_failed++;
                goto unlock;
        }
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_maybe_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        /* Note: the inner if is the sole body of the outer one. */
        if ((flags & MPOL_MF_MOVE_ALL) ||
            (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
                if (!folio_isolate_hugetlb(folio, qp->pagelist))
                        qp->nr_failed++;
unlock:
        spin_unlock(ptl);
        if (qp->nr_failed && strictly_unmovable(flags))
                return -EIO;
#endif
        return 0;
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * Mark the range [addr, end) of @vma inaccessible so that a later NUMA
 * hinting fault clears the marking again.  Depending on these faults,
 * pages may be migrated for better NUMA placement.
 *
 * This is assuming that NUMA faults are handled using PROT_NONE. If
 * an architecture makes a different choice, it will need further
 * changes to the core.
 *
 * Returns whatever change_protection() returned; a positive value is
 * also accounted as NUMA_PTE_UPDATES in the vm and memcg event counters.
 */
unsigned long change_prot_numa(struct vm_area_struct *vma,
                        unsigned long addr, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;
        long updated;

        tlb_gather_mmu(&tlb, mm);
        updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);

        if (updated > 0) {
                count_vm_numa_events(NUMA_PTE_UPDATES, updated);
                count_memcg_events_mm(mm, NUMA_PTE_UPDATES, updated);
        }

        tlb_finish_mmu(&tlb);
        return updated;
}
#endif /* CONFIG_NUMA_BALANCING */

static int queue_pages_test_walk(unsigned long start, unsigned long end,
                                struct mm_walk *walk)
{
        struct vm_area_struct *next, *vma = walk->vma;
        struct queue_pages *qp = walk->private;
        unsigned long flags = qp->flags;

        /* range check first */
        VM_BUG_ON_VMA(!range_in_vma(vma, start, end), vma);

        if (!qp->first) {
                qp->first = vma;
                if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                        (qp->start < vma->vm_start))
                        /* hole at head side of range */
                        return -EFAULT;
        }
        next = find_vma(vma->vm_mm, vma->vm_end);
        if (!(flags & MPOL_MF_DISCONTIG_OK) &&
                ((vma->vm_end < qp->end) &&
                (!next || vma->vm_end < next->vm_start)))
                /* hole at middle or tail of range */
                return -EFAULT;

        /*
         * Need check MPOL_MF_STRICT to return -EIO if possible
         * regardless of vma_migratable
         */
        if (!vma_migratable(vma) &&
            !(flags & MPOL_MF_STRICT))
                return 1;

        /*
         * Check page nodes, and queue pages to move, in the current vma.
         * But if no moving, and no strict checking, the scan can be skipped.
         */
        if (flags & (MPOL_MF_STRICT | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                return 0;
        return 1;
}

/*
 * Walk callbacks used by queue_pages_range() when MPOL_MF_WRLOCK is not
 * set; .walk_lock is PGWALK_RDLOCK.
 */
static const struct mm_walk_ops queue_pages_walk_ops = {
        .hugetlb_entry                = queue_folios_hugetlb,
        .pmd_entry                = queue_folios_pte_range,
        .test_walk                = queue_pages_test_walk,
        .walk_lock                = PGWALK_RDLOCK,
};

/*
 * Same callbacks as queue_pages_walk_ops, but with .walk_lock set to
 * PGWALK_WRLOCK.  queue_pages_range() selects this variant when
 * MPOL_MF_WRLOCK is set in its flags.
 */
static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
        .hugetlb_entry                = queue_folios_hugetlb,
        .pmd_entry                = queue_folios_pte_range,
        .test_walk                = queue_pages_test_walk,
        .walk_lock                = PGWALK_WRLOCK,
};

/*
 * Walk through page tables and collect pages to be migrated.
 *
 * If pages found in a given range are not on the required set of @nodes,
 * and migration is allowed, they are isolated and queued to @pagelist.
 * The walk uses the write-locking ops when MPOL_MF_WRLOCK is set in
 * @flags and the read-locking ops otherwise.
 *
 * queue_pages_range() may return:
 * 0 - all pages already on the right node, or successfully queued for moving
 *     (or neither strict checking nor moving requested: only range checking).
 * >0 - this number of misplaced folios could not be queued for moving
 *      (a hugetlbfs page or a transparent huge page being counted as 1).
 * -EIO - a misplaced page found, when MPOL_MF_STRICT specified without MOVEs.
 * -EFAULT - a hole in the memory range, when MPOL_MF_DISCONTIG_OK unspecified.
 */
static long
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                nodemask_t *nodes, unsigned long flags,
                struct list_head *pagelist)
{
        const struct mm_walk_ops *ops;
        struct queue_pages qp = {
                .pagelist = pagelist,
                .flags = flags,
                .nmask = nodes,
                .start = start,
                .end = end,
                .first = NULL,
        };
        int err;

        if (flags & MPOL_MF_WRLOCK)
                ops = &queue_pages_lock_vma_walk_ops;
        else
                ops = &queue_pages_walk_ops;

        err = walk_page_range(mm, start, end, ops, &qp);

        /* No vma was ever visited: whole range in hole. */
        if (!qp.first)
                return -EFAULT;

        if (err)
                return err;
        return qp.nr_failed;
}

/*
 * Apply policy to a single VMA
 *
 * A private copy of @pol (from mpol_dup()) is installed as the vma's
 * policy.  If the vma provides a ->set_policy operation, it is offered
 * the copy first and may veto the change by returning an error.  On
 * success the previous policy's reference is dropped.
 *
 * This must be called with the mmap_lock held for writing.
 *
 * Returns 0 on success, the mpol_dup() error, or the ->set_policy error.
 */
static int vma_replace_policy(struct vm_area_struct *vma,
                                struct mempolicy *pol)
{
        struct mempolicy *copy;
        struct mempolicy *prev;

        vma_assert_write_locked(vma);

        copy = mpol_dup(pol);
        if (IS_ERR(copy))
                return PTR_ERR(copy);

        if (vma->vm_ops && vma->vm_ops->set_policy) {
                int err = vma->vm_ops->set_policy(vma, copy);

                if (err) {
                        /* Rejected: discard the copy we just made. */
                        mpol_put(copy);
                        return err;
                }
        }

        prev = vma->vm_policy;
        vma->vm_policy = copy; /* protected by mmap_lock */
        mpol_put(prev);

        return 0;
}

/*
 * Split or merge the VMA (if required) and apply the new policy.
 *
 * The policy is applied to the part of @vma that lies in [@start, @end).
 * *@prev is updated along the way: to @vma when the range begins inside
 * it or when its policy already equals @new_pol, and otherwise to the vma
 * returned by vma_modify_policy().
 *
 * Returns 0 on success or when nothing had to change, otherwise the
 * error from vma_modify_policy() or vma_replace_policy().
 */
static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct vm_area_struct **prev, unsigned long start,
                unsigned long end, struct mempolicy *new_pol)
{
        unsigned long vmstart = vma->vm_start;
        unsigned long vmend = min(end, vma->vm_end);
        struct vm_area_struct *modified;

        if (start > vmstart) {
                *prev = vma;
                vmstart = start;
        }

        if (mpol_equal(vma->vm_policy, new_pol)) {
                *prev = vma;
                return 0;
        }

        modified = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol);
        if (IS_ERR(modified))
                return PTR_ERR(modified);

        *prev = modified;
        return vma_replace_policy(modified, new_pol);
}

/*
 * Set the process memory policy.
 *
 * Builds a new policy from @mode, @flags and @nodes, then, under
 * task_lock(current), finalises its nodemask and installs it as
 * current->mempolicy.  For the two interleave modes the task's
 * interleave cursor (il_prev, il_weight) is reset as well.  The old
 * policy's reference is dropped after the task lock is released.
 *
 * Returns 0 on success, -ENOMEM if the scratch nodemask cannot be
 * obtained, or the error from mpol_new() / mpol_set_nodemask().
 */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        NODEMASK_SCRATCH(scratch);
        struct mempolicy *new;
        struct mempolicy *old;
        int ret;

        if (!scratch)
                return -ENOMEM;

        new = mpol_new(mode, flags, nodes);
        if (IS_ERR(new)) {
                ret = PTR_ERR(new);
                goto free_scratch;
        }

        task_lock(current);
        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                task_unlock(current);
                mpol_put(new);
                goto free_scratch;
        }

        old = current->mempolicy;
        current->mempolicy = new;
        if (new) {
                switch (new->mode) {
                case MPOL_INTERLEAVE:
                case MPOL_WEIGHTED_INTERLEAVE:
                        current->il_prev = MAX_NUMNODES-1;
                        current->il_weight = 0;
                        break;
                default:
                        break;
                }
        }
        task_unlock(current);
        mpol_put(old);
        /* ret is 0 here: mpol_set_nodemask() succeeded. */
free_scratch:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

/*
 * Return nodemask for policy for get_mempolicy() query
 *
 * *@nodes is always cleared first.  It stays empty for the default
 * policy and for MPOL_LOCAL; for the nodemask based modes it receives a
 * copy of pol->nodes.  Any other mode is a BUG().
 *
 * Called with task's alloc_lock held
 */
static void get_policy_nodemask(struct mempolicy *pol, nodemask_t *nodes)
{
        nodes_clear(*nodes);

        if (pol == &default_policy)
                return;

        /* return empty node mask for local allocation */
        if (pol->mode == MPOL_LOCAL)
                return;

        if (pol->mode == MPOL_BIND ||
            pol->mode == MPOL_INTERLEAVE ||
            pol->mode == MPOL_PREFERRED ||
            pol->mode == MPOL_PREFERRED_MANY ||
            pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
                *nodes = pol->nodes;
                return;
        }

        BUG();
}

/*
 * Look up the node of the page backing user address @addr.
 *
 * One page is pinned with get_user_pages_fast() at the page-aligned
 * address, its node id is read, and the page is released again.  @mm is
 * not used by this function.
 *
 * Returns the node id when a page was pinned; otherwise the (zero or
 * negative) return value of get_user_pages_fast() is passed through.
 */
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
        struct page *page = NULL;
        int pinned;
        int nid;

        pinned = get_user_pages_fast(addr & PAGE_MASK, 1, 0, &page);
        if (pinned <= 0)
                return pinned;

        nid = page_to_nid(page);
        put_page(page);
        return nid;
}

/*
 * Retrieve NUMA policy
 *
 * @policy: receives the mode (ORed with the user-visible mode flags), or
 *          a node id when MPOL_F_NODE is set.
 * @nmask:  if non-NULL, receives the policy's nodemask.  On the
 *          MPOL_F_MEMS_ALLOWED path it is dereferenced unconditionally.
 * @addr:   address to query; must be 0 unless MPOL_F_ADDR is set.
 * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED.
 *
 * Behaviour by flag:
 *  - MPOL_F_MEMS_ALLOWED (exclusive with the other two): report
 *    cpuset_current_mems_allowed in *@nmask and 0 in *@policy.
 *  - MPOL_F_ADDR: use the policy found for the vma at @addr rather than
 *    the task policy.
 *  - MPOL_F_NODE with MPOL_F_ADDR: *@policy is the node id that
 *    lookup_node() reports for @addr.
 *  - MPOL_F_NODE alone: only valid when the task policy is one of the
 *    interleave modes; *@policy is then an interleave node.
 *
 * Returns 0 on success, -EINVAL for invalid flag/argument combinations,
 * -EFAULT when no vma covers @addr, or a negative lookup_node() result.
 */
static long do_get_mempolicy(int *policy, nodemask_t *nmask,
                             unsigned long addr, unsigned long flags)
{
        int err;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;

        if (flags &
                ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                return -EINVAL;

        if (flags & MPOL_F_MEMS_ALLOWED) {
                if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
                        return -EINVAL;
                *policy = 0;        /* just so it's initialized */
                task_lock(current);
                *nmask  = cpuset_current_mems_allowed;
                task_unlock(current);
                return 0;
        }

        if (flags & MPOL_F_ADDR) {
                pgoff_t ilx;                /* ignored here */
                /*
                 * Do NOT fall back to task policy if the
                 * vma/shared policy at addr is NULL.  We
                 * want to return MPOL_DEFAULT in this case.
                 */
                mmap_read_lock(mm);
                vma = vma_lookup(mm, addr);
                if (!vma) {
                        mmap_read_unlock(mm);
                        return -EFAULT;
                }
                pol = __get_vma_policy(vma, addr, &ilx);
        } else if (addr)
                return -EINVAL;

        if (!pol)
                pol = &default_policy;        /* indicates default behavior */

        if (flags & MPOL_F_NODE) {
                if (flags & MPOL_F_ADDR) {
                        /*
                         * Take a refcount on the mpol, because we are about to
                         * drop the mmap_lock, after which only "pol" remains
                         * valid, "vma" is stale.
                         */
                        pol_refcount = pol;
                        vma = NULL;
                        mpol_get(pol);
                        mmap_read_unlock(mm);
                        err = lookup_node(mm, addr);
                        if (err < 0)
                                goto out;
                        *policy = err;
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_INTERLEAVE) {
                        *policy = next_node_in(current->il_prev, pol->nodes);
                } else if (pol == current->mempolicy &&
                                pol->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        /*
                         * A non-zero il_weight reports il_prev itself;
                         * otherwise the node following il_prev is reported.
                         */
                        if (current->il_weight)
                                *policy = current->il_prev;
                        else
                                *policy = next_node_in(current->il_prev,
                                                       pol->nodes);
                } else {
                        err = -EINVAL;
                        goto out;
                }
        } else {
                *policy = pol == &default_policy ? MPOL_DEFAULT :
                                                pol->mode;
                /*
                 * Internal mempolicy flags must be masked off before exposing
                 * the policy to userspace.
                 */
                *policy |= (pol->flags & MPOL_MODE_FLAGS);
        }

        err = 0;
        if (nmask) {
                if (mpol_store_user_nodemask(pol)) {
                        *nmask = pol->w.user_nodemask;
                } else {
                        task_lock(current);
                        get_policy_nodemask(pol, nmask);
                        task_unlock(current);
                }
        }

 out:
        mpol_cond_put(pol);
        /* vma is still set only if the mmap_lock has not been dropped yet. */
        if (vma)
                mmap_read_unlock(mm);
        /* Drop the extra reference taken before the lock was released. */
        if (pol_refcount)
                mpol_put(pol_refcount);
        return err;
}

#ifdef CONFIG_MIGRATION
/*
 * Try to isolate @folio from the LRU and queue it on @foliolist.
 *
 * Returns false only when isolation was attempted and failed.  Returns
 * true when the folio was queued, and also when it was deliberately left
 * alone because it may be mapped shared and MPOL_MF_MOVE_ALL is not set.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        /*
         * Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
         * Choosing not to migrate a shared folio is not counted as a failure.
         *
         * See folio_maybe_mapped_shared() on possible imprecision when we
         * cannot easily detect if a folio is shared.
         */
        if (!(flags & MPOL_MF_MOVE_ALL) && folio_maybe_mapped_shared(folio))
                return true;

        /*
         * Non-movable folio may reach here.  And, there may be
         * temporary off LRU folios or non-LRU movable folios.
         * Treat them as unmovable folios since they can't be
         * isolated, so they can't be moved at the moment.
         */
        if (!folio_isolate_lru(folio))
                return false;

        list_add_tail(&folio->lru, foliolist);
        node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
                            folio_nr_pages(folio));
        return true;
}

/*
 * Migrate pages from one node to a target node.
 * Returns error or the number of pages not migrated.
 */
static long migrate_to_node(struct mm_struct *mm, int source, int dest,
                            int flags)
{
        nodemask_t nmask;
        struct vm_area_struct *vma;
        LIST_HEAD(pagelist);
        long nr_failed;
        long err = 0;
        struct migration_target_control mtc = {
                .nid = dest,
                .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
                .reason = MR_SYSCALL,
        };

        nodes_clear(nmask);
        node_set(source, nmask);

        VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));

        mmap_read_lock(mm);
        vma = find_vma(mm, 0);
        if (unlikely(!vma)) {
                mmap_read_unlock(mm);
                return 0;
        }

        /*
         * This does not migrate the range, but isolates all pages that
         * need migration.  Between passing in the full user address
         * space range and MPOL_MF_DISCONTIG_OK, this call cannot fail,
         * but passes back the count of pages which could not be isolated.
         */
        nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
                                      flags | MPOL_MF_DISCONTIG_OK, &pagelist);
        mmap_read_unlock(mm);

        if (!list_empty(&pagelist)) {
                err = migrate_pages(&pagelist, alloc_migration_target, NULL,
                        (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
                if (err)
                        putback_movable_pages(&pagelist);
        }

        if (err >= 0)
                err += nr_failed;
        return err;
}

/*
 * Move pages between the two nodesets so as to preserve the physical
 * layout as much as possible.
 *
 * The LRU cache is disabled for the duration of the operation.
 *
 * Returns the number of pages that could not be moved (capped at
 * INT_MAX), or the negative error from migrate_to_node().
 */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        long nr_failed = 0;
        long err = 0;
        nodemask_t tmp;

        lru_cache_disable();

        /*
         * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
         * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
         * bit in 'tmp', and return that <source, dest> pair for migration.
         * The pair of nodemasks 'to' and 'from' define the map.
         *
         * If no pair of bits is found that way, fallback to picking some
         * pair of 'source' and 'dest' bits that are not the same.  If the
         * 'source' and 'dest' bits are the same, this represents a node
         * that will be migrating to itself, so no pages need move.
         *
         * If no bits are left in 'tmp', or if all remaining bits left
         * in 'tmp' correspond to the same bit in 'to', return false
         * (nothing left to migrate).
         *
         * This lets us pick a pair of nodes to migrate between, such that
         * if possible the dest node is not already occupied by some other
         * source node, minimizing the risk of overloading the memory on a
         * node that would happen if we migrated incoming memory to a node
         * before migrating outgoing memory from that same node.
         *
         * A single scan of tmp is sufficient.  As we go, we remember the
         * most recent <s, d> pair that moved (s != d).  If we find a pair
         * that not only moved, but what's better, moved to an empty slot
         * (d is not set in tmp), then we break out then, with that pair.
         * Otherwise when we finish scanning tmp, we at least have the
         * most recent <s, d> pair that moved.  If we get all the way through
         * the scan of tmp without finding any node that moved, much less
         * moved to an empty node, then there is nothing left worth migrating.
         */

        tmp = *from;
        while (!nodes_empty(tmp)) {
                int s, d;
                int source = NUMA_NO_NODE;
                int dest = 0;

                for_each_node_mask(s, tmp) {

                        /*
                         * do_migrate_pages() tries to maintain the relative
                         * node relationship of the pages established between
                         * threads and memory areas.
                         *
                         * However if the number of source nodes is not equal to
                         * the number of destination nodes we can not preserve
                         * this node relative relationship.  In that case, skip
                         * copying memory from a node that is in the destination
                         * mask.
                         *
                         * Example: [2,3,4] -> [3,4,5] moves everything.
                         *          [0-7] -> [3,4,5] moves only 0,1,2,6,7.
                         */

                        if ((nodes_weight(*from) != nodes_weight(*to)) &&
                                                (node_isset(s, *to)))
                                continue;

                        d = node_remap(s, *from, *to);
                        if (s == d)
                                continue;

                        source = s;        /* Node moved. Memorize */
                        dest = d;

                        /* dest not in remaining from nodes? */
                        if (!node_isset(dest, tmp))
                                break;
                }
                /* No node in tmp maps to a different node: we are done. */
                if (source == NUMA_NO_NODE)
                        break;

                node_clear(source, tmp);
                err = migrate_to_node(mm, source, dest, flags);
                /* Positive: folios left behind; negative: hard error. */
                if (err > 0)
                        nr_failed += err;
                if (err < 0)
                        break;
        }

        lru_cache_enable();
        if (err < 0)
                return err;
        /* nr_failed is a long; clamp it to fit the int return type. */
        return (nr_failed < INT_MAX) ? nr_failed : INT_MAX;
}

/*
 * Allocate a new folio for page migration, according to NUMA mempolicy.
 *
 * @private carries a struct migration_mpol.  The interleave index used
 * for the allocation is its ilx plus the source folio's index shifted
 * right by the folio order.  Hugetlb folios are allocated through
 * alloc_hugetlb_folio_nodemask(), everything else through
 * folio_alloc_mpol() with the same order as @src.
 */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        struct migration_mpol *mmpol = (struct migration_mpol *)private;
        struct mempolicy *pol = mmpol->pol;
        unsigned int order = folio_order(src);
        pgoff_t ilx = mmpol->ilx + (src->index >> order);
        int nid = numa_node_id();
        gfp_t gfp;

        if (folio_test_hugetlb(src)) {
                struct hstate *h = folio_hstate(src);
                nodemask_t *nodemask;

                gfp = htlb_alloc_mask(h);
                nodemask = policy_nodemask(gfp, pol, ilx, &nid);
                return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp,
                                htlb_allow_alloc_fallback(MR_MEMPOLICY_MBIND));
        }

        gfp = folio_test_large(src) ? GFP_TRANSHUGE :
                (GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP);

        return folio_alloc_mpol(gfp, order, pol, ilx, nid);
}
#else

/*
 * !CONFIG_MIGRATION stub: nothing can be isolated for migration, so
 * always report failure.
 */
static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
                                unsigned long flags)
{
        return false;
}

/* !CONFIG_MIGRATION stub: page migration is not available. */
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
                     const nodemask_t *to, int flags)
{
        return -ENOSYS;
}

/* !CONFIG_MIGRATION stub: never provides a migration target folio. */
static struct folio *alloc_migration_target_by_mpol(struct folio *src,
                                                    unsigned long private)
{
        return NULL;
}
#endif

/*
 * Apply the memory policy (@mode, @mode_flags, @nmask) to the address range
 * [@start, @start + PAGE_ALIGN(@len)) of the current task's mm and, when
 * folios were queued by queue_pages_range(), try to migrate them with
 * migrate_pages() using alloc_migration_target_by_mpol() as allocator.
 *
 * Returns 0 on success (an empty range is a successful no-op), otherwise a
 * negative errno:
 *   -EINVAL  unknown bits in @flags, @start not page aligned, or the range
 *            wraps around
 *   -EPERM   MPOL_MF_MOVE_ALL requested without CAP_SYS_NICE
 *   -ENOMEM  no nodemask scratch space could be obtained
 *   -EIO     MPOL_MF_STRICT is set and nr_failed ended up non-zero
 *   or whatever mpol_new(), mpol_set_nodemask(), queue_pages_range() or
 *   mbind_range() reported.
 */
static long do_mbind(unsigned long start, unsigned long len,
                     unsigned short mode, unsigned short mode_flags,
                     nodemask_t *nmask, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct vma_iterator vmi;
        struct migration_mpol mmpol;
        struct mempolicy *new;
        unsigned long end;
        long err;
        long nr_failed;
        LIST_HEAD(pagelist);

        if (flags & ~(unsigned long)MPOL_MF_VALID)
                return -EINVAL;
        if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
                return -EPERM;

        if (start & ~PAGE_MASK)
                return -EINVAL;

        /* MPOL_MF_STRICT is ignored for MPOL_DEFAULT */
        if (mode == MPOL_DEFAULT)
                flags &= ~MPOL_MF_STRICT;

        len = PAGE_ALIGN(len);
        end = start + len;

        /* end < start means start + len wrapped around */
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;

        new = mpol_new(mode, mode_flags, nmask);
        if (IS_ERR(new))
                return PTR_ERR(new);

        /*
         * If we are using the default policy then operation
         * on discontinuous address spaces is okay after all
         */
        if (!new)
                flags |= MPOL_MF_DISCONTIG_OK;

        /* Paired with the lru_cache_enable() under the same test at mpol_out */
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_disable();
        {
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
                        mmap_write_lock(mm);
                        err = mpol_set_nodemask(new, nmask, scratch);
                        /*
                         * Only the failure path unlocks here; on success the
                         * mmap write lock stays held until the
                         * mmap_write_unlock() further down.
                         */
                        if (err)
                                mmap_write_unlock(mm);
                } else
                        err = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
        }
        if (err)
                goto mpol_out;

        /*
         * Lock the VMAs before scanning for pages to migrate,
         * to ensure we don't miss a concurrently inserted page.
         */
        nr_failed = queue_pages_range(mm, start, end, nmask,
                        flags | MPOL_MF_INVERT | MPOL_MF_WRLOCK, &pagelist);

        if (nr_failed < 0) {
                /* A negative return is an errno, not a failure count */
                err = nr_failed;
                nr_failed = 0;
        } else {
                vma_iter_init(&vmi, mm, start);
                prev = vma_prev(&vmi);
                for_each_vma_range(vmi, vma, end) {
                        err = mbind_range(&vmi, vma, &prev, start, end, new);
                        if (err)
                                break;
                }
        }

        if (!err && !list_empty(&pagelist)) {
                /* Convert MPOL_DEFAULT's NULL to task or default policy */
                if (!new) {
                        new = get_task_policy(current);
                        mpol_get(new);
                }
                mmpol.pol = new;
                mmpol.ilx = 0;

                /*
                 * In the interleaved case, attempt to allocate on exactly the
                 * targeted nodes, for the first VMA to be migrated; for later
                 * VMAs, the nodes will still be interleaved from the targeted
                 * nodemask, but one by one may be selected differently.
                 */
                if (new->mode == MPOL_INTERLEAVE ||
                    new->mode == MPOL_WEIGHTED_INTERLEAVE) {
                        struct folio *folio;
                        unsigned int order;
                        unsigned long addr = -EFAULT;

                        /* Find the first queued folio that is not a KSM folio */
                        list_for_each_entry(folio, &pagelist, lru) {
                                if (!folio_test_ksm(folio))
                                        break;
                        }
                        /* Skip the lookup if every queued folio was KSM */
                        if (!list_entry_is_head(folio, &pagelist, lru)) {
                                vma_iter_init(&vmi, mm, start);
                                for_each_vma_range(vmi, vma, end) {
                                        addr = page_address_in_vma(folio,
                                                folio_page(folio, 0), vma);
                                        if (addr != -EFAULT)
                                                break;
                                }
                        }
                        if (addr != -EFAULT) {
                                order = folio_order(folio);
                                /* We already know the pol, but not the ilx */
                                mpol_cond_put(get_vma_policy(vma, addr, order,
                                                             &mmpol.ilx));
                                /* Set base from which to increment by index */
                                mmpol.ilx -= folio->index >> order;
                        }
                }
        }

        mmap_write_unlock(mm);

        if (!err && !list_empty(&pagelist)) {
                /*
                 * OR-ed rather than added: below nr_failed is only tested for
                 * being non-zero.
                 */
                nr_failed |= migrate_pages(&pagelist,
                                alloc_migration_target_by_mpol, NULL,
                                (unsigned long)&mmpol, MIGRATE_SYNC,
                                MR_MEMPOLICY_MBIND, NULL);
        }

        if (nr_failed && (flags & MPOL_MF_STRICT))
                err = -EIO;
        /* Hand back whatever is still on the list */
        if (!list_empty(&pagelist))
                putback_movable_pages(&pagelist);
mpol_out:
        mpol_put(new);
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
                lru_cache_enable();
        return err;
}

/*
 * Fetch a variable sized node bitmap from user space.
 *
 * @mask:    kernel destination, must hold BITS_TO_LONGS(@maxnode) longs
 * @nmask:   user space source bitmap
 * @maxnode: number of valid bits in the bitmap
 *
 * Uses compat_get_bitmap() for compat system calls and copy_from_user()
 * otherwise, then clears the bits of the last word that lie at or above
 * @maxnode.  Returns 0 on success or -EFAULT if the copy failed.
 */
static int get_bitmap(unsigned long *mask, const unsigned long __user *nmask,
                      unsigned long maxnode)
{
        const unsigned long nlongs = BITS_TO_LONGS(maxnode);
        const unsigned long tail_bits = maxnode % BITS_PER_LONG;
        int failed;

        if (!in_compat_syscall())
                failed = copy_from_user(mask, nmask,
                                        nlongs * sizeof(unsigned long));
        else
                failed = compat_get_bitmap(mask,
                                           (const compat_ulong_t __user *)nmask,
                                           maxnode);
        if (failed)
                return -EFAULT;

        /* A partially used last word: drop everything past maxnode */
        if (tail_bits)
                mask[nlongs - 1] &= (1UL << tail_bits) - 1;

        return 0;
}

/*
 * Copy a node mask from user space into @nodes.
 *
 * @maxnode is decremented before use.  A resulting value of zero, or a NULL
 * @nmask, leaves @nodes empty and succeeds.  Returns -EINVAL if the
 * decremented @maxnode exceeds PAGE_SIZE * BITS_PER_BYTE or if any bit at or
 * above MAX_NUMNODES is set, -EFAULT if user memory cannot be read.
 */
static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
                     unsigned long maxnode)
{
        --maxnode;
        nodes_clear(*nodes);
        if (!nmask || maxnode == 0)
                return 0;
        if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
                return -EINVAL;

        /*
         * The caller may describe more nodes than are supported.  Walk the
         * excess backwards, one word per iteration, and insist that all of
         * it is zero.
         */
        while (maxnode > MAX_NUMNODES) {
                unsigned long chunk = min_t(unsigned long, maxnode, BITS_PER_LONG);
                unsigned long word;

                if (get_bitmap(&word, &nmask[(maxnode - 1) / BITS_PER_LONG], chunk))
                        return -EFAULT;

                if (maxnode - chunk < MAX_NUMNODES) {
                        /* Word straddles the limit: ignore the supported bits */
                        maxnode = MAX_NUMNODES;
                        word &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1);
                } else {
                        maxnode -= chunk;
                }
                if (word)
                        return -EINVAL;
        }

        return get_bitmap(nodes_addr(*nodes), nmask, maxnode);
}

/*
 * Copy a kernel node mask to user space.
 *
 * The user buffer is taken to be ALIGN(@maxnode - 1, 64) / 8 bytes long.  If
 * that is more than the bytes needed for nr_node_ids bits, the surplus is
 * zeroed with clear_user() and only the leading part is filled from @nodes.
 *
 * Returns 0 on success, -EINVAL if the user buffer would exceed PAGE_SIZE,
 * -EFAULT if user memory cannot be written, or the result of
 * compat_put_bitmap() for compat system calls.
 */
static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
                              nodemask_t *nodes)
{
        const bool compat = in_compat_syscall();
        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
        unsigned int nbytes;

        if (compat)
                nbytes = BITS_TO_COMPAT_LONGS(nr_node_ids) * sizeof(compat_long_t);
        else
                nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);

        if (copy > nbytes) {
                if (copy > PAGE_SIZE)
                        return -EINVAL;
                /* Zero the tail the kernel has no node bits for */
                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
                        return -EFAULT;
                copy = nbytes;
                maxnode = nr_node_ids;
        }

        if (!compat) {
                if (copy_to_user(mask, nodes_addr(*nodes), copy))
                        return -EFAULT;
                return 0;
        }

        return compat_put_bitmap((compat_ulong_t __user *)mask,
                                 nodes_addr(*nodes), maxnode);
}

/*
 * Basic parameter sanity check shared by mbind() and set_mempolicy().
 *
 * Splits the combined user value in *@mode: the MPOL_MODE_FLAGS bits are
 * moved to *@flags and stripped from *@mode.  Both outputs are written even
 * when the check then fails.
 *
 * Returns -EINVAL when the mode is not below MPOL_MAX, when both
 * MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES are set, or when
 * MPOL_F_NUMA_BALANCING is combined with a mode other than MPOL_BIND or
 * MPOL_PREFERRED_MANY; 0 otherwise.
 */
static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
{
        *flags = *mode & MPOL_MODE_FLAGS;
        *mode &= ~MPOL_MODE_FLAGS;

        if ((unsigned int)(*mode) >= MPOL_MAX)
                return -EINVAL;
        if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
                return -EINVAL;

        if (!(*flags & MPOL_F_NUMA_BALANCING))
                return 0;

        /* NUMA balancing is only accepted for these two modes */
        if (*mode != MPOL_BIND && *mode != MPOL_PREFERRED_MANY)
                return -EINVAL;

        *flags |= (MPOL_F_MOF | MPOL_F_MORON);
        return 0;
}

/*
 * Common implementation behind the mbind system call: untag @start, split
 * and validate @mode, fetch the user node mask, then hand over to do_mbind().
 * Returns the first error encountered, else do_mbind()'s result.
 */
static long kernel_mbind(unsigned long start, unsigned long len,
                         unsigned long mode, const unsigned long __user *nmask,
                         unsigned long maxnode, unsigned int flags)
{
        unsigned short mode_flags;
        nodemask_t nodes;
        int lmode = mode;
        int err;

        start = untagged_addr(start);

        err = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!err)
                err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}

/*
 * set_mempolicy_home_node system call.
 *
 * For every VMA intersecting [@start, @start + PAGE_ALIGN(@len)) that already
 * carries a VMA policy, install a duplicate of that policy whose home_node is
 * set to @home_node.  VMAs without a policy are skipped.
 *
 * Returns 0 on success (an empty range is a successful no-op), or:
 *   -EINVAL      @start is not page aligned, @flags is non-zero, @home_node
 *                is out of range or not online, or the range wraps around
 *   -ENOENT      no VMA in the range carried a VMA policy
 *   -EOPNOTSUPP  a VMA policy is neither MPOL_BIND nor MPOL_PREFERRED_MANY
 *   or an error from mpol_dup() / mbind_range().
 * VMAs updated before an error is hit keep their new home node.
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
                unsigned long, home_node, unsigned long, flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma, *prev;
        struct mempolicy *new, *old;
        struct vma_iterator vmi;
        unsigned long end;
        int err = -ENOENT;

        start = untagged_addr(start);
        if (start & ~PAGE_MASK)
                return -EINVAL;
        /*
         * flags is used for future extension if any.
         */
        if (flags != 0)
                return -EINVAL;

        /*
         * Check home_node is online to avoid accessing uninitialized
         * NODE_DATA.
         */
        if (home_node >= MAX_NUMNODES || !node_online(home_node))
                return -EINVAL;

        len = PAGE_ALIGN(len);
        end = start + len;

        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        mmap_write_lock(mm);
        /*
         * Position the iterator only now, from the untagged start: seeding it
         * with the raw user value would begin the walk at the tagged address
         * while end is computed from the untagged one.
         */
        vma_iter_init(&vmi, mm, start);
        prev = vma_prev(&vmi);
        for_each_vma_range(vmi, vma, end) {
                /*
                 * If any vma in the range got policy other than MPOL_BIND
                 * or MPOL_PREFERRED_MANY we return error. We don't reset
                 * the home node for vmas we already updated before.
                 */
                old = vma_policy(vma);
                if (!old) {
                        prev = vma;
                        continue;
                }
                if (old->mode != MPOL_BIND && old->mode != MPOL_PREFERRED_MANY) {
                        err = -EOPNOTSUPP;
                        break;
                }
                new = mpol_dup(old);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        break;
                }

                vma_start_write(vma);
                new->home_node = home_node;
                err = mbind_range(&vmi, vma, &prev, start, end, new);
                /* Drop the reference obtained from mpol_dup() */
                mpol_put(new);
                if (err)
                        break;
        }
        mmap_write_unlock(mm);
        return err;
}

/*
 * mbind system call entry point: passes all six arguments unchanged to
 * kernel_mbind() and returns its result.
 */
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
                unsigned long, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode, unsigned int, flags)
{
        return kernel_mbind(start, len, mode, nmask, maxnode, flags);
}

/*
 * Set the process memory policy: split and validate @mode, fetch the user
 * node mask, then hand over to do_set_mempolicy().  Returns the first error
 * encountered, else do_set_mempolicy()'s result.
 */
static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask,
                                 unsigned long maxnode)
{
        unsigned short mode_flags;
        nodemask_t nodes;
        int lmode = mode;
        int err;

        err = sanitize_mpol_flags(&lmode, &mode_flags);
        if (!err)
                err = get_nodes(&nodes, nmask, maxnode);
        if (err)
                return err;

        return do_set_mempolicy(lmode, mode_flags, &nodes);
}

/*
 * set_mempolicy system call entry point: passes its arguments unchanged to
 * kernel_set_mempolicy() and returns its result.
 */
SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask,
                unsigned long, maxnode)
{
        return kernel_set_mempolicy(mode, nmask, maxnode);
}

/*
 * Common implementation behind the migrate_pages system call.
 *
 * @pid:       target task, 0 meaning the calling task
 * @maxnode:   size argument passed to get_nodes() for both masks
 * @old_nodes: user node mask passed to do_migrate_pages() as source set
 * @new_nodes: user node mask passed to do_migrate_pages() as destination set,
 *             after being restricted to the caller's cpuset_mems_allowed()
 *
 * Returns the result of do_migrate_pages(), or a negative errno:
 *   -ENOMEM  no nodemask scratch space could be obtained
 *   -ESRCH   no task found for @pid
 *   -EPERM   ptrace_may_access() refused, or @new_nodes is not a subset of
 *            the target's cpuset_mems_allowed() and the caller lacks
 *            CAP_SYS_NICE
 *   -EINVAL  the restricted destination mask is empty, or the task has no mm
 *   or an error from get_nodes() / security_task_movememory().
 */
static int kernel_migrate_pages(pid_t pid, unsigned long maxnode,
                                const unsigned long __user *old_nodes,
                                const unsigned long __user *new_nodes)
{
        struct mm_struct *mm = NULL;
        struct task_struct *task;
        nodemask_t task_nodes;
        int err;
        nodemask_t *old;
        nodemask_t *new;
        NODEMASK_SCRATCH(scratch);

        if (!scratch)
                return -ENOMEM;

        /* Both working masks live in the scratch area */
        old = &scratch->mask1;
        new = &scratch->mask2;

        err = get_nodes(old, old_nodes, maxnode);
        if (err)
                goto out;

        err = get_nodes(new, new_nodes, maxnode);
        if (err)
                goto out;

        /* Find the mm_struct */
        rcu_read_lock();
        task = pid ? find_task_by_vpid(pid) : current;
        if (!task) {
                rcu_read_unlock();
                err = -ESRCH;
                goto out;
        }
        /* From here on every exit must drop this reference (see out_put) */
        get_task_struct(task);

        /* Default result for the bare "goto out_put" below */
        err = -EINVAL;

        /*
         * Check if this process has the right to modify the specified process.
         * Use the regular "ptrace_may_access()" checks.
         */
        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                err = -EPERM;
                goto out_put;
        }
        rcu_read_unlock();

        task_nodes = cpuset_mems_allowed(task);
        /* Is the user allowed to access the target nodes? */
        if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
                err = -EPERM;
                goto out_put;
        }

        /* Restrict the destination to nodes the caller itself may use */
        task_nodes = cpuset_mems_allowed(current);
        nodes_and(*new, *new, task_nodes);
        if (nodes_empty(*new))
                goto out_put;

        err = security_task_movememory(task);
        if (err)
                goto out_put;

        mm = get_task_mm(task);
        /* Task reference dropped here, so later exits use "out", not "out_put" */
        put_task_struct(task);

        if (!mm) {
                err = -EINVAL;
                goto out;
        }

        err = do_migrate_pages(mm, old, new,
                capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);

        mmput(mm);
out:
        NODEMASK_SCRATCH_FREE(scratch);

        return err;

out_put:
        put_task_struct(task);
        goto out;
}

/*
 * migrate_pages system call entry point: passes its arguments unchanged to
 * kernel_migrate_pages() and returns its result.
 */
SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
                const unsigned long __user *, old_nodes,
                const unsigned long __user *, new_nodes)
{
        return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes);
}

/*
 * Retrieve NUMA policy.
 *
 * Obtains the policy value and node mask from do_get_mempolicy() and copies
 * them to @policy and @nmask; either user pointer may be NULL to skip that
 * output.  Returns -EINVAL if @nmask is given but @maxnode is smaller than
 * nr_node_ids, -EFAULT if @policy cannot be written, otherwise the result of
 * do_get_mempolicy() / copy_nodes_to_user().
 */
static int kernel_get_mempolicy(int __user *policy,
                                unsigned long __user *nmask,
                                unsigned long maxnode,
                                unsigned long addr,
                                unsigned long flags)
{
        nodemask_t nodes;
        int pval;
        int err;

        if (nmask != NULL && maxnode < nr_node_ids)
                return -EINVAL;

        addr = untagged_addr(addr);

        err = do_get_mempolicy(&pval, &nodes, addr, flags);
        if (err)
                return err;

        if (policy && put_user(pval, policy))
                return -EFAULT;

        if (!nmask)
                return 0;

        return copy_nodes_to_user(nmask, maxnode, &nodes);
}

/*
 * get_mempolicy system call entry point: passes its arguments unchanged to
 * kernel_get_mempolicy() and returns its result.
 */
SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
                unsigned long __user *, nmask, unsigned long, maxnode,
                unsigned long, addr, unsigned long, flags)
{
        return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags);
}

/*
 * Report whether folios mapped by @vma may be migrated.
 *
 * Returns false for VM_IO / VM_PFNMAP mappings, DAX mappings, hugetlb
 * mappings whose hstate does not support migration, and file mappings whose
 * mapping gfp mask selects a zone below policy_zone; true otherwise.
 */
bool vma_migratable(struct vm_area_struct *vma)
{
        if (vma->vm_flags & (VM_IO | VM_PFNMAP))
                return false;

        /*
         * DAX device mappings want predictable access latency, so do not
         * subject them to periodic faults.
         */
        if (vma_is_dax(vma))
                return false;

        if (is_vm_hugetlb_page(vma) &&
            !hugepage_migration_supported(hstate_vma(vma)))
                return false;

        if (!vma->vm_file)
                return true;

        /*
         * Migration allocates in the highest zone.  A mapping that cannot
         * take such pages cannot be migrated (at least not node to node).
         */
        return gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
                        >= policy_zone;
}

/*
 * Look up the policy attached to @vma at @addr without any fallback.
 *
 * *@ilx is reset to 0 first.  If the VMA provides a get_policy() vm_op its
 * result is returned (and it may update *@ilx); otherwise vma->vm_policy is
 * returned, which may be NULL.
 */
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
                                   unsigned long addr, pgoff_t *ilx)
{
        *ilx = 0;

        if (vma->vm_ops && vma->vm_ops->get_policy)
                return vma->vm_ops->get_policy(vma, addr, ilx);

        return vma->vm_policy;
}

/*
 * get_vma_policy(@vma, @addr, @order, @ilx)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup
 * @order: 0, or appropriate huge_page_order for interleaving
 * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE or
 *       MPOL_WEIGHTED_INTERLEAVE
 *
 * Returns the effective policy for @vma at @addr: the result of
 * __get_vma_policy(), or the task policy of current when that is NULL.
 * For the two interleave modes, *@ilx is advanced by the VMA's page offset
 * and by @addr's offset into the VMA, both in units of 2^@order pages.
 *
 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
 * count--added by the get_policy() vm_op, as appropriate--to protect against
 * freeing by another task.  It is the caller's responsibility to free the
 * extra reference for shared policies.
 */
struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
                                 unsigned long addr, int order, pgoff_t *ilx)
{
        struct mempolicy *pol = __get_vma_policy(vma, addr, ilx);

        if (!pol)
                pol = get_task_policy(current);

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                *ilx += vma->vm_pgoff >> order;
                *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order);
                break;
        default:
                break;
        }

        return pol;
}

/*
 * Report whether the policy governing @vma has MPOL_F_MOF set.
 *
 * With a get_policy() vm_op the policy at vma->vm_start is examined and then
 * released with mpol_cond_put().  Otherwise vma->vm_policy is used, falling
 * back to the task policy of current when it is NULL.
 */
bool vma_policy_mof(struct vm_area_struct *vma)
{
        struct mempolicy *pol;
        pgoff_t ilx;            /* required by get_policy(), value unused */
        bool mof;

        if (!vma->vm_ops || !vma->vm_ops->get_policy) {
                pol = vma->vm_policy;
                if (!pol)
                        pol = get_task_policy(current);

                return pol->flags & MPOL_F_MOF;
        }

        pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx);
        mof = pol && (pol->flags & MPOL_F_MOF);
        mpol_cond_put(pol);

        return mof;
}

bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
        enum zone_type dynamic_policy_zone = policy_zone;

        BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);

        /*
         * if policy->nodes has movable memory only,
         * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         *
         * policy->nodes is intersect with node_states[N_MEMORY].
         * so if the following test fails, it implies
         * policy->nodes has movable memory only.
         */
        if (!nodes_intersects(policy->nodes, node_states[N_HIGH_MEMORY]))
                dynamic_policy_zone = ZONE_MOVABLE;

        return zone >= dynamic_policy_zone;
}

/*
 * Pick the node for the current task's next weighted-interleave allocation.
 *
 * Keeps using current->il_prev while current->il_weight is non-zero and the
 * node is still in policy->nodes; otherwise moves to the next node in the
 * mask and reloads the weight from get_il_weight().  Each successful pick
 * consumes one unit of weight.  Returns MAX_NUMNODES, without touching the
 * task state, if next_node_in() finds no node.
 */
static unsigned int weighted_interleave_nodes(struct mempolicy *policy)
{
        unsigned int cpuset_mems_cookie;
        unsigned int node;

        for (;;) {
                /* to prevent miscount use tsk->mems_allowed_seq to detect rebind */
                cpuset_mems_cookie = read_mems_allowed_begin();
                node = current->il_prev;

                /* Weight left on a still-valid node: stay on it */
                if (current->il_weight && node_isset(node, policy->nodes))
                        break;

                node = next_node_in(node, policy->nodes);
                if (read_mems_allowed_retry(cpuset_mems_cookie))
                        continue;
                if (node == MAX_NUMNODES)
                        return node;

                current->il_prev = node;
                current->il_weight = get_il_weight(node);
                break;
        }

        current->il_weight--;
        return node;
}

/*
 * Do dynamic interleaving for a process: return the node following
 * current->il_prev in policy->nodes and, if it is a valid node id
 * (below MAX_NUMNODES), remember it in current->il_prev.
 */
static unsigned int interleave_nodes(struct mempolicy *policy)
{
        unsigned int cpuset_mems_cookie;
        unsigned int nid;

        /* to prevent miscount, use tsk->mems_allowed_seq to detect rebind */
        for (;;) {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nid = next_node_in(current->il_prev, policy->nodes);
                if (!read_mems_allowed_retry(cpuset_mems_cookie))
                        break;
        }

        if (nid >= MAX_NUMNODES)
                return nid;

        current->il_prev = nid;
        return nid;
}

/*
 * Depending on the memory policy provide a node from which to allocate the
 * next slab entry.
 *
 * Outside task context, or when the current task has no mempolicy, this is
 * numa_mem_id().  Otherwise the node is chosen by the policy mode; an
 * unknown mode is a BUG().
 */
unsigned int mempolicy_slab_node(void)
{
        struct mempolicy *policy;
        int node = numa_mem_id();

        if (!in_task())
                return node;

        policy = current->mempolicy;
        if (!policy)
                return node;

        switch (policy->mode) {
        case MPOL_LOCAL:
                return node;

        case MPOL_PREFERRED:
                return first_node(policy->nodes);

        case MPOL_INTERLEAVE:
                return interleave_nodes(policy);

        case MPOL_WEIGHTED_INTERLEAVE:
                return weighted_interleave_nodes(policy);

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
        {
                /*
                 * Follow bind policy behavior and start allocation at the
                 * first node: take the first zone of the local fallback
                 * zonelist that is usable for GFP_KERNEL and lies within
                 * the policy's nodes.
                 */
                struct zonelist *zonelist =
                        &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
                struct zoneref *z = first_zones_zonelist(zonelist,
                                                         gfp_zone(GFP_KERNEL),
                                                         &policy->nodes);

                if (!zonelist_zone(z))
                        return node;
                return zonelist_node_idx(z);
        }

        default:
                BUG();
        }
}

/*
 * Snapshot pol->nodes into the caller's @mask and return the number of
 * nodes set in the snapshot.
 */
static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
                                              nodemask_t *mask)
{
        unsigned int weight;

        /*
         * The copy is fenced by compiler barriers so that callers iterate
         * over a stable local nodemask and need not worry about concurrent
         * changes.  Allocators validate that the node selection does not
         * violate mems_allowed, so this is safe.
         */
        barrier();
        memcpy(mask, &pol->nodes, sizeof(*mask));
        barrier();

        weight = nodes_weight(*mask);
        return weight;
}

/*
 * Do static weighted interleaving for interleave index @ilx.
 *
 * Each node in a snapshot of pol->nodes counts with its weight from
 * iw_table; a missing table or a zero entry counts as weight 1.  @ilx is
 * reduced modulo the total weight and the node whose weight span contains
 * the remainder is returned.  Returns numa_node_id() if the snapshot is
 * empty.
 */
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        nodemask_t nodemask;
        unsigned int remaining, nr_nodes;
        unsigned int total = 0;
        u8 *weights;
        u8 w;
        int nid;

        nr_nodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nr_nodes)
                return numa_node_id();

        rcu_read_lock();
        weights = rcu_dereference(iw_table);

        /* Sum up the effective weights of all nodes in the snapshot */
        for_each_node_mask(nid, nodemask) {
                w = weights ? weights[nid] : 1;
                total += w ? w : 1;
        }

        /* Walk the nodes, consuming weight until the remainder fits */
        remaining = ilx % total;
        for (nid = first_node(nodemask); remaining;
             nid = next_node_in(nid, nodemask)) {
                w = weights ? weights[nid] : 1;
                if (!w)
                        w = 1;
                if (remaining < w)
                        break;
                remaining -= w;
        }
        rcu_read_unlock();

        return nid;
}

/*
 * Do static interleaving for interleave index @ilx.  Returns the node at
 * position (@ilx modulo the number of nodes) in a snapshot of pol->nodes,
 * counting from 0 at the first node, or numa_node_id() if the snapshot is
 * empty.
 */
static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
        nodemask_t nodemask;
        unsigned int hops, nnodes;
        int nid;

        nnodes = read_once_policy_nodemask(pol, &nodemask);
        if (!nnodes)
                return numa_node_id();

        nid = first_node(nodemask);
        for (hops = ilx % nnodes; hops; hops--)
                nid = next_node(nid, nodemask);

        return nid;
}

/*
 * Translate a mempolicy into what the page allocator needs: returns the
 * nodemask to filter nodes with (or NULL for no filtering) and, through
 * @nid, the preferred node.  *@nid is left as passed in unless the policy
 * overrides it.  A mode not listed below changes nothing and yields NULL.
 */
static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
                                   pgoff_t ilx, int *nid)
{
        const bool dynamic = (ilx == NO_INTERLEAVE_INDEX);
        nodemask_t *allowed = NULL;

        switch (pol->mode) {
        case MPOL_PREFERRED:
                /* Preferred node replaces the input node id */
                *nid = first_node(pol->nodes);
                return NULL;

        case MPOL_PREFERRED_MANY:
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                return &pol->nodes;

        case MPOL_BIND:
                /* Restrict to nodemask (but not on lower zones) */
                if (apply_policy_zone(pol, gfp_zone(gfp)) &&
                    cpuset_nodemask_valid_mems_allowed(&pol->nodes))
                        allowed = &pol->nodes;
                if (pol->home_node != NUMA_NO_NODE)
                        *nid = pol->home_node;
                /*
                 * __GFP_THISNODE shouldn't even be used with the bind policy
                 * because we might easily break the expectation to stay on the
                 * requested node and not break the policy.
                 */
                WARN_ON_ONCE(gfp & __GFP_THISNODE);
                return allowed;

        case MPOL_INTERLEAVE:
                /* Interleave choice replaces the input node id */
                if (dynamic)
                        *nid = interleave_nodes(pol);
                else
                        *nid = interleave_nid(pol, ilx);
                return NULL;

        case MPOL_WEIGHTED_INTERLEAVE:
                if (dynamic)
                        *nid = weighted_interleave_nodes(pol);
                else
                        *nid = weighted_interleave_nid(pol, ilx);
                return NULL;
        }

        return allowed;
}

#ifdef CONFIG_HUGETLBFS
/*
 * huge_node(@vma, @addr, @gfp_flags, @mpol, @nodemask)
 * @vma: virtual memory area whose policy is sought
 * @addr: address in @vma for shared policy lookup and interleave policy
 * @gfp_flags: for requested zone
 * @mpol: pointer to mempolicy pointer for reference counted mempolicy
 * @nodemask: pointer to nodemask pointer for 'bind' and 'prefer-many' policy
 *
 * Returns a nid suitable for a huge page allocation, starting from
 * numa_node_id() and adjusted by policy_nodemask().  *@mpol receives the
 * policy from get_vma_policy(), for conditional unref after allocation, and
 * *@nodemask receives the policy_nodemask() result for filtering the
 * zonelist.
 */
int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
                struct mempolicy **mpol, nodemask_t **nodemask)
{
        int nid = numa_node_id();
        pgoff_t ilx;

        *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx);
        *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid);

        return nid;
}

/*
 * init_nodemask_of_mempolicy
 *
 * If @mask is NULL or the current task's mempolicy is "default" [NULL],
 * return 'false' and leave @mask alone.  Otherwise fill @mask and return
 * 'true': for 'preferred', 'preferred-many', 'bind', 'interleave' and
 * 'weighted-interleave' policies @mask becomes a copy of the policy
 * nodemask; for 'local' policy it is initialized to the single node
 * numa_node_id().  Any other mode is a BUG().
 *
 * The policy is examined under task_lock(current).
 */
bool init_nodemask_of_mempolicy(nodemask_t *mask)
{
        struct mempolicy *mempolicy;

        if (!mask || !current->mempolicy)
                return false;

        task_lock(current);
        mempolicy = current->mempolicy;
        switch (mempolicy->mode) {
        case MPOL_LOCAL:
                init_nodemask_of_node(mask, numa_node_id());
                break;

        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                *mask = mempolicy->nodes;
                break;

        default:
                BUG();
        }
        task_unlock(current);

        return true;
}
#endif

/*
 * mempolicy_in_oom_domain
 *
 * If tsk's mempolicy is "bind", report whether the policy nodemask
 * intersects @mask.  Return true in every other case: a NULL @mask, no
 * mempolicy, or any other policy including "interleave", as a tsk with
 * "interleave" policy may have memory allocated from all nodes in system.
 *
 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
 */
bool mempolicy_in_oom_domain(struct task_struct *tsk,
                                        const nodemask_t *mask)
{
        struct mempolicy *mempolicy;
        bool in_domain;

        if (!mask)
                return true;

        task_lock(tsk);
        mempolicy = tsk->mempolicy;
        in_domain = !mempolicy || mempolicy->mode != MPOL_BIND ||
                    nodes_intersects(mempolicy->nodes, *mask);
        task_unlock(tsk);

        return in_domain;
}

/*
 * Two-pass allocation for MPOL_PREFERRED_MANY.
 *
 * The first pass is confined to @nodemask, with __GFP_NOWARN added and
 * __GFP_DIRECT_RECLAIM and __GFP_NOFAIL removed, so it skips direct reclaim
 * and is allowed to fail.  If it does, the second pass repeats the
 * allocation with the caller's @gfp and no nodemask, i.e. on all nodes in
 * the system.  Returns the page from whichever pass succeeded, or NULL.
 */
static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
                                                int nid, nodemask_t *nodemask)
{
        gfp_t preferred_gfp = (gfp | __GFP_NOWARN) &
                              ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
        struct page *page;

        page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
        if (page)
                return page;

        return __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
}

/**
 * alloc_pages_mpol - Allocate pages according to NUMA mempolicy.
 * @gfp: GFP flags.
 * @order: Order of the page allocation.
 * @pol: Pointer to the NUMA mempolicy.
 * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()).
 * @nid: Preferred node (usually numa_node_id() but @pol may override it).
 *
 * The page is obtained through __alloc_frozen_pages_noprof(); callers in
 * this file that need a refcounted page call set_page_refcounted() on the
 * result themselves.
 *
 * Return: The page on success or NULL if allocation fails.
 */
static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        /* policy_nodemask() is handed &nid and may replace the node */
        nodemask_t *nodemask = policy_nodemask(gfp, pol, ilx, &nid);
        struct page *page;
        bool interleaved;

        if (pol->mode == MPOL_PREFERRED_MANY)
                return alloc_pages_preferred_many(gfp, order, nid, nodemask);

        interleaved = pol->mode == MPOL_INTERLEAVE ||
                      pol->mode == MPOL_WEIGHTED_INTERLEAVE;

        /*
         * Filter "hugepage" allocation, unless from alloc_pages().
         *
         * For hugepage allocation and non-interleave policy which allows
         * the current node (or other explicitly preferred node) we only
         * try to allocate from the current/preferred node and don't fall
         * back to other nodes, as the cost of remote accesses would likely
         * offset THP benefits.
         *
         * If the policy is interleave or does not allow the current node
         * in its nodemask, we allocate the standard way.
         */
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
            order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX &&
            !interleaved && (!nodemask || node_isset(nid, *nodemask))) {
                /*
                 * First, try to allocate THP only on local node, but
                 * don't reclaim unnecessarily, just compact.
                 */
                page = __alloc_frozen_pages_noprof(
                        gfp | __GFP_THISNODE | __GFP_NORETRY, order,
                        nid, NULL);
                if (page || !(gfp & __GFP_DIRECT_RECLAIM))
                        return page;
                /*
                 * If hugepage allocations are configured to always
                 * synchronous compact or the vma has been madvised
                 * to prefer hugepage backing, retry allowing remote
                 * memory with both reclaim and compact as well.
                 */
        }

        page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
        if (!page)
                return NULL;

        if (unlikely(interleaved)) {
                /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
                if (static_branch_likely(&vm_numa_stat_key) &&
                    page_to_nid(page) == nid) {
                        preempt_disable();
                        __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT);
                        preempt_enable();
                }
        }

        return page;
}

/*
 * Allocate a compound page under @pol via alloc_pages_mpol(), give it its
 * initial reference and hand it back as a folio through
 * page_rmappable_folio().  Returns NULL when the allocation fails.
 */
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
                struct mempolicy *pol, pgoff_t ilx, int nid)
{
        struct page *page;

        page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol, ilx, nid);
        if (page) {
                set_page_refcounted(page);
                return page_rmappable_folio(page);
        }

        return NULL;
}

/**
 * vma_alloc_folio - Allocate a folio for a VMA.
 * @gfp: GFP flags.
 * @order: Order of the folio.
 * @vma: Pointer to VMA.
 * @addr: Virtual address of the allocation.  Must be inside @vma.
 *
 * Allocate a folio for a specific address in @vma, using the appropriate
 * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
 * VMA to prevent it from going away.  Should be used for all allocations
 * for folios that will be mapped into user space, excepting hugetlbfs, and
 * excepting where direct use of folio_alloc_mpol() is more appropriate.
 *
 * Return: The folio on success or NULL if allocation fails.
 */
struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
                unsigned long addr)
{
        struct folio *new_folio;
        struct mempolicy *policy;
        pgoff_t il_index;

        /* VM_DROPPABLE mappings allocate with __GFP_NOWARN added */
        if (vma->vm_flags & VM_DROPPABLE)
                gfp |= __GFP_NOWARN;

        policy = get_vma_policy(vma, addr, order, &il_index);
        new_folio = folio_alloc_mpol_noprof(gfp, order, policy, il_index,
                                            numa_node_id());
        /* conditionally drop the policy obtained from get_vma_policy() */
        mpol_cond_put(policy);

        return new_folio;
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);

/*
 * Allocate pages through alloc_pages_mpol() with NO_INTERLEAVE_INDEX and
 * the local node as the preferred node.
 *
 * The system default_policy is used in interrupt context or when
 * __GFP_THISNODE is requested; otherwise the current task's policy is
 * used.  No reference counting is needed for current->mempolicy nor for
 * the system default_policy.
 */
struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
{
        struct mempolicy *pol;

        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
                                       numa_node_id());
}

/**
 * alloc_pages - Allocate pages.
 * @gfp: GFP flags.
 * @order: Power of two of number of pages to allocate.
 *
 * Allocate 1 << @order contiguous pages.  The physical address of the
 * first page is naturally aligned (eg an order-3 allocation will be aligned
 * to a multiple of 8 * PAGE_SIZE bytes).  The NUMA policy of the current
 * process is honoured when in process context.
 *
 * Context: Can be called from any context, providing the appropriate GFP
 * flags are used.
 * Return: The page on success or NULL if allocation fails.
 */
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
        struct page *page = alloc_frozen_pages_noprof(gfp, order);

        if (!page)
                return NULL;

        /* the frozen allocation has no reference yet; give it one */
        set_page_refcounted(page);
        return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);

/*
 * Allocate a compound page with alloc_pages_noprof() and pass the result
 * (whatever it is) to page_rmappable_folio() to obtain the folio.
 */
struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
{
        struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order);

        return page_rmappable_folio(page);
}
EXPORT_SYMBOL(folio_alloc_noprof);

/*
 * Bulk allocation for an MPOL_INTERLEAVE policy.
 *
 * @nr_pages is divided evenly over the nodes set in pol->nodes; the
 * remainder is handed out one extra page at a time to the first nodes
 * visited.  Each step takes its node from interleave_nodes(pol).
 *
 * Returns how many pages were stored in @page_array.
 */
static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        int nr_nodes = nodes_weight(pol->nodes);
        unsigned long share = nr_pages / nr_nodes;
        int leftover = nr_pages % nr_nodes;
        unsigned long total = 0;
        int i;

        for (i = 0; i < nr_nodes; i++) {
                unsigned long want = share;
                unsigned long got;

                /* spread the remainder, one page per node, while it lasts */
                if (leftover) {
                        want++;
                        leftover--;
                }

                got = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol),
                                              NULL, want, page_array);
                page_array += got;
                total += got;
        }

        return total;
}

/*
 * Bulk allocation for an MPOL_WEIGHTED_INTERLEAVE policy.
 *
 * Fills @page_array with up to @nr_pages pages, distributing them over the
 * nodes of @pol in proportion to the per-node weights.  The current task's
 * position in the weighted sequence (current->il_prev / current->il_weight)
 * is first consumed and, at the end, updated so that later allocations
 * resume where this one stopped.
 *
 * Returns the number of pages stored in @page_array.  This can be less
 * than @nr_pages, and is 0 when @nr_pages is 0 or the policy nodemask read
 * back empty.
 */
static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        struct task_struct *me = current;
        unsigned int cpuset_mems_cookie;
        unsigned long total_allocated = 0;
        unsigned long nr_allocated = 0;
        unsigned long rounds;
        unsigned long node_pages, delta;
        u8 *table, *weights, weight;
        unsigned int weight_total = 0;
        unsigned long rem_pages = nr_pages;
        nodemask_t nodes;
        int nnodes, node;
        int resume_node = MAX_NUMNODES - 1;
        u8 resume_weight = 0;
        int prev_node;
        int i;

        if (!nr_pages)
                return 0;

        /* read the nodes onto the stack, retry if done during rebind */
        do {
                cpuset_mems_cookie = read_mems_allowed_begin();
                nnodes = read_once_policy_nodemask(pol, &nodes);
        } while (read_mems_allowed_retry(cpuset_mems_cookie));

        /* if the nodemask has become invalid, we cannot do anything */
        if (!nnodes)
                return 0;

        /* Continue allocating from most recent node and adjust the nr_pages */
        node = me->il_prev;
        weight = me->il_weight;
        if (weight && node_isset(node, nodes)) {
                node_pages = min(rem_pages, weight);
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                /* if that's all the pages, no need to interleave */
                if (rem_pages <= weight) {
                        me->il_weight -= rem_pages;
                        return total_allocated;
                }
                /* Otherwise we adjust remaining pages, continue from there */
                rem_pages -= weight;
        }
        /* clear active weight in case of an allocation failure */
        me->il_weight = 0;
        prev_node = node;

        /*
         * create a local copy of node weights to operate on outside rcu
         *
         * Note the copy is allocated with GFP_KERNEL, not @gfp.  If it
         * cannot be allocated, only the pages obtained above are returned.
         */
        weights = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!weights)
                return total_allocated;

        rcu_read_lock();
        table = rcu_dereference(iw_table);
        if (table)
                memcpy(weights, table, nr_node_ids);
        rcu_read_unlock();

        /*
         * calculate total, detect system default usage
         *
         * A zero entry (which is every entry when no iw_table is set, as
         * the copy was zero-allocated) is treated as a weight of 1.
         */
        for_each_node_mask(node, nodes) {
                if (!weights[node])
                        weights[node] = 1;
                weight_total += weights[node];
        }

        /*
         * Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
         * Track which node weighted interleave should resume from.
         *
         * if (rounds > 0) and (delta == 0), resume_node will always be
         * the node following prev_node and its weight.
         */
        rounds = rem_pages / weight_total;
        delta = rem_pages % weight_total;
        resume_node = next_node_in(prev_node, nodes);
        resume_weight = weights[resume_node];
        /* one bulk call per node: full rounds plus that node's share of delta */
        for (i = 0; i < nnodes; i++) {
                node = next_node_in(prev_node, nodes);
                weight = weights[node];
                node_pages = weight * rounds;
                /* If a delta exists, add this node's portion of the delta */
                if (delta > weight) {
                        node_pages += weight;
                        delta -= weight;
                } else if (delta) {
                        /* when delta is depleted, resume from that node */
                        node_pages += delta;
                        resume_node = node;
                        resume_weight = weight - delta;
                        delta = 0;
                }
                /* node_pages can be 0 if an allocation fails and rounds == 0 */
                if (!node_pages)
                        break;
                nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
                                                  page_array);
                page_array += nr_allocated;
                total_allocated += nr_allocated;
                if (total_allocated == nr_pages)
                        break;
                prev_node = node;
        }
        /* publish where the next weighted-interleave allocation resumes */
        me->il_prev = resume_node;
        me->il_weight = resume_weight;
        kfree(weights);
        return total_allocated;
}

/*
 * Bulk allocation for an MPOL_PREFERRED_MANY policy.
 *
 * First ask for all @nr_pages from the policy's nodes with direct reclaim
 * and __GFP_NOFAIL stripped and __GFP_NOWARN added.  Whatever is still
 * missing afterwards is requested with the caller's @gfp, starting from
 * the local node and with no nodemask.
 *
 * Returns the total number of pages stored in @page_array.
 */
static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
                struct mempolicy *pol, unsigned long nr_pages,
                struct page **page_array)
{
        gfp_t preferred_gfp = (gfp | __GFP_NOWARN) &
                              ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
        unsigned long filled;

        filled = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
                                         nr_pages, page_array);
        if (filled >= nr_pages)
                return filled;

        return filled + alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
                                                nr_pages - filled,
                                                page_array + filled);
}

/*
 * Bulk page allocation that also honours the mempolicy, which some users
 * such as vmalloc need to have considered together.
 *
 * It can speed up memory allocation, especially interleaved allocation.
 *
 * The system default_policy is used in interrupt context or with
 * __GFP_THISNODE, otherwise the current task's policy.  Interleave,
 * weighted-interleave and preferred-many policies have dedicated bulk
 * helpers; every other mode goes straight to alloc_pages_bulk_noprof()
 * with the node and nodemask chosen by policy_nodemask().
 *
 * Returns the number of pages stored in @page_array.
 */
unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
                unsigned long nr_pages, struct page **page_array)
{
        struct mempolicy *pol;
        nodemask_t *nodemask;
        int nid;

        if (in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;
        else
                pol = get_task_policy(current);

        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                return alloc_pages_bulk_interleave(gfp, pol,
                                                   nr_pages, page_array);
        case MPOL_WEIGHTED_INTERLEAVE:
                return alloc_pages_bulk_weighted_interleave(gfp, pol,
                                                            nr_pages,
                                                            page_array);
        case MPOL_PREFERRED_MANY:
                return alloc_pages_bulk_preferred_many(gfp, numa_node_id(),
                                                       pol, nr_pages,
                                                       page_array);
        default:
                break;
        }

        nid = numa_node_id();
        nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
        return alloc_pages_bulk_noprof(gfp, nid, nodemask,
                                       nr_pages, page_array);
}

/*
 * Give @dst its own duplicate of @src's VMA policy.
 *
 * Returns 0 on success; if mpol_dup() fails, its error code is returned
 * and dst->vm_policy is left untouched.
 */
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
        struct mempolicy *copy;

        copy = mpol_dup(src->vm_policy);
        if (!IS_ERR(copy)) {
                dst->vm_policy = copy;
                return 0;
        }

        return PTR_ERR(copy);
}

/*
 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 *
 * current's mempolicy may be rebound by another task (the task that changes
 * the cpuset's mems), so we needn't do rebind work for the current task.
 */

/*
 * Slow path of a mempolicy duplicate.
 *
 * Returns a fresh copy of @old holding a single reference, or
 * ERR_PTR(-ENOMEM) when no memory is available for the copy.
 */
struct mempolicy *__mpol_dup(struct mempolicy *old)
{
        struct mempolicy *new;

        new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        if (old != current->mempolicy) {
                *new = *old;
        } else {
                /* task's mempolicy is protected by alloc_lock */
                task_lock(current);
                *new = *old;
                task_unlock(current);
        }

        if (current_cpuset_is_being_rebound()) {
                nodemask_t mems = cpuset_mems_allowed(current);

                mpol_rebind_policy(new, &mems);
        }

        /* the struct copy carried over old's refcount; start afresh */
        atomic_set(&new->refcnt, 1);
        return new;
}

/*
 * Slow path of a mempolicy comparison.
 *
 * Two policies are equal when both are non-NULL and agree on mode, flags,
 * home node, the user nodemask (only compared when
 * mpol_store_user_nodemask() says it is stored) and, for every mode other
 * than MPOL_LOCAL, the policy nodemask.
 */
bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
        if (!a || !b)
                return false;

        if (a->mode != b->mode || a->flags != b->flags ||
            a->home_node != b->home_node)
                return false;

        if (mpol_store_user_nodemask(a) &&
            !nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
                return false;

        switch (a->mode) {
        case MPOL_LOCAL:
                /* no nodemask to compare for local allocation */
                return true;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_WEIGHTED_INTERLEAVE:
                return !!nodes_equal(a->nodes, b->nodes);
        default:
                BUG();
                return false;
        }
}

/*
 * Shared memory backing store policy support.
 *
 * Remember policies even when nobody has shared memory mapped.
 * The policies are kept in a red-black tree linked from the inode.
 * They are protected by the sp->lock rwlock, which should be held
 * for any accesses to the tree.
 */

/*
 * Find the first (lowest) element whose range intersects [start, end).
 * Returns NULL if nothing in the tree intersects.  Caller holds sp->lock
 * for reading or for writing.
 */
static struct sp_node *sp_lookup(struct shared_policy *sp,
                                        pgoff_t start, pgoff_t end)
{
        struct rb_node *n;
        struct rb_node *prev;

        /* descend until some node overlapping the range is hit */
        for (n = sp->root.rb_node; n; ) {
                struct sp_node *cur = rb_entry(n, struct sp_node, nd);

                if (cur->end <= start)
                        n = n->rb_right;
                else if (cur->start >= end)
                        n = n->rb_left;
                else
                        break;
        }
        if (!n)
                return NULL;

        /* step back while the predecessor still reaches past start */
        for (prev = rb_prev(n); prev; prev = rb_prev(prev)) {
                if (rb_entry(prev, struct sp_node, nd)->end <= start)
                        break;
                n = prev;
        }

        return rb_entry(n, struct sp_node, nd);
}

/*
 * Link a new shared policy node into the tree.  Caller holds sp->lock for
 * writing.
 *
 * @new must lie entirely below or entirely above every node met on the way
 * down; anything else trips BUG().
 */
static void sp_insert(struct shared_policy *sp, struct sp_node *new)
{
        struct rb_node **link = &sp->root.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct sp_node *cur;

                parent = *link;
                cur = rb_entry(parent, struct sp_node, nd);
                if (new->start < cur->start)
                        link = &parent->rb_left;
                else if (new->end > cur->end)
                        link = &parent->rb_right;
                else
                        BUG();
        }

        rb_link_node(&new->nd, parent, link);
        rb_insert_color(&new->nd, &sp->root);
}

/*
 * Find the shared policy covering page index @idx.
 *
 * Returns the policy with an extra reference taken via mpol_get(), or NULL
 * when the tree is empty or no node covers @idx.
 */
struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
                                                pgoff_t idx)
{
        struct mempolicy *found = NULL;
        struct sp_node *node;

        /* empty tree: nothing to look up, skip the lock entirely */
        if (!sp->root.rb_node)
                return NULL;

        read_lock(&sp->lock);
        node = sp_lookup(sp, idx, idx+1);
        if (node) {
                found = node->policy;
                mpol_get(found);
        }
        read_unlock(&sp->lock);

        return found;
}

/*
 * Release a shared policy node: drop the node's reference on its policy,
 * then return the node itself to sn_cache.  The node is not unlinked from
 * any tree here.
 */
static void sp_free(struct sp_node *n)
{
        mpol_put(n->policy);
        kmem_cache_free(sn_cache, n);
}

/**
 * mpol_misplaced - check whether current folio node is valid in policy
 *
 * @folio: folio to be checked
 * @vmf: structure describing the fault
 * @addr: virtual address in @vma for shared policy lookup and interleave policy
 *
 * Lookup current policy node id for vma,addr and "compare to" folio's
 * node id.  Policy determination "mimics" alloc_page_vma().
 * Called from fault path where we know the vma and faulting address.
 * The page table lock vmf->ptl must be held by the caller.
 *
 * Return: NUMA_NO_NODE if the page is in a node that is valid for this
 * policy, or a suitable node ID to allocate a replacement folio from.
 */
int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
                   unsigned long addr)
{
        struct mempolicy *pol;
        pgoff_t ilx;
        struct zoneref *z;
        int curnid = folio_nid(folio);
        struct vm_area_struct *vma = vmf->vma;
        int thiscpu = raw_smp_processor_id();
        int thisnid = numa_node_id();
        int polnid = NUMA_NO_NODE;
        int ret = NUMA_NO_NODE;

        /*
         * Make sure ptl is held so that we don't preempt and we
         * have a stable smp processor id
         */
        lockdep_assert_held(vmf->ptl);
        pol = get_vma_policy(vma, addr, folio_order(folio), &ilx);
        /* without MPOL_F_MOF the folio is never reported as misplaced */
        if (!(pol->flags & MPOL_F_MOF))
                goto out;

        /* work out polnid, the node the policy would pick for this folio */
        switch (pol->mode) {
        case MPOL_INTERLEAVE:
                polnid = interleave_nid(pol, ilx);
                break;

        case MPOL_WEIGHTED_INTERLEAVE:
                polnid = weighted_interleave_nid(pol, ilx);
                break;

        case MPOL_PREFERRED:
                /* already on a node in the policy nodemask: not misplaced */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                polnid = first_node(pol->nodes);
                break;

        case MPOL_LOCAL:
                polnid = numa_node_id();
                break;

        case MPOL_BIND:
        case MPOL_PREFERRED_MANY:
                /*
                 * Even though MPOL_PREFERRED_MANY can allocate pages outside
                 * policy nodemask we don't allow numa migration to nodes
                 * outside policy nodemask for now. This is done so that if we
                 * want demotion to slow memory to happen, before allocating
                 * from some DRAM node say 'x', we will end up using a
                 * MPOL_PREFERRED_MANY mask excluding node 'x'. In such scenario
                 * we should not promote to node 'x' from slow memory node.
                 */
                if (pol->flags & MPOL_F_MORON) {
                        /*
                         * Optimize placement among multiple nodes
                         * via NUMA balancing
                         *
                         * Only continue (to the MPOL_F_MORON handling after
                         * the switch) when the local node is in the policy
                         * nodemask; otherwise leave the folio alone.
                         */
                        if (node_isset(thisnid, pol->nodes))
                                break;
                        goto out;
                }

                /*
                 * use current page if in policy nodemask,
                 * else select nearest allowed node, if any.
                 * If no allowed nodes, use current [!misplaced].
                 */
                if (node_isset(curnid, pol->nodes))
                        goto out;
                z = first_zones_zonelist(
                                node_zonelist(thisnid, GFP_HIGHUSER),
                                gfp_zone(GFP_HIGHUSER),
                                &pol->nodes);
                polnid = zonelist_node_idx(z);
                break;

        default:
                BUG();
        }

        /* Migrate the folio towards the node whose CPU is referencing it */
        if (pol->flags & MPOL_F_MORON) {
                /* the node computed by the switch above is overridden here */
                polnid = thisnid;

                if (!should_numa_migrate_memory(current, folio, curnid,
                                                thiscpu))
                        goto out;
        }

        /* misplaced only if the folio is not already on the chosen node */
        if (curnid != polnid)
                ret = polnid;
out:
        mpol_cond_put(pol);

        return ret;
}

/*
 * Drop the (possibly final) reference to task->mempolicy.
 *
 * The pointer is detached from the task under task_lock() first and the
 * reference is only dropped afterwards, so that any allocation done as part
 * of the policy's kmem_cache_free(), such as by KASAN, doesn't reference a
 * freed policy.
 */
void mpol_put_task_policy(struct task_struct *task)
{
        struct mempolicy *detached;

        task_lock(task);
        detached = task->mempolicy;
        task->mempolicy = NULL;
        task_unlock(task);

        mpol_put(detached);
}

/*
 * Unlink node @n from the shared policy tree of @sp and free it with
 * sp_free().  The callers in this file hold sp->lock for writing.
 */
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
        rb_erase(&n->nd, &sp->root);
        sp_free(n);
}

/*
 * Fill in the range and policy of a shared policy node.  The range is
 * treated as [start, end) by sp_lookup().  The node's rb-tree linkage is
 * not touched and no reference on @pol is taken here.
 */
static void sp_node_init(struct sp_node *node, unsigned long start,
                        unsigned long end, struct mempolicy *pol)
{
        node->start = start;
        node->end = end;
        node->policy = pol;
}

/*
 * Allocate a shared policy node covering [start, end) that carries its own
 * duplicate of @pol, flagged MPOL_F_SHARED.
 *
 * Returns the new node, or NULL if either the node or the policy duplicate
 * cannot be allocated.
 */
static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
                                struct mempolicy *pol)
{
        struct sp_node *node;
        struct mempolicy *copy;

        node = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!node)
                return NULL;

        copy = mpol_dup(pol);
        if (IS_ERR(copy))
                goto free_node;

        copy->flags |= MPOL_F_SHARED;
        sp_node_init(node, start, end, copy);
        return node;

free_node:
        kmem_cache_free(sn_cache, node);
        return NULL;
}

/*
 * Replace a policy range.
 *
 * Under sp->lock (taken for writing), every existing node overlapping
 * [start, end) is deleted or trimmed so that nothing covers the range any
 * more; then @new, if non-NULL, is inserted.
 *
 * An existing node that extends beyond both ends of the range has to be
 * split in two.  The extra sp_node and mempolicy needed for the upper half
 * are allocated with the lock dropped, after which the whole walk is
 * restarted from scratch.
 *
 * Returns 0 on success, or -ENOMEM if those allocations fail, in which
 * case @new has not been inserted.
 */
static int shared_policy_replace(struct shared_policy *sp, pgoff_t start,
                                 pgoff_t end, struct sp_node *new)
{
        struct sp_node *n;
        struct sp_node *n_new = NULL;
        struct mempolicy *mpol_new = NULL;
        int ret = 0;

restart:
        write_lock(&sp->lock);
        n = sp_lookup(sp, start, end);
        /* Take care of old policies in the same range. */
        while (n && n->start < end) {
                /* fetch the successor first: n may be freed below */
                struct rb_node *next = rb_next(&n->nd);
                if (n->start >= start) {
                        /* old node starts inside the range: drop it or trim its head */
                        if (n->end <= end)
                                sp_delete(sp, n);
                        else
                                n->start = end;
                } else {
                        /* Old policy spanning whole new range. */
                        if (n->end > end) {
                                /* need preallocated pieces; get them unlocked */
                                if (!n_new)
                                        goto alloc_new;

                                /* upper half gets its own copy of the policy */
                                *mpol_new = *n->policy;
                                atomic_set(&mpol_new->refcnt, 1);
                                sp_node_init(n_new, end, n->end, mpol_new);
                                n->end = start;
                                sp_insert(sp, n_new);
                                /* ownership moved into the tree */
                                n_new = NULL;
                                mpol_new = NULL;
                                break;
                        } else
                                n->end = start;
                }
                if (!next)
                        break;
                n = rb_entry(next, struct sp_node, nd);
        }
        if (new)
                sp_insert(sp, new);
        write_unlock(&sp->lock);
        ret = 0;

err_out:
        /* release any preallocation that ended up unused */
        if (mpol_new)
                mpol_put(mpol_new);
        if (n_new)
                kmem_cache_free(sn_cache, n_new);

        return ret;

alloc_new:
        /* allocate with the lock dropped, then redo the walk */
        write_unlock(&sp->lock);
        ret = -ENOMEM;
        n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL);
        if (!n_new)
                goto err_out;
        mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!mpol_new)
                goto err_out;
        /* so that mpol_put() in err_out works on this raw allocation */
        atomic_set(&mpol_new->refcnt, 1);
        goto restart;
}

/**
 * mpol_shared_policy_init - initialize shared policy for inode
 * @sp: pointer to inode shared policy
 * @mpol:  struct mempolicy to install
 *
 * Initialize @sp to an empty tree and, if @mpol is non-NULL, install a
 * policy derived from it in the inode's shared policy rb-tree, covering
 * the entire file.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
 * This is called at get_inode() calls and we can use GFP_KERNEL.
 *
 * Failures along the way are not reported: the tree is then simply left
 * empty, which means the default mempolicy.
 */
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
        int ret;

        sp->root = RB_ROOT;                /* empty tree == default mempolicy */
        rwlock_init(&sp->lock);

        if (mpol) {
                struct sp_node *sn;
                struct mempolicy *npol;
                NODEMASK_SCRATCH(scratch);

                if (!scratch)
                        goto put_mpol;

                /* contextualize the tmpfs mount point mempolicy to this file */
                npol = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(npol))
                        goto free_scratch; /* no valid nodemask intersection */

                /* mpol_set_nodemask() is called with current's task_lock held */
                task_lock(current);
                ret = mpol_set_nodemask(npol, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                if (ret)
                        goto put_npol;

                /* alloc node covering entire file; adds ref to file's npol */
                sn = sp_alloc(0, MAX_LFS_FILESIZE >> PAGE_SHIFT, npol);
                if (sn)
                        sp_insert(sp, sn);
                /* the labels below unwind in reverse order of acquisition */
put_npol:
                mpol_put(npol);        /* drop initial ref on file's npol */
free_scratch:
                NODEMASK_SCRATCH_FREE(scratch);
put_mpol:
                mpol_put(mpol);        /* drop our incoming ref on sb mpol */
        }
}

int mpol_set_shared_policy(struct shared_policy *sp,
                        struct vm_area_struct *vma, struct mempolicy *pol)
{
        int err;
        struct sp_node *new = NULL;
        unsigned long sz = vma_pages(vma);

        if (pol) {
                new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, pol);
                if (!new)
                        return -ENOMEM;
        }
        err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff + sz, new);
        if (err && new)
                sp_free(new);
        return err;
}

/*
 * Free a backing policy store on inode delete: every node is removed from
 * the tree and freed.  Nothing is done (and the lock is not taken) when
 * the tree is already empty.
 */
void mpol_free_shared_policy(struct shared_policy *sp)
{
        struct rb_node *node, *next;

        if (!sp->root.rb_node)
                return;

        write_lock(&sp->lock);
        for (node = rb_first(&sp->root); node; node = next) {
                /* look up the successor before this node goes away */
                next = rb_next(node);
                sp_delete(sp, rb_entry(node, struct sp_node, nd));
        }
        write_unlock(&sp->lock);
}

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

/*
 * Set the boot-time state of automatic NUMA balancing.
 *
 * A numa_balancing= override (parsed by setup_numabalancing; 1 enables,
 * -1 disables) always wins.  Without one, the build-time default is
 * applied and announced, but only on systems with more than one online
 * node.
 */
static void __init check_numabalancing_enable(void)
{
        const bool enable_by_default =
                IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED);

        if (numabalancing_override) {
                set_numabalancing_state(numabalancing_override == 1);
                return;
        }

        if (num_online_nodes() <= 1)
                return;

        pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n",
                enable_by_default ? "Enabling" : "Disabling");
        set_numabalancing_state(enable_by_default);
}

/*
 * Handler for the "numa_balancing=" boot parameter.
 *
 * "enable" records an override of 1 and "disable" an override of -1 in
 * numabalancing_override.  Returns 1 when the value was recognised;
 * otherwise (including a NULL @str) warns and returns 0.
 */
static int __init setup_numabalancing(char *str)
{
        if (str && !strcmp(str, "enable")) {
                numabalancing_override = 1;
        } else if (str && !strcmp(str, "disable")) {
                numabalancing_override = -1;
        } else {
                pr_warn("Unable to parse numa_balancing=\n");
                return 0;
        }

        return 1;
}
__setup("numa_balancing=", setup_numabalancing);
#else
/* Without CONFIG_NUMA_BALANCING there is no balancing state to set up. */
static inline void __init check_numabalancing_enable(void)
{
}
#endif /* CONFIG_NUMA_BALANCING */

/*
 * Boot-time setup of the mempolicy subsystem: create the slab caches for
 * policies and shared policy nodes, fill in the per-node
 * preferred_node_policy[] entries, install an interleave policy for system
 * init and configure NUMA balancing.
 */
void __init numa_policy_init(void)
{
        unsigned long biggest_pages = 0;
        int biggest_nid = 0;
        nodemask_t il_nodes;
        int nid;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        for_each_node(nid) {
                struct mempolicy *pref = &preferred_node_policy[nid];

                *pref = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .nodes = nodemask_of_node(nid),
                };
        }

        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(il_nodes);
        for_each_node_state(nid, N_MEMORY) {
                unsigned long pages = node_present_pages(nid);

                /* Interleave this node? */
                if ((pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, il_nodes);

                /* Remember the largest node as the fallback */
                if (pages > biggest_pages) {
                        biggest_pages = pages;
                        biggest_nid = nid;
                }
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(il_nodes)))
                node_set(biggest_nid, il_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &il_nodes))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

/*
 * Reset policy of current process to default.
 *
 * The result of do_set_mempolicy() is not checked here.
 */
void numa_default_policy(void)
{
        do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
}

/*
 * Parse and format mempolicy from/to strings.
 *
 * Table of mode names indexed by MPOL_* mode value. mpol_parse_str() looks
 * a user-supplied mode name up in it with match_string(), and mpol_to_str()
 * prints policy_modes[mode]. The strings are part of the user-visible
 * format and must not be altered casually.
 */
static const char * const policy_modes[] =
{
        [MPOL_DEFAULT]    = "default",
        [MPOL_PREFERRED]  = "prefer",
        [MPOL_BIND]       = "bind",
        [MPOL_INTERLEAVE] = "interleave",
        [MPOL_WEIGHTED_INTERLEAVE] = "weighted interleave",
        [MPOL_LOCAL]      = "local",
        [MPOL_PREFERRED_MANY]  = "prefer (many)",
};

#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option.
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 *
 * Format of input:
 *        <mode>[=<flags>][:<nodelist>]
 *
 * @str is modified temporarily while parsing (the '=' and ':' separators
 * are overwritten with NUL) and restored before returning, so the caller
 * can still print the original string in an error message.
 *
 * On success with mode "default", *@mpol is set to NULL.
 *
 * Return: %0 on success, else %1
 */
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
        struct mempolicy *new = NULL;
        unsigned short mode_flags;
        nodemask_t nodes;
        char *nodelist = strchr(str, ':');
        char *flags = strchr(str, '=');
        int err = 1, mode;

        if (flags)
                *flags++ = '\0';        /* terminate mode string */

        if (nodelist) {
                /* NUL-terminate mode or flags string */
                *nodelist++ = '\0';
                if (nodelist_parse(nodelist, nodes))
                        goto out;
                /* Every requested node must be a node with memory */
                if (!nodes_subset(nodes, node_states[N_MEMORY]))
                        goto out;
        } else
                nodes_clear(nodes);

        /* After the separators were cut, str holds only the mode name */
        mode = match_string(policy_modes, MPOL_MAX, str);
        if (mode < 0)
                goto out;

        switch (mode) {
        case MPOL_PREFERRED:
                /*
                 * Insist on a nodelist of one node only, although later
                 * we use first_node(nodes) to grab a single node, so here
                 * nodelist (or nodes) cannot be empty.
                 */
                if (nodelist) {
                        char *rest = nodelist;
                        /* Anything but a plain decimal number is rejected */
                        while (isdigit(*rest))
                                rest++;
                        if (*rest)
                                goto out;
                        if (nodes_empty(nodes))
                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                /*
                 * Default to online nodes with memory if no nodelist
                 */
                if (!nodelist)
                        nodes = node_states[N_MEMORY];
                break;
        case MPOL_LOCAL:
                /*
                 * Don't allow a nodelist;  mpol_new() checks flags
                 */
                if (nodelist)
                        goto out;
                break;
        case MPOL_DEFAULT:
                /*
                 * Insist on an empty nodelist. No policy object is
                 * allocated for this mode: new stays NULL.
                 */
                if (!nodelist)
                        err = 0;
                goto out;
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
                /*
                 * Insist on a nodelist
                 */
                if (!nodelist)
                        goto out;
        }

        mode_flags = 0;
        if (flags) {
                /*
                 * Currently, we only support two mutually exclusive
                 * mode flags.
                 */
                if (!strcmp(flags, "static"))
                        mode_flags |= MPOL_F_STATIC_NODES;
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
                        goto out;
        }

        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
                goto out;

        /*
         * Save nodes for mpol_to_str() to show the tmpfs mount options
         * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo.
         */
        if (mode != MPOL_PREFERRED) {
                new->nodes = nodes;
        } else if (nodelist) {
                nodes_clear(new->nodes);
                node_set(first_node(nodes), new->nodes);
        } else {
                /* "prefer" without a nodelist is stored as MPOL_LOCAL */
                new->mode = MPOL_LOCAL;
        }

        /*
         * Save nodes for contextualization: this will be used to "clone"
         * the mempolicy in a specific context [cpuset] at a later time.
         */
        new->w.user_nodemask = nodes;

        err = 0;

out:
        /* Restore string for error message */
        if (nodelist)
                *--nodelist = ':';
        if (flags)
                *--flags = '=';
        if (!err)
                *mpol = new;
        return err;
}
#endif /* CONFIG_TMPFS */

/**
 * mpol_to_str - format a mempolicy structure for printing
 * @buffer:  to contain formatted mempolicy string
 * @maxlen:  length of @buffer
 * @pol:  pointer to mempolicy to be formatted
 *
 * Convert @pol into a string.  If @buffer is too short, truncate the string.
 * Recommend a @maxlen of at least 51 for the longest mode, "weighted
 * interleave", plus the longest flags, "relative|balancing", and to
 * display at least a few node ids.
 *
 * A NULL @pol, the default policy and the entries of preferred_node_policy[]
 * are all printed as "default".
 */
void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
        char *p = buffer;
        nodemask_t nodes = NODE_MASK_NONE;
        unsigned short mode = MPOL_DEFAULT;
        unsigned short flags = 0;

        if (pol &&
            pol != &default_policy &&
            !(pol >= &preferred_node_policy[0] &&
              pol <= &preferred_node_policy[ARRAY_SIZE(preferred_node_policy) - 1])) {
                mode = pol->mode;
                flags = pol->flags;
        }

        switch (mode) {
        case MPOL_DEFAULT:
        case MPOL_LOCAL:
                break;
        case MPOL_PREFERRED:
        case MPOL_PREFERRED_MANY:
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
        case MPOL_WEIGHTED_INTERLEAVE:
                nodes = pol->nodes;
                break;
        default:
                WARN_ON_ONCE(1);
                snprintf(p, maxlen, "unknown");
                return;
        }

        /*
         * Use scnprintf() throughout: it returns the number of characters
         * actually stored, so p can never advance past the end of @buffer.
         * With snprintf() a truncated write would return the would-be
         * length, making the remaining size "buffer + maxlen - p" negative
         * for every following call.
         */
        p += scnprintf(p, maxlen, "%s", policy_modes[mode]);

        if (flags & MPOL_MODE_FLAGS) {
                p += scnprintf(p, buffer + maxlen - p, "=");

                /*
                 * Static and relative are mutually exclusive.
                 */
                if (flags & MPOL_F_STATIC_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "static");
                else if (flags & MPOL_F_RELATIVE_NODES)
                        p += scnprintf(p, buffer + maxlen - p, "relative");

                if (flags & MPOL_F_NUMA_BALANCING) {
                        /* Separator only if another flag was printed */
                        if (!is_power_of_2(flags & MPOL_MODE_FLAGS))
                                p += scnprintf(p, buffer + maxlen - p, "|");
                        p += scnprintf(p, buffer + maxlen - p, "balancing");
                }
        }

        if (!nodes_empty(nodes))
                p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
                               nodemask_pr_args(&nodes));
}

#ifdef CONFIG_SYSFS
/*
 * Per-node sysfs attribute of the "weighted_interleave" directory.
 * The kobj_attribute is embedded so that node_show()/node_store() can get
 * back to this structure with container_of().
 */
struct iw_node_attr {
        struct kobj_attribute kobj_attr;        /* attribute; name is "node<nid>" */
        int nid;                                /* node id this attribute refers to */
};

/*
 * sysfs ->show() for a "node<nid>" attribute: prints the value returned by
 * get_il_weight() for that node, followed by a newline.
 */
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
                         char *buf)
{
        struct iw_node_attr *iw_attr =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 weight = get_il_weight(iw_attr->nid);

        return sysfs_emit(buf, "%d\n", weight);
}

/*
 * sysfs ->store() for a "node<nid>" attribute.
 *
 * Parses @buf as a u8 weight (empty input means 0), then publishes a new
 * copy of the weight table with that node's entry replaced. The previous
 * table is freed after an RCU grace period.
 *
 * Returns @count on success, -EINVAL if @buf is not a valid u8 and -ENOMEM
 * if the new table cannot be allocated.
 */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
                          const char *buf, size_t count)
{
        struct iw_node_attr *iw_attr =
                container_of(attr, struct iw_node_attr, kobj_attr);
        u8 *next_table;
        u8 *prev_table;
        u8 weight = 0;

        /* Empty input leaves the weight at 0; anything else must parse. */
        if (count != 0 && !sysfs_streq(buf, "") &&
            kstrtou8(buf, 0, &weight))
                return -EINVAL;

        next_table = kzalloc(nr_node_ids, GFP_KERNEL);
        if (!next_table)
                return -ENOMEM;

        mutex_lock(&iw_table_lock);
        prev_table = rcu_dereference_protected(iw_table,
                                        lockdep_is_held(&iw_table_lock));
        /* Start from the current weights, if a table exists already */
        if (prev_table)
                memcpy(next_table, prev_table, nr_node_ids);
        next_table[iw_attr->nid] = weight;
        rcu_assign_pointer(iw_table, next_table);
        mutex_unlock(&iw_table_lock);

        /* Wait for readers of the old table before freeing it */
        synchronize_rcu();
        kfree(prev_table);

        return count;
}

/*
 * Array of nr_node_ids attribute pointers, indexed by node id. Allocated in
 * mempolicy_sysfs_init(), filled in by add_weight_node() and freed in
 * mempolicy_kobj_release().
 */
static struct iw_node_attr **node_attrs;

/*
 * Remove one node attribute file from @parent and free the attribute
 * together with its allocated name. A NULL @node_attr is ignored.
 */
static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
                                  struct kobject *parent)
{
        if (node_attr) {
                sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
                kfree(node_attr->kobj_attr.attr.name);
                kfree(node_attr);
        }
}

/*
 * ->release() of the "weighted_interleave" kobject.
 *
 * Called by the kobject core once the last reference has been dropped.
 * Tears down every per-node attribute and then frees the kobject itself,
 * which was allocated with kzalloc() in add_weighted_interleave_group().
 *
 * The kobject must be freed here, not "put": its reference count is
 * already zero when ->release() runs, so another kobject_put() would
 * underflow the refcount and never free the memory.
 *
 * NOTE(review): the attribute files are removed from a kobject that the
 * core has presumably already deleted from sysfs at this point -- confirm
 * that sysfs_remove_file() is harmless here.
 */
static void sysfs_wi_release(struct kobject *wi_kobj)
{
        int i;

        for (i = 0; i < nr_node_ids; i++)
                sysfs_wi_node_release(node_attrs[i], wi_kobj);
        kfree(wi_kobj);
}

/*
 * ktype of the "weighted_interleave" kobject created in
 * add_weighted_interleave_group(); attributes are dispatched through the
 * generic kobj_sysfs_ops.
 */
static const struct kobj_type wi_ktype = {
        .sysfs_ops = &kobj_sysfs_ops,
        .release = sysfs_wi_release,
};

/*
 * Create the "node<nid>" attribute file under @wi_kobj and record it in
 * node_attrs[nid].
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * reported by sysfs_create_file(). Nothing is left allocated on failure.
 */
static int add_weight_node(int nid, struct kobject *wi_kobj)
{
        struct iw_node_attr *node_attr;
        char *name;
        int err;

        node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
        if (!node_attr)
                return -ENOMEM;

        name = kasprintf(GFP_KERNEL, "node%d", nid);
        if (!name) {
                kfree(node_attr);
                return -ENOMEM;
        }

        sysfs_attr_init(&node_attr->kobj_attr.attr);
        node_attr->kobj_attr.attr.name = name;
        node_attr->kobj_attr.attr.mode = 0644;
        node_attr->kobj_attr.show = node_show;
        node_attr->kobj_attr.store = node_store;
        node_attr->nid = nid;

        err = sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr);
        if (err) {
                kfree(node_attr->kobj_attr.attr.name);
                kfree(node_attr);
                pr_err("failed to add attribute to weighted_interleave\n");
                /* Report the real cause instead of a blanket -ENOMEM */
                return err;
        }

        node_attrs[nid] = node_attr;
        return 0;
}

/*
 * Create the "weighted_interleave" directory below @root_kobj and populate
 * it with one attribute per possible node.
 *
 * Returns 0 on success or a negative errno. If adding a node attribute
 * fails, the reference on the new kobject is dropped and the error is
 * returned so that the caller can unwind as well.
 */
static int add_weighted_interleave_group(struct kobject *root_kobj)
{
        struct kobject *wi_kobj;
        int nid, err;

        wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
        if (!wi_kobj)
                return -ENOMEM;

        err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
                                   "weighted_interleave");
        if (err) {
                /*
                 * NOTE(review): the kobject documentation asks for
                 * kobject_put() after a failed kobject_init_and_add();
                 * kfree() is kept here because it is only safe to switch
                 * once the ->release() callback frees the kobject.
                 */
                kfree(wi_kobj);
                return err;
        }

        for_each_node_state(nid, N_POSSIBLE) {
                err = add_weight_node(nid, wi_kobj);
                if (err) {
                        pr_err("failed to add sysfs [node%d]\n", nid);
                        break;
                }
        }
        if (err)
                kobject_put(wi_kobj);

        /* Propagate the failure; previously this always returned 0 */
        return err;
}

static void mempolicy_kobj_release(struct kobject *kobj)
{
        u8 *old;

        mutex_lock(&iw_table_lock);
        old = rcu_dereference_protected(iw_table,
                                        lockdep_is_held(&iw_table_lock));
        rcu_assign_pointer(iw_table, NULL);
        mutex_unlock(&iw_table_lock);
        synchronize_rcu();
        kfree(old);
        kfree(node_attrs);
        kfree(kobj);
}

/* ktype of the "mempolicy" kobject registered in mempolicy_sysfs_init(). */
static const struct kobj_type mempolicy_ktype = {
        .release = mempolicy_kobj_release
};

/*
 * Late initcall that creates the "mempolicy" kobject below mm_kobj and the
 * "weighted_interleave" group underneath it.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init mempolicy_sysfs_init(void)
{
        int err;
        static struct kobject *mempolicy_kobj;

        mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
        if (!mempolicy_kobj) {
                err = -ENOMEM;
                goto err_out;
        }

        node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
                             GFP_KERNEL);
        if (!node_attrs) {
                err = -ENOMEM;
                goto mempol_out;
        }

        err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
                                   "mempolicy");
        if (err) {
                /*
                 * Once kobject_init_and_add() has been called the kobject
                 * must be dropped with kobject_put(), even on failure, so
                 * that the name allocated by the core is released too.
                 * mempolicy_kobj_release() frees node_attrs and the
                 * kobject, so nothing else is left to free here.
                 */
                kobject_put(mempolicy_kobj);
                goto err_out;
        }

        err = add_weighted_interleave_group(mempolicy_kobj);
        if (err) {
                pr_err("mempolicy sysfs structure failed to initialize\n");
                kobject_put(mempolicy_kobj);
                return err;
        }

        return err;
mempol_out:
        /* kobject not initialised yet: plain kfree() is correct here */
        kfree(mempolicy_kobj);
err_out:
        pr_err("failed to add mempolicy kobject to the system\n");
        return err;
}

late_initcall(mempolicy_sysfs_init);
#endif /* CONFIG_SYSFS */





























































































































































  420 
  417 
































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/init.c
 *
 * Copyright (C) 1995-2005 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/errno.h>
#include <linux/swap.h>
#include <linux/init.h>
#include <linux/cache.h>
#include <linux/mman.h>
#include <linux/nodemask.h>
#include <linux/initrd.h>
#include <linux/gfp.h>
#include <linux/math.h>
#include <linux/memblock.h>
#include <linux/sort.h>
#include <linux/of.h>
#include <linux/of_fdt.h>
#include <linux/dma-direct.h>
#include <linux/dma-map-ops.h>
#include <linux/efi.h>
#include <linux/swiotlb.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
#include <linux/acpi_iort.h>
#include <linux/kmemleak.h>
#include <linux/execmem.h>

#include <asm/boot.h>
#include <asm/fixmap.h>
#include <asm/kasan.h>
#include <asm/kernel-pgtable.h>
#include <asm/kvm_host.h>
#include <asm/memory.h>
#include <asm/numa.h>
#include <asm/rsi.h>
#include <asm/sections.h>
#include <asm/setup.h>
#include <linux/sizes.h>
#include <asm/tlb.h>
#include <asm/alternative.h>
#include <asm/xen/swiotlb-xen.h>

/*
 * We need to be able to catch inadvertent references to memstart_addr
 * that occur (potentially in generic code) before arm64_memblock_init()
 * executes, which assigns it its actual value. So use a default value
 * that cannot be mistaken for a real physical address.
 */
s64 memstart_addr __ro_after_init = -1;
EXPORT_SYMBOL(memstart_addr);

/*
 * If the corresponding config options are enabled, we create both ZONE_DMA
 * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory
 * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4).
 * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory,
 * otherwise it is empty.
 *
 * The limit itself is computed in zone_sizes_init().
 */
phys_addr_t __ro_after_init arm64_dma_phys_limit;

/*
 * To make optimal use of block mappings when laying out the linear
 * mapping, round down the base of physical memory to a size that can
 * be mapped efficiently, i.e., either PUD_SIZE (4k granule) or PMD_SIZE
 * (64k granule), or a multiple that can be mapped using contiguous bits
 * in the page tables: 32 * PMD_SIZE (16k granule)
 */
#if defined(CONFIG_ARM64_4K_PAGES)
#define ARM64_MEMSTART_SHIFT                PUD_SHIFT
#elif defined(CONFIG_ARM64_16K_PAGES)
#define ARM64_MEMSTART_SHIFT                CONT_PMD_SHIFT
#else
#define ARM64_MEMSTART_SHIFT                PMD_SHIFT
#endif

/*
 * sparsemem vmemmap imposes an additional requirement on the alignment of
 * memstart_addr, due to the fact that the base of the vmemmap region
 * has a direct correspondence, and needs to appear sufficiently aligned
 * in the virtual address space.
 *
 * ARM64_MEMSTART_ALIGN is therefore the larger of the section size and the
 * block-mapping size selected above.
 */
#if ARM64_MEMSTART_SHIFT < SECTION_SIZE_BITS
#define ARM64_MEMSTART_ALIGN        (1UL << SECTION_SIZE_BITS)
#else
#define ARM64_MEMSTART_ALIGN        (1UL << ARM64_MEMSTART_SHIFT)
#endif

/*
 * Parse the crashkernel= request from the boot command line and, if one is
 * present and valid, hand it to the generic reservation code. Does nothing
 * when CONFIG_CRASH_RESERVE is disabled or parsing fails.
 */
static void __init arch_reserve_crashkernel(void)
{
        unsigned long long crash_base, crash_size;
        unsigned long long low_size = 0;
        bool high = false;

        if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
                return;

        if (parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                              &crash_size, &crash_base,
                              &low_size, &high))
                return;

        reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
}

/*
 * Return the exclusive upper physical bound of a zone whose highest
 * addressable byte is @zone_limit, clamped to the end of DRAM.
 */
static phys_addr_t __init max_zone_phys(phys_addr_t zone_limit)
{
        phys_addr_t last_dram_byte = memblock_end_of_DRAM() - 1;

        return min(zone_limit, last_dram_byte) + 1;
}

/*
 * Compute the upper PFN of each memory zone and pass the result to
 * free_area_init(). As a side effect this sets zone_dma_limit (with
 * CONFIG_ZONE_DMA) and arm64_dma_phys_limit.
 */
static void __init zone_sizes_init(void)
{
        unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
        phys_addr_t __maybe_unused acpi_zone_dma_limit;
        phys_addr_t __maybe_unused dt_zone_dma_limit;
        phys_addr_t __maybe_unused dma32_phys_limit =
                max_zone_phys(DMA_BIT_MASK(32));

#ifdef CONFIG_ZONE_DMA
        /* Take the more restrictive of the ACPI and DT derived limits */
        acpi_zone_dma_limit = acpi_iort_dma_get_max_cpu_address();
        dt_zone_dma_limit = of_dma_get_max_cpu_address(NULL);
        zone_dma_limit = min(dt_zone_dma_limit, acpi_zone_dma_limit);
        /*
         * Information we get from firmware (e.g. DT dma-ranges) describe DMA
         * bus constraints. Devices using DMA might have their own limitations.
         * Some of them rely on DMA zone in low 32-bit memory. Keep low RAM
         * DMA zone on platforms that have RAM there.
         */
        if (memblock_start_of_DRAM() < U32_MAX)
                zone_dma_limit = min(zone_dma_limit, U32_MAX);
        arm64_dma_phys_limit = max_zone_phys(zone_dma_limit);
        max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
        max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
        /* Without a ZONE_DMA limit, the DMA32 bound becomes the DMA limit */
        if (!arm64_dma_phys_limit)
                arm64_dma_phys_limit = dma32_phys_limit;
#endif
        /* Still unset: fall back to the top of the physical address space */
        if (!arm64_dma_phys_limit)
                arm64_dma_phys_limit = PHYS_MASK + 1;
        max_zone_pfns[ZONE_NORMAL] = max_pfn;

        free_area_init(max_zone_pfns);
}

/*
 * Report whether @pfn is memory according to memblock_is_map_memory().
 * Returns 0 for a PFN that does not survive the round trip through a
 * physical address.
 */
int pfn_is_map_memory(unsigned long pfn)
{
        phys_addr_t phys = PFN_PHYS(pfn);

        /* avoid false positives for bogus PFNs, see comment in pfn_valid() */
        if (PHYS_PFN(phys) == pfn)
                return memblock_is_map_memory(phys);

        return 0;
}
EXPORT_SYMBOL(pfn_is_map_memory);

/* Limit from the "mem=" parameter; PHYS_ADDR_MAX means no limit was set. */
static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;

/*
 * Handler for the "mem=" early parameter: limit the memory size that was
 * specified via FDT. The parsed value is rounded down to a page boundary.
 * Returns 0 on success and 1 when no argument was supplied.
 */
static int __init early_mem(char *p)
{
        if (p) {
                memory_limit = memparse(p, &p) & PAGE_MASK;
                pr_notice("Memory limited to %lldMB\n", memory_limit >> 20);
                return 0;
        }

        return 1;
}
early_param("mem", early_mem);

/*
 * Early memblock setup for arm64: choose memstart_addr (the physical base
 * of the linear map), trim memblock to what the linear map can cover, apply
 * the "mem=" limit, make sure the initrd stays reachable, optionally
 * randomize the linear region, and reserve the kernel image, the initrd and
 * the FDT reserved-memory regions.
 *
 * The steps below depend on each other's results; do not reorder them.
 */
void __init arm64_memblock_init(void)
{
        s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual);

        /*
         * Corner case: 52-bit VA capable systems running KVM in nVHE mode may
         * be limited in their ability to support a linear map that exceeds 51
         * bits of VA space, depending on the placement of the ID map. Given
         * that the placement of the ID map may be randomized, let's simply
         * limit the kernel's linear map to 51 bits as well if we detect this
         * configuration.
         */
        if (IS_ENABLED(CONFIG_KVM) && vabits_actual == 52 &&
            is_hyp_mode_available() && !is_kernel_in_hyp_mode()) {
                pr_info("Capping linear region to 51 bits for KVM in nVHE mode on LVA capable hardware.\n");
                linear_region_size = min_t(u64, linear_region_size, BIT(51));
        }

        /* Remove memory above our supported physical address size */
        memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);

        /*
         * Select a suitable value for the base of physical memory.
         */
        memstart_addr = round_down(memblock_start_of_DRAM(),
                                   ARM64_MEMSTART_ALIGN);

        if ((memblock_end_of_DRAM() - memstart_addr) > linear_region_size)
                pr_warn("Memory doesn't fit in the linear mapping, VA_BITS too small\n");

        /*
         * Remove the memory that we will not be able to cover with the
         * linear mapping. Take care not to clip the kernel which may be
         * high in memory.
         */
        memblock_remove(max_t(u64, memstart_addr + linear_region_size,
                        __pa_symbol(_end)), ULLONG_MAX);
        if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
                /* ensure that memstart_addr remains sufficiently aligned */
                memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
                                         ARM64_MEMSTART_ALIGN);
                /* Drop everything below the new base instead */
                memblock_remove(0, memstart_addr);
        }

        /*
         * If we are running with a 52-bit kernel VA config on a system that
         * does not support it, we have to place the available physical
         * memory in the 48-bit addressable part of the linear region, i.e.,
         * we have to move it upward. Since memstart_addr represents the
         * physical address of PAGE_OFFSET, we have to *subtract* from it.
         */
        if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
                memstart_addr -= _PAGE_OFFSET(vabits_actual) - _PAGE_OFFSET(52);

        /*
         * Apply the memory limit if it was set. Since the kernel may be loaded
         * high up in memory, add back the kernel region that must be accessible
         * via the linear mapping.
         */
        if (memory_limit != PHYS_ADDR_MAX) {
                memblock_mem_limit_remove_map(memory_limit);
                memblock_add(__pa_symbol(_text), (u64)(_end - _text));
        }

        if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
                /*
                 * Add back the memory we just removed if it results in the
                 * initrd to become inaccessible via the linear mapping.
                 * Otherwise, this is a no-op
                 */
                u64 base = phys_initrd_start & PAGE_MASK;
                u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;

                /*
                 * We can only add back the initrd memory if we don't end up
                 * with more memory than we can address via the linear mapping.
                 * It is up to the bootloader to position the kernel and the
                 * initrd reasonably close to each other (i.e., within 32 GB of
                 * each other) so that all granule/#levels combinations can
                 * always access both.
                 */
                if (WARN(base < memblock_start_of_DRAM() ||
                         base + size > memblock_start_of_DRAM() +
                                       linear_region_size,
                        "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
                        /* Setting the size to 0 disables the initrd below */
                        phys_initrd_size = 0;
                } else {
                        memblock_add(base, size);
                        memblock_clear_nomap(base, size);
                        memblock_reserve(base, size);
                }
        }

        if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
                extern u16 memstart_offset_seed;
                u64 mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
                int parange = cpuid_feature_extract_unsigned_field(
                                        mmfr0, ID_AA64MMFR0_EL1_PARANGE_SHIFT);
                /* Slack between the linear region and the CPU's PA range */
                s64 range = linear_region_size -
                            BIT(id_aa64mmfr0_parange_to_phys_shift(parange));

                /*
                 * If the size of the linear region exceeds, by a sufficient
                 * margin, the size of the region that the physical memory can
                 * span, randomize the linear region as well.
                 */
                if (memstart_offset_seed > 0 && range >= (s64)ARM64_MEMSTART_ALIGN) {
                        /*
                         * Scale the 16-bit seed to a number of
                         * ARM64_MEMSTART_ALIGN sized steps within the slack.
                         */
                        range /= ARM64_MEMSTART_ALIGN;
                        memstart_addr -= ARM64_MEMSTART_ALIGN *
                                         ((range * memstart_offset_seed) >> 16);
                }
        }

        /*
         * Register the kernel text, kernel data, initrd, and initial
         * pagetables with memblock.
         */
        memblock_reserve(__pa_symbol(_stext), _end - _stext);
        if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
                /* the generic initrd code expects virtual addresses */
                initrd_start = __phys_to_virt(phys_initrd_start);
                initrd_end = initrd_start + phys_initrd_size;
        }

        early_init_fdt_scan_reserved_mem();
}

/*
 * Establish the PFN limits from memblock, bring up NUMA and sparsemem,
 * size the zones, and perform the reservations (hugetlb CMA, KVM hyp, DMA
 * CMA, crashkernel) that have to happen in a fixed order relative to
 * those steps.
 */
void __init bootmem_init(void)
{
        unsigned long first_pfn = PFN_UP(memblock_start_of_DRAM());
        unsigned long end_pfn = PFN_DOWN(memblock_end_of_DRAM());

        early_memtest(first_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);

        max_pfn = max_low_pfn = end_pfn;
        min_low_pfn = first_pfn;

        arch_numa_init();

        /*
         * hugetlb_cma_reserve() spreads the requested CMA size across the
         * online nodes, so node_online_map must already be populated; that
         * is done by numa_init(), called from arch_numa_init() above.
         */
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
        arm64_hugetlb_cma_reserve();
#endif

        kvm_hyp_reserve();

        /*
         * sparse_init() allocates from memblock, so all fixed reservations
         * have to be in place before it runs.
         */
        sparse_init();
        zone_sizes_init();

        /*
         * arm64_dma_phys_limit is valid only after zone_sizes_init(), so
         * the CMA area is reserved here.
         */
        dma_contiguous_reserve(arm64_dma_phys_limit);

        /*
         * The crashkernel memory must be reserved before
         * request_standard_resources() runs, so reserve it now.
         */
        arch_reserve_crashkernel();

        memblock_dump_all();
}

/*
 * Decide whether and how to set up swiotlb, run the build-time sanity
 * checks on the address space layout, and enable memory overcommit on very
 * small machines with large pages.
 */
void __init arch_mm_preinit(void)
{
        /* Bouncing is needed when some RAM lies above the DMA limit */
        bool need_swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit);
        unsigned int swiotlb_flags = SWIOTLB_VERBOSE;

        if (is_realm_world()) {
                swiotlb_flags |= SWIOTLB_FORCE;
                need_swiotlb = true;
        }

        if (!need_swiotlb && IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC)) {
                /*
                 * ZONE_DMA needs no bouncing, so the buffer only serves
                 * kmalloc() bouncing: shrink it to 1MB per 1GB of RAM.
                 */
                unsigned long bounce_size =
                        DIV_ROUND_UP(memblock_phys_mem_size(), 1024);

                swiotlb_adjust_size(min(swiotlb_size_or_default(), bounce_size));
                need_swiotlb = true;
        }

        swiotlb_init(need_swiotlb, swiotlb_flags);
        swiotlb_update_mem_attributes();

        /*
         * Check boundaries twice: Some fundamental inconsistencies can be
         * detected at build time already.
         */
#ifdef CONFIG_COMPAT
        BUILD_BUG_ON(TASK_SIZE_32 > DEFAULT_MAP_WINDOW_64);
#endif

        /*
         * The configured number of page table levels must agree with the
         * value derived from the VA range and page size.
         */
        BUILD_BUG_ON(ARM64_HW_PGTABLE_LEVELS(CONFIG_ARM64_VA_BITS) !=
                     CONFIG_PGTABLE_LEVELS);

        if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
                extern int sysctl_overcommit_memory;
                /*
                 * A machine this small gets nowhere without overcommit,
                 * so switch it on by default.
                 */
                sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
        }
}

void free_initmem(void)
{
        void *lm_init_begin = lm_alias(__init_begin);
        void *lm_init_end = lm_alias(__init_end);

        WARN_ON(!IS_ALIGNED((unsigned long)lm_init_begin, PAGE_SIZE));
        WARN_ON(!IS_ALIGNED((unsigned long)lm_init_end, PAGE_SIZE));

        /* Delete __init region from memblock.reserved. */
        memblock_free(lm_init_begin, lm_init_end - lm_init_begin);

        free_reserved_area(lm_init_begin, lm_init_end,
                           POISON_FREE_INITMEM, "unused kernel");
        /*
         * Unmap the __init region but leave the VM area in place. This
         * prevents the region from being reused for kernel modules, which
         * is not supported by kallsyms.
         */
        vunmap_range((u64)__init_begin, (u64)__init_end);
}

/*
 * Print the memory limit in MB at emergency log level, or "none" when
 * memory_limit still has its PHYS_ADDR_MAX default.
 */
void dump_mem_limit(void)
{
        if (memory_limit == PHYS_ADDR_MAX) {
                pr_emerg("Memory Limit: none\n");
                return;
        }

        pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
}

#ifdef CONFIG_EXECMEM
/*
 * Base addresses of the module regions, assigned in module_init_limits().
 * NOTE(review): 0 presumably means "region not available" -- confirm
 * against the users of these variables.
 */
static u64 module_direct_base __ro_after_init = 0;
static u64 module_plt_base __ro_after_init = 0;

/*
 * Pick a random page-aligned base address for a window of 'size' bytes
 * that entirely contains the interval [start, end - 1].
 *
 * Returns 0 when the interval is at least as large as the window, i.e. no
 * such window exists.
 */
static u64 __init random_bounding_box(u64 size, u64 start, u64 end)
{
        u64 span = end - start;
        u64 slack_pages, chosen;

        if (span >= size)
                return 0;

        /* Number of whole pages the window may extend below 'start' */
        slack_pages = (size - span) / PAGE_SIZE;
        chosen = get_random_u32_inclusive(0, slack_pages);

        return start - chosen * PAGE_SIZE;
}

/*
 * Modules may directly reference data and text anywhere within the kernel
 * image and other modules. References using PREL32 relocations have a +/-2G
 * range, and so we need to ensure that the entire kernel image and all modules
 * fall within a 2G window such that these are always within range.
 *
 * Modules may directly branch to functions and code within the kernel text,
 * and to functions and code within other modules. These branches will use
 * CALL26/JUMP26 relocations with a +/-128M range. Without PLTs, we must ensure
 * that the entire kernel text and all module text falls within a 128M window
 * such that these are always within range. With PLTs, we can expand this to a
 * 2G window.
 *
 * We chose the 128M region to surround the entire kernel image (rather than
 * just the text) as using the same bounds for the 128M and 2G regions ensures
 * by construction that we never select a 128M region that is not a subset of
 * the 2G region. For very large and unusual kernel configurations this means
 * we may fall back to PLTs where they could have been avoided, but this keeps
 * the logic significantly simpler.
 */
static int __init module_init_limits(void)
{
        u64 kernel_end = (u64)_end;
        u64 kernel_start = (u64)_text;
        u64 kernel_size = kernel_end - kernel_start;

        /*
         * The default modules region is placed immediately below the kernel
         * image, and is large enough to use the full 2G relocation range.
         */
        BUILD_BUG_ON(KIMAGE_VADDR != MODULES_END);
        BUILD_BUG_ON(MODULES_VSIZE < SZ_2G);

        if (!kaslr_enabled()) {
                /*
                 * Without KASLR each window simply ends at the end of the
                 * kernel image, provided the image fits inside it.
                 */
                if (kernel_size < SZ_128M)
                        module_direct_base = kernel_end - SZ_128M;
                if (kernel_size < SZ_2G)
                        module_plt_base = kernel_end - SZ_2G;
        } else {
                u64 min = kernel_start;
                u64 max = kernel_end;

                if (IS_ENABLED(CONFIG_RANDOMIZE_MODULE_REGION_FULL)) {
                        pr_info("2G module region forced by RANDOMIZE_MODULE_REGION_FULL\n");
                } else {
                        module_direct_base = random_bounding_box(SZ_128M, min, max);
                        if (module_direct_base) {
                                /*
                                 * Make the 2G window enclose the whole 128M
                                 * window, not just the kernel image.
                                 */
                                min = module_direct_base;
                                max = module_direct_base + SZ_128M;
                        }
                }

                module_plt_base = random_bounding_box(SZ_2G, min, max);
        }

        /*
         * Terminate both messages with a newline, like the pr_info() above,
         * so each is emitted as a complete log line.
         */
        pr_info("%llu pages in range for non-PLT usage\n",
                module_direct_base ? (SZ_128M - kernel_size) / PAGE_SIZE : 0);
        pr_info("%llu pages in range for PLT usage\n",
                module_plt_base ? (SZ_2G - kernel_size) / PAGE_SIZE : 0);

        return 0;
}

/* Range table populated and returned by execmem_arch_setup(). */
static struct execmem_info execmem_info __ro_after_init;

/*
 * Compute the module windows, fill in the execmem range table and return it.
 *
 * EXECMEM_DEFAULT uses the 128M direct-branch window when there is one, with
 * the 2G window (if any) as its fallback; otherwise the 2G window is used on
 * its own. EXECMEM_KPROBES and EXECMEM_BPF span the whole vmalloc range.
 */
struct execmem_info __init *execmem_arch_setup(void)
{
        unsigned long primary_lo = 0, primary_hi = 0;
        unsigned long spill_lo = 0, spill_hi = 0;

        module_init_limits();

        if (!module_direct_base) {
                /* No direct-branch window: the 2G window, if any, is primary. */
                if (module_plt_base) {
                        primary_lo = module_plt_base;
                        primary_hi = module_plt_base + SZ_2G;
                }
        } else {
                /*
                 * Prefer allocations within direct branch range of the
                 * kernel so that no PLTs are necessary.
                 */
                primary_lo = module_direct_base;
                primary_hi = module_direct_base + SZ_128M;

                if (module_plt_base) {
                        spill_lo = module_plt_base;
                        spill_hi = module_plt_base + SZ_2G;
                }
        }

        execmem_info = (struct execmem_info){
                .ranges = {
                        [EXECMEM_DEFAULT] = {
                                .start = primary_lo,
                                .end = primary_hi,
                                .pgprot = PAGE_KERNEL,
                                .alignment = 1,
                                .fallback_start = spill_lo,
                                .fallback_end = spill_hi,
                        },
                        [EXECMEM_KPROBES] = {
                                .start = VMALLOC_START,
                                .end = VMALLOC_END,
                                .pgprot = PAGE_KERNEL_ROX,
                                .alignment = 1,
                        },
                        [EXECMEM_BPF] = {
                                .start = VMALLOC_START,
                                .end = VMALLOC_END,
                                .pgprot = PAGE_KERNEL,
                                .alignment = 1,
                        },
                },
        };

        return &execmem_info;
}
#endif /* CONFIG_EXECMEM */

































































































































































































































  265 





  265 





























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/of_address.h>
#include <linux/of_iommu.h>
#include <linux/of_reserved_mem.h>
#include <linux/dma-direct.h> /* for bus_dma_region */
#include <linux/dma-map-ops.h>
#include <linux/init.h>
#include <linux/mod_devicetable.h>
#include <linux/slab.h>
#include <linux/platform_device.h>

#include <asm/errno.h>
#include "of_private.h"

/**
 * of_match_device - Tell if a struct device matches an of_device_id list
 * @matches: array of of device match structures to search in
 * @dev: the of device structure to match against
 *
 * Used by a driver to check whether a platform_device present in the
 * system is in its list of supported devices.
 *
 * Return: the result of of_match_node() for @dev's node, or NULL when
 * @matches is NULL, @dev has no of_node, or the node is marked as reused.
 */
const struct of_device_id *of_match_device(const struct of_device_id *matches,
                                           const struct device *dev)
{
        if (!matches)
                return NULL;
        if (!dev->of_node)
                return NULL;
        if (dev->of_node_reused)
                return NULL;

        return of_match_node(matches, dev->of_node);
}
EXPORT_SYMBOL(of_match_device);

static void
of_dma_set_restricted_buffer(struct device *dev, struct device_node *np)
{
        struct device_node *node, *of_node = dev->of_node;
        int count, i;

        if (!IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL))
                return;

        count = of_property_count_elems_of_size(of_node, "memory-region",
                                                sizeof(u32));
        /*
         * If dev->of_node doesn't exist or doesn't contain memory-region, try
         * the OF node having DMA configuration.
         */
        if (count <= 0) {
                of_node = np;
                count = of_property_count_elems_of_size(
                        of_node, "memory-region", sizeof(u32));
        }

        for (i = 0; i < count; i++) {
                node = of_parse_phandle(of_node, "memory-region", i);
                /*
                 * There might be multiple memory regions, but only one
                 * restricted-dma-pool region is allowed.
                 */
                if (of_device_is_compatible(node, "restricted-dma-pool") &&
                    of_device_is_available(node)) {
                        of_node_put(node);
                        break;
                }
                of_node_put(node);
        }

        /*
         * Attempt to initialize a restricted-dma-pool region if one was found.
         * Note that count can hold a negative error code.
         */
        if (i < count && of_reserved_mem_device_init_by_idx(dev, of_node, i))
                dev_warn(dev, "failed to initialise \"restricted-dma-pool\" memory node\n");
}

/**
 * of_dma_configure_id - Setup DMA configuration
 * @dev:        Device to apply DMA configuration
 * @np:         Pointer to OF node having DMA configuration
 * @force_dma:  Whether device is to be set up by of_dma_configure() even if
 *              DMA capability is not explicitly described by firmware.
 * @id:         Optional const pointer value input id
 *
 * Try to get the device's DMA configuration from DT and update it
 * accordingly.
 *
 * If platform code needs to use its own special DMA configuration, it
 * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events
 * to fix up DMA configuration.
 *
 * Return: 0 on success, which includes of_dma_get_range() reporting -ENODEV
 * while @force_dma is false; any other negative error from
 * of_dma_get_range() while @force_dma is false; or -EPROBE_DEFER when
 * of_iommu_configure() returns it.
 */
int of_dma_configure_id(struct device *dev, struct device_node *np,
                        bool force_dma, const u32 *id)
{
        const struct bus_dma_region *map = NULL;
        struct device_node *bus_np;
        u64 mask, end = 0;
        bool coherent, set_map = false;
        int ret;

        /* An existing range map is kept as-is; skip the dma-ranges lookup. */
        if (dev->dma_range_map) {
                dev_dbg(dev, "dma_range_map already set\n");
                goto skip_map;
        }

        if (np == dev->of_node)
                bus_np = __of_get_dma_parent(np);
        else
                bus_np = of_node_get(np);

        ret = of_dma_get_range(bus_np, &map);
        /* bus_np is not used past this point, so drop it right away. */
        of_node_put(bus_np);
        if (ret < 0) {
                /*
                 * For legacy reasons, we have to assume some devices need
                 * DMA configuration regardless of whether "dma-ranges" is
                 * correctly specified or not.
                 */
                if (!force_dma)
                        return ret == -ENODEV ? 0 : ret;
        } else {
                /* Determine the overall bounds of all DMA regions */
                end = dma_range_map_max(map);
                set_map = true;
        }
skip_map:
        /*
         * If @dev is expected to be DMA-capable then the bus code that created
         * it should have initialised its dma_mask pointer by this point. For
         * now, we'll continue the legacy behaviour of coercing it to the
         * coherent mask if not, but we'll no longer do so quietly.
         */
        if (!dev->dma_mask) {
                dev_warn(dev, "DMA mask not set\n");
                dev->dma_mask = &dev->coherent_dma_mask;
        }

        /*
         * Without an upper bound from dma-ranges, use the coherent mask if it
         * is non-zero and a 32-bit limit otherwise.
         */
        if (!end && dev->coherent_dma_mask)
                end = dev->coherent_dma_mask;
        else if (!end)
                end = (1ULL << 32) - 1;

        /*
         * Limit coherent and dma mask based on size and default mask
         * set by the driver.
         */
        mask = DMA_BIT_MASK(ilog2(end) + 1);
        dev->coherent_dma_mask &= mask;
        *dev->dma_mask &= mask;
        /* ...but only set bus limit and range map if we found valid dma-ranges earlier */
        if (set_map) {
                dev->bus_dma_limit = end;
                dev->dma_range_map = map;
        }

        coherent = of_dma_is_coherent(np);
        dev_dbg(dev, "device is%sdma coherent\n",
                coherent ? " " : " not ");

        ret = of_iommu_configure(dev, np, id);
        if (ret == -EPROBE_DEFER) {
                /* Don't touch range map if it wasn't set from a valid dma-ranges */
                if (set_map)
                        dev->dma_range_map = NULL;
                /* map is NULL unless it was obtained above, so this is safe. */
                kfree(map);
                return -EPROBE_DEFER;
        }
        /* Take all other IOMMU errors to mean we'll just carry on without it */
        dev_dbg(dev, "device is%sbehind an iommu\n",
                !ret ? " " : " not ");

        arch_setup_dma_ops(dev, coherent);

        /* A restricted DMA pool is only looked for when no IOMMU was set up. */
        if (ret)
                of_dma_set_restricted_buffer(dev, np);

        return 0;
}
EXPORT_SYMBOL_GPL(of_dma_configure_id);

/**
 * of_device_get_match_data - Fetch the match data for a device
 * @dev: device whose driver's of_match_table is searched
 *
 * Return: the ->data pointer of the of_device_id entry that
 * of_match_device() finds for @dev, or NULL when nothing matches.
 */
const void *of_device_get_match_data(const struct device *dev)
{
        const struct of_device_id *entry =
                of_match_device(dev->driver->of_match_table, dev);

        return entry ? entry->data : NULL;
}
EXPORT_SYMBOL(of_device_get_match_data);

/**
 * of_device_modalias - Fill buffer with newline terminated modalias string
 * @dev:        Calling device
 * @str:        Modalias string
 * @len:        Size of @str
 *
 * Return: length of the string including the trailing newline, -ENODEV when
 * @dev has no usable OF node, -ENOMEM when the modalias does not fit, or the
 * negative value returned by of_modalias().
 */
ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len)
{
        /* Keep two bytes back for the newline and the terminating NUL. */
        const ssize_t room = len - 2;
        ssize_t written;

        if (!dev || !dev->of_node || dev->of_node_reused)
                return -ENODEV;

        written = of_modalias(dev->of_node, str, room);
        if (written < 0)
                return written;
        if (written > room)
                return -ENOMEM;

        str[written] = '\n';
        str[written + 1] = 0;
        return written + 1;
}
EXPORT_SYMBOL_GPL(of_device_modalias);

/**
 * of_device_uevent - Display OF related uevent information
 * @dev:        Device to display the uevent information for
 * @env:        Kernel object's userspace event reference to fill up
 *
 * Adds OF_NAME, OF_FULLNAME, optionally OF_TYPE, one OF_COMPATIBLE_<n> per
 * "compatible" string plus OF_COMPATIBLE_N, and one OF_ALIAS_<n> per alias
 * that refers to the device's node. Does nothing without an OF node.
 */
void of_device_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
        const char *compat_str, *dev_type;
        struct alias_prop *alias;
        struct property *prop;
        int idx;

        if (!dev || !dev->of_node)
                return;

        add_uevent_var(env, "OF_NAME=%pOFn", dev->of_node);
        add_uevent_var(env, "OF_FULLNAME=%pOF", dev->of_node);

        dev_type = of_node_get_device_type(dev->of_node);
        if (dev_type)
                add_uevent_var(env, "OF_TYPE=%s", dev_type);

        /*
         * The compatible field can contain pretty much anything, so joining
         * the entries with commas is not really legal. Emit one numbered
         * environment variable per entry instead.
         */
        idx = 0;
        of_property_for_each_string(dev->of_node, "compatible", prop, compat_str) {
                add_uevent_var(env, "OF_COMPATIBLE_%d=%s", idx++, compat_str);
        }
        add_uevent_var(env, "OF_COMPATIBLE_N=%d", idx);

        /* The alias list is walked with of_mutex held. */
        idx = 0;
        mutex_lock(&of_mutex);
        list_for_each_entry(alias, &aliases_lookup, link) {
                if (dev->of_node != alias->np)
                        continue;
                add_uevent_var(env, "OF_ALIAS_%d=%s", idx++, alias->alias);
        }
        mutex_unlock(&of_mutex);
}
EXPORT_SYMBOL_GPL(of_device_uevent);

int of_device_uevent_modalias(const struct device *dev, struct kobj_uevent_env *env)
{
        int sl;

        if ((!dev) || (!dev->of_node) || dev->of_node_reused)
                return -ENODEV;

        /* Devicetree modalias is tricky, we add it in 2 steps */
        if (add_uevent_var(env, "MODALIAS="))
                return -ENOMEM;

        sl = of_modalias(dev->of_node, &env->buf[env->buflen-1],
                         sizeof(env->buf) - env->buflen);
        if (sl < 0)
                return sl;
        if (sl >= (sizeof(env->buf) - env->buflen))
                return -ENOMEM;
        env->buflen += sl;

        return 0;
}
EXPORT_SYMBOL_GPL(of_device_uevent_modalias);

/**
 * of_device_make_bus_id - Use the device node data to assign a unique name
 * @dev: pointer to device structure that is linked to a device tree node
 *
 * Walks from @dev's node towards the root. The first node whose "reg"
 * property translates to a valid address supplies the leading name component
 * and ends the walk; every node visited before that prepends its base name.
 */
void of_device_make_bus_id(struct device *dev)
{
        struct device_node *np;

        for (np = dev->of_node; np->parent; np = np->parent) {
                const __be32 *reg = of_get_property(np, "reg", NULL);
                u64 addr = reg ? of_translate_address(np, reg) : OF_BAD_ADDR;
                /*
                 * Name built so far, if any. It is passed as the last format
                 * argument and is only consumed by the "...:%s" variants,
                 * which are chosen when it is non-NULL.
                 */
                const char *suffix = dev_name(dev);
                u32 mask;

                if (addr == OF_BAD_ADDR) {
                        /* No usable address here: prepend this node's name. */
                        dev_set_name(dev, suffix ? "%s:%s" : "%s",
                                     kbasename(np->full_name), suffix);
                        continue;
                }

                /*
                 * A translated address is as much uniqueness as we need:
                 * make it the first component and stop.
                 */
                if (!of_property_read_u32(np, "mask", &mask))
                        dev_set_name(dev, suffix ? "%llx.%x.%pOFn:%s" : "%llx.%x.%pOFn",
                                     addr, ffs(mask) - 1, np, suffix);
                else
                        dev_set_name(dev, suffix ? "%llx.%pOFn:%s" : "%llx.%pOFn",
                                     addr, np, suffix);
                return;
        }
}
EXPORT_SYMBOL_GPL(of_device_make_bus_id);

























































   66 






































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_ERR_H
#define _LINUX_ERR_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/errno.h>

/*
 * Kernel pointers have redundant information, so we can use a
 * scheme where we can return either an error code or a normal
 * pointer with the same return value.
 *
 * This should be a per-architecture thing, to allow different
 * error and pointer decisions.
 */
#define MAX_ERRNO        4095

#ifndef __ASSEMBLY__

/**
 * IS_ERR_VALUE - Detect an error pointer.
 * @x: The pointer to check.
 *
 * Like IS_ERR(), but does not generate a compiler warning if result is unused.
 */
#define IS_ERR_VALUE(x) unlikely((unsigned long)(void *)(x) >= (unsigned long)-MAX_ERRNO)

/**
 * ERR_PTR - Create an error pointer.
 * @error: A negative error code.
 *
 * Encodes @error into a pointer value. Callers should treat the result as
 * opaque and must not rely on how the error is encoded.
 *
 * Return: A pointer with @error encoded within its value.
 */
static inline void * __must_check ERR_PTR(long error)
{
        void *encoded = (void *) error;

        return encoded;
}

/* Return the pointer in the percpu address space. */
#define ERR_PTR_PCPU(error) ((void __percpu *)(unsigned long)ERR_PTR(error))

/*
 * Cast an error pointer to __iomem. The expansion is fully parenthesised so
 * that it behaves as a single expression wherever the macro is used.
 */
#define IOMEM_ERR_PTR(error) ((__force void __iomem *)ERR_PTR(error))

/**
 * PTR_ERR - Extract the error code from an error pointer.
 * @ptr: An error pointer.
 *
 * Return: The error code within @ptr.
 */
static inline long __must_check PTR_ERR(__force const void *ptr)
{
        long code = (long) ptr;

        return code;
}

/* Read an error pointer from the percpu address space. */
#define PTR_ERR_PCPU(ptr) (PTR_ERR((const void *)(__force const unsigned long)(ptr)))

/**
 * IS_ERR - Detect an error pointer.
 * @ptr: The pointer to check.
 *
 * Return: true if @ptr is an error pointer, false otherwise.
 */
static inline bool __must_check IS_ERR(__force const void *ptr)
{
        const unsigned long value = (unsigned long)ptr;

        return IS_ERR_VALUE(value);
}

/* Detect an error pointer in the percpu address space. */
#define IS_ERR_PCPU(ptr) (IS_ERR((const void *)(__force const unsigned long)(ptr)))

/**
 * IS_ERR_OR_NULL - Detect an error pointer or a null pointer.
 * @ptr: The pointer to check.
 *
 * Like IS_ERR(), but also returns true for a null pointer.
 */
static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr)
{
        if (unlikely(!ptr))
                return true;

        return IS_ERR_VALUE((unsigned long)ptr);
}

/**
 * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
 * @ptr: The pointer to cast.
 *
 * Explicitly cast an error-valued pointer to another pointer type, making it
 * obvious at the call site that this is what is happening.
 */
static inline void * __must_check ERR_CAST(__force const void *ptr)
{
        /* Drop the const qualifier; the value itself is unchanged. */
        void *untyped = (void *) ptr;

        return untyped;
}

/**
 * PTR_ERR_OR_ZERO - Extract the error code from a pointer if it has one.
 * @ptr: A potential error pointer.
 *
 * Convenience helper for functions that return an error code and need to
 * propagate an error received as an error pointer.
 * For example, ``return PTR_ERR_OR_ZERO(ptr);`` replaces:
 *
 * .. code-block:: c
 *
 *        if (IS_ERR(ptr))
 *                return PTR_ERR(ptr);
 *        else
 *                return 0;
 *
 * Return: The error code within @ptr if it is an error pointer; 0 otherwise.
 */
static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr)
{
        return IS_ERR(ptr) ? PTR_ERR(ptr) : 0;
}

#endif

#endif /* _LINUX_ERR_H */

























































































































































































































































































































































































































































   43 


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAPOPS_H
#define _LINUX_SWAPOPS_H

#include <linux/radix-tree.h>
#include <linux/bug.h>
#include <linux/mm_types.h>

#ifdef CONFIG_MMU

#ifdef CONFIG_SWAP
#include <linux/swapfile.h>
#endif        /* CONFIG_SWAP */

/*
 * swapcache pages are stored in the swapper_space radix tree.  We want to
 * get good packing density in that tree, so the index should be dense in
 * the low-order bits.
 *
 * We arrange the `type' and `offset' fields so that `type' is at the six
 * high-order bits of the swp_entry_t and `offset' is right-aligned in the
 * remaining bits.  Although `type' itself needs only five bits, we allow for
 * shmem/tmpfs to shift it all up a further one bit: see swp_to_radix_entry().
 *
 * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
 */
#define SWP_TYPE_SHIFT        (BITS_PER_XA_VALUE - MAX_SWAPFILES_SHIFT)
#define SWP_OFFSET_MASK        ((1UL << SWP_TYPE_SHIFT) - 1)

/*
 * Definitions only for PFN swap entries (see is_pfn_swap_entry()).  To
 * store PFN, we only need SWP_PFN_BITS bits.  Each of the pfn swap entries
 * can use the extra bits to store other information besides PFN.
 */
#ifdef MAX_PHYSMEM_BITS
#define SWP_PFN_BITS                (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#else  /* MAX_PHYSMEM_BITS */
#define SWP_PFN_BITS                min_t(int, \
                                      sizeof(phys_addr_t) * 8 - PAGE_SHIFT, \
                                      SWP_TYPE_SHIFT)
#endif        /* MAX_PHYSMEM_BITS */
#define SWP_PFN_MASK                (BIT(SWP_PFN_BITS) - 1)

/**
 * Migration swap entry specific bitfield definitions.  Layout:
 *
 *   |----------+--------------------|
 *   | swp_type | swp_offset         |
 *   |----------+--------+-+-+-------|
 *   |          | resv   |D|A|  PFN  |
 *   |----------+--------+-+-+-------|
 *
 * @SWP_MIG_YOUNG_BIT: Whether the page used to have young bit set (bit A)
 * @SWP_MIG_DIRTY_BIT: Whether the page used to have dirty bit set (bit D)
 *
 * Note: A/D bits will be stored in migration entries iff there're enough
 * free bits in arch specific swp offset.  By default we'll ignore A/D bits
 * when migrating a page.  Please refer to migration_entry_supports_ad()
 * for more information.  If there're more bits besides PFN and A/D bits,
 * they should be reserved and always be zeros.
 */
#define SWP_MIG_YOUNG_BIT                (SWP_PFN_BITS)
#define SWP_MIG_DIRTY_BIT                (SWP_PFN_BITS + 1)
#define SWP_MIG_TOTAL_BITS                (SWP_PFN_BITS + 2)

#define SWP_MIG_YOUNG                        BIT(SWP_MIG_YOUNG_BIT)
#define SWP_MIG_DIRTY                        BIT(SWP_MIG_DIRTY_BIT)

static inline bool is_pfn_swap_entry(swp_entry_t entry);

/*
 * Clear the exclusive, soft-dirty and uffd-wp swap pte flags where set, so
 * that only the swp_entry_t related information is kept.
 */
static inline pte_t pte_swp_clear_flags(pte_t pte)
{
        pte_t bare = pte;

        if (pte_swp_exclusive(bare))
                bare = pte_swp_clear_exclusive(bare);

        if (pte_swp_soft_dirty(bare))
                bare = pte_swp_clear_soft_dirty(bare);

        if (pte_swp_uffd_wp(bare))
                bare = pte_swp_clear_uffd_wp(bare);

        return bare;
}

/*
 * Pack a type and an offset into a swp_entry_t using the arch-independent
 * layout: type above SWP_TYPE_SHIFT, offset masked with SWP_OFFSET_MASK.
 */
static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
{
        const unsigned long type_bits = type << SWP_TYPE_SHIFT;
        const pgoff_t offset_bits = offset & SWP_OFFSET_MASK;
        swp_entry_t packed;

        packed.val = type_bits | offset_bits;
        return packed;
}

/*
 * Return the `type' field of a swp_entry_t that is in arch-independent
 * format.
 */
static inline unsigned swp_type(swp_entry_t entry)
{
        unsigned type = entry.val >> SWP_TYPE_SHIFT;

        return type;
}

/*
 * Return the `offset' field of a swp_entry_t that is in arch-independent
 * format.
 */
static inline pgoff_t swp_offset(swp_entry_t entry)
{
        pgoff_t offset = entry.val & SWP_OFFSET_MASK;

        return offset;
}

/*
 * Return the PFN stored in a pfn swap entry.  This must only be called on a
 * pfn swap entry; please refer to is_pfn_swap_entry() for the definition of
 * one.
 */
static inline unsigned long swp_offset_pfn(swp_entry_t entry)
{
        unsigned long pfn;

        VM_BUG_ON(!is_pfn_swap_entry(entry));

        pfn = swp_offset(entry) & SWP_PFN_MASK;
        return pfn;
}

/* A pte is a swap pte when it is neither none nor present */
static inline int is_swap_pte(pte_t pte)
{
        return !(pte_none(pte) || pte_present(pte));
}

/*
 * Turn the arch-dependent pte encoding of a swap entry into an
 * arch-independent swp_entry_t.  Swap pte flags are cleared first.
 */
static inline swp_entry_t pte_to_swp_entry(pte_t pte)
{
        swp_entry_t arch_entry;
        unsigned long type;
        pgoff_t offset;

        pte = pte_swp_clear_flags(pte);
        arch_entry = __pte_to_swp_entry(pte);

        type = __swp_type(arch_entry);
        offset = __swp_offset(arch_entry);
        return swp_entry(type, offset);
}

/*
 * Turn an arch-independent swp_entry_t into the arch-dependent pte
 * encoding.
 */
static inline pte_t swp_entry_to_pte(swp_entry_t entry)
{
        unsigned type = swp_type(entry);
        pgoff_t offset = swp_offset(entry);
        swp_entry_t arch_entry = __swp_entry(type, offset);

        return __swp_entry_to_pte(arch_entry);
}

/* Recover a swp_entry_t from the value stored in an XArray value entry */
static inline swp_entry_t radix_to_swp_entry(void *arg)
{
        swp_entry_t swp;

        swp.val = xa_to_value(arg);

        return swp;
}

/* Wrap the raw value of a swp_entry_t as an XArray value entry */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
        void *xa_entry = xa_mk_value(entry.val);

        return xa_entry;
}

#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
/* Build a swap entry of type SWP_DEVICE_READ carrying @offset */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
        swp_entry_t entry = swp_entry(SWP_DEVICE_READ, offset);

        return entry;
}

/* Build a swap entry of type SWP_DEVICE_WRITE carrying @offset */
static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
        swp_entry_t entry = swp_entry(SWP_DEVICE_WRITE, offset);

        return entry;
}

/* True for both the readable and the writable device private entry types */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        switch ((int)swp_type(entry)) {
        case SWP_DEVICE_READ:
        case SWP_DEVICE_WRITE:
                return true;
        default:
                return false;
        }
}

/* True when the entry's type is SWP_DEVICE_WRITE (hinted as unlikely) */
static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return unlikely(type == SWP_DEVICE_WRITE);
}

/* Build a swap entry of type SWP_DEVICE_EXCLUSIVE carrying @offset */
static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
        swp_entry_t entry = swp_entry(SWP_DEVICE_EXCLUSIVE, offset);

        return entry;
}

/* True when the entry's type is SWP_DEVICE_EXCLUSIVE */
static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
        const unsigned type = swp_type(entry);

        return type == SWP_DEVICE_EXCLUSIVE;
}

#else /* CONFIG_DEVICE_PRIVATE */
/* !CONFIG_DEVICE_PRIVATE stub: @offset is ignored, swp_entry(0, 0) is returned */
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
        swp_entry_t none = swp_entry(0, 0);

        return none;
}

/* !CONFIG_DEVICE_PRIVATE stub: @offset is ignored, swp_entry(0, 0) is returned */
static inline swp_entry_t make_writable_device_private_entry(pgoff_t offset)
{
        swp_entry_t none = swp_entry(0, 0);

        return none;
}

/* !CONFIG_DEVICE_PRIVATE stub: no entry is ever a device private entry */
static inline bool is_device_private_entry(swp_entry_t entry)
{
        const bool is_private = false;

        return is_private;
}

/* !CONFIG_DEVICE_PRIVATE stub: no entry is ever a writable device private entry */
static inline bool is_writable_device_private_entry(swp_entry_t entry)
{
        const bool is_writable = false;

        return is_writable;
}

/* !CONFIG_DEVICE_PRIVATE stub: @offset is ignored, swp_entry(0, 0) is returned */
static inline swp_entry_t make_device_exclusive_entry(pgoff_t offset)
{
        swp_entry_t none = swp_entry(0, 0);

        return none;
}

/* !CONFIG_DEVICE_PRIVATE stub: no entry is ever a device exclusive entry */
static inline bool is_device_exclusive_entry(swp_entry_t entry)
{
        const bool is_exclusive = false;

        return is_exclusive;
}

#endif /* CONFIG_DEVICE_PRIVATE */

#ifdef CONFIG_MIGRATION
/*
 * Non-zero when @entry's type is one of the three migration types
 * (SWP_MIGRATION_READ, SWP_MIGRATION_READ_EXCLUSIVE, SWP_MIGRATION_WRITE).
 * The test is marked unlikely() as a branch hint.
 */
static inline int is_migration_entry(swp_entry_t entry)
{
        int type = swp_type(entry);

        return unlikely(type == SWP_MIGRATION_READ ||
                        type == SWP_MIGRATION_READ_EXCLUSIVE ||
                        type == SWP_MIGRATION_WRITE);
}

/* Non-zero when @entry's type is SWP_MIGRATION_WRITE. */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE);
}

/*
 * Non-zero when @entry's type is exactly SWP_MIGRATION_READ; the
 * SWP_MIGRATION_READ_EXCLUSIVE type does not match.
 */
static inline int is_readable_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_READ);
}

/* Non-zero when @entry's type is SWP_MIGRATION_READ_EXCLUSIVE. */
static inline int is_readable_exclusive_migration_entry(swp_entry_t entry)
{
        return unlikely(swp_type(entry) == SWP_MIGRATION_READ_EXCLUSIVE);
}

/* Build a swap entry of type SWP_MIGRATION_READ carrying @offset. */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
        return swp_entry(SWP_MIGRATION_READ, offset);
}

/* Build a swap entry of type SWP_MIGRATION_READ_EXCLUSIVE carrying @offset. */
static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
{
        return swp_entry(SWP_MIGRATION_READ_EXCLUSIVE, offset);
}

/* Build a swap entry of type SWP_MIGRATION_WRITE carrying @offset. */
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
        return swp_entry(SWP_MIGRATION_WRITE, offset);
}

/*
 * Returns whether the host has a large enough swap offset field to support
 * carrying over pgtable A/D bits for page migrations.  The result is
 * pretty much arch specific.
 *
 * With CONFIG_SWAP the answer is the value of swap_migration_ad_supported;
 * without CONFIG_SWAP it is always false.
 */
static inline bool migration_entry_supports_ad(void)
{
#ifdef CONFIG_SWAP
        return swap_migration_ad_supported;
#else  /* CONFIG_SWAP */
        return false;
#endif        /* CONFIG_SWAP */
}

/*
 * Return @entry with SWP_MIG_YOUNG OR-ed into its offset, or @entry
 * unchanged when migration entries cannot carry A/D bits.
 */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return entry;

        return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_YOUNG);
}

/*
 * True when SWP_MIG_YOUNG is set in @entry's offset.  Always false when
 * migration entries cannot carry A/D bits.
 */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
        /* Keep the old behavior of aging page after migration */
        if (!migration_entry_supports_ad())
                return false;

        return swp_offset(entry) & SWP_MIG_YOUNG;
}

/*
 * Return @entry with SWP_MIG_DIRTY OR-ed into its offset, or @entry
 * unchanged when migration entries cannot carry A/D bits.
 */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
        if (!migration_entry_supports_ad())
                return entry;

        return swp_entry(swp_type(entry), swp_offset(entry) | SWP_MIG_DIRTY);
}

/*
 * True when SWP_MIG_DIRTY is set in @entry's offset.  Always false when
 * migration entries cannot carry A/D bits.
 */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
        /* Keep the old behavior of clean page after migration */
        if (!migration_entry_supports_ad())
                return false;

        return swp_offset(entry) & SWP_MIG_DIRTY;
}

/*
 * Prototypes only (CONFIG_MIGRATION builds); the definitions are not part of
 * this header.
 */
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address);
extern void migration_entry_wait_huge(struct vm_area_struct *vma, unsigned long addr, pte_t *pte);
#else  /* CONFIG_MIGRATION */
/*
 * Stubs used when CONFIG_MIGRATION is not set: the make_*_migration_entry()
 * helpers return swp_entry(0, 0), the is_*() helpers report "no", the wait
 * helpers do nothing and the young/dirty helpers leave the entry untouched.
 */

/* Stub: ignores @offset, returns swp_entry(0, 0). */
static inline swp_entry_t make_readable_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* Stub: ignores @offset, returns swp_entry(0, 0). */
static inline swp_entry_t make_readable_exclusive_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* Stub: ignores @offset, returns swp_entry(0, 0). */
static inline swp_entry_t make_writable_migration_entry(pgoff_t offset)
{
        return swp_entry(0, 0);
}

/* Stub: no entry is a migration entry in this configuration. */
static inline int is_migration_entry(swp_entry_t swp)
{
        return 0;
}

/* Stubs: empty bodies, nothing to wait for. */
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
                                        unsigned long address) { }
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,
                                             unsigned long addr, pte_t *pte) { }
/* Stub: always 0. */
static inline int is_writable_migration_entry(swp_entry_t entry)
{
        return 0;
}
/* Stub: always 0. */
static inline int is_readable_migration_entry(swp_entry_t entry)
{
        return 0;
}

/* Stub: returns @entry unchanged. */
static inline swp_entry_t make_migration_entry_young(swp_entry_t entry)
{
        return entry;
}

/* Stub: always false. */
static inline bool is_migration_entry_young(swp_entry_t entry)
{
        return false;
}

/* Stub: returns @entry unchanged. */
static inline swp_entry_t make_migration_entry_dirty(swp_entry_t entry)
{
        return entry;
}

/* Stub: always false. */
static inline bool is_migration_entry_dirty(swp_entry_t entry)
{
        return false;
}
#endif        /* CONFIG_MIGRATION */

#ifdef CONFIG_MEMORY_FAILURE

/*
 * Support for hardware poisoned pages
 */
/*
 * Build a swap entry of type SWP_HWPOISON whose offset is @page's pfn.
 * @page must be locked; BUG_ON() fires otherwise.
 */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        BUG_ON(!PageLocked(page));
        return swp_entry(SWP_HWPOISON, page_to_pfn(page));
}

/* Non-zero when @entry's type is SWP_HWPOISON. */
static inline int is_hwpoison_entry(swp_entry_t entry)
{
        return swp_type(entry) == SWP_HWPOISON;
}

#else

/* Stub for !CONFIG_MEMORY_FAILURE: ignores @page, returns swp_entry(0, 0). */
static inline swp_entry_t make_hwpoison_entry(struct page *page)
{
        return swp_entry(0, 0);
}

/* Stub for !CONFIG_MEMORY_FAILURE: always 0. */
static inline int is_hwpoison_entry(swp_entry_t swp)
{
        return 0;
}
#endif

typedef unsigned long pte_marker;

#define  PTE_MARKER_UFFD_WP                        BIT(0)
/*
 * "Poisoned" here is meant in the very general sense of "future accesses are
 * invalid", instead of referring very specifically to hardware memory errors.
 * This marker is meant to represent any of various different causes of this.
 *
 * Note that, when encountered by the faulting logic, PTEs with this marker will
 * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error
 * logic.
 */
#define  PTE_MARKER_POISONED                        BIT(1)
/*
 * Indicates that, on fault, this PTE will cause a SIGSEGV signal to be
 * sent. This means guard markers behave in effect as if the region were mapped
 * PROT_NONE, rather than if they were a memory hole or equivalent.
 */
#define  PTE_MARKER_GUARD                        BIT(2)
#define  PTE_MARKER_MASK                        (BIT(3) - 1)

/* Build a swap entry of type SWP_PTE_MARKER with @marker as its offset. */
static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
        return swp_entry(SWP_PTE_MARKER, marker);
}

/* True when @entry's type is SWP_PTE_MARKER. */
static inline bool is_pte_marker_entry(swp_entry_t entry)
{
        return swp_type(entry) == SWP_PTE_MARKER;
}

/*
 * Return the marker bits of @entry: its offset masked with PTE_MARKER_MASK.
 * The entry type is not checked here.
 */
static inline pte_marker pte_marker_get(swp_entry_t entry)
{
        return swp_offset(entry) & PTE_MARKER_MASK;
}

/*
 * True when @pte is a swap pte and the swap entry it decodes to is a pte
 * marker entry.  The pte is only decoded after is_swap_pte() succeeded.
 */
static inline bool is_pte_marker(pte_t pte)
{
        return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
}

/* Build the pte that encodes a pte marker entry carrying @marker. */
static inline pte_t make_pte_marker(pte_marker marker)
{
        return swp_entry_to_pte(make_pte_marker_entry(marker));
}

/* Build a pte marker entry whose only marker bit is PTE_MARKER_POISONED. */
static inline swp_entry_t make_poisoned_swp_entry(void)
{
        return make_pte_marker_entry(PTE_MARKER_POISONED);
}

/*
 * Returns 1 when @entry is a pte marker entry whose marker bits include
 * PTE_MARKER_POISONED, 0 otherwise.
 */
static inline int is_poisoned_swp_entry(swp_entry_t entry)
{
        if (!is_pte_marker_entry(entry))
                return 0;

        return (pte_marker_get(entry) & PTE_MARKER_POISONED) != 0;
}

/* Build a pte marker entry whose only marker bit is PTE_MARKER_GUARD. */
static inline swp_entry_t make_guard_swp_entry(void)
{
        return make_pte_marker_entry(PTE_MARKER_GUARD);
}

/*
 * Returns 1 when @entry is a pte marker entry whose marker bits include
 * PTE_MARKER_GUARD, 0 otherwise.
 */
static inline int is_guard_swp_entry(swp_entry_t entry)
{
        if (!is_pte_marker_entry(entry))
                return 0;

        return (pte_marker_get(entry) & PTE_MARKER_GUARD) != 0;
}

/*
 * pte_none() variant that also treats a pte marker as "none".  In many cases
 * a pte marker should be seen as a none pte: it is a none pte that merely
 * carries some stored information, which is what makes it not-none.
 *
 * Use it for ptes that are file-backed, ram-based and backing userspace
 * pages, such as shmem.  Page tables that do not support pte markers at all
 * do not need it: anonymous memory, kernel-only memory (including during
 * boot) and non-ram based generic file systems.  Using it there is harmless,
 * but the extra pte marker check is pure overhead.
 *
 * Returns 1 when @pte is none or a pte marker, 0 otherwise.
 */
static inline int pte_none_mostly(pte_t pte)
{
        if (pte_none(pte))
                return 1;

        return is_pte_marker(pte);
}

/*
 * Return the struct page for the pfn stored in @entry's offset.
 * For a migration entry the page must be locked; BUG_ON() fires otherwise.
 */
static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
{
        struct page *page = pfn_to_page(swp_offset_pfn(entry));

        /*
         * Migration entries may only be used while the corresponding
         * page is locked.
         */
        BUG_ON(is_migration_entry(entry) && !PageLocked(page));

        return page;
}

/*
 * Return the folio for the pfn stored in @entry's offset.
 * For a migration entry the folio must be locked; BUG_ON() fires otherwise.
 */
static inline struct folio *pfn_swap_entry_folio(swp_entry_t entry)
{
        struct folio *folio = pfn_folio(swp_offset_pfn(entry));

        /*
         * Any use of migration entries may only occur while the
         * corresponding folio is locked
         */
        BUG_ON(is_migration_entry(entry) && !folio_test_locked(folio));

        return folio;
}

/*
 * A pfn swap entry is a special type of swap entry that always has a pfn stored
 * in the swap offset. They can either be used to represent unaddressable device
 * memory, to restrict access to a page undergoing migration or to represent a
 * pfn which has been hwpoisoned and unmapped.
 *
 * Returns true when @entry is a migration, device private, device exclusive
 * or hwpoison entry.
 */
static inline bool is_pfn_swap_entry(swp_entry_t entry)
{
        /* Make sure the swp offset can always store the needed fields */
        BUILD_BUG_ON(SWP_TYPE_SHIFT < SWP_PFN_BITS);

        return is_migration_entry(entry) || is_device_private_entry(entry) ||
               is_device_exclusive_entry(entry) || is_hwpoison_entry(entry);
}

struct page_vma_mapped_walk;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Prototypes only (CONFIG_ARCH_ENABLE_THP_MIGRATION builds); the definitions
 * are not part of this header.
 */
extern int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page);

extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new);

extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);

/*
 * Decode the swap entry stored in @pmd.  The swap soft-dirty and uffd-wp
 * bits are cleared from the local copy of @pmd first, so they do not end up
 * in the decoded type/offset.
 */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        swp_entry_t arch_entry;

        /* Strip the flag bits before handing the pmd to the arch decoder. */
        if (pmd_swp_soft_dirty(pmd))
                pmd = pmd_swp_clear_soft_dirty(pmd);
        if (pmd_swp_uffd_wp(pmd))
                pmd = pmd_swp_clear_uffd_wp(pmd);
        arch_entry = __pmd_to_swp_entry(pmd);
        return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
}

/*
 * Convert swap entry @entry into a pmd: its type and offset are re-encoded
 * with __swp_entry() and the result is passed to __swp_entry_to_pmd().
 */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        swp_entry_t arch_entry = __swp_entry(swp_type(entry),
                                             swp_offset(entry));

        return __swp_entry_to_pmd(arch_entry);
}

/*
 * Non-zero when @pmd is a swap pmd and the swap entry it decodes to is a
 * migration entry.  The pmd is only decoded after is_swap_pmd() succeeded.
 */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        if (!is_swap_pmd(pmd))
                return 0;

        return is_migration_entry(pmd_to_swp_entry(pmd));
}
#else  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
/*
 * Stubs used when CONFIG_ARCH_ENABLE_THP_MIGRATION is not set.
 */

/*
 * Stub whose body is only BUILD_BUG(); presumably meant to break the build
 * if a call to it is ever emitted in this configuration.
 */
static inline int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        BUILD_BUG();
}

/* Stub whose body is only BUILD_BUG(), like set_pmd_migration_entry(). */
static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
                struct page *new)
{
        BUILD_BUG();
}

/* Stub: empty body, nothing to wait for. */
static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { }

/* Stub: ignores @pmd, returns swp_entry(0, 0). */
static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
{
        return swp_entry(0, 0);
}

/* Stub: ignores @entry, returns __pmd(0). */
static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
{
        return __pmd(0);
}

/* Stub: always 0. */
static inline int is_pmd_migration_entry(pmd_t pmd)
{
        return 0;
}
#endif  /* CONFIG_ARCH_ENABLE_THP_MIGRATION */

/* Non-zero when @entry's type is MAX_SWAPFILES or above. */
static inline int non_swap_entry(swp_entry_t entry)
{
        return swp_type(entry) >= MAX_SWAPFILES;
}

#endif /* CONFIG_MMU */
#endif /* _LINUX_SWAPOPS_H */





















































































































































































































































































































































    3 






    3 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_tables.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter_arp.h>
#include <net/netfilter/nf_tables_ipv4.h>
#include <net/netfilter/nf_tables_ipv6.h>

#ifdef CONFIG_NF_TABLES_IPV4
/*
 * Hook function for the IPv4 filter chain type: fills a struct nft_pktinfo
 * from @skb and @state, adds the IPv4 part with nft_set_pktinfo_ipv4() and
 * returns the result of nft_do_chain() run with @priv.
 */
static unsigned int nft_do_chain_ipv4(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct nft_pktinfo pkt;

        nft_set_pktinfo(&pkt, skb, state);
        nft_set_pktinfo_ipv4(&pkt);

        return nft_do_chain(&pkt, priv);
}

/*
 * "filter" chain type for NFPROTO_IPV4.  The five inet hooks named in
 * hook_mask are all served by nft_do_chain_ipv4().
 */
static const struct nft_chain_type nft_chain_filter_ipv4 = {
        .name                = "filter",
        .type                = NFT_CHAIN_T_DEFAULT,
        .family                = NFPROTO_IPV4,
        .hook_mask        = (1 << NF_INET_LOCAL_IN) |
                          (1 << NF_INET_LOCAL_OUT) |
                          (1 << NF_INET_FORWARD) |
                          (1 << NF_INET_PRE_ROUTING) |
                          (1 << NF_INET_POST_ROUTING),
        .hooks                = {
                [NF_INET_LOCAL_IN]        = nft_do_chain_ipv4,
                [NF_INET_LOCAL_OUT]        = nft_do_chain_ipv4,
                [NF_INET_FORWARD]        = nft_do_chain_ipv4,
                [NF_INET_PRE_ROUTING]        = nft_do_chain_ipv4,
                [NF_INET_POST_ROUTING]        = nft_do_chain_ipv4,
        },
};

/* Register the IPv4 filter chain type. */
static void nft_chain_filter_ipv4_init(void)
{
        nft_register_chain_type(&nft_chain_filter_ipv4);
}
/* Unregister the IPv4 filter chain type. */
static void nft_chain_filter_ipv4_fini(void)
{
        nft_unregister_chain_type(&nft_chain_filter_ipv4);
}

#else
/* No-op stubs used when CONFIG_NF_TABLES_IPV4 is not set. */
static inline void nft_chain_filter_ipv4_init(void) {}
static inline void nft_chain_filter_ipv4_fini(void) {}
#endif /* CONFIG_NF_TABLES_IPV4 */

#ifdef CONFIG_NF_TABLES_ARP
/*
 * Hook function for the ARP filter chain type: fills a struct nft_pktinfo
 * from @skb and @state, completes it with nft_set_pktinfo_unspec() and
 * returns the result of nft_do_chain() run with @priv.
 */
static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb,
                                     const struct nf_hook_state *state)
{
        struct nft_pktinfo pkt;

        nft_set_pktinfo(&pkt, skb, state);
        nft_set_pktinfo_unspec(&pkt);

        return nft_do_chain(&pkt, priv);
}

/*
 * "filter" chain type for NFPROTO_ARP.  Both ARP hooks (in and out) are
 * served by nft_do_chain_arp().
 */
static const struct nft_chain_type nft_chain_filter_arp = {
        .name                = "filter",
        .type                = NFT_CHAIN_T_DEFAULT,
        .family                = NFPROTO_ARP,
        .owner                = THIS_MODULE,
        .hook_mask        = (1 << NF_ARP_IN) |
                          (1 << NF_ARP_OUT),
        .hooks                = {
                [NF_ARP_IN]                = nft_do_chain_arp,
                [NF_ARP_OUT]                = nft_do_chain_arp,
        },
};

/* Register the ARP filter chain type. */
static void nft_chain_filter_arp_init(void)
{
        nft_register_chain_type(&nft_chain_filter_arp);
}

/* Unregister the ARP filter chain type. */
static void nft_chain_filter_arp_fini(void)
{
        nft_unregister_chain_type(&nft_chain_filter_arp);
}
#else
/* No-op stubs used when CONFIG_NF_TABLES_ARP is not set. */
static inline void nft_chain_filter_arp_init(void) {}
static inline void nft_chain_filter_arp_fini(void) {}
#endif /* CONFIG_NF_TABLES_ARP */

#ifdef CONFIG_NF_TABLES_IPV6
/*
 * Hook function for the IPv6 filter chain type: fills a struct nft_pktinfo
 * from @skb and @state, adds the IPv6 part with nft_set_pktinfo_ipv6() and
 * returns the result of nft_do_chain() run with @priv.
 */
static unsigned int nft_do_chain_ipv6(void *priv,
                                      struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct nft_pktinfo pkt;

        nft_set_pktinfo(&pkt, skb, state);
        nft_set_pktinfo_ipv6(&pkt);

        return nft_do_chain(&pkt, priv);
}

/*
 * "filter" chain type for NFPROTO_IPV6.  The five inet hooks named in
 * hook_mask are all served by nft_do_chain_ipv6().
 */
static const struct nft_chain_type nft_chain_filter_ipv6 = {
        .name                = "filter",
        .type                = NFT_CHAIN_T_DEFAULT,
        .family                = NFPROTO_IPV6,
        .hook_mask        = (1 << NF_INET_LOCAL_IN) |
                          (1 << NF_INET_LOCAL_OUT) |
                          (1 << NF_INET_FORWARD) |
                          (1 << NF_INET_PRE_ROUTING) |
                          (1 << NF_INET_POST_ROUTING),
        .hooks                = {
                [NF_INET_LOCAL_IN]        = nft_do_chain_ipv6,
                [NF_INET_LOCAL_OUT]        = nft_do_chain_ipv6,
                [NF_INET_FORWARD]        = nft_do_chain_ipv6,
                [NF_INET_PRE_ROUTING]        = nft_do_chain_ipv6,
                [NF_INET_POST_ROUTING]        = nft_do_chain_ipv6,
        },
};

/* Register the IPv6 filter chain type. */
static void nft_chain_filter_ipv6_init(void)
{
        nft_register_chain_type(&nft_chain_filter_ipv6);
}

/* Unregister the IPv6 filter chain type. */
static void nft_chain_filter_ipv6_fini(void)
{
        nft_unregister_chain_type(&nft_chain_filter_ipv6);
}
#else
/* No-op stubs used when CONFIG_NF_TABLES_IPV6 is not set. */
static inline void nft_chain_filter_ipv6_init(void) {}
static inline void nft_chain_filter_ipv6_fini(void) {}
#endif /* CONFIG_NF_TABLES_IPV6 */

#ifdef CONFIG_NF_TABLES_INET
/*
 * Hook function for the inet filter chain type: fills a struct nft_pktinfo
 * from @skb and @state, lets the IPv4 or IPv6 helper add its part depending
 * on state->pf, and returns the result of nft_do_chain() run with @priv.
 */
static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct nft_pktinfo pkt;

        nft_set_pktinfo(&pkt, skb, state);

        /* Any other protocol family gets no family-specific packet info. */
        if (state->pf == NFPROTO_IPV4)
                nft_set_pktinfo_ipv4(&pkt);
        else if (state->pf == NFPROTO_IPV6)
                nft_set_pktinfo_ipv6(&pkt);

        return nft_do_chain(&pkt, priv);
}

/*
 * Hook function for the inet ingress hook.  Works on a private copy of
 * @state in which pf is rewritten to the IP family matching skb->protocol
 * and hook to NF_INET_INGRESS.
 *
 * Returns NF_ACCEPT for any protocol other than IPv4/IPv6, NF_DROP when the
 * family-specific ingress helper returns a negative value, and otherwise the
 * result of nft_do_chain() run with @priv.
 */
static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb,
                                              const struct nf_hook_state *state)
{
        /* Local copy: @state is const and must not be modified. */
        struct nf_hook_state ingress_state = *state;
        struct nft_pktinfo pkt;

        switch (skb->protocol) {
        case htons(ETH_P_IP):
                /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. */
                ingress_state.pf = NFPROTO_IPV4;
                ingress_state.hook = NF_INET_INGRESS;
                nft_set_pktinfo(&pkt, skb, &ingress_state);

                if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0)
                        return NF_DROP;
                break;
        case htons(ETH_P_IPV6):
                ingress_state.pf = NFPROTO_IPV6;
                ingress_state.hook = NF_INET_INGRESS;
                nft_set_pktinfo(&pkt, skb, &ingress_state);

                if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0)
                        return NF_DROP;
                break;
        default:
                /* Not IPv4/IPv6: accept without running the chain. */
                return NF_ACCEPT;
        }

        return nft_do_chain(&pkt, priv);
}

/*
 * "filter" chain type for NFPROTO_INET.  The ingress hook is served by
 * nft_do_chain_inet_ingress(), the other five inet hooks by
 * nft_do_chain_inet().
 */
static const struct nft_chain_type nft_chain_filter_inet = {
        .name                = "filter",
        .type                = NFT_CHAIN_T_DEFAULT,
        .family                = NFPROTO_INET,
        .hook_mask        = (1 << NF_INET_INGRESS) |
                          (1 << NF_INET_LOCAL_IN) |
                          (1 << NF_INET_LOCAL_OUT) |
                          (1 << NF_INET_FORWARD) |
                          (1 << NF_INET_PRE_ROUTING) |
                          (1 << NF_INET_POST_ROUTING),
        .hooks                = {
                [NF_INET_INGRESS]        = nft_do_chain_inet_ingress,
                [NF_INET_LOCAL_IN]        = nft_do_chain_inet,
                [NF_INET_LOCAL_OUT]        = nft_do_chain_inet,
                [NF_INET_FORWARD]        = nft_do_chain_inet,
                [NF_INET_PRE_ROUTING]        = nft_do_chain_inet,
                [NF_INET_POST_ROUTING]        = nft_do_chain_inet,
        },
};

/* Register the inet filter chain type. */
static void nft_chain_filter_inet_init(void)
{
        nft_register_chain_type(&nft_chain_filter_inet);
}

/* Unregister the inet filter chain type. */
static void nft_chain_filter_inet_fini(void)
{
        nft_unregister_chain_type(&nft_chain_filter_inet);
}
#else
/* No-op stubs used when CONFIG_NF_TABLES_INET is not set. */
static inline void nft_chain_filter_inet_init(void) {}
static inline void nft_chain_filter_inet_fini(void) {}
#endif /* CONFIG_NF_TABLES_INET */

#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
/*
 * Hook function for the bridge filter chain type: fills a struct nft_pktinfo
 * from @skb and @state, picks the packet info helper from the Ethernet
 * header's h_proto and returns the result of nft_do_chain() run with @priv.
 */
static unsigned int
nft_do_chain_bridge(void *priv,
                    struct sk_buff *skb,
                    const struct nf_hook_state *state)
{
        struct nft_pktinfo pkt;

        nft_set_pktinfo(&pkt, skb, state);

        switch (eth_hdr(skb)->h_proto) {
        case htons(ETH_P_IP):
                nft_set_pktinfo_ipv4_validate(&pkt);
                break;
        case htons(ETH_P_IPV6):
                nft_set_pktinfo_ipv6_validate(&pkt);
                break;
        default:
                /* Neither IPv4 nor IPv6. */
                nft_set_pktinfo_unspec(&pkt);
                break;
        }

        return nft_do_chain(&pkt, priv);
}

/*
 * "filter" chain type for NFPROTO_BRIDGE.  The five bridge hooks named in
 * hook_mask are all served by nft_do_chain_bridge().
 */
static const struct nft_chain_type nft_chain_filter_bridge = {
        .name                = "filter",
        .type                = NFT_CHAIN_T_DEFAULT,
        .family                = NFPROTO_BRIDGE,
        .hook_mask        = (1 << NF_BR_PRE_ROUTING) |
                          (1 << NF_BR_LOCAL_IN) |
                          (1 << NF_BR_FORWARD) |
                          (1 << NF_BR_LOCAL_OUT) |
                          (1 << NF_BR_POST_ROUTING),
        .hooks                = {
                [NF_BR_PRE_ROUTING]        = nft_do_chain_bridge,
                [NF_BR_LOCAL_IN]        = nft_do_chain_bridge,
                [NF_BR_FORWARD]                = nft_do_chain_bridge,
                [NF_BR_LOCAL_OUT]        = nft_do_chain_bridge,
                [NF_BR_POST_ROUTING]        = nft_do_chain_bridge,
        },
};

/* Register the bridge filter chain type. */
static void nft_chain_filter_bridge_init(void)
{
        nft_register_chain_type(&nft_chain_filter_bridge);
}

/* Unregister the bridge filter chain type. */
static void nft_chain_filter_bridge_fini(void)
{
        nft_unregister_chain_type(&nft_chain_filter_bridge);
}
#else
/* No-op stubs used when CONFIG_NF_TABLES_BRIDGE is not enabled. */
static inline void nft_chain_filter_bridge_init(void) {}
static inline void nft_chain_filter_bridge_fini(void) {}
#endif /* CONFIG_NF_TABLES_BRIDGE */

#ifdef CONFIG_NF_TABLES_NETDEV
/*
 * Hook function for the netdev filter chain type: fills a struct nft_pktinfo
 * from @skb and @state, picks the packet info helper from skb->protocol and
 * returns the result of nft_do_chain() run with @priv.
 */
static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        struct nft_pktinfo pkt;

        nft_set_pktinfo(&pkt, skb, state);

        switch (skb->protocol) {
        case htons(ETH_P_IP):
                nft_set_pktinfo_ipv4_validate(&pkt);
                break;
        case htons(ETH_P_IPV6):
                nft_set_pktinfo_ipv6_validate(&pkt);
                break;
        default:
                /* Neither IPv4 nor IPv6. */
                nft_set_pktinfo_unspec(&pkt);
                break;
        }

        return nft_do_chain(&pkt, priv);
}

/*
 * "filter" chain type for NFPROTO_NETDEV.  Both the ingress and the egress
 * hook are served by nft_do_chain_netdev().
 */
static const struct nft_chain_type nft_chain_filter_netdev = {
        .name                = "filter",
        .type                = NFT_CHAIN_T_DEFAULT,
        .family                = NFPROTO_NETDEV,
        .hook_mask        = (1 << NF_NETDEV_INGRESS) |
                          (1 << NF_NETDEV_EGRESS),
        .hooks                = {
                [NF_NETDEV_INGRESS]        = nft_do_chain_netdev,
                [NF_NETDEV_EGRESS]        = nft_do_chain_netdev,
        },
};

/*
 * Remove the hook that @basechain holds on @dev, if any.
 *
 * The first hook in the chain's hook list whose ops.dev is @dev is
 * unregistered from netfilter (unless the owning table is flagged
 * NFT_TABLE_F_DORMANT), unlinked from the list with list_del_rcu() and freed
 * with kfree_rcu().  @event is not used by this function.
 */
static void nft_netdev_event(unsigned long event, struct net_device *dev,
                             struct nft_base_chain *basechain)
{
        struct nft_hook *hook;

        list_for_each_entry(hook, &basechain->hook_list, list) {
                if (hook->ops.dev != dev)
                        continue;

                if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT))
                        nf_unregister_net_hook(dev_net(dev), &hook->ops);

                list_del_rcu(&hook->list);
                kfree_rcu(hook, rcu);
                /* Stop here: the entry just removed must not be iterated. */
                break;
        }
}

/*
 * Netdevice notifier callback.  Only NETDEV_UNREGISTER is acted upon: with
 * the per-netns commit_mutex held, every base chain of every netdev-family
 * table, and every NF_INET_INGRESS base chain of every inet-family table, is
 * passed to nft_netdev_event() for the device being unregistered.
 *
 * Always returns NOTIFY_DONE.
 */
static int nf_tables_netdev_event(struct notifier_block *this,
                                  unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct nft_base_chain *basechain;
        struct nftables_pernet *nft_net;
        struct nft_chain *chain;
        struct nft_table *table;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        nft_net = nft_pernet(dev_net(dev));
        mutex_lock(&nft_net->commit_mutex);
        list_for_each_entry(table, &nft_net->tables, list) {
                /* Only the netdev and inet families are examined. */
                if (table->family != NFPROTO_NETDEV &&
                    table->family != NFPROTO_INET)
                        continue;

                list_for_each_entry(chain, &table->chains, list) {
                        if (!nft_is_base_chain(chain))
                                continue;

                        basechain = nft_base_chain(chain);
                        /* For inet tables only ingress chains are examined. */
                        if (table->family == NFPROTO_INET &&
                            basechain->ops.hooknum != NF_INET_INGRESS)
                                continue;

                        nft_netdev_event(event, dev, basechain);
                }
        }
        mutex_unlock(&nft_net->commit_mutex);

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to nf_tables_netdev_event(). */
static struct notifier_block nf_tables_netdev_notifier = {
        .notifier_call        = nf_tables_netdev_event,
};

/*
 * Register the netdev filter chain type and the netdevice notifier.
 *
 * Returns 0 on success.  If the notifier cannot be registered, the chain
 * type is unregistered again and the error from
 * register_netdevice_notifier() is returned.
 */
static int nft_chain_filter_netdev_init(void)
{
        int err;

        nft_register_chain_type(&nft_chain_filter_netdev);

        err = register_netdevice_notifier(&nf_tables_netdev_notifier);
        if (err)
                nft_unregister_chain_type(&nft_chain_filter_netdev);

        return err;
}

/* Unregister the netdev filter chain type, then the netdevice notifier. */
static void nft_chain_filter_netdev_fini(void)
{
        nft_unregister_chain_type(&nft_chain_filter_netdev);
        unregister_netdevice_notifier(&nf_tables_netdev_notifier);
}
#else
/* Stubs used when CONFIG_NF_TABLES_NETDEV is not set; init reports success. */
static inline int nft_chain_filter_netdev_init(void) { return 0; }
static inline void nft_chain_filter_netdev_fini(void) {}
#endif /* CONFIG_NF_TABLES_NETDEV */

/*
 * Register all filter chain types.
 *
 * The netdev part goes first because it is the only one that can fail; a
 * negative result from it is returned before anything else is registered.
 * Returns 0 otherwise.
 */
int __init nft_chain_filter_init(void)
{
        int err = nft_chain_filter_netdev_init();

        if (err < 0)
                return err;

        nft_chain_filter_ipv4_init();
        nft_chain_filter_ipv6_init();
        nft_chain_filter_arp_init();
        nft_chain_filter_inet_init();
        nft_chain_filter_bridge_init();

        return 0;
}

/*
 * Unregister all filter chain types, in the reverse of the order used by
 * nft_chain_filter_init().
 */
void nft_chain_filter_fini(void)
{
        nft_chain_filter_bridge_fini();
        nft_chain_filter_inet_fini();
        nft_chain_filter_arp_fini();
        nft_chain_filter_ipv6_fini();
        nft_chain_filter_ipv4_fini();
        nft_chain_filter_netdev_fini();
}

































































































































































































































































































































































































































































































































































































































































































  263 



  264 














































































































  223 






  223 




















































































  225 



  223 





  136 


  183 




























































































































































































































   12 























































































































  223 



  223 

   87 


























  221 































































































































































































































































































  219 





































































  223 
  223 




















































































  223 


  219 


  223 






















































  223 
























































































































































































































































































    3 















    3 



















































































    3 



    3 





















  222 






  223 
  222 
  223 
















































  222 











  223 






  223 




  223 



  222 





  223 












  220 


























  223 











  223 













  105 
  216 











  222 

































   12 



   12 

   12 

   12 



   12 
































































































  234 



  235 
  235 
  232 
  235 








  218 



   34 












   34 
























  232 





  233 

  232 

  231 



  233 
























    3 






    3 

    3 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    2 






    2 
    2 

    2 


    2 
































































































































   30 





   31 













   31 






   31 

   29 

    2 





   30 






   31 

   31 
   30 





   31 




   31 





   30 



























   31 

























   31 





   30 



    2 


   28 









































    2 
    2 





    2 






    2 






    2 

















    2 



























































































   25 


   26 










   12 





























   12 




   26 








   26 






















































   26 


















   26 
   26 
   26 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/workqueue.c - generic async execution with shared worker pool
 *
 * Copyright (C) 2002                Ingo Molnar
 *
 *   Derived from the taskqueue/keventd code by:
 *     David Woodhouse <dwmw2@infradead.org>
 *     Andrew Morton
 *     Kai Petzke <wpp@marie.physik.tu-berlin.de>
 *     Theodore Ts'o <tytso@mit.edu>
 *
 * Made to use alloc_percpu by Christoph Lameter.
 *
 * Copyright (C) 2010                SUSE Linux Products GmbH
 * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
 *
 * This is the generic async execution mechanism.  Work items are
 * executed in process context.  The worker pool is shared and
 * automatically managed.  There are two worker pools for each CPU (one for
 * normal work items and the other for high priority ones) and some extra
 * pools for workqueues which are not bound to any specific CPU - the
 * number of these backing pools is dynamic.
 *
 * Please read Documentation/core-api/workqueue.rst for details.
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/signal.h>
#include <linux/completion.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/kthread.h>
#include <linux/hardirq.h>
#include <linux/mempolicy.h>
#include <linux/freezer.h>
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
#include <linux/jhash.h>
#include <linux/hashtable.h>
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
#include <linux/sched/isolation.h>
#include <linux/sched/debug.h>
#include <linux/nmi.h>
#include <linux/kvm_para.h>
#include <linux/delay.h>
#include <linux/irq_work.h>

#include "workqueue_internal.h"

/*
 * Flags kept in worker_pool->flags.
 *
 * A bound pool is always in one of two states with respect to its CPU:
 *
 * - Associated (%POOL_DISASSOCIATED clear): all workers are bound to the
 *   CPU, none has %WORKER_UNBOUND set and concurrency management is in
 *   effect.
 *
 * - Disassociated (%POOL_DISASSOCIATED set): the cpu may be offline, all
 *   workers have %WORKER_UNBOUND set with concurrency management disabled
 *   and may be executing on any CPU.  The pool behaves as an unbound one.
 *
 * %POOL_DISASSOCIATED should be flipped only while holding
 * wq_pool_attach_mutex so that the binding state cannot change while
 * worker_attach_to_pool() is in progress.
 *
 * There can only be one concurrent BH execution context per CPU, so a BH
 * pool is per-CPU and always DISASSOCIATED.
 */
enum worker_pool_flags {
        /* is a BH pool */
        POOL_BH = 1 << 0,

        /* being managed */
        POOL_MANAGER_ACTIVE = 1 << 1,

        /* cpu can't serve workers */
        POOL_DISASSOCIATED = 1 << 2,

        /* draining after CPU offline */
        POOL_BH_DRAINING = 1 << 3,
};

/*
 * Per-worker state bits.  Bit 0 and bits 4-5 are not assigned here.
 */
enum worker_flags {
        /* die die die */
        WORKER_DIE = 1 << 1,

        /* is idle */
        WORKER_IDLE = 1 << 2,

        /* preparing to run works */
        WORKER_PREP = 1 << 3,

        /* cpu intensive */
        WORKER_CPU_INTENSIVE = 1 << 6,

        /* worker is unbound */
        WORKER_UNBOUND = 1 << 7,

        /* worker was rebound */
        WORKER_REBOUND = 1 << 8,

        /* mask of every flag that is part of the NOT_RUNNING set */
        WORKER_NOT_RUNNING = WORKER_REBOUND | WORKER_UNBOUND |
                             WORKER_CPU_INTENSIVE | WORKER_PREP,
};

/* Modifier bits describing what kind of cancel operation is in progress. */
enum work_cancel_flags {
        /* canceling a delayed_work */
        WORK_CANCEL_DELAYED = 1 << 0,

        /* canceling to disable */
        WORK_CANCEL_DISABLE = 1 << 1,
};

/*
 * Compile-time tunables private to the workqueue implementation.  Time
 * values are expressed in jiffies (multiples or fractions of HZ).
 */
enum wq_internal_consts {
        /* # standard pools per cpu */
        NR_STD_WORKER_POOLS = 2,

        /* hashed by pool->attrs */
        UNBOUND_POOL_HASH_ORDER = 6,
        /* 64 pointers */
        BUSY_WORKER_HASH_ORDER = 6,

        /* 1/4 of busy can be idle */
        MAX_IDLE_WORKERS_RATIO = 4,
        /* keep idle ones for 5 mins */
        IDLE_WORKER_TIMEOUT = 5 * 60 * HZ,

        /* call for help after 10ms (min two ticks) */
        MAYDAY_INITIAL_TIMEOUT = HZ / 100 < 2 ? 2 : HZ / 100,
        /* and then every 100ms */
        MAYDAY_INTERVAL = HZ / 10,
        /* time to breathe after fail */
        CREATE_COOLDOWN = HZ,

        /*
         * Rescue workers are used only on emergencies and shared by
         * all cpus.  Give MIN_NICE.
         */
        RESCUER_NICE_LEVEL = MIN_NICE,
        HIGHPRI_NICE_LEVEL = MIN_NICE,

        WQ_NAME_LEN = 32,
        /* "kworker/R-" (10 characters) + WQ_NAME_LEN */
        WORKER_ID_LEN = WQ_NAME_LEN + 10,
};

/*
 * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and
 * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because
 * msecs_to_jiffies() can't be an initializer.
 *
 * BH_WORKER_JIFFIES expands to a msecs_to_jiffies(2) call at every use site,
 * i.e. 2ms converted to jiffies. BH_WORKER_RESTARTS is a plain count.
 * NOTE(review): presumably these bound the time and the number of restarts
 * of one BH worker pass, mirroring the softirq limits named above - confirm
 * at the use site.
 */
#define BH_WORKER_JIFFIES        msecs_to_jiffies(2)
#define BH_WORKER_RESTARTS        10

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only for
 *    everyone else.
 *
 * P: Preemption protected.  Disabling preemption is enough and should
 *    only be modified and accessed from the local cpu.
 *
 * L: pool->lock protected.  Access with pool->lock held.
 *
 * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for
 *     reads.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
 *
 * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
 *
 * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
 *
 * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
 *      RCU for reads.
 *
 * WQ: wq->mutex protected.
 *
 * WR: wq->mutex protected for writes.  RCU protected for reads.
 *
 * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read
 *     with READ_ONCE() without locking.
 *
 * MD: wq_mayday_lock protected.
 *
 * WD: Used internally by the watchdog.
 */

/* struct worker is defined in workqueue_internal.h */

/*
 * A pool of workers: the pool lock, the list of pending work items, worker
 * accounting, the idle and mayday timers and the hash of busy workers.
 *
 * The "X:" prefix in each field comment names the exclusion rule that
 * protects the field; the rules are listed in the comment block preceding
 * this definition.
 */
struct worker_pool {
        raw_spinlock_t                lock;                /* the pool lock */
        int                        cpu;                /* I: the associated cpu */
        int                        node;                /* I: the associated node ID */
        int                        id;                /* I: pool ID */
        unsigned int                flags;                /* L: flags */

        unsigned long                watchdog_ts;        /* L: watchdog timestamp */
        bool                        cpu_stall;        /* WD: stalled cpu bound pool */

        /*
         * The counter is incremented in a process context on the associated CPU
         * w/ preemption disabled, and decremented or reset in the same context
         * but w/ pool->lock held. The readers grab pool->lock and are
         * guaranteed to see if the counter reached zero.
         */
        int                        nr_running;

        struct list_head        worklist;        /* L: list of pending works */

        int                        nr_workers;        /* L: total number of workers */
        int                        nr_idle;        /* L: currently idle workers */

        struct list_head        idle_list;        /* L: list of idle workers */
        struct timer_list        idle_timer;        /* L: worker idle timeout */
        struct work_struct      idle_cull_work; /* L: worker idle cleanup */

        struct timer_list        mayday_timer;          /* L: SOS timer for workers */

        /* a worker is either on busy_hash or idle_list, or the manager */
        DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                                                /* L: hash of busy workers */

        struct worker                *manager;        /* L: purely informational */
        struct list_head        workers;        /* A: attached workers */

        struct ida                worker_ida;        /* worker IDs for task name */

        struct workqueue_attrs        *attrs;                /* I: worker attributes */
        struct hlist_node        hash_node;        /* PL: unbound_pool_hash node */
        int                        refcnt;                /* PL: refcnt for unbound pools */

        /*
         * Destruction of pool is RCU protected to allow dereferences
         * from get_work_pool().
         */
        struct rcu_head                rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
/*
 * Indices into pool_workqueue->stats[].  PWQ_NR_STATS must stay last as it
 * is the number of counters.
 */
enum pool_workqueue_stats {
        /* work items started execution */
        PWQ_STAT_STARTED = 0,
        /* work items completed execution */
        PWQ_STAT_COMPLETED,
        /* total CPU time consumed */
        PWQ_STAT_CPU_TIME,
        /* wq_cpu_intensive_thresh_us violations */
        PWQ_STAT_CPU_INTENSIVE,
        /* concurrency-management worker wakeups */
        PWQ_STAT_CM_WAKEUP,
        /* unbound workers brought back into scope */
        PWQ_STAT_REPATRIATED,
        /* maydays to rescuer */
        PWQ_STAT_MAYDAY,
        /* linked work items executed by rescuer */
        PWQ_STAT_RESCUED,

        PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, bits below WORK_STRUCT_PWQ_SHIFT
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.  The __aligned() attribute at the end of the
 * definition enforces that alignment.
 */
struct pool_workqueue {
        struct worker_pool        *pool;                /* I: the associated pool */
        struct workqueue_struct *wq;                /* I: the owning workqueue */
        int                        work_color;        /* L: current color */
        int                        flush_color;        /* L: flushing color */
        int                        refcnt;                /* L: reference count */
        int                        nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works */
        bool                        plugged;        /* L: execution suspended */

        /*
         * nr_active management and WORK_STRUCT_INACTIVE:
         *
         * When pwq->nr_active >= max_active, new work item is queued to
         * pwq->inactive_works instead of pool->worklist and marked with
         * WORK_STRUCT_INACTIVE.
         *
         * All work items marked with WORK_STRUCT_INACTIVE do not participate in
         * nr_active and all work items in pwq->inactive_works are marked with
         * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are
         * in pwq->inactive_works. Some of them are ready to run in
         * pool->worklist or worker->scheduled. Those work items are only struct
         * wq_barrier which is used for flush_work() and should not participate
         * in nr_active. For non-barrier work item, it is marked with
         * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works.
         */
        int                        nr_active;        /* L: nr of active works */
        struct list_head        inactive_works;        /* L: inactive works */
        struct list_head        pending_node;        /* LN: node on wq_node_nr_active->pending_pwqs */
        struct list_head        pwqs_node;        /* WR: node on wq->pwqs */
        struct list_head        mayday_node;        /* MD: node on wq->maydays */

        /* one counter per enum pool_workqueue_stats entry */
        u64                        stats[PWQ_NR_STATS];

        /*
         * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
         * and pwq_release_workfn() for details. pool_workqueue itself is also
         * RCU protected so that the first pwq can be determined without
         * grabbing wq->mutex.
         */
        struct kthread_work        release_work;
        struct rcu_head                rcu;
} __aligned(1 << WORK_STRUCT_PWQ_SHIFT);

/*
 * Structure used to wait for workqueue flush.
 *
 * One instance represents one waiter: it is linked on a list through ->list,
 * records the flush color it is waiting for and carries the completion it
 * waits on.
 */
struct wq_flusher {
        struct list_head        list;                /* WQ: list of flushers */
        int                        flush_color;        /* WQ: flush color waiting for */
        struct completion        done;                /* flush completion */
};

struct wq_device;

/*
 * Unlike in a per-cpu workqueue where max_active limits its concurrency level
 * on each CPU, in an unbound workqueue, max_active applies to the whole system.
 * As sharing a single nr_active across multiple sockets can be very expensive,
 * the counting and enforcement is per NUMA node.
 *
 * The following struct is used to enforce per-node max_active. When a pwq wants
 * to start executing a work item, it should increment ->nr using
 * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over
 * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish
 * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in
 * round-robin order.
 */
/*
 * Per-node nr_active accounting: a limit (->max), the current count (->nr)
 * and the list of pwqs waiting for room, guarded by ->lock.
 *
 * NOTE(review): ->nr is an atomic_t while ->pending_pwqs is lock protected,
 * which suggests ->nr is updated without taking ->lock - confirm against
 * tryinc_node_nr_active().
 */
struct wq_node_nr_active {
        int                        max;                /* per-node max_active */
        atomic_t                nr;                /* per-node nr_active */
        raw_spinlock_t                lock;                /* nests inside pool locks */
        struct list_head        pending_pwqs;        /* LN: pwqs with inactive works */
};

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 *
 * Layout notes: the fields from ->flags onwards start on a fresh cacheline,
 * and ->node_nr_active is a flexible array member, so it must remain the
 * last field and the structure has to be allocated with room for it.
 */
struct workqueue_struct {
        struct list_head        pwqs;                /* WR: all pwqs of this wq */
        struct list_head        list;                /* PR: list of all workqueues */

        struct mutex                mutex;                /* protects this wq */
        int                        work_color;        /* WQ: current work color */
        int                        flush_color;        /* WQ: current flush color */
        atomic_t                nr_pwqs_to_flush; /* flush in progress */
        struct wq_flusher        *first_flusher;        /* WQ: first flusher */
        struct list_head        flusher_queue;        /* WQ: flush waiters */
        struct list_head        flusher_overflow; /* WQ: flush overflow list */

        struct list_head        maydays;        /* MD: pwqs requesting rescue */
        struct worker                *rescuer;        /* MD: rescue worker */

        int                        nr_drainers;        /* WQ: drain in progress */

        /* See alloc_workqueue() function comment for info on min/max_active */
        int                        max_active;        /* WO: max active works */
        int                        min_active;        /* WO: min active works */
        int                        saved_max_active; /* WQ: saved max_active */
        int                        saved_min_active; /* WQ: saved min_active */

        struct workqueue_attrs        *unbound_attrs;        /* PW: only for unbound wqs */
        struct pool_workqueue __rcu *dfl_pwq;   /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
        struct wq_device        *wq_dev;        /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
        /* lockdep bookkeeping, present only when CONFIG_LOCKDEP is enabled */
        char                        *lock_name;
        struct lock_class_key        key;
        struct lockdep_map        __lockdep_map;
        struct lockdep_map        *lockdep_map;
#endif
        char                        name[WQ_NAME_LEN]; /* I: workqueue name */

        /*
         * Destruction of workqueue_struct is RCU protected to allow walking
         * the workqueues list without grabbing wq_pool_mutex.
         * This is used to dump all workqueues from sysrq.
         */
        struct rcu_head                rcu;

        /* hot fields used during command issue, aligned to cacheline */
        unsigned int                flags ____cacheline_aligned; /* WQ: WQ_* flags */
        struct pool_workqueue __rcu * __percpu *cpu_pwq; /* I: per-cpu pwqs */
        /* flexible array member: keep last */
        struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */
};

/*
 * Each pod type describes how CPUs should be grouped for unbound workqueues.
 * See the comment above workqueue_attrs->affn_scope.
 *
 * The three pointer members are lookup tables.
 * NOTE(review): presumably ->pod_cpus and ->pod_node hold ->nr_pods entries
 * and ->cpu_pod is indexed by CPU number - confirm where they are allocated.
 */
struct wq_pod_type {
        int                        nr_pods;        /* number of pods */
        cpumask_var_t                *pod_cpus;        /* pod -> cpus */
        int                        *pod_node;        /* pod -> node */
        int                        *cpu_pod;        /* cpu -> pod */
};

/*
 * Three u32 values describing a work item.
 * NOTE(review): going by the name, this is presumably the unpacked form of
 * work_struct->data while the work item is off queue - confirm against the
 * helpers that fill it in.
 */
struct work_offq_data {
        u32                        pool_id;        /* presumably a worker_pool ID - TODO confirm */
        u32                        disable;        /* presumably a disable count - TODO confirm */
        u32                        flags;                /* presumably off-queue flag bits - TODO confirm */
};

/*
 * Human-readable name of each affinity scope, indexed by the WQ_AFFN_*
 * constants.  The table has WQ_AFFN_NR_TYPES slots; a slot without an
 * initializer below would be NULL.
 */
static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
        [WQ_AFFN_DFL]                = "default",
        [WQ_AFFN_CPU]                = "cpu",
        [WQ_AFFN_SMT]                = "smt",
        [WQ_AFFN_CACHE]                = "cache",
        [WQ_AFFN_NUMA]                = "numa",
        [WQ_AFFN_SYSTEM]        = "system",
};

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 * ULONG_MAX indicates that the user hasn't overridden it with a boot parameter.
 * The actual value is initialized in wq_cpu_intensive_thresh_init().
 *
 * Exposed as the "cpu_intensive_thresh_us" module parameter with mode 0644.
 */
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT
/*
 * Only built with CONFIG_WQ_CPU_INTENSIVE_REPORT; defaults to 4 and is
 * exposed as "cpu_intensive_warning_thresh" (mode 0644).
 * NOTE(review): presumably the number of threshold violations tolerated
 * before a warning is printed - confirm against the reporting code.
 */
static unsigned int wq_cpu_intensive_warning_thresh = 4;
module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644);
#endif

/*
 * see the comment above the definition of WQ_POWER_EFFICIENT
 *
 * The default follows CONFIG_WQ_POWER_EFFICIENT_DEFAULT.  The
 * "power_efficient" parameter is registered with mode 0444, i.e. without
 * write permission bits.
 */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

static bool wq_online;                        /* can kworkers be created yet? */
/* NOTE(review): presumably set once the pod topology tables are built - confirm */
static bool wq_topo_initialized __read_mostly = false;

/* NOTE(review): presumably the slab cache for struct pool_workqueue - confirm at the allocation site */
static struct kmem_cache *pwq_cache;

/* one pod description per affinity scope, WQ_AFFN_NR_TYPES slots */
static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
/* affinity scope used by default; starts out as WQ_AFFN_CACHE */
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;

/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;

static DEFINE_MUTEX(wq_pool_mutex);        /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
static DEFINE_RAW_SPINLOCK(wq_mayday_lock);        /* protects wq->maydays list */
/* wait for manager to go away */
static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);

static LIST_HEAD(workqueues);                /* PR: list of all workqueues */
static bool workqueue_freezing;                /* PL: have wqs started freezing? */

/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
static cpumask_var_t wq_online_cpumask;

/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;

/* PL: user requested unbound cpumask via sysfs */
static cpumask_var_t wq_requested_unbound_cpumask;

/* PL: isolated cpumask to be excluded from unbound cpumask */
static cpumask_var_t wq_isolated_cpumask;

/*
 * Further constrains wq_unbound_cpumask according to a cmdline parameter.
 * Marked __initdata, so it is only valid during init.
 */
static struct cpumask wq_cmdline_cpumask __initdata;

/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);

/*
 * Local execution of unbound work items is no longer guaranteed.  The
 * following always forces round-robin CPU selection on unbound work items
 * to uncover usages which depend on it.
 *
 * The default is true when CONFIG_DEBUG_WQ_FORCE_RR_CPU is set and false
 * otherwise.  Either way the value is exposed as the "debug_force_rr_cpu"
 * module parameter with mode 0644.
 */
#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
static bool wq_debug_force_rr_cpu = true;
#else
static bool wq_debug_force_rr_cpu = false;
#endif
module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);

/*
 * to raise softirq for the BH worker pools on other CPUs
 * (one irq_work per standard pool on every CPU)
 */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], bh_pool_irq_works);

/* the BH worker pools: NR_STD_WORKER_POOLS of them on every CPU */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], bh_worker_pools);

/* the per-cpu worker pools: NR_STD_WORKER_POOLS of them on every CPU */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);

static DEFINE_IDR(worker_pool_idr);        /* PR: idr of all pools */

/* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);

/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];

/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

/*
 * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
 * process context while holding a pool lock. Bounce to a dedicated kthread
 * worker to avoid A-A deadlocks.
 */
static struct kthread_worker *pwq_release_worker __ro_after_init;

/*
 * System-wide workqueue pointers exported to other kernel code.  system_wq is
 * exported with EXPORT_SYMBOL(); all the others are GPL-only exports.
 *
 * NOTE(review): system_bh_wq and system_bh_highpri_wq are the only pointers
 * here not marked __ro_after_init - confirm whether that is intentional.
 */
struct workqueue_struct *system_wq __ro_after_init;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
struct workqueue_struct *system_bh_wq;
EXPORT_SYMBOL_GPL(system_bh_wq);
struct workqueue_struct *system_bh_highpri_wq;
EXPORT_SYMBOL_GPL(system_bh_highpri_wq);

/* Forward declarations of file-local functions used before their definitions. */
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
static void show_one_worker_pool(struct worker_pool *pool);

#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>

/*
 * Lockdep assertion: complain unless rcu_read_lock_any_held() reports true
 * or wq_pool_mutex is held.
 */
#define assert_rcu_or_pool_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU or wq_pool_mutex should be held")

/*
 * Lockdep assertion: complain unless rcu_read_lock_any_held() reports true,
 * or @wq->mutex is held, or wq_pool_mutex is held.  @wq is parenthesized in
 * the expansion so that any pointer expression can be passed safely.
 */
#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() &&                        \
                         !lockdep_is_held(&(wq)->mutex) &&                \
                         !lockdep_is_held(&wq_pool_mutex),                \
                         "RCU, wq->mutex or wq_pool_mutex should be held")

/*
 * Walk @pool over the NR_STD_WORKER_POOLS BH worker pools of @cpu.  The
 * per-cpu array decays to a pointer to its first element; the walk stops one
 * past the last element.
 */
#define for_each_bh_worker_pool(pool, cpu)                                \
        for ((pool) = per_cpu(bh_worker_pools, cpu);                        \
             (pool) < per_cpu(bh_worker_pools, cpu) + NR_STD_WORKER_POOLS; \
             (pool)++)

/*
 * Walk @pool over the NR_STD_WORKER_POOLS per-cpu worker pools of @cpu.  The
 * per-cpu array decays to a pointer to its first element; the walk stops one
 * past the last element.
 */
#define for_each_cpu_worker_pool(pool, cpu)                                \
        for ((pool) = per_cpu(cpu_worker_pools, cpu);                        \
             (pool) < per_cpu(cpu_worker_pools, cpu) + NR_STD_WORKER_POOLS; \
             (pool)++)

/**
 * for_each_pool - iterate through all worker_pools in the system
 * @pool: iteration cursor
 * @pi: integer used for iteration
 *
 * This must be called either with wq_pool_mutex held or RCU read
 * locked.  If the pool needs to be used beyond the locking in effect, the
 * caller is responsible for guaranteeing that the pool stays online.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.  The statement expression runs assert_rcu_or_pool_mutex() and
 * then evaluates to false, so the empty if-branch is never taken and the
 * caller's loop body binds to the trailing else.
 */
#define for_each_pool(pool, pi)                                                \
        idr_for_each_entry(&worker_pool_idr, pool, pi)                        \
                if (({ assert_rcu_or_pool_mutex(); false; })) { }        \
                else

/**
 * for_each_pool_worker - iterate through all workers of a worker_pool
 * @worker: iteration cursor
 * @pool: worker_pool to iterate workers of
 *
 * This must be called with wq_pool_attach_mutex held.
 *
 * The if/else clause exists only for the lockdep assertion and can be
 * ignored.  The statement expression runs lockdep_assert_held() and then
 * evaluates to false, so the empty if-branch is never taken and the caller's
 * loop body binds to the trailing else.
 */
#define for_each_pool_worker(worker, pool)                                \
        list_for_each_entry((worker), &(pool)->workers, node)                \
                if (({ lockdep_assert_held(&wq_pool_attach_mutex); false; })) { } \
                else

/**
 * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
 * @pwq: iteration cursor
 * @wq: the target workqueue
 *
 * This must be called either with wq->mutex held or RCU read locked.
 * If the pwq needs to be used beyond the locking in effect, the caller is
 * responsible for guaranteeing that the pwq stays online.
 *
 * The lockdep_is_held() expression is passed to list_for_each_entry_rcu() as
 * its lockdep condition; it exists only for lock checking.  @wq is
 * parenthesized everywhere it is expanded so that any pointer expression can
 * be passed.
 */
#define for_each_pwq(pwq, wq)                                                \
        list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,                \
                                 lockdep_is_held(&(wq)->mutex))

#ifdef CONFIG_DEBUG_OBJECTS_WORK

/* Declared early: the fixup callbacks below refer to the descriptor. */
static const struct debug_obj_descr work_debug_descr;

/* debug_hint callback: describe a work item by its work function. */
static void *work_debug_hint(void *addr)
{
        struct work_struct *work = addr;

        return work->func;
}

/* is_static_object callback: true if WORK_STRUCT_STATIC_BIT is set in the work's data. */
static bool work_is_static_object(void *addr)
{
        return test_bit(WORK_STRUCT_STATIC_BIT,
                        work_data_bits((struct work_struct *)addr));
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 *
 * For an active object the work is cancelled synchronously and then
 * re-initialized; %true is returned.  Any other state is left alone and
 * %false is returned.
 */
static bool work_fixup_init(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(work);
        debug_object_init(work, &work_debug_descr);
        return true;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 *
 * For an active object the work is cancelled synchronously and then its
 * debug object is freed; %true is returned.  Any other state is left alone
 * and %false is returned.
 */
static bool work_fixup_free(void *addr, enum debug_obj_state state)
{
        struct work_struct *work = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        cancel_work_sync(work);
        debug_object_free(work, &work_debug_descr);
        return true;
}

/*
 * Descriptor passed to the debug_object_*() calls in this file for
 * work_struct objects.  It wires up the hint, static-object and fixup
 * callbacks.
 */
static const struct debug_obj_descr work_debug_descr = {
        .name                = "work_struct",
        .debug_hint        = work_debug_hint,
        .is_static_object = work_is_static_object,
        .fixup_init        = work_fixup_init,
        .fixup_free        = work_fixup_free,
};

/* Tell debugobjects that @work is being activated. */
static inline void debug_work_activate(struct work_struct *work)
{
        debug_object_activate(work, &work_debug_descr);
}

/* Tell debugobjects that @work is being deactivated. */
static inline void debug_work_deactivate(struct work_struct *work)
{
        debug_object_deactivate(work, &work_debug_descr);
}

/*
 * Register @work with debugobjects.  A non-zero @onstack selects the
 * on-stack flavor of the init call.
 */
void __init_work(struct work_struct *work, int onstack)
{
        if (!onstack) {
                debug_object_init(work, &work_debug_descr);
                return;
        }

        debug_object_init_on_stack(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(__init_work);

/*
 * Drop the debugobjects tracking of @work.  Presumably the counterpart of
 * __init_work() with @onstack set - confirm against callers.
 */
void destroy_work_on_stack(struct work_struct *work)
{
        debug_object_free(work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_work_on_stack);

/*
 * Delayed-work variant: destroy the embedded on-stack timer first, then drop
 * the debugobjects tracking of the embedded work item.
 */
void destroy_delayed_work_on_stack(struct delayed_work *work)
{
        destroy_timer_on_stack(&work->timer);
        debug_object_free(&work->work, &work_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_delayed_work_on_stack);

#else
/* !CONFIG_DEBUG_OBJECTS_WORK: the activation hooks are empty stubs. */
static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif

/**
 * worker_pool_assign_id - allocate ID and assign it to @pool
 * @pool: the pool pointer of interest
 *
 * Must be called with wq_pool_mutex held.
 *
 * Returns 0 if ID in [0, WORK_OFFQ_POOL_NONE) is allocated and assigned
 * successfully, -errno on failure.
 */
static int worker_pool_assign_id(struct worker_pool *pool)
{
        int id;

        lockdep_assert_held(&wq_pool_mutex);

        id = idr_alloc(&worker_pool_idr, pool, 0, WORK_OFFQ_POOL_NONE,
                       GFP_KERNEL);
        if (id < 0)
                return id;        /* propagate the idr_alloc() error */

        pool->id = id;
        return 0;
}

/*
 * Return the address of the pwq pointer to use for @cpu on @wq: the per-cpu
 * slot for @cpu >= 0, the default pwq slot for a negative @cpu.
 */
static struct pool_workqueue __rcu **
unbound_pwq_slot(struct workqueue_struct *wq, int cpu)
{
        if (cpu < 0)
                return &wq->dfl_pwq;

        return per_cpu_ptr(wq->cpu_pwq, cpu);
}

/* @cpu < 0 for dfl_pwq */
static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu)
{
        return rcu_dereference_check(*unbound_pwq_slot(wq, cpu),
                                     lockdep_is_held(&wq_pool_mutex) ||
                                     lockdep_is_held(&wq->mutex));
}

/**
 * unbound_effective_cpumask - effective cpumask of an unbound workqueue
 * @wq: workqueue of interest
 *
 * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which
 * is masked with wq_unbound_cpumask to determine the effective cpumask. The
 * default pwq is always mapped to the pool with the current effective cpumask.
 */
static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq)
{
        return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask;
}

/* Place @color at the color field position of the work data flags. */
static unsigned int work_color_to_flags(int color)
{
        unsigned int color_flags = color << WORK_STRUCT_COLOR_SHIFT;

        return color_flags;
}

/* Extract the WORK_STRUCT_COLOR_BITS wide color field from @work_data. */
static int get_work_color(unsigned long work_data)
{
        const unsigned long color_mask = (1 << WORK_STRUCT_COLOR_BITS) - 1;

        return (work_data >> WORK_STRUCT_COLOR_SHIFT) & color_mask;
}

/* Return the color following @color, wrapping around at WORK_NR_COLORS. */
static int work_next_color(int color)
{
        int next = color + 1;

        return next % WORK_NR_COLORS;
}

/* Off-queue flags implied by @pool: WORK_OFFQ_BH for a BH pool, else none. */
static unsigned long pool_offq_flags(struct worker_pool *pool)
{
        if (pool->flags & POOL_BH)
                return WORK_OFFQ_BH;

        return 0;
}

/*
 * While queued, %WORK_STRUCT_PWQ is set and non-flag bits of a work's data
 * contain the pointer to the queued pwq.  Once execution starts, the flag
 * is cleared and the high bits contain OFFQ flags and pool ID.
 *
 * set_work_pwq(), set_work_pool_and_keep_pending(),
 * set_work_pool_and_clear_pending() and mark_work_canceling() can be used to
 * set the pwq, pool or clear work->data. These functions should only be
 * called while the work is owned - ie. while the PENDING bit is set.
 *
 * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq
 * corresponding to a work.  Pool is available once the work has been
 * queued anywhere after initialization until it is sync canceled.  pwq is
 * available only while the work item is queued.
 */
static inline void set_work_data(struct work_struct *work, unsigned long data)
{
        unsigned long val;

        /* only the owner, i.e. whoever holds PENDING, may update the data */
        WARN_ON_ONCE(!work_pending(work));

        /* OR in whatever work_static() reports for @work */
        val = data | work_static(work);
        atomic_long_set(&work->data, val);
}

/*
 * Point @work at @pwq: store the pwq address together with PENDING, the PWQ
 * marker and the caller-supplied @flags.
 */
static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq,
                         unsigned long flags)
{
        unsigned long data = (unsigned long)pwq;

        data |= WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | flags;
        set_work_data(work, data);
}

/*
 * Record @pool_id and @flags in @work's data while leaving the PENDING bit
 * set.
 */
static void set_work_pool_and_keep_pending(struct work_struct *work,
                                           int pool_id, unsigned long flags)
{
        unsigned long data = (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT;

        data |= WORK_STRUCT_PENDING | flags;
        set_work_data(work, data);
}

/*
 * Record @pool_id and @flags in @work's data without OR-ing in
 * WORK_STRUCT_PENDING.  The store is bracketed by a write barrier before
 * and a full barrier after; both are explained inline.
 */
static void set_work_pool_and_clear_pending(struct work_struct *work,
                                            int pool_id, unsigned long flags)
{
        /*
         * The following wmb is paired with the implied mb in
         * test_and_set_bit(PENDING) and ensures all updates to @work made
         * here are visible to and precede any updates by the next PENDING
         * owner.
         */
        smp_wmb();
        set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) |
                      flags);
        /*
         * The following mb guarantees that previous clear of a PENDING bit
         * will not be reordered with any speculative LOADS or STORES from
         * work->current_func, which is executed afterwards.  This possible
         * reordering can lead to a missed execution on attempt to queue
         * the same @work.  E.g. consider this case:
         *
         *   CPU#0                         CPU#1
         *   ----------------------------  --------------------------------
         *
         * 1  STORE event_indicated
         * 2  queue_work_on() {
         * 3    test_and_set_bit(PENDING)
         * 4 }                             set_..._and_clear_pending() {
         * 5                                 set_work_data() # clear bit
         * 6                                 smp_mb()
         * 7                               work->current_func() {
         * 8                                      LOAD event_indicated
         *                                   }
         *
         * Without an explicit full barrier speculative LOAD on line 8 can
         * be executed before CPU#0 does STORE on line 1.  If that happens,
         * CPU#0 observes the PENDING bit is still set and new execution of
         * a @work is not queued in a hope, that CPU#1 will eventually
         * finish the queued @work.  Meanwhile CPU#1 does not see
         * event_indicated is set, because speculative LOAD was executed
         * before actual STORE.
         */
        smp_mb();
}

/* Mask the flag bits off @data and interpret the rest as a pwq pointer. */
static inline struct pool_workqueue *work_struct_pwq(unsigned long data)
{
        unsigned long addr = data & WORK_STRUCT_PWQ_MASK;

        return (struct pool_workqueue *)addr;
}

/*
 * Return the pwq recorded in @work's data, or %NULL when the data does not
 * carry a pwq pointer (WORK_STRUCT_PWQ clear).
 */
static struct pool_workqueue *get_work_pwq(struct work_struct *work)
{
        unsigned long data = atomic_long_read(&work->data);

        if (!(data & WORK_STRUCT_PWQ))
                return NULL;

        return work_struct_pwq(data);
}

/**
 * get_work_pool - return the worker_pool a given work was associated with
 * @work: the work item of interest
 *
 * Pools are created and destroyed under wq_pool_mutex, and read access is
 * allowed under RCU read lock.  Call this function with wq_pool_mutex held
 * or from inside a rcu_read_lock() region.
 *
 * Every field of the returned pool may be accessed while that locking is in
 * effect.  A caller that needs the pool beyond the critical section must
 * itself ensure that the pool is and stays online.
 *
 * Return: The worker_pool @work was last associated with.  %NULL if none.
 */
static struct worker_pool *get_work_pool(struct work_struct *work)
{
        unsigned long bits = atomic_long_read(&work->data);
        int pool_id;

        assert_rcu_or_pool_mutex();

        /* the data carries a pwq pointer, which knows its pool */
        if (bits & WORK_STRUCT_PWQ)
                return work_struct_pwq(bits)->pool;

        /* otherwise the bits from WORK_OFFQ_POOL_SHIFT up hold a pool ID */
        pool_id = bits >> WORK_OFFQ_POOL_SHIFT;

        return pool_id != WORK_OFFQ_POOL_NONE ?
                idr_find(&worker_pool_idr, pool_id) : NULL;
}

/* Extract the @bits wide field that starts at bit @shift of @v. */
static unsigned long shift_and_mask(unsigned long v, u32 shift, u32 bits)
{
        const u32 field_mask = (1U << bits) - 1;

        return (v >> shift) & field_mask;
}

/*
 * Decode off-queue work data @data into @offqd.  @data is not expected to
 * carry WORK_STRUCT_PWQ; a warning is issued once if it does.
 */
static void work_offqd_unpack(struct work_offq_data *offqd, unsigned long data)
{
        WARN_ON_ONCE(data & WORK_STRUCT_PWQ);

        /* off-queue flag bits are taken as they are */
        offqd->flags = data & WORK_OFFQ_FLAG_MASK;

        /* the disable and pool ID fields are extracted by shift and width */
        offqd->disable = shift_and_mask(data, WORK_OFFQ_DISABLE_SHIFT,
                                        WORK_OFFQ_DISABLE_BITS);
        offqd->pool_id = shift_and_mask(data, WORK_OFFQ_POOL_SHIFT,
                                        WORK_OFFQ_POOL_BITS);
}

/*
 * Re-encode the disable field and the flags of @offqd into work data bits.
 * The pool ID is not included.
 */
static unsigned long work_offqd_pack_flags(struct work_offq_data *offqd)
{
        unsigned long packed = (unsigned long)offqd->flags;

        packed |= (unsigned long)offqd->disable << WORK_OFFQ_DISABLE_SHIFT;
        return packed;
}

/*
 * Policy functions.  These define the policies on how the global worker
 * pools are managed.  Unless noted otherwise, these functions assume that
 * they're being called with pool->lock held.
 */

/*
 * Need to wake up a worker?  Called from anything but currently
 * running workers.
 *
 * Note that, because unbound workers never contribute to nr_running, this
 * function will always return %true for unbound pools as long as the
 * worklist isn't empty.
 */
static bool need_more_worker(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && !pool->nr_running;
}

/*
 * Can I start working?  Called from busy but !running workers.  True as
 * long as the pool has at least one idle worker.
 */
static bool may_start_working(struct worker_pool *pool)
{
        return pool->nr_idle != 0;
}

/* Do I need to keep working?  Called from currently running workers. */
static bool keep_working(struct worker_pool *pool)
{
        return !list_empty(&pool->worklist) && (pool->nr_running <= 1);
}

/* Do we need a new worker?  Called from manager. */
static bool need_to_create_worker(struct worker_pool *pool)
{
        return need_more_worker(pool) && !may_start_working(pool);
}

/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
        bool managing = pool->flags & POOL_MANAGER_ACTIVE;
        int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
        int nr_busy = pool->nr_workers - nr_idle;

        return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);

        /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
                pool->nr_running--;
        }

        worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.  Must be
 * called with the pool's lock held.
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int prev_flags = worker->flags;

        lockdep_assert_held(&pool->lock);

        worker->flags &= ~flags;

        /* nothing to account unless a NOT_RUNNING bit was cleared */
        if (!(flags & WORKER_NOT_RUNNING))
                return;

        /*
         * If transitioning out of NOT_RUNNING, increment nr_running.  Note
         * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
         * of multiple flags, not a single flag.
         */
        if ((prev_flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING))
                pool->nr_running++;
}

/*
 * Return the first idle worker, or %NULL if the idle list is empty.  Called
 * with pool->lock held.
 */
static struct worker *first_idle_worker(struct worker_pool *pool)
{
        struct list_head *idle_head = &pool->idle_list;

        return unlikely(list_empty(idle_head)) ? NULL :
                list_first_entry(idle_head, struct worker, entry);
}

/**
 * worker_enter_idle - enter idle state
 * @worker: worker which is entering idle state
 *
 * @worker is entering idle state.  Mark it idle, account it, put it at the
 * head of the pool's idle list and arm the idle timer if necessary.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_enter_idle(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* must not already be idle */
        if (WARN_ON_ONCE(worker->flags & WORKER_IDLE))
                return;

        /* must not be on a list via ->entry while ->hentry is also in use */
        if (WARN_ON_ONCE(!list_empty(&worker->entry) &&
                         (worker->hentry.next || worker->hentry.pprev)))
                return;

        /* can't use worker_set_flags(), also called from create_worker() */
        worker->flags |= WORKER_IDLE;
        pool->nr_idle++;
        worker->last_active = jiffies;

        /* idle_list is LIFO */
        list_add(&worker->entry, &pool->idle_list);

        /* arm the idle timer unless it is already pending */
        if (too_many_workers(pool)) {
                if (!timer_pending(&pool->idle_timer))
                        mod_timer(&pool->idle_timer,
                                  jiffies + IDLE_WORKER_TIMEOUT);
        }

        /* Sanity check nr_running. */
        WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running);
}

/**
 * worker_leave_idle - leave idle state
 * @worker: worker which is leaving idle state
 *
 * @worker is leaving idle state.  Clear its idle flag, account it and take
 * it off the idle list.  Warns once and does nothing if @worker isn't idle.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock).
 */
static void worker_leave_idle(struct worker *worker)
{
        if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
                return;

        worker_clr_flags(worker, WORKER_IDLE);
        worker->pool->nr_idle--;
        list_del_init(&worker->entry);
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
 * @work: work to find worker for
 *
 * Find a worker which is executing @work on @pool by searching
 * @pool->busy_hash which is keyed by the address of @work.  For a worker
 * to match, its current execution should match the address of @work and
 * its work function.  This is to avoid unwanted dependency between
 * unrelated work executions through a work item being recycled while still
 * being executed.
 *
 * This is a bit tricky.  A work item may be freed once its execution
 * starts and nothing prevents the freed area from being recycled for
 * another work item.  If the same work item address ends up being reused
 * before the original execution finishes, workqueue will identify the
 * recycled work item as currently executing and make it wait until the
 * current execution finishes, introducing an unwanted dependency.
 *
 * This function checks the work item address and work function to avoid
 * false positives.  Note that this isn't complete as one may construct a
 * work function which can introduce dependency onto itself through a
 * recycled work item.  Well, if somebody wants to shoot oneself in the
 * foot that badly, there's only so much we can do, and if such deadlock
 * actually occurs, it should be easy to locate the culprit work function.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 *
 * Return:
 * Pointer to worker which is executing @work if found, %NULL
 * otherwise.
 */
static struct worker *find_worker_executing_work(struct worker_pool *pool,
                                                 struct work_struct *work)
{
        struct worker *worker;

        hash_for_each_possible(pool->busy_hash, worker, hentry,
                               (unsigned long)work)
                if (worker->current_work == work &&
                    worker->current_func == work->func)
                        return worker;

        return NULL;
}

/**
 * move_linked_works - move linked works to a list
 * @work: start of series of works to be scheduled
 * @head: target list to append @work to
 * @nextp: out parameter for nested worklist walking
 *
 * Schedule linked works starting from @work to @head. Work series to be
 * scheduled starts at @work and includes any consecutive work with
 * WORK_STRUCT_LINKED set in its predecessor. See assign_work() for details on
 * @nextp.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void move_linked_works(struct work_struct *work, struct list_head *head,
                              struct work_struct **nextp)
{
        struct work_struct *n;

        /*
         * Linked worklist will always end before the end of the list,
         * use NULL for list head.
         */
        list_for_each_entry_safe_from(work, n, NULL, entry) {
                /* @work changes lists here; the walk continues through @n */
                list_move_tail(&work->entry, head);
                /* the series ends with the first work without LINKED set */
                if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
                        break;
        }

        /*
         * If we're already inside safe list traversal and have moved
         * multiple works to the scheduled queue, the next position
         * needs to be updated.
         */
        if (nextp)
                *nextp = n;
}

/**
 * assign_work - assign a work item and its linked work items to a worker
 * @work: work to assign
 * @worker: worker to assign to
 * @nextp: out parameter for nested worklist walking
 *
 * Assign @work and its linked work items to @worker. If @work is already being
 * executed by another worker in the same pool, it'll be punted there.
 *
 * If @nextp is not NULL, it's updated to point to the next work of the last
 * scheduled work. This allows assign_work() to be nested inside
 * list_for_each_entry_safe().
 *
 * Returns %true if @work was successfully assigned to @worker. %false if @work
 * was punted to another worker already executing it.
 */
static bool assign_work(struct work_struct *work, struct worker *worker,
                        struct work_struct **nextp)
{
        struct worker_pool *pool = worker->pool;
        struct list_head *target;
        struct worker *owner;

        lockdep_assert_held(&pool->lock);

        /*
         * A single work shouldn't be executed concurrently by multiple workers.
         * __queue_work() ensures that @work doesn't jump to a different pool
         * while still running in the previous pool. Here, we should ensure that
         * @work is not executed concurrently by multiple workers from the same
         * pool. Check whether anyone is already processing the work. If so,
         * defer the work to the currently executing one.
         */
        owner = find_worker_executing_work(pool, work);
        target = unlikely(owner) ? &owner->scheduled : &worker->scheduled;

        move_linked_works(work, target, nextp);

        /* assigned to @worker only if nobody else was executing @work */
        return !owner;
}

static struct irq_work *bh_pool_irq_work(struct worker_pool *pool)
{
        int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0;

        return &per_cpu(bh_pool_irq_works, pool->cpu)[high];
}

/*
 * Kick BH @pool.  On SMP, a pool that belongs to another CPU and isn't being
 * drained is kicked through its irq_work on that CPU.  Otherwise the softirq
 * matching the pool's priority is raised on the current CPU.
 */
static void kick_bh_pool(struct worker_pool *pool)
{
#ifdef CONFIG_SMP
        /* see drain_dead_softirq_workfn() for BH_DRAINING */
        if (unlikely(pool->cpu != smp_processor_id() &&
                     !(pool->flags & POOL_BH_DRAINING))) {
                irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                return;
        }
#endif
        raise_softirq_irqoff(pool->attrs->nice == HIGHPRI_NICE_LEVEL ?
                             HI_SOFTIRQ : TASKLET_SOFTIRQ);
}

/**
 * kick_pool - wake up an idle worker if necessary
 * @pool: pool to kick
 *
 * @pool may have pending work items. Wake up worker if necessary. Returns
 * whether a worker was woken up.
 *
 * Must be called with @pool->lock held.  For BH pools no task is woken;
 * kick_bh_pool() is called instead and %true is returned.
 */
static bool kick_pool(struct worker_pool *pool)
{
        struct worker *worker = first_idle_worker(pool);
        struct task_struct *p;

        lockdep_assert_held(&pool->lock);

        /* nothing to do unless a wakeup is needed and an idle worker exists */
        if (!need_more_worker(pool) || !worker)
                return false;

        if (pool->flags & POOL_BH) {
                kick_bh_pool(pool);
                return true;
        }

        p = worker->task;

#ifdef CONFIG_SMP
        /*
         * Idle @worker is about to execute @work and waking up provides an
         * opportunity to migrate @worker at a lower cost by setting the task's
         * wake_cpu field. Let's see if we want to move @worker to improve
         * execution locality.
         *
         * We're waking the worker that went idle the latest and there's some
         * chance that @worker is marked idle but hasn't gone off CPU yet. If
         * so, setting the wake_cpu won't do anything. As this is a best-effort
         * optimization and the race window is narrow, let's leave as-is for
         * now. If this becomes pronounced, we can skip over workers which are
         * still on cpu when picking an idle worker.
         *
         * If @pool has non-strict affinity, @worker might have ended up outside
         * its affinity scope. Repatriate.
         */
        if (!pool->attrs->affn_strict &&
            !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
                struct work_struct *work = list_first_entry(&pool->worklist,
                                                struct work_struct, entry);
                int wake_cpu = cpumask_any_and_distribute(pool->attrs->__pod_cpumask,
                                                          cpu_online_mask);
                /* only repatriate if an online CPU in the pod was found */
                if (wake_cpu < nr_cpu_ids) {
                        p->wake_cpu = wake_cpu;
                        /* accounted to the pwq of the first pending work */
                        get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
                }
        }
#endif
        wake_up_process(p);
        return true;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and report them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
/* maximum number of distinct work functions that can be tracked */
#define WCI_MAX_ENTS 128

/* one tracked work function */
struct wci_ent {
        work_func_t                func;                /* the violating work function */
        atomic64_t                cnt;                /* bumped on each report for @func */
        struct hlist_node        hash_node;        /* link in wci_hash */
};

static struct wci_ent wci_ents[WCI_MAX_ENTS];        /* backing storage for entries */
static int wci_nr_ents;                                /* number of wci_ents[] slots in use */
static DEFINE_RAW_SPINLOCK(wci_lock);                /* held while adding a new entry */
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS)); /* entries keyed by func address */

/*
 * Look up the tracking entry for @func in wci_hash.  Returns the entry, or
 * %NULL if @func isn't tracked yet.
 */
static struct wci_ent *wci_find_ent(work_func_t func)
{
        struct wci_ent *cur;

        hash_for_each_possible_rcu(wci_hash, cur, hash_node,
                                   (unsigned long)func)
                if (cur->func == func)
                        return cur;

        return NULL;
}

/*
 * Account one CPU-hogging incident for @func and print a warning when the
 * per-function count hits one of the reporting points.
 *
 * Lookups are done without wci_lock.  The lock is taken only to add a new
 * entry for a @func that isn't tracked yet, after which the lookup is
 * retried.
 */
static void wq_cpu_intensive_report(work_func_t func)
{
        struct wci_ent *ent;

restart:
        ent = wci_find_ent(func);
        if (ent) {
                u64 cnt;

                /*
                 * Start reporting from the warning_thresh and back off
                 * exponentially.  Print @cnt, the value that was tested
                 * against the threshold, rather than re-reading the counter
                 * which may have been bumped again in the meantime.
                 */
                cnt = atomic64_inc_return_relaxed(&ent->cnt);
                if (wq_cpu_intensive_warning_thresh &&
                    cnt >= wq_cpu_intensive_warning_thresh &&
                    is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh))
                        printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
                                        ent->func, wq_cpu_intensive_thresh_us,
                                        cnt);
                return;
        }

        /*
         * @func is a new violation. Allocate a new entry for it. If wci_ents[]
         * is exhausted, something went really wrong and we probably made enough
         * noise already.
         */
        if (wci_nr_ents >= WCI_MAX_ENTS)
                return;

        raw_spin_lock(&wci_lock);

        /* recheck under the lock, the table may have filled up meanwhile */
        if (wci_nr_ents >= WCI_MAX_ENTS) {
                raw_spin_unlock(&wci_lock);
                return;
        }

        /* someone else may have added @func while we were not holding the lock */
        if (wci_find_ent(func)) {
                raw_spin_unlock(&wci_lock);
                goto restart;
        }

        ent = &wci_ents[wci_nr_ents++];
        ent->func = func;
        atomic64_set(&ent->cnt, 0);
        hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

        raw_spin_unlock(&wci_lock);

        /* the entry exists now; go back and account this incident */
        goto restart;
}

#else        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
/* reporting is compiled out: nothing is tracked or printed */
static void wq_cpu_intensive_report(work_func_t func) {}
#endif        /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
 *
 * This function is called when a worker returns from schedule()
 *
 * It undoes what wq_worker_sleeping() did: unless the worker is marked
 * NOT_RUNNING, the pool's nr_running is incremented again, and the
 * worker's sleeping flag is cleared.
 */
void wq_worker_running(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        /* nothing to undo unless wq_worker_sleeping() marked us as sleeping */
        if (!READ_ONCE(worker->sleeping))
                return;

        /*
         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
         * and the nr_running increment below, we may ruin the nr_running reset
         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
         * pool. Protect against such race.
         */
        preempt_disable();
        if (!(worker->flags & WORKER_NOT_RUNNING))
                worker->pool->nr_running++;
        preempt_enable();

        /*
         * CPU intensive auto-detection cares about how long a work item hogged
         * CPU without sleeping. Reset the starting timestamp on wakeup.
         */
        worker->current_at = worker->task->se.sum_exec_runtime;

        /* cleared last, after the accounting above has been restored */
        WRITE_ONCE(worker->sleeping, 0);
}

/**
 * wq_worker_sleeping - a worker is going to sleep
 * @task: task going to sleep
 *
 * This function is called from schedule() when a busy worker is
 * going to sleep. It marks the worker as sleeping, drops it from its pool's
 * nr_running count under pool->lock and kicks the pool so that another
 * worker can take over if needed.
 */
void wq_worker_sleeping(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct worker_pool *pool;

        /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here, let's not access anything before
         * checking NOT_RUNNING.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool = worker->pool;

        /* Return if preempted before wq_worker_running() was reached */
        if (READ_ONCE(worker->sleeping))
                return;

        /* mark first so that wq_worker_tick() skips this worker from now on */
        WRITE_ONCE(worker->sleeping, 1);
        raw_spin_lock_irq(&pool->lock);

        /*
         * Recheck in case unbind_workers() preempted us. We don't
         * want to decrement nr_running after the worker is unbound
         * and nr_running has been reset.
         */
        if (worker->flags & WORKER_NOT_RUNNING) {
                raw_spin_unlock_irq(&pool->lock);
                return;
        }

        pool->nr_running--;
        /* account the wakeup to the pwq whose work item is going to sleep */
        if (kick_pool(pool))
                worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from sched_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 *
 * Charges one tick of CPU time to the pwq of the work item being executed
 * and, if that work item has been hogging the CPU for longer than
 * wq_cpu_intensive_thresh_us, marks the worker CPU_INTENSIVE.
 */
void wq_worker_tick(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);
        struct pool_workqueue *pwq = worker->current_pwq;
        struct worker_pool *pool = worker->pool;

        /* not executing any work item, nothing to account */
        if (!pwq)
                return;

        pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

        /* a zero threshold disables the auto-detection */
        if (!wq_cpu_intensive_thresh_us)
                return;

        /*
         * A concurrency managed worker which hogged the CPU for longer than
         * wq_cpu_intensive_thresh_us is automatically marked CPU_INTENSIVE so
         * that it stops stalling other concurrency-managed work items.
         */

        /* already excluded from concurrency management */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        /*
         * Set @worker->sleeping means that @worker is in the process of
         * switching out voluntarily and won't be contributing to
         * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
         * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
         * double decrements. The task is releasing the CPU anyway. Let's skip.
         * We probably want to make this prettier in the future.
         */
        if (READ_ONCE(worker->sleeping))
                return;

        /* still below the threshold since the last (re)start of the clock */
        if (worker->task->se.sum_exec_runtime - worker->current_at <
            wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
                return;

        raw_spin_lock(&pool->lock);

        worker_set_flags(worker, WORKER_CPU_INTENSIVE);
        wq_cpu_intensive_report(worker->current_func);
        pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

        if (kick_pool(pool))
                pwq->stats[PWQ_STAT_CM_WAKEUP]++;

        raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
 *
 * Report the work function most recently recorded for the worker behind
 * @task. The scheduler uses this to learn a worker's last known identity.
 *
 * CONTEXT:
 * raw_spin_lock_irq(rq->lock)
 *
 * This function is called during schedule() when a kworker is going
 * to sleep. It's used by psi to identify aggregation workers during
 * dequeuing, to allow periodic aggregation to shut-off when that
 * worker is the last task in the system or cgroup to go to sleep.
 *
 * No workqueue-related locking is taken here, so the value is only stable
 * when called from inside the scheduler's queuing and dequeuing paths, when
 * @task, which must be a kworker, is guaranteed to not be processing any
 * works.
 *
 * Return:
 * The last work function %current executed as a worker, NULL if it
 * hasn't executed any work yet.
 */
work_func_t wq_worker_last_func(struct task_struct *task)
{
        struct worker *self = kthread_data(task);
        work_func_t last = self->last_func;

        return last;
}

/**
 * wq_node_nr_active - Determine wq_node_nr_active to use
 * @wq: workqueue of interest
 * @node: NUMA node, can be %NUMA_NO_NODE
 *
 * Pick the wq_node_nr_active that @wq should use on @node. Returns:
 *
 * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
 *
 * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
 *
 * - Otherwise, node_nr_active[@node].
 */
static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
                                                   int node)
{
        int idx = node;

        if (wq->flags & WQ_UNBOUND) {
                /* the slot past the last real node serves %NUMA_NO_NODE */
                if (idx == NUMA_NO_NODE)
                        idx = nr_node_ids;

                return wq->node_nr_active[idx];
        }

        return NULL;
}

/**
 * wq_update_node_max_active - Update per-node max_actives to use
 * @wq: workqueue to update
 * @off_cpu: CPU that's going down, -1 if a CPU is not going down
 *
 * Recompute @wq->node_nr_active[]->max. @wq must be unbound. max_active is
 * split among nodes in proportion to the number of CPUs of @wq's effective
 * cpumask on each node, and every per-node value is clamped to
 * [@wq->min_active, max_active]. The %NUMA_NO_NODE slot always gets the full
 * max_active.
 */
static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu)
{
        struct cpumask *effective = unbound_effective_cpumask(wq);
        int min_active = READ_ONCE(wq->min_active);
        int max_active = READ_ONCE(wq->max_active);
        int total_cpus, node;

        lockdep_assert_held(&wq->mutex);

        if (!wq_topo_initialized)
                return;

        /* an outgoing CPU only matters if it is in @wq's effective cpumask */
        if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective))
                off_cpu = -1;

        total_cpus = cpumask_weight_and(effective, cpu_online_mask);
        if (off_cpu >= 0)
                total_cpus--;

        for_each_node(node) {
                /* If all CPUs of the wq get offline, use the default values */
                int node_max = min_active;

                if (likely(total_cpus)) {
                        int node_cpus;

                        node_cpus = cpumask_weight_and(effective,
                                                       cpumask_of_node(node));
                        if (off_cpu >= 0 && cpu_to_node(off_cpu) == node)
                                node_cpus--;

                        node_max = clamp(DIV_ROUND_UP(max_active * node_cpus,
                                                      total_cpus),
                                         min_active, max_active);
                }

                wq_node_nr_active(wq, node)->max = node_max;
        }

        wq_node_nr_active(wq, NUMA_NO_NODE)->max = max_active;
}

/**
 * get_pwq - get an extra reference on the specified pool_workqueue
 * @pwq: pool_workqueue to get
 *
 * Take one more reference on @pwq.  The caller must already guarantee a
 * positive refcnt on @pwq and must hold the matching pool->lock; a
 * non-positive refcnt triggers a one-time warning.
 */
static void get_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        /* taking a reference on a dead pwq indicates a caller bug */
        WARN_ON_ONCE(pwq->refcnt <= 0);

        ++pwq->refcnt;
}

/**
 * put_pwq - put a pool_workqueue reference
 * @pwq: pool_workqueue to put
 *
 * Release one reference of @pwq.  When the last reference goes away, the
 * release work of @pwq is queued on pwq_release_worker.  The caller should be
 * holding the matching pool->lock.
 */
static void put_pwq(struct pool_workqueue *pwq)
{
        lockdep_assert_held(&pwq->pool->lock);

        pwq->refcnt--;

        /*
         * @pwq can't be released under pool->lock, bounce to a dedicated
         * kthread_worker to avoid A-A deadlocks.
         */
        if (unlikely(!pwq->refcnt))
                kthread_queue_work(pwq_release_worker, &pwq->release_work);
}

/**
 * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
 * @pwq: pool_workqueue to put (can be %NULL)
 *
 * Wrapper around put_pwq() which takes and releases @pwq->pool->lock with
 * IRQs disabled.  A %NULL @pwq is silently ignored.
 */
static void put_pwq_unlocked(struct pool_workqueue *pwq)
{
        if (!pwq)
                return;

        /*
         * As both pwqs and pools are RCU protected, the
         * following lock operations are safe.
         */
        raw_spin_lock_irq(&pwq->pool->lock);
        put_pwq(pwq);
        raw_spin_unlock_irq(&pwq->pool->lock);
}

static bool pwq_is_empty(struct pool_workqueue *pwq)
{
        return !pwq->nr_active && list_empty(&pwq->inactive_works);
}

/*
 * Activate the inactive @work of @pwq: move it onto @pwq->pool->worklist with
 * move_linked_works() and clear its WORK_STRUCT_INACTIVE bit. Warns once if
 * @work isn't marked inactive. @pwq->nr_active is not touched here; the
 * callers account for it.
 */
static void __pwq_activate_work(struct pool_workqueue *pwq,
                                struct work_struct *work)
{
        unsigned long *wdb = work_data_bits(work);

        WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
        trace_workqueue_activate_work(work);
        /* the worklist is about to become non-empty, restart watchdog_ts */
        if (list_empty(&pwq->pool->worklist))
                pwq->pool->watchdog_ts = jiffies;
        move_linked_works(work, &pwq->pool->worklist, NULL);
        __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}

/*
 * Try to take one count from @nna without locking. @nna->max is sampled once
 * on entry; the increment is retried until it either succeeds (%true) or
 * @nna->nr is seen at or above that sample (%false).
 */
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
        int max = READ_ONCE(nna->max);
        int cur;

        for (;;) {
                cur = atomic_read(&nna->nr);
                if (cur >= max)
                        return false;

                /* lost the race if someone changed ->nr under us, retry */
                if (atomic_cmpxchg_relaxed(&nna->nr, cur, cur + 1) == cur)
                        return true;
        }
}

/**
 * pwq_tryinc_nr_active - Try to increment nr_active for a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Try to increment nr_active for @pwq. Returns %true if an nr_active count is
 * successfully obtained. %false otherwise.
 *
 * The caller must hold @pwq->pool->lock. For workqueues using a shared
 * wq_node_nr_active, $nna->lock may be taken nested inside it and, on failure,
 * @pwq may be left queued on $nna->pending_pwqs.
 */
static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill)
{
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
        bool obtained = false;

        lockdep_assert_held(&pool->lock);

        if (!nna) {
                /* BH or per-cpu workqueue, pwq->nr_active is sufficient */
                obtained = pwq->nr_active < READ_ONCE(wq->max_active);
                goto out;
        }

        /* a plugged pwq never obtains a count, see unplug_oldest_pwq() */
        if (unlikely(pwq->plugged))
                return false;

        /*
         * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is
         * already waiting on $nna, pwq_dec_nr_active() will maintain the
         * concurrency level. Don't jump the line.
         *
         * We need to ignore the pending test after max_active has increased as
         * pwq_dec_nr_active() can only maintain the concurrency level but not
         * increase it. This is indicated by @fill.
         */
        if (!list_empty(&pwq->pending_node) && likely(!fill))
                goto out;

        /* fast path: lockless attempt */
        obtained = tryinc_node_nr_active(nna);
        if (obtained)
                goto out;

        /*
         * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs
         * and try again. The smp_mb() is paired with the implied memory barrier
         * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either
         * we see the decremented $nna->nr or they see non-empty
         * $nna->pending_pwqs.
         */
        raw_spin_lock(&nna->lock);

        if (list_empty(&pwq->pending_node))
                list_add_tail(&pwq->pending_node, &nna->pending_pwqs);
        else if (likely(!fill))
                goto out_unlock;

        smp_mb();

        obtained = tryinc_node_nr_active(nna);

        /*
         * If @fill, @pwq might have already been pending. Being spuriously
         * pending in cold paths doesn't affect anything. Let's leave it be.
         */
        if (obtained && likely(!fill))
                list_del_init(&pwq->pending_node);

out_unlock:
        raw_spin_unlock(&nna->lock);
out:
        /* the per-pwq count is bumped for both the per-cpu and shared cases */
        if (obtained)
                pwq->nr_active++;
        return obtained;
}

/**
 * pwq_activate_first_inactive - Activate the first inactive work item on a pwq
 * @pwq: pool_workqueue of interest
 * @fill: max_active may have increased, try to increase concurrency level
 *
 * Take the head of @pwq->inactive_works, if any, and activate it provided an
 * nr_active count can be obtained for it.
 *
 * Returns %true if an inactive work item has been activated. %false if no
 * inactive work item is found or max_active limit is reached.
 */
static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill)
{
        struct work_struct *first;

        first = list_first_entry_or_null(&pwq->inactive_works,
                                         struct work_struct, entry);
        if (!first)
                return false;

        /* no nr_active count available, @first stays inactive */
        if (!pwq_tryinc_nr_active(pwq, fill))
                return false;

        __pwq_activate_work(pwq, first);
        return true;
}

/**
 * unplug_oldest_pwq - unplug the oldest pool_workqueue
 * @wq: workqueue_struct where its oldest pwq is to be unplugged
 *
 * This function should only be called for ordered workqueues where only the
 * oldest pwq is unplugged, the others are plugged to suspend execution to
 * ensure proper work item ordering::
 *
 *    dfl_pwq --------------+     [P] - plugged
 *                          |
 *                          v
 *    pwqs -> A -> B [P] -> C [P] (newest)
 *            |    |        |
 *            1    3        5
 *            |    |        |
 *            2    4        6
 *
 * When the oldest pwq is drained and removed, this function should be called
 * to unplug the next oldest one to start its work item execution. Note that
 * pwq's are linked into wq->pwqs with the oldest first, so the first one in
 * the list is the oldest.
 *
 * The caller must hold @wq->mutex. If @wq->pwqs is unexpectedly empty, a
 * one-time warning is emitted and nothing else is done.
 */
static void unplug_oldest_pwq(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;

        lockdep_assert_held(&wq->mutex);

        /*
         * Caller should make sure that pwqs isn't empty before calling. Don't
         * dereference a %NULL pwq if that promise is ever broken.
         */
        pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue,
                                       pwqs_node);
        if (WARN_ON_ONCE(!pwq))
                return;

        raw_spin_lock_irq(&pwq->pool->lock);
        if (pwq->plugged) {
                pwq->plugged = false;
                /* start executing the work items which piled up while plugged */
                if (pwq_activate_first_inactive(pwq, true))
                        kick_pool(pwq->pool);
        }
        raw_spin_unlock_irq(&pwq->pool->lock);
}

/**
 * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active
 * @nna: wq_node_nr_active to activate a pending pwq for
 * @caller_pool: worker_pool the caller is locking
 *
 * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked.
 * @caller_pool may be unlocked and relocked to lock other worker_pools.
 * On return, @caller_pool is locked again and no other pool lock is held.
 */
static void node_activate_pending_pwq(struct wq_node_nr_active *nna,
                                      struct worker_pool *caller_pool)
{
        /* the one pool lock we currently hold; starts out as the caller's */
        struct worker_pool *locked_pool = caller_pool;
        struct pool_workqueue *pwq;
        struct work_struct *work;

        lockdep_assert_held(&caller_pool->lock);

        raw_spin_lock(&nna->lock);
retry:
        pwq = list_first_entry_or_null(&nna->pending_pwqs,
                                       struct pool_workqueue, pending_node);
        if (!pwq)
                goto out_unlock;

        /*
         * If @pwq is for a different pool than @locked_pool, we need to lock
         * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock
         * / lock dance. For that, we also need to release @nna->lock as it's
         * nested inside pool locks.
         */
        if (pwq->pool != locked_pool) {
                raw_spin_unlock(&locked_pool->lock);
                locked_pool = pwq->pool;
                if (!raw_spin_trylock(&locked_pool->lock)) {
                        raw_spin_unlock(&nna->lock);
                        raw_spin_lock(&locked_pool->lock);
                        raw_spin_lock(&nna->lock);
                        /* pending_pwqs may have changed while unlocked */
                        goto retry;
                }
        }

        /*
         * $pwq may not have any inactive work items due to e.g. cancellations.
         * Drop it from pending_pwqs and see if there's another one.
         */
        work = list_first_entry_or_null(&pwq->inactive_works,
                                        struct work_struct, entry);
        if (!work) {
                list_del_init(&pwq->pending_node);
                goto retry;
        }

        /*
         * Acquire an nr_active count and activate the inactive work item. If
         * $pwq still has inactive work items, rotate it to the end of the
         * pending_pwqs so that we round-robin through them. This means that
         * inactive work items are not activated in queueing order which is fine
         * given that there has never been any ordering across different pwqs.
         */
        if (likely(tryinc_node_nr_active(nna))) {
                pwq->nr_active++;
                __pwq_activate_work(pwq, work);

                if (list_empty(&pwq->inactive_works))
                        list_del_init(&pwq->pending_node);
                else
                        list_move_tail(&pwq->pending_node, &nna->pending_pwqs);

                /* if activating a foreign pool, make sure it's running */
                if (pwq->pool != caller_pool)
                        kick_pool(pwq->pool);
        }

out_unlock:
        raw_spin_unlock(&nna->lock);
        /* hand the caller back the lock it came in with */
        if (locked_pool != caller_pool) {
                raw_spin_unlock(&locked_pool->lock);
                raw_spin_lock(&caller_pool->lock);
        }
}

/**
 * pwq_dec_nr_active - Retire an active count
 * @pwq: pool_workqueue of interest
 *
 * Drop one nr_active count of @pwq and try to put the freed count to use by
 * activating an inactive work item.
 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock.
 */
static void pwq_dec_nr_active(struct pool_workqueue *pwq)
{
        struct worker_pool *pool = pwq->pool;
        struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);

        lockdep_assert_held(&pool->lock);

        /* the per-pwq count is maintained for percpu and unbound alike */
        pwq->nr_active--;

        if (!nna) {
                /*
                 * For a percpu workqueue, it's simple. Just need to kick the
                 * first inactive work item on @pwq itself.
                 */
                pwq_activate_first_inactive(pwq, false);
                return;
        }

        /*
         * If @pwq is for an unbound workqueue, it's more complicated because
         * multiple pwqs and pools may be sharing the nr_active count. When a
         * pwq needs to wait for an nr_active count, it puts itself on
         * $nna->pending_pwqs. The following atomic_dec_return()'s implied
         * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to
         * guarantee that either we see non-empty pending_pwqs or they see
         * decremented $nna->nr.
         *
         * $nna->max may change as CPUs come online/offline and @pwq->wq's
         * max_active gets updated. However, it is guaranteed to be equal to or
         * larger than @pwq->wq->min_active which is above zero unless freezing.
         * This maintains the forward progress guarantee.
         */
        if (atomic_dec_return(&nna->nr) < READ_ONCE(nna->max) &&
            !list_empty(&nna->pending_pwqs))
                node_activate_pending_pwq(nna, pool);
}

/**
 * pwq_dec_nr_in_flight - decrement pwq's nr_in_flight
 * @pwq: pwq of interest
 * @work_data: work_data of work which left the queue
 *
 * A work item has either completed or been removed from the pending queue.
 * Drop it from the nr_in_flight count of its color on @pwq, take care of
 * workqueue flushing and release the pwq reference the work item held.
 *
 * NOTE:
 * For unbound workqueues, this function may temporarily drop @pwq->pool->lock
 * and thus should be called after all other state updates for the in-flight
 * work item is complete.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_data)
{
        int color = get_work_color(work_data);

        /* only work items which were active hold an nr_active count */
        if (!(work_data & WORK_STRUCT_INACTIVE))
                pwq_dec_nr_active(pwq);

        pwq->nr_in_flight[color]--;

        /*
         * Is a flush in progress, are we at the flushing tip and was this the
         * last in-flight work of that color?
         */
        if (unlikely(pwq->flush_color == color) && !pwq->nr_in_flight[color]) {
                /* this pwq is done, clear flush_color */
                pwq->flush_color = -1;

                /*
                 * If this was the last pwq, wake up the first flusher.  It
                 * will handle the rest.
                 */
                if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
                        complete(&pwq->wq->first_flusher->done);
        }

        put_pwq(pwq);
}

/**
 * try_to_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store irq state
 *
 * Try to grab PENDING bit of @work.  This function can handle @work in any
 * stable state - idle, on timer or on worklist.
 *
 * Return:
 *
 *  ========        ================================================================
 *  1                if @work was pending and we successfully stole PENDING
 *  0                if @work was idle and we claimed PENDING
 *  -EAGAIN        if PENDING couldn't be grabbed at the moment, safe to busy-retry
 *  ========        ================================================================
 *
 * Note:
 * On >= 0 return, the caller owns @work's PENDING bit.  To avoid getting
 * interrupted while holding PENDING and @work off queue, irq must be
 * disabled on entry.  This, combined with delayed_work->timer being
 * irqsafe, ensures that we return -EAGAIN for finite short period of time.
 *
 * On successful return, >= 0, irq is disabled and the caller is
 * responsible for releasing it using local_irq_restore(*@irq_flags).
 * On -EAGAIN, the irq state has already been restored by this function.
 *
 * This function is safe to call from any context including IRQ handler.
 */
static int try_to_grab_pending(struct work_struct *work, u32 cflags,
                               unsigned long *irq_flags)
{
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        local_irq_save(*irq_flags);

        /* try to steal the timer if it exists */
        if (cflags & WORK_CANCEL_DELAYED) {
                struct delayed_work *dwork = to_delayed_work(work);

                /*
                 * dwork->timer is irqsafe.  If timer_delete() fails, it's
                 * guaranteed that the timer is not queued anywhere and not
                 * running on the local CPU.
                 */
                if (likely(timer_delete(&dwork->timer)))
                        return 1;
        }

        /* try to claim PENDING the normal way */
        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return 0;

        rcu_read_lock();
        /*
         * The queueing is in progress, or it is already queued. Try to
         * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
         */
        pool = get_work_pool(work);
        if (!pool)
                goto fail;

        raw_spin_lock(&pool->lock);
        /*
         * work->data is guaranteed to point to pwq only while the work
         * item is queued on pwq->wq, and both updating work->data to point
         * to pwq on queueing and to pool on dequeueing are done under
         * pwq->pool->lock.  This in turn guarantees that, if work->data
         * points to pwq which is associated with a locked pool, the work
         * item is currently queued on that pool.
         */
        pwq = get_work_pwq(work);
        if (pwq && pwq->pool == pool) {
                /* snapshot before work->data gets overwritten below */
                unsigned long work_data = *work_data_bits(work);

                debug_work_deactivate(work);

                /*
                 * A cancelable inactive work item must be in the
                 * pwq->inactive_works since a queued barrier can't be
                 * canceled (see the comments in insert_wq_barrier()).
                 *
                 * An inactive work item cannot be deleted directly because
                 * it might have linked barrier work items which, if left
                 * on the inactive_works list, will confuse pwq->nr_active
                 * management later on and cause stall.  Move the linked
                 * barrier work items to the worklist when deleting the grabbed
                 * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
                 * it doesn't participate in nr_active management in later
                 * pwq_dec_nr_in_flight().
                 */
                if (work_data & WORK_STRUCT_INACTIVE)
                        move_linked_works(work, &pwq->pool->worklist, NULL);

                list_del_init(&work->entry);

                /*
                 * work->data points to pwq iff queued. Let's point to pool.
                 * This destroys the work->data needed by the next step, which
                 * is why it was stashed in @work_data above.
                 */
                set_work_pool_and_keep_pending(work, pool->id,
                                               pool_offq_flags(pool));

                /* must be the last step, see the function comment */
                pwq_dec_nr_in_flight(pwq, work_data);

                /* irq stays disabled; the caller restores it via @irq_flags */
                raw_spin_unlock(&pool->lock);
                rcu_read_unlock();
                return 1;
        }
        raw_spin_unlock(&pool->lock);
fail:
        rcu_read_unlock();
        local_irq_restore(*irq_flags);
        return -EAGAIN;
}

/**
 * work_grab_pending - steal work item from worklist and disable irq
 * @work: work item to steal
 * @cflags: %WORK_CANCEL_ flags
 * @irq_flags: place to store IRQ state
 *
 * Grab PENDING bit of @work. @work can be in any stable state - idle, on timer
 * or on worklist. try_to_grab_pending() is retried, with cpu_relax() in
 * between, for as long as it fails.
 *
 * Can be called from any context. IRQ is disabled on return with IRQ state
 * stored in *@irq_flags. The caller is responsible for re-enabling it using
 * local_irq_restore().
 *
 * Returns %true if @work was pending. %false if idle.
 */
static bool work_grab_pending(struct work_struct *work, u32 cflags,
                              unsigned long *irq_flags)
{
        int ret;

        do {
                ret = try_to_grab_pending(work, cflags, irq_flags);
                if (ret < 0)
                        cpu_relax();
        } while (ret < 0);

        return ret;
}

/**
 * insert_work - insert a work into a pool
 * @pwq: pwq @work belongs to
 * @work: work to insert
 * @head: insertion point
 * @extra_flags: extra WORK_STRUCT_* flags to set
 *
 * Insert @work which belongs to @pwq after @head.  @extra_flags is or'd to
 * work_struct flags.  A reference on @pwq is taken for the inserted work
 * item.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
                        struct list_head *head, unsigned int extra_flags)
{
        debug_work_activate(work);

        /* record the work call stack in order to print it in KASAN reports */
        kasan_record_aux_stack(work);

        /* we own @work, set data and link */
        set_work_pwq(work, pwq, extra_flags);
        list_add_tail(&work->entry, head);
        /* each queued work item pins @pwq */
        get_pwq(pwq);
}

/*
 * Test whether @work is being queued from another work executing on the
 * same workqueue.
 */
static bool is_chained_work(struct workqueue_struct *wq)
{
        struct worker *worker;

        worker = current_wq_worker();
        /*
         * Return %true iff I'm a worker executing a work item on @wq.  If
         * I'm @worker, it's safe to dereference it without locking.
         */
        return worker && worker->current_pwq->wq == wq;
}

/*
 * Pick the CPU to queue an unbound work item on. The local CPU is preferred
 * when wq_unbound_cpumask allows it.  Otherwise, round robin among the allowed
 * online ones to avoid perturbing sensitive tasks. If no allowed CPU is
 * online, @cpu is returned as is.
 */
static int wq_select_unbound_cpu(int cpu)
{
        int next;

        if (unlikely(wq_debug_force_rr_cpu))
                pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
        else if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
                return cpu;

        /* continue from where this CPU's previous round-robin pick left off */
        next = __this_cpu_read(wq_rr_cpu_last);
        next = cpumask_next_and(next, wq_unbound_cpumask, cpu_online_mask);
        if (unlikely(next >= nr_cpu_ids)) {
                /* ran off the end of the mask, wrap around to the first one */
                next = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
                if (unlikely(next >= nr_cpu_ids))
                        return cpu;
        }
        __this_cpu_write(wq_rr_cpu_last, next);

        return next;
}

/*
 * Select the pool_workqueue @work should go to on @wq and link @work onto
 * it, either on the pool's worklist or on the pwq's inactive list.
 *
 * @cpu is the requested CPU; WORK_CPU_UNBOUND lets this function choose.
 * Must be called with IRQs disabled (asserted below).  If @wq is being
 * destroyed or drained and the caller is not a work item of @wq, a warning
 * is emitted and @work is not queued.
 */
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
        struct pool_workqueue *pwq;
        struct worker_pool *last_pool, *pool;
        unsigned int work_flags;
        /* keep the caller's request; @cpu itself may be rewritten on each retry */
        unsigned int req_cpu = cpu;

        /*
         * While a work item is PENDING && off queue, a task trying to
         * steal the PENDING will busy-loop waiting for it to either get
         * queued or lose PENDING.  Grabbing PENDING and queueing should
         * happen with IRQ disabled.
         */
        lockdep_assert_irqs_disabled();

        /*
         * For a draining wq, only works from the same workqueue are
         * allowed. The __WQ_DESTROYING helps to spot the issue that
         * queues a new work item to a wq after destroy_workqueue(wq).
         */
        if (unlikely(wq->flags & (__WQ_DESTROYING | __WQ_DRAINING) &&
                     WARN_ONCE(!is_chained_work(wq), "workqueue: cannot queue %ps on wq %s\n",
                               work->func, wq->name))) {
                return;
        }
        rcu_read_lock();
retry:
        /* pwq which will be used unless @work is executing elsewhere */
        if (req_cpu == WORK_CPU_UNBOUND) {
                if (wq->flags & WQ_UNBOUND)
                        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
                else
                        cpu = raw_smp_processor_id();
        }

        pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu));
        pool = pwq->pool;

        /*
         * If @work was previously on a different pool, it might still be
         * running there, in which case the work needs to be queued on that
         * pool to guarantee non-reentrancy.
         *
         * For ordered workqueue, work items must be queued on the newest pwq
         * for accurate order management.  Guaranteed order also guarantees
         * non-reentrancy.  See the comments above unplug_oldest_pwq().
         */
        last_pool = get_work_pool(work);
        if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
                struct worker *worker;

                raw_spin_lock(&last_pool->lock);

                worker = find_worker_executing_work(last_pool, work);

                if (worker && worker->current_pwq->wq == wq) {
                        /*
                         * Still running on @last_pool for this @wq: queue
                         * there.  @last_pool->lock stays held; @pool is
                         * switched to it, so the unlock at "out" pairs up.
                         */
                        pwq = worker->current_pwq;
                        pool = pwq->pool;
                        WARN_ON_ONCE(pool != last_pool);
                } else {
                        /* meh... not running there, queue here */
                        raw_spin_unlock(&last_pool->lock);
                        raw_spin_lock(&pool->lock);
                }
        } else {
                raw_spin_lock(&pool->lock);
        }

        /*
         * pwq is determined and locked. For unbound pools, we could have raced
         * with pwq release and it could already be dead. If its refcnt is zero,
         * repeat pwq selection. Note that unbound pwqs never die without
         * another pwq replacing it in cpu_pwq or while work items are executing
         * on it, so the retrying is guaranteed to make forward-progress.
         */
        if (unlikely(!pwq->refcnt)) {
                if (wq->flags & WQ_UNBOUND) {
                        raw_spin_unlock(&pool->lock);
                        cpu_relax();
                        goto retry;
                }
                /* oops */
                WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                          wq->name, cpu);
        }

        /* pwq determined, queue */
        trace_workqueue_queue_work(req_cpu, pwq, work);

        /* @work is unexpectedly still linked somewhere: warn and back out */
        if (WARN_ON(!list_empty(&work->entry)))
                goto out;

        /* account @work under the pwq's current color and tag it with it */
        pwq->nr_in_flight[pwq->work_color]++;
        work_flags = work_color_to_flags(pwq->work_color);

        /*
         * Limit the number of concurrently active work items to max_active.
         * @work must also queue behind existing inactive work items to maintain
         * ordering when max_active changes. See wq_adjust_max_active().
         */
        if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
                /* worklist goes from empty to non-empty: restart the watchdog stamp */
                if (list_empty(&pool->worklist))
                        pool->watchdog_ts = jiffies;

                trace_workqueue_activate_work(work);
                insert_work(pwq, work, &pool->worklist, work_flags);
                kick_pool(pool);
        } else {
                /* can't activate now: park @work on the pwq's inactive list */
                work_flags |= WORK_STRUCT_INACTIVE;
                insert_work(pwq, work, &pwq->inactive_works, work_flags);
        }

out:
        raw_spin_unlock(&pool->lock);
        rcu_read_unlock();
}

/*
 * If @work's data word is not pointing at a pwq and has any of the
 * WORK_OFFQ_DISABLE_MASK bits set, give up the PENDING bit again while
 * keeping the recorded pool ID and off-queue flags, and return %true.
 * Otherwise leave @work alone and return %false.
 */
static bool clear_pending_if_disabled(struct work_struct *work)
{
        unsigned long bits = *work_data_bits(work);
        struct work_offq_data offqd;
        bool offq_disabled;

        offq_disabled = !(bits & WORK_STRUCT_PWQ) &&
                        (bits & WORK_OFFQ_DISABLE_MASK);
        if (likely(!offq_disabled))
                return false;

        /* rewrite the off-queue data from its unpacked form, dropping PENDING */
        work_offqd_unpack(&offqd, bits);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));
        return true;
}

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Queue @work on @wq for execution on @cpu.  The caller must ensure that
 * @cpu can't go away; callers that fail to do so will have the work
 * executed on a randomly chosen CPU.  Note well, however, that specifying
 * a CPU that has never been online will produce a splat.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
        unsigned long flags;
        bool queued;

        /* PENDING is grabbed and the work queued with IRQs off, see __queue_work() */
        local_irq_save(flags);

        queued = !test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) &&
                 !clear_pending_if_disabled(work);
        if (queued)
                __queue_work(cpu, wq, work);

        local_irq_restore(flags);
        return queued;
}
EXPORT_SYMBOL(queue_work_on);

/**
 * select_numa_node_cpu - Select a CPU based on NUMA node
 * @node: NUMA node ID that we want to select a CPU from
 *
 * Try to find a "random" CPU available on @node.  If @node has no CPU
 * available, WORK_CPU_UNBOUND is returned, indicating that the work should
 * simply be scheduled to any available CPU.
 */
static int select_numa_node_cpu(int node)
{
        int this_cpu, picked;

        /* Delay binding to CPU if node is not valid or online */
        if (node < 0 || node >= MAX_NUMNODES || !node_online(node))
                return WORK_CPU_UNBOUND;

        /* Already running on the requested node: stay on the local CPU */
        this_cpu = raw_smp_processor_id();
        if (cpu_to_node(this_cpu) == node)
                return this_cpu;

        /* Use "random", otherwise known as "first", online CPU of node */
        picked = cpumask_any_and(cpumask_of_node(node), cpu_online_mask);

        /* No online CPU on the node after all, just defer */
        if (picked >= nr_cpu_ids)
                return WORK_CPU_UNBOUND;

        return picked;
}

/**
 * queue_work_node - queue work on a "random" cpu for a given NUMA node
 * @node: NUMA node that we are targeting the work for
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Queue @work to a "random" CPU within @node.  The basic idea is to
 * provide a way to somehow associate work with a given NUMA node.
 *
 * This is a best effort attempt only.  If no node is requested or the
 * requested node is offline, this falls back to standard queue_work
 * behavior.
 *
 * Currently the "random" CPU ends up being the first available CPU in the
 * intersection of cpu_online_mask and the cpumask of the node, unless we
 * are running on the node. In that case we just use the current CPU.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_node(int node, struct workqueue_struct *wq,
                     struct work_struct *work)
{
        unsigned long flags;
        bool queued = false;

        /*
         * This current implementation is specific to unbound workqueues.
         * Specifically we only return the first available CPU for a given
         * node instead of cycling through individual CPUs within the node.
         *
         * If this is used with a per-cpu workqueue then the logic in
         * select_numa_node_cpu() would need to be updated to allow for
         * some round robin type logic.
         */
        WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND));

        local_irq_save(flags);

        /* somebody else already owns PENDING */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                goto out;

        /* @work is disabled; PENDING has been dropped again */
        if (clear_pending_if_disabled(work))
                goto out;

        __queue_work(select_numa_node_cpu(node), wq, work);
        queued = true;
out:
        local_irq_restore(flags);
        return queued;
}
EXPORT_SYMBOL_GPL(queue_work_node);

/*
 * Timer callback of a delayed_work: the delay has elapsed, so queue the
 * embedded work item on the workqueue and CPU recorded in the delayed_work.
 */
void delayed_work_timer_fn(struct timer_list *t)
{
        struct delayed_work *expired = from_timer(expired, t, timer);
        struct work_struct *work = &expired->work;

        /* should have been called from irqsafe timer with irq already off */
        __queue_work(expired->cpu, expired->wq, work);
}
EXPORT_SYMBOL(delayed_work_timer_fn);

/*
 * Arm @dwork's timer so that its work item gets queued on @wq / @cpu after
 * @delay jiffies, or queue the work item right away when @delay is zero.
 */
static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
                                struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        struct timer_list *timer = &dwork->timer;
        int timer_cpu;

        WARN_ON_ONCE(!wq);
        WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
        WARN_ON_ONCE(timer_pending(timer));
        WARN_ON_ONCE(!list_empty(&work->entry));

        /*
         * A zero @delay means "queue now".  This is for both optimization
         * and correctness: the earliest @timer can expire is on the closest
         * next tick and delayed_work users depend on there being no such
         * delay when @delay is 0.
         */
        if (!delay) {
                __queue_work(cpu, wq, work);
                return;
        }

        WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));

        /* remember the target for delayed_work_timer_fn() */
        dwork->wq = wq;
        dwork->cpu = cpu;
        timer->expires = jiffies + delay;

        if (!housekeeping_enabled(HK_TYPE_TIMER)) {
                if (likely(cpu == WORK_CPU_UNBOUND))
                        add_timer_global(timer);
                else
                        add_timer_on(timer, cpu);
                return;
        }

        /* If the current cpu is a housekeeping cpu, use it. */
        timer_cpu = smp_processor_id();
        if (!housekeeping_test_cpu(timer_cpu, HK_TYPE_TIMER))
                timer_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
        add_timer_on(timer, timer_cpu);
}

/**
 * queue_delayed_work_on - queue work on specific CPU after delay
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Queue @dwork to @cpu after @delay.  For non-zero delays the caller must
 * ensure that @cpu is online and can't go away.  Callers that fail to
 * ensure this may get @dwork->timer queued to an offlined CPU, which
 * prevents queueing of @dwork->work unless the offlined CPU becomes online
 * again.
 *
 * Return: %false if @dwork was already on a queue, %true otherwise.  If
 * @delay is zero and @dwork is idle, it will be scheduled for immediate
 * execution.
 */
bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                           struct delayed_work *dwork, unsigned long delay)
{
        unsigned long flags;
        bool claimed;

        /* read the comment in __queue_work() */
        local_irq_save(flags);

        claimed = !test_and_set_bit(WORK_STRUCT_PENDING_BIT,
                                    work_data_bits(&dwork->work)) &&
                  !clear_pending_if_disabled(&dwork->work);
        if (claimed)
                __queue_delayed_work(cpu, wq, dwork, delay);

        local_irq_restore(flags);
        return claimed;
}
EXPORT_SYMBOL(queue_delayed_work_on);

/**
 * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise,
 * modify @dwork's timer so that it expires after @delay.  If @delay is
 * zero, @dwork is guaranteed to be scheduled immediately regardless of its
 * current state.
 *
 * Return: %false if @dwork was idle and queued, %true if @dwork was
 * pending and its timer was modified.
 *
 * This function is safe to call from any context including IRQ handler.
 * See try_to_grab_pending() for details.
 */
bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                         struct delayed_work *dwork, unsigned long delay)
{
        struct work_struct *work = &dwork->work;
        unsigned long flags;
        bool was_pending;

        /* the saved IRQ state comes back in @flags and is restored below */
        was_pending = work_grab_pending(work, WORK_CANCEL_DELAYED, &flags);

        /* requeue with the new delay unless @dwork turned out to be disabled */
        if (!clear_pending_if_disabled(work))
                __queue_delayed_work(cpu, wq, dwork, delay);

        local_irq_restore(flags);
        return was_pending;
}
EXPORT_SYMBOL_GPL(mod_delayed_work_on);

/*
 * RCU callback of a rcu_work: queue the embedded work item on the
 * workqueue that queue_rcu_work() recorded in the rcu_work.
 */
static void rcu_work_rcufn(struct rcu_head *rcu)
{
        struct rcu_work *rw = container_of(rcu, struct rcu_work, rcu);

        /* __queue_work() must run with IRQs off, see the comment in there */
        local_irq_disable();
        __queue_work(WORK_CPU_UNBOUND, rw->wq, &rw->work);
        local_irq_enable();
}

/**
 * queue_rcu_work - queue work after a RCU grace period
 * @wq: workqueue to use
 * @rwork: work to queue
 *
 * Return: %false if @rwork was already pending, %true otherwise.  Note
 * that a full RCU grace period is guaranteed only after a %true return.
 * While @rwork is guaranteed to be executed after a %false return, the
 * execution may happen before a full RCU grace period has passed.
 */
bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* already pending, nothing to do */
        if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return false;

        /*
         * rcu_work can't be canceled or disabled. Warn if the user reached
         * inside @rwork and disabled the inner work.
         */
        if (WARN_ON_ONCE(clear_pending_if_disabled(work)))
                return false;

        rwork->wq = wq;
        call_rcu_hurry(&rwork->rcu, rcu_work_rcufn);
        return true;
}
EXPORT_SYMBOL(queue_rcu_work);

static struct worker *alloc_worker(int node)
{
        struct worker *worker;

        worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
        if (worker) {
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
                INIT_LIST_HEAD(&worker->node);
                /* on creation a worker is in !idle && prep state */
                worker->flags = WORKER_PREP;
        }
        return worker;
}

static cpumask_t *pool_allowed_cpus(struct worker_pool *pool)
{
        if (pool->cpu < 0 && pool->attrs->affn_strict)
                return pool->attrs->__pod_cpumask;
        else
                return pool->attrs->cpumask;
}

/**
 * worker_attach_to_pool() - attach a worker to a pool
 * @worker: worker to be attached
 * @pool: the target pool
 *
 * Attach @worker to @pool.  From then on, the %WORKER_UNBOUND flag and the
 * cpu-binding of @worker are kept coordinated with the pool across
 * cpu-[un]hotplugs.
 */
static void worker_attach_to_pool(struct worker *worker,
                                  struct worker_pool *pool)
{
        mutex_lock(&wq_pool_attach_mutex);

        /*
         * Holding wq_pool_attach_mutex keeps %POOL_DISASSOCIATED stable
         * for the duration of this function. See the comments above the
         * flag definition for details. BH workers are, while per-CPU,
         * always DISASSOCIATED.
         */
        if (!(pool->flags & POOL_DISASSOCIATED)) {
                WARN_ON_ONCE(pool->flags & POOL_BH);
                kthread_set_per_cpu(worker->task, pool->cpu);
        } else {
                worker->flags |= WORKER_UNBOUND;
        }

        /* a rescuer has its affinity switched to that of @pool */
        if (worker->rescue_wq)
                set_cpus_allowed_ptr(worker->task, pool_allowed_cpus(pool));

        list_add_tail(&worker->node, &pool->workers);
        worker->pool = pool;

        mutex_unlock(&wq_pool_attach_mutex);
}

/*
 * Drop @worker's per-CPU kthread binding and widen its allowed CPUs to
 * wq_unbound_cpumask, or to all possible CPUs when none of the unbound
 * CPUs is active.  Caller must hold wq_pool_attach_mutex.
 */
static void unbind_worker(struct worker *worker)
{
        struct task_struct *task = worker->task;

        lockdep_assert_held(&wq_pool_attach_mutex);

        kthread_set_per_cpu(task, -1);

        if (!cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
                WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpu_possible_mask) < 0);
        else
                WARN_ON_ONCE(set_cpus_allowed_ptr(task, wq_unbound_cpumask) < 0);
}


/*
 * Undo @worker's CPU binding and unlink it from the list it was put on
 * via its ->node entry when it was attached to its pool.  Caller must hold
 * wq_pool_attach_mutex.
 */
static void detach_worker(struct worker *worker)
{
        lockdep_assert_held(&wq_pool_attach_mutex);

        unbind_worker(worker);
        list_del(&worker->node);
}

/**
 * worker_detach_from_pool() - detach a worker from its pool
 * @worker: worker which is attached to its pool
 *
 * Undo the attaching which had been done in worker_attach_to_pool().  The
 * calling worker must not access the pool after it is detached unless it
 * holds another reference to the pool.
 */
static void worker_detach_from_pool(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* there is one permanent BH worker per CPU which should never detach */
        WARN_ON_ONCE(pool->flags & POOL_BH);

        /* unbind, unlink and forget the pool, all under the attach mutex */
        mutex_lock(&wq_pool_attach_mutex);
        detach_worker(worker);
        worker->pool = NULL;
        mutex_unlock(&wq_pool_attach_mutex);

        /* clear leftover flags without pool->lock after it is detached */
        worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
}

/*
 * Format the kworker name of @worker into @buf (at most @size bytes) and
 * return what scnprintf() returns:
 *
 *   "kworker/R-<wq name>"       for a rescuer,
 *   "kworker/dying"             when @pool is %NULL,
 *   "kworker/u<pool>:<id>"      for a pool without a fixed CPU,
 *   "kworker/<cpu>:<id>[H]"     for a per-CPU pool, "H" if its nice is < 0.
 */
static int format_worker_id(char *buf, size_t size, struct worker *worker,
                            struct worker_pool *pool)
{
        if (worker->rescue_wq)
                return scnprintf(buf, size, "kworker/R-%s",
                                 worker->rescue_wq->name);

        if (!pool)
                return scnprintf(buf, size, "kworker/dying");

        if (pool->cpu < 0)
                return scnprintf(buf, size, "kworker/u%d:%d",
                                 pool->id, worker->id);

        return scnprintf(buf, size, "kworker/%d:%d%s",
                         pool->cpu, worker->id,
                         pool->attrs->nice < 0  ? "H" : "");
}

/**
 * create_worker - create a new workqueue worker
 * @pool: pool the new worker will belong to
 *
 * Create and start a new worker which is attached to @pool.  For a BH pool
 * (%POOL_BH) no kthread is created and the worker has no task.
 *
 * CONTEXT:
 * Might sleep.  Does GFP_KERNEL allocations.
 *
 * Return:
 * Pointer to the newly created worker, or %NULL if allocating the worker
 * ID, the worker itself or its kthread failed.
 */
static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker;
        int id;

        /* ID is needed to determine kthread name */
        id = ida_alloc(&pool->worker_ida, GFP_KERNEL);
        if (id < 0) {
                pr_err_once("workqueue: Failed to allocate a worker ID: %pe\n",
                            ERR_PTR(id));
                return NULL;
        }

        worker = alloc_worker(pool->node);
        if (!worker) {
                pr_err_once("workqueue: Failed to allocate a worker\n");
                goto fail;
        }

        worker->id = id;

        if (!(pool->flags & POOL_BH)) {
                char id_buf[WORKER_ID_LEN];

                format_worker_id(id_buf, sizeof(id_buf), worker, pool);
                worker->task = kthread_create_on_node(worker_thread, worker,
                                                      pool->node, "%s", id_buf);
                if (IS_ERR(worker->task)) {
                        if (PTR_ERR(worker->task) == -EINTR) {
                                pr_err("workqueue: Interrupted when creating a worker thread \"%s\"\n",
                                       id_buf);
                        } else {
                                /* terminate the line like the other messages here */
                                pr_err_once("workqueue: Failed to create a worker thread: %pe\n",
                                            worker->task);
                        }
                        goto fail;
                }

                set_user_nice(worker->task, pool->attrs->nice);
                kthread_bind_mask(worker->task, pool_allowed_cpus(pool));
        }

        /* successful, attach the worker to the pool */
        worker_attach_to_pool(worker, pool);

        /* start the newly created worker */
        raw_spin_lock_irq(&pool->lock);

        worker->pool->nr_workers++;
        worker_enter_idle(worker);

        /*
         * @worker is waiting on a completion in kthread() and will trigger hung
         * check if not woken up soon. As kick_pool() is noop if @pool is empty,
         * wake it up explicitly.
         */
        if (worker->task)
                wake_up_process(worker->task);

        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        /* @worker may be NULL here; kfree() copes with that */
        ida_free(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}

static void detach_dying_workers(struct list_head *cull_list)
{
        struct worker *worker;

        list_for_each_entry(worker, cull_list, entry)
                detach_worker(worker);
}

static void reap_dying_workers(struct list_head *cull_list)
{
        struct worker *worker, *tmp;

        list_for_each_entry_safe(worker, tmp, cull_list, entry) {
                list_del_init(&worker->entry);
                kthread_stop_put(worker->task);
                kfree(worker);
        }
}

/**
 * set_worker_dying - Tag a worker for destruction
 * @worker: worker to be destroyed
 * @list: transfer worker away from its pool->idle_list and into list
 *
 * Mark @worker with %WORKER_DIE, move it onto @list and adjust the stats
 * of its pool accordingly.  The worker should be idle.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void set_worker_dying(struct worker *worker, struct list_head *list)
{
        struct worker_pool *pool = worker->pool;

        lockdep_assert_held(&pool->lock);
        lockdep_assert_held(&wq_pool_attach_mutex);

        /* sanity check frenzy: bail out unless @worker is truly idle */
        if (WARN_ON(worker->current_work))
                return;
        if (WARN_ON(!list_empty(&worker->scheduled)))
                return;
        if (WARN_ON(!(worker->flags & WORKER_IDLE)))
                return;

        pool->nr_workers--;
        pool->nr_idle--;

        worker->flags |= WORKER_DIE;

        list_move(&worker->entry, list);

        /* get an extra task struct reference for later kthread_stop_put() */
        get_task_struct(worker->task);
}

/**
 * idle_worker_timeout - check if some idle workers can now be deleted.
 * @t: The pool's idle_timer that just expired
 *
 * The timer is armed in worker_enter_idle(). Note that it isn't disarmed in
 * worker_leave_idle(), as a worker flicking between idle and active while its
 * pool is at the too_many_workers() tipping point would cause too much timer
 * housekeeping overhead. Since IDLE_WORKER_TIMEOUT is long enough, we just let
 * it expire and re-evaluate things from there.
 */
static void idle_worker_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, idle_timer);
        bool do_cull = false;

        if (work_pending(&pool->idle_cull_work))
                return;

        raw_spin_lock_irq(&pool->lock);

        if (too_many_workers(pool)) {
                struct worker *worker;
                unsigned long expires;

                /* idle_list is kept in LIFO order, check the last one */
                worker = list_last_entry(&pool->idle_list, struct worker, entry);
                expires = worker->last_active + IDLE_WORKER_TIMEOUT;
                do_cull = !time_before(jiffies, expires);

                if (!do_cull)
                        mod_timer(&pool->idle_timer, expires);
        }
        raw_spin_unlock_irq(&pool->lock);

        if (do_cull)
                queue_work(system_unbound_wq, &pool->idle_cull_work);
}

/**
 * idle_cull_fn - cull workers that have been idle for too long.
 * @work: the pool's work for handling these idle workers
 *
 * Walk the pool's idle workers and get rid of those that have been idle
 * for at least IDLE_WORKER_TIMEOUT.
 *
 * We don't want to disturb isolated CPUs because of a pcpu kworker being
 * culled, so this also resets worker affinity. This requires a sleepable
 * context, hence the split between timer callback and work item.
 */
static void idle_cull_fn(struct work_struct *work)
{
        struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work);
        LIST_HEAD(cull_list);

        /*
         * Grabbing wq_pool_attach_mutex here ensures an already-running worker
         * cannot proceed beyond set_pf_worker() in its self-destruct path.
         * This is required as a previously-preempted worker could run after
         * set_worker_dying() has happened but before detach_dying_workers() did.
         */
        mutex_lock(&wq_pool_attach_mutex);
        raw_spin_lock_irq(&pool->lock);

        while (too_many_workers(pool)) {
                struct worker *oldest;
                unsigned long deadline;

                oldest = list_last_entry(&pool->idle_list, struct worker, entry);
                deadline = oldest->last_active + IDLE_WORKER_TIMEOUT;

                if (!time_before(jiffies, deadline)) {
                        /* idle for long enough, collect it for reaping */
                        set_worker_dying(oldest, &cull_list);
                        continue;
                }

                /* the rest isn't due yet, re-check when the oldest one is */
                mod_timer(&pool->idle_timer, deadline);
                break;
        }

        raw_spin_unlock_irq(&pool->lock);
        detach_dying_workers(&cull_list);
        mutex_unlock(&wq_pool_attach_mutex);

        /* stopping the kthreads happens outside of both locks */
        reap_dying_workers(&cull_list);
}

/*
 * Ask the rescuer of @work's workqueue for help by putting @work's pwq on
 * the workqueue's mayday list, unless the workqueue has no rescuer or the
 * pwq is already on the list.  Caller must hold wq_mayday_lock.
 */
static void send_mayday(struct work_struct *work)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct workqueue_struct *wq = pwq->wq;

        lockdep_assert_held(&wq_mayday_lock);

        /* nobody to call for help */
        if (!wq->rescuer)
                return;

        /* mayday already sent for this pwq */
        if (!list_empty(&pwq->mayday_node))
                return;

        /*
         * mayday mayday mayday
         *
         * If @pwq is for an unbound wq, its base ref may be put at any
         * time due to an attribute change.  Pin @pwq until the rescuer is
         * done with it.
         */
        get_pwq(pwq);
        list_add_tail(&pwq->mayday_node, &wq->maydays);
        wake_up_process(wq->rescuer->task);
        pwq->stats[PWQ_STAT_MAYDAY]++;
}

static void pool_mayday_timeout(struct timer_list *t)
{
        struct worker_pool *pool = from_timer(pool, t, mayday_timer);
        struct work_struct *work;

        raw_spin_lock_irq(&pool->lock);
        raw_spin_lock(&wq_mayday_lock);                /* for wq->maydays */

        if (need_to_create_worker(pool)) {
                /*
                 * We've been trying to create a new worker but
                 * haven't been successful.  We might be hitting an
                 * allocation deadlock.  Send distress signals to
                 * rescuers.
                 */
                list_for_each_entry(work, &pool->worklist, entry)
                        send_mayday(work);
        }

        raw_spin_unlock(&wq_mayday_lock);
        raw_spin_unlock_irq(&pool->lock);

        mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}

/**
 * maybe_create_worker - create a new worker if necessary
 * @pool: pool to create a new worker for
 *
 * Create a new worker for @pool if necessary.  @pool is guaranteed to
 * have at least one idle worker on return from this function.  If
 * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is
 * sent to all rescuers with works scheduled on @pool to resolve
 * possible allocation deadlock.
 *
 * On return, need_to_create_worker() is guaranteed to be %false and
 * may_start_working() %true.
 *
 * LOCKING:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.  Called only from
 * manager.
 */
static void maybe_create_worker(struct worker_pool *pool)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        do {
                raw_spin_unlock_irq(&pool->lock);

                /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */
                mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);

                for (;;) {
                        /* got one, done */
                        if (create_worker(pool))
                                break;

                        /* creation failed but a worker is no longer needed */
                        if (!need_to_create_worker(pool))
                                break;

                        /* back off a little before trying again */
                        schedule_timeout_interruptible(CREATE_COOLDOWN);

                        if (!need_to_create_worker(pool))
                                break;
                }

                timer_delete_sync(&pool->mayday_timer);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * Re-checking is necessary even after a new worker was
                 * just successfully created as @pool->lock was dropped and
                 * the new worker might have already become busy.
                 */
        } while (need_to_create_worker(pool));
}

/**
 * manage_workers - manage worker pool
 * @worker: self
 *
 * Assume the manager role and manage the worker pool @worker belongs
 * to.  At any given time, there can be only zero or one manager per
 * pool.  The exclusion is handled automatically by this function.
 *
 * The caller can safely start processing works on false return.  On
 * true return, it's guaranteed that need_to_create_worker() is false
 * and may_start_working() is true.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.  Does GFP_KERNEL allocations.
 *
 * Return:
 * %false if the pool doesn't need management and the caller can safely
 * start processing works, %true if management function was performed and
 * the conditions that the caller verified before calling the function may
 * no longer be true.
 */
static bool manage_workers(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        /* another worker is already acting as the manager of @pool */
        if (pool->flags & POOL_MANAGER_ACTIVE)
                return false;

        /* take over the manager role */
        pool->flags |= POOL_MANAGER_ACTIVE;
        pool->manager = worker;

        /* may release and regrab pool->lock while creating workers */
        maybe_create_worker(pool);

        /* step down and wake up whoever is waiting on manager_wait */
        pool->manager = NULL;
        pool->flags &= ~POOL_MANAGER_ACTIVE;
        rcuwait_wake_up(&manager_wait);
        return true;
}

/**
 * process_one_work - process single work
 * @worker: self
 * @work: work to process
 *
 * Process @work.  This function contains all the logics necessary to
 * process a single work including synchronization against and
 * interaction with other workers on the same cpu, queueing and
 * flushing.  As long as context requirement is met, any worker can
 * call this function to process a work.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which is released and regrabbed.
 */
static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        unsigned long work_data;
        int lockdep_start_depth, rcu_start_depth;
        /* sampled once up front; decides whether the wq lockdep_map is taken below */
        bool bh_draining = pool->flags & POOL_BH_DRAINING;
#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
         * inside the function that is called from it, so we need to
         * take this into account for lockdep too.  To avoid bogus "held
         * lock freed" warnings as well as problems when looking into
         * work->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
        /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);

        /* claim and dequeue */
        debug_work_deactivate(work);
        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
        /*
         * Only workers backed by a task have a runtime to sample.
         * NOTE(review): task-less workers are presumably the BH ones - confirm.
         */
        if (worker->task)
                worker->current_at = worker->task->se.sum_exec_runtime;
        /*
         * Snapshot the data word now: it is needed by pwq_dec_nr_in_flight()
         * at the very end, by which time @work must no longer be touched.
         */
        work_data = *work_data_bits(work);
        worker->current_color = get_work_color(work_data);

        /*
         * Record wq name for cmdline and debug reporting, may get
         * overridden through set_worker_desc().
         */
        strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

        list_del_init(&work->entry);

        /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
        if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /*
         * Kick @pool if necessary. It's always noop for per-cpu worker pools
         * since nr_running would always be >= 1 at this point. This is used to
         * chain execution of the pending work items for WORKER_NOT_RUNNING
         * workers such as the UNBOUND and CPU_INTENSIVE ones.
         */
        kick_pool(pool);

        /*
         * Record the last pool and clear PENDING which should be the last
         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
        set_work_pool_and_clear_pending(work, pool->id, pool_offq_flags(pool));

        pwq->stats[PWQ_STAT_STARTED]++;
        raw_spin_unlock_irq(&pool->lock);

        /*
         * Baseline depths; compared against the post-callback values below
         * to detect a work function that leaked a lock or an RCU read section.
         */
        rcu_start_depth = rcu_preempt_depth();
        lockdep_start_depth = lockdep_depth(current);
        /* see drain_dead_softirq_workfn() */
        if (!bh_draining)
                lock_map_acquire(pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
         * any locks, that is, before these two lock_map_acquire()'s.
         *
         * However, that would result in:
         *
         *   A(W1)
         *   WFC(C)
         *                A(W1)
         *                C(C)
         *
         * Which would create W1->C->W1 dependencies, even though there is no
         * actual deadlock possible. There are two solutions, using a
         * read-recursive acquire on the work(queue) 'locks', but this will then
         * hit the lockdep limitation on recursive locks, or simply discard
         * these locks.
         *
         * AFAICT there is no possible deadlock scenario between the
         * flush_work() and complete() primitives (except for single-threaded
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);
        /* run the callback; it was latched in current_func while locked above */
        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);
        pwq->stats[PWQ_STAT_COMPLETED]++;
        /* release in reverse order of the acquires above */
        lock_map_release(&lockdep_map);
        if (!bh_draining)
                lock_map_release(pwq->wq->lockdep_map);

        /*
         * Complain loudly if the work function returned in atomic context
         * (checked only for task-backed workers) or with a different lockdep
         * or RCU nesting depth than it started with.
         */
        if (unlikely((worker->task && in_atomic()) ||
                     lockdep_depth(current) != lockdep_start_depth ||
                     rcu_preempt_depth() != rcu_start_depth)) {
                pr_err("BUG: workqueue leaked atomic, lock or RCU: %s[%d]\n"
                       "     preempt=0x%08x lock=%d->%d RCU=%d->%d workfn=%ps\n",
                       current->comm, task_pid_nr(current), preempt_count(),
                       lockdep_start_depth, lockdep_depth(current),
                       rcu_start_depth, rcu_preempt_depth(),
                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }

        /*
         * The following prevents a kworker from hogging CPU on !PREEMPTION
         * kernels, where a requeueing work item waiting for something to
         * happen could deadlock with stop_machine as such work item could
         * indefinitely requeue itself while all other CPUs are trapped in
         * stop_machine. At the same time, report a quiescent RCU state so
         * the same condition doesn't freeze RCU.
         */
        if (worker->task)
                cond_resched();

        raw_spin_lock_irq(&pool->lock);

        /*
         * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
         * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
         * wq_cpu_intensive_thresh_us. Clear it.
         */
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;

        /* we're done with it, release */
        hash_del(&worker->hentry);
        worker->current_work = NULL;
        worker->current_func = NULL;
        worker->current_pwq = NULL;
        /* NOTE(review): INT_MAX presumably serves as a "no color" sentinel - confirm */
        worker->current_color = INT_MAX;

        /* must be the last step, see the function comment */
        pwq_dec_nr_in_flight(pwq, work_data);
}

/**
 * process_scheduled_works - process scheduled works
 * @worker: self
 *
 * Process all scheduled works.  Please note that the scheduled list
 * may change while processing a work, so this function repeatedly
 * fetches a work from the top and executes it.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock) which may be released and regrabbed
 * multiple times.
 */
static void process_scheduled_works(struct worker *worker)
{
        bool stamped = false;

        for (;;) {
                struct work_struct *next;

                /* always re-read the head: the list may change while a work runs */
                next = list_first_entry_or_null(&worker->scheduled,
                                                struct work_struct, entry);
                if (!next)
                        break;

                /* refresh the pool's watchdog timestamp once, before the first item */
                if (!stamped) {
                        worker->pool->watchdog_ts = jiffies;
                        stamped = true;
                }

                process_one_work(worker, next);
        }
}

/*
 * Set (@val true) or clear (@val false) PF_WQ_WORKER on %current.  The flag
 * update is done with wq_pool_attach_mutex held.
 */
static void set_pf_worker(bool val)
{
        mutex_lock(&wq_pool_attach_mutex);

        if (!val)
                current->flags &= ~PF_WQ_WORKER;
        else
                current->flags |= PF_WQ_WORKER;

        mutex_unlock(&wq_pool_attach_mutex);
}

/**
 * worker_thread - the worker thread function
 * @__worker: self
 *
 * The worker thread function.  All workers belong to a worker_pool -
 * either a per-cpu one or dynamic unbound one.  These workers process all
 * work items regardless of their specific target workqueue.  The only
 * exception is work items which belong to workqueues with a rescuer which
 * will be explained in rescuer_thread().
 *
 * Return: 0
 */
static int worker_thread(void *__worker)
{
        struct worker *worker = __worker;
        struct worker_pool *pool = worker->pool;

        /*
         * Overall flow, repeated forever until WORKER_DIE is seen:
         *
         *   woke_up: take pool->lock, exit if told to die, leave idle
         *   recheck: sleep if no more workers are needed, manage if required
         *            process work items while keep_working() says so
         *   sleep:   enter idle, drop pool->lock, schedule()
         */

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
woke_up:
        raw_spin_lock_irq(&pool->lock);

        /* am I supposed to die? */
        if (unlikely(worker->flags & WORKER_DIE)) {
                raw_spin_unlock_irq(&pool->lock);
                set_pf_worker(false);
                /*
                 * The worker is dead and PF_WQ_WORKER is cleared, worker->pool
                 * shouldn't be accessed, reset it to NULL in case otherwise.
                 */
                worker->pool = NULL;
                /* the local @pool copy is still valid for releasing the id */
                ida_free(&pool->worker_ida, worker->id);
                return 0;
        }

        worker_leave_idle(worker);
recheck:
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /*
         * do we need to manage?  manage_workers() returning %true means the
         * conditions checked above may no longer hold, so start over.
         */
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;

        /*
         * ->scheduled list can only be filled while a worker is
         * preparing to process a work or actually processing it.
         * Make sure nobody diddled with it while I was sleeping.
         */
        WARN_ON_ONCE(!list_empty(&worker->scheduled));

        /*
         * Finish PREP stage.  We're guaranteed to have at least one idle
         * worker or that someone else has already assumed the manager
         * role.  This is where @worker starts participating in concurrency
         * management if applicable and concurrency management is restored
         * after being rebound.  See rebind_workers() for details.
         */
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        /*
         * Take the head of pool->worklist, and if assign_work() hands it to
         * this worker, run everything that ended up on ->scheduled.
         * NOTE(review): list_first_entry() requires a non-empty list; this is
         * presumably guaranteed by need_more_worker()/keep_working() - confirm.
         */
        do {
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                if (assign_work(work, worker, NULL))
                        process_scheduled_works(worker);
        } while (keep_working(pool));

        /* back to PREP before going idle */
        worker_set_flags(worker, WORKER_PREP);
sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
         * pool->lock or from local cpu, so setting the current state
         * before releasing pool->lock is enough to prevent losing any
         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_IDLE);
        raw_spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
}

/**
 * rescuer_thread - the rescuer thread function
 * @__rescuer: self
 *
 * Workqueue rescuer thread function.  There's one rescuer for each
 * workqueue which has WQ_MEM_RECLAIM set.
 *
 * Regular work processing on a pool may block trying to create a new
 * worker which uses GFP_KERNEL allocation which has slight chance of
 * developing into deadlock if some works currently on the same queue
 * need to be processed to satisfy the GFP_KERNEL allocation.  This is
 * the problem rescuer solves.
 *
 * When such condition is possible, the pool summons rescuers of all
 * workqueues which have works queued on the pool and let them process
 * those works so that forward progress can be guaranteed.
 *
 * This should happen rarely.
 *
 * Return: 0
 */
static int rescuer_thread(void *__rescuer)
{
        struct worker *rescuer = __rescuer;
        struct workqueue_struct *wq = rescuer->rescue_wq;
        bool should_stop;

        set_user_nice(current, RESCUER_NICE_LEVEL);

        /*
         * Mark rescuer as worker too.  As WORKER_PREP is never cleared, it
         * doesn't participate in concurrency management.
         */
        set_pf_worker(true);
repeat:
        /*
         * The task state is set to TASK_IDLE before should_stop and the
         * mayday list are sampled; it is switched back to TASK_RUNNING as
         * soon as there is a pwq to serve or the thread is about to exit.
         */
        set_current_state(TASK_IDLE);

        /*
         * By the time the rescuer is requested to stop, the workqueue
         * shouldn't have any work pending, but @wq->maydays may still have
         * pwq(s) queued.  This can happen by non-rescuer workers consuming
         * all the work items before the rescuer got to them.  Go through
         * @wq->maydays processing before acting on should_stop so that the
         * list is always empty on exit.
         */
        should_stop = kthread_should_stop();

        /* see whether any pwq is asking for help */
        raw_spin_lock_irq(&wq_mayday_lock);

        while (!list_empty(&wq->maydays)) {
                struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
                                        struct pool_workqueue, mayday_node);
                struct worker_pool *pool = pwq->pool;
                struct work_struct *work, *n;

                __set_current_state(TASK_RUNNING);
                /*
                 * Take @pwq off the mayday list.  The node is re-initialized
                 * so that the list_empty(&pwq->mayday_node) test below can
                 * tell whether it has been queued again.
                 */
                list_del_init(&pwq->mayday_node);

                /* wq_mayday_lock is dropped for the whole rescue of this pwq */
                raw_spin_unlock_irq(&wq_mayday_lock);

                worker_attach_to_pool(rescuer, pool);

                raw_spin_lock_irq(&pool->lock);

                /*
                 * Slurp in all works issued via this workqueue and
                 * process'em.
                 */
                WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
                list_for_each_entry_safe(work, n, &pool->worklist, entry) {
                        /* only items of @pwq are taken; count each one rescued */
                        if (get_work_pwq(work) == pwq &&
                            assign_work(work, rescuer, &n))
                                pwq->stats[PWQ_STAT_RESCUED]++;
                }

                if (!list_empty(&rescuer->scheduled)) {
                        process_scheduled_works(rescuer);

                        /*
                         * The above execution of rescued work items could
                         * have created more to rescue through
                         * pwq_activate_first_inactive() or chained
                         * queueing.  Let's put @pwq back on mayday list so
                         * that such back-to-back work items, which may be
                         * being used to relieve memory pressure, don't
                         * incur MAYDAY_INTERVAL delay in between.
                         */
                        if (pwq->nr_active && need_to_create_worker(pool)) {
                                /*
                                 * Plain raw_spin_lock(): IRQs are already off
                                 * via raw_spin_lock_irq(&pool->lock) above.
                                 */
                                raw_spin_lock(&wq_mayday_lock);
                                /*
                                 * Queue iff we aren't racing destruction
                                 * and somebody else hasn't queued it already.
                                 */
                                if (wq->rescuer && list_empty(&pwq->mayday_node)) {
                                        /*
                                         * Reference for the re-queued entry;
                                         * every pass of this loop drops one
                                         * via put_pwq_unlocked().
                                         */
                                        get_pwq(pwq);
                                        list_add_tail(&pwq->mayday_node, &wq->maydays);
                                }
                                raw_spin_unlock(&wq_mayday_lock);
                        }
                }

                /*
                 * Leave this pool. Notify regular workers; otherwise, we end up
                 * with 0 concurrency and stalling the execution.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                worker_detach_from_pool(rescuer);

                /*
                 * Put the reference grabbed by send_mayday().  @pool might
                 * go away any time after it.
                 */
                put_pwq_unlocked(pwq);

                /* retake the lock for the next list_empty() check */
                raw_spin_lock_irq(&wq_mayday_lock);
        }

        raw_spin_unlock_irq(&wq_mayday_lock);

        /* acted upon only now, after the mayday list has been emptied */
        if (should_stop) {
                __set_current_state(TASK_RUNNING);
                set_pf_worker(false);
                return 0;
        }

        /* rescuers should never participate in concurrency management */
        WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
        schedule();
        goto repeat;
}

/*
 * Run pending work items of @worker's pool, bounded by both a restart count
 * (BH_WORKER_RESTARTS) and a time budget (BH_WORKER_JIFFIES).  Takes and
 * releases pool->lock itself.
 */
static void bh_worker(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;
        int restarts_left = BH_WORKER_RESTARTS;
        unsigned long deadline = jiffies + BH_WORKER_JIFFIES;

        raw_spin_lock_irq(&pool->lock);
        worker_leave_idle(worker);

        /*
         * The steps below mirror worker_thread(); refer to it for the
         * reasoning behind each of them.
         */
        if (need_more_worker(pool)) {
                WARN_ON_ONCE(!list_empty(&worker->scheduled));
                worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

                for (;;) {
                        struct work_struct *work =
                                list_first_entry(&pool->worklist,
                                                 struct work_struct, entry);

                        if (assign_work(work, worker, NULL))
                                process_scheduled_works(worker);

                        /* stop once keep_working() says there is no need to go on */
                        if (!keep_working(pool))
                                break;
                        /* ... or the restart budget has been used up */
                        if (!--restarts_left)
                                break;
                        /* ... or the time budget has expired */
                        if (!time_before(jiffies, deadline))
                                break;
                }

                worker_set_flags(worker, WORKER_PREP);
        }

        worker_enter_idle(worker);
        kick_pool(pool);
        raw_spin_unlock_irq(&pool->lock);
}

/*
 * TODO: Convert all tasklet users to workqueue and use softirq directly.
 *
 * This is currently called from tasklet[_hi]action() and thus is also called
 * whenever there are tasklets to run. Let's do an early exit if there's nothing
 * queued. Once conversion from tasklet is complete, the need_more_worker() test
 * can be dropped.
 *
 * After full conversion, we'll add worker->softirq_action, directly use the
 * softirq action and obtain the worker pointer from the softirq_action pointer.
 */
void workqueue_softirq_action(bool highpri)
{
        struct worker_pool *pool =
                &per_cpu(bh_worker_pools, smp_processor_id())[highpri];
        struct worker *bh;

        /* early exit: see the comment above about the need_more_worker() test */
        if (!need_more_worker(pool))
                return;

        /* the first worker on the pool's ->workers list does the processing */
        bh = list_first_entry(&pool->workers, struct worker, node);
        bh_worker(bh);
}

/*
 * On-stack request used by workqueue_softirq_dead() to have the BH work
 * items of a dead CPU's pool drained by drain_dead_softirq_workfn().
 */
struct wq_drain_dead_softirq_work {
        struct work_struct        work;         /* runs drain_dead_softirq_workfn() */
        struct worker_pool        *pool;        /* the pool to drain */
        struct completion        done;          /* completed once @pool needs no more work */
};

static void drain_dead_softirq_workfn(struct work_struct *work)
{
        struct wq_drain_dead_softirq_work *dead_work =
                container_of(work, struct wq_drain_dead_softirq_work, work);
        struct worker_pool *pool = dead_work->pool;
        struct workqueue_struct *requeue_wq;
        bool more_pending;

        /*
         * The CPU owning @pool is dead, so its still pending work items are
         * executed from this BH work item, which runs on a different CPU.
         * @pool can't be kicked because its CPU is gone, and since the work
         * execution path ends up nested, a lockdep annotation has to be
         * suppressed. %POOL_BH_DRAINING marks @pool for this special
         * treatment.
         */
        raw_spin_lock_irq(&pool->lock);
        pool->flags |= POOL_BH_DRAINING;
        raw_spin_unlock_irq(&pool->lock);

        bh_worker(list_first_entry(&pool->workers, struct worker, node));

        raw_spin_lock_irq(&pool->lock);
        pool->flags &= ~POOL_BH_DRAINING;
        more_pending = need_more_worker(pool);
        raw_spin_unlock_irq(&pool->lock);

        /* fully drained: let the waiter go */
        if (!more_pending) {
                complete(&dead_work->done);
                return;
        }

        /*
         * bh_worker() may have bailed out on its consecutive execution
         * limit. Work items are still pending, so requeue ourselves and
         * return rather than hogging this CPU's BH.
         */
        if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
                requeue_wq = system_bh_highpri_wq;
        else
                requeue_wq = system_bh_wq;

        queue_work(requeue_wq, work);
}

/*
 * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
 * possible to allocate dead_work per CPU and avoid flushing. However, then we
 * have to worry about draining overlapping with CPU coming back online or
 * nesting (one CPU's dead_work queued on another CPU which is also dead and so
 * on). Let's keep it simple and drain them synchronously. These are BH work
 * items which shouldn't be requeued on the same pool. Shouldn't take long.
 */
void workqueue_softirq_dead(unsigned int cpu)
{
        int idx;

        for (idx = 0; idx < NR_STD_WORKER_POOLS; idx++) {
                struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[idx];
                struct wq_drain_dead_softirq_work dead_work;
                struct workqueue_struct *drain_wq;

                /* skip pools that need_more_worker() reports as not needing service */
                if (!need_more_worker(pool))
                        continue;

                INIT_WORK_ONSTACK(&dead_work.work, drain_dead_softirq_workfn);
                dead_work.pool = pool;
                init_completion(&dead_work.done);

                /* drain on the BH workqueue matching the dead pool's priority */
                drain_wq = pool->attrs->nice == HIGHPRI_NICE_LEVEL ?
                           system_bh_highpri_wq : system_bh_wq;
                queue_work(drain_wq, &dead_work.work);

                /* synchronous: dead_work lives on this stack frame */
                wait_for_completion(&dead_work.done);
                destroy_work_on_stack(&dead_work.work);
        }
}

/**
 * check_flush_dependency - check for flush dependency sanity
 * @target_wq: workqueue being flushed
 * @target_work: work item being flushed (NULL for workqueue flushes)
 * @from_cancel: are we called from the work cancel path
 *
 * %current is trying to flush the whole @target_wq or @target_work on it.
 * If this is not the cancel path (which implies work being flushed is either
 * already running, or will not be at all), check if @target_wq doesn't have
 * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running
 * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward-
 * progress guarantee leading to a deadlock.
 */
static void check_flush_dependency(struct workqueue_struct *target_wq,
                                   struct work_struct *target_work,
                                   bool from_cancel)
{
        work_func_t target_func = NULL;
        struct worker *worker;

        /* the cancel path is exempt from the check */
        if (from_cancel)
                return;

        /* nothing to verify when the target itself has WQ_MEM_RECLAIM */
        if (target_wq->flags & WQ_MEM_RECLAIM)
                return;

        worker = current_wq_worker();

        /* workqueue-wide flushes pass a NULL @target_work */
        if (target_work)
                target_func = target_work->func;

        /* a task reclaiming memory must not wait on a !WQ_MEM_RECLAIM workqueue */
        WARN_ONCE(current->flags & PF_MEMALLOC,
                  "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
                  current->pid, current->comm, target_wq->name, target_func);

        /* nor must a work item running on a (non-legacy) WQ_MEM_RECLAIM workqueue */
        WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
                              (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
                  "workqueue: WQ_MEM_RECLAIM %s:%ps is flushing !WQ_MEM_RECLAIM %s:%ps",
                  worker->current_pwq->wq->name, worker->current_func,
                  target_wq->name, target_func);
}

/*
 * Barrier work item inserted by insert_wq_barrier(); its work function,
 * wq_barrier_func(), completes @done.
 */
struct wq_barrier {
        struct work_struct        work;         /* runs wq_barrier_func() */
        struct completion        done;          /* completed by wq_barrier_func() */
        struct task_struct        *task;        /* purely informational */
};

/* Work function of a wq_barrier: signal the completion embedded next to @work. */
static void wq_barrier_func(struct work_struct *work)
{
        complete(&container_of(work, struct wq_barrier, work)->done);
}

/**
 * insert_wq_barrier - insert a barrier work
 * @pwq: pwq to insert barrier into
 * @barr: wq_barrier to insert
 * @target: target work to attach @barr to
 * @worker: worker currently executing @target, NULL if @target is not executing
 *
 * @barr is linked to @target such that @barr is completed only after
 * @target finishes execution.  Please note that the ordering
 * guarantee is observed only with respect to @target and on the local
 * cpu.
 *
 * Currently, a queued barrier can't be canceled.  This is because
 * try_to_grab_pending() can't determine whether the work to be
 * grabbed is at the head of the queue and thus can't clear LINKED
 * flag of the previous work while there must be a valid next work
 * after a work with LINKED flag set.
 *
 * Note that when @worker is non-NULL, @target may be modified
 * underneath us, so we can't reliably determine pwq from @target.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock).
 */
static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
{
        /* one static lockdep class per flavor, shared by all barriers */
        static __maybe_unused struct lock_class_key bh_key, thr_key;
        unsigned int work_flags = 0;
        unsigned int work_color;
        struct list_head *head;

        /*
         * debugobject calls are safe here even with pool->lock locked
         * as we know for sure that this will not trigger any of the
         * checks and call back into the fixup functions where we
         * might deadlock.
         *
         * BH and threaded workqueues need separate lockdep keys to avoid
         * spuriously triggering "inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W}
         * usage".
         */
        INIT_WORK_ONSTACK_KEY(&barr->work, wq_barrier_func,
                              (pwq->wq->flags & WQ_BH) ? &bh_key : &thr_key);
        /*
         * PENDING is set by hand (non-atomic __set_bit on a freshly
         * initialized work item) since the barrier is inserted directly with
         * insert_work() below.
         */
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

        /* the completion shares @target's lockdep map */
        init_completion_map(&barr->done, &target->lockdep_map);

        barr->task = current;

        /* The barrier work item does not participate in nr_active. */
        work_flags |= WORK_STRUCT_INACTIVE;

        /*
         * If @target is currently being executed, schedule the
         * barrier to the worker; otherwise, put it after @target.
         */
        if (worker) {
                head = worker->scheduled.next;
                /* @target may have been modified; use the color the worker recorded */
                work_color = worker->current_color;
        } else {
                unsigned long *bits = work_data_bits(target);

                head = target->entry.next;
                /* there can already be other linked works, inherit and set */
                work_flags |= *bits & WORK_STRUCT_LINKED;
                work_color = get_work_color(*bits);
                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
        }

        /* account the barrier under the color selected above */
        pwq->nr_in_flight[work_color]++;
        work_flags |= work_color_to_flags(work_color);

        insert_work(pwq, &barr->work, head, work_flags);
}

/**
 * flush_workqueue_prep_pwqs - prepare pwqs for workqueue flushing
 * @wq: workqueue being flushed
 * @flush_color: new flush color, < 0 for no-op
 * @work_color: new work color, < 0 for no-op
 *
 * Prepare pwqs for workqueue flushing.
 *
 * If @flush_color is non-negative, flush_color on all pwqs should be
 * -1.  If no pwq has in-flight work items at the specified color, all
 * pwq->flush_color's stay at -1 and %false is returned.  If any pwq
 * has in-flight work items, its pwq->flush_color is set to
 * @flush_color, @wq->nr_pwqs_to_flush is updated accordingly, pwq
 * wakeup logic is armed and %true is returned.
 *
 * The caller should have initialized @wq->first_flusher prior to
 * calling this function with non-negative @flush_color.  If
 * @flush_color is negative, no flush color update is done and %false
 * is returned.
 *
 * If @work_color is non-negative, all pwqs should have the same
 * work_color which is previous to @work_color and all will be
 * advanced to @work_color.
 *
 * CONTEXT:
 * mutex_lock(wq->mutex).
 *
 * Return:
 * %true if @flush_color >= 0 and there's something to flush.  %false
 * otherwise.
 */
static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
                                      int flush_color, int work_color)
{
        bool wait = false;
        struct pool_workqueue *pwq;
        /* pool whose lock is currently held by the loop below, if any */
        struct worker_pool *current_pool = NULL;

        if (flush_color >= 0) {
                WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
                /*
                 * Start the count at 1 rather than 0; this extra count is
                 * dropped by the atomic_dec_and_test() at the end of the
                 * function.  NOTE(review): presumably a bias so the count
                 * can't reach zero while pwqs are still being added - confirm.
                 */
                atomic_set(&wq->nr_pwqs_to_flush, 1);
        }

        /*
         * For unbound workqueue, pwqs will map to only a few pools.
         * Most of the time, pwqs within the same pool will be linked
         * sequentially to wq->pwqs by cpu index. So in the majority
         * of pwq iters, the pool is the same, only doing lock/unlock
         * if the pool has changed. This can largely reduce expensive
         * lock operations.
         */
        for_each_pwq(pwq, wq) {
                /* switch locks only when this pwq lives on a different pool */
                if (current_pool != pwq->pool) {
                        if (likely(current_pool))
                                raw_spin_unlock_irq(&current_pool->lock);
                        current_pool = pwq->pool;
                        raw_spin_lock_irq(&current_pool->lock);
                }

                if (flush_color >= 0) {
                        WARN_ON_ONCE(pwq->flush_color != -1);

                        /* only pwqs with something in flight at this color are waited for */
                        if (pwq->nr_in_flight[flush_color]) {
                                pwq->flush_color = flush_color;
                                atomic_inc(&wq->nr_pwqs_to_flush);
                                wait = true;
                        }
                }

                if (work_color >= 0) {
                        WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
                        pwq->work_color = work_color;
                }

        }

        /* drop the lock of the last pool visited (none if there were no pwqs) */
        if (current_pool)
                raw_spin_unlock_irq(&current_pool->lock);

        /*
         * Drop the initial count of 1; if that brings the counter to zero,
         * complete the first flusher right here.
         */
        if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush))
                complete(&wq->first_flusher->done);

        return wait;
}

/*
 * Acquire and immediately release @wq's lockdep map, with BH disabled around
 * it for %WQ_BH workqueues.  Does nothing without CONFIG_LOCKDEP or when @wq
 * has no lockdep map.
 */
static void touch_wq_lockdep_map(struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        bool is_bh;

        if (unlikely(!wq->lockdep_map))
                return;

        is_bh = wq->flags & WQ_BH;

        if (is_bh)
                local_bh_disable();

        lock_map_acquire(wq->lockdep_map);
        lock_map_release(wq->lockdep_map);

        if (is_bh)
                local_bh_enable();
#endif
}

/*
 * Acquire and immediately release @work's lockdep map, with BH disabled
 * around it when @wq is a %WQ_BH workqueue.  Does nothing without
 * CONFIG_LOCKDEP.
 */
static void touch_work_lockdep_map(struct work_struct *work,
                                   struct workqueue_struct *wq)
{
#ifdef CONFIG_LOCKDEP
        const bool is_bh = wq->flags & WQ_BH;

        if (is_bh)
                local_bh_disable();

        lock_map_acquire(&work->lockdep_map);
        lock_map_release(&work->lockdep_map);

        if (is_bh)
                local_bh_enable();
#endif
}

/**
 * __flush_workqueue - ensure that any scheduled work has run to completion.
 * @wq: workqueue to flush
 *
 * This function sleeps until all work items which were queued on entry
 * have finished execution, but it is not livelocked by new incoming ones.
 *
 * The function works in two phases.  In the start-to-wait phase the caller
 * either takes the current work color as its flush color (becoming the
 * first flusher or joining @wq->flusher_queue) or, when the color space is
 * exhausted, parks itself on @wq->flusher_overflow.  After its completion
 * fires, the first flusher runs the wake-up-and-cascade phase: it completes
 * queued flushers sharing the finished color, advances @wq->flush_color,
 * moves overflowed flushers onto the queue and hands the first-flusher role
 * to the next waiter.
 *
 * Context: takes @wq->mutex and blocks in wait_for_completion().  Returns
 * immediately (with a warning) while wq_online is false.
 */
void __flush_workqueue(struct workqueue_struct *wq)
{
        /* on-stack flusher record; flush_color -1 means "not assigned yet" */
        struct wq_flusher this_flusher = {
                .list = LIST_HEAD_INIT(this_flusher.list),
                .flush_color = -1,
                .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, (*wq->lockdep_map)),
        };
        int next_color;

        if (WARN_ON(!wq_online))
                return;

        touch_wq_lockdep_map(wq);

        mutex_lock(&wq->mutex);

        /*
         * Start-to-wait phase
         */
        next_color = work_next_color(wq->work_color);

        if (next_color != wq->flush_color) {
                /*
                 * Color space is not full.  The current work_color
                 * becomes our flush_color and work_color is advanced
                 * by one.
                 */
                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
                this_flusher.flush_color = wq->work_color;
                wq->work_color = next_color;

                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
                        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

                        wq->first_flusher = &this_flusher;

                        if (!flush_workqueue_prep_pwqs(wq, wq->flush_color,
                                                       wq->work_color)) {
                                /* nothing to flush, done */
                                wq->flush_color = next_color;
                                wq->first_flusher = NULL;
                                goto out_unlock;
                        }
                } else {
                        /* wait in queue */
                        WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
                        list_add_tail(&this_flusher.list, &wq->flusher_queue);
                        /* only advance the pwqs' work color; no flush color is armed */
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }
        } else {
                /*
                 * Oops, color space is full, wait on overflow queue.
                 * The next flush completion will assign us
                 * flush_color and transfer to flusher_queue.
                 */
                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
        }

        check_flush_dependency(wq, NULL, false);

        mutex_unlock(&wq->mutex);

        /* sleep until this flusher's completion is signalled */
        wait_for_completion(&this_flusher.done);

        /*
         * Wake-up-and-cascade phase
         *
         * First flushers are responsible for cascading flushes and
         * handling overflow.  Non-first flushers can simply return.
         */
        /* lockless peek; the result is re-validated under wq->mutex below */
        if (READ_ONCE(wq->first_flusher) != &this_flusher)
                return;

        mutex_lock(&wq->mutex);

        /* we might have raced, check again with mutex held */
        if (wq->first_flusher != &this_flusher)
                goto out_unlock;

        WRITE_ONCE(wq->first_flusher, NULL);

        WARN_ON_ONCE(!list_empty(&this_flusher.list));
        WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);

        while (true) {
                struct wq_flusher *next, *tmp;

                /* complete all the flushers sharing the current flush color */
                list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
                        if (next->flush_color != wq->flush_color)
                                break;
                        list_del_init(&next->list);
                        complete(&next->done);
                }

                WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
                             wq->flush_color != work_next_color(wq->work_color));

                /* this flush_color is finished, advance by one */
                wq->flush_color = work_next_color(wq->flush_color);

                /* one color has been freed, handle overflow queue */
                if (!list_empty(&wq->flusher_overflow)) {
                        /*
                         * Assign the same color to all overflowed
                         * flushers, advance work_color and append to
                         * flusher_queue.  This is the start-to-wait
                         * phase for these overflowed flushers.
                         */
                        list_for_each_entry(tmp, &wq->flusher_overflow, list)
                                tmp->flush_color = wq->work_color;

                        wq->work_color = work_next_color(wq->work_color);

                        list_splice_tail_init(&wq->flusher_overflow,
                                              &wq->flusher_queue);
                        flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
                }

                /* no waiter left: flush and work colors must have converged */
                if (list_empty(&wq->flusher_queue)) {
                        WARN_ON_ONCE(wq->flush_color != wq->work_color);
                        break;
                }

                /*
                 * Need to flush more colors.  Make the next flusher
                 * the new first flusher and arm pwqs.
                 */
                WARN_ON_ONCE(wq->flush_color == wq->work_color);
                WARN_ON_ONCE(wq->flush_color != next->flush_color);

                list_del_init(&next->list);
                wq->first_flusher = next;

                /* a true return means there is work to wait for; the new first flusher takes over */
                if (flush_workqueue_prep_pwqs(wq, wq->flush_color, -1))
                        break;

                /*
                 * Meh... this color is already done, clear first
                 * flusher and repeat cascading.
                 */
                wq->first_flusher = NULL;
        }

out_unlock:
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL(__flush_workqueue);

/**
 * drain_workqueue - drain a workqueue
 * @wq: workqueue to drain
 *
 * Wait until the workqueue becomes empty.  While draining is in progress,
 * only chain queueing is allowed.  IOW, only currently pending or running
 * work items on @wq can queue further work items on it.  @wq is flushed
 * repeatedly until it becomes empty.  The number of flushing is determined
 * by the depth of chaining and should be relatively short.  Whine if it
 * takes too long.
 *
 * Drainers are counted in @wq->nr_drainers; %__WQ_DRAINING is set by the
 * first drainer and cleared by the last one, both under @wq->mutex.
 */
void drain_workqueue(struct workqueue_struct *wq)
{
        unsigned int flush_cnt = 0;
        struct pool_workqueue *pwq;

        /*
         * __queue_work() needs to test whether there are drainers, is much
         * hotter than drain_workqueue() and already looks at @wq->flags.
         * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
         */
        mutex_lock(&wq->mutex);
        if (!wq->nr_drainers++)
                wq->flags |= __WQ_DRAINING;
        mutex_unlock(&wq->mutex);
reflush:
        /* called without wq->mutex held; __flush_workqueue() takes it itself */
        __flush_workqueue(wq);

        mutex_lock(&wq->mutex);

        for_each_pwq(pwq, wq) {
                bool drained;

                /* emptiness is sampled under the owning pool's lock */
                raw_spin_lock_irq(&pwq->pool->lock);
                drained = pwq_is_empty(pwq);
                raw_spin_unlock_irq(&pwq->pool->lock);

                if (drained)
                        continue;

                /* warn on the 10th retry and on every 100th retry up to 1000 */
                if (++flush_cnt == 10 ||
                    (flush_cnt % 100 == 0 && flush_cnt <= 1000))
                        pr_warn("workqueue %s: %s() isn't complete after %u tries\n",
                                wq->name, __func__, flush_cnt);

                /* a pwq still has work: drop the mutex and flush again */
                mutex_unlock(&wq->mutex);
                goto reflush;
        }

        /* every pwq is empty; the last drainer clears the draining flag */
        if (!--wq->nr_drainers)
                wq->flags &= ~__WQ_DRAINING;
        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);

/*
 * Insert the barrier @barr for @work so that the caller can wait on it.
 *
 * @work:        work item to flush
 * @barr:        caller-provided barrier handed to insert_wq_barrier()
 * @from_cancel: forwarded to check_flush_dependency(); when true the
 *               additional workqueue lockdep touch at the end is skipped
 *
 * Returns %true if a barrier was inserted.  Returns %false without touching
 * @barr when get_work_pool() finds no pool for @work, when @work's pwq
 * belongs to a different pool, or when @work has no pwq and
 * find_worker_executing_work() finds no worker running it.
 *
 * The lookup and barrier insertion run inside an RCU read-side section with
 * the pool lock held and IRQs disabled.
 */
static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
{
        struct worker *worker = NULL;
        struct worker_pool *pool;
        struct pool_workqueue *pwq;
        struct workqueue_struct *wq;

        rcu_read_lock();
        pool = get_work_pool(work);
        if (!pool) {
                rcu_read_unlock();
                return false;
        }

        raw_spin_lock_irq(&pool->lock);
        /* see the comment in try_to_grab_pending() with the same code */
        pwq = get_work_pwq(work);
        if (pwq) {
                if (unlikely(pwq->pool != pool))
                        goto already_gone;
        } else {
                /* no pwq recorded: fall back to the worker executing @work, if any */
                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto already_gone;
                pwq = worker->current_pwq;
        }

        wq = pwq->wq;
        check_flush_dependency(wq, work, from_cancel);

        /* @worker is NULL here unless @work was found executing above */
        insert_wq_barrier(pwq, barr, work, worker);
        raw_spin_unlock_irq(&pool->lock);

        touch_work_lockdep_map(work, wq);

        /*
         * Force a lock recursion deadlock when using flush_work() inside a
         * single-threaded or rescuer equipped workqueue.
         *
         * For single threaded workqueues the deadlock happens when the work
         * is after the work issuing the flush_work(). For rescuer equipped
         * workqueues the deadlock happens when the rescuer stalls, blocking
         * forward progress.
         */
        if (!from_cancel && (wq->saved_max_active == 1 || wq->rescuer))
                touch_wq_lockdep_map(wq);

        rcu_read_unlock();
        return true;
already_gone:
        /* nothing to wait for: release the pool lock and the RCU read section */
        raw_spin_unlock_irq(&pool->lock);
        rcu_read_unlock();
        return false;
}

/*
 * Wait for @work to finish by inserting an on-stack barrier behind it and
 * waiting for that barrier to complete.
 *
 * @work:        work item to flush
 * @from_cancel: %true when called from the cancel path; enables the
 *               busy-wait variant for BH work items described below
 *
 * Returns %true if a barrier was inserted and waited on, %false if
 * workqueues are not online, @work has no function set, or
 * start_flush_work() found nothing to wait for.
 */
static bool __flush_work(struct work_struct *work, bool from_cancel)
{
        struct wq_barrier barr;

        if (WARN_ON(!wq_online))
                return false;

        if (WARN_ON(!work->func))
                return false;

        if (!start_flush_work(work, &barr, from_cancel))
                return false;

        /*
         * start_flush_work() returned %true. If @from_cancel is set, we know
         * that @work must have been executing during start_flush_work() and
         * can't currently be queued. Its data must contain OFFQ bits. If @work
         * was queued on a BH workqueue, we also know that it was running in the
         * BH context and thus can be busy-waited.
         */
        if (from_cancel) {
                unsigned long data = *work_data_bits(work);

                if (!WARN_ON_ONCE(data & WORK_STRUCT_PWQ) &&
                    (data & WORK_OFFQ_BH)) {
                        /*
                         * On RT, prevent a live lock when %current preempted
                         * soft interrupt processing or prevents ksoftirqd from
                         * running by keeping flipping BH. If the BH work item
                         * runs on a different CPU then this has no effect other
                         * than doing the BH disable/enable dance for nothing.
                         * This is copied from
                         * kernel/softirq.c::tasklet_unlock_spin_wait().
                         */
                        /* poll the barrier instead of sleeping on it */
                        while (!try_wait_for_completion(&barr.done)) {
                                if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                                        local_bh_disable();
                                        local_bh_enable();
                                } else {
                                        cpu_relax();
                                }
                        }
                        goto out_destroy;
                }
        }

        /* default path: sleep until the barrier completes */
        wait_for_completion(&barr.done);

out_destroy:
        /* the barrier lives on this stack frame and is torn down before returning */
        destroy_work_on_stack(&barr.work);
        return true;
}

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Block until @work has finished executing.  Provided @work was not
 * requeued after the flush began, it is idle when this function returns.
 *
 * Context: may sleep.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
        bool waited;

        might_sleep();

        /* plain flush, not issued from the cancel path */
        waited = __flush_work(work, false);

        return waited;
}
EXPORT_SYMBOL_GPL(flush_work);

/**
 * flush_delayed_work - wait for a dwork to finish executing the last queueing
 * @dwork: the delayed work to flush
 *
 * If @dwork's timer is still pending it is deleted and the work item is
 * queued right away, then the work item is flushed.  As with flush_work(),
 * only the last queueing instance of @dwork is considered.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        /* timer removal and direct queueing happen with local IRQs off */
        local_irq_disable();
        if (timer_delete_sync(&dwork->timer))
                __queue_work(dwork->cpu, dwork->wq, work);
        local_irq_enable();

        return flush_work(work);
}
EXPORT_SYMBOL(flush_delayed_work);

/**
 * flush_rcu_work - wait for a rwork to finish executing the last queueing
 * @rwork: the rcu work to flush
 *
 * If @rwork is still marked pending, rcu_barrier() is called before the
 * underlying work item is flushed, and the call reports that it waited.
 *
 * Return:
 * %true if flush_rcu_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_rcu_work(struct rcu_work *rwork)
{
        struct work_struct *work = &rwork->work;

        /* not pending: an ordinary flush decides the return value */
        if (!test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
                return flush_work(work);

        rcu_barrier();
        flush_work(work);
        return true;
}
EXPORT_SYMBOL(flush_rcu_work);

/*
 * Bump the disable count in @offqd.  The count saturates at
 * 2^WORK_OFFQ_DISABLE_BITS - 1; an attempt to go past that leaves the
 * count unchanged and emits a one-time warning.
 */
static void work_offqd_disable(struct work_offq_data *offqd)
{
        const unsigned long max = (1lu << WORK_OFFQ_DISABLE_BITS) - 1;

        if (unlikely(offqd->disable >= max)) {
                WARN_ONCE(true, "workqueue: work disable count overflowed\n");
                return;
        }

        offqd->disable++;
}

/*
 * Drop the disable count in @offqd by one.  If the count is not positive
 * it is left unchanged and a one-time warning is emitted.
 */
static void work_offqd_enable(struct work_offq_data *offqd)
{
        if (unlikely(!(offqd->disable > 0))) {
                WARN_ONCE(true, "workqueue: work disable count underflowed\n");
                return;
        }

        offqd->disable--;
}

/*
 * Common implementation of the non-waiting cancel/disable operations.
 *
 * @work:   work item to cancel
 * @cflags: WORK_CANCEL_* flags; passed to work_grab_pending() and, when
 *          %WORK_CANCEL_DISABLE is set, also bumps @work's disable count
 *
 * Grabs @work, rewrites its off-queue data (with the updated disable count
 * if requested) while clearing the pending state, and restores the IRQ
 * state saved by work_grab_pending().
 *
 * Returns the result of work_grab_pending().
 */
static bool __cancel_work(struct work_struct *work, u32 cflags)
{
        struct work_offq_data offqd;
        unsigned long flags;
        int was_pending;

        was_pending = work_grab_pending(work, cflags, &flags);

        /* decode the current off-queue state so it can be written back */
        work_offqd_unpack(&offqd, *work_data_bits(work));

        if (cflags & WORK_CANCEL_DISABLE)
                work_offqd_disable(&offqd);

        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));

        local_irq_restore(flags);

        return was_pending;
}

/*
 * Common implementation of the waiting cancel/disable operations.
 *
 * @work:   work item to cancel
 * @cflags: WORK_CANCEL_* flags from the caller
 *
 * @work is always cancelled with %WORK_CANCEL_DISABLE set, then flushed
 * (unless workqueues are not online yet).  If the caller did not itself
 * request %WORK_CANCEL_DISABLE, the disable is undone with enable_work()
 * before returning.
 *
 * Returns the result of __cancel_work().
 */
static bool __cancel_work_sync(struct work_struct *work, u32 cflags)
{
        bool ret;

        ret = __cancel_work(work, cflags | WORK_CANCEL_DISABLE);

        /*
         * Context check: with WORK_OFFQ_BH set only hardirq context is
         * flagged; otherwise the caller must be able to sleep.
         */
        if (*work_data_bits(work) & WORK_OFFQ_BH)
                WARN_ON_ONCE(in_hardirq());
        else
                might_sleep();

        /*
         * Skip __flush_work() during early boot when we know that @work isn't
         * executing. This allows canceling during early boot.
         */
        if (wq_online)
                __flush_work(work, true);

        /* undo the disable taken above unless the caller asked for it */
        if (!(cflags & WORK_CANCEL_DISABLE))
                enable_work(work);

        return ret;
}

/*
 * Cancel @work without waiting for it and without disabling it.
 * See cancel_delayed_work()
 */
bool cancel_work(struct work_struct *work)
{
        bool was_pending = __cancel_work(work, 0);

        return was_pending;
}
EXPORT_SYMBOL(cancel_work);

/**
 * cancel_work_sync - cancel a work and wait for it to finish
 * @work: the work to cancel
 *
 * Cancel @work and wait until it is no longer executing.  This works even
 * for work items that re-queue themselves or move to another workqueue.
 * Provided nobody queues @work concurrently, it is neither pending nor
 * executing on any CPU once this function returns.
 *
 * Do not call cancel_work_sync(&delayed_work->work) on a delayed_work;
 * use cancel_delayed_work_sync() for those.
 *
 * Context: must be sleepable if @work was last queued on a non-BH
 * workqueue.  If @work was last queued on a BH workqueue, non-hardirq
 * atomic contexts, including BH, are allowed too.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool cancel_work_sync(struct work_struct *work)
{
        bool was_pending = __cancel_work_sync(work, 0);

        return was_pending;
}
EXPORT_SYMBOL_GPL(cancel_work_sync);

/**
 * cancel_delayed_work - cancel a delayed work
 * @dwork: delayed_work to cancel
 *
 * Kill off a pending delayed_work.
 *
 * Return: %true if @dwork was pending and canceled; %false if it wasn't
 * pending.
 *
 * Note:
 * The work callback function may still be running on return, unless
 * it returns %true and the work doesn't re-arm itself.  Explicitly flush or
 * use cancel_delayed_work_sync() to wait on it.
 *
 * This function is safe to call from any context including IRQ handler.
 */
bool cancel_delayed_work(struct delayed_work *dwork)
{
        return __cancel_work(&dwork->work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work);

/**
 * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
 * @dwork: the delayed work cancel
 *
 * This is cancel_work_sync() for delayed works.
 *
 * Return:
 * %true if @dwork was pending, %false otherwise.
 */
bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
        return __cancel_work_sync(&dwork->work, WORK_CANCEL_DELAYED);
}
EXPORT_SYMBOL(cancel_delayed_work_sync);

/**
 * disable_work - Disable and cancel a work item
 * @work: work item to disable
 *
 * Disable @work by incrementing its disable count and cancel it if currently
 * pending. As long as the disable count is non-zero, any attempt to queue @work
 * will fail and return %false. The maximum supported disable depth is 2 to the
 * power of %WORK_OFFQ_DISABLE_BITS, currently 65536.
 *
 * Can be called from any context. Returns %true if @work was pending, %false
 * otherwise.
 */
bool disable_work(struct work_struct *work)
{
        return __cancel_work(work, WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_work);

/**
 * disable_work_sync - Disable, cancel and drain a work item
 * @work: work item to disable
 *
 * Similar to disable_work() but also wait for @work to finish if currently
 * executing.
 *
 * Must be called from a sleepable context if @work was last queued on a non-BH
 * workqueue. Can also be called from non-hardirq atomic contexts including BH
 * if @work was last queued on a BH workqueue.
 *
 * Returns %true if @work was pending, %false otherwise.
 */
bool disable_work_sync(struct work_struct *work)
{
        return __cancel_work_sync(work, WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_work_sync);

/**
 * enable_work - Enable a work item
 * @work: work item to enable
 *
 * Reverse one disable_work[_sync]() by decrementing @work's disable count.
 * @work can be queued again only once the count is back at 0.
 *
 * Can be called from any context. Returns %true if the disable count reached 0.
 * Otherwise, %false.
 */
bool enable_work(struct work_struct *work)
{
        struct work_offq_data offqd;
        unsigned long flags;

        work_grab_pending(work, 0, &flags);

        /* decode, decrement the disable count, and write the state back */
        work_offqd_unpack(&offqd, *work_data_bits(work));
        work_offqd_enable(&offqd);
        set_work_pool_and_clear_pending(work, offqd.pool_id,
                                        work_offqd_pack_flags(&offqd));

        local_irq_restore(flags);

        return !offqd.disable;
}
EXPORT_SYMBOL_GPL(enable_work);

/**
 * disable_delayed_work - Disable and cancel a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work() for delayed work items.
 */
bool disable_delayed_work(struct delayed_work *dwork)
{
        return __cancel_work(&dwork->work,
                             WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_delayed_work);

/**
 * disable_delayed_work_sync - Disable, cancel and drain a delayed work item
 * @dwork: delayed work item to disable
 *
 * disable_work_sync() for delayed work items.
 */
bool disable_delayed_work_sync(struct delayed_work *dwork)
{
        return __cancel_work_sync(&dwork->work,
                                  WORK_CANCEL_DELAYED | WORK_CANCEL_DISABLE);
}
EXPORT_SYMBOL_GPL(disable_delayed_work_sync);

/**
 * enable_delayed_work - Enable a delayed work item
 * @dwork: delayed work item to enable
 *
 * The delayed-work counterpart of enable_work(); operates on the work item
 * embedded in @dwork.
 */
bool enable_delayed_work(struct delayed_work *dwork)
{
        struct work_struct *work = &dwork->work;

        return enable_work(work);
}
EXPORT_SYMBOL_GPL(enable_delayed_work);

/**
 * schedule_on_each_cpu - execute a function synchronously on each online CPU
 * @func: the function to call
 *
 * Runs @func once on every online CPU through the system workqueue and
 * does not return until all of them have finished.  A per-CPU work item is
 * allocated for the duration of the call and CPU hotplug is held off with
 * cpus_read_lock() while the items are queued and flushed.
 * schedule_on_each_cpu() is very slow.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
int schedule_on_each_cpu(work_func_t func)
{
        struct work_struct __percpu *pcpu_works;
        int cpu;

        pcpu_works = alloc_percpu(struct work_struct);
        if (!pcpu_works)
                return -ENOMEM;

        cpus_read_lock();

        /* first pass: initialize and queue one item per online CPU */
        for_each_online_cpu(cpu) {
                struct work_struct *item = per_cpu_ptr(pcpu_works, cpu);

                INIT_WORK(item, func);
                schedule_work_on(cpu, item);
        }

        /* second pass: wait for every queued item */
        for_each_online_cpu(cpu)
                flush_work(per_cpu_ptr(pcpu_works, cpu));

        cpus_read_unlock();

        free_percpu(pcpu_works);
        return 0;
}

/**
 * execute_in_process_context - reliably execute the routine with user context
 * @fn:                the function to execute
 * @ew:                guaranteed storage for the execute work structure (must
 *                be available when the work executes)
 *
 * Calls @fn directly when not running in interrupt context; otherwise
 * initializes @ew's work item with @fn and schedules it for later
 * execution.
 *
 * Return:        0 - function was executed
 *                1 - function was scheduled for execution
 */
int execute_in_process_context(work_func_t fn, struct execute_work *ew)
{
        struct work_struct *work = &ew->work;

        if (in_interrupt()) {
                /* cannot run @fn here: defer it to the workqueue */
                INIT_WORK(work, fn);
                schedule_work(work);
                return 1;
        }

        fn(work);
        return 0;
}
EXPORT_SYMBOL_GPL(execute_in_process_context);

/**
 * free_workqueue_attrs - free a workqueue_attrs
 * @attrs: workqueue_attrs to free, may be %NULL
 *
 * Undo alloc_workqueue_attrs(): release both cpumasks and the structure
 * itself.  A %NULL @attrs is ignored.
 */
void free_workqueue_attrs(struct workqueue_attrs *attrs)
{
        if (!attrs)
                return;

        free_cpumask_var(attrs->cpumask);
        free_cpumask_var(attrs->__pod_cpumask);
        kfree(attrs);
}

/**
 * alloc_workqueue_attrs - allocate a workqueue_attrs
 *
 * Allocate a zeroed workqueue_attrs together with its two cpumasks, set
 * the cpumask to cpu_possible_mask and the affinity scope to %WQ_AFFN_DFL,
 * and return it.
 *
 * Return: The allocated new workqueue_attr on success. %NULL on failure.
 */
struct workqueue_attrs *alloc_workqueue_attrs(void)
{
        struct workqueue_attrs *attrs;

        /*
         * The three allocations are attempted in order; the first failure
         * short-circuits the rest.  free_workqueue_attrs() copes with both
         * a NULL @attrs and a partially set up one.
         */
        attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
        if (!attrs ||
            !alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL) ||
            !alloc_cpumask_var(&attrs->__pod_cpumask, GFP_KERNEL)) {
                free_workqueue_attrs(attrs);
                return NULL;
        }

        cpumask_copy(attrs->cpumask, cpu_possible_mask);
        attrs->affn_scope = WQ_AFFN_DFL;

        return attrs;
}

/*
 * Copy every field of @from into @to, including both cpumasks.  @to's
 * cpumasks must already be allocated.
 */
static void copy_workqueue_attrs(struct workqueue_attrs *to,
                                 const struct workqueue_attrs *from)
{
        /* scalar fields */
        to->nice = from->nice;
        to->affn_strict = from->affn_strict;

        /*
         * Unlike hash and equality test, copying shouldn't ignore wq-only
         * fields as copying is used for both pool and wq attrs. Instead,
         * get_unbound_pool() explicitly clears the fields.
         */
        to->affn_scope = from->affn_scope;
        to->ordered = from->ordered;

        /* cpumasks */
        cpumask_copy(to->cpumask, from->cpumask);
        cpumask_copy(to->__pod_cpumask, from->__pod_cpumask);
}

/*
 * Some attrs fields are workqueue-only. Clear them for worker_pool's. See the
 * comments in 'struct workqueue_attrs' definition.
 *
 * With affn_strict set, the cpumask is additionally reset to
 * cpu_possible_mask.
 */
static void wqattrs_clear_for_pool(struct workqueue_attrs *attrs)
{
        if (attrs->affn_strict)
                cpumask_copy(attrs->cpumask, cpu_possible_mask);

        attrs->ordered = false;
        attrs->affn_scope = WQ_AFFN_NR_TYPES;
}

/*
 * Hash the content of @attrs: nice, affn_strict and __pod_cpumask always
 * contribute; cpumask contributes only when affn_strict is not set.
 */
static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
{
        u32 hash;

        hash = jhash_1word(attrs->nice, 0);
        hash = jhash_1word(attrs->affn_strict, hash);
        hash = jhash(cpumask_bits(attrs->__pod_cpumask),
                     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);

        if (attrs->affn_strict)
                return hash;

        return jhash(cpumask_bits(attrs->cpumask),
                     BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
}

/*
 * Content equality test.  nice, affn_strict and __pod_cpumask must match;
 * cpumask is compared only when affn_strict is not set.
 */
static bool wqattrs_equal(const struct workqueue_attrs *a,
                          const struct workqueue_attrs *b)
{
        if (a->nice != b->nice || a->affn_strict != b->affn_strict)
                return false;

        if (!cpumask_equal(a->__pod_cpumask, b->__pod_cpumask))
                return false;

        /* strict attrs ignore cpumask; otherwise it must match too */
        return a->affn_strict || cpumask_equal(a->cpumask, b->cpumask);
}

/*
 * Update @attrs with actually available CPUs: restrict @attrs->cpumask to
 * its intersection with @unbound_cpumask, and if the two don't overlap use
 * @unbound_cpumask as is.
 */
static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
                                      const cpumask_t *unbound_cpumask)
{
        struct cpumask *effective = attrs->cpumask;

        cpumask_and(effective, effective, unbound_cpumask);

        /* empty intersection: fall back to @unbound_cpumask */
        if (unlikely(cpumask_empty(effective)))
                cpumask_copy(effective, unbound_cpumask);
}

/*
 * Find the wq_pod_type to use for @attrs.  %WQ_AFFN_DFL resolves to the
 * current wq_affn_dfl.  If the selected type has no pods (or the scope is
 * the invalid %WQ_AFFN_NR_TYPES, which also warns), fall back to
 * %WQ_AFFN_SYSTEM.  Must be called with wq_pool_mutex held.
 */
static const struct wq_pod_type *
wqattrs_pod_type(const struct workqueue_attrs *attrs)
{
        struct wq_pod_type *pod_type;
        enum wq_affn_scope scope;

        /* to synchronize access to wq_affn_dfl */
        lockdep_assert_held(&wq_pool_mutex);

        scope = (attrs->affn_scope == WQ_AFFN_DFL) ? wq_affn_dfl
                                                   : attrs->affn_scope;
        pod_type = &wq_pod_types[scope];

        if (!WARN_ON_ONCE(attrs->affn_scope == WQ_AFFN_NR_TYPES) &&
            likely(pod_type->nr_pods))
                return pod_type;

        /*
         * Before workqueue_init_topology(), only SYSTEM is available which is
         * initialized in workqueue_init_early().
         */
        pod_type = &wq_pod_types[WQ_AFFN_SYSTEM];
        BUG_ON(!pod_type->nr_pods);

        return pod_type;
}

/**
 * init_worker_pool - initialize a newly zalloc'd worker_pool
 * @pool: worker_pool to initialize
 *
 * Set up every field of a freshly zeroed @pool and allocate @pool->attrs.
 * The attrs allocation is the last step and the only one that can fail.
 *
 * Return: 0 on success, -errno on failure.  Even on failure, all fields
 * inside @pool proper are initialized and put_unbound_pool() can be called
 * on @pool safely to release it.
 */
static int init_worker_pool(struct worker_pool *pool)
{
        /* identity and bookkeeping; the pool starts out disassociated */
        pool->id = -1;
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->flags |= POOL_DISASSOCIATED;
        pool->watchdog_ts = jiffies;
        pool->refcnt = 1;

        /* lock and containers */
        raw_spin_lock_init(&pool->lock);
        INIT_LIST_HEAD(&pool->worklist);
        INIT_LIST_HEAD(&pool->idle_list);
        INIT_LIST_HEAD(&pool->workers);
        hash_init(pool->busy_hash);
        INIT_HLIST_NODE(&pool->hash_node);
        ida_init(&pool->worker_ida);

        /* timers and the idle-cull work item */
        timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);
        timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
        INIT_WORK(&pool->idle_cull_work, idle_cull_fn);

        /* shouldn't fail above this point */
        pool->attrs = alloc_workqueue_attrs();
        if (!pool->attrs)
                return -ENOMEM;

        /* pools don't carry the workqueue-only attrs fields */
        wqattrs_clear_for_pool(pool->attrs);

        return 0;
}

#ifdef CONFIG_LOCKDEP
/*
 * Register @wq's lockdep key and initialize its embedded lockdep map.  The
 * map is named "(wq_completion)<wq name>"; if that string cannot be
 * allocated, @wq->name is used directly.
 */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
        char *name;

        lockdep_register_key(&wq->key);

        name = kasprintf(GFP_KERNEL, "%s%s", "(wq_completion)", wq->name);
        wq->lock_name = name ? name : wq->name;

        wq->lockdep_map = &wq->__lockdep_map;
        lockdep_init_map(wq->lockdep_map, wq->lock_name, &wq->key, 0);
}

/* Unregister @wq's lockdep key, but only if @wq uses its embedded map. */
static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
        if (wq->lockdep_map == &wq->__lockdep_map)
                lockdep_unregister_key(&wq->key);
}

/*
 * Free the lock name allocated by wq_init_lockdep().  Nothing is freed when
 * @wq doesn't use its embedded map or when the name aliases @wq->name.
 */
static void wq_free_lockdep(struct workqueue_struct *wq)
{
        if (wq->lockdep_map == &wq->__lockdep_map &&
            wq->lock_name != wq->name)
                kfree(wq->lock_name);
}
#else
/* !CONFIG_LOCKDEP: the lockdep helpers are no-ops */
static void wq_init_lockdep(struct workqueue_struct *wq)
{
}

static void wq_unregister_lockdep(struct workqueue_struct *wq)
{
}

static void wq_free_lockdep(struct workqueue_struct *wq)
{
}
#endif

/*
 * Free every per-node entry of @nna_ar plus the fallback entry stored at
 * index nr_node_ids, resetting each slot to NULL afterwards.
 */
static void free_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        struct wq_node_nr_active **fallback = &nna_ar[nr_node_ids];
        int nid;

        for_each_node(nid) {
                kfree(nna_ar[nid]);
                nna_ar[nid] = NULL;
        }

        kfree(*fallback);
        *fallback = NULL;
}

/*
 * Initialize @nna: zero active count, max set to WQ_DFL_MIN_ACTIVE, and an
 * initialized lock and empty pending_pwqs list.
 */
static void init_node_nr_active(struct wq_node_nr_active *nna)
{
        raw_spin_lock_init(&nna->lock);
        INIT_LIST_HEAD(&nna->pending_pwqs);

        atomic_set(&nna->nr, 0);
        nna->max = WQ_DFL_MIN_ACTIVE;
}

/*
 * Each node's nr_active counter will be accessed mostly from its own node and
 * should be allocated in the node.
 *
 * Fills @nna_ar with one initialized counter per node plus a fallback at
 * index nr_node_ids.  Returns 0 on success.  On allocation failure the
 * whole array is released through free_node_nr_active() and -ENOMEM is
 * returned.
 * NOTE(review): the failure path frees every slot, so slots not yet filled
 * are presumably expected to be NULL on entry - confirm against callers.
 */
static int alloc_node_nr_active(struct wq_node_nr_active **nna_ar)
{
        struct wq_node_nr_active *cnt;
        int nid;

        for_each_node(nid) {
                cnt = kzalloc_node(sizeof(*cnt), GFP_KERNEL, nid);
                if (!cnt)
                        goto fail;

                init_node_nr_active(cnt);
                nna_ar[nid] = cnt;
        }

        /* [nr_node_ids] is used as the fallback */
        cnt = kzalloc_node(sizeof(*cnt), GFP_KERNEL, NUMA_NO_NODE);
        if (!cnt)
                goto fail;

        init_node_nr_active(cnt);
        nna_ar[nr_node_ids] = cnt;

        return 0;

fail:
        free_node_nr_active(nna_ar);
        return -ENOMEM;
}

/*
 * RCU callback releasing a workqueue_struct: per-node nr_active counters
 * (unbound workqueues only), the lockdep name, the per-CPU pwq pointers,
 * the unbound attrs and finally the structure itself.
 */
static void rcu_free_wq(struct rcu_head *rcu)
{
        struct workqueue_struct *wq;

        wq = container_of(rcu, struct workqueue_struct, rcu);

        if (wq->flags & WQ_UNBOUND)
                free_node_nr_active(wq->node_nr_active);

        wq_free_lockdep(wq);

        free_percpu(wq->cpu_pwq);
        free_workqueue_attrs(wq->unbound_attrs);

        kfree(wq);
}

/*
 * RCU callback releasing a worker_pool: destroys its worker IDA, frees its
 * attrs and then the pool itself.
 */
static void rcu_free_pool(struct rcu_head *rcu)
{
        struct worker_pool *pool;

        pool = container_of(rcu, struct worker_pool, rcu);

        ida_destroy(&pool->worker_ida);
        free_workqueue_attrs(pool->attrs);

        kfree(pool);
}

/**
 * put_unbound_pool - put a worker_pool
 * @pool: worker_pool to put
 *
 * Put @pool.  If its refcnt reaches zero, it gets destroyed in RCU
 * safe manner.  get_unbound_pool() calls this function on its failure path
 * and this function should be able to release pools which went through,
 * successfully or not, init_worker_pool().
 *
 * Destruction releases the pool id and unhashes @pool, claims the manager
 * role, marks every idle worker dying, detaches and reaps those workers,
 * shuts down the pool's timers and cull work, and finally queues
 * rcu_free_pool().
 *
 * Should be called with wq_pool_mutex held.
 */
static void put_unbound_pool(struct worker_pool *pool)
{
        struct worker *worker;
        LIST_HEAD(cull_list);

        lockdep_assert_held(&wq_pool_mutex);

        /* other users remain, nothing more to do */
        if (--pool->refcnt)
                return;

        /* sanity checks */
        if (WARN_ON(!(pool->cpu < 0)) ||
            WARN_ON(!list_empty(&pool->worklist)))
                return;

        /* release id and unhash */
        if (pool->id >= 0)
                idr_remove(&worker_pool_idr, pool->id);
        hash_del(&pool->hash_node);

        /*
         * Become the manager and destroy all workers.  This prevents
         * @pool's workers from blocking on attach_mutex.  We're the last
         * manager and @pool gets freed with the flag set.
         *
         * Having a concurrent manager is quite unlikely to happen as we can
         * only get here with
         *   pwq->refcnt == pool->refcnt == 0
         * which implies no work queued to the pool, which implies no worker can
         * become the manager. However a worker could have taken the role of
         * manager before the refcnts dropped to 0, since maybe_create_worker()
         * drops pool->lock
         */
        while (true) {
                rcuwait_wait_event(&manager_wait,
                                   !(pool->flags & POOL_MANAGER_ACTIVE),
                                   TASK_UNINTERRUPTIBLE);

                /*
                 * Re-check under both locks; on success the loop is left
                 * with wq_pool_attach_mutex and pool->lock still held.
                 */
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);
                if (!(pool->flags & POOL_MANAGER_ACTIVE)) {
                        pool->flags |= POOL_MANAGER_ACTIVE;
                        break;
                }
                raw_spin_unlock_irq(&pool->lock);
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* collect every idle worker on cull_list; none should remain */
        while ((worker = first_idle_worker(pool)))
                set_worker_dying(worker, &cull_list);
        WARN_ON(pool->nr_workers || pool->nr_idle);
        raw_spin_unlock_irq(&pool->lock);

        detach_dying_workers(&cull_list);

        mutex_unlock(&wq_pool_attach_mutex);

        reap_dying_workers(&cull_list);

        /* shut down the timers */
        timer_delete_sync(&pool->idle_timer);
        cancel_work_sync(&pool->idle_cull_work);
        timer_delete_sync(&pool->mayday_timer);

        /* RCU protected to allow dereferences from get_work_pool() */
        call_rcu(&pool->rcu, rcu_free_pool);
}

/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Look up a hashed worker_pool whose attributes equal @attrs and, if found,
 * bump its reference count and return it.  Otherwise allocate and
 * initialize a new pool, assign it an id, create its initial worker when
 * workqueues are online, and add it to the hash.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
        struct wq_pod_type *numa_pt = &wq_pod_types[WQ_AFFN_NUMA];
        u32 key = wqattrs_hash(attrs);
        struct worker_pool *pool;
        int target_node = NUMA_NO_NODE;
        int i;

        lockdep_assert_held(&wq_pool_mutex);

        /* reuse an already hashed pool when its attributes match */
        hash_for_each_possible(unbound_pool_hash, pool, hash_node, key) {
                if (!wqattrs_equal(pool->attrs, attrs))
                        continue;
                pool->refcnt++;
                return pool;
        }

        /* If __pod_cpumask is contained inside a NUMA pod, that's our node */
        for (i = 0; i < numa_pt->nr_pods; i++) {
                if (!cpumask_subset(attrs->__pod_cpumask, numa_pt->pod_cpus[i]))
                        continue;
                target_node = numa_pt->pod_node[i];
                break;
        }

        /* no match, build a new pool */
        pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
        if (!pool)
                return NULL;
        if (init_worker_pool(pool) < 0)
                goto fail;

        pool->node = target_node;
        copy_workqueue_attrs(pool->attrs, attrs);
        wqattrs_clear_for_pool(pool->attrs);

        if (worker_pool_assign_id(pool) < 0)
                goto fail;

        /* create and start the initial worker */
        if (wq_online && !create_worker(pool))
                goto fail;

        /* make the new pool visible to later lookups */
        hash_add(unbound_pool_hash, &pool->hash_node, key);

        return pool;

fail:
        /* put_unbound_pool() copes with partially initialized pools */
        put_unbound_pool(pool);
        return NULL;
}

/*
 * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
 * refcnt and needs to be destroyed.
 *
 * Unlinks @pwq from its workqueue if it was linked, drops the pool reference
 * for unbound workqueues, removes @pwq from its node's pending list and
 * frees it after an RCU grace period.  If @pwq was the last one linked to
 * the workqueue, the workqueue itself is scheduled for RCU freeing too.
 */
static void pwq_release_workfn(struct kthread_work *work)
{
        struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
                                                  release_work);
        struct workqueue_struct *wq = pwq->wq;
        struct worker_pool *pool = pwq->pool;
        bool is_last = false;

        /*
         * When @pwq is not linked, it doesn't hold any reference to the
         * @wq, and @wq is invalid to access.
         */
        if (!list_empty(&pwq->pwqs_node)) {
                mutex_lock(&wq->mutex);
                list_del_rcu(&pwq->pwqs_node);
                is_last = list_empty(&wq->pwqs);

                /*
                 * For ordered workqueue with a plugged dfl_pwq, restart it now.
                 */
                if (!is_last && (wq->flags & __WQ_ORDERED))
                        unplug_oldest_pwq(wq);

                mutex_unlock(&wq->mutex);
        }

        /* put_unbound_pool() must be called with wq_pool_mutex held */
        if (wq->flags & WQ_UNBOUND) {
                mutex_lock(&wq_pool_mutex);
                put_unbound_pool(pool);
                mutex_unlock(&wq_pool_mutex);
        }

        /* still queued on a node's pending list? take it off under its lock */
        if (!list_empty(&pwq->pending_node)) {
                struct wq_node_nr_active *nna =
                        wq_node_nr_active(pwq->wq, pwq->pool->node);

                raw_spin_lock_irq(&nna->lock);
                list_del_init(&pwq->pending_node);
                raw_spin_unlock_irq(&nna->lock);
        }

        kfree_rcu(pwq, rcu);

        /*
         * If we're the last pwq going away, @wq is already dead and no one
         * is gonna access it anymore.  Schedule RCU free.
         */
        if (is_last) {
                wq_unregister_lockdep(wq);
                call_rcu(&wq->rcu, rcu_free_wq);
        }
}

/*
 * Initialize the newly allocated @pwq and associate it with @wq and @pool.
 * @pwq is zeroed first and starts out unlinked with a single reference.
 */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                     struct worker_pool *pool)
{
        /* @pwq's address must not have bits outside WORK_STRUCT_PWQ_MASK */
        BUG_ON((unsigned long)pwq & ~WORK_STRUCT_PWQ_MASK);

        memset(pwq, 0, sizeof(*pwq));

        /* list membership and the release work item */
        INIT_LIST_HEAD(&pwq->inactive_works);
        INIT_LIST_HEAD(&pwq->pending_node);
        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        kthread_init_work(&pwq->release_work, pwq_release_workfn);

        /* associations and scalar state */
        pwq->wq = wq;
        pwq->pool = pool;
        pwq->refcnt = 1;
        pwq->flush_color = -1;
}

/*
 * Sync @pwq with the current work_color of its workqueue and append it to
 * the workqueue's pwq list.  Must be called with the workqueue's mutex held.
 */
static void link_pwq(struct pool_workqueue *pwq)
{
        struct workqueue_struct *owner = pwq->wq;

        lockdep_assert_held(&owner->mutex);

        /* may be called multiple times; only an unlinked @pwq is processed */
        if (list_empty(&pwq->pwqs_node)) {
                /* set the matching work_color */
                pwq->work_color = owner->work_color;

                /* link in @pwq */
                list_add_tail_rcu(&pwq->pwqs_node, &owner->pwqs);
        }
}

/*
 * Obtain a pool matching @attrs and create a pwq associating the pool with
 * @wq.  Returns the initialized pwq, or NULL if either the pool or the pwq
 * could not be obtained.  Must be called with wq_pool_mutex held.
 */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct pool_workqueue *new_pwq;
        struct worker_pool *pool;

        lockdep_assert_held(&wq_pool_mutex);

        pool = get_unbound_pool(attrs);
        if (!pool)
                return NULL;

        new_pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
        if (new_pwq) {
                init_pwq(new_pwq, wq, pool);
                return new_pwq;
        }

        /* no pwq will take over the pool reference, so give it back */
        put_unbound_pool(pool);
        return NULL;
}

/* Acquire wq_pool_mutex; paired with apply_wqattrs_unlock(). */
static void apply_wqattrs_lock(void)
{
        mutex_lock(&wq_pool_mutex);
}

/* Release wq_pool_mutex taken by apply_wqattrs_lock(). */
static void apply_wqattrs_unlock(void)
{
        mutex_unlock(&wq_pool_mutex);
}

/**
 * wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
 * @attrs: the wq_attrs of the default pwq of the target workqueue
 * @cpu: the target CPU
 *
 * Calculate the cpumask a workqueue with @attrs should use on the pod that
 * @cpu belongs to.  The result is stored in @attrs->__pod_cpumask.
 *
 * The result is the intersection of the pod's CPUs and @attrs->cpumask,
 * unless that intersection contains no CPU from wq_online_cpumask, in which
 * case the whole @attrs->cpumask is used instead.
 *
 * The caller is responsible for ensuring that the cpumask of the pod stays
 * stable.
 */
static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu)
{
        const struct wq_pod_type *pod_type = wqattrs_pod_type(attrs);
        int pod_id = pod_type->cpu_pod[cpu];

        /* the CPUs of this pod which @attrs wants */
        cpumask_and(attrs->__pod_cpumask, pod_type->pod_cpus[pod_id],
                    attrs->cpumask);

        /* at least one of them is online, keep the pod-local mask */
        if (cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask))
                return;

        /* none online, fall back to everything @attrs allows */
        cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
}

/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq,
                                        int cpu, struct pool_workqueue *pwq)
{
        struct pool_workqueue __rcu **slot = unbound_pwq_slot(wq, cpu);
        struct pool_workqueue *old_pwq;

        lockdep_assert_held(&wq_pool_mutex);
        lockdep_assert_held(&wq->mutex);

        /* link_pwq() can handle duplicate calls */
        link_pwq(pwq);

        old_pwq = rcu_access_pointer(*slot);
        rcu_assign_pointer(*slot, pwq);
        return old_pwq;
}

/*
 * context to store the prepared attrs & pwqs before applying
 *
 * apply_wqattrs_commit() swaps the prepared pwqs with the ones previously
 * installed, so after commit @dfl_pwq and @pwq_tbl hold the old pwqs, which
 * apply_wqattrs_cleanup() then puts.
 */
struct apply_wqattrs_ctx {
        struct workqueue_struct        *wq;                /* target workqueue */
        struct workqueue_attrs        *attrs;                /* attrs to apply */
        struct list_head        list;                /* queued for batching commit */
        /* default pwq, installed with @cpu == -1 */
        struct pool_workqueue        *dfl_pwq;
        /* one pwq per CPU; allocated with nr_cpu_ids entries */
        struct pool_workqueue        *pwq_tbl[];
};

/*
 * Free the resources after success or abort: put every pwq held in @ctx,
 * free the attrs and @ctx itself.  A NULL @ctx is accepted and ignored.
 */
static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        if (!ctx)
                return;

        for_each_possible_cpu(cpu)
                put_pwq_unlocked(ctx->pwq_tbl[cpu]);
        put_pwq_unlocked(ctx->dfl_pwq);

        free_workqueue_attrs(ctx->attrs);
        kfree(ctx);
}

/*
 * allocate the attrs and pwqs for later installation
 *
 * Builds a context holding a default pwq plus one pwq per possible CPU for
 * @wq according to @attrs restricted by @unbound_cpumask.  For ordered
 * attrs every per-CPU entry is another reference to the default pwq.
 *
 * Must be called with wq_pool_mutex held.  Returns the context on success,
 * ERR_PTR(-EINVAL) if @attrs->affn_scope is out of range and
 * ERR_PTR(-ENOMEM) if any allocation fails.
 */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
                      const struct workqueue_attrs *attrs,
                      const cpumask_var_t unbound_cpumask)
{
        struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs;
        int cpu;

        lockdep_assert_held(&wq_pool_mutex);

        if (WARN_ON(attrs->affn_scope < 0 ||
                    attrs->affn_scope >= WQ_AFFN_NR_TYPES))
                return ERR_PTR(-EINVAL);

        /* zeroed, so unfilled pwq slots and ->attrs read as NULL on abort */
        ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL);

        new_attrs = alloc_workqueue_attrs();
        if (!ctx || !new_attrs)
                goto out_free;

        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
        copy_workqueue_attrs(new_attrs, attrs);
        wqattrs_actualize_cpumask(new_attrs, unbound_cpumask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
        if (!ctx->dfl_pwq)
                goto out_free;

        for_each_possible_cpu(cpu) {
                if (new_attrs->ordered) {
                        /* every CPU slot takes its own ref on the default pwq */
                        ctx->dfl_pwq->refcnt++;
                        ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
                } else {
                        wq_calc_pod_cpumask(new_attrs, cpu);
                        ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
                        if (!ctx->pwq_tbl[cpu])
                                goto out_free;
                }
        }

        /* save the user configured attrs and sanitize it. */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
        cpumask_copy(new_attrs->__pod_cpumask, new_attrs->cpumask);
        ctx->attrs = new_attrs;

        /*
         * For initialized ordered workqueues, there should only be one pwq
         * (dfl_pwq). Set the plugged flag of ctx->dfl_pwq to suspend execution
         * of newly queued work items until execution of older work items in
         * the old pwq's have completed.
         */
        if ((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))
                ctx->dfl_pwq->plugged = true;

        ctx->wq = wq;
        return ctx;

out_free:
        /* ctx->attrs is not set on this path, so new_attrs is freed here */
        free_workqueue_attrs(new_attrs);
        apply_wqattrs_cleanup(ctx);
        return ERR_PTR(-ENOMEM);
}

/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
        int cpu;

        /* all pwqs have been created successfully, let's install'em */
        mutex_lock(&ctx->wq->mutex);

        copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);

        /* save the previous pwqs and install the new ones */
        for_each_possible_cpu(cpu)
                ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu,
                                                        ctx->pwq_tbl[cpu]);
        ctx->dfl_pwq = install_unbound_pwq(ctx->wq, -1, ctx->dfl_pwq);

        /* update node_nr_active->max */
        wq_update_node_max_active(ctx->wq, -1);

        /* rescuer needs to respect wq cpumask changes */
        if (ctx->wq->rescuer)
                set_cpus_allowed_ptr(ctx->wq->rescuer->task,
                                     unbound_effective_cpumask(ctx->wq));

        mutex_unlock(&ctx->wq->mutex);
}

/*
 * Prepare, commit and clean up an attrs change for @wq in one go.
 * Returns 0 on success, -EINVAL for a non-unbound @wq, or the error
 * reported by apply_wqattrs_prepare().
 */
static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;

        /* only unbound workqueues can change attributes */
        if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                return -EINVAL;

        ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_cpumask);
        if (!IS_ERR(ctx)) {
                /* the ctx has been prepared successfully, let's commit it */
                apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
                return 0;
        }

        return PTR_ERR(ctx);
}

/**
 * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
 * @wq: the target workqueue
 * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
 *
 * Apply @attrs to an unbound workqueue @wq. Unless disabled, this function maps
 * a separate pwq to each CPU pod with possible CPUs in @attrs->cpumask so that
 * work items are affine to the pod they were issued on. Older pwqs are
 * released as in-flight work items finish. Note that a work item which
 * repeatedly requeues itself back-to-back will stay on its current pwq.
 *
 * Takes wq_pool_mutex for the duration of the update and performs
 * GFP_KERNEL allocations.
 *
 * Return: 0 on success and -errno on failure.
 */
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs)
{
        int err;

        mutex_lock(&wq_pool_mutex);
        err = apply_workqueue_attrs_locked(wq, attrs);
        mutex_unlock(&wq_pool_mutex);

        return err;
}

/**
 * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
 * @wq: the target workqueue
 * @cpu: the CPU to update the pwq slot for
 *
 * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
 * %CPU_DOWN_FAILED.  @cpu is in the same pod of the CPU being hot[un]plugged.
 *
 * If pod affinity can't be adjusted due to memory allocation failure, it falls
 * back to @wq->dfl_pwq which may not be optimal but is always correct.
 *
 * Note that when the last allowed CPU of a pod goes offline for a workqueue
 * with a cpumask spanning multiple pods, the workers which were already
 * executing the work items for the workqueue will lose their CPU affinity and
 * may execute on any CPU. This is similar to how per-cpu workqueues behave on
 * CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
 * responsibility to flush the work item from CPU_DOWN_PREPARE.
 */
static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
{
        struct pool_workqueue *new_pwq, *prev_pwq;
        struct workqueue_attrs *scratch;

        lockdep_assert_held(&wq_pool_mutex);

        /* only non-ordered unbound workqueues have per-pod pwqs to update */
        if (!(wq->flags & WQ_UNBOUND))
                return;
        if (wq->unbound_attrs->ordered)
                return;

        /*
         * We don't wanna alloc/free wq_attrs for each wq for each CPU.
         * Let's use a preallocated one.  The following buf is protected by
         * CPU hotplug exclusion.
         */
        scratch = unbound_wq_update_pwq_attrs_buf;

        copy_workqueue_attrs(scratch, wq->unbound_attrs);
        wqattrs_actualize_cpumask(scratch, wq_unbound_cpumask);

        /* nothing to do if the target cpumask matches the current pwq */
        wq_calc_pod_cpumask(scratch, cpu);
        if (wqattrs_equal(scratch, unbound_pwq(wq, cpu)->pool->attrs))
                return;

        /* create a new pwq */
        new_pwq = alloc_unbound_pwq(wq, scratch);
        if (!new_pwq)
                pr_warn("workqueue: allocation failed while updating CPU pod affinity of \"%s\"\n",
                        wq->name);

        mutex_lock(&wq->mutex);
        if (!new_pwq) {
                /* fall back to the default pwq, taking a reference on it */
                new_pwq = unbound_pwq(wq, -1);
                raw_spin_lock_irq(&new_pwq->pool->lock);
                get_pwq(new_pwq);
                raw_spin_unlock_irq(&new_pwq->pool->lock);
        }
        prev_pwq = install_unbound_pwq(wq, cpu, new_pwq);
        mutex_unlock(&wq->mutex);

        /* drop the slot's reference on whatever was installed before */
        put_pwq_unlocked(prev_pwq);
}

static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
        bool highpri = wq->flags & WQ_HIGHPRI;
        int cpu, ret;

        lockdep_assert_held(&wq_pool_mutex);

        wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
        if (!wq->cpu_pwq)
                goto enomem;

        if (!(wq->flags & WQ_UNBOUND)) {
                struct worker_pool __percpu *pools;

                if (wq->flags & WQ_BH)
                        pools = bh_worker_pools;
                else
                        pools = cpu_worker_pools;

                for_each_possible_cpu(cpu) {
                        struct pool_workqueue **pwq_p;
                        struct worker_pool *pool;

                        pool = &(per_cpu_ptr(pools, cpu)[highpri]);
                        pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);

                        *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL,
                                                       pool->node);
                        if (!*pwq_p)
                                goto enomem;

                        init_pwq(*pwq_p, wq, pool);

                        mutex_lock(&wq->mutex);
                        link_pwq(*pwq_p);
                        mutex_unlock(&wq->mutex);
                }
                return 0;
        }

        if (wq->flags & __WQ_ORDERED) {
                struct pool_workqueue *dfl_pwq;

                ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
                /* there should only be single pwq for ordering guarantee */
                dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
                WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
                              wq->pwqs.prev != &dfl_pwq->pwqs_node),
                     "ordering guarantee broken for workqueue %s\n", wq->name);
        } else {
                ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
        }

        return ret;

enomem:
        if (wq->cpu_pwq) {
                for_each_possible_cpu(cpu) {
                        struct pool_workqueue *pwq = *per_cpu_ptr(wq->cpu_pwq, cpu);

                        if (pwq)
                                kmem_cache_free(pwq_cache, pwq);
                }
                free_percpu(wq->cpu_pwq);
                wq->cpu_pwq = NULL;
        }
        return -ENOMEM;
}

static int wq_clamp_max_active(int max_active, unsigned int flags,
                               const char *name)
{
        if (max_active < 1 || max_active > WQ_MAX_ACTIVE)
                pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n",
                        max_active, name, 1, WQ_MAX_ACTIVE);

        return clamp_val(max_active, 1, WQ_MAX_ACTIVE);
}

/*
 * Workqueues which may be used during memory reclaim should have a rescuer
 * to guarantee forward progress.
 *
 * Creates and starts the rescuer for @wq if WQ_MEM_RECLAIM is set; does
 * nothing and returns 0 otherwise.  Must be called with wq_pool_mutex held.
 *
 * Returns 0 on success, -ENOMEM if the rescuer worker can't be allocated,
 * or the error from kthread_create() if its kthread can't be created.
 */
static int init_rescuer(struct workqueue_struct *wq)
{
        struct worker *rescuer;
        char id_buf[WORKER_ID_LEN];
        int ret;

        lockdep_assert_held(&wq_pool_mutex);

        if (!(wq->flags & WQ_MEM_RECLAIM))
                return 0;

        rescuer = alloc_worker(NUMA_NO_NODE);
        if (!rescuer) {
                pr_err("workqueue: Failed to allocate a rescuer for wq \"%s\"\n",
                       wq->name);
                return -ENOMEM;
        }

        rescuer->rescue_wq = wq;
        format_worker_id(id_buf, sizeof(id_buf), rescuer, NULL);

        rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", id_buf);
        if (IS_ERR(rescuer->task)) {
                ret = PTR_ERR(rescuer->task);
                /* terminate the line so the record is emitted on its own */
                pr_err("workqueue: Failed to create a rescuer kthread for wq \"%s\": %pe\n",
                       wq->name, ERR_PTR(ret));
                kfree(rescuer);
                return ret;
        }

        wq->rescuer = rescuer;

        /* unbound rescuers follow the wq's effective cpumask */
        if (wq->flags & WQ_UNBOUND)
                kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
        else
                kthread_bind_mask(rescuer->task, cpu_possible_mask);
        wake_up_process(rescuer->task);

        return 0;
}

/**
 * wq_adjust_max_active - update a wq's max_active to the current setting
 * @wq: target workqueue
 *
 * If @wq isn't freezing, set @wq->max_active and @wq->min_active to the
 * saved values and activate inactive work items accordingly. If @wq is
 * freezable and workqueues are freezing, clear both to zero.
 *
 * Must be called with @wq->mutex held.
 */
static void wq_adjust_max_active(struct workqueue_struct *wq)
{
        bool activated;
        int new_max, new_min;

        lockdep_assert_held(&wq->mutex);

        if ((wq->flags & WQ_FREEZABLE) && workqueue_freezing) {
                new_max = 0;
                new_min = 0;
        } else {
                new_max = wq->saved_max_active;
                new_min = wq->saved_min_active;
        }

        /* already at the target values, nothing to update */
        if (wq->max_active == new_max && wq->min_active == new_min)
                return;

        /*
         * Update @wq->max/min_active and then kick inactive work items if more
         * active work items are allowed. This doesn't break work item ordering
         * because new work items are always queued behind existing inactive
         * work items if there are any.
         */
        WRITE_ONCE(wq->max_active, new_max);
        WRITE_ONCE(wq->min_active, new_min);

        if (wq->flags & WQ_UNBOUND)
                wq_update_node_max_active(wq, -1);

        /* frozen: no work item may be activated */
        if (new_max == 0)
                return;

        /*
         * Round-robin through pwq's activating the first inactive work item
         * until max_active is filled.
         */
        do {
                struct pool_workqueue *pwq;

                activated = false;
                for_each_pwq(pwq, wq) {
                        unsigned long irq_flags;

                        /* can be called during early boot w/ irq disabled */
                        raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                        if (pwq_activate_first_inactive(pwq, true)) {
                                activated = true;
                                kick_pool(pwq->pool);
                        }
                        raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                }
        } while (activated); /* stop once a full pass activates nothing */
}

/*
 * Common backend of alloc_workqueue() and alloc_workqueue_lockdep_map().
 *
 * Allocates a workqueue_struct, formats its name from @fmt and @args,
 * settles max_active/min_active, allocates and links the pwqs, adds the
 * workqueue to the global list, sets up the rescuer through init_rescuer()
 * once workqueues are online and registers with sysfs for WQ_SYSFS.
 *
 * Returns the new workqueue, or NULL on failure.
 */
__printf(1, 0)
static struct workqueue_struct *__alloc_workqueue(const char *fmt,
                                                  unsigned int flags,
                                                  int max_active, va_list args)
{
        struct workqueue_struct *wq;
        size_t wq_size;
        int name_len;

        /* BH workqueues accept only a subset of flags and no max_active */
        if (flags & WQ_BH) {
                if (WARN_ON_ONCE(flags & ~__WQ_BH_ALLOWS))
                        return NULL;
                if (WARN_ON_ONCE(max_active))
                        return NULL;
        }

        /* see the comment above the definition of WQ_POWER_EFFICIENT */
        if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                flags |= WQ_UNBOUND;

        /* allocate wq and format name */
        if (flags & WQ_UNBOUND)
                /* one node_nr_active slot per node id plus the fallback */
                wq_size = struct_size(wq, node_nr_active, nr_node_ids + 1);
        else
                wq_size = sizeof(*wq);

        wq = kzalloc(wq_size, GFP_KERNEL);
        if (!wq)
                return NULL;

        if (flags & WQ_UNBOUND) {
                wq->unbound_attrs = alloc_workqueue_attrs();
                if (!wq->unbound_attrs)
                        goto err_free_wq;
        }

        name_len = vsnprintf(wq->name, sizeof(wq->name), fmt, args);

        if (name_len >= WQ_NAME_LEN)
                pr_warn_once("workqueue: name exceeds WQ_NAME_LEN. Truncating to: %s\n",
                             wq->name);

        if (flags & WQ_BH) {
                /*
                 * BH workqueues always share a single execution context per CPU
                 * and don't impose any max_active limit.
                 */
                max_active = INT_MAX;
        } else {
                /* zero selects the default */
                max_active = max_active ?: WQ_DFL_ACTIVE;
                max_active = wq_clamp_max_active(max_active, flags, wq->name);
        }

        /* init wq */
        wq->flags = flags;
        wq->max_active = max_active;
        wq->min_active = min(max_active, WQ_DFL_MIN_ACTIVE);
        wq->saved_max_active = wq->max_active;
        wq->saved_min_active = wq->min_active;
        mutex_init(&wq->mutex);
        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->pwqs);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        INIT_LIST_HEAD(&wq->maydays);

        INIT_LIST_HEAD(&wq->list);

        if (flags & WQ_UNBOUND) {
                if (alloc_node_nr_active(wq->node_nr_active) < 0)
                        goto err_free_wq;
        }

        /*
         * wq_pool_mutex protects the workqueues list, allocations of PWQs,
         * and the global freeze state.
         */
        apply_wqattrs_lock();

        if (alloc_and_link_pwqs(wq) < 0)
                goto err_unlock_free_node_nr_active;

        mutex_lock(&wq->mutex);
        wq_adjust_max_active(wq);
        mutex_unlock(&wq->mutex);

        list_add_tail_rcu(&wq->list, &workqueues);

        /* from here on failures go through destroy_workqueue() */
        if (wq_online && init_rescuer(wq) < 0)
                goto err_unlock_destroy;

        apply_wqattrs_unlock();

        if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                goto err_destroy;

        return wq;

err_unlock_free_node_nr_active:
        apply_wqattrs_unlock();
        /*
         * Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
         * flushing the pwq_release_worker ensures that the pwq_release_workfn()
         * completes before calling kfree(wq).
         */
        if (wq->flags & WQ_UNBOUND) {
                kthread_flush_worker(pwq_release_worker);
                free_node_nr_active(wq->node_nr_active);
        }
err_free_wq:
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
        return NULL;
err_unlock_destroy:
        apply_wqattrs_unlock();
err_destroy:
        destroy_workqueue(wq);
        return NULL;
}

/*
 * Allocate a workqueue named by the printf-style @fmt and trailing
 * arguments, then initialize its lockdep state.  Returns the new workqueue
 * or NULL if __alloc_workqueue() fails.
 */
__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...)
{
        struct workqueue_struct *wq;
        va_list ap;

        va_start(ap, max_active);
        wq = __alloc_workqueue(fmt, flags, max_active, ap);
        va_end(ap);

        /* lockdep setup only makes sense for a successfully created wq */
        if (wq)
                wq_init_lockdep(wq);

        return wq;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);

#ifdef CONFIG_LOCKDEP
/*
 * Variant of alloc_workqueue() which stores the caller supplied
 * @lockdep_map in the new workqueue instead of calling wq_init_lockdep().
 * Returns the new workqueue or NULL if __alloc_workqueue() fails.
 */
__printf(1, 5)
struct workqueue_struct *
alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags,
                            int max_active, struct lockdep_map *lockdep_map, ...)
{
        struct workqueue_struct *wq;
        va_list ap;

        va_start(ap, lockdep_map);
        wq = __alloc_workqueue(fmt, flags, max_active, ap);
        va_end(ap);

        if (wq)
                wq->lockdep_map = lockdep_map;

        return wq;
}
EXPORT_SYMBOL_GPL(alloc_workqueue_lockdep_map);
#endif

/*
 * Report whether @pwq still has anything going on: in-flight work items of
 * any color, extra references on a pwq other than the workqueue's default
 * one, or a non-empty state according to pwq_is_empty().
 */
static bool pwq_busy(struct pool_workqueue *pwq)
{
        bool is_dfl;
        int color;

        for (color = 0; color < WORK_NR_COLORS; color++) {
                if (pwq->nr_in_flight[color])
                        return true;
        }

        /* more than one reference is tolerated on the default pwq only */
        is_dfl = pwq == rcu_access_pointer(pwq->wq->dfl_pwq);
        if (!is_dfl && pwq->refcnt > 1)
                return true;

        return !pwq_is_empty(pwq);
}

/**
 * destroy_workqueue - safely terminate a workqueue
 * @wq: target workqueue
 *
 * Safely destroy a workqueue. All work currently pending will be done first.
 *
 * The workqueue is unregistered from sysfs, flagged %__WQ_DESTROYING,
 * drained and stripped of its rescuer. Every pwq is then sanity checked
 * with pwq_busy(); if one is still busy, a warning and the workqueue state
 * are printed and the function returns early without unlinking @wq from
 * the workqueue list or dropping the base pwq references.
 */
void destroy_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        int cpu;

        /*
         * Remove it from sysfs first so that sanity check failure doesn't
         * lead to sysfs name conflicts.
         */
        workqueue_sysfs_unregister(wq);

        /* mark the workqueue destruction is in progress */
        mutex_lock(&wq->mutex);
        wq->flags |= __WQ_DESTROYING;
        mutex_unlock(&wq->mutex);

        /* drain it before proceeding with destruction */
        drain_workqueue(wq);

        /* kill rescuer, if sanity checks fail, leave it w/o rescuer */
        if (wq->rescuer) {
                struct worker *rescuer = wq->rescuer;

                /* this prevents new queueing */
                raw_spin_lock_irq(&wq_mayday_lock);
                wq->rescuer = NULL;
                raw_spin_unlock_irq(&wq_mayday_lock);

                /* rescuer will empty maydays list before exiting */
                kthread_stop(rescuer->task);
                kfree(rescuer);
        }

        /*
         * Sanity checks - grab all the locks so that we wait for all
         * in-flight operations which may do put_pwq().
         */
        mutex_lock(&wq_pool_mutex);
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq) {
                raw_spin_lock_irq(&pwq->pool->lock);
                if (WARN_ON(pwq_busy(pwq))) {
                        pr_warn("%s: %s has the following busy pwq\n",
                                __func__, wq->name);
                        show_pwq(pwq);
                        /*
                         * Drop every lock taken above, in reverse order,
                         * before dumping the whole workqueue and bailing
                         * out; show_one_workqueue() takes the pool locks
                         * itself.
                         */
                        raw_spin_unlock_irq(&pwq->pool->lock);
                        mutex_unlock(&wq->mutex);
                        mutex_unlock(&wq_pool_mutex);
                        show_one_workqueue(wq);
                        return;
                }
                raw_spin_unlock_irq(&pwq->pool->lock);
        }
        mutex_unlock(&wq->mutex);

        /*
         * wq list is used to freeze wq, remove from list after
         * flushing is complete in case freeze races us.
         */
        list_del_rcu(&wq->list);
        mutex_unlock(&wq_pool_mutex);

        /*
         * We're the sole accessor of @wq. Directly access cpu_pwq and dfl_pwq
         * to put the base refs. @wq will be auto-destroyed from the last
         * pwq_put. RCU read lock prevents @wq from going away from under us.
         */
        rcu_read_lock();

        /* per-CPU slots first ... */
        for_each_possible_cpu(cpu) {
                put_pwq_unlocked(unbound_pwq(wq, cpu));
                RCU_INIT_POINTER(*unbound_pwq_slot(wq, cpu), NULL);
        }

        /* ... then the slot selected by cpu == -1 */
        put_pwq_unlocked(unbound_pwq(wq, -1));
        RCU_INIT_POINTER(*unbound_pwq_slot(wq, -1), NULL);

        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(destroy_workqueue);

/**
 * workqueue_set_max_active - adjust max_active of a workqueue
 * @wq: target workqueue
 * @max_active: new max_active value.
 *
 * Change the max_active setting of @wq to @max_active after clamping it
 * with wq_clamp_max_active(). See the alloc_workqueue() function comment.
 * BH and ordered workqueues are rejected with a warning. For an unbound
 * workqueue the saved min_active is lowered if it would otherwise exceed
 * the new max_active.
 *
 * CONTEXT:
 * Don't call from IRQ context.
 */
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
        int limit;

        /* BH workqueues have no use for max_active */
        if (WARN_ON(wq->flags & WQ_BH))
                return;
        /* max_active of an ordered workqueue must not be changed */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return;

        limit = wq_clamp_max_active(max_active, wq->flags, wq->name);

        mutex_lock(&wq->mutex);

        wq->saved_max_active = limit;
        /* keep min_active <= max_active on unbound workqueues */
        if (wq->flags & WQ_UNBOUND)
                wq->saved_min_active = min(wq->saved_min_active, limit);

        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(workqueue_set_max_active);

/**
 * workqueue_set_min_active - adjust min_active of an unbound workqueue
 * @wq: target unbound workqueue
 * @min_active: new min_active value
 *
 * Unlike other workqueue types, an unbound workqueue does not promise to
 * process max_active interdependent work items. What it does promise is to
 * process min_active interdependent work items, which defaults to
 * %WQ_DFL_MIN_ACTIVE.
 *
 * This function changes that min_active value. @min_active is clamped to
 * the range from 0 to the current max_active. Calling it on a BH, ordered
 * or non-unbound workqueue triggers a warning and changes nothing.
 */
void workqueue_set_min_active(struct workqueue_struct *wq, int min_active)
{
        const unsigned int type_mask = WQ_BH | WQ_UNBOUND | __WQ_ORDERED;

        /* of the three type bits, exactly WQ_UNBOUND must be set */
        if (WARN_ON((wq->flags & type_mask) != WQ_UNBOUND))
                return;

        mutex_lock(&wq->mutex);

        wq->saved_min_active = clamp(min_active, 0, wq->saved_max_active);
        wq_adjust_max_active(wq);

        mutex_unlock(&wq->mutex);
}

/**
 * current_work - retrieve %current task's work struct
 *
 * Find out whether %current is a workqueue worker and, if so, which work
 * item it is executing. Handy for code that needs to know the context it
 * is being run from.
 *
 * Return: work struct if %current task is a workqueue worker, %NULL otherwise.
 */
struct work_struct *current_work(void)
{
        struct worker *self = current_wq_worker();

        if (!self)
                return NULL;

        return self->current_work;
}
EXPORT_SYMBOL(current_work);

/**
 * current_is_workqueue_rescuer - is %current workqueue rescuer?
 *
 * Determine whether %current is a workqueue rescuer.  Can be used from
 * work functions to determine whether it's being run off the rescuer task.
 *
 * Return: %true if %current is a workqueue rescuer. %false otherwise.
 */
bool current_is_workqueue_rescuer(void)
{
        struct worker *worker = current_wq_worker();

        return worker && worker->rescue_wq;
}

/**
 * workqueue_congested - test whether a workqueue is congested
 * @cpu: CPU in question
 * @wq: target workqueue
 *
 * Check whether the pool_workqueue @wq uses for @cpu has work items sitting
 * on its inactive list. Nothing synchronizes this test against concurrent
 * changes, so the answer is unreliable and should only be used as an
 * advisory hint or for debugging.
 *
 * Passing WORK_CPU_UNBOUND as @cpu tests the local CPU.
 *
 * Apart from ordered workqueues, every workqueue has per-cpu
 * pool_workqueues and each of those has its own congested state. Being
 * congested on one CPU says nothing about the state on any other CPU.
 *
 * Return:
 * %true if congested, %false otherwise.
 */
bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
        struct pool_workqueue *target;
        bool congested;

        rcu_read_lock();
        preempt_disable();

        /* resolve "unbound" to the CPU we are currently running on */
        if (cpu == WORK_CPU_UNBOUND)
                cpu = smp_processor_id();

        target = *per_cpu_ptr(wq->cpu_pwq, cpu);
        congested = !list_empty(&target->inactive_works);

        preempt_enable();
        rcu_read_unlock();

        return congested;
}
EXPORT_SYMBOL_GPL(workqueue_congested);

/**
 * work_busy - test whether a work is currently pending or running
 * @work: the work to be tested
 *
 * Report whether @work is pending and/or being executed right now. The
 * test is not synchronized against anything, so the result is unreliable
 * and only good for advisory hints or debugging.
 *
 * Return:
 * OR'd bitmask of WORK_BUSY_* bits.
 */
unsigned int work_busy(struct work_struct *work)
{
        unsigned int busy = work_pending(work) ? WORK_BUSY_PENDING : 0;
        struct worker_pool *pool;
        unsigned long flags;

        rcu_read_lock();

        pool = get_work_pool(work);
        if (pool) {
                /* look for a worker of @pool which is executing @work */
                raw_spin_lock_irqsave(&pool->lock, flags);
                if (find_worker_executing_work(pool, work))
                        busy |= WORK_BUSY_RUNNING;
                raw_spin_unlock_irqrestore(&pool->lock, flags);
        }

        rcu_read_unlock();

        return busy;
}
EXPORT_SYMBOL_GPL(work_busy);

/**
 * set_worker_desc - set description for the current work item
 * @fmt: printf-style format string
 * @...: arguments for the format string
 *
 * A running work function may call this to describe what its work item is
 * doing. Should the worker task get dumped, the description is printed
 * along with it to aid debugging. The description is truncated to
 * WORKER_DESC_LEN bytes including the trailing '\0'. Nothing happens if
 * the caller is not a workqueue worker.
 */
void set_worker_desc(const char *fmt, ...)
{
        struct worker *self = current_wq_worker();
        va_list ap;

        if (!self)
                return;

        va_start(ap, fmt);
        vsnprintf(self->desc, sizeof(self->desc), fmt, ap);
        va_end(ap);
}
EXPORT_SYMBOL_GPL(set_worker_desc);

/**
 * print_worker_info - print out worker information and description
 * @log_lvl: the log level to use when printing
 * @task: target task
 *
 * When @task is a workqueue worker, print the name of the workqueue it is
 * serving, the work function and the description the running work item
 * set through set_worker_desc().
 *
 * Any task may be passed in as long as its task_struct is accessible. The
 * function is safe but unsynchronized, so the output may contain mixups or
 * garbage of limited length.
 */
void print_worker_info(const char *log_lvl, struct task_struct *task)
{
        struct workqueue_struct *wq = NULL;
        struct pool_workqueue *pwq = NULL;
        char wq_name[WQ_NAME_LEN] = { };
        char wrk_desc[WORKER_DESC_LEN] = { };
        work_func_t *cur_fn = NULL;
        struct worker *worker;

        if (!(task->flags & PF_WQ_WORKER))
                return;

        /*
         * Nothing synchronizes us against @task, which may be in any
         * state.  Every dereference below has to be fault tolerant.
         */
        worker = kthread_probe_data(task);

        /*
         * Fetch workfn, workqueue name and description with non-faulting
         * copies.  The last byte of each string buffer is never written so
         * that the strings stay terminated even if the source is garbage.
         */
        copy_from_kernel_nofault(&cur_fn, &worker->current_func, sizeof(cur_fn));
        copy_from_kernel_nofault(&pwq, &worker->current_pwq, sizeof(pwq));
        copy_from_kernel_nofault(&wq, &pwq->wq, sizeof(wq));
        copy_from_kernel_nofault(wq_name, wq->name, sizeof(wq_name) - 1);
        copy_from_kernel_nofault(wrk_desc, worker->desc, sizeof(wrk_desc) - 1);

        /* nothing could be retrieved, stay silent */
        if (!cur_fn && !wq_name[0] && !wrk_desc[0])
                return;

        printk("%sWorkqueue: %s %ps", log_lvl, wq_name, cur_fn);
        /* the description is only worth printing if it differs from the name */
        if (strcmp(wq_name, wrk_desc))
                pr_cont(" (%s)", wrk_desc);
        pr_cont("\n");
}

/*
 * Continue the current printk line with a summary of @pool: its cpumask,
 * its node unless that is NUMA_NO_NODE, its flags and either a BH marker
 * or its nice level.
 */
static void pr_cont_pool_info(struct worker_pool *pool)
{
        pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);

        if (pool->node != NUMA_NO_NODE)
                pr_cont(" node=%d", pool->node);

        pr_cont(" flags=0x%x", pool->flags);

        /* BH pools show "bh"/"bh-hi" in place of a nice value */
        if (pool->flags & POOL_BH) {
                pr_cont(" bh%s",
                        pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
                return;
        }

        pr_cont(" nice=%d", pool->attrs->nice);
}

static void pr_cont_worker_id(struct worker *worker)
{
        struct worker_pool *pool = worker->pool;

        if (pool->flags & WQ_BH)
                pr_cont("bh%s",
                        pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
        else
                pr_cont("%d%s", task_pid_nr(worker->task),
                        worker->rescue_wq ? "(RESCUER)" : "");
}

/*
 * Run-length state carried between pr_cont_work_flush() calls so that
 * consecutive work items with the same function are printed as a single
 * "N*func" entry.
 */
struct pr_cont_work_struct {
        bool comma;             /* print "," in front of the recorded entry */
        work_func_t func;       /* work function of the run being counted */
        long ctr;               /* length of the run, 0 if nothing is recorded */
};

/*
 * Feed one work function into the run-length state in @pcwsp.
 *
 * If @func matches the recorded function, only the counter is bumped.
 * Otherwise the recorded run, if any, is printed (as "func" or "N*func")
 * and @func/@comma become the new recorded run.  Passing a @func equal to
 * -1 just prints and clears whatever is recorded.
 */
static void pr_cont_work_flush(bool comma, work_func_t func, struct pr_cont_work_struct *pcwsp)
{
        if (pcwsp->ctr) {
                /* same function again: extend the current run */
                if (func == pcwsp->func) {
                        pcwsp->ctr++;
                        return;
                }

                /* different function: emit the run collected so far */
                if (pcwsp->ctr == 1)
                        pr_cont("%s %ps", pcwsp->comma ? "," : "", pcwsp->func);
                else
                        pr_cont("%s %ld*%ps", pcwsp->comma ? "," : "", pcwsp->ctr, pcwsp->func);
                pcwsp->ctr = 0;
        }

        /* -1 is the "flush only" marker, there is nothing to record */
        if ((long)func == -1L)
                return;

        pcwsp->comma = comma;
        pcwsp->func = func;
        pcwsp->ctr = 1;
}

/*
 * Print one work item as part of a work list dump.  Barrier work items are
 * printed right away as "BAR(pid)" after flushing the pending run; all
 * other items go through the run-length logic of pr_cont_work_flush().
 */
static void pr_cont_work(bool comma, struct work_struct *work, struct pr_cont_work_struct *pcwsp)
{
        struct wq_barrier *barrier;

        if (work->func != wq_barrier_func) {
                /* flush the pending run before an entry without a comma */
                if (!comma)
                        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
                pr_cont_work_flush(comma, work->func, pcwsp);
                return;
        }

        barrier = container_of(work, struct wq_barrier, work);

        pr_cont_work_flush(comma, (work_func_t)-1, pcwsp);
        pr_cont("%s BAR(%d)", comma ? "," : "",
                task_pid_nr(barrier->task));
}

/*
 * Dump the state of @pwq: a header line with pool info, active count and
 * refcnt, followed by up to three optional lines listing the in-flight,
 * pending and inactive work items which belong to @pwq.
 *
 * Both callers visible in this file invoke this with pwq->pool->lock held.
 */
static void show_pwq(struct pool_workqueue *pwq)
{
        /* run-length state shared by all pr_cont_work*() calls below */
        struct pr_cont_work_struct pcws = { .ctr = 0, };
        struct worker_pool *pool = pwq->pool;
        struct work_struct *work;
        struct worker *worker;
        bool has_in_flight = false, has_pending = false;
        int bkt;

        pr_info("  pwq %d:", pool->id);
        pr_cont_pool_info(pool);

        pr_cont(" active=%d refcnt=%d%s\n",
                pwq->nr_active, pwq->refcnt,
                !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");

        /* is any busy worker of the pool currently serving this pwq? */
        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (worker->current_pwq == pwq) {
                        has_in_flight = true;
                        break;
                }
        }
        if (has_in_flight) {
                bool comma = false;

                pr_info("    in-flight:");
                hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                        if (worker->current_pwq != pwq)
                                continue;

                        /* worker id, its current function, then its scheduled list */
                        pr_cont(" %s", comma ? "," : "");
                        pr_cont_worker_id(worker);
                        pr_cont(":%ps", worker->current_func);
                        list_for_each_entry(work, &worker->scheduled, entry)
                                pr_cont_work(false, work, &pcws);
                        /* print whatever run is still recorded for this worker */
                        pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                        comma = true;
                }
                pr_cont("\n");
        }

        /* the pool worklist is shared, only show entries which map to this pwq */
        list_for_each_entry(work, &pool->worklist, entry) {
                if (get_work_pwq(work) == pwq) {
                        has_pending = true;
                        break;
                }
        }
        if (has_pending) {
                bool comma = false;

                pr_info("    pending:");
                list_for_each_entry(work, &pool->worklist, entry) {
                        if (get_work_pwq(work) != pwq)
                                continue;

                        pr_cont_work(comma, work, &pcws);
                        /* no comma after a work item flagged WORK_STRUCT_LINKED */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }

        if (!list_empty(&pwq->inactive_works)) {
                bool comma = false;

                pr_info("    inactive:");
                list_for_each_entry(work, &pwq->inactive_works, entry) {
                        pr_cont_work(comma, work, &pcws);
                        /* same comma rule as for the pending list */
                        comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
                }
                pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
                pr_cont("\n");
        }
}

/**
 * show_one_workqueue - dump state of specified workqueue
 * @wq: workqueue whose state will be printed
 */
void show_one_workqueue(struct workqueue_struct *wq)
{
        struct pool_workqueue *pwq;
        bool idle = true;
        unsigned long irq_flags;

        for_each_pwq(pwq, wq) {
                if (!pwq_is_empty(pwq)) {
                        idle = false;
                        break;
                }
        }
        if (idle) /* Nothing to print for idle workqueue */
                return;

        pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);

        for_each_pwq(pwq, wq) {
                raw_spin_lock_irqsave(&pwq->pool->lock, irq_flags);
                if (!pwq_is_empty(pwq)) {
                        /*
                         * Defer printing to avoid deadlocks in console
                         * drivers that queue work while holding locks
                         * also taken in their write paths.
                         */
                        printk_deferred_enter();
                        show_pwq(pwq);
                        printk_deferred_exit();
                }
                raw_spin_unlock_irqrestore(&pwq->pool->lock, irq_flags);
                /*
                 * We could be printing a lot from atomic context, e.g.
                 * sysrq-t -> show_all_workqueues(). Avoid triggering
                 * hard lockup.
                 */
                touch_nmi_watchdog();
        }

}

/**
 * show_one_worker_pool - dump state of specified worker pool
 * @pool: worker pool whose state will be printed
 *
 * Pools whose workers are all idle are skipped. For any other pool a
 * single line with pool info, hang time, worker count, manager and idle
 * workers is printed.
 */
static void show_one_worker_pool(struct worker_pool *pool)
{
        unsigned long hung_secs = 0;
        unsigned long flags;
        struct worker *idler;
        bool first_idle = true;

        raw_spin_lock_irqsave(&pool->lock, flags);

        if (pool->nr_workers != pool->nr_idle) {
                /* How long the first pending work is waiting for a worker. */
                if (!list_empty(&pool->worklist))
                        hung_secs = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;

                /*
                 * Console drivers may queue work while holding locks which
                 * their write paths take as well.  Deferring the output
                 * avoids that deadlock.
                 */
                printk_deferred_enter();
                pr_info("pool %d:", pool->id);
                pr_cont_pool_info(pool);
                pr_cont(" hung=%lus workers=%d", hung_secs, pool->nr_workers);
                if (pool->manager)
                        pr_cont(" manager: %d",
                                task_pid_nr(pool->manager->task));
                list_for_each_entry(idler, &pool->idle_list, entry) {
                        pr_cont(" %s", first_idle ? "idle: " : "");
                        pr_cont_worker_id(idler);
                        first_idle = false;
                }
                pr_cont("\n");
                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, flags);

        /*
         * This can be a lot of output from atomic context, e.g.
         * sysrq-t -> show_all_workqueues().  Keep the hard lockup
         * detector quiet.
         */
        touch_nmi_watchdog();
}

/**
 * show_all_workqueues - dump workqueue state
 *
 * Called from a sysrq handler and prints out all busy workqueues and pools.
 */
void show_all_workqueues(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pi;

        rcu_read_lock();

        pr_info("Showing busy workqueues and worker pools:\n");

        list_for_each_entry_rcu(wq, &workqueues, list)
                show_one_workqueue(wq);

        for_each_pool(pool, pi)
                show_one_worker_pool(pool);

        rcu_read_unlock();
}

/**
 * show_freezable_workqueues - dump freezable workqueue state
 *
 * Invoked from try_to_freeze_tasks(). Prints every %WQ_FREEZABLE workqueue
 * which is still busy.
 */
void show_freezable_workqueues(void)
{
        struct workqueue_struct *cur_wq;

        rcu_read_lock();

        pr_info("Showing freezable workqueues that are still busy:\n");

        list_for_each_entry_rcu(cur_wq, &workqueues, list) {
                if (cur_wq->flags & WQ_FREEZABLE)
                        show_one_workqueue(cur_wq);
        }

        rcu_read_unlock();
}

/*
 * Fill @buf (of @size bytes) with the name to show for @task through
 * /proc/PID/{comm,stat,status}.  Workqueue workers get their worker id
 * followed by the description of their latest work item; any other task
 * simply gets a copy of task->comm.
 */
void wq_worker_comm(char *buf, size_t size, struct task_struct *task)
{
        /* stabilize PF_WQ_WORKER and worker pool association */
        mutex_lock(&wq_pool_attach_mutex);

        if (!(task->flags & PF_WQ_WORKER)) {
                strscpy(buf, task->comm, size);
        } else {
                struct worker *worker = kthread_data(task);
                struct worker_pool *pool = worker->pool;
                int pos = format_worker_id(buf, size, worker, pool);

                if (pool) {
                        raw_spin_lock_irq(&pool->lock);
                        /*
                         * ->desc holds the workqueue name or whatever
                         * set_worker_desc() stored during the latest
                         * execution.  It is prefixed with '+' while that
                         * work item is still current and with '-' once it
                         * isn't.
                         */
                        if (worker->desc[0] != '\0') {
                                if (worker->current_work)
                                        scnprintf(buf + pos, size - pos, "+%s",
                                                  worker->desc);
                                else
                                        scnprintf(buf + pos, size - pos, "-%s",
                                                  worker->desc);
                        }
                        raw_spin_unlock_irq(&pool->lock);
                }
        }

        mutex_unlock(&wq_pool_attach_mutex);
}

#ifdef CONFIG_SMP

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve a mix of short, long and very long running works,
 * making blocked draining impractical.
 *
 * This is solved by allowing a pool to be disassociated from its CPU and
 * run as an unbound one, and by allowing it to be reattached later if the
 * CPU comes back online.
 */

/**
 * unbind_workers - disassociate the worker pools of a CPU
 * @cpu: CPU whose worker pools are being unbound
 *
 * For each worker pool of @cpu, under wq_pool_attach_mutex: flag every
 * worker %WORKER_UNBOUND, set %POOL_DISASSOCIATED, zero nr_running and
 * kick the pool while holding pool->lock, then call unbind_worker() on
 * every worker after pool->lock has been released.
 */
static void unbind_workers(int cpu)
{
        struct worker_pool *pool;
        struct worker *worker;

        for_each_cpu_worker_pool(pool, cpu) {
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * We've blocked all attach/detach operations. Make all workers
                 * unbound and set DISASSOCIATED.  Before this, all workers
                 * must be on the cpu.  After this, they may become diasporas.
                 * And the preemption disabled section in their sched callbacks
                 * are guaranteed to see WORKER_UNBOUND since the code here
                 * is on the same cpu.
                 */
                for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;

                pool->flags |= POOL_DISASSOCIATED;

                /*
                 * The handling of nr_running in sched callbacks are disabled
                 * now.  Zap nr_running.  After this, nr_running stays zero and
                 * need_more_worker() and keep_working() are always true as
                 * long as the worklist is not empty.  This pool now behaves as
                 * an unbound (in terms of concurrency management) pool which
                 * are served by workers tied to the pool.
                 */
                pool->nr_running = 0;

                /*
                 * With concurrency management just turned off, a busy
                 * worker blocking could lead to lengthy stalls.  Kick off
                 * unbound chain execution of currently pending work items.
                 */
                kick_pool(pool);

                raw_spin_unlock_irq(&pool->lock);

                /* done outside pool->lock, still under wq_pool_attach_mutex */
                for_each_pool_worker(worker, pool)
                        unbind_worker(worker);

                mutex_unlock(&wq_pool_attach_mutex);
        }
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * Called when @pool->cpu is coming online to tie all workers of @pool back
 * to that CPU. The caller must hold wq_pool_attach_mutex.
 */
static void rebind_workers(struct worker_pool *pool)
{
        struct worker *w;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * CPU affinity is restored for every worker before UNBOUND gets
         * cleared: all idle workers should sit on the run-queue of the
         * associated CPU before any local wake-up for concurrency
         * management can happen.  We're called from CPU_ONLINE, so
         * setting the affinity isn't expected to fail.
         */
        for_each_pool_worker(w, pool) {
                int err;

                kthread_set_per_cpu(w->task, pool->cpu);
                err = set_cpus_allowed_ptr(w->task, pool_allowed_cpus(pool));
                WARN_ON_ONCE(err < 0);
        }

        raw_spin_lock_irq(&pool->lock);

        pool->flags &= ~POOL_DISASSOCIATED;

        for_each_pool_worker(w, pool) {
                unsigned int updated = w->flags;

                /*
                 * UNBOUND has to go, but neither worker_clr_flags() nor a
                 * direct nr_running adjustment can be used from here.
                 * Instead UNBOUND is swapped for REBOUND, another
                 * NOT_RUNNING flag, in one store.  @w drops REBOUND via
                 * worker_clr_flags() at the start of its next execution
                 * cycle, which brings concurrency management back.  When,
                 * or even whether, that happens has no bearing on
                 * correctness.
                 *
                 * The store must be a WRITE_ONCE() since
                 * wq_worker_running() may test @w->flags locklessly.  A
                 * torn update could make its NOT_RUNNING test fail and
                 * start concurrency management operations prematurely.
                 */
                WARN_ON_ONCE(!(updated & WORKER_UNBOUND));
                updated = (updated | WORKER_REBOUND) & ~WORKER_UNBOUND;
                WRITE_ONCE(w->flags, updated);
        }

        raw_spin_unlock_irq(&pool->lock);
}

/**
 * restore_unbound_workers_cpumask - restore cpumask of unbound workers
 * @pool: unbound pool of interest
 * @cpu: the CPU which is coming up
 *
 * An unbound pool may end up with a cpumask which doesn't have any online
 * CPUs.  When a worker of such pool get scheduled, the scheduler resets
 * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
 * online CPU before, cpus_allowed of all its workers should be restored.
 */
static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
{
        static cpumask_t cpumask;
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /* is @cpu allowed for @pool? */
        if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
                return;

        cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);

        /* as we're called from CPU_ONLINE, the following shouldn't fail */
        for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
}

/*
 * Make sure every worker pool of @cpu has at least one worker.  Returns 0
 * on success and -ENOMEM if a worker couldn't be created.
 */
int workqueue_prepare_cpu(unsigned int cpu)
{
        struct worker_pool *pool;

        for_each_cpu_worker_pool(pool, cpu) {
                /* only pools without any worker need one created */
                if (!pool->nr_workers && !create_worker(pool))
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Hotplug callback for @cpu coming online.  Records @cpu in
 * wq_online_cpumask, rebinds or restores the workers of the affected
 * non-BH pools and refreshes the pwqs and node max_active of workqueues
 * which have unbound attrs.  Always returns 0.
 */
int workqueue_online_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int pool_id;

        mutex_lock(&wq_pool_mutex);

        cpumask_set_cpu(cpu, wq_online_cpumask);

        for_each_pool(pool, pool_id) {
                /* hotplug has no effect on BH pools */
                if (pool->flags & POOL_BH)
                        continue;

                mutex_lock(&wq_pool_attach_mutex);
                if (pool->cpu == cpu)
                        rebind_workers(pool);
                else if (pool->cpu < 0)
                        restore_unbound_workers_cpumask(pool, cpu);
                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                struct workqueue_attrs *attrs = wq->unbound_attrs;
                const struct wq_pod_type *pt;
                int pod_cpu;

                if (!attrs)
                        continue;

                /* refresh the pwq of every CPU sharing a pod with @cpu */
                pt = wqattrs_pod_type(attrs);
                for_each_cpu(pod_cpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        unbound_wq_update_pwq(wq, pod_cpu);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);

        return 0;
}

/*
 * Hotplug callback for @cpu going offline.  Has to run on @cpu itself;
 * warns and returns -1 otherwise.  Unbinds the workers of @cpu, removes
 * @cpu from wq_online_cpumask and refreshes the pwqs and node max_active
 * of workqueues which have unbound attrs.  Returns 0 on success.
 */
int workqueue_offline_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;

        /* unbinding per-cpu workers should happen on the local CPU */
        if (WARN_ON(cpu != smp_processor_id()))
                return -1;

        unbind_workers(cpu);

        mutex_lock(&wq_pool_mutex);

        cpumask_clear_cpu(cpu, wq_online_cpumask);

        /* update pod affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list) {
                struct workqueue_attrs *attrs = wq->unbound_attrs;
                const struct wq_pod_type *pt;
                int pod_cpu;

                if (!attrs)
                        continue;

                /* refresh the pwq of every CPU sharing a pod with @cpu */
                pt = wqattrs_pod_type(attrs);
                for_each_cpu(pod_cpu, pt->pod_cpus[pt->cpu_pod[cpu]])
                        unbound_wq_update_pwq(wq, pod_cpu);

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, cpu);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);

        return 0;
}

/*
 * Request block used by work_on_cpu_key(): bundles the work item with the
 * function to call, its argument and a slot for the return value.
 */
struct work_for_cpu {
        struct work_struct work;        /* work item executing work_for_cpu_fn() */
        long (*fn)(void *);             /* function to run */
        void *arg;                      /* argument handed to @fn */
        long ret;                       /* return value of @fn */
};

/* Work function: run the requested function and store its return value. */
static void work_for_cpu_fn(struct work_struct *work)
{
        struct work_for_cpu *req;

        req = container_of(work, struct work_for_cpu, work);
        req->ret = req->fn(req->arg);
}

/**
 * work_on_cpu_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn: the function to run
 * @arg: the function arg
 * @key: The lock class key for lock debugging purposes
 *
 * It is up to the caller to ensure that the cpu doesn't go offline.
 * The caller must not hold any locks which would prevent @fn from completing.
 *
 * Return: The value @fn returns.
 */
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
        struct work_for_cpu wfc = { .fn = fn, .arg = arg };

        INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
        schedule_work_on(cpu, &wfc.work);
        flush_work(&wfc.work);
        destroy_work_on_stack(&wfc.work);
        return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_key);

/**
 * work_on_cpu_safe_key - run a function in thread context on a particular cpu
 * @cpu: the cpu to run on
 * @fn:  the function to run
 * @arg: the function argument
 * @key: The lock class key for lock debugging purposes
 *
 * Holds the CPU hotplug read lock around work_on_cpu_key(). @fn is only
 * run if @cpu is online. The caller must not hold any locks which would
 * prevent @fn from completing.
 *
 * Return: The value @fn returns, or -ENODEV if @cpu is not online.
 */
long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
                          void *arg, struct lock_class_key *key)
{
        long ret;

        cpus_read_lock();
        ret = cpu_online(cpu) ? work_on_cpu_key(cpu, fn, arg, key) : -ENODEV;
        cpus_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu_safe_key);
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER

/**
 * freeze_workqueues_begin - begin freezing workqueues
 *
 * Begin freezing workqueues. Once this function has returned, new works
 * queued on a freezable workqueue go to its inactive_works list rather
 * than to pool->worklist.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void freeze_workqueues_begin(void)
{
        struct workqueue_struct *cur_wq;

        mutex_lock(&wq_pool_mutex);

        /* freezing must not already be in progress */
        WARN_ON_ONCE(workqueue_freezing);
        workqueue_freezing = true;

        /* re-run wq_adjust_max_active() on every wq with freezing set */
        list_for_each_entry(cur_wq, &workqueues, list) {
                mutex_lock(&cur_wq->mutex);
                wq_adjust_max_active(cur_wq);
                mutex_unlock(&cur_wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/**
 * freeze_workqueues_busy - check whether freezable workqueues are still busy
 *
 * Reports whether freezing has completed.  Must only be called between
 * freeze_workqueues_begin() and thaw_workqueues().
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex.
 *
 * Return:
 * %true if at least one freezable workqueue still has active work items,
 * %false once freezing is complete.
 */
bool freeze_workqueues_busy(void)
{
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq;
        bool busy = false;

        mutex_lock(&wq_pool_mutex);

        WARN_ON_ONCE(!workqueue_freezing);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_FREEZABLE))
                        continue;

                /*
                 * nr_active only ever goes down while freezing, so it can
                 * be sampled without taking the pool lock.
                 */
                rcu_read_lock();
                for_each_pwq(pwq, wq) {
                        WARN_ON_ONCE(pwq->nr_active < 0);
                        if (pwq->nr_active) {
                                busy = true;
                                break;
                        }
                }
                rcu_read_unlock();

                /* one busy pwq is enough, no need to look any further */
                if (busy)
                        break;
        }

        mutex_unlock(&wq_pool_mutex);
        return busy;
}

/**
 * thaw_workqueues - undo freeze_workqueues_begin()
 *
 * Restores normal queueing; all collected frozen works are transferred to
 * their respective pool worklists.  Does nothing if workqueues are not
 * currently being frozen.
 *
 * CONTEXT:
 * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
 */
void thaw_workqueues(void)
{
        struct workqueue_struct *cur;

        mutex_lock(&wq_pool_mutex);

        if (workqueue_freezing) {
                workqueue_freezing = false;

                /* restore max_active and repopulate worklist */
                list_for_each_entry(cur, &workqueues, list) {
                        mutex_lock(&cur->mutex);
                        wq_adjust_max_active(cur);
                        mutex_unlock(&cur->mutex);
                }
        }

        mutex_unlock(&wq_pool_mutex);
}
#endif /* CONFIG_FREEZER */

static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
{
        LIST_HEAD(ctxs);
        int ret = 0;
        struct workqueue_struct *wq;
        struct apply_wqattrs_ctx *ctx, *n;

        lockdep_assert_held(&wq_pool_mutex);

        list_for_each_entry(wq, &workqueues, list) {
                if (!(wq->flags & WQ_UNBOUND) || (wq->flags & __WQ_DESTROYING))
                        continue;

                ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs, unbound_cpumask);
                if (IS_ERR(ctx)) {
                        ret = PTR_ERR(ctx);
                        break;
                }

                list_add_tail(&ctx->list, &ctxs);
        }

        list_for_each_entry_safe(ctx, n, &ctxs, list) {
                if (!ret)
                        apply_wqattrs_commit(ctx);
                apply_wqattrs_cleanup(ctx);
        }

        if (!ret) {
                mutex_lock(&wq_pool_attach_mutex);
                cpumask_copy(wq_unbound_cpumask, unbound_cpumask);
                mutex_unlock(&wq_pool_attach_mutex);
        }
        return ret;
}

/**
 * workqueue_unbound_exclude_cpumask - Exclude given CPUs from unbound cpumask
 * @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
 *
 * Intended for cpuset code, which can use it to hand over a set of isolated
 * CPUs that must be kept out of wq_unbound_cpumask.
 *
 * Return: 0 on success, -ENOMEM if the scratch cpumask cannot be allocated,
 * or the error returned while applying the new unbound cpumask.
 */
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
        cpumask_var_t effective;
        int ret = 0;

        if (!zalloc_cpumask_var(&effective, GFP_KERNEL))
                return -ENOMEM;

        mutex_lock(&wq_pool_mutex);

        /*
         * If removing the excluded CPUs leaves nothing, fall back to
         * wq_requested_unbound_cpumask, which is initially set to the
         * intersection of the HK_TYPE_WQ and HK_TYPE_DOMAIN housekeeping
         * masks and is rewritten by any subsequent write to the
         * workqueue/cpumask sysfs file.
         */
        if (!cpumask_andnot(effective, wq_requested_unbound_cpumask, exclude_cpumask))
                cpumask_copy(effective, wq_requested_unbound_cpumask);

        if (!cpumask_equal(effective, wq_unbound_cpumask))
                ret = workqueue_apply_unbound_cpumask(effective);

        /* Remember the isolated cpumask so that sysfs can export it */
        if (ret == 0)
                cpumask_copy(wq_isolated_cpumask, exclude_cpumask);

        mutex_unlock(&wq_pool_mutex);
        free_cpumask_var(effective);

        return ret;
}

/*
 * Translate an affinity scope name into its index in wq_affn_names[].
 *
 * The comparison is case-insensitive and only covers the length of each
 * table entry, so trailing characters in @val (e.g. a newline) are ignored.
 * The first matching table entry wins.
 *
 * Return: the matching index, or -EINVAL if no entry matches.
 */
static int parse_affn_scope(const char *val)
{
        int scope = 0;

        while (scope < ARRAY_SIZE(wq_affn_names)) {
                const char *name = wq_affn_names[scope];

                if (strncasecmp(val, name, strlen(name)) == 0)
                        return scope;
                scope++;
        }

        return -EINVAL;
}

/*
 * kernel_param "set" handler for default_affinity_scope.
 *
 * Parses @val as an affinity scope, rejects unknown names as well as
 * WQ_AFFN_DFL itself, stores the result in wq_affn_dfl and then updates the
 * pwq of every workqueue for every online CPU.
 *
 * Return: 0 on success, -EINVAL for an unknown or disallowed scope.
 */
static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
{
        struct workqueue_struct *wq;
        int scope = parse_affn_scope(val);
        int cpu;

        if (scope < 0)
                return scope;
        /* the default cannot itself be "default" */
        if (scope == WQ_AFFN_DFL)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&wq_pool_mutex);

        wq_affn_dfl = scope;

        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu) {
                        unbound_wq_update_pwq(wq, cpu);
                }
        }

        mutex_unlock(&wq_pool_mutex);
        cpus_read_unlock();

        return 0;
}

/*
 * kernel_param "get" handler for default_affinity_scope: prints the name of
 * the current default affinity scope followed by a newline into @buffer.
 */
static int wq_affn_dfl_get(char *buffer, const struct kernel_param *kp)
{
        const char *name = wq_affn_names[wq_affn_dfl];

        return scnprintf(buffer, PAGE_SIZE, "%s\n", name);
}

/* Accessors backing the "default_affinity_scope" parameter registered below. */
static const struct kernel_param_ops wq_affn_dfl_ops = {
        .set        = wq_affn_dfl_set,
        .get        = wq_affn_dfl_get,
};

/* No backing variable is passed (NULL); the handlers use wq_affn_dfl directly. */
module_param_cb(default_affinity_scope, &wq_affn_dfl_ops, NULL, 0644);

#ifdef CONFIG_SYSFS
/*
 * Workqueues with WQ_SYSFS flag set are visible to userland via
 * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
 * following attributes.
 *
 *  per_cpu                RO bool        : whether the workqueue is per-cpu or unbound
 *  max_active                RW int        : maximum number of in-flight work items
 *
 * Unbound workqueues have the following extra attributes.
 *
 *  nice                RW int        : nice value of the workers
 *  cpumask                RW mask        : bitmask of allowed CPUs for the workers
 *  affinity_scope        RW str  : worker CPU affinity scope (cache, numa, none)
 *  affinity_strict        RW bool : worker CPU affinity is strict
 */
/* Glue object tying a workqueue to the struct device that represents it in sysfs. */
struct wq_device {
        /* the workqueue this device stands for, see dev_to_wq() */
        struct workqueue_struct                *wq;
        /* embedded device; container_of() on it recovers the wq_device */
        struct device                        dev;
};

/* Map a device embedded in a struct wq_device back to its workqueue. */
static struct workqueue_struct *dev_to_wq(struct device *dev)
{
        return container_of(dev, struct wq_device, dev)->wq;
}

static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
}
static DEVICE_ATTR_RO(per_cpu);

static ssize_t max_active_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
}

/*
 * sysfs "max_active" write side: accepts a positive decimal integer and
 * passes it to workqueue_set_max_active().
 *
 * Return: @count on success, -EINVAL if @buf does not hold a positive integer.
 */
static ssize_t max_active_store(struct device *dev,
                                struct device_attribute *attr, const char *buf,
                                size_t count)
{
        int max_active;

        if (sscanf(buf, "%d", &max_active) != 1)
                return -EINVAL;
        if (max_active <= 0)
                return -EINVAL;

        workqueue_set_max_active(dev_to_wq(dev), max_active);
        return count;
}
static DEVICE_ATTR_RW(max_active);

/*
 * Attributes used as the default device groups of the workqueue bus
 * (see wq_subsys.dev_groups); NULL-terminated.
 */
static struct attribute *wq_sysfs_attrs[] = {
        &dev_attr_per_cpu.attr,
        &dev_attr_max_active.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs);

/* sysfs "nice" read side: prints the nice value from the wq's unbound attrs. */
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int nice;

        /* sample the value under wq->mutex, format it afterwards */
        mutex_lock(&wq->mutex);
        nice = wq->unbound_attrs->nice;
        mutex_unlock(&wq->mutex);

        return scnprintf(buf, PAGE_SIZE, "%d\n", nice);
}

/*
 * Allocate a workqueue_attrs and seed it with a copy of @wq's current
 * unbound attrs, for a sysfs store handler to modify and apply.  The caller
 * must hold wq_pool_mutex.  Returns NULL if the allocation fails.
 */
static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
{
        struct workqueue_attrs *copy;

        lockdep_assert_held(&wq_pool_mutex);

        copy = alloc_workqueue_attrs();
        if (copy)
                copy_workqueue_attrs(copy, wq->unbound_attrs);

        return copy;
}

/*
 * sysfs "nice" write side: parses a decimal nice value within
 * [MIN_NICE, MAX_NICE] and applies it to the workqueue.
 *
 * Return: @count on success, -ENOMEM if the attrs copy cannot be allocated,
 * -EINVAL for malformed or out-of-range input, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs) {
                ret = -ENOMEM;
        } else if (sscanf(buf, "%d", &attrs->nice) != 1 ||
                   attrs->nice < MIN_NICE || attrs->nice > MAX_NICE) {
                ret = -EINVAL;
        } else {
                ret = apply_workqueue_attrs_locked(wq, attrs);
        }

        apply_wqattrs_unlock();
        /* also reached with attrs == NULL, exactly as the -ENOMEM path requires */
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/* sysfs "cpumask" read side: prints the wq's unbound attrs cpumask as a bitmap. */
static ssize_t wq_cpumask_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int len;

        /* format while holding wq->mutex */
        mutex_lock(&wq->mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n",
                        cpumask_pr_args(wq->unbound_attrs->cpumask));
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "cpumask" write side: parses @buf into a copy of the wq's attrs and
 * applies the result.
 *
 * Return: @count on success, -ENOMEM if the attrs copy cannot be allocated,
 * or the error from cpumask_parse() / apply_workqueue_attrs_locked().
 */
static ssize_t wq_cpumask_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs) {
                ret = -ENOMEM;
        } else {
                ret = cpumask_parse(buf, attrs->cpumask);
                if (ret == 0)
                        ret = apply_workqueue_attrs_locked(wq, attrs);
        }

        apply_wqattrs_unlock();
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/*
 * sysfs "affinity_scope" read side.  Prints the scope name; when the wq uses
 * WQ_AFFN_DFL the name of the current default scope is appended in
 * parentheses.
 */
static ssize_t wq_affn_scope_show(struct device *dev,
                                  struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        int scope, len;

        mutex_lock(&wq->mutex);
        scope = wq->unbound_attrs->affn_scope;
        if (scope != WQ_AFFN_DFL)
                len = scnprintf(buf, PAGE_SIZE, "%s\n", wq_affn_names[scope]);
        else
                len = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
                                wq_affn_names[WQ_AFFN_DFL],
                                wq_affn_names[wq_affn_dfl]);
        mutex_unlock(&wq->mutex);

        return len;
}

/*
 * sysfs "affinity_scope" write side: parses the scope name in @buf and
 * applies it to the workqueue.
 *
 * Return: @count on success, -EINVAL for an unknown scope name, -ENOMEM if
 * the attrs copy cannot be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affn_scope_store(struct device *dev,
                                   struct device_attribute *attr,
                                   const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret = -ENOMEM;
        int scope;

        scope = parse_affn_scope(buf);
        if (scope < 0)
                return scope;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs)
                goto out_unlock;

        attrs->affn_scope = scope;
        ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

static ssize_t wq_affinity_strict_show(struct device *dev,
                                       struct device_attribute *attr, char *buf)
{
        struct workqueue_struct *wq = dev_to_wq(dev);

        return scnprintf(buf, PAGE_SIZE, "%d\n",
                         wq->unbound_attrs->affn_strict);
}

/*
 * sysfs "affinity_strict" write side: parses a decimal integer, treats any
 * non-zero value as true and applies the flag to the workqueue.
 *
 * Return: @count on success, -EINVAL if @buf holds no integer, -ENOMEM if
 * the attrs copy cannot be allocated, or the error from
 * apply_workqueue_attrs_locked().
 */
static ssize_t wq_affinity_strict_store(struct device *dev,
                                        struct device_attribute *attr,
                                        const char *buf, size_t count)
{
        struct workqueue_struct *wq = dev_to_wq(dev);
        struct workqueue_attrs *attrs;
        int ret = -ENOMEM;
        int val;

        if (sscanf(buf, "%d", &val) != 1)
                return -EINVAL;

        apply_wqattrs_lock();

        attrs = wq_sysfs_prep_attrs(wq);
        if (!attrs)
                goto out_unlock;

        attrs->affn_strict = !!val;
        ret = apply_workqueue_attrs_locked(wq, attrs);

out_unlock:
        apply_wqattrs_unlock();
        free_workqueue_attrs(attrs);
        return ret ?: count;
}

/*
 * Extra attributes for unbound workqueues.  workqueue_sysfs_register()
 * creates one file per entry for WQ_UNBOUND workqueues, walking the array
 * until it reaches the __ATTR_NULL terminator.
 */
static struct device_attribute wq_sysfs_unbound_attrs[] = {
        __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
        __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
        __ATTR(affinity_scope, 0644, wq_affn_scope_show, wq_affn_scope_store),
        __ATTR(affinity_strict, 0644, wq_affinity_strict_show, wq_affinity_strict_store),
        __ATTR_NULL,
};

/*
 * The "workqueue" bus.  Every wq_device is attached to it in
 * workqueue_sysfs_register(); its devices get the wq_sysfs attribute groups.
 */
static const struct bus_type wq_subsys = {
        .name                                = "workqueue",
        .dev_groups                        = wq_sysfs_groups,
};

/**
 *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
 *  @cpumask: the cpumask to set
 *
 *  The low-level workqueues cpumask is a global cpumask that limits
 *  the affinity of all unbound workqueues.  This function restricts @cpumask
 *  to the possible CPUs (modifying it in place), applies it to all unbound
 *  workqueues and updates all of their pwqs.  On success @cpumask is also
 *  recorded as the requested unbound cpumask.
 *
 *  Return:        0        - Success
 *                -EINVAL        - Invalid @cpumask
 *                -ENOMEM        - Failed to allocate memory for attrs or pwqs.
 */
static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
{
        int ret = 0;

        /*
         * Isolated cpus are deliberately not filtered out here.
         * If the user wishes to include them, we allow that.
         */
        cpumask_and(cpumask, cpumask, cpu_possible_mask);
        if (cpumask_empty(cpumask))
                return -EINVAL;

        apply_wqattrs_lock();

        if (!cpumask_equal(cpumask, wq_unbound_cpumask))
                ret = workqueue_apply_unbound_cpumask(cpumask);
        if (ret == 0)
                cpumask_copy(wq_requested_unbound_cpumask, cpumask);

        apply_wqattrs_unlock();

        return ret;
}

/*
 * Common helper of the bus-level cpumask attributes: prints @mask as a
 * bitmap into @buf while holding wq_pool_mutex.
 */
static ssize_t __wq_cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf, cpumask_var_t mask)
{
        int len;

        mutex_lock(&wq_pool_mutex);
        len = scnprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
        mutex_unlock(&wq_pool_mutex);

        return len;
}

/* sysfs "cpumask_requested": prints wq_requested_unbound_cpumask. */
static ssize_t cpumask_requested_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        ssize_t len;

        len = __wq_cpumask_show(dev, attr, buf, wq_requested_unbound_cpumask);
        return len;
}
static DEVICE_ATTR_RO(cpumask_requested);

/* sysfs "cpumask_isolated": prints wq_isolated_cpumask. */
static ssize_t cpumask_isolated_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        ssize_t len;

        len = __wq_cpumask_show(dev, attr, buf, wq_isolated_cpumask);
        return len;
}
static DEVICE_ATTR_RO(cpumask_isolated);

/* sysfs "cpumask" (bus level) read side: prints wq_unbound_cpumask. */
static ssize_t cpumask_show(struct device *dev,
                struct device_attribute *attr, char *buf)
{
        ssize_t len;

        len = __wq_cpumask_show(dev, attr, buf, wq_unbound_cpumask);
        return len;
}

/*
 * sysfs "cpumask" (bus level) write side: parses @buf into a temporary
 * cpumask and installs it through workqueue_set_unbound_cpumask().
 *
 * Return: @count on success, -ENOMEM if the temporary cpumask cannot be
 * allocated, or the error from parsing / applying the mask.
 */
static ssize_t cpumask_store(struct device *dev,
                struct device_attribute *attr, const char *buf, size_t count)
{
        cpumask_var_t parsed;
        int ret;

        if (!zalloc_cpumask_var(&parsed, GFP_KERNEL))
                return -ENOMEM;

        ret = cpumask_parse(buf, parsed);
        if (ret == 0)
                ret = workqueue_set_unbound_cpumask(parsed);

        free_cpumask_var(parsed);

        if (ret)
                return ret;
        return count;
}
static DEVICE_ATTR_RW(cpumask);

/*
 * Bus-wide cpumask attributes, handed to subsys_virtual_register() in
 * wq_sysfs_init(); NULL-terminated.
 */
static struct attribute *wq_sysfs_cpumask_attrs[] = {
        &dev_attr_cpumask.attr,
        &dev_attr_cpumask_requested.attr,
        &dev_attr_cpumask_isolated.attr,
        NULL,
};
ATTRIBUTE_GROUPS(wq_sysfs_cpumask);

/* Register the workqueue subsystem together with its cpumask attribute groups. */
static int __init wq_sysfs_init(void)
{
        int ret;

        ret = subsys_virtual_register(&wq_subsys, wq_sysfs_cpumask_groups);
        return ret;
}
core_initcall(wq_sysfs_init);

/* Device release callback: frees the wq_device that embeds @dev. */
static void wq_device_release(struct device *dev)
{
        kfree(container_of(dev, struct wq_device, dev));
}

/**
 * workqueue_sysfs_register - make a workqueue visible in sysfs
 * @wq: the workqueue to register
 *
 * Expose @wq in sysfs under /sys/bus/workqueue/devices.
 * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
 * which is the preferred method.
 *
 * Workqueue user should use this function directly iff it wants to apply
 * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
 * apply_workqueue_attrs() may race against userland updating the
 * attributes.
 *
 * Return: 0 on success, -errno on failure.
 */
int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev;
        int ret;

        /*
         * Adjusting max_active breaks ordering guarantee.  Disallow exposing
         * ordered workqueues.
         */
        if (WARN_ON(wq->flags & __WQ_ORDERED))
                return -EINVAL;

        wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
        if (!wq_dev)
                return -ENOMEM;

        wq_dev->wq = wq;
        wq_dev->dev.bus = &wq_subsys;
        wq_dev->dev.release = wq_device_release;
        dev_set_name(&wq_dev->dev, "%s", wq->name);

        /*
         * unbound_attrs are created separately.  Suppress uevent until
         * everything is ready.
         */
        dev_set_uevent_suppress(&wq_dev->dev, true);

        ret = device_register(&wq_dev->dev);
        if (ret) {
                put_device(&wq_dev->dev);
                wq->wq_dev = NULL;
                return ret;
        }

        if (wq->flags & WQ_UNBOUND) {
                struct device_attribute *attr;

                for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
                        ret = device_create_file(&wq_dev->dev, attr);
                        if (ret) {
                                device_unregister(&wq_dev->dev);
                                wq->wq_dev = NULL;
                                return ret;
                        }
                }
        }

        dev_set_uevent_suppress(&wq_dev->dev, false);
        kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
        return 0;
}

/**
 * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
 * @wq: the workqueue to unregister
 *
 * Unregisters @wq's sysfs device if workqueue_sysfs_register() created one;
 * otherwise does nothing.
 */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
{
        struct wq_device *wq_dev = wq->wq_dev;

        if (wq_dev) {
                wq->wq_dev = NULL;
                device_unregister(&wq_dev->dev);
        }
}
#else        /* CONFIG_SYSFS */
/* Without CONFIG_SYSFS there is never a sysfs device to tear down. */
static void workqueue_sysfs_unregister(struct workqueue_struct *wq)        { }
#endif        /* CONFIG_SYSFS */

/*
 * Workqueue watchdog.
 *
 * Stalls may be caused by various bugs - a missing WQ_MEM_RECLAIM, an illegal
 * flush dependency, or a concurrency managed work item which stays RUNNING
 * indefinitely.  Workqueue stalls can be very difficult to debug as the
 * usual warning mechanisms don't trigger and internal workqueue state is
 * largely opaque.
 *
 * Workqueue watchdog monitors all worker pools periodically and dumps
 * state if some pools failed to make forward progress for a while where
 * forward progress is defined as the first item on ->worklist changing.
 *
 * This mechanism is controlled through the kernel parameter
 * "workqueue.watchdog_thresh" which can be updated at runtime through the
 * corresponding sysfs parameter file.
 */
#ifdef CONFIG_WQ_WATCHDOG

/*
 * Stall threshold; multiplied by HZ wherever it is used.  A value of 0 makes
 * the timer function return without re-arming the timer.
 */
static unsigned long wq_watchdog_thresh = 30;
static struct timer_list wq_watchdog_timer;

/*
 * Jiffies timestamps of the last watchdog touch: one global value, consulted
 * for pools with pool->cpu < 0, and one per CPU for the per-cpu pools.
 */
static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

/*
 * When non-zero, BUG() once the number of watchdog runs that detected a
 * lockup reaches this value (see panic_on_wq_watchdog()).
 */
static unsigned int wq_panic_on_stall;
module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);

/*
 * Dump the workers of @pool that may be keeping pending work items from
 * being processed.  Only CPU-bound workers in the running state qualify; in
 * every other situation another idle worker should pick up the pending
 * items.
 */
static void show_cpu_pool_hog(struct worker_pool *pool)
{
        unsigned long irq_flags;
        struct worker *worker;
        int bkt;

        raw_spin_lock_irqsave(&pool->lock, irq_flags);

        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
                if (!task_is_running(worker->task))
                        continue;

                /*
                 * Printing is deferred to avoid deadlocks in console
                 * drivers that queue work while holding locks which are
                 * also taken in their write paths.
                 */
                printk_deferred_enter();

                pr_info("pool %d:\n", pool->id);
                sched_show_task(worker->task);

                printk_deferred_exit();
        }

        raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}

/* Print the running workers of every pool the watchdog flagged as cpu_stall. */
static void show_cpu_pools_hogs(void)
{
        struct worker_pool *pool;
        int pi;

        pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");

        rcu_read_lock();

        for_each_pool(pool, pi) {
                if (!pool->cpu_stall)
                        continue;

                show_cpu_pool_hog(pool);
        }

        rcu_read_unlock();
}

/*
 * Count a detected stall and BUG() once the count reaches wq_panic_on_stall.
 * Does nothing, and does not count, while wq_panic_on_stall is 0.
 */
static void panic_on_wq_watchdog(void)
{
        static unsigned int wq_stall;

        if (!wq_panic_on_stall)
                return;

        wq_stall++;
        BUG_ON(wq_stall >= wq_panic_on_stall);
}

static void wq_watchdog_reset_touched(void)
{
        int cpu;

        wq_watchdog_touched = jiffies;
        for_each_possible_cpu(cpu)
                per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
}

/*
 * Watchdog timer callback.
 *
 * Walks all worker pools and reports every pool with a non-empty worklist
 * whose most recent timestamp (the later of the pool's watchdog_ts and the
 * applicable "touched" stamp) is more than the threshold in the past.  When
 * at least one lockup was found, the workqueue state and, for stalled
 * non-BH per-cpu pools, the running workers are dumped, and the
 * panic-on-stall accounting is updated.  Finally the touched stamps are
 * reset and the timer is re-armed.
 *
 * With a threshold of 0 the function returns without re-arming the timer.
 */
static void wq_watchdog_timer_fn(struct timer_list *unused)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long now = jiffies;
        bool lockup_detected = false;
        bool cpu_pool_stall = false;
        struct worker_pool *pool;
        int pi;

        if (!thresh)
                return;

        rcu_read_lock();

        for_each_pool(pool, pi) {
                unsigned long pool_ts, touched, ts;

                pool->cpu_stall = false;
                if (list_empty(&pool->worklist))
                        continue;

                /*
                 * A virtual machine that was stopped by the host can look
                 * like a stall to the watchdog.
                 */
                kvm_check_and_clear_guest_paused();

                /* pick the later of the pool and touched timestamps */
                touched = pool->cpu >= 0 ?
                        READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)) :
                        READ_ONCE(wq_watchdog_touched);
                pool_ts = READ_ONCE(pool->watchdog_ts);
                ts = time_after(pool_ts, touched) ? pool_ts : touched;

                /* no stall as long as we are within the threshold */
                if (!time_after(now, ts + thresh))
                        continue;

                lockup_detected = true;
                if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
                        pool->cpu_stall = true;
                        cpu_pool_stall = true;
                }
                pr_emerg("BUG: workqueue lockup - pool");
                pr_cont_pool_info(pool);
                pr_cont(" stuck for %us!\n",
                        jiffies_to_msecs(now - pool_ts) / 1000);
        }

        rcu_read_unlock();

        /* cpu_pool_stall is only ever set together with lockup_detected */
        if (lockup_detected) {
                show_all_workqueues();
                if (cpu_pool_stall)
                        show_cpu_pools_hogs();
                panic_on_wq_watchdog();
        }

        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh);
}

/*
 * Record watchdog activity for @cpu.  The per-cpu stamp is always updated
 * for a valid @cpu (a negative @cpu triggers a one-time warning); the
 * global stamp is only rewritten once it is older than a quarter of the
 * threshold.
 */
notrace void wq_watchdog_touch(int cpu)
{
        unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
        unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
        unsigned long now = jiffies;

        if (cpu < 0)
                WARN_ONCE(1, "%s should be called with valid CPU", __func__);
        else
                per_cpu(wq_watchdog_touched_cpu, cpu) = now;

        /* Avoid needless stores to the global cacheline */
        if (!time_after(now, touch_ts + thresh / 4))
                return;

        WRITE_ONCE(wq_watchdog_touched, jiffies);
}

/*
 * Install a new watchdog threshold.  The threshold is zeroed and the timer
 * deleted synchronously first; a non-zero @thresh is then stored, the
 * touched stamps are reset and the timer is armed @thresh * HZ jiffies ahead.
 * A @thresh of 0 leaves the watchdog stopped.
 */
static void wq_watchdog_set_thresh(unsigned long thresh)
{
        /* with thresh == 0 the timer function will not re-arm itself */
        wq_watchdog_thresh = 0;
        timer_delete_sync(&wq_watchdog_timer);

        if (!thresh)
                return;

        wq_watchdog_thresh = thresh;
        wq_watchdog_reset_touched();
        mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ);
}

/*
 * kernel_param "set" handler for watchdog_thresh.
 *
 * While system_wq does not exist yet the value is only recorded; once it
 * does, the new threshold is installed through wq_watchdog_set_thresh().
 *
 * Return: 0 on success or the error from kstrtoul().
 */
static int wq_watchdog_param_set_thresh(const char *val,
                                        const struct kernel_param *kp)
{
        unsigned long thresh;
        int ret = kstrtoul(val, 0, &thresh);

        if (ret)
                return ret;

        if (!system_wq) {
                wq_watchdog_thresh = thresh;
                return 0;
        }

        wq_watchdog_set_thresh(thresh);
        return 0;
}

/*
 * Accessors for the "watchdog_thresh" parameter: writes go through the
 * custom setter above, reads use the generic unsigned long getter.
 */
static const struct kernel_param_ops wq_watchdog_thresh_ops = {
        .set        = wq_watchdog_param_set_thresh,
        .get        = param_get_ulong,
};

module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh,
                0644);

/*
 * Set up the watchdog timer as a deferrable timer and start it with the
 * currently configured threshold (which may have been recorded earlier by
 * wq_watchdog_param_set_thresh()).
 */
static void wq_watchdog_init(void)
{
        /* the timer must be set up before wq_watchdog_set_thresh() touches it */
        timer_setup(&wq_watchdog_timer, wq_watchdog_timer_fn, TIMER_DEFERRABLE);
        wq_watchdog_set_thresh(wq_watchdog_thresh);
}

#else        /* CONFIG_WQ_WATCHDOG */

/* No-op when the workqueue watchdog is not configured. */
static inline void wq_watchdog_init(void) { }

#endif        /* CONFIG_WQ_WATCHDOG */

/*
 * irq_work callback raising TASKLET_SOFTIRQ.  Installed by
 * workqueue_init_early() as the irq_work function of the first BH pool of
 * each CPU.  @irq_work is unused.
 */
static void bh_pool_kick_normal(struct irq_work *irq_work)
{
        raise_softirq_irqoff(TASKLET_SOFTIRQ);
}

/*
 * irq_work callback raising HI_SOFTIRQ.  Installed by workqueue_init_early()
 * as the irq_work function of the second BH pool of each CPU.  @irq_work is
 * unused.
 */
static void bh_pool_kick_highpri(struct irq_work *irq_work)
{
        raise_softirq_irqoff(HI_SOFTIRQ);
}

/*
 * Narrow wq_unbound_cpumask down to its intersection with @mask.  If the two
 * masks share no CPU the restriction is skipped and a warning naming the
 * source of the restriction (@name) is printed instead.
 */
static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
{
        if (cpumask_intersects(wq_unbound_cpumask, mask)) {
                cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
                return;
        }

        pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
                cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
}

/*
 * Initialize @pool as a pool of @cpu: its attrs are pinned to exactly that
 * CPU with strict affinity and the given @nice level, its node is taken from
 * the CPU, and a pool ID is assigned.  Any failure is fatal (BUG).
 */
static void __init init_cpu_worker_pool(struct worker_pool *pool, int cpu, int nice)
{
        struct workqueue_attrs *attrs;

        BUG_ON(init_worker_pool(pool));

        /* pool->attrs is only usable once init_worker_pool() has succeeded */
        attrs = pool->attrs;
        cpumask_copy(attrs->cpumask, cpumask_of(cpu));
        cpumask_copy(attrs->__pod_cpumask, cpumask_of(cpu));
        attrs->nice = nice;
        attrs->affn_strict = true;

        pool->cpu = cpu;
        pool->node = cpu_to_node(cpu);

        /* alloc pool ID */
        mutex_lock(&wq_pool_mutex);
        BUG_ON(worker_pool_assign_id(pool));
        mutex_unlock(&wq_pool_mutex);
}

/**
 * workqueue_init_early - early init for workqueue subsystem
 *
 * This is the first step of three-staged workqueue subsystem initialization and
 * invoked as soon as the bare basics - memory allocation, cpumasks and idr are
 * up. It sets up all the data structures and system workqueues and allows early
 * boot code to create workqueues and queue/cancel work items. Actual work item
 * execution starts only after kthreads can be created and scheduled right
 * before early initcalls.
 *
 * Every allocation failure in here is fatal (BUG).
 */
void __init workqueue_init_early(void)
{
        struct wq_pod_type *pt = &wq_pod_types[WQ_AFFN_SYSTEM];
        /* nice levels of the standard pools, indexed by pool position */
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
        /* irq_work callbacks of the BH pools, indexed like std_nice[] */
        void (*irq_work_fns[2])(struct irq_work *) = { bh_pool_kick_normal,
                                                       bh_pool_kick_highpri };
        int i, cpu;

        /* struct pool_workqueue must be aligned at least like long long */
        BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

        BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
        BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));

        /*
         * Start from all possible CPUs and narrow the unbound cpumask down
         * by the housekeeping masks and, if given, the command line mask.
         * A restriction that would leave no CPU is ignored.
         */
        cpumask_copy(wq_online_cpumask, cpu_online_mask);
        cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
        restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
        restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
        if (!cpumask_empty(&wq_cmdline_cpumask))
                restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);

        /* the boot-time result doubles as the initially requested mask */
        cpumask_copy(wq_requested_unbound_cpumask, wq_unbound_cpumask);

        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

        unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
        BUG_ON(!unbound_wq_update_pwq_attrs_buf);

        /*
         * If nohz_full is enabled, set power efficient workqueue as unbound.
         * This allows workqueue items to be moved to HK CPUs.
         */
        if (housekeeping_enabled(HK_TYPE_TICK))
                wq_power_efficient = true;

        /* initialize WQ_AFFN_SYSTEM pods */
        pt->pod_cpus = kcalloc(1, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(1, sizeof(pt->pod_node[0]), GFP_KERNEL);
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node || !pt->cpu_pod);

        BUG_ON(!zalloc_cpumask_var_node(&pt->pod_cpus[0], GFP_KERNEL, NUMA_NO_NODE));

        /* a single pod covering every possible CPU, not tied to a NUMA node */
        pt->nr_pods = 1;
        cpumask_copy(pt->pod_cpus[0], cpu_possible_mask);
        pt->pod_node[0] = NUMA_NO_NODE;
        pt->cpu_pod[0] = 0;

        /* initialize BH and CPU pools */
        for_each_possible_cpu(cpu) {
                struct worker_pool *pool;

                /* i walks std_nice[] / irq_work_fns[] in step with the pools */
                i = 0;
                for_each_bh_worker_pool(pool, cpu) {
                        init_cpu_worker_pool(pool, cpu, std_nice[i]);
                        pool->flags |= POOL_BH;
                        init_irq_work(bh_pool_irq_work(pool), irq_work_fns[i]);
                        i++;
                }

                i = 0;
                for_each_cpu_worker_pool(pool, cpu)
                        init_cpu_worker_pool(pool, cpu, std_nice[i++]);
        }

        /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;

                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;

                /*
                 * An ordered wq should have only one pwq as ordering is
                 * guaranteed by max_active which is enforced by pwqs.
                 */
                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                attrs->ordered = true;
                ordered_wq_attrs[i] = attrs;
        }

        /* create the system-wide workqueues; all of them are mandatory */
        system_wq = alloc_workqueue("events", 0, 0);
        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
        system_long_wq = alloc_workqueue("events_long", 0, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
        system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                              WQ_POWER_EFFICIENT, 0);
        system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_pwr_efficient",
                                              WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                              0);
        system_bh_wq = alloc_workqueue("events_bh", WQ_BH, 0);
        system_bh_highpri_wq = alloc_workqueue("events_bh_highpri",
                                               WQ_BH | WQ_HIGHPRI, 0);
        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
               !system_unbound_wq || !system_freezable_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq ||
               !system_bh_wq || !system_bh_highpri_wq);
}

/*
 * Create the "pool_workqueue_release" kthread worker and, unless the user
 * already chose a value, derive the default wq_cpu_intensive_thresh_us from
 * the calibrated loops_per_jiffy.
 */
static void __init wq_cpu_intensive_thresh_init(void)
{
        unsigned long thresh_us = 10 * USEC_PER_MSEC;
        unsigned long bogomips;

        pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release");
        BUG_ON(IS_ERR(pwq_release_worker));

        /* an explicitly configured threshold is left untouched */
        if (wq_cpu_intensive_thresh_us != ULONG_MAX)
                return;

        /*
         * 10ms is the baseline: most modern (as of 2023) processors get a lot
         * done in that time and it sits just below what people perceive.
         * Much slower CPUs, microcontrollers included, would trip over such a
         * low threshold all the time though.
         *
         * So stretch the threshold, up to 1 second, when BogoMIPS is below
         * 4000. This is far from exact and does not need to be: the mechanism
         * stays useful even when fully scaled up, and since reports usually
         * apply to everyone, a few machines running with longer thresholds do
         * not significantly reduce their value.
         */

        /* see init/calibrate.c for lpj -> BogoMIPS calculation */
        bogomips = max_t(unsigned long, loops_per_jiffy / 500000 * HZ, 1);
        if (bogomips < 4000) {
                unsigned long scaled_us = thresh_us * 4000 / bogomips;

                thresh_us = min_t(unsigned long, scaled_us, USEC_PER_SEC);
        }

        pr_debug("wq_cpu_intensive_thresh: lpj=%lu BogoMIPS=%lu thresh_us=%lu\n",
                 loops_per_jiffy, bogomips, thresh_us);

        wq_cpu_intensive_thresh_us = thresh_us;
}

/**
 * workqueue_init - bring workqueue subsystem fully online
 *
 * Second of the three workqueue initialization stages, run as soon as
 * kthreads can be created and scheduled. By now workqueues exist and may have
 * work items queued, but no kworker is executing them. This stage fills the
 * worker pools with their initial workers and allows further kworker
 * creation.
 */
void __init workqueue_init(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *wpool;
        int cpu, bucket;

        wq_cpu_intensive_thresh_init();

        mutex_lock(&wq_pool_mutex);

        /*
         * Per-cpu pools were set up before node information was reliable, so
         * refresh the node hint of every BH and regular per-cpu pool.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(wpool, cpu) {
                        wpool->node = cpu_to_node(cpu);
                }
                for_each_cpu_worker_pool(wpool, cpu) {
                        wpool->node = cpu_to_node(cpu);
                }
        }

        /* Set up the rescuer of every workqueue created so far. */
        list_for_each_entry(wq, &workqueues, list) {
                int err = init_rescuer(wq);

                WARN(err,
                     "workqueue: failed to create early rescuer for %s",
                     wq->name);
        }

        mutex_unlock(&wq_pool_mutex);

        /*
         * Create the initial workers. A BH pool has a single pseudo worker
         * standing for the shared BH execution context; hotplug events do not
         * affect it, so it is created for every possible CPU right here.
         */
        for_each_possible_cpu(cpu) {
                for_each_bh_worker_pool(wpool, cpu) {
                        BUG_ON(!create_worker(wpool));
                }
        }

        /* Regular per-cpu pools only get a worker on CPUs that are online. */
        for_each_online_cpu(cpu) {
                for_each_cpu_worker_pool(wpool, cpu) {
                        wpool->flags &= ~POOL_DISASSOCIATED;
                        BUG_ON(!create_worker(wpool));
                }
        }

        /* Every unbound pool hashed so far gets its first worker too. */
        hash_for_each(unbound_pool_hash, bucket, wpool, hash_node) {
                BUG_ON(!create_worker(wpool));
        }

        wq_online = true;
        wq_watchdog_init();
}

/*
 * Initialize @pt. @pt->cpu_pod[] is filled first: every possible CPU is
 * compared against the possible CPUs that precede it using @cpus_share_pod()
 * and either joins the pod of the first match or opens a new pod, so pod IDs
 * are unique and consecutive. The remaining fields of @pt are then derived
 * from @pt->cpu_pod[].
 */
static void __init init_pod_type(struct wq_pod_type *pt,
                                 bool (*cpus_share_pod)(int, int))
{
        int cpu, prev, pod;

        pt->nr_pods = 0;

        /* init @pt->cpu_pod[] according to @cpus_share_pod() */
        pt->cpu_pod = kcalloc(nr_cpu_ids, sizeof(pt->cpu_pod[0]), GFP_KERNEL);
        BUG_ON(!pt->cpu_pod);

        for_each_possible_cpu(cpu) {
                for_each_possible_cpu(prev) {
                        /* an earlier CPU that doesn't share: keep looking */
                        if (prev < cpu && !cpus_share_pod(cpu, prev))
                                continue;

                        if (prev < cpu)
                                pt->cpu_pod[cpu] = pt->cpu_pod[prev];
                        else
                                pt->cpu_pod[cpu] = pt->nr_pods++;
                        break;
                }
        }

        /* init the rest to match @pt->cpu_pod[] */
        pt->pod_cpus = kcalloc(pt->nr_pods, sizeof(pt->pod_cpus[0]), GFP_KERNEL);
        pt->pod_node = kcalloc(pt->nr_pods, sizeof(pt->pod_node[0]), GFP_KERNEL);
        BUG_ON(!pt->pod_cpus || !pt->pod_node);

        for (pod = 0; pod < pt->nr_pods; pod++) {
                BUG_ON(!zalloc_cpumask_var(&pt->pod_cpus[pod], GFP_KERNEL));
        }

        /* record each CPU in its pod's mask and note the pod's node */
        for_each_possible_cpu(cpu) {
                pod = pt->cpu_pod[cpu];
                cpumask_set_cpu(cpu, pt->pod_cpus[pod]);
                pt->pod_node[pod] = cpu_to_node(cpu);
        }
}

/*
 * Pod predicate that never groups two CPUs. workqueue_init_topology() passes
 * it to init_pod_type() for WQ_AFFN_CPU, so every CPU ends up in its own pod.
 */
static bool __init cpus_dont_share(int cpu0, int cpu1)
{
        return false;
}

/*
 * Pod predicate for WQ_AFFN_SMT: true when @cpu0 is set in the SMT mask of
 * @cpu1. Without CONFIG_SCHED_SMT no two CPUs are reported as sharing.
 */
static bool __init cpus_share_smt(int cpu0, int cpu1)
{
        bool shared = false;

#ifdef CONFIG_SCHED_SMT
        shared = cpumask_test_cpu(cpu0, cpu_smt_mask(cpu1));
#endif
        return shared;
}

/*
 * Pod predicate for WQ_AFFN_NUMA: true when cpu_to_node() reports the same
 * node for @cpu0 and @cpu1.
 */
static bool __init cpus_share_numa(int cpu0, int cpu1)
{
        return cpu_to_node(cpu0) == cpu_to_node(cpu1);
}

/**
 * workqueue_init_topology - initialize CPU pods for unbound workqueues
 *
 * Third and final stage of workqueue subsystem initialization, invoked once
 * SMP and topology information are fully initialized. The unbound CPU pods
 * are set up from that information.
 */
void __init workqueue_init_topology(void)
{
        struct workqueue_struct *wq;
        int cpu;

        init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
        init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
        init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
        init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);

        wq_topo_initialized = true;

        mutex_lock(&wq_pool_mutex);

        /*
         * Workqueues allocated before this point have every CPU sharing the
         * default worker pool. Walk all workqueue and online CPU pairs and
         * call unbound_wq_update_pwq() so that per-pod sharing takes effect.
         */
        list_for_each_entry(wq, &workqueues, list) {
                for_each_online_cpu(cpu) {
                        unbound_wq_update_pwq(wq, cpu);
                }

                /* only unbound workqueues get their max_active refreshed */
                if (!(wq->flags & WQ_UNBOUND))
                        continue;

                mutex_lock(&wq->mutex);
                wq_update_node_max_active(wq, -1);
                mutex_unlock(&wq->mutex);
        }

        mutex_unlock(&wq_pool_mutex);
}

/*
 * Emit a warning that flushing system-wide workqueues is going to be
 * prohibited, followed by a stack dump identifying the caller. Exported so
 * that modules can reach it.
 */
void __warn_flushing_systemwide_wq(void)
{
        pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n");
        dump_stack();
}
EXPORT_SYMBOL(__warn_flushing_systemwide_wq);

/*
 * Handler for the "workqueue.unbound_cpus=" boot parameter. Parses @str as a
 * CPU list into wq_cmdline_cpumask; on a parse error the mask is cleared and
 * a warning is logged. Always returns 1.
 */
static int __init workqueue_unbound_cpus_setup(char *str)
{
        int err = cpulist_parse(str, &wq_cmdline_cpumask);

        if (err < 0) {
                /* drop whatever a failed parse may have left behind */
                cpumask_clear(&wq_cmdline_cpumask);
                pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n");
        }

        return 1;
}
__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup);



























































































































































































































































































































   28 





   24 
















































































































































































































    5 







    5 






    3 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Linaro
 * Author: Christoffer Dall <christoffer.dall@linaro.org>
 */

#include <linux/cpu.h>
#include <linux/debugfs.h>
#include <linux/interrupt.h>
#include <linux/kvm_host.h>
#include <linux/seq_file.h>
#include <kvm/arm_vgic.h>
#include <asm/kvm_mmu.h>
#include "vgic.h"

/*
 * Structure to control looping through the entire vgic state.  We start at
 * zero for each field and move upwards.  So, if dist_id is 0 we print the
 * distributor info.  When dist_id is 1, we have already printed it and move
 * on.
 *
 * When vcpu_id < nr_cpus we print the vcpu info until vcpu_id == nr_cpus and
 * so on.
 */
struct vgic_state_iter {
        /* number of online vCPUs, sampled by iter_init() */
        int nr_cpus;
        /* copy of the distributor's nr_spis, taken by iter_init() */
        int nr_spis;
        /* number of LPIs marked by iter_mark_lpis(); stays 0 unless GICv3 */
        int nr_lpis;
        /* 0 until the distributor info has been stepped past */
        int dist_id;
        /* index of the vCPU whose private interrupts are being walked */
        int vcpu_id;
        /* current interrupt ID */
        unsigned long intid;
        /* count of LPI steps taken by iter_next() */
        int lpi_idx;
};

/*
 * Advance @iter by one position: first past the distributor entry, then
 * through the private interrupts of each vCPU, then the SPIs, and finally
 * through the LPIs that carry the debug-iterator mark in the LPI xarray.
 */
static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
{
        struct vgic_dist *dist = &kvm->arch.vgic;

        /* the very first step only moves past the distributor entry */
        if (iter->dist_id == 0) {
                iter->dist_id++;
                return;
        }

        /*
         * Let the xarray drive the iterator after the last SPI, as the iterator
         * has exhausted the sequentially-allocated INTID space.
         */
        if (iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS - 1) &&
            iter->nr_lpis) {
                /* xa_find_after() updates iter->intid to the next marked LPI */
                if (iter->lpi_idx < iter->nr_lpis)
                        xa_find_after(&dist->lpi_xa, &iter->intid,
                                      VGIC_LPI_MAX_INTID,
                                      LPI_XA_MARK_DEBUG_ITER);
                /*
                 * Counted even when the lookup is skipped, so lpi_idx can
                 * exceed nr_lpis, which is what end_of_vgic() tests for.
                 */
                iter->lpi_idx++;
                return;
        }

        /*
         * Once a vCPU's private range is done, move to the next vCPU and
         * restart at INTID 0 if one remains; otherwise keep counting up into
         * the SPI range.
         */
        iter->intid++;
        if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
            ++iter->vcpu_id < iter->nr_cpus)
                iter->intid = 0;
}

/*
 * Take a reference on every LPI in the distributor's LPI xarray for which
 * vgic_try_get_irq_kref() succeeds and tag it with LPI_XA_MARK_DEBUG_ITER.
 *
 * Return: the number of LPIs that were marked.
 */
static int iter_mark_lpis(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_irq *lpi;
        unsigned long idx;
        int marked = 0;

        xa_for_each(&dist->lpi_xa, idx, lpi) {
                if (vgic_try_get_irq_kref(lpi)) {
                        xa_set_mark(&dist->lpi_xa, idx,
                                    LPI_XA_MARK_DEBUG_ITER);
                        marked++;
                }
        }

        return marked;
}

/*
 * Undo iter_mark_lpis(): clear LPI_XA_MARK_DEBUG_ITER from every marked LPI
 * and drop the reference that was taken on it.
 */
static void iter_unmark_lpis(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        unsigned long idx;
        struct vgic_irq *lpi;

        xa_for_each_marked(&dist->lpi_xa, idx, lpi, LPI_XA_MARK_DEBUG_ITER) {
                xa_clear_mark(&dist->lpi_xa, idx, LPI_XA_MARK_DEBUG_ITER);
                vgic_put_irq(kvm, lpi);
        }
}

/*
 * Reset @iter, snapshot the vCPU, SPI and (GICv3 only) LPI counts, then step
 * the iterator forward @pos times so that it matches the seq_file position.
 */
static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
                      loff_t pos)
{
        int online = atomic_read(&kvm->online_vcpus);

        memset(iter, 0, sizeof(*iter));

        iter->nr_cpus = online;
        iter->nr_spis = kvm->arch.vgic.nr_spis;

        /* LPIs are only marked and counted for the GICv3 model */
        if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
                iter->nr_lpis = iter_mark_lpis(kvm);

        /* Fast forward to the right position if needed */
        for (; pos != 0; pos--)
                iter_next(kvm, iter);
}

static bool end_of_vgic(struct vgic_state_iter *iter)
{
        return iter->dist_id > 0 &&
                iter->vcpu_id == iter->nr_cpus &&
                iter->intid >= (iter->nr_spis + VGIC_NR_PRIVATE_IRQS) &&
                (!iter->nr_lpis || iter->lpi_idx > iter->nr_lpis);
}

/*
 * seq_file ->start(): allocate the iterator, publish it in
 * kvm->arch.vgic.iter and fast-forward it to *@pos, all under config_lock.
 *
 * Return: the iterator, NULL when *@pos is already past the end,
 * ERR_PTR(-EBUSY) when another iterator is already installed, or
 * ERR_PTR(-ENOMEM) when the allocation fails.
 */
static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
{
        struct kvm *kvm = s->private;
        struct vgic_state_iter *iter;
        void *ret;

        mutex_lock(&kvm->arch.config_lock);

        /* only a single walk of the state may be in flight */
        if (kvm->arch.vgic.iter) {
                ret = ERR_PTR(-EBUSY);
                goto unlock;
        }

        iter = kmalloc(sizeof(*iter), GFP_KERNEL);
        if (!iter) {
                ret = ERR_PTR(-ENOMEM);
                goto unlock;
        }

        iter_init(kvm, iter, *pos);
        kvm->arch.vgic.iter = iter;

        /* the iterator stays installed even when there is nothing to show */
        ret = end_of_vgic(iter) ? NULL : iter;

unlock:
        mutex_unlock(&kvm->arch.config_lock);
        return ret;
}

/*
 * seq_file ->next(): bump *@pos and advance the iterator installed in
 * kvm->arch.vgic.iter by one entry.
 *
 * Return: the iterator, or NULL once the end has been reached.
 */
static void *vgic_debug_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct kvm *kvm = s->private;
        struct vgic_state_iter *iter = kvm->arch.vgic.iter;

        (*pos)++;
        iter_next(kvm, iter);

        return end_of_vgic(iter) ? NULL : iter;
}

/*
 * seq_file ->stop(): release the marked LPIs and free the iterator installed
 * by vgic_debug_start(), under config_lock.
 */
static void vgic_debug_stop(struct seq_file *s, void *v)
{
        struct kvm *kvm = s->private;

        /*
         * If the seq file wasn't properly opened, there's nothing to clean
         * up.
         */
        if (IS_ERR(v))
                return;

        mutex_lock(&kvm->arch.config_lock);
        iter_unmark_lpis(kvm);
        kfree(kvm->arch.vgic.iter);
        kvm->arch.vgic.iter = NULL;
        mutex_unlock(&kvm->arch.config_lock);
}

/*
 * Print the distributor summary (model, SPI count, LPI count for GICv3,
 * enable state) followed by the legend for the per-interrupt flag columns.
 */
static void print_dist_state(struct seq_file *s, struct vgic_dist *dist,
                             struct vgic_state_iter *iter)
{
        bool is_v3 = (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);

        seq_puts(s, "Distributor\n");
        seq_puts(s, "===========\n");
        seq_printf(s, "vgic_model:\t%s\n", is_v3 ? "GICv3" : "GICv2");
        seq_printf(s, "nr_spis:\t%d\n", dist->nr_spis);
        /* the LPI count is only reported for GICv3 */
        if (is_v3)
                seq_printf(s, "nr_lpis:\t%d\n", iter->nr_lpis);
        seq_printf(s, "enabled:\t%d\n", dist->enabled);
        seq_puts(s, "\n");

        seq_puts(s, "P=pending_latch, L=line_level, A=active\n");
        seq_puts(s, "E=enabled, H=hw, C=config (level=1, edge=0)\n");
        seq_puts(s, "G=group\n");
}

/*
 * Print the column header of an interrupt table. With a @vcpu the table is
 * titled "VCPU" plus the vCPU index, otherwise "SPI " with index 0. @irq is
 * not used.
 */
static void print_header(struct seq_file *s, struct vgic_irq *irq,
                         struct kvm_vcpu *vcpu)
{
        char *title = vcpu ? "VCPU" : "SPI ";
        int idx = vcpu ? vcpu->vcpu_idx : 0;

        seq_printf(s, "\n");
        seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHCG     HWID   TARGET SRC PRI VCPU_ID\n", title, idx);
        seq_printf(s, "----------------------------------------------------------------\n");
}

/*
 * Print one table row describing @irq. A table header is emitted first when
 * @irq is INTID 0 or INTID VGIC_NR_PRIVATE_IRQS, i.e. at the start of a
 * vCPU's private interrupts and at the first SPI.
 */
static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
                            struct kvm_vcpu *vcpu)
{
        char *type;
        bool pending;

        /* classify the interrupt by the INTID range it falls into */
        if (irq->intid < VGIC_NR_SGIS)
                type = "SGI";
        else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
                type = "PPI";
        else if (irq->intid < VGIC_MAX_SPI)
                type = "SPI";
        else
                type = "LPI";

        if (irq->intid ==0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
                print_header(s, irq, vcpu);

        /*
         * For a hardware-backed SGI the pending state is read from the host
         * irqchip and replaces the software pending latch.
         */
        pending = irq->pending_latch;
        if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                int err;

                err = irq_get_irqchip_state(irq->host_irq,
                                            IRQCHIP_STATE_PENDING,
                                            &pending);
                WARN_ON_ONCE(err);
        }

        /*
         * Column order: type, INTID, target vCPU index (-1 if none), the
         * seven P/L/A/E/H/C/G flags, hwintid, mpidr, source, priority and
         * the index of irq->vcpu (-1 if none).
         */
        seq_printf(s, "       %s %4d "
                      "    %2d "
                      "%d%d%d%d%d%d%d "
                      "%8d "
                      "%8x "
                      " %2x "
                      "%3d "
                      "     %2d "
                      "\n",
                        type, irq->intid,
                        (irq->target_vcpu) ? irq->target_vcpu->vcpu_idx : -1,
                        pending,
                        irq->line_level,
                        irq->active,
                        irq->enabled,
                        irq->hw,
                        irq->config == VGIC_CONFIG_LEVEL,
                        irq->group,
                        irq->hwintid,
                        irq->mpidr,
                        irq->source,
                        irq->priority,
                        (irq->vcpu) ? irq->vcpu->vcpu_idx : -1);
}

/*
 * seq_file ->show(): print the entry @v points at, which is either the
 * distributor summary or the state of a single interrupt.
 *
 * Return: 0 on success (including when the vgic is not initialized yet and
 * nothing is printed), -EINVAL when the interrupt cannot be looked up.
 */
static int vgic_debug_show(struct seq_file *s, void *v)
{
        struct vgic_state_iter *iter = v;
        struct kvm *kvm = s->private;
        struct kvm_vcpu *vcpu = NULL;
        struct vgic_irq *irq;
        unsigned long flags;

        if (iter->dist_id == 0) {
                print_dist_state(s, &kvm->arch.vgic, iter);
                return 0;
        }

        if (!kvm->arch.vgic.initialized)
                return 0;

        if (iter->vcpu_id < iter->nr_cpus)
                vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);

        /*
         * The lookup is expected to succeed: iter_mark_lpis() holds a
         * reference on every LPI that is going to be visited.
         */
        irq = (iter->intid < VGIC_NR_PRIVATE_IRQS) ?
                vgic_get_vcpu_irq(vcpu, iter->intid) :
                vgic_get_irq(kvm, iter->intid);
        if (WARN_ON_ONCE(!irq))
                return -EINVAL;

        /* print the interrupt's state with its lock held */
        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        print_irq_state(s, irq, vcpu);
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        vgic_put_irq(kvm, irq);
        return 0;
}

/* seq_file callbacks used to walk and print the vgic state */
static const struct seq_operations vgic_debug_sops = {
        .start = vgic_debug_start,
        .next  = vgic_debug_next,
        .stop  = vgic_debug_stop,
        .show  = vgic_debug_show
};

/* provides vgic_debug_fops, which vgic_debug_init() registers */
DEFINE_SEQ_ATTRIBUTE(vgic_debug);

/*
 * Create the read-only (0444) "vgic-state" debugfs file in the VM's debugfs
 * directory, with @kvm as its private data.
 */
void vgic_debug_init(struct kvm *kvm)
{
        debugfs_create_file("vgic-state", 0444, kvm->debugfs_dentry, kvm,
                            &vgic_debug_fops);
}

/* Counterpart of vgic_debug_init(); intentionally empty. */
void vgic_debug_destroy(struct kvm *kvm)
{
}

/**
 * struct vgic_its_iter - Iterator for traversing VGIC ITS device tables.
 * @dev: Pointer to the current its_device being processed.
 * @ite: Pointer to the current its_ite within the device being processed.
 *       NULL while @dev is a device whose ITT list is empty.
 *
 * This structure is used to maintain the current position during iteration
 * over the ITS device tables. It holds pointers to both the current device
 * and the current ITE within that device. Both pointers being NULL marks the
 * end of the iteration (see end_of_iter()).
 */
struct vgic_its_iter {
        struct its_device *dev;
        struct its_ite *ite;
};

/**
 * end_of_iter - Checks if the iterator has reached the end.
 * @iter: The iterator to check.
 *
 * Once the last ITE of the last device has been processed, the iterator is
 * marked as finished by setting both its device and ITE pointers to NULL.
 * This helper tests for that marker.
 *
 * Return: True if the iterator is marked as end, false otherwise.
 */
static inline bool end_of_iter(struct vgic_its_iter *iter)
{
        return !(iter->dev || iter->ite);
}

/**
 * vgic_its_iter_next - Advances the iterator to the next entry in the ITS tables.
 * @its: The VGIC ITS structure.
 * @iter: The iterator to advance.
 *
 * Steps to the next ITE of the current device. When the current ITE is the
 * device's last one (or the device has none), the iterator moves on to the
 * first ITE of the next device, which may be NULL if that device has no ITEs.
 * After the last device the iterator is set to indicate the end of iteration.
 */
static void vgic_its_iter_next(struct vgic_its *its, struct vgic_its_iter *iter)
{
        struct its_device *cur_dev = iter->dev;
        struct its_ite *cur_ite = iter->ite;

        /* more ITEs left in this device: stay on it */
        if (cur_ite && !list_is_last(&cur_ite->ite_list, &cur_dev->itt_head)) {
                iter->ite = list_next_entry(cur_ite, ite_list);
                return;
        }

        /* that was the last device: mark the end of the iteration */
        if (list_is_last(&cur_dev->dev_list, &its->device_list)) {
                iter->dev = NULL;
                iter->ite = NULL;
                return;
        }

        cur_dev = list_next_entry(cur_dev, dev_list);
        iter->dev = cur_dev;
        iter->ite = list_first_entry_or_null(&cur_dev->itt_head,
                                             struct its_ite,
                                             ite_list);
}

/**
 * vgic_its_debug_start - Start function for the seq_file interface.
 * @s: The seq_file structure.
 * @pos: The starting position (offset).
 *
 * Takes the its_lock mutex, positions a freshly allocated iterator on the
 * first ITS device and advances it by *@pos entries. The mutex is still held
 * on every return path; it is dropped by vgic_its_debug_stop().
 *
 * Return: An iterator pointer on success, NULL if no devices are found or
 *         the end of the list is reached, or ERR_PTR(-ENOMEM) on memory
 *         allocation failure.
 */
static void *vgic_its_debug_start(struct seq_file *s, loff_t *pos)
{
        struct vgic_its *its = s->private;
        struct its_device *first;
        struct vgic_its_iter *it;
        loff_t skip;

        mutex_lock(&its->its_lock);

        first = list_first_entry_or_null(&its->device_list,
                                         struct its_device, dev_list);
        if (!first)
                return NULL;

        it = kmalloc(sizeof(*it), GFP_KERNEL);
        if (!it)
                return ERR_PTR(-ENOMEM);

        it->dev = first;
        it->ite = list_first_entry_or_null(&first->itt_head,
                                           struct its_ite, ite_list);

        /* fast-forward to the requested position, stopping early at the end */
        for (skip = *pos; !end_of_iter(it) && skip; skip--)
                vgic_its_iter_next(its, it);

        if (!end_of_iter(it))
                return it;

        kfree(it);
        return NULL;
}

/**
 * vgic_its_debug_next - Next function for the seq_file interface.
 * @s: The seq_file structure.
 * @v: The current iterator.
 * @pos: The current position (offset).
 *
 * Increments the position and moves the iterator on to the next entry. The
 * iterator is freed here once it runs off the end.
 *
 * Return: An iterator pointer on success, or NULL if the end of the list is
 *         reached.
 */
static void *vgic_its_debug_next(struct seq_file *s, void *v, loff_t *pos)
{
        struct vgic_its_iter *iter = v;
        struct vgic_its *its = s->private;

        (*pos)++;
        vgic_its_iter_next(its, iter);

        if (!end_of_iter(iter))
                return iter;

        kfree(iter);
        return NULL;
}

/**
 * vgic_its_debug_stop - Stop function for the seq_file interface.
 * @s: The seq_file structure.
 * @v: The current iterator.
 *
 * Frees the iterator, if @v is a real one rather than NULL or an error
 * pointer, and releases the its_lock mutex.
 */
static void vgic_its_debug_stop(struct seq_file *s, void *v)
{
        struct vgic_its_iter *iter = v;
        struct vgic_its *its = s->private;

        if (iter && !IS_ERR(iter))
                kfree(iter);

        mutex_unlock(&its->its_lock);
}

/**
 * vgic_its_debug_show - Show function for the seq_file interface.
 * @s: The seq_file structure.
 * @v: The current iterator.
 *
 * This function formats and prints the ITS table entry information to the
 * seq_file output. A per-device header is printed before the first ITE of
 * each device. Nothing is printed for a device that has no ITEs.
 *
 * Return: 0 on success.
 */
static int vgic_its_debug_show(struct seq_file *s, void *v)
{
        struct vgic_its_iter *iter = v;
        struct its_device *dev = iter->dev;
        struct its_ite *ite = iter->ite;

        /*
         * The iterator sits on a device with an empty ITT when @ite is NULL
         * (list_first_entry_or_null() in the start/next helpers). There is no
         * entry to show, and @ite must not be dereferenced below.
         */
        if (!ite)
                return 0;

        if (list_is_first(&ite->ite_list, &dev->itt_head)) {
                seq_printf(s, "\n");
                seq_printf(s, "Device ID: 0x%x, Event ID Range: [0 - %llu]\n",
                           dev->device_id, BIT_ULL(dev->num_eventid_bits) - 1);
                seq_printf(s, "EVENT_ID    INTID  HWINTID   TARGET   COL_ID HW\n");
                seq_printf(s, "-----------------------------------------------\n");
        }

        /* only entries with both an IRQ and a collection are printed */
        if (ite->irq && ite->collection) {
                seq_printf(s, "%8u %8u %8u %8u %8u %2d\n",
                           ite->event_id, ite->irq->intid, ite->irq->hwintid,
                           ite->collection->target_addr,
                           ite->collection->collection_id, ite->irq->hw);
        }

        return 0;
}

/* seq_file callbacks used to walk and print the ITS device tables */
static const struct seq_operations vgic_its_debug_sops = {
        .start = vgic_its_debug_start,
        .next  = vgic_its_debug_next,
        .stop  = vgic_its_debug_stop,
        .show  = vgic_its_debug_show
};

/* provides vgic_its_debug_fops, which vgic_its_debug_init() registers */
DEFINE_SEQ_ATTRIBUTE(vgic_its_debug);

/**
 * vgic_its_debug_init - Initializes the debugfs interface for VGIC ITS.
 * @dev: The KVM device structure.
 *
 * This function creates a read-only debugfs file named
 * "vgic-its-state@<its base address in hex>" in the VM's debugfs directory
 * to expose the ITS table information.
 *
 * Return: 0 on success, -ENOMEM if the file name cannot be allocated.
 */
int vgic_its_debug_init(struct kvm_device *dev)
{
        struct vgic_its *its = dev->private;
        char *name;

        name = kasprintf(GFP_KERNEL, "vgic-its-state@%llx", (u64)its->vgic_its_base);
        if (!name)
                return -ENOMEM;

        /* the return value of debugfs_create_file() is not checked */
        debugfs_create_file(name, 0444, dev->kvm->debugfs_dentry, its, &vgic_its_debug_fops);

        /* the temporary name buffer is released right after the call */
        kfree(name);
        return 0;
}

/* Counterpart of vgic_its_debug_init(); intentionally empty. */
void vgic_its_debug_destroy(struct kvm_device *dev)
{
}



















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 */
#ifndef __IPVLAN_H
#define __IPVLAN_H

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_arp.h>
#include <linux/if_link.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/inetdevice.h>
#include <linux/netfilter.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>

/* driver name and version strings */
#define IPVLAN_DRV        "ipvlan"
#define IPV_DRV_VER        "0.1"

/* bucket count (and matching index mask) of ipvl_port::hlhead[] */
#define IPVLAN_HASH_SIZE        (1 << BITS_PER_BYTE)
#define IPVLAN_HASH_MASK        (IPVLAN_HASH_SIZE - 1)

/* width in bits of ipvl_dev::mac_filters, plus the matching index mask */
#define IPVLAN_MAC_FILTER_BITS        8
#define IPVLAN_MAC_FILTER_SIZE        (1 << IPVLAN_MAC_FILTER_BITS)
#define IPVLAN_MAC_FILTER_MASK        (IPVLAN_MAC_FILTER_SIZE - 1)

/* NOTE(review): presumably caps the length of ipvl_port::backlog - confirm */
#define IPVLAN_QBACKLOG_LIMIT        1000

/*
 * Kind of header/address an entry refers to; stored in ipvl_addr::atype.
 * NOTE(review): presumably also what ipvlan_get_L3_hdr() reports through
 * its @type argument - confirm against the implementation.
 */
typedef enum {
        IPVL_IPV6 = 0,
        IPVL_ICMPV6,
        IPVL_IPV4,
        IPVL_ARP,
} ipvl_hdr_type;

/*
 * Traffic counters; ipvl_dev keeps one instance per CPU (pcpu_stats).
 * NOTE(review): the 64-bit counters are presumably updated under @syncp,
 * with the 32-bit error/drop counters outside of it - confirm.
 */
struct ipvl_pcpu_stats {
        u64_stats_t                rx_pkts;
        u64_stats_t                rx_bytes;
        u64_stats_t                rx_mcast;
        u64_stats_t                tx_pkts;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync        syncp;
        u32                        rx_errs;
        u32                        tx_drps;
};

struct ipvl_port;

/*
 * State of one ipvlan logical interface.
 * NOTE(review): field roles below are inferred from names and types only
 * unless stated otherwise - confirm against the driver sources.
 */
struct ipvl_dev {
        /* net_device of this logical interface (presumably) */
        struct net_device        *dev;
        /* list linkage, presumably into ipvl_port::ipvlans */
        struct list_head        pnode;
        /* port this interface belongs to */
        struct ipvl_port        *port;
        /* underlying physical device (presumably) */
        struct net_device        *phy_dev;
        /* list head, presumably of ipvl_addr entries linked via ::anode */
        struct list_head        addrs;
        /* per-CPU traffic counters */
        struct ipvl_pcpu_stats        __percpu *pcpu_stats;
        /* bitmap of IPVLAN_MAC_FILTER_SIZE bits */
        DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE);
        netdev_features_t        sfeatures;
        u32                        msg_enable;
        /* presumably protects @addrs */
        spinlock_t                addrs_lock;
};

/*
 * One IPv4 or IPv6 address configured on a logical interface. The ip6addr
 * and ip4addr macros below are shorthands for the two members of @ipu.
 */
struct ipvl_addr {
        struct ipvl_dev                *master; /* Back pointer to master */
        union {
                struct in6_addr        ip6;         /* IPv6 address on logical interface */
                struct in_addr        ip4;         /* IPv4 address on logical interface */
        } ipu;
#define ip6addr        ipu.ip6
#define ip4addr ipu.ip4
        struct hlist_node        hlnode;  /* Hash-table linkage */
        struct list_head        anode;   /* logical-interface linkage */
        ipvl_hdr_type                atype;   /* address kind, see ipvl_hdr_type */
        struct rcu_head                rcu;     /* presumably for RCU-deferred freeing */
};

/*
 * Per-port state shared by the ipvlan logical interfaces of one device. The
 * accessors below fetch it from net_device::rx_handler_data.
 * NOTE(review): field roles below are inferred from names and types only
 * unless stated otherwise - confirm against the driver sources.
 */
struct ipvl_port {
        struct net_device        *dev;
        possible_net_t                pnet;
        /* address hash table with IPVLAN_HASH_SIZE buckets */
        struct hlist_head        hlhead[IPVLAN_HASH_SIZE];
        /* list head, presumably of ipvl_dev entries linked via ::pnode */
        struct list_head        ipvlans;
        u16                        mode;
        /* IPVLAN_F_* bits, see ipvlan_is_private() / ipvlan_is_vepa() */
        u16                        flags;
        u16                        dev_id_start;
        /* presumably runs ipvlan_process_multicast() over @backlog */
        struct work_struct        wq;
        struct sk_buff_head        backlog;
        int                        count;
        struct ida                ida;
        netdevice_tracker        dev_tracker;
};

/* ipvlan's private per-packet data, overlaid on the start of skb->cb[] */
struct ipvl_skb_cb {
        bool tx_pkt;
};
/* view the control buffer of @_skb as a struct ipvl_skb_cb */
#define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0]))

/* Fetch the port stored in @d->rx_handler_data using rcu_dereference(). */
static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d)
{
        return rcu_dereference(d->rx_handler_data);
}

/* Fetch the port stored in @d->rx_handler_data using rcu_dereference_bh(). */
static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d)
{
        return rcu_dereference_bh(d->rx_handler_data);
}

/* Fetch the port stored in @d->rx_handler_data using rtnl_dereference(). */
static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d)
{
        return rtnl_dereference(d->rx_handler_data);
}

/* True when IPVLAN_F_PRIVATE is set in @port->flags. */
static inline bool ipvlan_is_private(const struct ipvl_port *port)
{
        return !!(port->flags & IPVLAN_F_PRIVATE);
}

/* Set IPVLAN_F_PRIVATE in @port->flags. */
static inline void ipvlan_mark_private(struct ipvl_port *port)
{
        port->flags |= IPVLAN_F_PRIVATE;
}

/* Clear the IPVLAN_F_PRIVATE bit in @port->flags; other bits are untouched. */
static inline void ipvlan_clear_private(struct ipvl_port *port)
{
        port->flags = port->flags & ~IPVLAN_F_PRIVATE;
}

/* True when the IPVLAN_F_VEPA bit is set in @port->flags. */
static inline bool ipvlan_is_vepa(const struct ipvl_port *port)
{
        return (port->flags & IPVLAN_F_VEPA) != 0;
}

/* Set the IPVLAN_F_VEPA bit in @port->flags; other bits are untouched. */
static inline void ipvlan_mark_vepa(struct ipvl_port *port)
{
        port->flags = port->flags | IPVLAN_F_VEPA;
}

/* Clear the IPVLAN_F_VEPA bit in @port->flags; other bits are untouched. */
static inline void ipvlan_clear_vepa(struct ipvl_port *port)
{
        port->flags = port->flags & ~IPVLAN_F_VEPA;
}

/*
 * Prototypes for functions whose definitions are not part of this header.
 * Only the signatures are visible here; consult the definitions for the
 * exact contracts.
 */
void ipvlan_init_secret(void);
unsigned int ipvlan_mac_hash(const unsigned char *addr);
/* rx handler; netif_is_ipvlan_port() compares a device's rx_handler to this. */
rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb);
/* Work callback. NOTE(review): presumably queued via ipvl_port.wq - confirm */
void ipvlan_process_multicast(struct work_struct *work);
int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev);
/* Address bookkeeping: add / find / busy-check / delete / lookup. */
void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr);
struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
                                   const void *iaddr, bool is_v6);
bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6);
void ipvlan_ht_addr_del(struct ipvl_addr *addr);
struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h,
                                     int addr_type, bool use_dest);
/* Returns a pointer and reports a type through @type (out parameter). */
void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type);
void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
                     unsigned int len, bool success, bool mcast);
/* Link management entry points (new / delete / setup / ops registration). */
int ipvlan_link_new(struct net_device *dev, struct rtnl_newlink_params *params,
                    struct netlink_ext_ack *extack);
void ipvlan_link_delete(struct net_device *dev, struct list_head *head);
void ipvlan_link_setup(struct net_device *dev);
int ipvlan_link_register(struct rtnl_link_ops *ops);
/*
 * L3S hooks. With CONFIG_IPVLAN_L3S defined these are real functions defined
 * elsewhere; otherwise they are the inline stubs below, so callers do not
 * need their own #ifdefs.
 */
#ifdef CONFIG_IPVLAN_L3S
int ipvlan_l3s_register(struct ipvl_port *port);
void ipvlan_l3s_unregister(struct ipvl_port *port);
void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet);
int ipvlan_l3s_init(void);
void ipvlan_l3s_cleanup(void);
#else
/* Stub: L3S is compiled out, so registration always fails with -ENOTSUPP. */
static inline int ipvlan_l3s_register(struct ipvl_port *port)
{
        return -ENOTSUPP;
}

/* Stub: nothing to unregister when L3S is compiled out. */
static inline void ipvlan_l3s_unregister(struct ipvl_port *port)
{
}

/* Stub: no-op when L3S is compiled out. */
static inline void ipvlan_migrate_l3s_hook(struct net *oldnet,
                                           struct net *newnet)
{
}

/* Stub: always reports success (0) when L3S is compiled out. */
static inline int ipvlan_l3s_init(void)
{
        return 0;
}

/* Stub: no-op when L3S is compiled out. */
static inline void ipvlan_l3s_cleanup(void)
{
}
#endif /* CONFIG_IPVLAN_L3S */

static inline bool netif_is_ipvlan_port(const struct net_device *dev)
{
        return rcu_access_pointer(dev->rx_handler) == ipvlan_handle_frame;
}

#endif /* __IPVLAN_H */















































































   28 

























   28 



































   27 



   28 









   28 












    4 












    8 









































































































































































































































































































































































































































































































































    3 


    3 










    3 

















    3 

    3 



    3 















































































    3 
    3 



    3 


    3 



























    3 

































    3 











    3 









    3 







    3 



    3 
































    3 
























































































































































































    3 



























































    3 




















    3 



    3 


    3 





















    3 
















    3 


































    3 
    3 
    3 
    3 















    3 































































    3 


    3 

















    3 




    3 












    3 




    3 












    3 




    3 













    3 
















    3 

























































































































































    3 



    3 


























    3 





















    3 
    3 
    3 

    3 





















    3 
    3 
















    3 











    3 



    3 



    3 








    3 












































    3 



    3 


    3 



    3 








    3 







    3 




    3 





    3 


    3 












    3 





    3 







    3 


















    3 



    3 












    3 










    3 







    3 







    3 





    3 







    3 




















    3 







    3 















    3 











    3 













    3 





    3 












    3 








    3 














































    3 





    3 

























    3 
    3 



    3 


    3 



    3 



    3 


    3 
    3 









    3 















    3 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 









    3 







    3 
















    3 















    3 




    3 
    3 


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 



































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Routing netlink socket interface: protocol independent part.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *        Fixes:
 *        Vitaly E. Lavrov                RTA_OK arithmetic was wrong.
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/if_addr.h>
#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
#include <linux/bpf.h>

#include <linux/uaccess.h>

#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netdev_lock.h>
#include <net/devlink.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/addrconf.h>
#endif
#include <linux/dpll.h>

#include "dev.h"

/* Largest ->maxtype and ->slave_maxtype values that rtnl_link_register()
 * accepts; ops exceeding either bound are rejected with -EINVAL.
 */
#define RTNL_MAX_TYPE                50
#define RTNL_SLAVE_MAX_TYPE        44

/* One slot of a per-protocol handler table (see rtnl_msg_handlers).
 * Slots are filled by rtnl_register_internal() and released with kfree_rcu().
 */
struct rtnl_link {
        /* "doit" callback given at registration, if any */
        rtnl_doit_func                doit;
        /* "dumpit" callback given at registration, if any */
        rtnl_dumpit_func        dumpit;
        /* module passed to the registration call */
        struct module                *owner;
        /* OR of the flags passed by every registration of this slot */
        unsigned int                flags;
        /* lets the slot be freed through kfree_rcu() */
        struct rcu_head                rcu;
};

/* The RTNL mutex itself; every rtnl_*lock*() helper below operates on it. */
static DEFINE_MUTEX(rtnl_mutex);

/* Acquire the RTNL mutex with mutex_lock(). */
void rtnl_lock(void)
{
        mutex_lock(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_lock);

/* Take the RTNL mutex through mutex_lock_interruptible() and hand its
 * return value back to the caller unchanged.
 */
int rtnl_lock_interruptible(void)
{
        int err = mutex_lock_interruptible(&rtnl_mutex);

        return err;
}

/* Take the RTNL mutex through mutex_lock_killable() and hand its
 * return value back to the caller unchanged.
 */
int rtnl_lock_killable(void)
{
        int err = mutex_lock_killable(&rtnl_mutex);

        return err;
}

/* Chain of skbs whose freeing is postponed until __rtnl_unlock(). */
static struct sk_buff *defer_kfree_skb_list;

/* Push the skb chain @head..@tail onto the front of the deferred-free list.
 * Nothing is queued unless both ends of the chain are given.
 */
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
{
        if (!head || !tail)
                return;

        tail->next = defer_kfree_skb_list;
        defer_kfree_skb_list = head;
}
EXPORT_SYMBOL(rtnl_kfree_skbs);

/* Drop the RTNL mutex directly (netdev_run_todo() is not involved here) and
 * then free every skb that rtnl_kfree_skbs() queued.
 */
void __rtnl_unlock(void)
{
        struct sk_buff *skb = defer_kfree_skb_list;
        struct sk_buff *next;

        /* Detach the deferred chain before the mutex is released. */
        defer_kfree_skb_list = NULL;

        /* Callers of __rtnl_unlock() must not have queued any TODO item.
         * Some code, e.g. cfg80211, does
         *
         *   rtnl_lock()
         *   wiphy_lock()
         *   ...
         *   rtnl_unlock()
         *
         * and since netdev_run_todo() re-takes the RTNL for entries on the
         * list, the following interleaving would be possible:
         *
         * Thread 1                        Thread 2
         *                                  rtnl_lock()
         *                                  unregister_netdevice()
         *                                  __rtnl_unlock()
         * rtnl_lock()
         * wiphy_lock()
         * rtnl_unlock()
         *   netdev_run_todo()
         *     __rtnl_unlock()
         *
         *     // list not empty now
         *     // because of thread 2
         *                                  rtnl_lock()
         *     while (!list_empty(...))
         *       rtnl_lock()
         *                                  wiphy_lock()
         * **** DEADLOCK ****
         *
         * __rtnl_unlock() is used rarely, so it is enough to warn whenever
         * it runs while something sits on the todo list.
         */
        WARN_ON(!list_empty(&net_todo_list));

        mutex_unlock(&rtnl_mutex);

        /* Free the detached chain, offering to reschedule after each skb. */
        for (; skb; skb = next) {
                next = skb->next;

                kfree_skb(skb);
                cond_resched();
        }
}

/* Release the RTNL mutex. The release itself is delegated to
 * netdev_run_todo(), which this function calls unconditionally.
 */
void rtnl_unlock(void)
{
        /* This fellow will unlock it for us. */
        netdev_run_todo();
}
EXPORT_SYMBOL(rtnl_unlock);

/* Attempt to take the RTNL mutex with mutex_trylock(); the result of that
 * call is returned as is.
 */
int rtnl_trylock(void)
{
        int acquired = mutex_trylock(&rtnl_mutex);

        return acquired;
}
EXPORT_SYMBOL(rtnl_trylock);

/* Report what mutex_is_locked() says about the RTNL mutex. */
int rtnl_is_locked(void)
{
        int locked = mutex_is_locked(&rtnl_mutex);

        return locked;
}
EXPORT_SYMBOL(rtnl_is_locked);

/* refcount_dec_and_mutex_lock() specialised for the RTNL mutex: @r is the
 * refcount to drop, the helper's result is returned unchanged.
 */
bool refcount_dec_and_rtnl_lock(refcount_t *r)
{
        bool locked = refcount_dec_and_mutex_lock(r, &rtnl_mutex);

        return locked;
}
EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);

#ifdef CONFIG_PROVE_LOCKING
/* Ask lockdep whether the RTNL mutex is held; only built when
 * CONFIG_PROVE_LOCKING is set.
 */
bool lockdep_rtnl_is_held(void)
{
        bool held = lockdep_is_held(&rtnl_mutex);

        return held;
}
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */

#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
/* Take @net's own rtnl_mutex. ASSERT_RTNL() checks that the global RTNL
 * is already held by the time this is called.
 */
void __rtnl_net_lock(struct net *net)
{
        ASSERT_RTNL();

        mutex_lock(&net->rtnl_mutex);
}
EXPORT_SYMBOL(__rtnl_net_lock);

/* Release @net's own rtnl_mutex. ASSERT_RTNL() checks that the global RTNL
 * is still held at this point.
 */
void __rtnl_net_unlock(struct net *net)
{
        ASSERT_RTNL();

        mutex_unlock(&net->rtnl_mutex);
}
EXPORT_SYMBOL(__rtnl_net_unlock);

/* Take the global RTNL first, then @net's per-netns rtnl_mutex. */
void rtnl_net_lock(struct net *net)
{
        rtnl_lock();
        __rtnl_net_lock(net);
}
EXPORT_SYMBOL(rtnl_net_lock);

/* Undo rtnl_net_lock(): release @net's rtnl_mutex, then the global RTNL. */
void rtnl_net_unlock(struct net *net)
{
        __rtnl_net_unlock(net);
        rtnl_unlock();
}
EXPORT_SYMBOL(rtnl_net_unlock);

/* Try to take the global RTNL; only when that succeeds is @net's own
 * rtnl_mutex taken as well. Returns the value rtnl_trylock() produced.
 */
int rtnl_net_trylock(struct net *net)
{
        int locked = rtnl_trylock();

        if (!locked)
                return locked;

        __rtnl_net_lock(net);

        return locked;
}
EXPORT_SYMBOL(rtnl_net_trylock);

/* Take the global RTNL with rtnl_lock_killable(); on success (0) also take
 * @net's own rtnl_mutex. A non-zero result from rtnl_lock_killable() is
 * returned without touching @net.
 */
int rtnl_net_lock_killable(struct net *net)
{
        int err = rtnl_lock_killable();

        if (err)
                return err;

        __rtnl_net_lock(net);

        return 0;
}

/* Three-way ordering used when several per-netns mutexes are taken:
 * 0 for the same netns, -1 when @net_a goes first, 1 when @net_b goes first.
 */
static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
{
        int order;

        if (net_eq(net_a, net_b))
                order = 0;
        else if (net_eq(net_a, &init_net))
                order = -1;        /* init_net is always first */
        else if (net_eq(net_b, &init_net))
                order = 1;
        else
                order = net_a < net_b ? -1 : 1;        /* ascending pointer order */

        return order;
}

/* Comparison callback working on lockdep maps: each map is translated back
 * to the struct net whose rtnl_mutex embeds it, and the two netns are then
 * ordered by rtnl_net_cmp_locks().
 */
int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
{
        const struct net *first = container_of(a, struct net, rtnl_mutex.dep_map);
        const struct net *second = container_of(b, struct net, rtnl_mutex.dep_map);

        return rtnl_net_cmp_locks(first, second);
}

bool rtnl_net_is_locked(struct net *net)
{
        return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_net_is_locked);

bool lockdep_rtnl_net_is_held(struct net *net)
{
        return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
}
EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
#else
/* Variant used when CONFIG_DEBUG_NET_SMALL_RTNL is off: always answers -1,
 * so rtnl_nets_add() neither drops nor swaps entries.
 */
static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
{
        /* No need to swap */
        return -1;
}
#endif

/* Small set of network namespaces that are locked and unlocked together
 * (see rtnl_nets_lock() / rtnl_nets_unlock()).
 */
struct rtnl_nets {
        /* ->newlink() needs to freeze 3 netns at most;
         * 2 for the new device, 1 for its peer.
         */
        struct net *net[3];
        /* number of valid entries at the front of net[] */
        unsigned char len;
};

/* Reset @rtnl_nets to the empty set by zeroing the whole structure. */
static void rtnl_nets_init(struct rtnl_nets *rtnl_nets)
{
        memset(rtnl_nets, 0, sizeof(*rtnl_nets));
}

/* Drop the reference held on every stored netns, clear the slots and
 * leave @rtnl_nets empty.
 */
static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets)
{
        struct net **slot = rtnl_nets->net;
        struct net **end = slot + rtnl_nets->len;

        for (; slot < end; slot++) {
                put_net(*slot);
                *slot = NULL;
        }

        rtnl_nets->len = 0;
}

/**
 * rtnl_nets_add - Add netns to be locked before ->newlink().
 *
 * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net().
 * @net: netns pointer with an extra refcnt held.
 *
 * Entries are kept in the order defined by rtnl_net_cmp_locks().  When
 * that helper reports @net as equal to a stored entry, the extra refcnt
 * is dropped immediately and nothing is added.  Otherwise the extra
 * refcnt is released in rtnl_nets_destroy().
 */
static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net)
{
        int pos;

        DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net));

        for (pos = 0; pos < rtnl_nets->len; pos++) {
                int order = rtnl_net_cmp_locks(rtnl_nets->net[pos], net);

                if (order == 0) {
                        /* Already present: give the reference back. */
                        put_net(net);
                        return;
                }

                /* Stored entry sorts after @net: take its slot and carry
                 * the displaced entry further down the array.
                 */
                if (order == 1)
                        swap(rtnl_nets->net[pos], net);
        }

        rtnl_nets->net[pos] = net;
        rtnl_nets->len++;
}

/* Take the global RTNL, then the per-netns lock of every stored netns,
 * walking the array from the first entry upwards.
 */
static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets)
{
        int idx = 0;

        rtnl_lock();

        while (idx < rtnl_nets->len) {
                __rtnl_net_lock(rtnl_nets->net[idx]);
                idx++;
        }
}

/* Release the per-netns lock of every stored netns, walking the array from
 * the first entry upwards, and finally release the global RTNL.
 */
static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets)
{
        int idx = 0;

        while (idx < rtnl_nets->len) {
                __rtnl_net_unlock(rtnl_nets->net[idx]);
                idx++;
        }

        rtnl_unlock();
}

/* Handler tables indexed by protocol family. Each non-NULL entry points to
 * an array of RTM_NR_MSGTYPES rtnl_link pointers, allocated on the first
 * registration for that family (see rtnl_register_internal()).
 */
static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];

/* Translate an rtnetlink message type into an index relative to RTM_BASE.
 * BUG()s when the result is outside [0, RTM_NR_MSGTYPES).
 */
static inline int rtm_msgindex(int msgtype)
{
        const int idx = msgtype - RTM_BASE;

        /*
         * A negative index means someone tried to register a netlink
         * control code. An index of RTM_NR_MSGTYPES or more may indicate
         * that the message type has not been added to linux/rtnetlink.h
         */
        BUG_ON(!(idx >= 0 && idx < RTM_NR_MSGTYPES));

        return idx;
}

static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
{
        struct rtnl_link __rcu **tab;

        if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
                protocol = PF_UNSPEC;

        tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]);
        if (!tab)
                tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);

        return rcu_dereference_rtnl(tab[msgtype]);
}

/* Install or extend the handler entry for @protocol:@msgtype.
 *
 * The family table is allocated on first use.  An existing entry is never
 * modified in place: a copy is updated, published with rcu_assign_pointer()
 * and the old entry is freed through kfree_rcu().  NULL @doit / @dumpit
 * leave the corresponding callback of an existing entry untouched, and
 * @flags are OR-ed into the entry.
 *
 * Returns 0 on success or -ENOBUFS when an allocation fails.
 */
static int rtnl_register_internal(struct module *owner,
                                  int protocol, int msgtype,
                                  rtnl_doit_func doit, rtnl_dumpit_func dumpit,
                                  unsigned int flags)
{
        struct rtnl_link __rcu **tab;
        struct rtnl_link *link, *old;
        int ret = -ENOBUFS;
        int msgindex;

        BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
        msgindex = rtm_msgindex(msgtype);

        rtnl_lock();

        tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
        if (!tab) {
                tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
                if (!tab)
                        goto unlock;

                /* ensures we see the 0 stores */
                rcu_assign_pointer(rtnl_msg_handlers[protocol], tab);
        }

        /* Start from a copy of the current entry, or from a zeroed one. */
        old = rtnl_dereference(tab[msgindex]);
        if (old)
                link = kmemdup(old, sizeof(*old), GFP_KERNEL);
        else
                link = kzalloc(sizeof(*link), GFP_KERNEL);
        if (!link)
                goto unlock;

        WARN_ON(link->owner && link->owner != owner);
        link->owner = owner;

        WARN_ON(doit && link->doit && link->doit != doit);
        if (doit)
                link->doit = doit;

        WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit);
        if (dumpit)
                link->dumpit = dumpit;

        WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL &&
                (flags & RTNL_FLAG_BULK_DEL_SUPPORTED));
        link->flags |= flags;

        /* publish protocol:msgtype */
        rcu_assign_pointer(tab[msgindex], link);
        ret = 0;

        if (old)
                kfree_rcu(old, rcu);

unlock:
        rtnl_unlock();
        return ret;
}

/**
 * rtnl_unregister - Unregister a rtnetlink message type
 * @protocol: Protocol family or PF_UNSPEC
 * @msgtype: rtnetlink message type
 *
 * Clears the handler slot under RTNL and frees the previous entry through
 * kfree_rcu() once the lock has been dropped.
 *
 * Returns 0 on success, or -ENOENT when @protocol has no handler table.
 */
static int rtnl_unregister(int protocol, int msgtype)
{
        struct rtnl_link __rcu **tab;
        struct rtnl_link *link = NULL;
        int err = -ENOENT;
        int msgindex;

        BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
        msgindex = rtm_msgindex(msgtype);

        rtnl_lock();
        tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
        if (tab) {
                link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
                err = 0;
        }
        rtnl_unlock();

        if (!err)
                kfree_rcu(link, rcu);

        return err;
}

/**
 * rtnl_unregister_all - Unregister all rtnetlink message types of a protocol
 * @protocol: Protocol family or PF_UNSPEC
 *
 * Identical to calling rtnl_unregister() for all registered message types
 * of a certain protocol family.  The family's handler table is detached
 * under RTNL and freed after synchronize_net().
 */
void rtnl_unregister_all(int protocol)
{
        struct rtnl_link __rcu **tab;
        struct rtnl_link *link;
        int idx;

        BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);

        rtnl_lock();
        tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL);
        if (tab) {
                for (idx = 0; idx < RTM_NR_MSGTYPES; idx++) {
                        link = rcu_replace_pointer_rtnl(tab[idx], NULL);
                        kfree_rcu(link, rcu);
                }
        }
        rtnl_unlock();

        /* Nothing was registered for this family. */
        if (!tab)
                return;

        synchronize_net();

        kfree(tab);
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);

/**
 * __rtnl_register_many - Register rtnetlink message types
 * @handlers: Array of struct rtnl_msg_handlers
 * @n: The length of @handlers
 *
 * Registers the specified function pointers (at least one of them has
 * to be non-NULL) to be called whenever a request message for the
 * specified protocol family and message type is received.
 *
 * The special protocol family PF_UNSPEC may be used to define fallback
 * function pointers for the case when no entry for the specific protocol
 * family exists.
 *
 * When one element of @handlers fails to register,
 * 1) built-in: panics.
 * 2) modules : the previous successful registrations are unwound
 *              and an error is returned.
 *
 * Use rtnl_register_many().
 *
 * Returns 0 on success (including when @n is not positive and nothing
 * has to be registered) or the error of the registration that failed.
 */
int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n)
{
        const struct rtnl_msg_handler *handler;
        /* Must start at 0: the loop body never runs for n <= 0. */
        int i, err = 0;

        for (i = 0, handler = handlers; i < n; i++, handler++) {
                err = rtnl_register_internal(handler->owner, handler->protocol,
                                             handler->msgtype, handler->doit,
                                             handler->dumpit, handler->flags);
                if (err) {
                        /* No owner module means built-in code: fatal. */
                        if (!handler->owner)
                                panic("Unable to register rtnetlink message "
                                      "handlers, %pS\n", handlers);

                        /* Roll back the i entries registered so far. */
                        __rtnl_unregister_many(handlers, i);
                        break;
                }
        }

        return err;
}
EXPORT_SYMBOL_GPL(__rtnl_register_many);

/* Unregister the first @n entries of @handlers, last one first.  The
 * result of each rtnl_unregister() call is ignored.
 */
void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n)
{
        int idx = n;

        while (--idx >= 0)
                rtnl_unregister(handlers[idx].protocol, handlers[idx].msgtype);
}
EXPORT_SYMBOL_GPL(__rtnl_unregister_many);

static DEFINE_MUTEX(link_ops_mutex);
static LIST_HEAD(link_ops);

static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index)
{
        struct rtnl_link_ops *ops;

        rcu_read_lock();

        list_for_each_entry_rcu(ops, &link_ops, list) {
                if (!strcmp(ops->kind, kind)) {
                        *srcu_index = srcu_read_lock(&ops->srcu);
                        goto unlock;
                }
        }

        ops = NULL;
unlock:
        rcu_read_unlock();

        return ops;
}

/* Leave the SRCU read section on @ops that rtnl_link_ops_get() entered;
 * @srcu_index is the index that call stored.
 */
static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index)
{
        srcu_read_unlock(&ops->srcu, srcu_index);
}

/**
 * rtnl_link_register - Register rtnl_link_ops with rtnetlink.
 * @ops: struct rtnl_link_ops * to register
 *
 * Returns 0 on success or a negative error code: -EINVAL when
 * @ops->maxtype or @ops->slave_maxtype exceeds the supported bounds,
 * the error of init_srcu_struct() when that fails, or -EEXIST when
 * another ops with the same ->kind is already registered.
 */
int rtnl_link_register(struct rtnl_link_ops *ops)
{
        struct rtnl_link_ops *tmp;
        int err;

        /* Sanity-check max sizes to avoid stack buffer overflow. */
        if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
                    ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
                return -EINVAL;

        /* The check for alloc/setup is here because if ops
         * does not have that filled up, it is not possible
         * to use the ops for creating device. So do not
         * fill up dellink as well. That disables rtnl_dellink.
         */
        if ((ops->alloc || ops->setup) && !ops->dellink)
                ops->dellink = unregister_netdevice_queue;

        err = init_srcu_struct(&ops->srcu);
        if (err)
                return err;

        mutex_lock(&link_ops_mutex);

        list_for_each_entry(tmp, &link_ops, list) {
                if (!strcmp(ops->kind, tmp->kind)) {
                        err = -EEXIST;
                        goto unlock;
                }
        }

        list_add_tail_rcu(&ops->list, &link_ops);
unlock:
        mutex_unlock(&link_ops_mutex);

        /* On failure @ops was never put on the list, so no reader can be
         * using ops->srcu; release what init_srcu_struct() set up instead
         * of leaking it.
         */
        if (err)
                cleanup_srcu_struct(&ops->srcu);

        return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);

/* Hand every device of @net that is driven by @ops to ops->dellink(),
 * collecting them on a local list, then unregister the whole batch with
 * unregister_netdevice_many().
 */
static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list_kill);
        struct net_device *dev;

        for_each_netdev(net, dev) {
                if (dev->rtnl_link_ops != ops)
                        continue;

                ops->dellink(dev, &list_kill);
        }

        unregister_netdevice_many(&list_kill);
}

/* Return with the rtnl_lock held when there are no network
 * devices unregistering in any network namespace.
 *
 * While dev_unreg_count is non-zero, the lock is dropped again with
 * __rtnl_unlock() and the caller sleeps on netdev_unregistering_wq
 * before retrying.
 */
static void rtnl_lock_unregistering_all(void)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);

        /* Join the wait queue before the first check of the counter. */
        add_wait_queue(&netdev_unregistering_wq, &wait);
        for (;;) {
                rtnl_lock();
                /* We hold pernet_ops_rwsem write-locked, so parallel
                 * setup_net() and cleanup_net() are not possible.
                 */
                if (!atomic_read(&dev_unreg_count))
                        break;
                __rtnl_unlock();

                wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
        }
        remove_wait_queue(&netdev_unregistering_wq, &wait);
}

/**
 * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
 * @ops: struct rtnl_link_ops * to unregister
 *
 * Removes @ops from the link_ops list, waits for SRCU readers of
 * @ops->srcu, and then deletes every device using @ops in every
 * network namespace.
 */
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
        struct net *net;

        /* Unpublish first so that new lookups can no longer find @ops. */
        mutex_lock(&link_ops_mutex);
        list_del_rcu(&ops->list);
        mutex_unlock(&link_ops_mutex);

        /* Wait for readers that entered via rtnl_link_ops_get(). */
        synchronize_srcu(&ops->srcu);
        cleanup_srcu_struct(&ops->srcu);

        /* Close the race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);
        rtnl_lock_unregistering_all();

        for_each_net(net)
                __rtnl_kill_links(net, ops);

        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);

/* Size of the slave-data attribute contributed by the master of @dev.
 * Returns 0 when @dev has no master, or when the master's link ops do not
 * provide a get_slave_size() callback.
 */
static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev)
{
        const struct rtnl_link_ops *master_ops;
        struct net_device *master;
        size_t len = 0;

        rcu_read_lock();

        master = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
        if (master) {
                master_ops = master->rtnl_link_ops;
                if (master_ops && master_ops->get_slave_size) {
                        /* IFLA_INFO_SLAVE_DATA + nested data */
                        len = nla_total_size(sizeof(struct nlattr)) +
                              master_ops->get_slave_size(master, dev);
                }
        }

        rcu_read_unlock();

        return len;
}

static size_t rtnl_link_get_size(const struct net_device *dev)
{
        const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
        size_t size;

        if (!ops)
                return 0;

        size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */
               nla_total_size(strlen(ops->kind) + 1);  /* IFLA_INFO_KIND */

        if (ops->get_size)
                /* IFLA_INFO_DATA + nested data */
                size += nla_total_size(sizeof(struct nlattr)) +
                        ops->get_size(dev);

        if (ops->get_xstats_size)
                /* IFLA_INFO_XSTATS */
                size += nla_total_size(ops->get_xstats_size(dev));

        size += rtnl_link_get_slave_info_data_size(dev);

        return size;
}

/* List of registered rtnl_af_ops.  Entries are added and removed with the
 * RCU list primitives while holding the rtnl_lock, and walked by readers
 * inside rcu_read_lock() sections.
 */
static LIST_HEAD(rtnl_af_ops);

static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index)
{
        struct rtnl_af_ops *ops;

        ASSERT_RTNL();

        rcu_read_lock();

        list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
                if (ops->family == family) {
                        *srcu_index = srcu_read_lock(&ops->srcu);
                        goto unlock;
                }
        }

        ops = NULL;
unlock:
        rcu_read_unlock();

        return ops;
}

/* Leave the SRCU read-side section on @ops->srcu identified by
 * @srcu_index; the counterpart of the srcu_read_lock() taken in
 * rtnl_af_lookup().
 */
static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index)
{
        srcu_read_unlock(&ops->srcu, srcu_index);
}

/**
 * rtnl_af_register - Register rtnl_af_ops with rtnetlink.
 * @ops: struct rtnl_af_ops * to register
 *
 * Initialises the SRCU state embedded in @ops and, if that succeeds,
 * appends @ops to the list of address-family ops under the rtnl_lock.
 *
 * Return: 0 on success or a negative error code.
 */
int rtnl_af_register(struct rtnl_af_ops *ops)
{
        int err;

        err = init_srcu_struct(&ops->srcu);
        if (!err) {
                rtnl_lock();
                list_add_tail_rcu(&ops->list, &rtnl_af_ops);
                rtnl_unlock();
        }

        return err;
}
EXPORT_SYMBOL_GPL(rtnl_af_register);

/**
 * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink.
 * @ops: struct rtnl_af_ops * to unregister
 *
 * Unlinks @ops from the af ops list under the rtnl_lock, then waits for
 * both RCU and SRCU readers before destroying the SRCU state of @ops.
 */
void rtnl_af_unregister(struct rtnl_af_ops *ops)
{
        rtnl_lock();
        list_del_rcu(&ops->list);
        rtnl_unlock();

        /* Wait for list walkers running under rcu_read_lock(). */
        synchronize_rcu();
        /* Wait for holders of the SRCU section taken in rtnl_af_lookup(). */
        synchronize_srcu(&ops->srcu);
        cleanup_srcu_struct(&ops->srcu);
}
EXPORT_SYMBOL_GPL(rtnl_af_unregister);

/* Size of the IFLA_AF_SPEC attribute for @dev: the nest header plus, for
 * every registered af ops that implements get_link_af_size(), one nested
 * header and the size that callback reports.
 */
static size_t rtnl_link_get_af_size(const struct net_device *dev,
                                    u32 ext_filter_mask)
{
        /* IFLA_AF_SPEC */
        size_t total = nla_total_size(sizeof(struct nlattr));
        struct rtnl_af_ops *af;

        rcu_read_lock();
        list_for_each_entry_rcu(af, &rtnl_af_ops, list) {
                if (!af->get_link_af_size)
                        continue;

                /* AF_* + nested data */
                total += nla_total_size(sizeof(struct nlattr)) +
                         af->get_link_af_size(dev, ext_filter_mask);
        }
        rcu_read_unlock();

        return total;
}

/* True when @dev has a master device and that master has link ops. */
static bool rtnl_have_link_slave_info(const struct net_device *dev)
{
        struct net_device *master;
        bool has_info;

        rcu_read_lock();
        master = netdev_master_upper_dev_get_rcu((struct net_device *)dev);
        has_info = master && master->rtnl_link_ops;
        rcu_read_unlock();

        return has_info;
}

/* Emit IFLA_INFO_SLAVE_KIND and, when the master's ops implement
 * fill_slave_info(), a nested IFLA_INFO_SLAVE_DATA attribute for @dev.
 *
 * Returns 0 on success or when there is nothing to emit (no master, or a
 * master without link ops), -EMSGSIZE when @skb has no room, or the
 * negative value returned by fill_slave_info().
 */
static int rtnl_link_slave_info_fill(struct sk_buff *skb,
                                     const struct net_device *dev)
{
        const struct rtnl_link_ops *master_ops;
        struct net_device *master;
        struct nlattr *nest;
        int err;

        master = netdev_master_upper_dev_get((struct net_device *) dev);
        if (!master)
                return 0;

        master_ops = master->rtnl_link_ops;
        if (!master_ops)
                return 0;

        if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, master_ops->kind) < 0)
                return -EMSGSIZE;

        if (!master_ops->fill_slave_info)
                return 0;

        nest = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA);
        if (!nest)
                return -EMSGSIZE;

        err = master_ops->fill_slave_info(skb, master, dev);
        if (err < 0) {
                /* Drop the partially written nest again. */
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Emit IFLA_INFO_KIND for @dev, followed by whatever the link ops'
 * fill_xstats() writes and a nested IFLA_INFO_DATA attribute filled by
 * fill_info(), each only when the respective callback exists.
 *
 * Returns 0 on success or when @dev has no link ops, -EMSGSIZE when @skb
 * has no room, or the negative value returned by a callback.
 */
static int rtnl_link_info_fill(struct sk_buff *skb,
                               const struct net_device *dev)
{
        const struct rtnl_link_ops *link_ops = dev->rtnl_link_ops;
        struct nlattr *nest;
        int err;

        if (link_ops == NULL)
                return 0;

        if (nla_put_string(skb, IFLA_INFO_KIND, link_ops->kind) < 0)
                return -EMSGSIZE;

        if (link_ops->fill_xstats) {
                err = link_ops->fill_xstats(skb, dev);
                if (err < 0)
                        return err;
        }

        if (!link_ops->fill_info)
                return 0;

        nest = nla_nest_start_noflag(skb, IFLA_INFO_DATA);
        if (!nest)
                return -EMSGSIZE;

        err = link_ops->fill_info(skb, dev);
        if (err < 0) {
                /* Drop the partially written nest again. */
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Emit the IFLA_LINKINFO nest for @dev: link info first, then slave info.
 * On any failure the whole nest is cancelled.
 *
 * Returns 0 on success, -EMSGSIZE when the nest cannot be started, or the
 * negative error from one of the two fill helpers.
 */
static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
{
        struct nlattr *nest;
        int err;

        nest = nla_nest_start_noflag(skb, IFLA_LINKINFO);
        if (!nest)
                return -EMSGSIZE;

        err = rtnl_link_info_fill(skb, dev);
        if (err >= 0)
                err = rtnl_link_slave_info_fill(skb, dev);

        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Hand @skb to nlmsg_notify() on the rtnetlink socket of @net, using
 * GFP_KERNEL for any allocation; returns nlmsg_notify()'s result.
 */
int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
        return nlmsg_notify(net->rtnl, skb, pid, group, echo, GFP_KERNEL);
}

/* Unicast @skb to @pid through the rtnetlink socket of @net; returns
 * nlmsg_unicast()'s result.
 */
int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
{
        return nlmsg_unicast(net->rtnl, skb, pid);
}
EXPORT_SYMBOL(rtnl_unicast);

/* Pass @skb to nlmsg_notify() on the rtnetlink socket of @net for @group,
 * with the report flag derived from @nlh via nlmsg_report().  The result
 * of nlmsg_notify() is not propagated.
 */
void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
                 const struct nlmsghdr *nlh, gfp_t flags)
{
        nlmsg_notify(net->rtnl, skb, pid, group, nlmsg_report(nlh), flags);
}
EXPORT_SYMBOL(rtnl_notify);

/* Forward @error for @group to netlink_set_err() on the rtnetlink socket
 * of @net, with a portid argument of 0.
 */
void rtnl_set_sk_err(struct net *net, u32 group, int error)
{
        netlink_set_err(net->rtnl, 0, group, error);
}
EXPORT_SYMBOL(rtnl_set_sk_err);

/* Dump the non-zero entries of @metrics as a nested RTA_METRICS attribute.
 *
 * Entry i is emitted with attribute type i + 1.  RTAX_CC_ALGO is emitted
 * as the name string looked up from its key, and RTAX_FEATURES is masked
 * with RTAX_FEATURE_MASK first; both are skipped when that leaves nothing
 * to report.  If no entry is emitted the nest is cancelled.
 *
 * Returns 0 when nothing was dumped, the value of nla_nest_end() on
 * success, -ENOBUFS when the nest cannot be started, or -EMSGSIZE when an
 * entry does not fit.
 */
int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
{
        struct nlattr *nest;
        int idx, emitted = 0;

        /* nothing is dumped for dst_default_metrics, so just skip the loop */
        if (metrics == dst_default_metrics.metrics)
                return 0;

        nest = nla_nest_start_noflag(skb, RTA_METRICS);
        if (!nest)
                return -ENOBUFS;

        for (idx = 0; idx < RTAX_MAX; idx++) {
                const int attrtype = idx + 1;

                if (!metrics[idx])
                        continue;

                if (attrtype == RTAX_CC_ALGO) {
                        char buf[TCP_CA_NAME_MAX], *ca_name;

                        ca_name = tcp_ca_get_name_by_key(metrics[idx], buf);
                        if (!ca_name)
                                continue;
                        if (nla_put_string(skb, attrtype, ca_name))
                                goto cancel;
                } else if (attrtype == RTAX_FEATURES) {
                        u32 user_features = metrics[idx] & RTAX_FEATURE_MASK;

                        BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK);
                        if (!user_features)
                                continue;
                        if (nla_put_u32(skb, attrtype, user_features))
                                goto cancel;
                } else if (nla_put_u32(skb, attrtype, metrics[idx])) {
                        goto cancel;
                }

                emitted++;
        }

        if (emitted)
                return nla_nest_end(skb, nest);

        /* Nothing was written: remove the empty nest. */
        nla_nest_cancel(skb, nest);
        return 0;

cancel:
        nla_nest_cancel(skb, nest);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(rtnetlink_put_metrics);

/* Build a struct rta_cacheinfo from @dst, @id, @expires and @error and
 * append it to @skb as RTA_CACHEINFO.
 *
 * The usage fields are filled only when @dst is non-NULL.  A non-zero
 * @expires (in jiffies) is converted to clock ticks, capped at INT_MAX in
 * magnitude, and keeps its sign.  Returns the result of nla_put().
 */
int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
                       long expires, u32 error)
{
        struct rta_cacheinfo ci = {
                .rta_error = error,
                .rta_id =  id,
        };

        if (dst != NULL) {
                ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
                ci.rta_used = dst->__use;
                ci.rta_clntref = rcuref_read(&dst->__rcuref);
        }

        if (expires != 0) {
                unsigned long ticks = jiffies_to_clock_t(abs(expires));

                ticks = min_t(unsigned long, ticks, INT_MAX);
                if (expires > 0)
                        ci.rta_expires = ticks;
                else
                        ci.rta_expires = -ticks;
        }

        return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);

/* Set dev->operstate to @newstate with a compare-and-exchange loop and
 * call netif_state_change() once the new value has been stored.  Returns
 * without doing anything if the state is already @newstate.
 */
void netif_set_operstate(struct net_device *dev, int newstate)
{
        unsigned int old = READ_ONCE(dev->operstate);

        do {
                /* Nothing to change, so no notification either. */
                if (old == newstate)
                        return;
        } while (!try_cmpxchg(&dev->operstate, &old, newstate));

        netif_state_change(dev);
}
EXPORT_SYMBOL(netif_set_operstate);

/* Apply the requested operstate @transition to @dev if it is permitted:
 *  - IF_OPER_UP only from DORMANT, TESTING or UNKNOWN, and only when the
 *    device is neither dormant nor testing;
 *  - IF_OPER_TESTING and IF_OPER_DORMANT only when netif_oper_up() holds.
 * Any other request leaves the state as read.  The resulting value is
 * always handed to netif_set_operstate().
 */
static void set_operstate(struct net_device *dev, unsigned char transition)
{
        unsigned char operstate = READ_ONCE(dev->operstate);

        switch (transition) {
        case IF_OPER_UP:
                if (operstate != IF_OPER_DORMANT &&
                    operstate != IF_OPER_TESTING &&
                    operstate != IF_OPER_UNKNOWN)
                        break;
                if (netif_dormant(dev) || netif_testing(dev))
                        break;
                operstate = IF_OPER_UP;
                break;

        case IF_OPER_TESTING:
        case IF_OPER_DORMANT:
                if (netif_oper_up(dev))
                        operstate = transition;
                break;
        }

        netif_set_operstate(dev, operstate);
}

/* Device flags with the IFF_PROMISC and IFF_ALLMULTI bits taken from
 * dev->gflags instead of dev->flags.
 */
static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
{
        const unsigned int from_gflags = IFF_PROMISC | IFF_ALLMULTI;

        return (dev->flags & ~from_gflags) | (dev->gflags & from_gflags);
}

/* Merge the flags requested in @ifm with the current flags of @dev: bits
 * selected by ifi_change come from ifi_flags, all others from the device.
 */
static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
                                           const struct ifinfomsg *ifm)
{
        const unsigned int change = ifm->ifi_change;

        /* bugwards compatibility: ifi_change == 0 is treated as ~0 */
        if (!change)
                return ifm->ifi_flags;

        return (ifm->ifi_flags & change) |
               (rtnl_dev_get_flags(dev) & ~change);
}

/* Copy every counter from the rtnl_link_stats64 structure @b into the
 * rtnl_link_stats structure @a by plain member assignment; any value
 * conversion is whatever that assignment implies for the member types.
 */
#define RTNL_COPY_STAT(field)        (a->field = b->field)
static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
                                 const struct rtnl_link_stats64 *b)
{
        /* basic packet/byte/error counters */
        RTNL_COPY_STAT(rx_packets);
        RTNL_COPY_STAT(tx_packets);
        RTNL_COPY_STAT(rx_bytes);
        RTNL_COPY_STAT(tx_bytes);
        RTNL_COPY_STAT(rx_errors);
        RTNL_COPY_STAT(tx_errors);
        RTNL_COPY_STAT(rx_dropped);
        RTNL_COPY_STAT(tx_dropped);

        RTNL_COPY_STAT(multicast);
        RTNL_COPY_STAT(collisions);

        /* detailed rx errors */
        RTNL_COPY_STAT(rx_length_errors);
        RTNL_COPY_STAT(rx_over_errors);
        RTNL_COPY_STAT(rx_crc_errors);
        RTNL_COPY_STAT(rx_frame_errors);
        RTNL_COPY_STAT(rx_fifo_errors);
        RTNL_COPY_STAT(rx_missed_errors);

        /* detailed tx errors */
        RTNL_COPY_STAT(tx_aborted_errors);
        RTNL_COPY_STAT(tx_carrier_errors);
        RTNL_COPY_STAT(tx_fifo_errors);
        RTNL_COPY_STAT(tx_heartbeat_errors);
        RTNL_COPY_STAT(tx_window_errors);

        RTNL_COPY_STAT(rx_compressed);
        RTNL_COPY_STAT(tx_compressed);

        RTNL_COPY_STAT(rx_nohandler);
}
#undef RTNL_COPY_STAT

/* All VF info: space needed for the VF info list of @dev.
 *
 * Returns 0 unless @dev has a parent device and RTEXT_FILTER_VF is set in
 * @ext_filter_mask.  Otherwise the result scales with the number of VFs
 * and additionally covers the per-VF stats nest (unless
 * RTEXT_FILTER_SKIP_STATS is set) and two GUID attributes per VF when the
 * driver implements ndo_get_vf_guid.
 */
static inline int rtnl_vfinfo_size(const struct net_device *dev,
                                   u32 ext_filter_mask)
{
        size_t total;
        int num_vfs;

        if (!dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        num_vfs = dev_num_vf(dev->dev.parent);

        total = nla_total_size(0);
        total += num_vfs *
                 (nla_total_size(0) +
                  nla_total_size(sizeof(struct ifla_vf_mac)) +
                  nla_total_size(sizeof(struct ifla_vf_broadcast)) +
                  nla_total_size(sizeof(struct ifla_vf_vlan)) +
                  nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
                  nla_total_size(MAX_VLAN_LIST_LEN *
                                 sizeof(struct ifla_vf_vlan_info)) +
                  nla_total_size(sizeof(struct ifla_vf_spoofchk)) +
                  nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
                  nla_total_size(sizeof(struct ifla_vf_rate)) +
                  nla_total_size(sizeof(struct ifla_vf_link_state)) +
                  nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
                  nla_total_size(sizeof(struct ifla_vf_trust)));

        if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
                total += num_vfs *
                         (nla_total_size(0) + /* nest IFLA_VF_STATS */
                          /* IFLA_VF_STATS_RX_PACKETS */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_TX_PACKETS */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_RX_BYTES */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_TX_BYTES */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_BROADCAST */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_MULTICAST */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_RX_DROPPED */
                          nla_total_size_64bit(sizeof(__u64)) +
                          /* IFLA_VF_STATS_TX_DROPPED */
                          nla_total_size_64bit(sizeof(__u64)));
        }

        /* node GUID + port GUID */
        if (dev->netdev_ops->ndo_get_vf_guid)
                total += num_vfs * 2 *
                         nla_total_size(sizeof(struct ifla_vf_guid));

        return total;
}

/* Space needed for IFLA_PORT_SELF and, when the parent reports VFs, for
 * IFLA_VF_PORTS with one port entry per VF.  Returns 0 when the driver has
 * no ndo_get_vf_port, @dev has no parent device, or RTEXT_FILTER_VF is not
 * set in @ext_filter_mask.
 */
static size_t rtnl_port_size(const struct net_device *dev,
                             u32 ext_filter_mask)
{
        size_t one_port, self_size, vf_list_hdr, per_vf;

        if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
            !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        one_port = nla_total_size(4)                /* PORT_VF */
                + nla_total_size(PORT_PROFILE_MAX)        /* PORT_PROFILE */
                + nla_total_size(PORT_UUID_MAX)                /* PORT_INSTANCE_UUID */
                + nla_total_size(PORT_UUID_MAX)                /* PORT_HOST_UUID */
                + nla_total_size(1)                        /* PROT_VDP_REQUEST */
                + nla_total_size(2);                        /* PORT_VDP_RESPONSE */

        vf_list_hdr = nla_total_size(sizeof(struct nlattr));
        per_vf = nla_total_size(sizeof(struct nlattr)) + one_port;
        self_size = nla_total_size(sizeof(struct nlattr)) + one_port;

        if (!dev_num_vf(dev->dev.parent))
                return self_size;

        return self_size + vf_list_hdr +
                per_vf * dev_num_vf(dev->dev.parent);
}

/* Fixed space reserved for the IFLA_XDP nest and its attributes. */
static size_t rtnl_xdp_size(void)
{
        return nla_total_size(0) +        /* nest IFLA_XDP */
               nla_total_size(1) +        /* XDP_ATTACHED */
               nla_total_size(4) +        /* XDP_PROG_ID (or 1st mode) */
               nla_total_size(4);        /* XDP_<mode>_PROG_ID */
}

static size_t rtnl_prop_list_size(const struct net_device *dev)
{
        struct netdev_name_node *name_node;
        unsigned int cnt = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(name_node, &dev->name_node->list, list)
                cnt++;
        rcu_read_unlock();

        if (!cnt)
                return 0;

        return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ);
}

/* Space reserved for the proto-down attributes: a one-byte attribute plus
 * a nest holding a four-byte attribute.  @dev is not consulted.
 */
static size_t rtnl_proto_down_size(const struct net_device *dev)
{
        /* Assume dev->proto_down_reason is not zero. */
        return nla_total_size(1) + nla_total_size(0) + nla_total_size(4);
}

/* Space for the IFLA_DEVLINK_PORT nest; the port handle size is added only
 * when @dev has a devlink port.
 */
static size_t rtnl_devlink_port_size(const struct net_device *dev)
{
        /* nest IFLA_DEVLINK_PORT */
        if (!dev->devlink_port)
                return nla_total_size(0);

        return nla_total_size(0) +
               devlink_nl_port_handle_size(dev->devlink_port);
}

/* Space for the IFLA_DPLL_PIN nest plus the pin handle size reported by
 * dpll_netdev_pin_handle_size() for @dev.
 */
static size_t rtnl_dpll_pin_size(const struct net_device *dev)
{
        /* nest IFLA_DPLL_PIN */
        return nla_total_size(0) + dpll_netdev_pin_handle_size(dev);
}

/* Total space to reserve for a link message describing @dev: the aligned
 * ifinfomsg header plus the size of every attribute listed below.
 *
 * Fixed-size attributes are counted unconditionally; the VF, port, link
 * info, af spec, property list, devlink port and dpll pin parts are
 * computed per device by their helpers, some of which depend on
 * @ext_filter_mask.
 */
static noinline size_t if_nlmsg_size(const struct net_device *dev,
                                     u32 ext_filter_mask)
{
        return NLMSG_ALIGN(sizeof(struct ifinfomsg))
               + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
               + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
               + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
               + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
               + nla_total_size(sizeof(struct rtnl_link_stats))
               + nla_total_size_64bit(sizeof(struct rtnl_link_stats64))
               + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
               + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
               + nla_total_size(4) /* IFLA_TXQLEN */
               + nla_total_size(4) /* IFLA_WEIGHT */
               + nla_total_size(4) /* IFLA_MTU */
               + nla_total_size(4) /* IFLA_LINK */
               + nla_total_size(4) /* IFLA_MASTER */
               + nla_total_size(1) /* IFLA_CARRIER */
               + nla_total_size(4) /* IFLA_PROMISCUITY */
               + nla_total_size(4) /* IFLA_ALLMULTI */
               + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
               + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
               + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
               + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
               + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
               + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */
               + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */
               + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */
               + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
               + nla_total_size(1) /* IFLA_OPERSTATE */
               + nla_total_size(1) /* IFLA_LINKMODE */
               + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */
               + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
               + nla_total_size(4) /* IFLA_LINK_NETNSID */
               + nla_total_size(4) /* IFLA_GROUP */
               + nla_total_size(ext_filter_mask
                                & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
               + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
               + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */
               + rtnl_link_get_size(dev) /* IFLA_LINKINFO */
               + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
               + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
               + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
               + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
               + rtnl_xdp_size() /* IFLA_XDP */
               + nla_total_size(4)  /* IFLA_EVENT */
               + nla_total_size(4)  /* IFLA_NEW_NETNSID */
               + nla_total_size(4)  /* IFLA_NEW_IFINDEX */
               + rtnl_proto_down_size(dev)  /* proto down */
               + nla_total_size(4)  /* IFLA_TARGET_NETNSID */
               + nla_total_size(4)  /* IFLA_CARRIER_UP_COUNT */
               + nla_total_size(4)  /* IFLA_CARRIER_DOWN_COUNT */
               + nla_total_size(4)  /* IFLA_MIN_MTU */
               + nla_total_size(4)  /* IFLA_MAX_MTU */
               + rtnl_prop_list_size(dev)
               + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
               + rtnl_devlink_port_size(dev)
               + rtnl_dpll_pin_size(dev)
               + nla_total_size(8)  /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
               + 0;
}

/* Emit the IFLA_VF_PORTS nest with one IFLA_VF_PORT entry per VF of the
 * parent device.  A VF whose ndo_get_vf_port() fails with anything other
 * than -EMSGSIZE is skipped; -EMSGSIZE (or lack of room for the entry
 * itself) cancels the whole nest.
 *
 * Returns 0 on success or -EMSGSIZE.
 */
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct nlattr *list, *port;
        int vf, err;

        list = nla_nest_start_noflag(skb, IFLA_VF_PORTS);
        if (!list)
                return -EMSGSIZE;

        for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
                port = nla_nest_start_noflag(skb, IFLA_VF_PORT);
                if (!port || nla_put_u32(skb, IFLA_PORT_VF, vf))
                        goto cancel_list;

                err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb);
                if (err == -EMSGSIZE)
                        goto cancel_list;

                if (!err)
                        nla_nest_end(skb, port);
                else
                        nla_nest_cancel(skb, port);
        }

        nla_nest_end(skb, list);
        return 0;

cancel_list:
        nla_nest_cancel(skb, list);
        return -EMSGSIZE;
}

/* Emit the IFLA_PORT_SELF nest by querying ndo_get_vf_port() for
 * PORT_SELF_VF.  If the driver call fails the nest is cancelled; only
 * -EMSGSIZE is reported to the caller, other errors are swallowed.
 *
 * Returns 0 or -EMSGSIZE.
 */
static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct nlattr *nest;
        int err;

        nest = nla_nest_start_noflag(skb, IFLA_PORT_SELF);
        if (!nest)
                return -EMSGSIZE;

        err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb);
        if (!err) {
                nla_nest_end(skb, nest);
                return 0;
        }

        nla_nest_cancel(skb, nest);
        if (err == -EMSGSIZE)
                return err;

        return 0;
}

/* Emit IFLA_PORT_SELF and, if the parent device reports VFs,
 * IFLA_VF_PORTS.  Does nothing (returns 0) when the driver lacks
 * ndo_get_vf_port, @dev has no parent device, or RTEXT_FILTER_VF is not
 * set in @ext_filter_mask.  Otherwise returns the first non-zero result of
 * the two fill helpers, or 0.
 */
static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev,
                          u32 ext_filter_mask)
{
        int err;

        if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent ||
            !(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        err = rtnl_port_self_fill(skb, dev);
        if (err)
                return err;

        if (!dev_num_vf(dev->dev.parent))
                return 0;

        return rtnl_vf_ports_fill(skb, dev);
}

/* Emit IFLA_PHYS_PORT_ID for @dev.  -EOPNOTSUPP from
 * dev_get_phys_port_id() is treated as "nothing to emit"; any other error
 * is returned as is.  Returns -EMSGSIZE when the attribute does not fit.
 */
static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct netdev_phys_item_id ppid;
        int err;

        err = dev_get_phys_port_id(dev, &ppid);
        if (err == -EOPNOTSUPP)
                return 0;
        if (err)
                return err;

        return nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id) ?
                -EMSGSIZE : 0;
}

/* Emit IFLA_PHYS_PORT_NAME for @dev.  -EOPNOTSUPP from
 * dev_get_phys_port_name() is treated as "nothing to emit"; any other
 * error is returned as is.  Returns -EMSGSIZE when the attribute does not
 * fit.
 */
static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
{
        char port_name[IFNAMSIZ];
        int err;

        err = dev_get_phys_port_name(dev, port_name, sizeof(port_name));
        if (err == -EOPNOTSUPP)
                return 0;
        if (err)
                return err;

        return nla_put_string(skb, IFLA_PHYS_PORT_NAME, port_name) ?
                -EMSGSIZE : 0;
}

static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
        struct netdev_phys_item_id ppid = { };
        int err;

        err = dev_get_port_parent_id(dev, &ppid, false);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
                return err;
        }

        if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id))
                return -EMSGSIZE;

        return 0;
}

/* Emit IFLA_STATS64 and IFLA_STATS for @dev.  The 64-bit stats are written
 * by dev_get_stats() straight into the reserved attribute payload, and the
 * IFLA_STATS payload is then derived from that copy.
 *
 * Returns 0 on success or -EMSGSIZE when either attribute does not fit.
 */
static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
                                              struct net_device *dev)
{
        struct rtnl_link_stats64 *stats64;
        struct nlattr *nla;

        nla = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(*stats64), IFLA_PAD);
        if (!nla)
                return -EMSGSIZE;

        stats64 = nla_data(nla);
        dev_get_stats(dev, stats64);

        nla = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats));
        if (!nla)
                return -EMSGSIZE;

        copy_rtnl_link_stats(nla_data(nla), stats64);

        return 0;
}

/* Emit one IFLA_VF_INFO nest describing VF number @vfs_num of @dev.
 *
 * The VF configuration is obtained from the driver's ndo_get_vf_config();
 * if that call fails the VF is silently skipped and 0 is returned.  GUID
 * attributes are added only when ndo_get_vf_guid() exists and succeeds,
 * and the IFLA_VF_STATS nest is omitted when RTEXT_FILTER_SKIP_STATS is
 * set in @ext_filter_mask.
 *
 * Every structure copied into the message is either fully assigned or
 * zeroed first, so that no uninitialised stack bytes reach user space.
 *
 * Returns 0 on success (or skipped VF), -EMSGSIZE when @skb has no room;
 * in the latter case the partially built nest is cancelled.
 */
static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
                                               struct net_device *dev,
                                               int vfs_num,
                                               u32 ext_filter_mask)
{
        struct ifla_vf_rss_query_en vf_rss_query_en;
        struct nlattr *vf, *vfstats, *vfvlanlist;
        struct ifla_vf_link_state vf_linkstate;
        struct ifla_vf_vlan_info vf_vlan_info;
        struct ifla_vf_spoofchk vf_spoofchk;
        struct ifla_vf_tx_rate vf_tx_rate;
        struct ifla_vf_stats vf_stats;
        struct ifla_vf_trust vf_trust;
        struct ifla_vf_vlan vf_vlan;
        struct ifla_vf_rate vf_rate;
        struct ifla_vf_mac vf_mac;
        struct ifla_vf_broadcast vf_broadcast;
        struct ifla_vf_info ivi;
        struct ifla_vf_guid node_guid;
        struct ifla_vf_guid port_guid;

        memset(&ivi, 0, sizeof(ivi));

        /* Not all SR-IOV capable drivers support the
         * spoofcheck and "RSS query enable" query.  Preset to
         * -1 so the user space tool can detect that the driver
         * didn't report anything.
         */
        ivi.spoofchk = -1;
        ivi.rss_query_en = -1;
        ivi.trusted = -1;
        /* The default value for VF link state is "auto"
         * IFLA_VF_LINK_STATE_AUTO which equals zero
         */
        ivi.linkstate = 0;
        /* VLAN Protocol by default is 802.1Q */
        ivi.vlan_proto = htons(ETH_P_8021Q);
        if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
                return 0;

        memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
        memset(&node_guid, 0, sizeof(node_guid));
        memset(&port_guid, 0, sizeof(port_guid));
        /* Only dev->addr_len bytes of the broadcast address are copied in
         * below, but the whole structure is sent to user space: clear it
         * first so the tail does not carry stale stack contents.
         */
        memset(&vf_broadcast, 0, sizeof(vf_broadcast));

        vf_mac.vf =
                vf_vlan.vf =
                vf_vlan_info.vf =
                vf_rate.vf =
                vf_tx_rate.vf =
                vf_spoofchk.vf =
                vf_linkstate.vf =
                vf_rss_query_en.vf =
                vf_trust.vf =
                node_guid.vf =
                port_guid.vf = ivi.vf;

        memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
        memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
        vf_vlan.vlan = ivi.vlan;
        vf_vlan.qos = ivi.qos;
        vf_vlan_info.vlan = ivi.vlan;
        vf_vlan_info.qos = ivi.qos;
        vf_vlan_info.vlan_proto = ivi.vlan_proto;
        vf_tx_rate.rate = ivi.max_tx_rate;
        vf_rate.min_tx_rate = ivi.min_tx_rate;
        vf_rate.max_tx_rate = ivi.max_tx_rate;
        vf_spoofchk.setting = ivi.spoofchk;
        vf_linkstate.link_state = ivi.linkstate;
        vf_rss_query_en.setting = ivi.rss_query_en;
        vf_trust.setting = ivi.trusted;
        vf = nla_nest_start_noflag(skb, IFLA_VF_INFO);
        if (!vf)
                return -EMSGSIZE;
        if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
            nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
            nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
            nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
                    &vf_rate) ||
            nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
                    &vf_tx_rate) ||
            nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
                    &vf_spoofchk) ||
            nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
                    &vf_linkstate) ||
            nla_put(skb, IFLA_VF_RSS_QUERY_EN,
                    sizeof(vf_rss_query_en),
                    &vf_rss_query_en) ||
            nla_put(skb, IFLA_VF_TRUST,
                    sizeof(vf_trust), &vf_trust))
                goto nla_put_vf_failure;

        /* GUIDs are optional: emitted only if the driver reports them. */
        if (dev->netdev_ops->ndo_get_vf_guid &&
            !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid,
                                              &port_guid)) {
                if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid),
                            &node_guid) ||
                    nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid),
                            &port_guid))
                        goto nla_put_vf_failure;
        }
        vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST);
        if (!vfvlanlist)
                goto nla_put_vf_failure;
        if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
                    &vf_vlan_info)) {
                nla_nest_cancel(skb, vfvlanlist);
                goto nla_put_vf_failure;
        }
        nla_nest_end(skb, vfvlanlist);
        if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
                /* Stats default to zero when the driver has no callback. */
                memset(&vf_stats, 0, sizeof(vf_stats));
                if (dev->netdev_ops->ndo_get_vf_stats)
                        dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
                                                          &vf_stats);
                vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
                if (!vfstats)
                        goto nla_put_vf_failure;
                if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
                                      vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
                                      vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
                                      vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
                                      vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
                                      vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
                                      vf_stats.multicast, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
                                      vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
                    nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
                                      vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
                        nla_nest_cancel(skb, vfstats);
                        goto nla_put_vf_failure;
                }
                nla_nest_end(skb, vfstats);
        }
        nla_nest_end(skb, vf);
        return 0;

nla_put_vf_failure:
        nla_nest_cancel(skb, vf);
        return -EMSGSIZE;
}

/* Emit the VF section of a link message.
 *
 * Nothing is written unless the device has a parent and the request set
 * RTEXT_FILTER_VF.  Otherwise IFLA_NUM_VF is always emitted, and, when the
 * driver provides ndo_get_vf_config, an IFLA_VFINFO_LIST nest is filled by
 * calling rtnl_fill_vfinfo() once per VF index.
 *
 * Returns 0 on success, -EMSGSIZE if an attribute could not be added.
 */
static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
                                           struct net_device *dev,
                                           u32 ext_filter_mask)
{
        struct nlattr *vf_list;
        int vf_count, vf;

        if (!dev->dev.parent)
                return 0;
        if (!(ext_filter_mask & RTEXT_FILTER_VF))
                return 0;

        vf_count = dev_num_vf(dev->dev.parent);
        if (nla_put_u32(skb, IFLA_NUM_VF, vf_count))
                return -EMSGSIZE;

        /* Without a config callback only the count can be reported. */
        if (!dev->netdev_ops->ndo_get_vf_config)
                return 0;

        vf_list = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST);
        if (!vf_list)
                return -EMSGSIZE;

        vf = 0;
        while (vf < vf_count) {
                if (rtnl_fill_vfinfo(skb, dev, vf, ext_filter_mask))
                        goto cancel_list;
                vf++;
        }

        nla_nest_end(skb, vf_list);
        return 0;

cancel_list:
        nla_nest_cancel(skb, vf_list);
        return -EMSGSIZE;
}

static int rtnl_fill_link_ifmap(struct sk_buff *skb,
                                const struct net_device *dev)
{
        struct rtnl_link_ifmap map;

        memset(&map, 0, sizeof(map));
        map.mem_start = READ_ONCE(dev->mem_start);
        map.mem_end   = READ_ONCE(dev->mem_end);
        map.base_addr = READ_ONCE(dev->base_addr);
        map.irq       = READ_ONCE(dev->irq);
        map.dma       = READ_ONCE(dev->dma);
        map.port      = READ_ONCE(dev->if_port);

        if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
                return -EMSGSIZE;

        return 0;
}

/* Return the id of the program in dev->xdp_prog, or 0 when the pointer is
 * NULL.  The pointer is sampled under rcu_read_lock().
 */
static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
        const struct bpf_prog *prog;
        u32 id;

        rcu_read_lock();
        prog = rcu_dereference(dev->xdp_prog);
        id = prog ? prog->aux->id : 0;
        rcu_read_unlock();

        return id;
}

/* Program id reported by dev_xdp_prog_id() for XDP_MODE_DRV. */
static u32 rtnl_xdp_prog_drv(struct net_device *dev)
{
        u32 id = dev_xdp_prog_id(dev, XDP_MODE_DRV);

        return id;
}

/* Program id reported by dev_xdp_prog_id() for XDP_MODE_HW. */
static u32 rtnl_xdp_prog_hw(struct net_device *dev)
{
        u32 id = dev_xdp_prog_id(dev, XDP_MODE_HW);

        return id;
}

/* Report one XDP attachment point.
 *
 * @get_prog_id is queried for @dev; a zero id means nothing is attached
 * there and the function returns 0 without touching anything.  Otherwise
 * the id is stored in *@prog_id and emitted as attribute @attr, and *@mode
 * is updated: it becomes @tgt_mode if it was still XDP_ATTACHED_NONE, or
 * XDP_ATTACHED_MULTI if an earlier call had already set a mode.
 *
 * Returns 0 on success or the nla_put_u32() error.  Note that *@prog_id is
 * written before the put is attempted.
 */
static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
                               u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
                               u32 (*get_prog_id)(struct net_device *dev))
{
        u32 id = get_prog_id(dev);
        int rc;

        if (id == 0)
                return 0;

        *prog_id = id;
        rc = nla_put_u32(skb, attr, id);
        if (rc)
                return rc;

        *mode = (*mode == XDP_ATTACHED_NONE) ? tgt_mode : XDP_ATTACHED_MULTI;
        return 0;
}

/* Emit the IFLA_XDP nest for @dev.
 *
 * The SKB, DRV and HW attachment points are probed in that order through
 * rtnl_xdp_report_one(), followed by IFLA_XDP_ATTACHED carrying the
 * resulting mode.  IFLA_XDP_PROG_ID is added only when a program id was
 * found and the mode is not XDP_ATTACHED_MULTI.
 *
 * Returns 0 on success.  On failure the nest is cancelled and the error of
 * the failing step is returned (-EMSGSIZE if the nest could not be opened).
 */
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
{
        u8 mode = XDP_ATTACHED_NONE;
        struct nlattr *nest;
        u32 prog_id = 0;
        int err;

        nest = nla_nest_start_noflag(skb, IFLA_XDP);
        if (!nest)
                return -EMSGSIZE;

        /* Each step runs only if every previous one succeeded. */
        err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
                                  IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
        if (!err)
                err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode,
                                          XDP_ATTACHED_DRV,
                                          IFLA_XDP_DRV_PROG_ID,
                                          rtnl_xdp_prog_drv);
        if (!err)
                err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode,
                                          XDP_ATTACHED_HW,
                                          IFLA_XDP_HW_PROG_ID,
                                          rtnl_xdp_prog_hw);
        if (!err)
                err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
        if (!err && prog_id && mode != XDP_ATTACHED_MULTI)
                err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);

        if (err) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Map a NETDEV_* notifier event onto the IFLA_EVENT_* value reported to
 * user space.  Events without a mapping yield IFLA_EVENT_NONE.
 */
static u32 rtnl_get_event(unsigned long event)
{
        switch (event) {
        case NETDEV_REBOOT:
                return IFLA_EVENT_REBOOT;
        case NETDEV_FEAT_CHANGE:
                return IFLA_EVENT_FEATURES;
        case NETDEV_BONDING_FAILOVER:
                return IFLA_EVENT_BONDING_FAILOVER;
        case NETDEV_NOTIFY_PEERS:
                return IFLA_EVENT_NOTIFY_PEERS;
        case NETDEV_RESEND_IGMP:
                return IFLA_EVENT_IGMP_RESEND;
        case NETDEV_CHANGEINFODATA:
                return IFLA_EVENT_BONDING_OPTIONS;
        default:
                return IFLA_EVENT_NONE;
        }
}

/* Emit IFLA_MASTER with the ifindex of @dev's master upper device, if it
 * has one.  The lookup and the put both happen under rcu_read_lock().
 *
 * Returns 0 when there is no master or the put succeeded, otherwise the
 * nla_put_u32() error.
 */
static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
{
        const struct net_device *master;
        int err;

        rcu_read_lock();
        master = netdev_master_upper_dev_get_rcu(dev);
        err = master ? nla_put_u32(skb, IFLA_MASTER,
                                   READ_ONCE(master->ifindex)) : 0;
        rcu_read_unlock();

        return err;
}

/* Emit IFLA_LINK with the value of dev_get_iflink().
 *
 * The attribute is skipped when it equals the device's own ifindex, unless
 * @force is set.  Returns 0 when skipped, otherwise the nla_put_u32()
 * result.
 */
static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev,
                          bool force)
{
        int iflink = dev_get_iflink(dev);

        if (!force && READ_ONCE(dev->ifindex) == iflink)
                return 0;

        return nla_put_u32(skb, IFLA_LINK, iflink);
}

/* Emit IFLA_IFALIAS when dev_get_alias() reports a positive length.
 *
 * The alias is copied into an on-stack buffer of IFALIASZ bytes first.
 * Returns 0 when there is nothing to emit, otherwise the nla_put_string()
 * result.
 */
static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
                                              struct net_device *dev)
{
        char alias[IFALIASZ];

        if (dev_get_alias(dev, alias, sizeof(alias)) <= 0)
                return 0;

        return nla_put_string(skb, IFLA_IFALIAS, alias);
}

/* Emit IFLA_LINK_NETNSID (when applicable) followed by IFLA_LINK.
 *
 * If the device's rtnl_link_ops provide get_link_net() and the namespace it
 * returns differs from dev_net(dev), the id obtained from
 * peernet2id_alloc(@src_net, link_net, @gfp) is emitted and IFLA_LINK is
 * forced out.  In every other case IFLA_LINK follows the normal
 * nla_put_iflink() rule.
 *
 * Returns 0 on success, -EMSGSIZE if the netnsid does not fit, or the
 * nla_put_iflink() result.
 */
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
                                  const struct net_device *dev,
                                  struct net *src_net, gfp_t gfp)
{
        struct net *link_net;
        int nsid;

        if (!dev->rtnl_link_ops || !dev->rtnl_link_ops->get_link_net)
                return nla_put_iflink(skb, dev, false);

        link_net = dev->rtnl_link_ops->get_link_net(dev);
        if (net_eq(dev_net(dev), link_net))
                return nla_put_iflink(skb, dev, false);

        nsid = peernet2id_alloc(src_net, link_net, gfp);
        if (nla_put_s32(skb, IFLA_LINK_NETNSID, nsid))
                return -EMSGSIZE;

        return nla_put_iflink(skb, dev, true);
}

/* Emit the IFLA_AF_SPEC nest: one sub-nest per registered address family
 * that implements fill_link_af(), keyed by the family number.
 *
 * Iterates rtnl_af_ops with list_for_each_entry_rcu(), so the caller is
 * expected to provide the protection that iterator requires.
 *
 * Returns 0 on success or -EMSGSIZE on failure.  On failure the nests
 * opened here are not cancelled individually; the visible caller,
 * rtnl_fill_ifinfo(), cancels the whole message instead.
 */
static int rtnl_fill_link_af(struct sk_buff *skb,
                             const struct net_device *dev,
                             u32 ext_filter_mask)
{
        const struct rtnl_af_ops *af_ops;
        struct nlattr *af_spec;

        af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
        if (!af_spec)
                return -EMSGSIZE;

        list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
                struct nlattr *af;
                int err;

                if (!af_ops->fill_link_af)
                        continue;

                af = nla_nest_start_noflag(skb, af_ops->family);
                if (!af)
                        return -EMSGSIZE;

                err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
                /*
                 * The callback may return -ENODATA to indicate that there
                 * was no data to be dumped. This is not an error: trim the
                 * attribute header and move on to the next family. The
                 * nest must not be closed afterwards, because its header
                 * is no longer part of the message.
                 */
                if (err == -ENODATA) {
                        nla_nest_cancel(skb, af);
                        continue;
                }
                if (err < 0)
                        return -EMSGSIZE;

                nla_nest_end(skb, af);
        }

        nla_nest_end(skb, af_spec);
        return 0;
}

/* Emit one IFLA_ALT_IFNAME attribute for every entry on the device's
 * name_node list.
 *
 * Returns the number of names emitted (possibly 0), or -EMSGSIZE as soon as
 * one of them does not fit.
 */
static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
                                 const struct net_device *dev)
{
        struct netdev_name_node *alt;
        int emitted = 0;

        list_for_each_entry_rcu(alt, &dev->name_node->list, list) {
                if (nla_put_string(skb, IFLA_ALT_IFNAME, alt->name) != 0)
                        return -EMSGSIZE;
                emitted++;
        }

        return emitted;
}

/* RCU protected.
 *
 * Wrap the alternative interface names in an IFLA_PROP_LIST nest.
 *
 * Returns 0 when at least one name was emitted and also when there were
 * none (the empty nest is dropped in that case); returns a negative error
 * if the nest could not be opened or a name did not fit.
 */
static int rtnl_fill_prop_list(struct sk_buff *skb,
                               const struct net_device *dev)
{
        struct nlattr *nest;
        int count;

        nest = nla_nest_start(skb, IFLA_PROP_LIST);
        if (!nest)
                return -EMSGSIZE;

        count = rtnl_fill_alt_ifnames(skb, dev);
        if (count > 0) {
                nla_nest_end(skb, nest);
                return 0;
        }

        /* count < 0: error; count == 0: nothing to report.  Either way the
         * nest is removed and the value is handed back unchanged.
         */
        nla_nest_cancel(skb, nest);
        return count;
}

/* Emit IFLA_PROTO_DOWN and, when dev->proto_down_reason is non-zero, an
 * IFLA_PROTO_DOWN_REASON nest carrying IFLA_PROTO_DOWN_REASON_VALUE.
 *
 * Returns 0 on success, -EMSGSIZE if anything does not fit.
 */
static int rtnl_fill_proto_down(struct sk_buff *skb,
                                const struct net_device *dev)
{
        struct nlattr *reason_nest;
        u32 reason;

        if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))
                return -EMSGSIZE;

        reason = READ_ONCE(dev->proto_down_reason);
        if (reason == 0)
                return 0;

        reason_nest = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON);
        if (!reason_nest)
                return -EMSGSIZE;

        if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, reason)) {
                nla_nest_cancel(skb, reason_nest);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, reason_nest);
        return 0;
}

/* Emit the IFLA_DEVLINK_PORT nest.
 *
 * The nest is always opened; it is filled through
 * devlink_nl_port_handle_fill() only when dev->devlink_port is set, so it
 * may end up empty.
 *
 * Returns 0 on success, -EMSGSIZE if the nest cannot be opened, or the
 * negative value returned by the fill helper (the nest is cancelled then).
 */
static int rtnl_fill_devlink_port(struct sk_buff *skb,
                                  const struct net_device *dev)
{
        struct nlattr *nest;
        int err;

        nest = nla_nest_start(skb, IFLA_DEVLINK_PORT);
        if (!nest)
                return -EMSGSIZE;

        if (dev->devlink_port) {
                err = devlink_nl_port_handle_fill(skb, dev->devlink_port);
                if (err < 0) {
                        nla_nest_cancel(skb, nest);
                        return err;
                }
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Emit the IFLA_DPLL_PIN nest, filled by dpll_netdev_add_pin_handle().
 *
 * Returns 0 on success, -EMSGSIZE if the nest cannot be opened, or the
 * negative value returned by the helper (the nest is cancelled then).
 */
static int rtnl_fill_dpll_pin(struct sk_buff *skb,
                              const struct net_device *dev)
{
        struct nlattr *nest;
        int err;

        nest = nla_nest_start(skb, IFLA_DPLL_PIN);
        if (!nest)
                return -EMSGSIZE;

        err = dpll_netdev_add_pin_handle(skb, dev);
        if (err < 0) {
                nla_nest_cancel(skb, nest);
                return err;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/* Append one complete link message describing @dev to @skb.
 *
 * @skb:             buffer the message is appended to
 * @dev:             device being described
 * @src_net:         namespace handed to rtnl_fill_link_netnsid()
 * @type:            netlink message type, passed to nlmsg_put()
 * @pid:             port id, passed to nlmsg_put()
 * @seq:             sequence number, passed to nlmsg_put()
 * @change:          stored in ifinfomsg.ifi_change
 * @flags:           netlink header flags, passed to nlmsg_put()
 * @ext_filter_mask: forwarded to the VF, port and AF_SPEC helpers
 * @event:           emitted as IFLA_EVENT unless it is IFLA_EVENT_NONE
 * @new_nsid:        if non-NULL, *new_nsid is emitted as IFLA_NEW_NETNSID
 * @new_ifindex:     if non-zero, emitted as IFLA_NEW_IFINDEX
 * @tgt_netnsid:     if >= 0, emitted as IFLA_TARGET_NETNSID
 * @gfp:             not referenced in this body (the netnsid helper below is
 *                   called with GFP_ATOMIC instead)
 *
 * The function begins with ASSERT_RTNL().  It returns 0 once the message
 * has been finalised with nlmsg_end().  Any failure after the header was
 * added cancels the whole message with nlmsg_cancel() and returns
 * -EMSGSIZE, regardless of the error code the failing helper produced.
 */
static int rtnl_fill_ifinfo(struct sk_buff *skb,
                            struct net_device *dev, struct net *src_net,
                            int type, u32 pid, u32 seq, u32 change,
                            unsigned int flags, u32 ext_filter_mask,
                            u32 event, int *new_nsid, int new_ifindex,
                            int tgt_netnsid, gfp_t gfp)
{
        char devname[IFNAMSIZ];
        struct ifinfomsg *ifm;
        struct nlmsghdr *nlh;
        struct Qdisc *qdisc;

        ASSERT_RTNL();
        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        /* Fixed-size ifinfomsg header that precedes the attributes. */
        ifm = nlmsg_data(nlh);
        ifm->ifi_family = AF_UNSPEC;
        ifm->__ifi_pad = 0;
        ifm->ifi_type = READ_ONCE(dev->type);
        ifm->ifi_index = READ_ONCE(dev->ifindex);
        ifm->ifi_flags = dev_get_flags(dev);
        ifm->ifi_change = change;

        if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
                goto nla_put_failure;

        /* The name is copied into a local buffer before being emitted. */
        netdev_copy_name(dev, devname);
        if (nla_put_string(skb, IFLA_IFNAME, devname))
                goto nla_put_failure;

        /* Scalar per-device attributes.  The chain short-circuits: the
         * first put that fails aborts the whole message.  IFLA_OPERSTATE is
         * reported as IF_OPER_DOWN whenever the device is not running.
         */
        if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||
            nla_put_u8(skb, IFLA_OPERSTATE,
                       netif_running(dev) ? READ_ONCE(dev->operstate) :
                                            IF_OPER_DOWN) ||
            nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
            nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) ||
            nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
            nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
            nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
            nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) ||
            nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) ||
            nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) ||
            nla_put_u32(skb, IFLA_NUM_TX_QUEUES,
                        READ_ONCE(dev->num_tx_queues)) ||
            nla_put_u32(skb, IFLA_GSO_MAX_SEGS,
                        READ_ONCE(dev->gso_max_segs)) ||
            nla_put_u32(skb, IFLA_GSO_MAX_SIZE,
                        READ_ONCE(dev->gso_max_size)) ||
            nla_put_u32(skb, IFLA_GRO_MAX_SIZE,
                        READ_ONCE(dev->gro_max_size)) ||
            nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE,
                        READ_ONCE(dev->gso_ipv4_max_size)) ||
            nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE,
                        READ_ONCE(dev->gro_ipv4_max_size)) ||
            nla_put_u32(skb, IFLA_TSO_MAX_SIZE,
                        READ_ONCE(dev->tso_max_size)) ||
            nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
                        READ_ONCE(dev->tso_max_segs)) ||
            nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
                         READ_ONCE(dev->max_pacing_offload_horizon)) ||
#ifdef CONFIG_RPS
            nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
                        READ_ONCE(dev->num_rx_queues)) ||
#endif
            put_master_ifindex(skb, dev) ||
            nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
            nla_put_ifalias(skb, dev) ||
            nla_put_u32(skb, IFLA_CARRIER_CHANGES,
                        atomic_read(&dev->carrier_up_count) +
                        atomic_read(&dev->carrier_down_count)) ||
            nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
                        atomic_read(&dev->carrier_up_count)) ||
            nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
                        atomic_read(&dev->carrier_down_count)))
                goto nla_put_failure;

        if (rtnl_fill_proto_down(skb, dev))
                goto nla_put_failure;

        if (event != IFLA_EVENT_NONE) {
                if (nla_put_u32(skb, IFLA_EVENT, event))
                        goto nla_put_failure;
        }

        /* Hardware and broadcast addresses only exist for devices with a
         * non-zero address length.
         */
        if (dev->addr_len) {
                if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
                    nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
                        goto nla_put_failure;
        }

        /* Sections delegated to dedicated helpers; any non-zero result is
         * treated as "does not fit".
         */
        if (rtnl_phys_port_id_fill(skb, dev))
                goto nla_put_failure;

        if (rtnl_phys_port_name_fill(skb, dev))
                goto nla_put_failure;

        if (rtnl_phys_switch_id_fill(skb, dev))
                goto nla_put_failure;

        if (rtnl_fill_stats(skb, dev))
                goto nla_put_failure;

        if (rtnl_fill_vf(skb, dev, ext_filter_mask))
                goto nla_put_failure;

        if (rtnl_port_fill(skb, dev, ext_filter_mask))
                goto nla_put_failure;

        if (rtnl_xdp_fill(skb, dev))
                goto nla_put_failure;

        /* Link-type specific information is emitted only for devices that
         * have rtnl_link_ops or for which rtnl_have_link_slave_info() holds.
         */
        if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) {
                if (rtnl_link_fill(skb, dev) < 0)
                        goto nla_put_failure;
        }

        if (new_nsid &&
            nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
                goto nla_put_failure;
        if (new_ifindex &&
            nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
                goto nla_put_failure;

        /* The permanent address is reported only if it contains at least
         * one non-zero byte within addr_len.
         */
        if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) &&
            nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr))
                goto nla_put_failure;

        /* Everything between here and rcu_read_unlock() runs inside an RCU
         * read-side section, so failures must leave through
         * nla_put_failure_rcu to drop it.  The netnsid helper is given
         * GFP_ATOMIC rather than @gfp, presumably because it is called
         * inside this section.
         */
        rcu_read_lock();
        if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC))
                goto nla_put_failure_rcu;
        qdisc = rcu_dereference(dev->qdisc);
        if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id))
                goto nla_put_failure_rcu;
        if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
                goto nla_put_failure_rcu;
        if (rtnl_fill_link_ifmap(skb, dev))
                goto nla_put_failure_rcu;
        if (rtnl_fill_prop_list(skb, dev))
                goto nla_put_failure_rcu;
        rcu_read_unlock();

        /* Parent device name, and its bus name when the parent has a bus. */
        if (dev->dev.parent &&
            nla_put_string(skb, IFLA_PARENT_DEV_NAME,
                           dev_name(dev->dev.parent)))
                goto nla_put_failure;

        if (dev->dev.parent && dev->dev.parent->bus &&
            nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
                           dev->dev.parent->bus->name))
                goto nla_put_failure;

        if (rtnl_fill_devlink_port(skb, dev))
                goto nla_put_failure;

        if (rtnl_fill_dpll_pin(skb, dev))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

        /* Error exits: drop the RCU read lock if it is held, then discard
         * the partially built message.
         */
nla_put_failure_rcu:
        rcu_read_unlock();
nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Validation policy for top-level IFLA_* link attributes.  In this part of
 * the file it is handed to the nlmsg_parse_deprecated*() calls in
 * rtnl_valid_dump_ifinfo_req() and to nla_parse_deprecated() in
 * rtnl_nla_parse_ifinfomsg().
 */
static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
        [IFLA_UNSPEC]              = { .strict_start_type = IFLA_DPLL_PIN },
        [IFLA_IFNAME]              = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
        [IFLA_ADDRESS]             = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [IFLA_BROADCAST]           = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [IFLA_MAP]                 = { .len = sizeof(struct rtnl_link_ifmap) },
        [IFLA_MTU]                 = { .type = NLA_U32 },
        [IFLA_LINK]                = { .type = NLA_U32 },
        [IFLA_MASTER]              = { .type = NLA_U32 },
        [IFLA_CARRIER]             = { .type = NLA_U8 },
        [IFLA_TXQLEN]              = { .type = NLA_U32 },
        [IFLA_WEIGHT]              = { .type = NLA_U32 },
        [IFLA_OPERSTATE]           = { .type = NLA_U8 },
        [IFLA_LINKMODE]            = { .type = NLA_U8 },
        [IFLA_LINKINFO]            = { .type = NLA_NESTED },
        [IFLA_NET_NS_PID]          = { .type = NLA_U32 },
        [IFLA_NET_NS_FD]           = { .type = NLA_U32 },
        /* Although IFLA_IFALIAS carries a string, it is declared as
         * NLA_BINARY so that a zero-length value is accepted; that is how
         * an alias gets removed.
         */
        [IFLA_IFALIAS]             = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
        [IFLA_VFINFO_LIST]         = { .type = NLA_NESTED },
        [IFLA_VF_PORTS]            = { .type = NLA_NESTED },
        [IFLA_PORT_SELF]           = { .type = NLA_NESTED },
        [IFLA_AF_SPEC]             = { .type = NLA_NESTED },
        [IFLA_EXT_MASK]            = { .type = NLA_U32 },
        [IFLA_PROMISCUITY]         = { .type = NLA_U32 },
        [IFLA_NUM_TX_QUEUES]       = { .type = NLA_U32 },
        [IFLA_NUM_RX_QUEUES]       = { .type = NLA_U32 },
        [IFLA_GSO_MAX_SEGS]        = { .type = NLA_U32 },
        [IFLA_GSO_MAX_SIZE]        = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
        [IFLA_PHYS_PORT_ID]        = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
        [IFLA_CARRIER_CHANGES]     = { .type = NLA_U32 }, /* ignored */
        [IFLA_PHYS_SWITCH_ID]      = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
        [IFLA_LINK_NETNSID]        = { .type = NLA_S32 },
        [IFLA_PROTO_DOWN]          = { .type = NLA_U8 },
        [IFLA_XDP]                 = { .type = NLA_NESTED },
        [IFLA_EVENT]               = { .type = NLA_U32 },
        [IFLA_GROUP]               = { .type = NLA_U32 },
        [IFLA_TARGET_NETNSID]      = { .type = NLA_S32 },
        [IFLA_CARRIER_UP_COUNT]    = { .type = NLA_U32 },
        [IFLA_CARRIER_DOWN_COUNT]  = { .type = NLA_U32 },
        [IFLA_MIN_MTU]             = { .type = NLA_U32 },
        [IFLA_MAX_MTU]             = { .type = NLA_U32 },
        [IFLA_PROP_LIST]           = { .type = NLA_NESTED },
        [IFLA_ALT_IFNAME]          = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 },
        [IFLA_PERM_ADDRESS]        = { .type = NLA_REJECT },
        [IFLA_PROTO_DOWN_REASON]   = { .type = NLA_NESTED },
        [IFLA_NEW_IFINDEX]         = NLA_POLICY_MIN(NLA_S32, 1),
        [IFLA_PARENT_DEV_NAME]     = { .type = NLA_NUL_STRING },
        [IFLA_GRO_MAX_SIZE]        = { .type = NLA_U32 },
        [IFLA_TSO_MAX_SIZE]        = { .type = NLA_REJECT },
        [IFLA_TSO_MAX_SEGS]        = { .type = NLA_REJECT },
        [IFLA_ALLMULTI]            = { .type = NLA_REJECT },
        [IFLA_GSO_IPV4_MAX_SIZE]   = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
        [IFLA_GRO_IPV4_MAX_SIZE]   = { .type = NLA_U32 },
        [IFLA_NETNS_IMMUTABLE]     = { .type = NLA_REJECT },
};

/* Validation policy for the attributes nested inside IFLA_LINKINFO; used by
 * linkinfo_to_kind_ops() when it parses that nest.
 */
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
        [IFLA_INFO_KIND]        = { .type = NLA_STRING },
        [IFLA_INFO_DATA]        = { .type = NLA_NESTED },
        [IFLA_INFO_SLAVE_KIND]  = { .type = NLA_STRING },
        [IFLA_INFO_SLAVE_DATA]  = { .type = NLA_NESTED },
};

/* Validation policy for IFLA_VF_* attributes.  Most entries only state the
 * size of the structure they carry; IFLA_VF_BROADCAST is rejected on input.
 */
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
        [IFLA_VF_MAC]           = { .len = sizeof(struct ifla_vf_mac) },
        [IFLA_VF_BROADCAST]     = { .type = NLA_REJECT },
        [IFLA_VF_VLAN]          = { .len = sizeof(struct ifla_vf_vlan) },
        [IFLA_VF_VLAN_LIST]     = { .type = NLA_NESTED },
        [IFLA_VF_TX_RATE]       = { .len = sizeof(struct ifla_vf_tx_rate) },
        [IFLA_VF_SPOOFCHK]      = { .len = sizeof(struct ifla_vf_spoofchk) },
        [IFLA_VF_RATE]          = { .len = sizeof(struct ifla_vf_rate) },
        [IFLA_VF_LINK_STATE]    = { .len = sizeof(struct ifla_vf_link_state) },
        [IFLA_VF_RSS_QUERY_EN]  = { .len = sizeof(struct ifla_vf_rss_query_en) },
        [IFLA_VF_STATS]         = { .type = NLA_NESTED },
        [IFLA_VF_TRUST]         = { .len = sizeof(struct ifla_vf_trust) },
        [IFLA_VF_IB_NODE_GUID]  = { .len = sizeof(struct ifla_vf_guid) },
        [IFLA_VF_IB_PORT_GUID]  = { .len = sizeof(struct ifla_vf_guid) },
};

/* Validation policy for IFLA_PORT_* attributes. */
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
        [IFLA_PORT_VF]             = { .type = NLA_U32 },
        [IFLA_PORT_PROFILE]        = { .type = NLA_STRING, .len = PORT_PROFILE_MAX },
        [IFLA_PORT_INSTANCE_UUID]  = { .type = NLA_BINARY, .len = PORT_UUID_MAX },
        [IFLA_PORT_HOST_UUID]      = { .type = NLA_STRING, .len = PORT_UUID_MAX },
        [IFLA_PORT_REQUEST]        = { .type = NLA_U8 },
        [IFLA_PORT_RESPONSE]       = { .type = NLA_U16 },

        /* Not used, but it has to stay because user space may still send
         * it. It is also broken in how it combines NLA_BINARY with a
         * struct.
         */
        [IFLA_PORT_VSI_TYPE]       = { .type = NLA_BINARY,
                                       .len = sizeof(struct ifla_port_vsi) },
};

/* Validation policy for the attributes nested inside IFLA_XDP. */
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
        [IFLA_XDP_UNSPEC]       = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
        [IFLA_XDP_FD]           = { .type = NLA_S32 },
        [IFLA_XDP_EXPECTED_FD]  = { .type = NLA_S32 },
        [IFLA_XDP_ATTACHED]     = { .type = NLA_U8 },
        [IFLA_XDP_FLAGS]        = { .type = NLA_U32 },
        [IFLA_XDP_PROG_ID]      = { .type = NLA_U32 },
};

/* Resolve the link kind named inside an IFLA_LINKINFO attribute.
 *
 * @nla is parsed against ifla_info_policy.  If it contains IFLA_INFO_KIND,
 * the kind string is copied into a MODULE_NAME_LEN buffer and looked up
 * with rtnl_link_ops_get(), which also receives @ops_srcu_index.
 *
 * Returns the lookup result, or NULL when parsing fails or no kind is
 * present.  @ops_srcu_index is not touched by this function in those two
 * cases.
 */
static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla,
                                                  int *ops_srcu_index)
{
        struct nlattr *info[IFLA_INFO_MAX + 1];
        char kind[MODULE_NAME_LEN];

        if (nla_parse_nested_deprecated(info, IFLA_INFO_MAX, nla,
                                        ifla_info_policy, NULL) < 0)
                return NULL;

        if (!info[IFLA_INFO_KIND])
                return NULL;

        nla_strscpy(kind, info[IFLA_INFO_KIND], sizeof(kind));
        return rtnl_link_ops_get(kind, ops_srcu_index);
}

static bool link_master_filtered(struct net_device *dev, int master_idx)
{
        struct net_device *master;

        if (!master_idx)
                return false;

        master = netdev_master_upper_dev_get(dev);

        /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need
         * another invalid value for ifindex to denote "no master".
         */
        if (master_idx == -1)
                return !!master;

        if (!master || master->ifindex != master_idx)
                return true;

        return false;
}

/* True when a kind filter is given (@kind_ops non-NULL) and @dev's
 * rtnl_link_ops is a different one.
 */
static bool link_kind_filtered(const struct net_device *dev,
                               const struct rtnl_link_ops *kind_ops)
{
        return kind_ops && dev->rtnl_link_ops != kind_ops;
}

/* True when @dev must be left out of a dump, i.e. when either the master
 * filter or the kind filter excludes it.  The master filter is evaluated
 * first.
 */
static bool link_dump_filtered(struct net_device *dev,
                               int master_idx,
                               const struct rtnl_link_ops *kind_ops)
{
        if (link_master_filtered(dev, master_idx))
                return true;

        return link_kind_filtered(dev, kind_ops);
}

/**
 * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
 * @sk: netlink socket
 * @netnsid: network namespace identifier
 *
 * Looks up @netnsid relative to the socket's own namespace and checks that
 * the socket holds CAP_NET_ADMIN in the user namespace owning the result.
 *
 * Returns the network namespace identified by netnsid on success, or an
 * error pointer on failure: -EINVAL if the id does not resolve, -EACCES if
 * the capability check fails (the namespace is released with put_net()
 * before returning in that case).
 */
struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
{
        struct net *target = get_net_ns_by_id(sock_net(sk), netnsid);

        if (!target)
                return ERR_PTR(-EINVAL);

        /* For the time being the caller needs CAP_NET_ADMIN in the user
         * namespace that owns the target net ns.
         */
        if (sk_ns_capable(sk, target->user_ns, CAP_NET_ADMIN))
                return target;

        put_net(target);
        return ERR_PTR(-EACCES);
}
EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);

/* Validate a link dump request and parse its attributes into @tb.
 *
 * With @strict_check the message must hold a full ifinfomsg header whose
 * __ifi_pad, ifi_type, ifi_flags, ifi_change and ifi_index are all zero,
 * and the attributes are parsed with nlmsg_parse_deprecated_strict().
 * Without it the header length is guessed (see below) and
 * nlmsg_parse_deprecated() is used.
 *
 * Returns the parser's result, or -EINVAL with an extack message when one
 * of the strict header checks fails.
 */
static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
                                      bool strict_check, struct nlattr **tb,
                                      struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm;
        int hdrlen;

        if (!strict_check) {
                /* Compatibility hack for the kernel<->userspace interface.
                 * The proper header is ifinfomsg, as in rtnl_getlink, but
                 * before Linux v3.9 this code assumed rtgenmsg and
                 * iproute2 < v3.9.0 sends exactly that.  Such an old
                 * iproute2 is recognisable: even with IFLA_EXT_MASK
                 * included, its message is shorter than struct ifinfomsg.
                 */
                if (nlmsg_len(nlh) < sizeof(struct ifinfomsg))
                        hdrlen = sizeof(struct rtgenmsg);
                else
                        hdrlen = sizeof(struct ifinfomsg);

                return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX,
                                              ifla_policy, extack);
        }

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for link dump");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
            ifm->ifi_change) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
                return -EINVAL;
        }

        if (ifm->ifi_index) {
                NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps");
                return -EINVAL;
        }

        return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                             ifla_policy, extack);
}

/* Link dump callback: append one RTM_NEWLINK message per device of the
 * target namespace to @skb.
 *
 * @skb: buffer being filled; its socket determines the requester's netns
 * @cb:  dump state; supplies the request header, extack, strict_check and
 *       the ctx area in which the iteration position (ifindex) is kept
 *
 * Request attributes understood here: IFLA_TARGET_NETNSID (dump another
 * namespace), IFLA_EXT_MASK, IFLA_MASTER and IFLA_LINKINFO (filters).  Any
 * other attribute is an error only when cb->strict_check is set.
 *
 * Returns 0 after the walk, the negative value from rtnl_fill_ifinfo() that
 * stopped the walk, or a request validation error.
 */
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct rtnl_link_ops *kind_ops = NULL;
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int flags = NLM_F_MULTI;
        struct nlattr *tb[IFLA_MAX+1];
        /* Typed view of the per-dump scratch area in cb->ctx. */
        struct {
                unsigned long ifindex;
        } *ctx = (void *)cb->ctx;
        struct net *tgt_net = net;
        u32 ext_filter_mask = 0;
        struct net_device *dev;
        int ops_srcu_index;
        int master_idx = 0;
        int netnsid = -1;
        int err, i;

        err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
        if (err < 0) {
                if (cb->strict_check)
                        return err;

                /* Non-strict requests that fail to parse are still served,
                 * with the defaults set above (no filter, no ext mask, own
                 * namespace); tb[] is not consulted on this path.
                 */
                goto walk_entries;
        }

        for (i = 0; i <= IFLA_MAX; ++i) {
                if (!tb[i])
                        continue;

                /* new attributes should only be added with strict checking */
                switch (i) {
                case IFLA_TARGET_NETNSID:
                        netnsid = nla_get_s32(tb[i]);
                        tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
                        if (IS_ERR(tgt_net)) {
                                NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
                                err = PTR_ERR(tgt_net);
                                /* Reset so the out: path does not call
                                 * put_net() on an error pointer.
                                 */
                                netnsid = -1;
                                goto out;
                        }
                        break;
                case IFLA_EXT_MASK:
                        ext_filter_mask = nla_get_u32(tb[i]);
                        break;
                case IFLA_MASTER:
                        master_idx = nla_get_u32(tb[i]);
                        break;
                case IFLA_LINKINFO:
                        /* May return NULL, in which case no kind filter is
                         * applied.
                         */
                        kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index);
                        break;
                default:
                        if (cb->strict_check) {
                                NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
                                err = -EINVAL;
                                goto out;
                        }
                }
        }

        /* Mark the replies as filtered when a master or kind filter is
         * in effect.
         */
        if (master_idx || kind_ops)
                flags |= NLM_F_DUMP_FILTERED;

walk_entries:
        err = 0;
        /* Iterate the target namespace's devices; the iterator keeps its
         * position in ctx->ifindex.  The walk stops at the first device
         * whose message cannot be added.
         */
        for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
                if (link_dump_filtered(dev, master_idx, kind_ops))
                        continue;
                err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
                                       NETLINK_CB(cb->skb).portid,
                                       nlh->nlmsg_seq, 0, flags,
                                       ext_filter_mask, 0, NULL, 0,
                                       netnsid, GFP_KERNEL);
                if (err < 0)
                        break;
        }


        /* Record the namespace's dev_base_seq for the dump consistency
         * check.
         */
        cb->seq = tgt_net->dev_base_seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));

out:

        /* Release what the attribute loop acquired: the link ops reference
         * and, when a target netnsid was resolved, the namespace reference.
         */
        if (kind_ops)
                rtnl_link_ops_put(kind_ops, ops_srcu_index);
        if (netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/* Parse the link attributes that follow the struct ifinfomsg carried in
 * the payload of @nla_peer into @tb, using ifla_policy and IFLA_MAX.
 *
 * Returns -EINVAL (with an extack message pointing at @nla_peer) when the
 * embedded header has a negative ifi_index; otherwise returns whatever
 * nla_parse_deprecated() returns.
 *
 * NOTE(review): the attribute length is nla_len() minus
 * sizeof(struct ifinfomsg) with no lower-bound check here; presumably
 * callers guarantee the payload is at least that large - confirm.
 */
int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
                             struct netlink_ext_ack *exterr)
{
        const struct ifinfomsg *hdr = nla_data(nla_peer);
        const struct nlattr *first_attr;
        size_t attrs_len;

        if (hdr->ifi_index < 0) {
                NL_SET_ERR_MSG_ATTR(exterr, nla_peer,
                                    "ifindex can't be negative");
                return -EINVAL;
        }

        /* The nested attributes start right after the ifinfomsg header. */
        first_attr = nla_data(nla_peer) + sizeof(struct ifinfomsg);
        attrs_len = nla_len(nla_peer) - sizeof(struct ifinfomsg);

        return nla_parse_deprecated(tb, IFLA_MAX, first_attr, attrs_len,
                                    ifla_policy, exterr);
}
EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);

/* Look up the network namespace named by the link attributes.
 *
 * IFLA_NET_NS_PID takes precedence over IFLA_NET_NS_FD. Returns NULL when
 * neither attribute is present; otherwise returns the result of the
 * corresponding lookup helper unchanged.
 */
static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[])
{
        if (tb[IFLA_NET_NS_PID])
                return get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));

        if (tb[IFLA_NET_NS_FD])
                return get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));

        return NULL;
}

/* Return the namespace selected by IFLA_NET_NS_PID / IFLA_NET_NS_FD, or a
 * reference on @src_net (via get_net()) when the attribute lookup yields
 * NULL.
 */
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
        struct net *selected = rtnl_link_get_net_ifla(tb);

        return selected ? selected : get_net(src_net);
}
EXPORT_SYMBOL(rtnl_link_get_net);

/* Pick the network namespace a request refers to. The link attributes
 * are consulted in this order:
 *
 * 1. IFLA_NET_NS_PID
 * 2. IFLA_NET_NS_FD
 * 3. IFLA_TARGET_NETNSID
 *
 * With none of them present a reference on @src_net is returned. An
 * IFLA_TARGET_NETNSID that get_net_ns_by_id() cannot resolve yields
 * ERR_PTR(-EINVAL).
 */
static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
                                               struct nlattr *tb[])
{
        struct net *target;

        if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
                return rtnl_link_get_net(src_net, tb);

        if (tb[IFLA_TARGET_NETNSID]) {
                target = get_net_ns_by_id(src_net,
                                          nla_get_u32(tb[IFLA_TARGET_NETNSID]));
                return target ? target : ERR_PTR(-EINVAL);
        }

        return get_net(src_net);
}

/* Resolve the namespace named by @tb (see rtnl_link_get_net_by_nlattr())
 * and check that the sender of @skb holds capability @cap in that
 * namespace's user namespace.
 *
 * Returns the namespace on success, the lookup's error pointer if the
 * lookup failed, or ERR_PTR(-EPERM) after dropping the reference with
 * put_net() when the capability check fails.
 */
static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb,
                                             struct net *src_net,
                                             struct nlattr *tb[], int cap)
{
        struct net *target = rtnl_link_get_net_by_nlattr(src_net, tb);

        /* Lookup errors and permitted namespaces are both passed through. */
        if (IS_ERR(target) || netlink_ns_capable(skb, target->user_ns, cap))
                return target;

        put_net(target);
        return ERR_PTR(-EPERM);
}

/* Verify that an rtnetlink request does not carry several attributes that
 * could refer to different network namespaces.
 *
 * With @netns_id_only set, IFLA_NET_NS_PID and IFLA_NET_NS_FD are rejected
 * outright with -EOPNOTSUPP and nothing else is checked. Otherwise at most
 * one of IFLA_TARGET_NETNSID, IFLA_NET_NS_PID and IFLA_NET_NS_FD may be
 * present; more than one yields -EINVAL. Returns 0 when the request is
 * acceptable.
 */
static int rtnl_ensure_unique_netns(struct nlattr *tb[],
                                    struct netlink_ext_ack *extack,
                                    bool netns_id_only)
{
        bool by_pid = tb[IFLA_NET_NS_PID] != NULL;
        bool by_fd = tb[IFLA_NET_NS_FD] != NULL;
        bool by_id = tb[IFLA_TARGET_NETNSID] != NULL;

        if (netns_id_only) {
                if (by_pid || by_fd) {
                        NL_SET_ERR_MSG(extack, "specified netns attribute not supported");
                        return -EOPNOTSUPP;
                }
                return 0;
        }

        /* Any two of the three selectors together are ambiguous. */
        if (by_pid + by_fd + by_id > 1) {
                NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified");
                return -EINVAL;
        }

        return 0;
}

static        int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
                             int max_tx_rate)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_set_vf_rate)
                return -EOPNOTSUPP;
        if (max_tx_rate && max_tx_rate < min_tx_rate)
                return -EINVAL;

        return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate);
}

static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
                            struct netlink_ext_ack *extack)
{
        if (tb[IFLA_ADDRESS] &&
            nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
                return -EINVAL;

        if (tb[IFLA_BROADCAST] &&
            nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
                return -EINVAL;

        if (tb[IFLA_GSO_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
                NL_SET_ERR_MSG(extack, "too big gso_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_GSO_MAX_SEGS] &&
            (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
             nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
                NL_SET_ERR_MSG(extack, "too big gso_max_segs");
                return -EINVAL;
        }

        if (tb[IFLA_GRO_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
                NL_SET_ERR_MSG(extack, "too big gro_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
                NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
            nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
                NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
                return -EINVAL;
        }

        if (tb[IFLA_AF_SPEC]) {
                struct nlattr *af;
                int rem, err;

                nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
                        struct rtnl_af_ops *af_ops;
                        int af_ops_srcu_index;

                        af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
                        if (!af_ops)
                                return -EAFNOSUPPORT;

                        if (!af_ops->set_link_af)
                                err = -EOPNOTSUPP;
                        else if (af_ops->validate_link_af)
                                err = af_ops->validate_link_af(dev, af, extack);
                        else
                                err = 0;

                        rtnl_af_put(af_ops, af_ops_srcu_index);

                        if (err < 0)
                                return err;
                }
        }

        return 0;
}

/* Hand a VF GUID of the given @guid_type to the driver's ndo_set_vf_guid
 * callback and return its result. The callback pointer is not checked
 * here.
 */
static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
                                  int guid_type)
{
        return dev->netdev_ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid,
                                                guid_type);
}

/* Set a VF GUID, but only on devices of type ARPHRD_INFINIBAND; any other
 * device type gets -EOPNOTSUPP.
 */
static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
{
        if (dev->type == ARPHRD_INFINIBAND)
                return handle_infiniband_guid(dev, ivt, guid_type);

        return -EOPNOTSUPP;
}

/* Apply the per-VF settings found in @tb to @dev through the driver's
 * ndo_* callbacks.
 *
 * Each attribute that is present is handled in the fixed order below. For
 * every one the VF index is rejected with -EINVAL when it is >= INT_MAX,
 * a missing driver callback yields -EOPNOTSUPP, and the first negative
 * result aborts processing and is returned (earlier settings stay
 * applied).
 *
 * Returns the result of the last attribute handled; if @tb contains none
 * of the attributes handled here, the initial -EINVAL is returned.
 */
static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int err = -EINVAL;

        if (tb[IFLA_VF_MAC]) {
                struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);

                if (ivm->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_mac)
                        err = ops->ndo_set_vf_mac(dev, ivm->vf,
                                                  ivm->mac);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_VLAN]) {
                struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);

                if (ivv->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                /* This attribute has no protocol field; 802.1Q is passed. */
                if (ops->ndo_set_vf_vlan)
                        err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
                                                   ivv->qos,
                                                   htons(ETH_P_8021Q));
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_VLAN_LIST]) {
                struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN];
                struct nlattr *attr;
                int rem, len = 0;

                err = -EOPNOTSUPP;
                if (!ops->ndo_set_vf_vlan)
                        return err;

                /* Collect the nested entries: each must be an
                 * IFLA_VF_VLAN_INFO large enough to hold the struct, and
                 * no more than MAX_VLAN_LIST_LEN entries are accepted.
                 */
                nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
                        if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
                            nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) {
                                return -EINVAL;
                        }
                        if (len >= MAX_VLAN_LIST_LEN)
                                return -EOPNOTSUPP;
                        ivvl[len] = nla_data(attr);

                        len++;
                }
                if (len == 0)
                        return -EINVAL;

                /* Only the first collected entry is handed to the driver. */
                if (ivvl[0]->vf >= INT_MAX)
                        return -EINVAL;
                err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
                                           ivvl[0]->qos, ivvl[0]->vlan_proto);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_TX_RATE]) {
                struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
                struct ifla_vf_info ivf;

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                /* This attribute only carries a single rate, used as the
                 * maximum; the current minimum is read back from the
                 * driver so it can be passed along unchanged.
                 */
                err = -EOPNOTSUPP;
                if (ops->ndo_get_vf_config)
                        err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
                if (err < 0)
                        return err;

                err = rtnl_set_vf_rate(dev, ivt->vf,
                                       ivf.min_tx_rate, ivt->rate);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_RATE]) {
                struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;

                err = rtnl_set_vf_rate(dev, ivt->vf,
                                       ivt->min_tx_rate, ivt->max_tx_rate);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_SPOOFCHK]) {
                struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);

                if (ivs->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_spoofchk)
                        err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
                                                       ivs->setting);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_LINK_STATE]) {
                struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);

                if (ivl->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_link_state)
                        err = ops->ndo_set_vf_link_state(dev, ivl->vf,
                                                         ivl->link_state);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_RSS_QUERY_EN]) {
                struct ifla_vf_rss_query_en *ivrssq_en;

                err = -EOPNOTSUPP;
                ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
                if (ivrssq_en->vf >= INT_MAX)
                        return -EINVAL;
                if (ops->ndo_set_vf_rss_query_en)
                        err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
                                                           ivrssq_en->setting);
                if (err < 0)
                        return err;
        }

        if (tb[IFLA_VF_TRUST]) {
                struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_trust)
                        err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
                if (err < 0)
                        return err;
        }

        /* The two GUID branches return unconditionally: when the node GUID
         * attribute is present, the port GUID attribute below is never
         * reached in the same call.
         */
        if (tb[IFLA_VF_IB_NODE_GUID]) {
                struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                if (!ops->ndo_set_vf_guid)
                        return -EOPNOTSUPP;
                return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
        }

        if (tb[IFLA_VF_IB_PORT_GUID]) {
                struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);

                if (ivt->vf >= INT_MAX)
                        return -EINVAL;
                if (!ops->ndo_set_vf_guid)
                        return -EOPNOTSUPP;

                return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
        }

        return err;
}

/* Change the master (upper) device of @dev to the device with index
 * @ifindex in the same namespace; an @ifindex of 0 only detaches.
 *
 * If @dev already has a master it is released first through that
 * master's ndo_del_slave, unless it is already the requested one, in
 * which case nothing is done. The new master is then asked to enslave
 * @dev through ndo_add_slave.
 *
 * Returns 0 on success, -EOPNOTSUPP when a needed callback is missing,
 * -EINVAL when @ifindex does not name a device, or the callback's error.
 */
static int do_set_master(struct net_device *dev, int ifindex,
                         struct netlink_ext_ack *extack)
{
        struct net_device *master = netdev_master_upper_dev_get(dev);
        const struct net_device_ops *master_ops;
        int ret;

        /* Release the lower lock, the upper is responsible for locking
         * the lower if needed. None of the existing upper devices
         * use netdev instance lock, so don't grab it.
         */

        if (master) {
                if (master->ifindex == ifindex)
                        return 0;

                master_ops = master->netdev_ops;
                if (!master_ops->ndo_del_slave)
                        return -EOPNOTSUPP;

                netdev_unlock_ops(dev);
                ret = master_ops->ndo_del_slave(master, dev);
                netdev_lock_ops(dev);
                if (ret)
                        return ret;
        }

        if (!ifindex)
                return 0;

        master = __dev_get_by_index(dev_net(dev), ifindex);
        if (!master)
                return -EINVAL;

        master_ops = master->netdev_ops;
        if (!master_ops->ndo_add_slave)
                return -EOPNOTSUPP;

        netdev_unlock_ops(dev);
        ret = master_ops->ndo_add_slave(master, dev, extack);
        netdev_lock_ops(dev);

        return ret;
}

/* Validation policy for the attributes nested inside
 * IFLA_PROTO_DOWN_REASON: both the mask and the value are 32-bit
 * unsigned integers. The table is sized by IFLA_PROTO_DOWN_REASON_VALUE.
 */
static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = {
        [IFLA_PROTO_DOWN_REASON_MASK]        = { .type = NLA_U32 },
        [IFLA_PROTO_DOWN_REASON_VALUE]        = { .type = NLA_U32 },
};

/* Handle the IFLA_PROTO_DOWN and IFLA_PROTO_DOWN_REASON attributes of a
 * link change request. Either attribute pointer may be NULL.
 *
 * The reason attribute, when given, is processed first: its nested value
 * is mandatory, its mask is optional (0 is passed when absent), and both
 * are handed to netdev_change_proto_down_reason_locked(). The protodown
 * flag is applied afterwards, and clearing it is refused with -EBUSY
 * while dev->proto_down_reason is non-zero.
 *
 * Returns 0 on success, -EOPNOTSUPP when the device does not have
 * change_proto_down set, or another negative errno.
 */
static int do_set_proto_down(struct net_device *dev,
                             struct nlattr *nl_proto_down,
                             struct nlattr *nl_proto_down_reason,
                             struct netlink_ext_ack *extack)
{
        bool want_down;
        int ret;

        if (!dev->change_proto_down) {
                NL_SET_ERR_MSG(extack, "Protodown not supported by device");
                return -EOPNOTSUPP;
        }

        if (nl_proto_down_reason) {
                struct nlattr *reason_tb[IFLA_PROTO_DOWN_REASON_MAX + 1];
                unsigned long reason_mask = 0;
                u32 reason_value;

                ret = nla_parse_nested_deprecated(reason_tb,
                                                  IFLA_PROTO_DOWN_REASON_MAX,
                                                  nl_proto_down_reason,
                                                  ifla_proto_down_reason_policy,
                                                  NULL);
                if (ret < 0)
                        return ret;

                if (!reason_tb[IFLA_PROTO_DOWN_REASON_VALUE]) {
                        NL_SET_ERR_MSG(extack, "Invalid protodown reason value");
                        return -EINVAL;
                }
                reason_value = nla_get_u32(reason_tb[IFLA_PROTO_DOWN_REASON_VALUE]);

                if (reason_tb[IFLA_PROTO_DOWN_REASON_MASK])
                        reason_mask = nla_get_u32(reason_tb[IFLA_PROTO_DOWN_REASON_MASK]);

                netdev_change_proto_down_reason_locked(dev, reason_mask,
                                                       reason_value);
        }

        if (!nl_proto_down)
                return 0;

        want_down = nla_get_u8(nl_proto_down);

        /* Don't turn off protodown if there are active reasons */
        if (!want_down && dev->proto_down_reason) {
                NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
                return -EBUSY;
        }

        return netif_change_proto_down(dev, want_down);
}

/* Status bits accumulated by do_setlink() while it applies changes.
 * DO_SETLINK_NOTIFY (0x03) contains the DO_SETLINK_MODIFIED bit (0x01),
 * so setting NOTIFY always marks the device as modified as well.
 */
#define DO_SETLINK_MODIFIED        0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY        0x03
/* Apply the changes requested by a link-set message to @dev.
 *
 * @skb:     the request skb (not referenced in this function body)
 * @dev:     device to modify
 * @tgt_net: namespace the device should end up in
 * @ifm:     ifinfomsg header of the request (index, flags, change mask)
 * @extack:  extended ack for error reporting
 * @tb:      parsed IFLA_* attributes
 * @status:  initial DO_SETLINK_* bits; more are OR'ed in as changes land
 *
 * The request is validated with validate_linkmsg() first. A namespace
 * move, if needed, happens before the netdev ops lock is taken and
 * returns directly on failure. All remaining attributes are applied one
 * by one, in the order below, under netdev_lock_ops(); the first failure
 * jumps to errout without undoing what was already applied.
 *
 * At errout, if anything was modified, netif_state_change() is called
 * when the NOTIFY bits are set and a rate-limited warning is logged when
 * the request failed part-way.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
                      struct net *tgt_net, struct ifinfomsg *ifm,
                      struct netlink_ext_ack *extack,
                      struct nlattr **tb, int status)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        char ifname[IFNAMSIZ];
        int err;

        err = validate_linkmsg(dev, tb, extack);
        if (err < 0)
                return err;

        /* An empty ifname means "no name supplied" for the checks below. */
        if (tb[IFLA_IFNAME])
                nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
        else
                ifname[0] = '\0';

        /* Move the device first if the target namespace differs; the
         * supplied name (if any) is passed along as the name pattern.
         */
        if (!net_eq(tgt_net, dev_net(dev))) {
                const char *pat = ifname[0] ? ifname : NULL;
                int new_ifindex;

                new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0);

                err = __dev_change_net_namespace(dev, tgt_net, pat,
                                                 new_ifindex, extack);
                if (err)
                        return err;

                status |= DO_SETLINK_MODIFIED;
        }

        /* From here on every exit goes through errout, which unlocks. */
        netdev_lock_ops(dev);

        if (tb[IFLA_MAP]) {
                struct rtnl_link_ifmap *u_map;
                struct ifmap k_map;

                if (!ops->ndo_set_config) {
                        err = -EOPNOTSUPP;
                        goto errout;
                }

                if (!netif_device_present(dev)) {
                        err = -ENODEV;
                        goto errout;
                }

                /* Convert the netlink map layout into struct ifmap,
                 * narrowing each field with an explicit cast.
                 */
                u_map = nla_data(tb[IFLA_MAP]);
                k_map.mem_start = (unsigned long) u_map->mem_start;
                k_map.mem_end = (unsigned long) u_map->mem_end;
                k_map.base_addr = (unsigned short) u_map->base_addr;
                k_map.irq = (unsigned char) u_map->irq;
                k_map.dma = (unsigned char) u_map->dma;
                k_map.port = (unsigned char) u_map->port;

                err = ops->ndo_set_config(dev, &k_map);
                if (err < 0)
                        goto errout;

                status |= DO_SETLINK_NOTIFY;
        }

        if (tb[IFLA_ADDRESS]) {
                struct sockaddr *sa;
                int len;

                /* Room for the family field plus the larger of the device
                 * address length and a plain struct sockaddr.
                 */
                len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
                                                  sizeof(*sa));
                sa = kmalloc(len, GFP_KERNEL);
                if (!sa) {
                        err = -ENOMEM;
                        goto errout;
                }
                sa->sa_family = dev->type;

                netdev_unlock_ops(dev);

                /* dev_addr_sem is an outer lock, enforce proper ordering */
                down_write(&dev_addr_sem);
                netdev_lock_ops(dev);

                memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
                       dev->addr_len);
                err = netif_set_mac_address(dev, sa, extack);
                kfree(sa);
                if (err) {
                        up_write(&dev_addr_sem);
                        goto errout;
                }
                status |= DO_SETLINK_MODIFIED;

                up_write(&dev_addr_sem);
        }

        if (tb[IFLA_MTU]) {
                err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_GROUP]) {
                netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
                status |= DO_SETLINK_NOTIFY;
        }

        /*
         * Interface selected by interface index but interface
         * name provided implies that a name change has been
         * requested.
         */
        if (ifm->ifi_index > 0 && ifname[0]) {
                err = netif_change_name(dev, ifname);
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_IFALIAS]) {
                err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
                                      nla_len(tb[IFLA_IFALIAS]));
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_NOTIFY;
        }

        /* The broadcast address is copied in place and announced via the
         * notifier chain directly; no status bit is set for it.
         */
        if (tb[IFLA_BROADCAST]) {
                nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len);
                call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
        }

        if (ifm->ifi_flags || ifm->ifi_change) {
                err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
                                         extack);
                if (err < 0)
                        goto errout;
        }

        if (tb[IFLA_MASTER]) {
                err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_CARRIER]) {
                err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        if (tb[IFLA_TXQLEN]) {
                unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);

                err = netif_change_tx_queue_len(dev, value);
                if (err)
                        goto errout;
                status |= DO_SETLINK_MODIFIED;
        }

        /* For the GSO/GRO limits below, the XOR is a "value differs"
         * test: the setter is only called, and the device only marked
         * modified, when the requested value is not the current one.
         */
        if (tb[IFLA_GSO_MAX_SIZE]) {
                u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);

                if (dev->gso_max_size ^ max_size) {
                        netif_set_gso_max_size(dev, max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GSO_MAX_SEGS]) {
                u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);

                if (dev->gso_max_segs ^ max_segs) {
                        netif_set_gso_max_segs(dev, max_segs);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GRO_MAX_SIZE]) {
                u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);

                if (dev->gro_max_size ^ gro_max_size) {
                        netif_set_gro_max_size(dev, gro_max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GSO_IPV4_MAX_SIZE]) {
                u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]);

                if (dev->gso_ipv4_max_size ^ max_size) {
                        netif_set_gso_ipv4_max_size(dev, max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_GRO_IPV4_MAX_SIZE]) {
                u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]);

                if (dev->gro_ipv4_max_size ^ gro_max_size) {
                        netif_set_gro_ipv4_max_size(dev, gro_max_size);
                        status |= DO_SETLINK_MODIFIED;
                }
        }

        if (tb[IFLA_OPERSTATE])
                set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));

        if (tb[IFLA_LINKMODE]) {
                unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);

                /* Notify only on an actual change; the store itself is
                 * unconditional.
                 */
                if (dev->link_mode ^ value)
                        status |= DO_SETLINK_NOTIFY;
                WRITE_ONCE(dev->link_mode, value);
        }

        if (tb[IFLA_VFINFO_LIST]) {
                struct nlattr *vfinfo[IFLA_VF_MAX + 1];
                struct nlattr *attr;
                int rem;

                /* Each nested IFLA_VF_INFO is parsed and applied on its own. */
                nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) {
                        if (nla_type(attr) != IFLA_VF_INFO ||
                            nla_len(attr) < NLA_HDRLEN) {
                                err = -EINVAL;
                                goto errout;
                        }
                        err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX,
                                                          attr,
                                                          ifla_vf_policy,
                                                          NULL);
                        if (err < 0)
                                goto errout;
                        err = do_setvfinfo(dev, vfinfo);
                        if (err < 0)
                                goto errout;
                        status |= DO_SETLINK_NOTIFY;
                }
        }
        /* Earlier steps only treat negative values as failure; start the
         * next section from a clean 0.
         */
        err = 0;

        if (tb[IFLA_VF_PORTS]) {
                struct nlattr *port[IFLA_PORT_MAX+1];
                struct nlattr *attr;
                int vf;
                int rem;

                err = -EOPNOTSUPP;
                if (!ops->ndo_set_vf_port)
                        goto errout;

                nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) {
                        if (nla_type(attr) != IFLA_VF_PORT ||
                            nla_len(attr) < NLA_HDRLEN) {
                                err = -EINVAL;
                                goto errout;
                        }
                        err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
                                                          attr,
                                                          ifla_port_policy,
                                                          NULL);
                        if (err < 0)
                                goto errout;
                        /* A port entry must say which VF it applies to. */
                        if (!port[IFLA_PORT_VF]) {
                                err = -EOPNOTSUPP;
                                goto errout;
                        }
                        vf = nla_get_u32(port[IFLA_PORT_VF]);
                        err = ops->ndo_set_vf_port(dev, vf, port);
                        if (err < 0)
                                goto errout;
                        status |= DO_SETLINK_NOTIFY;
                }
        }
        err = 0;

        if (tb[IFLA_PORT_SELF]) {
                struct nlattr *port[IFLA_PORT_MAX+1];

                err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
                                                  tb[IFLA_PORT_SELF],
                                                  ifla_port_policy, NULL);
                if (err < 0)
                        goto errout;

                /* Same driver hook as for VF ports, with PORT_SELF_VF as
                 * the VF argument.
                 */
                err = -EOPNOTSUPP;
                if (ops->ndo_set_vf_port)
                        err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port);
                if (err < 0)
                        goto errout;
                status |= DO_SETLINK_NOTIFY;
        }

        if (tb[IFLA_AF_SPEC]) {
                struct nlattr *af;
                int rem;

                nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
                        struct rtnl_af_ops *af_ops;
                        int af_ops_srcu_index;

                        af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
                        if (!af_ops) {
                                err = -EAFNOSUPPORT;
                                goto errout;
                        }

                        /* set_link_af is called without a NULL check here;
                         * validate_linkmsg() at the top of this function
                         * rejects families that lack it.
                         */
                        err = af_ops->set_link_af(dev, af, extack);
                        rtnl_af_put(af_ops, af_ops_srcu_index);

                        if (err < 0)
                                goto errout;

                        status |= DO_SETLINK_NOTIFY;
                }
        }
        err = 0;

        if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) {
                err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN],
                                        tb[IFLA_PROTO_DOWN_REASON], extack);
                if (err)
                        goto errout;
                status |= DO_SETLINK_NOTIFY;
        }

        if (tb[IFLA_XDP]) {
                struct nlattr *xdp[IFLA_XDP_MAX + 1];
                u32 xdp_flags = 0;

                err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX,
                                                  tb[IFLA_XDP],
                                                  ifla_xdp_policy, NULL);
                if (err < 0)
                        goto errout;

                /* These two attributes are not accepted in a set request. */
                if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) {
                        err = -EINVAL;
                        goto errout;
                }

                if (xdp[IFLA_XDP_FLAGS]) {
                        xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]);
                        if (xdp_flags & ~XDP_FLAGS_MASK) {
                                err = -EINVAL;
                                goto errout;
                        }
                        /* At most one of the mode bits may be set. */
                        if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) {
                                err = -EINVAL;
                                goto errout;
                        }
                }

                if (xdp[IFLA_XDP_FD]) {
                        int expected_fd = -1;

                        /* XDP_FLAGS_REPLACE requires the expected fd to be
                         * given; without the flag -1 is passed.
                         */
                        if (xdp_flags & XDP_FLAGS_REPLACE) {
                                if (!xdp[IFLA_XDP_EXPECTED_FD]) {
                                        err = -EINVAL;
                                        goto errout;
                                }
                                expected_fd =
                                        nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
                        }

                        err = dev_change_xdp_fd(dev, extack,
                                                nla_get_s32(xdp[IFLA_XDP_FD]),
                                                expected_fd,
                                                xdp_flags);
                        if (err)
                                goto errout;
                        status |= DO_SETLINK_NOTIFY;
                }
        }

errout:
        /* Reached on success and on failure alike. */
        if (status & DO_SETLINK_MODIFIED) {
                if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
                        netif_state_change(dev);

                if (err < 0)
                        net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
                                             dev->name);
        }

        netdev_unlock_ops(dev);

        return err;
}

static struct net_device *rtnl_dev_get(struct net *net,
                                       struct nlattr *tb[])
{
        char ifname[ALTIFNAMSIZ];

        if (tb[IFLA_IFNAME])
                nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
        else if (tb[IFLA_ALT_IFNAME])
                nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ);
        else
                return NULL;

        return __dev_get_by_name(net, ifname);
}

/*
 * Parse a link message, locate the device in the sender's namespace by
 * ifindex or by name, and apply the attributes through do_setlink().
 *
 * Returns -EINVAL when the message identifies no device, -ENODEV when the
 * lookup finds nothing, or the result of parsing / do_setlink().
 */
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFLA_MAX+1];
        struct net_device *dev = NULL;
        struct rtnl_nets rtnl_nets;
        struct net *tgt_net;
        int err;

        /* Nothing has been acquired yet, so early failures return directly. */
        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                     ifla_policy, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, false);
        if (err < 0)
                return err;

        tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
        if (IS_ERR(tgt_net))
                return PTR_ERR(tgt_net);

        /* Both the sender's and the target namespace go into the locked set. */
        rtnl_nets_init(&rtnl_nets);
        rtnl_nets_add(&rtnl_nets, get_net(net));
        rtnl_nets_add(&rtnl_nets, tgt_net);

        rtnl_nets_lock(&rtnl_nets);

        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                dev = rtnl_dev_get(net, tb);
        else
                err = -EINVAL;

        if (!dev) {
                /* Keep -EINVAL from above; a failed lookup becomes -ENODEV. */
                if (!err)
                        err = -ENODEV;
        } else {
                err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
        }

        rtnl_nets_unlock(&rtnl_nets);
        rtnl_nets_destroy(&rtnl_nets);

        return err;
}

static int rtnl_group_dellink(const struct net *net, int group)
{
        struct net_device *dev, *aux;
        LIST_HEAD(list_kill);
        bool found = false;

        if (!group)
                return -EPERM;

        for_each_netdev(net, dev) {
                if (dev->group == group) {
                        const struct rtnl_link_ops *ops;

                        found = true;
                        ops = dev->rtnl_link_ops;
                        if (!ops || !ops->dellink)
                                return -EOPNOTSUPP;
                }
        }

        if (!found)
                return -ENODEV;

        for_each_netdev_safe(net, dev, aux) {
                if (dev->group == group) {
                        const struct rtnl_link_ops *ops;

                        ops = dev->rtnl_link_ops;
                        ops->dellink(dev, &list_kill);
                }
        }
        unregister_netdevice_many(&list_kill);

        return 0;
}

/*
 * Delete a single device through its rtnl link operations.
 *
 * Returns -EOPNOTSUPP when the device has no link ops or no dellink
 * callback.  Otherwise the device is queued via dellink and unregistered,
 * with @portid and @nlh forwarded to the notifying unregister helper.
 */
int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh)
{
        const struct rtnl_link_ops *link_ops = dev->rtnl_link_ops;
        LIST_HEAD(kill_list);

        if (!link_ops || !link_ops->dellink)
                return -EOPNOTSUPP;

        link_ops->dellink(dev, &kill_list);
        unregister_netdevice_many_notify(&kill_list, portid, nlh);

        return 0;
}
EXPORT_SYMBOL_GPL(rtnl_delete_link);

static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm = nlmsg_data(nlh);
        struct net *net = sock_net(skb->sk);
        u32 portid = NETLINK_CB(skb).portid;
        struct nlattr *tb[IFLA_MAX+1];
        struct net_device *dev = NULL;
        struct net *tgt_net = net;
        int netnsid = -1;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                     ifla_policy, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, true);
        if (err < 0)
                return err;

        if (tb[IFLA_TARGET_NETNSID]) {
                netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
                tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
                if (IS_ERR(tgt_net))
                        return PTR_ERR(tgt_net);
        }

        rtnl_net_lock(tgt_net);

        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                dev = rtnl_dev_get(tgt_net, tb);

        if (dev)
                err = rtnl_delete_link(dev, portid, nlh);
        else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                err = -ENODEV;
        else if (tb[IFLA_GROUP])
                err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
        else
                err = -EINVAL;

        rtnl_net_unlock(tgt_net);

        if (netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/*
 * Apply the flags requested in @ifm (if any) and send a flags notification.
 *
 * If the device was already RTNL_LINK_INITIALIZED, only the flag bits that
 * actually changed are reported.  Otherwise the device is marked
 * initialized and all bits (~0U) are reported as changed.
 *
 * Returns 0, or the negative error from __dev_change_flags().
 */
int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
                        u32 portid, const struct nlmsghdr *nlh)
{
        unsigned int old_flags = dev->flags;
        unsigned int changed;

        if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
                int err = __dev_change_flags(dev,
                                             rtnl_dev_combine_flags(dev, ifm),
                                             NULL);

                if (err < 0)
                        return err;
        }

        if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
                changed = old_flags ^ dev->flags;
        } else {
                dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
                changed = ~0U;
        }

        __dev_notify_flags(dev, old_flags, changed, portid, nlh);
        return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);

struct net_device *rtnl_create_link(struct net *net, const char *ifname,
                                    unsigned char name_assign_type,
                                    const struct rtnl_link_ops *ops,
                                    struct nlattr *tb[],
                                    struct netlink_ext_ack *extack)
{
        struct net_device *dev;
        unsigned int num_tx_queues = 1;
        unsigned int num_rx_queues = 1;
        int err;

        if (tb[IFLA_NUM_TX_QUEUES])
                num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
        else if (ops->get_num_tx_queues)
                num_tx_queues = ops->get_num_tx_queues();

        if (tb[IFLA_NUM_RX_QUEUES])
                num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]);
        else if (ops->get_num_rx_queues)
                num_rx_queues = ops->get_num_rx_queues();

        if (num_tx_queues < 1 || num_tx_queues > 4096) {
                NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
                return ERR_PTR(-EINVAL);
        }

        if (num_rx_queues < 1 || num_rx_queues > 4096) {
                NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
                return ERR_PTR(-EINVAL);
        }

        if (ops->alloc) {
                dev = ops->alloc(tb, ifname, name_assign_type,
                                 num_tx_queues, num_rx_queues);
                if (IS_ERR(dev))
                        return dev;
        } else {
                dev = alloc_netdev_mqs(ops->priv_size, ifname,
                                       name_assign_type, ops->setup,
                                       num_tx_queues, num_rx_queues);
        }

        if (!dev)
                return ERR_PTR(-ENOMEM);

        err = validate_linkmsg(dev, tb, extack);
        if (err < 0) {
                free_netdev(dev);
                return ERR_PTR(err);
        }

        dev_net_set(dev, net);
        dev->rtnl_link_ops = ops;
        dev->rtnl_link_state = RTNL_LINK_INITIALIZING;

        if (tb[IFLA_MTU]) {
                u32 mtu = nla_get_u32(tb[IFLA_MTU]);

                err = dev_validate_mtu(dev, mtu, extack);
                if (err) {
                        free_netdev(dev);
                        return ERR_PTR(err);
                }
                dev->mtu = mtu;
        }
        if (tb[IFLA_ADDRESS]) {
                __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]),
                               nla_len(tb[IFLA_ADDRESS]));
                dev->addr_assign_type = NET_ADDR_SET;
        }
        if (tb[IFLA_BROADCAST])
                memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]),
                                nla_len(tb[IFLA_BROADCAST]));
        if (tb[IFLA_TXQLEN])
                dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]);
        if (tb[IFLA_OPERSTATE])
                set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
        if (tb[IFLA_LINKMODE])
                dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
        if (tb[IFLA_GROUP])
                dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
        if (tb[IFLA_GSO_MAX_SIZE])
                netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
        if (tb[IFLA_GSO_MAX_SEGS])
                netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
        if (tb[IFLA_GRO_MAX_SIZE])
                netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
        if (tb[IFLA_GSO_IPV4_MAX_SIZE])
                netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]));
        if (tb[IFLA_GRO_IPV4_MAX_SIZE])
                netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]));

        return dev;
}
EXPORT_SYMBOL(rtnl_create_link);

/*
 * Attribute tables used while processing a new-link request.  rtnl_newlink()
 * allocates one of these with kmalloc() rather than placing the arrays on
 * the stack.
 */
struct rtnl_newlink_tbs {
        /* Top-level IFLA_* attributes of the message. */
        struct nlattr *tb[IFLA_MAX + 1];
        /* Nested IFLA_INFO_* attributes from IFLA_LINKINFO (zeroed if absent). */
        struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
        /* Kind-specific attributes parsed from IFLA_INFO_DATA. */
        struct nlattr *attr[RTNL_MAX_TYPE + 1];
        /* Master-specific attributes parsed from IFLA_INFO_SLAVE_DATA. */
        struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
};

/*
 * Apply a new-link request to a device that already exists.
 *
 * NLM_F_EXCL yields -EEXIST and NLM_F_REPLACE yields -EOPNOTSUPP.  Otherwise
 * IFLA_INFO_DATA is handed to the device's own ops->changelink and
 * IFLA_INFO_SLAVE_DATA to the master's ops->slave_changelink, after which
 * the generic attributes are applied by do_setlink().
 *
 * Returns 0 or a negative errno.  Note that an error from a later step does
 * not undo changes already made by an earlier one.
 */
static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh,
                           const struct rtnl_link_ops *ops,
                           struct net_device *dev, struct net *tgt_net,
                           struct rtnl_newlink_tbs *tbs,
                           struct nlattr **data,
                           struct netlink_ext_ack *extack)
{
        struct nlattr ** const linkinfo = tbs->linkinfo;
        struct nlattr ** const tb = tbs->tb;
        int status = 0;
        int err;

        /* The device exists, so an exclusive create cannot succeed. */
        if (nlh->nlmsg_flags & NLM_F_EXCL)
                return -EEXIST;

        if (nlh->nlmsg_flags & NLM_F_REPLACE)
                return -EOPNOTSUPP;

        if (linkinfo[IFLA_INFO_DATA]) {
                /* The ops resolved for the request must be the device's own. */
                if (!ops || ops != dev->rtnl_link_ops || !ops->changelink)
                        return -EOPNOTSUPP;

                err = ops->changelink(dev, tb, data, extack);
                if (err < 0)
                        return err;

                status |= DO_SETLINK_NOTIFY;
        }

        if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
                const struct rtnl_link_ops *m_ops = NULL;
                struct nlattr **slave_data = NULL;
                struct net_device *master_dev;

                /* Slave data is interpreted by the ops of the device's master. */
                master_dev = netdev_master_upper_dev_get(dev);
                if (master_dev)
                        m_ops = master_dev->rtnl_link_ops;

                if (!m_ops || !m_ops->slave_changelink)
                        return -EOPNOTSUPP;

                /* tbs->slave_attr only has room for RTNL_SLAVE_MAX_TYPE + 1 entries. */
                if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
                        return -EINVAL;

                /* With slave_maxtype == 0 nothing is parsed and slave_data stays NULL. */
                if (m_ops->slave_maxtype) {
                        err = nla_parse_nested_deprecated(tbs->slave_attr,
                                                          m_ops->slave_maxtype,
                                                          linkinfo[IFLA_INFO_SLAVE_DATA],
                                                          m_ops->slave_policy, extack);
                        if (err < 0)
                                return err;

                        slave_data = tbs->slave_attr;
                }

                err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack);
                if (err < 0)
                        return err;

                status |= DO_SETLINK_NOTIFY;
        }

        /* do_setlink() starts from the status accumulated above. */
        return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status);
}

/*
 * Run do_setlink() with the same attributes on every device in @net whose
 * group equals @group.
 *
 * Stops at the first failure and returns that error; devices handled before
 * it keep their changes.  Returns 0 when every matching device succeeded,
 * including the case where no device matches.
 */
static int rtnl_group_changelink(const struct sk_buff *skb,
                                 struct net *net, struct net *tgt_net,
                                 int group, struct ifinfomsg *ifm,
                                 struct netlink_ext_ack *extack,
                                 struct nlattr **tb)
{
        struct net_device *dev, *next;

        for_each_netdev_safe(net, dev, next) {
                int err;

                if (dev->group != group)
                        continue;

                err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
                if (err < 0)
                        return err;
        }

        return 0;
}

/*
 * Create, register and configure a brand-new device of kind @ops.
 *
 * The device is built by rtnl_create_link(), registered through ops->newlink
 * when available (register_netdevice() otherwise), and then configured with
 * rtnl_configure_link() and, if IFLA_MASTER is present, do_set_master().
 * A failure after registration unregisters the device again.
 *
 * Returns 0 or a negative errno; -EOPNOTSUPP when @ops has neither an alloc
 * nor a setup callback.
 */
static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
                               const struct rtnl_link_ops *ops,
                               struct net *tgt_net, struct net *link_net,
                               struct net *peer_net,
                               const struct nlmsghdr *nlh,
                               struct nlattr **tb, struct nlattr **data,
                               struct netlink_ext_ack *extack)
{
        unsigned char name_assign_type = NET_NAME_USER;
        struct rtnl_newlink_params params = {
                .src_net = sock_net(skb->sk),
                .link_net = link_net,
                .peer_net = peer_net,
                .tb = tb,
                .data = data,
        };
        u32 portid = NETLINK_CB(skb).portid;
        struct net_device *dev;
        char ifname[IFNAMSIZ];
        int err;

        if (!ops->alloc && !ops->setup)
                return -EOPNOTSUPP;

        if (tb[IFLA_IFNAME]) {
                nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
        } else {
                /* No name supplied: use the "<kind>%d" template instead. */
                snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
                name_assign_type = NET_NAME_ENUM;
        }

        dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb,
                               extack);
        if (IS_ERR(dev)) {
                err = PTR_ERR(dev);
                goto out;
        }

        /* Carry over the ifindex from the request header. */
        dev->ifindex = ifm->ifi_index;

        if (ops->newlink)
                err = ops->newlink(dev, &params, extack);
        else
                err = register_netdevice(dev);
        if (err < 0) {
                /* Registration failed, so the device only needs freeing. */
                free_netdev(dev);
                goto out;
        }

        netdev_lock_ops(dev);

        err = rtnl_configure_link(dev, ifm, portid, nlh);
        if (err < 0)
                goto out_unregister;
        if (tb[IFLA_MASTER]) {
                err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
                if (err)
                        goto out_unregister;
        }

        netdev_unlock_ops(dev);
out:
        return err;
out_unregister:
        /* Drop the ops lock before tearing the registered device down. */
        netdev_unlock_ops(dev);
        if (ops->newlink) {
                LIST_HEAD(list_kill);

                /*
                 * NOTE(review): ops->dellink is called without a NULL check;
                 * presumably it is always set for ops that can create
                 * devices - confirm against the ops registration code.
                 */
                ops->dellink(dev, &list_kill);
                unregister_netdevice_many(&list_kill);
        } else {
                unregister_netdevice(dev);
        }
        goto out;
}

/*
 * Determine the network namespace for the peer device of a link kind that
 * declares a peer attribute (ops->peer_type).
 *
 * When the kind-specific data carries the peer attribute, its nested
 * ifinfomsg attributes are parsed, optionally validated through
 * ops->validate, and the namespace is resolved from them.  Otherwise the
 * namespace is resolved from the top-level attributes @tbp.
 *
 * Returns the result of rtnl_link_get_net_ifla(), or an ERR_PTR() when
 * parsing or validation fails.
 */
static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops,
                                     struct nlattr *tbp[],
                                     struct nlattr *data[],
                                     struct netlink_ext_ack *extack)
{
        struct nlattr *peer_tb[IFLA_MAX + 1];
        int err;

        if (data && data[ops->peer_type]) {
                err = rtnl_nla_parse_ifinfomsg(peer_tb, data[ops->peer_type],
                                               extack);
                if (err < 0)
                        return ERR_PTR(err);

                if (ops->validate) {
                        err = ops->validate(peer_tb, NULL, extack);
                        if (err < 0)
                                return ERR_PTR(err);
                }

                return rtnl_link_get_net_ifla(peer_tb);
        }

        return rtnl_link_get_net_ifla(tbp);
}

/*
 * Core of new-link handling: decide whether the request modifies an existing
 * device, modifies a device group, or creates a new device.
 *
 * - An existing device (found by ifindex or name) goes to rtnl_changelink().
 * - Without NLM_F_CREATE, a request that names no device but carries
 *   IFLA_GROUP goes to rtnl_group_changelink(); anything else is -ENODEV.
 * - With NLM_F_CREATE, a new device is created via rtnl_newlink_create(),
 *   unless IFLA_MAP/IFLA_PROTINFO are present or the kind is unknown
 *   (-EOPNOTSUPP).
 */
static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                          const struct rtnl_link_ops *ops,
                          struct net *tgt_net, struct net *link_net,
                          struct net *peer_net,
                          struct rtnl_newlink_tbs *tbs,
                          struct nlattr **data,
                          struct netlink_ext_ack *extack)
{
        struct nlattr ** const tb = tbs->tb;
        struct ifinfomsg *ifm = nlmsg_data(nlh);
        struct net *src_net = sock_net(skb->sk);
        struct net_device *dev = NULL;
        bool link_specified = false;
        struct net *lookup_net;

        /* For an exclusive create, look for an existing device in the
         * target namespace; otherwise look in the sender's namespace.
         */
        if ((nlh->nlmsg_flags & NLM_F_CREATE) &&
            (nlh->nlmsg_flags & NLM_F_EXCL))
                lookup_net = tgt_net;
        else
                lookup_net = src_net;

        if (ifm->ifi_index < 0) {
                NL_SET_ERR_MSG(extack, "ifindex can't be negative");
                return -EINVAL;
        }

        if (ifm->ifi_index > 0) {
                link_specified = true;
                dev = __dev_get_by_index(lookup_net, ifm->ifi_index);
        } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
                link_specified = true;
                dev = rtnl_dev_get(lookup_net, tb);
        }

        if (dev)
                return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack);

        if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
                /* Nothing found and creation not requested: this is either
                 * a group-wide change or a reference to a missing device.
                 */
                if (!link_specified && tb[IFLA_GROUP])
                        return rtnl_group_changelink(skb, src_net, tgt_net,
                                                     nla_get_u32(tb[IFLA_GROUP]),
                                                     ifm, extack, tb);

                return -ENODEV;
        }

        if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
                return -EOPNOTSUPP;

        if (!ops) {
                NL_SET_ERR_MSG(extack, "Unknown device type");
                return -EOPNOTSUPP;
        }

        return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh,
                                   tb, data, extack);
}

/*
 * Top-level processing of a new-link request.
 *
 * Parses the message and its nested IFLA_LINKINFO, resolves the link ops for
 * the requested kind (trying a module load when CONFIG_MODULES is set),
 * parses and validates the kind-specific data, collects the peer, target and
 * link namespaces into an rtnl_nets set, and runs __rtnl_newlink() with that
 * set locked.
 *
 * Returns 0 or a negative errno.
 */
static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *tgt_net, *link_net = NULL, *peer_net = NULL;
        struct nlattr **tb, **linkinfo, **data = NULL;
        struct rtnl_link_ops *ops = NULL;
        struct rtnl_newlink_tbs *tbs;
        struct rtnl_nets rtnl_nets;
        int ops_srcu_index;
        int ret;

        /* The attribute tables are heap-allocated instead of living on the stack. */
        tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
        if (!tbs)
                return -ENOMEM;

        tb = tbs->tb;
        ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb,
                                     IFLA_MAX, ifla_policy, extack);
        if (ret < 0)
                goto free;

        ret = rtnl_ensure_unique_netns(tb, extack, false);
        if (ret < 0)
                goto free;

        linkinfo = tbs->linkinfo;
        if (tb[IFLA_LINKINFO]) {
                ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
                                                  tb[IFLA_LINKINFO],
                                                  ifla_info_policy, NULL);
                if (ret < 0)
                        goto free;
        } else {
                /* kmalloc() does not zero, so clear the table explicitly. */
                memset(linkinfo, 0, sizeof(tbs->linkinfo));
        }

        if (linkinfo[IFLA_INFO_KIND]) {
                char kind[MODULE_NAME_LEN];

                nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
                ops = rtnl_link_ops_get(kind, &ops_srcu_index);
#ifdef CONFIG_MODULES
                if (!ops) {
                        /* Kind not registered: try loading its module, then retry. */
                        request_module("rtnl-link-%s", kind);
                        ops = rtnl_link_ops_get(kind, &ops_srcu_index);
                }
#endif
        }

        rtnl_nets_init(&rtnl_nets);

        if (ops) {
                /* tbs->attr only has room for RTNL_MAX_TYPE + 1 entries. */
                if (ops->maxtype > RTNL_MAX_TYPE) {
                        ret = -EINVAL;
                        goto put_ops;
                }

                if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
                        ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
                                                          linkinfo[IFLA_INFO_DATA],
                                                          ops->policy, extack);
                        if (ret < 0)
                                goto put_ops;

                        data = tbs->attr;
                }

                if (ops->validate) {
                        ret = ops->validate(tb, data, extack);
                        if (ret < 0)
                                goto put_ops;
                }

                if (ops->peer_type) {
                        peer_net = rtnl_get_peer_net(ops, tb, data, extack);
                        if (IS_ERR(peer_net)) {
                                ret = PTR_ERR(peer_net);
                                goto put_ops;
                        }
                        if (peer_net)
                                rtnl_nets_add(&rtnl_nets, peer_net);
                }
        }

        tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN);
        if (IS_ERR(tgt_net)) {
                ret = PTR_ERR(tgt_net);
                goto put_net;
        }

        rtnl_nets_add(&rtnl_nets, tgt_net);

        if (tb[IFLA_LINK_NETNSID]) {
                int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);

                /* The link netns id is resolved relative to the target namespace. */
                link_net = get_net_ns_by_id(tgt_net, id);
                if (!link_net) {
                        NL_SET_ERR_MSG(extack, "Unknown network namespace id");
                        ret =  -EINVAL;
                        goto put_net;
                }

                /* Added before the capability check so the put_net path covers it. */
                rtnl_nets_add(&rtnl_nets, link_net);

                if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        goto put_net;
                }
        }

        rtnl_nets_lock(&rtnl_nets);
        ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack);
        rtnl_nets_unlock(&rtnl_nets);

put_net:
        /* Presumably releases the namespaces added to the set - confirm in rtnl_nets_destroy(). */
        rtnl_nets_destroy(&rtnl_nets);
put_ops:
        if (ops)
                rtnl_link_ops_put(ops, ops_srcu_index);
free:
        kfree(tbs);
        return ret;
}

/*
 * Validate and parse a get-link request into @tb.
 *
 * The message must be large enough for a struct ifinfomsg.  Sockets without
 * strict checking get the lenient deprecated parse only.  With strict
 * checking, the header fields other than the index must be zero and only
 * IFLA_IFNAME, IFLA_ALT_IFNAME, IFLA_EXT_MASK and IFLA_TARGET_NETNSID are
 * accepted as attributes.
 *
 * Returns 0 or a negative errno.
 */
static int rtnl_valid_getlink_req(struct sk_buff *skb,
                                  const struct nlmsghdr *nlh,
                                  struct nlattr **tb,
                                  struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm;
        int type, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for get link");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                              ifla_policy, extack);

        ifm = nlmsg_data(nlh);
        if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
            ifm->ifi_change) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX,
                                            ifla_policy, extack);
        if (err)
                return err;

        /* Reject any attribute outside the small set a get request may use. */
        for (type = 0; type <= IFLA_MAX; type++) {
                if (!tb[type])
                        continue;

                if (type == IFLA_IFNAME ||
                    type == IFLA_ALT_IFNAME ||
                    type == IFLA_EXT_MASK ||
                    type == IFLA_TARGET_NETNSID)
                        continue;

                NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request");
                return -EINVAL;
        }

        return 0;
}

static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct net *tgt_net = net;
        struct ifinfomsg *ifm;
        struct nlattr *tb[IFLA_MAX+1];
        struct net_device *dev = NULL;
        struct sk_buff *nskb;
        int netnsid = -1;
        int err;
        u32 ext_filter_mask = 0;

        err = rtnl_valid_getlink_req(skb, nlh, tb, extack);
        if (err < 0)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, true);
        if (err < 0)
                return err;

        if (tb[IFLA_TARGET_NETNSID]) {
                netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
                tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
                if (IS_ERR(tgt_net))
                        return PTR_ERR(tgt_net);
        }

        if (tb[IFLA_EXT_MASK])
                ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);

        err = -EINVAL;
        ifm = nlmsg_data(nlh);
        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                dev = rtnl_dev_get(tgt_net, tb);
        else
                goto out;

        err = -ENODEV;
        if (dev == NULL)
                goto out;

        err = -ENOBUFS;
        nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
        if (nskb == NULL)
                goto out;

        /* Synchronize the carrier state so we don't report a state
         * that we're not actually going to honour immediately; if
         * the driver just did a carrier off->on transition, we can
         * only TX if link watch work has run, but without this we'd
         * already report carrier on, even if it doesn't work yet.
         */
        linkwatch_sync_dev(dev);

        err = rtnl_fill_ifinfo(nskb, dev, net,
                               RTM_NEWLINK, NETLINK_CB(skb).portid,
                               nlh->nlmsg_seq, 0, 0, ext_filter_mask,
                               0, NULL, 0, netnsid, GFP_KERNEL);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in if_nlmsg_size */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(nskb);
        } else
                err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
out:
        if (netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/*
 * Add or remove one alternative interface name on @dev.
 *
 * @attr is validated against ifla_policy.  For RTM_NEWLINKPROP the property
 * list size after adding another name must stay below U16_MAX.  The name is
 * duplicated from the attribute and passed to the alt-name create or destroy
 * helper.  When creation succeeds the duplicate is not freed here, since the
 * pointer is cleared before kfree(); in every other case it is freed.
 *
 * Sets *@changed to true on success.  Returns 0 or a negative errno.
 */
static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
                           bool *changed, struct netlink_ext_ack *extack)
{
        char *name;
        int err;

        err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
        if (err)
                return err;

        if (cmd == RTM_NEWLINKPROP) {
                size_t total = rtnl_prop_list_size(dev);

                total += nla_total_size(ALTIFNAMSIZ);
                if (total >= U16_MAX) {
                        NL_SET_ERR_MSG(extack,
                                       "effective property list too long");
                        return -EINVAL;
                }
        }

        name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
        if (!name)
                return -ENOMEM;

        switch (cmd) {
        case RTM_NEWLINKPROP:
                err = netdev_name_node_alt_create(dev, name);
                /* On success the string must not be freed below. */
                if (!err)
                        name = NULL;
                break;
        case RTM_DELLINKPROP:
                err = netdev_name_node_alt_destroy(dev, name);
                break;
        default:
                WARN_ON_ONCE(1);
                err = -EINVAL;
                break;
        }

        kfree(name);
        if (!err)
                *changed = true;
        return err;
}

/*
 * Common implementation for adding and deleting link properties.
 *
 * Finds the device by ifindex or name in the sender's namespace and applies
 * @cmd to every IFLA_ALT_IFNAME entry nested in IFLA_PROP_LIST; other nested
 * attribute types are ignored.  A state-change notification is sent if at
 * least one entry was applied.
 *
 * Returns 0 (also when there is no IFLA_PROP_LIST), -EINVAL when no device
 * is identified, -ENODEV when it does not exist, or the first error from
 * rtnl_alt_ifname().  Entries applied before a failing one are kept, and no
 * notification is sent in that case.
 */
static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
                         struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFLA_MAX + 1];
        struct nlattr *prop;
        struct net_device *dev;
        struct ifinfomsg *ifm;
        bool changed = false;
        int err, rem;

        err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
        if (err)
                return err;

        err = rtnl_ensure_unique_netns(tb, extack, true);
        if (err)
                return err;

        ifm = nlmsg_data(nlh);
        if (ifm->ifi_index > 0)
                dev = __dev_get_by_index(net, ifm->ifi_index);
        else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
                dev = rtnl_dev_get(net, tb);
        else
                return -EINVAL;

        if (!dev)
                return -ENODEV;

        if (!tb[IFLA_PROP_LIST])
                return 0;

        nla_for_each_nested(prop, tb[IFLA_PROP_LIST], rem) {
                if (nla_type(prop) != IFLA_ALT_IFNAME)
                        continue;

                err = rtnl_alt_ifname(cmd, dev, prop, &changed, extack);
                if (err)
                        return err;
        }

        if (changed)
                netdev_state_change(dev);
        return 0;
}

/*
 * Thin wrapper: runs rtnl_linkprop() with cmd = RTM_NEWLINKPROP, i.e. the
 * alternative names listed in the message are added to the device.
 */
static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack);
}

/*
 * Thin wrapper: runs rtnl_linkprop() with cmd = RTM_DELLINKPROP, i.e. the
 * alternative names listed in the message are removed from the device.
 */
static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
}

/*
 * Compute a buffer size for a link dump based on the request's
 * IFLA_EXT_MASK.
 *
 * Returns NLMSG_GOODSIZE when the message is too short for its header or
 * carries no (non-zero) extended filter mask.  Otherwise returns the
 * netlink total size of the largest if_nlmsg_size() over all devices in the
 * sender's namespace.
 */
static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb,
                                          struct nlmsghdr *nlh)
{
        struct net *net = sock_net(skb->sk);
        size_t min_ifinfo_dump_size = 0;
        u32 ext_filter_mask = 0;
        struct net_device *dev;
        struct nlattr *nla;
        int hdrlen, rem;

        /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
        if (nlmsg_len(nlh) < sizeof(struct ifinfomsg))
                hdrlen = sizeof(struct rtgenmsg);
        else
                hdrlen = sizeof(struct ifinfomsg);

        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
                return NLMSG_GOODSIZE;

        /* Only a correctly sized IFLA_EXT_MASK is honoured; the last one wins. */
        nla_for_each_attr_type(nla, IFLA_EXT_MASK,
                               nlmsg_attrdata(nlh, hdrlen),
                               nlmsg_attrlen(nlh, hdrlen), rem) {
                if (nla_len(nla) != sizeof(u32))
                        continue;

                ext_filter_mask = nla_get_u32(nla);
        }

        if (!ext_filter_mask)
                return NLMSG_GOODSIZE;

        /*
         * Walk all net devices and keep the largest per-device message
         * size for this filter mask; that is the minimum dump buffer size.
         */
        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                min_ifinfo_dump_size = max(min_ifinfo_dump_size,
                                           if_nlmsg_size(dev, ext_filter_mask));
        }
        rcu_read_unlock();

        return nlmsg_total_size(min_ifinfo_dump_size);
}

/*
 * Run the dump callback of every registered family for the message type of
 * the current request.
 *
 * cb->family records where to resume: families below it are skipped, and
 * the per-dump state in @cb is reset whenever a family beyond the resume
 * point is started.  PF_PACKET is always skipped.  The loop stops at the
 * first dumpit that returns non-zero.
 *
 * Returns skb->len when anything was written, otherwise the last dumpit
 * result.
 */
static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
        int type = cb->nlh->nlmsg_type - RTM_BASE;
        int s_idx = cb->family;
        int ret = 0;
        int idx;

        if (!s_idx)
                s_idx = 1;

        for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
                struct rtnl_link __rcu **tab;
                struct rtnl_link *link;
                rtnl_dumpit_func dumpit;

                if (idx < s_idx || idx == PF_PACKET)
                        continue;

                if (type < 0 || type >= RTM_NR_MSGTYPES)
                        continue;

                tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]);
                if (!tab)
                        continue;

                link = rcu_dereference_rtnl(tab[type]);
                if (!link)
                        continue;

                dumpit = link->dumpit;
                if (!dumpit)
                        continue;

                /* Starting a new family: forget the previous family's state. */
                if (idx > s_idx) {
                        memset(&cb->args[0], 0, sizeof(cb->args));
                        cb->prev_seq = 0;
                        cb->seq = 0;
                }

                ret = dumpit(skb, cb);
                if (ret)
                        break;
        }
        cb->family = idx;

        if (skb->len)
                return skb->len;

        return ret;
}

/*
 * Allocate an skb sized by if_nlmsg_size(dev, 0) and fill it with a link
 * message for @dev via rtnl_fill_ifinfo().
 *
 * When nlmsg_report(@nlh) is true the sequence number of @nlh is used;
 * otherwise the sequence number is 0 and @portid is forced to 0.
 *
 * Returns the filled skb, or NULL on failure, in which case the error is
 * reported through rtnl_set_sk_err() on RTNLGRP_LINK.
 */
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
                                       unsigned int change,
                                       u32 event, gfp_t flags, int *new_nsid,
                                       int new_ifindex, u32 portid,
                                       const struct nlmsghdr *nlh)
{
        struct net *net = dev_net(dev);
        struct sk_buff *msg;
        u32 seq = 0;
        int rc;

        msg = nlmsg_new(if_nlmsg_size(dev, 0), flags);
        if (!msg) {
                rc = -ENOBUFS;
                goto report;
        }

        if (nlmsg_report(nlh))
                seq = nlmsg_seq(nlh);
        else
                portid = 0;

        rc = rtnl_fill_ifinfo(msg, dev, dev_net(dev),
                              type, portid, seq, change, 0, 0, event,
                              new_nsid, new_ifindex, -1, flags);
        if (rc >= 0)
                return msg;

        /* -EMSGSIZE implies BUG in if_nlmsg_size() */
        WARN_ON(rc == -EMSGSIZE);
        kfree_skb(msg);

report:
        rtnl_set_sk_err(net, RTNLGRP_LINK, rc);
        return NULL;
}

/*
 * Hand an already built link message to rtnl_notify() for the RTNLGRP_LINK
 * group in the network namespace of @dev.
 */
void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags,
                       u32 portid, const struct nlmsghdr *nlh)
{
        rtnl_notify(skb, dev_net(dev), portid, RTNLGRP_LINK, nlh, flags);
}

/*
 * Build and send a link message for @dev.  Nothing is sent unless the
 * device is in the NETREG_REGISTERED state, or if building the message
 * fails.
 */
static void rtmsg_ifinfo_event(int type, struct net_device *dev,
                               unsigned int change, u32 event,
                               gfp_t flags, int *new_nsid, int new_ifindex,
                               u32 portid, const struct nlmsghdr *nlh)
{
        struct sk_buff *msg;

        if (dev->reg_state != NETREG_REGISTERED)
                return;

        msg = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
                                     new_ifindex, portid, nlh);
        if (!msg)
                return;

        rtmsg_ifinfo_send(msg, dev, flags, portid, nlh);
}

/*
 * Send a link message for @dev using the event value rtnl_get_event(0),
 * with no new_nsid and a new_ifindex of 0.
 */
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
                  gfp_t flags, u32 portid, const struct nlmsghdr *nlh)
{
        u32 event = rtnl_get_event(0);

        rtmsg_ifinfo_event(type, dev, change, event, flags,
                           NULL, 0, portid, nlh);
}

/*
 * Send a link message for @dev that carries @new_nsid and @new_ifindex.
 * The event value is rtnl_get_event(0); portid is 0 and there is no
 * originating netlink header.
 */
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
                         gfp_t flags, int *new_nsid, int new_ifindex)
{
        u32 event = rtnl_get_event(0);

        rtmsg_ifinfo_event(type, dev, change, event, flags,
                           new_nsid, new_ifindex, 0, NULL);
}

/*
 * Append one AF_BRIDGE neighbour message describing @addr on @dev to @skb.
 *
 * The message always carries NDA_LLADDR (dev->addr_len bytes taken from
 * @addr) and carries NDA_VLAN only when @vid is non-zero.
 *
 * Returns 0 on success or -EMSGSIZE when the header or an attribute does
 * not fit; in the latter case the partially built message is cancelled.
 */
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
                                   struct net_device *dev,
                                   u8 *addr, u16 vid, u32 pid, u32 seq,
                                   int type, unsigned int flags,
                                   int nlflags, u16 ndm_state)
{
        struct nlmsghdr *hdr;
        struct ndmsg *ndm;

        hdr = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags);
        if (!hdr)
                return -EMSGSIZE;

        ndm = nlmsg_data(hdr);
        ndm->ndm_family = AF_BRIDGE;
        ndm->ndm_pad1 = 0;
        ndm->ndm_pad2 = 0;
        ndm->ndm_ifindex = dev->ifindex;
        ndm->ndm_state = ndm_state;
        ndm->ndm_flags = flags;
        ndm->ndm_type = 0;

        if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr) ||
            (vid && nla_put(skb, NDA_VLAN, sizeof(u16), &vid))) {
                nlmsg_cancel(skb, hdr);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, hdr);
        return 0;
}

/*
 * Payload size needed for one FDB notification about @dev: the aligned
 * ndmsg header plus room for the NDA_LLADDR and NDA_VLAN attributes.
 */
static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ndmsg));

        len += nla_total_size(dev->addr_len);   /* NDA_LLADDR */
        len += nla_total_size(sizeof(u16));     /* NDA_VLAN */

        return len;
}

/*
 * Build an FDB message of the given @type for @addr/@vid on @dev, flagged
 * NTF_SELF, and send it to the RTNLGRP_NEIGH group.  Allocation uses
 * GFP_ATOMIC.  On failure the error is reported via rtnl_set_sk_err().
 */
static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
                            u16 ndm_state)
{
        struct net *net = dev_net(dev);
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC);
        if (!msg) {
                rc = -ENOBUFS;
                goto report;
        }

        rc = nlmsg_populate_fdb_fill(msg, dev, addr, vid,
                                     0, 0, type, NTF_SELF, 0, ndm_state);
        if (rc < 0) {
                kfree_skb(msg);
                goto report;
        }

        rtnl_notify(msg, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
        return;

report:
        rtnl_set_sk_err(net, RTNLGRP_NEIGH, rc);
}

/*
 * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
 *
 * Rejects, with -EINVAL, a non-zero ndm_state that lacks NUD_PERMANENT, any
 * request carrying NDA_FLAGS_EXT, and any non-zero @vid.  Otherwise the
 * address is handed to dev_uc_add_excl() (unicast or link-local) or to
 * dev_mc_add_excl() (multicast).
 *
 * -EEXIST from those helpers is turned into success unless @flags has
 * NLM_F_EXCL set.
 */
int ndo_dflt_fdb_add(struct ndmsg *ndm,
                     struct nlattr *tb[],
                     struct net_device *dev,
                     const unsigned char *addr, u16 vid,
                     u16 flags)
{
        int rc;

        /* If aging addresses are supported device will need to
         * implement its own handler for this.
         */
        if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
                netdev_info(dev, "default FDB implementation only supports local addresses\n");
                return -EINVAL;
        }

        if (tb[NDA_FLAGS_EXT]) {
                netdev_info(dev, "invalid flags given to default FDB implementation\n");
                return -EINVAL;
        }

        if (vid) {
                netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
                return -EINVAL;
        }

        if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
                rc = dev_uc_add_excl(dev, addr);
        else if (is_multicast_ether_addr(addr))
                rc = dev_mc_add_excl(dev, addr);
        else
                rc = -EINVAL;

        /* Only return duplicate errors if NLM_F_EXCL is set */
        if (rc == -EEXIST && !(flags & NLM_F_EXCL))
                return 0;

        return rc;
}
EXPORT_SYMBOL(ndo_dflt_fdb_add);

/*
 * Extract a VLAN id from an optional attribute.
 *
 * A missing attribute yields *@p_vid = 0.  A present attribute must be
 * exactly sizeof(u16) long and hold a value that is non-zero and below
 * VLAN_VID_MASK; otherwise -EINVAL is returned with a message in @extack
 * and *@p_vid is left untouched.
 */
static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid,
                         struct netlink_ext_ack *extack)
{
        u16 parsed;

        if (!vlan_attr) {
                *p_vid = 0;
                return 0;
        }

        if (nla_len(vlan_attr) != sizeof(u16)) {
                NL_SET_ERR_MSG(extack, "invalid vlan attribute size");
                return -EINVAL;
        }

        parsed = nla_get_u16(vlan_attr);
        if (parsed == 0 || parsed >= VLAN_VID_MASK) {
                NL_SET_ERR_MSG(extack, "invalid vlan id");
                return -EINVAL;
        }

        *p_vid = parsed;
        return 0;
}

static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct ndmsg *ndm;
        struct nlattr *tb[NDA_MAX+1];
        struct net_device *dev;
        u8 *addr;
        u16 vid;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL,
                                     extack);
        if (err < 0)
                return err;

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
                NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
                NL_SET_ERR_MSG(extack, "invalid address");
                return -EINVAL;
        }

        if (dev->type != ARPHRD_ETHER) {
                NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices");
                return -EINVAL;
        }

        addr = nla_data(tb[NDA_LLADDR]);

        err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
        if (err)
                return err;

        err = -EOPNOTSUPP;

        /* Support fdb on master device the net/bridge default case */
        if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
            netif_is_bridge_port(dev)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);
                const struct net_device_ops *ops = br_dev->netdev_ops;
                bool notified = false;

                err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
                                       nlh->nlmsg_flags, &notified, extack);
                if (err)
                        goto out;
                else
                        ndm->ndm_flags &= ~NTF_MASTER;
        }

        /* Embedded bridge, macvlan, and any other device support */
        if ((ndm->ndm_flags & NTF_SELF)) {
                bool notified = false;

                if (dev->netdev_ops->ndo_fdb_add)
                        err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
                                                           vid,
                                                           nlh->nlmsg_flags,
                                                           &notified, extack);
                else
                        err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
                                               nlh->nlmsg_flags);

                if (!err && !notified) {
                        rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
                                        ndm->ndm_state);
                        ndm->ndm_flags &= ~NTF_SELF;
                }
        }
out:
        return err;
}

/*
 * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
 *
 * Only requests whose ndm_state includes NUD_PERMANENT are accepted; others
 * fail with -EINVAL.  The address is removed with dev_uc_del() (unicast or
 * link-local) or dev_mc_del() (multicast) and that helper's result is
 * returned.
 */
int ndo_dflt_fdb_del(struct ndmsg *ndm,
                     struct nlattr *tb[],
                     struct net_device *dev,
                     const unsigned char *addr, u16 vid)
{
        /* If aging addresses are supported device will need to
         * implement its own handler for this.
         */
        if (!(ndm->ndm_state & NUD_PERMANENT)) {
                netdev_info(dev, "default FDB implementation only supports local addresses\n");
                return -EINVAL;
        }

        if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr))
                return dev_uc_del(dev, addr);

        if (is_multicast_ether_addr(addr))
                return dev_mc_del(dev, addr);

        return -EINVAL;
}
EXPORT_SYMBOL(ndo_dflt_fdb_del);

/*
 * Handle a request to delete FDB entries.
 *
 * When NLM_F_BULK is set in the request flags the message is passed on to
 * the ndo_fdb_del_bulk callbacks and no address or VLAN is extracted here;
 * otherwise a single entry identified by NDA_LLADDR (and optional NDA_VLAN)
 * is deleted through ndo_fdb_del or ndo_dflt_fdb_del().
 *
 * The sender must have CAP_NET_ADMIN.  Returns 0 on success, -EOPNOTSUPP
 * when no applicable callback handled the request, or another negative
 * errno.
 */
static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
        struct net *net = sock_net(skb->sk);
        const struct net_device_ops *ops;
        struct ndmsg *ndm;
        struct nlattr *tb[NDA_MAX+1];
        struct net_device *dev;
        __u8 *addr = NULL;
        int err;
        u16 vid;

        if (!netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (!del_bulk) {
                err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
                                             NULL, extack);
        } else {
                /* For bulk delete, the drivers will parse the message with
                 * policy.
                 */
                err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
        }
        if (err < 0)
                return err;

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_ifindex == 0) {
                NL_SET_ERR_MSG(extack, "invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ndm->ndm_ifindex);
        if (dev == NULL) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        /* addr and vid are only set up, and only used below, for the
         * single-entry (non-bulk) case.
         */
        if (!del_bulk) {
                if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
                        NL_SET_ERR_MSG(extack, "invalid address");
                        return -EINVAL;
                }
                addr = nla_data(tb[NDA_LLADDR]);

                err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
                if (err)
                        return err;
        }

        if (dev->type != ARPHRD_ETHER) {
                NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices");
                return -EINVAL;
        }

        /* Result if no callback below ends up being invoked. */
        err = -EOPNOTSUPP;

        /* Support fdb on master device the net/bridge default case */
        if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
            netif_is_bridge_port(dev)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);
                bool notified = false;

                ops = br_dev->netdev_ops;
                if (!del_bulk) {
                        if (ops->ndo_fdb_del)
                                err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
                                                       &notified, extack);
                } else {
                        if (ops->ndo_fdb_del_bulk)
                                err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
                }

                /* A failure (or missing callback) on the master aborts the
                 * request before the NTF_SELF path is tried.
                 */
                if (err)
                        goto out;
                else
                        ndm->ndm_flags &= ~NTF_MASTER;
        }

        /* Embedded bridge, macvlan, and any other device support */
        if (ndm->ndm_flags & NTF_SELF) {
                bool notified = false;

                ops = dev->netdev_ops;
                if (!del_bulk) {
                        if (ops->ndo_fdb_del)
                                err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
                                                       &notified, extack);
                        else
                                err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
                } else {
                        /* in case err was cleared by NTF_MASTER call */
                        err = -EOPNOTSUPP;
                        if (ops->ndo_fdb_del_bulk)
                                err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
                }

                if (!err) {
                        /* No notification is sent from here for bulk
                         * deletes, nor when the callee reports that it
                         * already notified.
                         */
                        if (!del_bulk && !notified)
                                rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
                                                ndm->ndm_state);
                        ndm->ndm_flags &= ~NTF_SELF;
                }
        }
out:
        return err;
}

/*
 * Emit one RTM_NEWNEIGH message (NTF_SELF, NUD_PERMANENT, NLM_F_MULTI) per
 * address on @list into @skb.
 *
 * *@idx is advanced by one for every list entry visited; entries whose
 * index is below the resume offset stored in the dump context are counted
 * but not emitted.  Returns 0, or the negative error from the fill helper,
 * in which case *@idx still refers to the entry that failed.
 */
static int nlmsg_populate_fdb(struct sk_buff *skb,
                              struct netlink_callback *cb,
                              struct net_device *dev,
                              int *idx,
                              struct netdev_hw_addr_list *list)
{
        struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
        u32 portid = NETLINK_CB(cb->skb).portid;
        u32 seq = cb->nlh->nlmsg_seq;
        struct netdev_hw_addr *ha;
        int err;

        list_for_each_entry(ha, &list->list, list) {
                if (*idx >= ctx->fdb_idx) {
                        err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
                                                      portid, seq,
                                                      RTM_NEWNEIGH, NTF_SELF,
                                                      NLM_F_MULTI,
                                                      NUD_PERMANENT);
                        if (err < 0)
                                return err;
                }
                *idx += 1;
        }

        return 0;
}

/**
 * ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @dev: netdevice
 * @filter_dev: ignored
 * @idx: the number of FDB table entries visited is added to *@idx
 *
 * Default netdevice operation to dump the device's unicast address list
 * followed by its multicast address list, under the device address lock.
 *
 * Return: 0 on success, -EINVAL if @dev is not of type ARPHRD_ETHER, or the
 * negative error returned while filling @skb.
 */
int ndo_dflt_fdb_dump(struct sk_buff *skb,
                      struct netlink_callback *cb,
                      struct net_device *dev,
                      struct net_device *filter_dev,
                      int *idx)
{
        int rc;

        if (dev->type != ARPHRD_ETHER)
                return -EINVAL;

        netif_addr_lock_bh(dev);

        rc = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
        if (!rc)
                rc = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc);

        netif_addr_unlock_bh(dev);

        return rc;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);

/*
 * Strict validation of an FDB dump request.
 *
 * The ndmsg header must be complete and have every field other than
 * ndm_family and ndm_ifindex zeroed.  *@brport_idx is taken from
 * ndm_ifindex and may be overridden by NDA_IFINDEX; *@br_idx is set from
 * NDA_MASTER when present.  Both attributes must be u32 sized, and any
 * other attribute is rejected.
 *
 * Returns 0 on success or a negative errno with a message in @extack.
 */
static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
                                 int *br_idx, int *brport_idx,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *tb[NDA_MAX + 1];
        struct ndmsg *ndm;
        int err, i;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
                return -EINVAL;
        }

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
            ndm->ndm_flags || ndm->ndm_type) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
                                            NDA_MAX, NULL, extack);
        if (err < 0)
                return err;

        *brport_idx = ndm->ndm_ifindex;

        /* Walk in attribute-type order so the first offender is reported. */
        for (i = 0; i <= NDA_MAX; ++i) {
                const struct nlattr *attr = tb[i];

                if (!attr)
                        continue;

                if (i == NDA_IFINDEX) {
                        if (nla_len(attr) != sizeof(u32)) {
                                NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request");
                                return -EINVAL;
                        }
                        *brport_idx = nla_get_u32(attr);
                } else if (i == NDA_MASTER) {
                        if (nla_len(attr) != sizeof(u32)) {
                                NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request");
                                return -EINVAL;
                        }
                        *br_idx = nla_get_u32(attr);
                } else {
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Legacy (non-strict) parsing of an FDB dump request.
 *
 * A payload that is exactly an ndmsg, or an ndmsg followed by one u32
 * attribute, is accepted as is and leaves both outputs untouched.  Any
 * other payload is parsed as an ifinfomsg: *@br_idx comes from IFLA_MASTER
 * when present and *@brport_idx from ifi_index.
 *
 * Returns 0, or -EINVAL when the ifinfomsg parse fails.
 */
static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
                                 int *br_idx, int *brport_idx,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_MAX+1];
        struct ifinfomsg *ifm;
        int payload = nlmsg_len(nlh);
        int err;

        /* A hack to preserve kernel<->userspace interface.
         * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
         * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails.
         * So, check for ndmsg with an optional u32 attribute (not used here).
         * Fortunately these sizes don't conflict with the size of ifinfomsg
         * with an optional attribute.
         */
        if (payload == sizeof(struct ndmsg) ||
            payload == sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))
                return 0;

        err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
                                     tb, IFLA_MAX, ifla_policy,
                                     extack);
        if (err < 0)
                return -EINVAL;

        if (err == 0 && tb[IFLA_MASTER])
                *br_idx = nla_get_u32(tb[IFLA_MASTER]);

        ifm = nlmsg_data(nlh);
        *brport_idx = ifm->ifi_index;

        return 0;
}

/*
 * Dump FDB entries for the devices in the requesting socket's namespace.
 *
 * The request may restrict the dump to one bridge (br_idx) and/or one
 * device (brport_idx); both are filled in by the strict or legacy request
 * validator depending on cb->strict_check.  Resume state lives in the
 * ndo_fdb_dump_context overlaid on cb->ctx: ctx->ifindex is the cursor used
 * by for_each_netdev_dump() and ctx->fdb_idx is written with the running
 * entry index (fidx) when the function returns.
 *
 * Returns skb->len, or a negative errno if request validation or the
 * bridge lookup fails.
 */
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct net_device_ops *ops = NULL, *cops = NULL;
        struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
        struct net_device *dev, *br_dev = NULL;
        struct net *net = sock_net(skb->sk);
        int brport_idx = 0;
        int br_idx = 0;
        int fidx = 0;
        int err;

        NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);

        if (cb->strict_check)
                err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
                                            cb->extack);
        else
                err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
                                            cb->extack);
        if (err < 0)
                return err;

        if (br_idx) {
                br_dev = __dev_get_by_index(net, br_idx);
                if (!br_dev)
                        return -ENODEV;

                ops = br_dev->netdev_ops;
        }

        for_each_netdev_dump(net, dev, ctx->ifindex) {
                /* Honour the single-device filter, if one was given. */
                if (brport_idx && (dev->ifindex != brport_idx))
                        continue;

                if (!br_idx) { /* user did not specify a specific bridge */
                        if (netif_is_bridge_port(dev)) {
                                br_dev = netdev_master_upper_dev_get(dev);
                                cops = br_dev->netdev_ops;
                        }
                } else {
                        /* Keep only the requested bridge itself and
                         * devices that pass both bridge-related checks
                         * below.
                         */
                        if (dev != br_dev &&
                            !netif_is_bridge_port(dev))
                                continue;

                        if (br_dev != netdev_master_upper_dev_get(dev) &&
                            !netif_is_bridge_master(dev))
                                continue;
                        cops = ops;
                }

                /* For a bridge port, first dump via the master's callback,
                 * passing the port as the filter device.
                 */
                if (netif_is_bridge_port(dev)) {
                        if (cops && cops->ndo_fdb_dump) {
                                err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
                                                        &fidx);
                                if (err == -EMSGSIZE)
                                        break;
                        }
                }

                /* Then dump the device's own table, falling back to the
                 * default implementation.  Only -EMSGSIZE stops the walk;
                 * other return values are not acted upon here.
                 */
                if (dev->netdev_ops->ndo_fdb_dump)
                        err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
                                                            &fidx);
                else
                        err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
                if (err == -EMSGSIZE)
                        break;

                cops = NULL;

                /* reset fdb offset to 0 for rest of the interfaces */
                ctx->fdb_idx = 0;
                fidx = 0;
        }

        /* Save the entry index: non-zero only if the loop was left early. */
        ctx->fdb_idx = fidx;

        return skb->len;
}

/*
 * Validate an FDB get request and extract its parameters.
 *
 * The ndmsg header must be complete, with pad, state and type fields zero
 * and no flags other than NTF_MASTER and NTF_SELF.  Outputs:
 *   *@ndm_flags  - ndm_flags from the header
 *   *@brport_idx - ndm_ifindex from the header
 *   *@br_idx     - NDA_MASTER, if present
 *   *@addr       - payload of NDA_LLADDR (must be ETH_ALEN long), if present
 *   *@vid        - NDA_VLAN as validated by fdb_vid_parse(), if present
 * NDA_VNI is tolerated but not interpreted here; any other attribute is
 * rejected.  @tb is filled with the parsed attributes.
 *
 * Returns 0 on success or a negative errno with a message in @extack.
 */
static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
                                struct nlattr **tb, u8 *ndm_flags,
                                int *br_idx, int *brport_idx, u8 **addr,
                                u16 *vid, struct netlink_ext_ack *extack)
{
        struct ndmsg *ndm;
        int err, i;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
                return -EINVAL;
        }

        ndm = nlmsg_data(nlh);
        if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
            ndm->ndm_type) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
                return -EINVAL;
        }

        if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) {
                NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
                                            NDA_MAX, nda_policy, extack);
        if (err < 0)
                return err;

        *ndm_flags = ndm->ndm_flags;
        *brport_idx = ndm->ndm_ifindex;

        /* Walk in attribute-type order so the first offender is reported. */
        for (i = 0; i <= NDA_MAX; ++i) {
                struct nlattr *attr = tb[i];

                if (!attr)
                        continue;

                switch (i) {
                case NDA_VNI:
                        /* Accepted; left in @tb untouched. */
                        break;
                case NDA_VLAN:
                        err = fdb_vid_parse(attr, vid, extack);
                        if (err)
                                return err;
                        break;
                case NDA_LLADDR:
                        if (nla_len(attr) != ETH_ALEN) {
                                NL_SET_ERR_MSG(extack, "Invalid address in fdb get request");
                                return -EINVAL;
                        }
                        *addr = nla_data(attr);
                        break;
                case NDA_MASTER:
                        *br_idx = nla_get_u32(attr);
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Handle a request to look up a single FDB entry.
 *
 * The request must carry a lookup address and identify either a device
 * (header ifindex) or a master (NDA_MASTER), but not both.  For a device,
 * empty flags or NTF_MASTER select the ops of its bridge master (the device
 * must be a bridge port); otherwise NTF_SELF is required and the device's
 * own ops are used.  The selected ndo_fdb_get fills a reply skb, which is
 * unicast back to the sender.
 *
 * Returns the result of rtnl_unicast() on success or a negative errno.
 */
static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net_device *dev = NULL, *br_dev = NULL;
        const struct net_device_ops *ops = NULL;
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[NDA_MAX + 1];
        struct sk_buff *reply;
        int brport_idx = 0;
        int br_idx = 0;
        u8 ndm_flags = 0;
        u8 *addr = NULL;
        u16 vid = 0;
        int err;

        err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx,
                                   &brport_idx, &addr, &vid, extack);
        if (err < 0)
                return err;

        if (!addr) {
                NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request");
                return -EINVAL;
        }

        if (brport_idx) {
                dev = __dev_get_by_index(net, brport_idx);
                if (!dev) {
                        NL_SET_ERR_MSG(extack, "Unknown device ifindex");
                        return -ENODEV;
                }
        }

        if (br_idx) {
                if (dev) {
                        NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive");
                        return -EINVAL;
                }

                br_dev = __dev_get_by_index(net, br_idx);
                if (!br_dev) {
                        NL_SET_ERR_MSG(extack, "Invalid master ifindex");
                        return -EINVAL;
                }
                ops = br_dev->netdev_ops;
        }

        if (dev && (!ndm_flags || (ndm_flags & NTF_MASTER))) {
                /* Look the entry up on the device's bridge master. */
                if (!netif_is_bridge_port(dev)) {
                        NL_SET_ERR_MSG(extack, "Device is not a bridge port");
                        return -EINVAL;
                }
                br_dev = netdev_master_upper_dev_get(dev);
                if (!br_dev) {
                        NL_SET_ERR_MSG(extack, "Master of device not found");
                        return -EINVAL;
                }
                ops = br_dev->netdev_ops;
        } else if (dev) {
                /* Look the entry up on the device itself. */
                if (!(ndm_flags & NTF_SELF)) {
                        NL_SET_ERR_MSG(extack, "Missing NTF_SELF");
                        return -EINVAL;
                }
                ops = dev->netdev_ops;
        }

        if (!dev && !br_dev) {
                NL_SET_ERR_MSG(extack, "No device specified");
                return -ENODEV;
        }

        if (!ops || !ops->ndo_fdb_get) {
                NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device");
                return -EOPNOTSUPP;
        }

        reply = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!reply)
                return -ENOBUFS;

        /* When a master was resolved, the lookup is made against it. */
        if (br_dev)
                dev = br_dev;

        err = ops->ndo_fdb_get(reply, tb, dev, addr, vid,
                               NETLINK_CB(in_skb).portid,
                               nlh->nlmsg_seq, extack);
        if (err) {
                kfree_skb(reply);
                return err;
        }

        return rtnl_unicast(reply, net, NETLINK_CB(in_skb).portid);
}

/*
 * Emit attribute @attrnum as a u8 holding 1 or 0 according to whether
 * @flag is set in @flags, but only when @flag is part of @mask.  Returns 0
 * when nothing is emitted, otherwise the result of nla_put_u8().
 */
static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
                               unsigned int attrnum, unsigned int flag)
{
        if (!(mask & flag))
                return 0;

        return nla_put_u8(skb, attrnum, !!(flags & flag));
}

/*
 * Fill @skb with an RTM_NEWLINK message (family AF_BRIDGE) describing @dev.
 *
 * The message carries the basic link attributes (name, MTU, operstate,
 * optional master, address and link), an IFLA_AF_SPEC nest holding
 * IFLA_BRIDGE_FLAGS (BRIDGE_FLAGS_SELF), IFLA_BRIDGE_MODE unless @mode is
 * BRIDGE_MODE_UNDEF and whatever the optional @vlan_fill callback adds, and
 * an IFLA_PROTINFO nest with one u8 per bridge-port flag selected by @mask,
 * valued from @flags.
 *
 * Returns 0 on success.  On failure the message is cancelled and the error
 * from @vlan_fill is returned if it failed, otherwise -EMSGSIZE.
 */
int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
                            struct net_device *dev, u16 mode,
                            u32 flags, u32 mask, int nlflags,
                            u32 filter_mask,
                            int (*vlan_fill)(struct sk_buff *skb,
                                             struct net_device *dev,
                                             u32 filter_mask))
{
        struct nlmsghdr *nlh;
        struct ifinfomsg *ifm;
        struct nlattr *br_afspec;
        struct nlattr *protinfo;
        /* A device that is not running is reported as IF_OPER_DOWN. */
        u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
        struct net_device *br_dev = netdev_master_upper_dev_get(dev);
        int err = 0;

        nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags);
        if (nlh == NULL)
                return -EMSGSIZE;

        ifm = nlmsg_data(nlh);
        ifm->ifi_family = AF_BRIDGE;
        ifm->__ifi_pad = 0;
        ifm->ifi_type = dev->type;
        ifm->ifi_index = dev->ifindex;
        ifm->ifi_flags = dev_get_flags(dev);
        ifm->ifi_change = 0;


        /* IFLA_MASTER, IFLA_ADDRESS and IFLA_LINK are conditional: they are
         * emitted only with a master, a non-zero address length, and an
         * iflink that differs from the ifindex, respectively.
         */
        if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
            nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
            nla_put_u8(skb, IFLA_OPERSTATE, operstate) ||
            (br_dev &&
             nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) ||
            (dev->addr_len &&
             nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
            (dev->ifindex != dev_get_iflink(dev) &&
             nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
                goto nla_put_failure;

        br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
        if (!br_afspec)
                goto nla_put_failure;

        if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) {
                nla_nest_cancel(skb, br_afspec);
                goto nla_put_failure;
        }

        if (mode != BRIDGE_MODE_UNDEF) {
                if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) {
                        nla_nest_cancel(skb, br_afspec);
                        goto nla_put_failure;
                }
        }
        if (vlan_fill) {
                /* err is only ever set here, so a non-zero value at the
                 * failure label always comes from the callback.
                 */
                err = vlan_fill(skb, dev, filter_mask);
                if (err) {
                        nla_nest_cancel(skb, br_afspec);
                        goto nla_put_failure;
                }
        }
        nla_nest_end(skb, br_afspec);

        protinfo = nla_nest_start(skb, IFLA_PROTINFO);
        if (!protinfo)
                goto nla_put_failure;

        /* One attribute per port flag; flags absent from @mask are skipped. */
        if (brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_GUARD, BR_BPDU_GUARD) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_FAST_LEAVE,
                                BR_MULTICAST_FAST_LEAVE) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_LEARNING, BR_LEARNING) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
            brport_nla_put_flag(skb, flags, mask,
                                IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
                nla_nest_cancel(skb, protinfo);
                goto nla_put_failure;
        }

        nla_nest_end(skb, protinfo);

        nlmsg_end(skb, nlh);
        return 0;
nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return err ? err : -EMSGSIZE;
}
EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);

/*
 * Validate a bridge link dump request and extract its extended filter mask.
 *
 * @nlh:          request header to validate
 * @strict_check: when true, the ifinfomsg header fields checked below must be
 *                zero and IFLA_EXT_MASK is the only attribute accepted
 * @filter_mask:  written with the IFLA_EXT_MASK value when that attribute is
 *                present; left untouched otherwise
 * @extack:       receives a human readable reason on rejection
 *
 * Returns 0 on success, -EINVAL for a rejected strict request, or the
 * negative error reported by the attribute parser.
 */
static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
                                    bool strict_check, u32 *filter_mask,
                                    struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_MAX+1];
        int type;
        int err;

        if (!strict_check) {
                err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
                                             tb, IFLA_MAX, ifla_policy,
                                             extack);
        } else {
                struct ifinfomsg *ifm;

                if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                        NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
                        return -EINVAL;
                }

                /* The strict form requires these header fields to be zero. */
                ifm = nlmsg_data(nlh);
                if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
                    ifm->ifi_change || ifm->ifi_index) {
                        NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
                        return -EINVAL;
                }

                err = nlmsg_parse_deprecated_strict(nlh,
                                                    sizeof(struct ifinfomsg),
                                                    tb, IFLA_MAX, ifla_policy,
                                                    extack);
        }
        if (err < 0)
                return err;

        /* new attributes should only be added with strict checking */
        for (type = 0; type <= IFLA_MAX; type++) {
                if (!tb[type])
                        continue;

                if (type == IFLA_EXT_MASK) {
                        *filter_mask = nla_get_u32(tb[type]);
                        continue;
                }

                /* Any other attribute is tolerated only by legacy requests. */
                if (strict_check) {
                        NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request");
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Dump callback producing bridge link messages for every device in the
 * requesting socket's namespace.
 *
 * For each device up to two entries are produced: one through the
 * ndo_bridge_getlink handler of its master device (when it has a master that
 * implements it) and one through the device's own handler. cb->args[0] holds
 * the number of entries already handled by earlier invocations, so entries
 * with a lower index are counted but not emitted again.
 *
 * Returns skb->len when the walk completes or when a handler fails after
 * data has been written to @skb; returns the handler's negative error when
 * it fails on an empty @skb; returns the validation error for a rejected
 * strict request.
 */
static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        u32 portid = NETLINK_CB(cb->skb).portid;
        u32 seq = nlh->nlmsg_seq;
        u32 filter_mask = 0;
        struct net_device *dev;
        int idx = 0;
        int err;

        /* Validation failures are fatal only for strict requests. */
        err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask,
                                       cb->extack);
        if (cb->strict_check && err < 0)
                return err;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                const struct net_device_ops *self_ops = dev->netdev_ops;
                struct net_device *master = netdev_master_upper_dev_get(dev);

                /* Entry filled by the master's handler. */
                if (master && master->netdev_ops->ndo_bridge_getlink) {
                        if (idx >= cb->args[0]) {
                                err = master->netdev_ops->ndo_bridge_getlink(
                                                skb, portid, seq, dev,
                                                filter_mask, NLM_F_MULTI);
                                if (err < 0 && err != -EOPNOTSUPP)
                                        goto fill_failed;
                        }
                        idx++;
                }

                /* Entry filled by the device's own handler. */
                if (self_ops->ndo_bridge_getlink) {
                        if (idx >= cb->args[0]) {
                                err = self_ops->ndo_bridge_getlink(skb, portid,
                                                                   seq, dev,
                                                                   filter_mask,
                                                                   NLM_F_MULTI);
                                if (err < 0 && err != -EOPNOTSUPP)
                                        goto fill_failed;
                        }
                        idx++;
                }
        }
        err = skb->len;
        goto out;

fill_failed:
        /* With data already queued, report the length instead of the error;
         * idx was not advanced past the failing entry.
         */
        if (likely(skb->len))
                err = skb->len;
out:
        rcu_read_unlock();
        cb->args[0] = idx;

        return err;
}

/*
 * Size of the buffer allocated for a bridge notification message: the
 * aligned ifinfomsg header plus one slot for each attribute listed below.
 */
static inline size_t bridge_nlmsg_size(void)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ifinfomsg));

        len += nla_total_size(IFNAMSIZ);                /* IFLA_IFNAME */
        len += nla_total_size(MAX_ADDR_LEN);            /* IFLA_ADDRESS */
        len += nla_total_size(sizeof(u32));             /* IFLA_MASTER */
        len += nla_total_size(sizeof(u32));             /* IFLA_MTU */
        len += nla_total_size(sizeof(u32));             /* IFLA_LINK */
        len += nla_total_size(sizeof(u32));             /* IFLA_OPERSTATE */
        len += nla_total_size(sizeof(u8));              /* IFLA_PROTINFO */
        len += nla_total_size(sizeof(struct nlattr));   /* IFLA_AF_SPEC */
        len += nla_total_size(sizeof(u16));             /* IFLA_BRIDGE_FLAGS */
        len += nla_total_size(sizeof(u16));             /* IFLA_BRIDGE_MODE */

        return len;
}

static int rtnl_bridge_notify(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct sk_buff *skb;
        int err = -EOPNOTSUPP;

        if (!dev->netdev_ops->ndo_bridge_getlink)
                return 0;

        skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC);
        if (!skb) {
                err = -ENOMEM;
                goto errout;
        }

        err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0);
        if (err < 0)
                goto errout;

        /* Notification info is only filled for bridge ports, not the bridge
         * device itself. Therefore, a zero notification length is valid and
         * should not result in an error.
         */
        if (!skb->len)
                goto errout;

        rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
        return 0;
errout:
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        if (err)
                rtnl_set_sk_err(net, RTNLGRP_LINK, err);
        return err;
}

static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct ifinfomsg *ifm;
        struct net_device *dev;
        struct nlattr *br_spec, *attr, *br_flags_attr = NULL;
        int rem, err = -EOPNOTSUPP;
        u16 flags = 0;

        if (nlmsg_len(nlh) < sizeof(*ifm))
                return -EINVAL;

        ifm = nlmsg_data(nlh);
        if (ifm->ifi_family != AF_BRIDGE)
                return -EPFNOSUPPORT;

        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
        if (br_spec) {
                nla_for_each_nested(attr, br_spec, rem) {
                        if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) {
                                if (nla_len(attr) < sizeof(flags))
                                        return -EINVAL;

                                br_flags_attr = attr;
                                flags = nla_get_u16(attr);
                        }

                        if (nla_type(attr) == IFLA_BRIDGE_MODE) {
                                if (nla_len(attr) < sizeof(u16))
                                        return -EINVAL;
                        }
                }
        }

        if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);

                if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) {
                        err = -EOPNOTSUPP;
                        goto out;
                }

                err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags,
                                                             extack);
                if (err)
                        goto out;

                flags &= ~BRIDGE_FLAGS_MASTER;
        }

        if ((flags & BRIDGE_FLAGS_SELF)) {
                if (!dev->netdev_ops->ndo_bridge_setlink)
                        err = -EOPNOTSUPP;
                else
                        err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
                                                                  flags,
                                                                  extack);
                if (!err) {
                        flags &= ~BRIDGE_FLAGS_SELF;

                        /* Generate event to notify upper layer of bridge
                         * change
                         */
                        err = rtnl_bridge_notify(dev);
                }
        }

        if (br_flags_attr)
                memcpy(nla_data(br_flags_attr), &flags, sizeof(flags));
out:
        return err;
}

static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct ifinfomsg *ifm;
        struct net_device *dev;
        struct nlattr *br_spec, *attr = NULL;
        int rem, err = -EOPNOTSUPP;
        u16 flags = 0;
        bool have_flags = false;

        if (nlmsg_len(nlh) < sizeof(*ifm))
                return -EINVAL;

        ifm = nlmsg_data(nlh);
        if (ifm->ifi_family != AF_BRIDGE)
                return -EPFNOSUPPORT;

        dev = __dev_get_by_index(net, ifm->ifi_index);
        if (!dev) {
                NL_SET_ERR_MSG(extack, "unknown ifindex");
                return -ENODEV;
        }

        br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
        if (br_spec) {
                nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec,
                                         rem) {
                        if (nla_len(attr) < sizeof(flags))
                                return -EINVAL;

                        have_flags = true;
                        flags = nla_get_u16(attr);
                        break;
                }
        }

        if (!flags || (flags & BRIDGE_FLAGS_MASTER)) {
                struct net_device *br_dev = netdev_master_upper_dev_get(dev);

                if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) {
                        err = -EOPNOTSUPP;
                        goto out;
                }

                err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
                if (err)
                        goto out;

                flags &= ~BRIDGE_FLAGS_MASTER;
        }

        if ((flags & BRIDGE_FLAGS_SELF)) {
                if (!dev->netdev_ops->ndo_bridge_dellink)
                        err = -EOPNOTSUPP;
                else
                        err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh,
                                                                  flags);

                if (!err) {
                        flags &= ~BRIDGE_FLAGS_SELF;

                        /* Generate event to notify upper layer of bridge
                         * change
                         */
                        err = rtnl_bridge_notify(dev);
                }
        }

        if (have_flags)
                memcpy(nla_data(attr), &flags, sizeof(flags));
out:
        return err;
}

/*
 * Decide whether the stats attribute @attrid should be handled.
 *
 * It must be selected in @mask and, when @idxattr is non-zero, it must be
 * exactly the attribute named by @idxattr.
 */
static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
{
        if (!(mask & IFLA_STATS_FILTER_BIT(attrid)))
                return false;

        return idxattr == 0 || idxattr == attrid;
}

static bool
rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id)
{
        return dev->netdev_ops &&
               dev->netdev_ops->ndo_has_offload_stats &&
               dev->netdev_ops->ndo_get_offload_stats &&
               dev->netdev_ops->ndo_has_offload_stats(dev, attr_id);
}

/*
 * Payload size of the ndo-provided offload statistic @attr_id for @dev:
 * one struct rtnl_link_stats64 when the device supplies it, 0 otherwise.
 */
static unsigned int
rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id)
{
        if (!rtnl_offload_xstats_have_ndo(dev, attr_id))
                return 0;

        return sizeof(struct rtnl_link_stats64);
}

/*
 * Append the ndo-provided offload statistic @attr_id of @dev to @skb.
 *
 * The attribute payload is reserved, zeroed and then handed to the device's
 * ndo_get_offload_stats handler to fill in.
 *
 * Returns 0 on success, -ENODATA when the device does not supply this
 * statistic, -EMSGSIZE when @skb has no room, or the handler's error.
 */
static int
rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id,
                             struct sk_buff *skb)
{
        unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id);
        struct nlattr *attr;
        void *payload;

        if (size == 0)
                return -ENODATA;

        attr = nla_reserve_64bit(skb, attr_id, size,
                                 IFLA_OFFLOAD_XSTATS_UNSPEC);
        if (attr == NULL)
                return -EMSGSIZE;

        /* Start from zeroed counters before the driver writes its values. */
        payload = nla_data(attr);
        memset(payload, 0, size);

        return dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, payload);
}

/*
 * Payload size of the HW statistics of kind @type for @dev: one
 * struct rtnl_hw_stats64 when that kind is enabled on the device, else 0.
 */
static unsigned int
rtnl_offload_xstats_get_size_stats(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type)
{
        if (!netdev_offload_xstats_enabled(dev, type))
                return 0;

        return sizeof(struct rtnl_hw_stats64);
}

/*
 * Status pair reported for one kind of offload statistics; emitted as the
 * IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST / _USED attributes.
 */
struct rtnl_offload_xstats_request_used {
        /* result of netdev_offload_xstats_enabled() for the stats kind */
        bool request;
        /* "used" value reported by netdev_offload_xstats_get(); false when
         * the stats kind is not enabled
         */
        bool used;
};

/*
 * Collect offload statistics of kind @type for @dev.
 *
 * @ru:    optional; receives whether the statistics are enabled ("request")
 *         and the "used" value reported by netdev_offload_xstats_get()
 * @stats: passed through to netdev_offload_xstats_get(); callers in this
 *         file pass NULL when only the request/used status is wanted
 *
 * When the statistics are not enabled nothing is queried and "used" is
 * reported as false.
 *
 * Returns 0 on success or the error from netdev_offload_xstats_get().
 */
static int
rtnl_offload_xstats_get_stats(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_offload_xstats_request_used *ru,
                              struct rtnl_hw_stats64 *stats,
                              struct netlink_ext_ack *extack)
{
        bool enabled = netdev_offload_xstats_enabled(dev, type);
        bool used;

        if (enabled) {
                int err;

                err = netdev_offload_xstats_get(dev, type, stats, &used,
                                                extack);
                if (err)
                        return err;
        } else {
                used = false;
        }

        if (ru) {
                ru->request = enabled;
                ru->used = used;
        }
        return 0;
}

/*
 * Emit one nest of type @attr_id carrying the request/used status in @ru.
 *
 * Returns 0 on success or -EMSGSIZE when @skb runs out of room, in which
 * case the partially written nest is removed again.
 */
static int
rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id,
                                       struct rtnl_offload_xstats_request_used *ru)
{
        struct nlattr *nest = nla_nest_start(skb, attr_id);

        if (!nest)
                return -EMSGSIZE;

        if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST,
                       ru->request) ||
            nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used)) {
                nla_nest_cancel(skb, nest);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Emit the IFLA_OFFLOAD_XSTATS_HW_S_INFO nest for @dev, which currently
 * holds the request/used status of the L3 statistics only.
 *
 * Returns 0 on success, the error from querying the status, or -EMSGSIZE
 * when @skb runs out of room (the nest is then removed again).
 */
static int
rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev,
                                   struct netlink_ext_ack *extack)
{
        struct rtnl_offload_xstats_request_used ru_l3;
        struct nlattr *nest;
        int err;

        /* A NULL stats pointer is passed: only the status pair is needed. */
        err = rtnl_offload_xstats_get_stats(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3,
                                            &ru_l3, NULL, extack);
        if (err)
                return err;

        nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO);
        if (!nest)
                return -EMSGSIZE;

        err = rtnl_offload_xstats_fill_hw_s_info_one(skb,
                                                     IFLA_OFFLOAD_XSTATS_L3_STATS,
                                                     &ru_l3);
        if (err) {
                nla_nest_cancel(skb, nest);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Fill the contents of the offload xstats nest for @dev.
 *
 * @prividx:         in/out cursor. On entry, attributes with an id below
 *                   *prividx are skipped. On a failure it is left at the id
 *                   of the attribute being processed; it is reset to 0 once
 *                   an attribute has been dealt with and on success.
 * @off_filter_mask: IFLA_STATS_FILTER_BIT() selection of the attributes to
 *                   include
 *
 * Returns 0 when at least one attribute was emitted, -ENODATA when none
 * was, or a negative error from the failing step.
 */
static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev,
                                    int *prividx, u32 off_filter_mask,
                                    struct netlink_ext_ack *extack)
{
        bool have_data = false;
        int err;

        /* CPU hit statistics, supplied by the driver's ndo. */
        if (*prividx <= IFLA_OFFLOAD_XSTATS_CPU_HIT &&
            (off_filter_mask &
             IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_CPU_HIT))) {
                err = rtnl_offload_xstats_fill_ndo(dev,
                                                   IFLA_OFFLOAD_XSTATS_CPU_HIT,
                                                   skb);
                if (err && err != -ENODATA) {
                        *prividx = IFLA_OFFLOAD_XSTATS_CPU_HIT;
                        return err;
                }
                if (!err)
                        have_data = true;
        }

        /* Request/used status of the HW statistics. */
        if (*prividx <= IFLA_OFFLOAD_XSTATS_HW_S_INFO &&
            (off_filter_mask &
             IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))) {
                *prividx = IFLA_OFFLOAD_XSTATS_HW_S_INFO;

                err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack);
                if (err)
                        return err;

                have_data = true;
                *prividx = 0;
        }

        /* L3 HW statistics; silently skipped when not enabled on @dev. */
        if (*prividx <= IFLA_OFFLOAD_XSTATS_L3_STATS &&
            (off_filter_mask &
             IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS))) {
                unsigned int size_l3;

                *prividx = IFLA_OFFLOAD_XSTATS_L3_STATS;

                size_l3 = rtnl_offload_xstats_get_size_stats(dev,
                                        NETDEV_OFFLOAD_XSTATS_TYPE_L3);
                if (size_l3) {
                        struct nlattr *attr;

                        attr = nla_reserve_64bit(skb,
                                                 IFLA_OFFLOAD_XSTATS_L3_STATS,
                                                 size_l3,
                                                 IFLA_OFFLOAD_XSTATS_UNSPEC);
                        if (!attr)
                                return -EMSGSIZE;

                        err = rtnl_offload_xstats_get_stats(dev,
                                        NETDEV_OFFLOAD_XSTATS_TYPE_L3, NULL,
                                        nla_data(attr), extack);
                        if (err)
                                return err;

                        have_data = true;
                }

                *prividx = 0;
        }

        if (!have_data)
                return -ENODATA;

        *prividx = 0;
        return 0;
}

/*
 * Space needed for one per-kind status nest inside
 * IFLA_OFFLOAD_XSTATS_HW_S_INFO. The result does not depend on @dev or
 * @type.
 */
static unsigned int
rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
                                           enum netdev_offload_xstats_type type)
{
        /* the nest header itself */
        unsigned int size = nla_total_size(0);

        /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
        size += nla_total_size(sizeof(u8));
        /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
        size += nla_total_size(sizeof(u8));

        return size;
}

/*
 * Space needed for the IFLA_OFFLOAD_XSTATS_HW_S_INFO nest of @dev: the nest
 * header plus the status nest of the L3 statistics.
 */
static unsigned int
rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev)
{
        /* the nest header itself */
        unsigned int size = nla_total_size(0);

        /* IFLA_OFFLOAD_XSTATS_L3_STATS */
        size += rtnl_offload_xstats_get_size_hw_s_info_one(dev,
                                        NETDEV_OFFLOAD_XSTATS_TYPE_L3);

        return size;
}

/*
 * Space needed for the offload xstats nest of @dev, given the attributes
 * selected in @off_filter_mask.
 *
 * Returns 0 when nothing is selected; otherwise the sum of the selected
 * attributes plus the header of the enclosing nest.
 */
static int rtnl_offload_xstats_get_size(const struct net_device *dev,
                                        u32 off_filter_mask)
{
        int total = 0;
        int payload;

        if (off_filter_mask &
            IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_CPU_HIT)) {
                payload = rtnl_offload_xstats_get_size_ndo(dev,
                                                IFLA_OFFLOAD_XSTATS_CPU_HIT);
                /* Counted even when the payload size is 0. */
                total += nla_total_size_64bit(payload);
        }

        if (off_filter_mask &
            IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))
                total += rtnl_offload_xstats_get_size_hw_s_info(dev);

        if (off_filter_mask &
            IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) {
                payload = rtnl_offload_xstats_get_size_stats(dev,
                                                NETDEV_OFFLOAD_XSTATS_TYPE_L3);
                /* Counted even when the payload size is 0. */
                total += nla_total_size_64bit(payload);
        }

        if (total == 0)
                return 0;

        /* header of the nest that wraps the attributes above */
        return total + nla_total_size(0);
}

/* Attribute selection for a stats get/dump request. */
struct rtnl_stats_dump_filters {
        /* mask[0] filters outer attributes. Then individual nests have their
         * filtering mask at the index of the nested attribute.
         * Each mask is a set of IFLA_STATS_FILTER_BIT() bits; the array is
         * sized so that every attribute id up to IFLA_STATS_MAX has a slot.
         */
        u32 mask[IFLA_STATS_MAX + 1];
};

/*
 * Build one stats message for @dev in @skb.
 *
 * The message consists of an if_stats_msg header followed by the attribute
 * groups selected in filters->mask[0].
 *
 * @type, @pid, @seq, @flags: netlink message type, port id, sequence number
 *            and flags passed to nlmsg_put()
 * @change:   not used by this function
 * @filters:  attribute selection; mask[0] selects the top-level attributes,
 *            mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] selects the offload xstats
 * @idxattr:  in/out cursor naming the top-level attribute being filled. When
 *            non-zero on entry, only that attribute passes
 *            stats_attr_valid(). It is reset to 0 each time an attribute
 *            completes and is left set when that attribute fails.
 * @prividx:  in/out cursor handed to the per-attribute fill callbacks
 *
 * Must be called with RTNL held.
 *
 * Returns 0 on success or a negative error. On failure the partial message
 * is kept (nlmsg_end) only when NLM_F_MULTI is set in @flags and *prividx
 * changed during this call; otherwise the message is cancelled.
 */
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
                               int type, u32 pid, u32 seq, u32 change,
                               unsigned int flags,
                               const struct rtnl_stats_dump_filters *filters,
                               int *idxattr, int *prividx,
                               struct netlink_ext_ack *extack)
{
        unsigned int filter_mask = filters->mask[0];
        struct if_stats_msg *ifsm;
        struct nlmsghdr *nlh;
        struct nlattr *attr;
        /* snapshot used on failure to tell whether any progress was made */
        int s_prividx = *prividx;
        int err;

        ASSERT_RTNL();

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags);
        if (!nlh)
                return -EMSGSIZE;

        ifsm = nlmsg_data(nlh);
        ifsm->family = PF_UNSPEC;
        ifsm->pad1 = 0;
        ifsm->pad2 = 0;
        ifsm->ifindex = dev->ifindex;
        ifsm->filter_mask = filter_mask;

        /* Basic 64-bit link counters, written straight into the attribute. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) {
                struct rtnl_link_stats64 *sp;

                attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
                                         sizeof(struct rtnl_link_stats64),
                                         IFLA_STATS_UNSPEC);
                if (!attr) {
                        err = -EMSGSIZE;
                        goto nla_put_failure;
                }

                sp = nla_data(attr);
                dev_get_stats(dev, sp);
        }

        /* Extended stats provided by the device's own link ops. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) {
                const struct rtnl_link_ops *ops = dev->rtnl_link_ops;

                if (ops && ops->fill_linkxstats) {
                        *idxattr = IFLA_STATS_LINK_XSTATS;
                        attr = nla_nest_start_noflag(skb,
                                                     IFLA_STATS_LINK_XSTATS);
                        if (!attr) {
                                err = -EMSGSIZE;
                                goto nla_put_failure;
                        }

                        err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
                        /* The nest is closed before err is checked; the
                         * failure path may keep the partial message.
                         */
                        nla_nest_end(skb, attr);
                        if (err)
                                goto nla_put_failure;
                        *idxattr = 0;
                }
        }

        /* Extended stats for @dev provided by its master's link ops. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE,
                             *idxattr)) {
                const struct rtnl_link_ops *ops = NULL;
                const struct net_device *master;

                master = netdev_master_upper_dev_get(dev);
                if (master)
                        ops = master->rtnl_link_ops;
                if (ops && ops->fill_linkxstats) {
                        *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
                        attr = nla_nest_start_noflag(skb,
                                                     IFLA_STATS_LINK_XSTATS_SLAVE);
                        if (!attr) {
                                err = -EMSGSIZE;
                                goto nla_put_failure;
                        }

                        err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
                        /* Closed before the error check, as above. */
                        nla_nest_end(skb, attr);
                        if (err)
                                goto nla_put_failure;
                        *idxattr = 0;
                }
        }

        /* Offload statistics, with their own nested filter mask. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
                             *idxattr)) {
                u32 off_filter_mask;

                off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
                *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
                attr = nla_nest_start_noflag(skb,
                                             IFLA_STATS_LINK_OFFLOAD_XSTATS);
                if (!attr) {
                        err = -EMSGSIZE;
                        goto nla_put_failure;
                }

                err = rtnl_offload_xstats_fill(skb, dev, prividx,
                                               off_filter_mask, extack);
                /* -ENODATA means nothing was emitted: drop the empty nest
                 * and carry on; any other error aborts the message.
                 */
                if (err == -ENODATA)
                        nla_nest_cancel(skb, attr);
                else
                        nla_nest_end(skb, attr);

                if (err && err != -ENODATA)
                        goto nla_put_failure;
                *idxattr = 0;
        }

        /* Per address family statistics, one nest per registered af_ops. */
        if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
                struct rtnl_af_ops *af_ops;

                *idxattr = IFLA_STATS_AF_SPEC;
                attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC);
                if (!attr) {
                        err = -EMSGSIZE;
                        goto nla_put_failure;
                }

                rcu_read_lock();
                list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
                        if (af_ops->fill_stats_af) {
                                struct nlattr *af;

                                af = nla_nest_start_noflag(skb,
                                                           af_ops->family);
                                if (!af) {
                                        rcu_read_unlock();
                                        err = -EMSGSIZE;
                                        goto nla_put_failure;
                                }
                                err = af_ops->fill_stats_af(skb, dev);

                                if (err == -ENODATA) {
                                        nla_nest_cancel(skb, af);
                                } else if (err < 0) {
                                        rcu_read_unlock();
                                        goto nla_put_failure;
                                }

                                /* NOTE(review): this is also reached after
                                 * nla_nest_cancel() above in the -ENODATA
                                 * case - confirm that ending a cancelled
                                 * nest is harmless.
                                 */
                                nla_nest_end(skb, af);
                        }
                }
                rcu_read_unlock();

                nla_nest_end(skb, attr);

                *idxattr = 0;
        }

        nlmsg_end(skb, nlh);

        return 0;

nla_put_failure:
        /* not a multi message or no progress mean a real error */
        if (!(flags & NLM_F_MULTI) || s_prividx == *prividx)
                nlmsg_cancel(skb, nlh);
        else
                nlmsg_end(skb, nlh);

        return err;
}

/*
 * Compute the buffer size needed for a stats message about @dev carrying
 * the attribute groups selected in @filters.
 *
 * The aligned if_stats_msg header is always counted; each selected group
 * adds its own payload plus, for nested groups, the nest header.
 */
static size_t if_nlmsg_stats_size(const struct net_device *dev,
                                  const struct rtnl_stats_dump_filters *filters)
{
        const unsigned int wanted = filters->mask[0];
        size_t total = NLMSG_ALIGN(sizeof(struct if_stats_msg));

        if (stats_attr_valid(wanted, IFLA_STATS_LINK_64, 0))
                total += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));

        if (stats_attr_valid(wanted, IFLA_STATS_LINK_XSTATS, 0)) {
                const struct rtnl_link_ops *ops = dev->rtnl_link_ops;

                if (ops && ops->get_linkxstats_size) {
                        total += nla_total_size(
                                ops->get_linkxstats_size(dev,
                                                IFLA_STATS_LINK_XSTATS));
                        /* for IFLA_STATS_LINK_XSTATS */
                        total += nla_total_size(0);
                }
        }

        if (stats_attr_valid(wanted, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) {
                const struct rtnl_link_ops *ops;
                const struct net_device *master;

                /* netdev_master_upper_dev_get can't take const */
                master = netdev_master_upper_dev_get((struct net_device *)dev);
                ops = master ? master->rtnl_link_ops : NULL;
                if (ops && ops->get_linkxstats_size) {
                        total += nla_total_size(
                                ops->get_linkxstats_size(dev,
                                                IFLA_STATS_LINK_XSTATS_SLAVE));
                        /* for IFLA_STATS_LINK_XSTATS_SLAVE */
                        total += nla_total_size(0);
                }
        }

        if (stats_attr_valid(wanted, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
                total += rtnl_offload_xstats_get_size(dev,
                                filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS]);

        if (stats_attr_valid(wanted, IFLA_STATS_AF_SPEC, 0)) {
                struct rtnl_af_ops *af_ops;

                /* for IFLA_STATS_AF_SPEC */
                total += nla_total_size(0);

                rcu_read_lock();
                list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
                        if (!af_ops->get_stats_af_size)
                                continue;

                        total += nla_total_size(
                                af_ops->get_stats_af_size(dev));

                        /* for AF_* */
                        total += nla_total_size(0);
                }
                rcu_read_unlock();
        }

        return total;
}

/* Bits accepted in the nested IFLA_STATS_LINK_OFFLOAD_XSTATS filter mask.
 * NOTE(review): if IFLA_STATS_FILTER_BIT(attr) is 1 << (attr - 1), this mask
 * is one bit wider than the defined attributes - confirm this is intended.
 */
#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1)

/* Policy for the attributes nested inside IFLA_STATS_GET_FILTERS. Only
 * IFLA_STATS_LINK_OFFLOAD_XSTATS has an explicit entry: a u32 mask limited
 * to the bits in RTNL_STATS_OFFLOAD_XSTATS_VALID.
 */
static const struct nla_policy
rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = {
        [IFLA_STATS_LINK_OFFLOAD_XSTATS] =
                    NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID),
};

/* Top-level policy for stats get requests: IFLA_STATS_GET_FILTERS is a nest
 * validated against rtnl_stats_get_policy_filters.
 */
static const struct nla_policy
rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = {
        [IFLA_STATS_GET_FILTERS] =
                    NLA_POLICY_NESTED(rtnl_stats_get_policy_filters),
};

/* Policy with a single entry: IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS is a
 * u8 with a maximum value of 1. Presumably used by the stats set handler,
 * which is outside this part of the file - verify against its caller.
 */
static const struct nla_policy
ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = {
        [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1),
};

/*
 * Parse the nested per-attribute filter masks in @ifla_filters into
 * @filters.
 *
 * A nested mask is accepted only for an attribute that is itself enabled in
 * filters->mask[0]; its value is then stored at the attribute's index.
 *
 * Returns 0 on success, the parser's error, or -EINVAL when a mask is given
 * for an attribute that is not enabled.
 */
static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters,
                                        struct rtnl_stats_dump_filters *filters,
                                        struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_STATS_MAX + 1];
        int attr_id;
        int err;

        err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters,
                               rtnl_stats_get_policy_filters, extack);
        if (err < 0)
                return err;

        for (attr_id = 1; attr_id <= IFLA_STATS_MAX; attr_id++) {
                if (!tb[attr_id])
                        continue;

                if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(attr_id))) {
                        NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask");
                        return -EINVAL;
                }

                filters->mask[attr_id] = nla_get_u32(tb[attr_id]);
        }

        return 0;
}

/*
 * Initialise @filters from a stats request.
 *
 * mask[0] is set to @filter_mask and every nested mask starts with all bits
 * set. An IFLA_STATS_GET_FILTERS attribute in the request, if present, then
 * overrides the nested masks it names.
 *
 * Returns 0 on success or a negative error from attribute parsing.
 */
static int rtnl_stats_get_parse(const struct nlmsghdr *nlh,
                                u32 filter_mask,
                                struct rtnl_stats_dump_filters *filters,
                                struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
        unsigned int slot;
        int err;

        filters->mask[0] = filter_mask;
        /* Without an explicit nested filter, nothing is filtered out. */
        for (slot = 1; slot < ARRAY_SIZE(filters->mask); slot++)
                filters->mask[slot] = ~0U;

        err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb,
                          IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack);
        if (err < 0)
                return err;

        if (!tb[IFLA_STATS_GET_FILTERS])
                return 0;

        return rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS],
                                            filters, extack);
}

/* Validate the if_stats_msg header of a stats request.
 *
 * The message must always be large enough to hold the header.  The
 * remaining checks apply only when @strict_check is set: the padding fields
 * must be zero, a dump (@is_dump) must not carry an ifindex, and filter_mask
 * must not have bits at or above IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1).
 *
 * Returns 0 if the header is acceptable, -EINVAL otherwise.
 */
static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
                                bool is_dump, struct netlink_ext_ack *extack)
{
        const struct if_stats_msg *hdr;
        bool bad_padding, bad_ifindex;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct if_stats_msg))) {
                NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
                return -EINVAL;
        }

        if (strict_check) {
                hdr = nlmsg_data(nlh);

                /* Only strictly checked requests may pass data that
                 * influences the dump; filter_mask is the legacy exception.
                 */
                bad_padding = hdr->pad1 || hdr->pad2;
                bad_ifindex = is_dump && hdr->ifindex;
                if (bad_padding || bad_ifindex) {
                        NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
                        return -EINVAL;
                }

                if (hdr->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) {
                        NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask");
                        return -EINVAL;
                }
        }

        return 0;
}

/* RTM_GETSTATS doit handler: report statistics for a single device.
 *
 * The device is selected by the header's ifindex (must be positive) and the
 * header's filter_mask must be non-zero.  The reply is built in a freshly
 * allocated skb and unicast back to the requesting port.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -ENODEV, -ENOBUFS, or
 * whatever validation, parsing, filling or rtnl_unicast() reports).
 */
static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct rtnl_stats_dump_filters filters;
        int prividx = 0, idxattr = 0;
        struct if_stats_msg *hdr;
        struct net_device *dev;
        struct sk_buff *reply;
        int ret;

        ret = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
                                   false, extack);
        if (ret)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex <= 0)
                return -EINVAL;

        dev = __dev_get_by_index(net, hdr->ifindex);
        if (!dev)
                return -ENODEV;

        if (!hdr->filter_mask) {
                NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
                return -EINVAL;
        }

        ret = rtnl_stats_get_parse(nlh, hdr->filter_mask, &filters, extack);
        if (ret)
                return ret;

        reply = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL);
        if (!reply)
                return -ENOBUFS;

        ret = rtnl_fill_statsinfo(reply, dev, RTM_NEWSTATS,
                                  NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
                                  0, &filters, &idxattr, &prividx, extack);
        if (ret >= 0)
                return rtnl_unicast(reply, net, NETLINK_CB(skb).portid);

        /* -EMSGSIZE here means if_nlmsg_stats_size() under-estimated. */
        WARN_ON(ret == -EMSGSIZE);
        kfree_skb(reply);

        return ret;
}

/* RTM_GETSTATS dumpit handler: emit statistics for the devices of the
 * socket's namespace into @skb.
 *
 * Progress is kept in cb->ctx: the ifindex cursor used by
 * for_each_netdev_dump() plus the idxattr/prividx pair that
 * rtnl_fill_statsinfo() updates.  Both indices are reset to 0 once a device
 * has been filled without error.
 *
 * Returns a negative errno on validation/parse failure, otherwise the result
 * of the last rtnl_fill_statsinfo() call (or of parsing, if no device was
 * visited).
 */
static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct {
                unsigned long ifindex;
                int idxattr;
                int prividx;
        } *state = (void *)cb->ctx;
        struct netlink_ext_ack *extack = cb->extack;
        struct net *net = sock_net(skb->sk);
        struct rtnl_stats_dump_filters filters;
        struct if_stats_msg *hdr;
        struct net_device *dev;
        int ret;

        cb->seq = net->dev_base_seq;

        ret = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack);
        if (ret)
                return ret;

        hdr = nlmsg_data(cb->nlh);
        if (!hdr->filter_mask) {
                NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
                return -EINVAL;
        }

        ret = rtnl_stats_get_parse(cb->nlh, hdr->filter_mask, &filters,
                                   extack);
        if (ret)
                return ret;

        for_each_netdev_dump(net, dev, state->ifindex) {
                ret = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
                                          NETLINK_CB(cb->skb).portid,
                                          cb->nlh->nlmsg_seq, 0,
                                          NLM_F_MULTI, &filters,
                                          &state->idxattr, &state->prividx,
                                          extack);
                if (ret < 0) {
                        /* Running out of room on the very first message
                         * means we are in trouble.
                         */
                        WARN_ON(ret == -EMSGSIZE && skb->len == 0);
                        break;
                }

                state->idxattr = 0;
                state->prividx = 0;
                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        }

        return ret;
}

void rtnl_offload_xstats_notify(struct net_device *dev)
{
        struct rtnl_stats_dump_filters response_filters = {};
        struct net *net = dev_net(dev);
        int idxattr = 0, prividx = 0;
        struct sk_buff *skb;
        int err = -ENOBUFS;

        ASSERT_RTNL();

        response_filters.mask[0] |=
                IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
        response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
                IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);

        skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters),
                        GFP_KERNEL);
        if (!skb)
                goto errout;

        err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0,
                                  &response_filters, &idxattr, &prividx, NULL);
        if (err < 0) {
                kfree_skb(skb);
                goto errout;
        }

        rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL);
        return;

errout:
        rtnl_set_sk_err(net, RTNLGRP_STATS, err);
}
EXPORT_SYMBOL(rtnl_offload_xstats_notify);

static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
        struct rtnl_stats_dump_filters response_filters = {};
        struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
        struct net *net = sock_net(skb->sk);
        struct net_device *dev = NULL;
        struct if_stats_msg *ifsm;
        bool notify = false;
        int err;

        err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
                                   false, extack);
        if (err)
                return err;

        ifsm = nlmsg_data(nlh);
        if (ifsm->family != AF_UNSPEC) {
                NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC");
                return -EINVAL;
        }

        if (ifsm->ifindex > 0)
                dev = __dev_get_by_index(net, ifsm->ifindex);
        else
                return -EINVAL;

        if (!dev)
                return -ENODEV;

        if (ifsm->filter_mask) {
                NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set");
                return -EINVAL;
        }

        err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX,
                          ifla_stats_set_policy, extack);
        if (err < 0)
                return err;

        if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) {
                u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]);

                if (req)
                        err = netdev_offload_xstats_enable(dev, t_l3, extack);
                else
                        err = netdev_offload_xstats_disable(dev, t_l3);

                if (!err)
                        notify = true;
                else if (err != -EALREADY)
                        return err;

                response_filters.mask[0] |=
                        IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
                response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
                        IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
        }

        if (notify)
                rtnl_offload_xstats_notify(dev);

        return 0;
}

static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
                                   struct netlink_ext_ack *extack)
{
        struct br_port_msg *bpm;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*bpm))) {
                NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
                return -EINVAL;
        }

        bpm = nlmsg_data(nlh);
        if (bpm->ifindex) {
                NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
                return -EINVAL;
        }
        if (nlmsg_attrlen(nlh, sizeof(*bpm))) {
                NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request");
                return -EINVAL;
        }

        return 0;
}

/* Dump state that rtnl_mdb_dump() overlays on netlink_callback::ctx. */
struct rtnl_mdb_dump_ctx {
        /* Position in the netdev walk at which the next dump call resumes. */
        long idx;
};

/* RTM_GETMDB dump callback: walk the netdevs of the socket's namespace and
 * let every device that implements ndo_mdb_dump add its entries to @skb.
 *
 * ctx->idx (overlaid on cb->ctx) holds the walk position saved by the
 * previous invocation, so devices before it are skipped on continuation.
 * Only -EMSGSIZE from a device stops the walk; any other value returned by
 * ndo_mdb_dump is not propagated and the walk proceeds to the next device.
 *
 * Returns the validation error for a rejected strict-check request,
 * otherwise skb->len.
 */
static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;
        int idx, s_idx;
        int err;

        NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx);

        /* The request header is only validated for strict-check sockets. */
        if (cb->strict_check) {
                err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
                if (err)
                        return err;
        }

        s_idx = ctx->idx;
        idx = 0;

        for_each_netdev(net, dev) {
                /* Skip devices that lie before the saved resume position. */
                if (idx < s_idx)
                        goto skip;
                if (!dev->netdev_ops->ndo_mdb_dump)
                        goto skip;

                err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb);
                /* Leave without advancing idx so this device is visited
                 * again on the next call.
                 */
                if (err == -EMSGSIZE)
                        goto out;
                /* Moving on to next device, reset markers and sequence
                 * counters since they are all maintained per-device.
                 * Note that ctx aliases cb->ctx, so this also clears
                 * ctx->idx; it is rewritten at "out" below.
                 */
                memset(cb->ctx, 0, sizeof(cb->ctx));
                cb->prev_seq = 0;
                cb->seq = 0;
skip:
                idx++;
        }

out:
        ctx->idx = idx;
        return skb->len;
}

static int rtnl_validate_mdb_entry_get(const struct nlattr *attr,
                                       struct netlink_ext_ack *extack)
{
        struct br_mdb_entry *entry = nla_data(attr);

        if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
                return -EINVAL;
        }

        if (entry->ifindex) {
                NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified");
                return -EINVAL;
        }

        if (entry->state) {
                NL_SET_ERR_MSG(extack, "Entry state cannot be specified");
                return -EINVAL;
        }

        if (entry->flags) {
                NL_SET_ERR_MSG(extack, "Entry flags cannot be specified");
                return -EINVAL;
        }

        if (entry->vid >= VLAN_VID_MASK) {
                NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
                return -EINVAL;
        }

        if (entry->addr.proto != htons(ETH_P_IP) &&
            entry->addr.proto != htons(ETH_P_IPV6) &&
            entry->addr.proto != 0) {
                NL_SET_ERR_MSG(extack, "Unknown entry protocol");
                return -EINVAL;
        }

        return 0;
}

/* Attribute policy used by rtnl_mdb_get() when parsing an MDB get request.
 * MDBA_GET_ENTRY is a binary attribute checked by
 * rtnl_validate_mdb_entry_get(), with sizeof(struct br_mdb_entry) passed as
 * the policy's length argument; MDBA_GET_ENTRY_ATTRS is a nested attribute.
 */
static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = {
        [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                  rtnl_validate_mdb_entry_get,
                                                  sizeof(struct br_mdb_entry)),
        [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};

/* RTM_GETMDB doit handler: look up one MDB entry on a device.
 *
 * After parsing against mdba_get_policy, the request must name an existing
 * device via a non-zero ifindex and include MDBA_GET_ENTRY; the actual
 * lookup is delegated to the device's ndo_mdb_get.
 *
 * Returns the ndo_mdb_get() result, or a negative errno (-EINVAL, -ENODEV,
 * -EOPNOTSUPP or the parse error) if the request is rejected earlier.
 */
static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *attrs[MDBA_GET_ENTRY_MAX + 1];
        struct net_device *dev;
        struct br_port_msg *hdr;
        int ret;

        ret = nlmsg_parse(nlh, sizeof(*hdr), attrs, MDBA_GET_ENTRY_MAX,
                          mdba_get_policy, extack);
        if (ret)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, hdr->ifindex);
        if (dev == NULL) {
                NL_SET_ERR_MSG(extack, "Device doesn't exist");
                return -ENODEV;
        }

        if (NL_REQ_ATTR_CHECK(extack, NULL, attrs, MDBA_GET_ENTRY)) {
                NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute");
                return -EINVAL;
        }

        if (dev->netdev_ops->ndo_mdb_get == NULL) {
                NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
                return -EOPNOTSUPP;
        }

        return dev->netdev_ops->ndo_mdb_get(dev, attrs,
                                            NETLINK_CB(in_skb).portid,
                                            nlh->nlmsg_seq, extack);
}

/* Policy validator for the MDBA_SET_ENTRY attribute (see mdba_policy).
 *
 * The payload must be exactly one struct br_mdb_entry with a non-zero
 * ifindex, a group address that is valid for its protocol (IPv4, IPv6 when
 * CONFIG_IPV6 is enabled, or L2 when proto is 0), a state of MDB_PERMANENT
 * or MDB_TEMPORARY, and a vid below VLAN_VID_MASK.
 *
 * Returns 0 if the entry is acceptable, -EINVAL otherwise.
 */
static int rtnl_validate_mdb_entry(const struct nlattr *attr,
                                   struct netlink_ext_ack *extack)
{
        struct br_mdb_entry *entry = nla_data(attr);

        /* Fields of *entry are only read once the length is confirmed. */
        if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
                return -EINVAL;
        }

        if (entry->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed");
                return -EINVAL;
        }

        if (entry->addr.proto == htons(ETH_P_IP)) {
                /* IPv4: must be multicast or 0.0.0.0, and not local multicast. */
                if (!ipv4_is_multicast(entry->addr.u.ip4) &&
                    !ipv4_is_zeronet(entry->addr.u.ip4)) {
                        NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0");
                        return -EINVAL;
                }
                if (ipv4_is_local_multicast(entry->addr.u.ip4)) {
                        NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast");
                        return -EINVAL;
                }
#if IS_ENABLED(CONFIG_IPV6)
        /* Without CONFIG_IPV6 this branch is compiled out, so an IPv6 entry
         * falls through to the "Unknown entry protocol" case below.
         */
        } else if (entry->addr.proto == htons(ETH_P_IPV6)) {
                if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) {
                        NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes");
                        return -EINVAL;
                }
#endif
        } else if (entry->addr.proto == 0) {
                /* L2 mdb */
                if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) {
                        NL_SET_ERR_MSG(extack, "L2 entry group is not multicast");
                        return -EINVAL;
                }
        } else {
                NL_SET_ERR_MSG(extack, "Unknown entry protocol");
                return -EINVAL;
        }

        if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
                NL_SET_ERR_MSG(extack, "Unknown entry state");
                return -EINVAL;
        }
        if (entry->vid >= VLAN_VID_MASK) {
                NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
                return -EINVAL;
        }

        return 0;
}

/* Attribute policy used by rtnl_mdb_add() and by non-bulk rtnl_mdb_del().
 * MDBA_SET_ENTRY is a binary attribute checked by rtnl_validate_mdb_entry();
 * MDBA_SET_ENTRY_ATTRS is nested.  The UNSPEC slot carries
 * strict_start_type, set to the first attribute type after
 * MDBA_SET_ENTRY_ATTRS.
 */
static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = {
        [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 },
        [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                  rtnl_validate_mdb_entry,
                                                  sizeof(struct br_mdb_entry)),
        [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};

/* RTM_NEWMDB doit handler: add an MDB entry to a device.
 *
 * After parsing against mdba_policy, the request must name an existing
 * device via a non-zero ifindex and include MDBA_SET_ENTRY; the work is
 * delegated to the device's ndo_mdb_add together with the message flags.
 *
 * Returns the ndo_mdb_add() result, or a negative errno (-EINVAL, -ENODEV,
 * -EOPNOTSUPP or the parse error) if the request is rejected earlier.
 */
static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *attrs[MDBA_SET_ENTRY_MAX + 1];
        struct net_device *dev;
        struct br_port_msg *hdr;
        int ret;

        ret = nlmsg_parse_deprecated(nlh, sizeof(struct br_port_msg), attrs,
                                     MDBA_SET_ENTRY_MAX, mdba_policy, extack);
        if (ret)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, hdr->ifindex);
        if (dev == NULL) {
                NL_SET_ERR_MSG(extack, "Device doesn't exist");
                return -ENODEV;
        }

        if (NL_REQ_ATTR_CHECK(extack, NULL, attrs, MDBA_SET_ENTRY)) {
                NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
                return -EINVAL;
        }

        if (dev->netdev_ops->ndo_mdb_add == NULL) {
                NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
                return -EOPNOTSUPP;
        }

        return dev->netdev_ops->ndo_mdb_add(dev, attrs, nlh->nlmsg_flags,
                                            extack);
}

static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr,
                                            struct netlink_ext_ack *extack)
{
        struct br_mdb_entry *entry = nla_data(attr);
        struct br_mdb_entry zero_entry = {};

        if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
                NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
                return -EINVAL;
        }

        if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
                NL_SET_ERR_MSG(extack, "Unknown entry state");
                return -EINVAL;
        }

        if (entry->flags) {
                NL_SET_ERR_MSG(extack, "Entry flags cannot be set");
                return -EINVAL;
        }

        if (entry->vid >= VLAN_N_VID - 1) {
                NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
                return -EINVAL;
        }

        if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) {
                NL_SET_ERR_MSG(extack, "Entry address cannot be set");
                return -EINVAL;
        }

        return 0;
}

/* Attribute policy used by rtnl_mdb_del() when the request carries
 * NLM_F_BULK.  MDBA_SET_ENTRY is a binary attribute checked by
 * rtnl_validate_mdb_entry_del_bulk(); MDBA_SET_ENTRY_ATTRS is nested.
 */
static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = {
        [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
                                                  rtnl_validate_mdb_entry_del_bulk,
                                                  sizeof(struct br_mdb_entry)),
        [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
};

/* RTM_DELMDB doit handler: delete one MDB entry, or many when NLM_F_BULK is
 * set in the message flags.
 *
 * Bulk requests are parsed with mdba_del_bulk_policy and dispatched to
 * ndo_mdb_del_bulk; all others are parsed with mdba_policy and dispatched to
 * ndo_mdb_del.  In both cases the request must name an existing device via a
 * non-zero ifindex and include MDBA_SET_ENTRY.
 *
 * Returns the device callback's result, or a negative errno (-EINVAL,
 * -ENODEV, -EOPNOTSUPP or the parse error) if rejected earlier.
 */
static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        const bool bulk = (nlh->nlmsg_flags & NLM_F_BULK) != 0;
        struct net *net = sock_net(skb->sk);
        struct nlattr *attrs[MDBA_SET_ENTRY_MAX + 1];
        struct net_device *dev;
        struct br_port_msg *hdr;
        int ret;

        if (bulk)
                ret = nlmsg_parse(nlh, sizeof(*hdr), attrs, MDBA_SET_ENTRY_MAX,
                                  mdba_del_bulk_policy, extack);
        else
                ret = nlmsg_parse_deprecated(nlh, sizeof(*hdr), attrs,
                                             MDBA_SET_ENTRY_MAX, mdba_policy,
                                             extack);
        if (ret)
                return ret;

        hdr = nlmsg_data(nlh);
        if (hdr->ifindex == 0) {
                NL_SET_ERR_MSG(extack, "Invalid ifindex");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, hdr->ifindex);
        if (dev == NULL) {
                NL_SET_ERR_MSG(extack, "Device doesn't exist");
                return -ENODEV;
        }

        if (NL_REQ_ATTR_CHECK(extack, NULL, attrs, MDBA_SET_ENTRY)) {
                NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
                return -EINVAL;
        }

        if (!bulk) {
                if (dev->netdev_ops->ndo_mdb_del == NULL) {
                        NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
                        return -EOPNOTSUPP;
                }

                return dev->netdev_ops->ndo_mdb_del(dev, attrs, extack);
        }

        if (dev->netdev_ops->ndo_mdb_del_bulk == NULL) {
                NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion");
                return -EOPNOTSUPP;
        }

        return dev->netdev_ops->ndo_mdb_del_bulk(dev, attrs, extack);
}

/* Process one rtnetlink message. */

static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED);
        rtnl_dumpit_func dumpit = cb->data;
        int err;

        /* Previous iteration have already finished, avoid calling->dumpit()
         * again, it may not expect to be called after it reached the end.
         */
        if (!dumpit)
                return 0;

        if (needs_lock)
                rtnl_lock();
        err = dumpit(skb, cb);
        if (needs_lock)
                rtnl_unlock();

        /* Old dump handlers used to send NLM_DONE as in a separate recvmsg().
         * Some applications which parse netlink manually depend on this.
         */
        if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) {
                if (err < 0 && err != -EMSGSIZE)
                        return err;
                if (!err)
                        cb->data = NULL;

                return skb->len;
        }
        return err;
}

static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                                const struct nlmsghdr *nlh,
                                struct netlink_dump_control *control)
{
        if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE ||
            !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) {
                WARN_ON(control->data);
                control->data = control->dump;
                control->dump = rtnl_dumpit;
        }

        return netlink_dump_start(ssk, skb, nlh, control);
}

/* Dispatch one rtnetlink message to its registered handler.
 *
 * The message type and the family byte from the rtgenmsg header select an
 * rtnl_link entry; if the family has no suitable handler the PF_UNSPEC one
 * is tried.  GET requests carrying NLM_F_DUMP are started as dumps through
 * rtnetlink_dump_start(); everything else goes to the entry's doit, under
 * rtnl_lock() unless the entry has RTNL_FLAG_DOIT_UNLOCKED.
 *
 * Returns:
 *   -EOPNOTSUPP      type above RTM_MAX, no handler found, or a bulk delete
 *                    sent to a handler without RTNL_FLAG_BULK_DEL_SUPPORTED
 *   0                message too short to contain a struct rtgenmsg
 *   -EPERM           non-GET request without CAP_NET_ADMIN
 *   -EPROTONOSUPPORT the handler's module reference could not be taken
 *   otherwise        whatever the handler / dump start returns
 */
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct rtnl_link *link;
        enum rtnl_kinds kind;
        struct module *owner;
        int err = -EOPNOTSUPP;
        rtnl_doit_func doit;
        unsigned int flags;
        int family;
        int type;

        type = nlh->nlmsg_type;
        if (type > RTM_MAX)
                return -EOPNOTSUPP;

        /* From here on, type is an offset relative to RTM_BASE. */
        type -= RTM_BASE;

        /* All the messages must have at least 1 byte length */
        if (nlmsg_len(nlh) < sizeof(struct rtgenmsg))
                return 0;

        family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
        kind = rtnl_msgtype_kind(type);

        /* Everything except GET requires CAP_NET_ADMIN. */
        if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        /* Handler entries are looked up under rcu_read_lock(). */
        rcu_read_lock();
        if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) {
                struct sock *rtnl;
                rtnl_dumpit_func dumpit;
                u32 min_dump_alloc = 0;

                /* Fall back to PF_UNSPEC if the family has no dumpit. */
                link = rtnl_get_link(family, type);
                if (!link || !link->dumpit) {
                        family = PF_UNSPEC;
                        link = rtnl_get_link(family, type);
                        if (!link || !link->dumpit)
                                goto err_unlock;
                }
                /* Copy what is needed out of link before leaving the RCU
                 * read-side section.
                 */
                owner = link->owner;
                dumpit = link->dumpit;
                flags = link->flags;

                if (type == RTM_GETLINK - RTM_BASE)
                        min_dump_alloc = rtnl_calcit(skb, nlh);

                err = 0;
                /* need to do this before rcu_read_unlock() */
                if (!try_module_get(owner))
                        err = -EPROTONOSUPPORT;

                rcu_read_unlock();

                rtnl = net->rtnl;
                if (err == 0) {
                        struct netlink_dump_control c = {
                                .dump                = dumpit,
                                .min_dump_alloc        = min_dump_alloc,
                                .module                = owner,
                                .flags                = flags,
                        };
                        err = rtnetlink_dump_start(rtnl, skb, nlh, &c);
                        /* netlink_dump_start() will keep a reference on
                         * module if dump is still in progress.
                         */
                        module_put(owner);
                }
                return err;
        }

        /* Non-dump path: fall back to PF_UNSPEC if the family has no doit.
         * family is updated too because the locked path below looks the
         * entry up again.
         */
        link = rtnl_get_link(family, type);
        if (!link || !link->doit) {
                family = PF_UNSPEC;
                link = rtnl_get_link(PF_UNSPEC, type);
                if (!link || !link->doit)
                        goto out_unlock;
        }

        owner = link->owner;
        if (!try_module_get(owner)) {
                err = -EPROTONOSUPPORT;
                goto out_unlock;
        }

        flags = link->flags;
        if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
            !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
                NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
                module_put(owner);
                goto err_unlock;
        }

        /* Handlers flagged as unlocked run without taking rtnl_lock(). */
        if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
                doit = link->doit;
                rcu_read_unlock();
                if (doit)
                        err = doit(skb, nlh, extack);
                module_put(owner);
                return err;
        }
        rcu_read_unlock();

        /* The entry is looked up again now that rtnl_lock() is held; if it
         * is gone, err keeps its initial -EOPNOTSUPP.
         */
        rtnl_lock();
        link = rtnl_get_link(family, type);
        if (link && link->doit)
                err = link->doit(skb, nlh, extack);
        rtnl_unlock();

        module_put(owner);

        return err;

/* Leave the RCU section and return the current err. */
out_unlock:
        rcu_read_unlock();
        return err;

/* Leave the RCU section and report -EOPNOTSUPP regardless of err. */
err_unlock:
        rcu_read_unlock();
        return -EOPNOTSUPP;
}

/* Input callback of the rtnetlink kernel socket: hand the skb to
 * netlink_rcv_skb() with rtnetlink_rcv_msg() as the per-message handler.
 */
static void rtnetlink_rcv(struct sk_buff *skb)
{
        netlink_rcv_skb(skb, rtnetlink_rcv_msg);
}

static int rtnetlink_bind(struct net *net, int group)
{
        switch (group) {
        case RTNLGRP_IPV4_MROUTE_R:
        case RTNLGRP_IPV6_MROUTE_R:
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                break;
        }
        return 0;
}

/* Netdevice notifier: for the events listed below, emit an RTM_NEWLINK
 * message for the affected device via rtmsg_ifinfo_event().  All other
 * events are ignored.  Always returns NOTIFY_DONE.
 */
static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_REBOOT:
        case NETDEV_CHANGEMTU:
        case NETDEV_CHANGEADDR:
        case NETDEV_CHANGENAME:
        case NETDEV_FEAT_CHANGE:
        case NETDEV_BONDING_FAILOVER:
        case NETDEV_POST_TYPE_CHANGE:
        case NETDEV_NOTIFY_PEERS:
        case NETDEV_CHANGEUPPER:
        case NETDEV_RESEND_IGMP:
        case NETDEV_CHANGEINFODATA:
        case NETDEV_CHANGELOWERSTATE:
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                /* Reported to userspace below. */
                break;
        default:
                return NOTIFY_DONE;
        }

        rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
                           GFP_KERNEL, NULL, 0, 0, NULL);

        return NOTIFY_DONE;
}

/* Netdevice notifier block; registered by rtnetlink_init() so that
 * rtnetlink_event() is called for netdevice events.
 */
static struct notifier_block rtnetlink_dev_notifier = {
        .notifier_call        = rtnetlink_event,
};


/* Per-namespace init: create the NETLINK_ROUTE kernel socket and store it in
 * net->rtnl.
 *
 * Returns 0 on success, -ENOMEM if the socket could not be created.
 */
static int __net_init rtnetlink_net_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .bind   = rtnetlink_bind,
                .flags  = NL_CFG_F_NONROOT_RECV,
                .groups = RTNLGRP_MAX,
                .input  = rtnetlink_rcv,
        };
        struct sock *rtnl_sk;

        rtnl_sk = netlink_kernel_create(net, NETLINK_ROUTE, &cfg);
        if (rtnl_sk == NULL)
                return -ENOMEM;

        net->rtnl = rtnl_sk;

        return 0;
}

/* Per-namespace exit: release the rtnetlink kernel socket and clear
 * net->rtnl.
 */
static void __net_exit rtnetlink_net_exit(struct net *net)
{
        struct sock *rtnl_sk = net->rtnl;

        netlink_kernel_release(rtnl_sk);
        net->rtnl = NULL;
}

/* Per-namespace hooks for the rtnetlink socket; registered by
 * rtnetlink_init() through register_pernet_subsys().
 */
static struct pernet_operations rtnetlink_net_ops = {
        .init = rtnetlink_net_init,
        .exit = rtnetlink_net_exit,
};

/* Message handlers that rtnetlink_init() registers with rtnl_register_many().
 *
 * Each entry binds a message type to a doit and/or dumpit callback plus
 * optional RTNL_FLAG_* flags.  Entries that do not set .protocol leave it
 * zero-initialized (presumably PF_UNSPEC - confirm against the struct
 * definition); the PF_BRIDGE entries cover the bridge link, FDB (neighbour)
 * and MDB operations.
 */
static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = {
        {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink,
         .flags = RTNL_FLAG_DOIT_PERNET},
        {.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
         .flags = RTNL_FLAG_DOIT_PERNET_WIP},
        {.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
         .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
        {.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
         .flags = RTNL_FLAG_DOIT_PERNET_WIP},
        {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
        {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all},
        {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all},
        {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get,
         .dumpit = rtnl_stats_dump},
        {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set},
        {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop},
        {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop},
        {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK,
         .dumpit = rtnl_bridge_getlink},
        {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK,
         .doit = rtnl_bridge_dellink},
        {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK,
         .doit = rtnl_bridge_setlink},
        {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add},
        {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del,
         .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
        {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get,
         .dumpit = rtnl_fdb_dump},
        {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add},
        {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del,
         .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
        {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get,
         .dumpit = rtnl_mdb_dump},
};

/* Boot-time rtnetlink setup: register the per-namespace socket operations
 * (panicking if that fails), hook up the netdevice notifier, and register
 * the core message handlers.
 */
void __init rtnetlink_init(void)
{
        int err;

        err = register_pernet_subsys(&rtnetlink_net_ops);
        if (err)
                panic("rtnetlink_init: cannot initialize rtnetlink\n");

        register_netdevice_notifier(&rtnetlink_dev_notifier);

        rtnl_register_many(rtnetlink_rtnl_msg_handlers);
}





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
// SPDX-License-Identifier: GPL-2.0-only
/*
 * GENEVE: Generic Network Virtualization Encapsulation
 *
 * Copyright (c) 2015 Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/etherdevice.h>
#include <linux/hash.h>
#include <net/ipv6_stubs.h>
#include <net/dst_metadata.h>
#include <net/gro_cells.h>
#include <net/rtnetlink.h>
#include <net/geneve.h>
#include <net/gro.h>
#include <net/netdev_lock.h>
#include <net/protocol.h>

/* Driver version string (its consumer is outside this part of the file). */
#define GENEVE_NETDEV_VER        "0.6"

/* The VNI is a 24-bit field: number of distinct values and the matching mask. */
#define GENEVE_N_VID                (1u << 24)
#define GENEVE_VID_MASK                (GENEVE_N_VID - 1)

/* Dimensions of the per-socket VNI hash table (struct geneve_sock.vni_list). */
#define VNI_HASH_BITS                10
#define VNI_HASH_SIZE                (1<<VNI_HASH_BITS)

/* When set, geneve_rx() emits a rate-limited log line for packets whose
 * ECN decapsulation reports an error.
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* The only Geneve header version accepted on receive and written on build. */
#define GENEVE_VER 0
/* Encapsulation overhead: UDP header plus fixed Geneve header (no options),
 * and the same with an Ethernet header and an IPv4 or IPv6 header in front.
 */
#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
#define GENEVE_IPV4_HLEN (ETH_HLEN + sizeof(struct iphdr) + GENEVE_BASE_HLEN)
#define GENEVE_IPV6_HLEN (ETH_HLEN + sizeof(struct ipv6hdr) + GENEVE_BASE_HLEN)

/* per-network namespace private data for this module */
struct geneve_net {
        /* NOTE(review): presumably the list that geneve_dev.next links
         * into; the code adding to it is outside this chunk -- confirm.
         */
        struct list_head        geneve_list;
        /* All geneve_sock instances of this namespace, linked through
         * geneve_sock.list (see geneve_socket_create()/geneve_find_sock()).
         */
        struct list_head        sock_list;
};

/* Id passed to net_generic() to obtain the struct geneve_net of a netns. */
static unsigned int geneve_net_id;

/* Hash-table linkage for one address family of a geneve device.
 * geneve_sock_add() inserts @hlist into a bucket of geneve_sock.vni_list
 * and points @geneve back at the owning device so lookups can return it.
 */
struct geneve_dev_node {
        struct hlist_node hlist;
        struct geneve_dev *geneve;
};

/* Per-device tunnel configuration. */
struct geneve_config {
        /* Tunnel key (tun_id, remote address, tp_dst), mode flags and the
         * dst cache initialised in geneve_init().
         */
        struct ip_tunnel_info        info;
        /* Metadata mode; copied to geneve_sock.collect_md by geneve_sock_add(). */
        bool                        collect_md;
        /* Handed to geneve_socket_create() when the UDP socket is opened. */
        bool                        use_udp6_rx_checksums;
        /* NOTE(review): not used in this chunk; presumably TX-side TTL policy. */
        bool                        ttl_inherit;
        /* NOTE(review): not used in this chunk; presumably DF-bit policy. */
        enum ifla_geneve_df        df;
        /* When false, geneve_udp_encap_recv() drops every packet whose
         * inner protocol is not ETH_P_TEB.
         */
        bool                        inner_proto_inherit;
        /* NOTE(review): not used in this chunk; presumably a UDP source
         * port range -- confirm against the transmit path.
         */
        u16                        port_min;
        u16                        port_max;
};

/* Pseudo network device: private data of a geneve net_device (obtained with
 * netdev_priv()).  sock4/sock6 are RCU-managed pointers that are published
 * by geneve_sock_add() and cleared by geneve_sock_release().
 */
struct geneve_dev {
        struct geneve_dev_node hlist4;        /* vni hash table for IPv4 socket */
#if IS_ENABLED(CONFIG_IPV6)
        struct geneve_dev_node hlist6;        /* vni hash table for IPv6 socket */
#endif
        struct net           *net;        /* netns for packet i/o */
        struct net_device  *dev;        /* netdev for geneve tunnel */
        struct geneve_sock __rcu *sock4;        /* IPv4 socket used for geneve tunnel */
#if IS_ENABLED(CONFIG_IPV6)
        struct geneve_sock __rcu *sock6;        /* IPv6 socket used for geneve tunnel */
#endif
        struct list_head   next;        /* geneve's per namespace list */
        struct gro_cells   gro_cells;        /* receive queueing, see geneve_rx() */
        struct geneve_config cfg;        /* tunnel configuration */
};

/* One UDP tunnel socket, shared by every geneve device that geneve_find_sock()
 * matches to it by address family and port.
 */
struct geneve_sock {
        /* Overwritten by geneve_sock_add() with the adding device's setting. */
        bool                        collect_md;
        /* Linkage in geneve_net.sock_list. */
        struct list_head        list;
        /* Underlying kernel UDP socket. */
        struct socket                *sock;
        /* Used by kfree_rcu() in __geneve_sock_release(). */
        struct rcu_head                rcu;
        /* Plain (non-atomic) user count: 1 on creation, incremented on reuse
         * in geneve_sock_add(), decremented in __geneve_sock_release().
         */
        int                        refcnt;
        /* Devices attached to this socket, bucketed by geneve_net_vni_hash(). */
        struct hlist_head        vni_list[VNI_HASH_SIZE];
};

/* Map a 3-byte VNI (most significant byte first) to an index into a
 * VNI_HASH_SIZE-entry hash table.
 */
static inline __u32 geneve_net_vni_hash(u8 vni[3])
{
        __u32 key = vni[2];

        key |= vni[1] << 8;
        key |= vni[0] << 16;

        return hash_32(key, VNI_HASH_BITS);
}

static __be64 vni_to_tunnel_id(const __u8 *vni)
{
#ifdef __BIG_ENDIAN
        return (vni[0] << 16) | (vni[1] << 8) | vni[2];
#else
        return (__force __be64)(((__force u64)vni[0] << 40) |
                                ((__force u64)vni[1] << 48) |
                                ((__force u64)vni[2] << 56));
#endif
}

/* Inverse of vni_to_tunnel_id(): write the 24-bit VNI held in the last
 * three bytes of the big-endian tunnel ID into vni[0..2].
 */
static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
{
#ifdef __BIG_ENDIAN
        vni[2] = (__force __u8)tun_id;
        vni[1] = (__force __u8)(tun_id >> 8);
        vni[0] = (__force __u8)(tun_id >> 16);
#else
        const u64 raw = (__force u64)tun_id;

        vni[2] = (__u8)(raw >> 56);
        vni[1] = (__u8)(raw >> 48);
        vni[0] = (__u8)(raw >> 40);
#endif
}

/* True when the 3-byte @vni equals bytes 5..7 of the 8-byte tunnel ID
 * that @tun_id points at.
 */
static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni)
{
        const u8 *id_vni = tun_id + 5;

        return memcmp(vni, id_vni, 3) == 0;
}

static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
{
        return gs->sock->sk->sk_family;
}

/* Find the device attached to @gs whose configured tunnel ID matches @vni
 * and whose configured IPv4 destination equals @addr.  Only the hash
 * bucket selected by @vni is walked (RCU list iterator).  Returns NULL
 * when no device matches.
 */
static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
                                        __be32 addr, u8 vni[])
{
        struct hlist_head *bucket = &gs->vni_list[geneve_net_vni_hash(vni)];
        struct geneve_dev_node *pos;

        hlist_for_each_entry_rcu(pos, bucket, hlist) {
                struct ip_tunnel_key *key = &pos->geneve->cfg.info.key;

                if (!eq_tun_id_and_vni((u8 *)&key->tun_id, vni))
                        continue;
                if (key->u.ipv4.dst != addr)
                        continue;

                return pos->geneve;
        }

        return NULL;
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of geneve_lookup(): match on the VNI and on the
 * configured IPv6 destination.  Returns NULL when no device matches.
 */
static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
                                         struct in6_addr addr6, u8 vni[])
{
        struct hlist_head *bucket = &gs->vni_list[geneve_net_vni_hash(vni)];
        struct geneve_dev_node *pos;

        hlist_for_each_entry_rcu(pos, bucket, hlist) {
                struct ip_tunnel_key *key = &pos->geneve->cfg.info.key;

                if (!eq_tun_id_and_vni((u8 *)&key->tun_id, vni))
                        continue;
                if (!ipv6_addr_equal(&addr6, &key->u.ipv6.dst))
                        continue;

                return pos->geneve;
        }

        return NULL;
}
#endif

static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
{
        return (struct genevehdr *)(udp_hdr(skb) + 1);
}

static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
                                            struct sk_buff *skb)
{
        static u8 zero_vni[3];
        u8 *vni;

        if (geneve_get_sk_family(gs) == AF_INET) {
                struct iphdr *iph;
                __be32 addr;

                iph = ip_hdr(skb); /* outer IP header... */

                if (gs->collect_md) {
                        vni = zero_vni;
                        addr = 0;
                } else {
                        vni = geneve_hdr(skb)->vni;
                        addr = iph->saddr;
                }

                return geneve_lookup(gs, addr, vni);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (geneve_get_sk_family(gs) == AF_INET6) {
                static struct in6_addr zero_addr6;
                struct ipv6hdr *ip6h;
                struct in6_addr addr6;

                ip6h = ipv6_hdr(skb); /* outer IPv6 header... */

                if (gs->collect_md) {
                        vni = zero_vni;
                        addr6 = zero_addr6;
                } else {
                        vni = geneve_hdr(skb)->vni;
                        addr6 = ip6h->saddr;
                }

                return geneve6_lookup(gs, addr6, vni);
#endif
        }
        return NULL;
}

/* geneve receive/decap routine
 *
 * Called from geneve_udp_encap_recv() once the outer UDP/Geneve headers have
 * been pulled.  Optionally attaches tunnel metadata, prepares the inner
 * packet, applies ECN decapsulation against the outer IP header and queues
 * the skb on the device's gro_cells.  The skb is either handed to
 * gro_cells_receive() or freed at the drop label.
 */
static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
                      struct sk_buff *skb)
{
        struct genevehdr *gnvh = geneve_hdr(skb);
        struct metadata_dst *tun_dst = NULL;
        unsigned int len;
        int nh, err = 0;
        void *oiph;

        if (ip_tunnel_collect_metadata() || gs->collect_md) {
                IP_TUNNEL_DECLARE_FLAGS(flags) = { };

                /* Mirror the header's OAM and critical bits into the
                 * tunnel flags of the metadata dst.
                 */
                __set_bit(IP_TUNNEL_KEY_BIT, flags);
                __assign_bit(IP_TUNNEL_OAM_BIT, flags, gnvh->oam);
                __assign_bit(IP_TUNNEL_CRIT_OPT_BIT, flags, gnvh->critical);

                /* opt_len is counted in 4-byte units */
                tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
                                         vni_to_tunnel_id(gnvh->vni),
                                         gnvh->opt_len * 4);
                if (!tun_dst) {
                        dev_dstats_rx_dropped(geneve->dev);
                        goto drop;
                }
                /* Update tunnel dst according to Geneve options. */
                ip_tunnel_flags_zero(flags);
                __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, flags);
                ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
                                        gnvh->options, gnvh->opt_len * 4,
                                        flags);
        } else {
                /* Drop packets w/ critical options,
                 * since we don't support any...
                 */
                if (gnvh->critical) {
                        DEV_STATS_INC(geneve->dev, rx_frame_errors);
                        DEV_STATS_INC(geneve->dev, rx_errors);
                        goto drop;
                }
        }

        /* From here on the skb owns the metadata dst (if one was built). */
        if (tun_dst)
                skb_dst_set(skb, &tun_dst->dst);

        if (gnvh->proto_type == htons(ETH_P_TEB)) {
                /* Inner payload is an Ethernet frame. */
                skb_reset_mac_header(skb);
                skb->protocol = eth_type_trans(skb, geneve->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

                /* Ignore packet loops (and multicast echo) */
                if (ether_addr_equal(eth_hdr(skb)->h_source,
                                     geneve->dev->dev_addr)) {
                        DEV_STATS_INC(geneve->dev, rx_errors);
                        goto drop;
                }
        } else {
                /* Inner payload has no Ethernet header: no eth_type_trans()
                 * here, so the receiving device and packet type are set
                 * by hand.
                 */
                skb_reset_mac_header(skb);
                skb->dev = geneve->dev;
                skb->pkt_type = PACKET_HOST;
        }

        /* Save offset of outer header relative to skb->head,
         * because we are going to reset the network header to the inner header
         * and might change skb->head.
         */
        nh = skb_network_header(skb) - skb->head;

        skb_reset_network_header(skb);

        if (!pskb_inet_may_pull(skb)) {
                DEV_STATS_INC(geneve->dev, rx_length_errors);
                DEV_STATS_INC(geneve->dev, rx_errors);
                goto drop;
        }

        /* Get the outer header. */
        oiph = skb->head + nh;

        /* ECN decapsulation against the outer header; when CONFIG_IPV6 is
         * disabled and the socket is not AF_INET, err keeps its initial 0.
         */
        if (geneve_get_sk_family(gs) == AF_INET)
                err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
        else
                err = IP6_ECN_decapsulate(oiph, skb);
#endif

        if (unlikely(err)) {
                if (log_ecn_error) {
                        if (geneve_get_sk_family(gs) == AF_INET)
                                net_info_ratelimited("non-ECT from %pI4 "
                                                     "with TOS=%#x\n",
                                                     &((struct iphdr *)oiph)->saddr,
                                                     ((struct iphdr *)oiph)->tos);
#if IS_ENABLED(CONFIG_IPV6)
                        else
                                net_info_ratelimited("non-ECT from %pI6\n",
                                                     &((struct ipv6hdr *)oiph)->saddr);
#endif
                }
                /* Only results above 1 drop the packet; a result of 1 is
                 * logged (if enabled) and the packet is still delivered.
                 */
                if (err > 1) {
                        DEV_STATS_INC(geneve->dev, rx_frame_errors);
                        DEV_STATS_INC(geneve->dev, rx_errors);
                        goto drop;
                }
        }

        /* Sample the length first: skb must not be touched after it has
         * been handed to gro_cells_receive().
         */
        len = skb->len;
        err = gro_cells_receive(&geneve->gro_cells, skb);
        if (likely(err == NET_RX_SUCCESS))
                dev_dstats_rx_add(geneve->dev, len);

        return;
drop:
        /* Consume bad packet */
        kfree_skb(skb);
}

/* Per-device setup: initialise the GRO cells and the tunnel dst cache, then
 * set the netdev lockdep classes.  If the dst cache cannot be set up, the
 * GRO cells are destroyed again.  Returns 0 or the error of the failing
 * step.
 */
static int geneve_init(struct net_device *dev)
{
        struct geneve_dev *geneve = netdev_priv(dev);
        int err;

        err = gro_cells_init(&geneve->gro_cells, dev);
        if (err)
                return err;

        err = dst_cache_init(&geneve->cfg.info.dst_cache, GFP_KERNEL);
        if (err)
                goto err_gro_cells;

        netdev_lockdep_set_classes(dev);
        return 0;

err_gro_cells:
        gro_cells_destroy(&geneve->gro_cells);
        return err;
}

static void geneve_uninit(struct net_device *dev)
{
        struct geneve_dev *geneve = netdev_priv(dev);

        dst_cache_destroy(&geneve->cfg.info.dst_cache);
        gro_cells_destroy(&geneve->gro_cells);
}

/* Callback from net/ipv4/udp.c to receive packets */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
        struct genevehdr *geneveh;
        struct geneve_dev *geneve;
        struct geneve_sock *gs;
        __be16 inner_proto;
        int opts_len;

        /* Need UDP and Geneve header to be present */
        if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
                goto drop;

        /* Return packets with reserved bits set */
        geneveh = geneve_hdr(skb);
        if (unlikely(geneveh->ver != GENEVE_VER))
                goto drop;

        gs = rcu_dereference_sk_user_data(sk);
        if (!gs)
                goto drop;

        geneve = geneve_lookup_skb(gs, skb);
        if (!geneve)
                goto drop;

        inner_proto = geneveh->proto_type;

        if (unlikely((!geneve->cfg.inner_proto_inherit &&
                      inner_proto != htons(ETH_P_TEB)))) {
                dev_dstats_rx_dropped(geneve->dev);
                goto drop;
        }

        opts_len = geneveh->opt_len * 4;
        if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, inner_proto,
                                 !net_eq(geneve->net, dev_net(geneve->dev)))) {
                dev_dstats_rx_dropped(geneve->dev);
                goto drop;
        }

        geneve_rx(geneve, gs, skb);
        return 0;

drop:
        /* Consume bad packet */
        kfree_skb(skb);
        return 0;
}

/* Callback from net/ipv{4,6}/udp.c to check that we have a tunnel for errors */
static int geneve_udp_encap_err_lookup(struct sock *sk, struct sk_buff *skb)
{
        struct genevehdr *geneveh;
        struct geneve_sock *gs;
        u8 zero_vni[3] = { 0 };
        u8 *vni = zero_vni;

        if (!pskb_may_pull(skb, skb_transport_offset(skb) + GENEVE_BASE_HLEN))
                return -EINVAL;

        geneveh = geneve_hdr(skb);
        if (geneveh->ver != GENEVE_VER)
                return -EINVAL;

        if (geneveh->proto_type != htons(ETH_P_TEB))
                return -EINVAL;

        gs = rcu_dereference_sk_user_data(sk);
        if (!gs)
                return -ENOENT;

        if (geneve_get_sk_family(gs) == AF_INET) {
                struct iphdr *iph = ip_hdr(skb);
                __be32 addr4 = 0;

                if (!gs->collect_md) {
                        vni = geneve_hdr(skb)->vni;
                        addr4 = iph->daddr;
                }

                return geneve_lookup(gs, addr4, vni) ? 0 : -ENOENT;
        }

#if IS_ENABLED(CONFIG_IPV6)
        if (geneve_get_sk_family(gs) == AF_INET6) {
                struct ipv6hdr *ip6h = ipv6_hdr(skb);
                struct in6_addr addr6;

                memset(&addr6, 0, sizeof(struct in6_addr));

                if (!gs->collect_md) {
                        vni = geneve_hdr(skb)->vni;
                        addr6 = ip6h->daddr;
                }

                return geneve6_lookup(gs, addr6, vni) ? 0 : -ENOENT;
        }
#endif

        return -EPFNOSUPPORT;
}

/* Open the kernel UDP socket for a tunnel endpoint on local port @port.
 * IPv6 sockets are created v6-only and take the RX checksum setting from
 * @ipv6_rx_csum; IPv4 sockets use local address INADDR_ANY.  GSO is allowed
 * on the new socket.  Returns the socket or an ERR_PTR() carrying the
 * udp_sock_create() error.
 */
static struct socket *geneve_create_sock(struct net *net, bool ipv6,
                                         __be16 port, bool ipv6_rx_csum)
{
        struct udp_port_cfg cfg;
        struct socket *sock;
        int err;

        memset(&cfg, 0, sizeof(cfg));
        cfg.local_udp_port = port;

        if (!ipv6) {
                cfg.family = AF_INET;
                cfg.local_ip.s_addr = htonl(INADDR_ANY);
        } else {
                cfg.family = AF_INET6;
                cfg.ipv6_v6only = 1;
                cfg.use_udp6_rx_checksums = ipv6_rx_csum;
        }

        /* Open UDP socket */
        err = udp_sock_create(net, &cfg, &sock);
        if (err < 0)
                return ERR_PTR(err);

        udp_allow_gso(sock->sk);

        return sock;
}

/* Total Geneve header length in bytes: fixed header plus options, where
 * opt_len counts 4-byte units.
 */
static int geneve_hlen(struct genevehdr *gh)
{
        return gh->opt_len * 4 + sizeof(*gh);
}

/* GRO receive hook installed on the tunnel socket by geneve_socket_create().
 *
 * Validates the Geneve header at the current GRO offset, clears same_flow
 * on held packets whose Geneve header differs, pulls the header and hands
 * the skb to the GRO receive handler of the inner protocol.  Returns what
 * that handler returns, or NULL when the packet is not aggregated here.
 */
static struct sk_buff *geneve_gro_receive(struct sock *sk,
                                          struct list_head *head,
                                          struct sk_buff *skb)
{
        struct sk_buff *pp = NULL;
        struct sk_buff *p;
        struct genevehdr *gh, *gh2;
        unsigned int hlen, gh_len, off_gnv;
        const struct packet_offload *ptype;
        __be16 type;
        /* Flush by default; cleared only once an inner handler has run. */
        int flush = 1;

        /* First make the fixed-size part of the header accessible. */
        off_gnv = skb_gro_offset(skb);
        hlen = off_gnv + sizeof(*gh);
        gh = skb_gro_header(skb, hlen, off_gnv);
        if (unlikely(!gh))
                goto out;

        /* Unsupported versions and OAM packets are not aggregated. */
        if (gh->ver != GENEVE_VER || gh->oam)
                goto out;
        gh_len = geneve_hlen(gh);

        /* Now that the option length is known, make sure the complete
         * header (including options) is accessible as well.
         */
        hlen = off_gnv + gh_len;
        if (!skb_gro_may_pull(skb, hlen)) {
                gh = skb_gro_header_slow(skb, hlen, off_gnv);
                if (unlikely(!gh))
                        goto out;
        }

        /* A held packet stays in the same flow only if its whole Geneve
         * header, options included, is byte-identical to this one.
         */
        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                gh2 = (struct genevehdr *)(p->data + off_gnv);
                if (gh->opt_len != gh2->opt_len ||
                    memcmp(gh, gh2, gh_len)) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }
        }

        /* Step over the Geneve header and keep the checksum state in sync. */
        skb_gro_pull(skb, gh_len);
        skb_gro_postpull_rcsum(skb, gh, gh_len);
        type = gh->proto_type;
        /* Ethernet payload: hand over directly; this path returns without
         * going through skb_gro_flush_final() below.
         */
        if (likely(type == htons(ETH_P_TEB)))
                return call_gro_receive(eth_gro_receive, head, skb);

        /* Any other payload: use the offload registered for its type. */
        ptype = gro_find_receive_by_type(type);
        if (!ptype)
                goto out;

        pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
        flush = 0;

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}

/* GRO complete hook: delegate to the completion handler of the inner
 * protocol, located just behind the Geneve header that starts at @nhoff.
 * For non-Ethernet payloads the inner mac header is set here.  Returns the
 * handler's result, or -ENOSYS when no handler is registered for the type.
 */
static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
                               int nhoff)
{
        struct genevehdr *gh = (struct genevehdr *)(skb->data + nhoff);
        int inner_off = nhoff + geneve_hlen(gh);
        __be16 type = gh->proto_type;
        struct packet_offload *ptype;
        int err = -ENOSYS;

        /* since skb->encapsulation is set, eth_gro_complete() sets the inner mac header */
        if (likely(type == htons(ETH_P_TEB)))
                return eth_gro_complete(skb, inner_off);

        ptype = gro_find_complete_by_type(type);
        if (ptype)
                err = ptype->callbacks.gro_complete(skb, inner_off);

        skb_set_inner_mac_header(skb, inner_off);

        return err;
}

/* Create a new tunnel socket for @port in @net.
 *
 * Allocates the geneve_sock with one reference, opens the UDP socket,
 * announces the RX port, installs the encapsulation and GRO callbacks and
 * links the result into the namespace's sock_list.  Returns the new
 * geneve_sock or an ERR_PTR().
 */
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
                                                bool ipv6, bool ipv6_rx_csum)
{
        struct geneve_net *gn = net_generic(net, geneve_net_id);
        struct udp_tunnel_sock_cfg cfg;
        struct geneve_sock *gs;
        struct socket *sock;
        int i;

        gs = kzalloc(sizeof(*gs), GFP_KERNEL);
        if (!gs)
                return ERR_PTR(-ENOMEM);

        sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum);
        if (IS_ERR(sock))
                goto err_free;

        gs->refcnt = 1;
        gs->sock = sock;
        for (i = 0; i < VNI_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&gs->vni_list[i]);

        /* Initialize the geneve udp offloads structure */
        udp_tunnel_notify_add_rx_port(sock, UDP_TUNNEL_TYPE_GENEVE);

        /* Mark socket as an encapsulation socket */
        memset(&cfg, 0, sizeof(cfg));
        cfg.encap_type = 1;
        cfg.sk_user_data = gs;
        cfg.encap_rcv = geneve_udp_encap_recv;
        cfg.encap_err_lookup = geneve_udp_encap_err_lookup;
        cfg.encap_destroy = NULL;
        cfg.gro_receive = geneve_gro_receive;
        cfg.gro_complete = geneve_gro_complete;
        setup_udp_tunnel_sock(net, sock, &cfg);

        list_add(&gs->list, &gn->sock_list);
        return gs;

err_free:
        kfree(gs);
        return ERR_CAST(sock);
}

/* Drop one reference on @gs (NULL is accepted).  When the count reaches
 * zero the socket is unlinked from its list, the RX port is withdrawn, the
 * UDP tunnel socket is released and @gs is freed via kfree_rcu().
 */
static void __geneve_sock_release(struct geneve_sock *gs)
{
        if (!gs)
                return;

        gs->refcnt--;
        if (gs->refcnt)
                return;

        list_del(&gs->list);
        udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
        udp_tunnel_sock_release(gs->sock);
        kfree_rcu(gs, rcu);
}

/* Detach @geneve from its sockets: clear the sock4/sock6 pointers, call
 * synchronize_net(), then drop the references that were held through them.
 * The pointers are read with rtnl_dereference().
 */
static void geneve_sock_release(struct geneve_dev *geneve)
{
        struct geneve_sock *old4 = rtnl_dereference(geneve->sock4);
#if IS_ENABLED(CONFIG_IPV6)
        struct geneve_sock *old6 = rtnl_dereference(geneve->sock6);

        rcu_assign_pointer(geneve->sock6, NULL);
#endif

        rcu_assign_pointer(geneve->sock4, NULL);

        /* NOTE(review): presumably lets readers that still see the old
         * pointers finish before the references are dropped -- confirm.
         */
        synchronize_net();

        __geneve_sock_release(old4);
#if IS_ENABLED(CONFIG_IPV6)
        __geneve_sock_release(old6);
#endif
}

/* Search the namespace's socket list for a socket of address family
 * @family whose inet_sport equals @dst_port.  Returns NULL if there is none.
 */
static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
                                            sa_family_t family,
                                            __be16 dst_port)
{
        struct geneve_sock *gs;

        list_for_each_entry(gs, &gn->sock_list, list) {
                if (geneve_get_sk_family(gs) != family)
                        continue;
                if (inet_sk(gs->sock->sk)->inet_sport != dst_port)
                        continue;

                return gs;
        }

        return NULL;
}

/* Attach @geneve to the IPv4 or IPv6 (@ipv6) tunnel socket for its
 * configured tp_dst port.  An existing socket gets an extra reference;
 * otherwise a new one is created.  The socket's collect_md flag is set from
 * the device, the device's sock4/sock6 pointer is published and its hash
 * node is added to the socket's VNI table.  Returns 0 or a negative errno
 * from socket creation.
 */
static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
{
        struct net *net = geneve->net;
        struct geneve_net *gn = net_generic(net, geneve_net_id);
        __be16 port = geneve->cfg.info.key.tp_dst;
        struct geneve_dev_node *node;
        struct geneve_sock *gs;
        __u8 vni[3];

        gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, port);
        if (!gs) {
                gs = geneve_socket_create(net, port, ipv6,
                                          geneve->cfg.use_udp6_rx_checksums);
                if (IS_ERR(gs))
                        return PTR_ERR(gs);
        } else {
                gs->refcnt++;
        }

        gs->collect_md = geneve->cfg.collect_md;

#if IS_ENABLED(CONFIG_IPV6)
        if (ipv6) {
                node = &geneve->hlist6;
                rcu_assign_pointer(geneve->sock6, gs);
        } else
#endif
        {
                node = &geneve->hlist4;
                rcu_assign_pointer(geneve->sock4, gs);
        }
        node->geneve = geneve;

        /* Bucket is chosen from the device's configured VNI */
        tunnel_id_to_vni(geneve->cfg.info.key.tun_id, vni);
        hlist_add_head_rcu(&node->hlist,
                           &gs->vni_list[geneve_net_vni_hash(vni)]);

        return 0;
}

/* Open callback of the geneve net_device: attach the tunnel sockets the
 * device needs.
 *
 * A collect_md device wants both families.  Otherwise the device uses IPv6
 * when its tunnel info carries IP_TUNNEL_INFO_IPV6 and IPv4 when it does
 * not.  An IPv6 failure other than -EAFNOSUPPORT suppresses the IPv4
 * attempt.  If the final result is an error, every socket attached so far
 * is released again.
 */
static int geneve_open(struct net_device *dev)
{
        struct geneve_dev *geneve = netdev_priv(dev);
        const bool md = geneve->cfg.collect_md;
        bool want6 = md || (geneve->cfg.info.mode & IP_TUNNEL_INFO_IPV6);
        bool want4 = md || !want6;
        int ret = 0;

#if IS_ENABLED(CONFIG_IPV6)
        if (want6) {
                ret = geneve_sock_add(geneve, true);
                if (ret < 0 && ret != -EAFNOSUPPORT)
                        want4 = false;
        }
#endif
        if (want4)
                ret = geneve_sock_add(geneve, false);

        if (ret < 0)
                geneve_sock_release(geneve);

        return ret;
}

/* ndo_stop handler: unhash the device from the per-socket VNI lists and
 * release its socket reference(s).  Always returns 0.
 */
static int geneve_stop(struct net_device *dev)
{
        struct geneve_dev *gnv = netdev_priv(dev);

        /* Unlink the device nodes before the sockets are released. */
        hlist_del_init_rcu(&gnv->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
        hlist_del_init_rcu(&gnv->hlist6.hlist);
#endif

        geneve_sock_release(gnv);

        return 0;
}

/* Fill in the GENEVE header at @geneveh from the tunnel metadata @info.
 *
 * @inner_proto is stored as the header's protocol type.  opt_len is
 * written as info->options_len / 4.  When the tunnel flags carry
 * IP_TUNNEL_GENEVE_OPT_BIT the options are copied in after the fixed
 * header; the caller must have reserved room for them.
 */
static void geneve_build_header(struct genevehdr *geneveh,
                                const struct ip_tunnel_info *info,
                                __be16 inner_proto)
{
        const struct ip_tunnel_key *key = &info->key;

        /* Fixed fields. */
        geneveh->ver = GENEVE_VER;
        geneveh->opt_len = info->options_len / 4;
        geneveh->proto_type = inner_proto;
        tunnel_id_to_vni(key->tun_id, geneveh->vni);

        /* Header flag bits mirror the corresponding tunnel flags. */
        geneveh->oam = test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags);
        geneveh->critical = test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags);

        /* Reserved fields are always zeroed. */
        geneveh->rsvd1 = 0;
        geneveh->rsvd2 = 0;

        if (!test_bit(IP_TUNNEL_GENEVE_OPT_BIT, key->tun_flags))
                return;

        ip_tunnel_info_opts_get(geneveh->options, info);
}

/* Prepare @skb for tunnel transmission and push the GENEVE header.
 *
 * Scrubs the packet, ensures enough headroom for the link layer, the
 * outer IP header (@ip_hdr_len), UDP/GENEVE and any options, sets up
 * offloads, then writes the GENEVE header.  The inner protocol is
 * skb->protocol when @inner_proto_inherit is set, ETH_P_TEB otherwise.
 *
 * Returns 0 on success.  On failure a negative errno is returned and the
 * reference on @dst is dropped here.
 */
static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
                            const struct ip_tunnel_info *info,
                            bool xnet, int ip_hdr_len,
                            bool inner_proto_inherit)
{
        bool csum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
        struct genevehdr *hdr;
        __be16 proto;
        int headroom;
        int rc;

        skb_reset_mac_header(skb);
        skb_scrub_packet(skb, xnet);

        headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
                   GENEVE_BASE_HLEN + info->options_len + ip_hdr_len;

        rc = skb_cow_head(skb, headroom);
        if (!rc)
                rc = udp_tunnel_handle_offloads(skb, csum);
        if (rc) {
                /* Route reference is dropped here on every failure. */
                dst_release(dst);
                return rc;
        }

        hdr = __skb_push(skb, sizeof(*hdr) + info->options_len);
        if (inner_proto_inherit)
                proto = skb->protocol;
        else
                proto = htons(ETH_P_TEB);

        geneve_build_header(hdr, info, proto);
        skb_set_inner_protocol(skb, proto);

        return 0;
}

/* Pick the DS field (TOS / traffic class) for the outer header.
 *
 * Normally this is info->key.tos.  The special value 1 on a device that is
 * not in collect_md mode means "inherit": the value is taken from the
 * inner packet and *@use_cache is cleared so the caller skips the dst
 * cache.  *@use_cache is left untouched in every other case.
 */
static u8 geneve_get_dsfield(struct sk_buff *skb, struct net_device *dev,
                             const struct ip_tunnel_info *info,
                             bool *use_cache)
{
        struct geneve_dev *geneve = netdev_priv(dev);
        u8 tos = info->key.tos;

        if (tos != 1 || geneve->cfg.collect_md)
                return tos;

        tos = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
        *use_cache = false;

        return tos;
}

/* Encapsulate @skb in GENEVE over UDP/IPv4 and transmit it.
 *
 * @info supplies the tunnel key (destination, VNI, flags, options).
 * Reads geneve->sock4 with rcu_dereference().
 *
 * Returns 0 once the packet has been handed to udp_tunnel_xmit_skb(), or a
 * negative errno.  -EMSGSIZE is returned after the skb has been passed to
 * __netif_rx(); geneve_xmit() does not free the skb for that code.
 */
static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
                           struct geneve_dev *geneve,
                           const struct ip_tunnel_info *info)
{
        bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
        bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
        struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
        const struct ip_tunnel_key *key = &info->key;
        struct rtable *rt;
        bool use_cache;
        __u8 tos, ttl;
        __be16 df = 0;
        __be32 saddr;
        __be16 sport;
        int err;

        if (skb_vlan_inet_prepare(skb, inner_proto_inherit))
                return -EINVAL;

        /* No IPv4 socket attached to this device. */
        if (!gs4)
                return -EIO;

        /* geneve_get_dsfield() may clear use_cache when TOS is inherited. */
        use_cache = ip_tunnel_dst_cache_usable(skb, info);
        tos = geneve_get_dsfield(skb, dev, info, &use_cache);
        sport = udp_flow_src_port(geneve->net, skb,
                                  geneve->cfg.port_min,
                                  geneve->cfg.port_max, true);

        rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr,
                                   &info->key,
                                   sport, geneve->cfg.info.key.tp_dst, tos,
                                   use_cache ?
                                   (struct dst_cache *)&info->dst_cache : NULL);
        if (IS_ERR(rt))
                return PTR_ERR(rt);

        /* From here on every exit path must drop the route reference,
         * either directly or through geneve_build_skb() /
         * udp_tunnel_xmit_skb().
         */
        err = skb_tunnel_check_pmtu(skb, &rt->dst,
                                    GENEVE_IPV4_HLEN + info->options_len,
                                    netif_is_any_bridge_port(dev));
        if (err < 0) {
                dst_release(&rt->dst);
                return err;
        } else if (err) {
                /* NOTE(review): a positive return presumably means the skb
                 * now holds a PMTU error reply to be looped back to the
                 * sender - confirm against skb_tunnel_check_pmtu().
                 * This local 'info' shadows the function parameter.
                 */
                struct ip_tunnel_info *info;

                info = skb_tunnel_info(skb);
                if (info) {
                        struct ip_tunnel_info *unclone;

                        unclone = skb_tunnel_info_unclone(skb);
                        if (unlikely(!unclone)) {
                                dst_release(&rt->dst);
                                return -ENOMEM;
                        }

                        /* Turn the metadata around: our source address
                         * becomes the destination, the old destination
                         * becomes the source.
                         */
                        unclone->key.u.ipv4.dst = saddr;
                        unclone->key.u.ipv4.src = info->key.u.ipv4.dst;
                }

                if (!pskb_may_pull(skb, ETH_HLEN)) {
                        dst_release(&rt->dst);
                        return -EINVAL;
                }

                /* Feed the skb back into the receive path of this device. */
                skb->protocol = eth_type_trans(skb, geneve->dev);
                __netif_rx(skb);
                dst_release(&rt->dst);
                return -EMSGSIZE;
        }

        tos = ip_tunnel_ecn_encap(tos, ip_hdr(skb), skb);
        if (geneve->cfg.collect_md) {
                /* Externally controlled: TTL and DF come from the key. */
                ttl = key->ttl;

                df = test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags) ?
                     htons(IP_DF) : 0;
        } else {
                if (geneve->cfg.ttl_inherit)
                        ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
                else
                        ttl = key->ttl;
                /* A TTL of 0 falls back to the route's hop limit. */
                ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);

                if (geneve->cfg.df == GENEVE_DF_SET) {
                        df = htons(IP_DF);
                } else if (geneve->cfg.df == GENEVE_DF_INHERIT) {
                        struct ethhdr *eth = skb_eth_hdr(skb);

                        /* Inner IPv6 always sets DF; inner IPv4 copies its
                         * own DF bit; anything else leaves DF clear.
                         */
                        if (ntohs(eth->h_proto) == ETH_P_IPV6) {
                                df = htons(IP_DF);
                        } else if (ntohs(eth->h_proto) == ETH_P_IP) {
                                struct iphdr *iph = ip_hdr(skb);

                                if (iph->frag_off & htons(IP_DF))
                                        df = htons(IP_DF);
                        }
                }
        }

        /* geneve_build_skb() releases the route itself on failure. */
        err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr),
                               inner_proto_inherit);
        if (unlikely(err))
                return err;

        /* Last argument is "no checksum": set when the CSUM flag is absent. */
        udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst,
                            tos, ttl, df, sport, geneve->cfg.info.key.tp_dst,
                            !net_eq(geneve->net, dev_net(geneve->dev)),
                            !test_bit(IP_TUNNEL_CSUM_BIT,
                                      info->key.tun_flags));
        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Encapsulate @skb in GENEVE over UDP/IPv6 and transmit it.
 *
 * @info supplies the tunnel key (destination, VNI, flow label, flags,
 * options).  Reads geneve->sock6 with rcu_dereference().
 *
 * Returns 0 once the packet has been handed to udp_tunnel6_xmit_skb(), or
 * a negative errno.  -EMSGSIZE is returned after the skb has been passed
 * to __netif_rx(); geneve_xmit() does not free the skb for that code.
 */
static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
                            struct geneve_dev *geneve,
                            const struct ip_tunnel_info *info)
{
        bool inner_proto_inherit = geneve->cfg.inner_proto_inherit;
        bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
        struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
        const struct ip_tunnel_key *key = &info->key;
        struct dst_entry *dst = NULL;
        struct in6_addr saddr;
        bool use_cache;
        __u8 prio, ttl;
        __be16 sport;
        int err;

        if (skb_vlan_inet_prepare(skb, inner_proto_inherit))
                return -EINVAL;

        /* No IPv6 socket attached to this device. */
        if (!gs6)
                return -EIO;

        /* geneve_get_dsfield() may clear use_cache when TOS is inherited. */
        use_cache = ip_tunnel_dst_cache_usable(skb, info);
        prio = geneve_get_dsfield(skb, dev, info, &use_cache);
        sport = udp_flow_src_port(geneve->net, skb,
                                  geneve->cfg.port_min,
                                  geneve->cfg.port_max, true);

        dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0,
                                     &saddr, key, sport,
                                     geneve->cfg.info.key.tp_dst, prio,
                                     use_cache ?
                                     (struct dst_cache *)&info->dst_cache : NULL);
        if (IS_ERR(dst))
                return PTR_ERR(dst);

        /* From here on every exit path must drop the dst reference,
         * either directly or through geneve_build_skb() /
         * udp_tunnel6_xmit_skb().
         */
        err = skb_tunnel_check_pmtu(skb, dst,
                                    GENEVE_IPV6_HLEN + info->options_len,
                                    netif_is_any_bridge_port(dev));
        if (err < 0) {
                dst_release(dst);
                return err;
        } else if (err) {
                /* NOTE(review): a positive return presumably means the skb
                 * now holds a PMTU error reply to be looped back to the
                 * sender - confirm against skb_tunnel_check_pmtu().
                 * This local 'info' shadows the function parameter.
                 */
                struct ip_tunnel_info *info = skb_tunnel_info(skb);

                if (info) {
                        struct ip_tunnel_info *unclone;

                        unclone = skb_tunnel_info_unclone(skb);
                        if (unlikely(!unclone)) {
                                dst_release(dst);
                                return -ENOMEM;
                        }

                        /* Turn the metadata around: our source address
                         * becomes the destination, the old destination
                         * becomes the source.
                         */
                        unclone->key.u.ipv6.dst = saddr;
                        unclone->key.u.ipv6.src = info->key.u.ipv6.dst;
                }

                if (!pskb_may_pull(skb, ETH_HLEN)) {
                        dst_release(dst);
                        return -EINVAL;
                }

                /* Feed the skb back into the receive path of this device. */
                skb->protocol = eth_type_trans(skb, geneve->dev);
                __netif_rx(skb);
                dst_release(dst);
                return -EMSGSIZE;
        }

        prio = ip_tunnel_ecn_encap(prio, ip_hdr(skb), skb);
        if (geneve->cfg.collect_md) {
                /* Externally controlled: hop limit comes from the key. */
                ttl = key->ttl;
        } else {
                if (geneve->cfg.ttl_inherit)
                        ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb);
                else
                        ttl = key->ttl;
                /* A hop limit of 0 falls back to the route's default. */
                ttl = ttl ? : ip6_dst_hoplimit(dst);
        }
        /* geneve_build_skb() releases the dst itself on failure. */
        err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr),
                               inner_proto_inherit);
        if (unlikely(err))
                return err;

        /* Last argument is "no checksum": set when the CSUM flag is absent. */
        udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
                             &saddr, &key->u.ipv6.dst, prio, ttl,
                             info->key.label, sport, geneve->cfg.info.key.tp_dst,
                             !test_bit(IP_TUNNEL_CSUM_BIT,
                                       info->key.tun_flags));
        return 0;
}
#endif

/* ndo_start_xmit handler.
 *
 * Selects the tunnel metadata (per-skb in collect_md mode, otherwise the
 * device configuration), then dispatches to the IPv4 or IPv6 transmit
 * helper under rcu_read_lock().  On failure the skb is freed here unless
 * the helper returned -EMSGSIZE, and the error counters are updated.
 *
 * Always returns NETDEV_TX_OK.
 */
static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct geneve_dev *geneve = netdev_priv(dev);
        struct ip_tunnel_info *info;
        int err;

        if (!geneve->cfg.collect_md) {
                info = &geneve->cfg.info;
        } else {
                info = skb_tunnel_info(skb);
                if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
                        netdev_dbg(dev, "no tunnel metadata\n");
                        dev_kfree_skb(skb);
                        dev_dstats_tx_dropped(dev);
                        return NETDEV_TX_OK;
                }
        }

        rcu_read_lock();
#if IS_ENABLED(CONFIG_IPV6)
        if (info->mode & IP_TUNNEL_INFO_IPV6)
                err = geneve6_xmit_skb(skb, dev, geneve, info);
        else
#endif
                err = geneve_xmit_skb(skb, dev, geneve, info);
        rcu_read_unlock();

        if (likely(!err))
                return NETDEV_TX_OK;

        /* On -EMSGSIZE the helper already passed the skb on. */
        if (err != -EMSGSIZE)
                dev_kfree_skb(skb);

        switch (err) {
        case -ELOOP:
                DEV_STATS_INC(dev, collisions);
                break;
        case -ENETUNREACH:
                DEV_STATS_INC(dev, tx_carrier_errors);
                break;
        default:
                break;
        }

        DEV_STATS_INC(dev, tx_errors);
        return NETDEV_TX_OK;
}

/* ndo_change_mtu handler.
 *
 * Rather than rejecting an out-of-range request, the value is silently
 * limited to [dev->min_mtu, dev->max_mtu] before being stored.
 * Always returns 0.
 */
static int geneve_change_mtu(struct net_device *dev, int new_mtu)
{
        int mtu = new_mtu;

        /* The upper bound is tested first. */
        if (mtu > dev->max_mtu) {
                mtu = dev->max_mtu;
        } else if (mtu < dev->min_mtu) {
                mtu = dev->min_mtu;
        }

        WRITE_ONCE(dev->mtu, mtu);

        return 0;
}

/* ndo_fill_metadata_dst handler.
 *
 * Performs the same route lookup the transmit path would do for @skb and
 * records the result in the skb's tunnel metadata: the chosen source
 * address, the UDP source port and the device's destination port.  The
 * route itself is released again immediately.
 *
 * Returns 0 on success, -EIO when the device has no socket for the address
 * family, -EINVAL for an unsupported family, or the lookup error.
 */
static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        struct geneve_dev *geneve = netdev_priv(dev);
        __be16 sport;

        switch (ip_tunnel_info_af(info)) {
        case AF_INET: {
                struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
                struct rtable *rt;
                bool use_cache;
                __be32 saddr;
                u8 tos;

                if (!gs4)
                        return -EIO;

                use_cache = ip_tunnel_dst_cache_usable(skb, info);
                tos = geneve_get_dsfield(skb, dev, info, &use_cache);
                sport = udp_flow_src_port(geneve->net, skb,
                                          geneve->cfg.port_min,
                                          geneve->cfg.port_max, true);

                rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr,
                                           &info->key, sport,
                                           geneve->cfg.info.key.tp_dst, tos,
                                           use_cache ? &info->dst_cache : NULL);
                if (IS_ERR(rt))
                        return PTR_ERR(rt);

                /* Only the source address is needed, not the route. */
                ip_rt_put(rt);
                info->key.u.ipv4.src = saddr;
                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6: {
                struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
                struct dst_entry *dst;
                struct in6_addr saddr;
                bool use_cache;
                u8 prio;

                if (!gs6)
                        return -EIO;

                use_cache = ip_tunnel_dst_cache_usable(skb, info);
                prio = geneve_get_dsfield(skb, dev, info, &use_cache);
                sport = udp_flow_src_port(geneve->net, skb,
                                          geneve->cfg.port_min,
                                          geneve->cfg.port_max, true);

                dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock,
                                             0, &saddr, &info->key, sport,
                                             geneve->cfg.info.key.tp_dst, prio,
                                             use_cache ? &info->dst_cache : NULL);
                if (IS_ERR(dst))
                        return PTR_ERR(dst);

                /* Only the source address is needed, not the route. */
                dst_release(dst);
                info->key.u.ipv6.src = saddr;
                break;
        }
#endif
        default:
                return -EINVAL;
        }

        info->key.tp_src = sport;
        info->key.tp_dst = geneve->cfg.info.key.tp_dst;
        return 0;
}

/* net_device callbacks for GENEVE devices; installed by geneve_setup().
 * Address validation and MAC changes use the generic Ethernet helpers.
 */
static const struct net_device_ops geneve_netdev_ops = {
        .ndo_init                = geneve_init,
        .ndo_uninit                = geneve_uninit,
        .ndo_open                = geneve_open,
        .ndo_stop                = geneve_stop,
        .ndo_start_xmit                = geneve_xmit,
        .ndo_change_mtu                = geneve_change_mtu,
        .ndo_validate_addr        = eth_validate_addr,
        .ndo_set_mac_address        = eth_mac_addr,
        .ndo_fill_metadata_dst        = geneve_fill_metadata_dst,
};

/* ethtool get_drvinfo handler: report the driver name "geneve" and the
 * driver version string.  @dev is not consulted.
 */
static void geneve_get_drvinfo(struct net_device *dev,
                               struct ethtool_drvinfo *drvinfo)
{
        /* strscpy() bounds each copy to the destination field size. */
        strscpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
        strscpy(drvinfo->version, GENEVE_NETDEV_VER,
                sizeof(drvinfo->version));
}

/* ethtool callbacks for GENEVE devices; installed by geneve_setup().
 * Link state is reported through the generic ethtool helper.
 */
static const struct ethtool_ops geneve_ethtool_ops = {
        .get_drvinfo        = geneve_get_drvinfo,
        .get_link        = ethtool_op_get_link,
};

/* Device type attached in geneve_setup() via SET_NETDEV_DEVTYPE(); tells
 * udev that this is a virtual tunnel endpoint.
 */
static const struct device_type geneve_type = {
        .name = "geneve",
};

/* Announce (@push == true) or withdraw (@push == false) every listening
 * GENEVE UDP port in @dev's network namespace to/from @dev, so the device
 * can set up or tear down RX offload for them.  Callers are expected to
 * implement ndo_udp_tunnel_add.
 *
 * The socket list is walked under rcu_read_lock().
 */
static void geneve_offload_rx_ports(struct net_device *dev, bool push)
{
        struct geneve_net *gn = net_generic(dev_net(dev), geneve_net_id);
        struct geneve_sock *cur;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &gn->sock_list, list) {
                if (!push) {
                        udp_tunnel_drop_rx_port(dev, cur->sock,
                                                UDP_TUNNEL_TYPE_GENEVE);
                        continue;
                }
                udp_tunnel_push_rx_port(dev, cur->sock,
                                        UDP_TUNNEL_TYPE_GENEVE);
        }
        rcu_read_unlock();
}

/* rtnl setup callback: initialise a freshly allocated GENEVE net_device
 * with Ethernet defaults, the driver's ops tables, feature flags, MTU
 * limits and a random MAC address.
 */
static void geneve_setup(struct net_device *dev)
{
        ether_setup(dev);

        dev->netdev_ops = &geneve_netdev_ops;
        dev->ethtool_ops = &geneve_ethtool_ops;
        dev->needs_free_netdev = true;

        SET_NETDEV_DEVTYPE(dev, &geneve_type);

        /* The same feature set is both enabled and user-toggleable. */
        dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST |
                            NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE;
        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST |
                            NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE;

        dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;

        /* MTU range: 68 - (something less than 65535).
         *
         * The upper bound deliberately ignores GENEVE options so that
         * potentially valid configurations are not excluded.  It is
         * reduced further by the IPvX header size once the address
         * family is known.
         */
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len;

        netif_keep_dst(dev);
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
        dev->lltx = true;

        eth_hw_addr_random(dev);
}

/* Netlink attribute policy for IFLA_GENEVE_* link attributes.
 *
 * Strict validation starts at IFLA_GENEVE_INNER_PROTO_INHERIT.  The
 * remote addresses are described by length only (IPv4 address size and
 * sizeof(struct in6_addr)); the port range must be exactly one
 * struct ifla_geneve_port_range.
 */
static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
        [IFLA_GENEVE_UNSPEC]                = { .strict_start_type = IFLA_GENEVE_INNER_PROTO_INHERIT },
        [IFLA_GENEVE_ID]                = { .type = NLA_U32 },
        [IFLA_GENEVE_REMOTE]                = { .len = sizeof_field(struct iphdr, daddr) },
        [IFLA_GENEVE_REMOTE6]                = { .len = sizeof(struct in6_addr) },
        [IFLA_GENEVE_TTL]                = { .type = NLA_U8 },
        [IFLA_GENEVE_TOS]                = { .type = NLA_U8 },
        [IFLA_GENEVE_LABEL]                = { .type = NLA_U32 },
        [IFLA_GENEVE_PORT]                = { .type = NLA_U16 },
        [IFLA_GENEVE_COLLECT_METADATA]        = { .type = NLA_FLAG },
        [IFLA_GENEVE_UDP_CSUM]                = { .type = NLA_U8 },
        [IFLA_GENEVE_UDP_ZERO_CSUM6_TX]        = { .type = NLA_U8 },
        [IFLA_GENEVE_UDP_ZERO_CSUM6_RX]        = { .type = NLA_U8 },
        [IFLA_GENEVE_TTL_INHERIT]        = { .type = NLA_U8 },
        [IFLA_GENEVE_DF]                = { .type = NLA_U8 },
        [IFLA_GENEVE_INNER_PROTO_INHERIT]        = { .type = NLA_FLAG },
        [IFLA_GENEVE_PORT_RANGE]        = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)),
};

/* rtnl validate callback: sanity-check link attributes before a GENEVE
 * device is created or changed.
 *
 * Checks, in order: the optional link-layer address (must be a valid
 * unicast Ethernet address), presence of @data, the VNI range, the DF
 * mode and the source port range (high must not be below low).
 *
 * Returns 0 when everything is acceptable, otherwise -EINVAL,
 * -EADDRNOTAVAIL or -ERANGE with @extack describing the offending
 * attribute.
 */
static int geneve_validate(struct nlattr *tb[], struct nlattr *data[],
                           struct netlink_ext_ack *extack)
{
        struct nlattr *addr = tb[IFLA_ADDRESS];
        struct nlattr *attr;

        if (addr) {
                if (nla_len(addr) != ETH_ALEN) {
                        NL_SET_ERR_MSG_ATTR(extack, addr,
                                            "Provided link layer address is not Ethernet");
                        return -EINVAL;
                }

                if (!is_valid_ether_addr(nla_data(addr))) {
                        NL_SET_ERR_MSG_ATTR(extack, addr,
                                            "Provided Ethernet address is not unicast");
                        return -EADDRNOTAVAIL;
                }
        }

        if (!data) {
                NL_SET_ERR_MSG(extack,
                               "Not enough attributes provided to perform the operation");
                return -EINVAL;
        }

        attr = data[IFLA_GENEVE_ID];
        if (attr && nla_get_u32(attr) >= GENEVE_N_VID) {
                NL_SET_ERR_MSG_ATTR(extack, attr,
                                    "Geneve ID must be lower than 16777216");
                return -ERANGE;
        }

        /* The attribute is a u8, so only the upper bound can be violated. */
        attr = data[IFLA_GENEVE_DF];
        if (attr && nla_get_u8(attr) > GENEVE_DF_MAX) {
                NL_SET_ERR_MSG_ATTR(extack, attr,
                                    "Invalid DF attribute");
                return -EINVAL;
        }

        attr = data[IFLA_GENEVE_PORT_RANGE];
        if (attr) {
                const struct ifla_geneve_port_range *range = nla_data(attr);

                /* Bounds arrive in network byte order. */
                if (ntohs(range->high) < ntohs(range->low)) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Invalid source port range");
                        return -EINVAL;
                }
        }

        return 0;
}

/* Scan gn->geneve_list for devices that conflict with @info.
 *
 * *@tun_on_same_port is set when any device uses the same destination
 * port; *@tun_collect_md then holds the collect_md setting of the last
 * such device in list order (both start out false).
 *
 * Returns the last device whose tunnel id, destination port and endpoint
 * addresses all equal those in @info, or NULL when there is none.
 */
static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
                                          const struct ip_tunnel_info *info,
                                          bool *tun_on_same_port,
                                          bool *tun_collect_md)
{
        struct geneve_dev *cur, *match = NULL;

        *tun_on_same_port = false;
        *tun_collect_md = false;

        list_for_each_entry(cur, &gn->geneve_list, next) {
                const struct ip_tunnel_key *other = &cur->cfg.info.key;

                /* A different port can neither clash nor match. */
                if (info->key.tp_dst != other->tp_dst)
                        continue;

                *tun_collect_md = cur->cfg.collect_md;
                *tun_on_same_port = true;

                if (info->key.tun_id == other->tun_id &&
                    !memcmp(&info->key.u, &other->u, sizeof(info->key.u)))
                        match = cur;
        }

        return match;
}

static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
{
        return !(info->key.tun_id || info->key.tos ||
                 !ip_tunnel_flags_empty(info->key.tun_flags) ||
                 info->key.ttl || info->key.label || info->key.tp_src ||
                 memchr_inv(&info->key.u, 0, sizeof(info->key.u)));
}

/* Compare the tunnel destination addresses of @a and @b.  The address
 * family of @a decides whether the IPv4 or the IPv6 destinations are
 * compared; @b is assumed to use the same family.
 */
static bool geneve_dst_addr_equal(struct ip_tunnel_info *a,
                                  struct ip_tunnel_info *b)
{
        if (ip_tunnel_info_af(a) != AF_INET)
                return ipv6_addr_equal(&a->key.u.ipv6.dst,
                                       &b->key.u.ipv6.dst);

        return a->key.u.ipv4.dst == b->key.u.ipv4.dst;
}

static int geneve_configure(struct net *net, struct net_device *dev,
                            struct netlink_ext_ack *extack,
                            const struct geneve_config *cfg)
{
        struct geneve_net *gn = net_generic(net, geneve_net_id);
        struct geneve_dev *t, *geneve = netdev_priv(dev);
        const struct ip_tunnel_info *info = &cfg->info;
        bool tun_collect_md, tun_on_same_port;
        int err, encap_len;

        if (cfg->collect_md && !is_tnl_info_zero(info)) {
                NL_SET_ERR_MSG(extack,
                               "Device is externally controlled, so attributes (VNI, Port, and so on) must not be specified");
                return -EINVAL;
        }

        geneve->net = net;
        geneve->dev = dev;

        t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md);
        if (t)
                return -EBUSY;

        /* make enough headroom for basic scenario */
        encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
        if (!cfg->collect_md && ip_tunnel_info_af(info) == AF_INET) {
                encap_len += sizeof(struct iphdr);
                dev->max_mtu -= sizeof(struct iphdr);
        } else {
                encap_len += sizeof(struct ipv6hdr);
                dev->max_mtu -= sizeof(struct ipv6hdr);
        }
        dev->needed_headroom = encap_len + ETH_HLEN;

        if (cfg->collect_md) {
                if (tun_on_same_port) {
                        NL_SET_ERR_MSG(extack,
                                       "There can be only one externally controlled device on a destination port");
                        return -EPERM;
                }
        } else {
                if (tun_collect_md) {
                        NL_SET_ERR_MSG(extack,
                                       "There already exists an externally controlled device on this destination port");
                        return -EPERM;
                }
        }

        dst_cache_reset(&geneve->cfg.info.dst_cache);
        memcpy(&geneve->cfg, cfg, sizeof(*cfg));

        if (geneve->cfg.inner_proto_inherit) {
                dev->header_ops = NULL;
                dev->type = ARPHRD_NONE;
                dev->hard_header_len = 0;
                dev->addr_len = 0;
                dev->flags = IFF_POINTOPOINT | IFF_NOARP;
        }

        err = register_netdevice(dev);
        if (err)
                return err;

        list_add(&geneve->next, &gn->geneve_list);
        return 0;
}

/* Reset @info to all-zero bytes and set its UDP destination port.
 * @dst_port is given in host byte order and stored in network order.
 */
static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
{
        __be16 port = htons(dst_port);

        /* memset() rather than a struct assignment, so padding bytes are
         * cleared as well.
         */
        memset(info, 0, sizeof(*info));
        info->key.tp_dst = port;
}

/* Translate IFLA_GENEVE_* netlink attributes in @data into @cfg.
 *
 * @cfg must already hold the starting values (defaults for a new link, the
 * current configuration for a change); only attributes that are present
 * modify it.  @tb is not used here.
 *
 * With @changelink set, attributes that cannot be changed on an existing
 * device (VNI to a different value, port, port range, address family,
 * collect_md, checksum settings, inner_proto_inherit) are rejected.
 *
 * Returns 0 on success, -EINVAL for invalid values, -EPFNOSUPPORT for IPv6
 * attributes on a kernel without IPv6, or -EOPNOTSUPP for a disallowed
 * change; @extack is filled in on error.
 */
static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[],
                          struct netlink_ext_ack *extack,
                          struct geneve_config *cfg, bool changelink)
{
        struct ip_tunnel_info *info = &cfg->info;
        int attrtype;

        if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) {
                NL_SET_ERR_MSG(extack,
                               "Cannot specify both IPv4 and IPv6 Remote addresses");
                return -EINVAL;
        }

        if (data[IFLA_GENEVE_REMOTE]) {
                /* An existing IPv6 tunnel cannot be switched to IPv4. */
                if (changelink && (ip_tunnel_info_af(info) == AF_INET6)) {
                        attrtype = IFLA_GENEVE_REMOTE;
                        goto change_notsup;
                }

                info->key.u.ipv4.dst =
                        nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);

                if (ipv4_is_multicast(info->key.u.ipv4.dst)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE],
                                            "Remote IPv4 address cannot be Multicast");
                        return -EINVAL;
                }
        }

        if (data[IFLA_GENEVE_REMOTE6]) {
#if IS_ENABLED(CONFIG_IPV6)
                /* An existing IPv4 tunnel cannot be switched to IPv6. */
                if (changelink && (ip_tunnel_info_af(info) == AF_INET)) {
                        attrtype = IFLA_GENEVE_REMOTE6;
                        goto change_notsup;
                }

                info->mode = IP_TUNNEL_INFO_IPV6;
                info->key.u.ipv6.dst =
                        nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);

                if (ipv6_addr_type(&info->key.u.ipv6.dst) &
                    IPV6_ADDR_LINKLOCAL) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
                                            "Remote IPv6 address cannot be link-local");
                        return -EINVAL;
                }
                if (ipv6_addr_is_multicast(&info->key.u.ipv6.dst)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
                                            "Remote IPv6 address cannot be Multicast");
                        return -EINVAL;
                }
                /* IPv6 defaults: CSUM flag set and use_udp6_rx_checksums
                 * enabled.  The UDP_ZERO_CSUM6_TX/RX attributes handled
                 * further down can turn these off again.
                 */
                __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
                cfg->use_udp6_rx_checksums = true;
#else
                NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_REMOTE6],
                                    "IPv6 support not enabled in the kernel");
                return -EPFNOSUPPORT;
#endif
        }

        if (data[IFLA_GENEVE_ID]) {
                __u32 vni;
                __u8 tvni[3];
                __be64 tunid;

                /* Split the 24-bit VNI into bytes, most significant first. */
                vni = nla_get_u32(data[IFLA_GENEVE_ID]);
                tvni[0] = (vni & 0x00ff0000) >> 16;
                tvni[1] = (vni & 0x0000ff00) >> 8;
                tvni[2] =  vni & 0x000000ff;

                tunid = vni_to_tunnel_id(tvni);
                /* Re-stating the current VNI on a change is accepted. */
                if (changelink && (tunid != info->key.tun_id)) {
                        attrtype = IFLA_GENEVE_ID;
                        goto change_notsup;
                }
                info->key.tun_id = tunid;
        }

        /* When TTL_INHERIT is present, an explicit TTL attribute is ignored. */
        if (data[IFLA_GENEVE_TTL_INHERIT]) {
                if (nla_get_u8(data[IFLA_GENEVE_TTL_INHERIT]))
                        cfg->ttl_inherit = true;
                else
                        cfg->ttl_inherit = false;
        } else if (data[IFLA_GENEVE_TTL]) {
                info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
                cfg->ttl_inherit = false;
        }

        if (data[IFLA_GENEVE_TOS])
                info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);

        if (data[IFLA_GENEVE_DF])
                cfg->df = nla_get_u8(data[IFLA_GENEVE_DF]);

        if (data[IFLA_GENEVE_LABEL]) {
                info->key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
                                  IPV6_FLOWLABEL_MASK;
                /* A non-zero flow label is only valid on an IPv6 tunnel. */
                if (info->key.label && (!(info->mode & IP_TUNNEL_INFO_IPV6))) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_LABEL],
                                            "Label attribute only applies for IPv6 Geneve devices");
                        return -EINVAL;
                }
        }

        if (data[IFLA_GENEVE_PORT]) {
                if (changelink) {
                        attrtype = IFLA_GENEVE_PORT;
                        goto change_notsup;
                }
                /* Read as big-endian and stored without conversion. */
                info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]);
        }

        if (data[IFLA_GENEVE_PORT_RANGE]) {
                const struct ifla_geneve_port_range *p;

                if (changelink) {
                        attrtype = IFLA_GENEVE_PORT_RANGE;
                        goto change_notsup;
                }
                /* Bounds are converted to host byte order for storage. */
                p = nla_data(data[IFLA_GENEVE_PORT_RANGE]);
                cfg->port_min = ntohs(p->low);
                cfg->port_max = ntohs(p->high);
        }

        if (data[IFLA_GENEVE_COLLECT_METADATA]) {
                if (changelink) {
                        attrtype = IFLA_GENEVE_COLLECT_METADATA;
                        goto change_notsup;
                }
                cfg->collect_md = true;
        }

        if (data[IFLA_GENEVE_UDP_CSUM]) {
                if (changelink) {
                        attrtype = IFLA_GENEVE_UDP_CSUM;
                        goto change_notsup;
                }
                /* A zero value leaves the CSUM flag as it is. */
                if (nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
                        __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
        }

        if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]) {
#if IS_ENABLED(CONFIG_IPV6)
                if (changelink) {
                        attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_TX;
                        goto change_notsup;
                }
                /* Non-zero clears the CSUM flag, including the IPv6 default. */
                if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
                        __clear_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
#else
                NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX],
                                    "IPv6 support not enabled in the kernel");
                return -EPFNOSUPPORT;
#endif
        }

        if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]) {
#if IS_ENABLED(CONFIG_IPV6)
                if (changelink) {
                        attrtype = IFLA_GENEVE_UDP_ZERO_CSUM6_RX;
                        goto change_notsup;
                }
                /* Non-zero disables use_udp6_rx_checksums. */
                if (nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
                        cfg->use_udp6_rx_checksums = false;
#else
                NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX],
                                    "IPv6 support not enabled in the kernel");
                return -EPFNOSUPPORT;
#endif
        }

        if (data[IFLA_GENEVE_INNER_PROTO_INHERIT]) {
                if (changelink) {
                        attrtype = IFLA_GENEVE_INNER_PROTO_INHERIT;
                        goto change_notsup;
                }
                cfg->inner_proto_inherit = true;
        }

        return 0;
change_notsup:
        /* attrtype names the attribute that triggered the rejection. */
        NL_SET_ERR_MSG_ATTR(extack, data[attrtype],
                            "Changing VNI, Port, endpoint IP address family, external, inner_proto_inherit, and UDP checksum attributes are not supported");
        return -EOPNOTSUPP;
}

/* Choose the initial MTU of a geneve device.
 *
 * @dev:  geneve net_device being configured
 * @info: tunnel info holding the remote endpoint and options length
 * @tb:   generic link attributes; only IFLA_MTU is consulted
 *
 * When IFLA_MTU is present it is applied as-is and nothing else happens.
 * Otherwise a route to the remote endpoint is looked up and the MTU is set
 * to the egress device's MTU minus GENEVE_IPV4_HLEN / GENEVE_IPV6_HLEN and
 * info->options_len.  If no usable route/device is found (computed value
 * <= 0) the MTU is left untouched.  The result of geneve_change_mtu() is
 * deliberately not propagated; this function is best-effort.
 */
static void geneve_link_config(struct net_device *dev,
                               struct ip_tunnel_info *info, struct nlattr *tb[])
{
        struct geneve_dev *geneve = netdev_priv(dev);
        int ldev_mtu = 0;

        if (tb[IFLA_MTU]) {
                geneve_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
                return;
        }

        switch (ip_tunnel_info_af(info)) {
        case AF_INET: {
                struct flowi4 fl4 = { .daddr = info->key.u.ipv4.dst };
                struct rtable *rt = ip_route_output_key(geneve->net, &fl4);

                if (IS_ERR(rt))
                        break;

                if (rt->dst.dev)
                        ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV4_HLEN;
                /* Drop the route reference on every successful lookup, not
                 * only when the route has a device; this matches the
                 * unconditional put in the IPv6 branch.
                 */
                ip_rt_put(rt);
                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6: {
                struct rt6_info *rt;

                if (!__in6_dev_get(dev))
                        break;

                rt = rt6_lookup(geneve->net, &info->key.u.ipv6.dst, NULL, 0,
                                NULL, 0);

                if (rt && rt->dst.dev)
                        ldev_mtu = rt->dst.dev->mtu - GENEVE_IPV6_HLEN;
                ip6_rt_put(rt);
                break;
        }
#endif
        }

        if (ldev_mtu <= 0)
                return;

        geneve_change_mtu(dev, ldev_mtu - info->options_len);
}

static int geneve_newlink(struct net_device *dev,
                          struct rtnl_newlink_params *params,
                          struct netlink_ext_ack *extack)
{
        struct net *link_net = rtnl_newlink_link_net(params);
        struct nlattr **data = params->data;
        struct nlattr **tb = params->tb;
        struct geneve_config cfg = {
                .df = GENEVE_DF_UNSET,
                .use_udp6_rx_checksums = false,
                .ttl_inherit = false,
                .collect_md = false,
                .port_min = 1,
                .port_max = USHRT_MAX,
        };
        int err;

        init_tnl_info(&cfg.info, GENEVE_UDP_PORT);
        err = geneve_nl2info(tb, data, extack, &cfg, false);
        if (err)
                return err;

        err = geneve_configure(link_net, dev, extack, &cfg);
        if (err)
                return err;

        geneve_link_config(dev, &cfg.info, tb);

        return 0;
}

/* Quiesces the geneve device data path for both TX and RX.
 *
 * On transmit, geneve checks for a non-NULL geneve_sock before it proceeds.
 * So, if we set that socket to NULL under RCU and wait for synchronize_net()
 * to complete for the existing set of in-flight packets to be transmitted,
 * then we would have quiesced the transmit data path. All the future packets
 * will get dropped until we unquiesce the data path.
 *
 * On receive, geneve dereferences the geneve_sock stashed in the socket. So,
 * if we set that to NULL under RCU and wait for synchronize_net() to
 * complete, then we would have quiesced the receive data path.
 *
 * The detached sockets are handed back through @gs4 and @gs6 so the caller
 * can pass them to geneve_unquiesce() afterwards.  Either may be NULL, and
 * @gs6 is always NULL when CONFIG_IPV6 is disabled.  The pointers are read
 * with rtnl_dereference(), so the caller is expected to hold RTNL.
 */
static void geneve_quiesce(struct geneve_dev *geneve, struct geneve_sock **gs4,
                           struct geneve_sock **gs6)
{
        /* Detach the IPv4 socket from the device (TX) and from the sk (RX) */
        *gs4 = rtnl_dereference(geneve->sock4);
        rcu_assign_pointer(geneve->sock4, NULL);
        if (*gs4)
                rcu_assign_sk_user_data((*gs4)->sock->sk, NULL);
#if IS_ENABLED(CONFIG_IPV6)
        /* Same for the IPv6 socket */
        *gs6 = rtnl_dereference(geneve->sock6);
        rcu_assign_pointer(geneve->sock6, NULL);
        if (*gs6)
                rcu_assign_sk_user_data((*gs6)->sock->sk, NULL);
#else
        *gs6 = NULL;
#endif
        /* Wait until readers that may still see the old pointers are done */
        synchronize_net();
}

/* Resumes the geneve device data path for both TX and RX.
 *
 * Re-publishes the sockets previously returned by geneve_quiesce(): @gs4 and
 * (with CONFIG_IPV6) @gs6 are stored back into the device and, when non-NULL,
 * into the corresponding sk user data, followed by synchronize_net().
 */
static void geneve_unquiesce(struct geneve_dev *geneve, struct geneve_sock *gs4,
                             struct geneve_sock __maybe_unused *gs6)
{
        rcu_assign_pointer(geneve->sock4, gs4);
        if (gs4)
                rcu_assign_sk_user_data(gs4->sock->sk, gs4);
#if IS_ENABLED(CONFIG_IPV6)
        rcu_assign_pointer(geneve->sock6, gs6);
        if (gs6)
                rcu_assign_sk_user_data(gs6->sock->sk, gs6);
#endif
        synchronize_net();
}

static int geneve_changelink(struct net_device *dev, struct nlattr *tb[],
                             struct nlattr *data[],
                             struct netlink_ext_ack *extack)
{
        struct geneve_dev *geneve = netdev_priv(dev);
        struct geneve_sock *gs4, *gs6;
        struct geneve_config cfg;
        int err;

        /* If the geneve device is configured for metadata (or externally
         * controlled, for example, OVS), then nothing can be changed.
         */
        if (geneve->cfg.collect_md)
                return -EOPNOTSUPP;

        /* Start with the existing info. */
        memcpy(&cfg, &geneve->cfg, sizeof(cfg));
        err = geneve_nl2info(tb, data, extack, &cfg, true);
        if (err)
                return err;

        if (!geneve_dst_addr_equal(&geneve->cfg.info, &cfg.info)) {
                dst_cache_reset(&cfg.info.dst_cache);
                geneve_link_config(dev, &cfg.info, tb);
        }

        geneve_quiesce(geneve, &gs4, &gs6);
        memcpy(&geneve->cfg, &cfg, sizeof(cfg));
        geneve_unquiesce(geneve, gs4, gs6);

        return 0;
}

/* rtnl dellink handler: unlink the device from the list it is chained on
 * through its ->next member and queue it on @head for unregistration.
 */
static void geneve_dellink(struct net_device *dev, struct list_head *head)
{
        struct geneve_dev *priv = netdev_priv(dev);

        list_del(&priv->next);

        unregister_netdevice_queue(dev, head);
}

/* Upper bound, in bytes, of the netlink attribute space that
 * geneve_fill_info() may need for one device.  @dev is not consulted.
 */
static size_t geneve_get_size(const struct net_device *dev)
{
        size_t total = 0;

        total += nla_total_size(sizeof(__u32));         /* IFLA_GENEVE_ID */
        total += nla_total_size(sizeof(struct in6_addr)); /* IFLA_GENEVE_REMOTE{6} */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_TTL */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_TOS */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_DF */
        total += nla_total_size(sizeof(__be32));        /* IFLA_GENEVE_LABEL */
        total += nla_total_size(sizeof(__be16));        /* IFLA_GENEVE_PORT */
        total += nla_total_size(0);                     /* IFLA_GENEVE_COLLECT_METADATA */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_UDP_CSUM */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
        total += nla_total_size(sizeof(__u8));          /* IFLA_GENEVE_TTL_INHERIT */
        total += nla_total_size(0);                     /* IFLA_GENEVE_INNER_PROTO_INHERIT */
        total += nla_total_size(sizeof(struct ifla_geneve_port_range)); /* IFLA_GENEVE_PORT_RANGE */

        return total;
}

static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct geneve_dev *geneve = netdev_priv(dev);
        struct ip_tunnel_info *info = &geneve->cfg.info;
        bool ttl_inherit = geneve->cfg.ttl_inherit;
        bool metadata = geneve->cfg.collect_md;
        struct ifla_geneve_port_range ports = {
                .low        = htons(geneve->cfg.port_min),
                .high        = htons(geneve->cfg.port_max),
        };
        __u8 tmp_vni[3];
        __u32 vni;

        tunnel_id_to_vni(info->key.tun_id, tmp_vni);
        vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2];
        if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
                goto nla_put_failure;

        if (!metadata && ip_tunnel_info_af(info) == AF_INET) {
                if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
                                    info->key.u.ipv4.dst))
                        goto nla_put_failure;
                if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
                               test_bit(IP_TUNNEL_CSUM_BIT,
                                        info->key.tun_flags)))
                        goto nla_put_failure;

#if IS_ENABLED(CONFIG_IPV6)
        } else if (!metadata) {
                if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6,
                                     &info->key.u.ipv6.dst))
                        goto nla_put_failure;
                if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
                               !test_bit(IP_TUNNEL_CSUM_BIT,
                                         info->key.tun_flags)))
                        goto nla_put_failure;
#endif
        }

        if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
            nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
            nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
                goto nla_put_failure;

        if (nla_put_u8(skb, IFLA_GENEVE_DF, geneve->cfg.df))
                goto nla_put_failure;

        if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
                goto nla_put_failure;

        if (metadata && nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
                goto nla_put_failure;

#if IS_ENABLED(CONFIG_IPV6)
        if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
                       !geneve->cfg.use_udp6_rx_checksums))
                goto nla_put_failure;
#endif

        if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit))
                goto nla_put_failure;

        if (geneve->cfg.inner_proto_inherit &&
            nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT))
                goto nla_put_failure;

        if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports))
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/* rtnetlink operations for links of kind "geneve".  Hooks up the attribute
 * policy and the create / change / delete / dump handlers defined in this
 * file; registered from geneve_init_module().
 */
static struct rtnl_link_ops geneve_link_ops __read_mostly = {
        .kind                = "geneve",
        .maxtype        = IFLA_GENEVE_MAX,
        .policy                = geneve_policy,
        /* private area of each net_device holds a struct geneve_dev */
        .priv_size        = sizeof(struct geneve_dev),
        .setup                = geneve_setup,
        .validate        = geneve_validate,
        .newlink        = geneve_newlink,
        .changelink        = geneve_changelink,
        .dellink        = geneve_dellink,
        /* get_size must cover everything fill_info can emit */
        .get_size        = geneve_get_size,
        .fill_info        = geneve_fill_info,
};

/* Create and configure a geneve device in collect-metadata mode.
 *
 * @net:              network namespace for the new device
 * @name:             device name passed to rtnl_create_link()
 * @name_assign_type: name assignment type passed to rtnl_create_link()
 * @dst_port:         UDP destination port used to initialise the tunnel info
 *
 * The device gets the largest MTU (IP_MAX_MTU) before the link is
 * configured.  Returns the new net_device, or an ERR_PTR() on failure.
 */
struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
                                        u8 name_assign_type, u16 dst_port)
{
        struct geneve_config cfg = {
                .port_min = 1,
                .port_max = USHRT_MAX,
                .collect_md = true,
                .ttl_inherit = false,
                .use_udp6_rx_checksums = true,
                .df = GENEVE_DF_UNSET,
        };
        struct nlattr *tb[IFLA_MAX + 1];
        struct net_device *dev;
        LIST_HEAD(kill_list);
        int err;

        /* No link attributes are supplied for this in-kernel creation path */
        memset(tb, 0, sizeof(tb));
        dev = rtnl_create_link(net, name, name_assign_type,
                               &geneve_link_ops, tb, NULL);
        if (IS_ERR(dev))
                return dev;

        init_tnl_info(&cfg.info, dst_port);
        err = geneve_configure(net, dev, NULL, &cfg);
        if (err) {
                /* Configuration failed: release the device directly */
                free_netdev(dev);
                return ERR_PTR(err);
        }

        /* openvswitch users expect packet sizes to be unrestricted,
         * so set the largest MTU we can.
         */
        err = geneve_change_mtu(dev, IP_MAX_MTU);
        if (err)
                goto err_unregister;

        err = rtnl_configure_link(dev, NULL, 0, NULL);
        if (err >= 0)
                return dev;

err_unregister:
        geneve_dellink(dev, &kill_list);
        unregister_netdevice_many(&kill_list);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(geneve_dev_create_fb);

/* Netdevice notifier callback: push or drop geneve RX port offload info for
 * the device the event refers to.  All other events are ignored.  Always
 * returns NOTIFY_DONE.
 */
static int geneve_netdevice_event(struct notifier_block *unused,
                                  unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_UDP_TUNNEL_PUSH_INFO:
                geneve_offload_rx_ports(dev, true);
                break;
        case NETDEV_UDP_TUNNEL_DROP_INFO:
                geneve_offload_rx_ports(dev, false);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block routing netdevice events to geneve_netdevice_event();
 * registered and unregistered by the module init/exit functions.
 */
static struct notifier_block geneve_notifier_block __read_mostly = {
        .notifier_call = geneve_netdevice_event,
};

/* Per-netns init hook: start the namespace with empty device and socket
 * lists.  Always succeeds.
 */
static __net_init int geneve_init_net(struct net *net)
{
        struct geneve_net *pernet = net_generic(net, geneve_net_id);

        INIT_LIST_HEAD(&pernet->sock_list);
        INIT_LIST_HEAD(&pernet->geneve_list);

        return 0;
}

static void geneve_destroy_tunnels(struct net *net, struct list_head *head)
{
        struct geneve_net *gn = net_generic(net, geneve_net_id);
        struct geneve_dev *geneve, *next;

        list_for_each_entry_safe(geneve, next, &gn->geneve_list, next)
                geneve_dellink(geneve->dev, head);
}

/* Batched per-netns exit hook: for each namespace on @net_list, queue all of
 * its geneve devices on @dev_to_kill.
 */
static void __net_exit geneve_exit_batch_rtnl(struct list_head *net_list,
                                              struct list_head *dev_to_kill)
{
        struct net *ns;

        list_for_each_entry(ns, net_list, exit_list) {
                geneve_destroy_tunnels(ns, dev_to_kill);
        }
}

/* Per-netns exit hook: performs no teardown itself; it only emits a one-time
 * warning if the namespace's geneve socket list is still non-empty.
 */
static void __net_exit geneve_exit_net(struct net *net)
{
        const struct geneve_net *gn = net_generic(net, geneve_net_id);

        WARN_ON_ONCE(!list_empty(&gn->sock_list));
}

/* Per-network-namespace hooks of the geneve driver.  .id and .size describe
 * the struct geneve_net storage that the hooks above retrieve with
 * net_generic(net, geneve_net_id).
 */
static struct pernet_operations geneve_net_ops = {
        .init = geneve_init_net,
        .exit_batch_rtnl = geneve_exit_batch_rtnl,
        .exit = geneve_exit_net,
        .id   = &geneve_net_id,
        .size = sizeof(struct geneve_net),
};

/* Module init: register the pernet operations, the netdevice notifier and
 * the rtnl link ops, in that order.  On failure, whatever was already
 * registered is unregistered again in reverse order and the error code is
 * returned.
 */
static int __init geneve_init_module(void)
{
        int err;

        err = register_pernet_subsys(&geneve_net_ops);
        if (err)
                return err;

        err = register_netdevice_notifier(&geneve_notifier_block);
        if (err)
                goto err_pernet;

        err = rtnl_link_register(&geneve_link_ops);
        if (err)
                goto err_notifier;

        return 0;

err_notifier:
        unregister_netdevice_notifier(&geneve_notifier_block);
err_pernet:
        unregister_pernet_subsys(&geneve_net_ops);
        return err;
}
late_initcall(geneve_init_module);

/* Module exit: undo the registrations made by geneve_init_module(), in the
 * reverse of the order in which they were made.
 */
static void __exit geneve_cleanup_module(void)
{
        rtnl_link_unregister(&geneve_link_ops);
        unregister_netdevice_notifier(&geneve_notifier_block);
        unregister_pernet_subsys(&geneve_net_ops);
}
module_exit(geneve_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_VERSION(GENEVE_NETDEV_VER);
MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("geneve");















































































   24 
   24 




    9 


    1 


    8 


    1 


    1 




    1 







    1 







    1 


    2 










    1 












  249 








  248 





































  246 








    2 






    4 




  244 





   96 




   96 

   95 
    3 












   24 





   24 
























   79 






















   76 


   77 


   47 

    2 




































    2 





    2 

    2 


    1 


    1 









    2 


    1 


    2 


    1 


    2 


    2 


    2 


    2 


    1 


    1 


    2 



    3 


    1 
    1 





    2 





    2 






   23 




  248 





  248 



   72 
   72 









   72 













   72 
























   72 






   69 



   12 

   12 









    2 




    1 





  166 


































  166 













  166 






  164 
























  160 
  161 














  166 


  161 
    4 

  165 




  166 




  166 

  165 







  166 




  126 
  127 








  127 



  127 



  126 






   17 











    2 






    1 










    1 







    3 






    1 


    1 


    1 


















    2 

    1 
    2 















  166 







  157 



   20 
   20 

























   20 
   20 
   10 















  168 



    2 


    1 






    4 




  109 





   60 







  166 



















  148 

   18 



  162 










  163 






   17 




    5 



    5 





    5 



    5 






    7 


    7 



    6 





















    2 











    2 

    2 


    1 


    1 

    1 














































  155 



   26 


   24 






   25 

   25 


   25 







   24 


   24 








   25 







  142 


   15 


   15 



























   59 
  104 















    3 
  157 































  165 



  163 

    8 





    2 









  151 








  163 








  150 
   13 

  162 



  163 









  163 


   17 
   12 
    8 
   21 


   29 








  157 
  155 














  148 
    9 













   59 
  100 

  156 














  157 


  122 


  156 



  157 




  157 









  157 














   46 
   80 













  127 

    8 


















    4 

    3 




    7 







    5 







   21 










   21 

    1 

    1 



    7 





    7 

    2 



    4 










    1 


    2 









   76 


   76 


   76 


   76 
   76 
   76 



   76 














    2 



   76 










   67 



    9 



    9 























   58 



   68 














   69 





   68 






   68 
   68 











   77 



   77 




   69 

    3 


    4 














   60 
   19 



   77 












   71 



    1 


   72 







   16 

   56 













   37 













   10 













    6 





















   10 




    2 


    8 




  355 





  251 




    2 


   77 







    3 



    2 







  248 



  164 

   85 








    1 



    1 



    2 



    1 


    2 

    5 




    2 

   37 




    1 

   10 




    1 

    6 








    3 







    1 


   20 




    1 


    5 












    5 






    1 



    3 







    1 

    1 







    1 

   16 






   87 




    1 




    1 







    1 

    4 






   46 







    1 

    2 




    1 

    4 


    2 


    2 


    2 


   17 




    1 

    3 











  116 
   67 





  111 
  111 

  111 




  118 



  118 








  116 

    9 










































































































































































































































































































































































































































































































































































































































































































































































































   23 






   23 
   23 

   21 
    3 

    5 
    5 



   23 

    5 


    8 
   20 






    7 



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/bug.h>
#include <linux/cpu_pm.h>
#include <linux/entry-kvm.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/psci.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace_arm.h"

#include <linux/uaccess.h>
#include <asm/ptrace.h>
#include <asm/mman.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/virt.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_ptrauth.h>
#include <asm/sections.h>

#include <kvm/arm_hypercalls.h>
#include <kvm/arm_pmu.h>
#include <kvm/arm_psci.h>

#include "sys_regs.h"

static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT;

/*
 * Policy deciding whether a guest WFI/WFE instruction traps. One value is
 * kept per instruction (kvm_wfi_trap_policy and kvm_wfe_trap_policy) and is
 * consulted by kvm_vcpu_should_clear_twi()/kvm_vcpu_should_clear_twe().
 */
enum kvm_wfx_trap_policy {
        /* Decide at vcpu load time, based on single_task_running(). */
        KVM_WFX_NOTRAP_SINGLE_TASK, /* Default option */
        /* Always clear the trap bit. */
        KVM_WFX_NOTRAP,
        /* Always set the trap bit. */
        KVM_WFX_TRAP,
};

static enum kvm_wfx_trap_policy kvm_wfi_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;
static enum kvm_wfx_trap_policy kvm_wfe_trap_policy __read_mostly = KVM_WFX_NOTRAP_SINGLE_TASK;

DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector);

DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_base);
DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);

DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt);

static bool vgic_present, kvm_arm_initialised;

static DEFINE_PER_CPU(unsigned char, kvm_hyp_initialized);

/* Report the current value of the file-local kvm_arm_initialised flag. */
bool is_kvm_arm_initialised(void)
{
        return kvm_arm_initialised;
}

/*
 * Returns non-zero when kvm_vcpu_exiting_guest_mode() yields IN_GUEST_MODE
 * for @vcpu, and zero otherwise.
 * NOTE(review): presumably used by the generic kick path to decide whether
 * the target CPU needs to be interrupted - confirm against the caller.
 */
int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
        return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}

/*
 * Enable a VM-scoped capability on behalf of userspace.
 *
 * A request with any flag bit set is refused, as is any capability that
 * kvm_pvm_ext_allowed() rejects for a protected VM. Capabilities that are
 * unknown, or whose precondition does not hold (MTE unsupported, vCPUs
 * already created, memslots already populated, unsupported block size),
 * are refused as well.
 *
 * Return: 0 on success, -EINVAL otherwise.
 */
int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                            struct kvm_enable_cap *cap)
{
        int ret = -EINVAL;

        if (cap->flags)
                return -EINVAL;

        if (kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(cap->cap))
                return -EINVAL;

        switch (cap->cap) {
        /* These two need no lock and cannot fail. */
        case KVM_CAP_ARM_NISV_TO_USER:
                set_bit(KVM_ARCH_FLAG_RETURN_NISV_IO_ABORT_TO_USER,
                        &kvm->arch.flags);
                return 0;
        case KVM_CAP_ARM_SYSTEM_SUSPEND:
                set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
                return 0;

        /* These two are only accepted while no vCPU has been created. */
        case KVM_CAP_ARM_MTE:
                mutex_lock(&kvm->lock);
                if (system_supports_mte() && !kvm->created_vcpus) {
                        set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
                        ret = 0;
                }
                mutex_unlock(&kvm->lock);
                break;
        case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
                mutex_lock(&kvm->lock);
                if (!kvm->created_vcpus) {
                        set_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags);
                        ret = 0;
                }
                mutex_unlock(&kvm->lock);
                break;

        case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: {
                u64 chunk = cap->args[0];

                /*
                 * For simplicity the chunk size may only change while
                 * every memory slot is still empty. Zero is accepted
                 * without consulting kvm_is_block_size_supported().
                 */
                mutex_lock(&kvm->slots_lock);
                if (kvm_are_all_memslots_empty(kvm) &&
                    (!chunk || kvm_is_block_size_supported(chunk))) {
                        kvm->arch.mmu.split_page_chunk_size = chunk;
                        ret = 0;
                }
                mutex_unlock(&kvm->slots_lock);
                break;
        }
        default:
                break;
        }

        return ret;
}

/*
 * Default vCPU limit for a VM: the value reported by
 * kvm_vgic_get_max_vcpus() when a vgic is present, KVM_MAX_VCPUS otherwise.
 */
static int kvm_arm_default_max_vcpus(void)
{
        if (!vgic_present)
                return KVM_MAX_VCPUS;

        return kvm_vgic_get_max_vcpus();
}

/**
 * kvm_arch_init_vm - initializes a VM data structure
 * @kvm:        pointer to the KVM struct
 * @type:       VM type value, passed through to kvm_init_stage2_mmu()
 *
 * Return: 0 on success; otherwise the error reported by kvm_share_hyp(),
 * pkvm_init_host_vm() or kvm_init_stage2_mmu(), or -ENOMEM when the
 * supported_cpus mask cannot be allocated. On failure everything set up
 * so far is undone before returning.
 */
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
        int ret;

        mutex_init(&kvm->arch.config_lock);

#ifdef CONFIG_LOCKDEP
        /* Clue in lockdep that the config_lock must be taken inside kvm->lock */
        mutex_lock(&kvm->lock);
        mutex_lock(&kvm->arch.config_lock);
        mutex_unlock(&kvm->arch.config_lock);
        mutex_unlock(&kvm->lock);
#endif

        kvm_init_nested(kvm);

        /* Pass the [kvm, kvm + 1) range to kvm_share_hyp(); undone on error. */
        ret = kvm_share_hyp(kvm, kvm + 1);
        if (ret)
                return ret;

        ret = pkvm_init_host_vm(kvm);
        if (ret)
                goto err_unshare_kvm;

        if (!zalloc_cpumask_var(&kvm->arch.supported_cpus, GFP_KERNEL_ACCOUNT)) {
                ret = -ENOMEM;
                goto err_unshare_kvm;
        }
        /* Start out with every possible CPU marked as supported. */
        cpumask_copy(kvm->arch.supported_cpus, cpu_possible_mask);

        ret = kvm_init_stage2_mmu(kvm, &kvm->arch.mmu, type);
        if (ret)
                goto err_free_cpumask;

        kvm_vgic_early_init(kvm);

        kvm_timer_init_vm(kvm);

        /* The maximum number of VCPUs is limited by the host's GIC model */
        kvm->max_vcpus = kvm_arm_default_max_vcpus();

        kvm_arm_init_hypercalls(kvm);

        /* No vCPU feature is selected yet. */
        bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES);

        return 0;

        /* Unwind in reverse order of setup. */
err_free_cpumask:
        free_cpumask_var(kvm->arch.supported_cpus);
err_unshare_kvm:
        kvm_unshare_hyp(kvm, kvm + 1);
        return ret;
}

/*
 * Arch hook receiving a vCPU and a vm_fault descriptor. It unconditionally
 * returns VM_FAULT_SIGBUS and looks at neither argument.
 */
vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;
}

/*
 * Create the arch-specific debugfs entries for @kvm by delegating to the
 * system-register and stage-2 page-table-dump helpers, in that order.
 */
void kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
        kvm_sys_regs_create_debugfs(kvm);
        kvm_s2_ptdump_create_debugfs(kvm);
}

/*
 * Drop the VM's mpidr_data table, if one is installed.
 *
 * Runs under config_lock. The pointer is unpublished first, then
 * synchronize_rcu() is called before the memory is freed.
 */
static void kvm_destroy_mpidr_data(struct kvm *kvm)
{
        struct kvm_mpidr_data *old;

        mutex_lock(&kvm->arch.config_lock);

        old = rcu_dereference_protected(kvm->arch.mpidr_data,
                                        lockdep_is_held(&kvm->arch.config_lock));
        if (!old)
                goto unlock;

        rcu_assign_pointer(kvm->arch.mpidr_data, NULL);
        synchronize_rcu();
        kfree(old);

unlock:
        mutex_unlock(&kvm->arch.config_lock);
}

/**
 * kvm_arch_destroy_vm - destroy the VM data structure
 * @kvm:        pointer to the KVM struct
 *
 * Releases the per-VM arch resources: the PMU filter bitmap, the
 * supported_cpus mask, the vgic, the hyp-side VM (protected KVM only),
 * the mpidr_data table, the sysreg masks and the vCPUs. It then passes the
 * [kvm, kvm + 1) range to kvm_unshare_hyp() and tears down the hypercall
 * state.
 */
void kvm_arch_destroy_vm(struct kvm *kvm)
{
        bitmap_free(kvm->arch.pmu_filter);
        free_cpumask_var(kvm->arch.supported_cpus);

        kvm_vgic_destroy(kvm);

        /* The hyp-side VM is only destroyed when protected KVM is enabled. */
        if (is_protected_kvm_enabled())
                pkvm_destroy_hyp_vm(kvm);

        kvm_destroy_mpidr_data(kvm);

        kfree(kvm->arch.sysreg_masks);
        kvm_destroy_vcpus(kvm);

        /* Counterpart of the kvm_share_hyp() call in kvm_arch_init_vm(). */
        kvm_unshare_hyp(kvm, kvm + 1);

        kvm_arm_teardown_hypercalls(kvm);
}

/*
 * Decide whether pointer authentication can be exposed in full.
 *
 * On top of system_has_full_ptr_auth(), the sanitised ID registers must
 * show that:
 *
 * - for each algorithm (Q5, IMPDEF or Q3), Address and Generic auth are
 *   either both implemented or both absent, and
 * - exactly one algorithm is implemented.
 */
static bool kvm_has_full_ptr_auth(void)
{
        bool has_apa, has_gpa, has_api, has_gpi, has_apa3, has_gpa3;
        u64 isar1, isar2;

        if (!system_has_full_ptr_auth())
                return false;

        isar1 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
        isar2 = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);

        /* Address auth: any non-zero field value counts as implemented. */
        has_apa = !!FIELD_GET(ID_AA64ISAR1_EL1_APA_MASK, isar1);
        has_api = !!FIELD_GET(ID_AA64ISAR1_EL1_API_MASK, isar1);
        has_apa3 = !!FIELD_GET(ID_AA64ISAR2_EL1_APA3_MASK, isar2);

        /* Generic auth: the field must read exactly the IMP value. */
        has_gpa = FIELD_GET(ID_AA64ISAR1_EL1_GPA_MASK, isar1) ==
                  ID_AA64ISAR1_EL1_GPA_IMP;
        has_gpi = FIELD_GET(ID_AA64ISAR1_EL1_GPI_MASK, isar1) ==
                  ID_AA64ISAR1_EL1_GPI_IMP;
        has_gpa3 = FIELD_GET(ID_AA64ISAR2_EL1_GPA3_MASK, isar2) ==
                   ID_AA64ISAR2_EL1_GPA3_IMP;

        if (has_apa != has_gpa || has_api != has_gpi || has_apa3 != has_gpa3)
                return false;

        return (has_apa + has_api + has_apa3) == 1;
}

/*
 * Report the level of support for capability @ext.
 *
 * @kvm may be NULL, in which case VM-independent defaults are reported.
 * For a protected VM, anything kvm_pvm_ext_allowed() rejects reads as 0.
 *
 * Return: 0 for an unsupported or unknown capability, otherwise a
 * capability-specific value (1 for plain booleans). KVM_CAP_MSI_DEVID
 * yields -EINVAL when queried without a VM.
 */
int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
        if (kvm && kvm_vm_is_protected(kvm) && !kvm_pvm_ext_allowed(ext))
                return 0;

        switch (ext) {
        case KVM_CAP_IRQCHIP:
                return vgic_present;

        /* Capabilities that are always available. */
        case KVM_CAP_IOEVENTFD:
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_SYNC_MMU:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_ONE_REG:
        case KVM_CAP_ARM_PSCI:
        case KVM_CAP_ARM_PSCI_0_2:
        case KVM_CAP_READONLY_MEM:
        case KVM_CAP_MP_STATE:
        case KVM_CAP_IMMEDIATE_EXIT:
        case KVM_CAP_VCPU_EVENTS:
        case KVM_CAP_ARM_IRQ_LINE_LAYOUT_2:
        case KVM_CAP_ARM_NISV_TO_USER:
        case KVM_CAP_ARM_INJECT_EXT_DABT:
        case KVM_CAP_SET_GUEST_DEBUG:
        case KVM_CAP_VCPU_ATTRIBUTES:
        case KVM_CAP_PTP_KVM:
        case KVM_CAP_ARM_SYSTEM_SUSPEND:
        case KVM_CAP_IRQFD_RESAMPLE:
        case KVM_CAP_COUNTER_OFFSET:
        case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
        case KVM_CAP_ARM_SET_DEVICE_ADDR:
                return 1;

        case KVM_CAP_SET_GUEST_DEBUG2:
                return KVM_GUESTDBG_VALID_MASK;

        case KVM_CAP_NR_VCPUS:
                /*
                 * Unlike other architectures, arm64 does not always bound
                 * KVM_CAP_NR_VCPUS by KVM_CAP_MAX_VCPUS. The value is only
                 * advisory, so this should not matter much.
                 */
                return min_t(unsigned int, num_online_cpus(),
                             kvm_arm_default_max_vcpus());

        case KVM_CAP_MAX_VCPUS:
        case KVM_CAP_MAX_VCPU_ID:
                if (kvm)
                        return kvm->max_vcpus;
                return kvm_arm_default_max_vcpus();

        case KVM_CAP_MSI_DEVID:
                if (!kvm)
                        return -EINVAL;
                return kvm->arch.vgic.msis_require_devid;

        case KVM_CAP_ARM_USER_IRQ:
                /*
                 * 1: EL1_VTIMER, EL1_PTIMER, and PMU.
                 * (bump this number if adding more devices)
                 */
                return 1;

        /* Capabilities that mirror a host feature or limit. */
        case KVM_CAP_ARM_MTE:
                return system_supports_mte();
        case KVM_CAP_STEAL_TIME:
                return kvm_arm_pvtime_supported();
        case KVM_CAP_ARM_EL1_32BIT:
                return cpus_have_final_cap(ARM64_HAS_32BIT_EL1);
        case KVM_CAP_ARM_EL2:
                return cpus_have_final_cap(ARM64_HAS_NESTED_VIRT);
        case KVM_CAP_ARM_EL2_E2H0:
                return cpus_have_final_cap(ARM64_HAS_HCR_NV1);
        case KVM_CAP_GUEST_DEBUG_HW_BPS:
                return get_num_brps();
        case KVM_CAP_GUEST_DEBUG_HW_WPS:
                return get_num_wrps();
        case KVM_CAP_ARM_PMU_V3:
                return kvm_supports_guest_pmuv3();
        case KVM_CAP_ARM_INJECT_SERROR_ESR:
                return cpus_have_final_cap(ARM64_HAS_RAS_EXTN);
        case KVM_CAP_ARM_VM_IPA_SIZE:
                return get_kvm_ipa_limit();
        case KVM_CAP_ARM_SVE:
                return system_supports_sve();
        case KVM_CAP_ARM_PTRAUTH_ADDRESS:
        case KVM_CAP_ARM_PTRAUTH_GENERIC:
                return kvm_has_full_ptr_auth();

        case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE:
                if (kvm)
                        return kvm->arch.mmu.split_page_chunk_size;
                return KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;

        case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES:
                return kvm_supported_block_sizes();
        case KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES:
                return BIT(0);

        default:
                break;
        }

        /* Anything not listed above is unsupported. */
        return 0;
}

/*
 * Arch-specific device ioctl hook. None are handled here: every request
 * is answered with -EINVAL and the arguments are ignored.
 */
long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
{
        return -EINVAL;
}

/*
 * Allocate a zeroed struct kvm.
 *
 * With VHE the structure comes from __vmalloc(); without VHE it is
 * allocated with kzalloc(). Either allocator's failure value is returned
 * unchanged to the caller.
 */
struct kvm *kvm_arch_alloc_vm(void)
{
        if (has_vhe())
                return __vmalloc(sizeof(struct kvm),
                                 GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);

        return kzalloc(sizeof(struct kvm), GFP_KERNEL_ACCOUNT);
}

/*
 * Validate a vCPU creation request before anything is allocated.
 *
 * Return: -EBUSY if an in-kernel irqchip is in use and the vgic is already
 * initialised, -EINVAL if @id is not below kvm->max_vcpus, 0 otherwise.
 * The -EBUSY condition is checked first.
 */
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
        int ret = 0;

        if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
                ret = -EBUSY;
        else if (id >= kvm->max_vcpus)
                ret = -EINVAL;

        return ret;
}

/*
 * Architecture-specific initialisation of a newly created vCPU.
 *
 * Return: 0 on success, or the error from kvm_vgic_vcpu_init() or
 * kvm_share_hyp(). If kvm_share_hyp() fails, the vgic state initialised
 * just before it is destroyed again.
 */
int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
        int err;

        spin_lock_init(&vcpu->arch.mp_state_lock);

#ifdef CONFIG_LOCKDEP
        /* Inform lockdep that the config_lock is acquired after vcpu->mutex */
        mutex_lock(&vcpu->mutex);
        mutex_lock(&vcpu->kvm->arch.config_lock);
        mutex_unlock(&vcpu->kvm->arch.config_lock);
        mutex_unlock(&vcpu->mutex);
#endif

        /* Force users to call KVM_ARM_VCPU_INIT */
        vcpu_clear_flag(vcpu, VCPU_INITIALIZED);

        /* Ask for zeroed pages from the per-vCPU MMU page cache. */
        vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO;

        /* Set up the timer */
        kvm_timer_vcpu_init(vcpu);

        kvm_pmu_vcpu_init(vcpu);

        kvm_arm_pvtime_vcpu_init(&vcpu->arch);

        /* Start out on the VM's own stage-2 MMU. */
        vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;

        /*
         * This vCPU may have been created after mpidr_data was initialized.
         * Throw out the pre-computed mappings if that is the case which forces
         * KVM to fall back to iteratively searching the vCPUs.
         */
        kvm_destroy_mpidr_data(vcpu->kvm);

        err = kvm_vgic_vcpu_init(vcpu);
        if (err)
                return err;

        /* Pass the [vcpu, vcpu + 1) range to kvm_share_hyp(). */
        err = kvm_share_hyp(vcpu, vcpu + 1);
        if (err)
                kvm_vgic_vcpu_destroy(vcpu);

        return err;
}

/* Post-creation hook for a vCPU; intentionally does nothing here. */
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
}

/*
 * Release the arch-specific state of @vcpu.
 *
 * Frees the hyp memcache when protected KVM is enabled and the MMU page
 * cache otherwise, then tears down the timer, PMU, vgic and remaining
 * arm vCPU state, in that order.
 */
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        if (is_protected_kvm_enabled())
                free_hyp_memcache(&vcpu->arch.pkvm_memcache);
        else
                kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);

        kvm_timer_vcpu_terminate(vcpu);
        kvm_pmu_vcpu_destroy(vcpu);
        kvm_vgic_vcpu_destroy(vcpu);
        kvm_arm_vcpu_destroy(vcpu);
}

/* Hook for a vCPU that is about to block; intentionally does nothing here. */
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
{

}

/* Hook for a vCPU that stopped blocking; intentionally does nothing here. */
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
{

}

/*
 * Recompute the HCR_API/HCR_APK bits of vcpu->arch.hcr_el2.
 *
 * Nothing happens for a vCPU without ptrauth, or when protected KVM is
 * enabled. If either bit ends up set, the host keys are saved into this
 * CPU's kvm_hyp_ctxt.
 */
static void vcpu_set_pauth_traps(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *ctxt;

        if (!vcpu_has_ptrauth(vcpu) || is_protected_kvm_enabled())
                return;

        /*
         * When running an L2 guest, API/APK are inherited from L1's
         * HCR_EL2. In every other case both bits are set.
         */
        if (unlikely(vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))) {
                u64 l1_hcr;

                l1_hcr = __vcpu_sys_reg(vcpu, HCR_EL2);
                vcpu->arch.hcr_el2 &= ~(HCR_API | HCR_APK);
                vcpu->arch.hcr_el2 |= l1_hcr & (HCR_API | HCR_APK);
        } else {
                vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK);
        }

        /* The guest has no chance of using pauth: leave the host keys alone. */
        if (!(vcpu->arch.hcr_el2 & (HCR_API | HCR_APK)))
                return;

        /*
         * The entry code reloads the guest keys in this case, so the host
         * keys must be saved first.
         */
        ctxt = this_cpu_ptr_hyp_sym(kvm_hyp_ctxt);
        ptrauth_save_keys(ctxt);
}

static bool kvm_vcpu_should_clear_twi(struct kvm_vcpu *vcpu)
{
        if (unlikely(kvm_wfi_trap_policy != KVM_WFX_NOTRAP_SINGLE_TASK))
                return kvm_wfi_trap_policy == KVM_WFX_NOTRAP;

        return single_task_running() &&
               (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
                vcpu->kvm->arch.vgic.nassgireq);
}

/*
 * Tell whether HCR_TWE should be cleared for @vcpu.
 *
 * Under the default policy this follows single_task_running(). With an
 * explicit policy the bit is cleared only for KVM_WFX_NOTRAP.
 */
static bool kvm_vcpu_should_clear_twe(struct kvm_vcpu *vcpu)
{
        if (likely(kvm_wfe_trap_policy == KVM_WFX_NOTRAP_SINGLE_TASK))
                return single_task_running();

        return kvm_wfe_trap_policy == KVM_WFX_NOTRAP;
}

/*
 * Arch hook run when @vcpu is loaded onto physical CPU @cpu.
 *
 * Unless protected KVM is enabled, the VMID of the vCPU's stage-2 MMU is
 * refreshed and the CPU context is flushed if this MMU's per-CPU
 * last_vcpu_ran entry names a different vCPU. After that the timer, vgic,
 * debug, VHE (if applicable), FP and PMU state are loaded, the
 * HCR_TWE/HCR_TWI and pointer-auth bits of hcr_el2 are recomputed, and the
 * vCPU is flagged if @cpu is not in the VM's supported_cpus mask.
 */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
        struct kvm_s2_mmu *mmu;
        int *last_ran;

        /* The stage-2 MMU handling below is skipped under protected KVM. */
        if (is_protected_kvm_enabled())
                goto nommu;

        if (vcpu_has_nv(vcpu))
                kvm_vcpu_load_hw_mmu(vcpu);

        mmu = vcpu->arch.hw_mmu;
        last_ran = this_cpu_ptr(mmu->last_vcpu_ran);

        /*
         * Ensure a VMID is allocated for the MMU before programming VTTBR_EL2,
         * which happens eagerly in VHE.
         *
         * Also, the VMID allocator only preserves VMIDs that are active at the
         * time of rollover, so KVM might need to grab a new VMID for the MMU if
         * this is called from kvm_sched_in().
         */
        kvm_arm_vmid_update(&mmu->vmid);

        /*
         * We guarantee that both TLBs and I-cache are private to each
         * vcpu. If detecting that a vcpu from the same VM has
         * previously run on the same physical CPU, call into the
         * hypervisor code to nuke the relevant contexts.
         *
         * We might get preempted before the vCPU actually runs, but
         * over-invalidation doesn't affect correctness.
         */
        if (*last_ran != vcpu->vcpu_idx) {
                kvm_call_hyp(__kvm_flush_cpu_context, mmu);
                *last_ran = vcpu->vcpu_idx;
        }

nommu:
        vcpu->cpu = cpu;

        /*
         * The timer must be loaded before the vgic to correctly set up physical
         * interrupt deactivation in nested state (e.g. timer interrupt).
         */
        kvm_timer_vcpu_load(vcpu);
        kvm_vgic_load(vcpu);
        kvm_vcpu_load_debug(vcpu);
        if (has_vhe())
                kvm_vcpu_load_vhe(vcpu);
        kvm_arch_vcpu_load_fp(vcpu);
        kvm_vcpu_pmu_restore_guest(vcpu);
        if (kvm_arm_is_pvtime_enabled(&vcpu->arch))
                kvm_make_request(KVM_REQ_RECORD_STEAL, vcpu);

        /* Recompute the WFE trap bit from the current policy. */
        if (kvm_vcpu_should_clear_twe(vcpu))
                vcpu->arch.hcr_el2 &= ~HCR_TWE;
        else
                vcpu->arch.hcr_el2 |= HCR_TWE;

        /* Same for the WFI trap bit. */
        if (kvm_vcpu_should_clear_twi(vcpu))
                vcpu->arch.hcr_el2 &= ~HCR_TWI;
        else
                vcpu->arch.hcr_el2 |= HCR_TWI;

        vcpu_set_pauth_traps(vcpu);

        /* Under protected KVM, hand the final hcr_el2 value to the hypervisor. */
        if (is_protected_kvm_enabled()) {
                kvm_call_hyp_nvhe(__pkvm_vcpu_load,
                                  vcpu->kvm->arch.pkvm.handle,
                                  vcpu->vcpu_idx, vcpu->arch.hcr_el2);
                kvm_call_hyp(__vgic_v3_restore_vmcr_aprs,
                             &vcpu->arch.vgic_cpu.vgic_v3);
        }

        if (!cpumask_test_cpu(cpu, vcpu->kvm->arch.supported_cpus))
                vcpu_set_on_unsupported_cpu(vcpu);
}

/*
 * Arch hook run when @vcpu is taken off its physical CPU; counterpart of
 * kvm_arch_vcpu_load().
 *
 * Saves/puts the debug, FP, VHE (if applicable), timer, vgic and PMU state,
 * clears the active VMID and the "on unsupported CPU" marker, and records
 * that the vCPU is no longer on any CPU (vcpu->cpu = -1).
 */
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
        /* Under protected KVM, save vgic state and notify hyp first. */
        if (is_protected_kvm_enabled()) {
                kvm_call_hyp(__vgic_v3_save_vmcr_aprs,
                             &vcpu->arch.vgic_cpu.vgic_v3);
                kvm_call_hyp_nvhe(__pkvm_vcpu_put);
        }

        kvm_vcpu_put_debug(vcpu);
        kvm_arch_vcpu_put_fp(vcpu);
        if (has_vhe())
                kvm_vcpu_put_vhe(vcpu);
        kvm_timer_vcpu_put(vcpu);
        kvm_vgic_put(vcpu);
        kvm_vcpu_pmu_restore_host(vcpu);
        if (vcpu_has_nv(vcpu))
                kvm_vcpu_put_hw_mmu(vcpu);
        kvm_arm_vmid_clear_active();

        vcpu_clear_on_unsupported_cpu(vcpu);
        vcpu->cpu = -1;
}

/*
 * Mark @vcpu as KVM_MP_STATE_STOPPED, request that it sleeps and kick it.
 * Takes no lock itself; the callers visible in this file hold
 * vcpu->arch.mp_state_lock around the call.
 */
static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
{
        WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
        kvm_make_request(KVM_REQ_SLEEP, vcpu);
        kvm_vcpu_kick(vcpu);
}

/* Power off @vcpu while holding its mp_state_lock. */
void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
{
        spin_lock(&vcpu->arch.mp_state_lock);
        __kvm_arm_vcpu_power_off(vcpu);
        spin_unlock(&vcpu->arch.mp_state_lock);
}

/* True when @vcpu's MP state currently reads KVM_MP_STATE_STOPPED (lockless read). */
bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
{
        return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
}

/*
 * Mark @vcpu as KVM_MP_STATE_SUSPENDED, raise KVM_REQ_SUSPEND and kick it.
 * Takes no lock itself; the caller visible in this file holds
 * vcpu->arch.mp_state_lock around the call.
 */
static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
{
        WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_SUSPENDED);
        kvm_make_request(KVM_REQ_SUSPEND, vcpu);
        kvm_vcpu_kick(vcpu);
}

/* True when @vcpu's MP state currently reads KVM_MP_STATE_SUSPENDED (lockless read). */
static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
{
        return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
}

/*
 * Copy the current MP state of @vcpu into @mp_state.
 * The state is read without taking mp_state_lock. Always returns 0.
 */
int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
{
        *mp_state = READ_ONCE(vcpu->arch.mp_state);

        return 0;
}

/*
 * Apply the MP state requested in @mp_state to @vcpu, under mp_state_lock.
 *
 * RUNNABLE is stored directly; STOPPED and SUSPENDED go through the
 * power-off and suspend helpers, which also raise a request and kick the
 * vCPU.
 *
 * Return: 0 on success, -EINVAL for any other state.
 */
int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                    struct kvm_mp_state *mp_state)
{
        int err = 0;

        spin_lock(&vcpu->arch.mp_state_lock);

        switch (mp_state->mp_state) {
        case KVM_MP_STATE_SUSPENDED:
                kvm_arm_vcpu_suspend(vcpu);
                break;
        case KVM_MP_STATE_STOPPED:
                __kvm_arm_vcpu_power_off(vcpu);
                break;
        case KVM_MP_STATE_RUNNABLE:
                WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
                break;
        default:
                err = -EINVAL;
                break;
        }

        spin_unlock(&vcpu->arch.mp_state_lock);

        return err;
}

/**
 * kvm_arch_vcpu_runnable - determine if the vcpu can be scheduled
 * @v:                The VCPU pointer
 *
 * A vCPU is runnable when it has something to wake up for (HCR_VI or
 * HCR_VF is set in its HCR, or the vgic reports a pending interrupt) and
 * it is neither stopped nor paused.
 *
 * Return: 1 if runnable, 0 otherwise.
 */
int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
{
        bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);

        /* The vgic is only queried when no interrupt line is asserted. */
        if (!irq_lines && !kvm_vgic_vcpu_pending_irq(v))
                return 0;

        if (kvm_arm_vcpu_stopped(v))
                return 0;

        return !v->arch.pause;
}

/* Report whether @vcpu is in a privileged mode, as decided by vcpu_mode_priv(). */
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
        return vcpu_mode_priv(vcpu);
}

#ifdef CONFIG_GUEST_PERF_EVENTS
/*
 * Return the guest program counter of @vcpu, read through vcpu_pc().
 * Only built when CONFIG_GUEST_PERF_EVENTS is enabled.
 */
unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
{
        return *vcpu_pc(vcpu);
}
#endif

static void kvm_init_mpidr_data(struct kvm *kvm)
{
        struct kvm_mpidr_data *data = NULL;
        unsigned long c, mask, nr_entries;
        u64 aff_set = 0, aff_clr = ~0UL;
        struct kvm_vcpu *vcpu;

        mutex_lock(&kvm->arch.config_lock);

        if (rcu_access_pointer(kvm->arch.mpidr_data) ||
            atomic_read(&kvm->online_vcpus) == 1)
                goto out;

        kvm_for_each_vcpu(c, vcpu, kvm) {
                u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
                aff_set |= aff;
                aff_clr &= aff;
        }

        /*
         * A significant bit can be either 0 or 1, and will only appear in
         * aff_set. Use aff_clr to weed out the useless stuff.
         */
        mask = aff_set ^ aff_clr;
        nr_entries = BIT_ULL(hweight_long(mask));

        /*
         * Don't let userspace fool us. If we need more than a single page
         * to describe the compressed MPIDR array, just fall back to the
         * iterative method. Single vcpu VMs do not need this either.
         */
        if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
                data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
                               GFP_KERNEL_ACCOUNT);

        if (!data)
                goto out;

        data->mpidr_mask = mask;

        kvm_for_each_vcpu(c, vcpu, kvm) {
                u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
                u16 index = kvm_mpidr_index(data, aff);

                data->cmpidr_to_idx[index] = c;
        }

        rcu_assign_pointer(kvm->arch.mpidr_data, data);
out:
        mutex_unlock(&kvm->arch.config_lock);
}

/*
 * Handle both the initialisation that is being done when the vcpu is
 * run for the first time, as well as the updates that must be
 * performed each time we get a new thread dealing with this vcpu.
 *
 * Return: 0 on success, -ENOEXEC if the vCPU has not been initialised,
 * -EPERM if it has not been finalised, or the error reported by whichever
 * setup step failed.
 */
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        int ret;

        if (!kvm_vcpu_initialized(vcpu))
                return -ENOEXEC;

        if (!kvm_arm_vcpu_is_finalized(vcpu))
                return -EPERM;

        /* Done on every call, not only the first run. */
        ret = kvm_arch_vcpu_run_map_fp(vcpu);
        if (ret)
                return ret;

        /* Everything below is first-run-only initialisation. */
        if (likely(vcpu_has_run_once(vcpu)))
                return 0;

        kvm_init_mpidr_data(kvm);

        if (likely(irqchip_in_kernel(kvm))) {
                /*
                 * Map the VGIC hardware resources before running a vcpu the
                 * first time on this VM.
                 */
                ret = kvm_vgic_map_resources(kvm);
                if (ret)
                        return ret;
        }

        ret = kvm_finalize_sys_regs(vcpu);
        if (ret)
                return ret;

        if (vcpu_has_nv(vcpu)) {
                ret = kvm_vcpu_allocate_vncr_tlb(vcpu);
                if (ret)
                        return ret;

                ret = kvm_vgic_vcpu_nv_init(vcpu);
                if (ret)
                        return ret;
        }

        /*
         * This needs to happen after any restriction has been applied
         * to the feature set.
         */
        kvm_calculate_traps(vcpu);

        ret = kvm_timer_enable(vcpu);
        if (ret)
                return ret;

        if (kvm_vcpu_has_pmu(vcpu)) {
                ret = kvm_arm_pmu_v3_enable(vcpu);
                if (ret)
                        return ret;
        }

        /* Under protected KVM, create the hyp-side VM and vCPU. */
        if (is_protected_kvm_enabled()) {
                ret = pkvm_create_hyp_vm(kvm);
                if (ret)
                        return ret;

                ret = pkvm_create_hyp_vcpu(vcpu);
                if (ret)
                        return ret;
        }

        /* Record, under config_lock, that the VM has run at least once. */
        mutex_lock(&kvm->arch.config_lock);
        set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags);
        mutex_unlock(&kvm->arch.config_lock);

        return ret;
}

/* Report whether the VM's interrupt controller is set up, per vgic_initialized(). */
bool kvm_arch_intc_initialized(struct kvm *kvm)
{
        return vgic_initialized(kvm);
}

/*
 * Pause every vCPU of @kvm: set each vCPU's pause flag, then send
 * KVM_REQ_SLEEP to all of them.
 */
void kvm_arm_halt_guest(struct kvm *kvm)
{
        unsigned long i;
        struct kvm_vcpu *vcpu;

        kvm_for_each_vcpu(i, vcpu, kvm)
                vcpu->arch.pause = true;
        kvm_make_all_cpus_request(kvm, KVM_REQ_SLEEP);
}

/*
 * Undo kvm_arm_halt_guest(): clear each vCPU's pause flag and wake the
 * vCPU up so it can re-evaluate its sleep condition.
 */
void kvm_arm_resume_guest(struct kvm *kvm)
{
        unsigned long i;
        struct kvm_vcpu *vcpu;

        kvm_for_each_vcpu(i, vcpu, kvm) {
                vcpu->arch.pause = false;
                __kvm_vcpu_wake_up(vcpu);
        }
}

/*
 * Block @vcpu until it is neither stopped nor paused.
 *
 * The wait is interruptible, so it may end early. In that case
 * KVM_REQ_SLEEP is raised again so the vCPU goes back to sleep the next
 * time its requests are processed.
 */
static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
{
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);

        rcuwait_wait_event(wait,
                           (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
                           TASK_INTERRUPTIBLE);

        if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
                /* Awaken to handle a signal, request we sleep again later. */
                kvm_make_request(KVM_REQ_SLEEP, vcpu);
        }

        /*
         * Make sure we will observe a potential reset request if we've
         * observed a change to the power state. Pairs with the smp_wmb() in
         * kvm_psci_vcpu_on().
         */
        smp_rmb();
}

/**
 * kvm_vcpu_wfi - emulate Wait-For-Interrupt behavior
 * @vcpu:        The VCPU pointer
 *
 * Suspend execution of a vCPU until a valid wake event is detected, i.e. until
 * the vCPU is runnable.  The vCPU may or may not be scheduled out, depending
 * on when a wake event arrives, e.g. there may already be a pending wake event.
 */
void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
{
        /*
         * Sync back the state of the GIC CPU interface so that we have
         * the latest PMR and group enables. This ensures that
         * kvm_arch_vcpu_runnable has up-to-date data to decide whether
         * we have pending interrupts, e.g. when determining if the
         * vCPU should block.
         *
         * For the same reason, we want to tell GICv4 that we need
         * doorbells to be signalled, should an interrupt become pending.
         */
        preempt_disable();
        vcpu_set_flag(vcpu, IN_WFI);
        kvm_vgic_put(vcpu);
        preempt_enable();

        kvm_vcpu_halt(vcpu);
        /*
         * IN_WFIT is cleared unconditionally once the halt returns.
         * NOTE(review): presumably set by the WFIT trap handler before
         * calling in here - confirm against the caller.
         */
        vcpu_clear_flag(vcpu, IN_WFIT);

        /* Leave the WFI state and reload the vgic, again without preemption. */
        preempt_disable();
        vcpu_clear_flag(vcpu, IN_WFI);
        kvm_vgic_load(vcpu);
        preempt_enable();
}

static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
{
        if (!kvm_arm_vcpu_suspended(vcpu))
                return 1;

        kvm_vcpu_wfi(vcpu);

        /*
         * The suspend state is sticky; we do not leave it until userspace
         * explicitly marks the vCPU as runnable. Request that we suspend again
         * later.
         */
        kvm_make_request(KVM_REQ_SUSPEND, vcpu);

        /*
         * Check to make sure the vCPU is actually runnable. If so, exit to
         * userspace informing it of the wakeup condition.
         */
        if (kvm_arch_vcpu_runnable(vcpu)) {
                memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
                vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
                vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
                return 0;
        }

        /*
         * Otherwise, we were unblocked to process a different event, such as a
         * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
         * process the event.
         */
        return 1;
}

/**
 * check_vcpu_requests - check and handle pending vCPU requests
 * @vcpu:        the VCPU pointer
 *
 * Requests are serviced in a fixed order. Handling stops early for a dead
 * VM, for a suspend request, and when the dirty ring asks for an exit.
 *
 * Return: 1 if we should enter the guest
 *           0 if we should exit to userspace
 *           < 0 if we should exit to userspace, where the return value indicates
 *           an error
 */
static int check_vcpu_requests(struct kvm_vcpu *vcpu)
{
        /* Fast path: nothing to do. */
        if (!kvm_request_pending(vcpu))
                return 1;

        if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu))
                return -EIO;

        if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
                kvm_vcpu_sleep(vcpu);

        if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
                kvm_reset_vcpu(vcpu);

        /*
         * IRQ_PENDING only exists to make the vCPU notice new virtual
         * interrupts; consuming the request is all that is needed.
         */
        kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);

        if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
                kvm_update_stolen_time(vcpu);

        if (kvm_check_request(KVM_REQ_RELOAD_GICv4, vcpu)) {
                /* The distributor enable bits were changed */
                preempt_disable();
                vgic_v4_put(vcpu);
                vgic_v4_load(vcpu);
                preempt_enable();
        }

        if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
                kvm_vcpu_reload_pmu(vcpu);

        if (kvm_check_request(KVM_REQ_RESYNC_PMU_EL0, vcpu))
                kvm_vcpu_pmu_restore_guest(vcpu);

        /* A suspend request decides the outcome on its own. */
        if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
                return kvm_vcpu_suspend(vcpu);

        if (kvm_dirty_ring_check_request(vcpu))
                return 0;

        check_nested_vcpu_requests(vcpu);

        return 1;
}

/*
 * Tell whether the vCPU is currently in a 32bit mode it must not be using:
 * either it has nested virtualization, or kvm_supports_32bit_el0() reports
 * that 32bit EL0 is not supported.
 */
static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
{
        /* Expected case: the vCPU is not in a 32bit mode at all. */
        if (likely(!vcpu_mode_is_32bit(vcpu)))
                return false;

        return vcpu_has_nv(vcpu) || !kvm_supports_32bit_el0();
}

/**
 * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest
 * @vcpu:        The VCPU pointer
 * @ret:        Pointer to write optional return code
 *
 * Returns: true if the VCPU needs to return to a preemptible + interruptible
 *            context and skip guest entry.
 *
 * Two kinds of exit are told apart here. When the exit goes all the way to
 * userspace, the return code is stored through @ret before returning true.
 * When the exit only goes back to a preemptible + interruptible kernel context
 * (i.e. check for pending work and re-enter), true is returned and @ret is
 * left untouched.
 */
static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret)
{
        struct kvm_run *run = vcpu->run;

        /*
         * With a userspace irqchip, userspace has to be told about timer or
         * PMU level changes, which requires an exit (the actual level state
         * gets updated in kvm_timer_update_run and kvm_pmu_update_run).
         */
        if (unlikely(!irqchip_in_kernel(vcpu->kvm)) &&
            (kvm_timer_should_notify_user(vcpu) ||
             kvm_pmu_should_notify_user(vcpu))) {
                *ret = -EINTR;
                run->exit_reason = KVM_EXIT_INTR;
                return true;
        }

        /* Report a failed entry, tagged with the CPU we are running on. */
        if (unlikely(vcpu_on_unsupported_cpu(vcpu))) {
                run->exit_reason = KVM_EXIT_FAIL_ENTRY;
                run->fail_entry.hardware_entry_failure_reason = KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED;
                run->fail_entry.cpu = smp_processor_id();
                *ret = 0;
                return true;
        }

        /* Kernel-only exits: @ret is deliberately not written. */
        if (kvm_request_pending(vcpu))
                return true;

        return xfer_to_guest_mode_work_pending();
}

/*
 * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while
 * the vCPU is running.
 *
 * This must be noinstr as instrumentation may make use of RCU, and this is not
 * safe during the EQS.
 *
 * Returns whatever the __kvm_vcpu_run hyp call returned.
 */
static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu)
{
        int ret;

        /* The hyp call is bracketed by the guest state enter/exit helpers. */
        guest_state_enter_irqoff();
        ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu);
        guest_state_exit_irqoff();

        return ret;
}

/**
 * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code
 * @vcpu:        The VCPU pointer
 *
 * This function is called through the VCPU_RUN ioctl called from user space. It
 * will execute VM code in a loop until the time slice for the process is used
 * or some emulation is needed from user space in which case the function will
 * return with return value 0 and with the kvm_run structure filled in with the
 * required data for the requested emulation.
 *
 * Return: the value that terminated the run loop (0 or a negative error code),
 *         -EINTR if the vCPU does not want to run, or the result of
 *         kvm_handle_mmio_return() when that result is <= 0.
 */
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
        struct kvm_run *run = vcpu->run;
        int ret;

        /*
         * The previous exit was for MMIO: complete it first. A result that is
         * <= 0 is handed straight back to the caller, before vcpu_load().
         */
        if (run->exit_reason == KVM_EXIT_MMIO) {
                ret = kvm_handle_mmio_return(vcpu);
                if (ret <= 0)
                        return ret;
        }

        vcpu_load(vcpu);

        /* Skip the run loop entirely, but still go through the "out" path. */
        if (!vcpu->wants_to_run) {
                ret = -EINTR;
                goto out;
        }

        kvm_sigset_activate(vcpu);

        /* ret > 0 means "keep running"; start from a clean exit description. */
        ret = 1;
        run->exit_reason = KVM_EXIT_UNKNOWN;
        run->flags = 0;
        while (ret > 0) {
                /*
                 * Check conditions before entering the guest
                 */
                ret = xfer_to_guest_mode_handle_work(vcpu);
                /* A zero result is turned into 1 so that the loop carries on. */
                if (!ret)
                        ret = 1;

                if (ret > 0)
                        ret = check_vcpu_requests(vcpu);

                /*
                 * Preparing the interrupts to be injected also
                 * involves poking the GIC, which must be done in a
                 * non-preemptible context.
                 */
                preempt_disable();

                if (kvm_vcpu_has_pmu(vcpu))
                        kvm_pmu_flush_hwstate(vcpu);

                local_irq_disable();

                kvm_vgic_flush_hwstate(vcpu);

                kvm_pmu_update_vcpu_events(vcpu);

                /*
                 * Ensure we set mode to IN_GUEST_MODE after we disable
                 * interrupts and before the final VCPU requests check.
                 * See the comment in kvm_vcpu_exiting_guest_mode() and
                 * Documentation/virt/kvm/vcpu-requests.rst
                 */
                smp_store_mb(vcpu->mode, IN_GUEST_MODE);

                /*
                 * Guest entry is abandoned: undo the flush work done above,
                 * re-enable interrupts and preemption, and let the loop
                 * condition decide whether we go round again or return.
                 */
                if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) {
                        vcpu->mode = OUTSIDE_GUEST_MODE;
                        isb(); /* Ensure work in x_flush_hwstate is committed */
                        if (kvm_vcpu_has_pmu(vcpu))
                                kvm_pmu_sync_hwstate(vcpu);
                        if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
                                kvm_timer_sync_user(vcpu);
                        kvm_vgic_sync_hwstate(vcpu);
                        local_irq_enable();
                        preempt_enable();
                        continue;
                }

                kvm_arch_vcpu_ctxflush_fp(vcpu);

                /**************************************************************
                 * Enter the guest
                 */
                trace_kvm_entry(*vcpu_pc(vcpu));
                guest_timing_enter_irqoff();

                ret = kvm_arm_vcpu_enter_exit(vcpu);

                vcpu->mode = OUTSIDE_GUEST_MODE;
                vcpu->stat.exits++;
                /*
                 * Back from guest
                 *************************************************************/

                /*
                 * We must sync the PMU state before the vgic state so
                 * that the vgic can properly sample the updated state of the
                 * interrupt line.
                 */
                if (kvm_vcpu_has_pmu(vcpu))
                        kvm_pmu_sync_hwstate(vcpu);

                /*
                 * Sync the vgic state before syncing the timer state because
                 * the timer code needs to know if the virtual timer
                 * interrupts are active.
                 */
                kvm_vgic_sync_hwstate(vcpu);

                /*
                 * Sync the timer hardware state before enabling interrupts as
                 * we don't want vtimer interrupts to race with syncing the
                 * timer virtual interrupt state.
                 */
                if (unlikely(!irqchip_in_kernel(vcpu->kvm)))
                        kvm_timer_sync_user(vcpu);

                if (is_hyp_ctxt(vcpu))
                        kvm_timer_sync_nested(vcpu);

                kvm_arch_vcpu_ctxsync_fp(vcpu);

                /*
                 * We must ensure that any pending interrupts are taken before
                 * we exit guest timing so that timer ticks are accounted as
                 * guest time. Transiently unmask interrupts so that any
                 * pending interrupts are taken.
                 *
                 * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other
                 * context synchronization event) is necessary to ensure that
                 * pending interrupts are taken.
                 */
                if (ARM_EXCEPTION_CODE(ret) == ARM_EXCEPTION_IRQ) {
                        local_irq_enable();
                        isb();
                        local_irq_disable();
                }

                guest_timing_exit_irqoff();

                local_irq_enable();

                trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));

                /* Exit types that need handling before we can be preempted */
                handle_exit_early(vcpu, ret);

                preempt_enable();

                /*
                 * The ARMv8 architecture doesn't give the hypervisor
                 * a mechanism to prevent a guest from dropping to AArch32 EL0
                 * if implemented by the CPU. If we spot the guest in such
                 * state and that we decided it wasn't supposed to do so (like
                 * with the asymmetric AArch32 case), return to userspace with
                 * a fatal error.
                 */
                if (vcpu_mode_is_bad_32bit(vcpu)) {
                        /*
                         * As we have caught the guest red-handed, decide that
                         * it isn't fit for purpose anymore by making the vcpu
                         * invalid. The VMM can try and fix it by issuing a
                         * KVM_ARM_VCPU_INIT if it really wants to.
                         */
                        vcpu_clear_flag(vcpu, VCPU_INITIALIZED);
                        ret = ARM_EXCEPTION_IL;
                }

                /* The handler's result becomes the loop condition. */
                ret = handle_exit(vcpu, ret);
        }

        /* Tell userspace about in-kernel device output levels */
        if (unlikely(!irqchip_in_kernel(vcpu->kvm))) {
                kvm_timer_update_run(vcpu);
                kvm_pmu_update_run(vcpu);
        }

        kvm_sigset_deactivate(vcpu);

out:
        /*
         * In the unlikely event that we are returning to userspace
         * with pending exceptions or PC adjustment, commit these
         * adjustments in order to give userspace a consistent view of
         * the vcpu state. Note that this relies on __kvm_adjust_pc()
         * being preempt-safe on VHE.
         */
        if (unlikely(vcpu_get_flag(vcpu, PENDING_EXCEPTION) ||
                     vcpu_get_flag(vcpu, INCREMENT_PC)))
                kvm_call_hyp(__kvm_adjust_pc, vcpu);

        vcpu_put(vcpu);
        return ret;
}

/*
 * Raise or lower the virtual IRQ/FIQ line of @vcpu by flipping the matching
 * bit (HCR_VI or HCR_VF) in the word returned by vcpu_hcr().
 *
 * @number: KVM_ARM_IRQ_CPU_IRQ selects the IRQ line, anything else the FIQ line
 * @level:  true to assert the line, false to deassert it
 *
 * Always returns 0.
 */
static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
{
        int bit_index = __ffs(number == KVM_ARM_IRQ_CPU_IRQ ? HCR_VI : HCR_VF);
        unsigned long *hcr = vcpu_hcr(vcpu);
        bool was_set;

        was_set = level ? test_and_set_bit(bit_index, hcr)
                        : test_and_clear_bit(bit_index, hcr);

        /*
         * Only when the line actually changed state do we need to wake up a
         * sleeping VCPU and trigger a world-switch round on the running
         * physical CPU, so that the virtual IRQ/FIQ fields in the HCR get set
         * appropriately. Otherwise nobody needs to be woken or kicked.
         */
        if (was_set != level) {
                kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
                kvm_vcpu_kick(vcpu);
        }

        return 0;
}

/*
 * Decode the packed irq word of @irq_level into type, vCPU id and interrupt
 * number, then route the level change to the CPU line, PPI or SPI handler.
 *
 * @line_status is not used here.
 *
 * Returns -ENXIO when the request does not match the irqchip model in use,
 * -EINVAL for an unknown type, vCPU or out-of-range interrupt number, and
 * otherwise the result of the selected injection helper.
 */
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
                          bool line_status)
{
        const u32 irq = irq_level->irq;
        const bool level = irq_level->level;
        unsigned int irq_type, vcpu_id, vcpu_id_hi, irq_num;
        struct kvm_vcpu *vcpu = NULL;

        irq_type = (irq >> KVM_ARM_IRQ_TYPE_SHIFT) & KVM_ARM_IRQ_TYPE_MASK;
        irq_num = (irq >> KVM_ARM_IRQ_NUM_SHIFT) & KVM_ARM_IRQ_NUM_MASK;

        /* The vCPU id is split across two fields; VCPU2 holds the upper part. */
        vcpu_id = (irq >> KVM_ARM_IRQ_VCPU_SHIFT) & KVM_ARM_IRQ_VCPU_MASK;
        vcpu_id_hi = (irq >> KVM_ARM_IRQ_VCPU2_SHIFT) & KVM_ARM_IRQ_VCPU2_MASK;
        vcpu_id += vcpu_id_hi * (KVM_ARM_IRQ_VCPU_MASK + 1);

        trace_kvm_irq_line(irq_type, vcpu_id, irq_num, irq_level->level);

        switch (irq_type) {
        case KVM_ARM_IRQ_TYPE_CPU:
                /* Direct CPU lines are only usable without an in-kernel irqchip. */
                if (irqchip_in_kernel(kvm))
                        return -ENXIO;

                vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
                if (!vcpu || irq_num > KVM_ARM_IRQ_CPU_FIQ)
                        return -EINVAL;

                return vcpu_interrupt_line(vcpu, irq_num, level);

        case KVM_ARM_IRQ_TYPE_PPI:
                if (!irqchip_in_kernel(kvm))
                        return -ENXIO;

                vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
                if (!vcpu)
                        return -EINVAL;

                /* Must fall in [VGIC_NR_SGIS, VGIC_NR_PRIVATE_IRQS). */
                if (irq_num < VGIC_NR_SGIS || irq_num >= VGIC_NR_PRIVATE_IRQS)
                        return -EINVAL;

                return kvm_vgic_inject_irq(kvm, vcpu, irq_num, level, NULL);

        case KVM_ARM_IRQ_TYPE_SPI:
                if (!irqchip_in_kernel(kvm))
                        return -ENXIO;

                /* Numbers below VGIC_NR_PRIVATE_IRQS are not accepted as SPIs. */
                if (irq_num < VGIC_NR_PRIVATE_IRQS)
                        return -EINVAL;

                return kvm_vgic_inject_irq(kvm, NULL, irq_num, level, NULL);

        default:
                break;
        }

        return -EINVAL;
}

/*
 * Compute the vCPU feature bits usable on this system: start from
 * KVM_VCPU_VALID_FEATURES and drop every feature whose capability check
 * below fails.
 */
static unsigned long system_supported_vcpu_features(void)
{
        unsigned long unsupported = 0;

        if (!cpus_have_final_cap(ARM64_HAS_32BIT_EL1))
                unsupported |= BIT(KVM_ARM_VCPU_EL1_32BIT);

        if (!kvm_supports_guest_pmuv3())
                unsupported |= BIT(KVM_ARM_VCPU_PMU_V3);

        if (!system_supports_sve())
                unsupported |= BIT(KVM_ARM_VCPU_SVE);

        /* Both pointer authentication flavours stand or fall together. */
        if (!kvm_has_full_ptr_auth())
                unsupported |= BIT(KVM_ARM_VCPU_PTRAUTH_ADDRESS) |
                               BIT(KVM_ARM_VCPU_PTRAUTH_GENERIC);

        if (!cpus_have_final_cap(ARM64_HAS_NESTED_VIRT))
                unsupported |= BIT(KVM_ARM_VCPU_HAS_EL2);

        return KVM_VCPU_VALID_FEATURES & ~unsupported;
}

/*
 * Validate the feature set requested through @init.
 *
 * Returns -ENOENT if a bit outside KVM_VCPU_VALID_FEATURES is set or if any
 * feature word other than the first is non-zero, -EINVAL if the combination
 * is unsupported or inconsistent, and 0 if the request is acceptable.
 */
static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu,
                                        const struct kvm_vcpu_init *init)
{
        unsigned long requested = init->features[0];
        int word;

        if (requested & ~KVM_VCPU_VALID_FEATURES)
                return -ENOENT;

        /* Only the first feature word may carry any bits. */
        for (word = 1; word < ARRAY_SIZE(init->features); word++) {
                if (init->features[word])
                        return -ENOENT;
        }

        if (requested & ~system_supported_vcpu_features())
                return -EINVAL;

        /*
         * For now, address and generic pointer authentication must be
         * requested by userspace together (both or neither).
         */
        if (test_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, &requested) !=
            test_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, &requested))
                return -EINVAL;

        if (test_bit(KVM_ARM_VCPU_EL1_32BIT, &requested)) {
                /* Neither MTE nor NV is compatible with AArch32 */
                if (kvm_has_mte(vcpu->kvm) ||
                    test_bit(KVM_ARM_VCPU_HAS_EL2, &requested))
                        return -EINVAL;
        }

        return 0;
}

/*
 * Report whether the features requested in @init differ from the feature
 * bitmap already recorded for the VM (compared over KVM_VCPU_MAX_FEATURES
 * bits).
 */
static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu,
                                  const struct kvm_vcpu_init *init)
{
        unsigned long requested = init->features[0];
        bool unchanged;

        unchanged = bitmap_equal(vcpu->kvm->arch.vcpu_features, &requested,
                                 KVM_VCPU_MAX_FEATURES);

        return !unchanged;
}

/*
 * Feature-dependent setup of a vCPU: pick a default PMU for the VM if one is
 * needed and none is set, then prepare nested virtualization if the vCPU has
 * it. Returns 0 on success or the first error encountered.
 */
static int kvm_setup_vcpu(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        /*
         * The vCPU has a PMU, but no PMU has been set for the guest yet:
         * fall back to the default one.
         */
        if (kvm_vcpu_has_pmu(vcpu) && !kvm->arch.arm_pmu) {
                int err = kvm_arm_set_default_pmu(kvm);

                if (err)
                        return err;
        }

        /* Prepare for nested if required */
        if (vcpu_has_nv(vcpu))
                return kvm_vcpu_init_nested(vcpu);

        return 0;
}

/*
 * First-time initialisation of a vCPU, done under kvm->arch.config_lock.
 *
 * Once the VM's feature set has been marked as configured, a request with
 * different features is refused with -EINVAL. Otherwise the features are
 * recorded, the vCPU is set up and reset, and both the VM flag and the vCPU's
 * VCPU_INITIALIZED flag are set.
 *
 * Returns 0 on success, -EINVAL on a feature mismatch, or the error from
 * kvm_setup_vcpu().
 */
static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
                                 const struct kvm_vcpu_init *init)
{
        unsigned long requested = init->features[0];
        struct kvm *kvm = vcpu->kvm;
        int ret;

        mutex_lock(&kvm->arch.config_lock);

        if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) &&
            kvm_vcpu_init_changed(vcpu, init)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        bitmap_copy(kvm->arch.vcpu_features, &requested, KVM_VCPU_MAX_FEATURES);

        ret = kvm_setup_vcpu(vcpu);
        if (!ret) {
                /* Now we know what it is, we can reset it. */
                kvm_reset_vcpu(vcpu);

                set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags);
                vcpu_set_flag(vcpu, VCPU_INITIALIZED);
        }

out_unlock:
        mutex_unlock(&kvm->arch.config_lock);
        return ret;
}

static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
                               const struct kvm_vcpu_init *init)
{
        int ret;

        if (init->target != KVM_ARM_TARGET_GENERIC_V8 &&
            init->target != kvm_target_cpu())
                return -EINVAL;

        ret = kvm_vcpu_init_check_features(vcpu, init);
        if (ret)
                return ret;

        if (!kvm_vcpu_initialized(vcpu))
                return __kvm_vcpu_set_target(vcpu, init);

        if (kvm_vcpu_init_changed(vcpu, init))
                return -EINVAL;

        kvm_reset_vcpu(vcpu);
        return 0;
}

/*
 * Handle an init request for @vcpu.
 *
 * @init is modified: the KVM_ARM_VCPU_POWER_OFF bit is stripped from
 * features[0] before the request is processed any further.
 *
 * Returns 0 on success or the error from kvm_vcpu_set_target().
 */
static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
                                         struct kvm_vcpu_init *init)
{
        bool start_powered_off;
        int err;

        /*
         * The power-off vCPU feature is ephemeral: remember whether it was
         * asked for, then clear the bit so that it is not reflected in the
         * finalized feature set. Its scope is thereby limited to this single
         * KVM_ARM_VCPU_INIT call.
         */
        start_powered_off = !!(init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF));
        init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF);

        err = kvm_vcpu_set_target(vcpu, init);
        if (err)
                return err;

        /*
         * Ensure a rebooted VM will fault in RAM pages and detect if the
         * guest MMU is turned off and flush the caches as needed.
         *
         * S2FWB enforces all memory accesses to RAM being cacheable,
         * ensuring that the data side is always coherent. We still
         * need to invalidate the I-cache though, as FWB does *not*
         * imply CTR_EL0.DIC.
         */
        if (vcpu_has_run_once(vcpu)) {
                if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
                        icache_inval_all_pou();
                else
                        stage2_unmap_vm(vcpu->kvm);
        }

        vcpu_reset_hcr(vcpu);

        /* Settle the initial power state under the mp_state lock. */
        spin_lock(&vcpu->arch.mp_state_lock);

        if (start_powered_off)
                __kvm_arm_vcpu_power_off(vcpu);
        else
                WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);

        spin_unlock(&vcpu->arch.mp_state_lock);

        return 0;
}

/*
 * Set a vCPU device attribute. No attribute group is handled at this level;
 * every request is passed on to kvm_arm_vcpu_arch_set_attr(), whose result is
 * returned.
 */
static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
                                 struct kvm_device_attr *attr)
{
        return kvm_arm_vcpu_arch_set_attr(vcpu, attr);
}

/*
 * Get a vCPU device attribute. No attribute group is handled at this level;
 * every request is passed on to kvm_arm_vcpu_arch_get_attr(), whose result is
 * returned.
 */
static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
                                 struct kvm_device_attr *attr)
{
        return kvm_arm_vcpu_arch_get_attr(vcpu, attr);
}

/*
 * Query a vCPU device attribute. No attribute group is handled at this level;
 * every request is passed on to kvm_arm_vcpu_arch_has_attr(), whose result is
 * returned.
 */
static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
                                 struct kvm_device_attr *attr)
{
        return kvm_arm_vcpu_arch_has_attr(vcpu, attr);
}

/*
 * Fill @events for @vcpu. The whole structure is zeroed first, so anything
 * __kvm_arm_vcpu_get_events() does not write reads back as zero.
 */
static int kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu,
                                   struct kvm_vcpu_events *events)
{
        int ret;

        memset(events, 0, sizeof(*events));
        ret = __kvm_arm_vcpu_get_events(vcpu, events);

        return ret;
}

/*
 * Apply @events to @vcpu after checking that userspace left the reserved and
 * padding fields zeroed. Returns -EINVAL if it did not, otherwise the result
 * of __kvm_arm_vcpu_set_events().
 */
static int kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu,
                                   struct kvm_vcpu_events *events)
{
        int idx;

        /* Every reserved word has to be zero... */
        for (idx = 0; idx < ARRAY_SIZE(events->reserved); idx++) {
                if (events->reserved[idx])
                        return -EINVAL;
        }

        /* ...and so does the padding inside the exception block. */
        for (idx = 0; idx < ARRAY_SIZE(events->exception.pad); idx++) {
                if (events->exception.pad[idx])
                        return -EINVAL;
        }

        return __kvm_arm_vcpu_set_events(vcpu, events);
}

/*
 * Architecture-specific vCPU ioctl dispatcher.
 *
 * @filp:  file whose private_data is the target vCPU
 * @ioctl: ioctl number
 * @arg:   user pointer to the ioctl-specific argument
 *
 * Register, register-list, event and finalize ioctls all require the vCPU to
 * have been initialised and fail with -ENOEXEC otherwise.
 *
 * Returns the result of the selected handler, -EFAULT when the argument
 * cannot be copied from/to userspace, -ENOEXEC for an uninitialised vCPU,
 * -EPERM / -E2BIG for KVM_GET_REG_LIST as described below, and -EINVAL for an
 * unknown ioctl.
 */
long kvm_arch_vcpu_ioctl(struct file *filp,
                         unsigned int ioctl, unsigned long arg)
{
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        struct kvm_device_attr attr;
        long r;

        switch (ioctl) {
        case KVM_ARM_VCPU_INIT: {
                struct kvm_vcpu_init init;

                r = -EFAULT;
                if (copy_from_user(&init, argp, sizeof(init)))
                        break;

                r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init);
                break;
        }
        case KVM_SET_ONE_REG:
        case KVM_GET_ONE_REG: {
                struct kvm_one_reg reg;

                r = -ENOEXEC;
                if (unlikely(!kvm_vcpu_initialized(vcpu)))
                        break;

                r = -EFAULT;
                if (copy_from_user(&reg, argp, sizeof(reg)))
                        break;

                /*
                 * We could owe a reset due to PSCI. Handle the pending reset
                 * here to ensure userspace register accesses are ordered after
                 * the reset.
                 */
                if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
                        kvm_reset_vcpu(vcpu);

                if (ioctl == KVM_SET_ONE_REG)
                        r = kvm_arm_set_reg(vcpu, &reg);
                else
                        r = kvm_arm_get_reg(vcpu, &reg);
                break;
        }
        case KVM_GET_REG_LIST: {
                struct kvm_reg_list __user *user_list = argp;
                struct kvm_reg_list reg_list;
                unsigned n;

                r = -ENOEXEC;
                if (unlikely(!kvm_vcpu_initialized(vcpu)))
                        break;

                r = -EPERM;
                if (!kvm_arm_vcpu_is_finalized(vcpu))
                        break;

                r = -EFAULT;
                if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
                        break;
                /*
                 * Always report the required count back, so that a caller
                 * whose buffer is too small (-E2BIG) learns how much to
                 * allocate.
                 */
                n = reg_list.n;
                reg_list.n = kvm_arm_num_regs(vcpu);
                if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
                        break;
                r = -E2BIG;
                if (n < reg_list.n)
                        break;
                r = kvm_arm_copy_reg_indices(vcpu, user_list->reg);
                break;
        }
        case KVM_SET_DEVICE_ATTR: {
                r = -EFAULT;
                if (copy_from_user(&attr, argp, sizeof(attr)))
                        break;
                r = kvm_arm_vcpu_set_attr(vcpu, &attr);
                break;
        }
        case KVM_GET_DEVICE_ATTR: {
                r = -EFAULT;
                if (copy_from_user(&attr, argp, sizeof(attr)))
                        break;
                r = kvm_arm_vcpu_get_attr(vcpu, &attr);
                break;
        }
        case KVM_HAS_DEVICE_ATTR: {
                r = -EFAULT;
                if (copy_from_user(&attr, argp, sizeof(attr)))
                        break;
                r = kvm_arm_vcpu_has_attr(vcpu, &attr);
                break;
        }
        case KVM_GET_VCPU_EVENTS: {
                struct kvm_vcpu_events events;

                /*
                 * The event state is only meaningful once the vCPU has been
                 * initialised; refuse access before that, like the register
                 * ioctls above do.
                 */
                if (!kvm_vcpu_initialized(vcpu))
                        return -ENOEXEC;

                if (kvm_arm_vcpu_get_events(vcpu, &events))
                        return -EINVAL;

                if (copy_to_user(argp, &events, sizeof(events)))
                        return -EFAULT;

                return 0;
        }
        case KVM_SET_VCPU_EVENTS: {
                struct kvm_vcpu_events events;

                /*
                 * Do not let userspace pend events on a vCPU that has not
                 * been initialised yet.
                 */
                if (!kvm_vcpu_initialized(vcpu))
                        return -ENOEXEC;

                if (copy_from_user(&events, argp, sizeof(events)))
                        return -EFAULT;

                return kvm_arm_vcpu_set_events(vcpu, &events);
        }
        case KVM_ARM_VCPU_FINALIZE: {
                int what;

                if (!kvm_vcpu_initialized(vcpu))
                        return -ENOEXEC;

                if (get_user(what, (const int __user *)argp))
                        return -EFAULT;

                return kvm_arm_vcpu_finalize(vcpu, what);
        }
        default:
                r = -EINVAL;
        }

        return r;
}

/* Intentionally empty: no work is done here when syncing the dirty log. */
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
}

/*
 * Set a device address through the legacy interface. Only the VGIC v2 device
 * id is recognised: any other id yields -ENODEV, and -ENXIO is returned when
 * no vgic is present.
 */
static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
                                        struct kvm_arm_device_addr *dev_addr)
{
        if (FIELD_GET(KVM_ARM_DEVICE_ID_MASK, dev_addr->id) != KVM_ARM_DEVICE_VGIC_V2)
                return -ENODEV;

        if (!vgic_present)
                return -ENXIO;

        return kvm_set_legacy_vgic_v2_addr(kvm, dev_addr);
}

/*
 * Query a VM-scoped attribute. Only the KVM_ARM_VM_SMCCC_CTRL group is known;
 * every other group yields -ENXIO.
 */
static int kvm_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
        if (attr->group == KVM_ARM_VM_SMCCC_CTRL)
                return kvm_vm_smccc_has_attr(kvm, attr);

        return -ENXIO;
}

/*
 * Set a VM-scoped attribute. Only the KVM_ARM_VM_SMCCC_CTRL group is known;
 * every other group yields -ENXIO.
 */
static int kvm_vm_set_attr(struct kvm *kvm, struct kvm_device_attr *attr)
{
        if (attr->group == KVM_ARM_VM_SMCCC_CTRL)
                return kvm_vm_smccc_set_attr(kvm, attr);

        return -ENXIO;
}

int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        struct kvm_device_attr attr;

        switch (ioctl) {
        case KVM_CREATE_IRQCHIP: {
                int ret;
                if (!vgic_present)
                        return -ENXIO;
                mutex_lock(&kvm->lock);
                ret = kvm_vgic_create(kvm, KVM_DEV_TYPE_ARM_VGIC_V2);
                mutex_unlock(&kvm->lock);
                return ret;
        }
        case KVM_ARM_SET_DEVICE_ADDR: {
                struct kvm_arm_device_addr dev_addr;

                if (copy_from_user(&dev_addr, argp, sizeof(dev_addr)))
                        return -EFAULT;
                return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr);
        }
        case KVM_ARM_PREFERRED_TARGET: {
                struct kvm_vcpu_init init = {
                        .target = KVM_ARM_TARGET_GENERIC_V8,
                };

                if (copy_to_user(argp, &init, sizeof(init)))
                        return -EFAULT;

                return 0;
        }
        case KVM_ARM_MTE_COPY_TAGS: {
                struct kvm_arm_copy_mte_tags copy_tags;

                if (copy_from_user(&copy_tags, argp, sizeof(copy_tags)))
                        return -EFAULT;
                return kvm_vm_ioctl_mte_copy_tags(kvm, &copy_tags);
        }
        case KVM_ARM_SET_COUNTER_OFFSET: {
                struct kvm_arm_counter_offset offset;

                if (copy_from_user(&offset, argp, sizeof(offset)))
                        return -EFAULT;
                return kvm_vm_ioctl_set_counter_offset(kvm, &offset);
        }
        case KVM_HAS_DEVICE_ATTR: {
                if (copy_from_user(&attr, argp, sizeof(attr)))
                        return -EFAULT;

                return kvm_vm_has_attr(kvm, &attr);
        }
        case KVM_SET_DEVICE_ATTR: {
                if (copy_from_user(&attr, argp, sizeof(attr)))
                        return -EFAULT;

                return kvm_vm_set_attr(kvm, &attr);
        }
        case KVM_ARM_GET_REG_WRITABLE_MASKS: {
                struct reg_mask_range range;

                if (copy_from_user(&range, argp, sizeof(range)))
                        return -EFAULT;
                return kvm_vm_ioctl_get_reg_writable_masks(kvm, &range);
        }
        default:
                return -EINVAL;
        }
}

/*
 * Release the mutex of every vcpu whose index is @vcpu_lock_idx or smaller,
 * walking downwards to index 0. A negative @vcpu_lock_idx unlocks nothing.
 */
static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx)
{
        int idx = vcpu_lock_idx;

        while (idx >= 0) {
                struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, idx);

                mutex_unlock(&vcpu->mutex);
                idx--;
        }
}

/*
 * Release the mutex of every online vcpu of @kvm. The caller must hold
 * kvm->lock.
 */
void unlock_all_vcpus(struct kvm *kvm)
{
        int last_idx;

        lockdep_assert_held(&kvm->lock);

        last_idx = atomic_read(&kvm->online_vcpus) - 1;
        unlock_vcpus(kvm, last_idx);
}

/*
 * Try to take the mutex of every vcpu of @kvm. The caller must hold
 * kvm->lock.
 *
 * Returns true if all vcpus were locked. If any trylock fails, the mutexes
 * taken so far are released again and false is returned.
 */
bool lock_all_vcpus(struct kvm *kvm)
{
        struct kvm_vcpu *vcpu;
        unsigned long idx;

        lockdep_assert_held(&kvm->lock);

        /*
         * Any time a vcpu is in an ioctl (including running), the
         * core KVM code tries to grab the vcpu->mutex.
         *
         * Holding the vcpu->mutex of all VCPUs therefore guarantees that
         * nobody else can fiddle with their state while we access it.
         */
        kvm_for_each_vcpu(idx, vcpu, kvm) {
                if (mutex_trylock(&vcpu->mutex))
                        continue;

                /* Back out: drop the mutexes of the vcpus before this one. */
                unlock_vcpus(kvm, idx - 1);
                return false;
        }

        return true;
}

/*
 * Size in bytes of the nVHE per-cpu region, i.e. the distance between the
 * nVHE __per_cpu_start and __per_cpu_end symbols.
 */
static unsigned long nvhe_percpu_size(void)
{
        unsigned long end = (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_end);
        unsigned long start = (unsigned long)CHOOSE_NVHE_SYM(__per_cpu_start);

        return end - start;
}

/*
 * Page allocation order for the nVHE per-cpu region. An empty region gives
 * order 0 without consulting get_order().
 */
static unsigned long nvhe_percpu_order(void)
{
        unsigned long bytes = nvhe_percpu_size();

        if (!bytes)
                return 0;

        return get_order(bytes);
}

/* Page allocation order matching pkvm_host_sve_state_size(). */
static size_t pkvm_host_sve_state_order(void)
{
        size_t bytes = pkvm_host_sve_state_size();

        return get_order(bytes);
}

/*
 * A lookup table holding the hypervisor VA for each vector slot. It has
 * BP_HARDEN_EL2_SLOTS entries, is indexed by enum arm64_hyp_spectre_vector
 * and is filled in by kvm_init_vector_slot().
 */
static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS];

/*
 * Record, for @slot, the address that __kvm_vector_slot2addr() derives from
 * the vectors at @base.
 */
static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot)
{
        void *slot_addr = __kvm_vector_slot2addr(base, slot);

        hyp_spectre_vector_selector[slot] = slot_addr;
}

/*
 * Populate the four hyp vector slots.
 *
 * The two direct slots use the hyp VA of __kvm_hyp_vector and
 * __bp_harden_hyp_vecs. The two indirect slots use a separate executable
 * mapping of __bp_harden_hyp_vecs when idmapped vectors are needed and
 * protected KVM is not enabled; otherwise they reuse the same base as the
 * spectre-direct slot.
 *
 * Returns 0 on success or the error from create_hyp_exec_mappings().
 */
static int kvm_init_vector_slots(void)
{
        void *vecs;

        vecs = kern_hyp_va(kvm_ksym_ref(__kvm_hyp_vector));
        kvm_init_vector_slot(vecs, HYP_VECTOR_DIRECT);

        vecs = kern_hyp_va(kvm_ksym_ref(__bp_harden_hyp_vecs));
        kvm_init_vector_slot(vecs, HYP_VECTOR_SPECTRE_DIRECT);

        if (kvm_system_needs_idmapped_vectors() && !is_protected_kvm_enabled()) {
                int err;

                /* On success, vecs is replaced by the new mapping's address. */
                err = create_hyp_exec_mappings(__pa_symbol(__bp_harden_hyp_vecs),
                                               __BP_HARDEN_HYP_VECS_SZ, &vecs);
                if (err)
                        return err;
        }

        kvm_init_vector_slot(vecs, HYP_VECTOR_INDIRECT);
        kvm_init_vector_slot(vecs, HYP_VECTOR_SPECTRE_INDIRECT);

        return 0;
}

/*
 * Fill in the nVHE init parameters (kvm_init_params) for @cpu: the
 * tpidr_el2 per-cpu offset, the MAIR/TCR/HCR values, the hyp page-table
 * base, and zeroed vttbr/vtcr. The struct is then flushed from the data
 * cache because it is read with the MMU off.
 *
 * @cpu:         CPU whose kvm_init_params instance is written.
 * @hyp_va_bits: value fed to TCR_T0SZ() when building tcr_el2.
 */
static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits)
{
        struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
        unsigned long tcr;

        /*
         * Calculate the raw per-cpu offset without a translation from the
         * kernel's mapping to the linear mapping, and store it in tpidr_el2
         * so that we can use adr_l to access per-cpu variables in EL2.
         * Also drop the KASAN tag which gets in the way...
         */
        params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) -
                            (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start));

        /* Memory attributes are taken as-is from the current EL1 value. */
        params->mair_el2 = read_sysreg(mair_el1);

        /* tcr_el2 is derived from the current tcr_el1, per hVHE/nVHE. */
        tcr = read_sysreg(tcr_el1);
        if (cpus_have_final_cap(ARM64_KVM_HVHE)) {
                /* hVHE: clear HD/HA/A1 and T0SZ, then set the EPD1 bits. */
                tcr &= ~(TCR_HD | TCR_HA | TCR_A1 | TCR_T0SZ_MASK);
                tcr |= TCR_EPD1_MASK;
        } else {
                /* Carry the EL1 IPS field over into the EL2 PS field. */
                unsigned long ips = FIELD_GET(TCR_IPS_MASK, tcr);

                tcr &= TCR_EL2_MASK;
                tcr |= TCR_EL2_RES1 | FIELD_PREP(TCR_EL2_PS_MASK, ips);
                if (lpa2_is_enabled())
                        tcr |= TCR_EL2_DS;
        }
        tcr |= TCR_T0SZ(hyp_va_bits);
        params->tcr_el2 = tcr;

        params->pgd_pa = kvm_mmu_get_httbr();
        if (is_protected_kvm_enabled())
                params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
        else
                params->hcr_el2 = HCR_HOST_NVHE_FLAGS;
        if (cpus_have_final_cap(ARM64_KVM_HVHE))
                params->hcr_el2 |= HCR_E2H;
        params->vttbr = params->vtcr = 0;

        /*
         * Flush the init params from the data cache because the struct will
         * be read while the MMU is off.
         */
        kvm_flush_dcache_to_poc(params, sizeof(*params));
}

static void hyp_install_host_vector(void)
{
        struct kvm_nvhe_init_params *params;
        struct arm_smccc_res res;

        /* Switch from the HYP stub to our own HYP init vector */
        __hyp_set_vectors(kvm_get_idmap_vector());

        /*
         * Call initialization code, and switch to the full blown HYP code.
         * If the cpucaps haven't been finalized yet, something has gone very
         * wrong, and hyp will crash and burn when it uses any
         * cpus_have_*_cap() wrapper.
         */
        BUG_ON(!system_capabilities_finalized());
        params = this_cpu_ptr_nvhe_sym(kvm_init_params);
        arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res);
        WARN_ON(res.a0 != SMCCC_RET_SUCCESS);
}

/*
 * Install the host HYP vectors on this CPU and, when needed, enable SSBS
 * at EL2.
 */
static void cpu_init_hyp_mode(void)
{
        hyp_install_host_vector();

        /*
         * On a non-VHE system, disabling SSBD means SSBS has to be enabled
         * at EL2. That only applies when this CPU has SSBS and the
         * Spectre-v4 state is "vulnerable".
         */
        if (!this_cpu_has_cap(ARM64_SSBS))
                return;

        if (arm64_get_spectre_v4_state() != SPECTRE_VULNERABLE)
                return;

        kvm_call_hyp_nvhe(__kvm_enable_ssbs);
}

/*
 * Reset the HYP vectors on this CPU. Nothing is done when the kernel
 * itself runs in hyp mode.
 */
static void cpu_hyp_reset(void)
{
        if (is_kernel_in_hyp_mode())
                return;

        __hyp_reset_vectors();
}

/*
 * How the EL2 vectors get mapped and rerouted depends on the kernel
 * configuration and on the CPU we are running on:
 *
 * - A CPU affected by Spectre-v2 gets the hardening sequence placed in
 *   one of the vector slots; it runs before branching to the real
 *   vectors.
 *
 * - If that CPU also has the ARM64_SPECTRE_V3A cap, the slot holding the
 *   hardening sequence is mapped next to the idmap page and runs before
 *   branching to the real vectors.
 *
 * - A CPU with only the ARM64_SPECTRE_V3A cap gets an empty slot, mapped
 *   next to the idmap page, which runs before branching to the real
 *   vectors.
 *
 * ARM64_SPECTRE_V3A does not fit well with VHE, since there are no
 * hypervisor-specific mappings there. A VHE system that nevertheless
 * selects this capability has it ignored.
 */
static void cpu_set_hyp_vector(void)
{
        struct bp_hardening_data *hardening = this_cpu_ptr(&bp_hardening_data);
        void *slot_va = hyp_spectre_vector_selector[hardening->slot];

        if (is_protected_kvm_enabled()) {
                /* Protected mode: hand the slot number to the hypervisor. */
                kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, hardening->slot);
                return;
        }

        /* Otherwise store the slot's address in this CPU's kvm_hyp_vector. */
        *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)slot_va;
}

/*
 * Initialise this CPU's host context and host debug data and, unless the
 * kernel itself runs in hyp mode, bring up hyp mode on this CPU.
 */
static void cpu_hyp_init_context(void)
{
        kvm_init_host_cpu_context(host_data_ptr(host_ctxt));
        kvm_init_host_debug_data();

        if (is_kernel_in_hyp_mode())
                return;

        cpu_init_hyp_mode();
}

/*
 * Per-CPU feature setup that follows context initialisation: select the
 * hyp vector for this CPU, initialise the timer for VHE when the kernel
 * runs in hyp mode, and initialise the vgic CPU hardware when a vgic was
 * detected (vgic_present).
 */
static void cpu_hyp_init_features(void)
{
        cpu_set_hyp_vector();

        if (is_kernel_in_hyp_mode())
                kvm_timer_init_vhe();

        if (vgic_present)
                kvm_vgic_init_cpu_hardware();
}

/*
 * Fully (re)initialise hyp on this CPU: reset the vectors, then set up the
 * context, then the per-CPU features, in that order. Does not touch the
 * kvm_hyp_initialized flag; callers manage it.
 */
static void cpu_hyp_reinit(void)
{
        cpu_hyp_reset();
        cpu_hyp_init_context();
        cpu_hyp_init_features();
}

/*
 * Initialise hyp on this CPU if it has not been done already, and record
 * that in the per-CPU kvm_hyp_initialized flag.
 *
 * @discard: unused; present so the function can be passed to
 *           on_each_cpu().
 */
static void cpu_hyp_init(void *discard)
{
        if (__this_cpu_read(kvm_hyp_initialized))
                return;

        cpu_hyp_reinit();
        __this_cpu_write(kvm_hyp_initialized, 1);
}

/*
 * Undo cpu_hyp_init() on this CPU: if hyp is marked initialised, reset
 * the vectors and clear the per-CPU kvm_hyp_initialized flag.
 *
 * @discard: unused; present so the function can be passed to
 *           on_each_cpu().
 */
static void cpu_hyp_uninit(void *discard)
{
        if (!__this_cpu_read(kvm_hyp_initialized))
                return;

        cpu_hyp_reset();
        __this_cpu_write(kvm_hyp_initialized, 0);
}

/*
 * Enable virtualization on the calling CPU: initialise hyp if needed, then
 * bring up the vgic and timer per-CPU state. Always returns 0.
 */
int kvm_arch_enable_virtualization_cpu(void)
{
        /*
         * Most calls to this function are made with migration
         * disabled, but not with preemption disabled. The former is
         * enough to ensure correctness, but most of the helpers
         * expect the latter and will throw a tantrum otherwise.
         */
        preempt_disable();

        cpu_hyp_init(NULL);

        kvm_vgic_cpu_up();
        kvm_timer_cpu_up();

        preempt_enable();

        return 0;
}

/*
 * Disable virtualization on the calling CPU: take down the timer and vgic
 * per-CPU state, then tear down hyp unless protected KVM is enabled.
 */
void kvm_arch_disable_virtualization_cpu(void)
{
        kvm_timer_cpu_down();
        kvm_vgic_cpu_down();

        if (is_protected_kvm_enabled())
                return;

        cpu_hyp_uninit(NULL);
}

#ifdef CONFIG_CPU_PM
/*
 * CPU PM notifier callback.
 *
 * kvm_hyp_initialized deliberately keeps its value across
 * PM_ENTER -> PM_EXIT: it tells the exit path whether hyp has to be
 * re-enabled.
 *
 * Returns NOTIFY_OK for CPU_PM_ENTER, CPU_PM_ENTER_FAILED and CPU_PM_EXIT,
 * and NOTIFY_DONE for any other command.
 */
static int hyp_init_cpu_pm_notifier(struct notifier_block *self,
                                    unsigned long cmd,
                                    void *v)
{
        if (cmd == CPU_PM_ENTER) {
                /*
                 * Reset hyp but leave kvm_hyp_initialized alone, so that
                 * the resume path below knows to bring hyp back.
                 */
                if (__this_cpu_read(kvm_hyp_initialized))
                        cpu_hyp_reset();

                return NOTIFY_OK;
        }

        if (cmd == CPU_PM_ENTER_FAILED || cmd == CPU_PM_EXIT) {
                /* Hyp was enabled before suspend: restore it. */
                if (__this_cpu_read(kvm_hyp_initialized))
                        cpu_hyp_reinit();

                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

/*
 * Notifier block wiring hyp_init_cpu_pm_notifier() into the CPU PM chain;
 * registered/unregistered by hyp_cpu_pm_init()/hyp_cpu_pm_exit().
 */
static struct notifier_block hyp_init_cpu_pm_nb = {
        .notifier_call = hyp_init_cpu_pm_notifier,
};

/* Register the CPU PM notifier, except when protected KVM is enabled. */
static void __init hyp_cpu_pm_init(void)
{
        if (is_protected_kvm_enabled())
                return;

        cpu_pm_register_notifier(&hyp_init_cpu_pm_nb);
}
/* Unregister the CPU PM notifier, except when protected KVM is enabled. */
static void __init hyp_cpu_pm_exit(void)
{
        if (is_protected_kvm_enabled())
                return;

        cpu_pm_unregister_notifier(&hyp_init_cpu_pm_nb);
}
#else
/* Without CONFIG_CPU_PM there is no notifier to register: no-op stub. */
static inline void __init hyp_cpu_pm_init(void)
{
}
/* Without CONFIG_CPU_PM there is no notifier to unregister: no-op stub. */
static inline void __init hyp_cpu_pm_exit(void)
{
}
#endif

/*
 * Mirror the MPIDR <-> logical CPU ID mapping into hyp_cpu_logical_map[].
 *
 * Only online CPUs are copied: their features have been checked against
 * the finalized system capabilities, and the hypervisor will refuse to
 * boot any other CPU from the `possible` set.
 */
static void __init init_cpu_logical_map(void)
{
        unsigned int id;

        for_each_online_cpu(id) {
                hyp_cpu_logical_map[id] = cpu_logical_map(id);
        }
}

/*
 * Set config.psci_0_1_<what>_implemented from psci_ops.<what>.
 * Expands to a bare assignment expression; @config is used unparenthesized,
 * so pass a plain lvalue.
 */
#define init_psci_0_1_impl_state(config, what)        \
        config.psci_0_1_ ## what ## _implemented = psci_ops.what

/*
 * Capture the host's PSCI and SMCCC versions in kvm_host_psci_config, plus
 * the PSCI 0.1 function IDs and implementation flags when the host speaks
 * PSCI 0.1.
 *
 * Returns false (after logging an error) when PSCI has not been
 * initialised, true otherwise.
 */
static bool __init init_psci_relay(void)
{
        /*
         * Protected KVM cannot install itself on newly booted CPUs
         * without an initialised PSCI.
         */
        if (!psci_ops.get_version) {
                kvm_err("Cannot initialize protected mode without PSCI\n");
                return false;
        }

        kvm_host_psci_config.version = psci_ops.get_version();
        kvm_host_psci_config.smccc_version = arm_smccc_get_version();

        /* Only PSCI 0.1 needs the extra per-function bookkeeping. */
        if (kvm_host_psci_config.version != PSCI_VERSION(0, 1))
                return true;

        kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids();
        init_psci_0_1_impl_state(kvm_host_psci_config, cpu_suspend);
        init_psci_0_1_impl_state(kvm_host_psci_config, cpu_on);
        init_psci_0_1_impl_state(kvm_host_psci_config, cpu_off);
        init_psci_0_1_impl_state(kvm_host_psci_config, migrate);

        return true;
}

/*
 * Initialise the KVM subsystems that need EL2 access: enable hyp on every
 * CPU, register the CPU PM notifier, then initialise the vgic and timer
 * hyp support and register the perf callbacks.
 *
 * A vgic init result of -ENODEV/-ENXIO is tolerated (vgic_present = false)
 * unless protected KVM is enabled; nested virt (KVM_MODE_NV) additionally
 * requires a present GICv3.
 *
 * On the way out, the PM notifier is unregistered on error, and hyp is
 * torn down again on every CPU on error or when protected KVM is not
 * enabled.
 *
 * Returns 0 on success or a negative error code.
 */
static int __init init_subsystems(void)
{
        int err = 0;

        /*
         * Enable hardware so that subsystem initialisation can access EL2.
         */
        on_each_cpu(cpu_hyp_init, NULL, 1);

        /*
         * Register CPU lower-power notifier
         */
        hyp_cpu_pm_init();

        /*
         * Init HYP view of VGIC
         */
        err = kvm_vgic_hyp_init();
        switch (err) {
        case 0:
                vgic_present = true;
                break;
        case -ENODEV:
        case -ENXIO:
                /*
                 * No VGIC? No pKVM for you.
                 *
                 * Protected mode assumes that VGICv3 is present, so no point
                 * in trying to hobble along if vgic initialization fails.
                 */
                if (is_protected_kvm_enabled())
                        goto out;

                /*
                 * Otherwise, userspace could choose to implement a GIC for its
                 * guest on non-cooperative hardware.
                 */
                vgic_present = false;
                err = 0;
                break;
        default:
                goto out;
        }

        if (kvm_mode == KVM_MODE_NV &&
           !(vgic_present && kvm_vgic_global_state.type == VGIC_V3)) {
                kvm_err("NV support requires GICv3, giving up\n");
                err = -EINVAL;
                goto out;
        }

        /*
         * Init HYP architected timer support
         */
        err = kvm_timer_hyp_init(vgic_present);
        if (err)
                goto out;

        kvm_register_perf_callbacks(NULL);

out:
        /* Shared exit path for both success and failure. */
        if (err)
                hyp_cpu_pm_exit();

        if (err || !is_protected_kvm_enabled())
                on_each_cpu(cpu_hyp_uninit, NULL, 1);

        return err;
}

/*
 * Undo the registrations made by init_subsystems(): the perf callbacks
 * and the CPU PM notifier.
 */
static void __init teardown_subsystems(void)
{
        kvm_unregister_perf_callbacks();
        hyp_cpu_pm_exit();
}

/*
 * Release what init_hyp_mode() set up: the hyp page tables and, for every
 * possible CPU, the hyp stack pages and the hyp per-cpu region. The host
 * SVE state pages are released as well when the system supports SVE and
 * protected KVM is enabled.
 */
static void __init teardown_hyp_mode(void)
{
        const bool release_sve = system_supports_sve() && is_protected_kvm_enabled();
        struct cpu_sve_state *sve;
        int cpu;

        free_hyp_pgds();

        for_each_possible_cpu(cpu) {
                free_pages(per_cpu(kvm_arm_hyp_stack_base, cpu),
                           NVHE_STACK_SHIFT - PAGE_SHIFT);
                free_pages(kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu],
                           nvhe_percpu_order());

                if (!release_sve)
                        continue;

                sve = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
                free_pages((unsigned long)sve, pkvm_host_sve_state_order());
        }
}

/*
 * Run the __pkvm_init hypercall on the calling CPU, with preemption
 * disabled, bracketed by the per-CPU context and feature initialisation.
 *
 * @hyp_va_bits: forwarded to __pkvm_init.
 *
 * Returns the value produced by the __pkvm_init hypercall.
 */
static int __init do_pkvm_init(u32 hyp_va_bits)
{
        void *percpu_base = kvm_ksym_ref(kvm_nvhe_sym(kvm_arm_hyp_percpu_base));
        int err;

        preempt_disable();

        cpu_hyp_init_context();
        err = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size,
                                num_possible_cpus(), kern_hyp_va(percpu_base),
                                hyp_va_bits);
        cpu_hyp_init_features();

        /*
         * With the stub hypercalls now disabled, mark this CPU as
         * initialised so kvm_arch_enable_virtualization_cpu() does not
         * attempt a re-init later on.
         */
        __this_cpu_write(kvm_hyp_initialized, 1);

        preempt_enable();

        return err;
}

static u64 get_hyp_id_aa64pfr0_el1(void)
{
        /*
         * Track whether the system isn't affected by spectre/meltdown in the
         * hypervisor's view of id_aa64pfr0_el1, used for protected VMs.
         * Although this is per-CPU, we make it global for simplicity, e.g., not
         * to have to worry about vcpu migration.
         *
         * Unlike for non-protected VMs, userspace cannot override this for
         * protected VMs.
         */
        u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);

        val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) |
                 ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3));

        val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2),
                          arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED);
        val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3),
                          arm64_get_meltdown_state() == SPECTRE_UNAFFECTED);

        return val;
}

/*
 * Copy host-side values into their nVHE hypervisor counterparts
 * (kvm_nvhe_sym(...)): the sanitised ID registers, icache flags, VMID
 * width and the FGT masks. The hyp BSS is then flushed from the data cache.
 */
static void kvm_hyp_init_symbols(void)
{
        kvm_nvhe_sym(id_aa64pfr0_el1_sys_val) = get_hyp_id_aa64pfr0_el1();
        kvm_nvhe_sym(id_aa64pfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1);
        kvm_nvhe_sym(id_aa64isar0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR0_EL1);
        kvm_nvhe_sym(id_aa64isar1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR1_EL1);
        kvm_nvhe_sym(id_aa64isar2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);
        kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
        kvm_nvhe_sym(id_aa64mmfr2_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR2_EL1);
        kvm_nvhe_sym(id_aa64smfr0_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64SMFR0_EL1);
        kvm_nvhe_sym(__icache_flags) = __icache_flags;
        kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;

        /* Propagate the FGT state to the nVHE side */
        kvm_nvhe_sym(hfgrtr_masks)  = hfgrtr_masks;
        kvm_nvhe_sym(hfgwtr_masks)  = hfgwtr_masks;
        kvm_nvhe_sym(hfgitr_masks)  = hfgitr_masks;
        kvm_nvhe_sym(hdfgrtr_masks) = hdfgrtr_masks;
        kvm_nvhe_sym(hdfgwtr_masks) = hdfgwtr_masks;
        kvm_nvhe_sym(hafgrtr_masks) = hafgrtr_masks;
        kvm_nvhe_sym(hfgrtr2_masks) = hfgrtr2_masks;
        kvm_nvhe_sym(hfgwtr2_masks) = hfgwtr2_masks;
        kvm_nvhe_sym(hfgitr2_masks) = hfgitr2_masks;
        kvm_nvhe_sym(hdfgrtr2_masks)= hdfgrtr2_masks;
        kvm_nvhe_sym(hdfgwtr2_masks)= hdfgwtr2_masks;

        /*
         * Flush entire BSS since part of its data containing init symbols is read
         * while the MMU is off.
         */
        kvm_flush_dcache_to_poc(kvm_ksym_ref(__hyp_bss_start),
                                kvm_ksym_ref(__hyp_bss_end) - kvm_ksym_ref(__hyp_bss_start));
}

/*
 * Map the hyp memory pool (hyp_mem_base, hyp_mem_size) into hyp, run the
 * pKVM initialisation and, once that has succeeded, free the host-side hyp
 * page tables.
 *
 * @hyp_va_bits: forwarded to do_pkvm_init().
 *
 * Returns 0 on success, or the first error from create_hyp_mappings() or
 * do_pkvm_init().
 */
static int __init kvm_hyp_init_protection(u32 hyp_va_bits)
{
        void *pool_start = phys_to_virt(hyp_mem_base);
        void *pool_end = pool_start + hyp_mem_size;
        int err;

        err = create_hyp_mappings(pool_start, pool_end, PAGE_HYP);
        if (err)
                return err;

        err = do_pkvm_init(hyp_va_bits);
        if (err)
                return err;

        free_hyp_pgds();

        return 0;
}

/*
 * Allocate, for every possible CPU, the pages that hold the host SVE state
 * in protected mode and record their address in that CPU's kvm_host_data.
 * Does nothing when the system does not support SVE.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int init_pkvm_host_sve_state(void)
{
        struct page *pg;
        int cpu;

        if (!system_supports_sve())
                return 0;

        for_each_possible_cpu(cpu) {
                pg = alloc_pages(GFP_KERNEL, pkvm_host_sve_state_order());
                if (!pg)
                        return -ENOMEM;

                per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state = page_address(pg);
        }

        /*
         * No hyp mapping is created for these pages here: they are only
         * used in protected mode, which (re)creates its own mapping when
         * it is initialized.
         */
        return 0;
}

/*
 * Finalizes the initialization of hyp mode, once everything else is
 * initialized and the initialization process cannot fail.
 *
 * When the system supports SVE and protected KVM is enabled, each possible
 * CPU's sve_state pointer is converted in place with kern_hyp_va().
 */
static void finalize_init_hyp_mode(void)
{
        int cpu;

        if (!system_supports_sve() || !is_protected_kvm_enabled())
                return;

        for_each_possible_cpu(cpu) {
                struct cpu_sve_state *host_va;

                host_va = per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state;
                per_cpu_ptr_nvhe_sym(kvm_host_data, cpu)->sve_state =
                        kern_hyp_va(host_va);
        }
}

static void pkvm_hyp_init_ptrauth(void)
{
        struct kvm_cpu_context *hyp_ctxt;
        int cpu;

        for_each_possible_cpu(cpu) {
                hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu);
                hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long();
                hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long();
        }
}

/*
 * Set up Hyp-mode for the non-VHE case.
 *
 * Allocates the hyp page tables, then a stack and a per-cpu region for
 * every possible CPU, maps the hyp text/data/rodata/bss sections, the
 * stacks and the per-cpu regions into hyp, prepares each CPU's init
 * parameters and publishes the host values the hypervisor needs. With
 * protected KVM enabled it additionally sets up ptrauth keys (when
 * applicable), the logical CPU map, the PSCI relay, the host SVE state and
 * the hyp memory protection.
 *
 * Any failure funnels through out_err, which calls teardown_hyp_mode().
 *
 * Returns 0 on success or a negative error code.
 */
static int __init init_hyp_mode(void)
{
        u32 hyp_va_bits;
        int cpu;
        int err = -ENOMEM;

        /*
         * The protected Hyp-mode cannot be initialized if the memory pool
         * allocation has failed.
         */
        if (is_protected_kvm_enabled() && !hyp_mem_base)
                goto out_err;

        /*
         * Allocate Hyp PGD and setup Hyp identity mapping
         */
        err = kvm_mmu_init(&hyp_va_bits);
        if (err)
                goto out_err;

        /*
         * Allocate stack pages for Hypervisor-mode
         */
        for_each_possible_cpu(cpu) {
                unsigned long stack_base;

                stack_base = __get_free_pages(GFP_KERNEL, NVHE_STACK_SHIFT - PAGE_SHIFT);
                if (!stack_base) {
                        err = -ENOMEM;
                        goto out_err;
                }

                per_cpu(kvm_arm_hyp_stack_base, cpu) = stack_base;
        }

        /*
         * Allocate and initialize pages for Hypervisor-mode percpu regions.
         */
        for_each_possible_cpu(cpu) {
                struct page *page;
                void *page_addr;

                page = alloc_pages(GFP_KERNEL, nvhe_percpu_order());
                if (!page) {
                        err = -ENOMEM;
                        goto out_err;
                }

                /* Seed the region with a copy of the nVHE per-cpu section. */
                page_addr = page_address(page);
                memcpy(page_addr, CHOOSE_NVHE_SYM(__per_cpu_start), nvhe_percpu_size());
                kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu] = (unsigned long)page_addr;
        }

        /*
         * Map the Hyp-code called directly from the host
         */
        err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
                                  kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
        if (err) {
                kvm_err("Cannot map world-switch code\n");
                goto out_err;
        }

        err = create_hyp_mappings(kvm_ksym_ref(__hyp_data_start),
                                  kvm_ksym_ref(__hyp_data_end), PAGE_HYP);
        if (err) {
                kvm_err("Cannot map .hyp.data section\n");
                goto out_err;
        }

        err = create_hyp_mappings(kvm_ksym_ref(__hyp_rodata_start),
                                  kvm_ksym_ref(__hyp_rodata_end), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map .hyp.rodata section\n");
                goto out_err;
        }

        err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
                                  kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map rodata section\n");
                goto out_err;
        }

        /*
         * .hyp.bss is guaranteed to be placed at the beginning of the .bss
         * section thanks to an assertion in the linker script. Map it RW and
         * the rest of .bss RO.
         */
        err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start),
                                  kvm_ksym_ref(__hyp_bss_end), PAGE_HYP);
        if (err) {
                kvm_err("Cannot map hyp bss section: %d\n", err);
                goto out_err;
        }

        err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end),
                                  kvm_ksym_ref(__bss_stop), PAGE_HYP_RO);
        if (err) {
                kvm_err("Cannot map bss section\n");
                goto out_err;
        }

        /*
         * Map the Hyp stack pages
         */
        for_each_possible_cpu(cpu) {
                struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
                char *stack_base = (char *)per_cpu(kvm_arm_hyp_stack_base, cpu);

                err = create_hyp_stack(__pa(stack_base), &params->stack_hyp_va);
                if (err) {
                        kvm_err("Cannot map hyp stack\n");
                        goto out_err;
                }

                /*
                 * Save the stack PA in nvhe_init_params. This will be needed
                 * to recreate the stack mapping in protected nVHE mode.
                 * __hyp_pa() won't do the right thing there, since the stack
                 * has been mapped in the flexible private VA space.
                 */
                params->stack_pa = __pa(stack_base);
        }

        for_each_possible_cpu(cpu) {
                char *percpu_begin = (char *)kvm_nvhe_sym(kvm_arm_hyp_percpu_base)[cpu];
                char *percpu_end = percpu_begin + nvhe_percpu_size();

                /* Map Hyp percpu pages */
                err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP);
                if (err) {
                        kvm_err("Cannot map hyp percpu region\n");
                        goto out_err;
                }

                /* Prepare the CPU initialization parameters */
                cpu_prepare_hyp_mode(cpu, hyp_va_bits);
        }

        kvm_hyp_init_symbols();

        /* Extra setup that only protected KVM needs. */
        if (is_protected_kvm_enabled()) {
                if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) &&
                    cpus_have_final_cap(ARM64_HAS_ADDRESS_AUTH))
                        pkvm_hyp_init_ptrauth();

                init_cpu_logical_map();

                if (!init_psci_relay()) {
                        err = -ENODEV;
                        goto out_err;
                }

                err = init_pkvm_host_sve_state();
                if (err)
                        goto out_err;

                err = kvm_hyp_init_protection(hyp_va_bits);
                if (err) {
                        kvm_err("Failed to init hyp memory protection\n");
                        goto out_err;
                }
        }

        return 0;

out_err:
        /* Single failure path: release everything set up so far. */
        teardown_hyp_mode();
        kvm_err("error initializing Hyp mode: %d\n", err);
        return err;
}

/*
 * Find the vcpu of @kvm whose MPIDR affinity matches @mpidr (masked with
 * MPIDR_HWID_BITMASK).
 *
 * The RCU-protected kvm->arch.mpidr_data table is tried first when it
 * exists; if it is absent or its candidate does not match, every vcpu is
 * scanned in turn.
 *
 * Returns the matching vcpu, or NULL if there is none.
 */
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr)
{
        struct kvm_vcpu *candidate = NULL;
        struct kvm_vcpu *scan;
        struct kvm_mpidr_data *map;
        unsigned long n;

        mpidr &= MPIDR_HWID_BITMASK;

        /* Fast path: table lookup under the RCU read lock. */
        rcu_read_lock();
        map = rcu_dereference(kvm->arch.mpidr_data);
        if (map) {
                u16 slot = kvm_mpidr_index(map, mpidr);

                candidate = kvm_get_vcpu(kvm, map->cmpidr_to_idx[slot]);
                if (kvm_vcpu_get_mpidr_aff(candidate) != mpidr)
                        candidate = NULL;
        }
        rcu_read_unlock();

        if (candidate)
                return candidate;

        /* Slow path: linear walk over all vcpus. */
        kvm_for_each_vcpu(n, scan, kvm) {
                if (kvm_vcpu_get_mpidr_aff(scan) == mpidr)
                        return scan;
        }

        return NULL;
}

/* Arch hook: report irqchip_in_kernel() for @kvm. */
bool kvm_arch_irqchip_in_kernel(struct kvm *kvm)
{
        return irqchip_in_kernel(kvm);
}

/*
 * IRQ bypass hook: a producer was attached to the irqfd that embeds @cons.
 *
 * Only MSI routing entries are forwarded (LPIs are the only thing with a
 * chance of being directly injected; maybe one day...). Any other routing
 * type is accepted without doing anything.
 *
 * Returns 0 for non-MSI routes, otherwise the result of
 * kvm_vgic_v4_set_forwarding().
 */
int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
                                      struct irq_bypass_producer *prod)
{
        struct kvm_kernel_irqfd *fd =
                container_of(cons, struct kvm_kernel_irqfd, consumer);
        struct kvm_kernel_irq_routing_entry *route = &fd->irq_entry;

        if (route->type == KVM_IRQ_ROUTING_MSI)
                return kvm_vgic_v4_set_forwarding(fd->kvm, prod->irq, route);

        return 0;
}
/*
 * IRQ bypass hook: the producer is being detached from the irqfd that
 * embeds @cons. Forwarding is only undone for MSI routing entries; other
 * routing types are ignored.
 */
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
                                      struct irq_bypass_producer *prod)
{
        struct kvm_kernel_irqfd *fd =
                container_of(cons, struct kvm_kernel_irqfd, consumer);
        struct kvm_kernel_irq_routing_entry *route = &fd->irq_entry;

        if (route->type == KVM_IRQ_ROUTING_MSI)
                kvm_vgic_v4_unset_forwarding(fd->kvm, prod->irq, route);
}

/* IRQ bypass hook: halt the guest that owns the irqfd embedding @cons. */
void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
{
        struct kvm_kernel_irqfd *fd;

        fd = container_of(cons, struct kvm_kernel_irqfd, consumer);
        kvm_arm_halt_guest(fd->kvm);
}

/* IRQ bypass hook: resume the guest that owns the irqfd embedding @cons. */
void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
{
        struct kvm_kernel_irqfd *fd;

        fd = container_of(cons, struct kvm_kernel_irqfd, consumer);
        kvm_arm_resume_guest(fd->kvm);
}

/*
 * Initialize Hyp-mode and memory mappings on all CPUs.
 *
 * Module entry point. Bails out early with -ENODEV when HYP mode is not
 * available or KVM was disabled on the command line. Otherwise it sets up
 * the system register tables, the IPA limit, SVE, the VMID allocator, hyp
 * mode (only when the kernel is not itself running in hyp mode), the
 * vector slots and the subsystems, and finally registers with the generic
 * KVM core through kvm_init().
 *
 * Failures after the VMID allocator has been set up unwind through the
 * out_subs/out_hyp/out_err labels in reverse order of setup.
 *
 * Returns 0 on success or a negative error code.
 */
static __init int kvm_arm_init(void)
{
        int err;
        bool in_hyp_mode;

        if (!is_hyp_mode_available()) {
                kvm_info("HYP mode not available\n");
                return -ENODEV;
        }

        if (kvm_get_mode() == KVM_MODE_NONE) {
                kvm_info("KVM disabled from command line\n");
                return -ENODEV;
        }

        err = kvm_sys_reg_table_init();
        if (err) {
                /* Terminate the message so it is logged as its own line. */
                kvm_info("Error initializing system register tables\n");
                return err;
        }

        in_hyp_mode = is_kernel_in_hyp_mode();

        if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
            cpus_have_final_cap(ARM64_WORKAROUND_1508412))
                kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \
                         "Only trusted guests should be used on this system.\n");

        err = kvm_set_ipa_limit();
        if (err)
                return err;

        err = kvm_arm_init_sve();
        if (err)
                return err;

        err = kvm_arm_vmid_alloc_init();
        if (err) {
                kvm_err("Failed to initialize VMID allocator.\n");
                return err;
        }

        /* init_hyp_mode() cleans up after itself on failure. */
        if (!in_hyp_mode) {
                err = init_hyp_mode();
                if (err)
                        goto out_err;
        }

        err = kvm_init_vector_slots();
        if (err) {
                kvm_err("Cannot initialise vector slots\n");
                goto out_hyp;
        }

        err = init_subsystems();
        if (err)
                goto out_hyp;

        kvm_info("%s%sVHE%s mode initialized successfully\n",
                 in_hyp_mode ? "" : (is_protected_kvm_enabled() ?
                                     "Protected " : "Hyp "),
                 in_hyp_mode ? "" : (cpus_have_final_cap(ARM64_KVM_HVHE) ?
                                     "h" : "n"),
                 cpus_have_final_cap(ARM64_HAS_NESTED_VIRT) ? "+NV2": "");

        /*
         * FIXME: Do something reasonable if kvm_init() fails after pKVM
         * hypervisor protection is finalized.
         */
        err = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE);
        if (err)
                goto out_subs;

        /*
         * This should be called after initialization is done and failure isn't
         * possible anymore.
         */
        if (!in_hyp_mode)
                finalize_init_hyp_mode();

        kvm_arm_initialised = true;

        return 0;

out_subs:
        teardown_subsystems();
out_hyp:
        if (!in_hyp_mode)
                teardown_hyp_mode();
out_err:
        kvm_arm_vmid_alloc_free();
        return err;
}

/*
 * Parse the "kvm-arm.mode" early parameter and set kvm_mode accordingly.
 *
 * Accepted values are "none", "protected", "nvhe" and "nested". When HYP
 * mode is not available, anything other than "none" is ignored with a
 * one-time warning. "protected" on a kernel running in hyp mode only
 * warns. "nvhe" on a kernel running in hyp mode, and "nested" on a kernel
 * not running in hyp mode, trigger a WARN and are rejected.
 *
 * Returns 0 when the argument was handled, -EINVAL otherwise.
 */
static int __init early_kvm_mode_cfg(char *arg)
{
        if (!arg)
                return -EINVAL;

        if (!strcmp(arg, "none")) {
                kvm_mode = KVM_MODE_NONE;
                return 0;
        }

        if (!is_hyp_mode_available()) {
                pr_warn_once("KVM is not available. Ignoring kvm-arm.mode\n");
                return 0;
        }

        if (!strcmp(arg, "protected")) {
                if (is_kernel_in_hyp_mode())
                        pr_warn_once("Protected KVM not available with VHE\n");
                else
                        kvm_mode = KVM_MODE_PROTECTED;

                return 0;
        }

        if (!strcmp(arg, "nvhe")) {
                if (WARN_ON(is_kernel_in_hyp_mode()))
                        return -EINVAL;

                kvm_mode = KVM_MODE_DEFAULT;
                return 0;
        }

        if (!strcmp(arg, "nested")) {
                if (WARN_ON(!is_kernel_in_hyp_mode()))
                        return -EINVAL;

                kvm_mode = KVM_MODE_NV;
                return 0;
        }

        return -EINVAL;
}
early_param("kvm-arm.mode", early_kvm_mode_cfg);

/*
 * Shared parser for the WFI/WFE trap policy early parameters.
 *
 * @arg: parameter value; "trap" or "notrap".
 * @p:   policy variable to update.
 *
 * Returns 0 after storing the policy in *@p, or -EINVAL (leaving *@p
 * untouched) when @arg is NULL or not recognised.
 */
static int __init early_kvm_wfx_trap_policy_cfg(char *arg, enum kvm_wfx_trap_policy *p)
{
        if (!arg)
                return -EINVAL;

        if (!strcmp(arg, "trap"))
                *p = KVM_WFX_TRAP;
        else if (!strcmp(arg, "notrap"))
                *p = KVM_WFX_NOTRAP;
        else
                return -EINVAL;

        return 0;
}

/* "kvm-arm.wfi_trap_policy" early parameter: updates kvm_wfi_trap_policy. */
static int __init early_kvm_wfi_trap_policy_cfg(char *arg)
{
        enum kvm_wfx_trap_policy *policy = &kvm_wfi_trap_policy;

        return early_kvm_wfx_trap_policy_cfg(arg, policy);
}
early_param("kvm-arm.wfi_trap_policy", early_kvm_wfi_trap_policy_cfg);

/* "kvm-arm.wfe_trap_policy" early parameter: updates kvm_wfe_trap_policy. */
static int __init early_kvm_wfe_trap_policy_cfg(char *arg)
{
        enum kvm_wfx_trap_policy *policy = &kvm_wfe_trap_policy;

        return early_kvm_wfx_trap_policy_cfg(arg, policy);
}
early_param("kvm-arm.wfe_trap_policy", early_kvm_wfe_trap_policy_cfg);

/*
 * Return the current KVM mode (kvm_mode), as set by the "kvm-arm.mode"
 * early parameter handler in this file.
 */
enum kvm_mode kvm_get_mode(void)
{
        return kvm_mode;
}

/* kvm_arm_init() is the module's init entry point. */
module_init(kvm_arm_init);











































































































































































































  672 














  672 






























  189 













  189 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Credentials management - see Documentation/security/credentials.rst
 *
 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_CRED_H
#define _LINUX_CRED_H

#include <linux/capability.h>
#include <linux/init.h>
#include <linux/key.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/uidgid.h>
#include <linux/sched.h>
#include <linux/sched/user.h>

struct cred;
struct inode;

/*
 * COW Supplementary groups list
 *
 * Reference-counted through @usage: get_group_info() takes a reference and
 * put_group_info() drops one, calling groups_free() when the count hits zero.
 */
struct group_info {
        /* reference count, manipulated with the refcount_*() helpers */
        refcount_t        usage;
        /* NOTE(review): presumably the number of valid entries in gid[] - confirm */
        int                ngroups;
        /* flexible array member holding the group IDs; must stay last */
        kgid_t                gid[];
} __randomize_layout;

/**
 * get_group_info - Get a reference to a group info structure
 * @gi: The group info to reference
 *
 * This gets a reference to a set of supplementary groups by incrementing
 * @gi->usage.  @gi is dereferenced unconditionally, so it must not be NULL.
 *
 * If the caller is accessing a task's credentials, they must hold the RCU read
 * lock when reading.
 *
 * Return: @gi, so the call can be used directly in an expression.
 */
static inline struct group_info *get_group_info(struct group_info *gi)
{
        refcount_inc(&gi->usage);
        return gi;
}

/**
 * put_group_info - Release a reference to a group info structure
 * @gi: The group info to release
 *
 * Drops one reference on @gi->usage and calls groups_free() once the count
 * reaches zero.  @gi is dereferenced unconditionally, so it must not be NULL.
 *
 * The argument is captured in a local first so that it is evaluated exactly
 * once, even if the caller passes an expression with side effects.
 */
#define put_group_info(gi)                                \
do {                                                        \
        struct group_info *__gi = (gi);                        \
        if (refcount_dec_and_test(&__gi->usage))        \
                groups_free(__gi);                        \
} while (0)

/*
 * Supplementary group helpers.  With CONFIG_MULTIUSER they are provided out
 * of line; without it, the inline stubs below are used instead.
 */
#ifdef CONFIG_MULTIUSER
extern struct group_info *groups_alloc(int);
extern void groups_free(struct group_info *);

extern int in_group_p(kgid_t);
extern int in_egroup_p(kgid_t);
extern int groups_search(const struct group_info *, kgid_t);

extern int set_current_groups(struct group_info *);
extern void set_groups(struct cred *, struct group_info *);
extern bool may_setgroups(void);
extern void groups_sort(struct group_info *);
#else
/*
 * !CONFIG_MULTIUSER: only the helpers below get stubs here; groups_alloc(),
 * set_current_groups(), set_groups(), may_setgroups() and groups_sort() are
 * not declared in this branch.
 */

/* Stub: nothing to free. */
static inline void groups_free(struct group_info *group_info)
{
}

/* Stub: every membership query answers "yes" (1), whatever @grp is. */
static inline int in_group_p(kgid_t grp)
{
        return 1;
}
/* Stub: always 1, see in_group_p() above. */
static inline int in_egroup_p(kgid_t grp)
{
        return 1;
}
/* Stub: always 1; @group_info is never inspected. */
static inline int groups_search(const struct group_info *group_info, kgid_t grp)
{
        return 1;
}
#endif

/*
 * The security context of a task
 *
 * The parts of the context break down into two categories:
 *
 *  (1) The objective context of a task.  These parts are used when some other
 *        task is attempting to affect this one.
 *
 *  (2) The subjective context.  These details are used when the task is acting
 *        upon another object, be that a file, a task, a key or whatever.
 *
 * Note that some members of this structure belong to both categories - the
 * LSM security pointer for instance.
 *
 * A task has two security pointers.  task->real_cred points to the objective
 * context that defines that task's actual details.  The objective part of this
 * context is used whenever that task is acted upon.
 *
 * task->cred points to the subjective context that defines the details of how
 * that task is going to act upon another object.  This may be overridden
 * temporarily to point to another security context, but normally points to the
 * same context as task->real_cred.
 */
struct cred {
        /* reference count; see get_cred_many(), get_cred_rcu(), put_cred_many() */
        atomic_long_t        usage;
        kuid_t                uid;                /* real UID of the task */
        kgid_t                gid;                /* real GID of the task */
        kuid_t                suid;                /* saved UID of the task */
        kgid_t                sgid;                /* saved GID of the task */
        kuid_t                euid;                /* effective UID of the task */
        kgid_t                egid;                /* effective GID of the task */
        kuid_t                fsuid;                /* UID for VFS ops */
        kgid_t                fsgid;                /* GID for VFS ops */
        unsigned        securebits;        /* SUID-less security management */
        kernel_cap_t        cap_inheritable; /* caps our children can inherit */
        kernel_cap_t        cap_permitted;        /* caps we're permitted */
        kernel_cap_t        cap_effective;        /* caps we can actually use */
        kernel_cap_t        cap_bset;        /* capability bounding set */
        kernel_cap_t        cap_ambient;        /* Ambient capability set */
#ifdef CONFIG_KEYS
        unsigned char        jit_keyring;        /* default keyring to attach requested
                                         * keys to */
        struct key        *session_keyring; /* keyring inherited over fork */
        struct key        *process_keyring; /* keyring private to this process */
        struct key        *thread_keyring; /* keyring private to this thread */
        struct key        *request_key_auth; /* assumed request_key authority */
#endif
#ifdef CONFIG_SECURITY
        void                *security;        /* LSM security */
#endif
        struct user_struct *user;        /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
        /* NOTE(review): undocumented here; presumably per-user resource counters - confirm */
        struct ucounts *ucounts;
        struct group_info *group_info;        /* supplementary groups for euid/fsgid */
        /*
         * RCU deletion.  non_rcu is reset to 0 by get_cred_many() and
         * get_cred_rcu() every time a reference is taken.
         */
        union {
                int non_rcu;                        /* Can we skip RCU deletion? */
                struct rcu_head        rcu;                /* RCU deletion hook */
        };
} __randomize_layout;

extern void __put_cred(struct cred *);
extern void exit_creds(struct task_struct *);
extern int copy_creds(struct task_struct *, unsigned long);
extern const struct cred *get_task_cred(struct task_struct *);
extern struct cred *cred_alloc_blank(void);
extern struct cred *prepare_creds(void);
extern struct cred *prepare_exec_creds(void);
extern int commit_creds(struct cred *);
extern void abort_creds(struct cred *);
extern struct cred *prepare_kernel_cred(struct task_struct *);
extern int set_security_override(struct cred *, u32);
extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);

static inline bool cap_ambient_invariant_ok(const struct cred *cred)
{
        return cap_issubset(cred->cap_ambient,
                            cap_intersect(cred->cap_permitted,
                                          cred->cap_inheritable));
}

/*
 * override_creds - Install a replacement for current->cred
 * @override_cred: The credentials to install
 *
 * Swaps @override_cred into current->cred with rcu_replace_pointer() (the
 * constant 1 is the always-true protection condition) and hands back the
 * pointer that was there before.  No reference counts are touched here.
 */
static inline const struct cred *override_creds(const struct cred *override_cred)
{
        return rcu_replace_pointer(current->cred, override_cred, 1);
}

/*
 * revert_creds - Put a previous set of credentials back in current->cred
 * @revert_cred: The credentials to reinstate
 *
 * Swaps @revert_cred into current->cred with rcu_replace_pointer() and
 * returns the pointer it replaced.  The body is the same operation as
 * override_creds(); no reference counts are touched here.
 */
static inline const struct cred *revert_creds(const struct cred *revert_cred)
{
        return rcu_replace_pointer(current->cred, revert_cred, 1);
}

/**
 * get_cred_many - Get references on a set of credentials
 * @cred: The credentials to reference
 * @nr: Number of references to acquire
 *
 * Take @nr references on @cred.  The caller is responsible for releasing
 * every reference it acquires.  A %NULL @cred is passed straight back and
 * nothing else is done.
 *
 * This is meant for credentials that have already been committed.  The
 * pointer is const so that accidental modification of such (logically
 * immutable) credentials is caught at compile time; the const is cast away
 * here only to bump the usage count and to clear the non_rcu flag.
 *
 * Return: @cred.
 */
static inline const struct cred *get_cred_many(const struct cred *cred, int nr)
{
        if (cred) {
                struct cred *writable = (struct cred *) cred;

                writable->non_rcu = 0;
                atomic_long_add(nr, &writable->usage);
        }

        return cred;
}

/*
 * get_cred - Get a reference on a set of credentials
 * @cred: The credentials to reference
 *
 * Get a reference on the specified set of credentials.  The caller must
 * release the reference.  If %NULL is passed, it is returned with no action.
 *
 * This is used to deal with a committed set of credentials.
 *
 * Implemented as get_cred_many() with a count of one; returns @cred.
 */
static inline const struct cred *get_cred(const struct cred *cred)
{
        return get_cred_many(cred, 1);
}

/*
 * get_cred_rcu - Try to get a reference on a set of credentials
 * @cred: The credentials to reference (may be NULL)
 *
 * The usage count is only incremented if it is not already zero.  On
 * success the non_rcu flag is cleared and @cred is returned; NULL is
 * returned if @cred is NULL or its usage count was zero.
 */
static inline const struct cred *get_cred_rcu(const struct cred *cred)
{
        struct cred *writable = (struct cred *) cred;

        /* Short-circuit keeps us from touching ->usage of a NULL cred. */
        if (!cred || !atomic_long_inc_not_zero(&writable->usage))
                return NULL;

        writable->non_rcu = 0;
        return cred;
}

/**
 * put_cred_many - Release references to a set of credentials
 * @_cred: The credentials to release
 * @nr: Number of references to release
 *
 * Drop @nr references on @_cred; when that takes the usage count to zero
 * the credentials are handed to __put_cred().  Passing %NULL does nothing.
 *
 * The pointer is const because the credentials on task_struct are attached
 * by const pointers to prevent accidental alteration of otherwise immutable
 * credential sets; the const is cast away here to update the count.
 */
static inline void put_cred_many(const struct cred *_cred, int nr)
{
        struct cred *writable = (struct cred *) _cred;

        if (!writable)
                return;

        if (atomic_long_sub_and_test(nr, &writable->usage))
                __put_cred(writable);
}

/*
 * put_cred - Release a reference to a set of credentials
 * @cred: The credentials to release
 *
 * Release a reference to a set of credentials, deleting them when the last ref
 * is released.  If %NULL is passed, nothing is done.
 *
 * Implemented as put_cred_many() with a count of one.
 */
static inline void put_cred(const struct cred *cred)
{
        put_cred_many(cred, 1);
}

/**
 * current_cred - Access the current task's subjective credentials
 *
 * Access the subjective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to rcu_dereference_protected() on current->cred with a constant
 * true condition.  No reference is taken on the result.
 */
#define current_cred() \
        rcu_dereference_protected(current->cred, 1)

/**
 * current_real_cred - Access the current task's objective credentials
 *
 * Access the objective credentials of the current task.  RCU-safe,
 * since nobody else can modify it.
 *
 * Expands to rcu_dereference_protected() on current->real_cred with a
 * constant true condition.  No reference is taken on the result.
 */
#define current_real_cred() \
        rcu_dereference_protected(current->real_cred, 1)

/**
 * __task_cred - Access a task's objective credentials
 * @task: The task to query
 *
 * Access the objective credentials of a task.  The caller must hold the RCU
 * readlock.
 *
 * The result of this function should not be passed directly to get_cred();
 * rather get_task_cred() should be used instead.
 *
 * Expands to rcu_dereference() on @task->real_cred; no reference is taken.
 */
#define __task_cred(task)        \
        rcu_dereference((task)->real_cred)

/**
 * get_current_cred - Get the current task's subjective credentials
 *
 * Get the subjective credentials of the current task, pinning them so that
 * they can't go away.  Accessing the current task's credentials directly is
 * not permitted.
 *
 * Expands to get_cred(current_cred()), so the result carries one reference
 * that the caller must release again.
 */
#define get_current_cred()                                \
        (get_cred(current_cred()))

/**
 * get_current_user - Get the current task's user_struct
 *
 * Look up the user record attached to the current task's subjective
 * credentials and pin it with get_uid() so that it can't go away.
 * Evaluates to whatever get_uid() returns for that record.
 */
#define get_current_user()                                \
        (get_uid(current_cred()->user))

/**
 * get_current_groups - Get the current task's supplementary group list
 *
 * Look up the supplementary group list attached to the current task's
 * subjective credentials and pin it with get_group_info() so that it can't
 * go away.  Evaluates to the pinned group list.
 */
#define get_current_groups()                                \
        (get_group_info(current_cred()->group_info))

/*
 * task_cred_xxx - Read one member of a task's objective credentials
 * @task: The task to query
 * @xxx: Name of the struct cred member to read
 *
 * Takes the RCU read lock itself, copies the member out of __task_cred(@task)
 * into a temporary of the member's type, drops the lock and evaluates to the
 * copy.  Only the value is copied: if the member is a pointer, nothing here
 * takes a reference on what it points to, and the RCU read lock has already
 * been released by the time the caller sees the result.
 */
#define task_cred_xxx(task, xxx)                        \
({                                                        \
        __typeof__(((struct cred *)NULL)->xxx) ___val;        \
        rcu_read_lock();                                \
        ___val = __task_cred((task))->xxx;                \
        rcu_read_unlock();                                \
        ___val;                                                \
})

/* Convenience accessors built on task_cred_xxx() for individual members. */
#define task_uid(task)                (task_cred_xxx((task), uid))
#define task_euid(task)                (task_cred_xxx((task), euid))
#define task_ucounts(task)        (task_cred_xxx((task), ucounts))

/*
 * current_cred_xxx - Read one member of the current task's subjective creds
 * @xxx: Name of the struct cred member to read
 *
 * Evaluates to current_cred()->xxx.  No locking is done and no reference is
 * taken here.
 */
#define current_cred_xxx(xxx)                        \
({                                                \
        current_cred()->xxx;                        \
})

/* Convenience accessors built on current_cred_xxx() for individual members. */
#define current_uid()                (current_cred_xxx(uid))
#define current_gid()                (current_cred_xxx(gid))
#define current_euid()                (current_cred_xxx(euid))
#define current_egid()                (current_cred_xxx(egid))
#define current_suid()                (current_cred_xxx(suid))
#define current_sgid()                (current_cred_xxx(sgid))
#define current_fsuid()         (current_cred_xxx(fsuid))
#define current_fsgid()         (current_cred_xxx(fsgid))
#define current_cap()                (current_cred_xxx(cap_effective))
#define current_user()                (current_cred_xxx(user))
#define current_ucounts()        (current_cred_xxx(ucounts))

extern struct user_namespace init_user_ns;
/*
 * current_user_ns - The user namespace of the current task
 *
 * With CONFIG_USER_NS this reads user_ns from the current task's subjective
 * credentials.  Without it, the answer is always &init_user_ns and the
 * credentials are not consulted at all.
 */
#ifdef CONFIG_USER_NS
#define current_user_ns()        (current_cred_xxx(user_ns))
#else
static inline struct user_namespace *current_user_ns(void)
{
        return &init_user_ns;
}
#endif


/*
 * Fetch the real UID and GID of the current task from a single
 * current_cred() lookup and store them through the @_uid and @_gid pointers.
 */
#define current_uid_gid(_uid, _gid)                \
do {                                                \
        const struct cred *__cred = current_cred();        \
        *(_uid) = __cred->uid;                        \
        *(_gid) = __cred->gid;                        \
} while (0)

/*
 * Fetch the effective UID and GID of the current task from a single
 * current_cred() lookup and store them through the @_euid and @_egid
 * pointers.
 */
#define current_euid_egid(_euid, _egid)                \
do {                                                \
        const struct cred *__cred = current_cred();        \
        *(_euid) = __cred->euid;                \
        *(_egid) = __cred->egid;                \
} while (0)

/*
 * Fetch the filesystem UID and GID of the current task from a single
 * current_cred() lookup and store them through the @_fsuid and @_fsgid
 * pointers.
 */
#define current_fsuid_fsgid(_fsuid, _fsgid)        \
do {                                                \
        const struct cred *__cred = current_cred();        \
        *(_fsuid) = __cred->fsuid;                \
        *(_fsgid) = __cred->fsgid;                \
} while (0)

#endif /* _LINUX_CRED_H */

































































  200 





























    2 











































































  136 











  725 



    7 



  725 

  384 
  134 

  661 
   10 










  664 


  664 


























  724 


  402 




  662 



























  229 



























  146 

  146 

























   91 



   91 





















  114 
    4 





















   90 





   89 



   39 































  211 













   40 

   66 




   27 











    5 

   14 
   14 







































   19 


   17 





   17 








    4 


    3 
   14 




















































  211 




   50 


   40 
   37 




   68 



   40 




   40 











   40 
    2 











   10 


   38 

   40 






















  228 








  209 

  211 

  211 


  211 

  211 







   82 







  230 


   62 


   61 



   66 



   91 





























































   27 






  118 
  111 
   19 
















  252 









  230 


   61 





  253 

  252 





  184 

  118 


  228 
   60 










  251 

  253 




  112 




   19 


   19 

   19 






  118 
































    7 






    7 






    7 














   78 



   13 


   66 
   19 








   66 
















   46 




    7 

   74 






















































































































































































































































































































































































































































  379 




  139 



   44 



  379 

  100 






  117 

   36 




   38 

   24 



   37 

































    7 














    7 




    7 





































































  110 










  108 

   13 

   24 

   24 



    9 




   24 

































  636 



  637 



  636 







   91 

















    5 


















    6 


























   27 










   27 



   27 





















































   27 
   27 
















   91 




   91 



















   69 





   69 





















































































































































































































































































































































































  163 



  164 




  164 

  164 

   46 
  153 



































  145 






  145 




  145 

  140 

   59 

   59 


  145 

  140 
   59 






















































































































   23 






   24 

   24 
   24 


   24 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
// SPDX-License-Identifier: GPL-2.0+
/*
 * XArray implementation
 * Copyright (c) 2017-2018 Microsoft Corporation
 * Copyright (c) 2018-2020 Oracle
 * Author: Matthew Wilcox <willy@infradead.org>
 */

#include <linux/bitmap.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/xarray.h>

#include "radix-tree.h"

/*
 * Coding conventions in this file:
 *
 * @xa is used to refer to the entire xarray.
 * @xas is the 'xarray operation state'.  It may be either a pointer to
 * an xa_state, or an xa_state stored on the stack.  This is an unfortunate
 * ambiguity.
 * @index is the index of the entry being operated on
 * @mark is an xa_mark_t; a small number indicating one of the mark bits.
 * @node refers to an xa_node; usually the primary one being operated on by
 * this function.
 * @offset is the index into the slots array inside an xa_node.
 * @parent refers to the @xa_node closer to the head than @node.
 * @entry refers to something stored in a slot in the xarray
 */

/*
 * Extract the lock-type field from @xa: the two lowest bits of xa->xa_flags
 * (the cast applies to xa_flags before the "& 3" mask).
 * NOTE(review): presumably the value that xas_lock_type()/xas_unlock_type()
 * compare against XA_LOCK_IRQ and XA_LOCK_BH - confirm against callers.
 */
static inline unsigned int xa_lock_type(const struct xarray *xa)
{
        return (__force unsigned int)xa->xa_flags & 3;
}

/*
 * Take the array lock for @xas using the variant selected by @lock_type:
 * XA_LOCK_IRQ uses xas_lock_irq(), XA_LOCK_BH uses xas_lock_bh(), and any
 * other value uses plain xas_lock().
 */
static inline void xas_lock_type(struct xa_state *xas, unsigned int lock_type)
{
        switch (lock_type) {
        case XA_LOCK_IRQ:
                xas_lock_irq(xas);
                break;
        case XA_LOCK_BH:
                xas_lock_bh(xas);
                break;
        default:
                xas_lock(xas);
                break;
        }
}

/*
 * Release the array lock for @xas using the variant selected by @lock_type:
 * XA_LOCK_IRQ uses xas_unlock_irq(), XA_LOCK_BH uses xas_unlock_bh(), and
 * any other value uses plain xas_unlock().
 */
static inline void xas_unlock_type(struct xa_state *xas, unsigned int lock_type)
{
        switch (lock_type) {
        case XA_LOCK_IRQ:
                xas_unlock_irq(xas);
                break;
        case XA_LOCK_BH:
                xas_unlock_bh(xas);
                break;
        default:
                xas_unlock(xas);
                break;
        }
}

/* True if XA_FLAGS_TRACK_FREE is set in @xa's flags. */
static inline bool xa_track_free(const struct xarray *xa)
{
        return xa->xa_flags & XA_FLAGS_TRACK_FREE;
}

/* True if XA_FLAGS_ZERO_BUSY is set in @xa's flags. */
static inline bool xa_zero_busy(const struct xarray *xa)
{
        return xa->xa_flags & XA_FLAGS_ZERO_BUSY;
}

/*
 * Set the array-level flag for @mark in xa->xa_flags.  The flag is tested
 * first, so xa_flags is only written when the bit actually changes
 * (presumably to avoid needless stores to a shared word - confirm).
 */
static inline void xa_mark_set(struct xarray *xa, xa_mark_t mark)
{
        if (!(xa->xa_flags & XA_FLAGS_MARK(mark)))
                xa->xa_flags |= XA_FLAGS_MARK(mark);
}

/*
 * Clear the array-level flag for @mark in xa->xa_flags.  The flag is tested
 * first, so xa_flags is only written when the bit actually changes
 * (presumably to avoid needless stores to a shared word - confirm).
 */
static inline void xa_mark_clear(struct xarray *xa, xa_mark_t mark)
{
        if (xa->xa_flags & XA_FLAGS_MARK(mark))
                xa->xa_flags &= ~(XA_FLAGS_MARK(mark));
}

/*
 * Return the bitmap that records @mark for the slots of @node, i.e.
 * node->marks[] indexed by the numeric value of @mark.
 */
static inline unsigned long *node_marks(struct xa_node *node, xa_mark_t mark)
{
        return node->marks[(__force unsigned)mark];
}

/* Report whether @mark is set for slot @offset of @node. */
static inline bool node_get_mark(struct xa_node *node,
                unsigned int offset, xa_mark_t mark)
{
        unsigned long *bitmap = node_marks(node, mark);

        return test_bit(offset, bitmap);
}

/*
 * Set @mark for slot @offset of @node (non-atomic bit operation).
 * Returns true if the bit was already set beforehand.
 */
static inline bool node_set_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bitmap = node_marks(node, mark);

        return __test_and_set_bit(offset, bitmap);
}

/*
 * Clear @mark for slot @offset of @node (non-atomic bit operation).
 * Returns true if the bit was set beforehand.
 */
static inline bool node_clear_mark(struct xa_node *node, unsigned int offset,
                                xa_mark_t mark)
{
        unsigned long *bitmap = node_marks(node, mark);

        return __test_and_clear_bit(offset, bitmap);
}

/* Returns true if any of the XA_CHUNK_SIZE slots of @node has @mark set. */
static inline bool node_any_mark(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        if (bitmap_empty(bits, XA_CHUNK_SIZE))
                return false;
        return true;
}

/* Set @mark on all XA_CHUNK_SIZE slots of @node. */
static inline void node_mark_all(struct xa_node *node, xa_mark_t mark)
{
        unsigned long *bits = node_marks(node, mark);

        bitmap_fill(bits, XA_CHUNK_SIZE);
}

/*
 * Step @mark to the next mark number: cast it to unsigned, add one and
 * cast the result back to xa_mark_t.  @mark must be an lvalue and is
 * evaluated twice.  The expansion is a statement, not an expression.
 */
#define mark_inc(mark) do { \
        mark = (__force xa_mark_t)((__force unsigned)(mark) + 1); \
} while (0)

/*
 * xas_squash_marks() - Fold the marks of sibling slots into the first slot
 * @xas: Array operation state.
 *
 * For every mark, if any of the @xas->xa_sibs slots following
 * @xas->xa_offset has the mark set, set it on the slot at @xas->xa_offset
 * and clear it on all of those following slots.
 */
static void xas_squash_marks(const struct xa_state *xas)
{
        unsigned int first = xas->xa_offset;
        unsigned int end = first + xas->xa_sibs + 1;
        xa_mark_t mark = 0;

        while (1) {
                unsigned long *bits = node_marks(xas->xa_node, mark);
                unsigned long found = find_next_bit(bits, end, first + 1);

                if (found != end) {
                        __set_bit(first, bits);
                        bitmap_clear(bits, first + 1, xas->xa_sibs);
                }

                if (mark == XA_MARK_MAX)
                        return;
                mark_inc(mark);
        }
}

/*
 * Returns the slot offset inside @node that @index maps to: the index
 * shifted right by the node's shift, masked with XA_CHUNK_MASK.
 */
static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
        unsigned long shifted = index >> node->shift;

        return shifted & XA_CHUNK_MASK;
}

/* Recompute @xas->xa_offset from @xas->xa_index and the current node. */
static void xas_set_offset(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned int offset = get_offset(xas->xa_index, node);

        xas->xa_offset = offset;
}

/*
 * Reposition the index at slot @offset of the current node: clear the
 * index bits selected by XA_CHUNK_MASK at the node's shift and everything
 * below them, then add @offset at that shift.  Used to move the index
 * either forwards (find) or backwards (sibling slot).
 */
static void xas_move_index(struct xa_state *xas, unsigned long offset)
{
        unsigned int shift = xas->xa_node->shift;
        unsigned long index = xas->xa_index;

        index &= ~XA_CHUNK_MASK << shift;
        index += offset << shift;
        xas->xa_index = index;
}

/* Advance to the next slot of the current node and move the index to match. */
static void xas_next_offset(struct xa_state *xas)
{
        xas_move_index(xas, ++xas->xa_offset);
}

/*
 * Put @xas into the XAS_BOUNDS state.  Always returns NULL so callers
 * can write "return set_bounds(xas);".
 */
static void *set_bounds(struct xa_state *xas)
{
        void *nothing = NULL;

        xas->xa_node = XAS_BOUNDS;
        return nothing;
}

/*
 * Starts a walk.  If the @xas is already valid, we assume that it's on
 * the right path and just return where we've got to.  If we're in an
 * error state, return NULL.  If the index is outside the current scope
 * of the xarray, return NULL without changing @xas->xa_node.  Otherwise
 * set @xas->xa_node to NULL and return the current head of the array.
 */
static void *xas_start(struct xa_state *xas)
{
        void *entry;

        if (xas_valid(xas))
                return xas_reload(xas);
        if (xas_error(xas))
                return NULL;

        entry = xa_head(xas->xa);
        if (!xa_is_node(entry)) {
                if (xas->xa_index)
                        return set_bounds(xas);
        } else {
                if ((xas->xa_index >> xa_to_node(entry)->shift) > XA_CHUNK_MASK)
                        return set_bounds(xas);
        }

        xas->xa_node = NULL;
        return entry;
}

/*
 * Descend one level into @node: look up the slot selected by
 * @xas->xa_index, follow sibling entries to the slot they refer to, and
 * record @node and the final slot offset in @xas.
 *
 * Returns the entry found in that slot.
 */
static __always_inline void *xas_descend(struct xa_state *xas,
                                        struct xa_node *node)
{
        unsigned int offset = get_offset(xas->xa_index, node);
        void *entry = xa_entry(xas->xa, node, offset);

        xas->xa_node = node;
        while (xa_is_sibling(entry)) {
                /* A sibling entry encodes the offset of the slot it refers to */
                offset = xa_to_sibling(entry);
                entry = xa_entry(xas->xa, node, offset);
                /*
                 * In a non-leaf node, a node pointer found behind a sibling
                 * entry is reported to the caller as a retry entry.
                 */
                if (node->shift && xa_is_node(entry))
                        entry = XA_RETRY_ENTRY;
        }

        xas->xa_offset = offset;
        return entry;
}

/**
 * xas_load() - Load an entry from the XArray (advanced).
 * @xas: XArray operation state.
 *
 * Usually walks the @xas to the appropriate state to load the entry
 * stored at xa_index.  However, it will do nothing and return %NULL if
 * @xas is in an error state.  xas_load() will never expand the tree.
 *
 * The walk stops early at a node whose shift is smaller than
 * @xas->xa_shift, so if the xa_state is set up to operate on a
 * multi-index entry, xas_load() may return %NULL or an internal entry,
 * even if there are entries present within the range specified by @xas.
 *
 * Context: Any context.  The caller should hold the xa_lock or the RCU lock.
 * Return: Usually an entry in the XArray, but see description for exceptions.
 */
void *xas_load(struct xa_state *xas)
{
        void *entry;

        for (entry = xas_start(xas); xa_is_node(entry); ) {
                struct xa_node *node = xa_to_node(entry);

                /* Do not descend below the level @xas operates on */
                if (xas->xa_shift > node->shift)
                        return entry;

                entry = xas_descend(xas, node);

                /* Slots of a shift-0 node are not descended into */
                if (!node->shift)
                        return entry;
        }

        return entry;
}
EXPORT_SYMBOL_GPL(xas_load);

/* Value stored in node->array by xa_node_free() before the node goes to call_rcu() */
#define XA_RCU_FREE        ((struct xarray *)1)

/*
 * Queue @node to be freed by radix_tree_node_rcu_free() via call_rcu().
 * The node's private_list must be empty; node->array is set to
 * XA_RCU_FREE before the node is queued.
 */
static void xa_node_free(struct xa_node *node)
{
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        node->array = XA_RCU_FREE;
        call_rcu(&node->rcu_head, radix_tree_node_rcu_free);
}

/*
 * xas_destroy() - Free any resources allocated during the XArray operation.
 * @xas: XArray operation state.
 *
 * Frees every node on the @xas->xa_alloc list (nodes are chained through
 * their parent pointer) and leaves @xas->xa_alloc set to NULL.
 *
 * Most users will not need to call this function; it is called for you
 * by xas_nomem().
 */
void xas_destroy(struct xa_state *xas)
{
        struct xa_node *node;

        for (node = xas->xa_alloc; node; node = xas->xa_alloc) {
                struct xa_node *next;

                XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
                /* Fetch the link before the node is freed */
                next = rcu_dereference_raw(node->parent);
                radix_tree_node_rcu_free(&node->rcu_head);
                xas->xa_alloc = next;
        }
}
EXPORT_SYMBOL_GPL(xas_destroy);

/**
 * xas_nomem() - Allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * If we need to add new nodes to the XArray, we try to allocate memory
 * with GFP_NOWAIT while holding the lock, which will usually succeed.
 * If it fails, @xas is flagged as needing memory to continue.  The caller
 * should drop the lock and call xas_nomem().  If xas_nomem() succeeds,
 * the caller should retry the operation.
 *
 * If @xas is not in the -ENOMEM error state, any nodes still held by
 * @xas are released with xas_destroy() and false is returned.
 *
 * Forward progress is guaranteed as one node is allocated here and
 * stored in the xa_state where it will be found by xas_alloc().  More
 * nodes will likely be found in the slab allocator, but we do not tie
 * them up here.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
bool xas_nomem(struct xa_state *xas, gfp_t gfp)
{
        struct xa_node *node;

        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
        xas->xa_alloc = node;
        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        /* Memory is available: let the caller restart the operation */
        xas->xa_node = XAS_RESTART;
        return true;
}
EXPORT_SYMBOL_GPL(xas_nomem);

/*
 * __xas_nomem() - Drop locks and allocate memory if needed.
 * @xas: XArray operation state.
 * @gfp: Memory allocation flags.
 *
 * Internal variant of xas_nomem().  When @gfp allows blocking, the array
 * lock (of the array's own lock type) is dropped around the allocation
 * and retaken afterwards; otherwise the allocation is made with the lock
 * still held.
 *
 * Return: true if memory was needed, and was successfully allocated.
 */
static bool __xas_nomem(struct xa_state *xas, gfp_t gfp)
        __must_hold(xas->xa->xa_lock)
{
        unsigned int lock_type = xa_lock_type(xas->xa);
        struct xa_node *node;
        bool may_block;

        if (xas->xa_node != XA_ERROR(-ENOMEM)) {
                xas_destroy(xas);
                return false;
        }

        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        may_block = gfpflags_allow_blocking(gfp);
        if (may_block)
                xas_unlock_type(xas, lock_type);
        node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
        xas->xa_alloc = node;
        if (may_block)
                xas_lock_type(xas, lock_type);

        if (!node)
                return false;

        node->parent = NULL;
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
        xas->xa_node = XAS_RESTART;
        return true;
}

/*
 * Notify the user of @xas that @node has changed by invoking the
 * xa_update callback.  Without a callback, only check that the node's
 * private_list is empty.
 */
static void xas_update(struct xa_state *xas, struct xa_node *node)
{
        if (xas->xa_update) {
                xas->xa_update(node);
                return;
        }

        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));
}

/*
 * Obtain and initialise a node with the given @shift whose parent is the
 * node @xas currently points at (or NULL).
 *
 * A node previously stashed in @xas->xa_alloc is used if there is one;
 * otherwise one is allocated with GFP_NOWAIT | __GFP_NOWARN (plus
 * __GFP_ACCOUNT for XA_FLAGS_ACCOUNT arrays).  When there is a parent,
 * its count is incremented and the new node records @xas->xa_offset as
 * its offset.
 *
 * Returns the new node, or NULL if @xas is invalid or the allocation
 * failed (in which case @xas is set to the -ENOMEM error state).
 */
static void *xas_alloc(struct xa_state *xas, unsigned int shift)
{
        struct xa_node *parent = xas->xa_node;
        struct xa_node *node;

        if (xas_invalid(xas))
                return NULL;

        node = xas->xa_alloc;
        if (!node) {
                gfp_t gfp = GFP_NOWAIT | __GFP_NOWARN;

                if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                        gfp |= __GFP_ACCOUNT;

                node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
                if (!node) {
                        xas_set_err(xas, -ENOMEM);
                        return NULL;
                }
        } else {
                /* Consume the preallocated node */
                xas->xa_alloc = NULL;
        }

        if (parent) {
                node->offset = xas->xa_offset;
                parent->count++;
                XA_NODE_BUG_ON(node, parent->count > XA_CHUNK_SIZE);
                xas_update(xas, parent);
        }

        XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
        XA_NODE_BUG_ON(node, !list_empty(&node->private_list));

        node->shift = shift;
        node->count = 0;
        node->nr_values = 0;
        RCU_INIT_POINTER(node->parent, xas->xa_node);
        node->array = xas->xa;

        return node;
}

#ifdef CONFIG_XARRAY_MULTI
/*
 * Returns the number of indices covered by a given xa_state:
 * (xa_sibs + 1) shifted left by xa_shift.
 */
static unsigned long xas_size(const struct xa_state *xas)
{
        unsigned long slots = xas->xa_sibs + 1UL;

        return slots << xas->xa_shift;
}
#endif

/*
 * Use this to calculate the maximum index that will need to be created
 * in order to add the entry described by @xas.  For a multi-index @xas
 * this is the last index of the covered range; because we cannot store a
 * multi-index entry at index 0, a range starting at index 0 yields one
 * more than its last index.
 */
static unsigned long xas_max(struct xa_state *xas)
{
        unsigned long max = xas->xa_index;

#ifdef CONFIG_XARRAY_MULTI
        if (xas->xa_shift || xas->xa_sibs) {
                const unsigned long span_mask = xas_size(xas) - 1;

                max |= span_mask;
                /* Nothing set above the mask: the range starts at index 0 */
                max += (max == span_mask);
        }
#endif

        return max;
}

/* The maximum index that can be contained in the array without expanding it */
static unsigned long max_index(void *entry)
{
        if (!xa_is_node(entry))
                return 0;
        return (XA_CHUNK_SIZE << xa_to_node(entry)->shift) - 1;
}

/* Map a zero entry to NULL; every other entry is returned unchanged. */
static inline void *xa_zero_to_null(void *entry)
{
        if (xa_is_zero(entry))
                return NULL;
        return entry;
}

/*
 * Reduce the height of the tree, starting at the node @xas points to.
 *
 * While that node holds exactly one entry and the entry sits in slot 0,
 * the entry is made the new head of the array and the node is freed.
 * A non-node entry is only hoisted out of a node whose shift is 0.
 * Leaves @xas in the XAS_BOUNDS state if at least one node was removed.
 */
static void xas_shrink(struct xa_state *xas)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = xas->xa_node;

        for (;;) {
                void *entry;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                if (node->count != 1)
                        break;
                /* The single entry must live in slot 0 */
                entry = xa_entry_locked(xa, node, 0);
                if (!entry)
                        break;
                if (!xa_is_node(entry) && node->shift)
                        break;
                /* In a ZERO_BUSY array a zero entry becomes a NULL head */
                if (xa_zero_busy(xa))
                        entry = xa_zero_to_null(entry);
                xas->xa_node = XAS_BOUNDS;

                RCU_INIT_POINTER(xa->xa_head, entry);
                /* Slot 0 not marked free: the array no longer carries the free mark */
                if (xa_track_free(xa) && !node_get_mark(node, 0, XA_FREE_MARK))
                        xa_mark_clear(xa, XA_FREE_MARK);

                node->count = 0;
                node->nr_values = 0;
                /* Replace the hoisted non-node entry in the old node with a retry entry */
                if (!xa_is_node(entry))
                        RCU_INIT_POINTER(node->slots[0], XA_RETRY_ENTRY);
                xas_update(xas, node);
                xa_node_free(node);
                if (!xa_is_node(entry))
                        break;
                /* The child is the new top node; try to shrink again from it */
                node = xa_to_node(entry);
                node->parent = NULL;
        }
}

/*
 * xas_delete_node() - Attempt to delete an xa_node
 * @xas: Array operation state.
 *
 * Attempts to delete the @xas->xa_node.  Nothing is deleted if the node
 * has a non-zero count.  After deleting a node, its slot in the parent
 * is cleared and the parent is considered for deletion in turn.  If the
 * last node is deleted, the array head is set to NULL and @xas is left
 * in the XAS_BOUNDS state.  Otherwise, if the walk stops at a node
 * without a parent, xas_shrink() is called.
 */
static void xas_delete_node(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;

        for (;;) {
                struct xa_node *parent;

                XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
                if (node->count)
                        break;

                /* Move @xas up to the parent slot that referenced this node */
                parent = xa_parent_locked(xas->xa, node);
                xas->xa_node = parent;
                xas->xa_offset = node->offset;
                xa_node_free(node);

                if (!parent) {
                        /* That was the top node: the array is now empty */
                        xas->xa->xa_head = NULL;
                        xas->xa_node = XAS_BOUNDS;
                        return;
                }

                parent->slots[xas->xa_offset] = NULL;
                parent->count--;
                XA_NODE_BUG_ON(parent, parent->count > XA_CHUNK_SIZE);
                node = parent;
                xas_update(xas, node);
        }

        if (!node->parent)
                xas_shrink(xas);
}

/**
 * xas_free_nodes() - Free this node and all nodes that it references
 * @xas: Array operation state.
 * @top: Node to free
 *
 * This node has been removed from the tree.  We must now free it and all
 * of its subnodes.  There may be RCU walkers with references into the tree,
 * so we must replace all entries with retry markers.
 *
 * The subtree is walked iteratively: child nodes are entered as they are
 * found, and a node is freed once all of its slots have been visited.
 */
static void xas_free_nodes(struct xa_state *xas, struct xa_node *top)
{
        unsigned int offset = 0;
        struct xa_node *node = top;

        for (;;) {
                void *entry = xa_entry_locked(xas->xa, node, offset);

                /* Descend into child nodes first */
                if (node->shift && xa_is_node(entry)) {
                        node = xa_to_node(entry);
                        offset = 0;
                        continue;
                }
                if (entry)
                        RCU_INIT_POINTER(node->slots[offset], XA_RETRY_ENTRY);
                offset++;
                /* All slots visited: free this node and resume in its parent */
                while (offset == XA_CHUNK_SIZE) {
                        struct xa_node *parent;

                        parent = xa_parent_locked(xas->xa, node);
                        /* Continue with the slot after the one this node occupied */
                        offset = node->offset + 1;
                        node->count = 0;
                        node->nr_values = 0;
                        xas_update(xas, node);
                        xa_node_free(node);
                        if (node == top)
                                return;
                        node = parent;
                }
        }
}

/*
 * xas_expand adds nodes to the head of the tree until it has reached
 * sufficient height to be able to contain the maximum index needed by
 * @xas (see xas_max()).
 *
 * If @head is NULL no nodes are allocated: 0 is returned when only
 * index 0 is needed, otherwise the shift at which the caller should
 * start creating nodes.
 *
 * Otherwise new top nodes are stacked above @head, each holding the
 * previous head in slot 0, and @xas->xa_node is left pointing at the top
 * node (NULL if the head is not a node).
 *
 * Returns the shift of the top node plus XA_CHUNK_SHIFT (0 if the head
 * is not a node), or -ENOMEM if a node could not be allocated.
 */
static int xas_expand(struct xa_state *xas, void *head)
{
        struct xarray *xa = xas->xa;
        struct xa_node *node = NULL;
        unsigned int shift = 0;
        unsigned long max = xas_max(xas);

        if (!head) {
                if (max == 0)
                        return 0;
                /* Find the smallest shift whose node can index @max */
                while ((max >> shift) >= XA_CHUNK_SIZE)
                        shift += XA_CHUNK_SHIFT;
                return shift + XA_CHUNK_SHIFT;
        } else if (xa_is_node(head)) {
                node = xa_to_node(head);
                shift = node->shift + XA_CHUNK_SHIFT;
        }
        /* New nodes are allocated without a parent */
        xas->xa_node = NULL;

        while (max > max_index(head)) {
                xa_mark_t mark = 0;

                XA_NODE_BUG_ON(node, shift > BITS_PER_LONG);
                node = xas_alloc(xas, shift);
                if (!node)
                        return -ENOMEM;

                /* The old head becomes the only entry of the new node */
                node->count = 1;
                if (xa_is_value(head))
                        node->nr_values = 1;
                RCU_INIT_POINTER(node->slots[0], head);

                /* Propagate the aggregated mark info to the new child */
                for (;;) {
                        if (xa_track_free(xa) && mark == XA_FREE_MARK) {
                                /*
                                 * All slots of the new node start out free;
                                 * slot 0 is only free if the array was
                                 * marked as having free entries.
                                 */
                                node_mark_all(node, XA_FREE_MARK);
                                if (!xa_marked(xa, XA_FREE_MARK)) {
                                        node_clear_mark(node, 0, XA_FREE_MARK);
                                        xa_mark_set(xa, XA_FREE_MARK);
                                }
                        } else if (xa_marked(xa, mark)) {
                                node_set_mark(node, 0, mark);
                        }
                        if (mark == XA_MARK_MAX)
                                break;
                        mark_inc(mark);
                }

                /*
                 * Now that the new node is fully initialised, we can add
                 * it to the tree
                 */
                if (xa_is_node(head)) {
                        xa_to_node(head)->offset = 0;
                        rcu_assign_pointer(xa_to_node(head)->parent, node);
                }
                head = xa_mk_node(node);
                rcu_assign_pointer(xa->xa_head, head);
                xas_update(xas, node);

                shift += XA_CHUNK_SHIFT;
        }

        xas->xa_node = node;
        return shift;
}

/*
 * xas_create() - Create a slot to store an entry in.
 * @xas: XArray operation state.
 * @allow_root: %true if we can store the entry in the root directly
 *
 * Most users will not need to call this function directly, as it is called
 * by xas_store().  It is useful for doing conditional store operations
 * (see the xa_cmpxchg() implementation for an example).
 *
 * Return: If the slot already existed, returns the contents of this slot.
 * If the slot was newly created, returns %NULL.  If it failed to create the
 * slot, returns %NULL and indicates the error in @xas.
 */
static void *xas_create(struct xa_state *xas, bool allow_root)
{
        struct xarray *xa = xas->xa;
        void *entry;
        void __rcu **slot;
        struct xa_node *node = xas->xa_node;
        int shift;
        unsigned int order = xas->xa_shift;

        if (xas_top(node)) {
                /* Not walked yet: start at the head, growing the tree if needed */
                entry = xa_head_locked(xa);
                xas->xa_node = NULL;
                if (!entry && xa_zero_busy(xa))
                        entry = XA_ZERO_ENTRY;
                shift = xas_expand(xas, entry);
                if (shift < 0)
                        return NULL;
                /* The entry may not live in the head: force creation of a node */
                if (!shift && !allow_root)
                        shift = XA_CHUNK_SHIFT;
                /* Re-read the head, xas_expand() may have replaced it */
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        } else if (xas_error(xas)) {
                return NULL;
        } else if (node) {
                /* Already positioned in a node: continue from its slot */
                unsigned int offset = xas->xa_offset;

                shift = node->shift;
                entry = xa_entry_locked(xa, node, offset);
                slot = &node->slots[offset];
        } else {
                /* Positioned at the head itself */
                shift = 0;
                entry = xa_head_locked(xa);
                slot = &xa->xa_head;
        }

        /* Walk down, allocating missing nodes, until the requested order is reached */
        while (shift > order) {
                shift -= XA_CHUNK_SHIFT;
                if (!entry) {
                        node = xas_alloc(xas, shift);
                        if (!node)
                                break;
                        if (xa_track_free(xa))
                                node_mark_all(node, XA_FREE_MARK);
                        rcu_assign_pointer(*slot, xa_mk_node(node));
                } else if (xa_is_node(entry)) {
                        node = xa_to_node(entry);
                } else {
                        /* A non-node entry occupies this slot: stop here */
                        break;
                }
                entry = xas_descend(xas, node);
                slot = &node->slots[xas->xa_offset];
        }

        return entry;
}

/**
 * xas_create_range() - Ensure that stores to this range will succeed
 * @xas: XArray operation state.
 *
 * Creates all of the slots in the range covered by @xas.  Sets @xas to
 * create single-index entries and positions it at the beginning of the
 * range.  This is for the benefit of users which have not yet been
 * converted to use multi-index entries.
 *
 * On error, the original index, shift and sibling count of @xas are
 * restored and the error is left in @xas.
 */
void xas_create_range(struct xa_state *xas)
{
        unsigned long index = xas->xa_index;
        unsigned char shift = xas->xa_shift;
        unsigned char sibs = xas->xa_sibs;

        /* Start at the last index of the range and work backwards */
        xas->xa_index |= ((sibs + 1UL) << shift) - 1;
        if (xas_is_node(xas) && xas->xa_node->shift == xas->xa_shift)
                xas->xa_offset |= sibs;
        xas->xa_shift = 0;
        xas->xa_sibs = 0;

        for (;;) {
                xas_create(xas, true);
                if (xas_error(xas))
                        goto restore;
                /* Done once the chunk containing the first index has been created */
                if (xas->xa_index <= (index | XA_CHUNK_MASK))
                        goto success;
                xas->xa_index -= XA_CHUNK_SIZE;

                /* Step back to the previous slot, climbing while at slot 0 */
                for (;;) {
                        struct xa_node *node = xas->xa_node;
                        if (node->shift >= shift)
                                break;
                        xas->xa_node = xa_parent_locked(xas->xa, node);
                        xas->xa_offset = node->offset - 1;
                        if (node->offset != 0)
                                break;
                }
        }

restore:
        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
        xas->xa_index = index;
        return;
success:
        xas->xa_index = index;
        if (xas->xa_node)
                xas_set_offset(xas);
}
EXPORT_SYMBOL_GPL(xas_create_range);

/*
 * Apply the deltas @count and @values to the entry and value counters of
 * @node and notify the user via xas_update().  Does nothing if @node is
 * NULL or both deltas are zero.  If entries were removed, try to delete
 * the node @xas points to.
 */
static void update_node(struct xa_state *xas, struct xa_node *node,
                int count, int values)
{
        if (!node)
                return;
        if (count == 0 && values == 0)
                return;

        node->count += count;
        node->nr_values += values;
        XA_NODE_BUG_ON(node, node->count > XA_CHUNK_SIZE);
        XA_NODE_BUG_ON(node, node->nr_values > XA_CHUNK_SIZE);
        xas_update(xas, node);

        if (count >= 0)
                return;
        xas_delete_node(xas);
}

/**
 * xas_store() - Store this entry in the XArray.
 * @xas: XArray operation state.
 * @entry: New entry.
 *
 * Storing a non-NULL @entry creates the slot if necessary; storing NULL
 * only walks to the existing slot and never allocates.
 *
 * If @xas is operating on a multi-index entry, the entry returned by this
 * function is essentially meaningless (it may be an internal entry or it
 * may be %NULL, even if there are non-NULL entries at some of the indices
 * covered by the range).  This is not a problem for any current users,
 * and can be changed if needed.
 *
 * Return: The old entry at this index.
 */
void *xas_store(struct xa_state *xas, void *entry)
{
        struct xa_node *node;
        void __rcu **slot = &xas->xa->xa_head;
        unsigned int offset, max;
        int count = 0;
        int values = 0;
        void *first, *next;
        bool value = xa_is_value(entry);

        if (entry) {
                /* Node and zero entries may not be stored directly in the head */
                bool allow_root = !xa_is_node(entry) && !xa_is_zero(entry);
                first = xas_create(xas, allow_root);
        } else {
                first = xas_load(xas);
        }

        if (xas_invalid(xas))
                return first;
        node = xas->xa_node;
        /* The entry covers whole slots of this node: no siblings needed */
        if (node && (xas->xa_shift < node->shift))
                xas->xa_sibs = 0;
        /* Nothing to do when storing the same single entry again */
        if ((first == entry) && !xas->xa_sibs)
                return first;

        next = first;
        offset = xas->xa_offset;
        max = xas->xa_offset + xas->xa_sibs;
        if (node) {
                slot = &node->slots[offset];
                if (xas->xa_sibs)
                        xas_squash_marks(xas);
        }
        if (!entry)
                xas_init_marks(xas);

        for (;;) {
                /*
                 * Must clear the marks before setting the entry to NULL,
                 * otherwise xas_for_each_marked may find a NULL entry and
                 * stop early.  rcu_assign_pointer contains a release barrier
                 * so the mark clearing will appear to happen before the
                 * entry is set to NULL.
                 */
                rcu_assign_pointer(*slot, entry);
                /* The overwritten slot held a subtree: free it */
                if (xa_is_node(next) && (!node || node->shift))
                        xas_free_nodes(xas, xa_to_node(next));
                if (!node)
                        break;
                /* Track the change in occupied slots and value entries */
                count += !next - !entry;
                values += !xa_is_value(first) - !value;
                if (entry) {
                        if (offset == max)
                                break;
                        /* Remaining slots of the range get sibling entries */
                        if (!xa_is_sibling(entry))
                                entry = xa_mk_sibling(xas->xa_offset);
                } else {
                        if (offset == XA_CHUNK_MASK)
                                break;
                }
                next = xa_entry_locked(xas->xa, node, ++offset);
                if (!xa_is_sibling(next)) {
                        /* When erasing, stop at the first non-sibling slot past the range */
                        if (!entry && (offset > max))
                                break;
                        first = next;
                }
                slot++;
        }

        update_node(xas, node, count, values);
        return first;
}
EXPORT_SYMBOL_GPL(xas_store);

/**
 * xas_get_mark() - Returns the state of this mark.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * When @xas is not positioned in a node, the array-level mark is reported.
 *
 * Return: true if the mark is set, false if the mark is clear or @xas
 * is in an error state.
 */
bool xas_get_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node = xas->xa_node;

        if (xas_invalid(xas))
                return false;
        if (node)
                return node_get_mark(node, xas->xa_offset, mark);

        return xa_marked(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_get_mark);

/**
 * xas_set_mark() - Sets the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Sets the specified mark on this entry, and walks up the tree setting it
 * on all the ancestor entries, stopping at the first one that already has
 * it.  If the walk reaches the top, the mark is also set on the array
 * itself.  Does nothing if @xas has not been walked to an entry, or is in
 * an error state.
 */
void xas_set_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node;
        unsigned int offset = xas->xa_offset;

        if (xas_invalid(xas))
                return;

        for (node = xas->xa_node; node;
             node = xa_parent_locked(xas->xa, node)) {
                /* Already marked here, so the ancestors are marked too */
                if (node_set_mark(node, offset, mark))
                        return;
                offset = node->offset;
        }

        if (xa_marked(xas->xa, mark))
                return;
        xa_mark_set(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_set_mark);

/**
 * xas_clear_mark() - Clears the mark on this entry and its parents.
 * @xas: XArray operation state.
 * @mark: Mark number.
 *
 * Clears the specified mark on this entry, and walks back to the head
 * attempting to clear it on all the ancestor entries.  The walk stops as
 * soon as a slot did not have the mark, or a node still has the mark on
 * another slot.  If the walk reaches the top, the mark is also cleared on
 * the array itself.  Does nothing if @xas has not been walked to an
 * entry, or is in an error state.
 */
void xas_clear_mark(const struct xa_state *xas, xa_mark_t mark)
{
        struct xa_node *node;
        unsigned int offset = xas->xa_offset;

        if (xas_invalid(xas))
                return;

        for (node = xas->xa_node; node;
             node = xa_parent_locked(xas->xa, node)) {
                if (!node_clear_mark(node, offset, mark) ||
                    node_any_mark(node, mark))
                        return;
                offset = node->offset;
        }

        if (!xa_marked(xas->xa, mark))
                return;
        xa_mark_clear(xas->xa, mark);
}
EXPORT_SYMBOL_GPL(xas_clear_mark);

/**
 * xas_init_marks() - Initialise all marks for the entry
 * @xas: Array operations state.
 *
 * Initialise all marks for the entry specified by @xas.  If we're tracking
 * free entries with a mark, that mark is set on the entry.  All other
 * marks are cleared.
 *
 * This implementation is not as efficient as it could be; we may walk
 * up the tree multiple times.
 */
void xas_init_marks(const struct xa_state *xas)
{
        xa_mark_t mark = 0;

        while (1) {
                if (!xa_track_free(xas->xa) || mark != XA_FREE_MARK)
                        xas_clear_mark(xas, mark);
                else
                        xas_set_mark(xas, mark);

                if (mark == XA_MARK_MAX)
                        return;
                mark_inc(mark);
        }
}
EXPORT_SYMBOL_GPL(xas_init_marks);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Collect the marks set on slot @offset of @node into a bitmask, where
 * bit N is set if mark number N is set on the slot.
 */
static unsigned int node_get_marks(struct xa_node *node, unsigned int offset)
{
        xa_mark_t mark = XA_MARK_0;
        unsigned int mask = 0;

        while (1) {
                if (node_get_mark(node, offset, mark))
                        mask |= 1 << (__force unsigned int)mark;
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }

        return mask;
}

/*
 * Set @mark on the slots of @node at offsets 0, @sibs + 1, 2 * (@sibs + 1)
 * and so on.  With @sibs == 0 that is every slot of the node.
 */
static inline void node_mark_slots(struct xa_node *node, unsigned int sibs,
                xa_mark_t mark)
{
        int slot;

        if (sibs == 0) {
                node_mark_all(node, mark);
                return;
        }

        for (slot = 0; slot < XA_CHUNK_SIZE; slot += sibs + 1)
                node_set_mark(node, slot, mark);
}

static void node_set_marks(struct xa_node *node, unsigned int offset,
                        struct xa_node *child, unsigned int sibs,
                        unsigned int marks)
{
        xa_mark_t mark = XA_MARK_0;

        for (;;) {
                if (marks & (1 << (__force unsigned int)mark)) {
                        node_set_mark(node, offset, mark);
                        if (child)
                                node_mark_slots(child, sibs, mark);
                }
                if (mark == XA_MARK_MAX)
                        break;
                mark_inc(mark);
        }
}

/*
 * Prepare @node for use by a split: attach it to the array of @xas and
 * fill all of its slots.  Every slot whose offset has none of the
 * @xas->xa_sibs bits set receives @entry; the slots after it receive a
 * sibling entry referring to that slot.  Does nothing if @node is NULL.
 */
static void __xas_init_node_for_split(struct xa_state *xas,
                struct xa_node *node, void *entry)
{
        unsigned int mask = xas->xa_sibs;
        void *sibling = NULL;
        unsigned int slot;

        if (!node)
                return;

        node->array = xas->xa;
        for (slot = 0; slot < XA_CHUNK_SIZE; slot++) {
                if (slot & mask) {
                        RCU_INIT_POINTER(node->slots[slot], sibling);
                        continue;
                }
                /* Canonical slot: store the entry and remember its sibling */
                RCU_INIT_POINTER(node->slots[slot], entry);
                sibling = xa_mk_sibling(slot);
        }
}

/**
 * xas_split_alloc() - Allocate memory for splitting an entry.
 * @xas: XArray operation state.
 * @entry: New entry which will be stored in the array.
 * @order: Current entry order.
 * @gfp: Memory allocation flags.
 *
 * This function should be called before calling xas_split().
 * If necessary, it will allocate new nodes (and fill them with @entry)
 * to prepare for the upcoming split of an entry of @order size into
 * entries of the order stored in the @xas.  The nodes are chained onto
 * @xas->xa_alloc through their parent pointer.
 *
 * On allocation failure, or if @order is at least two node levels above
 * the order in @xas (which also triggers a warning), all nodes held by
 * @xas are freed and @xas is set to the -ENOMEM error state.
 *
 * Context: May sleep if @gfp flags permit.
 */
void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order,
                gfp_t gfp)
{
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int nr;

        /* XXX: no support for splitting really large entries yet */
        if (WARN_ON(xas->xa_shift + 2 * XA_CHUNK_SHIFT <= order))
                goto nomem;
        /* The split stays within one node: no new nodes are needed */
        if (xas->xa_shift + XA_CHUNK_SHIFT > order)
                return;

        /* One node for each of the sibs + 1 slots the old entry occupies */
        for (nr = 0; nr <= sibs; nr++) {
                struct xa_node *node;

                node = kmem_cache_alloc_lru(radix_tree_node_cachep, xas->xa_lru, gfp);
                if (!node)
                        goto nomem;

                __xas_init_node_for_split(xas, node, entry);
                /* Push the node onto the front of the xa_alloc chain */
                RCU_INIT_POINTER(node->parent, xas->xa_alloc);
                xas->xa_alloc = node;
        }

        return;
nomem:
        xas_destroy(xas);
        xas_set_err(xas, -ENOMEM);
}
EXPORT_SYMBOL_GPL(xas_split_alloc);

/**
 * xas_split() - Split a multi-index entry into smaller entries.
 * @xas: XArray operation state.
 * @entry: New entry to store in the array.
 * @order: Current entry order.
 *
 * The size of the new entries is set in @xas.  The value in @entry is
 * copied to all the replacement entries.
 *
 * When child nodes are needed they are taken from @xas->xa_alloc without
 * checking for an empty list, so xas_split_alloc() must have succeeded
 * beforehand.
 *
 * Context: Any context.  The caller should hold the xa_lock.
 */
void xas_split(struct xa_state *xas, void *entry, unsigned int order)
{
        /* Number of slots after xas->xa_offset that the loop below rewrites. */
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int offset, marks;
        struct xa_node *node;
        void *curr = xas_load(xas);
        /* Net change to node->nr_values, applied once after the loop. */
        int values = 0;

        node = xas->xa_node;
        /*
         * NOTE(review): presumably xas_top() means the walk did not end
         * inside a node, leaving nothing to split here -- confirm.
         */
        if (xas_top(node))
                return;

        /* Marks of the entry being split, re-applied to the replacements. */
        marks = node_get_marks(node, xas->xa_offset);

        /* Work downwards from the last slot covered to xas->xa_offset. */
        offset = xas->xa_offset + sibs;
        do {
                if (xas->xa_shift < node->shift) {
                        /*
                         * The new entries live one level below @node: pop
                         * a preallocated child off the xa_alloc list (which
                         * is chained through ->parent) and install it in
                         * slot @offset.
                         */
                        struct xa_node *child = xas->xa_alloc;

                        xas->xa_alloc = rcu_dereference_raw(child->parent);
                        child->shift = node->shift - XA_CHUNK_SHIFT;
                        child->offset = offset;
                        child->count = XA_CHUNK_SIZE;
                        child->nr_values = xa_is_value(entry) ?
                                        XA_CHUNK_SIZE : 0;
                        RCU_INIT_POINTER(child->parent, node);
                        node_set_marks(node, offset, child, xas->xa_sibs,
                                        marks);
                        /* Publish the child only after it is initialised. */
                        rcu_assign_pointer(node->slots[offset],
                                        xa_mk_node(child));
                        /* The slot now holds a node pointer, not a value. */
                        if (xa_is_value(curr))
                                values--;
                        xas_update(xas, child);
                } else {
                        /*
                         * The new entries fit in @node: store @entry in the
                         * canonical slot and sibling entries in the
                         * xa_sibs slots above it.  The inner loop leaves
                         * offset == canon.
                         */
                        unsigned int canon = offset - xas->xa_sibs;

                        node_set_marks(node, canon, NULL, 0, marks);
                        rcu_assign_pointer(node->slots[canon], entry);
                        while (offset > canon)
                                rcu_assign_pointer(node->slots[offset--],
                                                xa_mk_sibling(canon));
                        /* (xa_sibs + 1) slots changed from curr to entry. */
                        values += (xa_is_value(entry) - xa_is_value(curr)) *
                                        (xas->xa_sibs + 1);
                }
        } while (offset-- > xas->xa_offset);

        node->nr_values += values;
        xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_split);

/**
 * xas_try_split_min_order() - Minimal split order xas_try_split() can accept
 * @order: Current entry order.
 *
 * xas_try_split() can split a multi-index entry to smaller than @order - 1 if
 * no new xa_node is needed. This function provides the minimal order
 * xas_try_split() supports.
 *
 * Return: @order rounded down to a multiple of XA_CHUNK_SHIFT when it is not
 * already one; otherwise @order - 1, or 0 if @order is 0.
 *
 * Context: Any context.
 *
 */
unsigned int xas_try_split_min_order(unsigned int order)
{
        unsigned int rem = order % XA_CHUNK_SHIFT;

        if (rem)
                return order - rem;

        return order ? order - 1 : 0;
}
EXPORT_SYMBOL_GPL(xas_try_split_min_order);

/**
 * xas_try_split() - Try to split a multi-index entry.
 * @xas: XArray operation state.
 * @entry: New entry to store in the array.
 * @order: Current entry order.
 *
 * The size of the new entries is set in @xas.  The value in @entry is
 * copied to all the replacement entries. If and only if one new xa_node is
 * needed, the function will use GFP_NOWAIT to get one if xas->xa_alloc is
 * NULL. If more new xa_nodes are needed, the function gives EINVAL error.
 * If the single allocation fails, the @xas is set to the ENOMEM error.
 *
 * NOTE: use xas_try_split_min_order() to get next split order instead of
 * @order - 1 if you want to minimize xas_try_split() calls.
 *
 * Context: Any context.  The caller should hold the xa_lock.
 */
void xas_try_split(struct xa_state *xas, void *entry, unsigned int order)
{
        /* Number of slots after xas->xa_offset covered by the current entry. */
        unsigned int sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1;
        unsigned int offset, marks;
        struct xa_node *node;
        void *curr = xas_load(xas);
        /* Net change to node->nr_values, applied once at the end. */
        int values = 0;
        gfp_t gfp = GFP_NOWAIT;

        node = xas->xa_node;
        /*
         * NOTE(review): presumably xas_top() means the walk did not end
         * inside a node, leaving nothing to split here -- confirm.
         */
        if (xas_top(node))
                return;

        /* Charge the allocation when the array asks for accounting. */
        if (xas->xa->xa_flags & XA_FLAGS_ACCOUNT)
                gfp |= __GFP_ACCOUNT;

        /* Marks of the entry being split, re-applied to the replacements. */
        marks = node_get_marks(node, xas->xa_offset);

        offset = xas->xa_offset + sibs;

        if (xas->xa_shift < node->shift) {
                /* The new entries live one level below @node. */
                struct xa_node *child = xas->xa_alloc;
                unsigned int expected_sibs =
                        (1 << ((order - 1) % XA_CHUNK_SHIFT)) - 1;

                /*
                 * No support for splitting sibling entries
                 * (horizontally) or cascade split (vertically), which
                 * requires two or more new xa_nodes.
                 * Since if one xa_node allocation fails,
                 * it is hard to free the prior allocations.
                 */
                if (sibs || xas->xa_sibs != expected_sibs) {
                        xas_destroy(xas);
                        xas_set_err(xas, -EINVAL);
                        return;
                }

                /* Nothing preallocated: try a single non-sleeping allocation. */
                if (!child) {
                        child = kmem_cache_alloc_lru(radix_tree_node_cachep,
                                                     xas->xa_lru, gfp);
                        if (!child) {
                                xas_destroy(xas);
                                xas_set_err(xas, -ENOMEM);
                                return;
                        }
                        /* xa_alloc is NULL here, so this clears ->parent. */
                        RCU_INIT_POINTER(child->parent, xas->xa_alloc);
                }
                __xas_init_node_for_split(xas, child, entry);

                /* Unlink @child from the xa_alloc list (chained via ->parent). */
                xas->xa_alloc = rcu_dereference_raw(child->parent);
                child->shift = node->shift - XA_CHUNK_SHIFT;
                child->offset = offset;
                child->count = XA_CHUNK_SIZE;
                child->nr_values = xa_is_value(entry) ?
                                XA_CHUNK_SIZE : 0;
                RCU_INIT_POINTER(child->parent, node);
                node_set_marks(node, offset, child, xas->xa_sibs,
                                marks);
                /* Publish the child only after it is initialised. */
                rcu_assign_pointer(node->slots[offset],
                                xa_mk_node(child));
                /* The slot now holds a node pointer, not a value. */
                if (xa_is_value(curr))
                        values--;
                xas_update(xas, child);

        } else {
                /*
                 * The new entries fit in @node: walk downwards from the
                 * last covered slot, writing @entry into each canonical
                 * slot and sibling entries into the xa_sibs slots above it.
                 */
                do {
                        unsigned int canon = offset - xas->xa_sibs;

                        node_set_marks(node, canon, NULL, 0, marks);
                        rcu_assign_pointer(node->slots[canon], entry);
                        while (offset > canon)
                                rcu_assign_pointer(node->slots[offset--],
                                                xa_mk_sibling(canon));
                        /* (xa_sibs + 1) slots changed from curr to entry. */
                        values += (xa_is_value(entry) - xa_is_value(curr)) *
                                        (xas->xa_sibs + 1);
                } while (offset-- > xas->xa_offset);
        }

        node->nr_values += values;
        xas_update(xas, node);
}
EXPORT_SYMBOL_GPL(xas_try_split);
#endif

/**
 * xas_pause() - Pause a walk to drop a lock.
 * @xas: XArray operation state.
 *
 * A walker that has to drop its lock part-way through (to yield to a
 * higher priority thread, or to operate on an entry) should call this
 * first.  The @xas is put into the restart state with its index advanced
 * past the current entry, so the next iteration of the loop can continue
 * once the lock has been reacquired.  If most entries found during a walk
 * require you to call xas_pause(), the xa_for_each() iterator may be more
 * appropriate.
 *
 * Note that xas_pause() only works for forward iteration.  If a user needs
 * to pause a reverse iteration, we will need a xas_pause_rev().
 */
void xas_pause(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned long next;

        if (xas_invalid(xas))
                return;

        xas->xa_node = XAS_RESTART;
        if (!node) {
                xas->xa_index++;
                return;
        }

        /* Find the first following slot that is not a sibling entry. */
        for (next = xas->xa_offset + 1; next < XA_CHUNK_SIZE; next++) {
                if (!xa_is_sibling(xa_entry(xas->xa, node, next)))
                        break;
        }

        /* Round the index down to this node's granularity, then step. */
        xas->xa_index &= ~0UL << node->shift;
        xas->xa_index += (next - xas->xa_offset) << node->shift;
        /* The index wrapped around to zero: no more entries to visit. */
        if (!xas->xa_index)
                xas->xa_node = XAS_BOUNDS;
}
EXPORT_SYMBOL_GPL(xas_pause);

/*
 * __xas_prev() - Find the previous entry in the XArray.
 * @xas: XArray operation state.
 *
 * Out-of-line slow path for xas_prev(): steps the index back by one
 * (unless the state is frozen), climbs to the parent while the offset has
 * run off the low end of a node, then descends to a non-node entry.
 */
void *__xas_prev(struct xa_state *xas)
{
        void *entry;

        if (!xas_frozen(xas->xa_node))
                xas->xa_index--;
        if (!xas->xa_node)
                return set_bounds(xas);
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (get_offset(xas->xa_index, xas->xa_node) != xas->xa_offset)
                xas->xa_offset--;

        /*
         * NOTE(review): 255 looks like the value xa_offset wraps to when
         * decremented from 0 (assuming an 8-bit field) -- confirm.
         */
        while (xas->xa_offset == 255) {
                xas->xa_offset = xas->xa_node->offset - 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Descend through child nodes until a non-node entry is reached. */
        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        while (xa_is_node(entry)) {
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        }

        return entry;
}
EXPORT_SYMBOL_GPL(__xas_prev);

/*
 * __xas_next() - Find the next entry in the XArray.
 * @xas: XArray operation state.
 *
 * Out-of-line slow path for xas_next(): steps the index forward by one
 * (unless the state is frozen), climbs to the parent while the offset has
 * run off the high end of a node, then descends to a non-node entry.
 */
void *__xas_next(struct xa_state *xas)
{
        void *entry;

        if (!xas_frozen(xas->xa_node))
                xas->xa_index++;
        if (!xas->xa_node)
                return set_bounds(xas);
        if (xas_not_node(xas->xa_node))
                return xas_load(xas);

        if (get_offset(xas->xa_index, xas->xa_node) != xas->xa_offset)
                xas->xa_offset++;

        /* Past the last slot of this node: continue in the parent. */
        while (xas->xa_offset == XA_CHUNK_SIZE) {
                xas->xa_offset = xas->xa_node->offset + 1;
                xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                if (!xas->xa_node)
                        return set_bounds(xas);
        }

        /* Descend through child nodes until a non-node entry is reached. */
        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        while (xa_is_node(entry)) {
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
        }

        return entry;
}
EXPORT_SYMBOL_GPL(__xas_next);

/**
 * xas_find() - Find the next present entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 *
 * If the @xas has not yet been walked to an entry, return the entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we move to the
 * next entry.
 *
 * If no entry is found and the array is smaller than @max, the iterator
 * is set to the smallest index not yet in the array.  This allows @xas
 * to be immediately passed to xas_store().
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find(struct xa_state *xas, unsigned long max)
{
        void *entry;

        /* Nothing to find in an error state or once already out of bounds. */
        if (xas_error(xas) || xas->xa_node == XAS_BOUNDS)
                return NULL;
        if (xas->xa_index > max)
                return set_bounds(xas);

        if (!xas->xa_node) {
                /*
                 * NOTE(review): a NULL xa_node presumably means the walk
                 * ended at the array head (a single entry at index 0), so
                 * the search resumes at index 1 -- confirm.
                 */
                xas->xa_index = 1;
                return set_bounds(xas);
        } else if (xas->xa_node == XAS_RESTART) {
                /*
                 * Fresh walk: load whatever is at xa_index.  Return it if
                 * present, or if the walk did not end inside a node.
                 */
                entry = xas_load(xas);
                if (entry || xas_not_node(xas->xa_node))
                        return entry;
        } else if (!xas->xa_node->shift &&
                    xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK)) {
                /*
                 * In a shift-0 node whose offset disagrees with the index:
                 * recompute the offset from the index.  The expression
                 * yields XA_CHUNK_SIZE rather than 0 when the low bits of
                 * the index are all zero.
                 */
                xas->xa_offset = ((xas->xa_index - 1) & XA_CHUNK_MASK) + 1;
        }

        /* The current position has been dealt with; step to the next slot. */
        xas_next_offset(xas);

        while (xas->xa_node && (xas->xa_index <= max)) {
                /* Ran off the end of this node: continue in the parent. */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                /* Child node: descend and start at its first slot. */
                if (xa_is_node(entry)) {
                        xas->xa_node = xa_to_node(entry);
                        xas->xa_offset = 0;
                        continue;
                }
                /* Present entries are returned; sibling slots are skipped. */
                if (entry && !xa_is_sibling(entry))
                        return entry;

                xas_next_offset(xas);
        }

        /* Walked off the top of the tree without finding anything. */
        if (!xas->xa_node)
                xas->xa_node = XAS_BOUNDS;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find);

/**
 * xas_find_marked() - Find the next marked entry in the XArray.
 * @xas: XArray operation state.
 * @max: Highest index to return.
 * @mark: Mark number to search for.
 *
 * If the @xas has not yet been walked to an entry, return the marked entry
 * which has an index >= xas.xa_index.  If it has been walked, the entry
 * currently being pointed at has been processed, and so we return the
 * first marked entry with an index > xas.xa_index.
 *
 * If no marked entry is found and the array is smaller than @max, @xas is
 * set to the bounds state and xas->xa_index is set to the smallest index
 * not yet in the array.  This allows @xas to be immediately passed to
 * xas_store().
 *
 * If no entry is found before @max is reached, @xas is set to the restart
 * state.
 *
 * Return: The entry, if found, otherwise %NULL.
 */
void *xas_find_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark)
{
        /*
         * Passed to xas_find_chunk(); cleared whenever the current position
         * has not been examined yet.
         * NOTE(review): presumably true means "skip the current slot" --
         * confirm against xas_find_chunk().
         */
        bool advance = true;
        unsigned int offset;
        void *entry;

        if (xas_error(xas))
                return NULL;
        if (xas->xa_index > max)
                goto max;

        if (!xas->xa_node) {
                /*
                 * NOTE(review): a NULL xa_node presumably means the walk
                 * ended at the array head, so resume from index 1 --
                 * confirm.
                 */
                xas->xa_index = 1;
                goto out;
        } else if (xas_top(xas->xa_node)) {
                /* Not yet walked: start from the head of the array. */
                advance = false;
                entry = xa_head(xas->xa);
                xas->xa_node = NULL;
                /* The index lies beyond what the current head can cover. */
                if (xas->xa_index > max_index(entry))
                        goto out;
                if (!xa_is_node(entry)) {
                        /* Head is a single entry; check the array-wide mark. */
                        if (xa_marked(xas->xa, mark))
                                return entry;
                        xas->xa_index = 1;
                        goto out;
                }
                xas->xa_node = xa_to_node(entry);
                xas->xa_offset = xas->xa_index >> xas->xa_node->shift;
        }

        while (xas->xa_index <= max) {
                /* Ran off the end of this node: continue in the parent. */
                if (unlikely(xas->xa_offset == XA_CHUNK_SIZE)) {
                        xas->xa_offset = xas->xa_node->offset + 1;
                        xas->xa_node = xa_parent(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        advance = false;
                        continue;
                }

                /*
                 * When examining the current slot, redirect a sibling slot
                 * to the slot it refers to and move the index with it.
                 */
                if (!advance) {
                        entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                        if (xa_is_sibling(entry)) {
                                xas->xa_offset = xa_to_sibling(entry);
                                xas_move_index(xas, xas->xa_offset);
                        }
                }

                offset = xas_find_chunk(xas, advance, mark);
                if (offset > xas->xa_offset) {
                        /* Jumped forward; the new slot is still unexamined. */
                        advance = false;
                        xas_move_index(xas, offset);
                        /* Mind the wrap */
                        if ((xas->xa_index - 1) >= max)
                                goto max;
                        xas->xa_offset = offset;
                        /* No marked slot left in this node. */
                        if (offset == XA_CHUNK_SIZE)
                                continue;
                }

                entry = xa_entry(xas->xa, xas->xa_node, xas->xa_offset);
                /*
                 * A NULL slot is only a valid result when searching for
                 * XA_FREE_MARK in an array that tracks free entries.
                 */
                if (!entry && !(xa_track_free(xas->xa) && mark == XA_FREE_MARK))
                        continue;
                if (xa_is_sibling(entry))
                        continue;
                if (!xa_is_node(entry))
                        return entry;
                /* Marked child node: descend into it. */
                xas->xa_node = xa_to_node(entry);
                xas_set_offset(xas);
        }

out:
        /* Nothing found: restart state if past @max, else bounds state. */
        if (xas->xa_index > max)
                goto max;
        return set_bounds(xas);
max:
        xas->xa_node = XAS_RESTART;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_marked);

/**
 * xas_find_conflict() - Find the next present entry in a range.
 * @xas: XArray operation state.
 *
 * The @xas describes both a range and a position within that range.
 *
 * Context: Any context.  Expects xa_lock to be held.
 * Return: The next entry in the range covered by @xas or %NULL.
 */
void *xas_find_conflict(struct xa_state *xas)
{
        void *curr;

        if (xas_error(xas))
                return NULL;

        if (!xas->xa_node)
                return NULL;

        if (xas_top(xas->xa_node)) {
                /*
                 * Not yet walked: start at the head and descend through
                 * nodes; a non-NULL entry found this way is the first
                 * conflict.
                 */
                curr = xas_start(xas);
                if (!curr)
                        return NULL;
                while (xa_is_node(curr)) {
                        struct xa_node *node = xa_to_node(curr);
                        curr = xas_descend(xas, node);
                }
                if (curr)
                        return curr;
        }

        /*
         * NOTE(review): the walk stopped in a node above the level of the
         * range, which presumably means the whole range is empty --
         * confirm.
         */
        if (xas->xa_node->shift > xas->xa_shift)
                return NULL;

        for (;;) {
                if (xas->xa_node->shift == xas->xa_shift) {
                        /*
                         * At the level of the range: stop once the offset
                         * has all of the xa_sibs bits set.
                         */
                        if ((xas->xa_offset & xas->xa_sibs) == xas->xa_sibs)
                                break;
                } else if (xas->xa_offset == XA_CHUNK_MASK) {
                        /* Last slot of a lower node: go back to the parent. */
                        xas->xa_offset = xas->xa_node->offset;
                        xas->xa_node = xa_parent_locked(xas->xa, xas->xa_node);
                        if (!xas->xa_node)
                                break;
                        continue;
                }
                /* Advance to the next slot, skipping sibling entries. */
                curr = xa_entry_locked(xas->xa, xas->xa_node, ++xas->xa_offset);
                if (xa_is_sibling(curr))
                        continue;
                /* Descend to the first slot of any child nodes. */
                while (xa_is_node(curr)) {
                        xas->xa_node = xa_to_node(curr);
                        xas->xa_offset = 0;
                        curr = xa_entry_locked(xas->xa, xas->xa_node, 0);
                }
                if (curr)
                        return curr;
        }
        /*
         * NOTE(review): presumably rewinds the offset to the first slot of
         * the range so the @xas can be used for a subsequent store --
         * confirm.
         */
        xas->xa_offset -= xas->xa_sibs;
        return NULL;
}
EXPORT_SYMBOL_GPL(xas_find_conflict);

/**
 * xa_load() - Load an entry from an XArray.
 * @xa: XArray.
 * @index: index into array.
 *
 * The lookup is repeated for as long as xas_retry() asks for it, and the
 * result is passed through xa_zero_to_null() before being returned.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry at @index in @xa.
 */
void *xa_load(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *entry;

        rcu_read_lock();
        for (;;) {
                entry = xa_zero_to_null(xas_load(&xas));
                if (!xas_retry(&xas, entry))
                        break;
        }
        rcu_read_unlock();

        return entry;
}
EXPORT_SYMBOL(xa_load);

/*
 * Pick the value an operation should hand back to its caller: the
 * contents of xas->xa_node when the @xas is in an error state, otherwise
 * @curr unchanged.
 */
static void *xas_result(struct xa_state *xas, void *curr)
{
        return xas_error(xas) ? xas->xa_node : curr;
}

/**
 * __xa_erase() - Erase this entry from the XArray while locked.
 * @xa: XArray.
 * @index: Index into array.
 *
 * Stores %NULL at @index.  After this function returns, loading from
 * @index will return %NULL.  If the index is part of a multi-index entry,
 * all indices will be erased and none of the entries will be part of a
 * multi-index entry.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 * Return: The entry which used to be at this index.
 */
void *__xa_erase(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        void *old = xa_zero_to_null(xas_store(&xas, NULL));

        return xas_result(&xas, old);
}
EXPORT_SYMBOL(__xa_erase);

/**
 * xa_erase() - Erase this entry from the XArray.
 * @xa: XArray.
 * @index: Index of entry.
 *
 * Locked wrapper around __xa_erase().  After this function returns,
 * loading from @index will return %NULL.  If the index is part of a
 * multi-index entry, all indices will be erased and none of the entries
 * will be part of a multi-index entry.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * Return: The entry which used to be at this index.
 */
void *xa_erase(struct xarray *xa, unsigned long index)
{
        void *old;

        xa_lock(xa);
        old = __xa_erase(xa, index);
        xa_unlock(xa);

        return old;
}
EXPORT_SYMBOL(xa_erase);

/**
 * __xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * In an array that tracks free entries, a %NULL @entry is stored as
 * XA_ZERO_ENTRY and XA_FREE_MARK is cleared after every store attempt.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *curr;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);
        if (!entry && xa_track_free(xa))
                entry = XA_ZERO_ENTRY;

        /* Retry the store for as long as __xas_nomem() asks for it. */
        for (;;) {
                curr = xas_store(&xas, entry);
                if (xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        curr = xa_zero_to_null(curr);
        return xas_result(&xas, curr);
}
EXPORT_SYMBOL(__xa_store);

/**
 * xa_store() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Locked wrapper around __xa_store().  After this function returns, loads
 * from this index will return @entry.  Storing into an existing
 * multi-index entry updates the entry of every index.  The marks
 * associated with @index are unaffected unless @entry is %NULL.
 *
 * Context: Any context.  Takes and releases the xa_lock.
 * May sleep if the @gfp flags permit.
 * Return: The old entry at this index on success, xa_err(-EINVAL) if @entry
 * cannot be stored in an XArray, or xa_err(-ENOMEM) if memory allocation
 * failed.
 */
void *xa_store(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        void *old;

        xa_lock(xa);
        old = __xa_store(xa, index, entry, gfp);
        xa_unlock(xa);

        return old;
}
EXPORT_SYMBOL(xa_store);

/*
 * Compare-and-exchange helper behind __xa_cmpxchg().  The entry found at
 * @index is returned as loaded, without being passed through
 * xa_zero_to_null(); @entry is stored only when that entry equals @old.
 */
static inline void *__xa_cmpxchg_raw(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, index);
        void *curr;

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return XA_ERROR(-EINVAL);

        do {
                curr = xas_load(&xas);
                /* Mismatch: leave the array alone, go to the loop check. */
                if (curr != old)
                        continue;

                xas_store(&xas, entry);
                /* A previously empty slot that now holds an entry is in use. */
                if (entry && !curr && xa_track_free(xa))
                        xas_clear_mark(&xas, XA_FREE_MARK);
        } while (__xas_nomem(&xas, gfp));

        return xas_result(&xas, curr);
}

/**
 * __xa_cmpxchg() - Store this entry in the XArray.
 * @xa: XArray.
 * @index: Index into array.
 * @old: Old value to test against.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * You must already be holding the xa_lock when calling this function.
 * It will drop the lock if needed to allocate memory, and then reacquire
 * it afterwards.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: The old entry at this index or xa_err() if an error happened.
 */
void *__xa_cmpxchg(struct xarray *xa, unsigned long index,
                        void *old, void *entry, gfp_t gfp)
{
        void *prev = __xa_cmpxchg_raw(xa, index, old, entry, gfp);

        return xa_zero_to_null(prev);
}
EXPORT_SYMBOL(__xa_cmpxchg);

/**
 * __xa_insert() - Store this entry in the XArray if no entry is present.
 * @xa: XArray.
 * @index: Index into array.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Inserting a NULL entry will store a reserved entry (like xa_reserve())
 * if no entry is present.  Inserting will fail if a reserved entry is
 * present, even though loading from this index will return NULL.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the store succeeded.  -EBUSY if another entry was present.
 * -ENOMEM if memory could not be allocated.
 */
int __xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp)
{
        void *prev;
        int err;

        if (!entry)
                entry = XA_ZERO_ENTRY;

        /* Only succeeds when the slot currently holds NULL. */
        prev = __xa_cmpxchg_raw(xa, index, NULL, entry, gfp);
        err = xa_err(prev);
        if (err)
                return err;
        if (prev)
                return -EBUSY;

        return 0;
}
EXPORT_SYMBOL(__xa_insert);

#ifdef CONFIG_XARRAY_MULTI
/*
 * Position @xas at @first and choose xas->xa_shift / xas->xa_sibs for the
 * next store of the range [@first, @last].
 *
 * NOTE(review): the intent appears to be to pick the largest aligned
 * block starting at @first that does not extend beyond @last, so that
 * xa_store_range() can cover the range in as few stores as possible --
 * confirm.
 */
static void xas_set_range(struct xa_state *xas, unsigned long first,
                unsigned long last)
{
        unsigned int shift = 0;
        /* Number of indices after @first still to be covered. */
        unsigned long sibs = last - first;
        /* XA_CHUNK_MASK until the first level has been climbed. */
        unsigned int offset = XA_CHUNK_MASK;

        xas_set(xas, first);

        /* Climb one level for as long as @first is aligned to a whole node. */
        while ((first & XA_CHUNK_MASK) == 0) {
                /* The remainder does not fill a whole node at this level. */
                if (sibs < XA_CHUNK_MASK)
                        break;
                /*
                 * NOTE(review): presumably stops when a lower level was
                 * only partially covered, so climbing again would overrun
                 * @last -- confirm.
                 */
                if ((sibs == XA_CHUNK_MASK) && (offset < XA_CHUNK_MASK))
                        break;
                shift += XA_CHUNK_SHIFT;
                /* Record the low bits of sibs the first time we climb. */
                if (offset == XA_CHUNK_MASK)
                        offset = sibs & XA_CHUNK_MASK;
                sibs >>= XA_CHUNK_SHIFT;
                first >>= XA_CHUNK_SHIFT;
        }

        /* Slot of @first within its node at the chosen level. */
        offset = first & XA_CHUNK_MASK;
        /* Do not run past the last slot of the node. */
        if (offset + sibs > XA_CHUNK_MASK)
                sibs = XA_CHUNK_MASK - offset;
        /* Drop one slot if the block would end beyond @last. */
        if ((((first + sibs + 1) << shift) - 1) > last)
                sibs -= 1;

        xas->xa_shift = shift;
        xas->xa_sibs = sibs;
}

/**
 * xa_store_range() - Store this entry at a range of indices in the XArray.
 * @xa: XArray.
 * @first: First index to affect.
 * @last: Last index to affect.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * After this function returns, loads from any index between @first and @last,
 * inclusive will return @entry.
 * Storing into an existing multi-index entry updates the entry of every index.
 * The marks associated with the affected indices are unaffected unless
 * @entry is %NULL.
 *
 * Context: Process context.  Takes and releases the xa_lock.  May sleep
 * if the @gfp flags permit.
 * Return: %NULL on success, xa_err(-EINVAL) if @entry cannot be stored in
 * an XArray or if @last is less than @first, or xa_err(-ENOMEM) if memory
 * allocation failed.
 */
void *xa_store_range(struct xarray *xa, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_internal(entry)))
                return XA_ERROR(-EINVAL);
        if (last < first)
                return XA_ERROR(-EINVAL);

        /* The whole locked section is retried while xas_nomem() asks for it. */
        do {
                xas_lock(&xas);
                if (entry) {
                        /*
                         * Create the slot for @last up front, using the
                         * order given by the lowest set bit of @last + 1
                         * (BITS_PER_LONG when @last + 1 wraps to zero).
                         * NOTE(review): presumably this makes the tree
                         * tall enough before the stores below -- confirm.
                         */
                        unsigned int order = BITS_PER_LONG;
                        if (last + 1)
                                order = __ffs(last + 1);
                        xas_set_order(&xas, last, order);
                        xas_create(&xas, true);
                        if (xas_error(&xas))
                                goto unlock;
                }
                /*
                 * Cover the range one block at a time; @first advances by
                 * the size of the block just stored.
                 */
                do {
                        xas_set_range(&xas, first, last);
                        xas_store(&xas, entry);
                        if (xas_error(&xas))
                                goto unlock;
                        first += xas_size(&xas);
                } while (first <= last);
unlock:
                xas_unlock(&xas);
        } while (xas_nomem(&xas, gfp));

        return xas_result(&xas, NULL);
}
EXPORT_SYMBOL(xa_store_range);

/**
 * xas_get_order() - Get the order of an entry.
 * @xas: XArray operation state.
 *
 * Called after xas_load, the xas should not be in an error state.
 *
 * The order is the shift of the node the @xas points into, plus one for
 * each doubling of the distance from the current slot at which a sibling
 * entry is still found.  An @xas without a node reports order 0.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xas_get_order(struct xa_state *xas)
{
        unsigned int slot;
        int order = 0;

        if (!xas->xa_node)
                return 0;

        slot = xas->xa_offset + (1 << order);
        while (slot < XA_CHUNK_SIZE &&
               xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot))) {
                order++;
                slot = xas->xa_offset + (1 << order);
        }

        return order + xas->xa_node->shift;
}
EXPORT_SYMBOL_GPL(xas_get_order);

/**
 * xa_get_order() - Get the order of an entry.
 * @xa: XArray.
 * @index: Index of the entry.
 *
 * Looks up @index under the RCU read lock and, if an entry is present,
 * reports its order via xas_get_order(); otherwise the order is 0.
 *
 * Return: A number between 0 and 63 indicating the order of the entry.
 */
int xa_get_order(struct xarray *xa, unsigned long index)
{
        XA_STATE(xas, xa, index);
        int order = 0;

        rcu_read_lock();
        if (xas_load(&xas))
                order = xas_get_order(&xas);
        rcu_read_unlock();

        return order;
}
EXPORT_SYMBOL(xa_get_order);
#endif /* CONFIG_XARRAY_MULTI */

/**
 * __xa_alloc() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @limit: Range for allocated ID.
 * @entry: New entry.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * A %NULL @entry is stored as XA_ZERO_ENTRY.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 on success, -ENOMEM if memory could not be allocated or
 * -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, gfp_t gfp)
{
        XA_STATE(xas, xa, 0);

        if (WARN_ON_ONCE(xa_is_advanced(entry)))
                return -EINVAL;
        if (WARN_ON_ONCE(!xa_track_free(xa)))
                return -EINVAL;

        if (!entry)
                entry = XA_ZERO_ENTRY;

        /* Search and store again for as long as __xas_nomem() asks for it. */
        for (;;) {
                xas.xa_index = limit.min;
                xas_find_marked(&xas, limit.max, XA_FREE_MARK);
                /* The restart state means nothing free was found up to max. */
                if (xas.xa_node != XAS_RESTART)
                        *id = xas.xa_index;
                else
                        xas_set_err(&xas, -EBUSY);
                xas_store(&xas, entry);
                xas_clear_mark(&xas, XA_FREE_MARK);
                if (!__xas_nomem(&xas, gfp))
                        break;
        }

        return xas_error(&xas);
}
EXPORT_SYMBOL(__xa_alloc);

/**
 * __xa_alloc_cyclic() - Find somewhere to store this entry in the XArray.
 * @xa: XArray.
 * @id: Pointer to ID.
 * @entry: New entry.
 * @limit: Range of allocated ID.
 * @next: Pointer to next ID to allocate.
 * @gfp: Memory allocation flags.
 *
 * Finds an empty entry in @xa between @limit.min and @limit.max,
 * stores the index into the @id pointer, then stores the entry at
 * that index.  A concurrent lookup will not see an uninitialised @id.
 * The search for an empty entry will start at @next and will wrap
 * around if necessary.  On success @next is updated to the ID after the
 * one just allocated.
 *
 * Must only be operated on an xarray initialized with flag XA_FLAGS_ALLOC set
 * in xa_init_flags().
 *
 * Context: Any context.  Expects xa_lock to be held on entry.  May
 * release and reacquire xa_lock if @gfp flags permit.
 * Return: 0 if the allocation succeeded without wrapping.  1 if the
 * allocation succeeded after wrapping, -ENOMEM if memory could not be
 * allocated or -EBUSY if there are no free entries in @limit.
 */
int __xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry,
                struct xa_limit limit, u32 *next, gfp_t gfp)
{
        u32 lower = limit.min;
        int ret;

        /* First pass: search from @next (but never below the lower bound). */
        limit.min = max(lower, *next);
        ret = __xa_alloc(xa, id, entry, limit, gfp);
        /* A wrap recorded by an earlier call is reported (and cleared) now. */
        if (ret == 0 && (xa->xa_flags & XA_FLAGS_ALLOC_WRAPPED)) {
                xa->xa_flags &= ~XA_FLAGS_ALLOC_WRAPPED;
                ret = 1;
        }

        /* Second pass: wrap around and search from the original lower bound. */
        if (ret < 0 && limit.min > lower) {
                limit.min = lower;
                ret = __xa_alloc(xa, id, entry, limit, gfp);
                if (ret == 0)
                        ret = 1;
        }

        if (ret < 0)
                return ret;

        *next = *id + 1;
        /* @next overflowed to 0: remember the wrap for the next caller. */
        if (*next == 0)
                xa->xa_flags |= XA_FLAGS_ALLOC_WRAPPED;

        return ret;
}
EXPORT_SYMBOL(__xa_alloc_cyclic);

/**
 * __xa_set_mark() - Set a mark on an existing entry, lock already held.
 * @xa: XArray.
 * @index: Index of the entry to mark.
 * @mark: Mark number.
 *
 * The mark is applied only when the load at @index yields a non-%NULL
 * entry; for a %NULL entry the call does nothing.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* Nothing stored at @index: there is no entry to mark. */
        if (!xas_load(&xas))
                return;

        xas_set_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_set_mark);

/**
 * __xa_clear_mark() - Clear a mark on an existing entry, lock already held.
 * @xa: XArray.
 * @index: Index of the entry to unmark.
 * @mark: Mark number.
 *
 * The mark is cleared only when the load at @index yields a non-%NULL
 * entry; for a %NULL entry the call does nothing.
 *
 * Context: Any context.  Expects xa_lock to be held on entry.
 */
void __xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);

        /* Nothing stored at @index: nothing to clear. */
        if (!xas_load(&xas))
                return;

        xas_clear_mark(&xas, mark);
}
EXPORT_SYMBOL(__xa_clear_mark);

/**
 * xa_get_mark() - Test whether the entry at an index carries a mark.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * The lookup is protected only by the RCU read lock, so the answer may
 * already be stale when the function returns.  Use a lock if you need a
 * stable result.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: True if the entry at @index has this mark set, false if it doesn't.
 */
bool xa_get_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        XA_STATE(xas, xa, index);
        bool marked = false;
        void *cur;

        rcu_read_lock();
        /*
         * Descend from the start position for as long as the current
         * position still has @mark set.  Reaching something that is not a
         * node while the mark is still set means the answer is "marked".
         */
        for (cur = xas_start(&xas); xas_get_mark(&xas, mark);
             cur = xas_descend(&xas, xa_to_node(cur))) {
                if (!xa_is_node(cur)) {
                        marked = true;
                        break;
                }
        }
        rcu_read_unlock();

        return marked;
}
EXPORT_SYMBOL(xa_get_mark);

/**
 * xa_set_mark() - Set this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Locking wrapper around __xa_set_mark(): takes the xa_lock, sets the
 * mark and drops the lock again.
 *
 * Attempting to set a mark on a %NULL entry does not succeed.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_set_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        xa_lock(xa);
        __xa_set_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_set_mark);

/**
 * xa_clear_mark() - Clear this mark on this entry.
 * @xa: XArray.
 * @index: Index of entry.
 * @mark: Mark number.
 *
 * Locking wrapper around __xa_clear_mark(): takes the xa_lock, clears
 * the mark and drops the lock again.
 *
 * Clearing a mark always succeeds.
 *
 * Context: Process context.  Takes and releases the xa_lock.
 */
void xa_clear_mark(struct xarray *xa, unsigned long index, xa_mark_t mark)
{
        xa_lock(xa);
        __xa_clear_mark(xa, index, mark);
        xa_unlock(xa);
}
EXPORT_SYMBOL(xa_clear_mark);

/**
 * xa_find() - Search the XArray for an entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Looks for the lowest-indexed entry in @xa that matches @filter and whose
 * index is at least @indexp and at most @max.  When one is found, @indexp
 * is updated to its index; otherwise @indexp is left alone.
 * The search runs under the RCU read lock, so entries that are being added
 * concurrently may be missed.  An %XA_RETRY_ENTRY is never returned; use
 * xas_find() if you need to see retry entries.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The entry, if found, otherwise %NULL.
 */
void *xa_find(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp);
        void *found;

        rcu_read_lock();
        do {
                /* A filter below XA_MAX_MARKS selects by mark. */
                found = ((__force unsigned int)filter < XA_MAX_MARKS) ?
                        xas_find_marked(&xas, max, filter) :
                        xas_find(&xas, max);
        } while (xas_retry(&xas, found));
        rcu_read_unlock();

        if (!found)
                return NULL;

        *indexp = xas.xa_index;
        return found;
}
EXPORT_SYMBOL(xa_find);

/*
 * Report whether xas->xa_index, reduced to the range covered by the current
 * node, lies above the first index of the slot at xas->xa_offset.  Always
 * false without CONFIG_XARRAY_MULTI or when the state has no node.
 */
static bool xas_sibling(struct xa_state *xas)
{
        struct xa_node *node = xas->xa_node;
        unsigned long node_mask;
        unsigned long slot_start;

        if (!IS_ENABLED(CONFIG_XARRAY_MULTI))
                return false;
        if (!node)
                return false;

        node_mask = (XA_CHUNK_SIZE << node->shift) - 1;
        slot_start = (unsigned long)xas->xa_offset << node->shift;

        return (xas->xa_index & node_mask) > slot_start;
}

/**
 * xa_find_after() - Search the XArray for a present entry.
 * @xa: XArray.
 * @indexp: Pointer to an index.
 * @max: Maximum index to search to.
 * @filter: Selection criterion.
 *
 * Finds the entry in @xa which matches the @filter and has the lowest
 * index that is above @indexp and no more than @max.
 * If an entry is found, @indexp is updated to be the index of the entry.
 * This function is protected by the RCU read lock, so it may miss entries
 * which are being simultaneously added.  It will not return an
 * %XA_RETRY_ENTRY; if you need to see retry entries, use xas_find().
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The pointer, if found, otherwise %NULL.
 */
void *xa_find_after(struct xarray *xa, unsigned long *indexp,
                        unsigned long max, xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp + 1);
        void *entry;

        /*
         * *indexp + 1 wrapped around to 0, i.e. *indexp was already the
         * largest possible index: nothing can come after it.
         */
        if (xas.xa_index == 0)
                return NULL;

        rcu_read_lock();
        for (;;) {
                /* A filter below XA_MAX_MARKS selects by mark. */
                if ((__force unsigned int)filter < XA_MAX_MARKS)
                        entry = xas_find_marked(&xas, max, filter);
                else
                        entry = xas_find(&xas, max);

                /* Stop as soon as the state is no longer valid. */
                if (xas_invalid(&xas))
                        break;
                /*
                 * Skip a hit whose index lies past the start of its slot
                 * (see xas_sibling()) and search again -- presumably a
                 * multi-index entry that began at or before *indexp;
                 * confirm.  xas_retry() is deliberately not consulted for
                 * such hits.
                 */
                if (xas_sibling(&xas))
                        continue;
                /* Anything that is not a retry entry is the answer. */
                if (!xas_retry(&xas, entry))
                        break;
        }
        rcu_read_unlock();

        /* Only report an index when something was actually found. */
        if (entry)
                *indexp = xas.xa_index;
        return entry;
}
EXPORT_SYMBOL(xa_find_after);

/*
 * Copy entries visited by xas_for_each() (up to index @max) into @dst under
 * the RCU read lock, skipping retry entries, and stop once @n have been
 * stored.  Returns the number of entries stored.
 *
 * The count is compared with @n only after a store, so @n == 0 would never
 * stop the loop; xa_extract() returns early for that case before calling.
 */
static unsigned int xas_extract_present(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each(xas, entry, max) {
                if (!xas_retry(xas, entry)) {
                        dst[copied] = entry;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/*
 * Copy entries visited by xas_for_each_marked() for @mark (up to index @max)
 * into @dst under the RCU read lock, skipping retry entries, and stop once
 * @n have been stored.  Returns the number of entries stored.
 *
 * The count is compared with @n only after a store, so @n == 0 would never
 * stop the loop; xa_extract() returns early for that case before calling.
 */
static unsigned int xas_extract_marked(struct xa_state *xas, void **dst,
                        unsigned long max, unsigned int n, xa_mark_t mark)
{
        unsigned int copied = 0;
        void *entry;

        rcu_read_lock();
        xas_for_each_marked(xas, entry, max, mark) {
                if (!xas_retry(xas, entry)) {
                        dst[copied] = entry;
                        if (++copied == n)
                                break;
                }
        }
        rcu_read_unlock();

        return copied;
}

/**
 * xa_extract() - Copy selected entries from the XArray into a normal array.
 * @xa: The source XArray to copy from.
 * @dst: The buffer to copy entries into.
 * @start: The first index in the XArray eligible to be selected.
 * @max: The last index in the XArray eligible to be selected.
 * @n: The maximum number of entries to copy.
 * @filter: Selection criterion.
 *
 * Copies at most @n entries matching @filter whose indices lie between
 * @start and @max, inclusive.  Nothing is copied when @n is zero.
 *
 * @filter is either an XArray mark value, selecting the entries that carry
 * that mark, or %XA_PRESENT, selecting every entry that is not %NULL.
 *
 * The result is not necessarily a snapshot of the XArray at a single moment
 * in time.  For example, if another thread stores to index 5, then index
 * 10, xa_extract() may return the old contents of index 5 and the new
 * contents of index 10.  Indices not modified while this function is
 * running will not be skipped.
 *
 * If you need stronger guarantees, holding the xa_lock across calls to this
 * function will prevent concurrent modification.
 *
 * Context: Any context.  Takes and releases the RCU lock.
 * Return: The number of entries copied.
 */
unsigned int xa_extract(struct xarray *xa, void **dst, unsigned long start,
                        unsigned long max, unsigned int n, xa_mark_t filter)
{
        XA_STATE(xas, xa, start);

        if (n == 0)
                return 0;

        /* A filter below XA_MAX_MARKS is a mark; anything else copies all. */
        return ((__force unsigned int)filter < XA_MAX_MARKS) ?
                xas_extract_marked(&xas, dst, max, n, filter) :
                xas_extract_present(&xas, dst, max, n);
}
EXPORT_SYMBOL(xa_extract);

/**
 * xa_delete_node() - Private interface for workingset code.
 * @node: Node to be removed from the tree.
 * @update: Function to call to update ancestor nodes.
 *
 * Builds an operation state that addresses @node's slot in its parent and
 * stores %NULL there.
 *
 * Context: xa_lock must be held on entry and will not be released.
 */
void xa_delete_node(struct xa_node *node, xa_update_node_t update)
{
        /* Shift of the slot in the parent that holds @node. */
        unsigned int parent_shift = node->shift + XA_CHUNK_SHIFT;
        struct xa_state xas = {
                .xa = node->array,
                .xa_index = (unsigned long)node->offset << parent_shift,
                .xa_shift = parent_shift,
                .xa_offset = node->offset,
                .xa_node = xa_parent_locked(node->array, node),
                .xa_update = update,
        };

        xas_store(&xas, NULL);
}
EXPORT_SYMBOL_GPL(xa_delete_node);        /* For the benefit of the test suite */

/**
 * xa_destroy() - Free all internal data structures.
 * @xa: XArray.
 *
 * After calling this function, the XArray is empty and has freed all memory
 * allocated for its internal data structures.  You are responsible for
 * freeing the objects referenced by the XArray.
 *
 * Context: Any context.  Takes and releases the xa_lock, interrupt-safe.
 */
void xa_destroy(struct xarray *xa)
{
        XA_STATE(xas, xa, 0);
        unsigned long flags;
        void *entry;

        /* Point the state at no node before doing anything else. */
        xas.xa_node = NULL;
        xas_lock_irqsave(&xas, flags);
        /* Remember the old head, then publish an empty array. */
        entry = xa_head_locked(xa);
        RCU_INIT_POINTER(xa->xa_head, NULL);
        /*
         * NOTE(review): presumably resets the array-level marks for the
         * now-empty array -- confirm against xas_init_marks().
         */
        xas_init_marks(&xas);
        /* For arrays where xa_zero_busy() holds, also clear XA_FREE_MARK. */
        if (xa_zero_busy(xa))
                xa_mark_clear(xa, XA_FREE_MARK);
        /* lockdep checks we're still holding the lock in xas_free_nodes() */
        /* A head that is not a node has no internal nodes to free. */
        if (xa_is_node(entry))
                xas_free_nodes(&xas, xa_to_node(entry));
        xas_unlock_irqrestore(&xas, flags);
}
EXPORT_SYMBOL(xa_destroy);

#ifdef XA_DEBUG
/*
 * Debug helper: print one node's header fields followed by every word of
 * its mark bitmaps.  A NULL @node prints nothing.
 */
void xa_dump_node(const struct xa_node *node)
{
        unsigned int mark, word;

        if (node == NULL)
                return;

        /* Low two bits set: print the raw value only, do not dereference. */
        if (((unsigned long)node & 3) != 0) {
                pr_cont("node %px\n", node);
                return;
        }

        pr_cont("node %px %s %d parent %px shift %d count %d values %d "
                "array %px list %px %px marks",
                node, node->parent ? "offset" : "max", node->offset,
                node->parent, node->shift, node->count, node->nr_values,
                node->array, node->private_list.prev, node->private_list.next);

        for (mark = 0; mark < XA_MAX_MARKS; mark++) {
                for (word = 0; word < XA_MARK_LONGS; word++)
                        pr_cont(" %lx", node->marks[mark][word]);
        }
        pr_cont("\n");
}

/*
 * Debug helper: print the index, or index range, covered by an entry that
 * starts at @index and spans 2^@shift indices.
 */
void xa_dump_index(unsigned long index, unsigned int shift)
{
        /* A single index. */
        if (shift == 0) {
                pr_info("%lu: ", index);
                return;
        }

        /* 1UL << shift is not usable here: print the whole index space. */
        if (shift >= BITS_PER_LONG) {
                pr_info("0-%lu: ", ~0UL);
                return;
        }

        pr_info("%lu-%lu: ", index, index | ((1UL << shift) - 1));
}

/*
 * Debug helper: print @entry, prefixed by the index range it covers.  A node
 * entry with a non-zero @shift is printed via xa_dump_node() and then each
 * of its slots is dumped recursively.  A NULL @entry prints nothing.
 */
void xa_dump_entry(const void *entry, unsigned long index, unsigned long shift)
{
        if (!entry)
                return;

        xa_dump_index(index, shift);

        if (xa_is_node(entry)) {
                struct xa_node *node;
                unsigned long slot;

                /* With a zero shift only the pointer value is printed. */
                if (shift == 0) {
                        pr_cont("%px\n", entry);
                        return;
                }

                node = xa_to_node(entry);
                xa_dump_node(node);
                for (slot = 0; slot < XA_CHUNK_SIZE; slot++)
                        xa_dump_entry(node->slots[slot],
                              index + (slot << node->shift), node->shift);
                return;
        }

        if (xa_is_value(entry)) {
                pr_cont("value %ld (0x%lx) [%px]\n", xa_to_value(entry),
                                                xa_to_value(entry), entry);
                return;
        }

        /* Neither node, value nor internal: print the pointer itself. */
        if (!xa_is_internal(entry)) {
                pr_cont("%px\n", entry);
                return;
        }

        /* Remaining cases are internal entries. */
        if (xa_is_retry(entry))
                pr_cont("retry (%ld)\n", xa_to_internal(entry));
        else if (xa_is_sibling(entry))
                pr_cont("sibling (slot %ld)\n", xa_to_sibling(entry));
        else if (xa_is_zero(entry))
                pr_cont("zero (%ld)\n", xa_to_internal(entry));
        else
                pr_cont("UNKNOWN ENTRY (%px)\n", entry);
}

/*
 * Debug helper: print the array's head pointer, flags and whether each of
 * XA_MARK_0..XA_MARK_2 is set, then dump everything reachable from the head.
 */
void xa_dump(const struct xarray *xa)
{
        void *head = xa->xa_head;
        unsigned int shift;

        pr_info("xarray: %px head %px flags %x marks %d %d %d\n", xa, head,
                        xa->xa_flags, xa_marked(xa, XA_MARK_0),
                        xa_marked(xa, XA_MARK_1), xa_marked(xa, XA_MARK_2));

        /* A node head covers one chunk more than the node's own shift. */
        shift = xa_is_node(head) ?
                xa_to_node(head)->shift + XA_CHUNK_SHIFT : 0;
        xa_dump_entry(head, 0, shift);
}
#endif


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 






    3 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2009 Patrick McHardy <kaber@trash.net>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/vmalloc.h>
#include <linux/rhashtable.h>
#include <linux/audit.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_offload.h>
#include <net/net_namespace.h>
#include <net/sock.h>

/*
 * MODULE_NAME_LEN minus sizeof("nft-expr-255-"); the sizeof also counts the
 * literal's terminating NUL.
 * NOTE(review): presumably the longest type name that can follow that prefix
 * in an autoload module name -- confirm at the use sites.
 */
#define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
/* NOTE(review): presumably the length limit for anonymous set names -- confirm at the use sites. */
#define NFT_SET_MAX_ANONLEN 16

/*
 * limit compaction to avoid huge kmalloc/krealloc sizes.
 *
 * Evaluates to the number of struct nft_trans_one_elem entries that fit in
 * 2048 bytes once a struct nft_trans_elem header has been accounted for.
 */
#define NFT_MAX_SET_NELEMS ((2048 - sizeof(struct nft_trans_elem)) / sizeof(struct nft_trans_one_elem))

/* NOTE(review): presumably the pernet id behind nft_pernet() -- confirm where it is registered. */
unsigned int nf_tables_net_id __read_mostly;

/*
 * File-scope list heads and the spinlocks declared alongside them.
 * NOTE(review): judging by the names, the two locks guard a destroy list and
 * nf_tables_gc_list respectively -- confirm against the code that takes them.
 */
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);

/*
 * Values stored in a table's validate_state; see
 * nft_validate_state_update() for the permitted transitions.
 */
enum {
        NFT_VALIDATE_SKIP = 0,
        NFT_VALIDATE_NEED = 1,
        NFT_VALIDATE_DO   = 2,
};

/* NOTE(review): presumably set up with nft_objname_ht_params -- confirm at the init site. */
static struct rhltable nft_objname_ht;

/* Hash/compare callbacks referenced by nft_chain_ht_params below. */
static u32 nft_chain_hash(const void *data, u32 len, u32 seed);
static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed);
static int nft_chain_hash_cmp(struct rhashtable_compare_arg *, const void *);

/* Hash/compare callbacks referenced by nft_objname_ht_params below. */
static u32 nft_objname_hash(const void *data, u32 len, u32 seed);
static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed);
static int nft_objname_hash_cmp(struct rhashtable_compare_arg *, const void *);

/*
 * Hash table parameters for struct nft_chain entries: they are linked through
 * the rhlhead member and keyed on the name member, with the nft_chain_hash*
 * callbacks supplying hashing and comparison. Automatic shrinking is enabled.
 */
static const struct rhashtable_params nft_chain_ht_params = {
        .head_offset                = offsetof(struct nft_chain, rhlhead),
        .key_offset                = offsetof(struct nft_chain, name),
        .hashfn                        = nft_chain_hash,
        .obj_hashfn                = nft_chain_hash_obj,
        .obj_cmpfn                = nft_chain_hash_cmp,
        .automatic_shrinking        = true,
};

/*
 * Hash table parameters for struct nft_object entries: they are linked through
 * the rhlhead member and keyed on the key member, with the nft_objname_hash*
 * callbacks supplying hashing and comparison. Automatic shrinking is enabled.
 */
static const struct rhashtable_params nft_objname_ht_params = {
        .head_offset                = offsetof(struct nft_object, rhlhead),
        .key_offset                = offsetof(struct nft_object, key),
        .hashfn                        = nft_objname_hash,
        .obj_hashfn                = nft_objname_hash_obj,
        .obj_cmpfn                = nft_objname_hash_cmp,
        .automatic_shrinking        = true,
};

/*
 * List-linkable audit record that pairs a table with an operation code and an
 * entry count.
 * NOTE(review): the users of this struct are outside this view; presumably
 * one record is kept per (table, op) while audit entries are tallied, and
 * "op" holds an AUDIT_NFT_OP_* value -- confirm against those users.
 */
struct nft_audit_data {
        struct nft_table *table;
        int entries;
        int op;
        struct list_head list;
};

/*
 * Translation table from an NFT_MSG_* message type (the array index) to the
 * matching AUDIT_NFT_OP_* code. NEW* and DEL* messages map to the
 * corresponding *_REGISTER and *_UNREGISTER ops. The GET* messages and
 * NFT_MSG_TRACE map to AUDIT_NFT_OP_INVALID, apart from the two *_RESET
 * getters, which have dedicated *_RESET ops.
 */
static const u8 nft2audit_op[NFT_MSG_MAX] = { // enum nf_tables_msg_types
        [NFT_MSG_NEWTABLE]        = AUDIT_NFT_OP_TABLE_REGISTER,
        [NFT_MSG_GETTABLE]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELTABLE]        = AUDIT_NFT_OP_TABLE_UNREGISTER,
        [NFT_MSG_NEWCHAIN]        = AUDIT_NFT_OP_CHAIN_REGISTER,
        [NFT_MSG_GETCHAIN]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELCHAIN]        = AUDIT_NFT_OP_CHAIN_UNREGISTER,
        [NFT_MSG_NEWRULE]        = AUDIT_NFT_OP_RULE_REGISTER,
        [NFT_MSG_GETRULE]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELRULE]        = AUDIT_NFT_OP_RULE_UNREGISTER,
        [NFT_MSG_NEWSET]        = AUDIT_NFT_OP_SET_REGISTER,
        [NFT_MSG_GETSET]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELSET]        = AUDIT_NFT_OP_SET_UNREGISTER,
        [NFT_MSG_NEWSETELEM]        = AUDIT_NFT_OP_SETELEM_REGISTER,
        [NFT_MSG_GETSETELEM]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELSETELEM]        = AUDIT_NFT_OP_SETELEM_UNREGISTER,
        [NFT_MSG_NEWGEN]        = AUDIT_NFT_OP_GEN_REGISTER,
        [NFT_MSG_GETGEN]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_TRACE]                = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_NEWOBJ]        = AUDIT_NFT_OP_OBJ_REGISTER,
        [NFT_MSG_GETOBJ]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELOBJ]        = AUDIT_NFT_OP_OBJ_UNREGISTER,
        [NFT_MSG_GETOBJ_RESET]        = AUDIT_NFT_OP_OBJ_RESET,
        [NFT_MSG_NEWFLOWTABLE]        = AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        [NFT_MSG_GETFLOWTABLE]        = AUDIT_NFT_OP_INVALID,
        [NFT_MSG_DELFLOWTABLE]        = AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        [NFT_MSG_GETSETELEM_RESET] = AUDIT_NFT_OP_SETELEM_RESET,
};

/*
 * Record a new validation state on @table.
 *
 * A table that is already in NFT_VALIDATE_DO is never moved back to
 * NFT_VALIDATE_NEED; that request is silently ignored. Jumping straight from
 * NFT_VALIDATE_SKIP to NFT_VALIDATE_DO raises a one-time warning, but the new
 * state is still stored.
 */
static void nft_validate_state_update(struct nft_table *table, u8 new_validate_state)
{
        if (table->validate_state == NFT_VALIDATE_SKIP) {
                WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO);
        } else if (table->validate_state == NFT_VALIDATE_DO &&
                   new_validate_state == NFT_VALIDATE_NEED) {
                /* Never downgrade DO to NEED. */
                return;
        }

        table->validate_state = new_validate_state;
}
/* Forward declaration; the definition is not in this part of the file. */
static void nf_tables_trans_destroy_work(struct work_struct *w);

/* Statically declared work item whose handler is nft_trans_gc_work(). */
static void nft_trans_gc_work(struct work_struct *work);
static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);

/*
 * Populate @ctx for the request currently being handled.
 *
 * @net, @family, @table, @chain and @nla are stored as given. The sender's
 * port id is read from the netlink control block of @skb, and the report
 * flag, message flags and sequence number come from @nlh. The level is reset
 * to 0 and the reg_inited bitmap is cleared.
 */
static void nft_ctx_init(struct nft_ctx *ctx,
                         struct net *net,
                         const struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         u8 family,
                         struct nft_table *table,
                         struct nft_chain *chain,
                         const struct nlattr * const *nla)
{
        /* Values handed in directly by the caller. */
        ctx->net = net;
        ctx->family = family;
        ctx->table = table;
        ctx->chain = chain;
        ctx->nla = nla;

        /* Values derived from the netlink message and its skb. */
        ctx->seq = nlh->nlmsg_seq;
        ctx->flags = nlh->nlmsg_flags;
        ctx->report = nlmsg_report(nlh);
        ctx->portid = NETLINK_CB(skb).portid;

        /* Fresh per-request state. */
        ctx->level = 0;
        bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
}

/*
 * Allocate a zero-filled transaction of @size bytes with allocation flags
 * @gfp and seed it from @ctx.
 *
 * The list head is initialised (the transaction is not queued anywhere),
 * msg_type is set to @msg_type, and net, table, seq, flags and report are
 * copied from @ctx.
 *
 * NOTE(review): @size is presumably the size of the message-specific
 * structure that embeds struct nft_trans -- confirm at the callers.
 *
 * Returns the new transaction, or NULL if the allocation failed.
 */
static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
                                             int msg_type, u32 size, gfp_t gfp)
{
        struct nft_trans *new_trans = kzalloc(size, gfp);

        if (!new_trans)
                return NULL;

        INIT_LIST_HEAD(&new_trans->list);
        new_trans->msg_type = msg_type;

        /* Inherit the request details from the context. */
        new_trans->net = ctx->net;
        new_trans->table = ctx->table;
        new_trans->seq = ctx->seq;
        new_trans->flags = ctx->flags;
        new_trans->report = ctx->report;

        return new_trans;
}

/*
 * Convenience wrapper around nft_trans_alloc_gfp() that always allocates with
 * GFP_KERNEL. Returns the new transaction, or NULL on allocation failure.
 */
static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
                                         int msg_type, u32 size)
{
        return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
}

/*
 * Return the struct nft_trans_binding that embeds @trans.
 *
 * Only NFT_MSG_NEWCHAIN and NFT_MSG_NEWSET transactions are treated as being
 * embedded in a binding; every other message type yields NULL.
 */
static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
{
        if (trans->msg_type == NFT_MSG_NEWCHAIN ||
            trans->msg_type == NFT_MSG_NEWSET)
                return container_of(trans, struct nft_trans_binding, nft_trans);

        return NULL;
}

static void nft_trans_list_del(struct nft_trans *trans)
{
        struct nft_trans_binding *trans_binding;

        list_del(&trans->list);

        trans_binding = nft_trans_get_binding(trans);
        if (trans_binding)
                list_del(&trans_binding->binding_list);
}

/*
 * Unlink @trans via nft_trans_list_del() and free it with kfree().
 * @trans must not be used after this call.
 */
static void nft_trans_destroy(struct nft_trans *trans)
{
        nft_trans_list_del(trans);
        kfree(trans);
}

/*
 * Set the "bound" flag to @bind on every pending transaction that refers to
 * the anonymous set @set.
 *
 * The commit list of @ctx's network namespace is walked from the tail towards
 * the head; NFT_MSG_NEWSET and NFT_MSG_NEWSETELEM entries that refer to @set
 * are updated. Nothing is done when @set is not anonymous.
 */
static void __nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set,
                                 bool bind)
{
        struct nftables_pernet *pernet;
        struct nft_trans *cur;

        if (!nft_set_is_anonymous(set))
                return;

        pernet = nft_pernet(ctx->net);
        list_for_each_entry_reverse(cur, &pernet->commit_list, list) {
                /* The accessors are only used once msg_type has been matched. */
                if (cur->msg_type == NFT_MSG_NEWSET) {
                        if (nft_trans_set(cur) == set)
                                nft_trans_set_bound(cur) = bind;
                } else if (cur->msg_type == NFT_MSG_NEWSETELEM) {
                        if (nft_trans_elem_set(cur) == set)
                                nft_trans_elem_set_bound(cur) = bind;
                }
        }
}

/* Flag the pending transactions of anonymous @set as bound. */
static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set)
{
        __nft_set_trans_bind(ctx, set, true);
}

/* Flag the pending transactions of anonymous @set as no longer bound. */
static void nft_set_trans_unbind(const struct nft_ctx *ctx, struct nft_set *set)
{
        __nft_set_trans_bind(ctx, set, false);
}

/*
 * Record the bound state of a binding @chain in the pending transactions.
 *
 * Walks the commit list from the tail and stores @bind in every NEWCHAIN
 * transaction for @chain and every NEWRULE transaction whose rule lives in
 * @chain.  Chains for which nft_chain_binding() is false are ignored.
 */
static void __nft_chain_trans_bind(const struct nft_ctx *ctx,
                                   struct nft_chain *chain, bool bind)
{
        struct nftables_pernet *nft_net;
        struct nft_trans *trans;

        if (!nft_chain_binding(chain))
                return;

        nft_net = nft_pernet(ctx->net);
        list_for_each_entry_reverse(trans, &nft_net->commit_list, list) {
                if (trans->msg_type == NFT_MSG_NEWCHAIN) {
                        if (nft_trans_chain(trans) == chain)
                                nft_trans_chain_bound(trans) = bind;
                } else if (trans->msg_type == NFT_MSG_NEWRULE) {
                        if (nft_trans_rule_chain(trans) == chain)
                                nft_trans_rule_bound(trans) = bind;
                }
        }
}

static void nft_chain_trans_bind(const struct nft_ctx *ctx,
                                 struct nft_chain *chain)
{
        __nft_chain_trans_bind(ctx, chain, true);
}

/*
 * Bind @chain to the rule being built in @ctx.
 *
 * Does nothing (returns 0) when @chain is not a binding chain.  Otherwise:
 *   -EOPNOTSUPP  the chain in @ctx is itself a binding chain,
 *   -EBUSY       @chain is already bound,
 *   -EMFILE      the use counter of @chain could not be incremented.
 * On success the chain is marked bound, its use counter has been taken and
 * the pending transactions for it are flagged as bound.
 */
int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
{
        int err = 0;

        if (!nft_chain_binding(chain))
                return 0;

        /* The checks are evaluated strictly in this order. */
        if (nft_chain_binding(ctx->chain))
                err = -EOPNOTSUPP;
        else if (chain->bound)
                err = -EBUSY;
        else if (!nft_use_inc(&chain->use))
                err = -EMFILE;

        if (err)
                return err;

        chain->bound = true;
        nft_chain_trans_bind(ctx, chain);

        return 0;
}

void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain)
{
        __nft_chain_trans_bind(ctx, chain, false);
}

/*
 * Register every hook on @hook_list with the netfilter core.
 *
 * All or nothing: if one registration fails, the hooks registered so far
 * are unregistered again and the error of the failing registration is
 * returned.  Returns 0 when every hook was registered.
 */
static int nft_netdev_register_hooks(struct net *net,
                                     struct list_head *hook_list)
{
        struct nft_hook *hook;
        int registered = 0;
        int err;

        list_for_each_entry(hook, hook_list, list) {
                err = nf_register_net_hook(net, &hook->ops);
                if (err < 0)
                        goto err_unwind;

                registered++;
        }

        return 0;

err_unwind:
        /* Only the first @registered entries were hooked; undo those. */
        list_for_each_entry(hook, hook_list, list) {
                if (registered <= 0)
                        break;

                nf_unregister_net_hook(net, &hook->ops);
                registered--;
        }

        return err;
}

/*
 * Unregister every hook on @hook_list.
 *
 * With @release_netdev set, each hook is additionally removed from the
 * list and freed via kfree_rcu(); otherwise the list is left intact.
 */
static void nft_netdev_unregister_hooks(struct net *net,
                                        struct list_head *hook_list,
                                        bool release_netdev)
{
        struct nft_hook *hook, *tmp;

        /* _safe variant: entries may be unlinked while walking. */
        list_for_each_entry_safe(hook, tmp, hook_list, list) {
                nf_unregister_net_hook(net, &hook->ops);

                if (!release_netdev)
                        continue;

                list_del(&hook->list);
                kfree_rcu(hook, rcu);
        }
}

/*
 * Register the hook(s) of @chain.
 *
 * Nothing is done (returns 0) for dormant tables and for chains that are
 * not base chains.  Otherwise registration goes, in order of preference,
 * through the chain type's ops_register() callback, the per-device hook
 * list (netdev base chains), or a plain nf_register_net_hook().
 */
static int nf_tables_register_hook(struct net *net,
                                   const struct nft_table *table,
                                   struct nft_chain *chain)
{
        struct nft_base_chain *basechain;

        if ((table->flags & NFT_TABLE_F_DORMANT) ||
            !nft_is_base_chain(chain))
                return 0;

        basechain = nft_base_chain(chain);

        if (basechain->type->ops_register)
                return basechain->type->ops_register(net, &basechain->ops);

        if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
                return nft_netdev_register_hooks(net, &basechain->hook_list);

        return nf_register_net_hook(net, &basechain->ops);
}

/*
 * Unregister the hook(s) of @chain; counterpart of
 * nf_tables_register_hook() with the same selection logic.
 *
 * @release_netdev is only meaningful for netdev base chains, where it is
 * forwarded to nft_netdev_unregister_hooks().
 */
static void __nf_tables_unregister_hook(struct net *net,
                                        const struct nft_table *table,
                                        struct nft_chain *chain,
                                        bool release_netdev)
{
        struct nft_base_chain *basechain;

        if ((table->flags & NFT_TABLE_F_DORMANT) ||
            !nft_is_base_chain(chain))
                return;

        basechain = nft_base_chain(chain);

        if (basechain->type->ops_unregister) {
                basechain->type->ops_unregister(net, &basechain->ops);
                return;
        }

        if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
                nft_netdev_unregister_hooks(net, &basechain->hook_list,
                                            release_netdev);
                return;
        }

        nf_unregister_net_hook(net, &basechain->ops);
}

/* Unregister the hook(s) of @chain without releasing netdev hook entries. */
static void nf_tables_unregister_hook(struct net *net,
                                      const struct nft_table *table,
                                      struct nft_chain *chain)
{
        __nf_tables_unregister_hook(net, table, chain, false);
}

static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, const struct nft_trans_elem *b)
{
        /* NB: the ->bound equality check is defensive, at this time we only merge
         * a new nft_trans_elem transaction request with the transaction tail
         * element, but a->bound != b->bound would imply a NEWRULE transaction
         * is queued in-between.
         *
         * The set check is mandatory, the NFT_MAX_SET_NELEMS check prevents
         * huge krealloc() requests.
         */
        return a->set == b->set && a->bound == b->bound && a->nelems < NFT_MAX_SET_NELEMS;
}

/*
 * Try to append the single element carried by @trans to @tail, the element
 * transaction currently at the end of the commit list.
 *
 * @tail is grown with krealloc() using @gfp and re-linked at the end of
 * nft_net->commit_list.  Returns true when the element was copied over;
 * @trans itself is neither linked nor freed here.  Returns false, with
 * @tail back on the commit list, when merging is not allowed, would
 * overflow the element count, or the reallocation fails.
 */
static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
                                        struct nft_trans_elem *tail,
                                        struct nft_trans_elem *trans,
                                        gfp_t gfp)
{
        unsigned int nelems, old_nelems = tail->nelems;
        struct nft_trans_elem *new_trans;

        if (!nft_trans_collapse_set_elem_allowed(tail, trans))
                return false;

        /* "cannot happen", at this time userspace element add
         * requests always allocate a new transaction element.
         *
         * This serves as a reminder to adjust the list_add_tail
         * logic below in case this ever changes.
         */
        if (WARN_ON_ONCE(trans->nelems != 1))
                return false;

        /* nelems = old_nelems + trans->nelems, refusing on wrap-around */
        if (check_add_overflow(old_nelems, trans->nelems, &nelems))
                return false;

        /* krealloc might free tail which invalidates list pointers */
        list_del_init(&tail->nft_trans.list);

        new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
        if (!new_trans) {
                /* tail is still valid on failure: put it back where it was */
                list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
                return false;
        }

        /*
         * new_trans->nft_trans.list contains garbage, but
         * list_add_tail() doesn't care.
         */
        new_trans->nelems = nelems;
        /* only elems[0] is copied, see the nelems != 1 check above */
        new_trans->elems[old_nelems] = trans->elems[0];
        list_add_tail(&new_trans->nft_trans.list, &nft_net->commit_list);

        return true;
}

/*
 * Try to merge @trans into the last transaction on the commit list.
 *
 * Only NEWSETELEM/DELSETELEM transactions whose message type matches the
 * list tail are candidates.  Returns true if @trans was merged, false if
 * the caller still has to queue it.
 */
static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
                                   struct nft_trans *trans, gfp_t gfp)
{
        struct nft_trans *last;

        if (list_empty(&nft_net->commit_list))
                return false;

        last = list_last_entry(&nft_net->commit_list, struct nft_trans, list);
        if (last->msg_type != trans->msg_type)
                return false;

        if (trans->msg_type != NFT_MSG_NEWSETELEM &&
            trans->msg_type != NFT_MSG_DELSETELEM)
                return false;

        return nft_trans_collapse_set_elem(nft_net,
                                           nft_trans_container_elem(last),
                                           nft_trans_container_elem(trans),
                                           gfp);
}

/*
 * Queue @trans at the end of the per-netns commit list.
 *
 * Transactions that carry a binding get extra bookkeeping:
 *  - NEWSET: every such transaction is also linked on commit_set_list; if
 *    it is not an update and the set is anonymous it is put on the
 *    binding list too.
 *  - NEWCHAIN: put on the binding list when it is not an update and the
 *    chain is a binding chain.
 */
static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *trans)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans_binding *binding;

        list_add_tail(&trans->list, &nft_net->commit_list);

        binding = nft_trans_get_binding(trans);
        if (!binding)
                return;

        if (trans->msg_type == NFT_MSG_NEWSET) {
                struct nft_trans_set *trans_set = nft_trans_container_set(trans);

                if (!nft_trans_set_update(trans) &&
                    nft_set_is_anonymous(nft_trans_set(trans)))
                        list_add_tail(&binding->binding_list,
                                      &nft_net->binding_list);

                list_add_tail(&trans_set->list_trans_newset,
                              &nft_net->commit_set_list);
        } else if (trans->msg_type == NFT_MSG_NEWCHAIN) {
                if (!nft_trans_chain_update(trans) &&
                    nft_chain_binding(nft_trans_chain(trans)))
                        list_add_tail(&binding->binding_list,
                                      &nft_net->binding_list);
        }
}

/*
 * Queue a set element transaction.
 *
 * If @trans can be folded into the transaction at the tail of the commit
 * list, its element is copied there and @trans is freed; otherwise @trans
 * is appended to the commit list as-is.  @gfp is used for the potential
 * reallocation of the tail transaction.
 */
static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans,
                                           gfp_t gfp)
{
        struct nftables_pernet *nft_net = nft_pernet(net);

        WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
                     trans->msg_type != NFT_MSG_DELSETELEM);

        might_alloc(gfp);

        if (!nft_trans_try_collapse(nft_net, trans, gfp)) {
                nft_trans_commit_list_add_tail(net, trans);
                return;
        }

        /* The element now lives in the tail transaction. */
        kfree(trans);
}

/*
 * Queue a table transaction of type @msg_type for the table in @ctx.
 *
 * For NFT_MSG_NEWTABLE the table is additionally activated in the next
 * generation.  Returns 0 on success or -ENOMEM.
 */
static int nft_trans_table_add(struct nft_ctx *ctx, int msg_type)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_table));
        if (!trans)
                return -ENOMEM;

        if (msg_type == NFT_MSG_NEWTABLE)
                nft_activate_next(ctx->net, ctx->table);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/*
 * Queue deletion of the table in @ctx and deactivate it in the next
 * generation.  Returns the result of queueing the transaction; the table
 * is left untouched on failure.
 */
static int nft_deltable(struct nft_ctx *ctx)
{
        int err = nft_trans_table_add(ctx, NFT_MSG_DELTABLE);

        if (err < 0)
                return err;

        nft_deactivate_next(ctx->net, ctx->table);

        return err;
}

/*
 * Allocate a chain transaction of type @msg_type for the chain in @ctx.
 *
 * The binding list head is initialised and the chain pointer recorded.
 * The transaction is not queued.  Returns NULL on allocation failure.
 */
static struct nft_trans *
nft_trans_alloc_chain(const struct nft_ctx *ctx, int msg_type)
{
        struct nft_trans_chain *trans_chain;
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type, sizeof(*trans_chain));
        if (trans == NULL)
                return NULL;

        trans_chain = nft_trans_container_chain(trans);
        trans_chain->chain = ctx->chain;
        INIT_LIST_HEAD(&trans_chain->nft_trans_binding.binding_list);

        return trans;
}

static struct nft_trans *nft_trans_chain_add(struct nft_ctx *ctx, int msg_type)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc_chain(ctx, msg_type);
        if (trans == NULL)
                return ERR_PTR(-ENOMEM);

        if (msg_type == NFT_MSG_NEWCHAIN) {
                nft_activate_next(ctx->net, ctx->chain);

                if (ctx->nla[NFTA_CHAIN_ID]) {
                        nft_trans_chain_id(trans) =
                                ntohl(nla_get_be32(ctx->nla[NFTA_CHAIN_ID]));
                }
        }
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return trans;
}

/*
 * Queue deletion of the chain in @ctx.
 *
 * On success the table use counter is dropped and the chain is deactivated
 * in the next generation.  Returns 0 or the error from queueing.
 */
static int nft_delchain(struct nft_ctx *ctx)
{
        struct nft_trans *trans = nft_trans_chain_add(ctx, NFT_MSG_DELCHAIN);

        if (IS_ERR(trans))
                return PTR_ERR(trans);

        nft_use_dec(&ctx->table->use);
        nft_deactivate_next(ctx->net, ctx->chain);

        return 0;
}

/*
 * Invoke the ->activate() callback of every expression in @rule, in
 * order.  Expressions without the callback are skipped.
 */
void nft_rule_expr_activate(const struct nft_ctx *ctx, struct nft_rule *rule)
{
        struct nft_expr *expr;

        for (expr = nft_expr_first(rule);
             nft_expr_more(rule, expr);
             expr = nft_expr_next(expr)) {
                if (!expr->ops->activate)
                        continue;

                expr->ops->activate(ctx, expr);
        }
}

/*
 * Invoke the ->deactivate() callback of every expression in @rule, in
 * order, passing the transaction @phase through.  Expressions without the
 * callback are skipped.
 */
void nft_rule_expr_deactivate(const struct nft_ctx *ctx, struct nft_rule *rule,
                              enum nft_trans_phase phase)
{
        struct nft_expr *expr;

        for (expr = nft_expr_first(rule);
             nft_expr_more(rule, expr);
             expr = nft_expr_next(expr)) {
                if (!expr->ops->deactivate)
                        continue;

                expr->ops->deactivate(ctx, expr, phase);
        }
}

/*
 * Deactivate @rule in the next generation and drop the chain use counter.
 *
 * Returns -ENOENT if the rule is not active in the next generation, i.e.
 * the same rule cannot be deleted twice.
 */
static int
nf_tables_delrule_deactivate(struct nft_ctx *ctx, struct nft_rule *rule)
{
        if (!nft_is_active_next(ctx->net, rule))
                return -ENOENT;

        nft_deactivate_next(ctx->net, rule);
        nft_use_dec(&ctx->chain->use);

        return 0;
}

/*
 * Allocate and queue a rule transaction of type @msg_type for @rule.
 *
 * The rule and the chain from @ctx are recorded; for NFT_MSG_NEWRULE a
 * userspace-supplied NFTA_RULE_ID is stored too.  Returns the transaction
 * or NULL on allocation failure.
 */
static struct nft_trans *nft_trans_rule_add(struct nft_ctx *ctx, int msg_type,
                                            struct nft_rule *rule)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_rule));
        if (!trans)
                return NULL;

        if (msg_type == NFT_MSG_NEWRULE) {
                const struct nlattr *id_attr = ctx->nla[NFTA_RULE_ID];

                if (id_attr != NULL)
                        nft_trans_rule_id(trans) =
                                ntohl(nla_get_be32(id_attr));
        }

        nft_trans_rule(trans) = rule;
        nft_trans_rule_chain(trans) = ctx->chain;

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return trans;
}

/*
 * Queue deletion of @rule from the chain in @ctx.
 *
 * A DELRULE transaction is queued; for chains flagged NFT_CHAIN_HW_OFFLOAD
 * a flow rule is created and attached to it.  The rule is then deactivated
 * in the next generation and its expressions are deactivated for the
 * NFT_TRANS_PREPARE phase.
 *
 * Returns 0 on success, -ENOMEM if the transaction cannot be allocated,
 * the error from nft_flow_rule_create(), or -ENOENT if the rule is not
 * active in the next generation.  On any error the queued transaction is
 * removed again and nothing allocated here is left behind.
 */
static int nft_delrule(struct nft_ctx *ctx, struct nft_rule *rule)
{
        struct nft_flow_rule *flow = NULL;
        struct nft_trans *trans;
        int err;

        trans = nft_trans_rule_add(ctx, NFT_MSG_DELRULE, rule);
        if (trans == NULL)
                return -ENOMEM;

        if (ctx->chain->flags & NFT_CHAIN_HW_OFFLOAD) {
                flow = nft_flow_rule_create(ctx->net, rule);
                if (IS_ERR(flow)) {
                        err = PTR_ERR(flow);
                        goto err_destroy_trans;
                }

                nft_trans_flow_rule(trans) = flow;
        }

        err = nf_tables_delrule_deactivate(ctx, rule);
        if (err < 0)
                goto err_destroy_flow;

        nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_PREPARE);

        return 0;

err_destroy_flow:
        /* nft_trans_destroy() only unlinks and frees the transaction, so
         * the flow rule attached above has to be released explicitly or it
         * would be leaked.
         */
        if (flow)
                nft_flow_rule_destroy(flow);
err_destroy_trans:
        nft_trans_destroy(trans);
        return err;
}

/*
 * Queue deletion of every rule of the chain in @ctx that is still active
 * in the next generation.  Stops at, and returns, the first error.
 */
static int nft_delrule_by_chain(struct nft_ctx *ctx)
{
        struct nft_rule *rule;
        int err;

        list_for_each_entry(rule, &ctx->chain->rules, list) {
                if (nft_is_active_next(ctx->net, rule)) {
                        err = nft_delrule(ctx, rule);
                        if (err < 0)
                                return err;
                }
        }

        return 0;
}

/*
 * Allocate and queue a set transaction of type @msg_type for @set.
 *
 * @desc is NULL for plain add/delete requests.  When given, the
 * transaction is flagged as an update and carries the new gc interval,
 * timeout and size from @desc.  For a NEWSET request that is not an update
 * and comes with NFTA_SET_ID, the id is recorded and the set is activated
 * in the next generation.
 *
 * Returns 0 on success or -ENOMEM.
 */
static int __nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
                               struct nft_set *set,
                               const struct nft_set_desc *desc)
{
        struct nft_trans_set *trans_set;
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type, sizeof(*trans_set));
        if (!trans)
                return -ENOMEM;

        trans_set = nft_trans_container_set(trans);
        INIT_LIST_HEAD(&trans_set->nft_trans_binding.binding_list);
        INIT_LIST_HEAD(&trans_set->list_trans_newset);

        if (!desc && msg_type == NFT_MSG_NEWSET && ctx->nla[NFTA_SET_ID]) {
                nft_trans_set_id(trans) =
                        ntohl(nla_get_be32(ctx->nla[NFTA_SET_ID]));
                nft_activate_next(ctx->net, set);
        }

        nft_trans_set(trans) = set;

        if (desc) {
                nft_trans_set_update(trans) = true;
                nft_trans_set_gc_int(trans) = desc->gc_int;
                nft_trans_set_timeout(trans) = desc->timeout;
                nft_trans_set_size(trans) = desc->size;
        }

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/* Queue a set transaction that is not an update (no descriptor). */
static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type,
                             struct nft_set *set)
{
        const struct nft_set_desc *no_desc = NULL;

        return __nft_trans_set_add(ctx, msg_type, set, no_desc);
}

/*
 * Set walk callback: deactivate one map element.
 *
 * Elements not active for iter->genmask are skipped.  Active ones get
 * their active state changed and their data deactivated.  Always returns
 * 0.
 */
static int nft_mapelem_deactivate(const struct nft_ctx *ctx,
                                  struct nft_set *set,
                                  const struct nft_set_iter *iter,
                                  struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_elem_active(ext, iter->genmask)) {
                nft_set_elem_change_active(ctx->net, set, ext);
                nft_setelem_data_deactivate(ctx->net, set, elem_priv);
        }

        return 0;
}

/*
 * One entry on a set's catchall_list.
 *
 * @list: links the entry into set->catchall_list
 * @rcu:  RCU head; presumably for deferred freeing, the user is not
 *        visible in this part of the file
 * @elem: private data of the catch-all element
 */
struct nft_set_elem_catchall {
        struct list_head        list;
        struct rcu_head                rcu;
        struct nft_elem_priv        *elem;
};

/*
 * Deactivate the catch-all element of @set, if any.
 *
 * Only the first catch-all entry that is active in the next generation is
 * handled; the walk stops right after it.
 */
static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
                                        struct nft_set *set)
{
        struct nft_set_elem_catchall *catchall;
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_ext *ext;

        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, genmask)) {
                        nft_set_elem_change_active(ctx->net, set, ext);
                        nft_setelem_data_deactivate(ctx->net, set,
                                                    catchall->elem);
                        return;
                }
        }
}

/*
 * Deactivate all elements of map @set for the next generation: first the
 * regular elements via the set backend's ->walk(), then the catch-all
 * element.  A walk error is not expected and only triggers a warning.
 */
static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
{
        struct nft_set_iter iter = {
                .fn                = nft_mapelem_deactivate,
                .type                = NFT_ITER_UPDATE,
                .genmask        = nft_genmask_next(ctx->net),
        };

        set->ops->walk(ctx, set, &iter);

        /* nft_mapelem_deactivate() never fails */
        WARN_ON_ONCE(iter.err);

        nft_map_catchall_deactivate(ctx, set);
}

static int nft_delset(const struct nft_ctx *ctx, struct nft_set *set)
{
        int err;

        err = nft_trans_set_add(ctx, NFT_MSG_DELSET, set);
        if (err < 0)
                return err;

        if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                nft_map_deactivate(ctx, set);

        nft_deactivate_next(ctx->net, set);
        nft_use_dec(&ctx->table->use);

        return err;
}

/*
 * Allocate and queue an object transaction of type @msg_type for @obj.
 * NFT_MSG_NEWOBJ additionally activates the object in the next
 * generation.  Returns 0 on success or -ENOMEM.
 */
static int nft_trans_obj_add(struct nft_ctx *ctx, int msg_type,
                             struct nft_object *obj)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type, sizeof(struct nft_trans_obj));
        if (!trans)
                return -ENOMEM;

        if (msg_type == NFT_MSG_NEWOBJ)
                nft_activate_next(ctx->net, obj);

        nft_trans_obj(trans) = obj;

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;
}

/*
 * Queue deletion of @obj; on success the object is deactivated in the next
 * generation and the table use counter dropped.  Returns the result of
 * queueing the transaction.
 */
static int nft_delobj(struct nft_ctx *ctx, struct nft_object *obj)
{
        int err = nft_trans_obj_add(ctx, NFT_MSG_DELOBJ, obj);

        if (err < 0)
                return err;

        nft_deactivate_next(ctx->net, obj);
        nft_use_dec(&ctx->table->use);

        return err;
}

/*
 * Allocate and queue a flowtable transaction of type @msg_type.
 *
 * The transaction's hook list starts out empty.  NFT_MSG_NEWFLOWTABLE
 * additionally activates @flowtable in the next generation.  Returns the
 * transaction or ERR_PTR(-ENOMEM).
 */
static struct nft_trans *
nft_trans_flowtable_add(struct nft_ctx *ctx, int msg_type,
                        struct nft_flowtable *flowtable)
{
        struct nft_trans *trans;

        trans = nft_trans_alloc(ctx, msg_type,
                                sizeof(struct nft_trans_flowtable));
        if (!trans)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
        nft_trans_flowtable(trans) = flowtable;

        if (msg_type == NFT_MSG_NEWFLOWTABLE)
                nft_activate_next(ctx->net, flowtable);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return trans;
}

/*
 * Queue deletion of @flowtable; on success it is deactivated in the next
 * generation and the table use counter dropped.  Returns 0 or the error
 * from queueing the transaction.
 */
static int nft_delflowtable(struct nft_ctx *ctx,
                            struct nft_flowtable *flowtable)
{
        struct nft_trans *trans;

        trans = nft_trans_flowtable_add(ctx, NFT_MSG_DELFLOWTABLE, flowtable);
        if (!IS_ERR(trans)) {
                nft_deactivate_next(ctx->net, flowtable);
                nft_use_dec(&ctx->table->use);
                return 0;
        }

        return PTR_ERR(trans);
}

/*
 * Cancel tracking of the registers preceding @dreg.
 *
 * regs[dreg].num_reg gives how many registers directly below @dreg are
 * cancelled, starting with the one furthest away.  The entry for @dreg
 * itself is not touched.
 */
static void __nft_reg_track_clobber(struct nft_regs_track *track, u8 dreg)
{
        int back = track->regs[dreg].num_reg;

        while (back > 0) {
                __nft_reg_track_cancel(track, dreg - back);
                back--;
        }
}

/*
 * Record @expr as the selector that produced register @dreg.  Any bitwise
 * expression recorded for the register is cleared and @num_reg is stored
 * alongside.
 */
static void __nft_reg_track_update(struct nft_regs_track *track,
                                   const struct nft_expr *expr,
                                   u8 dreg, u8 num_reg)
{
        track->regs[dreg].num_reg = num_reg;
        track->regs[dreg].bitwise = NULL;
        track->regs[dreg].selector = expr;
}

/*
 * Track @expr as the producer of the @len bytes stored from register
 * @dreg onwards.
 *
 * Registers preceding @dreg are clobbered first (see
 * __nft_reg_track_clobber()).  Each of the DIV_ROUND_UP(len,
 * NFT_REG32_SIZE) registers then records @expr together with its offset
 * from @dreg.
 */
void nft_reg_track_update(struct nft_regs_track *track,
                          const struct nft_expr *expr, u8 dreg, u8 len)
{
        unsigned int regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
        int i;

        __nft_reg_track_clobber(track, dreg);

        for (i = 0; i < regcount; i++)
                __nft_reg_track_update(track, expr, dreg + i, i);
}
EXPORT_SYMBOL_GPL(nft_reg_track_update);

/*
 * Forget what is tracked for the @len bytes stored from register @dreg
 * onwards.  Registers preceding @dreg are clobbered first (see
 * __nft_reg_track_clobber()).
 */
void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len)
{
        unsigned int regcount = DIV_ROUND_UP(len, NFT_REG32_SIZE);
        int i;

        __nft_reg_track_clobber(track, dreg);

        for (i = 0; i < regcount; i++)
                __nft_reg_track_cancel(track, dreg + i);
}
EXPORT_SYMBOL_GPL(nft_reg_track_cancel);

/* Reset the tracking state of the single register @dreg. */
void __nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg)
{
        track->regs[dreg].num_reg = 0;
        track->regs[dreg].bitwise = NULL;
        track->regs[dreg].selector = NULL;
}
EXPORT_SYMBOL_GPL(__nft_reg_track_cancel);

/*
 * Tables
 */

/*
 * Look up a table by name.
 *
 * @nla holds the table name.  A table matches when name and @family agree
 * and it is active for @genmask.  If the matching table has an owner and
 * a non-zero @nlpid differs from the owner's port id, access is refused.
 *
 * Returns the table, or ERR_PTR(-EINVAL) if @nla is NULL,
 * ERR_PTR(-EPERM) on an owner mismatch, ERR_PTR(-ENOENT) if nothing
 * matches.
 */
static struct nft_table *nft_table_lookup(const struct net *net,
                                          const struct nlattr *nla,
                                          u8 family, u8 genmask, u32 nlpid)
{
        struct nftables_pernet *nft_net;
        struct nft_table *table;

        if (!nla)
                return ERR_PTR(-EINVAL);

        nft_net = nft_pernet(net);
        list_for_each_entry_rcu(table, &nft_net->tables, list,
                                lockdep_is_held(&nft_net->commit_mutex)) {
                if (nla_strcmp(nla, table->name) ||
                    table->family != family ||
                    !nft_active_genmask(table, genmask))
                        continue;

                if (nft_table_has_owner(table) &&
                    nlpid && table->nlpid != nlpid)
                        return ERR_PTR(-EPERM);

                return table;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Look up a table by its 64-bit handle, carried big-endian in @nla.
 *
 * Matching and ownership rules are the same as for nft_table_lookup().
 * Unlike the by-name variant, @nla is not checked for NULL here and the
 * list is walked without the RCU list iterator.
 *
 * Returns the table, ERR_PTR(-EPERM) on an owner mismatch, or
 * ERR_PTR(-ENOENT) if nothing matches.
 */
static struct nft_table *nft_table_lookup_byhandle(const struct net *net,
                                                   const struct nlattr *nla,
                                                   int family, u8 genmask, u32 nlpid)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_table *table;

        list_for_each_entry(table, &nft_net->tables, list) {
                if (be64_to_cpu(nla_get_be64(nla)) != table->handle ||
                    table->family != family ||
                    !nft_active_genmask(table, genmask))
                        continue;

                if (nft_table_has_owner(table) &&
                    nlpid && table->nlpid != nlpid)
                        return ERR_PTR(-EPERM);

                return table;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Hand out the next handle of @table: the per-table generator is bumped
 * and its new value returned.
 */
static inline u64 nf_tables_alloc_handle(struct nft_table *table)
{
        table->hgenerator += 1;

        return table->hgenerator;
}

/* Chain types, indexed by [family][type]; empty slots are NULL. */
static const struct nft_chain_type *chain_type[NFPROTO_NUMPROTO][NFT_CHAIN_T_MAX];

/*
 * Bounds-checked access to chain_type[][].  Returns the table entry for
 * (@family, @type), or NULL when either index is out of range.
 */
static const struct nft_chain_type *
__nft_chain_type_get(u8 family, enum nft_chain_types type)
{
        if (family < NFPROTO_NUMPROTO && type < NFT_CHAIN_T_MAX)
                return chain_type[family][type];

        return NULL;
}

/*
 * Find the chain type for @family whose name equals the string in @nla.
 * Returns NULL if there is none.
 */
static const struct nft_chain_type *
__nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
{
        const struct nft_chain_type *type;
        int i;

        for (i = 0; i < NFT_CHAIN_T_MAX; i++) {
                type = __nft_chain_type_get(family, i);
                if (type && !nla_strcmp(nla, type->name))
                        return type;
        }

        return NULL;
}

/*
 * A pending request to load a module, queued on the per-netns
 * module_list.
 *
 * @list:   links the request into nft_net->module_list
 * @module: name of the requested module
 * @done:   false when the request is queued; once set, a repeated request
 *          for the same module is answered with 0 instead of -EAGAIN
 */
struct nft_module_request {
        struct list_head        list;
        char                        module[MODULE_NAME_LEN];
        bool                        done;
};

#ifdef CONFIG_MODULES
/*
 * Queue a request to load the module named by the printf-style @fmt.
 *
 * The module is not loaded here; the request is only recorded on the
 * per-netns module list.
 *
 * Returns:
 *   0        the formatted name does not fit into MODULE_NAME_LEN, or a
 *            request for this module exists and is marked done,
 *   -EAGAIN  a request for this module is pending (already queued before,
 *            or queued just now),
 *   -ENOMEM  the request could not be allocated.
 */
__printf(2, 3) int nft_request_module(struct net *net, const char *fmt,
                                      ...)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        char module_name[MODULE_NAME_LEN];
        struct nft_module_request *req;
        va_list args;
        int len;

        va_start(args, fmt);
        len = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
        va_end(args);

        /* truncated name: nothing sensible to request */
        if (len >= MODULE_NAME_LEN)
                return 0;

        list_for_each_entry(req, &nft_net->module_list, list) {
                if (strcmp(req->module, module_name))
                        continue;

                /* A request to load this module already exists. */
                return req->done ? 0 : -EAGAIN;
        }

        req = kmalloc(sizeof(*req), GFP_KERNEL);
        if (req == NULL)
                return -ENOMEM;

        strscpy(req->module, module_name, MODULE_NAME_LEN);
        req->done = false;
        list_add_tail(&req->list, &nft_net->module_list);

        return -EAGAIN;
}
EXPORT_SYMBOL_GPL(nft_request_module);
#endif

/*
 * Debug aid: warn once if the nftables nfnetlink subsystem mutex is held.
 * Compiles to nothing without CONFIG_PROVE_LOCKING and stays silent while
 * debug_locks is off.
 */
static void lockdep_nfnl_nft_mutex_not_held(void)
{
#ifdef CONFIG_PROVE_LOCKING
        WARN_ON_ONCE(debug_locks &&
                     lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
#endif
}

/*
 * Look up the chain type named in @nla for @family.
 *
 * If the type is unknown and @autoload is set (CONFIG_MODULES only), a
 * request for module "nft-chain-<family>-<name>" is queued.
 *
 * Returns the chain type, ERR_PTR(-EAGAIN) when a module request is
 * pending, or ERR_PTR(-ENOENT).
 */
static const struct nft_chain_type *
nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
                            u8 family, bool autoload)
{
        const struct nft_chain_type *type;

        type = __nf_tables_chain_type_lookup(nla, family);
        if (type)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (autoload &&
            nft_request_module(net, "nft-chain-%u-%.*s", family,
                               nla_len(nla),
                               (const char *)nla_data(nla)) == -EAGAIN)
                return ERR_PTR(-EAGAIN);
#endif
        return ERR_PTR(-ENOENT);
}

static __be16 nft_base_seq(const struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);

        return htons(nft_net->base_seq & 0xffff);
}

/*
 * Netlink attribute policy for table messages: the name is a string of at
 * most NFT_TABLE_MAXNAMELEN - 1 bytes, flags are 32 bit, the handle is
 * 64 bit and user data is a binary blob of at most NFT_USERDATA_MAXLEN
 * bytes.
 */
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
        [NFTA_TABLE_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_TABLE_FLAGS]        = { .type = NLA_U32 },
        [NFTA_TABLE_HANDLE]        = { .type = NLA_U64 },
        [NFTA_TABLE_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN }
};

/*
 * Append one table message to @skb.
 *
 * @event is the bare NFT_MSG_* message type.  Name, use counter and handle
 * are always included.  For NFT_MSG_DELTABLE the message ends there; all
 * other events also carry the table flags, the owner port id (if the table
 * has an owner) and the user data (if any).
 *
 * Returns 0 on success.  On failure the partially built message is trimmed
 * from @skb again and -1 is returned.
 */
static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
                                     u32 portid, u32 seq, int event, u32 flags,
                                     int family, const struct nft_table *table)
{
        struct nlmsghdr *nlh;

        /* Do not overwrite 'event' with the combined nfnetlink message
         * type: that value also carries the subsystem id, so the
         * NFT_MSG_DELTABLE test below could never match.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_TABLE_NAME, table->name) ||
            nla_put_be32(skb, NFTA_TABLE_USE, htonl(table->use)) ||
            nla_put_be64(skb, NFTA_TABLE_HANDLE, cpu_to_be64(table->handle),
                         NFTA_TABLE_PAD))
                goto nla_put_failure;

        /* Deletions only need enough to identify the table.
         * NOTE(review): if this tree has a separate "destroy table" event,
         * it likely wants the same shortcut - confirm.
         */
        if (event == NFT_MSG_DELTABLE) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nla_put_be32(skb, NFTA_TABLE_FLAGS,
                         htonl(table->flags & NFT_TABLE_F_MASK)))
                goto nla_put_failure;

        if (nft_table_has_owner(table) &&
            nla_put_be32(skb, NFTA_TABLE_OWNER, htonl(table->nlpid)))
                goto nla_put_failure;

        if (table->udata) {
                if (nla_put(skb, NFTA_TABLE_USERDATA, table->udlen, table->udata))
                        goto nla_put_failure;
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Per-skb data kept in the control buffer (skb->cb) of queued
 * notifications.
 *
 * @report: report flag recorded when the notification is enqueued
 */
struct nftnl_skb_parms {
        bool report;
};
/* Access the nftnl_skb_parms stored in the control buffer of @skb. */
#define NFT_CB(skb)        (*(struct nftnl_skb_parms*)&((skb)->cb))

/*
 * Append notification @skb to @notify_list, remembering @report in the
 * skb control buffer.
 */
static void nft_notify_enqueue(struct sk_buff *skb, bool report,
                               struct list_head *notify_list)
{
        struct nftnl_skb_parms *parms = &NFT_CB(skb);

        parms->report = report;
        list_add_tail(&skb->list, notify_list);
}

/*
 * Build a table notification for @event and queue it on the per-netns
 * notify list.
 *
 * Nothing is done when no report was requested and nobody listens on
 * NFNLGRP_NFTABLES.  If the message cannot be allocated or filled,
 * -ENOBUFS is signalled to the group instead.
 */
static void nf_tables_table_notify(const struct nft_ctx *ctx, int event)
{
        struct nftables_pernet *nft_net;
        struct sk_buff *skb;
        u16 flags;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                goto err;

        /* only CREATE/EXCL are echoed back in the notification */
        flags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        if (nf_tables_fill_table_info(skb, ctx->net, ctx->portid, ctx->seq,
                                      event, flags, ctx->family,
                                      ctx->table) < 0) {
                kfree_skb(skb);
                goto err;
        }

        nft_net = nft_pernet(ctx->net);
        nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
        return;
err:
        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Netlink dump callback: emit one NFT_MSG_NEWTABLE message per table.
 *
 * The family in the request header filters the tables; NFPROTO_UNSPEC
 * dumps all families.  cb->args[0] holds the index at which to resume on
 * the next invocation.  Returns skb->len.
 */
static int nf_tables_dump_tables(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nftables_pernet *nft_net;
        const struct nft_table *table;
        unsigned int idx = 0, s_idx = cb->args[0];
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        /* sequence used by nl_dump_check_consistent() below */
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                /* skip entries before the resume index */
                if (idx < s_idx)
                        goto cont;
                /* past the resume point: clear the remaining cb->args[] */
                if (idx > s_idx)
                        memset(&cb->args[1], 0,
                               sizeof(cb->args) - sizeof(cb->args[0]));
                /* note: inactive tables are skipped without bumping idx */
                if (!nft_is_active(net, table))
                        continue;
                /* fill failed: stop here, idx marks where to resume */
                if (nf_tables_fill_table_info(skb, net,
                                              NETLINK_CB(cb->skb).portid,
                                              cb->nlh->nlmsg_seq,
                                              NFT_MSG_NEWTABLE, NLM_F_MULTI,
                                              table->family, table) < 0)
                        goto done;

                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                idx++;
        }
done:
        rcu_read_unlock();
        cb->args[0] = idx;
        return skb->len;
}

/*
 * Start a netlink dump on behalf of a handler that runs under
 * rcu_read_lock().
 *
 * A reference on this module is held around the call, and the RCU read-side
 * section is left while netlink_dump_start() runs and re-entered afterwards.
 *
 * Returns -EINVAL when the module reference cannot be taken, otherwise the
 * value returned by netlink_dump_start().
 */
static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb,
                                      const struct nlmsghdr *nlh,
                                      struct netlink_dump_control *c)
{
        int ret = -EINVAL;

        if (try_module_get(THIS_MODULE)) {
                rcu_read_unlock();
                ret = netlink_dump_start(nlsk, skb, nlh, c);
                rcu_read_lock();
                module_put(THIS_MODULE);
        }

        return ret;
}

/* called with rcu_read_lock held */
static int nf_tables_gettable(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_table *table;
        struct net *net = info->net;
        struct sk_buff *skb2;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_tables,
                        .module = THIS_MODULE,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_TABLE_NAME]);
                return PTR_ERR(table);
        }

        skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;

        err = nf_tables_fill_table_info(skb2, net, NETLINK_CB(skb).portid,
                                        info->nlh->nlmsg_seq, NFT_MSG_NEWTABLE,
                                        0, family, table);
        if (err < 0)
                goto err_fill_table_info;

        return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);

err_fill_table_info:
        kfree_skb(skb2);
        return err;
}

/*
 * Unregister the hooks of the base chains of @table that are active in the
 * next generation.
 *
 * @cnt: when non-zero, stop after @cnt matching chains have been handled;
 *       zero means no limit.
 */
static void nft_table_disable(struct net *net, struct nft_table *table, u32 cnt)
{
        struct nft_chain *chain;
        u32 done = 0;

        list_for_each_entry(chain, &table->chains, list) {
                if (!nft_is_active_next(net, chain) ||
                    !nft_is_base_chain(chain))
                        continue;

                /* The counter is only maintained when a limit was given. */
                if (cnt) {
                        if (done == cnt)
                                break;
                        done++;
                }

                nf_tables_unregister_hook(net, table, chain);
        }
}

/*
 * Register the hooks of every base chain of @table that is active in the
 * next generation.
 *
 * If a registration fails, the hooks registered so far by this call are
 * unregistered again through nft_table_disable() and the error is returned.
 * Returns 0 on success.
 */
static int nf_tables_table_enable(struct net *net, struct nft_table *table)
{
        struct nft_chain *chain;
        int registered = 0;
        int err;

        list_for_each_entry(chain, &table->chains, list) {
                if (!nft_is_active_next(net, chain) ||
                    !nft_is_base_chain(chain))
                        continue;

                err = nf_tables_register_hook(net, table, chain);
                if (err < 0) {
                        /* Roll back only when something was registered. */
                        if (registered)
                                nft_table_disable(net, table, registered);
                        return err;
                }

                registered++;
        }

        return 0;
}

/*
 * Unregister all base chain hooks of @table and leave the table marked
 * dormant.
 *
 * The dormant flag is cleared around the nft_table_disable() call and set
 * again afterwards.
 * NOTE(review): presumably nf_tables_unregister_hook() does nothing for a
 * table that is flagged dormant, hence the temporary clear - confirm against
 * that helper.
 */
static void nf_tables_table_disable(struct net *net, struct nft_table *table)
{
        table->flags &= ~NFT_TABLE_F_DORMANT;
        /* cnt == 0: no limit on the number of chains processed */
        nft_table_disable(net, table, 0);
        table->flags |= NFT_TABLE_F_DORMANT;
}

/*
 * Kernel-internal table flags, placed above the NFT_TABLE_F_MASK bits.
 * nf_tables_updtable() sets them while a flag update is pending:
 *   WAS_DORMANT - the table was dormant and has been woken up
 *   WAS_AWAKEN  - the table was awake and has been made dormant
 *   WAS_ORPHAN  - the table had no owner and has been given one
 * NOTE(review): "MASK + 1" is the first free bit only if NFT_TABLE_F_MASK is
 * a contiguous run of low bits - confirm against its definition.
 */
#define __NFT_TABLE_F_INTERNAL                (NFT_TABLE_F_MASK + 1)
#define __NFT_TABLE_F_WAS_DORMANT        (__NFT_TABLE_F_INTERNAL << 0)
#define __NFT_TABLE_F_WAS_AWAKEN        (__NFT_TABLE_F_INTERNAL << 1)
#define __NFT_TABLE_F_WAS_ORPHAN        (__NFT_TABLE_F_INTERNAL << 2)
/* Any of the above set means a table flag update is already pending. */
#define __NFT_TABLE_F_UPDATE                (__NFT_TABLE_F_WAS_DORMANT | \
                                         __NFT_TABLE_F_WAS_AWAKEN | \
                                         __NFT_TABLE_F_WAS_ORPHAN)

/*
 * Tell whether ctx->table already has changes queued that rule out another
 * table flag update.
 *
 * True when one of the internal __NFT_TABLE_F_UPDATE flags is set on the
 * table, or when the commit list holds, for this table, either a chain
 * update (NFT_MSG_NEWCHAIN) or the deletion of a base chain
 * (NFT_MSG_DELCHAIN).
 */
static bool nft_table_pending_update(const struct nft_ctx *ctx)
{
        struct nftables_pernet *pernet = nft_pernet(ctx->net);
        struct nft_trans *pending;

        if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
                return true;

        list_for_each_entry(pending, &pernet->commit_list, list) {
                if (pending->table != ctx->table)
                        continue;

                if (pending->msg_type == NFT_MSG_NEWCHAIN &&
                    nft_trans_chain_update(pending))
                        return true;

                if (pending->msg_type == NFT_MSG_DELCHAIN &&
                    nft_is_base_chain(nft_trans_chain(pending)))
                        return true;
        }

        return false;
}

/*
 * Apply a flag update to the existing table ctx->table.
 *
 * Returns 0 without doing anything when NFTA_TABLE_FLAGS is absent or the
 * requested flags equal the table's current user-visible flags.
 * Returns -EOPNOTSUPP for flags outside NFT_TABLE_F_MASK, for a disallowed
 * owner change, or for an attempt to toggle NFT_TABLE_F_PERSIST.
 * Returns -EINVAL when nft_table_pending_update() reports a pending update,
 * -ENOMEM when the transaction cannot be allocated, or the error from
 * nf_tables_table_enable().
 *
 * On success the table flags are modified in place and a NFT_MSG_NEWTABLE
 * transaction marked as an update is appended to the commit list.
 */
static int nf_tables_updtable(struct nft_ctx *ctx)
{
        struct nft_trans *trans;
        u32 flags;
        int ret;

        if (!ctx->nla[NFTA_TABLE_FLAGS])
                return 0;

        flags = ntohl(nla_get_be32(ctx->nla[NFTA_TABLE_FLAGS]));
        if (flags & ~NFT_TABLE_F_MASK)
                return -EOPNOTSUPP;

        /* Nothing to do if the user-visible flags are unchanged. */
        if (flags == (ctx->table->flags & NFT_TABLE_F_MASK))
                return 0;

        /* Reject dropping the owner flag from an owned table, and reject
         * requesting ownership of a table that is not orphaned.
         */
        if ((nft_table_has_owner(ctx->table) &&
             !(flags & NFT_TABLE_F_OWNER)) ||
            (flags & NFT_TABLE_F_OWNER &&
             !nft_table_is_orphan(ctx->table)))
                return -EOPNOTSUPP;

        /* The persist flag cannot be changed by an update. */
        if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST)
                return -EOPNOTSUPP;

        /* No dormant off/on/off/on games in single transaction */
        if (nft_table_pending_update(ctx))
                return -EINVAL;

        trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE,
                                sizeof(struct nft_trans_table));
        if (trans == NULL)
                return -ENOMEM;

        if ((flags & NFT_TABLE_F_DORMANT) &&
            !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
                /* awake -> dormant: only the flags change here */
                ctx->table->flags |= NFT_TABLE_F_DORMANT;
                if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE))
                        ctx->table->flags |= __NFT_TABLE_F_WAS_AWAKEN;
        } else if (!(flags & NFT_TABLE_F_DORMANT) &&
                   ctx->table->flags & NFT_TABLE_F_DORMANT) {
                /* dormant -> awake: hooks are registered right away */
                ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
                if (!(ctx->table->flags & __NFT_TABLE_F_UPDATE)) {
                        ret = nf_tables_table_enable(ctx->net, ctx->table);
                        if (ret < 0)
                                goto err_register_hooks;

                        ctx->table->flags |= __NFT_TABLE_F_WAS_DORMANT;
                }
        }

        /* Take ownership of a table that has no owner yet. */
        if ((flags & NFT_TABLE_F_OWNER) &&
            !nft_table_has_owner(ctx->table)) {
                ctx->table->nlpid = ctx->portid;
                ctx->table->flags |= NFT_TABLE_F_OWNER |
                                     __NFT_TABLE_F_WAS_ORPHAN;
        }

        nft_trans_table_update(trans) = true;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_register_hooks:
        /* Hook registration failed: put the dormant flag back. */
        ctx->table->flags |= NFT_TABLE_F_DORMANT;
        nft_trans_destroy(trans);
        return ret;
}

/*
 * Hash a chain name: @data is a NUL-terminated string.  @len is ignored;
 * the string length is used instead.
 */
static u32 nft_chain_hash(const void *data, u32 len, u32 seed)
{
        const char *chain_name = data;
        size_t name_len = strlen(chain_name);

        return jhash(chain_name, name_len, seed);
}

/*
 * Hash a struct nft_chain by its name, yielding the same value as
 * nft_chain_hash() on that name.  @len is ignored.
 */
static u32 nft_chain_hash_obj(const void *data, u32 len, u32 seed)
{
        const struct nft_chain *entry = data;
        const char *entry_name = entry->name;

        return nft_chain_hash(entry_name, 0, seed);
}

/*
 * Compare the lookup key (a chain name string in arg->key) with the name of
 * the chain object @ptr.  Returns 0 on a match, non-zero otherwise.
 */
static int nft_chain_hash_cmp(struct rhashtable_compare_arg *arg,
                              const void *ptr)
{
        const struct nft_chain *candidate = ptr;
        const char *wanted = arg->key;

        return strcmp(candidate->name, wanted);
}

/*
 * Hash an object key: the owning table pointer is mixed into the seed and
 * the NUL-terminated object name is hashed with the result.  @len is ignored.
 */
static u32 nft_objname_hash(const void *data, u32 len, u32 seed)
{
        const struct nft_object_hash_key *key = data;
        u32 mixed_seed = seed ^ hash_ptr(key->table, 32);

        return jhash(key->name, strlen(key->name), mixed_seed);
}

/*
 * Hash a struct nft_object through its embedded key, yielding the same value
 * as nft_objname_hash() on that key.  @len is ignored.
 */
static u32 nft_objname_hash_obj(const void *data, u32 len, u32 seed)
{
        const struct nft_object *entry = data;

        return nft_objname_hash(&entry->key, 0, seed);
}

/*
 * Compare a lookup key (arg->key) with the object @ptr.  Objects in a
 * different table never match (-1); otherwise the names are compared with
 * strcmp().  Returns 0 on a match.
 */
static int nft_objname_hash_cmp(struct rhashtable_compare_arg *arg,
                                const void *ptr)
{
        const struct nft_object_hash_key *wanted = arg->key;
        const struct nft_object *candidate = ptr;

        if (candidate->key.table == wanted->table)
                return strcmp(candidate->key.name, wanted->name);

        return -1;
}

/*
 * Report whether @family is one of the protocol families compiled into this
 * build.  Each family is only recognised when its config option is set.
 */
static bool nft_supported_family(u8 family)
{
#ifdef CONFIG_NF_TABLES_INET
        if (family == NFPROTO_INET)
                return true;
#endif
#ifdef CONFIG_NF_TABLES_IPV4
        if (family == NFPROTO_IPV4)
                return true;
#endif
#ifdef CONFIG_NF_TABLES_ARP
        if (family == NFPROTO_ARP)
                return true;
#endif
#ifdef CONFIG_NF_TABLES_NETDEV
        if (family == NFPROTO_NETDEV)
                return true;
#endif
#if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE)
        if (family == NFPROTO_BRIDGE)
                return true;
#endif
#ifdef CONFIG_NF_TABLES_IPV6
        if (family == NFPROTO_IPV6)
                return true;
#endif
        return false;
}

/*
 * NEWTABLE handler.
 *
 * If a table with the requested name already exists in the family, the
 * request becomes an update: NLM_F_EXCL yields -EEXIST, NLM_F_REPLACE yields
 * -EOPNOTSUPP, anything else is passed to nf_tables_updtable().
 * If the lookup fails with -ENOENT, a new table is allocated, filled in from
 * the attributes, queued as a NFT_MSG_NEWTABLE transaction and linked into
 * the per-netns table list.
 *
 * nft_net->commit_mutex is expected to be held (checked through lockdep).
 *
 * Returns 0 on success or a negative errno (-EOPNOTSUPP for an unsupported
 * family or unknown flags, -ENOMEM on allocation failure, or the error of a
 * failing helper).
 */
static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_ctx ctx;
        u32 flags = 0;
        int err;

        if (!nft_supported_family(family))
                return -EOPNOTSUPP;

        lockdep_assert_held(&nft_net->commit_mutex);
        attr = nla[NFTA_TABLE_NAME];
        table = nft_table_lookup(net, attr, family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                /* Only "not found" leads on to table creation. */
                if (PTR_ERR(table) != -ENOENT)
                        return PTR_ERR(table);
        } else {
                /* The table exists: treat the request as an update. */
                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

                return nf_tables_updtable(&ctx);
        }

        if (nla[NFTA_TABLE_FLAGS]) {
                flags = ntohl(nla_get_be32(nla[NFTA_TABLE_FLAGS]));
                if (flags & ~NFT_TABLE_F_MASK)
                        return -EOPNOTSUPP;
        }

        /* Default error for the allocation steps below. */
        err = -ENOMEM;
        table = kzalloc(sizeof(*table), GFP_KERNEL_ACCOUNT);
        if (table == NULL)
                goto err_kzalloc;

        table->validate_state = nft_net->validate_state;
        table->name = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
        if (table->name == NULL)
                goto err_strdup;

        /* Optional user data is copied verbatim along with its length. */
        if (nla[NFTA_TABLE_USERDATA]) {
                table->udata = nla_memdup(nla[NFTA_TABLE_USERDATA], GFP_KERNEL_ACCOUNT);
                if (table->udata == NULL)
                        goto err_table_udata;

                table->udlen = nla_len(nla[NFTA_TABLE_USERDATA]);
        }

        err = rhltable_init(&table->chains_ht, &nft_chain_ht_params);
        if (err)
                goto err_chain_ht;

        INIT_LIST_HEAD(&table->chains);
        INIT_LIST_HEAD(&table->sets);
        INIT_LIST_HEAD(&table->objects);
        INIT_LIST_HEAD(&table->flowtables);
        table->family = family;
        table->flags = flags;
        table->handle = ++nft_net->table_handle;
        /* An owned table records the netlink port id of its creator. */
        if (table->flags & NFT_TABLE_F_OWNER)
                table->nlpid = NETLINK_CB(skb).portid;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
        err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
        if (err < 0)
                goto err_trans;

        list_add_tail_rcu(&table->list, &nft_net->tables);
        return 0;
        /* Error unwinding, in reverse order of setup. */
err_trans:
        rhltable_destroy(&table->chains_ht);
err_chain_ht:
        /* udata is still NULL here when no user data was supplied */
        kfree(table->udata);
err_table_udata:
        kfree(table->name);
err_strdup:
        kfree(table);
err_kzalloc:
        return err;
}

/*
 * Queue the removal of everything in ctx->table and then of the table
 * itself.  The steps run in this order: rules of each chain, sets,
 * flowtables, objects, chains, table.
 *
 * Entries not active in the next generation are skipped, as are binding
 * chains and anonymous sets.  ctx->chain is overwritten while chains are
 * processed.
 *
 * Returns the first negative error from a helper, otherwise the result of
 * nft_deltable().
 */
static int nft_flush_table(struct nft_ctx *ctx)
{
        struct nft_flowtable *flowtable, *next_flowtable;
        struct nft_chain *chain, *next_chain;
        struct nft_object *obj, *next_obj;
        struct nft_set *set, *next_set;
        int err;

        /* 1. rules, chain by chain */
        list_for_each_entry(chain, &ctx->table->chains, list) {
                if (!nft_is_active_next(ctx->net, chain) ||
                    nft_chain_binding(chain))
                        continue;

                ctx->chain = chain;
                err = nft_delrule_by_chain(ctx);
                if (err < 0)
                        return err;
        }

        /* 2. named sets */
        list_for_each_entry_safe(set, next_set, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, set) ||
                    nft_set_is_anonymous(set))
                        continue;

                err = nft_delset(ctx, set);
                if (err < 0)
                        return err;
        }

        /* 3. flowtables */
        list_for_each_entry_safe(flowtable, next_flowtable,
                                 &ctx->table->flowtables, list) {
                if (!nft_is_active_next(ctx->net, flowtable))
                        continue;

                err = nft_delflowtable(ctx, flowtable);
                if (err < 0)
                        return err;
        }

        /* 4. stateful objects */
        list_for_each_entry_safe(obj, next_obj, &ctx->table->objects, list) {
                if (!nft_is_active_next(ctx->net, obj))
                        continue;

                err = nft_delobj(ctx, obj);
                if (err < 0)
                        return err;
        }

        /* 5. the chains themselves */
        list_for_each_entry_safe(chain, next_chain, &ctx->table->chains, list) {
                if (!nft_is_active_next(ctx->net, chain) ||
                    nft_chain_binding(chain))
                        continue;

                ctx->chain = chain;
                err = nft_delchain(ctx);
                if (err < 0)
                        return err;
        }

        /* 6. finally the table */
        return nft_deltable(ctx);
}

/*
 * Flush every matching table of the netns in ctx->net.
 *
 * A table matches when its family equals @family (or @family is AF_UNSPEC),
 * it is active in the next generation, it is not owned by another netlink
 * port, and - if NFTA_TABLE_NAME was given - its name equals that attribute.
 *
 * ctx->family and ctx->table are updated as the walk proceeds.  Returns 0 if
 * nothing was flushed, otherwise the result of the last nft_flush_table()
 * call; the walk stops at the first negative result.
 */
static int nft_flush(struct nft_ctx *ctx, int family)
{
        struct nftables_pernet *pernet = nft_pernet(ctx->net);
        const struct nlattr * const *nla = ctx->nla;
        struct nft_table *table, *next_table;
        int err = 0;

        list_for_each_entry_safe(table, next_table, &pernet->tables, list) {
                if (family != AF_UNSPEC && table->family != family)
                        continue;

                /* Recorded before the remaining filters are applied. */
                ctx->family = table->family;

                if (!nft_is_active_next(ctx->net, table) ||
                    (nft_table_has_owner(table) &&
                     table->nlpid != ctx->portid) ||
                    (nla[NFTA_TABLE_NAME] &&
                     nla_strcmp(nla[NFTA_TABLE_NAME], table->name) != 0))
                        continue;

                ctx->table = table;

                err = nft_flush_table(ctx);
                if (err < 0)
                        break;
        }

        return err;
}

/*
 * DELTABLE / DESTROYTABLE handler.
 *
 * With family AF_UNSPEC, or with neither a name nor a handle attribute, all
 * matching tables are flushed through nft_flush().  Otherwise the table is
 * looked up by handle (preferred) or by name and flushed.
 *
 * A failed lookup with -ENOENT is not an error for NFT_MSG_DESTROYTABLE.
 * With NLM_F_NONREC, a table that is still in use yields -EBUSY.
 */
static int nf_tables_deltable(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_ctx ctx;

        nft_ctx_init(&ctx, net, skb, info->nlh, 0, NULL, NULL, nla);

        if (family == AF_UNSPEC)
                return nft_flush(&ctx, family);
        if (!nla[NFTA_TABLE_NAME] && !nla[NFTA_TABLE_HANDLE])
                return nft_flush(&ctx, family);

        /* A handle, when present, takes precedence over the name. */
        if (!nla[NFTA_TABLE_HANDLE]) {
                attr = nla[NFTA_TABLE_NAME];
                table = nft_table_lookup(net, attr, family, genmask,
                                         NETLINK_CB(skb).portid);
        } else {
                attr = nla[NFTA_TABLE_HANDLE];
                table = nft_table_lookup_byhandle(net, attr, family, genmask,
                                                  NETLINK_CB(skb).portid);
        }

        if (IS_ERR(table)) {
                if (PTR_ERR(table) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYTABLE)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(table);
        }

        if ((info->nlh->nlmsg_flags & NLM_F_NONREC) && table->use > 0)
                return -EBUSY;

        ctx.family = family;
        ctx.table = table;

        return nft_flush_table(&ctx);
}

/*
 * Release a table: its chain hash table, user data, name and the table
 * structure itself.  Warns and frees nothing if the table is still in use.
 */
static void nf_tables_table_destroy(struct nft_table *table)
{
        if (WARN_ON(table->use > 0))
                return;

        rhltable_destroy(&table->chains_ht);

        /* Members first, the containing structure last. */
        kfree(table->udata);
        kfree(table->name);
        kfree(table);
}

/*
 * Register @ctype in the chain_type table under its family and type, with
 * the nftables nfnetlink subsystem lock held.  If that slot is already
 * taken, a warning is raised and the table is left untouched.
 */
void nft_register_chain_type(const struct nft_chain_type *ctype)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        if (!WARN_ON(__nft_chain_type_get(ctype->family, ctype->type)))
                chain_type[ctype->family][ctype->type] = ctype;
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_register_chain_type);

/*
 * Remove a chain type: the chain_type slot for ctype's family and type is
 * reset to NULL while the nftables nfnetlink subsystem lock is held.  The
 * slot is cleared without checking that it currently holds @ctype.
 */
void nft_unregister_chain_type(const struct nft_chain_type *ctype)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        chain_type[ctype->family][ctype->type] = NULL;
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_chain_type);

/*
 * Chains
 */

/*
 * Find the chain of @table whose handle equals @handle and which is active
 * under @genmask.  Returns the chain or ERR_PTR(-ENOENT).
 */
static struct nft_chain *
nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
{
        struct nft_chain *cur;

        list_for_each_entry(cur, &table->chains, list) {
                if (cur->handle != handle)
                        continue;
                if (nft_active_genmask(cur, genmask))
                        return cur;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Lockdep helper: with CONFIG_PROVE_LOCKING, report whether the commit mutex
 * of @net's nftables state is held; without it, always report true.
 */
static bool lockdep_commit_lock_is_held(const struct net *net)
{
#ifndef CONFIG_PROVE_LOCKING
        return true;
#else
        struct nftables_pernet *pernet = nft_pernet(net);

        return lockdep_is_held(&pernet->commit_mutex);
#endif
}

/*
 * Look up a chain of @table by the name carried in attribute @nla.
 *
 * The name is copied into a local buffer and used as key into the table's
 * chain hash table; the first entry active under @genmask is returned.
 * A warning is raised when neither an RCU read-side section nor the commit
 * lock is held by the caller.
 *
 * Returns the chain, ERR_PTR(-EINVAL) when @nla is NULL, or
 * ERR_PTR(-ENOENT) when no active chain of that name exists.
 */
static struct nft_chain *nft_chain_lookup(struct net *net,
                                          struct nft_table *table,
                                          const struct nlattr *nla, u8 genmask)
{
        struct nft_chain *found = ERR_PTR(-ENOENT);
        char search[NFT_CHAIN_MAXNAMELEN + 1];
        struct rhlist_head *pos, *bucket;
        struct nft_chain *chain;

        if (nla == NULL)
                return ERR_PTR(-EINVAL);

        nla_strscpy(search, nla, sizeof(search));

        WARN_ON(!rcu_read_lock_held() &&
                !lockdep_commit_lock_is_held(net));

        rcu_read_lock();
        bucket = rhltable_lookup(&table->chains_ht, search, nft_chain_ht_params);
        if (bucket) {
                rhl_for_each_entry_rcu(chain, pos, bucket, rhlhead) {
                        if (nft_active_genmask(chain, genmask)) {
                                found = chain;
                                break;
                        }
                }
        }
        rcu_read_unlock();

        return found;
}

/*
 * Netlink attribute policy for NFTA_CHAIN_* attributes: expected type of
 * each attribute and, for string and binary attributes, the maximum length.
 */
static const struct nla_policy nft_chain_policy[NFTA_CHAIN_MAX + 1] = {
        [NFTA_CHAIN_TABLE]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_CHAIN_HANDLE]        = { .type = NLA_U64 },
        [NFTA_CHAIN_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_CHAIN_MAXNAMELEN - 1 },
        [NFTA_CHAIN_HOOK]        = { .type = NLA_NESTED },
        [NFTA_CHAIN_POLICY]        = { .type = NLA_U32 },
        [NFTA_CHAIN_TYPE]        = { .type = NLA_STRING,
                                    .len = NFT_MODULE_AUTOLOAD_LIMIT },
        [NFTA_CHAIN_COUNTERS]        = { .type = NLA_NESTED },
        [NFTA_CHAIN_FLAGS]        = { .type = NLA_U32 },
        [NFTA_CHAIN_ID]                = { .type = NLA_U32 },
        [NFTA_CHAIN_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN },
};

/*
 * Netlink attribute policy for the NFTA_HOOK_* attributes listed here:
 * hook number and priority are 32-bit values, the device name is a string of
 * at most IFNAMSIZ - 1 bytes.
 */
static const struct nla_policy nft_hook_policy[NFTA_HOOK_MAX + 1] = {
        [NFTA_HOOK_HOOKNUM]        = { .type = NLA_U32 },
        [NFTA_HOOK_PRIORITY]        = { .type = NLA_U32 },
        [NFTA_HOOK_DEV]                = { .type = NLA_STRING,
                                    .len = IFNAMSIZ - 1 },
};

/*
 * Append a nested NFTA_CHAIN_COUNTERS attribute to @skb holding the packet
 * and byte counters summed over all possible CPUs.
 *
 * Returns 0 on success or when @stats is NULL (nothing is emitted), and
 * -ENOSPC when an attribute does not fit.
 */
static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats)
{
        struct nft_stats sum;
        struct nlattr *nest;
        int cpu;

        if (!stats)
                return 0;

        memset(&sum, 0, sizeof(sum));
        for_each_possible_cpu(cpu) {
                struct nft_stats *pcpu = per_cpu_ptr(stats, cpu);
                unsigned int start;
                u64 pkts, bytes;

                /* Re-read until the snapshot of both counters is stable. */
                do {
                        start = u64_stats_fetch_begin(&pcpu->syncp);
                        pkts = pcpu->pkts;
                        bytes = pcpu->bytes;
                } while (u64_stats_fetch_retry(&pcpu->syncp, start));

                sum.pkts += pkts;
                sum.bytes += bytes;
        }

        nest = nla_nest_start_noflag(skb, NFTA_CHAIN_COUNTERS);
        if (!nest)
                return -ENOSPC;

        if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(sum.pkts),
                         NFTA_COUNTER_PAD))
                return -ENOSPC;

        if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(sum.bytes),
                         NFTA_COUNTER_PAD))
                return -ENOSPC;

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Append a nested NFTA_CHAIN_HOOK attribute describing @basechain's hook:
 * hook number and priority, and - for netdev-style base chains - the list of
 * device names in a nested NFTA_HOOK_DEVS attribute.  When exactly one
 * device is listed, it is also emitted as NFTA_HOOK_DEV.
 *
 * @hook_list: hooks to dump, or NULL to use the base chain's own hook list.
 *
 * Returns 0 on success, -1 when an attribute does not fit.
 */
static int nft_dump_basechain_hook(struct sk_buff *skb,
                                   const struct net *net, int family,
                                   const struct nft_base_chain *basechain,
                                   const struct list_head *hook_list)
{
        const struct nf_hook_ops *ops = &basechain->ops;
        struct nft_hook *hook, *only = NULL;
        struct nlattr *nest, *nest_devs;
        int count = 0;

        nest = nla_nest_start_noflag(skb, NFTA_CHAIN_HOOK);
        if (!nest)
                return -1;

        if (nla_put_be32(skb, NFTA_HOOK_HOOKNUM, htonl(ops->hooknum)) ||
            nla_put_be32(skb, NFTA_HOOK_PRIORITY, htonl(ops->priority)))
                return -1;

        if (nft_base_chain_netdev(family, ops->hooknum)) {
                nest_devs = nla_nest_start_noflag(skb, NFTA_HOOK_DEVS);
                if (!nest_devs)
                        return -1;

                if (!hook_list)
                        hook_list = &basechain->hook_list;

                list_for_each_entry_rcu(hook, hook_list, list,
                                        lockdep_commit_lock_is_held(net)) {
                        /* Remember the first hook for the single-device case. */
                        if (!only)
                                only = hook;

                        if (nla_put(skb, NFTA_DEVICE_NAME,
                                    hook->ifnamelen, hook->ifname))
                                return -1;
                        count++;
                }
                nla_nest_end(skb, nest_devs);

                if (count == 1 &&
                    nla_put(skb, NFTA_HOOK_DEV,
                            only->ifnamelen, only->ifname))
                        return -1;
        }

        nla_nest_end(skb, nest);
        return 0;
}

/*
 * Build one chain message in @skb.
 *
 * @event:     bare NFT_MSG_* message type (without the subsystem id)
 * @flags:     netlink message flags for the header
 * @hook_list: hooks to report for a base chain, or NULL for the chain's own
 *
 * The message always carries table name, chain name and chain handle.  A
 * NFT_MSG_DELCHAIN message without @hook_list stops there.  Otherwise base
 * chains additionally get hook description, policy, type and counters, and
 * every chain gets its flags (if any), use count and user data (if any).
 *
 * Returns 0 on success.  Returns -1 when the header or an attribute does not
 * fit; in that case the partial message is trimmed from @skb.
 */
static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
                                     u32 portid, u32 seq, int event, u32 flags,
                                     int family, const struct nft_table *table,
                                     const struct nft_chain *chain,
                                     const struct list_head *hook_list)
{
        struct nlmsghdr *nlh;

        /* @event must stay the bare NFT_MSG_* value: it is compared with
         * NFT_MSG_DELCHAIN below, and the subsystem-qualified type returned
         * by nfnl_msg_type() would never match.  The qualified type is
         * therefore only used for the message header.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name) ||
            nla_put_string(skb, NFTA_CHAIN_NAME, chain->name) ||
            nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle),
                         NFTA_CHAIN_PAD))
                goto nla_put_failure;

        /* Plain chain deletion: the identifying attributes are enough. */
        if (event == NFT_MSG_DELCHAIN && !hook_list) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nft_is_base_chain(chain)) {
                const struct nft_base_chain *basechain = nft_base_chain(chain);
                struct nft_stats __percpu *stats;

                if (nft_dump_basechain_hook(skb, net, family, basechain, hook_list))
                        goto nla_put_failure;

                if (nla_put_be32(skb, NFTA_CHAIN_POLICY,
                                 htonl(basechain->policy)))
                        goto nla_put_failure;

                if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name))
                        goto nla_put_failure;

                stats = rcu_dereference_check(basechain->stats,
                                              lockdep_commit_lock_is_held(net));
                if (nft_dump_stats(skb, stats))
                        goto nla_put_failure;
        }

        if (chain->flags &&
            nla_put_be32(skb, NFTA_CHAIN_FLAGS, htonl(chain->flags)))
                goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_CHAIN_USE, htonl(chain->use)))
                goto nla_put_failure;

        if (chain->udata &&
            nla_put(skb, NFTA_CHAIN_USERDATA, chain->udlen, chain->udata))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Queue a chain notification for ctx->chain.
 *
 * Nothing is done unless the request asked for a report or the nftables
 * multicast group has listeners.  The message is built with
 * nf_tables_fill_chain_info() and placed on the per-netns notify list.  If
 * allocation or filling fails, -ENOBUFS is signalled to the group through
 * nfnetlink_set_err().
 *
 * Only NLM_F_CREATE and NLM_F_EXCL are carried over from the request flags.
 */
static void nf_tables_chain_notify(const struct nft_ctx *ctx, int event,
                                   const struct list_head *hook_list)
{
        struct nftables_pernet *pernet;
        struct sk_buff *msg;
        u16 flags;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (msg) {
                flags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

                if (nf_tables_fill_chain_info(msg, ctx->net, ctx->portid,
                                              ctx->seq, event, flags,
                                              ctx->family, ctx->table,
                                              ctx->chain, hook_list) >= 0) {
                        pernet = nft_pernet(ctx->net);
                        nft_notify_enqueue(msg, ctx->report,
                                           &pernet->notify_list);
                        return;
                }

                kfree_skb(msg);
        }

        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Netlink dump callback: emit one NFT_MSG_NEWCHAIN message per active chain
 * of every table of the requested family (all families for NFPROTO_UNSPEC).
 *
 * cb->args[0] carries the chain index at which to resume; it is rewritten
 * with the index reached before returning.  Returns skb->len.
 */
static int nf_tables_dump_chains(struct sk_buff *skb,
                                 struct netlink_callback *cb)
{
        const struct nfgenmsg *hdr = nlmsg_data(cb->nlh);
        struct net *net = sock_net(skb->sk);
        unsigned int resume_at = cb->args[0];
        int family = hdr->nfgen_family;
        struct nftables_pernet *pernet;
        const struct nft_table *table;
        const struct nft_chain *chain;
        unsigned int pos = 0;

        rcu_read_lock();
        pernet = nft_pernet(net);
        cb->seq = READ_ONCE(pernet->base_seq);

        list_for_each_entry_rcu(table, &pernet->tables, list) {
                if (family != NFPROTO_UNSPEC && table->family != family)
                        continue;

                list_for_each_entry_rcu(chain, &table->chains, list) {
                        /* Entries below the resume index are only counted. */
                        if (pos < resume_at) {
                                pos++;
                                continue;
                        }

                        if (pos > resume_at)
                                memset(&cb->args[1], 0,
                                       sizeof(cb->args) - sizeof(cb->args[0]));

                        /* Inactive chains are skipped without being counted. */
                        if (!nft_is_active(net, chain))
                                continue;

                        /* Message does not fit: leave both loops, keep pos. */
                        if (nf_tables_fill_chain_info(skb, net,
                                                      NETLINK_CB(cb->skb).portid,
                                                      cb->nlh->nlmsg_seq,
                                                      NFT_MSG_NEWCHAIN,
                                                      NLM_F_MULTI,
                                                      table->family, table,
                                                      chain, NULL) < 0)
                                goto out;

                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
                        pos++;
                }
        }
out:
        rcu_read_unlock();
        cb->args[0] = pos;
        return skb->len;
}

/* called with rcu_read_lock held */
static int nf_tables_getchain(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_chain *chain;
        struct net *net = info->net;
        struct nft_table *table;
        struct sk_buff *skb2;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .dump = nf_tables_dump_chains,
                        .module = THIS_MODULE,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
                return PTR_ERR(table);
        }

        chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask);
        if (IS_ERR(chain)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                return PTR_ERR(chain);
        }

        skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return -ENOMEM;

        err = nf_tables_fill_chain_info(skb2, net, NETLINK_CB(skb).portid,
                                        info->nlh->nlmsg_seq, NFT_MSG_NEWCHAIN,
                                        0, family, table, chain, NULL);
        if (err < 0)
                goto err_fill_chain_info;

        return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);

err_fill_chain_info:
        kfree_skb(skb2);
        return err;
}

/*
 * Netlink attribute policy for the nested counter attributes: both the
 * packet and the byte counter are 64-bit values.
 */
static const struct nla_policy nft_counter_policy[NFTA_COUNTER_MAX + 1] = {
        [NFTA_COUNTER_PACKETS]        = { .type = NLA_U64 },
        [NFTA_COUNTER_BYTES]        = { .type = NLA_U64 },
};

/*
 * Allocate per-cpu chain counters and seed them from the nested counter
 * attribute @attr, which must carry both NFTA_COUNTER_BYTES and
 * NFTA_COUNTER_PACKETS.
 *
 * Returns the new per-cpu counters, or an error pointer: the parse error,
 * -EINVAL when an attribute is missing, -ENOMEM on allocation failure.
 */
static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr)
{
        struct nlattr *tb[NFTA_COUNTER_MAX+1];
        struct nft_stats __percpu *pcpu_stats;
        struct nft_stats *local;
        int ret;

        ret = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr,
                                          nft_counter_policy, NULL);
        if (ret < 0)
                return ERR_PTR_PCPU(ret);

        if (!(tb[NFTA_COUNTER_BYTES] && tb[NFTA_COUNTER_PACKETS]))
                return ERR_PTR_PCPU(-EINVAL);

        pcpu_stats = netdev_alloc_pcpu_stats(struct nft_stats);
        if (!pcpu_stats)
                return ERR_PTR_PCPU(-ENOMEM);

        /* The supplied totals are stored in the slot of whichever cpu runs
         * this code.  That is fine: per-cpu values are not exposed to
         * userspace.
         */
        preempt_disable();
        local = this_cpu_ptr(pcpu_stats);
        local->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES]));
        local->pkts = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS]));
        preempt_enable();

        return pcpu_stats;
}

static void nft_chain_stats_replace(struct nft_trans_chain *trans)
{
        const struct nft_trans *t = &trans->nft_trans_binding.nft_trans;
        struct nft_base_chain *chain = nft_base_chain(trans->chain);

        if (!trans->stats)
                return;

        trans->stats =
                rcu_replace_pointer(chain->stats, trans->stats,
                                    lockdep_commit_lock_is_held(t->net));

        if (!trans->stats)
                static_branch_inc(&nft_counters_enabled);
}

static void nf_tables_chain_free_chain_rules(struct nft_chain *chain)
{
        struct nft_rule_blob *g0 = rcu_dereference_raw(chain->blob_gen_0);
        struct nft_rule_blob *g1 = rcu_dereference_raw(chain->blob_gen_1);

        if (g0 != g1)
                kvfree(g1);
        kvfree(g0);

        /* should be NULL either via abort or via successful commit */
        WARN_ON_ONCE(chain->blob_next);
        kvfree(chain->blob_next);
}

/* Free @chain and everything it owns.
 *
 * Warns and does nothing if the chain is still in use (chain->use > 0).
 * For a base chain this also releases the netdev hook entries (when the chain
 * is a netdev base chain), the chain type's module reference and the per-cpu
 * counters, and frees the enclosing struct nft_base_chain.
 */
void nf_tables_chain_destroy(struct nft_chain *chain)
{
        const struct nft_table *table = chain->table;
        struct nft_base_chain *basechain;
        struct nft_hook *hook, *tmp;

        if (WARN_ON(chain->use > 0))
                return;

        /* no concurrent access possible anymore */
        nf_tables_chain_free_chain_rules(chain);

        /* Regular chain: only the name, user data and the chain itself. */
        if (!nft_is_base_chain(chain)) {
                kfree(chain->name);
                kfree(chain->udata);
                kfree(chain);
                return;
        }

        basechain = nft_base_chain(chain);

        if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
                list_for_each_entry_safe(hook, tmp, &basechain->hook_list,
                                         list) {
                        list_del_rcu(&hook->list);
                        kfree_rcu(hook, rcu);
                }
        }

        module_put(basechain->type->owner);

        if (rcu_access_pointer(basechain->stats)) {
                static_branch_dec(&nft_counters_enabled);
                free_percpu(rcu_dereference_raw(basechain->stats));
        }

        /* The chain is embedded in the base chain, so free the latter. */
        kfree(chain->name);
        kfree(chain->udata);
        kfree(basechain);
}

static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
                                              const struct nlattr *attr)
{
        struct net_device *dev;
        struct nft_hook *hook;
        int err;

        hook = kzalloc(sizeof(struct nft_hook), GFP_KERNEL_ACCOUNT);
        if (!hook) {
                err = -ENOMEM;
                goto err_hook_alloc;
        }

        err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
        if (err < 0)
                goto err_hook_dev;

        hook->ifnamelen = nla_len(attr);

        /* nf_tables_netdev_event() is called under rtnl_mutex, this is
         * indirectly serializing all the other holders of the commit_mutex with
         * the rtnl_mutex.
         */
        dev = __dev_get_by_name(net, hook->ifname);
        if (!dev) {
                err = -ENOENT;
                goto err_hook_dev;
        }
        hook->ops.dev = dev;

        return hook;

err_hook_dev:
        kfree(hook);
err_hook_alloc:
        return ERR_PTR(err);
}

/* Return the first entry of @hook_list whose interface name equals that of
 * @this, or NULL if there is none.
 */
static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
                                           const struct nft_hook *this)
{
        struct nft_hook *cur;

        list_for_each_entry(cur, hook_list, list) {
                if (strcmp(cur->ifname, this->ifname) != 0)
                        continue;

                return cur;
        }

        return NULL;
}

/* Parse a nested list of NFTA_DEVICE_NAME attributes into @hook_list.
 *
 * One hook entry is allocated and appended per device name. Fails with
 * -EINVAL on any other nested attribute type, with the allocation error if a
 * hook cannot be created, with -EEXIST if the name is already on @hook_list,
 * and with -EFBIG once the number of entries added by this call reaches
 * NFT_NETDEVICE_MAX. On any failure every entry on @hook_list is unlinked
 * and freed.
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_parse_netdev_hooks(struct net *net,
                                        const struct nlattr *attr,
                                        struct list_head *hook_list,
                                        struct netlink_ext_ack *extack)
{
        struct nft_hook *entry, *tmp;
        const struct nlattr *cur;
        int remaining, count = 0, ret;

        nla_for_each_nested(cur, attr, remaining) {
                if (nla_type(cur) != NFTA_DEVICE_NAME) {
                        ret = -EINVAL;
                        goto err_flush;
                }

                entry = nft_netdev_hook_alloc(net, cur);
                if (IS_ERR(entry)) {
                        NL_SET_BAD_ATTR(extack, cur);
                        ret = PTR_ERR(entry);
                        goto err_flush;
                }

                /* Refuse the same device twice; the new entry is not on the
                 * list yet, so it has to be freed here.
                 */
                if (nft_hook_list_find(hook_list, entry) != NULL) {
                        NL_SET_BAD_ATTR(extack, cur);
                        kfree(entry);
                        ret = -EEXIST;
                        goto err_flush;
                }

                list_add_tail(&entry->list, hook_list);

                if (++count == NFT_NETDEVICE_MAX) {
                        ret = -EFBIG;
                        goto err_flush;
                }
        }

        return 0;

err_flush:
        list_for_each_entry_safe(entry, tmp, hook_list, list) {
                list_del(&entry->list);
                kfree(entry);
        }

        return ret;
}

/* Hook description filled in by nft_chain_parse_hook() from the nested
 * NFTA_CHAIN_HOOK attribute.
 */
struct nft_chain_hook {
        /* hook number, taken from NFTA_HOOK_HOOKNUM */
        u32                                num;
        /* hook priority, taken from NFTA_HOOK_PRIORITY */
        s32                                priority;
        /* chain type; nft_chain_parse_hook() takes a reference on its owner
         * module, nft_chain_release_hook() drops it
         */
        const struct nft_chain_type        *type;
        /* list of struct nft_hook entries parsed for netdev base chains */
        struct list_head                list;
};

/* Collect the netdev hooks requested through NFTA_HOOK_DEV (one device) or
 * NFTA_HOOK_DEVS (a nested list of devices) on @hook_list. NFTA_HOOK_DEV
 * takes precedence when both are present.
 *
 * Returns 0 on success, the parse/allocation error otherwise, or -EINVAL if
 * NFT_CHAIN_HW_OFFLOAD is set in @flags and no device ended up on the list.
 */
static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[],
                                  struct list_head *hook_list,
                                  struct netlink_ext_ack *extack, u32 flags)
{
        if (tb[NFTA_HOOK_DEV]) {
                struct nft_hook *hook;

                hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]);
                if (IS_ERR(hook)) {
                        NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]);
                        return PTR_ERR(hook);
                }

                list_add_tail(&hook->list, hook_list);
        } else if (tb[NFTA_HOOK_DEVS]) {
                int ret;

                ret = nf_tables_parse_netdev_hooks(net, tb[NFTA_HOOK_DEVS],
                                                   hook_list, extack);
                if (ret < 0)
                        return ret;
        }

        /* Hardware offload is refused when no device was given. */
        if ((flags & NFT_CHAIN_HW_OFFLOAD) && list_empty(hook_list))
                return -EINVAL;

        return 0;
}

/* Parse the nested NFTA_CHAIN_HOOK attribute into @hook.
 *
 * @basechain: NULL when parsing for a chain that does not exist yet; otherwise
 *             the existing base chain the request refers to.
 * @hook:      output; num/priority/type are filled in and hook->list is
 *             initialised and, for netdev base chains, populated with the
 *             requested devices.
 *
 * With @basechain == NULL both hook number and priority are mandatory and the
 * chain type is resolved from NFTA_CHAIN_TYPE (falling back to the family's
 * default type). With an existing @basechain, a supplied hook number or
 * priority must match the chain's current values, and the type defaults to
 * the chain's current type.
 *
 * On success a reference on the chain type's owner module is held; it is
 * dropped again here on the later failure paths. nft_chain_release_hook()
 * undoes a successful call.
 *
 * Must be called with the commit mutex held (asserted below).
 *
 * Returns 0 on success or a negative errno.
 */
static int nft_chain_parse_hook(struct net *net,
                                struct nft_base_chain *basechain,
                                const struct nlattr * const nla[],
                                struct nft_chain_hook *hook, u8 family,
                                u32 flags, struct netlink_ext_ack *extack)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nlattr *ha[NFTA_HOOK_MAX + 1];
        const struct nft_chain_type *type;
        int err;

        lockdep_assert_held(&nft_net->commit_mutex);
        lockdep_nfnl_nft_mutex_not_held();

        err = nla_parse_nested_deprecated(ha, NFTA_HOOK_MAX,
                                          nla[NFTA_CHAIN_HOOK],
                                          nft_hook_policy, NULL);
        if (err < 0)
                return err;

        if (!basechain) {
                /* New chain: hook number and priority are both required. */
                if (!ha[NFTA_HOOK_HOOKNUM] ||
                    !ha[NFTA_HOOK_PRIORITY]) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                        return -ENOENT;
                }

                hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
                hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));

                /* Start from the family's default type; an explicit
                 * NFTA_CHAIN_TYPE overrides it below.
                 */
                type = __nft_chain_type_get(family, NFT_CHAIN_T_DEFAULT);
                if (!type)
                        return -EOPNOTSUPP;

                if (nla[NFTA_CHAIN_TYPE]) {
                        type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
                                                           family, true);
                        if (IS_ERR(type)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
                                return PTR_ERR(type);
                        }
                }
                /* The range check comes first so the shift below is only
                 * evaluated for hook numbers below NFT_MAX_HOOKS.
                 */
                if (hook->num >= NFT_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
                        return -EOPNOTSUPP;

                /* NAT-type chains are refused at a priority at or below
                 * NF_IP_PRI_CONNTRACK.
                 */
                if (type->type == NFT_CHAIN_T_NAT &&
                    hook->priority <= NF_IP_PRI_CONNTRACK)
                        return -EOPNOTSUPP;
        } else {
                /* Existing base chain: hook number and priority are optional
                 * but, when given, must equal the chain's current values.
                 */
                if (ha[NFTA_HOOK_HOOKNUM]) {
                        hook->num = ntohl(nla_get_be32(ha[NFTA_HOOK_HOOKNUM]));
                        if (hook->num != basechain->ops.hooknum)
                                return -EOPNOTSUPP;
                }
                if (ha[NFTA_HOOK_PRIORITY]) {
                        hook->priority = ntohl(nla_get_be32(ha[NFTA_HOOK_PRIORITY]));
                        if (hook->priority != basechain->ops.priority)
                                return -EOPNOTSUPP;
                }

                if (nla[NFTA_CHAIN_TYPE]) {
                        type = __nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
                                                             family);
                        if (!type) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
                                return -ENOENT;
                        }
                } else {
                        type = basechain->type;
                }
        }

        /* Pin the module providing the chain type; every failure path from
         * here on has to drop this reference again.
         */
        if (!try_module_get(type->owner)) {
                if (nla[NFTA_CHAIN_TYPE])
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TYPE]);
                return -ENOENT;
        }

        hook->type = type;

        INIT_LIST_HEAD(&hook->list);
        if (nft_base_chain_netdev(family, hook->num)) {
                err = nft_chain_parse_netdev(net, ha, &hook->list, extack, flags);
                if (err < 0) {
                        module_put(type->owner);
                        return err;
                }
        } else if (ha[NFTA_HOOK_DEV] || ha[NFTA_HOOK_DEVS]) {
                /* Device attributes are only accepted for netdev hooks. */
                module_put(type->owner);
                return -EOPNOTSUPP;
        }

        return 0;
}

/* Undo a successful hook parse: free every netdev hook entry still on
 * hook->list and drop the reference on the chain type's owner module.
 */
static void nft_chain_release_hook(struct nft_chain_hook *hook)
{
        struct nft_hook *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, &hook->list, list) {
                list_del(&entry->list);
                kfree(entry);
        }

        module_put(hook->type->owner);
}

/* Write the trailer rule at @ptr: it is flagged as the last rule and records
 * the chain it belongs to.
 */
static void nft_last_rule(const struct nft_chain *chain, const void *ptr)
{
        struct nft_rule_dp_last *trailer = (struct nft_rule_dp_last *)ptr;

        /* The 'end' member has to be the first member of the trailer. */
        BUILD_BUG_ON(offsetof(struct nft_rule_dp_last, end) != 0);

        /* blob size does not include the trailer rule */
        trailer->chain = chain;
        trailer->end.is_last = 1;
}

/* Allocate a rule blob with room for @size bytes of rule data plus the blob
 * header and the trailer rule. The returned blob has size 0 and a trailer
 * rule at the start of its data area.
 *
 * Returns NULL if @size exceeds INT_MAX or the allocation fails.
 */
static struct nft_rule_blob *nf_tables_chain_alloc_rules(const struct nft_chain *chain,
                                                         unsigned int size)
{
        struct nft_rule_blob *blob = NULL;

        if (size <= INT_MAX) {
                size += sizeof(struct nft_rule_blob) +
                        sizeof(struct nft_rule_dp_last);
                blob = kvmalloc(size, GFP_KERNEL_ACCOUNT);
        }

        if (blob == NULL)
                return NULL;

        blob->size = 0;
        nft_last_rule(chain, blob->data);

        return blob;
}

/* Fill in @ops for @chain from the parsed hook description: protocol family,
 * hook number, priority, the chain as private data and the chain type's hook
 * function for that hook number.
 */
static void nft_basechain_hook_init(struct nf_hook_ops *ops, u8 family,
                                    const struct nft_chain_hook *hook,
                                    struct nft_chain *chain)
{
        const struct nft_chain_type *type = hook->type;

        ops->hook_ops_type = NF_HOOK_OP_NF_TABLES;
        ops->pf = family;
        ops->priv = chain;
        ops->priority = hook->priority;
        ops->hooknum = hook->num;
        /* Indexed by the hook number just stored above. */
        ops->hook = type->hooks[ops->hooknum];
}

/* Initialise a freshly allocated base chain from the parsed hook description.
 *
 * For netdev base chains the parsed device hooks are moved from hook->list
 * to basechain->hook_list and their hook ops are set up. The chain is flagged
 * NFT_CHAIN_BASE (plus @flags) and its policy defaults to NF_ACCEPT.
 *
 * Returns 0, or -EOPNOTSUPP if NFT_CHAIN_HW_OFFLOAD is set but
 * nft_chain_offload_support() reports no support. In that case the device
 * hooks are moved back to hook->list so that the caller can release them.
 */
static int nft_basechain_init(struct nft_base_chain *basechain, u8 family,
                              struct nft_chain_hook *hook, u32 flags)
{
        struct nft_chain *chain = &basechain->chain;
        struct nft_hook *dev_hook;

        basechain->type = hook->type;
        INIT_LIST_HEAD(&basechain->hook_list);

        if (nft_base_chain_netdev(family, hook->num)) {
                list_splice_init(&hook->list, &basechain->hook_list);
                list_for_each_entry(dev_hook, &basechain->hook_list, list)
                        nft_basechain_hook_init(&dev_hook->ops, family, hook,
                                                chain);
        }
        nft_basechain_hook_init(&basechain->ops, family, hook, chain);

        basechain->policy = NF_ACCEPT;
        chain->flags |= NFT_CHAIN_BASE | flags;

        if ((chain->flags & NFT_CHAIN_HW_OFFLOAD) &&
            !nft_chain_offload_support(basechain)) {
                /* Hand the hooks back to the caller-owned list. */
                list_splice_init(&basechain->hook_list, &hook->list);
                return -EOPNOTSUPP;
        }

        flow_block_init(&basechain->flow_block);

        return 0;
}

/* Make @chain part of @table: insert it into the table's chain hash table
 * (keyed by chain name) and append it to the table's chain list.
 *
 * Returns 0 on success or the hash table insertion error; on error the chain
 * is not added to the list.
 */
int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
{
        int ret = rhltable_insert_key(&table->chains_ht, chain->name,
                                      &chain->rhlhead, nft_chain_ht_params);

        if (ret != 0)
                return ret;

        list_add_tail_rcu(&chain->list, &table->chains);

        return 0;
}

/* Counter used by nf_tables_addchain() to generate "__chain<N>" names for
 * chains created without NFTA_CHAIN_NAME; incremented on each use.
 */
static u64 chain_id;

/* Create a new chain in ctx->table and queue the matching NFT_MSG_NEWCHAIN
 * transaction.
 *
 * If NFTA_CHAIN_HOOK is present a base chain is created (hook parsed, optional
 * counters allocated, hook ops initialised); otherwise a regular chain is
 * allocated. The chain then gets a handle, a name (from NFTA_CHAIN_NAME or a
 * generated "__chain<N>" name for NFT_CHAIN_BINDING chains), optional user
 * data and an empty rule blob, before it is added to the table and its hooks
 * are registered.
 *
 * @policy is only recorded in the transaction for base chains.
 *
 * Returns 0 on success or a negative errno; on failure everything set up so
 * far is undone.
 */
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy,
                              u32 flags, struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_table *table = ctx->table;
        struct nft_base_chain *basechain;
        struct net *net = ctx->net;
        char name[NFT_NAME_MAXLEN];
        struct nft_rule_blob *blob;
        struct nft_trans *trans;
        struct nft_chain *chain;
        int err;

        if (nla[NFTA_CHAIN_HOOK]) {
                struct nft_stats __percpu *stats = NULL;
                struct nft_chain_hook hook = {};

                if (table->flags & __NFT_TABLE_F_UPDATE)
                        return -EINVAL;

                /* Binding chains cannot be base chains. */
                if (flags & NFT_CHAIN_BINDING)
                        return -EOPNOTSUPP;

                err = nft_chain_parse_hook(net, NULL, nla, &hook, family, flags,
                                           extack);
                if (err < 0)
                        return err;

                /* From here on the parsed hook (device list and module
                 * reference) must be released on every early return.
                 */
                basechain = kzalloc(sizeof(*basechain), GFP_KERNEL_ACCOUNT);
                if (basechain == NULL) {
                        nft_chain_release_hook(&hook);
                        return -ENOMEM;
                }
                chain = &basechain->chain;

                if (nla[NFTA_CHAIN_COUNTERS]) {
                        stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
                        if (IS_ERR_PCPU(stats)) {
                                nft_chain_release_hook(&hook);
                                kfree(basechain);
                                return PTR_ERR_PCPU(stats);
                        }
                        rcu_assign_pointer(basechain->stats, stats);
                }

                err = nft_basechain_init(basechain, family, &hook, flags);
                if (err < 0) {
                        nft_chain_release_hook(&hook);
                        kfree(basechain);
                        free_percpu(stats);
                        return err;
                }
                /* Counted only once the base chain is fully initialised;
                 * nf_tables_chain_destroy() does the matching decrement.
                 */
                if (stats)
                        static_branch_inc(&nft_counters_enabled);
        } else {
                /* Without a hook the base/offload flags make no sense. */
                if (flags & NFT_CHAIN_BASE)
                        return -EINVAL;
                if (flags & NFT_CHAIN_HW_OFFLOAD)
                        return -EOPNOTSUPP;

                chain = kzalloc(sizeof(*chain), GFP_KERNEL_ACCOUNT);
                if (chain == NULL)
                        return -ENOMEM;

                chain->flags = flags;
        }
        ctx->chain = chain;

        INIT_LIST_HEAD(&chain->rules);
        chain->handle = nf_tables_alloc_handle(table);
        chain->table = table;

        if (nla[NFTA_CHAIN_NAME]) {
                chain->name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
        } else {
                /* Only binding chains may be created without a name. */
                if (!(flags & NFT_CHAIN_BINDING)) {
                        err = -EINVAL;
                        goto err_destroy_chain;
                }

                snprintf(name, sizeof(name), "__chain%llu", ++chain_id);
                chain->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
        }

        /* Covers both the nla_strdup() and the kstrdup() path above. */
        if (!chain->name) {
                err = -ENOMEM;
                goto err_destroy_chain;
        }

        if (nla[NFTA_CHAIN_USERDATA]) {
                chain->udata = nla_memdup(nla[NFTA_CHAIN_USERDATA], GFP_KERNEL_ACCOUNT);
                if (chain->udata == NULL) {
                        err = -ENOMEM;
                        goto err_destroy_chain;
                }
                chain->udlen = nla_len(nla[NFTA_CHAIN_USERDATA]);
        }

        /* A new chain starts with an empty blob shared by both generations. */
        blob = nf_tables_chain_alloc_rules(chain, 0);
        if (!blob) {
                err = -ENOMEM;
                goto err_destroy_chain;
        }

        RCU_INIT_POINTER(chain->blob_gen_0, blob);
        RCU_INIT_POINTER(chain->blob_gen_1, blob);

        if (!nft_use_inc(&table->use)) {
                err = -EMFILE;
                goto err_destroy_chain;
        }

        trans = nft_trans_chain_add(ctx, NFT_MSG_NEWCHAIN);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto err_trans;
        }

        nft_trans_chain_policy(trans) = NFT_CHAIN_POLICY_UNSET;
        if (nft_is_base_chain(chain))
                nft_trans_chain_policy(trans) = policy;

        err = nft_chain_add(table, chain);
        if (err < 0)
                goto err_chain_add;

        /* This must be LAST to ensure no packets are walking over this chain. */
        err = nf_tables_register_hook(net, table, chain);
        if (err < 0)
                goto err_register_hook;

        return 0;

        /* Unwind in reverse order of setup. */
err_register_hook:
        nft_chain_del(chain);
err_chain_add:
        nft_trans_destroy(trans);
err_trans:
        nft_use_dec_restore(&table->use);
err_destroy_chain:
        nf_tables_chain_destroy(chain);

        return err;
}

/* Queue an update of the existing chain ctx->chain.
 *
 * Depending on the attributes present this may add netdev hooks
 * (NFTA_CHAIN_HOOK), replace the counters (NFTA_CHAIN_COUNTERS), change the
 * policy (NFTA_CHAIN_POLICY) and/or rename the chain (NFTA_CHAIN_HANDLE
 * together with NFTA_CHAIN_NAME). The changes are recorded in a
 * NFT_MSG_NEWCHAIN transaction marked as an update and appended to the commit
 * list.
 *
 * @flags must equal the chain's current flags.
 * @attr is the attribute reported through @extack for several errors.
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
                              u32 flags, const struct nlattr *attr,
                              struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_base_chain *basechain = NULL;
        struct nft_table *table = ctx->table;
        struct nft_chain *chain = ctx->chain;
        struct nft_chain_hook hook = {};
        struct nft_stats __percpu *stats = NULL;
        struct nft_hook *h, *next;
        struct nf_hook_ops *ops;
        struct nft_trans *trans;
        bool unregister = false;
        int err;

        /* Chain flags cannot be changed by an update. */
        if (chain->flags ^ flags)
                return -EOPNOTSUPP;

        /* Keep hook.list valid even when no NFTA_CHAIN_HOOK is parsed. */
        INIT_LIST_HEAD(&hook.list);

        if (nla[NFTA_CHAIN_HOOK]) {
                if (!nft_is_base_chain(chain)) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }

                basechain = nft_base_chain(chain);
                err = nft_chain_parse_hook(ctx->net, basechain, nla, &hook,
                                           ctx->family, flags, extack);
                if (err < 0)
                        return err;

                /* The chain type cannot be changed. */
                if (basechain->type != hook.type) {
                        nft_chain_release_hook(&hook);
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }

                if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) {
                        /* New device hooks inherit the base chain's hook
                         * settings; devices the chain already has are dropped
                         * from the list, so only new ones remain.
                         */
                        list_for_each_entry_safe(h, next, &hook.list, list) {
                                h->ops.pf        = basechain->ops.pf;
                                h->ops.hooknum        = basechain->ops.hooknum;
                                h->ops.priority        = basechain->ops.priority;
                                h->ops.priv        = basechain->ops.priv;
                                h->ops.hook        = basechain->ops.hook;

                                if (nft_hook_list_find(&basechain->hook_list, h)) {
                                        list_del(&h->list);
                                        kfree(h);
                                }
                        }
                } else {
                        /* Non-netdev: hook number and priority must match. */
                        ops = &basechain->ops;
                        if (ops->hooknum != hook.num ||
                            ops->priority != hook.priority) {
                                nft_chain_release_hook(&hook);
                                NL_SET_BAD_ATTR(extack, attr);
                                return -EEXIST;
                        }
                }
        }

        /* Rename request: the new name must not be taken by another chain. */
        if (nla[NFTA_CHAIN_HANDLE] &&
            nla[NFTA_CHAIN_NAME]) {
                struct nft_chain *chain2;

                chain2 = nft_chain_lookup(ctx->net, table,
                                          nla[NFTA_CHAIN_NAME], genmask);
                if (!IS_ERR(chain2)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                        err = -EEXIST;
                        goto err_hooks;
                }
        }

        /* Adding hooks is refused while __NFT_TABLE_F_UPDATE is set. */
        if (table->flags & __NFT_TABLE_F_UPDATE &&
            !list_empty(&hook.list)) {
                NL_SET_BAD_ATTR(extack, attr);
                err = -EOPNOTSUPP;
                goto err_hooks;
        }

        /* Register the new netdev hooks right away unless the table is
         * dormant; 'unregister' tells the error path to undo this.
         */
        if (!(table->flags & NFT_TABLE_F_DORMANT) &&
            nft_is_base_chain(chain) &&
            !list_empty(&hook.list)) {
                basechain = nft_base_chain(chain);
                ops = &basechain->ops;

                if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) {
                        err = nft_netdev_register_hooks(ctx->net, &hook.list);
                        if (err < 0)
                                goto err_hooks;

                        unregister = true;
                }
        }

        if (nla[NFTA_CHAIN_COUNTERS]) {
                /* Counters only exist on base chains. */
                if (!nft_is_base_chain(chain)) {
                        err = -EOPNOTSUPP;
                        goto err_hooks;
                }

                stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]);
                if (IS_ERR_PCPU(stats)) {
                        err = PTR_ERR_PCPU(stats);
                        goto err_hooks;
                }
        }

        err = -ENOMEM;
        trans = nft_trans_alloc_chain(ctx, NFT_MSG_NEWCHAIN);
        if (trans == NULL)
                goto err_trans;

        nft_trans_chain_stats(trans) = stats;
        nft_trans_chain_update(trans) = true;

        /* -1 is stored when no policy was supplied (presumably the "unset"
         * marker, cf. NFT_CHAIN_POLICY_UNSET in nf_tables_addchain() --
         * TODO confirm).
         */
        if (nla[NFTA_CHAIN_POLICY])
                nft_trans_chain_policy(trans) = policy;
        else
                nft_trans_chain_policy(trans) = -1;

        if (nla[NFTA_CHAIN_HANDLE] &&
            nla[NFTA_CHAIN_NAME]) {
                struct nftables_pernet *nft_net = nft_pernet(ctx->net);
                struct nft_trans *tmp;
                char *name;

                err = -ENOMEM;
                name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL_ACCOUNT);
                if (!name)
                        goto err_trans;

                /* Reject the rename if another pending chain update in this
                 * table already claims the same new name.
                 */
                err = -EEXIST;
                list_for_each_entry(tmp, &nft_net->commit_list, list) {
                        if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
                            tmp->table == table &&
                            nft_trans_chain_update(tmp) &&
                            nft_trans_chain_name(tmp) &&
                            strcmp(name, nft_trans_chain_name(tmp)) == 0) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
                                kfree(name);
                                goto err_trans;
                        }
                }

                nft_trans_chain_name(trans) = name;
        }

        /* Hand the new hooks over to the transaction; the module reference
         * taken while parsing the hook is dropped here.
         */
        nft_trans_basechain(trans) = basechain;
        INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
        list_splice(&hook.list, &nft_trans_chain_hooks(trans));
        if (nla[NFTA_CHAIN_HOOK])
                module_put(hook.type->owner);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_trans:
        free_percpu(stats);
        kfree(trans);
err_hooks:
        /* Only a parsed NFTA_CHAIN_HOOK leaves hooks and a module reference
         * behind; hooks that were already registered are unregistered first.
         */
        if (nla[NFTA_CHAIN_HOOK]) {
                list_for_each_entry_safe(h, next, &hook.list, list) {
                        if (unregister)
                                nf_unregister_net_hook(ctx->net, &h->ops);
                        list_del(&h->list);
                        kfree_rcu(h, rcu);
                }
                module_put(hook.type->owner);
        }

        return err;
}

/* Look up a chain created earlier in the current batch by its transaction id.
 *
 * Walks the pending commit list for a NFT_MSG_NEWCHAIN transaction whose
 * chain belongs to @table, whose chain id equals the id carried in @nla and
 * whose chain is active in @genmask.
 *
 * Returns the chain, or ERR_PTR(-ENOENT) if there is no match.
 */
static struct nft_chain *nft_chain_lookup_byid(const struct net *net,
                                               const struct nft_table *table,
                                               const struct nlattr *nla, u8 genmask)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        u32 wanted = ntohl(nla_get_be32(nla));
        struct nft_trans *trans;

        list_for_each_entry(trans, &nft_net->commit_list, list) {
                struct nft_chain *chain;

                /* Only chain transactions carry a chain to inspect. */
                if (trans->msg_type != NFT_MSG_NEWCHAIN)
                        continue;

                chain = nft_trans_chain(trans);
                if (chain->table != table)
                        continue;
                if (wanted != nft_trans_chain_id(trans))
                        continue;
                if (!nft_active_genmask(chain, genmask))
                        continue;

                return chain;
        }

        return ERR_PTR(-ENOENT);
}

/* Handler for a "new chain" request: creates a chain or updates an existing
 * one.
 *
 * The target table is looked up first. The chain is then identified by
 * NFTA_CHAIN_HANDLE (must exist) or NFTA_CHAIN_NAME (may not exist yet); a
 * request with neither is only accepted if it carries NFTA_CHAIN_ID.
 *
 * NFTA_CHAIN_POLICY is only accepted for base chains (existing base chain, or
 * new chain with NFTA_CHAIN_HOOK) and must be NF_DROP or NF_ACCEPT. The
 * attribute is a 32-bit value: it is validated at full width before being
 * narrowed to the u8 policy, so that out-of-range values whose low byte
 * happens to be a valid verdict (e.g. 0x100) are rejected with -EINVAL
 * instead of being silently truncated.
 *
 * If the chain exists the request is handed to nf_tables_updchain(), unless
 * the chain is a binding chain (-EINVAL), NLM_F_EXCL is set (-EEXIST) or
 * NLM_F_REPLACE is set (-EOPNOTSUPP). Otherwise nf_tables_addchain() creates
 * it.
 *
 * Must be called with the commit mutex held (asserted below).
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_chain *chain = NULL;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        u8 policy = NF_ACCEPT;
        struct nft_ctx ctx;
        u64 handle = 0;
        u32 flags = 0;

        lockdep_assert_held(&nft_net->commit_mutex);

        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
                return PTR_ERR(table);
        }

        /* attr is the attribute blamed in extack for chain-related errors. */
        attr = nla[NFTA_CHAIN_NAME];

        if (nla[NFTA_CHAIN_HANDLE]) {
                /* Lookup by handle: the chain has to exist. */
                handle = be64_to_cpu(nla_get_be64(nla[NFTA_CHAIN_HANDLE]));
                chain = nft_chain_lookup_byhandle(table, handle, genmask);
                if (IS_ERR(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_HANDLE]);
                        return PTR_ERR(chain);
                }
                attr = nla[NFTA_CHAIN_HANDLE];
        } else if (nla[NFTA_CHAIN_NAME]) {
                /* Lookup by name: -ENOENT simply means "create it". */
                chain = nft_chain_lookup(net, table, attr, genmask);
                if (IS_ERR(chain)) {
                        if (PTR_ERR(chain) != -ENOENT) {
                                NL_SET_BAD_ATTR(extack, attr);
                                return PTR_ERR(chain);
                        }
                        chain = NULL;
                }
        } else if (!nla[NFTA_CHAIN_ID]) {
                return -EINVAL;
        }

        if (nla[NFTA_CHAIN_POLICY]) {
                u32 verdict;

                if (chain != NULL &&
                    !nft_is_base_chain(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
                        return -EOPNOTSUPP;
                }

                if (chain == NULL &&
                    nla[NFTA_CHAIN_HOOK] == NULL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_POLICY]);
                        return -EOPNOTSUPP;
                }

                /* Validate the full 32-bit value before narrowing it to the
                 * u8 policy; assigning first would let e.g. 0x100 pass as
                 * NF_DROP.
                 */
                verdict = ntohl(nla_get_be32(nla[NFTA_CHAIN_POLICY]));
                switch (verdict) {
                case NF_DROP:
                case NF_ACCEPT:
                        break;
                default:
                        return -EINVAL;
                }
                policy = verdict;
        }

        /* Without explicit flags an update keeps the chain's current flags. */
        if (nla[NFTA_CHAIN_FLAGS])
                flags = ntohl(nla_get_be32(nla[NFTA_CHAIN_FLAGS]));
        else if (chain)
                flags = chain->flags;

        if (flags & ~NFT_CHAIN_FLAGS)
                return -EOPNOTSUPP;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        if (chain != NULL) {
                if (chain->flags & NFT_CHAIN_BINDING)
                        return -EINVAL;

                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                /* Carry over the base flag of the existing chain so the flag
                 * comparison in the update path does not trip over it.
                 */
                flags |= chain->flags & NFT_CHAIN_BASE;
                return nf_tables_updchain(&ctx, genmask, policy, flags, attr,
                                          extack);
        }

        return nf_tables_addchain(&ctx, family, policy, flags, extack);
}

static int nft_delchain_hook(struct nft_ctx *ctx,
                             struct nft_base_chain *basechain,
                             struct netlink_ext_ack *extack)
{
        const struct nft_chain *chain = &basechain->chain;
        const struct nlattr * const *nla = ctx->nla;
        struct nft_chain_hook chain_hook = {};
        struct nft_hook *this, *hook;
        LIST_HEAD(chain_del_list);
        struct nft_trans *trans;
        int err;

        if (ctx->table->flags & __NFT_TABLE_F_UPDATE)
                return -EOPNOTSUPP;

        err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook,
                                   ctx->family, chain->flags, extack);
        if (err < 0)
                return err;

        list_for_each_entry(this, &chain_hook.list, list) {
                hook = nft_hook_list_find(&basechain->hook_list, this);
                if (!hook) {
                        err = -ENOENT;
                        goto err_chain_del_hook;
                }
                list_move(&hook->list, &chain_del_list);
        }

        trans = nft_trans_alloc_chain(ctx, NFT_MSG_DELCHAIN);
        if (!trans) {
                err = -ENOMEM;
                goto err_chain_del_hook;
        }

        nft_trans_basechain(trans) = basechain;
        nft_trans_chain_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_chain_hooks(trans));
        list_splice(&chain_del_list, &nft_trans_chain_hooks(trans));
        nft_chain_release_hook(&chain_hook);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_chain_del_hook:
        list_splice(&chain_del_list, &basechain->hook_list);
        nft_chain_release_hook(&chain_hook);

        return err;
}

/* Process a chain deletion request (the NFT_MSG_DESTROYCHAIN message type
 * is handled here as well).
 *
 * The chain is looked up in the next generation, either by handle
 * (NFTA_CHAIN_HANDLE) or by name (NFTA_CHAIN_NAME). When NFTA_CHAIN_HOOK is
 * given for a netdev base chain, only the listed hooks are removed via
 * nft_delchain_hook(). Otherwise every rule that is active in the next
 * generation is deleted and then the chain itself.
 *
 * Returns 0 on success or a negative errno: lookup errors, -EOPNOTSUPP for
 * chain bindings and unsupported hook requests, -EBUSY if the chain is still
 * referenced, or the error from nft_delrule()/nft_delchain().
 */
static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_chain *chain;
        struct nft_rule *rule;
        struct nft_ctx ctx;
        u64 handle;
        u32 use;
        int err;

        table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_TABLE]);
                return PTR_ERR(table);
        }

        /* The handle takes precedence over the name when both are present. */
        if (nla[NFTA_CHAIN_HANDLE]) {
                attr = nla[NFTA_CHAIN_HANDLE];
                handle = be64_to_cpu(nla_get_be64(attr));
                chain = nft_chain_lookup_byhandle(table, handle, genmask);
        } else {
                attr = nla[NFTA_CHAIN_NAME];
                chain = nft_chain_lookup(net, table, attr, genmask);
        }
        if (IS_ERR(chain)) {
                /* DESTROYCHAIN on a chain that does not exist is not an error. */
                if (PTR_ERR(chain) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(chain);
        }

        if (nft_chain_binding(chain))
                return -EOPNOTSUPP;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        if (nla[NFTA_CHAIN_HOOK]) {
                /* Hook removal is refused for DESTROYCHAIN and for chains
                 * flagged NFT_CHAIN_HW_OFFLOAD.
                 */
                if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN ||
                    chain->flags & NFT_CHAIN_HW_OFFLOAD)
                        return -EOPNOTSUPP;

                /* Only netdev base chains take the per-hook path; any other
                 * chain falls through to the whole-chain deletion below.
                 */
                if (nft_is_base_chain(chain)) {
                        struct nft_base_chain *basechain = nft_base_chain(chain);

                        if (nft_base_chain_netdev(table->family, basechain->ops.hooknum))
                                return nft_delchain_hook(&ctx, basechain, extack);
                }
        }

        /* NLM_F_NONREC: refuse the removal if the chain is in use at all. */
        if (info->nlh->nlmsg_flags & NLM_F_NONREC &&
            chain->use > 0)
                return -EBUSY;

        /* Delete every rule active in the next generation, subtracting one
         * from the local copy of the use counter for each of them.
         */
        use = chain->use;
        list_for_each_entry(rule, &chain->rules, list) {
                if (!nft_is_active_next(net, rule))
                        continue;
                use--;

                err = nft_delrule(&ctx, rule);
                if (err < 0)
                        return err;
        }

        /* There are rules and elements that are still holding references to us,
         * we cannot do a recursive removal in this case.
         */
        if (use > 0) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        return nft_delchain(&ctx);
}

/*
 * Expressions
 */

/**
 *        nft_register_expr - register nf_tables expr type
 *        @type: expr type
 *
 *        Adds @type to the nf_tables_expressions list while holding the
 *        NFNL_SUBSYS_NFTABLES lock. Family-specific types are inserted at
 *        the head of the list, NFPROTO_UNSPEC types at its tail.
 *
 *        Returns zero on success or a negative errno code otherwise
 *        (-ENOMEM when @type->maxattr exceeds NFT_EXPR_MAXATTR).
 */
int nft_register_expr(struct nft_expr_type *type)
{
        if (WARN_ON_ONCE(type->maxattr > NFT_EXPR_MAXATTR))
                return -ENOMEM;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        if (type->family != NFPROTO_UNSPEC)
                list_add_rcu(&type->list, &nf_tables_expressions);
        else
                list_add_tail_rcu(&type->list, &nf_tables_expressions);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);

        return 0;
}
EXPORT_SYMBOL_GPL(nft_register_expr);

/**
 *        nft_unregister_expr - unregister nf_tables expr type
 *        @type: expr type
 *
 *        Unregisters the expr type for use with nf_tables: the type is
 *        removed from the nf_tables_expressions list while holding the
 *        NFNL_SUBSYS_NFTABLES lock.
 */
void nft_unregister_expr(struct nft_expr_type *type)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_del_rcu(&type->list);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_expr);

/* Find the registered expression type whose name matches @nla.
 *
 * A type registered for exactly @family always wins. A type with family 0
 * is only remembered as long as nothing else has been picked yet. Returns
 * NULL if no type matches. The list is walked with the RCU iterator.
 */
static const struct nft_expr_type *__nft_expr_type_get(u8 family,
                                                       struct nlattr *nla)
{
        const struct nft_expr_type *match = NULL;
        const struct nft_expr_type *cur;

        list_for_each_entry_rcu(cur, &nf_tables_expressions, list) {
                if (nla_strcmp(nla, cur->name) != 0)
                        continue;

                if ((!cur->family && !match) || cur->family == family)
                        match = cur;
        }

        return match;
}

#ifdef CONFIG_MODULES
/* Request the family-specific module "nft-expr-<family>-<name>" for the
 * expression named by @nla.
 *
 * Returns -EAGAIN if nft_request_module() reported -EAGAIN; any other
 * outcome is folded into 0.
 */
static int nft_expr_type_request_module(struct net *net, u8 family,
                                        struct nlattr *nla)
{
        int ret;

        ret = nft_request_module(net, "nft-expr-%u-%.*s", family,
                                 nla_len(nla), (char *)nla_data(nla));

        return ret == -EAGAIN ? -EAGAIN : 0;
}
#endif

/* Resolve an expression type by name and take a reference on its module.
 *
 * Returns the type with a module reference held on success. Otherwise:
 * ERR_PTR(-EINVAL) if @nla is NULL, ERR_PTR(-EAGAIN) if a module request
 * reported -EAGAIN, and ERR_PTR(-ENOENT) in every other case (including a
 * known type whose module reference could not be taken).
 */
static const struct nft_expr_type *nft_expr_type_get(struct net *net,
                                                     u8 family,
                                                     struct nlattr *nla)
{
        const struct nft_expr_type *type;
        bool pinned;

        if (!nla)
                return ERR_PTR(-EINVAL);

        rcu_read_lock();
        type = __nft_expr_type_get(family, nla);
        pinned = type && try_module_get(type->owner);
        rcu_read_unlock();

        if (pinned)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        /* Unknown type: try the family-specific module first, then the
         * generic one.
         */
        if (!type) {
                if (nft_expr_type_request_module(net, family, nla) == -EAGAIN)
                        return ERR_PTR(-EAGAIN);

                if (nft_request_module(net, "nft-expr-%.*s",
                                       nla_len(nla),
                                       (char *)nla_data(nla)) == -EAGAIN)
                        return ERR_PTR(-EAGAIN);
        }
#endif
        return ERR_PTR(-ENOENT);
}

/* Netlink attribute policy for a single expression: a name string limited
 * to NFT_MODULE_AUTOLOAD_LIMIT bytes and a nested attribute carrying the
 * expression-specific data.
 */
static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = {
        [NFTA_EXPR_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_MODULE_AUTOLOAD_LIMIT },
        [NFTA_EXPR_DATA]        = { .type = NLA_NESTED },
};

/* Emit the attributes describing one expression into @skb.
 *
 * NFTA_EXPR_NAME is always written. If the expression provides a ->dump()
 * callback, its output is wrapped in a nested NFTA_EXPR_DATA attribute;
 * @reset is passed through to that callback.
 *
 * Returns skb->len on success, or -1 if adding an attribute or the
 * ->dump() callback fails.
 */
static int nf_tables_fill_expr_info(struct sk_buff *skb,
                                    const struct nft_expr *expr, bool reset)
{
        struct nlattr *data;

        if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name))
                return -1;

        if (expr->ops->dump) {
                data = nla_nest_start_noflag(skb, NFTA_EXPR_DATA);
                if (data == NULL)
                        return -1;
                if (expr->ops->dump(skb, expr, reset) < 0)
                        return -1;
                nla_nest_end(skb, data);
        }

        return skb->len;
}

/* Dump @expr into @skb as a nested attribute of type @attr.
 *
 * Returns 0 on success, or -1 if the nest cannot be opened or filling in
 * the expression information fails.
 */
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
                  const struct nft_expr *expr, bool reset)
{
        struct nlattr *container = nla_nest_start_noflag(skb, attr);

        if (container == NULL)
                return -1;

        if (nf_tables_fill_expr_info(skb, expr, reset) < 0)
                return -1;

        nla_nest_end(skb, container);
        return 0;
}

/* Result of parsing one expression attribute.
 *
 * @ops:  operations selected for the expression
 * @attr: the netlink attribute the expression was parsed from
 * @tb:   the expression-specific attributes, parsed with the type's policy;
 *        sized for the largest maxattr a type may register
 */
struct nft_expr_info {
        const struct nft_expr_ops        *ops;
        const struct nlattr                *attr;
        struct nlattr                        *tb[NFT_EXPR_MAXATTR + 1];
};

/* Parse one expression attribute into @info.
 *
 * Resolves the expression type named by NFTA_EXPR_NAME, parses
 * NFTA_EXPR_DATA (if present) with the type's policy into info->tb, and
 * picks the operations via ->select_ops() or the type's default ops.
 *
 * On success, returns 0 with the module reference obtained by
 * nft_expr_type_get() still held. On failure, returns a negative errno and
 * that reference has been dropped again.
 */
static int nf_tables_expr_parse(const struct nft_ctx *ctx,
                                const struct nlattr *nla,
                                struct nft_expr_info *info)
{
        const struct nft_expr_type *type;
        const struct nft_expr_ops *ops;
        struct nlattr *tb[NFTA_EXPR_MAX + 1];
        int err;

        err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
                                          nft_expr_policy, NULL);
        if (err < 0)
                return err;

        type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
        if (IS_ERR(type))
                return PTR_ERR(type);

        if (tb[NFTA_EXPR_DATA]) {
                err = nla_parse_nested_deprecated(info->tb, type->maxattr,
                                                  tb[NFTA_EXPR_DATA],
                                                  type->policy, NULL);
                if (err < 0)
                        goto err1;
        } else
                /* No data given: present an all-NULL attribute table. */
                memset(info->tb, 0, sizeof(info->tb[0]) * (type->maxattr + 1));

        if (type->select_ops != NULL) {
                ops = type->select_ops(ctx,
                                       (const struct nlattr * const *)info->tb);
                if (IS_ERR(ops)) {
                        err = PTR_ERR(ops);
#ifdef CONFIG_MODULES
                        /* -EAGAIN from ->select_ops() triggers a module
                         * request; unless that request itself reports
                         * -EAGAIN, the error becomes -ENOENT.
                         */
                        if (err == -EAGAIN)
                                if (nft_expr_type_request_module(ctx->net,
                                                                 ctx->family,
                                                                 tb[NFTA_EXPR_NAME]) != -EAGAIN)
                                        err = -ENOENT;
#endif
                        goto err1;
                }
        } else
                ops = type->ops;

        info->attr = nla;
        info->ops = ops;

        return 0;

err1:
        module_put(type->owner);
        return err;
}

/* Parse an expression attribute for use as an inner expression.
 *
 * Both NFTA_EXPR_NAME and NFTA_EXPR_DATA are mandatory, and the resolved
 * type must provide ->inner_ops. On success @info is filled in and 0 is
 * returned. Errors are -EINVAL (missing attribute), -ENOENT (unknown
 * type), -EOPNOTSUPP (no inner ops) or the attribute parser's error.
 */
int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
                         struct nft_expr_info *info)
{
        struct nlattr *tb[NFTA_EXPR_MAX + 1];
        const struct nft_expr_type *type;
        int err;

        err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla,
                                          nft_expr_policy, NULL);
        if (err < 0)
                return err;

        if (!tb[NFTA_EXPR_NAME] || !tb[NFTA_EXPR_DATA])
                return -EINVAL;

        rcu_read_lock();

        type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
        if (!type) {
                err = -ENOENT;
        } else if (!type->inner_ops) {
                err = -EOPNOTSUPP;
        } else {
                err = nla_parse_nested_deprecated(info->tb, type->maxattr,
                                                  tb[NFTA_EXPR_DATA],
                                                  type->policy, NULL);
                if (err >= 0) {
                        /* No module reference will be taken on type->owner.
                         * Presence of type->inner_ops implies that the
                         * expression is builtin, so it cannot go away.
                         */
                        info->attr = nla;
                        info->ops = type->inner_ops;
                        err = 0;
                }
        }

        rcu_read_unlock();
        return err;
}

/* Bind @expr to the parsed operations and run their ->init() callback.
 *
 * Returns 0 on success. If ->init() fails, expr->ops is reset to NULL and
 * the callback's negative error is returned.
 */
static int nf_tables_newexpr(const struct nft_ctx *ctx,
                             const struct nft_expr_info *expr_info,
                             struct nft_expr *expr)
{
        const struct nft_expr_ops *ops = expr_info->ops;
        int ret;

        expr->ops = ops;
        if (!ops->init)
                return 0;

        ret = ops->init(ctx, expr, (const struct nlattr **)expr_info->tb);
        if (ret < 0) {
                expr->ops = NULL;
                return ret;
        }

        return 0;
}

static void nf_tables_expr_destroy(const struct nft_ctx *ctx,
                                   struct nft_expr *expr)
{
        const struct nft_expr_type *type = expr->ops->type;

        if (expr->ops->destroy)
                expr->ops->destroy(ctx, expr);
        module_put(type->owner);
}

/* Allocate and initialise a standalone expression from @nla.
 *
 * Only expression types flagged NFT_EXPR_STATEFUL are accepted. The
 * expression is allocated with kzalloc() using the size announced by its
 * ops and then initialised through nf_tables_newexpr().
 *
 * Returns the new expression, or an ERR_PTR(): the parse error,
 * -EOPNOTSUPP for non-stateful types, -ENOMEM on allocation failure, or
 * the error from nf_tables_newexpr().
 */
static struct nft_expr *nft_expr_init(const struct nft_ctx *ctx,
                                      const struct nlattr *nla)
{
        struct nft_expr_info expr_info;
        struct nft_expr *expr;
        struct module *owner;
        int err;

        err = nf_tables_expr_parse(ctx, nla, &expr_info);
        if (err < 0)
                goto err_expr_parse;

        err = -EOPNOTSUPP;
        if (!(expr_info.ops->type->flags & NFT_EXPR_STATEFUL))
                goto err_expr_stateful;

        err = -ENOMEM;
        expr = kzalloc(expr_info.ops->size, GFP_KERNEL_ACCOUNT);
        if (expr == NULL)
                goto err_expr_stateful;

        err = nf_tables_newexpr(ctx, &expr_info, expr);
        if (err < 0)
                goto err_expr_new;

        return expr;
err_expr_new:
        kfree(expr);
err_expr_stateful:
        /* The owner is read before ->release_ops() runs.
         * NOTE(review): presumably because ->release_ops() may free the
         * ops - confirm against the implementations.
         */
        owner = expr_info.ops->type->owner;
        if (expr_info.ops->type->release_ops)
                expr_info.ops->type->release_ops(expr_info.ops);

        module_put(owner);
err_expr_parse:
        return ERR_PTR(err);
}

/* Clone @src into @dst through the expression's ->clone() callback.
 *
 * dst->ops is set before the callback runs. On success an additional
 * module reference is taken on the expression type's owner and 0 is
 * returned. Returns -EINVAL (with a one-time warning) if the expression
 * has no ->clone() callback, or the callback's negative error.
 */
int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
{
        const struct nft_expr_ops *ops = src->ops;
        int ret;

        if (WARN_ON_ONCE(!ops->clone))
                return -EINVAL;

        dst->ops = ops;
        ret = ops->clone(dst, src, gfp);
        if (ret < 0)
                return ret;

        __module_get(ops->type->owner);

        return 0;
}

/* Tear down an expression via nf_tables_expr_destroy() and then kfree()
 * the memory holding it.
 */
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr)
{
        nf_tables_expr_destroy(ctx, expr);
        kfree(expr);
}

/*
 * Rules
 */

/* Find the rule with the given @handle in @chain.
 *
 * Returns the rule, or ERR_PTR(-ENOENT) if the chain has no rule with that
 * handle. The list is walked with the RCU iterator, with the commit lock
 * accepted as an alternative lockdep condition.
 */
static struct nft_rule *__nft_rule_lookup(const struct net *net,
                                          const struct nft_chain *chain,
                                          u64 handle)
{
        struct nft_rule *cur;

        /* FIXME: linear scan over the whole chain for every lookup. */
        list_for_each_entry_rcu(cur, &chain->rules, list,
                                lockdep_commit_lock_is_held(net)) {
                if (cur->handle == handle)
                        return cur;
        }

        return ERR_PTR(-ENOENT);
}

static struct nft_rule *nft_rule_lookup(const struct net *net,
                                        const struct nft_chain *chain,
                                        const struct nlattr *nla)
{
        if (nla == NULL)
                return ERR_PTR(-EINVAL);

        return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla)));
}

/* Netlink attribute policy for rule messages. Table and chain names are
 * length-limited strings, the expression list is a nested array validated
 * against nft_expr_policy, and user data is a binary blob of at most
 * NFT_USERDATA_MAXLEN bytes.
 */
static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = {
        [NFTA_RULE_TABLE]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_RULE_CHAIN]        = { .type = NLA_STRING,
                                    .len = NFT_CHAIN_MAXNAMELEN - 1 },
        [NFTA_RULE_HANDLE]        = { .type = NLA_U64 },
        [NFTA_RULE_EXPRESSIONS]        = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
        [NFTA_RULE_COMPAT]        = { .type = NLA_NESTED },
        [NFTA_RULE_POSITION]        = { .type = NLA_U64 },
        [NFTA_RULE_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN },
        [NFTA_RULE_ID]                = { .type = NLA_U32 },
        [NFTA_RULE_POSITION_ID]        = { .type = NLA_U32 },
        [NFTA_RULE_CHAIN_ID]        = { .type = NLA_U32 },
};

/* Build one netlink message describing @rule in @skb.
 *
 * The message carries the table name, chain name and rule handle, an
 * optional NFTA_RULE_POSITION, the nested list of expressions and, if
 * present, the rule's user data.
 *
 * @handle: when non-zero and @event is not NFT_MSG_DELRULE, emitted as
 *          NFTA_RULE_POSITION (callers in this file pass the handle of
 *          the preceding rule)
 * @reset:  passed through to nft_expr_dump() for every expression
 *
 * Returns 0 on success, or -1 after trimming the partial message if
 * anything could not be added.
 */
static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
                                    u32 portid, u32 seq, int event,
                                    u32 flags, int family,
                                    const struct nft_table *table,
                                    const struct nft_chain *chain,
                                    const struct nft_rule *rule, u64 handle,
                                    bool reset)
{
        struct nlmsghdr *nlh;
        const struct nft_expr *expr, *next;
        struct nlattr *list;
        u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);

        nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
                           nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_RULE_TABLE, table->name))
                goto nla_put_failure;
        if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name))
                goto nla_put_failure;
        if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle),
                         NFTA_RULE_PAD))
                goto nla_put_failure;

        if (event != NFT_MSG_DELRULE && handle) {
                if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(handle),
                                 NFTA_RULE_PAD))
                        goto nla_put_failure;
        }

        /* Hardware-offloaded chains: nft_flow_rule_stats() is called before
         * the expressions are dumped.
         */
        if (chain->flags & NFT_CHAIN_HW_OFFLOAD)
                nft_flow_rule_stats(chain, rule);

        list = nla_nest_start_noflag(skb, NFTA_RULE_EXPRESSIONS);
        if (list == NULL)
                goto nla_put_failure;
        nft_rule_for_each_expr(expr, next, rule) {
                if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
                        goto nla_put_failure;
        }
        nla_nest_end(skb, list);

        if (rule->udata) {
                struct nft_userdata *udata = nft_userdata(rule);
                /* NOTE(review): len + 1 suggests udata->len stores the
                 * payload size minus one - confirm where udata is filled.
                 */
                if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1,
                            udata->data) < 0)
                        goto nla_put_failure;
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

static void nf_tables_rule_notify(const struct nft_ctx *ctx,
                                  const struct nft_rule *rule, int event)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        const struct nft_rule *prule;
        struct sk_buff *skb;
        u64 handle = 0;
        u16 flags = 0;
        int err;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL)
                goto err;

        if (event == NFT_MSG_NEWRULE &&
            !list_is_first(&rule->list, &ctx->chain->rules) &&
            !list_is_last(&rule->list, &ctx->chain->rules)) {
                prule = list_prev_entry(rule, list);
                handle = prule->handle;
        }
        if (ctx->flags & (NLM_F_APPEND | NLM_F_REPLACE))
                flags |= NLM_F_APPEND;
        if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
                flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq,
                                       event, flags, ctx->family, ctx->table,
                                       ctx->chain, rule, handle, false);
        if (err < 0) {
                kfree_skb(skb);
                goto err;
        }

        nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
        return;
err:
        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/* Emit an AUDIT_NFT_OP_RULE_RESET audit record for @nentries rules of
 * @table. The record name is formatted as "<table name>:<base_seq>".
 *
 * NOTE(review): the kasprintf() result is passed on without a NULL check;
 * confirm audit_log_nfcfg() tolerates a NULL name.
 */
static void audit_log_rule_reset(const struct nft_table *table,
                                 unsigned int base_seq,
                                 unsigned int nentries)
{
        char *buf = kasprintf(GFP_ATOMIC, "%s:%u",
                              table->name, base_seq);

        audit_log_nfcfg(buf, table->family, nentries,
                        AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
        kfree(buf);
}

/* Per-dump state kept in the netlink callback's ctx area.
 *
 * @s_idx: rule index saved at the end of a dump pass; rules with a lower
 *         index are not emitted again on the next pass
 * @table: optional table name filter (allocated copy), or NULL
 * @chain: optional chain name filter (allocated copy), or NULL
 * @reset: passed to nf_tables_fill_rule_info() as its reset argument
 */
struct nft_rule_dump_ctx {
        unsigned int s_idx;
        char *table;
        char *chain;
        bool reset;
};

/* Dump the rules of one chain into @skb.
 *
 * @idx counts every rule visited across chains, active or not; rules
 * whose index is below ctx->s_idx are skipped. For each emitted rule the
 * handle of the previous active rule (or 0) is passed along as position.
 *
 * Returns 1 if a rule could not be added to @skb, so the caller should
 * stop, and 0 otherwise. When ctx->reset is set and at least one rule was
 * emitted, an audit record is logged.
 */
static int __nf_tables_dump_rules(struct sk_buff *skb,
                                  unsigned int *idx,
                                  struct netlink_callback *cb,
                                  const struct nft_table *table,
                                  const struct nft_chain *chain)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        const struct nft_rule *rule, *prule;
        unsigned int entries = 0;
        int ret = 0;
        u64 handle;

        prule = NULL;
        list_for_each_entry_rcu(rule, &chain->rules, list) {
                /* Inactive rules still advance *idx but never become prule. */
                if (!nft_is_active(net, rule))
                        goto cont_skip;
                if (*idx < ctx->s_idx)
                        goto cont;
                if (prule)
                        handle = prule->handle;
                else
                        handle = 0;

                if (nf_tables_fill_rule_info(skb, net, NETLINK_CB(cb->skb).portid,
                                        cb->nlh->nlmsg_seq,
                                        NFT_MSG_NEWRULE,
                                        NLM_F_MULTI | NLM_F_APPEND,
                                        table->family,
                                        table, chain, rule, handle, ctx->reset) < 0) {
                        /* The rule was not emitted; *idx is left pointing at it. */
                        ret = 1;
                        break;
                }
                entries++;
                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                prule = rule;
cont_skip:
                (*idx)++;
        }

        if (ctx->reset && entries)
                audit_log_rule_reset(table, cb->seq, entries);

        return ret;
}

/* Netlink dump callback for rules.
 *
 * Walks the tables of the socket's network namespace under the RCU read
 * lock, filtered by the requested family and by the optional table/chain
 * names stored in the dump context. The index reached is saved in
 * ctx->s_idx.
 *
 * Returns skb->len.
 */
static int nf_tables_dump_rules(struct sk_buff *skb,
                                struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
        struct nft_table *table;
        const struct nft_chain *chain;
        unsigned int idx = 0;
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nftables_pernet *nft_net;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                if (ctx->table && strcmp(ctx->table, table->name) != 0)
                        continue;

                /* Table and chain given: look the chain up by name in the
                 * hash table and dump only the first active match.
                 */
                if (ctx->table && ctx->chain) {
                        struct rhlist_head *list, *tmp;

                        list = rhltable_lookup(&table->chains_ht, ctx->chain,
                                               nft_chain_ht_params);
                        if (!list)
                                goto done;

                        rhl_for_each_entry_rcu(chain, tmp, list, rhlhead) {
                                if (!nft_is_active(net, chain))
                                        continue;
                                __nf_tables_dump_rules(skb, &idx,
                                                       cb, table, chain);
                                break;
                        }
                        goto done;
                }

                /* Otherwise dump every chain; stop once the skb is full. */
                list_for_each_entry_rcu(chain, &table->chains, list) {
                        if (__nf_tables_dump_rules(skb, &idx,
                                                   cb, table, chain))
                                goto done;
                }

                /* A table filter was given and this table matched: stop. */
                if (ctx->table)
                        break;
        }
done:
        rcu_read_unlock();

        ctx->s_idx = idx;
        return skb->len;
}

/* Dump callback for the dump-and-reset variant: runs nf_tables_dump_rules()
 * with the per-netns commit_mutex held and returns its result.
 */
static int nf_tables_dumpreset_rules(struct sk_buff *skb,
                                     struct netlink_callback *cb)
{
        struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
        int ret;

        /* The mutex is held to prevent two concurrent dump-and-reset calls
         * from underrunning counters and quotas. The commit_mutex is used
         * for lack of a better lock; this is not the transaction path.
         */
        mutex_lock(&nft_net->commit_mutex);
        ret = nf_tables_dump_rules(skb, cb);
        mutex_unlock(&nft_net->commit_mutex);

        return ret;
}

/* Dump start callback: copy the optional table and chain name filters
 * from the request attributes into the dump context.
 *
 * Returns 0 on success or -ENOMEM if a name cannot be duplicated; in the
 * latter case nothing allocated here is left behind.
 */
static int nf_tables_dump_rules_start(struct netlink_callback *cb)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;
        const struct nlattr * const *nla = cb->data;
        const struct nlattr *table_attr = nla[NFTA_RULE_TABLE];
        const struct nlattr *chain_attr = nla[NFTA_RULE_CHAIN];

        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

        if (table_attr) {
                ctx->table = nla_strdup(table_attr, GFP_ATOMIC);
                if (!ctx->table)
                        return -ENOMEM;
        }

        if (!chain_attr)
                return 0;

        ctx->chain = nla_strdup(chain_attr, GFP_ATOMIC);
        if (!ctx->chain) {
                kfree(ctx->table);
                return -ENOMEM;
        }

        return 0;
}

/* Dump start callback for the reset variant: flags the dump context for
 * reset and then performs the regular start-up via
 * nf_tables_dump_rules_start().
 */
static int nf_tables_dumpreset_rules_start(struct netlink_callback *cb)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;

        ctx->reset = true;

        return nf_tables_dump_rules_start(cb);
}

/* Dump done callback: free the table and chain name copies held in the
 * dump context. Always returns 0.
 */
static int nf_tables_dump_rules_done(struct netlink_callback *cb)
{
        struct nft_rule_dump_ctx *ctx = (void *)cb->ctx;

        kfree(ctx->table);
        kfree(ctx->chain);
        return 0;
}

/* Build the reply message for a single-rule get request.
 *
 * Looks up table, chain and rule (by NFTA_RULE_HANDLE) in the current
 * generation and fills a freshly allocated skb with an NFT_MSG_NEWRULE
 * message for it. @reset is passed on to nf_tables_fill_rule_info().
 *
 * Returns the skb on success or an ERR_PTR(): the lookup error (with the
 * offending attribute recorded in extack), -ENOMEM if the skb cannot be
 * allocated, or the fill error.
 *
 * Caller must hold rcu read lock or transaction mutex
 */
static struct sk_buff *
nf_tables_getrule_single(u32 portid, const struct nfnl_info *info,
                         const struct nlattr * const nla[], bool reset)
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_chain *chain;
        const struct nft_rule *rule;
        struct net *net = info->net;
        struct nft_table *table;
        struct sk_buff *skb2;
        int err;

        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
                return ERR_CAST(table);
        }

        chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
        if (IS_ERR(chain)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
                return ERR_CAST(chain);
        }

        rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]);
        if (IS_ERR(rule)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                return ERR_CAST(rule);
        }

        skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!skb2)
                return ERR_PTR(-ENOMEM);

        err = nf_tables_fill_rule_info(skb2, net, portid,
                                       info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0,
                                       family, table, chain, rule, 0, reset);
        if (err < 0) {
                kfree_skb(skb2);
                return ERR_PTR(err);
        }

        return skb2;
}

/* Handle a rule get request.
 *
 * With NLM_F_DUMP set, a netlink dump over the rules is started.
 * Otherwise a single rule is looked up and unicast back to the requesting
 * port. Returns the result of the dump start or unicast, or the error
 * from building the single-rule reply.
 */
static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nla[])
{
        struct netlink_dump_control c = {
                .start = nf_tables_dump_rules_start,
                .dump = nf_tables_dump_rules,
                .done = nf_tables_dump_rules_done,
                .module = THIS_MODULE,
                .data = (void *)nla,
        };
        u32 portid = NETLINK_CB(skb).portid;
        struct sk_buff *reply;

        if (!(info->nlh->nlmsg_flags & NLM_F_DUMP)) {
                reply = nf_tables_getrule_single(portid, info, nla, false);
                if (IS_ERR(reply))
                        return PTR_ERR(reply);

                return nfnetlink_unicast(reply, info->net, portid);
        }

        return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
}

/* Handle a rule get-and-reset request.
 *
 * With NLM_F_DUMP set, a dump using the reset-aware callbacks is started.
 * Otherwise the single rule is fetched with reset enabled while holding
 * the commit_mutex, an AUDIT_NFT_OP_RULE_RESET record is logged and the
 * reply is unicast to the requesting port.
 *
 * The function drops the RCU read lock around the mutex-protected section
 * and re-acquires it before returning; a reference on THIS_MODULE is held
 * across that window.
 *
 * Returns the result of the dump start or unicast, -EINVAL if the module
 * reference cannot be taken, or the error from building the reply.
 */
static int nf_tables_getrule_reset(struct sk_buff *skb,
                                   const struct nfnl_info *info,
                                   const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        u32 portid = NETLINK_CB(skb).portid;
        struct net *net = info->net;
        struct sk_buff *skb2;
        char *buf;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dumpreset_rules_start,
                        .dump = nf_tables_dumpreset_rules,
                        .done = nf_tables_dump_rules_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!try_module_get(THIS_MODULE))
                return -EINVAL;
        rcu_read_unlock();
        mutex_lock(&nft_net->commit_mutex);
        skb2 = nf_tables_getrule_single(portid, info, nla, true);
        mutex_unlock(&nft_net->commit_mutex);
        rcu_read_lock();
        module_put(THIS_MODULE);

        if (IS_ERR(skb2))
                return PTR_ERR(skb2);

        /* commit_mutex is no longer held here, so read base_seq with
         * READ_ONCE() like nf_tables_dump_rules() does.
         */
        buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
                        nla_len(nla[NFTA_RULE_TABLE]),
                        (char *)nla_data(nla[NFTA_RULE_TABLE]),
                        READ_ONCE(nft_net->base_seq));
        audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
                        AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
        kfree(buf);

        return nfnetlink_unicast(skb2, net, portid);
}

/* Destroy every expression of @rule and free the rule itself. */
void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule)
{
        struct nft_expr *cur, *following;

        /*
         * Careful: some expressions might not be initialized in case this
         * is called on error from nf_tables_newrule().
         *
         * The successor is fetched before the current expression is
         * destroyed.
         */
        for (cur = nft_expr_first(rule); nft_expr_more(rule, cur);
             cur = following) {
                following = nft_expr_next(cur);
                nf_tables_expr_destroy(ctx, cur);
        }

        kfree(rule);
}

/* Deactivate the rule's expressions with phase NFT_TRANS_RELEASE and then
 * destroy the rule.
 *
 * can only be used if rule is no longer visible to dumps
 */
static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule)
{
        /* NOTE(review): the result of this lockdep query is discarded, so
         * the call asserts nothing as written - confirm whether it was
         * meant to be wrapped in a WARN/lockdep assertion.
         */
        lockdep_commit_lock_is_held(ctx->net);

        nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE);
        nf_tables_rule_destroy(ctx, rule);
}

/** nft_chain_validate - loop detection and hook validation
 *
 * @ctx: context containing call depth and base chain
 * @chain: chain to validate
 *
 * Walk through the rules of the given chain and chase all jumps/gotos
 * and set lookups until either the jump limit is hit or all reachable
 * chains have been validated.
 */
int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
{
        struct nft_expr *expr, *last;
        struct nft_rule *rule;
        int err;

        if (ctx->level == NFT_JUMP_STACK_SIZE)
                return -EMLINK;

        list_for_each_entry(rule, &chain->rules, list) {
                if (fatal_signal_pending(current))
                        return -EINTR;

                if (!nft_is_active_next(ctx->net, rule))
                        continue;

                nft_rule_for_each_expr(expr, last, rule) {
                        if (!expr->ops->validate)
                                continue;

                        /* This may call nft_chain_validate() recursively,
                         * callers that do so must increment ctx->level.
                         */
                        err = expr->ops->validate(ctx, expr);
                        if (err < 0)
                                return err;
                }
        }

        return 0;
}
EXPORT_SYMBOL_GPL(nft_chain_validate);

/* Validate every base chain of @table with nft_chain_validate().
 *
 * Non-base chains are skipped here. Returns 0 if all base chains validate,
 * otherwise the first negative error. The loop yields the CPU after each
 * validated base chain.
 */
static int nft_table_validate(struct net *net, const struct nft_table *table)
{
        struct nft_ctx ctx = {
                .net        = net,
                .family        = table->family,
        };
        struct nft_chain *chain;
        int ret;

        list_for_each_entry(chain, &table->chains, list) {
                if (nft_is_base_chain(chain)) {
                        ctx.chain = chain;
                        ret = nft_chain_validate(&ctx, chain);
                        if (ret < 0)
                                return ret;

                        cond_resched();
                }
        }

        return 0;
}

/* Validate the chain referenced by a set element's verdict data.
 *
 * Elements that are inactive for iter->genmask, or flagged as
 * NFT_SET_ELEM_INTERVAL_END, are accepted without further checks.  For
 * NFT_JUMP/NFT_GOTO verdicts the target chain is validated one level deeper.
 *
 * Returns 0 on success or the negative error from nft_chain_validate().
 */
int nft_setelem_validate(const struct nft_ctx *ctx, struct nft_set *set,
                         const struct nft_set_iter *iter,
                         struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        /* Writable alias of @ctx: the call depth is tracked in ctx->level. */
        struct nft_ctx *mctx = (struct nft_ctx *)ctx;
        const struct nft_data *vdata;
        int ret;

        if (!nft_set_elem_active(ext, iter->genmask))
                return 0;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS)) {
                if (*nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
                        return 0;
        }

        vdata = nft_set_ext_data(ext);
        if (vdata->verdict.code != NFT_JUMP &&
            vdata->verdict.code != NFT_GOTO)
                return 0;

        mctx->level++;
        ret = nft_chain_validate(ctx, vdata->verdict.chain);
        if (ret < 0)
                return ret;
        mctx->level--;

        return 0;
}

int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set)
{
        struct nft_set_iter dummy_iter = {
                .genmask        = nft_genmask_next(ctx->net),
        };
        struct nft_set_elem_catchall *catchall;

        struct nft_set_ext *ext;
        int ret = 0;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list,
                                lockdep_commit_lock_is_held(ctx->net)) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (!nft_set_elem_active(ext, dummy_iter.genmask))
                        continue;

                ret = nft_setelem_validate(ctx, set, &dummy_iter, catchall->elem);
                if (ret < 0)
                        return ret;
        }

        return ret;
}

/* Forward declaration: nf_tables_newrule() below uses this to resolve
 * NFTA_RULE_POSITION_ID before the function body is defined.
 */
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
                                             const struct nft_chain *chain,
                                             const struct nlattr *nla);

/* Upper bound on the number of expressions nf_tables_newrule() parses for a
 * single rule; it also sizes the temporary expr_info array.
 */
#define NFT_RULE_MAXEXPRS        128

/* Handle a request to add, insert or replace a rule.
 *
 * Looks up the table and chain named by the attributes, parses the nested
 * expressions, allocates and initialises the rule, registers it in a
 * NFT_MSG_NEWRULE transaction and links it into the chain's rule list.
 * Must be called with the pernet commit_mutex held.
 *
 * Returns 0 on success, the result of nft_table_validate() when the table's
 * validate_state is NFT_VALIDATE_DO, or a negative errno on failure.
 */
static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        unsigned int size, i, n, ulen = 0, usize = 0;
        u8 genmask = nft_genmask_next(info->net);
        struct nft_rule *rule, *old_rule = NULL;
        struct nft_expr_info *expr_info = NULL;
        u8 family = info->nfmsg->nfgen_family;
        struct nft_flow_rule *flow = NULL;
        struct net *net = info->net;
        struct nft_userdata *udata;
        struct nft_table *table;
        struct nft_chain *chain;
        struct nft_trans *trans;
        u64 handle, pos_handle;
        struct nft_expr *expr;
        struct nft_ctx ctx;
        struct nlattr *tmp;
        int err, rem;

        lockdep_assert_held(&nft_net->commit_mutex);

        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
                return PTR_ERR(table);
        }

        /* The chain is identified either by name or by transaction id. */
        if (nla[NFTA_RULE_CHAIN]) {
                chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
                                         genmask);
                if (IS_ERR(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
                        return PTR_ERR(chain);
                }

        } else if (nla[NFTA_RULE_CHAIN_ID]) {
                chain = nft_chain_lookup_byid(net, table, nla[NFTA_RULE_CHAIN_ID],
                                              genmask);
                if (IS_ERR(chain)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN_ID]);
                        return PTR_ERR(chain);
                }
        } else {
                return -EINVAL;
        }

        if (nft_chain_is_bound(chain))
                return -EOPNOTSUPP;

        if (nla[NFTA_RULE_HANDLE]) {
                /* An explicit handle is only accepted for NLM_F_REPLACE:
                 * NLM_F_EXCL yields -EEXIST, anything else -EOPNOTSUPP.
                 */
                handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE]));
                rule = __nft_rule_lookup(net, chain, handle);
                if (IS_ERR(rule)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                        return PTR_ERR(rule);
                }

                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        old_rule = rule;
                else
                        return -EOPNOTSUPP;
        } else {
                /* No handle given: NLM_F_CREATE is required and
                 * NLM_F_REPLACE is rejected.
                 */
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE) ||
                    info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EINVAL;
                handle = nf_tables_alloc_handle(table);

                /* Optional anchor rule for positioning, given either by
                 * handle or by the id of a rule found on the commit list.
                 */
                if (nla[NFTA_RULE_POSITION]) {
                        pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION]));
                        old_rule = __nft_rule_lookup(net, chain, pos_handle);
                        if (IS_ERR(old_rule)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]);
                                return PTR_ERR(old_rule);
                        }
                } else if (nla[NFTA_RULE_POSITION_ID]) {
                        old_rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_POSITION_ID]);
                        if (IS_ERR(old_rule)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]);
                                return PTR_ERR(old_rule);
                        }
                }
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        /* Parse up to NFT_RULE_MAXEXPRS expressions into expr_info[] and
         * accumulate the total size they need inside the rule.
         */
        n = 0;
        size = 0;
        if (nla[NFTA_RULE_EXPRESSIONS]) {
                expr_info = kvmalloc_array(NFT_RULE_MAXEXPRS,
                                           sizeof(struct nft_expr_info),
                                           GFP_KERNEL);
                if (!expr_info)
                        return -ENOMEM;

                nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
                        err = -EINVAL;
                        if (nla_type(tmp) != NFTA_LIST_ELEM)
                                goto err_release_expr;
                        if (n == NFT_RULE_MAXEXPRS)
                                goto err_release_expr;
                        err = nf_tables_expr_parse(&ctx, tmp, &expr_info[n]);
                        if (err < 0) {
                                NL_SET_BAD_ATTR(extack, tmp);
                                goto err_release_expr;
                        }
                        size += expr_info[n].ops->size;
                        n++;
                }
        }
        /* Check for overflow of dlen field */
        err = -EFBIG;
        if (size >= 1 << 12)
                goto err_release_expr;

        if (nla[NFTA_RULE_USERDATA]) {
                ulen = nla_len(nla[NFTA_RULE_USERDATA]);
                if (ulen > 0)
                        usize = sizeof(struct nft_userdata) + ulen;
        }

        /* A single allocation holds the rule header, the expression area
         * and the optional userdata.
         */
        err = -ENOMEM;
        rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL_ACCOUNT);
        if (rule == NULL)
                goto err_release_expr;

        nft_activate_next(net, rule);

        rule->handle = handle;
        rule->dlen   = size;
        rule->udata  = ulen ? 1 : 0;

        if (ulen) {
                udata = nft_userdata(rule);
                /* The length field stores the userdata length minus one. */
                udata->len = ulen - 1;
                nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen);
        }

        /* Instantiate each parsed expression in place inside the rule. */
        expr = nft_expr_first(rule);
        for (i = 0; i < n; i++) {
                err = nf_tables_newexpr(&ctx, &expr_info[i], expr);
                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, expr_info[i].attr);
                        goto err_release_rule;
                }

                if (expr_info[i].ops->validate)
                        nft_validate_state_update(table, NFT_VALIDATE_NEED);

                /* Clear ops so the err_release_expr loop skips this entry. */
                expr_info[i].ops = NULL;
                expr = nft_expr_next(expr);
        }

        if (chain->flags & NFT_CHAIN_HW_OFFLOAD) {
                flow = nft_flow_rule_create(net, rule);
                if (IS_ERR(flow)) {
                        err = PTR_ERR(flow);
                        goto err_release_rule;
                }
        }

        /* Bump chain->use; undone by nft_use_dec_restore() on error. */
        if (!nft_use_inc(&chain->use)) {
                err = -EMFILE;
                goto err_release_rule;
        }

        if (info->nlh->nlmsg_flags & NLM_F_REPLACE) {
                if (nft_chain_binding(chain)) {
                        err = -EOPNOTSUPP;
                        goto err_destroy_flow_rule;
                }

                err = nft_delrule(&ctx, old_rule);
                if (err < 0)
                        goto err_destroy_flow_rule;

                trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
                if (trans == NULL) {
                        err = -ENOMEM;
                        goto err_destroy_flow_rule;
                }
                /* Link the replacement immediately before the old rule. */
                list_add_tail_rcu(&rule->list, &old_rule->list);
        } else {
                trans = nft_trans_rule_add(&ctx, NFT_MSG_NEWRULE, rule);
                if (!trans) {
                        err = -ENOMEM;
                        goto err_destroy_flow_rule;
                }

                /* NLM_F_APPEND: after the anchor rule, or at the chain tail.
                 * Otherwise: before the anchor rule, or at the chain head.
                 */
                if (info->nlh->nlmsg_flags & NLM_F_APPEND) {
                        if (old_rule)
                                list_add_rcu(&rule->list, &old_rule->list);
                        else
                                list_add_tail_rcu(&rule->list, &chain->rules);
                 } else {
                        if (old_rule)
                                list_add_tail_rcu(&rule->list, &old_rule->list);
                        else
                                list_add_rcu(&rule->list, &chain->rules);
                }
        }
        kvfree(expr_info);

        if (flow)
                nft_trans_flow_rule(trans) = flow;

        if (table->validate_state == NFT_VALIDATE_DO)
                return nft_table_validate(net, table);

        return 0;

        /* Error unwinding: each label undoes the steps taken before the
         * failing one and falls through to the labels below it.
         */
err_destroy_flow_rule:
        nft_use_dec_restore(&chain->use);
        if (flow)
                nft_flow_rule_destroy(flow);
err_release_rule:
        nft_rule_expr_deactivate(&ctx, rule, NFT_TRANS_PREPARE_ERROR);
        nf_tables_rule_destroy(&ctx, rule);
err_release_expr:
        /* Release expressions that were parsed but whose ops pointer was
         * not cleared by the instantiation loop above.
         */
        for (i = 0; i < n; i++) {
                if (expr_info[i].ops) {
                        module_put(expr_info[i].ops->type->owner);
                        if (expr_info[i].ops->type->release_ops)
                                expr_info[i].ops->type->release_ops(expr_info[i].ops);
                }
        }
        kvfree(expr_info);

        return err;
}

/* Find a rule by the id carried in @nla among the NFT_MSG_NEWRULE
 * transactions queued on the pernet commit list for @chain.
 *
 * Returns the rule, or ERR_PTR(-ENOENT) when no queued transaction matches.
 */
static struct nft_rule *nft_rule_lookup_byid(const struct net *net,
                                             const struct nft_chain *chain,
                                             const struct nlattr *nla)
{
        struct nftables_pernet *pernet = nft_pernet(net);
        u32 wanted = ntohl(nla_get_be32(nla));
        struct nft_trans *t;

        list_for_each_entry(t, &pernet->commit_list, list) {
                /* Only new-rule transactions carry rule chain/id fields. */
                if (t->msg_type != NFT_MSG_NEWRULE)
                        continue;
                if (nft_trans_rule_chain(t) != chain)
                        continue;
                if (wanted != nft_trans_rule_id(t))
                        continue;

                return nft_trans_rule(t);
        }

        return ERR_PTR(-ENOENT);
}

static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_chain *chain = NULL;
        struct net *net = info->net;
        struct nft_table *table;
        struct nft_rule *rule;
        struct nft_ctx ctx;
        int err = 0;

        table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_RULE_CHAIN]) {
                chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
                                         genmask);
                if (IS_ERR(chain)) {
                        if (PTR_ERR(chain) == -ENOENT &&
                            NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
                                return 0;

                        NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
                        return PTR_ERR(chain);
                }
                if (nft_chain_binding(chain))
                        return -EOPNOTSUPP;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla);

        if (chain) {
                if (nla[NFTA_RULE_HANDLE]) {
                        rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]);
                        if (IS_ERR(rule)) {
                                if (PTR_ERR(rule) == -ENOENT &&
                                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE)
                                        return 0;

                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]);
                                return PTR_ERR(rule);
                        }

                        err = nft_delrule(&ctx, rule);
                } else if (nla[NFTA_RULE_ID]) {
                        rule = nft_rule_lookup_byid(net, chain, nla[NFTA_RULE_ID]);
                        if (IS_ERR(rule)) {
                                NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_ID]);
                                return PTR_ERR(rule);
                        }

                        err = nft_delrule(&ctx, rule);
                } else {
                        err = nft_delrule_by_chain(&ctx);
                }
        } else {
                list_for_each_entry(chain, &table->chains, list) {
                        if (!nft_is_active_next(net, chain))
                                continue;
                        if (nft_chain_binding(chain))
                                continue;

                        ctx.chain = chain;
                        err = nft_delrule_by_chain(&ctx);
                        if (err < 0)
                                break;
                }
        }

        return err;
}

/*
 * Sets
 */

/* Set backend types that nft_select_set_ops() iterates over, in array
 * order, when choosing an implementation.  The AVX2 pipapo variant is only
 * listed on x86-64 builds that are not UML.
 */
static const struct nft_set_type *nft_set_types[] = {
        &nft_set_hash_fast_type,
        &nft_set_hash_type,
        &nft_set_rhash_type,
        &nft_set_bitmap_type,
        &nft_set_rbtree_type,
#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
        &nft_set_pipapo_avx2_type,
#endif
        &nft_set_pipapo_type,
};

/* Set flags that are matched against a backend's advertised features. */
#define NFT_SET_FEATURES        (NFT_SET_INTERVAL | NFT_SET_MAP | \
                                 NFT_SET_TIMEOUT | NFT_SET_OBJECT | \
                                 NFT_SET_EVAL)

/* Return true when every feature bit of NFT_SET_FEATURES that is set in
 * @flags is also advertised by @type, and @type advertises no other bit
 * that is set in @flags.
 */
static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags)
{
        u32 requested = flags & NFT_SET_FEATURES;
        u32 provided = flags & type->features;

        return provided == requested;
}

/*
 * Select a set implementation based on the data characteristics and the
 * given policy. The total memory use might not be known if no size is
 * given, in that case the amount of memory per element is used.
 *
 * Walks nft_set_types[] in order, skipping types rejected by
 * nft_set_ops_candidate() or whose ->estimate() returns false, and keeps the
 * candidate that compares best under desc->policy.
 *
 * Returns the chosen ops, or ERR_PTR(-EOPNOTSUPP) when no type qualifies.
 * Must be called with the pernet commit_mutex held.
 */
static const struct nft_set_ops *
nft_select_set_ops(const struct nft_ctx *ctx, u32 flags,
                   const struct nft_set_desc *desc)
{
        struct nftables_pernet *nft_net = nft_pernet(ctx->net);
        const struct nft_set_ops *ops, *bops;
        struct nft_set_estimate est, best;
        const struct nft_set_type *type;
        int i;

        lockdep_assert_held(&nft_net->commit_mutex);
        lockdep_nfnl_nft_mutex_not_held();

        /* Start from the worst possible estimate so any candidate wins. */
        bops            = NULL;
        best.size   = ~0;
        best.lookup = ~0;
        best.space  = ~0;

        for (i = 0; i < ARRAY_SIZE(nft_set_types); i++) {
                type = nft_set_types[i];
                ops = &type->ops;

                if (!nft_set_ops_candidate(type, flags))
                        continue;
                if (!ops->estimate(desc, flags, &est))
                        continue;

                /* Inside the switch, "break" accepts this candidate as the
                 * new best and "continue" moves on to the next type.
                 */
                switch (desc->policy) {
                case NFT_SET_POL_PERFORMANCE:
                        /* Lower lookup cost wins; ties go to lower space. */
                        if (est.lookup < best.lookup)
                                break;
                        if (est.lookup == best.lookup &&
                            est.space < best.space)
                                break;
                        continue;
                case NFT_SET_POL_MEMORY:
                        /* Without a size, compare space then lookup; with a
                         * size, compare the estimated total size.
                         */
                        if (!desc->size) {
                                if (est.space < best.space)
                                        break;
                                if (est.space == best.space &&
                                    est.lookup < best.lookup)
                                        break;
                        } else if (est.size < best.size || !bops) {
                                break;
                        }
                        continue;
                default:
                        /* Any other policy: the last qualifying type wins. */
                        break;
                }

                bops = ops;
                best = est;
        }

        if (bops != NULL)
                return bops;

        return ERR_PTR(-EOPNOTSUPP);
}

/* Netlink attribute policy for NFTA_SET_* attributes. */
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
        [NFTA_SET_TABLE]                = { .type = NLA_STRING,
                                            .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_SET_NAME]                        = { .type = NLA_STRING,
                                            .len = NFT_SET_MAXNAMELEN - 1 },
        [NFTA_SET_FLAGS]                = { .type = NLA_U32 },
        [NFTA_SET_KEY_TYPE]                = { .type = NLA_U32 },
        [NFTA_SET_KEY_LEN]                = { .type = NLA_U32 },
        [NFTA_SET_DATA_TYPE]                = { .type = NLA_U32 },
        [NFTA_SET_DATA_LEN]                = { .type = NLA_U32 },
        [NFTA_SET_POLICY]                = { .type = NLA_U32 },
        [NFTA_SET_DESC]                        = { .type = NLA_NESTED },
        [NFTA_SET_ID]                        = { .type = NLA_U32 },
        [NFTA_SET_TIMEOUT]                = { .type = NLA_U64 },
        [NFTA_SET_GC_INTERVAL]                = { .type = NLA_U32 },
        [NFTA_SET_USERDATA]                = { .type = NLA_BINARY,
                                            .len  = NFT_USERDATA_MAXLEN },
        [NFTA_SET_OBJ_TYPE]                = { .type = NLA_U32 },
        [NFTA_SET_HANDLE]                = { .type = NLA_U64 },
        [NFTA_SET_EXPR]                        = { .type = NLA_NESTED },
        [NFTA_SET_EXPRESSIONS]                = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};

/* Policy for one element of the NFTA_SET_DESC_CONCAT nested array. */
static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
        [NFTA_SET_FIELD_LEN]        = { .type = NLA_U32 },
};

/* Policy for the attributes nested inside NFTA_SET_DESC. */
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
        [NFTA_SET_DESC_SIZE]                = { .type = NLA_U32 },
        [NFTA_SET_DESC_CONCAT]                = NLA_POLICY_NESTED_ARRAY(nft_concat_policy),
};

/* Look up a set of @table by the name carried in @nla.
 *
 * Returns the first set whose name matches and which is active for
 * @genmask, ERR_PTR(-EINVAL) when @nla is NULL, or ERR_PTR(-ENOENT).
 */
static struct nft_set *nft_set_lookup(const struct net *net,
                                      const struct nft_table *table,
                                      const struct nlattr *nla, u8 genmask)
{
        struct nft_set *cand;

        if (!nla)
                return ERR_PTR(-EINVAL);

        list_for_each_entry_rcu(cand, &table->sets, list,
                                lockdep_commit_lock_is_held(net)) {
                if (nla_strcmp(nla, cand->name) != 0)
                        continue;
                if (!nft_active_genmask(cand, genmask))
                        continue;

                return cand;
        }

        return ERR_PTR(-ENOENT);
}

/* Look up a set of @table by the 64-bit big-endian handle carried in @nla.
 *
 * Returns the matching set that is active for @genmask, or
 * ERR_PTR(-ENOENT).
 */
static struct nft_set *nft_set_lookup_byhandle(const struct nft_table *table,
                                               const struct nlattr *nla,
                                               u8 genmask)
{
        struct nft_set *cand;

        list_for_each_entry(cand, &table->sets, list) {
                if (cand->handle != be64_to_cpu(nla_get_be64(nla)))
                        continue;
                if (!nft_active_genmask(cand, genmask))
                        continue;

                return cand;
        }

        return ERR_PTR(-ENOENT);
}

/* Look up a set by the transaction id carried in @nla among the entries of
 * the pernet commit_set_list.
 *
 * Returns the set when its id matches, it belongs to @table and it is
 * active for @genmask; otherwise ERR_PTR(-ENOENT).
 */
static struct nft_set *nft_set_lookup_byid(const struct net *net,
                                           const struct nft_table *table,
                                           const struct nlattr *nla, u8 genmask)
{
        struct nftables_pernet *pernet = nft_pernet(net);
        u32 wanted = ntohl(nla_get_be32(nla));
        struct nft_trans_set *ts;

        /* its likely the id we need is at the tail, not at start */
        list_for_each_entry_reverse(ts, &pernet->commit_set_list, list_trans_newset) {
                struct nft_set *cand = ts->set;

                if (wanted != ts->set_id)
                        continue;
                if (cand->table != table)
                        continue;
                if (!nft_active_genmask(cand, genmask))
                        continue;

                return cand;
        }

        return ERR_PTR(-ENOENT);
}

/* Look up a set by name, falling back to a lookup by transaction id.
 *
 * The id lookup is only attempted when the name lookup failed and
 * @nla_set_id is present.  Returns the set or an ERR_PTR() value.
 */
struct nft_set *nft_set_lookup_global(const struct net *net,
                                      const struct nft_table *table,
                                      const struct nlattr *nla_set_name,
                                      const struct nlattr *nla_set_id,
                                      u8 genmask)
{
        struct nft_set *found;

        found = nft_set_lookup(net, table, nla_set_name, genmask);
        if (!IS_ERR(found) || !nla_set_id)
                return found;

        return nft_set_lookup_byid(net, table, nla_set_id, genmask);
}
EXPORT_SYMBOL_GPL(nft_set_lookup_global);

/* Assign set->name from the template @name.
 *
 * When @name contains a '%', it must be a single "%d" and the template must
 * be shorter than NFT_SET_MAX_ANONLEN; the lowest number not used by a set
 * of ctx->table that is active in the next generation is substituted.
 * Without a '%' the name is copied as is.
 *
 * Returns 0 on success, -EINVAL for a bad template, -ENOMEM on allocation
 * failure, or -ENFILE when the resulting name is already used by an active
 * set of the table.
 */
static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
                                    const char *name)
{
        const struct nft_set *i;
        const char *p;
        unsigned long *inuse;
        unsigned int n = 0, min = 0;

        p = strchr(name, '%');
        if (p != NULL) {
                if (p[1] != 'd' || strchr(p + 2, '%'))
                        return -EINVAL;

                if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
                        return -EINVAL;

                /* One zeroed page serves as a bitmap covering the numbers
                 * [min, min + BITS_PER_BYTE * PAGE_SIZE).
                 */
                inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
                if (inuse == NULL)
                        return -ENOMEM;
cont:
                list_for_each_entry(i, &ctx->table->sets, list) {
                        int tmp;

                        if (!nft_is_active_next(ctx->net, i))
                                continue;
                        /* Extract the number of sets matching the template. */
                        if (!sscanf(i->name, name, &tmp))
                                continue;
                        if (tmp < min || tmp >= min + BITS_PER_BYTE * PAGE_SIZE)
                                continue;

                        set_bit(tmp - min, inuse);
                }

                n = find_first_zero_bit(inuse, BITS_PER_BYTE * PAGE_SIZE);
                if (n >= BITS_PER_BYTE * PAGE_SIZE) {
                        /* Window exhausted: slide it up and rescan. */
                        min += BITS_PER_BYTE * PAGE_SIZE;
                        memset(inuse, 0, PAGE_SIZE);
                        goto cont;
                }
                free_page((unsigned long)inuse);
        }

        set->name = kasprintf(GFP_KERNEL_ACCOUNT, name, min + n);
        if (!set->name)
                return -ENOMEM;

        /* Reject the name if an active set of the table already uses it. */
        list_for_each_entry(i, &ctx->table->sets, list) {
                if (!nft_is_active_next(ctx->net, i))
                        continue;
                if (!strcmp(set->name, i->name)) {
                        kfree(set->name);
                        set->name = NULL;
                        return -ENFILE;
                }
        }
        return 0;
}

/* Convert the big-endian 64-bit millisecond value in @nla to jiffies.
 *
 * A non-zero input never yields 0: it is rounded up to one jiffy.
 * Returns 0 and stores the value in @result, or -ERANGE when multiplying
 * the input by NSEC_PER_MSEC would not fit in 64 bits.
 */
int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result)
{
        u64 limit = div_u64((u64)(~((u64)0)), NSEC_PER_MSEC);
        u64 value = be64_to_cpu(nla_get_be64(nla));
        u64 ticks;

        if (value >= limit)
                return -ERANGE;

        /* From here on @value is expressed in nanoseconds. */
        value *= NSEC_PER_MSEC;
        ticks = nsecs_to_jiffies64(value);
        if (!ticks)
                ticks = !!value;

        *result = ticks;
        return 0;
}

/* Convert @input from jiffies to milliseconds, returned as big-endian. */
__be64 nf_jiffies64_to_msecs(u64 input)
{
        u64 msecs = jiffies64_to_msecs(input);

        return cpu_to_be64(msecs);
}

/* Emit an NFTA_SET_DESC_CONCAT nest with one NFTA_LIST_ELEM per set field,
 * each carrying that field's length as NFTA_SET_FIELD_LEN.
 *
 * Returns 0 on success or -ENOMEM when an attribute cannot be added.
 */
static int nf_tables_fill_set_concat(struct sk_buff *skb,
                                     const struct nft_set *set)
{
        struct nlattr *outer, *elem;
        int idx = 0;

        outer = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT);
        if (!outer)
                return -ENOMEM;

        while (idx < set->field_count) {
                elem = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
                if (!elem ||
                    nla_put_be32(skb, NFTA_SET_FIELD_LEN,
                                 htonl(set->field_len[idx])))
                        return -ENOMEM;

                nla_nest_end(skb, elem);
                idx++;
        }

        nla_nest_end(skb, outer);

        return 0;
}

/* Return the set size to report: ops->usize(size) when the backend
 * provides that callback, otherwise @size unchanged.
 */
static u32 nft_set_userspace_size(const struct nft_set_ops *ops, u32 size)
{
        return ops->usize ? ops->usize(size) : size;
}

/* Serialise @set into a netlink message appended to @skb.
 *
 * @skb:   destination buffer
 * @ctx:   supplies portid, seq, family, net and the owning table
 * @set:   set to describe
 * @event: plain NFT_MSG_* message type, not yet combined with the
 *         subsystem id
 * @flags: netlink flags for the message header
 *
 * For NFT_MSG_DELSET only the table name, set name and handle are emitted.
 * All other events also carry the full set description.
 *
 * Returns 0 on success, or -1 after trimming the partially built message
 * when the header or an attribute could not be added.
 */
static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
                              const struct nft_set *set, u16 event, u16 flags)
{
        u64 timeout = READ_ONCE(set->timeout);
        u32 gc_int = READ_ONCE(set->gc_int);
        u32 portid = ctx->portid;
        struct nlmsghdr *nlh;
        struct nlattr *nest;
        u32 seq = ctx->seq;
        int i;

        /* Do not overwrite @event with the combined subsystem/message type:
         * it is compared against NFT_MSG_DELSET below and would never match
         * once the subsystem id has been folded in.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, ctx->family, NFNETLINK_V0,
                           nft_base_seq(ctx->net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_SET_TABLE, ctx->table->name))
                goto nla_put_failure;
        if (nla_put_string(skb, NFTA_SET_NAME, set->name))
                goto nla_put_failure;
        if (nla_put_be64(skb, NFTA_SET_HANDLE, cpu_to_be64(set->handle),
                         NFTA_SET_PAD))
                goto nla_put_failure;

        /* Deletion notifications only identify the set.
         * NOTE(review): NFT_MSG_DESTROYSET notifications, if routed here,
         * still take the full path - consider shortening them as well.
         */
        if (event == NFT_MSG_DELSET) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (set->flags != 0)
                if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
                        goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_SET_KEY_TYPE, htonl(set->ktype)))
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_SET_KEY_LEN, htonl(set->klen)))
                goto nla_put_failure;
        if (set->flags & NFT_SET_MAP) {
                if (nla_put_be32(skb, NFTA_SET_DATA_TYPE, htonl(set->dtype)))
                        goto nla_put_failure;
                if (nla_put_be32(skb, NFTA_SET_DATA_LEN, htonl(set->dlen)))
                        goto nla_put_failure;
        }
        if (set->flags & NFT_SET_OBJECT &&
            nla_put_be32(skb, NFTA_SET_OBJ_TYPE, htonl(set->objtype)))
                goto nla_put_failure;

        /* Timeout and GC interval are only reported when non-zero. */
        if (timeout &&
            nla_put_be64(skb, NFTA_SET_TIMEOUT,
                         nf_jiffies64_to_msecs(timeout),
                         NFTA_SET_PAD))
                goto nla_put_failure;
        if (gc_int &&
            nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(gc_int)))
                goto nla_put_failure;

        /* NFT_SET_POL_PERFORMANCE is not reported explicitly. */
        if (set->policy != NFT_SET_POL_PERFORMANCE) {
                if (nla_put_be32(skb, NFTA_SET_POLICY, htonl(set->policy)))
                        goto nla_put_failure;
        }

        if (set->udata &&
            nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_SET_DESC);
        if (!nest)
                goto nla_put_failure;
        if (set->size &&
            nla_put_be32(skb, NFTA_SET_DESC_SIZE,
                         htonl(nft_set_userspace_size(set->ops, set->size))))
                goto nla_put_failure;

        if (set->field_count > 1 &&
            nf_tables_fill_set_concat(skb, set))
                goto nla_put_failure;

        nla_nest_end(skb, nest);

        /* A single expression is dumped as NFTA_SET_EXPR, several as a
         * NFTA_SET_EXPRESSIONS list.
         */
        if (set->num_exprs == 1) {
                nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR);
                /* nla_nest_end() must not be handed a NULL nest. */
                if (!nest)
                        goto nla_put_failure;
                if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0)
                        goto nla_put_failure;

                nla_nest_end(skb, nest);
        } else if (set->num_exprs > 1) {
                nest = nla_nest_start_noflag(skb, NFTA_SET_EXPRESSIONS);
                if (nest == NULL)
                        goto nla_put_failure;

                for (i = 0; i < set->num_exprs; i++) {
                        if (nft_expr_dump(skb, NFTA_LIST_ELEM,
                                          set->exprs[i], false) < 0)
                                goto nla_put_failure;
                }
                nla_nest_end(skb, nest);
        }

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/* Build a notification for @set and queue it on the pernet notify list.
 *
 * Nothing is done unless ctx->report is set or NFNLGRP_NFTABLES has
 * listeners.  If the message cannot be allocated or filled, -ENOBUFS is
 * signalled through nfnetlink_set_err().
 */
static void nf_tables_set_notify(const struct nft_ctx *ctx,
                                 const struct nft_set *set, int event,
                                 gfp_t gfp_flags)
{
        struct nftables_pernet *pernet = nft_pernet(ctx->net);
        u32 portid = ctx->portid;
        struct sk_buff *msg;
        u16 nlflags;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        msg = nlmsg_new(NLMSG_GOODSIZE, gfp_flags);
        if (msg != NULL) {
                /* Only NLM_F_CREATE and NLM_F_EXCL are carried over. */
                nlflags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

                if (nf_tables_fill_set(msg, ctx, set, event, nlflags) >= 0) {
                        nft_notify_enqueue(msg, ctx->report,
                                           &pernet->notify_list);
                        return;
                }
                kfree_skb(msg);
        }

        nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/* Netlink dump callback emitting one NFT_MSG_NEWSET message per active set.
 *
 * Resume state is kept in cb->args:
 *   args[0] - index of the set to resume at within the current table
 *   args[1] - non-zero once the dump has completed
 *   args[2] - pointer to the table being dumped when @skb filled up
 *
 * cb->data holds a struct nft_ctx whose family and table fields filter the
 * tables visited.  Returns skb->len.
 */
static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nft_set *set;
        unsigned int idx, s_idx = cb->args[0];
        struct nft_table *table, *cur_table = (struct nft_table *)cb->args[2];
        struct net *net = sock_net(skb->sk);
        struct nft_ctx *ctx = cb->data, ctx_set;
        struct nftables_pernet *nft_net;

        if (cb->args[1])
                return skb->len;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (ctx->family != NFPROTO_UNSPEC &&
                    ctx->family != table->family)
                        continue;

                if (ctx->table && ctx->table != table)
                        continue;

                /* When resuming, skip tables until the saved one is found. */
                if (cur_table) {
                        if (cur_table != table)
                                continue;

                        cur_table = NULL;
                }
                idx = 0;
                list_for_each_entry_rcu(set, &table->sets, list) {
                        if (idx < s_idx)
                                goto cont;
                        if (!nft_is_active(net, set))
                                goto cont;

                        /* Per-set copy of the context pointing at this table. */
                        ctx_set = *ctx;
                        ctx_set.table = table;
                        ctx_set.family = table->family;

                        if (nf_tables_fill_set(skb, &ctx_set, set,
                                               NFT_MSG_NEWSET,
                                               NLM_F_MULTI) < 0) {
                                /* Fill failed: remember where to resume. */
                                cb->args[0] = idx;
                                cb->args[2] = (unsigned long) table;
                                goto done;
                        }
                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                        idx++;
                }
                /* The resume index only applies to the first table dumped. */
                if (s_idx)
                        s_idx = 0;
        }
        cb->args[1] = 1;
done:
        rcu_read_unlock();
        return skb->len;
}

/*
 * Dump ->start callback for set dumps: swap cb->data for a private heap
 * copy of the struct nft_ctx it currently points to.
 *
 * Returns 0 on success, -ENOMEM if the copy cannot be allocated.
 */
static int nf_tables_dump_sets_start(struct netlink_callback *cb)
{
        struct nft_ctx *copy;

        copy = kmemdup(cb->data, sizeof(*copy), GFP_ATOMIC);
        if (!copy)
                return -ENOMEM;

        cb->data = copy;

        return 0;
}

/* Dump ->done callback for set dumps: free whatever cb->data points to. */
static int nf_tables_dump_sets_done(struct netlink_callback *cb)
{
        struct nft_ctx *copy = cb->data;

        kfree(copy);

        return 0;
}

/*
 * Handle a "get set" request: either start a multi-part dump (NLM_F_DUMP)
 * or look up one set by name and unicast its description to the sender.
 *
 * Returns 0 / the dump or unicast result on success, a negative errno
 * otherwise.
 *
 * called with rcu_read_lock held
 */
static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        const struct nlattr *tbl_attr = nla[NFTA_SET_TABLE];
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct nft_table *table = NULL;
        struct net *net = info->net;
        const struct nft_set *set;
        struct sk_buff *reply;
        struct nft_ctx ctx;
        int ret;

        /* The table attribute is optional at this point; the non-dump
         * path below insists on it.
         */
        if (tbl_attr) {
                table = nft_table_lookup(net, tbl_attr, family, genmask, 0);
                if (IS_ERR(table)) {
                        NL_SET_BAD_ATTR(extack, tbl_attr);
                        return PTR_ERR(table);
                }
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                /* ctx lives on this stack frame; the ->start callback
                 * duplicates it before the dump proceeds.
                 */
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_sets_start,
                        .dump = nf_tables_dump_sets,
                        .done = nf_tables_dump_sets_done,
                        .data = &ctx,
                        .module = THIS_MODULE,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        /* Only accept unspec with dump */
        if (family == NFPROTO_UNSPEC)
                return -EAFNOSUPPORT;
        if (!tbl_attr)
                return -EINVAL;

        set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                return PTR_ERR(set);
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        ret = nf_tables_fill_set(reply, &ctx, set, NFT_MSG_NEWSET, 0);
        if (ret < 0) {
                /* Nothing was sent: drop the half-built reply. */
                kfree_skb(reply);
                return ret;
        }

        return nfnetlink_unicast(reply, net, NETLINK_CB(skb).portid);
}

/*
 * Parse one concatenation field description and append its length to
 * desc->field_len[].
 *
 * Returns 0 on success, -E2BIG if the field array is already full,
 * -EINVAL if the length attribute is missing, zero or larger than U8_MAX,
 * or the error reported by the nested attribute parser.
 */
static int nft_set_desc_concat_parse(const struct nlattr *attr,
                                     struct nft_set_desc *desc)
{
        struct nlattr *tb[NFTA_SET_FIELD_MAX + 1];
        u32 flen;
        int ret;

        /* Bounds check comes first so the store below can never overrun. */
        if (desc->field_count >= ARRAY_SIZE(desc->field_len))
                return -E2BIG;

        ret = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
                                          nft_concat_policy, NULL);
        if (ret < 0)
                return ret;

        if (tb[NFTA_SET_FIELD_LEN] == NULL)
                return -EINVAL;

        flen = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
        if (flen == 0 || flen > U8_MAX)
                return -EINVAL;

        desc->field_len[desc->field_count] = flen;
        desc->field_count++;

        return 0;
}

/*
 * Parse the list of concatenation fields in @nla into @desc and check that
 * the field lengths, each rounded up to a multiple of sizeof(u32), add up
 * to desc->klen.
 *
 * Returns 0 on success, -EINVAL on a malformed list or length mismatch,
 * -E2BIG if the key needs more than NFT_REG32_COUNT 32-bit registers, or
 * the error from parsing an individual field.
 */
static int nft_set_desc_concat(struct nft_set_desc *desc,
                               const struct nlattr *nla)
{
        int remaining, ret, n;
        struct nlattr *field;
        u32 total = 0, regs;

        nla_for_each_nested(field, nla, remaining) {
                if (nla_type(field) != NFTA_LIST_ELEM)
                        return -EINVAL;

                ret = nft_set_desc_concat_parse(field, desc);
                if (ret < 0)
                        return ret;
        }

        for (n = 0; n < desc->field_count; n++)
                total += round_up(desc->field_len[n], sizeof(u32));

        if (total != desc->klen)
                return -EINVAL;

        regs = DIV_ROUND_UP(desc->klen, sizeof(u32));
        if (regs > NFT_REG32_COUNT)
                return -E2BIG;

        return 0;
}

/*
 * Parse the nested set description attribute: an optional size and an
 * optional list of concatenation fields.
 *
 * Returns the nested-parse result when no concatenation is present,
 * otherwise the result of nft_set_desc_concat().
 */
static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
                                    const struct nlattr *nla)
{
        struct nlattr *da[NFTA_SET_DESC_MAX + 1];
        int ret;

        ret = nla_parse_nested_deprecated(da, NFTA_SET_DESC_MAX, nla,
                                          nft_set_desc_policy, NULL);
        if (ret < 0)
                return ret;

        if (da[NFTA_SET_DESC_SIZE])
                desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));

        if (!da[NFTA_SET_DESC_CONCAT])
                return ret;

        return nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]);
}

/*
 * Instantiate the expression(s) attached to a set declaration.
 *
 * NFTA_SET_EXPR yields a single expression stored in exprs[0];
 * otherwise NFTA_SET_EXPRESSIONS yields up to NFT_SET_EXPR_MAX expressions
 * and requires NFT_SET_EXPR in @flags. *num_exprs is incremented once per
 * stored expression.
 *
 * Returns 0 on success (including when neither attribute is present).
 * On failure, exprs[0 .. *num_exprs - 1] are destroyed and a negative
 * errno is returned.
 */
static int nft_set_expr_alloc(struct nft_ctx *ctx, struct nft_set *set,
                              const struct nlattr * const *nla,
                              struct nft_expr **exprs, int *num_exprs,
                              u32 flags)
{
        struct nft_expr *expr;
        struct nlattr *item;
        int ret, rem, n;

        if (nla[NFTA_SET_EXPR]) {
                expr = nft_set_elem_expr_alloc(ctx, set, nla[NFTA_SET_EXPR]);
                if (IS_ERR(expr)) {
                        ret = PTR_ERR(expr);
                        goto undo;
                }
                exprs[0] = expr;
                *num_exprs += 1;

                return 0;
        }

        if (!nla[NFTA_SET_EXPRESSIONS])
                return 0;

        if (!(flags & NFT_SET_EXPR)) {
                ret = -EINVAL;
                goto undo;
        }

        n = 0;
        nla_for_each_nested(item, nla[NFTA_SET_EXPRESSIONS], rem) {
                if (n == NFT_SET_EXPR_MAX) {
                        ret = -E2BIG;
                        goto undo;
                }
                if (nla_type(item) != NFTA_LIST_ELEM) {
                        ret = -EINVAL;
                        goto undo;
                }
                expr = nft_set_elem_expr_alloc(ctx, set, item);
                if (IS_ERR(expr)) {
                        ret = PTR_ERR(expr);
                        goto undo;
                }
                exprs[n] = expr;
                n++;
                *num_exprs += 1;
        }

        return 0;

undo:
        /* Release everything stored so far, as counted by *num_exprs. */
        for (n = 0; n < *num_exprs; n++)
                nft_expr_destroy(ctx, exprs[n]);

        return ret;
}

/*
 * Tell whether an existing @set matches a new declaration described by
 * @desc, @exprs/@num_exprs and @flags.
 *
 * Key/data type and length, flags, field layout and expression count must
 * be equal; expressions are compared by their ops pointer only.
 */
static bool nft_set_is_same(const struct nft_set *set,
                            const struct nft_set_desc *desc,
                            struct nft_expr *exprs[], u32 num_exprs, u32 flags)
{
        int n;

        if (set->ktype != desc->ktype || set->klen != desc->klen)
                return false;
        if (set->dtype != desc->dtype || set->dlen != desc->dlen)
                return false;
        if (set->flags != flags)
                return false;
        if (set->field_count != desc->field_count)
                return false;
        if (set->num_exprs != num_exprs)
                return false;

        for (n = 0; n < desc->field_count; n++)
                if (set->field_len[n] != desc->field_len[n])
                        return false;

        for (n = 0; n < num_exprs; n++)
                if (set->exprs[n]->ops != exprs[n]->ops)
                        return false;

        return true;
}

/*
 * Translate the requested set size through the backend's optional
 * ->ksize() hook; without a hook the size is returned unchanged.
 */
static u32 nft_set_kernel_size(const struct nft_set_ops *ops,
                               const struct nft_set_desc *desc)
{
        return ops->ksize ? ops->ksize(desc->size) : desc->size;
}

/*
 * Handle a "new set" request.
 *
 * The netlink attributes are validated and collected into a local
 * struct nft_set_desc. If a set with the given name already exists in the
 * table, the request is only accepted when it describes the same set
 * (see nft_set_is_same()); otherwise, with NLM_F_CREATE, a new set is
 * allocated, initialised through the selected backend ops, queued as a
 * transaction and linked into the table's set list.
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_set_ops *ops;
        struct net *net = info->net;
        struct nft_set_desc desc;
        struct nft_table *table;
        unsigned char *udata;
        struct nft_set *set;
        struct nft_ctx ctx;
        size_t alloc_size;
        int num_exprs = 0;
        char *name;
        int err, i;
        u16 udlen;
        u32 flags;
        u64 size;

        /* Table, name, key length and set ID are mandatory. */
        if (nla[NFTA_SET_TABLE] == NULL ||
            nla[NFTA_SET_NAME] == NULL ||
            nla[NFTA_SET_KEY_LEN] == NULL ||
            nla[NFTA_SET_ID] == NULL)
                return -EINVAL;

        memset(&desc, 0, sizeof(desc));

        /* Key type defaults to NFT_DATA_VALUE when not given. */
        desc.ktype = NFT_DATA_VALUE;
        if (nla[NFTA_SET_KEY_TYPE] != NULL) {
                desc.ktype = ntohl(nla_get_be32(nla[NFTA_SET_KEY_TYPE]));
                if ((desc.ktype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK)
                        return -EINVAL;
        }

        desc.klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));
        if (desc.klen == 0 || desc.klen > NFT_DATA_VALUE_MAXLEN)
                return -EINVAL;

        /* Reject unknown flag bits and the flag combinations listed below. */
        flags = 0;
        if (nla[NFTA_SET_FLAGS] != NULL) {
                flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS]));
                if (flags & ~(NFT_SET_ANONYMOUS | NFT_SET_CONSTANT |
                              NFT_SET_INTERVAL | NFT_SET_TIMEOUT |
                              NFT_SET_MAP | NFT_SET_EVAL |
                              NFT_SET_OBJECT | NFT_SET_CONCAT | NFT_SET_EXPR))
                        return -EOPNOTSUPP;
                /* Only one of these operations is supported */
                if ((flags & (NFT_SET_MAP | NFT_SET_OBJECT)) ==
                             (NFT_SET_MAP | NFT_SET_OBJECT))
                        return -EOPNOTSUPP;
                if ((flags & (NFT_SET_EVAL | NFT_SET_OBJECT)) ==
                             (NFT_SET_EVAL | NFT_SET_OBJECT))
                        return -EOPNOTSUPP;
                /* Anonymous + timeout is only accepted together with EVAL. */
                if ((flags & (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT | NFT_SET_EVAL)) ==
                             (NFT_SET_ANONYMOUS | NFT_SET_TIMEOUT))
                        return -EOPNOTSUPP;
                if ((flags & (NFT_SET_CONSTANT | NFT_SET_TIMEOUT)) ==
                             (NFT_SET_CONSTANT | NFT_SET_TIMEOUT))
                        return -EOPNOTSUPP;
        }

        /* A data type is required for, and only allowed with, NFT_SET_MAP. */
        desc.dtype = 0;
        if (nla[NFTA_SET_DATA_TYPE] != NULL) {
                if (!(flags & NFT_SET_MAP))
                        return -EINVAL;

                desc.dtype = ntohl(nla_get_be32(nla[NFTA_SET_DATA_TYPE]));
                if ((desc.dtype & NFT_DATA_RESERVED_MASK) == NFT_DATA_RESERVED_MASK &&
                    desc.dtype != NFT_DATA_VERDICT)
                        return -EINVAL;

                /* Verdict maps have a fixed data length; others must say. */
                if (desc.dtype != NFT_DATA_VERDICT) {
                        if (nla[NFTA_SET_DATA_LEN] == NULL)
                                return -EINVAL;
                        desc.dlen = ntohl(nla_get_be32(nla[NFTA_SET_DATA_LEN]));
                        if (desc.dlen == 0 || desc.dlen > NFT_DATA_VALUE_MAXLEN)
                                return -EINVAL;
                } else
                        desc.dlen = sizeof(struct nft_verdict);
        } else if (flags & NFT_SET_MAP)
                return -EINVAL;

        /* An object type is required for, and only allowed with,
         * NFT_SET_OBJECT.
         */
        if (nla[NFTA_SET_OBJ_TYPE] != NULL) {
                if (!(flags & NFT_SET_OBJECT))
                        return -EINVAL;

                desc.objtype = ntohl(nla_get_be32(nla[NFTA_SET_OBJ_TYPE]));
                if (desc.objtype == NFT_OBJECT_UNSPEC ||
                    desc.objtype > NFT_OBJECT_MAX)
                        return -EOPNOTSUPP;
        } else if (flags & NFT_SET_OBJECT)
                return -EINVAL;
        else
                desc.objtype = NFT_OBJECT_UNSPEC;

        /* Timeout and GC interval need NFT_SET_TIMEOUT and are refused
         * for anonymous sets.
         */
        desc.timeout = 0;
        if (nla[NFTA_SET_TIMEOUT] != NULL) {
                if (!(flags & NFT_SET_TIMEOUT))
                        return -EINVAL;

                if (flags & NFT_SET_ANONYMOUS)
                        return -EOPNOTSUPP;

                err = nf_msecs_to_jiffies64(nla[NFTA_SET_TIMEOUT], &desc.timeout);
                if (err)
                        return err;
        }
        desc.gc_int = 0;
        if (nla[NFTA_SET_GC_INTERVAL] != NULL) {
                if (!(flags & NFT_SET_TIMEOUT))
                        return -EINVAL;

                if (flags & NFT_SET_ANONYMOUS)
                        return -EOPNOTSUPP;

                desc.gc_int = ntohl(nla_get_be32(nla[NFTA_SET_GC_INTERVAL]));
        }

        /* Policy defaults to NFT_SET_POL_PERFORMANCE. */
        desc.policy = NFT_SET_POL_PERFORMANCE;
        if (nla[NFTA_SET_POLICY] != NULL) {
                desc.policy = ntohl(nla_get_be32(nla[NFTA_SET_POLICY]));
                switch (desc.policy) {
                case NFT_SET_POL_PERFORMANCE:
                case NFT_SET_POL_MEMORY:
                        break;
                default:
                        return -EOPNOTSUPP;
                }
        }

        /* NFT_SET_CONCAT must be set exactly when the description carries
         * more than one field.
         */
        if (nla[NFTA_SET_DESC] != NULL) {
                err = nf_tables_set_desc_parse(&desc, nla[NFTA_SET_DESC]);
                if (err < 0)
                        return err;

                if (desc.field_count > 1) {
                        if (!(flags & NFT_SET_CONCAT))
                                return -EINVAL;
                } else if (flags & NFT_SET_CONCAT) {
                        return -EINVAL;
                }
        } else if (flags & NFT_SET_CONCAT) {
                return -EINVAL;
        }

        if (nla[NFTA_SET_EXPR] || nla[NFTA_SET_EXPRESSIONS])
                desc.expr = true;

        table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
                return PTR_ERR(table);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask);
        if (IS_ERR(set)) {
                /* -ENOENT just means "create it"; anything else is fatal. */
                if (PTR_ERR(set) != -ENOENT) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                        return PTR_ERR(set);
                }
        } else {
                /* The set already exists: accept only an identical
                 * re-declaration.
                 */
                struct nft_expr *exprs[NFT_SET_EXPR_MAX] = {};

                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                if (nft_set_is_anonymous(set))
                        return -EOPNOTSUPP;

                /* Expressions are instantiated only to be compared against
                 * the existing set; they are destroyed again below.
                 */
                err = nft_set_expr_alloc(&ctx, set, nla, exprs, &num_exprs, flags);
                if (err < 0)
                        return err;

                if (desc.size)
                        desc.size = nft_set_kernel_size(set->ops, &desc);

                err = 0;
                if (!nft_set_is_same(set, &desc, exprs, num_exprs, flags)) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]);
                        err = -EEXIST;
                }

                for (i = 0; i < num_exprs; i++)
                        nft_expr_destroy(&ctx, exprs[i]);

                if (err < 0)
                        return err;

                return __nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set, &desc);
        }

        /* From here on a brand new set is created. */
        if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                return -ENOENT;

        ops = nft_select_set_ops(&ctx, flags, &desc);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        if (desc.size)
                desc.size = nft_set_kernel_size(ops, &desc);

        udlen = 0;
        if (nla[NFTA_SET_USERDATA])
                udlen = nla_len(nla[NFTA_SET_USERDATA]);

        /* One allocation holds the set header, the backend's private area
         * and the user data; guard against wrap-around and oversize.
         */
        size = 0;
        if (ops->privsize != NULL)
                size = ops->privsize(nla, &desc);
        alloc_size = sizeof(*set) + size + udlen;
        if (alloc_size < size || alloc_size > INT_MAX)
                return -ENOMEM;

        /* Table use count is taken before allocating; err_alloc undoes it. */
        if (!nft_use_inc(&table->use))
                return -EMFILE;

        set = kvzalloc(alloc_size, GFP_KERNEL_ACCOUNT);
        if (!set) {
                err = -ENOMEM;
                goto err_alloc;
        }

        name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL_ACCOUNT);
        if (!name) {
                err = -ENOMEM;
                goto err_set_name;
        }

        /* The temporary copy is freed right away; presumably
         * nf_tables_set_alloc_name() stores its own string in set->name
         * (err_set_init frees set->name) -- confirm against that helper.
         */
        err = nf_tables_set_alloc_name(&ctx, set, name);
        kfree(name);
        if (err < 0)
                goto err_set_name;

        /* User data is stored right after the backend's private area. */
        udata = NULL;
        if (udlen) {
                udata = set->data + size;
                nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen);
        }

        INIT_LIST_HEAD(&set->bindings);
        INIT_LIST_HEAD(&set->catchall_list);
        refcount_set(&set->refs, 1);
        set->table = table;
        write_pnet(&set->net, net);
        set->ops = ops;
        set->ktype = desc.ktype;
        set->klen = desc.klen;
        set->dtype = desc.dtype;
        set->objtype = desc.objtype;
        set->dlen = desc.dlen;
        set->flags = flags;
        set->size = desc.size;
        set->policy = desc.policy;
        set->udlen = udlen;
        set->udata = udata;
        set->timeout = desc.timeout;
        set->gc_int = desc.gc_int;

        set->field_count = desc.field_count;
        for (i = 0; i < desc.field_count; i++)
                set->field_len[i] = desc.field_len[i];

        err = ops->init(set, &desc, nla);
        if (err < 0)
                goto err_set_init;

        /* On failure nft_set_expr_alloc() has already destroyed whatever it
         * created, hence the jump past the expression cleanup.
         */
        err = nft_set_expr_alloc(&ctx, set, nla, set->exprs, &num_exprs, flags);
        if (err < 0)
                goto err_set_destroy;

        set->num_exprs = num_exprs;
        set->handle = nf_tables_alloc_handle(table);
        INIT_LIST_HEAD(&set->pending_update);

        err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set);
        if (err < 0)
                goto err_set_expr_alloc;

        list_add_tail_rcu(&set->list, &table->sets);

        return 0;

        /* Unwind in reverse order of construction. */
err_set_expr_alloc:
        for (i = 0; i < set->num_exprs; i++)
                nft_expr_destroy(&ctx, set->exprs[i]);
err_set_destroy:
        ops->destroy(&ctx, set);
err_set_init:
        kfree(set->name);
err_set_name:
        kvfree(set);
err_alloc:
        nft_use_dec_restore(&table->use);

        return err;
}

/*
 * Empty the set's catch-all list: unlink each entry, destroy the element
 * it carries and free the entry itself via kfree_rcu().
 */
static void nft_set_catchall_destroy(const struct nft_ctx *ctx,
                                     struct nft_set *set)
{
        struct nft_set_elem_catchall *entry, *tmp;

        /* _safe variant: the current entry is unlinked inside the loop. */
        list_for_each_entry_safe(entry, tmp, &set->catchall_list, list) {
                list_del_rcu(&entry->list);
                nf_tables_set_elem_destroy(ctx, set, entry->elem);
                kfree_rcu(entry, rcu);
        }
}

/* Drop one reference on @set; free its name and storage on the last put. */
static void nft_set_put(struct nft_set *set)
{
        if (!refcount_dec_and_test(&set->refs))
                return;

        kfree(set->name);
        kvfree(set);
}

/*
 * Tear down @set: destroy its expressions, run the backend ->destroy()
 * hook, release the catch-all elements and drop the set reference.
 *
 * Warns and does nothing if the set still has users (set->use > 0).
 */
static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set)
{
        int n = 0;

        if (WARN_ON(set->use > 0))
                return;

        while (n < set->num_exprs) {
                nft_expr_destroy(ctx, set->exprs[n]);
                n++;
        }

        set->ops->destroy(ctx, set);
        nft_set_catchall_destroy(ctx, set);
        nft_set_put(set);
}

static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_ctx ctx;

        if (info->nfmsg->nfgen_family == NFPROTO_UNSPEC)
                return -EAFNOSUPPORT;

        table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
                return PTR_ERR(table);
        }

        if (nla[NFTA_SET_HANDLE]) {
                attr = nla[NFTA_SET_HANDLE];
                set = nft_set_lookup_byhandle(table, attr, genmask);
        } else {
                attr = nla[NFTA_SET_NAME];
                set = nft_set_lookup(net, table, attr, genmask);
        }

        if (IS_ERR(set)) {
                if (PTR_ERR(set) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSET)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return PTR_ERR(set);
        }
        if (set->use ||
            (info->nlh->nlmsg_flags & NLM_F_NONREC &&
             atomic_read(&set->nelems) > 0)) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        return nft_delset(&ctx, set);
}

/* Forward declaration: used by nft_setelem_data_validate() below. */
static int nft_validate_register_store(const struct nft_ctx *ctx,
                                       enum nft_registers reg,
                                       const struct nft_data *data,
                                       enum nft_data_types type,
                                       unsigned int len);

static int nft_setelem_data_validate(const struct nft_ctx *ctx,
                                     struct nft_set *set,
                                     struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        enum nft_registers dreg;

        dreg = nft_type_to_reg(set->dtype);
        return nft_validate_register_store(ctx, dreg, nft_set_ext_data(ext),
                                           set->dtype == NFT_DATA_VERDICT ?
                                           NFT_DATA_VERDICT : NFT_DATA_VALUE,
                                           set->dlen);
}

/*
 * Set walk callback used while binding a map: validate the data of every
 * element that is active in the iterator's generation, skip the rest.
 */
static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
                                        struct nft_set *set,
                                        const struct nft_set_iter *iter,
                                        struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_elem_active(ext, iter->genmask))
                return nft_setelem_data_validate(ctx, set, elem_priv);

        return 0;
}

/*
 * Validate the data of the set's catch-all elements that are active in
 * the next generation.
 *
 * Returns the first negative validation result, otherwise the result of
 * the last validation performed (0 if none was).
 */
static int nft_set_catchall_bind_check(const struct nft_ctx *ctx,
                                       struct nft_set *set)
{
        struct nft_set_elem_catchall *entry;
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_ext *ext;
        int ret = 0;

        list_for_each_entry_rcu(entry, &set->catchall_list, list,
                                lockdep_commit_lock_is_held(ctx->net)) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_set_elem_active(ext, genmask)) {
                        ret = nft_setelem_data_validate(ctx, set, entry->elem);
                        if (ret < 0)
                                return ret;
                }
        }

        return ret;
}

/*
 * Bind @set to the chain in @ctx through @binding.
 *
 * An anonymous set that already has a binding cannot be bound again
 * (-EBUSY). For map bindings, every active element (including catch-all
 * elements) is validated first, unless an existing map binding already
 * refers to the same chain.
 *
 * Returns 0 on success, -EBUSY, -EMFILE if the set's use counter cannot
 * be incremented, or the element validation error.
 */
int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set,
                       struct nft_set_binding *binding)
{
        struct nft_set_binding *i;
        struct nft_set_iter iter;

        if (!list_empty(&set->bindings) && nft_set_is_anonymous(set))
                return -EBUSY;

        if (binding->flags & NFT_SET_MAP) {
                /* If the set is already bound to the same chain all
                 * jumps are already validated for that chain.
                 */
                /* NOTE(review): binding->chain is compared here but only
                 * assigned after the "bind" label below -- presumably the
                 * caller has it set already; confirm.
                 */
                list_for_each_entry(i, &set->bindings, list) {
                        if (i->flags & NFT_SET_MAP &&
                            i->chain == binding->chain)
                                goto bind;
                }

                /* Only the fields assigned here are initialised; the rest
                 * of the on-stack iterator is left as is.
                 */
                iter.genmask        = nft_genmask_next(ctx->net);
                iter.type        = NFT_ITER_UPDATE;
                iter.skip         = 0;
                iter.count        = 0;
                iter.err        = 0;
                iter.fn                = nf_tables_bind_check_setelem;

                set->ops->walk(ctx, set, &iter);
                /* Catch-all elements are checked separately from the walk. */
                if (!iter.err)
                        iter.err = nft_set_catchall_bind_check(ctx, set);

                if (iter.err < 0)
                        return iter.err;
        }
bind:
        if (!nft_use_inc(&set->use))
                return -EMFILE;

        binding->chain = ctx->chain;
        list_add_tail_rcu(&binding->list, &set->bindings);
        nft_set_trans_bind(ctx, set);

        return 0;
}
EXPORT_SYMBOL_GPL(nf_tables_bind_set);

/*
 * Remove @binding from @set. When that was the last binding of an
 * anonymous set, the set is also unlinked from its list and marked dead;
 * a NFT_MSG_DELSET notification is sent if @event is true.
 */
static void nf_tables_unbind_set(const struct nft_ctx *ctx, struct nft_set *set,
                                 struct nft_set_binding *binding, bool event)
{
        list_del_rcu(&binding->list);

        /* Named sets, and sets with bindings left, stay in place. */
        if (!list_empty(&set->bindings) || !nft_set_is_anonymous(set))
                return;

        list_del_rcu(&set->list);
        set->dead = 1;

        if (!event)
                return;

        nf_tables_set_notify(ctx, set, NFT_MSG_DELSET, GFP_KERNEL);
}

/* Forward declaration: used by the map (re)activation helpers below. */
static void nft_setelem_data_activate(const struct net *net,
                                      const struct nft_set *set,
                                      struct nft_elem_priv *elem_priv);

/*
 * Set walk callback that re-activates one map element: clear its
 * next-generation bit and activate its data. Elements already active in
 * the iterator's generation are left untouched. Always returns 0.
 */
static int nft_mapelem_activate(const struct nft_ctx *ctx,
                                struct nft_set *set,
                                const struct nft_set_iter *iter,
                                struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        /* called from abort path, reverse check to undo changes. */
        if (!nft_set_elem_active(ext, iter->genmask)) {
                nft_clear(ctx->net, ext);
                nft_setelem_data_activate(ctx->net, set, elem_priv);
        }

        return 0;
}

/*
 * Re-activate the catch-all element of a map, the catch-all counterpart
 * of nft_mapelem_activate().
 *
 * The first catch-all element that is NOT active in the next generation
 * gets its generation bit cleared and its data activated; the loop stops
 * after that one element.
 */
static void nft_map_catchall_activate(const struct nft_ctx *ctx,
                                      struct nft_set *set)
{
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_elem_catchall *catchall;
        struct nft_set_ext *ext;

        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                /* Reverse check, as in nft_mapelem_activate(): elements
                 * that are already active need no undo. Skipping the
                 * inactive ones instead would leave the deactivated
                 * element untouched and activate the data of an already
                 * active element a second time.
                 */
                if (nft_set_elem_active(ext, genmask))
                        continue;

                nft_clear(ctx->net, ext);
                nft_setelem_data_activate(ctx->net, set, catchall->elem);
                break;
        }
}

static void nft_map_activate(const struct nft_ctx *ctx, struct nft_set *set)
{
        struct nft_set_iter iter = {
                .genmask        = nft_genmask_next(ctx->net),
                .type                = NFT_ITER_UPDATE,
                .fn                = nft_mapelem_activate,
        };

        set->ops->walk(ctx, set, &iter);
        WARN_ON_ONCE(iter.err);

        nft_map_catchall_activate(ctx, set);
}

void nf_tables_activate_set(const struct nft_ctx *ctx, struct nft_set *set)
{
        if (nft_set_is_anonymous(set)) {
                if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                        nft_map_activate(ctx, set);

                nft_clear(ctx->net, set);
        }

        nft_use_inc_restore(&set->use);
}
EXPORT_SYMBOL_GPL(nf_tables_activate_set);

/*
 * Release the reference that @binding holds on @set, according to the
 * transaction @phase:
 *
 *  - NFT_TRANS_PREPARE_ERROR: undo the transaction binding, then either
 *    deactivate an anonymous set or unlink the binding of a named one,
 *    and drop the use count.
 *  - NFT_TRANS_PREPARE: deactivate an anonymous set (and its elements if
 *    it is a map or object map) and drop the use count; the binding stays.
 *  - NFT_TRANS_ABORT / NFT_TRANS_RELEASE: deactivate the elements of an
 *    anonymous map, drop the use count, then unbind.
 *  - any other phase: unbind only; a notification is sent when the phase
 *    is NFT_TRANS_COMMIT.
 */
void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set,
                              struct nft_set_binding *binding,
                              enum nft_trans_phase phase)
{
        /* NOTE(review): the return value is discarded, so as written this
         * statement checks nothing -- confirm whether a
         * WARN_ON_ONCE(!...) was intended.
         */
        lockdep_commit_lock_is_held(ctx->net);

        switch (phase) {
        case NFT_TRANS_PREPARE_ERROR:
                nft_set_trans_unbind(ctx, set);
                if (nft_set_is_anonymous(set))
                        nft_deactivate_next(ctx->net, set);
                else
                        list_del_rcu(&binding->list);

                nft_use_dec(&set->use);
                break;
        case NFT_TRANS_PREPARE:
                if (nft_set_is_anonymous(set)) {
                        if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                                nft_map_deactivate(ctx, set);

                        nft_deactivate_next(ctx->net, set);
                }
                nft_use_dec(&set->use);
                /* Returns without touching the binding list. */
                return;
        case NFT_TRANS_ABORT:
        case NFT_TRANS_RELEASE:
                if (nft_set_is_anonymous(set) &&
                    set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                        nft_map_deactivate(ctx, set);

                nft_use_dec(&set->use);
                fallthrough;
        default:
                nf_tables_unbind_set(ctx, set, binding,
                                     phase == NFT_TRANS_COMMIT);
        }
}
EXPORT_SYMBOL_GPL(nf_tables_deactivate_set);

/* Destroy @set, but only if it is anonymous and has no bindings left. */
void nf_tables_destroy_set(const struct nft_ctx *ctx, struct nft_set *set)
{
        if (!list_empty(&set->bindings))
                return;
        if (!nft_set_is_anonymous(set))
                return;

        nft_set_destroy(ctx, set);
}
EXPORT_SYMBOL_GPL(nf_tables_destroy_set);

/*
 * Per-extension-type layout table, indexed by NFT_SET_EXT_*: the length
 * (.len) and required alignment (.align) of each set element extension.
 *
 * Entries that give no .len leave it zero-initialised; presumably those
 * extensions are variable-length and sized elsewhere -- confirm against
 * the users of this table.
 */
const struct nft_set_ext_type nft_set_ext_types[] = {
        [NFT_SET_EXT_KEY]                = {
                .align        = __alignof__(u32),
        },
        [NFT_SET_EXT_DATA]                = {
                .align        = __alignof__(u32),
        },
        [NFT_SET_EXT_EXPRESSIONS]        = {
                .align        = __alignof__(struct nft_set_elem_expr),
        },
        [NFT_SET_EXT_OBJREF]                = {
                .len        = sizeof(struct nft_object *),
                .align        = __alignof__(struct nft_object *),
        },
        [NFT_SET_EXT_FLAGS]                = {
                .len        = sizeof(u8),
                .align        = __alignof__(u8),
        },
        [NFT_SET_EXT_TIMEOUT]                = {
                .len        = sizeof(struct nft_timeout),
                .align        = __alignof__(struct nft_timeout),
        },
        [NFT_SET_EXT_USERDATA]                = {
                .len        = sizeof(struct nft_userdata),
                .align        = __alignof__(struct nft_userdata),
        },
        [NFT_SET_EXT_KEY_END]                = {
                .align        = __alignof__(u32),
        },
};

/*
 * Set elements
 */

/*
 * Netlink attribute policy for a single set element (NFTA_SET_ELEM_*):
 * expected attribute types and, for user data and object names, the
 * maximum payload length.
 */
static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
        [NFTA_SET_ELEM_KEY]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_DATA]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_FLAGS]                = { .type = NLA_U32 },
        [NFTA_SET_ELEM_TIMEOUT]                = { .type = NLA_U64 },
        [NFTA_SET_ELEM_EXPIRATION]        = { .type = NLA_U64 },
        [NFTA_SET_ELEM_USERDATA]        = { .type = NLA_BINARY,
                                            .len = NFT_USERDATA_MAXLEN },
        [NFTA_SET_ELEM_EXPR]                = { .type = NLA_NESTED },
        [NFTA_SET_ELEM_OBJREF]                = { .type = NLA_STRING,
                                            .len = NFT_OBJ_MAXNAMELEN - 1 },
        [NFTA_SET_ELEM_KEY_END]                = { .type = NLA_NESTED },
        /* Each entry of the nested array is checked against nft_expr_policy. */
        [NFTA_SET_ELEM_EXPRESSIONS]        = NLA_POLICY_NESTED_ARRAY(nft_expr_policy),
};

/*
 * Netlink attribute policy for a set element list message
 * (NFTA_SET_ELEM_LIST_*): table and set names, the set ID and the nested
 * array of elements, each checked against nft_set_elem_policy.
 */
static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
        [NFTA_SET_ELEM_LIST_TABLE]        = { .type = NLA_STRING,
                                            .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_SET_ELEM_LIST_SET]        = { .type = NLA_STRING,
                                            .len = NFT_SET_MAXNAMELEN - 1 },
        [NFTA_SET_ELEM_LIST_ELEMENTS]        = NLA_POLICY_NESTED_ARRAY(nft_set_elem_policy),
        [NFTA_SET_ELEM_LIST_SET_ID]        = { .type = NLA_U32 },
};

/*
 * Dump the expressions attached to a set element into @skb.
 *
 * A single expression is emitted as one NFTA_SET_ELEM_EXPR attribute. Two
 * or more are wrapped in an NFTA_SET_ELEM_EXPRESSIONS nest with one
 * NFTA_LIST_ELEM per expression. Nothing is emitted when the element
 * carries no expression.
 *
 * @reset is forwarded unchanged to nft_expr_dump().
 *
 * Returns 0 on success, -1 if an attribute could not be added.
 */
static int nft_set_elem_expr_dump(struct sk_buff *skb,
                                  const struct nft_set *set,
                                  const struct nft_set_ext *ext,
                                  bool reset)
{
        struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
        struct nft_expr *expr;
        struct nlattr *nest;
        u32 count = 0;
        u32 size;

        /* First pass only counts the expressions. */
        nft_setelem_expr_foreach(expr, elem_expr, size)
                count++;

        if (count == 0)
                return 0;

        if (count == 1) {
                expr = nft_setelem_expr_at(elem_expr, 0);
                if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, reset) < 0)
                        return -1;

                return 0;
        }

        nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_EXPRESSIONS);
        if (!nest)
                return -1;

        nft_setelem_expr_foreach(expr, elem_expr, size) {
                expr = nft_setelem_expr_at(elem_expr, size);
                if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0)
                        return -1;
        }
        nla_nest_end(skb, nest);

        return 0;
}

/*
 * Serialize one set element into @skb as an NFTA_LIST_ELEM nest.
 *
 * Only the extensions present on the element are emitted: key, key end,
 * data, expressions, object reference, flags, timeout/expiration and
 * user data. @reset is passed through to the expression dump.
 *
 * Returns 0 on success. On any failure the skb is trimmed back to where
 * it was on entry and -EMSGSIZE is returned.
 */
static int nf_tables_fill_setelem(struct sk_buff *skb,
                                  const struct nft_set *set,
                                  const struct nft_elem_priv *elem_priv,
                                  bool reset)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        /* remember the tail so a partial element can be rolled back */
        unsigned char *b = skb_tail_pointer(skb);
        struct nlattr *nest;

        nest = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
        if (nest == NULL)
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY) &&
            nft_data_dump(skb, NFTA_SET_ELEM_KEY, nft_set_ext_key(ext),
                          NFT_DATA_VALUE, set->klen) < 0)
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
            nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext),
                          NFT_DATA_VALUE, set->klen) < 0)
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
            nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
                          nft_set_datatype(set), set->dlen) < 0)
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS) &&
            nft_set_elem_expr_dump(skb, set, ext, reset))
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
            nla_put_string(skb, NFTA_SET_ELEM_OBJREF,
                           (*nft_set_ext_obj(ext))->key.name) < 0)
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
            nla_put_be32(skb, NFTA_SET_ELEM_FLAGS,
                         htonl(*nft_set_ext_flags(ext))))
                goto nla_put_failure;

        if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
                u64 timeout = READ_ONCE(nft_set_ext_timeout(ext)->timeout);
                u64 set_timeout = READ_ONCE(set->timeout);
                __be64 msecs = 0;

                /* the timeout is only reported when it differs from the
                 * set-wide value
                 */
                if (set_timeout != timeout) {
                        msecs = nf_jiffies64_to_msecs(timeout);
                        if (nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, msecs,
                                         NFTA_SET_ELEM_PAD))
                                goto nla_put_failure;
                }

                if (timeout > 0) {
                        u64 expires, now = get_jiffies_64();

                        /* report the remaining time, clamped to zero once
                         * the expiration is no longer in the future
                         */
                        expires = READ_ONCE(nft_set_ext_timeout(ext)->expiration);
                        if (time_before64(now, expires))
                                expires -= now;
                        else
                                expires = 0;

                        if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION,
                                         nf_jiffies64_to_msecs(expires),
                                         NFTA_SET_ELEM_PAD))
                                goto nla_put_failure;
                }
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_USERDATA)) {
                struct nft_userdata *udata;

                udata = nft_set_ext_userdata(ext);
                /* NOTE(review): udata->len looks like it is stored as the
                 * length minus one - confirm where the userdata is filled in
                 */
                if (nla_put(skb, NFTA_SET_ELEM_USERDATA,
                            udata->len + 1, udata->data))
                        goto nla_put_failure;
        }

        nla_nest_end(skb, nest);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -EMSGSIZE;
}

/*
 * Per-call state for walking a set during a dump. The iterator is
 * embedded so that the walk callback can get back to this structure
 * with container_of().
 */
struct nft_set_dump_args {
        /* netlink dump callback this walk belongs to */
        const struct netlink_callback        *cb;
        /* iterator handed to set->ops->walk() */
        struct nft_set_iter                iter;
        /* message buffer the elements are written to */
        struct sk_buff                        *skb;
        /* forwarded to nf_tables_fill_setelem() for every element */
        bool                                reset;
};

static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
                                  struct nft_set *set,
                                  const struct nft_set_iter *iter,
                                  struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        struct nft_set_dump_args *args;

        if (!nft_set_elem_active(ext, iter->genmask))
                return 0;

        if (nft_set_elem_expired(ext) || nft_set_elem_is_dead(ext))
                return 0;

        args = container_of(iter, struct nft_set_dump_args, iter);
        return nf_tables_fill_setelem(args->skb, set, elem_priv, args->reset);
}

/*
 * Emit an AUDIT_NFT_OP_SETELEM_RESET audit record for @nentries elements.
 * The record name is formatted as "<table name>:<base_seq>".
 *
 * NOTE(review): the kasprintf() result is passed on unchecked, so a
 * failed allocation hands NULL to audit_log_nfcfg() - confirm that is
 * acceptable.
 */
static void audit_log_nft_set_reset(const struct nft_table *table,
                                    unsigned int base_seq,
                                    unsigned int nentries)
{
        char *name;

        name = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);
        audit_log_nfcfg(name, table->family, nentries,
                        AUDIT_NFT_OP_SETELEM_RESET, GFP_ATOMIC);
        kfree(name);
}

/*
 * Context of a set element get/dump request. It is filled in by
 * nft_set_dump_ctx_init() and duplicated into cb->data when a netlink
 * dump is started.
 */
struct nft_set_dump_ctx {
        /* set whose elements are requested */
        const struct nft_set        *set;
        /* request context (net, family, table, ...) */
        struct nft_ctx                ctx;
        /* true for the reset variants of the request */
        bool                        reset;
};

/*
 * Dump the catchall element of @set, if there is one.
 *
 * The first catchall element that is active in the current generation
 * and not expired is serialized into @skb. When @reset is set and the
 * element was written successfully, an audit record for one entry is
 * emitted.
 *
 * Returns 0 when there is nothing to dump, otherwise the result of
 * nf_tables_fill_setelem().
 */
static int nft_set_catchall_dump(struct net *net, struct sk_buff *skb,
                                 const struct nft_set *set, bool reset,
                                 unsigned int base_seq)
{
        u8 genmask = nft_genmask_cur(net);
        struct nft_set_elem_catchall *entry;
        struct nft_set_ext *ext;
        int err;

        list_for_each_entry_rcu(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_set_elem_active(ext, genmask) &&
                    !nft_set_elem_expired(ext)) {
                        /* only the first usable catchall element is dumped */
                        err = nf_tables_fill_setelem(skb, set, entry->elem,
                                                     reset);
                        if (reset && err == 0)
                                audit_log_nft_set_reset(set->table, base_seq, 1);

                        return err;
                }
        }

        return 0;
}

/*
 * Netlink dump callback: write one NFT_MSG_NEWSETELEM message holding as
 * many elements of the requested set as the walk produces for this skb.
 *
 * cb->data holds the struct nft_set_dump_ctx of the request and
 * cb->args[0] the walk position to resume from; it is updated from the
 * iterator count before returning.
 *
 * Returns -ENOENT if the table or set is no longer on the lists, -ENOSPC
 * if the message header or list attributes do not fit, an iterator error
 * other than -EMSGSIZE as is, 0 when the position did not advance, and
 * skb->len otherwise.
 */
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct nft_set_dump_ctx *dump_ctx = cb->data;
        struct net *net = sock_net(skb->sk);
        struct nftables_pernet *nft_net;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_set_dump_args args;
        bool set_found = false;
        struct nlmsghdr *nlh;
        struct nlattr *nest;
        u32 portid, seq;
        int event;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        /* The table and set pointers were saved when the dump started;
         * make sure both are still linked before dereferencing them.
         */
        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
                    dump_ctx->ctx.family != table->family)
                        continue;

                if (table != dump_ctx->ctx.table)
                        continue;

                list_for_each_entry_rcu(set, &table->sets, list) {
                        if (set == dump_ctx->set) {
                                set_found = true;
                                break;
                        }
                }
                break;
        }

        if (!set_found) {
                rcu_read_unlock();
                return -ENOENT;
        }

        event  = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM);
        portid = NETLINK_CB(cb->skb).portid;
        seq    = cb->nlh->nlmsg_seq;

        nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
                           table->family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, table->name))
                goto nla_put_failure;
        if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
        if (nest == NULL)
                goto nla_put_failure;

        /* resume the walk at the position recorded by the previous call */
        args.cb                        = cb;
        args.skb                = skb;
        args.reset                = dump_ctx->reset;
        args.iter.genmask        = nft_genmask_cur(net);
        args.iter.type                = NFT_ITER_READ;
        args.iter.skip                = cb->args[0];
        args.iter.count                = 0;
        args.iter.err                = 0;
        args.iter.fn                = nf_tables_dump_setelem;
        set->ops->walk(&dump_ctx->ctx, set, &args.iter);

        /* The catchall element is only attempted when the walk finished
         * without error and its count equals the resume position.
         */
        if (!args.iter.err && args.iter.count == cb->args[0])
                args.iter.err = nft_set_catchall_dump(net, skb, set,
                                                      dump_ctx->reset, cb->seq);
        nla_nest_end(skb, nest);
        nlmsg_end(skb, nlh);

        rcu_read_unlock();

        /* -EMSGSIZE is not treated as a failure here: the message written
         * so far is kept and the position is recorded below.
         */
        if (args.iter.err && args.iter.err != -EMSGSIZE)
                return args.iter.err;
        if (args.iter.count == cb->args[0])
                return 0;

        cb->args[0] = args.iter.count;
        return skb->len;

nla_put_failure:
        rcu_read_unlock();
        return -ENOSPC;
}

/*
 * Dump callback for the reset variant: run nf_tables_dump_set() with the
 * commit mutex held and audit the number of elements handled in this
 * pass.
 *
 * The count is the difference in cb->args[0] before and after the call;
 * no audit record is emitted when it did not grow.
 *
 * Returns whatever nf_tables_dump_set() returned.
 */
static int nf_tables_dumpreset_set(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
        struct nft_set_dump_ctx *dump_ctx = cb->data;
        int prev = cb->args[0];
        int err;

        mutex_lock(&nft_net->commit_mutex);

        err = nf_tables_dump_set(skb, cb);

        if (cb->args[0] > prev)
                audit_log_nft_set_reset(dump_ctx->ctx.table, cb->seq,
                                        cb->args[0] - prev);

        mutex_unlock(&nft_net->commit_mutex);

        return err;
}

/*
 * Dump start callback: replace cb->data with a heap copy of the dump
 * context it currently points to. The copy is released again by
 * nf_tables_dump_set_done().
 *
 * Returns 0 on success or -ENOMEM if the copy could not be allocated,
 * in which case cb->data is left NULL.
 */
static int nf_tables_dump_set_start(struct netlink_callback *cb)
{
        struct nft_set_dump_ctx *orig = cb->data;
        struct nft_set_dump_ctx *copy;

        copy = kmemdup(orig, sizeof(*orig), GFP_ATOMIC);
        cb->data = copy;
        if (!copy)
                return -ENOMEM;

        return 0;
}

/*
 * Dump done callback: free the dump context copy that
 * nf_tables_dump_set_start() stored in cb->data. Always returns 0.
 */
static int nf_tables_dump_set_done(struct netlink_callback *cb)
{
        kfree(cb->data);
        return 0;
}

/*
 * Build a complete set element message for a single element.
 *
 * @skb:       buffer the message is appended to
 * @ctx:       request context providing family, net and table
 * @seq:       netlink sequence number for the header
 * @portid:    netlink port id for the header
 * @event:     nf_tables message type (e.g. NFT_MSG_NEWSETELEM)
 * @flags:     netlink header flags
 * @set:       set the element belongs to
 * @elem_priv: element to serialize
 * @reset:     forwarded to nf_tables_fill_setelem()
 *
 * The message carries the table name, the set name and one element
 * nested in NFTA_SET_ELEM_LIST_ELEMENTS. The table and set names use the
 * NFTA_SET_ELEM_LIST_* attribute names, like nf_tables_dump_set() and
 * nft_set_elem_list_policy, because that is the namespace this message
 * type is parsed with.
 *
 * Returns 0 on success. On failure the partially built message is
 * trimmed from @skb and -1 is returned; callers only test for < 0 or
 * pass the value on unchanged.
 */
static int nf_tables_fill_setelem_info(struct sk_buff *skb,
                                       const struct nft_ctx *ctx, u32 seq,
                                       u32 portid, int event, u16 flags,
                                       const struct nft_set *set,
                                       const struct nft_elem_priv *elem_priv,
                                       bool reset)
{
        struct nlmsghdr *nlh;
        struct nlattr *nest;
        int err;

        event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
        nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
                           NFNETLINK_V0, nft_base_seq(ctx->net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_SET_ELEM_LIST_TABLE, ctx->table->name))
                goto nla_put_failure;
        if (nla_put_string(skb, NFTA_SET_ELEM_LIST_SET, set->name))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_SET_ELEM_LIST_ELEMENTS);
        if (nest == NULL)
                goto nla_put_failure;

        err = nf_tables_fill_setelem(skb, set, elem_priv, reset);
        if (err < 0)
                goto nla_put_failure;

        nla_nest_end(skb, nest);

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Parse and validate the NFTA_SET_ELEM_FLAGS attribute.
 *
 * When @attr is NULL, *@flags is left untouched and 0 is returned.
 * Otherwise *@flags receives the host-order value before validation.
 *
 * Returns -EOPNOTSUPP if a bit other than NFT_SET_ELEM_INTERVAL_END or
 * NFT_SET_ELEM_CATCHALL is set, and -EINVAL if the interval end flag is
 * used on a set without NFT_SET_INTERVAL or together with the catchall
 * flag.
 */
static int nft_setelem_parse_flags(const struct nft_set *set,
                                   const struct nlattr *attr, u32 *flags)
{
        const u32 known = NFT_SET_ELEM_INTERVAL_END | NFT_SET_ELEM_CATCHALL;
        u32 val;

        if (!attr)
                return 0;

        val = ntohl(nla_get_be32(attr));
        *flags = val;

        if (val & ~known)
                return -EOPNOTSUPP;

        if ((val & NFT_SET_ELEM_INTERVAL_END) &&
            !(set->flags & NFT_SET_INTERVAL))
                return -EINVAL;

        /* an interval end cannot also be the catchall element */
        if ((val & known) == known)
                return -EINVAL;

        return 0;
}

static int nft_setelem_parse_key(struct nft_ctx *ctx, const struct nft_set *set,
                                 struct nft_data *key, struct nlattr *attr)
{
        struct nft_data_desc desc = {
                .type        = NFT_DATA_VALUE,
                .size        = NFT_DATA_VALUE_MAXLEN,
                .len        = set->klen,
        };

        return nft_data_init(ctx, key, &desc, attr);
}

static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set,
                                  struct nft_data_desc *desc,
                                  struct nft_data *data,
                                  struct nlattr *attr)
{
        u32 dtype;

        if (set->dtype == NFT_DATA_VERDICT)
                dtype = NFT_DATA_VERDICT;
        else
                dtype = NFT_DATA_VALUE;

        desc->type = dtype;
        desc->size = NFT_DATA_VALUE_MAXLEN;
        desc->len = set->dlen;
        desc->flags = NFT_DATA_DESC_SETELEM;

        return nft_data_init(ctx, data, desc, attr);
}

/*
 * Return the private element of the first catchall entry of @set that
 * is active in the current generation and not expired, or NULL if there
 * is none.
 */
static void *nft_setelem_catchall_get(const struct net *net,
                                      const struct nft_set *set)
{
        u8 genmask = nft_genmask_cur(net);
        struct nft_set_elem_catchall *entry;
        struct nft_set_ext *ext;

        list_for_each_entry_rcu(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_set_elem_active(ext, genmask) &&
                    !nft_set_elem_expired(ext))
                        return entry->elem;
        }

        return NULL;
}

/*
 * Look up an element and store its private pointer in elem->priv.
 *
 * With NFT_SET_ELEM_CATCHALL in @flags the catchall list is searched,
 * otherwise the lookup is delegated to the set backend's ->get().
 *
 * Returns 0 on success, -ENOENT if no catchall element exists, or the
 * error encoded in the pointer returned by ->get().
 */
static int nft_setelem_get(struct nft_ctx *ctx, const struct nft_set *set,
                           struct nft_set_elem *elem, u32 flags)
{
        void *priv;

        if (flags & NFT_SET_ELEM_CATCHALL) {
                priv = nft_setelem_catchall_get(ctx->net, set);
                if (!priv)
                        return -ENOENT;
        } else {
                priv = set->ops->get(ctx->net, set, elem, flags);
                if (IS_ERR(priv))
                        return PTR_ERR(priv);
        }

        elem->priv = priv;
        return 0;
}

/*
 * Look up the single element described by @attr and unicast it back to
 * the requester as an NFT_MSG_NEWSETELEM message.
 *
 * A key is mandatory unless the flags select the catchall element.
 * @reset is forwarded to the fill routine.
 *
 * Returns the result of nfnetlink_unicast() on success, or a negative
 * value if parsing, the lookup, the allocation (-ENOMEM) or building the
 * message fails.
 */
static int nft_get_set_elem(struct nft_ctx *ctx, const struct nft_set *set,
                            const struct nlattr *attr, bool reset)
{
        struct nlattr *tb[NFTA_SET_ELEM_MAX + 1];
        struct nft_set_elem elem;
        struct sk_buff *reply;
        uint32_t flags = 0;
        int err;

        err = nla_parse_nested_deprecated(tb, NFTA_SET_ELEM_MAX, attr,
                                          nft_set_elem_policy, NULL);
        if (err < 0)
                return err;

        err = nft_setelem_parse_flags(set, tb[NFTA_SET_ELEM_FLAGS], &flags);
        if (err < 0)
                return err;

        if (tb[NFTA_SET_ELEM_KEY]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key.val,
                                            tb[NFTA_SET_ELEM_KEY]);
                if (err < 0)
                        return err;
        } else if (!(flags & NFT_SET_ELEM_CATCHALL)) {
                /* only the catchall element may be requested without a key */
                return -EINVAL;
        }

        if (tb[NFTA_SET_ELEM_KEY_END]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
                                            tb[NFTA_SET_ELEM_KEY_END]);
                if (err < 0)
                        return err;
        }

        err = nft_setelem_get(ctx, set, &elem, flags);
        if (err < 0)
                return err;

        reply = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        err = nf_tables_fill_setelem_info(reply, ctx, ctx->seq, ctx->portid,
                                          NFT_MSG_NEWSETELEM, 0, set, elem.priv,
                                          reset);
        if (err < 0) {
                kfree_skb(reply);
                return err;
        }

        return nfnetlink_unicast(reply, ctx->net, ctx->portid);
}

/*
 * Resolve the table and set named in a set element request and fill in
 * @dump_ctx with them.
 *
 * Both lookups use the current generation. On a lookup failure the
 * offending attribute is recorded in the extended ack and the lookup
 * error is returned. Returns 0 on success.
 */
static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx,
                                 const struct sk_buff *skb,
                                 const struct nfnl_info *info,
                                 const struct nlattr * const nla[],
                                 bool reset)
{
        struct netlink_ext_ack *extack = info->extack;
        struct net *net = info->net;
        u8 family = info->nfmsg->nfgen_family;
        u8 genmask = nft_genmask_cur(net);
        struct nft_table *table;
        struct nft_set *set;

        table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
                                 genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
                return PTR_ERR(table);
        }

        set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
                return PTR_ERR(set);
        }

        nft_ctx_init(&dump_ctx->ctx, net, skb,
                     info->nlh, family, table, NULL, nla);
        dump_ctx->reset = reset;
        dump_ctx->set = set;

        return 0;
}

/* called with rcu_read_lock held */
static int nf_tables_getsetelem(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        struct nft_set_dump_ctx dump_ctx;
        struct nlattr *attr;
        int rem, err = 0;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_set_start,
                        .dump = nf_tables_dump_set,
                        .done = nf_tables_dump_set_done,
                        .module = THIS_MODULE,
                };

                err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
                if (err)
                        return err;

                c.data = &dump_ctx;
                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return -EINVAL;

        err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, false);
        if (err)
                return err;

        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, false);
                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, attr);
                        break;
                }
        }

        return err;
}

/*
 * Reset variant of nf_tables_getsetelem().
 *
 * With NLM_F_DUMP a dump using nf_tables_dumpreset_set() is started.
 * Otherwise each element in NFTA_SET_ELEM_LIST_ELEMENTS is fetched with
 * reset enabled while the commit mutex is held, and one audit record is
 * emitted with the number of elements handled before the loop ended.
 *
 * The function is entered and left with the RCU read lock held (it
 * unlocks and relocks around taking and releasing the mutex).
 *
 * Returns -EINVAL if the element list is missing or the module reference
 * cannot be taken, otherwise the error from context setup or from the
 * last element processed.
 */
static int nf_tables_getsetelem_reset(struct sk_buff *skb,
                                      const struct nfnl_info *info,
                                      const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        struct netlink_ext_ack *extack = info->extack;
        struct nft_set_dump_ctx dump_ctx;
        int rem, err = 0, nelems = 0;
        struct nlattr *attr;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_set_start,
                        .dump = nf_tables_dumpreset_set,
                        .done = nf_tables_dump_set_done,
                        .module = THIS_MODULE,
                };

                err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
                if (err)
                        return err;

                c.data = &dump_ctx;
                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return -EINVAL;

        /* NOTE(review): the module reference presumably keeps the module
         * pinned while the RCU read lock is dropped below - confirm.
         */
        if (!try_module_get(THIS_MODULE))
                return -EINVAL;
        /* mutex_lock() may sleep, so leave the RCU read side first and
         * re-enter it once the mutex is held
         */
        rcu_read_unlock();
        mutex_lock(&nft_net->commit_mutex);
        rcu_read_lock();

        /* table and set are looked up only after the mutex is taken */
        err = nft_set_dump_ctx_init(&dump_ctx, skb, info, nla, true);
        if (err)
                goto out_unlock;

        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_get_set_elem(&dump_ctx.ctx, dump_ctx.set, attr, true);
                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, attr);
                        break;
                }
                nelems++;
        }
        /* logged even when the loop stopped early or handled no element */
        audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems);

out_unlock:
        /* restore the caller's locking state: mutex released, RCU held */
        rcu_read_unlock();
        mutex_unlock(&nft_net->commit_mutex);
        rcu_read_lock();
        module_put(THIS_MODULE);

        return err;
}

static void nf_tables_setelem_notify(const struct nft_ctx *ctx,
                                     const struct nft_set *set,
                                     const struct nft_elem_priv *elem_priv,
                                     int event)
{
        struct nftables_pernet *nft_net;
        struct net *net = ctx->net;
        u32 portid = ctx->portid;
        struct sk_buff *skb;
        u16 flags = 0;
        int err;

        if (!ctx->report && !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (skb == NULL)
                goto err;

        if (ctx->flags & (NLM_F_CREATE | NLM_F_EXCL))
                flags |= ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);

        err = nf_tables_fill_setelem_info(skb, ctx, 0, portid, event, flags,
                                          set, elem_priv, false);
        if (err < 0) {
                kfree_skb(skb);
                goto err;
        }

        nft_net = nft_pernet(net);
        nft_notify_enqueue(skb, ctx->report, &nft_net->notify_list);
        return;
err:
        nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Allocate a transaction object of type @msg_type with room for exactly
 * one set element, bound to @set. Returns NULL if the allocation fails.
 */
static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx,
                                              int msg_type,
                                              struct nft_set *set)
{
        struct nft_trans_elem *te;
        struct nft_trans *trans;

        /* te is only used for its type here, it is not dereferenced */
        trans = nft_trans_alloc(ctx, msg_type, struct_size(te, elems, 1));
        if (!trans)
                return NULL;

        te = nft_trans_container_elem(trans);
        te->set = set;
        te->nelems = 1;

        return trans;
}

/*
 * Instantiate the expression described by @attr for use in a set
 * element.
 *
 * Expression types flagged NFT_EXPR_GC are rejected with -EOPNOTSUPP
 * when the set has NFT_SET_TIMEOUT set or its backend provides no
 * ->gc_init(); otherwise ->gc_init() is invoked on the set.
 *
 * Returns the new expression or an ERR_PTR().
 */
struct nft_expr *nft_set_elem_expr_alloc(const struct nft_ctx *ctx,
                                         const struct nft_set *set,
                                         const struct nlattr *attr)
{
        struct nft_expr *expr = nft_expr_init(ctx, attr);

        if (IS_ERR(expr))
                return expr;

        if (!(expr->ops->type->flags & NFT_EXPR_GC))
                return expr;

        if ((set->flags & NFT_SET_TIMEOUT) || !set->ops->gc_init) {
                nft_expr_destroy(ctx, expr);
                return ERR_PTR(-EOPNOTSUPP);
        }

        set->ops->gc_init(set);

        return expr;
}

/*
 * Check that @len bytes plus the fixed length of extension type @id fit
 * into the space the template reserved for that extension, and that the
 * total does not exceed U8_MAX. Returns 0 if it fits, -1 otherwise.
 */
static int nft_set_ext_check(const struct nft_set_ext_tmpl *tmpl, u8 id, u32 len)
{
        u32 total = len + nft_set_ext_types[id].len;

        if (total <= tmpl->ext_len[id] && total <= U8_MAX)
                return 0;

        return -1;
}

/*
 * Copy @len bytes from @from into the extension area @to, after
 * verifying with nft_set_ext_check() that extension @id has room for
 * them. Returns 0 on success, -1 (without copying) if the check fails.
 */
static int nft_set_ext_memcpy(const struct nft_set_ext_tmpl *tmpl, u8 id,
                              void *to, const void *from, u32 len)
{
        int err = nft_set_ext_check(tmpl, id, len);

        if (err < 0)
                return -1;

        memcpy(to, from, len);
        return 0;
}

/*
 * Allocate and initialise a set element from the extension template.
 *
 * The element is zero-allocated with the backend's element size plus the
 * template length. @key, @key_end and @data are copied into their
 * extensions when the template contains them. If the timeout extension
 * is present, @timeout is stored and the expiration is set to now plus
 * @expiration, with @timeout used instead when @expiration is 0.
 *
 * Returns the element, ERR_PTR(-ENOMEM) on allocation failure, or
 * ERR_PTR(-EINVAL) if a copy does not fit its extension.
 */
struct nft_elem_priv *nft_set_elem_init(const struct nft_set *set,
                                        const struct nft_set_ext_tmpl *tmpl,
                                        const u32 *key, const u32 *key_end,
                                        const u32 *data,
                                        u64 timeout, u64 expiration, gfp_t gfp)
{
        struct nft_set_ext *ext;
        void *elem;

        elem = kzalloc(set->ops->elemsize + tmpl->len, gfp);
        if (!elem)
                return ERR_PTR(-ENOMEM);

        ext = nft_set_elem_ext(set, elem);
        nft_set_ext_init(ext, tmpl);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY)) {
                if (nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY,
                                       nft_set_ext_key(ext), key,
                                       set->klen) < 0)
                        goto err_ext_check;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) {
                if (nft_set_ext_memcpy(tmpl, NFT_SET_EXT_KEY_END,
                                       nft_set_ext_key_end(ext), key_end,
                                       set->klen) < 0)
                        goto err_ext_check;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) {
                if (nft_set_ext_memcpy(tmpl, NFT_SET_EXT_DATA,
                                       nft_set_ext_data(ext), data,
                                       set->dlen) < 0)
                        goto err_ext_check;
        }

        if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT)) {
                /* no explicit expiration means the full timeout applies */
                u64 remaining = expiration ? expiration : timeout;

                nft_set_ext_timeout(ext)->timeout = timeout;
                nft_set_ext_timeout(ext)->expiration = get_jiffies_64() + remaining;
        }

        return elem;

err_ext_check:
        kfree(elem);

        return ERR_PTR(-EINVAL);
}

/*
 * Destroy one expression of a set element. If the expression type
 * provides ->destroy_clone() it is used and the reference on the owning
 * module is dropped; otherwise nf_tables_expr_destroy() is called.
 */
static void __nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
                                        struct nft_expr *expr)
{
        if (!expr->ops->destroy_clone) {
                nf_tables_expr_destroy(ctx, expr);
                return;
        }

        expr->ops->destroy_clone(ctx, expr);
        module_put(expr->ops->type->owner);
}

/*
 * Destroy every expression stored in the expression extension
 * @elem_expr of a set element.
 */
static void nft_set_elem_expr_destroy(const struct nft_ctx *ctx,
                                      struct nft_set_elem_expr *elem_expr)
{
        struct nft_expr *expr;
        u32 size;

        nft_setelem_expr_foreach(expr, elem_expr, size)
                __nft_set_elem_expr_destroy(ctx, expr);
}

/* Drop references and destroy. Called from gc, dynset and abort path.
 *
 * Releases the key and, if present, the data of the element, optionally
 * destroys its expressions (@destroy_expr), drops the use count of a
 * referenced object and finally frees the element itself.
 */
static void __nft_set_elem_destroy(const struct nft_ctx *ctx,
                                   const struct nft_set *set,
                                   const struct nft_elem_priv *elem_priv,
                                   bool destroy_expr)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        /* the key is released without checking for NFT_SET_EXT_KEY */
        nft_data_release(nft_set_ext_key(ext), NFT_DATA_VALUE);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_release(nft_set_ext_data(ext), set->dtype);
        if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
                nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));
        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                nft_use_dec(&(*nft_set_ext_obj(ext))->use);

        kfree(elem_priv);
}

/* Drop references and destroy. Called from gc and dynset.
 *
 * Wrapper around __nft_set_elem_destroy() for callers without a
 * struct nft_ctx: a minimal context holding only the net namespace and
 * family of the set's table is built on the stack.
 */
void nft_set_elem_destroy(const struct nft_set *set,
                          const struct nft_elem_priv *elem_priv,
                          bool destroy_expr)
{
        struct nft_ctx ctx = {
                .net        = read_pnet(&set->net),
                .family        = set->table->family,
        };

        __nft_set_elem_destroy(&ctx, set, elem_priv, destroy_expr);
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);

/* Drop references and destroy. Called from abort path.
 *
 * Every element of the transaction that still has a private pointer is
 * destroyed together with its expressions.
 */
static void nft_trans_set_elem_destroy(const struct nft_ctx *ctx, struct nft_trans_elem *te)
{
        int i;

        for (i = 0; i < te->nelems; i++) {
                /* skip update request, see nft_trans_elems_new_abort() */
                if (te->elems[i].priv)
                        __nft_set_elem_destroy(ctx, te->set,
                                               te->elems[i].priv, true);
        }
}

/* Destroy element. References have been already dropped in the preparation
 * path via nft_setelem_data_deactivate().
 *
 * Only the expressions (if any) and the element memory are released
 * here; key, data and object references are not touched.
 */
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
                                const struct nft_set *set,
                                const struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPRESSIONS))
                nft_set_elem_expr_destroy(ctx, nft_set_ext_expr(ext));

        kfree(elem_priv);
}

/*
 * Destroy all elements carried by the transaction @te with
 * nf_tables_set_elem_destroy().
 *
 * NOTE(review): unlike nft_trans_set_elem_destroy() this does not skip
 * entries whose priv pointer is NULL - confirm callers never pass such
 * entries.
 */
static void nft_trans_elems_destroy(const struct nft_ctx *ctx,
                                    const struct nft_trans_elem *te)
{
        int i;

        for (i = 0; i < te->nelems; i++)
                nf_tables_set_elem_destroy(ctx, te->set, te->elems[i].priv);
}

/*
 * Clone every expression of @set into freshly allocated memory and store
 * the clones in @expr_array, which must have room for set->num_exprs
 * entries.
 *
 * Returns 0 on success. If an allocation or a clone fails, the clones
 * made so far are destroyed in reverse order and -ENOMEM is returned.
 */
int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
                            struct nft_expr *expr_array[])
{
        struct nft_expr *clone;
        int i;

        for (i = 0; i < set->num_exprs; i++) {
                clone = kzalloc(set->exprs[i]->ops->size, GFP_KERNEL_ACCOUNT);
                if (!clone)
                        goto err_expr;

                if (nft_expr_clone(clone, set->exprs[i],
                                   GFP_KERNEL_ACCOUNT) < 0) {
                        kfree(clone);
                        goto err_expr;
                }
                expr_array[i] = clone;
        }

        return 0;

err_expr:
        /* entries [0, i) hold valid clones, unwind them last to first */
        while (--i >= 0)
                nft_expr_destroy(ctx, expr_array[i]);

        return -ENOMEM;
}

/*
 * Move the expressions in @expr_array into the expression extension of
 * a set element.
 *
 * Each expression is cloned into the extension area at the current end
 * (elem_expr->size), then the source expression is destroyed and its
 * array slot cleared.
 *
 * Returns 0 on success (or when @num_exprs is 0), -EINVAL if the
 * combined size does not fit the space reserved by @tmpl, or -ENOMEM if
 * a clone fails. In the -ENOMEM case the expressions not yet moved are
 * destroyed and their slots cleared; in the -EINVAL case @expr_array is
 * left untouched.
 */
static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
                                   const struct nft_set_ext_tmpl *tmpl,
                                   const struct nft_set_ext *ext,
                                   struct nft_expr *expr_array[],
                                   u32 num_exprs)
{
        struct nft_set_elem_expr *elem_expr = nft_set_ext_expr(ext);
        u32 len = sizeof(struct nft_set_elem_expr);
        struct nft_expr *expr;
        int i, err;

        if (num_exprs == 0)
                return 0;

        /* total space needed: the header plus every expression */
        for (i = 0; i < num_exprs; i++)
                len += expr_array[i]->ops->size;

        if (nft_set_ext_check(tmpl, NFT_SET_EXT_EXPRESSIONS, len) < 0)
                return -EINVAL;

        for (i = 0; i < num_exprs; i++) {
                /* next free slot is at the current end of the area */
                expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
                err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
                if (err < 0)
                        goto err_elem_expr_setup;

                elem_expr->size += expr_array[i]->ops->size;
                /* the clone now lives in the element, drop the source */
                nft_expr_destroy(ctx, expr_array[i]);
                expr_array[i] = NULL;
        }

        return 0;

err_elem_expr_setup:
        /* slots before i were already consumed above; release the rest.
         * Clones already placed in the element are not undone here.
         */
        for (; i < num_exprs; i++) {
                nft_expr_destroy(ctx, expr_array[i]);
                expr_array[i] = NULL;
        }

        return -ENOMEM;
}

/* Find a usable catch-all element of @set.
 *
 * Walks the set's catch-all list and returns the extension area of the first
 * element that is active for the current generation mask, has not expired
 * and is not flagged dead. Returns NULL when no such element exists.
 */
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
                                            const struct nft_set *set)
{
        struct nft_set_elem_catchall *entry;
        u8 genmask = nft_genmask_cur(net);
        struct nft_set_ext *ext;

        list_for_each_entry_rcu(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);

                if (!nft_set_elem_active(ext, genmask))
                        continue;
                if (nft_set_elem_expired(ext))
                        continue;
                if (nft_set_elem_is_dead(ext))
                        continue;

                return ext;
        }

        return NULL;
}
EXPORT_SYMBOL_GPL(nft_set_catchall_lookup);

static int nft_setelem_catchall_insert(const struct net *net,
                                       struct nft_set *set,
                                       const struct nft_set_elem *elem,
                                       struct nft_elem_priv **priv)
{
        struct nft_set_elem_catchall *catchall;
        u8 genmask = nft_genmask_next(net);
        struct nft_set_ext *ext;

        list_for_each_entry(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);
                if (nft_set_elem_active(ext, genmask)) {
                        *priv = catchall->elem;
                        return -EEXIST;
                }
        }

        catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT);
        if (!catchall)
                return -ENOMEM;

        catchall->elem = elem->priv;
        list_add_tail_rcu(&catchall->list, &set->catchall_list);

        return 0;
}

/* Insert @elem into @set: catch-all elements go to the set's catch-all list,
 * everything else is handed to the set backend's insert operation. The
 * return value of the chosen path is passed through unchanged.
 */
static int nft_setelem_insert(const struct net *net,
                              struct nft_set *set,
                              const struct nft_set_elem *elem,
                              struct nft_elem_priv **elem_priv,
                              unsigned int flags)
{
        if (flags & NFT_SET_ELEM_CATCHALL)
                return nft_setelem_catchall_insert(net, set, elem, elem_priv);

        return set->ops->insert(net, set, elem, elem_priv);
}

/* Tell whether @elem_priv is a catch-all element: it must carry the flags
 * extension and have NFT_SET_ELEM_CATCHALL set in it.
 */
static bool nft_setelem_is_catchall(const struct nft_set *set,
                                    const struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (!nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS))
                return false;

        return (*nft_set_ext_flags(ext) & NFT_SET_ELEM_CATCHALL) != 0;
}

/* Activate @elem_priv in @set. Regular elements are activated through the
 * set backend; catch-all elements are handled here via nft_clear() on their
 * extension area.
 */
static void nft_setelem_activate(struct net *net, struct nft_set *set,
                                 struct nft_elem_priv *elem_priv)
{
        struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (!nft_setelem_is_catchall(set, elem_priv)) {
                set->ops->activate(net, set, elem_priv);
                return;
        }

        nft_clear(net, ext);
}

static void nft_trans_elem_update(const struct nft_set *set,
                                  const struct nft_trans_one_elem *elem)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
        const struct nft_elem_update *update = elem->update;

        if (update->flags & NFT_TRANS_UPD_TIMEOUT)
                WRITE_ONCE(nft_set_ext_timeout(ext)->timeout, update->timeout);

        if (update->flags & NFT_TRANS_UPD_EXPIRATION)
                WRITE_ONCE(nft_set_ext_timeout(ext)->expiration, get_jiffies_64() + update->expiration);
}

/* Process every element of a NEWSETELEM transaction: entries carrying an
 * update are applied to the existing element, all others are activated.
 * A NFT_MSG_NEWSETELEM notification is emitted for each entry and the update
 * descriptor, if any, is freed afterwards.
 */
static void nft_trans_elems_add(const struct nft_ctx *ctx,
                                struct nft_trans_elem *te)
{
        struct nft_trans_one_elem *cur;
        int i;

        for (i = 0; i < te->nelems; i++) {
                cur = &te->elems[i];

                if (!cur->update)
                        nft_setelem_activate(ctx->net, te->set, cur->priv);
                else
                        nft_trans_elem_update(te->set, cur);

                nf_tables_setelem_notify(ctx, te->set, cur->priv,
                                         NFT_MSG_NEWSETELEM);
                /* kfree(NULL) is a no-op, so no need to test ->update. */
                kfree(cur->update);
        }
}

/* Deactivate the catch-all element of @set that is active in the next
 * generation.
 *
 * On a match, the temporary element in elem->priv is freed and replaced by
 * the existing catch-all element, whose active state is then toggled.
 * Returns 0 on success or -ENOENT if no such element is found.
 */
static int nft_setelem_catchall_deactivate(const struct net *net,
                                           struct nft_set *set,
                                           struct nft_set_elem *elem)
{
        struct nft_set_elem_catchall *entry;
        struct nft_set_ext *ext;

        list_for_each_entry(entry, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, entry->elem);
                if (nft_is_active_next(net, ext)) {
                        kfree(elem->priv);
                        elem->priv = entry->elem;
                        nft_set_elem_change_active(net, set, ext);
                        return 0;
                }
        }

        return -ENOENT;
}

static int __nft_setelem_deactivate(const struct net *net,
                                    struct nft_set *set,
                                    struct nft_set_elem *elem)
{
        void *priv;

        priv = set->ops->deactivate(net, set, elem);
        if (!priv)
                return -ENOENT;

        kfree(elem->priv);
        elem->priv = priv;
        set->ndeact++;

        return 0;
}

/* Deactivate @elem in @set, dispatching on NFT_SET_ELEM_CATCHALL in @flags
 * between the catch-all list and the regular set backend. The result of the
 * selected helper is returned as is.
 */
static int nft_setelem_deactivate(const struct net *net,
                                  struct nft_set *set,
                                  struct nft_set_elem *elem, u32 flags)
{
        if (flags & NFT_SET_ELEM_CATCHALL)
                return nft_setelem_catchall_deactivate(net, set, elem);

        return __nft_setelem_deactivate(net, set, elem);
}

/* Unlink @catchall from the list it is on with list_del_rcu() and release
 * the node through kfree_rcu(). Only the list node is freed here; the
 * element it points to is left untouched.
 */
static void nft_setelem_catchall_destroy(struct nft_set_elem_catchall *catchall)
{
        list_del_rcu(&catchall->list);
        kfree_rcu(catchall, rcu);
}

/* Remove the catch-all list node of @set that refers to @elem_priv, if any.
 * At most one node is destroyed; the walk stops at the first match.
 */
static void nft_setelem_catchall_remove(const struct net *net,
                                        const struct nft_set *set,
                                        struct nft_elem_priv *elem_priv)
{
        struct nft_set_elem_catchall *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, &set->catchall_list, list) {
                if (entry->elem != elem_priv)
                        continue;

                nft_setelem_catchall_destroy(entry);
                return;
        }
}

/* Remove @elem_priv from @set: regular elements are removed by the set
 * backend, catch-all elements are dropped from the set's catch-all list.
 */
static void nft_setelem_remove(const struct net *net,
                               const struct nft_set *set,
                               struct nft_elem_priv *elem_priv)
{
        if (!nft_setelem_is_catchall(set, elem_priv)) {
                set->ops->remove(net, set, elem_priv);
                return;
        }

        nft_setelem_catchall_remove(net, set, elem_priv);
}

/* Remove every element referenced by transaction @te from its set.
 *
 * For each entry a notification using the transaction's message type is sent
 * before the element is removed. For elements that are not catch-all, the
 * set's element counter and its count of deactivated elements are both
 * decremented. Entries are not expected to carry an update descriptor.
 */
static void nft_trans_elems_remove(const struct nft_ctx *ctx,
                                   const struct nft_trans_elem *te)
{
        int i;

        for (i = 0; i < te->nelems; i++) {
                struct nft_elem_priv *priv = te->elems[i].priv;

                WARN_ON_ONCE(te->elems[i].update);

                nf_tables_setelem_notify(ctx, te->set, priv,
                                         te->nft_trans.msg_type);

                nft_setelem_remove(ctx->net, te->set, priv);
                if (nft_setelem_is_catchall(te->set, priv))
                        continue;

                atomic_dec(&te->set->nelems);
                te->set->ndeact--;
        }
}

/* Check that the use of NFTA_SET_ELEM_KEY_END and of the interval-end flag
 * is consistent with the type of @set.
 *
 * Sets that are not both concatenated and interval sets must not be given a
 * KEY_END attribute. Sets that are both must not use
 * NFT_SET_ELEM_INTERVAL_END, and must not combine KEY_END with a catch-all
 * element. Returns true when the combination is acceptable.
 */
static bool nft_setelem_valid_key_end(const struct nft_set *set,
                                      struct nlattr **nla, u32 flags)
{
        const u32 concat_interval = NFT_SET_CONCAT | NFT_SET_INTERVAL;

        if ((set->flags & concat_interval) != concat_interval)
                return !nla[NFTA_SET_ELEM_KEY_END];

        if (flags & NFT_SET_ELEM_INTERVAL_END)
                return false;

        if (nla[NFTA_SET_ELEM_KEY_END] && (flags & NFT_SET_ELEM_CATCHALL))
                return false;

        return true;
}

/* Compute the upper bound to use for the element counter of @set.
 *
 * A set without a configured size is unbounded (UINT_MAX). Otherwise the
 * bound is the configured size plus the number of currently deactivated
 * elements plus an optional backend-specific adjustment; if that sum
 * overflows, UINT_MAX is returned instead.
 */
static u32 nft_set_maxsize(const struct nft_set *set)
{
        u32 delta = 0, limit;

        if (set->size == 0)
                return UINT_MAX;

        if (set->ops->adjust_maxsize)
                delta = set->ops->adjust_maxsize(set);

        if (check_add_overflow(set->size, set->ndeact, &limit) ||
            check_add_overflow(limit, delta, &limit))
                return UINT_MAX;

        return limit;
}

/* Parse one nested set element attribute and stage it for addition to @set.
 *
 * @ctx:         transaction context
 * @set:         set the element is added to
 * @attr:        nested netlink attribute describing the element
 * @nlmsg_flags: netlink message flags; NLM_F_EXCL makes an already existing
 *               identical element an error
 *
 * The function validates the attribute combination against the set's flags,
 * builds the extension template, allocates and fills the element, inserts it
 * and queues a NFT_MSG_NEWSETELEM transaction for it.
 *
 * If an equivalent element already exists and NLM_F_EXCL is not set, the new
 * element is discarded and 0 is returned; when the existing element has a
 * timeout extension and the requested timeout/expiration differs, an update
 * for the existing element is queued instead.
 *
 * Returns 0 on success or a negative errno. On every non-success path, and on
 * the "already exists" paths, the resources acquired so far are released via
 * the label ladder at the end of the function.
 */
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
                            const struct nlattr *attr, u32 nlmsg_flags)
{
        struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
        struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
        u8 genmask = nft_genmask_next(ctx->net);
        u32 flags = 0, size = 0, num_exprs = 0;
        struct nft_set_ext_tmpl tmpl;
        struct nft_set_ext *ext, *ext2;
        struct nft_set_elem elem;
        struct nft_set_binding *binding;
        struct nft_elem_priv *elem_priv;
        struct nft_object *obj = NULL;
        struct nft_userdata *udata;
        struct nft_data_desc desc;
        enum nft_registers dreg;
        struct nft_trans *trans;
        u64 expiration;
        u64 timeout;
        int err, i;
        u8 ulen;

        err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
                                          nft_set_elem_policy, NULL);
        if (err < 0)
                return err;

        nft_set_ext_prepare(&tmpl);

        err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
        if (err < 0)
                return err;

        /* A catch-all element must come without a key, any other element
         * must come with one.
         */
        if (((flags & NFT_SET_ELEM_CATCHALL) && nla[NFTA_SET_ELEM_KEY]) ||
            (!(flags & NFT_SET_ELEM_CATCHALL) && !nla[NFTA_SET_ELEM_KEY]))
                return -EINVAL;

        if (flags != 0) {
                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
                if (err < 0)
                        return err;
        }

        /* Maps require data (except on interval-end elements); plain sets
         * must not carry any.
         */
        if (set->flags & NFT_SET_MAP) {
                if (nla[NFTA_SET_ELEM_DATA] == NULL &&
                    !(flags & NFT_SET_ELEM_INTERVAL_END))
                        return -EINVAL;
        } else {
                if (nla[NFTA_SET_ELEM_DATA] != NULL)
                        return -EINVAL;
        }

        /* Same rule for object references on object maps. */
        if (set->flags & NFT_SET_OBJECT) {
                if (!nla[NFTA_SET_ELEM_OBJREF] &&
                    !(flags & NFT_SET_ELEM_INTERVAL_END))
                        return -EINVAL;
        } else {
                if (nla[NFTA_SET_ELEM_OBJREF])
                        return -EINVAL;
        }

        if (!nft_setelem_valid_key_end(set, nla, flags))
                return -EINVAL;

        /* Interval-end elements carry nothing but their key and flags. */
        if ((flags & NFT_SET_ELEM_INTERVAL_END) &&
             (nla[NFTA_SET_ELEM_DATA] ||
              nla[NFTA_SET_ELEM_OBJREF] ||
              nla[NFTA_SET_ELEM_TIMEOUT] ||
              nla[NFTA_SET_ELEM_EXPIRATION] ||
              nla[NFTA_SET_ELEM_USERDATA] ||
              nla[NFTA_SET_ELEM_EXPR] ||
              nla[NFTA_SET_ELEM_KEY_END] ||
              nla[NFTA_SET_ELEM_EXPRESSIONS]))
                return -EINVAL;

        /* Explicit per-element timeout, or the set default when the set
         * supports timeouts and this is not an interval-end element.
         */
        timeout = 0;
        if (nla[NFTA_SET_ELEM_TIMEOUT] != NULL) {
                if (!(set->flags & NFT_SET_TIMEOUT))
                        return -EINVAL;
                err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_TIMEOUT],
                                            &timeout);
                if (err)
                        return err;
        } else if (set->flags & NFT_SET_TIMEOUT &&
                   !(flags & NFT_SET_ELEM_INTERVAL_END)) {
                timeout = set->timeout;
        }

        /* An expiration needs a non-zero timeout and must not exceed it. */
        expiration = 0;
        if (nla[NFTA_SET_ELEM_EXPIRATION] != NULL) {
                if (!(set->flags & NFT_SET_TIMEOUT))
                        return -EINVAL;
                if (timeout == 0)
                        return -EOPNOTSUPP;

                err = nf_msecs_to_jiffies64(nla[NFTA_SET_ELEM_EXPIRATION],
                                            &expiration);
                if (err)
                        return err;

                if (expiration > timeout)
                        return -ERANGE;
        }

        /* Stage the element's stateful expressions in expr_array. They come
         * from a single NFTA_SET_ELEM_EXPR, from a list in
         * NFTA_SET_ELEM_EXPRESSIONS, or are cloned from the set's own
         * expressions. If the set defines expressions, the ones supplied
         * here must match them in number and ops.
         */
        if (nla[NFTA_SET_ELEM_EXPR]) {
                struct nft_expr *expr;

                if (set->num_exprs && set->num_exprs != 1)
                        return -EOPNOTSUPP;

                expr = nft_set_elem_expr_alloc(ctx, set,
                                               nla[NFTA_SET_ELEM_EXPR]);
                if (IS_ERR(expr))
                        return PTR_ERR(expr);

                expr_array[0] = expr;
                num_exprs = 1;

                if (set->num_exprs && set->exprs[0]->ops != expr->ops) {
                        err = -EOPNOTSUPP;
                        goto err_set_elem_expr;
                }
        } else if (nla[NFTA_SET_ELEM_EXPRESSIONS]) {
                struct nft_expr *expr;
                struct nlattr *tmp;
                int left;

                i = 0;
                nla_for_each_nested(tmp, nla[NFTA_SET_ELEM_EXPRESSIONS], left) {
                        if (i == NFT_SET_EXPR_MAX ||
                            (set->num_exprs && set->num_exprs == i)) {
                                err = -E2BIG;
                                goto err_set_elem_expr;
                        }
                        if (nla_type(tmp) != NFTA_LIST_ELEM) {
                                err = -EINVAL;
                                goto err_set_elem_expr;
                        }
                        expr = nft_set_elem_expr_alloc(ctx, set, tmp);
                        if (IS_ERR(expr)) {
                                err = PTR_ERR(expr);
                                goto err_set_elem_expr;
                        }
                        /* Record the expression before the ops check so the
                         * error path below destroys it as well.
                         */
                        expr_array[i] = expr;
                        num_exprs++;

                        if (set->num_exprs && expr->ops != set->exprs[i]->ops) {
                                err = -EOPNOTSUPP;
                                goto err_set_elem_expr;
                        }
                        i++;
                }
                if (set->num_exprs && set->num_exprs != i) {
                        err = -EOPNOTSUPP;
                        goto err_set_elem_expr;
                }
        } else if (set->num_exprs > 0 &&
                   !(flags & NFT_SET_ELEM_INTERVAL_END)) {
                err = nft_set_elem_expr_clone(ctx, set, expr_array);
                if (err < 0)
                        goto err_set_elem_expr_clone;

                num_exprs = set->num_exprs;
        }

        if (nla[NFTA_SET_ELEM_KEY]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key.val,
                                            nla[NFTA_SET_ELEM_KEY]);
                if (err < 0)
                        goto err_set_elem_expr;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
                if (err < 0)
                        goto err_parse_key;
        }

        if (nla[NFTA_SET_ELEM_KEY_END]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
                                            nla[NFTA_SET_ELEM_KEY_END]);
                if (err < 0)
                        goto err_parse_key;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
                if (err < 0)
                        goto err_parse_key_end;
        }

        if (set->flags & NFT_SET_TIMEOUT) {
                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_TIMEOUT);
                if (err < 0)
                        goto err_parse_key_end;
        }

        /* Reserve room for the expression header plus all staged
         * expressions.
         */
        if (num_exprs) {
                for (i = 0; i < num_exprs; i++)
                        size += expr_array[i]->ops->size;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_EXPRESSIONS,
                                             sizeof(struct nft_set_elem_expr) + size);
                if (err < 0)
                        goto err_parse_key_end;
        }

        if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
                obj = nft_obj_lookup(ctx->net, ctx->table,
                                     nla[NFTA_SET_ELEM_OBJREF],
                                     set->objtype, genmask);
                /* obj is reset to NULL on both failures below so that the
                 * error path does not drop a use count that was never taken.
                 */
                if (IS_ERR(obj)) {
                        err = PTR_ERR(obj);
                        obj = NULL;
                        goto err_parse_key_end;
                }

                if (!nft_use_inc(&obj->use)) {
                        err = -EMFILE;
                        obj = NULL;
                        goto err_parse_key_end;
                }

                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
                if (err < 0)
                        goto err_parse_key_end;
        }

        if (nla[NFTA_SET_ELEM_DATA] != NULL) {
                err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val,
                                             nla[NFTA_SET_ELEM_DATA]);
                if (err < 0)
                        goto err_parse_key_end;

                /* Validate the data against every map binding of the set,
                 * using a context that points at the binding's chain.
                 */
                dreg = nft_type_to_reg(set->dtype);
                list_for_each_entry(binding, &set->bindings, list) {
                        struct nft_ctx bind_ctx = {
                                .net        = ctx->net,
                                .family        = ctx->family,
                                .table        = ctx->table,
                                .chain        = (struct nft_chain *)binding->chain,
                        };

                        if (!(binding->flags & NFT_SET_MAP))
                                continue;

                        err = nft_validate_register_store(&bind_ctx, dreg,
                                                          &elem.data.val,
                                                          desc.type, desc.len);
                        if (err < 0)
                                goto err_parse_data;

                        /* A jump/goto verdict marks the table as needing
                         * validation.
                         */
                        if (desc.type == NFT_DATA_VERDICT &&
                            (elem.data.val.verdict.code == NFT_GOTO ||
                             elem.data.val.verdict.code == NFT_JUMP))
                                nft_validate_state_update(ctx->table,
                                                          NFT_VALIDATE_NEED);
                }

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
                if (err < 0)
                        goto err_parse_data;
        }

        /* The full maximum length of userdata can exceed the maximum
         * offset value (U8_MAX) for following extensions, therefor it
         * must be the last extension added.
         */
        /* NOTE(review): ulen is a u8, so an nla_len() above 255 would be
         * truncated by the assignment below - confirm that the attribute
         * policy keeps the userdata length within that range.
         */
        ulen = 0;
        if (nla[NFTA_SET_ELEM_USERDATA] != NULL) {
                ulen = nla_len(nla[NFTA_SET_ELEM_USERDATA]);
                if (ulen > 0) {
                        err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_USERDATA,
                                                     ulen);
                        if (err < 0)
                                goto err_parse_data;
                }
        }

        elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
                                      elem.key_end.val.data, elem.data.val.data,
                                      timeout, expiration, GFP_KERNEL_ACCOUNT);
        if (IS_ERR(elem.priv)) {
                err = PTR_ERR(elem.priv);
                goto err_parse_data;
        }

        /* Fill in the extensions that nft_set_elem_init() was not given. */
        ext = nft_set_elem_ext(set, elem.priv);
        if (flags)
                *nft_set_ext_flags(ext) = flags;

        if (obj)
                *nft_set_ext_obj(ext) = obj;

        if (ulen > 0) {
                if (nft_set_ext_check(&tmpl, NFT_SET_EXT_USERDATA, ulen) < 0) {
                        err = -EINVAL;
                        goto err_elem_free;
                }
                udata = nft_set_ext_userdata(ext);
                /* The stored length is the payload length minus one. */
                udata->len = ulen - 1;
                nla_memcpy(&udata->data, nla[NFTA_SET_ELEM_USERDATA], ulen);
        }
        err = nft_set_elem_expr_setup(ctx, &tmpl, ext, expr_array, num_exprs);
        if (err < 0)
                goto err_elem_free;

        trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
        if (trans == NULL) {
                err = -ENOMEM;
                goto err_elem_free;
        }

        ext->genmask = nft_genmask_cur(ctx->net);

        err = nft_setelem_insert(ctx->net, set, &elem, &elem_priv, flags);
        if (err) {
                if (err == -EEXIST) {
                        /* elem_priv now refers to the element already in the
                         * set. It is a clash if only one of the two elements
                         * has data/objref, or if both have it but the values
                         * differ.
                         */
                        ext2 = nft_set_elem_ext(set, elem_priv);
                        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
                            nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) ||
                            nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
                            nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF))
                                goto err_element_clash;
                        if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
                             nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
                             memcmp(nft_set_ext_data(ext),
                                    nft_set_ext_data(ext2), set->dlen) != 0) ||
                            (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) &&
                             nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF) &&
                             *nft_set_ext_obj(ext) != *nft_set_ext_obj(ext2)))
                                goto err_element_clash;
                        else if (!(nlmsg_flags & NLM_F_EXCL)) {
                                /* Identical element and no NLM_F_EXCL: not an
                                 * error. The cleanup labels are still used to
                                 * discard the new element; err stays 0.
                                 */
                                err = 0;
                                if (nft_set_ext_exists(ext2, NFT_SET_EXT_TIMEOUT)) {
                                        struct nft_elem_update update = { };

                                        if (timeout != nft_set_ext_timeout(ext2)->timeout) {
                                                update.timeout = timeout;
                                                if (expiration == 0)
                                                        expiration = timeout;

                                                update.flags |= NFT_TRANS_UPD_TIMEOUT;
                                        }
                                        if (expiration) {
                                                update.expiration = expiration;
                                                update.flags |= NFT_TRANS_UPD_EXPIRATION;
                                        }

                                        if (update.flags) {
                                                struct nft_trans_one_elem *ue;

                                                ue = &nft_trans_container_elem(trans)->elems[0];

                                                ue->update = kmemdup(&update, sizeof(update), GFP_KERNEL);
                                                if (!ue->update) {
                                                        err = -ENOMEM;
                                                        goto err_element_clash;
                                                }

                                                /* The transaction refers to
                                                 * the existing element and is
                                                 * queued, so skip kfree(trans)
                                                 * and only free the new
                                                 * element.
                                                 */
                                                ue->priv = elem_priv;
                                                nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
                                                goto err_elem_free;
                                        }
                                }
                        }
                } else if (err == -ENOTEMPTY) {
                        /* ENOTEMPTY reports overlapping between this element
                         * and an existing one.
                         */
                        err = -EEXIST;
                }
                goto err_element_clash;
        }

        /* Catch-all elements are not counted in set->nelems. */
        if (!(flags & NFT_SET_ELEM_CATCHALL)) {
                unsigned int max = nft_set_maxsize(set);

                if (!atomic_add_unless(&set->nelems, 1, max)) {
                        err = -ENFILE;
                        goto err_set_full;
                }
        }

        nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
        nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
        return 0;

        /* Unwind in reverse order of acquisition; each label falls through
         * into the ones below it.
         */
err_set_full:
        nft_setelem_remove(ctx->net, set, elem.priv);
err_element_clash:
        kfree(trans);
err_elem_free:
        nf_tables_set_elem_destroy(ctx, set, elem.priv);
err_parse_data:
        if (nla[NFTA_SET_ELEM_DATA] != NULL)
                nft_data_release(&elem.data.val, desc.type);
err_parse_key_end:
        if (obj)
                nft_use_dec_restore(&obj->use);

        nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
err_parse_key:
        nft_data_release(&elem.key.val, NFT_DATA_VALUE);
err_set_elem_expr:
        /* Entries already moved into the element were set to NULL by
         * nft_set_elem_expr_setup(), which ends this loop early.
         */
        for (i = 0; i < num_exprs && expr_array[i]; i++)
                nft_expr_destroy(ctx, expr_array[i]);
err_set_elem_expr_clone:
        return err;
}

/* Netlink handler that adds a list of elements to a set.
 *
 * Looks up the table and the set named by the request, refuses to touch
 * constant or anonymous sets that already have bindings (-EBUSY), then feeds
 * every nested element attribute to nft_add_set_elem(). The first failing
 * attribute is reported through the extended ack and its error returned.
 * If the table's validation state is NFT_VALIDATE_DO afterwards, the result
 * of nft_table_validate() is returned; otherwise 0.
 */
static int nf_tables_newsetelem(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *cur;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_ctx ctx;
        int remaining, ret;

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
                return PTR_ERR(table);
        }

        set = nft_set_lookup_global(net, table, nla[NFTA_SET_ELEM_LIST_SET],
                                    nla[NFTA_SET_ELEM_LIST_SET_ID], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
                return PTR_ERR(set);
        }

        /* Bound constant/anonymous sets cannot take new elements. */
        if ((set->flags & (NFT_SET_CONSTANT | NFT_SET_ANONYMOUS)) &&
            !list_empty(&set->bindings))
                return -EBUSY;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        nla_for_each_nested(cur, nla[NFTA_SET_ELEM_LIST_ELEMENTS], remaining) {
                ret = nft_add_set_elem(&ctx, set, cur, info->nlh->nlmsg_flags);
                if (ret >= 0)
                        continue;

                NL_SET_BAD_ATTR(extack, cur);
                return ret;
        }

        if (table->validate_state != NFT_VALIDATE_DO)
                return 0;

        return nft_table_validate(net, table);
}

/**
 *        nft_data_hold - hold a nft_data item
 *
 *        @data: struct nft_data to hold
 *        @type: type of data
 *
 *        Take a reference on whatever a nft_data item points to. Nothing is
 *        done for NFT_DATA_VALUE; for NFT_DATA_VERDICT with a NFT_JUMP or
 *        NFT_GOTO code, the use counter of the target chain is bumped. This
 *        function must be called on active data objects from the second phase
 *        of the commit protocol.
 */
void nft_data_hold(const struct nft_data *data, enum nft_data_types type)
{
        if (type != NFT_DATA_VERDICT)
                return;

        switch (data->verdict.code) {
        case NFT_JUMP:
        case NFT_GOTO:
                nft_use_inc_restore(&data->verdict.chain->use);
                break;
        }
}

/* Report whether @elem_priv is active with respect to the next generation
 * mask of @net (non-zero if so).
 */
static int nft_setelem_active_next(const struct net *net,
                                   const struct nft_set *set,
                                   struct nft_elem_priv *elem_priv)
{
        return nft_set_elem_active(nft_set_elem_ext(set, elem_priv),
                                   nft_genmask_next(net));
}

static void nft_setelem_data_activate(const struct net *net,
                                      const struct nft_set *set,
                                      struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_hold(nft_set_ext_data(ext), set->dtype);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use);
}

void nft_setelem_data_deactivate(const struct net *net,
                                 const struct nft_set *set,
                                 struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);

        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
                nft_data_release(nft_set_ext_data(ext), set->dtype);
        if (nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF))
                nft_use_dec(&(*nft_set_ext_obj(ext))->use);
}

/* similar to nft_trans_elems_remove, but called from abort path to undo newsetelem.
 * No notifications and no ndeact changes.
 *
 * Returns true if set had been added to (i.e., elements need to be removed again).
 */
static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx,
                                      struct nft_trans_elem *te)
{
        bool removed = false;
        int i;

        for (i = 0; i < te->nelems; i++) {
                if (te->elems[i].update) {
                        kfree(te->elems[i].update);
                        te->elems[i].update = NULL;
                        /* Update request, so do not release this element */
                        te->elems[i].priv = NULL;
                        continue;
                }

                if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv))
                        nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);

                if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
                        atomic_dec(&te->set->nelems);

                removed = true;
        }

        return removed;
}

/* Abort-path handler that undoes DELSETELEM/DESTROYSETELEM.
 *
 * Every element that is not active in the next generation gets its data
 * references re-taken and is activated again. For each non-catch-all element
 * the set's count of deactivated elements is decremented.
 */
static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
                                          const struct nft_trans_elem *te)
{
        int i;

        for (i = 0; i < te->nelems; i++) {
                struct nft_elem_priv *priv = te->elems[i].priv;

                if (!nft_setelem_active_next(ctx->net, te->set, priv)) {
                        nft_setelem_data_activate(ctx->net, te->set, priv);
                        nft_setelem_activate(ctx->net, te->set, priv);
                }

                if (nft_setelem_is_catchall(te->set, priv))
                        continue;

                te->set->ndeact--;
        }
}

/* Queue the deletion of one set element described by the nested attribute
 * @attr.
 *
 * The attribute is parsed into a temporary element (key, optional key end,
 * optional flags), which is used to deactivate the matching element in @set.
 * On success a NFT_MSG_DELSETELEM transaction referencing elem.priv is added
 * to the commit list and 0 is returned; otherwise a negative errno is
 * returned and everything allocated here is released again.
 */
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
                           const struct nlattr *attr)
{
        struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
        struct nft_set_ext_tmpl tmpl;
        struct nft_set_elem elem;
        struct nft_set_ext *ext;
        struct nft_trans *trans;
        u32 flags = 0;
        int err;

        err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
                                          nft_set_elem_policy, NULL);
        if (err < 0)
                return err;

        err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
        if (err < 0)
                return err;

        /* A key is mandatory unless the catchall element is addressed. */
        if (!nla[NFTA_SET_ELEM_KEY] && !(flags & NFT_SET_ELEM_CATCHALL))
                return -EINVAL;

        if (!nft_setelem_valid_key_end(set, nla, flags))
                return -EINVAL;

        /* Build the extension template for the temporary element. */
        nft_set_ext_prepare(&tmpl);

        if (flags != 0) {
                err = nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
                if (err < 0)
                        return err;
        }

        if (nla[NFTA_SET_ELEM_KEY]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key.val,
                                            nla[NFTA_SET_ELEM_KEY]);
                if (err < 0)
                        return err;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
                if (err < 0)
                        goto fail_elem;
        }

        if (nla[NFTA_SET_ELEM_KEY_END]) {
                err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
                                            nla[NFTA_SET_ELEM_KEY_END]);
                if (err < 0)
                        goto fail_elem;

                err = nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
                if (err < 0)
                        goto fail_elem_key_end;
        }

        /* Default error for the allocations below; it is still in place when
         * nft_trans_elem_alloc() fails further down.
         */
        err = -ENOMEM;
        elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
                                      elem.key_end.val.data, NULL, 0, 0,
                                      GFP_KERNEL_ACCOUNT);
        if (IS_ERR(elem.priv)) {
                err = PTR_ERR(elem.priv);
                goto fail_elem_key_end;
        }

        ext = nft_set_elem_ext(set, elem.priv);
        if (flags)
                *nft_set_ext_flags(ext) = flags;

        trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
        if (trans == NULL)
                goto fail_trans;

        /* NOTE(review): elem is passed by pointer, so the helper may replace
         * elem.priv with the element actually stored in the set - confirm
         * against nft_setelem_deactivate().
         */
        err = nft_setelem_deactivate(ctx->net, set, &elem, flags);
        if (err < 0)
                goto fail_ops;

        nft_setelem_data_deactivate(ctx->net, set, elem.priv);

        nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
        nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
        return 0;

        /* Unwind in reverse order of acquisition. */
fail_ops:
        kfree(trans);
fail_trans:
        kfree(elem.priv);
fail_elem_key_end:
        nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
fail_elem:
        nft_data_release(&elem.key.val, NFT_DATA_VALUE);
        return err;
}

static int nft_setelem_flush(const struct nft_ctx *ctx,
                             struct nft_set *set,
                             const struct nft_set_iter *iter,
                             struct nft_elem_priv *elem_priv)
{
        const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv);
        struct nft_trans *trans;

        if (!nft_set_elem_active(ext, iter->genmask))
                return 0;

        trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
                                    struct_size_t(struct nft_trans_elem, elems, 1),
                                    GFP_ATOMIC);
        if (!trans)
                return -ENOMEM;

        set->ops->flush(ctx->net, set, elem_priv);
        set->ndeact++;

        nft_setelem_data_deactivate(ctx->net, set, elem_priv);
        nft_trans_elem_set(trans) = set;
        nft_trans_container_elem(trans)->nelems = 1;
        nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
        nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC);

        return 0;
}

static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
                                    struct nft_set *set,
                                    struct nft_elem_priv *elem_priv)
{
        struct nft_trans *trans;

        trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
        if (!trans)
                return -ENOMEM;

        nft_setelem_data_deactivate(ctx->net, set, elem_priv);
        nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
        nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);

        return 0;
}

/* Flush every catchall element of @set that is active in the next
 * generation.
 *
 * For each such element a delete transaction is queued and the element's
 * active state is toggled. Stops at the first failure and returns its
 * negative errno; returns 0 otherwise.
 */
static int nft_set_catchall_flush(const struct nft_ctx *ctx,
                                  struct nft_set *set)
{
        const u8 genmask = nft_genmask_next(ctx->net);
        struct nft_set_elem_catchall *catchall;
        int ret = 0;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list,
                                lockdep_commit_lock_is_held(ctx->net)) {
                struct nft_set_ext *ext = nft_set_elem_ext(set, catchall->elem);

                if (nft_set_elem_active(ext, genmask)) {
                        ret = __nft_set_catchall_flush(ctx, set, catchall->elem);
                        if (ret < 0)
                                return ret;

                        nft_set_elem_change_active(ctx->net, set, ext);
                }
        }

        return ret;
}

/* Flush all elements of @set.
 *
 * Regular elements are handled by walking the set with nft_setelem_flush()
 * as callback; if the walk reports no error, catchall elements are flushed
 * afterwards. Returns 0 on success or the first error encountered.
 */
static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
{
        struct nft_set_iter iter = {
                .genmask = genmask,
                .type = NFT_ITER_UPDATE,
                .fn = nft_setelem_flush,
        };

        set->ops->walk(ctx, set, &iter);
        if (iter.err)
                return iter.err;

        return nft_set_catchall_flush(ctx, set);
}

static int nf_tables_delsetelem(struct sk_buff *skb,
                                const struct nfnl_info *info,
                                const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_set *set;
        struct nft_ctx ctx;
        int rem, err = 0;

        table = nft_table_lookup(net, nla[NFTA_SET_ELEM_LIST_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_TABLE]);
                return PTR_ERR(table);
        }

        set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask);
        if (IS_ERR(set)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]);
                return PTR_ERR(set);
        }

        if (nft_set_is_anonymous(set))
                return -EOPNOTSUPP;

        if (!list_empty(&set->bindings) && (set->flags & NFT_SET_CONSTANT))
                return -EBUSY;

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS])
                return nft_set_flush(&ctx, set, genmask);

        nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
                err = nft_del_setelem(&ctx, set, attr);
                if (err == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
                        continue;

                if (err < 0) {
                        NL_SET_BAD_ATTR(extack, attr);
                        return err;
                }
        }

        return 0;
}

/*
 * Stateful objects
 */

/**
 *        nft_register_obj - register nf_tables stateful object type
 *        @obj_type: object type
 *
 *        Registers the object type for use with nf_tables by adding it to the
 *        nf_tables_objects list under the nfnetlink subsystem lock.
 *
 *        Return: zero on success, or -EINVAL if @obj_type has the type
 *        NFT_OBJECT_UNSPEC.
 */
int nft_register_obj(struct nft_object_type *obj_type)
{
        if (obj_type->type == NFT_OBJECT_UNSPEC)
                return -EINVAL;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_add_rcu(&obj_type->list, &nf_tables_objects);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
        return 0;
}
EXPORT_SYMBOL_GPL(nft_register_obj);

/**
 *        nft_unregister_obj - unregister nf_tables object type
 *        @obj_type: object type
 *
 *        Unregisters the object type for use with nf_tables by unlinking it
 *        from its list (RCU variant) under the nfnetlink subsystem lock.
 */
void nft_unregister_obj(struct nft_object_type *obj_type)
{
        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_del_rcu(&obj_type->list);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_obj);

/* Look up a stateful object by name within @table.
 *
 * The name is taken from @nla; among all objects hashed under
 * (table, name), the one whose type equals @objtype and which is active for
 * @genmask is returned. Returns ERR_PTR(-ENOENT) if there is no match.
 *
 * Warns once if neither the RCU read lock nor the commit lock is held by
 * the caller.
 */
struct nft_object *nft_obj_lookup(const struct net *net,
                                  const struct nft_table *table,
                                  const struct nlattr *nla, u32 objtype,
                                  u8 genmask)
{
        struct nft_object *found = ERR_PTR(-ENOENT);
        struct nft_object_hash_key k = { .table = table };
        char search[NFT_OBJ_MAXNAMELEN];
        struct rhlist_head *tmp, *list;
        struct nft_object *obj;

        nla_strscpy(search, nla, sizeof(search));
        k.name = search;

        WARN_ON_ONCE(!rcu_read_lock_held() &&
                     !lockdep_commit_lock_is_held(net));

        rcu_read_lock();
        list = rhltable_lookup(&nft_objname_ht, &k, nft_objname_ht_params);
        if (list) {
                rhl_for_each_entry_rcu(obj, tmp, list, rhlhead) {
                        if (obj->ops->type->type != objtype)
                                continue;
                        if (!nft_active_genmask(obj, genmask))
                                continue;

                        found = obj;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(nft_obj_lookup);

/* Look up a stateful object in @table by its handle.
 *
 * @nla carries the handle as a big-endian 64-bit value. The first object
 * with that handle, of type @objtype and active for @genmask is returned;
 * ERR_PTR(-ENOENT) if none matches.
 */
static struct nft_object *nft_obj_lookup_byhandle(const struct nft_table *table,
                                                  const struct nlattr *nla,
                                                  u32 objtype, u8 genmask)
{
        const u64 handle = be64_to_cpu(nla_get_be64(nla));
        struct nft_object *obj;

        list_for_each_entry(obj, &table->objects, list) {
                if (obj->handle != handle)
                        continue;
                if (obj->ops->type->type != objtype)
                        continue;
                if (!nft_active_genmask(obj, genmask))
                        continue;

                return obj;
        }

        return ERR_PTR(-ENOENT);
}

/* Netlink attribute policy for stateful object messages (NFTA_OBJ_*).
 * Table and object names are strings limited to their respective maximum
 * name length minus one; user data is a binary blob of at most
 * NFT_USERDATA_MAXLEN bytes.
 */
static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = {
        [NFTA_OBJ_TABLE]        = { .type = NLA_STRING,
                                    .len = NFT_TABLE_MAXNAMELEN - 1 },
        [NFTA_OBJ_NAME]                = { .type = NLA_STRING,
                                    .len = NFT_OBJ_MAXNAMELEN - 1 },
        [NFTA_OBJ_TYPE]                = { .type = NLA_U32 },
        [NFTA_OBJ_DATA]                = { .type = NLA_NESTED },
        [NFTA_OBJ_HANDLE]        = { .type = NLA_U64},
        [NFTA_OBJ_USERDATA]        = { .type = NLA_BINARY,
                                    .len = NFT_USERDATA_MAXLEN },
};

/* Allocate and initialize a stateful object of the given @type.
 *
 * @attr, if present, is the nested attribute with the type-specific
 * configuration; it is parsed against type->policy. Without @attr the
 * attribute table handed to the type is all NULL. The ops are either chosen
 * by type->select_ops() or taken from type->ops.
 *
 * Returns the new object (with obj->ops set) or an ERR_PTR() on failure.
 */
static struct nft_object *nft_obj_init(const struct nft_ctx *ctx,
                                       const struct nft_object_type *type,
                                       const struct nlattr *attr)
{
        const struct nft_object_ops *ops;
        struct nft_object *obj;
        struct nlattr **tb;
        int err;

        tb = kmalloc_array(type->maxattr + 1, sizeof(*tb), GFP_KERNEL);
        if (!tb)
                return ERR_PTR(-ENOMEM);

        if (!attr) {
                memset(tb, 0, sizeof(tb[0]) * (type->maxattr + 1));
        } else {
                err = nla_parse_nested_deprecated(tb, type->maxattr, attr,
                                                  type->policy, NULL);
                if (err < 0)
                        goto free_tb;
        }

        if (!type->select_ops) {
                ops = type->ops;
        } else {
                ops = type->select_ops(ctx, (const struct nlattr * const *)tb);
                if (IS_ERR(ops)) {
                        err = PTR_ERR(ops);
                        goto free_tb;
                }
        }

        /* The type-private area of ops->size bytes trails the object. */
        obj = kzalloc(sizeof(*obj) + ops->size, GFP_KERNEL_ACCOUNT);
        if (!obj) {
                err = -ENOMEM;
                goto free_tb;
        }

        err = ops->init(ctx, (const struct nlattr * const *)tb, obj);
        if (err < 0)
                goto free_obj;

        obj->ops = ops;

        kfree(tb);
        return obj;

free_obj:
        kfree(obj);
free_tb:
        kfree(tb);
        return ERR_PTR(err);
}

/* Dump the type-specific state of @obj into @skb, wrapped in a nested
 * attribute of type @attr. @reset is passed through to the object's dump
 * callback. Returns 0 on success and -1 if the nest could not be started or
 * the callback failed.
 */
static int nft_object_dump(struct sk_buff *skb, unsigned int attr,
                           struct nft_object *obj, bool reset)
{
        struct nlattr *nest = nla_nest_start_noflag(skb, attr);

        if (!nest || obj->ops->dump(skb, obj, reset) < 0)
                return -1;

        nla_nest_end(skb, nest);
        return 0;
}

/* Find a registered object type by numeric type and family.
 *
 * A type registered with NFPROTO_UNSPEC matches any family. Walks
 * nf_tables_objects with the RCU list iterator. Returns NULL if no type
 * matches.
 */
static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
{
        const struct nft_object_type *type;

        list_for_each_entry_rcu(type, &nf_tables_objects, list) {
                bool family_ok = type->family == NFPROTO_UNSPEC ||
                                 type->family == family;

                if (family_ok && type->type == objtype)
                        return type;
        }

        return NULL;
}

/* Get an object type and take a reference on its owning module.
 *
 * Returns the type on success. If the type is not registered and modules
 * are enabled, a module load for "nft-obj-<objtype>" is requested;
 * ERR_PTR(-EAGAIN) is returned when that request yields -EAGAIN. In every
 * other failure case ERR_PTR(-ENOENT) is returned.
 */
static const struct nft_object_type *
nft_obj_type_get(struct net *net, u32 objtype, u8 family)
{
        const struct nft_object_type *type;
        bool pinned;

        rcu_read_lock();
        type = __nft_obj_type_get(objtype, family);
        pinned = type != NULL && try_module_get(type->owner);
        rcu_read_unlock();

        if (pinned)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (type == NULL &&
            nft_request_module(net, "nft-obj-%u", objtype) == -EAGAIN)
                return ERR_PTR(-EAGAIN);
#endif
        return ERR_PTR(-ENOENT);
}

static int nf_tables_updobj(const struct nft_ctx *ctx,
                            const struct nft_object_type *type,
                            const struct nlattr *attr,
                            struct nft_object *obj)
{
        struct nft_object *newobj;
        struct nft_trans *trans;
        int err = -ENOMEM;

        /* caller must have obtained type->owner reference. */
        trans = nft_trans_alloc(ctx, NFT_MSG_NEWOBJ,
                                sizeof(struct nft_trans_obj));
        if (!trans)
                goto err_trans;

        newobj = nft_obj_init(ctx, type, attr);
        if (IS_ERR(newobj)) {
                err = PTR_ERR(newobj);
                goto err_free_trans;
        }

        nft_trans_obj(trans) = obj;
        nft_trans_obj_update(trans) = true;
        nft_trans_obj_newobj(trans) = newobj;
        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_free_trans:
        kfree(trans);
err_trans:
        module_put(type->owner);
        return err;
}

/* Netlink handler for NFT_MSG_NEWOBJ: create a stateful object or queue an
 * update of an existing one.
 *
 * NFTA_OBJ_TYPE, NFTA_OBJ_NAME and NFTA_OBJ_DATA are mandatory (-EINVAL).
 *
 * If an object of that name and type already exists in the table:
 *   - NLM_F_EXCL yields -EEXIST, NLM_F_REPLACE yields -EOPNOTSUPP;
 *   - objects whose ops lack ->update are left alone (returns 0);
 *   - otherwise an update transaction is queued via nf_tables_updobj().
 *
 * If it does not exist, a new object is allocated, named, optionally given
 * user data, added to the transaction log, inserted into the object name
 * hash table and linked into table->objects.
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_object_type *type;
        struct net *net = info->net;
        struct nft_table *table;
        struct nft_object *obj;
        struct nft_ctx ctx;
        u32 objtype;
        int err;

        if (!nla[NFTA_OBJ_TYPE] ||
            !nla[NFTA_OBJ_NAME] ||
            !nla[NFTA_OBJ_DATA])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
                return PTR_ERR(table);
        }

        objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
        obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
        if (IS_ERR(obj)) {
                /* -ENOENT means "create"; anything else is a real error. */
                err = PTR_ERR(obj);
                if (err != -ENOENT) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
                        return err;
                }
        } else {
                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
                        return -EEXIST;
                }
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        return -EOPNOTSUPP;

                if (!obj->ops->update)
                        return 0;

                type = nft_obj_type_get(net, objtype, family);
                if (WARN_ON_ONCE(IS_ERR(type)))
                        return PTR_ERR(type);

                nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

                /* type->owner reference is put when transaction object is released. */
                return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (!nft_use_inc(&table->use))
                return -EMFILE;

        type = nft_obj_type_get(net, objtype, family);
        if (IS_ERR(type)) {
                err = PTR_ERR(type);
                goto err_type;
        }

        obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]);
        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                goto err_init;
        }
        obj->key.table = table;
        obj->handle = nf_tables_alloc_handle(table);

        obj->key.name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL_ACCOUNT);
        if (!obj->key.name) {
                err = -ENOMEM;
                goto err_strdup;
        }

        if (nla[NFTA_OBJ_USERDATA]) {
                obj->udata = nla_memdup(nla[NFTA_OBJ_USERDATA], GFP_KERNEL_ACCOUNT);
                if (obj->udata == NULL) {
                        /* err still holds -ENOENT from the lookup above;
                         * report the allocation failure instead.
                         */
                        err = -ENOMEM;
                        goto err_userdata;
                }

                obj->udlen = nla_len(nla[NFTA_OBJ_USERDATA]);
        }

        err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj);
        if (err < 0)
                goto err_trans;

        err = rhltable_insert(&nft_objname_ht, &obj->rhlhead,
                              nft_objname_ht_params);
        if (err < 0)
                goto err_obj_ht;

        list_add_tail_rcu(&obj->list, &table->objects);

        return 0;
err_obj_ht:
        /* queued in transaction log */
        INIT_LIST_HEAD(&obj->list);
        return err;
err_trans:
        kfree(obj->udata);
err_userdata:
        kfree(obj->key.name);
err_strdup:
        if (obj->ops->destroy)
                obj->ops->destroy(&ctx, obj);
        kfree(obj);
err_init:
        module_put(type->owner);
err_type:
        nft_use_dec_restore(&table->use);

        return err;
}

/* Fill @skb with a netlink message describing @obj.
 *
 * @event is the plain nf_tables message type (NFT_MSG_*); the subsystem id
 * is only folded in for the netlink header, so @event can still be compared
 * against NFT_MSG_* below.
 *
 * Every message carries table name, object name, handle and object type.
 * Lookups match on the type in addition to the name, so the type is needed
 * to identify the object. NFT_MSG_DELOBJ messages stop there; all other
 * events additionally carry the use counter, the type-specific data (reset
 * if @reset is true) and, if present, the user data.
 *
 * Returns 0 on success. On failure the message is trimmed from @skb again
 * and -1 is returned.
 */
static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
                                   u32 portid, u32 seq, int event, u32 flags,
                                   int family, const struct nft_table *table,
                                   struct nft_object *obj, bool reset)
{
        struct nlmsghdr *nlh;

        /* Do not overwrite @event with the combined subsystem/message type:
         * the NFT_MSG_DELOBJ comparison below needs the original value.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
            nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
            nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
                         NFTA_OBJ_PAD) ||
            nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)))
                goto nla_put_failure;

        /* Delete notifications only need to identify the object. */
        if (event == NFT_MSG_DELOBJ) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
            nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
                goto nla_put_failure;

        if (obj->udata &&
            nla_put(skb, NFTA_OBJ_USERDATA, obj->udlen, obj->udata))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/* Emit an AUDIT_NFT_OP_OBJ_RESET audit record for @table covering @nentries
 * objects. The record name has the form "<table name>:<base_seq>"; the
 * temporary string is allocated with GFP_ATOMIC and freed afterwards.
 */
static void audit_log_obj_reset(const struct nft_table *table,
                                unsigned int base_seq, unsigned int nentries)
{
        char *buf = kasprintf(GFP_ATOMIC, "%s:%u", table->name, base_seq);

        audit_log_nfcfg(buf, table->family, nentries,
                        AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
        kfree(buf);
}

/* Per-dump state for object dumps, stored in the netlink callback's ctx
 * area (cb->ctx).
 */
struct nft_obj_dump_ctx {
        /* index of the first object still to be dumped (resume position) */
        unsigned int        s_idx;
        /* optional table name filter, allocated copy; NULL means no filter */
        char                *table;
        /* optional object type filter; NFT_OBJECT_UNSPEC means no filter */
        u32                type;
        /* true if objects are to be reset while being dumped */
        bool                reset;
};

/* Netlink dump callback for stateful objects.
 *
 * Walks all tables of the requesting family (or every family for
 * NFPROTO_UNSPEC) and their objects under the RCU read lock, emitting one
 * NFT_MSG_NEWOBJ message per object that is active and passes the optional
 * table/type filters in the dump context. ctx->s_idx records how far the
 * walk got so a subsequent call can resume from there.
 *
 * Returns skb->len.
 */
static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nftables_pernet *nft_net;
        const struct nft_table *table;
        unsigned int entries = 0;
        struct nft_object *obj;
        unsigned int idx = 0;
        int rc = 0;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                /* entries counts the objects dumped for this table only. */
                entries = 0;
                list_for_each_entry_rcu(obj, &table->objects, list) {
                        /* idx advances for every object visited, including
                         * skipped ones, so it is a stable resume position.
                         */
                        if (!nft_is_active(net, obj))
                                goto cont;
                        if (idx < ctx->s_idx)
                                goto cont;
                        if (ctx->table && strcmp(ctx->table, table->name))
                                goto cont;
                        if (ctx->type != NFT_OBJECT_UNSPEC &&
                            obj->ops->type->type != ctx->type)
                                goto cont;

                        rc = nf_tables_fill_obj_info(skb, net,
                                                     NETLINK_CB(cb->skb).portid,
                                                     cb->nlh->nlmsg_seq,
                                                     NFT_MSG_NEWOBJ,
                                                     NLM_F_MULTI | NLM_F_APPEND,
                                                     table->family, table,
                                                     obj, ctx->reset);
                        /* Stop without advancing idx: this object is retried
                         * on the next call.
                         */
                        if (rc < 0)
                                break;

                        entries++;
                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                        idx++;
                }
                /* One audit record per table for the objects reset so far. */
                if (ctx->reset && entries)
                        audit_log_obj_reset(table, nft_net->base_seq, entries);
                if (rc < 0)
                        break;
        }
        rcu_read_unlock();

        ctx->s_idx = idx;
        return skb->len;
}

/* Dump callback for the resetting variant of the object dump: runs
 * nf_tables_dump_obj() with the per-netns commit mutex held and returns its
 * result.
 */
static int nf_tables_dumpreset_obj(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        struct nftables_pernet *nft_net = nft_pernet(sock_net(skb->sk));
        int ret;

        mutex_lock(&nft_net->commit_mutex);
        ret = nf_tables_dump_obj(skb, cb);
        mutex_unlock(&nft_net->commit_mutex);

        return ret;
}

static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
        struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;
        const struct nlattr * const *nla = cb->data;

        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

        if (nla[NFTA_OBJ_TABLE]) {
                ctx->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
                if (!ctx->table)
                        return -ENOMEM;
        }

        if (nla[NFTA_OBJ_TYPE])
                ctx->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));

        return 0;
}

/* Dump start callback for the resetting variant: marks the dump context as
 * a reset dump, then performs the regular dump start setup.
 */
static int nf_tables_dumpreset_obj_start(struct netlink_callback *cb)
{
        struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;

        ctx->reset = true;

        return nf_tables_dump_obj_start(cb);
}

/* Dump done callback: release the table name filter stored in the dump
 * context (kfree() accepts NULL if no filter was set). Always returns 0.
 */
static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
        struct nft_obj_dump_ctx *ctx = (void *)cb->ctx;

        kfree(ctx->table);

        return 0;
}

/* Build a reply message describing a single stateful object.
 *
 * The object is identified by NFTA_OBJ_TABLE, NFTA_OBJ_NAME and
 * NFTA_OBJ_TYPE and looked up in the current generation. If @reset is true
 * the object's dump callback is asked to reset its state.
 *
 * Returns a freshly allocated skb holding a NFT_MSG_NEWOBJ message, or an
 * ERR_PTR() on failure.
 *
 * Caller must hold rcu read lock or transaction mutex
 */
static struct sk_buff *
nf_tables_getobj_single(u32 portid, const struct nfnl_info *info,
                        const struct nlattr * const nla[], bool reset)
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nft_table *table;
        struct net *net = info->net;
        struct sk_buff *reply;
        struct nft_object *obj;
        u32 objtype;
        int err;

        if (!nla[NFTA_OBJ_TYPE] || !nla[NFTA_OBJ_NAME])
                return ERR_PTR(-EINVAL);

        table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
                return ERR_CAST(table);
        }

        objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
        obj = nft_obj_lookup(net, table, nla[NFTA_OBJ_NAME], objtype, genmask);
        if (IS_ERR(obj)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_NAME]);
                return ERR_CAST(obj);
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (reply == NULL)
                return ERR_PTR(-ENOMEM);

        err = nf_tables_fill_obj_info(reply, net, portid,
                                      info->nlh->nlmsg_seq, NFT_MSG_NEWOBJ, 0,
                                      family, table, obj, reset);
        if (err >= 0)
                return reply;

        kfree_skb(reply);
        return ERR_PTR(err);
}

static int nf_tables_getobj(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        u32 portid = NETLINK_CB(skb).portid;
        struct sk_buff *skb2;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_obj_start,
                        .dump = nf_tables_dump_obj,
                        .done = nf_tables_dump_obj_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        skb2 = nf_tables_getobj_single(portid, info, nla, false);
        if (IS_ERR(skb2))
                return PTR_ERR(skb2);

        return nfnetlink_unicast(skb2, info->net, portid);
}

/* Netlink handler for object get-and-reset requests.
 *
 * With NLM_F_DUMP set a resetting dump is started. Otherwise the single
 * requested object is dumped with reset enabled while holding the commit
 * mutex, an AUDIT_NFT_OP_OBJ_RESET audit record is written, and the reply
 * is unicast to the sender. Returns 0 on success or a negative errno.
 */
static int nf_tables_getobj_reset(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct nftables_pernet *nft_net = nft_pernet(info->net);
        u32 portid = NETLINK_CB(skb).portid;
        struct net *net = info->net;
        struct sk_buff *skb2;
        char *buf;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dumpreset_obj_start,
                        .dump = nf_tables_dumpreset_obj,
                        .done = nf_tables_dump_obj_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        /* The RCU read lock is dropped while the commit mutex is taken and
         * re-acquired afterwards, so this handler expects to be entered with
         * it held. A module reference is held across that window -
         * presumably to keep the module from going away while outside the
         * RCU read side; confirm against the nfnetlink callback contract.
         */
        if (!try_module_get(THIS_MODULE))
                return -EINVAL;
        rcu_read_unlock();
        mutex_lock(&nft_net->commit_mutex);
        skb2 = nf_tables_getobj_single(portid, info, nla, true);
        mutex_unlock(&nft_net->commit_mutex);
        rcu_read_lock();
        module_put(THIS_MODULE);

        if (IS_ERR(skb2))
                return PTR_ERR(skb2);

        /* Audit record name: "<table name>:<base_seq>". The table name is
         * printed straight from the attribute payload with an explicit
         * length.
         */
        buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
                        nla_len(nla[NFTA_OBJ_TABLE]),
                        (char *)nla_data(nla[NFTA_OBJ_TABLE]),
                        nft_net->base_seq);
        audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
                        AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
        kfree(buf);

        return nfnetlink_unicast(skb2, net, portid);
}

/* Release a stateful object: invoke the type's destroy callback if there
 * is one, drop the reference on the module owning the object type, then
 * free the name, the user data and the object itself.
 */
static void nft_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj)
{
        if (obj->ops->destroy)
                obj->ops->destroy(ctx, obj);

        /* obj->ops is still dereferenced here, so obj is freed last. */
        module_put(obj->ops->type->owner);
        kfree(obj->key.name);
        kfree(obj->udata);
        kfree(obj);
}

/* Netlink handler for object deletion requests.
 *
 * NFTA_OBJ_TYPE plus either NFTA_OBJ_NAME or NFTA_OBJ_HANDLE are required
 * (-EINVAL); the handle takes precedence when both are given. For
 * NFT_MSG_DESTROYOBJ a missing object is not an error. Objects that are
 * still in use cannot be deleted (-EBUSY).
 *
 * Returns 0 on success or a negative errno.
 */
static int nf_tables_delobj(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        const struct nlattr *id_attr;
        struct nft_table *table;
        struct nft_object *obj;
        struct nft_ctx ctx;
        u32 objtype;

        if (!nla[NFTA_OBJ_TYPE])
                return -EINVAL;
        if (!nla[NFTA_OBJ_NAME] && !nla[NFTA_OBJ_HANDLE])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_OBJ_TABLE], family, genmask,
                                 NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_OBJ_TABLE]);
                return PTR_ERR(table);
        }

        objtype = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));

        id_attr = nla[NFTA_OBJ_HANDLE];
        if (id_attr) {
                obj = nft_obj_lookup_byhandle(table, id_attr, objtype, genmask);
        } else {
                id_attr = nla[NFTA_OBJ_NAME];
                obj = nft_obj_lookup(net, table, id_attr, objtype, genmask);
        }

        if (IS_ERR(obj)) {
                /* DESTROY tolerates objects that do not exist. */
                if (PTR_ERR(obj) == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYOBJ)
                        return 0;

                NL_SET_BAD_ATTR(extack, id_attr);
                return PTR_ERR(obj);
        }

        if (obj->use > 0) {
                NL_SET_BAD_ATTR(extack, id_attr);
                return -EBUSY;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        return nft_delobj(&ctx, obj);
}

/* Build an object notification for @event and queue it on the per-netns
 * notify list.
 *
 * Nothing is done if no report was requested and nobody listens on the
 * NFNLGRP_NFTABLES group. If the message cannot be allocated or filled,
 * -ENOBUFS is signalled on that group instead.
 */
static void
__nft_obj_notify(struct net *net, const struct nft_table *table,
                 struct nft_object *obj, u32 portid, u32 seq, int event,
                 u16 flags, int family, int report, gfp_t gfp)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct sk_buff *skb;
        int err;

        if (!report &&
            !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
                return;

        skb = nlmsg_new(NLMSG_GOODSIZE, gfp);
        if (skb != NULL) {
                err = nf_tables_fill_obj_info(skb, net, portid, seq, event,
                                              flags & (NLM_F_CREATE | NLM_F_EXCL),
                                              family, table, obj, false);
                if (err >= 0) {
                        nft_notify_enqueue(skb, report, &nft_net->notify_list);
                        return;
                }

                kfree_skb(skb);
        }

        nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * nft_obj_notify - audit and announce a stateful object event
 *
 * Emits an audit record named "<table name>:<base_seq>" for the object
 * handle (register for NFT_MSG_NEWOBJ, unregister for any other event),
 * then hands over to __nft_obj_notify() for the netlink notification.
 *
 * The audit name comes from kasprintf() and is passed on without a NULL
 * check; kfree() of a NULL result is harmless.
 */
void nft_obj_notify(struct net *net, const struct nft_table *table,
                    struct nft_object *obj, u32 portid, u32 seq, int event,
                    u16 flags, int family, int report, gfp_t gfp)
{
        struct nftables_pernet *pernet = nft_pernet(net);
        char *audit_name;
        int audit_op;

        audit_name = kasprintf(gfp, "%s:%u", table->name, pernet->base_seq);

        if (event == NFT_MSG_NEWOBJ)
                audit_op = AUDIT_NFT_OP_OBJ_REGISTER;
        else
                audit_op = AUDIT_NFT_OP_OBJ_UNREGISTER;

        audit_log_nfcfg(audit_name, family, obj->handle, audit_op, gfp);
        kfree(audit_name);

        __nft_obj_notify(net, table, obj, portid, seq, event,
                         flags, family, report, gfp);
}
EXPORT_SYMBOL_GPL(nft_obj_notify);

/*
 * Context-based wrapper around __nft_obj_notify(): every addressing
 * parameter is taken from @ctx and the allocation uses GFP_KERNEL.
 * Unlike nft_obj_notify(), no audit record is written here.
 */
static void nf_tables_obj_notify(const struct nft_ctx *ctx,
                                 struct nft_object *obj, int event)
{
        const struct nft_table *table = ctx->table;

        __nft_obj_notify(ctx->net, table, obj, ctx->portid, ctx->seq, event,
                         ctx->flags, ctx->family, ctx->report, GFP_KERNEL);
}

/*
 * Flow tables
 */
/*
 * nft_register_flowtable_type - publish a flowtable type
 * @type: descriptor to append to the nf_tables_flowtables list
 *
 * The list is modified under the nftables nfnetlink subsystem lock and
 * with the RCU list primitive.
 */
void nft_register_flowtable_type(struct nf_flowtable_type *type)
{
        struct list_head *entry = &type->list;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_add_tail_rcu(entry, &nf_tables_flowtables);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_register_flowtable_type);

/*
 * nft_unregister_flowtable_type - withdraw a flowtable type
 * @type: descriptor to unlink from the nf_tables_flowtables list
 *
 * Counterpart of nft_register_flowtable_type(); the entry is removed
 * under the nftables nfnetlink subsystem lock with the RCU list primitive.
 */
void nft_unregister_flowtable_type(struct nf_flowtable_type *type)
{
        struct list_head *entry = &type->list;

        nfnl_lock(NFNL_SUBSYS_NFTABLES);
        list_del_rcu(entry);
        nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
EXPORT_SYMBOL_GPL(nft_unregister_flowtable_type);

/*
 * Netlink attribute policy for the top-level NFTA_FLOWTABLE_* attributes.
 * Table and flowtable names are strings of at most NFT_NAME_MAXLEN - 1
 * bytes; the hook description is a nested attribute that is parsed
 * separately with nft_flowtable_hook_policy.
 */
static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = {
        [NFTA_FLOWTABLE_TABLE]                = { .type = NLA_STRING,
                                            .len = NFT_NAME_MAXLEN - 1 },
        [NFTA_FLOWTABLE_NAME]                = { .type = NLA_STRING,
                                            .len = NFT_NAME_MAXLEN - 1 },
        [NFTA_FLOWTABLE_HOOK]                = { .type = NLA_NESTED },
        [NFTA_FLOWTABLE_HANDLE]                = { .type = NLA_U64 },
        [NFTA_FLOWTABLE_FLAGS]                = { .type = NLA_U32 },
};

/*
 * nft_flowtable_lookup - find a flowtable of @table by name
 * @net: namespace, used only for the lockdep condition of the list walk
 * @table: table whose flowtables are searched
 * @nla: string attribute holding the wanted name
 * @genmask: generation the flowtable has to be active in
 *
 * Returns the first flowtable whose name matches and which is active in
 * @genmask, or ERR_PTR(-ENOENT) when there is none.
 */
struct nft_flowtable *nft_flowtable_lookup(const struct net *net,
                                           const struct nft_table *table,
                                           const struct nlattr *nla, u8 genmask)
{
        struct nft_flowtable *ft;

        list_for_each_entry_rcu(ft, &table->flowtables, list,
                                lockdep_commit_lock_is_held(net)) {
                if (nla_strcmp(nla, ft->name) != 0)
                        continue;
                if (!nft_active_genmask(ft, genmask))
                        continue;

                return ft;
        }

        return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL_GPL(nft_flowtable_lookup);

/*
 * nf_tables_deactivate_flowtable - drop one use reference on a flowtable
 * @ctx: transaction context (not consulted here)
 * @flowtable: flowtable whose use counter is decremented
 * @phase: transaction phase that triggered the deactivation
 *
 * The counter is decremented for the PREPARE_ERROR, PREPARE, ABORT and
 * RELEASE phases; every other phase is a no-op.
 */
void nf_tables_deactivate_flowtable(const struct nft_ctx *ctx,
                                    struct nft_flowtable *flowtable,
                                    enum nft_trans_phase phase)
{
        switch (phase) {
        case NFT_TRANS_PREPARE_ERROR:
        case NFT_TRANS_PREPARE:
        case NFT_TRANS_ABORT:
        case NFT_TRANS_RELEASE:
                break;
        default:
                /* Any other phase leaves the reference count alone. */
                return;
        }

        nft_use_dec(&flowtable->use);
}
EXPORT_SYMBOL_GPL(nf_tables_deactivate_flowtable);

/*
 * Find a flowtable of @table by its 64-bit handle.
 *
 * @nla carries the handle in big-endian byte order.  Returns the first
 * flowtable with that handle which is active in @genmask, or
 * ERR_PTR(-ENOENT).  The list is walked with the plain (non-RCU)
 * iterator.
 */
static struct nft_flowtable *
nft_flowtable_lookup_byhandle(const struct nft_table *table,
                              const struct nlattr *nla, u8 genmask)
{
        const u64 wanted = be64_to_cpu(nla_get_be64(nla));
        struct nft_flowtable *ft;

        list_for_each_entry(ft, &table->flowtables, list) {
                if (ft->handle != wanted)
                        continue;
                if (!nft_active_genmask(ft, genmask))
                        continue;

                return ft;
        }

        return ERR_PTR(-ENOENT);
}

/*
 * Result of parsing NFTA_FLOWTABLE_HOOK, filled in by
 * nft_flowtable_parse_hook().
 */
struct nft_flowtable_hook {
        /* hook number, from the request or copied from the flowtable */
        u32                        num;
        /* hook priority, from the request or copied from the flowtable */
        int                        priority;
        /* head of the list of struct nft_hook entries parsed from the request */
        struct list_head        list;
};

/*
 * Netlink attribute policy for the attributes nested inside
 * NFTA_FLOWTABLE_HOOK.  The priority is declared as an unsigned 32-bit
 * attribute here but nft_flowtable_parse_hook() stores it in a signed int.
 */
static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = {
        [NFTA_FLOWTABLE_HOOK_NUM]        = { .type = NLA_U32 },
        [NFTA_FLOWTABLE_HOOK_PRIORITY]        = { .type = NLA_U32 },
        [NFTA_FLOWTABLE_HOOK_DEVS]        = { .type = NLA_NESTED },
};

/*
 * Parse the nested NFTA_FLOWTABLE_HOOK attribute into @flowtable_hook.
 *
 * @add == true (new flowtable): hook number and priority are mandatory
 * (-ENOENT otherwise, with the flowtable name reported as the bad
 * attribute) and the hook number has to be NF_NETDEV_INGRESS
 * (-EOPNOTSUPP otherwise).
 *
 * @add == false (existing flowtable): hook number and priority are
 * optional, but when present they have to equal the values already stored
 * in @flowtable (-EOPNOTSUPP otherwise); the stored values are used.
 *
 * If a device list is present it is parsed into @flowtable_hook->list and
 * each resulting hook gets its nf_hook_ops fields initialised for
 * @flowtable.  Returns a negative errno on failure, otherwise the
 * non-negative result of the last parsing step.
 */
static int nft_flowtable_parse_hook(const struct nft_ctx *ctx,
                                    const struct nlattr * const nla[],
                                    struct nft_flowtable_hook *flowtable_hook,
                                    struct nft_flowtable *flowtable,
                                    struct netlink_ext_ack *extack, bool add)
{
        struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1];
        const struct nlattr *num_attr, *prio_attr, *devs_attr;
        struct nft_hook *entry;
        int ret;

        INIT_LIST_HEAD(&flowtable_hook->list);

        ret = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX,
                                          nla[NFTA_FLOWTABLE_HOOK],
                                          nft_flowtable_hook_policy, NULL);
        if (ret < 0)
                return ret;

        num_attr = tb[NFTA_FLOWTABLE_HOOK_NUM];
        prio_attr = tb[NFTA_FLOWTABLE_HOOK_PRIORITY];
        devs_attr = tb[NFTA_FLOWTABLE_HOOK_DEVS];

        if (add) {
                int num, prio;

                if (!num_attr || !prio_attr) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                        return -ENOENT;
                }

                num = ntohl(nla_get_be32(num_attr));
                if (num != NF_NETDEV_INGRESS)
                        return -EOPNOTSUPP;

                prio = ntohl(nla_get_be32(prio_attr));

                flowtable_hook->priority = prio;
                flowtable_hook->num = num;
        } else {
                /* Updates may repeat the existing values but not change them. */
                if (num_attr) {
                        int num = ntohl(nla_get_be32(num_attr));

                        if (num != flowtable->hooknum)
                                return -EOPNOTSUPP;
                }

                if (prio_attr) {
                        int prio = ntohl(nla_get_be32(prio_attr));

                        if (prio != flowtable->data.priority)
                                return -EOPNOTSUPP;
                }

                flowtable_hook->priority = flowtable->data.priority;
                flowtable_hook->num = flowtable->hooknum;
        }

        if (devs_attr) {
                ret = nf_tables_parse_netdev_hooks(ctx->net, devs_attr,
                                                   &flowtable_hook->list,
                                                   extack);
                if (ret < 0)
                        return ret;
        }

        list_for_each_entry(entry, &flowtable_hook->list, list) {
                entry->ops.pf = NFPROTO_NETDEV;
                entry->ops.hooknum = flowtable_hook->num;
                entry->ops.priority = flowtable_hook->priority;
                entry->ops.priv = &flowtable->data;
                entry->ops.hook = flowtable->data.type->hook;
        }

        return ret;
}

/*
 * Return the first registered flowtable type for @family, or NULL.
 * Call under rcu_read_lock.
 */
static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
{
        const struct nf_flowtable_type *cur;

        list_for_each_entry_rcu(cur, &nf_tables_flowtables, list) {
                if (family != cur->family)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * Look up the flowtable type for @family and take a reference on its
 * owning module.
 *
 * Returns the type on success.  If no type is registered and module
 * support is built in, a request for "nf-flowtable-<family>" is issued;
 * when that request reports -EAGAIN, ERR_PTR(-EAGAIN) is returned.  In
 * every other failure case, including a type whose module reference
 * could not be taken, the result is ERR_PTR(-ENOENT).
 */
static const struct nf_flowtable_type *
nft_flowtable_type_get(struct net *net, u8 family)
{
        const struct nf_flowtable_type *type;
        bool pinned;

        rcu_read_lock();
        type = __nft_flowtable_type_get(family);
        pinned = type != NULL && try_module_get(type->owner);
        rcu_read_unlock();

        if (pinned)
                return type;

        lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
        if (type == NULL &&
            nft_request_module(net, "nf-flowtable-%u", family) == -EAGAIN)
                return ERR_PTR(-EAGAIN);
#endif
        return ERR_PTR(-ENOENT);
}

/* Only called from error and netdev event paths. */
static void nft_unregister_flowtable_hook(struct net *net,
                                          struct nft_flowtable *flowtable,
                                          struct nft_hook *hook)
{
        nf_unregister_net_hook(net, &hook->ops);
        flowtable->data.type->setup(&flowtable->data, hook->ops.dev,
                                    FLOW_BLOCK_UNBIND);
}

/*
 * Unregister every hook on @hook_list and unbind the flowtable's flow
 * block from each hook's device.  When @release_netdev is true each hook
 * is additionally unlinked from the list and freed after an RCU grace
 * period; otherwise the entries stay on the list.
 */
static void __nft_unregister_flowtable_net_hooks(struct net *net,
                                                 struct nft_flowtable *flowtable,
                                                 struct list_head *hook_list,
                                                 bool release_netdev)
{
        struct nft_hook *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, hook_list, list) {
                nf_unregister_net_hook(net, &cur->ops);
                flowtable->data.type->setup(&flowtable->data, cur->ops.dev,
                                            FLOW_BLOCK_UNBIND);
                if (!release_netdev)
                        continue;

                list_del(&cur->list);
                kfree_rcu(cur, rcu);
        }
}

static void nft_unregister_flowtable_net_hooks(struct net *net,
                                               struct nft_flowtable *flowtable,
                                               struct list_head *hook_list)
{
        __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false);
}

/*
 * Bind and register every hook on @hook_list for @flowtable.
 *
 * A hook is rejected with -EEXIST when nft_hook_list_find() reports it in
 * the hook list of any flowtable of @table that is active in the next
 * generation.  Otherwise the flowtable type's setup() callback is invoked
 * with FLOW_BLOCK_BIND and the netfilter hook is registered.
 *
 * Returns 0 once all hooks are registered.  On failure the hooks that had
 * already been registered are unregistered, unlinked and freed; the
 * failing hook and any hooks after it remain on @hook_list for the caller
 * to dispose of.
 */
static int nft_register_flowtable_net_hooks(struct net *net,
                                            struct nft_table *table,
                                            struct list_head *hook_list,
                                            struct nft_flowtable *flowtable)
{
        struct nft_hook *hook, *next;
        struct nft_flowtable *ft;
        /* i counts the hooks that were fully bound and registered */
        int err, i = 0;

        list_for_each_entry(hook, hook_list, list) {
                /* refuse a hook already present in an active flowtable */
                list_for_each_entry(ft, &table->flowtables, list) {
                        if (!nft_is_active_next(net, ft))
                                continue;

                        if (nft_hook_list_find(&ft->hook_list, hook)) {
                                err = -EEXIST;
                                goto err_unregister_net_hooks;
                        }
                }

                err = flowtable->data.type->setup(&flowtable->data,
                                                  hook->ops.dev,
                                                  FLOW_BLOCK_BIND);
                if (err < 0)
                        goto err_unregister_net_hooks;

                err = nf_register_net_hook(net, &hook->ops);
                if (err < 0) {
                        /* undo the bind of this hook; it is not counted in i */
                        flowtable->data.type->setup(&flowtable->data,
                                                    hook->ops.dev,
                                                    FLOW_BLOCK_UNBIND);
                        goto err_unregister_net_hooks;
                }

                i++;
        }

        return 0;

err_unregister_net_hooks:
        /* roll back only the first i entries, i.e. those registered above */
        list_for_each_entry_safe(hook, next, hook_list, list) {
                if (i-- <= 0)
                        break;

                nft_unregister_flowtable_hook(net, flowtable, hook);
                list_del_rcu(&hook->list);
                kfree_rcu(hook, rcu);
        }

        return err;
}

/*
 * Empty @hook_list: each hook is unlinked with the RCU list primitive and
 * freed after a grace period.  No hook is unregistered here.
 */
static void nft_hooks_destroy(struct list_head *hook_list)
{
        struct nft_hook *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, hook_list, list) {
                list_del_rcu(&cur->list);
                kfree_rcu(cur, rcu);
        }
}

/*
 * Handle a NEWFLOWTABLE request that targets an existing flowtable.
 *
 * The requested hooks are parsed, those already present in the flowtable
 * are dropped from the request, and the remaining ones are registered
 * and attached to a new NFT_MSG_NEWFLOWTABLE transaction marked as an
 * update.  Flags, when given, must stay within NFT_FLOWTABLE_MASK and
 * must not toggle NFT_FLOWTABLE_HW_OFFLOAD (-EOPNOTSUPP otherwise).
 *
 * Returns 0 on success or a negative errno.  @nlh is not used by this
 * function.
 */
static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh,
                                struct nft_flowtable *flowtable,
                                struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_flowtable_hook flowtable_hook;
        struct nft_hook *hook, *next;
        struct nft_trans *trans;
        /* set once the new hooks are registered and must be undone on error */
        bool unregister = false;
        u32 flags;
        int err;

        err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
                                       extack, false);
        if (err < 0)
                return err;

        /* discard requested hooks the flowtable already has */
        list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
                if (nft_hook_list_find(&flowtable->hook_list, hook)) {
                        list_del(&hook->list);
                        kfree(hook);
                }
        }

        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
                if (flags & ~NFT_FLOWTABLE_MASK) {
                        err = -EOPNOTSUPP;
                        goto err_flowtable_update_hook;
                }
                /* the hardware offload setting cannot be changed by an update */
                if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
                    (flags & NFT_FLOWTABLE_HW_OFFLOAD)) {
                        err = -EOPNOTSUPP;
                        goto err_flowtable_update_hook;
                }
        } else {
                flags = flowtable->data.flags;
        }

        /*
         * On failure the callee has already unregistered and freed the hooks
         * it managed to register, so 'unregister' stays false and the error
         * path below only frees what is left on the list.
         */
        err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
                                               &flowtable_hook.list, flowtable);
        if (err < 0)
                goto err_flowtable_update_hook;

        trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE,
                                sizeof(struct nft_trans_flowtable));
        if (!trans) {
                /* every hook on the list is registered at this point */
                unregister = true;
                err = -ENOMEM;
                goto err_flowtable_update_hook;
        }

        nft_trans_flowtable_flags(trans) = flags;
        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
        /* the transaction now owns the newly registered hooks */
        list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans));

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_flowtable_update_hook:
        list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) {
                if (unregister)
                        nft_unregister_flowtable_hook(ctx->net, flowtable, hook);
                list_del_rcu(&hook->list);
                kfree_rcu(hook, rcu);
        }

        return err;

}

/*
 * NEWFLOWTABLE request handler.
 *
 * Requires the table, name and hook attributes (-EINVAL otherwise).  If a
 * flowtable with that name already exists in the next generation the
 * request is treated as an update via nft_flowtable_update(), unless
 * NLM_F_EXCL is set, in which case -EEXIST is returned.  Otherwise a new
 * flowtable is allocated, initialised through its family's flowtable
 * type, added to a transaction, its hooks are registered and it is linked
 * into the table.
 *
 * Returns 0 on success or a negative errno; on failure everything
 * acquired so far is released in reverse order through the labels at the
 * end of the function.
 */
static int nf_tables_newflowtable(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        struct nft_flowtable_hook flowtable_hook;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        const struct nf_flowtable_type *type;
        struct nft_flowtable *flowtable;
        struct net *net = info->net;
        struct nft_table *table;
        struct nft_trans *trans;
        struct nft_ctx ctx;
        int err;

        if (!nla[NFTA_FLOWTABLE_TABLE] ||
            !nla[NFTA_FLOWTABLE_NAME] ||
            !nla[NFTA_FLOWTABLE_HOOK])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
                return PTR_ERR(table);
        }

        flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
                                         genmask);
        if (IS_ERR(flowtable)) {
                /* -ENOENT means "create it"; any other error is fatal */
                err = PTR_ERR(flowtable);
                if (err != -ENOENT) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                        return err;
                }
        } else {
                /* the flowtable exists: exclusive create fails, else update */
                if (info->nlh->nlmsg_flags & NLM_F_EXCL) {
                        NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                        return -EEXIST;
                }

                nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

                return nft_flowtable_update(&ctx, info->nlh, flowtable, extack);
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        /* the new flowtable holds a use reference on its table */
        if (!nft_use_inc(&table->use))
                return -EMFILE;

        flowtable = kzalloc(sizeof(*flowtable), GFP_KERNEL_ACCOUNT);
        if (!flowtable) {
                err = -ENOMEM;
                goto flowtable_alloc;
        }

        flowtable->table = table;
        flowtable->handle = nf_tables_alloc_handle(table);
        INIT_LIST_HEAD(&flowtable->hook_list);

        flowtable->name = nla_strdup(nla[NFTA_FLOWTABLE_NAME], GFP_KERNEL_ACCOUNT);
        if (!flowtable->name) {
                err = -ENOMEM;
                goto err1;
        }

        /* on success this holds a module reference, dropped at err3 */
        type = nft_flowtable_type_get(net, family);
        if (IS_ERR(type)) {
                err = PTR_ERR(type);
                goto err2;
        }

        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flowtable->data.flags =
                        ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
                if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
                        err = -EOPNOTSUPP;
                        goto err3;
                }
        }

        write_pnet(&flowtable->data.net, net);
        flowtable->data.type = type;
        err = type->init(&flowtable->data);
        if (err < 0)
                goto err3;

        err = nft_flowtable_parse_hook(&ctx, nla, &flowtable_hook, flowtable,
                                       extack, true);
        if (err < 0)
                goto err_flowtable_parse_hooks;

        /* move the parsed hooks onto the flowtable itself */
        list_splice(&flowtable_hook.list, &flowtable->hook_list);
        flowtable->data.priority = flowtable_hook.priority;
        flowtable->hooknum = flowtable_hook.num;

        trans = nft_trans_flowtable_add(&ctx, NFT_MSG_NEWFLOWTABLE, flowtable);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
                goto err_flowtable_trans;
        }

        /* This must be LAST to ensure no packets are walking over this flowtable. */
        err = nft_register_flowtable_net_hooks(ctx.net, table,
                                               &flowtable->hook_list,
                                               flowtable);
        if (err < 0)
                goto err_flowtable_hooks;

        list_add_tail_rcu(&flowtable->list, &table->flowtables);

        return 0;

        /* unwind in reverse order of acquisition */
err_flowtable_hooks:
        nft_trans_destroy(trans);
err_flowtable_trans:
        nft_hooks_destroy(&flowtable->hook_list);
err_flowtable_parse_hooks:
        flowtable->data.type->free(&flowtable->data);
err3:
        module_put(type->owner);
err2:
        kfree(flowtable->name);
err1:
        kfree(flowtable);
flowtable_alloc:
        nft_use_dec_restore(&table->use);

        return err;
}

/*
 * Free every hook entry parsed into @flowtable_hook.  Entries are
 * unlinked and released immediately with kfree(), without an RCU grace
 * period and without unregistering anything.
 */
static void nft_flowtable_hook_release(struct nft_flowtable_hook *flowtable_hook)
{
        struct list_head *head = &flowtable_hook->list;
        struct nft_hook *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, head, list) {
                list_del(&cur->list);
                kfree(cur);
        }
}

static int nft_delflowtable_hook(struct nft_ctx *ctx,
                                 struct nft_flowtable *flowtable,
                                 struct netlink_ext_ack *extack)
{
        const struct nlattr * const *nla = ctx->nla;
        struct nft_flowtable_hook flowtable_hook;
        LIST_HEAD(flowtable_del_list);
        struct nft_hook *this, *hook;
        struct nft_trans *trans;
        int err;

        err = nft_flowtable_parse_hook(ctx, nla, &flowtable_hook, flowtable,
                                       extack, false);
        if (err < 0)
                return err;

        list_for_each_entry(this, &flowtable_hook.list, list) {
                hook = nft_hook_list_find(&flowtable->hook_list, this);
                if (!hook) {
                        err = -ENOENT;
                        goto err_flowtable_del_hook;
                }
                list_move(&hook->list, &flowtable_del_list);
        }

        trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE,
                                sizeof(struct nft_trans_flowtable));
        if (!trans) {
                err = -ENOMEM;
                goto err_flowtable_del_hook;
        }

        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
        list_splice(&flowtable_del_list, &nft_trans_flowtable_hooks(trans));
        nft_flowtable_hook_release(&flowtable_hook);

        nft_trans_commit_list_add_tail(ctx->net, trans);

        return 0;

err_flowtable_del_hook:
        list_splice(&flowtable_del_list, &flowtable->hook_list);
        nft_flowtable_hook_release(&flowtable_hook);

        return err;
}

/*
 * DELFLOWTABLE / DESTROYFLOWTABLE request handler.
 *
 * The flowtable is identified by handle when NFTA_FLOWTABLE_HANDLE is
 * present, by name otherwise.  A missing flowtable is not an error for
 * NFT_MSG_DESTROYFLOWTABLE.  If the request carries a hook attribute only
 * the listed hooks are removed; otherwise the whole flowtable is deleted,
 * which fails with -EBUSY while it is still in use.
 */
static int nf_tables_delflowtable(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_next(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        struct nft_flowtable *flowtable;
        const struct nlattr *attr;
        struct nft_table *table;
        struct nft_ctx ctx;

        if (!nla[NFTA_FLOWTABLE_TABLE])
                return -EINVAL;
        if (!nla[NFTA_FLOWTABLE_NAME] && !nla[NFTA_FLOWTABLE_HANDLE])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
                                 genmask, NETLINK_CB(skb).portid);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
                return PTR_ERR(table);
        }

        /* a handle, when supplied, takes precedence over the name */
        attr = nla[NFTA_FLOWTABLE_HANDLE];
        if (attr) {
                flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask);
        } else {
                attr = nla[NFTA_FLOWTABLE_NAME];
                flowtable = nft_flowtable_lookup(net, table, attr, genmask);
        }

        if (IS_ERR(flowtable)) {
                int err = PTR_ERR(flowtable);

                if (err == -ENOENT &&
                    NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYFLOWTABLE)
                        return 0;

                NL_SET_BAD_ATTR(extack, attr);
                return err;
        }

        nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);

        if (nla[NFTA_FLOWTABLE_HOOK])
                return nft_delflowtable_hook(&ctx, flowtable, extack);

        if (flowtable->use > 0) {
                NL_SET_BAD_ATTR(extack, attr);
                return -EBUSY;
        }

        return nft_delflowtable(&ctx, flowtable);
}

/*
 * Serialise one flowtable into a netlink message appended to @skb.
 *
 * @event is the plain NFT_MSG_* message type.  The message always carries
 * the table name, flowtable name and handle.  For a deletion
 * (NFT_MSG_DELFLOWTABLE or NFT_MSG_DESTROYFLOWTABLE) without an explicit
 * @hook_list the message ends there.  Otherwise the use counter, flags
 * and the hook description (number, priority, device names) follow; the
 * devices are taken from @hook_list, or from the flowtable's own hook
 * list when @hook_list is NULL.
 *
 * Returns 0 on success, or -1 after trimming the partial message when any
 * netlink put fails.
 */
static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
                                         u32 portid, u32 seq, int event,
                                         u32 flags, int family,
                                         struct nft_flowtable *flowtable,
                                         struct list_head *hook_list)
{
        struct nlattr *nest, *nest_devs;
        struct nft_hook *hook;
        struct nlmsghdr *nlh;

        /*
         * Encode the subsystem id only for the netlink header and keep
         * 'event' as the raw NFT_MSG_* value.  Overwriting 'event' with the
         * encoded type would make the deletion check below unreachable,
         * because the encoded value never equals a plain NFT_MSG_* constant.
         */
        nlh = nfnl_msg_put(skb, portid, seq,
                           nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
                           flags, family, NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_FLOWTABLE_TABLE, flowtable->table->name) ||
            nla_put_string(skb, NFTA_FLOWTABLE_NAME, flowtable->name) ||
            nla_put_be64(skb, NFTA_FLOWTABLE_HANDLE, cpu_to_be64(flowtable->handle),
                         NFTA_FLOWTABLE_PAD))
                goto nla_put_failure;

        /* whole-flowtable deletions only need the identifying attributes */
        if (!hook_list &&
            (event == NFT_MSG_DELFLOWTABLE ||
             event == NFT_MSG_DESTROYFLOWTABLE)) {
                nlmsg_end(skb, nlh);
                return 0;
        }

        if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
            nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
                goto nla_put_failure;

        nest = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK);
        if (!nest)
                goto nla_put_failure;
        if (nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_NUM, htonl(flowtable->hooknum)) ||
            nla_put_be32(skb, NFTA_FLOWTABLE_HOOK_PRIORITY, htonl(flowtable->data.priority)))
                goto nla_put_failure;

        nest_devs = nla_nest_start_noflag(skb, NFTA_FLOWTABLE_HOOK_DEVS);
        if (!nest_devs)
                goto nla_put_failure;

        /* without an explicit list, report the flowtable's own hooks */
        if (!hook_list)
                hook_list = &flowtable->hook_list;

        list_for_each_entry_rcu(hook, hook_list, list,
                                lockdep_commit_lock_is_held(net)) {
                if (nla_put(skb, NFTA_DEVICE_NAME,
                            hook->ifnamelen, hook->ifname))
                        goto nla_put_failure;
        }
        nla_nest_end(skb, nest_devs);
        nla_nest_end(skb, nest);

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -1;
}

/*
 * Per-dump filter state, allocated by nf_tables_dump_flowtable_start()
 * and released by nf_tables_dump_flowtable_done().
 */
struct nft_flowtable_filter {
        /* heap copy of the requested table name; the dump skips other tables */
        char                *table;
};

/*
 * Netlink dump callback: emit one NFT_MSG_NEWFLOWTABLE message per active
 * flowtable, walking all tables of the requested family (all families
 * for NFPROTO_UNSPEC) under rcu_read_lock().
 *
 * cb->args[0] holds the index at which to resume.  The index counts every
 * flowtable visited, including inactive ones and ones skipped by the
 * table-name filter, so it stays stable between invocations.  The walk
 * stops as soon as a message cannot be added to @skb.
 *
 * Returns skb->len.
 */
static int nf_tables_dump_flowtable(struct sk_buff *skb,
                                    struct netlink_callback *cb)
{
        const struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
        struct nft_flowtable_filter *filter = cb->data;
        unsigned int idx = 0, s_idx = cb->args[0];
        struct net *net = sock_net(skb->sk);
        int family = nfmsg->nfgen_family;
        struct nft_flowtable *flowtable;
        struct nftables_pernet *nft_net;
        const struct nft_table *table;

        rcu_read_lock();
        nft_net = nft_pernet(net);
        /* record the ruleset generation for the consistency check below */
        cb->seq = READ_ONCE(nft_net->base_seq);

        list_for_each_entry_rcu(table, &nft_net->tables, list) {
                if (family != NFPROTO_UNSPEC && family != table->family)
                        continue;

                list_for_each_entry_rcu(flowtable, &table->flowtables, list) {
                        if (!nft_is_active(net, flowtable))
                                goto cont;
                        /* already emitted by a previous invocation */
                        if (idx < s_idx)
                                goto cont;
                        /* past the resume point: clear the remaining cb->args slots */
                        if (idx > s_idx)
                                memset(&cb->args[1], 0,
                                       sizeof(cb->args) - sizeof(cb->args[0]));
                        if (filter && filter->table &&
                            strcmp(filter->table, table->name))
                                goto cont;

                        /* stop here if the message could not be added; idx marks the resume point */
                        if (nf_tables_fill_flowtable_info(skb, net, NETLINK_CB(cb->skb).portid,
                                                          cb->nlh->nlmsg_seq,
                                                          NFT_MSG_NEWFLOWTABLE,
                                                          NLM_F_MULTI | NLM_F_APPEND,
                                                          table->family,
                                                          flowtable, NULL) < 0)
                                goto done;

                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
cont:
                        idx++;
                }
        }
done:
        rcu_read_unlock();

        cb->args[0] = idx;
        return skb->len;
}

/*
 * Dump start callback.  On entry cb->data points at the request's
 * attribute array; on successful return it holds either NULL (no table
 * filter requested) or a freshly allocated filter carrying a copy of the
 * table name.  Allocations use GFP_ATOMIC.  Returns 0 or -ENOMEM.
 */
static int nf_tables_dump_flowtable_start(struct netlink_callback *cb)
{
        const struct nlattr * const *nla = cb->data;
        const struct nlattr *table_attr = nla[NFTA_FLOWTABLE_TABLE];
        struct nft_flowtable_filter *flt;

        if (!table_attr) {
                cb->data = NULL;
                return 0;
        }

        flt = kzalloc(sizeof(*flt), GFP_ATOMIC);
        if (!flt)
                return -ENOMEM;

        flt->table = nla_strdup(table_attr, GFP_ATOMIC);
        if (!flt->table) {
                kfree(flt);
                return -ENOMEM;
        }

        cb->data = flt;
        return 0;
}

/*
 * Dump done callback: release the filter set up by
 * nf_tables_dump_flowtable_start(), if there is one.  Always returns 0.
 */
static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
{
        struct nft_flowtable_filter *flt = cb->data;

        if (flt) {
                kfree(flt->table);
                kfree(flt);
        }

        return 0;
}

/*
 * GETFLOWTABLE request handler; called with rcu_read_lock held.
 *
 * With NLM_F_DUMP a netlink dump over all flowtables is started.
 * Otherwise the flowtable named in the request is looked up in the
 * current generation and sent back to the requester as a single
 * NFT_MSG_NEWFLOWTABLE message.
 */
static int nf_tables_getflowtable(struct sk_buff *skb,
                                  const struct nfnl_info *info,
                                  const struct nlattr * const nla[])
{
        struct netlink_ext_ack *extack = info->extack;
        u8 genmask = nft_genmask_cur(info->net);
        u8 family = info->nfmsg->nfgen_family;
        struct net *net = info->net;
        struct nft_flowtable *flowtable;
        const struct nft_table *table;
        struct sk_buff *reply;
        int err;

        if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
                struct netlink_dump_control c = {
                        .start = nf_tables_dump_flowtable_start,
                        .dump = nf_tables_dump_flowtable,
                        .done = nf_tables_dump_flowtable_done,
                        .module = THIS_MODULE,
                        .data = (void *)nla,
                };

                return nft_netlink_dump_start_rcu(info->sk, skb, info->nlh, &c);
        }

        if (!nla[NFTA_FLOWTABLE_NAME])
                return -EINVAL;

        table = nft_table_lookup(net, nla[NFTA_FLOWTABLE_TABLE], family,
                                 genmask, 0);
        if (IS_ERR(table)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_TABLE]);
                return PTR_ERR(table);
        }

        flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME],
                                         genmask);
        if (IS_ERR(flowtable)) {
                NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]);
                return PTR_ERR(flowtable);
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        err = nf_tables_fill_flowtable_info(reply, net, NETLINK_CB(skb).portid,
                                            info->nlh->nlmsg_seq,
                                            NFT_MSG_NEWFLOWTABLE, 0, family,
                                            flowtable, NULL);
        if (err < 0) {
                /* the reply was never handed over, so drop it here */
                kfree_skb(reply);
                return err;
        }

        return nfnetlink_unicast(reply, net, NETLINK_CB(skb).portid);
}

/*
 * Build a flowtable notification for @event and queue it on the
 * per-netns notify list.
 *
 * Nothing is sent when the requester did not ask for a report and no
 * socket listens on NFNLGRP_NFTABLES.  Only the NLM_F_CREATE and
 * NLM_F_EXCL bits of the request flags are forwarded.  If the message
 * cannot be allocated or filled in, -ENOBUFS is flagged on the group.
 */
static void nf_tables_flowtable_notify(struct nft_ctx *ctx,
                                       struct nft_flowtable *flowtable,
                                       struct list_head *hook_list, int event)
{
        struct nftables_pernet *pernet = nft_pernet(ctx->net);
        const u16 nl_flags = ctx->flags & (NLM_F_CREATE | NLM_F_EXCL);
        struct sk_buff *msg;

        if (!ctx->report &&
            !nfnetlink_has_listeners(ctx->net, NFNLGRP_NFTABLES))
                return;

        msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!msg)
                goto out_enobufs;

        if (nf_tables_fill_flowtable_info(msg, ctx->net, ctx->portid,
                                          ctx->seq, event, nl_flags,
                                          ctx->family, flowtable,
                                          hook_list) < 0) {
                kfree_skb(msg);
                goto out_enobufs;
        }

        nft_notify_enqueue(msg, ctx->report, &pernet->notify_list);
        return;

out_enobufs:
        nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS);
}

/*
 * Free a flowtable: call the flowtable type's ->free() on its data, unlink
 * every hook and release it through kfree_rcu(), then free the name, drop
 * the module reference of the flowtable type and free the flowtable itself.
 */
static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
{
        struct nft_hook *cur, *tmp;

        flowtable->data.type->free(&flowtable->data);

        /* _safe walk: each hook is unlinked while iterating. */
        list_for_each_entry_safe(cur, tmp, &flowtable->hook_list, list) {
                list_del_rcu(&cur->list);
                kfree_rcu(cur, rcu);
        }

        kfree(flowtable->name);
        module_put(flowtable->data.type->owner);
        kfree(flowtable);
}

/*
 * Fill @skb with an NFT_MSG_NEWGEN message carrying the generation id
 * (nft_net->base_seq), the pid of the current task and its command name.
 *
 * Returns 0 on success. On failure the message is trimmed back to where
 * its header started and -EMSGSIZE is returned.
 */
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
                                   u32 portid, u32 seq)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        int msg_type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
        char comm[TASK_COMM_LEN];
        struct nlmsghdr *nlh;

        nlh = nfnl_msg_put(skb, portid, seq, msg_type, 0, AF_UNSPEC,
                           NFNETLINK_V0, nft_base_seq(net));
        if (!nlh)
                goto nla_put_failure;

        /* Attributes are emitted in order; stop at the first one that fails. */
        if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)))
                goto nla_put_failure;

        if (nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))))
                goto nla_put_failure;

        if (nla_put_string(skb, NFTA_GEN_PROC_NAME,
                           get_task_comm(comm, current)))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_trim(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Remove the hook of @flowtable that is bound to @dev, if there is one.
 * At most one hook is removed: the walk stops at the first match.
 * @event is not used here.
 */
static void nft_flowtable_event(unsigned long event, struct net_device *dev,
                                struct nft_flowtable *flowtable)
{
        struct nft_hook *hook;

        list_for_each_entry(hook, &flowtable->hook_list, list) {
                if (hook->ops.dev == dev) {
                        /* flow_offload_netdev_event() cleans up entries for us. */
                        nft_unregister_flowtable_hook(dev_net(dev), flowtable,
                                                      hook);
                        list_del_rcu(&hook->list);
                        kfree_rcu(hook, rcu);
                        return;
                }
        }
}

static int nf_tables_flowtable_event(struct notifier_block *this,
                                     unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct nft_flowtable *flowtable;
        struct nftables_pernet *nft_net;
        struct nft_table *table;
        struct net *net;

        if (event != NETDEV_UNREGISTER)
                return 0;

        net = dev_net(dev);
        nft_net = nft_pernet(net);
        mutex_lock(&nft_net->commit_mutex);
        list_for_each_entry(table, &nft_net->tables, list) {
                list_for_each_entry(flowtable, &table->flowtables, list) {
                        nft_flowtable_event(event, dev, flowtable);
                }
        }
        mutex_unlock(&nft_net->commit_mutex);

        return NOTIFY_DONE;
}

/* Notifier block whose callback is nf_tables_flowtable_event(). */
static struct notifier_block nf_tables_flowtable_notifier = {
        .notifier_call        = nf_tables_flowtable_event,
};

/*
 * Send a generation message built by nf_tables_fill_gen_info() to the
 * NFNLGRP_NFTABLES group on behalf of the request in @skb.
 *
 * Nothing is sent when the request does not ask for a report and the
 * group has no listeners. If the message cannot be allocated or filled
 * in, -ENOBUFS is signalled through nfnetlink_set_err(). @event is not
 * used here.
 */
static void nf_tables_gen_notify(struct net *net, struct sk_buff *skb,
                                 int event)
{
        struct nlmsghdr *nlh = nlmsg_hdr(skb);
        struct sk_buff *nskb;

        if (!nlmsg_report(nlh) &&
            !nfnetlink_has_listeners(net, NFNLGRP_NFTABLES))
                return;

        nskb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!nskb)
                goto report_enobufs;

        if (nf_tables_fill_gen_info(nskb, net, NETLINK_CB(skb).portid,
                                    nlh->nlmsg_seq) < 0) {
                kfree_skb(nskb);
                goto report_enobufs;
        }

        nfnetlink_send(nskb, net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
                       nlmsg_report(nlh), GFP_KERNEL);
        return;

report_enobufs:
        nfnetlink_set_err(net, NETLINK_CB(skb).portid, NFNLGRP_NFTABLES,
                          -ENOBUFS);
}

/*
 * NFT_MSG_GETGEN handler: build a generation message and unicast it back
 * to the requesting port.
 *
 * Returns -ENOMEM if the reply cannot be allocated, the error from
 * nf_tables_fill_gen_info() if filling fails, otherwise the result of
 * nfnetlink_unicast().
 */
static int nf_tables_getgen(struct sk_buff *skb, const struct nfnl_info *info,
                            const struct nlattr * const nla[])
{
        struct sk_buff *reply;
        int ret;

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC);
        if (!reply)
                return -ENOMEM;

        ret = nf_tables_fill_gen_info(reply, info->net, NETLINK_CB(skb).portid,
                                      info->nlh->nlmsg_seq);
        if (ret < 0) {
                kfree_skb(reply);
                return ret;
        }

        return nfnetlink_unicast(reply, info->net, NETLINK_CB(skb).portid);
}

/*
 * Callback table indexed by NFT_MSG_* message type.
 *
 * Every NEW, DEL and DESTROY entry is of type NFNL_CB_BATCH; every GET
 * entry (including the *_RESET variants and NFT_MSG_GETGEN) is of type
 * NFNL_CB_RCU. Each DESTROY* message shares the handler of the matching
 * DEL* message. NFT_MSG_GETGEN is the only entry without an attribute
 * count and policy.
 */
static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = {
        [NFT_MSG_NEWTABLE] = {
                .call                = nf_tables_newtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_GETTABLE] = {
                .call                = nf_tables_gettable,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_DELTABLE] = {
                .call                = nf_tables_deltable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_DESTROYTABLE] = {
                .call                = nf_tables_deltable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_TABLE_MAX,
                .policy                = nft_table_policy,
        },
        [NFT_MSG_NEWCHAIN] = {
                .call                = nf_tables_newchain,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_GETCHAIN] = {
                .call                = nf_tables_getchain,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_DELCHAIN] = {
                .call                = nf_tables_delchain,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_DESTROYCHAIN] = {
                .call                = nf_tables_delchain,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_CHAIN_MAX,
                .policy                = nft_chain_policy,
        },
        [NFT_MSG_NEWRULE] = {
                .call                = nf_tables_newrule,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_GETRULE] = {
                .call                = nf_tables_getrule,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_GETRULE_RESET] = {
                .call                = nf_tables_getrule_reset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_DELRULE] = {
                .call                = nf_tables_delrule,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_DESTROYRULE] = {
                .call                = nf_tables_delrule,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_RULE_MAX,
                .policy                = nft_rule_policy,
        },
        [NFT_MSG_NEWSET] = {
                .call                = nf_tables_newset,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_GETSET] = {
                .call                = nf_tables_getset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_DELSET] = {
                .call                = nf_tables_delset,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_DESTROYSET] = {
                .call                = nf_tables_delset,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_MAX,
                .policy                = nft_set_policy,
        },
        [NFT_MSG_NEWSETELEM] = {
                .call                = nf_tables_newsetelem,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETSETELEM] = {
                .call                = nf_tables_getsetelem,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETSETELEM_RESET] = {
                .call                = nf_tables_getsetelem_reset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_DELSETELEM] = {
                .call                = nf_tables_delsetelem,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_DESTROYSETELEM] = {
                .call                = nf_tables_delsetelem,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_SET_ELEM_LIST_MAX,
                .policy                = nft_set_elem_list_policy,
        },
        [NFT_MSG_GETGEN] = {
                .call                = nf_tables_getgen,
                .type                = NFNL_CB_RCU,
        },
        [NFT_MSG_NEWOBJ] = {
                .call                = nf_tables_newobj,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ] = {
                .call                = nf_tables_getobj,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_DELOBJ] = {
                .call                = nf_tables_delobj,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_DESTROYOBJ] = {
                .call                = nf_tables_delobj,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_GETOBJ_RESET] = {
                .call                = nf_tables_getobj_reset,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_OBJ_MAX,
                .policy                = nft_obj_policy,
        },
        [NFT_MSG_NEWFLOWTABLE] = {
                .call                = nf_tables_newflowtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
        [NFT_MSG_GETFLOWTABLE] = {
                .call                = nf_tables_getflowtable,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
        [NFT_MSG_DELFLOWTABLE] = {
                .call                = nf_tables_delflowtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
        [NFT_MSG_DESTROYFLOWTABLE] = {
                .call                = nf_tables_delflowtable,
                .type                = NFNL_CB_BATCH,
                .attr_count        = NFTA_FLOWTABLE_MAX,
                .policy                = nft_flowtable_policy,
        },
};

/*
 * Validate every table of @net that asks for it.
 *
 * Tables in state NFT_VALIDATE_NEED are first moved to NFT_VALIDATE_DO;
 * tables in state NFT_VALIDATE_DO are validated and, on success, moved to
 * NFT_VALIDATE_SKIP. Tables in any other state are left untouched.
 *
 * Returns 0 on success, -EAGAIN as soon as one table fails validation.
 */
static int nf_tables_validate(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_table *table;

        list_for_each_entry(table, &nft_net->tables, list) {
                if (table->validate_state == NFT_VALIDATE_NEED)
                        nft_validate_state_update(table, NFT_VALIDATE_DO);
                else if (table->validate_state != NFT_VALIDATE_DO)
                        continue;

                if (nft_table_validate(net, table) < 0)
                        return -EAGAIN;

                nft_validate_state_update(table, NFT_VALIDATE_SKIP);
        }

        return 0;
}

/* A drop policy has to be deferred until all rules have been activated;
 * otherwise a large ruleset that contains a drop-policy base chain would
 * cause all packets to be dropped until the full transaction has been
 * processed.
 *
 * The drop policy is therefore applied only once the transaction has been
 * finalized. Transactions with any other policy, or on a chain that is
 * not a base chain, are ignored here.
 */
static void nft_chain_commit_drop_policy(struct nft_trans_chain *trans)
{
        if (trans->policy != NF_DROP || !nft_is_base_chain(trans->chain))
                return;

        nft_base_chain(trans->chain)->policy = NF_DROP;
}

/*
 * Apply a committed chain update.
 *
 * If the transaction carries a new name, the chain is removed from the
 * table's chain hash, the names are swapped and the chain is inserted
 * again under its new name. For base chains the stats are then replaced
 * and an NF_DROP or NF_ACCEPT policy from the transaction is installed;
 * any other policy value leaves the chain policy unchanged.
 */
static void nft_chain_commit_update(struct nft_trans_chain *trans)
{
        struct nft_table *table = trans->nft_trans_binding.nft_trans.table;
        struct nft_base_chain *basechain;

        if (trans->name) {
                rhltable_remove(&table->chains_ht,
                                &trans->chain->rhlhead,
                                nft_chain_ht_params);
                /* After the swap, trans->name holds the previous name. */
                swap(trans->chain->name, trans->name);
                rhltable_insert_key(&table->chains_ht,
                                    trans->chain->name,
                                    &trans->chain->rhlhead,
                                    nft_chain_ht_params);
        }

        if (!nft_is_base_chain(trans->chain))
                return;

        nft_chain_stats_replace(trans);

        basechain = nft_base_chain(trans->chain);
        if (trans->policy == NF_DROP || trans->policy == NF_ACCEPT)
                basechain->policy = trans->policy;
}

/*
 * Apply a committed object update: pass the replacement object to the
 * existing object's ->update() callback, then destroy the replacement.
 *
 * A missing ->update() callback triggers a one-time warning and the
 * update is skipped. The replacement object is still destroyed in that
 * case so that it is not leaked; this function is where it is released
 * on the normal path as well.
 */
static void nft_obj_commit_update(const struct nft_ctx *ctx,
                                  struct nft_trans *trans)
{
        struct nft_object *newobj;
        struct nft_object *obj;

        obj = nft_trans_obj(trans);
        newobj = nft_trans_obj_newobj(trans);

        if (!WARN_ON_ONCE(!obj->ops->update))
                obj->ops->update(obj, newobj);

        nft_obj_destroy(ctx, newobj);
}

/*
 * Release whatever a committed transaction still owns, then free the
 * transaction itself.
 *
 * Only the message types listed in the switch have something to release;
 * every other type just frees @trans. If trans->put_net is set, the
 * reference on trans->net is dropped before the transaction is freed.
 */
static void nft_commit_release(struct nft_trans *trans)
{
        struct nft_ctx ctx = {
                .net = trans->net,
        };

        nft_ctx_update(&ctx, trans);

        switch (trans->msg_type) {
        case NFT_MSG_DELTABLE:
        case NFT_MSG_DESTROYTABLE:
                nf_tables_table_destroy(trans->table);
                break;
        case NFT_MSG_NEWCHAIN:
                /* Frees the stats and name still attached to the transaction. */
                free_percpu(nft_trans_chain_stats(trans));
                kfree(nft_trans_chain_name(trans));
                break;
        case NFT_MSG_DELCHAIN:
        case NFT_MSG_DESTROYCHAIN:
                /* A chain update only releases the transaction's hooks. */
                if (nft_trans_chain_update(trans))
                        nft_hooks_destroy(&nft_trans_chain_hooks(trans));
                else
                        nf_tables_chain_destroy(nft_trans_chain(trans));
                break;
        case NFT_MSG_DELRULE:
        case NFT_MSG_DESTROYRULE:
                nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
                break;
        case NFT_MSG_DELSET:
        case NFT_MSG_DESTROYSET:
                nft_set_destroy(&ctx, nft_trans_set(trans));
                break;
        case NFT_MSG_DELSETELEM:
        case NFT_MSG_DESTROYSETELEM:
                nft_trans_elems_destroy(&ctx, nft_trans_container_elem(trans));
                break;
        case NFT_MSG_DELOBJ:
        case NFT_MSG_DESTROYOBJ:
                nft_obj_destroy(&ctx, nft_trans_obj(trans));
                break;
        case NFT_MSG_DELFLOWTABLE:
        case NFT_MSG_DESTROYFLOWTABLE:
                /* A flowtable update only releases the transaction's hooks. */
                if (nft_trans_flowtable_update(trans))
                        nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
                else
                        nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
                break;
        }

        if (trans->put_net)
                put_net(trans->net);

        kfree(trans);
}

static void nf_tables_trans_destroy_work(struct work_struct *w)
{
        struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
        struct nft_trans *trans, *next;
        LIST_HEAD(head);

        spin_lock(&nf_tables_destroy_list_lock);
        list_splice_init(&nft_net->destroy_list, &head);
        spin_unlock(&nf_tables_destroy_list_lock);

        if (list_empty(&head))
                return;

        synchronize_rcu();

        list_for_each_entry_safe(trans, next, &head, list) {
                nft_trans_list_del(trans);
                nft_commit_release(trans);
        }
}

void nf_tables_trans_destroy_flush_work(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);

        flush_work(&nft_net->destroy_work);
}
EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);

/*
 * Always returns false; neither @track nor @expr is inspected.
 */
static bool nft_expr_reduce(struct nft_regs_track *track,
                            const struct nft_expr *expr)
{
        return false;
}

/*
 * Build the rule blob for the next generation of @chain and store it in
 * chain->blob_next.
 *
 * Returns 0 on success, and also right away when blob_next is already set
 * or the chain is not active in the next generation. Returns -ENOMEM when
 * the computed size exceeds INT_MAX, when the allocation fails, or when
 * one of the consistency checks below trips.
 *
 * NOTE(review): on the error paths taken after the allocation, blob_next
 * stays assigned; presumably the caller releases it through
 * nf_tables_commit_chain_prepare_cancel() - confirm.
 */
static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain)
{
        const struct nft_expr *expr, *last;
        struct nft_regs_track track = {};
        unsigned int size, data_size;
        void *data, *data_boundary;
        struct nft_rule_dp *prule;
        struct nft_rule *rule;

        /* already handled or inactive chain? */
        if (chain->blob_next || !nft_is_active_next(net, chain))
                return 0;

        /* Pass 1: space needed by all rules active in the next generation. */
        data_size = 0;
        list_for_each_entry(rule, &chain->rules, list) {
                if (nft_is_active_next(net, rule)) {
                        data_size += sizeof(*prule) + rule->dlen;
                        if (data_size > INT_MAX)
                                return -ENOMEM;
                }
        }

        chain->blob_next = nf_tables_chain_alloc_rules(chain, data_size);
        if (!chain->blob_next)
                return -ENOMEM;

        data = (void *)chain->blob_next->data;
        /* End of the space computed in pass 1; writes must not go past it. */
        data_boundary = data + data_size;
        size = 0;

        /* Pass 2: emit one header per active rule, followed by its expressions. */
        list_for_each_entry(rule, &chain->rules, list) {
                if (!nft_is_active_next(net, rule))
                        continue;

                prule = (struct nft_rule_dp *)data;
                /* Skip the header; expressions are copied right after it. */
                data += offsetof(struct nft_rule_dp, data);
                if (WARN_ON_ONCE(data > data_boundary))
                        return -ENOMEM;

                size = 0;
                track.last = nft_expr_last(rule);
                nft_rule_for_each_expr(expr, last, rule) {
                        track.cur = expr;

                        /* Expressions reported as reducible are not copied. */
                        if (nft_expr_reduce(&track, expr)) {
                                expr = track.cur;
                                continue;
                        }

                        if (WARN_ON_ONCE(data + size + expr->ops->size > data_boundary))
                                return -ENOMEM;

                        memcpy(data + size, expr, expr->ops->size);
                        size += expr->ops->size;
                }
                /* NOTE(review): limit presumably matches the width of prule->dlen - confirm. */
                if (WARN_ON_ONCE(size >= 1 << 12))
                        return -ENOMEM;

                prule->handle = rule->handle;
                prule->dlen = size;
                prule->is_last = 0;

                data += size;
                size = 0;
                /* Account for the bytes this rule consumed, header included. */
                chain->blob_next->size += (unsigned long)(data - (void *)prule);
        }

        if (WARN_ON_ONCE(data > data_boundary))
                return -ENOMEM;

        /* The entry following the last rule is filled in by nft_last_rule(). */
        prule = (struct nft_rule_dp *)data;
        nft_last_rule(chain, prule);

        return 0;
}

/*
 * Undo nf_tables_commit_chain_prepare(): for every NFT_MSG_NEWRULE and
 * NFT_MSG_DELRULE transaction on the commit list, free the blob_next of
 * the rule's chain and reset the pointer to NULL.
 */
static void nf_tables_commit_chain_prepare_cancel(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans, *tmp;
        struct nft_chain *chain;

        list_for_each_entry_safe(trans, tmp, &nft_net->commit_list, list) {
                if (trans->msg_type != NFT_MSG_NEWRULE &&
                    trans->msg_type != NFT_MSG_DELRULE)
                        continue;

                chain = nft_trans_rule_chain(trans);

                kvfree(chain->blob_next);
                chain->blob_next = NULL;
        }
}

/*
 * RCU callback: @h is embedded in a struct nft_rule_dp_last; free the
 * blob that trailer points to.
 */
static void __nf_tables_commit_chain_free_rules(struct rcu_head *h)
{
        kvfree(container_of(h, struct nft_rule_dp_last, h)->blob);
}

/*
 * Schedule @blob to be freed through call_rcu(). The trailer located
 * right after the blob's data records the blob pointer and provides the
 * rcu_head used for the callback.
 */
static void nf_tables_commit_chain_free_rules_old(struct nft_rule_blob *blob)
{
        struct nft_rule_dp_last *trailer;

        /* last rule trailer is after end marker */
        trailer = (void *)blob + sizeof(*blob) + blob->size;
        trailer->blob = blob;

        call_rcu(&trailer->h, __nf_tables_commit_chain_free_rules);
}

/*
 * Publish the rule blob of @chain for the next generation.
 *
 * nft_gencursor_next() selects the slot being written: blob_gen_1 when it
 * returns true, blob_gen_0 otherwise. If chain->blob_next is set it is
 * installed in that slot and blob_next is cleared; otherwise the slot is
 * pointed at the other slot's blob. Whenever the two slots held different
 * blobs beforehand, the blob that was replaced is handed to
 * nf_tables_commit_chain_free_rules_old().
 */
static void nf_tables_commit_chain(struct net *net, struct nft_chain *chain)
{
        struct nft_rule_blob *g0, *g1;
        bool next_genbit;

        next_genbit = nft_gencursor_next(net);

        g0 = rcu_dereference_protected(chain->blob_gen_0,
                                       lockdep_commit_lock_is_held(net));
        g1 = rcu_dereference_protected(chain->blob_gen_1,
                                       lockdep_commit_lock_is_held(net));

        /* No changes to this chain? */
        if (chain->blob_next == NULL) {
                /* chain had no change in last or next generation */
                if (g0 == g1)
                        return;
                /*
                 * chain had no change in this generation; make sure next
                 * one uses same rules as current generation.
                 */
                if (next_genbit) {
                        rcu_assign_pointer(chain->blob_gen_1, g0);
                        nf_tables_commit_chain_free_rules_old(g1);
                } else {
                        rcu_assign_pointer(chain->blob_gen_0, g1);
                        nf_tables_commit_chain_free_rules_old(g0);
                }

                return;
        }

        /* Install the prepared blob in the slot of the next generation. */
        if (next_genbit)
                rcu_assign_pointer(chain->blob_gen_1, chain->blob_next);
        else
                rcu_assign_pointer(chain->blob_gen_0, chain->blob_next);

        chain->blob_next = NULL;

        /* Both slots shared one blob: the other slot still refers to it. */
        if (g0 == g1)
                return;

        /* Release the blob that was just replaced. */
        if (next_genbit)
                nf_tables_commit_chain_free_rules_old(g1);
        else
                nf_tables_commit_chain_free_rules_old(g0);
}

/*
 * Unlink @obj: remove it from the nft_objname_ht hash table and from the
 * list it is linked on. The object itself is not freed here.
 */
static void nft_obj_del(struct nft_object *obj)
{
        rhltable_remove(&nft_objname_ht, &obj->rhlhead, nft_objname_ht_params);
        list_del_rcu(&obj->list);
}

/*
 * Unlink @chain: remove it from its table's chain hash (warning once if
 * rhltable_remove() returns non-zero) and from the list it is linked on.
 * The chain itself is not freed here.
 */
void nft_chain_del(struct nft_chain *chain)
{
        struct nft_table *table = chain->table;

        WARN_ON_ONCE(rhltable_remove(&table->chains_ht, &chain->rhlhead,
                                     nft_chain_ht_params));
        list_del_rcu(&chain->list);
}

/*
 * Deactivate and then remove every element collected in the GC batch
 * @trans from the batch's set.
 */
static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx,
                                        struct nft_trans_gc *trans)
{
        struct nft_elem_priv *elem;
        unsigned int idx;

        for (idx = 0; idx < trans->count; idx++) {
                elem = trans->priv[idx];

                nft_setelem_data_deactivate(ctx->net, trans->set, elem);
                nft_setelem_remove(ctx->net, trans->set, elem);
        }
}

/*
 * Free a GC batch: drop its reference on the set and on the netns, then
 * free the batch. The elements recorded in the batch are not touched.
 */
void nft_trans_gc_destroy(struct nft_trans_gc *trans)
{
        nft_set_put(trans->set);
        put_net(trans->net);
        kfree(trans);
}

static void nft_trans_gc_trans_free(struct rcu_head *rcu)
{
        struct nft_elem_priv *elem_priv;
        struct nft_trans_gc *trans;
        struct nft_ctx ctx = {};
        unsigned int i;

        trans = container_of(rcu, struct nft_trans_gc, rcu);
        ctx.net        = read_pnet(&trans->set->net);

        for (i = 0; i < trans->count; i++) {
                elem_priv = trans->priv[i];
                if (!nft_setelem_is_catchall(trans->set, elem_priv))
                        atomic_dec(&trans->set->nelems);

                nf_tables_set_elem_destroy(&ctx, trans->set, elem_priv);
        }

        nft_trans_gc_destroy(trans);
}

/*
 * Remove the elements of a GC batch from their set, under the commit
 * mutex.
 *
 * Returns true when the elements were removed. Returns false, without
 * touching the set, when the batch's sequence number no longer matches
 * nft_net->gc_seq or the set is marked dead.
 */
static bool nft_trans_gc_work_done(struct nft_trans_gc *trans)
{
        struct nftables_pernet *nft_net = nft_pernet(trans->net);
        struct nft_ctx ctx = {};
        bool removed = false;

        mutex_lock(&nft_net->commit_mutex);

        /* Check for race with transaction, otherwise this batch refers to
         * stale objects that might not be there anymore. Skip transaction if
         * set has been destroyed from control plane transaction in case gc
         * worker loses race.
         */
        if (READ_ONCE(nft_net->gc_seq) == trans->seq && !trans->set->dead) {
                ctx.net = trans->net;
                ctx.table = trans->set->table;

                nft_trans_gc_setelem_remove(&ctx, trans);
                removed = true;
        }

        mutex_unlock(&nft_net->commit_mutex);

        return removed;
}

static void nft_trans_gc_work(struct work_struct *work)
{
        struct nft_trans_gc *trans, *next;
        LIST_HEAD(trans_gc_list);

        spin_lock(&nf_tables_gc_list_lock);
        list_splice_init(&nf_tables_gc_list, &trans_gc_list);
        spin_unlock(&nf_tables_gc_list_lock);

        list_for_each_entry_safe(trans, next, &trans_gc_list, list) {
                list_del(&trans->list);
                if (!nft_trans_gc_work_done(trans)) {
                        nft_trans_gc_destroy(trans);
                        continue;
                }
                call_rcu(&trans->rcu, nft_trans_gc_trans_free);
        }
}

/*
 * Allocate an empty GC batch for @set, tagged with sequence @gc_seq.
 *
 * The batch takes a reference on the set's netns and on the set itself.
 * Returns NULL if the allocation fails or maybe_get_net() returns NULL.
 */
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
                                        unsigned int gc_seq, gfp_t gfp)
{
        struct net *net = read_pnet(&set->net);
        struct nft_trans_gc *gc;

        gc = kzalloc(sizeof(*gc), gfp);
        if (!gc)
                return NULL;

        gc->net = maybe_get_net(net);
        if (!gc->net)
                goto err_free;

        refcount_inc(&set->refs);
        gc->set = set;
        gc->seq = gc_seq;

        return gc;

err_free:
        kfree(gc);
        return NULL;
}

/*
 * Append @priv to the GC batch. No capacity check is made here.
 * NOTE(review): callers in this file make room first through
 * nft_trans_gc_queue_async()/nft_trans_gc_queue_sync() - confirm this
 * holds for callers elsewhere.
 */
void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv)
{
        trans->priv[trans->count] = priv;
        trans->count++;
}

/*
 * Append the GC batch to the global GC list (under
 * nf_tables_gc_list_lock) and schedule trans_gc_work to process it.
 */
static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
{
        spin_lock(&nf_tables_gc_list_lock);
        list_add_tail(&trans->list, &nf_tables_gc_list);
        spin_unlock(&nf_tables_gc_list_lock);

        schedule_work(&trans_gc_work);
}

/* Number of element slots still free in the GC batch. */
static int nft_trans_gc_space(struct nft_trans_gc *trans)
{
        return NFT_TRANS_GC_BATCHCOUNT - trans->count;
}

/*
 * Make sure there is room for one more element in an async GC batch.
 *
 * Returns @gc unchanged while it still has free slots. Otherwise the
 * full batch is queued for the GC worker and a fresh batch for the same
 * set is returned, or NULL if that allocation fails.
 */
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
                                              unsigned int gc_seq, gfp_t gfp)
{
        struct nft_set *owner;

        if (nft_trans_gc_space(gc) != 0)
                return gc;

        /* Read the set before handing the batch over to the worker. */
        owner = gc->set;
        nft_trans_gc_queue_work(gc);

        return nft_trans_gc_alloc(owner, gc_seq, gfp);
}

/*
 * Finish an async GC run: queue the batch for the GC worker if it holds
 * any element, otherwise destroy it right away.
 */
void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans)
{
        if (trans->count != 0)
                nft_trans_gc_queue_work(trans);
        else
                nft_trans_gc_destroy(trans);
}

/*
 * Make sure there is room for one more element in a sync GC batch.
 *
 * Warns once and returns NULL when the commit lock check fails. Returns
 * @gc unchanged while it still has free slots. Otherwise the full batch
 * is released through call_rcu() and a fresh batch for the same set,
 * with sequence 0, is returned (NULL if that allocation fails).
 */
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp)
{
        struct nft_set *owner;

        if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net)))
                return NULL;

        if (nft_trans_gc_space(gc) != 0)
                return gc;

        /* Read the set before the batch is handed to the RCU callback. */
        owner = gc->set;
        call_rcu(&gc->rcu, nft_trans_gc_trans_free);

        return nft_trans_gc_alloc(owner, 0, gfp);
}

/*
 * Finish a sync GC run: release the batch through call_rcu() if it holds
 * any element, otherwise destroy it right away. Warns once when the
 * commit lock check fails.
 */
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans)
{
        WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net));

        if (trans->count != 0)
                call_rcu(&trans->rcu, nft_trans_gc_trans_free);
        else
                nft_trans_gc_destroy(trans);
}

/*
 * Collect the expired catchall elements of the batch's set into an async
 * GC batch.
 *
 * Each expired element is marked dead (unless it already is) and added
 * to the batch; a full batch is queued and replaced on the way. Returns
 * the batch to continue with, or NULL if a replacement batch could not
 * be allocated.
 */
struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc,
                                                 unsigned int gc_seq)
{
        struct nft_set_elem_catchall *catchall;
        const struct nft_set *set = gc->set;
        struct nft_set_ext *ext;

        list_for_each_entry_rcu(catchall, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);

                if (!nft_set_elem_expired(ext))
                        continue;

                if (!nft_set_elem_is_dead(ext))
                        nft_set_elem_dead(ext);

                gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
                if (!gc)
                        return NULL;

                nft_trans_gc_elem_add(gc, catchall->elem);
        }

        return gc;
}

/*
 * Collect the expired catchall elements of the batch's set into a sync
 * GC batch.
 *
 * Expiry is judged against a single timestamp taken on entry. Each
 * expired element is deactivated, its catchall entry is destroyed and
 * the element is added to the batch. Returns the batch to continue with,
 * or NULL if nft_trans_gc_queue_sync() fails. Warns once when the commit
 * lock check fails.
 */
struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc)
{
        struct nft_set_elem_catchall *catchall, *tmp;
        u64 now = nft_net_tstamp(gc->net);
        const struct nft_set *set = gc->set;
        struct nft_elem_priv *elem;
        struct nft_set_ext *ext;

        WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net));

        list_for_each_entry_safe(catchall, tmp, &set->catchall_list, list) {
                ext = nft_set_elem_ext(set, catchall->elem);

                if (!__nft_set_elem_expired(ext, now))
                        continue;

                gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
                if (!gc)
                        return NULL;

                /* Keep the element pointer: the catchall entry goes away below. */
                elem = catchall->elem;
                nft_setelem_data_deactivate(gc->net, gc->set, elem);
                nft_setelem_catchall_destroy(catchall);
                nft_trans_gc_elem_add(gc, elem);
        }

        return gc;
}

/*
 * Free every entry on the per-net module request list.
 *
 * Warns once if the commit list is not empty on entry, and once for any
 * request that is not marked done.
 */
static void nf_tables_module_autoload_cleanup(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_module_request *req, *tmp;

        WARN_ON_ONCE(!list_empty(&nft_net->commit_list));

        list_for_each_entry_safe(req, tmp, &nft_net->module_list, list) {
                WARN_ON_ONCE(!req->done);

                list_del(&req->list);
                kfree(req);
        }
}

/*
 * Final step of a commit: hand the committed transactions over to the
 * destroy work and drop the commit mutex.
 *
 * All side effects have to be visible at this point; for example, once a
 * chain named 'foo' has been deleted, a new transaction must not find it
 * anymore.  Memory reclaim happens asynchronously from the work queue to
 * avoid an expensive synchronize_rcu() in the commit phase.
 */
static void nf_tables_commit_release(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *last = NULL;

        if (!list_empty(&nft_net->commit_list)) {
                /* The last transaction carries the netns reference taken
                 * here; put_net marks it as the one that has to drop it.
                 */
                last = list_last_entry(&nft_net->commit_list,
                                       struct nft_trans, list);
                get_net(last->net);
                WARN_ON_ONCE(last->put_net);
                last->put_net = true;

                spin_lock(&nf_tables_destroy_list_lock);
                list_splice_tail_init(&nft_net->commit_list,
                                      &nft_net->destroy_list);
                spin_unlock(&nf_tables_destroy_list_lock);
        }

        nf_tables_module_autoload_cleanup(net);

        /* Only kick the worker when something was actually queued. */
        if (last)
                schedule_work(&nft_net->destroy_work);

        mutex_unlock(&nft_net->commit_mutex);
}

/*
 * Flush the per-netns notify list to the NFNLGRP_NFTABLES group.
 *
 * Consecutive notifications are coalesced into one skb while the running
 * budget (starting at NLMSG_GOODSIZE) is not exhausted and the report flag
 * matches that of the batch head; otherwise the current batch is sent and
 * the offending skb starts a new one.
 */
static void nft_commit_notify(struct net *net, u32 portid)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct sk_buff *batch = NULL, *cur, *tmp;
        unsigned char *tail;
        int room = 0;

        list_for_each_entry_safe(cur, tmp, &nft_net->notify_list, list) {
                if (batch) {
                        room -= cur->len;
                        if (room > 0 &&
                            NFT_CB(cur).report == NFT_CB(batch).report) {
                                /* Fits: append the payload and drop the skb. */
                                tail = skb_put(batch, cur->len);
                                memcpy(tail, cur->data, cur->len);
                                list_del(&cur->list);
                                kfree_skb(cur);
                                continue;
                        }

                        /* Does not fit (or report differs): flush batch. */
                        nfnetlink_send(batch, net, portid, NFNLGRP_NFTABLES,
                                       NFT_CB(batch).report, GFP_KERNEL);
                }

                /* Current skb becomes the head of a fresh batch. */
                batch = cur;
                room = NLMSG_GOODSIZE - cur->len;
                list_del(&cur->list);
        }

        if (batch)
                nfnetlink_send(batch, net, portid, NFNLGRP_NFTABLES,
                               NFT_CB(batch).report, GFP_KERNEL);

        WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
}

/*
 * Make sure @adl holds an audit record for @table.
 *
 * Returns 0 if a record already exists or a new zeroed one was added,
 * -ENOMEM if the allocation fails.
 */
static int nf_tables_commit_audit_alloc(struct list_head *adl,
                                        struct nft_table *table)
{
        struct nft_audit_data *entry;

        /* Reuse an existing record for this table, if any. */
        list_for_each_entry(entry, adl, list) {
                if (entry->table == table)
                        return 0;
        }

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (entry) {
                entry->table = table;
                list_add(&entry->list, adl);
                return 0;
        }

        return -ENOMEM;
}

/* Unlink and free every audit record on @adl. */
static void nf_tables_commit_audit_free(struct list_head *adl)
{
        struct nft_audit_data *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, adl, list) {
                list_del(&entry->list);
                kfree(entry);
        }
}

/* nft audit reports how many elements were added/removed/updated, so a
 * NEW/DELSETELEM transaction counts as the number of elements it carries;
 * every other transaction type counts as a single entry.
 */
static unsigned int nf_tables_commit_audit_entrycount(const struct nft_trans *trans)
{
        if (trans->msg_type == NFT_MSG_NEWSETELEM ||
            trans->msg_type == NFT_MSG_DELSETELEM)
                return nft_trans_container_elem(trans)->nelems;

        return 1;
}

/*
 * Account @trans in the audit record of its table.
 *
 * The entry count is increased by the number of entries the transaction
 * represents, and the recorded operation is lowered to @op when none is
 * set yet or @op is numerically smaller.  Warns once if the table has no
 * record on @adl.
 */
static void nf_tables_commit_audit_collect(struct list_head *adl,
                                           const struct nft_trans *trans, u32 op)
{
        const struct nft_table *table = trans->table;
        struct nft_audit_data *entry;

        list_for_each_entry(entry, adl, list) {
                if (entry->table != table)
                        continue;

                entry->entries += nf_tables_commit_audit_entrycount(trans);
                if (!entry->op || entry->op > op)
                        entry->op = op;
                return;
        }

        WARN_ONCE(1, "table=%s not expected in commit list", table->name);
}

/* Buffer size for the "<table name>:<generation>" audit label.
 * NOTE(review): the extra 22 bytes presumably cover ':' plus the decimal
 * generation and the terminating NUL with some slack -- confirm.
 */
#define AUNFTABLENAMELEN (NFT_TABLE_MAXNAMELEN + 22)

/*
 * Emit one audit record per entry on @adl, labelled "<table>:<generation>",
 * then unlink and free the entry.
 */
static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation)
{
        struct nft_audit_data *entry, *tmp;
        char label[AUNFTABLENAMELEN];

        list_for_each_entry_safe(entry, tmp, adl, list) {
                snprintf(label, sizeof(label), "%s:%u",
                         entry->table->name, generation);
                audit_log_nfcfg(label, entry->table->family, entry->entries,
                                nft2audit_op[entry->op], GFP_KERNEL);

                list_del(&entry->list);
                kfree(entry);
        }
}

/*
 * Run the backend ->commit() hook of every set queued on @set_update_list.
 *
 * Each set is removed from the list first; sets without a commit hook and
 * sets already marked dead are skipped.
 */
static void nft_set_commit_update(struct list_head *set_update_list)
{
        struct nft_set *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, set_update_list, pending_update) {
                list_del_init(&cur->pending_update);

                if (cur->ops->commit && !cur->dead)
                        cur->ops->commit(cur);
        }
}

/*
 * Advance the per-netns GC sequence by one and return the new value.
 *
 * After the bump the counter is odd, which is the "busy" mark; the value
 * returned here is what nft_gc_seq_end() expects.
 */
static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net)
{
        unsigned int busy_seq = READ_ONCE(nft_net->gc_seq) + 1;

        WRITE_ONCE(nft_net->gc_seq, busy_seq);

        return busy_seq;
}

/* Publish @gc_seq + 1 as the new GC sequence, ending the busy section. */
static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq)
{
        unsigned int idle_seq = gc_seq + 1;

        WRITE_ONCE(nft_net->gc_seq, idle_seq);
}

/*
 * Commit the batch of transactions queued on the per-netns commit list.
 *
 * Phases, in order:
 *   - reject unbound anonymous sets / binding chains from this batch,
 *   - validate the ruleset and commit flow rule offload,
 *   - prepare the next rule generation and allocate audit records,
 *   - publish the new generation, bump base_seq and mark GC busy,
 *   - apply each transaction and queue its netlink notification,
 *   - run set backend commit hooks, send notifications, log audit records,
 *   - hand the remaining transactions to nf_tables_commit_release().
 *
 * Returns 0 on success or a negative errno.  The commit mutex is released
 * on the success paths (directly for an empty list, otherwise through
 * nf_tables_commit_release()); the error returns leave it held.
 * NOTE(review): presumably the caller then runs the abort callback, which
 * unlocks -- confirm against the nfnetlink batch code.
 */
static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        const struct nlmsghdr *nlh = nlmsg_hdr(skb);
        struct nft_trans_binding *trans_binding;
        struct nft_trans *trans, *next;
        unsigned int base_seq, gc_seq;
        LIST_HEAD(set_update_list);
        struct nft_trans_elem *te;
        struct nft_chain *chain;
        struct nft_table *table;
        struct nft_ctx ctx;
        LIST_HEAD(adl);
        int err;

        /* Nothing to commit: just drop the commit mutex. */
        if (list_empty(&nft_net->commit_list)) {
                mutex_unlock(&nft_net->commit_mutex);
                return 0;
        }

        nft_ctx_init(&ctx, net, skb, nlh, NFPROTO_UNSPEC, NULL, NULL, NULL);

        /* Refuse batches that create an anonymous set or a binding chain
         * without binding it.
         */
        list_for_each_entry(trans_binding, &nft_net->binding_list, binding_list) {
                trans = &trans_binding->nft_trans;
                switch (trans->msg_type) {
                case NFT_MSG_NEWSET:
                        if (!nft_trans_set_update(trans) &&
                            nft_set_is_anonymous(nft_trans_set(trans)) &&
                            !nft_trans_set_bound(trans)) {
                                pr_warn_once("nftables ruleset with unbound set\n");
                                return -EINVAL;
                        }
                        break;
                case NFT_MSG_NEWCHAIN:
                        if (!nft_trans_chain_update(trans) &&
                            nft_chain_binding(nft_trans_chain(trans)) &&
                            !nft_trans_chain_bound(trans)) {
                                pr_warn_once("nftables ruleset with unbound chain\n");
                                return -EINVAL;
                        }
                        break;
                default:
                        WARN_ONCE(1, "Unhandled bind type %d", trans->msg_type);
                        break;
                }
        }

        /* 0. Validate ruleset, otherwise roll back for error reporting. */
        if (nf_tables_validate(net) < 0) {
                nft_net->validate_state = NFT_VALIDATE_DO;
                return -EAGAIN;
        }

        err = nft_flow_rule_offload_commit(net);
        if (err < 0)
                return err;

        /* 1.  Allocate space for next generation rules_gen_X[] */
        list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
                /* Note: shadows the function-scope 'table'. */
                struct nft_table *table = trans->table;
                int ret;

                /* One audit record per table touched by this batch. */
                ret = nf_tables_commit_audit_alloc(&adl, table);
                if (ret) {
                        nf_tables_commit_chain_prepare_cancel(net);
                        nf_tables_commit_audit_free(&adl);
                        return ret;
                }
                if (trans->msg_type == NFT_MSG_NEWRULE ||
                    trans->msg_type == NFT_MSG_DELRULE) {
                        chain = nft_trans_rule_chain(trans);

                        ret = nf_tables_commit_chain_prepare(net, chain);
                        if (ret < 0) {
                                nf_tables_commit_chain_prepare_cancel(net);
                                nf_tables_commit_audit_free(&adl);
                                return ret;
                        }
                }
        }

        /* step 2.  Make rules_gen_X visible to packet path */
        list_for_each_entry(table, &nft_net->tables, list) {
                list_for_each_entry(chain, &table->chains, list)
                        nf_tables_commit_chain(net, chain);
        }

        /*
         * Bump generation counter, invalidate any dump in progress.
         * Cannot fail after this point.
         */
        base_seq = READ_ONCE(nft_net->base_seq);
        /* Skip 0 on wraparound; nf_tables_valid_genid() accepts genid 0
         * unconditionally.
         */
        while (++base_seq == 0)
                ;

        WRITE_ONCE(nft_net->base_seq, base_seq);

        gc_seq = nft_gc_seq_begin(nft_net);

        /* step 3. Start new generation, rules_gen_X now in use. */
        net->nft.gencursor = nft_gencursor_next(net);

        /* Apply each transaction.  Cases that do not call
         * nft_trans_destroy() leave the transaction on commit_list, which
         * nf_tables_commit_release() later splices onto the destroy list.
         */
        list_for_each_entry_safe(trans, next, &nft_net->commit_list, list) {
                struct nft_table *table = trans->table;

                nft_ctx_update(&ctx, trans);

                nf_tables_commit_audit_collect(&adl, trans, trans->msg_type);
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
                                /* Update flag already cleared: nothing left
                                 * to do for this transaction.
                                 */
                                if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
                                        nft_trans_destroy(trans);
                                        break;
                                }
                                if (table->flags & NFT_TABLE_F_DORMANT)
                                        nf_tables_table_disable(net, table);

                                table->flags &= ~__NFT_TABLE_F_UPDATE;
                        } else {
                                nft_clear(net, table);
                        }
                        nf_tables_table_notify(&ctx, NFT_MSG_NEWTABLE);
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELTABLE:
                case NFT_MSG_DESTROYTABLE:
                        list_del_rcu(&table->list);
                        nf_tables_table_notify(&ctx, trans->msg_type);
                        break;
                case NFT_MSG_NEWCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                nft_chain_commit_update(nft_trans_container_chain(trans));
                                nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN,
                                                       &nft_trans_chain_hooks(trans));
                                /* Move the hooks carried by the transaction
                                 * onto the base chain.
                                 */
                                list_splice(&nft_trans_chain_hooks(trans),
                                            &nft_trans_basechain(trans)->hook_list);
                                /* trans destroyed after rcu grace period */
                        } else {
                                nft_chain_commit_drop_policy(nft_trans_container_chain(trans));
                                nft_clear(net, nft_trans_chain(trans));
                                nf_tables_chain_notify(&ctx, NFT_MSG_NEWCHAIN, NULL);
                                nft_trans_destroy(trans);
                        }
                        break;
                case NFT_MSG_DELCHAIN:
                case NFT_MSG_DESTROYCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
                                                       &nft_trans_chain_hooks(trans));
                                if (!(table->flags & NFT_TABLE_F_DORMANT)) {
                                        nft_netdev_unregister_hooks(net,
                                                                    &nft_trans_chain_hooks(trans),
                                                                    true);
                                }
                        } else {
                                nft_chain_del(nft_trans_chain(trans));
                                nf_tables_chain_notify(&ctx, NFT_MSG_DELCHAIN,
                                                       NULL);
                                nf_tables_unregister_hook(ctx.net, ctx.table,
                                                          nft_trans_chain(trans));
                        }
                        break;
                case NFT_MSG_NEWRULE:
                        nft_clear(net, nft_trans_rule(trans));
                        nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
                                              NFT_MSG_NEWRULE);
                        if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));

                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELRULE:
                case NFT_MSG_DESTROYRULE:
                        list_del_rcu(&nft_trans_rule(trans)->list);
                        nf_tables_rule_notify(&ctx, nft_trans_rule(trans),
                                              trans->msg_type);
                        nft_rule_expr_deactivate(&ctx, nft_trans_rule(trans),
                                                 NFT_TRANS_COMMIT);

                        if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));
                        break;
                case NFT_MSG_NEWSET:
                        list_del(&nft_trans_container_set(trans)->list_trans_newset);
                        if (nft_trans_set_update(trans)) {
                                struct nft_set *set = nft_trans_set(trans);

                                WRITE_ONCE(set->timeout, nft_trans_set_timeout(trans));
                                WRITE_ONCE(set->gc_int, nft_trans_set_gc_int(trans));

                                /* A zero size in the transaction leaves the
                                 * current set size untouched.
                                 */
                                if (nft_trans_set_size(trans))
                                        WRITE_ONCE(set->size, nft_trans_set_size(trans));
                        } else {
                                nft_clear(net, nft_trans_set(trans));
                                /* This avoids hitting -EBUSY when deleting the table
                                 * from the transaction.
                                 */
                                if (nft_set_is_anonymous(nft_trans_set(trans)) &&
                                    !list_empty(&nft_trans_set(trans)->bindings))
                                        nft_use_dec(&table->use);
                        }
                        nf_tables_set_notify(&ctx, nft_trans_set(trans),
                                             NFT_MSG_NEWSET, GFP_KERNEL);
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELSET:
                case NFT_MSG_DESTROYSET:
                        /* Dead sets are skipped by nft_set_commit_update(). */
                        nft_trans_set(trans)->dead = 1;
                        list_del_rcu(&nft_trans_set(trans)->list);
                        nf_tables_set_notify(&ctx, nft_trans_set(trans),
                                             trans->msg_type, GFP_KERNEL);
                        break;
                case NFT_MSG_NEWSETELEM:
                        te = nft_trans_container_elem(trans);

                        nft_trans_elems_add(&ctx, te);

                        /* Queue the set once for its backend ->commit(). */
                        if (te->set->ops->commit &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELSETELEM:
                case NFT_MSG_DESTROYSETELEM:
                        te = nft_trans_container_elem(trans);

                        nft_trans_elems_remove(&ctx, te);

                        /* Queue the set once for its backend ->commit(). */
                        if (te->set->ops->commit &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        break;
                case NFT_MSG_NEWOBJ:
                        if (nft_trans_obj_update(trans)) {
                                nft_obj_commit_update(&ctx, trans);
                                nf_tables_obj_notify(&ctx,
                                                     nft_trans_obj(trans),
                                                     NFT_MSG_NEWOBJ);
                        } else {
                                nft_clear(net, nft_trans_obj(trans));
                                nf_tables_obj_notify(&ctx,
                                                     nft_trans_obj(trans),
                                                     NFT_MSG_NEWOBJ);
                                nft_trans_destroy(trans);
                        }
                        break;
                case NFT_MSG_DELOBJ:
                case NFT_MSG_DESTROYOBJ:
                        nft_obj_del(nft_trans_obj(trans));
                        nf_tables_obj_notify(&ctx, nft_trans_obj(trans),
                                             trans->msg_type);
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                nft_trans_flowtable(trans)->data.flags =
                                        nft_trans_flowtable_flags(trans);
                                nf_tables_flowtable_notify(&ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
                                                           NFT_MSG_NEWFLOWTABLE);
                                /* Move the hooks carried by the transaction
                                 * onto the flowtable.
                                 */
                                list_splice(&nft_trans_flowtable_hooks(trans),
                                            &nft_trans_flowtable(trans)->hook_list);
                        } else {
                                nft_clear(net, nft_trans_flowtable(trans));
                                nf_tables_flowtable_notify(&ctx,
                                                           nft_trans_flowtable(trans),
                                                           NULL,
                                                           NFT_MSG_NEWFLOWTABLE);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_DELFLOWTABLE:
                case NFT_MSG_DESTROYFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                nf_tables_flowtable_notify(&ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
                                                           trans->msg_type);
                                nft_unregister_flowtable_net_hooks(net,
                                                                   nft_trans_flowtable(trans),
                                                                   &nft_trans_flowtable_hooks(trans));
                        } else {
                                list_del_rcu(&nft_trans_flowtable(trans)->list);
                                nf_tables_flowtable_notify(&ctx,
                                                           nft_trans_flowtable(trans),
                                                           NULL,
                                                           trans->msg_type);
                                nft_unregister_flowtable_net_hooks(net,
                                                nft_trans_flowtable(trans),
                                                &nft_trans_flowtable(trans)->hook_list);
                        }
                        break;
                }
        }

        /* Run backend ->commit() for every set touched by element changes. */
        nft_set_commit_update(&set_update_list);

        nft_commit_notify(net, NETLINK_CB(skb).portid);
        nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
        /* Also unlinks and frees every record on adl. */
        nf_tables_commit_audit_log(&adl, nft_net->base_seq);

        nft_gc_seq_end(nft_net, gc_seq);
        nft_net->validate_state = NFT_VALIDATE_SKIP;
        /* Releases the commit mutex. */
        nf_tables_commit_release(net);

        return 0;
}

static void nf_tables_module_autoload(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_module_request *req, *next;
        LIST_HEAD(module_list);

        list_splice_init(&nft_net->module_list, &module_list);
        mutex_unlock(&nft_net->commit_mutex);
        list_for_each_entry_safe(req, next, &module_list, list) {
                request_module("%s", req->module);
                req->done = true;
        }
        mutex_lock(&nft_net->commit_mutex);
        list_splice(&module_list, &nft_net->module_list);
}

/*
 * Release the object created by an aborted NEW* transaction and free the
 * transaction itself.
 *
 * For chain and flowtable updates only the hooks carried by the transaction
 * are destroyed.  Message types not listed below only have the transaction
 * freed.
 */
static void nf_tables_abort_release(struct nft_trans *trans)
{
        struct nft_ctx ctx = { };

        nft_ctx_update(&ctx, trans);

        switch (trans->msg_type) {
        case NFT_MSG_NEWTABLE:
                nf_tables_table_destroy(trans->table);
                break;
        case NFT_MSG_NEWCHAIN:
                if (!nft_trans_chain_update(trans)) {
                        nf_tables_chain_destroy(nft_trans_chain(trans));
                        break;
                }
                nft_hooks_destroy(&nft_trans_chain_hooks(trans));
                break;
        case NFT_MSG_NEWRULE:
                nf_tables_rule_destroy(&ctx, nft_trans_rule(trans));
                break;
        case NFT_MSG_NEWSET:
                nft_set_destroy(&ctx, nft_trans_set(trans));
                break;
        case NFT_MSG_NEWSETELEM:
                nft_trans_set_elem_destroy(&ctx, nft_trans_container_elem(trans));
                break;
        case NFT_MSG_NEWOBJ:
                nft_obj_destroy(&ctx, nft_trans_obj(trans));
                break;
        case NFT_MSG_NEWFLOWTABLE:
                if (!nft_trans_flowtable_update(trans)) {
                        nf_tables_flowtable_destroy(nft_trans_flowtable(trans));
                        break;
                }
                nft_hooks_destroy(&nft_trans_flowtable_hooks(trans));
                break;
        default:
                break;
        }

        kfree(trans);
}

/*
 * Run the backend ->abort() hook of every set queued on @set_update_list.
 *
 * Each set is removed from the list first; sets whose backend has no abort
 * hook are skipped.
 */
static void nft_set_abort_update(struct list_head *set_update_list)
{
        struct nft_set *cur, *tmp;

        list_for_each_entry_safe(cur, tmp, set_update_list, pending_update) {
                list_del_init(&cur->pending_update);

                if (cur->ops->abort)
                        cur->ops->abort(cur);
        }
}

/*
 * Roll back every transaction queued on the per-netns commit list.
 *
 * Transactions are undone in reverse order.  NEW* transactions unlink the
 * object they added, DEL and DESTROY transactions re-activate the object and
 * restore the use counters.  Transactions that still own an object to be
 * released stay on the commit list and are freed by
 * nf_tables_abort_release() after an RCU grace period.
 *
 * Returns 0, or -EAGAIN when @action is NFNL_ABORT_VALIDATE and ruleset
 * validation fails.  The rollback itself is performed in either case.
 */
static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_trans *trans, *next;
        LIST_HEAD(set_update_list);
        struct nft_trans_elem *te;
        struct nft_ctx ctx = {
                .net = net,
        };
        int err = 0;

        if (action == NFNL_ABORT_VALIDATE &&
            nf_tables_validate(net) < 0)
                err = -EAGAIN;

        /* Undo in reverse order of submission. */
        list_for_each_entry_safe_reverse(trans, next, &nft_net->commit_list,
                                         list) {
                struct nft_table *table = trans->table;

                nft_ctx_update(&ctx, trans);

                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
                                if (!(table->flags & __NFT_TABLE_F_UPDATE)) {
                                        nft_trans_destroy(trans);
                                        break;
                                }
                                /* Restore the dormant state recorded in the
                                 * __NFT_TABLE_F_WAS_* flags.
                                 */
                                if (table->flags & __NFT_TABLE_F_WAS_DORMANT) {
                                        nf_tables_table_disable(net, table);
                                        table->flags |= NFT_TABLE_F_DORMANT;
                                } else if (table->flags & __NFT_TABLE_F_WAS_AWAKEN) {
                                        table->flags &= ~NFT_TABLE_F_DORMANT;
                                }
                                if (table->flags & __NFT_TABLE_F_WAS_ORPHAN) {
                                        table->flags &= ~NFT_TABLE_F_OWNER;
                                        table->nlpid = 0;
                                }
                                table->flags &= ~__NFT_TABLE_F_UPDATE;
                                nft_trans_destroy(trans);
                        } else {
                                /* Table itself is destroyed later by
                                 * nf_tables_abort_release().
                                 */
                                list_del_rcu(&table->list);
                        }
                        break;
                case NFT_MSG_DELTABLE:
                case NFT_MSG_DESTROYTABLE:
                        nft_clear(trans->net, table);
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                if (!(table->flags & NFT_TABLE_F_DORMANT)) {
                                        nft_netdev_unregister_hooks(net,
                                                                    &nft_trans_chain_hooks(trans),
                                                                    true);
                                }
                                free_percpu(nft_trans_chain_stats(trans));
                                kfree(nft_trans_chain_name(trans));
                                nft_trans_destroy(trans);
                        } else {
                                if (nft_trans_chain_bound(trans)) {
                                        nft_trans_destroy(trans);
                                        break;
                                }
                                nft_use_dec_restore(&table->use);
                                nft_chain_del(nft_trans_chain(trans));
                                nf_tables_unregister_hook(trans->net, table,
                                                          nft_trans_chain(trans));
                        }
                        break;
                case NFT_MSG_DELCHAIN:
                case NFT_MSG_DESTROYCHAIN:
                        if (nft_trans_chain_update(trans)) {
                                /* Give the hooks back to the base chain. */
                                list_splice(&nft_trans_chain_hooks(trans),
                                            &nft_trans_basechain(trans)->hook_list);
                        } else {
                                nft_use_inc_restore(&table->use);
                                nft_clear(trans->net, nft_trans_chain(trans));
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWRULE:
                        if (nft_trans_rule_bound(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        nft_use_dec_restore(&nft_trans_rule_chain(trans)->use);
                        list_del_rcu(&nft_trans_rule(trans)->list);
                        nft_rule_expr_deactivate(&ctx,
                                                 nft_trans_rule(trans),
                                                 NFT_TRANS_ABORT);
                        if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));
                        break;
                case NFT_MSG_DELRULE:
                case NFT_MSG_DESTROYRULE:
                        nft_use_inc_restore(&nft_trans_rule_chain(trans)->use);
                        nft_clear(trans->net, nft_trans_rule(trans));
                        nft_rule_expr_activate(&ctx, nft_trans_rule(trans));
                        if (nft_trans_rule_chain(trans)->flags & NFT_CHAIN_HW_OFFLOAD)
                                nft_flow_rule_destroy(nft_trans_flow_rule(trans));

                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWSET:
                        list_del(&nft_trans_container_set(trans)->list_trans_newset);
                        if (nft_trans_set_update(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        nft_use_dec_restore(&table->use);
                        if (nft_trans_set_bound(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        nft_trans_set(trans)->dead = 1;
                        list_del_rcu(&nft_trans_set(trans)->list);
                        break;
                case NFT_MSG_DELSET:
                case NFT_MSG_DESTROYSET:
                        nft_use_inc_restore(&table->use);
                        nft_clear(trans->net, nft_trans_set(trans));
                        if (nft_trans_set(trans)->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                                nft_map_activate(&ctx, nft_trans_set(trans));

                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWSETELEM:
                        if (nft_trans_elem_set_bound(trans)) {
                                nft_trans_destroy(trans);
                                break;
                        }
                        te = nft_trans_container_elem(trans);
                        if (!nft_trans_elems_new_abort(&ctx, te)) {
                                nft_trans_destroy(trans);
                                break;
                        }

                        /* Queue the set once for its backend ->abort(). */
                        if (te->set->ops->abort &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        break;
                case NFT_MSG_DELSETELEM:
                case NFT_MSG_DESTROYSETELEM:
                        te = nft_trans_container_elem(trans);

                        nft_trans_elems_destroy_abort(&ctx, te);

                        /* Queue the set once for its backend ->abort(). */
                        if (te->set->ops->abort &&
                            list_empty(&te->set->pending_update)) {
                                list_add_tail(&te->set->pending_update,
                                              &set_update_list);
                        }
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWOBJ:
                        if (nft_trans_obj_update(trans)) {
                                nft_obj_destroy(&ctx, nft_trans_obj_newobj(trans));
                                nft_trans_destroy(trans);
                        } else {
                                nft_use_dec_restore(&table->use);
                                nft_obj_del(nft_trans_obj(trans));
                        }
                        break;
                case NFT_MSG_DELOBJ:
                case NFT_MSG_DESTROYOBJ:
                        nft_use_inc_restore(&table->use);
                        nft_clear(trans->net, nft_trans_obj(trans));
                        nft_trans_destroy(trans);
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                nft_unregister_flowtable_net_hooks(net,
                                                nft_trans_flowtable(trans),
                                                &nft_trans_flowtable_hooks(trans));
                        } else {
                                nft_use_dec_restore(&table->use);
                                list_del_rcu(&nft_trans_flowtable(trans)->list);
                                nft_unregister_flowtable_net_hooks(net,
                                                nft_trans_flowtable(trans),
                                                &nft_trans_flowtable(trans)->hook_list);
                        }
                        break;
                case NFT_MSG_DELFLOWTABLE:
                case NFT_MSG_DESTROYFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
                                /* Give the hooks back to the flowtable. */
                                list_splice(&nft_trans_flowtable_hooks(trans),
                                            &nft_trans_flowtable(trans)->hook_list);
                        } else {
                                nft_use_inc_restore(&table->use);
                                nft_clear(trans->net, nft_trans_flowtable(trans));
                        }
                        nft_trans_destroy(trans);
                        break;
                }
        }

        WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));

        /* Run backend ->abort() for every set touched by element changes. */
        nft_set_abort_update(&set_update_list);

        /* Wait for an RCU grace period before releasing the objects that
         * were unlinked above.
         */
        synchronize_rcu();

        /* Whatever is still on the commit list owns an object to release. */
        list_for_each_entry_safe_reverse(trans, next,
                                         &nft_net->commit_list, list) {
                nft_trans_list_del(trans);
                nf_tables_abort_release(trans);
        }

        return err;
}

/*
 * nfnetlink ->abort callback: undo the pending transaction.
 *
 * The rollback done by __nf_tables_abort() is bracketed by a GC sequence
 * begin/end pair.  The commit mutex is released here before returning, so
 * it must be held on entry.
 *
 * Returns the result of __nf_tables_abort().
 */
static int nf_tables_abort(struct net *net, struct sk_buff *skb,
                           enum nfnl_abort_action action)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        unsigned int seq = nft_gc_seq_begin(nft_net);
        int rc = __nf_tables_abort(net, action);

        nft_gc_seq_end(nft_net, seq);

        WARN_ON_ONCE(!list_empty(&nft_net->commit_list));

        /* Module autoload drops and re-takes the mutex for a while, which
         * is why it may only run once the GC sequence has been updated.
         */
        if (action != NFNL_ABORT_AUTOLOAD)
                nf_tables_module_autoload_cleanup(net);
        else
                nf_tables_module_autoload(net);

        mutex_unlock(&nft_net->commit_mutex);

        return rc;
}

/*
 * nfnetlink ->valid_genid callback.
 *
 * Takes the commit mutex, refreshes the per-netns timestamp and checks the
 * generation id supplied by userspace.  A genid of 0 always matches;
 * otherwise it has to equal the current base sequence.
 *
 * On success the commit mutex stays locked: the commit or abort function
 * is responsible for releasing it.  On mismatch it is released here.
 */
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
        struct nftables_pernet *nft_net = nft_pernet(net);

        mutex_lock(&nft_net->commit_mutex);
        nft_net->tstamp = get_jiffies_64();

        if (genid == 0 || nft_net->base_seq == genid)
                return true;

        mutex_unlock(&nft_net->commit_mutex);
        return false;
}

/* nfnetlink subsystem descriptor for nf_tables. */
static const struct nfnetlink_subsystem nf_tables_subsys = {
        .owner = THIS_MODULE,
        .name = "nf_tables",
        .subsys_id = NFNL_SUBSYS_NFTABLES,
        /* per-message callback table */
        .cb_count = NFT_MSG_MAX,
        .cb = nf_tables_cb,
        /* batch hooks: valid_genid takes the commit mutex, commit/abort drop it */
        .valid_genid = nf_tables_valid_genid,
        .commit = nf_tables_commit,
        .abort = nf_tables_abort,
};

/*
 * Check that a base chain is of the expected chain type.
 *
 * Non-base chains always pass.  Returns 0 on success or -EOPNOTSUPP when
 * the base chain's type differs from @type.
 */
int nft_chain_validate_dependency(const struct nft_chain *chain,
                                  enum nft_chain_types type)
{
        const struct nft_base_chain *base;

        if (!nft_is_base_chain(chain))
                return 0;

        base = nft_base_chain(chain);

        return base->type->type != type ? -EOPNOTSUPP : 0;
}
EXPORT_SYMBOL_GPL(nft_chain_validate_dependency);

/*
 * Check that a base chain is attached to one of the permitted hooks.
 *
 * @hook_flags is a bitmask indexed by hook number.  Non-base chains always
 * pass.  Returns 0 when allowed, -EOPNOTSUPP otherwise.
 */
int nft_chain_validate_hooks(const struct nft_chain *chain,
                             unsigned int hook_flags)
{
        struct nft_base_chain *base;

        if (!nft_is_base_chain(chain))
                return 0;

        base = nft_base_chain(chain);
        if (hook_flags & (1 << base->ops.hooknum))
                return 0;

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(nft_chain_validate_hooks);

/**
 *        nft_parse_u32_check - read a u32 attribute, enforcing an upper bound
 *
 *        @attr: netlink attribute holding a big-endian 32 bit value
 *        @max: largest value that is accepted
 *        @dest: where the host-order value is stored on success
 *
 *        Converts the attribute payload to host byte order.  If it exceeds
 *        @max, -ERANGE is returned and @dest is left untouched; otherwise
 *        the value is written to @dest and 0 is returned.
 */
int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest)
{
        const u32 host_val = ntohl(nla_get_be32(attr));

        if (host_val > max)
                return -ERANGE;

        *dest = host_val;

        return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_u32_check);

/*
 * Translate a register number from a netlink attribute into the internal
 * 32 bit register index.
 *
 * Numbers in the NFT_REG_VERDICT..NFT_REG_4 range are scaled by
 * NFT_REG_SIZE / NFT_REG32_SIZE; numbers in the NFT_REG32_00..NFT_REG32_15
 * range are rebased so that NFT_REG32_00 maps to that same ratio.
 *
 * Returns 0 and stores the index in @preg, or -ERANGE for any other number.
 */
static int nft_parse_register(const struct nlattr *attr, u32 *preg)
{
        const unsigned int nlreg = ntohl(nla_get_be32(attr));

        switch (nlreg) {
        case NFT_REG_VERDICT...NFT_REG_4:
                *preg = nlreg * NFT_REG_SIZE / NFT_REG32_SIZE;
                return 0;
        case NFT_REG32_00...NFT_REG32_15:
                *preg = nlreg + NFT_REG_SIZE / NFT_REG32_SIZE - NFT_REG32_00;
                return 0;
        default:
                return -ERANGE;
        }
}

/**
 *        nft_dump_register - emit a register number as a netlink attribute
 *
 *        @skb: socket buffer to append to
 *        @attr: attribute number to use
 *        @reg: internal (32 bit) register index
 *
 *        Inverse of the register parsing step.  For compatibility, an index
 *        that is a multiple of NFT_REG_SIZE / NFT_REG32_SIZE is reported as
 *        the matching 128 bit register number; any other index is reported
 *        in the NFT_REG32_00 based numbering.
 *
 *        Returns the result of nla_put_be32().
 */
int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg)
{
        unsigned int nlreg;

        if (reg % (NFT_REG_SIZE / NFT_REG32_SIZE))
                nlreg = reg - NFT_REG_SIZE / NFT_REG32_SIZE + NFT_REG32_00;
        else
                nlreg = reg / (NFT_REG_SIZE / NFT_REG32_SIZE);

        return nla_put_be32(skb, attr, htonl(nlreg));
}
EXPORT_SYMBOL_GPL(nft_dump_register);

/*
 * Sanity-check a register range used as a load source.
 *
 * Returns -EINVAL if @reg lies below the first data register or @len is
 * zero, -ERANGE if the range runs past the end of the register file, and
 * 0 otherwise.
 */
static int nft_validate_register_load(enum nft_registers reg, unsigned int len)
{
        if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE || len == 0)
                return -EINVAL;

        if (reg * NFT_REG32_SIZE + len > sizeof_field(struct nft_regs, data))
                return -ERANGE;

        return 0;
}

/*
 * Parse and validate a source register attribute.
 *
 * @ctx: context whose reg_inited bitmap records earlier register stores
 * @attr: netlink attribute carrying the register number
 * @sreg: receives the internal register index on success
 * @len: number of bytes that will be loaded
 *
 * Besides the range checks, every 32 bit register covered by the load must
 * already have been written by an earlier store; otherwise -ENODATA is
 * returned.  Other failures are reported as -ERANGE or -EINVAL.
 */
int nft_parse_register_load(const struct nft_ctx *ctx,
                            const struct nlattr *attr, u8 *sreg, u32 len)
{
        u32 first, past_last;
        int uninit, rc;

        rc = nft_parse_register(attr, &first);
        if (rc >= 0)
                rc = nft_validate_register_load(first, len);
        if (rc < 0)
                return rc;

        past_last = DIV_ROUND_UP(len, NFT_REG32_SIZE) + first;

        /* nft_validate_register_load() should already have rejected this */
        if (WARN_ON_ONCE(past_last > NFT_REG32_NUM))
                return -EINVAL;

        /* first register at or after 'first' that never saw a store */
        uninit = find_next_zero_bit(ctx->reg_inited, NFT_REG32_NUM, first);

        /* does it fall inside the range we are about to read? */
        if (uninit < past_last)
                return -ENODATA;

        *sreg = first;

        return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_register_load);

/*
 * Record that @len bytes starting at 32 bit register @reg were written.
 *
 * Sets one bit in the context's reg_inited bitmap per register touched.
 * Warns once and does nothing when @len is zero or @reg is negative.
 */
static void nft_saw_register_store(const struct nft_ctx *__ctx,
                                   int reg, unsigned int len)
{
        struct nft_ctx *ctx;

        if (WARN_ON_ONCE(len == 0 || reg < 0))
                return;

        /* callers hand in a const ctx; the bitmap is updated regardless */
        ctx = (struct nft_ctx *)__ctx;

        bitmap_set(ctx->reg_inited, reg, DIV_ROUND_UP(len, NFT_REG32_SIZE));
}

/*
 * Validate a destination register and mark it as initialised.
 *
 * The verdict register accepts only NFT_DATA_VERDICT; when @data is given
 * and carries a jump/goto verdict, the target chain is validated as well.
 * Every other register accepts only NFT_DATA_VALUE and gets the same range
 * checks as a load.
 *
 * Returns 0 on success, -EINVAL / -ERANGE on bad input, or the error from
 * nft_chain_validate().
 */
static int nft_validate_register_store(const struct nft_ctx *ctx,
                                       enum nft_registers reg,
                                       const struct nft_data *data,
                                       enum nft_data_types type,
                                       unsigned int len)
{
        int rc;

        if (reg == NFT_REG_VERDICT) {
                if (type != NFT_DATA_VERDICT)
                        return -EINVAL;

                if (data != NULL &&
                    (data->verdict.code == NFT_GOTO ||
                     data->verdict.code == NFT_JUMP)) {
                        rc = nft_chain_validate(ctx, data->verdict.chain);
                        if (rc < 0)
                                return rc;
                }
        } else {
                if (type != NFT_DATA_VALUE)
                        return -EINVAL;

                if (reg < NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE ||
                    len == 0)
                        return -EINVAL;

                if (reg * NFT_REG32_SIZE + len >
                    sizeof_field(struct nft_regs, data))
                        return -ERANGE;
        }

        nft_saw_register_store(ctx, reg, len);

        return 0;
}

/*
 * Parse and validate a destination register attribute.
 *
 * @ctx: expression context
 * @attr: netlink attribute carrying the register number
 * @dreg: receives the internal register index on success
 * @data: data about to be stored, may be NULL
 * @type: type of the data being stored
 * @len: number of bytes being stored
 *
 * Returns 0 on success or a negative errno from parsing/validation.
 */
int nft_parse_register_store(const struct nft_ctx *ctx,
                             const struct nlattr *attr, u8 *dreg,
                             const struct nft_data *data,
                             enum nft_data_types type, unsigned int len)
{
        u32 idx;
        int rc;

        rc = nft_parse_register(attr, &idx);
        if (rc >= 0)
                rc = nft_validate_register_store(ctx, idx, data, type, len);
        if (rc < 0)
                return rc;

        *dreg = idx;

        return 0;
}
EXPORT_SYMBOL_GPL(nft_parse_register_store);

/* Attribute policy applied when parsing a nested verdict. */
static const struct nla_policy nft_verdict_policy[NFTA_VERDICT_MAX + 1] = {
        [NFTA_VERDICT_CODE] = {
                .type = NLA_U32,
        },
        /* target chain referenced by name ... */
        [NFTA_VERDICT_CHAIN] = {
                .type = NLA_STRING,
                .len = NFT_CHAIN_MAXNAMELEN - 1,
        },
        /* ... or by id */
        [NFTA_VERDICT_CHAIN_ID] = {
                .type = NLA_U32,
        },
};

/*
 * Parse a nested verdict attribute into @data.
 *
 * @ctx: expression context (netns and table used for chain lookup)
 * @data: destination; zeroed first so padding compares equal under memcmp
 * @desc: data description; desc->len is set to the verdict size on success
 * @nla: nested attribute holding NFTA_VERDICT_* attributes
 *
 * For jump/goto verdicts the target chain is looked up by name or by id in
 * the next generation.  Base chains, already bound chains and (for set
 * elements) binding chains are rejected.  On success a use reference is
 * taken on the chain; nft_verdict_uninit() drops it again.
 *
 * Returns 0 on success or a negative errno.
 */
static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
                            struct nft_data_desc *desc, const struct nlattr *nla)
{
        u8 genmask = nft_genmask_next(ctx->net);
        struct nlattr *tb[NFTA_VERDICT_MAX + 1];
        struct nft_chain *chain;
        int err;

        err = nla_parse_nested_deprecated(tb, NFTA_VERDICT_MAX, nla,
                                          nft_verdict_policy, NULL);
        if (err < 0)
                return err;

        if (!tb[NFTA_VERDICT_CODE])
                return -EINVAL;

        /* zero padding hole for memcmp */
        memset(data, 0, sizeof(*data));
        data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE]));

        switch (data->verdict.code) {
        case NF_ACCEPT:
        case NF_DROP:
        case NF_QUEUE:
                /* netfilter verdicts: nothing else to set up */
                break;
        case NFT_CONTINUE:
        case NFT_BREAK:
        case NFT_RETURN:
                /* nf_tables verdicts without a target chain */
                break;
        case NFT_JUMP:
        case NFT_GOTO:
                if (tb[NFTA_VERDICT_CHAIN]) {
                        chain = nft_chain_lookup(ctx->net, ctx->table,
                                                 tb[NFTA_VERDICT_CHAIN],
                                                 genmask);
                } else if (tb[NFTA_VERDICT_CHAIN_ID]) {
                        chain = nft_chain_lookup_byid(ctx->net, ctx->table,
                                                      tb[NFTA_VERDICT_CHAIN_ID],
                                                      genmask);
                } else {
                        return -EINVAL;
                }

                /* one error check covers both lookup variants */
                if (IS_ERR(chain))
                        return PTR_ERR(chain);
                if (nft_is_base_chain(chain))
                        return -EOPNOTSUPP;
                if (nft_chain_is_bound(chain))
                        return -EINVAL;
                if (desc->flags & NFT_DATA_DESC_SETELEM &&
                    chain->flags & NFT_CHAIN_BINDING)
                        return -EINVAL;
                if (!nft_use_inc(&chain->use))
                        return -EMFILE;

                data->verdict.chain = chain;
                break;
        default:
                return -EINVAL;
        }

        desc->len = sizeof(data->verdict);

        return 0;
}

/*
 * Release a verdict: jump/goto verdicts drop the use reference they hold
 * on their target chain, all other verdict codes need no cleanup.
 */
static void nft_verdict_uninit(const struct nft_data *data)
{
        if (data->verdict.code == NFT_JUMP ||
            data->verdict.code == NFT_GOTO)
                nft_use_dec(&data->verdict.chain->use);
}

/*
 * Dump a verdict as a nested attribute of the given type.
 *
 * Emits the verdict code and, for jump/goto verdicts, the name of the
 * target chain.  Returns 0 on success and -1 if the skb ran out of room.
 */
int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v)
{
        struct nlattr *start = nla_nest_start_noflag(skb, type);

        if (!start)
                return -1;

        if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(v->code)))
                return -1;

        /* only jump/goto carry a chain reference */
        if ((v->code == NFT_JUMP || v->code == NFT_GOTO) &&
            nla_put_string(skb, NFTA_VERDICT_CHAIN, v->chain->name))
                return -1;

        nla_nest_end(skb, start);

        return 0;
}

/*
 * Copy a raw value attribute into @data.
 *
 * The payload must be non-empty (-EINVAL) and fit into desc->size
 * (-EOVERFLOW).  If the caller preset desc->len, the payload length has to
 * match it exactly (-EINVAL); otherwise desc->len is set to the payload
 * length.  Returns 0 on success.
 */
static int nft_value_init(const struct nft_ctx *ctx,
                          struct nft_data *data, struct nft_data_desc *desc,
                          const struct nlattr *nla)
{
        unsigned int payload = nla_len(nla);

        if (!payload)
                return -EINVAL;
        if (payload > desc->size)
                return -EOVERFLOW;

        if (!desc->len)
                desc->len = payload;
        else if (desc->len != payload)
                return -EINVAL;

        nla_memcpy(data->data, nla, payload);

        return 0;
}

/*
 * Emit @len bytes of @data as an NFTA_DATA_VALUE attribute.
 * Returns the result of nla_put().
 */
static int nft_value_dump(struct sk_buff *skb, const struct nft_data *data,
                          unsigned int len)
{
        return nla_put(skb, NFTA_DATA_VALUE, len, data->data);
}

/* Attribute policy for nested data: either a raw value or a verdict. */
static const struct nla_policy nft_data_policy[NFTA_DATA_MAX + 1] = {
        [NFTA_DATA_VALUE] = {
                .type = NLA_BINARY,
        },
        [NFTA_DATA_VERDICT] = {
                .type = NLA_NESTED,
        },
};

/**
 *        nft_data_init - parse nf_tables data netlink attributes
 *
 *        @ctx: context of the expression using the data, may be NULL
 *        @data: destination struct nft_data
 *        @desc: data description (expected type and buffer size on input)
 *        @nla: netlink attribute containing data
 *
 *        Parses the nested data attribute and initialises @data from either
 *        its value or its verdict member.  The attribute found must agree
 *        with desc->type, and desc->size must be non-zero.  The length of
 *        the parsed data is returned through @desc.
 *
 *        Passing NULL for @ctx restricts the accepted data to
 *        NFT_DATA_VALUE: a verdict attribute is then rejected.
 *
 *        Returns 0 on success or a negative errno.
 */
int nft_data_init(const struct nft_ctx *ctx, struct nft_data *data,
                  struct nft_data_desc *desc, const struct nlattr *nla)
{
        struct nlattr *tb[NFTA_DATA_MAX + 1];
        int rc;

        if (WARN_ON_ONCE(!desc->size))
                return -EINVAL;

        rc = nla_parse_nested_deprecated(tb, NFTA_DATA_MAX, nla,
                                         nft_data_policy, NULL);
        if (rc < 0)
                return rc;

        if (tb[NFTA_DATA_VALUE]) {
                if (desc->type != NFT_DATA_VALUE)
                        return -EINVAL;

                return nft_value_init(ctx, data, desc, tb[NFTA_DATA_VALUE]);
        }

        if (tb[NFTA_DATA_VERDICT] && ctx != NULL) {
                if (desc->type != NFT_DATA_VERDICT)
                        return -EINVAL;

                return nft_verdict_init(ctx, data, desc,
                                        tb[NFTA_DATA_VERDICT]);
        }

        return -EINVAL;
}
EXPORT_SYMBOL_GPL(nft_data_init);

/**
 *        nft_data_release - release a nft_data item
 *
 *        @data: struct nft_data to release
 *        @type: type of data
 *
 *        NFT_DATA_VALUE items hold no resources and are silently ignored.
 *        Verdict data is handed to nft_verdict_uninit().  Any other type
 *        triggers a warning.
 */
void nft_data_release(const struct nft_data *data, enum nft_data_types type)
{
        if (type < NFT_DATA_VERDICT)
                return;

        switch (type) {
        case NFT_DATA_VERDICT:
                /* plain call: a void function must not return an expression */
                nft_verdict_uninit(data);
                break;
        default:
                WARN_ON(1);
                break;
        }
}
EXPORT_SYMBOL_GPL(nft_data_release);

/*
 * Dump a struct nft_data as a nested attribute.
 *
 * @skb: socket buffer to append to
 * @attr: attribute number of the enclosing nest
 * @data: data to dump
 * @type: NFT_DATA_VALUE or NFT_DATA_VERDICT
 * @len: number of bytes to dump for NFT_DATA_VALUE
 *
 * Returns -1 if the nest cannot be opened, -EINVAL (with a warning) for an
 * unknown type, otherwise the result of the type specific dump helper.
 * The nest is closed even when the helper reports an error.
 */
int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
                  enum nft_data_types type, unsigned int len)
{
        struct nlattr *start = nla_nest_start_noflag(skb, attr);
        int rc;

        if (start == NULL)
                return -1;

        if (type == NFT_DATA_VALUE) {
                rc = nft_value_dump(skb, data, len);
        } else if (type == NFT_DATA_VERDICT) {
                rc = nft_verdict_dump(skb, NFTA_DATA_VERDICT, &data->verdict);
        } else {
                rc = -EINVAL;
                WARN_ON(1);
        }

        nla_nest_end(skb, start);

        return rc;
}
EXPORT_SYMBOL_GPL(nft_data_dump);

static void __nft_release_hook(struct net *net, struct nft_table *table)
{
        struct nft_flowtable *flowtable;
        struct nft_chain *chain;

        list_for_each_entry(chain, &table->chains, list)
                __nf_tables_unregister_hook(net, table, chain, true);
        list_for_each_entry(flowtable, &table->flowtables, list)
                __nft_unregister_flowtable_net_hooks(net, flowtable,
                                                     &flowtable->hook_list,
                                                     true);
}

/*
 * Unregister the hooks of all tables in @net that have no owner.
 * Owned tables are left alone.
 */
static void __nft_release_hooks(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_table *tbl;

        list_for_each_entry(tbl, &nft_net->tables, list) {
                if (!nft_table_has_owner(tbl))
                        __nft_release_hook(net, tbl);
        }
}

static void __nft_release_table(struct net *net, struct nft_table *table)
{
        struct nft_flowtable *flowtable, *nf;
        struct nft_chain *chain, *nc;
        struct nft_object *obj, *ne;
        struct nft_rule *rule, *nr;
        struct nft_set *set, *ns;
        struct nft_ctx ctx = {
                .net        = net,
                .family        = NFPROTO_NETDEV,
        };

        ctx.family = table->family;
        ctx.table = table;
        list_for_each_entry(chain, &table->chains, list) {
                if (nft_chain_binding(chain))
                        continue;

                ctx.chain = chain;
                list_for_each_entry_safe(rule, nr, &chain->rules, list) {
                        list_del(&rule->list);
                        nft_use_dec(&chain->use);
                        nf_tables_rule_release(&ctx, rule);
                }
        }
        list_for_each_entry_safe(flowtable, nf, &table->flowtables, list) {
                list_del(&flowtable->list);
                nft_use_dec(&table->use);
                nf_tables_flowtable_destroy(flowtable);
        }
        list_for_each_entry_safe(set, ns, &table->sets, list) {
                list_del(&set->list);
                nft_use_dec(&table->use);
                if (set->flags & (NFT_SET_MAP | NFT_SET_OBJECT))
                        nft_map_deactivate(&ctx, set);

                nft_set_destroy(&ctx, set);
        }
        list_for_each_entry_safe(obj, ne, &table->objects, list) {
                nft_obj_del(obj);
                nft_use_dec(&table->use);
                nft_obj_destroy(&ctx, obj);
        }
        list_for_each_entry_safe(chain, nc, &table->chains, list) {
                nft_chain_del(chain);
                nft_use_dec(&table->use);
                nf_tables_chain_destroy(chain);
        }
        nf_tables_table_destroy(table);
}

/*
 * Unlink and destroy every table in @net that has no owner.
 * Tables with an owner are skipped and stay on the list.
 */
static void __nft_release_tables(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        struct nft_table *table, *nt;

        /* _safe variant: entries are removed while walking the list */
        list_for_each_entry_safe(table, nt, &nft_net->tables, list) {
                if (nft_table_has_owner(table))
                        continue;

                list_del(&table->list);

                __nft_release_table(net, table);
        }
}

/*
 * Netlink notifier: clean up tables owned by a netlink socket that is
 * being released.
 *
 * Only NETLINK_URELEASE events for NETLINK_NETFILTER sockets are handled.
 * Owned tables whose nlpid matches the released port id are either
 * disowned (NFT_TABLE_F_PERSIST set) or unhooked, unlinked and destroyed.
 * Destruction is done in batches of at most ARRAY_SIZE(to_delete) tables,
 * each batch after a synchronize_rcu().
 *
 * Always returns NOTIFY_DONE.
 */
static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
                            void *ptr)
{
        struct nft_table *table, *to_delete[8];
        struct nftables_pernet *nft_net;
        struct netlink_notify *n = ptr;
        struct net *net = n->net;
        unsigned int deleted;
        bool restart = false;
        unsigned int gc_seq;

        if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
                return NOTIFY_DONE;

        nft_net = nft_pernet(net);
        deleted = 0;
        mutex_lock(&nft_net->commit_mutex);

        gc_seq = nft_gc_seq_begin(nft_net);

        nf_tables_trans_destroy_flush_work(net);
again:
        list_for_each_entry(table, &nft_net->tables, list) {
                if (nft_table_has_owner(table) &&
                    n->portid == table->nlpid) {
                        /* persistent tables survive; they only lose their owner */
                        if (table->flags & NFT_TABLE_F_PERSIST) {
                                table->flags &= ~NFT_TABLE_F_OWNER;
                                continue;
                        }
                        __nft_release_hook(net, table);
                        list_del_rcu(&table->list);
                        to_delete[deleted++] = table;
                        /* batch array is full: stop and process it */
                        if (deleted >= ARRAY_SIZE(to_delete))
                                break;
                }
        }
        if (deleted) {
                /* a full batch means the list walk was cut short */
                restart = deleted >= ARRAY_SIZE(to_delete);
                /* tables were unlinked with list_del_rcu(); wait before freeing */
                synchronize_rcu();
                while (deleted)
                        __nft_release_table(net, to_delete[--deleted]);

                if (restart)
                        goto again;
        }
        nft_gc_seq_end(nft_net, gc_seq);

        mutex_unlock(&nft_net->commit_mutex);

        return NOTIFY_DONE;
}

/* Notifier block that routes netlink events to nft_rcv_nl_event(). */
static struct notifier_block nft_nl_notifier = {
        .notifier_call = nft_rcv_nl_event,
};

/*
 * Per-netns init: set up the empty lists, the commit mutex, the sequence
 * counters and the destroy work item.  Always returns 0.
 */
static int __net_init nf_tables_init_net(struct net *net)
{
        struct nftables_pernet *pernet = nft_pernet(net);

        /* locking and deferred work */
        mutex_init(&pernet->commit_mutex);
        INIT_WORK(&pernet->destroy_work, nf_tables_trans_destroy_work);

        /* counters and validation state */
        pernet->base_seq = 1;
        pernet->gc_seq = 0;
        pernet->validate_state = NFT_VALIDATE_SKIP;

        /* all lists start out empty */
        INIT_LIST_HEAD(&pernet->tables);
        INIT_LIST_HEAD(&pernet->commit_list);
        INIT_LIST_HEAD(&pernet->destroy_list);
        INIT_LIST_HEAD(&pernet->commit_set_list);
        INIT_LIST_HEAD(&pernet->binding_list);
        INIT_LIST_HEAD(&pernet->module_list);
        INIT_LIST_HEAD(&pernet->notify_list);

        return 0;
}

/*
 * Per-netns pre-exit: unregister the hooks of all unowned tables while
 * holding the commit mutex.
 */
static void __net_exit nf_tables_pre_exit_net(struct net *net)
{
        struct mutex *commit_mutex = &nft_pernet(net)->commit_mutex;

        mutex_lock(commit_mutex);
        __nft_release_hooks(net);
        mutex_unlock(commit_mutex);
}

/*
 * Per-netns exit: release all remaining unowned tables.
 *
 * Runs under the commit mutex and inside a GC sequence begin/end pair.
 * Pending module autoload requests are cleaned up and the destroy work is
 * cancelled before the tables are released.  Afterwards every per-netns
 * list is expected to be empty.
 */
static void __net_exit nf_tables_exit_net(struct net *net)
{
        struct nftables_pernet *nft_net = nft_pernet(net);
        unsigned int gc_seq;

        mutex_lock(&nft_net->commit_mutex);

        gc_seq = nft_gc_seq_begin(nft_net);

        /* no transaction may be in flight at this point */
        WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
        WARN_ON_ONCE(!list_empty(&nft_net->commit_set_list));

        if (!list_empty(&nft_net->module_list))
                nf_tables_module_autoload_cleanup(net);

        cancel_work_sync(&nft_net->destroy_work);
        __nft_release_tables(net);

        nft_gc_seq_end(nft_net, gc_seq);

        mutex_unlock(&nft_net->commit_mutex);

        /* sanity checks: nothing should be left behind */
        WARN_ON_ONCE(!list_empty(&nft_net->tables));
        WARN_ON_ONCE(!list_empty(&nft_net->module_list));
        WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
        WARN_ON_ONCE(!list_empty(&nft_net->destroy_list));
}

/*
 * Batched netns exit: wait for the transaction GC work to finish.
 * @net_exit_list is not used.
 */
static void nf_tables_exit_batch(struct list_head *net_exit_list)
{
        flush_work(&trans_gc_work);
}

static struct pernet_operations nf_tables_net_ops = {
        .init                = nf_tables_init_net,
        .pre_exit        = nf_tables_pre_exit_net,
        .exit                = nf_tables_exit_net,
        .exit_batch        = nf_tables_exit_batch,
        .id                = &nf_tables_net_id,
        .size                = sizeof(struct nftables_pernet),
};

/*
 * Module init: register all nf_tables infrastructure.
 *
 * Each step that fails jumps into the unwind ladder at the bottom, which
 * undoes the steps that already succeeded in reverse order.  Returns 0 on
 * success or the negative errno of the failing step.
 */
static int __init nf_tables_module_init(void)
{
        int err;

        /* every transaction type must start with its nft_trans member */
        BUILD_BUG_ON(offsetof(struct nft_trans_table, nft_trans) != 0);
        BUILD_BUG_ON(offsetof(struct nft_trans_chain, nft_trans_binding.nft_trans) != 0);
        BUILD_BUG_ON(offsetof(struct nft_trans_rule, nft_trans) != 0);
        BUILD_BUG_ON(offsetof(struct nft_trans_set, nft_trans_binding.nft_trans) != 0);
        BUILD_BUG_ON(offsetof(struct nft_trans_elem, nft_trans) != 0);
        BUILD_BUG_ON(offsetof(struct nft_trans_obj, nft_trans) != 0);
        BUILD_BUG_ON(offsetof(struct nft_trans_flowtable, nft_trans) != 0);

        err = register_pernet_subsys(&nf_tables_net_ops);
        if (err < 0)
                return err;

        err = nft_chain_filter_init();
        if (err < 0)
                goto err_chain_filter;

        err = nf_tables_core_module_init();
        if (err < 0)
                goto err_core_module;

        err = register_netdevice_notifier(&nf_tables_flowtable_notifier);
        if (err < 0)
                goto err_netdev_notifier;

        err = rhltable_init(&nft_objname_ht, &nft_objname_ht_params);
        if (err < 0)
                goto err_rht_objname;

        err = nft_offload_init();
        if (err < 0)
                goto err_offload;

        err = netlink_register_notifier(&nft_nl_notifier);
        if (err < 0)
                goto err_netlink_notifier;

        /* must be last */
        err = nfnetlink_subsys_register(&nf_tables_subsys);
        if (err < 0)
                goto err_nfnl_subsys;

        nft_chain_route_init();

        return err;

        /* unwind in reverse order of registration */
err_nfnl_subsys:
        netlink_unregister_notifier(&nft_nl_notifier);
err_netlink_notifier:
        nft_offload_exit();
err_offload:
        rhltable_destroy(&nft_objname_ht);
err_rht_objname:
        unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
err_netdev_notifier:
        nf_tables_core_module_exit();
err_core_module:
        nft_chain_filter_fini();
err_chain_filter:
        unregister_pernet_subsys(&nf_tables_net_ops);
        return err;
}

/*
 * Module exit: unregister everything nf_tables_module_init() set up.
 *
 * The nfnetlink subsystem is unregistered first.  The GC work is cancelled
 * and rcu_barrier() is called before the object name hash table is
 * destroyed.
 */
static void __exit nf_tables_module_exit(void)
{
        nfnetlink_subsys_unregister(&nf_tables_subsys);
        netlink_unregister_notifier(&nft_nl_notifier);
        nft_offload_exit();
        unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
        nft_chain_filter_fini();
        nft_chain_route_fini();
        unregister_pernet_subsys(&nf_tables_net_ops);
        cancel_work_sync(&trans_gc_work);
        rcu_barrier();
        rhltable_destroy(&nft_objname_ht);
        nf_tables_core_module_exit();
}

module_init(nf_tables_module_init);
module_exit(nf_tables_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Framework for packet filtering and classification");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_NFTABLES);





















    5 











    6 





































    5 




    5 




    5 





































    3 








    3 


































    3 




    5 








    5 







    5 




    1 





    1 








    1 












































































































































































































































































    5 








    5 


























































    4 





    6 





    6 
    4 
























    6 

    6 





    1 

    5 










    5 








    4 



    5 

    5 

    5 



















    1 



    1 



















    1 
    1 































    6 




    6 





    6 




























































    2 




    2 


    2 



    2 
    1 




    2 




    1 




    1 



    1 













    1 

    1 



    1 


   84 



   73 


   84 








   16 













    4 



    4 















   43 





   41 

















   34 






   40 




















    2 





   42 


   41 




    3 
   51 












    3 




   68 











    9 









    8 










   10 





    5 

    3 








   19 

   10 

    9 




   40 




   41 










    5 


   39 


   34 



   40 





   46 


   46 

   46 








    7 


   40 


   34 













   58 








   58 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VGIC MMIO handling functions
 */

#include <linux/bitops.h>
#include <linux/bsearch.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <kvm/iodev.h>
#include <kvm/arm_arch_timer.h>
#include <kvm/arm_vgic.h>

#include "vgic.h"
#include "vgic-mmio.h"

/*
 * Read handler that ignores all of its arguments and always reports zero.
 */
unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
                                 gpa_t addr, unsigned int len)
{
        const unsigned long all_clear = 0;

        return all_clear;
}

/*
 * Read handler that ignores all of its arguments and always reports a value
 * with every bit set.
 */
unsigned long vgic_mmio_read_rao(struct kvm_vcpu *vcpu,
                                 gpa_t addr, unsigned int len)
{
        const unsigned long all_set = ~0UL;

        return all_set;
}

/*
 * Write handler that discards the written value: every argument is ignored
 * and no state is modified.
 */
void vgic_mmio_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
                        unsigned int len, unsigned long val)
{
        /* Ignore */
}

/*
 * Write-ignore handler with an int return value: every argument is ignored,
 * no state is modified and 0 is always returned.
 */
int vgic_mmio_uaccess_write_wi(struct kvm_vcpu *vcpu, gpa_t addr,
                               unsigned int len, unsigned long val)
{
        /* Ignore */
        return 0;
}

/*
 * Report the group setting of the interrupts covered by a read.
 *
 * The first interrupt is derived from @addr at one bit per interrupt, and
 * len * 8 interrupts are inspected. Bit n of the result is set when
 * interrupt (first + n) has a non-zero group field.
 */
unsigned long vgic_mmio_read_group(struct kvm_vcpu *vcpu,
                                   gpa_t addr, unsigned int len)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        u32 groups = 0;
        int bit = 0;

        /* One result bit per interrupt touched by this access */
        while (bit < len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + bit);

                if (irq->group)
                        groups |= BIT(bit);

                vgic_put_irq(vcpu->kvm, irq);
                bit++;
        }

        return groups;
}

/*
 * Forward the priority and group of @irq to its_prop_update_vsgi() for the
 * host interrupt backing it, and warn if that call reports a non-zero result.
 */
static void vgic_update_vsgi(struct vgic_irq *irq)
{
        int err = its_prop_update_vsgi(irq->host_irq, irq->priority, irq->group);

        WARN_ON(err);
}

/*
 * Update the group of each interrupt covered by a write.
 *
 * Bit n of @val becomes the group of interrupt (first + n), where the first
 * interrupt is derived from @addr at one bit per interrupt and len * 8
 * interrupts are updated. Each update is done with the interrupt's irq_lock
 * held.
 */
void vgic_mmio_write_group(struct kvm_vcpu *vcpu, gpa_t addr,
                           unsigned int len, unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        unsigned long flags;
        int bit;

        for (bit = 0; bit < len * 8; bit++) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + bit);

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                irq->group = !!(val & BIT(bit));

                if (!irq->hw || !vgic_irq_is_sgi(irq->intid)) {
                        /* The lock and flags are handed over to the queueing code */
                        vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
                } else {
                        /* HW-backed SGI: propagate the new group, then unlock */
                        vgic_update_vsgi(irq);
                        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                }

                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * GICD_ICENABLER and GICD_ISENABLER both read back the enabled bit of each
 * interrupt, so a single read handler serves the two registers.
 *
 * Bit n of the result is set when interrupt (first + n) is enabled; the
 * first interrupt is derived from @addr at one bit per interrupt and
 * len * 8 interrupts are inspected.
 */
unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        unsigned int nbits = len * 8;
        u32 enabled_bits = 0;
        int bit;

        /* Walk every interrupt covered by this access */
        for (bit = 0; bit < nbits; bit++) {
                struct vgic_irq *irq;

                irq = vgic_get_vcpu_irq(vcpu, first + bit);
                if (irq->enabled)
                        enabled_bits |= (1U << bit);
                vgic_put_irq(vcpu->kvm, irq);
        }

        return enabled_bits;
}

/*
 * Enable every interrupt whose bit is set in @val.
 *
 * Bit i of @val corresponds to interrupt (intid + i), where intid is derived
 * from @addr at one bit per interrupt. Only the low len * 8 bits of @val are
 * examined; interrupts whose bit is clear are left untouched. Each interrupt
 * is updated with its irq_lock held.
 */
void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
                             gpa_t addr, unsigned int len,
                             unsigned long val)
{
        u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
        int i;
        unsigned long flags;

        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                        /*
                         * HW-backed SGI: on a disabled -> enabled transition,
                         * call enable_irq() on the host IRQ until it is no
                         * longer reported as disabled. This path does not
                         * go through vgic_queue_irq_unlock().
                         */
                        if (!irq->enabled) {
                                struct irq_data *data;

                                irq->enabled = true;
                                data = &irq_to_desc(irq->host_irq)->irq_data;
                                while (irqd_irq_disabled(data))
                                        enable_irq(irq->host_irq);
                        }

                        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                        vgic_put_irq(vcpu->kvm, irq);

                        continue;
                } else if (vgic_irq_is_mapped_level(irq)) {
                        /* Remember the cached level before resampling it */
                        bool was_high = irq->line_level;

                        /*
                         * We need to update the state of the interrupt because
                         * the guest might have changed the state of the device
                         * while the interrupt was disabled at the VGIC level.
                         */
                        irq->line_level = vgic_get_phys_line_level(irq);
                        /*
                         * Deactivate the physical interrupt so the GIC will let
                         * us know when it is asserted again.
                         */
                        if (!irq->active && was_high && !irq->line_level)
                                vgic_irq_set_phys_active(irq, false);
                }
                irq->enabled = true;
                /* The lock and flags are handed over to the queueing code */
                vgic_queue_irq_unlock(vcpu->kvm, irq, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Disable every interrupt whose bit is set in @val.
 *
 * Bit n of @val corresponds to interrupt (first + n), with the first
 * interrupt derived from @addr at one bit per interrupt; only the low
 * len * 8 bits are examined. For a HW-backed SGI that is currently enabled,
 * the host IRQ is additionally disabled with disable_irq_nosync().
 */
void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
                             gpa_t addr, unsigned int len,
                             unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        unsigned long flags;
        int bit;

        for_each_set_bit(bit, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + bit);
                bool hw_sgi;

                raw_spin_lock_irqsave(&irq->irq_lock, flags);

                hw_sgi = irq->hw && vgic_irq_is_sgi(irq->intid);
                if (hw_sgi && irq->enabled)
                        disable_irq_nosync(irq->host_irq);

                irq->enabled = false;

                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * uaccess variant of the set-enable write: mark every interrupt whose bit is
 * set in @val as enabled and pass it to vgic_queue_irq_unlock().
 *
 * Unlike vgic_mmio_write_senable(), no HW SGI or mapped-level handling is
 * performed here. Always returns 0.
 */
int vgic_uaccess_write_senable(struct kvm_vcpu *vcpu,
                               gpa_t addr, unsigned int len,
                               unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        int bit;

        for_each_set_bit(bit, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + bit);
                unsigned long flags;

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                irq->enabled = true;
                /* The lock and flags are handed over to the queueing code */
                vgic_queue_irq_unlock(vcpu->kvm, irq, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }

        return 0;
}

/*
 * uaccess variant of the clear-enable write: mark every interrupt whose bit
 * is set in @val as disabled, under the interrupt's irq_lock.
 *
 * Unlike vgic_mmio_write_cenable(), the host IRQ is never touched here.
 * Always returns 0.
 */
int vgic_uaccess_write_cenable(struct kvm_vcpu *vcpu,
                               gpa_t addr, unsigned int len,
                               unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        int bit;

        for_each_set_bit(bit, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + bit);
                unsigned long flags;

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                irq->enabled = false;
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }

        return 0;
}

/*
 * Build the pending bitmap for the len * 8 interrupts starting at the INTID
 * derived from @addr (one bit per interrupt). Each interrupt is sampled with
 * its irq_lock held, and bit i of the result reflects interrupt (intid + i).
 *
 * Where the pending bit comes from depends on the interrupt and on @is_user:
 *  - HW-backed SGI: the IRQCHIP_STATE_PENDING state of the host IRQ
 *    (reported as not pending if the query fails, after a rate-limited
 *    warning),
 *  - mapped level interrupt with @is_user false: the physical line level,
 *  - GICv3 model with @is_user true: the pending_latch only,
 *  - everything else: irq_is_pending().
 */
static unsigned long __read_pending(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len,
                                    bool is_user)
{
        u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
        u32 value = 0;
        int i;

        /* Loop over all IRQs affected by this read */
        for (i = 0; i < len * 8; i++) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);
                unsigned long flags;
                bool val;

                /*
                 * When used from userspace with a GICv3 model:
                 *
                 * Pending state of interrupt is latched in pending_latch
                 * variable.  Userspace will save and restore pending state
                 * and line_level separately.
                 * Refer to Documentation/virt/kvm/devices/arm-vgic-v3.rst
                 * for handling of ISPENDR and ICPENDR.
                 */
                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                        int err;

                        /* Default used if the irqchip query below fails */
                        val = false;
                        err = irq_get_irqchip_state(irq->host_irq,
                                                    IRQCHIP_STATE_PENDING,
                                                    &val);
                        WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
                } else if (!is_user && vgic_irq_is_mapped_level(irq)) {
                        val = vgic_get_phys_line_level(irq);
                } else {
                        switch (vcpu->kvm->arch.vgic.vgic_model) {
                        case KVM_DEV_TYPE_ARM_VGIC_V3:
                                if (is_user) {
                                        val = irq->pending_latch;
                                        break;
                                }
                                fallthrough;
                        default:
                                val = irq_is_pending(irq);
                                break;
                        }
                }

                value |= ((u32)val << i);
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }

        return value;
}

/*
 * Pending-state read handler: delegates to __read_pending() with is_user
 * cleared.
 */
unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
                                     gpa_t addr, unsigned int len)
{
        const bool is_user = false;

        return __read_pending(vcpu, addr, len, is_user);
}

/*
 * uaccess pending-state read handler: delegates to __read_pending() with
 * is_user set.
 */
unsigned long vgic_uaccess_read_pending(struct kvm_vcpu *vcpu,
                                        gpa_t addr, unsigned int len)
{
        const bool is_user = true;

        return __read_pending(vcpu, addr, len, is_user);
}

static bool is_vgic_v2_sgi(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
        return (vgic_irq_is_sgi(irq->intid) &&
                vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2);
}

/*
 * Make pending every interrupt whose bit is set in @val.
 *
 * Bit i of @val corresponds to interrupt (intid + i), where intid is derived
 * from @addr at one bit per interrupt; only the low len * 8 bits are
 * examined. @is_user is true when called through
 * vgic_uaccess_write_spending() and false when called through
 * vgic_mmio_write_spending().
 *
 * Per interrupt:
 *  - GICv2 SGIs are skipped entirely unless @is_user is set,
 *  - HW-backed SGIs are made pending via irq_set_irqchip_state() on the
 *    host IRQ and are not queued here,
 *  - all others get pending_latch set and are handed to
 *    vgic_queue_irq_unlock(); HW interrupts are also marked active on the
 *    physical side when @is_user is false.
 */
static void __set_pending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
                          unsigned long val, bool is_user)
{
        u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
        int i;
        unsigned long flags;

        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);

                /* GICD_ISPENDR0 SGI bits are WI when written from the guest. */
                if (is_vgic_v2_sgi(vcpu, irq) && !is_user) {
                        vgic_put_irq(vcpu->kvm, irq);
                        continue;
                }

                raw_spin_lock_irqsave(&irq->irq_lock, flags);

                /*
                 * GICv2 SGIs are terribly broken. We can't restore
                 * the source of the interrupt, so just pick the vcpu
                 * itself as the source...
                 */
                if (is_vgic_v2_sgi(vcpu, irq))
                        irq->source |= BIT(vcpu->vcpu_id);

                if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                        /* HW SGI? Ask the GIC to inject it */
                        int err;
                        err = irq_set_irqchip_state(irq->host_irq,
                                                    IRQCHIP_STATE_PENDING,
                                                    true);
                        WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);

                        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                        vgic_put_irq(vcpu->kvm, irq);

                        continue;
                }

                irq->pending_latch = true;
                if (irq->hw && !is_user)
                        vgic_irq_set_phys_active(irq, true);

                /* The lock and flags are handed over to the queueing code */
                vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Set-pending write handler: delegates to __set_pending() with is_user
 * cleared.
 */
void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
                              gpa_t addr, unsigned int len,
                              unsigned long val)
{
        const bool is_user = false;

        __set_pending(vcpu, addr, len, val, is_user);
}

/*
 * uaccess set-pending write handler: delegates to __set_pending() with
 * is_user set. Always returns 0.
 */
int vgic_uaccess_write_spending(struct kvm_vcpu *vcpu,
                                gpa_t addr, unsigned int len,
                                unsigned long val)
{
        const bool is_user = true;

        __set_pending(vcpu, addr, len, val, is_user);

        return 0;
}

/*
 * Clear the pending state of a HW-mapped interrupt: drop the pending_latch,
 * clear the physical pending state and, if the virtual interrupt is not
 * active, clear the physical active state as well.
 *
 * @vcpu is not used by this function.
 *
 * Must be called with irq->irq_lock held
 */
static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
        irq->pending_latch = false;

        /*
         * We don't want the guest to effectively mask the physical
         * interrupt by doing a write to SPENDR followed by a write to
         * CPENDR for HW interrupts, so we clear the active state on
         * the physical side if the virtual interrupt is not active.
         * This may lead to taking an additional interrupt on the
         * host, but that should not be a problem as the worst that
         * can happen is an additional vgic injection.  We also clear
         * the pending state to maintain proper semantics for edge HW
         * interrupts.
         */
        vgic_irq_set_phys_pending(irq, false);
        if (!irq->active)
                vgic_irq_set_phys_active(irq, false);
}

/*
 * Clear the pending state of every interrupt whose bit is set in @val.
 *
 * Bit i of @val corresponds to interrupt (intid + i), where intid is derived
 * from @addr at one bit per interrupt; only the low len * 8 bits are
 * examined. @is_user is true when called through
 * vgic_uaccess_write_cpending() and false when called through
 * vgic_mmio_write_cpending().
 *
 * Per interrupt:
 *  - GICv2 SGIs are skipped entirely unless @is_user is set,
 *  - HW-backed SGIs have their pending state cleared via
 *    irq_set_irqchip_state() on the host IRQ,
 *  - other HW interrupts go through vgic_hw_irq_cpending() when @is_user is
 *    false,
 *  - everything else just has pending_latch cleared.
 */
static void __clear_pending(struct kvm_vcpu *vcpu,
                            gpa_t addr, unsigned int len,
                            unsigned long val, bool is_user)
{
        u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
        int i;
        unsigned long flags;

        for_each_set_bit(i, &val, len * 8) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, intid + i);

                /* GICD_ICPENDR0 SGI bits are WI when written from the guest. */
                if (is_vgic_v2_sgi(vcpu, irq) && !is_user) {
                        vgic_put_irq(vcpu->kvm, irq);
                        continue;
                }

                raw_spin_lock_irqsave(&irq->irq_lock, flags);

                /*
                 * More fun with GICv2 SGIs! If we're clearing one of them
                 * from userspace, which source vcpu to clear? Let's not
                 * even think of it, and blow the whole set.
                 */
                if (is_vgic_v2_sgi(vcpu, irq))
                        irq->source = 0;

                if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                        /* HW SGI? Ask the GIC to clear its pending bit */
                        int err;
                        err = irq_set_irqchip_state(irq->host_irq,
                                                    IRQCHIP_STATE_PENDING,
                                                    false);
                        WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);

                        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                        vgic_put_irq(vcpu->kvm, irq);

                        continue;
                }

                /* Only guest writes touch the physical side of HW interrupts */
                if (irq->hw && !is_user)
                        vgic_hw_irq_cpending(vcpu, irq);
                else
                        irq->pending_latch = false;

                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Clear-pending write handler: delegates to __clear_pending() with is_user
 * cleared.
 */
void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
                              gpa_t addr, unsigned int len,
                              unsigned long val)
{
        const bool is_user = false;

        __clear_pending(vcpu, addr, len, val, is_user);
}

/*
 * uaccess clear-pending write handler: delegates to __clear_pending() with
 * is_user set. Always returns 0.
 */
int vgic_uaccess_write_cpending(struct kvm_vcpu *vcpu,
                                gpa_t addr, unsigned int len,
                                unsigned long val)
{
        const bool is_user = true;

        __clear_pending(vcpu, addr, len, val, is_user);

        return 0;
}

/*
 * When fiddling with an IRQ's active state, the IRQ must not be queued on
 * some running VCPU's LRs: otherwise the change to the active state can be
 * overwritten when that VCPU's state is synced coming back from the guest.
 *
 * For shared interrupts, as well as GICv3 private interrupts accessed from
 * a non-owning CPU, all the VCPUs have to be stopped because interrupts can
 * be migrated while the IRQ locks are not held, and chasing moving targets
 * is not something we want to do.
 *
 * Nothing is needed for GICv2 private interrupts: userspace accesses to the
 * VGIC state already require all VCPUs to be stopped, and only the VCPU
 * itself can modify the active state of its private interrupts, which
 * guarantees that the VCPU is not running.
 */
static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
{
        bool foreign_v3_access =
                vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
                vcpu != kvm_get_running_vcpu();

        if (foreign_v3_access || intid >= VGIC_NR_PRIVATE_IRQS)
                kvm_arm_halt_guest(vcpu->kvm);
}

/*
 * Counterpart of vgic_access_active_prepare(): resume the guest under
 * exactly the condition that made vgic_access_active_prepare() halt it.
 */
static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
{
        bool foreign_v3_access =
                vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 &&
                vcpu != kvm_get_running_vcpu();

        if (foreign_v3_access || intid >= VGIC_NR_PRIVATE_IRQS)
                kvm_arm_resume_guest(vcpu->kvm);
}

/*
 * Collect the virtual active state of the len * 8 interrupts starting at
 * the INTID derived from @addr (one bit per interrupt). Bit n of the result
 * is set when interrupt (first + n) is active. No halting or locking is done
 * here.
 */
static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
                                             gpa_t addr, unsigned int len)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        unsigned int nbits = len * 8;
        u32 active_bits = 0;
        int bit;

        /* Walk every interrupt covered by this access */
        for (bit = 0; bit < nbits; bit++) {
                struct vgic_irq *irq;

                irq = vgic_get_vcpu_irq(vcpu, first + bit);

                /*
                 * Only the virtual state matters to the guest, so the HW
                 * state is not consulted even for HW interrupts.
                 */
                if (irq->active)
                        active_bits |= (1U << bit);

                vgic_put_irq(vcpu->kvm, irq);
        }

        return active_bits;
}

/*
 * Active-state read handler: reads the active bits through
 * __vgic_mmio_read_active() while holding the VM's config_lock and between
 * vgic_access_active_prepare() and vgic_access_active_finish().
 */
unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        u32 active_bits;

        mutex_lock(&vcpu->kvm->arch.config_lock);
        vgic_access_active_prepare(vcpu, first);

        active_bits = __vgic_mmio_read_active(vcpu, addr, len);

        vgic_access_active_finish(vcpu, first);
        mutex_unlock(&vcpu->kvm->arch.config_lock);

        return active_bits;
}

/*
 * uaccess active-state read handler: calls __vgic_mmio_read_active()
 * directly, without taking config_lock or halting the guest.
 */
unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
                                       gpa_t addr, unsigned int len)
{
        unsigned long active_bits = __vgic_mmio_read_active(vcpu, addr, len);

        return active_bits;
}

/*
 * Update the active state of a HW-mapped interrupt, both in @irq and on the
 * physical side. Nothing at all is changed when @is_uaccess is set.
 *
 * @vcpu is not used by this function.
 *
 * Must be called with irq->irq_lock held
 */
static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
                                      bool active, bool is_uaccess)
{
        if (!is_uaccess) {
                irq->active = active;
                vgic_irq_set_phys_active(irq, active);
        }
}

/*
 * Set or clear the active state of @irq, with its irq_lock held for the
 * update.
 *
 * kvm_get_running_vcpu() identifies the requester; a NULL result is passed
 * on as is_uaccess to vgic_hw_irq_change_active() and makes the GICv2 SGI
 * active_source default to 0.
 *
 * If the interrupt ends up active it is handed to vgic_queue_irq_unlock()
 * together with the lock; otherwise the lock is released directly.
 */
static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
                                    bool active)
{
        unsigned long flags;
        struct kvm_vcpu *requester_vcpu = kvm_get_running_vcpu();

        raw_spin_lock_irqsave(&irq->irq_lock, flags);

        if (irq->hw && !vgic_irq_is_sgi(irq->intid)) {
                /* HW-mapped, non-SGI interrupt */
                vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
        } else if (irq->hw && vgic_irq_is_sgi(irq->intid)) {
                /*
                 * GICv4.1 VSGI feature doesn't track an active state,
                 * so let's not kid ourselves, there is nothing we can
                 * do here.
                 */
                irq->active = false;
        } else {
                u32 model = vcpu->kvm->arch.vgic.vgic_model;
                u8 active_source;

                irq->active = active;

                /*
                 * The GICv2 architecture indicates that the source CPUID for
                 * an SGI should be provided during an EOI which implies that
                 * the active state is stored somewhere, but at the same time
                 * this state is not architecturally exposed anywhere and we
                 * have no way of knowing the right source.
                 *
                 * This may lead to a VCPU not being able to receive
                 * additional instances of a particular SGI after migration
                 * for a GICv2 VM on some GIC implementations.  Oh well.
                 */
                active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;

                if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
                    active && vgic_irq_is_sgi(irq->intid))
                        irq->active_source = active_source;
        }

        /* Re-read irq->active: the branches above may not have applied @active */
        if (irq->active)
                vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
        else
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}

/*
 * Clear the active state of every interrupt whose bit is set in @val.
 *
 * Bit n of @val corresponds to interrupt (first + n), with the first
 * interrupt derived from @addr at one bit per interrupt; only the low
 * len * 8 bits are examined.
 */
static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
                                      gpa_t addr, unsigned int len,
                                      unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        int bit;

        for_each_set_bit(bit, &val, len * 8) {
                struct vgic_irq *irq;

                irq = vgic_get_vcpu_irq(vcpu, first + bit);
                vgic_mmio_change_active(vcpu, irq, false);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Clear-active write handler: runs __vgic_mmio_write_cactive() while holding
 * the VM's config_lock and between vgic_access_active_prepare() and
 * vgic_access_active_finish().
 */
void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
                             gpa_t addr, unsigned int len,
                             unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);

        mutex_lock(&vcpu->kvm->arch.config_lock);

        vgic_access_active_prepare(vcpu, first);
        __vgic_mmio_write_cactive(vcpu, addr, len, val);
        vgic_access_active_finish(vcpu, first);

        mutex_unlock(&vcpu->kvm->arch.config_lock);
}

/*
 * uaccess clear-active write handler: calls __vgic_mmio_write_cactive()
 * directly, without taking config_lock or halting the guest. Always
 * returns 0.
 */
int vgic_mmio_uaccess_write_cactive(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len,
                                    unsigned long val)
{
        const int ret = 0;

        __vgic_mmio_write_cactive(vcpu, addr, len, val);

        return ret;
}

/*
 * Set the active state of every interrupt whose bit is set in @val.
 *
 * Bit n of @val corresponds to interrupt (first + n), with the first
 * interrupt derived from @addr at one bit per interrupt; only the low
 * len * 8 bits are examined.
 */
static void __vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
                                      gpa_t addr, unsigned int len,
                                      unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);
        int bit;

        for_each_set_bit(bit, &val, len * 8) {
                struct vgic_irq *irq;

                irq = vgic_get_vcpu_irq(vcpu, first + bit);
                vgic_mmio_change_active(vcpu, irq, true);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Set-active write handler: runs __vgic_mmio_write_sactive() while holding
 * the VM's config_lock and between vgic_access_active_prepare() and
 * vgic_access_active_finish().
 */
void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
                             gpa_t addr, unsigned int len,
                             unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 1);

        mutex_lock(&vcpu->kvm->arch.config_lock);

        vgic_access_active_prepare(vcpu, first);
        __vgic_mmio_write_sactive(vcpu, addr, len, val);
        vgic_access_active_finish(vcpu, first);

        mutex_unlock(&vcpu->kvm->arch.config_lock);
}

/*
 * uaccess set-active write handler: calls __vgic_mmio_write_sactive()
 * directly, without taking config_lock or halting the guest. Always
 * returns 0.
 */
int vgic_mmio_uaccess_write_sactive(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len,
                                    unsigned long val)
{
        const int ret = 0;

        __vgic_mmio_write_sactive(vcpu, addr, len, val);

        return ret;
}

/*
 * Read the priority of @len consecutive interrupts, one byte each.
 *
 * The first interrupt is derived from @addr at eight bits per interrupt;
 * the priority of interrupt (first + n) lands in byte n of the result.
 */
unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
                                      gpa_t addr, unsigned int len)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 8);
        u64 packed = 0;
        int byte;

        for (byte = 0; byte < len; byte++) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + byte);
                u64 prio = irq->priority;

                packed |= prio << (byte * 8);

                vgic_put_irq(vcpu->kvm, irq);
        }

        return packed;
}

/*
 * Write the priority of @len consecutive interrupts, one byte of @val each;
 * the first interrupt is derived from @addr at eight bits per interrupt.
 *
 * Changing the priority of an interrupt that is already pending on a VCPU
 * is currently not handled. Should that be needed, the VCPU would have to
 * exit and re-evaluate the priorities, potentially leading to this
 * interrupt now being presented to the guest (if it had been masked by the
 * priority mask before).
 */
void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
                              gpa_t addr, unsigned int len,
                              unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 8);
        unsigned long flags;
        int byte;

        for (byte = 0; byte < len; byte++) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + byte);
                unsigned long field = val >> (byte * 8);

                raw_spin_lock_irqsave(&irq->irq_lock, flags);

                /* Keep only the priority bits that are actually supported */
                irq->priority = field & GENMASK(7, 8 - VGIC_PRI_BITS);

                /* HW-backed SGIs get the new priority propagated */
                if (irq->hw && vgic_irq_is_sgi(irq->intid))
                        vgic_update_vsgi(irq);

                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Read the trigger configuration of len * 4 consecutive interrupts, two bits
 * each. The first interrupt is derived from @addr at two bits per interrupt.
 *
 * For interrupt (first + n), the upper bit of its two-bit field (bit
 * 2n + 1 of the result) is set when the interrupt is configured as
 * VGIC_CONFIG_EDGE; all other bits read as zero.
 */
unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 2);
        u32 cfg_bits = 0;
        int idx = 0;

        while (idx < len * 4) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, first + idx);

                if (irq->config == VGIC_CONFIG_EDGE)
                        cfg_bits |= (2U << (idx * 2));

                vgic_put_irq(vcpu->kvm, irq);
                idx++;
        }

        return cfg_bits;
}

/*
 * Write the trigger configuration of len * 4 consecutive interrupts, two
 * bits of @val each. The first interrupt is derived from @addr at two bits
 * per interrupt.
 *
 * For interrupt (first + n), bit 2n + 1 of @val selects VGIC_CONFIG_EDGE
 * when set and VGIC_CONFIG_LEVEL when clear. Interrupts below
 * VGIC_NR_PRIVATE_IRQS are never modified.
 */
void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
                            gpa_t addr, unsigned int len,
                            unsigned long val)
{
        u32 first = VGIC_ADDR_TO_INTID(addr, 2);
        unsigned long flags;
        int idx;

        for (idx = 0; idx < len * 4; idx++) {
                u32 cur = first + idx;
                struct vgic_irq *irq;

                /*
                 * SGIs can never have their configuration changed, and for
                 * PPIs it is IMPLEMENTATION DEFINED. The arch timer code
                 * relies on PPIs being level triggered, so private
                 * interrupts are treated as read-only here too.
                 */
                if (cur < VGIC_NR_PRIVATE_IRQS)
                        continue;

                irq = vgic_get_irq(vcpu->kvm, cur);

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                irq->config = test_bit(idx * 2 + 1, &val) ? VGIC_CONFIG_EDGE
                                                          : VGIC_CONFIG_LEVEL;
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Report the line level of the 32 interrupts starting at @intid.
 *
 * Bit n of the result is set when interrupt (@intid + n) is configured as
 * VGIC_CONFIG_LEVEL and its line_level is set. Interrupts below
 * VGIC_NR_SGIS, or at or above nr_spis + VGIC_NR_PRIVATE_IRQS, are skipped
 * and read as zero.
 */
u32 vgic_read_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid)
{
        int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
        u32 levels = 0;
        int bit;

        for (bit = 0; bit < 32; bit++) {
                u32 cur = intid + bit;
                struct vgic_irq *irq;

                if (cur < VGIC_NR_SGIS || cur >= nr_irqs)
                        continue;

                irq = vgic_get_vcpu_irq(vcpu, cur);

                if (irq->config == VGIC_CONFIG_LEVEL && irq->line_level)
                        levels |= (1U << bit);

                vgic_put_irq(vcpu->kvm, irq);
        }

        return levels;
}

/*
 * Set the line level of the 32 interrupts starting at @intid from the bits
 * of @val (bit n maps to interrupt @intid + n).
 *
 * Interrupts below VGIC_NR_SGIS, or at or above
 * nr_spis + VGIC_NR_PRIVATE_IRQS, are skipped. An interrupt whose level is
 * raised is handed to vgic_queue_irq_unlock(); one whose level is lowered
 * only has its lock released.
 */
void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
                                    const u32 val)
{
        int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
        unsigned long flags;
        int bit;

        for (bit = 0; bit < 32; bit++) {
                u32 cur = intid + bit;
                struct vgic_irq *irq;
                bool level;

                if (cur < VGIC_NR_SGIS || cur >= nr_irqs)
                        continue;

                irq = vgic_get_vcpu_irq(vcpu, cur);

                /*
                 * The level is recorded whatever the irq type (level or
                 * edge), so that the VM does not have to restore the irq
                 * config before the line level.
                 */
                level = !!(val & (1U << bit));

                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                irq->line_level = level;
                if (!level)
                        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                else
                        vgic_queue_irq_unlock(vcpu->kvm, irq, flags);

                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * bsearch() comparison callback. @key carries an offset encoded in the
 * pointer value itself and @elt points to a struct vgic_register_region.
 *
 * Returns -1 if the offset lies below the region, 1 if it lies at or beyond
 * the region's end (reg_offset + len), and 0 if it falls inside the region.
 */
static int match_region(const void *key, const void *elt)
{
        const struct vgic_register_region *region = elt;
        const unsigned int offset = (unsigned long)key;

        if (offset < region->reg_offset)
                return -1;

        return (offset >= region->reg_offset + region->len) ? 1 : 0;
}

const struct vgic_register_region *
vgic_find_mmio_region(const struct vgic_register_region *regions,
                      int nr_regions, unsigned int offset)
{
        return bsearch((void *)(uintptr_t)offset, regions, nr_regions,
                       sizeof(regions[0]), match_region);
}

/*
 * Dispatch a VMCR write to the GICv2 implementation when the global VGIC
 * type is VGIC_V2, and to the GICv3 implementation otherwise.
 */
void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
{
        if (kvm_vgic_global_state.type != VGIC_V2) {
                vgic_v3_set_vmcr(vcpu, vmcr);
                return;
        }

        vgic_v2_set_vmcr(vcpu, vmcr);
}

/*
 * Dispatch a VMCR read to the GICv2 implementation when the global VGIC
 * type is VGIC_V2, and to the GICv3 implementation otherwise.
 */
void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
{
        if (kvm_vgic_global_state.type != VGIC_V2) {
                vgic_v3_get_vmcr(vcpu, vmcr);
                return;
        }

        vgic_v2_get_vmcr(vcpu, vmcr);
}

/*
 * kvm_mmio_read_buf() hands back a value laid out so that, viewed as a byte
 * array, it looks the way the guest wanted it to appear in memory had it
 * performed the store itself. For the GIC that means little endian, since
 * the guest knows the GIC is always LE.
 *
 * Convert that value to the CPU's native format so it can be handled as a
 * plain data value. Lengths other than 1, 2 and 4 are converted as 64-bit
 * quantities.
 */
unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len)
{
        unsigned long raw = kvm_mmio_read_buf(val, len);

        if (len == 1)
                return raw;

        if (len == 2)
                return le16_to_cpu(raw);

        if (len == 4)
                return le32_to_cpu(raw);

        return le64_to_cpu(raw);
}

/*
 * kvm_mmio_write_buf() expects a value laid out so that, viewed as a byte
 * array, it is what the guest would observe had it performed the load
 * directly. The GIC is LE and the guest knows it, so the guest expects a
 * little endian value.
 *
 * Convert @data from the CPU's native format to LE before storing it into
 * @buf. Single-byte accesses need no conversion; lengths other than 1, 2
 * and 4 are converted as 64-bit quantities.
 */
void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
                                unsigned long data)
{
        if (len == 2)
                data = cpu_to_le16(data);
        else if (len == 4)
                data = cpu_to_le32(data);
        else if (len != 1)
                data = cpu_to_le64(data);

        kvm_mmio_write_buf(buf, len, data);
}

/*
 * Map a kvm_io_device back to the vgic_io_device that embeds it as its
 * "dev" member.
 */
static struct vgic_io_device *
kvm_to_vgic_iodev(const struct kvm_io_device *dev)
{
        return container_of(dev, struct vgic_io_device, dev);
}

/*
 * Check whether an access of @len bytes at @addr is acceptable for @region.
 *
 * The access is accepted only if:
 *  - @len is the size of a u8, u32 or u64 and the region's access_flags
 *    allow that width,
 *  - @addr is aligned to @len, and
 *  - for per-interrupt regions (non-zero bits_per_irq), the INTID derived
 *    from @addr is below nr_spis + VGIC_NR_PRIVATE_IRQS.
 */
static bool check_region(const struct kvm *kvm,
                         const struct vgic_register_region *region,
                         gpa_t addr, int len)
{
        int nr_irqs = kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
        int flags;

        switch (len) {
        case sizeof(u8):
                flags = VGIC_ACCESS_8bit;
                break;
        case sizeof(u32):
                flags = VGIC_ACCESS_32bit;
                break;
        case sizeof(u64):
                flags = VGIC_ACCESS_64bit;
                break;
        default:
                return false;
        }

        if (!(region->access_flags & flags) || !IS_ALIGNED(addr, len))
                return false;

        /* Regions that are not per-interrupt have no INTID to validate */
        if (!region->bits_per_irq)
                return true;

        /* Reject accesses that target a non-allocated IRQ */
        return VGIC_ADDR_TO_INTID(addr, region->bits_per_irq) < nr_irqs;
}

/*
 * Look up the register region of @iodev covering @addr and validate the
 * access against it.
 *
 * The lookup uses the offset of @addr from the device base address, while
 * check_region() is given @addr itself.  Returns the region, or NULL when
 * no region matches or the access of @len bytes is not permitted.
 */
const struct vgic_register_region *
vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                     gpa_t addr, int len)
{
        const struct vgic_register_region *match;
        gpa_t offset = addr - iodev->base_addr;

        match = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
                                      offset);
        if (!match)
                return NULL;

        if (!check_region(vcpu->kvm, match, addr, len))
                return NULL;

        return match;
}

/*
 * Userspace read of one 32-bit register at @addr.
 *
 * An access that matches no valid region stores 0 in *@val.  Otherwise the
 * region's uaccess_read handler is used when present, falling back to its
 * regular read handler.  The handler runs on the device's redist_vcpu when
 * one is set, else on @vcpu.  Always returns 0.
 */
static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                             gpa_t addr, u32 *val)
{
        const struct vgic_register_region *region =
                vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
        struct kvm_vcpu *target = vcpu;

        if (!region) {
                *val = 0;
                return 0;
        }

        if (iodev->redist_vcpu)
                target = iodev->redist_vcpu;

        *val = region->uaccess_read ?
                region->uaccess_read(target, addr, sizeof(u32)) :
                region->read(target, addr, sizeof(u32));

        return 0;
}

/*
 * Userspace write of the 32-bit value *@val to the register at @addr.
 *
 * An access that matches no valid region is silently ignored (returns 0).
 * When the region has a uaccess_write handler its return value is passed
 * back; otherwise the regular write handler is invoked and 0 is returned.
 * The handler runs on the device's redist_vcpu when one is set, else on
 * @vcpu.
 */
static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                              gpa_t addr, const u32 *val)
{
        const struct vgic_register_region *region =
                vgic_get_mmio_region(vcpu, iodev, addr, sizeof(u32));
        struct kvm_vcpu *target = vcpu;

        if (!region)
                return 0;

        if (iodev->redist_vcpu)
                target = iodev->redist_vcpu;

        if (!region->uaccess_write) {
                region->write(target, addr, sizeof(u32), *val);
                return 0;
        }

        return region->uaccess_write(target, addr, sizeof(u32), *val);
}

/*
 * Userland access to VGIC registers.
 *
 * Forwards to vgic_uaccess_write() when @is_write is set and to
 * vgic_uaccess_read() otherwise, passing @offset as the address argument,
 * and returns that helper's result.
 */
int vgic_uaccess(struct kvm_vcpu *vcpu, struct vgic_io_device *dev,
                 bool is_write, int offset, u32 *val)
{
        if (!is_write)
                return vgic_uaccess_read(vcpu, dev, offset, val);

        return vgic_uaccess_write(vcpu, dev, offset, val);
}

/*
 * MMIO read callback: find the register region for @addr and call the
 * handler that matches the I/O device type.
 *
 * An access that matches no valid region reads as zero (@val is cleared).
 * Redistributor reads run on the device's redist_vcpu, ITS reads use the
 * ITS handler, and CPU interface and distributor reads run on @vcpu.
 * The value is converted to bus format before being stored in @val.
 * Always returns 0.
 */
static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                              gpa_t addr, int len, void *val)
{
        struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        unsigned long value = 0;

        region = vgic_get_mmio_region(vcpu, iodev, addr, len);
        if (!region) {
                memset(val, 0, len);
                return 0;
        }

        switch (iodev->iodev_type) {
        case IODEV_CPUIF:
        case IODEV_DIST:
                /* Both are served by the plain read handler on @vcpu. */
                value = region->read(vcpu, addr, len);
                break;
        case IODEV_REDIST:
                value = region->read(iodev->redist_vcpu, addr, len);
                break;
        case IODEV_ITS:
                value = region->its_read(vcpu->kvm, iodev->its, addr, len);
                break;
        }

        vgic_data_host_to_mmio_bus(val, len, value);

        return 0;
}

/*
 * MMIO write callback: convert the bus-format data in @val to host format,
 * find the register region for @addr and call the handler that matches the
 * I/O device type.
 *
 * An access that matches no valid region is ignored.  Redistributor writes
 * run on the device's redist_vcpu, ITS writes use the ITS handler, and CPU
 * interface and distributor writes run on @vcpu.  Always returns 0.
 */
static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
                               gpa_t addr, int len, const void *val)
{
        struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
        const struct vgic_register_region *region;
        unsigned long value = vgic_data_mmio_bus_to_host(val, len);

        region = vgic_get_mmio_region(vcpu, iodev, addr, len);
        if (!region)
                return 0;

        switch (iodev->iodev_type) {
        case IODEV_CPUIF:
        case IODEV_DIST:
                /* Both are served by the plain write handler on @vcpu. */
                region->write(vcpu, addr, len, value);
                break;
        case IODEV_REDIST:
                region->write(iodev->redist_vcpu, addr, len, value);
                break;
        case IODEV_ITS:
                region->its_write(vcpu->kvm, iodev->its, addr, len, value);
                break;
        }

        return 0;
}

/*
 * I/O device callbacks for the vgic MMIO regions: reads and writes are
 * routed to dispatch_mmio_read() and dispatch_mmio_write().
 */
const struct kvm_io_device_ops kvm_io_gic_ops = {
        .read = dispatch_mmio_read,
        .write = dispatch_mmio_write,
};

/*
 * Set up the distributor I/O device of @kvm for GIC version @type and
 * register it on the KVM MMIO bus at @dist_base_address.
 *
 * The length of the registered window is whatever the version-specific
 * init helper returns.  Any @type other than VGIC_V2 or VGIC_V3 hits
 * BUG_ON().
 *
 * Returns the result of kvm_io_bus_register_dev().
 */
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
                             enum vgic_type type)
{
        struct vgic_io_device *dist = &kvm->arch.vgic.dist_iodev;
        unsigned int window_len;

        if (type == VGIC_V2)
                window_len = vgic_v2_init_dist_iodev(dist);
        else if (type == VGIC_V3)
                window_len = vgic_v3_init_dist_iodev(dist);
        else
                BUG_ON(1);

        dist->iodev_type = IODEV_DIST;
        dist->redist_vcpu = NULL;
        dist->base_addr = dist_base_address;

        return kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist_base_address,
                                       window_len, &dist->dev);
}















































































































    3 



    3 



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipoib.h"

#include <linux/module.h>

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>

#include <linux/if_arp.h>        /* For ARPHRD_xxx */

#include <linux/ip.h>
#include <linux/in.h>

#include <linux/jhash.h>
#include <net/arp.h>
#include <net/addrconf.h>
#include <net/pkt_sched.h>
#include <linux/inetdevice.h>
#include <rdma/ib_cache.h>

/* Module metadata. */
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");

/*
 * Send and receive queue sizes, defaulting to IPOIB_TX_RING_SIZE and
 * IPOIB_RX_RING_SIZE.
 */
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;

/*
 * Exposed as the "send_queue_size" and "recv_queue_size" module parameters
 * with read-only (0444) permissions.
 */
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Debug tracing level, only built with CONFIG_INFINIBAND_IPOIB_DEBUG;
 * exposed as the "debug_level" module parameter with mode 0644.
 */
int ipoib_debug_level;

module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif

/*
 * Iterator state pairing a net_device with an embedded ipoib_path.
 * NOTE(review): presumably @path is a copy of the path the iterator
 * currently points at -- confirm against the iterator helpers.
 */
struct ipoib_path_iter {
        struct net_device *dev;
        struct ipoib_path  path;
};

/*
 * 20-byte constant address.
 * NOTE(review): judging by the name this is the IPoIB hardware address used
 * for IPv4 broadcast -- confirm against the code that consumes it.
 */
static const u8 ipv4_bcast_addr[] = {
        0x00, 0xff, 0xff, 0xff,
        0xff, 0x12, 0x40, 0x1b,        0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00,        0xff, 0xff, 0xff, 0xff
};

/* Driver-wide workqueue pointer (non-static, so visible to other files). */
struct workqueue_struct *ipoib_workqueue;

/* Driver-wide SA client object (non-static, so visible to other files). */
struct ib_sa_client ipoib_sa_client;

/* Forward declarations for functions defined later in this file. */
static int ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device, void *client_data);
static void ipoib_neigh_reclaim(struct rcu_head *rp);
static struct net_device *ipoib_get_net_dev_by_params(
                struct ib_device *dev, u32 port, u16 pkey,
                const union ib_gid *gid, const struct sockaddr *addr,
                void *client_data);
static int ipoib_set_mac(struct net_device *dev, void *addr);
static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
                       int cmd);

/*
 * IB client descriptor for this driver: names the client "ipoib" and wires
 * up the add/remove callbacks plus the net_device lookup by parameters.
 */
static struct ib_client ipoib_client = {
        .name   = "ipoib",
        .add    = ipoib_add_one,
        .remove = ipoib_remove_one,
        .get_net_dev_by_params = ipoib_get_net_dev_by_params,
};

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
/*
 * Netdevice notifier callback that keeps the per-device debug files in
 * step with the device's lifetime and name.
 *
 * Devices whose ndo_open is not ipoib_open are ignored.  The debug files
 * are created on NETDEV_REGISTER, deleted on NETDEV_UNREGISTER, and
 * deleted then re-created on NETDEV_CHANGENAME.  Always returns
 * NOTIFY_DONE.
 */
static int ipoib_netdev_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
{
        struct netdev_notifier_info *info = ptr;
        struct net_device *ndev = info->dev;

        if (ndev->netdev_ops->ndo_open != ipoib_open)
                return NOTIFY_DONE;

        /* A rename is handled as a delete followed by a create. */
        if (event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER)
                ipoib_delete_debug_files(ndev);

        if (event == NETDEV_REGISTER || event == NETDEV_CHANGENAME)
                ipoib_create_debug_files(ndev);

        return NOTIFY_DONE;
}
#endif

/*
 * Bring the interface up.
 *
 * Turns the carrier off, marks the device administratively up and opens
 * the IB side.  If that open fails while no P_Key is assigned, 0 is
 * returned with the admin-up flag left set; if it fails otherwise, the
 * flag is cleared and -EINVAL is returned.
 *
 * On success a parent interface also brings up every child that is not
 * already IFF_UP, while a child merely logs a debug message if its parent
 * is not administratively up.  Finally the transmit queue is started.
 */
int ipoib_open(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        ipoib_dbg(priv, "bringing up interface\n");

        netif_carrier_off(dev);
        set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);

        if (ipoib_ib_dev_open(dev)) {
                if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
                        return 0;

                clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
                return -EINVAL;
        }

        ipoib_ib_dev_up(dev);

        if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
                if (priv->parent) {
                        struct ipoib_dev_priv *parent_priv =
                                ipoib_priv(priv->parent);

                        if (!test_bit(IPOIB_FLAG_ADMIN_UP, &parent_priv->flags))
                                ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n",
                                          parent_priv->dev->name);
                }
        } else {
                struct ipoib_dev_priv *child;

                /* Bring up any child interfaces too */
                down_read(&priv->vlan_rwsem);
                list_for_each_entry(child, &priv->child_intfs, list) {
                        int child_flags = child->dev->flags;

                        if (!(child_flags & IFF_UP))
                                dev_change_flags(child->dev,
                                                 child_flags | IFF_UP, NULL);
                }
                up_read(&priv->vlan_rwsem);
        }

        netif_start_queue(dev);

        return 0;
}

/*
 * Take the interface down.
 *
 * Clears the admin-up flag, stops the transmit queue and brings the IB
 * side down and to a stop.  A parent interface then also takes down every
 * child that is currently IFF_UP.  Always returns 0.
 */
static int ipoib_stop(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_dev_priv *child;

        ipoib_dbg(priv, "stopping interface\n");

        clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
        netif_stop_queue(dev);

        ipoib_ib_dev_down(dev);
        ipoib_ib_dev_stop(dev);

        /* Child interfaces have no children of their own to take down. */
        if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
                return 0;

        /* Bring down any child interfaces too */
        down_read(&priv->vlan_rwsem);
        list_for_each_entry(child, &priv->child_intfs, list) {
                int child_flags = child->dev->flags;

                if (child_flags & IFF_UP)
                        dev_change_flags(child->dev,
                                         child_flags & ~IFF_UP, NULL);
        }
        up_read(&priv->vlan_rwsem);

        return 0;
}

/*
 * Adjust the requested feature set: when the IPOIB_FLAG_ADMIN_CM flag is
 * set, IP checksum offload and TSO are masked out.  Otherwise @features is
 * returned unchanged.
 */
static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        if (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
                return features;

        return features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
}

/*
 * Change the device MTU.
 *
 * With connected mode administratively enabled, @new_mtu is rejected with
 * -EINVAL if it exceeds ipoib_cm_max_mtu(); otherwise it is applied as is,
 * with a warning when it is larger than the multicast MTU.
 *
 * Otherwise @new_mtu must lie between ETH_MIN_MTU + IPOIB_ENCAP_LEN and
 * IPOIB_UD_MTU(max_ib_mtu), else -EINVAL.  It is recorded as the admin MTU,
 * and the effective MTU is the smaller of the multicast and admin MTUs.
 * If the underlying rn_ops provide ndo_change_mtu, the change is delegated
 * to it with the carrier switched off for the duration, and its result is
 * returned; otherwise dev->mtu is written directly and 0 is returned.
 */
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        bool carrier_was_on;
        int rc;

        /* dev->mtu > 2K ==> connected mode */
        if (ipoib_cm_admin_enabled(dev)) {
                if (new_mtu > ipoib_cm_max_mtu(dev))
                        return -EINVAL;

                if (new_mtu > priv->mcast_mtu)
                        ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
                                   priv->mcast_mtu);

                WRITE_ONCE(dev->mtu, new_mtu);
                return 0;
        }

        if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN))
                return -EINVAL;
        if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
                return -EINVAL;

        priv->admin_mtu = new_mtu;

        if (priv->mcast_mtu < priv->admin_mtu)
                ipoib_dbg(priv, "MTU must be smaller than the underlying "
                                "link layer MTU - 4 (%u)\n", priv->mcast_mtu);

        new_mtu = min(priv->mcast_mtu, priv->admin_mtu);

        if (!priv->rn_ops->ndo_change_mtu) {
                WRITE_ONCE(dev->mtu, new_mtu);
                return 0;
        }

        /* Keep the carrier off while the lower level applies the change. */
        carrier_was_on = netif_carrier_ok(dev);
        netif_carrier_off(dev);

        /* notify lower level on the real mtu */
        rc = priv->rn_ops->ndo_change_mtu(dev, new_mtu);

        if (carrier_was_on)
                netif_carrier_on(dev);

        return rc;
}

/*
 * Fill @stats for @dev: use the underlying rn_ops ndo_get_stats64 callback
 * when it exists, otherwise convert the device's own dev->stats counters.
 */
static void ipoib_get_stats(struct net_device *dev,
                            struct rtnl_link_stats64 *stats)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        if (!priv->rn_ops->ndo_get_stats64) {
                netdev_stats_to_stats64(stats, &dev->stats);
                return;
        }

        priv->rn_ops->ndo_get_stats64(dev, stats);
}

/*
 * Check whether the IP address in @addr is configured on @dev.
 *
 * AF_INET addresses are checked with inet_confirm_addr() at host scope and
 * AF_INET6 addresses with ipv6_chk_addr() (only when CONFIG_IPV6 is
 * enabled).  Any other address family never matches.
 *
 * Called with an RCU read lock taken.
 */
static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
                                        struct net_device *dev)
{
        struct net *net = dev_net(dev);

        switch (addr->sa_family) {
        case AF_INET: {
                struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
                struct in_device *in_dev = in_dev_get(dev);
                __be32 confirmed;

                if (!in_dev)
                        return false;

                confirmed = inet_confirm_addr(net, in_dev, 0,
                                              addr_in->sin_addr.s_addr,
                                              RT_SCOPE_HOST);
                /* Drop the reference taken by in_dev_get(). */
                in_dev_put(in_dev);

                return confirmed ? true : false;
        }
        case AF_INET6: {
                struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;

                if (IS_ENABLED(CONFIG_IPV6) &&
                    ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
                        return true;

                return false;
        }
        default:
                return false;
        }
}

/*
 * Find the master net_device on top of the given net_device.
 * @dev: base IPoIB net_device
 *
 * The master is looked up and its reference taken under the RCU read lock.
 *
 * Returns the master net_device with a reference held, or @dev itself
 * (also with a reference held) if no master exists.
 */
static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
{
        struct net_device *upper;

        rcu_read_lock();
        upper = netdev_master_upper_dev_get_rcu(dev);
        dev_hold(upper);
        rcu_read_unlock();

        if (!upper) {
                /* No master: hand back the base device instead. */
                dev_hold(dev);
                upper = dev;
        }

        return upper;
}

/* Context handed to ipoib_upper_walk() through netdev_nested_priv.data. */
struct ipoib_walk_data {
        const struct sockaddr *addr;    /* address being searched for */
        struct net_device *result;      /* matching device, set on a hit */
};

/*
 * Callback for walking upper devices: test @upper against the address in
 * the ipoib_walk_data carried by @priv.
 *
 * On a match, a reference is taken on @upper, it is stored as the walk
 * result and 1 is returned; otherwise 0 is returned.
 * NOTE(review): a non-zero return presumably ends the walk -- confirm
 * against netdev_walk_all_upper_dev_rcu().
 */
static int ipoib_upper_walk(struct net_device *upper,
                            struct netdev_nested_priv *priv)
{
        struct ipoib_walk_data *walk = (struct ipoib_walk_data *)priv->data;

        if (!ipoib_is_dev_match_addr_rcu(walk->addr, upper))
                return 0;

        dev_hold(upper);
        walk->result = upper;

        return 1;
}

/**
 * ipoib_get_net_dev_match_addr - Find a net_device matching
 * the given address, which is an upper device of the given net_device.
 *
 * @addr: IP address to look for.
 * @dev: base IPoIB net_device
 *
 * If found, returns the net_device with a reference held. Otherwise return
 * NULL.
 */
static struct net_device *ipoib_get_net_dev_match_addr(
                const struct sockaddr *addr, struct net_device *dev)
{
        struct netdev_nested_priv priv;
        struct ipoib_walk_data data = {
                .addr = addr,
        };

        priv.data = (void *)&data;
        rcu_read_lock();
        if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
                dev_hold(dev);
                data.result = dev;
                goto out;
        }

        netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv);
out:
        rcu_read_unlock();
        return data.result;
}

/*
 * Count the IPoIB netdevs at or below @priv that match @pkey_index, the
 * optional @gid and the optional @addr.
 *
 * @gid: when NULL, the GID is not compared.
 * @addr: when NULL, the master net_device of a matching interface is used;
 *        otherwise the interface (or one of its uppers) must own the address.
 * @nesting: lock nesting level passed to down_read_nested() for vlan_rwsem.
 * @found_net_dev: receives the first matching net_device, with a reference
 *                 held, if it was still NULL; later matches are released.
 *
 * The recursion over child interfaces stops as soon as more than one match
 * has been counted.
 */
static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
                                     const union ib_gid *gid,
                                     u16 pkey_index,
                                     const struct sockaddr *addr,
                                     int nesting,
                                     struct net_device **found_net_dev)
{
        struct ipoib_dev_priv *child;
        struct net_device *candidate = NULL;
        int count = 0;

        if (priv->pkey_index == pkey_index &&
            (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
                /*
                 * With an address, verify the net_device matches it, as
                 * IPoIB child devices currently share a GID.
                 */
                candidate = addr ?
                        ipoib_get_net_dev_match_addr(addr, priv->dev) :
                        ipoib_get_master_net_dev(priv->dev);
        }

        if (candidate) {
                if (*found_net_dev)
                        dev_put(candidate);
                else
                        *found_net_dev = candidate;
                count++;
        }

        /* Descend into the child interfaces */
        down_read_nested(&priv->vlan_rwsem, nesting);
        list_for_each_entry(child, &priv->child_intfs, list) {
                count += ipoib_match_gid_pkey_addr(child, gid, pkey_index,
                                                   addr, nesting + 1,
                                                   found_net_dev);
                if (count > 1)
                        break;
        }
        up_read(&priv->vlan_rwsem);

        return count;
}

/*
 * Scan @dev_list for IPoIB devices on @port matching the given parameters.
 *
 * Returns the number of matches counted; scanning stops once that number
 * exceeds 1. @net_dev is reset to NULL first and, when at least one match
 * was found, points at a matching net_device with a reference held.
 */
static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port,
                                         u16 pkey_index,
                                         const union ib_gid *gid,
                                         const struct sockaddr *addr,
                                         struct net_device **net_dev)
{
        struct ipoib_dev_priv *entry;
        int total = 0;

        *net_dev = NULL;

        list_for_each_entry(entry, dev_list, list) {
                if (entry->port == port) {
                        total += ipoib_match_gid_pkey_addr(entry, gid,
                                                           pkey_index, addr,
                                                           0, net_dev);
                        if (total > 1)
                                break;
                }
        }

        return total;
}

/*
 * Find the net_device for the given IB device, port, P_Key, GID and address.
 *
 * @client_data: the list of ipoib_dev_priv entries to search.
 *
 * A lookup on the L2 parameters alone (port, pkey index, gid) is tried
 * first. If that is ambiguous, the lookup is repeated with @addr. Returns
 * the net_device with a reference held, or NULL when the port is not an IB
 * port, the P_Key is not found in the cache, or nothing matches.
 */
static struct net_device *ipoib_get_net_dev_by_params(
                struct ib_device *dev, u32 port, u16 pkey,
                const union ib_gid *gid, const struct sockaddr *addr,
                void *client_data)
{
        struct list_head *dev_list = client_data;
        struct net_device *net_dev;
        u16 pkey_index;
        int matches;

        if (!rdma_protocol_ib(dev, port))
                return NULL;

        if (ib_find_cached_pkey(dev, port, pkey, &pkey_index))
                return NULL;

        /* First attempt: is there a unique device for the L2 parameters? */
        matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
                                                gid, NULL, &net_dev);
        if (matches == 0)
                return NULL;
        if (matches == 1)
                return net_dev;

        /* Ambiguous: release the reference taken by the first attempt. */
        dev_put(net_dev);

        /* Second attempt: disambiguate with the L3 address. */
        matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
                                                gid, addr, &net_dev);
        if (matches == 0)
                return NULL;

        if (matches != 1)
                dev_warn_ratelimited(&dev->dev,
                                     "duplicate IP address detected\n");

        return net_dev;
}

/*
 * Switch the administrative IPoIB mode of @dev according to @buf, which is
 * compared against the literal strings "connected\n" and "datagram\n".
 *
 * Returns 0 when the requested mode already matches IPOIB_FLAG_ADMIN_CM or
 * when the switch completed, -EBUSY when rtnl_trylock() fails after the path
 * flush (the function then returns without having re-taken RTNL), and
 * -EINVAL for any other @buf, including "connected\n" when
 * IPOIB_CM_SUPPORTED() is false for the device address.
 *
 * NOTE(review): RTNL is dropped around ipoib_flush_paths(), so this
 * presumably expects to be entered with RTNL held - confirm against callers.
 */
int ipoib_set_mode(struct net_device *dev, const char *buf)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        /* Nothing to do if the requested mode is already the admin mode. */
        if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
             !strcmp(buf, "connected\n")) ||
             (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) &&
             !strcmp(buf, "datagram\n"))) {
                return 0;
        }

        /* flush paths if we switch modes so that connections are restarted */
        if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
                set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
                ipoib_warn(priv, "enabling connected mode "
                           "will cause multicast packet drops\n");
                netdev_update_features(dev);
                dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
                netif_set_real_num_tx_queues(dev, 1);
                rtnl_unlock();
                priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;

                ipoib_flush_paths(dev);
                /* Try to re-take RTNL; report -EBUSY if that fails. */
                return (!rtnl_trylock()) ? -EBUSY : 0;
        }

        if (!strcmp(buf, "datagram\n")) {
                clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
                netdev_update_features(dev);
                /* The MTU is capped at the multicast MTU. */
                dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
                netif_set_real_num_tx_queues(dev, dev->num_tx_queues);
                rtnl_unlock();
                ipoib_flush_paths(dev);
                /* Try to re-take RTNL; report -EBUSY if that fails. */
                return (!rtnl_trylock()) ? -EBUSY : 0;
        }

        return -EINVAL;
}

/*
 * Look up the path whose destination GID equals @gid in the device's path
 * rb-tree. The tree is ordered by memcmp() over the raw GID bytes.
 *
 * Returns the matching path, or NULL when the tree holds no such GID.
 */
struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rb_node *node = priv->path_tree.rb_node;

        while (node) {
                struct ipoib_path *cur = rb_entry(node, struct ipoib_path,
                                                  rb_node);
                int cmp = memcmp(gid, cur->pathrec.dgid.raw,
                                 sizeof(union ib_gid));

                if (!cmp)
                        return cur;

                node = (cmp < 0) ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Insert @path into the device's path rb-tree (keyed by the raw destination
 * GID) and append it to priv->path_list.
 *
 * Returns 0 on success, or -EEXIST when a path with the same GID is already
 * in the tree; in that case nothing is modified.
 */
static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rb_node **link = &priv->path_tree.rb_node;
        struct rb_node *parent = NULL;

        while (*link) {
                struct ipoib_path *cur;
                int cmp;

                parent = *link;
                cur = rb_entry(parent, struct ipoib_path, rb_node);
                cmp = memcmp(path->pathrec.dgid.raw, cur->pathrec.dgid.raw,
                             sizeof(union ib_gid));
                if (!cmp)
                        return -EEXIST;

                link = (cmp < 0) ? &parent->rb_left : &parent->rb_right;
        }

        rb_link_node(&path->rb_node, parent, link);
        rb_insert_color(&path->rb_node, &priv->path_tree);

        list_add_tail(&path->list, &priv->path_list);

        return 0;
}

/*
 * Release @path: free every skb still queued on it, delete the neighbours
 * keyed by its destination GID, drop its address handle reference (if any)
 * and free the path structure itself.
 */
static void path_free(struct net_device *dev, struct ipoib_path *path)
{
        struct sk_buff *pending;

        for (pending = __skb_dequeue(&path->queue); pending;
             pending = __skb_dequeue(&path->queue))
                dev_kfree_skb_irq(pending);

        ipoib_dbg(ipoib_priv(dev), "%s\n", __func__);

        /* every neigh attached to this path goes away with it */
        ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

        if (path->ah)
                ipoib_put_ah(path->ah);

        kfree(path);
}

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG

/*
 * Allocate a path iterator for @dev and position it on the first path.
 *
 * The iterator's GID is zeroed before the first ipoib_path_iter_next() call.
 * Returns the iterator, or NULL when the allocation fails or
 * ipoib_path_iter_next() reports that there is no path to return.
 */
struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
        struct ipoib_path_iter *iter = kmalloc(sizeof(*iter), GFP_KERNEL);

        if (iter) {
                iter->dev = dev;
                memset(iter->path.pathrec.dgid.raw, 0, 16);

                if (ipoib_path_iter_next(iter)) {
                        kfree(iter);
                        iter = NULL;
                }
        }

        return iter;
}

/*
 * Advance @iter to the first path, in rb-tree order, whose destination GID
 * compares greater than the GID currently stored in the iterator, and copy
 * that path into the iterator. The tree is scanned under priv->lock.
 *
 * Returns 0 when such a path was found, 1 otherwise.
 */
int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
        struct ipoib_dev_priv *priv = ipoib_priv(iter->dev);
        struct ipoib_path *cur;
        struct rb_node *node;
        int ret = 1;

        spin_lock_irq(&priv->lock);

        for (node = rb_first(&priv->path_tree); node; node = rb_next(node)) {
                cur = rb_entry(node, struct ipoib_path, rb_node);

                if (memcmp(iter->path.pathrec.dgid.raw, cur->pathrec.dgid.raw,
                           sizeof(union ib_gid)) >= 0)
                        continue;

                iter->path = *cur;
                ret = 0;
                break;
        }

        spin_unlock_irq(&priv->lock);

        return ret;
}

/*
 * Copy the path snapshot currently stored in @iter into @path.
 */
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
                          struct ipoib_path *path)
{
        *path = iter->path;
}

#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */

/*
 * Clear the 'valid' flag of the address handle of every path on the device's
 * path list. Paths without an address handle are only logged. The list is
 * walked under priv->lock with interrupts disabled.
 */
void ipoib_mark_paths_invalid(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_path *entry;

        spin_lock_irq(&priv->lock);

        /* No entry is removed here, so the plain iterator is sufficient. */
        list_for_each_entry(entry, &priv->path_list, list) {
                ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n",
                          be32_to_cpu(sa_path_get_dlid(&entry->pathrec)),
                          entry->pathrec.dgid.raw);
                if (entry->ah)
                        entry->ah->valid = 0;
        }

        spin_unlock_irq(&priv->lock);
}

/*
 * Prepend an ipoib_pseudo_header to @skb and store the INFINIBAND_ALEN-byte
 * destination hardware address @daddr in it.
 */
static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
{
        struct ipoib_pseudo_header *pseudo;

        pseudo = skb_push(skb, sizeof(struct ipoib_pseudo_header));
        memcpy(pseudo->hwaddr, daddr, INFINIBAND_ALEN);
}

/*
 * Remove and free every path record of @dev.
 *
 * All paths are first detached from priv->path_list and erased from the
 * rb-tree while holding the TX lock and priv->lock. Each detached path is
 * then handled in turn: an outstanding SA query is cancelled while the locks
 * are still held, both locks are dropped, the path's completion is waited
 * for and the path is freed with path_free(); the locks are re-taken before
 * moving on to the next entry.
 */
void ipoib_flush_paths(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_path *path, *tp;
        LIST_HEAD(remove_list);
        unsigned long flags;

        netif_tx_lock_bh(dev);
        spin_lock_irqsave(&priv->lock, flags);

        /* Move the whole path list onto a private list. */
        list_splice_init(&priv->path_list, &remove_list);

        list_for_each_entry(path, &remove_list, list)
                rb_erase(&path->rb_node, &priv->path_tree);

        list_for_each_entry_safe(path, tp, &remove_list, list) {
                if (path->query)
                        ib_sa_cancel_query(path->query_id, path->query);
                /* wait_for_completion() may sleep: drop both locks first. */
                spin_unlock_irqrestore(&priv->lock, flags);
                netif_tx_unlock_bh(dev);
                wait_for_completion(&path->done);
                path_free(dev, path);
                netif_tx_lock_bh(dev);
                spin_lock_irqsave(&priv->lock, flags);
        }

        spin_unlock_irqrestore(&priv->lock, flags);
        netif_tx_unlock_bh(dev);
}

/*
 * Completion callback passed to ib_sa_path_rec_get() by path_rec_start().
 * @status: query status; zero is treated as success
 * @pathrec: path record returned by the query (used only when @status is 0)
 * @num_prs: not used by this function
 * @path_ptr: the struct ipoib_path the query was issued for
 *
 * On success an address handle is created from @pathrec, installed in the
 * path and in every neighbour on path->neigh_list, and all skbs queued on
 * the path and on those neighbours are re-submitted with dev_queue_xmit().
 * If no address handle could be created, the neighbours for this GID are
 * deleted instead. In all cases path->query is cleared and path->done is
 * completed.
 */
static void path_rec_completion(int status,
                                struct sa_path_rec *pathrec,
                                unsigned int num_prs, void *path_ptr)
{
        struct ipoib_path *path = path_ptr;
        struct net_device *dev = path->dev;
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_ah *ah = NULL;
        struct ipoib_ah *old_ah = NULL;
        struct ipoib_neigh *neigh, *tn;
        struct sk_buff_head skqueue;
        struct sk_buff *skb;
        unsigned long flags;

        if (!status)
                ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
                          be32_to_cpu(sa_path_get_dlid(pathrec)),
                          pathrec->dgid.raw);
        else
                ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
                          status, path->pathrec.dgid.raw);

        /* Local queue collecting the skbs to re-submit after unlocking. */
        skb_queue_head_init(&skqueue);

        if (!status) {
                struct rdma_ah_attr av;

                /* The address handle is created before priv->lock is taken. */
                if (!ib_init_ah_attr_from_path(priv->ca, priv->port,
                                               pathrec, &av, NULL)) {
                        ah = ipoib_create_ah(dev, priv->pd, &av);
                        rdma_destroy_ah_attr(&av);
                }
        }

        spin_lock_irqsave(&priv->lock, flags);

        if (!IS_ERR_OR_NULL(ah)) {
                /*
                 * pathrec.dgid is used as the database key from the LLADDR,
                 * it must remain unchanged even if the SA returns a different
                 * GID to use in the AH.
                 */
                if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw,
                           sizeof(union ib_gid))) {
                        ipoib_dbg(
                                priv,
                                "%s got PathRec for gid %pI6 while asked for %pI6\n",
                                dev->name, pathrec->dgid.raw,
                                path->pathrec.dgid.raw);
                        memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw,
                               sizeof(union ib_gid));
                }

                path->pathrec = *pathrec;

                /* Keep the previous handle; it is released after unlocking. */
                old_ah   = path->ah;
                path->ah = ah;

                ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
                          ah, be32_to_cpu(sa_path_get_dlid(pathrec)),
                          pathrec->sl);

                /* Collect skbs that were queued directly on the path. */
                while ((skb = __skb_dequeue(&path->queue)))
                        __skb_queue_tail(&skqueue, skb);

                list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
                        if (neigh->ah) {
                                WARN_ON(neigh->ah != old_ah);
                                /*
                                 * Dropping the ah reference inside
                                 * priv->lock is safe here, because we
                                 * will hold one more reference from
                                 * the original value of path->ah (ie
                                 * old_ah).
                                 */
                                ipoib_put_ah(neigh->ah);
                        }
                        /* Each neighbour holds its own reference on the AH. */
                        kref_get(&path->ah->ref);
                        neigh->ah = path->ah;

                        if (ipoib_cm_enabled(dev, neigh->daddr)) {
                                if (!ipoib_cm_get(neigh))
                                        ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
                                                                               path,
                                                                               neigh));
                                /* No CM tx context available: drop the neigh. */
                                if (!ipoib_cm_get(neigh)) {
                                        ipoib_neigh_free(neigh);
                                        continue;
                                }
                        }

                        /* Collect skbs that were queued on the neighbour. */
                        while ((skb = __skb_dequeue(&neigh->queue)))
                                __skb_queue_tail(&skqueue, skb);
                }
                path->ah->valid = 1;
        }

        /* Mark the query as finished and wake up waiters on path->done. */
        path->query = NULL;
        complete(&path->done);

        spin_unlock_irqrestore(&priv->lock, flags);

        if (IS_ERR_OR_NULL(ah))
                ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);

        if (old_ah)
                ipoib_put_ah(old_ah);

        /* Re-submit everything that was waiting for this path. */
        while ((skb = __skb_dequeue(&skqueue))) {
                int ret;
                skb->dev = dev;
                ret = dev_queue_xmit(skb);
                if (ret)
                        ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n",
                                   __func__, ret);
        }
}

static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path,
                          void *gid)
{
        path->dev = priv->dev;

        if (rdma_cap_opa_ah(priv->ca, priv->port))
                path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA;
        else
                path->pathrec.rec_type = SA_PATH_REC_TYPE_IB;

        memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid));
        path->pathrec.sgid            = priv->local_gid;
        path->pathrec.pkey            = cpu_to_be16(priv->pkey);
        path->pathrec.numb_path     = 1;
        path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
}

/*
 * Allocate (GFP_ATOMIC, zeroed) and initialise a path for destination @gid.
 *
 * Returns the new path with its skb queue and neighbour list initialised,
 * or NULL when the device has no broadcast group or the allocation fails.
 * The path is not inserted into the path tree here.
 */
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_path *new_path = NULL;

        if (priv->broadcast)
                new_path = kzalloc(sizeof(*new_path), GFP_ATOMIC);

        if (!new_path)
                return NULL;

        skb_queue_head_init(&new_path->queue);
        INIT_LIST_HEAD(&new_path->neigh_list);
        init_path_rec(priv, new_path, gid);

        return new_path;
}

/*
 * Issue an SA path record query for @path, with path_rec_completion() as
 * the completion callback.
 *
 * path->done is re-initialised before the query is issued. Returns 0 when
 * the query was submitted. On failure the negative value returned by
 * ib_sa_path_rec_get() is returned, path->query is cleared and path->done is
 * completed immediately.
 */
static int path_rec_start(struct net_device *dev,
                          struct ipoib_path *path)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        ipoib_dbg(priv, "Start path record lookup for %pI6\n",
                  path->pathrec.dgid.raw);

        init_completion(&path->done);

        path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca,
                                            priv->port, &path->pathrec,
                                            IB_SA_PATH_REC_DGID |
                                            IB_SA_PATH_REC_SGID |
                                            IB_SA_PATH_REC_NUMB_PATH |
                                            IB_SA_PATH_REC_TRAFFIC_CLASS |
                                            IB_SA_PATH_REC_PKEY,
                                            1000, GFP_ATOMIC,
                                            path_rec_completion,
                                            path, &path->query);
        if (path->query_id >= 0)
                return 0;

        ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
        path->query = NULL;
        complete(&path->done);

        return path->query_id;
}

/*
 * Restart the path record query for the path keyed by the GID at @daddr + 4,
 * provided the path exists and has no query outstanding. The lookup and the
 * query start both happen under priv->lock. @neigh is not used here.
 */
static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr,
                               struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_path *found;
        unsigned long flags;

        spin_lock_irqsave(&priv->lock, flags);

        found = __path_find(dev, daddr + 4);
        if (found && !found->query)
                path_rec_start(dev, found);

        spin_unlock_irqrestore(&priv->lock, flags);
}

/*
 * Allocate a neighbour for @daddr, attach it to the path for the GID at
 * @daddr + 4 (creating the path if it does not exist yet) and then send
 * @skb, queue it until the path is resolved, or drop it.
 *
 * Returns the neighbour only when ipoib_neigh_alloc() handed back an entry
 * that is already linked on a list; in that case @skb has not been consumed
 * and the neighbour reference is left with the caller. In every other case
 * NULL is returned and @skb has been sent, queued or freed.
 */
static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr,
                                          struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rdma_netdev *rn = netdev_priv(dev);
        struct ipoib_path *path;
        struct ipoib_neigh *neigh;
        unsigned long flags;

        spin_lock_irqsave(&priv->lock, flags);
        neigh = ipoib_neigh_alloc(daddr, dev);
        if (!neigh) {
                spin_unlock_irqrestore(&priv->lock, flags);
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
                return NULL;
        }

        /* To avoid race condition, make sure that the
         * neigh will be added only once.
         */
        if (unlikely(!list_empty(&neigh->list))) {
                spin_unlock_irqrestore(&priv->lock, flags);
                return neigh;
        }

        /* The path database is keyed by the GID part of the address. */
        path = __path_find(dev, daddr + 4);
        if (!path) {
                path = path_rec_create(dev, daddr + 4);
                if (!path)
                        goto err_path;

                __path_add(dev, path);
        }

        list_add_tail(&neigh->list, &path->neigh_list);

        if (path->ah && path->ah->valid) {
                /* Path already resolved: share its address handle. */
                kref_get(&path->ah->ref);
                neigh->ah = path->ah;

                if (ipoib_cm_enabled(dev, neigh->daddr)) {
                        if (!ipoib_cm_get(neigh))
                                ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
                        if (!ipoib_cm_get(neigh)) {
                                ipoib_neigh_free(neigh);
                                goto err_drop;
                        }
                        /* Queue the skb on the neigh, up to the queue limit. */
                        if (skb_queue_len(&neigh->queue) <
                            IPOIB_MAX_PATH_REC_QUEUE) {
                                push_pseudo_header(skb, neigh->daddr);
                                __skb_queue_tail(&neigh->queue, skb);
                        } else {
                                ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
                                           skb_queue_len(&neigh->queue));
                                goto err_drop;
                        }
                } else {
                        /* Datagram send happens with priv->lock released. */
                        spin_unlock_irqrestore(&priv->lock, flags);
                        path->ah->last_send = rn->send(dev, skb, path->ah->ah,
                                                       IPOIB_QPN(daddr));
                        ipoib_neigh_put(neigh);
                        return NULL;
                }
        } else {
                neigh->ah  = NULL;

                /* Start a path query unless one is already pending. */
                if (!path->query && path_rec_start(dev, path))
                        goto err_path;
                if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
                        push_pseudo_header(skb, neigh->daddr);
                        __skb_queue_tail(&neigh->queue, skb);
                } else {
                        goto err_drop;
                }
        }

        spin_unlock_irqrestore(&priv->lock, flags);
        ipoib_neigh_put(neigh);
        return NULL;

err_path:
        ipoib_neigh_free(neigh);
err_drop:
        ++dev->stats.tx_dropped;
        dev_kfree_skb_any(skb);

        spin_unlock_irqrestore(&priv->lock, flags);
        ipoib_neigh_put(neigh);

        return NULL;
}

/*
 * Send a unicast @skb to the destination in @phdr->hwaddr, always going
 * through a path lookup.
 *
 * The path for the GID at hwaddr + 4 is looked up under priv->lock. If there
 * is no path, or it has no valid address handle, a path record query is
 * started when none is pending and @skb is queued on the path (up to
 * IPOIB_MAX_PATH_REC_QUEUE entries). Otherwise @skb is handed to rn->send()
 * with the path's address handle after the lock is released. The skb is
 * freed and counted in tx_dropped when there is no broadcast group, a path
 * cannot be created, the query cannot be started or the queue is full.
 */
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
                             struct ipoib_pseudo_header *phdr)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rdma_netdev *rn = netdev_priv(dev);
        struct ipoib_path *path;
        unsigned long flags;

        spin_lock_irqsave(&priv->lock, flags);

        /* no broadcast means that all paths are (going to be) not valid */
        if (!priv->broadcast)
                goto drop_and_unlock;

        path = __path_find(dev, phdr->hwaddr + 4);
        if (!path || !path->ah || !path->ah->valid) {
                if (!path) {
                        path = path_rec_create(dev, phdr->hwaddr + 4);
                        if (!path)
                                goto drop_and_unlock;
                        __path_add(dev, path);
                } else {
                        /*
                         * make sure there are no changes in the existing
                         * path record
                         */
                        init_path_rec(priv, path, phdr->hwaddr + 4);
                }
                /* Start a path query unless one is already pending. */
                if (!path->query && path_rec_start(dev, path)) {
                        goto drop_and_unlock;
                }

                /* Hold the skb on the path until the query completes. */
                if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
                        push_pseudo_header(skb, phdr->hwaddr);
                        __skb_queue_tail(&path->queue, skb);
                        goto unlock;
                } else {
                        goto drop_and_unlock;
                }
        }

        /* Valid address handle: send with priv->lock released. */
        spin_unlock_irqrestore(&priv->lock, flags);
        ipoib_dbg(priv, "Send unicast ARP to %08x\n",
                  be32_to_cpu(sa_path_get_dlid(&path->pathrec)));
        path->ah->last_send = rn->send(dev, skb, path->ah->ah,
                                       IPOIB_QPN(phdr->hwaddr));
        return;

drop_and_unlock:
        ++dev->stats.tx_dropped;
        dev_kfree_skb_any(skb);
unlock:
        spin_unlock_irqrestore(&priv->lock, flags);
}

/*
 * Transmit handler for an IPoIB net_device. Every path through this function
 * returns NETDEV_TX_OK; packets that cannot be sent are either queued on the
 * neighbour or freed and counted in dev->stats.tx_dropped.
 *
 * The skb is expected to start with an ipoib_pseudo_header (holding the
 * destination hardware address) followed by the ipoib_header; the pseudo
 * header is pulled off before the packet is dispatched.
 */
static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rdma_netdev *rn = netdev_priv(dev);
        struct ipoib_neigh *neigh;
        struct ipoib_pseudo_header *phdr;
        struct ipoib_header *header;
        unsigned long flags;

        /* phdr keeps pointing at the pseudo header after it is pulled. */
        phdr = (struct ipoib_pseudo_header *) skb->data;
        skb_pull(skb, sizeof(*phdr));
        header = (struct ipoib_header *) skb->data;

        /* Byte 4 of the hardware address being 0xff selects multicast. */
        if (unlikely(phdr->hwaddr[4] == 0xff)) {
                /* multicast, arrange "if" according to probability */
                if ((header->proto != htons(ETH_P_IP)) &&
                    (header->proto != htons(ETH_P_IPV6)) &&
                    (header->proto != htons(ETH_P_ARP)) &&
                    (header->proto != htons(ETH_P_RARP)) &&
                    (header->proto != htons(ETH_P_TIPC))) {
                        /* ethertype not supported by IPoIB */
                        ++dev->stats.tx_dropped;
                        dev_kfree_skb_any(skb);
                        return NETDEV_TX_OK;
                }
                /* Add in the P_Key for multicast*/
                phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
                phdr->hwaddr[9] = priv->pkey & 0xff;

                neigh = ipoib_neigh_get(dev, phdr->hwaddr);
                if (likely(neigh))
                        goto send_using_neigh;
                ipoib_mcast_send(dev, phdr->hwaddr, skb);
                return NETDEV_TX_OK;
        }

        /* unicast, arrange "switch" according to probability */
        switch (header->proto) {
        case htons(ETH_P_IP):
        case htons(ETH_P_IPV6):
        case htons(ETH_P_TIPC):
                neigh = ipoib_neigh_get(dev, phdr->hwaddr);
                if (unlikely(!neigh)) {
                        /* A NULL return means neigh_add_path() consumed the skb. */
                        neigh = neigh_add_path(skb, phdr->hwaddr, dev);
                        if (likely(!neigh))
                                return NETDEV_TX_OK;
                }
                break;
        case htons(ETH_P_ARP):
        case htons(ETH_P_RARP):
                /* for unicast ARP and RARP should always perform path find */
                unicast_arp_send(skb, dev, phdr);
                return NETDEV_TX_OK;
        default:
                /* ethertype not supported by IPoIB */
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
                return NETDEV_TX_OK;
        }

send_using_neigh:
        /* note we now hold a ref to neigh */
        if (ipoib_cm_get(neigh)) {
                if (ipoib_cm_up(neigh)) {
                        ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
                        goto unref;
                }
        } else if (neigh->ah && neigh->ah->valid) {
                neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah,
                                                IPOIB_QPN(phdr->hwaddr));
                goto unref;
        } else if (neigh->ah) {
                /* Address handle present but not valid: re-query the path. */
                neigh_refresh_path(neigh, phdr->hwaddr, dev);
        }

        /* Not sendable yet: queue on the neigh, restoring the pseudo header. */
        if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
                push_pseudo_header(skb, phdr->hwaddr);
                spin_lock_irqsave(&priv->lock, flags);
                __skb_queue_tail(&neigh->queue, skb);
                spin_unlock_irqrestore(&priv->lock, flags);
        } else {
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
        }

unref:
        ipoib_neigh_put(neigh);

        return NETDEV_TX_OK;
}

/*
 * Transmit timeout handler.
 *
 * When the rdma_netdev provides its own tx_timeout callback, the event is
 * delegated to it and nothing else is done. Otherwise the TX state is logged
 * and priv->tx_timeout_work is scheduled.
 */
static void ipoib_timeout(struct net_device *dev, unsigned int txqueue)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rdma_netdev *rn = netdev_priv(dev);

        if (rn->tx_timeout) {
                rn->tx_timeout(dev, txqueue);
        } else {
                ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
                           jiffies_to_msecs(jiffies - dev_trans_start(dev)));
                ipoib_warn(priv,
                           "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n",
                           netif_queue_stopped(dev), priv->tx_head,
                           priv->tx_tail, priv->global_tx_head,
                           priv->global_tx_tail);

                schedule_work(&priv->tx_timeout_work);
        }
}

/*
 * Work handler for priv->tx_timeout_work: under RTNL, and only if the
 * interface is administratively up, stop and re-open the device. The TX
 * queues are woken only when ipoib_open() succeeds; a failure is logged.
 */
void ipoib_ib_tx_timeout_work(struct work_struct *work)
{
        struct ipoib_dev_priv *priv =
                container_of(work, struct ipoib_dev_priv, tx_timeout_work);
        int err;

        rtnl_lock();

        if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
                ipoib_stop(priv->dev);
                err = ipoib_open(priv->dev);
                if (err) {
                        ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n",
                                   err);
                } else {
                        netif_tx_wake_all_queues(priv->dev);
                }
        }

        rtnl_unlock();
}

/*
 * Build the link-layer header for @skb: an ipoib_header carrying @type in
 * network byte order, preceded by a pseudo header holding @daddr.
 *
 * @saddr and @len are not used. Always returns IPOIB_HARD_LEN.
 */
static int ipoib_hard_header(struct sk_buff *skb,
                             struct net_device *dev,
                             unsigned short type,
                             const void *daddr,
                             const void *saddr,
                             unsigned int len)
{
        struct ipoib_header *hdr = skb_push(skb, sizeof(struct ipoib_header));

        hdr->proto = htons(type);
        hdr->reserved = 0;

        /*
         * The dst_entry is not relied upon: the destination address always
         * travels in the skb hard header so the transmit path can work out
         * where the packet has to go.
         */
        push_pseudo_header(skb, daddr);

        return IPOIB_HARD_LEN;
}

/*
 * Multicast list change handler: queue priv->restart_task on the device
 * workqueue so the multicast state gets re-evaluated.
 *
 * Nothing is queued while IPOIB_FLAG_OPER_UP is clear; that case is only
 * logged at debug level.
 */
static void ipoib_set_mcast_list(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
                /* Terminate the message with a newline like every other log line. */
                ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set\n");
                return;
        }

        queue_work(priv->wq, &priv->restart_task);
}

/*
 * Return the ifindex this device is linked to: the parent's ifindex for a
 * child/vlan interface (IPOIB_FLAG_SUBINTERFACE set), the device's own
 * ifindex otherwise.
 */
static int ipoib_get_iflink(const struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        /* child/vlan interface: report the parent */
        if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
                return READ_ONCE(priv->parent->ifindex);

        /* parent interface: report itself */
        return READ_ONCE(dev->ifindex);
}

/*
 * Hash a hardware address into a bucket index of @htbl.
 *
 * Only the parts of the address that help spreading are hashed: the QPN
 * (word 0 masked with IPOIB_QPN_MASK) and the port GUID (octets [12:20),
 * i.e. words 3 and 4). The subnet prefix is left out, as one can not connect
 * to the same remote port (GUID) using the same remote QPN via two different
 * subnets. The result is reduced with htbl->mask.
 */
static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
{
        u32 *words = (u32 *) daddr;
        u32 qpn = IPOIB_QPN_MASK & words[0];

        return jhash_3words(words[3], words[4], qpn, 0) & htbl->mask;
}

/*
 * Look up the neighbour for @daddr in the hash table of @dev.
 *
 * The walk runs inside an rcu_read_lock_bh() section.  On a hit one
 * reference is taken on behalf of the caller and, unless the neighbour's
 * queue already holds IPOIB_MAX_PATH_REC_QUEUE packets, its ->alive stamp
 * is refreshed.
 *
 * Returns the referenced neighbour, or NULL when there is no hash table,
 * no matching entry, or the matching entry's refcount already hit zero.
 */
struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        struct ipoib_neigh *neigh = NULL;
        u32 bucket;

        rcu_read_lock_bh();

        htbl = rcu_dereference_bh(ntbl->htbl);
        if (!htbl)
                goto out_unlock;

        bucket = ipoib_addr_hash(htbl, daddr);
        neigh = rcu_dereference_bh(htbl->buckets[bucket]);
        while (neigh != NULL) {
                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) != 0) {
                        /* not ours, keep walking the chain */
                        neigh = rcu_dereference_bh(neigh->hnext);
                        continue;
                }

                /* match: try to take the caller's reference */
                if (!refcount_inc_not_zero(&neigh->refcnt))
                        neigh = NULL;   /* entry is already being deleted */
                else if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
                        neigh->alive = jiffies;
                break;
        }

out_unlock:
        rcu_read_unlock_bh();
        return neigh;
}

/*
 * One garbage-collection pass over the neighbour hash table of @priv.
 *
 * Under priv->lock every bucket is walked and each neighbour whose ->alive
 * stamp is older than two arp_tbl.gc_interval periods is unlinked from its
 * hash chain and from its path/mcast list, then handed to call_rcu() so
 * that ipoib_neigh_reclaim() runs after a grace period.  For each such
 * neighbour ipoib_check_and_add_mcast_sendonly() is called with a local
 * list, which is passed to ipoib_mcast_remove_list() once the lock has
 * been dropped.
 */
static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
{
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        unsigned long neigh_obsolete;
        unsigned long dt;
        unsigned long flags;
        int i;
        LIST_HEAD(remove_list);

        spin_lock_irqsave(&priv->lock, flags);

        htbl = rcu_dereference_protected(ntbl->htbl,
                                         lockdep_is_held(&priv->lock));

        /* no table installed: nothing to reap */
        if (!htbl)
                goto out_unlock;

        /* neigh is obsolete if it was idle for two GC periods */
        dt = 2 * arp_tbl.gc_interval;
        neigh_obsolete = jiffies - dt;

        for (i = 0; i < htbl->size; i++) {
                struct ipoib_neigh *neigh;
                /* np always points at the link that references the current entry */
                struct ipoib_neigh __rcu **np = &htbl->buckets[i];

                while ((neigh = rcu_dereference_protected(*np,
                                                          lockdep_is_held(&priv->lock))) != NULL) {
                        /* was the neigh idle for two GC periods */
                        if (time_after(neigh_obsolete, neigh->alive)) {

                                /* daddr + 4 skips the first four octets of the hardware address */
                                ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);

                                /* unlink: np stays put and now references the successor */
                                rcu_assign_pointer(*np,
                                                   rcu_dereference_protected(neigh->hnext,
                                                                             lockdep_is_held(&priv->lock)));
                                /* remove from path/mc list */
                                list_del_init(&neigh->list);
                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
                        } else {
                                /* keep this entry, advance to its next link */
                                np = &neigh->hnext;
                        }

                }
        }

out_unlock:
        spin_unlock_irqrestore(&priv->lock, flags);
        /* runs without priv->lock; the list is empty if nothing was collected */
        ipoib_mcast_remove_list(&remove_list);
}

/*
 * Delayed-work handler behind priv->neigh_reap_task: run one reaping pass
 * and re-arm itself on priv->wq after arp_tbl.gc_interval.
 */
static void ipoib_reap_neigh(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct ipoib_dev_priv *priv =
                container_of(dwork, struct ipoib_dev_priv, neigh_reap_task);

        __ipoib_reap_neigh(priv);

        /* periodic task: schedule the next pass */
        queue_delayed_work(priv->wq, dwork, arp_tbl.gc_interval);
}


/*
 * Allocate (GFP_ATOMIC) and initialise a neighbour for @daddr on @dev.
 *
 * The new object is not linked anywhere; it carries a single reference
 * that belongs to the caller.  Returns NULL if the allocation fails.
 */
static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
                                      struct net_device *dev)
{
        struct ipoib_neigh *n;

        n = kzalloc(sizeof(struct ipoib_neigh), GFP_ATOMIC);
        if (n) {
                n->dev = dev;
                memcpy(&n->daddr, daddr, sizeof(n->daddr));
                skb_queue_head_init(&n->queue);
                INIT_LIST_HEAD(&n->list);
                ipoib_cm_set(n, NULL);
                /* the initial reference is the caller's */
                refcount_set(&n->refcnt, 1);
        }

        return n;
}

/*
 * Find or create the neighbour for @daddr in the hash table of @dev.
 *
 * No lock is taken here; the lockdep_is_held() annotations document that
 * priv->lock is expected to be held by the caller.
 *
 * If an entry for @daddr already exists and a reference can still be
 * taken, that entry is returned with its ->alive stamp refreshed.
 * Otherwise a new neighbour is constructed, given a second reference on
 * behalf of the hash table, inserted at the head of its bucket and counted
 * in ntbl->entries.
 *
 * Returns the neighbour with one reference held for the caller, or NULL
 * when there is no hash table or the allocation fails.
 */
struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
                                      struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        struct ipoib_neigh *neigh;
        u32 hash_val;

        htbl = rcu_dereference_protected(ntbl->htbl,
                                         lockdep_is_held(&priv->lock));
        if (!htbl) {
                neigh = NULL;
                goto out_unlock;
        }

        /* need to add a new neigh, but maybe some other thread succeeded?
         * recalc hash, maybe hash resize took place so we do a search
         */
        hash_val = ipoib_addr_hash(htbl, daddr);
        for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
                                               lockdep_is_held(&priv->lock));
             neigh != NULL;
             neigh = rcu_dereference_protected(neigh->hnext,
                                               lockdep_is_held(&priv->lock))) {
                if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
                        /* found, take one ref on behalf of the caller */
                        if (!refcount_inc_not_zero(&neigh->refcnt)) {
                                /* deleted */
                                /* stop searching and fall through to creating a fresh entry */
                                neigh = NULL;
                                break;
                        }
                        neigh->alive = jiffies;
                        goto out_unlock;
                }
        }

        /* no usable entry: build a new one (refcount 1, owned by the caller) */
        neigh = ipoib_neigh_ctor(daddr, dev);
        if (!neigh)
                goto out_unlock;

        /* one ref on behalf of the hash table */
        refcount_inc(&neigh->refcnt);
        neigh->alive = jiffies;
        /* put in hash */
        /* set ->hnext first, then publish the entry as the new bucket head */
        rcu_assign_pointer(neigh->hnext,
                           rcu_dereference_protected(htbl->buckets[hash_val],
                                                     lockdep_is_held(&priv->lock)));
        rcu_assign_pointer(htbl->buckets[hash_val], neigh);
        atomic_inc(&ntbl->entries);

        /* despite its name, nothing is unlocked at this label */
out_unlock:

        return neigh;
}

/*
 * Final teardown of a neighbour whose reference count has dropped to zero.
 *
 * Drops the address handle, frees every packet still queued on the
 * neighbour (each counted in dev->stats.tx_dropped), destroys the
 * connected-mode TX context if one is attached, and frees the object.
 * When this was the last entry of the table and IPOIB_NEIGH_TBL_FLUSH is
 * set, the ntbl.flushed completion is signalled.
 */
void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
{
        struct net_device *dev = neigh->dev;
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct sk_buff *skb;

        if (neigh->ah)
                ipoib_put_ah(neigh->ah);

        for (skb = __skb_dequeue(&neigh->queue); skb;
             skb = __skb_dequeue(&neigh->queue)) {
                ++dev->stats.tx_dropped;
                dev_kfree_skb_any(skb);
        }

        if (ipoib_cm_get(neigh))
                ipoib_cm_destroy_tx(ipoib_cm_get(neigh));

        /* log before the object goes away; the arguments read neigh->daddr */
        ipoib_dbg(priv, "neigh free for %06x %pI6\n",
                  IPOIB_QPN(neigh->daddr), neigh->daddr + 4);
        kfree(neigh);

        if (atomic_dec_and_test(&priv->ntbl.entries) &&
            test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
                complete(&priv->ntbl.flushed);
}

/*
 * RCU callback queued when a neighbour is removed from the hash table:
 * drop one reference on it.  The TX context may still hold another one.
 */
static void ipoib_neigh_reclaim(struct rcu_head *rp)
{
        ipoib_neigh_put(container_of(rp, struct ipoib_neigh, rcu));
}

/*
 * Remove @neigh from the neighbour hash table of its device.
 *
 * No lock is taken here; the lockdep_is_held() annotations document that
 * priv->lock is expected to be held by the caller.
 *
 * The bucket that @neigh->daddr hashes to is searched for the entry itself
 * (pointer comparison).  When found it is unlinked from the chain and from
 * its parent list, and ipoib_neigh_reclaim() is scheduled through
 * call_rcu().  Nothing happens if there is no hash table or the entry is
 * not on the chain.
 */
void ipoib_neigh_free(struct ipoib_neigh *neigh)
{
        struct net_device *dev = neigh->dev;
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        struct ipoib_neigh __rcu **np;
        struct ipoib_neigh *n;
        u32 hash_val;

        htbl = rcu_dereference_protected(ntbl->htbl,
                                        lockdep_is_held(&priv->lock));
        if (!htbl)
                return;

        hash_val = ipoib_addr_hash(htbl, neigh->daddr);
        /* np tracks the link that references the entry under inspection */
        np = &htbl->buckets[hash_val];
        for (n = rcu_dereference_protected(*np,
                                            lockdep_is_held(&priv->lock));
             n != NULL;
             n = rcu_dereference_protected(*np,
                                        lockdep_is_held(&priv->lock))) {
                if (n == neigh) {
                        /* found */
                        /* splice the entry out by pointing its predecessor at its successor */
                        rcu_assign_pointer(*np,
                                           rcu_dereference_protected(neigh->hnext,
                                                                     lockdep_is_held(&priv->lock)));
                        /* remove from parent list */
                        list_del_init(&neigh->list);
                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
                        return;
                } else {
                        np = &n->hnext;
                }
        }
}

/*
 * Create the neighbour hash table of @priv and start garbage collection.
 *
 * The bucket count is arp_tbl.gc_thresh3 rounded up to a power of two, so
 * that ->mask (count - 1) can be used to reduce a hash value.  The first
 * reaping pass is queued on priv->wq after arp_tbl.gc_interval.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails.
 */
static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
{
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        u32 nbuckets;

        clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
        ntbl->htbl = NULL;

        htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
        if (!htbl)
                return -ENOMEM;

        nbuckets = roundup_pow_of_two(arp_tbl.gc_thresh3);
        htbl->buckets = kvcalloc(nbuckets, sizeof(*htbl->buckets), GFP_KERNEL);
        if (!htbl->buckets)
                goto err_free_htbl;

        htbl->size = nbuckets;
        htbl->mask = nbuckets - 1;
        RCU_INIT_POINTER(ntbl->htbl, htbl);
        htbl->ntbl = ntbl;
        atomic_set(&ntbl->entries, 0);

        /* start garbage collection */
        queue_delayed_work(priv->wq, &priv->neigh_reap_task,
                           arp_tbl.gc_interval);

        return 0;

err_free_htbl:
        kfree(htbl);
        return -ENOMEM;
}

/*
 * RCU callback that releases a neighbour hash table: free the bucket array
 * and the table itself, then signal the owning table's ->deleted
 * completion.
 */
static void neigh_hash_free_rcu(struct rcu_head *head)
{
        struct ipoib_neigh_hash *htbl =
                container_of(head, struct ipoib_neigh_hash, rcu);
        /* fetch the back pointer before htbl is freed */
        struct ipoib_neigh_table *ntbl = htbl->ntbl;

        kvfree(htbl->buckets);
        kfree(htbl);
        complete(&ntbl->deleted);
}

/*
 * Remove every neighbour of @dev whose GID equals @gid.
 *
 * The whole hash table is scanned under priv->lock.  The GID of an entry
 * is compared starting at daddr + 4, over sizeof(union ib_gid) bytes.
 * Matching entries are unlinked from their hash chain and from their
 * parent list, and ipoib_neigh_reclaim() is scheduled via call_rcu().
 */
void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        unsigned long flags;
        int i;

        /* remove all neigh connected to a given path or mcast */
        spin_lock_irqsave(&priv->lock, flags);

        htbl = rcu_dereference_protected(ntbl->htbl,
                                         lockdep_is_held(&priv->lock));

        if (!htbl)
                goto out_unlock;

        for (i = 0; i < htbl->size; i++) {
                struct ipoib_neigh *neigh;
                /* np points at the link that references the current entry */
                struct ipoib_neigh __rcu **np = &htbl->buckets[i];

                while ((neigh = rcu_dereference_protected(*np,
                                                          lockdep_is_held(&priv->lock))) != NULL) {
                        /* delete neighs belong to this parent */
                        if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
                                /* unlink: np stays put and now references the successor */
                                rcu_assign_pointer(*np,
                                                   rcu_dereference_protected(neigh->hnext,
                                                                             lockdep_is_held(&priv->lock)));
                                /* remove from parent list */
                                list_del_init(&neigh->list);
                                call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
                        } else {
                                np = &neigh->hnext;
                        }

                }
        }
out_unlock:
        spin_unlock_irqrestore(&priv->lock, flags);
}

/*
 * Empty and release the neighbour hash table of @priv.
 *
 * IPOIB_NEIGH_TBL_FLUSH is set first so that ipoib_neigh_dtor() signals
 * ntbl.flushed when the entry count reaches zero.  Under priv->lock every
 * neighbour is unlinked and handed to call_rcu(); the table pointer is
 * then cleared and the table itself is freed through neigh_hash_free_rcu().
 * If the table held entries when the flush started, the function blocks
 * until ntbl.flushed completes.
 */
static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
{
        struct ipoib_neigh_table *ntbl = &priv->ntbl;
        struct ipoib_neigh_hash *htbl;
        unsigned long flags;
        int i, wait_flushed = 0;

        init_completion(&priv->ntbl.flushed);
        set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);

        spin_lock_irqsave(&priv->lock, flags);

        htbl = rcu_dereference_protected(ntbl->htbl,
                                        lockdep_is_held(&priv->lock));
        if (!htbl)
                goto out_unlock;

        /* non-zero entry count doubles as the "must wait" flag below */
        wait_flushed = atomic_read(&priv->ntbl.entries);
        if (!wait_flushed)
                goto free_htbl;

        for (i = 0; i < htbl->size; i++) {
                struct ipoib_neigh *neigh;
                struct ipoib_neigh __rcu **np = &htbl->buckets[i];

                /* every entry is removed, so np never advances past the bucket head */
                while ((neigh = rcu_dereference_protected(*np,
                                       lockdep_is_held(&priv->lock))) != NULL) {
                        rcu_assign_pointer(*np,
                                           rcu_dereference_protected(neigh->hnext,
                                                                     lockdep_is_held(&priv->lock)));
                        /* remove from path/mc list */
                        list_del_init(&neigh->list);
                        call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
                }
        }

free_htbl:
        /* unpublish the table, then free it after a grace period */
        rcu_assign_pointer(ntbl->htbl, NULL);
        call_rcu(&htbl->rcu, neigh_hash_free_rcu);

out_unlock:
        spin_unlock_irqrestore(&priv->lock, flags);
        /* wait outside the spinlock for the last neighbour to be destroyed */
        if (wait_flushed)
                wait_for_completion(&priv->ntbl.flushed);
}

static void ipoib_neigh_hash_uninit(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        ipoib_dbg(priv, "%s\n", __func__);
        init_completion(&priv->ntbl.deleted);

        cancel_delayed_work_sync(&priv->neigh_reap_task);

        ipoib_flush_neighs(priv);

        wait_for_completion(&priv->ntbl.deleted);
}

/*
 * Register the receive and send NAPI contexts of @dev, with weights
 * IPOIB_NUM_WC and MAX_SEND_CQE respectively.
 */
static void ipoib_napi_add(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct napi_struct *rx_napi = &priv->recv_napi;
        struct napi_struct *tx_napi = &priv->send_napi;

        netif_napi_add_weight(dev, rx_napi, ipoib_rx_poll, IPOIB_NUM_WC);
        netif_napi_add_weight(dev, tx_napi, ipoib_tx_poll, MAX_SEND_CQE);
}

/* Unregister the receive and send NAPI contexts added by ipoib_napi_add(). */
static void ipoib_napi_del(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct napi_struct *rx_napi = &priv->recv_napi;
        struct napi_struct *tx_napi = &priv->send_napi;

        netif_napi_del(rx_napi);
        netif_napi_del(tx_napi);
}

/*
 * Default ndo_uninit: release what ipoib_dev_init_default() set up, i.e.
 * the transport resources, the NAPI contexts, the connected-mode state and
 * both packet rings.  The ring pointers are reset to NULL afterwards.
 */
static void ipoib_dev_uninit_default(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        ipoib_transport_dev_cleanup(dev);
        ipoib_napi_del(dev);
        ipoib_cm_dev_cleanup(dev);

        /* the RX ring comes from kcalloc(), the TX ring from vzalloc() */
        kfree(priv->rx_ring);
        priv->rx_ring = NULL;

        vfree(priv->tx_ring);
        priv->tx_ring = NULL;
}

static int ipoib_dev_init_default(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        u8 addr_mod[3];

        ipoib_napi_add(dev);

        /* Allocate RX/TX "rings" to hold queued skbs */
        priv->rx_ring =        kcalloc(ipoib_recvq_size,
                                       sizeof(*priv->rx_ring),
                                       GFP_KERNEL);
        if (!priv->rx_ring)
                goto out;

        priv->tx_ring = vzalloc(array_size(ipoib_sendq_size,
                                           sizeof(*priv->tx_ring)));
        if (!priv->tx_ring) {
                pr_warn("%s: failed to allocate TX ring (%d entries)\n",
                        priv->ca->name, ipoib_sendq_size);
                goto out_rx_ring_cleanup;
        }

        /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */

        if (ipoib_transport_dev_init(dev, priv->ca)) {
                pr_warn("%s: ipoib_transport_dev_init failed\n",
                        priv->ca->name);
                goto out_tx_ring_cleanup;
        }

        /* after qp created set dev address */
        addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff;
        addr_mod[1] = (priv->qp->qp_num >>  8) & 0xff;
        addr_mod[2] = (priv->qp->qp_num) & 0xff;
        dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod));

        return 0;

out_tx_ring_cleanup:
        vfree(priv->tx_ring);

out_rx_ring_cleanup:
        kfree(priv->rx_ring);

out:
        ipoib_napi_del(dev);
        return -ENOMEM;
}

static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr,
                       int cmd)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        if (!priv->rn_ops->ndo_eth_ioctl)
                return -EOPNOTSUPP;

        return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd);
}

/*
 * Bring up the per-device resources: the ordered workqueue, the protection
 * domain, the hardware resources of the underlying rdma netdev and the
 * neighbour hash table.  If the interface is already flagged IFF_UP the IB
 * device is opened as well.
 *
 * Returns 0 on success or a negative errno.  On failure everything that
 * was set up here is torn down again in reverse order, and priv->wq and
 * priv->pd are left NULL.
 */
static int ipoib_dev_init(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        int ret = -ENOMEM;

        priv->qp = NULL;

        /*
         * the various IPoIB tasks assume they will never race against
         * themselves, so always use a single thread workqueue
         */
        priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM);
        if (!priv->wq) {
                pr_warn("%s: failed to allocate device WQ\n", dev->name);
                goto out;
        }

        /* create pd, which used both for control and datapath*/
        priv->pd = ib_alloc_pd(priv->ca, 0);
        if (IS_ERR(priv->pd)) {
                pr_warn("%s: failed to allocate PD\n", priv->ca->name);
                /*
                 * Cleanup code tests priv->pd against NULL only, so an
                 * error pointer must not be left in the field.
                 */
                priv->pd = NULL;
                goto clean_wq;
        }

        ret = priv->rn_ops->ndo_init(dev);
        if (ret) {
                pr_warn("%s failed to init HW resource\n", dev->name);
                goto out_free_pd;
        }

        ret = ipoib_neigh_hash_init(priv);
        if (ret) {
                pr_warn("%s failed to init neigh hash\n", dev->name);
                goto out_dev_uninit;
        }

        if (dev->flags & IFF_UP) {
                if (ipoib_ib_dev_open(dev)) {
                        pr_warn("%s failed to open device\n", dev->name);
                        ret = -ENODEV;
                        goto out_hash_uninit;
                }
        }

        return 0;

out_hash_uninit:
        ipoib_neigh_hash_uninit(dev);

out_dev_uninit:
        ipoib_ib_dev_cleanup(dev);

out_free_pd:
        if (priv->pd) {
                ib_dealloc_pd(priv->pd);
                priv->pd = NULL;
        }

clean_wq:
        if (priv->wq) {
                destroy_workqueue(priv->wq);
                priv->wq = NULL;
        }

out:
        return ret;
}

/*
 * This must be called before doing an unregister_netdev on a parent device to
 * shutdown the IB event handler.
 */
static void ipoib_parent_unregister_pre(struct net_device *ndev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(ndev);

        /*
         * ipoib_set_mac checks netif_running before pushing work, clearing
         * running ensures the it will not add more work.
         */
        rtnl_lock();
        dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL);
        rtnl_unlock();

        /* ipoib_event() cannot be running once this returns */
        ib_unregister_event_handler(&priv->event_handler);

        /*
         * Work on the queue grabs the rtnl lock, so this cannot be done while
         * also holding it.
         */
        flush_workqueue(ipoib_workqueue);
}

/*
 * Cache the capability flags of the IB device in @priv and derive the
 * netdev offload features from them: checksum offloads when the HCA has
 * IB_DEVICE_UD_IP_CSUM, plus TSO when IBK_UD_TSO is also available.
 */
static void ipoib_set_dev_features(struct ipoib_dev_priv *priv)
{
        priv->hca_caps = priv->ca->attrs.device_cap_flags;
        priv->kernel_caps = priv->ca->attrs.kernel_cap_flags;

        /* without UD checksum offload no feature is advertised at all */
        if (!(priv->hca_caps & IB_DEVICE_UD_IP_CSUM))
                return;

        priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
        if (priv->kernel_caps & IBK_UD_TSO)
                priv->dev->hw_features |= NETIF_F_TSO;

        /* enable everything that can be toggled */
        priv->dev->features |= priv->dev->hw_features;
}

static int ipoib_parent_init(struct net_device *ndev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
        struct ib_port_attr attr;
        int result;

        result = ib_query_port(priv->ca, priv->port, &attr);
        if (result) {
                pr_warn("%s: ib_query_port %d failed\n", priv->ca->name,
                        priv->port);
                return result;
        }
        priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr);

        result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
        if (result) {
                pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n",
                        priv->ca->name, priv->port, result);
                return result;
        }

        result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid);
        if (result) {
                pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n",
                        priv->ca->name, priv->port, result);
                return result;
        }
        dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid));

        SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent);
        priv->dev->dev_port = priv->port - 1;
        /* Let's set this one too for backwards compatibility. */
        priv->dev->dev_id = priv->port - 1;

        return 0;
}

/*
 * Initialise a child interface from its parent: inherit the maximum IB
 * MTU and mark the device as a sub-interface.  If the child already has a
 * non-zero hardware address its GID part (from offset 4) becomes the
 * local GID; otherwise both the hardware address and the local GID are
 * copied from the parent.
 */
static void ipoib_child_init(struct net_device *ndev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
        struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent);

        priv->max_ib_mtu = ppriv->max_ib_mtu;
        set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);

        if (!memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN)) {
                /* address is all zeroes: take everything from the parent */
                __dev_addr_set(priv->dev, ppriv->dev->dev_addr,
                               INFINIBAND_ALEN);
                memcpy(&priv->local_gid, &ppriv->local_gid,
                       sizeof(priv->local_gid));
                return;
        }

        memcpy(&priv->local_gid, priv->dev->dev_addr + 4,
               sizeof(priv->local_gid));
}

/*
 * ndo_init callback shared by parent and child interfaces.
 *
 * Runs the parent- or child-specific initialisation, sets up the MTU
 * values, the P_Key (with the full-membership bit) and the broadcast
 * address, derives the offload features and then brings up the device
 * resources through ipoib_dev_init().  A child additionally takes a
 * reference on its parent and links itself into the parent's child list.
 *
 * Returns 0 on success or a negative errno.
 */
static int ipoib_ndo_init(struct net_device *ndev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(ndev);
        struct rdma_netdev *rn = netdev_priv(ndev);
        struct ipoib_dev_priv *ppriv;
        int rc;

        if (!priv->parent) {
                rc = ipoib_parent_init(ndev);
                if (rc)
                        return rc;
        } else {
                ipoib_child_init(ndev);
        }

        /* MTU will be reset when mcast join happens */
        ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
        priv->admin_mtu = ndev->mtu;
        priv->mcast_mtu = priv->admin_mtu;
        rn->mtu = priv->mcast_mtu;
        ndev->max_mtu = IPOIB_CM_MTU;

        ndev->neigh_priv_len = sizeof(struct ipoib_neigh);

        /*
         * The full membership bit is set so that the right broadcast
         * group etc. is joined; the P_Key is then stored in octets 8 and 9
         * of the broadcast address.
         */
        priv->pkey |= 0x8000;
        ndev->broadcast[8] = priv->pkey >> 8;
        ndev->broadcast[9] = priv->pkey & 0xff;
        set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

        ipoib_set_dev_features(priv);

        rc = ipoib_dev_init(ndev);
        if (rc) {
                pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n",
                        priv->ca->name, priv->dev->name, priv->port, rc);
                return rc;
        }

        if (!priv->parent)
                return 0;

        /* child: pin the parent and join its list of child interfaces */
        ppriv = ipoib_priv(priv->parent);

        dev_hold(priv->parent);

        down_write(&ppriv->vlan_rwsem);
        list_add_tail(&priv->list, &ppriv->child_intfs);
        up_write(&ppriv->vlan_rwsem);

        return 0;
}

/*
 * ndo_uninit callback: undo ipoib_ndo_init().  Must run under RTNL.
 *
 * A child is removed from its parent's child list first; then the
 * neighbour table, the IB resources and the workqueue are released and the
 * reference on the parent is dropped.
 */
static void ipoib_ndo_uninit(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ipoib_dev_priv *ppriv;

        ASSERT_RTNL();

        /*
         * ipoib_remove_one guarantees that children are removed before
         * their parent, and it is the only place that removes a parent.
         */
        WARN_ON(!list_empty(&priv->child_intfs));

        if (priv->parent) {
                ppriv = ipoib_priv(priv->parent);

                down_write(&ppriv->vlan_rwsem);
                list_del(&priv->list);
                up_write(&ppriv->vlan_rwsem);
        }

        ipoib_neigh_hash_uninit(dev);
        ipoib_ib_dev_cleanup(dev);

        /* nothing may use priv->wq from here on */
        if (priv->wq) {
                /* See ipoib_mcast_carrier_on_task() */
                WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags));
                destroy_workqueue(priv->wq);
                priv->wq = NULL;
        }

        dev_put(priv->parent);
}

/* ndo_set_vf_link_state callback: pass the request on to the IB device. */
static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ib_device *ca = priv->ca;

        return ib_set_vf_link_state(ca, vf, priv->port, link_state);
}

/*
 * ndo_get_vf_config callback: fetch the VF configuration from the IB
 * device, then fill in the VF index and copy this netdev's own hardware
 * address into @ivf->mac.
 *
 * Returns 0 on success or the error from ib_get_vf_config(), in which case
 * @ivf->vf and @ivf->mac are not written here.
 */
static int ipoib_get_vf_config(struct net_device *dev, int vf,
                               struct ifla_vf_info *ivf)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        int err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);

        if (!err) {
                ivf->vf = vf;
                memcpy(ivf->mac, dev->dev_addr, dev->addr_len);
        }

        return err;
}

/*
 * ndo_set_vf_guid callback: set the node or port GUID of a VF.  Any @type
 * other than IFLA_VF_IB_NODE_GUID / IFLA_VF_IB_PORT_GUID is rejected with
 * -EINVAL.
 */
static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        switch (type) {
        case IFLA_VF_IB_NODE_GUID:
        case IFLA_VF_IB_PORT_GUID:
                return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
        default:
                return -EINVAL;
        }
}

/* ndo_get_vf_guid callback: read a VF's node and port GUID from the IB device. */
static int ipoib_get_vf_guid(struct net_device *dev, int vf,
                             struct ifla_vf_guid *node_guid,
                             struct ifla_vf_guid *port_guid)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ib_device *ca = priv->ca;

        return ib_get_vf_guid(ca, vf, priv->port, node_guid, port_guid);
}

/* ndo_get_vf_stats callback: read a VF's statistics from the IB device. */
static int ipoib_get_vf_stats(struct net_device *dev, int vf,
                              struct ifla_vf_stats *vf_stats)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct ib_device *ca = priv->ca;

        return ib_get_vf_stats(ca, vf, priv->port, vf_stats);
}

/* Link-layer header callbacks installed by ipoib_setup_common(). */
static const struct header_ops ipoib_header_ops = {
        .create = ipoib_hard_header,
};

static const struct net_device_ops ipoib_netdev_ops_pf = {
        .ndo_init                 = ipoib_ndo_init,
        .ndo_uninit                 = ipoib_ndo_uninit,
        .ndo_open                 = ipoib_open,
        .ndo_stop                 = ipoib_stop,
        .ndo_change_mtu                 = ipoib_change_mtu,
        .ndo_fix_features         = ipoib_fix_features,
        .ndo_start_xmit                 = ipoib_start_xmit,
        .ndo_tx_timeout                 = ipoib_timeout,
        .ndo_set_rx_mode         = ipoib_set_mcast_list,
        .ndo_get_iflink                 = ipoib_get_iflink,
        .ndo_set_vf_link_state         = ipoib_set_vf_link_state,
        .ndo_get_vf_config         = ipoib_get_vf_config,
        .ndo_get_vf_stats         = ipoib_get_vf_stats,
        .ndo_get_vf_guid         = ipoib_get_vf_guid,
        .ndo_set_vf_guid         = ipoib_set_vf_guid,
        .ndo_set_mac_address         = ipoib_set_mac,
        .ndo_get_stats64         = ipoib_get_stats,
        .ndo_eth_ioctl                 = ipoib_ioctl,
};

/*
 * netdev ops selected by ipoib_intf_init() when the IB device reports
 * IBK_VIRTUAL_FUNCTION: the common callbacks only, without the VF
 * management and MAC address hooks.
 */
static const struct net_device_ops ipoib_netdev_ops_vf = {
        .ndo_init               = ipoib_ndo_init,
        .ndo_uninit             = ipoib_ndo_uninit,
        .ndo_open               = ipoib_open,
        .ndo_stop               = ipoib_stop,
        .ndo_change_mtu         = ipoib_change_mtu,
        .ndo_fix_features       = ipoib_fix_features,
        .ndo_start_xmit         = ipoib_start_xmit,
        .ndo_tx_timeout         = ipoib_timeout,
        .ndo_set_rx_mode        = ipoib_set_mcast_list,
        .ndo_get_iflink         = ipoib_get_iflink,
        .ndo_get_stats64        = ipoib_get_stats,
        .ndo_eth_ioctl          = ipoib_ioctl,
};

/*
 * Default low-level ops installed by ipoib_setup_common().
 * ipoib_intf_init() saves whatever ops are installed at that point in
 * priv->rn_ops before switching the netdev to the _pf/_vf table.
 */
static const struct net_device_ops ipoib_netdev_default_pf = {
        .ndo_init       = ipoib_dev_init_default,
        .ndo_uninit     = ipoib_dev_uninit_default,
        .ndo_open       = ipoib_ib_dev_open_default,
        .ndo_stop       = ipoib_ib_dev_stop_default,
};

/*
 * netdev setup callback used for every IPoIB netdev: install the header
 * and default netdev ops and fill in the link-layer parameters (type,
 * address and header length, queue length, flags, features, broadcast
 * address).
 */
void ipoib_setup_common(struct net_device *dev)
{
        dev->header_ops = &ipoib_header_ops;
        dev->netdev_ops = &ipoib_netdev_default_pf;

        ipoib_set_ethtool_ops(dev);

        dev->watchdog_timeo = 10 * HZ;

        dev->flags |= IFF_BROADCAST | IFF_MULTICAST;

        dev->type = ARPHRD_INFINIBAND;
        dev->addr_len = INFINIBAND_ALEN;
        dev->hard_header_len = IPOIB_HARD_LEN;
        dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
        dev->features = NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA;
        netif_keep_dst(dev);

        memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

        /*
         * unregister_netdev always frees the netdev.  This mode is used
         * consistently so that all unregister paths look alike, including
         * the rtnl_link_ops ones, which require it.
         */
        dev->needs_free_netdev = true;
}

/*
 * Initialise the software state of the private data of @dev: back pointer,
 * locks, list heads and all work items.  No resources are allocated here.
 */
static void ipoib_build_priv(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);

        priv->dev = dev;

        /* locks */
        spin_lock_init(&priv->lock);
        init_rwsem(&priv->vlan_rwsem);
        mutex_init(&priv->mcast_mutex);

        /* lists */
        INIT_LIST_HEAD(&priv->path_list);
        INIT_LIST_HEAD(&priv->child_intfs);
        INIT_LIST_HEAD(&priv->dead_ahs);
        INIT_LIST_HEAD(&priv->multicast_list);

        /* immediate work items */
        INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
        INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work);
        INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
        INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
        INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
        INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
        INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work);

        /* delayed work items */
        INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
        INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
        INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
}

/*
 * Allocate the netdev for an IPoIB interface.
 *
 * rdma_alloc_netdev() is tried first.  Only when it fails with
 * -EOPNOTSUPP is a plain netdev with room for a struct rdma_netdev
 * allocated instead.
 *
 * Returns the netdev or an ERR_PTR(); never NULL.
 */
static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port,
                                             const char *name)
{
        struct net_device *dev;

        dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
                                NET_NAME_UNKNOWN, ipoib_setup_common);
        if (IS_ERR(dev) && PTR_ERR(dev) == -EOPNOTSUPP) {
                /* no rdma netdev support on this device: use the generic one */
                dev = alloc_netdev(sizeof(struct rdma_netdev), name,
                                   NET_NAME_UNKNOWN, ipoib_setup_common);
                if (!dev)
                        dev = ERR_PTR(-ENOMEM);
        }

        return dev;
}

/*
 * Attach freshly allocated IPoIB private data to @dev.
 *
 * rdma_init_netdev() is tried first.  If it reports -EOPNOTSUPP the
 * rdma_netdev callbacks are filled in with the IPoIB implementations and
 * the device is limited to one TX and one RX queue.  The ops installed at
 * that point are remembered in priv->rn_ops before the netdev is switched
 * to the VF or PF ops table.
 *
 * Returns 0 on success or a negative errno, in which case the private data
 * has been freed again.
 */
int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name,
                    struct net_device *dev)
{
        struct rdma_netdev *rn = netdev_priv(dev);
        struct ipoib_dev_priv *priv;
        int rc;

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        priv->ca = hca;
        priv->port = port;

        rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name,
                              NET_NAME_UNKNOWN, ipoib_setup_common, dev);
        if (rc == -EOPNOTSUPP) {
                /* fall back to the IPoIB implementations */
                rn->send = ipoib_send;
                rn->attach_mcast = ipoib_mcast_attach;
                rn->detach_mcast = ipoib_mcast_detach;
                rn->hca = hca;

                rc = netif_set_real_num_tx_queues(dev, 1);
                if (!rc)
                        rc = netif_set_real_num_rx_queues(dev, 1);
        }
        if (rc)
                goto out;

        priv->rn_ops = dev->netdev_ops;

        dev->netdev_ops = (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION) ?
                          &ipoib_netdev_ops_vf : &ipoib_netdev_ops_pf;

        rn->clnt_priv = priv;

        /*
         * Only the child register_netdev flows can cope with
         * priv_destructor being set.  It is therefore saved and forced to
         * NULL here, and handled manually until it is safe to turn on.
         */
        priv->next_priv_destructor = dev->priv_destructor;
        dev->priv_destructor = NULL;

        ipoib_build_priv(dev);

        return 0;

out:
        kfree(priv);
        return rc;
}

/*
 * Allocate a netdev via ipoib_alloc_netdev() and initialise it with
 * ipoib_intf_init().
 *
 * Returns the new netdev or an ERR_PTR(). On success the caller must make
 * sure ipoib_intf_free() is eventually called, either directly or because
 * register_netdevice succeeded and priv_destructor was set to
 * ipoib_intf_free.
 */
struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port,
                                    const char *name)
{
        struct net_device *ndev = ipoib_alloc_netdev(hca, port, name);
        int err;

        if (IS_ERR(ndev))
                return ndev;

        err = ipoib_intf_init(hca, port, name, ndev);
        if (!err)
                return ndev;

        /* Initialisation failed: release the netdev allocated above. */
        free_netdev(ndev);
        return ERR_PTR(err);
}

/*
 * Release the IPoIB private state attached to @dev.
 *
 * The destructor saved in priv->next_priv_destructor is put back into
 * dev->priv_destructor and, if set, invoked before priv itself is freed.
 */
void ipoib_intf_free(struct net_device *dev)
{
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        struct rdma_netdev *rn = netdev_priv(dev);

        /* Restore the saved destructor first so it is in place when called. */
        dev->priv_destructor = priv->next_priv_destructor;
        if (dev->priv_destructor)
                dev->priv_destructor(dev);

        /*
         * There are some error flows around register_netdev failing that may
         * attempt to call priv_destructor twice, prevent that from happening.
         */
        dev->priv_destructor = NULL;

        /* unregister/destroy is very complicated. Make bugs more obvious. */
        rn->clnt_priv = NULL;

        kfree(priv);
}

/* sysfs "pkey" read handler: prints priv->pkey as 0x%04x. */
static ssize_t pkey_show(struct device *dev, struct device_attribute *attr,
                         char *buf)
{
        struct ipoib_dev_priv *priv = ipoib_priv(to_net_dev(dev));

        return sysfs_emit(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR_RO(pkey);

/* sysfs "umcast" read handler: prints the state of IPOIB_FLAG_UMCAST. */
static ssize_t umcast_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct ipoib_dev_priv *priv = ipoib_priv(to_net_dev(dev));
        int enabled = test_bit(IPOIB_FLAG_UMCAST, &priv->flags);

        return sysfs_emit(buf, "%d\n", enabled);
}

/*
 * Set IPOIB_FLAG_UMCAST when @umcast_val is positive (and log a warning),
 * clear it otherwise.
 */
void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
{
        struct ipoib_dev_priv *priv = ipoib_priv(ndev);

        if (umcast_val <= 0) {
                clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
                return;
        }

        set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
        ipoib_warn(priv, "ignoring multicast groups joined directly "
                        "by userspace\n");
}

/*
 * sysfs "umcast" write handler: parses @buf with simple_strtoul() and passes
 * the value to ipoib_set_umcast(). Always consumes the whole write.
 */
static ssize_t umcast_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        struct net_device *ndev = to_net_dev(dev);
        unsigned long requested = simple_strtoul(buf, NULL, 0);

        ipoib_set_umcast(ndev, requested);

        return count;
}
static DEVICE_ATTR_RW(umcast);

/* Create the "umcast" sysfs file on @dev; returns device_create_file()'s result. */
int ipoib_add_umcast_attr(struct net_device *dev)
{
        struct device *sysfs_dev = &dev->dev;

        return device_create_file(sysfs_dev, &dev_attr_umcast);
}

/*
 * Copy the interface_id of @gid into priv->local_gid, refresh the netdev
 * address from it and clear IPOIB_FLAG_DEV_ADDR_SET, all under the netdev
 * address lock. Unless IPOIB_FLAG_SUBINTERFACE is set on @priv, the same
 * update is then applied to every entry on priv->child_intfs.
 */
static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
{
        struct net_device *ndev = priv->dev;
        struct ipoib_dev_priv *child;

        netif_addr_lock_bh(ndev);

        memcpy(&priv->local_gid.global.interface_id,
               &gid->global.interface_id,
               sizeof(gid->global.interface_id));
        /* The GID starts at byte offset 4 of the device address. */
        dev_addr_mod(ndev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid));
        clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);

        netif_addr_unlock_bh(ndev);

        if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
                return;

        down_read(&priv->vlan_rwsem);
        list_for_each_entry(child, &priv->child_intfs, list)
                set_base_guid(child, gid);
        up_read(&priv->vlan_rwsem);
}

static int ipoib_check_lladdr(struct net_device *dev,
                              struct sockaddr_storage *ss)
{
        union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
        int ret = 0;

        netif_addr_lock_bh(dev);

        /* Make sure the QPN, reserved and subnet prefix match the current
         * lladdr, it also makes sure the lladdr is unicast.
         */
        if (memcmp(dev->dev_addr, ss->__data,
                   4 + sizeof(gid->global.subnet_prefix)) ||
            gid->global.interface_id == 0)
                ret = -EINVAL;

        netif_addr_unlock_bh(dev);

        return ret;
}

/*
 * Change the device's link-layer address.
 *
 * Returns -EBUSY if the device is running without IFF_LIVE_ADDR_CHANGE,
 * the error from ipoib_check_lladdr() if the address is rejected, and 0
 * after updating the base GUID and queueing priv->flush_light.
 */
static int ipoib_set_mac(struct net_device *dev, void *addr)
{
        struct sockaddr_storage *ss = addr;
        struct ipoib_dev_priv *priv = ipoib_priv(dev);
        int err;

        if (netif_running(dev) && !(dev->priv_flags & IFF_LIVE_ADDR_CHANGE))
                return -EBUSY;

        err = ipoib_check_lladdr(dev, ss);
        if (err)
                return err;

        /* The GID starts 4 bytes into the address data. */
        set_base_guid(priv, (union ib_gid *)(ss->__data + 4));

        queue_work(ipoib_workqueue, &priv->flush_light);

        return 0;
}

/*
 * sysfs "create_child" write handler: parses a pkey from @buf and calls
 * ipoib_vlan_add(). Rejects values outside 1..0xffff and 0x8000 with
 * -EINVAL. Returns @count on success or the error from ipoib_vlan_add().
 */
static ssize_t create_child_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        int pkey;
        int err;

        if (sscanf(buf, "%i", &pkey) != 1)
                return -EINVAL;

        if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
                return -EINVAL;

        err = ipoib_vlan_add(to_net_dev(dev), pkey);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_WO(create_child);

/*
 * sysfs "delete_child" write handler: parses a pkey from @buf and calls
 * ipoib_vlan_delete(). Values outside 0..0xffff are rejected with -EINVAL.
 * Returns @count on success or the error from ipoib_vlan_delete().
 */
static ssize_t delete_child_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf, size_t count)
{
        int pkey;
        int err;

        if (sscanf(buf, "%i", &pkey) != 1)
                return -EINVAL;

        if (pkey < 0 || pkey > 0xffff)
                return -EINVAL;

        err = ipoib_vlan_delete(to_net_dev(dev), pkey);
        if (err)
                return err;

        return count;
}
static DEVICE_ATTR_WO(delete_child);

/* Create the "pkey" sysfs file on @dev; returns device_create_file()'s result. */
int ipoib_add_pkey_attr(struct net_device *dev)
{
        struct device *sysfs_dev = &dev->dev;

        return device_create_file(sysfs_dev, &dev_attr_pkey);
}

/*
 * We erroneously exposed the iface's port number in the dev_id
 * sysfs field long after dev_port was introduced for that purpose[1],
 * and we need to stop everyone from relying on that.
 * Let's overload the shower routine for the dev_id file here
 * to gently bring the issue up.
 *
 * [1] https://www.spinics.net/lists/netdev/msg272123.html
 */
static ssize_t dev_id_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        struct net_device *ndev = to_net_dev(dev);

        /*
         * ndev->dev_port will be equal to 0 in old kernel prior to commit
         * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface
         * port numbers") Zero was chosen as special case for user space
         * applications to fallback and query dev_id to check if it has
         * different value or not.
         *
         * Don't print warning in such scenario.
         *
         * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
         */
        if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
                netdev_info_once(ndev,
                        "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
                        current->comm);

        return sysfs_emit(buf, "%#x\n", ndev->dev_id);
}
static DEVICE_ATTR_RO(dev_id);

/*
 * Replace the "dev_id" sysfs file on @dev with this driver's version:
 * remove the existing file, then create ours. Returns the result of the
 * create step.
 */
static int ipoib_intercept_dev_id_attr(struct net_device *dev)
{
        struct device *sysfs_dev = &dev->dev;

        device_remove_file(sysfs_dev, &dev_attr_dev_id);
        return device_create_file(sysfs_dev, &dev_attr_dev_id);
}

/*
 * Create, register and publish the IPoIB netdev for one port of @hca.
 *
 * @format: netdev name format passed on to ipoib_intf_alloc()
 * @hca:    IB device the port belongs to
 * @port:   port number
 *
 * Returns the registered netdev or an ERR_PTR(). When creating one of the
 * sysfs attributes fails, the netdev is unregistered again and the errno
 * of the failing step is returned (previously this was always -ENOMEM).
 */
static struct net_device *ipoib_add_port(const char *format,
                                         struct ib_device *hca, u32 port)
{
        struct rtnl_link_ops *ops = ipoib_get_link_ops();
        struct rdma_netdev_alloc_params params;
        struct ipoib_dev_priv *priv;
        struct net_device *ndev;
        int result;

        ndev = ipoib_intf_alloc(hca, port, format);
        if (IS_ERR(ndev)) {
                pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port,
                        PTR_ERR(ndev));
                return ndev;
        }
        priv = ipoib_priv(ndev);

        INIT_IB_EVENT_HANDLER(&priv->event_handler,
                              priv->ca, ipoib_event);
        ib_register_event_handler(&priv->event_handler);

        /* call event handler to ensure pkey in sync */
        queue_work(ipoib_workqueue, &priv->flush_heavy);

        ndev->rtnl_link_ops = ipoib_get_link_ops();

        result = register_netdev(ndev);
        if (result) {
                pr_warn("%s: couldn't register ipoib port %d; error %d\n",
                        hca->name, port, result);

                /* priv_destructor is not armed yet: tear down by hand. */
                ipoib_parent_unregister_pre(ndev);
                ipoib_intf_free(ndev);
                free_netdev(ndev);

                return ERR_PTR(result);
        }

        if (hca->ops.rdma_netdev_get_params) {
                int rc = hca->ops.rdma_netdev_get_params(hca, port,
                                                     RDMA_NETDEV_IPOIB,
                                                     &params);

                /* params is only valid when the query succeeded. */
                if (!rc && ops->priv_size < params.sizeof_priv)
                        ops->priv_size = params.sizeof_priv;
        }
        /*
         * We cannot set priv_destructor before register_netdev because we
         * need priv to be always valid during the error flow to execute
         * ipoib_parent_unregister_pre(). Instead handle it manually and only
         * enter priv_destructor mode once we are completely registered.
         */
        ndev->priv_destructor = ipoib_intf_free;

        /* Keep the errno of whichever attribute fails so it can be reported. */
        result = ipoib_intercept_dev_id_attr(ndev);
        if (result)
                goto sysfs_failed;
        result = ipoib_cm_add_mode_attr(ndev);
        if (result)
                goto sysfs_failed;
        result = ipoib_add_pkey_attr(ndev);
        if (result)
                goto sysfs_failed;
        result = ipoib_add_umcast_attr(ndev);
        if (result)
                goto sysfs_failed;
        result = device_create_file(&ndev->dev, &dev_attr_create_child);
        if (result)
                goto sysfs_failed;
        result = device_create_file(&ndev->dev, &dev_attr_delete_child);
        if (result)
                goto sysfs_failed;

        return ndev;

sysfs_failed:
        ipoib_parent_unregister_pre(ndev);
        unregister_netdev(ndev);
        /*
         * Callers test the return value with IS_ERR(), so never hand back a
         * non-negative code; fall back to -ENOMEM in that unexpected case.
         */
        return ERR_PTR(result < 0 ? result : -ENOMEM);
}

static int ipoib_add_one(struct ib_device *device)
{
        struct list_head *dev_list;
        struct net_device *dev;
        struct ipoib_dev_priv *priv;
        unsigned int p;
        int count = 0;

        dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL);
        if (!dev_list)
                return -ENOMEM;

        INIT_LIST_HEAD(dev_list);

        rdma_for_each_port (device, p) {
                if (!rdma_protocol_ib(device, p))
                        continue;
                dev = ipoib_add_port("ib%d", device, p);
                if (!IS_ERR(dev)) {
                        priv = ipoib_priv(dev);
                        list_add_tail(&priv->list, dev_list);
                        count++;
                }
        }

        if (!count) {
                kfree(dev_list);
                return -EOPNOTSUPP;
        }

        ib_set_client_data(device, &ipoib_client, dev_list);
        return 0;
}

/*
 * Client "remove" callback: unregister every netdev recorded on the list
 * passed as @client_data, together with the entries on each one's
 * child_intfs list, then free the list head.
 */
static void ipoib_remove_one(struct ib_device *device, void *client_data)
{
        struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv;
        struct list_head *dev_list = client_data;

        list_for_each_entry_safe(priv, tmp, dev_list, list) {
                /* Per-parent batch of netdevs to unregister in one go. */
                LIST_HEAD(head);
                ipoib_parent_unregister_pre(priv->dev);

                rtnl_lock();

                /* Queue the children first, then the parent itself. */
                list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs,
                                         list)
                        unregister_netdevice_queue(cpriv->dev, &head);
                unregister_netdevice_queue(priv->dev, &head);
                unregister_netdevice_many(&head);

                rtnl_unlock();
        }

        kfree(dev_list);
}

/*
 * Netdevice notifier that dispatches to ipoib_netdev_event(); only built
 * when CONFIG_INFINIBAND_IPOIB_DEBUG is enabled.
 */
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
static struct notifier_block ipoib_netdev_notifier = {
        .notifier_call = ipoib_netdev_event,
};
#endif

/*
 * Module entry point: sanitise the queue-size parameters, then bring up
 * debugfs, the global flush workqueue, the SA client, the IB client and
 * the netlink ops, in that order. On failure the steps already done are
 * undone in reverse via the err_* labels and the error is returned.
 */
static int __init ipoib_init_module(void)
{
        int ret;

        /* Round up to a power of two, cap at the maximum, raise to the minimum. */
        ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
        ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
        ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);

        /* Same for the send queue, which must also be at least 2 * MAX_SEND_CQE. */
        ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
        ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
        ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
#ifdef CONFIG_INFINIBAND_IPOIB_CM
        /* Clamp to the range 0..IPOIB_CM_MAX_CONN_QP. */
        ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
        ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0);
#endif

        /*
         * When copying small received packets, we only copy from the
         * linear data part of the SKB, so we rely on this condition.
         */
        BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);

        ipoib_register_debugfs();

        /*
         * We create a global workqueue here that is used for all flush
         * operations.  However, if you attempt to flush a workqueue
         * from a task on that same workqueue, it deadlocks the system.
         * We want to be able to flush the tasks associated with a
         * specific net device, so we also create a workqueue for each
         * netdevice.  We queue up the tasks for that device only on
         * its private workqueue, and we only queue up flush events
         * on our global flush workqueue.  This avoids the deadlocks.
         */
        ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0);
        if (!ipoib_workqueue) {
                ret = -ENOMEM;
                goto err_fs;
        }

        ib_sa_register_client(&ipoib_sa_client);

        ret = ib_register_client(&ipoib_client);
        if (ret)
                goto err_sa;

        ret = ipoib_netlink_init();
        if (ret)
                goto err_client;

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
        /* NOTE(review): the return value of this registration is not checked. */
        register_netdevice_notifier(&ipoib_netdev_notifier);
#endif
        return 0;

err_client:
        ib_unregister_client(&ipoib_client);

err_sa:
        /* Also tears down the workqueue created before the SA client. */
        ib_sa_unregister_client(&ipoib_sa_client);
        destroy_workqueue(ipoib_workqueue);

err_fs:
        ipoib_unregister_debugfs();

        return ret;
}

/*
 * Module exit: unregister the debug notifier (if built), the netlink ops,
 * the IB client and the SA client, then remove debugfs and destroy the
 * global flush workqueue.
 */
static void __exit ipoib_cleanup_module(void)
{
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
        unregister_netdevice_notifier(&ipoib_netdev_notifier);
#endif
        ipoib_netlink_fini();
        ib_unregister_client(&ipoib_client);
        ib_sa_unregister_client(&ipoib_sa_client);
        ipoib_unregister_debugfs();
        destroy_workqueue(ipoib_workqueue);
}

/* Hook the init and exit routines into the module loader. */
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);

















    1 






    8 





    8 

    1 








  102 








  147 
  148 












  102 































  148 






  149 




  149 

  102 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// SPDX-License-Identifier: GPL-2.0
/*
 * Based on arch/arm/mm/extable.c
 */

#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/uaccess.h>

#include <asm/asm-extable.h>
#include <asm/esr.h>
#include <asm/ptrace.h>

/*
 * Return true when the direction of the fault reported in @esr (the
 * ESR_ELx_WNR bit) matches the direction of the user access recorded in
 * the EX_DATA_UACCESS_WRITE field of @ex.
 */
static bool cpy_faulted_on_uaccess(const struct exception_table_entry *ex,
                                   unsigned long esr)
{
        bool user_side_writes = FIELD_GET(EX_DATA_UACCESS_WRITE, ex->data);
        bool faulted_writing = esr & ESR_ELx_WNR;

        if (user_side_writes)
                return faulted_writing;
        return !faulted_writing;
}

/*
 * Report whether the instruction at @addr has an exception table entry
 * that covers a user access. Entries of type EX_TYPE_UACCESS_CPY are
 * decided by cpy_faulted_on_uaccess(); any other entry yields true, and
 * no entry at all yields false.
 */
bool insn_may_access_user(unsigned long addr, unsigned long esr)
{
        const struct exception_table_entry *entry;

        entry = search_exception_tables(addr);
        if (!entry)
                return false;

        if (entry->type == EX_TYPE_UACCESS_CPY)
                return cpy_faulted_on_uaccess(entry, esr);

        return true;
}

/*
 * Compute the fixup address of @ex: the stored fixup value is an offset
 * relative to the address of the fixup field itself.
 */
static inline unsigned long
get_ex_fixup(const struct exception_table_entry *ex)
{
        unsigned long field_addr = (unsigned long)&ex->fixup;

        return field_addr + ex->fixup;
}

/*
 * Fixup handler: store -EFAULT in the register named by EX_DATA_REG_ERR,
 * store 0 in the register named by EX_DATA_REG_ZERO, and resume at the
 * fixup address. Always returns true.
 */
static bool ex_handler_uaccess_err_zero(const struct exception_table_entry *ex,
                                        struct pt_regs *regs)
{
        int err_reg = FIELD_GET(EX_DATA_REG_ERR, ex->data);
        int zero_reg = FIELD_GET(EX_DATA_REG_ZERO, ex->data);

        /* Error code first, then the zeroed register. */
        pt_regs_write_reg(regs, err_reg, -EFAULT);
        pt_regs_write_reg(regs, zero_reg, 0);

        regs->pc = get_ex_fixup(ex);

        return true;
}

/*
 * Fixup handler for EX_TYPE_UACCESS_CPY entries: resume at the fixup
 * address only when cpy_faulted_on_uaccess() says the fault was on the
 * user side; faults on kernel memory accesses are not fixed up.
 */
static bool ex_handler_uaccess_cpy(const struct exception_table_entry *ex,
                                   struct pt_regs *regs, unsigned long esr)
{
        if (cpy_faulted_on_uaccess(ex, esr)) {
                regs->pc = get_ex_fixup(ex);
                return true;
        }

        /* Do not fix up faults on kernel memory accesses */
        return false;
}

/*
 * Fixup handler for a faulting unaligned load: re-read the data from the
 * enclosing 8-byte-aligned word, shift it by the byte offset of the
 * original address so the vacated bytes are zero, write it to the data
 * register, and resume at the fixup address. Always returns true.
 */
static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
                                  struct pt_regs *regs)
{
        int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data);
        int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
        unsigned long data, addr, offset;

        addr = pt_regs_read_reg(regs, reg_addr);

        /* Split the address into its aligned base and the offset within it. */
        offset = addr & 0x7UL;
        addr &= ~0x7UL;

        data = *(unsigned long*)addr;

        /* Shift direction depends on byte order (__AARCH64EB__ = big-endian build). */
#ifndef __AARCH64EB__
        data >>= 8 * offset;
#else
        data <<= 8 * offset;
#endif

        pt_regs_write_reg(regs, reg_data, data);

        regs->pc = get_ex_fixup(ex);
        return true;
}

/*
 * Look up the exception table entry for the faulting instruction and run
 * the handler matching its type.
 *
 * Returns false when there is no entry, otherwise the handler's result.
 * An entry with an unrecognised type hits BUG().
 */
bool fixup_exception(struct pt_regs *regs, unsigned long esr)
{
        unsigned long fault_pc = instruction_pointer(regs);
        const struct exception_table_entry *entry;

        entry = search_exception_tables(fault_pc);
        if (!entry)
                return false;

        switch (entry->type) {
        case EX_TYPE_BPF:
                return ex_handler_bpf(entry, regs);
        case EX_TYPE_UACCESS_ERR_ZERO:
        case EX_TYPE_KACCESS_ERR_ZERO:
                /* User and kernel variants share one handler. */
                return ex_handler_uaccess_err_zero(entry, regs);
        case EX_TYPE_UACCESS_CPY:
                return ex_handler_uaccess_cpy(entry, regs, esr);
        case EX_TYPE_LOAD_UNALIGNED_ZEROPAD:
                return ex_handler_load_unaligned_zeropad(entry, regs);
        }

        BUG();
}































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MEMREMAP_H_
#define _LINUX_MEMREMAP_H_

#include <linux/mmzone.h>
#include <linux/range.h>
#include <linux/ioport.h>
#include <linux/percpu-refcount.h>

struct resource;
struct device;

/**
 * struct vmem_altmap - pre-allocated storage for vmemmap_populate
 * @base_pfn: base of the entire dev_pagemap mapping
 * @end_pfn: end pfn of the mapping (NOTE(review): inclusive vs. exclusive
 *	is not visible in this header - confirm against the users)
 * @reserve: pages mapped, but reserved for driver use (relative to @base_pfn)
 * @free: free pages set aside in the mapping for memmap storage
 * @align: pages reserved to meet allocation alignments
 * @alloc: track pages consumed, private to vmemmap_populate()
 * @inaccessible: NOTE(review): semantics not visible in this header;
 *	presumably marks the altmap memory as not directly accessible - confirm
 */
struct vmem_altmap {
        unsigned long base_pfn;
        const unsigned long end_pfn;
        const unsigned long reserve;
        unsigned long free;
        unsigned long align;
        unsigned long alloc;
        bool inaccessible;
};

/*
 * Specialize ZONE_DEVICE memory into multiple types each has a different
 * usage.
 *
 * MEMORY_DEVICE_PRIVATE:
 * Device memory that is not directly addressable by the CPU: CPU can neither
 * read nor write private memory. In this case, we do still have struct pages
 * backing the device memory. Doing so simplifies the implementation, but it is
 * important to remember that there are certain points at which the struct page
 * must be treated as an opaque object, rather than a "normal" struct page.
 *
 * A more complete discussion of unaddressable memory may be found in
 * include/linux/hmm.h and Documentation/mm/hmm.rst.
 *
 * MEMORY_DEVICE_COHERENT:
 * Device memory that is cache coherent from device and CPU point of view. This
 * is used on platforms that have an advanced system bus (like CAPI or CXL). A
 * driver can hotplug the device memory using ZONE_DEVICE and with that memory
 * type. Any page of a process can be migrated to such memory. However no one
 * should be allowed to pin such memory so that it can always be evicted.
 *
 * MEMORY_DEVICE_FS_DAX:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. In support of coordinating page
 * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a
 * wakeup event whenever a page is unpinned and becomes idle. This
 * wakeup is used to coordinate physical address space management (ex:
 * fs truncate/hole punch) vs pinned pages (ex: device dma).
 *
 * MEMORY_DEVICE_GENERIC:
 * Host memory that has similar access semantics as System RAM i.e. DMA
 * coherent and supports page pinning. This is for example used by DAX devices
 * that expose memory using a character device.
 *
 * MEMORY_DEVICE_PCI_P2PDMA:
 * Device memory residing in a PCI BAR intended for use with Peer-to-Peer
 * transactions.
 */
/* The ZONE_DEVICE memory types; values start at 1 so that 0 stays invalid. */
enum memory_type {
        /* 0 is reserved to catch uninitialized type fields */
        MEMORY_DEVICE_PRIVATE = 1,
        MEMORY_DEVICE_COHERENT,
        MEMORY_DEVICE_FS_DAX,
        MEMORY_DEVICE_GENERIC,
        MEMORY_DEVICE_PCI_P2PDMA,
};

/*
 * Callback table attached to a struct dev_pagemap through its ops pointer.
 * Callers in this header check a callback for NULL before relying on it
 * (see pgmap_has_memory_failure()).
 */
struct dev_pagemap_ops {
        /*
         * Called once the page refcount reaches 0.  The reference count will be
         * reset to one by the core code after the method is called to prepare
         * for handing out the page again.
         */
        void (*page_free)(struct page *page);

        /*
         * Used for private (un-addressable) device memory only.  Must migrate
         * the page back to a CPU accessible page.
         */
        vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf);

        /*
         * Handle a memory failure that happens on a range of pfns.  Notify the
         * processes who are using these pfns, and try to recover the data on
         * them if necessary.  The mf_flags is finally passed to the recover
         * function through the whole notify routine.
         *
         * When this is not implemented, or it returns -EOPNOTSUPP, the caller
         * will fall back to a common handler called mf_generic_kill_procs().
         */
        int (*memory_failure)(struct dev_pagemap *pgmap, unsigned long pfn,
                              unsigned long nr_pages, int mf_flags);
};

/* dev_pagemap.flags bit: the embedded altmap is valid (see pgmap_altmap()). */
#define PGMAP_ALTMAP_VALID        (1 << 0)

/**
 * struct dev_pagemap - metadata for ZONE_DEVICE mappings
 * @altmap: pre-allocated/reserved memory for vmemmap allocations
 * @ref: reference count that pins the devm_memremap_pages() mapping
 * @done: completion for @ref
 * @type: memory type: see MEMORY_* above in memremap.h
 * @flags: PGMAP_* flags to specify detailed behavior
 * @vmemmap_shift: structural definition of how the vmemmap page metadata
 *      is populated, specifically the metadata page order.
 *        A zero value (default) uses base pages as the vmemmap metadata
 *        representation. A bigger value will set up compound struct pages
 *        of the requested order value.
 * @ops: method table
 * @owner: an opaque pointer identifying the entity that manages this
 *        instance.  Used by various helpers to make sure that no
 *        foreign ZONE_DEVICE memory is accessed.
 * @nr_range: number of ranges to be mapped
 * @range: range to be mapped when nr_range == 1
 * @ranges: array of ranges to be mapped when nr_range > 1
 */
struct dev_pagemap {
        struct vmem_altmap altmap;
        struct percpu_ref ref;
        struct completion done;
        enum memory_type type;
        unsigned int flags;
        unsigned long vmemmap_shift;
        const struct dev_pagemap_ops *ops;
        void *owner;
        int nr_range;
        /* @range and @ranges overlay each other; @nr_range selects which applies. */
        union {
                struct range range;
                DECLARE_FLEX_ARRAY(struct range, ranges);
        };
};

static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
{
        return pgmap->ops && pgmap->ops->memory_failure;
}

/* Return @pgmap's embedded altmap if PGMAP_ALTMAP_VALID is set, else NULL. */
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
        return (pgmap->flags & PGMAP_ALTMAP_VALID) ? &pgmap->altmap : NULL;
}

/*
 * Return 2^vmemmap_shift for @pgmap.
 *
 * The shift is done on an unsigned long constant so the result has the
 * width of the return type; shifting a plain int 1 would be undefined for
 * shifts of 31 or more.
 */
static inline unsigned long pgmap_vmemmap_nr(struct dev_pagemap *pgmap)
{
        return 1UL << pgmap->vmemmap_shift;
}

static inline bool is_device_private_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_DEVICE_PRIVATE) &&
                is_zone_device_page(page) &&
                page_pgmap(page)->type == MEMORY_DEVICE_PRIVATE;
}

/* Folio variant of is_device_private_page(), applied to &folio->page. */
static inline bool folio_is_device_private(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_device_private_page(page);
}

static inline bool is_pci_p2pdma_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_PCI_P2PDMA) &&
                is_zone_device_page(page) &&
                page_pgmap(page)->type == MEMORY_DEVICE_PCI_P2PDMA;
}

static inline bool is_device_coherent_page(const struct page *page)
{
        return is_zone_device_page(page) &&
                page_pgmap(page)->type == MEMORY_DEVICE_COHERENT;
}

/* Folio variant of is_device_coherent_page(), applied to &folio->page. */
static inline bool folio_is_device_coherent(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_device_coherent_page(page);
}

static inline bool is_fsdax_page(const struct page *page)
{
        return is_zone_device_page(page) &&
                page_pgmap(page)->type == MEMORY_DEVICE_FS_DAX;
}

/* Folio variant of is_fsdax_page(), applied to &folio->page. */
static inline bool folio_is_fsdax(const struct folio *folio)
{
        const struct page *page = &folio->page;

        return is_fsdax_page(page);
}

/*
 * ZONE_DEVICE API. With CONFIG_ZONE_DEVICE the functions are real and
 * defined elsewhere; without it, inline stubs are provided for the subset
 * below so that callers still compile.
 */
#ifdef CONFIG_ZONE_DEVICE
void zone_device_page_init(struct page *page);
void *memremap_pages(struct dev_pagemap *pgmap, int nid);
void memunmap_pages(struct dev_pagemap *pgmap);
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap);
bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn);

unsigned long memremap_compat_align(void);
#else
/* Stub: warns once and returns ERR_PTR(-ENXIO). */
static inline void *devm_memremap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
        /*
         * Fail attempts to call devm_memremap_pages() without
         * ZONE_DEVICE support enabled, this requires callers to fall
         * back to plain devm_memremap() based on config
         */
        WARN_ON_ONCE(1);
        return ERR_PTR(-ENXIO);
}

/* Stub: nothing to unmap. */
static inline void devm_memunmap_pages(struct device *dev,
                struct dev_pagemap *pgmap)
{
}

/* Stub: there are no device pagemaps, so the lookup always yields NULL. */
static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
                struct dev_pagemap *pgmap)
{
        return NULL;
}

/* Stub: no pfn is ever valid for a pagemap. */
static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn)
{
        return false;
}

/* when memremap_pages() is disabled all archs can remap a single page */
static inline unsigned long memremap_compat_align(void)
{
        return PAGE_SIZE;
}
#endif /* CONFIG_ZONE_DEVICE */

/* Drop one reference on @pgmap's percpu ref; a NULL @pgmap is ignored. */
static inline void put_dev_pagemap(struct dev_pagemap *pgmap)
{
        if (!pgmap)
                return;

        percpu_ref_put(&pgmap->ref);
}

#endif /* _LINUX_MEMREMAP_H_ */

































































































































































































































































































































































































































































































    3 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#ifndef __LINUX_NEXTHOP_H
#define __LINUX_NEXTHOP_H

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/route.h>
#include <linux/types.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/netlink.h>

/* Mask of nexthop flags accepted from user space (currently only RTNH_F_ONLINK). */
#define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK

struct nexthop;

/* Configuration of a single nexthop or nexthop group.
 * NOTE(review): appears to be the parsed form of a netlink request (it
 * carries struct nlattr pointers, nlflags and nl_info) - confirm against
 * the code that fills it in.
 */
struct nh_config {
        u32                nh_id;

        u8                nh_family;
        u8                nh_protocol;
        u8                nh_blackhole;
        u8                nh_fdb;
        u32                nh_flags;

        int                nh_ifindex;
        struct net_device *dev;

        /* gateway address, in IPv4 or IPv6 form */
        union {
                __be32                ipv4;
                struct in6_addr        ipv6;
        } gw;

        /* group settings; the nh_grp_res_* fields presumably apply to
         * resilient groups only, and each nh_grp_res_has_* flag presumably
         * records whether the matching value was supplied - TODO confirm
         */
        struct nlattr        *nh_grp;
        u16                nh_grp_type;
        u16                nh_grp_res_num_buckets;
        unsigned long        nh_grp_res_idle_timer;
        unsigned long        nh_grp_res_unbalanced_timer;
        bool                nh_grp_res_has_num_buckets;
        bool                nh_grp_res_has_idle_timer;
        bool                nh_grp_res_has_unbalanced_timer;

        bool                nh_hw_stats;

        struct nlattr        *nh_encap;
        u16                nh_encap_type;

        u32                nlflags;
        struct nl_info        nlinfo;
};

/* Data of a non-group nexthop; struct nexthop reaches it through its
 * nh_info pointer when is_group is false.
 */
struct nh_info {
        struct hlist_node        dev_hash;    /* entry on netns devhash */
        struct nexthop                *nh_parent;

        u8                        family;
        bool                        reject_nh;
        bool                        fdb_nh;

        /* fib_nhc aliases the leading nh_common of fib_nh / fib6_nh
         * (nexthop_fib_nhc() asserts that offset is 0 at build time);
         * nexthop_fib6_nh() only hands out fib6_nh when family is AF_INET6.
         */
        union {
                struct fib_nh_common        fib_nhc;
                struct fib_nh                fib_nh;
                struct fib6_nh                fib6_nh;
        };
};

/* One bucket of a resilient group table; element type of
 * nh_res_table.nh_buckets[].
 */
struct nh_res_bucket {
        struct nh_grp_entry __rcu *nh_entry;    /* RCU-annotated pointer to the group entry */
        atomic_long_t                used_time;      /* presumably a jiffies timestamp - TODO confirm */
        unsigned long                migrated_time;  /* presumably a jiffies timestamp - TODO confirm */
        bool                        occupied;
        u8                        nh_flags;
};

/* Bucket table of a resilient nexthop group. nh_buckets[] is a flexible
 * array whose length is num_nh_buckets; struct nh_group points at the
 * table through its RCU-annotated res_table member.
 */
struct nh_res_table {
        struct net                *net;
        u32                        nhg_id;
        struct delayed_work        upkeep_dw;

        /* List of NHGEs that have too few buckets ("uw" for underweight).
         * Reclaimed buckets will be given to entries in this list.
         */
        struct list_head        uw_nh_entries;
        unsigned long                unbalanced_since;

        u32                        idle_timer;
        u32                        unbalanced_timer;

        u16                        num_nh_buckets;
        struct nh_res_bucket        nh_buckets[] __counted_by(num_nh_buckets);
};

/* Packet counter plus its u64_stats synchronisation object;
 * nh_grp_entry.stats points at a per-CPU instance of this.
 */
struct nh_grp_entry_stats {
        u64_stats_t packets;
        struct u64_stats_sync syncp;
};

/* One member of a nexthop group; element type of nh_group.nh_entries[]. */
struct nh_grp_entry {
        struct nexthop        *nh;        /* the member nexthop itself */
        struct nh_grp_entry_stats __percpu        *stats;
        u16                weight;

        /* NOTE(review): which arm is live presumably follows the group's
         * hash_threshold / resilient flags in struct nh_group - confirm.
         */
        union {
                struct {
                        atomic_t        upper_bound;
                } hthr;
                struct {
                        /* Member on uw_nh_entries. */
                        struct list_head        uw_nh_entry;

                        u16                        count_buckets;
                        u16                        wants_buckets;
                } res;
        };

        struct list_head nh_list;
        struct nexthop        *nh_parent;  /* nexthop of group with this entry */
        u64                packets_hw;
};

/* Group payload of a nexthop; struct nexthop reaches it through its nh_grp
 * pointer when is_group is true. nh_entries[] is a flexible array holding
 * num_nh members.
 */
struct nh_group {
        struct nh_group                *spare; /* spare group for removals */
        u16                        num_nh;
        bool                        is_multipath;   /* when set, nexthop_num_path() reports num_nh paths */
        bool                        hash_threshold;
        bool                        resilient;
        bool                        fdb_nh;         /* value returned by nexthop_is_fdb() for groups */
        bool                        has_v4;         /* value returned by nexthop_has_v4() for groups */
        bool                        hw_stats;

        struct nh_res_table __rcu *res_table;
        struct nh_grp_entry        nh_entries[] __counted_by(num_nh);
};

/* A nexthop object. is_group tells which member of the trailing union is
 * valid: nh_grp for groups, nh_info otherwise; the inline accessors in this
 * header test is_group before dereferencing either pointer.
 */
struct nexthop {
        struct rb_node                rb_node;    /* entry on netns rbtree */
        struct list_head        fi_list;    /* v4 entries using nh */
        struct list_head        f6i_list;   /* v6 entries using nh */
        struct list_head        fdb_list;   /* fdb entries using this nh */
        struct list_head        grp_list;   /* nh group entries using this nh */
        struct net                *net;

        u32                        id;

        u8                        protocol;   /* app managing this nh */
        u8                        nh_flags;
        bool                        is_group;

        refcount_t                refcnt;     /* taken/dropped by nexthop_get()/nexthop_put() */
        struct rcu_head                rcu;        /* used by nexthop_put() to queue nexthop_free_rcu() */

        union {
                struct nh_info        __rcu *nh_info;
                struct nh_group __rcu *nh_grp;
        };
};

/* Nexthop event codes.
 * NOTE(review): presumably the event values delivered to notifier blocks
 * registered with register_nexthop_notifier() - confirm at the call sites.
 */
enum nexthop_event_type {
        NEXTHOP_EVENT_DEL,
        NEXTHOP_EVENT_REPLACE,
        NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
        NEXTHOP_EVENT_BUCKET_REPLACE,
        NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
};

/* Type tag stored in nh_notifier_info.type.
 * NOTE(review): each value presumably selects the like-named member of the
 * union in struct nh_notifier_info - confirm.
 */
enum nh_notifier_info_type {
        NH_NOTIFIER_INFO_TYPE_SINGLE,
        NH_NOTIFIER_INFO_TYPE_GRP,
        NH_NOTIFIER_INFO_TYPE_RES_TABLE,
        NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
        NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS,
};

/* Notifier description of one nexthop: device, gateway and a few
 * single-bit properties. Embedded in the group-entry, bucket and
 * resilient-table notifier structures below.
 */
struct nh_notifier_single_info {
        struct net_device *dev;
        u8 gw_family;   /* presumably tells which gateway member below is valid - TODO confirm */
        union {
                __be32 ipv4;
                struct in6_addr ipv6;
        };
        u32 id;
        u8 is_reject:1,
           is_fdb:1,
           has_encap:1;
};

/* One group member as described to notifiers: its weight plus the nexthop
 * description. Element type of nh_notifier_grp_info.nh_entries[].
 */
struct nh_notifier_grp_entry_info {
        u16 weight;
        struct nh_notifier_single_info nh;
};

/* Notifier description of a nexthop group; nh_entries[] is a flexible
 * array with num_nh elements.
 */
struct nh_notifier_grp_info {
        u16 num_nh;
        bool is_fdb;
        bool hw_stats;
        struct nh_notifier_grp_entry_info nh_entries[] __counted_by(num_nh);
};

/* Notifier description of a single bucket, carrying both an old and a new
 * nexthop description for the bucket at bucket_index.
 */
struct nh_notifier_res_bucket_info {
        u16 bucket_index;
        unsigned int idle_timer_ms;
        bool force;
        struct nh_notifier_single_info old_nh;
        struct nh_notifier_single_info new_nh;
};

/* Notifier description of a whole resilient table; nhs[] is a flexible
 * array with num_nh_buckets elements.
 */
struct nh_notifier_res_table_info {
        u16 num_nh_buckets;
        bool hw_stats;
        struct nh_notifier_single_info nhs[] __counted_by(num_nh_buckets);
};

/* An id with a packet count; element type of
 * nh_notifier_grp_hw_stats_info.stats[].
 */
struct nh_notifier_grp_hw_stats_entry_info {
        u32 id;
        u64 packets;
};

/* Hardware statistics for a group; stats[] is a flexible array with num_nh
 * elements. This is the structure passed to nh_grp_hw_stats_report_delta().
 */
struct nh_notifier_grp_hw_stats_info {
        u16 num_nh;
        bool hw_stats_used;
        struct nh_notifier_grp_hw_stats_entry_info stats[] __counted_by(num_nh);
};

/* Top-level notifier payload: common fields followed by a pointer to the
 * type-specific description.
 */
struct nh_notifier_info {
        struct net *net;
        struct netlink_ext_ack *extack;
        u32 id;
        enum nh_notifier_info_type type;
        /* NOTE(review): the valid member is presumably given by @type - confirm */
        union {
                struct nh_notifier_single_info *nh;
                struct nh_notifier_grp_info *nh_grp;
                struct nh_notifier_res_table_info *nh_res_table;
                struct nh_notifier_res_bucket_info *nh_res_bucket;
                struct nh_notifier_grp_hw_stats_info *nh_grp_hw_stats;
        };
};

/* Out-of-line API; the definitions live outside this header. */
int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
                              struct netlink_ext_ack *extack);
int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap);
void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
                                 bool offload, bool trap);
void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
                                     unsigned long *activity);
void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
                                  unsigned int nh_idx,
                                  u64 delta_packets);

/* caller is holding rcu or rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
/* RCU callback queued by nexthop_put() when the last reference is dropped */
void nexthop_free_rcu(struct rcu_head *head);

/* Try to take a reference on @nh.
 *
 * Returns the result of refcount_inc_not_zero(): true when the reference
 * was taken, false when the count had already reached zero.
 */
static inline bool nexthop_get(struct nexthop *nh)
{
        bool took_ref = refcount_inc_not_zero(&nh->refcnt);

        return took_ref;
}

/* Drop a reference on @nh; when it was the last one, queue
 * nexthop_free_rcu() on the object's rcu head via call_rcu_hurry().
 */
static inline void nexthop_put(struct nexthop *nh)
{
        if (!refcount_dec_and_test(&nh->refcnt))
                return;

        call_rcu_hurry(&nh->rcu, nexthop_free_rcu);
}

/* Two nexthops compare equal only when they are the very same object. */
static inline bool nexthop_cmp(const struct nexthop *nh1,
                               const struct nexthop *nh2)
{
        const bool same_object = (nh1 == nh2);

        return same_object;
}

/* Report the fdb_nh flag of @nh: taken from the group for group nexthops
 * and from the nh_info otherwise.
 */
static inline bool nexthop_is_fdb(const struct nexthop *nh)
{
        const struct nh_group *grp;
        const struct nh_info *info;

        if (!nh->is_group) {
                info = rcu_dereference_rtnl(nh->nh_info);
                return info->fdb_nh;
        }

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->fdb_nh;
}

/* Return the group's has_v4 flag; always false for a non-group nexthop. */
static inline bool nexthop_has_v4(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return false;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->has_v4;
}

/* Return the group's is_multipath flag; always false for a non-group
 * nexthop.
 */
static inline bool nexthop_is_multipath(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return false;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        return grp->is_multipath;
}

/* Defined out of line. Of the inline users in this header, only
 * nexthop_path_fdb_result() checks the result for NULL; the fib and fib6
 * result helpers dereference it directly.
 */
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash);

/* Number of paths behind @nh: the member count for a multipath group,
 * 1 for anything else.
 */
static inline unsigned int nexthop_num_path(const struct nexthop *nh)
{
        struct nh_group *grp;

        if (!nh->is_group)
                return 1;

        grp = rcu_dereference_rtnl(nh->nh_grp);
        if (!grp->is_multipath)
                return 1;

        return grp->num_nh;
}

/* Return member @nhsel of group @nhg, or NULL when @nhsel is at or past
 * the end of the group.
 */
static inline
struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel)
{
        /* The for_nexthops macros in fib_semantics.c fetch the nexthop
         * pointer before they check nhsel, so an index beyond the last
         * entry has to be answered with NULL here.
         */
        return nhsel < nhg->num_nh ? nhg->nh_entries[nhsel].nh : NULL;
}

/* Call fib_add_nexthop() on @skb once per member of the group behind @nh,
 * passing the member's fib_nh_common and weight.
 *
 * Returns 0 on success, -EMSGSIZE as soon as fib_add_nexthop() fails.
 */
static inline
int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh,
                            u8 rt_family)
{
        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
        int idx;

        for (idx = 0; idx < nhg->num_nh; idx++) {
                struct nh_grp_entry *member = &nhg->nh_entries[idx];
                struct nh_info *nhi = rcu_dereference_rtnl(member->nh->nh_info);
                int weight = member->weight;

                if (fib_add_nexthop(skb, &nhi->fib_nhc, weight, rt_family, 0) < 0)
                        return -EMSGSIZE;
        }

        return 0;
}

/* called with rcu lock
 *
 * Return the reject_nh flag of @nh. For a group, only a group with at most
 * one member can qualify, and then its first entry decides.
 */
static inline bool nexthop_is_blackhole(const struct nexthop *nh)
{
        const struct nexthop *leaf = nh;
        const struct nh_info *nhi;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                /* more than one member: never reported as blackhole */
                if (grp->num_nh > 1)
                        return false;

                leaf = grp->nh_entries[0].nh;
        }

        nhi = rcu_dereference_rtnl(leaf->nh_info);
        return nhi->reject_nh;
}

/* Select a path of res->fi->nh for @hash and store the chosen nexthop's
 * fib_nh_common in res->nhc. The selected nexthop is dereferenced without
 * a NULL check.
 */
static inline void nexthop_path_fib_result(struct fib_result *res, int hash)
{
        struct nexthop *chosen = nexthop_select_path(res->fi->nh, hash);
        struct nh_info *nhi = rcu_dereference(chosen->nh_info);

        res->nhc = &nhi->fib_nhc;
}

/* called with rcu read lock or rtnl held
 *
 * Return the fib_nh_common of @nh. For a multipath group the member at
 * index @nhsel is used, and NULL is returned when that index is out of
 * range.
 */
static inline
struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel)
{
        struct nh_info *nhi;

        /* fib_nhc in struct nh_info must overlay the nh_common member of
         * both fib_nh and fib6_nh
         */
        BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0);
        BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0);

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                if (grp->is_multipath) {
                        struct nexthop *member;

                        member = nexthop_mpath_select(grp, nhsel);
                        if (!member)
                                return NULL;
                        nh = member;
                }
        }

        nhi = rcu_dereference_rtnl(nh->nh_info);
        return &nhi->fib_nhc;
}

/* called from fib_table_lookup with rcu_lock
 *
 * Find the first fib_nh_common behind @nh that fib_lookup_good_nhc()
 * accepts for @fib_flags / @flp. On a hit, *@nhsel receives the index of
 * the entry (0 for a non-group nexthop) and the nhc is returned; otherwise
 * NULL is returned and *@nhsel is left untouched.
 */
static inline
struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh,
                                             int fib_flags,
                                             const struct flowi4 *flp,
                                             int *nhsel)
{
        struct nh_group *nhg;
        struct nh_info *nhi;
        int idx;

        if (!nh->is_group) {
                nhi = rcu_dereference(nh->nh_info);
                if (!fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp))
                        return NULL;

                *nhsel = 0;
                return &nhi->fib_nhc;
        }

        nhg = rcu_dereference(nh->nh_grp);
        for (idx = 0; idx < nhg->num_nh; idx++) {
                struct nexthop *member = nhg->nh_entries[idx].nh;

                nhi = rcu_dereference(member->nh_info);
                if (!fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp))
                        continue;

                *nhsel = idx;
                return &nhi->fib_nhc;
        }

        return NULL;
}

/* Return true when nhc_l3mdev_matches_dev() matches @dev for @nh itself
 * or, for a group, for at least one of its members.
 */
static inline bool nexthop_uses_dev(const struct nexthop *nh,
                                    const struct net_device *dev)
{
        struct nh_group *nhg;
        struct nh_info *nhi;
        int idx;

        if (!nh->is_group) {
                nhi = rcu_dereference(nh->nh_info);
                if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev))
                        return true;
                return false;
        }

        nhg = rcu_dereference(nh->nh_grp);
        for (idx = 0; idx < nhg->num_nh; idx++) {
                struct nexthop *member = nhg->nh_entries[idx].nh;

                nhi = rcu_dereference(member->nh_info);
                if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev))
                        return true;
        }

        return false;
}

/* Number of paths of @fi: fi->fib_nhs when no nexthop object is attached,
 * otherwise whatever nexthop_num_path() reports for fi->nh.
 */
static inline unsigned int fib_info_num_path(const struct fib_info *fi)
{
        if (likely(!fi->nh))
                return fi->fib_nhs;

        return nexthop_num_path(fi->nh);
}

/* Defined out of line.
 * NOTE(review): presumably checks whether @nh may be used by an IPv4 route
 * of the given @scope and reports problems through @extack - confirm.
 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
                      struct netlink_ext_ack *extack);

/* Return the fib_nh_common for path @nhsel of @fi: taken from the embedded
 * fib_nh array when no nexthop object is attached, and via
 * nexthop_fib_nhc() (which may return NULL) otherwise.
 */
static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel)
{
        if (likely(!fi->nh))
                return &fi->fib_nh[nhsel].nh_common;

        return nexthop_fib_nhc(fi->nh, nhsel);
}

/* only used when fib_nh is built into fib_info
 *
 * Warns when @fi has a nexthop object attached, then returns entry @nhsel
 * of the embedded fib_nh array regardless.
 */
static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
{
        struct fib_nh *slot;

        WARN_ON(fi->nh);

        slot = &fi->fib_nh[nhsel];
        return slot;
}

/*
 * IPv6 variants
 */
/* Defined out of line.
 * NOTE(review): presumably checks whether @nh may be used with the IPv6
 * route configuration @cfg and reports problems through @extack - confirm.
 */
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack);

/* Caller should either hold rcu_read_lock(), or RTNL.
 *
 * Return the fib6_nh of @nh, using the first member when @nh is a group.
 * NULL is returned when the group has no first member or when the nexthop
 * family is not AF_INET6.
 */
static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{
        struct nh_info *nhi;

        if (nh->is_group) {
                struct nh_group *grp = rcu_dereference_rtnl(nh->nh_grp);

                nh = nexthop_mpath_select(grp, 0);
                if (!nh)
                        return NULL;
        }

        nhi = rcu_dereference_rtnl(nh->nh_info);
        if (nhi->family != AF_INET6)
                return NULL;

        return &nhi->fib6_nh;
}

/* Return the device of the fib6_nh used by @f6i: the one behind the
 * attached nexthop object when there is one, the embedded fib6_nh
 * otherwise. The result of nexthop_fib6_nh() is dereferenced without a
 * NULL check.
 */
static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
{
        struct fib6_nh *fib6_nh;

        if (f6i->nh)
                fib6_nh = nexthop_fib6_nh(f6i->nh);
        else
                fib6_nh = f6i->fib6_nh;

        return fib6_nh->fib_nh_dev;
}

/* Select a path of res->f6i->nh for @hash and record it in @res. When the
 * selected nexthop has reject_nh set, the result is also marked
 * RTN_BLACKHOLE / RTF_REJECT and res->nh comes from nexthop_fib6_nh();
 * otherwise res->nh points at the nexthop's fib6_nh.
 */
static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
{
        struct nexthop *chosen = nexthop_select_path(res->f6i->nh, hash);
        struct nh_info *nhi = rcu_dereference_rtnl(chosen->nh_info);

        if (!nhi->reject_nh) {
                res->nh = &nhi->fib6_nh;
                return;
        }

        res->fib6_type = RTN_BLACKHOLE;
        res->fib6_flags |= RTF_REJECT;
        res->nh = nexthop_fib6_nh(chosen);
}

/* Defined out of line.
 * NOTE(review): presumably calls @cb with @arg for each fib6_nh behind @nh
 * and stops on a non-zero return - confirm against the definition.
 */
int nexthop_for_each_fib6_nh(struct nexthop *nh,
                             int (*cb)(struct fib6_nh *nh, void *arg),
                             void *arg);

/* Return the address family stored in @nh's nh_info. The nh_info pointer
 * is read unconditionally, i.e. without testing nh->is_group.
 */
static inline int nexthop_get_family(struct nexthop *nh)
{
        struct nh_info *nhi;

        nhi = rcu_dereference_rtnl(nh->nh_info);
        return nhi->family;
}

/* Return the fib_nh_common embedded in @nh's nh_info. The nh_info pointer
 * is read unconditionally, i.e. without testing nh->is_group.
 */
static inline
struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
{
        struct nh_info *nhi;

        nhi = rcu_dereference_rtnl(nh->nh_info);
        return &nhi->fib_nhc;
}

/* Select a path of @nh for @hash and return the chosen nexthop's
 * fib_nh_common, or NULL when nexthop_select_path() returns no nexthop.
 */
static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
                                                            int hash)
{
        struct nexthop *chosen = nexthop_select_path(nh, hash);
        struct nh_info *nhi;

        if (unlikely(!chosen))
                return NULL;

        nhi = rcu_dereference(chosen->nh_info);
        return &nhi->fib_nhc;
}
#endif


















































































































































































































































  420 




   78 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/drivers/clocksource/arm_arch_timer.c
 *
 *  Copyright (C) 2011 ARM Ltd.
 *  All Rights Reserved
 */

#define pr_fmt(fmt)         "arch_timer: " fmt

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/clockchips.h>
#include <linux/clocksource.h>
#include <linux/clocksource_ids.h>
#include <linux/interrupt.h>
#include <linux/kstrtox.h>
#include <linux/of_irq.h>
#include <linux/of_address.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/sched_clock.h>
#include <linux/acpi.h>
#include <linux/arm-smccc.h>
#include <linux/ptp_kvm.h>

#include <asm/arch_timer.h>
#include <asm/virt.h>

#include <clocksource/arm_arch_timer.h>

/* Register offset and bit definitions. CNTP_CTL, CNTP_CVAL_LO, CNTV_CTL and
 * CNTV_CVAL_LO are added to the timer's ->base by the MMIO register
 * accessors below.
 * NOTE(review): CNTTIDR / CNTACR look like control-frame registers; their
 * users are outside the visible part of the file - confirm.
 */
#define CNTTIDR                0x08
#define CNTTIDR_VIRT(n)        (BIT(1) << ((n) * 4))

#define CNTACR(n)        (0x40 + ((n) * 4))
#define CNTACR_RPCT        BIT(0)
#define CNTACR_RVCT        BIT(1)
#define CNTACR_RFRQ        BIT(2)
#define CNTACR_RVOFF        BIT(3)
#define CNTACR_RWVT        BIT(4)
#define CNTACR_RWPT        BIT(5)

#define CNTPCT_LO        0x00
#define CNTVCT_LO        0x08
#define CNTFRQ                0x10
#define CNTP_CVAL_LO        0x20
#define CNTP_CTL        0x2c
#define CNTV_CVAL_LO        0x30
#define CNTV_CTL        0x3c

/*
 * The minimum amount of time a generic counter is guaranteed to not roll over
 * (40 years)
 */
#define MIN_ROLLOVER_SECS        (40ULL * 365 * 24 * 3600)

/* Init-time only (__initdata).
 * NOTE(review): presumably a bitmask of the timer types found - confirm.
 */
static unsigned arch_timers_present __initdata;

/* A memory-mapped timer: the MMIO base of its registers together with the
 * clock event device through which it is driven.
 */
struct arch_timer {
        void __iomem *base;
        struct clock_event_device evt;
};

static struct arch_timer *arch_timer_mem __ro_after_init;

/* Map a clock_event_device embedded as ->evt back to its struct arch_timer. */
#define to_arch_timer(e) container_of(e, struct arch_timer, evt)

/* Counter rate used by arch_counter_get_width(); multiplied there by a
 * number of seconds, so presumably in Hz - TODO confirm.
 */
static u32 arch_timer_rate __ro_after_init;
/* One slot per enum arch_timer_ppi_nr value.
 * NOTE(review): presumably the interrupt number of each PPI - confirm.
 */
static int arch_timer_ppi[ARCH_TIMER_MAX_TIMER_PPI] __ro_after_init;

/* Name string for each PPI, indexed by enum arch_timer_ppi_nr. */
static const char *arch_timer_ppi_names[ARCH_TIMER_MAX_TIMER_PPI] = {
        [ARCH_TIMER_PHYS_SECURE_PPI]        = "sec-phys",
        [ARCH_TIMER_PHYS_NONSECURE_PPI]        = "phys",
        [ARCH_TIMER_VIRT_PPI]                = "virt",
        [ARCH_TIMER_HYP_PPI]                = "hyp-phys",
        [ARCH_TIMER_HYP_VIRT_PPI]        = "hyp-virt",
};

/* Per-CPU clock event devices (pointer carries the __percpu annotation). */
static struct clock_event_device __percpu *arch_timer_evt;

/* Which PPI the driver uses; defaults to the virtual timer PPI. */
static enum arch_timer_ppi_nr arch_timer_uses_ppi __ro_after_init = ARCH_TIMER_VIRT_PPI;
static bool arch_timer_c3stop __ro_after_init;
static bool arch_timer_mem_use_virtual __ro_after_init;
static bool arch_counter_suspend_stop __ro_after_init;
/* Default vDSO clock mode: the arch timer when the generic vDSO
 * gettimeofday code is configured in, none otherwise.
 */
#ifdef CONFIG_GENERIC_GETTIMEOFDAY
static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_ARCHTIMER;
#else
static enum vdso_clock_mode vdso_default = VDSO_CLOCKMODE_NONE;
#endif /* CONFIG_GENERIC_GETTIMEOFDAY */

static cpumask_t evtstrm_available = CPU_MASK_NONE;
/* Defaults to the CONFIG_ARM_ARCH_TIMER_EVTSTREAM setting; can be
 * overridden by the "clocksource.arm_arch_timer.evtstrm" early parameter.
 */
static bool evtstrm_enable __ro_after_init = IS_ENABLED(CONFIG_ARM_ARCH_TIMER_EVTSTREAM);

/* Early parameter handler: parse @buf as a boolean into evtstrm_enable and
 * hand back kstrtobool()'s return value.
 */
static int __init early_evtstrm_cfg(char *buf)
{
        int ret;

        ret = kstrtobool(buf, &evtstrm_enable);
        return ret;
}
early_param("clocksource.arm_arch_timer.evtstrm", early_evtstrm_cfg);

/*
 * Estimate a valid counter width from the Generic Timer specification,
 * which guarantees that:
 *   1) the system counter is at least 56 bits wide
 *   2) the counter takes no less than 40 years to roll over
 *
 * The estimate is the number of bits needed to count MIN_ROLLOVER_SECS
 * worth of cycles at arch_timer_rate, limited to the range [56, 64].
 *
 * See 'ARM DDI 0487G.a D11.1.2 ("The system counter")' for more details.
 */
static int arch_counter_get_width(void)
{
        u64 min_cycles = MIN_ROLLOVER_SECS * arch_timer_rate;
        int bits = ilog2(min_cycles - 1) + 1;

        /* keep the result inside the architecturally valid range */
        return clamp_val(bits, 56, 64);
}

/*
 * Architected system timer support.
 */

/*
 * Write @val to timer register @reg using access method @access.
 *
 * For the two memory-mapped access types the register is located relative
 * to the ->base of the struct arch_timer that embeds @clk; any other
 * access type is forwarded to arch_timer_reg_write_cp15(), which does not
 * use @clk. Only the CTRL and CVAL registers are supported for MMIO: any
 * other @reg reaches BUILD_BUG(), so @reg must be resolvable at compile
 * time.
 */
static __always_inline
void arch_timer_reg_write(int access, enum arch_timer_reg reg, u64 val,
                          struct clock_event_device *clk)
{
        if (access == ARCH_TIMER_MEM_PHYS_ACCESS) {
                struct arch_timer *timer = to_arch_timer(clk);
                switch (reg) {
                case ARCH_TIMER_REG_CTRL:
                        /* the control register is 32 bits wide: truncate */
                        writel_relaxed((u32)val, timer->base + CNTP_CTL);
                        break;
                case ARCH_TIMER_REG_CVAL:
                        /*
                         * Not guaranteed to be atomic, so the timer
                         * must be disabled at this point.
                         */
                        writeq_relaxed(val, timer->base + CNTP_CVAL_LO);
                        break;
                default:
                        BUILD_BUG();
                }
        } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) {
                struct arch_timer *timer = to_arch_timer(clk);
                switch (reg) {
                case ARCH_TIMER_REG_CTRL:
                        writel_relaxed((u32)val, timer->base + CNTV_CTL);
                        break;
                case ARCH_TIMER_REG_CVAL:
                        /* Same restriction as above */
                        writeq_relaxed(val, timer->base + CNTV_CVAL_LO);
                        break;
                default:
                        BUILD_BUG();
                }
        } else {
                arch_timer_reg_write_cp15(access, reg, val);
        }
}

/*
 * Read timer register @reg using the access method @access and return its
 * 32-bit value.
 *
 * Only the control register can be read through the memory-mapped access
 * types (from the frame at timer->base); any other @reg there hits
 * BUILD_BUG(). Every other @access value is forwarded to
 * arch_timer_reg_read_cp15().
 */
static __always_inline
u32 arch_timer_reg_read(int access, enum arch_timer_reg reg,
                        struct clock_event_device *clk)
{
        u32 val;

        if (access == ARCH_TIMER_MEM_PHYS_ACCESS) {
                struct arch_timer *timer = to_arch_timer(clk);
                switch (reg) {
                case ARCH_TIMER_REG_CTRL:
                        val = readl_relaxed(timer->base + CNTP_CTL);
                        break;
                default:
                        BUILD_BUG();
                }
        } else if (access == ARCH_TIMER_MEM_VIRT_ACCESS) {
                struct arch_timer *timer = to_arch_timer(clk);
                switch (reg) {
                case ARCH_TIMER_REG_CTRL:
                        val = readl_relaxed(timer->base + CNTV_CTL);
                        break;
                default:
                        BUILD_BUG();
                }
        } else {
                val = arch_timer_reg_read_cp15(access, reg);
        }

        return val;
}

/*
 * Non-instrumented read of the physical counter through
 * __arch_counter_get_cntpct_stable(). Unlike
 * arch_counter_get_cntpct_stable(), preemption is not touched here.
 */
static noinstr u64 raw_counter_get_cntpct_stable(void)
{
        return __arch_counter_get_cntpct_stable();
}

/*
 * Read the physical counter through __arch_counter_get_cntpct_stable()
 * with preemption disabled (notrace variants) around the read.
 */
static notrace u64 arch_counter_get_cntpct_stable(void)
{
        u64 val;
        preempt_disable_notrace();
        val = __arch_counter_get_cntpct_stable();
        preempt_enable_notrace();
        return val;
}

/* Non-instrumented read of the physical counter via __arch_counter_get_cntpct(). */
static noinstr u64 arch_counter_get_cntpct(void)
{
        return __arch_counter_get_cntpct();
}

/*
 * Non-instrumented read of the virtual counter through
 * __arch_counter_get_cntvct_stable(). Unlike
 * arch_counter_get_cntvct_stable(), preemption is not touched here.
 */
static noinstr u64 raw_counter_get_cntvct_stable(void)
{
        return __arch_counter_get_cntvct_stable();
}

/*
 * Read the virtual counter through __arch_counter_get_cntvct_stable()
 * with preemption disabled (notrace variants) around the read.
 */
static notrace u64 arch_counter_get_cntvct_stable(void)
{
        u64 val;
        preempt_disable_notrace();
        val = __arch_counter_get_cntvct_stable();
        preempt_enable_notrace();
        return val;
}

/* Non-instrumented read of the virtual counter via __arch_counter_get_cntvct(). */
static noinstr u64 arch_counter_get_cntvct(void)
{
        return __arch_counter_get_cntvct();
}

/*
 * Function pointer used to read the system counter; it initially points at
 * the cp15 virtual counter accessor and is exported to GPL modules.
 *
 * Default to cp15 based access because arm64 uses this function for
 * sched_clock() before DT is probed and the cp15 method is guaranteed
 * to exist on arm64. arm doesn't use this before DT is probed, so even
 * if we don't have the cp15 accessors we won't have a problem.
 */
u64 (*arch_timer_read_counter)(void) __ro_after_init = arch_counter_get_cntvct;
EXPORT_SYMBOL_GPL(arch_timer_read_counter);

/*
 * clocksource ->read callback: return the counter value obtained through
 * arch_timer_read_counter. @cs is unused.
 */
static u64 arch_counter_read(struct clocksource *cs)
{
        return arch_timer_read_counter();
}

/*
 * cyclecounter ->read callback: return the counter value obtained through
 * arch_timer_read_counter. @cc is unused.
 */
static u64 arch_counter_read_cc(const struct cyclecounter *cc)
{
        return arch_timer_read_counter();
}

/*
 * Clocksource description of the architected system counter; reads go
 * through arch_counter_read().
 */
static struct clocksource clocksource_counter = {
        .name        = "arch_sys_counter",
        .id        = CSID_ARM_ARCH_COUNTER,
        .rating        = 400,
        .read        = arch_counter_read,
        .flags        = CLOCK_SOURCE_IS_CONTINUOUS,
};

/* Cycle counter backed by the same counter accessor as the clocksource. */
static struct cyclecounter cyclecounter __ro_after_init = {
        .read        = arch_counter_read_cc,
};

/*
 * OEM identification triple used to match an erratum against an ACPI table
 * header (see arch_timer_check_acpi_oem_erratum()). The string fields have
 * room for one byte more than the corresponding ACPI field size.
 */
struct ate_acpi_oem_info {
        char oem_id[ACPI_OEM_ID_SIZE + 1];
        char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
        u32 oem_revision;
};

#ifdef CONFIG_FSL_ERRATUM_A008585
/*
 * Read system register @reg twice in a row and repeat until both reads
 * return the same value or the retry budget is exhausted. The macro
 * evaluates to the value of the last read and warns once if the retries
 * ran out.
 *
 * The number of retries is an arbitrary value well beyond the highest number
 * of iterations the loop has been observed to take.
 */
#define __fsl_a008585_read_reg(reg) ({                        \
        u64 _old, _new;                                        \
        int _retries = 200;                                \
                                                        \
        do {                                                \
                _old = read_sysreg(reg);                \
                _new = read_sysreg(reg);                \
                _retries--;                                \
        } while (unlikely(_old != _new) && _retries);        \
                                                        \
        WARN_ON_ONCE(!_retries);                        \
        _new;                                                \
})

/* Read cntpct_el0 using the double-read workaround loop. */
static u64 notrace fsl_a008585_read_cntpct_el0(void)
{
        return __fsl_a008585_read_reg(cntpct_el0);
}

/* Read cntvct_el0 using the double-read workaround loop. */
static u64 notrace fsl_a008585_read_cntvct_el0(void)
{
        return __fsl_a008585_read_reg(cntvct_el0);
}
#endif

#ifdef CONFIG_HISILICON_ERRATUM_161010101
/*
 * The only way to confirm that a counter value is correct is to check that
 * a second read is larger than the first by less than 32. The difference
 * between the two reads is therefore shifted right by 5 bits: a non-zero
 * result means the reads differ by 32 or more (or the second read is
 * smaller than the first), and the pair is read again.
 *
 * Theoretically the erratum should not occur more than twice in succession
 * when reading the system counter, but interrupts may lead to more than two
 * consecutive bad reads and trigger the warning, so the number of retries is
 * set far beyond the number of iterations the loop has been observed to
 * take. The macro evaluates to the value of the last read.
 */
#define __hisi_161010101_read_reg(reg) ({                                \
        u64 _old, _new;                                                \
        int _retries = 50;                                        \
                                                                \
        do {                                                        \
                _old = read_sysreg(reg);                        \
                _new = read_sysreg(reg);                        \
                _retries--;                                        \
        } while (unlikely((_new - _old) >> 5) && _retries);        \
                                                                \
        WARN_ON_ONCE(!_retries);                                \
        _new;                                                        \
})

/* Read cntpct_el0 using the HiSilicon 161010101 workaround loop. */
static u64 notrace hisi_161010101_read_cntpct_el0(void)
{
        return __hisi_161010101_read_reg(cntpct_el0);
}

/* Read cntvct_el0 using the HiSilicon 161010101 workaround loop. */
static u64 notrace hisi_161010101_read_cntvct_el0(void)
{
        return __hisi_161010101_read_reg(cntvct_el0);
}

/*
 * ACPI OEM identifiers that select the HiSilicon 161010101 workaround.
 * The list is terminated by an all-zero entry.
 */
static const struct ate_acpi_oem_info hisi_161010101_oem_info[] = {
        /*
         * Note that trailing spaces are required to properly match
         * the OEM table information.
         */
        {
                .oem_id                = "HISI  ",
                .oem_table_id        = "HIP05   ",
                .oem_revision        = 0,
        },
        {
                .oem_id                = "HISI  ",
                .oem_table_id        = "HIP06   ",
                .oem_revision        = 0,
        },
        {
                .oem_id                = "HISI  ",
                .oem_table_id        = "HIP07   ",
                .oem_revision        = 0,
        },
        { /* Sentinel indicating the end of the OEM array */ },
};
#endif

#ifdef CONFIG_ARM64_ERRATUM_858921
/*
 * Read cntpct_el0 twice. If bit 32 differs between the two samples, return
 * the first sample; otherwise return the second one.
 */
static u64 notrace arm64_858921_read_cntpct_el0(void)
{
        u64 first = read_sysreg(cntpct_el0);
        u64 second = read_sysreg(cntpct_el0);

        if ((first ^ second) & (1ULL << 32))
                return first;

        return second;
}

/*
 * Read cntvct_el0 twice. If bit 32 differs between the two samples, return
 * the first sample; otherwise return the second one.
 */
static u64 notrace arm64_858921_read_cntvct_el0(void)
{
        u64 first = read_sysreg(cntvct_el0);
        u64 second = read_sysreg(cntvct_el0);

        if ((first ^ second) & (1ULL << 32))
                return first;

        return second;
}
#endif

#ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
/*
 * The low bits of the counter registers are indeterminate while bit 10 or
 * greater is rolling over. Since the counter value can jump both backward
 * (7ff -> 000 -> 800) and forward (7ff -> fff -> 800), ignore register values
 * with all ones or all zeros in the low bits. Bound the loop by the maximum
 * number of CPU cycles in 3 consecutive 24 MHz counter periods.
 *
 * The loop re-reads @reg while bits [8:0] of the value are all ones or all
 * zeros and retries remain. The macro evaluates to the last value read and
 * warns once if the retries ran out.
 */
#define __sun50i_a64_read_reg(reg) ({                                        \
        u64 _val;                                                        \
        int _retries = 150;                                                \
                                                                        \
        do {                                                                \
                _val = read_sysreg(reg);                                \
                _retries--;                                                \
        } while (((_val + 1) & GENMASK(8, 0)) <= 1 && _retries);        \
                                                                        \
        WARN_ON_ONCE(!_retries);                                        \
        _val;                                                                \
})

/* Read cntpct_el0 using the Allwinner A64 workaround loop. */
static u64 notrace sun50i_a64_read_cntpct_el0(void)
{
        return __sun50i_a64_read_reg(cntpct_el0);
}

/* Read cntvct_el0 using the Allwinner A64 workaround loop. */
static u64 notrace sun50i_a64_read_cntvct_el0(void)
{
        return __sun50i_a64_read_reg(cntvct_el0);
}
#endif

#ifdef CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND
/*
 * Per-CPU pointer to the erratum workaround selected for that CPU; it is
 * written by arch_timer_enable_workaround() and exported to GPL modules.
 */
DEFINE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround);
EXPORT_SYMBOL_GPL(timer_unstable_counter_workaround);

/*
 * Set to 1 once a workaround that provides a counter read accessor has
 * been enabled.
 */
static atomic_t timer_unstable_counter_workaround_in_use = ATOMIC_INIT(0);

/*
 * Program the next timer event @evt ticks from now, reading the current
 * count through the "stable" counter accessors.
 *
 * The control value is read first, then the compare value (current count
 * plus @evt) is written to cntp_cval_el0 for physical access or to
 * cntv_cval_el0 otherwise, and finally the control register is written
 * back with the enable bit set and the interrupt mask bit cleared.
 *
 * Force the inlining of this function so that the register accesses
 * can be themselves correctly inlined.
 */
static __always_inline
void erratum_set_next_event_generic(const int access, unsigned long evt,
                                    struct clock_event_device *clk)
{
        unsigned long ctrl;
        u64 cval;

        ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
        ctrl |= ARCH_TIMER_CTRL_ENABLE;
        ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;

        if (access == ARCH_TIMER_PHYS_ACCESS) {
                cval = evt + arch_counter_get_cntpct_stable();
                write_sysreg(cval, cntp_cval_el0);
        } else {
                cval = evt + arch_counter_get_cntvct_stable();
                write_sysreg(cval, cntv_cval_el0);
        }

        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
}

/*
 * set_next_event callback for the virtual timer on CPUs with a counter
 * erratum. Always returns 0.
 */
static __maybe_unused int erratum_set_next_event_virt(unsigned long evt,
                                            struct clock_event_device *clk)
{
        erratum_set_next_event_generic(ARCH_TIMER_VIRT_ACCESS, evt, clk);
        return 0;
}

/*
 * set_next_event callback for the physical timer on CPUs with a counter
 * erratum. Always returns 0.
 */
static __maybe_unused int erratum_set_next_event_phys(unsigned long evt,
                                            struct clock_event_device *clk)
{
        erratum_set_next_event_generic(ARCH_TIMER_PHYS_ACCESS, evt, clk);
        return 0;
}

/*
 * Table of out-of-line erratum workarounds. Each entry names how it is
 * matched (.match_type with the corresponding .id), a human readable
 * description used in log messages, and the replacement counter accessors
 * and set_next_event callbacks (or, for the last entry, only the request
 * to disable the compat vdso). Entries are compiled in only when the
 * corresponding erratum option is enabled.
 */
static const struct arch_timer_erratum_workaround ool_workarounds[] = {
#ifdef CONFIG_FSL_ERRATUM_A008585
        {
                .match_type = ate_match_dt,
                .id = "fsl,erratum-a008585",
                /* Erratum number must agree with the id and config symbol. */
                .desc = "Freescale erratum a008585",
                .read_cntpct_el0 = fsl_a008585_read_cntpct_el0,
                .read_cntvct_el0 = fsl_a008585_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_phys,
                .set_next_event_virt = erratum_set_next_event_virt,
        },
#endif
#ifdef CONFIG_HISILICON_ERRATUM_161010101
        {
                .match_type = ate_match_dt,
                .id = "hisilicon,erratum-161010101",
                .desc = "HiSilicon erratum 161010101",
                .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
                .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_phys,
                .set_next_event_virt = erratum_set_next_event_virt,
        },
        {
                .match_type = ate_match_acpi_oem_info,
                .id = hisi_161010101_oem_info,
                .desc = "HiSilicon erratum 161010101",
                .read_cntpct_el0 = hisi_161010101_read_cntpct_el0,
                .read_cntvct_el0 = hisi_161010101_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_phys,
                .set_next_event_virt = erratum_set_next_event_virt,
        },
#endif
#ifdef CONFIG_ARM64_ERRATUM_858921
        {
                .match_type = ate_match_local_cap_id,
                .id = (void *)ARM64_WORKAROUND_858921,
                .desc = "ARM erratum 858921",
                .read_cntpct_el0 = arm64_858921_read_cntpct_el0,
                .read_cntvct_el0 = arm64_858921_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_phys,
                .set_next_event_virt = erratum_set_next_event_virt,
        },
#endif
#ifdef CONFIG_SUN50I_ERRATUM_UNKNOWN1
        {
                .match_type = ate_match_dt,
                .id = "allwinner,erratum-unknown1",
                .desc = "Allwinner erratum UNKNOWN1",
                .read_cntpct_el0 = sun50i_a64_read_cntpct_el0,
                .read_cntvct_el0 = sun50i_a64_read_cntvct_el0,
                .set_next_event_phys = erratum_set_next_event_phys,
                .set_next_event_virt = erratum_set_next_event_virt,
        },
#endif
#ifdef CONFIG_ARM64_ERRATUM_1418040
        {
                .match_type = ate_match_local_cap_id,
                .id = (void *)ARM64_WORKAROUND_1418040,
                .desc = "ARM erratum 1418040",
                .disable_compat_vdso = true,
        },
#endif
};

/*
 * Predicate deciding whether a workaround table entry applies; the second
 * argument is match-type specific data supplied by the caller.
 */
typedef bool (*ate_match_fn_t)(const struct arch_timer_erratum_workaround *,
                               const void *);

/*
 * Device tree matcher: @arg is a device node, and the workaround matches
 * when that node carries the boolean property named by @wa->id.
 */
static
bool arch_timer_check_dt_erratum(const struct arch_timer_erratum_workaround *wa,
                                 const void *arg)
{
        const struct device_node *np = arg;

        return of_property_read_bool(np, wa->id);
}

/*
 * CPU capability matcher: @wa->id holds a capability number and the
 * workaround matches when this_cpu_has_cap() reports it. @arg is unused.
 */
static
bool arch_timer_check_local_cap_erratum(const struct arch_timer_erratum_workaround *wa,
                                        const void *arg)
{
        return this_cpu_has_cap((uintptr_t)wa->id);
}


static
bool arch_timer_check_acpi_oem_erratum(const struct arch_timer_erratum_workaround *wa,
                                       const void *arg)
{
        static const struct ate_acpi_oem_info empty_oem_info = {};
        const struct ate_acpi_oem_info *info = wa->id;
        const struct acpi_table_header *table = arg;

        /* Iterate over the ACPI OEM info array, looking for a match */
        while (memcmp(info, &empty_oem_info, sizeof(*info))) {
                if (!memcmp(info->oem_id, table->oem_id, ACPI_OEM_ID_SIZE) &&
                    !memcmp(info->oem_table_id, table->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) &&
                    info->oem_revision == table->oem_revision)
                        return true;

                info++;
        }

        return false;
}

/*
 * Return the first entry of ool_workarounds[] whose match type equals @type
 * and for which @match_fn(entry, @arg) is true, or NULL if there is none.
 */
static const struct arch_timer_erratum_workaround *
arch_timer_iterate_errata(enum arch_timer_erratum_match_type type,
                          ate_match_fn_t match_fn,
                          void *arg)
{
        const struct arch_timer_erratum_workaround *end =
                ool_workarounds + ARRAY_SIZE(ool_workarounds);
        const struct arch_timer_erratum_workaround *cand;

        for (cand = ool_workarounds; cand < end; cand++) {
                if (cand->match_type == type && match_fn(cand, arg))
                        return cand;
        }

        return NULL;
}

/*
 * Install workaround @wa, either on the current CPU only (@local) or on
 * every possible CPU, then record that a counter workaround is in use and
 * adjust the vdso clock mode accordingly.
 */
static
void arch_timer_enable_workaround(const struct arch_timer_erratum_workaround *wa,
                                  bool local)
{
        int cpu;

        if (!local) {
                for_each_possible_cpu(cpu)
                        per_cpu(timer_unstable_counter_workaround, cpu) = wa;
        } else {
                __this_cpu_write(timer_unstable_counter_workaround, wa);
        }

        if (wa->read_cntvct_el0 || wa->read_cntpct_el0)
                atomic_set(&timer_unstable_counter_workaround_in_use, 1);

        /*
         * The vdso fastpath cannot be used when an erratum requires the
         * out-of-line counter accessor. This decision may come late (a
         * per-CPU erratum, for instance), so both the default mode and
         * the clocksource's own mode are updated.
         */
        if (wa->read_cntvct_el0) {
                clocksource_counter.vdso_clock_mode = VDSO_CLOCKMODE_NONE;
                vdso_default = VDSO_CLOCKMODE_NONE;
                return;
        }

        if (wa->disable_compat_vdso && vdso_default != VDSO_CLOCKMODE_NONE) {
                vdso_default = VDSO_CLOCKMODE_ARCHTIMER_NOCOMPAT;
                clocksource_counter.vdso_clock_mode = VDSO_CLOCKMODE_ARCHTIMER_NOCOMPAT;
        }
}

/*
 * Look for an erratum workaround of match type @type that applies (using
 * @arg as the match-type specific data) and enable it.
 *
 * Capability based matches are enabled for the current CPU only, the other
 * match types for all possible CPUs. If this CPU already has a workaround
 * installed nothing is changed; a warning is printed when the newly matched
 * workaround differs from the installed one. An unknown @type triggers a
 * WARN and returns.
 */
static void arch_timer_check_ool_workaround(enum arch_timer_erratum_match_type type,
                                            void *arg)
{
        const struct arch_timer_erratum_workaround *wa, *__wa;
        ate_match_fn_t match_fn = NULL;
        bool local = false;

        switch (type) {
        case ate_match_dt:
                match_fn = arch_timer_check_dt_erratum;
                break;
        case ate_match_local_cap_id:
                match_fn = arch_timer_check_local_cap_erratum;
                local = true;
                break;
        case ate_match_acpi_oem_info:
                match_fn = arch_timer_check_acpi_oem_erratum;
                break;
        default:
                WARN_ON(1);
                return;
        }

        wa = arch_timer_iterate_errata(type, match_fn, arg);
        if (!wa)
                return;

        __wa = __this_cpu_read(timer_unstable_counter_workaround);
        /* Close the parenthesis before the newline, not after it */
        if (__wa && wa != __wa)
                pr_warn("Can't enable workaround for %s (clashes with %s)\n",
                        wa->desc, __wa->desc);

        /* A workaround is already installed on this CPU: keep it */
        if (__wa)
                return;

        arch_timer_enable_workaround(wa, local);
        pr_info("Enabling %s workaround for %s\n",
                local ? "local" : "global", wa->desc);
}

/* Report whether an erratum handler for read_cntvct_el0 is in effect. */
static bool arch_timer_this_cpu_has_cntvct_wa(void)
{
        return has_erratum_handler(read_cntvct_el0);
}

/* Report whether any counter read workaround has been enabled so far. */
static bool arch_timer_counter_has_wa(void)
{
        return atomic_read(&timer_unstable_counter_workaround_in_use);
}
#else
/* Stubs used when out-of-line workarounds are not built in. */
#define arch_timer_check_ool_workaround(t,a)                do { } while(0)
#define arch_timer_this_cpu_has_cntvct_wa()                ({false;})
#define arch_timer_counter_has_wa()                        ({false;})
#endif /* CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND */

/*
 * Common interrupt handling for all timer flavours. When the control
 * register reports a pending interrupt, the interrupt is masked in the
 * control register and the clock event handler of @evt is invoked;
 * otherwise the interrupt is reported as not handled.
 */
static __always_inline irqreturn_t timer_handler(const int access,
                                        struct clock_event_device *evt)
{
        unsigned long ctl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, evt);

        /* Not ours: the timer condition is not met */
        if (!(ctl & ARCH_TIMER_CTRL_IT_STAT))
                return IRQ_NONE;

        ctl |= ARCH_TIMER_CTRL_IT_MASK;
        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctl, evt);
        evt->event_handler(evt);

        return IRQ_HANDLED;
}

/* IRQ handler for the cp15 virtual timer; @dev_id is the clock event device. */
static irqreturn_t arch_timer_handler_virt(int irq, void *dev_id)
{
        struct clock_event_device *evt = dev_id;

        return timer_handler(ARCH_TIMER_VIRT_ACCESS, evt);
}

/* IRQ handler for the cp15 physical timer; @dev_id is the clock event device. */
static irqreturn_t arch_timer_handler_phys(int irq, void *dev_id)
{
        struct clock_event_device *evt = dev_id;

        return timer_handler(ARCH_TIMER_PHYS_ACCESS, evt);
}

/* IRQ handler for the memory-mapped physical timer; @dev_id is the clock event device. */
static irqreturn_t arch_timer_handler_phys_mem(int irq, void *dev_id)
{
        struct clock_event_device *evt = dev_id;

        return timer_handler(ARCH_TIMER_MEM_PHYS_ACCESS, evt);
}

/* IRQ handler for the memory-mapped virtual timer; @dev_id is the clock event device. */
static irqreturn_t arch_timer_handler_virt_mem(int irq, void *dev_id)
{
        struct clock_event_device *evt = dev_id;

        return timer_handler(ARCH_TIMER_MEM_VIRT_ACCESS, evt);
}

/*
 * Disable the timer selected by @access by clearing the enable bit in its
 * control register (read-modify-write). Always returns 0.
 */
static __always_inline int arch_timer_shutdown(const int access,
                                               struct clock_event_device *clk)
{
        unsigned long ctrl;

        ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
        ctrl &= ~ARCH_TIMER_CTRL_ENABLE;
        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);

        return 0;
}

/* Shutdown callback for the cp15 virtual timer. */
static int arch_timer_shutdown_virt(struct clock_event_device *clk)
{
        return arch_timer_shutdown(ARCH_TIMER_VIRT_ACCESS, clk);
}

/* Shutdown callback for the cp15 physical timer. */
static int arch_timer_shutdown_phys(struct clock_event_device *clk)
{
        return arch_timer_shutdown(ARCH_TIMER_PHYS_ACCESS, clk);
}

/* Shutdown callback for the memory-mapped virtual timer. */
static int arch_timer_shutdown_virt_mem(struct clock_event_device *clk)
{
        return arch_timer_shutdown(ARCH_TIMER_MEM_VIRT_ACCESS, clk);
}

/* Shutdown callback for the memory-mapped physical timer. */
static int arch_timer_shutdown_phys_mem(struct clock_event_device *clk)
{
        return arch_timer_shutdown(ARCH_TIMER_MEM_PHYS_ACCESS, clk);
}

/*
 * Program the timer selected by @access to fire @evt counter ticks from
 * now: the compare value is set to the current count (physical counter for
 * physical access, virtual counter otherwise) plus @evt, then the control
 * register is written with the enable bit set and the interrupt mask bit
 * cleared.
 */
static __always_inline void set_next_event(const int access, unsigned long evt,
                                           struct clock_event_device *clk)
{
        unsigned long ctrl;
        u64 cnt;

        ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);
        ctrl |= ARCH_TIMER_CTRL_ENABLE;
        ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;

        if (access == ARCH_TIMER_PHYS_ACCESS)
                cnt = __arch_counter_get_cntpct();
        else
                cnt = __arch_counter_get_cntvct();

        arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk);
        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
}

/* set_next_event callback for the cp15 virtual timer. Always returns 0. */
static int arch_timer_set_next_event_virt(unsigned long evt,
                                          struct clock_event_device *clk)
{
        set_next_event(ARCH_TIMER_VIRT_ACCESS, evt, clk);
        return 0;
}

/* set_next_event callback for the cp15 physical timer. Always returns 0. */
static int arch_timer_set_next_event_phys(unsigned long evt,
                                          struct clock_event_device *clk)
{
        set_next_event(ARCH_TIMER_PHYS_ACCESS, evt, clk);
        return 0;
}

/*
 * Read a 64-bit memory-mapped counter as two 32-bit little-endian halves.
 * @offset_lo is the offset of the low word from t->base; the high word is
 * read at @offset_lo + 4.
 *
 * The high word is sampled before and after the low word, and the whole
 * sequence is repeated until both high samples agree, so the returned
 * value is built from a consistent pair of halves.
 */
static noinstr u64 arch_counter_get_cnt_mem(struct arch_timer *t, int offset_lo)
{
        u32 cnt_lo, cnt_hi, tmp_hi;

        do {
                cnt_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
                cnt_lo = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo));
                tmp_hi = __le32_to_cpu((__le32 __force)__raw_readl(t->base + offset_lo + 4));
        } while (cnt_hi != tmp_hi);

        return ((u64) cnt_hi << 32) | cnt_lo;
}

/*
 * Program a memory-mapped timer to fire @evt counter ticks from now.
 *
 * If the timer is currently enabled it is disabled first, then the compare
 * value is set to the current memory-mapped count (virtual or physical
 * counter, depending on @access) plus @evt, and finally the control
 * register is written with the enable bit set and the interrupt mask bit
 * cleared.
 */
static __always_inline void set_next_event_mem(const int access, unsigned long evt,
                                           struct clock_event_device *clk)
{
        struct arch_timer *timer = to_arch_timer(clk);
        unsigned long ctrl;
        u64 cnt;

        ctrl = arch_timer_reg_read(access, ARCH_TIMER_REG_CTRL, clk);

        /* Timer must be disabled before programming CVAL */
        if (ctrl & ARCH_TIMER_CTRL_ENABLE) {
                ctrl &= ~ARCH_TIMER_CTRL_ENABLE;
                arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
        }

        ctrl |= ARCH_TIMER_CTRL_ENABLE;
        ctrl &= ~ARCH_TIMER_CTRL_IT_MASK;

        if (access ==  ARCH_TIMER_MEM_VIRT_ACCESS)
                cnt = arch_counter_get_cnt_mem(timer, CNTVCT_LO);
        else
                cnt = arch_counter_get_cnt_mem(timer, CNTPCT_LO);

        arch_timer_reg_write(access, ARCH_TIMER_REG_CVAL, evt + cnt, clk);
        arch_timer_reg_write(access, ARCH_TIMER_REG_CTRL, ctrl, clk);
}

/* set_next_event callback for the memory-mapped virtual timer. Always returns 0. */
static int arch_timer_set_next_event_virt_mem(unsigned long evt,
                                              struct clock_event_device *clk)
{
        set_next_event_mem(ARCH_TIMER_MEM_VIRT_ACCESS, evt, clk);
        return 0;
}

/* set_next_event callback for the memory-mapped physical timer. Always returns 0. */
static int arch_timer_set_next_event_phys_mem(unsigned long evt,
                                              struct clock_event_device *clk)
{
        set_next_event_mem(ARCH_TIMER_MEM_PHYS_ACCESS, evt, clk);
        return 0;
}

/*
 * Return the mask that bounds the largest programmable timer delta.
 *
 * On arm64 CPUs listed in broken_cval_midrs the mask is limited to 31 bits
 * (with a one-time warning); otherwise it covers the full counter width
 * reported by arch_counter_get_width().
 */
static u64 __arch_timer_check_delta(void)
{
#ifdef CONFIG_ARM64
        const struct midr_range broken_cval_midrs[] = {
                /*
                 * XGene-1 implements CVAL in terms of TVAL, meaning
                 * that the maximum timer range is 32bit. Shame on them.
                 *
                 * Note that TVAL is signed, thus has only 31 of its
                 * 32 bits to express magnitude.
                 */
                MIDR_REV_RANGE(MIDR_CPU_MODEL(ARM_CPU_IMP_APM,
                                              APM_CPU_PART_XGENE),
                               APM_CPU_VAR_POTENZA, 0x0, 0xf),
                {},
        };

        if (is_midr_in_range_list(broken_cval_midrs)) {
                pr_warn_once("Broken CNTx_CVAL_EL1, using 31 bit TVAL instead.\n");
                return CLOCKSOURCE_MASK(31);
        }
#endif
        return CLOCKSOURCE_MASK(arch_counter_get_width());
}

/*
 * Fill in clock event device @clk for a timer of the given @type, put the
 * timer into the shutdown state and register it.
 *
 * For ARCH_TIMER_TYPE_CP15 the device is bound to the current CPU, local
 * capability based errata are checked first, and the shutdown and
 * set_next_event callbacks are chosen according to the PPI in use. Any
 * other @type is set up as a memory-mapped timer usable by all possible
 * CPUs, using the virtual or physical callbacks depending on
 * arch_timer_mem_use_virtual.
 */
static void __arch_timer_setup(unsigned type,
                               struct clock_event_device *clk)
{
        u64 max_delta;

        clk->features = CLOCK_EVT_FEAT_ONESHOT;

        if (type == ARCH_TIMER_TYPE_CP15) {
                typeof(clk->set_next_event) sne;

                arch_timer_check_ool_workaround(ate_match_local_cap_id, NULL);

                if (arch_timer_c3stop)
                        clk->features |= CLOCK_EVT_FEAT_C3STOP;
                clk->name = "arch_sys_timer";
                clk->rating = 450;
                clk->cpumask = cpumask_of(smp_processor_id());
                clk->irq = arch_timer_ppi[arch_timer_uses_ppi];
                switch (arch_timer_uses_ppi) {
                case ARCH_TIMER_VIRT_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_virt;
                        clk->set_state_oneshot_stopped = arch_timer_shutdown_virt;
                        sne = erratum_handler(set_next_event_virt);
                        break;
                case ARCH_TIMER_PHYS_SECURE_PPI:
                case ARCH_TIMER_PHYS_NONSECURE_PPI:
                case ARCH_TIMER_HYP_PPI:
                        clk->set_state_shutdown = arch_timer_shutdown_phys;
                        clk->set_state_oneshot_stopped = arch_timer_shutdown_phys;
                        sne = erratum_handler(set_next_event_phys);
                        break;
                default:
                        BUG();
                }

                clk->set_next_event = sne;
                max_delta = __arch_timer_check_delta();
        } else {
                clk->features |= CLOCK_EVT_FEAT_DYNIRQ;
                clk->name = "arch_mem_timer";
                clk->rating = 400;
                clk->cpumask = cpu_possible_mask;
                if (arch_timer_mem_use_virtual) {
                        clk->set_state_shutdown = arch_timer_shutdown_virt_mem;
                        clk->set_state_oneshot_stopped = arch_timer_shutdown_virt_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_virt_mem;
                } else {
                        clk->set_state_shutdown = arch_timer_shutdown_phys_mem;
                        clk->set_state_oneshot_stopped = arch_timer_shutdown_phys_mem;
                        clk->set_next_event =
                                arch_timer_set_next_event_phys_mem;
                }

                max_delta = CLOCKSOURCE_MASK(56);
        }

        /* Start from a known state: the timer is disabled before registration */
        clk->set_state_shutdown(clk);

        clockevents_config_and_register(clk, arch_timer_rate, 0xf, max_delta);
}

/*
 * Enable the virtual event stream on the current CPU with the given
 * @divider, which is capped at 15 before being programmed into the trigger
 * field of the kernel control register value. The CPU is then marked in
 * the evtstrm_available mask.
 */
static void arch_timer_evtstrm_enable(unsigned int divider)
{
        u32 cntkctl = arch_timer_get_cntkctl();

#ifdef CONFIG_ARM64
        /* ECV is likely to require a large divider. Use the EVNTIS flag. */
        if (cpus_have_final_cap(ARM64_HAS_ECV) && divider > 15) {
                cntkctl |= ARCH_TIMER_EVT_INTERVAL_SCALE;
                divider -= 8;
        }
#endif

        divider = min(divider, 15U);
        cntkctl &= ~ARCH_TIMER_EVT_TRIGGER_MASK;
        /* Set the divider and enable virtual event stream */
        cntkctl |= (divider << ARCH_TIMER_EVT_TRIGGER_SHIFT)
                        | ARCH_TIMER_VIRT_EVT_EN;
        arch_timer_set_cntkctl(cntkctl);
        arch_timer_set_evtstrm_feature();
        cpumask_set_cpu(smp_processor_id(), &evtstrm_available);
}

/*
 * Derive the event stream divider exponent from the timer rate and enable
 * the event stream with it.
 */
static void arch_timer_configure_evtstream(void)
{
        int ratio, exponent;

        /*
         * The event stream can run at no more than half the counter
         * frequency, hence the extra division by two.
         */
        ratio = arch_timer_rate / ARCH_TIMER_EVT_STREAM_FREQ / 2;

        /*
         * Round to the nearest power of two: start from the index of the
         * highest set bit and move one up when the bit just below it is
         * set as well.
         */
        exponent = fls(ratio) - 1;
        if (exponent > 0 && (ratio & BIT(exponent - 1)))
                exponent++;

        /* A ratio of zero yields a negative exponent; use 0 in that case */
        if (exponent < 0)
                exponent = 0;

        arch_timer_evtstrm_enable(exponent);
}

static int arch_timer_evtstrm_starting_cpu(unsigned int cpu)
{
        arch_timer_configure_evtstream();
        return 0;
}

/*
 * CPU hotplug "dying" callback: clear the current CPU from the
 * evtstrm_available mask. Always returns 0.
 */
static int arch_timer_evtstrm_dying_cpu(unsigned int cpu)
{
        cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
        return 0;
}

/*
 * Register the event stream CPU hotplug callbacks, provided the per-CPU
 * timer has been set up and the event stream is enabled. Returns 0 when
 * nothing is registered, otherwise the result of cpuhp_setup_state().
 */
static int __init arch_timer_evtstrm_register(void)
{
        if (arch_timer_evt && evtstrm_enable)
                return cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_EVTSTRM_STARTING,
                                         "clockevents/arm/arch_timer_evtstrm:starting",
                                         arch_timer_evtstrm_starting_cpu,
                                         arch_timer_evtstrm_dying_cpu);

        return 0;
}
core_initcall(arch_timer_evtstrm_register);

/*
 * Set up the user access bits of the kernel control register on the current
 * CPU: everything is disabled first, then access to the virtual counter is
 * re-enabled unless this CPU needs a workaround for reading it.
 */
static void arch_counter_set_user_access(void)
{
        u32 cntkctl = arch_timer_get_cntkctl();

        /* Disable user access to the timers and both counters */
        /* Also disable virtual event stream */
        cntkctl &= ~(ARCH_TIMER_USR_PT_ACCESS_EN
                        | ARCH_TIMER_USR_VT_ACCESS_EN
                        | ARCH_TIMER_USR_VCT_ACCESS_EN
                        | ARCH_TIMER_VIRT_EVT_EN
                        | ARCH_TIMER_USR_PCT_ACCESS_EN);

        /*
         * Enable user access to the virtual counter if it doesn't
         * need to be worked around. The vdso may already have been
         * disabled, though.
         */
        if (arch_timer_this_cpu_has_cntvct_wa())
                pr_info("CPU%d: Trapping CNTVCT access\n", smp_processor_id());
        else
                cntkctl |= ARCH_TIMER_USR_VCT_ACCESS_EN;

        arch_timer_set_cntkctl(cntkctl);
}

static bool arch_timer_has_nonsecure_ppi(void)
{
        return (arch_timer_uses_ppi == ARCH_TIMER_PHYS_SECURE_PPI &&
                arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
}

/*
 * Return the trigger type configured for @irq. Anything other than level
 * high or level low is reported as a firmware problem and replaced by
 * level low.
 */
static u32 check_ppi_trigger(int irq)
{
        u32 trigger = irq_get_trigger_type(irq);

        if (trigger == IRQF_TRIGGER_HIGH || trigger == IRQF_TRIGGER_LOW)
                return trigger;

        pr_warn("WARNING: Invalid trigger for IRQ%d, assuming level low\n", irq);
        pr_warn("WARNING: Please fix your firmware\n");

        return IRQF_TRIGGER_LOW;
}

/*
 * Bring up the per-CPU timer on the current CPU: set up and register its
 * clock event device, enable the timer PPI in use (and the non-secure
 * physical PPI when applicable), then configure user access to the
 * counters. Always returns 0.
 */
static int arch_timer_starting_cpu(unsigned int cpu)
{
        struct clock_event_device *clk = this_cpu_ptr(arch_timer_evt);
        u32 flags;

        __arch_timer_setup(ARCH_TIMER_TYPE_CP15, clk);

        flags = check_ppi_trigger(arch_timer_ppi[arch_timer_uses_ppi]);
        enable_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], flags);

        if (arch_timer_has_nonsecure_ppi()) {
                flags = check_ppi_trigger(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
                enable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI],
                                  flags);
        }

        arch_counter_set_user_access();

        return 0;
}

/*
 * Check the recorded timer rate. Returns -EINVAL when no rate is known and
 * 0 otherwise; a rate below 1MHz is accepted but triggers a WARN.
 */
static int validate_timer_rate(void)
{
        if (!arch_timer_rate)
                return -EINVAL;

        /* Arch timer frequency < 1MHz can cause trouble */
        WARN_ON(arch_timer_rate < 1000000);

        return 0;
}

/*
 * For historical reasons, when probing with DT we use whichever (non-zero)
 * rate was probed first, and don't verify that others match. If the first node
 * probed has a clock-frequency property, this overrides the HW register.
 */
static void __init arch_timer_of_configure_rate(u32 rate, struct device_node *np)
{
        int err;

        /* A rate has already been recorded by an earlier probe: keep it */
        if (arch_timer_rate)
                return;

        /* Prefer the DT "clock-frequency" property, else use @rate */
        err = of_property_read_u32(np, "clock-frequency", &arch_timer_rate);
        if (err)
                arch_timer_rate = rate;

        /* Complain if we still have no usable frequency */
        err = validate_timer_rate();
        if (err)
                pr_warn("frequency not available\n");
}

/*
 * Log a one-line summary of the timers described by @type: which kinds are
 * present (cp15 and/or mmio), the counter frequency in MHz with two decimal
 * places, and whether each kind uses its "virt" or "phys" variant.
 */
static void __init arch_timer_banner(unsigned type)
{
        const bool has_cp15 = type & ARCH_TIMER_TYPE_CP15;
        const bool has_mem = type & ARCH_TIMER_TYPE_MEM;
        const bool has_both = type == (ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM);
        const char *cp15_kind = "";
        const char *mem_kind = "";

        if (has_cp15)
                cp15_kind = (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ?
                                "virt" : "phys";

        if (has_mem)
                mem_kind = arch_timer_mem_use_virtual ? "virt" : "phys";

        pr_info("%s%s%s timer(s) running at %lu.%02luMHz (%s%s%s).\n",
                has_cp15 ? "cp15" : "",
                has_both ? " and " : "",
                has_mem ? "mmio" : "",
                (unsigned long)arch_timer_rate / 1000000,
                (unsigned long)(arch_timer_rate / 10000) % 100,
                cp15_kind,
                has_both ? "/" : "",
                mem_kind);
}

/* Return the counter frequency currently stored in arch_timer_rate. */
u32 arch_timer_get_rate(void)
{
        return arch_timer_rate;
}

/* Report whether the current CPU is set in the evtstrm_available mask. */
bool arch_timer_evtstrm_available(void)
{
        /*
         * This may run in a preemptible context, hence the raw CPU id. That
         * is fine: event stream availability should be the same for the
         * preemptible context and for the context in which the task might
         * be resumed.
         */
        int cpu = raw_smp_processor_id();

        return cpumask_test_cpu(cpu, &evtstrm_available);
}

/*
 * Read the counter of the memory-mapped timer (arch_timer_mem), starting at
 * the CNTVCT_LO register offset.
 */
static noinstr u64 arch_counter_get_cntvct_mem(void)
{
        return arch_counter_get_cnt_mem(arch_timer_mem, CNTVCT_LO);
}

/* Timer details handed out through arch_timer_get_kvm_info(). */
static struct arch_timer_kvm_info arch_timer_kvm_info;

/* Return a pointer to the file-local arch_timer_kvm_info instance. */
struct arch_timer_kvm_info *arch_timer_get_kvm_info(void)
{
        return &arch_timer_kvm_info;
}

/*
 * Pick the counter accessors matching @type, then register the clocksource,
 * initialise the timecounter kept in arch_timer_kvm_info and register the
 * scheduler clock on top of them.
 */
static void __init arch_counter_register(unsigned type)
{
        u64 (*sched_read)(void);
        u64 initial;
        int bits;

        if (!(type & ARCH_TIMER_TYPE_CP15)) {
                /* No CP15 timer: go through the memory-mapped counter */
                arch_timer_read_counter = arch_counter_get_cntvct_mem;
                sched_read = arch_counter_get_cntvct_mem;
        } else {
                /* Register the CP15 based counter if we have one */
                u64 (*reader)(void);
                const bool use_virt =
                        (IS_ENABLED(CONFIG_ARM64) && !is_hyp_mode_available()) ||
                        arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI;
                const bool has_wa = arch_timer_counter_has_wa();

                /* With a workaround in use, the "stable" accessors are used */
                if (use_virt) {
                        reader = has_wa ? arch_counter_get_cntvct_stable :
                                          arch_counter_get_cntvct;
                        sched_read = has_wa ? raw_counter_get_cntvct_stable :
                                              arch_counter_get_cntvct;
                } else {
                        reader = has_wa ? arch_counter_get_cntpct_stable :
                                          arch_counter_get_cntpct;
                        sched_read = has_wa ? raw_counter_get_cntpct_stable :
                                              arch_counter_get_cntpct;
                }

                arch_timer_read_counter = reader;
                clocksource_counter.vdso_clock_mode = vdso_default;
        }

        bits = arch_counter_get_width();
        clocksource_counter.mask = CLOCKSOURCE_MASK(bits);
        cyclecounter.mask = CLOCKSOURCE_MASK(bits);

        if (!arch_counter_suspend_stop)
                clocksource_counter.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;

        /* Sample the counter before registering the clocksource */
        initial = arch_timer_read_counter();
        clocksource_register_hz(&clocksource_counter, arch_timer_rate);

        /* The cyclecounter reuses the mult/shift held by the clocksource */
        cyclecounter.mult = clocksource_counter.mult;
        cyclecounter.shift = clocksource_counter.shift;
        timecounter_init(&arch_timer_kvm_info.timecounter,
                         &cyclecounter, initial);

        sched_clock_register(sched_read, bits, arch_timer_rate);
}

/*
 * Disable the per-CPU timer interrupt in use, plus the non-secure physical
 * one when arch_timer_has_nonsecure_ppi() says it is present.
 */
static void arch_timer_stop(struct clock_event_device *clk)
{
        const int ppi = arch_timer_ppi[arch_timer_uses_ppi];

        pr_debug("disable IRQ%d cpu #%d\n", clk->irq, smp_processor_id());

        disable_percpu_irq(ppi);

        if (!arch_timer_has_nonsecure_ppi())
                return;

        disable_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI]);
}

/*
 * Counterpart of arch_timer_starting_cpu(): stop the timer of the calling
 * CPU. Always returns 0.
 */
static int arch_timer_dying_cpu(unsigned int cpu)
{
        arch_timer_stop(this_cpu_ptr(arch_timer_evt));

        return 0;
}

#ifdef CONFIG_CPU_PM
/* Per-CPU copy of the CNTKCTL value captured on CPU_PM_ENTER. */
static DEFINE_PER_CPU(unsigned long, saved_cntkctl);

/*
 * CPU PM notifier callback.
 *
 * On CPU_PM_ENTER, save CNTKCTL for this CPU and clear the CPU from
 * evtstrm_available. On CPU_PM_ENTER_FAILED or CPU_PM_EXIT, write the saved
 * value back and, if the event stream feature is present, set the CPU in
 * the mask again. Any other action is ignored. Always returns NOTIFY_OK.
 */
static int arch_timer_cpu_pm_notify(struct notifier_block *self,
                                    unsigned long action, void *hcpu)
{
        switch (action) {
        case CPU_PM_ENTER:
                __this_cpu_write(saved_cntkctl, arch_timer_get_cntkctl());

                cpumask_clear_cpu(smp_processor_id(), &evtstrm_available);
                break;
        case CPU_PM_ENTER_FAILED:
        case CPU_PM_EXIT:
                arch_timer_set_cntkctl(__this_cpu_read(saved_cntkctl));

                if (arch_timer_have_evtstrm_feature())
                        cpumask_set_cpu(smp_processor_id(), &evtstrm_available);
                break;
        default:
                break;
        }

        return NOTIFY_OK;
}

static struct notifier_block arch_timer_cpu_pm_notifier = {
        .notifier_call = arch_timer_cpu_pm_notify,
};

/* Register the CPU PM notifier; returns the registration result. */
static int __init arch_timer_cpu_pm_init(void)
{
        return cpu_pm_register_notifier(&arch_timer_cpu_pm_notifier);
}

/* Unregister the CPU PM notifier, warning if that fails. */
static void __init arch_timer_cpu_pm_deinit(void)
{
        WARN_ON(cpu_pm_unregister_notifier(&arch_timer_cpu_pm_notifier));
}

#else
/* Without CONFIG_CPU_PM there is nothing to register. */
static int __init arch_timer_cpu_pm_init(void)
{
        return 0;
}

/* Without CONFIG_CPU_PM there is nothing to unregister. */
static void __init arch_timer_cpu_pm_deinit(void)
{
}
#endif

/*
 * Register the per-CPU (CP15) timer.
 *
 * Allocates the per-CPU clock event devices, requests the per-CPU interrupt
 * selected by arch_timer_uses_ppi (and the non-secure physical one when
 * arch_timer_has_nonsecure_ppi() is true), registers the CPU PM notifier and
 * finally installs the CPU hotplug starting/dying callbacks.
 *
 * Returns 0 on success or a negative error code; on failure everything set
 * up so far is undone and arch_timer_evt is reset to NULL.
 */
static int __init arch_timer_register(void)
{
        int err;
        int ppi;

        arch_timer_evt = alloc_percpu(struct clock_event_device);
        if (!arch_timer_evt) {
                err = -ENOMEM;
                goto out;
        }

        ppi = arch_timer_ppi[arch_timer_uses_ppi];
        switch (arch_timer_uses_ppi) {
        case ARCH_TIMER_VIRT_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_virt,
                                         "arch_timer", arch_timer_evt);
                break;
        case ARCH_TIMER_PHYS_SECURE_PPI:
        case ARCH_TIMER_PHYS_NONSECURE_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                         "arch_timer", arch_timer_evt);
                if (!err && arch_timer_has_nonsecure_ppi()) {
                        /* ppi now names the non-secure IRQ for the error message below */
                        ppi = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI];
                        err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                                 "arch_timer", arch_timer_evt);
                        /* Second request failed: drop the secure IRQ obtained above */
                        if (err)
                                free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_SECURE_PPI],
                                                arch_timer_evt);
                }
                break;
        case ARCH_TIMER_HYP_PPI:
                err = request_percpu_irq(ppi, arch_timer_handler_phys,
                                         "arch_timer", arch_timer_evt);
                break;
        default:
                BUG();
        }

        if (err) {
                pr_err("can't register interrupt %d (%d)\n", ppi, err);
                goto out_free;
        }

        err = arch_timer_cpu_pm_init();
        if (err)
                goto out_unreg_notify;

        /* Register and immediately configure the timer on the boot CPU */
        err = cpuhp_setup_state(CPUHP_AP_ARM_ARCH_TIMER_STARTING,
                                "clockevents/arm/arch_timer:starting",
                                arch_timer_starting_cpu, arch_timer_dying_cpu);
        if (err)
                goto out_unreg_cpupm;
        return 0;

        /* Error unwinding: each label undoes one step, then falls through */
out_unreg_cpupm:
        arch_timer_cpu_pm_deinit();

        /* Despite its name, this label releases the requested interrupt(s) */
out_unreg_notify:
        free_percpu_irq(arch_timer_ppi[arch_timer_uses_ppi], arch_timer_evt);
        if (arch_timer_has_nonsecure_ppi())
                free_percpu_irq(arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI],
                                arch_timer_evt);

out_free:
        free_percpu(arch_timer_evt);
        arch_timer_evt = NULL;
out:
        return err;
}

/*
 * Allocate and set up the memory-mapped timer whose registers are mapped at
 * @base, and request @irq for it with the virtual or physical handler,
 * depending on arch_timer_mem_use_virtual.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
 * request_irq(); in the latter case arch_timer_mem is freed and reset to
 * NULL.
 */
static int __init arch_timer_mem_register(void __iomem *base, unsigned int irq)
{
        irq_handler_t handler;
        int err;

        arch_timer_mem = kzalloc(sizeof(*arch_timer_mem), GFP_KERNEL);
        if (!arch_timer_mem)
                return -ENOMEM;

        arch_timer_mem->base = base;
        arch_timer_mem->evt.irq = irq;
        __arch_timer_setup(ARCH_TIMER_TYPE_MEM, &arch_timer_mem->evt);

        handler = arch_timer_mem_use_virtual ? arch_timer_handler_virt_mem :
                                               arch_timer_handler_phys_mem;

        err = request_irq(irq, handler, IRQF_TIMER, "arch_mem_timer", &arch_timer_mem->evt);
        if (!err)
                return 0;

        pr_err("Failed to request mem timer irq\n");
        kfree(arch_timer_mem);
        arch_timer_mem = NULL;

        return err;
}

/*
 * Compatible strings of the CP15 timer nodes; used by
 * arch_timer_needs_of_probing() to look for such a node.
 */
static const struct of_device_id arch_timer_of_match[] __initconst = {
        { .compatible   = "arm,armv7-timer",    },
        { .compatible   = "arm,armv8-timer",    },
        {},
};

/*
 * Compatible string of the memory-mapped timer node; used by
 * arch_timer_needs_of_probing() to look for such a node.
 */
static const struct of_device_id arch_timer_mem_of_match[] __initconst = {
        { .compatible   = "arm,armv7-timer-mem", },
        {},
};

/*
 * Tell whether a timer of the other type is still waiting to be probed.
 *
 * Returns false when both the CP15 and the memory-mapped timer have already
 * been probed. Otherwise returns true only if the device tree contains an
 * available node for the type that has not been probed yet.
 */
static bool __init arch_timer_needs_of_probing(void)
{
        const unsigned int both = ARCH_TIMER_TYPE_CP15 | ARCH_TIMER_TYPE_MEM;
        const struct of_device_id *other_match;
        struct device_node *node;
        bool pending;

        /* We have two timers, and both device-tree nodes are probed. */
        if ((arch_timers_present & both) == both)
                return false;

        /*
         * Only one type of timer is probed,
         * check if we have another type of timer node in device-tree.
         */
        other_match = (arch_timers_present & ARCH_TIMER_TYPE_CP15) ?
                        arch_timer_mem_of_match : arch_timer_of_match;
        node = of_find_matching_node(NULL, other_match);

        pending = node && of_device_is_available(node);

        of_node_put(node);

        return pending;
}

/*
 * Final initialisation step shared by the probe paths: print the banner and
 * register the counter for the timers found so far, then run the
 * architecture-specific init and return its result.
 */
static int __init arch_timer_common_init(void)
{
        arch_timer_banner(arch_timers_present);
        arch_counter_register(arch_timers_present);
        return arch_timer_arch_init();
}

/**
 * arch_timer_select_ppi() - Select suitable PPI for the current system.
 *
 * The choice is made in this order:
 *
 * - The kernel runs in HYP (ARMv8.1 with VH extensions): accesses to the
 *   CNTP_*_EL1 registers are silently redirected to their CNTHP_*_EL2
 *   counterparts, which use a different PPI number, so pick the HYP PPI.
 *
 * - HYP mode is not available and an interrupt was provided for the
 *   virtual timer: use the virtual timer.
 *
 * - Otherwise stick to the physical timer. Either HYP mode is available,
 *   in which case we know the physical timer has been configured to be
 *   accessible from PL1 (leaving the virtual timer to a guest), or no
 *   virtual interrupt was provided and the physical timer had better be
 *   accessible. For arm64 we never use the secure interrupt.
 *
 * Return: a suitable PPI type for the current system.
 */
static enum arch_timer_ppi_nr __init arch_timer_select_ppi(void)
{
        enum arch_timer_ppi_nr ppi;

        if (is_kernel_in_hyp_mode())
                ppi = ARCH_TIMER_HYP_PPI;
        else if (!is_hyp_mode_available() && arch_timer_ppi[ARCH_TIMER_VIRT_PPI])
                ppi = ARCH_TIMER_VIRT_PPI;
        else if (IS_ENABLED(CONFIG_ARM64))
                ppi = ARCH_TIMER_PHYS_NONSECURE_PPI;
        else
                ppi = ARCH_TIMER_PHYS_SECURE_PPI;

        return ppi;
}

/*
 * Record the timer interrupts in arch_timer_kvm_info: the virtual timer
 * interrupt always, and the non-secure physical timer interrupt only when
 * the kernel itself runs in hyp mode.
 */
static void __init arch_timer_populate_kvm_info(void)
{
        arch_timer_kvm_info.virtual_irq = arch_timer_ppi[ARCH_TIMER_VIRT_PPI];
        if (is_kernel_in_hyp_mode())
                arch_timer_kvm_info.physical_irq = arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI];
}

/*
 * Probe the CP15 timer from device-tree node @np.
 *
 * Collects the timer interrupts, configures the rate, picks the PPI to use
 * and registers the timer. The common init is run right away unless a
 * memory-mapped timer node still needs probing, in which case it is left to
 * that probe.
 *
 * Returns 0 on success (also when a CP15 node was already handled and this
 * one is skipped), -EINVAL if the selected PPI has no interrupt, or the
 * error from arch_timer_register() / arch_timer_common_init().
 */
static int __init arch_timer_of_init(struct device_node *np)
{
        int i, irq, ret;
        u32 rate;
        bool has_names;

        if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
                pr_warn("multiple nodes in dt, skipping\n");
                return 0;
        }

        arch_timers_present |= ARCH_TIMER_TYPE_CP15;

        has_names = of_property_present(np, "interrupt-names");

        /* Look interrupts up by name when names are given, by index otherwise */
        for (i = ARCH_TIMER_PHYS_SECURE_PPI; i < ARCH_TIMER_MAX_TIMER_PPI; i++) {
                if (has_names)
                        irq = of_irq_get_byname(np, arch_timer_ppi_names[i]);
                else
                        irq = of_irq_get(np, i);
                /* Only positive results are recorded; other entries stay untouched */
                if (irq > 0)
                        arch_timer_ppi[i] = irq;
        }

        arch_timer_populate_kvm_info();

        rate = arch_timer_get_cntfrq();
        arch_timer_of_configure_rate(rate, np);

        arch_timer_c3stop = !of_property_read_bool(np, "always-on");

        /* Check for globally applicable workarounds */
        arch_timer_check_ool_workaround(ate_match_dt, np);

        /*
         * If we cannot rely on firmware initializing the timer registers then
         * we should use the physical timers instead.
         */
        if (IS_ENABLED(CONFIG_ARM) &&
            of_property_read_bool(np, "arm,cpu-registers-not-fw-configured"))
                arch_timer_uses_ppi = ARCH_TIMER_PHYS_SECURE_PPI;
        else
                arch_timer_uses_ppi = arch_timer_select_ppi();

        if (!arch_timer_ppi[arch_timer_uses_ppi]) {
                pr_err("No interrupt available, giving up\n");
                return -EINVAL;
        }

        /* On some systems, the counter stops ticking when in suspend. */
        arch_counter_suspend_stop = of_property_read_bool(np,
                                                         "arm,no-tick-in-suspend");

        ret = arch_timer_register();
        if (ret)
                return ret;

        /* Another timer node is still to be probed: leave the common init to it */
        if (arch_timer_needs_of_probing())
                return 0;

        return arch_timer_common_init();
}
TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init);
TIMER_OF_DECLARE(armv8_arch_timer, "arm,armv8-timer", arch_timer_of_init);

/*
 * Temporarily map @frame and read its CNTFRQ register.
 *
 * Returns the value read, or 0 if the frame cannot be mapped.
 */
static u32 __init
arch_timer_mem_frame_get_cntfrq(struct arch_timer_mem_frame *frame)
{
        void __iomem *regs = ioremap(frame->cntbase, frame->size);
        u32 cntfrq;

        if (regs) {
                cntfrq = readl_relaxed(regs + CNTFRQ);
                iounmap(regs);
                return cntfrq;
        }

        pr_err("Unable to map frame @ %pa\n", &frame->cntbase);

        return 0;
}

/*
 * Pick the frame of @timer_mem to use.
 *
 * Maps the CNTCTLBase block, and for every valid frame writes the full set
 * of access bits to its CNTACR and reads back which ones stuck. The first
 * frame that is virtual capable is chosen (and arch_timer_mem_use_virtual is
 * set); failing that, the last frame found to be physical capable is used.
 *
 * Returns the chosen frame, or NULL if the control block cannot be mapped
 * or no frame qualifies.
 */
static struct arch_timer_mem_frame * __init
arch_timer_mem_find_best_frame(struct arch_timer_mem *timer_mem)
{
        struct arch_timer_mem_frame *frame, *best_frame = NULL;
        void __iomem *cntctlbase;
        u32 cnttidr;
        int i;

        cntctlbase = ioremap(timer_mem->cntctlbase, timer_mem->size);
        if (!cntctlbase) {
                pr_err("Can't map CNTCTLBase @ %pa\n",
                        &timer_mem->cntctlbase);
                return NULL;
        }

        cnttidr = readl_relaxed(cntctlbase + CNTTIDR);

        /*
         * Try to find a virtual capable frame. Otherwise fall back to a
         * physical capable frame.
         */
        for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) {
                u32 cntacr = CNTACR_RFRQ | CNTACR_RWPT | CNTACR_RPCT |
                             CNTACR_RWVT | CNTACR_RVOFF | CNTACR_RVCT;

                frame = &timer_mem->frame[i];
                if (!frame->valid)
                        continue;

                /* Try enabling everything, and see what sticks */
                writel_relaxed(cntacr, cntctlbase + CNTACR(i));
                cntacr = readl_relaxed(cntctlbase + CNTACR(i));

                /* Virtual capable: CNTTIDR flags it and both virtual bits stuck */
                if ((cnttidr & CNTTIDR_VIRT(i)) &&
                    !(~cntacr & (CNTACR_RWVT | CNTACR_RVCT))) {
                        best_frame = frame;
                        arch_timer_mem_use_virtual = true;
                        break;
                }

                /* Skip frames where either physical access bit did not stick */
                if (~cntacr & (CNTACR_RWPT | CNTACR_RPCT))
                        continue;

                best_frame = frame;
        }

        iounmap(cntctlbase);

        return best_frame;
}

/*
 * Claim, map and register the memory-mapped timer frame @frame.
 *
 * The interrupt used is the frame's virtual or physical one, depending on
 * arch_timer_mem_use_virtual. On success ARCH_TIMER_TYPE_MEM is set in
 * arch_timers_present.
 *
 * Returns 0 on success, -EINVAL if the frame lacks the needed interrupt,
 * -EBUSY if the register region cannot be claimed, -ENXIO if it cannot be
 * mapped, or the error from arch_timer_mem_register(). On failure the
 * mapping and the claimed memory region are released again.
 */
static int __init
arch_timer_mem_frame_register(struct arch_timer_mem_frame *frame)
{
        void __iomem *base;
        int ret, irq;

        if (arch_timer_mem_use_virtual)
                irq = frame->virt_irq;
        else
                irq = frame->phys_irq;

        if (!irq) {
                pr_err("Frame missing %s irq.\n",
                       arch_timer_mem_use_virtual ? "virt" : "phys");
                return -EINVAL;
        }

        if (!request_mem_region(frame->cntbase, frame->size,
                                "arch_mem_timer"))
                return -EBUSY;

        base = ioremap(frame->cntbase, frame->size);
        if (!base) {
                pr_err("Can't map frame's registers\n");
                ret = -ENXIO;
                goto out_release;
        }

        ret = arch_timer_mem_register(base, irq);
        if (ret)
                goto out_unmap;

        arch_timers_present |= ARCH_TIMER_TYPE_MEM;

        return 0;

out_unmap:
        iounmap(base);
out_release:
        /* Do not leave the region claimed when the timer was not registered */
        release_mem_region(frame->cntbase, frame->size);
        return ret;
}

/*
 * Probe the memory-mapped timer from device-tree node @np.
 *
 * Builds a temporary description of the timer and its frames from the node
 * and its available children, picks the best frame, configures the rate and
 * registers that frame. The common init is run unless a CP15 timer node
 * still needs probing.
 *
 * Returns 0 on success, -ENOMEM if the temporary description cannot be
 * allocated, -EINVAL for a malformed description or when no frame is
 * suitable, or the error from frame registration / common init.
 */
static int __init arch_timer_mem_of_init(struct device_node *np)
{
        struct arch_timer_mem_frame *best;
        struct arch_timer_mem *timer_mem;
        struct resource res;
        int ret = -EINVAL;
        u32 rate;

        timer_mem = kzalloc(sizeof(*timer_mem), GFP_KERNEL);
        if (!timer_mem)
                return -ENOMEM;

        if (of_address_to_resource(np, 0, &res))
                goto out;
        timer_mem->cntctlbase = res.start;
        timer_mem->size = resource_size(&res);

        /* Each available child node describes one frame */
        for_each_available_child_of_node_scoped(np, frame_node) {
                struct arch_timer_mem_frame *slot;
                u32 nr;

                if (of_property_read_u32(frame_node, "frame-number", &nr)) {
                        pr_err(FW_BUG "Missing frame-number.\n");
                        goto out;
                }
                if (nr >= ARCH_TIMER_MEM_MAX_FRAMES) {
                        pr_err(FW_BUG "Wrong frame-number, only 0-%u are permitted.\n",
                               ARCH_TIMER_MEM_MAX_FRAMES - 1);
                        goto out;
                }

                slot = &timer_mem->frame[nr];
                if (slot->valid) {
                        pr_err(FW_BUG "Duplicated frame-number.\n");
                        goto out;
                }

                if (of_address_to_resource(frame_node, 0, &res))
                        goto out;

                slot->cntbase = res.start;
                slot->size = resource_size(&res);
                slot->virt_irq = irq_of_parse_and_map(frame_node,
                                                      ARCH_TIMER_VIRT_SPI);
                slot->phys_irq = irq_of_parse_and_map(frame_node,
                                                      ARCH_TIMER_PHYS_SPI);
                slot->valid = true;
        }

        best = arch_timer_mem_find_best_frame(timer_mem);
        if (!best) {
                pr_err("Unable to find a suitable frame in timer @ %pa\n",
                        &timer_mem->cntctlbase);
                ret = -EINVAL;
                goto out;
        }

        rate = arch_timer_mem_frame_get_cntfrq(best);
        arch_timer_of_configure_rate(rate, np);

        ret = arch_timer_mem_frame_register(best);
        if (!ret && !arch_timer_needs_of_probing())
                ret = arch_timer_common_init();
out:
        /* The description is only needed while probing */
        kfree(timer_mem);
        return ret;
}
TIMER_OF_DECLARE(armv7_arch_timer_mem, "arm,armv7-timer-mem",
                       arch_timer_mem_of_init);

#ifdef CONFIG_ACPI_GTDT
/*
 * Check that the CNTFRQ of every valid frame of @timer_mem matches
 * arch_timer_rate.
 *
 * Returns 0 if all valid frames agree, or -EINVAL (after reporting the
 * offending frame) at the first mismatch.
 */
static int __init
arch_timer_mem_verify_cntfrq(struct arch_timer_mem *timer_mem)
{
        int i;

        for (i = 0; i < ARCH_TIMER_MEM_MAX_FRAMES; i++) {
                struct arch_timer_mem_frame *frame = &timer_mem->frame[i];
                u32 rate;

                if (frame->valid) {
                        rate = arch_timer_mem_frame_get_cntfrq(frame);
                        if (rate != arch_timer_rate) {
                                pr_err(FW_BUG "CNTFRQ mismatch: frame @ %pa: (0x%08lx), CPU: (0x%08lx)\n",
                                        &frame->cntbase,
                                        (unsigned long)rate,
                                        (unsigned long)arch_timer_rate);

                                return -EINVAL;
                        }
                }
        }

        return 0;
}

/*
 * Set up the memory-mapped timer when probing via ACPI.
 *
 * Allocates room for @platform_timer_count timer descriptions, lets
 * acpi_arch_timer_mem_init() fill them in, checks the CNTFRQ of every
 * described timer and registers the first suitable frame found.
 *
 * Returns 0 on success or when there is nothing to register, -ENOMEM if the
 * allocation fails, or the error from the ACPI parsing, the CNTFRQ check or
 * the frame registration.
 */
static int __init arch_timer_mem_acpi_init(int platform_timer_count)
{
        struct arch_timer_mem *timers, *timer;
        struct arch_timer_mem_frame *frame, *best_frame = NULL;
        int timer_count, i, ret = 0;

        timers = kcalloc(platform_timer_count, sizeof(*timers),
                            GFP_KERNEL);
        if (!timers)
                return -ENOMEM;

        ret = acpi_arch_timer_mem_init(timers, &timer_count);
        if (ret || !timer_count)
                goto out;

        /*
         * While unlikely, it's theoretically possible that none of the frames
         * in a timer expose the combination of feature we want.
         */
        for (i = 0; i < timer_count; i++) {
                timer = &timers[i];

                /* Keep the first suitable frame; later timers do not replace it */
                frame = arch_timer_mem_find_best_frame(timer);
                if (!best_frame)
                        best_frame = frame;

                ret = arch_timer_mem_verify_cntfrq(timer);
                if (ret) {
                        pr_err("Disabling MMIO timers due to CNTFRQ mismatch\n");
                        goto out;
                }

                if (!best_frame) /* implies !frame */
                        /*
                         * Only complain about missing suitable frames if we
                         * haven't already found one in a previous iteration.
                         */
                        pr_err("Unable to find a suitable frame in timer @ %pa\n",
                                &timer->cntctlbase);
        }

        if (best_frame)
                ret = arch_timer_mem_frame_register(best_frame);
out:
        kfree(timers);
        return ret;
}

/*
 * Initialize per-processor generic timer and memory-mapped timer(if present)
 *
 * ACPI probe entry point for the GTDT @table. Returns -EINVAL if the CP15
 * timer was already initialized, if the frequency is not available or if
 * the selected PPI has no interrupt; otherwise the error from GTDT parsing
 * or timer registration, or the result of the common init. A failure to set
 * up the memory-mapped timer is only logged.
 */
static int __init arch_timer_acpi_init(struct acpi_table_header *table)
{
        int ret, platform_timer_count;

        if (arch_timers_present & ARCH_TIMER_TYPE_CP15) {
                pr_warn("already initialized, skipping\n");
                return -EINVAL;
        }

        arch_timers_present |= ARCH_TIMER_TYPE_CP15;

        ret = acpi_gtdt_init(table, &platform_timer_count);
        if (ret)
                return ret;

        arch_timer_ppi[ARCH_TIMER_PHYS_NONSECURE_PPI] =
                acpi_gtdt_map_ppi(ARCH_TIMER_PHYS_NONSECURE_PPI);

        arch_timer_ppi[ARCH_TIMER_VIRT_PPI] =
                acpi_gtdt_map_ppi(ARCH_TIMER_VIRT_PPI);

        arch_timer_ppi[ARCH_TIMER_HYP_PPI] =
                acpi_gtdt_map_ppi(ARCH_TIMER_HYP_PPI);

        arch_timer_populate_kvm_info();

        /*
         * When probing via ACPI, we have no mechanism to override the sysreg
         * CNTFRQ value. This *must* be correct.
         */
        arch_timer_rate = arch_timer_get_cntfrq();
        ret = validate_timer_rate();
        if (ret) {
                pr_err(FW_BUG "frequency not available.\n");
                return ret;
        }

        arch_timer_uses_ppi = arch_timer_select_ppi();
        if (!arch_timer_ppi[arch_timer_uses_ppi]) {
                pr_err("No interrupt available, giving up\n");
                return -EINVAL;
        }

        /* Always-on capability */
        arch_timer_c3stop = acpi_gtdt_c3stop(arch_timer_uses_ppi);

        /* Check for globally applicable workarounds */
        arch_timer_check_ool_workaround(ate_match_acpi_oem_info, table);

        ret = arch_timer_register();
        if (ret)
                return ret;

        /* A memory-mapped timer failure is reported but does not abort the probe */
        if (platform_timer_count &&
            arch_timer_mem_acpi_init(platform_timer_count))
                pr_err("Failed to initialize memory-mapped timer.\n");

        return arch_timer_common_init();
}
TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init);
#endif

/*
 * Issue the ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID call and unpack its result.
 *
 * The counter requested is the virtual one when the virtual PPI is in use,
 * the physical one otherwise. The time built from result registers a0/a1 is
 * stored in @ts; @cycle (built from a2/a3) and @cs_id are optional and only
 * written when non-NULL.
 *
 * Returns 0 on success, or -EOPNOTSUPP when CONFIG_HAVE_ARM_SMCCC_DISCOVERY
 * is disabled or the call reports a negative value in a0.
 */
int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts,
                                 enum clocksource_ids *cs_id)
{
        struct arm_smccc_res res;
        u32 ptp_counter;
        ktime_t now;

        if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY))
                return -EOPNOTSUPP;

        ptp_counter = (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) ?
                        KVM_PTP_VIRT_COUNTER : KVM_PTP_PHYS_COUNTER;

        arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID,
                             ptp_counter, &res);

        if ((int)(res.a0) < 0)
                return -EOPNOTSUPP;

        /* Each 64-bit value is split across two result registers */
        now = (u64)res.a0 << 32 | res.a1;
        *ts = ktime_to_timespec64(now);

        if (cycle)
                *cycle = (u64)res.a2 << 32 | res.a3;

        if (cs_id)
                *cs_id = CSID_ARM_ARCH_COUNTER;

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);












































































































































































































































































































































  265 


  265 

  265 


  264 














































  265 


  265 
  265 






























































  268 






  268 
  268 















































































































































































































































































































































































































































































































































































































































































































































































































































  268 


  267 
  268 

  265 


    3 







































































  265 





  265 
  265 


  265 

  264 

















  265 






























  265 



  265 
  268 















  265 


  265 









  265 








  265 











  265 

  265 










  265 







  267 






















  268 

  264 
  268 



  268 

  268 





  268 
  265 









  265 







































































































































































































































































































































































































































































































































































































   35 


















































































































































































































































































































































































































































    3 





    3 











    3 




































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NETLINK      Kernel-user communication protocol.
 *
 *                 Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                                 Patrick McHardy <kaber@trash.net>
 *
 * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
 *                               added netlink_proto_exit
 * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
 *                                  use nlk_sk, as sk->protinfo is on a diet 8)
 * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
 *                                  - inc module use count of module that owns
 *                                    the kernel socket in case userspace opens
 *                                    socket of same protocol
 *                                  - remove all module support, since netlink is
 *                                    mandatory if CONFIG_NET=y these days
 */

#include <linux/module.h>

#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/filter.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/net_namespace.h>
#include <linux/nospec.h>
#include <linux/btf_ids.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>
#define CREATE_TRACE_POINTS
#include <trace/events/netlink.h>

#include "af_netlink.h"
#include "genetlink.h"

/*
 * Multicast listener bitmap of one netlink protocol table.
 *
 * masks[] holds one unsigned long per group word; netlink_update_listeners()
 * rebuilds each word as the OR of the group words of all sockets bound to
 * the table's mc_list.
 */
struct listeners {
        /* RCU head — presumably used to defer freeing; confirm at the free site. */
        struct rcu_head                rcu;
        /* Flexible array of group words, sized by the allocator. */
        unsigned long                masks[];
};

/*
 * Bit numbers within netlink_sock->state.
 *
 * NETLINK_S_CONGESTED is set by netlink_overrun() and cleared by
 * netlink_rcv_wake() once the receive queue is seen empty.
 */
#define NETLINK_S_CONGESTED                0x0

/* Return non-zero when @sk has the KERNEL_SOCKET flag bit set. */
static inline int netlink_is_kernel(struct sock *sk)
{
        return nlk_test_bit(KERNEL_SOCKET, sk);
}

/* Array of per-protocol netlink tables, indexed by protocol number. */
struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);

/*
 * Wait queue on which netlink_table_grab() sleeps until nl_table_users
 * reaches zero; woken from netlink_table_ungrab() and netlink_unlock_table().
 */
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

/*
 * One lockdep class key per protocol for nlk->nl_cb_mutex; assigned in
 * __netlink_create().
 */
static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];

/*
 * Lockdep class names for nlk->nl_cb_mutex, indexed by protocol number and
 * paired with nlk_cb_mutex_keys[] in __netlink_create(). Protocol numbers
 * without a symbolic name are spelled as plain numbers; the final entry is
 * the extra slot for index MAX_LINKS.
 */
static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
        "nlk_cb_mutex-ROUTE",
        "nlk_cb_mutex-1",
        "nlk_cb_mutex-USERSOCK",
        "nlk_cb_mutex-FIREWALL",
        "nlk_cb_mutex-SOCK_DIAG",
        "nlk_cb_mutex-NFLOG",
        "nlk_cb_mutex-XFRM",
        "nlk_cb_mutex-SELINUX",
        "nlk_cb_mutex-ISCSI",
        "nlk_cb_mutex-AUDIT",
        "nlk_cb_mutex-FIB_LOOKUP",
        "nlk_cb_mutex-CONNECTOR",
        "nlk_cb_mutex-NETFILTER",
        "nlk_cb_mutex-IP6_FW",
        "nlk_cb_mutex-DNRTMSG",
        "nlk_cb_mutex-KOBJECT_UEVENT",
        "nlk_cb_mutex-GENERIC",
        "nlk_cb_mutex-17",
        "nlk_cb_mutex-SCSITRANSPORT",
        "nlk_cb_mutex-ECRYPTFS",
        "nlk_cb_mutex-RDMA",
        "nlk_cb_mutex-CRYPTO",
        "nlk_cb_mutex-SMC",
        "nlk_cb_mutex-23",
        "nlk_cb_mutex-24",
        "nlk_cb_mutex-25",
        "nlk_cb_mutex-26",
        "nlk_cb_mutex-27",
        "nlk_cb_mutex-28",
        "nlk_cb_mutex-29",
        "nlk_cb_mutex-30",
        "nlk_cb_mutex-31",
        "nlk_cb_mutex-MAX_LINKS"
};

/* Forward declaration; the definition is not in this part of the file. */
static int netlink_dump(struct sock *sk, bool lock_taken);

/* nl_table locking explained:
 * Lookup and traversal are protected with an RCU read-side lock. Insertion
 * and removal are protected with per bucket lock while using RCU list
 * modification primitives and may run in parallel to RCU protected lookups.
 * Destruction of the Netlink socket may only occur *after* nl_table_lock has
 * been acquired, either during or after the socket has been removed from
 * the list and after an RCU grace period.
 */
/*
 * nl_table_lock is write-held between netlink_table_grab() and
 * netlink_table_ungrab(), and briefly read-held by netlink_lock_table().
 */
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
/*
 * Number of outstanding netlink_lock_table() holders; netlink_table_grab()
 * sleeps until it is zero.
 */
static atomic_t nl_table_users = ATOMIC_INIT(0);

/*
 * Fetch an RCU-managed pointer on the update side, asserting to lockdep that
 * nl_table_lock is held.
 *
 * The expansion deliberately carries no trailing semicolon, so the macro is
 * a plain expression: it works inside larger expressions and does not leave
 * a stray empty statement behind at statement-level call sites.
 */
#define nl_deref_protected(X) \
        rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock))

/*
 * Notifier chain for netlink events; netlink_release() calls it with
 * NETLINK_URELEASE when a bound socket goes away.
 */
static BLOCKING_NOTIFIER_HEAD(netlink_chain);


/*
 * Declared ahead of its users (the rhashtable lookup/insert/remove helpers);
 * the initialised definition is not in this part of the file.
 */
static const struct rhashtable_params netlink_rhashtable_params;

/*
 * Exported out-of-line wrapper that fires the netlink_extack trace event
 * with @msg.
 */
void do_trace_netlink_extack(const char *msg)
{
        trace_netlink_extack(msg);
}
EXPORT_SYMBOL(do_trace_netlink_extack);

/*
 * Translate a 1-based multicast group number into its bit in a 32-bit
 * group mask.
 *
 * Returns 0 for group 0 and for any group above 32, neither of which has a
 * bit in the mask.
 */
static inline u32 netlink_group_mask(u32 group)
{
        if (group == 0 || group > 32)
                return 0;

        /*
         * Group 32 selects bit 31. The constant must be unsigned: shifting a
         * signed 1 into the sign bit is undefined behaviour in C.
         */
        return 1U << (group - 1);
}

/*
 * Allocate a new skb with alloc_skb() and copy @skb's data into it, along
 * with the portid, dst_group and creds fields of its netlink control block.
 *
 * Returns the copy, or NULL when the allocation fails.
 */
static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
                                           gfp_t gfp_mask)
{
        const unsigned int size = skb->len;
        struct sk_buff *copy = alloc_skb(size, gfp_mask);

        if (!copy)
                return NULL;

        /* Carry the netlink control-block identity over to the copy. */
        NETLINK_CB(copy).creds = NETLINK_CB(skb).creds;
        NETLINK_CB(copy).dst_group = NETLINK_CB(skb).dst_group;
        NETLINK_CB(copy).portid = NETLINK_CB(skb).portid;

        skb_put_data(copy, skb->data, size);

        return copy;
}

/* Per-net generic id used with net_generic() to find struct netlink_tap_net. */
static unsigned int netlink_tap_net_id;

/* Per-network-namespace tap state. */
struct netlink_tap_net {
        /* List of registered taps; walked under RCU by the delivery path. */
        struct list_head netlink_tap_all;
        /* Serialises additions to and removals from netlink_tap_all. */
        struct mutex netlink_tap_lock;
};

/*
 * Register tap @nt in the network namespace of its device.
 *
 * Only devices of type ARPHRD_NETLINK are accepted. On success the tap is
 * linked into the per-net tap list and a reference on nt->module is taken.
 *
 * Returns 0 on success or -EINVAL for a device of the wrong type.
 */
int netlink_add_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(dev_net(nt->dev), netlink_tap_net_id);

        if (unlikely(nt->dev->type != ARPHRD_NETLINK))
                return -EINVAL;

        /* Writers are serialised by the mutex; readers walk the list under RCU. */
        mutex_lock(&tap_net->netlink_tap_lock);
        list_add_rcu(&nt->list, &tap_net->netlink_tap_all);
        mutex_unlock(&tap_net->netlink_tap_lock);

        __module_get(nt->module);
        return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);

/*
 * Unlink tap @nt from the tap list of its device's network namespace.
 *
 * When the tap is found it is removed with list_del_rcu() and the reference
 * on nt->module is dropped; otherwise a warning is logged.
 *
 * Returns 0 when the tap was removed, -ENODEV when it was not registered.
 */
static int __netlink_remove_tap(struct netlink_tap *nt)
{
        struct netlink_tap_net *tap_net =
                net_generic(dev_net(nt->dev), netlink_tap_net_id);
        struct netlink_tap *cur;
        int err = -ENODEV;

        mutex_lock(&tap_net->netlink_tap_lock);

        list_for_each_entry(cur, &tap_net->netlink_tap_all, list) {
                if (cur != nt)
                        continue;

                list_del_rcu(&nt->list);
                err = 0;
                break;
        }

        if (err)
                pr_warn("__netlink_remove_tap: %p not found\n", nt);

        mutex_unlock(&tap_net->netlink_tap_lock);

        /* The module reference is released outside the mutex. */
        if (!err)
                module_put(nt->module);

        return err;
}

/*
 * Unregister tap @nt and then call synchronize_net() before returning,
 * whether or not the tap was found.
 *
 * Returns the result of __netlink_remove_tap(): 0 or -ENODEV.
 */
int netlink_remove_tap(struct netlink_tap *nt)
{
        const int err = __netlink_remove_tap(nt);

        synchronize_net();
        return err;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);

/*
 * Per-namespace init hook: set up the tap list and the mutex guarding it.
 * Always returns 0.
 */
static __net_init int netlink_tap_init_net(struct net *net)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(net, netlink_tap_net_id);

        mutex_init(&tap_net->netlink_tap_lock);
        INIT_LIST_HEAD(&tap_net->netlink_tap_all);

        return 0;
}

/*
 * Per-namespace operations for the tap state: .init sets up each
 * namespace's struct netlink_tap_net, whose storage is described by
 * .id and .size.
 */
static struct pernet_operations netlink_tap_net_ops = {
        .init = netlink_tap_init_net,
        .id   = &netlink_tap_net_id,
        .size = sizeof(struct netlink_tap_net),
};

/*
 * Decide whether @skb may be mirrored to netlink taps.
 *
 * The policy is deliberately conservative: only sockets whose protocol is
 * on the explicit allow-list below are passed to taps.
 */
static bool netlink_filter_tap(const struct sk_buff *skb)
{
        bool allowed;

        switch (skb->sk->sk_protocol) {
        case NETLINK_ROUTE:
        case NETLINK_USERSOCK:
        case NETLINK_SOCK_DIAG:
        case NETLINK_NFLOG:
        case NETLINK_XFRM:
        case NETLINK_FIB_LOOKUP:
        case NETLINK_NETFILTER:
        case NETLINK_GENERIC:
                allowed = true;
                break;
        default:
                allowed = false;
                break;
        }

        return allowed;
}

static int __netlink_deliver_tap_skb(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct sk_buff *nskb;
        struct sock *sk = skb->sk;
        int ret = -ENOMEM;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                return 0;

        dev_hold(dev);

        if (is_vmalloc_addr(skb->head))
                nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
        else
                nskb = skb_clone(skb, GFP_ATOMIC);
        if (nskb) {
                nskb->dev = dev;
                nskb->protocol = htons((u16) sk->sk_protocol);
                nskb->pkt_type = netlink_is_kernel(sk) ?
                                 PACKET_KERNEL : PACKET_USER;
                skb_reset_network_header(nskb);
                ret = dev_queue_xmit(nskb);
                if (unlikely(ret > 0))
                        ret = net_xmit_errno(ret);
        }

        dev_put(dev);
        return ret;
}

/*
 * Mirror @skb to every tap on @nn's list, provided netlink_filter_tap()
 * allows it. Delivery stops at the first tap that reports an error.
 * The list is walked with the RCU iterator.
 */
static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
{
        struct netlink_tap *tap;

        if (!netlink_filter_tap(skb))
                return;

        list_for_each_entry_rcu(tap, &nn->netlink_tap_all, list) {
                if (unlikely(__netlink_deliver_tap_skb(skb, tap->dev)))
                        break;
        }
}

/*
 * Mirror @skb to the taps registered in @net, if there are any.
 * The tap list is inspected and walked inside an RCU read-side section.
 */
static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
{
        struct netlink_tap_net *tap_net;

        tap_net = net_generic(net, netlink_tap_net_id);

        rcu_read_lock();
        /* An empty tap list is the expected common case. */
        if (unlikely(!list_empty(&tap_net->netlink_tap_all)))
                __netlink_deliver_tap(skb, tap_net);
        rcu_read_unlock();
}

/*
 * Mirror @skb to the taps of @dst's namespace unless both @dst and @src are
 * kernel sockets, in which case nothing is delivered.
 */
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
                                       struct sk_buff *skb)
{
        if (netlink_is_kernel(dst) && netlink_is_kernel(src))
                return;

        netlink_deliver_tap(sock_net(dst), skb);
}

/*
 * Account a dropped message on @sk.
 *
 * Unless the socket has the RECV_NO_ENOBUFS flag, the first overrun after
 * the CONGESTED bit was clear also sets sk_err to ENOBUFS and reports the
 * error. The drop counter is incremented in every case.
 */
static void netlink_overrun(struct sock *sk)
{
        if (!nlk_test_bit(RECV_NO_ENOBUFS, sk) &&
            !test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) {
                WRITE_ONCE(sk->sk_err, ENOBUFS);
                sk_error_report(sk);
        }

        atomic_inc(&sk->sk_drops);
}

/*
 * Clear the CONGESTED state bit once @sk's receive queue is observed empty,
 * and wake sleepers on nlk->wait whenever the socket is not congested.
 */
static void netlink_rcv_wake(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        /* Lockless emptiness check of the receive queue. */
        if (skb_queue_empty_lockless(&sk->sk_receive_queue))
                clear_bit(NETLINK_S_CONGESTED, &nlk->state);
        /* Re-test the bit: it may have been clear already or just cleared. */
        if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
                wake_up_interruptible(&nlk->wait);
}

/*
 * skb destructor installed by netlink_skb_set_owner_r().
 *
 * When the skb head is vmalloc memory it is released here with
 * vfree_atomic(): unconditionally for an uncloned skb, otherwise only when
 * this call drops the shared-info data reference count to zero. skb->head
 * is then cleared — presumably so the generic skb free path does not touch
 * the buffer again (TODO confirm against the skb release code).
 *
 * If the skb is still owned by a socket, sock_rfree() is called on it.
 */
static void netlink_skb_destructor(struct sk_buff *skb)
{
        if (is_vmalloc_addr(skb->head)) {
                /* Free only if unshared, or if we held the last data reference. */
                if (!skb->cloned ||
                    !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
                        vfree_atomic(skb->head);

                skb->head = NULL;
        }
        if (skb->sk != NULL)
                sock_rfree(skb);
}

/*
 * Make @sk the receive-side owner of @skb.
 *
 * Warns if the skb already has an owner, installs netlink_skb_destructor()
 * as its destructor, and charges skb->truesize to the socket's receive
 * memory counter and memory accounting.
 */
static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        WARN_ON(skb->sk != NULL);
        skb->sk = sk;
        skb->destructor = netlink_skb_destructor;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

/*
 * sk_destruct callback for netlink sockets (installed by __netlink_create()).
 *
 * Purges the receive queue, then either complains that a socket without
 * SOCK_DEAD is being freed, or sanity-checks that no receive memory, write
 * memory or group bitmap is still attached to the dead socket.
 */
static void netlink_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_receive_queue);

        if (sock_flag(sk, SOCK_DEAD)) {
                WARN_ON(atomic_read(&sk->sk_rmem_alloc));
                WARN_ON(refcount_read(&sk->sk_wmem_alloc));
                WARN_ON(nlk_sk(sk)->groups);
        } else {
                printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
        }
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
 * SMP. Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines.
 */

/*
 * Take nl_table_lock for writing (interrupts disabled) and wait until no
 * netlink_lock_table() holder remains.
 *
 * May sleep. Returns with nl_table_lock write-held; release it with
 * netlink_table_ungrab().
 */
void netlink_table_grab(void)
        __acquires(nl_table_lock)
{
        might_sleep();

        write_lock_irq(&nl_table_lock);

        if (atomic_read(&nl_table_users)) {
                DECLARE_WAITQUEUE(wait, current);

                /* Exclusive wait: see the comment above this function. */
                add_wait_queue_exclusive(&nl_table_wait, &wait);
                for (;;) {
                        /* Set the task state before re-checking the count. */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        if (atomic_read(&nl_table_users) == 0)
                                break;
                        /* Drop the lock while asleep, retake it before re-checking. */
                        write_unlock_irq(&nl_table_lock);
                        schedule();
                        write_lock_irq(&nl_table_lock);
                }

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&nl_table_wait, &wait);
        }
}

/*
 * Counterpart of netlink_table_grab(): drop the write lock on nl_table_lock
 * and wake a waiter on nl_table_wait.
 */
void netlink_table_ungrab(void)
        __releases(nl_table_lock)
{
        write_unlock_irq(&nl_table_lock);
        wake_up(&nl_table_wait);
}

/*
 * Register the caller as a user of the netlink table.
 *
 * The user count is bumped while nl_table_lock is read-held, which orders
 * the increment against a writer inside netlink_table_grab(). Pair with
 * netlink_unlock_table().
 */
static inline void netlink_lock_table(void)
{
        unsigned long irq_flags;

        read_lock_irqsave(&nl_table_lock, irq_flags);
        atomic_inc(&nl_table_users);
        read_unlock_irqrestore(&nl_table_lock, irq_flags);
}

/*
 * Drop the table-user reference taken by netlink_lock_table(). The last
 * user to leave wakes a waiter sleeping in netlink_table_grab().
 */
static inline void netlink_unlock_table(void)
{
        if (!atomic_dec_and_test(&nl_table_users))
                return;

        wake_up(&nl_table_wait);
}

/*
 * Lookup key for the per-protocol socket hash table: a socket is identified
 * by its network namespace together with its port id (see netlink_compare()).
 */
struct netlink_compare_arg
{
        possible_net_t pnet;
        u32 portid;
};

/*
 * Length of the meaningful part of struct netlink_compare_arg: everything up
 * to and including portid, excluding any trailing padding.
 */
/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
#define netlink_compare_arg_len \
        (offsetof(struct netlink_compare_arg, portid) + sizeof(u32))

/*
 * Compare callback for the socket hash table.
 *
 * @arg->key points at a struct netlink_compare_arg and @ptr at the candidate
 * struct netlink_sock. Returns 0 when both the port id and the network
 * namespace match, non-zero otherwise.
 */
static inline int netlink_compare(struct rhashtable_compare_arg *arg,
                                  const void *ptr)
{
        const struct netlink_compare_arg *key = arg->key;
        const struct netlink_sock *nlk = ptr;

        if (nlk->portid != key->portid)
                return 1;

        return !net_eq(sock_net(&nlk->sk), read_pnet(&key->pnet));
}

/*
 * Fill in lookup key @arg for the pair (@net, @portid).
 *
 * The whole structure is zeroed first, so every byte of the key has a
 * defined value — presumably because the key is hashed/compared as raw
 * bytes (TODO confirm against the rhashtable parameters).
 */
static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
                                     struct net *net, u32 portid)
{
        memset(arg, 0, sizeof(*arg));
        write_pnet(&arg->pnet, net);
        arg->portid = portid;
}

/*
 * Look up the socket with port id @portid in namespace @net within @table's
 * hash. Returns the socket or NULL. No reference is taken here;
 * netlink_lookup() calls this under rcu_read_lock() and takes its own.
 */
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
                                     struct net *net)
{
        struct netlink_compare_arg key;

        netlink_compare_arg_init(&key, net, portid);

        return rhashtable_lookup_fast(&table->hash, &key,
                                      netlink_rhashtable_params);
}

static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
        struct netlink_compare_arg arg;

        netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
        return rhashtable_lookup_insert_key(&table->hash, &arg,
                                            &nlk_sk(sk)->node,
                                            netlink_rhashtable_params);
}

/*
 * Find the socket bound to @portid in @net for @protocol.
 *
 * Returns the socket with a reference held (taken inside the RCU read-side
 * section), or NULL when no such socket exists.
 */
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
        struct sock *found;

        rcu_read_lock();
        found = __netlink_lookup(&nl_table[protocol], portid, net);
        if (found)
                sock_hold(found);
        rcu_read_unlock();

        return found;
}

/*
 * Declared early because __netlink_create() assigns it to sock->ops; the
 * initialised definition is not in this part of the file.
 */
static const struct proto_ops netlink_ops;

/*
 * Recompute the listener bitmap of @sk's protocol table.
 *
 * Each word of the bitmap becomes the OR of the corresponding group word of
 * every socket bound to the table's multicast list. Does nothing when the
 * table has no listeners bitmap.
 *
 * Callers hold the netlink table "grabbed", which guarantees the update is
 * visible before bind or setsockopt return.
 */
static void
netlink_update_listeners(struct sock *sk)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *lst;
        struct sock *member;
        unsigned int word;

        lst = nl_deref_protected(tbl->listeners);
        if (!lst)
                return;

        for (word = 0; word < NLGRPLONGS(tbl->groups); word++) {
                unsigned long combined = 0;

                sk_for_each_bound(member, &tbl->mc_list) {
                        struct netlink_sock *nlk = nlk_sk(member);

                        /* A socket's bitmap may be shorter than the table's. */
                        if (word < NLGRPLONGS(nlk->ngroups))
                                combined |= nlk->groups[word];
                }
                lst->masks[word] = combined;
        }
}

/*
 * Bind @sk to @portid and publish it in its protocol's hash table.
 *
 * Runs under the socket lock. If the socket is already bound, nothing is
 * changed and the result only reports whether the existing port id equals
 * @portid. Otherwise the port id is stored, a reference is taken for the
 * hash table, the socket is inserted, and finally nlk->bound is set.
 *
 * Return: 0 on success (or when already bound to @portid), -EBUSY when
 * already bound to a different port id, -EADDRINUSE when the port id is
 * taken, -EOVERFLOW when the hash table itself reports -EBUSY, or another
 * error from the hash table.
 */
static int netlink_insert(struct sock *sk, u32 portid)
{
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        int err;

        lock_sock(sk);

        /* Pre-compute the answer for the already-bound case. */
        err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
        if (nlk_sk(sk)->bound)
                goto err;

        /* portid can be read locklessly from netlink_getname(). */
        WRITE_ONCE(nlk_sk(sk)->portid, portid);

        /* Reference owned by the hash table; dropped again on failure below. */
        sock_hold(sk);

        err = __netlink_insert(table, sk);
        if (err) {
                /* In case the hashtable backend returns with -EBUSY
                 * from here, it must not escape to the caller.
                 */
                if (unlikely(err == -EBUSY))
                        err = -EOVERFLOW;
                if (err == -EEXIST)
                        err = -EADDRINUSE;
                sock_put(sk);
                goto err;
        }

        /* We need to ensure that the socket is hashed and visible. */
        smp_wmb();
        /* Paired with lockless reads from netlink_bind(),
         * netlink_connect() and netlink_sendmsg().
         */
        WRITE_ONCE(nlk_sk(sk)->bound, portid);

err:
        release_sock(sk);
        return err;
}

/*
 * Unpublish @sk: remove it from its protocol's hash table and, with the
 * netlink table grabbed, from the multicast bind list.
 */
static void netlink_remove(struct sock *sk)
{
        struct netlink_table *table;

        table = &nl_table[sk->sk_protocol];
        /*
         * A zero return means the socket was hashed; drop the reference that
         * netlink_insert() took for the hash table. It must not be the last
         * reference, hence the warning.
         */
        if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
                                    netlink_rhashtable_params)) {
                WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
                __sock_put(sk);
        }

        netlink_table_grab();
        /* Subscribed sockets sit on mc_list; unlink and refresh the bitmap. */
        if (nlk_sk(sk)->subscriptions) {
                __sk_del_bind_node(sk);
                netlink_update_listeners(sk);
        }
        /* Balanced by the atomic_dec_return() in netlink_release(). */
        if (sk->sk_protocol == NETLINK_GENERIC)
                atomic_inc(&genl_sk_destructing_cnt);
        netlink_table_ungrab();
}

/*
 * Protocol descriptor passed to sk_alloc() in __netlink_create(); obj_size
 * is the size of struct netlink_sock.
 */
static struct proto netlink_proto = {
        .name          = "NETLINK",
        .owner          = THIS_MODULE,
        .obj_size = sizeof(struct netlink_sock),
};

static int __netlink_create(struct net *net, struct socket *sock,
                            int protocol, int kern)
{
        struct sock *sk;
        struct netlink_sock *nlk;

        sock->ops = &netlink_ops;

        sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
        if (!sk)
                return -ENOMEM;

        sock_init_data(sock, sk);

        nlk = nlk_sk(sk);
        mutex_init(&nlk->nl_cb_mutex);
        lockdep_set_class_and_name(&nlk->nl_cb_mutex,
                                           nlk_cb_mutex_keys + protocol,
                                           nlk_cb_mutex_key_strings[protocol]);
        init_waitqueue_head(&nlk->wait);

        sk->sk_destruct = netlink_sock_destruct;
        sk->sk_protocol = protocol;
        return 0;
}

/*
 * Create a netlink socket for @protocol in namespace @net.
 *
 * Only SOCK_RAW and SOCK_DGRAM sockets and protocols in [0, MAX_LINKS) are
 * accepted. If the protocol is not registered (and modules are enabled), a
 * module load is requested first. A reference on the protocol's module is
 * held for the lifetime of the socket, and the protocol's bind, unbind and
 * release callbacks are copied into the new socket.
 *
 * Return: 0 on success, -ESOCKTNOSUPPORT for an unsupported socket type,
 * -EPROTONOSUPPORT for an out-of-range or unregistered protocol or when the
 * module reference cannot be taken, or the error from __netlink_create().
 */
static int netlink_create(struct net *net, struct socket *sock, int protocol,
                          int kern)
{
        struct module *module = NULL;
        struct netlink_sock *nlk;
        int (*bind)(struct net *net, int group);
        void (*unbind)(struct net *net, int group);
        void (*release)(struct sock *sock, unsigned long *groups);
        int err = 0;

        sock->state = SS_UNCONNECTED;

        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
                return -ESOCKTNOSUPPORT;

        if (protocol < 0 || protocol >= MAX_LINKS)
                return -EPROTONOSUPPORT;
        /* Clamp the index under speculation before it is used on nl_table. */
        protocol = array_index_nospec(protocol, MAX_LINKS);

        netlink_lock_table();
#ifdef CONFIG_MODULES
        if (!nl_table[protocol].registered) {
                /* The table lock is dropped around the module request. */
                netlink_unlock_table();
                request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
                netlink_lock_table();
        }
#endif
        if (nl_table[protocol].registered &&
            try_module_get(nl_table[protocol].module))
                module = nl_table[protocol].module;
        else
                err = -EPROTONOSUPPORT;
        /* Snapshot the callbacks while the table is still locked. */
        bind = nl_table[protocol].bind;
        unbind = nl_table[protocol].unbind;
        release = nl_table[protocol].release;
        netlink_unlock_table();

        if (err < 0)
                goto out;

        err = __netlink_create(net, sock, protocol, kern);
        if (err < 0)
                goto out_module;

        sock_prot_inuse_add(net, &netlink_proto, 1);

        nlk = nlk_sk(sock->sk);
        nlk->module = module;
        nlk->netlink_bind = bind;
        nlk->netlink_unbind = unbind;
        nlk->netlink_release = release;
out:
        return err;

out_module:
        /* Creation failed after the module reference was taken: give it back. */
        module_put(module);
        goto out;
}

/*
 * RCU callback for a netlink socket, reached through the rcu head embedded
 * in struct netlink_sock.
 *
 * Frees the socket's group bitmap, then drops one socket reference and
 * frees the socket itself if that was the last one.
 */
static void deferred_put_nlk_sk(struct rcu_head *head)
{
        struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
        struct sock *sk = &nlk->sk;

        kfree(nlk->groups);
        nlk->groups = NULL;

        if (refcount_dec_and_test(&sk->sk_refcnt))
                sk_free(sk);
}

/*
 * Tear down the netlink socket attached to @sock.
 *
 * Does nothing when @sock has no struct sock. Otherwise the socket is
 * removed and orphaned, the per-socket release and unbind callbacks are
 * invoked, NETLINK_URELEASE is raised on netlink_chain for sockets that were
 * bound to a non-zero portid, a dump still in progress is terminated, and
 * the module reference held by the socket is dropped. For kernel sockets the
 * nl_table[] registration count is decremented and the table entry is reset
 * once it reaches zero. The final reference drop is deferred via call_rcu().
 *
 * Always returns 0.
 */
static int netlink_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk;

        if (!sk)
                return 0;

        netlink_remove(sk);
        sock_orphan(sk);
        nlk = nlk_sk(sk);

        /*
         * OK. Socket is unlinked, any packets that arrive now
         * will be purged.
         */
        if (nlk->netlink_release)
                nlk->netlink_release(sk, nlk->groups);

        /* must not acquire netlink_table_lock in any way again before unbind
         * and notifying genetlink is done as otherwise it might deadlock
         */
        if (nlk->netlink_unbind) {
                int i;

                /* Bit i of the bitmap corresponds to group number i + 1. */
                for (i = 0; i < nlk->ngroups; i++)
                        if (test_bit(i, nlk->groups))
                                nlk->netlink_unbind(sock_net(sk), i + 1);
        }
        /* Wake waiters once the last generic-netlink socket being destroyed is done. */
        if (sk->sk_protocol == NETLINK_GENERIC &&
            atomic_dec_return(&genl_sk_destructing_cnt) == 0)
                wake_up(&genl_sk_destructing_waitq);

        sock->sk = NULL;
        wake_up_interruptible_all(&nlk->wait);

        skb_queue_purge(&sk->sk_write_queue);

        /* Tell netlink_chain listeners that this bound portid went away. */
        if (nlk->portid && nlk->bound) {
                struct netlink_notify n = {
                                                .net = sock_net(sk),
                                                .protocol = sk->sk_protocol,
                                                .portid = nlk->portid,
                                          };
                blocking_notifier_call_chain(&netlink_chain,
                                NETLINK_URELEASE, &n);
        }

        /* Terminate any outstanding dump */
        if (nlk->cb_running) {
                if (nlk->cb.done)
                        nlk->cb.done(&nlk->cb);
                module_put(nlk->cb.module);
                kfree_skb(nlk->cb.skb);
                WRITE_ONCE(nlk->cb_running, false);
        }

        module_put(nlk->module);

        if (netlink_is_kernel(sk)) {
                netlink_table_grab();
                BUG_ON(nl_table[sk->sk_protocol].registered == 0);
                /* Last kernel socket for this protocol: reset its table entry. */
                if (--nl_table[sk->sk_protocol].registered == 0) {
                        struct listeners *old;

                        old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
                        RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
                        kfree_rcu(old, rcu);
                        nl_table[sk->sk_protocol].module = NULL;
                        nl_table[sk->sk_protocol].bind = NULL;
                        nl_table[sk->sk_protocol].unbind = NULL;
                        nl_table[sk->sk_protocol].flags = 0;
                        nl_table[sk->sk_protocol].registered = 0;
                }
                netlink_table_ungrab();
        }

        sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);

        /* Group bitmap and the socket reference are released after a grace period. */
        call_rcu(&nlk->rcu, deferred_put_nlk_sk);
        return 0;
}

/*
 * Choose an unused port ID for @sock and insert the socket under it.
 *
 * The first candidate is task_tgid_vnr(current). When a candidate is
 * already taken the search switches to negative values: it starts at a
 * random point in [S32_MIN, -4097], steps down by one per attempt, and is
 * reset to -4097 whenever the rover is found to be >= -4096.
 *
 * Returns 0 on success, or the error from netlink_insert(). -EADDRINUSE
 * from netlink_insert() restarts the search and -EBUSY is reported as
 * success.
 */
static int netlink_autobind(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_table *table = &nl_table[sk->sk_protocol];
        s32 candidate = task_tgid_vnr(current);
        s32 rover = -4096;
        bool in_use;
        int err;

        for (;;) {
                cond_resched();

                rcu_read_lock();
                in_use = !!__netlink_lookup(table, candidate, net);
                rcu_read_unlock();

                if (in_use) {
                        /* Bind collision, search negative portid values. */
                        if (rover == -4096) {
                                /* rover will be in range [S32_MIN, -4097] */
                                rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN);
                        } else if (rover >= -4096) {
                                rover = -4097;
                        }
                        candidate = rover--;
                        continue;
                }

                err = netlink_insert(sk, candidate);
                if (err != -EADDRINUSE)
                        break;
        }

        /* If 2 threads race to autobind, that is fine.  */
        return err == -EBUSY ? 0 : err;
}

/**
 * __netlink_ns_capable - General netlink message capability test
 * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket we received the message
 * from had when the netlink socket was created and the sender of the
 * message has the capability @cap in the user namespace @user_ns.
 */
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
                        struct user_namespace *user_ns, int cap)
{
        return ((nsp->flags & NETLINK_SKB_DST) ||
                file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);

/**
 * netlink_ns_capable - General netlink message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test whether the opener of the socket the message was received from had
 * the capability @cap in the user namespace @user_ns when the netlink
 * socket was created, and whether the sender of the message has it as well.
 * The check is performed on the NETLINK_CB() control block of @skb.
 */
bool netlink_ns_capable(const struct sk_buff *skb,
                        struct user_namespace *user_ns, int cap)
{
        const struct netlink_skb_parms *parms = &NETLINK_CB(skb);

        return __netlink_ns_capable(parms, user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);

/**
 * netlink_capable - Netlink global message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test whether the opener of the socket the message was received from had
 * the capability @cap in all user namespaces when the netlink socket was
 * created, and whether the sender of the message has it as well. The check
 * is made against init_user_ns.
 */
bool netlink_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return netlink_ns_capable(skb, root_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);

/**
 * netlink_net_capable - Netlink network namespace message capability test
 * @skb: socket buffer holding a netlink command from userspace
 * @cap: The capability to use
 *
 * Test whether the opener of the socket the message was received from had
 * the capability @cap over the network namespace of that socket when the
 * netlink socket was created, and whether the sender of the message has it
 * as well. The user namespace used is the one owning sock_net(@skb->sk).
 */
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
        struct user_namespace *owner_ns = sock_net(skb->sk)->user_ns;

        return netlink_ns_capable(skb, owner_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);

static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
        return (nl_table[sock->sk->sk_protocol].flags & flag) ||
                ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}

static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (nlk->subscriptions && !subscriptions)
                __sk_del_bind_node(sk);
        else if (!nlk->subscriptions && subscriptions)
                sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
        nlk->subscriptions = subscriptions;
}

/*
 * Grow the multicast group bitmap of @sk to the group count currently
 * recorded for its protocol in nl_table[]. The whole operation runs with
 * the netlink table grabbed.
 *
 * Returns 0 on success or when the bitmap is already large enough,
 * -ENOENT when the protocol is not registered, -ENOMEM when the
 * reallocation fails (the old bitmap is left in place).
 */
static int netlink_realloc_groups(struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned long *bitmap;
        unsigned int wanted;
        int ret = 0;

        netlink_table_grab();

        wanted = nl_table[sk->sk_protocol].groups;
        if (!nl_table[sk->sk_protocol].registered) {
                ret = -ENOENT;
        } else if (nlk->ngroups < wanted) {
                bitmap = krealloc(nlk->groups, NLGRPSZ(wanted), GFP_ATOMIC);
                if (bitmap) {
                        /* Zero only the newly added tail of the bitmap. */
                        memset((char *)bitmap + NLGRPSZ(nlk->ngroups), 0,
                               NLGRPSZ(wanted) - NLGRPSZ(nlk->ngroups));

                        nlk->groups = bitmap;
                        nlk->ngroups = wanted;
                } else {
                        ret = -ENOMEM;
                }
        }

        netlink_table_ungrab();
        return ret;
}

/*
 * Roll back per-group bind callbacks: for every bit below @group that is
 * set in @groups, call the socket's netlink_unbind() hook with the
 * corresponding 1-based group number. No-op when the socket has no unbind
 * hook.
 */
static void netlink_undo_bind(int group, long unsigned int groups,
                              struct sock *sk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int bit;

        if (!nlk->netlink_unbind)
                return;

        for (bit = 0; bit < group; bit++) {
                if (!test_bit(bit, &groups))
                        continue;
                nlk->netlink_unbind(sock_net(sk), bit + 1);
        }
}

/*
 * Bind @sock to the port ID and multicast groups given in @addr.
 *
 * @addr must be a struct sockaddr_nl of family AF_NETLINK and @addr_len
 * must cover it, otherwise -EINVAL is returned. Requesting groups requires
 * netlink_allowed(NL_CFG_F_NONROOT_RECV) (-EPERM otherwise) and may grow
 * the socket's group bitmap. A socket that is already bound can only be
 * re-bound with the same nl_pid (-EINVAL otherwise). An unbound socket is
 * inserted under nl_pid, or auto-bound when nl_pid is zero.
 *
 * If the socket has a netlink_bind() hook it is called for each requested
 * group; groups already processed are unbound again if a later step fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
                        int addr_len)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct netlink_sock *nlk = nlk_sk(sk);
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
        int err = 0;
        unsigned long groups;
        bool bound;

        if (addr_len < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if (nladdr->nl_family != AF_NETLINK)
                return -EINVAL;
        groups = nladdr->nl_groups;

        /* Only superuser is allowed to listen multicasts */
        if (groups) {
                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
        }

        /* Drop requested bits beyond the number of groups the socket knows. */
        if (nlk->ngroups < BITS_PER_LONG)
                groups &= (1UL << nlk->ngroups) - 1;

        /* Paired with WRITE_ONCE() in netlink_insert() */
        bound = READ_ONCE(nlk->bound);
        if (bound) {
                /* Ensure nlk->portid is up-to-date. */
                smp_rmb();

                if (nladdr->nl_pid != nlk->portid)
                        return -EINVAL;
        }

        if (nlk->netlink_bind && groups) {
                int group;

                /* nl_groups is a u32, so cap the maximum groups we can bind */
                for (group = 0; group < BITS_PER_TYPE(u32); group++) {
                        if (!test_bit(group, &groups))
                                continue;
                        err = nlk->netlink_bind(net, group + 1);
                        if (!err)
                                continue;
                        /* Undo only the groups bound before the failing one. */
                        netlink_undo_bind(group, groups, sk);
                        return err;
                }
        }

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         */
        netlink_lock_table();
        if (!bound) {
                err = nladdr->nl_pid ?
                        netlink_insert(sk, nladdr->nl_pid) :
                        netlink_autobind(sock);
                if (err) {
                        /* All requested groups were bound above; undo every one. */
                        netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
                        goto unlock;
                }
        }

        /* Nothing requested and nothing subscribed in the low 32 groups: done. */
        if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
                goto unlock;
        netlink_unlock_table();

        netlink_table_grab();
        /* Replace the low 32 group bits and adjust the subscription count. */
        netlink_update_subscriptions(sk, nlk->subscriptions +
                                         hweight32(groups) -
                                         hweight32(nlk->groups[0]));
        nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
        netlink_update_listeners(sk);
        netlink_table_ungrab();

        return 0;

unlock:
        netlink_unlock_table();
        return err;
}

/*
 * Set or clear the default destination of a netlink socket.
 *
 * An AF_UNSPEC address disconnects the socket: the state becomes
 * NETLINK_UNCONNECTED and the destination portid/group are zeroed. An
 * AF_NETLINK address of at least sizeof(struct sockaddr_nl) connects it;
 * a non-zero nl_pid or nl_groups requires
 * netlink_allowed(NL_CFG_F_NONROOT_SEND). A socket that is not yet bound is
 * auto-bound first.
 *
 * Returns 0 on success, -EINVAL for a short or wrong-family address,
 * -EPERM when the destination is not permitted, or the error from
 * netlink_autobind().
 */
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
                           int alen, int flags)
{
        int err = 0;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;

        /* Need at least the family field before looking at it. */
        if (alen < sizeof(addr->sa_family))
                return -EINVAL;

        if (addr->sa_family == AF_UNSPEC) {
                /* paired with READ_ONCE() in netlink_getsockbyportid() */
                WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED);
                /* dst_portid and dst_group can be read locklessly */
                WRITE_ONCE(nlk->dst_portid, 0);
                WRITE_ONCE(nlk->dst_group, 0);
                return 0;
        }
        if (addr->sa_family != AF_NETLINK)
                return -EINVAL;

        if (alen < sizeof(struct sockaddr_nl))
                return -EINVAL;

        if ((nladdr->nl_groups || nladdr->nl_pid) &&
            !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                return -EPERM;

        /* No need for barriers here as we return to user-space without
         * using any of the bound attributes.
         * Paired with WRITE_ONCE() in netlink_insert().
         */
        if (!READ_ONCE(nlk->bound))
                err = netlink_autobind(sock);

        if (err == 0) {
                /* paired with READ_ONCE() in netlink_getsockbyportid() */
                WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED);
                /* dst_portid and dst_group can be read locklessly */
                WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid);
                /* dst_group holds a 1-based group number (lowest set bit), 0 for none. */
                WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups));
        }

        return err;
}

/*
 * Fill @addr with the socket's netlink address.
 *
 * With @peer set the connected destination (dst_portid / dst_group) is
 * reported; otherwise the socket's own portid and the first word of its
 * group bitmap (0 when no bitmap is allocated), read under the table lock.
 *
 * Returns the size of struct sockaddr_nl.
 */
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
                           int peer)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);

        nladdr->nl_family = AF_NETLINK;
        nladdr->nl_pad = 0;

        if (peer) {
                /* Paired with WRITE_ONCE() in netlink_connect() */
                nladdr->nl_pid = READ_ONCE(nlk->dst_portid);
                nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group));
                return sizeof(*nladdr);
        }

        /* Paired with WRITE_ONCE() in netlink_insert() */
        nladdr->nl_pid = READ_ONCE(nlk->portid);

        netlink_lock_table();
        if (nlk->groups)
                nladdr->nl_groups = nlk->groups[0];
        else
                nladdr->nl_groups = 0;
        netlink_unlock_table();

        return sizeof(*nladdr);
}

/*
 * Netlink sockets implement no ioctl commands of their own: every request
 * is answered with -ENOIOCTLCMD, whatever @cmd and @arg are.
 * NOTE(review): the previous comment implied the caller then tries the NIC
 * drivers; that fallback is not visible here, so confirm in the caller.
 */
static int netlink_ioctl(struct socket *sock, unsigned int cmd,
                         unsigned long arg)
{
        return -ENOIOCTLCMD;
}

/*
 * Find the socket bound to @portid in the protocol and network namespace
 * of @ssk.
 *
 * Returns the socket with the reference obtained from netlink_lookup(), or
 * ERR_PTR(-ECONNREFUSED) when no such socket exists or when the target is
 * connected to a peer other than @ssk (in which case the lookup reference
 * is dropped again).
 */
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
        struct sock *dst;

        dst = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
        if (!dst)
                return ERR_PTR(-ECONNREFUSED);

        /* dst_portid and sk_state can be changed in netlink_connect() */
        if (READ_ONCE(dst->sk_state) != NETLINK_CONNECTED)
                return dst;

        /* A connected socket only accepts messages from its own peer. */
        if (READ_ONCE(nlk_sk(dst)->dst_portid) == nlk_sk(ssk)->portid)
                return dst;

        sock_put(dst);
        return ERR_PTR(-ECONNREFUSED);
}

/*
 * Resolve file descriptor @fd to its netlink socket.
 *
 * Returns the struct sock with an extra reference taken via sock_hold(),
 * or an ERR_PTR(): -EBADF when @fd does not resolve to a file, -ENOTSOCK
 * when the file's inode is not a socket, -EINVAL when the socket family is
 * not AF_NETLINK.
 */
struct sock *netlink_getsockbyfd(int fd)
{
        /* NOTE(review): CLASS(fd, ...) presumably releases the fd reference
         * automatically when f goes out of scope - confirm.
         */
        CLASS(fd, f)(fd);
        struct inode *inode;
        struct sock *sock;

        if (fd_empty(f))
                return ERR_PTR(-EBADF);

        inode = file_inode(fd_file(f));
        if (!S_ISSOCK(inode->i_mode))
                return ERR_PTR(-ENOTSOCK);

        sock = SOCKET_I(inode)->sk;
        if (sock->sk_family != AF_NETLINK)
                return ERR_PTR(-EINVAL);

        /* Pin the socket for the caller before returning it. */
        sock_hold(sock);
        return sock;
}

/*
 * Allocate an skb able to hold @size bytes of netlink payload.
 *
 * Requests whose aligned head fits in one page, and all broadcast
 * requests, use plain alloc_skb(). Larger unicast requests get their head
 * from kvmalloc(); when that memory turns out to be vmalloc'ed,
 * netlink_skb_destructor is installed as the skb destructor.
 *
 * Returns the new skb, or NULL on allocation failure.
 */
struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
{
        size_t head_size = SKB_HEAD_ALIGN(size);
        struct sk_buff *skb;
        void *head;

        if (head_size <= PAGE_SIZE || broadcast)
                return alloc_skb(size, GFP_KERNEL);

        head = kvmalloc(head_size, GFP_KERNEL);
        if (!head)
                return NULL;

        skb = __build_skb(head, head_size);
        if (!skb) {
                kvfree(head);
                return NULL;
        }

        if (is_vmalloc_addr(head))
                skb->destructor = netlink_skb_destructor;

        return skb;
}

/*
 * Attach a skb to a netlink socket.
 * The caller must hold a reference to the destination socket. On error, the
 * reference is dropped. The skb is not sent to the destination here; only
 * the error checks are performed and memory in the receive queue is
 * reserved.
 *
 * @timeo points to the remaining send timeout; it is updated when the
 * function sleeps waiting for receive-buffer space. @ssk is the sending
 * socket and may be NULL.
 *
 * Return values:
 * < 0: error. skb freed, reference to sock dropped.
 * 0: continue
 * 1: repeat lookup - reference dropped while waiting for socket memory.
 */
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
                      long *timeo, struct sock *ssk)
{
        struct netlink_sock *nlk;

        nlk = nlk_sk(sk);

        /* Receiver is over its buffer limit or flagged congested. */
        if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
             test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
                DECLARE_WAITQUEUE(wait, current);
                /* Non-blocking (or timed-out) sender: give up right away. */
                if (!*timeo) {
                        /* Overrun is flagged only for a missing or kernel sender. */
                        if (!ssk || netlink_is_kernel(ssk))
                                netlink_overrun(sk);
                        sock_put(sk);
                        kfree_skb(skb);
                        return -EAGAIN;
                }

                __set_current_state(TASK_INTERRUPTIBLE);
                add_wait_queue(&nlk->wait, &wait);

                /* Re-check after queueing on the wait queue, and do not
                 * sleep on a socket that is already dead.
                 */
                if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
                     test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
                    !sock_flag(sk, SOCK_DEAD))
                        *timeo = schedule_timeout(*timeo);

                __set_current_state(TASK_RUNNING);
                remove_wait_queue(&nlk->wait, &wait);
                sock_put(sk);

                if (signal_pending(current)) {
                        kfree_skb(skb);
                        return sock_intr_errno(*timeo);
                }
                /* Reference is gone; the caller must look the socket up again. */
                return 1;
        }
        netlink_skb_set_owner_r(skb, sk);
        return 0;
}

/*
 * Queue @skb on @sk's receive queue and signal the receiver.
 *
 * The skb is first shown to the netlink taps. The length is sampled before
 * queueing, since the skb is handed to the receive queue afterwards.
 *
 * Returns the length of the queued skb.
 */
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        const int queued_len = skb->len;

        netlink_deliver_tap(sock_net(sk), skb);

        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk);

        return queued_len;
}

/*
 * Deliver @skb to @sk via __netlink_sendskb() and then drop one reference
 * on @sk. Returns the length of the delivered skb.
 */
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
        int sent;

        sent = __netlink_sendskb(sk, skb);
        sock_put(sk);

        return sent;
}

/*
 * Abandon a pending delivery: free @skb, then drop one reference on the
 * destination socket @sk.
 */
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);         /* the message is dropped, not consumed */
        sock_put(sk);
}

/*
 * Try to shed unused tailroom from @skb before it is delivered.
 *
 * The skb is returned untouched when its head is vmalloc'ed or when the
 * unused tail (end - tail) is smaller than half of skb->truesize. A shared
 * skb is first replaced by a clone allocated with @allocation (the
 * caller's reference on the original is consumed); if cloning fails the
 * original is returned as is.
 *
 * Returns the skb the caller must use from now on.
 */
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
        int delta;

        skb_assert_len(skb);
        WARN_ON(skb->sk != NULL);
        /* Bytes of tailroom that are allocated but unused. */
        delta = skb->end - skb->tail;
        if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
                return skb;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, allocation);
                if (!nskb)
                        return skb;
                consume_skb(skb);
                skb = nskb;
        }

        /* Best effort: the return value is ignored and the allocation is
         * made without direct reclaim, warnings or retries.
         */
        pskb_expand_head(skb, 0, -delta,
                         (allocation & ~__GFP_DIRECT_RECLAIM) |
                         __GFP_NOWARN | __GFP_NORETRY);
        return skb;
}

/*
 * Deliver @skb from @ssk to the kernel-side socket @sk.
 *
 * When @sk has a netlink_rcv() handler the skb is charged to @sk, shown to
 * the taps, passed to the handler and then consumed. Without a handler the
 * skb is freed. The reference on @sk is dropped in both cases.
 *
 * Returns the skb length on delivery, -ECONNREFUSED when @sk has no
 * handler.
 */
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
                                  struct sock *ssk)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int ret;

        if (!nlk->netlink_rcv) {
                kfree_skb(skb);
                ret = -ECONNREFUSED;
        } else {
                /* Sample the length before the handler gets to touch the skb. */
                ret = skb->len;
                netlink_skb_set_owner_r(skb, sk);
                NETLINK_CB(skb).sk = ssk;
                netlink_deliver_tap_kernel(sk, ssk, skb);
                nlk->netlink_rcv(skb);
                consume_skb(skb);
        }

        sock_put(sk);
        return ret;
}

/*
 * Send @skb from @ssk to the socket bound to @portid (looked up in @ssk's
 * protocol and network namespace).
 *
 * The skb is always taken over by this function: it is delivered, freed on
 * error, or freed when the receiver's socket filter rejects it. Kernel
 * receivers are handled by netlink_unicast_kernel(). For user receivers the
 * call may wait for receive-buffer space unless @nonblock is set; the
 * destination is looked up again after every wait.
 *
 * Returns the value of netlink_sendskb() / netlink_unicast_kernel() on
 * delivery, the skb length when the receiver's filter dropped the message,
 * or a negative errno (e.g. the lookup error, or the error from
 * netlink_attachskb()).
 */
int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
                    u32 portid, int nonblock)
{
        struct sock *sk;
        int err;
        long timeo;

        skb = netlink_trim(skb, gfp_any());

        timeo = sock_sndtimeo(ssk, nonblock);
retry:
        sk = netlink_getsockbyportid(ssk, portid);
        if (IS_ERR(sk)) {
                kfree_skb(skb);
                return PTR_ERR(sk);
        }
        if (netlink_is_kernel(sk))
                return netlink_unicast_kernel(sk, skb, ssk);

        /* A message dropped by the filter is still reported with its length. */
        if (sk_filter(sk, skb)) {
                err = skb->len;
                kfree_skb(skb);
                sock_put(sk);
                return err;
        }

        err = netlink_attachskb(sk, skb, &timeo, ssk);
        /* 1 means the socket reference was dropped while waiting: look up again. */
        if (err == 1)
                goto retry;
        if (err)
                return err;

        return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);

int netlink_has_listeners(struct sock *sk, unsigned int group)
{
        int res = 0;
        struct listeners *listeners;

        BUG_ON(!netlink_is_kernel(sk));

        rcu_read_lock();
        listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);

        if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
                res = test_bit(group - 1, listeners->masks);

        rcu_read_unlock();

        return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);

/*
 * Return true when the socket recorded in @skb's netlink control block has
 * the STRICT_CHK flag set.
 */
bool netlink_strict_get_check(struct sk_buff *skb)
{
        struct sock *origin = NETLINK_CB(skb).sk;

        return nlk_test_bit(STRICT_CHK, origin);
}
EXPORT_SYMBOL_GPL(netlink_strict_get_check);

/*
 * Try to queue a broadcast @skb on @sk.
 *
 * Returns -1 without touching the skb when the receiver is over its
 * receive-buffer limit or flagged congested. Otherwise the skb is charged
 * to @sk and queued, and the return value is 1 when the receive queue is
 * now more than half full, 0 otherwise.
 */
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
        struct netlink_sock *nlk = nlk_sk(sk);

        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
            test_bit(NETLINK_S_CONGESTED, &nlk->state))
                return -1;

        netlink_skb_set_owner_r(skb, sk);
        __netlink_sendskb(sk, skb);

        /* Tell the caller when this receiver is getting close to full. */
        return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
}

/*
 * State shared between netlink_broadcast_filtered() and do_one_broadcast()
 * while one message is delivered to every listener of a group.
 */
struct netlink_broadcast_data {
        struct sock *exclude_sk;        /* sending socket, never delivered to */
        struct net *net;                /* network namespace of the sender */
        u32 portid;                     /* sockets bound to this portid are skipped */
        u32 group;                      /* 1-based multicast group being addressed */
        int failure;                    /* set once obtaining an skb copy failed */
        int delivery_failure;           /* a BROADCAST_SEND_ERROR listener missed the message */
        int congested;                  /* OR of netlink_broadcast_deliver() results */
        int delivered;                  /* at least one listener got the message */
        gfp_t allocation;               /* GFP flags used when cloning */
        struct sk_buff *skb, *skb2;     /* original message / copy pending for the next listener */
        /* optional per-listener filter; non-zero return skips the listener */
        int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
        void *tx_data;                  /* opaque argument passed to tx_filter */
};

/*
 * Deliver the broadcast described by @p to one candidate socket @sk.
 *
 * The socket is skipped when it is the sender, is bound to the excluded
 * portid, or is not subscribed to the group. Sockets in another network
 * namespace only qualify with LISTEN_ALL_NSID set, a peer id for the
 * sender's namespace, and CAP_NET_BROADCAST for the socket's opener.
 *
 * p->skb2 carries the copy prepared for the next receiver: it is created
 * on demand, set to NULL once it has been queued or dropped by a filter,
 * and left in place when delivery to @sk fails.
 */
static void do_one_broadcast(struct sock *sk,
                                    struct netlink_broadcast_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int val;

        if (p->exclude_sk == sk)
                return;

        /* group is 1-based; bit (group - 1) must exist and be set. */
        if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                return;

        if (!net_eq(sock_net(sk), p->net)) {
                if (!nlk_test_bit(LISTEN_ALL_NSID, sk))
                        return;

                if (!peernet_has_id(sock_net(sk), p->net))
                        return;

                if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
                                     CAP_NET_BROADCAST))
                        return;
        }

        /* An earlier copy failed: just flag the overrun on every listener. */
        if (p->failure) {
                netlink_overrun(sk);
                return;
        }

        sock_hold(sk);
        if (p->skb2 == NULL) {
                if (skb_shared(p->skb)) {
                        p->skb2 = skb_clone(p->skb, p->allocation);
                } else {
                        p->skb2 = skb_get(p->skb);
                        /*
                         * skb ownership may have been set when
                         * delivered to a previous socket.
                         */
                        skb_orphan(p->skb2);
                }
        }
        if (p->skb2 == NULL) {
                netlink_overrun(sk);
                /* Clone failed. Notify ALL listeners. */
                p->failure = 1;
                if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
                        p->delivery_failure = 1;
                goto out;
        }

        /* Caller-supplied filter rejected this listener: drop the copy. */
        if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }

        /* The listener's own socket filter rejected the message. */
        if (sk_filter(sk, p->skb2)) {
                kfree_skb(p->skb2);
                p->skb2 = NULL;
                goto out;
        }
        /* Record the sender's namespace id as seen from the listener. */
        NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
        if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
                NETLINK_CB(p->skb2).nsid_is_set = true;
        val = netlink_broadcast_deliver(sk, p->skb2);
        if (val < 0) {
                /* Not queued: skb2 is kept so the next listener can use it. */
                netlink_overrun(sk);
                if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
                        p->delivery_failure = 1;
        } else {
                p->congested |= val;
                p->delivered = 1;
                p->skb2 = NULL;
        }
out:
        sock_put(sk);
}

/*
 * Broadcast @skb from @ssk to every socket subscribed to @group in @ssk's
 * protocol, skipping @ssk itself and sockets bound to @portid.
 *
 * @allocation is used for trimming and for cloning the message. @filter,
 * when not NULL, is called as filter(listener, skb, @filter_data) for each
 * candidate; a non-zero result skips that listener. The caller's reference
 * on @skb is consumed.
 *
 * Returns 0 when at least one listener received the message, -ENOBUFS when
 * delivery failed for a listener that asked for broadcast errors, and
 * -ESRCH when nobody received it.
 */
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
                               u32 portid,
                               u32 group, gfp_t allocation,
                               netlink_filter_fn filter,
                               void *filter_data)
{
        struct net *net = sock_net(ssk);
        struct netlink_broadcast_data info;
        struct sock *sk;

        skb = netlink_trim(skb, allocation);

        info.exclude_sk = ssk;
        info.net = net;
        info.portid = portid;
        info.group = group;
        info.failure = 0;
        info.delivery_failure = 0;
        info.congested = 0;
        info.delivered = 0;
        info.allocation = allocation;
        info.skb = skb;
        info.skb2 = NULL;
        info.tx_filter = filter;
        info.tx_data = filter_data;

        /* While we sleep in clone, do not allow the socket list to change */

        netlink_lock_table();

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                do_one_broadcast(sk, &info);

        consume_skb(skb);

        netlink_unlock_table();

        /* A leftover copy was never delivered: free it as a drop on failure. */
        if (info.delivery_failure) {
                kfree_skb(info.skb2);
                return -ENOBUFS;
        }
        consume_skb(info.skb2);

        if (info.delivered) {
                /* Give congested receivers a chance to run if we may block. */
                if (info.congested && gfpflags_allow_blocking(allocation))
                        yield();
                return 0;
        }
        return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);

/*
 * Broadcast @skb to @group without a per-listener filter. Thin wrapper
 * around netlink_broadcast_filtered(); the return value is passed through
 * unchanged.
 */
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
                      u32 group, gfp_t allocation)
{
        int rc;

        rc = netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
                                        NULL, NULL);
        return rc;
}
EXPORT_SYMBOL(netlink_broadcast);

/* Parameters passed from netlink_set_err() to do_one_set_err(). */
struct netlink_set_err_data {
        struct sock *exclude_sk;        /* kernel socket reporting the error; skipped */
        u32 portid;                     /* sockets bound to this portid are skipped */
        u32 group;                      /* 1-based multicast group to notify */
        int code;                       /* positive errno value stored in sk_err */
};

static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        int ret = 0;

        if (sk == p->exclude_sk)
                goto out;

        if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
                goto out;

        if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
            !test_bit(p->group - 1, nlk->groups))
                goto out;

        if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) {
                ret = 1;
                goto out;
        }

        WRITE_ONCE(sk->sk_err, p->code);
        sk_error_report(sk);
out:
        return ret;
}

/**
 * netlink_set_err - report error to broadcast listeners
 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
 * @portid: the PORTID of a process that we want to skip (if any)
 * @group: the broadcast group that will notice the error
 * @code: error code, must be negative (as usual in kernelspace)
 *
 * Walks the multicast list of @ssk's protocol under nl_table_lock and sets
 * the error on every matching listener.
 *
 * This function returns the number of broadcast listeners that have set the
 * NETLINK_NO_ENOBUFS socket option.
 */
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
        struct netlink_set_err_data info = {
                .exclude_sk = ssk,
                .portid = portid,
                .group = group,
                /* sk->sk_err wants a positive error value */
                .code = -code,
        };
        unsigned long flags;
        struct sock *sk;
        int opted_out = 0;

        read_lock_irqsave(&nl_table_lock, flags);

        sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
                opted_out += do_one_set_err(sk, &info);

        read_unlock_irqrestore(&nl_table_lock, flags);

        return opted_out;
}
EXPORT_SYMBOL(netlink_set_err);

/*
 * Subscribe (@is_new non-zero) or unsubscribe (@is_new zero) @nlk to the
 * 1-based multicast @group, then refresh the socket's subscription count
 * and the protocol's listener mask.
 *
 * Must be called with netlink table grabbed.
 */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
                                     unsigned int group,
                                     int is_new)
{
        const unsigned int bit = group - 1;
        const int now = !!is_new;
        const int was = test_bit(bit, nlk->groups);

        __assign_bit(bit, nlk->groups, now);

        /* The count moves by at most one, depending on the previous state. */
        netlink_update_subscriptions(&nlk->sk, nlk->subscriptions - was + now);
        netlink_update_listeners(&nlk->sk);
}

/*
 * Set a SOL_NETLINK socket option.
 *
 * The option value is read as an unsigned int when @optlen is at least
 * sizeof(int); with a shorter @optlen the value is treated as 0. Most
 * options simply set or clear one bit in nlk->flags. Membership options
 * additionally call the protocol's bind/unbind hooks and update the
 * multicast bookkeeping.
 *
 * Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown option,
 * -EFAULT when the value cannot be copied, -EPERM when the caller lacks the
 * required permission, -EINVAL for an out-of-range group, or the error from
 * netlink_realloc_groups() / the protocol's bind hook.
 */
static int netlink_setsockopt(struct socket *sock, int level, int optname,
                              sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int val = 0;
        /* Flag bit to assign after the switch; -1 when the case handled it. */
        int nr = -1;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        if (optlen >= sizeof(int) &&
            copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        switch (optname) {
        case NETLINK_PKTINFO:
                nr = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_ADD_MEMBERSHIP:
        case NETLINK_DROP_MEMBERSHIP: {
                int err;

                if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
                        return -EPERM;
                err = netlink_realloc_groups(sk);
                if (err)
                        return err;
                /* val is a 1-based group number. */
                if (!val || val - 1 >= nlk->ngroups)
                        return -EINVAL;
                /* The bind hook runs before the table is grabbed. */
                if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
                        err = nlk->netlink_bind(sock_net(sk), val);
                        if (err)
                                return err;
                }
                netlink_table_grab();
                netlink_update_socket_mc(nlk, val,
                                         optname == NETLINK_ADD_MEMBERSHIP);
                netlink_table_ungrab();
                /* The unbind hook runs after the table has been released. */
                if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
                        nlk->netlink_unbind(sock_net(sk), val);

                break;
        }
        case NETLINK_BROADCAST_ERROR:
                nr = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val);
                /* Enabling the option also clears congestion and wakes waiters. */
                if (val) {
                        clear_bit(NETLINK_S_CONGESTED, &nlk->state);
                        wake_up_interruptible(&nlk->wait);
                }
                break;
        case NETLINK_LISTEN_ALL_NSID:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
                        return -EPERM;
                nr = NETLINK_F_LISTEN_ALL_NSID;
                break;
        case NETLINK_CAP_ACK:
                nr = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                nr = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                nr = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }
        if (nr >= 0)
                assign_bit(nr, &nlk->flags, val);
        return 0;
}

/*
 * Read a SOL_NETLINK socket option into the user buffer @optval.
 *
 * @optlen is read first to learn the size of the user buffer and is written
 * back with the length of the reported value.
 *
 * Returns 0 on success, -ENOPROTOOPT for a level other than SOL_NETLINK or
 * an optname not handled below, -EINVAL when the supplied length is
 * negative or (for the flag options) smaller than sizeof(int), and -EFAULT
 * when user memory cannot be accessed.
 */
static int netlink_getsockopt(struct socket *sock, int level, int optname,
                              char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        unsigned int flag;
        int len, val;

        if (level != SOL_NETLINK)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /*
         * Most options simply select one bit of nlk->flags, which is
         * reported after the switch. NETLINK_LIST_MEMBERSHIPS is the only
         * option that returns directly from inside the switch.
         */
        switch (optname) {
        case NETLINK_PKTINFO:
                flag = NETLINK_F_RECV_PKTINFO;
                break;
        case NETLINK_BROADCAST_ERROR:
                flag = NETLINK_F_BROADCAST_SEND_ERROR;
                break;
        case NETLINK_NO_ENOBUFS:
                flag = NETLINK_F_RECV_NO_ENOBUFS;
                break;
        case NETLINK_LIST_MEMBERSHIPS: {
                int pos, idx, shift, err = 0;

                netlink_lock_table();
                /* pos is a byte offset into the group bitmap, advanced one u32 at a time */
                for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
                        /* stop once the user buffer cannot hold another u32 */
                        if (len - pos < sizeof(u32))
                                break;

                        /* locate the unsigned long word and the bit shift holding this u32 */
                        idx = pos / sizeof(unsigned long);
                        shift = (pos % sizeof(unsigned long)) * 8;
                        if (put_user((u32)(nlk->groups[idx] >> shift),
                                     (u32 __user *)(optval + pos))) {
                                err = -EFAULT;
                                break;
                        }
                }
                /*
                 * Report the size of the whole bitmap, rounded up to a u32
                 * multiple, regardless of how much was copied above.
                 */
                if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen))
                        err = -EFAULT;
                netlink_unlock_table();
                return err;
        }
        case NETLINK_LISTEN_ALL_NSID:
                flag = NETLINK_F_LISTEN_ALL_NSID;
                break;
        case NETLINK_CAP_ACK:
                flag = NETLINK_F_CAP_ACK;
                break;
        case NETLINK_EXT_ACK:
                flag = NETLINK_F_EXT_ACK;
                break;
        case NETLINK_GET_STRICT_CHK:
                flag = NETLINK_F_STRICT_CHK;
                break;
        default:
                return -ENOPROTOOPT;
        }

        /* flag options are reported as a single int */
        if (len < sizeof(int))
                return -EINVAL;

        len = sizeof(int);
        val = test_bit(flag, &nlk->flags);

        if (put_user(len, optlen) ||
            copy_to_user(optval, &val, len))
                return -EFAULT;

        return 0;
}

static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
        struct nl_pktinfo info;

        info.group = NETLINK_CB(skb).dst_group;
        put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}

/*
 * Attach a NETLINK_LISTEN_ALL_NSID control message holding the nsid stored
 * in @skb's netlink control block. Nothing is emitted when no nsid was set
 * on the skb.
 */
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
                                         struct sk_buff *skb)
{
        if (NETLINK_CB(skb).nsid_is_set)
                put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID,
                         sizeof(int), &NETLINK_CB(skb).nsid);
}

/*
 * Send one netlink message of @len bytes taken from @msg.
 *
 * The destination comes from msg->msg_name when a name is supplied,
 * otherwise from the dst_portid/dst_group stored on the socket. A socket
 * that is not yet bound is autobound first. The payload is copied into a
 * newly allocated skb, which is broadcast when a destination group is set
 * and then unicast to the destination portid.
 *
 * Returns the result of netlink_unicast() on the normal path, otherwise a
 * negative errno: -EOPNOTSUPP (MSG_OOB), -ENODATA (zero length), -EINVAL
 * (bad address), -EPERM (sending not allowed), -EMSGSIZE, -ENOBUFS,
 * -EFAULT, or the error from scm_send(), netlink_autobind() or
 * security_netlink_send().
 */
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
        u32 dst_portid;
        u32 dst_group;
        struct sk_buff *skb;
        int err;
        struct scm_cookie scm;
        u32 netlink_skb_flags = 0;

        if (msg->msg_flags & MSG_OOB)
                return -EOPNOTSUPP;

        if (len == 0) {
                pr_warn_once("Zero length message leads to an empty skb\n");
                return -ENODATA;
        }

        /* from here on every exit must go through "out" to run scm_destroy() */
        err = scm_send(sock, msg, &scm, true);
        if (err < 0)
                return err;

        if (msg->msg_namelen) {
                /* explicit destination supplied by the caller */
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_nl))
                        goto out;
                if (addr->nl_family != AF_NETLINK)
                        goto out;
                dst_portid = addr->nl_pid;
                /* lowest set bit of the mask, 1-based; 0 when no group bit is set */
                dst_group = ffs(addr->nl_groups);
                err =  -EPERM;
                if ((dst_group || dst_portid) &&
                    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
                        goto out;
                netlink_skb_flags |= NETLINK_SKB_DST;
        } else {
                /* Paired with WRITE_ONCE() in netlink_connect() */
                dst_portid = READ_ONCE(nlk->dst_portid);
                dst_group = READ_ONCE(nlk->dst_group);
        }

        /* Paired with WRITE_ONCE() in netlink_insert() */
        if (!READ_ONCE(nlk->bound)) {
                err = netlink_autobind(sock);
                if (err)
                        goto out;
        } else {
                /* Ensure nlk is hashed and visible. */
                smp_rmb();
        }

        err = -EMSGSIZE;
        if (len > sk->sk_sndbuf - 32)
                goto out;
        err = -ENOBUFS;
        skb = netlink_alloc_large_skb(len, dst_group);
        if (skb == NULL)
                goto out;

        /* record sender, destination group, credentials and flags in the skb */
        NETLINK_CB(skb).portid        = nlk->portid;
        NETLINK_CB(skb).dst_group = dst_group;
        NETLINK_CB(skb).creds        = scm.creds;
        NETLINK_CB(skb).flags        = netlink_skb_flags;

        err = -EFAULT;
        if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
                kfree_skb(skb);
                goto out;
        }

        err = security_netlink_send(sk, skb);
        if (err) {
                kfree_skb(skb);
                goto out;
        }

        if (dst_group) {
                /*
                 * Extra reference so the skb can still be passed to
                 * netlink_unicast() below; presumably netlink_broadcast()
                 * drops one reference - confirm against its definition.
                 */
                refcount_inc(&skb->users);
                netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
        }
        err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);

out:
        scm_destroy(&scm);
        return err;
}

/*
 * Receive one netlink datagram from @sock into @msg.
 *
 * At most @len bytes are copied; MSG_TRUNC is set in msg->msg_flags when
 * the datagram is longer. When msg->msg_name is set it is filled with the
 * sender's netlink address. Control messages are added according to the
 * RECV_PKTINFO and LISTEN_ALL_NSID socket flags.
 *
 * Returns the number of bytes copied (or the full datagram length when
 * MSG_TRUNC is set in @flags), or a negative errno.
 */
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                           int flags)
{
        struct scm_cookie scm;
        struct sock *sk = sock->sk;
        struct netlink_sock *nlk = nlk_sk(sk);
        size_t copied, max_recvmsg_len;
        struct sk_buff *skb, *data_skb;
        int err, ret;

        if (flags & MSG_OOB)
                return -EOPNOTSUPP;

        copied = 0;

        /* on failure err is filled in by skb_recv_datagram() and returned at "out" */
        skb = skb_recv_datagram(sk, flags, &err);
        if (skb == NULL)
                goto out;

        data_skb = skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                /*
                 * If this skb has a frag_list, then here that means that we
                 * will have to use the frag_list skb's data for compat tasks
                 * and the regular skb's data for normal (non-compat) tasks.
                 *
                 * If we need to send the compat skb, assign it to the
                 * 'data_skb' variable so that it will be used below for data
                 * copying. We keep 'skb' for everything else, including
                 * freeing both later.
                 */
                if (flags & MSG_CMSG_COMPAT)
                        data_skb = skb_shinfo(skb)->frag_list;
        }
#endif

        /* Record the max length of recvmsg() calls for future allocations */
        max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len);
        max_recvmsg_len = min_t(size_t, max_recvmsg_len,
                                SKB_WITH_OVERHEAD(32768));
        WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len);

        /* copy no more than the caller's buffer holds and flag truncation */
        copied = data_skb->len;
        if (len < copied) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(data_skb, 0, msg, copied);

        if (msg->msg_name) {
                /* report the sender's address back to the caller */
                DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
                addr->nl_family = AF_NETLINK;
                addr->nl_pad    = 0;
                addr->nl_pid        = NETLINK_CB(skb).portid;
                addr->nl_groups        = netlink_group_mask(NETLINK_CB(skb).dst_group);
                msg->msg_namelen = sizeof(*addr);
        }

        if (nlk_test_bit(RECV_PKTINFO, sk))
                netlink_cmsg_recv_pktinfo(msg, skb);
        if (nlk_test_bit(LISTEN_ALL_NSID, sk))
                netlink_cmsg_listen_all_nsid(sk, msg, skb);

        /* take a copy of the credentials before the skb is released below */
        memset(&scm, 0, sizeof(scm));
        scm.creds = *NETLINK_CREDS(skb);
        /* caller passed MSG_TRUNC: report the real datagram length instead */
        if (flags & MSG_TRUNC)
                copied = data_skb->len;

        skb_free_datagram(sk, skb);

        /*
         * A dump is in progress and at most half of the receive buffer is
         * in use: ask netlink_dump() for the next chunk. A failure is
         * reported through sk->sk_err rather than this call's return value.
         */
        if (READ_ONCE(nlk->cb_running) &&
            atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
                ret = netlink_dump(sk, false);
                if (ret) {
                        WRITE_ONCE(sk->sk_err, -ret);
                        sk_error_report(sk);
                }
        }

        scm_recv(sock, msg, &scm, flags);
out:
        netlink_rcv_wake(sk);
        return err ? : copied;
}

/*
 * sk_data_ready handler installed on kernel sockets by
 * __netlink_kernel_create(). The body is an unconditional BUG().
 */
static void netlink_data_ready(struct sock *sk)
{
        BUG();
}

/*
 *        We export these functions to other modules. They provide a
 *        complete set of kernel non-blocking support for message
 *        queueing.
 */

/*
 * Create a kernel-side netlink socket for protocol @unit in @net.
 *
 * @module is stored in nl_table[unit].module on first registration.
 * @cfg may be NULL; when present it supplies the group count, the input
 * callback and the bind/unbind/release hooks and flags that are stored in
 * nl_table[unit] on first registration. At least 32 groups are always
 * provisioned.
 *
 * Returns the new sock, or NULL when @unit is out of range or any step of
 * the setup fails.
 */
struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
                        struct netlink_kernel_cfg *cfg)
{
        struct socket *sock;
        struct sock *sk;
        struct netlink_sock *nlk;
        struct listeners *listeners = NULL;
        unsigned int groups;

        BUG_ON(!nl_table);

        if (unit < 0 || unit >= MAX_LINKS)
                return NULL;

        if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
                return NULL;

        if (__netlink_create(net, sock, unit, 1) < 0)
                goto out_sock_release_nosk;

        sk = sock->sk;

        /* never provision fewer than 32 groups */
        if (!cfg || cfg->groups < 32)
                groups = 32;
        else
                groups = cfg->groups;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (!listeners)
                goto out_sock_release;

        sk->sk_data_ready = netlink_data_ready;
        if (cfg && cfg->input)
                nlk_sk(sk)->netlink_rcv = cfg->input;

        if (netlink_insert(sk, 0))
                goto out_sock_release;

        nlk = nlk_sk(sk);
        set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags);

        netlink_table_grab();
        if (!nl_table[unit].registered) {
                /* first kernel socket for this protocol: publish its table entry */
                nl_table[unit].groups = groups;
                rcu_assign_pointer(nl_table[unit].listeners, listeners);
                nl_table[unit].module = module;
                if (cfg) {
                        nl_table[unit].bind = cfg->bind;
                        nl_table[unit].unbind = cfg->unbind;
                        nl_table[unit].release = cfg->release;
                        nl_table[unit].flags = cfg->flags;
                }
                nl_table[unit].registered = 1;
        } else {
                /* already registered: the new listeners array is not needed */
                kfree(listeners);
                nl_table[unit].registered++;
        }
        netlink_table_ungrab();
        return sk;

out_sock_release:
        /* listeners may still be NULL here */
        kfree(listeners);
        netlink_kernel_release(sk);
        return NULL;

out_sock_release_nosk:
        sock_release(sock);
        return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);

/*
 * Release the socket behind a kernel netlink sock. A NULL @sk, or a sock
 * without an attached socket, is silently ignored.
 */
void
netlink_kernel_release(struct sock *sk)
{
        if (sk && sk->sk_socket)
                sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);

/*
 * Set the number of multicast groups for the protocol of @sk.
 *
 * @groups is raised to 32 when smaller. The listeners array is replaced by
 * a larger copy only when the new count needs more storage than the
 * current one; the old array is freed via kfree_rcu().
 *
 * Returns 0 on success or -ENOMEM when the larger array cannot be
 * allocated.
 */
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        struct netlink_table *tbl = &nl_table[sk->sk_protocol];
        struct listeners *grown, *prev;

        if (groups < 32)
                groups = 32;

        /* existing storage is already big enough: only record the count */
        if (!(NLGRPSZ(tbl->groups) < NLGRPSZ(groups))) {
                tbl->groups = groups;
                return 0;
        }

        grown = kzalloc(sizeof(*grown) + NLGRPSZ(groups), GFP_ATOMIC);
        if (!grown)
                return -ENOMEM;

        /* carry the current masks over before publishing the new array */
        prev = nl_deref_protected(tbl->listeners);
        memcpy(grown->masks, prev->masks, NLGRPSZ(tbl->groups));
        rcu_assign_pointer(tbl->listeners, grown);
        kfree_rcu(prev, rcu);

        tbl->groups = groups;
        return 0;
}

/**
 * netlink_change_ngroups - change number of multicast groups
 * @sk: The kernel netlink socket, as returned by netlink_kernel_create().
 * @groups: The new number of groups.
 *
 * Changes how many multicast groups are available on a netlink family.
 * The number of groups cannot be brought below 32. Reducing the number of
 * groups does not implicitly call netlink_clear_multicast_users().
 *
 * The work is done by __netlink_change_ngroups() between
 * netlink_table_grab() and netlink_table_ungrab().
 *
 * Return: the value returned by __netlink_change_ngroups().
 */
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
        int ret;

        netlink_table_grab();
        ret = __netlink_change_ngroups(sk, groups);
        netlink_table_ungrab();

        return ret;
}

/*
 * Walk the mc_list of @ksk's protocol and clear membership of @group on
 * every socket found there.
 */
void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
        struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
        struct hlist_node *next;
        struct sock *member;

        sk_for_each_bound_safe(member, next, &tbl->mc_list)
                netlink_update_socket_mc(nlk_sk(member), group, 0);
}

/*
 * Append a netlink message header with room for @len payload bytes to
 * @skb and fill in the header fields. Alignment padding that follows the
 * payload is zeroed.
 *
 * Returns a pointer to the new header inside @skb.
 */
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
        int size = nlmsg_msg_size(len);
        size_t pad = NLMSG_ALIGN(size) - size;
        struct nlmsghdr *hdr;

        hdr = skb_put(skb, NLMSG_ALIGN(size));

        hdr->nlmsg_len = size;
        hdr->nlmsg_type = type;
        hdr->nlmsg_flags = flags;
        hdr->nlmsg_seq = seq;
        hdr->nlmsg_pid = portid;

        /* the memset is skipped only when size is a compile-time constant with no padding */
        if (!__builtin_constant_p(size) || pad != 0)
                memset(nlmsg_data(hdr) + len, 0, pad);

        return hdr;
}
EXPORT_SYMBOL(__nlmsg_put);

/*
 * Compute how many bytes of extended-ack attributes will be emitted for
 * @extack.
 *
 * Returns 0 when @extack is NULL or the socket has not enabled
 * NETLINK_F_EXT_ACK. The message string and cookie are always counted;
 * bad_attr, policy, miss_type and miss_nest only when @err is non-zero.
 */
static size_t
netlink_ack_tlv_len(struct netlink_sock *nlk, int err,
                    const struct netlink_ext_ack *extack)
{
        size_t total = 0;

        if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags))
                return 0;

        if (extack->_msg)
                total += nla_total_size(strlen(extack->_msg) + 1);
        if (extack->cookie_len)
                total += nla_total_size(extack->cookie_len);

        /* Following attributes are only reported as error (not warning) */
        if (err) {
                if (extack->bad_attr)
                        total += nla_total_size(sizeof(u32));
                if (extack->policy)
                        total += netlink_policy_dump_attr_size_estimate(extack->policy);
                if (extack->miss_type)
                        total += nla_total_size(sizeof(u32));
                if (extack->miss_nest)
                        total += nla_total_size(sizeof(u32));
        }

        return total;
}

/*
 * Check that @addr points inside the payload of @nlh: not before
 * nlmsg_data(nlh) and less than nlh->nlmsg_len bytes from the start of
 * the header. Triggers WARN_ON() and returns false otherwise.
 */
static bool nlmsg_check_in_payload(const struct nlmsghdr *nlh, const void *addr)
{
        const void *start = (const void *) nlh;
        bool outside;

        outside = addr < nlmsg_data(nlh) ||
                  addr - start >= nlh->nlmsg_len;

        return !WARN_ON(outside);
}

static void
netlink_ack_tlv_fill(struct sk_buff *skb, const struct nlmsghdr *nlh, int err,
                     const struct netlink_ext_ack *extack)
{
        if (extack->_msg)
                WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg));
        if (extack->cookie_len)
                WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
                                extack->cookie_len, extack->cookie));

        if (!err)
                return;

        if (extack->bad_attr && nlmsg_check_in_payload(nlh, extack->bad_attr))
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
                                    (u8 *)extack->bad_attr - (const u8 *)nlh));
        if (extack->policy)
                netlink_policy_dump_write_attr(skb, extack->policy,
                                               NLMSGERR_ATTR_POLICY);
        if (extack->miss_type)
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE,
                                    extack->miss_type));
        if (extack->miss_nest && nlmsg_check_in_payload(nlh, extack->miss_nest))
                WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST,
                                    (u8 *)extack->miss_nest - (const u8 *)nlh));
}

/*
 * It looks a bit ugly.
 * It would be better to create a kernel thread.
 */

/*
 * Add the NLMSG_DONE message that terminates a dump to @skb.
 *
 * The payload is nlk->dump_done_errno. When extended-ack attributes are
 * due, NLM_F_ACK_TLVS is set on the message; the attributes themselves are
 * written only if the skb still has enough tailroom for them.
 *
 * Returns 0 on success or -ENOBUFS when the message header cannot be
 * added.
 */
static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb,
                             struct netlink_callback *cb,
                             struct netlink_ext_ack *extack)
{
        struct nlmsghdr *done;
        size_t tlv_space;

        done = nlmsg_put_answer(skb, cb, NLMSG_DONE,
                                sizeof(nlk->dump_done_errno),
                                NLM_F_MULTI | cb->answer_flags);
        if (WARN_ON(!done))
                return -ENOBUFS;

        nl_dump_check_consistent(cb, done);
        memcpy(nlmsg_data(done), &nlk->dump_done_errno,
               sizeof(nlk->dump_done_errno));

        tlv_space = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack);
        if (!tlv_space)
                return 0;

        done->nlmsg_flags |= NLM_F_ACK_TLVS;

        /* not enough room left: the flag stays set but no TLVs are written */
        if (skb_tailroom(skb) < tlv_space)
                return 0;

        netlink_ack_tlv_fill(skb, cb->nlh, nlk->dump_done_errno, extack);
        nlmsg_end(skb, done);

        return 0;
}

/*
 * Produce the next chunk of the dump running on @sk and hand it to
 * __netlink_sendskb() (or drop it when sk_filter() rejects it).
 *
 * @lock_taken: true when the caller already holds nlk->nl_cb_mutex;
 * otherwise it is taken here. The mutex is released on every return path.
 *
 * While nlk->dump_done_errno is positive the dump callback is invoked to
 * fill a new skb. Once the callback reports completion and there is room
 * for it, an NLMSG_DONE message is appended, cb->done() is called if set,
 * and the dump state (cb_running, module reference, request skb) is torn
 * down.
 *
 * Returns 0 on success, -EINVAL when no dump is running, or -ENOBUFS when
 * the receive buffer is full, the skb cannot be allocated, or the
 * NLMSG_DONE message cannot be added.
 */
static int netlink_dump(struct sock *sk, bool lock_taken)
{
        struct netlink_sock *nlk = nlk_sk(sk);
        struct netlink_ext_ack extack = {};
        struct netlink_callback *cb;
        struct sk_buff *skb = NULL;
        size_t max_recvmsg_len;
        struct module *module;
        int err = -ENOBUFS;
        int alloc_min_size;
        int alloc_size;

        if (!lock_taken)
                mutex_lock(&nlk->nl_cb_mutex);
        if (!nlk->cb_running) {
                err = -EINVAL;
                goto errout_skb;
        }

        /* receive buffer already full: fail with the default -ENOBUFS */
        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                goto errout_skb;

        /* NLMSG_GOODSIZE is small to avoid high order allocations being
         * required, but it makes sense to _attempt_ a 32KiB allocation
         * to reduce number of system calls on dump operations, if user
         * ever provided a big enough buffer.
         */
        cb = &nlk->cb;
        alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);

        max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len);
        if (alloc_min_size < max_recvmsg_len) {
                /* opportunistic large allocation: no direct reclaim, no retry, no warning */
                alloc_size = max_recvmsg_len;
                skb = alloc_skb(alloc_size,
                                (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
                                __GFP_NOWARN | __GFP_NORETRY);
        }
        if (!skb) {
                /* fall back to the minimum size with a regular allocation */
                alloc_size = alloc_min_size;
                skb = alloc_skb(alloc_size, GFP_KERNEL);
        }
        if (!skb)
                goto errout_skb;

        /* Trim skb to allocated size. User is expected to provide buffer as
         * large as max(min_dump_alloc, 32KiB (max_recvmsg_len capped at
         * netlink_recvmsg())). dump will pack as many smaller messages as
         * could fit within the allocated skb. skb is typically allocated
         * with larger space than required (could be as much as near 2x the
         * requested size with align to next power of 2 approach). Allowing
         * dump to use the excess space makes it difficult for a user to have a
         * reasonable static buffer based on the expected largest dump of a
         * single netdev. The outcome is MSG_TRUNC error.
         */
        skb_reserve(skb, skb_tailroom(skb) - alloc_size);

        /* Make sure malicious BPF programs can not read uninitialized memory
         * from skb->head -> skb->data
         */
        skb_reset_network_header(skb);
        skb_reset_mac_header(skb);

        netlink_skb_set_owner_r(skb, sk);

        /* a positive dump_done_errno means the callback still has data to emit */
        if (nlk->dump_done_errno > 0) {
                cb->extack = &extack;

                nlk->dump_done_errno = cb->dump(skb, cb);

                /* EMSGSIZE plus something already in the skb means
                 * that there's more to dump but current skb has filled up.
                 * If the callback really wants to return EMSGSIZE to user space
                 * it needs to do so again, on the next cb->dump() call,
                 * without putting data in the skb.
                 */
                if (nlk->dump_done_errno == -EMSGSIZE && skb->len)
                        nlk->dump_done_errno = skb->len;

                cb->extack = NULL;
        }

        /*
         * Either more data is pending or there is no room left for the
         * NLMSG_DONE message: deliver this skb and leave the dump running.
         */
        if (nlk->dump_done_errno > 0 ||
            skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
                mutex_unlock(&nlk->nl_cb_mutex);

                if (sk_filter(sk, skb))
                        kfree_skb(skb);
                else
                        __netlink_sendskb(sk, skb);
                return 0;
        }

        if (netlink_dump_done(nlk, skb, cb, &extack))
                goto errout_skb;

#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
        /* frag_list skb's data is used for compat tasks
         * and the regular skb's data for normal (non-compat) tasks.
         * See netlink_recvmsg().
         */
        if (unlikely(skb_shinfo(skb)->frag_list)) {
                if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack))
                        goto errout_skb;
        }
#endif

        if (sk_filter(sk, skb))
                kfree_skb(skb);
        else
                __netlink_sendskb(sk, skb);

        if (cb->done)
                cb->done(cb);

        /*
         * Dump finished: clear the running flag and pick up the module and
         * request skb while still holding the mutex, then release both
         * after unlocking.
         */
        WRITE_ONCE(nlk->cb_running, false);
        module = cb->module;
        skb = cb->skb;
        mutex_unlock(&nlk->nl_cb_mutex);
        module_put(module);
        consume_skb(skb);
        return 0;

errout_skb:
        mutex_unlock(&nlk->nl_cb_mutex);
        /* skb may still be NULL when we get here */
        kfree_skb(skb);
        return err;
}

/*
 * Start a dump towards the socket that sent @skb.
 *
 * Looks up the socket bound to the sender's portid on @ssk's protocol,
 * records the callbacks from @control in nlk->cb, runs control->start()
 * when provided and then produces the first chunk via netlink_dump().
 * An extra reference on @skb is taken up front and stored in cb->skb; it
 * is dropped here on the error paths.
 *
 * Returns -EINTR when the dump was started successfully (see the comment
 * before that return), otherwise a negative errno: -ECONNREFUSED if no
 * socket is found, -EBUSY if a dump is already running, -EPROTONOSUPPORT
 * if the module reference cannot be taken, or the error returned by
 * control->start() or netlink_dump().
 */
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
                         const struct nlmsghdr *nlh,
                         struct netlink_dump_control *control)
{
        struct netlink_callback *cb;
        struct netlink_sock *nlk;
        struct sock *sk;
        int ret;

        refcount_inc(&skb->users);

        sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
        if (sk == NULL) {
                ret = -ECONNREFUSED;
                goto error_free;
        }

        nlk = nlk_sk(sk);
        mutex_lock(&nlk->nl_cb_mutex);
        /* A dump is in progress... */
        if (nlk->cb_running) {
                ret = -EBUSY;
                goto error_unlock;
        }
        /* add reference of module which cb->dump belongs to */
        if (!try_module_get(control->module)) {
                ret = -EPROTONOSUPPORT;
                goto error_unlock;
        }

        /* start from a zeroed callback state and copy in the caller's control */
        cb = &nlk->cb;
        memset(cb, 0, sizeof(*cb));
        cb->dump = control->dump;
        cb->done = control->done;
        cb->nlh = nlh;
        cb->data = control->data;
        cb->module = control->module;
        cb->min_dump_alloc = control->min_dump_alloc;
        cb->flags = control->flags;
        cb->skb = skb;

        cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);

        if (control->start) {
                /* extack is only exposed to the callback for the duration of start() */
                cb->extack = control->extack;
                ret = control->start(cb);
                cb->extack = NULL;
                if (ret)
                        goto error_put;
        }

        WRITE_ONCE(nlk->cb_running, true);
        nlk->dump_done_errno = INT_MAX;

        /* called with the mutex held (lock_taken == true); netlink_dump() releases it */
        ret = netlink_dump(sk, true);

        sock_put(sk);

        if (ret)
                return ret;

        /* We successfully started a dump, by returning -EINTR we
         * signal not to send ACK even if it was requested.
         */
        return -EINTR;

error_put:
        module_put(control->module);
error_unlock:
        sock_put(sk);
        mutex_unlock(&nlk->nl_cb_mutex);
error_free:
        kfree_skb(skb);
        return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);

/*
 * Build an NLMSG_ERROR reply for the request @nlh found in @in_skb and
 * unicast it to the sender's portid.
 *
 * @err is stored in the reply's error field. When @err is non-zero and
 * the socket has not set NETLINK_F_CAP_ACK, the payload of the original
 * request is appended; otherwise the reply is marked NLM_F_CAPPED.
 * Extended-ack attributes from @extack follow when
 * netlink_ack_tlv_len() reports any.
 *
 * If the reply cannot be allocated or built, ENOBUFS is recorded in
 * sk_err of the requesting socket and reported via sk_error_report().
 */
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
                 const struct netlink_ext_ack *extack)
{
        struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
        size_t payload = sizeof(struct nlmsgerr);
        unsigned int flags = 0;
        struct nlmsgerr *errmsg;
        struct nlmsghdr *rep;
        struct sk_buff *skb;
        size_t tlvlen;

        /* Error messages get the original request appended, unless the user
         * requests to cap the error message, and get extra error data if
         * requested.
         */
        if (!err || test_bit(NETLINK_F_CAP_ACK, &nlk->flags))
                flags |= NLM_F_CAPPED;
        else
                payload += nlmsg_len(nlh);

        tlvlen = netlink_ack_tlv_len(nlk, err, extack);
        if (tlvlen)
                flags |= NLM_F_ACK_TLVS;

        skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
        if (!skb)
                goto err_skb;

        rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
                        NLMSG_ERROR, sizeof(*errmsg), flags);
        if (!rep)
                goto err_bad_put;

        errmsg = nlmsg_data(rep);
        errmsg->error = err;
        errmsg->msg = *nlh;

        /* uncapped reply: copy the request payload behind the echoed header */
        if (!(flags & NLM_F_CAPPED)) {
                if (!nlmsg_append(skb, nlmsg_len(nlh)))
                        goto err_bad_put;

                memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh),
                       nlmsg_len(nlh));
        }

        if (tlvlen)
                netlink_ack_tlv_fill(skb, nlh, err, extack);

        nlmsg_end(skb, rep);
        nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);
        return;

err_bad_put:
        nlmsg_free(skb);
err_skb:
        WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS);
        sk_error_report(NETLINK_CB(in_skb).sk);
}
EXPORT_SYMBOL(netlink_ack);

int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
                                                   struct nlmsghdr *,
                                                   struct netlink_ext_ack *))
{
        struct netlink_ext_ack extack;
        struct nlmsghdr *nlh;
        int err;

        while (skb->len >= nlmsg_total_size(0)) {
                int msglen;

                memset(&extack, 0, sizeof(extack));
                nlh = nlmsg_hdr(skb);
                err = 0;

                if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
                        return 0;

                /* Only requests are handled by the kernel */
                if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
                        goto ack;

                /* Skip control messages */
                if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
                        goto ack;

                err = cb(skb, nlh, &extack);
                if (err == -EINTR)
                        goto skip;

ack:
                if (nlh->nlmsg_flags & NLM_F_ACK || err)
                        netlink_ack(skb, nlh, err, &extack);

skip:
                msglen = NLMSG_ALIGN(nlh->nlmsg_len);
                if (msglen > skb->len)
                        msglen = skb->len;
                skb_pull(skb, msglen);
        }

        return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);

/**
 * nlmsg_notify - send a notification netlink message
 * @sk: netlink socket to use
 * @skb: notification message
 * @portid: destination netlink portid for reports or 0
 * @group: destination multicast group or 0
 * @report: 1 to report back, 0 to disable
 * @flags: allocation flags
 *
 * When @group is set the message is multicast to it; -ESRCH from the
 * multicast is treated as success. When @report is set the message is
 * additionally unicast to @portid, which is then excluded from the
 * multicast.
 *
 * Return: the multicast error if there was one, otherwise the unicast
 * result, or 0 when nothing failed.
 */
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags)
{
        int mcast_err = 0;
        int ucast_err;

        if (group) {
                int exclude_portid = 0;

                /* keep a reference for the unicast that follows */
                if (report) {
                        refcount_inc(&skb->users);
                        exclude_portid = portid;
                }

                /* errors reported via destination sk->sk_err, but propagate
                 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
                mcast_err = nlmsg_multicast(sk, skb, exclude_portid, group,
                                            flags);
                if (mcast_err == -ESRCH)
                        mcast_err = 0;
        }

        if (!report)
                return mcast_err;

        ucast_err = nlmsg_unicast(sk, skb, portid);

        return mcast_err ? mcast_err : ucast_err;
}
EXPORT_SYMBOL(nlmsg_notify);

#ifdef CONFIG_PROC_FS
/* Per-open iteration state for the netlink seq_file, kept in seq->private. */
struct nl_seq_iter {
        struct seq_net_private p;
        /* rhashtable walker over nl_table[link].hash */
        struct rhashtable_iter hti;
        /* index into nl_table of the table currently being walked */
        int link;
};

/* Enter and start an rhashtable walk over the hash of nl_table[iter->link]. */
static void netlink_walk_start(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_enter(&nl_table[iter->link].hash, walker);
        rhashtable_walk_start(walker);
}

/* Stop and exit the rhashtable walk begun by netlink_walk_start(). */
static void netlink_walk_stop(struct nl_seq_iter *iter)
{
        struct rhashtable_iter *walker = &iter->hti;

        rhashtable_walk_stop(walker);
        rhashtable_walk_exit(walker);
}

/*
 * Advance the seq_file iterator to the next netlink socket that belongs
 * to the seq_file's network namespace.
 *
 * -EAGAIN from the rhashtable walker is retried. When one table is
 * exhausted the walk moves on to the next nl_table entry.
 *
 * Returns the socket, an ERR_PTR from the walker, or NULL once all
 * MAX_LINKS tables have been walked (the walk is then already stopped).
 */
static void *__netlink_seq_next(struct seq_file *seq)
{
        struct nl_seq_iter *iter = seq->private;
        struct netlink_sock *nlk;

        for (;;) {
                nlk = rhashtable_walk_next(&iter->hti);

                if (IS_ERR(nlk)) {
                        if (PTR_ERR(nlk) == -EAGAIN)
                                continue;

                        return nlk;
                }

                if (!nlk) {
                        /* this table is done: switch to the next one */
                        netlink_walk_stop(iter);
                        if (++iter->link >= MAX_LINKS)
                                return NULL;

                        netlink_walk_start(iter);
                        continue;
                }

                /* skip sockets from other network namespaces */
                if (sock_net(&nlk->sk) == seq_file_net(seq))
                        return nlk;
        }
}

/*
 * seq_file ->start: restart the walk at the first table and step forward
 * *posp entries. Position 0 yields SEQ_START_TOKEN. Stepping stops early
 * when the iterator returns NULL or an error pointer.
 */
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
        __acquires(RCU)
{
        struct nl_seq_iter *iter = seq->private;
        void *cur = SEQ_START_TOKEN;
        loff_t remaining = *posp;

        iter->link = 0;
        netlink_walk_start(iter);

        while (remaining && cur && !IS_ERR(cur)) {
                cur = __netlink_seq_next(seq);
                remaining--;
        }

        return cur;
}

/* seq_file ->next: bump the position and fetch the following entry. */
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;

        return __netlink_seq_next(seq);
}

/*
 * Stop the current table walk. Nothing is done once iter->link has
 * reached MAX_LINKS, because __netlink_seq_next() has already stopped the
 * walk in that case.
 */
static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
        struct nl_seq_iter *iter = seq->private;

        if (iter->link < MAX_LINKS)
                netlink_walk_stop(iter);
}


/*
 * Print one line of the netlink seq_file: the column header for
 * SEQ_START_TOKEN, otherwise the details of the socket @v.
 *
 * Always returns 0.
 */
static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
        struct netlink_sock *nlk;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "sk               Eth Pid        Groups   "
                         "Rmem     Wmem     Dump  Locks    Drops    Inode\n");
                return 0;
        }

        s = v;
        nlk = nlk_sk(s);

        /* only the first word of the group bitmap is shown, 0 if there is none */
        seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n",
                   s,
                   s->sk_protocol,
                   nlk->portid,
                   nlk->groups ? (u32)nlk->groups[0] : 0,
                   sk_rmem_alloc_get(s),
                   sk_wmem_alloc_get(s),
                   READ_ONCE(nlk->cb_running),
                   refcount_read(&s->sk_refcnt),
                   atomic_read(&s->sk_drops),
                   sock_i_ino(s));

        return 0;
}

#ifdef CONFIG_BPF_SYSCALL
/*
 * Context passed to a BPF iterator program by netlink_prog_seq_show():
 * the iterator metadata and the netlink socket being visited.
 */
struct bpf_iter__netlink {
        __bpf_md_ptr(struct bpf_iter_meta *, meta);
        __bpf_md_ptr(struct netlink_sock *, sk);
};

DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)

static int netlink_prog_seq_show(struct bpf_prog *prog,
                                  struct bpf_iter_meta *meta,
                                  void *v)
{
        struct bpf_iter__netlink ctx;

        meta->seq_num--;  /* skip SEQ_START_TOKEN */
        ctx.meta = meta;
        ctx.sk = nlk_sk((struct sock *)v);
        return bpf_iter_run_prog(prog, &ctx);
}

/*
 * seq_file ->show callback (BPF-enabled build).
 *
 * Without an attached BPF program the native text output is produced.
 * With one, the program is run for every real element; the
 * SEQ_START_TOKEN header element is silently skipped and 0 is returned.
 */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
        if (!prog)
                return netlink_native_seq_show(seq, v);

        return (v == SEQ_START_TOKEN) ? 0 :
                netlink_prog_seq_show(prog, &meta, v);
}

/*
 * seq_file ->stop callback (BPF-enabled build).
 *
 * When called with no current element (@v == NULL) and a BPF program is
 * attached, the program is run one more time with a NULL socket; its
 * return value is deliberately ignored. The native stop handling then
 * runs in every case.
 */
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        if (v == NULL) {
                struct bpf_iter_meta meta;
                struct bpf_prog *prog;

                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog != NULL)
                        (void)netlink_prog_seq_show(prog, &meta, NULL);
        }

        netlink_native_seq_stop(seq, v);
}
#else
/* seq_file ->show callback (build without CONFIG_BPF_SYSCALL): native output only. */
static int netlink_seq_show(struct seq_file *seq, void *v)
{
        return netlink_native_seq_show(seq, v);
}

/* seq_file ->stop callback (build without CONFIG_BPF_SYSCALL): native stop only. */
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
        netlink_native_seq_stop(seq, v);
}
#endif

/*
 * seq_file operations for the "netlink" listing; used both for the proc
 * entry created in netlink_net_init() and by the BPF iterator seq_info.
 */
static const struct seq_operations netlink_seq_ops = {
        .start  = netlink_seq_start,
        .next   = netlink_seq_next,
        .stop   = netlink_seq_stop,
        .show   = netlink_seq_show,
};
#endif

/*
 * Add @nb to the blocking notifier chain netlink_chain.
 * Returns the result of blocking_notifier_chain_register().
 */
int netlink_register_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);

/*
 * Remove @nb from the blocking notifier chain netlink_chain.
 * Returns the result of blocking_notifier_chain_unregister().
 */
int netlink_unregister_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);

/*
 * Socket-level operations for PF_NETLINK sockets. socketpair, accept,
 * listen, shutdown and mmap are wired to the generic sock_no_*() handlers
 * (presumably the "operation not supported" stubs -- confirm in net/core).
 */
static const struct proto_ops netlink_ops = {
        .family =        PF_NETLINK,
        .owner =        THIS_MODULE,
        .release =        netlink_release,
        .bind =                netlink_bind,
        .connect =        netlink_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        netlink_getname,
        .poll =                datagram_poll,
        .ioctl =        netlink_ioctl,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        netlink_setsockopt,
        .getsockopt =        netlink_getsockopt,
        .sendmsg =        netlink_sendmsg,
        .recvmsg =        netlink_recvmsg,
        .mmap =                sock_no_mmap,
};

/*
 * Protocol family descriptor passed to sock_register() in
 * netlink_proto_init(); new PF_NETLINK sockets go through netlink_create().
 */
static const struct net_proto_family netlink_family_ops = {
        .family = PF_NETLINK,
        .create = netlink_create,
        .owner        = THIS_MODULE,        /* for consistency 8) */
};

/*
 * Per-network-namespace init: with CONFIG_PROC_FS, create the "netlink"
 * entry under net->proc_net backed by netlink_seq_ops, with a private
 * area sized for struct nl_seq_iter.
 *
 * Returns 0 on success, -ENOMEM if the proc entry could not be created.
 */
static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
        struct proc_dir_entry *entry;

        entry = proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops,
                                sizeof(struct nl_seq_iter));
        if (entry == NULL)
                return -ENOMEM;
#endif
        return 0;
}

/*
 * Per-network-namespace teardown: with CONFIG_PROC_FS, remove the
 * "netlink" entry that netlink_net_init() created under net->proc_net.
 */
static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("netlink", net->proc_net);
#endif
}

/*
 * Pre-register the NETLINK_USERSOCK slot of nl_table at boot.
 *
 * Allocates a zeroed listeners structure sized for 32 groups (panicking
 * on allocation failure) and, with the netlink table grabbed, fills in
 * the slot: group count, listeners pointer (published with
 * rcu_assign_pointer()), owning module, registered flag and the
 * NL_CFG_F_NONROOT_SEND flag.
 */
static void __init netlink_add_usersock_entry(void)
{
        const int groups = 32;
        struct listeners *listeners;

        listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
        if (listeners == NULL)
                panic("netlink_add_usersock_entry: Cannot allocate listeners\n");

        netlink_table_grab();

        /* The group count is stored before the listeners pointer is published. */
        nl_table[NETLINK_USERSOCK].groups = groups;
        rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);

        nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
        nl_table[NETLINK_USERSOCK].registered = 1;
        nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;

        netlink_table_ungrab();
}

/* Per-namespace hooks registered from netlink_proto_init(). */
static struct pernet_operations __net_initdata netlink_net_ops = {
        .init = netlink_net_init,
        .exit = netlink_net_exit,
};

/*
 * rhashtable object hash function for netlink sockets.
 *
 * @data points at a struct netlink_sock. A compare argument is built from
 * the socket's network namespace and portid, and its first
 * netlink_compare_arg_len bytes are hashed with jhash2() using @seed.
 * @len is not used.
 */
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
        const struct netlink_sock *nlk = data;
        struct netlink_compare_arg key;
        u32 nwords = netlink_compare_arg_len / sizeof(u32);

        netlink_compare_arg_init(&key, sock_net(&nlk->sk), nlk->portid);

        return jhash2((u32 *)&key, nwords, seed);
}

/*
 * Parameters for the per-protocol socket hash tables initialised in
 * netlink_proto_init(): objects are struct netlink_sock linked through
 * their 'node' member, hashed by netlink_hash() and compared by
 * netlink_compare(); automatic shrinking is enabled.
 */
static const struct rhashtable_params netlink_rhashtable_params = {
        .head_offset = offsetof(struct netlink_sock, node),
        .key_len = netlink_compare_arg_len,
        .obj_hashfn = netlink_hash,
        .obj_cmpfn = netlink_compare,
        .automatic_shrinking = true,
};

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* BTF id list holding the id of struct netlink_sock; read by bpf_iter_register(). */
BTF_ID_LIST(btf_netlink_sock_id)
BTF_ID(struct, netlink_sock)

/*
 * seq_file description for the BPF iterator: reuses netlink_seq_ops and
 * the same private-data size (struct nl_seq_iter) as the proc entry.
 */
static const struct bpf_iter_seq_info netlink_seq_info = {
        .seq_ops                = &netlink_seq_ops,
        .init_seq_private        = bpf_iter_init_seq_net,
        .fini_seq_private        = bpf_iter_fini_seq_net,
        .seq_priv_size                = sizeof(struct nl_seq_iter),
};

/*
 * Registration record for the "netlink" BPF iterator target. The single
 * context-argument entry describes the 'sk' member of
 * struct bpf_iter__netlink as PTR_TO_BTF_ID_OR_NULL; its btf_id is filled
 * in at init time by bpf_iter_register().
 */
static struct bpf_iter_reg netlink_reg_info = {
        .target                        = "netlink",
        .ctx_arg_info_size        = 1,
        .ctx_arg_info                = {
                { offsetof(struct bpf_iter__netlink, sk),
                  PTR_TO_BTF_ID_OR_NULL },
        },
        .seq_info                = &netlink_seq_info,
};

/*
 * Fill in the BTF id of struct netlink_sock for the iterator's 'sk'
 * argument and register the "netlink" iterator target.
 * Returns the result of bpf_iter_reg_target().
 */
static int __init bpf_iter_register(void)
{
        netlink_reg_info.ctx_arg_info[0].btf_id = btf_netlink_sock_id[0];

        return bpf_iter_reg_target(&netlink_reg_info);
}
#endif

/*
 * Boot-time initialisation of the netlink subsystem (core_initcall).
 *
 * Steps, in order:
 *  - register netlink_proto;
 *  - when both CONFIG_BPF_SYSCALL and CONFIG_PROC_FS are set, register
 *    the "netlink" BPF iterator target;
 *  - allocate nl_table (MAX_LINKS entries) and initialise one rhashtable
 *    per entry;
 *  - pre-register the NETLINK_USERSOCK entry;
 *  - register the socket family and the per-namespace operations, then
 *    call rtnetlink_init().
 *
 * Returns 0 on success or the error from proto_register() /
 * bpf_iter_register(). If the BPF iterator cannot be registered, the
 * proto registration is undone before returning so that a failed init
 * does not leave netlink_proto registered. Failure to allocate nl_table
 * or to initialise a hash table panics.
 */
static int __init netlink_proto_init(void)
{
        int i;
        int err = proto_register(&netlink_proto, 0);

        if (err != 0)
                goto out;

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
        err = bpf_iter_register();
        if (err) {
                /* Roll back the proto_register() above on this error path. */
                proto_unregister(&netlink_proto);
                goto out;
        }
#endif

        /* struct netlink_skb_parms must fit in the skb control buffer. */
        BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));

        nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
        if (!nl_table)
                goto panic;

        for (i = 0; i < MAX_LINKS; i++) {
                if (rhashtable_init(&nl_table[i].hash,
                                    &netlink_rhashtable_params) < 0)
                        goto panic;
        }

        netlink_add_usersock_entry();

        sock_register(&netlink_family_ops);
        register_pernet_subsys(&netlink_net_ops);
        register_pernet_subsys(&netlink_tap_net_ops);
        /* The netlink device handler may be needed early. */
        rtnetlink_init();
out:
        return err;
panic:
        panic("netlink_init: Cannot allocate nl_table\n");
}

core_initcall(netlink_proto_init);
























    6 
















    6 


    6 


    6 


    6 






    4 





    4 








    6 

    6 



    2 







    6 

    6 



    2 










  165 














  166 













  165 











  165 
  164 
  165 






  126 










  127 
  127 













    6 










    6 
    6 








    6 


    6 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2019 Arm Limited
 * Author: Andrew Murray <Andrew.Murray@arm.com>
 */
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/perf/arm_pmu.h>
#include <linux/perf/arm_pmuv3.h>

/* Per-CPU events_host / events_guest bitmaps, accessed via kvm_get_pmu_events(). */
static DEFINE_PER_CPU(struct kvm_pmu_events, kvm_pmu_events);

/*
 * Decide from the perf event attributes and the system type whether the
 * counter has to be switched at guest entry/exit.
 *
 * Returns false when running with VHE and the event excludes user (EL0);
 * otherwise returns true exactly when exclude_host and exclude_guest
 * differ.
 */
static bool kvm_pmu_switch_needed(struct perf_event_attr *attr)
{
        /*
         * With VHE the guest kernel runs at EL1 and the host at EL2, so
         * once user (EL0) is excluded there is no reason to switch
         * counters.
         */
        const bool vhe_without_el0 = has_vhe() && attr->exclude_user;

        if (vhe_without_el0)
                return false;

        /* Only switch if the host/guest exclusion attributes differ. */
        return attr->exclude_host != attr->exclude_guest;
}

/* Return this CPU's instance of the per-CPU kvm_pmu_events structure. */
struct kvm_pmu_events *kvm_get_pmu_events(void)
{
        return this_cpu_ptr(&kvm_pmu_events);
}

/*
 * Start tracking the events in @set that may need switching at guest
 * entry/exit time.
 *
 * Nothing is recorded unless the system supports PMUv3 and
 * kvm_pmu_switch_needed() says so for @attr. Otherwise @set is OR-ed
 * into this CPU's guest bitmap when the event does not exclude the
 * guest, and into the host bitmap when it does not exclude the host.
 */
void kvm_set_pmu_events(u64 set, struct perf_event_attr *attr)
{
        struct kvm_pmu_events *events = kvm_get_pmu_events();

        if (!(system_supports_pmuv3() && kvm_pmu_switch_needed(attr)))
                return;

        if (!attr->exclude_guest)
                events->events_guest |= set;
        if (!attr->exclude_host)
                events->events_host |= set;
}

/*
 * Stop tracking the events in @clr: clear those bits from both the host
 * and the guest bitmap of this CPU. Does nothing when the system does
 * not support PMUv3.
 */
void kvm_clr_pmu_events(u64 clr)
{
        struct kvm_pmu_events *events = kvm_get_pmu_events();
        const u64 keep = ~clr;

        if (system_supports_pmuv3()) {
                events->events_host &= keep;
                events->events_guest &= keep;
        }
}

/*
 * Read the event type/filter register selected by @idx directly:
 * ARMV8_PMU_CYCLE_IDX reads PMCCFILTR, ARMV8_PMU_INSTR_IDX reads
 * PMICFILTR, and any other index reads PMEVTYPER<idx>.
 */
static u64 kvm_vcpu_pmu_read_evtype_direct(int idx)
{
        switch (idx) {
        case ARMV8_PMU_CYCLE_IDX:
                return read_pmccfiltr();
        case ARMV8_PMU_INSTR_IDX:
                return read_pmicfiltr();
        default:
                return read_pmevtypern(idx);
        }
}

/*
 * Write a value direct to PMEVTYPER<idx> where idx is 0-30
 * or PMxCFILTR_EL0 where idx is 31-32.
 */
static void kvm_vcpu_pmu_write_evtype_direct(int idx, u32 val)
{
        if (idx == ARMV8_PMU_CYCLE_IDX)
                write_pmccfiltr(val);
        else if (idx == ARMV8_PMU_INSTR_IDX)
                write_pmicfiltr(val);
        else
                write_pmevtypern(idx, val);
}

/*
 * For every counter whose bit is set in @events, clear
 * ARMV8_PMU_EXCLUDE_EL0 in its event type register so that the event
 * counts at EL0.
 */
static void kvm_vcpu_pmu_enable_el0(unsigned long events)
{
        u32 idx;

        for_each_set_bit(idx, &events, ARMPMU_MAX_HWEVENTS) {
                u64 typer = kvm_vcpu_pmu_read_evtype_direct(idx);

                kvm_vcpu_pmu_write_evtype_direct(idx,
                                                 typer & ~ARMV8_PMU_EXCLUDE_EL0);
        }
}

/*
 * For every counter whose bit is set in @events, set
 * ARMV8_PMU_EXCLUDE_EL0 in its event type register so that the event
 * no longer counts at EL0.
 */
static void kvm_vcpu_pmu_disable_el0(unsigned long events)
{
        u32 idx;

        for_each_set_bit(idx, &events, ARMPMU_MAX_HWEVENTS) {
                u64 typer = kvm_vcpu_pmu_read_evtype_direct(idx);

                kvm_vcpu_pmu_write_evtype_direct(idx,
                                                 typer | ARMV8_PMU_EXCLUDE_EL0);
        }
}

/*
 * On VHE, make sure only guest events have EL0 counting enabled: EL0 is
 * enabled for the tracked guest events and disabled for the tracked
 * host events.
 *
 * This is called from both vcpu_{load,put} and the sysreg handling.
 * The latter is preemptible, so preemption is disabled around the
 * per-CPU access and the register updates.
 *
 * No-op when the system lacks PMUv3 or does not run with VHE.
 */
void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu_events *pmu;
        u64 guest_mask, host_mask;

        if (!(system_supports_pmuv3() && has_vhe()))
                return;

        preempt_disable();

        /* Snapshot both bitmaps before touching any register. */
        pmu = kvm_get_pmu_events();
        guest_mask = pmu->events_guest;
        host_mask = pmu->events_host;

        kvm_vcpu_pmu_enable_el0(guest_mask);
        kvm_vcpu_pmu_disable_el0(host_mask);

        preempt_enable();
}

/*
 * On VHE, make sure only host events have EL0 counting enabled: EL0 is
 * enabled for the tracked host events and disabled for the tracked
 * guest events.
 *
 * No-op when the system lacks PMUv3 or does not run with VHE.
 */
void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu)
{
        struct kvm_pmu_events *pmu;
        u64 guest_mask, host_mask;

        if (!(system_supports_pmuv3() && has_vhe()))
                return;

        /* Snapshot both bitmaps before touching any register. */
        pmu = kvm_get_pmu_events();
        guest_mask = pmu->events_guest;
        host_mask = pmu->events_host;

        kvm_vcpu_pmu_enable_el0(host_mask);
        kvm_vcpu_pmu_disable_el0(guest_mask);
}

/*
 * With VHE, PMUSERENR_EL0 is switched to the guest's value on
 * vcpu_load() and the host EL0 value is restored on vcpu_put(), before
 * returning to userspace. While the guest value is loaded on this pCPU,
 * the host's value therefore has to be tracked in the host context
 * instead of being written to the register.
 * This isn't necessary for nVHE, as the register is context switched
 * for every guest enter/exit.
 *
 * Stores @val as the host context's PMUSERENR_EL0 and returns true when
 * KVM takes care of the register, i.e. PMUv3 and VHE are in use and the
 * running vCPU has the PMUSERENR_ON_CPU flag set. Returns false
 * otherwise, without storing anything.
 */
bool kvm_set_pmuserenr(u64 val)
{
        struct kvm_cpu_context *hctxt;
        struct kvm_vcpu *vcpu;

        if (!(system_supports_pmuv3() && has_vhe()))
                return false;

        vcpu = kvm_get_running_vcpu();
        if (!vcpu)
                return false;
        if (!vcpu_get_flag(vcpu, PMUSERENR_ON_CPU))
                return false;

        hctxt = host_data_ptr(host_ctxt);
        ctxt_sys_reg(hctxt, PMUSERENR_EL0) = val;

        return true;
}

/*
 * If the guest was interrupted to update the host PMU context, make
 * sure the guest EL0 state gets re-applied: when running with VHE, in
 * interrupt context and with a vCPU currently running on this CPU,
 * raise KVM_REQ_RESYNC_PMU_EL0 on that vCPU. Otherwise do nothing.
 */
void kvm_vcpu_pmu_resync_el0(void)
{
        struct kvm_vcpu *vcpu;

        if (!(has_vhe() && in_interrupt()))
                return;

        vcpu = kvm_get_running_vcpu();
        if (vcpu)
                kvm_make_request(KVM_REQ_RESYNC_PMU_EL0, vcpu);
}



















































































  314 
    2 










































  331 
   68 



















  331 




  331 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_BL_H
#define _LINUX_LIST_BL_H

#include <linux/list.h>
#include <linux/bit_spinlock.h>

/*
 * Special version of lists, where head of the list has a lock in the lowest
 * bit. This is useful for scalable hash tables without increasing memory
 * footprint overhead.
 *
 * For modification operations, the 0 bit of hlist_bl_head->first
 * pointer must be set.
 *
 * With some small modifications, this can easily be adapted to store several
 * arbitrary bits (not just a single lock bit), if the need arises to store
 * some fast and compact auxiliary data.
 */

/*
 * Mask of the bit(s) in hlist_bl_head->first that are not part of the
 * pointer: bit 0 on SMP or spinlock-debug builds, nothing otherwise.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
#define LIST_BL_LOCKMASK        1UL
#else
#define LIST_BL_LOCKMASK        0UL
#endif

/* Sanity checks that compile to BUG_ON() only with CONFIG_DEBUG_LIST. */
#ifdef CONFIG_DEBUG_LIST
#define LIST_BL_BUG_ON(x) BUG_ON(x)
#else
#define LIST_BL_BUG_ON(x)
#endif


/*
 * List head. The low bit of 'first' (LIST_BL_LOCKMASK) doubles as the
 * lock bit, so the raw value must be masked before use as a pointer;
 * see hlist_bl_first().
 */
struct hlist_bl_head {
        struct hlist_bl_node *first;
};

/*
 * List node: 'next' points at the following node (or NULL), 'pprev' at
 * the pointer that points to this node (the previous node's 'next' or
 * the head's 'first'). A NULL 'pprev' marks the node as unhashed.
 */
struct hlist_bl_node {
        struct hlist_bl_node *next, **pprev;
};
/* Initialise a list head to the empty (and unlocked) state. */
#define INIT_HLIST_BL_HEAD(ptr) \
        ((ptr)->first = NULL)

/* Reset a node to the unhashed state: both link pointers become NULL. */
static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h)
{
        h->pprev = NULL;
        h->next = NULL;
}

/* Get the structure of @type that embeds the node @ptr as its @member. */
#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member)

/* Return true when the node is not on any list, i.e. its pprev is NULL. */
static inline bool  hlist_bl_unhashed(const struct hlist_bl_node *h)
{
        return h->pprev == NULL;
}

/*
 * Return the first node of the list, or NULL if it is empty, with the
 * lock bit masked out of the head pointer.
 */
static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)h->first;

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/*
 * Make @n the first node of @h. The stored value always has the lock
 * bit set. With list debugging enabled this checks that @n itself does
 * not have the lock bit set and that the head currently has it set.
 */
static inline void hlist_bl_set_first(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        unsigned long tagged = (unsigned long)n | LIST_BL_LOCKMASK;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);
        h->first = (struct hlist_bl_node *)tagged;
}

/*
 * Return true when the list has no nodes. The head is sampled once with
 * READ_ONCE() and the lock bit is ignored.
 */
static inline bool hlist_bl_empty(const struct hlist_bl_head *h)
{
        unsigned long raw = (unsigned long)READ_ONCE(h->first);

        return (raw & ~LIST_BL_LOCKMASK) == 0;
}

/*
 * Insert @n at the front of list @h. The previous first node, if any,
 * is re-linked behind @n, and the head is updated last through
 * hlist_bl_set_first().
 */
static inline void hlist_bl_add_head(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        struct hlist_bl_node *old_first = hlist_bl_first(h);

        n->next = old_first;
        if (old_first != NULL)
                old_first->pprev = &n->next;
        n->pprev = &h->first;

        hlist_bl_set_first(h, n);
}

/*
 * Insert @n immediately before @next. @n is fully linked first; the
 * pointer that previously led to @next is then redirected to @n with a
 * single WRITE_ONCE().
 */
static inline void hlist_bl_add_before(struct hlist_bl_node *n,
                                       struct hlist_bl_node *next)
{
        struct hlist_bl_node **slot = next->pprev;
        uintptr_t lock_bit;

        n->pprev = slot;
        n->next = next;
        next->pprev = &n->next;

        /* slot may be `first`, so be careful not to lose the lock bit */
        lock_bit = (uintptr_t)*slot & LIST_BL_LOCKMASK;
        WRITE_ONCE(*slot, (struct hlist_bl_node *)((uintptr_t)n | lock_bit));
}

/*
 * Insert @n immediately after @prev. If @prev had a successor, that
 * node's back-pointer is updated to refer to @n.
 */
static inline void hlist_bl_add_behind(struct hlist_bl_node *n,
                                       struct hlist_bl_node *prev)
{
        n->next = prev->next;
        n->pprev = &prev->next;
        prev->next = n;

        if (n->next != NULL)
                n->next->pprev = &n->next;
}

/*
 * Unlink @n from its list without touching @n's own pointers. The
 * pointer that led to @n is redirected to @n's successor with a single
 * WRITE_ONCE(), then the successor's back-pointer is fixed up.
 */
static inline void __hlist_bl_del(struct hlist_bl_node *n)
{
        struct hlist_bl_node **slot = n->pprev;
        struct hlist_bl_node *succ = n->next;
        unsigned long lock_bit;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);

        /* slot may be `first`, so be careful not to lose the lock bit */
        lock_bit = (unsigned long)*slot & LIST_BL_LOCKMASK;
        WRITE_ONCE(*slot,
                   (struct hlist_bl_node *)((unsigned long)succ | lock_bit));
        if (succ != NULL)
                succ->pprev = slot;
}

/*
 * Remove @n from its list and poison its link pointers with
 * LIST_POISON1 / LIST_POISON2. The node must currently be on a list,
 * since its pprev is dereferenced unconditionally.
 */
static inline void hlist_bl_del(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);

        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/*
 * Remove @n from its list, if it is on one, and reinitialise it to the
 * unhashed state. Does nothing for a node that is already unhashed.
 */
static inline void hlist_bl_del_init(struct hlist_bl_node *n)
{
        if (hlist_bl_unhashed(n))
                return;

        __hlist_bl_del(n);
        INIT_HLIST_BL_NODE(n);
}

/* Take the list lock: bit_spin_lock() on bit 0 of the head word. */
static inline void hlist_bl_lock(struct hlist_bl_head *b)
{
        bit_spin_lock(0, (unsigned long *)b);
}

/* Release the list lock: __bit_spin_unlock() on bit 0 of the head word. */
static inline void hlist_bl_unlock(struct hlist_bl_head *b)
{
        __bit_spin_unlock(0, (unsigned long *)b);
}

/* Report whether the list lock (bit 0 of the head word) is currently held. */
static inline bool hlist_bl_is_locked(struct hlist_bl_head *b)
{
        return bit_spin_is_locked(0, (unsigned long *)b);
}

/**
 * hlist_bl_for_each_entry        - iterate over list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The next node is read from @pos after the loop body has run, so the
 * body must not remove the current entry; use
 * hlist_bl_for_each_entry_safe() for that.
 */
#define hlist_bl_for_each_entry(tpos, pos, head, member)                \
        for (pos = hlist_bl_first(head);                                \
             pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = pos->next)

/**
 * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @n:                another &struct hlist_bl_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The successor is saved in @n before the loop body runs, so the body
 * may remove the current entry.
 */
#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member)         \
        for (pos = hlist_bl_first(head);                                 \
             pos && ({ n = pos->next; 1; }) &&                                  \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \
             pos = n)

#endif











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 
    3 





































    3 


































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                PACKET - implements raw packet sockets.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *                Alan Cox        :        verify_area() now used correctly
 *                Alan Cox        :        new skbuff lists, look ma no backlogs!
 *                Alan Cox        :        tidied skbuff lists.
 *                Alan Cox        :        Now uses generic datagram routines I
 *                                        added. Also fixed the peek/read crash
 *                                        from all old Linux datagram code.
 *                Alan Cox        :        Uses the improved datagram code.
 *                Alan Cox        :        Added NULL's for socket options.
 *                Alan Cox        :        Re-commented the code.
 *                Alan Cox        :        Use new kernel side addressing
 *                Rob Janssen        :        Correct MTU usage.
 *                Dave Platt        :        Counter leaks caused by incorrect
 *                                        interrupt locking and some slightly
 *                                        dubious gcc output. Can you read
 *                                        compiler: it said _VOLATILE_
 *        Richard Kooijman        :        Timestamp fixes.
 *                Alan Cox        :        New buffers. Use sk->mac.raw.
 *                Alan Cox        :        sendmsg/recvmsg support.
 *                Alan Cox        :        Protocol setting support
 *        Alexey Kuznetsov        :        Untied from IPv4 stack.
 *        Cyrus Durgin                :        Fixed kerneld for kmod.
 *        Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *                Eric Biederman        :        Allow for > 8 byte hardware addresses.
 *                                        The convention is that longer addresses
 *                                        will simply extend the hardware address
 *                                        byte arrays at the end of sockaddr_ll
 *                                        and packet_mreq.
 *                Johann Baudy        :        Added TX RING.
 *                Chetan Loke        :        Implemented TPACKET_V3 block abstraction
 *                                        layer.
 *                                        Copyright (C) 2011, <lokec@ccs.neu.edu>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/ethtool.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/errqueue.h>
#include <linux/net_tstamp.h>
#include <linux/percpu.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif
#include <linux/bpf.h>
#include <net/compat.h>
#include <linux/netfilter_netdev.h>

#include "internal.h"

/*
   Assumptions:
   - If the device has no dev->header_ops->create, there is no LL header
     visible above the device. In this case, its hard_header_len should be 0.
     The device may prepend its own header internally. In this case, its
     needed_headroom should be set to the space needed for it to add its
     internal header.
     For example, a WiFi driver pretending to be an Ethernet driver should
     set its hard_header_len to be the Ethernet header length, and set its
     needed_headroom to be (the real WiFi header length - the fake Ethernet
     header length).
   - packet socket receives packets with pulled ll header,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> data

Outgoing, dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

Incoming, dev_has_header(dev) == false
   mac_header -> data
     However drivers often make it point to the ll header.
     This is incorrect because the ll header should be invisible to us.
   data       -> data

Outgoing, dev_has_header(dev) == false
   mac_header -> data. ll header is invisible to us.
   data       -> data

Summary
  If dev_has_header(dev) == false we are unable to restore the ll header,
    because it is invisible to us.


On transmit:
------------

dev_has_header(dev) == true
   mac_header -> ll header
   data       -> ll header

dev_has_header(dev) == false (ll header is invisible to us)
   mac_header -> data
   data       -> data

   We should set network_header on output to the correct position,
   packet classifier depends on it.
 */

/* Private packet socket structures. */

/*
 * Identical to struct packet_mreq except that it has a longer address
 * field: mr_address is sized by MAX_ADDR_LEN.
 *
 * NOTE(review): the meaning of the individual members presumably follows
 * struct packet_mreq in <linux/if_packet.h> -- confirm there.
 */
struct packet_mreq_max {
        int                mr_ifindex;
        unsigned short        mr_type;
        unsigned short        mr_alen;
        unsigned char        mr_address[MAX_ADDR_LEN];
};

/*
 * A single ring-frame pointer viewed through each TPACKET header layout.
 * Users in this file store the frame address in .raw and then select
 * h1, h2 or h3 according to po->tp_version.
 */
union tpacket_uhdr {
        struct tpacket_hdr  *h1;
        struct tpacket2_hdr *h2;
        struct tpacket3_hdr *h3;
        void *raw;
};

/* Forward declaration; the static definition follows later in this file. */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                int closing, int tx_ring);

/* Alignment, in bytes, used by the block-layout size macros below. */
#define V3_ALIGNMENT        (8)

/* sizeof(struct tpacket_block_desc) rounded up to V3_ALIGNMENT. */
#define BLK_HDR_LEN        (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))

/* BLK_HDR_LEN plus a private area of sz_of_priv bytes, the latter also
 * rounded up to V3_ALIGNMENT.
 */
#define BLK_PLUS_PRIV(sz_of_priv) \
        (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))

/* Field accessors. Each expands to an lvalue inside the object x points to:
 * the first five reach into x->hdr.bh1, BLOCK_O2PRIV reads x->offset_to_priv
 * directly.
 */
#define BLOCK_STATUS(x)        ((x)->hdr.bh1.block_status)
#define BLOCK_NUM_PKTS(x)        ((x)->hdr.bh1.num_pkts)
#define BLOCK_O2FP(x)                ((x)->hdr.bh1.offset_to_first_pkt)
#define BLOCK_LEN(x)                ((x)->hdr.bh1.blk_len)
#define BLOCK_SNUM(x)                ((x)->hdr.bh1.seq_num)
#define BLOCK_O2PRIV(x)        ((x)->offset_to_priv)

/*
 * Forward declarations of file-local (static) functions so that they can
 * be referenced before their definitions; e.g. packet_pick_tx_queue() is
 * called from packet_xmit() ahead of its definition.
 */
struct packet_sock;
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev);

static void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status);
static void packet_increment_head(struct packet_ring_buffer *buff);
static int prb_curr_blk_in_use(struct tpacket_block_desc *);
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
                        struct packet_sock *);
static void prb_retire_current_block(struct tpacket_kbdq_core *,
                struct packet_sock *, unsigned int status);
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
                struct tpacket_block_desc *);
static void prb_retire_rx_blk_timer_expired(struct timer_list *);
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
                struct tpacket3_hdr *);
static void packet_flush_mclist(struct sock *sk);
static u16 packet_pick_tx_queue(struct sk_buff *skb);

/*
 * Packet-socket view of an skb's control buffer; PACKET_SKB_CB() casts
 * skb->cb to this type.  The sa member holds either a sockaddr_pkt or a
 * sockaddr_ll, and origlen shares storage with the start of the
 * sockaddr_ll (see the note inside).
 */
struct packet_skb_cb {
        union {
                struct sockaddr_pkt pkt;
                union {
                        /* Trick: alias skb original length with
                         * ll.sll_family and ll.protocol in order
                         * to save room.
                         */
                        unsigned int origlen;
                        struct sockaddr_ll ll;
                };
        } sa;
};

/* Shorthand for virtio_legacy_is_little_endian(). */
#define vio_le() virtio_legacy_is_little_endian()

/* View __skb->cb as a struct packet_skb_cb. */
#define PACKET_SKB_CB(__skb)        ((struct packet_skb_cb *)((__skb)->cb))

/* Address of the prb_bdqc member of x, typed as struct tpacket_kbdq_core *. */
#define GET_PBDQC_FROM_RB(x)        ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
/* Buffer of entry bid in x->pkbdq[], typed as a block descriptor. */
#define GET_PBLOCK_DESC(x, bid)        \
        ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
/* Same as above for the entry indexed by x->kactive_blk_num. */
#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)        \
        ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
/* Index following x->kactive_blk_num, wrapping to 0 once the current index
 * is no longer below x->knum_blocks - 1.
 */
#define GET_NEXT_PRB_BLK_NUM(x) \
        (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
        ((x)->kactive_blk_num+1) : 0)

/* Forward declarations; used by __unregister_prot_hook() and
 * __register_prot_hook() respectively when po->fanout is set.
 */
static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
static void __fanout_link(struct sock *sk, struct packet_sock *po);

#ifdef CONFIG_NETFILTER_EGRESS
/*
 * Run nf_hook_egress() over every skb on the ->next-linked list that
 * starts at @skb and rebuild the list from the skbs the hook lets through.
 *
 * Each skb is passed through skb_mark_not_on_list() before the hook is
 * called on it with its own skb->dev.  An skb for which nf_hook_egress()
 * yields a zero/NULL result is left out of the returned list; this function
 * does nothing further with it.  The remaining skbs are chained through
 * ->next in their original relative order.
 *
 * The whole walk runs between rcu_read_lock() and rcu_read_unlock().
 *
 * Return: the first skb that passed the hook, or NULL when none did
 * (including when @skb was NULL to begin with).
 */
static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb)
{
        struct sk_buff *first = NULL, *last, *upcoming;
        int verdict;

        rcu_read_lock();
        while (skb != NULL) {
                /* Fetch the successor before skb is taken off the list. */
                upcoming = skb->next;
                skb_mark_not_on_list(skb);

                if (nf_hook_egress(skb, &verdict, skb->dev)) {
                        /* "last" is only read once "first" has been set. */
                        if (first)
                                last->next = skb;
                        else
                                first = skb;

                        last = skb;
                }

                skb = upcoming;
        }
        rcu_read_unlock();

        return first;
}
#endif

/*
 * Transmit @skb on behalf of packet socket @po.
 *
 * Without PACKET_SOCK_QDISC_BYPASS set on @po the skb is handed to
 * dev_queue_xmit().  With the flag set it goes to dev_direct_xmit() on the
 * queue returned by packet_pick_tx_queue(); when CONFIG_NETFILTER_EGRESS is
 * enabled and nf_hook_egress_active() is true, the skb is first passed
 * through nf_hook_direct_egress().
 *
 * Return: the value of the transmit function used, or NET_XMIT_DROP when
 * nf_hook_direct_egress() returned NULL.
 */
static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
{
        if (packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) {
#ifdef CONFIG_NETFILTER_EGRESS
                if (nf_hook_egress_active()) {
                        skb = nf_hook_direct_egress(skb);
                        if (skb == NULL)
                                return NET_XMIT_DROP;
                }
#endif
                return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
        }

        return dev_queue_xmit(skb);
}

/*
 * Read po->cached_dev under rcu_read_lock() and call dev_hold() on it
 * before leaving the read-side critical section.
 *
 * Return: the pointer that was read.  It can be NULL, since
 * packet_cached_dev_reset() stores NULL in po->cached_dev.
 * NOTE(review): this relies on dev_hold() accepting NULL -- confirm.
 */
static struct net_device *packet_cached_dev_get(struct packet_sock *po)
{
        struct net_device *cached;

        rcu_read_lock();
        cached = rcu_dereference(po->cached_dev);
        dev_hold(cached);
        rcu_read_unlock();

        return cached;
}

/* Publish @dev as po->cached_dev using rcu_assign_pointer(). */
static void packet_cached_dev_assign(struct packet_sock *po,
                                     struct net_device *dev)
{
        rcu_assign_pointer(po->cached_dev, dev);
}

/* Set po->cached_dev to NULL using RCU_INIT_POINTER(). */
static void packet_cached_dev_reset(struct packet_sock *po)
{
        RCU_INIT_POINTER(po->cached_dev, NULL);
}

/*
 * Choose the TX queue index for @skb on skb->dev.
 *
 * Side effects on @skb: with CONFIG_XPS, skb->sender_cpu is set to the
 * current CPU number plus one; in all configurations the value
 * (cpu % dev->real_num_tx_queues) is recorded via skb_record_rx_queue().
 *
 * If the device provides ndo_select_queue, its answer is passed through
 * netdev_cap_txqueue(); otherwise netdev_pick_tx() decides.
 *
 * Return: the selected queue index.
 */
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        const struct net_device_ops *ops = dev->netdev_ops;
        int cpu = raw_smp_processor_id();

#ifdef CONFIG_XPS
        skb->sender_cpu = cpu + 1;
#endif
        skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);

        if (!ops->ndo_select_queue)
                return netdev_pick_tx(dev, skb, NULL);

        return netdev_cap_txqueue(dev, ops->ndo_select_queue(dev, skb, NULL));
}

/*
 * Attach the socket's protocol hook unless PACKET_SOCK_RUNNING is already
 * set.  A socket with po->fanout goes through __fanout_link(); any other
 * socket has po->prot_hook added with dev_add_pack().  On attach the socket
 * gains a reference (sock_hold()) and PACKET_SOCK_RUNNING is set.
 *
 * __register_prot_hook must be invoked through register_prot_hook
 * or from a context in which asynchronous accesses to the packet
 * socket is not possible (packet_create()).
 */
static void __register_prot_hook(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);

        /* Already attached: nothing to do. */
        if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
                return;

        if (po->fanout)
                __fanout_link(sk, po);
        else
                dev_add_pack(&po->prot_hook);

        sock_hold(sk);
        packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
}

static void register_prot_hook(struct sock *sk)
{
        lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
        __register_prot_hook(sk);
}

/*
 * Detach the socket's protocol hook.  Must be called with po->bind_lock
 * held (asserted via lockdep); callers are expected to have checked
 * PACKET_SOCK_RUNNING, as unregister_prot_hook() does.
 *
 * If the sync parameter is true, we will temporarily drop
 * the po->bind_lock and do a synchronize_net to make sure no
 * asynchronous packet processing paths still refer to the elements
 * of po->prot_hook.  If the sync parameter is false, it is the
 * caller's responsibility to take care of this.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
        struct packet_sock *po = pkt_sk(sk);

        lockdep_assert_held_once(&po->bind_lock);

        /* Clear the flag before the hook is actually removed. */
        packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);

        /* Mirror of the attach step in __register_prot_hook(). */
        if (po->fanout)
                __fanout_unlink(sk, po);
        else
                __dev_remove_pack(&po->prot_hook);

        /* Counterpart of the sock_hold() done in __register_prot_hook(). */
        __sock_put(sk);

        if (sync) {
                /* bind_lock is released around synchronize_net() and
                 * re-taken afterwards, so state guarded by it may have
                 * changed by the time this function returns.
                 */
                spin_unlock(&po->bind_lock);
                synchronize_net();
                spin_lock(&po->bind_lock);
        }
}

/*
 * Detach the socket's protocol hook if PACKET_SOCK_RUNNING is set;
 * otherwise do nothing.  @sync is forwarded to __unregister_prot_hook().
 */
static void unregister_prot_hook(struct sock *sk, bool sync)
{
        if (!packet_sock_flag(pkt_sk(sk), PACKET_SOCK_RUNNING))
                return;

        __unregister_prot_hook(sk, sync);
}

/*
 * Translate @addr to its struct page: vmalloc_to_page() when
 * is_vmalloc_addr() says the address is a vmalloc one, virt_to_page()
 * otherwise.
 */
static inline struct page * __pure pgv_to_page(void *addr)
{
        return is_vmalloc_addr(addr) ? vmalloc_to_page(addr)
                                     : virt_to_page(addr);
}

/*
 * Store @status in the tp_status field of the ring frame at @frame.
 *
 * The header layout (tpacket_hdr, tpacket2_hdr or tpacket3_hdr) is chosen
 * from po->tp_version.  After the store, the data-cache page holding the
 * status field is flushed and smp_wmb() is issued.  An unknown version
 * triggers WARN() followed by BUG().
 */
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union tpacket_uhdr h;

        /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                WRITE_ONCE(h.h1->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                WRITE_ONCE(h.h2->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        case TPACKET_V3:
                WRITE_ONCE(h.h3->tp_status, status);
                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
                break;
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
        }

        /* Write barrier after the status store and flush. */
        smp_wmb();
}

/*
 * Read the tp_status field of the ring frame at @frame.
 *
 * smp_rmb() is issued first; then, for the header layout selected by
 * po->tp_version, the data-cache page holding the status field is flushed
 * and the field is read with READ_ONCE().  An unknown version triggers
 * WARN() followed by BUG().
 *
 * Return: the status value read from the frame header.
 */
static int __packet_get_status(const struct packet_sock *po, void *frame)
{
        union tpacket_uhdr h;

        /* Read barrier ahead of the status load. */
        smp_rmb();

        /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(pgv_to_page(&h.h1->tp_status));
                return READ_ONCE(h.h1->tp_status);
        case TPACKET_V2:
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return READ_ONCE(h.h2->tp_status);
        case TPACKET_V3:
                flush_dcache_page(pgv_to_page(&h.h3->tp_status));
                return READ_ONCE(h.h3->tp_status);
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
                return 0;
        }
}

/*
 * Pick a timestamp for @skb according to @flags and store it in @ts.
 *
 * The hardware timestamp is tried first, and only when @flags contains
 * SOF_TIMESTAMPING_RAW_HARDWARE; the skb's software timestamp is tried
 * next, and only when @flags contains SOF_TIMESTAMPING_SOFTWARE.  A
 * candidate counts when ktime_to_timespec64_cond() returns true for it,
 * which is also the call that fills @ts.
 *
 * Return: TP_STATUS_TS_RAW_HARDWARE or TP_STATUS_TS_SOFTWARE for the
 * source that was used, or 0 when neither applied.
 */
static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
                                   unsigned int flags)
{
        struct skb_shared_hwtstamps *hw = skb_hwtstamps(skb);

        if (hw && (flags & SOF_TIMESTAMPING_RAW_HARDWARE)) {
                if (ktime_to_timespec64_cond(hw->hwtstamp, ts))
                        return TP_STATUS_TS_RAW_HARDWARE;
        }

        if (flags & SOF_TIMESTAMPING_SOFTWARE) {
                if (ktime_to_timespec64_cond(skb_tstamp(skb), ts))
                        return TP_STATUS_TS_SOFTWARE;
        }

        return 0;
}

static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
                                    struct sk_buff *skb)
{
        union tpacket_uhdr h;
        struct timespec64 ts;
        __u32 ts_status;

        if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp))))
                return 0;

        h.raw = frame;
        /*
         * versions 1 through 3 overflow the timestamps in y2106, since they
         * all store the seconds in a 32-bit unsigned integer.
         * If we create a version 4, that should have a 64-bit timestamp,
         * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
         * nanoseconds.
         */
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_sec = ts.tv_sec;
                h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
                break;
        case TPACKET_V2:
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                break;
        case TPACKET_V3:
                h.h3->tp_sec = ts.tv_sec;
                h.h3->tp_nsec = ts.tv_nsec;
                break;
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
        }

        /* one flush is safe, as both fields always lie on the same cacheline */
        flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
        smp_wmb();

        return ts_status;
}

/*
 * Locate frame number @position in ring @rb and return it only when its
 * current status equals @status; return NULL otherwise.
 *
 * The frame number is split into an index into rb->pg_vec and a frame slot
 * inside that block, using rb->frames_per_block and rb->frame_size.
 */
static void *packet_lookup_frame(const struct packet_sock *po,
                                 const struct packet_ring_buffer *rb,
                                 unsigned int position,
                                 int status)
{
        unsigned int block = position / rb->frames_per_block;
        unsigned int slot = position % rb->frames_per_block;
        void *frame = rb->pg_vec[block].buffer + (slot * rb->frame_size);

        if (__packet_get_status(po, frame) != status)
                return NULL;

        return frame;
}

/*
 * Return the frame at the ring's current head (rb->head) if its status
 * equals @status, or NULL otherwise; see packet_lookup_frame().
 */
static void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

/*
 * Read the VLAN TCI (host byte order) from the VLAN header that follows the
 * link-layer header of @skb.
 *
 * Returns 0 when @dev is NULL, when the link-layer header length cannot be
 * determined, or when the VLAN header cannot be read from the skb.
 */
static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev)
{
        struct vlan_hdr vhdr, *vh;
        unsigned int header_len;

        if (!dev)
                return 0;

        /* In the SOCK_DGRAM scenario, skb data starts at the network
         * protocol, which is after the VLAN headers. The outer VLAN
         * header is at the hard_header_len offset in non-variable
         * length link layer headers. If it's a VLAN device, the
         * min_header_len should be used to exclude the VLAN header
         * size.
         */
        if (dev->min_header_len != dev->hard_header_len) {
                if (!is_vlan_dev(dev))
                        return 0;
                header_len = dev->min_header_len;
        } else {
                header_len = dev->hard_header_len;
        }

        vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len,
                                sizeof(vhdr), &vhdr);
        if (unlikely(!vh))
                return 0;

        return ntohs(vh->h_vlan_TCI);
}

/*
 * Return the protocol of @skb.  When skb->protocol is itself a VLAN
 * ethertype, the result of __vlan_get_protocol_offset(), evaluated from the
 * skb's MAC offset, is returned instead.
 */
static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
{
        __be16 proto = skb->protocol;

        if (!unlikely(eth_type_vlan(proto)))
                return proto;

        return __vlan_get_protocol_offset(skb, proto,
                                          skb_mac_offset(skb), NULL);
}

/* Synchronously delete the block-retire timer embedded in @pkc. */
static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
        timer_delete_sync(&pkc->retire_blk_timer);
}

/*
 * Stop the RX ring's block-retire timer.
 *
 * delete_blk_timer is raised while holding @rb_queue's lock; the timer
 * handler checks that flag and returns without re-arming.  The timer is
 * then deleted synchronously.
 */
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
                struct sk_buff_head *rb_queue)
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

        spin_lock_bh(&rb_queue->lock);
        pkc->delete_blk_timer = 1;
        spin_unlock_bh(&rb_queue->lock);

        prb_del_retire_blk_timer(pkc);
}

/*
 * Initialise the RX ring's block-retire timer with
 * prb_retire_rx_blk_timer_expired() as its handler and preset its expiry
 * to the current jiffies.  The timer is not armed here.
 */
static void prb_setup_retire_blk_timer(struct packet_sock *po)
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);

        timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0);
        pkc->retire_blk_timer.expires = jiffies;
}

/*
 * Derive a block-retire timeout from the link speed of the bound device and
 * the block size.
 *
 * @po:                packet socket; po->ifindex selects the device.
 * @blk_size_in_bytes: size of one ring block in bytes.
 *
 * Returns DEFAULT_PRB_RETIRE_TOV when the device cannot be found, its link
 * settings cannot be read, or the reported speed is unknown or below
 * SPEED_1000.  Otherwise returns a value derived from the block size in
 * megabits divided by the link speed in units of 1000, plus one.  The caller
 * in init_prb_bdqc() feeds the result to msecs_to_jiffies(), so the unit is
 * milliseconds.
 */
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
                                int blk_size_in_bytes)
{
        struct ethtool_link_ksettings ecmd;
        struct net_device *dev;
        unsigned int mbits, div;
        int err;

        /* Device lookup and link query are both done under the RTNL lock. */
        rtnl_lock();
        dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
        if (unlikely(!dev)) {
                rtnl_unlock();
                return DEFAULT_PRB_RETIRE_TOV;
        }
        err = __ethtool_get_link_ksettings(dev, &ecmd);
        rtnl_unlock();
        if (err)
                return DEFAULT_PRB_RETIRE_TOV;

        /* If the link speed is so slow you don't really
         * need to worry about perf anyways
         */
        if (ecmd.base.speed < SPEED_1000 ||
            ecmd.base.speed == SPEED_UNKNOWN)
                return DEFAULT_PRB_RETIRE_TOV;

        div = ecmd.base.speed / 1000;

        /*
         * Block size in megabits.  (bytes * 8) / 2^20 equals bytes / 2^17
         * exactly, and dividing first avoids the signed overflow that
         * "blk_size_in_bytes * 8" hits for blocks of 256 MiB or more.
         * The cast treats the size as unsigned; the visible caller passes
         * req3.tp_block_size (presumably an unsigned field -- confirm).
         */
        mbits = (unsigned int)blk_size_in_bytes / (1024 * 1024 / 8);

        if (!div)
                return mbits;

        return mbits / div + 1;
}

/*
 * Copy the feature request word from the user's ring request (req3) into
 * the block-descriptor core; prb_run_all_ft_ops() tests it per packet.
 */
static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
                        union tpacket_req_u *req_u)
{
        p1->feature_req_word = req_u->req3.tp_feature_req_word;
}

/*
 * Initialise the block-descriptor queue core of ring @rb from the user's
 * req3 request and open the first block.
 *
 * The core is zeroed, then filled from @pg_vec, @req_u and @po.  When the
 * request carries no retire timeout, one is derived with
 * prb_calc_retire_blk_tmo().  Finally the retire timer is set up and block 0
 * is opened.
 *
 * NOTE(review): the timer is set up on po->rx_ring while everything else
 * operates on @rb -- presumably the same ring; confirm against the caller.
 */
static void init_prb_bdqc(struct packet_sock *po,
                        struct packet_ring_buffer *rb,
                        struct pgv *pg_vec,
                        union tpacket_req_u *req_u)
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);

        memset(pkc, 0x0, sizeof(*pkc));

        /* Ring geometry and identity */
        pkc->knxt_seq_num = 1;
        pkc->pkbdq = pg_vec;
        pkc->pkblk_start = pg_vec[0].buffer;
        pkc->kblk_size = req_u->req3.tp_block_size;
        pkc->knum_blocks = req_u->req3.tp_block_nr;
        pkc->hdrlen = po->tp_hdrlen;
        pkc->version = po->tp_version;
        pkc->last_kactive_blk_num = 0;
        po->stats.stats3.tp_freeze_q_cnt = 0;

        /* Retire timeout: user supplied, else derived from the link speed */
        if (req_u->req3.tp_retire_blk_tov) {
                pkc->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
        } else {
                pkc->retire_blk_tov =
                        prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size);
        }
        pkc->tov_in_jiffies = msecs_to_jiffies(pkc->retire_blk_tov);
        pkc->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
        rwlock_init(&pkc->blk_fill_in_prog_lock);

        pkc->max_frame_len = pkc->kblk_size - BLK_PLUS_PRIV(pkc->blk_sizeof_priv);
        prb_init_ft_ops(pkc, req_u);
        prb_setup_retire_blk_timer(po);
        prb_open_block(pkc, (struct tpacket_block_desc *)pg_vec[0].buffer);
}

/*
 * Re-arm the block-retire timer to fire tov_in_jiffies from now, then
 * record the currently active block number in last_kactive_blk_num.
 *
 * Do NOT update the last_blk_num first.
 * Assumes sk_buff_head lock is held.
 */
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
        mod_timer(&pkc->retire_blk_timer,
                        jiffies + pkc->tov_in_jiffies);
        pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}

/*
 * Block-retire timer handler.
 *
 * Runs with the socket's receive-queue lock held for its whole body.  If
 * delete_blk_timer is set it returns without re-arming.  Otherwise, when the
 * active block has not changed since the timer was last refreshed:
 *   - queue not frozen, block empty:     just re-arm the timer;
 *   - queue not frozen, block has data:  retire the block with
 *     TP_STATUS_BLK_TMO and try to dispatch the next one (re-arm here only
 *     if that dispatch fails);
 *   - queue frozen, block still in use:  re-arm the timer;
 *   - queue frozen, block released:      open the block (which thaws the
 *     queue and refreshes the timer as a side effect).
 * When the active block has changed, the timer is simply re-armed.
 *
 * Timer logic:
 * 1) We refresh the timer only when we open a block.
 *    By doing this we don't waste cycles refreshing the timer
 *    on a packet-by-packet basis.
 *
 * With a 1MB block-size, on a 1Gbps line, it will take
 * i) ~8 ms to fill a block + ii) memcpy etc.
 * In this cut we are not accounting for the memcpy time.
 *
 * So, if the user sets the 'tmo' to 10ms then the timer
 * will never fire while the block is still getting filled
 * (which is what we want). However, the user could choose
 * to close a block early and that's fine.
 *
 * But when the timer does fire, we check whether or not to refresh it.
 * Since the tmo granularity is in msecs, it is not too expensive
 * to refresh the timer, let's say every '8' msecs.
 * Either the user can set the 'tmo' or we can derive it based on
 * a) line-speed and b) block-size.
 * prb_calc_retire_blk_tmo() calculates the tmo.
 *
 */
static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
{
        struct packet_sock *po =
                from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        unsigned int frozen;
        struct tpacket_block_desc *pbd;

        spin_lock(&po->sk.sk_receive_queue.lock);

        frozen = prb_queue_frozen(pkc);
        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        /* Shutdown in progress: leave without re-arming the timer. */
        if (unlikely(pkc->delete_blk_timer))
                goto out;

        /* We only need to plug the race when the block is partially filled.
         * tpacket_rcv:
         *                lock(); increment BLOCK_NUM_PKTS; unlock()
         *                copy_bits() is in progress ...
         *                timer fires on other cpu:
         *                we can't retire the current block because copy_bits
         *                is in progress.
         *
         */
        if (BLOCK_NUM_PKTS(pbd)) {
                /* Waiting for skb_copy_bits to finish... */
                write_lock(&pkc->blk_fill_in_prog_lock);
                write_unlock(&pkc->blk_fill_in_prog_lock);
        }

        /* Same block as at the last timer refresh? */
        if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
                if (!frozen) {
                        if (!BLOCK_NUM_PKTS(pbd)) {
                                /* An empty block. Just refresh the timer. */
                                goto refresh_timer;
                        }
                        prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
                        if (!prb_dispatch_next_block(pkc, po))
                                goto refresh_timer;
                        else
                                goto out;
                } else {
                        /* Case 1. Queue was frozen because user-space was
                         *           lagging behind.
                         */
                        if (prb_curr_blk_in_use(pbd)) {
                                /*
                                 * Ok, user-space is still behind.
                                 * So just refresh the timer.
                                 */
                                goto refresh_timer;
                        } else {
                               /* Case 2. Queue was frozen, user-space caught
                                * up, now the link went idle && the timer
                                * fired. We don't have a block to close, so we
                                * open this block and restart the timer.
                                * Opening a block thaws the queue and restarts
                                * the timer; thawing/timer-refresh is a side
                                * effect.
                                */
                                prb_open_block(pkc, pbd);
                                goto out;
                        }
                }
        }

refresh_timer:
        _prb_refresh_rx_retire_blk_timer(pkc);

out:
        spin_unlock(&po->sk.sk_receive_queue.lock);
}

/*
 * Publish a block: store @status into the block descriptor's status word.
 *
 * On architectures where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 the data
 * pages after the first page are flushed (followed by a write barrier)
 * before the status store, and the first page, which holds the block
 * header, is flushed (followed by a write barrier) after it.  On other
 * architectures only the status store remains.
 */
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
                struct tpacket_block_desc *pbd1, __u32 status)
{
        /* Flush everything minus the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        u8 *start, *end;

        start = (u8 *)pbd1;

        /* Skip the block header (we know the header WILL fit in 4K) */
        start += PAGE_SIZE;

        /* Walk page by page up to the page-aligned end of the block */
        end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
        for (; start < end; start += PAGE_SIZE)
                flush_dcache_page(pgv_to_page(start));

        smp_wmb();
#endif

        /* Now update the block status. */

        BLOCK_STATUS(pbd1) = status;

        /* Flush the block header */

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        start = (u8 *)pbd1;
        flush_dcache_page(pgv_to_page(start));

        smp_wmb();
#endif
}

/*
 * Side effect:
 *
 * 1) flush the block
 * 2) Increment active_blk_num
 *
 * Note:We DONT refresh the timer on purpose.
 *        Because almost always the next block will be opened.
 */
static void prb_close_block(struct tpacket_kbdq_core *pkc1,
                struct tpacket_block_desc *pbd1,
                struct packet_sock *po, unsigned int stat)
{
        __u32 status = TP_STATUS_USER | stat;

        struct tpacket3_hdr *last_pkt;
        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
        struct sock *sk = &po->sk;

        if (atomic_read(&po->tp_drops))
                status |= TP_STATUS_LOSING;

        last_pkt = (struct tpacket3_hdr *)pkc1->prev;
        last_pkt->tp_next_offset = 0;

        /* Get the ts of the last pkt */
        if (BLOCK_NUM_PKTS(pbd1)) {
                h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
                h1->ts_last_pkt.ts_nsec        = last_pkt->tp_nsec;
        } else {
                /* Ok, we tmo'd - so get the current time.
                 *
                 * It shouldn't really happen as we don't close empty
                 * blocks. See prb_retire_rx_blk_timer_expired().
                 */
                struct timespec64 ts;
                ktime_get_real_ts64(&ts);
                h1->ts_last_pkt.ts_sec = ts.tv_sec;
                h1->ts_last_pkt.ts_nsec        = ts.tv_nsec;
        }

        smp_wmb();

        /* Flush the block */
        prb_flush_block(pkc1, pbd1, status);

        sk->sk_data_ready(sk);

        pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
}

/*
 * Thaw the queue: clear reset_pending_on_curr_blk, the flag that
 * prb_freeze_queue() sets and prb_queue_frozen() reports.
 */
static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
{
        pkc->reset_pending_on_curr_blk = 0;
}

/*
 * Open block @pbd1 for filling: assign it the next sequence number, reset
 * its packet count and length, stamp ts_first_pkt with the current time and
 * point the core's cursor (nxt_offset/prev) just past the block header plus
 * private area.
 *
 * Side effect of opening a block:
 *
 * 1) prb_queue is thawed.
 * 2) retire_blk_timer is refreshed.
 *
 */
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
        struct tpacket_block_desc *pbd1)
{
        struct timespec64 ts;
        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;

        smp_rmb();

        /* We could have just memset this but we will lose the
         * flexibility of making the priv area sticky
         */

        BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
        BLOCK_NUM_PKTS(pbd1) = 0;
        BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

        ktime_get_real_ts64(&ts);

        h1->ts_first_pkt.ts_sec = ts.tv_sec;
        h1->ts_first_pkt.ts_nsec = ts.tv_nsec;

        /* First packet slot starts right after header + private area */
        pkc1->pkblk_start = (char *)pbd1;
        pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);

        BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
        BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;

        pbd1->version = pkc1->version;
        pkc1->prev = pkc1->nxt_offset;
        pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;

        prb_thaw_queue(pkc1);
        _prb_refresh_rx_retire_blk_timer(pkc1);

        smp_wmb();
}

/*
 * Freeze the queue: set reset_pending_on_curr_blk and bump the socket's
 * tp_freeze_q_cnt statistic.
 *
 * Queue freeze logic:
 * 1) Assume tp_block_nr = 8 blocks.
 * 2) At time 't0', user opens Rx ring.
 * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
 * 4) user-space is either sleeping or processing block '0'.
 * 5) tpacket_rcv is currently filling block '7'; since there is no space
 *    left, it will close block-7, loop around and try to fill block '0'.
 *    call-flow:
 *    __packet_lookup_frame_in_block
 *      prb_retire_current_block()
 *      prb_dispatch_next_block()
 *        |->(BLOCK_STATUS == USER) evaluates to true
 *    5.1) Since block-0 is currently in-use, we just freeze the queue.
 * 6) Now there are two cases:
 *    6.1) Link goes idle right after the queue is frozen.
 *         But remember, the last open_block() refreshed the timer.
 *         When this timer expires, it will refresh itself so that we can
 *         re-open block-0 in near future.
 *    6.2) Link is busy and keeps on receiving packets. This is a simple
 *         case and __packet_lookup_frame_in_block will check if block-0
 *         is free and can now be re-used.
 */
static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
                                  struct packet_sock *po)
{
        pkc->reset_pending_on_curr_blk = 1;
        po->stats.stats3.tp_freeze_q_cnt++;
}

/* Round a packet length up to the V3_ALIGNMENT boundary. */
#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))

/*
 * Try to start filling the block that is now current.
 *
 * If that block is free it is opened and the offset where the first packet
 * must be stored is returned.  If user space still owns it
 * (TP_STATUS_USER set) the queue is frozen and NULL is returned, so the
 * caller must check the return value.
 */
static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
                struct packet_sock *po)
{
        struct tpacket_block_desc *blk;

        smp_rmb();

        blk = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        if (!(TP_STATUS_USER & BLOCK_STATUS(blk))) {
                /* Block is free: open it and hand out its first slot */
                prb_open_block(pkc, blk);
                return (void *)pkc->nxt_offset;
        }

        /* Block is still in use by user space */
        prb_freeze_queue(pkc, po);
        return NULL;
}

/*
 * Retire (close) the current block with @status, provided its status word
 * is still TP_STATUS_KERNEL; otherwise do nothing.
 */
static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
                struct packet_sock *po, unsigned int status)
{
        struct tpacket_block_desc *blk = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);

        if (!likely(TP_STATUS_KERNEL == BLOCK_STATUS(blk)))
                return;

        /*
         * Plug the case where copy_bits() is in progress on
         * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
         * have space to copy the pkt in the current block and
         * called prb_retire_current_block()
         *
         * We don't need to worry about the TMO case because
         * the timer-handler already handled this case.
         */
        if (!(status & TP_STATUS_BLK_TMO)) {
                /* Waiting for skb_copy_bits to finish... */
                write_lock(&pkc->blk_fill_in_prog_lock);
                write_unlock(&pkc->blk_fill_in_prog_lock);
        }

        prb_close_block(pkc, blk, po, status);
}

/* Non-zero when the TP_STATUS_USER bit is set in the block's status word. */
static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
{
        return TP_STATUS_USER & BLOCK_STATUS(pbd);
}

/*
 * Report whether the queue is frozen, i.e. return the value of
 * reset_pending_on_curr_blk (set by prb_freeze_queue(), cleared by
 * prb_thaw_queue()).
 */
static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
{
        return pkc->reset_pending_on_curr_blk;
}

/*
 * Release the read side of blk_fill_in_prog_lock for ring @rb.  The
 * matching read_lock() is taken in prb_fill_curr_block().
 */
static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
        __releases(&pkc->blk_fill_in_prog_lock)
{
        struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);

        read_unlock(&pkc->blk_fill_in_prog_lock);
}

/* Store the hash of the skb currently attached to @pkc in the packet header. */
static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
}

/* Zero the rxhash field of the packet header; @pkc is not used. */
static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        ppd->hv1.tp_rxhash = 0;
}

/*
 * Fill the VLAN fields and the initial tp_status of packet header @ppd from
 * the skb currently attached to @pkc.
 *
 * Three cases, in order:
 *  - the skb carries a VLAN tag: TCI and TPID come from the tag;
 *  - SOCK_DGRAM socket and skb->protocol is a VLAN ethertype: TCI is read
 *    with vlan_get_tci() and TPID is skb->protocol;
 *  - otherwise: both fields are zeroed and status is TP_STATUS_AVAILABLE.
 */
static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc);

        if (skb_vlan_tag_present(pkc->skb)) {
                ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
                ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
                ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                return;
        }

        if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) {
                ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev);
                ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol);
                ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                return;
        }

        ppd->hv1.tp_vlan_tci = 0;
        ppd->hv1.tp_vlan_tpid = 0;
        ppd->tp_status = TP_STATUS_AVAILABLE;
}

/*
 * Fill the per-packet fields of header @ppd: clear the padding, fill the
 * VLAN info, then either store or zero the rxhash depending on whether
 * TP_FT_REQ_FILL_RXHASH is set in the ring's feature request word.
 */
static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
                        struct tpacket3_hdr *ppd)
{
        ppd->hv1.tp_padding = 0;
        prb_fill_vlan_info(pkc, ppd);

        if (!(pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)) {
                prb_clear_rxhash(pkc, ppd);
                return;
        }

        prb_fill_rxhash(pkc, ppd);
}

/*
 * Reserve room for a packet of @len bytes at @curr inside block @pbd.
 *
 * The aligned length is recorded as the packet's tp_next_offset and added
 * to the core's write cursor and to the block length; the block's packet
 * count is incremented.  The read side of blk_fill_in_prog_lock is then
 * taken and left held on return (see prb_clear_blk_fill_status()), and the
 * per-packet header fields are filled in.
 */
static void prb_fill_curr_block(char *curr,
                                struct tpacket_kbdq_core *pkc,
                                struct tpacket_block_desc *pbd,
                                unsigned int len)
        __acquires(&pkc->blk_fill_in_prog_lock)
{
        struct tpacket3_hdr *ppd = (struct tpacket3_hdr *)curr;
        unsigned int aligned_len = TOTAL_PKT_LEN_INCL_ALIGN(len);

        ppd->tp_next_offset = aligned_len;
        pkc->prev = curr;
        pkc->nxt_offset += aligned_len;
        BLOCK_LEN(pbd) += aligned_len;
        BLOCK_NUM_PKTS(pbd) += 1;
        read_lock(&pkc->blk_fill_in_prog_lock);
        prb_run_all_ft_ops(pkc, ppd);
}

/*
 * Find room for a packet of @len bytes in the TPACKET_V3 RX ring and return
 * the address where it must be stored, or NULL if the packet has to be
 * dropped.
 *
 * The current block is tried first.  If the packet does not fit, the
 * current block is retired and the next block is dispatched.  When the
 * queue is frozen and user space still owns the current block, or when no
 * free block is available, NULL is returned.
 *
 * Assumes caller has the sk->rx_queue.lock
 */
static void *__packet_lookup_frame_in_block(struct packet_sock *po,
                                            struct sk_buff *skb,
                                            unsigned int len
                                            )
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
        struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
        char *slot, *blk_end;

        /* Queue is frozen when user space is lagging behind */
        if (prb_queue_frozen(pkc)) {
                /*
                 * If the block that caused the freeze is still in use by
                 * user space, this packet can't be recorded.
                 */
                if (prb_curr_blk_in_use(pbd))
                        return NULL;

                /*
                 * The block was released by user space: open it.
                 * Opening a block also thaws the queue as a side effect.
                 */
                prb_open_block(pkc, pbd);
        }

        smp_mb();
        slot = pkc->nxt_offset;
        pkc->skb = skb;
        blk_end = (char *)pbd + pkc->kblk_size;

        /* first try the current block */
        if (slot + TOTAL_PKT_LEN_INCL_ALIGN(len) < blk_end) {
                prb_fill_curr_block(slot, pkc, pbd, len);
                return (void *)slot;
        }

        /* Ok, close the current block */
        prb_retire_current_block(pkc, po, 0);

        /* Now, try to dispatch the next block */
        slot = (char *)prb_dispatch_next_block(pkc, po);
        if (!slot) {
                /*
                 * No free blocks are available; user space hasn't caught
                 * up yet.  Queue was just frozen and now this packet will
                 * get dropped.
                 */
                return NULL;
        }

        pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
        prb_fill_curr_block(slot, pkc, pbd, len);
        return (void *)slot;
}

/*
 * Return the location where the next received packet should be stored.
 *
 * For TPACKET_V1/V2 this is the frame at the RX ring head if its status
 * equals @status (NULL otherwise).  For TPACKET_V3 the slot is looked up in
 * the current block for a packet of @len bytes.  Any other version hits
 * WARN() and BUG().
 */
static void *packet_current_rx_frame(struct packet_sock *po,
                                            struct sk_buff *skb,
                                            int status, unsigned int len)
{
        switch (po->tp_version) {
        case TPACKET_V3:
                return __packet_lookup_frame_in_block(po, skb, len);
        case TPACKET_V1:
        case TPACKET_V2:
                return packet_lookup_frame(po, &po->rx_ring,
                                           po->rx_ring.head, status);
        default:
                WARN(1, "TPACKET version not supported\n");
                BUG();
                return NULL;
        }
}

/*
 * Return the descriptor of block @idx in ring @rb if its status word equals
 * @status, or NULL otherwise.  @po is not used.
 */
static void *prb_lookup_block(const struct packet_sock *po,
                              const struct packet_ring_buffer *rb,
                              unsigned int idx,
                              int status)
{
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
        struct tpacket_block_desc *blk = GET_PBLOCK_DESC(pkc, idx);

        if (status == BLOCK_STATUS(blk))
                return blk;

        return NULL;
}

/*
 * Return the number of the block preceding the active one, wrapping from
 * block 0 back to the last block (knum_blocks - 1).
 */
static int prb_previous_blk_num(struct packet_ring_buffer *rb)
{
        unsigned int prev = rb->prb_bdqc.knum_blocks - 1;

        if (rb->prb_bdqc.kactive_blk_num)
                prev = rb->prb_bdqc.kactive_blk_num - 1;

        return prev;
}

/*
 * Return the block preceding the active one if its status equals @status,
 * or NULL otherwise.
 *
 * Assumes caller has held the rx_queue.lock
 */
static void *__prb_previous_block(struct packet_sock *po,
                                         struct packet_ring_buffer *rb,
                                         int status)
{
        return prb_lookup_block(po, rb, prb_previous_blk_num(rb), status);
}

/*
 * Return the previous RX ring entry if its status equals @status, or NULL:
 * the previous frame for versions up to TPACKET_V2, the previous block for
 * later versions.
 */
static void *packet_previous_rx_frame(struct packet_sock *po,
                                             struct packet_ring_buffer *rb,
                                             int status)
{
        if (po->tp_version > TPACKET_V2)
                return __prb_previous_block(po, rb, status);

        return packet_previous_frame(po, rb, status);
}

/*
 * Advance the head of RX ring @rb by one frame.  Only TPACKET_V1 and
 * TPACKET_V2 are handled; TPACKET_V3 and unknown versions hit WARN() and
 * BUG().
 */
static void packet_increment_rx_head(struct packet_sock *po,
                                            struct packet_ring_buffer *rb)
{
        switch (po->tp_version) {
        case TPACKET_V1:
        case TPACKET_V2:
                packet_increment_head(rb);
                break;
        case TPACKET_V3:
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
                break;
        }
}

/*
 * Return the frame just before the ring head (wrapping from 0 to
 * rb->frame_max) if its status equals @status, or NULL otherwise.
 */
static void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous;

        if (rb->head)
                previous = rb->head - 1;
        else
                previous = rb->frame_max;

        return packet_lookup_frame(po, rb, previous, status);
}

/* Advance the ring head by one frame, wrapping to 0 after frame_max. */
static void packet_increment_head(struct packet_ring_buffer *buff)
{
        if (buff->head == buff->frame_max)
                buff->head = 0;
        else
                buff->head = buff->head + 1;
}

/* Increment this CPU's slot of the ring's per-CPU pending counter. */
static void packet_inc_pending(struct packet_ring_buffer *rb)
{
        this_cpu_inc(*rb->pending_refcnt);
}

/* Decrement this CPU's slot of the ring's per-CPU pending counter. */
static void packet_dec_pending(struct packet_ring_buffer *rb)
{
        this_cpu_dec(*rb->pending_refcnt);
}

static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
{
        unsigned int refcnt = 0;
        int cpu;

        /* We don't use pending refcount in rx_ring. */
        if (rb->pending_refcnt == NULL)
                return 0;

        for_each_possible_cpu(cpu)
                refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);

        return refcnt;
}

/*
 * Allocate the per-CPU pending counter for the TX ring; the RX ring's
 * counter pointer is set to NULL.
 *
 * Returns 0 on success or -ENOBUFS if the allocation fails.
 */
static int packet_alloc_pending(struct packet_sock *po)
{
        po->rx_ring.pending_refcnt = NULL;
        po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);

        if (unlikely(!po->tx_ring.pending_refcnt))
                return -ENOBUFS;

        return 0;
}

/* Free the TX ring's per-CPU pending counter (see packet_alloc_pending()). */
static void packet_free_pending(struct packet_sock *po)
{
        free_percpu(po->tx_ring.pending_refcnt);
}

/*
 * Receive-room classification returned by __packet_rcv_has_room().
 *
 * ROOM_POW_OFF is the right-shift applied to the ring length (or to
 * sk_rcvbuf) to obtain the look-ahead distance / threshold that separates
 * ROOM_NORMAL from ROOM_LOW.
 */
#define ROOM_POW_OFF        2
#define ROOM_NONE        0x0
#define ROOM_LOW        0x1
#define ROOM_NORMAL        0x2

/*
 * Check whether the RX ring frame located (ring length >> @pow_off) frames
 * ahead of the head, with wrap-around, has status TP_STATUS_KERNEL.  With
 * @pow_off == 0 the head frame itself is checked.
 */
static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
{
        int nframes = READ_ONCE(po->rx_ring.frame_max) + 1;
        int pos = READ_ONCE(po->rx_ring.head);

        if (pow_off)
                pos += nframes >> pow_off;
        if (pos >= nframes)
                pos -= nframes;

        return packet_lookup_frame(po, &po->rx_ring, pos, TP_STATUS_KERNEL);
}

/*
 * TPACKET_V3 counterpart of __tpacket_has_room(): check whether the block
 * located (block count >> @pow_off) blocks ahead of the active block, with
 * wrap-around, has status TP_STATUS_KERNEL.  With @pow_off == 0 the active
 * block itself is checked.
 */
static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
{
        int nblocks = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
        int pos = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);

        if (pow_off)
                pos += nblocks >> pow_off;
        if (pos >= nblocks)
                pos -= nblocks;

        return prb_lookup_block(po, &po->rx_ring, pos, TP_STATUS_KERNEL);
}

/*
 * Classify how much receive room the socket has: ROOM_NORMAL, ROOM_LOW or
 * ROOM_NONE.
 *
 * When the socket's hook is not tpacket_rcv, the remaining receive buffer
 * (sk_rcvbuf minus sk_rmem_alloc minus the truesize of @skb, if any) is
 * compared against sk_rcvbuf >> ROOM_POW_OFF.  Otherwise the ring is
 * probed: first ahead of the head/active block, then at it.
 *
 * @skb may be NULL.
 */
static int __packet_rcv_has_room(const struct packet_sock *po,
                                 const struct sk_buff *skb)
{
        const struct sock *sk = &po->sk;

        if (po->prot_hook.func != tpacket_rcv) {
                int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
                int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
                                   - (skb ? skb->truesize : 0);

                if (avail > (rcvbuf >> ROOM_POW_OFF))
                        return ROOM_NORMAL;
                if (avail > 0)
                        return ROOM_LOW;
                return ROOM_NONE;
        }

        if (po->tp_version == TPACKET_V3) {
                if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
                        return ROOM_NORMAL;
                if (__tpacket_v3_has_room(po, 0))
                        return ROOM_LOW;
                return ROOM_NONE;
        }

        if (__tpacket_has_room(po, ROOM_POW_OFF))
                return ROOM_NORMAL;
        if (__tpacket_has_room(po, 0))
                return ROOM_LOW;
        return ROOM_NONE;
}

/*
 * Classify the socket's receive room (see __packet_rcv_has_room()) and keep
 * the PACKET_SOCK_PRESSURE flag in sync: the flag is set whenever the room
 * is anything other than ROOM_NORMAL, and is only written when it changes.
 *
 * Returns the ROOM_* classification.
 */
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
        int room = __packet_rcv_has_room(po, skb);
        bool pressure = room != ROOM_NORMAL;

        if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
                packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);

        return room;
}

/*
 * Clear PACKET_SOCK_PRESSURE if it is currently set and the socket has
 * ROOM_NORMAL receive room again.
 */
static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
        if (!packet_sock_flag(po, PACKET_SOCK_PRESSURE))
                return;

        if (__packet_rcv_has_room(po, NULL) != ROOM_NORMAL)
                return;

        packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
}

/*
 * Socket destructor: purge the error queue, warn if receive or send memory
 * is still accounted to the socket, and log an error if the socket is not
 * marked SOCK_DEAD.
 */
static void packet_sock_destruct(struct sock *sk)
{
        skb_queue_purge(&sk->sk_error_queue);

        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(refcount_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD))
                pr_err("Attempt to release alive packet socket: %p\n", sk);
}

/*
 * Decide whether the flow of @skb dominates the socket's rollover history.
 *
 * Counts how many of the ROLLOVER_HLEN history slots hold the skb's hash,
 * then records the hash in one randomly chosen slot.  Returns true when the
 * count (taken before the update) exceeds half of ROLLOVER_HLEN.
 */
static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
{
        u32 *history = po->rollover->history;
        u32 rxhash = skb_get_hash(skb);
        u32 victim;
        int i, matches = 0;

        for (i = 0; i < ROLLOVER_HLEN; i++) {
                if (READ_ONCE(history[i]) == rxhash)
                        matches++;
        }

        victim = get_random_u32_below(ROLLOVER_HLEN);

        /* Avoid dirtying the cache line if possible */
        if (READ_ONCE(history[victim]) != rxhash)
                WRITE_ONCE(history[victim], rxhash);

        return matches > (ROLLOVER_HLEN >> 1);
}

/*
 * Fanout demux by flow hash: scale the skb's symmetric hash into the range
 * [0, @num) with reciprocal_scale().  @f is not used.
 */
static unsigned int fanout_demux_hash(struct packet_fanout *f,
                                      struct sk_buff *skb,
                                      unsigned int num)
{
        return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
}

/*
 * Fanout demux by round robin: atomically bump the group's rr_cur counter
 * and return the new value modulo @num.  @skb is not used.
 */
static unsigned int fanout_demux_lb(struct packet_fanout *f,
                                    struct sk_buff *skb,
                                    unsigned int num)
{
        unsigned int ticket;

        ticket = atomic_inc_return(&f->rr_cur);
        return ticket % num;
}

/*
 * Fanout demux by CPU: return the current processor id modulo @num.
 * @f and @skb are not used.
 */
static unsigned int fanout_demux_cpu(struct packet_fanout *f,
                                     struct sk_buff *skb,
                                     unsigned int num)
{
        return smp_processor_id() % num;
}

/*
 * Fanout demux by random choice: return get_random_u32_below(@num).
 * @f and @skb are not used.
 */
static unsigned int fanout_demux_rnd(struct packet_fanout *f,
                                     struct sk_buff *skb,
                                     unsigned int num)
{
        return get_random_u32_below(num);
}

/* Rollover selection.
 *
 * With @try_self set, the socket at @idx is kept when it reports
 * ROOM_NORMAL, or ROOM_LOW for a flow that is not "huge"; otherwise it is
 * excluded from the search below. The search walks the member array
 * circularly, starting at the position remembered in that socket's
 * rollover state (clamped to @num - 1), and returns the first other member
 * that is not flagged as under pressure and reports ROOM_NORMAL.
 *
 * The rollover counters of the socket at @idx are updated on every
 * outcome of the search. Returns the chosen index, or @idx when no
 * alternative was found.
 */
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
                                          struct sk_buff *skb,
                                          unsigned int idx, bool try_self,
                                          unsigned int num)
{
        struct packet_sock *po = pkt_sk(rcu_dereference(f->arr[idx]));
        struct packet_sock *excluded = NULL;
        struct packet_sock *cand;
        unsigned int room = ROOM_NONE;
        unsigned int first, cur;

        if (try_self) {
                room = packet_rcv_has_room(po, skb);
                if (room == ROOM_NORMAL)
                        return idx;
                if (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))
                        return idx;
                excluded = po;
        }

        first = min_t(int, po->rollover->sock, num - 1);
        cur = first;
        do {
                cand = pkt_sk(rcu_dereference(f->arr[cur]));
                if (cand != excluded &&
                    !packet_sock_flag(cand, PACKET_SOCK_PRESSURE) &&
                    packet_rcv_has_room(cand, skb) == ROOM_NORMAL) {
                        /* Remember the hit so the next search starts here. */
                        if (cur != first)
                                po->rollover->sock = cur;
                        atomic_long_inc(&po->rollover->num);
                        if (room == ROOM_LOW)
                                atomic_long_inc(&po->rollover->num_huge);
                        return cur;
                }

                cur++;
                if (cur == num)
                        cur = 0;
        } while (cur != first);

        atomic_long_inc(&po->rollover->num_failed);
        return idx;
}

static unsigned int fanout_demux_qm(struct packet_fanout *f,
                                    struct sk_buff *skb,
                                    unsigned int num)
{
        return skb_get_queue_mapping(skb) % num;
}

static unsigned int fanout_demux_bpf(struct packet_fanout *f,
                                     struct sk_buff *skb,
                                     unsigned int num)
{
        struct bpf_prog *prog;
        unsigned int ret = 0;

        rcu_read_lock();
        prog = rcu_dereference(f->bpf_prog);
        if (prog)
                ret = bpf_prog_run_clear_cb(prog, skb) % num;
        rcu_read_unlock();

        return ret;
}

/* Test whether @flag is set on group @f. @flag is shifted right by eight
 * bits before being compared against the stored f->flags.
 */
static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
{
        u16 stored_bit = flag >> 8;

        return (f->flags & stored_bit) != 0;
}

static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
                             struct packet_type *pt, struct net_device *orig_dev)
{
        struct packet_fanout *f = pt->af_packet_priv;
        unsigned int num = READ_ONCE(f->num_members);
        struct net *net = read_pnet(&f->net);
        struct packet_sock *po;
        unsigned int idx;

        if (!net_eq(dev_net(dev), net) || !num) {
                kfree_skb(skb);
                return 0;
        }

        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
                skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
                if (!skb)
                        return 0;
        }
        switch (f->type) {
        case PACKET_FANOUT_HASH:
        default:
                idx = fanout_demux_hash(f, skb, num);
                break;
        case PACKET_FANOUT_LB:
                idx = fanout_demux_lb(f, skb, num);
                break;
        case PACKET_FANOUT_CPU:
                idx = fanout_demux_cpu(f, skb, num);
                break;
        case PACKET_FANOUT_RND:
                idx = fanout_demux_rnd(f, skb, num);
                break;
        case PACKET_FANOUT_QM:
                idx = fanout_demux_qm(f, skb, num);
                break;
        case PACKET_FANOUT_ROLLOVER:
                idx = fanout_demux_rollover(f, skb, 0, false, num);
                break;
        case PACKET_FANOUT_CBPF:
        case PACKET_FANOUT_EBPF:
                idx = fanout_demux_bpf(f, skb, num);
                break;
        }

        if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
                idx = fanout_demux_rollover(f, skb, idx, true, num);

        po = pkt_sk(rcu_dereference(f->arr[idx]));
        return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}

/* Held by fanout_add() and fanout_release() around their accesses to
 * fanout_list, fanout_next_id and po->fanout. Exported GPL-only.
 */
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
/* Every fanout group created by fanout_add() is linked here until it is
 * removed again.
 */
static LIST_HEAD(fanout_list);
/* First candidate id tried by fanout_find_new_id(). */
static u16 fanout_next_id;

/* Append @sk to the member array of the fanout group @po is attached to.
 *
 * Done under f->lock. The array slot is written first, then smp_wmb(),
 * then num_members is incremented, so the entry is stored before the
 * count that covers it. When the group gains its first member, the
 * group's packet_type hook is registered with dev_add_pack().
 */
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
        struct packet_fanout *f = po->fanout;

        spin_lock(&f->lock);
        rcu_assign_pointer(f->arr[f->num_members], sk);
        /* Order the slot store ahead of the num_members update below. */
        smp_wmb();
        f->num_members++;
        if (f->num_members == 1)
                dev_add_pack(&f->prot_hook);
        spin_unlock(&f->lock);
}

/* Remove @sk from the member array of the fanout group @po is attached to.
 *
 * Done under f->lock. The vacated slot is filled with the last array
 * entry and the member count is decremented; member order is therefore
 * not preserved. It is a BUG if @sk is not found. When the last member
 * leaves, the group's packet_type hook is removed.
 */
static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
        struct packet_fanout *f = po->fanout;
        struct sock *last;
        int pos;

        spin_lock(&f->lock);

        for (pos = 0; pos < f->num_members; pos++) {
                struct sock *member;

                member = rcu_dereference_protected(f->arr[pos],
                                                   lockdep_is_held(&f->lock));
                if (member == sk)
                        break;
        }
        BUG_ON(pos >= f->num_members);

        last = rcu_dereference_protected(f->arr[f->num_members - 1],
                                         lockdep_is_held(&f->lock));
        rcu_assign_pointer(f->arr[pos], last);
        f->num_members--;

        if (f->num_members == 0)
                __dev_remove_pack(&f->prot_hook);

        spin_unlock(&f->lock);
}

/* id_match callback: true when @sk is a PF_PACKET socket whose fanout
 * group is the one @ptype belongs to.
 */
static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
{
        return sk->sk_family == PF_PACKET &&
               ptype->af_packet_priv == pkt_sk(sk)->fanout;
}

/* Initialise the per-type state of a fanout group: the round-robin
 * counter for LB, the program pointer for CBPF/EBPF. Other types have
 * nothing to set up here.
 */
static void fanout_init_data(struct packet_fanout *f)
{
        const unsigned int kind = f->type;

        if (kind == PACKET_FANOUT_LB)
                atomic_set(&f->rr_cur, 0);
        else if (kind == PACKET_FANOUT_CBPF || kind == PACKET_FANOUT_EBPF)
                RCU_INIT_POINTER(f->bpf_prog, NULL);
}

/* Install @new (which may be NULL) as the group's BPF program.
 *
 * The pointer is swapped under f->lock. A previously installed program is
 * destroyed only after synchronize_net() has returned.
 */
static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
{
        struct bpf_prog *prev;

        spin_lock(&f->lock);
        prev = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
        rcu_assign_pointer(f->bpf_prog, new);
        spin_unlock(&f->lock);

        if (!prev)
                return;

        synchronize_net();
        bpf_prog_destroy(prev);
}

/* Build a BPF program from the user-supplied sock_fprog in @data and
 * install it on @po's fanout group.
 *
 * Returns 0 on success, -EPERM when the socket has SOCK_FILTER_LOCKED
 * set, or the error reported while copying or creating the program.
 */
static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
                                unsigned int len)
{
        struct sock_fprog fprog;
        struct bpf_prog *prog;
        int err;

        if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        err = copy_bpf_fprog_from_user(&fprog, data, len);
        if (!err)
                err = bpf_prog_create_from_user(&prog, &fprog, NULL, false);
        if (err)
                return err;

        __fanout_set_data_bpf(po->fanout, prog);
        return 0;
}

/* Look up the socket-filter BPF program behind the file descriptor passed
 * in @data and install it on @po's fanout group.
 *
 * Returns 0 on success, -EPERM when the socket has SOCK_FILTER_LOCKED
 * set, -EINVAL when @len is not the size of a u32, -EFAULT when the
 * descriptor cannot be copied in, or the error from the program lookup.
 */
static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
                                unsigned int len)
{
        struct bpf_prog *prog;
        u32 fd;

        if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
                return -EPERM;

        if (len != sizeof(fd))
                return -EINVAL;

        /* len equals sizeof(fd) from here on. */
        if (copy_from_sockptr(&fd, data, sizeof(fd)))
                return -EFAULT;

        prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
        if (IS_ERR(prog))
                return PTR_ERR(prog);

        __fanout_set_data_bpf(po->fanout, prog);
        return 0;
}

/* Dispatch fanout data to the handler matching the group's type.
 * Only the CBPF and EBPF types accept data; every other type yields
 * -EINVAL.
 */
static int fanout_set_data(struct packet_sock *po, sockptr_t data,
                           unsigned int len)
{
        const unsigned int kind = po->fanout->type;

        if (kind == PACKET_FANOUT_CBPF)
                return fanout_set_data_cbpf(po, data, len);

        if (kind == PACKET_FANOUT_EBPF)
                return fanout_set_data_ebpf(po, data, len);

        return -EINVAL;
}

/* Drop the per-type data of a fanout group. Only the CBPF and EBPF types
 * hold anything: their program is replaced with NULL.
 */
static void fanout_release_data(struct packet_fanout *f)
{
        const unsigned int kind = f->type;

        if (kind == PACKET_FANOUT_CBPF || kind == PACKET_FANOUT_EBPF)
                __fanout_set_data_bpf(f, NULL);
}

/* Return true when no group on fanout_list uses @candidate_id within the
 * network namespace of @sk.
 */
static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
{
        struct packet_fanout *f;

        list_for_each_entry(f, &fanout_list, list) {
                if (f->id != candidate_id)
                        continue;
                if (read_pnet(&f->net) == sock_net(sk))
                        return false;
        }

        return true;
}

/* Search for an unused fanout id in @sk's namespace, starting at
 * fanout_next_id and wrapping through the whole u16 range once.
 *
 * On success the id is stored in *@new_id, fanout_next_id is moved just
 * past it and true is returned. False means every id is taken.
 */
static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
{
        const u16 start = fanout_next_id;
        u16 id = start;

        do {
                if (__fanout_id_is_free(sk, id)) {
                        fanout_next_id = id + 1;
                        *new_id = id;
                        return true;
                }
                id++;
        } while (id != start);

        return false;
}

/* Attach @sk to the fanout group described by @args, creating the group
 * when no group with that id exists in the socket's network namespace.
 *
 * The low byte of args->type_flags is the fanout type, the high byte the
 * flags. Everything after the initial type check runs under fanout_mutex.
 *
 * Returns 0 on success, otherwise:
 *   -EINVAL   unknown type; ROLLOVER type combined with the ROLLOVER flag;
 *             UNIQUEID requested with a non-zero id; flags or
 *             max_num_members disagree with the existing group;
 *             max_num_members above PACKET_FANOUT_MAX; or the socket does
 *             not match the group's type/protocol/device (or po->num is 0)
 *   -EALREADY the socket already belongs to a group
 *   -ENOMEM   allocation failure, or no unique id available
 *   -ENOSPC   the group already has max_num_members sockets
 */
static int fanout_add(struct sock *sk, struct fanout_args *args)
{
        struct packet_rollover *rollover = NULL;
        struct packet_sock *po = pkt_sk(sk);
        u16 type_flags = args->type_flags;
        struct packet_fanout *f, *match;
        u8 type = type_flags & 0xff;
        u8 flags = type_flags >> 8;
        u16 id = args->id;
        int err;

        /* Validate the requested type before taking any lock. */
        switch (type) {
        case PACKET_FANOUT_ROLLOVER:
                if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
                        return -EINVAL;
                break;
        case PACKET_FANOUT_HASH:
        case PACKET_FANOUT_LB:
        case PACKET_FANOUT_CPU:
        case PACKET_FANOUT_RND:
        case PACKET_FANOUT_QM:
        case PACKET_FANOUT_CBPF:
        case PACKET_FANOUT_EBPF:
                break;
        default:
                return -EINVAL;
        }

        mutex_lock(&fanout_mutex);

        err = -EALREADY;
        if (po->fanout)
                goto out;

        /* Rollover state is needed both for the ROLLOVER type and for any
         * type carrying the ROLLOVER flag. It is allocated up front and
         * freed at "out" unless ownership moves to the socket below.
         */
        if (type == PACKET_FANOUT_ROLLOVER ||
            (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
                err = -ENOMEM;
                rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
                if (!rollover)
                        goto out;
                atomic_long_set(&rollover->num, 0);
                atomic_long_set(&rollover->num_huge, 0);
                atomic_long_set(&rollover->num_failed, 0);
        }

        if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
                if (id != 0) {
                        err = -EINVAL;
                        goto out;
                }
                if (!fanout_find_new_id(sk, &id)) {
                        err = -ENOMEM;
                        goto out;
                }
                /* ephemeral flag for the first socket in the group: drop it */
                flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
        }

        /* Look for an existing group with this id in our namespace. */
        match = NULL;
        list_for_each_entry(f, &fanout_list, list) {
                if (f->id == id &&
                    read_pnet(&f->net) == sock_net(sk)) {
                        match = f;
                        break;
                }
        }
        err = -EINVAL;
        if (match) {
                /* Joining: flags must agree, and so must max_num_members
                 * when the caller specified one.
                 */
                if (match->flags != flags)
                        goto out;
                if (args->max_num_members &&
                    args->max_num_members != match->max_num_members)
                        goto out;
        } else {
                /* Creating a new group. */
                if (args->max_num_members > PACKET_FANOUT_MAX)
                        goto out;
                if (!args->max_num_members)
                        /* legacy PACKET_FANOUT_MAX */
                        args->max_num_members = 256;
                err = -ENOMEM;
                match = kvzalloc(struct_size(match, arr, args->max_num_members),
                                 GFP_KERNEL);
                if (!match)
                        goto out;
                write_pnet(&match->net, sock_net(sk));
                match->id = id;
                match->type = type;
                match->flags = flags;
                INIT_LIST_HEAD(&match->list);
                spin_lock_init(&match->lock);
                refcount_set(&match->sk_ref, 0);
                fanout_init_data(match);
                /* The group hook inherits protocol and device from the
                 * creating socket and delivers through packet_rcv_fanout().
                 */
                match->prot_hook.type = po->prot_hook.type;
                match->prot_hook.dev = po->prot_hook.dev;
                match->prot_hook.func = packet_rcv_fanout;
                match->prot_hook.af_packet_priv = match;
                match->prot_hook.af_packet_net = read_pnet(&match->net);
                match->prot_hook.id_match = match_fanout_group;
                match->max_num_members = args->max_num_members;
                match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING;
                list_add(&match->list, &fanout_list);
        }
        err = -EINVAL;

        /* The socket may join only if it agrees with the group on type,
         * protocol and device, and po->num is non-zero.
         */
        spin_lock(&po->bind_lock);
        if (po->num &&
            match->type == type &&
            match->prot_hook.type == po->prot_hook.type &&
            match->prot_hook.dev == po->prot_hook.dev) {
                err = -ENOSPC;
                if (refcount_read(&match->sk_ref) < match->max_num_members) {
                        /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */
                        WRITE_ONCE(po->fanout, match);

                        /* The socket now owns the rollover state. */
                        po->rollover = rollover;
                        rollover = NULL;
                        refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
                        /* A running socket swaps its private hook for
                         * membership in the group's array.
                         */
                        if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
                                __dev_remove_pack(&po->prot_hook);
                                __fanout_link(sk, po);
                        }
                        err = 0;
                }
        }
        spin_unlock(&po->bind_lock);

        /* A group that ends up with no members after a failed join is
         * unlinked and freed.
         */
        if (err && !refcount_read(&match->sk_ref)) {
                list_del(&match->list);
                kvfree(match);
        }

out:
        kfree(rollover);
        mutex_unlock(&fanout_mutex);
        return err;
}

/* Detach @sk from its fanout group, if it has one.
 *
 * Under fanout_mutex, the socket's group pointer is cleared and the
 * group's sk_ref is dropped. When that drop reaches zero the group is
 * taken off fanout_list and returned; the caller must then call
 * fanout_release_data() and free the returned packet_fanout (after
 * synchronize_net()). In every other case NULL is returned.
 */
static struct packet_fanout *fanout_release(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_fanout *f;

        mutex_lock(&fanout_mutex);

        f = po->fanout;
        if (!f)
                goto unlock;

        po->fanout = NULL;
        if (!refcount_dec_and_test(&f->sk_ref))
                f = NULL;
        else
                list_del(&f->list);

unlock:
        mutex_unlock(&fanout_mutex);
        return f;
}

static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        /* Earlier code assumed this would be a VLAN pkt, double-check
         * this now that we have the actual packet in hand. We can only
         * do this check on Ethernet devices.
         */
        if (unlikely(dev->type != ARPHRD_ETHER))
                return false;

        skb_reset_mac_header(skb);
        return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
}

/* Forward declarations of the two proto_ops tables of this file.
 * NOTE(review): the initialised definitions are not visible in this
 * part of the file - presumably they follow further down; confirm.
 */
static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

/* Receive hook for SOCK_PACKET sockets.
 *
 * Fills a struct sockaddr_pkt in the skb control block (device type,
 * device name, protocol), restores the link-level header and queues the
 * skb on the socket found in pt->af_packet_priv.
 *
 * Always returns 0. Loopback packets and packets from a different
 * network namespace are freed, as is an skb the socket queue refuses.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *        When we registered the protocol we saved the socket in the data
         *        field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *        Yank back the headers [hope the device set this
         *        right or kerboom...]
         *
         *        Incoming packets have ll header pulled,
         *        push it back.
         *
         *        For outgoing ones skb->data == skb_mac_header(skb)
         *        so that this procedure is noop.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto out;

        /* Work on whatever skb skb_share_check() hands back. When it
         * returns NULL there is no skb left for this function to free,
         * hence the separate "oom" label past kfree_skb().
         */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset_ct(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        /* Move skb->data back to the MAC header. */
        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *        The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *        Charge the memory to the socket. This is done specifically
         *        to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}

static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
{
        int depth;

        if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
            sock->type == SOCK_RAW) {
                skb_reset_mac_header(skb);
                skb->protocol = dev_parse_header_protocol(skb);
        }

        /* Move network header to the right position for VLAN tagged packets */
        if (likely(skb->dev->type == ARPHRD_ETHER) &&
            eth_type_vlan(skb->protocol) &&
            vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0)
                skb_set_network_header(skb, depth);

        skb_probe_transport_header(skb);
}

/*
 *        Output a raw packet to a device layer. This bypasses all the other
 *        protocol layers and you must therefore supply it with a complete frame
 *
 *        The destination device is named in the struct sockaddr_pkt passed as
 *        msg->msg_name. The function makes two passes over the device lookup:
 *        the first validates the device and the length under the RCU read
 *        lock, then drops the lock to allocate the skb with GFP_KERNEL and
 *        copy the payload in; "goto retry" then repeats the lookup and the
 *        checks with the skb in hand before transmitting.
 *
 *        Returns @len on success, or a negative errno: -ENOTCONN (no address),
 *        -EINVAL (address too short, header validation failed or empty skb),
 *        -ENODEV, -ENETDOWN, -EPROTONOSUPPORT (SOCK_NOFCS on a device without
 *        support), -EMSGSIZE, -ENOBUFS, or whatever memcpy_from_msg() /
 *        sock_cmsg_send() report.
 */

static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
                               size_t len)
{
        struct sock *sk = sock->sk;
        DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        struct sockcm_cookie sockc;
        __be16 proto = 0;
        int err;
        int extra_len = 0;

        /*
         *        Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                /* The protocol is taken from the address only when a full
                 * sockaddr_pkt was supplied; otherwise it stays 0.
                 */
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;        /* SOCK_PACKET must be sent giving an address */

        /*
         *        Find the device first to size check it
         */

        /* Force NUL termination of the caller-supplied device name. */
        saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
retry:
        rcu_read_lock();
        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
                if (!netif_supports_nofcs(dev)) {
                        err = -EPROTONOSUPPORT;
                        goto out_unlock;
                }
                extra_len = 4; /* We're doing our own CRC */
        }

        /* Coarse limit that still leaves room for a VLAN header; the
         * stricter check further down runs once the frame can be inspected.
         */
        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
                goto out_unlock;

        /* First pass only: allocate and fill the skb outside the RCU
         * section, then redo the device lookup.
         */
        if (!skb) {
                size_t reserved = LL_RESERVED_SPACE(dev);
                int tlen = dev->needed_tailroom;
                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

                rcu_read_unlock();
                skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
                if (skb == NULL)
                        return -ENOBUFS;
                /* FIXME: Save some space for broken drivers that write a hard
                 * header at transmission time by themselves. PPP is the notable
                 * one here. This should really be fixed at the driver level.
                 */
                skb_reserve(skb, reserved);
                skb_reset_network_header(skb);

                /* Try to align data part correctly */
                if (hhlen) {
                        skb->data -= hhlen;
                        skb->tail -= hhlen;
                        if (len < hhlen)
                                skb_reset_network_header(skb);
                }
                err = memcpy_from_msg(skb_put(skb, len), msg, len);
                if (err)
                        goto out_free;
                goto retry;
        }

        /* Second pass: the skb exists and the RCU read lock is held. */
        if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
                err = -EINVAL;
                goto out_unlock;
        }
        if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
            !packet_extra_vlan_len_allowed(dev, skb)) {
                err = -EMSGSIZE;
                goto out_unlock;
        }

        sockcm_init(&sockc, sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err))
                        goto out_unlock;
        }

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sockc.priority;
        skb->mark = sockc.mark;
        skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
        skb_setup_tx_timestamp(skb, &sockc);

        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;

        packet_parse_headers(skb, sock);

        dev_queue_xmit(skb);
        rcu_read_unlock();
        return len;

out_unlock:
        rcu_read_unlock();
out_free:
        kfree_skb(skb);
        return err;
}

/* Run the filter attached to @sk, if any, over @skb.
 *
 * Returns the filter program's result, or @res unchanged when the socket
 * has no filter. The filter pointer is read inside an RCU read-side
 * section.
 */
static unsigned int run_filter(struct sk_buff *skb,
                               const struct sock *sk,
                               unsigned int res)
{
        unsigned int verdict = res;
        struct sk_filter *attached;

        rcu_read_lock();
        attached = rcu_dereference(sk->sk_filter);
        if (attached)
                verdict = bpf_prog_run_clear_cb(attached->prog, skb);
        rcu_read_unlock();

        return verdict;
}

/* Build a virtio-net header for @skb and copy its first @vnet_hdr_sz
 * bytes into @msg.
 *
 * *@len is reduced by @vnet_hdr_sz. Returns -EINVAL when *@len is smaller
 * than @vnet_hdr_sz or the header cannot be built from the skb, otherwise
 * the result of memcpy_to_msg().
 */
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
                           size_t *len, int vnet_hdr_sz)
{
        struct virtio_net_hdr_mrg_rxbuf hdr_buf = { .num_buffers = 0 };
        struct virtio_net_hdr *hdr = (struct virtio_net_hdr *)&hdr_buf;

        if (*len < vnet_hdr_sz)
                return -EINVAL;
        *len -= vnet_hdr_sz;

        if (virtio_net_hdr_from_skb(skb, hdr, vio_le(), true, 0))
                return -EINVAL;

        return memcpy_to_msg(msg, (void *)&hdr_buf, vnet_hdr_sz);
}

/*
 * This function makes lazy skb cloning in hope that most of packets
 * are discarded by BPF.
 *
 * Note tricky part: we DO mangle shared skb! skb->data, skb->len
 * and skb->cb are mangled. It works because (and until) packets
 * falling here are owned by current CPU. Output packets are cloned
 * by dev_queue_xmit_nit(), input packets are processed by net_bh
 * sequentially, so that if we return skb to original state on exit,
 * we will not harm anyone.
 *
 * Receive hook for packet sockets that queue skbs on sk_receive_queue.
 * The socket is taken from pt->af_packet_priv. Always returns 0; packets
 * that are not queued are released through sk_skb_reason_drop().
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        enum skb_drop_reason drop_reason = SKB_CONSUMED;
        struct sock *sk = NULL;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        /* Saved so that a shared skb can be put back into its original
         * state before it is released.
         */
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        skb->dev = dev;

        if (dev_has_header(dev)) {
                /* The device has an explicit notion of ll header,
                 * exported to higher levels.
                 *
                 * Otherwise, the device hides details of its frame
                 * structure, so that corresponding packet head is
                 * never delivered to user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        /* Start from the full length, or only the linear part when the
         * frags are not readable; the filter result may shrink it further.
         */
        snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);

        /* A filter result of 0 drops the packet without accounting. */
        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
                goto drop_n_acct;

        /* The packet will be kept: a shared skb is now cloned, and the
         * original is restored and released.
         */
        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                consume_skb(skb);
                skb = nskb;
        }

        sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_hatype = dev->type;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
         * Use their space for storing the original skb length.
         */
        PACKET_SKB_CB(skb)->sa.origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset_ct(skb);

        /* The packet counter and the queue are updated under the receive
         * queue lock.
         */
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.stats1.tp_packets++;
        sock_skb_set_dropcount(sk, skb);
        skb_clear_delivery_time(skb);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk);
        return 0;

        /* Drop and count the drop against both the packet socket and the
         * generic socket counters.
         */
drop_n_acct:
        atomic_inc(&po->tp_drops);
        atomic_inc(&sk->sk_drops);
        drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;

        /* Undo the header push/pull on an skb that is still shared. */
drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        sk_skb_reason_drop(sk, skb, drop_reason);
        return 0;
}

/*
 * tpacket_rcv - receive hook that copies a packet into the socket's RX ring.
 * @skb:      packet being delivered; released through sk_skb_reason_drop()
 *            on every path, including success (drop_reason stays SKB_CONSUMED)
 * @dev:      device used for namespace check, header parsing and sll fields
 * @pt:       packet_type whose af_packet_priv holds the owning socket
 * @orig_dev: device reported in sll_ifindex when PACKET_SOCK_ORIGDEV is set
 *
 * Runs the socket filter, computes where the MAC and network headers will
 * land inside a ring frame (macoff/netoff), reserves a frame with
 * packet_current_rx_frame(), copies up to snaplen bytes with skb_copy_bits(),
 * fills the version specific tpacket header followed by a sockaddr_ll, and
 * finally writes the frame status (TP_STATUS_USER plus flag bits).
 *
 * Always returns 0.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        enum skb_drop_reason drop_reason = SKB_CONSUMED;
        struct sock *sk = NULL;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union tpacket_uhdr h;
        /* Saved so the data pointer/length can be restored on a shared skb. */
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_USER;
        unsigned short macoff, hdrlen;
        unsigned int netoff;
        struct sk_buff *copy_skb = NULL;
        struct timespec64 ts;
        __u32 ts_status;
        unsigned int slot_id = 0;
        int vnet_hdr_sz = 0;

        /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
         * We may add members to them until current aligned size without forcing
         * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
         */
        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
        BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        /* Ignore packets from a device in a different network namespace. */
        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        if (dev_has_header(dev)) {
                /* Non-DGRAM sockets see the packet starting at the MAC header. */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        /* Capture only the linear part when the frags are not readable. */
        snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);

        /* A zero filter result drops the packet; otherwise it caps snaplen below. */
        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;

        /* If we are flooded, just give up */
        if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
                atomic_inc(&po->tp_drops);
                goto drop_n_restore;
        }

        /* Translate skb checksum/GSO state into TP_STATUS_* flag bits. */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;
        else if (skb->pkt_type != PACKET_OUTGOING &&
                 skb_csum_unnecessary(skb))
                status |= TP_STATUS_CSUM_VALID;
        if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
                status |= TP_STATUS_GSO_TCP;

        if (snaplen > res)
                snaplen = res;

        /* Compute the in-frame offsets of the MAC (macoff) and network
         * (netoff) headers. At least 16 bytes are kept in front of the
         * network header; a virtio-net header, when enabled, is accounted
         * for in front of the MAC header.
         */
        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned int maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                                       po->tp_reserve;
                vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
                if (vnet_hdr_sz)
                        netoff += vnet_hdr_sz;
                macoff = netoff - maclen;
        }
        /* macoff is an unsigned short, so larger offsets cannot be represented. */
        if (netoff > USHRT_MAX) {
                atomic_inc(&po->tp_drops);
                goto drop_n_restore;
        }
        if (po->tp_version <= TPACKET_V2) {
                if (macoff + snaplen > po->rx_ring.frame_size) {
                        /* Packet does not fit in one frame. If copy_thresh is
                         * set and there is receive buffer space, keep a
                         * reference (or clone) of the whole skb to queue on
                         * sk_receive_queue; the frame is then flagged with
                         * TP_STATUS_COPY further down.
                         */
                        if (READ_ONCE(po->copy_thresh) &&
                            atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                                if (skb_shared(skb)) {
                                        copy_skb = skb_clone(skb, GFP_ATOMIC);
                                } else {
                                        copy_skb = skb_get(skb);
                                        /* NOTE(review): presumably makes the
                                         * restore at drop_n_restore a no-op so
                                         * the queued skb keeps the adjusted
                                         * data pointer - confirm.
                                         */
                                        skb_head = skb->data;
                                }
                                if (copy_skb) {
                                        memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0,
                                               sizeof(PACKET_SKB_CB(copy_skb)->sa.ll));
                                        skb_set_owner_r(copy_skb, sk);
                                }
                        }
                        /* Truncate the capture to what the frame can hold. */
                        snaplen = po->rx_ring.frame_size - macoff;
                        if ((int)snaplen < 0) {
                                snaplen = 0;
                                vnet_hdr_sz = 0;
                        }
                }
        } else if (unlikely(macoff + snaplen >
                            GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
                u32 nval;

                /* V3: clamp the capture to the ring's max_frame_len. */
                nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
                pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
                            snaplen, nval, macoff);
                snaplen = nval;
                if (unlikely((int)snaplen < 0)) {
                        snaplen = 0;
                        macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
                        vnet_hdr_sz = 0;
                }
        }
        /* Frame reservation and ring head updates happen under the queue lock. */
        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_rx_frame(po, skb,
                                        TP_STATUS_KERNEL, (macoff+snaplen));
        if (!h.raw)
                goto drop_n_account;

        if (po->tp_version <= TPACKET_V2) {
                /* Claim the slot in rx_owner_map while it is filled outside
                 * the lock; the bit is cleared after the status is written.
                 * A slot whose bit is already set is treated as a drop.
                 */
                slot_id = po->rx_ring.head;
                if (test_bit(slot_id, po->rx_ring.rx_owner_map))
                        goto drop_n_account;
                __set_bit(slot_id, po->rx_ring.rx_owner_map);
        }

        /* The virtio-net header is written immediately before the MAC header. */
        if (vnet_hdr_sz &&
            virtio_net_hdr_from_skb(skb, h.raw + macoff -
                                    sizeof(struct virtio_net_hdr),
                                    vio_le(), true, 0)) {
                if (po->tp_version == TPACKET_V3)
                        prb_clear_blk_fill_status(&po->rx_ring);
                goto drop_n_account;
        }

        if (po->tp_version <= TPACKET_V2) {
                packet_increment_rx_head(po, &po->rx_ring);
        /*
         * LOSING will be reported till you read the stats,
         * because it's COR - Clear On Read.
         * Anyways, moving it for V1/V2 only as V3 doesn't need this
         * at packet level.
         */
                if (atomic_read(&po->tp_drops))
                        status |= TP_STATUS_LOSING;
        }

        po->stats.stats1.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                skb_clear_delivery_time(copy_skb);
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        spin_unlock(&sk->sk_receive_queue.lock);

        /* Copy the (possibly truncated) packet data into the frame. */
        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        /* Always timestamp; prefer an existing software timestamp taken
         * closer to the time of capture.
         */
        ts_status = tpacket_get_timestamp(skb, &ts,
                                          READ_ONCE(po->tp_tstamp) |
                                          SOF_TIMESTAMPING_SOFTWARE);
        if (!ts_status)
                ktime_get_real_ts64(&ts);

        status |= ts_status;

        /* Fill the version specific frame header; hdrlen records its size so
         * the sockaddr_ll can be placed right after it.
         */
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                h.h1->tp_sec = ts.tv_sec;
                h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                if (skb_vlan_tag_present(skb)) {
                        h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
                        h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
                        status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
                        h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev);
                        h.h2->tp_vlan_tpid = ntohs(skb->protocol);
                        status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                } else {
                        h.h2->tp_vlan_tci = 0;
                        h.h2->tp_vlan_tpid = 0;
                }
                memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
                hdrlen = sizeof(*h.h2);
                break;
        case TPACKET_V3:
                /* tp_nxt_offset,vlan are already populated above.
                 * So DONT clear those fields here
                 */
                h.h3->tp_status |= status;
                h.h3->tp_len = skb->len;
                h.h3->tp_snaplen = snaplen;
                h.h3->tp_mac = macoff;
                h.h3->tp_net = netoff;
                h.h3->tp_sec  = ts.tv_sec;
                h.h3->tp_nsec = ts.tv_nsec;
                memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
                hdrlen = sizeof(*h.h3);
                break;
        default:
                BUG();
        }

        /* The link-level address follows the aligned tpacket header. */
        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ?
                vlan_get_protocol_dgram(skb) : skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV)))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        /* Barrier between filling the frame and the status write below. */
        smp_mb();

#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
        if (po->tp_version <= TPACKET_V2) {
                u8 *start, *end;

                end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
                                        macoff + snaplen);

                /* Flush every page touched by this frame. */
                for (start = h.raw; start < end; start += PAGE_SIZE)
                        flush_dcache_page(pgv_to_page(start));
        }
        smp_wmb();
#endif

        if (po->tp_version <= TPACKET_V2) {
                /* Write the status and release the slot claim together under
                 * the queue lock, then notify the socket.
                 */
                spin_lock(&sk->sk_receive_queue.lock);
                __packet_set_status(po, h.raw, status);
                __clear_bit(slot_id, po->rx_ring.rx_owner_map);
                spin_unlock(&sk->sk_receive_queue.lock);
                sk->sk_data_ready(sk);
        } else if (po->tp_version == TPACKET_V3) {
                prb_clear_blk_fill_status(&po->rx_ring);
        }

        /* The success path falls through: the skb is always released below. */
drop_n_restore:
        /* Undo the push/pull done above if other users share this skb. */
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        sk_skb_reason_drop(sk, skb, drop_reason);
        return 0;

drop_n_account:
        /* Entered with sk_receive_queue.lock held: unlock, count the drop,
         * notify the socket and release the optional copy before the common
         * exit path.
         */
        spin_unlock(&sk->sk_receive_queue.lock);
        atomic_inc(&po->tp_drops);
        drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;

        sk->sk_data_ready(sk);
        sk_skb_reason_drop(sk, copy_skb, drop_reason);
        goto drop_n_restore;
}

/*
 * skb destructor installed by tpacket_snd() on skbs built from TX ring
 * frames. If the TX ring still exists, the frame recorded in the skb is
 * handed back (TP_STATUS_AVAILABLE plus timestamp flags), the pending
 * counter is decremented and a waiter on skb_completion is woken.
 * The skb's write memory is released via sock_wfree() in all cases.
 */
static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *frame;
        __u32 tstamp_flags;

        /* No TX ring: nothing to hand back. */
        if (unlikely(!po->tx_ring.pg_vec))
                goto release;

        frame = skb_zcopy_get_nouarg(skb);
        packet_dec_pending(&po->tx_ring);

        tstamp_flags = __packet_set_timestamp(po, frame, skb);
        __packet_set_status(po, frame, TP_STATUS_AVAILABLE | tstamp_flags);

        complete(&po->skb_completion);

release:
        sock_wfree(skb);
}

/*
 * Sanity-check a virtio-net header supplied for transmit.
 *
 * When VIRTIO_NET_HDR_F_NEEDS_CSUM is set and the 2-byte checksum field
 * (csum_start + csum_offset + 2) would end beyond hdr_len, hdr_len is raised
 * in place to cover it.
 *
 * Returns 0 on success, -EINVAL if the resulting hdr_len exceeds @len.
 */
static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
        if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
                /* Offset one past the last byte of the checksum field. */
                int csum_end = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
                               __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2;

                if (csum_end > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))
                        vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), csum_end);
        }

        if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
                return -EINVAL;

        return 0;
}

/*
 * Pull the virtio-net header off the front of a sendmsg() payload.
 *
 * @len is reduced by @vnet_hdr_sz, sizeof(*vnet_hdr) bytes are copied from
 * the message iterator into @vnet_hdr and validated, and any remaining
 * header bytes (when @vnet_hdr_sz is not sizeof(struct virtio_net_hdr)) are
 * skipped so the iterator ends up at the start of the MAC header.
 *
 * Returns 0 on success, -EINVAL if the message is shorter than the header
 * or validation fails, -EFAULT if the copy fails.
 */
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
                                 struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
        int rc;

        if (*len < vnet_hdr_sz)
                return -EINVAL;
        *len -= vnet_hdr_sz;

        if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
                return -EFAULT;

        rc = __packet_snd_vnet_parse(vnet_hdr, *len);

        /* On success, step over the part of the header that was not copied. */
        if (!rc && vnet_hdr_sz != sizeof(struct virtio_net_hdr))
                iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));

        return rc;
}

/*
 * Build a transmit skb from one TX ring frame.
 *
 * Sets the skb metadata from @proto, @dev and @sockc, remembers @frame in
 * the skb (skb_zcopy_set_nouarg), and reserves @hlen bytes of headroom.
 * For SOCK_DGRAM the link-layer header is generated with dev_hard_header();
 * otherwise, when @copylen is non-zero, the first min(@copylen, @tp_len)
 * bytes of @data are copied into the linear area and validated as a device
 * header. The rest of the payload is attached page by page as fragments
 * that reference the ring memory, not copies of it.
 *
 * Returns @tp_len on success, or a negative errno (-EINVAL for header
 * problems, -EFAULT when more than MAX_SKB_FRAGS pages would be needed, or
 * the skb_store_bits() error).
 */
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, void *data, int tp_len,
                __be16 proto, unsigned char *addr, int hlen, int copylen,
                const struct sockcm_cookie *sockc)
{
        struct socket *sock = po->sk.sk_socket;
        int remaining = tp_len;
        int pg_off, chunk, frag_idx;
        struct page *page;
        int err;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sockc->priority;
        skb->mark = sockc->mark;
        skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid);
        skb_setup_tx_timestamp(skb, sockc);
        /* Remember the ring frame so the destructor can find it again. */
        skb_zcopy_set_nouarg(skb, frame);

        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                      NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (copylen) {
                int hdrlen = min_t(int, copylen, tp_len);

                /* Linear area: hard header in the headroom plus the rest of
                 * copylen behind it.
                 */
                skb_push(skb, dev->hard_header_len);
                skb_put(skb, copylen - dev->hard_header_len);
                err = skb_store_bits(skb, 0, data, hdrlen);
                if (unlikely(err))
                        return err;
                if (!dev_validate_header(dev, skb->data, hdrlen))
                        return -EINVAL;

                data += hdrlen;
                remaining -= hdrlen;
        }

        /* Account for the paged part up front. */
        skb->data_len = remaining;
        skb->len += remaining;
        skb->truesize += remaining;
        refcount_add(remaining, &po->sk.sk_wmem_alloc);

        /* Only the first fragment may start at a non-zero page offset. */
        for (pg_off = offset_in_page(data); likely(remaining); pg_off = 0) {
                frag_idx = skb_shinfo(skb)->nr_frags;

                if (unlikely(frag_idx >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceed the number of skb frags(%u)\n",
                               (unsigned int)MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                /* Never let a fragment cross a page boundary. */
                chunk = min_t(int, remaining, PAGE_SIZE - pg_off);

                page = pgv_to_page(data);
                data += chunk;
                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb, frag_idx, page, pg_off, chunk);
                remaining -= chunk;
        }

        packet_parse_headers(skb, sock);

        return tp_len;
}

static int tpacket_parse_header(struct packet_sock *po, void *frame,
                                int size_max, void **data)
{
        union tpacket_uhdr ph;
        int tp_len, off;

        ph.raw = frame;

        switch (po->tp_version) {
        case TPACKET_V3:
                if (ph.h3->tp_next_offset != 0) {
                        pr_warn_once("variable sized slot not supported");
                        return -EINVAL;
                }
                tp_len = ph.h3->tp_len;
                break;
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) {
                int off_min, off_max;

                off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
                off_max = po->tx_ring.frame_size - tp_len;
                if (po->sk.sk_type == SOCK_DGRAM) {
                        switch (po->tp_version) {
                        case TPACKET_V3:
                                off = ph.h3->tp_net;
                                break;
                        case TPACKET_V2:
                                off = ph.h2->tp_net;
                                break;
                        default:
                                off = ph.h1->tp_net;
                                break;
                        }
                } else {
                        switch (po->tp_version) {
                        case TPACKET_V3:
                                off = ph.h3->tp_mac;
                                break;
                        case TPACKET_V2:
                                off = ph.h2->tp_mac;
                                break;
                        default:
                                off = ph.h1->tp_mac;
                                break;
                        }
                }
                if (unlikely((off < off_min) || (off_max < off)))
                        return -EINVAL;
        } else {
                off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
        }

        *data = frame + off;
        return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        struct virtio_net_hdr *vnet_hdr = NULL;
        struct sockcm_cookie sockc;
        __be16 proto;
        int err, reserve = 0;
        void *ph;
        DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
        bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
        int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
        unsigned char *addr = NULL;
        int tp_len, size_max;
        void *data;
        int len_sum = 0;
        int status = TP_STATUS_AVAILABLE;
        int hlen, tlen, copylen = 0;
        long timeo = 0;

        mutex_lock(&po->pg_vec_lock);

        /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
         * we need to confirm it under protection of pg_vec_lock.
         */
        if (unlikely(!po->tx_ring.pg_vec)) {
                err = -EBUSY;
                goto out;
        }
        if (likely(saddr == NULL)) {
                dev        = packet_cached_dev_get(po);
                proto        = READ_ONCE(po->num);
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                proto        = saddr->sll_protocol;
                dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
                if (po->sk.sk_socket->type == SOCK_DGRAM) {
                        if (dev && msg->msg_namelen < dev->addr_len +
                                   offsetof(struct sockaddr_ll, sll_addr))
                                goto out_put;
                        addr = saddr->sll_addr;
                }
        }

        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;
        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        sockcm_init(&sockc, &po->sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(&po->sk, msg, &sockc);
                if (unlikely(err))
                        goto out_put;
        }

        if (po->sk.sk_socket->type == SOCK_RAW)
                reserve = dev->hard_header_len;
        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
                size_max = dev->mtu + reserve + VLAN_HLEN;

        reinit_completion(&po->skb_completion);

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                          TP_STATUS_SEND_REQUEST);
                if (unlikely(ph == NULL)) {
                        if (need_wait && skb) {
                                timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
                                timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
                                if (timeo <= 0) {
                                        err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
                                        goto out_put;
                                }
                        }
                        /* check for additional frames */
                        continue;
                }

                skb = NULL;
                tp_len = tpacket_parse_header(po, ph, size_max, &data);
                if (tp_len < 0)
                        goto tpacket_error;

                status = TP_STATUS_SEND_REQUEST;
                hlen = LL_RESERVED_SPACE(dev);
                tlen = dev->needed_tailroom;
                if (vnet_hdr_sz) {
                        vnet_hdr = data;
                        data += vnet_hdr_sz;
                        tp_len -= vnet_hdr_sz;
                        if (tp_len < 0 ||
                            __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
                                tp_len = -EINVAL;
                                goto tpacket_error;
                        }
                        copylen = __virtio16_to_cpu(vio_le(),
                                                    vnet_hdr->hdr_len);
                }
                copylen = max_t(int, copylen, dev->hard_header_len);
                skb = sock_alloc_send_skb(&po->sk,
                                hlen + tlen + sizeof(struct sockaddr_ll) +
                                (copylen - dev->hard_header_len),
                                !need_wait, &err);

                if (unlikely(skb == NULL)) {
                        /* we assume the socket was initially writeable ... */
                        if (likely(len_sum > 0))
                                err = len_sum;
                        goto out_status;
                }
                tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
                                          addr, hlen, copylen, &sockc);
                if (likely(tp_len >= 0) &&
                    tp_len > dev->mtu + reserve &&
                    !vnet_hdr_sz &&
                    !packet_extra_vlan_len_allowed(dev, skb))
                        tp_len = -EMSGSIZE;

                if (unlikely(tp_len < 0)) {
tpacket_error:
                        if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                if (vnet_hdr_sz) {
                        if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
                                tp_len = -EINVAL;
                                goto tpacket_error;
                        }
                        virtio_net_hdr_set_proto(skb, vnet_hdr);
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                packet_inc_pending(&po->tx_ring);

                status = TP_STATUS_SEND_REQUEST;
                err = packet_xmit(po, skb);
                if (unlikely(err != 0)) {
                        if (err > 0)
                                err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                /* Note: packet_read_pending() might be slow if we have
                 * to call it as it's per_cpu variable, but in fast-path
                 * we already short-circuit the loop with the first
                 * condition, and luckily don't have to go that path
                 * anyway.
                 */
                 (need_wait && packet_read_pending(&po->tx_ring))));

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}

/*
 * Allocate an skb for the non-ring send path.
 *
 * @prepad:  bytes allocated in front of the linear data
 * @reserve: headroom reserved with skb_reserve()
 * @len:     total payload length
 * @linear:  requested size of the linear part; 0 means "all of it"
 * @noblock: passed to sock_alloc_send_pskb()
 * @err:     out: error code from sock_alloc_send_pskb()
 *
 * The skb is returned with skb->len == @len, the first @linear bytes in the
 * linear area and the remainder accounted as paged data (data_len).
 * Returns NULL on allocation failure.
 */
static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
                                        size_t reserve, size_t len,
                                        size_t linear, int noblock,
                                        int *err)
{
        /* Most bytes that fit into the page fragments of a single skb. */
        const size_t frag_cap = MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
        struct sk_buff *skb;
        size_t paged;

        /* Under a page?  Don't bother with paged skb. */
        if (!linear || prepad + len < PAGE_SIZE)
                linear = len;

        /* Grow the linear part when the fragments cannot hold the rest. */
        if (len - linear > frag_cap)
                linear = len - frag_cap;

        paged = len - linear;

        skb = sock_alloc_send_pskb(sk, prepad + linear, paged, noblock,
                                   err, PAGE_ALLOC_COSTLY_ORDER);
        if (!skb)
                return NULL;

        skb_reserve(skb, reserve);
        skb_put(skb, linear);
        skb->data_len = paged;
        skb->len += paged;

        return skb;
}

/*
 * packet_snd - send one packet on a packet socket without using a TX ring.
 * @sock: socket being written to
 * @msg:  message; msg_name may carry a destination sockaddr_ll, msg_iter the
 *        payload (preceded by a virtio-net header when one is configured)
 * @len:  number of bytes in @msg
 *
 * This is the path packet_sendmsg() takes when the socket has no TX ring.
 * The payload is copied from the iterator into a freshly allocated skb and
 * handed to packet_xmit().
 *
 * Returns @len (including the virtio-net header bytes, if any) on success
 * or a negative errno.
 */
static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr = NULL;
        int err, reserve = 0;
        struct sockcm_cookie sockc;
        struct virtio_net_hdr vnet_hdr = { 0 };
        int offset = 0;
        struct packet_sock *po = pkt_sk(sk);
        int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
        int hlen, tlen, linear;
        int extra_len = 0;

        /*
         *        Get and verify the address.
         */

        if (likely(saddr == NULL)) {
                /* No destination given: use the bound device and protocol. */
                dev        = packet_cached_dev_get(po);
                proto        = READ_ONCE(po->num);
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                proto        = saddr->sll_protocol;
                dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
                if (sock->type == SOCK_DGRAM) {
                        /* The address must cover a full device address. */
                        if (dev && msg->msg_namelen < dev->addr_len +
                                   offsetof(struct sockaddr_ll, sll_addr))
                                goto out_unlock;
                        addr = saddr->sll_addr;
                }
        }

        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out_unlock;
        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_unlock;

        sockcm_init(&sockc, sk);
        if (msg->msg_controllen) {
                err = sock_cmsg_send(sk, msg, &sockc);
                if (unlikely(err))
                        goto out_unlock;
        }

        /* SOCK_RAW payloads already contain the link-layer header. */
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;
        if (vnet_hdr_sz) {
                /* Strips the virtio-net header; len is reduced accordingly. */
                err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
                if (err)
                        goto out_unlock;
        }

        if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
                if (!netif_supports_nofcs(dev)) {
                        err = -EPROTONOSUPPORT;
                        goto out_unlock;
                }
                extra_len = 4; /* We're doing our own CRC */
        }

        /* Coarse size check; a possible VLAN header is allowed for here and
         * re-checked once the packet contents are known (non-GSO only).
         */
        err = -EMSGSIZE;
        if (!vnet_hdr.gso_type &&
            (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
                goto out_unlock;

        err = -ENOBUFS;
        hlen = LL_RESERVED_SPACE(dev);
        tlen = dev->needed_tailroom;
        /* Linear part: at least the virtio hdr_len, and at least the
         * link-layer header (or the whole packet if it is shorter).
         */
        linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
        linear = max(linear, min_t(int, len, dev->hard_header_len));
        skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
                               msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM) {
                /* Build the link-layer header; the payload is copied after it. */
                offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
                if (unlikely(offset < 0))
                        goto out_free;
        } else if (reserve) {
                /* Move data back so the user supplied header lands in the
                 * headroom.
                 */
                skb_reserve(skb, -reserve);
                if (len < reserve + sizeof(struct ipv6hdr) &&
                    dev->min_header_len != dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
        if (err)
                goto out_free;

        /* Reject invalid raw headers and empty packets. */
        if ((sock->type == SOCK_RAW &&
             !dev_validate_header(dev, skb->data, len)) || !skb->len) {
                err = -EINVAL;
                goto out_free;
        }

        skb_setup_tx_timestamp(skb, &sockc);

        /* Exact MTU check: the extra VLAN_HLEN is only granted when
         * packet_extra_vlan_len_allowed() accepts this packet.
         */
        if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
            !packet_extra_vlan_len_allowed(dev, skb)) {
                err = -EMSGSIZE;
                goto out_free;
        }

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sockc.priority;
        skb->mark = sockc.mark;
        skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);

        if (unlikely(extra_len == 4))
                skb->no_fcs = 1;

        packet_parse_headers(skb, sock);

        if (vnet_hdr_sz) {
                err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
                if (err)
                        goto out_free;
                /* Report the stripped header bytes as sent, too. */
                len += vnet_hdr_sz;
                virtio_net_hdr_set_proto(skb, &vnet_hdr);
        }

        err = packet_xmit(po, skb);

        /* NOTE(review): the skb is not freed here on error; presumably
         * packet_xmit() has consumed it - confirm.
         */
        if (unlikely(err != 0)) {
                if (err > 0)
                        err = net_xmit_errno(err);
                if (err)
                        goto out_unlock;
        }

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        /* Despite the label name, this only drops the device reference. */
        dev_put(dev);
out:
        return err;
}

/*
 *        sendmsg entry point. Picks the TX-ring transmit path when a TX ring
 *        appears to be configured, and the regular copy path otherwise.
 */
static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
        struct packet_sock *po = pkt_sk(sock->sk);

        /* tx_ring.pg_vec is peeked at here without pg_vec_lock, so the
         * read is racy by design; tpacket_snd() repeats the check safely.
         */
        if (!data_race(po->tx_ring.pg_vec))
                return packet_snd(sock, msg, len);

        return tpacket_snd(po, msg);
}

/*
 *        Close a PACKET socket. This is fairly simple. We immediately go
 *        to 'closed' state and remove our protocol entry in the device list.
 *
 *        Always returns 0. A socket with no struct sock attached is a no-op.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct packet_fanout *f;
        struct net *net;
        union tpacket_req_u req_u;

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        /* Take the socket off the per-namespace packet socket list. */
        mutex_lock(&net->packet.sklist_lock);
        sk_del_node_init_rcu(sk);
        mutex_unlock(&net->packet.sklist_lock);

        sock_prot_inuse_add(net, sk->sk_prot, -1);

        /* Under bind_lock: detach the protocol hook, forget the cached
         * device and drop the reference held on the bound device, if any.
         */
        spin_lock(&po->bind_lock);
        unregister_prot_hook(sk, false);
        packet_cached_dev_reset(po);

        if (po->prot_hook.dev) {
                netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);
                po->prot_hook.dev = NULL;
        }
        spin_unlock(&po->bind_lock);

        packet_flush_mclist(sk);

        /* Tear down whichever rings exist by submitting an all-zero
         * request. The last argument selects the TX ring (1) or the RX
         * ring (0); the third argument is 1 only on this path -
         * NOTE(review): presumably the "closing" flag, confirm against
         * packet_set_ring().
         */
        lock_sock(sk);
        if (po->rx_ring.pg_vec) {
                memset(&req_u, 0, sizeof(req_u));
                packet_set_ring(sk, &req_u, 1, 0);
        }

        if (po->tx_ring.pg_vec) {
                memset(&req_u, 0, sizeof(req_u));
                packet_set_ring(sk, &req_u, 1, 1);
        }
        release_sock(sk);

        f = fanout_release(sk);

        /* Rollover and fanout state are only freed after this call.
         * NOTE(review): presumably so that in-flight receive-path readers
         * have finished with them - confirm.
         */
        synchronize_net();

        kfree(po->rollover);
        if (f) {
                fanout_release_data(f);
                kvfree(f);
        }
        /*
         *        Now the socket is dead. No more input will appear.
         */
        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        packet_free_pending(po);

        sock_put(sk);
        return 0;
}

/*
 *        Attach a packet hook.
 *
 *        Binds @sk to the device selected by @name (if non-NULL) or by
 *        @ifindex (if non-zero), or to no particular device when both are
 *        unset, for protocol @proto. A zero @proto keeps the socket's
 *        current protocol (po->num).
 *
 *        Returns 0 on success, -EINVAL if the socket is part of a fanout
 *        group, -ENODEV if the requested device cannot be found. A device
 *        that is down or has vanished is not reported through the return
 *        value: sk->sk_err is set to ENETDOWN and 0 is returned.
 */

static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
                          __be16 proto)
{
        struct packet_sock *po = pkt_sk(sk);
        struct net_device *dev = NULL;
        bool unlisted = false;
        bool need_rehook;
        int ret = 0;

        lock_sock(sk);
        spin_lock(&po->bind_lock);
        if (!proto)
                proto = po->num;

        /* Device lookups below use the _rcu variants. */
        rcu_read_lock();

        if (po->fanout) {
                ret = -EINVAL;
                goto out_unlock;
        }

        if (name) {
                dev = dev_get_by_name_rcu(sock_net(sk), name);
                if (!dev) {
                        ret = -ENODEV;
                        goto out_unlock;
                }
        } else if (ifindex) {
                dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
                if (!dev) {
                        ret = -ENODEV;
                        goto out_unlock;
                }
        }

        /* Nothing to redo if both protocol and device are unchanged. */
        need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev;

        if (need_rehook) {
                /* Temporary reference: the RCU read section is dropped
                 * below while the old hook is removed, so dev must be
                 * pinned across that window. dev may be NULL here.
                 */
                dev_hold(dev);
                if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
                        rcu_read_unlock();
                        /* prevents packet_notifier() from calling
                         * register_prot_hook()
                         */
                        WRITE_ONCE(po->num, 0);
                        __unregister_prot_hook(sk, true);
                        rcu_read_lock();
                        /* Re-check that the device is still findable by
                         * index after the RCU section was re-entered.
                         */
                        if (dev)
                                unlisted = !dev_get_by_index_rcu(sock_net(sk),
                                                                 dev->ifindex);
                }

                BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
                WRITE_ONCE(po->num, proto);
                po->prot_hook.type = proto;

                /* Release the tracked reference on the previously bound device. */
                netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker);

                if (unlikely(unlisted)) {
                        /* Device went away meanwhile: leave the socket
                         * unbound, with ifindex -1.
                         */
                        po->prot_hook.dev = NULL;
                        WRITE_ONCE(po->ifindex, -1);
                        packet_cached_dev_reset(po);
                } else {
                        netdev_hold(dev, &po->prot_hook.dev_tracker,
                                    GFP_ATOMIC);
                        po->prot_hook.dev = dev;
                        WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0);
                        packet_cached_dev_assign(po, dev);
                }
                /* Drop the temporary reference taken by dev_hold() above. */
                dev_put(dev);
        }

        if (proto == 0 || !need_rehook)
                goto out_unlock;

        /* Register the hook only when bound to "any device" or to a device
         * that is up; otherwise flag ENETDOWN on the socket.
         */
        if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
                register_prot_hook(sk);
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);
        }

out_unlock:
        rcu_read_unlock();
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return ret;
}

/*
 *        Bind a packet socket to the device whose name is carried in
 *        uaddr->sa_data. The socket's current protocol is kept.
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        /* One extra byte so the copied name can always be terminated. */
        char name[sizeof(uaddr->sa_data_min) + 1];

        /* Only an address of exactly sizeof(struct sockaddr) is legal. */
        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;

        /* sa_data originates from userspace and need not contain a NUL,
         * so terminate the local copy explicitly.
         */
        memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min));
        name[sizeof(uaddr->sa_data_min)] = '\0';

        return packet_do_bind(sock->sk, name, 0, 0);
}

/*
 *        Bind a packet socket using a struct sockaddr_ll: the device is
 *        chosen by sll_ifindex and the protocol by sll_protocol.
 *        Returns -EINVAL for a short address or a family other than
 *        AF_PACKET, otherwise whatever packet_do_bind() returns.
 */
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;

        /* The length test comes first so sll_family is only read from an
         * address that is large enough.
         */
        if (addr_len < sizeof(struct sockaddr_ll) ||
            sll->sll_family != AF_PACKET)
                return -EINVAL;

        return packet_do_bind(sock->sk, NULL, sll->sll_ifindex,
                              sll->sll_protocol);
}

/* Protocol descriptor passed to sk_alloc() by packet_create(); obj_size
 * makes each allocated sock large enough to hold a struct packet_sock.
 */
static struct proto packet_proto = {
        .owner    = THIS_MODULE,
        .name     = "PACKET",
        .obj_size = sizeof(struct packet_sock),
};

/*
 *        Create a packet socket. Accepted socket types are SOCK_DGRAM,
 *        SOCK_RAW and SOCK_PACKET.
 *
 *        @protocol is reinterpreted as a __be16 without byte swapping. A
 *        non-zero protocol registers the receive hook immediately.
 *
 *        Returns 0 on success, -EPERM without CAP_NET_RAW in the
 *        namespace's user namespace, -ESOCKTNOSUPPORT for any other socket
 *        type, -ENOBUFS if the sock cannot be allocated, or the error from
 *        packet_alloc_pending().
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
        if (sk == NULL)
                goto out;

        /* SOCK_PACKET sockets get their own operations table. */
        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        po = pkt_sk(sk);
        err = packet_alloc_pending(po);
        if (err)
                goto out_sk_free;

        sock_init_data(sock, sk);

        init_completion(&po->skb_completion);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        packet_cached_dev_reset(po);

        sk->sk_destruct = packet_sock_destruct;

        /*
         *        Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->rollover = NULL;
        /* Receive handler: packet_rcv by default, packet_rcv_spkt for
         * SOCK_PACKET sockets.
         */
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;
        po->prot_hook.af_packet_net = sock_net(sk);

        /* With protocol 0 no hook is registered here. */
        if (proto) {
                po->prot_hook.type = proto;
                __register_prot_hook(sk);
        }

        /* Publish the socket on the per-namespace packet socket list. */
        mutex_lock(&net->packet.sklist_lock);
        sk_add_node_tail_rcu(sk, &net->packet.sklist);
        mutex_unlock(&net->packet.sklist_lock);

        sock_prot_inuse_add(net, &packet_proto, 1);

        return 0;
out_sk_free:
        sk_free(sk);
out:
        return err;
}

/*
 *        Pull a packet from our receive queue and hand it to the user.
 *        If necessary we block.
 *
 *        Accepted flags: MSG_PEEK, MSG_DONTWAIT, MSG_TRUNC, MSG_CMSG_COMPAT
 *        and MSG_ERRQUEUE; anything else yields -EINVAL. With MSG_ERRQUEUE
 *        the result of sock_recv_errqueue() is returned instead.
 *
 *        On success the return value is the socket's vnet header size plus
 *        either the full packet length (when MSG_TRUNC was passed in
 *        @flags) or the number of bytes actually copied. On failure a
 *        negative error is returned.
 */

static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
                          int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
        unsigned int origlen = 0;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        if (flags & MSG_ERRQUEUE) {
                err = sock_recv_errqueue(sk, msg, len,
                                         SOL_PACKET, PACKET_TX_TIMESTAMP);
                goto out;
        }

        /*
         *        Call the generic datagram receiver. This handles all sorts
         *        of horrible races and re-entrancy so we can forget about it
         *        in the protocol layers.
         *
         *        Now it will return ENETDOWN, if device have just gone down,
         *        but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, &err);

        /*
         *        An error occurred so return it. Because skb_recv_datagram()
         *        handles the blocking we don't see and worry about blocking
         *        retries.
         */

        if (skb == NULL)
                goto out;

        packet_rcv_try_clear_pressure(pkt_sk(sk));

        /* When a vnet header size is configured, packet_rcv_vnet() handles
         * the header first. It receives &len - NOTE(review): presumably it
         * shrinks len by the header it emits; confirm in packet_rcv_vnet().
         */
        if (vnet_hdr_len) {
                err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
                if (err)
                        goto out_free;
        }

        /* You lose any data beyond the buffer you gave. If it worries
         * a user program they can ask the device for its MTU
         * anyway.
         */
        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (err)
                goto out_free;

        if (sock->type != SOCK_PACKET) {
                struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

                /* Original length was stored in sockaddr_ll fields */
                /* origlen must be read before sll_family/sll_protocol are
                 * written below, per the note above about shared storage.
                 */
                origlen = PACKET_SKB_CB(skb)->sa.origlen;
                sll->sll_family = AF_PACKET;
                sll->sll_protocol = (sock->type == SOCK_DGRAM) ?
                        vlan_get_protocol_dgram(skb) : skb->protocol;
        }

        sock_recv_cmsgs(msg, sk, skb);

        if (msg->msg_name) {
                /* Upper bound for the address copy: never more than the
                 * skb control block or a sockaddr_storage can hold.
                 */
                const size_t max_len = min(sizeof(skb->cb),
                                           sizeof(struct sockaddr_storage));
                int copy_len;

                /* If the address length field is there to be filled
                 * in, we fill it in now.
                 */
                if (sock->type == SOCK_PACKET) {
                        __sockaddr_check_size(sizeof(struct sockaddr_pkt));
                        msg->msg_namelen = sizeof(struct sockaddr_pkt);
                        copy_len = msg->msg_namelen;
                } else {
                        struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;

                        msg->msg_namelen = sll->sll_halen +
                                offsetof(struct sockaddr_ll, sll_addr);
                        copy_len = msg->msg_namelen;
                        /* Short hardware address: report a full
                         * sockaddr_ll and zero the address area first, so
                         * bytes beyond copy_len are not left unset.
                         */
                        if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
                                memset(msg->msg_name +
                                       offsetof(struct sockaddr_ll, sll_addr),
                                       0, sizeof(sll->sll_addr));
                                msg->msg_namelen = sizeof(struct sockaddr_ll);
                        }
                }
                /* Not expected to trigger; clamp rather than overrun. */
                if (WARN_ON_ONCE(copy_len > max_len)) {
                        copy_len = max_len;
                        msg->msg_namelen = copy_len;
                }
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
        }

        /* Optional PACKET_AUXDATA control message with per-packet status,
         * lengths, offsets and VLAN information.
         */
        if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                else if (skb->pkt_type != PACKET_OUTGOING &&
                         skb_csum_unnecessary(skb))
                        aux.tp_status |= TP_STATUS_CSUM_VALID;
                if (skb_is_gso(skb) && skb_is_gso_tcp(skb))
                        aux.tp_status |= TP_STATUS_GSO_TCP;

                /* origlen stays 0 for SOCK_PACKET sockets. */
                aux.tp_len = origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                if (skb_vlan_tag_present(skb)) {
                        aux.tp_vlan_tci = skb_vlan_tag_get(skb);
                        aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
                        aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) {
                        /* No VLAN tag on the skb but the protocol is a VLAN
                         * ethertype: obtain the TCI via vlan_get_tci(),
                         * which needs the receiving device.
                         */
                        struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
                        struct net_device *dev;

                        rcu_read_lock();
                        dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex);
                        if (dev) {
                                aux.tp_vlan_tci = vlan_get_tci(skb, dev);
                                aux.tp_vlan_tpid = ntohs(skb->protocol);
                                aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
                        } else {
                                aux.tp_vlan_tci = 0;
                                aux.tp_vlan_tpid = 0;
                        }
                        rcu_read_unlock();
                } else {
                        aux.tp_vlan_tci = 0;
                        aux.tp_vlan_tpid = 0;
                }
                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *        Free or return the buffer as appropriate. Again this
         *        hides all the races and re-entrancy issues from us.
         */
        err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

/*
 *        Report the local address as a struct sockaddr: family AF_PACKET
 *        and, in sa_data, the name of the bound device (all zeroes when no
 *        device is found for the socket's ifindex). Peer names are not
 *        supported. Returns sizeof(struct sockaddr) on success.
 */
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int peer)
{
        struct sock *sk = sock->sk;
        struct net_device *dev;
        int ifindex;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        /* Start from an all-zero name in case no device is found. */
        memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min));

        rcu_read_lock();
        ifindex = READ_ONCE(pkt_sk(sk)->ifindex);
        dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
        if (dev)
                strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min));
        rcu_read_unlock();

        return sizeof(*uaddr);
}

/*
 *        Report the local address as a struct sockaddr_ll built from the
 *        socket's ifindex and protocol plus, when the device can be found,
 *        its hardware type and address. Peer names are not supported.
 *        Returns the number of address bytes filled in: the offset of
 *        sll_addr plus the hardware address length.
 */
static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int peer)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
        struct net_device *dev;
        int ifindex;

        if (peer)
                return -EOPNOTSUPP;

        ifindex = READ_ONCE(po->ifindex);

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = ifindex;
        sll->sll_protocol = READ_ONCE(po->num);
        sll->sll_pkttype = 0;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
        if (!dev) {
                /* No device: there is no ARPHRD_UNSPEC, so 0 is used. */
                sll->sll_hatype = 0;
                sll->sll_halen = 0;
        } else {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;

                /* The destination is spelled via sockaddr_storage so that
                 * __fortify_memcpy_chk() sees the real buffer size.
                 */
                memcpy(((struct sockaddr_storage *)sll)->__data +
                       offsetof(struct sockaddr_ll, sll_addr) -
                       offsetofend(struct sockaddr_ll, sll_family),
                       dev->dev_addr, dev->addr_len);
        }
        rcu_read_unlock();

        return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (i->alen != dev->addr_len)
                        return -EINVAL;
                if (what > 0)
                        return dev_mc_add(dev, i->addr);
                else
                        return dev_mc_del(dev, i->addr);
                break;
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
        case PACKET_MR_UNICAST:
                if (i->alen != dev->addr_len)
                        return -EINVAL;
                if (what > 0)
                        return dev_uc_add(dev, i->addr);
                else
                        return dev_uc_del(dev, i->addr);
                break;
        default:
                break;
        }
        return 0;
}

/*
 *        Remove from the list at *mlp every entry that refers to @dev:
 *        undo it on the device, unlink it and free it. Other entries are
 *        left in place.
 */
static void packet_dev_mclist_delete(struct net_device *dev,
                                     struct packet_mclist **mlp)
{
        struct packet_mclist *entry;

        for (entry = *mlp; entry != NULL; entry = *mlp) {
                if (entry->ifindex != dev->ifindex) {
                        /* Not ours: step over it. */
                        mlp = &entry->next;
                        continue;
                }

                packet_dev_mc(dev, entry, -1);
                *mlp = entry->next;
                kfree(entry);
        }
}

/*
 *        Add a membership described by @mreq to the socket, under RTNL.
 *
 *        If an identical entry (same ifindex, type, address length and
 *        address) already exists, only its reference count is bumped.
 *        Otherwise a new entry is pushed onto the socket's list and
 *        applied to the device; if applying fails, the entry is removed
 *        again and that error is returned.
 *
 *        Returns 0 on success, -ENODEV for an unknown ifindex, -EINVAL if
 *        the address is longer than the device's, -ENOBUFS on allocation
 *        failure.
 */
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *entry, *cur;
        struct net_device *dev;
        int err;

        rtnl_lock();

        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto done;
        }

        if (mreq->mr_alen > dev->addr_len) {
                err = -EINVAL;
                goto done;
        }

        /* Allocated up front, before the duplicate scan. */
        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry) {
                err = -ENOBUFS;
                goto done;
        }

        for (cur = po->mclist; cur; cur = cur->next) {
                if (cur->ifindex != mreq->mr_ifindex ||
                    cur->type != mreq->mr_type ||
                    cur->alen != mreq->mr_alen ||
                    memcmp(cur->addr, mreq->mr_address, cur->alen) != 0)
                        continue;

                /* Already a member: count it and discard the spare. */
                cur->count++;
                kfree(entry);
                err = 0;
                goto done;
        }

        entry->type = mreq->mr_type;
        entry->ifindex = mreq->mr_ifindex;
        entry->alen = mreq->mr_alen;
        memcpy(entry->addr, mreq->mr_address, entry->alen);
        /* Zero the unused tail of the address buffer. */
        memset(entry->addr + entry->alen, 0,
               sizeof(entry->addr) - entry->alen);
        entry->count = 1;

        entry->next = po->mclist;
        po->mclist = entry;

        err = packet_dev_mc(dev, entry, 1);
        if (err) {
                /* Roll back the list insertion. */
                po->mclist = entry->next;
                kfree(entry);
        }

done:
        rtnl_unlock();
        return err;
}

/*
 *        Drop one reference on the membership matching @mreq, under RTNL.
 *        When the last reference goes, the entry is unlinked, undone on
 *        the device (if the device still exists) and freed. Always
 *        returns 0, including when no entry matches.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist **link;
        struct packet_mclist *cur;
        struct net_device *dev;

        rtnl_lock();

        link = &pkt_sk(sk)->mclist;
        while ((cur = *link) != NULL) {
                if (cur->ifindex != mreq->mr_ifindex ||
                    cur->type != mreq->mr_type ||
                    cur->alen != mreq->mr_alen ||
                    memcmp(cur->addr, mreq->mr_address, cur->alen) != 0) {
                        link = &cur->next;
                        continue;
                }

                /* Only the first matching entry is considered. */
                if (--cur->count == 0) {
                        *link = cur->next;
                        dev = __dev_get_by_index(sock_net(sk), cur->ifindex);
                        if (dev)
                                packet_dev_mc(dev, cur, -1);
                        kfree(cur);
                }
                break;
        }

        rtnl_unlock();
        return 0;
}

/*
 *        Release every membership held by the socket. Each entry is
 *        popped from the list head, undone on its device when that device
 *        still exists, and freed. An empty list is handled without taking
 *        RTNL.
 */
static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *head;
        struct net_device *dev;

        if (!po->mclist)
                return;

        rtnl_lock();
        for (;;) {
                head = po->mclist;
                if (head == NULL)
                        break;

                po->mclist = head->next;

                dev = __dev_get_by_index(sock_net(sk), head->ifindex);
                if (dev)
                        packet_dev_mc(dev, head, -1);
                kfree(head);
        }
        rtnl_unlock();
}

/*
 *        Set a SOL_PACKET socket option.
 *
 *        Returns 0 on success or a negative error:
 *          -ENOPROTOOPT  @level is not SOL_PACKET or @optname is unknown
 *          -EINVAL       @optlen or the supplied value is not acceptable
 *          -EFAULT       the option value could not be copied in
 *          -EBUSY        the option cannot change while an RX or TX ring
 *                        is configured
 *        Options that delegate (memberships, rings, fanout) return the
 *        result of the helper they call.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
                  unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch (optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;

                /* Accept anything from a plain packet_mreq up to the
                 * larger packet_mreq_max; excess bytes are ignored and a
                 * shorter request leaves the tail zeroed.
                 */
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_sockptr(&mreq, optval, len))
                        return -EFAULT;
                /* The claimed address length must fit in what was copied. */
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

        case PACKET_RX_RING:
        case PACKET_TX_RING:
        {
                union tpacket_req_u req_u;

                /* The request layout depends on the configured TPACKET
                 * version. ret stays -EINVAL when optlen is too short; a
                 * failed copy is reported as -EFAULT, like every other
                 * option in this function.
                 */
                ret = -EINVAL;
                lock_sock(sk);
                switch (po->tp_version) {
                case TPACKET_V1:
                case TPACKET_V2:
                        if (optlen < sizeof(req_u.req))
                                break;
                        ret = copy_from_sockptr(&req_u.req, optval,
                                                sizeof(req_u.req)) ?
                                                -EFAULT : 0;
                        break;
                case TPACKET_V3:
                default:
                        if (optlen < sizeof(req_u.req3))
                                break;
                        ret = copy_from_sockptr(&req_u.req3, optval,
                                                sizeof(req_u.req3)) ?
                                                -EFAULT : 0;
                        break;
                }
                if (!ret)
                        ret = packet_set_ring(sk, &req_u, 0,
                                              optname == PACKET_TX_RING);
                release_sock(sk);
                return ret;
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                WRITE_ONCE(pkt_sk(sk)->copy_thresh, val);
                return 0;
        }
        case PACKET_VERSION:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                case TPACKET_V2:
                case TPACKET_V3:
                        break;
                default:
                        return -EINVAL;
                }
                /* The version cannot change once a ring exists. */
                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        po->tp_version = val;
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_RESERVE:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;
                if (val > INT_MAX)
                        return -EINVAL;
                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        po->tp_reserve = val;
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_LOSS:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_AUXDATA:
        {
                int val;

                /* Longer buffers are tolerated; only an int is read. */
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                packet_sock_flag_set(po, PACKET_SOCK_AUXDATA, val);
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                /* Longer buffers are tolerated; only an int is read. */
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                packet_sock_flag_set(po, PACKET_SOCK_ORIGDEV, val);
                return 0;
        }
        case PACKET_VNET_HDR:
        case PACKET_VNET_HDR_SZ:
        {
                int val, hdr_len;

                /* Only SOCK_RAW sockets may use a vnet header. */
                if (sock->type != SOCK_RAW)
                        return -EINVAL;
                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                /* PACKET_VNET_HDR is an on/off switch selecting the basic
                 * header; PACKET_VNET_HDR_SZ takes an explicit size, which
                 * must be 0 or one of the two known header sizes.
                 */
                if (optname == PACKET_VNET_HDR_SZ) {
                        if (val && val != sizeof(struct virtio_net_hdr) &&
                            val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
                                return -EINVAL;
                        hdr_len = val;
                } else {
                        hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
                }
                lock_sock(sk);
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
                        ret = -EBUSY;
                } else {
                        WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
                        ret = 0;
                }
                release_sock(sk);
                return ret;
        }
        case PACKET_TIMESTAMP:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                WRITE_ONCE(po->tp_tstamp, val);
                return 0;
        }
        case PACKET_FANOUT:
        {
                struct fanout_args args = { 0 };

                /* Either a bare int or the full fanout_args is accepted;
                 * with the short form the remaining fields stay zero.
                 */
                if (optlen != sizeof(int) && optlen != sizeof(args))
                        return -EINVAL;
                if (copy_from_sockptr(&args, optval, optlen))
                        return -EFAULT;

                return fanout_add(sk, &args);
        }
        case PACKET_FANOUT_DATA:
        {
                /* Paired with the WRITE_ONCE() in fanout_add() */
                if (!READ_ONCE(po->fanout))
                        return -EINVAL;

                return fanout_set_data(po, optval, optlen);
        }
        case PACKET_IGNORE_OUTGOING:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;
                /* Strictly boolean: only 0 and 1 are accepted. */
                if (val < 0 || val > 1)
                        return -EINVAL;

                WRITE_ONCE(po->prot_hook.ignore_outgoing, !!val);
                return 0;
        }
        case PACKET_TX_HAS_OFF:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                /* Silently ignored (still returns 0) once a ring exists. */
                lock_sock(sk);
                if (!po->rx_ring.pg_vec && !po->tx_ring.pg_vec)
                        packet_sock_flag_set(po, PACKET_SOCK_TX_HAS_OFF, val);

                release_sock(sk);
                return 0;
        }
        case PACKET_QDISC_BYPASS:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_sockptr(&val, optval, sizeof(val)))
                        return -EFAULT;

                packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}

/*
 * packet_getsockopt - report the current value of a SOL_PACKET option.
 * @sock:    socket being queried
 * @level:   must be SOL_PACKET; anything else yields -ENOPROTOOPT
 * @optname: PACKET_* option selector
 * @optval:  user buffer receiving the value (also read as input for
 *           PACKET_HDRLEN)
 * @optlen:  user in/out length: on entry the size of @optval, on success
 *           the number of bytes actually written
 *
 * Most options are returned as a single int held in the local 'val'.
 * PACKET_STATISTICS and PACKET_ROLLOVER_STATS instead point 'data' at a
 * structure and set 'lv' to that structure's size.
 *
 * Returns 0 on success, -ENOPROTOOPT for a wrong level or unknown option,
 * -EFAULT if user memory cannot be accessed, and -EINVAL for a negative
 * length, a short or unrecognised PACKET_HDRLEN query, or
 * PACKET_ROLLOVER_STATS on a socket without rollover state.
 */
static int packet_getsockopt(struct socket *sock, int level, int optname,
                             char __user *optval, int __user *optlen)
{
        int len;
        int val, lv = sizeof(val);
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        void *data = &val;
        union tpacket_stats_u st;
        struct tpacket_rollover_stats rstats;
        int drops;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case PACKET_STATISTICS:
                /*
                 * Reading the statistics is destructive: the counters are
                 * copied out and zeroed under the receive-queue lock, and
                 * the drop counter is fetched and reset in one atomic step.
                 */
                spin_lock_bh(&sk->sk_receive_queue.lock);
                memcpy(&st, &po->stats, sizeof(st));
                memset(&po->stats, 0, sizeof(po->stats));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                drops = atomic_xchg(&po->tp_drops, 0);

                /*
                 * The layout returned depends on the ring version. In both
                 * layouts the drops are also added to the packet total.
                 */
                if (po->tp_version == TPACKET_V3) {
                        lv = sizeof(struct tpacket_stats_v3);
                        st.stats3.tp_drops = drops;
                        st.stats3.tp_packets += drops;
                        data = &st.stats3;
                } else {
                        lv = sizeof(struct tpacket_stats);
                        st.stats1.tp_drops = drops;
                        st.stats1.tp_packets += drops;
                        data = &st.stats1;
                }

                break;
        case PACKET_AUXDATA:
                val = packet_sock_flag(po, PACKET_SOCK_AUXDATA);
                break;
        case PACKET_ORIGDEV:
                val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
                break;
        case PACKET_VNET_HDR:
                /* Boolean view of the header size: non-zero means enabled. */
                val = !!READ_ONCE(po->vnet_hdr_sz);
                break;
        case PACKET_VNET_HDR_SZ:
                val = READ_ONCE(po->vnet_hdr_sz);
                break;
        case PACKET_COPY_THRESH:
                val = READ_ONCE(pkt_sk(sk)->copy_thresh);
                break;
        case PACKET_VERSION:
                val = po->tp_version;
                break;
        case PACKET_HDRLEN:
                /*
                 * In/out option: a TPACKET_V* version number is read from
                 * optval and replaced by sizeof() of the matching header
                 * struct. Exactly sizeof(int) bytes are read: longer
                 * buffers are clamped, shorter ones rejected.
                 */
                if (len > sizeof(int))
                        len = sizeof(int);
                if (len < sizeof(int))
                        return -EINVAL;
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                        val = sizeof(struct tpacket_hdr);
                        break;
                case TPACKET_V2:
                        val = sizeof(struct tpacket2_hdr);
                        break;
                case TPACKET_V3:
                        val = sizeof(struct tpacket3_hdr);
                        break;
                default:
                        return -EINVAL;
                }
                break;
        case PACKET_RESERVE:
                val = po->tp_reserve;
                break;
        case PACKET_LOSS:
                val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
                break;
        case PACKET_TIMESTAMP:
                val = READ_ONCE(po->tp_tstamp);
                break;
        case PACKET_FANOUT:
                /*
                 * 0 when the socket has no fanout attached; otherwise the
                 * packed word id | (type << 16) | (flags << 24).
                 */
                val = (po->fanout ?
                       ((u32)po->fanout->id |
                        ((u32)po->fanout->type << 16) |
                        ((u32)po->fanout->flags << 24)) :
                       0);
                break;
        case PACKET_IGNORE_OUTGOING:
                val = READ_ONCE(po->prot_hook.ignore_outgoing);
                break;
        case PACKET_ROLLOVER_STATS:
                if (!po->rollover)
                        return -EINVAL;
                rstats.tp_all = atomic_long_read(&po->rollover->num);
                rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
                rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
                data = &rstats;
                lv = sizeof(rstats);
                break;
        case PACKET_TX_HAS_OFF:
                val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
                break;
        case PACKET_QDISC_BYPASS:
                val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
                break;
        default:
                return -ENOPROTOOPT;
        }

        /*
         * Never copy more than the option's own size; the (possibly
         * truncated) length is written back before the value itself.
         */
        if (len > lv)
                len = lv;
        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, data, len))
                return -EFAULT;
        return 0;
}

/*
 * packet_notifier - netdevice notifier callback for packet sockets.
 * @this: notifier block (unused here)
 * @msg:  NETDEV_* event code
 * @ptr:  notifier info from which the affected net_device is extracted
 *
 * Walks every socket on the packet.sklist of the device's network
 * namespace under rcu_read_lock() and reacts to NETDEV_UNREGISTER,
 * NETDEV_DOWN and NETDEV_UP for sockets whose ifindex matches the device.
 * Other events are ignored. Always returns NOTIFY_DONE.
 */
static int packet_notifier(struct notifier_block *this,
                           unsigned long msg, void *ptr)
{
        struct sock *sk;
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        rcu_read_lock();
        sk_for_each_rcu(sk, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
                        /*
                         * Done for every socket with a multicast list, not
                         * only those bound to this device.
                         */
                        if (po->mclist)
                                packet_dev_mclist_delete(dev, &po->mclist);
                        /* Unregister implies everything DOWN does. */
                        fallthrough;

                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                /*
                                 * A running socket is unhooked and gets
                                 * ENETDOWN; the error is only reported if
                                 * the socket is not already dead.
                                 */
                                if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
                                        __unregister_prot_hook(sk, false);
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk_error_report(sk);
                                }
                                /*
                                 * The device is going away for good: forget
                                 * the cached device, mark the socket as
                                 * bound to no interface (-1) and drop the
                                 * tracked reference held via prot_hook.dev.
                                 */
                                if (msg == NETDEV_UNREGISTER) {
                                        packet_cached_dev_reset(po);
                                        WRITE_ONCE(po->ifindex, -1);
                                        netdev_put(po->prot_hook.dev,
                                                   &po->prot_hook.dev_tracker);
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                /* Re-hook only sockets with a non-zero protocol number. */
                                if (po->num)
                                        register_prot_hook(sk);
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                }
        }
        rcu_read_unlock();
        return NOTIFY_DONE;
}


static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCOUTQ:
        {
                int amount = sk_wmem_alloc_get(sk);

                return put_user(amount, (int __user *)arg);
        }
        case SIOCINQ:
        {
                struct sk_buff *skb;
                int amount = 0;

                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
                        amount = skb->len;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return put_user(amount, (int __user *)arg);
        }
#ifdef CONFIG_INET
        case SIOCADDRT:
        case SIOCDELRT:
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCSIFFLAGS:
                return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
        return 0;
}

/*
 * packet_poll - poll handler for packet sockets.
 *
 * Starts from the datagram_poll() mask.  If an RX ring exists and
 * packet_previous_rx_frame() finds no frame in TP_STATUS_KERNEL state, the
 * socket is reported readable; if a TX ring exists and
 * packet_current_frame() finds a TP_STATUS_AVAILABLE frame, it is reported
 * writable.  Each ring is inspected under its own queue lock.
 */
static __poll_t packet_poll(struct file *file, struct socket *sock,
                                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        __poll_t events = datagram_poll(file, sock, wait);

        /* Receive side. */
        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->rx_ring.pg_vec &&
            !packet_previous_rx_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
                events |= EPOLLIN | EPOLLRDNORM;
        packet_rcv_try_clear_pressure(po);
        spin_unlock_bh(&sk->sk_receive_queue.lock);

        /* Transmit side. */
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec &&
            packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
                events |= EPOLLOUT | EPOLLWRNORM;
        spin_unlock_bh(&sk->sk_write_queue.lock);

        return events;
}


/* Dirty? Well, I still have not found a better way to account
 * for user mmaps.
 */

/*
 * VMA ->open hook: bump the 'mapped' counter of the packet socket behind
 * the mapped file.  Does nothing if the socket has no sock attached.
 */
static void packet_mm_open(struct vm_area_struct *vma)
{
        struct socket *sock = vma->vm_file->private_data;
        struct sock *sk = sock->sk;

        if (!sk)
                return;

        atomic_long_inc(&pkt_sk(sk)->mapped);
}

/*
 * VMA ->close hook: drop the 'mapped' counter of the packet socket behind
 * the mapped file.  Does nothing if the socket has no sock attached.
 */
static void packet_mm_close(struct vm_area_struct *vma)
{
        struct socket *sock = vma->vm_file->private_data;
        struct sock *sk = sock->sk;

        if (!sk)
                return;

        atomic_long_dec(&pkt_sk(sk)->mapped);
}

/* VMA callbacks that keep the per-socket 'mapped' counter up to date. */
static const struct vm_operations_struct packet_mmap_ops = {
        .open  = packet_mm_open,
        .close = packet_mm_close,
};

/*
 * free_pg_vec - release a page vector and every block buffer it holds.
 * @pg_vec: array of @len entries
 * @order:  page order used for blocks obtained from the page allocator
 * @len:    number of entries in @pg_vec
 *
 * Entries whose buffer is NULL are skipped, so a partially populated vector
 * can be passed in.  Buffers in vmalloc space are released with vfree(),
 * all others with free_pages().
 */
static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
                        unsigned int len)
{
        /* Unsigned to match @len: no signed/unsigned comparison. */
        unsigned int i;

        for (i = 0; i < len; i++) {
                if (unlikely(!pg_vec[i].buffer))
                        continue;

                if (is_vmalloc_addr(pg_vec[i].buffer))
                        vfree(pg_vec[i].buffer);
                else
                        free_pages((unsigned long)pg_vec[i].buffer, order);
                pg_vec[i].buffer = NULL;
        }
        kfree(pg_vec);
}

/*
 * alloc_one_pg_vec_page - allocate one zeroed block of (1 << order) pages.
 *
 * Three attempts are made, in this sequence:
 *   1. the page allocator with __GFP_NORETRY set,
 *   2. vzalloc(),
 *   3. the page allocator again with __GFP_NORETRY cleared.
 *
 * Returns the buffer, or NULL if all three attempts fail.
 */
static char *alloc_one_pg_vec_page(unsigned long order)
{
        gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
                          __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
        char *buffer;

        buffer = (char *) __get_free_pages(gfp_flags, order);

        /* __get_free_pages failed, fall back to vmalloc */
        if (!buffer)
                buffer = vzalloc(array_size((1 << order), PAGE_SIZE));

        /* vmalloc failed too: retry the page allocator without NORETRY */
        if (!buffer) {
                gfp_flags &= ~__GFP_NORETRY;
                buffer = (char *) __get_free_pages(gfp_flags, order);
        }

        /* NULL here means complete and utter failure */
        return buffer;
}

/*
 * alloc_pg_vec - allocate a page vector with one buffer per ring block.
 * @req:   ring request; only tp_block_nr (the number of blocks) is used
 * @order: page order passed to alloc_one_pg_vec_page() for each block
 *
 * Returns the populated vector, or NULL if the vector itself or any block
 * could not be allocated.  On a partial failure everything allocated so far
 * is released through free_pg_vec(), which skips the still-NULL entries of
 * the zero-initialised array.
 */
static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
        unsigned int block_nr = req->tp_block_nr;
        struct pgv *pg_vec;
        /* Unsigned to match block_nr: no signed/unsigned comparison. */
        unsigned int i;

        pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
        if (unlikely(!pg_vec))
                return NULL;

        for (i = 0; i < block_nr; i++) {
                pg_vec[i].buffer = alloc_one_pg_vec_page(order);
                if (unlikely(!pg_vec[i].buffer)) {
                        free_pg_vec(pg_vec, order, block_nr);
                        return NULL;
                }
        }

        return pg_vec;
}

/*
 * packet_set_ring - install, replace or tear down a socket's RX or TX ring.
 * @sk:      the packet socket
 * @req_u:   requested ring geometry; a tp_block_nr of zero requests that no
 *           new ring be allocated (tp_frame_nr must then be zero as well)
 * @closing: non-zero skips the "still mapped" and "reads pending" busy
 *           checks
 * @tx_ring: non-zero selects po->tx_ring / sk_write_queue, otherwise
 *           po->rx_ring / sk_receive_queue
 *
 * A new page vector (if any) is validated and allocated first.  The socket
 * is then detached from the network, the new state is swapped in under
 * pg_vec_lock, the socket is re-attached, and whatever the swap handed back
 * (the previous ring) is freed.
 *
 * Note that req->tp_block_nr is swapped with rb->pg_vec_len, so on return
 * the caller's request holds the previous block count.
 *
 * Returns 0 on success, -EBUSY if the ring is in use, -EINVAL for an
 * inconsistent request, or -ENOMEM if allocation fails.
 */
static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                int closing, int tx_ring)
{
        struct pgv *pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long *rx_owner_map = NULL;
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
        struct sk_buff_head *rb_queue;
        __be16 num;
        int err;
        /* Shorthand for the 'req' member of the union; keeps code churn minimal */
        struct tpacket_req *req = &req_u->req;

        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

        /* Unless closing, refuse to touch a ring that is mapped or has reads pending. */
        err = -EBUSY;
        if (!closing) {
                if (atomic_long_read(&po->mapped))
                        goto out;
                if (packet_read_pending(rb))
                        goto out;
        }

        if (req->tp_block_nr) {
                unsigned int min_frame_size;

                /* Sanity tests and some calculations */
                /* A ring that already exists cannot be replaced in place. */
                err = -EBUSY;
                if (unlikely(rb->pg_vec))
                        goto out;

                /* The frame header length follows the selected TPACKET version. */
                switch (po->tp_version) {
                case TPACKET_V1:
                        po->tp_hdrlen = TPACKET_HDRLEN;
                        break;
                case TPACKET_V2:
                        po->tp_hdrlen = TPACKET2_HDRLEN;
                        break;
                case TPACKET_V3:
                        po->tp_hdrlen = TPACKET3_HDRLEN;
                        break;
                }

                /*
                 * Geometry checks: the block size must be positive when
                 * viewed as an int and page aligned; for V3 a block must
                 * hold the private area plus one minimal frame; a frame must
                 * hold header plus reserve and be TPACKET_ALIGNMENT aligned.
                 */
                err = -EINVAL;
                if (unlikely((int)req->tp_block_size <= 0))
                        goto out;
                if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
                        goto out;
                min_frame_size = po->tp_hdrlen + po->tp_reserve;
                if (po->tp_version >= TPACKET_V3 &&
                    req->tp_block_size <
                    BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
                        goto out;
                if (unlikely(req->tp_frame_size < min_frame_size))
                        goto out;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        goto out;

                /*
                 * tp_frame_nr must equal frames_per_block * tp_block_nr;
                 * the UINT_MAX division guards that product against
                 * overflow before it is computed.
                 */
                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
                if (unlikely(rb->frames_per_block == 0))
                        goto out;
                if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
                        goto out;
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
                        goto out;

                err = -ENOMEM;
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
                        goto out;
                switch (po->tp_version) {
                case TPACKET_V3:
                        /* Block transmit is not supported yet */
                        if (!tx_ring) {
                                init_prb_bdqc(po, rb, pg_vec, req_u);
                        } else {
                                struct tpacket_req3 *req3 = &req_u->req3;

                                /* A V3 TX ring must leave the V3-only request fields zero. */
                                if (req3->tp_retire_blk_tov ||
                                    req3->tp_sizeof_priv ||
                                    req3->tp_feature_req_word) {
                                        err = -EINVAL;
                                        goto out_free_pg_vec;
                                }
                        }
                        break;
                default:
                        /* Other versions: an RX ring gets a zeroed bitmap with one bit per frame. */
                        if (!tx_ring) {
                                rx_owner_map = bitmap_alloc(req->tp_frame_nr,
                                        GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
                                /* err is still -ENOMEM at this point. */
                                if (!rx_owner_map)
                                        goto out_free_pg_vec;
                        }
                        break;
                }
        }
        /* No blocks requested: the frame count must be zero as well. */
        else {
                err = -EINVAL;
                if (unlikely(req->tp_frame_nr))
                        goto out;
        }


        /* Detach socket from network */
        spin_lock(&po->bind_lock);
        was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
        num = po->num;
        if (was_running) {
                WRITE_ONCE(po->num, 0);
                __unregister_prot_hook(sk, false);
        }
        spin_unlock(&po->bind_lock);

        synchronize_net();

        err = -EBUSY;
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_long_read(&po->mapped) == 0) {
                err = 0;
                /*
                 * Exchange new and old ring state under the queue lock.
                 * After the swaps the locals pg_vec, rx_owner_map, order and
                 * req->tp_block_nr describe the previous ring, which is what
                 * gets freed at the bottom of the function.
                 */
                spin_lock_bh(&rb_queue->lock);
                swap(rb->pg_vec, pg_vec);
                if (po->tp_version <= TPACKET_V2)
                        swap(rb->rx_owner_map, rx_owner_map);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);

                swap(rb->pg_vec_order, order);
                swap(rb->pg_vec_len, req->tp_block_nr);

                rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                /* tpacket_rcv when an RX ring is installed, packet_rcv otherwise. */
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
                if (atomic_long_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %ld\n",
                               atomic_long_read(&po->mapped));
        }
        mutex_unlock(&po->pg_vec_lock);

        /* Re-attach the socket if it was running before the detach above. */
        spin_lock(&po->bind_lock);
        if (was_running) {
                WRITE_ONCE(po->num, num);
                register_prot_hook(sk);
        }
        spin_unlock(&po->bind_lock);
        if (pg_vec && (po->tp_version > TPACKET_V2)) {
                /* Because we don't support block-based V3 on tx-ring */
                if (!tx_ring)
                        prb_shutdown_retire_blk_timer(po, rb_queue);
        }

        /*
         * Reached either after the swap (locals hold the previous ring) or
         * straight from a failed setup (locals hold the new allocation).
         */
out_free_pg_vec:
        if (pg_vec) {
                bitmap_free(rx_owner_map);
                free_pg_vec(pg_vec, order, req->tp_block_nr);
        }
out:
        return err;
}

/*
 * packet_mmap - map the socket's ring buffers into a user VMA.
 *
 * The mapping must start at page offset 0 and its length must equal the
 * combined size of every ring that currently has a page vector.  Pages are
 * inserted ring by ring, block by block, starting with rx_ring.  On success
 * the socket's 'mapped' counter is incremented and packet_mmap_ops is
 * installed on the VMA.
 *
 * Returns 0 on success, -EINVAL for a non-zero offset, a size mismatch or
 * when no ring exists, or the error reported by vm_insert_page().
 */
static int packet_mmap(struct file *file, struct socket *sock,
                struct vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct packet_ring_buffer *ring;
        unsigned long want = 0;
        unsigned long uaddr;
        int err = -EINVAL;
        int blk;

        if (vma->vm_pgoff)
                return -EINVAL;

        mutex_lock(&po->pg_vec_lock);

        /*
         * The pointer walk from rx_ring up to tx_ring presumably relies on
         * the two rings being adjacent members of struct packet_sock --
         * confirm against the struct definition before reordering fields.
         */
        for (ring = &po->rx_ring; ring <= &po->tx_ring; ring++) {
                if (!ring->pg_vec)
                        continue;
                want += ring->pg_vec_len
                                        * ring->pg_vec_pages
                                        * PAGE_SIZE;
        }

        /* No ring at all, or a VMA of the wrong length: err is -EINVAL. */
        if (want == 0 || vma->vm_end - vma->vm_start != want)
                goto unlock;

        uaddr = vma->vm_start;
        for (ring = &po->rx_ring; ring <= &po->tx_ring; ring++) {
                if (!ring->pg_vec)
                        continue;

                for (blk = 0; blk < ring->pg_vec_len; blk++) {
                        void *kaddr = ring->pg_vec[blk].buffer;
                        int pg;

                        /* Insert the block one page at a time. */
                        for (pg = 0; pg < ring->pg_vec_pages; pg++) {
                                err = vm_insert_page(vma, uaddr,
                                                     pgv_to_page(kaddr));
                                if (unlikely(err))
                                        goto unlock;
                                uaddr += PAGE_SIZE;
                                kaddr += PAGE_SIZE;
                        }
                }
        }

        atomic_long_inc(&po->mapped);
        vma->vm_ops = &packet_mmap_ops;
        err = 0;

unlock:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}

/*
 * Socket operations table for the "spkt" socket flavour.  It installs no
 * setsockopt/getsockopt handlers, polls through plain datagram_poll() and
 * refuses mmap via sock_no_mmap().
 */
static const struct proto_ops packet_ops_spkt = {
        .family     = PF_PACKET,
        .owner      = THIS_MODULE,
        .release    = packet_release,
        .bind       = packet_bind_spkt,
        .connect    = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept     = sock_no_accept,
        .getname    = packet_getname_spkt,
        .poll       = datagram_poll,
        .ioctl      = packet_ioctl,
        .gettstamp  = sock_gettstamp,
        .listen     = sock_no_listen,
        .shutdown   = sock_no_shutdown,
        .sendmsg    = packet_sendmsg_spkt,
        .recvmsg    = packet_recvmsg,
        .mmap       = sock_no_mmap,
};

static const struct proto_ops packet_ops = {
        .family =        PF_PACKET,
        .owner =        THIS_MODULE,
        .release =        packet_release,
        .bind =                packet_bind,
        .connect =        sock_no_connect,
        .socketpair =        sock_no_socketpair,
        .accept =        sock_no_accept,
        .getname =        packet_getname,
        .poll =                packet_poll,
        .ioctl =        packet_ioctl,
        .gettstamp =        sock_gettstamp,
        .listen =        sock_no_listen,
        .shutdown =        sock_no_shutdown,
        .setsockopt =        packet_setsockopt,
        .getsockopt =        packet_getsockopt,
        .sendmsg =        packet_sendmsg,
        .recvmsg =        packet_recvmsg,
        .mmap =                packet_mmap,
};

/* PF_PACKET family record; registered with sock_register() in packet_init(). */
static const struct net_proto_family packet_family_ops = {
        .family = PF_PACKET,
        .create = packet_create,
        .owner  = THIS_MODULE,
};

/* Netdevice notifier block that routes device events to packet_notifier(). */
static struct notifier_block packet_netdev_notifier = {
        .notifier_call = packet_notifier,
};

#ifdef CONFIG_PROC_FS

/*
 * seq_file ->start: enter an RCU read-side section and position the
 * iterator at *pos within the namespace's packet socket list.  The section
 * is left again in packet_seq_stop().
 */
static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct net *ns = seq_file_net(seq);

        rcu_read_lock();

        return seq_hlist_start_head_rcu(&ns->packet.sklist, *pos);
}

/* seq_file ->next: advance to the entry after @v in the namespace's packet socket list. */
static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_hlist_next_rcu(v, &seq_file_net(seq)->packet.sklist, pos);
}

/* seq_file ->stop: leave the RCU read-side section entered by packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

/*
 * seq_file ->show: print the column header for SEQ_START_TOKEN, otherwise
 * one line describing the socket behind @v (address, refcount, type,
 * protocol, ifindex, running flag, receive memory, owner uid, inode).
 * Always returns 0.
 */
static int packet_seq_show(struct seq_file *seq, void *v)
{
        const struct packet_sock *po;
        struct sock *s;

        if (v == SEQ_START_TOKEN) {
                /* The "sk" column is padded wider on 64-bit builds. */
                seq_printf(seq,
                           "%*sRefCnt Type Proto  Iface R Rmem   User   Inode\n",
                           IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
                return 0;
        }

        s = sk_entry(v);
        po = pkt_sk(s);

        seq_printf(seq,
                   "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
                   s,
                   refcount_read(&s->sk_refcnt),
                   s->sk_type,
                   ntohs(READ_ONCE(po->num)),
                   READ_ONCE(po->ifindex),
                   packet_sock_flag(po, PACKET_SOCK_RUNNING),
                   atomic_read(&s->sk_rmem_alloc),
                   from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
                   sock_i_ino(s));

        return 0;
}

/* Iterator callbacks for the "packet" proc file created in packet_net_init(). */
static const struct seq_operations packet_seq_ops = {
        .start = packet_seq_start,
        .next  = packet_seq_next,
        .stop  = packet_seq_stop,
        .show  = packet_seq_show,
};
#endif

/*
 * Per-namespace setup: initialise the socket list and its mutex and, with
 * CONFIG_PROC_FS, create the "packet" proc entry.
 *
 * Returns 0 on success or -ENOMEM if the proc entry cannot be created.
 */
static int __net_init packet_net_init(struct net *net)
{
        int err = 0;

        mutex_init(&net->packet.sklist_lock);
        INIT_HLIST_HEAD(&net->packet.sklist);

#ifdef CONFIG_PROC_FS
        if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
                        sizeof(struct seq_net_private)))
                err = -ENOMEM;
#endif /* CONFIG_PROC_FS */

        return err;
}

/*
 * Per-namespace teardown: remove the "packet" proc entry and warn (once) if
 * any packet socket is still on the namespace's list.
 */
static void __net_exit packet_net_exit(struct net *net)
{
        remove_proc_entry("packet", net->proc_net);
        WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
}

/* Per-network-namespace init/exit hooks for the packet socket family. */
static struct pernet_operations packet_net_ops = {
        .init = packet_net_init,
        .exit = packet_net_exit,
};


/*
 * Module unload: undo the registrations made by packet_init(), in the
 * reverse of the order in which packet_init() made them.
 */
static void __exit packet_exit(void)
{
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
        unregister_netdevice_notifier(&packet_netdev_notifier);
        unregister_pernet_subsys(&packet_net_ops);
}

/*
 * Module load: register, in this sequence, the per-namespace operations,
 * the netdevice notifier, the protocol and finally the socket family.  If a
 * step fails, the steps already completed are undone in reverse and the
 * failing step's error code is returned.  Returns 0 on success.
 */
static int __init packet_init(void)
{
        int err;

        err = register_pernet_subsys(&packet_net_ops);
        if (err)
                return err;

        err = register_netdevice_notifier(&packet_netdev_notifier);
        if (err)
                goto undo_pernet;

        err = proto_register(&packet_proto, 0);
        if (err)
                goto undo_notifier;

        err = sock_register(&packet_family_ops);
        if (err)
                goto undo_proto;

        return 0;

undo_proto:
        proto_unregister(&packet_proto);
undo_notifier:
        unregister_netdevice_notifier(&packet_netdev_notifier);
undo_pernet:
        unregister_pernet_subsys(&packet_net_ops);
        return err;
}

/* Module entry/exit points and metadata. */
module_init(packet_init);
module_exit(packet_exit);
MODULE_DESCRIPTION("Packet socket support (AF_PACKET)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  297 



  290 










   28 





  177 
  255 











    2 


















































































































































































































































































































































































































































































  290 


























































































    3 































































































































  265 

















    8 


















































































































































































































































































































































































































































































































































































































   14 
   14 












   14 




















   14 










































  319 




    1 




    1 
























  267 



















































































    8 










   24 




   24 












   11 


    5 










   15 
































    3 



























   21 






























































   24 
























































































   17 






    8 












   24 











   23 























   24 





















   16 

    8 
































































































































    5 





    5 







    1 
    2 





















































   26 














  265 









   18 




















































































































































































































    5 































































































































































































































    4 



















    4 

































































































































































    4 


























































































































































































































































    7 


    2 







    7 
    7 


































































































































































































































































































































































































































































































































































































































































  289 








  265 

























































































  265 
















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Definitions for the 'struct sk_buff' memory handlers.
 *
 *        Authors:
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Florian La Roche, <rzsfl@rz.uni-sb.de>
 */

#ifndef _LINUX_SKBUFF_H
#define _LINUX_SKBUFF_H

#include <linux/kernel.h>
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/bug.h>
#include <linux/bvec.h>
#include <linux/cache.h>
#include <linux/rbtree.h>
#include <linux/socket.h>
#include <linux/refcount.h>

#include <linux/atomic.h>
#include <asm/types.h>
#include <linux/spinlock.h>
#include <net/checksum.h>
#include <linux/rcupdate.h>
#include <linux/dma-mapping.h>
#include <linux/netdev_features.h>
#include <net/flow_dissector.h>
#include <linux/in6.h>
#include <linux/if_packet.h>
#include <linux/llist.h>
#include <linux/page_frag_cache.h>
#include <net/flow.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <linux/netfilter/nf_conntrack_common.h>
#endif
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/netmem.h>

/**
 * DOC: skb checksums
 *
 * The interface for checksum offload between the stack and networking drivers
 * is as follows...
 *
 * IP checksum related features
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Drivers advertise checksum offload capabilities in the features of a device.
 * From the stack's point of view these are capabilities offered by the driver.
 * A driver typically only advertises features that it is capable of offloading
 * to its device.
 *
 * .. flat-table:: Checksum related device features
 *   :widths: 1 10
 *
 *   * - %NETIF_F_HW_CSUM
 *     - The driver (or its device) is able to compute one
 *         IP (one's complement) checksum for any combination
 *         of protocols or protocol layering. The checksum is
 *         computed and set in a packet per the CHECKSUM_PARTIAL
 *         interface (see below).
 *
 *   * - %NETIF_F_IP_CSUM
 *     - Driver (device) is only able to checksum plain
 *         TCP or UDP packets over IPv4. These are specifically
 *         unencapsulated packets of the form IPv4|TCP or
 *         IPv4|UDP where the Protocol field in the IPv4 header
 *         is TCP or UDP. The IPv4 header may contain IP options.
 *         This feature cannot be set in features for a device
 *         with NETIF_F_HW_CSUM also set. This feature is being
 *         DEPRECATED (see below).
 *
 *   * - %NETIF_F_IPV6_CSUM
 *     - Driver (device) is only able to checksum plain
 *         TCP or UDP packets over IPv6. These are specifically
 *         unencapsulated packets of the form IPv6|TCP or
 *         IPv6|UDP where the Next Header field in the IPv6
 *         header is either TCP or UDP. IPv6 extension headers
 *         are not supported with this feature. This feature
 *         cannot be set in features for a device with
 *         NETIF_F_HW_CSUM also set. This feature is being
 *         DEPRECATED (see below).
 *
 *   * - %NETIF_F_RXCSUM
 *     - Driver (device) performs receive checksum offload.
 *         This flag is only used to disable the RX checksum
 *         feature for a device. The stack will accept receive
 *         checksum indication in packets received on a device
 *         regardless of whether NETIF_F_RXCSUM is set.
 *
 * Checksumming of received packets by device
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Indication of checksum verification is set in &sk_buff.ip_summed.
 * Possible values are:
 *
 * - %CHECKSUM_NONE
 *
 *   Device did not checksum this packet e.g. due to lack of capabilities.
 *   The packet data contains the full (though not verified) checksum, but
 *   it is not stored in skb->csum. Thus, skb->csum is undefined in this case.
 *
 * - %CHECKSUM_UNNECESSARY
 *
 *   The hardware you're dealing with doesn't calculate the full checksum
 *   (as in %CHECKSUM_COMPLETE), but it does parse headers and verify checksums
 *   for specific protocols. For such packets it will set %CHECKSUM_UNNECESSARY
 *   if their checksums are okay. &sk_buff.csum is still undefined in this case
 *   though. A driver or device must never modify the checksum field in the
 *   packet even if checksum is verified.
 *
 *   %CHECKSUM_UNNECESSARY is applicable to the following protocols:
 *
 *     - TCP: IPv6 and IPv4.
 *     - UDP: IPv4 and IPv6. A device may apply CHECKSUM_UNNECESSARY to a
 *       zero UDP checksum for either IPv4 or IPv6; the networking stack
 *       may perform further validation in this case.
 *     - GRE: only if the checksum is present in the header.
 *     - SCTP: indicates the CRC in SCTP header has been validated.
 *     - FCOE: indicates the CRC in FC frame has been validated.
 *
 *   &sk_buff.csum_level indicates the number of consecutive checksums found in
 *   the packet minus one that have been verified as %CHECKSUM_UNNECESSARY.
 *   For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
 *   and a device is able to verify the checksums for UDP (possibly zero),
 *   GRE (checksum flag is set) and TCP, &sk_buff.csum_level would be set to
 *   two. If the device were only able to verify the UDP checksum and not
 *   GRE, either because it doesn't support GRE checksum or because GRE
 *   checksum is bad, skb->csum_level would be set to zero (TCP checksum is
 *   not considered in this case).
 *
 * - %CHECKSUM_COMPLETE
 *
 *   This is the most generic way. The device supplies the checksum of the
 *   _whole_ packet as seen by netif_rx() and fills in &sk_buff.csum. This
 *   means the hardware doesn't need to parse L3/L4 headers to implement this.
 *
 *   Notes:
 *
 *   - Even if a device supports only some protocols, but is able to produce
 *     skb->csum, it MUST use CHECKSUM_COMPLETE, not CHECKSUM_UNNECESSARY.
 *   - CHECKSUM_COMPLETE is not applicable to SCTP and FCoE protocols.
 *
 * - %CHECKSUM_PARTIAL
 *
 *   A checksum is set up to be offloaded to a device as described in the
 *   output description for CHECKSUM_PARTIAL. This may occur on a packet
 *   received directly from another Linux OS, e.g., a virtualized Linux kernel
 *   on the same host, or it may be set in the input path in GRO or remote
 *   checksum offload. For the purposes of checksum verification, the checksum
 *   referred to by skb->csum_start + skb->csum_offset and any preceding
 *   checksums in the packet are considered verified. Any checksums in the
 *   packet that are after the checksum being offloaded are not considered to
 *   be verified.
 *
 * Checksumming on transmit for non-GSO
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * The stack requests checksum offload in the &sk_buff.ip_summed for a packet.
 * Values are:
 *
 * - %CHECKSUM_PARTIAL
 *
 *   The driver is required to checksum the packet as seen by hard_start_xmit()
 *   from &sk_buff.csum_start up to the end, and to record/write the checksum at
 *   offset &sk_buff.csum_start + &sk_buff.csum_offset.
 *   A driver may verify that the
 *   csum_start and csum_offset values are valid values given the length and
 *   offset of the packet, but it should not attempt to validate that the
 *   checksum refers to a legitimate transport layer checksum -- it is the
 *   purview of the stack to validate that csum_start and csum_offset are set
 *   correctly.
 *
 *   When the stack requests checksum offload for a packet, the driver MUST
 *   ensure that the checksum is set correctly. A driver can either offload the
 *   checksum calculation to the device, or call skb_checksum_help (in the case
 *   that the device does not support offload for a particular checksum).
 *
 *   %NETIF_F_IP_CSUM and %NETIF_F_IPV6_CSUM are being deprecated in favor of
 *   %NETIF_F_HW_CSUM. New devices should use %NETIF_F_HW_CSUM to indicate
 *   checksum offload capability.
 *   skb_csum_hwoffload_help() can be called to resolve %CHECKSUM_PARTIAL based
 *   on network device checksumming capabilities: if a packet does not match
 *   them, skb_checksum_help() or skb_crc32c_csum_help() (depending on the
 *   value of &sk_buff.csum_not_inet, see :ref:`crc`) is called to resolve the
 *   checksum.
 *
 * - %CHECKSUM_NONE
 *
 *   The skb was already checksummed by the protocol, or a checksum is not
 *   required.
 *
 * - %CHECKSUM_UNNECESSARY
 *
 *   This has the same meaning as CHECKSUM_NONE for checksum offload on
 *   output.
 *
 * - %CHECKSUM_COMPLETE
 *
 *   Not used in checksum output. If a driver observes a packet with this value
 *   set in skbuff, it should treat the packet as if %CHECKSUM_NONE were set.
 *
 * .. _crc:
 *
 * Non-IP checksum (CRC) offloads
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * .. flat-table::
 *   :widths: 1 10
 *
 *   * - %NETIF_F_SCTP_CRC
 *     - This feature indicates that a device is capable of
 *         offloading the SCTP CRC in a packet. To perform this offload the stack
 *         will set csum_start and csum_offset accordingly, set ip_summed to
 *         %CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication
 *         in the skbuff that the %CHECKSUM_PARTIAL refers to CRC32c.
 *         A driver that supports both IP checksum offload and SCTP CRC32c offload
 *         must verify which offload is configured for a packet by testing the
 *         value of &sk_buff.csum_not_inet; skb_crc32c_csum_help() is provided to
 *         resolve %CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
 *
 *   * - %NETIF_F_FCOE_CRC
 *     - This feature indicates that a device is capable of offloading the FCOE
 *         CRC in a packet. To perform this offload the stack will set ip_summed
 *         to %CHECKSUM_PARTIAL and set csum_start and csum_offset
 *         accordingly. Note that there is no indication in the skbuff that the
 *         %CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
 *         both IP checksum offload and FCOE CRC offload must verify which offload
 *         is configured for a packet, presumably by inspecting packet headers.
 *
 * Checksumming on output with GSO
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * In the case of a GSO packet (skb_is_gso() is true), checksum offload
 * is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
 * gso_type is %SKB_GSO_TCPV4 or %SKB_GSO_TCPV6, TCP checksum offload as
 * part of the GSO operation is implied. If a checksum is being offloaded
 * with GSO then ip_summed is %CHECKSUM_PARTIAL, and both csum_start and
 * csum_offset are set to refer to the outermost checksum being offloaded
 * (two offloaded checksums are possible with UDP encapsulation).
 */

/*
 * Values stored in &sk_buff.ip_summed. Their meaning on receive and on
 * transmit is described in the "skb checksums" DOC section above.
 *
 * Don't change this without changing skb_csum_unnecessary!
 */
#define CHECKSUM_NONE                0
#define CHECKSUM_UNNECESSARY        1
#define CHECKSUM_COMPLETE        2
#define CHECKSUM_PARTIAL        3

/* Maximum value in skb->csum_level */
#define SKB_MAX_CSUM_LEVEL        3

/* Align X to SMP_CACHE_BYTES using ALIGN(). */
#define SKB_DATA_ALIGN(X)        ALIGN(X, SMP_CACHE_BYTES)
/* X minus the cache-aligned size of struct skb_shared_info. */
#define SKB_WITH_OVERHEAD(X)        \
        ((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* For X bytes available in skb->head, what is the minimal
 * allocation needed, knowing struct skb_shared_info needs
 * to be aligned.
 */
#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \
        SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

/* Bytes left in a (PAGE_SIZE << ORDER) area once X bytes and the aligned
 * struct skb_shared_info have been subtracted.
 */
#define SKB_MAX_ORDER(X, ORDER) \
        SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
/* SKB_MAX_ORDER() for a single page (order 0). */
#define SKB_MAX_HEAD(X)                (SKB_MAX_ORDER((X), 0))
/* SKB_MAX_ORDER() for an order-2 area with X == 0. */
#define SKB_MAX_ALLOC                (SKB_MAX_ORDER(0, 2))

/* return minimum truesize of one skb containing X bytes of data:
 * X plus the aligned sizes of struct sk_buff and struct skb_shared_info
 */
#define SKB_TRUESIZE(X) ((X) +                                                \
                         SKB_DATA_ALIGN(sizeof(struct sk_buff)) +        \
                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

struct ahash_request;
struct net_device;
struct scatterlist;
struct pipe_inode_info;
struct iov_iter;
struct napi_struct;
struct bpf_prog;
union bpf_attr;
struct skb_ext;
struct ts_config;

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Bridge netfilter bookkeeping; only compiled in when
 * CONFIG_BRIDGE_NETFILTER is enabled.
 */
struct nf_bridge_info {
        /* NOTE(review): presumably the encapsulation the frame carried
         * before bridge netfilter processing - confirm against users.
         * Stored in 8 bits.
         */
        enum {
                BRNF_PROTO_UNCHANGED,
                BRNF_PROTO_8021Q,
                BRNF_PROTO_PPPOE
        } orig_proto:8;
        /* single-bit state flags sharing one byte */
        u8                        pkt_otherhost:1;
        u8                        in_prerouting:1;
        u8                        bridged_dnat:1;
        u8                        sabotage_in_done:1;
        __u16                        frag_max_size;
        int                        physinif;

        /* always valid & non-NULL from FORWARD on, for physdev match */
        struct net_device        *physoutdev;
        /* The three members below overlay each other; which one is
         * meaningful depends on the processing stage described in the
         * member comments.
         */
        union {
                /* prerouting: detect dnat in orig/reply direction */
                __be32          ipv4_daddr;
                struct in6_addr ipv6_daddr;

                /* after prerouting + nat detected: store original source
                 * mac since neigh resolution overwrites it, only used while
                 * skb is out in neigh layer.
                 */
                char neigh_header[8];
        };
};
#endif

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
/* Chain in tc_skb_ext will be used to share the tc chain with
 * ovs recirc_id. It will be set to the current chain by tc
 * and read by ovs to recirc_id.
 *
 * Only compiled in when CONFIG_NET_TC_SKB_EXT is enabled.
 */
struct tc_skb_ext {
        /* act_miss_cookie and chain share storage; the act_miss bit below
         * is set when act_miss_cookie is the member in use.
         */
        union {
                u64 act_miss_cookie;
                __u32 chain;
        };
        __u16 mru;
        __u16 zone;
        /* single-bit flags packed into one byte */
        u8 post_ct:1;
        u8 post_ct_snat:1;
        u8 post_ct_dnat:1;
        u8 act_miss:1; /* Set if act_miss_cookie is used */
        u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */
};
#endif

/* Head of a doubly linked list of sk_buffs. The leading next/prev pair has
 * the same layout as the first two members of struct sk_buff.
 */
struct sk_buff_head {
        /* These two members must be first to match sk_buff. */
        struct_group_tagged(sk_buff_list, list,
                struct sk_buff        *next;
                struct sk_buff        *prev;
        );

        /* NOTE(review): presumably the number of skbs on the list - confirm */
        __u32                qlen;
        /* NOTE(review): presumably serialises access to the list - confirm */
        spinlock_t        lock;
};

struct sk_buff;

/* Fall back to 17 fragments when the build configuration does not
 * provide CONFIG_MAX_SKB_FRAGS.
 */
#ifndef CONFIG_MAX_SKB_FRAGS
# define CONFIG_MAX_SKB_FRAGS 17
#endif

/* Number of entries in skb_shared_info.frags[]. */
#define MAX_SKB_FRAGS CONFIG_MAX_SKB_FRAGS

/* Set skb_shinfo(skb)->gso_size to this in case you want skb_segment to
 * segment using its current segmentation instead.
 * gso_size is an unsigned short, so this is a sentinel value in that field.
 */
#define GSO_BY_FRAGS        0xFFFF

/* One paged fragment of an skb: a memory reference plus the offset and
 * length of the data inside it.
 */
typedef struct skb_frag {
        netmem_ref netmem;        /* reference to the backing memory */
        unsigned int len;        /* fragment size, see skb_frag_size() */
        unsigned int offset;        /* NOTE(review): presumably byte offset of the data within netmem - confirm */
} skb_frag_t;

/**
 * skb_frag_size() - Read the length of a skb fragment
 * @frag: skb fragment to query
 *
 * Returns: the value of the fragment's len member.
 */
static inline unsigned int skb_frag_size(const skb_frag_t *frag)
{
        const unsigned int size = frag->len;

        return size;
}

/**
 * skb_frag_size_set() - Overwrite the length of a skb fragment
 * @frag: skb fragment to update
 * @size: new length to store in the fragment
 */
static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size)
{
        (*frag).len = size;
}

/**
 * skb_frag_size_add() - Grow the recorded length of a skb fragment
 * @frag: skb fragment to update
 * @delta: amount added to the fragment length
 */
static inline void skb_frag_size_add(skb_frag_t *frag, int delta)
{
        frag->len = frag->len + delta;
}

/**
 * skb_frag_size_sub() - Shrink the recorded length of a skb fragment
 * @frag: skb fragment to update
 * @delta: amount subtracted from the fragment length
 */
static inline void skb_frag_size_sub(skb_frag_t *frag, int delta)
{
        frag->len = frag->len - delta;
}

/**
 * skb_frag_must_loop - Tell whether a fragment page needs per-page handling
 * @p: fragment's page
 *
 * Returns: true only on CONFIG_HIGHMEM builds, when either
 * CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is enabled or @p is a high memory page;
 * false in every other case.
 */
static inline bool skb_frag_must_loop(struct page *p)
{
#if defined(CONFIG_HIGHMEM)
        return IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) || PageHighMem(p);
#else
        return false;
#endif
}

/**
 *        skb_frag_foreach_page - loop over pages in a fragment
 *
 *        @f:                skb frag to operate on
 *        @f_off:                offset from start of f->netmem
 *        @f_len:                length from f_off to loop over
 *        @p:                (temp var) current page
 *        @p_off:                (temp var) offset from start of current page,
 *                                   non-zero only on first page.
 *        @p_len:                (temp var) length in current page,
 *                                   < PAGE_SIZE only on first and last page.
 *        @copied:        (temp var) length so far, excluding current p_len.
 *
 *        A fragment can hold a compound page, in which case per-page
 *        operations, notably kmap_atomic, must be called for each
 *        regular page.
 *
 *        When skb_frag_must_loop() is false for the first page, @p_len is
 *        set to @f_len and the body runs once for the whole range.
 *
 *        @f_off and @f_len are evaluated more than once, so they must be
 *        free of side effects. Both are parenthesized in the expansion so
 *        that compound expressions can be passed safely.
 */
#define skb_frag_foreach_page(f, f_off, f_len, p, p_off, p_len, copied)        \
        for (p = skb_frag_page(f) + ((f_off) >> PAGE_SHIFT),                \
             p_off = (f_off) & (PAGE_SIZE - 1),                                \
             p_len = skb_frag_must_loop(p) ?                                \
             min_t(u32, (f_len), PAGE_SIZE - p_off) : (f_len),                \
             copied = 0;                                                \
             copied < (f_len);                                                \
             copied += p_len, p++, p_off = 0,                                \
             p_len = min_t(u32, (f_len) - copied, PAGE_SIZE))

/**
 * struct skb_shared_hwtstamps - hardware time stamps
 * @hwtstamp:                hardware time stamp transformed into duration
 *                        since arbitrary point in time
 * @netdev_data:        address/cookie of network device driver used as
 *                        reference to actual hardware time stamp
 *
 * Software time stamps generated by ktime_get_real() are stored in
 * skb->tstamp.
 *
 * hwtstamps can only be compared against other hwtstamps from
 * the same device.
 *
 * This structure is attached to packets as part of the
 * &skb_shared_info. Use skb_hwtstamps() to get a pointer.
 *
 * @hwtstamp and @netdev_data are members of one anonymous union and thus
 * occupy the same storage; only one of them is meaningful at a time.
 */
struct skb_shared_hwtstamps {
        union {
                ktime_t        hwtstamp;
                void *netdev_data;
        };
};

/* Definitions for tx_flags in struct skb_shared_info.
 * tx_flags is a __u8 and the flags below use all eight of its bits.
 */
enum {
        /* generate hardware time stamp */
        SKBTX_HW_TSTAMP_NOBPF = 1 << 0,

        /* generate software time stamp when queueing packet to NIC */
        SKBTX_SW_TSTAMP = 1 << 1,

        /* device driver is going to provide hardware time stamp */
        SKBTX_IN_PROGRESS = 1 << 2,

        /* generate software time stamp on packet tx completion */
        SKBTX_COMPLETION_TSTAMP = 1 << 3,

        /* generate wifi status information (where possible) */
        SKBTX_WIFI_STATUS = 1 << 4,

        /* determine hardware time stamp based on time or cycles */
        SKBTX_HW_TSTAMP_NETDEV = 1 << 5,

        /* generate software time stamp when entering packet scheduling */
        SKBTX_SCHED_TSTAMP = 1 << 6,

        /* used for bpf extension when a bpf program is loaded */
        SKBTX_BPF = 1 << 7,
};

/* hardware time stamp request, with or without the bpf extension bit */
#define SKBTX_HW_TSTAMP                (SKBTX_HW_TSTAMP_NOBPF | SKBTX_BPF)

/* any of the software time stamp requests, including the bpf extension bit */
#define SKBTX_ANY_SW_TSTAMP        (SKBTX_SW_TSTAMP    | \
                                 SKBTX_SCHED_TSTAMP | \
                                 SKBTX_BPF          | \
                                 SKBTX_COMPLETION_TSTAMP)
/* union of the hardware and software time stamp request masks above */
#define SKBTX_ANY_TSTAMP        (SKBTX_HW_TSTAMP | \
                                 SKBTX_ANY_SW_TSTAMP)

/* Definitions for flags in struct skb_shared_info (a __u8 bit mask) */
enum {
        /* use zcopy routines */
        SKBFL_ZEROCOPY_ENABLE = BIT(0),

        /* This indicates at least one fragment might be overwritten
         * (as in vmsplice(), sendfile() ...)
         * If we need to compute a TX checksum, we'll need to copy
         * all frags to avoid possible bad checksum
         */
        SKBFL_SHARED_FRAG = BIT(1),

        /* segment contains only zerocopy data and should not be
         * charged to the kernel memory.
         */
        SKBFL_PURE_ZEROCOPY = BIT(2),

        /* NOTE(review): undocumented here; the name suggests the frags
         * should not be orphaned - confirm against the users of this flag.
         */
        SKBFL_DONT_ORPHAN = BIT(3),

        /* page references are managed by the ubuf_info, so it's safe to
         * use frags only up until ubuf_info is released
         */
        SKBFL_MANAGED_FRAG_REFS = BIT(4),
};

/* zerocopy enabled and frags possibly shared */
#define SKBFL_ZEROCOPY_FRAG        (SKBFL_ZEROCOPY_ENABLE | SKBFL_SHARED_FRAG)
/* every flag defined in the enum above */
#define SKBFL_ALL_ZEROCOPY        (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \
                                 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS)

/* Callbacks attached to a struct ubuf_info through its ops pointer. */
struct ubuf_info_ops {
        /* completion callback: receives the skb, the ubuf_info and a flag
         * telling whether zero copy succeeded
         */
        void (*complete)(struct sk_buff *, struct ubuf_info *,
                         bool zerocopy_success);
        /* has to be compatible with skb_zcopy_set() */
        int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg);
};

/*
 * The callback notifies userspace to release buffers when skb DMA is done in
 * lower device, the skb last reference should be 0 when calling this.
 * The zerocopy_success argument is true if zero copy transmit occurred,
 * false on data copy or out of memory error caused by data copy attempt.
 * The ctx field is used to track device context.
 * The desc field is used to track userspace buffer index.
 *
 * NOTE: ctx and desc are not members of this struct; they are declared in
 * struct ubuf_info_msgzc, which embeds a struct ubuf_info. The callback
 * described above is presumably &ubuf_info_ops.complete, the only operation
 * that takes a zerocopy_success argument.
 */
struct ubuf_info {
        const struct ubuf_info_ops *ops;        /* callback table */
        refcount_t refcnt;                        /* reference count */
        u8 flags;
};

/* A struct ubuf_info extended with extra state and pinned-page accounting.
 * The embedded ubuf member lets uarg_to_msgzc() recover this structure from
 * a struct ubuf_info pointer.
 */
struct ubuf_info_msgzc {
        struct ubuf_info ubuf;

        /* two alternative views of the same storage */
        union {
                struct {
                        unsigned long desc;
                        void *ctx;
                };
                struct {
                        u32 id;
                        u16 len;
                        u16 zerocopy:1;
                        u32 bytelen;
                };
        };

        /* accounting record, the argument type of mm_account_pinned_pages()
         * and mm_unaccount_pinned_pages()
         */
        struct mmpin {
                struct user_struct *user;
                unsigned int num_pg;
        } mmp;
};

/* View the skb's shared-info destructor_arg as a struct ubuf_info pointer. */
#define skb_uarg(SKB)        ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
/* Map a pointer to the embedded ubuf member back to its enclosing
 * struct ubuf_info_msgzc.
 */
#define uarg_to_msgzc(ubuf_ptr)        container_of((ubuf_ptr), struct ubuf_info_msgzc, \
                                             ubuf)

/* NOTE(review): defined elsewhere; presumably charge/uncharge pinned pages
 * against the user recorded in @mmp - confirm against the implementation.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

/* Preserve some data across TX submission and completion.
 *
 * Note, this state is stored in the driver. Extending the layout
 * might need some special care.
 *
 * It shares storage with the hardware time stamps in
 * struct skb_shared_info (member xsk_meta).
 */
struct xsk_tx_metadata_compl {
        /* NOTE(review): presumably where the TX timestamp is written on
         * completion - confirm against the users of this field.
         */
        __u64 *tx_timestamp;
};

/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
struct skb_shared_info {
        __u8                flags;                /* SKBFL_* bits */
        __u8                meta_len;
        __u8                nr_frags;        /* NOTE(review): presumably entries used in frags[] - confirm */
        __u8                tx_flags;        /* SKBTX_* bits */
        unsigned short        gso_size;        /* may hold the GSO_BY_FRAGS sentinel */
        /* Warning: this field is not always filled in (UFO)! */
        unsigned short        gso_segs;
        struct sk_buff        *frag_list;        /* optional pointer to another skb */
        /* hardware time stamps and XSK TX completion metadata share storage */
        union {
                struct skb_shared_hwtstamps hwtstamps;
                struct xsk_tx_metadata_compl xsk_meta;
        };
        unsigned int        gso_type;        /* SKB_GSO_* bits */
        u32                tskey;

        /*
         * Warning : all fields before dataref are cleared in __alloc_skb()
         */
        /* split reference count, see SKB_DATAREF_SHIFT / SKB_DATAREF_MASK */
        atomic_t        dataref;

        /* the XDP frag sizes and destructor_arg overlay each other */
        union {
                struct {
                        u32                xdp_frags_size;
                        u32                xdp_frags_truesize;
                };

                /*
                 * Intermediate layers must ensure that destructor_arg
                 * remains valid until skb destructor.
                 */
                void                *destructor_arg;
        };

        /* must be last field, see pskb_expand_head() */
        skb_frag_t        frags[MAX_SKB_FRAGS];
};

/**
 * DOC: dataref and headerless skbs
 *
 * Transport layers send out clones of payload skbs they hold for
 * retransmissions. To allow lower layers of the stack to prepend their headers
 * we split &skb_shared_info.dataref into two halves.
 * The lower 16 bits count the overall number of references.
 * The higher 16 bits indicate how many of the references are payload-only.
 * skb_header_cloned() checks if skb is allowed to add / write the headers.
 *
 * The creator of the skb (e.g. TCP) marks its skb as &sk_buff.nohdr
 * (via __skb_header_release()). Any clone created from marked skb will get
 * &sk_buff.hdr_len populated with the available headroom.
 * If it is the only clone in existence it is able to modify the headroom
 * at will. The sequence of calls inside the transport layer is::
 *
 *  <alloc skb>
 *  skb_reserve()
 *  __skb_header_release()
 *  skb_clone()
 *  // send the clone down the stack
 *
 * This is not a very generic construct and it depends on the transport layers
 * doing the right thing. In practice there's usually only one payload-only skb.
 * Having multiple payload-only skbs with different lengths of hdr_len is not
 * possible. The payload-only skbs should never leave their owner.
 */
/* Bit position at which the payload-only reference count starts within
 * skb_shared_info.dataref.
 */
#define SKB_DATAREF_SHIFT 16
/* Mask selecting the low SKB_DATAREF_SHIFT bits of dataref, i.e. the
 * overall reference count.
 */
#define SKB_DATAREF_MASK ((1 << SKB_DATAREF_SHIFT) - 1)


/* Clone status values held in the two-bit sk_buff.fclone field. */
enum {
        SKB_FCLONE_UNAVAILABLE,        /* skb has no fclone (from head_cache) */
        SKB_FCLONE_ORIG,        /* orig skb (from fclone_cache) */
        SKB_FCLONE_CLONE,        /* companion fclone skb (from fclone_cache) */
};

/* Bit flags for skb_shared_info.gso_type. Each enumerator occupies a
 * distinct bit (0 through 19), so several may be combined in one value.
 */
enum {
        SKB_GSO_TCPV4 = 1 << 0,

        /* This indicates the skb is from an untrusted source. */
        SKB_GSO_DODGY = 1 << 1,

        /* This indicates the tcp segment has CWR set. */
        SKB_GSO_TCP_ECN = 1 << 2,

        SKB_GSO_TCP_FIXEDID = 1 << 3,

        SKB_GSO_TCPV6 = 1 << 4,

        SKB_GSO_FCOE = 1 << 5,

        SKB_GSO_GRE = 1 << 6,

        SKB_GSO_GRE_CSUM = 1 << 7,

        SKB_GSO_IPXIP4 = 1 << 8,

        SKB_GSO_IPXIP6 = 1 << 9,

        SKB_GSO_UDP_TUNNEL = 1 << 10,

        SKB_GSO_UDP_TUNNEL_CSUM = 1 << 11,

        SKB_GSO_PARTIAL = 1 << 12,

        SKB_GSO_TUNNEL_REMCSUM = 1 << 13,

        SKB_GSO_SCTP = 1 << 14,

        SKB_GSO_ESP = 1 << 15,

        SKB_GSO_UDP = 1 << 16,

        SKB_GSO_UDP_L4 = 1 << 17,

        SKB_GSO_FRAGLIST = 1 << 18,

        SKB_GSO_TCP_ACCECN = 1 << 19,
};

/* When longs are wider than 32 bits, sk_buff_data_t (the type of
 * sk_buff.tail and sk_buff.end) is an unsigned int rather than a pointer.
 * NOTE(review): presumably an offset relative to skb->head - confirm
 * against the accessors.
 */
#if BITS_PER_LONG > 32
#define NET_SKBUFF_DATA_USES_OFFSET 1
#endif

#ifdef NET_SKBUFF_DATA_USES_OFFSET
typedef unsigned int sk_buff_data_t;
#else
typedef unsigned char *sk_buff_data_t;
#endif

/* Clock identifiers for the two-bit sk_buff.tstamp_type field.
 * __SKB_CLOCK_MAX aliases the highest valid enumerator.
 */
enum skb_tstamp_type {
        SKB_CLOCK_REALTIME,
        SKB_CLOCK_MONOTONIC,
        SKB_CLOCK_TAI,
        __SKB_CLOCK_MAX = SKB_CLOCK_TAI,
};

/**
 * DOC: Basic sk_buff geometry
 *
 * struct sk_buff itself is a metadata structure and does not hold any packet
 * data. All the data is held in associated buffers.
 *
 * &sk_buff.head points to the main "head" buffer. The head buffer is divided
 * into two parts:
 *
 *  - data buffer, containing headers and sometimes payload;
 *    this is the part of the skb operated on by the common helpers
 *    such as skb_put() or skb_pull();
 *  - shared info (struct skb_shared_info) which holds an array of pointers
 *    to read-only data in the (page, offset, length) format.
 *
 * Optionally &skb_shared_info.frag_list may point to another skb.
 *
 * Basic diagram may look like this::
 *
 *                                  ---------------
 *                                 | sk_buff       |
 *                                  ---------------
 *     ,---------------------------  + head
 *    /          ,-----------------  + data
 *   /          /      ,-----------  + tail
 *  |          |      |            , + end
 *  |          |      |           |
 *  v          v      v           v
 *   -----------------------------------------------
 *  | headroom | data |  tailroom | skb_shared_info |
 *   -----------------------------------------------
 *                                 + [page frag]
 *                                 + [page frag]
 *                                 + [page frag]
 *                                 + [page frag]       ---------
 *                                 + frag_list    --> | sk_buff |
 *                                                     ---------
 *
 */

/**
 *        struct sk_buff - socket buffer
 *        @next: Next buffer in list
 *        @prev: Previous buffer in list
 *        @tstamp: Time we arrived/left
 *        @skb_mstamp_ns: (aka @tstamp) earliest departure time; start point
 *                for retransmit timer
 *        @rbnode: RB tree node, alternative to next/prev for netem/tcp
 *        @list: queue head
 *        @ll_node: anchor in an llist (eg socket defer_list)
 *        @sk: Socket we are owned by
 *        @dev: Device we arrived on/are leaving by
 *        @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL
 *        @cb: Control buffer. Free for use by every layer. Put private vars here
 *        @_skb_refdst: destination entry (with norefcount bit)
 *        @len: Length of actual data
 *        @data_len: Data length
 *        @mac_len: Length of link layer header
 *        @hdr_len: writable header length of cloned skb
 *        @csum: Checksum (must include start/offset pair)
 *        @csum_start: Offset from skb->head where checksumming should start
 *        @csum_offset: Offset from csum_start where checksum should be stored
 *        @priority: Packet queueing priority
 *        @ignore_df: allow local fragmentation
 *        @cloned: Head may be cloned (check refcnt to be sure)
 *        @ip_summed: Driver fed us an IP checksum
 *        @nohdr: Payload reference only, must not modify header
 *        @pkt_type: Packet class
 *        @fclone: skbuff clone status
 *        @ipvs_property: skbuff is owned by ipvs
 *        @inner_protocol_type: whether the inner protocol is
 *                ENCAP_TYPE_ETHER or ENCAP_TYPE_IPPROTO
 *        @remcsum_offload: remote checksum offload is enabled
 *        @offload_fwd_mark: Packet was L2-forwarded in hardware
 *        @offload_l3_fwd_mark: Packet was L3-forwarded in hardware
 *        @tc_skip_classify: do not classify packet. set by IFB device
 *        @tc_at_ingress: used within tc_classify to distinguish in/egress
 *        @redirected: packet was redirected by packet classifier
 *        @from_ingress: packet was redirected from the ingress path
 *        @nf_skip_egress: packet shall skip nf egress - see netfilter_netdev.h
 *        @peeked: this packet has been seen already, so stats have been
 *                done for it, don't do them again
 *        @nf_trace: netfilter packet trace flag
 *        @protocol: Packet protocol from driver
 *        @destructor: Destruct function
 *        @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 *        @_sk_redir: socket redirection information for skmsg
 *        @_nfct: Associated connection, if any (with nfctinfo bits)
 *        @skb_iif: ifindex of device we arrived on
 *        @tc_index: Traffic control index
 *        @hash: the packet hash
 *        @queue_mapping: Queue mapping for multiqueue devices
 *        @head_frag: skb was allocated from page fragments,
 *                not allocated by kmalloc() or vmalloc().
 *        @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 *        @pp_recycle: mark the packet for recycling instead of freeing (implies
 *                page_pool support on driver)
 *        @active_extensions: active extensions (skb_ext_id types)
 *        @ndisc_nodetype: router type (from link layer)
 *        @ooo_okay: allow the mapping of a socket to a queue to be changed
 *        @l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *                ports.
 *        @sw_hash: indicates hash was computed in software stack
 *        @wifi_acked_valid: wifi_acked was set
 *        @wifi_acked: whether frame was acked on wifi or not
 *        @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *        @encapsulation: indicates the inner headers in the skbuff are valid
 *        @encap_hdr_csum: software checksum is needed
 *        @csum_valid: checksum is already valid
 *        @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
 *        @csum_complete_sw: checksum was completed by software
 *        @csum_level: indicates the number of consecutive checksums found in
 *                the packet minus one that have been verified as
 *                CHECKSUM_UNNECESSARY (max 3)
 *        @unreadable: indicates that at least 1 of the fragments in this skb is
 *                unreadable.
 *        @dst_pending_confirm: need to confirm neighbour
 *        @decrypted: Decrypted SKB
 *        @slow_gro: state present at GRO time, slower prepare step required
 *        @tstamp_type: Clock base of the delivery time held in skb->tstamp
 *                (see enum skb_tstamp_type).
 *        @napi_id: id of the NAPI struct this skb came from
 *        @sender_cpu: (aka @napi_id) source CPU in XPS
 *        @alloc_cpu: CPU which did the skb allocation.
 *        @secmark: security marking
 *        @mark: Generic packet mark
 *        @reserved_tailroom: (aka @mark) number of bytes of free space available
 *                at the tail of an sk_buff
 *        @vlan_all: vlan fields (proto & tci)
 *        @vlan_proto: vlan encapsulation protocol
 *        @vlan_tci: vlan tag control information
 *        @inner_protocol: Protocol (encapsulation)
 *        @inner_ipproto: (aka @inner_protocol) stores ipproto when
 *                skb->inner_protocol_type == ENCAP_TYPE_IPPROTO;
 *        @inner_transport_header: Inner transport layer header (encapsulation)
 *        @inner_network_header: Network layer header (encapsulation)
 *        @inner_mac_header: Link layer header (encapsulation)
 *        @transport_header: Transport layer header
 *        @network_header: Network layer header
 *        @mac_header: Link layer header
 *        @kcov_handle: KCOV remote handle for remote coverage collection
 *        @tail: Tail pointer
 *        @end: End pointer
 *        @head: Head of buffer
 *        @data: Data head pointer
 *        @truesize: Buffer size
 *        @users: User count - see {datagram,tcp}.c
 *        @extensions: allocated extensions, valid if active_extensions is nonzero
 */

struct sk_buff {
        /* Linkage: the next/prev/dev triple shares storage with rbnode, list
         * and ll_node, so only one of these views is in use at a time.
         */
        union {
                struct {
                        /* These two members must be first to match sk_buff_head. */
                        struct sk_buff                *next;
                        struct sk_buff                *prev;

                        union {
                                struct net_device        *dev;
                                /* Some protocols might use this space to store information,
                                 * while device pointer would be NULL.
                                 * UDP receive path is one user.
                                 */
                                unsigned long                dev_scratch;
                        };
                };
                struct rb_node                rbnode; /* used in netem, ip4 defrag, and tcp stack */
                struct list_head        list;
                struct llist_node        ll_node;
        };

        struct sock                *sk;

        /* tstamp and skb_mstamp_ns are two names for the same 64-bit slot */
        union {
                ktime_t                tstamp;
                u64                skb_mstamp_ns; /* earliest departure time */
        };
        /*
         * This is the control buffer. It is free to use for every
         * layer. Please put your private variables there. If you
         * want to keep them across layers you have to do a skb_clone()
         * first. This is owned by whoever has the skb queued ATM.
         */
        char                        cb[48] __aligned(8);

        /* _skb_refdst/destructor share storage with tcp_tsorted_anchor and
         * (when CONFIG_NET_SOCK_MSG is set) _sk_redir
         */
        union {
                struct {
                        unsigned long        _skb_refdst;
                        void                (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
#ifdef CONFIG_NET_SOCK_MSG
                unsigned long                _sk_redir;
#endif
        };

#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        unsigned long                 _nfct;
#endif
        unsigned int                len,
                                data_len;
        __u16                        mac_len,
                                hdr_len;

        /* Following fields are _not_ copied in __copy_skb_header()
         * Note that queue_mapping is here mostly to fill a hole.
         */
        __u16                        queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK        (1 << 7)
#else
#define CLONED_MASK        1
#endif
#define CLONED_OFFSET                offsetof(struct sk_buff, __cloned_offset)

        /* private: */
        /* zero-length marker: its offset is the byte holding 'cloned' */
        __u8                        __cloned_offset[0];
        /* public: */
        __u8                        cloned:1,
                                nohdr:1,
                                fclone:2,
                                peeked:1,
                                head_frag:1,
                                pfmemalloc:1,
                                pp_recycle:1; /* page_pool recycle indicator */
#ifdef CONFIG_SKB_EXTENSIONS
        __u8                        active_extensions;
#endif

        /* Fields enclosed in headers group are copied
         * using a single memcpy() in __copy_skb_header()
         */
        struct_group(headers,

        /* private: */
        /* zero-length marker: its offset is the byte holding 'pkt_type' */
        __u8                        __pkt_type_offset[0];
        /* public: */
        __u8                        pkt_type:3; /* see PKT_TYPE_MAX */
        __u8                        ignore_df:1;
        __u8                        dst_pending_confirm:1;
        __u8                        ip_summed:2;
        __u8                        ooo_okay:1;

        /* private: */
        /* zero-length marker: its offset is the byte holding 'tstamp_type' */
        __u8                        __mono_tc_offset[0];
        /* public: */
        __u8                        tstamp_type:2;        /* See skb_tstamp_type */
#ifdef CONFIG_NET_XGRESS
        __u8                        tc_at_ingress:1;        /* See TC_AT_INGRESS_MASK */
        __u8                        tc_skip_classify:1;
#endif
        __u8                        remcsum_offload:1;
        __u8                        csum_complete_sw:1;
        __u8                        csum_level:2;
        __u8                        inner_protocol_type:1;

        __u8                        l4_hash:1;
        __u8                        sw_hash:1;
#ifdef CONFIG_WIRELESS
        __u8                        wifi_acked_valid:1;
        __u8                        wifi_acked:1;
#endif
        __u8                        no_fcs:1;
        /* Indicates the inner headers are valid in the skbuff. */
        __u8                        encapsulation:1;
        __u8                        encap_hdr_csum:1;
        __u8                        csum_valid:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                        ndisc_nodetype:2;
#endif

#if IS_ENABLED(CONFIG_IP_VS)
        __u8                        ipvs_property:1;
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        __u8                        nf_trace:1;
#endif
#ifdef CONFIG_NET_SWITCHDEV
        __u8                        offload_fwd_mark:1;
        __u8                        offload_l3_fwd_mark:1;
#endif
        __u8                        redirected:1;
#ifdef CONFIG_NET_REDIRECT
        __u8                        from_ingress:1;
#endif
#ifdef CONFIG_NETFILTER_SKIP_EGRESS
        __u8                        nf_skip_egress:1;
#endif
#ifdef CONFIG_SKB_DECRYPTED
        __u8                        decrypted:1;
#endif
        __u8                        slow_gro:1;
#if IS_ENABLED(CONFIG_IP_SCTP)
        __u8                        csum_not_inet:1;
#endif
        __u8                        unreadable:1;
#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
        __u16                        tc_index;        /* traffic control index */
#endif

        u16                        alloc_cpu;

        /* csum overlays the csum_start/csum_offset pair */
        union {
                __wsum                csum;
                struct {
                        __u16        csum_start;
                        __u16        csum_offset;
                };
        };
        __u32                        priority;
        int                        skb_iif;
        __u32                        hash;
        /* vlan_all covers vlan_proto and vlan_tci as one 32-bit value */
        union {
                u32                vlan_all;
                struct {
                        __be16        vlan_proto;
                        __u16        vlan_tci;
                };
        };
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
        /* napi_id and sender_cpu share one slot */
        union {
                unsigned int        napi_id;
                unsigned int        sender_cpu;
        };
#endif
#ifdef CONFIG_NETWORK_SECMARK
        __u32                secmark;
#endif

        /* mark and reserved_tailroom share one slot */
        union {
                __u32                mark;
                __u32                reserved_tailroom;
        };

        /* inner_protocol and inner_ipproto share storage; see
         * inner_protocol_type for which one applies
         */
        union {
                __be16                inner_protocol;
                __u8                inner_ipproto;
        };

        __u16                        inner_transport_header;
        __u16                        inner_network_header;
        __u16                        inner_mac_header;

        __be16                        protocol;
        __u16                        transport_header;
        __u16                        network_header;
        __u16                        mac_header;

#ifdef CONFIG_KCOV
        u64                        kcov_handle;
#endif

        ); /* end headers group */

        /* These elements must be at the end, see alloc_skb() for details.  */
        sk_buff_data_t                tail;
        sk_buff_data_t                end;
        unsigned char                *head,
                                *data;
        unsigned int                truesize;
        refcount_t                users;

#ifdef CONFIG_SKB_EXTENSIONS
        /* only usable after checking ->active_extensions != 0 */
        struct skb_ext                *extensions;
#endif
};

/* if you move pkt_type around you also must adapt those constants */
/* PKT_TYPE_MAX is the mask of the three-bit pkt_type field within the byte
 * found at PKT_TYPE_OFFSET; its position depends on bitfield endianness.
 */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX        (7 << 5)
#else
#define PKT_TYPE_MAX        7
#endif
#define PKT_TYPE_OFFSET                offsetof(struct sk_buff, __pkt_type_offset)

/* if you move tc_at_ingress or tstamp_type
 * around, you also must adapt these constants.
 */
/* Masks for the tstamp_type (two bits) and tc_at_ingress (one bit) fields
 * within the byte found at SKB_BF_MONO_TC_OFFSET.
 * Note that SKB_TSTAMP_TYPE_RSHIFT is only defined for big-endian
 * bitfields, where tstamp_type sits in the top two bits.
 */
#ifdef __BIG_ENDIAN_BITFIELD
#define SKB_TSTAMP_TYPE_MASK                (3 << 6)
#define SKB_TSTAMP_TYPE_RSHIFT                (6)
#define TC_AT_INGRESS_MASK                (1 << 5)
#else
#define SKB_TSTAMP_TYPE_MASK                (3)
#define TC_AT_INGRESS_MASK                (1 << 2)
#endif
#define SKB_BF_MONO_TC_OFFSET                offsetof(struct sk_buff, __mono_tc_offset)

#ifdef __KERNEL__
/*
 *        Handling routines are only of interest to the kernel
 */

/* Allocation flag bits; each is a distinct bit so they can be OR-ed.
 * NOTE(review): presumably passed to the skb allocation routines - confirm.
 */
#define SKB_ALLOC_FCLONE        0x01
#define SKB_ALLOC_RX                0x02
#define SKB_ALLOC_NAPI                0x04

/**
 * skb_pfmemalloc - report whether the skb carries the pfmemalloc mark
 * @skb: buffer
 *
 * Returns: true when @skb->pfmemalloc is set. The result is hinted to the
 * compiler as unlikely to be true.
 */
static inline bool skb_pfmemalloc(const struct sk_buff *skb)
{
        const bool marked = skb->pfmemalloc;

        return unlikely(marked);
}

/*
 * An skb may carry a dst pointer on which a reference was or was not taken.
 * The low-order bit of _skb_refdst is set when the refcount was _not_ taken.
 *
 * SKB_DST_NOREF   - tag bit stored in the low-order bit of _skb_refdst
 * SKB_DST_PTRMASK - mask that strips the tag bit to recover the pointer
 *
 * The mask expansion is fully parenthesized so that it behaves as a single
 * operand wherever it is substituted.
 */
#define SKB_DST_NOREF        1UL
#define SKB_DST_PTRMASK        (~SKB_DST_NOREF)

/**
 * skb_dst - fetch the dst_entry attached to an skb
 * @skb: buffer
 *
 * Returns: the pointer stored in @skb->_skb_refdst with the SKB_DST_NOREF
 * tag bit masked off, whether or not a reference was taken on it.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
        /* A dst stored without a reference is only expected to be looked
         * at from inside an rcu_read_lock (or _bh) section; warn otherwise.
         */
        if (skb->_skb_refdst & SKB_DST_NOREF)
                WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

        return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}

/**
 * skb_dst_set - attach a dst to an skb
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in @skb without the SKB_DST_NOREF tag, i.e. the caller is
 * assumed to have taken a reference on @dst that skb_dst_drop() should
 * release later. A non-NULL @dst also sets @skb->slow_gro.
 */
static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
{
        if (dst)
                skb->slow_gro |= 1;

        skb->_skb_refdst = (unsigned long)dst;
}

/**
 * skb_dst_set_noref - attach a dst to an skb without holding a reference
 * @skb: buffer
 * @dst: dst entry
 *
 * Stores @dst in @skb tagged with SKB_DST_NOREF, recording that no
 * reference was taken on it. Warns when called outside an rcu_read_lock
 * (or _bh) section. A non-NULL @dst also sets @skb->slow_gro.
 */
static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
{
        WARN_ON(!(rcu_read_lock_held() || rcu_read_lock_bh_held()));

        if (dst)
                skb->slow_gro |= 1;

        skb->_skb_refdst = SKB_DST_NOREF | (unsigned long)dst;
}

/**
 * skb_dst_is_noref - Test if skb dst isn't refcounted
 * @skb: buffer
 *
 * Returns: true when the SKB_DST_NOREF tag is set and skb_dst() yields a
 * non-NULL pointer.
 */
static inline bool skb_dst_is_noref(const struct sk_buff *skb)
{
        if (!(skb->_skb_refdst & SKB_DST_NOREF))
                return false;

        return !!skb_dst(skb);
}

/* User space driven tools such as nft or tc may rewrite skb->pkt_type, but
 * only to a conservative subset of the possible values: everything up to
 * and including PACKET_OTHERHOST is accepted.
 */
static inline bool skb_pkt_type_ok(u32 ptype)
{
        return !(ptype > PACKET_OTHERHOST);
}

/**
 * skb_napi_id - Returns the skb's NAPI id
 * @skb: buffer
 *
 * Returns: @skb->napi_id when CONFIG_NET_RX_BUSY_POLL is enabled,
 * 0 otherwise.
 */
static inline unsigned int skb_napi_id(const struct sk_buff *skb)
{
        unsigned int id = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
        id = skb->napi_id;
#endif
        return id;
}

/* Report @skb->wifi_acked_valid; always false without CONFIG_WIRELESS. */
static inline bool skb_wifi_acked_valid(const struct sk_buff *skb)
{
        bool valid = false;

#ifdef CONFIG_WIRELESS
        valid = skb->wifi_acked_valid;
#endif
        return valid;
}

/**
 * skb_unref - decrement the skb's reference count
 * @skb: buffer, may be %NULL
 *
 * Returns: true if we can free the skb; false when @skb is %NULL or when
 * other references remain after the decrement.
 */
static inline bool skb_unref(struct sk_buff *skb)
{
        if (unlikely(!skb))
                return false;
        /* Outside CONFIG_DEBUG_NET builds, a count that already reads 1 is
         * not decremented: only a read barrier is issued and we fall
         * through to return true.
         */
        if (!IS_ENABLED(CONFIG_DEBUG_NET) && likely(refcount_read(&skb->users) == 1))
                smp_rmb();
        /* Otherwise drop one reference; only the final put may free. */
        else if (likely(!refcount_dec_and_test(&skb->users)))
                return false;

        return true;
}

/*
 * skb_data_unref - drop this skb's contribution to the shared dataref
 * @skb: buffer whose cloned/nohdr bits select the amount to drop
 * @shinfo: shared info holding the dataref counter
 *
 * Returns true when the caller holds the last data reference: either the
 * skb is not marked cloned, or dataref equals (or drops to zero by) the
 * bias computed below. Returns false when references remain.
 */
static inline bool skb_data_unref(const struct sk_buff *skb,
                                  struct skb_shared_info *shinfo)
{
        int bias;

        if (!skb->cloned)
                return true;

        /* A nohdr skb accounts for one unit in the part of dataref above
         * SKB_DATAREF_SHIFT in addition to the unit in the low part.
         */
        bias = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1;

        /* If dataref already equals our bias, skip the atomic subtraction
         * and only issue a read barrier.
         */
        if (atomic_read(&shinfo->dataref) == bias)
                smp_rmb();
        else if (atomic_sub_return(bias, &shinfo->dataref))
                return false;

        return true;
}

void __fix_address sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
                                      enum skb_drop_reason reason);

/**
 * kfree_skb_reason - hand an skb to sk_skb_reason_drop() without a socket
 * @skb: buffer to pass on
 * @reason: drop reason passed through unchanged
 */
static inline void
kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        struct sock *no_sk = NULL;

        sk_skb_reason_drop(no_sk, skb, reason);
}

/**
 *        kfree_skb - free an sk_buff with 'NOT_SPECIFIED' reason
 *        @skb: buffer to free
 */
static inline void kfree_skb(struct sk_buff *skb)
{
        kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

/*
 * Out-of-line skb helpers; only their prototypes are visible here.
 * kfree_skb_list() in this header forwards to kfree_skb_list_reason().
 */
void skb_release_head_state(struct sk_buff *skb);
void kfree_skb_list_reason(struct sk_buff *segs,
                           enum skb_drop_reason reason);
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt);
void skb_tx_error(struct sk_buff *skb);

static inline void kfree_skb_list(struct sk_buff *segs)
{
        kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED);
}

#ifdef CONFIG_TRACEPOINTS
void consume_skb(struct sk_buff *skb);
#else
/*
 * consume_skb - release an skb when tracepoints are compiled out
 * @skb: buffer to release
 *
 * Without CONFIG_TRACEPOINTS this simply forwards to kfree_skb(); with it,
 * the out-of-line definition declared in the other branch is used.
 */
static inline void consume_skb(struct sk_buff *skb)
{
        /* Plain call: ISO C does not allow "return <expr>;" in a function
         * returning void.
         */
        kfree_skb(skb);
}
#endif

/*
 * Out-of-line release, coalescing and construction helpers. Only the
 * prototypes are visible here; alloc_skb() and alloc_skb_fclone() in this
 * header are thin wrappers around __alloc_skb().
 */
void __consume_stateless_skb(struct sk_buff *skb);
void  __kfree_skb(struct sk_buff *skb);

void kfree_skb_partial(struct sk_buff *skb, bool head_stolen);
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize);

struct sk_buff *__alloc_skb(unsigned int size, gfp_t priority, int flags,
                            int node);
struct sk_buff *__build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb(void *data, unsigned int frag_size);
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size);
void skb_attempt_defer_free(struct sk_buff *skb);

u32 napi_skb_cache_get_bulk(void **skbs, u32 n);
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size);
struct sk_buff *slab_build_skb(void *data);

/**
 * alloc_skb - allocate a network buffer
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with no allocation flags and
 * %NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb(unsigned int size,
                                        gfp_t priority)
{
        const int no_flags = 0;

        return __alloc_skb(size, priority, no_flags, NUMA_NO_NODE);
}

/* Further out-of-line allocators; implementations are not visible here. */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int max_page_order,
                                     int *errcode,
                                     gfp_t gfp_mask);
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first);

/* Layout of fast clones : [skb1][skb2][fclone_ref]
 *
 * skb_fclone_busy() recovers this container from a pointer to skb1 with
 * container_of(), then inspects fclone_ref and skb2.sk.
 */
struct sk_buff_fclones {
        struct sk_buff        skb1;

        struct sk_buff        skb2;

        refcount_t        fclone_ref;
};

/**
 *        skb_fclone_busy - check if fclone is busy
 *        @sk: socket
 *        @skb: buffer
 *
 * Returns: true only if all of the following hold: @skb is marked
 * %SKB_FCLONE_ORIG, the fclone_ref of its enclosing sk_buff_fclones is
 * above 1, and the companion skb2 still points at @sk. The last test
 * covers drivers that call skb_orphan() in their ndo_start_xmit().
 */
static inline bool skb_fclone_busy(const struct sock *sk,
                                   const struct sk_buff *skb)
{
        const struct sk_buff_fclones *pair;

        if (skb->fclone != SKB_FCLONE_ORIG)
                return false;

        pair = container_of(skb, struct sk_buff_fclones, skb1);
        if (refcount_read(&pair->fclone_ref) <= 1)
                return false;

        return READ_ONCE(pair->skb2.sk) == sk;
}

/**
 * alloc_skb_fclone - allocate a network buffer from fclone cache
 * @size: size to allocate
 * @priority: allocation mask
 *
 * Convenience wrapper: calls __alloc_skb() with %SKB_ALLOC_FCLONE and
 * %NUMA_NO_NODE as the node.
 */
static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                                               gfp_t priority)
{
        const int alloc_flags = SKB_ALLOC_FCLONE;

        return __alloc_skb(size, priority, alloc_flags, NUMA_NO_NODE);
}

/*
 * Out-of-line clone/copy helpers; only the prototypes are visible here.
 * __pskb_copy() in this header wraps __pskb_copy_fclone().
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src);
void skb_headers_offset_update(struct sk_buff *skb, int off);
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask);
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t priority);
void skb_copy_header(struct sk_buff *new, const struct sk_buff *old);
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t priority);
struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone);
/* Wrapper around __pskb_copy_fclone() with its @fclone argument false. */
static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
                                          gfp_t gfp_mask)
{
        const bool want_fclone = false;

        return __pskb_copy_fclone(skb, headroom, gfp_mask, want_fclone);
}

/*
 * Out-of-line head/tail resizing, scatterlist and padding helpers; only
 * the prototypes are visible here. The skb_to_sgvec*() results are marked
 * __must_check. skb_pad() in this header wraps __skb_pad().
 */
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
                                     unsigned int headroom);
struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
                                int newtailroom, gfp_t priority);
int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                                     int offset, int len);
int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
                              int offset, int len);
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error);

/**
 *        skb_pad                        -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *
 *        Make sure the buffer is followed by a zero filled padding area.
 *        Network drivers that may DMA or transfer data beyond the buffer
 *        end onto the wire rely on this.
 *
 *        Calls __skb_pad() with free_on_error set, so the skb is freed
 *        when an error (e.g. out of memory) is returned.
 */
static inline int skb_pad(struct sk_buff *skb, int pad)
{
        const bool free_on_error = true;

        return __skb_pad(skb, pad, free_on_error);
}
/* dev_kfree_skb() is an alias that expands to consume_skb(). */
#define dev_kfree_skb(a)        consume_skb(a)

/* Out-of-line helper; implementation is not visible here. */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size, size_t max_frags);

/*
 * State carried between calls of the sequential-read helpers that take a
 * struct skb_seq_state pointer (skb_prepare_seq_read(), skb_seq_read(),
 * skb_abort_seq_read(), skb_copy_seq_read()).
 * NOTE(review): the meaning of the individual fields is defined by those
 * out-of-line helpers and is not visible here - consult their
 * implementation before relying on any field.
 */
struct skb_seq_state {
        __u32                lower_offset;
        __u32                upper_offset;
        __u32                frag_idx;
        __u32                stepped_offset;
        struct sk_buff        *root_skb;
        struct sk_buff        *cur_skb;
        __u8                *frag_data;
        __u32                frag_off;
};

/*
 * Sequential-read API operating on a struct skb_seq_state, plus
 * skb_find_text(). Only the prototypes are visible here.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st);
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st);
void skb_abort_seq_read(struct skb_seq_state *st);
int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len);

unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config);

/*
 * Packet hash types specify the type of hash in skb_set_hash.
 *
 * Hash types refer to the protocol layer addresses which are used to
 * construct a packet's hash. The hashes are used to differentiate or identify
 * flows of the protocol layer for the hash type. Hash types are either
 * layer-2 (L2), layer-3 (L3), or layer-4 (L4).
 *
 * Properties of hashes:
 *
 * 1) Two packets in different flows have different hash values
 * 2) Two packets in the same flow should have the same hash value
 *
 * A hash at a higher layer is considered to be more specific. A driver should
 * set the most specific hash possible.
 *
 * A driver cannot indicate a more specific hash than the layer at which a hash
 * was computed. For instance an L3 hash cannot be set as an L4 hash.
 *
 * A driver may indicate a hash level which is less specific than the
 * actual layer the hash was computed on. For instance, a hash computed
 * at L4 may be considered an L3 hash. This should only be done if the
 * driver can't unambiguously determine that the HW computed the hash at
 * the higher layer. Note that the "should" in the second property above
 * permits this.
 */
enum pkt_hash_types {
        PKT_HASH_TYPE_NONE = 0,        /* Hash type is undefined */
        PKT_HASH_TYPE_L2   = 1,        /* Hashed over: src_MAC, dest_MAC */
        PKT_HASH_TYPE_L3   = 2,        /* Hashed over: src_IP, dst_IP */
        PKT_HASH_TYPE_L4   = 3,        /* Hashed over: src_IP, dst_IP, src_port, dst_port */
};

/* Reset the stored hash value together with its sw_hash and l4_hash
 * flags.
 */
static inline void skb_clear_hash(struct sk_buff *skb)
{
        skb->l4_hash = 0;
        skb->sw_hash = 0;
        skb->hash = 0;
}

/* Clear the hash unless it is flagged as an L4 hash. */
static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb)
{
        if (skb->l4_hash)
                return;

        skb_clear_hash(skb);
}

/* Store @hash in @skb and record whether it was computed in software
 * (@is_sw) and whether it is an L4 hash (@is_l4).
 */
static inline void
__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4)
{
        skb->hash = hash;
        skb->sw_hash = is_sw;
        skb->l4_hash = is_l4;
}

/* Driver entry point for recording a hash obtained from HW: sw_hash is
 * cleared, and l4_hash is set only for PKT_HASH_TYPE_L4.
 */
static inline void
skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type)
{
        const bool hashed_at_l4 = (type == PKT_HASH_TYPE_L4);

        __skb_set_hash(skb, hash, false, hashed_at_l4);
}

/* Record a hash computed in software: same as __skb_set_hash() with the
 * sw_hash flag set.
 */
static inline void
__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4)
{
        const bool from_sw = true;

        __skb_set_hash(skb, hash, from_sw, is_l4);
}

u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb);

/* Variant of __skb_get_hash_symmetric_net() that passes a NULL @net. */
static inline u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
{
        const struct net *no_net = NULL;

        return __skb_get_hash_symmetric_net(no_net, skb);
}

/*
 * Out-of-line hashing and flow-dissection entry points; only the
 * prototypes are visible here. The skb_flow_dissect*() and skb_get_hash*()
 * inline helpers in this header are wrappers around __skb_flow_dissect()
 * and __skb_get_hash_net().
 */
void __skb_get_hash_net(const struct net *net, struct sk_buff *skb);
u32 skb_get_poff(const struct sk_buff *skb);
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen);
__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                          const void *data, int hlen_proto);

void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count);

/* Forward declaration; only pointers to this type are used below. */
struct bpf_flow_dissector;
u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                     __be16 proto, int nhoff, int hlen, unsigned int flags);

bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        __be16 proto, int nhoff, int hlen, unsigned int flags);

/* Dissect @skb with @flow_dissector into @target_container. Wrapper around
 * __skb_flow_dissect() passing NULL for net and data and 0 for proto,
 * nhoff and hlen.
 */
static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, unsigned int flags)
{
        const struct net *no_net = NULL;
        const void *no_data = NULL;

        return __skb_flow_dissect(no_net, skb, flow_dissector,
                                  target_container, no_data, 0, 0, 0, flags);
}

/* Zero @flow, then dissect @skb into it using flow_keys_dissector. NULL is
 * passed for net and data and 0 for proto, nhoff and hlen.
 */
static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb,
                                              struct flow_keys *flow,
                                              unsigned int flags)
{
        bool dissected;

        memset(flow, 0, sizeof(*flow));
        dissected = __skb_flow_dissect(NULL, skb, &flow_keys_dissector,
                                       flow, NULL, 0, 0, 0, flags);
        return dissected;
}

/* Zero @flow, then dissect into it using flow_keys_basic_dissector. All
 * remaining arguments are handed to __skb_flow_dissect() unchanged.
 */
static inline bool
skb_flow_dissect_flow_keys_basic(const struct net *net,
                                 const struct sk_buff *skb,
                                 struct flow_keys_basic *flow,
                                 const void *data, __be16 proto,
                                 int nhoff, int hlen, unsigned int flags)
{
        bool dissected;

        memset(flow, 0, sizeof(*flow));
        dissected = __skb_flow_dissect(net, skb, &flow_keys_basic_dissector,
                                       flow, data, proto, nhoff, hlen, flags);
        return dissected;
}

/*
 * Out-of-line dissectors that fill @target_container according to
 * @flow_dissector; only the prototypes are visible here.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Gets a skb's connection tracking info. @ctinfo_map should be a map of
 * @mapsize entries that translates enum ip_conntrack_info states to user
 * states.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container,
                    u16 *ctinfo_map, size_t mapsize,
                    bool post_ct, u16 zone);
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container);

void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container);

/* Return skb->hash, first calling __skb_get_hash_net() when neither the
 * l4_hash nor the sw_hash flag is set.
 */
static inline __u32 skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
        const bool have_hash = skb->l4_hash || skb->sw_hash;

        if (!have_hash)
                __skb_get_hash_net(net, skb);

        return skb->hash;
}

/* Same as skb_get_hash_net() with a NULL @net. */
static inline __u32 skb_get_hash(struct sk_buff *skb)
{
        return skb_get_hash_net(NULL, skb);
}

/* Return skb->hash. When neither l4_hash nor sw_hash is set, the hash is
 * first derived from @fl6 via __get_hash_from_flowi6() and stored as a
 * software hash, flagged L4 when flow_keys_have_l4() says so.
 */
static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6)
{
        struct flow_keys keys;
        __u32 hash;

        if (skb->l4_hash || skb->sw_hash)
                return skb->hash;

        hash = __get_hash_from_flowi6(fl6, &keys);
        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));

        return skb->hash;
}

/* Out-of-line helper; implementation is not visible here. */
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb);

/* Return the stored hash as is, without computing one if it is missing. */
static inline __u32 skb_get_hash_raw(const struct sk_buff *skb)
{
        const __u32 stored = skb->hash;

        return stored;
}

/* Copy the hash value and its sw_hash/l4_hash flags from @from to @to. */
static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from)
{
        to->hash = from->hash;
        to->sw_hash = from->sw_hash;
        to->l4_hash = from->l4_hash;
}

/* Compare the decrypted flags of two skbs: returns skb2->decrypted minus
 * skb1->decrypted, or 0 without CONFIG_SKB_DECRYPTED.
 */
static inline int skb_cmp_decrypted(const struct sk_buff *skb1,
                                    const struct sk_buff *skb2)
{
        int diff = 0;

#ifdef CONFIG_SKB_DECRYPTED
        diff = skb2->decrypted - skb1->decrypted;
#endif
        return diff;
}

/* Report @skb->decrypted; always false without CONFIG_SKB_DECRYPTED. */
static inline bool skb_is_decrypted(const struct sk_buff *skb)
{
        bool decrypted = false;

#ifdef CONFIG_SKB_DECRYPTED
        decrypted = skb->decrypted;
#endif
        return decrypted;
}

/* Propagate the decrypted flag from @from to @to; compiles to nothing
 * without CONFIG_SKB_DECRYPTED.
 */
static inline void skb_copy_decrypted(struct sk_buff *to,
                                      const struct sk_buff *from)
{
#if defined(CONFIG_SKB_DECRYPTED)
        to->decrypted = from->decrypted;
#endif
}

#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* Offset layout: skb->end is an offset counted from skb->head. */

/* Address of the end of the buffer. */
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        return &skb->head[skb->end];
}

/* Distance of the end of the buffer from skb->head. */
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        return skb->end;
}

/* Place the end of the buffer @offset bytes past skb->head. */
static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
        skb->end = offset;
}
#else
/* Pointer layout: skb->end is itself a pointer. */

/* Address of the end of the buffer. */
static inline unsigned char *skb_end_pointer(const struct sk_buff *skb)
{
        return skb->end;
}

/* Distance of the end of the buffer from skb->head. */
static inline unsigned int skb_end_offset(const struct sk_buff *skb)
{
        return (unsigned int)(skb->end - skb->head);
}

/* Place the end of the buffer @offset bytes past skb->head. */
static inline void skb_set_end_offset(struct sk_buff *skb, unsigned int offset)
{
        skb->end = &skb->head[offset];
}
#endif

/*
 * MSG_ZEROCOPY plumbing defined out of line; only declarations are visible
 * here. net_zcopy_put_abort() in this header compares a uarg's ops against
 * msg_zerocopy_ubuf_ops to decide whether to call
 * msg_zerocopy_put_abort().
 */
extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops;

struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
                                       struct ubuf_info *uarg);

void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
                            struct sk_buff *skb, struct iov_iter *from,
                            size_t length);

int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
                                struct iov_iter *from, size_t length);

/* Wrapper around __zerocopy_sg_from_iter() that uses the skb's own socket
 * and the iterator embedded in @msg.
 */
static inline int skb_zerocopy_iter_dgram(struct sk_buff *skb,
                                          struct msghdr *msg, int len)
{
        struct iov_iter *from = &msg->msg_iter;

        return __zerocopy_sg_from_iter(msg, skb->sk, skb, from, len);
}

/* Out-of-line helper; implementation is not visible here. */
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg);

/* Internal: view the memory at skb_end_pointer(SKB) as the skb's
 * struct skb_shared_info.
 */
#define skb_shinfo(SKB)        ((struct skb_shared_info *)(skb_end_pointer(SKB)))

static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
{
        return &skb_shinfo(skb)->hwtstamps;
}

/* Return skb_uarg() of @skb if it has SKBFL_ZEROCOPY_ENABLE set in its
 * shared info flags; NULL for a NULL @skb or when the flag is clear.
 */
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
        if (!skb)
                return NULL;

        if (!(skb_shinfo(skb)->flags & SKBFL_ZEROCOPY_ENABLE))
                return NULL;

        return skb_uarg(skb);
}

/* True when SKBFL_PURE_ZEROCOPY is set in the skb's shared info flags. */
static inline bool skb_zcopy_pure(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->flags & SKBFL_PURE_ZEROCOPY) != 0;
}

/* True when SKBFL_MANAGED_FRAG_REFS is set in the skb's shared info
 * flags.
 */
static inline bool skb_zcopy_managed(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->flags & SKBFL_MANAGED_FRAG_REFS) != 0;
}

/* True when both skbs agree on skb_zcopy_pure(). */
static inline bool skb_pure_zcopy_same(const struct sk_buff *skb1,
                                       const struct sk_buff *skb2)
{
        const bool pure1 = skb_zcopy_pure(skb1);
        const bool pure2 = skb_zcopy_pure(skb2);

        return pure1 == pure2;
}

/* Take one additional reference on @uarg by incrementing uarg->refcnt.
 * @uarg is dereferenced unconditionally and must not be NULL.
 */
static inline void net_zcopy_get(struct ubuf_info *uarg)
{
        refcount_inc(&uarg->refcnt);
}

static inline void skb_zcopy_init(struct sk_buff *skb, struct ubuf_info *uarg)
{
        skb_shinfo(skb)->destructor_arg = uarg;
        skb_shinfo(skb)->flags |= uarg->flags;
}

/* Attach @uarg to @skb unless either is NULL or @skb already has a
 * zerocopy uarg. If the caller passes *@have_ref == true, that reference
 * is consumed (and *@have_ref cleared); otherwise a new one is taken.
 */
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
                                 bool *have_ref)
{
        if (!skb || !uarg || skb_zcopy(skb))
                return;

        if (unlikely(have_ref && *have_ref))
                *have_ref = false;
        else
                net_zcopy_get(uarg);

        skb_zcopy_init(skb, uarg);
}

static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
{
        skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t) val | 0x1UL);
        skb_shinfo(skb)->flags |= SKBFL_ZEROCOPY_FRAG;
}

/* True when the low-order tag bit of destructor_arg is set, as done by
 * skb_zcopy_set_nouarg().
 */
static inline bool skb_zcopy_is_nouarg(struct sk_buff *skb)
{
        return ((uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL) != 0;
}

/* Return destructor_arg with its low-order tag bit cleared. */
static inline void *skb_zcopy_get_nouarg(struct sk_buff *skb)
{
        void *tagged = skb_shinfo(skb)->destructor_arg;

        return (void *)((uintptr_t)tagged & ~0x1UL);
}

/* Invoke the uarg's ->complete() callback with no skb and success set to
 * true. A NULL @uarg is ignored.
 */
static inline void net_zcopy_put(struct ubuf_info *uarg)
{
        if (!uarg)
                return;

        uarg->ops->complete(NULL, uarg, true);
}

/* Abort path for a zerocopy uarg. MSG_ZEROCOPY uargs (ops equal to
 * msg_zerocopy_ubuf_ops) go to msg_zerocopy_put_abort(); any other uarg is
 * released via net_zcopy_put() only when @have_uref is true. A NULL @uarg
 * is ignored.
 */
static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        if (!uarg)
                return;

        if (uarg->ops == &msg_zerocopy_ubuf_ops) {
                msg_zerocopy_put_abort(uarg, have_uref);
                return;
        }

        if (have_uref)
                net_zcopy_put(uarg);
}

/* Release a reference on a zerocopy structure.
 *
 * Does nothing when skb_zcopy() finds no uarg. Otherwise the uarg's
 * ->complete() callback is called (skipped for nouarg-tagged skbs) and all
 * SKBFL_ALL_ZEROCOPY bits are cleared from the shared info flags.
 */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy_success)
{
        struct ubuf_info *uarg = skb_zcopy(skb);

        if (!uarg)
                return;

        if (!skb_zcopy_is_nouarg(skb))
                uarg->ops->complete(skb, uarg, zerocopy_success);

        skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY;
}

void __skb_zcopy_downgrade_managed(struct sk_buff *skb);

/* Call the out-of-line __skb_zcopy_downgrade_managed() only for skbs that
 * skb_zcopy_managed() reports as managed; that case is hinted as unlikely.
 */
static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
        if (likely(!skb_zcopy_managed(skb)))
                return;

        __skb_zcopy_downgrade_managed(skb);
}

/* Frags of this skb are readable by the host exactly when the skb's
 * unreadable flag is clear.
 */
static inline bool skb_frags_readable(const struct sk_buff *skb)
{
        return skb->unreadable == 0;
}

/* Clear skb->next so that the skb no longer points at a list neighbour. */
static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
        skb->next = NULL;
}

/* Under CONFIG_DEBUG_NET, overwrite skb->next with SKB_LIST_POISON_NEXT;
 * otherwise this is a no-op.
 */
static inline void skb_poison_list(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        skb->next = SKB_LIST_POISON_NEXT;
#endif
}

/* Iterate through singly-linked GSO fragments of an skb.
 *
 * @next_skb is loaded from (skb)->next before each iteration of the loop
 * body, so the body may change or clear (skb)->next without derailing the
 * walk. The macro arguments are evaluated more than once; pass plain
 * variables.
 */
#define skb_list_walk_safe(first, skb, next_skb)                               \
        for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb);  \
             (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL)

/* Unlink @skb from the list it sits on through its ->list member, then
 * clear skb->next via skb_mark_not_on_list(). The order matters: the
 * unlink has to run before the next pointer is cleared.
 */
static inline void skb_list_del_init(struct sk_buff *skb)
{
        __list_del_entry(&skb->list);
        skb_mark_not_on_list(skb);
}

/**
 *        skb_queue_empty - check if a queue is empty
 *        @list: queue head
 *
 *        Returns true if the queue is empty, i.e. the head's next pointer
 *        refers back to the head itself; false otherwise.
 */
static inline int skb_queue_empty(const struct sk_buff_head *list)
{
        const struct sk_buff *head = (const struct sk_buff *)list;

        return list->next == head;
}

/**
 *        skb_queue_empty_lockless - check if a queue is empty
 *        @list: queue head
 *
 *        Returns true if the queue is empty, false otherwise.
 *        The head's next pointer is sampled with READ_ONCE(), so this
 *        variant can be used in lockless contexts.
 */
static inline bool skb_queue_empty_lockless(const struct sk_buff_head *list)
{
        const struct sk_buff *head = (const struct sk_buff *)list;

        return READ_ONCE(list->next) == head;
}


/**
 *        skb_queue_is_last - check if skb is the last entry in the queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        Returns true if @skb is the last buffer on the list, i.e. its
 *        next pointer refers to the queue head.
 */
static inline bool skb_queue_is_last(const struct sk_buff_head *list,
                                     const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *)list;

        return skb->next == head;
}

/**
 *        skb_queue_is_first - check if skb is the first entry in the queue
 *        @list: queue head
 *        @skb: buffer
 *
 *        Returns true if @skb is the first buffer on the list, i.e. its
 *        prev pointer refers to the queue head.
 */
static inline bool skb_queue_is_first(const struct sk_buff_head *list,
                                      const struct sk_buff *skb)
{
        const struct sk_buff *head = (const struct sk_buff *)list;

        return skb->prev == head;
}

/**
 *        skb_queue_next - return the next packet in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Return the packet that follows @skb in @list. Callers may only
 *        use this when skb_queue_is_last() evaluates to false.
 */
static inline struct sk_buff *skb_queue_next(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        struct sk_buff *following;

        /* A BUG_ON looks harsh, but merely returning would lead the
         * caller to dereference garbage.
         */
        BUG_ON(skb_queue_is_last(list, skb));

        following = skb->next;
        return following;
}

/**
 *        skb_queue_prev - return the prev packet in the queue
 *        @list: queue head
 *        @skb: current buffer
 *
 *        Return the packet that precedes @skb in @list. Callers may only
 *        use this when skb_queue_is_first() evaluates to false.
 */
static inline struct sk_buff *skb_queue_prev(const struct sk_buff_head *list,
                                             const struct sk_buff *skb)
{
        struct sk_buff *preceding;

        /* A BUG_ON looks harsh, but merely returning would lead the
         * caller to dereference garbage.
         */
        BUG_ON(skb_queue_is_first(list, skb));

        preceding = skb->prev;
        return preceding;
}

/**
 *        skb_get - reference buffer
 *        @skb: buffer to reference
 *
 *        Makes another reference to a socket buffer and returns a pointer
 *        to the buffer.
 */
static inline struct sk_buff *skb_get(struct sk_buff *skb)
{
        refcount_inc(&skb->users);
        return skb;
}

/*
 * If users == 1, we are the only owner and can avoid redundant atomic changes.
 */

/**
 *        skb_cloned - is the buffer a clone
 *        @skb: buffer to check
 *
 *        Returns true if the buffer is marked as cloned and the
 *        SKB_DATAREF_MASK part of the shared dataref count is not 1, i.e.
 *        it is one of multiple shared copies of the buffer. Cloned buffers
 *        are shared data so must not be written to under normal
 *        circumstances.
 */
static inline int skb_cloned(const struct sk_buff *skb)
{
        int refs;

        if (!skb->cloned)
                return 0;

        refs = atomic_read(&skb_shinfo(skb)->dataref) & SKB_DATAREF_MASK;
        return refs != 1;
}

/* If @skb is cloned, call pskb_expand_head() with no extra head or tail
 * room; returns its result, or 0 when nothing had to be done. May sleep
 * if @pri allows blocking.
 */
static inline int skb_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/* Variant of skb_unclone() which guarantees that skb->truesize and
 * skb_end_offset() stay unchanged whenever a new skb->head is needed.
 *
 * This matters because ksize(kmalloc(X)) == ksize(kmalloc(X)) is not
 * guaranteed when various debugging features are in place.
 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri);
static inline int skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return 0;

        return __skb_unclone_keeptruesize(skb, pri);
}

/**
 *        skb_header_cloned - is the header a clone
 *        @skb: buffer to check
 *
 *        Returns true if modifying the header part of the buffer requires
 *        the data to be copied. This is the case when the skb is marked
 *        cloned and the low (SKB_DATAREF_MASK) part of dataref minus the
 *        part above SKB_DATAREF_SHIFT is not 1.
 */
static inline int skb_header_cloned(const struct sk_buff *skb)
{
        int dataref, low, high;

        if (!skb->cloned)
                return 0;

        dataref = atomic_read(&skb_shinfo(skb)->dataref);
        low = dataref & SKB_DATAREF_MASK;
        high = dataref >> SKB_DATAREF_SHIFT;

        return (low - high) != 1;
}

/* If the header of @skb is cloned, call pskb_expand_head() with no extra
 * head or tail room; returns its result, or 0 when nothing had to be
 * done. May sleep if @pri allows blocking.
 */
static inline int skb_header_unclone(struct sk_buff *skb, gfp_t pri)
{
        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_header_cloned(skb))
                return 0;

        return pskb_expand_head(skb, 0, 0, pri);
}

/**
 * __skb_header_release() - allow clones to use the headroom
 * @skb: buffer to operate on
 *
 * Marks @skb as nohdr and resets dataref to one unit in the low part plus
 * one unit above SKB_DATAREF_SHIFT.
 *
 * See "DOC: dataref and headerless skbs".
 */
static inline void __skb_header_release(struct sk_buff *skb)
{
        const int released = (1 << SKB_DATAREF_SHIFT) + 1;

        skb->nohdr = 1;
        atomic_set(&skb_shinfo(skb)->dataref, released);
}


/**
 *        skb_shared - is the buffer shared
 *        @skb: buffer to check
 *
 *        Returns true if the users count of this buffer is anything other
 *        than 1, i.e. more than one person has a reference to it.
 */
static inline int skb_shared(const struct sk_buff *skb)
{
        const unsigned int users = refcount_read(&skb->users);

        return users != 1;
}

/**
 *        skb_share_check - check if buffer is shared and if so clone it
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        A buffer that is not shared is returned unchanged. A shared
 *        buffer is cloned, the caller's reference on the old copy is
 *        dropped, and the new clone with a single reference is returned.
 *        When called from interrupt context or with spinlocks held, @pri
 *        must be GFP_ATOMIC.
 *
 *        NULL is returned on a memory allocation failure; the reference
 *        on the original buffer is dropped in that case as well.
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_shared(skb))
                return skb;

        nskb = skb_clone(skb, pri);
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/*
 *        Copy shared buffers into a new sk_buff. We effectively do COW on
 *        packets to handle cases where we have a local reader and forward
 *        and a couple of other messy ones. The normal one is tcpdumping
 *        a packet that's being forwarded.
 */

/**
 *        skb_unshare - make a copy of a shared buffer
 *        @skb: buffer to check
 *        @pri: priority for memory allocation
 *
 *        A buffer that is not a clone is returned unchanged. For a clone,
 *        a new copy of the data is made with skb_copy(), the caller's
 *        reference on the old copy is dropped and the new copy, with its
 *        reference count at 1, is returned. When called with a spinlock
 *        held or from interrupt state @pri must be %GFP_ATOMIC
 *
 *        %NULL is returned on a memory allocation failure; the reference
 *        on the original buffer is dropped in that case as well.
 */
static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
                                          gfp_t pri)
{
        struct sk_buff *nskb;

        might_sleep_if(gfpflags_allow_blocking(pri));

        if (!skb_cloned(skb))
                return skb;

        nskb = skb_copy(skb, pri);

        /* Either way, let go of our shared copy */
        if (likely(nskb))
                consume_skb(skb);
        else
                kfree_skb(skb);

        return nskb;
}

/**
 *        skb_peek - look at the first buffer of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        The buffer stays on the list and no reference is taken, so the
 *        returned pointer is only as stable as the caller's locking: hold
 *        the appropriate locks or use a private queue.
 *
 *        Returns %NULL when @list_->next points back at the list head
 *        (empty list), otherwise the first element.
 */
static inline struct sk_buff *skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return first == (struct sk_buff *)list_ ? NULL : first;
}

/**
 *        __skb_peek - look at the first buffer of a non-empty &sk_buff_head
 *        @list_: list to peek at
 *
 *        Same as skb_peek() but without the empty-list check: @list_->next
 *        is returned as is, so the caller must know the list is not empty.
 */
static inline struct sk_buff *__skb_peek(const struct sk_buff_head *list_)
{
        struct sk_buff *first = list_->next;

        return first;
}

/**
 *        skb_peek_next - look at the buffer queued after a given one
 *        @skb: skb to start from
 *        @list_: list to peek at
 *
 *        Returns %NULL when @skb->next is the list head (end of list),
 *        otherwise the next element. No reference is taken, so the result
 *        is only valid while the caller's locking keeps the queue stable.
 */
static inline struct sk_buff *skb_peek_next(struct sk_buff *skb,
                const struct sk_buff_head *list_)
{
        struct sk_buff *following = skb->next;

        return following == (struct sk_buff *)list_ ? NULL : following;
}

/**
 *        skb_peek_tail - look at the last buffer of an &sk_buff_head
 *        @list_: list to peek at
 *
 *        The buffer stays on the list and no reference is taken, so the
 *        returned pointer is only as stable as the caller's locking: hold
 *        the appropriate locks or use a private queue.
 *
 *        @list_->prev is sampled once with READ_ONCE(). Returns %NULL when
 *        it points back at the list head (empty list), otherwise the tail
 *        element.
 */
static inline struct sk_buff *skb_peek_tail(const struct sk_buff_head *list_)
{
        struct sk_buff *last = READ_ONCE(list_->prev);

        return last == (struct sk_buff *)list_ ? NULL : last;
}

/**
 *        skb_queue_len        - get queue length
 *        @list_: list to measure
 *
 *        Return the qlen counter of an &sk_buff queue using a plain read.
 */
static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
{
        __u32 count = list_->qlen;

        return count;
}

/**
 *        skb_queue_len_lockless        - get queue length
 *        @list_: list to measure
 *
 *        Return the qlen counter of an &sk_buff queue, read with
 *        READ_ONCE() so that it can be used in lockless contexts.
 */
static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
{
        __u32 count = READ_ONCE(list_->qlen);

        return count;
}

/**
 *        __skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
 *        @list: queue to initialize
 *
 *        Points both link fields of @list back at the head itself and
 *        zeroes the queue length. The spinlock is left alone, which makes
 *        this suitable for re-initialising only the list aspects of an
 *        sk_buff_head, or for on-stack sk_buff_head objects where the
 *        spinlock is known to not be used.
 */
static inline void __skb_queue_head_init(struct sk_buff_head *list)
{
        struct sk_buff *self = (struct sk_buff *)list;

        list->next = self;
        list->prev = self;
        list->qlen = 0;
}

/*
 * Fully initialise an sk_buff_head: the spinlock first, then the list
 * pointers and queue length via __skb_queue_head_init().
 *
 * This function creates a split out lock class for each invocation;
 * this is needed for now since a whole lot of users of the skb-queue
 * infrastructure in drivers have different locking usage (in hardirq)
 * than the networking core (in softirq only). In the long run either the
 * network layer or drivers should need annotation to consolidate the
 * main types of usage into 3 classes.
 */
static inline void skb_queue_head_init(struct sk_buff_head *list)
{
        spin_lock_init(&list->lock);
        __skb_queue_head_init(list);
}

/*
 * Initialise @list with skb_queue_head_init() and then assign @class as
 * the lockdep class of its spinlock.
 */
static inline void skb_queue_head_init_class(struct sk_buff_head *list,
                struct lock_class_key *class)
{
        skb_queue_head_init(list);
        lockdep_set_class(&list->lock, class);
}

/*
 *        Insert an sk_buff on a list.
 *
 *        @newsk is linked between @prev and @next and the queue length of
 *        @list is incremented by one. No lock is taken here.
 *
 *        The "__skb_xxxx()" functions are the non-atomic ones that
 *        can only be called with interrupts disabled.
 */
static inline void __skb_insert(struct sk_buff *newsk,
                                struct sk_buff *prev, struct sk_buff *next,
                                struct sk_buff_head *list)
{
        /* See skb_queue_empty_lockless() and skb_peek_tail()
         * for the opposite READ_ONCE()
         */
        /* Set up the new buffer's own links before its neighbours see it. */
        WRITE_ONCE(newsk->next, next);
        WRITE_ONCE(newsk->prev, prev);
        /* @next / @prev may be the list head, hence the sk_buff_list casts. */
        WRITE_ONCE(((struct sk_buff_list *)next)->prev, newsk);
        WRITE_ONCE(((struct sk_buff_list *)prev)->next, newsk);
        WRITE_ONCE(list->qlen, list->qlen + 1);
}

/*
 * Link the whole chain of buffers on @list (from list->next through
 * list->prev) between @prev and @next. Only the four boundary links are
 * written; neither queue length is touched and the head of @list is only
 * read, so callers update qlen (and reinitialise @list if wanted)
 * themselves. The caller must ensure @list is not empty.
 */
static inline void __skb_queue_splice(const struct sk_buff_head *list,
                                      struct sk_buff *prev,
                                      struct sk_buff *next)
{
        struct sk_buff *first = list->next;
        struct sk_buff *last = list->prev;

        /* Attach the front of the chain after @prev. */
        WRITE_ONCE(first->prev, prev);
        WRITE_ONCE(prev->next, first);

        /* Attach the back of the chain before @next. */
        WRITE_ONCE(last->next, next);
        WRITE_ONCE(next->prev, last);
}

/**
 *        skb_queue_splice - join two skb lists, this is designed for stacks
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are linked in at the front of @head and the
 *        queue length of @head grows by that of @list. An empty @list is a
 *        no-op. @list itself is not modified.
 */
static inline void skb_queue_splice(const struct sk_buff_head *list,
                                    struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *)head, head->next);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are linked in at the front of @head, the
 *        queue length of @head grows by that of @list, and @list is then
 *        reset with __skb_queue_head_init(). An empty @list is a no-op.
 */
static inline void skb_queue_splice_init(struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, (struct sk_buff *)head, head->next);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        skb_queue_splice_tail - join two skb lists, each list being a queue
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        The buffers of @list are linked in at the end of @head and the
 *        queue length of @head grows by that of @list. An empty @list is a
 *        no-op. @list itself is not modified.
 */
static inline void skb_queue_splice_tail(const struct sk_buff_head *list,
                                         struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
        head->qlen += list->qlen;
}

/**
 *        skb_queue_splice_tail_init - join two skb lists and reinitialise the emptied list
 *        @list: the new list to add
 *        @head: the place to add it in the first list
 *
 *        Each of the lists is a queue. The buffers of @list are linked in
 *        at the end of @head, the queue length of @head grows by that of
 *        @list, and @list is then reset with __skb_queue_head_init().
 *        An empty @list is a no-op.
 */
static inline void skb_queue_splice_tail_init(struct sk_buff_head *list,
                                              struct sk_buff_head *head)
{
        if (skb_queue_empty(list))
                return;

        __skb_queue_splice(list, head->prev, (struct sk_buff *)head);
        head->qlen += list->qlen;
        __skb_queue_head_init(list);
}

/**
 *        __skb_queue_after - queue a buffer after a given buffer
 *        @list: list to use
 *        @prev: place after this buffer
 *        @newsk: buffer to queue
 *
 *        Queue a buffer in the middle of a list: @newsk is inserted between
 *        @prev and whatever currently follows @prev. This function takes no
 *        locks and you must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_after(struct sk_buff_head *list,
                                     struct sk_buff *prev,
                                     struct sk_buff *newsk)
{
        /* @prev may be the list head itself, hence the sk_buff_list cast. */
        __skb_insert(newsk, prev, ((struct sk_buff_list *)prev)->next, list);
}

void skb_append(struct sk_buff *old, struct sk_buff *newsk,
                struct sk_buff_head *list);

/*
 * Queue @newsk immediately before @next on @list, i.e. between the
 * current predecessor of @next and @next. Takes no locks.
 */
static inline void __skb_queue_before(struct sk_buff_head *list,
                                      struct sk_buff *next,
                                      struct sk_buff *newsk)
{
        /* @next may be the list head itself, hence the sk_buff_list cast. */
        __skb_insert(newsk, ((struct sk_buff_list *)next)->prev, next, list);
}

/**
 *        __skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the start of a list. This function takes no locks
 *        and you must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_head(struct sk_buff_head *list,
                                    struct sk_buff *newsk)
{
        /* Inserting right after the head makes @newsk the first element. */
        __skb_queue_after(list, (struct sk_buff *)list, newsk);
}
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk);

/**
 *        __skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the end of a list. This function takes no locks
 *        and you must therefore hold required locks before calling it.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
static inline void __skb_queue_tail(struct sk_buff_head *list,
                                   struct sk_buff *newsk)
{
        /* Inserting right before the head makes @newsk the last element. */
        __skb_queue_before(list, (struct sk_buff *)list, newsk);
}
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk);

/*
 * Remove an sk_buff from a list. _Must_ be called atomically, and with
 * the list known.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list);
static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        /*
         * Unlink @skb from @list without taking any lock: decrement the
         * queue length, clear the buffer's own links and join its two
         * neighbours to each other.
         */
        struct sk_buff *next, *prev;

        WRITE_ONCE(list->qlen, list->qlen - 1);
        /* Remember the neighbours before the buffer's links are cleared. */
        next           = skb->next;
        prev           = skb->prev;
        skb->next  = skb->prev = NULL;
        WRITE_ONCE(next->prev, prev);
        WRITE_ONCE(prev->next, next);
}

/**
 *        __skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Take the first buffer reported by skb_peek() off @list with
 *        __skb_unlink() and return it, or return %NULL if the list is
 *        empty. No locks are taken, so the caller must hold the
 *        appropriate ones.
 */
static inline struct sk_buff *__skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *first = skb_peek(list);

        if (!first)
                return NULL;

        __skb_unlink(first, list);
        return first;
}
struct sk_buff *skb_dequeue(struct sk_buff_head *list);

/**
 *        __skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Take the last buffer reported by skb_peek_tail() off @list with
 *        __skb_unlink() and return it, or return %NULL if the list is
 *        empty. No locks are taken, so the caller must hold the
 *        appropriate ones.
 */
static inline struct sk_buff *__skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last = skb_peek_tail(list);

        if (!last)
                return NULL;

        __skb_unlink(last, list);
        return last;
}
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list);


/* True when skb->data_len is non-zero. */
static inline bool skb_is_nonlinear(const struct sk_buff *skb)
{
        return skb->data_len != 0;
}

/* Length accounted in skb->len that is not part of skb->data_len. */
static inline unsigned int skb_headlen(const struct sk_buff *skb)
{
        unsigned int total = skb->len;
        unsigned int paged = skb->data_len;

        return total - paged;
}

static inline unsigned int __skb_pagelen(const struct sk_buff *skb)
{
        unsigned int i, len = 0;

        for (i = skb_shinfo(skb)->nr_frags - 1; (int)i >= 0; i--)
                len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
        return len;
}

/* skb_headlen() plus the combined size of all page fragments. */
static inline unsigned int skb_pagelen(const struct sk_buff *skb)
{
        unsigned int linear = skb_headlen(skb);
        unsigned int paged = __skb_pagelen(skb);

        return linear + paged;
}

/*
 * Fill in one fragment descriptor: record @netmem and @off in @frag and
 * set its size to @size through skb_frag_size_set().
 */
static inline void skb_frag_fill_netmem_desc(skb_frag_t *frag,
                                             netmem_ref netmem, int off,
                                             int size)
{
        skb_frag_size_set(frag, size);
        frag->offset = off;
        frag->netmem = netmem;
}

/*
 * Page flavour of skb_frag_fill_netmem_desc(): converts @page with
 * page_to_netmem() and forwards @off and @size unchanged.
 */
static inline void skb_frag_fill_page_desc(skb_frag_t *frag,
                                           struct page *page,
                                           int off, int size)
{
        skb_frag_fill_netmem_desc(frag, page_to_netmem(page), off, size);
}

/*
 * Fill fragment slot @i of @shinfo with @netmem, @off and @size. Only the
 * fragment descriptor is written; nr_frags is not updated here.
 */
static inline void __skb_fill_netmem_desc_noacc(struct skb_shared_info *shinfo,
                                                int i, netmem_ref netmem,
                                                int off, int size)
{
        skb_frag_fill_netmem_desc(&shinfo->frags[i], netmem, off, size);
}

/*
 * Page flavour of __skb_fill_netmem_desc_noacc(): converts @page with
 * page_to_netmem() and forwards the remaining arguments unchanged.
 */
static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
                                              int i, struct page *page,
                                              int off, int size)
{
        __skb_fill_netmem_desc_noacc(shinfo, i, page_to_netmem(page), off,
                                     size);
}

/**
 * skb_len_add - adds a number to len fields of skb
 * @skb: buffer to add len to
 * @delta: number of bytes to add
 *
 * The same @delta is applied to skb->len, skb->data_len and skb->truesize.
 */
static inline void skb_len_add(struct sk_buff *skb, int delta)
{
        skb->truesize += delta;
        skb->data_len += delta;
        skb->len += delta;
}

/**
 * __skb_fill_netmem_desc - initialise a fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: fragment index to initialise
 * @netmem: the netmem to use for this fragment
 * @off: the offset to the data within @netmem
 * @size: the length of the data
 *
 * Initialises the @i'th fragment of @skb to point to @size bytes at
 * offset @off within @netmem. If @netmem is a net_iov the skb is marked
 * unreadable; otherwise pfmemalloc is propagated from the backing page.
 *
 * Does not take any additional reference on the fragment.
 */
static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
                                          netmem_ref netmem, int off, int size)
{
        struct page *head_page;

        __skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);

        if (netmem_is_net_iov(netmem)) {
                skb->unreadable = true;
                return;
        }

        /* Propagate page pfmemalloc to the skb if we can. The problem is
         * that not all callers have unique ownership of the page but rely
         * on page_is_pfmemalloc doing the right thing(tm).
         */
        head_page = compound_head(netmem_to_page(netmem));
        if (page_is_pfmemalloc(head_page))
                skb->pfmemalloc = true;
}

/*
 * Page flavour of __skb_fill_netmem_desc(): converts @page with
 * page_to_netmem() and forwards the remaining arguments unchanged.
 */
static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
                                        struct page *page, int off, int size)
{
        __skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
}

/*
 * Initialise fragment @i of @skb via __skb_fill_netmem_desc() and then set
 * nr_frags to @i + 1, i.e. make @i the last fragment.
 */
static inline void skb_fill_netmem_desc(struct sk_buff *skb, int i,
                                        netmem_ref netmem, int off, int size)
{
        __skb_fill_netmem_desc(skb, i, netmem, off, size);
        skb_shinfo(skb)->nr_frags = i + 1;
}

/**
 * skb_fill_page_desc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data within @page
 * @size: the length of the data
 *
 * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
 * @skb to point to @size bytes at offset @off within @page. In
 * addition updates @skb such that @i is the last fragment.
 *
 * Implemented by converting @page with page_to_netmem() and calling
 * skb_fill_netmem_desc().
 *
 * Does not take any additional reference on the fragment.
 */
static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
                                      struct page *page, int off, int size)
{
        skb_fill_netmem_desc(skb, i, page_to_netmem(page), off, size);
}

/**
 * skb_fill_page_desc_noacc - initialise a paged fragment in an skb
 * @skb: buffer containing fragment to be initialised
 * @i: paged fragment index to initialise
 * @page: the page to use for this fragment
 * @off: the offset to the data with @page
 * @size: the length of the data
 *
 * Variant of skb_fill_page_desc() which does not deal with
 * pfmemalloc, if page is not owned by us.
 */
static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
                                            struct page *page, int off,
                                            int size)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        __skb_fill_page_desc_noacc(shinfo, i, page, off, size);
        shinfo->nr_frags = i + 1;
}

void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
                            int off, int size, unsigned int truesize);

/*
 * Page flavour of skb_add_rx_frag_netmem(): converts @page with
 * page_to_netmem() and forwards the remaining arguments unchanged.
 */
static inline void skb_add_rx_frag(struct sk_buff *skb, int i,
                                   struct page *page, int off, int size,
                                   unsigned int truesize)
{
        skb_add_rx_frag_netmem(skb, i, page_to_netmem(page), off, size,
                               truesize);
}

void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize);

/* BUG_ON() if skb_is_nonlinear() reports true for @skb. */
#define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))

#ifdef NET_SKBUFF_DATA_USES_OFFSET
/* Offset flavour: skb->tail is an offset relative to skb->head. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->tail;
}

/* Offset flavour: make the tail coincide with skb->data. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data - skb->head;
}

/* Offset flavour: place the tail @offset bytes past skb->data. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb_reset_tail_pointer(skb);
        skb->tail += offset;
}

#else /* NET_SKBUFF_DATA_USES_OFFSET */
/* Pointer flavour: skb->tail already is the tail pointer. */
static inline unsigned char *skb_tail_pointer(const struct sk_buff *skb)
{
        unsigned char *tail = skb->tail;

        return tail;
}

/* Pointer flavour: make the tail coincide with skb->data. */
static inline void skb_reset_tail_pointer(struct sk_buff *skb)
{
        skb->tail = skb->data;
}

/* Pointer flavour: place the tail @offset bytes past skb->data. */
static inline void skb_set_tail_pointer(struct sk_buff *skb, const int offset)
{
        skb->tail = skb->data + offset;
}

#endif /* NET_SKBUFF_DATA_USES_OFFSET */

/*
 * Debug aid: with CONFIG_DEBUG_NET, warn once if skb->len is zero and dump
 * the skb through DO_ONCE_LITE(). Compiles to nothing otherwise.
 */
static inline void skb_assert_len(struct sk_buff *skb)
{
#ifdef CONFIG_DEBUG_NET
        if (WARN_ONCE(!skb->len, "%s\n", __func__))
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
#endif /* CONFIG_DEBUG_NET */
}

/*
 * skb_might_realloc() is provided out of line when CONFIG_FAIL_SKB_REALLOC
 * is defined; otherwise it is an empty inline stub.
 */
#if defined(CONFIG_FAIL_SKB_REALLOC)
void skb_might_realloc(struct sk_buff *skb);
#else
static inline void skb_might_realloc(struct sk_buff *skb) {}
#endif

/*
 *        Add data to an sk_buff
 */
void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len);
void *skb_put(struct sk_buff *skb, unsigned int len);
/*
 * Extend the data area of a linear @skb by @len bytes at the tail and
 * return a pointer to the start of the added region (the old tail).
 * No bounds check against the end of the buffer is made here.
 */
static inline void *__skb_put(struct sk_buff *skb, unsigned int len)
{
        void *old_tail;

        SKB_LINEAR_ASSERT(skb);

        old_tail = skb_tail_pointer(skb);
        skb->len  += len;
        skb->tail += len;
        return old_tail;
}

/* __skb_put() @len bytes and zero them; returns the start of the region. */
static inline void *__skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *region = __skb_put(skb, len);

        return memset(region, 0, len);
}

/*
 * __skb_put() @len bytes and copy @len bytes from @data into them;
 * returns the start of the region.
 */
static inline void *__skb_put_data(struct sk_buff *skb, const void *data,
                                   unsigned int len)
{
        void *region = __skb_put(skb, len);

        return memcpy(region, data, len);
}

/* __skb_put() a single byte and store @val in it. */
static inline void __skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = __skb_put(skb, 1);

        *slot = val;
}

/* skb_put() @len bytes and zero them; returns the start of the region. */
static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len)
{
        void *region = skb_put(skb, len);

        return memset(region, 0, len);
}

/*
 * skb_put() @len bytes and copy @len bytes from @data into them;
 * returns the start of the region.
 */
static inline void *skb_put_data(struct sk_buff *skb, const void *data,
                                 unsigned int len)
{
        void *region = skb_put(skb, len);

        return memcpy(region, data, len);
}

/* skb_put() a single byte and store @val in it. */
static inline void skb_put_u8(struct sk_buff *skb, u8 val)
{
        u8 *slot = skb_put(skb, 1);

        *slot = val;
}

void *skb_push(struct sk_buff *skb, unsigned int len);
/*
 * Grow the data area of @skb by @len bytes at the front: skb->data moves
 * back by @len and skb->len grows by @len. Returns the new skb->data.
 * No check against the start of the buffer is made here.
 */
static inline void *__skb_push(struct sk_buff *skb, unsigned int len)
{
        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        skb->len  += len;
        skb->data -= len;
        return skb->data;
}

void *skb_pull(struct sk_buff *skb, unsigned int len);
/*
 * Remove @len bytes from the front of @skb: skb->len shrinks by @len and
 * skb->data advances by @len. Returns the new skb->data.
 *
 * BUG()s if the pull would leave skb->len smaller than skb->data_len.
 * With CONFIG_DEBUG_NET the length is restored and the skb is dumped
 * before the BUG().
 */
static inline void *__skb_pull(struct sk_buff *skb, unsigned int len)
{
        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);

        skb->len -= len;
        if (unlikely(skb->len < skb->data_len)) {
#if defined(CONFIG_DEBUG_NET)
                /* Undo the subtraction so the dump shows the original len. */
                skb->len += len;
                pr_err("__skb_pull(len=%u)\n", len);
                skb_dump(KERN_ERR, skb, false);
#endif
                BUG();
        }

        skb->data += len;
        return skb->data;
}

/*
 * Checked pull: returns NULL without touching @skb when @len exceeds
 * skb->len, otherwise the result of __skb_pull().
 */
static inline void *skb_pull_inline(struct sk_buff *skb, unsigned int len)
{
        if (unlikely(len > skb->len))
                return NULL;

        return __skb_pull(skb, len);
}

void *skb_pull_data(struct sk_buff *skb, size_t len);

void *__pskb_pull_tail(struct sk_buff *skb, int delta);

/*
 * Make sure at least @len bytes are available in the linear part of @skb.
 *
 * Returns SKB_NOT_DROPPED_YET when @len already fits in skb_headlen() or
 * when __pskb_pull_tail() manages to pull in the missing bytes,
 * SKB_DROP_REASON_PKT_TOO_SMALL when @len exceeds skb->len, and
 * SKB_DROP_REASON_NOMEM when __pskb_pull_tail() fails.
 */
static inline enum skb_drop_reason
pskb_may_pull_reason(struct sk_buff *skb, unsigned int len)
{
        unsigned int linear;

        DEBUG_NET_WARN_ON_ONCE(len > INT_MAX);
        skb_might_realloc(skb);

        linear = skb_headlen(skb);
        if (likely(len <= linear))
                return SKB_NOT_DROPPED_YET;

        if (unlikely(len > skb->len))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        if (unlikely(!__pskb_pull_tail(skb, len - linear)))
                return SKB_DROP_REASON_NOMEM;

        return SKB_NOT_DROPPED_YET;
}

/*
 * Boolean wrapper around pskb_may_pull_reason(): true only when it
 * returns SKB_NOT_DROPPED_YET.
 */
static inline bool pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
        return pskb_may_pull_reason(skb, len) == SKB_NOT_DROPPED_YET;
}

/*
 * Pull @len bytes from the front of @skb after pskb_may_pull() has
 * confirmed they are available. Returns NULL if that check fails,
 * otherwise the new skb->data.
 */
static inline void *pskb_pull(struct sk_buff *skb, unsigned int len)
{
        if (!pskb_may_pull(skb, len))
                return NULL;

        skb->len -= len;
        skb->data += len;
        return skb->data;
}

void skb_condense(struct sk_buff *skb);

/**
 *        skb_headroom - bytes at buffer head
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the head of an &sk_buff,
 *        computed as the distance from skb->head to skb->data.
 */
static inline unsigned int skb_headroom(const struct sk_buff *skb)
{
        return skb->data - skb->head;
}

/**
 *        skb_tailroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff:
 *        0 for a nonlinear skb, otherwise skb->end minus skb->tail.
 */
static inline int skb_tailroom(const struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb))
                return 0;

        return skb->end - skb->tail;
}

/**
 *        skb_availroom - bytes at buffer end
 *        @skb: buffer to check
 *
 *        Return the number of bytes of free space at the tail of an sk_buff
 *        allocated by sk_stream_alloc(): 0 for a nonlinear skb, otherwise
 *        skb->end minus skb->tail minus skb->reserved_tailroom.
 */
static inline int skb_availroom(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) ?
               0 : skb->end - skb->tail - skb->reserved_tailroom;
}

/**
 *        skb_reserve - adjust headroom
 *        @skb: buffer to alter
 *        @len: bytes to move
 *
 *        Increase the headroom of an empty &sk_buff by reducing the tail
 *        room: both skb->data and skb->tail are advanced by @len. This is
 *        only allowed for an empty buffer.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
        skb->tail += len;
        skb->data += len;
}

/**
 *        skb_tailroom_reserve - adjust reserved_tailroom
 *        @skb: buffer to alter
 *        @mtu: maximum amount of headlen permitted
 *        @needed_tailroom: minimum amount of reserved_tailroom
 *
 *        Set reserved_tailroom so that headlen can be as large as possible but
 *        not larger than mtu and tailroom cannot be smaller than
 *        needed_tailroom.
 *        The required headroom should already have been reserved before using
 *        this function. @skb must be linear.
 */
static inline void skb_tailroom_reserve(struct sk_buff *skb, unsigned int mtu,
                                        unsigned int needed_tailroom)
{
        int room;

        SKB_LINEAR_ASSERT(skb);

        room = skb_tailroom(skb);

        /* Either use at most mtu, or use up to all available space. */
        skb->reserved_tailroom = (mtu < room - needed_tailroom) ?
                                 room - mtu : needed_tailroom;
}

/*
 * Values stored in skb->inner_protocol_type by skb_set_inner_protocol()
 * and skb_set_inner_ipproto() respectively.
 */
#define ENCAP_TYPE_ETHER        0
#define ENCAP_TYPE_IPPROTO        1

/* Record @protocol as the inner protocol and tag it ENCAP_TYPE_ETHER. */
static inline void skb_set_inner_protocol(struct sk_buff *skb,
                                          __be16 protocol)
{
        skb->inner_protocol_type = ENCAP_TYPE_ETHER;
        skb->inner_protocol = protocol;
}

/* Record @ipproto as the inner IP protocol and tag it ENCAP_TYPE_IPPROTO. */
static inline void skb_set_inner_ipproto(struct sk_buff *skb,
                                         __u8 ipproto)
{
        skb->inner_protocol_type = ENCAP_TYPE_IPPROTO;
        skb->inner_ipproto = ipproto;
}

/*
 * Copy the current mac, network and transport header offsets into their
 * inner_* counterparts.
 */
static inline void skb_reset_inner_headers(struct sk_buff *skb)
{
        skb->inner_transport_header = skb->transport_header;
        skb->inner_network_header = skb->network_header;
        skb->inner_mac_header = skb->mac_header;
}

/*
 * Non-zero unless skb->mac_header holds the all-ones value, which is what
 * skb_unset_mac_header() stores to mark the MAC header as not set.
 */
static inline int skb_mac_header_was_set(const struct sk_buff *skb)
{
        return skb->mac_header != (typeof(skb->mac_header))~0U;
}

/*
 * Recompute skb->mac_len as network_header - mac_header. If the MAC header
 * was never set, the debug check fires and mac_len is set to 0.
 */
static inline void skb_reset_mac_len(struct sk_buff *skb)
{
        if (skb_mac_header_was_set(skb)) {
                skb->mac_len = skb->network_header - skb->mac_header;
        } else {
                DEBUG_NET_WARN_ON_ONCE(1);
                skb->mac_len = 0;
        }
}

/* Pointer to the inner transport header: skb->head plus the stored offset. */
static inline unsigned char *skb_inner_transport_header(const struct sk_buff
                                                        *skb)
{
        unsigned char *base = skb->head;

        return base + skb->inner_transport_header;
}

/* Distance from skb->data to the inner transport header. */
static inline int skb_inner_transport_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_inner_transport_header(skb);

        return hdr - skb->data;
}

static inline void skb_reset_inner_transport_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_transport_header))offset);
        skb->inner_transport_header = offset;
}

/* Place the inner transport header @offset bytes from skb->data. */
static inline void skb_set_inner_transport_header(struct sk_buff *skb,
                                                   const int offset)
{
        skb_reset_inner_transport_header(skb);
        skb->inner_transport_header += offset;
}

/* Pointer to the inner network header: skb->head plus the stored offset. */
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->inner_network_header;
}

static inline void skb_reset_inner_network_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_network_header))offset);
        skb->inner_network_header = offset;
}

/* Place the inner network header @offset bytes from skb->data. */
static inline void skb_set_inner_network_header(struct sk_buff *skb,
                                                const int offset)
{
        skb_reset_inner_network_header(skb);
        skb->inner_network_header += offset;
}

/*
 * True when the stored inner network header offset is greater than zero;
 * an offset of 0 is treated as "not set".
 */
static inline bool skb_inner_network_header_was_set(const struct sk_buff *skb)
{
        return skb->inner_network_header > 0;
}

/* Pointer to the inner MAC header: skb->head plus the stored offset. */
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->inner_mac_header;
}

static inline void skb_reset_inner_mac_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->inner_mac_header))offset);
        skb->inner_mac_header = offset;
}

/* Place the inner MAC header @offset bytes from skb->data. */
static inline void skb_set_inner_mac_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_inner_mac_header(skb);
        skb->inner_mac_header += offset;
}
/*
 * True unless skb->transport_header holds the all-ones value, which is
 * treated here as "not set".
 */
static inline bool skb_transport_header_was_set(const struct sk_buff *skb)
{
        return skb->transport_header != (typeof(skb->transport_header))~0U;
}

/*
 * Pointer to the transport header: skb->head plus the stored offset.
 * A debug check flags callers that use it before the header was set.
 */
static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
        return base + skb->transport_header;
}

static inline void skb_reset_transport_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->transport_header))offset);
        skb->transport_header = offset;
}

/* Place the transport header @offset bytes from skb->data. */
static inline void skb_set_transport_header(struct sk_buff *skb,
                                            const int offset)
{
        skb_reset_transport_header(skb);
        skb->transport_header += offset;
}

/* Pointer to the network header: skb->head plus the stored offset. */
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        return base + skb->network_header;
}

static inline void skb_reset_network_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->network_header))offset);
        skb->network_header = offset;
}

/* Place the network header @offset bytes from skb->data. */
static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
{
        skb_reset_network_header(skb);
        skb->network_header += offset;
}

/*
 * Pointer to the MAC header: skb->head plus the stored offset.
 * A debug check flags callers that use it before the header was set.
 */
static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
{
        unsigned char *base = skb->head;

        DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
        return base + skb->mac_header;
}

/* Distance from skb->data to the MAC header. */
static inline int skb_mac_offset(const struct sk_buff *skb)
{
        const unsigned char *hdr = skb_mac_header(skb);

        return hdr - skb->data;
}

/*
 * Length of the MAC header, computed as network_header - mac_header.
 * A debug check flags callers that use it before the MAC header was set.
 */
static inline u32 skb_mac_header_len(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(!skb_mac_header_was_set(skb));
        return skb->network_header - skb->mac_header;
}

/*
 * Mark the MAC header as not set by storing the all-ones value that
 * skb_mac_header_was_set() tests for.
 */
static inline void skb_unset_mac_header(struct sk_buff *skb)
{
        skb->mac_header = (typeof(skb->mac_header))~0U;
}

static inline void skb_reset_mac_header(struct sk_buff *skb)
{
        long offset = skb->data - skb->head;

        DEBUG_NET_WARN_ON_ONCE(offset != (typeof(skb->mac_header))offset);
        skb->mac_header = offset;
}

/* Place the MAC header @offset bytes from skb->data. */
static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
{
        skb_reset_mac_header(skb);
        skb->mac_header += offset;
}

/* Make the MAC header offset equal to the network header offset. */
static inline void skb_pop_mac_header(struct sk_buff *skb)
{
        skb->mac_header = skb->network_header;
}

/*
 * If the transport header is not set yet, run the basic flow dissector on
 * @skb and, when it succeeds, set the transport header to the dissected
 * keys.control.thoff. Does nothing if the header is already set or the
 * dissection fails.
 */
static inline void skb_probe_transport_header(struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        if (skb_transport_header_was_set(skb))
                return;

        if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                              NULL, 0, 0, 0, 0))
                return;

        skb_set_transport_header(skb, keys.control.thoff);
}

/*
 * If a MAC header is set, relocate it so that it occupies the skb->mac_len
 * bytes immediately before skb->data: the header offset is moved first and
 * the bytes are then memmove()d from the old location. Does nothing when
 * no MAC header is set.
 */
static inline void skb_mac_header_rebuild(struct sk_buff *skb)
{
        const unsigned char *old_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        old_mac = skb_mac_header(skb);
        skb_set_mac_header(skb, -skb->mac_len);
        memmove(skb_mac_header(skb), old_mac, skb->mac_len);
}

/* Move the full mac header up to current network_header.
 * Leaves skb->data pointing at offset skb->mac_len into the mac_header.
 * Must be provided the complete mac header length.
 * Does nothing when no MAC header is set.
 */
static inline void skb_mac_header_rebuild_full(struct sk_buff *skb, u32 full_mac_len)
{
        const unsigned char *old_mac;

        if (!skb_mac_header_was_set(skb))
                return;

        old_mac = skb_mac_header(skb);
        skb_set_mac_header(skb, -full_mac_len);
        memmove(skb_mac_header(skb), old_mac, full_mac_len);

        /* Expose the part of the header that lies beyond skb->mac_len. */
        __skb_push(skb, full_mac_len - skb->mac_len);
}

/* Return skb->csum_start minus the skb's current headroom. The result can
 * be negative; the post-pull checksum helpers in this file test for that.
 */
static inline int skb_checksum_start_offset(const struct sk_buff *skb)
{
        return skb->csum_start - skb_headroom(skb);
}

/* Return a pointer to the byte at offset skb->csum_start from skb->head. */
static inline unsigned char *skb_checksum_start(const struct sk_buff *skb)
{
        return &skb->head[skb->csum_start];
}

/* Return the distance in bytes from skb->data to the transport header. */
static inline int skb_transport_offset(const struct sk_buff *skb)
{
        return skb_transport_header(skb) - skb->data;
}

/* Return the number of bytes between the network header and the transport
 * header. DEBUG_NET_WARN_ON_ONCE() fires if the transport header was never
 * set.
 */
static inline u32 skb_network_header_len(const struct sk_buff *skb)
{
        u32 nh_len;

        DEBUG_NET_WARN_ON_ONCE(!skb_transport_header_was_set(skb));
        nh_len = skb->transport_header - skb->network_header;
        return nh_len;
}

/* Return the number of bytes between the inner network header and the
 * inner transport header. No check is made that either offset was set.
 */
static inline u32 skb_inner_network_header_len(const struct sk_buff *skb)
{
        return skb->inner_transport_header - skb->inner_network_header;
}

/* Return the distance in bytes from skb->data to the network header. */
static inline int skb_network_offset(const struct sk_buff *skb)
{
        return skb_network_header(skb) - skb->data;
}

/* Return the distance in bytes from skb->data to the inner network header. */
static inline int skb_inner_network_offset(const struct sk_buff *skb)
{
        return skb_inner_network_header(skb) - skb->data;
}

/* Like pskb_may_pull_reason(), but @len is counted from the network header
 * instead of from skb->data.
 */
static inline enum skb_drop_reason
pskb_network_may_pull_reason(struct sk_buff *skb, unsigned int len)
{
        unsigned int from_data = skb_network_offset(skb) + len;

        return pskb_may_pull_reason(skb, from_data);
}

/* Return 1 when pskb_network_may_pull_reason() reports SKB_NOT_DROPPED_YET
 * for @len bytes past the network header, 0 otherwise.
 */
static inline int pskb_network_may_pull(struct sk_buff *skb, unsigned int len)
{
        if (pskb_network_may_pull_reason(skb, len) != SKB_NOT_DROPPED_YET)
                return 0;

        return 1;
}

/*
 * CPUs often take a performance hit when accessing unaligned memory
 * locations. The actual performance hit varies: it can be small if the
 * hardware handles it, or large if we have to take an exception and fix it
 * in software.
 *
 * Since an Ethernet header is 14 bytes, network drivers often end up with
 * the IP header at an unaligned offset. The IP header can be aligned by
 * shifting the start of the packet by 2 bytes. Drivers should do this
 * with:
 *
 * skb_reserve(skb, NET_IP_ALIGN);
 *
 * The downside to this alignment of the IP header is that the DMA is now
 * unaligned. On some architectures the cost of an unaligned DMA is high
 * and this cost outweighs the gains made by aligning the IP header.
 *
 * Since this trade-off varies between architectures, we allow NET_IP_ALIGN
 * to be overridden: the default of 2 below only applies when NET_IP_ALIGN
 * has not already been defined.
 */
#ifndef NET_IP_ALIGN
#define NET_IP_ALIGN        2
#endif

/*
 * The networking layer reserves some headroom in skb data (via
 * dev_alloc_skb). This is used to avoid having to reallocate skb data when
 * the header has to grow. In the default case, if the header has to grow
 * 32 bytes or less we avoid the reallocation.
 *
 * Unfortunately this headroom changes the DMA alignment of the resulting
 * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
 * on some architectures. An architecture can override this value,
 * perhaps setting it to a cacheline in size (since that will maintain
 * cacheline alignment of the DMA). It must be a power of 2.
 *
 * Various parts of the networking layer expect at least 32 bytes of
 * headroom; you should not reduce this.
 *
 * Using max(32, L1_CACHE_BYTES) makes sense (especially with RPS)
 * to reduce the average number of cache lines per packet.
 * get_rps_cpu(), for example, only accesses one 64-byte aligned block:
 * NET_IP_ALIGN(2) + ethernet_header(14) + IP_header(20/40) + ports(8)
 */
#ifndef NET_SKB_PAD
#define NET_SKB_PAD        max(32, L1_CACHE_BYTES)
#endif

/* Declaration only. __pskb_trim() calls this when skb->data_len is non-zero. */
int ___pskb_trim(struct sk_buff *skb, unsigned int len);

/* Set skb->len and the tail pointer of a linear skb to @len. A non-linear
 * skb triggers WARN_ON() and is left untouched.
 */
static inline void __skb_set_length(struct sk_buff *skb, unsigned int len)
{
        if (!WARN_ON(skb_is_nonlinear(skb))) {
                skb->len = len;
                skb_set_tail_pointer(skb, len);
        }
}

/* Set the length of @skb to @len via __skb_set_length(). No comparison with
 * the current length is made here.
 */
static inline void __skb_trim(struct sk_buff *skb, unsigned int len)
{
        __skb_set_length(skb, len);
}

/* Declaration only.
 * NOTE(review): presumably the checked counterpart of __skb_trim() - confirm
 * against the definition.
 */
void skb_trim(struct sk_buff *skb, unsigned int len);

/* Trim @skb to @len. An skb without paged data (skb->data_len == 0) is
 * trimmed in place and 0 is returned; otherwise the work and the return
 * value are delegated to ___pskb_trim().
 */
static inline int __pskb_trim(struct sk_buff *skb, unsigned int len)
{
        if (!skb->data_len) {
                __skb_trim(skb, len);
                return 0;
        }

        return ___pskb_trim(skb, len);
}

/* Trim @skb to @len if it is currently longer than that. Returns 0 when no
 * trimming is needed, otherwise the result of __pskb_trim().
 */
static inline int pskb_trim(struct sk_buff *skb, unsigned int len)
{
        skb_might_realloc(skb);

        if (len >= skb->len)
                return 0;

        return __pskb_trim(skb, len);
}

/**
 *        pskb_trim_unique - remove end from a paged unique (not cloned) buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        This is identical to pskb_trim except that the caller knows that
 *        the skb is not cloned so we should never get an error due to out-
 *        of-memory. Any error returned by pskb_trim() is therefore treated
 *        as a bug (BUG_ON).
 */
static inline void pskb_trim_unique(struct sk_buff *skb, unsigned int len)
{
        int err = pskb_trim(skb, len);
        BUG_ON(err);
}

/* Grow @skb to @len bytes. When the tailroom is smaller than the required
 * increase, the head is expanded by the missing amount with GFP_ATOMIC
 * first. Returns 0 on success or the error from pskb_expand_head().
 */
static inline int __skb_grow(struct sk_buff *skb, unsigned int len)
{
        unsigned int extra = len - skb->len;
        int err = 0;

        if (skb_tailroom(skb) < extra)
                err = pskb_expand_head(skb, 0, extra - skb_tailroom(skb),
                                       GFP_ATOMIC);
        if (err)
                return err;

        __skb_set_length(skb, len);
        return 0;
}

/**
 *        skb_orphan - orphan a buffer
 *        @skb: buffer to orphan
 *
 *        If the buffer has a destructor, the destructor is called and both
 *        the destructor and the owning socket pointer are cleared, so the
 *        buffer keeps existing but is no longer charged to its former
 *        owner. A buffer without a destructor must not have an owner
 *        (BUG_ON otherwise).
 */
static inline void skb_orphan(struct sk_buff *skb)
{
        if (!skb->destructor) {
                BUG_ON(skb->sk);
                return;
        }

        skb->destructor(skb);
        skb->destructor = NULL;
        skb->sk = NULL;
}

/**
 *        skb_orphan_frags - orphan the frags contained in a buffer
 *        @skb: buffer to orphan frags from
 *        @gfp_mask: allocation mask for replacement pages
 *
 *        For each frag in the SKB which needs a destructor (i.e. has an
 *        owner) create a copy of that frag and release the original
 *        page by calling the destructor.
 *
 *        Returns 0 without copying when the skb is not zerocopy or when
 *        SKBFL_DONT_ORPHAN is set; otherwise returns the result of
 *        skb_copy_ubufs().
 */
static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (unlikely(skb_zcopy(skb)) &&
            !(skb_shinfo(skb)->flags & SKBFL_DONT_ORPHAN))
                return skb_copy_ubufs(skb, gfp_mask);

        return 0;
}

/* Frags must be orphaned, even if refcounted, if the skb might loop to the
 * rx path; unlike skb_orphan_frags(), SKBFL_DONT_ORPHAN is not consulted.
 */
static inline int skb_orphan_frags_rx(struct sk_buff *skb, gfp_t gfp_mask)
{
        if (unlikely(skb_zcopy(skb)))
                return skb_copy_ubufs(skb, gfp_mask);

        return 0;
}

/**
 *        __skb_queue_purge_reason - empty a list
 *        @list: list to empty
 *        @reason: drop reason
 *
 *        Dequeue every buffer on an &sk_buff list and drop one reference
 *        on each, reporting @reason. This function does not take the list
 *        lock; the caller must hold the relevant locks to use it.
 */
static inline void __skb_queue_purge_reason(struct sk_buff_head *list,
                                            enum skb_drop_reason reason)
{
        struct sk_buff *skb;

        for (skb = __skb_dequeue(list); skb; skb = __skb_dequeue(list))
                kfree_skb_reason(skb, reason);
}

/* Lockless purge of @list, reporting SKB_DROP_REASON_QUEUE_PURGE for every
 * buffer; see __skb_queue_purge_reason() for the locking requirements.
 */
static inline void __skb_queue_purge(struct sk_buff_head *list)
{
        __skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE);
}

/* Declaration only.
 * NOTE(review): presumably the locked variant of __skb_queue_purge_reason()
 * - confirm against the definition.
 */
void skb_queue_purge_reason(struct sk_buff_head *list,
                            enum skb_drop_reason reason);

/* Purge @list via skb_queue_purge_reason(), reporting
 * SKB_DROP_REASON_QUEUE_PURGE as the drop reason.
 */
static inline void skb_queue_purge(struct sk_buff_head *list)
{
        skb_queue_purge_reason(list, SKB_DROP_REASON_QUEUE_PURGE);
}

/* Declarations only; no bodies are provided in this part of the file. */
unsigned int skb_rbtree_purge(struct rb_root *root);
void skb_errqueue_purge(struct sk_buff_head *list);

/* Backend of netdev_alloc_frag() and netdev_alloc_frag_align(), which pass
 * ~0u and -align respectively as @align_mask.
 */
void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);

/**
 * netdev_alloc_frag - allocate a page fragment
 * @fragsz: fragment size
 *
 * Allocates a frag from a page for receive buffer.
 * Uses GFP_ATOMIC allocations.
 *
 * Calls __netdev_alloc_frag_align() with an all-ones alignment mask (~0u),
 * which presumably requests no additional alignment - confirm against the
 * callee.
 *
 * Return: whatever __netdev_alloc_frag_align() returns.
 */
static inline void *netdev_alloc_frag(unsigned int fragsz)
{
        return __netdev_alloc_frag_align(fragsz, ~0u);
}

/* Allocate a page fragment of @fragsz bytes with alignment @align.
 * Warns once if @align is not a power of two; -align is what gets passed
 * down as the alignment mask.
 */
static inline void *netdev_alloc_frag_align(unsigned int fragsz,
                                            unsigned int align)
{
        WARN_ON_ONCE(!is_power_of_2(align));
        return __netdev_alloc_frag_align(fragsz, -align);
}

/* Declaration only. Backend of the netdev_alloc_skb()/dev_alloc_skb()
 * family of inline helpers in this file.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
                                   gfp_t gfp_mask);

/**
 *        netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @length: length to allocate
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has unspecified headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory. Although this function
 *        allocates memory it can be called from an interrupt: the
 *        allocation is made with GFP_ATOMIC.
 */
static inline struct sk_buff *netdev_alloc_skb(struct net_device *dev,
                                               unsigned int length)
{
        return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
}

/* Legacy helper around __netdev_alloc_skb(): same allocation with a NULL
 * device and a caller-supplied @gfp_mask.
 */
static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
                                              gfp_t gfp_mask)
{
        return __netdev_alloc_skb(NULL, length, gfp_mask);
}

/* Legacy helper around netdev_alloc_skb(): same allocation with a NULL
 * device.
 */
static inline struct sk_buff *dev_alloc_skb(unsigned int length)
{
        return netdev_alloc_skb(NULL, length);
}


/* Allocate an skb with room for @length bytes plus NET_IP_ALIGN, and reserve
 * NET_IP_ALIGN bytes of headroom when that constant is non-zero.
 * Returns NULL if the underlying allocation fails.
 */
static inline struct sk_buff *__netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length, gfp_t gfp)
{
        struct sk_buff *skb;

        skb = __netdev_alloc_skb(dev, length + NET_IP_ALIGN, gfp);
        if (!skb || !NET_IP_ALIGN)
                return skb;

        skb_reserve(skb, NET_IP_ALIGN);
        return skb;
}

/* GFP_ATOMIC variant of __netdev_alloc_skb_ip_align(). */
static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev,
                unsigned int length)
{
        return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC);
}

/* Release the page fragment at @addr by handing it to page_frag_free(). */
static inline void skb_free_frag(void *addr)
{
        page_frag_free(addr);
}

/* Declaration only. Backend of napi_alloc_frag() and
 * napi_alloc_frag_align(), which pass ~0u and -align as @align_mask.
 */
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);

/* Allocate a page fragment of @fragsz bytes through
 * __napi_alloc_frag_align() with an all-ones alignment mask (~0u).
 */
static inline void *napi_alloc_frag(unsigned int fragsz)
{
        return __napi_alloc_frag_align(fragsz, ~0u);
}

/* Allocate a page fragment of @fragsz bytes with alignment @align.
 * Warns once if @align is not a power of two; -align is what gets passed
 * down as the alignment mask.
 */
static inline void *napi_alloc_frag_align(unsigned int fragsz,
                                          unsigned int align)
{
        WARN_ON_ONCE(!is_power_of_2(align));
        return __napi_alloc_frag_align(fragsz, -align);
}

/* NAPI skb allocation and release helpers. Declarations only; no bodies
 * are provided in this part of the file.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int length);
void napi_consume_skb(struct sk_buff *skb, int budget);

void napi_skb_free_stolen_head(struct sk_buff *skb);
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason);

/**
 * __dev_alloc_pages - allocate page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 * @order: size of the allocation
 *
 * Allocate a new page of the given @order. __GFP_COMP and __GFP_MEMALLOC
 * are always added to @gfp_mask, and no NUMA node preference is expressed
 * (NUMA_NO_NODE).
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_pages_noprof(gfp_t gfp_mask,
                                             unsigned int order)
{
        /* This piece of code contains several assumptions.
         * 1.  This is for device Rx, therefore a cold page is preferred.
         * 2.  The expectation is the user wants a compound page.
         * 3.  If requesting a order 0 page it will not be compound
         *     due to the check to see if order has a value in prep_new_page
         * 4.  __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to
         *     code in gfp_to_alloc_flags that should be enforcing this.
         */
        return alloc_pages_node_noprof(NUMA_NO_NODE,
                                       gfp_mask | __GFP_COMP | __GFP_MEMALLOC,
                                       order);
}
/* Wraps __dev_alloc_pages_noprof() in alloc_hooks(). */
#define __dev_alloc_pages(...)        alloc_hooks(__dev_alloc_pages_noprof(__VA_ARGS__))

/*
 * This specialized allocator has to be a macro for its allocations to be
 * accounted separately (to have a separate alloc_tag).
 *
 * Allocates with GFP_ATOMIC | __GFP_NOWARN.
 */
#define dev_alloc_pages(_order) __dev_alloc_pages(GFP_ATOMIC | __GFP_NOWARN, _order)

/**
 * __dev_alloc_page - allocate a page for network Rx
 * @gfp_mask: allocation priority. Set __GFP_NOMEMALLOC if not for network Rx
 *
 * Allocate a new page. This is the order-0 case of
 * __dev_alloc_pages_noprof().
 *
 * %NULL is returned if there is no free memory.
 */
static inline struct page *__dev_alloc_page_noprof(gfp_t gfp_mask)
{
        return __dev_alloc_pages_noprof(gfp_mask, 0);
}
/* Wraps __dev_alloc_page_noprof() in alloc_hooks(). */
#define __dev_alloc_page(...)        alloc_hooks(__dev_alloc_page_noprof(__VA_ARGS__))

/*
 * This specialized allocator has to be a macro for its allocations to be
 * accounted separately (to have a separate alloc_tag).
 *
 * Order-0 form of dev_alloc_pages().
 */
#define dev_alloc_page()        dev_alloc_pages(0)

/**
 * dev_page_is_reusable - check whether a page can be reused for network Rx
 * @page: the page to test
 *
 * A page shouldn't be considered for reusing/recycling if it was allocated
 * under memory pressure (pfmemalloc) or at a distant memory node, i.e. a
 * node other than numa_mem_id().
 *
 * Returns: false if this page should be returned to page allocator, true
 * otherwise.
 */
static inline bool dev_page_is_reusable(const struct page *page)
{
        bool reusable = page_to_nid(page) == numa_mem_id() &&
                        !page_is_pfmemalloc(page);

        return likely(reusable);
}

/**
 *        skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page
 *        @page: The page whose pfmemalloc status is inspected
 *        @skb: The skb that may need pfmemalloc set
 *
 *        Sets skb->pfmemalloc when @page is a pfmemalloc page; the flag
 *        is never cleared here.
 */
static inline void skb_propagate_pfmemalloc(const struct page *page,
                                            struct sk_buff *skb)
{
        if (!page_is_pfmemalloc(page))
                return;

        skb->pfmemalloc = true;
}

/**
 * skb_frag_off() - Returns the offset of a skb fragment
 * @frag: the paged fragment
 *
 * Return: the current value of @frag->offset.
 */
static inline unsigned int skb_frag_off(const skb_frag_t *frag)
{
        return frag->offset;
}

/**
 * skb_frag_off_add() - Increments the offset of a skb fragment by @delta
 * @frag: skb fragment
 * @delta: value to add (signed, so a negative value decreases the offset)
 */
static inline void skb_frag_off_add(skb_frag_t *frag, int delta)
{
        frag->offset += delta;
}

/**
 * skb_frag_off_set() - Sets the offset of a skb fragment
 * @frag: skb fragment
 * @offset: new offset of the fragment; replaces the previous value
 */
static inline void skb_frag_off_set(skb_frag_t *frag, unsigned int offset)
{
        frag->offset = offset;
}

/**
 * skb_frag_off_copy() - Sets the offset of a skb fragment from another fragment
 * @fragto: skb fragment where offset is set
 * @fragfrom: skb fragment offset is copied from
 *
 * Only the offset is copied; no other field of @fragto is touched.
 */
static inline void skb_frag_off_copy(skb_frag_t *fragto,
                                     const skb_frag_t *fragfrom)
{
        fragto->offset = fragfrom->offset;
}

/* Return: true if the skb_frag contains a net_iov, as determined by
 * netmem_is_net_iov() on the frag's netmem reference.
 */
static inline bool skb_frag_is_net_iov(const skb_frag_t *frag)
{
        return netmem_is_net_iov(frag->netmem);
}

/**
 * skb_frag_net_iov - retrieve the net_iov referred to by fragment
 * @frag: the fragment
 *
 * Return: the &struct net_iov associated with @frag, or NULL if this frag
 * does not contain a net_iov.
 */
static inline struct net_iov *skb_frag_net_iov(const skb_frag_t *frag)
{
        return skb_frag_is_net_iov(frag) ? netmem_to_net_iov(frag->netmem)
                                         : NULL;
}

/**
 * skb_frag_page - retrieve the page referred to by a paged fragment
 * @frag: the paged fragment
 *
 * Return: the &struct page associated with @frag, or NULL if this frag
 * contains a net_iov and therefore has no associated page.
 */
static inline struct page *skb_frag_page(const skb_frag_t *frag)
{
        return skb_frag_is_net_iov(frag) ? NULL
                                         : netmem_to_page(frag->netmem);
}

/**
 * skb_frag_netmem - retrieve the netmem referred to by a fragment
 * @frag: the fragment
 *
 * Return: the &netmem_ref associated with @frag, unmodified and without
 * checking whether it refers to a page or a net_iov.
 */
static inline netmem_ref skb_frag_netmem(const skb_frag_t *frag)
{
        return frag->netmem;
}

/* Declarations only; no bodies are provided in this part of the file.
 * Both take the skb by reference (@pskb is a pointer to the skb pointer).
 */
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
                    unsigned int headroom);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
                         const struct bpf_prog *prog);

/**
 * skb_frag_address - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns: the address of the data within @frag, or %NULL when the
 * fragment has no associated page. The page must already be mapped.
 */
static inline void *skb_frag_address(const skb_frag_t *frag)
{
        struct page *page = skb_frag_page(frag);

        if (!page)
                return NULL;

        return page_address(page) + skb_frag_off(frag);
}

/**
 * skb_frag_address_safe - gets the address of the data contained in a paged fragment
 * @frag: the paged fragment buffer
 *
 * Returns: the address of the data within @frag. Checks that the fragment
 * has an associated page and that the page is mapped, and returns %NULL
 * otherwise.
 */
static inline void *skb_frag_address_safe(const skb_frag_t *frag)
{
        struct page *page = skb_frag_page(frag);
        void *ptr;

        /* skb_frag_page() yields NULL for net_iov frags; never hand that
         * NULL to page_address().
         */
        if (!page)
                return NULL;

        ptr = page_address(page);
        if (unlikely(!ptr))
                return NULL;

        return ptr + skb_frag_off(frag);
}

/**
 * skb_frag_page_copy() - sets the page in a fragment from another fragment
 * @fragto: skb fragment where page is set
 * @fragfrom: skb fragment page is copied from
 *
 * Copies the netmem reference as-is; no reference count is taken here.
 */
static inline void skb_frag_page_copy(skb_frag_t *fragto,
                                      const skb_frag_t *fragfrom)
{
        fragto->netmem = fragfrom->netmem;
}

/* Declaration only; no body is provided in this part of the file. */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio);

/**
 * __skb_frag_dma_map - maps a paged fragment via the DMA API
 * @dev: the device to map the fragment to
 * @frag: the paged fragment to map
 * @offset: the offset within the fragment (starting at the
 *          fragment's own offset)
 * @size: the number of bytes to map
 * @dir: the direction of the mapping (an &enum dma_data_direction value)
 *
 * Maps the page associated with @frag to @dev.
 *
 * Return: the DMA address produced by dma_map_page().
 */
static inline dma_addr_t __skb_frag_dma_map(struct device *dev,
                                            const skb_frag_t *frag,
                                            size_t offset, size_t size,
                                            enum dma_data_direction dir)
{
        struct page *page = skb_frag_page(frag);
        size_t page_off = skb_frag_off(frag) + offset;

        return dma_map_page(dev, page, page_off, size, dir);
}

/*
 * skb_frag_dma_map(dev, frag[, offset[, size[, dir]]])
 *
 * Variadic front end for __skb_frag_dma_map(). The number of optional
 * arguments (COUNT_ARGS) selects one of the _skb_frag_dma_mapN() helpers
 * below:
 *   0 - offset 0, size = skb_frag_size(frag), DMA_TO_DEVICE
 *   1 - given offset, size = skb_frag_size(frag) - offset, DMA_TO_DEVICE
 *   2 - given offset and size, DMA_TO_DEVICE
 *   3 - given offset, size and direction
 */
#define skb_frag_dma_map(dev, frag, ...)                                \
        CONCATENATE(_skb_frag_dma_map,                                        \
                    COUNT_ARGS(__VA_ARGS__))(dev, frag, ##__VA_ARGS__)

/* @uf and @uo are unique local names, so @frag and @offset are each
 * evaluated exactly once.
 */
#define __skb_frag_dma_map1(dev, frag, offset, uf, uo) ({                \
        const skb_frag_t *uf = (frag);                                        \
        size_t uo = (offset);                                                \
                                                                        \
        __skb_frag_dma_map(dev, uf, uo, skb_frag_size(uf) - uo,                \
                           DMA_TO_DEVICE);                                \
})
#define _skb_frag_dma_map1(dev, frag, offset)                                \
        __skb_frag_dma_map1(dev, frag, offset, __UNIQUE_ID(frag_),        \
                            __UNIQUE_ID(offset_))
#define _skb_frag_dma_map0(dev, frag)                                        \
        _skb_frag_dma_map1(dev, frag, 0)
#define _skb_frag_dma_map2(dev, frag, offset, size)                        \
        __skb_frag_dma_map(dev, frag, offset, size, DMA_TO_DEVICE)
#define _skb_frag_dma_map3(dev, frag, offset, size, dir)                \
        __skb_frag_dma_map(dev, frag, offset, size, dir)

/* Wrapper around __pskb_copy() that requests the same headroom as @skb
 * currently has.
 */
static inline struct sk_buff *pskb_copy(struct sk_buff *skb,
                                        gfp_t gfp_mask)
{
        return __pskb_copy(skb, skb_headroom(skb), gfp_mask);
}


/* Wrapper around __pskb_copy_fclone() that requests the same headroom as
 * @skb currently has and passes true as the final (fclone) argument.
 */
static inline struct sk_buff *pskb_copy_for_clone(struct sk_buff *skb,
                                                  gfp_t gfp_mask)
{
        return __pskb_copy_fclone(skb, skb_headroom(skb), gfp_mask, true);
}


/**
 *        skb_clone_writable - is the header of a clone writable
 *        @skb: buffer to check
 *        @len: length up to which to write
 *
 *        Returns true if modifying the header part of the cloned buffer
 *        does not require the data to be copied, i.e. the header is not
 *        cloned and headroom + @len stays within skb->hdr_len.
 */
static inline int skb_clone_writable(const struct sk_buff *skb, unsigned int len)
{
        if (skb_header_cloned(skb))
                return 0;

        return skb_headroom(skb) + len <= skb->hdr_len;
}

/* Make the first @write_len bytes of @skb writable if they are not already.
 * Returns 0 when the skb is not cloned, when the clone is already writable
 * up to @write_len, or when pskb_expand_head() succeeds; returns 1 when
 * pskb_expand_head() fails.
 */
static inline int skb_try_make_writable(struct sk_buff *skb,
                                        unsigned int write_len)
{
        if (!skb_cloned(skb))
                return 0;

        if (skb_clone_writable(skb, write_len))
                return 0;

        return pskb_expand_head(skb, 0, 0, GFP_ATOMIC) != 0;
}

/* Common body of skb_cow() and skb_cow_head(). The head is reallocated with
 * GFP_ATOMIC when @cloned is non-zero or when the current headroom is
 * smaller than @headroom; the missing headroom is rounded up to a multiple
 * of NET_SKB_PAD. Returns 0 if nothing had to be done, otherwise the result
 * of pskb_expand_head().
 */
static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
                            int cloned)
{
        int delta;

        delta = headroom > skb_headroom(skb) ?
                headroom - skb_headroom(skb) : 0;
        if (!delta && !cloned)
                return 0;

        return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
                                GFP_ATOMIC);
}

/**
 *        skb_cow - copy header of skb when it is required
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        If the skb passed lacks sufficient headroom or its data part
 *        is shared, data is reallocated. If reallocation fails, an error
 *        is returned and original skb is not changed.
 *
 *        The result is skb with writable area skb->head...skb->tail
 *        and at least @headroom of space at head.
 *
 *        "Shared" here means skb_cloned() returns true for @skb.
 */
static inline int skb_cow(struct sk_buff *skb, unsigned int headroom)
{
        return __skb_cow(skb, headroom, skb_cloned(skb));
}

/**
 *        skb_cow_head - skb_cow but only making the head writable
 *        @skb: buffer to cow
 *        @headroom: needed headroom
 *
 *        This function is identical to skb_cow except that we replace the
 *        skb_cloned check by skb_header_cloned.  It should be used when
 *        you only need to push on some header and do not need to modify
 *        the data.
 *
 *        Returns 0 if nothing had to be done, otherwise the result of
 *        the reallocation performed by __skb_cow().
 */
static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
{
        return __skb_cow(skb, headroom, skb_header_cloned(skb));
}

/**
 *        skb_padto        - pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise skb_pad() is asked for the missing
 *        bytes. Returns zero on success. The skb is freed on error.
 */
static inline int skb_padto(struct sk_buff *skb, unsigned int len)
{
        unsigned int cur = skb->len;

        if (unlikely(cur < len))
                return skb_pad(skb, len - cur);

        return 0;
}

/**
 *        __skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *        @free_on_error: free buffer on error
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked, and extends skb->len to cover them. If the buffer
 *        already contains sufficient data it is untouched. Returns zero
 *        on success and -ENOMEM if padding fails. The skb is freed on
 *        error if @free_on_error is true.
 */
static inline int __must_check __skb_put_padto(struct sk_buff *skb,
                                               unsigned int len,
                                               bool free_on_error)
{
        unsigned int cur = skb->len;
        unsigned int pad;

        if (likely(cur >= len))
                return 0;

        pad = len - cur;
        if (__skb_pad(skb, pad, free_on_error))
                return -ENOMEM;

        __skb_put(skb, pad);
        return 0;
}

/**
 *        skb_put_padto - increase size and pad an skbuff up to a minimal size
 *        @skb: buffer to pad
 *        @len: minimal length
 *
 *        Pads up a buffer to ensure the trailing bytes exist and are
 *        blanked. If the buffer already contains sufficient data it
 *        is untouched. Otherwise it is extended. Returns zero on
 *        success. The skb is freed on error.
 *
 *        Equivalent to __skb_put_padto() with @free_on_error set to true.
 */
static inline int __must_check skb_put_padto(struct sk_buff *skb, unsigned int len)
{
        return __skb_put_padto(skb, len, true);
}

/* Declaration only; the return value must be checked (__must_check). */
bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i)
        __must_check;

/* Tell whether data at (@page, @off) directly follows the end of frag
 * @i - 1 of @skb, i.e. lives on the same page at the offset where that
 * frag ends. Always false for zerocopy skbs and when @i is 0.
 */
static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
                                    const struct page *page, int off)
{
        const skb_frag_t *prev;

        if (skb_zcopy(skb) || !i)
                return false;

        prev = &skb_shinfo(skb)->frags[i - 1];
        if (page != skb_frag_page(prev))
                return false;

        return off == skb_frag_off(prev) + skb_frag_size(prev);
}

/* Pull all paged data (skb->data_len bytes) into the linear area.
 * Returns 0 on success and -ENOMEM when __pskb_pull_tail() returns NULL.
 */
static inline int __skb_linearize(struct sk_buff *skb)
{
        if (!__pskb_pull_tail(skb, skb->data_len))
                return -ENOMEM;

        return 0;
}

/**
 *        skb_linearize - convert paged skb to linear one
 *        @skb: buffer to linearize
 *
 *        If there is no free memory -ENOMEM is returned, otherwise zero
 *        is returned and the old skb data released. An skb that is
 *        already linear is left untouched.
 */
static inline int skb_linearize(struct sk_buff *skb)
{
        if (!skb_is_nonlinear(skb))
                return 0;

        return __skb_linearize(skb);
}

/**
 * skb_has_shared_frag - can any frag be overwritten
 * @skb: buffer to test
 *
 * Return: true if the skb has at least one frag that might be modified
 * by an external entity (as in vmsplice()/sendfile())
 */
static inline bool skb_has_shared_frag(const struct sk_buff *skb)
{
        return skb_is_nonlinear(skb) &&
               skb_shinfo(skb)->flags & SKBFL_SHARED_FRAG;
}

/**
 *        skb_linearize_cow - make sure skb is linear and writable
 *        @skb: buffer to process
 *
 *        If there is no free memory -ENOMEM is returned, otherwise zero
 *        is returned and the old skb data released. Nothing is done for
 *        an skb that is already linear and not cloned.
 */
static inline int skb_linearize_cow(struct sk_buff *skb)
{
        if (skb_is_nonlinear(skb) || skb_cloned(skb))
                return __skb_linearize(skb);

        return 0;
}

/* Checksum fix-up after @len bytes starting at @start were pulled from @skb.
 * CHECKSUM_COMPLETE: the checksum of the pulled bytes is subtracted from
 * skb->csum as a block at offset @off.
 * CHECKSUM_PARTIAL: ip_summed drops to CHECKSUM_NONE once the checksum
 * start offset has become negative.
 */
static __always_inline void
__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                skb->csum = csum_block_sub(skb->csum,
                                           csum_partial(start, len, 0), off);
                return;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            skb_checksum_start_offset(skb) < 0)
                skb->ip_summed = CHECKSUM_NONE;
}

/**
 *        skb_postpull_rcsum - update checksum for received skb after pull
 *        @skb: buffer to update
 *        @start: start of data before pull
 *        @len: length of data pulled
 *
 *        After doing a pull on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum, or set ip_summed to
 *        CHECKSUM_NONE so that it can be recomputed from scratch.
 */
static inline void skb_postpull_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                /* Remove the pulled bytes: negate, accumulate, negate. */
                skb->csum = wsum_negate(csum_partial(start, len,
                                                     wsum_negate(skb->csum)));
                return;
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            skb_checksum_start_offset(skb) < 0)
                skb->ip_summed = CHECKSUM_NONE;
}

/* Checksum fix-up after @len bytes starting at @start were pushed onto @skb.
 * Only CHECKSUM_COMPLETE skbs are affected: the checksum of the pushed bytes
 * is added to skb->csum as a block at offset @off.
 */
static __always_inline void
__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len,
                     unsigned int off)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->csum = csum_block_add(skb->csum,
                                   csum_partial(start, len, 0), off);
}

/**
 *        skb_postpush_rcsum - update checksum for received skb after push
 *        @skb: buffer to update
 *        @start: start of data after push
 *        @len: length of data pushed
 *
 *        After doing a push on a received packet, you need to call this to
 *        update the CHECKSUM_COMPLETE checksum.
 *
 *        Same as __skb_postpush_rcsum() with a block offset of 0.
 */
static inline void skb_postpush_rcsum(struct sk_buff *skb,
                                      const void *start, unsigned int len)
{
        __skb_postpush_rcsum(skb, start, len, 0);
}

/* Declaration only.
 * NOTE(review): presumably the pull counterpart of skb_push_rcsum() -
 * confirm against the definition.
 */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len);

/**
 *        skb_push_rcsum - push skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pushed
 *
 *        This function performs an skb_push on the packet and updates
 *        the CHECKSUM_COMPLETE checksum.  It should be used on
 *        receive path processing instead of skb_push unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 *
 *        Returns the new skb->data.
 */
static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len)
{
        unsigned char *start;

        skb_push(skb, len);
        start = skb->data;
        skb_postpush_rcsum(skb, start, len);

        return start;
}

/* Declaration only. Called by pskb_trim_rcsum() when @len is smaller than
 * skb->len.
 */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len);
/**
 *        pskb_trim_rcsum - trim received skb and update checksum
 *        @skb: buffer to trim
 *        @len: new length
 *
 *        This is exactly the same as pskb_trim except that it ensures the
 *        checksum of received packets are still valid after the operation.
 *        It can change skb pointers.
 *
 *        Returns 0 when @len is not smaller than the current length,
 *        otherwise the result of pskb_trim_rcsum_slow().
 */

static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        skb_might_realloc(skb);

        if (unlikely(len < skb->len))
                return pskb_trim_rcsum_slow(skb, len);

        return 0;
}

/* Trim @skb to @len with __skb_trim(). A CHECKSUM_COMPLETE checksum is
 * discarded (ip_summed becomes CHECKSUM_NONE) rather than adjusted.
 * Always returns 0.
 */
static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
        __skb_trim(skb, len);
        return 0;
}

/* Grow @skb to @len with __skb_grow(). A CHECKSUM_COMPLETE checksum is
 * discarded (ip_summed becomes CHECKSUM_NONE) before growing, even if
 * __skb_grow() then fails. Returns the result of __skb_grow().
 */
static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE)
                skb->ip_summed = CHECKSUM_NONE;
        return __skb_grow(skb, len);
}

/* rbtree helpers for skbs linked through their ->rbnode member.
 * rb_to_skb() converts an rb_node pointer to the containing sk_buff via
 * rb_entry_safe(); the others wrap rb_first/rb_last/rb_next/rb_prev.
 */
#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
#define skb_rb_first(root) rb_to_skb(rb_first(root))
#define skb_rb_last(root)  rb_to_skb(rb_last(root))
#define skb_rb_next(skb)   rb_to_skb(rb_next(&(skb)->rbnode))
#define skb_rb_prev(skb)   rb_to_skb(rb_prev(&(skb)->rbnode))

#define skb_queue_walk(queue, skb) \
                for (skb = (queue)->next;                                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = skb->next)

#define skb_queue_walk_safe(queue, skb, tmp)                                        \
                for (skb = (queue)->next, tmp = skb->next;                        \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

#define skb_queue_walk_from(queue, skb)                                                \
                for (; skb != (struct sk_buff *)(queue);                        \
                     skb = skb->next)

#define skb_rbtree_walk(skb, root)                                                \
                for (skb = skb_rb_first(root); skb != NULL;                        \
                     skb = skb_rb_next(skb))

#define skb_rbtree_walk_from(skb)                                                \
                for (; skb != NULL;                                                \
                     skb = skb_rb_next(skb))

#define skb_rbtree_walk_from_safe(skb, tmp)                                        \
                for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL);        \
                     skb = tmp)

#define skb_queue_walk_from_safe(queue, skb, tmp)                                \
                for (tmp = skb->next;                                                \
                     skb != (struct sk_buff *)(queue);                                \
                     skb = tmp, tmp = skb->next)

/*
 * skb_queue_reverse_walk - iterate over every buffer on @queue, tail to head
 * @queue: list head; compared against the cursor as a (struct sk_buff *)
 * @skb:   lvalue of type struct sk_buff * used as the cursor
 *
 * Follows the ->prev links until the cursor comes back to the head.
 * @skb is parenthesized at every use so that expressions such as "*pskb"
 * expand correctly.
 */
#define skb_queue_reverse_walk(queue, skb) \
                for ((skb) = (queue)->prev;                                        \
                     (skb) != (struct sk_buff *)(queue);                        \
                     (skb) = (skb)->prev)

/*
 * skb_queue_reverse_walk_safe - tail-to-head walk that tolerates unlinking
 * the current buffer
 * @queue: list head; compared against the cursor as a (struct sk_buff *)
 * @skb:   lvalue of type struct sk_buff * used as the cursor
 * @tmp:   lvalue of type struct sk_buff * holding the predecessor
 *
 * The predecessor is fetched into @tmp before the loop body runs.
 * @skb and @tmp are parenthesized at every use so that expressions such as
 * "*pskb" expand correctly.
 */
#define skb_queue_reverse_walk_safe(queue, skb, tmp)                                \
                for ((skb) = (queue)->prev, (tmp) = (skb)->prev;                \
                     (skb) != (struct sk_buff *)(queue);                        \
                     (skb) = (tmp), (tmp) = (skb)->prev)

/*
 * skb_queue_reverse_walk_from_safe - continue a tail-to-head walk from @skb
 * while tolerating unlinking of the current buffer
 * @queue: list head; compared against the cursor as a (struct sk_buff *)
 * @skb:   lvalue of type struct sk_buff *, positioned by the caller
 * @tmp:   lvalue of type struct sk_buff * holding the predecessor
 *
 * @skb and @tmp are parenthesized at every use so that expressions such as
 * "*pskb" expand correctly.
 */
#define skb_queue_reverse_walk_from_safe(queue, skb, tmp)                        \
                for ((tmp) = (skb)->prev;                                        \
                     (skb) != (struct sk_buff *)(queue);                        \
                     (skb) = (tmp), (tmp) = (skb)->prev)

/* Tell whether the shared info of @skb has a non-NULL frag_list pointer. */
static inline bool skb_has_frag_list(const struct sk_buff *skb)
{
        if (skb_shinfo(skb)->frag_list == NULL)
                return false;

        return true;
}

/* Reset the frag_list pointer in the shared info of @skb to NULL. */
static inline void skb_frag_list_init(struct sk_buff *skb)
{
        skb_shinfo(skb)->frag_list = NULL;
}

/*
 * skb_walk_frags - iterate over the frag_list chain of @skb
 * @skb:  buffer whose skb_shinfo()->frag_list is walked
 * @iter: lvalue of type struct sk_buff * used as the cursor
 *
 * Starts at the frag_list head and follows ->next until a NULL link.
 * @iter is parenthesized at every use so that expressions such as "*piter"
 * expand correctly.
 */
#define skb_walk_frags(skb, iter)        \
        for ((iter) = skb_shinfo(skb)->frag_list; (iter); (iter) = (iter)->next)


/*
 * Datagram wait/receive/poll/copy entry points.  Only the declarations are
 * part of this header; the definitions are not visible here.
 */
int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
                                int *err, long *timeo_p,
                                const struct sk_buff *skb);
struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
                                          struct sk_buff_head *queue,
                                          unsigned int flags,
                                          int *off, int *err,
                                          struct sk_buff **last);
struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
                                        struct sk_buff_head *queue,
                                        unsigned int flags, int *off, int *err,
                                        struct sk_buff **last);
struct sk_buff *__skb_recv_datagram(struct sock *sk,
                                    struct sk_buff_head *sk_queue,
                                    unsigned int flags, int *off, int *err);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags, int *err);
__poll_t datagram_poll(struct file *file, struct socket *sock,
                           struct poll_table_struct *wait);
/* Copies @size bytes starting at @offset of @from into the iterator @to. */
int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
                           struct iov_iter *to, int size);
/*
 * Convenience wrapper around skb_copy_datagram_iter() that targets the
 * iov_iter embedded in @msg (msg->msg_iter).  The return value of
 * skb_copy_datagram_iter() is passed through unchanged.
 */
static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
                                        struct msghdr *msg, int size)
{
        struct iov_iter *to = &msg->msg_iter;

        return skb_copy_datagram_iter(from, offset, to, size);
}
/*
 * Further out-of-line sk_buff helpers: datagram copy/checksum, raw byte
 * copy/store, splice/send, segmentation, and VLAN/Ethernet/MPLS header
 * push/pop.  Only the declarations are part of this header; semantics are
 * defined by the implementations, which are not visible here.
 */
int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen,
                                   struct msghdr *msg);
int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset,
                           struct iov_iter *to, int len,
                           struct ahash_request *hash);
int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
                                 struct iov_iter *from, int len);
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
/* A negative return from skb_copy_bits() is treated as failure by
 * __skb_header_pointer() in this header.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len);
int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len);
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, u8 *to,
                              int len);
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int len,
                    unsigned int flags);
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len);
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
                 int len, int hlen);
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len);
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen);
void skb_scrub_packet(struct sk_buff *skb, bool xnet);
struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
struct sk_buff *skb_segment_list(struct sk_buff *skb, netdev_features_t features,
                                 unsigned int offset);
struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len);
int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev);
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci);
int skb_vlan_pop(struct sk_buff *skb);
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci);
int skb_eth_pop(struct sk_buff *skb);
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src);
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet);
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet);
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse);
int skb_mpls_dec_ttl(struct sk_buff *skb);
struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy,
                             gfp_t gfp);

/*
 * Copy @len bytes from the iterator of @msg into @data.
 *
 * Returns 0 when copy_from_iter_full() reports success, -EFAULT otherwise.
 */
static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len)
{
        if (!copy_from_iter_full(data, len, &msg->msg_iter))
                return -EFAULT;

        return 0;
}

/*
 * Copy @len bytes from @data into the iterator of @msg.
 *
 * Returns 0 when copy_to_iter() reports exactly @len bytes, -EFAULT
 * otherwise.
 */
static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len)
{
        if (copy_to_iter(data, len, &msg->msg_iter) != len)
                return -EFAULT;

        return 0;
}

/*
 * Pair of callbacks that parameterises __skb_checksum():
 * @update:  takes a memory region (@mem, @len) and a running @wsum and
 *           returns a checksum value
 * @combine: takes two checksum values plus an @offset and @len and returns
 *           a checksum value
 * NOTE(review): exact folding semantics are defined by the users and
 * implementations of these callbacks, which are not visible here.
 */
struct skb_checksum_ops {
        __wsum (*update)(const void *mem, int len, __wsum wsum);
        __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
};

/* Pointer to a checksum-ops table; defined outside this header chunk. */
extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;

/*
 * Checksum @len bytes of @skb starting at @offset, seeded with @csum.
 * __skb_checksum() additionally takes the callback table @ops.
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops);
__wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
                    __wsum csum);

/*
 * __skb_header_pointer - get a pointer to @len bytes at @offset
 * @skb:    buffer to copy from when the linear area is too short; may be NULL
 * @offset: byte offset of the wanted region
 * @len:    size of the wanted region
 * @data:   start of the linear area
 * @hlen:   length of the linear area
 * @buffer: scratch space of at least @len bytes for the copy fallback
 *
 * Returns @data + @offset when the region fits in the linear area.
 * Otherwise the bytes are copied into @buffer with skb_copy_bits() and
 * @buffer is returned; NULL is returned when @skb is NULL or the copy
 * reports a negative result.
 */
static inline void * __must_check
__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
                     const void *data, int hlen, void *buffer)
{
        if (unlikely(hlen - offset < len)) {
                /* Slow path: region is not fully inside the linear area. */
                if (!skb)
                        return NULL;
                if (unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
                        return NULL;
                return buffer;
        }

        return (void *)data + offset;
}

/*
 * skb_header_pointer - __skb_header_pointer() using @skb's own linear area
 *
 * Passes skb->data and skb_headlen(@skb) as the linear region; see
 * __skb_header_pointer() for the return contract.
 */
static inline void * __must_check
skb_header_pointer(const struct sk_buff *skb, int offset, int len, void *buffer)
{
        const void *linear = skb->data;
        const int linear_len = skb_headlen(skb);

        return __skb_header_pointer(skb, offset, len, linear, linear_len,
                                    buffer);
}

/*
 * skb_pointer_if_linear - pointer into the linear area, or NULL
 * @skb:    buffer to inspect
 * @offset: byte offset of the wanted region
 * @len:    size of the wanted region
 *
 * Returns skb->data + @offset when @len bytes at @offset lie inside the
 * linear area, NULL otherwise.  Unlike skb_header_pointer() nothing is
 * ever copied.
 */
static inline void * __must_check
skb_pointer_if_linear(const struct sk_buff *skb, int offset, int len)
{
        /*
         * Do the bounds arithmetic in int, exactly as __skb_header_pointer()
         * does through its 'int hlen' parameter.  If skb_headlen() returns
         * an unsigned type (its declaration is outside this chunk), the
         * direct "skb_headlen(skb) - offset" would wrap for an @offset past
         * the linear area and the check would wrongly succeed.
         */
        const int hlen = skb_headlen(skb);

        if (likely(hlen - offset >= len))
                return skb->data + offset;
        return NULL;
}

/**
 *        skb_needs_linearize - decide whether @skb must be linearized for a
 *                              device with the given feature set
 *        @skb: socket buffer to check
 *        @features: net device features
 *
 *        A linear skb never needs linearizing.  A non-linear skb does when
 *        1. it has a frag_list and @features lacks NETIF_F_FRAGLIST, or
 *        2. it has page fragments (nr_frags) and @features lacks NETIF_F_SG.
 */
static inline bool skb_needs_linearize(struct sk_buff *skb,
                                       netdev_features_t features)
{
        if (!skb_is_nonlinear(skb))
                return false;

        if (skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST))
                return true;

        return skb_shinfo(skb)->nr_frags && !(features & NETIF_F_SG);
}

/* memcpy() @len bytes from the start of @skb's data area into @to. */
static inline void skb_copy_from_linear_data(const struct sk_buff *skb,
                                             void *to,
                                             const unsigned int len)
{
        const void *src = skb->data;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from skb->data + @offset into @to. */
static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
                                                    const int offset, void *to,
                                                    const unsigned int len)
{
        const void *src = skb->data + offset;

        memcpy(to, src, len);
}

/* memcpy() @len bytes from @from to the start of @skb's data area. */
static inline void skb_copy_to_linear_data(struct sk_buff *skb,
                                           const void *from,
                                           const unsigned int len)
{
        void *dst = skb->data;

        memcpy(dst, from, len);
}

/* memcpy() @len bytes from @from to skb->data + @offset. */
static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
                                                  const int offset,
                                                  const void *from,
                                                  const unsigned int len)
{
        void *dst = skb->data + offset;

        memcpy(dst, from, len);
}

/* Initialisation entry point for the skb code; defined outside this header. */
void skb_init(void);

/* Return the raw skb->tstamp value of @skb. */
static inline ktime_t skb_get_ktime(const struct sk_buff *skb)
{
        const ktime_t stamp = skb->tstamp;

        return stamp;
}

/**
 *        skb_get_timestamp - get timestamp from a skb
 *        @skb: skb to get stamp from
 *        @stamp: pointer to struct __kernel_old_timeval to store stamp in
 *
 *        Timestamps are stored in the skb as offsets to a base timestamp.
 *        This function converts the offset back to a struct timeval and stores
 *        it in stamp.
 */
static inline void skb_get_timestamp(const struct sk_buff *skb,
                                     struct __kernel_old_timeval *stamp)
{
        *stamp = ns_to_kernel_old_timeval(skb->tstamp);
}

static inline void skb_get_new_timestamp(const struct sk_buff *skb,
                                         struct __kernel_sock_timeval *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_usec = ts.tv_nsec / 1000;
}

static inline void skb_get_timestampns(const struct sk_buff *skb,
                                       struct __kernel_old_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

static inline void skb_get_new_timestampns(const struct sk_buff *skb,
                                           struct __kernel_timespec *stamp)
{
        struct timespec64 ts = ktime_to_timespec64(skb->tstamp);

        stamp->tv_sec = ts.tv_sec;
        stamp->tv_nsec = ts.tv_nsec;
}

/*
 * Stamp @skb with the current ktime_get_real() value and mark the
 * timestamp type as SKB_CLOCK_REALTIME.
 */
static inline void __net_timestamp(struct sk_buff *skb)
{
        skb->tstamp_type = SKB_CLOCK_REALTIME;
        skb->tstamp = ktime_get_real();
}

/* Return ktime_get_real() minus @t, computed with ktime_sub(). */
static inline ktime_t net_timedelta(ktime_t t)
{
        const ktime_t now = ktime_get_real();

        return ktime_sub(now, t);
}

/*
 * Set skb->tstamp to @kt.  The timestamp type becomes @tstamp_type when
 * @kt is non-zero and SKB_CLOCK_REALTIME when @kt is zero.
 */
static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
                                         u8 tstamp_type)
{
        skb->tstamp_type = kt ? tstamp_type : SKB_CLOCK_REALTIME;
        skb->tstamp = kt;
}

/*
 * Translate @clockid into an SKB_CLOCK_* timestamp type and hand @kt to
 * skb_set_delivery_time().  CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_TAI
 * are accepted; any other clock id triggers WARN_ON_ONCE() and the
 * timestamp is set to 0 with type SKB_CLOCK_REALTIME.
 */
static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb,
                                                    ktime_t kt, clockid_t clockid)
{
        u8 type = SKB_CLOCK_REALTIME;

        if (clockid == CLOCK_MONOTONIC) {
                type = SKB_CLOCK_MONOTONIC;
        } else if (clockid == CLOCK_TAI) {
                type = SKB_CLOCK_TAI;
        } else if (clockid != CLOCK_REALTIME) {
                /* Unknown clock: complain once and drop the timestamp. */
                WARN_ON_ONCE(1);
                kt = 0;
        }

        skb_set_delivery_time(skb, kt, type);
}

/*
 * Static key tested with static_branch_unlikely() by
 * skb_clear_delivery_time() and skb_tstamp_cond() in this header to decide
 * whether a ktime_get_real() timestamp should be taken.
 */
DECLARE_STATIC_KEY_FALSE(netstamp_needed_key);

/* Used in the ingress path to clear the delivery_time.
 * Nothing happens when skb->tstamp_type is zero.  Otherwise the type is
 * reset to SKB_CLOCK_REALTIME and skb->tstamp becomes the current
 * ktime_get_real() value if netstamp_needed_key is enabled, or 0 if not.
 */
static inline void skb_clear_delivery_time(struct sk_buff *skb)
{
        if (!skb->tstamp_type)
                return;

        skb->tstamp_type = SKB_CLOCK_REALTIME;

        if (static_branch_unlikely(&netstamp_needed_key))
                skb->tstamp = ktime_get_real();
        else
                skb->tstamp = 0;
}

/* Zero skb->tstamp, but only when skb->tstamp_type is zero. */
static inline void skb_clear_tstamp(struct sk_buff *skb)
{
        if (!skb->tstamp_type)
                skb->tstamp = 0;
}

/* Return skb->tstamp when skb->tstamp_type is zero, and 0 otherwise. */
static inline ktime_t skb_tstamp(const struct sk_buff *skb)
{
        return skb->tstamp_type ? 0 : skb->tstamp;
}

/*
 * Return a timestamp for @skb:
 *  - skb->tstamp itself when it is non-zero and its type is not
 *    SKB_CLOCK_MONOTONIC;
 *  - otherwise ktime_get_real() when netstamp_needed_key is enabled or
 *    @cond is true;
 *  - otherwise 0.
 */
static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond)
{
        if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp)
                return skb->tstamp;

        if (!(static_branch_unlikely(&netstamp_needed_key) || cond))
                return 0;

        return ktime_get_real();
}

/* Return the meta_len value stored in the shared info of @skb. */
static inline u8 skb_metadata_len(const struct sk_buff *skb)
{
        const u8 len = skb_shinfo(skb)->meta_len;

        return len;
}

/*
 * Return the end of the metadata area of @skb, which this helper defines
 * as skb_mac_header(@skb).
 */
static inline void *skb_metadata_end(const struct sk_buff *skb)
{
        void *end = skb_mac_header(skb);

        return end;
}

/*
 * __skb_metadata_differs - compare the last @meta_len metadata bytes
 * @skb_a:    first buffer
 * @skb_b:    second buffer
 * @meta_len: number of bytes to compare in each buffer
 *
 * The compared region is the @meta_len bytes that end at
 * skb_metadata_end() of each buffer.  Returns true when the regions differ.
 *
 * When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is enabled and BITS_PER_LONG
 * is 64, lengths that are a multiple of 4 in the range 4..32 are compared
 * with word-sized loads walking backwards from the end; every other case
 * falls back to memcmp().
 */
static inline bool __skb_metadata_differs(const struct sk_buff *skb_a,
                                          const struct sk_buff *skb_b,
                                          u8 meta_len)
{
        const void *a = skb_metadata_end(skb_a);
        const void *b = skb_metadata_end(skb_b);
        u64 diffs = 0;

        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
            BITS_PER_LONG != 64)
                goto slow;

        /* Using more efficient variant than plain call to memcmp(). */
        switch (meta_len) {
/*
 * __it() moves the cursor back by one u<op> word and yields the new
 * cursor; __it_diff() XORs the words found there in both buffers.  Both
 * cursors are modified as a side effect.  The macros are not #undef'd and
 * stay defined after this function.
 */
#define __it(x, op) (x -= sizeof(u##op))
#define __it_diff(a, b, op) (*(u##op *)__it(a, op)) ^ (*(u##op *)__it(b, op))
        /* Multiples of 8: 32 -> 24 -> 16 -> 8, one 64-bit word per step. */
        case 32: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 24: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 16: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  8: diffs |= __it_diff(a, b, 64);
                break;
        /* 8k + 4: 64-bit words first, then a final 32-bit word. */
        case 28: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 20: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case 12: diffs |= __it_diff(a, b, 64);
                fallthrough;
        case  4: diffs |= __it_diff(a, b, 32);
                break;
        default:
slow:
                /* a and b are still the unmodified end pointers here. */
                return memcmp(a - meta_len, b - meta_len, meta_len);
        }
        /* Any set bit means at least one word differed. */
        return diffs;
}

/*
 * Tell whether the metadata of @skb_a and @skb_b differ.
 *
 * Two buffers without metadata are equal; different metadata lengths
 * always differ; equal non-zero lengths are compared byte-wise through
 * __skb_metadata_differs().
 */
static inline bool skb_metadata_differs(const struct sk_buff *skb_a,
                                        const struct sk_buff *skb_b)
{
        const u8 meta_a = skb_metadata_len(skb_a);
        const u8 meta_b = skb_metadata_len(skb_b);

        if (meta_a == 0 && meta_b == 0)
                return false;

        if (meta_a != meta_b)
                return true;

        return __skb_metadata_differs(skb_a, skb_b, meta_a);
}

/* Store @meta_len as the metadata length in the shared info of @skb. */
static inline void skb_metadata_set(struct sk_buff *skb, u8 meta_len)
{
        skb_shinfo(skb)->meta_len = meta_len;
}

/* Mark @skb as carrying no metadata by setting its metadata length to 0. */
static inline void skb_metadata_clear(struct sk_buff *skb)
{
        skb_metadata_set(skb, 0);
}

/* Declared here, defined elsewhere; takes an skb and returns an skb pointer. */
struct sk_buff *skb_clone_sk(struct sk_buff *skb);

/*
 * PHY timestamping hooks.  With CONFIG_NETWORK_PHY_TIMESTAMPING the real
 * implementations are declared; without it both hooks collapse to inline
 * no-ops and skb_defer_rx_timestamp() always reports false.
 */
#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING

void skb_clone_tx_timestamp(struct sk_buff *skb);
bool skb_defer_rx_timestamp(struct sk_buff *skb);

#else /* CONFIG_NETWORK_PHY_TIMESTAMPING */

/* Stub: does nothing. */
static inline void skb_clone_tx_timestamp(struct sk_buff *skb)
{
}

/* Stub: never defers, always returns false. */
static inline bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
        return false;
}

#endif /* !CONFIG_NETWORK_PHY_TIMESTAMPING */

/**
 * skb_complete_tx_timestamp() - deliver cloned skb with tx timestamps
 *
 * PHY drivers may accept clones of transmitted packets for
 * timestamping via their phy_driver.txtstamp method. These drivers
 * must call this function to return the skb back to the stack with a
 * timestamp.
 *
 * @skb: clone of the original outgoing packet
 * @hwtstamps: hardware time stamps
 *
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps);

/*
 * Lower-level variant of skb_tstamp_tx() that additionally takes an
 * @ack_skb, the socket @sk and a timestamp type @tstype.
 * NOTE(review): parameter semantics are defined by the implementation,
 * which is not visible in this header.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb, const struct sk_buff *ack_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype);

/**
 * skb_tstamp_tx - queue clone of skb with send time stamps
 * @orig_skb:        the original outgoing packet
 * @hwtstamps:        hardware time stamps, may be NULL if not available
 *
 * If the skb has a socket associated, then this function clones the
 * skb (thus sharing the actual data and optional structures), stores
 * the optional hardware time stamping information (if non NULL) or
 * generates a software time stamp (otherwise), then queues the clone
 * to the error queue of the socket.  Errors are silently ignored.
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps);

/**
 * skb_tx_timestamp() - Driver hook for transmit timestamping
 *
 * Ethernet MAC Drivers should call this function in their hard_xmit()
 * function immediately before giving the sk_buff to the MAC hardware.
 *
 * Specifically, one should make absolutely sure that this function is
 * called before TX completion of this packet can trigger.  Otherwise
 * the packet could potentially already be freed.
 *
 * The hook first calls skb_clone_tx_timestamp() and then, if the skb's
 * tx_flags contain SKBTX_SW_TSTAMP or SKBTX_BPF, calls skb_tstamp_tx()
 * without hardware timestamps.
 *
 * @skb: A socket buffer.
 */
static inline void skb_tx_timestamp(struct sk_buff *skb)
{
        skb_clone_tx_timestamp(skb);

        if (!(skb_shinfo(skb)->tx_flags & (SKBTX_SW_TSTAMP | SKBTX_BPF)))
                return;

        skb_tstamp_tx(skb, NULL);
}

/**
 * skb_complete_wifi_ack - deliver skb with wifi status
 *
 * @skb: the original outgoing packet
 * @acked: ack status
 *
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked);

/*
 * Out-of-line checksum verification helpers.  skb_checksum_complete() and
 * __skb_checksum_validate_complete() in this header treat a zero return
 * from __skb_checksum_complete() as "checksum valid".
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len);
__sum16 __skb_checksum_complete(struct sk_buff *skb);

/*
 * Return 1 when no software checksum verification is required for @skb:
 * ip_summed is CHECKSUM_UNNECESSARY, or csum_valid is set, or ip_summed is
 * CHECKSUM_PARTIAL with a non-negative checksum start offset.  Return 0
 * otherwise.
 */
static inline int skb_csum_unnecessary(const struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_UNNECESSARY)
                return 1;

        if (skb->csum_valid)
                return 1;

        return skb->ip_summed == CHECKSUM_PARTIAL &&
               skb_checksum_start_offset(skb) >= 0;
}

/**
 *        skb_checksum_complete - Calculate checksum of an entire packet
 *        @skb: packet to process
 *
 *        This function calculates the checksum over the entire packet plus
 *        the value of skb->csum.  The latter can be used to supply the
 *        checksum of a pseudo header as used by TCP/UDP.  It returns the
 *        checksum.
 *
 *        For protocols that contain complete checksums such as ICMP/TCP/UDP,
 *        this function can be used to verify that checksum on received
 *        packets.  In that case the function should return zero if the
 *        checksum is correct.  Zero is also returned, without computing
 *        anything, whenever skb_csum_unnecessary() holds for @skb, e.g.
 *        when skb->ip_summed is CHECKSUM_UNNECESSARY.
 */
static inline __sum16 skb_checksum_complete(struct sk_buff *skb)
{
        if (skb_csum_unnecessary(skb))
                return 0;

        return __skb_checksum_complete(skb);
}

/*
 * Consume one level of CHECKSUM_UNNECESSARY: at csum_level 0 the skb drops
 * to CHECKSUM_NONE, otherwise csum_level is decremented.  Skbs in any
 * other ip_summed state are left untouched.
 */
static inline void __skb_decr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        if (skb->csum_level != 0) {
                skb->csum_level--;
                return;
        }

        skb->ip_summed = CHECKSUM_NONE;
}

/*
 * Add one level of CHECKSUM_UNNECESSARY: an skb already in that state gets
 * csum_level incremented (capped at SKB_MAX_CSUM_LEVEL); a CHECKSUM_NONE
 * skb becomes CHECKSUM_UNNECESSARY at level 0.  Other states are left
 * untouched.
 */
static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
                if (skb->csum_level < SKB_MAX_CSUM_LEVEL)
                        skb->csum_level++;
                return;
        }

        if (skb->ip_summed != CHECKSUM_NONE)
                return;

        skb->ip_summed = CHECKSUM_UNNECESSARY;
        skb->csum_level = 0;
}

/*
 * Drop every level of CHECKSUM_UNNECESSARY at once: such an skb becomes
 * CHECKSUM_NONE with csum_level 0.  Other states are left untouched.
 */
static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_UNNECESSARY)
                return;

        skb->csum_level = 0;
        skb->ip_summed = CHECKSUM_NONE;
}

/* Check if we need to perform checksum complete validation.
 *
 * Validation is skipped when skb_csum_unnecessary() holds, or when
 * @zero_okay is set and @check is zero.  In that case csum_valid is set
 * and one CHECKSUM_UNNECESSARY level is consumed.
 *
 * Returns: true if checksum complete is needed, false otherwise
 * (either checksum is unnecessary or zero checksum is allowed).
 */
static inline bool __skb_checksum_validate_needed(struct sk_buff *skb,
                                                  bool zero_okay,
                                                  __sum16 check)
{
        if (!skb_csum_unnecessary(skb) && (!zero_okay || check))
                return true;

        skb->csum_valid = 1;
        __skb_decr_checksum_unnecessary(skb);

        return false;
}

/* Packets whose skb->len is <= CHECKSUM_BREAK get their checksum
 * completed directly in __skb_checksum_validate_complete(), even when the
 * caller did not request a complete validation.
 */
#define CHECKSUM_BREAK 76

/* Unset checksum-complete
 *
 * To be used when the packet is being modified (uncompressed for
 * instance) and the checksum-complete value is thereby invalidated: an
 * skb in CHECKSUM_COMPLETE state is moved to CHECKSUM_NONE, any other
 * state is kept.
 */
static inline void skb_checksum_complete_unset(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->ip_summed = CHECKSUM_NONE;
}

/* Validate (init) checksum based on checksum complete.
 *
 * For a CHECKSUM_COMPLETE skb, @psum is first added to skb->csum; if the
 * folded sum is zero the checksum is valid.  Otherwise @psum is stored in
 * skb->csum and, when @complete is set or skb->len <= CHECKSUM_BREAK, the
 * checksum is computed right away with __skb_checksum_complete().
 *
 * Return values:
 *   0: checksum is validated or try to in skb_checksum_complete. In the latter
 *        case the ip_summed will not be CHECKSUM_UNNECESSARY and the pseudo
 *        checksum is stored in skb->csum for use in __skb_checksum_complete
 *   non-zero: value of invalid checksum
 *
 */
static inline __sum16 __skb_checksum_validate_complete(struct sk_buff *skb,
                                                       bool complete,
                                                       __wsum psum)
{
        __sum16 folded;

        if (skb->ip_summed == CHECKSUM_COMPLETE &&
            !csum_fold(csum_add(psum, skb->csum))) {
                skb->csum_valid = 1;
                return 0;
        }

        skb->csum = psum;

        /* Large packet and no explicit request: defer the computation. */
        if (!complete && skb->len > CHECKSUM_BREAK)
                return 0;

        folded = __skb_checksum_complete(skb);
        skb->csum_valid = !folded;

        return folded;
}

/*
 * Pseudo-header callback that contributes nothing: always returns 0 and
 * ignores both arguments.  Used by skb_checksum_simple_validate().
 */
static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto)
{
        return 0;
}

/* Perform checksum validate (init). Note that this is a macro since we only
 * want to calculate the pseudo header, which is an input function, if
 * necessary.  First we try to validate without any computation (checksum
 * unnecessary) and then calculate based on checksum complete, calling
 * @compute_pseudo(@skb, @proto) to obtain the pseudo header checksum.
 *
 * @skb is evaluated more than once, so it must not have side effects.
 * Arguments are parenthesized in the expansion so that expressions such as
 * "*pskb" can be passed safely.
 *
 * Return values:
 *   0: checksum is validated or try to in skb_checksum_complete
 *   non-zero: value of invalid checksum
 */
#define __skb_checksum_validate(skb, proto, complete,                        \
                                zero_okay, check, compute_pseudo)        \
({                                                                        \
        __sum16 __ret = 0;                                                \
        (skb)->csum_valid = 0;                                                \
        if (__skb_checksum_validate_needed((skb), (zero_okay), (check)))        \
                __ret = __skb_checksum_validate_complete((skb),                \
                                (complete),                                \
                                compute_pseudo((skb), (proto)));        \
        __ret;                                                                \
})

/*
 * Front ends for __skb_checksum_validate(); they differ only in the fixed
 * arguments they pass:
 *   skb_checksum_init                 complete=false, zero_okay=false
 *   skb_checksum_init_zero_check      complete=false, zero_okay=true
 *   skb_checksum_validate             complete=true,  zero_okay=false
 *   skb_checksum_validate_zero_check  complete=true,  zero_okay=true
 *   skb_checksum_simple_validate      complete=true,  zero_okay=false,
 *                                     proto 0 and null_compute_pseudo
 */
#define skb_checksum_init(skb, proto, compute_pseudo)                        \
        __skb_checksum_validate(skb, proto, false, false, 0, compute_pseudo)

#define skb_checksum_init_zero_check(skb, proto, check, compute_pseudo)        \
        __skb_checksum_validate(skb, proto, false, true, check, compute_pseudo)

#define skb_checksum_validate(skb, proto, compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, false, 0, compute_pseudo)

#define skb_checksum_validate_zero_check(skb, proto, check,                \
                                         compute_pseudo)                \
        __skb_checksum_validate(skb, proto, true, true, check, compute_pseudo)

#define skb_checksum_simple_validate(skb)                                \
        __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo)

/*
 * An skb qualifies for skb_checksum_try_convert() when it is in
 * CHECKSUM_NONE state and its csum_valid flag is set.
 */
static inline bool __skb_checksum_convert_check(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_NONE)
                return false;

        return skb->csum_valid;
}

/*
 * Switch @skb to CHECKSUM_COMPLETE, storing the bitwise complement of
 * @pseudo in skb->csum.
 */
static inline void __skb_checksum_convert(struct sk_buff *skb, __wsum pseudo)
{
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum = ~pseudo;
}

/*
 * Convert @skb to CHECKSUM_COMPLETE when __skb_checksum_convert_check()
 * allows it.  @compute_pseudo(@skb, @proto) is only evaluated in that case.
 */
#define skb_checksum_try_convert(skb, proto, compute_pseudo)        \
do {                                                                        \
        if (__skb_checksum_convert_check(skb))                                \
                __skb_checksum_convert(skb, compute_pseudo(skb, proto)); \
} while (0)

/*
 * Put @skb into CHECKSUM_PARTIAL state for remote checksum offload:
 * csum_start becomes the distance from skb->head to (@ptr + @start) and
 * csum_offset becomes @offset - @start.
 */
static inline void skb_remcsum_adjust_partial(struct sk_buff *skb, void *ptr,
                                              u16 start, u16 offset)
{
        unsigned char *csum_begin = (unsigned char *)ptr + start;

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = csum_begin - skb->head;
        skb->csum_offset = offset - start;
}

/* Update skbuf and packet to reflect the remote checksum offload operation.
 * When called, ptr indicates the starting point for skb->csum when
 * ip_summed is CHECKSUM_COMPLETE. If we need to create checksum complete
 * here, skb_postpull_rcsum is done so skb->csum start is ptr.
 *
 * With @nopartial false the work is delegated to
 * skb_remcsum_adjust_partial().  Otherwise the result of remcsum_adjust()
 * is folded into skb->csum.
 */
static inline void skb_remcsum_process(struct sk_buff *skb, void *ptr,
                                       int start, int offset, bool nopartial)
{
        __wsum adjust;

        if (nopartial) {
                if (unlikely(skb->ip_summed != CHECKSUM_COMPLETE)) {
                        __skb_checksum_complete(skb);
                        skb_postpull_rcsum(skb, skb->data,
                                           ptr - (void *)skb->data);
                }

                adjust = remcsum_adjust(ptr, skb->csum, start, offset);

                /* Adjust skb->csum since we changed the packet */
                skb->csum = csum_add(skb->csum, adjust);
                return;
        }

        skb_remcsum_adjust_partial(skb, ptr, start, offset);
}

/*
 * Return the conntrack pointer stored in skb->_nfct, i.e. the value with
 * only the NFCT_PTRMASK bits kept.  Always NULL when CONFIG_NF_CONNTRACK
 * is not enabled.
 */
static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        const unsigned long ct_bits = skb->_nfct & NFCT_PTRMASK;

        return (void *)ct_bits;
#else
        return NULL;
#endif
}

/*
 * Return the raw, unmasked skb->_nfct word, or 0 when CONFIG_NF_CONNTRACK
 * is not enabled.
 */
static inline unsigned long skb_get_nfct(const struct sk_buff *skb)
{
        unsigned long raw = 0UL;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        raw = skb->_nfct;
#endif
        return raw;
}

/*
 * Store @nfct in skb->_nfct.  A non-zero value also sets skb->slow_gro; a
 * zero value leaves slow_gro as it was.  No-op when CONFIG_NF_CONNTRACK is
 * not enabled.
 */
static inline void skb_set_nfct(struct sk_buff *skb, unsigned long nfct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (nfct)
                skb->slow_gro |= 1;
        skb->_nfct = nfct;
#endif
}

#ifdef CONFIG_SKB_EXTENSIONS
/*
 * Identifiers of the optional sk_buff extensions.  Each entry exists only
 * when its config option is enabled, so the numeric values and SKB_EXT_NUM
 * depend on the kernel configuration.  An id indexes skb_ext::offset[] and
 * selects bit (1 << id) in skb->active_extensions (see skb_ext_exist()).
 */
enum skb_ext_id {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        SKB_EXT_BRIDGE_NF,
#endif
#ifdef CONFIG_XFRM
        SKB_EXT_SEC_PATH,
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        TC_SKB_EXT,
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        SKB_EXT_MPTCP,
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
        SKB_EXT_MCTP,
#endif
        SKB_EXT_NUM, /* must be last */
};

/**
 *        struct skb_ext - sk_buff extensions
 *        @refcnt: 1 on allocation, deallocated on 0
 *        @offset: per-extension offset, in 8-byte chunks; skb_ext_find() adds
 *                 (offset << 3) to the address of this struct to obtain the
 *                 extension address, and __skb_ext_exist() treats 0 as
 *                 "not present"
 *        @chunks: size currently allocated, stored in SKB_EXT_ALIGN_SHIFT units
 *        @data: start of extension data, variable sized
 *
 *        Note: offsets/lengths are stored in chunks of 8 bytes, this allows
 *        to use 'u8' types while allowing up to 2kb worth of extension data.
 */
struct skb_ext {
        refcount_t refcnt;
        u8 offset[SKB_EXT_NUM]; /* in chunks of 8 bytes */
        u8 chunks;                /* same */
        char data[] __aligned(8);
};

/*
 * Out-of-line extension management; definitions are not part of this
 * header.  The inline wrappers in this header only call __skb_ext_del()
 * and __skb_ext_put() after checking skb->active_extensions.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags);
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext);
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
void __skb_ext_put(struct skb_ext *ext);

/*
 * Release @skb's reference on its extension area via __skb_ext_put(), if
 * any extension is active.  skb->active_extensions itself is not modified.
 */
static inline void skb_ext_put(struct sk_buff *skb)
{
        if (!skb->active_extensions)
                return;

        __skb_ext_put(skb->extensions);
}

/*
 * Make @dst share the extension area of @src: the active_extensions mask
 * is copied and, when it is non-zero, the area's refcount is incremented
 * and @dst points at the same struct skb_ext.  Whatever @dst referenced
 * before is not released here.
 */
static inline void __skb_ext_copy(struct sk_buff *dst,
                                  const struct sk_buff *src)
{
        struct skb_ext *shared;

        dst->active_extensions = src->active_extensions;

        if (!src->active_extensions)
                return;

        shared = src->extensions;
        refcount_inc(&shared->refcnt);
        dst->extensions = shared;
}

/*
 * Replace @dst's extensions with those of @src: the reference @dst
 * currently holds is released first, then @src's extension area is shared
 * via __skb_ext_copy().
 */
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        skb_ext_put(dst);
        __skb_ext_copy(dst, src);
}

/* An extension is present in @ext when its offset slot is non-zero. */
static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
{
        return ext->offset[i] != 0;
}

/* Test bit @id of skb->active_extensions. */
static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
{
        return (skb->active_extensions & (1 << id)) != 0;
}

/* Remove extension @id from @skb; a no-op when it is not active. */
static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        if (!skb_ext_exist(skb, id))
                return;

        __skb_ext_del(skb, id);
}

/*
 * Look up extension @id of @skb.
 *
 * Returns NULL when the extension is not active; otherwise the address of
 * the extension, which lies (offset[id] << 3) bytes past the start of the
 * skb's struct skb_ext.
 */
static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *area;

        if (!skb_ext_exist(skb, id))
                return NULL;

        area = skb->extensions;

        return (void *)area + (area->offset[id] << 3);
}

/*
 * Drop all extensions of @skb: the reference on the extension area is
 * released and active_extensions is cleared.  Nothing happens when no
 * extension is active.
 */
static inline void skb_ext_reset(struct sk_buff *skb)
{
        if (!unlikely(skb->active_extensions))
                return;

        __skb_ext_put(skb->extensions);
        skb->active_extensions = 0;
}

/* True when at least one bit of skb->active_extensions is set. */
static inline bool skb_has_extensions(struct sk_buff *skb)
{
        return unlikely(skb->active_extensions != 0);
}
#else
/*
 * No-op stand-ins used when CONFIG_SKB_EXTENSIONS is not set: nothing is
 * ever stored, so skb_has_extensions() is always false.
 */
static inline void skb_ext_put(struct sk_buff *skb) {}
static inline void skb_ext_reset(struct sk_buff *skb) {}
static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
static inline bool skb_has_extensions(struct sk_buff *skb) { return false; }
#endif /* CONFIG_SKB_EXTENSIONS */

/*
 * Detach the conntrack entry from @skb: the pointer obtained with
 * skb_nfct() is handed to nf_conntrack_put() and skb->_nfct is cleared.
 * Compiles to nothing when conntrack is neither built in nor modular.
 */
static inline void nf_reset_ct(struct sk_buff *skb)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        struct nf_conntrack *ct = skb_nfct(skb);

        nf_conntrack_put(ct);
        skb->_nfct = 0;
#endif
}

/*
 * Clear skb->nf_trace.  The field is only touched when
 * CONFIG_NETFILTER_XT_TARGET_TRACE or CONFIG_NF_TABLES is enabled;
 * otherwise this is a no-op.
 */
static inline void nf_reset_trace(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        skb->nf_trace = 0;
#endif
}

/* Clear skb->ipvs_property; a no-op unless CONFIG_IP_VS is enabled. */
static inline void ipvs_reset(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_VS)
        skb->ipvs_property = 0;
#endif
}

/*
 * Copy netfilter state from @src to @dst.
 *
 * With conntrack available, @dst takes over @src's _nfct word and a
 * reference is taken on the conntrack entry with nf_conntrack_get().
 * nf_trace is copied only when @copy is true and tracing support is
 * configured.
 *
 * Note: the conntrack info @dst held before is not put here.
 */
static inline void __nf_copy(struct sk_buff *dst, const struct sk_buff *src,
                             bool copy)
{
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_get(skb_nfct(src));
        dst->_nfct = src->_nfct;
#endif
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || IS_ENABLED(CONFIG_NF_TABLES)
        if (copy)
                dst->nf_trace = src->nf_trace;
#endif
}

/*
 * Replace @dst's netfilter state with that of @src: slow_gro is copied,
 * the conntrack reference @dst currently holds is put (when conntrack is
 * available), and __nf_copy() then copies the rest including nf_trace.
 */
static inline void nf_copy(struct sk_buff *dst, const struct sk_buff *src)
{
        dst->slow_gro = src->slow_gro;
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
        nf_conntrack_put(skb_nfct(dst));
#endif
        __nf_copy(dst, src, true);
}

/*
 * skb->secmark helpers.  With CONFIG_NETWORK_SECMARK the mark is copied
 * between skbs or reset to 0; without it both helpers are empty stubs.
 */
#ifdef CONFIG_NETWORK_SECMARK
/* Copy the secmark of @from into @to. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{
        to->secmark = from->secmark;
}

/* Reset the secmark of @skb to 0. */
static inline void skb_init_secmark(struct sk_buff *skb)
{
        skb->secmark = 0;
}
#else
/* Stub: no secmark field to copy. */
static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from)
{ }

/* Stub: no secmark field to initialise. */
static inline void skb_init_secmark(struct sk_buff *skb)
{ }
#endif

/*
 * secpath_exists - report whether @skb has a SKB_EXT_SEC_PATH extension
 *
 * Always 0 when CONFIG_XFRM is not set.
 */
static inline int secpath_exists(const struct sk_buff *skb)
{
        int present = 0;

#ifdef CONFIG_XFRM
        present = skb_ext_exist(skb, SKB_EXT_SEC_PATH);
#endif
        return present;
}

/*
 * skb_irq_freeable - check that @skb has nothing attached
 *
 * Returns true only when @skb has no destructor, no secpath, no conntrack
 * entry, no dst reference and no frag list.  The checks run in that order
 * and stop at the first one that fails.
 */
static inline bool skb_irq_freeable(const struct sk_buff *skb)
{
        if (skb->destructor)
                return false;
        if (secpath_exists(skb))
                return false;
        if (skb_nfct(skb))
                return false;
        if (skb->_skb_refdst)
                return false;

        return !skb_has_frag_list(skb);
}

/* Store @queue_mapping in skb->queue_mapping as is. */
static inline void skb_set_queue_mapping(struct sk_buff *skb, u16 queue_mapping)
{
        skb->queue_mapping = queue_mapping;
}

/* Return the raw skb->queue_mapping value. */
static inline u16 skb_get_queue_mapping(const struct sk_buff *skb)
{
        return skb->queue_mapping;
}

/* Copy the raw queue_mapping value from @from to @to. */
static inline void skb_copy_queue_mapping(struct sk_buff *to, const struct sk_buff *from)
{
        to->queue_mapping = from->queue_mapping;
}

/*
 * The rx queue is kept in skb->queue_mapping biased by one, so that a
 * stored value of 0 means "no rx queue recorded".
 */
static inline void skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
{
        skb->queue_mapping = rx_queue + 1;
}

/*
 * Undo the bias applied by skb_record_rx_queue().  The result is only
 * meaningful when skb_rx_queue_recorded() is true.
 */
static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
{
        return skb->queue_mapping - 1;
}

/* True when skb->queue_mapping is non-zero. */
static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
{
        return skb->queue_mapping != 0;
}

/* Store @val in skb->dst_pending_confirm. */
static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val)
{
        skb->dst_pending_confirm = val;
}

/* True when skb->dst_pending_confirm is non-zero. */
static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb)
{
        return skb->dst_pending_confirm != 0;
}

/*
 * skb_sec_path - look up the SKB_EXT_SEC_PATH extension of @skb
 *
 * Returns whatever skb_ext_find() yields for that extension, or NULL when
 * CONFIG_XFRM is not set.
 */
static inline struct sec_path *skb_sec_path(const struct sk_buff *skb)
{
        struct sec_path *sp = NULL;

#ifdef CONFIG_XFRM
        sp = skb_ext_find(skb, SKB_EXT_SEC_PATH);
#endif
        return sp;
}

/* An skb counts as GSO when its shared info carries a non-zero gso_size. */
static inline bool skb_is_gso(const struct sk_buff *skb)
{
        return skb_shinfo(skb)->gso_size != 0;
}

/*
 * GSO type predicates.
 * Note: each should be called only if skb_is_gso(skb) is true.
 */

/* True when gso_type has SKB_GSO_TCPV6 set. */
static inline bool skb_is_gso_v6(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) != 0;
}

/* True when gso_type has SKB_GSO_SCTP set. */
static inline bool skb_is_gso_sctp(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->gso_type & SKB_GSO_SCTP) != 0;
}

/* True when gso_type has SKB_GSO_TCPV4 or SKB_GSO_TCPV6 set. */
static inline bool skb_is_gso_tcp(const struct sk_buff *skb)
{
        return (skb_shinfo(skb)->gso_type &
                (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)) != 0;
}

static inline void skb_gso_reset(struct sk_buff *skb)
{
        skb_shinfo(skb)->gso_size = 0;
        skb_shinfo(skb)->gso_segs = 0;
        skb_shinfo(skb)->gso_type = 0;
}

/*
 * Grow gso_size by @increment.  A gso_size of GSO_BY_FRAGS triggers a
 * one-time warning and is left untouched.
 */
static inline void skb_increase_gso_size(struct skb_shared_info *shinfo,
                                         u16 increment)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size += increment;
}

/*
 * Shrink gso_size by @decrement.  A gso_size of GSO_BY_FRAGS triggers a
 * one-time warning and is left untouched.
 */
static inline void skb_decrease_gso_size(struct skb_shared_info *shinfo,
                                         u16 decrement)
{
        if (!WARN_ON_ONCE(shinfo->gso_size == GSO_BY_FRAGS))
                shinfo->gso_size -= decrement;
}

void __skb_warn_lro_forwarding(const struct sk_buff *skb);

/*
 * skb_warn_if_lro - detect an LRO-merged skb
 *
 * LRO sets gso_size but leaves gso_type at zero, whereas real GSO always
 * sets gso_type.  A non-linear skb showing that combination is reported
 * through __skb_warn_lro_forwarding() and true is returned; any other skb
 * yields false.
 */
static inline bool skb_warn_if_lro(const struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);

        if (!skb_is_nonlinear(skb))
                return false;
        if (shinfo->gso_size == 0)
                return false;

        if (unlikely(shinfo->gso_type == 0)) {
                __skb_warn_lro_forwarding(skb);
                return true;
        }
        return false;
}

/*
 * CHECKSUM_COMPLETE is not supported here, so it is downgraded to
 * CHECKSUM_NONE; every other ip_summed value is left alone.
 */
static inline void skb_forward_csum(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_COMPLETE)
                return;

        skb->ip_summed = CHECKSUM_NONE;
}

/**
 * skb_checksum_none_assert - make sure skb ip_summed is CHECKSUM_NONE
 * @skb: skb to check
 *
 * Fresh skbs have their ip_summed set to CHECKSUM_NONE.
 * Instead of forcing ip_summed to CHECKSUM_NONE, we can
 * use this helper to document places where we make this assertion.
 * The check itself is a DEBUG_NET_WARN_ON_ONCE(); @skb is never modified.
 */
static inline void skb_checksum_none_assert(const struct sk_buff *skb)
{
        DEBUG_NET_WARN_ON_ONCE(skb->ip_summed != CHECKSUM_NONE);
}

bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off);

int skb_checksum_setup(struct sk_buff *skb, bool recalculate);
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb));

/**
 * skb_head_is_locked - Determine if the skb->head is locked down
 * @skb: skb to check
 *
 * The head of an skb built around a head frag can be removed as long as
 * the skb is not cloned.  This function returns true if the head is
 * locked down, either because it was allocated via kmalloc (head_frag is
 * not set) or because the skb is a clone with multiple references to the
 * head.
 */
static inline bool skb_head_is_locked(const struct sk_buff *skb)
{
        if (!skb->head_frag)
                return true;

        return skb_cloned(skb) != 0;
}

/* Local Checksum Offload.
 * Compute outer checksum based on the assumption that the
 * inner checksum will be offloaded later.
 * See Documentation/networking/checksum-offloads.rst for
 * explanation of how this works.
 * Fill in outer checksum adjustment (e.g. with sum of outer
 * pseudo-header) before calling.
 * Also ensure that inner checksum is in linear data area.
 *
 * @skb: buffer whose transport header, checksum start and csum_offset
 *       are already set up
 *
 * Returns the partial checksum over the bytes from the transport header
 * up to (not including) the checksum start, seeded with the complement of
 * the 16-bit value stored at csum_start + skb->csum_offset.
 */
static inline __wsum lco_csum(struct sk_buff *skb)
{
        unsigned char *csum_start = skb_checksum_start(skb);
        unsigned char *l4_hdr = skb_transport_header(skb);
        __wsum partial;

        /* Start with complement of inner checksum adjustment */
        partial = ~csum_unfold(*(__force __sum16 *)(csum_start +
                                                    skb->csum_offset));

        /* Add in checksum of our headers (incl. outer checksum
         * adjustment filled in by caller) and return result.
         */
        return csum_partial(l4_hdr, csum_start - l4_hdr, partial);
}

/* Return the redirected flag of @skb. */
static inline bool skb_is_redirected(const struct sk_buff *skb)
{
        return skb->redirected;
}

/*
 * Mark @skb as redirected.  With CONFIG_NET_REDIRECT the direction is
 * recorded in from_ingress, and the timestamp is cleared through
 * skb_clear_tstamp() when the skb comes from ingress.
 */
static inline void skb_set_redirected(struct sk_buff *skb, bool from_ingress)
{
        skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
        if (skb->from_ingress)
                skb_clear_tstamp(skb);
#endif
}

/* Clear the redirected flag; from_ingress is not touched. */
static inline void skb_reset_redirect(struct sk_buff *skb)
{
        skb->redirected = 0;
}

/*
 * Same as skb_set_redirected(), except that the timestamp is never
 * cleared.
 */
static inline void skb_set_redirected_noclear(struct sk_buff *skb,
                                              bool from_ingress)
{
        skb->redirected = 1;
#ifdef CONFIG_NET_REDIRECT
        skb->from_ingress = from_ingress;
#endif
}

/*
 * Return the csum_not_inet flag of @skb; always false when CONFIG_IP_SCTP
 * is disabled.
 */
static inline bool skb_csum_is_sctp(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_IP_SCTP)
        return skb->csum_not_inet;
#else
        return 0;
#endif
}

/*
 * Set ip_summed to CHECKSUM_NONE and, when CONFIG_IP_SCTP is enabled,
 * clear the csum_not_inet flag as well.
 */
static inline void skb_reset_csum_not_inet(struct sk_buff *skb)
{
        skb->ip_summed = CHECKSUM_NONE;
#if IS_ENABLED(CONFIG_IP_SCTP)
        skb->csum_not_inet = 0;
#endif
}

/* Store @kcov_handle in @skb; the body is empty without CONFIG_KCOV. */
static inline void skb_set_kcov_handle(struct sk_buff *skb,
                                       const u64 kcov_handle)
{
#ifdef CONFIG_KCOV
        skb->kcov_handle = kcov_handle;
#endif
}

/* Return the kcov handle stored in @skb, or 0 without CONFIG_KCOV. */
static inline u64 skb_get_kcov_handle(struct sk_buff *skb)
{
#ifdef CONFIG_KCOV
        return skb->kcov_handle;
#else
        return 0;
#endif
}

/* Set the pp_recycle flag of @skb; the body is empty without CONFIG_PAGE_POOL. */
static inline void skb_mark_for_recycle(struct sk_buff *skb)
{
#ifdef CONFIG_PAGE_POOL
        skb->pp_recycle = 1;
#endif
}

ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
                             ssize_t maxsize, gfp_t gfp);

#endif        /* __KERNEL__ */
#endif        /* _LINUX_SKBUFF_H */































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.h
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 */

#ifndef _EXT4_EXTENTS_STATUS_H
#define _EXT4_EXTENTS_STATUS_H

/*
 * Turn on ES_DEBUG__ to get lots of info about extent status operations.
 * With it defined es_debug() expands to printk(); otherwise it expands to
 * no_printk() with the same arguments.
 */
#ifdef ES_DEBUG__
#define es_debug(fmt, ...)        printk(fmt, ##__VA_ARGS__)
#else
#define es_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * With ES_AGGRESSIVE_TEST defined, the result of es caching will be
 * checked with old map_block's result.
 *
 * NOTE(review): only ES_AGGRESSIVE_TEST__ (with trailing underscores) is
 * defined here, which presumably leaves the check disabled -- confirm
 * against the users of ES_AGGRESSIVE_TEST.
 */
#define ES_AGGRESSIVE_TEST__

/*
 * These flags live in the high bits of extent_status.es_pblk
 *
 * The enumerators are bit numbers within the flag field; ES_FLAGS, being
 * last, is the number of flag bits.
 */
enum {
        ES_WRITTEN_B,
        ES_UNWRITTEN_B,
        ES_DELAYED_B,
        ES_HOLE_B,
        ES_REFERENCED_B,
        ES_FLAGS
};

/* Bit position in es_pblk at which the flag field starts. */
#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS)
/* Selects the flag field of es_pblk; ~ES_MASK selects the block number. */
#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT)

/*
 * Besides EXTENT_STATUS_REFERENCED, all these extent type masks
 * are exclusive, only one type can be set at a time.
 *
 * The values are relative to the flag field, i.e. they have to be shifted
 * left by ES_SHIFT before being combined with es_pblk.
 */
#define EXTENT_STATUS_WRITTEN        (1 << ES_WRITTEN_B)
#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B)
#define EXTENT_STATUS_DELAYED        (1 << ES_DELAYED_B)
#define EXTENT_STATUS_HOLE        (1 << ES_HOLE_B)
#define EXTENT_STATUS_REFERENCED        (1 << ES_REFERENCED_B)

/* All type bits, i.e. every status bit except EXTENT_STATUS_REFERENCED. */
#define ES_TYPE_MASK        ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \
                          EXTENT_STATUS_UNWRITTEN | \
                          EXTENT_STATUS_DELAYED | \
                          EXTENT_STATUS_HOLE))

/*
 * True when exactly one bit is set in @type (non-zero power of two).
 * The argument is evaluated more than once.
 */
#define ES_TYPE_VALID(type)        ((type) && !((type) & ((type) - 1)))

struct ext4_sb_info;
struct ext4_extent;

/*
 * One extent of the extent status tree.  es_pblk holds both the physical
 * block number (low bits) and the status flags (the top ES_FLAGS bits);
 * use the ext4_es_*() helpers below rather than reading it directly.
 */
struct extent_status {
        struct rb_node rb_node;
        ext4_lblk_t es_lblk;        /* first logical block extent covers */
        ext4_lblk_t es_len;        /* length of extent in block */
        ext4_fsblk_t es_pblk;        /* first physical block, plus status flags */
};

/* A red-black tree of extent_status entries plus a one-entry lookup cache. */
struct ext4_es_tree {
        struct rb_root root;
        struct extent_status *cache_es;        /* recently accessed extent */
};

/*
 * Extent status statistics.
 * NOTE(review): the meaning of the individual counters is inferred from
 * their names only (cache hits/misses, scan times, total and shrinkable
 * entry counts) -- confirm against the code that updates them.
 */
struct ext4_es_stats {
        unsigned long es_stats_shrunk;
        struct percpu_counter es_stats_cache_hits;
        struct percpu_counter es_stats_cache_misses;
        u64 es_stats_scan_time;
        u64 es_stats_max_scan_time;
        struct percpu_counter es_stats_all_cnt;
        struct percpu_counter es_stats_shk_cnt;
};

/*
 * Pending cluster reservations for bigalloc file systems
 *
 * A cluster with a pending reservation is a logical cluster shared by at
 * least one extent in the extents status tree with delayed and unwritten
 * status and at least one other written or unwritten extent.  The
 * reservation is said to be pending because a cluster reservation would
 * have to be taken in the event all blocks in the cluster shared with
 * written or unwritten extents were deleted while the delayed and
 * unwritten blocks remained.
 *
 * The set of pending cluster reservations is an auxiliary data structure
 * used with the extents status tree to implement reserved cluster/block
 * accounting for bigalloc file systems.  The set is kept in memory and
 * records all pending cluster reservations.
 *
 * Its primary function is to avoid the need to read extents from the
 * disk when invalidating pages as a result of a truncate, punch hole, or
 * collapse range operation.  Page invalidation requires a decrease in the
 * reserved cluster count if it results in the removal of all delayed
 * and unwritten extents (blocks) from a cluster that is not shared with a
 * written or unwritten extent, and no decrease otherwise.  Determining
 * whether the cluster is shared can be done by searching for a pending
 * reservation on it.
 *
 * Secondarily, it provides a potentially faster method for determining
 * whether the reserved cluster count should be increased when a physical
 * cluster is deallocated as a result of a truncate, punch hole, or
 * collapse range operation.  The necessary information is also present
 * in the extents status tree, but might be more rapidly accessed in
 * the pending reservation set in many cases due to smaller size.
 *
 * The pending cluster reservation set is implemented as a red-black tree
 * with the goal of minimizing per page search time overhead.
 */

/* One pending cluster reservation, linked into an ext4_pending_tree. */
struct pending_reservation {
        struct rb_node rb_node;
        ext4_lblk_t lclu;        /* presumably the logical cluster number -- confirm */
};

/* Root of the red-black tree holding the pending reservations. */
struct ext4_pending_tree {
        struct rb_root root;
};

extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);

extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                                  ext4_lblk_t len, ext4_fsblk_t pblk,
                                  unsigned int status,
                                  bool delalloc_reserve_used);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t len, ext4_fsblk_t pblk,
                                 unsigned int status);
extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                                  ext4_lblk_t len);
extern void ext4_es_find_extent_range(struct inode *inode,
                                      int (*match_fn)(struct extent_status *es),
                                      ext4_lblk_t lblk, ext4_lblk_t end,
                                      struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                                 ext4_lblk_t *next_lblk,
                                 struct extent_status *es);
extern bool ext4_es_scan_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end);
extern bool ext4_es_scan_clu(struct inode *inode,
                             int (*matching_fn)(struct extent_status *es),
                             ext4_lblk_t lblk);

static inline unsigned int ext4_es_status(struct extent_status *es)
{
        return es->es_pblk >> ES_SHIFT;
}

static inline unsigned int ext4_es_type(struct extent_status *es)
{
        return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK;
}

/* Type predicates: each returns 1 when the corresponding type bit is set, else 0. */

static inline int ext4_es_is_written(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_WRITTEN);
}

static inline int ext4_es_is_unwritten(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN);
}

static inline int ext4_es_is_delayed(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_DELAYED);
}

static inline int ext4_es_is_hole(struct extent_status *es)
{
        return !!(ext4_es_type(es) & EXTENT_STATUS_HOLE);
}

/* An extent is "mapped" when it is either written or unwritten. */
static inline int ext4_es_is_mapped(struct extent_status *es)
{
        if (ext4_es_is_written(es))
                return 1;

        return ext4_es_is_unwritten(es);
}

static inline void ext4_es_set_referenced(struct extent_status *es)
{
        es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT;
}

static inline void ext4_es_clear_referenced(struct extent_status *es)
{
        es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT);
}

static inline int ext4_es_is_referenced(struct extent_status *es)
{
        return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0;
}

/* Physical block number of @es, i.e. es_pblk with the flag bits stripped. */
static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es)
{
        const ext4_fsblk_t block_bits = ~ES_MASK;

        return es->es_pblk & block_bits;
}

/*
 * Like ext4_es_pblock(), but the all-ones block number (~ES_MASK) is
 * reported as 0.
 */
static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es)
{
        ext4_fsblk_t pblock = ext4_es_pblock(es);

        if (pblock == ~ES_MASK)
                return 0;

        return pblock;
}

/*
 * Replace the physical block number of @es with @pb while keeping the
 * status flags that are already stored in es_pblk.  Bits of @pb that fall
 * into the flag field are dropped.
 */
static inline void ext4_es_store_pblock(struct extent_status *es,
                                        ext4_fsblk_t pb)
{
        const ext4_fsblk_t kept_flags = es->es_pblk & ES_MASK;

        es->es_pblk = (pb & ~ES_MASK) | kept_flags;
}

/*
 * Set both the physical block number and the status flags of @es.
 * Warns once when the type part of @status does not have exactly one bit
 * set; the value is stored regardless.
 */
static inline void ext4_es_store_pblock_status(struct extent_status *es,
                                               ext4_fsblk_t pb,
                                               unsigned int status)
{
        const ext4_fsblk_t flag_bits =
                ((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK;
        const ext4_fsblk_t block_bits = pb & ~ES_MASK;

        WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK));

        es->es_pblk = flag_bits | block_bits;
}

extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);

extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);

extern int __init ext4_init_pending(void);
extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
                                          ext4_lblk_t len, bool lclu_allocated,
                                          bool end_allocated);
extern void ext4_clear_inode_es(struct inode *inode);

#endif /* _EXT4_EXTENTS_STATUS_H */














































































































































































































































































































































    3 





    3 

    3 










    3 

































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_cbs.c        Credit Based Shaper
 *
 * Authors:        Vinicius Costa Gomes <vinicius.gomes@intel.com>
 */

/* Credit Based Shaper (CBS)
 * =========================
 *
 * This is a simple rate-limiting shaper aimed at TSN applications on
 * systems with known traffic workloads.
 *
 * Its algorithm is defined by the IEEE 802.1Q-2014 Specification,
 * Section 8.6.8.2, and explained in more detail in the Annex L of the
 * same specification.
 *
 * There are four tunables to be considered:
 *
 *        'idleslope': Idleslope is the rate of credits that is
 *        accumulated (in kilobits per second) when there is at least
 *        one packet waiting for transmission. Packets are transmitted
 *        when the current value of credits is equal or greater than
 *        zero. When there is no packet to be transmitted the amount of
 *        credits is set to zero. This is the main tunable of the CBS
 *        algorithm.
 *
 *        'sendslope':
 *        Sendslope is the rate of credits that is depleted (it should be a
 *        negative number of kilobits per second) when a transmission is
 *        occurring. It can be calculated as follows, (IEEE 802.1Q-2014 Section
 *        8.6.8.2 item g):
 *
 *        sendslope = idleslope - port_transmit_rate
 *
 *        'hicredit': Hicredit defines the maximum amount of credits (in
 *        bytes) that can be accumulated. Hicredit depends on the
 *        characteristics of interfering traffic,
 *        'max_interference_size' is the maximum size of any burst of
 *        traffic that can delay the transmission of a frame that is
 *        available for transmission for this traffic class, (IEEE
 *        802.1Q-2014 Annex L, Equation L-3):
 *
 *        hicredit = max_interference_size * (idleslope / port_transmit_rate)
 *
 *        'locredit': Locredit is the minimum amount of credits that can
 *        be reached. It is a function of the traffic flowing through
 *        this qdisc (IEEE 802.1Q-2014 Annex L, Equation L-2):
 *
 *        locredit = max_frame_size * (sendslope / port_transmit_rate)
 */

#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/units.h>

#include <net/netevent.h>
#include <net/netlink.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>

/*
 * All CBS qdisc instances, linked through cbs_sched_data::cbs_list so the
 * netdevice notifier can find the instance attached to a device.
 * cbs_list_lock is held around every access to the list.
 */
static LIST_HEAD(cbs_list);
static DEFINE_SPINLOCK(cbs_list_lock);

/* Private data of one CBS qdisc instance. */
struct cbs_sched_data {
        bool offload; /* last offload setting accepted by cbs_change() */
        int queue; /* queue index passed to the driver for offload */
        atomic64_t port_rate; /* in bytes/s */
        s64 last; /* timestamp in ns */
        s64 credits; /* in bytes */
        s32 locredit; /* in bytes */
        s32 hicredit; /* in bytes */
        s64 sendslope; /* in bytes/s */
        s64 idleslope; /* in bytes/s */
        struct qdisc_watchdog watchdog;
        /* Active enqueue/dequeue implementation: the _soft or _offload one */
        int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free);
        struct sk_buff *(*dequeue)(struct Qdisc *sch);
        struct Qdisc *qdisc; /* child qdisc that actually queues the packets */
        struct list_head cbs_list; /* node in the global cbs_list */
};

/*
 * Hand @skb to @child and, on success, account for it in @sch's backlog
 * and queue length.  Returns the child's enqueue result unchanged.
 */
static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                             struct Qdisc *child,
                             struct sk_buff **to_free)
{
        /* Sampled up front: @skb belongs to the child after enqueue. */
        unsigned int pkt_len = qdisc_pkt_len(skb);
        int ret = child->ops->enqueue(skb, child, to_free);

        if (ret == NET_XMIT_SUCCESS) {
                sch->qstats.backlog += pkt_len;
                sch->q.qlen++;
        }

        return ret;
}

/* Offload mode: no credit bookkeeping, just queue on the child qdisc. */
static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch,
                               struct sk_buff **to_free)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);

        return cbs_child_enqueue(skb, sch, priv->qdisc, to_free);
}

/*
 * Software mode enqueue.  Credits must not keep accumulating while the
 * queue is empty, so positive credits are dropped to zero (and the
 * timestamp refreshed) when the first packet arrives on an idle qdisc.
 */
static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch,
                            struct sk_buff **to_free)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);
        struct Qdisc *child = priv->qdisc;
        bool was_idle = sch->q.qlen == 0;

        if (was_idle && priv->credits > 0) {
                priv->credits = 0;
                priv->last = ktime_get_ns();
        }

        return cbs_child_enqueue(skb, sch, child, to_free);
}

/* Dispatch to whichever enqueue implementation is currently installed. */
static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                       struct sk_buff **to_free)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);

        return priv->enqueue(skb, sch, to_free);
}

/* Credits (bytes) gained over @timediff ns at @slope bytes/s. */
static s64 timediff_to_credits(s64 timediff, s64 slope)
{
        s64 scaled = timediff * slope;

        return div64_s64(scaled, NSEC_PER_SEC);
}

/*
 * Time in ns needed to bring @credits (bytes) back to zero at @slope
 * bytes/s; S64_MAX when @slope is zero.
 */
static s64 delay_from_credits(s64 credits, s64 slope)
{
        s64 scaled;

        if (unlikely(slope == 0))
                return S64_MAX;

        scaled = -credits * NSEC_PER_SEC;

        return div64_s64(scaled, slope);
}

/*
 * Credits (bytes) corresponding to sending @len bytes at @slope bytes/s on
 * a port running at @port_rate bytes/s; S64_MAX when @port_rate is zero.
 */
static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate)
{
        s64 scaled;

        if (unlikely(port_rate == 0))
                return S64_MAX;

        scaled = len * slope;

        return div64_s64(scaled, port_rate);
}

/*
 * Take one packet from @child.  When one is available, @sch's backlog,
 * byte/packet statistics and queue length are updated to match.  Returns
 * the packet, or NULL when the child has nothing to give.
 */
static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child)
{
        struct sk_buff *pkt = child->ops->dequeue(child);

        if (pkt) {
                qdisc_qstats_backlog_dec(sch, pkt);
                qdisc_bstats_update(sch, pkt);
                sch->q.qlen--;
        }

        return pkt;
}

static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
{
        struct cbs_sched_data *q = qdisc_priv(sch);
        struct Qdisc *qdisc = q->qdisc;
        s64 now = ktime_get_ns();
        struct sk_buff *skb;
        s64 credits;
        int len;

        /* The previous packet is still being sent */
        if (now < q->last) {
                qdisc_watchdog_schedule_ns(&q->watchdog, q->last);
                return NULL;
        }
        if (q->credits < 0) {
                credits = timediff_to_credits(now - q->last, q->idleslope);

                credits = q->credits + credits;
                q->credits = min_t(s64, credits, q->hicredit);

                if (q->credits < 0) {
                        s64 delay;

                        delay = delay_from_credits(q->credits, q->idleslope);
                        qdisc_watchdog_schedule_ns(&q->watchdog, now + delay);

                        q->last = now;

                        return NULL;
                }
        }
        skb = cbs_child_dequeue(sch, qdisc);
        if (!skb)
                return NULL;

        len = qdisc_pkt_len(skb);

        /* As sendslope is a negative number, this will decrease the
         * amount of q->credits.
         */
        credits = credits_from_len(len, q->sendslope,
                                   atomic64_read(&q->port_rate));
        credits += q->credits;

        q->credits = max_t(s64, credits, q->locredit);
        /* Estimate of the transmission of the last byte of the packet in ns */
        if (unlikely(atomic64_read(&q->port_rate) == 0))
                q->last = now;
        else
                q->last = now + div64_s64(len * NSEC_PER_SEC,
                                          atomic64_read(&q->port_rate));

        return skb;
}

/* Offload mode: the hardware does the shaping, just drain the child qdisc. */
static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);

        return cbs_child_dequeue(sch, priv->qdisc);
}

/* Dispatch to whichever dequeue implementation is currently installed. */
static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);

        return priv->dequeue(sch);
}

/*
 * Netlink attribute policy: TCA_CBS_PARMS is declared with the size of
 * struct tc_cbs_qopt.
 */
static const struct nla_policy cbs_policy[TCA_CBS_MAX + 1] = {
        [TCA_CBS_PARMS]        = { .len = sizeof(struct tc_cbs_qopt) },
};

/*
 * Switch @q back to the software shaper and ask the driver to turn the
 * hardware offload off.  Does nothing when offload is not active.  A
 * driver failure is only logged.
 */
static void cbs_disable_offload(struct net_device *dev,
                                struct cbs_sched_data *q)
{
        struct tc_cbs_qopt_offload cbs = { };
        const struct net_device_ops *ops;

        if (!q->offload)
                return;

        /* From here on the software implementation handles the traffic. */
        q->enqueue = cbs_enqueue_soft;
        q->dequeue = cbs_dequeue_soft;

        ops = dev->netdev_ops;
        if (!ops->ndo_setup_tc)
                return;

        cbs.enable = 0;
        cbs.queue = q->queue;

        if (ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs) < 0)
                pr_warn("Couldn't disable CBS offload for queue %d\n",
                        cbs.queue);
}

static int cbs_enable_offload(struct net_device *dev, struct cbs_sched_data *q,
                              const struct tc_cbs_qopt *opt,
                              struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        struct tc_cbs_qopt_offload cbs = { };
        int err;

        if (!ops->ndo_setup_tc) {
                NL_SET_ERR_MSG(extack, "Specified device does not support cbs offload");
                return -EOPNOTSUPP;
        }

        cbs.queue = q->queue;

        cbs.enable = 1;
        cbs.hicredit = opt->hicredit;
        cbs.locredit = opt->locredit;
        cbs.idleslope = opt->idleslope;
        cbs.sendslope = opt->sendslope;

        err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_CBS, &cbs);
        if (err < 0) {
                NL_SET_ERR_MSG(extack, "Specified device failed to setup cbs hardware offload");
                return err;
        }

        q->enqueue = cbs_enqueue_offload;
        q->dequeue = cbs_dequeue_offload;

        return 0;
}

static void cbs_set_port_rate(struct net_device *dev, struct cbs_sched_data *q)
{
        struct ethtool_link_ksettings ecmd;
        int speed = SPEED_10;
        s64 port_rate;
        int err;

        err = __ethtool_get_link_ksettings(dev, &ecmd);
        if (err < 0)
                goto skip;

        if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
                speed = ecmd.base.speed;

skip:
        port_rate = speed * 1000 * BYTES_PER_KBIT;

        atomic64_set(&q->port_rate, port_rate);
        netdev_dbg(dev, "cbs: set %s's port_rate to: %lld, linkspeed: %d\n",
                   dev->name, (long long)atomic64_read(&q->port_rate),
                   ecmd.base.speed);
}

/*
 * Netdevice notifier: on NETDEV_UP and NETDEV_CHANGE, refresh the port
 * rate of the first CBS instance on cbs_list that is attached to the
 * affected device.  Always returns NOTIFY_DONE.
 */
static int cbs_dev_notifier(struct notifier_block *nb, unsigned long event,
                            void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct cbs_sched_data *match = NULL;
        struct cbs_sched_data *q;

        ASSERT_RTNL();

        switch (event) {
        case NETDEV_UP:
        case NETDEV_CHANGE:
                break;
        default:
                return NOTIFY_DONE;
        }

        spin_lock(&cbs_list_lock);
        list_for_each_entry(q, &cbs_list, cbs_list) {
                if (qdisc_dev(q->qdisc) == dev) {
                        match = q;
                        break;
                }
        }
        spin_unlock(&cbs_list_lock);

        /* The rate update itself runs without cbs_list_lock held. */
        if (match)
                cbs_set_port_rate(dev, match);

        return NOTIFY_DONE;
}

/*
 * Apply a CBS configuration supplied via netlink.
 *
 * @opt is parsed against cbs_policy and must carry TCA_CBS_PARMS.  When the
 * request asks for offload, it is enabled through cbs_enable_offload() and
 * any failure is returned without touching the stored parameters.  Otherwise
 * the port rate is refreshed and offload is disabled.  On success the
 * requested parameters are stored in the private data, with the slopes
 * scaled by BYTES_PER_KBIT.
 *
 * Returns 0 on success or a negative errno.
 */
static int cbs_change(struct Qdisc *sch, struct nlattr *opt,
                      struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[TCA_CBS_MAX + 1];
        struct net_device *dev = qdisc_dev(sch);
        struct cbs_sched_data *q = qdisc_priv(sch);
        struct tc_cbs_qopt *parms;
        int ret;

        ret = nla_parse_nested_deprecated(attrs, TCA_CBS_MAX, opt, cbs_policy,
                                          extack);
        if (ret < 0)
                return ret;

        if (!attrs[TCA_CBS_PARMS]) {
                NL_SET_ERR_MSG(extack, "Missing CBS parameter which are mandatory");
                return -EINVAL;
        }

        parms = nla_data(attrs[TCA_CBS_PARMS]);

        if (parms->offload) {
                ret = cbs_enable_offload(dev, q, parms, extack);
                if (ret < 0)
                        return ret;
        } else {
                cbs_set_port_rate(dev, q);
                cbs_disable_offload(dev, q);
        }

        /* Nothing failed: remember the parameters that are now in effect. */
        WRITE_ONCE(q->hicredit, parms->hicredit);
        WRITE_ONCE(q->locredit, parms->locredit);
        WRITE_ONCE(q->idleslope, parms->idleslope * BYTES_PER_KBIT);
        WRITE_ONCE(q->sendslope, parms->sendslope * BYTES_PER_KBIT);
        WRITE_ONCE(q->offload, parms->offload);

        return 0;
}

/*
 * Set up a new CBS qdisc instance.
 *
 * Options are mandatory.  A default pfifo child qdisc is created, the
 * instance is added to cbs_list, the software enqueue/dequeue handlers are
 * installed and the watchdog is initialised.  The supplied options are then
 * applied through cbs_change().
 *
 * Returns 0 on success, -EINVAL when @opt is missing, -ENOMEM when the child
 * qdisc cannot be created, or the error returned by cbs_change().
 */
static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
                    struct netlink_ext_ack *extack)
{
        struct net_device *dev = qdisc_dev(sch);
        struct cbs_sched_data *priv = qdisc_priv(sch);
        struct Qdisc *child;

        if (!opt) {
                NL_SET_ERR_MSG(extack, "Missing CBS qdisc options  which are mandatory");
                return -EINVAL;
        }

        child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
                                  sch->handle, extack);
        priv->qdisc = child;
        if (!child)
                return -ENOMEM;

        spin_lock(&cbs_list_lock);
        list_add(&priv->cbs_list, &cbs_list);
        spin_unlock(&cbs_list_lock);

        qdisc_hash_add(child, false);

        /* Index of this qdisc's tx queue relative to the device's queue 0. */
        priv->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);

        /* Start with the software handlers installed. */
        priv->enqueue = cbs_enqueue_soft;
        priv->dequeue = cbs_dequeue_soft;

        qdisc_watchdog_init(&priv->watchdog, sch);

        return cbs_change(sch, opt, extack);
}

/*
 * Tear down a CBS qdisc instance: cancel the watchdog, disable offload,
 * remove the instance from cbs_list and drop the child qdisc reference.
 */
static void cbs_destroy(struct Qdisc *sch)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);

        /* No child qdisc means there is nothing to undo here. */
        if (!priv->qdisc)
                return;

        qdisc_watchdog_cancel(&priv->watchdog);
        cbs_disable_offload(qdisc_dev(sch), priv);

        spin_lock(&cbs_list_lock);
        list_del(&priv->cbs_list);
        spin_unlock(&cbs_list_lock);

        qdisc_put(priv->qdisc);
}

static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct cbs_sched_data *q = qdisc_priv(sch);
        struct tc_cbs_qopt opt = { };
        struct nlattr *nest;

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (!nest)
                goto nla_put_failure;

        opt.hicredit = READ_ONCE(q->hicredit);
        opt.locredit = READ_ONCE(q->locredit);
        opt.sendslope = div64_s64(READ_ONCE(q->sendslope), BYTES_PER_KBIT);
        opt.idleslope = div64_s64(READ_ONCE(q->idleslope), BYTES_PER_KBIT);
        opt.offload = READ_ONCE(q->offload);

        if (nla_put(skb, TCA_CBS_PARMS, sizeof(opt), &opt))
                goto nla_put_failure;

        return nla_nest_end(skb, nest);

nla_put_failure:
        nla_nest_cancel(skb, nest);
        return -1;
}

/*
 * Fill in @tcm for the single CBS class (class 1).
 *
 * Returns -ENOENT for any other class, or when no child qdisc is attached;
 * 0 otherwise.
 */
static int cbs_dump_class(struct Qdisc *sch, unsigned long cl,
                          struct sk_buff *skb, struct tcmsg *tcm)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);

        /* There is only one class. */
        if (cl != 1)
                return -ENOENT;

        if (!priv->qdisc)
                return -ENOENT;

        tcm->tcm_handle |= TC_H_MIN(1);
        tcm->tcm_info = priv->qdisc->handle;

        return 0;
}

/*
 * Replace the child qdisc with @new and hand the previous child back through
 * @old.  When @new is NULL a default pfifo is created instead; if that also
 * fails, noop_qdisc is used.  Always returns 0.
 */
static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
                     struct Qdisc **old, struct netlink_ext_ack *extack)
{
        struct cbs_sched_data *priv = qdisc_priv(sch);
        struct Qdisc *child = new;

        if (!child) {
                child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
                                          sch->handle, NULL);
                if (!child)
                        child = &noop_qdisc;
        }

        *old = qdisc_replace(sch, child, &priv->qdisc);
        return 0;
}

/*
 * Class op: return the child qdisc stored in the CBS private data.
 * @arg (the class handle) is not consulted.
 */
static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg)
{
        struct cbs_sched_data *q = qdisc_priv(sch);

        return q->qdisc;
}

/*
 * Class op: look up a class by id.  Always returns 1, regardless of
 * @classid.
 */
static unsigned long cbs_find(struct Qdisc *sch, u32 classid)
{
        return 1;
}

/*
 * Class op: walk the classes of this qdisc.  Class 1 is reported to
 * @walker through tc_qdisc_stats_dump(), unless the walker has already
 * asked to stop.
 */
static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
        if (walker->stop)
                return;

        tc_qdisc_stats_dump(sch, 1, walker);
}

/* Class operations for CBS, referenced from cbs_qdisc_ops.cl_ops. */
static const struct Qdisc_class_ops cbs_class_ops = {
        .graft                =        cbs_graft,
        .leaf                =        cbs_leaf,
        .find                =        cbs_find,
        .walk                =        cbs_walk,
        .dump                =        cbs_dump_class,
};

/*
 * Qdisc operations for the "cbs" scheduler.  This table is what
 * cbs_module_init() passes to register_qdisc().
 */
static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
        .id                =        "cbs",
        .cl_ops                =        &cbs_class_ops,
        .priv_size        =        sizeof(struct cbs_sched_data),
        .enqueue        =        cbs_enqueue,
        .dequeue        =        cbs_dequeue,
        .peek                =        qdisc_peek_dequeued,
        .init                =        cbs_init,
        .reset                =        qdisc_reset_queue,
        .destroy        =        cbs_destroy,
        .change                =        cbs_change,
        .dump                =        cbs_dump,
        .owner                =        THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("cbs");

/* Netdevice notifier block that routes device events to cbs_dev_notifier(). */
static struct notifier_block cbs_device_notifier = {
        .notifier_call = cbs_dev_notifier,
};

/*
 * Module init: register the netdevice notifier, then the qdisc ops.
 * If the qdisc registration fails, the notifier is unregistered again.
 *
 * Returns 0 on success or the error from the failing registration.
 */
static int __init cbs_module_init(void)
{
        int ret = register_netdevice_notifier(&cbs_device_notifier);

        if (ret)
                return ret;

        ret = register_qdisc(&cbs_qdisc_ops);
        if (!ret)
                return 0;

        /* Roll back the notifier registration. */
        unregister_netdevice_notifier(&cbs_device_notifier);
        return ret;
}

/*
 * Module exit: unregister the qdisc ops first, then the netdevice notifier
 * (the reverse of the registration order in cbs_module_init()).
 */
static void __exit cbs_module_exit(void)
{
        unregister_qdisc(&cbs_qdisc_ops);
        unregister_netdevice_notifier(&cbs_device_notifier);
}
module_init(cbs_module_init)
module_exit(cbs_module_exit)
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Credit Based shaper");

















































   46 




    4 


















































   96 








    4 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filemap

#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILEMAP_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/device.h>
#include <linux/kdev_t.h>
#include <linux/errseq.h>

/*
 * Event class for per-folio page cache events.
 *
 * Records the folio's pfn, index and order, plus the inode number and device
 * of the mapping's host inode.  The device is taken from the superblock when
 * the inode has one, otherwise from i_rdev.  The printed "ofs" is
 * index << PAGE_SHIFT.
 *
 * NOTE(review): the shift here is done on the unsigned long field without
 * the loff_t cast that mm_filemap_op_page_cache_range uses - confirm whether
 * truncation on 32-bit builds matters for consumers of this event.
 */
DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(unsigned long, pfn)
                __field(unsigned long, i_ino)
                __field(unsigned long, index)
                __field(dev_t, s_dev)
                __field(unsigned char, order)
        ),

        TP_fast_assign(
                __entry->pfn = folio_pfn(folio);
                __entry->i_ino = folio->mapping->host->i_ino;
                __entry->index = folio->index;
                if (folio->mapping->host->i_sb)
                        __entry->s_dev = folio->mapping->host->i_sb->s_dev;
                else
                        __entry->s_dev = folio->mapping->host->i_rdev;
                __entry->order = folio_order(folio);
        ),

        TP_printk("dev %d:%d ino %lx pfn=0x%lx ofs=%lu order=%u",
                MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                __entry->i_ino,
                __entry->pfn,
                __entry->index << PAGE_SHIFT,
                __entry->order)
);

/* Instance of the mm_filemap_op_page_cache class; takes the folio concerned. */
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
        TP_PROTO(struct folio *folio),
        TP_ARGS(folio)
        );

/* Instance of the mm_filemap_op_page_cache class; takes the folio concerned. */
DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
        TP_PROTO(struct folio *folio),
        TP_ARGS(folio)
        );

/*
 * Event class for page cache operations that cover a range of page indices
 * in one mapping.
 *
 * Records the host inode number and device (from the superblock when the
 * inode has one, otherwise i_rdev) together with the first and last page
 * index.  The printed "ofs" is a byte range: it starts at
 * index << PAGE_SHIFT and ends at the last byte of page last_index, both
 * computed as loff_t.
 */
DECLARE_EVENT_CLASS(mm_filemap_op_page_cache_range,

        TP_PROTO(
                struct address_space *mapping,
                pgoff_t index,
                pgoff_t last_index
        ),

        TP_ARGS(mapping, index, last_index),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(unsigned long, index)
                __field(unsigned long, last_index)
        ),

        TP_fast_assign(
                __entry->i_ino = mapping->host->i_ino;
                if (mapping->host->i_sb)
                        __entry->s_dev =
                                mapping->host->i_sb->s_dev;
                else
                        __entry->s_dev = mapping->host->i_rdev;
                __entry->index = index;
                __entry->last_index = last_index;
        ),

        TP_printk(
                "dev=%d:%d ino=%lx ofs=%lld-%lld",
                MAJOR(__entry->s_dev),
                MINOR(__entry->s_dev), __entry->i_ino,
                ((loff_t)__entry->index) << PAGE_SHIFT,
                ((((loff_t)__entry->last_index + 1) << PAGE_SHIFT) - 1)
        )
);

/*
 * Instance of the mm_filemap_op_page_cache_range class; takes the mapping
 * and the first and last page index of the range.
 */
DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_get_pages,
        TP_PROTO(
                struct address_space *mapping,
                pgoff_t index,
                pgoff_t last_index
        ),
        TP_ARGS(mapping, index, last_index)
);

/*
 * Instance of the mm_filemap_op_page_cache_range class; takes the mapping
 * and the first and last page index of the range.
 */
DEFINE_EVENT(mm_filemap_op_page_cache_range, mm_filemap_map_pages,
        TP_PROTO(
                struct address_space *mapping,
                pgoff_t index,
                pgoff_t last_index
        ),
        TP_ARGS(mapping, index, last_index)
);

/*
 * Trace event taking a mapping and a single page index.
 *
 * Records the host inode number, its device (from the superblock when the
 * inode has one, otherwise i_rdev) and the page index.  The printed "ofs"
 * is index << PAGE_SHIFT, computed as loff_t.
 */
TRACE_EVENT(mm_filemap_fault,
        TP_PROTO(struct address_space *mapping, pgoff_t index),

        TP_ARGS(mapping, index),

        TP_STRUCT__entry(
                __field(unsigned long, i_ino)
                __field(dev_t, s_dev)
                __field(unsigned long, index)
        ),

        TP_fast_assign(
                __entry->i_ino = mapping->host->i_ino;
                if (mapping->host->i_sb)
                        __entry->s_dev =
                                mapping->host->i_sb->s_dev;
                else
                        __entry->s_dev = mapping->host->i_rdev;
                __entry->index = index;
        ),

        TP_printk(
                "dev=%d:%d ino=%lx ofs=%lld",
                MAJOR(__entry->s_dev),
                MINOR(__entry->s_dev), __entry->i_ino,
                ((loff_t)__entry->index) << PAGE_SHIFT
        )
);

/*
 * Trace event taking a mapping and an errseq_t value.
 *
 * Records the host inode number, its device (from the superblock when the
 * inode has one, otherwise i_rdev) and the errseq_t value passed in.
 */
TRACE_EVENT(filemap_set_wb_err,
                TP_PROTO(struct address_space *mapping, errseq_t eseq),

                TP_ARGS(mapping, eseq),

                TP_STRUCT__entry(
                        __field(unsigned long, i_ino)
                        __field(dev_t, s_dev)
                        __field(errseq_t, errseq)
                ),

                TP_fast_assign(
                        __entry->i_ino = mapping->host->i_ino;
                        __entry->errseq = eseq;
                        if (mapping->host->i_sb)
                                __entry->s_dev = mapping->host->i_sb->s_dev;
                        else
                                __entry->s_dev = mapping->host->i_rdev;
                ),

                TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
                        MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
                        __entry->i_ino, __entry->errseq)
);

/*
 * Trace event taking a struct file and an "old" errseq_t value.
 *
 * Records the file pointer, the inode number and device of the file's
 * mapping host (device from the superblock when the inode has one,
 * otherwise i_rdev), the caller-supplied old value, and the value of
 * file->f_wb_err at the time the event fires as "new".
 */
TRACE_EVENT(file_check_and_advance_wb_err,
                TP_PROTO(struct file *file, errseq_t old),

                TP_ARGS(file, old),

                TP_STRUCT__entry(
                        __field(struct file *, file)
                        __field(unsigned long, i_ino)
                        __field(dev_t, s_dev)
                        __field(errseq_t, old)
                        __field(errseq_t, new)
                ),

                TP_fast_assign(
                        __entry->file = file;
                        __entry->i_ino = file->f_mapping->host->i_ino;
                        if (file->f_mapping->host->i_sb)
                                __entry->s_dev =
                                        file->f_mapping->host->i_sb->s_dev;
                        else
                                __entry->s_dev =
                                        file->f_mapping->host->i_rdev;
                        __entry->old = old;
                        __entry->new = file->f_wb_err;
                ),

                TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
                        __entry->file, MAJOR(__entry->s_dev),
                        MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
                        __entry->new)
);
#endif /* _TRACE_FILEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 









    4 








    4 







    4 



    4 














    4 


    4 














    4 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
// SPDX-License-Identifier: GPL-2.0
/*
 *  fs/ext4/extents_status.c
 *
 * Written by Yongqiang Yang <xiaoqiangnk@gmail.com>
 * Modified by
 *        Allison Henderson <achender@linux.vnet.ibm.com>
 *        Hugh Dickins <hughd@google.com>
 *        Zheng Liu <wenqing.lz@taobao.com>
 *
 * Ext4 extents status tree core functions.
 */
#include <linux/list_sort.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include "ext4.h"

#include <trace/events/ext4.h>

/*
 * According to previous discussion in Ext4 Developer Workshop, we
 * will introduce a new structure called io tree to track all extent
 * status in order to solve some problems that we have met
 * (e.g. Reservation space warning), and provide extent-level locking.
 * Delay extent tree is the first step to achieve this goal.  It was
 * originally built by Yongqiang Yang.  At that time it was called delay
 * extent tree, whose goal was only to track delayed extents in memory to
 * simplify the implementation of fiemap and bigalloc, and to introduce
 * lseek SEEK_DATA/SEEK_HOLE support.  That is why it was still called
 * delay extent tree at the first commit.  But to better describe
 * what it does, it has been renamed to extent status tree.
 *
 * Step1:
 * Currently the first step has been done.  All delayed extents are
 * tracked in the tree.  It maintains the delayed extent when a delayed
 * allocation is issued, and the delayed extent is written out or
 * invalidated.  Therefore the implementation of fiemap and bigalloc
 * are simplified, and SEEK_DATA/SEEK_HOLE are introduced.
 *
 * The following comment describes the implementation of the extent
 * status tree and future work.
 *
 * Step2:
 * In this step all extent status are tracked by extent status tree.
 * Thus, we can first try to lookup a block mapping in this tree before
 * finding it in extent tree.  Hence, single extent cache can be removed
 * because extent status tree can do a better job.  Extents in status
 * tree are loaded on-demand.  Therefore, the extent status tree may not
 * contain all of the extents in a file.  Meanwhile we define a shrinker
 * to reclaim memory from extent status tree because fragmented extent
 * tree will make status tree cost too much memory.  written/unwritten/-
 * hole extents in the tree will be reclaimed by this shrinker when we
 * are under high memory pressure.  Delayed extents will not be
 * reclaimed because fiemap, bigalloc, and seek_data/hole need them.
 */

/*
 * Extent status tree implementation for ext4.
 *
 *
 * ==========================================================================
 * Extent status tree tracks all extent status.
 *
 * 1. Why we need to implement extent status tree?
 *
 * Without extent status tree, ext4 identifies a delayed extent by looking
 * up page cache, this has several deficiencies - complicated, buggy,
 * and inefficient code.
 *
 * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a
 * block or a range of blocks belongs to a delayed extent.
 *
 * Let us have a look at how they do without extent status tree.
 *   --        FIEMAP
 *        FIEMAP looks up page cache to identify delayed allocations from holes.
 *
 *   --        SEEK_HOLE/DATA
 *        SEEK_HOLE/DATA has the same problem as FIEMAP.
 *
 *   --        bigalloc
 *        bigalloc looks up page cache to figure out if a block is
 *        already under delayed allocation or not to determine whether
 *        quota reserving is needed for the cluster.
 *
 *   --        writeout
 *        Writeout looks up the whole page cache to see if a buffer is
 *        mapped.  If there are not very many delayed buffers, then this
 *        is time consuming.
 *
 * With the extent status tree implementation, FIEMAP, SEEK_HOLE/DATA,
 * bigalloc and writeout can figure out if a block or a range of
 * blocks is under delayed allocation (belongs to a delayed extent) or
 * not by searching the extent tree.
 *
 *
 * ==========================================================================
 * 2. Ext4 extent status tree implementation
 *
 *   --        extent
 *        An extent is a range of blocks which are contiguous logically and
 *        physically.  Unlike an extent in the extent tree, this extent in
 *        ext4 is an in-memory struct; there is no corresponding on-disk data.
 *        There is no limit on the length of an extent, so an extent can
 *        contain as many blocks as are contiguous logically and physically.
 *
 *   --        extent status tree
 *        Every inode has an extent status tree and all allocation blocks
 *        are added to the tree with different status.  The extents in the
 *        tree are ordered by logical block number.
 *
 *   --        operations on an extent status tree
 *        There are three important operations on a delayed extent tree: finding
 *        the next extent, adding an extent (a range of blocks) and removing an
 *        extent.
 *
 *   --        race on an extent status tree
 *        Extent status tree is protected by inode->i_es_lock.
 *
 *   --        memory consumption
 *      Fragmented extent tree will make extent status tree cost too much
 *      memory.  Hence, we will reclaim written/unwritten/hole extents from
 *      the tree under a heavy memory pressure.
 *
 *
 * ==========================================================================
 * 3. Performance analysis
 *
 *   --        overhead
 *        1. There is a cache extent for write access, so if writes are
 *        not very random, adding space operations are in O(1) time.
 *
 *   --        gain
 *        2. Code is much simpler, more readable, more maintainable and
 *        more efficient.
 *
 *
 * ==========================================================================
 * 4. TODO list
 *
 *   -- Refactor delayed space reservation
 *
 *   -- Extent-level locking
 */

/* Slab cache for struct extent_status; created in ext4_init_es(). */
static struct kmem_cache *ext4_es_cachep;
/* Slab cache used by __alloc_pending() and __free_pending(). */
static struct kmem_cache *ext4_pending_cachep;

/* Forward declarations of static helpers that are used before they are defined. */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
                              struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end, int *reserved,
                              struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei);
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                            ext4_lblk_t len,
                            struct pending_reservation **prealloc);

/*
 * Create the slab cache used for extent_status entries.
 *
 * Returns 0 on success, or -ENOMEM if the cache could not be created.
 */
int __init ext4_init_es(void)
{
        ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT);

        return ext4_es_cachep ? 0 : -ENOMEM;
}

/* Destroy the extent_status slab cache that ext4_init_es() created. */
void ext4_exit_es(void)
{
        kmem_cache_destroy(ext4_es_cachep);
}

/* Reset @tree to an empty rb-tree with no cached extent. */
void ext4_es_init_tree(struct ext4_es_tree *tree)
{
        tree->cache_es = NULL;
        tree->root = RB_ROOT;
}

#ifdef ES_DEBUG__
/*
 * Debug helper, only built when ES_DEBUG__ is defined: print every extent
 * in the inode's extent status tree, walking it with rb_first()/rb_next().
 */
static void ext4_es_print_tree(struct inode *inode)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct extent_status *es;
        struct rb_node *node;

        printk(KERN_DEBUG "status extents for inode %lu:", inode->i_ino);
        for (node = rb_first(&tree->root); node; node = rb_next(node)) {
                es = rb_entry(node, struct extent_status, rb_node);
                printk(KERN_DEBUG " [%u/%u) %llu %x",
                       es->es_lblk, es->es_len,
                       ext4_es_pblock(es), ext4_es_status(es));
        }
        printk(KERN_DEBUG "\n");
}
#else
#define ext4_es_print_tree(inode)
#endif

/*
 * Return the last logical block covered by @es.  Triggers BUG_ON() if
 * es_lblk + es_len wraps around.
 */
static inline ext4_lblk_t ext4_es_end(struct extent_status *es)
{
        ext4_lblk_t past_end = es->es_lblk + es->es_len;

        BUG_ON(past_end < es->es_lblk);
        return past_end - 1;
}

/*
 * Search the tree for the extent that contains @lblk.  If no extent
 * contains it, return the first extent that starts after @lblk, or NULL
 * when there is none.
 */
static struct extent_status *__es_tree_search(struct rb_root *root,
                                              ext4_lblk_t lblk)
{
        struct extent_status *last = NULL;
        struct rb_node *cur;

        for (cur = root->rb_node; cur; ) {
                last = rb_entry(cur, struct extent_status, rb_node);
                if (lblk < last->es_lblk) {
                        cur = cur->rb_left;
                        continue;
                }
                if (lblk <= ext4_es_end(last))
                        return last;
                cur = cur->rb_right;
        }

        /* Empty tree: nothing at or after @lblk. */
        if (!last)
                return NULL;

        /* The last node visited starts after @lblk, so it is the answer. */
        if (lblk < last->es_lblk)
                return last;

        /* The last node visited ends before @lblk; take its successor. */
        if (lblk > ext4_es_end(last)) {
                cur = rb_next(&last->rb_node);
                if (!cur)
                        return NULL;
                return rb_entry(cur, struct extent_status, rb_node);
        }

        return NULL;
}

/*
 * __es_find_extent_range - find extent with specified status within block
 *                          range or next extent following block range in
 *                          extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block defining start of range
 * @end - logical block defining end of range
 * @es - extent found, if any
 *
 * Find the first extent within the block range specified by @lblk and @end
 * in the extents status tree that satisfies @matching_fn.  If a match
 * is found, it's returned in @es.  If no match is found, an extent is
 * returned in @es whose es_lblk, es_len, and es_pblk components are 0.
 *
 * The starting candidate is tree->cache_es when that extent contains @lblk,
 * otherwise the result of __es_tree_search().  If the starting candidate
 * itself matches it is returned without being compared against @end, so the
 * returned extent may start after @end.  Later candidates are only
 * considered while they start at or before @end.  A matching extent is
 * also stored back into tree->cache_es.
 *
 * Callers in this file invoke this function with i_es_lock held.
 */
static void __es_find_extent_range(struct inode *inode,
                                   int (*matching_fn)(struct extent_status *es),
                                   ext4_lblk_t lblk, ext4_lblk_t end,
                                   struct extent_status *es)
{
        struct ext4_es_tree *tree = NULL;
        struct extent_status *es1 = NULL;
        struct rb_node *node;

        WARN_ON(es == NULL);
        WARN_ON(end < lblk);

        tree = &EXT4_I(inode)->i_es_tree;

        /* see if the extent has been cached */
        es->es_lblk = es->es_len = es->es_pblk = 0;
        es1 = READ_ONCE(tree->cache_es);
        if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
                es_debug("%u cached by [%u/%u) %llu %x\n",
                         lblk, es1->es_lblk, es1->es_len,
                         ext4_es_pblock(es1), ext4_es_status(es1));
                goto out;
        }

        /* cache miss: extent containing lblk, or the next one after it */
        es1 = __es_tree_search(&tree->root, lblk);

out:
        /*
         * The starting candidate does not match: walk forward until a
         * matching extent is found or one starts beyond @end.  If the walk
         * runs off the end of the tree, es1 is left pointing at the last
         * (non-matching) extent and the check below rejects it.
         */
        if (es1 && !matching_fn(es1)) {
                while ((node = rb_next(&es1->rb_node)) != NULL) {
                        es1 = rb_entry(node, struct extent_status, rb_node);
                        if (es1->es_lblk > end) {
                                es1 = NULL;
                                break;
                        }
                        if (matching_fn(es1))
                                break;
                }
        }

        /* report the match and remember it as the cached extent */
        if (es1 && matching_fn(es1)) {
                WRITE_ONCE(tree->cache_es, es1);
                es->es_lblk = es1->es_lblk;
                es->es_len = es1->es_len;
                es->es_pblk = es1->es_pblk;
        }

}

/*
 * Locked wrapper around __es_find_extent_range() for external use.
 *
 * @es is zeroed first, so it reads as "nothing found" when the lookup is
 * skipped because EXT4_FC_REPLAY is set in s_mount_state.  Otherwise the
 * search runs with the inode's i_es_lock held for reading.
 */
void ext4_es_find_extent_range(struct inode *inode,
                               int (*matching_fn)(struct extent_status *es),
                               ext4_lblk_t lblk, ext4_lblk_t end,
                               struct extent_status *es)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        es->es_pblk = 0;
        es->es_len = 0;
        es->es_lblk = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        trace_ext4_es_find_extent_range_enter(inode, lblk);

        read_lock(&ei->i_es_lock);
        __es_find_extent_range(inode, matching_fn, lblk, end, es);
        read_unlock(&ei->i_es_lock);

        trace_ext4_es_find_extent_range_exit(inode, es);
}

/*
 * __es_scan_range - search block range for block with specified status
 *                   in extents status tree
 *
 * @inode - file containing the range
 * @matching_fn - pointer to function that matches extents with desired status
 * @start - logical block defining start of range
 * @end - logical block defining end of range
 *
 * Returns true if at least one block in the range [@start, @end] belongs
 * to an extent that satisfies @matching_fn, and false if not.  Should only
 * be called by code that has taken i_es_lock.
 */
static bool __es_scan_range(struct inode *inode,
                            int (*matching_fn)(struct extent_status *es),
                            ext4_lblk_t start, ext4_lblk_t end)
{
        struct extent_status found;

        __es_find_extent_range(inode, matching_fn, start, end, &found);

        /* no matching extent in the tree */
        if (!found.es_len)
                return false;

        /* the matching extent covers @start */
        if (found.es_lblk <= start &&
            start < found.es_lblk + found.es_len)
                return true;

        /* otherwise it must begin inside the range to count */
        return start <= found.es_lblk && found.es_lblk <= end;
}
/*
 * Locked wrapper around __es_scan_range() for external use.  Returns false
 * without searching when EXT4_FC_REPLAY is set in s_mount_state.
 */
bool ext4_es_scan_range(struct inode *inode,
                        int (*matching_fn)(struct extent_status *es),
                        ext4_lblk_t lblk, ext4_lblk_t end)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        bool hit;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return false;

        read_lock(&ei->i_es_lock);
        hit = __es_scan_range(inode, matching_fn, lblk, end);
        read_unlock(&ei->i_es_lock);

        return hit;
}

/*
 * __es_scan_clu - search cluster for block with specified status in
 *                 extents status tree
 *
 * @inode - file containing the cluster
 * @matching_fn - pointer to function that matches extents with desired status
 * @lblk - logical block in cluster to be searched
 *
 * Scans the range that starts at EXT4_LBLK_CMASK(sbi, @lblk) and spans
 * s_cluster_ratio blocks.  Returns true if __es_scan_range() finds a block
 * in that range satisfying @matching_fn, and false if not.  Should only be
 * called by code that has taken i_es_lock.
 */
static bool __es_scan_clu(struct inode *inode,
                          int (*matching_fn)(struct extent_status *es),
                          ext4_lblk_t lblk)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t first = EXT4_LBLK_CMASK(sbi, lblk);
        ext4_lblk_t last = first + sbi->s_cluster_ratio - 1;

        return __es_scan_range(inode, matching_fn, first, last);
}

/*
 * Locked wrapper around __es_scan_clu() for external use.  Returns false
 * without searching when EXT4_FC_REPLAY is set in s_mount_state.
 */
bool ext4_es_scan_clu(struct inode *inode,
                      int (*matching_fn)(struct extent_status *es),
                      ext4_lblk_t lblk)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        bool hit;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return false;

        read_lock(&ei->i_es_lock);
        hit = __es_scan_clu(inode, matching_fn, lblk);
        read_unlock(&ei->i_es_lock);

        return hit;
}

/*
 * Put the inode on the superblock's s_es_list if it is not already there,
 * and bump s_es_nr_inode when it is added.
 */
static void ext4_es_list_add(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        /* Unlocked test first; it is repeated under s_es_lock below. */
        if (!list_empty(&ei->i_es_list))
                return;

        spin_lock(&sbi->s_es_lock);
        if (!list_empty(&ei->i_es_list))
                goto out_unlock;

        list_add_tail(&ei->i_es_list, &sbi->s_es_list);
        sbi->s_es_nr_inode++;
out_unlock:
        spin_unlock(&sbi->s_es_lock);
}

/*
 * Take the inode off the superblock's s_es_list, if it is on it, and drop
 * s_es_nr_inode accordingly.  Warns once if the count goes negative.
 */
static void ext4_es_list_del(struct inode *inode)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);

        spin_lock(&sbi->s_es_lock);
        if (list_empty(&ei->i_es_list))
                goto out_unlock;

        list_del_init(&ei->i_es_list);
        sbi->s_es_nr_inode--;
        WARN_ON_ONCE(sbi->s_es_nr_inode < 0);
out_unlock:
        spin_unlock(&sbi->s_es_lock);
}

/*
 * Allocate a pending_reservation from ext4_pending_cachep.
 *
 * With @nofail set the object is zeroed and allocated with
 * GFP_KERNEL | __GFP_NOFAIL; otherwise it is allocated with GFP_ATOMIC,
 * is not zeroed, and the result may be NULL.
 */
static inline struct pending_reservation *__alloc_pending(bool nofail)
{
        if (nofail)
                return kmem_cache_zalloc(ext4_pending_cachep,
                                         GFP_KERNEL | __GFP_NOFAIL);

        return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
}

/* Return @pr to ext4_pending_cachep. */
static inline void __free_pending(struct pending_reservation *pr)
{
        kmem_cache_free(ext4_pending_cachep, pr);
}

/*
 * Returns true if we cannot fail to allocate memory for this extent_status
 * entry and cannot reclaim it until its status changes.  At present that
 * is exactly the delayed extents, which fiemap, bigalloc, and
 * seek_data/hole need to use.
 */
static inline bool ext4_es_must_keep(struct extent_status *es)
{
        return ext4_es_is_delayed(es);
}

/*
 * Allocate an extent_status from ext4_es_cachep.
 *
 * With @nofail set the object is zeroed and allocated with
 * GFP_KERNEL | __GFP_NOFAIL; otherwise it is allocated with GFP_ATOMIC,
 * is not zeroed, and the result may be NULL.
 */
static inline struct extent_status *__es_alloc_extent(bool nofail)
{
        if (nofail)
                return kmem_cache_zalloc(ext4_es_cachep,
                                         GFP_KERNEL | __GFP_NOFAIL);

        return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
}

/*
 * Fill in @es with the given logical start, length and pblk word, and
 * account for it in the per-inode and per-superblock counters.
 *
 * Extents that ext4_es_must_keep() reports as not reclaimable are left
 * out of the shrink counters.  The first shrinkable extent of an inode
 * also puts the inode on the list via ext4_es_list_add().
 */
static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
                ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_es_stats *stats = &EXT4_SB(inode->i_sb)->s_es_stats;

        es->es_lblk = lblk;
        es->es_len = len;
        es->es_pblk = pblk;

        /* We never try to reclaim a must kept extent, so we don't count it. */
        if (!ext4_es_must_keep(es)) {
                if (ei->i_es_shk_nr++ == 0)
                        ext4_es_list_add(inode);
                percpu_counter_inc(&stats->es_stats_shk_cnt);
        }

        ei->i_es_all_nr++;
        percpu_counter_inc(&stats->es_stats_all_cnt);
}

/* Return @es to ext4_es_cachep without touching any counters. */
static inline void __es_free_extent(struct extent_status *es)
{
        kmem_cache_free(ext4_es_cachep, es);
}

static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
{
        EXT4_I(inode)->i_es_all_nr--;
        percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);

        /* Decrease the shrink counter when we can reclaim the extent. */
        if (!ext4_es_must_keep(es)) {
                BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
                if (!--EXT4_I(inode)->i_es_shk_nr)
                        ext4_es_list_del(inode);
                percpu_counter_dec(&EXT4_SB(inode->i_sb)->
                                        s_es_stats.es_stats_shk_cnt);
        }

        __es_free_extent(es);
}

/*
 * Check whether or not two extents can be merged
 * Condition:
 *  - status type is equal
 *  - combined length does not exceed EXT_MAX_BLOCKS
 *  - logical block number is contiguous (es2 starts right after es1)
 *  - physical block number is contiguous (written/unwritten extents only)
 *
 * Returns 1 if @es1 followed by @es2 can be merged into one extent, and
 * 0 if not.
 */
static int ext4_es_can_be_merged(struct extent_status *es1,
                                 struct extent_status *es2)
{
        if (ext4_es_type(es1) != ext4_es_type(es2))
                return 0;

        /* The sum is done in 64 bits so that it cannot wrap. */
        if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) {
                /*
                 * The lengths and the limit are unsigned 32-bit values;
                 * print them with %u so large values are not shown as
                 * negative numbers.
                 */
                pr_warn("ES assertion failed when merging extents. "
                        "The sum of lengths of es1 (%u) and es2 (%u) "
                        "is bigger than allowed file size (%u)\n",
                        es1->es_len, es2->es_len, EXT_MAX_BLOCKS);
                WARN_ON(1);
                return 0;
        }

        if (((__u64) es1->es_lblk) + es1->es_len != es2->es_lblk)
                return 0;

        if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) &&
            (ext4_es_pblock(es1) + es1->es_len == ext4_es_pblock(es2)))
                return 1;

        /* Holes only need to be logically contiguous. */
        if (ext4_es_is_hole(es1))
                return 1;

        /* we need to check delayed extent */
        if (ext4_es_is_delayed(es1))
                return 1;

        return 0;
}

/*
 * Try to fold @es into its predecessor in the tree.
 *
 * On a merge the predecessor grows by es->es_len, inherits the referenced
 * flag, and @es is erased from the tree and freed.  Returns the surviving
 * extent: the predecessor after a merge, @es otherwise.
 */
static struct extent_status *
ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es)
{
        struct rb_node *prev_node = rb_prev(&es->rb_node);
        struct extent_status *prev;

        if (!prev_node)
                return es;

        prev = rb_entry(prev_node, struct extent_status, rb_node);
        if (!ext4_es_can_be_merged(prev, es))
                return es;

        prev->es_len += es->es_len;
        if (ext4_es_is_referenced(es))
                ext4_es_set_referenced(prev);
        rb_erase(&es->rb_node, &EXT4_I(inode)->i_es_tree.root);
        ext4_es_free_extent(inode, es);

        return prev;
}

/*
 * Try to fold the successor of @es in the tree into @es.
 *
 * On a merge @es grows by the successor's length, inherits its referenced
 * flag, and the successor is erased from the tree and freed.  Always
 * returns @es.
 */
static struct extent_status *
ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es)
{
        struct rb_node *next_node = rb_next(&es->rb_node);
        struct extent_status *next;

        if (!next_node)
                return es;

        next = rb_entry(next_node, struct extent_status, rb_node);
        if (!ext4_es_can_be_merged(es, next))
                return es;

        es->es_len += next->es_len;
        if (ext4_es_is_referenced(next))
                ext4_es_set_referenced(es);
        rb_erase(next_node, &EXT4_I(inode)->i_es_tree.root);
        ext4_es_free_extent(inode, next);

        return es;
}

#ifdef ES_AGGRESSIVE_TEST
#include "ext4_extents.h"        /* Needed when ES_AGGRESSIVE_TEST is defined */

/*
 * ES_AGGRESSIVE_TEST consistency check for extent-mapped inodes.
 *
 * Looks up es->es_lblk with ext4_find_extent() and warns (pr_warn) when
 * the extent about to be inserted disagrees with what was found.  It only
 * reports; nothing is changed and no error is returned.  If the lookup
 * itself fails the check is skipped.
 */
static void ext4_es_insert_extent_ext_check(struct inode *inode,
                                            struct extent_status *es)
{
        struct ext4_ext_path *path = NULL;
        struct ext4_extent *ex;
        ext4_lblk_t ee_block;
        ext4_fsblk_t ee_start;
        unsigned short ee_len;
        int depth, ee_status, es_status;

        path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE);
        if (IS_ERR(path))
                return;

        depth = ext_depth(inode);
        ex = path[depth].p_ext;

        if (ex) {

                ee_block = le32_to_cpu(ex->ee_block);
                ee_start = ext4_ext_pblock(ex);
                ee_len = ext4_ext_get_actual_len(ex);

                /* 1 means unwritten, 0 means written, for both sides */
                ee_status = ext4_ext_is_unwritten(ex) ? 1 : 0;
                es_status = ext4_es_is_unwritten(es) ? 1 : 0;

                /*
                 * Make sure ex and es do not overlap when we try to insert
                 * a delayed/hole extent.
                 */
                if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
                        if (in_range(es->es_lblk, ee_block, ee_len)) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu we can find an extent "
                                        "at block [%d/%d/%llu/%c], but we "
                                        "want to add a delayed/hole extent "
                                        "[%d/%d/%llu/%x]\n",
                                        inode->i_ino, ee_block, ee_len,
                                        ee_start, ee_status ? 'u' : 'w',
                                        es->es_lblk, es->es_len,
                                        ext4_es_pblock(es), ext4_es_status(es));
                        }
                        goto out;
                }

                /*
                 * We don't check ee_block == es->es_lblk, etc. because es
                 * might be a part of whole extent, vice versa.
                 */
                if (es->es_lblk < ee_block ||
                    ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
                                ext4_es_pblock(es), es_status ? 'u' : 'w');
                        goto out;
                }

                /* written/unwritten state must agree */
                if (ee_status ^ es_status) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "ex_status [%d/%d/%llu/%c] != "
                                "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
                                ee_block, ee_len, ee_start,
                                ee_status ? 'u' : 'w', es->es_lblk, es->es_len,
                                ext4_es_pblock(es), es_status ? 'u' : 'w');
                }
        } else {
                /*
                 * We can't find an extent on disk.  So we need to make sure
                 * that we don't want to add a written/unwritten extent.
                 */
                if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "can't find an extent at block %d but we want "
                                "to add a written/unwritten extent "
                                "[%d/%d/%llu/%x]\n", inode->i_ino,
                                es->es_lblk, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                }
        }
out:
        ext4_free_ext_path(path);
}

/*
 * ES_AGGRESSIVE_TEST consistency check for inodes that do not have
 * EXT4_INODE_EXTENTS set.
 *
 * Maps the extent's range with ext4_ind_map_blocks() and warns (pr_warn)
 * when the extent about to be inserted disagrees with the result.  It only
 * reports, except for the BUG() on an extent that is neither delayed, hole
 * nor written while blocks are mapped.  A negative return from the mapping
 * call is ignored.
 */
static void ext4_es_insert_extent_ind_check(struct inode *inode,
                                            struct extent_status *es)
{
        struct ext4_map_blocks map;
        int retval;

        /*
         * Here we call ext4_ind_map_blocks to lookup a block mapping because
         * 'Indirect' structure is defined in indirect.c.  So we couldn't
         * access direct/indirect tree from outside.  It is too dirty to define
         * this function in indirect.c file.
         */

        map.m_lblk = es->es_lblk;
        map.m_len = es->es_len;

        retval = ext4_ind_map_blocks(NULL, inode, &map, 0);
        if (retval > 0) {
                /* blocks are mapped for (part of) the range */
                if (ext4_es_is_delayed(es) || ext4_es_is_hole(es)) {
                        /*
                         * We want to add a delayed/hole extent but this
                         * block has been allocated.
                         */
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can find blocks but we want to add a "
                                "delayed/hole extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                } else if (ext4_es_is_written(es)) {
                        /* mapped length and start block must both agree */
                        if (retval != es->es_len) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu retval %d != es_len %d\n",
                                        inode->i_ino, retval, es->es_len);
                                return;
                        }
                        if (map.m_pblk != ext4_es_pblock(es)) {
                                pr_warn("ES insert assertion failed for "
                                        "inode: %lu m_pblk %llu != "
                                        "es_pblk %llu\n",
                                        inode->i_ino, map.m_pblk,
                                        ext4_es_pblock(es));
                                return;
                        }
                } else {
                        /*
                         * We don't need to check unwritten extent because
                         * indirect-based file doesn't have it.
                         */
                        BUG();
                }
        } else if (retval == 0) {
                /* nothing is mapped, so a written extent is inconsistent */
                if (ext4_es_is_written(es)) {
                        pr_warn("ES insert assertion failed for inode: %lu "
                                "We can't find the block but we want to add "
                                "a written extent [%d/%d/%llu/%x]\n",
                                inode->i_ino, es->es_lblk, es->es_len,
                                ext4_es_pblock(es), ext4_es_status(es));
                        return;
                }
        }
}

/*
 * ES_AGGRESSIVE_TEST entry point: run the consistency check that fits the
 * inode's mapping format (extent-based when EXT4_INODE_EXTENTS is set,
 * indirect otherwise).
 */
static inline void ext4_es_insert_extent_check(struct inode *inode,
                                               struct extent_status *es)
{
        /*
         * The caller holds i_data_sem, so there is no race condition to
         * worry about here.
         */
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode)->i_data_sem));

        if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ext4_es_insert_extent_ind_check(inode, es);
                return;
        }

        ext4_es_insert_extent_ext_check(inode, es);
}
#else
/* No-op stub used when ES_AGGRESSIVE_TEST is not defined. */
static inline void ext4_es_insert_extent_check(struct inode *inode,
                                               struct extent_status *es)
{
}
#endif

/*
 * Insert @newes into the inode's extent status tree.
 *
 * @inode - file whose tree is modified
 * @newes - extent to add; it must not overlap any extent already in the
 *          tree (an overlap hits BUG())
 * @prealloc - optional preallocated extent_status to use when a new tree
 *             node is needed; if NULL, one is allocated with
 *             __es_alloc_extent(false)
 *
 * While descending the tree, @newes is merged into an existing extent
 * whenever ext4_es_can_be_merged() allows it, in which case no new node
 * (and hence not @prealloc) is used.  On success tree->cache_es is set to
 * the extent that now holds the inserted range.
 *
 * Returns 0 on success, or -ENOMEM if a new node was needed and could not
 * be allocated.
 */
static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
                              struct extent_status *prealloc)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node **p = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct extent_status *es;

        while (*p) {
                parent = *p;
                es = rb_entry(parent, struct extent_status, rb_node);

                if (newes->es_lblk < es->es_lblk) {
                        /* newes lies before es: try to prepend it to es */
                        if (ext4_es_can_be_merged(newes, es)) {
                                /*
                                 * Here we can modify es_lblk directly
                                 * because it isn't overlapped.
                                 */
                                es->es_lblk = newes->es_lblk;
                                es->es_len += newes->es_len;
                                if (ext4_es_is_written(es) ||
                                    ext4_es_is_unwritten(es))
                                        ext4_es_store_pblock(es,
                                                             newes->es_pblk);
                                es = ext4_es_try_to_merge_left(inode, es);
                                goto out;
                        }
                        p = &(*p)->rb_left;
                } else if (newes->es_lblk > ext4_es_end(es)) {
                        /* newes lies after es: try to append it to es */
                        if (ext4_es_can_be_merged(es, newes)) {
                                es->es_len += newes->es_len;
                                es = ext4_es_try_to_merge_right(inode, es);
                                goto out;
                        }
                        p = &(*p)->rb_right;
                } else {
                        /* newes starts inside an existing extent */
                        BUG();
                        return -EINVAL;
                }
        }

        /* No merge was possible: link a new node at the slot found above. */
        if (prealloc)
                es = prealloc;
        else
                es = __es_alloc_extent(false);
        if (!es)
                return -ENOMEM;
        ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
                            newes->es_pblk);

        rb_link_node(&es->rb_node, parent, p);
        rb_insert_color(&es->rb_node, &tree->root);

out:
        tree->cache_es = es;
        return 0;
}

/*
 * ext4_es_insert_extent() adds information to an inode's extent
 * status tree.
 *
 * @inode - file the extent belongs to
 * @lblk - first logical block of the extent
 * @len - number of blocks in the extent; nothing is done when it is 0
 * @pblk - physical block stored together with @status in the new extent
 * @status - extent status bits; EXTENT_STATUS_DELAYED is not expected here
 *           and triggers a one-time warning
 * @delalloc_reserve_used - passed through to ext4_da_update_reserve_space()
 *
 * Under i_es_lock (write), __es_remove_extent() is called on
 * [@lblk, @lblk + @len - 1], then the new extent is inserted and, when
 * applicable, pending reservations are revised.  If a step fails, the lock
 * is dropped, the objects that step needs are preallocated with the nofail
 * variants of the allocators, and the whole sequence is retried.  The
 * function therefore does not return an error.
 *
 * Does nothing when EXT4_FC_REPLAY is set in s_mount_state.
 */
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
                           ext4_lblk_t len, ext4_fsblk_t pblk,
                           unsigned int status, bool delalloc_reserve_used)
{
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        int err1 = 0, err2 = 0, err3 = 0;
        int resv_used = 0, pending = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
        struct pending_reservation *pr = NULL;
        bool revise_pending = false;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
                 lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);

        if (!len)
                return;

        BUG_ON(end < lblk);
        WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED);

        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_insert_extent(inode, &newes);

        ext4_es_insert_extent_check(inode, &newes);

        /*
         * Pending reservations are only revised with a cluster ratio above
         * one, the DELALLOC mount option set, and a written or unwritten
         * extent being inserted.
         */
        revise_pending = sbi->s_cluster_ratio > 1 &&
                         test_opt(inode->i_sb, DELALLOC) &&
                         (status & (EXTENT_STATUS_WRITTEN |
                                    EXTENT_STATUS_UNWRITTEN));
retry:
        /*
         * On the first pass all err values are 0 and nothing is
         * preallocated.  After a failure, preallocate (outside the lock,
         * nofail) what the failed step and the steps after it may need.
         */
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
        if ((err1 || err2 || err3 < 0) && revise_pending && !pr)
                pr = __alloc_pending(true);
        write_lock(&EXT4_I(inode)->i_es_lock);

        err1 = __es_remove_extent(inode, lblk, end, &resv_used, es1);
        if (err1 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es1) {
                /* zero-allocated, so es_len == 0 means it was never filled in */
                if (!es1->es_len)
                        __es_free_extent(es1);
                es1 = NULL;
        }

        err2 = __es_insert_extent(inode, &newes, es2);
        /* Failing to record an extent that need not be kept is tolerated. */
        if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
                err2 = 0;
        if (err2 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es2) {
                if (!es2->es_len)
                        __es_free_extent(es2);
                es2 = NULL;
        }

        if (revise_pending) {
                err3 = __revise_pending(inode, lblk, len, &pr);
                if (err3 < 0)
                        goto error;
                /* free the preallocated reservation if it is still here */
                if (pr) {
                        __free_pending(pr);
                        pr = NULL;
                }
                /* a non-negative result is added to resv_used below */
                pending = err3;
        }
error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
        /*
         * Reduce the reserved cluster count to reflect successful deferred
         * allocation of delayed allocated clusters or direct allocation of
         * clusters discovered to be delayed allocated.  Once allocated, a
         * cluster is not included in the reserved count.
         *
         * When direct allocating (from fallocate, filemap, DIO, or clusters
         * allocated when delalloc has been disabled by ext4_nonda_switch())
         * an extent either 1) contains delayed blocks but start with
         * non-delayed allocated blocks (e.g. hole) or 2) contains non-delayed
         * allocated blocks which belong to delayed allocated clusters when
         * bigalloc feature is enabled, quota has already been claimed by
         * ext4_mb_new_blocks(), so release the quota reservations made for
         * any previously delayed allocated clusters instead of claim them
         * again.
         */
        resv_used += pending;
        if (resv_used)
                ext4_da_update_reserve_space(inode, resv_used,
                                             delalloc_reserve_used);

        /* Some step failed: go back and retry with preallocated objects. */
        if (err1 || err2 || err3 < 0)
                goto retry;

        ext4_es_print_tree(inode);
        return;
}

/*
 * ext4_es_cache_extent() inserts information into the extent status
 * tree if and only if there isn't information about the range in
 * question already.
 *
 * The range counts as free when __es_tree_search() for @lblk finds no
 * extent, or finds one that starts after the end of the range.  The
 * result of the insertion is not checked.  Does nothing when
 * EXT4_FC_REPLAY is set in s_mount_state or when @len is 0.
 */
void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t len, ext4_fsblk_t pblk,
                          unsigned int status)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        ext4_lblk_t end = lblk + len - 1;
        struct extent_status newes;
        struct extent_status *found;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, pblk, status);
        trace_ext4_es_cache_extent(inode, &newes);

        if (len == 0)
                return;

        BUG_ON(end < lblk);

        write_lock(&ei->i_es_lock);

        found = __es_tree_search(&ei->i_es_tree.root, lblk);
        if (found == NULL || found->es_lblk > end)
                __es_insert_extent(inode, &newes, NULL);

        write_unlock(&ei->i_es_lock);
}

/*
 * ext4_es_lookup_extent - look up the extent covering a logical block
 *
 * @inode - file to search
 * @lblk - logical block to look up
 * @next_lblk - if not NULL and an extent is found, receives the first block
 *              of the following extent in the tree, or 0 if there is none
 * @es - filled with a copy of the extent found; zeroed when nothing is found
 *
 * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks.
 * The tree's cached extent is tried before the rb-tree is walked.  A hit
 * marks the extent referenced and bumps the cache hit counter; a miss bumps
 * the miss counter.  Always reports a miss while the EXT4_FC_REPLAY mount
 * state flag is set.
 *
 * Return: 1 on found, 0 on not
 */
int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
                          ext4_lblk_t *next_lblk,
                          struct extent_status *es)
{
        struct ext4_es_tree *tree;
        struct ext4_es_stats *stats;
        struct extent_status *hit = NULL;
        struct rb_node *node;
        int found = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return 0;

        trace_ext4_es_lookup_extent_enter(inode, lblk);
        es_debug("lookup extent in block %u\n", lblk);

        tree = &EXT4_I(inode)->i_es_tree;
        read_lock(&EXT4_I(inode)->i_es_lock);

        /* start from an empty result */
        es->es_pblk = 0;
        es->es_len = 0;
        es->es_lblk = 0;

        /* try the cached extent first, then fall back to a tree walk */
        hit = READ_ONCE(tree->cache_es);
        if (hit && in_range(lblk, hit->es_lblk, hit->es_len)) {
                es_debug("%u cached by [%u/%u)\n",
                         lblk, hit->es_lblk, hit->es_len);
                found = 1;
        } else {
                for (node = tree->root.rb_node; node; ) {
                        hit = rb_entry(node, struct extent_status, rb_node);
                        if (lblk < hit->es_lblk) {
                                node = node->rb_left;
                        } else if (lblk > ext4_es_end(hit)) {
                                node = node->rb_right;
                        } else {
                                found = 1;
                                break;
                        }
                }
        }

        stats = &EXT4_SB(inode->i_sb)->s_es_stats;
        if (!found) {
                percpu_counter_inc(&stats->es_stats_cache_misses);
        } else {
                BUG_ON(!hit);
                es->es_lblk = hit->es_lblk;
                es->es_len = hit->es_len;
                es->es_pblk = hit->es_pblk;
                if (!ext4_es_is_referenced(hit))
                        ext4_es_set_referenced(hit);
                percpu_counter_inc(&stats->es_stats_cache_hits);

                if (next_lblk) {
                        /* report where the next extent in the tree starts */
                        node = rb_next(&hit->rb_node);
                        if (node) {
                                hit = rb_entry(node, struct extent_status,
                                               rb_node);
                                *next_lblk = hit->es_lblk;
                        } else {
                                *next_lblk = 0;
                        }
                }
        }

        read_unlock(&EXT4_I(inode)->i_es_lock);

        trace_ext4_es_lookup_extent_exit(inode, es, found);
        return found;
}

/*
 * Running state used while counting the reservations released by removing
 * a block range from the extent status tree.  Set up by init_rsvd(),
 * updated by count_rsvd() and consumed by get_rsvd().  Apart from
 * ndelayed, init_rsvd() only initializes these fields when the cluster
 * ratio is greater than 1 (bigalloc).
 */
struct rsvd_count {
        /* running tally of delayed blocks (clusters with bigalloc) */
        int ndelayed;
        /* true once count_rsvd() has seen a delayed extent */
        bool first_do_lblk_found;
        /* first block of the first delayed extent seen */
        ext4_lblk_t first_do_lblk;
        /* last block of the most recent delayed extent seen */
        ext4_lblk_t last_do_lblk;
        /* extent containing the block left of the removed range, or NULL */
        struct extent_status *left_es;
        /* true while a partial cluster at the end of an extent is tracked */
        bool partial;
        /* logical cluster of the partial cluster being tracked */
        ext4_lblk_t lclu;
};

/*
 * init_rsvd - initialize reserved count data before removing block range
 *               in file from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @es - pointer to first extent in range
 * @rc - pointer to reserved count data
 *
 * Assumes es is not NULL.  Only rc->ndelayed is touched unless the cluster
 * ratio is greater than 1.
 */
static void init_rsvd(struct inode *inode, ext4_lblk_t lblk,
                      struct extent_status *es, struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct rb_node *prev;

        rc->ndelayed = 0;

        /* everything below is bigalloc-only bookkeeping */
        if (sbi->s_cluster_ratio <= 1)
                return;

        /* no delayed block seen yet and no partial cluster to track */
        rc->first_do_lblk_found = false;
        rc->partial = false;

        /*
         * record the extent containing the block to the left of the
         * region to be removed, if any
         */
        if (lblk > es->es_lblk) {
                rc->left_es = es;
                return;
        }

        prev = rb_prev(&es->rb_node);
        if (prev)
                rc->left_es = rb_entry(prev, struct extent_status, rb_node);
        else
                rc->left_es = NULL;
}

/*
 * count_rsvd - count the clusters containing delayed blocks in a range
 *                within an extent and add to the running tally in rsvd_count
 *
 * @inode - file containing extent
 * @lblk - first block in range
 * @len - length of range in blocks; a WARN_ON fires for a delayed extent
 *        if it is not positive
 * @es - pointer to extent containing clusters to be counted
 * @rc - pointer to reserved count data
 *
 * Does nothing unless @es is a delayed extent.  When the cluster ratio is
 * 1, @len is added to rc->ndelayed directly.  Otherwise the range is
 * clamped to the blocks covered by @es and counted in clusters.
 *
 * Tracks partial clusters found at the beginning and end of extents so
 * they aren't overcounted when they span adjacent extents
 */
static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len,
                       struct extent_status *es, struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t i, end, nclu;

        if (!ext4_es_is_delayed(es))
                return;

        WARN_ON(len <= 0);

        /* one block per cluster: the block count is the cluster count */
        if (sbi->s_cluster_ratio == 1) {
                rc->ndelayed += (int) len;
                return;
        }

        /* bigalloc */

        /* clamp [lblk, lblk + len - 1] to the blocks covered by es */
        i = (lblk < es->es_lblk) ? es->es_lblk : lblk;
        end = lblk + (ext4_lblk_t) len - 1;
        end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end;

        /* record the first block of the first delayed extent seen */
        if (!rc->first_do_lblk_found) {
                rc->first_do_lblk = i;
                rc->first_do_lblk_found = true;
        }

        /* update the last lblk in the region seen so far */
        rc->last_do_lblk = end;

        /*
         * if we're tracking a partial cluster and the current extent
         * doesn't start with it, count it and stop tracking
         */
        if (rc->partial && (rc->lclu != EXT4_B2C(sbi, i))) {
                rc->ndelayed++;
                rc->partial = false;
        }

        /*
         * if the first cluster doesn't start on a cluster boundary but
         * ends on one, count it
         */
        if (EXT4_LBLK_COFF(sbi, i) != 0) {
                if (end >= EXT4_LBLK_CFILL(sbi, i)) {
                        rc->ndelayed++;
                        rc->partial = false;
                        /* continue from the block after that cluster */
                        i = EXT4_LBLK_CFILL(sbi, i) + 1;
                }
        }

        /*
         * if the current cluster starts on a cluster boundary, count the
         * number of whole delayed clusters in the extent
         */
        if ((i + sbi->s_cluster_ratio - 1) <= end) {
                nclu = (end - i + 1) >> sbi->s_cluster_bits;
                rc->ndelayed += nclu;
                /* skip past the whole clusters just counted */
                i += nclu << sbi->s_cluster_bits;
        }

        /*
         * start tracking a partial cluster if there's a partial at the end
         * of the current extent and we're not already tracking one
         */
        if (!rc->partial && i <= end) {
                rc->partial = true;
                rc->lclu = EXT4_B2C(sbi, i);
        }
}

/*
 * __pr_tree_search - search for a pending cluster reservation
 *
 * @root - root of pending reservation tree
 * @lclu - logical cluster to search for
 *
 * Returns the pending reservation for the cluster identified by @lclu
 * if found.  If not, returns a reservation for the next cluster if any,
 * and if not, returns NULL.
 */
static struct pending_reservation *__pr_tree_search(struct rb_root *root,
                                                    ext4_lblk_t lclu)
{
        struct pending_reservation *last = NULL;
        struct rb_node *cur;

        for (cur = root->rb_node; cur != NULL; ) {
                last = rb_entry(cur, struct pending_reservation, rb_node);
                if (lclu == last->lclu)
                        return last;
                cur = (lclu < last->lclu) ? cur->rb_left : cur->rb_right;
        }

        /* empty tree */
        if (last == NULL)
                return NULL;

        /*
         * No exact match: last is the final node visited and its cluster
         * differs from lclu.  If it lies beyond lclu it is the next
         * reservation, otherwise its in-order successor is.
         */
        if (lclu < last->lclu)
                return last;

        cur = rb_next(&last->rb_node);
        if (cur == NULL)
                return NULL;
        return rb_entry(cur, struct pending_reservation, rb_node);
}

/*
 * get_rsvd - calculates and returns the number of cluster reservations to be
 *              released when removing a block range from the extent status tree
 *              and releases any pending reservations within the range
 *
 * @inode - file containing block range
 * @end - last block in range
 * @right_es - pointer to extent containing next block beyond end or NULL
 * @rc - pointer to reserved count data
 *
 * The number of reservations to be released is equal to the number of
 * clusters containing delayed blocks within the range, minus the number of
 * clusters still containing delayed blocks at the ends of the range, and
 * minus the number of pending reservations within the range.
 *
 * When the cluster ratio is 1 none of these adjustments are made and
 * rc->ndelayed is returned as accumulated by count_rsvd().
 *
 * Pending reservations found in the searched cluster range are erased from
 * the inode's pending tree and freed as a side effect.
 */
static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
                             struct extent_status *right_es,
                             struct rsvd_count *rc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct pending_reservation *pr;
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node *node;
        ext4_lblk_t first_lclu, last_lclu;
        bool left_delayed, right_delayed, count_pending;
        struct extent_status *es;

        if (sbi->s_cluster_ratio > 1) {
                /* count any remaining partial cluster */
                if (rc->partial)
                        rc->ndelayed++;

                /* nothing delayed was removed, so nothing to release */
                if (rc->ndelayed == 0)
                        return 0;

                first_lclu = EXT4_B2C(sbi, rc->first_do_lblk);
                last_lclu = EXT4_B2C(sbi, rc->last_do_lblk);

                /*
                 * decrease the delayed count by the number of clusters at the
                 * ends of the range that still contain delayed blocks -
                 * these clusters still need to be reserved
                 */
                left_delayed = right_delayed = false;

                /*
                 * scan leftwards from left_es over the extents that still
                 * reach the cluster holding the first delayed block removed
                 */
                es = rc->left_es;
                while (es && ext4_es_end(es) >=
                       EXT4_LBLK_CMASK(sbi, rc->first_do_lblk)) {
                        if (ext4_es_is_delayed(es)) {
                                rc->ndelayed--;
                                left_delayed = true;
                                break;
                        }
                        node = rb_prev(&es->rb_node);
                        if (!node)
                                break;
                        es = rb_entry(node, struct extent_status, rb_node);
                }
                /*
                 * the right end is skipped when the whole range lies in one
                 * cluster that has already been discounted on the left
                 */
                if (right_es && (!left_delayed || first_lclu != last_lclu)) {
                        /*
                         * start at right_es if it extends beyond end,
                         * otherwise at the extent following it
                         */
                        if (end < ext4_es_end(right_es)) {
                                es = right_es;
                        } else {
                                node = rb_next(&right_es->rb_node);
                                es = node ? rb_entry(node, struct extent_status,
                                                     rb_node) : NULL;
                        }
                        /*
                         * scan rightwards over the extents that start within
                         * the cluster holding the last delayed block removed
                         */
                        while (es && es->es_lblk <=
                               EXT4_LBLK_CFILL(sbi, rc->last_do_lblk)) {
                                if (ext4_es_is_delayed(es)) {
                                        rc->ndelayed--;
                                        right_delayed = true;
                                        break;
                                }
                                node = rb_next(&es->rb_node);
                                if (!node)
                                        break;
                                es = rb_entry(node, struct extent_status,
                                              rb_node);
                        }
                }

                /*
                 * Determine the block range that should be searched for
                 * pending reservations, if any.  Clusters on the ends of the
                 * original removed range containing delayed blocks are
                 * excluded.  They've already been accounted for and it's not
                 * possible to determine if an associated pending reservation
                 * should be released with the information available in the
                 * extents status tree.
                 */
                if (first_lclu == last_lclu) {
                        if (left_delayed | right_delayed)
                                count_pending = false;
                        else
                                count_pending = true;
                } else {
                        if (left_delayed)
                                first_lclu++;
                        if (right_delayed)
                                last_lclu--;
                        if (first_lclu <= last_lclu)
                                count_pending = true;
                        else
                                count_pending = false;
                }

                /*
                 * a pending reservation found between first_lclu and last_lclu
                 * represents an allocated cluster that contained at least one
                 * delayed block, so the delayed total must be reduced by one
                 * for each pending reservation found and released
                 */
                if (count_pending) {
                        pr = __pr_tree_search(&tree->root, first_lclu);
                        while (pr && pr->lclu <= last_lclu) {
                                rc->ndelayed--;
                                /* fetch the successor before pr is freed */
                                node = rb_next(&pr->rb_node);
                                rb_erase(&pr->rb_node, &tree->root);
                                __free_pending(pr);
                                if (!node)
                                        break;
                                pr = rb_entry(node, struct pending_reservation,
                                              rb_node);
                        }
                }
        }
        /* NOTE(review): ndelayed is a signed int returned as unsigned int */
        return rc->ndelayed;
}


/*
 * __es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @end - last block in range
 * @reserved - number of cluster reservations released
 * @prealloc - pre-allocated es to avoid memory allocation failures; handed
 *             to __es_insert_extent() when an extent has to be split
 *
 * If @reserved is not NULL and delayed allocation is enabled, counts
 * block/cluster reservations freed by removing range and if bigalloc
 * enabled cancels pending reservations as needed. Returns 0 on success,
 * error code on failure.
 *
 * @reserved is left untouched when no extent overlaps the range.  If
 * splitting an extent fails and the right-hand piece does not have to be
 * kept, 0 is returned with the extent truncated to its left-hand piece and
 * @reserved not updated.
 */
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                              ext4_lblk_t end, int *reserved,
                              struct extent_status *prealloc)
{
        struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
        struct rb_node *node;
        struct extent_status *es;
        struct extent_status orig_es;
        ext4_lblk_t len1, len2;
        ext4_fsblk_t block;
        int err = 0;
        bool count_reserved = true;
        struct rsvd_count rc;

        if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
                count_reserved = false;

        /* nothing to do if no extent overlaps [lblk, end] */
        es = __es_tree_search(&tree->root, lblk);
        if (!es)
                goto out;
        if (es->es_lblk > end)
                goto out;

        /* Simply invalidate cache_es. */
        tree->cache_es = NULL;
        if (count_reserved)
                init_rsvd(inode, lblk, es, &rc);

        /* keep a copy of the first extent: es itself is modified below */
        orig_es.es_lblk = es->es_lblk;
        orig_es.es_len = es->es_len;
        orig_es.es_pblk = es->es_pblk;

        /*
         * len1 - blocks of the first extent that precede lblk (kept)
         * len2 - blocks of the first extent that follow end (kept)
         */
        len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
        len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
        if (len1 > 0)
                es->es_len = len1;
        if (len2 > 0) {
                if (len1 > 0) {
                        /*
                         * the range lies strictly inside the extent: es
                         * keeps the left piece and a new extent is
                         * inserted for the right piece
                         */
                        struct extent_status newes;

                        newes.es_lblk = end + 1;
                        newes.es_len = len2;
                        /*
                         * placeholder physical block, replaced below for
                         * written and unwritten extents
                         */
                        block = 0x7FDEADBEEFULL;
                        if (ext4_es_is_written(&orig_es) ||
                            ext4_es_is_unwritten(&orig_es))
                                block = ext4_es_pblock(&orig_es) +
                                        orig_es.es_len - len2;
                        ext4_es_store_pblock_status(&newes, block,
                                                    ext4_es_status(&orig_es));
                        err = __es_insert_extent(inode, &newes, prealloc);
                        if (err) {
                                /* the right piece may simply be dropped */
                                if (!ext4_es_must_keep(&newes))
                                        return 0;

                                /* undo the truncation and report the error */
                                es->es_lblk = orig_es.es_lblk;
                                es->es_len = orig_es.es_len;
                                goto out;
                        }
                } else {
                        /* the range covers the head of the extent: trim it */
                        es->es_lblk = end + 1;
                        es->es_len = len2;
                        if (ext4_es_is_written(es) ||
                            ext4_es_is_unwritten(es)) {
                                block = orig_es.es_pblk + orig_es.es_len - len2;
                                ext4_es_store_pblock(es, block);
                        }
                }
                /* the removed range ends inside this extent, so we're done */
                if (count_reserved)
                        count_rsvd(inode, orig_es.es_lblk + len1,
                                   orig_es.es_len - len1 - len2, &orig_es, &rc);
                goto out_get_reserved;
        }

        /*
         * the first extent was truncated to its left piece: count what was
         * cut off and move on to the next extent
         */
        if (len1 > 0) {
                if (count_reserved)
                        count_rsvd(inode, lblk, orig_es.es_len - len1,
                                   &orig_es, &rc);
                node = rb_next(&es->rb_node);
                if (node)
                        es = rb_entry(node, struct extent_status, rb_node);
                else
                        es = NULL;
        }

        /* remove every extent that ends at or before end */
        while (es && ext4_es_end(es) <= end) {
                if (count_reserved)
                        count_rsvd(inode, es->es_lblk, es->es_len, es, &rc);
                /* fetch the successor before es is freed */
                node = rb_next(&es->rb_node);
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(inode, es);
                if (!node) {
                        es = NULL;
                        break;
                }
                es = rb_entry(node, struct extent_status, rb_node);
        }

        /* trim the head of an extent that starts in the range but ends past it */
        if (es && es->es_lblk < end + 1) {
                ext4_lblk_t orig_len = es->es_len;

                len1 = ext4_es_end(es) - end;
                if (count_reserved)
                        count_rsvd(inode, es->es_lblk, orig_len - len1,
                                   es, &rc);
                es->es_lblk = end + 1;
                es->es_len = len1;
                if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) {
                        block = es->es_pblk + orig_len - len1;
                        ext4_es_store_pblock(es, block);
                }
        }

out_get_reserved:
        /* es is passed as get_rsvd()'s right_es argument and may be NULL */
        if (count_reserved)
                *reserved = get_rsvd(inode, end, es, &rc);
out:
        return err;
}

/*
 * ext4_es_remove_extent - removes block range from extent status tree
 *
 * @inode - file containing range
 * @lblk - first block in range
 * @len - number of blocks to remove
 *
 * Reduces block/cluster reservation count and for bigalloc cancels pending
 * reservations as needed.  Does nothing while the EXT4_FC_REPLAY mount
 * state flag is set or when @len is zero.  The removal is repeated, with a
 * preallocated extent, until __es_remove_extent() reports success.
 */
void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
                           ext4_lblk_t len)
{
        struct extent_status *prealloc = NULL;
        ext4_lblk_t last;
        int released = 0;
        int ret = 0;

        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        trace_ext4_es_remove_extent(inode, lblk, len);
        es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
                 lblk, len, inode->i_ino);

        if (len == 0)
                return;

        last = lblk + len - 1;
        BUG_ON(last < lblk);

        do {
                /* after a failed pass, retry with a preallocated extent */
                if (ret && !prealloc)
                        prealloc = __es_alloc_extent(true);
                /*
                 * ext4_clear_inode() depends on us taking i_es_lock
                 * unconditionally so that we are sure __es_shrink() is done
                 * with the inode before it is reclaimed.
                 */
                write_lock(&EXT4_I(inode)->i_es_lock);
                ret = __es_remove_extent(inode, lblk, last, &released,
                                         prealloc);
                /* Free preallocated extent if it didn't get used. */
                if (prealloc) {
                        if (!prealloc->es_len)
                                __es_free_extent(prealloc);
                        prealloc = NULL;
                }
                write_unlock(&EXT4_I(inode)->i_es_lock);
        } while (ret);

        ext4_es_print_tree(inode);
        ext4_da_release_space(inode, released);
}

/*
 * __es_shrink - reclaim extent status entries from inodes on s_es_list
 *
 * @sbi - superblock info whose inode list is scanned
 * @nr_to_scan - scan budget, handed to es_reclaim_extents() by reference
 * @locked_ei - inode to skip during the list walk, or NULL; presumably one
 *              whose i_es_lock the caller already holds - TODO confirm
 *
 * Walks at most s_es_nr_inode entries of sbi->s_es_list, rotating each
 * inode visited to the tail of the list.  Inodes flagged
 * EXT4_STATE_EXT_PRECACHED are skipped on the first pass, as are
 * @locked_ei and inodes whose i_es_lock cannot be taken with a trylock.
 * If nothing was reclaimed and something was skipped, the walk is repeated
 * once with precached inodes included.  If still nothing was reclaimed,
 * @locked_ei itself is tried.  Scan time and shrunk-count statistics are
 * updated before returning.
 *
 * Returns the number of extents reclaimed.
 */
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei)
{
        struct ext4_inode_info *ei;
        struct ext4_es_stats *es_stats;
        ktime_t start_time;
        u64 scan_time;
        int nr_to_walk;
        int nr_shrunk = 0;
        int retried = 0, nr_skipped = 0;

        es_stats = &sbi->s_es_stats;
        start_time = ktime_get();

retry:
        spin_lock(&sbi->s_es_lock);
        nr_to_walk = sbi->s_es_nr_inode;
        while (nr_to_walk-- > 0) {
                if (list_empty(&sbi->s_es_list)) {
                        spin_unlock(&sbi->s_es_lock);
                        goto out;
                }
                ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info,
                                      i_es_list);
                /* Move the inode to the tail */
                list_move_tail(&ei->i_es_list, &sbi->s_es_list);

                /*
                 * Normally we try hard to avoid shrinking precached inodes,
                 * but we will as a last resort.
                 */
                if (!retried && ext4_test_inode_state(&ei->vfs_inode,
                                                EXT4_STATE_EXT_PRECACHED)) {
                        nr_skipped++;
                        continue;
                }

                /* only trylock here: s_es_lock is still held */
                if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) {
                        nr_skipped++;
                        continue;
                }
                /*
                 * Now we hold i_es_lock which protects us from inode reclaim
                 * freeing inode under us
                 */
                spin_unlock(&sbi->s_es_lock);

                nr_shrunk += es_reclaim_extents(ei, &nr_to_scan);
                write_unlock(&ei->i_es_lock);

                /* scan budget used up; s_es_lock is not held at this point */
                if (nr_to_scan <= 0)
                        goto out;
                spin_lock(&sbi->s_es_lock);
        }
        spin_unlock(&sbi->s_es_lock);

        /*
         * If we skipped any inodes, and we weren't able to make any
         * forward progress, try again to scan precached inodes.
         */
        if ((nr_shrunk == 0) && nr_skipped && !retried) {
                retried++;
                goto retry;
        }

        /* last resort: reclaim from the inode that was skipped as locked */
        if (locked_ei && nr_shrunk == 0)
                nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan);

out:
        scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
        /* running averages: three parts old value, one part new sample */
        if (likely(es_stats->es_stats_scan_time))
                es_stats->es_stats_scan_time = (scan_time +
                                es_stats->es_stats_scan_time*3) / 4;
        else
                es_stats->es_stats_scan_time = scan_time;
        if (scan_time > es_stats->es_stats_max_scan_time)
                es_stats->es_stats_max_scan_time = scan_time;
        if (likely(es_stats->es_stats_shrunk))
                es_stats->es_stats_shrunk = (nr_shrunk +
                                es_stats->es_stats_shrunk*3) / 4;
        else
                es_stats->es_stats_shrunk = nr_shrunk;

        trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time,
                             nr_skipped, retried);
        return nr_shrunk;
}

/*
 * ext4_es_count - shrinker count_objects callback
 *
 * @shrink - shrinker whose private_data is the ext4_sb_info
 * @sc - shrink control; only nr_to_scan is read, for tracing
 *
 * Returns the current (approximate, non-negative) value of the
 * es_stats_shk_cnt counter.
 */
static unsigned long ext4_es_count(struct shrinker *shrink,
                                   struct shrink_control *sc)
{
        struct ext4_sb_info *sbi = shrink->private_data;
        unsigned long nr_objs;

        nr_objs = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr_objs);

        return nr_objs;
}

/*
 * ext4_es_scan - shrinker scan_objects callback
 *
 * @shrink - shrinker whose private_data is the ext4_sb_info
 * @sc - shrink control supplying nr_to_scan
 *
 * Runs __es_shrink() with no locked inode and returns the number of
 * extents it reclaimed.  The es_stats_shk_cnt counter is sampled before
 * and after the scan for the tracepoints only.
 */
static unsigned long ext4_es_scan(struct shrinker *shrink,
                                  struct shrink_control *sc)
{
        struct ext4_sb_info *sbi = shrink->private_data;
        int nr_to_scan = sc->nr_to_scan;
        int before, after;
        int freed;

        before = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, before);

        freed = __es_shrink(sbi, nr_to_scan, NULL);

        after = percpu_counter_read_positive(
                                &sbi->s_es_stats.es_stats_shk_cnt);
        trace_ext4_es_shrink_scan_exit(sbi->s_sb, freed, after);

        return freed;
}

/*
 * ext4_seq_es_shrinker_info_show - print extent status shrinker statistics
 *
 * @seq - seq_file to print to; seq->private is the super_block
 * @v - iterator token; output is only produced for SEQ_START_TOKEN
 *
 * Prints the object and reclaimable object counts, cache hit/miss counts,
 * average scan time and shrunk objects, and, if any inode is on the
 * extent status list, the inode holding the most objects together with
 * the maximum scan time.
 *
 * Always returns 0.
 */
int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
        struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
        struct ext4_es_stats *es_stats = &sbi->s_es_stats;
        struct ext4_inode_info *ei, *max = NULL;
        unsigned int inode_cnt = 0;

        if (v != SEQ_START_TOKEN)
                return 0;

        /* here we just find an inode that has the max nr. of objects */
        spin_lock(&sbi->s_es_lock);
        list_for_each_entry(ei, &sbi->s_es_list, i_es_list) {
                inode_cnt++;
                /* the first inode seen wins ties */
                if (!max || max->i_es_all_nr < ei->i_es_all_nr)
                        max = ei;
        }
        spin_unlock(&sbi->s_es_lock);

        seq_printf(seq, "stats:\n  %lld objects\n  %lld reclaimable objects\n",
                   percpu_counter_sum_positive(&es_stats->es_stats_all_cnt),
                   percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt));
        seq_printf(seq, "  %lld/%lld cache hits/misses\n",
                   percpu_counter_sum_positive(&es_stats->es_stats_cache_hits),
                   percpu_counter_sum_positive(&es_stats->es_stats_cache_misses));
        /* inode_cnt is unsigned, so print it with %u rather than %d */
        if (inode_cnt)
                seq_printf(seq, "  %u inodes on list\n", inode_cnt);

        /* scan times are kept in nanoseconds and reported in microseconds */
        seq_printf(seq, "average:\n  %llu us scan time\n",
            div_u64(es_stats->es_stats_scan_time, 1000));
        seq_printf(seq, "  %lu shrunk objects\n", es_stats->es_stats_shrunk);
        /*
         * max is non-NULL whenever inode_cnt is non-zero.
         * NOTE(review): max is dereferenced after s_es_lock is dropped -
         * confirm the inode cannot go away here.
         */
        if (inode_cnt)
                seq_printf(seq,
                    "maximum:\n  %lu inode (%u objects, %u reclaimable)\n"
                    "  %llu us max scan time\n",
                    max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr,
                    div_u64(es_stats->es_stats_max_scan_time, 1000));

        return 0;
}

int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
        int err;

        /* Make sure we have enough bits for physical block number */
        BUILD_BUG_ON(ES_SHIFT < 48);
        INIT_LIST_HEAD(&sbi->s_es_list);
        sbi->s_es_nr_inode = 0;
        spin_lock_init(&sbi->s_es_lock);
        sbi->s_es_stats.es_stats_shrunk = 0;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_hits, 0,
                                  GFP_KERNEL);
        if (err)
                return err;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_cache_misses, 0,
                                  GFP_KERNEL);
        if (err)
                goto err1;
        sbi->s_es_stats.es_stats_scan_time = 0;
        sbi->s_es_stats.es_stats_max_scan_time = 0;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL);
        if (err)
                goto err2;
        err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL);
        if (err)
                goto err3;

        sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id);
        if (!sbi->s_es_shrinker) {
                err = -ENOMEM;
                goto err4;
        }

        sbi->s_es_shrinker->scan_objects = ext4_es_scan;
        sbi->s_es_shrinker->count_objects = ext4_es_count;
        sbi->s_es_shrinker->private_data = sbi;

        shrinker_register(sbi->s_es_shrinker);

        return 0;
err4:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
err3:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
err2:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
err1:
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
        return err;
}

/*
 * ext4_es_unregister_shrinker - tear down what ext4_es_register_shrinker()
 *                               set up
 *
 * @sbi - superblock info whose statistics counters and shrinker are released
 *
 * Destroys the four extent status percpu counters and then frees the
 * shrinker.
 */
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_hits);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
        percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
        shrinker_free(sbi->s_es_shrinker);
}

/*
 * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at
 * most *nr_to_scan extents, update *nr_to_scan accordingly.
 *
 * Extents that must be kept are skipped.  A referenced extent only has its
 * referenced flag cleared; an unreferenced one is removed from the tree
 * and freed.
 *
 * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan.
 * Increment *nr_shrunk by the number of reclaimed extents. Also update
 * ei->i_es_shrink_lblk to where we should continue scanning.
 */
static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
                                 int *nr_to_scan, int *nr_shrunk)
{
        struct inode *inode = &ei->vfs_inode;
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct extent_status *cur;
        struct rb_node *next;

        cur = __es_tree_search(&tree->root, ei->i_es_shrink_lblk);
        if (cur == NULL)
                goto wrapped;

        while (*nr_to_scan > 0) {
                /* walked past the interval: resume right after it */
                if (cur->es_lblk > end) {
                        ei->i_es_shrink_lblk = end + 1;
                        return 0;
                }

                --*nr_to_scan;
                /* fetch the successor first: cur may be freed below */
                next = rb_next(&cur->rb_node);

                if (!ext4_es_must_keep(cur)) {
                        if (ext4_es_is_referenced(cur)) {
                                /* spare it this time, but drop the flag */
                                ext4_es_clear_referenced(cur);
                        } else {
                                rb_erase(&cur->rb_node, &tree->root);
                                ext4_es_free_extent(inode, cur);
                                ++*nr_shrunk;
                        }
                }

                if (next == NULL)
                        goto wrapped;
                cur = rb_entry(next, struct extent_status, rb_node);
        }

        /* budget exhausted: remember where to pick up next time */
        ei->i_es_shrink_lblk = cur->es_lblk;
        return 1;

wrapped:
        /* reached the end of the tree: restart from block 0 next time */
        ei->i_es_shrink_lblk = 0;
        return 0;
}

/*
 * es_reclaim_extents - reclaim extents from one inode
 *
 * @ei - inode info to reclaim from
 * @nr_to_scan - remaining scan budget, decremented as extents are scanned
 *
 * Scans from ei->i_es_shrink_lblk to the end of the tree and, if that pass
 * did not exhaust the budget and did not start at block 0, continues with
 * the blocks before the original starting point.  Emits a rate-limited
 * warning when the inode is flagged EXT4_STATE_EXT_PRECACHED, and clears
 * the tree's cached extent afterwards.
 *
 * Returns the number of extents reclaimed; 0 right away if the inode has
 * no shrinkable extents.
 */
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan)
{
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);
        struct inode *inode = &ei->vfs_inode;
        ext4_lblk_t resume = ei->i_es_shrink_lblk;
        int reclaimed = 0;
        int exhausted;

        if (ei->i_es_shk_nr == 0)
                return 0;

        if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) &&
            __ratelimit(&_rs))
                ext4_warning(inode->i_sb, "forced shrink of precached extents");

        /* first pass: from the resume point to the end of the tree */
        exhausted = es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan,
                                          &reclaimed);
        /* second pass: the part of the tree before the resume point */
        if (!exhausted && resume != 0)
                es_do_reclaim_extents(ei, resume - 1, nr_to_scan, &reclaimed);

        ei->i_es_tree.cache_es = NULL;
        return reclaimed;
}

/*
 * Called to support EXT4_IOC_CLEAR_ES_CACHE.  We can only remove
 * discretionary entries from the extent status cache.  (Some entries
 * must be present for proper operations.)
 *
 * Every extent that does not have to be kept is erased and freed under
 * i_es_lock, the cached extent pointer is reset and the inode's
 * EXT4_STATE_EXT_PRECACHED state is cleared.
 */
void ext4_clear_inode_es(struct inode *inode)
{
        struct ext4_inode_info *ei = EXT4_I(inode);
        struct ext4_es_tree *tree = &ei->i_es_tree;
        struct rb_node *node, *next;
        struct extent_status *es;

        write_lock(&ei->i_es_lock);
        tree->cache_es = NULL;
        for (node = rb_first(&tree->root); node; node = next) {
                /* save the successor before the entry may be erased */
                next = rb_next(node);
                es = rb_entry(node, struct extent_status, rb_node);
                if (ext4_es_must_keep(es))
                        continue;
                rb_erase(&es->rb_node, &tree->root);
                ext4_es_free_extent(inode, es);
        }
        ext4_clear_inode_state(inode, EXT4_STATE_EXT_PRECACHED);
        write_unlock(&ei->i_es_lock);
}

#ifdef ES_DEBUG__
/*
 * ext4_print_pending_tree - print the logical cluster number of every entry
 *                           in an inode's pending reservation tree
 *
 * @inode - file whose pending reservations are printed
 *
 * Debugging aid, only built when ES_DEBUG__ is defined; otherwise the name
 * expands to nothing.
 */
static void ext4_print_pending_tree(struct inode *inode)
{
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        struct rb_node *cur;

        printk(KERN_DEBUG "pending reservations for inode %lu:", inode->i_ino);
        for (cur = rb_first(&tree->root); cur; cur = rb_next(cur)) {
                struct pending_reservation *pr;

                pr = rb_entry(cur, struct pending_reservation, rb_node);
                printk(KERN_DEBUG " %u", pr->lclu);
        }
        printk(KERN_DEBUG "\n");
}
#else
#define ext4_print_pending_tree(inode)
#endif

/*
 * ext4_init_pending - create the slab cache for pending reservations
 *
 * Returns 0 on success, or -ENOMEM if the cache could not be created.
 */
int __init ext4_init_pending(void)
{
        ext4_pending_cachep = KMEM_CACHE(pending_reservation, SLAB_RECLAIM_ACCOUNT);

        return ext4_pending_cachep ? 0 : -ENOMEM;
}

/*
 * ext4_exit_pending - destroy the slab cache used for pending reservations
 *
 * Counterpart of ext4_init_pending(), which creates ext4_pending_cachep.
 */
void ext4_exit_pending(void)
{
        kmem_cache_destroy(ext4_pending_cachep);
}

/*
 * ext4_init_pending_tree - initialize a pending reservation tree
 *
 * @tree - pending reservation tree to initialize
 *
 * Sets the tree's red-black root to RB_ROOT.
 */
void ext4_init_pending_tree(struct ext4_pending_tree *tree)
{
        tree->root = RB_ROOT;
}

/*
 * __get_pending - look up the pending reservation for a logical cluster
 *
 * @inode - file containing the pending cluster reservation
 * @lclu - logical cluster of interest
 *
 * Descends the inode's pending reservation tree, comparing @lclu against
 * each node's lclu.  Returns the matching pending reservation, or NULL if
 * the cluster is not a member of the set.  Must be called holding
 * i_es_lock.
 */
static struct pending_reservation *__get_pending(struct inode *inode,
                                                 ext4_lblk_t lclu)
{
        struct rb_node *cur = EXT4_I(inode)->i_pending_tree.root.rb_node;

        while (cur) {
                struct pending_reservation *pr =
                        rb_entry(cur, struct pending_reservation, rb_node);

                if (lclu == pr->lclu)
                        return pr;
                /* smaller keys are on the left, larger ones on the right */
                cur = (lclu < pr->lclu) ? cur->rb_left : cur->rb_right;
        }
        return NULL;
}

/*
 * __insert_pending - adds a pending cluster reservation to the set of
 *                    pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster to be added
 * @prealloc - preallocated pending entry; when *prealloc is non-NULL it is
 *             used for the new node and *prealloc is reset to NULL,
 *             otherwise an entry is obtained from __alloc_pending(false)
 *
 * Returns 1 if a new pending reservation was inserted, 0 if the cluster
 * already had one (nothing is changed and *prealloc is left untouched),
 * and -ENOMEM if an entry could not be allocated.
 */
static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
                            struct pending_reservation **prealloc)
{
        struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
        ext4_lblk_t lclu = EXT4_B2C(EXT4_SB(inode->i_sb), lblk);
        struct rb_node **link = &tree->root.rb_node;
        struct rb_node *parent = NULL;
        struct pending_reservation *pr;

        /* descend to the empty slot where the new node belongs */
        while (*link) {
                parent = *link;
                pr = rb_entry(parent, struct pending_reservation, rb_node);

                /* pending reservation already inserted */
                if (lclu == pr->lclu)
                        return 0;

                link = (lclu < pr->lclu) ? &parent->rb_left : &parent->rb_right;
        }

        if (likely(!*prealloc)) {
                pr = __alloc_pending(false);
                if (!pr)
                        return -ENOMEM;
        } else {
                /* take ownership of the caller's preallocated entry */
                pr = *prealloc;
                *prealloc = NULL;
        }
        pr->lclu = lclu;

        rb_link_node(&pr->rb_node, parent, link);
        rb_insert_color(&pr->rb_node, &tree->root);

        return 1;
}

/*
 * __remove_pending - removes a pending cluster reservation from the set
 *                    of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Looks up the reservation for the cluster containing @lblk, unlinks it
 * from the inode's pending tree and frees it.  Does nothing if the cluster
 * has no pending reservation.
 */
static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
        ext4_lblk_t lclu = EXT4_B2C(EXT4_SB(inode->i_sb), lblk);
        struct pending_reservation *pr = __get_pending(inode, lclu);

        if (!pr)
                return;

        rb_erase(&pr->rb_node, &EXT4_I(inode)->i_pending_tree.root);
        __free_pending(pr);
}

/*
 * ext4_remove_pending - removes a pending cluster reservation from the set
 *                       of pending reservations
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the pending cluster reservation to be removed
 *
 * Wrapper around __remove_pending() for external callers: takes i_es_lock
 * for writing around the removal.
 */
void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk)
{
        write_lock(&EXT4_I(inode)->i_es_lock);
        __remove_pending(inode, lblk);
        write_unlock(&EXT4_I(inode)->i_es_lock);
}

/*
 * ext4_is_pending - determine whether a cluster has a pending reservation
 *                   on it
 *
 * @inode - file containing the cluster
 * @lblk - logical block in the cluster
 *
 * Performs the lookup with i_es_lock held for reading.  Returns true if
 * the cluster containing @lblk is in the set of pending reservations, and
 * false if not.
 */
bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
{
        ext4_lblk_t lclu = EXT4_B2C(EXT4_SB(inode->i_sb), lblk);
        bool found;

        read_lock(&EXT4_I(inode)->i_es_lock);
        found = __get_pending(inode, lclu) != NULL;
        read_unlock(&EXT4_I(inode)->i_es_lock);

        return found;
}

/*
 * ext4_es_insert_delayed_extent - adds some delayed blocks to the extents
 *                                 status tree, adding a pending reservation
 *                                 where needed
 *
 * @inode - file containing the newly added block
 * @lblk - start logical block to be added
 * @len - length of blocks to be added
 * @lclu_allocated/end_allocated - indicates whether a physical cluster has
 *                                 been allocated for the logical cluster
 *                                 that contains the start/end block. Note that
 *                                 end_allocated should always be set to false
 *                                 if the start and the end block are in the
 *                                 same cluster
 *
 * The work is done in three steps under i_es_lock: remove whatever currently
 * covers [lblk, end], insert the new delayed extent, then insert the pending
 * reservations requested by lclu_allocated/end_allocated.  If any step fails,
 * the lock is dropped, the objects the steps may need are preallocated, and
 * the whole sequence is run again until it succeeds.
 */
void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
                                   ext4_lblk_t len, bool lclu_allocated,
                                   bool end_allocated)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        /* results of the remove, insert and pending-insert steps */
        int err1 = 0, err2 = 0, err3 = 0;
        /* preallocated objects, only populated on a retry */
        struct extent_status *es1 = NULL;
        struct extent_status *es2 = NULL;
        struct pending_reservation *pr1 = NULL;
        struct pending_reservation *pr2 = NULL;

        /* Do nothing while the filesystem is flagged EXT4_FC_REPLAY. */
        if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
                return;

        es_debug("add [%u/%u) delayed to extent status tree of inode %lu\n",
                 lblk, len, inode->i_ino);
        /* Empty range: nothing to insert ("end" is not meaningful here). */
        if (!len)
                return;

        /* end_allocated must be false when start and end share a cluster. */
        WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) &&
                     end_allocated);

        /* Describe the new extent: delayed status, ~0 as physical block. */
        newes.es_lblk = lblk;
        newes.es_len = len;
        ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED);
        trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated,
                                            end_allocated);

        ext4_es_insert_extent_check(inode, &newes);

retry:
        /*
         * Only reached with a nonzero error on a retry.  Preallocate, before
         * taking i_es_lock, what the failed step (and the steps after it)
         * may need.
         * NOTE(review): the "true" argument presumably requests a no-fail
         * allocation - confirm against __es_alloc_extent()/__alloc_pending().
         */
        if (err1 && !es1)
                es1 = __es_alloc_extent(true);
        if ((err1 || err2) && !es2)
                es2 = __es_alloc_extent(true);
        if (err1 || err2 || err3 < 0) {
                if (lclu_allocated && !pr1)
                        pr1 = __alloc_pending(true);
                if (end_allocated && !pr2)
                        pr2 = __alloc_pending(true);
        }
        write_lock(&EXT4_I(inode)->i_es_lock);

        /* Step 1: remove anything currently covering [lblk, end]. */
        err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
        if (err1 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es1) {
                if (!es1->es_len)
                        __es_free_extent(es1);
                es1 = NULL;
        }

        /* Step 2: insert the new delayed extent. */
        err2 = __es_insert_extent(inode, &newes, es2);
        if (err2 != 0)
                goto error;
        /* Free preallocated extent if it didn't get used. */
        if (es2) {
                if (!es2->es_len)
                        __es_free_extent(es2);
                es2 = NULL;
        }

        /*
         * Step 3: add pending reservations for the first and/or last
         * cluster.  __insert_pending() resets the prealloc pointer when it
         * consumes the entry, so a pointer that is still set afterwards is
         * an unused preallocation and is freed.
         */
        if (lclu_allocated) {
                err3 = __insert_pending(inode, lblk, &pr1);
                if (err3 < 0)
                        goto error;
                if (pr1) {
                        __free_pending(pr1);
                        pr1 = NULL;
                }
        }
        if (end_allocated) {
                err3 = __insert_pending(inode, end, &pr2);
                if (err3 < 0)
                        goto error;
                if (pr2) {
                        __free_pending(pr2);
                        pr2 = NULL;
                }
        }
error:
        /* Reached on success too; the lock is dropped before any retry. */
        write_unlock(&EXT4_I(inode)->i_es_lock);
        if (err1 || err2 || err3 < 0)
                goto retry;

        ext4_es_print_tree(inode);
        ext4_print_pending_tree(inode);
        return;
}

/*
 * __revise_pending - makes, cancels, or leaves unchanged pending cluster
 *                    reservations for a specified block range depending
 *                    upon the presence or absence of delayed blocks
 *                    outside the range within clusters at the ends of the
 *                    range
 *
 * @inode - file containing the range
 * @lblk - logical block defining the start of range
 * @len  - length of range in blocks
 * @prealloc - preallocated pending entry
 *
 * Used after a newly allocated extent is added to the extents status tree.
 * Requires that the extents in the range have either written or unwritten
 * status.  Must be called while holding i_es_lock.
 *
 * Returns the number of pending reservations newly inserted (0 when none
 * were inserted, e.g. when reservations were only removed), or -ENOMEM if
 * an insertion failed.
 */
static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
                            ext4_lblk_t len,
                            struct pending_reservation **prealloc)
{
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        ext4_lblk_t end = lblk + len - 1;
        ext4_lblk_t first, last;
        bool same_cluster;
        bool head_delayed = false, tail_delayed = false;
        int nr_added = 0;
        int rc;

        if (len == 0)
                return 0;

        /*
         * Two cases - block range within single cluster and block range
         * spanning two or more clusters.  Note that a cluster belonging
         * to a range starting and/or ending on a cluster boundary is treated
         * as if it does not contain a delayed extent.  The new range may
         * have allocated space for previously delayed blocks out to the
         * cluster boundary, requiring that any pre-existing pending
         * reservation be canceled.  Because this code only looks at blocks
         * outside the range, it should revise pending reservations
         * correctly even if the extent represented by the range can't be
         * inserted in the extents status tree due to ENOSPC.
         */
        same_cluster = EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end);

        /* first block of the starting cluster, last block of the ending one */
        first = EXT4_LBLK_CMASK(sbi, lblk);
        last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;

        /* Any delayed block in the starting cluster ahead of the range? */
        if (first != lblk)
                head_delayed = __es_scan_range(inode, &ext4_es_is_delayed,
                                               first, lblk - 1);

        if (head_delayed) {
                rc = __insert_pending(inode, first, prealloc);
                if (rc < 0)
                        return rc;
                nr_added += rc;
                /* single cluster: its reservation is settled, we are done */
                if (same_cluster)
                        return nr_added;
        } else if (!same_cluster) {
                __remove_pending(inode, first);
        }

        /* Any delayed block in the ending cluster past the range? */
        if (last != end)
                tail_delayed = __es_scan_range(inode, &ext4_es_is_delayed,
                                               end + 1, last);

        if (tail_delayed) {
                rc = __insert_pending(inode, last, prealloc);
                if (rc < 0)
                        return rc;
                nr_added += rc;
        } else {
                __remove_pending(inode, last);
        }

        return nr_added;
}
































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 








    3 










    3 




    3 





    3 

    3 














    3 























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                IPv4 Forwarding Information Base: semantics.
 *
 * Authors:        Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/netlink.h>
#include <linux/hash.h>
#include <linux/nospec.h>

#include <net/arp.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/nexthop.h>
#include <net/netlink.h>
#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>
#include <net/addrconf.h>

#include "fib_lookup.h"

/* for_nexthops and change_nexthops are only used when a nexthop object
 * is not set in a fib_info. The logic within can reference fib_nh.
 *
 * Each macro opens a brace scope, declares the index nhsel and a cursor
 * (nh, const, for for_nexthops; nexthop_nh, writable, for change_nexthops)
 * and starts a for loop over the fib_nh array of fi. The caller supplies
 * the loop body and must close the scope with endfor_nexthops().
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

#define for_nexthops(fi) {                                                \
        int nhsel; const struct fib_nh *nh;                                \
        for (nhsel = 0, nh = (fi)->fib_nh;                                \
             nhsel < fib_info_num_path((fi));                                \
             nh++, nhsel++)

#define change_nexthops(fi) {                                                \
        int nhsel; struct fib_nh *nexthop_nh;                                \
        for (nhsel = 0,        nexthop_nh = (struct fib_nh *)((fi)->fib_nh);        \
             nhsel < fib_info_num_path((fi));                                \
             nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Without multipath the loop body runs exactly once, on the first fib_nh.
 * Hopefully the compiler optimizes the dummy loop away.
 */

#define for_nexthops(fi) {                                                \
        int nhsel; const struct fib_nh *nh = (fi)->fib_nh;                \
        for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {                                                \
        int nhsel;                                                        \
        struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);        \
        for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the brace scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }


const struct fib_prop fib_props[RTN_MAX + 1] = {
        [RTN_UNSPEC] = {
                .error        = 0,
                .scope        = RT_SCOPE_NOWHERE,
        },
        [RTN_UNICAST] = {
                .error        = 0,
                .scope        = RT_SCOPE_UNIVERSE,
        },
        [RTN_LOCAL] = {
                .error        = 0,
                .scope        = RT_SCOPE_HOST,
        },
        [RTN_BROADCAST] = {
                .error        = 0,
                .scope        = RT_SCOPE_LINK,
        },
        [RTN_ANYCAST] = {
                .error        = 0,
                .scope        = RT_SCOPE_LINK,
        },
        [RTN_MULTICAST] = {
                .error        = 0,
                .scope        = RT_SCOPE_UNIVERSE,
        },
        [RTN_BLACKHOLE] = {
                .error        = -EINVAL,
                .scope        = RT_SCOPE_UNIVERSE,
        },
        [RTN_UNREACHABLE] = {
                .error        = -EHOSTUNREACH,
                .scope        = RT_SCOPE_UNIVERSE,
        },
        [RTN_PROHIBIT] = {
                .error        = -EACCES,
                .scope        = RT_SCOPE_UNIVERSE,
        },
        [RTN_THROW] = {
                .error        = -EAGAIN,
                .scope        = RT_SCOPE_UNIVERSE,
        },
        [RTN_NAT] = {
                .error        = -EINVAL,
                .scope        = RT_SCOPE_NOWHERE,
        },
        [RTN_XRESOLVE] = {
                .error        = -EINVAL,
                .scope        = RT_SCOPE_NOWHERE,
        },
};

/* Drop the cached route stored in *rtp, if any: detach it from its device
 * and release the dst reference immediately.
 */
static void rt_fibinfo_free(struct rtable __rcu **rtp)
{
        struct rtable *cached = rcu_dereference_protected(*rtp, 1);

        if (cached) {
                /* Clearing the slot with RCU_INIT_POINTER(*rtp, NULL) is not
                 * even needed: an RCU grace period has already elapsed
                 * before free_fib_info_rcu() gets called.
                 */
                dst_dev_put(&cached->dst);
                dst_release_immediate(&cached->dst);
        }
}

/* Tear down the nexthop exception hash of @nhc: for every exception on every
 * chain, release its cached input and output routes and free the entry,
 * then free the bucket array itself. Does nothing when no hash exists.
 */
static void free_nh_exceptions(struct fib_nh_common *nhc)
{
        struct fnhe_hash_bucket *buckets;
        int slot;

        buckets = rcu_dereference_protected(nhc->nhc_exceptions, 1);
        if (!buckets)
                return;

        for (slot = 0; slot < FNHE_HASH_SIZE; slot++) {
                struct fib_nh_exception *cur, *following;

                for (cur = rcu_dereference_protected(buckets[slot].chain, 1);
                     cur; cur = following) {
                        /* Fetch the successor before the entry is freed. */
                        following = rcu_dereference_protected(cur->fnhe_next,
                                                              1);

                        rt_fibinfo_free(&cur->fnhe_rth_input);
                        rt_fibinfo_free(&cur->fnhe_rth_output);

                        kfree(cur);
                }
        }

        kfree(buckets);
}

/* Release the per-cpu cached routes behind @rtp (one slot per possible CPU)
 * and then free the per-cpu area. A NULL @rtp is accepted and ignored.
 */
static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
{
        int cpu;

        if (!rtp)
                return;

        for_each_possible_cpu(cpu) {
                struct rtable *cached;

                cached = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
                if (!cached)
                        continue;

                dst_dev_put(&cached->dst);
                dst_release_immediate(&cached->dst);
        }

        free_percpu(rtp);
}

/* Release everything a fib_nh_common holds: the device reference, the
 * lwtunnel state reference, the per-cpu cached output routes, the cached
 * input route and the nexthop exception hash.
 */
void fib_nh_common_release(struct fib_nh_common *nhc)
{
        netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
        lwtstate_put(nhc->nhc_lwtstate);
        rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
        rt_fibinfo_free(&nhc->nhc_rth_input);
        free_nh_exceptions(nhc);
}
EXPORT_SYMBOL_GPL(fib_nh_common_release);

/* Release an IPv4 nexthop. With CONFIG_IP_ROUTE_CLASSID, a nexthop carrying
 * a non-zero nh_tclassid also drops the per-netns tclassid user count
 * (the counterpart of the increment done in fib_nh_init()). The common part
 * is then released through fib_nh_common_release().
 */
void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
{
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (fib_nh->nh_tclassid)
                atomic_dec(&net->ipv4.fib_num_tclassid_users);
#endif
        fib_nh_common_release(&fib_nh->nh_common);
}

/* Release a nexthop info record.
 *
 * RCU callback (queued by free_fib_info()): @head is the rcu member embedded
 * in the fib_info being destroyed.
 */
static void free_fib_info_rcu(struct rcu_head *head)
{
        struct fib_info *fi = container_of(head, struct fib_info, rcu);

        if (fi->nh) {
                /* Nexthop object in use: only drop our reference on it. */
                nexthop_put(fi->nh);
        } else {
                /* Nexthops are embedded in fi: release each of them. */
                change_nexthops(fi) {
                        fib_nh_release(fi->fib_net, nexthop_nh);
                } endfor_nexthops(fi);
        }

        ip_fib_metrics_put(fi->fib_metrics);

        kfree(fi);
}

/* Queue @fi for destruction through free_fib_info_rcu().
 *
 * Only entries already marked dead are freed; an attempt to free a live
 * entry is reported with a warning and otherwise ignored.
 */
void free_fib_info(struct fib_info *fi)
{
        if (fi->fib_dead != 0) {
                call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
                return;
        }

        pr_warn("Freeing alive fib_info %p\n", fi);
}
EXPORT_SYMBOL_GPL(free_fib_info);

/* Drop one tree reference on @fi (a NULL @fi is tolerated).
 *
 * When the last tree reference goes away, @fi is unlinked from the fib_info
 * hash, from the prefsrc hash (if it has a preferred source), and either
 * from the nexthop object's list or, for embedded nexthops, from the
 * per-device nexthop hash. It is then marked dead and a reference is
 * dropped through fib_info_put(). Must be called with RTNL held.
 */
void fib_release_info(struct fib_info *fi)
{
        ASSERT_RTNL();
        if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
                hlist_del(&fi->fib_hash);
                fi->fib_net->ipv4.fib_info_cnt--;

                if (fi->fib_prefsrc)
                        hlist_del(&fi->fib_lhash);
                if (fi->nh) {
                        list_del(&fi->nh_list);
                } else {
                        change_nexthops(fi) {
                                /* Nexthops without a device are skipped */
                                if (!nexthop_nh->fib_nh_dev)
                                        continue;
                                hlist_del_rcu(&nexthop_nh->nh_hash);
                        } endfor_nexthops(fi)
                }
                /* Paired with READ_ONCE() from fib_table_lookup() */
                WRITE_ONCE(fi->fib_dead, 1);
                fib_info_put(fi);
        }
}

/* Compare the nexthops of @fi and @ofi.
 *
 * Returns 0 when they are considered equal and -1 otherwise. If either side
 * uses a nexthop object the decision is delegated to nexthop_cmp(). If @ofi
 * has no nexthops the result is 0. Otherwise nexthops are compared pairwise
 * by index.
 */
static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
        const struct fib_nh *onh;

        if (fi->nh || ofi->nh)
                return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;

        if (ofi->fib_nhs == 0)
                return 0;

        for_nexthops(fi) {
                onh = fib_info_nh(ofi, nhsel);

                /* Flag bits inside RTNH_COMPARE_MASK are ignored here */
                if (nh->fib_nh_oif != onh->fib_nh_oif ||
                    nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
                    nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                    nh->fib_nh_weight != onh->fib_nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
                    nh->nh_tclassid != onh->nh_tclassid ||
#endif
                    lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
                    ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
                        return -1;

                /* Gateway families already match; compare the address */
                if (nh->fib_nh_gw_family == AF_INET &&
                    nh->fib_nh_gw4 != onh->fib_nh_gw4)
                        return -1;

                if (nh->fib_nh_gw_family == AF_INET6 &&
                    ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
                        return -1;
        } endfor_nexthops(fi);
        return 0;
}

/* Return the head of the list of nexthops attached to @dev. */
static struct hlist_head *fib_nh_head(struct net_device *dev)
{
        return &dev->fib_nh_head;
}

/* First stage of the fib_info hash: XOR-fold the protocol (shifted left by
 * eight bits), scope, preferred source and priority into @init_val.
 * The result still has to go through fib_info_hashfn_result().
 */
static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
                                      u32 prefsrc, u32 priority)
{
        unsigned int proto_scope = ((unsigned int)protocol << 8) | scope;

        return (unsigned int)init_val ^ proto_scope ^ prefsrc ^ priority;
}

/* Final stage of the fib_info hash: mix in the per-netns value and reduce
 * the result to fib_info_hash_bits bits, yielding a bucket index.
 */
static unsigned int fib_info_hashfn_result(const struct net *net,
                                           unsigned int val)
{
        unsigned int mixed = val ^ net_hash_mix(net);

        return hash_32(mixed, net->ipv4.fib_info_hash_bits);
}

/* Return the fib_info hash bucket @fi belongs to.
 *
 * The hash is seeded with the nexthop count, protocol, scope, preferred
 * source and priority, then mixed with either the nexthop object id or the
 * output interface index of each embedded nexthop.
 */
static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
{
        struct net *net = fi->fib_net;
        unsigned int val;

        val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
                                fi->fib_scope, (__force u32)fi->fib_prefsrc,
                                fi->fib_priority);

        if (fi->nh) {
                val ^= fi->nh->id;
        } else {
                for_nexthops(fi) {
                        val ^= nh->fib_nh_oif;
                } endfor_nexthops(fi)
        }

        return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
}

/* Return the bucket used to index fib_info entries by preferred source
 * address @val. These buckets start (1 << hash_bits) entries into
 * fib_info_hash[].
 */
static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
                                                    __be32 val)
{
        unsigned int bits = net->ipv4.fib_info_hash_bits;
        u32 mixed = net_hash_mix(net) ^ (__force u32)val;
        u32 idx = hash_32(mixed, bits);

        return &net->ipv4.fib_info_hash[(1 << bits) + idx];
}

static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
{
        /* The second half is used for prefsrc */
        return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head *),
                        GFP_KERNEL);
}

/* Free a table obtained from fib_info_hash_alloc(). */
static void fib_info_hash_free(struct hlist_head *head)
{
        kvfree(head);
}

/* Double the per-netns fib_info hash table once the number of fib_info
 * entries has reached the current bucket count.
 *
 * Growing is best effort: if the new table cannot be allocated the old one
 * simply stays in use. On success every entry of both halves (fib_info hash
 * and prefsrc hash) is re-inserted into the new table and the old table is
 * freed.
 */
static void fib_info_hash_grow(struct net *net)
{
        unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
        struct hlist_head *new_info_hash, *old_info_hash;
        unsigned int i;

        if (net->ipv4.fib_info_cnt < old_size)
                return;

        new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
        if (!new_info_hash)
                return;

        /* Switch to the new table first: the bucket helpers used below read
         * net->ipv4.fib_info_hash and fib_info_hash_bits.
         */
        old_info_hash = net->ipv4.fib_info_hash;
        net->ipv4.fib_info_hash = new_info_hash;
        net->ipv4.fib_info_hash_bits += 1;

        /* First half: entries hashed through fib_info_hash_bucket() */
        for (i = 0; i < old_size; i++) {
                struct hlist_head *head = &old_info_hash[i];
                struct hlist_node *n;
                struct fib_info *fi;

                hlist_for_each_entry_safe(fi, n, head, fib_hash)
                        hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
        }

        /* Second half: entries hashed by preferred source address */
        for (i = 0; i < old_size; i++) {
                struct hlist_head *lhead = &old_info_hash[old_size + i];
                struct hlist_node *n;
                struct fib_info *fi;

                hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
                        hlist_add_head(&fi->fib_lhash,
                                       fib_info_laddrhash_bucket(fi->fib_net,
                                                                 fi->fib_prefsrc));
        }

        fib_info_hash_free(old_info_hash);
}

/* Look up an existing fib_info that uses the nexthop object cfg->fc_nh_id
 * and matches @cfg on protocol, scope, prefsrc, priority, type, table and
 * flags (flag bits inside RTNH_COMPARE_MASK are ignored).
 * No metrics are compared, only the nexthop id.
 *
 * Returns the matching entry, or NULL when there is none.
 */
static struct fib_info *fib_find_info_nh(struct net *net,
                                         const struct fib_config *cfg)
{
        struct fib_info *cand;
        unsigned int slot;

        slot = fib_info_hashfn_1(cfg->fc_nh_id,
                                 cfg->fc_protocol, cfg->fc_scope,
                                 (__force u32)cfg->fc_prefsrc,
                                 cfg->fc_priority);
        slot = fib_info_hashfn_result(net, slot);

        hlist_for_each_entry(cand, &net->ipv4.fib_info_hash[slot], fib_hash) {
                if (!cand->nh || cand->nh->id != cfg->fc_nh_id)
                        continue;

                if (cfg->fc_protocol != cand->fib_protocol ||
                    cfg->fc_scope != cand->fib_scope ||
                    cfg->fc_prefsrc != cand->fib_prefsrc ||
                    cfg->fc_priority != cand->fib_priority ||
                    cfg->fc_type != cand->fib_type ||
                    cfg->fc_table != cand->fib_tb_id ||
                    ((cfg->fc_flags ^ cand->fib_flags) & ~RTNH_COMPARE_MASK))
                        continue;

                return cand;
        }

        return NULL;
}

/* Look for an already hashed fib_info equivalent to @nfi.
 *
 * Candidates come from @nfi's hash bucket and must agree on nexthop count,
 * protocol, scope, prefsrc, priority, type, table id, the first RTAX_MAX
 * metric values, flags (bits inside RTNH_COMPARE_MASK are ignored) and
 * nexthops (see nh_comp()).
 *
 * Returns the existing entry, or NULL when there is none.
 */
static struct fib_info *fib_find_info(struct fib_info *nfi)
{
        struct hlist_head *head = fib_info_hash_bucket(nfi);
        struct fib_info *fi;

        hlist_for_each_entry(fi, head, fib_hash) {
                if (fi->fib_nhs != nfi->fib_nhs)
                        continue;

                if (nfi->fib_protocol == fi->fib_protocol &&
                    nfi->fib_scope == fi->fib_scope &&
                    nfi->fib_prefsrc == fi->fib_prefsrc &&
                    nfi->fib_priority == fi->fib_priority &&
                    nfi->fib_type == fi->fib_type &&
                    nfi->fib_tb_id == fi->fib_tb_id &&
                    memcmp(nfi->fib_metrics, fi->fib_metrics,
                           sizeof(u32) * RTAX_MAX) == 0 &&
                    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
                    nh_comp(fi, nfi) == 0)
                        return fi;
        }

        return NULL;
}

/* Check that the gateway is already configured.
 * Used only by the redirect accept routine, under rcu_read_lock().
 *
 * Returns 0 when a nexthop on @dev that is not marked dead uses @gw as its
 * IPv4 gateway, and -1 otherwise.
 */
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
        struct hlist_head *dev_nexthops = fib_nh_head(dev);
        struct fib_nh *cand;

        hlist_for_each_entry_rcu(cand, dev_nexthops, nh_hash) {
                DEBUG_NET_WARN_ON_ONCE(cand->fib_nh_dev != dev);

                if (cand->fib_nh_gw4 != gw)
                        continue;
                if (cand->fib_nh_flags & RTNH_F_DEAD)
                        continue;

                return 0;
        }

        return -1;
}

/* Compute the netlink message size to reserve for a route that uses @fi:
 * the rtmsg header, the fixed route attributes, the nested metrics and,
 * when there are nexthops, one rtnexthop per path plus its encap
 * attributes.
 */
size_t fib_nlmsg_size(struct fib_info *fi)
{
        size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
                         + nla_total_size(4) /* RTA_TABLE */
                         + nla_total_size(4) /* RTA_DST */
                         + nla_total_size(4) /* RTA_PRIORITY */
                         + nla_total_size(4) /* RTA_PREFSRC */
                         + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
        unsigned int nhs = fib_info_num_path(fi);

        /* space for nested metrics */
        payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

        if (fi->nh)
                payload += nla_total_size(4); /* RTA_NH_ID */

        if (nhs) {
                size_t nh_encapsize = 0;
                /* Also handles the special case nhs == 1 */

                /* each nexthop is packed in an attribute */
                size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
                unsigned int i;

                /* may contain flow and gateway attribute */
                nhsize += 2 * nla_total_size(4);

                /* grab encap info; nhs already holds the path count */
                for (i = 0; i < nhs; i++) {
                        struct fib_nh_common *nhc = fib_info_nhc(fi, i);

                        if (nhc->nhc_lwtstate) {
                                /* RTA_ENCAP */
                                nh_encapsize += lwtunnel_get_encap_size(
                                                nhc->nhc_lwtstate);
                                /* RTA_ENCAP_TYPE (u16) */
                                nh_encapsize +=  nla_total_size(2);
                        }
                }

                /* all nexthops are packed in a nested attribute */
                payload += nla_total_size((nhs * nhsize) + nh_encapsize);

        }

        return payload;
}

/* Build a route notification for @fa and send it to the RTNLGRP_IPV4_ROUTE
 * group.
 *
 * @event and @nlm_flags are handed to fib_dump_info(); @key, @dst_len and
 * @tb_id describe the route being reported; @info supplies the netns, the
 * requester's portid and, if present, the originating netlink header.
 *
 * When the message cannot be allocated or filled, the error is recorded on
 * the group's listeners through rtnl_set_sk_err() instead.
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
               int dst_len, u32 tb_id, const struct nl_info *info,
               unsigned int nlm_flags)
{
        struct fib_rt_info fri;
        struct sk_buff *skb;
        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
        int err = -ENOBUFS;

        skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
        if (!skb)
                goto errout;

        fri.fi = fa->fa_info;
        fri.tb_id = tb_id;
        fri.dst = key;
        fri.dst_len = dst_len;
        fri.dscp = fa->fa_dscp;
        fri.type = fa->fa_type;
        fri.offload = READ_ONCE(fa->offload);
        fri.trap = READ_ONCE(fa->trap);
        fri.offload_failed = READ_ONCE(fa->offload_failed);
        err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
                    info->nlh, GFP_KERNEL);
        return;
errout:
        rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}

static int fib_detect_death(struct fib_info *fi, int order,
                            struct fib_info **last_resort, int *last_idx,
                            int dflt)
{
        const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
        struct neighbour *n;
        int state = NUD_NONE;

        if (likely(nhc->nhc_gw_family == AF_INET))
                n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
        else if (nhc->nhc_gw_family == AF_INET6)
                n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
                                 nhc->nhc_dev);
        else
                n = NULL;

        if (n) {
                state = READ_ONCE(n->nud_state);
                neigh_release(n);
        } else {
                return 0;
        }
        if (state == NUD_REACHABLE)
                return 0;
        if ((state & NUD_VALID) && order != dflt)
                return 0;
        if ((state & NUD_VALID) ||
            (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
                *last_resort = fi;
                *last_idx = order;
        }
        return 1;
}

/* Initialise the common part of a nexthop.
 *
 * Allocates the per-cpu output route slots with @gfp_flags and, when an
 * @encap attribute is supplied, builds the lightweight tunnel state of type
 * @encap_type (@cfg is handed to lwtunnel_build_state() unchanged) and
 * stores a reference to it in @nhc.
 *
 * Returns 0 on success, -ENOMEM when the per-cpu allocation fails, -EINVAL
 * when @encap is given with LWTUNNEL_ENCAP_NONE, or the error returned by
 * lwtunnel_build_state(). On an encap failure the per-cpu area is freed
 * again and the pointer is reset to NULL.
 */
int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
                       struct nlattr *encap, u16 encap_type,
                       void *cfg, gfp_t gfp_flags,
                       struct netlink_ext_ack *extack)
{
        int err;

        nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
                                                    gfp_flags);
        if (!nhc->nhc_pcpu_rth_output)
                return -ENOMEM;

        if (encap) {
                struct lwtunnel_state *lwtstate;

                if (encap_type == LWTUNNEL_ENCAP_NONE) {
                        NL_SET_ERR_MSG(extack, "LWT encap type not specified");
                        err = -EINVAL;
                        goto lwt_failure;
                }
                err = lwtunnel_build_state(net, encap_type, encap,
                                           nhc->nhc_family, cfg, &lwtstate,
                                           extack);
                if (err)
                        goto lwt_failure;

                nhc->nhc_lwtstate = lwtstate_get(lwtstate);
        }

        return 0;

lwt_failure:
        /* Undo the per-cpu allocation made above */
        rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
        nhc->nhc_pcpu_rth_output = NULL;
        return err;
}
EXPORT_SYMBOL_GPL(fib_nh_common_init);

/* Initialise the IPv4 nexthop @nh from @cfg.
 *
 * Sets up the common part (including any encap taken from @cfg), then copies
 * the output interface, gateway family and address, and flags. With
 * CONFIG_IP_ROUTE_CLASSID the class id is taken from cfg->fc_flow and a
 * non-zero value bumps the per-netns tclassid user count. With
 * CONFIG_IP_ROUTE_MULTIPATH the weight is set to @nh_weight.
 *
 * Returns 0 on success or the error from fib_nh_common_init().
 */
int fib_nh_init(struct net *net, struct fib_nh *nh,
                struct fib_config *cfg, int nh_weight,
                struct netlink_ext_ack *extack)
{
        int err;

        nh->fib_nh_family = AF_INET;

        err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
                                 cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
        if (err)
                return err;

        nh->fib_nh_oif = cfg->fc_oif;
        nh->fib_nh_gw_family = cfg->fc_gw_family;
        if (cfg->fc_gw_family == AF_INET)
                nh->fib_nh_gw4 = cfg->fc_gw4;
        else if (cfg->fc_gw_family == AF_INET6)
                nh->fib_nh_gw6 = cfg->fc_gw6;

        nh->fib_nh_flags = cfg->fc_flags;

#ifdef CONFIG_IP_ROUTE_CLASSID
        nh->nh_tclassid = cfg->fc_flow;
        if (nh->nh_tclassid)
                atomic_inc(&net->ipv4.fib_num_tclassid_users);
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        nh->fib_nh_weight = nh_weight;
#endif
        return 0;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH

/* Count the rtnexthop entries in a buffer of @remaining bytes starting at
 * @rtnh.
 *
 * Returns the number of entries, or 0 (with an extack message) when bytes
 * are left over after the last well-formed entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
                              struct netlink_ext_ack *extack)
{
        int count;

        for (count = 0; rtnh_ok(rtnh, remaining); count++)
                rtnh = rtnh_next(rtnh, &remaining);

        /* leftover implies invalid nexthop configuration, discard it */
        if (remaining > 0) {
                NL_SET_ERR_MSG(extack,
                               "Invalid nexthop configuration - extra data after nexthops");
                return 0;
        }

        return count;
}

/* Read an IPv4 gateway address from the attribute @nla into *@gw.
 *
 * Returns 0 on success, or -EINVAL (with an extack message) when the
 * attribute payload is shorter than an IPv4 address.
 */
static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
                            struct netlink_ext_ack *extack)
{
        if (nla_len(nla) >= sizeof(*gw)) {
                *gw = nla_get_in_addr(nla);
                return 0;
        }

        NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
        return -EINVAL;
}

/* only called when fib_nh is integrated into fib_info
 *
 * Initialise every nexthop embedded in @fi from the rtnexthop entries
 * starting at @rtnh (@remaining bytes). For each entry a temporary
 * fib_config is filled from the entry's flags, ifindex and attributes
 * (RTA_GATEWAY or RTA_VIA, RTA_FLOW, RTA_ENCAP, RTA_ENCAP_TYPE) and handed
 * to fib_nh_init(). Afterwards the first nexthop is checked against the
 * route level oif, gateway and (with CONFIG_IP_ROUTE_CLASSID) flow in @cfg.
 *
 * Returns 0 on success or a negative errno; most failures also set an
 * extack message.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
                       int remaining, struct fib_config *cfg,
                       struct netlink_ext_ack *extack)
{
        struct net *net = fi->fib_net;
        struct fib_config fib_cfg;
        struct fib_nh *nh;
        int ret;

        change_nexthops(fi) {
                int attrlen;

                /* Start from a clean per-nexthop config on every pass */
                memset(&fib_cfg, 0, sizeof(fib_cfg));

                if (!rtnh_ok(rtnh, remaining)) {
                        NL_SET_ERR_MSG(extack,
                                       "Invalid nexthop configuration - extra data after nexthop");
                        return -EINVAL;
                }

                if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
                        NL_SET_ERR_MSG(extack,
                                       "Invalid flags for nexthop - can not contain DEAD or LINKDOWN");
                        return -EINVAL;
                }

                /* Low 8 flag bits come from the nexthop entry, the rest
                 * from the route level flags.
                 */
                fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
                fib_cfg.fc_oif = rtnh->rtnh_ifindex;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        nlav = nla_find(attrs, attrlen, RTA_VIA);
                        if (nla && nlav) {
                                NL_SET_ERR_MSG(extack,
                                               "Nexthop configuration can not contain both GATEWAY and VIA");
                                return -EINVAL;
                        }
                        if (nla) {
                                ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
                                                       extack);
                                if (ret)
                                        goto errout;

                                /* An all-zero gateway leaves the family unset */
                                if (fib_cfg.fc_gw4)
                                        fib_cfg.fc_gw_family = AF_INET;
                        } else if (nlav) {
                                ret = fib_gw_from_via(&fib_cfg, nlav, extack);
                                if (ret)
                                        goto errout;
                        }

                        nla = nla_find(attrs, attrlen, RTA_FLOW);
                        if (nla) {
                                if (nla_len(nla) < sizeof(u32)) {
                                        NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
                                        return -EINVAL;
                                }
                                fib_cfg.fc_flow = nla_get_u32(nla);
                        }

                        fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
                        /* RTA_ENCAP_TYPE length checked in
                         * lwtunnel_valid_encap_type_attr
                         */
                        nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
                        if (nla)
                                fib_cfg.fc_encap_type = nla_get_u16(nla);
                }

                /* The nexthop weight is rtnh_hops + 1 */
                ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
                                  rtnh->rtnh_hops + 1, extack);
                if (ret)
                        goto errout;

                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);

        /* Route level attributes, when given, must agree with nexthop 0 */
        ret = -EINVAL;
        nh = fib_info_nh(fi, 0);
        if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
                NL_SET_ERR_MSG(extack,
                               "Nexthop device index does not match RTA_OIF");
                goto errout;
        }
        if (cfg->fc_gw_family) {
                if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
                    (cfg->fc_gw_family == AF_INET &&
                     nh->fib_nh_gw4 != cfg->fc_gw4) ||
                    (cfg->fc_gw_family == AF_INET6 &&
                     ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
                        NL_SET_ERR_MSG(extack,
                                       "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
                        goto errout;
                }
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
                NL_SET_ERR_MSG(extack,
                               "Nexthop class id does not match RTA_FLOW");
                goto errout;
        }
#endif
        ret = 0;
errout:
        return ret;
}

/* only called when fib_nh is integrated into fib_info
 *
 * Recompute fib_nh_upper_bound for every nexthop of a multipath @fi.
 * Nexthops that are dead, or link-down on a device for which
 * ip_ignore_linkdown() is true, get -1. Every other nexthop gets the
 * cumulative weight up to and including itself, scaled to 31 bits:
 * round(w * 2^31 / total) - 1.
 * Nothing is done when @fi has fewer than two paths.
 */
static void fib_rebalance(struct fib_info *fi)
{
        int total;
        int w;

        if (fib_info_num_path(fi) < 2)
                return;

        /* Pass 1: total weight of the usable nexthops */
        total = 0;
        for_nexthops(fi) {
                if (nh->fib_nh_flags & RTNH_F_DEAD)
                        continue;

                if (ip_ignore_linkdown(nh->fib_nh_dev) &&
                    nh->fib_nh_flags & RTNH_F_LINKDOWN)
                        continue;

                total += nh->fib_nh_weight;
        } endfor_nexthops(fi);

        /* Pass 2: assign the upper bounds, using the same usability test */
        w = 0;
        change_nexthops(fi) {
                int upper_bound;

                if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
                        upper_bound = -1;
                } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
                           nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
                        upper_bound = -1;
                } else {
                        w += nexthop_nh->fib_nh_weight;
                        upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
                                                            total) - 1;
                }

                atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
        } endfor_nexthops(fi);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Stub used when CONFIG_IP_ROUTE_MULTIPATH is disabled: multipath
 * configuration is always rejected with -EINVAL and an extack message.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
                       int remaining, struct fib_config *cfg,
                       struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");

        return -EINVAL;
}

/* Without multipath support there is nothing to rebalance. */
#define fib_rebalance(fi) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Compare the encapsulation described by @encap_type / @encap with the
 * lwtunnel state attached to @nh.
 *
 * A temporary lwtunnel state is built from the attributes, compared with
 * lwtunnel_cmp_encap() and freed again. Returns the comparison result, or 0
 * when @encap_type is LWTUNNEL_ENCAP_NONE or the temporary state could not
 * be built.
 */
static int fib_encap_match(struct net *net, u16 encap_type,
                           struct nlattr *encap,
                           const struct fib_nh *nh,
                           const struct fib_config *cfg,
                           struct netlink_ext_ack *extack)
{
        struct lwtunnel_state *built;
        int differs;

        if (encap_type == LWTUNNEL_ENCAP_NONE)
                return 0;

        if (lwtunnel_build_state(net, encap_type, encap, AF_INET,
                                 cfg, &built, extack))
                return 0;

        differs = lwtunnel_cmp_encap(built, nh->fib_nh_lws);
        lwtstate_free(built);

        return differs;
}

/* Check whether the nexthop specification in @cfg matches @fi.
 *
 * Returns 0 on a match, 1 on a mismatch, or a negative errno when the
 * multipath attributes in @cfg are malformed.
 */
int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
                 struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        struct rtnexthop *rtnh;
        int remaining;
#endif

        /* A zero priority in cfg acts as a wildcard */
        if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
                return 1;

        /* A nexthop id was requested: only a fib_info using that id matches */
        if (cfg->fc_nh_id) {
                if (fi->nh && cfg->fc_nh_id == fi->nh->id)
                        return 0;
                return 1;
        }

        /* fi uses a nexthop object: any legacy nexthop spec is a mismatch */
        if (fi->nh) {
                if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
                        return 1;
                return 0;
        }

        /* Single nexthop given at route level: compare with nexthop 0 */
        if (cfg->fc_oif || cfg->fc_gw_family) {
                struct fib_nh *nh;

                nh = fib_info_nh(fi, 0);
                if (cfg->fc_encap) {
                        if (fib_encap_match(net, cfg->fc_encap_type,
                                            cfg->fc_encap, nh, cfg, extack))
                                return 1;
                }
#ifdef CONFIG_IP_ROUTE_CLASSID
                if (cfg->fc_flow &&
                    cfg->fc_flow != nh->nh_tclassid)
                        return 1;
#endif
                if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
                    (cfg->fc_gw_family &&
                     cfg->fc_gw_family != nh->fib_nh_gw_family))
                        return 1;

                if (cfg->fc_gw_family == AF_INET &&
                    cfg->fc_gw4 != nh->fib_nh_gw4)
                        return 1;

                if (cfg->fc_gw_family == AF_INET6 &&
                    ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
                        return 1;

                return 0;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        /* No multipath attribute either: nothing left to compare */
        if (!cfg->fc_mp)
                return 0;

        rtnh = cfg->fc_mp;
        remaining = cfg->fc_mp_len;

        /* Walk fi's nexthops and the rtnexthop entries in lock step */
        for_nexthops(fi) {
                int attrlen;

                if (!rtnh_ok(rtnh, remaining))
                        return -EINVAL;

                /* A zero ifindex in the entry acts as a wildcard */
                if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
                        return 1;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
                        int err;

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        nlav = nla_find(attrs, attrlen, RTA_VIA);
                        if (nla && nlav) {
                                NL_SET_ERR_MSG(extack,
                                               "Nexthop configuration can not contain both GATEWAY and VIA");
                                return -EINVAL;
                        }

                        if (nla) {
                                __be32 gw;

                                err = fib_gw_from_attr(&gw, nla, extack);
                                if (err)
                                        return err;

                                if (nh->fib_nh_gw_family != AF_INET ||
                                    gw != nh->fib_nh_gw4)
                                        return 1;
                        } else if (nlav) {
                                struct fib_config cfg2;

                                err = fib_gw_from_via(&cfg2, nlav, extack);
                                if (err)
                                        return err;

                                /* NOTE(review): a nexthop whose gateway
                                 * family is neither AF_INET nor AF_INET6
                                 * is not compared against the VIA address.
                                 */
                                switch (nh->fib_nh_gw_family) {
                                case AF_INET:
                                        if (cfg2.fc_gw_family != AF_INET ||
                                            cfg2.fc_gw4 != nh->fib_nh_gw4)
                                                return 1;
                                        break;
                                case AF_INET6:
                                        if (cfg2.fc_gw_family != AF_INET6 ||
                                            ipv6_addr_cmp(&cfg2.fc_gw6,
                                                          &nh->fib_nh_gw6))
                                                return 1;
                                        break;
                                }
                        }

#ifdef CONFIG_IP_ROUTE_CLASSID
                        nla = nla_find(attrs, attrlen, RTA_FLOW);
                        if (nla) {
                                if (nla_len(nla) < sizeof(u32)) {
                                        NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
                                        return -EINVAL;
                                }
                                if (nla_get_u32(nla) != nh->nh_tclassid)
                                        return 1;
                        }
#endif
                }

                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);
#endif
        return 0;
}

/* Check whether every metric attribute supplied in @cfg equals the
 * corresponding metric stored in @fi.
 *
 * Returns true when @cfg carries no metrics or all of them match; false on
 * the first mismatch, on an attribute type above RTAX_MAX, or on a
 * non-string metric whose payload is not exactly 32 bits wide.
 */
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
{
        struct nlattr *nla;
        int remaining;

        if (!cfg->fc_mx)
                return true;

        nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
                int type = nla_type(nla);
                u32 fi_val, val;

                /* Attribute type 0 is skipped */
                if (!type)
                        continue;
                if (type > RTAX_MAX)
                        return false;

                /* Clamp the index under speculation before it is used to
                 * address the metrics array below.
                 */
                type = array_index_nospec(type, RTAX_MAX + 1);
                if (type == RTAX_CC_ALGO) {
                        char tmp[TCP_CA_NAME_MAX];
                        bool ecn_ca = false;

                        /* The algorithm is given by name; compare its key */
                        nla_strscpy(tmp, nla, sizeof(tmp));
                        val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
                } else {
                        if (nla_len(nla) != sizeof(u32))
                                return false;
                        val = nla_get_u32(nla);
                }

                /* Metric types are 1-based, the array is 0-based */
                fi_val = fi->fib_metrics->metrics[type - 1];
                /* DST_FEATURE_ECN_CA is masked out of the stored value
                 * before the comparison.
                 */
                if (type == RTAX_FEATURES)
                        fi_val &= ~DST_FEATURE_ECN_CA;

                if (fi_val != val)
                        return false;
        }

        return true;
}

/*
 * fib_check_nh_v6_gw - validate an IPv4 nexthop whose gateway is IPv6.
 *
 * A temporary fib6_nh is initialised from the nexthop's flags, oif and
 * gateway through the IPv6 stub. On success the resolved device is
 * copied into @nh (with its own tracked reference), the oif is refreshed
 * from that device, the scope is set to RT_SCOPE_LINK, and the temporary
 * fib6_nh is released again.
 *
 * Returns 0 on success or the error reported by fib6_nh_init().
 */
static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
                              u32 table, struct netlink_ext_ack *extack)
{
        struct fib6_nh scratch_nh = {};
        struct fib6_config v6_cfg = {
                .fc_table = table,
                .fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
                .fc_ifindex = nh->fib_nh_oif,
                .fc_gateway = nh->fib_nh_gw6,
        };
        int rc;

        rc = ipv6_stub->fib6_nh_init(net, &scratch_nh, &v6_cfg, GFP_KERNEL,
                                     extack);
        if (rc)
                return rc;

        /* Take our own reference before the scratch nexthop goes away. */
        nh->fib_nh_dev = scratch_nh.fib_nh_dev;
        netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_KERNEL);
        nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
        nh->fib_nh_scope = RT_SCOPE_LINK;

        ipv6_stub->fib6_nh_release(&scratch_nh);

        return 0;
}

/*
 * Picture
 * -------
 *
 * The semantics of a nexthop are very messy for historical reasons.
 * We have to take into account, that:
 * a) gateway can be actually local interface address,
 *    so that gatewayed route is direct.
 * b) gateway must be on-link address, possibly
 *    described not by an ifaddr, but also by a direct route.
 * c) If both gateway and interface are specified, they should not
 *    contradict.
 * d) If we use tunnel routes, gateway could be not on-link.
 *
 * Attempt to reconcile all of these (alas, self-contradictory) conditions
 * results in pretty ugly and hairy code with obscure logic.
 *
 * I chose to generalize it instead, so that the size
 * of the code practically does not increase, but it becomes
 * much more general.
 * Every prefix is assigned a "scope" value: "host" is local address,
 * "link" is direct route,
 * [ ... "site" ... "interior" ... ]
 * and "universe" is true gateway route with global meaning.
 *
 * Every prefix refers to a set of "nexthop"s (gw, oif),
 * where gw must have narrower scope. This recursion stops
 * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
 * which means that gw is forced to be on link.
 *
 * The code is still hairy, but now it is apparently logically
 * consistent and very flexible. E.g., as a by-product, it allows
 * independent exterior and interior routing processes to
 * co-exist in peace.
 *
 * Normally it looks as following.
 *
 * {universe prefix}  -> (gw, oif) [scope link]
 *                  |
 *                  |-> {link prefix} -> (gw, oif) [scope local]
 *                                        |
 *                                        |-> {local prefix} (terminal node)
 */
/*
 * fib_check_nh_v4_gw - validate a nexthop that has an IPv4 gateway and
 * bind it to its egress device.
 *
 * @net:    namespace used for the device and FIB lookups
 * @nh:     nexthop to check; on success fib_nh_dev (with a tracked
 *          reference) and fib_nh_scope are filled in, fib_nh_oif may be
 *          rewritten, and RTNH_F_LINKDOWN is set if the carrier is off
 * @table:  table tried first when resolving the gateway; 0 and
 *          RT_TABLE_MAIN skip the table-specific lookup
 * @scope:  scope of the route the nexthop belongs to
 * @extack: receives a human-readable reason on failure
 *
 * Returns 0 on success, or -EINVAL / -ENODEV / -ENETDOWN / the FIB lookup
 * error on failure.
 */
static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
                              u8 scope, struct netlink_ext_ack *extack)
{
        struct net_device *dev;
        struct fib_result res;
        int err = 0;

        /* ONLINK: the gateway is declared to be on fib_nh_oif, so no route
         * lookup is performed; only the device state and the gateway's
         * address type are checked.
         */
        if (nh->fib_nh_flags & RTNH_F_ONLINK) {
                unsigned int addr_type;

                if (scope >= RT_SCOPE_LINK) {
                        NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
                        return -EINVAL;
                }
                /* NOTE(review): lookup is done without taking a reference or
                 * the RCU read lock; presumably safe because the caller
                 * holds RTNL - confirm.
                 */
                dev = __dev_get_by_index(net, nh->fib_nh_oif);
                if (!dev) {
                        NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
                        return -ENODEV;
                }
                if (!(dev->flags & IFF_UP)) {
                        NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                        return -ENETDOWN;
                }
                addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
                if (addr_type != RTN_UNICAST) {
                        NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
                        return -EINVAL;
                }
                if (!netif_carrier_ok(dev))
                        nh->fib_nh_flags |= RTNH_F_LINKDOWN;
                nh->fib_nh_dev = dev;
                netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
                nh->fib_nh_scope = RT_SCOPE_LINK;
                return 0;
        }
        /* Not ONLINK: resolve the gateway through the FIB. The lookup result
         * is used without a reference, so stay inside the RCU read section
         * until the device reference has been taken.
         */
        rcu_read_lock();
        {
                struct fib_table *tbl = NULL;
                struct flowi4 fl4 = {
                        .daddr = nh->fib_nh_gw4,
                        .flowi4_scope = scope + 1,
                        .flowi4_oif = nh->fib_nh_oif,
                        .flowi4_iif = LOOPBACK_IFINDEX,
                };

                /* It is not necessary, but requires a bit of thinking */
                if (fl4.flowi4_scope < RT_SCOPE_LINK)
                        fl4.flowi4_scope = RT_SCOPE_LINK;

                if (table && table != RT_TABLE_MAIN)
                        tbl = fib_get_table(net, table);

                if (tbl)
                        err = fib_table_lookup(tbl, &fl4, &res,
                                               FIB_LOOKUP_IGNORE_LINKSTATE |
                                               FIB_LOOKUP_NOREF);

                /* on error or if no table given do full lookup. This
                 * is needed for example when nexthops are in the local
                 * table rather than the given table
                 */
                if (!tbl || err) {
                        err = fib_lookup(net, &fl4, &res,
                                         FIB_LOOKUP_IGNORE_LINKSTATE);
                }

                if (err) {
                        NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
                        goto out;
                }
        }

        /* Every failure from here to the IFF_UP check reports -EINVAL. */
        err = -EINVAL;
        if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
                NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
                goto out;
        }
        /* Inherit scope, oif and device from the route that reaches the gw. */
        nh->fib_nh_scope = res.scope;
        nh->fib_nh_oif = FIB_RES_OIF(res);
        nh->fib_nh_dev = dev = FIB_RES_DEV(res);
        if (!dev) {
                NL_SET_ERR_MSG(extack,
                               "No egress device for nexthop gateway");
                goto out;
        }
        /* The reference is taken before the IFF_UP test, so a -ENETDOWN
         * return still leaves nh->fib_nh_dev held; presumably the caller's
         * teardown of @nh drops it - verify against the caller.
         */
        netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
        if (!netif_carrier_ok(dev))
                nh->fib_nh_flags |= RTNH_F_LINKDOWN;
        err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
out:
        rcu_read_unlock();
        return err;
}

static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
                              struct netlink_ext_ack *extack)
{
        struct in_device *in_dev;
        int err;

        if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
                return -EINVAL;
        }

        rcu_read_lock();

        err = -ENODEV;
        in_dev = inetdev_by_index(net, nh->fib_nh_oif);
        if (!in_dev)
                goto out;
        err = -ENETDOWN;
        if (!(in_dev->dev->flags & IFF_UP)) {
                NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
                goto out;
        }

        nh->fib_nh_dev = in_dev->dev;
        netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
        nh->fib_nh_scope = RT_SCOPE_HOST;
        if (!netif_carrier_ok(nh->fib_nh_dev))
                nh->fib_nh_flags |= RTNH_F_LINKDOWN;
        err = 0;
out:
        rcu_read_unlock();
        return err;
}

/*
 * fib_check_nh - validate a nexthop, dispatching on its gateway family.
 *
 * IPv4 gateways go to fib_check_nh_v4_gw(), IPv6 gateways to
 * fib_check_nh_v6_gw(), and anything else (no gateway) to
 * fib_check_nh_nongw(). @scope is only consulted for IPv4 gateways and
 * @table only for gateway nexthops.
 *
 * Returns whatever the selected checker returns: 0 or a negative errno.
 */
int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
                 struct netlink_ext_ack *extack)
{
        switch (nh->fib_nh_gw_family) {
        case AF_INET:
                return fib_check_nh_v4_gw(net, nh, table, scope, extack);
        case AF_INET6:
                return fib_check_nh_v6_gw(net, nh, table, extack);
        default:
                return fib_check_nh_nongw(net, nh, extack);
        }
}

/*
 * fib_info_update_nhc_saddr - pick a source address for a nexthop.
 *
 * For an AF_INET nexthop the address is selected on the nexthop device
 * towards its IPv4 gateway and cached in the enclosing fib_nh together
 * with the current dev_addr_genid, so that readers can tell whether the
 * cached value is still current. For any other family the address is
 * selected on nhc->nhc_dev with no destination hint and nothing is
 * cached.
 *
 * Returns the selected address.
 */
__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
                                 unsigned char scope)
{
        struct fib_nh *v4_nh;
        __be32 src;

        if (nhc->nhc_family == AF_INET) {
                v4_nh = container_of(nhc, struct fib_nh, nh_common);
                src = inet_select_addr(v4_nh->fib_nh_dev, v4_nh->fib_nh_gw4,
                                       scope);

                /* Paired with the READ_ONCE() users of these two fields. */
                WRITE_ONCE(v4_nh->nh_saddr, src);
                WRITE_ONCE(v4_nh->nh_saddr_genid,
                           atomic_read(&net->ipv4.dev_addr_genid));

                return src;
        }

        return inet_select_addr(nhc->nhc_dev, 0, scope);
}

__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
{
        struct fib_nh_common *nhc = res->nhc;

        if (res->fi->fib_prefsrc)
                return res->fi->fib_prefsrc;

        if (nhc->nhc_family == AF_INET) {
                struct fib_nh *nh;

                nh = container_of(nhc, struct fib_nh, nh_common);
                if (READ_ONCE(nh->nh_saddr_genid) ==
                    atomic_read(&net->ipv4.dev_addr_genid))
                        return READ_ONCE(nh->nh_saddr);
        }

        return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
}

/*
 * fib_valid_prefsrc - check that a preferred source address is usable.
 *
 * A RTN_LOCAL route whose (non-zero) destination is the prefsrc itself
 * is accepted without a lookup. Otherwise the address must classify as
 * RTN_LOCAL, first in the route's table (with RT_TABLE_MAIN replaced by
 * RT_TABLE_LOCAL) and, failing that, in RT_TABLE_LOCAL.
 *
 * Returns true if the prefsrc is acceptable, false otherwise.
 */
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
{
        u32 tb_id;
        int addr_type;

        /* The route being added is the local address itself. */
        if (cfg->fc_type == RTN_LOCAL && cfg->fc_dst &&
            fib_prefsrc == cfg->fc_dst)
                return true;

        tb_id = cfg->fc_table;
        if (tb_id == RT_TABLE_MAIN)
                tb_id = RT_TABLE_LOCAL;

        addr_type = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
                                         fib_prefsrc, tb_id);

        /* Not local in the route's own table: fall back to the local table. */
        if (addr_type != RTN_LOCAL && tb_id != RT_TABLE_LOCAL)
                addr_type = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
                                                 fib_prefsrc, RT_TABLE_LOCAL);

        return addr_type == RTN_LOCAL;
}

/*
 * fib_create_info - build (or find and reuse) the fib_info for a route
 * request.
 *
 * @cfg:    parsed route request (type, scope, flags, metrics, nexthop id
 *          or inline nexthops, prefsrc, table, ...)
 * @extack: receives a human-readable reason on failure
 *
 * Must be called with RTNL held (see ASSERT_RTNL() below).
 *
 * On success returns a fib_info with fib_treeref accounted for the
 * caller: either a newly allocated one that has been linked into the
 * fib_info hash (and the prefsrc / per-device nexthop hashes), or an
 * already existing equivalent one found by fib_find_info() /
 * fib_find_info_nh(), in which case the freshly built copy is freed.
 *
 * On failure returns ERR_PTR() with -EINVAL, -ENOBUFS, -ENODEV or the
 * error from metrics initialisation / nexthop setup / nexthop checks.
 */
struct fib_info *fib_create_info(struct fib_config *cfg,
                                 struct netlink_ext_ack *extack)
{
        int err;
        struct fib_info *fi = NULL;
        struct nexthop *nh = NULL;
        struct fib_info *ofi;
        int nhs = 1;
        struct net *net = cfg->fc_nlinfo.nl_net;

        ASSERT_RTNL();
        /* Bound fc_type before it is used to index fib_props[]. */
        if (cfg->fc_type > RTN_MAX)
                goto err_inval;

        /* Fast check to catch the most weird cases */
        if (fib_props[cfg->fc_type].scope > cfg->fc_scope) {
                NL_SET_ERR_MSG(extack, "Invalid scope");
                goto err_inval;
        }

        /* DEAD and LINKDOWN are state computed here, not user input. */
        if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid rtm_flags - can not contain DEAD or LINKDOWN");
                goto err_inval;
        }

        /* Route refers to a nexthop object by id. */
        if (cfg->fc_nh_id) {
                /* Without metrics an existing fib_info for this nexthop can
                 * be reused directly; fi stays NULL if none is found.
                 */
                if (!cfg->fc_mx) {
                        fi = fib_find_info_nh(net, cfg);
                        if (fi) {
                                refcount_inc(&fi->fib_treeref);
                                return fi;
                        }
                }

                nh = nexthop_find_by_id(net, cfg->fc_nh_id);
                if (!nh) {
                        NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                        goto err_inval;
                }
                /* No inline fib_nh array is needed with a nexthop object. */
                nhs = 0;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (cfg->fc_mp) {
                nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
                if (nhs == 0)
                        goto err_inval;
        }
#endif

        fib_info_hash_grow(net);

        /* fib_info with a trailing array of nhs fib_nh entries, zeroed. */
        fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
        if (!fi) {
                err = -ENOBUFS;
                goto failure;
        }

        fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
        if (IS_ERR(fi->fib_metrics)) {
                /* fi is freed directly here instead of via free_fib_info();
                 * presumably because fib_metrics holds an error pointer.
                 */
                err = PTR_ERR(fi->fib_metrics);
                kfree(fi);
                return ERR_PTR(err);
        }

        fi->fib_net = net;
        fi->fib_protocol = cfg->fc_protocol;
        fi->fib_scope = cfg->fc_scope;
        fi->fib_flags = cfg->fc_flags;
        fi->fib_priority = cfg->fc_priority;
        fi->fib_prefsrc = cfg->fc_prefsrc;
        fi->fib_type = cfg->fc_type;
        fi->fib_tb_id = cfg->fc_table;

        fi->fib_nhs = nhs;
        if (nh) {
                /* Pin the nexthop object; a failed get means it is going away. */
                if (!nexthop_get(nh)) {
                        NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
                        err = -EINVAL;
                } else {
                        err = 0;
                        fi->nh = nh;
                }
        } else {
                /* Inline nexthops: set back-pointers, then parse them from
                 * the multipath attribute or from the single-nexthop fields.
                 */
                change_nexthops(fi) {
                        nexthop_nh->nh_parent = fi;
                } endfor_nexthops(fi)

                if (cfg->fc_mp)
                        err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
                                          extack);
                else
                        err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
        }

        if (err != 0)
                goto failure;

        /* Route types that carry an error code take no gateway, device or
         * multipath and skip all nexthop validation.
         */
        if (fib_props[cfg->fc_type].error) {
                if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
                        NL_SET_ERR_MSG(extack,
                                       "Gateway, device and multipath can not be specified for this route type");
                        goto err_inval;
                }
                goto link_it;
        } else {
                switch (cfg->fc_type) {
                case RTN_UNICAST:
                case RTN_LOCAL:
                case RTN_BROADCAST:
                case RTN_ANYCAST:
                case RTN_MULTICAST:
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "Invalid route type");
                        goto err_inval;
                }
        }

        if (cfg->fc_scope > RT_SCOPE_HOST) {
                NL_SET_ERR_MSG(extack, "Invalid scope");
                goto err_inval;
        }

        /* Validate the nexthop(s); three cases: nexthop object, host-scope
         * route (local address), everything else.
         */
        if (fi->nh) {
                err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
                if (err)
                        goto failure;
        } else if (cfg->fc_scope == RT_SCOPE_HOST) {
                /* Shadows the outer struct nexthop *nh, which is NULL on
                 * this branch (fi->nh is only set from it).
                 */
                struct fib_nh *nh = fi->fib_nh;

                /* Local address is added. */
                if (nhs != 1) {
                        NL_SET_ERR_MSG(extack,
                                       "Route with host scope can not have multiple nexthops");
                        goto err_inval;
                }
                if (nh->fib_nh_gw_family) {
                        NL_SET_ERR_MSG(extack,
                                       "Route with host scope can not have a gateway");
                        goto err_inval;
                }
                nh->fib_nh_scope = RT_SCOPE_NOWHERE;
                /* dev_get_by_index() returns the device with a reference
                 * held; the tracker is attached to it below.
                 */
                nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
                err = -ENODEV;
                if (!nh->fib_nh_dev)
                        goto failure;
                netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
                                     GFP_KERNEL);
        } else {
                int linkdown = 0;

                change_nexthops(fi) {
                        err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
                                           cfg->fc_table, cfg->fc_scope,
                                           extack);
                        if (err != 0)
                                goto failure;
                        if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
                                linkdown++;
                } endfor_nexthops(fi)
                /* The route is LINKDOWN only if every nexthop is. */
                if (linkdown == fi->fib_nhs)
                        fi->fib_flags |= RTNH_F_LINKDOWN;
        }

        if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) {
                NL_SET_ERR_MSG(extack, "Invalid prefsrc address");
                goto err_inval;
        }

        if (!fi->nh) {
                /* Prime each nexthop's cached source address and note
                 * whether any nexthop uses an IPv6 gateway.
                 */
                change_nexthops(fi) {
                        fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
                                                  fi->fib_scope);
                        if (nexthop_nh->fib_nh_gw_family == AF_INET6)
                                fi->fib_nh_is_v6 = true;
                } endfor_nexthops(fi)

                fib_rebalance(fi);
        }

link_it:
        /* Share an existing equivalent fib_info rather than adding a copy. */
        ofi = fib_find_info(fi);
        if (ofi) {
                /* fib_table_lookup() should not see @fi yet. */
                fi->fib_dead = 1;
                free_fib_info(fi);
                refcount_inc(&ofi->fib_treeref);
                return ofi;
        }

        refcount_set(&fi->fib_treeref, 1);
        refcount_set(&fi->fib_clntref, 1);

        /* Publish the new fib_info in the lookup structures. */
        net->ipv4.fib_info_cnt++;
        hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));

        if (fi->fib_prefsrc) {
                struct hlist_head *head;

                head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc);
                hlist_add_head(&fi->fib_lhash, head);
        }
        if (fi->nh) {
                /* fi->nh is the outer nh here; track fi on the nexthop object. */
                list_add(&fi->nh_list, &nh->fi_list);
        } else {
                /* Hash each inline nexthop by device; nexthops without a
                 * device (the error route types that jumped to link_it)
                 * are skipped.
                 */
                change_nexthops(fi) {
                        struct hlist_head *head;

                        if (!nexthop_nh->fib_nh_dev)
                                continue;
                        head = fib_nh_head(nexthop_nh->fib_nh_dev);
                        hlist_add_head_rcu(&nexthop_nh->nh_hash, head);
                } endfor_nexthops(fi)
        }
        return fi;

err_inval:
        err = -EINVAL;

failure:
        /* fi is NULL for failures that happen before the allocation. */
        if (fi) {
                /* fib_table_lookup() should not see @fi yet. */
                fi->fib_dead = 1;
                free_fib_info(fi);
        }

        return ERR_PTR(err);
}

/*
 * fib_nexthop_info - emit the netlink attributes describing one nexthop
 * and accumulate its RTNH_F_* flags.
 *
 * @skb:       message being built
 * @nhc:       nexthop to describe
 * @rt_family: address family of the route that owns the nexthop
 * @flags:     in/out; nexthop flags are OR-ed into *flags
 * @skip_oif:  when true, RTA_OIF is not emitted
 *
 * Emits RTA_GATEWAY (or RTA_VIA when an IPv6 gateway is used by a route
 * of a different family), optionally RTA_OIF, and the lightweight tunnel
 * encapsulation if one is attached.
 *
 * Returns 0 on success or -EMSGSIZE if an attribute did not fit.
 */
int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
                     u8 rt_family, unsigned char *flags, bool skip_oif)
{
        if (nhc->nhc_flags & RTNH_F_DEAD)
                *flags |= RTNH_F_DEAD;

        if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
                *flags |= RTNH_F_LINKDOWN;

                /* A link-down nexthop is also reported dead when the device
                 * is configured to ignore link-down routes.
                 */
                rcu_read_lock();
                if (nhc->nhc_family == AF_INET) {
                        if (ip_ignore_linkdown(nhc->nhc_dev))
                                *flags |= RTNH_F_DEAD;
                } else if (nhc->nhc_family == AF_INET6) {
                        if (ip6_ignore_linkdown(nhc->nhc_dev))
                                *flags |= RTNH_F_DEAD;
                }
                rcu_read_unlock();
        }

        if (nhc->nhc_gw_family == AF_INET) {
                if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
                        return -EMSGSIZE;
        } else if (nhc->nhc_gw_family == AF_INET6) {
                if (rt_family == nhc->nhc_gw_family) {
                        if (nla_put_in6_addr(skb, RTA_GATEWAY,
                                             &nhc->nhc_gw.ipv6) < 0)
                                return -EMSGSIZE;
                } else {
                        /* Gateway family differs from the route family:
                         * encode the gateway as RTA_VIA instead.
                         */
                        const int addr_len = sizeof(struct in6_addr);
                        struct nlattr *via_attr;
                        struct rtvia *via;

                        /* The extra 2 bytes presumably hold rtvia_family. */
                        via_attr = nla_reserve(skb, RTA_VIA, addr_len + 2);
                        if (!via_attr)
                                return -EMSGSIZE;

                        via = nla_data(via_attr);
                        via->rtvia_family = AF_INET6;
                        memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, addr_len);
                }
        }

        *flags |= (nhc->nhc_flags &
                   (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP));

        if (!skip_oif && nhc->nhc_dev) {
                if (nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
                        return -EMSGSIZE;
        }

        if (nhc->nhc_lwtstate) {
                if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
                                        RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
                        return -EMSGSIZE;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(fib_nexthop_info);

#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
/*
 * fib_add_nexthop - append one struct rtnexthop entry, followed by its
 * attributes, to a multipath dump.
 *
 * @skb:         message being built
 * @nhc:         nexthop to describe
 * @nh_weight:   nexthop weight; stored as weight - 1 in rtnh_hops
 * @rt_family:   address family of the owning route
 * @nh_tclassid: routing class id, emitted as RTA_FLOW when non-zero
 *
 * Returns 0 on success or -EMSGSIZE if the entry did not fit.
 */
int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
                    int nh_weight, u8 rt_family, u32 nh_tclassid)
{
        const struct net_device *dev = nhc->nhc_dev;
        unsigned char nh_flags = 0;
        struct rtnexthop *rtnh;

        rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
        if (!rtnh)
                return -EMSGSIZE;

        rtnh->rtnh_hops = nh_weight - 1;
        if (dev)
                rtnh->rtnh_ifindex = dev->ifindex;
        else
                rtnh->rtnh_ifindex = 0;

        /* The ifindex lives in the rtnexthop header, so RTA_OIF is skipped. */
        if (fib_nexthop_info(skb, nhc, rt_family, &nh_flags, true) < 0)
                return -EMSGSIZE;

        rtnh->rtnh_flags = nh_flags;

        if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid))
                return -EMSGSIZE;

        /* length of rtnetlink header + attributes */
        rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

        return 0;
}
EXPORT_SYMBOL_GPL(fib_add_nexthop);
#endif

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
 * fib_add_multipath - emit the RTA_MULTIPATH nest for a fib_info.
 *
 * When the fib_info uses a nexthop object the nest is filled by
 * nexthop_mpath_fill_node(); otherwise one entry per inline nexthop is
 * added with fib_add_nexthop().
 *
 * Returns 0 on success or -EMSGSIZE if the message ran out of room.
 */
static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
        struct nlattr *nest;

        nest = nla_nest_start_noflag(skb, RTA_MULTIPATH);
        if (!nest)
                return -EMSGSIZE;

        if (unlikely(fi->nh)) {
                if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
                        return -EMSGSIZE;
        } else {
                for_nexthops(fi) {
                        u32 classid = 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
                        classid = nh->nh_tclassid;
#endif
                        if (fib_add_nexthop(skb, &nh->nh_common,
                                            nh->fib_nh_weight,
                                            AF_INET, classid) < 0)
                                return -EMSGSIZE;
                } endfor_nexthops(fi);
        }

        nla_nest_end(skb, nest);

        return 0;
}
#else
/* Multipath support is compiled out: there is nothing to emit. */
static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
{
        return 0;
}
#endif

/*
 * fib_dump_info - serialise one IPv4 route into a netlink message.
 *
 * @skb:    buffer the message is appended to
 * @portid: netlink port id placed in the message header
 * @seq:    netlink sequence number
 * @event:  netlink message type for the header
 * @fri:    route description (fib_info, table id, destination, prefix
 *          length, dscp, type and offload state)
 * @flags:  netlink message flags
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header or any
 * attribute does not fit; in the attribute case the partially built
 * message is cancelled first.
 */
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
                  const struct fib_rt_info *fri, unsigned int flags)
{
        unsigned int nhs = fib_info_num_path(fri->fi);
        struct fib_info *fi = fri->fi;
        u32 tb_id = fri->tb_id;
        struct nlmsghdr *nlh;
        struct rtmsg *rtm;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtm = nlmsg_data(nlh);
        rtm->rtm_family = AF_INET;
        rtm->rtm_dst_len = fri->dst_len;
        rtm->rtm_src_len = 0;
        rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp);
        /* Table ids of 256 and above are reported as RT_TABLE_COMPAT in the
         * header (rtm_table is presumably an 8-bit field); the full id
         * always travels in RTA_TABLE.
         */
        if (tb_id < 256)
                rtm->rtm_table = tb_id;
        else
                rtm->rtm_table = RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, tb_id))
                goto nla_put_failure;
        rtm->rtm_type = fri->type;
        rtm->rtm_flags = fi->fib_flags;
        rtm->rtm_scope = fi->fib_scope;
        rtm->rtm_protocol = fi->fib_protocol;

        /* Optional attributes are only emitted when they carry a value. */
        if (rtm->rtm_dst_len &&
            nla_put_in_addr(skb, RTA_DST, fri->dst))
                goto nla_put_failure;
        if (fi->fib_priority &&
            nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
                goto nla_put_failure;
        if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
                goto nla_put_failure;

        if (fi->fib_prefsrc &&
            nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
                goto nla_put_failure;

        if (fi->nh) {
                if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
                        goto nla_put_failure;
                if (nexthop_is_blackhole(fi->nh))
                        rtm->rtm_type = RTN_BLACKHOLE;
                /* With compat mode off only the nexthop id is reported; the
                 * expanded nexthop attributes below are skipped.
                 */
                if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
                        goto offload;
        }

        if (nhs == 1) {
                const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
                unsigned char flags = 0;

                if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
                        goto nla_put_failure;

                /* Replaces the fi->fib_flags value stored above with the
                 * flags computed for the single nexthop.
                 */
                rtm->rtm_flags = flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
                if (nhc->nhc_family == AF_INET) {
                        struct fib_nh *nh;

                        nh = container_of(nhc, struct fib_nh, nh_common);
                        if (nh->nh_tclassid &&
                            nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
                                goto nla_put_failure;
                }
#endif
        } else {
                if (fib_add_multipath(skb, fi) < 0)
                        goto nla_put_failure;
        }

offload:
        /* Hardware offload state is OR-ed in on top of the route flags. */
        if (fri->offload)
                rtm->rtm_flags |= RTM_F_OFFLOAD;
        if (fri->trap)
                rtm->rtm_flags |= RTM_F_TRAP;
        if (fri->offload_failed)
                rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        /* Trim the partially written message back out of the skb. */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Update FIB if:
 * - local address disappeared -> we must delete all the entries
 *   referring to it.
 * - device went down -> we must shutdown all nexthops going via it.
 *
 * fib_sync_down_addr() handles the first case: every fib_info in the
 * device's namespace and FIB table (the l3mdev table of @dev, or
 * RT_TABLE_MAIN when there is none) whose prefsrc equals @local is
 * flagged RTNH_F_DEAD and marked pfsrc_removed.
 *
 * Returns the number of fib_infos flagged; 0 when @local is zero.
 */
int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
        int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
        struct net *net = dev_net(dev);
        struct hlist_head *bucket;
        struct fib_info *fi;
        int flagged = 0;

        if (!local)
                return 0;

        /* The bucket is keyed by address only, so entries are re-checked. */
        bucket = fib_info_laddrhash_bucket(net, local);
        hlist_for_each_entry(fi, bucket, fib_lhash) {
                if (net_eq(fi->fib_net, net) &&
                    fi->fib_tb_id == tb_id &&
                    fi->fib_prefsrc == local) {
                        fi->fib_flags |= RTNH_F_DEAD;
                        fi->pfsrc_removed = true;
                        flagged++;
                }
        }

        return flagged;
}

static int call_fib_nh_notifiers(struct fib_nh *nh,
                                 enum fib_event_type event_type)
{
        bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
        struct fib_nh_notifier_info info = {
                .fib_nh = nh,
        };

        switch (event_type) {
        case FIB_EVENT_NH_ADD:
                if (nh->fib_nh_flags & RTNH_F_DEAD)
                        break;
                if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
                        break;
                return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
                                           &info.info);
        case FIB_EVENT_NH_DEL:
                if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
                    (nh->fib_nh_flags & RTNH_F_DEAD))
                        return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
                                                   event_type, &info.info);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Propagate a first-hop MTU change to the nexthop's cached exceptions.
 *
 * @nhc:  nexthop whose exception table is walked
 * @new:  the device's new MTU
 * @orig: the device's MTU before the change
 *
 * For an ordinary (unlocked) exception the PMTU is replaced by @new when
 * - @new is smaller than the stored PMTU, or
 * - the stored PMTU equals @orig, i.e. the old device MTU was what
 *   capped discovery; with that cap moved, larger MTUs on the path can
 *   be discovered again.
 *
 * Locked exceptions hold a PMTU below the minimal accepted PMTU:
 * - if @new is greater than the stored PMTU, nothing changes;
 * - otherwise the exception is unlocked and its PMTU set to @new.
 */
void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
{
        struct fnhe_hash_bucket *hash;
        int slot;

        hash = rcu_dereference_protected(nhc->nhc_exceptions, 1);
        if (!hash)
                return;

        for (slot = 0; slot < FNHE_HASH_SIZE; slot++) {
                struct fib_nh_exception *fnhe;

                fnhe = rcu_dereference_protected(hash[slot].chain, 1);
                while (fnhe) {
                        if (fnhe->fnhe_mtu_locked) {
                                if (new <= fnhe->fnhe_pmtu) {
                                        fnhe->fnhe_pmtu = new;
                                        fnhe->fnhe_mtu_locked = false;
                                }
                        } else if (new < fnhe->fnhe_pmtu ||
                                   orig == fnhe->fnhe_pmtu) {
                                fnhe->fnhe_pmtu = new;
                        }

                        fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1);
                }
        }
}

/*
 * fib_sync_mtu - apply a device MTU change to every nexthop hashed on
 * that device.
 *
 * @dev:      device whose MTU changed; dev->mtu already holds the new value
 * @orig_mtu: MTU of @dev before the change
 */
void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
        struct hlist_head *bucket = fib_nh_head(dev);
        struct fib_nh *cur;

        hlist_for_each_entry(cur, bucket, nh_hash) {
                /* Everything in this bucket is expected to be on @dev. */
                DEBUG_NET_WARN_ON_ONCE(cur->fib_nh_dev != dev);
                fib_nhc_update_mtu(&cur->nh_common, dev->mtu, orig_mtu);
        }
}

/* Event              force Flags           Description
 * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
 * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
 * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
 * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
 *
 * only used when fib_nh is built into fib_info
 *
 * Walks every nexthop hashed on @dev, flags the affected nexthops
 * according to the table above, sends FIB_EVENT_NH_DEL for them, and
 * flags the owning fib_info once all of its nexthops are dead.
 *
 * Returns the number of fib_infos that ended up with all nexthops dead.
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
        struct hlist_head *head = fib_nh_head(dev);
        struct fib_info *prev_fi = NULL;
        /* Nexthops whose scope equals this value are left alone. Without
         * force that exempts RT_SCOPE_NOWHERE nexthops; with force, -1 is
         * presumably a value no nexthop scope can hold, so none are exempt.
         */
        int scope = RT_SCOPE_NOWHERE;
        struct fib_nh *nh;
        int ret = 0;

        if (force)
                scope = -1;

        hlist_for_each_entry(nh, head, nh_hash) {
                struct fib_info *fi = nh->nh_parent;
                int dead;

                BUG_ON(!fi->fib_nhs);
                DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
                /* All nexthops of a fib_info are handled in one pass below,
                 * so skip an entry whose parent was just processed.
                 */
                if (fi == prev_fi)
                        continue;
                prev_fi = fi;
                dead = 0;
                change_nexthops(fi) {
                        /* Count nexthops that were already dead ... */
                        if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
                                dead++;
                        /* ... and take down live ones that use this device. */
                        else if (nexthop_nh->fib_nh_dev == dev &&
                                 nexthop_nh->fib_nh_scope != scope) {
                                switch (event) {
                                case NETDEV_DOWN:
                                case NETDEV_UNREGISTER:
                                        nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
                                        fallthrough;
                                case NETDEV_CHANGE:
                                        nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
                                        break;
                                }
                                call_fib_nh_notifiers(nexthop_nh,
                                                      FIB_EVENT_NH_DEL);
                                dead++;
                        }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                        /* On unregister, one nexthop on the vanishing device
                         * makes the whole fib_info count as dead; stop
                         * scanning its remaining nexthops.
                         */
                        if (event == NETDEV_UNREGISTER &&
                            nexthop_nh->fib_nh_dev == dev) {
                                dead = fi->fib_nhs;
                                break;
                        }
#endif
                } endfor_nexthops(fi)
                /* Every nexthop is dead: flag the fib_info itself. */
                if (dead == fi->fib_nhs) {
                        switch (event) {
                        case NETDEV_DOWN:
                        case NETDEV_UNREGISTER:
                                fi->fib_flags |= RTNH_F_DEAD;
                                fallthrough;
                        case NETDEV_CHANGE:
                                fi->fib_flags |= RTNH_F_LINKDOWN;
                                break;
                        }
                        ret++;
                }

                fib_rebalance(fi);
        }

        return ret;
}

/*
 * Pick a default route among the aliases on res->fa_head and, when a better
 * candidate than the current one is found, store it in @res via
 * fib_result_assign(). The index of the chosen candidate is remembered in the
 * fa_default field of the alias that carries res->fi.
 *
 * @flp: flow being routed; only used for the DSCP match.
 * @res: lookup result to refine; res->fi, res->fa_head, res->table,
 *       res->prefixlen and res->scope are read.
 *
 * Must be invoked inside of an RCU protected region.
 */
static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
        struct fib_info *fi = NULL, *last_resort = NULL;
        struct hlist_head *fa_head = res->fa_head;
        struct fib_table *tb = res->table;
        u8 slen = 32 - res->prefixlen;
        int order = -1, last_idx = -1;
        struct fib_alias *fa, *fa1 = NULL;
        u32 last_prio = res->fi->fib_priority;
        dscp_t last_dscp = 0;

        hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
                struct fib_info *next_fi = fa->fa_info;
                struct fib_nh_common *nhc;

                /* Only aliases of the same suffix length, DSCP and table. */
                if (fa->fa_slen != slen)
                        continue;
                if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
                        continue;
                if (fa->tb_id != tb->tb_id)
                        continue;
                /*
                 * A higher priority value than the last accepted alias with
                 * the same DSCP: skip it when that DSCP is non-zero,
                 * otherwise stop scanning altogether.
                 */
                if (next_fi->fib_priority > last_prio &&
                    fa->fa_dscp == last_dscp) {
                        if (last_dscp)
                                continue;
                        break;
                }
                if (next_fi->fib_flags & RTNH_F_DEAD)
                        continue;
                last_dscp = fa->fa_dscp;
                last_prio = next_fi->fib_priority;

                if (next_fi->fib_scope != res->scope ||
                    fa->fa_type != RTN_UNICAST)
                        continue;

                /* The first nexthop must be a link-scope gateway. */
                nhc = fib_info_nhc(next_fi, 0);
                if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
                        continue;

                fib_alias_accessed(fa);

                if (!fi) {
                        /*
                         * The first eligible alias must be the one already in
                         * @res; otherwise give up.
                         */
                        if (next_fi != res->fi)
                                break;
                        fa1 = fa;
                } else if (!fib_detect_death(fi, order, &last_resort,
                                             &last_idx, fa1->fa_default)) {
                        /*
                         * A zero return from fib_detect_death() is treated
                         * here as "the previous candidate is usable".
                         */
                        fib_result_assign(res, fi);
                        fa1->fa_default = order;
                        goto out;
                }
                /* Evaluate this alias on the next iteration (or after the loop). */
                fi = next_fi;
                order++;
        }

        /* Fewer than two candidates: nothing to choose between. */
        if (order <= 0 || !fi) {
                if (fa1)
                        fa1->fa_default = -1;
                goto out;
        }

        /* The last candidate has not been checked inside the loop yet. */
        if (!fib_detect_death(fi, order, &last_resort, &last_idx,
                              fa1->fa_default)) {
                fib_result_assign(res, fi);
                fa1->fa_default = order;
                goto out;
        }

        /* No candidate passed; fall back to what fib_detect_death() recorded. */
        if (last_idx >= 0)
                fib_result_assign(res, last_resort);
        fa1->fa_default = last_idx;
out:
        return;
}

/*
 * A dead device has come back up: wake up the dead nexthops that use it.
 * This only makes sense for multipath routes.
 *
 * Only used when fib_nh is built into fib_info.
 *
 * @dev:      device that came up
 * @nh_flags: RTNH_F_* flags to clear on the affected nexthops
 *
 * Returns the number of fib_info entries that have at least one live
 * nexthop after the update, or 0 when @dev is not IFF_UP.
 */
int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
        struct fib_info *last_fi = NULL;
        struct hlist_head *bucket;
        struct fib_nh *nh;
        int updated = 0;

        if (!(dev->flags & IFF_UP))
                return 0;

        /* When clearing DEAD on a running link, clear LINKDOWN as well. */
        if ((nh_flags & RTNH_F_DEAD) &&
            (dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
                nh_flags |= RTNH_F_LINKDOWN;

        bucket = fib_nh_head(dev);

        hlist_for_each_entry(nh, bucket, nh_hash) {
                struct fib_info *fi = nh->nh_parent;
                int alive = 0;

                BUG_ON(!fi->fib_nhs);
                DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);

                /* Handle each fib_info once even if several nexthops hit it. */
                if (fi == last_fi)
                        continue;
                last_fi = fi;

                change_nexthops(fi) {
                        struct net_device *nh_dev;

                        /* None of the flags set: already alive. */
                        if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
                                alive++;
                                continue;
                        }

                        nh_dev = nexthop_nh->fib_nh_dev;
                        if (!nh_dev || !(nh_dev->flags & IFF_UP))
                                continue;
                        if (nh_dev != dev || !__in_dev_get_rtnl(dev))
                                continue;

                        nexthop_nh->fib_nh_flags &= ~nh_flags;
                        alive++;
                        call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
                } endfor_nexthops(fi)

                if (alive > 0) {
                        fi->fib_flags &= ~nh_flags;
                        updated++;
                }

                fib_rebalance(fi);
        }

        return updated;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
 * Tell whether @nh looks usable, judged by the neighbour entry of its
 * gateway.
 *
 * Nexthops whose scope is not RT_SCOPE_LINK, and link-scope nexthops for
 * which no neighbour entry is found, are evaluated as NUD_REACHABLE.
 * Otherwise the neighbour's current nud_state is tested against NUD_VALID.
 */
static bool fib_good_nh(const struct fib_nh *nh)
{
        struct neighbour *neigh = NULL;
        int nud = NUD_REACHABLE;

        if (nh->fib_nh_scope != RT_SCOPE_LINK)
                goto done;

        /* The _noref lookups take no reference; stay under RCU while reading. */
        rcu_read_lock();

        if (likely(nh->fib_nh_gw_family == AF_INET))
                neigh = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
                                                  (__force u32)nh->fib_nh_gw4);
        else if (nh->fib_nh_gw_family == AF_INET6)
                neigh = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
                                                       &nh->fib_nh_gw6);

        if (neigh)
                nud = READ_ONCE(neigh->nud_state);

        rcu_read_unlock();
done:
        return (nud & NUD_VALID) != 0;
}

/*
 * Choose one nexthop of a multipath route for the given @hash and record it
 * in @res (nh_sel and nhc).
 *
 * Routes backed by a nexthop object (fi->nh) are delegated to
 * nexthop_path_fib_result(). Otherwise the nexthops are walked in order and
 * the first one whose upper bound is not below @hash wins. With the
 * fib_multipath_use_neigh sysctl enabled, nexthops rejected by
 * fib_good_nh() are skipped and the first accepted one is stored up front,
 * so it remains in @res if no later nexthop matches the hash.
 */
void fib_select_multipath(struct fib_result *res, int hash)
{
        struct fib_info *fi = res->fi;
        struct net *net = fi->fib_net;
        bool have_fallback = false;

        if (unlikely(fi->nh)) {
                nexthop_path_fib_result(res, hash);
                return;
        }

        change_nexthops(fi) {
                if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) {
                        if (!fib_good_nh(nexthop_nh))
                                continue;
                        if (!have_fallback) {
                                have_fallback = true;
                                res->nhc = &nexthop_nh->nh_common;
                                res->nh_sel = nhsel;
                        }
                }

                if (hash <= atomic_read(&nexthop_nh->fib_nh_upper_bound)) {
                        res->nhc = &nexthop_nh->nh_common;
                        res->nh_sel = nhsel;
                        return;
                }
        } endfor_nexthops(fi);
}
#endif

/*
 * Refine a FIB lookup result: pick a path among several candidates and fill
 * in the flow's source address when it is still unset.
 *
 * @net: network namespace of the lookup
 * @res: lookup result; may be updated with the selected nexthop/route
 * @fl4: flow key; fl4->saddr is written when it is zero on entry
 * @skb: packet handed to fib_multipath_hash() (multipath builds only)
 */
void fib_select_path(struct net *net, struct fib_result *res,
                     struct flowi4 *fl4, const struct sk_buff *skb)
{
        /* An explicit output interface disables path selection. */
        if (fl4->flowi4_oif)
                goto check_saddr;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (fib_info_num_path(res->fi) > 1) {
                int h = fib_multipath_hash(net, fl4, skb, NULL);

                fib_select_multipath(res, h);
        }
        else
#endif
        /*
         * Note: with CONFIG_IP_ROUTE_MULTIPATH this "if" is the else-branch
         * of the multipath test above; without it, it stands alone.
         */
        if (!res->prefixlen &&
            res->table->tb_num_default > 1 &&
            res->type == RTN_UNICAST)
                fib_select_default(fl4, res);

check_saddr:
        if (!fl4->saddr) {
                struct net_device *l3mdev;

                l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);

                /*
                 * Use the route's preferred source unless the flow names an
                 * l3mdev that is not the master of the result's device; in
                 * that case select an address from the l3mdev itself.
                 */
                if (!l3mdev ||
                    l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
                        fl4->saddr = fib_result_prefsrc(net, res);
                else
                        fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
        }
}

/*
 * Per-namespace setup: allocate the fib_info hash table with 2^4 buckets'
 * worth of hash bits and reset the fib_info counter.
 *
 * Returns 0 on success or -ENOMEM when the table cannot be allocated, in
 * which case the bit count and counter are left untouched.
 */
int __net_init fib4_semantics_init(struct net *net)
{
        const unsigned int initial_bits = 4;

        net->ipv4.fib_info_hash = fib_info_hash_alloc(initial_bits);
        if (net->ipv4.fib_info_hash) {
                net->ipv4.fib_info_hash_bits = initial_bits;
                net->ipv4.fib_info_cnt = 0;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Per-namespace teardown: release the fib_info hash table of @net that
 * fib4_semantics_init() stored in net->ipv4.fib_info_hash.
 */
void __net_exit fib4_semantics_exit(struct net *net)
{
        fib_info_hash_free(net->ipv4.fib_info_hash);
}




















































































































































































   11 
















    5 





   10 








   10 






















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
// SPDX-License-Identifier: GPL-2.0-only
/*
 * IRQ offload/bypass manager
 *
 * Copyright (C) 2015 Red Hat, Inc.
 * Copyright (c) 2015 Linaro Ltd.
 *
 * Various virtualization hardware acceleration techniques allow bypassing or
 * offloading interrupts received from devices around the host kernel.  Posted
 * Interrupts on Intel VT-d systems can allow interrupts to be received
 * directly by a virtual machine.  ARM IRQ Forwarding allows forwarded physical
 * interrupts to be directly deactivated by the guest.  This manager allows
 * interrupt producers and consumers to find each other to enable this sort of
 * bypass.
 */

#include <linux/irqbypass.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IRQ bypass manager utility module");

static LIST_HEAD(producers);
static LIST_HEAD(consumers);
static DEFINE_MUTEX(lock);

/*
 * Pair @prod with @cons. @lock must be held by the caller.
 *
 * Both sides are stopped (when they provide a stop callback), the optional
 * producer add_consumer() and the mandatory consumer add_producer() are
 * invoked, and both sides are started again whether or not the pairing
 * succeeded. If add_producer() fails, the producer side is rolled back via
 * del_consumer().
 *
 * Returns 0 on success or the first callback error.
 */
static int __connect(struct irq_bypass_producer *prod,
                     struct irq_bypass_consumer *cons)
{
        int err = 0;

        if (prod->stop)
                prod->stop(prod);
        if (cons->stop)
                cons->stop(cons);

        if (prod->add_consumer) {
                err = prod->add_consumer(prod, cons);
                if (err)
                        goto restart;
        }

        err = cons->add_producer(cons, prod);
        if (err && prod->del_consumer)
                prod->del_consumer(prod, cons);

restart:
        if (cons->start)
                cons->start(cons);
        if (prod->start)
                prod->start(prod);

        return err;
}

/*
 * Undo a pairing made by __connect(). @lock must be held when calling
 * disconnect.
 *
 * Mirrors __connect(): stop both sides, remove the producer from the
 * consumer (mandatory callback), remove the consumer from the producer
 * (optional callback), then start both sides again. The stop/start and
 * del_consumer callbacks are skipped when not provided.
 */
static void __disconnect(struct irq_bypass_producer *prod,
                         struct irq_bypass_consumer *cons)
{
        if (prod->stop)
                prod->stop(prod);
        if (cons->stop)
                cons->stop(cons);

        /* del_producer is called unconditionally, without a NULL check. */
        cons->del_producer(cons, prod);

        if (prod->del_consumer)
                prod->del_consumer(prod, cons);

        if (cons->start)
                cons->start(cons);
        if (prod->start)
                prod->start(prod);
}

/**
 * irq_bypass_register_producer - register IRQ bypass producer
 * @producer: pointer to producer structure
 *
 * Add the provided IRQ producer to the list of producers and connect
 * with any matching token found on the IRQ consumers list.
 *
 * A reference on this module is taken and kept for as long as the producer
 * stays registered; it is dropped again on every error path.
 *
 * Return: 0 on success, -EINVAL if @producer has no token, -ENODEV if the
 * module reference cannot be taken, -EBUSY if a producer with the same token
 * or this very producer structure is already registered, or the error
 * returned while connecting to a matching consumer.
 */
int irq_bypass_register_producer(struct irq_bypass_producer *producer)
{
        struct irq_bypass_producer *tmp;
        struct irq_bypass_consumer *consumer;
        int ret;

        if (!producer->token)
                return -EINVAL;

        might_sleep();

        if (!try_module_get(THIS_MODULE))
                return -ENODEV;

        mutex_lock(&lock);

        /*
         * Reject duplicate tokens, and also reject a structure that is
         * already on the list (e.g. re-registered with a different token):
         * list_add() on an already linked node would corrupt the list.
         * This matches the check done for consumers.
         */
        list_for_each_entry(tmp, &producers, node) {
                if (tmp->token == producer->token || tmp == producer) {
                        ret = -EBUSY;
                        goto out_err;
                }
        }

        /* Consumer tokens are unique, so at most one consumer can match. */
        list_for_each_entry(consumer, &consumers, node) {
                if (consumer->token == producer->token) {
                        ret = __connect(producer, consumer);
                        if (ret)
                                goto out_err;
                        break;
                }
        }

        list_add(&producer->node, &producers);

        mutex_unlock(&lock);

        return 0;
out_err:
        mutex_unlock(&lock);
        module_put(THIS_MODULE);
        return ret;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_producer);

/**
 * irq_bypass_unregister_producer - unregister IRQ bypass producer
 * @producer: pointer to producer structure
 *
 * Remove a previously registered IRQ producer from the list of producers
 * and disconnect it from any connected IRQ consumer.
 *
 * Calling this for a producer that is not on the list (for instance one
 * whose registration failed) is a no-op. Nothing is done either when
 * @producer has no token.
 */
void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
{
        struct irq_bypass_producer *tmp;
        struct irq_bypass_consumer *consumer;

        if (!producer->token)
                return;

        might_sleep();

        if (!try_module_get(THIS_MODULE))
                return; /* nothing in the list anyway */

        mutex_lock(&lock);

        list_for_each_entry(tmp, &producers, node) {
                /*
                 * Match the structure itself, not just its token. A producer
                 * that failed registration with -EBUSY carries the same
                 * token as the registered one; matching on the token would
                 * disconnect that other producer and list_del() a node that
                 * was never added. This matches the consumer side.
                 */
                if (tmp != producer)
                        continue;

                list_for_each_entry(consumer, &consumers, node) {
                        if (consumer->token == producer->token) {
                                __disconnect(producer, consumer);
                                break;
                        }
                }

                list_del(&producer->node);
                /* Drop the reference held since registration. */
                module_put(THIS_MODULE);
                break;
        }

        mutex_unlock(&lock);

        /* Drop the reference taken at the top of this function. */
        module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);

/**
 * irq_bypass_register_consumer - register IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Put @consumer on the list of consumers and, if a producer with the same
 * token is already registered, connect the two.
 *
 * A reference on this module is kept while the consumer stays registered
 * and is released again if registration fails.
 *
 * Return: 0 on success, -EINVAL if the token or one of the mandatory
 * add_producer/del_producer callbacks is missing, -ENODEV if the module
 * reference cannot be taken, -EBUSY if the token or the structure itself is
 * already registered, or the error returned while connecting.
 */
int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
{
        struct irq_bypass_producer *prod;
        struct irq_bypass_consumer *cur;
        int err = 0;

        if (!consumer->token)
                return -EINVAL;
        if (!consumer->add_producer || !consumer->del_producer)
                return -EINVAL;

        might_sleep();

        if (!try_module_get(THIS_MODULE))
                return -ENODEV;

        mutex_lock(&lock);

        list_for_each_entry(cur, &consumers, node) {
                if (cur == consumer || cur->token == consumer->token) {
                        err = -EBUSY;
                        goto unlock;
                }
        }

        list_for_each_entry(prod, &producers, node) {
                if (prod->token != consumer->token)
                        continue;

                err = __connect(prod, consumer);
                if (err)
                        goto unlock;
                break;
        }

        list_add(&consumer->node, &consumers);

unlock:
        mutex_unlock(&lock);

        /* The module reference is only kept for a registered consumer. */
        if (err)
                module_put(THIS_MODULE);

        return err;
}
EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);

/**
 * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
 * @consumer: pointer to consumer structure
 *
 * Take a previously registered IRQ consumer off the list of consumers and
 * disconnect it from the producer it is paired with, if any. A consumer
 * that has no token or is not found on the list is left alone.
 */
void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
{
        struct irq_bypass_producer *prod;
        struct irq_bypass_consumer *cur;
        int registered = 0;

        if (!consumer->token)
                return;

        might_sleep();

        if (!try_module_get(THIS_MODULE))
                return; /* nothing in the list anyway */

        mutex_lock(&lock);

        /* Look the consumer up by identity, not by token. */
        list_for_each_entry(cur, &consumers, node) {
                if (cur == consumer) {
                        registered = 1;
                        break;
                }
        }

        if (registered) {
                list_for_each_entry(prod, &producers, node) {
                        if (prod->token != consumer->token)
                                continue;

                        __disconnect(prod, consumer);
                        break;
                }

                list_del(&consumer->node);
                /* Release the reference held since registration. */
                module_put(THIS_MODULE);
        }

        mutex_unlock(&lock);

        /* Release the reference taken on entry. */
        module_put(THIS_MODULE);
}
EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);






































































































































































































































































































































































































  321 










































































  319 





  322 




  322 



  136 
  136 





























































  319 





















  318 



























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* include/asm-generic/tlb.h
 *
 *        Generic TLB shootdown code
 *
 * Copyright 2001 Red Hat, Inc.
 * Based on code from mm/memory.c Copyright Linus Torvalds and others.
 *
 * Copyright 2011 Red Hat, Inc., Peter Zijlstra
 */
#ifndef _ASM_GENERIC__TLB_H
#define _ASM_GENERIC__TLB_H

#include <linux/mmu_notifier.h>
#include <linux/swap.h>
#include <linux/hugetlb_inline.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>

/*
 * Blindly accessing user memory from NMI context can be dangerous
 * if we're in the middle of switching the current user task or switching
 * the loaded mm.
 */
#ifndef nmi_uaccess_okay
# define nmi_uaccess_okay() true
#endif

#ifdef CONFIG_MMU

/*
 * Generic MMU-gather implementation.
 *
 * The mmu_gather data structure is used by the mm code to implement the
 * correct and efficient ordering of freeing pages and TLB invalidations.
 *
 * This correct ordering is:
 *
 *  1) unhook page
 *  2) TLB invalidate page
 *  3) free page
 *
 * That is, we must never free a page before we have ensured there are no live
 * translations left to it. Otherwise it might be possible to observe (or
 * worse, change) the page content after it has been reused.
 *
 * The mmu_gather API consists of:
 *
 *  - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu()
 *
 *    start and finish a mmu_gather
 *
 *    Finish in particular will issue a (final) TLB invalidate and free
 *    all (remaining) queued pages.
 *
 *  - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA
 *
 *    Defaults to flushing at tlb_end_vma() to reset the range; helps when
 *    there's large holes between the VMAs.
 *
 *  - tlb_remove_table()
 *
 *    tlb_remove_table() is the basic primitive to free page-table directories
 *    (__p*_free_tlb()).  In its most primitive form it is an alias for
 *    tlb_remove_page() below, for when page directories are pages and have no
 *    additional constraints.
 *
 *    See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE.
 *
 *  - tlb_remove_page() / tlb_remove_page_size()
 *  - __tlb_remove_folio_pages() / __tlb_remove_page_size()
 *  - __tlb_remove_folio_pages_size()
 *
 *    __tlb_remove_folio_pages_size() is the basic primitive that queues pages
 *    for freeing. It will return a boolean indicating if the queue is (now)
 *    full and a call to tlb_flush_mmu() is required.
 *
 *    tlb_remove_page() and tlb_remove_page_size() imply the call to
 *    tlb_flush_mmu() when required and has no return value.
 *
 *    __tlb_remove_folio_pages() is similar to __tlb_remove_page_size(),
 *    however, instead of removing a single page, assume PAGE_SIZE and remove
 *    the given number of consecutive pages that are all part of the
 *    same (large) folio.
 *
 *  - tlb_change_page_size()
 *
 *    call before __tlb_remove_page*() to set the current page-size; implies a
 *    possible tlb_flush_mmu() call.
 *
 *  - tlb_flush_mmu() / tlb_flush_mmu_tlbonly()
 *
 *    tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets
 *                              related state, like the range)
 *
 *    tlb_flush_mmu() - in addition to the above TLB invalidate, also frees
 *                        whatever pages are still batched.
 *
 *  - mmu_gather::fullmm
 *
 *    A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free
 *    the entire mm; this allows a number of optimizations.
 *
 *    - We can ignore tlb_{start,end}_vma(); because we don't
 *      care about ranges. Everything will be shot down.
 *
 *    - (RISC) architectures that use ASIDs can cycle to a new ASID
 *      and delay the invalidation until ASID space runs out.
 *
 *  - mmu_gather::need_flush_all
 *
 *    A flag that can be set by the arch code if it wants to force
 *    flush the entire TLB irrespective of the range. For instance
 *    x86-PAE needs this when changing top-level entries.
 *
 * And allows the architecture to provide and implement tlb_flush():
 *
 * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make
 * use of:
 *
 *  - mmu_gather::start / mmu_gather::end
 *
 *    which provides the range that needs to be flushed to cover the pages to
 *    be freed.
 *
 *  - mmu_gather::freed_tables
 *
 *    set when we freed page table pages
 *
 *  - tlb_get_unmap_shift() / tlb_get_unmap_size()
 *
 *    returns the smallest TLB entry size unmapped in this range.
 *
 * If an architecture does not provide tlb_flush() a default implementation
 * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is
 * specified, in which case we'll default to flush_tlb_mm().
 *
 * Additionally there are a few opt-in features:
 *
 *  MMU_GATHER_PAGE_SIZE
 *
 *  This ensures we call tlb_flush() every time tlb_change_page_size() actually
 *  changes the size and provides mmu_gather::page_size to tlb_flush().
 *
 *  This might be useful if your architecture has size specific TLB
 *  invalidation instructions.
 *
 *  MMU_GATHER_TABLE_FREE
 *
 *  This provides tlb_remove_table(), to be used instead of tlb_remove_page()
 *  for page directories (__p*_free_tlb()).
 *
 *  Useful if your architecture has non-page page directories.
 *
 *  When used, an architecture is expected to provide __tlb_remove_table() or
 *  use the generic __tlb_remove_table(), which does the actual freeing of these
 *  pages.
 *
 *  MMU_GATHER_RCU_TABLE_FREE
 *
 *  Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see
 *  comment below).
 *
 *  Useful if your architecture doesn't use IPIs for remote TLB invalidates
 *  and therefore doesn't naturally serialize with software page-table walkers.
 *
 *  MMU_GATHER_NO_FLUSH_CACHE
 *
 *  Indicates the architecture has flush_cache_range() but it needs *NOT* be called
 *  before unmapping a VMA.
 *
 *  NOTE: strictly speaking we shouldn't have this knob and instead rely on
 *          flush_cache_range() being a NOP, except Sparc64 seems to be
 *          different here.
 *
 *  MMU_GATHER_MERGE_VMAS
 *
 *  Indicates the architecture wants to merge ranges over VMAs; typical when
 *  multiple range invalidates are more expensive than a full invalidate.
 *
 *  MMU_GATHER_NO_RANGE
 *
 *  Use this if your architecture lacks an efficient flush_tlb_range(). This
 *  option implies MMU_GATHER_MERGE_VMAS above.
 *
 *  MMU_GATHER_NO_GATHER
 *
 *  If the option is set the mmu_gather will not track individual pages for
 *  delayed page free anymore. A platform that enables the option needs to
 *  provide its own implementation of the __tlb_remove_page_size() function to
 *  free pages.
 *
 *  This is useful if your architecture already flushes TLB entries in the
 *  various ptep_get_and_clear() functions.
 */

#ifdef CONFIG_MMU_GATHER_TABLE_FREE

/*
 * A batch of page-table pointers queued for freeing. The header is followed
 * by a flexible array of table pointers.
 */
struct mmu_table_batch {
#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
        /* only present with MMU_GATHER_RCU_TABLE_FREE */
        struct rcu_head                rcu;
#endif
        /* presumably the number of valid entries in tables[] -- confirm */
        unsigned int                nr;
        void                        *tables[];
};

/* Number of table pointers that fit in one page after the batch header. */
#define MAX_TABLE_BATCH                \
        ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))

#ifndef __HAVE_ARCH_TLB_REMOVE_TABLE
/*
 * Generic freeing of a single page table: @table is interpreted as a
 * struct ptdesc and handed to pagetable_dtor_free(). Architectures that
 * define __HAVE_ARCH_TLB_REMOVE_TABLE supply their own version instead.
 */
static inline void __tlb_remove_table(void *table)
{
        struct ptdesc *pt = table;

        pagetable_dtor_free(pt);
}
#endif

extern void tlb_remove_table(struct mmu_gather *tlb, void *table);

#else /* !CONFIG_MMU_GATHER_TABLE_FREE */

static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page);
/*
 * Fallback used without MMU_GATHER_TABLE_FREE: page directories are assumed
 * to be ordinary pages, so after running the page-table destructor the
 * backing page is queued through the regular page batching.
 */
static inline void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct ptdesc *pt = table;

        pagetable_dtor(pt);
        tlb_remove_page(tlb, ptdesc_page(pt));
}
#endif /* CONFIG_MMU_GATHER_TABLE_FREE */

#ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
 * This allows an architecture that does not use the linux page-tables for
 * hardware to skip the TLBI when freeing page tables.
 */
#ifndef tlb_needs_table_invalidate
#define tlb_needs_table_invalidate() (true)
#endif

void tlb_remove_table_sync_one(void);

#else

#ifdef tlb_needs_table_invalidate
#error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
#endif

/* No-op stub used when CONFIG_MMU_GATHER_RCU_TABLE_FREE is not set. */
static inline void tlb_remove_table_sync_one(void) { }

#endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */


#ifndef CONFIG_MMU_GATHER_NO_GATHER
/*
 * If we can't allocate a page to make a big batch of page pointers
 * to work on, then just handle a few from the on-stack structure.
 */
#define MMU_GATHER_BUNDLE        8

/*
 * One batch of page pointers queued for freeing. Batches are chained through
 * @next; the header is followed by a flexible array of encoded page pointers.
 */
struct mmu_gather_batch {
        /* next batch in the chain */
        struct mmu_gather_batch        *next;
        /* presumably the number of entries currently used -- confirm */
        unsigned int                nr;
        /* presumably the capacity of encoded_pages[] -- confirm */
        unsigned int                max;
        struct encoded_page        *encoded_pages[];
};

#define MAX_GATHER_BATCH        \
        ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))

/*
 * Limit the maximum number of mmu_gather batches to reduce a risk of soft
 * lockups for non-preemptible kernels on huge machines when a lot of memory
 * is zapped during unmapping.
 * 10K pages freed at once should be safe even without a preemption point.
 */
#define MAX_GATHER_BATCH_COUNT        (10000UL/MAX_GATHER_BATCH)

extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
                bool delay_rmap, int page_size);
bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
                unsigned int nr_pages, bool delay_rmap);

#ifdef CONFIG_SMP
/*
 * This both sets 'delayed_rmap', and returns true. It would be an inline
 * function, except we define it before the 'struct mmu_gather'.
 */
#define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true)
extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma);
#endif

#endif

/*
 * We have a no-op version of the rmap removal that doesn't
 * delay anything. That is used on S390, which flushes remote
 * TLBs synchronously, and on UP, which doesn't have any
 * remote TLBs to flush and is not preemptible due to this
 * all happening under the page table lock.
 */
#ifndef tlb_delay_rmap
#define tlb_delay_rmap(tlb) (false)
static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { }
#endif

/*
 * struct mmu_gather is an opaque type used by the mm code for passing around
 * any data needed by arch specific code for tlb_remove_page.
 */
struct mmu_gather {
        /* address space being operated on; passed to the TLB flush helpers */
        struct mm_struct        *mm;

#ifdef CONFIG_MMU_GATHER_TABLE_FREE
        /* presumably the current batch of page tables to free -- confirm */
        struct mmu_table_batch        *batch;
#endif

        /*
         * address range that needs flushing; widened by
         * __tlb_adjust_range() and reset by __tlb_reset_range()
         */
        unsigned long                start;
        unsigned long                end;
        /*
         * we are in the middle of an operation to clear
         * a full mm and can make some optimizations
         */
        unsigned int                fullmm : 1;

        /*
         * we have performed an operation which
         * requires a complete flush of the tlb
         */
        unsigned int                need_flush_all : 1;

        /*
         * we have removed page directories
         */
        unsigned int                freed_tables : 1;

        /*
         * Do we have pending delayed rmap removals?
         */
        unsigned int                delayed_rmap : 1;

        /*
         * at which levels have we cleared entries?
         */
        unsigned int                cleared_ptes : 1;
        unsigned int                cleared_pmds : 1;
        unsigned int                cleared_puds : 1;
        unsigned int                cleared_p4ds : 1;

        /*
         * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma
         *
         * NOTE(review): vma_pfn is not covered by the sentence above;
         * presumably it tracks a PFN-mapped VMA -- confirm.
         */
        unsigned int                vma_exec : 1;
        unsigned int                vma_huge : 1;
        unsigned int                vma_pfn  : 1;

        /* presumably bounded by MAX_GATHER_BATCH_COUNT -- confirm */
        unsigned int                batch_count;

#ifndef CONFIG_MMU_GATHER_NO_GATHER
        /*
         * Page batching state. @local followed by @__pages presumably forms
         * the small embedded batch of MMU_GATHER_BUNDLE entries described
         * next to MMU_GATHER_BUNDLE; @active presumably points at the batch
         * currently being filled -- confirm.
         */
        struct mmu_gather_batch *active;
        struct mmu_gather_batch        local;
        struct page                *__pages[MMU_GATHER_BUNDLE];

#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        /* current page size, made available to tlb_flush() */
        unsigned int page_size;
#endif
#endif
};

void tlb_flush_mmu(struct mmu_gather *tlb);

/*
 * Widen the pending flush range so that it also covers
 * [address, address + range_size): tlb->start can only move down and
 * tlb->end can only move up.
 *
 * @range_size is an unsigned long because the tlb_flush_*_range() helpers
 * forward an unsigned long size (e.g. a huge page size); a narrower
 * parameter would silently truncate sizes that do not fit in 32 bits.
 */
static inline void __tlb_adjust_range(struct mmu_gather *tlb,
                                      unsigned long address,
                                      unsigned long range_size)
{
        tlb->start = min(tlb->start, address);
        tlb->end = max(tlb->end, address + range_size);
}

/*
 * Return the gather to its "nothing pending" state: clear the per-level
 * and freed_tables markers and re-initialise the flush range.
 */
static inline void __tlb_reset_range(struct mmu_gather *tlb)
{
        /*
         * The mmu_gather::vma_* fields are deliberately left alone:
         * tlb_start_vma() is not called again to set them after an
         * intermediate flush.
         */
        tlb->cleared_p4ds = 0;
        tlb->cleared_puds = 0;
        tlb->cleared_pmds = 0;
        tlb->cleared_ptes = 0;
        tlb->freed_tables = 0;

        if (!tlb->fullmm) {
                /* start high, end low: min()/max() updates narrow onto real data */
                tlb->start = TASK_SIZE;
                tlb->end = 0;
                return;
        }

        /* full-mm teardown: both bounds are set to all ones */
        tlb->end = ~0;
        tlb->start = tlb->end;
}

#ifdef CONFIG_MMU_GATHER_NO_RANGE

#if defined(tlb_flush)
#error MMU_GATHER_NO_RANGE relies on default tlb_flush()
#endif

/*
 * When an architecture does not have efficient means of range flushing TLBs
 * there is no point in doing intermediate flushes on tlb_end_vma() to keep the
 * range small. We equally don't have to worry about page granularity or other
 * things.
 *
 * All we need to do is issue a full flush for any !0 range.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        /* A zero end means there is no range to flush. */
        if (!tlb->end)
                return;

        flush_tlb_mm(tlb->mm);
}

#else /* CONFIG_MMU_GATHER_NO_RANGE */

#ifndef tlb_flush
/*
 * When an architecture does not provide its own tlb_flush() implementation
 * but does have a reasonably efficient flush_tlb_range() implementation
 * use that.
 */
static inline void tlb_flush(struct mmu_gather *tlb)
{
        if (tlb->fullmm || tlb->need_flush_all) {
                flush_tlb_mm(tlb->mm);
        } else if (tlb->end) {
                struct vm_area_struct vma = {
                        .vm_mm = tlb->mm,
                        .vm_flags = (tlb->vma_exec ? VM_EXEC    : 0) |
                                    (tlb->vma_huge ? VM_HUGETLB : 0),
                };

                flush_tlb_range(&vma, tlb->start, tlb->end);
        }
}
#endif

#endif /* CONFIG_MMU_GATHER_NO_RANGE */

static inline void
tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /*
         * Record the VMA properties that the flush code consults later.
         *
         * Some flush_tlb_range() implementations check VM_HUGETLB (tile,
         * mips-4k) and then flush only large pages.
         *
         * Implementations that flush the I-TLB also flush the D-TLB
         * (tile, xtensa, arm), so adding VM_EXEC to an existing range is
         * harmless.
         *
         * tlb_end_vma() is relied upon to issue a flush, so the batch is
         * empty whenever these values are overwritten.
         */
        tlb->vma_pfn  = (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0;
        tlb->vma_exec = (vma->vm_flags & VM_EXEC) != 0;
        tlb->vma_huge = is_vm_hugetlb_page(vma);
}

static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
        /*
         * Anything calling __tlb_adjust_range() also sets at least one of
         * these bits.
         */
        if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
              tlb->cleared_puds || tlb->cleared_p4ds))
                return;

        tlb_flush(tlb);
        __tlb_reset_range(tlb);
}

static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                        struct page *page, int page_size)
{
        if (__tlb_remove_page_size(tlb, page, false, page_size))
                tlb_flush_mmu(tlb);
}

/*
 * Convenience wrapper: calls tlb_remove_page_size() with PAGE_SIZE as the
 * page size.
 */
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
        /* No "return": ISO C forbids returning an expression from a void function. */
        tlb_remove_page_size(tlb, page, PAGE_SIZE);
}

/* Thin wrapper that passes @pt on to tlb_remove_table(). */
static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt)
{
        tlb_remove_table(tlb, pt);
}

/*
 * Record the page size used for subsequent removals.  Does nothing unless
 * CONFIG_MMU_GATHER_PAGE_SIZE is enabled.
 */
static inline void tlb_change_page_size(struct mmu_gather *tlb,
                                                     unsigned int page_size)
{
#ifdef CONFIG_MMU_GATHER_PAGE_SIZE
        const bool size_changed = tlb->page_size &&
                                  tlb->page_size != page_size;

        /*
         * Switching from one non-zero size to a different one flushes
         * first, except when a full-mm or flush-all is already in effect.
         */
        if (size_changed && !tlb->fullmm && !tlb->need_flush_all)
                tlb_flush_mmu(tlb);

        tlb->page_size = page_size;
#endif
}

/*
 * Return the shift of the lowest page-table level at which entries were
 * cleared (PTE first, then PMD, PUD, P4D); PAGE_SHIFT if none is marked.
 */
static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
{
        unsigned long shift = PAGE_SHIFT;

        if (!tlb->cleared_ptes) {
                if (tlb->cleared_pmds)
                        shift = PMD_SHIFT;
                else if (tlb->cleared_puds)
                        shift = PUD_SHIFT;
                else if (tlb->cleared_p4ds)
                        shift = P4D_SHIFT;
        }

        return shift;
}

/* Size in bytes corresponding to tlb_get_unmap_shift(). */
static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
{
        const unsigned long shift = tlb_get_unmap_shift(tlb);

        return 1UL << shift;
}

/*
 * In the case of tlb vma handling, we can optimise these away in the
 * case where we're doing a full MM flush.  When we're doing a munmap,
 * the vmas are adjusted to only cover the region to be torn down.
 */
static inline void tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        /* Per-VMA bookkeeping is skipped entirely for a full-mm teardown. */
        if (!tlb->fullmm) {
                tlb_update_vma_flags(tlb, vma);
#ifndef CONFIG_MMU_GATHER_NO_FLUSH_CACHE
                flush_cache_range(vma, vma->vm_start, vma->vm_end);
#endif
        }
}

static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
{
        if (tlb->fullmm)
                return;

        /*
         * With CONFIG_MMU_GATHER_MERGE_VMAS the per-VMA flush is skipped,
         * unless this is a PFN mapping.  VM_PFNMAP is more fragile because
         * the core mm will not track the page mapcount -- there might not
         * be page-frames for these PFNs after all -- so such ranges are
         * always flushed here to avoid munmap() vs unmap_mapping_range()
         * races.
         */
        if (!tlb->vma_pfn && IS_ENABLED(CONFIG_MMU_GATHER_MERGE_VMAS))
                return;

        /*
         * Flush and reset the range at the VMA boundary; this keeps the
         * range from growing across the unused space between consecutive
         * VMAs.
         */
        tlb_flush_mmu_tlbonly(tlb);
}

/*
 * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
 * and set corresponding cleared_*.
 */
/* Mark PTE-level entries as cleared and widen the range by @size at @address. */
static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_ptes = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Mark PMD-level entries as cleared and widen the range by @size at @address. */
static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_pmds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Mark PUD-level entries as cleared and widen the range by @size at @address. */
static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_puds = 1;
        __tlb_adjust_range(tlb, address, size);
}

/* Mark P4D-level entries as cleared and widen the range by @size at @address. */
static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
                                     unsigned long address, unsigned long size)
{
        tlb->cleared_p4ds = 1;
        __tlb_adjust_range(tlb, address, size);
}

#ifndef __tlb_remove_tlb_entry
/*
 * Default per-entry hook: intentionally empty.  It is only defined when
 * nothing named __tlb_remove_tlb_entry has been #defined before this point.
 */
static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address)
{
}
#endif

/**
 * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation.
 * @tlb: the mmu_gather whose pending range is extended
 * @ptep: pte pointer, forwarded to __tlb_remove_tlb_entry()
 * @address: address of the unmapped page; PAGE_SIZE bytes from here are
 *           added to the range
 *
 * Record the fact that pte's were really unmapped by updating the range,
 * so we can later optimise away the tlb invalidate. This helps when
 * userspace is unmapping already-unmapped pages, which happens quite a lot.
 */
#define tlb_remove_tlb_entry(tlb, ptep, address)                \
        do {                                                        \
                tlb_flush_pte_range(tlb, address, PAGE_SIZE);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_tlb_entries - remember unmapping of multiple consecutive ptes for
 *                            later tlb invalidation.
 *
 * Similar to tlb_remove_tlb_entry(), but remember unmapping of multiple
 * consecutive ptes instead of only a single one.
 */
static inline void tlb_remove_tlb_entries(struct mmu_gather *tlb,
                pte_t *ptep, unsigned int nr, unsigned long address)
{
        /* One range update covers all @nr pages. */
        tlb_flush_pte_range(tlb, address, PAGE_SIZE * nr);

        /*
         * The per-entry hook runs for the first pte unconditionally, then
         * once more for each following pte until the count is used up.
         */
        __tlb_remove_tlb_entry(tlb, ptep, address);
        while (--nr != 0) {
                ptep++;
                address += PAGE_SIZE;
                __tlb_remove_tlb_entry(tlb, ptep, address);
        }
}

/*
 * Huge-page variant of tlb_remove_tlb_entry(): the range is extended by
 * huge_page_size(h), and the page-table level marked as cleared is the
 * largest one (P4D, PUD, PMD, else PTE) whose size does not exceed it.
 */
#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)        \
        do {                                                        \
                unsigned long _sz = huge_page_size(h);                \
                if (_sz >= P4D_SIZE)                                \
                        tlb_flush_p4d_range(tlb, address, _sz);        \
                else if (_sz >= PUD_SIZE)                        \
                        tlb_flush_pud_range(tlb, address, _sz);        \
                else if (_sz >= PMD_SIZE)                        \
                        tlb_flush_pmd_range(tlb, address, _sz);        \
                else                                                \
                        tlb_flush_pte_range(tlb, address, _sz);        \
                __tlb_remove_tlb_entry(tlb, ptep, address);        \
        } while (0)

/**
 * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation
 *
 * Extends the pending range by HPAGE_PMD_SIZE at @address, marks the PMD
 * level as cleared, then invokes the __tlb_remove_pmd_tlb_entry() hook.
 * The default hook below is a nop so far, because only x86 needs it.
 */
#ifndef __tlb_remove_pmd_tlb_entry
#define __tlb_remove_pmd_tlb_entry(tlb, pmdp, address) do {} while (0)
#endif

#define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)                        \
        do {                                                                \
                tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);        \
                __tlb_remove_pmd_tlb_entry(tlb, pmdp, address);                \
        } while (0)

/**
 * tlb_remove_pud_tlb_entry - remember a pud mapping for later tlb
 * invalidation.
 *
 * Extends the pending range by HPAGE_PUD_SIZE at @address, marks the PUD
 * level as cleared, then invokes the __tlb_remove_pud_tlb_entry() hook.
 * The default hook below is a nop so far, because only x86 needs it.
 */
#ifndef __tlb_remove_pud_tlb_entry
#define __tlb_remove_pud_tlb_entry(tlb, pudp, address) do {} while (0)
#endif

#define tlb_remove_pud_tlb_entry(tlb, pudp, address)                        \
        do {                                                                \
                tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);        \
                __tlb_remove_pud_tlb_entry(tlb, pudp, address);                \
        } while (0)

/*
 * For things like page tables caches (ie caching addresses "inside" the
 * page tables, like x86 does), for legacy reasons, flushing an
 * individual page had better flush the page table caches behind it. This
 * is definitely how x86 works, for example. And if you have an
 * architected non-legacy page table cache (which I'm not aware of
 * anybody actually doing), you're going to have some architecturally
 * explicit flushing for that, likely *separate* from a regular TLB entry
 * flush, and thus you'd need more than just some range expansion..
 *
 * So if we ever find an architecture
 * that would want something that odd, I think it is up to that
 * architecture to do its own odd thing, not cause pain for others
 * http://lkml.kernel.org/r/CA+55aFzBggoXtNXQeng5d_mRoDnaMBE5Y+URs+PHR67nUpMtaw@mail.gmail.com
 *
 * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
 */

#ifndef pte_free_tlb
/*
 * Default wrapper around __pte_free_tlb(): extend the pending range by
 * PAGE_SIZE at @address (marked at PMD level), set freed_tables, then call
 * the arch hook.  @tlb is parenthesized before "->" so that any pointer
 * expression can be passed safely.
 */
#define pte_free_tlb(tlb, ptep, address)                        \
        do {                                                        \
                tlb_flush_pmd_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pte_free_tlb(tlb, ptep, address);                \
        } while (0)
#endif

#ifndef pmd_free_tlb
/*
 * Default wrapper around __pmd_free_tlb(): extend the pending range by
 * PAGE_SIZE at @address (marked at PUD level), set freed_tables, then call
 * the arch hook.  @tlb is parenthesized before "->" so that any pointer
 * expression can be passed safely.
 */
#define pmd_free_tlb(tlb, pmdp, address)                        \
        do {                                                        \
                tlb_flush_pud_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pmd_free_tlb(tlb, pmdp, address);                \
        } while (0)
#endif

#ifndef pud_free_tlb
/*
 * Default wrapper around __pud_free_tlb(): extend the pending range by
 * PAGE_SIZE at @address (marked at P4D level), set freed_tables, then call
 * the arch hook.  @tlb is parenthesized before "->" so that any pointer
 * expression can be passed safely.
 */
#define pud_free_tlb(tlb, pudp, address)                        \
        do {                                                        \
                tlb_flush_p4d_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __pud_free_tlb(tlb, pudp, address);                \
        } while (0)
#endif

#ifndef p4d_free_tlb
/*
 * Default wrapper around __p4d_free_tlb(): extend the pending range by
 * PAGE_SIZE at @address (no cleared_* level is marked here), set
 * freed_tables, then call the arch hook.  @tlb is parenthesized before
 * "->" so that any pointer expression can be passed safely.
 */
#define p4d_free_tlb(tlb, p4dp, address)                        \
        do {                                                        \
                __tlb_adjust_range(tlb, address, PAGE_SIZE);        \
                (tlb)->freed_tables = 1;                        \
                __p4d_free_tlb(tlb, p4dp, address);                \
        } while (0)
#endif

#ifndef pte_needs_flush
/*
 * Default used when no pte_needs_flush has been #defined before this
 * point: always returns true, regardless of the old and new pte values.
 */
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
{
        return true;
}
#endif

#ifndef huge_pmd_needs_flush
/*
 * Default used when no huge_pmd_needs_flush has been #defined before this
 * point: always returns true, regardless of the old and new pmd values.
 */
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
{
        return true;
}
#endif

#endif /* CONFIG_MMU */

#endif /* _ASM_GENERIC__TLB_H */











































































































































  169 







  475 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ASM__VIRT_H
#define __ASM__VIRT_H

/*
 * The arm64 hcall implementation uses x0 to specify the hcall
 * number. A value less than HVC_STUB_HCALL_NR indicates a special
 * hcall, such as set vector. Any other value is handled in a
 * hypervisor specific way.
 *
 * The hypercall is allowed to clobber any of the caller-saved
 * registers (x0-x18), so it is advisable to use it through the
 * indirection of a function call (as implemented in hyp-stub.S).
 */

/*
 * HVC_SET_VECTORS - Set the value of the vbar_el2 register.
 *
 * @x1: Physical address of the new vector table.
 */
#define HVC_SET_VECTORS 0

/*
 * HVC_SOFT_RESTART - CPU soft reset, used by the cpu_soft_restart routine.
 */
#define HVC_SOFT_RESTART 1

/*
 * HVC_RESET_VECTORS - Restore the vectors to the original HYP stubs
 */
#define HVC_RESET_VECTORS 2

/*
 * HVC_FINALISE_EL2 - Upgrade the CPU from EL1 to EL2, if possible
 */
#define HVC_FINALISE_EL2        3

/* Max number of HYP stub hypercalls */
#define HVC_STUB_HCALL_NR 4

/* Error returned when an invalid stub number is passed into x0 */
#define HVC_STUB_ERR        0xbadca11

/*
 * Boot mode identifiers.  is_hyp_mode_available() compares both entries of
 * __boot_cpu_mode[] against BOOT_CPU_MODE_EL2.
 */
#define BOOT_CPU_MODE_EL1        (0xe11)
#define BOOT_CPU_MODE_EL2        (0xe12)

/*
 * Flags returned together with the boot mode, but not preserved in
 * __boot_cpu_mode. Used by the idreg override code to work out the
 * boot state.
 */
#define BOOT_CPU_FLAG_E2H        BIT_ULL(32)

#ifndef __ASSEMBLY__

#include <asm/ptrace.h>
#include <asm/sections.h>
#include <asm/sysreg.h>
#include <asm/cpufeature.h>

/*
 * __boot_cpu_mode records what mode CPUs were booted in.
 * A correctly-implemented bootloader must start all CPUs in the same mode:
 * In this case, both 32bit halves of __boot_cpu_mode will contain the
 * same value (either 0 if booted in EL1, BOOT_CPU_MODE_EL2 if booted in EL2).
 *
 * Should the bootloader fail to do this, the two values will be different.
 * This allows the kernel to flag an error when the secondaries have come up.
 */
extern u32 __boot_cpu_mode[2];

#define ARM64_VECTOR_TABLE_LEN        SZ_2K

void __hyp_set_vectors(phys_addr_t phys_vector_base);
void __hyp_reset_vectors(void);
bool is_kvm_arm_initialised(void);

DECLARE_STATIC_KEY_FALSE(kvm_protected_mode_initialized);

static inline bool is_pkvm_initialized(void)
{
        return IS_ENABLED(CONFIG_KVM) &&
               static_branch_likely(&kvm_protected_mode_initialized);
}

/* Reports the availability of HYP mode */
static inline bool is_hyp_mode_available(void)
{
        /*
         * If KVM protected mode is initialized, all CPUs must have been booted
         * in EL2. Avoid checking __boot_cpu_mode as CPUs now come up in EL1.
         */
        if (is_pkvm_initialized())
                return true;

        return (__boot_cpu_mode[0] == BOOT_CPU_MODE_EL2 &&
                __boot_cpu_mode[1] == BOOT_CPU_MODE_EL2);
}

/* Detect a bootloader that started CPUs in different modes */
static inline bool is_hyp_mode_mismatched(void)
{
        /*
         * Once KVM protected mode is initialized, all CPUs must have been
         * booted in EL2, so no mismatch is reported; __boot_cpu_mode is
         * not consulted because CPUs now come up in EL1.
         */
        return !is_pkvm_initialized() &&
               __boot_cpu_mode[0] != __boot_cpu_mode[1];
}

/*
 * Report whether the CurrentEL system register reads as EL2.
 *
 * The BUILD_BUG_ON() makes compilation fail if this helper is used in code
 * built with __KVM_NVHE_HYPERVISOR__ or __KVM_VHE_HYPERVISOR__ defined.
 */
static __always_inline bool is_kernel_in_hyp_mode(void)
{
        BUILD_BUG_ON(__is_defined(__KVM_NVHE_HYPERVISOR__) ||
                     __is_defined(__KVM_VHE_HYPERVISOR__));
        return read_sysreg(CurrentEL) == CurrentEL_EL2;
}

static __always_inline bool has_vhe(void)
{
        /*
         * Code that only runs in VHE or nVHE hyp context already knows
         * whether VHE is present, so answer directly in those cases.
         * This lets the compiler drop VHE-specific code from the nVHE
         * object, reducing the number of external symbol references
         * needed to link.  Everything else falls back to the cpucap.
         */
        if (is_vhe_hyp_code())
                return true;

        if (is_nvhe_hyp_code())
                return false;

        return cpus_have_final_cap(ARM64_HAS_VIRT_HOST_EXTN);
}

/*
 * Always false in VHE hyp code; otherwise reports the
 * ARM64_KVM_PROTECTED_MODE cpucap.
 */
static __always_inline bool is_protected_kvm_enabled(void)
{
        if (is_vhe_hyp_code())
                return false;

        return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE);
}

/* Always false in VHE hyp code; otherwise reports the ARM64_KVM_HVHE cpucap. */
static __always_inline bool has_hvhe(void)
{
        return !is_vhe_hyp_code() && cpus_have_final_cap(ARM64_KVM_HVHE);
}

static inline bool is_hyp_nvhe(void)
{
        return is_hyp_mode_available() && !is_kernel_in_hyp_mode();
}

#endif /* __ASSEMBLY__ */

#endif /* ! __ASM__VIRT_H */























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * This is <linux/capability.h>
 *
 * Andrew G. Morgan <morgan@kernel.org>
 * Alexander Kjeldaas <astor@guardian.no>
 * with help from Aleph1, Roland Buresund and Andrew Main.
 *
 * See here for the libcap library ("POSIX draft" compliance):
 *
 * ftp://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/
 */
#ifndef _LINUX_CAPABILITY_H
#define _LINUX_CAPABILITY_H

#include <uapi/linux/capability.h>
#include <linux/uidgid.h>
#include <linux/bits.h>

#define _KERNEL_CAPABILITY_VERSION _LINUX_CAPABILITY_VERSION_3

extern int file_caps_enabled;

typedef struct { u64 val; } kernel_cap_t;

/*
 * Same as vfs_ns_cap_data, but in CPU endianness and always filled
 * completely.  get_vfs_caps_from_disk() takes a pointer to one of these
 * (presumably as its output -- confirm against the implementation).
 */
struct cpu_vfs_cap_data {
        __u32 magic_etc;
        kuid_t rootid;
        kernel_cap_t permitted;
        kernel_cap_t inheritable;
};

#define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
#define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))

struct file;
struct inode;
struct dentry;
struct task_struct;
struct user_namespace;
struct mnt_idmap;

/*
 * CAP_FS_MASK and CAP_NFSD_MASKS:
 *
 * The fs mask is all the privileges that fsuid==0 historically meant.
 * At one time in the past, that included CAP_MKNOD and CAP_LINUX_IMMUTABLE.
 *
 * It has never meant setting security.* and trusted.* xattrs.
 *
 * We could also define fsmask as follows:
 *   1. CAP_FS_MASK is the privilege to bypass all fs-related DAC permissions
 *   2. The security.* and trusted.* xattrs are fs-related MAC permissions
 */

/* Raw 64-bit mask of the filesystem-related capabilities listed here. */
# define CAP_FS_MASK     (BIT_ULL(CAP_CHOWN)                \
                        | BIT_ULL(CAP_MKNOD)                \
                        | BIT_ULL(CAP_DAC_OVERRIDE)        \
                        | BIT_ULL(CAP_DAC_READ_SEARCH)        \
                        | BIT_ULL(CAP_FOWNER)                \
                        | BIT_ULL(CAP_FSETID)                \
                        | BIT_ULL(CAP_MAC_OVERRIDE))
/* Bits 0..CAP_LAST_CAP set, everything above clear. */
#define CAP_VALID_MASK         (BIT_ULL(CAP_LAST_CAP+1)-1)

/*
 * Ready-made kernel_cap_t values:
 *   CAP_EMPTY_SET - no capability
 *   CAP_FULL_SET  - every bit of CAP_VALID_MASK
 *   CAP_FS_SET    - CAP_FS_MASK plus CAP_LINUX_IMMUTABLE
 *   CAP_NFSD_SET  - CAP_FS_MASK plus CAP_SYS_RESOURCE
 */
# define CAP_EMPTY_SET    ((kernel_cap_t) { 0 })
# define CAP_FULL_SET     ((kernel_cap_t) { CAP_VALID_MASK })
# define CAP_FS_SET       ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_LINUX_IMMUTABLE) })
# define CAP_NFSD_SET     ((kernel_cap_t) { CAP_FS_MASK | BIT_ULL(CAP_SYS_RESOURCE) })

/* Reset @c to the empty set (modifies @c in place). */
# define cap_clear(c)         do { (c).val = 0; } while (0)

/*
 * Single-bit helpers; @flag is a capability number used as a bit index.
 * cap_raise() and cap_lower() modify @c in place, cap_raised() only tests.
 */
#define cap_raise(c, flag)  ((c).val |= BIT_ULL(flag))
#define cap_lower(c, flag)  ((c).val &= ~BIT_ULL(flag))
#define cap_raised(c, flag) (((c).val & BIT_ULL(flag)) != 0)

static inline kernel_cap_t cap_combine(const kernel_cap_t a,
                                       const kernel_cap_t b)
{
        return (kernel_cap_t) { a.val | b.val };
}

static inline kernel_cap_t cap_intersect(const kernel_cap_t a,
                                         const kernel_cap_t b)
{
        return (kernel_cap_t) { a.val & b.val };
}

static inline kernel_cap_t cap_drop(const kernel_cap_t a,
                                    const kernel_cap_t drop)
{
        return (kernel_cap_t) { a.val &~ drop.val };
}

/* True when no capability bit is set in @a. */
static inline bool cap_isclear(const kernel_cap_t a)
{
        return a.val == 0;
}

static inline bool cap_isidentical(const kernel_cap_t a, const kernel_cap_t b)
{
        return a.val == b.val;
}

/*
 * Check if "a" is a subset of "set".
 * return true if ALL of the capabilities in "a" are also in "set"
 *        cap_issubset(0101, 1111) will return true
 * return false if ANY of the capabilities in "a" are not in "set"
 *        cap_issubset(1111, 0101) will return false
 */
static inline bool cap_issubset(const kernel_cap_t a, const kernel_cap_t set)
{
        return !(a.val & ~set.val);
}

/* Used to decide between falling back on the old suser() or fsuser(). */

/* Return @a with all CAP_FS_SET capabilities removed. */
static inline kernel_cap_t cap_drop_fs_set(const kernel_cap_t a)
{
        const kernel_cap_t fs_caps = CAP_FS_SET;

        return cap_drop(a, fs_caps);
}

/* Return @a plus those CAP_FS_SET capabilities that are also in @permitted. */
static inline kernel_cap_t cap_raise_fs_set(const kernel_cap_t a,
                                            const kernel_cap_t permitted)
{
        const kernel_cap_t allowed_fs = cap_intersect(permitted, CAP_FS_SET);

        return cap_combine(a, allowed_fs);
}

/* Return @a with all CAP_NFSD_SET capabilities removed. */
static inline kernel_cap_t cap_drop_nfsd_set(const kernel_cap_t a)
{
        const kernel_cap_t nfsd_caps = CAP_NFSD_SET;

        return cap_drop(a, nfsd_caps);
}

/* Return @a plus those CAP_NFSD_SET capabilities that are also in @permitted. */
static inline kernel_cap_t cap_raise_nfsd_set(const kernel_cap_t a,
                                              const kernel_cap_t permitted)
{
        const kernel_cap_t allowed_nfsd = cap_intersect(permitted, CAP_NFSD_SET);

        return cap_combine(a, allowed_nfsd);
}

/*
 * Capability checks.  With CONFIG_MULTIUSER the real implementations are
 * defined elsewhere; without it, every check below is an inline stub that
 * unconditionally returns true.
 */
#ifdef CONFIG_MULTIUSER
extern bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap);
extern bool has_capability_noaudit(struct task_struct *t, int cap);
extern bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap);
extern bool capable(int cap);
extern bool ns_capable(struct user_namespace *ns, int cap);
extern bool ns_capable_noaudit(struct user_namespace *ns, int cap);
extern bool ns_capable_setid(struct user_namespace *ns, int cap);
#else
/* !CONFIG_MULTIUSER: all arguments are ignored and the answer is always yes. */
static inline bool has_ns_capability(struct task_struct *t,
                              struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool has_capability_noaudit(struct task_struct *t, int cap)
{
        return true;
}
static inline bool has_ns_capability_noaudit(struct task_struct *t,
                                      struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool capable(int cap)
{
        return true;
}
static inline bool ns_capable(struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool ns_capable_noaudit(struct user_namespace *ns, int cap)
{
        return true;
}
static inline bool ns_capable_setid(struct user_namespace *ns, int cap)
{
        return true;
}
#endif /* CONFIG_MULTIUSER */
bool privileged_wrt_inode_uidgid(struct user_namespace *ns,
                                 struct mnt_idmap *idmap,
                                 const struct inode *inode);
bool capable_wrt_inode_uidgid(struct mnt_idmap *idmap,
                              const struct inode *inode, int cap);
extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
extern bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns);
static inline bool perfmon_capable(void)
{
        return capable(CAP_PERFMON) || capable(CAP_SYS_ADMIN);
}

static inline bool bpf_capable(void)
{
        return capable(CAP_BPF) || capable(CAP_SYS_ADMIN);
}

static inline bool checkpoint_restore_ns_capable(struct user_namespace *ns)
{
        return ns_capable(ns, CAP_CHECKPOINT_RESTORE) ||
                ns_capable(ns, CAP_SYS_ADMIN);
}

/* audit system wants to get cap info from files as well */
int get_vfs_caps_from_disk(struct mnt_idmap *idmap,
                           const struct dentry *dentry,
                           struct cpu_vfs_cap_data *cpu_caps);

int cap_convert_nscap(struct mnt_idmap *idmap, struct dentry *dentry,
                      const void **ivalue, size_t size);

#endif /* !_LINUX_CAPABILITY_H */























  223 



































  223 









































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM workqueue

#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WORKQUEUE_H

#include <linux/tracepoint.h>
#include <linux/workqueue.h>

struct pool_workqueue;

/**
 * workqueue_queue_work - called when a work gets queued
 * @req_cpu:        the requested cpu
 * @pwq:        pointer to struct pool_workqueue
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a work is queued immediately or once a
 * delayed work is actually queued on a workqueue (ie: once the delay
 * has been reached).
 *
 * The record holds the work pointer, its callback (work->func), the
 * workqueue name (pwq->wq->name), the requested cpu and the cpu of the
 * pool backing @pwq (pwq->pool->cpu).
 */
TRACE_EVENT(workqueue_queue_work,

        TP_PROTO(int req_cpu, struct pool_workqueue *pwq,
                 struct work_struct *work),

        TP_ARGS(req_cpu, pwq, work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
                __string( workqueue,        pwq->wq->name)
                __field( int,        req_cpu        )
                __field( int,        cpu        )
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
                __assign_str(workqueue);
                __entry->req_cpu        = req_cpu;
                __entry->cpu                = pwq->pool->cpu;
        ),

        TP_printk("work struct=%p function=%ps workqueue=%s req_cpu=%d cpu=%d",
                  __entry->work, __entry->function, __get_str(workqueue),
                  __entry->req_cpu, __entry->cpu)
);

/**
 * workqueue_activate_work - called when a work gets activated
 * @work:        pointer to struct work_struct
 *
 * This event occurs when a queued work is put on the active queue,
 * which happens immediately after queueing unless @max_active limit
 * is reached.
 *
 * The record holds the work pointer and its callback (work->func).
 */
TRACE_EVENT(workqueue_activate_work,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p function=%ps ", __entry->work, __entry->function)
);

/**
 * workqueue_execute_start - called immediately before the workqueue callback
 * @work:        pointer to struct work_struct
 *
 * Allows workqueue execution to be tracked.  The record holds the work
 * pointer and the callback read from work->func at this point.
 */
TRACE_EVENT(workqueue_execute_start,

        TP_PROTO(struct work_struct *work),

        TP_ARGS(work),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = work->func;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

/**
 * workqueue_execute_end - called immediately after the workqueue callback
 * @work:        pointer to struct work_struct
 * @function:   pointer to worker function
 *
 * Allows workqueue execution to be tracked.  Unlike
 * workqueue_execute_start, the callback is taken from the @function
 * argument; @work is only stored as a pointer value and is not
 * dereferenced here.
 */
TRACE_EVENT(workqueue_execute_end,

        TP_PROTO(struct work_struct *work, work_func_t function),

        TP_ARGS(work, function),

        TP_STRUCT__entry(
                __field( void *,        work        )
                __field( void *,        function)
        ),

        TP_fast_assign(
                __entry->work                = work;
                __entry->function        = function;
        ),

        TP_printk("work struct %p: function %ps", __entry->work, __entry->function)
);

#endif /*  _TRACE_WORKQUEUE_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  125 







  125 





































































































  125 













































   50 















   50 





















































  106 






  106 





  105 

























































  122 





  122 


























































































































   46 






   46 





   46 











































   50 






   50 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the diskquota system for the LINUX operating system. QUOTA
 * is implemented using the BSD system call interface as the means of
 * communication with the user level. This file contains the generic routines
 * called by the different filesystems on allocation of an inode or block.
 * These routines take care of the administration needed to have a consistent
 * diskquota tracking system. The ideas of both user and group quotas are based
 * on the Melbourne quota system as used on BSD derived systems. The internal
 * implementation is based on one of the several variants of the LINUX
 * inode-subsystem with added complexity of the diskquota system.
 *
 * Author:        Marco van Wieringen <mvw@planets.elm.net>
 *
 * Fixes:   Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96
 *
 *                Revised list management to avoid races
 *                -- Bill Hawes, <whawes@star.net>, 9/98
 *
 *                Fixed races in dquot_transfer(), dqget() and dquot_alloc_...().
 *                As the consequence the locking was moved from dquot_decr_...(),
 *                dquot_incr_...() to calling functions.
 *                invalidate_dquots() now writes modified dquots.
 *                Serialized quota_off() and quota_on() for mount point.
 *                Fixed a few bugs in grow_dquots().
 *                Fixed deadlock in write_dquot() - we no longer account quotas on
 *                quota files
 *                remove_dquot_ref() moved to inode.c - it now traverses through inodes
 *                add_dquot_ref() restarts after blocking
 *                Added check for bogus uid and fixed check for group in quotactl.
 *                Jan Kara, <jack@suse.cz>, sponsored by SuSE CR, 10-11/99
 *
 *                Used struct list_head instead of own list struct
 *                Invalidation of referenced dquots is no longer possible
 *                Improved free_dquots list management
 *                Quota and i_blocks are now updated in one place to avoid races
 *                Warnings are now delayed so we won't block in critical section
 *                Write updated not to require dquot lock
 *                Jan Kara, <jack@suse.cz>, 9/2000
 *
 *                Added dynamic quota structure allocation
 *                Jan Kara <jack@suse.cz> 12/2000
 *
 *                Rewritten quota interface. Implemented new quota format and
 *                formats registering.
 *                Jan Kara, <jack@suse.cz>, 2001,2002
 *
 *                New SMP locking.
 *                Jan Kara, <jack@suse.cz>, 10/2002
 *
 *                Added journalled quota support, fix lock inversion problems
 *                Jan Kara, <jack@suse.cz>, 2003,2004
 *
 * (C) Copyright 1994 - 1997 Marco van Wieringen
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/tty.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/security.h>
#include <linux/sched.h>
#include <linux/cred.h>
#include <linux/kmod.h>
#include <linux/namei.h>
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>

#include <linux/uaccess.h>

/*
 * There are five quota SMP locks:
 * * dq_list_lock protects all lists with quotas and quota formats.
 * * dquot->dq_dqb_lock protects data from dq_dqb
 * * inode->i_lock protects inode->i_blocks, i_bytes and also guards
 *   consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that
 *   dquot_transfer() can stabilize amount it transfers
 * * dq_data_lock protects mem_dqinfo structures and modifications of dquot
 *   pointers in the inode
 * * dq_state_lock protects modifications of quota state (on quotaon and
 *   quotaoff) and readers who care about latest values take it as well.
 *
 * The spinlock ordering is hence:
 *   dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock,
 *   dq_list_lock > dq_state_lock
 *
 * Note that some things (eg. sb pointer, type, id) don't change during
 * the life of the dquot structure and so needn't be protected by a lock
 *
 * Operation accessing dquots via inode pointers are protected by dquot_srcu.
 * Operation of reading pointer needs srcu_read_lock(&dquot_srcu), and
 * synchronize_srcu(&dquot_srcu) is called after clearing pointers from
 * inode and before dropping dquot references to avoid use of dquots after
 * they are freed. dq_data_lock is used to serialize the pointer setting and
 * clearing operations.
 * Special care needs to be taken about S_NOQUOTA inode flag (marking that
 * inode is a quota file). Functions adding pointers from inode to dquots have
 * to check this flag under dq_data_lock and then (if S_NOQUOTA is not set) they
 * have to do all pointer modifications before dropping dq_data_lock. This makes
 * sure they cannot race with quotaon which first sets S_NOQUOTA flag and
 * then drops all pointers to dquots from an inode.
 *
 * Each dquot has its dq_lock mutex.  Dquot is locked when it is being read to
 * memory (or space for it is being allocated) on the first dqget(), when it is
 * being written out, and when it is being released on the last dqput(). The
 * allocation and release operations are serialized by the dq_lock and by
 * checking the use count in dquot_release().
 *
 * Lock ordering (including related VFS locks) is the following:
 *   s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem
 */

/* Protects all lists with quotas and quota formats. */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock);
/* Protects modifications of quota state (on quotaon and quotaoff). */
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock);
/*
 * Protects mem_dqinfo structures and modifications of dquot pointers in the
 * inode. Non-static and exported, so it is also usable outside this file.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock);
EXPORT_SYMBOL(dq_data_lock);
/* SRCU domain protecting accesses to dquots made via inode pointers. */
DEFINE_STATIC_SRCU(dquot_srcu);

/* NOTE(review): wait queue whose waiters and wakers are outside this view. */
static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq);

/*
 * Emit a rate-limited KERN_ERR message about a quota problem on @sb.
 *
 * @sb:   superblock whose s_id names the device in the message
 * @func: name of the reporting function, printed before the message text
 * @fmt:  printf-style format for the message body, followed by its arguments
 *
 * Nothing is printed (and the varargs are never touched) when
 * printk_ratelimit() declines the message.
 */
void __quota_error(struct super_block *sb, const char *func,
                   const char *fmt, ...)
{
        struct va_format vaf;
        va_list ap;

        if (!printk_ratelimit())
                return;

        va_start(ap, fmt);
        vaf.fmt = fmt;
        vaf.va = &ap;
        /* %pV expands the caller's format and arguments in place. */
        printk(KERN_ERR "Quota error (device %s): %s: %pV\n",
               sb->s_id, func, &vaf);
        va_end(ap);
}
EXPORT_SYMBOL(__quota_error);

/* Quota type names; only built when quota debugging or warning printing is configured. */
#if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING)
static char *quotatypes[] = INITQFNAMES;
#endif
static struct quota_format_type *quota_formats;        /* List of registered formats */
/*
 * Table pairing format ids with module names. find_quota_format() scans it
 * until an entry with qm_fmt_id == 0 and passes the name to request_module().
 */
static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES;

/* SLAB cache for dquot structures */
static struct kmem_cache *dquot_cachep;

/*
 * Register a quota format: push @fmt onto the head of the quota_formats
 * singly linked list (chained through qf_next) under dq_list_lock.
 */
void register_quota_format(struct quota_format_type *fmt)
{
        spin_lock(&dq_list_lock);
        fmt->qf_next = quota_formats;
        quota_formats = fmt;
        spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(register_quota_format);

/*
 * Unlink @fmt from the quota_formats list under dq_list_lock. If @fmt is
 * not on the list, the list is left untouched.
 */
void unregister_quota_format(struct quota_format_type *fmt)
{
        /* Points at the link (list head or a qf_next field) being examined. */
        struct quota_format_type **link = &quota_formats;

        spin_lock(&dq_list_lock);
        while (*link) {
                if (*link == fmt) {
                        /* Splice the entry out by redirecting its incoming link. */
                        *link = fmt->qf_next;
                        break;
                }
                link = &(*link)->qf_next;
        }
        spin_unlock(&dq_list_lock);
}
EXPORT_SYMBOL(unregister_quota_format);

/* Walk the registered-format list looking for @id; caller holds dq_list_lock. */
static struct quota_format_type *quota_format_lookup_locked(int id)
{
        struct quota_format_type *qf = quota_formats;

        while (qf && qf->qf_fmt_id != id)
                qf = qf->qf_next;
        return qf;
}

/*
 * Find the registered quota format with the given @id and take a reference
 * on its owning module.
 *
 * If the format is not registered (or its module reference cannot be taken),
 * the module named for @id in module_names[] is requested and the lookup is
 * repeated once.
 *
 * Returns the format with a module reference held, or NULL when no module is
 * known for @id, request_module() fails, or the format is still unavailable
 * afterwards. A successful result is released with put_quota_format().
 */
static struct quota_format_type *find_quota_format(int id)
{
        struct quota_format_type *found;
        int idx = 0;

        spin_lock(&dq_list_lock);
        found = quota_format_lookup_locked(id);
        if (found && try_module_get(found->qf_owner))
                goto out_unlock;
        /* request_module() must not be called with the spinlock held. */
        spin_unlock(&dq_list_lock);

        /* The table is terminated by an entry whose qm_fmt_id is zero. */
        while (module_names[idx].qm_fmt_id &&
               module_names[idx].qm_fmt_id != id)
                idx++;
        if (!module_names[idx].qm_fmt_id)
                return NULL;
        if (request_module(module_names[idx].qm_mod_name))
                return NULL;

        /* The module may have registered the format; look again. */
        spin_lock(&dq_list_lock);
        found = quota_format_lookup_locked(id);
        if (found && !try_module_get(found->qf_owner))
                found = NULL;
out_unlock:
        spin_unlock(&dq_list_lock);
        return found;
}

/*
 * Drop the module reference on @fmt's owner; the counterpart of the
 * try_module_get() done in find_quota_format().
 */
static void put_quota_format(struct quota_format_type *fmt)
{
        module_put(fmt->qf_owner);
}

/*
 * Dquot List Management:
 * The quota code uses five lists for dquot management: the inuse_list,
 * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
 * A single dquot structure may be on some of those lists, depending on
 * its current state.
 *
 * All dquots are placed to the end of inuse_list when first created, and this
 * list is used for invalidate operation, which must look at every dquot.
 *
 * When the last reference of a dquot is dropped, the dquot is added to
 * releasing_dquots. We'll then queue work item which will call
 * synchronize_srcu() and after that perform the final cleanup of all the
 * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
 * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
 * struct.
 *
 * Unused and cleaned up dquots are in the free_dquots list and this list is
 * searched whenever we need an available dquot. Dquots are removed from the
 * list as soon as they are used again and dqstats.free_dquots gives the number
 * of dquots on the list. When dquot is invalidated it's completely released
 * from memory.
 *
 * Dirty dquots are added to the dqi_dirty_list of quota_info when marked
 * dirty, and this list is searched when writing dirty dquots back to
 * quota file. Note that some filesystems do dirty dquot tracking on their
 * own (e.g. in a journal) and thus don't use dqi_dirty_list.
 *
 * Dquots with a specific identity (device, type and id) are placed on
 * one of the dquot_hash[] hash chains. This provides an efficient search
 * mechanism to locate a specific dquot.
 */

/* Every dquot is added to the tail of this list when first created. */
static LIST_HEAD(inuse_list);
/* Unused, cleaned-up dquots available for reuse (linked via dq_free). */
static LIST_HEAD(free_dquots);
/* Dquots whose last reference was dropped, awaiting final cleanup (linked via dq_free). */
static LIST_HEAD(releasing_dquots);
/* Shift and mask applied by hashfn() to produce an index into dquot_hash[]. */
static unsigned int dq_hash_bits, dq_hash_mask;
/* Array of hash chain heads, indexed by hashfn(). */
static struct hlist_head *dquot_hash;

/* Quota statistics counters, updated through dqstats_inc()/dqstats_dec(). */
struct dqstats dqstats;
EXPORT_SYMBOL(dqstats);

/* Forward declarations for helpers defined later in this file. */
static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);

/* Delayed work item that runs quota_release_workfn(). */
static void quota_release_workfn(struct work_struct *work);
static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);

/*
 * Compute the dquot_hash[] index for the (@sb, @qid) pair.
 *
 * The superblock address (with its low L1_CACHE_SHIFT bits dropped) is mixed
 * with the numeric id as seen from init_user_ns, scaled by a per-type factor,
 * then folded with dq_hash_bits and masked with dq_hash_mask.
 */
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
        unsigned long sb_bits = (unsigned long)sb >> L1_CACHE_SHIFT;
        unsigned int id = from_kqid(&init_user_ns, qid);
        int type = qid.type;
        unsigned long mixed;

        mixed = (sb_bits ^ id) * (MAXQUOTAS - type);
        mixed += mixed >> dq_hash_bits;
        return mixed & dq_hash_mask;
}

/*
 * Following list functions expect dq_list_lock to be held
 */
/*
 * Add @dquot to the head of the hash chain selected by its superblock and
 * id. Caller must hold dq_list_lock.
 */
static inline void insert_dquot_hash(struct dquot *dquot)
{
        unsigned int bucket = hashfn(dquot->dq_sb, dquot->dq_id);

        hlist_add_head(&dquot->dq_hash, &dquot_hash[bucket]);
}

/*
 * Take @dquot off its hash chain and reinitialise its dq_hash node.
 * Caller must hold dq_list_lock.
 */
static inline void remove_dquot_hash(struct dquot *dquot)
{
        hlist_del_init(&dquot->dq_hash);
}

static struct dquot *find_dquot(unsigned int hashent, struct super_block *sb,
                                struct kqid qid)
{
        struct dquot *dquot;

        hlist_for_each_entry(dquot, dquot_hash+hashent, dq_hash)
                if (dquot->dq_sb == sb && qid_eq(dquot->dq_id, qid))
                        return dquot;

        return NULL;
}

/*
 * Add a dquot to the tail of the free list and bump the free-dquot
 * statistic. Caller must hold dq_list_lock.
 */
static inline void put_dquot_last(struct dquot *dquot)
{
        list_add_tail(&dquot->dq_free, &free_dquots);
        dqstats_inc(DQST_FREE_DQUOTS);
}

/*
 * Queue @dquot on the tail of releasing_dquots (via dq_free) and flag it
 * with DQ_RELEASING_B so remove_free_dquot() can tell which list it is on.
 * Caller must hold dq_list_lock.
 */
static inline void put_releasing_dquots(struct dquot *dquot)
{
        list_add_tail(&dquot->dq_free, &releasing_dquots);
        set_bit(DQ_RELEASING_B, &dquot->dq_flags);
}

/*
 * Unlink @dquot from whichever dq_free-based list it is on, if any.
 *
 * A dquot flagged DQ_RELEASING_B was on releasing_dquots: only the flag is
 * cleared. Otherwise it was on free_dquots and the free-dquot statistic is
 * decremented. Caller must hold dq_list_lock.
 */
static inline void remove_free_dquot(struct dquot *dquot)
{
        /* Not on any list: nothing to undo. */
        if (list_empty(&dquot->dq_free))
                return;

        list_del_init(&dquot->dq_free);
        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
                return;
        }
        dqstats_dec(DQST_FREE_DQUOTS);
}

/*
 * Add @dquot to inuse_list and bump the allocated-dquot statistic.
 * Caller must hold dq_list_lock.
 */
static inline void put_inuse(struct dquot *dquot)
{
        /* We add to the back of inuse list so we don't have to restart
         * when traversing this list and we block */
        list_add_tail(&dquot->dq_inuse, &inuse_list);
        dqstats_inc(DQST_ALLOC_DQUOTS);
}

/*
 * Take @dquot off inuse_list and decrement the allocated-dquot statistic.
 * Caller must hold dq_list_lock.
 */
static inline void remove_inuse(struct dquot *dquot)
{
        dqstats_dec(DQST_ALLOC_DQUOTS);
        list_del(&dquot->dq_inuse);
}
/*
 * End of list functions needing dq_list_lock
 */

/*
 * Wait for any current holder of @dquot's dq_lock to finish: the mutex is
 * acquired and released immediately, with no work done while it is held.
 */
static void wait_on_dquot(struct dquot *dquot)
{
        mutex_lock(&dquot->dq_lock);
        mutex_unlock(&dquot->dq_lock);
}

/* Return nonzero if DQ_ACTIVE_B is set in @dquot's flags. */
static inline int dquot_active(struct dquot *dquot)
{
        return test_bit(DQ_ACTIVE_B, &dquot->dq_flags);
}

/* Return nonzero if DQ_MOD_B (the dirty flag) is set in @dquot's flags. */
static inline int dquot_dirty(struct dquot *dquot)
{
        return test_bit(DQ_MOD_B, &dquot->dq_flags);
}

/*
 * Mark @dquot dirty through its superblock's dq_op->mark_dirty hook and
 * return whatever the hook returns.
 */
static inline int mark_dquot_dirty(struct dquot *dquot)
{
        return dquot->dq_sb->dq_op->mark_dirty(dquot);
}

/* Mark dquot dirty in atomic manner, and return it's old dirty flag state */
int dquot_mark_dquot_dirty(struct dquot *dquot)
{
        int ret = 1;

        if (!dquot_active(dquot))
                return 0;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);

        /* If quota is dirty already, we don't have to acquire dq_list_lock */
        if (dquot_dirty(dquot))
                return 1;

        spin_lock(&dq_list_lock);
        if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
                list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
                                info[dquot->dq_id.type].dqi_dirty_list);
                ret = 0;
        }
        spin_unlock(&dq_list_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_mark_dquot_dirty);

/*
 * Mark every non-NULL dquot in @dquots (MAXQUOTAS slots) dirty - this can
 * block when journalling. All slots are processed even after a failure.
 * Returns 0 if no call failed, otherwise the first negative value returned
 * by mark_dquot_dirty().
 */
static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
        int first_err = 0;
        int i;

        for (i = 0; i < MAXQUOTAS; i++) {
                struct dquot *dq = srcu_dereference(dquots[i], &dquot_srcu);
                int rc;

                if (!dq)
                        continue;
                /* Even in case of error we have to continue */
                rc = mark_dquot_dirty(dq);
                if (rc < 0 && !first_err)
                        first_err = rc;
        }
        return first_err;
}

/* Call dqput() on each of the MAXQUOTAS entries of the @dquot array. */
static inline void dqput_all(struct dquot **dquot)
{
        struct dquot **end = dquot + MAXQUOTAS;
        struct dquot **slot;

        for (slot = dquot; slot < end; slot++)
                dqput(*slot);
}

/*
 * Clear the dirty state of @dquot and report whether it had been dirty.
 *
 * With DQUOT_NOLIST_DIRTY only the DQ_MOD_B bit is cleared. Otherwise the
 * bit is cleared under dq_list_lock and, if it was set, the dquot is also
 * taken off its dirty list.
 */
static inline int clear_dquot_dirty(struct dquot *dquot)
{
        int was_dirty;

        if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
                return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags);

        spin_lock(&dq_list_lock);
        was_dirty = test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags) ? 1 : 0;
        if (was_dirty)
                list_del_init(&dquot->dq_dirty);
        spin_unlock(&dq_list_lock);

        return was_dirty;
}

void mark_info_dirty(struct super_block *sb, int type)
{
        spin_lock(&dq_data_lock);
        sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY;
        spin_unlock(&dq_data_lock);
}
EXPORT_SYMBOL(mark_info_dirty);

/*
 *        Read dquot from disk and alloc space for it
 *
 * Runs under dquot->dq_lock, bracketed by memalloc_nofs_save()/restore().
 * Steps:
 *   1) unless DQ_READ_B is already set, fill the dquot through the quota
 *      format's ->read_dqblk();
 *   2) set DQ_READ_B;
 *   3) if the dquot is not active and has no dq_off yet, instantiate it
 *      with ->commit_dqblk() and, if the per-type info is dirty, write the
 *      info with ->write_file_info();
 *   4) set DQ_ACTIVE_B.
 *
 * Returns a negative error code on failure (DQ_ACTIVE_B is then not set by
 * this call), otherwise a non-negative value.
 */
int dquot_acquire(struct dquot *dquot)
{
        int ret = 0, ret2 = 0;
        unsigned int memalloc;
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
                ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
                if (ret < 0)
                        goto out_iolock;
        }
        /* Make sure flags update is visible after dquot has been filled */
        smp_mb__before_atomic();
        set_bit(DQ_READ_B, &dquot->dq_flags);
        /* Instantiate dquot if needed */
        if (!dquot_active(dquot) && !dquot->dq_off) {
                ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
                /*
                 * Write the info if needed. This is attempted even when
                 * commit_dqblk() failed; its own error (ret2) is only
                 * reported when the commit itself succeeded.
                 */
                if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
                        ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                        dquot->dq_sb, dquot->dq_id.type);
                }
                if (ret < 0)
                        goto out_iolock;
                if (ret2 < 0) {
                        ret = ret2;
                        goto out_iolock;
                }
        }
        /*
         * Make sure flags update is visible after on-disk struct has been
         * allocated. Paired with smp_rmb() in dqget().
         */
        smp_mb__before_atomic();
        set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_acquire);

/*
 *        Write dquot to disk
 *
 * Under dquot->dq_lock the dirty state is cleared first. If the dquot was
 * not dirty there is nothing to do and 0 is returned. A dirty but inactive
 * dquot yields -EIO; otherwise the result of the quota format's
 * ->commit_dqblk() is returned.
 */
int dquot_commit(struct dquot *dquot)
{
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
        unsigned int nofs_flags;
        int ret = 0;

        mutex_lock(&dquot->dq_lock);
        nofs_flags = memalloc_nofs_save();
        if (clear_dquot_dirty(dquot)) {
                /*
                 * A dquot can be inactive only if reading/initializing it
                 * failed, so it is better not to write it out.
                 */
                if (!dquot_active(dquot))
                        ret = -EIO;
                else
                        ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
        }
        memalloc_nofs_restore(nofs_flags);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_commit);

/*
 *        Release dquot
 *
 * Runs under dquot->dq_lock, bracketed by memalloc_nofs_save()/restore().
 * If dquot_is_busy() reports the dquot as busy nothing is done and 0 is
 * returned. Otherwise the quota format's optional ->release_dqblk() is
 * called, dirty per-type info is written with ->write_file_info(), and
 * DQ_ACTIVE_B is cleared (also when one of those operations failed).
 *
 * Returns the error from ->release_dqblk() if negative, else the result of
 * ->write_file_info() (0 when it was not called).
 */
int dquot_release(struct dquot *dquot)
{
        int ret = 0, ret2 = 0;
        unsigned int memalloc;
        struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);

        mutex_lock(&dquot->dq_lock);
        memalloc = memalloc_nofs_save();
        /* Check whether we are not racing with some other dqget() */
        if (dquot_is_busy(dquot))
                goto out_dqlock;
        if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
                ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
                /* Write the info */
                if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
                        ret2 = dqopt->ops[dquot->dq_id.type]->write_file_info(
                                                dquot->dq_sb, dquot->dq_id.type);
                }
                /* The release error takes precedence over the info error */
                if (ret >= 0)
                        ret = ret2;
        }
        clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_dqlock:
        memalloc_nofs_restore(memalloc);
        mutex_unlock(&dquot->dq_lock);
        return ret;
}
EXPORT_SYMBOL(dquot_release);

/* Return the memory of @dquot to the dquot_cachep slab cache. */
void dquot_destroy(struct dquot *dquot)
{
        kmem_cache_free(dquot_cachep, dquot);
}
EXPORT_SYMBOL(dquot_destroy);

static inline void do_destroy_dquot(struct dquot *dquot)
{
        dquot->dq_sb->dq_op->destroy_dquot(dquot);
}

/* Invalidate all dquots on the list. Note that this function is called after
 * quota is disabled and pointers from inodes removed so there cannot be new
 * quota users. There can still be some users of quotas due to inodes being
 * just deleted or pruned by prune_icache() (those are not attached to any
 * list) or parallel quotactl call. We have to wait for such users.
 *
 * Only dquots on inuse_list that belong to @sb and have quota type @type
 * are touched. Whenever dq_list_lock has to be dropped the whole scan is
 * restarted, and each (re)start first flushes quota_release_work.
 */
static void invalidate_dquots(struct super_block *sb, int type)
{
        struct dquot *dquot, *tmp;

restart:
        flush_delayed_work(&quota_release_work);

        spin_lock(&dq_list_lock);
        list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
                if (dquot->dq_sb != sb)
                        continue;
                if (dquot->dq_id.type != type)
                        continue;
                /* Wait for dquot users */
                if (atomic_read(&dquot->dq_count)) {
                        /* Pin the dquot so it stays around while we sleep */
                        atomic_inc(&dquot->dq_count);
                        spin_unlock(&dq_list_lock);
                        /*
                         * Once dqput() wakes us up, we know it's time to free
                         * the dquot.
                         * IMPORTANT: we rely on the fact that there is always
                         * at most one process waiting for dquot to free.
                         * Otherwise dq_count would be > 1 and we would never
                         * wake up.
                         */
                        wait_event(dquot_ref_wq,
                                   atomic_read(&dquot->dq_count) == 1);
                        dqput(dquot);
                        /* At this moment the dquot need not exist (it could
                         * have been reclaimed by prune_dqcache()). Hence we
                         * must restart. */
                        goto restart;
                }
                /*
                 * The last user already dropped its reference but dquot didn't
                 * get fully cleaned up yet. Restart the scan which flushes the
                 * work cleaning up released dquots.
                 */
                if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                        spin_unlock(&dq_list_lock);
                        goto restart;
                }
                /*
                 * Quota now has no users and it has been written on last
                 * dqput()
                 */
                remove_dquot_hash(dquot);
                remove_free_dquot(dquot);
                remove_inuse(dquot);
                do_destroy_dquot(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Call callback for every active dquot on given filesystem
 *
 * @sb:   filesystem whose dquots are visited; a warning is emitted once if
 *        sb->s_umount is not held
 * @fn:   callback, invoked without dq_list_lock held
 * @priv: opaque value passed through to @fn
 *
 * A negative return value from @fn stops the scan and is returned.
 * Otherwise the return value of the last @fn call is returned (0 if @fn was
 * never called).
 */
int dquot_scan_active(struct super_block *sb,
                      int (*fn)(struct dquot *dquot, unsigned long priv),
                      unsigned long priv)
{
        struct dquot *dquot, *old_dquot = NULL;
        int ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        spin_lock(&dq_list_lock);
        list_for_each_entry(dquot, &inuse_list, dq_inuse) {
                if (!dquot_active(dquot))
                        continue;
                if (dquot->dq_sb != sb)
                        continue;
                /* Now we have active dquot so we can just increase use count */
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                /*
                 * Drop the reference to the previously visited dquot only
                 * now, with dq_list_lock released; the reference on the
                 * current one is kept until the next iteration (or exit).
                 */
                dqput(old_dquot);
                old_dquot = dquot;
                /*
                 * ->release_dquot() can be racing with us. Our reference
                 * protects us from new calls to it so just wait for any
                 * outstanding call and recheck the DQ_ACTIVE_B after that.
                 */
                wait_on_dquot(dquot);
                if (dquot_active(dquot)) {
                        ret = fn(dquot, priv);
                        if (ret < 0)
                                goto out;
                }
                spin_lock(&dq_list_lock);
                /* We are safe to continue now because our dquot could not
                 * be moved out of the inuse list while we hold the reference */
        }
        spin_unlock(&dq_list_lock);
out:
        dqput(old_dquot);
        return ret;
}
EXPORT_SYMBOL(dquot_scan_active);

static inline int dquot_write_dquot(struct dquot *dquot)
{
        int ret = dquot->dq_sb->dq_op->write_dquot(dquot);
        if (ret < 0) {
                quota_error(dquot->dq_sb, "Can't write quota structure "
                            "(error %d). Quota may get out of sync!", ret);
                /* Clear dirty bit anyway to avoid infinite loop. */
                clear_dquot_dirty(dquot);
        }
        return ret;
}

/*
 * Write all dquot structures to quota files
 *
 * @sb:   filesystem to write back; a warning is emitted once if
 *        sb->s_umount is not held
 * @type: quota type to write, or -1 for every type
 *
 * Quota types that are not active on @sb are skipped. After the dirty
 * dquots, dirty per-type info is written with ->write_info(), whose return
 * value is not checked here.
 *
 * Returns 0, or the first nonzero result of dquot_write_dquot().
 */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
        struct list_head dirty;
        struct dquot *dquot;
        struct quota_info *dqopt = sb_dqopt(sb);
        int cnt;
        int err, ret = 0;

        WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount));

        flush_delayed_work(&quota_release_work);

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                spin_lock(&dq_list_lock);
                /* Move list away to avoid livelock. */
                list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
                while (!list_empty(&dirty)) {
                        dquot = list_first_entry(&dirty, struct dquot,
                                                 dq_dirty);

                        WARN_ON(!dquot_active(dquot));
                        /*
                         * If the dquot is releasing we should not touch it.
                         * Flush the release work with dq_list_lock dropped
                         * and look at the head of the list again.
                         */
                        if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
                                spin_unlock(&dq_list_lock);
                                flush_delayed_work(&quota_release_work);
                                spin_lock(&dq_list_lock);
                                continue;
                        }

                        /* Now we have active dquot from which someone is
                         * holding reference so we can safely just increase
                         * use count */
                        dqgrab(dquot);
                        spin_unlock(&dq_list_lock);
                        err = dquot_write_dquot(dquot);
                        /* Remember only the first failure */
                        if (err && !ret)
                                ret = err;
                        dqput(dquot);
                        spin_lock(&dq_list_lock);
                }
                spin_unlock(&dq_list_lock);
        }

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
                    && info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
        dqstats_inc(DQST_SYNCS);

        return ret;
}
EXPORT_SYMBOL(dquot_writeback_dquots);

/* Write all dquot structures to disk and make them visible from userspace */
int dquot_quota_sync(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int cnt;
        int ret;

        ret = dquot_writeback_dquots(sb, type);
        if (ret)
                return ret;
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
                return 0;

        /* This is not very clever (and fast) but currently I don't know about
         * any other simple way of getting quota data to disk and we must get
         * them there for userspace to be visible... */
        if (sb->s_op->sync_fs) {
                ret = sb->s_op->sync_fs(sb, 1);
                if (ret)
                        return ret;
        }
        ret = sync_blockdev(sb->s_bdev);
        if (ret)
                return ret;

        /*
         * Now when everything is written we can discard the pagecache so
         * that userspace sees the changes.
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                inode_lock(dqopt->files[cnt]);
                truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
                inode_unlock(dqopt->files[cnt]);
        }

        return 0;
}
EXPORT_SYMBOL(dquot_quota_sync);

/*
 * Shrinker scan callback: under dq_list_lock, destroy dquots from the head
 * of free_dquots until the list is empty or sc->nr_to_scan reaches zero.
 * sc->nr_to_scan is decremented once per destroyed dquot; the number of
 * destroyed dquots is returned.
 */
static unsigned long
dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
        unsigned long reclaimed = 0;
        struct dquot *victim;

        spin_lock(&dq_list_lock);
        while (sc->nr_to_scan) {
                if (list_empty(&free_dquots))
                        break;

                victim = list_first_entry(&free_dquots, struct dquot, dq_free);
                remove_dquot_hash(victim);
                remove_free_dquot(victim);
                remove_inuse(victim);
                do_destroy_dquot(victim);

                reclaimed++;
                sc->nr_to_scan--;
        }
        spin_unlock(&dq_list_lock);

        return reclaimed;
}

static unsigned long
dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
        return vfs_pressure_ratio(
        percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
}

/*
 * Safely release dquot and put reference to dquot.
 *
 * Work function behind quota_release_work. It takes over everything queued
 * on releasing_dquots, waits for an SRCU grace period on dquot_srcu, and
 * then handles the dquots one at a time: a dirty dquot is written out, an
 * active one is released via ->release_dquot(), and a clean inactive one is
 * moved to the free list. Each time dq_list_lock is dropped, processing
 * resumes from the head of the private list, so one dquot may be visited
 * several times before it reaches the free list.
 */
static void quota_release_workfn(struct work_struct *work)
{
        struct dquot *dquot;
        struct list_head rls_head;

        spin_lock(&dq_list_lock);
        /* Exchange the list head to avoid livelock. */
        list_replace_init(&releasing_dquots, &rls_head);
        spin_unlock(&dq_list_lock);
        synchronize_srcu(&dquot_srcu);

restart:
        spin_lock(&dq_list_lock);
        while (!list_empty(&rls_head)) {
                dquot = list_first_entry(&rls_head, struct dquot, dq_free);
                WARN_ON_ONCE(atomic_read(&dquot->dq_count));
                /*
                 * Note that DQ_RELEASING_B protects us from racing with
                 * invalidate_dquots() calls so we are safe to work with the
                 * dquot even after we drop dq_list_lock.
                 */
                if (dquot_dirty(dquot)) {
                        spin_unlock(&dq_list_lock);
                        /* Commit dquot before releasing */
                        dquot_write_dquot(dquot);
                        goto restart;
                }
                if (dquot_active(dquot)) {
                        spin_unlock(&dq_list_lock);
                        dquot->dq_sb->dq_op->release_dquot(dquot);
                        goto restart;
                }
                /* Dquot is inactive and clean, now move it to free list */
                remove_free_dquot(dquot);
                put_dquot_last(dquot);
        }
        spin_unlock(&dq_list_lock);
}

/*
 * Put reference to dquot
 *
 * A NULL @dquot is accepted and ignored. If other references remain, the
 * count is simply decremented. Dropping the last reference hands the dquot
 * to put_releasing_dquots() and schedules quota_release_work; the actual
 * write-out/release is done by that work, not here.
 */
void dqput(struct dquot *dquot)
{
        if (!dquot)
                return;
#ifdef CONFIG_QUOTA_DEBUG
        if (!atomic_read(&dquot->dq_count)) {
                quota_error(dquot->dq_sb, "trying to free free dquot of %s %d",
                            quotatypes[dquot->dq_id.type],
                            from_kqid(&init_user_ns, dquot->dq_id));
                BUG();
        }
#endif
        dqstats_inc(DQST_DROPS);

        spin_lock(&dq_list_lock);
        if (atomic_read(&dquot->dq_count) > 1) {
                /* We have more than one user... nothing to do */
                atomic_dec(&dquot->dq_count);
                /*
                 * Releasing dquot during quotaoff phase? If quota of this
                 * type is no longer active and exactly one reference is
                 * left, wake dquot_ref_wq; invalidate_dquots() sleeps there
                 * waiting for dq_count == 1.
                 */
                if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) &&
                    atomic_read(&dquot->dq_count) == 1)
                        wake_up(&dquot_ref_wq);
                spin_unlock(&dq_list_lock);
                return;
        }

        /* Need to release dquot? */
        WARN_ON_ONCE(!list_empty(&dquot->dq_free));
        put_releasing_dquots(dquot);
        atomic_dec(&dquot->dq_count);
        spin_unlock(&dq_list_lock);
        /* Kick the release work with a delay of 1 (in jiffies) */
        queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);

/*
 * Allocate a zeroed dquot from the dquot_cachep slab cache using GFP_NOFS.
 * Neither @sb nor @type is used by this implementation. Returns the result
 * of kmem_cache_zalloc() unchanged.
 */
struct dquot *dquot_alloc(struct super_block *sb, int type)
{
        return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
}
EXPORT_SYMBOL(dquot_alloc);

/*
 * Allocate a dquot through the filesystem's ->alloc_dquot() hook and
 * initialize it: locks and list/hash linkage are set up, dq_sb points to
 * @sb, dq_id is the invalid id of quota type @type, and the reference count
 * starts at 1. Returns NULL if the allocation hook returns NULL.
 */
static struct dquot *get_empty_dquot(struct super_block *sb, int type)
{
        struct dquot *dq = sb->dq_op->alloc_dquot(sb, type);

        if (!dq)
                return NULL;

        /* identity and reference count */
        dq->dq_sb = sb;
        dq->dq_id = make_kqid_invalid(type);
        atomic_set(&dq->dq_count, 1);

        /* locks */
        mutex_init(&dq->dq_lock);
        spin_lock_init(&dq->dq_dqb_lock);

        /* list and hash linkage */
        INIT_LIST_HEAD(&dq->dq_free);
        INIT_LIST_HEAD(&dq->dq_inuse);
        INIT_LIST_HEAD(&dq->dq_dirty);
        INIT_HLIST_NODE(&dq->dq_hash);

        return dq;
}

/*
 * Get reference to dquot
 *
 * Locking is slightly tricky here. We are guarded from parallel quotaoff()
 * destroying our dquot by:
 *   a) checking for quota flags under dq_list_lock and
 *   b) getting a reference to dquot before we release dq_list_lock
 *
 * Returns the dquot for @qid on @sb with a reference held, or an ERR_PTR():
 * -EINVAL if @qid has no mapping in sb->s_user_ns, -ESRCH if quota of that
 * type is not active on @sb, or the negative error returned by the
 * filesystem's ->acquire_dquot().
 */
struct dquot *dqget(struct super_block *sb, struct kqid qid)
{
        unsigned int hashent = hashfn(sb, qid);
        struct dquot *dquot, *empty = NULL;

        if (!qid_has_mapping(sb->s_user_ns, qid))
                return ERR_PTR(-EINVAL);

        /* Unlocked early check; repeated below under the locks */
        if (!sb_has_quota_active(sb, qid.type))
                return ERR_PTR(-ESRCH);
we_slept:
        spin_lock(&dq_list_lock);
        spin_lock(&dq_state_lock);
        if (!sb_has_quota_active(sb, qid.type)) {
                spin_unlock(&dq_state_lock);
                spin_unlock(&dq_list_lock);
                dquot = ERR_PTR(-ESRCH);
                goto out;
        }
        spin_unlock(&dq_state_lock);

        dquot = find_dquot(hashent, sb, qid);
        if (!dquot) {
                /*
                 * Not cached. A new dquot is allocated with dq_list_lock
                 * dropped, so the lookup has to be redone afterwards; if
                 * the allocation failed we simply try again.
                 */
                if (!empty) {
                        spin_unlock(&dq_list_lock);
                        empty = get_empty_dquot(sb, qid.type);
                        if (!empty)
                                schedule();        /* Try to wait for a moment... */
                        goto we_slept;
                }
                dquot = empty;
                empty = NULL;
                dquot->dq_id = qid;
                /* all dquots go on the inuse_list */
                put_inuse(dquot);
                /* hash it first so it can be found */
                insert_dquot_hash(dquot);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_LOOKUPS);
        } else {
                /* Unreferenced cached dquot: unlink it via remove_free_dquot() */
                if (!atomic_read(&dquot->dq_count))
                        remove_free_dquot(dquot);
                atomic_inc(&dquot->dq_count);
                spin_unlock(&dq_list_lock);
                dqstats_inc(DQST_CACHE_HITS);
                dqstats_inc(DQST_LOOKUPS);
        }
        /* Wait for dq_lock - after this we know that either dquot_release() is
         * already finished or it will be canceled due to dq_count > 0 test */
        wait_on_dquot(dquot);
        /* Read the dquot / allocate space in quota file */
        if (!dquot_active(dquot)) {
                int err;

                err = sb->dq_op->acquire_dquot(dquot);
                if (err < 0) {
                        dqput(dquot);
                        dquot = ERR_PTR(err);
                        goto out;
                }
        }
        /*
         * Make sure following reads see filled structure - paired with
         * smp_mb__before_atomic() in dquot_acquire().
         */
        smp_rmb();
        /* Has somebody invalidated entry under us? */
        WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash));
out:
        /* A preallocated dquot that ended up unused is destroyed again */
        if (empty)
                do_destroy_dquot(empty);

        return dquot;
}
EXPORT_SYMBOL(dqget);

/*
 * Return @inode's array of dquot pointers, as provided by the
 * ->get_dquots() superblock operation of the inode's filesystem.
 */
static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
        const struct super_operations *sops = inode->i_sb->s_op;

        return sops->get_dquots(inode);
}

/*
 * Tell whether @inode still lacks a dquot pointer.
 *
 * With @type == -1 every quota type is checked and 1 is returned as soon as
 * one slot is empty; otherwise only the slot of @type is looked at. Inodes
 * for which IS_NOQUOTA() holds never need initialization.
 */
static int dqinit_needed(struct inode *inode, int type)
{
        struct dquot __rcu * const *dquots;
        int i;

        if (IS_NOQUOTA(inode))
                return 0;

        dquots = i_dquot(inode);
        if (type == -1) {
                for (i = 0; i < MAXQUOTAS; i++) {
                        if (!dquots[i])
                                return 1;
                }
                return 0;
        }
        return !dquots[type];
}

/*
 * This routine is guarded by s_umount semaphore
 *
 * Walk all inodes of @sb and call __dquot_initialize() for quota type @type
 * on each inode that qualifies (see the filter below). Returns 0, or the
 * first nonzero result of __dquot_initialize(), which also ends the walk.
 */
static int add_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode, *old_inode = NULL;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif
        int err = 0;

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                spin_lock(&inode->i_lock);
                /*
                 * Skip inodes that are being freed or are still new, that
                 * have a zero i_writecount, or for which dqinit_needed()
                 * reports nothing to do.
                 */
                if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
                    !atomic_read(&inode->i_writecount) ||
                    !dqinit_needed(inode, type)) {
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                __iget(inode);
                spin_unlock(&inode->i_lock);
                spin_unlock(&sb->s_inode_list_lock);

#ifdef CONFIG_QUOTA_DEBUG
                if (unlikely(inode_get_rsv_space(inode) > 0))
                        reserved = 1;
#endif
                /* Drop the previous inode now that no spinlock is held */
                iput(old_inode);
                err = __dquot_initialize(inode, type);
                if (err) {
                        iput(inode);
                        goto out;
                }

                /*
                 * We hold a reference to 'inode' so it couldn't have been
                 * removed from s_inodes list while we dropped the
                 * s_inode_list_lock. We cannot iput the inode now as we can be
                 * holding the last reference and we cannot iput it under
                 * s_inode_list_lock. So we keep the reference and iput it
                 * later.
                 */
                old_inode = inode;
                cond_resched();
                spin_lock(&sb->s_inode_list_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
        iput(old_inode);
out:
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                quota_error(sb, "Writes happened before quota was turned on "
                        "thus quota information is probably inconsistent. "
                        "Please run quotacheck(8)");
        }
#endif
        return err;
}

/*
 * Walk all inodes of @sb and, for every inode that is not IS_NOQUOTA(),
 * clear its dquot pointer of quota type @type and drop the reference that
 * pointer held. The whole walk runs under sb->s_inode_list_lock; each
 * pointer update is done under dq_data_lock.
 */
static void remove_dquot_ref(struct super_block *sb, int type)
{
        struct inode *inode;
#ifdef CONFIG_QUOTA_DEBUG
        int reserved = 0;
#endif

        spin_lock(&sb->s_inode_list_lock);
        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                /*
                 *  We have to scan also I_NEW inodes because they can already
                 *  have quota pointer initialized. Luckily, we need to touch
                 *  only quota pointers and these have separate locking
                 *  (dq_data_lock).
                 */
                spin_lock(&dq_data_lock);
                if (!IS_NOQUOTA(inode)) {
                        struct dquot __rcu **dquots = i_dquot(inode);
                        struct dquot *dquot = srcu_dereference_check(
                                dquots[type], &dquot_srcu,
                                lockdep_is_held(&dq_data_lock));

#ifdef CONFIG_QUOTA_DEBUG
                        if (unlikely(inode_get_rsv_space(inode) > 0))
                                reserved = 1;
#endif
                        /* Unpublish the pointer, then drop its reference */
                        rcu_assign_pointer(dquots[type], NULL);
                        if (dquot)
                                dqput(dquot);
                }
                spin_unlock(&dq_data_lock);
        }
        spin_unlock(&sb->s_inode_list_lock);
#ifdef CONFIG_QUOTA_DEBUG
        if (reserved) {
                printk(KERN_WARNING "VFS (%s): Writes happened after quota"
                        " was disabled thus quota information is probably "
                        "inconsistent. Please run quotacheck(8).\n", sb->s_id);
        }
#endif
}

/*
 * Gather all references from inodes and drop them. Nothing is done for a
 * superblock without quota operations (sb->dq_op is NULL).
 */
static void drop_dquot_ref(struct super_block *sb, int type)
{
        if (!sb->dq_op)
                return;

        remove_dquot_ref(sb, type);
}

/*
 * Take @number off the reserved space of @dquot. Asking for more than is
 * reserved triggers a one-time warning and clamps the reservation to 0.
 * If current plus reserved space then no longer exceeds the block soft
 * limit, the block grace time is reset. DQ_BLKS_B is always cleared.
 */
static inline
void dquot_free_reserved_space(struct dquot *dquot, qsize_t number)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;

        if (dqb->dqb_rsvspace < number) {
                WARN_ON_ONCE(1);
                dqb->dqb_rsvspace = 0;
        } else {
                dqb->dqb_rsvspace -= number;
        }

        if (dqb->dqb_curspace + dqb->dqb_rsvspace <= dqb->dqb_bsoftlimit)
                dqb->dqb_btime = (time64_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * Subtract @number from the inode usage of @dquot. Unless the superblock
 * has DQUOT_NEGATIVE_USAGE set, usage is clamped to 0 instead of dropping
 * below it. If usage then no longer exceeds the inode soft limit, the inode
 * grace time is reset. DQ_INODES_B is always cleared.
 */
static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        const int may_go_negative =
                !!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE);

        if (!may_go_negative && dqb->dqb_curinodes < number)
                dqb->dqb_curinodes = 0;
        else
                dqb->dqb_curinodes -= number;

        if (dqb->dqb_curinodes <= dqb->dqb_isoftlimit)
                dqb->dqb_itime = (time64_t) 0;
        clear_bit(DQ_INODES_B, &dquot->dq_flags);
}

/*
 * Subtract @number from the space usage of @dquot. Unless the superblock
 * has DQUOT_NEGATIVE_USAGE set, usage is clamped to 0 instead of dropping
 * below it. If current plus reserved space then no longer exceeds the block
 * soft limit, the block grace time is reset. DQ_BLKS_B is always cleared.
 */
static void dquot_decr_space(struct dquot *dquot, qsize_t number)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        const int may_go_negative =
                !!(sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE);

        if (!may_go_negative && dqb->dqb_curspace < number)
                dqb->dqb_curspace = 0;
        else
                dqb->dqb_curspace -= number;

        if (dqb->dqb_curspace + dqb->dqb_rsvspace <= dqb->dqb_bsoftlimit)
                dqb->dqb_btime = (time64_t) 0;
        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}

/*
 * One pending quota warning. It is filled in by prepare_warning() and
 * consumed by flush_warnings(), which handles an array of MAXQUOTAS of
 * these.
 */
struct dquot_warn {
        /* superblock of the dquot the warning is about (dquot->dq_sb) */
        struct super_block *w_sb;
        /* id of that dquot (dquot->dq_id) */
        struct kqid w_dq_id;
        /* QUOTA_NL_* warning code; QUOTA_NL_NOWARN entries are skipped */
        short w_type;
};

/*
 * Check-and-mark whether a warning of @warntype was already issued for
 * @dquot.
 *
 * Block hard-limit and soft-grace-expired warnings are tracked with
 * DQ_BLKS_B, their inode counterparts with DQ_INODES_B; the previous state
 * of that bit is returned and the bit is left set. All other warning types
 * are not tracked and always yield 0.
 */
static int warning_issued(struct dquot *dquot, const int warntype)
{
        int flag;

        switch (warntype) {
        case QUOTA_NL_BHARDWARN:
        case QUOTA_NL_BSOFTLONGWARN:
                flag = DQ_BLKS_B;
                break;
        case QUOTA_NL_IHARDWARN:
        case QUOTA_NL_ISOFTLONGWARN:
                flag = DQ_INODES_B;
                break;
        default:
                flag = 0;
                break;
        }

        if (!flag)
                return 0;
        return test_and_set_bit(flag, &dquot->dq_flags);
}

#ifdef CONFIG_PRINT_QUOTA_WARNING
/* When zero, need_print_warning() returns 0 for every warning. */
static int flag_print_warnings = 1;

static int need_print_warning(struct dquot_warn *warn)
{
        if (!flag_print_warnings)
                return 0;

        switch (warn->w_dq_id.type) {
                case USRQUOTA:
                        return uid_eq(current_fsuid(), warn->w_dq_id.uid);
                case GRPQUOTA:
                        return in_group_p(warn->w_dq_id.gid);
                case PRJQUOTA:
                        return 1;
        }
        return 0;
}

/*
 * Print warning to user which exceeded quota.
 *
 * The message goes to the current task's tty and is assembled from the
 * filesystem id, a severity prefix, the quota type name and a text specific
 * to the warning type. The QUOTA_NL_*BELOW notifications are never printed,
 * and nothing is printed when need_print_warning() declines or the task has
 * no tty.
 */
static void print_warning(struct dquot_warn *warn)
{
        const int warntype = warn->w_type;
        struct tty_struct *tty;
        char *msg = NULL;

        if (warntype == QUOTA_NL_IHARDBELOW ||
            warntype == QUOTA_NL_ISOFTBELOW ||
            warntype == QUOTA_NL_BHARDBELOW ||
            warntype == QUOTA_NL_BSOFTBELOW)
                return;
        if (!need_print_warning(warn))
                return;

        tty = get_current_tty();
        if (!tty)
                return;

        tty_write_message(tty, warn->w_sb->s_id);
        /* Plain soft-limit warnings are advisory; the rest are failures */
        tty_write_message(tty,
                          (warntype == QUOTA_NL_ISOFTWARN ||
                           warntype == QUOTA_NL_BSOFTWARN) ?
                          ": warning, " : ": write failed, ");
        tty_write_message(tty, quotatypes[warn->w_dq_id.type]);

        switch (warntype) {
        case QUOTA_NL_IHARDWARN:
                msg = " file limit reached.\r\n";
                break;
        case QUOTA_NL_ISOFTLONGWARN:
                msg = " file quota exceeded too long.\r\n";
                break;
        case QUOTA_NL_ISOFTWARN:
                msg = " file quota exceeded.\r\n";
                break;
        case QUOTA_NL_BHARDWARN:
                msg = " block limit reached.\r\n";
                break;
        case QUOTA_NL_BSOFTLONGWARN:
                msg = " block quota exceeded too long.\r\n";
                break;
        case QUOTA_NL_BSOFTWARN:
                msg = " block quota exceeded.\r\n";
                break;
        }
        tty_write_message(tty, msg);

        /* Drop the tty reference taken by get_current_tty() */
        tty_kref_put(tty);
}
#endif

/*
 * Record a warning of @warntype about @dquot in @warn, unless
 * warning_issued() reports that this kind of warning was already issued
 * for the dquot, in which case @warn is left untouched.
 */
static void prepare_warning(struct dquot_warn *warn, struct dquot *dquot,
                            int warntype)
{
        if (!warning_issued(dquot, warntype)) {
                warn->w_sb = dquot->dq_sb;
                warn->w_dq_id = dquot->dq_id;
                warn->w_type = warntype;
        }
}

/*
 * Write warnings to the console and send warning messages over netlink.
 *
 * Note that this function can call into tty and networking code.
 */
static void flush_warnings(struct dquot_warn *warn)
{
        int i;

        for (i = 0; i < MAXQUOTAS; i++) {
                if (warn[i].w_type == QUOTA_NL_NOWARN)
                        continue;
#ifdef CONFIG_PRINT_QUOTA_WARNING
                print_warning(&warn[i]);
#endif
                quota_send_warning(warn[i].w_dq_id,
                                   warn[i].w_sb->s_dev, warn[i].w_type);
        }
}

/*
 * Tell whether the current task may exceed the limits of @dquot.
 *
 * That requires CAP_SYS_RESOURCE. For the QFMT_VFS_OLD quota format it
 * additionally requires that DQF_ROOT_SQUASH is not set in the per-type
 * info flags.
 */
static int ignore_hardlimit(struct dquot *dquot)
{
        struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];

        if (!capable(CAP_SYS_RESOURCE))
                return 0;
        if (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD)
                return 1;
        return !(info->dqi_flags & DQF_ROOT_SQUASH);
}

/*
 * Charge @inodes additional inodes to @dquot, enforcing its inode limits.
 *
 * Everything runs under dquot->dq_dqb_lock. Limits are not checked when
 * they are disabled for this quota type or when the dquot has DQ_FAKE_B
 * set. Otherwise the charge is refused with -EDQUOT when it would exceed
 * the hard limit, or when it exceeds the soft limit after the grace time
 * has run out; both refusals are waived if ignore_hardlimit() allows it.
 * Exceeding the soft limit for the first time starts the grace period.
 *
 * Warnings are recorded in @warn via prepare_warning(). Returns 0 when the
 * inodes were charged and -EDQUOT when they were not.
 */
static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes,
                            struct dquot_warn *warn)
{
        struct mem_dqblk *dqb = &dquot->dq_dqb;
        qsize_t total;
        int over_soft;
        int ret = 0;

        spin_lock(&dquot->dq_dqb_lock);
        total = dqb->dqb_curinodes + inodes;

        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) ||
            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                goto add;

        /* Hard limit */
        if (dqb->dqb_ihardlimit && total > dqb->dqb_ihardlimit &&
            !ignore_hardlimit(dquot)) {
                prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN);
                ret = -EDQUOT;
                goto out;
        }

        over_soft = dqb->dqb_isoftlimit && total > dqb->dqb_isoftlimit;

        /* Soft limit with the grace time already used up */
        if (over_soft && dqb->dqb_itime &&
            ktime_get_real_seconds() >= dqb->dqb_itime &&
            !ignore_hardlimit(dquot)) {
                prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
                ret = -EDQUOT;
                goto out;
        }

        /* Soft limit crossed for the first time: start the grace period */
        if (over_soft && dqb->dqb_itime == 0) {
                prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
                dqb->dqb_itime = ktime_get_real_seconds() +
                    sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
        }
add:
        dqb->dqb_curinodes = total;

out:
        spin_unlock(&dquot->dq_dqb_lock);
        return ret;
}

/*
 * Charge @space bytes of used space and @rsv_space bytes of reserved space
 * to @dquot, checking block limits under dquot->dq_dqb_lock.
 *
 * @flags: DQUOT_SPACE_WARN allows warnings to be recorded in @warn;
 *         DQUOT_SPACE_NOFAIL turns any failure into success after the
 *         limit checks have run.
 *
 * Returns 0 when both counters were updated, or -EDQUOT when the charge was
 * refused (neither counter is changed in that case).
 */
static int dquot_add_space(struct dquot *dquot, qsize_t space,
                           qsize_t rsv_space, unsigned int flags,
                           struct dquot_warn *warn)
{
        qsize_t tspace;
        struct super_block *sb = dquot->dq_sb;
        int ret = 0;

        spin_lock(&dquot->dq_dqb_lock);
        /* No limit checks when enforcement is off or the dquot is fake */
        if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) ||
            test_bit(DQ_FAKE_B, &dquot->dq_flags))
                goto finish;

        /* Total the dquot would hold: current + reserved + this request */
        tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace
                + space + rsv_space;

        /* Hard limit exceeded and the task may not ignore it */
        if (dquot->dq_dqb.dqb_bhardlimit &&
            tspace > dquot->dq_dqb.dqb_bhardlimit &&
            !ignore_hardlimit(dquot)) {
                if (flags & DQUOT_SPACE_WARN)
                        prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN);
                ret = -EDQUOT;
                goto finish;
        }

        /* Soft limit exceeded and the recorded deadline (dqb_btime) passed */
        if (dquot->dq_dqb.dqb_bsoftlimit &&
            tspace > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime &&
            ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime &&
            !ignore_hardlimit(dquot)) {
                if (flags & DQUOT_SPACE_WARN)
                        prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
                ret = -EDQUOT;
                goto finish;
        }

        /* Soft limit exceeded with no deadline recorded yet */
        if (dquot->dq_dqb.dqb_bsoftlimit &&
            tspace > dquot->dq_dqb.dqb_bsoftlimit &&
            dquot->dq_dqb.dqb_btime == 0) {
                if (flags & DQUOT_SPACE_WARN) {
                        prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
                        /* Record the deadline: now + dqi_bgrace */
                        dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() +
                            sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
                } else {
                        /*
                         * We don't allow preallocation to exceed softlimit so exceeding will
                         * be always printed
                         */
                        ret = -EDQUOT;
                        goto finish;
                }
        }
finish:
        /*
         * We have to be careful and go through warning generation & grace time
         * setting even if DQUOT_SPACE_NOFAIL is set. That's why we check it
         * only here...
         */
        if (flags & DQUOT_SPACE_NOFAIL)
                ret = 0;
        /* Usage is only updated when the charge is accepted */
        if (!ret) {
                dquot->dq_dqb.dqb_rsvspace += rsv_space;
                dquot->dq_dqb.dqb_curspace += space;
        }
        spin_unlock(&dquot->dq_dqb_lock);
        return ret;
}

/*
 * Work out which "dropped back below a limit" warning, if any, applies when
 * @inodes inodes are released from @dquot. Only reads the dquot; the usage
 * itself is not modified here.
 *
 * Returns QUOTA_NL_ISOFTBELOW, QUOTA_NL_IHARDBELOW or QUOTA_NL_NOWARN.
 */
static int info_idq_free(struct dquot *dquot, qsize_t inodes)
{
        qsize_t remaining;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_NL_NOWARN;
        /* Not above the soft limit now, so nothing can be crossed. */
        if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
                return QUOTA_NL_NOWARN;
        if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type))
                return QUOTA_NL_NOWARN;

        remaining = dquot->dq_dqb.dqb_curinodes - inodes;
        if (remaining <= dquot->dq_dqb.dqb_isoftlimit)
                return QUOTA_NL_ISOFTBELOW;
        if (dquot->dq_dqb.dqb_curinodes < dquot->dq_dqb.dqb_ihardlimit ||
            remaining >= dquot->dq_dqb.dqb_ihardlimit)
                return QUOTA_NL_NOWARN;
        return QUOTA_NL_IHARDBELOW;
}

/*
 * Work out which "dropped back below a limit" warning, if any, applies when
 * @space bytes are released from @dquot. The comparison uses the sum of
 * current and reserved space. Only reads the dquot.
 *
 * Returns QUOTA_NL_BSOFTBELOW, QUOTA_NL_BHARDBELOW or QUOTA_NL_NOWARN.
 */
static int info_bdq_free(struct dquot *dquot, qsize_t space)
{
        qsize_t before = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace;
        qsize_t after;

        if (test_bit(DQ_FAKE_B, &dquot->dq_flags))
                return QUOTA_NL_NOWARN;
        /* Not above the soft limit now, so nothing can be crossed. */
        if (before <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_NOWARN;

        after = before - space;
        if (after <= dquot->dq_dqb.dqb_bsoftlimit)
                return QUOTA_NL_BSOFTBELOW;
        if (before < dquot->dq_dqb.dqb_bhardlimit ||
            after >= dquot->dq_dqb.dqb_bhardlimit)
                return QUOTA_NL_NOWARN;
        return QUOTA_NL_BHARDBELOW;
}

/*
 * Report whether quota handling applies to @inode.
 *
 * Returns 0 for IS_NOQUOTA() inodes. Otherwise returns the value of
 * sb_any_quota_loaded() masked with the complement of
 * sb_any_quota_suspended() for the inode's superblock.
 */
static int inode_quota_active(const struct inode *inode)
{
        struct super_block *sb = inode->i_sb;

        if (!IS_NOQUOTA(inode))
                return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb);
        return 0;
}

/*
 * Initialize quota pointers in inode
 *
 * @inode: inode whose i_dquot entries should be filled in
 * @type:  quota type to initialize, or -1 to handle every type
 *
 * Works in two phases: first dquot references are obtained with dqget()
 * without holding dq_data_lock, then the pointers are installed under
 * dq_data_lock. References that end up unused are dropped at the end.
 *
 * Returns 0 on success (including when nothing had to be done) or the
 * error from dqget(), except -ESRCH which is treated as "no dquot".
 *
 * It is better to call this function outside of any transaction as it
 * might need a lot of space in journal for dquot structure allocation.
 */
static int __dquot_initialize(struct inode *inode, int type)
{
        int cnt, init_needed = 0;
        struct dquot __rcu **dquots;
        struct dquot *got[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        qsize_t rsv;
        int ret = 0;

        if (!inode_quota_active(inode))
                return 0;

        dquots = i_dquot(inode);

        /* First get references to structures we might need. */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                struct kqid qid;
                kprojid_t projid;
                int rc;
                struct dquot *dquot;

                if (type != -1 && cnt != type)
                        continue;
                /*
                 * The i_dquot should have been initialized in most cases,
                 * we check it without locking here to avoid unnecessary
                 * dqget()/dqput() calls.
                 */
                if (dquots[cnt])
                        continue;

                if (!sb_has_quota_active(sb, cnt))
                        continue;

                init_needed = 1;

                /* Build the quota id matching this quota type */
                switch (cnt) {
                case USRQUOTA:
                        qid = make_kqid_uid(inode->i_uid);
                        break;
                case GRPQUOTA:
                        qid = make_kqid_gid(inode->i_gid);
                        break;
                case PRJQUOTA:
                        rc = inode->i_sb->dq_op->get_projid(inode, &projid);
                        /* No project id available: skip this type */
                        if (rc)
                                continue;
                        qid = make_kqid_projid(projid);
                        break;
                }
                dquot = dqget(sb, qid);
                if (IS_ERR(dquot)) {
                        /* We raced with somebody turning quotas off... */
                        if (PTR_ERR(dquot) != -ESRCH) {
                                ret = PTR_ERR(dquot);
                                goto out_put;
                        }
                        dquot = NULL;
                }
                got[cnt] = dquot;
        }

        /* All required i_dquot has been initialized */
        if (!init_needed)
                return 0;

        spin_lock(&dq_data_lock);
        /* Recheck under the lock; the inode may have become NOQUOTA */
        if (IS_NOQUOTA(inode))
                goto out_lock;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(sb, cnt))
                        continue;
                /* We could race with quotaon or dqget() could have failed */
                if (!got[cnt])
                        continue;
                if (!dquots[cnt]) {
                        rcu_assign_pointer(dquots[cnt], got[cnt]);
                        /* Reference now belongs to the inode; don't put it */
                        got[cnt] = NULL;
                        /*
                         * Make quota reservation system happy if someone
                         * did a write before quota was turned on
                         */
                        rsv = inode_get_rsv_space(inode);
                        if (unlikely(rsv)) {
                                struct dquot *dquot = srcu_dereference_check(
                                        dquots[cnt], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));

                                spin_lock(&inode->i_lock);
                                /* Get reservation again under proper lock */
                                rsv = __inode_get_rsv_space(inode);
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot->dq_dqb.dqb_rsvspace += rsv;
                                spin_unlock(&dquot->dq_dqb_lock);
                                spin_unlock(&inode->i_lock);
                        }
                }
        }
out_lock:
        spin_unlock(&dq_data_lock);
out_put:
        /* Drop unused references */
        dqput_all(got);

        return ret;
}

/*
 * Initialize the quota pointers of @inode for every quota type (passes
 * type -1 to __dquot_initialize()). Returns that function's result.
 */
int dquot_initialize(struct inode *inode)
{
        return __dquot_initialize(inode, -1);
}
EXPORT_SYMBOL(dquot_initialize);

/*
 * Check whether @inode still lacks a dquot pointer for some quota type
 * that is active on its superblock.
 *
 * Returns false when quota is not active for the inode at all, true as
 * soon as one active type with an empty i_dquot slot is found.
 */
bool dquot_initialize_needed(struct inode *inode)
{
        struct dquot __rcu **dquots;
        int type;

        if (!inode_quota_active(inode))
                return false;

        dquots = i_dquot(inode);
        for (type = 0; type < MAXQUOTAS; type++) {
                /* Slot already filled: nothing to do for this type. */
                if (dquots[type])
                        continue;
                if (sb_has_quota_active(inode->i_sb, type))
                        return true;
        }
        return false;
}
EXPORT_SYMBOL(dquot_initialize_needed);

/*
 * Release all quotas referenced by inode.
 *
 * This function is only called when an inode is freed or when a file is
 * converted to a quota file. In both cases there are no other users of
 * i_dquot, so we need not call synchronize_srcu() after clearing it.
 */
static void __dquot_drop(struct inode *inode)
{
        struct dquot __rcu **dquots = i_dquot(inode);
        struct dquot *dropped[MAXQUOTAS];
        int type;

        /* Detach every pointer under dq_data_lock ... */
        spin_lock(&dq_data_lock);
        for (type = 0; type < MAXQUOTAS; type++) {
                struct dquot *old;

                old = srcu_dereference_check(dquots[type], &dquot_srcu,
                                        lockdep_is_held(&dq_data_lock));
                dropped[type] = old;
                rcu_assign_pointer(dquots[type], NULL);
        }
        spin_unlock(&dq_data_lock);

        /* ... and release the references once the lock is dropped. */
        dqput_all(dropped);
}

/*
 * Drop the dquot references held by @inode, if it holds any.
 * Does nothing for IS_NOQUOTA() inodes.
 */
void dquot_drop(struct inode *inode)
{
        struct dquot __rcu * const *dquots;
        int type;

        if (IS_NOQUOTA(inode))
                return;

        /*
         * Look for a set pointer before calling __dquot_drop() to rule out
         * calls from proc and such where we are not allowed to block. The
         * test is reliable even without the lock: the caller must ensure
         * that nobody can come after the DQUOT_DROP and add quota pointers
         * back anyway.
         */
        dquots = i_dquot(inode);
        for (type = 0; type < MAXQUOTAS; type++) {
                if (dquots[type]) {
                        __dquot_drop(inode);
                        return;
                }
        }
}
EXPORT_SYMBOL(dquot_drop);

/*
 * inode_reserved_space is managed internally by quota, and protected by
 * i_lock similar to i_blocks+i_bytes.
 *
 * Returns the pointer handed back by the filesystem's
 * ->get_reserved_space() method for @inode.
 */
static qsize_t *inode_reserved_space(struct inode * inode)
{
        /* Filesystem must explicitly define its own method in order to use
         * quota reservation interface */
        BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
        return inode->i_sb->dq_op->get_reserved_space(inode);
}

/*
 * Read the reserved space of @inode without taking any lock.
 * Returns 0 when the filesystem provides no ->get_reserved_space() method.
 */
static qsize_t __inode_get_rsv_space(struct inode *inode)
{
        if (inode->i_sb->dq_op->get_reserved_space)
                return *inode_reserved_space(inode);
        return 0;
}

/*
 * Read the reserved space of @inode under inode->i_lock.
 * Returns 0, without locking, when the filesystem provides no
 * ->get_reserved_space() method.
 */
static qsize_t inode_get_rsv_space(struct inode *inode)
{
        qsize_t rsv = 0;

        if (inode->i_sb->dq_op->get_reserved_space) {
                spin_lock(&inode->i_lock);
                rsv = __inode_get_rsv_space(inode);
                spin_unlock(&inode->i_lock);
        }
        return rsv;
}

/*
 * This function updates the i_blocks+i_bytes fields and quota information
 * (together with appropriate checks).
 *
 * NOTE: We absolutely rely on the fact that the caller dirties the inode
 * (usually the helpers in quotaops.h take care of this) and holds a handle
 * for the current transaction so that the dquot write and the inode write
 * go into the same transaction.
 */

/*
 * Charge @number bytes to the dquots of @inode and to the inode itself.
 *
 * With DQUOT_SPACE_RESERVE in @flags the bytes are accounted as reserved
 * space, otherwise as used space. @flags is also passed on to
 * dquot_add_space(). If any dquot refuses the charge, the dquots already
 * charged are rolled back and the inode is left untouched.
 *
 * Returns 0 on success, the error from dquot_add_space() on refusal, or
 * the result of mark_all_dquot_dirty() for a non-reserving allocation.
 *
 * This operation can block, but only after everything is updated
 */
int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
{
        int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        int reserve = flags & DQUOT_SPACE_RESERVE;
        struct dquot __rcu **dquots;
        struct dquot *dquot;

        /* Quota not active: only the inode's own counters are updated */
        if (!inode_quota_active(inode)) {
                if (reserve) {
                        spin_lock(&inode->i_lock);
                        *inode_reserved_space(inode) += number;
                        spin_unlock(&inode->i_lock);
                } else {
                        inode_add_bytes(inode, number);
                }
                goto out;
        }

        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                if (reserve) {
                        ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
                } else {
                        ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
                }
                if (ret) {
                        /* Back out changes we already did */
                        for (cnt--; cnt >= 0; cnt--) {
                                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                                if (!dquot)
                                        continue;
                                spin_lock(&dquot->dq_dqb_lock);
                                if (reserve)
                                        dquot_free_reserved_space(dquot, number);
                                else
                                        dquot_decr_space(dquot, number);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                        spin_unlock(&inode->i_lock);
                        goto out_flush_warn;
                }
        }
        /* Every dquot accepted the charge; account it on the inode too */
        if (reserve)
                *inode_reserved_space(inode) += number;
        else
                __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        /* Dquots are not marked dirty for a pure reservation */
        if (reserve)
                goto out_flush_warn;
        ret = mark_all_dquot_dirty(dquots);
out_flush_warn:
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
out:
        return ret;
}
EXPORT_SYMBOL(__dquot_alloc_space);

/*
 * Charge one inode to every dquot attached to @inode.
 *
 * If any dquot refuses the charge, the dquots already charged are rolled
 * back. Returns 0 when quota is not active for the inode, the error from
 * dquot_add_inodes() on refusal, or the result of mark_all_dquot_dirty()
 * on success.
 *
 * This operation can block, but only after everything is updated
 */
int dquot_alloc_inode(struct inode *inode)
{
        int cnt, ret = 0, index;
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu * const *dquots;
        struct dquot *dquot;

        if (!inode_quota_active(inode))
                return 0;
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                warn[cnt].w_type = QUOTA_NL_NOWARN;

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (!dquot)
                        continue;
                ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
                if (ret) {
                        for (cnt--; cnt >= 0; cnt--) {
                                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                                if (!dquot)
                                        continue;
                                /* Back out changes we already did */
                                spin_lock(&dquot->dq_dqb_lock);
                                dquot_decr_inodes(dquot, 1);
                                spin_unlock(&dquot->dq_dqb_lock);
                        }
                        goto warn_put_all;
                }
        }

warn_put_all:
        spin_unlock(&inode->i_lock);
        /* Only mark dquots dirty when the charge went through */
        if (ret == 0)
                ret = mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
        flush_warnings(warn);
        return ret;
}
EXPORT_SYMBOL(dquot_alloc_inode);

/*
 * Convert in-memory reserved quotas to real consumed quotas
 *
 * Moves @number bytes from reserved space to used space in every dquot of
 * @inode and in the inode itself. If a dquot holds less reserved space
 * than @number, a one-time warning is issued and @number is clamped to
 * what that dquot holds; the clamped value is then also used for the
 * remaining dquots and for the inode update.
 */
void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int cnt, index;

        /* Quota not active: only the inode's own counters are updated */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                __inode_add_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        /* Claim reserved quotas to allocated quotas */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (dquot) {
                        spin_lock(&dquot->dq_dqb_lock);
                        if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
                                number = dquot->dq_dqb.dqb_rsvspace;
                        dquot->dq_dqb.dqb_curspace += number;
                        dquot->dq_dqb.dqb_rsvspace -= number;
                        spin_unlock(&dquot->dq_dqb_lock);
                }
        }
        /* Update inode bytes */
        *inode_reserved_space(inode) -= number;
        __inode_add_bytes(inode, number);
        spin_unlock(&inode->i_lock);
        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);

/*
 * Convert allocated space back to in-memory reserved quotas
 *
 * Moves @number bytes from used space to reserved space in every dquot of
 * @inode and in the inode itself. If a dquot holds less used space than
 * @number, a one-time warning is issued and @number is clamped to what
 * that dquot holds; the clamped value is then also used for the remaining
 * dquots and for the inode update.
 */
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        int cnt, index;

        /* Quota not active: only the inode's own counters are updated */
        if (!inode_quota_active(inode)) {
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) += number;
                __inode_sub_bytes(inode, number);
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        index = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        /* Move allocated quotas back to reserved quotas */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
                if (dquot) {
                        spin_lock(&dquot->dq_dqb_lock);
                        if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
                                number = dquot->dq_dqb.dqb_curspace;
                        dquot->dq_dqb.dqb_rsvspace += number;
                        dquot->dq_dqb.dqb_curspace -= number;
                        spin_unlock(&dquot->dq_dqb_lock);
                }
        }
        /* Update inode bytes */
        *inode_reserved_space(inode) += number;
        __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);
        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, index);
}
EXPORT_SYMBOL(dquot_reclaim_space_nodirty);

/*
 * Release @number bytes from the dquots of @inode and from the inode.
 *
 * With DQUOT_SPACE_RESERVE in @flags the bytes come out of reserved space,
 * otherwise out of used space. Dquots are marked dirty only in the latter
 * case.
 *
 * This operation can block, but only after everything is updated
 */
void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu **dquots;
        struct dquot *dquot;
        unsigned int type;
        int reserve = flags & DQUOT_SPACE_RESERVE;
        int srcu_idx;

        /* Quota not active: only the inode's own counters are updated. */
        if (!inode_quota_active(inode)) {
                if (!reserve) {
                        inode_sub_bytes(inode, number);
                        return;
                }
                spin_lock(&inode->i_lock);
                *inode_reserved_space(inode) -= number;
                spin_unlock(&inode->i_lock);
                return;
        }

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (type = 0; type < MAXQUOTAS; type++) {
                int nl_type;

                warn[type].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (!dquot)
                        continue;

                spin_lock(&dquot->dq_dqb_lock);
                /* Evaluate the warning before the usage is lowered. */
                nl_type = info_bdq_free(dquot, number);
                if (nl_type != QUOTA_NL_NOWARN)
                        prepare_warning(&warn[type], dquot, nl_type);
                if (reserve)
                        dquot_free_reserved_space(dquot, number);
                else
                        dquot_decr_space(dquot, number);
                spin_unlock(&dquot->dq_dqb_lock);
        }
        if (reserve)
                *inode_reserved_space(inode) -= number;
        else
                __inode_sub_bytes(inode, number);
        spin_unlock(&inode->i_lock);

        if (!reserve)
                mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
        flush_warnings(warn);
}
EXPORT_SYMBOL(__dquot_free_space);

/*
 * Release one inode from every dquot attached to @inode.
 * Does nothing when quota is not active for the inode.
 *
 * This operation can block, but only after everything is updated
 */
void dquot_free_inode(struct inode *inode)
{
        struct dquot_warn warn[MAXQUOTAS];
        struct dquot __rcu * const *dquots;
        struct dquot *dquot;
        unsigned int type;
        int srcu_idx;

        if (!inode_quota_active(inode))
                return;

        dquots = i_dquot(inode);
        srcu_idx = srcu_read_lock(&dquot_srcu);
        spin_lock(&inode->i_lock);
        for (type = 0; type < MAXQUOTAS; type++) {
                int nl_type;

                warn[type].w_type = QUOTA_NL_NOWARN;
                dquot = srcu_dereference(dquots[type], &dquot_srcu);
                if (dquot) {
                        spin_lock(&dquot->dq_dqb_lock);
                        /* Evaluate the warning before the count is lowered. */
                        nl_type = info_idq_free(dquot, 1);
                        if (nl_type != QUOTA_NL_NOWARN)
                                prepare_warning(&warn[type], dquot, nl_type);
                        dquot_decr_inodes(dquot, 1);
                        spin_unlock(&dquot->dq_dqb_lock);
                }
        }
        spin_unlock(&inode->i_lock);
        mark_all_dquot_dirty(dquots);
        srcu_read_unlock(&dquot_srcu, srcu_idx);
        flush_warnings(warn);
}
EXPORT_SYMBOL(dquot_free_inode);

/*
 * Transfer the number of inode and blocks from one diskquota to an other.
 * On success, dquot references in transfer_to are consumed and references
 * to original dquots that need to be released are placed there. On failure,
 * references are kept untouched.
 *
 * @inode:       inode whose usage is being moved
 * @transfer_to: array of MAXQUOTAS target dquots; a NULL entry means the
 *               corresponding quota type is skipped
 *
 * Returns 0 on success or for IS_NOQUOTA() inodes, the error from
 * ->get_inode_usage(), dquot_add_inodes() or dquot_add_space() on
 * failure, or a negative result of mark_all_dquot_dirty().
 *
 * This operation can block, but only after everything is updated
 * A transaction must be started when entering this function.
 *
 * We are holding reference on transfer_from & transfer_to, no need to
 * protect them by srcu_read_lock().
 */
int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
{
        qsize_t cur_space;
        qsize_t rsv_space = 0;
        qsize_t inode_usage = 1;
        struct dquot __rcu **dquots;
        struct dquot *transfer_from[MAXQUOTAS] = {};
        int cnt, index, ret = 0, err;
        char is_valid[MAXQUOTAS] = {};
        struct dquot_warn warn_to[MAXQUOTAS];
        struct dquot_warn warn_from_inodes[MAXQUOTAS];
        struct dquot_warn warn_from_space[MAXQUOTAS];

        if (IS_NOQUOTA(inode))
                return 0;

        /* Let the filesystem override the default inode usage of 1 */
        if (inode->i_sb->dq_op->get_inode_usage) {
                ret = inode->i_sb->dq_op->get_inode_usage(inode, &inode_usage);
                if (ret)
                        return ret;
        }

        /* Initialize the arrays */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                warn_to[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_inodes[cnt].w_type = QUOTA_NL_NOWARN;
                warn_from_space[cnt].w_type = QUOTA_NL_NOWARN;
        }

        spin_lock(&dq_data_lock);
        spin_lock(&inode->i_lock);
        if (IS_NOQUOTA(inode)) {        /* File without quota accounting? */
                spin_unlock(&inode->i_lock);
                spin_unlock(&dq_data_lock);
                return 0;
        }
        /* Snapshot the inode's usage while i_lock is held */
        cur_space = __inode_get_bytes(inode);
        rsv_space = __inode_get_rsv_space(inode);
        dquots = i_dquot(inode);
        /*
         * Build the transfer_from list, check limits, and update usage in
         * the target structures.
         */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                /*
                 * Skip changes for same uid or gid or for turned off quota-type.
                 */
                if (!transfer_to[cnt])
                        continue;
                /* Avoid races with quotaoff() */
                if (!sb_has_quota_active(inode->i_sb, cnt))
                        continue;
                is_valid[cnt] = 1;
                transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
                                &dquot_srcu, lockdep_is_held(&dq_data_lock));
                ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
                                       &warn_to[cnt]);
                if (ret)
                        goto over_quota;
                ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space,
                                      DQUOT_SPACE_WARN, &warn_to[cnt]);
                if (ret) {
                        /* Undo the inode charge made just above for this type */
                        spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                        dquot_decr_inodes(transfer_to[cnt], inode_usage);
                        spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
                        goto over_quota;
                }
        }

        /* Decrease usage for source structures and update quota pointers */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (!is_valid[cnt])
                        continue;
                /* Due to IO error we might not have transfer_from[] structure */
                if (transfer_from[cnt]) {
                        int wtype;

                        spin_lock(&transfer_from[cnt]->dq_dqb_lock);
                        /* Work out warnings before the usage is lowered */
                        wtype = info_idq_free(transfer_from[cnt], inode_usage);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_inodes[cnt],
                                                transfer_from[cnt], wtype);
                        wtype = info_bdq_free(transfer_from[cnt],
                                              cur_space + rsv_space);
                        if (wtype != QUOTA_NL_NOWARN)
                                prepare_warning(&warn_from_space[cnt],
                                                transfer_from[cnt], wtype);
                        dquot_decr_inodes(transfer_from[cnt], inode_usage);
                        dquot_decr_space(transfer_from[cnt], cur_space);
                        dquot_free_reserved_space(transfer_from[cnt],
                                                  rsv_space);
                        spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
                }
                rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);

        /*
         * These arrays are local and we hold dquot references so we don't need
         * the srcu protection but still take dquot_srcu to avoid warning in
         * mark_all_dquot_dirty().
         */
        index = srcu_read_lock(&dquot_srcu);
        err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
        if (err < 0)
                ret = err;
        err = mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
        if (err < 0)
                ret = err;
        srcu_read_unlock(&dquot_srcu, index);

        flush_warnings(warn_to);
        flush_warnings(warn_from_inodes);
        flush_warnings(warn_from_space);
        /* Pass back references to put */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (is_valid[cnt])
                        transfer_to[cnt] = transfer_from[cnt];
        return ret;
over_quota:
        /* Back out changes we already did */
        for (cnt--; cnt >= 0; cnt--) {
                if (!is_valid[cnt])
                        continue;
                spin_lock(&transfer_to[cnt]->dq_dqb_lock);
                dquot_decr_inodes(transfer_to[cnt], inode_usage);
                dquot_decr_space(transfer_to[cnt], cur_space);
                dquot_free_reserved_space(transfer_to[cnt], rsv_space);
                spin_unlock(&transfer_to[cnt]->dq_dqb_lock);
        }
        spin_unlock(&inode->i_lock);
        spin_unlock(&dq_data_lock);
        flush_warnings(warn_to);
        return ret;
}
EXPORT_SYMBOL(__dquot_transfer);

/*
 * Wrapper for transferring ownership of an inode for uid/gid only.
 * Called from FSXXX_setattr().
 *
 * Looks up the dquots of the new owner and/or group when @iattr changes
 * them, hands them to __dquot_transfer() and finally drops whatever
 * references are left in the local array.
 *
 * Returns 0 when quota is not active for the inode, the dqget() error
 * (other than -ESRCH, which just leaves that type untouched), or the
 * result of __dquot_transfer().
 */
int dquot_transfer(struct mnt_idmap *idmap, struct inode *inode,
                   struct iattr *iattr)
{
        struct dquot *transfer_to[MAXQUOTAS] = {};
        struct super_block *sb = inode->i_sb;
        struct dquot *new_dq;
        int err;

        if (!inode_quota_active(inode))
                return 0;

        if (i_uid_needs_update(idmap, iattr, inode)) {
                kuid_t kuid = from_vfsuid(idmap, i_user_ns(inode),
                                          iattr->ia_vfsuid);

                new_dq = dqget(sb, make_kqid_uid(kuid));
                if (!IS_ERR(new_dq)) {
                        transfer_to[USRQUOTA] = new_dq;
                } else if (PTR_ERR(new_dq) != -ESRCH) {
                        err = PTR_ERR(new_dq);
                        goto put_refs;
                }
                /* On -ESRCH the slot simply stays NULL. */
        }
        if (i_gid_needs_update(idmap, iattr, inode)) {
                kgid_t kgid = from_vfsgid(idmap, i_user_ns(inode),
                                          iattr->ia_vfsgid);

                new_dq = dqget(sb, make_kqid_gid(kgid));
                if (!IS_ERR(new_dq)) {
                        transfer_to[GRPQUOTA] = new_dq;
                } else if (PTR_ERR(new_dq) != -ESRCH) {
                        err = PTR_ERR(new_dq);
                        goto put_refs;
                }
                /* On -ESRCH the slot simply stays NULL. */
        }
        err = __dquot_transfer(inode, transfer_to);
put_refs:
        dqput_all(transfer_to);
        return err;
}
EXPORT_SYMBOL(dquot_transfer);

/*
 * Write info of quota file to disk
 */
int dquot_commit_info(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);

        return dqopt->ops[type]->write_file_info(sb, type);
}
EXPORT_SYMBOL(dquot_commit_info);

/*
 * Ask the quota format for the next id via its ->get_next_id() method.
 *
 * Returns -ESRCH when quota of type qid->type is not active on @sb,
 * -ENOSYS when the format has no ->get_next_id() method, otherwise the
 * method's result.
 */
int dquot_get_next_id(struct super_block *sb, struct kqid *qid)
{
        struct quota_info *dqopt;

        if (!sb_has_quota_active(sb, qid->type))
                return -ESRCH;

        dqopt = sb_dqopt(sb);
        if (dqopt->ops[qid->type]->get_next_id)
                return dqopt->ops[qid->type]->get_next_id(sb, qid);
        return -ENOSYS;
}
EXPORT_SYMBOL(dquot_get_next_id);

/*
 * Definitions of diskquota operations.
 *
 * Exported operation table that points each listed method at the
 * corresponding generic dquot_* helper.
 */
const struct dquot_operations dquot_operations = {
        .write_dquot        = dquot_commit,
        .acquire_dquot        = dquot_acquire,
        .release_dquot        = dquot_release,
        .mark_dirty        = dquot_mark_dquot_dirty,
        .write_info        = dquot_commit_info,
        .alloc_dquot        = dquot_alloc,
        .destroy_dquot        = dquot_destroy,
        .get_next_id        = dquot_get_next_id,
};
EXPORT_SYMBOL(dquot_operations);

/*
 * Generic helper for ->open on filesystems supporting disk quotas.
 */
int dquot_file_open(struct inode *inode, struct file *file)
{
        int error;

        error = generic_file_open(inode, file);
        if (!error && (file->f_mode & FMODE_WRITE))
                error = dquot_initialize(inode);
        return error;
}
EXPORT_SYMBOL(dquot_file_open);

/*
 * Drop the reference to the quota file inode stored for @type, if any.
 *
 * When the quota file is not a hidden system file, S_NOQUOTA is cleared on
 * the inode (under the inode lock) before the slot is emptied and the
 * reference is put.
 */
static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        struct inode *qf_inode = qinfo->files[type];

        if (qf_inode) {
                if (!(qinfo->flags & DQUOT_QUOTA_SYS_FILE)) {
                        inode_lock(qf_inode);
                        qf_inode->i_flags &= ~S_NOQUOTA;
                        inode_unlock(qf_inode);
                }
                qinfo->files[type] = NULL;
                iput(qf_inode);
        }
}

/*
 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
 *
 * @sb:    superblock whose quotas are affected; s_umount must be held for
 *         writing (asserted below)
 * @type:  quota type to act on, or -1 for every type
 * @flags: combination of DQUOT_USAGE_ENABLED, DQUOT_LIMITS_ENABLED and
 *         DQUOT_SUSPENDED selecting what to turn off or suspend
 *
 * Returns -EINVAL for an unsupported flag combination, 0 otherwise.
 */
int dquot_disable(struct super_block *sb, int type, unsigned int flags)
{
        int cnt;
        struct quota_info *dqopt = sb_dqopt(sb);

        rwsem_assert_held_write(&sb->s_umount);

        /* Cannot turn off usage accounting without turning off limits, or
         * suspend quotas and simultaneously turn quotas off. */
        if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
            || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
            DQUOT_USAGE_ENABLED)))
                return -EINVAL;

        /*
         * Skip everything if there's nothing to do. We have to do this because
         * sometimes we are called when fill_super() failed and calling
         * sync_fs() in such cases does no good.
         */
        if (!sb_any_quota_loaded(sb))
                return 0;

        /* First pass: update state flags and tear down per-type quota state */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_loaded(sb, cnt))
                        continue;

                if (flags & DQUOT_SUSPENDED) {
                        /* Suspending: only mark the type as suspended */
                        spin_lock(&dq_state_lock);
                        dqopt->flags |=
                                dquot_state_flag(DQUOT_SUSPENDED, cnt);
                        spin_unlock(&dq_state_lock);
                } else {
                        /* Turning off: clear the requested state bits */
                        spin_lock(&dq_state_lock);
                        dqopt->flags &= ~dquot_state_flag(flags, cnt);
                        /* Turning off suspended quotas? */
                        if (!sb_has_quota_loaded(sb, cnt) &&
                            sb_has_quota_suspended(sb, cnt)) {
                                dqopt->flags &=        ~dquot_state_flag(
                                                        DQUOT_SUSPENDED, cnt);
                                spin_unlock(&dq_state_lock);
                                /* Only the quota inode reference is left to drop */
                                vfs_cleanup_quota_inode(sb, cnt);
                                continue;
                        }
                        spin_unlock(&dq_state_lock);
                }

                /* We still have to keep quota loaded? */
                if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
                        continue;

                /* Note: these are blocking operations */
                drop_dquot_ref(sb, cnt);
                invalidate_dquots(sb, cnt);
                /*
                 * Now all dquots should be invalidated, all writes done so we
                 * should be only users of the info. No locks needed.
                 */
                if (info_dirty(&dqopt->info[cnt]))
                        sb->dq_op->write_info(sb, cnt);
                if (dqopt->ops[cnt]->free_file_info)
                        dqopt->ops[cnt]->free_file_info(sb, cnt);
                put_quota_format(dqopt->info[cnt].dqi_format);
                /* Reset the in-memory info so nothing stale remains for this type */
                dqopt->info[cnt].dqi_flags = 0;
                dqopt->info[cnt].dqi_igrace = 0;
                dqopt->info[cnt].dqi_bgrace = 0;
                dqopt->ops[cnt] = NULL;
        }

        /* Skip syncing and setting flags if quota files are hidden */
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
                goto put_inodes;

        /* Sync the superblock so that buffers with quota data are written to
         * disk (and so userspace sees correct data afterwards). */
        if (sb->s_op->sync_fs)
                sb->s_op->sync_fs(sb, 1);
        sync_blockdev(sb->s_bdev);
        /* Now the quota files are just ordinary files and we can set the
         * inode flags back. Moreover we discard the pagecache so that
         * userspace sees the writes we did bypassing the pagecache. We
         * must also discard the blockdev buffers so that we see the
         * changes done by userspace on the next quotaon() */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt) && dqopt->files[cnt]) {
                        inode_lock(dqopt->files[cnt]);
                        truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
                        inode_unlock(dqopt->files[cnt]);
                }
        if (sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
put_inodes:
        /* We are done when suspending quotas */
        if (flags & DQUOT_SUSPENDED)
                return 0;

        /* Release the quota inode of every type that is no longer loaded */
        for (cnt = 0; cnt < MAXQUOTAS; cnt++)
                if (!sb_has_quota_loaded(sb, cnt))
                        vfs_cleanup_quota_inode(sb, cnt);
        return 0;
}
EXPORT_SYMBOL(dquot_disable);

/*
 * Turn quota of @type completely off on @sb: both usage accounting and
 * limit enforcement are disabled through dquot_disable().
 */
int dquot_quota_off(struct super_block *sb, int type)
{
        const unsigned int off_flags = DQUOT_USAGE_ENABLED |
                                       DQUOT_LIMITS_ENABLED;

        return dquot_disable(sb, type, off_flags);
}
EXPORT_SYMBOL(dquot_quota_off);

/*
 *        Turn quotas on on a device
 */

/*
 * Validate @inode as a quota file for @type and remember a reference to it
 * in the superblock's quota_info.
 *
 * Returns 0 on success, or:
 *   -EUCLEAN  the inode is bad
 *   -EACCES   the inode is not a regular file
 *   -EROFS    the inode is read-only
 *   -EBUSY    quota of @type is already loaded
 *   -EINVAL   the inode is encrypted
 *   -EIO      no reference to the inode could be grabbed
 */
static int vfs_setup_quota_inode(struct inode *inode, int type)
{
        struct super_block *sb = inode->i_sb;
        struct quota_info *qinfo = sb_dqopt(sb);
        struct inode *held;

        if (is_bad_inode(inode))
                return -EUCLEAN;
        if (!S_ISREG(inode->i_mode))
                return -EACCES;
        if (IS_RDONLY(inode))
                return -EROFS;
        if (sb_has_quota_loaded(sb, type))
                return -EBUSY;

        /*
         * Quota files are filesystem metadata rather than user data and must
         * never be encrypted. Internal (new-style) quota files cannot be
         * encrypted by users, but an old-style external quota file might
         * have been created inside an encrypted directory by mistake, so
         * check explicitly. Encrypted quota files would not work because
         * (1) some filesystems supporting encryption do not handle it in
         * their quota_read and quota_write, and (2) quota files are cleaned
         * up later than user files at unmount, which would need special
         * care.
         */
        if (IS_ENCRYPTED(inode))
                return -EINVAL;

        held = igrab(inode);
        qinfo->files[type] = held;
        if (!held)
                return -EIO;

        /* Hidden system quota files need no further treatment */
        if (qinfo->flags & DQUOT_QUOTA_SYS_FILE)
                return 0;

        /*
         * Quota and atime must not be applied to the quota file itself
         * (deadlocks possible). Nobody should write to the file either - we
         * use special IO operations which ignore the immutable bit.
         */
        inode_lock(inode);
        inode->i_flags |= S_NOQUOTA;
        inode_unlock(inode);
        /*
         * With S_NOQUOTA set no new dquot references can be added, so drop
         * the ones the inode already holds.
         */
        __dquot_drop(inode);
        return 0;
}

int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
        unsigned int flags)
{
        struct quota_format_type *fmt;
        struct quota_info *dqopt = sb_dqopt(sb);
        int error;

        lockdep_assert_held_write(&sb->s_umount);

        /* Just unsuspend quotas? */
        if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED))
                return -EINVAL;

        fmt = find_quota_format(format_id);
        if (!fmt)
                return -ESRCH;
        if (!sb->dq_op || !sb->s_qcop ||
            (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Filesystems outside of init_user_ns not yet supported */
        if (sb->s_user_ns != &init_user_ns) {
                error = -EINVAL;
                goto out_fmt;
        }
        /* Usage always has to be set... */
        if (!(flags & DQUOT_USAGE_ENABLED)) {
                error = -EINVAL;
                goto out_fmt;
        }
        if (sb_has_quota_loaded(sb, type)) {
                error = -EBUSY;
                goto out_fmt;
        }

        if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
                /* As we bypass the pagecache we must now flush all the
                 * dirty data and invalidate caches so that kernel sees
                 * changes from userspace. It is not enough to just flush
                 * the quota file since if blocksize < pagesize, invalidation
                 * of the cache could fail because of other unrelated dirty
                 * data */
                sync_filesystem(sb);
                invalidate_bdev(sb->s_bdev);
        }

        error = -EINVAL;
        if (!fmt->qf_ops->check_quota_file(sb, type))
                goto out_fmt;

        dqopt->ops[type] = fmt->qf_ops;
        dqopt->info[type].dqi_format = fmt;
        dqopt->info[type].dqi_fmt_id = format_id;
        INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
        error = dqopt->ops[type]->read_file_info(sb, type);
        if (error < 0)
                goto out_fmt;
        if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
                spin_lock(&dq_data_lock);
                dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
                spin_unlock(&dq_data_lock);
        }
        spin_lock(&dq_state_lock);
        dqopt->flags |= dquot_state_flag(flags, type);
        spin_unlock(&dq_state_lock);

        error = add_dquot_ref(sb, type);
        if (error)
                dquot_disable(sb, type,
                              DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);

        return error;
out_fmt:
        put_quota_format(fmt);

        return error;
}
EXPORT_SYMBOL(dquot_load_quota_sb);

/*
 * Turn quota on using the given quota file @inode, with the caller choosing
 * the individual quota state @flags.
 *
 * The inode is first set up as the quota file for @type, then the quota is
 * loaded on the inode's superblock. If loading fails the quota inode setup
 * is undone. Returns 0 or a negative errno.
 */
int dquot_load_quota_inode(struct inode *inode, int type, int format_id,
        unsigned int flags)
{
        struct super_block *sb = inode->i_sb;
        int err = vfs_setup_quota_inode(inode, type);

        if (err >= 0) {
                err = dquot_load_quota_sb(sb, type, format_id, flags);
                if (err < 0)
                        vfs_cleanup_quota_inode(sb, type);
        }
        return err;
}
EXPORT_SYMBOL(dquot_load_quota_inode);

/*
 * Reenable quotas on remount RW
 *
 * @sb:   superblock whose suspended quotas should be resumed; s_umount must
 *        be held for writing
 * @type: quota type to resume, or -1 for every type
 *
 * Every suspended type selected by @type is reloaded with the usage/limits
 * state it had when it was suspended. All selected types are attempted even
 * if one of them fails. Returns 0 when every attempted type was resumed,
 * otherwise the first error encountered (a later success does not hide an
 * earlier failure).
 */
int dquot_resume(struct super_block *sb, int type)
{
        struct quota_info *dqopt = sb_dqopt(sb);
        int ret = 0, err, cnt;
        unsigned int flags;

        rwsem_assert_held_write(&sb->s_umount);

        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
                if (type != -1 && cnt != type)
                        continue;
                if (!sb_has_quota_suspended(sb, cnt))
                        continue;

                /*
                 * Remember which of usage/limits were enabled, then clear
                 * all state bits of this type so it can be loaded afresh.
                 */
                spin_lock(&dq_state_lock);
                flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
                                                        DQUOT_LIMITS_ENABLED,
                                                        cnt);
                dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
                spin_unlock(&dq_state_lock);

                /* Convert the per-type state bits back to generic flags */
                flags = dquot_generic_flag(flags, cnt);
                err = dquot_load_quota_sb(sb, cnt, dqopt->info[cnt].dqi_fmt_id,
                                          flags);
                if (err < 0) {
                        vfs_cleanup_quota_inode(sb, cnt);
                        /* Keep the first failure for the caller */
                        if (!ret)
                                ret = err;
                }
        }

        return ret;
}
EXPORT_SYMBOL(dquot_resume);

int dquot_quota_on(struct super_block *sb, int type, int format_id,
                   const struct path *path)
{
        int error = security_quota_on(path->dentry);
        if (error)
                return error;
        /* Quota file not on the same filesystem? */
        if (path->dentry->d_sb != sb)
                error = -EXDEV;
        else
                error = dquot_load_quota_inode(d_inode(path->dentry), type,
                                             format_id, DQUOT_USAGE_ENABLED |
                                             DQUOT_LIMITS_ENABLED);
        return error;
}
EXPORT_SYMBOL(dquot_quota_on);

/*
 * Helper for filesystems that need to initialize quotas at mount time.
 *
 * Looks up the quota file @qf_name in the root directory of @sb and, if the
 * security hook permits, loads quota of @type from it with usage accounting
 * and limit enforcement enabled. Returns 0 or a negative errno.
 */
int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
                int format_id, int type)
{
        struct dentry *qf_dentry;
        int err;

        qf_dentry = lookup_positive_unlocked(qf_name, sb->s_root,
                                             strlen(qf_name));
        if (IS_ERR(qf_dentry))
                return PTR_ERR(qf_dentry);

        err = security_quota_on(qf_dentry);
        if (err)
                goto out_dput;

        err = dquot_load_quota_inode(d_inode(qf_dentry), type, format_id,
                                     DQUOT_USAGE_ENABLED |
                                     DQUOT_LIMITS_ENABLED);
out_dput:
        dput(qf_dentry);
        return err;
}
EXPORT_SYMBOL(dquot_quota_on_mount);

/*
 * Enable limit enforcement for the quota types selected in @flags.
 *
 * Only supported when quota files are hidden system files (-ENOSYS
 * otherwise). Accounting bits in @flags are ignored; if nothing remains the
 * call fails with -EINVAL. A selected type without usage accounting yields
 * -EINVAL and a type that already enforces limits yields -EEXIST; in both
 * cases enforcement enabled earlier in this call is rolled back.
 */
static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        int err = 0;
        int t;

        if (!(qinfo->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;

        /* Accounting cannot be turned on while fs is mounted */
        flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
        if (!flags)
                return -EINVAL;

        for (t = 0; t < MAXQUOTAS; t++) {
                if (!(flags & qtype_enforce_flag(t)))
                        continue;
                /* Can't enforce without accounting */
                if (!sb_has_quota_usage_enabled(sb, t)) {
                        err = -EINVAL;
                        break;
                }
                /* compatible with XFS */
                if (sb_has_quota_limits_enabled(sb, t)) {
                        err = -EEXIST;
                        break;
                }
                spin_lock(&dq_state_lock);
                qinfo->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, t);
                spin_unlock(&dq_state_lock);
        }
        if (!err)
                return 0;

        /* Undo the enforcement enabled for the types before the failing one */
        while (--t >= 0) {
                if (flags & qtype_enforce_flag(t))
                        dquot_disable(sb, t, DQUOT_LIMITS_ENABLED);
        }
        return err;
}

/*
 * Disable limit enforcement for the quota types selected in @flags.
 *
 * Only supported when quota files are hidden system files (-ENOSYS
 * otherwise). Requests to turn off accounting are refused with -EOPNOTSUPP.
 * Types whose limits are not currently enforced are dropped from the
 * request; if none remain the call fails with -EEXIST. If disabling one
 * type fails, enforcement already disabled by this call is turned back on
 * and the error is returned.
 */
static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
{
        struct quota_info *qinfo = sb_dqopt(sb);
        int err = 0;
        int t;

        if (!(qinfo->flags & DQUOT_QUOTA_SYS_FILE))
                return -ENOSYS;
        /*
         * Turning off accounting via quotactl is not supported. The quota
         * infrastructure could do it in principle, but filesystems don't
         * expect userspace to be able to.
         */
        if (flags & (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
                return -EOPNOTSUPP;

        /* Filter out limits not enabled */
        for (t = 0; t < MAXQUOTAS; t++) {
                if (sb_has_quota_limits_enabled(sb, t))
                        continue;
                flags &= ~qtype_enforce_flag(t);
        }
        /* Nothing left? */
        if (!flags)
                return -EEXIST;

        for (t = 0; t < MAXQUOTAS; t++) {
                if (!(flags & qtype_enforce_flag(t)))
                        continue;
                err = dquot_disable(sb, t, DQUOT_LIMITS_ENABLED);
                if (err < 0)
                        break;
        }
        /* Loop ran to completion: every requested type was handled */
        if (t == MAXQUOTAS)
                return 0;

        /* Re-enable the enforcement disabled for the earlier types */
        while (--t >= 0) {
                if (!(flags & qtype_enforce_flag(t)))
                        continue;
                spin_lock(&dq_state_lock);
                qinfo->flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, t);
                spin_unlock(&dq_state_lock);
        }
        return err;
}

/*
 * Copy the common part of the in-memory quota block of @dquot into @di.
 *
 * @di is zeroed first; the fields are then filled in under dq_dqb_lock.
 * The reported space is the sum of current and reserved space.
 */
static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *mem = &dquot->dq_dqb;

        memset(di, 0, sizeof(*di));

        spin_lock(&dquot->dq_dqb_lock);
        /* space */
        di->d_spc_hardlimit = mem->dqb_bhardlimit;
        di->d_spc_softlimit = mem->dqb_bsoftlimit;
        di->d_space = mem->dqb_curspace + mem->dqb_rsvspace;
        di->d_spc_timer = mem->dqb_btime;
        /* inodes */
        di->d_ino_hardlimit = mem->dqb_ihardlimit;
        di->d_ino_softlimit = mem->dqb_isoftlimit;
        di->d_ino_count = mem->dqb_curinodes;
        di->d_ino_timer = mem->dqb_itime;
        spin_unlock(&dquot->dq_dqb_lock);
}

/*
 * Fetch the quota block for @qid on @sb into @di.
 *
 * Returns 0 on success or the error encoded in the pointer returned by
 * dqget().
 */
int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
                    struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_dqblk);

/*
 * Advance @qid through the filesystem's get_next_id() operation and fetch
 * the quota block of the resulting id into @di.
 *
 * Returns -ENOSYS when the filesystem has no get_next_id() operation, a
 * negative error from get_next_id() or dqget(), or 0 on success.
 */
int dquot_get_next_dqblk(struct super_block *sb, struct kqid *qid,
                         struct qc_dqblk *di)
{
        struct dquot *dq;
        int rc;

        if (!sb->dq_op->get_next_id)
                return -ENOSYS;

        rc = sb->dq_op->get_next_id(sb, qid);
        if (rc < 0)
                return rc;

        dq = dqget(sb, *qid);
        if (IS_ERR(dq))
                return PTR_ERR(dq);

        do_get_dqblk(dq, di);
        dqput(dq);
        return 0;
}
EXPORT_SYMBOL(dquot_get_next_dqblk);

/*
 * Set of qc_dqblk field-mask bits accepted by do_set_dqblk(); a request with
 * any other bit set in d_fieldmask is rejected there with -EINVAL.
 */
#define VFS_QC_MASK \
        (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
         QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
         QC_SPC_TIMER | QC_INO_TIMER)

/*
 * Generic routine for setting common part of quota structure
 *
 * Applies the fields selected by @di->d_fieldmask to the in-memory quota
 * block of @dquot, recomputes grace times and the DQ_FAKE_B flag, and marks
 * the dquot dirty.
 *
 * Returns -EINVAL when the mask contains bits outside VFS_QC_MASK, -ERANGE
 * when a requested limit exceeds the maximum recorded in the type's
 * mem_dqinfo, a negative error from mark_dquot_dirty(), or 0 on success.
 */
static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
        struct mem_dqblk *dm = &dquot->dq_dqb;
        int check_blim = 0, check_ilim = 0;
        struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
        int ret;

        if (di->d_fieldmask & ~VFS_QC_MASK)
                return -EINVAL;

        /* Validate every limit that is about to be set before changing anything */
        if (((di->d_fieldmask & QC_SPC_SOFT) &&
             di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_SPC_HARD) &&
             di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
            ((di->d_fieldmask & QC_INO_SOFT) &&
             (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
            ((di->d_fieldmask & QC_INO_HARD) &&
             (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
                return -ERANGE;

        spin_lock(&dquot->dq_dqb_lock);
        if (di->d_fieldmask & QC_SPACE) {
                /* d_space includes reserved space; store only the non-reserved part */
                dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_SOFT)
                dm->dqb_bsoftlimit = di->d_spc_softlimit;
        if (di->d_fieldmask & QC_SPC_HARD)
                dm->dqb_bhardlimit = di->d_spc_hardlimit;
        if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_COUNT) {
                dm->dqb_curinodes = di->d_ino_count;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_SOFT)
                dm->dqb_isoftlimit = di->d_ino_softlimit;
        if (di->d_fieldmask & QC_INO_HARD)
                dm->dqb_ihardlimit = di->d_ino_hardlimit;
        if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_SPC_TIMER) {
                dm->dqb_btime = di->d_spc_timer;
                check_blim = 1;
                set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
        }

        if (di->d_fieldmask & QC_INO_TIMER) {
                dm->dqb_itime = di->d_ino_timer;
                check_ilim = 1;
                set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
        }

        /*
         * Space changed: clear the grace time when there is no soft limit or
         * usage is within it, otherwise start a grace period unless the
         * caller supplied a timer.
         */
        if (check_blim) {
                if (!dm->dqb_bsoftlimit ||
                    dm->dqb_curspace + dm->dqb_rsvspace <= dm->dqb_bsoftlimit) {
                        dm->dqb_btime = 0;
                        clear_bit(DQ_BLKS_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_SPC_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
        }
        /* Same treatment for the inode grace time */
        if (check_ilim) {
                if (!dm->dqb_isoftlimit ||
                    dm->dqb_curinodes <= dm->dqb_isoftlimit) {
                        dm->dqb_itime = 0;
                        clear_bit(DQ_INODES_B, &dquot->dq_flags);
                } else if (!(di->d_fieldmask & QC_INO_TIMER))
                        /* Set grace only if user hasn't provided his own... */
                        dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
        }
        /* DQ_FAKE_B is set exactly when no limit at all is configured */
        if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
            dm->dqb_isoftlimit)
                clear_bit(DQ_FAKE_B, &dquot->dq_flags);
        else
                set_bit(DQ_FAKE_B, &dquot->dq_flags);
        spin_unlock(&dquot->dq_dqb_lock);
        /* Only negative results from mark_dquot_dirty() are reported as errors */
        ret = mark_dquot_dirty(dquot);
        if (ret < 0)
                return ret;
        return 0;
}

/*
 * Apply the quota block settings in @di to the dquot identified by @qid on
 * @sb.
 *
 * Returns the error encoded in the pointer from dqget() if the dquot cannot
 * be obtained, otherwise the result of do_set_dqblk().
 */
int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
                  struct qc_dqblk *di)
{
        struct dquot *dq = dqget(sb, qid);
        int err;

        if (IS_ERR(dq))
                return PTR_ERR(dq);

        err = do_set_dqblk(dq, di);
        dqput(dq);
        return err;
}
EXPORT_SYMBOL(dquot_set_dqblk);

/* Generic routine for getting common part of quota file information */
int dquot_get_state(struct super_block *sb, struct qc_state *state)
{
        struct mem_dqinfo *mi;
        struct qc_type_state *tstate;
        struct quota_info *dqopt = sb_dqopt(sb);
        int type;

        memset(state, 0, sizeof(*state));
        for (type = 0; type < MAXQUOTAS; type++) {
                if (!sb_has_quota_active(sb, type))
                        continue;
                tstate = state->s_state + type;
                mi = sb_dqopt(sb)->info + type;
                tstate->flags = QCI_ACCT_ENABLED;
                spin_lock(&dq_data_lock);
                if (mi->dqi_flags & DQF_SYS_FILE)
                        tstate->flags |= QCI_SYSFILE;
                if (mi->dqi_flags & DQF_ROOT_SQUASH)
                        tstate->flags |= QCI_ROOT_SQUASH;
                if (sb_has_quota_limits_enabled(sb, type))
                        tstate->flags |= QCI_LIMITS_ENFORCED;
                tstate->spc_timelimit = mi->dqi_bgrace;
                tstate->ino_timelimit = mi->dqi_igrace;
                if (dqopt->files[type]) {
                        tstate->ino = dqopt->files[type]->i_ino;
                        tstate->blocks = dqopt->files[type]->i_blocks;
                }
                tstate->nextents = 1;        /* We don't know... */
                spin_unlock(&dq_data_lock);
        }
        return 0;
}
EXPORT_SYMBOL(dquot_get_state);

/*
 * Generic routine updating the common quota file information of @type.
 *
 * Grace times and the root-squash flag selected in @ii->i_fieldmask are
 * stored under dq_data_lock, the info is marked dirty and then written out
 * immediately.
 *
 * Returns -EINVAL when warning limits or the realtime space timer are
 * requested, or when root squash is requested for a format other than
 * QFMT_VFS_OLD; -ESRCH when quota of @type is not active; otherwise the
 * result of the write_info() operation.
 */
int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii)
{
        struct mem_dqinfo *info;

        if (ii->i_fieldmask & (QC_WARNS_MASK | QC_RT_SPC_TIMER))
                return -EINVAL;
        if (!sb_has_quota_active(sb, type))
                return -ESRCH;

        info = &sb_dqopt(sb)->info[type];
        if ((ii->i_fieldmask & QC_FLAGS) &&
            (ii->i_flags & QCI_ROOT_SQUASH) &&
            info->dqi_format->qf_fmt_id != QFMT_VFS_OLD)
                return -EINVAL;

        spin_lock(&dq_data_lock);
        if (ii->i_fieldmask & QC_SPC_TIMER)
                info->dqi_bgrace = ii->i_spc_timelimit;
        if (ii->i_fieldmask & QC_INO_TIMER)
                info->dqi_igrace = ii->i_ino_timelimit;
        if (ii->i_fieldmask & QC_FLAGS) {
                if (ii->i_flags & QCI_ROOT_SQUASH)
                        info->dqi_flags |= DQF_ROOT_SQUASH;
                else
                        info->dqi_flags &= ~DQF_ROOT_SQUASH;
        }
        spin_unlock(&dq_data_lock);

        mark_info_dirty(sb, type);
        /* Force write to disk */
        return sb->dq_op->write_info(sb, type);
}
EXPORT_SYMBOL(dquot_set_dqinfo);

/*
 * Exported quotactl operations built from the generic helpers. The
 * enable/disable handlers used here (dquot_quota_enable() and
 * dquot_quota_disable()) return -ENOSYS unless DQUOT_QUOTA_SYS_FILE is set
 * for the superblock.
 */
const struct quotactl_ops dquot_quotactl_sysfile_ops = {
        .quota_enable        = dquot_quota_enable,
        .quota_disable        = dquot_quota_disable,
        .quota_sync        = dquot_quota_sync,
        .get_state        = dquot_get_state,
        .set_info        = dquot_set_dqinfo,
        .get_dqblk        = dquot_get_dqblk,
        .get_nextdqblk        = dquot_get_next_dqblk,
        .set_dqblk        = dquot_set_dqblk
};
EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);

/*
 * sysctl handler for the dqstats entries.
 *
 * The statistic index is derived from the position of @table->data within
 * dqstats.stat. The matching per-cpu counter is summed, stored into that
 * slot and the request is then served by proc_doulongvec_minmax().
 */
static int do_proc_dqstats(const struct ctl_table *table, int write,
                     void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned int idx = (unsigned long *)table->data - dqstats.stat;
        s64 sum = percpu_counter_sum(&dqstats.counter[idx]);

        /*
         * The allocated/free dquot counters are not monotonic; a negative
         * sum is reported as 0 for them.
         */
        if (sum < 0) {
                if (idx == DQST_ALLOC_DQUOTS || idx == DQST_FREE_DQUOTS)
                        sum = 0;
        }

        /* Refresh the slot that the generic handler reads */
        dqstats.stat[idx] = sum;
        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/*
 * sysctl table registered under "fs/quota" by dquot_init().
 *
 * Each statistics entry is read-only (mode 0444), points at one slot of
 * dqstats.stat and is served by do_proc_dqstats(), which refreshes the slot
 * from the per-cpu counter before it is read.
 */
static const struct ctl_table fs_dqstats_table[] = {
        {
                .procname        = "lookups",
                .data                = &dqstats.stat[DQST_LOOKUPS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "drops",
                .data                = &dqstats.stat[DQST_DROPS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "reads",
                .data                = &dqstats.stat[DQST_READS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "writes",
                .data                = &dqstats.stat[DQST_WRITES],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "cache_hits",
                .data                = &dqstats.stat[DQST_CACHE_HITS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "allocated_dquots",
                .data                = &dqstats.stat[DQST_ALLOC_DQUOTS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "free_dquots",
                .data                = &dqstats.stat[DQST_FREE_DQUOTS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
        {
                .procname        = "syncs",
                .data                = &dqstats.stat[DQST_SYNCS],
                .maxlen                = sizeof(unsigned long),
                .mode                = 0444,
                .proc_handler        = do_proc_dqstats,
        },
#ifdef CONFIG_PRINT_QUOTA_WARNING
        /* Writable (0644) int backed by flag_print_warnings */
        {
                .procname        = "warnings",
                .data                = &flag_print_warnings,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
};

/*
 * Boot-time initialisation of the generic quota code.
 *
 * Registers the "fs/quota" sysctl table, creates the dquot slab cache,
 * allocates and initialises the dquot hash table and the per-cpu statistics
 * counters, and registers the dquot cache shrinker. Failure to allocate the
 * hash table, the counters or the shrinker panics; otherwise 0 is returned.
 */
static int __init dquot_init(void)
{
        int ret;
        unsigned long i, nr_hash, order;
        struct shrinker *dqcache_shrinker;

        printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);

        register_sysctl_init("fs/quota", fs_dqstats_table);

        /* SLAB_PANIC is passed, so the result is not checked here */
        dquot_cachep = kmem_cache_create("dquot",
                        sizeof(struct dquot), sizeof(unsigned long) * 4,
                        (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
                                SLAB_PANIC),
                        NULL);

        order = 0;
        dquot_hash = (struct hlist_head *)__get_free_pages(GFP_KERNEL, order);
        if (!dquot_hash)
                panic("Cannot create dquot hash table");

        ret = percpu_counter_init_many(dqstats.counter, 0, GFP_KERNEL,
                                       _DQST_DQSTAT_LAST);
        if (ret)
                panic("Cannot create dquot stat counters");

        /* Find power-of-two hlist_heads which can fit into allocation */
        nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
        dq_hash_bits = ilog2(nr_hash);

        nr_hash = 1UL << dq_hash_bits;
        dq_hash_mask = nr_hash - 1;
        /* Index is unsigned long to match nr_hash (no signed/unsigned compare) */
        for (i = 0; i < nr_hash; i++)
                INIT_HLIST_HEAD(dquot_hash + i);

        /* All three values are unsigned long, hence %lu */
        pr_info("VFS: Dquot-cache hash table entries: %lu (order %lu,"
                " %lu bytes)\n", nr_hash, order, (PAGE_SIZE << order));

        dqcache_shrinker = shrinker_alloc(0, "dquota-cache");
        if (!dqcache_shrinker)
                panic("Cannot allocate dquot shrinker");

        dqcache_shrinker->count_objects = dqcache_shrink_count;
        dqcache_shrinker->scan_objects = dqcache_shrink_scan;

        shrinker_register(dqcache_shrinker);

        return 0;
}
fs_initcall(dquot_init);










































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




























    3 
































    3 
    3 


    3 



















    3 






    3 


    3 


































    3 




















    3 




































    3 
    3 


    3 

















    3 






    3 


    3 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
// SPDX-License-Identifier: GPL-2.0
/*
 * NETLINK      Netlink attributes
 *
 *                 Authors:        Thomas Graf <tgraf@suug.ch>
 *                                 Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 */

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/nospec.h>
#include <linux/skbuff.h>
#include <linux/string.h>
#include <linux/types.h>
#include <net/netlink.h>

/*
 * Exact payload sizes of the fixed-width integer attribute types.
 *
 * An attribute of one of these types should carry exactly this many bytes.
 * To stay compatible with broken commands a mismatch is not rejected
 * unconditionally: validate_nla() emits a rate-limited warning telling the
 * user that the command sends invalid data, and only fails the attribute
 * when NL_VALIDATE_STRICT_ATTRS is set. Types not listed here are zero,
 * i.e. they have no exact-length requirement.
 */
static const u8 nla_attr_len[NLA_TYPE_MAX + 1] = {
        /* signed integers */
        [NLA_S8]   = sizeof(s8),
        [NLA_S16]  = sizeof(s16),
        [NLA_S32]  = sizeof(s32),
        [NLA_S64]  = sizeof(s64),

        /* unsigned integers */
        [NLA_U8]   = sizeof(u8),
        [NLA_U16]  = sizeof(u16),
        [NLA_U32]  = sizeof(u32),
        [NLA_U64]  = sizeof(u64),

        /* big-endian integers */
        [NLA_BE16] = sizeof(__be16),
        [NLA_BE32] = sizeof(__be32),
};

/*
 * Minimum payload sizes per attribute type.
 *
 * validate_nla() falls back to this table when a policy entry does not
 * specify its own length, and nla_policy_len() uses it for size estimates.
 * Types not listed here are zero, i.e. they have no minimum.
 */
static const u8 nla_attr_minlen[NLA_TYPE_MAX + 1] = {
        /* signed integers */
        [NLA_S8]     = sizeof(s8),
        [NLA_S16]    = sizeof(s16),
        [NLA_S32]    = sizeof(s32),
        [NLA_S64]    = sizeof(s64),

        /* unsigned integers */
        [NLA_U8]     = sizeof(u8),
        [NLA_U16]    = sizeof(u16),
        [NLA_U32]    = sizeof(u32),
        [NLA_U64]    = sizeof(u64),
        [NLA_MSECS]  = sizeof(u64),

        /* big-endian integers */
        [NLA_BE16]   = sizeof(__be16),
        [NLA_BE32]   = sizeof(__be32),

        /* a non-empty nest holds at least one attribute header */
        [NLA_NESTED] = NLA_HDRLEN,
};

/*
 * Nested policies might refer back to the original
 * policy in some cases, and userspace could try to
 * abuse that and recurse by nesting in the right
 * ways. Limit recursion to avoid this problem.
 *
 * __nla_validate_parse() fails with -EINVAL as soon as the nesting
 * depth it is called with reaches this value.
 */
#define MAX_POLICY_RECURSION_DEPTH        10

/*
 * Forward declaration: nla_validate_array() and validate_nla() call this
 * function for nested attributes before its definition further down.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth);

/*
 * Check the payload of a bitfield32 attribute against the mask of flags
 * that the policy permits.
 *
 * Returns 0 if the selector/value pair is acceptable, -EINVAL otherwise.
 */
static int validate_nla_bitfield32(const struct nlattr *nla,
                                   const u32 valid_flags_mask)
{
        const struct nla_bitfield32 *bits = nla_data(nla);
        const u32 forbidden = ~valid_flags_mask;

        /* a policy that permits no flag at all cannot be satisfied */
        if (valid_flags_mask == 0)
                return -EINVAL;

        /* neither selector nor value may use bits outside the mask */
        if ((bits->selector & forbidden) || (bits->value & forbidden))
                return -EINVAL;

        /* every bit set in the value must also have been selected */
        if (bits->value & ~bits->selector)
                return -EINVAL;

        return 0;
}

/*
 * Validate every element of a nested array.
 *
 * Each non-empty element of the stream starting at @head is itself a
 * container of attributes and is validated against @policy one nesting
 * level deeper than @depth. Empty elements are skipped.
 *
 * Returns 0 on success, -ERANGE for an element shorter than an attribute
 * header, or the error returned by the nested validation.
 */
static int nla_validate_array(const struct nlattr *head, int len, int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack,
                              unsigned int validate, unsigned int depth)
{
        const struct nlattr *elem;
        int remaining;

        nla_for_each_attr(elem, head, len, remaining) {
                int elem_len = nla_len(elem);
                int err;

                if (!elem_len)
                        continue;

                if (elem_len < NLA_HDRLEN) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, elem, policy,
                                                "Array element too short");
                        return -ERANGE;
                }

                err = __nla_validate_parse(nla_data(elem), elem_len, maxtype,
                                           policy, validate, extack, NULL,
                                           depth + 1);
                if (err < 0)
                        return err;
        }

        return 0;
}

void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range)
{
        WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR &&
                     (pt->min < 0 || pt->max < 0));

        range->min = 0;

        switch (pt->type) {
        case NLA_U8:
                range->max = U8_MAX;
                break;
        case NLA_U16:
        case NLA_BE16:
        case NLA_BINARY:
                range->max = U16_MAX;
                break;
        case NLA_U32:
        case NLA_BE32:
                range->max = U32_MAX;
                break;
        case NLA_U64:
        case NLA_UINT:
        case NLA_MSECS:
                range->max = U64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

static int nla_validate_range_unsigned(const struct nla_policy *pt,
                                       const struct nlattr *nla,
                                       struct netlink_ext_ack *extack,
                                       unsigned int validate)
{
        struct netlink_range_validation range;
        u64 value;

        switch (pt->type) {
        case NLA_U8:
                value = nla_get_u8(nla);
                break;
        case NLA_U16:
                value = nla_get_u16(nla);
                break;
        case NLA_U32:
                value = nla_get_u32(nla);
                break;
        case NLA_U64:
                value = nla_get_u64(nla);
                break;
        case NLA_UINT:
                value = nla_get_uint(nla);
                break;
        case NLA_MSECS:
                value = nla_get_u64(nla);
                break;
        case NLA_BINARY:
                value = nla_len(nla);
                break;
        case NLA_BE16:
                value = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                value = ntohl(nla_get_be32(nla));
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_unsigned(pt, &range);

        if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG &&
            pt->type == NLA_BINARY && value > range.max) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, pt->type);
                if (validate & NL_VALIDATE_STRICT_ATTRS) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }

                /* this assumes min <= max (don't validate against min) */
                return 0;
        }

        if (value < range.min || value > range.max) {
                bool binary = pt->type == NLA_BINARY;

                if (binary)
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "binary attribute size out of range");
                else
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "integer out of range");

                return -ERANGE;
        }

        return 0;
}

void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range)
{
        switch (pt->type) {
        case NLA_S8:
                range->min = S8_MIN;
                range->max = S8_MAX;
                break;
        case NLA_S16:
                range->min = S16_MIN;
                range->max = S16_MAX;
                break;
        case NLA_S32:
                range->min = S32_MIN;
                range->max = S32_MAX;
                break;
        case NLA_S64:
        case NLA_SINT:
                range->min = S64_MIN;
                range->max = S64_MAX;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        switch (pt->validation_type) {
        case NLA_VALIDATE_RANGE:
                range->min = pt->min;
                range->max = pt->max;
                break;
        case NLA_VALIDATE_RANGE_PTR:
                *range = *pt->range_signed;
                break;
        case NLA_VALIDATE_MIN:
                range->min = pt->min;
                break;
        case NLA_VALIDATE_MAX:
                range->max = pt->max;
                break;
        default:
                break;
        }
}

static int nla_validate_int_range_signed(const struct nla_policy *pt,
                                         const struct nlattr *nla,
                                         struct netlink_ext_ack *extack)
{
        struct netlink_range_validation_signed range;
        s64 value;

        switch (pt->type) {
        case NLA_S8:
                value = nla_get_s8(nla);
                break;
        case NLA_S16:
                value = nla_get_s16(nla);
                break;
        case NLA_S32:
                value = nla_get_s32(nla);
                break;
        case NLA_S64:
                value = nla_get_s64(nla);
                break;
        case NLA_SINT:
                value = nla_get_sint(nla);
                break;
        default:
                return -EINVAL;
        }

        nla_get_range_signed(pt, &range);

        if (value < range.min || value > range.max) {
                NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                        "integer out of range");
                return -ERANGE;
        }

        return 0;
}

/*
 * Dispatch a range check to the signed or unsigned helper, depending on
 * the policy's attribute type.
 *
 * Returns the helper's result, or -EINVAL (after a warning) for a type
 * that supports no range validation.
 */
static int nla_validate_int_range(const struct nla_policy *pt,
                                  const struct nlattr *nla,
                                  struct netlink_ext_ack *extack,
                                  unsigned int validate)
{
        switch (pt->type) {
        case NLA_S8:
        case NLA_S16:
        case NLA_S32:
        case NLA_S64:
        case NLA_SINT:
                return nla_validate_int_range_signed(pt, nla, extack);

        case NLA_U8:
        case NLA_U16:
        case NLA_U32:
        case NLA_U64:
        case NLA_UINT:
        case NLA_MSECS:
        case NLA_BE16:
        case NLA_BE32:
        case NLA_BINARY:
                return nla_validate_range_unsigned(pt, nla, extack, validate);

        default:
                break;
        }

        WARN_ON(1);
        return -EINVAL;
}

/*
 * Verify that an unsigned integer attribute sets no bit outside the mask
 * given by the policy.
 *
 * Returns 0 if only permitted bits are set, -EINVAL if a reserved bit is
 * set or the attribute type is not supported.
 */
static int nla_validate_mask(const struct nla_policy *pt,
                             const struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        u64 bits;

        switch (pt->type) {
        case NLA_U8:
                bits = nla_get_u8(nla);
                break;
        case NLA_U16:
                bits = nla_get_u16(nla);
                break;
        case NLA_U32:
                bits = nla_get_u32(nla);
                break;
        case NLA_U64:
                bits = nla_get_u64(nla);
                break;
        case NLA_UINT:
                bits = nla_get_uint(nla);
                break;
        case NLA_BE16:
                bits = ntohs(nla_get_be16(nla));
                break;
        case NLA_BE32:
                bits = ntohl(nla_get_be32(nla));
                break;
        default:
                return -EINVAL;
        }

        if (!(bits & ~(u64)pt->mask))
                return 0;

        NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set");
        return -EINVAL;
}

/*
 * validate_nla - check a single attribute against its policy entry
 * @nla: attribute to validate
 * @maxtype: highest attribute type covered by @policy
 * @policy: policy array, indexed by attribute type
 * @validate: NL_VALIDATE_* strictness flags
 * @extack: extended ACK used for error reporting
 * @depth: current nesting depth, passed on to nested validation
 *
 * Attributes whose type is not within 1..@maxtype are accepted without any
 * check. The attribute is first checked according to the policy's attribute
 * type (length, flags, nesting) and then according to the policy's
 * validation type (range, mask or callback).
 *
 * Returns 0 on success or a negative error code. Failures reported through
 * the out_err label use -ERANGE unless the failing case set another code.
 */
static int validate_nla(const struct nlattr *nla, int maxtype,
                        const struct nla_policy *policy, unsigned int validate,
                        struct netlink_ext_ack *extack, unsigned int depth)
{
        u16 strict_start_type = policy[0].strict_start_type;
        const struct nla_policy *pt;
        int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla);
        int err = -ERANGE;

        /* types at or above strict_start_type are always validated strictly */
        if (strict_start_type && type >= strict_start_type)
                validate |= NL_VALIDATE_STRICT;

        /* types outside of the policy are not validated here */
        if (type <= 0 || type > maxtype)
                return 0;

        type = array_index_nospec(type, maxtype + 1);
        pt = &policy[type];

        BUG_ON(pt->type > NLA_TYPE_MAX);

        /*
         * Fixed-size types: always warn about a wrong length, but reject
         * it only when strict attribute validation was requested.
         */
        if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) {
                pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n",
                                    current->comm, type);
                if (validate & NL_VALIDATE_STRICT_ATTRS) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
        }

        /* the NLA_F_NESTED flag must be set exactly on nested attributes */
        if (validate & NL_VALIDATE_NESTED) {
                if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) &&
                    !(nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED is missing");
                        return -EINVAL;
                }
                if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY &&
                    pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "NLA_F_NESTED not expected");
                        return -EINVAL;
                }
        }

        switch (pt->type) {
        case NLA_REJECT:
                /* use the policy's own message if it can be reported */
                if (extack && pt->reject_message) {
                        NL_SET_BAD_ATTR(extack, nla);
                        extack->_msg = pt->reject_message;
                        return -EINVAL;
                }
                err = -EINVAL;
                goto out_err;

        case NLA_FLAG:
                /* a flag carries no payload */
                if (attrlen > 0)
                        goto out_err;
                break;

        case NLA_SINT:
        case NLA_UINT:
                /* variable-width integers are either 32 or 64 bits wide */
                if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) {
                        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                                "invalid attribute length");
                        return -EINVAL;
                }
                break;

        case NLA_BITFIELD32:
                if (attrlen != sizeof(struct nla_bitfield32))
                        goto out_err;

                err = validate_nla_bitfield32(nla, pt->bitfield32_valid);
                if (err)
                        goto out_err;
                break;

        case NLA_NUL_STRING:
                /*
                 * A NUL byte must be present; with a length limit it must
                 * occur within the first pt->len + 1 bytes of the payload.
                 */
                if (pt->len)
                        minlen = min_t(int, attrlen, pt->len + 1);
                else
                        minlen = attrlen;

                if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) {
                        err = -EINVAL;
                        goto out_err;
                }
                fallthrough;

        case NLA_STRING:
                if (attrlen < 1)
                        goto out_err;

                if (pt->len) {
                        char *buf = nla_data(nla);

                        /* a trailing NUL does not count towards the limit */
                        if (buf[attrlen - 1] == '\0')
                                attrlen--;

                        if (attrlen > pt->len)
                                goto out_err;
                }
                break;

        case NLA_BINARY:
                /* pt->len, if set, is the maximum payload length */
                if (pt->len && attrlen > pt->len)
                        goto out_err;
                break;

        case NLA_NESTED:
                /* a nested attribute is allowed to be empty; if it's not,
                 * it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        err = __nla_validate_parse(nla_data(nla), nla_len(nla),
                                                   pt->len, pt->nested_policy,
                                                   validate, extack, NULL,
                                                   depth + 1);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;
        case NLA_NESTED_ARRAY:
                /* a nested array attribute is allowed to be empty; if it's not,
                 * it must have a size of at least NLA_HDRLEN.
                 */
                if (attrlen == 0)
                        break;
                if (attrlen < NLA_HDRLEN)
                        goto out_err;
                if (pt->nested_policy) {
                        /*
                         * NOTE: this declaration shadows the function-level
                         * err; it is only used for the direct return below.
                         */
                        int err;

                        /* nla_validate_array() increments depth itself */
                        err = nla_validate_array(nla_data(nla), nla_len(nla),
                                                 pt->len, pt->nested_policy,
                                                 extack, validate, depth);
                        if (err < 0) {
                                /*
                                 * return directly to preserve the inner
                                 * error message/attribute pointer
                                 */
                                return err;
                        }
                }
                break;

        case NLA_UNSPEC:
                if (validate & NL_VALIDATE_UNSPEC) {
                        NL_SET_ERR_MSG_ATTR(extack, nla,
                                            "Unsupported attribute");
                        return -EINVAL;
                }
                /* pt->len is the minimum payload length here */
                if (attrlen < pt->len)
                        goto out_err;
                break;

        default:
                /* minimum length: from the policy, else from the type table */
                if (pt->len)
                        minlen = pt->len;
                else
                        minlen = nla_attr_minlen[pt->type];

                if (attrlen < minlen)
                        goto out_err;
        }

        /* further validation */
        switch (pt->validation_type) {
        case NLA_VALIDATE_NONE:
                /* nothing to do */
                break;
        case NLA_VALIDATE_RANGE_PTR:
        case NLA_VALIDATE_RANGE:
        case NLA_VALIDATE_RANGE_WARN_TOO_LONG:
        case NLA_VALIDATE_MIN:
        case NLA_VALIDATE_MAX:
                err = nla_validate_int_range(pt, nla, extack, validate);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_MASK:
                err = nla_validate_mask(pt, nla, extack);
                if (err)
                        return err;
                break;
        case NLA_VALIDATE_FUNCTION:
                /* a missing callback means there is nothing more to check */
                if (pt->validate) {
                        err = pt->validate(nla, extack);
                        if (err)
                                return err;
                }
                break;
        }

        return 0;
out_err:
        /* generic failure path: report the attribute and its policy */
        NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt,
                                "Attribute failed policy validation");
        return err;
}

/*
 * Common worker for validating and/or parsing an attribute stream.
 *
 * Walks the attributes starting at @head. If @policy is given, every
 * attribute with a type in 1..@maxtype is validated against it; if @tb is
 * given, it is zeroed first and then receives a pointer to each such
 * attribute, indexed by type. Attributes with other types are skipped,
 * or rejected when NL_VALIDATE_MAXTYPE is set.
 *
 * Returns 0 on success or a negative error code; -EINVAL is returned when
 * @depth has reached MAX_POLICY_RECURSION_DEPTH, and for trailing bytes
 * when NL_VALIDATE_TRAILING is set.
 */
static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack,
                                struct nlattr **tb, unsigned int depth)
{
        const struct nlattr *attr;
        int leftover;

        if (depth >= MAX_POLICY_RECURSION_DEPTH) {
                NL_SET_ERR_MSG(extack,
                               "allowed policy recursion depth exceeded");
                return -EINVAL;
        }

        if (tb)
                memset(tb, 0, sizeof(*tb) * (maxtype + 1));

        nla_for_each_attr(attr, head, len, leftover) {
                u16 type = nla_type(attr);

                if (type == 0 || type > maxtype) {
                        /* unknown types are only fatal in strict mode */
                        if (!(validate & NL_VALIDATE_MAXTYPE))
                                continue;

                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Unknown attribute type");
                        return -EINVAL;
                }
                type = array_index_nospec(type, maxtype + 1);

                if (policy) {
                        int err;

                        err = validate_nla(attr, maxtype, policy, validate,
                                           extack, depth);
                        if (err < 0)
                                return err;
                }

                if (tb)
                        tb[type] = (struct nlattr *)attr;
        }

        if (unlikely(leftover > 0)) {
                pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
                                    leftover, current->comm);
                NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes");
                if (validate & NL_VALIDATE_TRAILING)
                        return -EINVAL;
        }

        return 0;
}

/**
 * __nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Validates all attributes in the specified attribute stream against the
 * specified policy. Validation depends on the validate flags passed, see
 * &enum netlink_validation for more details on that.
 * See documentation of struct nla_policy for more details.
 *
 * This is a thin wrapper around __nla_validate_parse() that starts at
 * nesting depth 0 and does not fill in an attribute table.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack)
{
        return __nla_validate_parse(head, len, maxtype, policy, validate,
                                    extack, NULL, 0);
}
EXPORT_SYMBOL(__nla_validate);

/**
 * nla_policy_len - Determine the max. length of a policy
 * @p: policy to use
 * @n: number of policies
 *
 * Sums up, for each of the @n policy entries, the padded total size of one
 * attribute of that kind. The payload size used for an entry is the
 * policy's len if it is set, otherwise the exact length known for the
 * attribute type, otherwise the minimum length known for the attribute
 * type. Entries for which none of these is known contribute nothing.
 *
 * It is currently used to allocate Netlink buffers roughly the size of the
 * actual message.
 *
 * Returns the accumulated length in bytes.
 */
int
nla_policy_len(const struct nla_policy *p, int n)
{
        int i, len = 0;

        for (i = 0; i < n; i++, p++) {
                int payload = p->len;

                /* fall back to the per-type tables, exact length first */
                if (!payload)
                        payload = nla_attr_len[p->type];
                if (!payload)
                        payload = nla_attr_minlen[p->type];

                if (payload)
                        len += nla_total_size(payload);
        }

        return len;
}
EXPORT_SYMBOL(nla_policy_len);

/**
 * __nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK pointer
 *
 * Parses a stream of attributes and stores a pointer to each attribute in
 * the tb array accessible via the attribute type.
 * Validation is controlled by the @validate parameter.
 *
 * This is a thin wrapper around __nla_validate_parse() that starts at
 * nesting depth 0 and passes @tb as the attribute table to fill in.
 *
 * Returns 0 on success or a negative error code.
 */
int __nla_parse(struct nlattr **tb, int maxtype,
                const struct nlattr *head, int len,
                const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack)
{
        return __nla_validate_parse(head, len, maxtype, policy, validate,
                                    extack, tb, 0);
}
EXPORT_SYMBOL(__nla_parse);

/**
 * nla_find - Find a specific attribute in a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @attrtype: type of attribute to look for
 *
 * Returns the first attribute in the stream matching the specified type.
 */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype)
{
        const struct nlattr *nla;
        int rem;

        nla_for_each_attr(nla, head, len, rem)
                if (nla_type(nla) == attrtype)
                        return (struct nlattr *)nla;

        return NULL;
}
EXPORT_SYMBOL(nla_find);

/**
 * nla_strscpy - Copy string attribute payload into a sized buffer
 * @dst: Where to copy the string to.
 * @nla: Attribute to copy the string from.
 * @dstsize: Size of destination buffer.
 *
 * Copies at most dstsize - 1 bytes into the destination buffer.
 * Unlike strscpy() the destination buffer is always padded out.
 * A single trailing %NUL in the attribute payload is not counted as part
 * of the string.
 *
 * Return:
 * * srclen - Length of the copied string (not including the trailing %NUL).
 * * -E2BIG - If @dstsize is 0 or greater than U16_MAX, or if the string
 *            does not fit into @dstsize bytes including the terminating
 *            %NUL. In the last case a truncated copy has been written.
 */
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize)
{
        const char *payload = nla_data(nla);
        size_t payload_len = nla_len(nla);
        size_t ncopy;
        ssize_t ret;

        if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX))
                return -E2BIG;

        if (payload_len > 0 && payload[payload_len - 1] == '\0')
                payload_len--;

        if (payload_len < dstsize) {
                ncopy = payload_len;
                ret = ncopy;
        } else {
                /* truncate, leaving room for the terminator */
                ncopy = dstsize - 1;
                ret = -E2BIG;
        }

        memcpy(dst, payload, ncopy);
        /* Zero pad end of dst. */
        memset(dst + ncopy, 0, dstsize - ncopy);

        return ret;
}
EXPORT_SYMBOL(nla_strscpy);

/**
 * nla_strdup - Copy string attribute payload into a newly allocated buffer
 * @nla: attribute to copy the string from
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * A single trailing %NUL in the attribute payload is dropped before the
 * copy; the result is always %NUL terminated.
 *
 * Returns a pointer to the allocated buffer or NULL on error.
 */
char *nla_strdup(const struct nlattr *nla, gfp_t flags)
{
        const char *payload = nla_data(nla);
        size_t n = nla_len(nla);
        char *copy;

        if (n > 0 && payload[n - 1] == '\0')
                n--;

        copy = kmalloc(n + 1, flags);
        if (!copy)
                return NULL;

        memcpy(copy, payload, n);
        copy[n] = '\0';

        return copy;
}
EXPORT_SYMBOL(nla_strdup);

/**
 * nla_memcpy - Copy a netlink attribute into another memory area
 * @dest: where to copy to
 * @src: netlink attribute to copy from
 * @count: size of the destination area
 *
 * Note: The number of bytes copied is limited by the length of the
 *       attribute's payload. If the payload is shorter than @count, the
 *       rest of the destination area is zeroed.
 *
 * Returns the number of bytes copied.
 */
int nla_memcpy(void *dest, const struct nlattr *src, int count)
{
        int copied = min_t(int, count, nla_len(src));

        memcpy(dest, nla_data(src), copied);

        /* clear whatever part of the destination the payload did not fill */
        if (copied < count)
                memset(dest + copied, 0, count - copied);

        return copied;
}
EXPORT_SYMBOL(nla_memcpy);

/**
 * nla_memcmp - Compare an attribute with sized memory area
 * @nla: netlink attribute
 * @data: memory area
 * @size: size of memory area
 *
 * Returns the difference between the payload length and @size when they
 * differ, otherwise the result of memcmp() over @size bytes.
 */
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size)
{
        int diff = nla_len(nla) - size;

        if (diff)
                return diff;

        return memcmp(nla_data(nla), data, size);
}
EXPORT_SYMBOL(nla_memcmp);

/**
 * nla_strcmp - Compare a string attribute against a string
 * @nla: netlink string attribute
 * @str: another string
 *
 * All trailing NUL bytes of the attribute payload are ignored. Returns the
 * length difference when the lengths differ, otherwise the memcmp() result.
 */
int nla_strcmp(const struct nlattr *nla, const char *str)
{
        char *payload = nla_data(nla);
        int payload_len = nla_len(nla);
        int len = strlen(str);
        int diff;

        /* Strip every NUL byte from the end of the payload. */
        while (payload_len > 0 && payload[payload_len - 1] == '\0')
                payload_len--;

        diff = payload_len - len;
        if (diff)
                return diff;

        return memcmp(payload, str, len);
}
EXPORT_SYMBOL(nla_strcmp);

#ifdef CONFIG_NET
/**
 * __nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Appends a netlink attribute header to the socket buffer, fills in its
 * type and length, and reserves room for the payload without copying it.
 * The padding bytes after the payload are zeroed.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        struct nlattr *attr = skb_put(skb, nla_total_size(attrlen));

        attr->nla_type = attrtype;
        attr->nla_len = nla_attr_size(attrlen);

        /* Clear the padding that follows header + payload. */
        memset((unsigned char *)attr + attr->nla_len, 0, nla_padlen(attrlen));

        return attr;
}
EXPORT_SYMBOL(__nla_reserve);

/**
 * __nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Adds a netlink attribute header to a socket buffer and reserves
 * room for the payload but does not copy it. It also ensures that this
 * attribute will have a 64-bit aligned nla_data() area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr)
{
        /* Alignment is handled first, then the attribute itself is reserved. */
        nla_align_64bit(skb, padattr);

        return __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(__nla_reserve_64bit);

/**
 * __nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Reserves room for attribute payload without a header. The reserved
 * length is @attrlen rounded up with NLA_ALIGN() and is obtained through
 * skb_put_zero().
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the payload.
 */
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        return skb_put_zero(skb, NLA_ALIGN(attrlen));
}
EXPORT_SYMBOL(__nla_reserve_nohdr);

/**
 * nla_reserve - reserve room for attribute on the skb
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 *
 * Checked variant of __nla_reserve(): adds a netlink attribute header to
 * the socket buffer and reserves room for the payload without copying it.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < nla_total_size(attrlen)))
                return NULL;

        return __nla_reserve(skb, attrtype, attrlen);
}
EXPORT_SYMBOL(nla_reserve);

/**
 * nla_reserve_64bit - reserve room for attribute on the skb and align it
 * @skb: socket buffer to reserve room on
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @padattr: attribute type for the padding
 *
 * Checked variant of __nla_reserve_64bit(): adds a netlink attribute
 * header to the socket buffer and reserves room for the payload without
 * copying it, ensuring a 64-bit aligned nla_data() area.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute header and payload.
 */
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                                 int padattr)
{
        /* The space requirement depends on whether padding is needed. */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);

        if (unlikely(skb_tailroom(skb) < needed))
                return NULL;

        return __nla_reserve_64bit(skb, attrtype, attrlen, padattr);
}
EXPORT_SYMBOL(nla_reserve_64bit);

/**
 * nla_reserve_nohdr - reserve room for attribute without header
 * @skb: socket buffer to reserve room on
 * @attrlen: length of attribute payload
 *
 * Checked variant of __nla_reserve_nohdr(): reserves room for attribute
 * payload without a header.
 *
 * Returns NULL if the tailroom of the skb is insufficient to store
 * the attribute payload.
 */
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < NLA_ALIGN(attrlen)))
                return NULL;

        return __nla_reserve_nohdr(skb, attrlen);
}
EXPORT_SYMBOL(nla_reserve_nohdr);

/**
 * __nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves the attribute with __nla_reserve() and copies @attrlen bytes
 * from @data into its payload area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
               const void *data)
{
        struct nlattr *attr = __nla_reserve(skb, attrtype, attrlen);

        memcpy(nla_data(attr), data, attrlen);
}
EXPORT_SYMBOL(__nla_put);

/**
 * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Reserves the attribute with __nla_reserve_64bit() and copies @attrlen
 * bytes from @data into its payload area.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute header and payload.
 */
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr)
{
        struct nlattr *attr = __nla_reserve_64bit(skb, attrtype, attrlen,
                                                  padattr);

        memcpy(nla_data(attr), data, attrlen);
}
EXPORT_SYMBOL(__nla_put_64bit);

/**
 * __nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Reserves the payload area with __nla_reserve_nohdr() and copies
 * @attrlen bytes from @data into it.
 *
 * The caller is responsible to ensure that the skb provides enough
 * tailroom for the attribute payload.
 */
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        void *payload = __nla_reserve_nohdr(skb, attrlen);

        memcpy(payload, data, attrlen);
}
EXPORT_SYMBOL(__nla_put_nohdr);

/**
 * nla_put - Add a netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Checked variant of __nla_put().
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < nla_total_size(attrlen)))
                return -EMSGSIZE;

        __nla_put(skb, attrtype, attrlen, data);

        return 0;
}
EXPORT_SYMBOL(nla_put);

/**
 * nla_put_64bit - Add a netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 * @padattr: attribute type for the padding
 *
 * Checked variant of __nla_put_64bit().
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute header and payload.
 */
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr)
{
        /* The space requirement depends on whether padding is needed. */
        size_t needed = nla_need_padding_for_64bit(skb) ?
                        nla_total_size_64bit(attrlen) :
                        nla_total_size(attrlen);

        if (unlikely(skb_tailroom(skb) < needed))
                return -EMSGSIZE;

        __nla_put_64bit(skb, attrtype, attrlen, data, padattr);

        return 0;
}
EXPORT_SYMBOL(nla_put_64bit);

/**
 * nla_put_nohdr - Add a netlink attribute without header
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * Checked variant of __nla_put_nohdr().
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < NLA_ALIGN(attrlen)))
                return -EMSGSIZE;

        __nla_put_nohdr(skb, attrlen, data);

        return 0;
}
EXPORT_SYMBOL(nla_put_nohdr);

/**
 * nla_append - Add a netlink attribute without header or padding
 * @skb: socket buffer to add attribute to
 * @attrlen: length of attribute payload
 * @data: head of attribute payload
 *
 * The tailroom check uses the NLA_ALIGN()ed length, while exactly
 * @attrlen bytes are appended.
 *
 * Returns 0 on success, or -EMSGSIZE if the tailroom of the skb is
 * insufficient to store the attribute payload.
 */
int nla_append(struct sk_buff *skb, int attrlen, const void *data)
{
        int room = skb_tailroom(skb);

        if (unlikely(room < NLA_ALIGN(attrlen)))
                return -EMSGSIZE;

        skb_put_data(skb, data, attrlen);

        return 0;
}
EXPORT_SYMBOL(nla_append);
#endif

















































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 

















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
// SPDX-License-Identifier: GPL-2.0-only
// Copyright (c) 2020 Facebook Inc.

#include <linux/ethtool_netlink.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <net/udp_tunnel.h>
#include <net/vxlan.h>

/* State bits stored in the flags field of struct udp_tunnel_nic_table_entry. */
enum udp_tunnel_nic_table_entry_flags {
        /* Entry is queued for addition to the device. */
        UDP_TUNNEL_NIC_ENTRY_ADD        = BIT(0),
        /* Entry is queued for removal from the device. */
        UDP_TUNNEL_NIC_ENTRY_DEL        = BIT(1),
        /* The last device operation on this entry returned an error. */
        UDP_TUNNEL_NIC_ENTRY_OP_FAIL        = BIT(2),
        /* Use count adjustments are ignored while this bit is set. */
        UDP_TUNNEL_NIC_ENTRY_FROZEN        = BIT(3),
};

/**
 * struct udp_tunnel_nic_table_entry - one slot of a port offload table
 * @port:        UDP port (big endian)
 * @type:        tunnel type of the port
 * @flags:        bits from enum udp_tunnel_nic_table_entry_flags
 * @use_cnt:        use count of the port; zero together with zero @flags
 *                means the slot is free
 * @hw_priv:        opaque value passed to the driver via struct udp_tunnel_info
 */
struct udp_tunnel_nic_table_entry {
        __be16 port;
        u8 type;
        u8 flags;
        u16 use_cnt;
#define UDP_TUNNEL_NIC_USE_CNT_MAX        U16_MAX
        u8 hw_priv;
};

/**
 * struct udp_tunnel_nic - UDP tunnel port offload state
 * @work:        async work for talking to hardware from process context
 * @dev:        netdev pointer
 * @need_sync:        at least one port state changed
 * @need_replay: space was freed, we need a replay of all ports
 * @work_pending: @work is currently scheduled
 * @n_tables:        number of tables under @entries
 * @missed:        bitmap of tables which overflowed
 * @entries:        table of tables of ports currently offloaded
 */
struct udp_tunnel_nic {
        struct work_struct work;

        struct net_device *dev;

        u8 need_sync:1;
        u8 need_replay:1;
        u8 work_pending:1;

        unsigned int n_tables;
        unsigned long missed;
        struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables);
};

/* We ensure all work structs are done using driver state, but not the code.
 * We need a workqueue we can flush before module gets removed.
 * udp_tunnel_nic_device_sync() queues the per-device work item on it.
 */
static struct workqueue_struct *udp_tunnel_nic_workqueue;

/* Map a UDP_TUNNEL_TYPE_* value to a printable name ("unknown" otherwise). */
static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type)
{
        if (type == UDP_TUNNEL_TYPE_VXLAN)
                return "vxlan";
        if (type == UDP_TUNNEL_TYPE_GENEVE)
                return "geneve";
        if (type == UDP_TUNNEL_TYPE_VXLAN_GPE)
                return "vxlan-gpe";

        return "unknown";
}

/* A slot is free when it has no users and no flag bits set. */
static bool
udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry)
{
        return !(entry->use_cnt != 0 || entry->flags);
}

/* True when the entry is in use and has no flag set other than FROZEN. */
static bool
udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry)
{
        if (!entry->use_cnt)
                return false;

        return !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN);
}

/* True when the FROZEN bit is set on the entry. */
static bool
udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry)
{
        return (entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN) != 0;
}

/* Set the FROZEN bit on the entry unless the slot is free. */
static void
udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry)
{
        if (udp_tunnel_nic_entry_is_free(entry))
                return;

        entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN;
}

/* Clear the FROZEN bit; all other flag bits are left as they are. */
static void
udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry)
{
        entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN;
}

/* True when the entry has an add or a delete operation queued. */
static bool
udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry)
{
        return (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD) ||
               (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL);
}

/* Mark @entry with @flag and note on @utn that a device sync is required. */
static void
udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn,
                           struct udp_tunnel_nic_table_entry *entry,
                           unsigned int flag)
{
        utn->need_sync = 1;
        entry->flags |= flag;
}

/* Build a zeroed udp_tunnel_info carrying the entry's type, port and hw_priv. */
static void
udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry,
                             struct udp_tunnel_info *ti)
{
        memset(ti, 0, sizeof(*ti));

        ti->type = entry->type;
        ti->port = entry->port;
        ti->hw_priv = entry->hw_priv;
}

/* Returns true when every entry of every table of the device is free. */
static bool
udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        unsigned int t, e;

        for (t = 0; t < utn->n_tables; t++) {
                for (e = 0; e < info->tables[t].n_entries; e++) {
                        if (udp_tunnel_nic_entry_is_free(&utn->entries[t][e]))
                                continue;
                        return false;
                }
        }

        return true;
}

/* Returns true when a table marked in utn->missed has at least one free
 * entry.
 */
static bool
udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_table_info *table;
        unsigned int t, e;

        if (!utn->missed)
                return false;

        for (t = 0; t < utn->n_tables; t++) {
                /* Only tables which missed a port are of interest. */
                if (!test_bit(t, &utn->missed))
                        continue;

                table = &dev->udp_tunnel_nic_info->tables[t];
                for (e = 0; e < table->n_entries; e++) {
                        if (udp_tunnel_nic_entry_is_free(&utn->entries[t][e]))
                                return true;
                }
        }

        return false;
}

/* Fill @ti from entry [@table][@idx]; @ti is left untouched when the entry
 * has a zero use count.
 */
static void
__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
                          unsigned int idx, struct udp_tunnel_info *ti)
{
        struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
        struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];

        if (!entry->use_cnt)
                return;

        udp_tunnel_nic_ti_from_entry(entry, ti);
}

/* Store @priv as the hw_priv value of entry [@table][@idx]. */
static void
__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
                               unsigned int idx, u8 priv)
{
        struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;

        utn->entries[table][idx].hw_priv = priv;
}

/* Update the entry's flags after a device operation completed with @err.
 *
 * A queued ADD/DEL is cleared on success. If the previous operation on this
 * entry had failed ("dodgy"), -EEXIST for an ADD and -ENOENT for a DEL also
 * clear the queued operation. OP_FAIL then tracks the outcome of this
 * operation: cleared when @err is zero, set otherwise.
 */
static void
udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry,
                                 int err)
{
        /* Sampled before OP_FAIL is rewritten at the end of the function. */
        bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;

        /* An entry is not expected to have both operations queued at once. */
        WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
                     entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL);

        if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
            (!err || (err == -EEXIST && dodgy)))
                entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD;

        if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL &&
            (!err || (err == -ENOENT && dodgy)))
                entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL;

        if (!err)
                entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
        else
                entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
}

/* Push one queued entry to the device through the per-port callbacks:
 * ->set_port() when an ADD is queued, ->unset_port() otherwise. The result
 * is recorded in the entry's flags and a warning is logged on failure.
 */
static void
udp_tunnel_nic_device_sync_one(struct net_device *dev,
                               struct udp_tunnel_nic *utn,
                               unsigned int table, unsigned int idx)
{
        struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];
        struct udp_tunnel_info ti;
        int err;

        if (!udp_tunnel_nic_entry_is_queued(entry))
                return;

        udp_tunnel_nic_ti_from_entry(entry, &ti);

        if (!(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD))
                err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx,
                                                           &ti);
        else
                err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti);

        udp_tunnel_nic_entry_update_done(entry, err);
        if (!err)
                return;

        netdev_warn(dev,
                    "UDP tunnel port sync failed port %d type %s: %d\n",
                    be16_to_cpu(entry->port),
                    udp_tunnel_nic_tunnel_type_name(entry->type),
                    err);
}

/* Walk every entry of every table and sync each one individually. */
static void
udp_tunnel_nic_device_sync_by_port(struct net_device *dev,
                                   struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        unsigned int t, e;

        for (t = 0; t < utn->n_tables; t++) {
                for (e = 0; e < info->tables[t].n_entries; e++)
                        udp_tunnel_nic_device_sync_one(dev, utn, t, e);
        }
}

/* Sync the device one table at a time using the ->sync_table() callback.
 *
 * Tables with no queued entry are skipped. For the others ->sync_table() is
 * called once and its result is applied to every queued entry of that table.
 */
static void
udp_tunnel_nic_device_sync_by_table(struct net_device *dev,
                                    struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        unsigned int i, j;
        int err;

        for (i = 0; i < utn->n_tables; i++) {
                /* Find something that needs sync in this table */
                for (j = 0; j < info->tables[i].n_entries; j++)
                        if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j]))
                                break;
                /* Loop ran to completion: nothing queued in this table. */
                if (j == info->tables[i].n_entries)
                        continue;

                err = info->sync_table(dev, i);
                if (err)
                        netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n",
                                    i, err);

                /* The same result is recorded on all queued entries. */
                for (j = 0; j < info->tables[i].n_entries; j++) {
                        struct udp_tunnel_nic_table_entry *entry;

                        entry = &utn->entries[i][j];
                        if (udp_tunnel_nic_entry_is_queued(entry))
                                udp_tunnel_nic_entry_update_done(entry, err);
                }
        }
}

/* Push all queued changes to the device, per table if the driver provides
 * ->sync_table() and per port otherwise, then clear need_sync and work out
 * whether a replay is needed.
 */
static void
__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        if (!utn->need_sync)
                return;

        if (!dev->udp_tunnel_nic_info->sync_table)
                udp_tunnel_nic_device_sync_by_port(dev, utn);
        else
                udp_tunnel_nic_device_sync_by_table(dev, utn);

        utn->need_sync = 0;

        /* Can't replay directly here, in case we come from the tunnel driver's
         * notification - trying to replay may deadlock inside tunnel driver.
         */
        utn->need_replay = udp_tunnel_nic_should_replay(dev, utn);
}

/* Sync the device state, either directly or through the workqueue.
 *
 * Drivers flagged UDP_TUNNEL_NIC_INFO_MAY_SLEEP are always synced from the
 * work item. Other drivers are synced right away and the work item is only
 * queued if a replay turned out to be needed.
 */
static void
udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        bool may_sleep;

        if (!utn->need_sync)
                return;

        /* Drivers which sleep in the callback need to update from
         * the workqueue, if we come from the tunnel driver's notification.
         */
        may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP;
        if (!may_sleep) {
                __udp_tunnel_nic_device_sync(dev, utn);
                if (!utn->need_replay)
                        return;
        }

        queue_work(udp_tunnel_nic_workqueue, &utn->work);
        utn->work_pending = 1;
}

/* True when the table's tunnel_types mask includes the type of @ti. */
static bool
udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table,
                                struct udp_tunnel_info *ti)
{
        return (table->tunnel_types & ti->type) != 0;
}

/* Returns true when at least one table of the device accepts the tunnel
 * type of @ti. Devices flagged IPv4-only reject anything but AF_INET.
 */
static bool
udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn,
                          struct udp_tunnel_info *ti)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        unsigned int t;

        /* Special case IPv4-only NICs */
        if ((info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY) &&
            ti->sa_family != AF_INET)
                return false;

        for (t = 0; t < utn->n_tables; t++) {
                if (udp_tunnel_nic_table_is_capable(&info->tables[t], ti))
                        return true;
        }

        return false;
}

/* Look for a non-free entry using the same port as @ti with a different
 * tunnel type. On a hit the table is marked in utn->missed and true is
 * returned; otherwise false.
 */
static int
udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn,
                             struct udp_tunnel_info *ti)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic_table_entry *entry;
        unsigned int t, e;

        for (t = 0; t < utn->n_tables; t++) {
                for (e = 0; e < info->tables[t].n_entries; e++) {
                        entry = &utn->entries[t][e];

                        if (udp_tunnel_nic_entry_is_free(entry))
                                continue;
                        if (entry->port != ti->port || entry->type == ti->type)
                                continue;

                        __set_bit(t, &utn->missed);
                        return true;
                }
        }

        return false;
}

/* Adjust the use count of entry [@table][@idx] by @use_cnt_adj and queue
 * the matching device operation when needed.
 *
 * Nothing is queued when the entry stays used (or stays unused) and its last
 * device operation did not fail. Otherwise a negative adjustment leads to a
 * DEL and a non-negative one to an ADD; if the opposite operation is still
 * queued it is cancelled instead (and, for entries whose last operation
 * failed, the new operation is queued as well).
 */
static void
udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn,
                         unsigned int table, unsigned int idx, int use_cnt_adj)
{
        struct udp_tunnel_nic_table_entry *entry =  &utn->entries[table][idx];
        bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
        unsigned int from, to;

        /* Warn if the adjusted count would not fit in the u16 use_cnt; the
         * u32 cast makes a negative adjustment below zero wrap to a large
         * value, so both overflow and underflow are caught.
         */
        WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX);

        /* If not going from used to unused or vice versa - all done.
         * For dodgy entries make sure we try to sync again (queue the entry).
         */
        entry->use_cnt += use_cnt_adj;
        if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj))
                return;

        /* Cancel the op before it was sent to the device, if possible,
         * otherwise we'd need to take special care to issue commands
         * in the same order the ports arrived.
         */
        if (use_cnt_adj < 0) {
                from = UDP_TUNNEL_NIC_ENTRY_ADD;
                to = UDP_TUNNEL_NIC_ENTRY_DEL;
        } else {
                from = UDP_TUNNEL_NIC_ENTRY_DEL;
                to = UDP_TUNNEL_NIC_ENTRY_ADD;
        }

        /* The opposite operation is still pending: drop it. */
        if (entry->flags & from) {
                entry->flags &= ~from;
                if (!dodgy)
                        return;
        }

        udp_tunnel_nic_entry_queue(utn, entry, to);
}

/* If entry [@table][@idx] is in use for the same port and type as @ti,
 * adjust its use count by @use_cnt_adj (skipped for frozen entries) and
 * return true. Returns false when the entry does not match.
 */
static bool
udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn,
                             unsigned int table, unsigned int idx,
                             struct udp_tunnel_info *ti, int use_cnt_adj)
{
        struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];

        if (udp_tunnel_nic_entry_is_free(entry))
                return false;
        if (entry->port != ti->port || entry->type != ti->type)
                return false;

        /* Frozen entries match but keep their use count. */
        if (!udp_tunnel_nic_entry_is_frozen(entry))
                udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj);

        return true;
}

/* Search the capable tables for an entry already matching @ti and adjust
 * its use count by @use_cnt_adj rather than adding a new entry. Returns
 * true if a matching entry was found. On delete the use count may drop to
 * zero, in which case the entry gets queued for removal.
 */
static bool
udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
                            struct udp_tunnel_info *ti, int use_cnt_adj)
{
        const struct udp_tunnel_nic_table_info *table;
        unsigned int t, e;

        for (t = 0; t < utn->n_tables; t++) {
                table = &dev->udp_tunnel_nic_info->tables[t];
                if (!udp_tunnel_nic_table_is_capable(table, ti))
                        continue;

                for (e = 0; e < table->n_entries; e++) {
                        if (!udp_tunnel_nic_entry_try_adj(utn, t, e, ti,
                                                          use_cnt_adj))
                                continue;
                        return true;
                }
        }

        return false;
}

/* Take one more reference on an existing entry matching @ti.
 * Returns true if such an entry was found.
 */
static bool
udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
                            struct udp_tunnel_info *ti)
{
        return udp_tunnel_nic_try_existing(dev, utn, ti, +1);
}

/* Drop one reference from an existing entry matching @ti.
 * Returns true if such an entry was found.
 */
static bool
udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
                            struct udp_tunnel_info *ti)
{
        return udp_tunnel_nic_try_existing(dev, utn, ti, -1);
}

/* Place @ti into the first free entry of a capable table and queue it for
 * addition to the device. Every capable table that turns out to be full is
 * marked in utn->missed. Returns true if an entry was claimed.
 */
static bool
udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn,
                       struct udp_tunnel_info *ti)
{
        const struct udp_tunnel_nic_table_info *table;
        struct udp_tunnel_nic_table_entry *entry;
        unsigned int t, e;

        for (t = 0; t < utn->n_tables; t++) {
                table = &dev->udp_tunnel_nic_info->tables[t];
                if (!udp_tunnel_nic_table_is_capable(table, ti))
                        continue;

                for (e = 0; e < table->n_entries; e++) {
                        entry = &utn->entries[t][e];
                        if (udp_tunnel_nic_entry_is_free(entry)) {
                                entry->port = ti->port;
                                entry->type = ti->type;
                                entry->use_cnt = 1;
                                udp_tunnel_nic_entry_queue(utn, entry,
                                                           UDP_TUNNEL_NIC_ENTRY_ADD);
                                return true;
                        }
                }

                /* The different table may still fit this port in, but there
                 * are no devices currently which have multiple tables accepting
                 * the same tunnel type, and false positives are okay.
                 */
                __set_bit(t, &utn->missed);
        }

        return false;
}

static void
__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic *utn;

        utn = dev->udp_tunnel_nic;
        if (!utn)
                return;
        if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)
                return;
        if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN &&
            ti->port == htons(IANA_VXLAN_UDP_PORT)) {
                if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
                        netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n");
                return;
        }

        if (!udp_tunnel_nic_is_capable(dev, utn, ti))
                return;

        /* It may happen that a tunnel of one type is removed and different
         * tunnel type tries to reuse its port before the device was informed.
         * Rely on utn->missed to re-add this port later.
         */
        if (udp_tunnel_nic_has_collision(dev, utn, ti))
                return;

        if (!udp_tunnel_nic_add_existing(dev, utn, ti))
                udp_tunnel_nic_add_new(dev, utn, ti);

        udp_tunnel_nic_device_sync(dev, utn);
}

/* .del_port handler: drop one use of the tunnel port described by @ti.
 *
 * Does nothing when @dev has no udp_tunnel_nic state or is not capable of
 * the tunnel; otherwise adjusts the matching entry and requests a device
 * sync.
 */
static void
__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;

        if (!utn || !udp_tunnel_nic_is_capable(dev, utn, ti))
                return;

        udp_tunnel_nic_del_existing(dev, utn, ti);
        udp_tunnel_nic_device_sync(dev, utn);
}

/* .reset_ntf handler: re-queue every in-use entry for addition.
 *
 * Must be called with rtnl held.  Clears the pending-sync flag and the
 * DEL / OP_FAIL flags of all entries, queues each entry whose use count is
 * non-zero with UDP_TUNNEL_NIC_ENTRY_ADD, then syncs the device directly.
 */
static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic *utn;
        unsigned int tbl, idx;

        ASSERT_RTNL();

        utn = dev->udp_tunnel_nic;
        if (!utn)
                return;

        utn->need_sync = false;

        for (tbl = 0; tbl < utn->n_tables; tbl++) {
                for (idx = 0; idx < info->tables[tbl].n_entries; idx++) {
                        struct udp_tunnel_nic_table_entry *entry =
                                &utn->entries[tbl][idx];

                        entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL |
                                          UDP_TUNNEL_NIC_ENTRY_OP_FAIL);
                        /* We don't release rtnl across ops */
                        WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);

                        if (entry->use_cnt)
                                udp_tunnel_nic_entry_queue(utn, entry,
                                                           UDP_TUNNEL_NIC_ENTRY_ADD);
                }
        }

        __udp_tunnel_nic_device_sync(dev, utn);
}

/* .dump_size handler: netlink space needed to dump table @table of @dev.
 *
 * Returns 0 when the device has no udp_tunnel_nic state; otherwise the sum,
 * over every present entry of the table, of one nest attribute plus a
 * __be16 port attribute and a u32 type attribute.
 */
static size_t
__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
        unsigned int idx;
        size_t total = 0;

        if (!utn)
                return 0;

        for (idx = 0; idx < info->tables[table].n_entries; idx++) {
                if (udp_tunnel_nic_entry_is_present(&utn->entries[table][idx])) {
                        total += nla_total_size(0) +                 /* _TABLE_ENTRY */
                                 nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */
                                 nla_total_size(sizeof(u32));         /* _ENTRY_TYPE */
                }
        }

        return total;
}

static int
__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
                            struct sk_buff *skb)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic *utn;
        struct nlattr *nest;
        unsigned int j;

        utn = dev->udp_tunnel_nic;
        if (!utn)
                return 0;

        for (j = 0; j < info->tables[table].n_entries; j++) {
                if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j]))
                        continue;

                nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
                if (!nest)
                        return -EMSGSIZE;

                if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
                                 utn->entries[table][j].port) ||
                    nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE,
                                ilog2(utn->entries[table][j].type)))
                        goto err_cancel;

                nla_nest_end(skb, nest);
        }

        return 0;

err_cancel:
        nla_nest_cancel(skb, nest);
        return -EMSGSIZE;
}

/* Operations table exposing this file's handlers.  The module init code
 * points udp_tunnel_nic_ops at it and the exit code resets that pointer
 * to NULL, both under rtnl.
 */
static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
        .get_port        = __udp_tunnel_nic_get_port,
        .set_port_priv        = __udp_tunnel_nic_set_port_priv,
        .add_port        = __udp_tunnel_nic_add_port,
        .del_port        = __udp_tunnel_nic_del_port,
        .reset_ntf        = __udp_tunnel_nic_reset_ntf,
        .dump_size        = __udp_tunnel_nic_dump_size,
        .dump_write        = __udp_tunnel_nic_dump_write,
};

static void
udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        unsigned int i, j;

        for (i = 0; i < utn->n_tables; i++)
                for (j = 0; j < info->tables[i].n_entries; j++) {
                        int adj_cnt = -utn->entries[i][j].use_cnt;

                        if (adj_cnt)
                                udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt);
                }

        __udp_tunnel_nic_device_sync(dev, utn);

        for (i = 0; i < utn->n_tables; i++)
                memset(utn->entries[i], 0, array_size(info->tables[i].n_entries,
                                                      sizeof(**utn->entries)));
        WARN_ON(utn->need_sync);
        utn->need_replay = 0;
}

/* Re-request the tunnel port list for @dev, or for every device sharing
 * the table when info->shared is set.
 *
 * Entries already in use are frozen around the replay and unfrozen
 * afterwards; utn->missed and utn->need_replay are cleared before the
 * ports are requested again.
 */
static void
udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic_shared_node *node;
        unsigned int tbl, idx;

        /* Freeze all the ports we are already tracking so that the replay
         * does not double up the refcount.
         */
        for (tbl = 0; tbl < utn->n_tables; tbl++) {
                for (idx = 0; idx < info->tables[tbl].n_entries; idx++)
                        udp_tunnel_nic_entry_freeze_used(&utn->entries[tbl][idx]);
        }

        utn->missed = 0;
        utn->need_replay = 0;

        if (info->shared) {
                list_for_each_entry(node, &info->shared->devices, list)
                        udp_tunnel_get_rx_info(node->dev);
        } else {
                udp_tunnel_get_rx_info(dev);
        }

        for (tbl = 0; tbl < utn->n_tables; tbl++) {
                for (idx = 0; idx < info->tables[tbl].n_entries; idx++)
                        udp_tunnel_nic_entry_unfreeze(&utn->entries[tbl][idx]);
        }
}

/* Work item body: under rtnl, clear work_pending, sync the device and, if a
 * replay was requested, replay the ports.
 */
static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
{
        struct udp_tunnel_nic *utn;

        utn = container_of(work, struct udp_tunnel_nic, work);

        rtnl_lock();

        utn->work_pending = 0;
        __udp_tunnel_nic_device_sync(utn->dev, utn);
        if (utn->need_replay)
                udp_tunnel_nic_replay(utn->dev, utn);

        rtnl_unlock();
}

/* Allocate a zeroed udp_tunnel_nic with @n_tables entry arrays, each sized
 * from the matching info->tables[].n_entries, and initialise its work item.
 *
 * Returns the new state, or NULL if any allocation fails; nothing is leaked
 * on failure.
 */
static struct udp_tunnel_nic *
udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
                     unsigned int n_tables)
{
        struct udp_tunnel_nic *utn;
        unsigned int tbl;

        utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL);
        if (!utn)
                return NULL;

        utn->n_tables = n_tables;
        INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work);

        for (tbl = 0; tbl < n_tables; tbl++) {
                utn->entries[tbl] = kcalloc(info->tables[tbl].n_entries,
                                            sizeof(*utn->entries[tbl]),
                                            GFP_KERNEL);
                if (utn->entries[tbl])
                        continue;

                /* Unwind the tables allocated so far, newest first */
                while (tbl > 0)
                        kfree(utn->entries[--tbl]);
                kfree(utn);
                return NULL;
        }

        return utn;
}

/* Release every entry table of @utn and then @utn itself. */
static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn)
{
        unsigned int tbl = 0;

        while (tbl < utn->n_tables) {
                kfree(utn->entries[tbl]);
                tbl++;
        }

        kfree(utn);
}

/* Validate the driver-supplied udp_tunnel_nic_info of @dev and attach UDP
 * tunnel offload state to the device.
 *
 * Returns 0 on success, -EINVAL (with a WARN) when the driver info fails a
 * sanity check, or -ENOMEM when an allocation fails.
 */
static int udp_tunnel_nic_register(struct net_device *dev)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
        struct udp_tunnel_nic_shared_node *node = NULL;
        struct udp_tunnel_nic *utn;
        unsigned int n_tables, i;

        /* utn->missed carries one bit per table, so it must be wide enough */
        BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE <
                     UDP_TUNNEL_NIC_MAX_TABLES);
        /* Expect use count of at most 2 (IPv4, IPv6) per device */
        BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX <
                     UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2);

        /* Check that the driver info is sane: set_port and unset_port must
         * be provided together, exactly one of set_port / sync_table must be
         * provided, and the first table must have entries.
         */
        if (WARN_ON(!info->set_port != !info->unset_port) ||
            WARN_ON(!info->set_port == !info->sync_table) ||
            WARN_ON(!info->tables[0].n_entries))
                return -EINVAL;

        /* Shared tables cannot be combined with OPEN_ONLY */
        if (WARN_ON(info->shared &&
                    info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
                return -EINVAL;

        /* Count the populated tables; a populated table following an empty
         * one is rejected, so the populated tables are contiguous from 0.
         */
        n_tables = 1;
        for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
                if (!info->tables[i].n_entries)
                        continue;

                n_tables++;
                if (WARN_ON(!info->tables[i - 1].n_entries))
                        return -EINVAL;
        }

        /* Create UDP tunnel state structures */
        if (info->shared) {
                /* List node that links this device into info->shared->devices */
                node = kzalloc(sizeof(*node), GFP_KERNEL);
                if (!node)
                        return -ENOMEM;

                node->dev = dev;
        }

        /* Reuse the state another sharing device already published,
         * otherwise allocate a fresh one.
         */
        if (info->shared && info->shared->udp_tunnel_nic_info) {
                utn = info->shared->udp_tunnel_nic_info;
        } else {
                utn = udp_tunnel_nic_alloc(info, n_tables);
                if (!utn) {
                        /* node is NULL in the non-shared case */
                        kfree(node);
                        return -ENOMEM;
                }
        }

        if (info->shared) {
                /* First sharer: set up the device list and publish the state */
                if (!info->shared->udp_tunnel_nic_info) {
                        INIT_LIST_HEAD(&info->shared->devices);
                        info->shared->udp_tunnel_nic_info = utn;
                }

                list_add_tail(&node->list, &info->shared->devices);
        }

        /* utn->dev always names the most recently registered device; the
         * reference taken here is dropped in udp_tunnel_nic_unregister().
         */
        utn->dev = dev;
        dev_hold(dev);
        dev->udp_tunnel_nic = utn;

        /* OPEN_ONLY devices request their ports on NETDEV_UP instead */
        if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
                udp_tunnel_get_rx_info(dev);

        return 0;
}

/* Detach @dev from its UDP tunnel offload state @utn.
 *
 * For a shared table the device is removed from the sharing list; if other
 * devices remain, only this device's ports are dropped and the state stays
 * alive.  Otherwise the state is flushed and, unless the work item is still
 * pending, freed.  The device reference taken at registration is dropped
 * only on the paths that reach release_dev.
 */
static void
udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
{
        const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;

        /* For a shared table remove this dev from the list of sharing devices
         * and if there are other devices just detach.
         */
        if (info->shared) {
                struct udp_tunnel_nic_shared_node *node, *first;

                list_for_each_entry(node, &info->shared->devices, list)
                        if (node->dev == dev)
                                break;
                /* Loop ran to completion: @dev is not on the sharing list */
                if (list_entry_is_head(node, &info->shared->devices, list))
                        return;

                list_del(&node->list);
                kfree(node);

                first = list_first_entry_or_null(&info->shared->devices,
                                                 typeof(*first), list);
                if (first) {
                        /* Other sharers remain: drop only this device's
                         * ports and hand utn->dev over to the first of them.
                         */
                        udp_tunnel_drop_rx_info(dev);
                        utn->dev = first->dev;
                        goto release_dev;
                }

                /* Last sharer gone: unpublish the shared state */
                info->shared->udp_tunnel_nic_info = NULL;
        }

        /* Flush before we check work, so we don't waste time adding entries
         * from the work which we will boot immediately.
         */
        udp_tunnel_nic_flush(dev, utn);

        /* Wait for the work to be done using the state, netdev core will
         * retry unregister until we give up our reference on this device.
         */
        if (utn->work_pending)
                return;

        udp_tunnel_nic_free(utn);
release_dev:
        dev->udp_tunnel_nic = NULL;
        dev_put(dev);
}

/* Netdevice notifier callback driving the UDP tunnel offload state.
 *
 * Devices without dev->udp_tunnel_nic_info are ignored.  NETDEV_REGISTER
 * creates the state and NETDEV_UNREGISTER tears it down.  NETDEV_UP and
 * NETDEV_GOING_DOWN are acted upon only for devices flagged
 * UDP_TUNNEL_NIC_INFO_OPEN_ONLY: the former requests the port list, the
 * latter flushes all tracked ports.
 *
 * Returns NOTIFY_DONE for events that were not handled, NOTIFY_OK for
 * handled ones, and the notifier_from_errno() translation of the
 * registration result for NETDEV_REGISTER.
 */
static int
udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
                               unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        const struct udp_tunnel_nic_info *info;
        struct udp_tunnel_nic *utn;

        info = dev->udp_tunnel_nic_info;
        if (!info)
                return NOTIFY_DONE;

        if (event == NETDEV_REGISTER) {
                int err;

                err = udp_tunnel_nic_register(dev);
                if (err)
                        /* Terminate the message with a newline like other
                         * kernel log messages.
                         */
                        netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d\n", err);
                return notifier_from_errno(err);
        }
        /* All other events will need the udp_tunnel_nic state */
        utn = dev->udp_tunnel_nic;
        if (!utn)
                return NOTIFY_DONE;

        if (event == NETDEV_UNREGISTER) {
                udp_tunnel_nic_unregister(dev, utn);
                return NOTIFY_OK;
        }

        /* All other events only matter if NIC has to be programmed open */
        if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
                return NOTIFY_DONE;

        if (event == NETDEV_UP) {
                /* The table is expected to be empty when the device comes up */
                WARN_ON(!udp_tunnel_nic_is_empty(dev, utn));
                udp_tunnel_get_rx_info(dev);
                return NOTIFY_OK;
        }
        if (event == NETDEV_GOING_DOWN) {
                udp_tunnel_nic_flush(dev, utn);
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

/* Notifier block routing netdevice events to
 * udp_tunnel_nic_netdevice_event(); registered in the module init function
 * and unregistered in the module exit function.
 */
static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = {
        .notifier_call = udp_tunnel_nic_netdevice_event,
};

/* Module init: create the ordered workqueue, publish the ops table under
 * rtnl and register the netdevice notifier.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be created, or the
 * notifier registration error after undoing the earlier steps.
 */
static int __init udp_tunnel_nic_init_module(void)
{
        int err;

        udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0);
        if (!udp_tunnel_nic_workqueue)
                return -ENOMEM;

        rtnl_lock();
        udp_tunnel_nic_ops = &__udp_tunnel_nic_ops;
        rtnl_unlock();

        err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block);
        if (!err)
                return 0;

        /* Registration failed: withdraw the ops and release the workqueue */
        rtnl_lock();
        udp_tunnel_nic_ops = NULL;
        rtnl_unlock();
        destroy_workqueue(udp_tunnel_nic_workqueue);

        return err;
}
late_initcall(udp_tunnel_nic_init_module);

/* Module exit: undo udp_tunnel_nic_init_module() in reverse order -
 * unregister the netdevice notifier, clear the ops pointer under rtnl and
 * destroy the workqueue.
 */
static void __exit udp_tunnel_nic_cleanup_module(void)
{
        unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block);

        rtnl_lock();
        udp_tunnel_nic_ops = NULL;
        rtnl_unlock();

        destroy_workqueue(udp_tunnel_nic_workqueue);
}
module_exit(udp_tunnel_nic_cleanup_module);

MODULE_LICENSE("GPL");

































    3 








    3 

    3 






































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
 *
 * Author(s):
 *        2011-2014 Arvid Brodin, arvid.brodin@alten.se
 *
 * Event handling for HSR and PRP devices.
 */

#include <linux/netdevice.h>
#include <net/rtnetlink.h>
#include <linux/rculist.h>
#include <linux/timer.h>
#include <linux/etherdevice.h>
#include "hsr_main.h"
#include "hsr_device.h"
#include "hsr_netlink.h"
#include "hsr_framereg.h"
#include "hsr_slave.h"

static bool hsr_slave_empty(struct hsr_priv *hsr)
{
        struct hsr_port *port;

        hsr_for_each_port(hsr, port)
                if (port->type != HSR_PT_MASTER)
                        return false;
        return true;
}

/* Netdevice notifier callback for HSR/PRP master and slave devices.
 *
 * The event's device is resolved to an HSR port: either the port attached
 * to the device, or, for an HSR master device, its HSR_PT_MASTER port.
 * Events for devices that are neither are ignored.
 *
 * Returns NOTIFY_BAD for NETDEV_PRE_TYPE_CHANGE and NOTIFY_DONE otherwise.
 *
 * NOTE(review): several cases dereference the result of
 * hsr_port_get_hsr(..., HSR_PT_MASTER) without a NULL check; presumably a
 * master port always exists while the port is attached - confirm.
 */
static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
                             void *ptr)
{
        struct hsr_port *port, *master;
        struct net_device *dev;
        struct hsr_priv *hsr;
        LIST_HEAD(list_kill);
        int mtu_max;
        int res;

        dev = netdev_notifier_info_to_dev(ptr);
        port = hsr_port_get_rtnl(dev);
        if (!port) {
                if (!is_hsr_master(dev))
                        return NOTIFY_DONE;        /* Not an HSR device */
                hsr = netdev_priv(dev);
                port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
                if (!port) {
                        /* Resend of notification concerning removed device? */
                        return NOTIFY_DONE;
                }
        } else {
                hsr = port->hsr;
        }

        switch (event) {
        case NETDEV_UP:                /* Administrative state UP */
        case NETDEV_DOWN:        /* Administrative state DOWN */
        case NETDEV_CHANGE:        /* Link (carrier) state changes */
                hsr_check_carrier_and_operstate(hsr);
                break;
        case NETDEV_CHANGENAME:
                if (is_hsr_master(dev))
                        hsr_debugfs_rename(dev);
                break;
        case NETDEV_CHANGEADDR:
                if (port->type == HSR_PT_MASTER) {
                        /* This should not happen since there's no
                         * ndo_set_mac_address() for HSR devices - i.e. not
                         * supported.
                         */
                        break;
                }

                master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);

                /* The master mirrors slave A's MAC address; propagate the
                 * change to listeners on the master device.
                 */
                if (port->type == HSR_PT_SLAVE_A) {
                        eth_hw_addr_set(master->dev, dev->dev_addr);
                        call_netdevice_notifiers(NETDEV_CHANGEADDR,
                                                 master->dev);
                }

                /* Make sure we recognize frames from ourselves in hsr_rcv() */
                port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
                /* Fall back to the master's address when there is no slave B */
                res = hsr_create_self_node(hsr,
                                           master->dev->dev_addr,
                                           port ?
                                                port->dev->dev_addr :
                                                master->dev->dev_addr);
                if (res)
                        netdev_warn(master->dev,
                                    "Could not update HSR node address.\n");
                break;
        case NETDEV_CHANGEMTU:
                if (port->type == HSR_PT_MASTER)
                        break; /* Handled in ndo_change_mtu() */
                /* A slave's MTU changed: update the master's MTU to the
                 * value hsr_get_max_mtu() now reports.
                 */
                mtu_max = hsr_get_max_mtu(port->hsr);
                master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
                WRITE_ONCE(master->dev->mtu, mtu_max);
                break;
        case NETDEV_UNREGISTER:
                if (!is_hsr_master(dev)) {
                        /* Look up the master before the port is deleted */
                        master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
                        hsr_del_port(port);
                        /* Only the master port is left: delete the HSR
                         * device as well.
                         */
                        if (hsr_slave_empty(master->hsr)) {
                                const struct rtnl_link_ops *ops;

                                ops = master->dev->rtnl_link_ops;
                                ops->dellink(master->dev, &list_kill);
                                unregister_netdevice_many(&list_kill);
                        }
                }
                break;
        case NETDEV_PRE_TYPE_CHANGE:
                /* HSR works only on Ethernet devices. Refuse slave to change
                 * its type.
                 */
                return NOTIFY_BAD;
        }

        return NOTIFY_DONE;
}

struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
{
        struct hsr_port *port;

        hsr_for_each_port(hsr, port)
                if (port->type == pt)
                        return port;
        return NULL;
}

int hsr_get_version(struct net_device *dev, enum hsr_version *ver)
{
        struct hsr_priv *hsr;

        hsr = netdev_priv(dev);
        *ver = hsr->prot_version;

        return 0;
}
EXPORT_SYMBOL(hsr_get_version);

/* Notifier block routing netdevice events to hsr_netdev_notify();
 * registered in hsr_init() and unregistered in hsr_exit().
 */
static struct notifier_block hsr_nb = {
        .notifier_call = hsr_netdev_notify,        /* Slave event notifications */
};

/* Module init: register the netdevice notifier, then the netlink interface.
 *
 * Returns 0 on success or the error of the step that failed; the notifier
 * is unregistered again if the netlink initialisation fails.
 */
static int __init hsr_init(void)
{
        int err;

        BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN);

        err = register_netdevice_notifier(&hsr_nb);
        if (err)
                return err;

        err = hsr_netlink_init();
        if (err)
                unregister_netdevice_notifier(&hsr_nb);

        return err;
}

/* Module exit: tear down the netlink interface, remove the debugfs root and
 * unregister the netdevice notifier, in that order.
 */
static void __exit hsr_exit(void)
{
        hsr_netlink_exit();
        hsr_debugfs_remove_root();
        unregister_netdevice_notifier(&hsr_nb);
}

module_init(hsr_init);
module_exit(hsr_exit);
MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver");
MODULE_LICENSE("GPL");




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 









    1 

    1 























































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/pseudo_fs.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

/*
 * Module identification strings.
 * NOTE(review): presumably consumed by MODULE_* declarations further down
 * in this file; their use is not visible in this section.
 */
#define DRIVER_VERSION        "0.3"
#define DRIVER_AUTHOR        "Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC        "VFIO - User Level meta-driver"

/* Magic number passed to init_pseudo() for the vfio pseudo-filesystem */
#define VFIO_MAGIC 0x5646494f /* "VFIO" */

/*
 * Module-wide VFIO state shared by the helpers in this file.
 */
static struct vfio {
        /* Class assigned to the struct device embedded in each vfio_device */
        struct class                        *device_class;
        /* Allocator for vfio_device::index (used in the "vfio%d" name) */
        struct ida                        device_ida;
        /* Pinned pseudo-filesystem mount whose superblock backs device inodes */
        struct vfsmount                        *vfs_mount;
        /* Pin count passed to simple_pin_fs()/simple_release_fs() */
        int                                fs_count;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
/*
 * Backing variable for the "enable_unsafe_noiommu_mode" module parameter.
 * The parameter is world-readable and writable by the owner (S_IRUGO |
 * S_IWUSR); see the parameter description below for what it enables.
 */
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
                   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode.  This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel.  If you do not know what this is for, step away. (default: false)");
#endif

/*
 * Maps a set_id (its pointer value used as the index) to the singleton
 * vfio_device_set for that id.  The xarray lock is also held around every
 * update of vfio_device_set::device_count in this file.
 */
static DEFINE_XARRAY(vfio_device_set_xa);

/*
 * Add @device to the device set identified by @set_id, creating the set if
 * this is its first member.
 *
 * @set_id is used as the lookup key (by pointer value) and is recorded in a
 * newly created set; it must not be NULL.
 *
 * Returns 0 on success, -EINVAL for a NULL @set_id, -ENOMEM if a new set
 * cannot be allocated, or the xarray error reported while publishing it.
 */
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
        struct vfio_device_set *dev_set, *candidate;
        unsigned long key = (unsigned long)set_id;

        if (WARN_ON(!set_id))
                return -EINVAL;

        /*
         * Atomically acquire a singleton object in the xarray for this set_id
         */
        xa_lock(&vfio_device_set_xa);
        dev_set = xa_load(&vfio_device_set_xa, key);
        if (!dev_set) {
                /* Nothing published yet: build a candidate with the lock dropped */
                xa_unlock(&vfio_device_set_xa);

                candidate = kzalloc(sizeof(*candidate), GFP_KERNEL);
                if (!candidate)
                        return -ENOMEM;
                candidate->set_id = set_id;
                INIT_LIST_HEAD(&candidate->device_list);
                mutex_init(&candidate->lock);

                /*
                 * Publish only if the slot is still empty; another caller
                 * may have installed a set for this key in the meantime.
                 */
                xa_lock(&vfio_device_set_xa);
                dev_set = __xa_cmpxchg(&vfio_device_set_xa, key, NULL,
                                       candidate, GFP_KERNEL);
                if (!dev_set) {
                        dev_set = candidate;
                } else {
                        /* Slot was taken or the store failed: drop ours */
                        kfree(candidate);
                        if (xa_is_err(dev_set)) {
                                xa_unlock(&vfio_device_set_xa);
                                return xa_err(dev_set);
                        }
                }
        }

        /* The member count is only changed with the xarray lock held */
        dev_set->device_count++;
        xa_unlock(&vfio_device_set_xa);

        mutex_lock(&dev_set->lock);
        device->dev_set = dev_set;
        list_add_tail(&device->dev_set_list, &dev_set->device_list);
        mutex_unlock(&dev_set->lock);
        return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);

/*
 * Detach @device from its device set, if it has one, and free the set when
 * its last member leaves.  A device without a set is left untouched.
 */
static void vfio_release_device_set(struct vfio_device *device)
{
        struct vfio_device_set *set = device->dev_set;

        if (!set)
                return;

        /* Unlink from the member list under the set's own mutex */
        mutex_lock(&set->lock);
        list_del(&device->dev_set_list);
        mutex_unlock(&set->lock);

        /* Drop the membership count; the last one out tears the set down */
        xa_lock(&vfio_device_set_xa);
        set->device_count--;
        if (set->device_count == 0) {
                __xa_erase(&vfio_device_set_xa, (unsigned long)set->set_id);
                mutex_destroy(&set->lock);
                kfree(set);
        }
        xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
        struct vfio_device *cur;
        unsigned int open_count = 0;

        lockdep_assert_held(&dev_set->lock);

        list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
                open_count += cur->open_count;
        return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
                           struct device *dev)
{
        struct vfio_device *cur;

        lockdep_assert_held(&dev_set->lock);

        list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
                if (cur->dev == dev)
                        return cur;
        return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/*
 * Drop one registration reference on @device and signal device->comp when
 * the last one goes away.
 *
 * Device reference always implies a group reference.
 */
void vfio_device_put_registration(struct vfio_device *device)
{
        if (!refcount_dec_and_test(&device->refcount))
                return;

        complete(&device->comp);
}

/*
 * Try to take a registration reference on @device.
 *
 * Returns true if the reference was taken, false if the refcount had
 * already reached zero.
 */
bool vfio_device_try_get_registration(struct vfio_device *device)
{
        bool taken = refcount_inc_not_zero(&device->refcount);

        return taken;
}

/*
 * VFIO driver API
 */
/*
 * Release helper called by vfio_put_device().
 *
 * Installed as the ->release callback of the struct device embedded in a
 * vfio_device.  Undoes what vfio_init_device() set up and frees the object.
 */
static void vfio_device_release(struct device *dev)
{
        struct vfio_device *vdev;

        vdev = container_of(dev, struct vfio_device, device);

        vfio_release_device_set(vdev);
        ida_free(&vfio.device_ida, vdev->index);

        /* Give the driver a chance to clean up its private data */
        if (vdev->ops->release)
                vdev->ops->release(vdev);

        /* Drop the inode and the filesystem pin taken for it */
        iput(vdev->inode);
        simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);

        kvfree(vdev);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
                            const struct vfio_device_ops *ops);

/*
 * Allocate a zeroed vfio_device of @size bytes and initialize it so that it
 * can be registered to vfio core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver, and must be at least
 * sizeof(struct vfio_device).
 *
 * Driver may provide an @init callback to cover device private data.
 *
 * Returns the new device, or an ERR_PTR() carrying -EINVAL (size too
 * small), -ENOMEM (allocation failed) or the initialization error.
 * Use vfio_put_device() to release the structure after success return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
                                       const struct vfio_device_ops *ops)
{
        struct vfio_device *vdev;
        int err;

        if (WARN_ON(size < sizeof(struct vfio_device)))
                return ERR_PTR(-EINVAL);

        vdev = kvzalloc(size, GFP_KERNEL);
        if (!vdev)
                return ERR_PTR(-ENOMEM);

        err = vfio_init_device(vdev, dev, ops);
        if (err) {
                kvfree(vdev);
                return ERR_PTR(err);
        }

        return vdev;
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);

/*
 * ->init_fs_context hook for the vfio pseudo-filesystem: set @fc up as a
 * pseudo filesystem tagged with VFIO_MAGIC.
 *
 * Returns 0 on success or -ENOMEM if init_pseudo() returns NULL.
 */
static int vfio_fs_init_fs_context(struct fs_context *fc)
{
        if (!init_pseudo(fc, VFIO_MAGIC))
                return -ENOMEM;

        return 0;
}

/*
 * Filesystem type for the internal "vfio" pseudo-filesystem.  It is pinned
 * by vfio_fs_inode_new() and its superblock provides the anonymous inodes
 * stored in vfio_device::inode.
 */
static struct file_system_type vfio_fs_type = {
        .name = "vfio",
        .owner = THIS_MODULE,
        .init_fs_context = vfio_fs_init_fs_context,
        .kill_sb = kill_anon_super,
};

/*
 * Pin the vfio pseudo-filesystem and allocate an anonymous inode on it.
 *
 * On success the caller owns both the inode and one filesystem pin and
 * must release them with iput() and simple_release_fs().  On failure the
 * pin is already dropped and an ERR_PTR() is returned.
 */
static struct inode *vfio_fs_inode_new(void)
{
        struct inode *inode;
        int err;

        err = simple_pin_fs(&vfio_fs_type, &vfio.vfs_mount, &vfio.fs_count);
        if (err)
                return ERR_PTR(err);

        inode = alloc_anon_inode(vfio.vfs_mount->mnt_sb);
        if (!IS_ERR(inode))
                return inode;

        /* No inode: give the pin back and hand the error to the caller */
        simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
        return inode;
}

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 *
 * Reserves a device index, pins the vfio pseudo-filesystem and allocates
 * the device inode, runs the driver's optional ->init() hook, and finally
 * initializes the embedded struct device.  Whatever was acquired here is
 * released again if a later step fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
                            const struct vfio_device_ops *ops)
{
        int ret;

        ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
        if (ret < 0) {
                dev_dbg(dev, "Error to alloc index\n");
                return ret;
        }
        device->index = ret;

        device->ops = ops;
        device->dev = dev;
        init_completion(&device->comp);

        device->inode = vfio_fs_inode_new();
        if (IS_ERR(device->inode)) {
                ret = PTR_ERR(device->inode);
                goto err_free_index;
        }

        /* The driver hook is optional; absence counts as success */
        ret = ops->init ? ops->init(device) : 0;
        if (ret)
                goto err_put_inode;

        device_initialize(&device->device);
        device->device.parent = device->dev;
        device->device.class = vfio.device_class;
        device->device.release = vfio_device_release;
        return 0;

err_put_inode:
        iput(device->inode);
        simple_release_fs(&vfio.vfs_mount, &vfio.fs_count);
err_free_index:
        /* Does nothing when no device set has been assigned */
        vfio_release_device_set(device);
        ida_free(&vfio.device_ida, device->index);
        return ret;
}

/*
 * Common registration path for vfio devices.
 *
 * @device: device previously set up by vfio_init_device()
 * @type:   group type handed to vfio_device_set_group()
 *
 * Makes sure the device belongs to a device set, names it "vfio%d" after
 * its index, attaches it to a group, adds it, and then starts registration
 * refcounting at 1.
 *
 * Returns 0 on success or a negative errno.  On failure after the group was
 * set, the group association is removed again.
 */
static int __vfio_register_dev(struct vfio_device *device,
                               enum vfio_group_type type)
{
        int ret;

        if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
                    (!device->ops->bind_iommufd ||
                     !device->ops->unbind_iommufd ||
                     !device->ops->attach_ioas ||
                     !device->ops->detach_ioas)))
                return -EINVAL;

        /*
         * If the driver doesn't specify a set then the device is added to a
         * singleton set just for itself.  The assignment can fail (e.g. on
         * allocation failure); registering a device with a NULL dev_set
         * would leave later users of device->dev_set->lock dereferencing
         * NULL, so the error is propagated instead.
         */
        if (!device->dev_set) {
                ret = vfio_assign_device_set(device, device);
                if (ret)
                        return ret;
        }

        ret = dev_set_name(&device->device, "vfio%d", device->index);
        if (ret)
                return ret;

        ret = vfio_device_set_group(device, type);
        if (ret)
                return ret;

        /*
         * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
         * restore cache coherency. It has to be checked here because it is only
         * valid for cases where we are using iommu groups.
         */
        if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
            !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
                ret = -EINVAL;
                goto err_out;
        }

        ret = vfio_device_add(device);
        if (ret)
                goto err_out;

        /* Refcounting can't start until the driver calls register */
        refcount_set(&device->refcount, 1);

        vfio_device_group_register(device);
        vfio_device_debugfs_init(device);

        return 0;
err_out:
        /* Balances vfio_device_set_group() above */
        vfio_device_remove_group(device);
        return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
        return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing.  The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
        return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Unregister @device: stop new opens, drop the registration reference that
 * was set up at register time and wait until every remaining reference has
 * been put (i.e. device->comp is completed).
 *
 * While references remain, the driver's optional ->request() callback is
 * invoked before each wait of up to 10 seconds, with a counter that
 * increases on every call.  The wait starts out interruptible; after the
 * first interruption a warning naming the blocked task is logged and all
 * further waits are uninterruptible, so this function only returns once the
 * device is no longer referenced.
 */
void vfio_unregister_group_dev(struct vfio_device *device)
{
        unsigned int i = 0;
        bool interrupted = false;
        long rc;

        /*
         * Prevent new device opened by userspace via the
         * VFIO_GROUP_GET_DEVICE_FD in the group path.
         */
        vfio_device_group_unregister(device);

        /*
         * Balances vfio_device_add() in register path, also prevents
         * new device opened by userspace in the cdev path.
         */
        vfio_device_del(device);

        /* Drop the initial reference; comp fires once the count hits zero */
        vfio_device_put_registration(device);
        rc = try_wait_for_completion(&device->comp);
        while (rc <= 0) {
                /* Ask the driver (if it cares) to get the device released */
                if (device->ops->request)
                        device->ops->request(device, i++);

                if (interrupted) {
                        /* Already interrupted once: wait uninterruptibly */
                        rc = wait_for_completion_timeout(&device->comp,
                                                         HZ * 10);
                } else {
                        rc = wait_for_completion_interruptible_timeout(
                                &device->comp, HZ * 10);
                        if (rc < 0) {
                                /* Negative rc: the wait was interrupted */
                                interrupted = true;
                                dev_warn(device->dev,
                                         "Device is currently in use, task"
                                         " \"%s\" (%d) "
                                         "blocked until device is released",
                                         current->comm, task_pid_nr(current));
                        }
                }
        }

        vfio_device_debugfs_exit(device);
        /* Balances vfio_device_set_group in register path */
        vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);

#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
        void (*pfn)(struct kvm *kvm);
        bool (*fn)(struct kvm *kvm);
        bool ret;

        lockdep_assert_held(&device->dev_set->lock);

        if (!kvm)
                return;

        pfn = symbol_get(kvm_put_kvm);
        if (WARN_ON(!pfn))
                return;

        fn = symbol_get(kvm_get_kvm_safe);
        if (WARN_ON(!fn)) {
                symbol_put(kvm_put_kvm);
                return;
        }

        ret = fn(kvm);
        symbol_put(kvm_get_kvm_safe);
        if (!ret) {
                symbol_put(kvm_put_kvm);
                return;
        }

        device->put_kvm = pfn;
        device->kvm = kvm;
}

/*
 * Release the kvm reference recorded in @device, if any, through the saved
 * put_kvm callback, drop the kvm_put_kvm symbol reference and clear
 * device->kvm.
 *
 * The caller must hold device->dev_set->lock.
 */
void vfio_device_put_kvm(struct vfio_device *device)
{
        lockdep_assert_held(&device->dev_set->lock);

        if (!device->kvm)
                return;

        /* A kvm pointer without its put callback is unexpected: warn, skip */
        if (!WARN_ON(!device->put_kvm)) {
                device->put_kvm(device->kvm);
                device->put_kvm = NULL;
                symbol_put(kvm_put_kvm);
        }

        device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
        return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

/*
 * Allocate a zeroed vfio_device_file bound to @device and initialize its
 * kvm_ref_lock.
 *
 * Returns the new object or ERR_PTR(-ENOMEM).
 */
struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
        struct vfio_device_file *df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);

        if (!df)
                return ERR_PTR(-ENOMEM);

        spin_lock_init(&df->kvm_ref_lock);
        df->device = device;

        return df;
}

/*
 * Work done when a device goes from closed to open.
 *
 * Takes a reference on the module owning the bound driver, binds the device
 * to iommufd (when df->iommufd is set) or otherwise puts the group's IOMMU
 * in use, then calls the driver's optional ->open_device().  Each step is
 * undone in reverse order if a later one fails.
 *
 * The caller must hold device->dev_set->lock.
 * Returns 0 on success, -ENODEV if the module reference cannot be taken, or
 * the error from the failing step.
 */
static int vfio_df_device_first_open(struct vfio_device_file *df)
{
        struct iommufd_ctx *iommufd = df->iommufd;
        struct vfio_device *device = df->device;
        int ret;

        lockdep_assert_held(&device->dev_set->lock);

        if (!try_module_get(device->dev->driver->owner))
                return -ENODEV;

        ret = iommufd ? vfio_df_iommufd_bind(df) :
                        vfio_device_group_use_iommu(device);
        if (ret)
                goto put_module;

        if (!device->ops->open_device)
                return 0;

        ret = device->ops->open_device(device);
        if (!ret)
                return 0;

        /* The driver refused the open: undo the IOMMU step above */
        if (iommufd)
                vfio_df_iommufd_unbind(df);
        else
                vfio_device_group_unuse_iommu(device);
put_module:
        module_put(device->dev->driver->owner);
        return ret;
}

/*
 * Work done when the last open of a device goes away; mirrors
 * vfio_df_device_first_open() in reverse order: driver ->close_device(),
 * IOMMU teardown, then the driver module reference.
 *
 * The caller must hold device->dev_set->lock.
 */
static void vfio_df_device_last_close(struct vfio_device_file *df)
{
        struct iommufd_ctx *ictx = df->iommufd;
        struct vfio_device *vdev = df->device;

        lockdep_assert_held(&vdev->dev_set->lock);

        if (vdev->ops->close_device)
                vdev->ops->close_device(vdev);

        if (!ictx)
                vfio_device_group_unuse_iommu(vdev);
        else
                vfio_df_iommufd_unbind(df);

        module_put(vdev->dev->driver->owner);
}

/*
 * Account one more open of df->device, performing the first-open work when
 * the count goes from 0 to 1.
 *
 * The caller must hold device->dev_set->lock.
 * Returns 0 on success, -EINVAL when a non-group open is attempted on an
 * already open device, or the first-open error (the count is then restored).
 */
int vfio_df_open(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;
        int ret;

        lockdep_assert_held(&device->dev_set->lock);

        /*
         * Only the group path allows the device to be opened multiple
         * times.  The device cdev path doesn't have a secure way for it.
         */
        if (!df->group && device->open_count != 0)
                return -EINVAL;

        device->open_count++;
        if (device->open_count != 1)
                return 0;

        ret = vfio_df_device_first_open(df);
        if (ret)
                device->open_count--;

        return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;

        lockdep_assert_held(&device->dev_set->lock);

        vfio_assert_device_open(device);
        if (device->open_count == 1)
                vfio_df_device_last_close(df);
        device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 *
 * Only acts when the underlying device has a bound driver with pm ops;
 * otherwise it is a no-op that reports success.
 * Return 0 on success, or -EIO (after a rate-limited message) when the
 * runtime resume fails.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
        struct device *dev = device->dev;
        int ret;

        if (!dev->driver || !dev->driver->pm)
                return 0;

        ret = pm_runtime_resume_and_get(dev);
        if (!ret)
                return 0;

        dev_info_ratelimited(dev,
                "vfio: runtime resume failed %d\n", ret);
        return -EIO;
}

/*
 * Wrapper around pm_runtime_put().
 *
 * Only acts when the underlying device has a bound driver with pm ops,
 * matching the condition used by vfio_device_pm_runtime_get().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
        struct device *dev = device->dev;

        if (!dev->driver || !dev->driver->pm)
                return;

        pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
/*
 * ->release handler for a VFIO device file.
 *
 * Closes the device through the group path when the file was opened via a
 * group, otherwise unbinds it from iommufd; then drops one registration
 * reference on the device and frees the vfio_device_file.  Always returns 0.
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *vdev = df->device;

        if (!df->group)
                vfio_df_unbind_iommufd(df);
        else
                vfio_df_group_close(df);

        vfio_device_put_registration(vdev);
        kfree(df);

        return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @device - The device; its migration_flags decide which optional states
 *           may appear on the path
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * -EINVAL is returned when either state is out of range or requires a
 * migration feature the device does not advertise, or when the requested
 * transition is blocked (the table yields VFIO_DEVICE_STATE_ERROR).
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 *
 */
int vfio_mig_get_next_state(struct vfio_device *device,
                            enum vfio_device_mig_state cur_fsm,
                            enum vfio_device_mig_state new_fsm,
                            enum vfio_device_mig_state *next_fsm)
{
        enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
        /*
         * The coding in this table requires the driver to implement the
         * following FSM arcs:
         *         RESUMING -> STOP
         *         STOP -> RESUMING
         *         STOP -> STOP_COPY
         *         STOP_COPY -> STOP
         *
         * If P2P is supported then the driver must also implement these FSM
         * arcs:
         *         RUNNING -> RUNNING_P2P
         *         RUNNING_P2P -> RUNNING
         *         RUNNING_P2P -> STOP
         *         STOP -> RUNNING_P2P
         *
         * If precopy is supported then the driver must support these additional
         * FSM arcs:
         *         RUNNING -> PRE_COPY
         *         PRE_COPY -> RUNNING
         *         PRE_COPY -> STOP_COPY
         * However, if precopy and P2P are supported together then the driver
         * must support these additional arcs beyond the P2P arcs above:
         *         PRE_COPY -> RUNNING
         *         PRE_COPY -> PRE_COPY_P2P
         *         PRE_COPY_P2P -> PRE_COPY
         *         PRE_COPY_P2P -> RUNNING_P2P
         *         PRE_COPY_P2P -> STOP_COPY
         *         RUNNING -> PRE_COPY
         *         RUNNING_P2P -> PRE_COPY_P2P
         *
         * Without P2P and precopy the driver must implement:
         *         RUNNING -> STOP
         *         STOP -> RUNNING
         *
         * The coding will step through multiple states for some combination
         * transitions; if all optional features are supported, this means the
         * following ones:
         *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
         *         PRE_COPY -> RUNNING -> RUNNING_P2P
         *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
         *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
         *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
         *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
         *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
         *         RESUMING -> STOP -> RUNNING_P2P
         *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
         *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
         *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
         *         RESUMING -> STOP -> STOP_COPY
         *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
         *         RUNNING -> RUNNING_P2P -> STOP
         *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
         *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
         *         RUNNING_P2P -> RUNNING -> PRE_COPY
         *         RUNNING_P2P -> STOP -> RESUMING
         *         RUNNING_P2P -> STOP -> STOP_COPY
         *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
         *         STOP -> RUNNING_P2P -> RUNNING
         *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
         *         STOP_COPY -> STOP -> RESUMING
         *         STOP_COPY -> STOP -> RUNNING_P2P
         *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
         *
         *  The following transitions are blocked:
         *         STOP_COPY -> PRE_COPY
         *         STOP_COPY -> PRE_COPY_P2P
         */
        /* Indexed as [current state][target state] -> next state to enter */
        static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
                [VFIO_DEVICE_STATE_STOP] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_RUNNING] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_PRE_COPY] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_STOP_COPY] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_RESUMING] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_RUNNING_P2P] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
                [VFIO_DEVICE_STATE_ERROR] = {
                        [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
                        [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
                },
        };

        /*
         * Migration feature flags a device must advertise for each state to
         * be usable.  ERROR requires every bit, so it fails the checks below
         * for any device that does not set all of them.
         */
        static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
                [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_PRE_COPY] =
                        VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
                [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
                                                   VFIO_MIGRATION_P2P |
                                                   VFIO_MIGRATION_PRE_COPY,
                [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
                [VFIO_DEVICE_STATE_RUNNING_P2P] =
                        VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
                [VFIO_DEVICE_STATE_ERROR] = ~0U,
        };

        /* An invalid current state is a caller bug, hence the WARN */
        if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
                    (state_flags_table[cur_fsm] & device->migration_flags) !=
                        state_flags_table[cur_fsm]))
                return -EINVAL;

        /* An invalid target state is rejected without a warning */
        if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
            (state_flags_table[new_fsm] & device->migration_flags) !=
                        state_flags_table[new_fsm])
                return -EINVAL;

        /*
         * Arcs touching optional and unsupported states are skipped over. The
         * driver will instead see an arc from the original state to the next
         * logical state, as per the above comment.
         *
         * ERROR must end the walk: its table row only ever leads back to
         * ERROR and its flags entry (~0U) looks "unsupported", so skipping
         * over it would spin forever on a blocked transition such as
         * STOP_COPY -> PRE_COPY.
         */
        *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
        while (*next_fsm != VFIO_DEVICE_STATE_ERROR &&
               (state_flags_table[*next_fsm] & device->migration_flags) !=
                        state_flags_table[*next_fsm])
                *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

        return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);

/*
 * Convert the driver's struct file into a FD number and return it to
 * userspace in @mig->data_fd.
 *
 * The FD is only installed after the result has been copied out to @arg.
 * On any failure the reserved FD number (if any) is released and the
 * reference on @filp is dropped with fput().
 *
 * Returns 0 on success, the get_unused_fd_flags() error, or -EFAULT.
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
                                   struct vfio_device_feature_mig_state *mig)
{
        int fd = get_unused_fd_flags(O_CLOEXEC);

        if (fd < 0) {
                fput(filp);
                return fd;
        }

        mig->data_fd = fd;
        if (copy_to_user(arg, mig, sizeof(*mig))) {
                put_unused_fd(fd);
                fput(filp);
                return -EFAULT;
        }

        fd_install(fd, filp);
        return 0;
}

/*
 * Handle the migration device-state feature: GET reports the current state,
 * SET asks the driver to move to the requested state.
 *
 * When SET yields a data file from the driver, it is turned into an FD and
 * returned through vfio_ioct_mig_return_fd().  In every other case the
 * structure is copied back with data_fd = -1.
 *
 * Returns 0 on success, -ENOTTY if the device has no mig_ops, -EFAULT on a
 * failed user copy, the vfio_check_feature() result when it is not 1, or
 * the error reported by the driver.
 */
static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
                                           u32 flags, void __user *arg,
                                           size_t argsz)
{
        size_t minsz =
                offsetofend(struct vfio_device_feature_mig_state, data_fd);
        struct vfio_device_feature_mig_state mig;
        struct file *filp = NULL;
        int ret;

        if (!device->mig_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz,
                                 VFIO_DEVICE_FEATURE_SET |
                                 VFIO_DEVICE_FEATURE_GET,
                                 sizeof(mig));
        if (ret != 1)
                return ret;

        if (copy_from_user(&mig, arg, minsz))
                return -EFAULT;

        if (flags & VFIO_DEVICE_FEATURE_GET) {
                enum vfio_device_mig_state curr_state;

                ret = device->mig_ops->migration_get_state(device,
                                                           &curr_state);
                if (ret)
                        return ret;
                mig.device_state = curr_state;
        } else {
                /* Handle the VFIO_DEVICE_FEATURE_SET */
                filp = device->mig_ops->migration_set_state(device,
                                                            mig.device_state);
                if (filp && !IS_ERR(filp))
                        return vfio_ioct_mig_return_fd(filp, arg, &mig);
        }

        /* No data file to hand out: report -1, then any driver error */
        mig.data_fd = -1;
        if (copy_to_user(arg, &mig, sizeof(mig)))
                return -EFAULT;

        return IS_ERR(filp) ? PTR_ERR(filp) : 0;
}

/*
 * GET-only feature: query the driver's migration_get_data_size() op and
 * copy the resulting stop_copy_length back to user space.
 */
static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
                                              u32 flags, void __user *arg,
                                              size_t argsz)
{
        struct vfio_device_feature_mig_data_size reply = {};
        unsigned long length;
        int rc;

        if (!device->mig_ops)
                return -ENOTTY;

        rc = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
                                sizeof(reply));
        if (rc != 1)
                return rc;

        rc = device->mig_ops->migration_get_data_size(device, &length);
        if (rc)
                return rc;

        reply.stop_copy_length = length;

        return copy_to_user(arg, &reply, sizeof(reply)) ? -EFAULT : 0;
}

/*
 * GET-only feature: report device->migration_flags to user space.
 * Returns -ENOTTY when the device has no migration ops.
 */
static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
                                               u32 flags, void __user *arg,
                                               size_t argsz)
{
        struct vfio_device_feature_migration info = {
                .flags = device->migration_flags,
        };
        int rc;

        if (!device->mig_ops)
                return -ENOTTY;

        rc = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
                                sizeof(info));
        if (rc != 1)
                return rc;

        return copy_to_user(arg, &info, sizeof(info)) ? -EFAULT : 0;
}

/**
 * vfio_combine_iova_ranges - Reduce the number of ranges in an interval tree
 * @root: interval tree holding the ranges
 * @cur_nodes: node count supplied by the caller; decremented locally for
 *             every merge performed by the general path
 * @req_nodes: number of nodes the tree should be reduced to
 *
 * While @cur_nodes exceeds @req_nodes, the two ranges that are consecutive in
 * iteration order and separated by the smallest gap are merged: the earlier
 * node's ->last is extended to the later node's ->last and the later node is
 * removed from @root. When @req_nodes is 1 a shortcut collapses everything
 * into the first node instead.
 *
 * Nodes are only unlinked from @root via interval_tree_remove(); nothing is
 * freed here.
 */
void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
                              u32 req_nodes)
{
        struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
        unsigned long min_gap, curr_gap;

        /* Special shortcut when a single range is required */
        if (req_nodes == 1) {
                unsigned long last;

                comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

                /* Empty list */
                if (WARN_ON_ONCE(!comb_start))
                        return;

                curr = comb_start;
                while (curr) {
                        last = curr->last;
                        prev = curr;
                        /* Fetch the successor before prev is unlinked */
                        curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
                        if (prev != comb_start)
                                interval_tree_remove(prev, root);
                }
                /* The surviving first node now ends where the last node did */
                comb_start->last = last;
                return;
        }

        /* Combine ranges which have the smallest gap */
        while (cur_nodes > req_nodes) {
                prev = NULL;
                min_gap = ULONG_MAX;
                curr = interval_tree_iter_first(root, 0, ULONG_MAX);
                while (curr) {
                        if (prev) {
                                curr_gap = curr->start - prev->last;
                                if (curr_gap < min_gap) {
                                        min_gap = curr_gap;
                                        comb_start = prev;
                                        comb_end = curr;
                                }
                        }
                        prev = curr;
                        curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
                }

                /* Empty list or no nodes to combine */
                if (WARN_ON_ONCE(min_gap == ULONG_MAX))
                        break;

                /* comb_start/comb_end are valid here: min_gap was updated */
                comb_start->last = comb_end->last;
                interval_tree_remove(comb_end, root);
                cur_nodes--;
        }
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);

/*
 * Upper bound on the number of ranges: as many
 * struct vfio_device_feature_dma_logging_range entries as fit into a
 * single kernel page.
 */
#define LOG_MAX_RANGES \
        (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

/*
 * Handle the DMA_LOGGING_START feature (SET only).
 *
 * Copies the control structure and its array of ranges from user space,
 * validates every range, collects the ranges in an interval tree and passes
 * that tree to the driver's log_start() op. On success the control structure
 * (whose page_size the driver receives by pointer) is copied back to user
 * space; if that copy fails, logging is stopped again.
 *
 * Returns 0 on success, or:
 *   -ENOTTY    the device has no log_ops
 *   -EFAULT    a user copy failed
 *   -EINVAL    no ranges, a zero-length range, a range not aligned to
 *              control.page_size, or overlapping ranges
 *   -E2BIG     more than LOG_MAX_RANGES ranges
 *   -ENOMEM    node array allocation failed
 *   -EOVERFLOW iova + length overflows or exceeds ULONG_MAX
 * or whatever vfio_check_feature() / log_start() report.
 */
static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
                                        u32 flags, void __user *arg,
                                        size_t argsz)
{
        size_t minsz =
                offsetofend(struct vfio_device_feature_dma_logging_control,
                            ranges);
        struct vfio_device_feature_dma_logging_range __user *ranges;
        struct vfio_device_feature_dma_logging_control control;
        struct vfio_device_feature_dma_logging_range range;
        struct rb_root_cached root = RB_ROOT_CACHED;
        struct interval_tree_node *nodes;
        u64 iova_end;
        u32 nnodes;
        u32 i;
        int ret;

        if (!device->log_ops)
                return -ENOTTY;

        ret = vfio_check_feature(flags, argsz,
                                 VFIO_DEVICE_FEATURE_SET,
                                 sizeof(control));
        if (ret != 1)
                return ret;

        if (copy_from_user(&control, arg, minsz))
                return -EFAULT;

        nnodes = control.num_ranges;
        if (!nnodes)
                return -EINVAL;

        if (nnodes > LOG_MAX_RANGES)
                return -E2BIG;

        ranges = u64_to_user_ptr(control.ranges);
        nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
                              GFP_KERNEL);
        if (!nodes)
                return -ENOMEM;

        for (i = 0; i < nnodes; i++) {
                if (copy_from_user(&range, &ranges[i], sizeof(range))) {
                        ret = -EFAULT;
                        goto end;
                }

                /*
                 * An empty range would make "last = iova + length - 1" wrap
                 * to iova - 1, i.e. an inverted interval (or the whole
                 * address space when iova is 0). Reject it up front.
                 */
                if (!range.length) {
                        ret = -EINVAL;
                        goto end;
                }

                if (!IS_ALIGNED(range.iova, control.page_size) ||
                    !IS_ALIGNED(range.length, control.page_size)) {
                        ret = -EINVAL;
                        goto end;
                }

                /* Interval tree bounds are unsigned long */
                if (check_add_overflow(range.iova, range.length, &iova_end) ||
                    iova_end > ULONG_MAX) {
                        ret = -EOVERFLOW;
                        goto end;
                }

                nodes[i].start = range.iova;
                nodes[i].last = iova_end - 1;
                if (interval_tree_iter_first(&root, nodes[i].start,
                                             nodes[i].last)) {
                        /* Range overlapping */
                        ret = -EINVAL;
                        goto end;
                }
                interval_tree_insert(nodes + i, &root);
        }

        ret = device->log_ops->log_start(device, &root, nnodes,
                                         &control.page_size);
        if (ret)
                goto end;

        if (copy_to_user(arg, &control, sizeof(control))) {
                ret = -EFAULT;
                /* User space never saw the result; undo the start */
                device->log_ops->log_stop(device);
        }

end:
        kfree(nodes);
        return ret;
}

/*
 * Handle the DMA_LOGGING_STOP feature (SET only, no payload): forward the
 * request to the driver's log_stop() op.
 */
static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
                                       u32 flags, void __user *arg,
                                       size_t argsz)
{
        int rc;

        if (!device->log_ops)
                return -ENOTTY;

        rc = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_SET, 0);
        if (rc == 1)
                rc = device->log_ops->log_stop(device);

        return rc;
}

/*
 * Callback handed to iova_bitmap_for_each(): @opaque is the vfio_device,
 * and the call is forwarded to its log_read_and_clear() op.
 */
static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
                                          unsigned long iova, size_t length,
                                          void *opaque)
{
        struct vfio_device *vdev = opaque;

        return vdev->log_ops->log_read_and_clear(vdev, iova, length, iter);
}

/*
 * Handle the DMA_LOGGING_REPORT feature (GET only).
 *
 * Validates the requested page size and IOVA window, wraps the user bitmap
 * in an iova_bitmap and lets iova_bitmap_for_each() invoke the driver's
 * log_read_and_clear() op through vfio_device_log_read_and_clear().
 */
static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
                                         u32 flags, void __user *arg,
                                         size_t argsz)
{
        const size_t in_len =
                offsetofend(struct vfio_device_feature_dma_logging_report,
                            bitmap);
        struct vfio_device_feature_dma_logging_report req;
        struct iova_bitmap *bitmap;
        u64 window_end;
        int rc;

        if (!device->log_ops)
                return -ENOTTY;

        rc = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
                                sizeof(req));
        if (rc != 1)
                return rc;

        if (copy_from_user(&req, arg, in_len))
                return -EFAULT;

        /* Page size must be a power of two and at least 4K */
        if (!is_power_of_2(req.page_size) || req.page_size < SZ_4K)
                return -EINVAL;

        /* The window must neither wrap nor exceed an unsigned long */
        if (check_add_overflow(req.iova, req.length, &window_end) ||
            window_end > ULONG_MAX)
                return -EOVERFLOW;

        bitmap = iova_bitmap_alloc(req.iova, req.length, req.page_size,
                                   u64_to_user_ptr(req.bitmap));
        if (IS_ERR(bitmap))
                return PTR_ERR(bitmap);

        rc = iova_bitmap_for_each(bitmap, device,
                                  vfio_device_log_read_and_clear);
        iova_bitmap_free(bitmap);

        return rc;
}

/*
 * Entry point for VFIO_DEVICE_FEATURE: validate the common header, then
 * dispatch on the feature number. Features not handled here are passed to
 * the driver's device_feature() op, or rejected with -EINVAL when the
 * driver has none. Handlers receive the payload pointer (arg->data) and the
 * payload size (argsz minus the header).
 */
static int vfio_ioctl_device_feature(struct vfio_device *device,
                                     struct vfio_device_feature __user *arg)
{
        const size_t hdr_len = offsetofend(struct vfio_device_feature, flags);
        const u32 known_flags = VFIO_DEVICE_FEATURE_MASK |
                                VFIO_DEVICE_FEATURE_SET |
                                VFIO_DEVICE_FEATURE_GET |
                                VFIO_DEVICE_FEATURE_PROBE;
        struct vfio_device_feature feature;
        void __user *payload;
        size_t payload_len;

        if (copy_from_user(&feature, arg, hdr_len))
                return -EFAULT;

        if (feature.argsz < hdr_len)
                return -EINVAL;

        /* Check unknown flags */
        if (feature.flags & ~known_flags)
                return -EINVAL;

        /* GET & SET are mutually exclusive except with PROBE */
        if ((feature.flags & VFIO_DEVICE_FEATURE_SET) &&
            (feature.flags & VFIO_DEVICE_FEATURE_GET) &&
            !(feature.flags & VFIO_DEVICE_FEATURE_PROBE))
                return -EINVAL;

        payload = arg->data;
        payload_len = feature.argsz - hdr_len;

        switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
        case VFIO_DEVICE_FEATURE_MIGRATION:
                return vfio_ioctl_device_feature_migration(
                        device, feature.flags, payload, payload_len);
        case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
                return vfio_ioctl_device_feature_mig_device_state(
                        device, feature.flags, payload, payload_len);
        case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
                return vfio_ioctl_device_feature_logging_start(
                        device, feature.flags, payload, payload_len);
        case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
                return vfio_ioctl_device_feature_logging_stop(
                        device, feature.flags, payload, payload_len);
        case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
                return vfio_ioctl_device_feature_logging_report(
                        device, feature.flags, payload, payload_len);
        case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
                return vfio_ioctl_device_feature_migration_data_size(
                        device, feature.flags, payload, payload_len);
        default:
                break;
        }

        /* Not a core feature: defer to the driver, if it implements any */
        if (unlikely(!device->ops->device_feature))
                return -EINVAL;
        return device->ops->device_feature(device, feature.flags, payload,
                                           payload_len);
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
                                       unsigned int cmd, unsigned long arg)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;
        void __user *uptr = (void __user *)arg;
        int ret;

        if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
                return vfio_df_ioctl_bind_iommufd(df, uptr);

        /* Paired with smp_store_release() following vfio_df_open() */
        if (!smp_load_acquire(&df->access_granted))
                return -EINVAL;

        ret = vfio_device_pm_runtime_get(device);
        if (ret)
                return ret;

        /* cdev only ioctls */
        if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
                switch (cmd) {
                case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
                        ret = vfio_df_ioctl_attach_pt(df, uptr);
                        goto out;

                case VFIO_DEVICE_DETACH_IOMMUFD_PT:
                        ret = vfio_df_ioctl_detach_pt(df, uptr);
                        goto out;
                }
        }

        switch (cmd) {
        case VFIO_DEVICE_FEATURE:
                ret = vfio_ioctl_device_feature(device, uptr);
                break;

        default:
                if (unlikely(!device->ops->ioctl))
                        ret = -EINVAL;
                else
                        ret = device->ops->ioctl(device, cmd, arg);
                break;
        }
out:
        vfio_device_pm_runtime_put(device);
        return ret;
}

/*
 * read() on a VFIO device file: forwarded to the driver's read op once
 * access has been granted; -EINVAL otherwise or when the op is missing.
 */
static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
                                     size_t count, loff_t *ppos)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        /* Paired with smp_store_release() following vfio_df_open() */
        if (smp_load_acquire(&df->access_granted) &&
            likely(device->ops->read))
                return device->ops->read(device, buf, count, ppos);

        return -EINVAL;
}

/*
 * write() on a VFIO device file: forwarded to the driver's write op once
 * access has been granted; -EINVAL otherwise or when the op is missing.
 */
static ssize_t vfio_device_fops_write(struct file *filep,
                                      const char __user *buf,
                                      size_t count, loff_t *ppos)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        /* Paired with smp_store_release() following vfio_df_open() */
        if (smp_load_acquire(&df->access_granted) &&
            likely(device->ops->write))
                return device->ops->write(device, buf, count, ppos);

        return -EINVAL;
}

/*
 * mmap() on a VFIO device file: forwarded to the driver's mmap op once
 * access has been granted; -EINVAL otherwise or when the op is missing.
 */
static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
        struct vfio_device_file *df = filep->private_data;
        struct vfio_device *device = df->device;

        /* Paired with smp_store_release() following vfio_df_open() */
        if (smp_load_acquire(&df->access_granted) &&
            likely(device->ops->mmap))
                return device->ops->mmap(device, vma);

        return -EINVAL;
}

/* File operations backing every VFIO device file */
const struct file_operations vfio_device_fops = {
        .owner = THIS_MODULE,

        /* open / close */
        .open = vfio_device_fops_cdev_open,
        .release = vfio_device_fops_release,

        /* control path */
        .unlocked_ioctl = vfio_device_fops_unl_ioctl,
        .compat_ioctl = compat_ptr_ioctl,

        /* data path, forwarded to the driver ops */
        .read = vfio_device_fops_read,
        .write = vfio_device_fops_write,
        .mmap = vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
        struct vfio_device_file *df = file->private_data;

        if (file->f_op != &vfio_device_fops)
                return NULL;
        return df->device;
}

/**
 * vfio_file_is_valid - True if the file is valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
        return vfio_group_from_file(file) ||
               vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 *
 * A file that is neither a VFIO group nor a VFIO device yields true.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
        struct vfio_group *group = vfio_group_from_file(file);
        struct vfio_device *device;

        if (group)
                return vfio_group_enforced_coherent(group);

        device = vfio_device_from_file(file);
        if (!device)
                return true;

        return device_iommu_capable(device->dev,
                                    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

/*
 * Record @kvm in the vfio_device_file behind @file, under kvm_ref_lock.
 *
 * The kvm is first recorded in the vfio_device_file, and will be propagated
 * to vfio_device::kvm when the file is bound to iommufd successfully in the
 * vfio device cdev path.
 */
static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
        struct vfio_device_file *dev_file = file->private_data;

        spin_lock(&dev_file->kvm_ref_lock);
        dev_file->kvm = kvm;
        spin_unlock(&dev_file->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
        struct vfio_group *group = vfio_group_from_file(file);

        if (group)
                vfio_group_set_kvm(group, kvm);

        if (!vfio_device_from_file(file))
                return;

        vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Grow the info capability buffer in @caps by @size bytes (rounded up to a
 * multiple of sizeof(u64)) and initialize the new capability header with
 * @id and @version. Returns a pointer to the new capability, or
 * ERR_PTR(-ENOMEM) after releasing the whole buffer when reallocation fails.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
                                               size_t size, u16 id, u16 version)
{
        struct vfio_info_cap_header *header, *tail;
        void *grown;

        /* Ensure that the next capability struct will be aligned */
        size = ALIGN(size, sizeof(u64));

        grown = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
        if (!grown) {
                kfree(caps->buf);
                caps->buf = NULL;
                caps->size = 0;
                return ERR_PTR(-ENOMEM);
        }
        caps->buf = grown;

        /* The new capability lives right behind the existing ones */
        header = grown + caps->size;

        /* Eventually copied to user buffer, zero */
        memset(header, 0, size);
        header->id = id;
        header->version = version;

        /* Walk to the end of the capability chain and link the new entry */
        tail = grown;
        while (tail->next)
                tail = grown + tail->next;

        tail->next = caps->size;
        caps->size += size;

        return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

/*
 * Add @offset to every ->next link of the capability chain in @caps.
 * Warns when @offset is not a multiple of sizeof(u64).
 */
void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
        void *base = (void *)caps->buf;
        struct vfio_info_cap_header *cap = base;

        /* Capability structs should start with proper alignment */
        WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

        while (cap->next) {
                cap->next += offset;
                /* ->next is already shifted; undo that to find the entry */
                cap = base + cap->next - offset;
        }
}
EXPORT_SYMBOL(vfio_info_cap_shift);

/*
 * Append a copy of capability @cap, @size bytes including its header, to
 * @caps. The header is set up by vfio_info_cap_add(); the bytes following
 * the header are copied from @cap. Returns 0 or the error from
 * vfio_info_cap_add().
 */
int vfio_info_add_capability(struct vfio_info_cap *caps,
                             struct vfio_info_cap_header *cap, size_t size)
{
        struct vfio_info_cap_header *dst =
                vfio_info_cap_add(caps, size, cap->id, cap->version);

        if (IS_ERR(dst))
                return PTR_ERR(dst);

        memcpy(&dst[1], &cap[1], size - sizeof(*dst));
        return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

/*
 * Validate a struct vfio_irq_set header and compute the size of the data
 * that follows it.
 *
 * @hdr: header to validate
 * @num_irqs: number of IRQs available for the selected index
 * @max_irq_type: first invalid IRQ index
 * @data_size: optional out parameter; receives count * element size
 *             (0 for VFIO_IRQ_SET_DATA_NONE). Must be non-NULL when the
 *             data type carries a payload.
 *
 * Returns 0 when the header is acceptable, -EINVAL otherwise.
 */
int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
                                       int max_irq_type, size_t *data_size)
{
        const unsigned long hdr_len = offsetofend(struct vfio_irq_set, count);
        size_t elem_size;

        if (hdr->argsz < hdr_len)
                return -EINVAL;
        if (hdr->index >= max_irq_type)
                return -EINVAL;
        /* start + count must not wrap a u32 */
        if (hdr->count >= (U32_MAX - hdr->start))
                return -EINVAL;
        if (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
                           VFIO_IRQ_SET_ACTION_TYPE_MASK))
                return -EINVAL;

        if (data_size)
                *data_size = 0;

        if (hdr->start >= num_irqs)
                return -EINVAL;
        if (hdr->start + hdr->count > num_irqs)
                return -EINVAL;

        switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
        case VFIO_IRQ_SET_DATA_NONE:
                elem_size = 0;
                break;
        case VFIO_IRQ_SET_DATA_BOOL:
                elem_size = sizeof(uint8_t);
                break;
        case VFIO_IRQ_SET_DATA_EVENTFD:
                elem_size = sizeof(int32_t);
                break;
        default:
                return -EINVAL;
        }

        /* No payload expected: nothing further to check */
        if (!elem_size)
                return 0;

        /* The user buffer must be large enough for count elements */
        if (hdr->argsz - hdr_len < hdr->count * elem_size)
                return -EINVAL;

        if (!data_size)
                return -EINVAL;

        *data_size = hdr->count * elem_size;
        return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned.  This count should not
 *                   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
                   int npage, int prot, struct page **pages)
{
        int rc;

        /* group->container cannot change while a vfio device is open */
        if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
                return -EINVAL;

        if (!device->ops->dma_unmap)
                return -EINVAL;

        if (vfio_device_has_container(device))
                return vfio_device_container_pin_pages(device, iova,
                                                       npage, prot, pages);

        /* Neither a container nor an iommufd access: nothing to pin with */
        if (!device->iommufd_access)
                return -EINVAL;

        if (iova > ULONG_MAX)
                return -EINVAL;

        /*
         * VFIO ignores the sub page offset, npages is from the start of
         * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
         * the sub page offset by doing:
         *     pages[0] + (iova % PAGE_SIZE)
         */
        rc = iommufd_access_pin_pages(
                device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
                npage * PAGE_SIZE, pages,
                (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);

        return rc ? rc : npage;
}
EXPORT_SYMBOL(vfio_pin_pages);

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned.  This count should not
 *                 be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
        if (WARN_ON(!vfio_assert_device_open(device)))
                return;
        if (WARN_ON(!device->ops->dma_unmap))
                return;

        if (vfio_device_has_container(device)) {
                vfio_device_container_unpin_pages(device, iova, npage);
                return;
        }

        /* Without an iommufd access there is nothing left to unpin */
        if (!device->iommufd_access)
                return;

        if (WARN_ON(iova > ULONG_MAX))
                return;

        iommufd_access_unpin_pages(device->iommufd_access,
                                   ALIGN_DOWN(iova, PAGE_SIZE),
                                   npage * PAGE_SIZE);
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]         : VFIO device
 * @iova [in]           : base IOVA of a user space buffer
 * @data [in]           : pointer to kernel buffer
 * @len [in]            : kernel buffer length
 * @write               : indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
                size_t len, bool write)
{
        unsigned int rw_flags;

        if (!data || !len || !vfio_assert_device_open(device))
                return -EINVAL;

        if (vfio_device_has_container(device))
                return vfio_device_container_dma_rw(device, iova,
                                                    data, len, write);

        if (!device->iommufd_access)
                return -EINVAL;

        if (iova > ULONG_MAX)
                return -EINVAL;

        /* VFIO historically tries to auto-detect a kthread */
        rw_flags = current->mm ? 0 : IOMMUFD_ACCESS_RW_KTHREAD;
        if (write)
                rw_flags |= IOMMUFD_ACCESS_RW_WRITE;

        return iommufd_access_rw(device->iommufd_access, iova, data,
                                 len, rw_flags);
}
EXPORT_SYMBOL(vfio_dma_rw);

/*
 * Module/class support
 */
/*
 * Module init: set up the device IDA, group support, virqfd, the
 * "vfio-dev" device class and the cdev support, in that order, then create
 * the debugfs root. When a step fails, the steps that already succeeded
 * (class, virqfd, group) are undone in reverse order through the error
 * labels and the failing step's error code is returned.
 */
static int __init vfio_init(void)
{
        int ret;

        ida_init(&vfio.device_ida);

        ret = vfio_group_init();
        if (ret)
                return ret;

        ret = vfio_virqfd_init();
        if (ret)
                goto err_virqfd;

        /* /sys/class/vfio-dev/vfioX */
        vfio.device_class = class_create("vfio-dev");
        if (IS_ERR(vfio.device_class)) {
                ret = PTR_ERR(vfio.device_class);
                goto err_dev_class;
        }

        ret = vfio_cdev_init(vfio.device_class);
        if (ret)
                goto err_alloc_dev_chrdev;

        vfio_debugfs_create_root();
        pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
        return 0;

        /* Each label undoes the step preceding the one that failed */
err_alloc_dev_chrdev:
        class_destroy(vfio.device_class);
        vfio.device_class = NULL;
err_dev_class:
        vfio_virqfd_exit();
err_virqfd:
        vfio_group_cleanup();
        return ret;
}

/*
 * Module exit: remove the debugfs root, destroy the device IDA, then tear
 * down cdev support, the device class, virqfd and group support, and finally
 * destroy the vfio_device_set_xa xarray.
 */
static void __exit vfio_cleanup(void)
{
        vfio_debugfs_remove_root();
        ida_destroy(&vfio.device_ida);
        vfio_cdev_cleanup();
        class_destroy(vfio.device_class);
        /* Do not leave a stale class pointer behind */
        vfio.device_class = NULL;
        vfio_virqfd_exit();
        vfio_group_cleanup();
        xa_destroy(&vfio_device_set_xa);
}

/* Module entry and exit points */
module_init(vfio_init);
module_exit(vfio_cleanup);

/* Module metadata: imported symbol namespace, version, license, author */
MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
/* NOTE(review): presumably asks for the IOMMU backends to load after vfio */
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");





































   34 








   34 
   34 




















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
#ifndef _LINUX_RSEQ_H
#define _LINUX_RSEQ_H

#ifdef CONFIG_RSEQ

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Map the event mask on the user-space ABI enum rseq_cs_flags
 * for direct mask checks: each event bit number is defined as the bit
 * number of the matching RSEQ_CS_FLAG_NO_RESTART_ON_* flag.
 */
enum rseq_event_mask_bits {
        RSEQ_EVENT_PREEMPT_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT,
        RSEQ_EVENT_SIGNAL_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT,
        RSEQ_EVENT_MIGRATE_BIT        = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT,
};

/* Bit masks derived from the bit numbers in enum rseq_event_mask_bits. */
enum rseq_event_mask {
        RSEQ_EVENT_PREEMPT        = (1U << RSEQ_EVENT_PREEMPT_BIT),
        RSEQ_EVENT_SIGNAL        = (1U << RSEQ_EVENT_SIGNAL_BIT),
        RSEQ_EVENT_MIGRATE        = (1U << RSEQ_EVENT_MIGRATE_BIT),
};

/* Set TIF_NOTIFY_RESUME on @t, but only when @t has an rseq area. */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
        if (!t->rseq)
                return;

        set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}

void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs);

/*
 * Call __rseq_handle_notify_resume() for the current task, but only when
 * it has an rseq area.
 */
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
        if (!current->rseq)
                return;

        __rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a signal event in the current task's rseq_event_mask, then run the
 * notify-resume handling for it.
 */
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
        /* __set_bit() is the non-atomic variant; keep preemption off around it */
        preempt_disable();
        __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask);
        preempt_enable();
        rseq_handle_notify_resume(ksig, regs);
}

/*
 * Record a preemption event in @t's rseq_event_mask and request
 * notify-resume handling for @t.
 *
 * rseq_preempt() requires preemption to be disabled.
 */
static inline void rseq_preempt(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Record a migration event in @t's rseq_event_mask and request
 * notify-resume handling for @t.
 *
 * rseq_migrate() requires preemption to be disabled.
 */
static inline void rseq_migrate(struct task_struct *t)
{
        __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask);
        rseq_set_notify_resume(t);
}

/*
 * Set up the rseq state of a new task @t.
 *
 * Without CLONE_VM the child inherits the rseq registration of the current
 * (parent) task. With CLONE_VM the child starts with rseq unregistered.
 */
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
        if (!(clone_flags & CLONE_VM)) {
                /* Inherit the parent's registration */
                t->rseq = current->rseq;
                t->rseq_len = current->rseq_len;
                t->rseq_sig = current->rseq_sig;
                t->rseq_event_mask = current->rseq_event_mask;
                return;
        }

        /* CLONE_VM: start without a registration */
        t->rseq = NULL;
        t->rseq_len = 0;
        t->rseq_sig = 0;
        t->rseq_event_mask = 0;
}

/* Clear every rseq field of @t, leaving it without a registration. */
static inline void rseq_execve(struct task_struct *t)
{
        t->rseq_event_mask = 0;
        t->rseq_sig = 0;
        t->rseq_len = 0;
        t->rseq = NULL;
}

#else

/*
 * CONFIG_RSEQ is not set: the rseq hooks below are empty stubs so that
 * callers need no #ifdef of their own.
 */
static inline void rseq_set_notify_resume(struct task_struct *t)
{
}
static inline void rseq_handle_notify_resume(struct ksignal *ksig,
                                             struct pt_regs *regs)
{
}
static inline void rseq_signal_deliver(struct ksignal *ksig,
                                       struct pt_regs *regs)
{
}
static inline void rseq_preempt(struct task_struct *t)
{
}
static inline void rseq_migrate(struct task_struct *t)
{
}
static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
{
}
static inline void rseq_execve(struct task_struct *t)
{
}

#endif

#ifdef CONFIG_DEBUG_RSEQ

/* Only declared here; the definition lives outside this header. */
void rseq_syscall(struct pt_regs *regs);

#else

/* CONFIG_DEBUG_RSEQ is not set: rseq_syscall() is an empty stub. */
static inline void rseq_syscall(struct pt_regs *regs)
{
}

#endif

#endif /* _LINUX_RSEQ_H */




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the IP protocol.
 *
 * Version:        @(#)ip.h        1.0.2        04/28/93
 *
 * Authors:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IP_H
#define _LINUX_IP_H

#include <linux/skbuff.h>
#include <uapi/linux/ip.h>

/* Return @skb's network header, viewed as an IPv4 header. */
static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
{
        struct iphdr *iph = (struct iphdr *)skb_network_header(skb);

        return iph;
}

/* Return @skb's inner network header, viewed as an IPv4 header. */
static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb)
{
        struct iphdr *inner = (struct iphdr *)skb_inner_network_header(skb);

        return inner;
}

/*
 * Return @skb's transport header, viewed as an IPv4 header (i.e. an IPv4
 * header carried where the transport header normally sits).
 */
static inline struct iphdr *ipip_hdr(const struct sk_buff *skb)
{
        struct iphdr *encap = (struct iphdr *)skb_transport_header(skb);

        return encap;
}

static inline unsigned int ip_transport_len(const struct sk_buff *skb)
{
        return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb);
}

/*
 * Total length of the IPv4 packet described by @iph.
 *
 * Normally this is the header's tot_len field in host byte order.  Only when
 * that field is zero and @skb is a GSO TCP packet is the length derived from
 * the skb instead: skb->len minus the network header offset.
 */
static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph)
{
        u32 len = ntohs(iph->tot_len);

        if (len)
                return len;
        if (!skb_is_gso(skb))
                return len;
        if (!skb_is_gso_tcp(skb))
                return len;

        return skb->len - skb_network_offset(skb);
}

/* iph_totlen() applied to @skb's own IPv4 header. */
static inline unsigned int skb_ip_totlen(const struct sk_buff *skb)
{
        const struct iphdr *iph = ip_hdr(skb);

        return iph_totlen(skb, iph);
}

/* The IPv4 datagram length lives in a 16-bit header field (tot_len). */
#define IP_MAX_MTU        0xFFFFU

/*
 * Store @len in @iph->tot_len in network byte order.  A length larger than
 * IP_MAX_MTU does not fit the field and is recorded as 0 instead.
 */
static inline void iph_set_totlen(struct iphdr *iph, unsigned int len)
{
        if (len > IP_MAX_MTU)
                iph->tot_len = 0;
        else
                iph->tot_len = htons(len);
}
#endif        /* _LINUX_IP_H */






































































































































   67 




   66 













































































    8 






    8 


















   15 






   15 




























































































































































































































































































































  320 







  321 

  321 



  321 

  159 
  203 




































































   69 
   69 




































  228 






  229 






































































































































































































































  139 












  139 































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/compiler.h>
#include <linux/export.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/sysctl.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
#include <linux/elf.h>
#include <linux/elf-randomize.h>
#include <linux/personality.h>
#include <linux/random.h>
#include <linux/processor.h>
#include <linux/sizes.h>
#include <linux/compat.h>
#include <linux/fsnotify.h>

#include <linux/uaccess.h>

#include <kunit/visibility.h>

#include "internal.h"
#include "swap.h"

/**
 * kfree_const - free memory unless it lives in .rodata
 * @x: pointer to the memory
 *
 * Pointers into the kernel's .rodata section are left alone; anything else
 * is handed to kfree().
 */
void kfree_const(const void *x)
{
        if (is_kernel_rodata((unsigned long)x))
                return;

        kfree(x);
}
EXPORT_SYMBOL(kfree_const);

/**
 * __kmemdup_nul - Copy @len bytes of @s into a new NUL-terminated buffer.
 * @s: The data to copy; it does not have to be NUL-terminated
 * @len: The number of bytes to copy, excluding the NUL terminator
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated, NUL-terminated copy of @s, or %NULL if the
 * allocation failed
 */
static __always_inline char *__kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        /* One extra byte for the terminator. */
        char *copy = kmalloc_track_caller(len + 1, gfp);

        if (copy) {
                memcpy(copy, s, len);
                /* Terminate unconditionally; @s may not have been. */
                copy[len] = '\0';
        }

        return copy;
}

/**
 * kstrdup - duplicate a NUL-terminated string into freshly allocated memory
 * @s: the string to duplicate, may be %NULL
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated copy of @s, or %NULL if @s is %NULL or the
 * allocation failed
 */
noinline
char *kstrdup(const char *s, gfp_t gfp)
{
        if (!s)
                return NULL;

        return __kmemdup_nul(s, strlen(s), gfp);
}
EXPORT_SYMBOL(kstrdup);

/**
 * kstrdup_const - duplicate a string unless it already lives in .rodata
 * @s: the string to duplicate
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: The result must be released with kfree_const() and must not be
 * passed to krealloc().
 *
 * Return: @s itself when it points into the .rodata section, otherwise
 * whatever kstrdup() returns for it.
 */
const char *kstrdup_const(const char *s, gfp_t gfp)
{
        return is_kernel_rodata((unsigned long)s) ? s : kstrdup(s, gfp);
}
EXPORT_SYMBOL(kstrdup_const);

/**
 * kstrndup - duplicate at most @max characters of a string
 * @s: the string to duplicate, may be %NULL
 * @max: read at most @max chars from @s
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Note: Use kmemdup_nul() instead if the size is known exactly.
 *
 * Return: newly allocated, NUL-terminated copy of @s, or %NULL if @s is
 * %NULL or the allocation failed
 */
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
        if (!s)
                return NULL;

        return __kmemdup_nul(s, strnlen(s, max), gfp);
}
EXPORT_SYMBOL(kstrndup);

/**
 * kmemdup - duplicate region of memory
 *
 * @src: start of the region to copy
 * @len: number of bytes to copy
 * @gfp: GFP mask for the allocation
 *
 * Return: newly allocated copy of @src or %NULL if the allocation failed;
 * the result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp)
{
        void *dup = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_);

        if (!dup)
                return NULL;

        memcpy(dup, src, len);
        return dup;
}
EXPORT_SYMBOL(kmemdup_noprof);

/**
 * kmemdup_array - duplicate a given array.
 *
 * @src: first element of the array to copy.
 * @count: number of elements to copy.
 * @element_size: size of one element.
 * @gfp: GFP mask for the allocation.
 *
 * The byte count is computed with size_mul() and then passed to kmemdup().
 *
 * Return: duplicated array of @src or %NULL in case of error,
 * result is physically contiguous. Use kfree() to free.
 */
void *kmemdup_array(const void *src, size_t count, size_t element_size, gfp_t gfp)
{
        size_t bytes = size_mul(element_size, count);

        return kmemdup(src, bytes, gfp);
}
EXPORT_SYMBOL(kmemdup_array);

/**
 * kvmemdup - duplicate region of memory using kvmalloc()
 *
 * @src: start of the region to copy
 * @len: number of bytes to copy
 * @gfp: GFP mask for the allocation
 *
 * Return: newly allocated copy of @src or %NULL if the allocation failed;
 * the result may be not physically contiguous. Use kvfree() to free.
 */
void *kvmemdup(const void *src, size_t len, gfp_t gfp)
{
        void *dup = kvmalloc(len, gfp);

        if (!dup)
                return NULL;

        memcpy(dup, src, len);
        return dup;
}
EXPORT_SYMBOL(kvmemdup);

/**
 * kmemdup_nul - Create a NUL-terminated string from unterminated data
 * @s: The data to stringify, may be %NULL
 * @len: The number of bytes of @s to copy
 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
 *
 * Return: newly allocated, NUL-terminated copy of @s, or %NULL if @s is
 * %NULL or the allocation failed
 */
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
        if (!s)
                return NULL;

        return __kmemdup_nul(s, len, gfp);
}
EXPORT_SYMBOL(kmemdup_nul);

/*
 * Bucket set passed to the allocation helpers in memdup_user(),
 * vmemdup_user() and memdup_user_nul().  Written once by
 * init_user_buckets() below.
 */
static kmem_buckets *user_buckets __ro_after_init;

/*
 * Create the "memdup_user" bucket set at subsys_initcall time.  The result
 * of kmem_buckets_create() is stored without being checked and the initcall
 * always reports success.
 */
static int __init init_user_buckets(void)
{
        user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL);

        return 0;
}
subsys_initcall(init_user_buckets);

/**
 * memdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the kernel copy on success; ERR_PTR(-ENOMEM) if the allocation
 * failed or ERR_PTR(-EFAULT) if the copy from user space failed.  The
 * result is physically contiguous, to be freed by kfree().
 */
void *memdup_user(const void __user *src, size_t len)
{
        void *buf;

        buf = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_user(buf, src, len))
                return buf;

        /* Part of the range could not be read: discard the copy. */
        kfree(buf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user);

/**
 * vmemdup_user - duplicate memory region from user space
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * Return: the kernel copy on success; ERR_PTR(-ENOMEM) if the allocation
 * failed or ERR_PTR(-EFAULT) if the copy from user space failed.  The
 * result may be not physically contiguous.  Use kvfree() to free.
 */
void *vmemdup_user(const void __user *src, size_t len)
{
        void *buf;

        buf = kmem_buckets_valloc(user_buckets, len, GFP_USER);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_user(buf, src, len))
                return buf;

        /* Part of the range could not be read: discard the copy. */
        kvfree(buf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(vmemdup_user);

/**
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
 * @n: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Return: newly allocated copy of @s on success.  ERR_PTR(-EFAULT) if
 * strnlen_user() reports 0, ERR_PTR(-EINVAL) if the reported length exceeds
 * @n, or the ERR_PTR() produced by memdup_user().
 */
char *strndup_user(const char __user *s, long n)
{
        long len = strnlen_user(s, n);
        char *dup;

        if (len == 0)
                return ERR_PTR(-EFAULT);
        if (len > n)
                return ERR_PTR(-EINVAL);

        dup = memdup_user(s, len);
        if (!IS_ERR(dup)) {
                /* Force termination of the last copied byte. */
                dup[len - 1] = '\0';
        }

        return dup;
}
EXPORT_SYMBOL(strndup_user);

/**
 * memdup_user_nul - duplicate memory region from user space and NUL-terminate
 *
 * @src: source address in user space
 * @len: number of bytes to copy
 *
 * The buffer is allocated one byte larger than @len and that extra byte is
 * set to NUL after the copy.
 *
 * Return: the kernel copy on success; ERR_PTR(-ENOMEM) if the allocation
 * failed or ERR_PTR(-EFAULT) if the copy from user space failed.
 */
void *memdup_user_nul(const void __user *src, size_t len)
{
        char *buf;

        buf = kmem_buckets_alloc_track_caller(user_buckets, len + 1, GFP_USER | __GFP_NOWARN);
        if (!buf)
                return ERR_PTR(-ENOMEM);

        if (!copy_from_user(buf, src, len)) {
                buf[len] = '\0';
                return buf;
        }

        kfree(buf);
        return ERR_PTR(-EFAULT);
}
EXPORT_SYMBOL(memdup_user_nul);

/*
 * Report whether the current task's stack pointer, as given by KSTK_ESP(),
 * lies within [vma->vm_start, vma->vm_end].  Returns 1 if so, 0 otherwise.
 */
int vma_is_stack_for_current(struct vm_area_struct *vma)
{
        struct task_struct * __maybe_unused t = current;

        if (vma->vm_start > KSTK_ESP(t))
                return 0;

        return vma->vm_end >= KSTK_ESP(t);
}

/*
 * Change backing file, only valid to use during initial VMA setup.
 *
 * Takes a reference on @file, installs it as @vma->vm_file and drops the
 * reference on the file that was installed before.
 */
void vma_set_file(struct vm_area_struct *vma, struct file *file)
{
        /* Changing an anonymous vma with this is illegal */
        get_file(file);
        /* After the swap, the local @file holds the previous vma->vm_file. */
        swap(vma->vm_file, file);
        fput(file);
}
EXPORT_SYMBOL(vma_set_file);

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

/*
 * Page-align @stack_top and, if the current task has PF_RANDOMIZE set, move
 * it by a random number of pages (at most STACK_RND_MASK).  The shift is
 * upwards with CONFIG_STACK_GROWSUP and downwards otherwise.
 */
unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned long shift = 0;

        if (current->flags & PF_RANDOMIZE)
                shift = (get_random_long() & STACK_RND_MASK) << PAGE_SHIFT;

#ifdef CONFIG_STACK_GROWSUP
        return PAGE_ALIGN(stack_top) + shift;
#else
        return PAGE_ALIGN(stack_top) - shift;
#endif
}

/**
 * randomize_page - Generate a random, page aligned address
 * @start:        The smallest acceptable address the caller will take.
 * @range:        The size of the area, starting at @start, within which the
 *                random address must fall.
 *
 * An unaligned @start is rounded up to the next page boundary and @range is
 * shortened by the same amount.  If @start + @range would overflow, @range
 * is capped.
 *
 * Return: A page aligned address within [start, start + range).  If the
 * range covers less than one page, the (aligned) @start is returned.
 */
unsigned long randomize_page(unsigned long start, unsigned long range)
{
        unsigned long nr_pages;
        unsigned long pick;

        if (!PAGE_ALIGNED(start)) {
                unsigned long aligned = PAGE_ALIGN(start);

                range -= aligned - start;
                start = aligned;
        }

        /* Keep start + range from wrapping around. */
        if (start > ULONG_MAX - range)
                range = ULONG_MAX - start;

        nr_pages = range >> PAGE_SHIFT;
        if (!nr_pages)
                return start;

        pick = get_random_long() % nr_pages;
        return start + (pick << PAGE_SHIFT);
}

#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
/*
 * Default (weak) brk randomisation: pick a random page in the window that
 * starts at @mm->brk.  The window is 32MB for 32-bit tasks (a !CONFIG_64BIT
 * kernel or a compat task) and 1GB otherwise.
 */
unsigned long __weak arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long window = SZ_1G;

        /* Is the current task 32bit ? */
        if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
                window = SZ_32M;

        return randomize_page(mm->brk, window);
}

/*
 * Produce a random, page-granular offset for the mmap base.  The number of
 * random bits is mmap_rnd_bits, or mmap_rnd_compat_bits for compat tasks
 * when CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS is enabled.
 */
unsigned long arch_mmap_rnd(void)
{
        unsigned long mask;

#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
        if (is_compat_task())
                mask = (1UL << mmap_rnd_compat_bits) - 1;
        else
#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
                mask = (1UL << mmap_rnd_bits) - 1;

        return (get_random_long() & mask) << PAGE_SHIFT;
}

/*
 * Decide whether the legacy mmap layout is to be used.
 *
 * Returns 1 when the task's personality has ADDR_COMPAT_LAYOUT set, or when
 * the stack rlimit is unlimited on a configuration whose stack grows down.
 * Otherwise the sysctl_legacy_va_layout setting decides.
 */
static int mmap_is_legacy(struct rlimit *rlim_stack)
{
        /*
         * On parisc the stack always grows up - so a unlimited stack should
         * not be an indicator to use the legacy memory layout.
         */
        if ((current->personality & ADDR_COMPAT_LAYOUT) ||
            (rlim_stack->rlim_cur == RLIM_INFINITY &&
             !IS_ENABLED(CONFIG_STACK_GROWSUP)))
                return 1;

        return sysctl_legacy_va_layout;
}

/*
 * Leave enough space between the mmap area and the stack to honour ulimit in
 * the face of randomisation.
 */
#define MIN_GAP                (SZ_128M)
#define MAX_GAP                (STACK_TOP / 6 * 5)

/*
 * Compute the base address of the top-down mmap area.
 *
 * @rnd is the random offset to subtract; @rlim_stack supplies the stack
 * size limit that determines how much room is left for the stack.
 */
static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
{
#ifdef CONFIG_STACK_GROWSUP
        /*
         * With a stack that grows upwards things are simple: the maximum
         * stack size is reserved at the top of the task, and mmap_base sits
         * right below it, growing downwards.
         */
        return PAGE_ALIGN_DOWN(mmap_upper_limit(rlim_stack) - rnd);
#else
        unsigned long reserve = rlim_stack->rlim_cur;
        unsigned long slack = stack_guard_gap;
        unsigned long padded;

        /* Stack randomisation may consume additional room. */
        if (current->flags & PF_RANDOMIZE)
                slack += (STACK_RND_MASK << PAGE_SHIFT);

        /* Only add the slack if the sum does not wrap (near RLIM_INFINITY). */
        padded = reserve + slack;
        if (padded > reserve)
                reserve = padded;

        /* Clamp the reservation to [MIN_GAP, MAX_GAP]. */
        if (reserve < MIN_GAP && MIN_GAP < MAX_GAP)
                reserve = MIN_GAP;
        else if (reserve > MAX_GAP)
                reserve = MAX_GAP;

        return PAGE_ALIGN(STACK_TOP - reserve - rnd);
#endif
}

/*
 * Choose the mmap layout for @mm.
 *
 * Legacy layout: mmap_base is TASK_UNMAPPED_BASE plus the random offset and
 * MMF_TOPDOWN is cleared.  Otherwise mmap_base comes from mmap_base() and
 * MMF_TOPDOWN is set.  The random offset is only drawn when the current
 * task has PF_RANDOMIZE set.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        unsigned long rnd;

        rnd = (current->flags & PF_RANDOMIZE) ? arch_mmap_rnd() : 0UL;

        if (!mmap_is_legacy(rlim_stack)) {
                mm->mmap_base = mmap_base(rnd, rlim_stack);
                set_bit(MMF_TOPDOWN, &mm->flags);
                return;
        }

        mm->mmap_base = TASK_UNMAPPED_BASE + rnd;
        clear_bit(MMF_TOPDOWN, &mm->flags);
}
#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
/*
 * Fallback layout for MMU configurations that neither select
 * CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT nor define
 * HAVE_ARCH_PICK_MMAP_LAYOUT: mmap_base is TASK_UNMAPPED_BASE, MMF_TOPDOWN
 * is cleared, and @rlim_stack is not consulted.
 */
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
        mm->mmap_base = TASK_UNMAPPED_BASE;
        clear_bit(MMF_TOPDOWN, &mm->flags);
}
#endif
#ifdef CONFIG_MMU
EXPORT_SYMBOL_IF_KUNIT(arch_pick_mmap_layout);
#endif

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true to add @pages to locked_vm, %false to subtract them
 * @task:        task whose RLIMIT_MEMLOCK is checked
 * @bypass_rlim: %true if the RLIMIT_MEMLOCK check should be skipped
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_lock is held as writer.  When the limit would be exceeded,
 * locked_vm is left unchanged.
 *
 * Return:
 * * 0       on success
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
                        struct task_struct *task, bool bypass_rlim)
{
        unsigned long cur, max_pages;
        int ret = 0;

        mmap_assert_write_locked(mm);

        cur = mm->locked_vm;
        if (!inc) {
                /* Releasing more than is accounted indicates a caller bug. */
                WARN_ON_ONCE(pages > cur);
                mm->locked_vm = cur - pages;
        } else {
                if (!bypass_rlim) {
                        max_pages = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
                        if (cur + pages > max_pages)
                                ret = -ENOMEM;
                }
                if (ret == 0)
                        mm->locked_vm = cur + pages;
        }

        pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
                 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
                 cur << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
                 ret ? " - exceeded" : "");

        return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true to add @pages to locked_vm, %false to subtract them
 *
 * Takes mmap_lock for writing around __account_locked_vm(), charging the
 * current task and skipping the rlimit check if it has CAP_IPC_LOCK.
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 *
 * Return:
 * * 0       on success, or if mm is NULL or @pages is 0
 * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
        bool bypass;
        int ret;

        /* Nothing to account. */
        if (!pages || !mm)
                return 0;

        mmap_write_lock(mm);
        bypass = capable(CAP_IPC_LOCK);
        ret = __account_locked_vm(mm, pages, inc, current, bypass);
        mmap_write_unlock(mm);

        return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long pgoff)
{
        unsigned long ret;
        struct mm_struct *mm = current->mm;
        unsigned long populate;
        LIST_HEAD(uf);

        ret = security_mmap_file(file, prot, flag);
        if (!ret)
                ret = fsnotify_mmap_perm(file, prot, pgoff >> PAGE_SHIFT, len);
        if (!ret) {
                if (mmap_write_lock_killable(mm))
                        return -EINTR;
                ret = do_mmap(file, addr, len, prot, flag, 0, pgoff, &populate,
                              &uf);
                mmap_write_unlock(mm);
                userfaultfd_unmap_complete(mm, &uf);
                if (populate)
                        mm_populate(ret, populate);
        }
        return ret;
}

/*
 * Map memory into the current process address space on behalf of a kernel
 * caller. See the comment for do_mmap() for the details of the operation
 * itself.
 *
 * Compared with do_mmap():
 *
 * a. The file position is given as a byte @offset instead of a pgoff; it is
 *    rejected if it is not page aligned or if @offset plus the page-aligned
 *    @len overflows.
 * b. mmap locking is done here, on the caller's behalf.
 * c. Userfaultfd unmap events and memory population are taken care of.
 *
 * The net effect is essentially what userland gets from calling mmap (2).
 *
 * Returns either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long vm_mmap(struct file *file, unsigned long addr,
        unsigned long len, unsigned long prot,
        unsigned long flag, unsigned long offset)
{
        unsigned long pgoff;

        if (unlikely(offset + PAGE_ALIGN(len) < offset) ||
            unlikely(offset_in_page(offset)))
                return -EINVAL;

        pgoff = offset >> PAGE_SHIFT;
        return vm_mmap_pgoff(file, addr, len, prot, flag, pgoff);
}
EXPORT_SYMBOL(vm_mmap);

/**
 * __vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Return: the allocation, or %NULL if @n * @size overflows or the
 * allocation fails.
 */
void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t total;
        bool overflow = check_mul_overflow(n, size, &total);

        if (unlikely(overflow))
                return NULL;

        return __vmalloc_noprof(total, flags);
}
EXPORT_SYMBOL(__vmalloc_array_noprof);

/**
 * vmalloc_array - allocate memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with %GFP_KERNEL.
 */
void *vmalloc_array_noprof(size_t n, size_t size)
{
        const gfp_t flags = GFP_KERNEL;

        return __vmalloc_array_noprof(n, size, flags);
}
EXPORT_SYMBOL(vmalloc_array_noprof);

/**
 * __vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 *
 * Same as __vmalloc_array() with %__GFP_ZERO added to @flags.
 */
void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags)
{
        const gfp_t zeroed = flags | __GFP_ZERO;

        return __vmalloc_array_noprof(n, size, zeroed);
}
EXPORT_SYMBOL(__vcalloc_noprof);

/**
 * vcalloc - allocate and zero memory for a virtually contiguous array.
 * @n: number of elements.
 * @size: element size.
 *
 * Same as __vmalloc_array() with %GFP_KERNEL | %__GFP_ZERO.
 */
void *vcalloc_noprof(size_t n, size_t size)
{
        const gfp_t flags = GFP_KERNEL | __GFP_ZERO;

        return __vmalloc_array_noprof(n, size, flags);
}
EXPORT_SYMBOL(vcalloc_noprof);

/*
 * Decode the anon_vma stored in @folio->mapping.
 *
 * The low PAGE_MAPPING_FLAGS bits of the mapping word act as a tag: only
 * when they equal PAGE_MAPPING_ANON is the remainder returned as the
 * anon_vma pointer; for any other tag the result is NULL.
 */
struct anon_vma *folio_anon_vma(const struct folio *folio)
{
        unsigned long word = (unsigned long)folio->mapping;

        if ((word & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON)
                return (void *)(word - PAGE_MAPPING_ANON);

        return NULL;
}

/**
 * folio_mapping - Find the mapping where this folio is stored.
 * @folio: The folio.
 *
 * For a folio in the page cache this is the mapping the folio belongs to.
 * For a folio in the swap cache it is the swap mapping the folio is stored
 * in (which is different from the mapping for the swap file or swap device
 * where the data is stored).
 *
 * It is fine to call this for folios that are in neither cache; the result
 * is then NULL.  Slab folios and folios whose mapping word carries any of
 * the PAGE_MAPPING_FLAGS bits also yield NULL.
 */
struct address_space *folio_mapping(struct folio *folio)
{
        struct address_space *as;

        /* Reached when flush_dcache_page is called on a slab page */
        if (unlikely(folio_test_slab(folio)))
                return NULL;

        if (unlikely(folio_test_swapcache(folio)))
                return swap_address_space(folio->swap);

        as = folio->mapping;

        return ((unsigned long)as & PAGE_MAPPING_FLAGS) ? NULL : as;
}
EXPORT_SYMBOL(folio_mapping);

/**
 * folio_copy - Copy the contents of one folio to another.
 * @dst: Folio to copy to.
 * @src: Folio to copy from.
 *
 * Copies @src into @dst one page at a time, for as many pages as @src has.
 * The caller must have made sure that @dst is at least as large as @src.
 * Between pages the function calls cond_resched(), so it may sleep for
 * anything larger than an order-0 folio; order-0 folios can be copied in
 * atomic context.
 */
void folio_copy(struct folio *dst, struct folio *src)
{
        long nr = folio_nr_pages(src);
        long idx = 0;

        do {
                /* Give up the CPU between pages, never before the first. */
                if (idx)
                        cond_resched();
                copy_highpage(folio_page(dst, idx), folio_page(src, idx));
        } while (++idx != nr);
}
EXPORT_SYMBOL(folio_copy);

/*
 * Copy @src into @dst page by page using copy_mc_highpage().
 *
 * Stops at the first page for which copy_mc_highpage() reports a failure
 * and returns -EHWPOISON; returns 0 once all pages of @src are copied.
 * cond_resched() is called between pages.
 */
int folio_mc_copy(struct folio *dst, struct folio *src)
{
        long nr = folio_nr_pages(src);
        long idx = 0;

        do {
                if (idx)
                        cond_resched();
                if (copy_mc_highpage(folio_page(dst, idx), folio_page(src, idx)))
                        return -EHWPOISON;
        } while (++idx != nr);

        return 0;
}
EXPORT_SYMBOL(folio_mc_copy);

/*
 * Memory overcommit and reserve tunables with their boot-time defaults.
 * The overcommit ratio and kbytes settings are file-local; their sysctl
 * handlers zero one whenever the other is written.
 */
int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS;
static int sysctl_overcommit_ratio __read_mostly = 50;
static unsigned long sysctl_overcommit_kbytes __read_mostly;
int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler for "overcommit_ratio": defers to proc_dointvec() and, after
 * a successful write, resets sysctl_overcommit_kbytes to 0.
 */
static int overcommit_ratio_handler(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec(table, write, buffer, lenp, ppos);

        if (write && err == 0)
                sysctl_overcommit_kbytes = 0;

        return err;
}

/*
 * Work function that calls percpu_counter_sync() on vm_committed_as.  The
 * work_struct argument is unused.
 */
static void sync_overcommit_as(struct work_struct *dummy)
{
        percpu_counter_sync(&vm_committed_as);
}

/*
 * sysctl handler for "overcommit_memory".
 *
 * Reads go straight to proc_dointvec_minmax().  Writes are parsed into a
 * local variable first so that the policy is only published after the
 * batch size has been recomputed and, for OVERCOMMIT_NEVER, the per-CPU
 * counters have been synchronised.
 */
static int overcommit_policy_handler(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table shadow;
        int policy = -1;
        int ret;

        if (!write)
                return proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        /*
         * With a loose policy such as OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS the
         * deviation of sync_overcommit_as can be large. When moving to the
         * strict OVERCOMMIT_NEVER the deviation has to be reduced to honour
         * "NEVER". To avoid a possible race (even though switching to
         * OVERCOMMIT_NEVER is not expected to happen often), the switch is
         * performed in this order:
         *        1. changing the batch
         *        2. sync percpu count on each CPU
         *        3. switch the policy
         */
        shadow = *table;
        shadow.data = &policy;
        ret = proc_dointvec_minmax(&shadow, write, buffer, lenp, ppos);
        if (ret || policy == -1)
                return ret;

        mm_compute_batch(policy);
        if (policy == OVERCOMMIT_NEVER)
                schedule_on_each_cpu(sync_overcommit_as);
        sysctl_overcommit_memory = policy;

        return ret;
}

/*
 * sysctl handler for "overcommit_kbytes".
 *
 * Parsing is delegated to proc_doulongvec_minmax(). After a successful
 * write the ratio-based setting is zeroed. Returns the parser's result.
 */
static int overcommit_kbytes_handler(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (write && err == 0)
                sysctl_overcommit_ratio = 0;

        return err;
}

/*
 * sysctl entries registered under "vm" by init_vm_util_sysctls().
 * All entries are mode 0644.
 */
static const struct ctl_table util_sysctl_table[] = {
        {
                /* Policy selector; values bounded by extra1/extra2. */
                .procname        = "overcommit_memory",
                .data                = &sysctl_overcommit_memory,
                .maxlen                = sizeof(sysctl_overcommit_memory),
                .mode                = 0644,
                .proc_handler        = overcommit_policy_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        {
                /* Writing this clears overcommit_kbytes (see handler). */
                .procname        = "overcommit_ratio",
                .data                = &sysctl_overcommit_ratio,
                .maxlen                = sizeof(sysctl_overcommit_ratio),
                .mode                = 0644,
                .proc_handler        = overcommit_ratio_handler,
        },
        {
                /* Writing this clears overcommit_ratio (see handler). */
                .procname        = "overcommit_kbytes",
                .data                = &sysctl_overcommit_kbytes,
                .maxlen                = sizeof(sysctl_overcommit_kbytes),
                .mode                = 0644,
                .proc_handler        = overcommit_kbytes_handler,
        },
        {
                /* Per-process reserve consulted by __vm_enough_memory(). */
                .procname        = "user_reserve_kbytes",
                .data                = &sysctl_user_reserve_kbytes,
                .maxlen                = sizeof(sysctl_user_reserve_kbytes),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
        {
                /* Reserve withheld from callers without cap_sys_admin. */
                .procname        = "admin_reserve_kbytes",
                .data                = &sysctl_admin_reserve_kbytes,
                .maxlen                = sizeof(sysctl_admin_reserve_kbytes),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
};

/*
 * Register util_sysctl_table under the "vm" sysctl directory.
 * Runs as a subsys_initcall and always returns 0.
 */
static int __init init_vm_util_sysctls(void)
{
        register_sysctl_init("vm", util_sysctl_table);
        return 0;
}
subsys_initcall(init_vm_util_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * Committed memory limit, in pages, enforced when the OVERCOMMIT_NEVER
 * policy is used.
 *
 * A non-zero sysctl_overcommit_kbytes takes precedence; otherwise the limit
 * is sysctl_overcommit_ratio percent of RAM excluding hugetlb pages. Swap
 * is added on top in both cases.
 */
unsigned long vm_commit_limit(void)
{
        unsigned long limit;

        if (!sysctl_overcommit_kbytes)
                limit = (totalram_pages() - hugetlb_total_pages())
                        * sysctl_overcommit_ratio / 100;
        else
                /* kbytes -> pages */
                limit = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);

        limit += total_swap_pages;
        return limit;
}

/*
 * Keep vm_committed_as on its own cacheline (on SMP) so that it does not
 * share one with other variables: it can be updated frequently by several
 * CPUs.
 */
struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;

/*
 * The global memory commitment made in the system can be a metric
 * that can be used to drive ballooning decisions when Linux is hosted
 * as a guest. On Hyper-V, the host implements a policy engine for dynamically
 * balancing memory across competing virtual machines that are hosted.
 * Several metrics drive this policy engine including the guest reported
 * memory commitment.
 *
 * The time cost of this is very low on small platforms. On a big
 * platform such as a 2S/36C/72T Skylake server, in the worst case where
 * vm_committed_as's spinlock is under severe contention, the time cost
 * could be about 30~40 microseconds.
 */
unsigned long vm_memory_committed(void)
{
        /*
         * Uses the summing read of the percpu counter, unlike
         * __vm_enough_memory() which uses percpu_counter_read_positive().
         */
        return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);

/*
 * Check that a process has enough memory to allocate a new virtual
 * mapping. 0 means there is enough memory for the allocation to
 * succeed and -ENOMEM implies there is not.
 *
 * We currently support three overcommit policies, which are set via the
 * vm.overcommit_memory sysctl.  See Documentation/mm/overcommit-accounting.rst
 *
 * Strict overcommit modes added 2002 Feb 26 by Alan Cox.
 * Additional code 2002 Jul 20 by Robert Love.
 *
 * cap_sys_admin is 1 if the process has admin privileges, 0 otherwise.
 *
 * Note this is a helper function intended to be used by LSMs which
 * wish to use this logic.
 */
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
        unsigned long bytes_failed;
        long limit;

        /* Account up front; the failure path below undoes this. */
        vm_acct_memory(pages);

        /* OVERCOMMIT_ALWAYS: never refuse. */
        if (sysctl_overcommit_memory == OVERCOMMIT_ALWAYS)
                return 0;

        /* OVERCOMMIT_GUESS: refuse only requests larger than RAM + swap. */
        if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
                if (pages <= totalram_pages() + total_swap_pages)
                        return 0;
                goto error;
        }

        limit = vm_commit_limit();

        /* Hold back the admin reserve from callers without cap_sys_admin. */
        if (!cap_sys_admin)
                limit -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);

        /*
         * Hold back the smaller of 1/32 of the mm's total_vm and the user
         * reserve, so a single process cannot grow so big that the user
         * can't recover.
         */
        if (mm) {
                long user_reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);

                limit -= min_t(long, mm->total_vm / 32, user_reserve);
        }

        if (percpu_counter_read_positive(&vm_committed_as) < limit)
                return 0;

error:
        bytes_failed = pages << PAGE_SHIFT;
        pr_warn_ratelimited("%s: pid: %d, comm: %s, bytes: %lu not enough memory for the allocation\n",
                            __func__, current->pid, current->comm, bytes_failed);
        vm_unacct_memory(pages);

        return -ENOMEM;
}

/**
 * get_cmdline() - copy a task's command line into a buffer.
 * @task:     the task whose command line is copied.
 * @buffer:   the destination buffer.
 * @buflen:   the capacity of @buffer. Longer command lines are truncated
 *            to this length.
 *
 * Return: the number of bytes placed in @buffer; 0 if the task has no mm
 * or its arg_end has not been set. The copy is not guaranteed to end with
 * a NUL byte.
 */
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
{
        struct mm_struct *mm = get_task_mm(task);
        unsigned long arg_start, arg_end, env_start, env_end;
        unsigned int nbytes;
        int copied = 0;

        if (!mm)
                return 0;
        if (!mm->arg_end)
                goto out_mm;        /* Shh! No looking before we're done */

        /* Take a consistent snapshot of the arg/env bounds. */
        spin_lock(&mm->arg_lock);
        arg_start = mm->arg_start;
        arg_end = mm->arg_end;
        env_start = mm->env_start;
        env_end = mm->env_end;
        spin_unlock(&mm->arg_lock);

        nbytes = arg_end - arg_start;
        if (nbytes > buflen)
                nbytes = buflen;

        copied = access_process_vm(task, arg_start, buffer, nbytes, FOLL_FORCE);

        /*
         * Only when the NUL at the end of the args has been overwritten
         * (and there is room left) do we assume the application is using
         * setproctitle(3) and look further.
         */
        if (copied <= 0 || buffer[copied - 1] == '\0' || nbytes >= buflen)
                goto out_mm;

        nbytes = strnlen(buffer, copied);
        if (nbytes < copied) {
                /* A NUL inside the copied data ends the command line. */
                copied = nbytes;
                goto out_mm;
        }

        /* No NUL found: append from the environment area, then trim. */
        nbytes = env_end - env_start;
        if (nbytes > buflen - copied)
                nbytes = buflen - copied;
        copied += access_process_vm(task, env_start,
                                    buffer + copied, nbytes,
                                    FOLL_FORCE);
        copied = strnlen(buffer, copied);

out_mm:
        mmput(mm);
        return copied;
}

/*
 * Compare the full contents (PAGE_SIZE bytes) of two pages.
 * Returns the memcmp() result. Declared __weak.
 */
int __weak memcmp_pages(struct page *page1, struct page *page2)
{
        char *va1 = kmap_local_page(page1);
        char *va2 = kmap_local_page(page2);
        int diff = memcmp(va1, va2, PAGE_SIZE);

        /* Unmap in the reverse order of mapping. */
        kunmap_local(va2);
        kunmap_local(va1);

        return diff;
}

#ifdef CONFIG_PRINTK
/**
 * mem_dump_obj - Print available provenance information
 * @object: object for which to find provenance information.
 *
 * This function uses pr_cont(), so that the caller is expected to have
 * printed out whatever preamble is appropriate.  The provenance information
 * depends on the type of object and on how much debugging is enabled.
 * For example, for a slab-cache object, the slab name is printed, and,
 * if available, the return address and stack trace from the allocation
 * and last free path of that object.
 */
void mem_dump_obj(void *object)
{
        /* Fallback description when none of the checks below match. */
        const char *desc = "non-paged memory";

        /* Either dumper returning true means nothing more to print here. */
        if (kmem_dump_obj(object) || vmalloc_dump_obj(object))
                return;

        if (is_vmalloc_addr(object))
                desc = "vmalloc memory";
        else if (virt_addr_valid(object))
                desc = "non-slab/vmalloc memory";
        else if (object == NULL)
                desc = "NULL pointer";
        else if (object == ZERO_SIZE_PTR)
                desc = "zero-size pointer";

        pr_cont(" %s\n", desc);
}
EXPORT_SYMBOL_GPL(mem_dump_obj);
#endif

/*
 * A driver might set a page logically offline -- PageOffline() -- and
 * turn the page inaccessible in the hypervisor; after that, access to page
 * content can be fatal.
 *
 * Some special PFN walkers -- i.e., /proc/kcore -- read content of random
 * pages after checking PageOffline(); however, these PFN walkers can race
 * with drivers that set PageOffline().
 *
 * page_offline_freeze()/page_offline_thaw() allows for a subsystem to
 * synchronize with such drivers, achieving that a page cannot be set
 * PageOffline() while frozen.
 *
 * page_offline_begin()/page_offline_end() is used by drivers that care about
 * such races when setting a page PageOffline().
 */
static DECLARE_RWSEM(page_offline_rwsem);

/* Take page_offline_rwsem for reading; paired with page_offline_thaw(). */
void page_offline_freeze(void)
{
        down_read(&page_offline_rwsem);
}

/* Release the read side of page_offline_rwsem taken by page_offline_freeze(). */
void page_offline_thaw(void)
{
        up_read(&page_offline_rwsem);
}

/* Take page_offline_rwsem for writing; paired with page_offline_end(). */
void page_offline_begin(void)
{
        down_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_begin);

/* Release the write side of page_offline_rwsem taken by page_offline_begin(). */
void page_offline_end(void)
{
        up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

#ifndef flush_dcache_folio
/*
 * Generic fallback (used when flush_dcache_folio is not already defined):
 * call flush_dcache_page() on every page of the folio, in index order.
 */
void flush_dcache_folio(struct folio *folio)
{
        const long npages = folio_nr_pages(folio);
        long idx = 0;

        while (idx < npages) {
                flush_dcache_page(folio_page(folio, idx));
                idx++;
        }
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif














  156 
  157 



  157 
  157 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <hyp/debug-sr.h>

#include <linux/kvm_host.h>

#include <asm/kvm_hyp.h>

/* Thin wrapper: forwards @vcpu to __debug_switch_to_guest_common(). */
void __debug_switch_to_guest(struct kvm_vcpu *vcpu)
{
        __debug_switch_to_guest_common(vcpu);
}

/* Thin wrapper: forwards @vcpu to __debug_switch_to_host_common(). */
void __debug_switch_to_host(struct kvm_vcpu *vcpu)
{
        __debug_switch_to_host_common(vcpu);
}





































































































































































































































































































    1 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PIPE_FS_I_H
#define _LINUX_PIPE_FS_I_H

#define PIPE_DEF_BUFFERS        16

#define PIPE_BUF_FLAG_LRU        0x01        /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC        0x02        /* was atomically mapped */
#define PIPE_BUF_FLAG_GIFT        0x04        /* page is a gift */
#define PIPE_BUF_FLAG_PACKET        0x08        /* read() as a packet */
#define PIPE_BUF_FLAG_CAN_MERGE        0x10        /* can merge buffers */
#define PIPE_BUF_FLAG_WHOLE        0x20        /* read() must return entire buffer or error */
#ifdef CONFIG_WATCH_QUEUE
#define PIPE_BUF_FLAG_LOSS        0x40        /* Message loss happened after this buffer */
#endif

/**
 *        struct pipe_buffer - a Linux kernel pipe buffer
 *        @page: the page containing the data for the pipe buffer
 *        @offset: offset of data inside the @page
 *        @len: length of data inside the @page
 *        @ops: operations associated with this buffer. See struct
 *              pipe_buf_operations. Set to NULL by pipe_buf_release().
 *        @flags: pipe buffer flags (PIPE_BUF_FLAG_*). See above.
 *        @private: private data owned by the ops.
 **/
struct pipe_buffer {
        struct page *page;
        unsigned int offset, len;
        const struct pipe_buf_operations *ops;
        unsigned int flags;
        unsigned long private;
};

/*
 * Type of the ring head/tail indices.
 *
 * Really only alpha needs 32-bit fields, but we might as well use them on
 * all 64-bit architectures since that's what we've historically done, and
 * it makes 'head_tail' always be a simple 'unsigned long' (two indices of
 * this type are overlaid on one unsigned long in union pipe_index).
 */
#ifdef CONFIG_64BIT
typedef unsigned int pipe_index_t;
#else
typedef unsigned short pipe_index_t;
#endif

/*
 * Overlay of the head and tail indices on a single 'unsigned long'.
 *
 * We have to declare this outside 'struct pipe_inode_info', but then we
 * can't use 'union pipe_index' for an anonymous union, so the same layout
 * is declared a second time inside 'struct pipe_inode_info'. The two
 * declarations must be kept in sync. Annoying.
 */
union pipe_index {
        unsigned long head_tail;
        struct {
                pipe_index_t head;
                pipe_index_t tail;
        };
};

/**
 *        struct pipe_inode_info - a Linux kernel pipe
 *        @mutex: mutex protecting the whole thing
 *        @rd_wait: reader wait point in case of empty pipe
 *        @wr_wait: writer wait point in case of full pipe
 *        @head: The point of buffer production
 *        @tail: The point of buffer consumption
 *        @head_tail: unsigned long union of @head and @tail
 *        @note_loss: The next read() should insert a data-lost message
 *                    (only present with CONFIG_WATCH_QUEUE)
 *        @max_usage: The maximum number of slots that may be used in the ring
 *        @ring_size: total number of buffers (should be a power of 2)
 *        @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
 *        @tmp_page: cached released pages (array of two)
 *        @readers: number of current readers of this pipe
 *        @writers: number of current writers of this pipe
 *        @files: number of struct file referring this pipe (protected by ->i_lock)
 *        @r_counter: reader counter
 *        @w_counter: writer counter
 *        @poll_usage: is this pipe used for epoll, which has crazy wakeups?
 *        @fasync_readers: reader side fasync
 *        @fasync_writers: writer side fasync
 *        @bufs: the circular array of pipe buffers
 *        @user: the user who created this pipe
 *        @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 *                      (only present with CONFIG_WATCH_QUEUE)
 **/
struct pipe_inode_info {
        struct mutex mutex;
        wait_queue_head_t rd_wait, wr_wait;

        /* This has to match the 'union pipe_index' above */
        union {
                unsigned long head_tail;
                struct {
                        pipe_index_t head;
                        pipe_index_t tail;
                };
        };

        unsigned int max_usage;
        unsigned int ring_size;
        unsigned int nr_accounted;
        unsigned int readers;
        unsigned int writers;
        unsigned int files;
        unsigned int r_counter;
        unsigned int w_counter;
        bool poll_usage;
#ifdef CONFIG_WATCH_QUEUE
        bool note_loss;
#endif
        struct page *tmp_page[2];
        struct fasync_struct *fasync_readers;
        struct fasync_struct *fasync_writers;
        struct pipe_buffer *bufs;
        struct user_struct *user;
#ifdef CONFIG_WATCH_QUEUE
        struct watch_queue *watch_queue;
#endif
};

/*
 * Note on the nesting of these functions:
 *
 * ->confirm()
 *        ->try_steal()
 *
 * That is, ->try_steal() must be called on a confirmed buffer.  See below for
 * the meaning of each operation.  Also see the kerneldoc in fs/pipe.c for the
 * pipe and generic variants of these hooks.
 */
struct pipe_buf_operations {
        /*
         * ->confirm() verifies that the data in the pipe buffer is there
         * and that the contents are good. If the pages in the pipe belong
         * to a file system, we may need to wait for IO completion in this
         * hook. Returns 0 for good, or a negative error value in case of
         * error.  If not present all pages are considered good
         * (pipe_buf_confirm() returns 0 when this hook is NULL).
         */
        int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * When the contents of this pipe buffer has been completely
         * consumed by a reader, ->release() is called.
         * pipe_buf_release() calls this hook unconditionally.
         */
        void (*release)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * Attempt to take ownership of the pipe buffer and its contents.
         * ->try_steal() returns %true for success, in which case the contents
         * of the pipe (the buf->page) is locked and now completely owned by the
         * caller. The page may then be transferred to a different mapping, the
         * most often used case is insertion into different file address space
         * cache. If not present, pipe_buf_try_steal() returns %false.
         */
        bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);

        /*
         * Get a reference to the pipe buffer. Returns %true if the
         * reference was obtained. pipe_buf_get() calls this hook
         * unconditionally.
         */
        bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
};

/**
 * pipe_has_watch_queue - Check whether the pipe is a watch_queue,
 * i.e. it was created with O_NOTIFICATION_PIPE
 * @pipe: The pipe to check
 *
 * Return: true if pipe is a watch queue, false otherwise.
 */
static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe)
{
#ifdef CONFIG_WATCH_QUEUE
        /* The field only exists when watch queues are configured in. */
        if (pipe->watch_queue)
                return true;
#endif
        return false;
}

/**
 * pipe_occupancy - Return number of slots used in the pipe
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 */
static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
{
        return (pipe_index_t)(head - tail);
}

/**
 * pipe_empty - Return true if the pipe is empty
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 */
static inline bool pipe_empty(unsigned int head, unsigned int tail)
{
        /* Empty means no slot is occupied. */
        return pipe_occupancy(head, tail) == 0;
}

/**
 * pipe_full - Return true if the pipe is full
 * @head: The pipe ring head pointer
 * @tail: The pipe ring tail pointer
 * @limit: The maximum amount of slots available.
 */
static inline bool pipe_full(unsigned int head, unsigned int tail,
                             unsigned int limit)
{
        const unsigned int used = pipe_occupancy(head, tail);

        /* Full once the occupied slot count reaches the limit. */
        return limit <= used;
}

/**
 * pipe_is_full - Return true if the pipe is full
 * @pipe: the pipe
 */
static inline bool pipe_is_full(const struct pipe_inode_info *pipe)
{
        /* Same test as pipe_full(), with the pipe's own limit. */
        return pipe_occupancy(pipe->head, pipe->tail) >= pipe->max_usage;
}

/**
 * pipe_is_empty - Return true if the pipe is empty
 * @pipe: the pipe
 */
static inline bool pipe_is_empty(const struct pipe_inode_info *pipe)
{
        /* Same test as pipe_empty(), on the pipe's own indices. */
        return !pipe_occupancy(pipe->head, pipe->tail);
}

/**
 * pipe_buf_usage - Return how many pipe buffers are in use
 * @pipe: the pipe
 */
static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe)
{
        const unsigned int used = pipe_occupancy(pipe->head, pipe->tail);

        return used;
}

/**
 * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring
 * @pipe: The pipe to access
 * @slot: The slot of interest
 */
static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
                                           unsigned int slot)
{
        /* Wrap the slot number into the ring by masking with ring_size - 1. */
        const unsigned int mask = pipe->ring_size - 1;

        return &pipe->bufs[slot & mask];
}

/**
 * pipe_head_buf - Return the pipe buffer at the head of the pipe ring
 * @pipe: The pipe to access
 */
static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe)
{
        const unsigned int head_slot = pipe->head;

        return pipe_buf(pipe, head_slot);
}

/**
 * pipe_buf_get - get a reference to a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Return: %true if the reference was successfully obtained.
 */
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
                                struct pipe_buffer *buf)
{
        return buf->ops->get(pipe, buf);
}

/**
 * pipe_buf_release - put a reference to a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 */
static inline void pipe_buf_release(struct pipe_inode_info *pipe,
                                    struct pipe_buffer *buf)
{
        /* Save the ops table: buf->ops is cleared before ->release() runs. */
        const struct pipe_buf_operations *saved_ops = buf->ops;

        buf->ops = NULL;
        saved_ops->release(pipe, buf);
}

/**
 * pipe_buf_confirm - verify contents of the pipe buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to confirm
 */
static inline int pipe_buf_confirm(struct pipe_inode_info *pipe,
                                   struct pipe_buffer *buf)
{
        if (!buf->ops->confirm)
                return 0;
        return buf->ops->confirm(pipe, buf);
}

/**
 * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 */
static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        if (!buf->ops->try_steal)
                return false;
        return buf->ops->try_steal(pipe, buf);
}

/* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
   memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
#define PIPE_SIZE                PAGE_SIZE

/* Pipe lock and unlock operations */
void pipe_lock(struct pipe_inode_info *);
void pipe_unlock(struct pipe_inode_info *);
void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);

/* Wait for a pipe to be readable/writable while dropping the pipe lock */
void pipe_wait_readable(struct pipe_inode_info *);
void pipe_wait_writable(struct pipe_inode_info *);

struct pipe_inode_info *alloc_pipe_info(void);
void free_pipe_info(struct pipe_inode_info *);

/* Generic pipe buffer ops functions */
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);

extern const struct pipe_buf_operations nosteal_pipe_buf_ops;

unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new);
bool too_many_pipe_buffers_soft(unsigned long user_bufs);
bool too_many_pipe_buffers_hard(unsigned long user_bufs);
bool pipe_is_unprivileged_user(void);

/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
long pipe_fcntl(struct file *, unsigned int, unsigned int arg);
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);

int create_pipe_files(struct file **, int);
unsigned int round_pipe_size(unsigned int size);

#endif
































































































    1 






















    1 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/drivers/base/map.c
 *
 * (C) Copyright Al Viro 2002,2003
 *
 * NOTE: data structure needs to be changed.  It works, but for large dev_t
 * it will be too slow.  It is isolated, though, so these changes will be
 * local to that file.
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/kdev_t.h>
#include <linux/kobject.h>
#include <linux/kobj_map.h>

/*
 * A map from device numbers to probe callbacks.
 *
 * probes[] holds 255 singly linked chains, selected by MAJOR(dev) % 255.
 * kobj_map() inserts entries so that each chain is ordered by ascending
 * range. lock points to the mutex passed to kobj_map_init() and is held
 * while chains are walked or modified.
 */
struct kobj_map {
        struct probe {
                struct probe *next;        /* next entry in the same chain */
                dev_t dev;                /* first device number covered */
                unsigned long range;        /* number of device numbers covered */
                struct module *owner;        /* pinned around ->get() by kobj_lookup() */
                kobj_probe_t *get;        /* probe callback */
                int (*lock)(dev_t, void *);        /* optional; < 0 skips the entry */
                void *data;                /* passed to ->lock() and ->get() */
        } *probes[255];
        struct mutex *lock;
};

/*
 * Register a probe for the device numbers [dev, dev + range - 1].
 *
 * One probe entry is allocated per major number spanned (at most 255, in a
 * single array allocation) and each is linked into its chain, keeping the
 * chain ordered by ascending range. Returns 0 on success or -ENOMEM if the
 * allocation fails.
 */
int kobj_map(struct kobj_map *domain, dev_t dev, unsigned long range,
             struct module *module, kobj_probe_t *probe,
             int (*lock)(dev_t, void *), void *data)
{
        unsigned int nr_majors = MAJOR(dev + range - 1) - MAJOR(dev) + 1;
        unsigned int major = MAJOR(dev);
        unsigned int k;
        struct probe *entries;

        if (nr_majors > 255)
                nr_majors = 255;

        entries = kmalloc_array(nr_majors, sizeof(struct probe), GFP_KERNEL);
        if (!entries)
                return -ENOMEM;

        /* Fill in every entry before any of them becomes visible. */
        for (k = 0; k < nr_majors; k++) {
                struct probe *e = &entries[k];

                e->owner = module;
                e->get = probe;
                e->lock = lock;
                e->dev = dev;
                e->range = range;
                e->data = data;
        }

        mutex_lock(domain->lock);
        for (k = 0; k < nr_majors; k++, major++) {
                struct probe *e = &entries[k];
                struct probe **link = &domain->probes[major % 255];

                /* Skip past entries with a smaller range. */
                while (*link && (*link)->range < range)
                        link = &(*link)->next;
                e->next = *link;
                *link = e;
        }
        mutex_unlock(domain->lock);

        return 0;
}

/*
 * Remove the registration matching exactly (@dev, @range).
 *
 * For each major number spanned (at most 255), the first matching entry is
 * unlinked from its chain. The first entry unlinked overall is passed to
 * kfree() after the lock is dropped.
 */
void kobj_unmap(struct kobj_map *domain, dev_t dev, unsigned long range)
{
        unsigned int nr_majors = MAJOR(dev + range - 1) - MAJOR(dev) + 1;
        unsigned int major = MAJOR(dev);
        unsigned int k;
        struct probe *first_hit = NULL;

        if (nr_majors > 255)
                nr_majors = 255;

        mutex_lock(domain->lock);
        for (k = 0; k < nr_majors; k++, major++) {
                struct probe **link = &domain->probes[major % 255];

                while (*link) {
                        struct probe *cur = *link;

                        if (cur->dev == dev && cur->range == range) {
                                *link = cur->next;
                                if (!first_hit)
                                        first_hit = cur;
                                break;
                        }
                        link = &cur->next;
                }
        }
        mutex_unlock(domain->lock);

        kfree(first_hit);
}

/*
 * Look up the kobject for @dev.
 *
 * Walks the chain for MAJOR(dev) under domain->lock and calls the probe of
 * the first usable entry covering @dev, with the lock dropped. *index is
 * set to the offset of @dev within that entry's range before the probe is
 * called (the probe also receives @index). If the probe returns NULL the
 * walk is restarted; 'best' remembers range - 1 of the last entry tried, so
 * a restarted walk stops at the first covering entry that is not strictly
 * narrower. Returns the kobject from the probe, or NULL if none is found.
 */
struct kobject *kobj_lookup(struct kobj_map *domain, dev_t dev, int *index)
{
        struct kobject *kobj;
        struct probe *p;
        unsigned long best = ~0UL;

retry:
        mutex_lock(domain->lock);
        for (p = domain->probes[MAJOR(dev) % 255]; p; p = p->next) {
                struct kobject *(*probe)(dev_t, int *, void *);
                struct module *owner;
                void *data;

                /* Entry does not cover dev. */
                if (p->dev > dev || p->dev + p->range - 1 < dev)
                        continue;
                /* Not narrower than what was already tried: give up. */
                if (p->range - 1 >= best)
                        break;
                if (!try_module_get(p->owner))
                        continue;
                /* Copy what is needed; p is not touched after unlocking. */
                owner = p->owner;
                data = p->data;
                probe = p->get;
                best = p->range - 1;
                *index = dev - p->dev;
                if (p->lock && p->lock(dev, data) < 0) {
                        module_put(owner);
                        continue;
                }
                mutex_unlock(domain->lock);
                kobj = probe(dev, index, data);
                /* Currently ->owner protects _only_ ->probe() itself. */
                module_put(owner);
                if (kobj)
                        return kobj;
                goto retry;
        }
        mutex_unlock(domain->lock);
        return NULL;
}

/*
 * Allocate a kobj_map whose 255 chains all start with one shared catch-all
 * entry (dev 1, maximal range) that calls @base_probe. @lock is stored as
 * the map's mutex. Returns the new map, or NULL if either allocation fails.
 */
struct kobj_map *kobj_map_init(kobj_probe_t *base_probe, struct mutex *lock)
{
        struct kobj_map *map = kmalloc(sizeof(struct kobj_map), GFP_KERNEL);
        struct probe *fallback = kzalloc(sizeof(*fallback), GFP_KERNEL);
        int slot;

        if (!map || !fallback) {
                /* Free whichever allocation did succeed. */
                kfree(map);
                kfree(fallback);
                return NULL;
        }

        fallback->dev = 1;
        fallback->range = ~0;
        fallback->get = base_probe;

        for (slot = 0; slot < 255; slot++)
                map->probes[slot] = fallback;
        map->lock = lock;

        return map;
}














































































































   48 



































    9 













  858 







  863 











  836 


    4 







   66 



  197 
  162 










  190 


   29 








   54 



  186 

































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_UACCESS_H__
#define __LINUX_UACCESS_H__

#include <linux/fault-inject-usercopy.h>
#include <linux/instrumented.h>
#include <linux/minmax.h>
#include <linux/nospec.h>
#include <linux/sched.h>
#include <linux/ucopysize.h>

#include <asm/uaccess.h>

/*
 * Architectures that support memory tagging (assigning tags to memory regions,
 * embedding these tags into addresses that point to these memory regions, and
 * checking that the memory and the pointer tags match on memory accesses)
 * redefine this macro to strip tags from pointers.
 *
 * Passing down an mm_struct (see untagged_addr_remote()) allows untagging
 * rules to be defined on a per-process basis.
 *
 * It is defined as a no-op, evaluating to @addr unchanged, for architectures
 * that don't support memory tagging.
 */
#ifndef untagged_addr
#define untagged_addr(addr) (addr)
#endif

/*
 * Generic untagged_addr_remote(): asserts that @mm's mmap lock is held via
 * mmap_assert_locked() and then evaluates to untagged_addr(@addr).  Only
 * used when the architecture has not supplied its own definition.
 */
#ifndef untagged_addr_remote
#define untagged_addr_remote(mm, addr)        ({                \
        mmap_assert_locked(mm);                                \
        untagged_addr(addr);                                \
})
#endif

/*
 * can_do_masked_user_access() is 1 when the architecture defines
 * masked_user_access_begin, and 0 otherwise.  In the latter case the
 * fallbacks below make masked_user_access_begin() expand to NULL and
 * mask_user_address() pass its argument through unchanged, so callers can
 * be written without #ifdefs.
 */
#ifdef masked_user_access_begin
 #define can_do_masked_user_access() 1
#else
 #define can_do_masked_user_access() 0
 #define masked_user_access_begin(src) NULL
 #define mask_user_address(src) (src)
#endif

/*
 * Architectures should provide two primitives (raw_copy_{to,from}_user())
 * and get rid of their private instances of copy_{to,from}_user() and
 * __copy_{to,from}_user{,_inatomic}().
 *
 * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and
 * return the amount left to copy.  They should assume that access_ok() has
 * already been checked (and succeeded); they should *not* zero-pad anything.
 * No KASAN or object size checks either - those belong here.
 *
 * Both of these functions should attempt to copy size bytes starting at from
 * into the area starting at to.  They must not fetch or store anything
 * outside of those areas.  Return value must be between 0 (everything
 * copied successfully) and size (nothing copied).
 *
 * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting
 * at to must become equal to the bytes fetched from the corresponding area
 * starting at from.  All data past to + size - N must be left unmodified.
 *
 * If copying succeeds, the return value must be 0.  If some data cannot be
 * fetched, it is permitted to copy less than had been fetched; the only
 * hard requirement is that not storing anything at all (i.e. returning size)
 * should happen only when nothing could be copied.  In other words, you don't
 * have to squeeze as much as possible - it is allowed, but not necessary.
 *
 * For raw_copy_from_user(), 'to' always points to kernel memory and no faults
 * on store should happen.  Interpretation of 'from' is affected by set_fs().
 * For raw_copy_to_user() it's the other way round.
 *
 * Both can be inlined - it's up to each architecture whether it wants to bother
 * with that.  They should not be used directly; they are used to implement
 * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user_inatomic())
 * that are used instead.  Out of those, __... ones are inlined.  Plain
 * copy_{to,from}_user() might or might not be inlined.  If you want them
 * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER.
 *
 * NOTE: only copy_from_user() zero-pads the destination in case of short copy.
 * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything
 * at all; their callers absolutely must check the return value.
 *
 * Biarch ones should also provide raw_copy_in_user() - similar to the above,
 * but both source and destination are __user pointers (affected by set_fs()
 * as usual) and both source and destination can trigger faults.
 */

/*
 * Copy @n bytes from the user pointer @from into the kernel buffer @to.
 *
 * No access_ok() check, no might_fault() annotation and no zero-padding is
 * done here.  The return value is whatever raw_copy_from_user() reports,
 * i.e. the number of bytes left uncopied, and callers must check it.
 */
static __always_inline __must_check unsigned long
__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
{
        unsigned long left;

        instrument_copy_from_user_before(to, from, n);
        check_object_size(to, n, false);

        left = raw_copy_from_user(to, from, n);

        instrument_copy_from_user_after(to, from, n, left);

        return left;
}

/*
 * Copy @n bytes from the user pointer @from into the kernel buffer @to,
 * without an access_ok() check and without zero-padding on a short copy.
 *
 * Returns the number of bytes left uncopied; when usercopy fault injection
 * triggers, nothing is copied and @n is returned.
 */
static __always_inline __must_check unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();
        instrument_copy_from_user_before(to, from, n);

        if (!should_fail_usercopy()) {
                check_object_size(to, n, false);
                left = raw_copy_from_user(to, from, n);
                instrument_copy_from_user_after(to, from, n, left);
        }

        return left;
}

/**
 * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
 * @to:   Destination address, in user space.
 * @from: Source address, in kernel space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.
 *
 * Copy data from kernel space to user space.  Caller must check
 * the specified block with access_ok() before calling this function.
 * The caller should also make sure he pins the user space address
 * so that we don't result in page fault and sleep.
 */
static __always_inline __must_check unsigned long
__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
{
        /* Injected failure: report the whole request as uncopied. */
        unsigned long left = n;

        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                left = raw_copy_to_user(to, from, n);
        }

        return left;
}

/*
 * Copy @n bytes from the kernel buffer @from to the user pointer @to without
 * an access_ok() check.
 *
 * Returns the number of bytes left uncopied; when usercopy fault injection
 * triggers, nothing is copied and @n is returned.
 */
static __always_inline __must_check unsigned long
__copy_to_user(void __user *to, const void *from, unsigned long n)
{
        unsigned long left = n;

        might_fault();

        if (!should_fail_usercopy()) {
                instrument_copy_to_user(to, from, n);
                check_object_size(from, n, true);
                left = raw_copy_to_user(to, from, n);
        }

        return left;
}

/*
 * Architectures that #define INLINE_COPY_{TO,FROM}_USER use these functions
 * directly in the normal copy_to/from_user(), the other ones go
 * through an extern _copy_to/from_user(), which expands the same code
 * here.
 *
 * Rust code always uses the extern definition.
 *
 * _inline_copy_from_user() copies @n bytes from the user pointer @from to
 * the kernel buffer @to and returns the number of bytes NOT copied (0 on
 * complete success).  On any short copy the uncopied tail of @to is
 * zero-filled before returning.
 */
static inline __must_check unsigned long
_inline_copy_from_user(void *to, const void __user *from, unsigned long n)
{
        /* Start at n so that an early "goto fail" zeroes all of @to. */
        unsigned long res = n;
        might_fault();
        if (should_fail_usercopy())
                goto fail;
        /* Either mask the user address, or validate the range explicitly. */
        if (can_do_masked_user_access())
                from = mask_user_address(from);
        else {
                if (!access_ok(from, n))
                        goto fail;
                /*
                 * Ensure that bad access_ok() speculation will not
                 * lead to nasty side effects *after* the copy is
                 * finished:
                 */
                barrier_nospec();
        }
        instrument_copy_from_user_before(to, from, n);
        res = raw_copy_from_user(to, from, n);
        instrument_copy_from_user_after(to, from, n, res);
        if (likely(!res))
                return 0;
fail:
        /* Zero the last res bytes of @to, i.e. the part that was not copied. */
        memset(to + (n - res), 0, res);
        return res;
}
/* Out-of-line variant, defined outside this header. */
extern __must_check unsigned long
_copy_from_user(void *, const void __user *, unsigned long);

/*
 * Copy @n bytes from the kernel buffer @from to the user pointer @to.
 *
 * Returns the number of bytes NOT copied.  That is @n itself when fault
 * injection triggers or access_ok() rejects the destination range, and
 * otherwise whatever raw_copy_to_user() reports.
 */
static inline __must_check unsigned long
_inline_copy_to_user(void __user *to, const void *from, unsigned long n)
{
        might_fault();

        if (should_fail_usercopy())
                return n;
        if (!access_ok(to, n))
                return n;

        instrument_copy_to_user(to, from, n);
        return raw_copy_to_user(to, from, n);
}
/* Out-of-line variant, defined outside this header. */
extern __must_check unsigned long
_copy_to_user(void __user *, const void *, unsigned long);

/*
 * Copy @n bytes from the user pointer @from to the kernel buffer @to.
 *
 * Returns the number of bytes that could not be copied.  If
 * check_copy_size() rejects the request, nothing is copied and @n is
 * returned.  Otherwise the work is done by the inline or the out-of-line
 * helper, depending on INLINE_COPY_FROM_USER.
 */
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
        if (check_copy_size(to, n, false)) {
#ifdef INLINE_COPY_FROM_USER
                n = _inline_copy_from_user(to, from, n);
#else
                n = _copy_from_user(to, from, n);
#endif
        }
        return n;
}

/*
 * Copy @n bytes from the kernel buffer @from to the user pointer @to.
 *
 * Returns the number of bytes that could not be copied.  If
 * check_copy_size() rejects the request, nothing is copied and @n is
 * returned.  Otherwise the work is done by the inline or the out-of-line
 * helper, depending on INLINE_COPY_TO_USER.
 */
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
        if (check_copy_size(from, n, true)) {
#ifdef INLINE_COPY_TO_USER
                n = _inline_copy_to_user(to, from, n);
#else
                n = _copy_to_user(to, from, n);
#endif
        }
        return n;
}

#ifndef copy_mc_to_kernel
/*
 * Without arch opt-in this generic copy_mc_to_kernel() will not handle
 * #MC (or arch equivalent) during source read.
 *
 * It is a plain memcpy() of @cnt bytes from @src to @dst and always
 * returns 0.
 */
static inline unsigned long __must_check
copy_mc_to_kernel(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
        return 0;
}
#endif

/* Raise the current task's pagefault_disabled counter by one. */
static __always_inline void pagefault_disabled_inc(void)
{
        current->pagefault_disabled += 1;
}

/* Lower the current task's pagefault_disabled counter by one. */
static __always_inline void pagefault_disabled_dec(void)
{
        current->pagefault_disabled -= 1;
}

/*
 * These routines enable/disable the pagefault handler. If disabled, it will
 * not take any locks and go straight to the fixup table.
 *
 * User access methods will not sleep when called from a pagefault_disabled()
 * environment.
 *
 * pagefault_disable() increments the per-task counter, so calls nest; each
 * one must be paired with a pagefault_enable().
 */
static inline void pagefault_disable(void)
{
        pagefault_disabled_inc();
        /*
         * make sure to have issued the store before a pagefault
         * can hit.
         */
        barrier();
}

/*
 * Undo one pagefault_disable(): decrement the per-task counter after a
 * compiler barrier.
 */
static inline void pagefault_enable(void)
{
        /*
         * make sure to issue those last loads/stores before enabling
         * the pagefault handler again.
         */
        barrier();
        pagefault_disabled_dec();
}

/*
 * Report whether the pagefault handler is currently disabled for this task,
 * i.e. whether its pagefault_disabled counter is non-zero.  While it is,
 * user access methods will not sleep.
 */
static inline bool pagefault_disabled(void)
{
        return !!current->pagefault_disabled;
}

/*
 * The pagefault handler is in general disabled by pagefault_disable() or
 * when in irq context (via in_atomic()).
 *
 * This function should only be used by the fault handlers. Other users should
 * stick to pagefault_disabled().
 *
 * Please NEVER use preempt_disable() to disable the fault handler. With
 * !CONFIG_PREEMPT_COUNT, this is like a NOP, so the handler won't be disabled,
 * and in_atomic() will report different values depending on
 * CONFIG_PREEMPT_COUNT.
 *
 * Evaluates to true when either pagefault_disabled() or in_atomic() is true.
 */
#define faulthandler_disabled() (pagefault_disabled() || in_atomic())

#ifndef CONFIG_ARCH_HAS_SUBPAGE_FAULTS

/**
 * probe_subpage_writeable: probe the user range for write faults at sub-page
 *                            granularity (e.g. arm64 MTE)
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns 0 on success, the number of bytes not probed on fault.
 *
 * It is expected that the caller checked for the write permission of each
 * page in the range either by put_user() or GUP. The architecture port can
 * implement a more efficient get_user() probing if the same sub-page faults
 * are triggered by either a read or a write.
 *
 * This is the stub used when CONFIG_ARCH_HAS_SUBPAGE_FAULTS is not set: it
 * probes nothing and always returns 0.
 */
static inline size_t probe_subpage_writeable(char __user *uaddr, size_t size)
{
        return 0;
}

#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */

#ifndef ARCH_HAS_NOCACHE_UACCESS

/*
 * Fallback for architectures that do not define ARCH_HAS_NOCACHE_UACCESS:
 * simply forwards to __copy_from_user_inatomic() and returns its result.
 */
static inline __must_check unsigned long
__copy_from_user_inatomic_nocache(void *to, const void __user *from,
                                  unsigned long n)
{
        return __copy_from_user_inatomic(to, from, n);
}

#endif                /* ARCH_HAS_NOCACHE_UACCESS */

extern __must_check int check_zeroed_user(const void __user *from, size_t size);

/**
 * copy_struct_from_user: copy a struct from userspace
 * @dst:   Destination address, in kernel space. This buffer must be @ksize
 *         bytes long.
 * @ksize: Size of @dst struct.
 * @src:   Source address, in userspace.
 * @usize: (Alleged) size of @src struct.
 *
 * Copies a struct from userspace to kernel space, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * @ksize is just sizeof(*dst), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, const struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *        return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *        return -EINVAL;
 *
 *      err = copy_struct_from_user(&karg, sizeof(karg), uarg, usize);
 *      if (err)
 *        return err;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the userspace has passed an old struct to a
 *    newer kernel. The rest of the trailing bytes in @dst (@ksize - @usize)
 *    are to be zero-filled.
 *  * If @usize > @ksize, then the userspace has passed a new struct to an
 *    older kernel. The trailing bytes unknown to the kernel (@usize - @ksize)
 *    are checked to ensure they are zeroed, otherwise -E2BIG is returned.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -E2BIG:  (@usize > @ksize) and there are non-zero trailing bytes in @src.
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_from_user(void *dst, size_t ksize, const void __user *src,
                      size_t usize)
{
        /* Bytes both sides know about, and the surplus on the longer side. */
        size_t common = min(ksize, usize);
        size_t surplus = max(ksize, usize) - common;

        /* Refuse a @ksize that exceeds the compiler-known size of @dst. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(dst, 1)))
                return -E2BIG;

        if (usize > ksize) {
                /* Larger user struct: check the bytes the kernel lacks. */
                int zeroed = check_zeroed_user(src + common, surplus);

                if (zeroed < 0)
                        return zeroed;
                if (!zeroed)
                        return -E2BIG;
        } else if (usize < ksize) {
                /* Smaller user struct: zero the fields it did not supply. */
                memset(dst + common, 0, surplus);
        }

        /* Copy the part of the struct that both sides understand. */
        return copy_from_user(dst, src, common) ? -EFAULT : 0;
}

/**
 * copy_struct_to_user: copy a struct to userspace
 * @dst:   Destination address, in userspace. This buffer must be @ksize
 *         bytes long.
 * @usize: (Alleged) size of @dst struct.
 * @src:   Source address, in kernel space.
 * @ksize: Size of @src struct.
 * @ignored_trailing: Set to %true if there was a non-zero byte in @src that
 * userspace cannot see because they are using a smaller struct.
 *
 * Copies a struct from kernel space to userspace, in a way that guarantees
 * backwards-compatibility for struct syscall arguments (as long as future
 * struct extensions are made such that all new fields are *appended* to the
 * old struct, and zeroed-out new fields have the same meaning as the old
 * struct).
 *
 * Some syscalls may wish to make sure that userspace knows about everything in
 * the struct, and if there is a non-zero value that userspace doesn't know
 * about, they want to return an error (such as -EMSGSIZE) or have some other
 * fallback (such as adding a "you're missing some information" flag). If
 * @ignored_trailing is non-%NULL, it will be set to %true if there was a
 * non-zero byte that could not be copied to userspace (ie. was past @usize).
 *
 * While unconditionally returning an error in this case is the simplest
 * solution, for maximum backward compatibility you should try to only return
 * -EMSGSIZE if the user explicitly requested the data that couldn't be copied.
 * Note that structure sizes can change due to header changes and simple
 * recompilations without code changes(!), so if you care about
 * @ignored_trailing you probably want to make sure that any new field data is
 * associated with a flag. Otherwise you might assume that a program knows
 * about data it does not.
 *
 * @ksize is just sizeof(*src), and @usize should've been passed by userspace.
 * The recommended usage is something like the following:
 *
 *   SYSCALL_DEFINE2(foobar, struct foo __user *, uarg, size_t, usize)
 *   {
 *      int err;
 *      bool ignored_trailing;
 *      struct foo karg = {};
 *
 *      if (usize > PAGE_SIZE)
 *                return -E2BIG;
 *      if (usize < FOO_SIZE_VER0)
 *                return -EINVAL;
 *
 *      // ... modify karg somehow ...
 *
 *      err = copy_struct_to_user(uarg, usize, &karg, sizeof(karg),
 *                                  &ignored_trailing);
 *      if (err)
 *                return err;
 *      if (ignored_trailing)
 *                return -EMSGSIZE;
 *
 *      // ...
 *   }
 *
 * There are three cases to consider:
 *  * If @usize == @ksize, then it's copied verbatim.
 *  * If @usize < @ksize, then the kernel is trying to pass userspace a newer
 *    struct than it supports. Thus we only copy the interoperable portions
 *    (@usize) and ignore the rest (but @ignored_trailing is set to %true if
 *    any of the trailing (@ksize - @usize) bytes are non-zero).
 *  * If @usize > @ksize, then the kernel is trying to pass userspace an older
 *    struct than userspace supports. In order to make sure the
 *    unknown-to-the-kernel fields don't contain garbage values, we zero the
 *    trailing (@usize - @ksize) bytes.
 *
 * Returns (in all cases, some data may have been copied):
 *  * -EFAULT: access to userspace failed.
 */
static __always_inline __must_check int
copy_struct_to_user(void __user *dst, size_t usize, const void *src,
                    size_t ksize, bool *ignored_trailing)
{
        size_t size = min(ksize, usize);
        size_t rest = max(ksize, usize) - size;

        /* Double check if ksize is larger than a known object size. */
        if (WARN_ON_ONCE(ksize > __builtin_object_size(src, 1)))
                return -E2BIG;

        /* Deal with trailing bytes. */
        if (usize > ksize) {
                /* Userspace's struct is larger: zero the part we don't know. */
                if (clear_user(dst + size, rest))
                        return -EFAULT;
        }
        /*
         * Kernel bytes are only dropped when userspace's struct is smaller
         * (usize < ksize).  In that case size == usize, so the scanned range
         * [src + usize, src + ksize) lies inside @src.  Testing the opposite
         * condition would scan past the end of @src and would never report
         * the bytes that were really ignored.
         */
        if (ignored_trailing)
                *ignored_trailing = usize < ksize &&
                        memchr_inv(src + size, 0, rest) != NULL;
        /* Copy the interoperable parts of the struct. */
        if (copy_to_user(dst, src, size))
                return -EFAULT;
        return 0;
}

bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size);

long copy_from_kernel_nofault(void *dst, const void *src, size_t size);
long notrace copy_to_kernel_nofault(void *dst, const void *src, size_t size);

long copy_from_user_nofault(void *dst, const void __user *src, size_t size);
long notrace copy_to_user_nofault(void __user *dst, const void *src,
                size_t size);

long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr,
                long count);

long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                long count);
long strnlen_user_nofault(const void __user *unsafe_addr, long count);

#ifndef __get_kernel_nofault
/*
 * Generic fallbacks, used when the architecture does not provide its own
 * __get_kernel_nofault().  They move one object of @type between @src and
 * @dst through __get_user()/__put_user() on a __force-cast pointer and jump
 * to @label if that accessor reports failure.
 *
 * @dst and @src are parenthesized before being cast, so that an expression
 * argument (e.g. "buf + off") is cast as a whole instead of having the cast
 * bind only to its first operand.
 */
#define __get_kernel_nofault(dst, src, type, label)        \
do {                                                        \
        type __user *p = (type __force __user *)(src);        \
        type data;                                        \
        if (__get_user(data, p))                        \
                goto label;                                \
        *(type *)(dst) = data;                                \
} while (0)

#define __put_kernel_nofault(dst, src, type, label)        \
do {                                                        \
        type __user *p = (type __force __user *)(dst);        \
        type data = *(type *)(src);                        \
        if (__put_user(data, p))                        \
                goto label;                                \
} while (0)
#endif

/**
 * get_kernel_nofault(): safely attempt to read from a location
 * @val: read into this variable
 * @ptr: address to read from
 *
 * @ptr is first assigned to a "const typeof(@val) *", so it must be
 * compatible with the type of @val.  sizeof(@val) bytes are then read with
 * copy_from_kernel_nofault(), whose return value is the value of this
 * expression.
 *
 * Returns 0 on success, or -EFAULT.
 */
#define get_kernel_nofault(val, ptr) ({                                \
        const typeof(val) *__gk_ptr = (ptr);                        \
        copy_from_kernel_nofault(&(val), __gk_ptr, sizeof(val));\
})

/*
 * Generic fallbacks for architectures that do not define user_access_begin:
 * the "begin" step is just access_ok(), the "end" step does nothing, and the
 * unsafe_*() accessors wrap the __get_user()/__put_user()/__copy_*_user()
 * primitives, jumping to the error label @e when the wrapped operation
 * returns non-zero.  user_access_save() returns 0 and user_access_restore()
 * ignores its argument.
 */
#ifndef user_access_begin
#define user_access_begin(ptr,len) access_ok(ptr, len)
#define user_access_end() do { } while (0)
#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0)
#define unsafe_get_user(x,p,e) unsafe_op_wrap(__get_user(x,p),e)
#define unsafe_put_user(x,p,e) unsafe_op_wrap(__put_user(x,p),e)
#define unsafe_copy_to_user(d,s,l,e) unsafe_op_wrap(__copy_to_user(d,s,l),e)
#define unsafe_copy_from_user(d,s,l,e) unsafe_op_wrap(__copy_from_user(d,s,l),e)
static inline unsigned long user_access_save(void) { return 0UL; }
static inline void user_access_restore(unsigned long flags) { }
#endif
/* Write-only and read-only variants default to the generic begin/end pair. */
#ifndef user_write_access_begin
#define user_write_access_begin user_access_begin
#define user_write_access_end user_access_end
#endif
#ifndef user_read_access_begin
#define user_read_access_begin user_access_begin
#define user_read_access_end user_access_end
#endif

#ifdef CONFIG_HARDENED_USERCOPY
void __noreturn usercopy_abort(const char *name, const char *detail,
                               bool to_user, unsigned long offset,
                               unsigned long len);
#endif

#endif                /* __LINUX_UACCESS_H__ */



























































































































































































































































































































































    3 














































    3 


    3 
    3 

    3 
    3 














    3 




















    3 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ADDRCONF_H
#define _ADDRCONF_H

/*
 * Address autoconfiguration tunables.
 * NOTE(review): values written as multiples of HZ are presumably in jiffies,
 * and the lifetime values in seconds, as their trailing comments suggest.
 * Confirm against the users of each constant.
 */
#define MAX_RTR_SOLICITATIONS                -1                /* unlimited */
#define RTR_SOLICITATION_INTERVAL        (4*HZ)
#define RTR_SOLICITATION_MAX_INTERVAL        (3600*HZ)        /* 1 hour */

#define MIN_VALID_LIFETIME                (2*3600)        /* 2 hours */

#define TEMP_VALID_LIFETIME                (7*86400)       /* 1 week */
#define TEMP_PREFERRED_LIFETIME                (86400)         /* 24 hours */
#define REGEN_MIN_ADVANCE                (2)             /* 2 seconds */
#define REGEN_MAX_RETRY                        (3)
#define MAX_DESYNC_FACTOR                (600)

#define ADDR_CHECK_FREQUENCY                (120*HZ)

#define IPV6_MAX_ADDRESSES                16

/* HZ / 50 when HZ exceeds 50, otherwise 1. */
#define ADDRCONF_TIMER_FUZZ_MINUS        (HZ > 50 ? HZ / 50 : 1)
#define ADDRCONF_TIMER_FUZZ                (HZ / 4)
#define ADDRCONF_TIMER_FUZZ_MAX                (HZ)

#define ADDRCONF_NOTIFY_PRIORITY        0

#include <linux/in.h>
#include <linux/in6.h>

/*
 * On-the-wire layout of an IPv6 Prefix Information option; the size is
 * pinned to 32 bytes by the static_assert that follows this definition.
 *
 * The flags byte can be accessed either as a whole (@flags) or through the
 * individual bitfields, whose declaration order depends on the bitfield
 * endianness so that both views describe the same byte.  @valid, @prefered
 * and @reserved2 are big-endian 32-bit fields.
 * NOTE(review): @valid / @prefered are presumably the valid and preferred
 * lifetimes of the option. Confirm against the RFC.
 */
struct prefix_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;

        union __packed {
                __u8                flags;
                struct __packed {
#if defined(__BIG_ENDIAN_BITFIELD)
                        __u8        onlink : 1,
                                autoconf : 1,
                                routeraddr : 1,
                                preferpd : 1,
                                reserved : 4;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
                        __u8        reserved : 4,
                                preferpd : 1,
                                routeraddr : 1,
                                autoconf : 1,
                                onlink : 1;
#else
#error "Please fix <asm/byteorder.h>"
#endif
                };
        };
        __be32                        valid;
        __be32                        prefered;
        __be32                        reserved2;

        struct in6_addr                prefix;
};

/* rfc4861 4.6.2: IPv6 PIO is 32 bytes in size */
static_assert(sizeof(struct prefix_info) == 32);

#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <net/if_inet6.h>
#include <net/ipv6.h>

/*
 * Bundles an IPv6 address, the inet6 device it relates to and a netlink
 * extended-ack pointer.
 * NOTE(review): presumably the payload handed to address validators.
 * Confirm against the users of this struct.
 */
struct in6_validator_info {
        struct in6_addr                i6vi_addr;
        struct inet6_dev        *i6vi_dev;
        struct netlink_ext_ack        *extack;
};

/*
 * Parameter block describing one IPv6 interface address: prefix pointer and
 * length, optional peer prefix, protocol, route priority, flags, lifetimes
 * and scope.
 * NOTE(review): units of @preferred_lft / @valid_lft and the meaning of the
 * flag bits are not visible here. Confirm against the callers.
 */
struct ifa6_config {
        const struct in6_addr        *pfx;
        unsigned int                plen;

        u8                        ifa_proto;

        const struct in6_addr        *peer_pfx;

        u32                        rt_priority;
        u32                        ifa_flags;
        u32                        preferred_lft;
        u32                        valid_lft;
        u16                        scope;
};

/* Kind of IPv6 address; used as the @type member of struct inet6_fill_args. */
enum addr_type_t {
        UNICAST_ADDR,
        MULTICAST_ADDR,
        ANYCAST_ADDR,
};

/*
 * Argument bundle: port id, sequence number, event, flags, netns id,
 * interface index, address type and a "force universe scope" switch.
 * NOTE(review): presumably consumed when filling netlink address messages.
 * Confirm against the users of this struct.
 */
struct inet6_fill_args {
        u32 portid;
        u32 seq;
        int event;
        unsigned int flags;
        int netnsid;
        int ifindex;
        enum addr_type_t type;
        bool force_rt_scope_universe;
};

int addrconf_init(void);
void addrconf_cleanup(void);

int addrconf_add_ifaddr(struct net *net, void __user *arg);
int addrconf_del_ifaddr(struct net *net, void __user *arg);
int addrconf_set_dstaddr(struct net *net, void __user *arg);

int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
                  const struct net_device *dev, int strict);
int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                            const struct net_device *dev, bool skip_dev_check,
                            int strict, u32 banned_flags);

#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr);
#endif

int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                          unsigned char nsegs);

bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
                                   const unsigned int prefix_len,
                                   struct net_device *dev);

int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev);

struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                 struct net_device *dev);

struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev, int strict);

int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
                       const struct in6_addr *daddr, unsigned int srcprefs,
                       struct in6_addr *saddr);
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
                    u32 banned_flags);
bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
                          bool match_wildcard);
bool inet_rcv_saddr_any(const struct sock *sk);
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr);

void addrconf_add_linklocal(struct inet6_dev *idev,
                            const struct in6_addr *addr, u32 flags);

int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 const struct prefix_info *pinfo,
                                 struct inet6_dev *in6_dev,
                                 const struct in6_addr *addr, int addr_type,
                                 u32 addr_flags, bool sllao, bool tokenized,
                                 __u32 valid_lft, u32 prefered_lft);

/*
 * Expand the 6-byte address @addr into the 8-byte buffer @eui: bytes 0-2 of
 * @addr, then 0xFF 0xFE, then bytes 3-5 of @addr.
 */
static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr)
{
        /* First three address bytes go in front. */
        memcpy(&eui[0], &addr[0], 3);

        /* Fixed 0xFFFE filler in the middle. */
        eui[3] = 0xFF;
        eui[4] = 0xFE;

        /* Last three address bytes go at the end. */
        memcpy(&eui[5], &addr[3], 3);
}

/*
 * Same expansion as addrconf_addr_eui48_base(), with bit 0x02 of the first
 * byte inverted afterwards.
 */
static inline void addrconf_addr_eui48(u8 *eui, const char *const addr)
{
        addrconf_addr_eui48_base(eui, addr);
        eui[0] = eui[0] ^ 0x02;
}

/*
 * Build an 8-byte interface identifier in @eui from @dev's hardware address.
 *
 * Returns -1 if the device address is not ETH_ALEN bytes long, 0 otherwise.
 */
static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len != ETH_ALEN)
                return -1;

        addrconf_addr_eui48_base(eui, dev->dev_addr);

        /* Ordinary case: no per-instance id, so just invert the "u" bit. */
        if (!dev->dev_id) {
                eui[0] ^= 2;
                return 0;
        }

        /*
         * The zSeries OSA network cards can be shared among various OS
         * instances while having only one MAC address, which leads to
         * duplicate address conflicts with IPv6 when more than one
         * instance uses the same card.
         *
         * The driver for these cards can deliver a unique 16-bit
         * identifier for each instance sharing the card.  It replaces
         * the 0xFFFE filler in the interface identifier, and the "u" bit
         * is not inverted in this case.  Hence the resulting interface
         * identifier has local scope according to RFC2373.
         */
        eui[3] = (dev->dev_id >> 8) & 0xFF;
        eui[4] = dev->dev_id & 0xFF;

        return 0;
}

#define INFINITY_LIFE_TIME 0xFFFFFFFF

/*
 * Normalize a 32-bit lifetime before it is scaled by @unit.
 *
 * INFINITY_LIFE_TIME maps to ~0UL.  Any other value is returned unchanged,
 * unless multiplying it by @unit could exceed LONG_MAX, in which case it is
 * clamped to LONG_MAX / @unit.
 */
static inline unsigned long addrconf_timeout_fixup(u32 timeout,
                                                   unsigned int unit)
{
        unsigned long limit;

        if (timeout == INFINITY_LIFE_TIME)
                return ~0UL;

        /*
         * Largest value that can still be multiplied by @unit without
         * arithmetic overflow.  With a constant, non-zero @unit the clamp
         * below is expected to be optimized away on 64bit archs, where no
         * finite u32 can exceed the limit.
         */
        limit = LONG_MAX / unit;
        if (0xfffffffe > limit && timeout > limit)
                return limit;

        return timeout;
}

/*
 * Return non-zero if @timeout is finite, i.e. anything other than the
 * all-ones value that addrconf_timeout_fixup() returns for
 * INFINITY_LIFE_TIME, and 0 if it is that all-ones value.
 *
 * The comparison is done at full unsigned long width.  Returning ~timeout
 * as an int would drop the upper bits on 64-bit, so a finite value whose
 * low 32 bits are all ones would be reported as infinite.
 */
static inline int addrconf_finite_timeout(unsigned long timeout)
{
        return timeout != ~0UL;
}

/*
 *        IPv6 Address Label subsystem (addrlabel.c)
 */
int ipv6_addr_label_init(void);
void ipv6_addr_label_cleanup(void);
int ipv6_addr_label_rtnl_register(void);
u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr,
                    int type, int ifindex);

/*
 *        multicast prototypes (mcast.c)
 */
/*
 * Check whether @len bytes may be pulled from @skb.
 *
 * Returns false without calling pskb_may_pull() when the transport offset
 * plus ipv6_transport_len() is smaller than @len; otherwise returns the
 * result of pskb_may_pull(@skb, @len).
 */
static inline bool ipv6_mc_may_pull(struct sk_buff *skb,
                                    unsigned int len)
{
        return len <= skb_transport_offset(skb) + ipv6_transport_len(skb) &&
               pskb_may_pull(skb, len);
}

int ipv6_sock_mc_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_mc_close(struct sock *sk);
void ipv6_sock_mc_close(struct sock *sk);
bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
                    const struct in6_addr *src_addr);

int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr);
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr);
int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr);
void ipv6_mc_up(struct inet6_dev *idev);
void ipv6_mc_down(struct inet6_dev *idev);
void ipv6_mc_unmap(struct inet6_dev *idev);
void ipv6_mc_remap(struct inet6_dev *idev);
void ipv6_mc_init_dev(struct inet6_dev *idev);
void ipv6_mc_destroy_dev(struct inet6_dev *idev);
int ipv6_mc_check_mld(struct sk_buff *skb);
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp);

bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
                         const struct in6_addr *src_addr);

void ipv6_mc_dad_complete(struct inet6_dev *idev);

/*
 * identify MLD packets for MLD filter exceptions
 */
static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
{
        struct icmp6hdr *hdr;

        if (nexthdr != IPPROTO_ICMPV6 ||
            !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr)))
                return false;

        hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset);

        switch (hdr->icmp6_type) {
        case ICMPV6_MGM_QUERY:
        case ICMPV6_MGM_REPORT:
        case ICMPV6_MGM_REDUCTION:
        case ICMPV6_MLD2_REPORT:
                return true;
        default:
                break;
        }
        return false;
}

void addrconf_prefix_rcv(struct net_device *dev,
                         u8 *opt, int len, bool sllao);

/*
 *        anycast prototypes (anycast.c)
 */
int ipv6_sock_ac_join(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
int ipv6_sock_ac_drop(struct sock *sk, int ifindex,
                      const struct in6_addr *addr);
void __ipv6_sock_ac_close(struct sock *sk);
void ipv6_sock_ac_close(struct sock *sk);

int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr);
int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr);
void ipv6_ac_destroy_dev(struct inet6_dev *idev);
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
                         const struct in6_addr *addr);
bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
                             const struct in6_addr *addr);
int ipv6_anycast_init(void);
void ipv6_anycast_cleanup(void);

/* Device notifier */
int register_inet6addr_notifier(struct notifier_block *nb);
int unregister_inet6addr_notifier(struct notifier_block *nb);
int inet6addr_notifier_call_chain(unsigned long val, void *v);

int register_inet6addr_validator_notifier(struct notifier_block *nb);
int unregister_inet6addr_validator_notifier(struct notifier_block *nb);
int inet6addr_validator_notifier_call_chain(unsigned long val, void *v);

void inet6_netconf_notify_devconf(struct net *net, int event, int type,
                                  int ifindex, struct ipv6_devconf *devconf);

/**
 * __in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 *
 * @dev is dereferenced unconditionally and therefore must not be NULL.
 *
 * Return: the value of @dev->ip6_ptr as read by rcu_dereference_rtnl().
 */
static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev)
{
        return rcu_dereference_rtnl(dev->ip6_ptr);
}

/*
 * Read @dev->ip6_ptr through rtnl_net_dereference() for the device's own
 * netns (dev_net(@dev)).  No reference is taken on the returned inet6_dev.
 * NOTE(review): presumably requires the per-netns RTNL to be held; confirm
 * against rtnl_net_dereference().
 */
static inline struct inet6_dev *__in6_dev_get_rtnl_net(const struct net_device *dev)
{
        return rtnl_net_dereference(dev_net(dev), dev->ip6_ptr);
}

/**
 * __in6_dev_stats_get - get inet6_dev pointer for stats
 * @dev: network device
 * @skb: skb for original incoming interface if needed
 *
 * Caller must hold rcu_read_lock or RTNL, because this function
 * does not take a reference on the inet6_dev.
 */
static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        if (netif_is_l3_master(dev))
                dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb));
        return __in6_dev_get(dev);
}

/**
 * __in6_dev_get_safely - get inet6_dev pointer from netdevice
 * @dev: network device, may be NULL
 *
 * NULL-tolerant variant of __in6_dev_get(): a NULL @dev yields NULL
 * instead of being dereferenced.
 */
static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev)
{
        if (unlikely(!dev))
                return NULL;

        return rcu_dereference_rtnl(dev->ip6_ptr);
}

/**
 * in6_dev_get - get inet6_dev pointer from netdevice
 * @dev: network device
 *
 * This version can be used in any context, and takes a reference
 * on the inet6_dev. Callers must use in6_dev_put() later to
 * release this reference.
 */
static inline struct inet6_dev *in6_dev_get(const struct net_device *dev)
{
        struct inet6_dev *idev;

        rcu_read_lock();
        idev = rcu_dereference(dev->ip6_ptr);
        if (idev)
                refcount_inc(&idev->refcnt);
        rcu_read_unlock();
        return idev;
}

/*
 * Return the nd_parms of @dev's inet6_dev, or NULL when the device has no
 * inet6_dev.  Uses __in6_dev_get(), so no reference is taken.
 */
static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev)
{
        struct inet6_dev *in6 = __in6_dev_get(dev);

        if (!in6)
                return NULL;

        return in6->nd_parms;
}

void in6_dev_finish_destroy(struct inet6_dev *idev);

/*
 * Drop one reference on @idev; when refcount_dec_and_test() reports the
 * last reference is gone, hand the object to in6_dev_finish_destroy().
 */
static inline void in6_dev_put(struct inet6_dev *idev)
{
        if (!refcount_dec_and_test(&idev->refcnt))
                return;

        in6_dev_finish_destroy(idev);
}

/*
 * If *@pidev is non-NULL, release it with in6_dev_put() and reset the
 * caller's pointer to NULL.  A NULL *@pidev is left untouched.
 */
static inline void in6_dev_put_clear(struct inet6_dev **pidev)
{
        struct inet6_dev *held = *pidev;

        if (!held)
                return;

        in6_dev_put(held);
        *pidev = NULL;
}

/*
 * Drop one reference on @idev with a plain refcount_dec().  Unlike
 * in6_dev_put(), this never calls in6_dev_finish_destroy().
 */
static inline void __in6_dev_put(struct inet6_dev *idev)
{
        refcount_dec(&idev->refcnt);
}

/* Take one additional reference on @idev (refcount_inc on idev->refcnt). */
static inline void in6_dev_hold(struct inet6_dev *idev)
{
        refcount_inc(&idev->refcnt);
}

/*
 * called with rcu_read_lock held
 *
 * Returns true when @dev has no inet6_dev, otherwise whether its
 * cnf.ignore_routes_with_linkdown setting is non-zero.
 */
static inline bool ip6_ignore_linkdown(const struct net_device *dev)
{
        const struct inet6_dev *in6 = __in6_dev_get(dev);

        if (likely(in6))
                return READ_ONCE(in6->cnf.ignore_routes_with_linkdown) != 0;

        return true;
}

void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp);

/*
 * Drop one reference on @ifp; when refcount_dec_and_test() reports the
 * last reference is gone, call inet6_ifa_finish_destroy().
 */
static inline void in6_ifa_put(struct inet6_ifaddr *ifp)
{
        if (!refcount_dec_and_test(&ifp->refcnt))
                return;

        inet6_ifa_finish_destroy(ifp);
}

/*
 * Drop one reference on @ifp with a plain refcount_dec().  Unlike
 * in6_ifa_put(), this never calls inet6_ifa_finish_destroy().
 */
static inline void __in6_ifa_put(struct inet6_ifaddr *ifp)
{
        refcount_dec(&ifp->refcnt);
}

/* Take one additional reference on @ifp (refcount_inc on ifp->refcnt). */
static inline void in6_ifa_hold(struct inet6_ifaddr *ifp)
{
        refcount_inc(&ifp->refcnt);
}

/*
 * Try to take a reference on @ifp using refcount_inc_not_zero().
 * Returns that call's result: true if a reference was taken, false if
 * the count was already zero.
 */
static inline bool in6_ifa_hold_safe(struct inet6_ifaddr *ifp)
{
        return refcount_inc_not_zero(&ifp->refcnt);
}

/*
 *        compute link-local solicited-node multicast address
 *
 *        @solicited is set to the words ff020000:0:1:(ff000000 | last
 *        word of @addr), each in network byte order.
 */

static inline void addrconf_addr_solict_mult(const struct in6_addr *addr,
                                             struct in6_addr *solicited)
{
        ipv6_addr_set(solicited,
                      htonl(0xFF020000),
                      0,
                      htonl(0x00000001),
                      addr->s6_addr32[3] | htonl(0xFF000000));
}

/*
 * True if @addr is exactly ff02::1.  Both variants stay branch-free: the
 * XORs against the expected words are OR-ed together and tested for zero.
 */
static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* Compare the address as two 64-bit halves. */
        const __be64 *half = (__force const __be64 *)addr;

        return ((cpu_to_be64(0xff02000000000000UL) ^ half[0]) |
                (cpu_to_be64(1) ^ half[1])) == 0UL;
#else
        /* Compare the address as four 32-bit words. */
        return ((htonl(0xff020000) ^ addr->s6_addr32[0]) |
                addr->s6_addr32[1] |
                addr->s6_addr32[2] |
                (htonl(0x00000001) ^ addr->s6_addr32[3])) == 0;
#endif
}

/*
 * True if @addr is exactly ff02::2.  Both variants stay branch-free: the
 * XORs against the expected words are OR-ed together and tested for zero.
 */
static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* Compare the address as two 64-bit halves. */
        const __be64 *half = (__force const __be64 *)addr;

        return ((cpu_to_be64(0xff02000000000000UL) ^ half[0]) |
                (cpu_to_be64(2) ^ half[1])) == 0UL;
#else
        /* Compare the address as four 32-bit words. */
        return ((htonl(0xff020000) ^ addr->s6_addr32[0]) |
                addr->s6_addr32[1] |
                addr->s6_addr32[2] |
                (htonl(0x00000002) ^ addr->s6_addr32[3])) == 0;
#endif
}

/*
 * True if the third 32-bit word of @addr is 0x00005EFE or 0x02005EFE
 * (network byte order): bit 0x02000000 is forced on before comparing, so
 * its value in @addr does not matter.
 */
static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr)
{
        return htonl(0x02005EFE) ==
               (htonl(0x02000000) | addr->s6_addr32[2]);
}

/*
 * True if @addr has the form ff02:0:0:0:0:1:ffXX:XXXX, i.e. the first
 * 104 bits match and the trailing 24 bits are ignored.
 */
static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        const __be64 *half = (__force const __be64 *)addr;

        /* The mask drops the low 24 bits of the second half. */
        return ((cpu_to_be64(0xff02000000000000UL) ^ half[0]) |
                (cpu_to_be64(0xffffffffff000000UL) &
                 (cpu_to_be64(0x00000001ff000000UL) ^ half[1]))) == 0UL;
#else
        /* Three full words, then only byte 12 of the last word. */
        return ((htonl(0xff020000) ^ addr->s6_addr32[0]) |
                addr->s6_addr32[1] |
                (htonl(0x00000001) ^ addr->s6_addr32[2]) |
                (addr->s6_addr[12] ^ 0xff)) == 0;
#endif
}

/*
 * True if @addr is exactly ff02::6a.  Both variants stay branch-free: the
 * XORs against the expected words are OR-ed together and tested for zero.
 */
static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr)
{
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
        /* Compare the address as two 64-bit halves. */
        const __be64 *half = (__force const __be64 *)addr;

        return ((cpu_to_be64(0xff02000000000000UL) ^ half[0]) |
                (cpu_to_be64(0x6a) ^ half[1])) == 0UL;
#else
        /* Compare the address as four 32-bit words. */
        return ((htonl(0xff020000) ^ addr->s6_addr32[0]) |
                addr->s6_addr32[1] |
                addr->s6_addr32[2] |
                (htonl(0x0000006a) ^ addr->s6_addr32[3])) == 0;
#endif
}

#ifdef CONFIG_PROC_FS
int if6_proc_init(void);
void if6_proc_exit(void);
#endif

int inet6_fill_ifmcaddr(struct sk_buff *skb,
                        const struct ifmcaddr6 *ifmca,
                        struct inet6_fill_args *args);

int inet6_fill_ifacaddr(struct sk_buff *skb,
                        const struct ifacaddr6 *ifaca,
                        struct inet6_fill_args *args);
#endif








































  157 
    4 

  157 






  166 



    1 




  157 


  157 





  165 
  165 






  166 

















  166 










  157 








  125 
  126 
  126 
  126 
  127 
  127 










  126 
  127 
  126 
  127 
  127 
  126 
  127 
  127 
  126 
  127 
  125 

  127 




  127 
  125 
  127 













  157 




  157 


  157 





  165 
  166 










  164 
  165 












  166 
  166 
  165 
  165 










  165 
  165 
  165 
  166 
  166 
  165 
  164 
  165 
  165 
  166 
  166 

  166 






















  165 
  165 
  165 







  156 













  156 




















  157 




  118 







    8 
    8 


    8 




  150 







   16 
   16 


   16 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012-2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_HYP_SYSREG_SR_H__
#define __ARM64_KVM_HYP_SYSREG_SR_H__

#include <linux/compiler.h>
#include <linux/kvm_host.h>

#include <asm/kprobes.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt);

/*
 * Map a CPU context to a vcpu: use ctxt->__hyp_running_vcpu when it is
 * set, otherwise treat @ctxt as the arch.ctxt member embedded in a
 * struct kvm_vcpu and return the containing vcpu.
 */
static inline struct kvm_vcpu *ctxt_to_vcpu(struct kvm_cpu_context *ctxt)
{
        struct kvm_vcpu *running = ctxt->__hyp_running_vcpu;

        return running ? running
                       : container_of(ctxt, struct kvm_vcpu, arch.ctxt);
}

/*
 * A context is considered a guest context when it is not the host context
 * returned by host_data_ptr(host_ctxt).
 */
static inline bool ctxt_is_guest(struct kvm_cpu_context *ctxt)
{
        return host_data_ptr(host_ctxt) != ctxt;
}

/*
 * Pick the storage slot used for MDSCR_EL1 in @ctxt.
 *
 * For a guest context whose debug registers are owned by the host
 * (kvm_host_owns_debug_regs()), this is vcpu->arch.external_mdscr_el1;
 * in every other case it is the context's own MDSCR_EL1 sys_reg slot.
 */
static inline u64 *ctxt_mdscr_el1(struct kvm_cpu_context *ctxt)
{
        struct kvm_vcpu *vcpu;

        if (!ctxt_is_guest(ctxt))
                return &ctxt_sys_reg(ctxt, MDSCR_EL1);

        vcpu = ctxt_to_vcpu(ctxt);
        if (kvm_host_owns_debug_regs(vcpu))
                return &vcpu->arch.external_mdscr_el1;

        return &ctxt_sys_reg(ctxt, MDSCR_EL1);
}

static inline u64 ctxt_midr_el1(struct kvm_cpu_context *ctxt)
{
        struct kvm *kvm = kern_hyp_va(ctxt_to_vcpu(ctxt)->kvm);

        if (!(ctxt_is_guest(ctxt) &&
              test_bit(KVM_ARCH_FLAG_WRITABLE_IMP_ID_REGS, &kvm->arch.flags)))
                return read_cpuid_id();

        return kvm_read_vm_id_reg(kvm, SYS_MIDR_EL1);
}

static inline void __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
{
        *ctxt_mdscr_el1(ctxt)        = read_sysreg(mdscr_el1);

        // POR_EL0 can affect uaccess, so must be saved/restored early.
        if (ctxt_has_s1poe(ctxt))
                ctxt_sys_reg(ctxt, POR_EL0)        = read_sysreg_s(SYS_POR_EL0);
}

/* Save the EL0 thread ID registers (TPIDR_EL0, TPIDRRO_EL0) into @ctxt. */
static inline void __sysreg_save_user_state(struct kvm_cpu_context *ctxt)
{
        ctxt_sys_reg(ctxt, TPIDR_EL0)        = read_sysreg(tpidr_el0);
        ctxt_sys_reg(ctxt, TPIDRRO_EL0)        = read_sysreg(tpidrro_el0);
}

static inline bool ctxt_has_mte(struct kvm_cpu_context *ctxt)
{
        struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);

        return kvm_has_mte(kern_hyp_va(vcpu->kvm));
}

/*
 * True when the ARM64_HAS_S1PIE cpucap is set and kvm_has_s1pie() holds
 * for the VM owning @ctxt.  The VM is only looked up when the cap is set.
 */
static inline bool ctxt_has_s1pie(struct kvm_cpu_context *ctxt)
{
        if (cpus_have_final_cap(ARM64_HAS_S1PIE)) {
                struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);

                return kvm_has_s1pie(kern_hyp_va(vcpu->kvm));
        }

        return false;
}

/*
 * True when the ARM64_HAS_TCR2 cpucap is set and kvm_has_tcr2() holds
 * for the VM owning @ctxt.  The VM is only looked up when the cap is set.
 */
static inline bool ctxt_has_tcrx(struct kvm_cpu_context *ctxt)
{
        if (cpus_have_final_cap(ARM64_HAS_TCR2)) {
                struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);

                return kvm_has_tcr2(kern_hyp_va(vcpu->kvm));
        }

        return false;
}

/*
 * True when system_supports_poe() and kvm_has_s1poe() holds for the VM
 * owning @ctxt.  The VM is only looked up when the system supports POE.
 */
static inline bool ctxt_has_s1poe(struct kvm_cpu_context *ctxt)
{
        if (system_supports_poe()) {
                struct kvm_vcpu *vcpu = ctxt_to_vcpu(ctxt);

                return kvm_has_s1poe(kern_hyp_va(vcpu->kvm));
        }

        return false;
}

/*
 * Save the EL1 system register state of the CPU into @ctxt.
 *
 * Most registers are saved unconditionally.  TCR2_EL1 is saved only when
 * ctxt_has_tcrx(); PIR_EL1/PIRE0_EL1 and POR_EL1 are nested under that
 * check and additionally require ctxt_has_s1pie() and ctxt_has_s1poe()
 * respectively.  TFSR_EL1/TFSRE0_EL1 are saved only when ctxt_has_mte().
 */
static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
{
        ctxt_sys_reg(ctxt, SCTLR_EL1)        = read_sysreg_el1(SYS_SCTLR);
        ctxt_sys_reg(ctxt, CPACR_EL1)        = read_sysreg_el1(SYS_CPACR);
        ctxt_sys_reg(ctxt, TTBR0_EL1)        = read_sysreg_el1(SYS_TTBR0);
        ctxt_sys_reg(ctxt, TTBR1_EL1)        = read_sysreg_el1(SYS_TTBR1);
        ctxt_sys_reg(ctxt, TCR_EL1)        = read_sysreg_el1(SYS_TCR);
        /* Extended translation control state, gated on TCR2 support. */
        if (ctxt_has_tcrx(ctxt)) {
                ctxt_sys_reg(ctxt, TCR2_EL1)        = read_sysreg_el1(SYS_TCR2);

                if (ctxt_has_s1pie(ctxt)) {
                        ctxt_sys_reg(ctxt, PIR_EL1)        = read_sysreg_el1(SYS_PIR);
                        ctxt_sys_reg(ctxt, PIRE0_EL1)        = read_sysreg_el1(SYS_PIRE0);
                }

                if (ctxt_has_s1poe(ctxt))
                        ctxt_sys_reg(ctxt, POR_EL1)        = read_sysreg_el1(SYS_POR);
        }
        ctxt_sys_reg(ctxt, ESR_EL1)        = read_sysreg_el1(SYS_ESR);
        ctxt_sys_reg(ctxt, AFSR0_EL1)        = read_sysreg_el1(SYS_AFSR0);
        ctxt_sys_reg(ctxt, AFSR1_EL1)        = read_sysreg_el1(SYS_AFSR1);
        ctxt_sys_reg(ctxt, FAR_EL1)        = read_sysreg_el1(SYS_FAR);
        ctxt_sys_reg(ctxt, MAIR_EL1)        = read_sysreg_el1(SYS_MAIR);
        ctxt_sys_reg(ctxt, VBAR_EL1)        = read_sysreg_el1(SYS_VBAR);
        ctxt_sys_reg(ctxt, CONTEXTIDR_EL1) = read_sysreg_el1(SYS_CONTEXTIDR);
        ctxt_sys_reg(ctxt, AMAIR_EL1)        = read_sysreg_el1(SYS_AMAIR);
        ctxt_sys_reg(ctxt, CNTKCTL_EL1)        = read_sysreg_el1(SYS_CNTKCTL);
        ctxt_sys_reg(ctxt, PAR_EL1)        = read_sysreg_par();
        ctxt_sys_reg(ctxt, TPIDR_EL1)        = read_sysreg(tpidr_el1);

        /* Tag fault status registers, only when the VM has MTE. */
        if (ctxt_has_mte(ctxt)) {
                ctxt_sys_reg(ctxt, TFSR_EL1) = read_sysreg_el1(SYS_TFSR);
                ctxt_sys_reg(ctxt, TFSRE0_EL1) = read_sysreg_s(SYS_TFSRE0_EL1);
        }

        ctxt_sys_reg(ctxt, SP_EL1)        = read_sysreg(sp_el1);
        ctxt_sys_reg(ctxt, ELR_EL1)        = read_sysreg_el1(SYS_ELR);
        ctxt_sys_reg(ctxt, SPSR_EL1)        = read_sysreg_el1(SYS_SPSR);
}

static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
{
        ctxt->regs.pc                        = read_sysreg_el2(SYS_ELR);
        /*
         * Guest PSTATE gets saved at guest fixup time in all
         * cases. We still need to handle the nVHE host side here.
         */
        if (!has_vhe() && ctxt->__hyp_running_vcpu)
                ctxt->regs.pstate        = read_sysreg_el2(SYS_SPSR);

        if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
                ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);
}

static inline void __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
{
        write_sysreg(*ctxt_mdscr_el1(ctxt),  mdscr_el1);

        // POR_EL0 can affect uaccess, so must be saved/restored early.
        if (ctxt_has_s1poe(ctxt))
                write_sysreg_s(ctxt_sys_reg(ctxt, POR_EL0),        SYS_POR_EL0);
}

/* Restore the EL0 thread ID registers (TPIDR_EL0, TPIDRRO_EL0) from @ctxt. */
static inline void __sysreg_restore_user_state(struct kvm_cpu_context *ctxt)
{
        write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL0),        tpidr_el0);
        write_sysreg(ctxt_sys_reg(ctxt, TPIDRRO_EL0),        tpidrro_el0);
}

/*
 * Restore the EL1 system register state from @ctxt.
 *
 * @midr and @mpidr are written to VPIDR_EL2 and VMPIDR_EL2 first.
 *
 * SCTLR_EL1/TCR_EL1 handling depends on the configuration:
 *  - VHE, or no ARM64_WORKAROUND_SPECULATIVE_AT: both are written up
 *    front.
 *  - nVHE with the workaround and __hyp_running_vcpu clear: only TCR_EL1
 *    is written early, with EPD0/EPD1 forced on; SCTLR_EL1 is not written
 *    by this function on that path.
 *  - nVHE with the workaround and __hyp_running_vcpu set: nothing is
 *    written early; SCTLR_EL1 then TCR_EL1 are written near the end,
 *    separated by ISBs.
 *
 * Optional registers are gated the same way as on the save side:
 * ctxt_has_tcrx() (with nested s1pie/s1poe checks) and ctxt_has_mte().
 */
static inline void __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt,
                                              u64 midr, u64 mpidr)
{
        write_sysreg(midr,                                vpidr_el2);
        write_sysreg(mpidr,                                vmpidr_el2);

        if (has_vhe() ||
            !cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) {
                write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1),        SYS_SCTLR);
                write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1),        SYS_TCR);
        } else        if (!ctxt->__hyp_running_vcpu) {
                /*
                 * Must only be done for guest registers, hence the context
                 * test. We're coming from the host, so SCTLR.M is already
                 * set. Pairs with nVHE's __activate_traps().
                 */
                write_sysreg_el1((ctxt_sys_reg(ctxt, TCR_EL1) |
                                  TCR_EPD1_MASK | TCR_EPD0_MASK),
                                 SYS_TCR);
                isb();
        }

        write_sysreg_el1(ctxt_sys_reg(ctxt, CPACR_EL1),        SYS_CPACR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR0_EL1),        SYS_TTBR0);
        write_sysreg_el1(ctxt_sys_reg(ctxt, TTBR1_EL1),        SYS_TTBR1);
        /* Extended translation control state, gated on TCR2 support. */
        if (ctxt_has_tcrx(ctxt)) {
                write_sysreg_el1(ctxt_sys_reg(ctxt, TCR2_EL1),        SYS_TCR2);

                if (ctxt_has_s1pie(ctxt)) {
                        write_sysreg_el1(ctxt_sys_reg(ctxt, PIR_EL1),        SYS_PIR);
                        write_sysreg_el1(ctxt_sys_reg(ctxt, PIRE0_EL1),        SYS_PIRE0);
                }

                if (ctxt_has_s1poe(ctxt))
                        write_sysreg_el1(ctxt_sys_reg(ctxt, POR_EL1),        SYS_POR);
        }
        write_sysreg_el1(ctxt_sys_reg(ctxt, ESR_EL1),        SYS_ESR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR0_EL1),        SYS_AFSR0);
        write_sysreg_el1(ctxt_sys_reg(ctxt, AFSR1_EL1),        SYS_AFSR1);
        write_sysreg_el1(ctxt_sys_reg(ctxt, FAR_EL1),        SYS_FAR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, MAIR_EL1),        SYS_MAIR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, VBAR_EL1),        SYS_VBAR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, CONTEXTIDR_EL1), SYS_CONTEXTIDR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, AMAIR_EL1),        SYS_AMAIR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, CNTKCTL_EL1), SYS_CNTKCTL);
        write_sysreg(ctxt_sys_reg(ctxt, PAR_EL1),        par_el1);
        write_sysreg(ctxt_sys_reg(ctxt, TPIDR_EL1),        tpidr_el1);

        /* Tag fault status registers, only when the VM has MTE. */
        if (ctxt_has_mte(ctxt)) {
                write_sysreg_el1(ctxt_sys_reg(ctxt, TFSR_EL1), SYS_TFSR);
                write_sysreg_s(ctxt_sys_reg(ctxt, TFSRE0_EL1), SYS_TFSRE0_EL1);
        }

        if (!has_vhe() &&
            cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT) &&
            ctxt->__hyp_running_vcpu) {
                /*
                 * Must only be done for host registers, hence the context
                 * test. Pairs with nVHE's __deactivate_traps().
                 */
                isb();
                /*
                 * At this stage, and thanks to the above isb(), S2 is
                 * deconfigured and disabled. We can now restore the host's
                 * S1 configuration: SCTLR, and only then TCR.
                 */
                write_sysreg_el1(ctxt_sys_reg(ctxt, SCTLR_EL1),        SYS_SCTLR);
                isb();
                write_sysreg_el1(ctxt_sys_reg(ctxt, TCR_EL1),        SYS_TCR);
        }

        write_sysreg(ctxt_sys_reg(ctxt, SP_EL1),        sp_el1);
        write_sysreg_el1(ctxt_sys_reg(ctxt, ELR_EL1),        SYS_ELR);
        write_sysreg_el1(ctxt_sys_reg(ctxt, SPSR_EL1),        SYS_SPSR);
}

/*
 * Read the VCPU state's PSTATE, but translate (v)EL2 to EL1.
 *
 * Only the mode field (PSR_MODE_MASK | PSR_MODE32_BIT) is rewritten:
 * EL2t becomes EL1t and EL2h becomes EL1h.  Any other mode and all the
 * remaining PSTATE bits are returned unchanged.
 */
static inline u64 to_hw_pstate(const struct kvm_cpu_context *ctxt)
{
        const u64 mode_bits = PSR_MODE_MASK | PSR_MODE32_BIT;
        const u64 pstate = ctxt->regs.pstate;
        u64 mode = pstate & mode_bits;

        if (mode == PSR_MODE_EL2t)
                mode = PSR_MODE_EL1t;
        else if (mode == PSR_MODE_EL2h)
                mode = PSR_MODE_EL1h;

        return (pstate & ~mode_bits) | mode;
}

static inline void __sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
{
        u64 pstate = to_hw_pstate(ctxt);
        u64 mode = pstate & PSR_AA32_MODE_MASK;

        /*
         * Safety check to ensure we're setting the CPU up to enter the guest
         * in a less privileged mode.
         *
         * If we are attempting a return to EL2 or higher in AArch64 state,
         * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
         * we'll take an illegal exception state exception immediately after
         * the ERET to the guest.  Attempts to return to AArch32 Hyp will
         * result in an illegal exception return because EL2's execution state
         * is determined by SCR_EL3.RW.
         */
        if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
                pstate = PSR_MODE_EL2h | PSR_IL_BIT;

        write_sysreg_el2(ctxt->regs.pc,                        SYS_ELR);
        write_sysreg_el2(pstate,                        SYS_SPSR);

        if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
                write_sysreg_s(ctxt_sys_reg(ctxt, DISR_EL1), SYS_VDISR_EL2);
}

static inline void __sysreg32_save_state(struct kvm_vcpu *vcpu)
{
        if (!vcpu_el1_is_32bit(vcpu))
                return;

        vcpu->arch.ctxt.spsr_abt = read_sysreg(spsr_abt);
        vcpu->arch.ctxt.spsr_und = read_sysreg(spsr_und);
        vcpu->arch.ctxt.spsr_irq = read_sysreg(spsr_irq);
        vcpu->arch.ctxt.spsr_fiq = read_sysreg(spsr_fiq);

        __vcpu_sys_reg(vcpu, DACR32_EL2) = read_sysreg(dacr32_el2);
        __vcpu_sys_reg(vcpu, IFSR32_EL2) = read_sysreg(ifsr32_el2);

        if (has_vhe() || kvm_debug_regs_in_use(vcpu))
                __vcpu_sys_reg(vcpu, DBGVCR32_EL2) = read_sysreg(dbgvcr32_el2);
}

static inline void __sysreg32_restore_state(struct kvm_vcpu *vcpu)
{
        if (!vcpu_el1_is_32bit(vcpu))
                return;

        write_sysreg(vcpu->arch.ctxt.spsr_abt, spsr_abt);
        write_sysreg(vcpu->arch.ctxt.spsr_und, spsr_und);
        write_sysreg(vcpu->arch.ctxt.spsr_irq, spsr_irq);
        write_sysreg(vcpu->arch.ctxt.spsr_fiq, spsr_fiq);

        write_sysreg(__vcpu_sys_reg(vcpu, DACR32_EL2), dacr32_el2);
        write_sysreg(__vcpu_sys_reg(vcpu, IFSR32_EL2), ifsr32_el2);

        if (has_vhe() || kvm_debug_regs_in_use(vcpu))
                write_sysreg(__vcpu_sys_reg(vcpu, DBGVCR32_EL2), dbgvcr32_el2);
}

#endif /* __ARM64_KVM_HYP_SYSREG_SR_H__ */







































































































  258 
  259 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>

#include <asm/tlb.h>

#include "internal.h"
#include "swap.h"

/*
 * Maximum number of attempts we make to install guard pages before we give up
 * and return -ERESTARTNOINTR to have userspace try again.
 */
#define MAX_MADVISE_GUARD_RETRIES 3

/*
 * Private state handed to the cold/pageout page-table walker through
 * mm_walk->private.
 */
struct madvise_walk_private {
        /* TLB gather used for the entries the walker makes old. */
        struct mmu_gather *tlb;
        /* true: isolate folios and reclaim them; false: only deactivate them. */
        bool pageout;
};

/*
 * Report whether @behavior has to run with mmap_lock held for writing.
 *
 * Any behaviour which results in changes to the vma->vm_flags needs the
 * lock in write mode. Behaviours which simply traverse vmas only need it
 * for reading; those are the ones listed explicitly below.
 *
 * Returns 1 when the write lock is required, 0 when read mode suffices.
 */
static int madvise_need_mmap_write(int behavior)
{
        /* be safe, default to 1. list exceptions explicitly */
        int need_write = 1;

        switch (behavior) {
        case MADV_REMOVE:
        case MADV_WILLNEED:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        case MADV_COLLAPSE:
        case MADV_GUARD_INSTALL:
        case MADV_GUARD_REMOVE:
                need_write = 0;
                break;
        default:
                break;
        }

        return need_write;
}

#ifdef CONFIG_ANON_VMA_NAME
/*
 * Allocate a reference-counted anon_vma_name holding a copy of @name.
 *
 * Returns the new object with its kref initialised, or NULL if the
 * allocation failed.
 */
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
        /* Length of @name including the NUL terminator that is copied too. */
        size_t len = strlen(name) + 1;
        struct anon_vma_name *new_name;

        new_name = kmalloc(struct_size(new_name, name, len), GFP_KERNEL);
        if (!new_name)
                return NULL;

        kref_init(&new_name->kref);
        memcpy(new_name->name, name, len);

        return new_name;
}

/* Free the anon_vma_name that embeds @kref. */
void anon_vma_name_free(struct kref *kref)
{
        kfree(container_of(kref, struct anon_vma_name, kref));
}

/*
 * Return the name attached to @vma (may be NULL).
 * Asserts that the mmap_lock of the vma's mm is held.
 */
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
        struct mm_struct *mm = vma->vm_mm;

        mmap_assert_locked(mm);

        return vma->anon_name;
}

/*
 * Make @anon_name the name of @vma, or clear the name when @anon_name is
 * NULL. The reference held on the previous name is dropped unless the old
 * and new names compare equal, in which case nothing changes.
 *
 * mmap_lock should be write-locked. Always returns 0.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
                                 struct anon_vma_name *anon_name)
{
        struct anon_vma_name *old_name = anon_vma_name(vma);

        if (anon_name) {
                /* Same name already installed: keep it untouched. */
                if (anon_vma_name_eq(old_name, anon_name))
                        return 0;

                vma->anon_name = anon_vma_name_reuse(anon_name);
        } else {
                vma->anon_name = NULL;
        }

        anon_vma_name_put(old_name);

        return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
/*
 * Variant used when CONFIG_ANON_VMA_NAME is off: naming is not available,
 * so any non-NULL @anon_name is rejected with -EINVAL; NULL returns 0.
 */
static int replace_anon_vma_name(struct vm_area_struct *vma,
                                 struct anon_vma_name *anon_name)
{
        return anon_name ? -EINVAL : 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Apply @new_flags and @anon_name to the [start, end) region of @vma,
 * splitting or merging the vma as necessary. On success *@prev is set to
 * the vma that now covers the region.
 *
 * Must be called with mmap_lock held for writing. The caller should keep
 * @anon_name stable by raising its refcount, even when it belongs to a
 * valid vma, because this function might free that vma.
 *
 * Returns 0 on success or a negative errno.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
                              struct vm_area_struct **prev, unsigned long start,
                              unsigned long end, unsigned long new_flags,
                              struct anon_vma_name *anon_name)
{
        struct mm_struct *mm = vma->vm_mm;
        VMA_ITERATOR(vmi, mm, start);

        /* Flags and name already match: nothing to modify. */
        if (vma->vm_flags == new_flags &&
            anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
                *prev = vma;
                return 0;
        }

        vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
                                    anon_name);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        *prev = vma;

        /* vm_flags is protected by the mmap_lock held in write mode. */
        vma_start_write(vma);
        vm_flags_reset(vma, new_flags);

        /* The name is only replaced for vmas without a file or anon shmem. */
        if (vma->vm_file && !vma_is_anon_shmem(vma))
                return 0;

        return replace_anon_vma_name(vma, anon_name);
}

#ifdef CONFIG_SWAP
/*
 * Page-walk pmd callback used by madvise_willneed() for vmas without a
 * file: issue a swap cache read for every swap entry found in
 * [start, end). @walk->private is the vma being advised.
 *
 * Always returns 0.
 */
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
                unsigned long end, struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->private;
        struct swap_iocb *splug = NULL;
        spinlock_t *ptl;
        pte_t *ptep = NULL;
        unsigned long addr;

        for (addr = start; addr < end; addr += PAGE_SIZE) {
                struct folio *folio;
                swp_entry_t entry;
                pte_t ptent;

                if (ptep) {
                        /* Page table is still mapped: step to the next slot. */
                        ptep++;
                } else {
                        /* (Re)map and lock the page table at @addr. */
                        ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
                        if (!ptep)
                                break;
                }

                ptent = ptep_get(ptep);
                if (!is_swap_pte(ptent))
                        continue;
                entry = pte_to_swp_entry(ptent);
                if (unlikely(non_swap_entry(entry)))
                        continue;

                /*
                 * The lock is released before the read is issued; the page
                 * table gets mapped again on the next iteration.
                 */
                pte_unmap_unlock(ptep, ptl);
                ptep = NULL;

                folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
                                             vma, addr, &splug);
                if (folio)
                        folio_put(folio);
        }

        if (ptep)
                pte_unmap_unlock(ptep, ptl);
        swap_read_unplug(splug);
        cond_resched();

        return 0;
}

/* Walk operations used by madvise_willneed() to swap in a file-less vma. */
static const struct mm_walk_ops swapin_walk_ops = {
        .walk_lock = PGWALK_RDLOCK,
        .pmd_entry = swapin_walk_pmd_entry,
};

/*
 * Issue swap cache reads for the swap entries stored in @mapping for the
 * page indices that correspond to [start, end) of @vma.
 */
static void shmem_swapin_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
        pgoff_t last_index = linear_page_index(vma, end) - 1;
        struct swap_iocb *splug = NULL;
        struct folio *slot;

        rcu_read_lock();
        xas_for_each(&xas, slot, last_index) {
                struct folio *folio;
                unsigned long addr;
                swp_entry_t entry;

                /* Skip anything that is not an xarray value entry. */
                if (!xa_is_value(slot))
                        continue;
                entry = radix_to_swp_entry(slot);
                /* There might be swapin error entries in shmem mapping. */
                if (non_swap_entry(entry))
                        continue;

                addr = vma->vm_start +
                        ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);

                /* Pause the walk and leave the RCU section for the read. */
                xas_pause(&xas);
                rcu_read_unlock();

                folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
                                             vma, addr, &splug);
                if (folio)
                        folio_put(folio);

                rcu_read_lock();
        }
        rcu_read_unlock();
        swap_read_unplug(splug);
}
#endif                /* CONFIG_SWAP */

/*
 * Schedule all required I/O operations for [start, end) of @vma.  Do not
 * wait for completion.
 *
 * *@prev is set to @vma, except on the fadvise path where mmap_lock is
 * dropped and retaken: there it is set to NULL to tell the caller so.
 *
 * Returns 0, or -EBADF for a vma without a file when CONFIG_SWAP is off.
 */
static long madvise_willneed(struct vm_area_struct *vma,
                             struct vm_area_struct **prev,
                             unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct file *file = vma->vm_file;
        loff_t offset;

        *prev = vma;
#ifdef CONFIG_SWAP
        if (!file) {
                walk_page_range(mm, start, end, &swapin_walk_ops, vma);
                /* Push any new pages onto the LRU now */
                lru_add_drain();
                return 0;
        }

        if (shmem_mapping(file->f_mapping)) {
                shmem_swapin_range(vma, start, end, file->f_mapping);
                /* Push any new pages onto the LRU now */
                lru_add_drain();
                return 0;
        }
#else
        if (!file)
                return -EBADF;
#endif

        /* no bad return value, but ignore advice */
        if (IS_DAX(file_inode(file)))
                return 0;

        /*
         * Filesystem's fadvise may need to take various locks.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock. The file offset is computed while the vma is still
         * guaranteed to exist.
         */
        offset = ((loff_t)vma->vm_pgoff << PAGE_SHIFT)
                        + (loff_t)(start - vma->vm_start);
        get_file(file);
        *prev = NULL;        /* tell sys_madvise we drop mmap_lock */
        mmap_read_unlock(mm);
        vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
        fput(file);
        mmap_read_lock(mm);

        return 0;
}

static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
        if (!vma->vm_file)
                return false;
        /*
         * paging out pagecache only for non-anonymous mappings that correspond
         * to the files the calling process could (if tried) open for writing;
         * otherwise we'd be including shared non-exclusive mappings, which
         * opens a side channel.
         */
        return inode_owner_or_capable(&nop_mnt_idmap,
                                      file_inode(vma->vm_file)) ||
               file_permission(vma->vm_file, MAY_WRITE) == 0;
}

/*
 * Thin wrapper around folio_pte_batch() for the madvise walkers: the batch
 * is capped at the number of pages between @addr and @end, and dirty and
 * soft-dirty differences between PTEs are ignored.
 *
 * Returns whatever folio_pte_batch() returns.
 */
static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
                                          struct folio *folio, pte_t *ptep,
                                          pte_t pte, bool *any_young,
                                          bool *any_dirty)
{
        /* Number of page-sized steps left before @end. */
        int remaining = (end - addr) / PAGE_SIZE;

        return folio_pte_batch(folio, addr, ptep, pte, remaining,
                               FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY, NULL,
                               any_young, any_dirty);
}

/*
 * Page-walk pmd callback shared by the cold and pageout advice.
 *
 * For the folios mapped in [addr, end) that pass the exclusivity and LRU
 * checks below, the referenced/young state is cleared and the folio is
 * either deactivated (cold) or isolated from the LRU and handed to
 * reclaim_pages() (pageout). @walk->private points to a
 * struct madvise_walk_private that selects the mode and supplies the
 * mmu_gather.
 *
 * Large folios that are only partially covered by the range are split
 * when possible; if the split fails the folio is left in place.
 *
 * Returns -EINTR if a fatal signal is pending, 0 otherwise.
 */
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        struct madvise_walk_private *private = walk->private;
        struct mmu_gather *tlb = private->tlb;
        bool pageout = private->pageout;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        pte_t *start_pte, *pte, ptent;
        spinlock_t *ptl;
        struct folio *folio = NULL;
        LIST_HEAD(folio_list);
        bool pageout_anon_only_filter;
        unsigned int batch_count = 0;
        int nr;

        if (fatal_signal_pending(current))
                return -EINTR;

        /*
         * For a file vma the caller may not page out page cache for, only
         * anonymous folios are considered.
         */
        pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
                                        !can_do_file_pageout(vma);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        if (pmd_trans_huge(*pmd)) {
                pmd_t orig_pmd;
                unsigned long next = pmd_addr_end(addr, end);

                tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
                ptl = pmd_trans_huge_lock(pmd, vma);
                if (!ptl)
                        return 0;

                orig_pmd = *pmd;
                if (is_huge_zero_pmd(orig_pmd))
                        goto huge_unlock;

                if (unlikely(!pmd_present(orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                        !is_pmd_migration_entry(orig_pmd));
                        goto huge_unlock;
                }

                folio = pmd_folio(orig_pmd);

                /* Do not interfere with other mappings of this folio */
                if (folio_maybe_mapped_shared(folio))
                        goto huge_unlock;

                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        goto huge_unlock;

                /* Range covers only part of the huge pmd: try to split. */
                if (next - addr != HPAGE_PMD_SIZE) {
                        int err;

                        folio_get(folio);
                        spin_unlock(ptl);
                        folio_lock(folio);
                        err = split_folio(folio);
                        folio_unlock(folio);
                        folio_put(folio);
                        if (!err)
                                goto regular_folio;
                        return 0;
                }

                if (!pageout && pmd_young(orig_pmd)) {
                        pmdp_invalidate(vma, addr, pmd);
                        orig_pmd = pmd_mkold(orig_pmd);

                        set_pmd_at(mm, addr, pmd, orig_pmd);
                        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
                }

                folio_clear_referenced(folio);
                folio_test_clear_young(folio);
                if (folio_test_active(folio))
                        folio_set_workingset(folio);
                if (pageout) {
                        if (folio_isolate_lru(folio)) {
                                if (folio_test_unevictable(folio))
                                        folio_putback_lru(folio);
                                else
                                        list_add(&folio->lru, &folio_list);
                        }
                } else
                        folio_deactivate(folio);
huge_unlock:
                spin_unlock(ptl);
                if (pageout)
                        reclaim_pages(&folio_list);
                return 0;
        }

regular_folio:
#endif
        tlb_change_page_size(tlb, PAGE_SIZE);
restart:
        start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!start_pte) {
                /*
                 * When we get here through "goto restart", folios may
                 * already have been isolated onto folio_list. Hand them to
                 * reclaim instead of returning with them still isolated.
                 * On the first pass the list is empty.
                 */
                if (pageout)
                        reclaim_pages(&folio_list);
                return 0;
        }
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
                nr = 1;
                ptent = ptep_get(pte);

                /* Every SWAP_CLUSTER_MAX entries, yield if a resched is due. */
                if (++batch_count == SWAP_CLUSTER_MAX) {
                        batch_count = 0;
                        if (need_resched()) {
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                cond_resched();
                                goto restart;
                        }
                }

                if (pte_none(ptent))
                        continue;

                if (!pte_present(ptent))
                        continue;

                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /*
                 * If we encounter a large folio, only split it if it is not
                 * fully mapped within the range we are operating on. Otherwise
                 * leave it as is so that it can be swapped out whole. If we
                 * fail to split a folio, leave it in place and advance to the
                 * next pte in the range.
                 */
                if (folio_test_large(folio)) {
                        bool any_young;

                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
                                                     ptent, &any_young, NULL);
                        if (any_young)
                                ptent = pte_mkyoung(ptent);

                        if (nr < folio_nr_pages(folio)) {
                                int err;

                                if (folio_maybe_mapped_shared(folio))
                                        continue;
                                if (pageout_anon_only_filter && !folio_test_anon(folio))
                                        continue;
                                if (!folio_trylock(folio))
                                        continue;
                                folio_get(folio);
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                start_pte = NULL;
                                err = split_folio(folio);
                                folio_unlock(folio);
                                folio_put(folio);
                                start_pte = pte =
                                        pte_offset_map_lock(mm, pmd, addr, &ptl);
                                if (!start_pte)
                                        break;
                                /*
                                 * The page table lock was dropped for the
                                 * split, so repeat the batched-flush check
                                 * that is done after the initial lock before
                                 * touching PTEs again.
                                 */
                                flush_tlb_batched_pending(mm);
                                arch_enter_lazy_mmu_mode();
                                /* Split worked: revisit the same pte. */
                                if (!err)
                                        nr = 0;
                                continue;
                        }
                }

                /*
                 * Do not interfere with other mappings of this folio and
                 * non-LRU folio. If we have a large folio at this point, we
                 * know it is fully mapped so if its mapcount is the same as its
                 * number of pages, it must be exclusive.
                 */
                if (!folio_test_lru(folio) ||
                    folio_mapcount(folio) != folio_nr_pages(folio))
                        continue;

                if (pageout_anon_only_filter && !folio_test_anon(folio))
                        continue;

                if (!pageout && pte_young(ptent)) {
                        clear_young_dirty_ptes(vma, addr, pte, nr,
                                               CYDP_CLEAR_YOUNG);
                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }

                /*
                 * We are deactivating a folio for accelerating reclaiming.
                 * VM couldn't reclaim the folio unless we clear PG_young.
                 * As a side effect, it confuses idle-page tracking
                 * because it will miss recent referenced history.
                 */
                folio_clear_referenced(folio);
                folio_test_clear_young(folio);
                if (folio_test_active(folio))
                        folio_set_workingset(folio);
                if (pageout) {
                        if (folio_isolate_lru(folio)) {
                                if (folio_test_unevictable(folio))
                                        folio_putback_lru(folio);
                                else
                                        list_add(&folio->lru, &folio_list);
                        }
                } else
                        folio_deactivate(folio);
        }

        /* start_pte is NULL if the relock after a split failed. */
        if (start_pte) {
                arch_leave_lazy_mmu_mode();
                pte_unmap_unlock(start_pte, ptl);
        }
        if (pageout)
                reclaim_pages(&folio_list);
        cond_resched();

        return 0;
}

/* Walk operations shared by the cold and pageout page-range helpers. */
static const struct mm_walk_ops cold_walk_ops = {
        .walk_lock = PGWALK_RDLOCK,
        .pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = false,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

/*
 * The cold/pageout advice is refused for vmas that have any of VM_LOCKED,
 * VM_PFNMAP or VM_HUGETLB set; returns true when none of them is set.
 */
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
        const unsigned long excluded = VM_LOCKED | VM_PFNMAP | VM_HUGETLB;

        return (vma->vm_flags & excluded) == 0;
}

/*
 * Apply the cold advice to [start_addr, end_addr) of @vma.
 *
 * *@prev is always set to @vma. Returns -EINVAL when can_madv_lru_vma()
 * rejects the vma, 0 otherwise.
 */
static long madvise_cold(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mmu_gather tlb;

        *prev = vma;
        if (can_madv_lru_vma(vma)) {
                lru_add_drain();
                tlb_gather_mmu(&tlb, vma->vm_mm);
                madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
                tlb_finish_mmu(&tlb);
                return 0;
        }

        return -EINVAL;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end)
{
        struct madvise_walk_private walk_private = {
                .pageout = true,
                .tlb = tlb,
        };

        tlb_start_vma(tlb, vma);
        walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
        tlb_end_vma(tlb, vma);
}

/*
 * Apply the pageout advice to [start_addr, end_addr) of @vma.
 *
 * *@prev is always set to @vma. Returns -EINVAL when can_madv_lru_vma()
 * rejects the vma, 0 otherwise (including when the vma is silently
 * skipped by the file-permission check below).
 */
static long madvise_pageout(struct vm_area_struct *vma,
                        struct vm_area_struct **prev,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mmu_gather tlb;

        *prev = vma;
        if (!can_madv_lru_vma(vma))
                return -EINVAL;

        /*
         * If the VMA belongs to a private file mapping, there can be private
         * dirty pages which can be paged out even if this process is neither
         * owner nor write capable of the file. We allow private file mappings
         * further to pageout dirty anon pages. A VM_MAYSHARE file mapping
         * the caller may not page out is skipped without an error.
         */
        if (!vma_is_anonymous(vma)) {
                if (!can_do_file_pageout(vma) &&
                    (vma->vm_flags & VM_MAYSHARE))
                        return 0;
        }

        lru_add_drain();
        tlb_gather_mmu(&tlb, vma->vm_mm);
        madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * Page-walk pmd callback for madvise_free_single_vma().
 *
 * A transparent huge pmd is first offered to madvise_free_huge_pmd(). For
 * the PTEs in [addr, end): swap entries are freed and their PTEs cleared,
 * hwpoison/poisoned entries are cleared, and present folios that pass the
 * checks below get their young/dirty state cleared and are marked
 * lazyfree. @walk->private is the mmu_gather.
 *
 * Large folios that are only partially covered by the range are split
 * when possible; if the split fails the folio is left in place.
 *
 * Always returns 0.
 */
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
                                unsigned long end, struct mm_walk *walk)
{
        const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
        struct mmu_gather *tlb = walk->private;
        struct mm_struct *mm = tlb->mm;
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *start_pte, *pte, ptent;
        struct folio *folio;
        /* Goes negative: counts swap entries removed, applied to MM_SWAPENTS. */
        int nr_swap = 0;
        unsigned long next;
        int nr, max_nr;

        next = pmd_addr_end(addr, end);
        if (pmd_trans_huge(*pmd))
                if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
                        return 0;

        tlb_change_page_size(tlb, PAGE_SIZE);
        start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!start_pte)
                return 0;
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
                nr = 1;
                ptent = ptep_get(pte);

                if (pte_none(ptent))
                        continue;
                /*
                 * If the pte has swp_entry, just clear page table to
                 * prevent swap-in which is more expensive rather than
                 * (page allocation + zeroing).
                 */
                if (!pte_present(ptent)) {
                        swp_entry_t entry;

                        entry = pte_to_swp_entry(ptent);
                        if (!non_swap_entry(entry)) {
                                max_nr = (end - addr) / PAGE_SIZE;
                                nr = swap_pte_batch(pte, max_nr, ptent);
                                nr_swap -= nr;
                                free_swap_and_cache_nr(entry, nr);
                                clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
                        } else if (is_hwpoison_entry(entry) ||
                                   is_poisoned_swp_entry(entry)) {
                                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        }
                        continue;
                }

                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                /*
                 * If we encounter a large folio, only split it if it is not
                 * fully mapped within the range we are operating on. Otherwise
                 * leave it as is so that it can be marked as lazyfree. If we
                 * fail to split a folio, leave it in place and advance to the
                 * next pte in the range.
                 */
                if (folio_test_large(folio)) {
                        bool any_young, any_dirty;

                        nr = madvise_folio_pte_batch(addr, end, folio, pte,
                                                     ptent, &any_young, &any_dirty);

                        if (nr < folio_nr_pages(folio)) {
                                int err;

                                if (folio_maybe_mapped_shared(folio))
                                        continue;
                                if (!folio_trylock(folio))
                                        continue;
                                folio_get(folio);
                                arch_leave_lazy_mmu_mode();
                                pte_unmap_unlock(start_pte, ptl);
                                start_pte = NULL;
                                err = split_folio(folio);
                                folio_unlock(folio);
                                folio_put(folio);
                                pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
                                start_pte = pte;
                                if (!start_pte)
                                        break;
                                /*
                                 * The page table lock was dropped for the
                                 * split, so repeat the batched-flush check
                                 * that is done after the initial lock before
                                 * touching PTEs again.
                                 */
                                flush_tlb_batched_pending(mm);
                                arch_enter_lazy_mmu_mode();
                                /* Split worked: revisit the same pte. */
                                if (!err)
                                        nr = 0;
                                continue;
                        }

                        if (any_young)
                                ptent = pte_mkyoung(ptent);
                        if (any_dirty)
                                ptent = pte_mkdirty(ptent);
                }

                if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
                        if (!folio_trylock(folio))
                                continue;
                        /*
                         * If we have a large folio at this point, we know it is
                         * fully mapped so if its mapcount is the same as its
                         * number of pages, it must be exclusive.
                         */
                        if (folio_mapcount(folio) != folio_nr_pages(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        if (folio_test_swapcache(folio) &&
                            !folio_free_swap(folio)) {
                                folio_unlock(folio);
                                continue;
                        }

                        folio_clear_dirty(folio);
                        folio_unlock(folio);
                }

                if (pte_young(ptent) || pte_dirty(ptent)) {
                        clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
                        tlb_remove_tlb_entries(tlb, pte, nr, addr);
                }
                folio_mark_lazyfree(folio);
        }

        if (nr_swap)
                add_mm_counter(mm, MM_SWAPENTS, nr_swap);
        /* start_pte is NULL if the relock after a split failed. */
        if (start_pte) {
                arch_leave_lazy_mmu_mode();
                pte_unmap_unlock(start_pte, ptl);
        }
        cond_resched();

        return 0;
}

/*
 * Page-walk callbacks for MADV_FREE: madvise_free_pte_range() is hooked in as
 * the pmd_entry handler and the walk is requested with PGWALK_RDLOCK.
 */
static const struct mm_walk_ops madvise_free_walk_ops = {
        .walk_lock = PGWALK_RDLOCK,
        .pmd_entry = madvise_free_pte_range,
};

/*
 * Apply MADV_FREE to the part of @vma overlapping [@start_addr, @end_addr).
 *
 * The requested range is clamped to the VMA and the page tables covering it
 * are walked with madvise_free_walk_ops, bracketed by the MMU notifier and
 * TLB gather calls.
 *
 * Returns 0 on success, or -EINVAL if @vma is not anonymous or the request
 * does not intersect it.
 */
static int madvise_free_single_vma(struct vm_area_struct *vma,
                        unsigned long start_addr, unsigned long end_addr)
{
        struct mmu_notifier_range range;
        struct mmu_gather tlb;
        struct mm_struct *mm = vma->vm_mm;

        /* MADV_FREE works for only anon vma at the moment */
        if (!vma_is_anonymous(vma))
                return -EINVAL;

        /* Clamp the request to the VMA; an empty intersection is an error. */
        range.start = max(vma->vm_start, start_addr);
        range.end = min(vma->vm_end, end_addr);
        if (range.start >= vma->vm_end || range.end <= vma->vm_start)
                return -EINVAL;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                range.start, range.end);

        lru_add_drain();
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);

        mmu_notifier_invalidate_range_start(&range);
        tlb_start_vma(&tlb, vma);
        /* The walk's return value is not propagated to the caller. */
        walk_page_range(mm, range.start, range.end,
                        &madvise_free_walk_ops, &tlb);
        tlb_end_vma(&tlb, vma);
        mmu_notifier_invalidate_range_end(&range);
        tlb_finish_mmu(&tlb);

        return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range_single call sets things up for shrink_active_list to actually
 * free these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do.  This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 *
 * Zaps [@start, @end) of @vma with the reclaim_pt and even_cows zap details
 * set.  Always returns 0.
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
                                        unsigned long start, unsigned long end)
{
        const unsigned long size = end - start;
        struct zap_details details = {
                .even_cows = true,
                .reclaim_pt = true,
        };

        zap_page_range_single(vma, start, size, &details);

        return 0;
}

/*
 * Decide whether @behavior may be applied to @vma starting at @start.
 *
 * For non-hugetlb VMAs the answer depends only on the VMA flags: VM_PFNMAP is
 * always forbidden, and VM_LOCKED is forbidden unless the behavior is
 * MADV_DONTNEED_LOCKED.
 *
 * For hugetlb VMAs only MADV_DONTNEED and MADV_DONTNEED_LOCKED are accepted,
 * @start must be huge-page aligned, and *@end is rounded down to a huge page
 * boundary as a side effect.
 */
static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
                                            unsigned long start,
                                            unsigned long *end,
                                            int behavior)
{
        const bool is_dontneed = behavior == MADV_DONTNEED ||
                                 behavior == MADV_DONTNEED_LOCKED;
        unsigned int forbidden = VM_PFNMAP;

        if (is_vm_hugetlb_page(vma)) {
                if (!is_dontneed)
                        return false;
                if (start & ~huge_page_mask(hstate_vma(vma)))
                        return false;

                /*
                 * Madvise callers expect the length to be rounded up to
                 * PAGE_SIZE boundaries, and may be unaware that this VMA uses
                 * huge pages.  Avoid unexpected data loss by rounding down
                 * the number of huge pages freed.
                 */
                *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));

                return true;
        }

        if (behavior != MADV_DONTNEED_LOCKED)
                forbidden |= VM_LOCKED;

        return !(vma->vm_flags & forbidden);
}

/*
 * Common handler for MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE on a
 * single VMA.
 *
 * *@prev is set to @vma, or to NULL when userfaultfd_remove() reports that
 * mmap_lock was dropped; in that case the lock is re-taken for reading and
 * the VMA is looked up and validated again before doing any work.
 *
 * Returns 0 on success (including an empty range), -EINVAL if the VMA or
 * behavior is not acceptable, -ENOMEM if vma_lookup() finds nothing at @start
 * after re-locking, or whatever the per-behavior helper returns.
 */
static long madvise_dontneed_free(struct vm_area_struct *vma,
                                  struct vm_area_struct **prev,
                                  unsigned long start, unsigned long end,
                                  int behavior)
{
        struct mm_struct *mm = vma->vm_mm;

        *prev = vma;
        if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
                return -EINVAL;

        if (start == end)
                return 0;

        if (!userfaultfd_remove(vma, start, end)) {
                /* mmap_lock has been dropped, prev is stale */
                *prev = NULL;

                mmap_read_lock(mm);
                vma = vma_lookup(mm, start);
                if (!vma)
                        return -ENOMEM;

                /*
                 * Potential end adjustment for hugetlb vma is OK as
                 * the check below keeps end within vma.
                 */
                if (!madvise_dontneed_free_valid_vma(vma, start, &end,
                                                     behavior))
                        return -EINVAL;

                /*
                 * Don't fail if end > vma->vm_end. If the old
                 * vma was split while the mmap_lock was
                 * released the effect of the concurrent
                 * operation may not cause madvise() to
                 * have an undefined result. There may be an
                 * adjacent next vma that we'll walk
                 * next. userfaultfd_remove() will generate an
                 * UFFD_EVENT_REMOVE repetition on the
                 * end-vma->vm_end range, but the manager can
                 * handle a repetition fine.
                 */
                if (vma->vm_end < end)
                        end = vma->vm_end;

                /*
                 * If the memory region between start and end was
                 * originally backed by 4kB pages and then remapped to
                 * be backed by hugepages while mmap_lock was dropped,
                 * the adjustment for hugetlb vma above may have rounded
                 * end down to the start address.
                 */
                if (start == end)
                        return 0;
                VM_WARN_ON(start > end);
        }

        switch (behavior) {
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_single_vma(vma, start, end);
        case MADV_FREE:
                return madvise_free_single_vma(vma, start, end);
        default:
                return -EINVAL;
        }
}

/*
 * Prefault [@start, @end) of @mm, readable for MADV_POPULATE_READ or writable
 * for MADV_POPULATE_WRITE.
 *
 * faultin_page_range() is called repeatedly, advancing @start by the number
 * of pages it reports, until the range is covered.  If it reports that the
 * lock was dropped, mmap_lock is re-taken for reading before continuing or
 * returning.
 *
 * Returns 0 on success.  -EINTR, -EINVAL, -EHWPOISON and -EFAULT are passed
 * through; any other failure is reported as -ENOMEM.
 */
static long madvise_populate(struct mm_struct *mm, unsigned long start,
                unsigned long end, int behavior)
{
        const bool write = behavior == MADV_POPULATE_WRITE;
        int locked = 1;
        long pages;

        while (start < end) {
                /* Populate (prefault) page tables readable/writable. */
                pages = faultin_page_range(mm, start, end, write, &locked);
                if (!locked) {
                        mmap_read_lock(mm);
                        locked = 1;
                }

                if (pages >= 0) {
                        start += pages * PAGE_SIZE;
                        continue;
                }

                switch (pages) {
                case -EINTR:
                case -EINVAL: /* Incompatible mappings / permissions. */
                case -EHWPOISON:
                case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
                        return pages;
                case -ENOMEM: /* No VMA or out of memory. */
                        return -ENOMEM;
                default:
                        pr_warn_once("%s: unhandled return value: %ld\n",
                                     __func__, pages);
                        return -ENOMEM;
                }
        }

        return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end)
{
        loff_t offset;
        int error;
        struct file *f;
        struct mm_struct *mm = vma->vm_mm;

        *prev = NULL;        /* tell sys_madvise we drop mmap_lock */

        if (vma->vm_flags & VM_LOCKED)
                return -EINVAL;

        f = vma->vm_file;

        if (!f || !f->f_mapping || !f->f_mapping->host) {
                        return -EINVAL;
        }

        if (!vma_is_shared_maywrite(vma))
                return -EACCES;

        offset = (loff_t)(start - vma->vm_start)
                        + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

        /*
         * Filesystem's fallocate may need to take i_rwsem.  We need to
         * explicitly grab a reference because the vma (and hence the
         * vma's reference to the file) can go away as soon as we drop
         * mmap_lock.
         */
        get_file(f);
        if (userfaultfd_remove(vma, start, end)) {
                /* mmap_lock was not released by userfaultfd_remove() */
                mmap_read_unlock(mm);
        }
        error = vfs_fallocate(f,
                                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                                offset, end - start);
        fput(f);
        mmap_read_lock(mm);
        return error;
}

/*
 * Return true if guard markers may be manipulated in @vma.
 *
 * VM_SPECIAL and VM_HUGETLB mappings are never eligible; VM_LOCKED mappings
 * are eligible only when @allow_locked is set.
 */
static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
{
        const vm_flags_t always_bad = VM_SPECIAL | VM_HUGETLB;
        /*
         * A user could lock after setting a guard range but that's fine, as
         * they'd not be able to fault in. The issue arises when we try to zap
         * existing locked VMAs. We don't want to do that.
         */
        const vm_flags_t disallowed = allow_locked ? always_bad
                                                   : (always_bad | VM_LOCKED);

        return (vma->vm_flags & disallowed) == 0;
}

/* Return true if @ptent is a PTE marker whose swap entry is a guard entry. */
static bool is_guard_pte_marker(pte_t ptent)
{
        if (!is_pte_marker(ptent))
                return false;

        return is_guard_swp_entry(pte_to_swp_entry(ptent));
}

/*
 * PUD-level callback for guard installation.
 *
 * Returns 1 for a huge or devmap PUD so the walk is aborted and the caller
 * zaps the range; returns 0 otherwise.
 */
static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
                                   unsigned long next, struct mm_walk *walk)
{
        pud_t pudval = pudp_get(pud);

        /* If huge return >0 so we abort the operation + zap. */
        if (pud_trans_huge(pudval) || pud_devmap(pudval))
                return 1;

        return 0;
}

/*
 * PMD-level callback for guard installation.
 *
 * Returns 1 for a huge or devmap PMD so the walk is aborted and the caller
 * zaps the range; returns 0 otherwise.
 */
static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
                                   unsigned long next, struct mm_walk *walk)
{
        pmd_t pmdval = pmdp_get(pmd);

        /* If huge return >0 so we abort the operation + zap. */
        if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
                return 1;

        return 0;
}

/*
 * PTE-level callback for guard installation, invoked on existing entries.
 *
 * walk->private points at an unsigned long page counter.  An entry that is
 * already a guard marker is counted and accepted (returns 0); any other
 * entry returns 1.
 */
static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
                                   unsigned long next, struct mm_walk *walk)
{
        unsigned long *nr_pages = (unsigned long *)walk->private;
        pte_t pteval = ptep_get(pte);

        /* If populated return >0 so we abort the operation + zap. */
        if (!is_guard_pte_marker(pteval))
                return 1;

        /* If there is already a guard page marker, we have nothing to do. */
        *nr_pages += 1;

        return 0;
}

/*
 * install_pte callback for guard installation: writes a guard PTE marker into
 * *@ptep and bumps the page counter that walk->private points at.
 * Always returns 0.
 */
static int guard_install_set_pte(unsigned long addr, unsigned long next,
                                 pte_t *ptep, struct mm_walk *walk)
{
        unsigned long *installed = (unsigned long *)walk->private;

        /* Simply install a PTE marker, this causes segfault on access. */
        *ptep = make_pte_marker(PTE_MARKER_GUARD);
        *installed += 1;

        return 0;
}

/*
 * Page-walk callbacks for MADV_GUARD_INSTALL.  The walk is requested with
 * PGWALK_RDLOCK.
 */
static const struct mm_walk_ops guard_install_walk_ops = {
        .walk_lock = PGWALK_RDLOCK,
        .install_pte = guard_install_set_pte,
        .pte_entry = guard_install_pte_entry,
        .pmd_entry = guard_install_pmd_entry,
        .pud_entry = guard_install_pud_entry,
};

/*
 * Install guard markers over [@start, @end) of @vma (MADV_GUARD_INSTALL).
 *
 * Returns 0 on success, -EINVAL if the VMA is not eligible, an error from
 * anon_vma_prepare() or the page walk, or the result of restart_syscall()
 * once the retry budget is exhausted.
 */
static long madvise_guard_install(struct vm_area_struct *vma,
                                 struct vm_area_struct **prev,
                                 unsigned long start, unsigned long end)
{
        long err;
        int attempt;

        *prev = vma;
        if (!is_valid_guard_vma(vma, /* allow_locked = */false))
                return -EINVAL;

        /*
         * If we install guard markers, then the range is no longer
         * empty from a page table perspective and therefore it's
         * appropriate to have an anon_vma.
         *
         * This ensures that on fork, we copy page tables correctly.
         */
        err = anon_vma_prepare(vma);
        if (err)
                return err;

        /*
         * Optimistically try to install the guard marker pages first. If any
         * non-guard pages are encountered, give up and zap the range before
         * trying again.
         *
         * We try a few times before giving up and releasing back to userland to
         * loop around, releasing locks in the process to avoid contention. This
         * would only happen if there was a great many racing page faults.
         *
         * In most cases we should simply install the guard markers immediately
         * with no zap or looping.
         */
        for (attempt = 0; attempt < MAX_MADVISE_GUARD_RETRIES; attempt++) {
                unsigned long nr_pages = 0;

                /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
                err = walk_page_range_mm(vma->vm_mm, start, end,
                                         &guard_install_walk_ops, &nr_pages);
                if (err > 0) {
                        /*
                         * OK some of the range have non-guard pages mapped,
                         * zap them. This leaves existing guard pages in place.
                         */
                        zap_page_range_single(vma, start, end - start, NULL);
                        continue;
                }

                /* On success every page in the range should be counted. */
                if (err == 0)
                        VM_WARN_ON(nr_pages != PHYS_PFN(end - start));

                return err;
        }

        /*
         * We were unable to install the guard pages due to being raced by page
         * faults. This should not happen ordinarily. We return to userspace and
         * immediately retry, relieving lock contention.
         */
        return restart_syscall();
}

/*
 * PUD-level callback for guard removal.  For a huge or devmap PUD the walk
 * action is set to ACTION_CONTINUE.  Always returns 0.
 */
static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
                                  unsigned long next, struct mm_walk *walk)
{
        pud_t pudval = pudp_get(pud);
        const bool huge = pud_trans_huge(pudval) || pud_devmap(pudval);

        /* If huge, cannot have guard pages present, so no-op - skip. */
        if (huge)
                walk->action = ACTION_CONTINUE;

        return 0;
}

/*
 * PMD-level callback for guard removal.  For a huge or devmap PMD the walk
 * action is set to ACTION_CONTINUE.  Always returns 0.
 */
static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
                                  unsigned long next, struct mm_walk *walk)
{
        pmd_t pmdval = pmdp_get(pmd);
        const bool huge = pmd_trans_huge(pmdval) || pmd_devmap(pmdval);

        /* If huge, cannot have guard pages present, so no-op - skip. */
        if (huge)
                walk->action = ACTION_CONTINUE;

        return 0;
}

/*
 * PTE-level callback for guard removal: clears the entry at @pte if it is a
 * guard marker and leaves every other entry untouched.  Always returns 0.
 */
static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
                                  unsigned long next, struct mm_walk *walk)
{
        pte_t ptent = ptep_get(pte);

        if (!is_guard_pte_marker(ptent))
                return 0;

        /* Simply clear the PTE marker. */
        pte_clear_not_present_full(walk->mm, addr, pte, false);
        update_mmu_cache(walk->vma, addr, pte);

        return 0;
}

/*
 * Page-walk callbacks for MADV_GUARD_REMOVE.  The walk is requested with
 * PGWALK_RDLOCK.
 */
static const struct mm_walk_ops guard_remove_walk_ops = {
        .walk_lock = PGWALK_RDLOCK,
        .pte_entry = guard_remove_pte_entry,
        .pmd_entry = guard_remove_pmd_entry,
        .pud_entry = guard_remove_pud_entry,
};

/*
 * Remove guard markers from [@start, @end) of @vma (MADV_GUARD_REMOVE).
 *
 * *@prev is set to @vma.  Returns -EINVAL if the VMA is not eligible,
 * otherwise the result of the page walk.
 */
static long madvise_guard_remove(struct vm_area_struct *vma,
                                 struct vm_area_struct **prev,
                                 unsigned long start, unsigned long end)
{
        *prev = vma;

        /*
         * We're ok with removing guards in mlock()'d ranges, as this is a
         * non-destructive action.
         */
        if (is_valid_guard_vma(vma, /* allow_locked = */true))
                return walk_page_range(vma->vm_mm, start, end,
                                       &guard_remove_walk_ops, NULL);

        return -EINVAL;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
                                struct vm_area_struct **prev,
                                unsigned long start, unsigned long end,
                                unsigned long behavior)
{
        int error;
        struct anon_vma_name *anon_name;
        unsigned long new_flags = vma->vm_flags;

        if (unlikely(!can_modify_vma_madv(vma, behavior)))
                return -EPERM;

        switch (behavior) {
        case MADV_REMOVE:
                return madvise_remove(vma, prev, start, end);
        case MADV_WILLNEED:
                return madvise_willneed(vma, prev, start, end);
        case MADV_COLD:
                return madvise_cold(vma, prev, start, end);
        case MADV_PAGEOUT:
                return madvise_pageout(vma, prev, start, end);
        case MADV_FREE:
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
                return madvise_dontneed_free(vma, prev, start, end, behavior);
        case MADV_NORMAL:
                new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
                break;
        case MADV_SEQUENTIAL:
                new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
                break;
        case MADV_RANDOM:
                new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
                break;
        case MADV_DONTFORK:
                new_flags |= VM_DONTCOPY;
                break;
        case MADV_DOFORK:
                if (vma->vm_flags & VM_IO)
                        return -EINVAL;
                new_flags &= ~VM_DONTCOPY;
                break;
        case MADV_WIPEONFORK:
                /* MADV_WIPEONFORK is only supported on anonymous memory. */
                if (vma->vm_file || vma->vm_flags & VM_SHARED)
                        return -EINVAL;
                new_flags |= VM_WIPEONFORK;
                break;
        case MADV_KEEPONFORK:
                if (vma->vm_flags & VM_DROPPABLE)
                        return -EINVAL;
                new_flags &= ~VM_WIPEONFORK;
                break;
        case MADV_DONTDUMP:
                new_flags |= VM_DONTDUMP;
                break;
        case MADV_DODUMP:
                if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
                    (vma->vm_flags & VM_DROPPABLE))
                        return -EINVAL;
                new_flags &= ~VM_DONTDUMP;
                break;
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
                error = ksm_madvise(vma, start, end, behavior, &new_flags);
                if (error)
                        goto out;
                break;
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
                error = hugepage_madvise(vma, &new_flags, behavior);
                if (error)
                        goto out;
                break;
        case MADV_COLLAPSE:
                return madvise_collapse(vma, prev, start, end);
        case MADV_GUARD_INSTALL:
                return madvise_guard_install(vma, prev, start, end);
        case MADV_GUARD_REMOVE:
                return madvise_guard_remove(vma, prev, start, end);
        }

        anon_name = anon_vma_name(vma);
        anon_vma_name_get(anon_name);
        error = madvise_update_vma(vma, prev, start, end, new_flags,
                                   anon_name);
        anon_vma_name_put(anon_name);

out:
        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        if (error == -ENOMEM)
                error = -EAGAIN;
        return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 *
 * Walks [@start, @end), pinning one page at a time, and either soft-offlines
 * it (MADV_SOFT_OFFLINE) or injects a memory failure (any other behavior).
 * Requires CAP_SYS_ADMIN.
 *
 * Returns 0 on success, -EPERM without the capability, the
 * get_user_pages_fast() result if it did not pin exactly one page, or the
 * first non-zero result from the offline/failure call (-EOPNOTSUPP from
 * memory_failure() is treated as success).
 */
static int madvise_inject_error(int behavior,
                unsigned long start, unsigned long end)
{
        const int mf_flags = MF_ACTION_REQUIRED | MF_COUNT_INCREASED |
                             MF_SW_SIMULATED;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        while (start < end) {
                unsigned long size;
                unsigned long pfn;
                struct page *page;
                int ret;

                ret = get_user_pages_fast(start, 1, 0, &page);
                if (ret != 1)
                        return ret;
                pfn = page_to_pfn(page);

                /*
                 * When soft offlining hugepages, after migrating the page
                 * we dissolve it, therefore in the second loop "page" will
                 * no longer be a compound page.
                 */
                size = page_size(compound_head(page));

                if (behavior != MADV_SOFT_OFFLINE) {
                        pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = memory_failure(pfn, mf_flags);
                        if (ret == -EOPNOTSUPP)
                                ret = 0;
                } else {
                        pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
                                 pfn, start);
                        ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
                }

                if (ret)
                        return ret;

                start += size;
        }

        return 0;
}

/* Return true if @behavior is MADV_HWPOISON or MADV_SOFT_OFFLINE. */
static bool is_memory_failure(int behavior)
{
        return behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE;
}

#else

/*
 * Stub used when CONFIG_MEMORY_FAILURE is disabled: ignores its arguments
 * and reports success.
 */
static int madvise_inject_error(int behavior,
                unsigned long start, unsigned long end)
{
        (void)behavior;
        (void)start;
        (void)end;

        return 0;
}

/*
 * Stub used when CONFIG_MEMORY_FAILURE is disabled: no behavior is a
 * memory-failure behavior.
 */
static bool is_memory_failure(int behavior)
{
        (void)behavior;

        return false;
}

#endif        /* CONFIG_MEMORY_FAILURE */

/*
 * Return true if @behavior is an madvise behavior supported by this kernel
 * configuration.  KSM, transparent hugepage and memory-failure behaviors are
 * only accepted when the corresponding config option is enabled.
 */
static bool madvise_behavior_valid(int behavior)
{
        switch (behavior) {
        /* Access pattern hints. */
        case MADV_NORMAL:
        case MADV_RANDOM:
        case MADV_SEQUENTIAL:
        case MADV_WILLNEED:
        /* Discarding, reclaiming and populating memory. */
        case MADV_DONTNEED:
        case MADV_DONTNEED_LOCKED:
        case MADV_FREE:
        case MADV_REMOVE:
        case MADV_COLD:
        case MADV_PAGEOUT:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        /* Fork and core dump behavior. */
        case MADV_DONTFORK:
        case MADV_DOFORK:
        case MADV_WIPEONFORK:
        case MADV_KEEPONFORK:
        case MADV_DONTDUMP:
        case MADV_DODUMP:
        /* Guard regions. */
        case MADV_GUARD_INSTALL:
        case MADV_GUARD_REMOVE:
#ifdef CONFIG_KSM
        case MADV_MERGEABLE:
        case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
        case MADV_COLLAPSE:
#endif
#ifdef CONFIG_MEMORY_FAILURE
        case MADV_HWPOISON:
        case MADV_SOFT_OFFLINE:
#endif
                return true;
        default:
                return false;
        }
}

/*
 * Can we invoke process_madvise() on a remote mm for the specified behavior?
 * Only MADV_COLD, MADV_PAGEOUT, MADV_WILLNEED and MADV_COLLAPSE qualify.
 */
static bool process_madvise_remote_valid(int behavior)
{
        return behavior == MADV_COLD ||
               behavior == MADV_PAGEOUT ||
               behavior == MADV_WILLNEED ||
               behavior == MADV_COLLAPSE;
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
                      unsigned long end, unsigned long arg,
                      int (*visit)(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev, unsigned long start,
                                   unsigned long end, unsigned long arg))
{
        struct vm_area_struct *vma;
        struct vm_area_struct *prev;
        unsigned long tmp;
        int unmapped_error = 0;

        /*
         * If the interval [start,end) covers some unmapped address
         * ranges, just ignore them, but return -ENOMEM at the end.
         * - different from the way of handling in mlock etc.
         */
        vma = find_vma_prev(mm, start, &prev);
        if (vma && start > vma->vm_start)
                prev = vma;

        for (;;) {
                int error;

                /* Still start < end. */
                if (!vma)
                        return -ENOMEM;

                /* Here start < (end|vma->vm_end). */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                        if (start >= end)
                                break;
                }

                /* Here vma->vm_start <= start < (end|vma->vm_end) */
                tmp = vma->vm_end;
                if (end < tmp)
                        tmp = end;

                /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
                error = visit(vma, &prev, start, tmp, arg);
                if (error)
                        return error;
                start = tmp;
                if (prev && start < prev->vm_end)
                        start = prev->vm_end;
                if (start >= end)
                        break;
                if (prev)
                        vma = find_vma(mm, prev->vm_end);
                else        /* madvise_remove dropped mmap_lock */
                        vma = find_vma(mm, start);
        }

        return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
/*
 * madvise_walk_vmas() visitor that applies an anonymous VMA name to
 * [@start, @end) of @vma.  @anon_name carries a struct anon_vma_name pointer
 * cast to unsigned long.
 *
 * Returns -EBADF for file-backed mappings other than anonymous shmem;
 * -ENOMEM from the update is reported as -EAGAIN.
 */
static int madvise_vma_anon_name(struct vm_area_struct *vma,
                                 struct vm_area_struct **prev,
                                 unsigned long start, unsigned long end,
                                 unsigned long anon_name)
{
        struct anon_vma_name *name = (struct anon_vma_name *)anon_name;
        int ret;

        /* Only anonymous mappings can be named */
        if (vma->vm_file && !vma_is_anon_shmem(vma))
                return -EBADF;

        ret = madvise_update_vma(vma, prev, start, end, vma->vm_flags, name);

        /*
         * madvise() returns EAGAIN if kernel resources, such as
         * slab, are temporarily unavailable.
         */
        return ret == -ENOMEM ? -EAGAIN : ret;
}

/*
 * Set @anon_name on every VMA in [@start, @start + @len_in), with the length
 * rounded up to a page boundary.
 *
 * Returns -EINVAL if @start is not page aligned, if rounding the length up
 * wrapped it to zero, or if the range wraps around the address space; 0 for
 * an empty range; otherwise the result of madvise_walk_vmas().
 */
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
                          unsigned long len_in, struct anon_vma_name *anon_name)
{
        unsigned long len;
        unsigned long end;

        if ((start & ~PAGE_MASK) != 0)
                return -EINVAL;

        len = (len_in + ~PAGE_MASK) & PAGE_MASK;

        /*
         * A zero rounded length is either a genuinely empty request, or
         * len was rounded up from small -ve to zero.
         */
        if (!len)
                return len_in ? -EINVAL : 0;

        end = start + len;
        if (end < start)
                return -EINVAL;

        return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
                                 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */

/*
 * Take mmap_lock in the mode @behavior requires.
 *
 * Memory-failure behaviors take no lock.  Behaviors for which
 * madvise_need_mmap_write() is true take the write lock killably; all others
 * take the read lock.
 *
 * Returns 0 on success, or -EINTR if the killable write lock was not taken.
 */
static int madvise_lock(struct mm_struct *mm, int behavior)
{
        if (is_memory_failure(behavior))
                return 0;

        if (!madvise_need_mmap_write(behavior)) {
                mmap_read_lock(mm);
                return 0;
        }

        return mmap_write_lock_killable(mm) ? -EINTR : 0;
}

/*
 * Release the lock taken by madvise_lock() for the same @behavior: nothing
 * for memory-failure behaviors, the write lock when
 * madvise_need_mmap_write() is true, the read lock otherwise.
 */
static void madvise_unlock(struct mm_struct *mm, int behavior)
{
        if (is_memory_failure(behavior))
                return;

        if (!madvise_need_mmap_write(behavior)) {
                mmap_read_unlock(mm);
                return;
        }

        mmap_write_unlock(mm);
}

/*
 * Validate the arguments of an madvise request.
 *
 * Returns false if @behavior is not supported, @start is not page aligned,
 * page-aligning @len_in wrapped it to zero, or the resulting range wraps
 * around the address space; true otherwise.
 */
static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior)
{
        const size_t len = PAGE_ALIGN(len_in);

        if (!madvise_behavior_valid(behavior) || !PAGE_ALIGNED(start))
                return false;

        /* Check to see whether len was rounded up from small -ve to zero */
        if (len == 0 && len_in != 0)
                return false;

        /* The end of the range must not wrap below its start. */
        return start + len >= start;
}

/*
 * madvise_should_skip() - Return if the request is invalid or nothing.
 * @start:        Start address of madvise-requested address range.
 * @len_in:        Length of madvise-requested address range.
 * @behavior:        Requested madvise behavior.
 * @err:        Pointer to store an error code from the check.
 *
 * If the specified behaviour is invalid or nothing would occur, we skip the
 * operation.  This function returns true in those cases, otherwise false.
 * When it returns true, @err is set to -EINVAL for an invalid request or to
 * 0 for an empty range; when it returns false, @err is left untouched.
 */
static bool madvise_should_skip(unsigned long start, size_t len_in,
                int behavior, int *err)
{
        bool skip = true;

        if (!is_valid_madvise(start, len_in, behavior))
                *err = -EINVAL;
        else if (start + PAGE_ALIGN(len_in) == start)
                *err = 0;
        else
                skip = false;

        return skip;
}

/* Return true if @behavior is MADV_POPULATE_READ or MADV_POPULATE_WRITE. */
static bool is_madvise_populate(int behavior)
{
        return behavior == MADV_POPULATE_READ ||
               behavior == MADV_POPULATE_WRITE;
}

/*
 * Carry out one madvise request on @mm for [@start, @start + @len_in).
 *
 * Memory-failure behaviors are handed straight to madvise_inject_error()
 * with the raw, unaligned range.  Every other behavior operates on the
 * untagged start address and a page-aligned length, inside a block plug:
 * populate behaviors go through madvise_populate(), the rest are applied
 * per VMA via madvise_walk_vmas().
 *
 * Returns the result of whichever helper handled the request.
 */
static int madvise_do_behavior(struct mm_struct *mm,
                unsigned long start, size_t len_in, int behavior)
{
        struct blk_plug plug;
        unsigned long end;
        int ret;

        if (is_memory_failure(behavior))
                return madvise_inject_error(behavior, start, start + len_in);

        start = untagged_addr_remote(mm, start);
        end = start + PAGE_ALIGN(len_in);

        blk_start_plug(&plug);
        if (!is_madvise_populate(behavior))
                ret = madvise_walk_vmas(mm, start, end, behavior,
                                        madvise_vma_behavior);
        else
                ret = madvise_populate(mm, start, end, behavior);
        blk_finish_plug(&plug);

        return ret;
}

/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *                results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *                on any access, since it is unlikely that the appli-
 *                cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *                once, so they can be aggressively read ahead, and
 *                can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *                some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *                so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *                where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *                pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *                typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *              range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *                were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *                this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *                huge pages in the future. Existing pages might be coalesced and
 *                new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *                transparent huge pages so the existing pages will not be
 *                coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *                from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *                deactivate pages in this range so that they can be reclaimed
 *                easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *                page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *                triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *                triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *                "behavior" is not a valid value, or application
 *                is attempting to release locked or shared pages,
 *                or the specified address range includes file, Huge TLB,
 *                MAP_SHARED or VM_PFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *                mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 *  -EPERM  - memory is sealed.
 */
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
        int ret;

        /* Invalid and zero-length requests are answered without locking. */
        if (madvise_should_skip(start, len_in, behavior, &ret))
                return ret;

        ret = madvise_lock(mm, behavior);
        if (!ret) {
                ret = madvise_do_behavior(mm, start, len_in, behavior);
                madvise_unlock(mm, behavior);
        }

        return ret;
}

/* madvise(2) entry point: apply @behavior to the calling task's own mm. */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
        struct mm_struct *mm = current->mm;

        return do_madvise(mm, start, len_in, behavior);
}

/*
 * Perform an madvise operation over a vector of addresses and lengths.
 *
 * The lock for @behavior is taken once and held across all segments of @iter.
 * Each segment is validated with madvise_should_skip() and, if not skipped,
 * applied with madvise_do_behavior().  Processing stops at the first segment
 * that yields a negative result.
 *
 * Returns the number of bytes of @iter that were consumed if that is
 * non-zero; otherwise the status of the last step (0 or a negative errno).
 */
static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
                              int behavior)
{
        ssize_t ret = 0;
        size_t total_len;

        total_len = iov_iter_count(iter);

        ret = madvise_lock(mm, behavior);
        if (ret)
                return ret;

        while (iov_iter_count(iter)) {
                unsigned long start = (unsigned long)iter_iov_addr(iter);
                size_t len_in = iter_iov_len(iter);
                int error;

                if (madvise_should_skip(start, len_in, behavior, &error))
                        ret = error;
                else
                        ret = madvise_do_behavior(mm, start, len_in, behavior);
                /*
                 * An madvise operation is attempting to restart the syscall,
                 * but we cannot proceed as it would not be correct to repeat
                 * the operation in aggregate, and would be surprising to the
                 * user.
                 *
                 * We drop and reacquire locks so it is safe to just loop and
                 * try again. We check for fatal signals in case we need exit
                 * early anyway.
                 */
                if (ret == -ERESTARTNOINTR) {
                        if (fatal_signal_pending(current)) {
                                ret = -EINTR;
                                break;
                        }

                        /* Drop and reacquire lock to unwind race. */
                        madvise_unlock(mm, behavior);
                        ret = madvise_lock(mm, behavior);
                        /*
                         * Reacquisition can fail.  In that case the lock is
                         * not held, so we must neither keep operating on the
                         * mm nor run the unlock below.
                         */
                        if (ret)
                                goto out;
                        continue;
                }
                if (ret < 0)
                        break;
                iov_iter_advance(iter, iter_iov_len(iter));
        }
        madvise_unlock(mm, behavior);

out:
        /* Report partial progress in preference to the final status. */
        ret = (total_len - iov_iter_count(iter)) ? : ret;

        return ret;
}

/*
 * process_madvise(2) entry point: apply @behavior to every range described
 * by the user iovec array @vec (@vlen entries) in the address space of the
 * process referred to by @pidfd.  @flags must be zero.
 *
 * Returns the result of vector_madvise(), or a negative errno if setup or a
 * permission check fails.
 */
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
                size_t, vlen, int, behavior, unsigned int, flags)
{
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned int f_flags;
        ssize_t ret;

        /* No flag bits are accepted. */
        if (flags != 0)
                return -EINVAL;

        ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
        if (ret < 0)
                return ret;

        task = pidfd_get_task(pidfd, &f_flags);
        if (IS_ERR(task)) {
                ret = PTR_ERR(task);
                goto free_iov;
        }

        /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
        mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
        if (IS_ERR(mm)) {
                ret = PTR_ERR(mm);
                goto release_task;
        }

        /* The extra checks below only apply to a remote address space. */
        if (mm != current->mm) {
                if (!process_madvise_remote_valid(behavior)) {
                        ret = -EINVAL;
                        goto release_mm;
                }

                /*
                 * Require CAP_SYS_NICE for influencing process performance.
                 * Note that only non-destructive hints are currently
                 * supported for remote processes.
                 */
                if (!capable(CAP_SYS_NICE)) {
                        ret = -EPERM;
                        goto release_mm;
                }
        }

        ret = vector_madvise(mm, &iter, behavior);

release_mm:
        mmput(mm);
release_task:
        put_task_struct(task);
free_iov:
        kfree(iov);
        return ret;
}





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 
    3 









    3 

























































































































































































































































































































































































































































































































































































    3 



    3 















    3 




    3 

















    3 














































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <rdma/rdma_counter.h>

#include "core_priv.h"
#include "restrack.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");

/*
 * Workqueue handles owned by this file.  ib_comp_wq, ib_comp_unbound_wq and
 * ib_wq have external linkage; of those only ib_wq is exported to modules
 * (GPL only).  ib_unreg_wq is private to this file.
 * NOTE(review): the intended use of each queue is not visible here; the
 * names suggest completion handling and unregistration - confirm at the
 * allocation sites.
 */
struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);
static struct workqueue_struct *ib_unreg_wq;

/*
 * Each of the three rwsem locks (devices, clients, client_data) protects the
 * xarray of the same name. Specifically it allows the caller to assert that
 * the MARK will/will not be changing under the lock, and for devices and
 * clients, that the value in the xarray is still a valid pointer. Change of
 * the MARK is linked to the object state, so holding the lock and testing the
 * MARK also asserts that the contained object is in a certain state.
 *
 * This is used to build a two stage register/unregister flow where objects
 * can continue to be in the xarray even though they are still in progress to
 * register/unregister.
 *
 * The xarray itself provides additional locking, and restartable iteration,
 * which is also relied on.
 *
 * Locks should not be nested, with the exception of client_data, which is
 * allowed to nest under the read side of the other two locks.
 *
 * The devices_rwsem also protects the device name list, any change or
 * assignment of device name must also hold the write side to guarantee unique
 * names.
 */

/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
/* rwsem paired with the 'devices' xarray. */
static DECLARE_RWSEM(devices_rwsem);
/* xarray mark used on 'devices' entries. */
#define DEVICE_REGISTERED XA_MARK_1

/* NOTE(review): presumably the largest ID in use in 'clients' - confirm. */
static u32 highest_client_id;
/* xarray mark used on 'clients' entries. */
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
/* rwsem paired with the 'clients' xarray. */
static DECLARE_RWSEM(clients_rwsem);

/*
 * Drop one reference on @client->uses.  When the count reaches zero, signal
 * the @client->uses_zero completion.
 */
static void ib_client_put(struct ib_client *client)
{
        if (!refcount_dec_and_test(&client->uses))
                return;

        complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

/*
 * True when rdma devices are shared among all net namespaces (the default).
 * Exposed as the "netns_mode" module parameter with 0444 permissions; it is
 * also updated at runtime by rdma_compatdev_set().
 */
bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
                 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *                             from a specified net namespace or not.
 * @dev:        Pointer to rdma device which needs to be checked
 * @net:        Pointer to net namesapce for which access to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, rdma device net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
        return (ib_devices_shared_netns ||
                net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xa_for_each style iteration skips NULL values stored in allocating
 * arrays. This helper is the building block of an iterator that reports
 * them as well: it finds the first entry at or after *indexp carrying the
 * mark @filter and stores its index back into *indexp.
 *
 * Returns the entry, NULL when the slot holds a stored NULL (a zero entry),
 * or XA_ERROR(-ENOENT) when nothing further is marked. Simplified to only
 * work on simple xarrays.
 */
static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
                             xa_mark_t filter)
{
        XA_STATE(xas, xa, *indexp);
        void *found;

        rcu_read_lock();
        for (;;) {
                found = xas_find_marked(&xas, ULONG_MAX, filter);
                /* A zero entry is a hit; otherwise loop only on retry */
                if (xa_is_zero(found) || !xas_retry(&xas, found))
                        break;
        }
        rcu_read_unlock();

        if (!found)
                return XA_ERROR(-ENOENT);

        *indexp = xas.xa_index;
        return xa_is_zero(found) ? NULL : found;
}
/*
 * Iterate over every entry of @xa that carries the mark @filter, including
 * slots whose stored value is NULL. @index must be an unsigned long lvalue
 * (its address is passed to xan_find_marked()); @entry receives the value.
 * The loop ends when xan_find_marked() returns an error entry.
 */
#define xan_for_each_marked(xa, index, entry, filter)                          \
        for (index = 0, entry = xan_find_marked(xa, &(index), filter);         \
             !xa_is_err(entry);                                                \
             (index)++, entry = xan_find_marked(xa, &(index), filter))

/* RCU hash table mapping netdevice pointers to struct ib_port_data */
static DEFINE_SPINLOCK(ndev_hash_lock);
static DECLARE_HASHTABLE(ndev_hash, 5);

static void free_netdevs(struct ib_device *ib_dev);
static void ib_unregister_work(struct work_struct *work);
static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
                              void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
/* Work item running ib_policy_change_task(); queued by ib_security_change() */
static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);

/*
 * Common backend of the ibdev_<level>() helpers.
 *
 * @level is a KERN_<LEVEL> string; its second character is the level digit.
 * With a parent device the message goes through dev_printk_emit() on the
 * parent, prefixed by the parent's driver string, the parent's name and the
 * ib device name. Without a parent, or without a device at all, it falls
 * back to plain printk().
 */
static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
                           struct va_format *vaf)
{
        struct device *parent;

        if (!ibdev) {
                printk("%s(NULL ib_device): %pV", level, vaf);
                return;
        }

        parent = ibdev->dev.parent;
        if (!parent) {
                printk("%s%s: %pV",
                       level, dev_name(&ibdev->dev), vaf);
                return;
        }

        dev_printk_emit(level[1] - '0',
                        parent,
                        "%s %s %s: %pV",
                        dev_driver_string(parent),
                        dev_name(parent),
                        dev_name(&ibdev->dev),
                        vaf);
}

/*
 * Generate an exported printf-style logging function named @func. The
 * generated function packs its format and variadic arguments into a
 * struct va_format and passes them, together with the level string @level,
 * to __ibdev_printk().
 */
#define define_ibdev_printk_level(func, level)                  \
void func(const struct ib_device *ibdev, const char *fmt, ...)  \
{                                                               \
        struct va_format vaf;                                   \
        va_list args;                                           \
                                                                \
        va_start(args, fmt);                                    \
                                                                \
        vaf.fmt = fmt;                                          \
        vaf.va = &args;                                         \
                                                                \
        __ibdev_printk(level, ibdev, &vaf);                     \
                                                                \
        va_end(args);                                           \
}                                                               \
EXPORT_SYMBOL(func);

/* One exported helper per log level */
define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
define_ibdev_printk_level(ibdev_err, KERN_ERR);
define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
define_ibdev_printk_level(ibdev_info, KERN_INFO);

/*
 * Notifier block whose callback is ib_security_change().
 * NOTE(review): registration happens outside this part of the file.
 */
static struct notifier_block ibdev_lsm_nb = {
        .notifier_call = ib_security_change,
};

static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
                                 struct net *net);

/*
 * Allocation wrapper for the per-port data array. ib_device.port_data points
 * at @pdata; the enclosing structure is recovered with container_of() when
 * the array is freed through kfree_rcu().
 */
struct ib_port_data_rcu {
        /* Used by kfree_rcu() in ib_device_release() */
        struct rcu_head rcu_head;
        /* Per-port entries, indexed directly by port number */
        struct ib_port_data pdata[];
};

/*
 * Decide whether @device implements the full set of verbs required from a
 * kernel verbs provider and record the answer in device->kverbs_provider.
 *
 * Each table entry names one function pointer of struct ib_device_ops by its
 * byte offset; a single NULL pointer is enough to clear the flag.
 */
static void ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
        static const struct {
                size_t offset;
                char  *name;
        } mandatory_table[] = {
                IB_MANDATORY_FUNC(query_device),
                IB_MANDATORY_FUNC(query_port),
                IB_MANDATORY_FUNC(alloc_pd),
                IB_MANDATORY_FUNC(dealloc_pd),
                IB_MANDATORY_FUNC(create_qp),
                IB_MANDATORY_FUNC(modify_qp),
                IB_MANDATORY_FUNC(destroy_qp),
                IB_MANDATORY_FUNC(post_send),
                IB_MANDATORY_FUNC(post_recv),
                IB_MANDATORY_FUNC(create_cq),
                IB_MANDATORY_FUNC(destroy_cq),
                IB_MANDATORY_FUNC(poll_cq),
                IB_MANDATORY_FUNC(req_notify_cq),
                IB_MANDATORY_FUNC(get_dma_mr),
                IB_MANDATORY_FUNC(reg_user_mr),
                IB_MANDATORY_FUNC(dereg_mr),
                IB_MANDATORY_FUNC(get_port_immutable)
        };
        const char *ops_base = (const char *)&device->ops;
        size_t idx;

        device->kverbs_provider = true;
        for (idx = 0; idx < ARRAY_SIZE(mandatory_table); idx++) {
                void *const *slot =
                        (void *const *)(ops_base + mandatory_table[idx].offset);

                if (*slot)
                        continue;

                /* One missing op is enough; no need to look further */
                device->kverbs_provider = false;
                return;
        }
}

/*
 * Look up the device stored at @index and take a reference on it.
 *
 * Returns NULL when no device is stored there, when the device is not
 * accessible from @net, or when ib_device_try_get() fails. On success the
 * caller must release the reference with ib_device_put().
 */
struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
{
        struct ib_device *found;

        down_read(&devices_rwsem);
        found = xa_load(&devices, index);
        if (found && (!rdma_dev_access_netns(found, net) ||
                      !ib_device_try_get(found)))
                found = NULL;
        up_read(&devices_rwsem);

        return found;
}

/**
 * ib_device_put - Release IB device reference
 * @device: device whose reference is to be released
 *
 * Drops one reference on the IB device. When the last reference goes away
 * the device's unreg_completion is signalled, which allows the device to be
 * unregistered and eventually freed.
 */
void ib_device_put(struct ib_device *device)
{
        if (!refcount_dec_and_test(&device->refcount))
                return;

        complete(&device->unreg_completion);
}
EXPORT_SYMBOL(ib_device_put);

static struct ib_device *__ib_device_get_by_name(const char *name)
{
        struct ib_device *device;
        unsigned long index;

        xa_for_each (&devices, index, device)
                if (!strcmp(name, dev_name(&device->dev)))
                        return device;

        return NULL;
}

/**
 * ib_device_get_by_name - Find an IB device by name
 * @name: The name to look for
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device by its name. The caller must call
 * ib_device_put() on the returned pointer.
 */
struct ib_device *ib_device_get_by_name(const char *name,
                                        enum rdma_driver_id driver_id)
{
        struct ib_device *device;

        down_read(&devices_rwsem);
        device = __ib_device_get_by_name(name);
        if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
            device->ops.driver_id != driver_id)
                device = NULL;

        if (device) {
                if (!ib_device_try_get(device))
                        device = NULL;
        }
        up_read(&devices_rwsem);
        return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);

/*
 * Give every compat device of @device the current name of @device.
 *
 * Stops at the first device_rename() failure, logs a warning and returns
 * that error; compat devices renamed before the failure keep the new name.
 * Returns 0 when all renames succeed or there are no compat devices.
 */
static int rename_compat_devs(struct ib_device *device)
{
        struct ib_core_device *compat;
        unsigned long idx;
        int err = 0;

        mutex_lock(&device->compat_devs_mutex);
        xa_for_each (&device->compat_devs, idx, compat) {
                err = device_rename(&compat->dev, dev_name(&device->dev));
                if (!err)
                        continue;

                dev_warn(&compat->dev,
                         "Fail to rename compatdev to new name %s\n",
                         dev_name(&device->dev));
                break;
        }
        mutex_unlock(&device->compat_devs_mutex);

        return err;
}

/*
 * Rename @ibdev to @name and tell interested parties about it.
 *
 * Returns 0 when the device already has that name or the rename succeeded,
 * -EEXIST when another device in the devices xarray already uses @name, or
 * the error returned by device_rename().
 */
int ib_device_rename(struct ib_device *ibdev, const char *name)
{
        unsigned long index;
        void *client_data;
        int ret;

        /* Name changes need the write side so that names stay unique */
        down_write(&devices_rwsem);
        if (!strcmp(name, dev_name(&ibdev->dev))) {
                up_write(&devices_rwsem);
                return 0;
        }

        if (__ib_device_get_by_name(name)) {
                up_write(&devices_rwsem);
                return -EEXIST;
        }

        ret = device_rename(&ibdev->dev, name);
        if (ret) {
                up_write(&devices_rwsem);
                return ret;
        }

        strscpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
        /*
         * NOTE(review): the result of rename_compat_devs() is stored but not
         * propagated; the function returns 0 below even if a compat device
         * could not be renamed. Confirm this is intended.
         */
        ret = rename_compat_devs(ibdev);

        /*
         * The name is final now. Keep only the read side while calling the
         * clients' rename callbacks; client_data_rwsem nests under it.
         */
        downgrade_write(&devices_rwsem);
        down_read(&ibdev->client_data_rwsem);
        xan_for_each_marked(&ibdev->client_data, index, client_data,
                            CLIENT_DATA_REGISTERED) {
                /* client_data is indexed by client_id, as is clients */
                struct ib_client *client = xa_load(&clients, index);

                if (!client || !client->rename)
                        continue;

                client->rename(ibdev, client_data);
        }
        up_read(&ibdev->client_data_rwsem);
        rdma_nl_notify_event(ibdev, 0, RDMA_RENAME_EVENT);
        up_read(&devices_rwsem);
        return 0;
}

/*
 * Store @use_dim in ibdev->use_cq_dim. Only 0 and 1 are accepted; any other
 * value is rejected with -EINVAL and the device is left untouched.
 */
int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
{
        switch (use_dim) {
        case 0:
        case 1:
                ibdev->use_cq_dim = use_dim;
                return 0;
        default:
                return -EINVAL;
        }
}

/*
 * Pick the lowest unused number for the name pattern @name and assign the
 * resulting name to @ibdev.
 *
 * @name is used as the format string for sscanf(), snprintf() and
 * dev_set_name() with a single int argument. The caller must hold
 * devices_rwsem for writing.
 *
 * Returns the result of dev_set_name(), or a negative value when an ID
 * could not be allocated.
 */
static int alloc_name(struct ib_device *ibdev, const char *name)
{
        struct ib_device *device;
        unsigned long index;
        struct ida inuse;
        int rc;
        int i;

        lockdep_assert_held_write(&devices_rwsem);
        /* Temporary ID allocator recording the numbers already taken */
        ida_init(&inuse);
        xa_for_each (&devices, index, device) {
                char buf[IB_DEVICE_NAME_MAX];

                /* Skip devices whose name does not parse with the pattern */
                if (sscanf(dev_name(&device->dev), name, &i) != 1)
                        continue;
                if (i < 0 || i >= INT_MAX)
                        continue;
                /*
                 * Re-format the parsed number and require an exact match, so
                 * that names which merely parse under the pattern do not
                 * reserve a number.
                 */
                snprintf(buf, sizeof buf, name, i);
                if (strcmp(buf, dev_name(&device->dev)) != 0)
                        continue;

                /* Mark exactly number i as used */
                rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
                if (rc < 0)
                        goto out;
        }

        /* Lowest number not recorded above */
        rc = ida_alloc(&inuse, GFP_KERNEL);
        if (rc < 0)
                goto out;

        rc = dev_set_name(&ibdev->dev, name, rc);
out:
        ida_destroy(&inuse);
        return rc;
}

/*
 * dev_release callback of ib_class: final teardown of the ib_device that
 * embeds @device. Warns if the device refcount is still non-zero.
 */
static void ib_device_release(struct device *device)
{
        struct ib_device *dev = container_of(device, struct ib_device, dev);

        free_netdevs(dev);
        WARN_ON(refcount_read(&dev->refcount));
        if (dev->hw_stats_data)
                ib_device_release_hw_stats(dev->hw_stats_data);
        if (dev->port_data) {
                ib_cache_release_one(dev);
                ib_security_release_port_pkey_list(dev);
                rdma_counter_release(dev);
                /*
                 * port_data points into a struct ib_port_data_rcu; free the
                 * enclosing allocation through its rcu_head.
                 */
                kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
                                       pdata[0]),
                          rcu_head);
        }

        mutex_destroy(&dev->subdev_lock);
        mutex_destroy(&dev->unregistration_lock);
        mutex_destroy(&dev->compat_devs_mutex);

        xa_destroy(&dev->compat_devs);
        xa_destroy(&dev->client_data);
        /* The device memory itself is released through kfree_rcu() as well */
        kfree_rcu(dev, rcu_head);
}

/*
 * dev_uevent callback of ib_class: add NAME=<device name> to the uevent
 * environment. Returns -ENOMEM when the variable cannot be added, otherwise
 * 0.
 */
static int ib_device_uevent(const struct device *device,
                            struct kobj_uevent_env *env)
{
        int err = add_uevent_var(env, "NAME=%s", dev_name(device));

        /*
         * It would be nice to pass the node GUID with the event...
         */

        return err ? -ENOMEM : 0;
}

/*
 * namespace callback of ib_class: return the net namespace recorded in the
 * ib_core_device that embeds @d.
 */
static const void *net_namespace(const struct device *d)
{
        return read_pnet(&container_of(d, struct ib_core_device,
                                       dev)->rdma_net);
}

/*
 * Device class assigned to every ib_core_device by rdma_init_coredev().
 * Devices of this class are tagged with a net namespace, obtained through
 * net_namespace(). Compat devices additionally set their own dev.release
 * (see add_one_compat_dev()).
 */
static struct class ib_class = {
        .name    = "infiniband",
        .dev_release = ib_device_release,
        .dev_uevent = ib_device_uevent,
        .ns_type = &net_ns_type_operations,
        .namespace = net_namespace,
};

/*
 * Initialize @coredev as a core device owned by @dev and bound to @net.
 *
 * @coredev is either the core device embedded in @dev itself (the "full"
 * device) or a separately allocated compat device. In both cases the struct
 * device is initialized, so the caller owns a reference that must be
 * dropped with put_device().
 */
static void rdma_init_coredev(struct ib_core_device *coredev,
                              struct ib_device *dev, struct net *net)
{
        /* True when initializing the core device embedded in @dev */
        bool is_full_dev = &dev->coredev == coredev;

        /* This BUILD_BUG_ON is intended to catch layout change
         * of union of ib_core_device and device.
         * dev must be the first element as ib_core and providers
         * driver uses it. Adding anything in ib_core_device before
         * device will break this assumption.
         */
        BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
                     offsetof(struct ib_device, dev));

        coredev->dev.class = &ib_class;
        coredev->dev.groups = dev->groups;

        /*
         * Don't expose hw counters outside of the init namespace.
         *
         * NOTE(review): coredev->dev.groups is the same array as dev->groups
         * (assigned just above), so this store also changes the array seen
         * through @dev. Confirm that this is intended.
         */
        if (!is_full_dev && dev->hw_stats_attr_index)
                coredev->dev.groups[dev->hw_stats_attr_index] = NULL;

        device_initialize(&coredev->dev);
        coredev->owner = dev;
        INIT_LIST_HEAD(&coredev->port_list);
        write_pnet(&coredev->rdma_net, net);
}

/**
 * _ib_alloc_device - allocate an IB device struct
 * @size: size of structure to allocate
 *
 * Low-level drivers should use ib_alloc_device() to allocate &struct
 * ib_device.  @size is the size of the structure to be allocated,
 * including any private data used by the low-level driver.
 * ib_dealloc_device() must be used to free structures allocated with
 * ib_alloc_device().
 *
 * Return: the zero-initialized device with its locks, lists, xarrays and
 * default uverbs command mask set up and its core device bound to init_net,
 * or NULL when @size is smaller than &struct ib_device, the allocation
 * fails, or rdma_restrack_init() fails.
 */
struct ib_device *_ib_alloc_device(size_t size)
{
        struct ib_device *device;
        unsigned int i;

        /* The driver structure must at least contain a struct ib_device */
        if (WARN_ON(size < sizeof(struct ib_device)))
                return NULL;

        device = kzalloc(size, GFP_KERNEL);
        if (!device)
                return NULL;

        if (rdma_restrack_init(device)) {
                kfree(device);
                return NULL;
        }

        /* Calls device_initialize(); balanced in ib_dealloc_device() */
        rdma_init_coredev(&device->coredev, device, &init_net);

        INIT_LIST_HEAD(&device->event_handler_list);
        spin_lock_init(&device->qp_open_list_lock);
        init_rwsem(&device->event_handler_rwsem);
        mutex_init(&device->unregistration_lock);
        /*
         * client_data needs to be alloc because we don't want our mark to be
         * destroyed if the user stores NULL in the client data.
         */
        xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
        init_rwsem(&device->client_data_rwsem);
        xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
        mutex_init(&device->compat_devs_mutex);
        init_completion(&device->unreg_completion);
        INIT_WORK(&device->unregistration_work, ib_unregister_work);

        spin_lock_init(&device->cq_pools_lock);
        for (i = 0; i < ARRAY_SIZE(device->cq_pools); i++)
                INIT_LIST_HEAD(&device->cq_pools[i]);

        rwlock_init(&device->cache_lock);

        /* Default set of uverbs commands enabled in the command mask */
        device->uverbs_cmd_mask =
                BIT_ULL(IB_USER_VERBS_CMD_ALLOC_MW) |
                BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) |
                BIT_ULL(IB_USER_VERBS_CMD_ATTACH_MCAST) |
                BIT_ULL(IB_USER_VERBS_CMD_CLOSE_XRCD) |
                BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) |
                BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
                BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) |
                BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) |
                BIT_ULL(IB_USER_VERBS_CMD_CREATE_SRQ) |
                BIT_ULL(IB_USER_VERBS_CMD_CREATE_XSRQ) |
                BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_MW) |
                BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) |
                BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) |
                BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) |
                BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) |
                BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) |
                BIT_ULL(IB_USER_VERBS_CMD_DESTROY_SRQ) |
                BIT_ULL(IB_USER_VERBS_CMD_DETACH_MCAST) |
                BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) |
                BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) |
                BIT_ULL(IB_USER_VERBS_CMD_MODIFY_SRQ) |
                BIT_ULL(IB_USER_VERBS_CMD_OPEN_QP) |
                BIT_ULL(IB_USER_VERBS_CMD_OPEN_XRCD) |
                BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) |
                BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) |
                BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) |
                BIT_ULL(IB_USER_VERBS_CMD_QUERY_SRQ) |
                BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
                BIT_ULL(IB_USER_VERBS_CMD_REREG_MR) |
                BIT_ULL(IB_USER_VERBS_CMD_RESIZE_CQ);

        mutex_init(&device->subdev_lock);
        INIT_LIST_HEAD(&device->subdev_list_head);
        INIT_LIST_HEAD(&device->subdev_list);

        return device;
}
EXPORT_SYMBOL(_ib_alloc_device);

/**
 * ib_dealloc_device - free an IB device struct
 * @device: structure to free
 *
 * Free a structure allocated with ib_alloc_device(). Calls the driver's
 * dealloc_driver op if one is set, removes the device from the devices
 * xarray if it is still stored there, and drops the reference taken by
 * device_initialize(). Warns if compat devices, client data or device
 * references are still present.
 */
void ib_dealloc_device(struct ib_device *device)
{
        if (device->ops.dealloc_driver)
                device->ops.dealloc_driver(device);

        /*
         * ib_unregister_driver() requires all devices to remain in the xarray
         * while their ops are callable. The last op we call is dealloc_driver
         * above.  This is needed to create a fence on op callbacks prior to
         * allowing the driver module to unload.
         */
        down_write(&devices_rwsem);
        /* Only erase the slot if it still holds this very device */
        if (xa_load(&devices, device->index) == device)
                xa_erase(&devices, device->index);
        up_write(&devices_rwsem);

        /* Expedite releasing netdev references */
        free_netdevs(device);

        WARN_ON(!xa_empty(&device->compat_devs));
        WARN_ON(!xa_empty(&device->client_data));
        WARN_ON(refcount_read(&device->refcount));
        rdma_restrack_clean(device);
        /* Balances with device_initialize */
        put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence, any caller must not return until the add
 * or remove is fully completed.
 */
/*
 * Attach @client to @device: reserve the client's slot in
 * device->client_data, call the client's add() callback and, on success,
 * mark the slot CLIENT_DATA_REGISTERED.
 *
 * On success one reference on the client (client->uses) and one on the
 * device (device->refcount) stay held for the lifetime of the context; they
 * are dropped in remove_client_context().
 *
 * Returns 0 in all cases except when reserving the slot fails, in which
 * case the xa_store() error is returned. A failing add() callback is not
 * reported to the caller.
 */
static int add_client_context(struct ib_device *device,
                              struct ib_client *client)
{
        int ret = 0;

        /* Nothing to do unless the device or the client opts in */
        if (!device->kverbs_provider && !client->no_kverbs_req)
                return 0;

        down_write(&device->client_data_rwsem);
        /*
         * So long as the client is registered hold both the client and device
         * unregistration locks.
         */
        if (!refcount_inc_not_zero(&client->uses))
                goto out_unlock;
        refcount_inc(&device->refcount);

        /*
         * Another caller to add_client_context got here first and has already
         * completely initialized context.
         */
        if (xa_get_mark(&device->client_data, client->client_id,
                    CLIENT_DATA_REGISTERED))
                goto out;

        /* Reserve the slot with a NULL value until add() has run */
        ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
                              GFP_KERNEL));
        if (ret)
                goto out;
        /* The callback runs with only the read side held */
        downgrade_write(&device->client_data_rwsem);
        if (client->add) {
                if (client->add(device)) {
                        /*
                         * If a client fails to add then the error code is
                         * ignored, but we won't call any more ops on this
                         * client.
                         */
                        xa_erase(&device->client_data, client->client_id);
                        up_read(&device->client_data_rwsem);
                        ib_device_put(device);
                        ib_client_put(client);
                        return 0;
                }
        }

        /* Readers shall not see a client until add has been completed */
        xa_set_mark(&device->client_data, client->client_id,
                    CLIENT_DATA_REGISTERED);
        up_read(&device->client_data_rwsem);
        return 0;

out:
        /* Undo the two references taken above */
        ib_device_put(device);
        ib_client_put(client);
out_unlock:
        up_write(&device->client_data_rwsem);
        return ret;
}

/*
 * Detach the client identified by @client_id from @device.
 *
 * Does nothing unless the slot is marked CLIENT_DATA_REGISTERED. Otherwise
 * the mark is cleared, the client's remove() callback is called with the
 * stored client data, the slot is erased, and the device and client
 * references held by the context are dropped.
 */
static void remove_client_context(struct ib_device *device,
                                  unsigned int client_id)
{
        struct ib_client *client;
        void *client_data;

        down_write(&device->client_data_rwsem);
        if (!xa_get_mark(&device->client_data, client_id,
                         CLIENT_DATA_REGISTERED)) {
                up_write(&device->client_data_rwsem);
                return;
        }
        client_data = xa_load(&device->client_data, client_id);
        /* Clearing the mark under the write lock hides the context */
        xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
        client = xa_load(&clients, client_id);
        up_write(&device->client_data_rwsem);

        /*
         * Notice we cannot be holding any exclusive locks when calling the
         * remove callback as the remove callback can recurse back into any
         * public functions in this module and thus try for any locks those
         * functions take.
         *
         * For this reason clients and drivers should not call the
         * unregistration functions while holding any locks.
         */
        if (client->remove)
                client->remove(device, client_data);

        xa_erase(&device->client_data, client_id);
        /* Drop the references held by the client context */
        ib_device_put(device);
        ib_client_put(client);
}

/*
 * Allocate and initialize device->port_data if that has not happened yet.
 *
 * Returns 0 on success or when port_data already exists, -EINVAL when
 * phys_port_cnt is 0 or U32_MAX, and -ENOMEM when the allocation fails.
 */
static int alloc_port_data(struct ib_device *device)
{
        struct ib_port_data_rcu *pdata_rcu;
        u32 port;

        /* Already allocated by an earlier call */
        if (device->port_data)
                return 0;

        /* This can only be called once the physical port range is defined */
        if (WARN_ON(!device->phys_port_cnt))
                return -EINVAL;

        /* Reserve U32_MAX so the logic to go over all the ports is sane */
        if (WARN_ON(device->phys_port_cnt == U32_MAX))
                return -EINVAL;

        /*
         * device->port_data is indexed directly by the port number to make
         * access to this data as efficient as possible.
         *
         * Therefore port_data is declared as a 1 based array with potential
         * empty slots at the beginning.
         */
        pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
                                        size_add(rdma_end_port(device), 1)),
                            GFP_KERNEL);
        if (!pdata_rcu)
                return -ENOMEM;
        /*
         * The rcu_head is put in front of the port data array and the stored
         * pointer is adjusted since we never need to see that member until
         * kfree_rcu.
         */
        device->port_data = pdata_rcu->pdata;

        /* Set up the locks and lists of every port of the device */
        rdma_for_each_port (device, port) {
                struct ib_port_data *pdata = &device->port_data[port];

                pdata->ib_dev = device;
                spin_lock_init(&pdata->pkey_list_lock);
                INIT_LIST_HEAD(&pdata->pkey_list);
                spin_lock_init(&pdata->netdev_lock);
                INIT_HLIST_NODE(&pdata->ndev_hash_link);
        }
        return 0;
}

/*
 * Sanity check the immutable data of @port: a port without IB MAD support
 * must report a maximum MAD size of 0. Warns and returns non-zero when that
 * does not hold, returns 0 otherwise.
 */
static int verify_immutable(const struct ib_device *dev, u32 port)
{
        if (rdma_cap_ib_mad(dev, port))
                return 0;

        return WARN_ON(rdma_max_mad_size(dev, port) != 0);
}

/*
 * Make sure the port data array exists and fill in the immutable data of
 * every port through the driver's get_port_immutable op.
 *
 * Returns 0 on success, the error from alloc_port_data() or from the driver
 * op, or -EINVAL when verify_immutable() rejects a port.
 */
static int setup_port_data(struct ib_device *device)
{
        u32 port;
        int err;

        err = alloc_port_data(device);
        if (err)
                return err;

        rdma_for_each_port (device, port) {
                struct ib_port_immutable *immutable =
                        &device->port_data[port].immutable;

                err = device->ops.get_port_immutable(device, port, immutable);
                if (err)
                        return err;

                if (verify_immutable(device, port))
                        return -EINVAL;
        }

        return 0;
}

/**
 * ib_port_immutable_read() - Read rdma port's immutable data
 * @dev: IB device
 * @port: port number whose immutable data to read. It starts with index 1 and
 *        valid upto including rdma_end_port().
 */
const struct ib_port_immutable*
ib_port_immutable_read(struct ib_device *dev, unsigned int port)
{
        WARN_ON(!rdma_is_port_valid(dev, port));
        return &dev->port_data[port].immutable;
}
EXPORT_SYMBOL(ib_port_immutable_read);

/*
 * Fill @str with the firmware version string of @dev by calling the
 * driver's get_dev_fw_str op. When the driver does not provide the op,
 * @str is set to the empty string.
 */
void ib_get_device_fw_str(struct ib_device *dev, char *str)
{
        if (!dev->ops.get_dev_fw_str) {
                str[0] = '\0';
                return;
        }

        dev->ops.get_dev_fw_str(dev, str);
}
EXPORT_SYMBOL(ib_get_device_fw_str);

/*
 * Work handler of ib_policy_change_work. For every port of every registered
 * device, read the cached subnet prefix and pass it to
 * ib_security_cache_change(). @work is not used.
 */
static void ib_policy_change_task(struct work_struct *work)
{
        struct ib_device *ibdev;
        unsigned long idx;

        down_read(&devices_rwsem);
        xa_for_each_marked (&devices, idx, ibdev, DEVICE_REGISTERED) {
                unsigned int port;

                rdma_for_each_port (ibdev, port) {
                        u64 subnet_prefix;

                        ib_get_cached_subnet_prefix(ibdev, port,
                                                    &subnet_prefix);
                        ib_security_cache_change(ibdev, port, subnet_prefix);
                }
        }
        up_read(&devices_rwsem);
}

/*
 * Notifier callback of ibdev_lsm_nb. On LSM_POLICY_CHANGE it queues
 * ib_policy_change_work, notifies the MAD agents and returns NOTIFY_OK;
 * every other event is answered with NOTIFY_DONE.
 */
static int ib_security_change(struct notifier_block *nb, unsigned long event,
                              void *lsm_data)
{
        if (event == LSM_POLICY_CHANGE) {
                schedule_work(&ib_policy_change_work);
                ib_mad_agent_security_change();
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}

/*
 * Release callback installed on compat devices by add_one_compat_dev():
 * frees the ib_core_device that embeds @dev.
 */
static void compatdev_release(struct device *dev)
{
        kfree(container_of(dev, struct ib_core_device, dev));
}

/*
 * Create the compat device of @device for the net namespace tracked by
 * @rnet and store it in device->compat_devs at index rnet->id.
 *
 * The caller must hold rdma_nets_rwsem.
 *
 * Returns 0 when the compat device was created, when one already exists for
 * this namespace, when devices are not shared among namespaces, or when
 * @rnet is the namespace the device itself is bound to. Returns a negative
 * errno when any step of the creation fails; in that case everything done
 * so far is undone.
 */
static int add_one_compat_dev(struct ib_device *device,
                              struct rdma_dev_net *rnet)
{
        struct ib_core_device *cdev;
        int ret;

        lockdep_assert_held(&rdma_nets_rwsem);
        /* Compat devices only exist in shared mode */
        if (!ib_devices_shared_netns)
                return 0;

        /*
         * Create and add compat device in all namespaces other than where it
         * is currently bound to.
         */
        if (net_eq(read_pnet(&rnet->net),
                   read_pnet(&device->coredev.rdma_net)))
                return 0;

        /*
         * The first of init_net() or ib_register_device() to take the
         * compat_devs_mutex wins and gets to add the device. Others will wait
         * for completion here.
         */
        mutex_lock(&device->compat_devs_mutex);
        cdev = xa_load(&device->compat_devs, rnet->id);
        if (cdev) {
                ret = 0;
                goto done;
        }
        /* Reserve the slot up front; released again on the error paths */
        ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
        if (ret)
                goto done;

        cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
        if (!cdev) {
                ret = -ENOMEM;
                goto cdev_err;
        }

        cdev->dev.parent = device->dev.parent;
        rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
        /* From here on cdev is freed through put_device() */
        cdev->dev.release = compatdev_release;
        /* The compat device carries the same name as the real device */
        ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
        if (ret)
                goto add_err;

        ret = device_add(&cdev->dev);
        if (ret)
                goto add_err;
        ret = ib_setup_port_attrs(cdev);
        if (ret)
                goto port_err;

        ret = xa_err(xa_store(&device->compat_devs, rnet->id,
                              cdev, GFP_KERNEL));
        if (ret)
                goto insert_err;

        mutex_unlock(&device->compat_devs_mutex);
        return 0;

        /* Error unwinding, in reverse order of the setup steps above */
insert_err:
        ib_free_port_attrs(cdev);
port_err:
        device_del(&cdev->dev);
add_err:
        put_device(&cdev->dev);
cdev_err:
        xa_release(&device->compat_devs, rnet->id);
done:
        mutex_unlock(&device->compat_devs_mutex);
        return ret;
}

/*
 * Remove the compat device stored at index @id of device->compat_devs, if
 * there is one. The entry is erased under compat_devs_mutex; the teardown
 * of the compat device itself happens after the mutex has been dropped.
 */
static void remove_one_compat_dev(struct ib_device *device, u32 id)
{
        struct ib_core_device *compat;

        mutex_lock(&device->compat_devs_mutex);
        compat = xa_erase(&device->compat_devs, id);
        mutex_unlock(&device->compat_devs_mutex);

        if (!compat)
                return;

        ib_free_port_attrs(compat);
        device_del(&compat->dev);
        put_device(&compat->dev);
}

static void remove_compat_devs(struct ib_device *device)
{
        struct ib_core_device *cdev;
        unsigned long index;

        xa_for_each (&device->compat_devs, index, cdev)
                remove_one_compat_dev(device, index);
}

/*
 * Create a compat device of @device for every net namespace in rdma_nets.
 * The caller must hold devices_rwsem. Stops at the first failure and
 * returns its error; compat devices created before the failure are left in
 * place. Returns 0 on success.
 */
static int add_compat_devs(struct ib_device *device)
{
        struct rdma_dev_net *netns;
        unsigned long idx;
        int err = 0;

        lockdep_assert_held(&devices_rwsem);

        down_read(&rdma_nets_rwsem);
        xa_for_each (&rdma_nets, idx, netns) {
                err = add_one_compat_dev(device, netns);
                if (err)
                        break;
        }
        up_read(&rdma_nets_rwsem);

        return err;
}

/*
 * Remove the compat devices of every device in the devices xarray,
 * registered or not.
 */
static void remove_all_compat_devs(void)
{
        /*
         * compat_devs stores struct ib_core_device pointers (see
         * add_one_compat_dev()); the iterator is declared with that type.
         */
        struct ib_core_device *cdev;
        struct ib_device *dev;
        unsigned long index;

        down_read(&devices_rwsem);
        xa_for_each (&devices, index, dev) {
                unsigned long c_index = 0;

                /* Hold nets_rwsem so that any other thread modifying this
                 * system param can sync with this thread.
                 */
                down_read(&rdma_nets_rwsem);
                xa_for_each (&dev->compat_devs, c_index, cdev)
                        remove_one_compat_dev(dev, c_index);
                up_read(&rdma_nets_rwsem);
        }
        up_read(&devices_rwsem);
}

/*
 * Create compat devices for every registered device in every net namespace
 * stored in rdma_nets.
 *
 * Stops at the first failure. In that case all compat devices are removed
 * again through remove_all_compat_devs() and the error is returned.
 * Returns 0 on success.
 */
static int add_all_compat_devs(void)
{
        struct rdma_dev_net *rnet;
        struct ib_device *dev;
        unsigned long index;
        int ret = 0;

        down_read(&devices_rwsem);
        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
                unsigned long net_index = 0;

                /* Hold nets_rwsem so that any other thread modifying this
                 * system param can sync with this thread.
                 */
                down_read(&rdma_nets_rwsem);
                xa_for_each (&rdma_nets, net_index, rnet) {
                        ret = add_one_compat_dev(dev, rnet);
                        if (ret)
                                break;
                }
                up_read(&rdma_nets_rwsem);

                /*
                 * The break above only leaves the namespace loop. Leave the
                 * device loop as well, otherwise a later device that
                 * succeeds would overwrite the error with 0 and the cleanup
                 * below would be skipped.
                 */
                if (ret)
                        break;
        }
        up_read(&devices_rwsem);
        if (ret)
                remove_all_compat_devs();
        return ret;
}

/*
 * Switch ib_devices_shared_netns to @enable.
 *
 * Returns 0 when the mode already matches @enable or the switch succeeded,
 * -EBUSY when rdma_nets has at least one visible entry, or the error from
 * add_all_compat_devs() when enabling.
 */
int rdma_compatdev_set(u8 enable)
{
        struct rdma_dev_net *rnet;
        unsigned long index;
        bool have_nets = false;

        down_write(&rdma_nets_rwsem);
        if (ib_devices_shared_netns == enable) {
                up_write(&rdma_nets_rwsem);
                return 0;
        }

        /* enable/disable of compat devices is not supported
         * when more than default init_net exists.
         */
        xa_for_each (&rdma_nets, index, rnet) {
                have_nets = true;
                break;
        }
        if (!have_nets)
                ib_devices_shared_netns = enable;
        up_write(&rdma_nets_rwsem);

        if (have_nets)
                return -EBUSY;

        if (!enable) {
                remove_all_compat_devs();
                return 0;
        }
        return add_all_compat_devs();
}

/*
 * Per-net exit callback (see rdma_dev_net_ops). Hides this namespace's id
 * from rdma_nets iteration, then for every device calls
 * remove_one_compat_dev() for this namespace's id and
 * rdma_dev_change_netns() to move the device to init_net if it currently
 * lives in @net. Finally runs rdma_nl_net_exit() and erases the id.
 */
static void rdma_dev_exit_net(struct net *net)
{
        struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
        struct ib_device *dev;
        unsigned long index;
        int ret;

        down_write(&rdma_nets_rwsem);
        /*
         * Prevent the ID from being re-used and hide the id from xa_for_each.
         */
        ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
        WARN_ON(ret);
        up_write(&rdma_nets_rwsem);

        down_read(&devices_rwsem);
        xa_for_each (&devices, index, dev) {
                /* Keep the struct device alive while devices_rwsem is dropped */
                get_device(&dev->dev);
                /*
                 * Release the devices_rwsem so that potentially blocking
                 * device_del, doesn't hold the devices_rwsem for too long.
                 */
                up_read(&devices_rwsem);

                remove_one_compat_dev(dev, rnet->id);

                /*
                 * If the real device is in the NS then move it back to init.
                 * The return value is deliberately ignored:
                 * rdma_dev_change_netns() returns -ENODEV for devices that
                 * are not in @net.
                 */
                rdma_dev_change_netns(dev, net, &init_net);

                put_device(&dev->dev);
                /* Re-take the lock before xa_for_each advances */
                down_read(&devices_rwsem);
        }
        up_read(&devices_rwsem);

        rdma_nl_net_exit(rnet);
        /* Release the id that was only hidden (stored as NULL) above */
        xa_erase(&rdma_nets, rnet->id);
}

/*
 * Per-net init callback (see rdma_dev_net_ops). Records @net in the
 * rdma_dev_net and runs rdma_nl_net_init(). For any namespace other than
 * init_net it also allocates an id in rdma_nets and calls
 * add_one_compat_dev() for every registered device.
 *
 * Returns 0 on success or a negative errno. If adding a compat device fails,
 * rdma_dev_exit_net() is called to undo the work done so far.
 */
static __net_init int rdma_dev_init_net(struct net *net)
{
        struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
        unsigned long index;
        struct ib_device *dev;
        int ret;

        write_pnet(&rnet->net, net);

        ret = rdma_nl_net_init(rnet);
        if (ret)
                return ret;

        /* No need to create any compat devices in default init_net. */
        if (net_eq(net, &init_net))
                return 0;

        ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
        if (ret) {
                /* Nothing but the netlink setup exists yet; undo only that */
                rdma_nl_net_exit(rnet);
                return ret;
        }

        down_read(&devices_rwsem);
        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
                /* Hold nets_rwsem so that netlink command cannot change
                 * system configuration for device sharing mode.
                 */
                down_read(&rdma_nets_rwsem);
                ret = add_one_compat_dev(dev, rnet);
                up_read(&rdma_nets_rwsem);
                if (ret)
                        break;
        }
        up_read(&devices_rwsem);

        /* Full teardown: removes compat devs, netlink state and the id */
        if (ret)
                rdma_dev_exit_net(net);

        return ret;
}

/*
 * Assign the unique string device name and the unique device index. This is
 * undone by ib_dealloc_device.
 *
 * A @name containing '%' is handed to alloc_name(); any other name goes
 * straight to dev_set_name(). Returns 0 on success, -ENFILE when
 * __ib_device_get_by_name() already finds the resulting name, or the error
 * from naming or from the index allocation.
 */
static int assign_name(struct ib_device *device, const char *name)
{
        static u32 last_id;
        int err;

        down_write(&devices_rwsem);

        /* Assign a unique name to the device */
        err = strchr(name, '%') ? alloc_name(device, name) :
                                  dev_set_name(&device->dev, name);
        if (err)
                goto unlock;

        if (__ib_device_get_by_name(dev_name(&device->dev))) {
                err = -ENFILE;
                goto unlock;
        }
        strscpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);

        err = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
                              &last_id, GFP_KERNEL);
        /* A positive return value is not reported as a failure */
        if (err > 0)
                err = 0;

unlock:
        up_write(&devices_rwsem);
        return err;
}

/*
 * setup_device() allocates memory and sets up data that requires calling the
 * device ops, this is the only reason these actions are not done during
 * ib_alloc_device. It is undone by ib_dealloc_device().
 *
 * Returns 0 on success, or the error from setup_port_data() or from the
 * driver's query_device op (a warning is logged in either case).
 */
static int setup_device(struct ib_device *device)
{
        struct ib_udata uhw = {.outlen = 0, .inlen = 0};
        int err;

        ib_device_check_mandatory(device);

        err = setup_port_data(device);
        if (err) {
                dev_warn(&device->dev, "Couldn't create per-port data\n");
                return err;
        }

        /* Start from zeroed attributes before asking the driver */
        memset(&device->attrs, 0, sizeof(device->attrs));
        err = device->ops.query_device(device, &device->attrs, &uhw);
        if (!err)
                return 0;

        dev_warn(&device->dev,
                 "Couldn't query the device attributes\n");
        return err;
}

/*
 * Take @device out of service: clear DEVICE_REGISTERED, remove every client
 * context in descending client_id order, clean up the CQ pool, drop the
 * reference installed by enable_device_and_get() and wait for
 * unreg_completion, then remove the compat devices.
 *
 * Both callers visible in this file (__ib_unregister_device() and
 * rdma_dev_change_netns()) invoke it with unregistration_lock held and a
 * non-zero refcount.
 */
static void disable_device(struct ib_device *device)
{
        u32 cid;

        WARN_ON(!refcount_read(&device->refcount));

        down_write(&devices_rwsem);
        xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
        up_write(&devices_rwsem);

        /*
         * Remove clients in LIFO order, see assign_client_id. This could be
         * more efficient if xarray learns to reverse iterate. Since no new
         * clients can be added to this ib_device past this point we only need
         * the maximum possible client_id value here.
         */
        down_read(&clients_rwsem);
        cid = highest_client_id;
        up_read(&clients_rwsem);
        while (cid) {
                cid--;
                remove_client_context(device, cid);
        }

        ib_cq_pool_cleanup(device);

        /* Pairs with refcount_set in enable_device_and_get */
        ib_device_put(device);
        wait_for_completion(&device->unreg_completion);

        /*
         * compat devices must be removed after device refcount drops to zero.
         * Otherwise init_net() may add more compatdevs after removing compat
         * devices and before device is disabled.
         */
        remove_compat_devs(device);
}

/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 *
 * Returns 0 on success, or the first error from the driver's enable_driver
 * op, add_client_context() or add_compat_devs(). Nothing is unwound here on
 * failure; the callers in this file handle that.
 */
static int enable_device_and_get(struct ib_device *device)
{
        struct ib_client *client;
        unsigned long index;
        int ret = 0;

        /*
         * One ref belongs to the xa and the other belongs to this
         * thread. This is needed to guard against parallel unregistration.
         */
        refcount_set(&device->refcount, 2);
        down_write(&devices_rwsem);
        xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

        /*
         * By using downgrade_write() we ensure that no other thread can clear
         * DEVICE_REGISTERED while we are completing the client setup.
         */
        downgrade_write(&devices_rwsem);

        /* enable_driver is optional */
        if (device->ops.enable_driver) {
                ret = device->ops.enable_driver(device);
                if (ret)
                        goto out;
        }

        down_read(&clients_rwsem);
        xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
                ret = add_client_context(device, client);
                if (ret)
                        break;
        }
        up_read(&clients_rwsem);
        /* add_compat_devs() asserts devices_rwsem, which is still held here */
        if (!ret)
                ret = add_compat_devs(device);
out:
        /* The lock was downgraded above, so release it as a reader */
        up_read(&devices_rwsem);
        return ret;
}

/*
 * Intentionally empty. Its address is used as a sentinel value for
 * ops.dealloc_driver: ib_register_device() installs it temporarily on its
 * error path, and __ib_unregister_device() skips ib_dealloc_device() when it
 * sees this pointer.
 */
static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}

/*
 * Send RDMA_REGISTER_EVENT for @device, followed by one
 * RDMA_NETDEV_ATTACH_EVENT for each port that currently has a netdev.
 *
 * Notification stops at the first rdma_nl_notify_event() failure. The error
 * is not reported to the caller.
 */
static void ib_device_notify_register(struct ib_device *device)
{
        struct net_device *netdev;
        u32 port;
        int ret;

        down_read(&devices_rwsem);

        ret = rdma_nl_notify_event(device, 0, RDMA_REGISTER_EVENT);
        if (ret)
                goto out;

        rdma_for_each_port(device, port) {
                /* Ports without a netdev get no attach event */
                netdev = ib_device_get_netdev(device, port);
                if (!netdev)
                        continue;

                ret = rdma_nl_notify_event(device, port,
                                           RDMA_NETDEV_ATTACH_EVENT);
                /* Drop the netdev reference before acting on the result */
                dev_put(netdev);
                if (ret)
                        goto out;
        }

out:
        up_read(&devices_rwsem);
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 *           cause a unique index to be added to the passed device name.
 * @dma_device: pointer to a DMA-capable device. If %NULL, then the IB
 *                device will be used. In this case the caller should fully
 *                setup the ibdev for DMA. This usually means using dma_virt_ops.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core.  All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 *
 * Return: 0 on success or a negative errno. On failure the caller is
 * expected to call ib_dealloc_device().
 */
int ib_register_device(struct ib_device *device, const char *name,
                       struct device *dma_device)
{
        int ret;

        ret = assign_name(device, name);
        if (ret)
                return ret;

        /*
         * If the caller does not provide a DMA capable device then the IB core
         * will set up ib_sge and scatterlist structures that stash the kernel
         * virtual address into the address field.
         */
        WARN_ON(dma_device && !dma_device->dma_parms);
        device->dma_device = dma_device;

        /*
         * No local unwind for the early failures below: assign_name() and
         * setup_device() are undone by ib_dealloc_device().
         */
        ret = setup_device(device);
        if (ret)
                return ret;

        ret = ib_cache_setup_one(device);
        if (ret) {
                dev_warn(&device->dev,
                         "Couldn't set up InfiniBand P_Key/GID cache\n");
                return ret;
        }

        device->groups[0] = &ib_dev_attr_group;
        device->groups[1] = device->ops.device_group;
        ret = ib_setup_device_attrs(device);
        if (ret)
                goto cache_cleanup;

        ib_device_register_rdmacg(device);

        rdma_counter_init(device);

        /*
         * Ensure that ADD uevent is not fired because it
         * is too early and device is not initialized yet.
         */
        dev_set_uevent_suppress(&device->dev, true);
        ret = device_add(&device->dev);
        if (ret)
                goto cg_cleanup;

        ret = ib_setup_port_attrs(&device->coredev);
        if (ret) {
                dev_warn(&device->dev,
                         "Couldn't register device with driver model\n");
                goto dev_cleanup;
        }

        ret = enable_device_and_get(device);
        if (ret) {
                void (*dealloc_fn)(struct ib_device *);

                /*
                 * If we hit this error flow then we don't want to
                 * automatically dealloc the device since the caller is
                 * expected to call ib_dealloc_device() after
                 * ib_register_device() fails. This is tricky due to the
                 * possibility for a parallel unregistration along with this
                 * error flow. Since we have a refcount here we know any
                 * parallel flow is stopped in disable_device and will see the
                 * special dealloc_driver pointer, causing the responsibility to
                 * ib_dealloc_device() to revert back to this thread.
                 */
                dealloc_fn = device->ops.dealloc_driver;
                device->ops.dealloc_driver = prevent_dealloc_device;
                /* Drop the get that enable_device_and_get() always returns */
                ib_device_put(device);
                __ib_unregister_device(device);
                device->ops.dealloc_driver = dealloc_fn;
                dev_set_uevent_suppress(&device->dev, false);
                return ret;
        }
        dev_set_uevent_suppress(&device->dev, false);
        /* Mark for userspace that device is ready */
        kobject_uevent(&device->dev.kobj, KOBJ_ADD);

        ib_device_notify_register(device);
        /* Drop the get that enable_device_and_get() returned */
        ib_device_put(device);

        return 0;

        /* Unwind in reverse order of setup; the labels fall through */
dev_cleanup:
        device_del(&device->dev);
cg_cleanup:
        dev_set_uevent_suppress(&device->dev, false);
        ib_device_unregister_rdmacg(device);
cache_cleanup:
        ib_cache_cleanup_one(device);
        return ret;
}
EXPORT_SYMBOL(ib_register_device);

/*
 * Callers must hold a get on the device.
 *
 * Deletes the sub-devices first, then, under unregistration_lock, disables
 * the device and tears down what ib_register_device() set up. A device whose
 * refcount is already zero is left untouched. When ops.dealloc_driver is set
 * (and is not the prevent_dealloc_device sentinel), ib_dealloc_device() is
 * called as the final step.
 */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
        struct ib_device *sub, *tmp;

        mutex_lock(&ib_dev->subdev_lock);
        /* Sub-devices are deleted in reverse list order */
        list_for_each_entry_safe_reverse(sub, tmp,
                                         &ib_dev->subdev_list_head,
                                         subdev_list) {
                list_del(&sub->subdev_list);
                ib_dev->ops.del_sub_dev(sub);
                ib_device_put(ib_dev);
        }
        mutex_unlock(&ib_dev->subdev_lock);

        /*
         * We have a registration lock so that all the calls to unregister are
         * fully fenced, once any unregister returns the device is truly
         * unregistered even if multiple callers are unregistering it at the
         * same time. This also interacts with the registration flow and
         * provides sane semantics if register and unregister are racing.
         */
        mutex_lock(&ib_dev->unregistration_lock);
        /* Zero refcount: nothing is registered, so there is nothing to undo */
        if (!refcount_read(&ib_dev->refcount))
                goto out;

        disable_device(ib_dev);
        rdma_nl_notify_event(ib_dev, 0, RDMA_UNREGISTER_EVENT);

        /* Expedite removing unregistered pointers from the hash table */
        free_netdevs(ib_dev);

        ib_free_port_attrs(&ib_dev->coredev);
        device_del(&ib_dev->dev);
        ib_device_unregister_rdmacg(ib_dev);
        ib_cache_cleanup_one(ib_dev);

        /*
         * Drivers using the new flow may not call ib_dealloc_device except
         * in error unwind prior to registration success.
         */
        if (ib_dev->ops.dealloc_driver &&
            ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
                WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
                ib_dealloc_device(ib_dev);
        }
out:
        mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device.  All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
        /*
         * Hold the struct device across the call: __ib_unregister_device()
         * may call ib_dealloc_device() when ops.dealloc_driver is set.
         */
        get_device(&ib_dev->dev);
        __ib_unregister_device(ib_dev);
        put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
        WARN_ON(!ib_dev->ops.dealloc_driver);
        /*
         * Take the struct device reference before dropping the caller's
         * 'get', so that a reference is held at every point in this function.
         */
        get_device(&ib_dev->dev);
        ib_device_put(ib_dev);
        __ib_unregister_device(ib_dev);
        put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id, that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
        struct ib_device *ib_dev;
        unsigned long index;

        down_read(&devices_rwsem);
        xa_for_each (&devices, index, ib_dev) {
                if (ib_dev->ops.driver_id != driver_id)
                        continue;

                /*
                 * Pin the struct device, then drop devices_rwsem for the
                 * duration of the unregistration.
                 */
                get_device(&ib_dev->dev);
                up_read(&devices_rwsem);

                WARN_ON(!ib_dev->ops.dealloc_driver);
                __ib_unregister_device(ib_dev);

                put_device(&ib_dev->dev);
                /* Re-take the lock before xa_for_each advances */
                down_read(&devices_rwsem);
        }
        up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

/*
 * Work function for ib_device.unregistration_work: unregister the device
 * that embeds @work, then drop one struct device reference.
 */
static void ib_unregister_work(struct work_struct *work)
{
        struct ib_device *device;

        device = container_of(work, struct ib_device, unregistration_work);

        __ib_unregister_device(device);
        put_device(&device->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * Schedules an asynchronous unregistration of @ib_dev on a work queue.
 * Drivers should use this when they cannot unregister synchronously because
 * of the locks they hold, such as the RTNL lock.
 *
 * Drivers using this API must call ib_unregister_driver before module unload
 * so that every scheduled unregistration has completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
        WARN_ON(!refcount_read(&ib_dev->refcount));
        WARN_ON(!ib_dev->ops.dealloc_driver);

        /* Reference for the queued work; ib_unregister_work() ends with a put */
        get_device(&ib_dev->dev);
        if (queue_work(ib_unreg_wq, &ib_dev->unregistration_work))
                return;

        /* queue_work() returned false: give back the reference taken above */
        put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

/*
 * The caller must pass in a device that has the kref held and the refcount
 * released. If the device is in cur_net and still registered then it is moved
 * into net.
 *
 * Returns -ENODEV when the device is unregistered or not in @cur_net.
 * Otherwise returns the device_rename() error if there was one, else the
 * result of re-enabling the device (0 on success).
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
                                 struct net *net)
{
        int ret2 = -EINVAL;
        int ret;

        mutex_lock(&device->unregistration_lock);

        /*
         * If a device not under ib_device_get() or if the unregistration_lock
         * is not held, the namespace can be changed, or it can be unregistered.
         * Check again under the lock.
         */
        if (refcount_read(&device->refcount) == 0 ||
            !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
                ret = -ENODEV;
                goto out;
        }

        kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
        disable_device(device);

        /*
         * At this point no one can be using the device, so it is safe to
         * change the namespace.
         */
        write_pnet(&device->coredev.rdma_net, net);

        down_read(&devices_rwsem);
        /*
         * Currently rdma devices are system wide unique. So the device name
         * is guaranteed free in the new namespace. Publish the new namespace
         * at the sysfs level.
         */
        ret = device_rename(&device->dev, dev_name(&device->dev));
        up_read(&devices_rwsem);
        if (ret) {
                dev_warn(&device->dev,
                         "%s: Couldn't rename device after namespace change\n",
                         __func__);
                /* Try and put things back and re-enable the device */
                write_pnet(&device->coredev.rdma_net, cur_net);
        }

        /* Re-enable whether or not the rename succeeded */
        ret2 = enable_device_and_get(device);
        if (ret2) {
                /*
                 * This shouldn't really happen, but if it does, let the user
                 * retry at later point. So don't disable the device.
                 */
                dev_warn(&device->dev,
                         "%s: Couldn't re-enable device after namespace change\n",
                         __func__);
        }
        kobject_uevent(&device->dev.kobj, KOBJ_ADD);

        /* Drop the get that enable_device_and_get() always returns */
        ib_device_put(device);
out:
        mutex_unlock(&device->unregistration_lock);
        /* A rename (or -ENODEV) error takes precedence over ret2 */
        if (ret)
                return ret;
        return ret2;
}

/*
 * Move @dev from the current task's net namespace into the one referred to
 * by @ns_fd. The caller's ib_device reference on @dev is dropped on every
 * path, success or failure.
 *
 * Returns 0 on success; the get_net_ns_by_fd() error if @ns_fd cannot be
 * resolved; -EPERM if the sender of @skb lacks CAP_NET_ADMIN in the target
 * namespace's user namespace; -EOPNOTSUPP if the driver has no
 * disassociate_ucontext op or ib_devices_shared_netns is set; or the error
 * from rdma_dev_change_netns().
 */
int ib_device_set_netns_put(struct sk_buff *skb,
                            struct ib_device *dev, u32 ns_fd)
{
        struct net *net;
        int ret;

        net = get_net_ns_by_fd(ns_fd);
        if (IS_ERR(net)) {
                ret = PTR_ERR(net);
                goto net_err;
        }

        if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
                ret = -EPERM;
                goto ns_err;
        }

        /*
         * All the ib_clients, including uverbs, are reset when the namespace is
         * changed and this cannot be blocked waiting for userspace to do
         * something, so disassociation is mandatory.
         */
        if (!dev->ops.disassociate_ucontext || ib_devices_shared_netns) {
                ret = -EOPNOTSUPP;
                goto ns_err;
        }

        /*
         * rdma_dev_change_netns() wants the kref held and the refcount
         * released, so swap the caller's ib_device reference for a struct
         * device reference.
         */
        get_device(&dev->dev);
        ib_device_put(dev);
        ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
        put_device(&dev->dev);

        put_net(net);
        return ret;

ns_err:
        put_net(net);
net_err:
        ib_device_put(dev);
        return ret;
}

/*
 * Per-net-namespace operations: rdma_dev_init_net() and rdma_dev_exit_net()
 * are the init/exit callbacks. The private area is sized for one
 * struct rdma_dev_net and keyed by rdma_dev_net_id.
 */
static struct pernet_operations rdma_dev_net_ops = {
        .init = rdma_dev_init_net,
        .exit = rdma_dev_exit_net,
        .id = &rdma_dev_net_id,
        .size = sizeof(struct rdma_dev_net),
};

/*
 * Give @client the next client_id and insert it into the clients xarray,
 * marked CLIENT_REGISTERED.
 *
 * The caller must hold clients_rwsem. Returns 0 on success or the
 * xa_insert() error, in which case highest_client_id is left unchanged.
 */
static int assign_client_id(struct ib_client *client)
{
        int err;

        lockdep_assert_held(&clients_rwsem);

        /*
         * The add/remove callbacks must be called in FIFO/LIFO order. To
         * achieve this we assign client_ids so they are sorted in
         * registration order.
         */
        client->client_id = highest_client_id;
        err = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
        if (err)
                return err;

        highest_client_id++;
        xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);

        return 0;
}

/*
 * Erase @client from the clients xarray and lower highest_client_id past any
 * trailing ids that are no longer populated.
 */
static void remove_client_id(struct ib_client *client)
{
        down_write(&clients_rwsem);
        xa_erase(&clients, client->client_id);

        /* Shrink until the slot just below highest_client_id is in use */
        while (highest_client_id &&
               !xa_load(&clients, highest_client_id - 1))
                highest_client_id--;

        up_write(&clients_rwsem);
}

/**
 * ib_register_client - Register an IB client
 * @client:Client to register
 *
 * Upper level users of the IB drivers can use ib_register_client() to
 * register callbacks for IB device addition and removal.  When an IB
 * device is added, each registered client's add method will be called
 * (in the order the clients were registered), and when a device is
 * removed, each client's remove method will be called (in the reverse
 * order that clients were registered).  In addition, when
 * ib_register_client() is called, the client will receive an add
 * callback for all devices already registered.
 *
 * Return: 0 on success or a negative errno. If adding a client context fails
 * after the client id was assigned, the client is unregistered again before
 * returning.
 */
int ib_register_client(struct ib_client *client)
{
        struct ib_device *device;
        unsigned long index;
        bool need_unreg = false;
        int ret;

        refcount_set(&client->uses, 1);
        init_completion(&client->uses_zero);

        /*
         * The devices_rwsem is held in write mode to ensure that a racing
         * ib_register_device() sees a consistent view of clients and devices.
         */
        down_write(&devices_rwsem);
        down_write(&clients_rwsem);
        ret = assign_client_id(client);
        if (ret)
                goto out;

        /* From here on a failure must be undone with ib_unregister_client() */
        need_unreg = true;
        xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
                ret = add_client_context(device, client);
                if (ret)
                        goto out;
        }
        ret = 0;
out:
        up_write(&clients_rwsem);
        up_write(&devices_rwsem);
        /* Both locks are released before unregistering */
        if (need_unreg && ret)
                ib_unregister_client(client);
        return ret;
}
EXPORT_SYMBOL(ib_register_client);

/**
 * ib_unregister_client - Unregister an IB client
 * @client:Client to unregister
 *
 * Upper level users use ib_unregister_client() to remove their client
 * registration.  When ib_unregister_client() is called, the client
 * will receive a remove callback for each IB device still registered.
 *
 * This is a full fence, once it returns no client callbacks will be called,
 * or are running in another thread.
 */
void ib_unregister_client(struct ib_client *client)
{
        struct ib_device *device;
        unsigned long index;

        down_write(&clients_rwsem);
        /* Drops the use set to 1 by refcount_set() in ib_register_client() */
        ib_client_put(client);
        xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
        up_write(&clients_rwsem);

        /* We do not want to have locks while calling client->remove() */
        rcu_read_lock();
        xa_for_each (&devices, index, device) {
                /* Skip devices on which a reference can no longer be taken */
                if (!ib_device_try_get(device))
                        continue;
                rcu_read_unlock();

                remove_client_context(device, client->client_id);

                ib_device_put(device);
                /* Re-enter the RCU read section before xa_for_each advances */
                rcu_read_lock();
        }
        rcu_read_unlock();

        /*
         * remove_client_context() is not a fence, it can return even though a
         * removal is ongoing. Wait until all removals are completed.
         */
        wait_for_completion(&client->uses_zero);
        remove_client_id(client);
}
EXPORT_SYMBOL(ib_unregister_client);

static int __ib_get_global_client_nl_info(const char *client_name,
                                          struct ib_client_nl_info *res)
{
        struct ib_client *client;
        unsigned long index;
        int ret = -ENOENT;

        down_read(&clients_rwsem);
        xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
                if (strcmp(client->name, client_name) != 0)
                        continue;
                if (!client->get_global_nl_info) {
                        ret = -EOPNOTSUPP;
                        break;
                }
                ret = client->get_global_nl_info(res);
                if (WARN_ON(ret == -ENOENT))
                        ret = -EINVAL;
                if (!ret && res->cdev)
                        get_device(res->cdev);
                break;
        }
        up_read(&clients_rwsem);
        return ret;
}

/*
 * Look up the client named @client_name among the clients that have
 * registered client data on @ibdev and ask it for its netlink info via the
 * get_nl_info op.
 *
 * Returns -ENOENT when no such client is found, -EOPNOTSUPP when the client
 * has no get_nl_info op, otherwise the op's result (with -ENOENT remapped to
 * -EINVAL after a warning). On success a reference is taken on res->cdev
 * when it is set.
 */
static int __ib_get_client_nl_info(struct ib_device *ibdev,
                                   const char *client_name,
                                   struct ib_client_nl_info *res)
{
        unsigned long index;
        void *client_data;
        int ret = -ENOENT;

        down_read(&ibdev->client_data_rwsem);
        xan_for_each_marked (&ibdev->client_data, index, client_data,
                             CLIENT_DATA_REGISTERED) {
                /* The client_data index is also used as the clients index */
                struct ib_client *client = xa_load(&clients, index);

                if (!client || strcmp(client->name, client_name) != 0)
                        continue;
                if (!client->get_nl_info) {
                        ret = -EOPNOTSUPP;
                        break;
                }
                ret = client->get_nl_info(ibdev, client_data, res);
                /* -ENOENT is this function's own "no such client" result */
                if (WARN_ON(ret == -ENOENT))
                        ret = -EINVAL;

                /*
                 * The cdev is guaranteed valid as long as we are inside the
                 * client_data_rwsem as remove_one can't be called. Keep it
                 * valid for the caller.
                 */
                if (!ret && res->cdev)
                        get_device(res->cdev);
                break;
        }
        up_read(&ibdev->client_data_rwsem);

        return ret;
}

/**
 * ib_get_client_nl_info - Fetch the nl_info from a client
 * @ibdev: IB device, or NULL to query the client's global nl_info
 * @client_name: Name of the client
 * @res: Result of the query
 *
 * When the client is not found and CONFIG_MODULES is enabled, a module load
 * for "rdma-client-<client_name>" is requested and the lookup is retried
 * once.
 *
 * Return: 0 on success with res->cdev set; -EOPNOTSUPP when the client is
 * still not found or does not implement the query; -EINVAL when the client
 * reported success without setting res->cdev; otherwise the client's error.
 */
int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
                          struct ib_client_nl_info *res)
{
        int ret;

        ret = ibdev ? __ib_get_client_nl_info(ibdev, client_name, res) :
                      __ib_get_global_client_nl_info(client_name, res);
#ifdef CONFIG_MODULES
        if (ret == -ENOENT) {
                request_module("rdma-client-%s", client_name);
                ret = ibdev ?
                        __ib_get_client_nl_info(ibdev, client_name, res) :
                        __ib_get_global_client_nl_info(client_name, res);
        }
#endif
        /* A missing client is reported to the caller as "not supported" */
        if (ret == -ENOENT)
                return -EOPNOTSUPP;
        if (ret)
                return ret;

        if (WARN_ON(!res->cdev))
                return -EINVAL;
        return 0;
}

/**
 * ib_set_client_data - Set IB client context
 * @device:Device to set context for
 * @client:Client to set context for
 * @data:Context to set
 *
 * Stores @data as the context of @client on @device; it can later be fetched
 * with ib_get_client_data(). This may only be called while the client is
 * registered to the device; once the ib_client remove() callback has
 * returned it must not be called.
 */
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
                        void *data)
{
        void *prev;

        /* An error pointer is never stored: warn and store NULL instead */
        if (WARN_ON(IS_ERR(data)))
                data = NULL;

        prev = xa_store(&device->client_data, client->client_id, data,
                        GFP_KERNEL);
        WARN_ON(xa_is_err(prev));
}
EXPORT_SYMBOL(ib_set_client_data);

/**
 * ib_register_event_handler - Register an IB event handler
 * @event_handler:Handler to register
 *
 * ib_register_event_handler() registers an event handler that will be
 * called back when asynchronous IB events occur (as defined in
 * chapter 11 of the InfiniBand Architecture Specification). This
 * callback occurs in workqueue context.
 */
void ib_register_event_handler(struct ib_event_handler *event_handler)
{
        down_write(&event_handler->device->event_handler_rwsem);
        list_add_tail(&event_handler->list,
                      &event_handler->device->event_handler_list);
        up_write(&event_handler->device->event_handler_rwsem);
}
EXPORT_SYMBOL(ib_register_event_handler);

/**
 * ib_unregister_event_handler - Unregister an event handler
 * @event_handler:Handler to unregister
 *
 * Unregister an event handler registered with
 * ib_register_event_handler().
 */
void ib_unregister_event_handler(struct ib_event_handler *event_handler)
{
        down_write(&event_handler->device->event_handler_rwsem);
        list_del(&event_handler->list);
        up_write(&event_handler->device->event_handler_rwsem);
}
EXPORT_SYMBOL(ib_unregister_event_handler);

void ib_dispatch_event_clients(struct ib_event *event)
{
        struct ib_event_handler *handler;

        down_read(&event->device->event_handler_rwsem);

        list_for_each_entry(handler, &event->device->event_handler_list, list)
                handler->handler(handler, event);

        up_read(&event->device->event_handler_rwsem);
}

/*
 * Port query used by ib_query_port() for iWARP ports: MTU and link state
 * are derived from the port's net_device before the driver's query_port()
 * callback is invoked on the same attribute structure.
 */
static int iw_query_port(struct ib_device *device,
                           u32 port_num,
                           struct ib_port_attr *port_attr)
{
        struct net_device *ndev;

        memset(port_attr, 0, sizeof(*port_attr));

        ndev = ib_device_get_netdev(device, port_num);
        if (!ndev)
                return -ENODEV;

        port_attr->max_mtu = IB_MTU_4096;
        port_attr->active_mtu = ib_mtu_int_to_enum(ndev->mtu);

        if (netif_carrier_ok(ndev)) {
                struct in_device *in_dev;
                bool has_addr;

                /* The port is ACTIVE only if the netdev has an address. */
                rcu_read_lock();
                in_dev = __in_dev_get_rcu(ndev);
                has_addr = in_dev && in_dev->ifa_list;
                rcu_read_unlock();

                if (has_addr) {
                        port_attr->state = IB_PORT_ACTIVE;
                        port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
                } else {
                        port_attr->state = IB_PORT_INIT;
                        port_attr->phys_state =
                                IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
                }
        } else {
                /* No carrier: report the port as down. */
                port_attr->state = IB_PORT_DOWN;
                port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
        }

        dev_put(ndev);
        return device->ops.query_port(device, port_num, port_attr);
}

/*
 * Port query used by ib_query_port() for non-iWARP ports: calls the driver
 * and, for InfiniBand link layer ports whose driver left subnet_prefix at
 * zero, fills it in from the cache.
 */
static int __ib_query_port(struct ib_device *device,
                           u32 port_num,
                           struct ib_port_attr *port_attr)
{
        int err;

        memset(port_attr, 0, sizeof(*port_attr));

        err = device->ops.query_port(device, port_num, port_attr);
        if (err)
                return err;

        /* The driver already supplied a prefix; nothing to add. */
        if (port_attr->subnet_prefix)
                return 0;

        if (rdma_port_get_link_layer(device, port_num) ==
            IB_LINK_LAYER_INFINIBAND)
                ib_get_cached_subnet_prefix(device, port_num,
                                            &port_attr->subnet_prefix);

        return 0;
}

/**
 * ib_query_port - Query IB port attributes
 * @device: Device to query
 * @port_num: Port number to query
 * @port_attr: Port attributes
 *
 * ib_query_port() returns the attributes of a port through the
 * @port_attr pointer. Returns -EINVAL if @port_num is not a valid port of
 * @device.
 */
int ib_query_port(struct ib_device *device,
                  u32 port_num,
                  struct ib_port_attr *port_attr)
{
        if (!rdma_is_port_valid(device, port_num))
                return -EINVAL;

        /* iWARP ports take their link state from the net_device. */
        if (rdma_protocol_iwarp(device, port_num))
                return iw_query_port(device, port_num, port_attr);

        return __ib_query_port(device, port_num, port_attr);
}
EXPORT_SYMBOL(ib_query_port);

/*
 * (Re)insert @pdata into ndev_hash, keyed by its current netdev pointer.
 *
 * If @pdata is already hashed it is removed first, and an RCU grace period
 * is waited out before it is added again. If pdata->netdev is NULL the
 * entry is left unhashed. May sleep (synchronize_rcu()).
 */
static void add_ndev_hash(struct ib_port_data *pdata)
{
        unsigned long flags;

        might_sleep();

        spin_lock_irqsave(&ndev_hash_lock, flags);
        if (hash_hashed(&pdata->ndev_hash_link)) {
                hash_del_rcu(&pdata->ndev_hash_link);
                /* Drop the spinlock: synchronize_rcu() below may sleep. */
                spin_unlock_irqrestore(&ndev_hash_lock, flags);
                /*
                 * We cannot do hash_add_rcu after a hash_del_rcu until the
                 * grace period
                 */
                synchronize_rcu();
                spin_lock_irqsave(&ndev_hash_lock, flags);
        }
        /* Only ports with a netdev attached are kept in the hash. */
        if (pdata->netdev)
                hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
                             (uintptr_t)pdata->netdev);
        spin_unlock_irqrestore(&ndev_hash_lock, flags);
}

/**
 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
 * @ib_dev: Device to modify
 * @ndev: net_device to affiliate, may be NULL
 * @port: IB port the net_device is connected to
 *
 * Drivers should use this to link the ib_device to a netdev so the netdev
 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
 * affiliated with any port.
 *
 * The caller must ensure that the given ndev is not unregistered or
 * unregistering, and that either the ib_device is unregistered or
 * ib_device_set_netdev() is called with NULL when the ndev sends a
 * NETDEV_UNREGISTER event.
 *
 * Return: 0 on success (including when @ndev is already the affiliated
 * netdev), -EINVAL if @port is not a valid port, or the error returned by
 * alloc_port_data().
 */
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
                         u32 port)
{
        enum rdma_nl_notify_event_type etype;
        struct net_device *old_ndev;
        struct ib_port_data *pdata;
        unsigned long flags;
        int ret;

        if (!rdma_is_port_valid(ib_dev, port))
                return -EINVAL;

        /*
         * Drivers wish to call this before ib_register_driver, so we have to
         * setup the port data early.
         */
        ret = alloc_port_data(ib_dev);
        if (ret)
                return ret;

        pdata = &ib_dev->port_data[port];
        spin_lock_irqsave(&pdata->netdev_lock, flags);
        old_ndev = rcu_dereference_protected(
                pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
        /* Nothing to do when the affiliation is unchanged. */
        if (old_ndev == ndev) {
                spin_unlock_irqrestore(&pdata->netdev_lock, flags);
                return 0;
        }

        /*
         * Publish the new pointer, then move the tracked reference from the
         * old netdev to the new one. GFP_ATOMIC because netdev_lock is held.
         */
        rcu_assign_pointer(pdata->netdev, ndev);
        netdev_put(old_ndev, &pdata->netdev_tracker);
        netdev_hold(ndev, &pdata->netdev_tracker, GFP_ATOMIC);
        spin_unlock_irqrestore(&pdata->netdev_lock, flags);

        /* Re-key the port in ndev_hash for the new netdev (may sleep). */
        add_ndev_hash(pdata);

        /* Make sure that the device is registered before we send events */
        if (xa_load(&devices, ib_dev->index) != ib_dev)
                return 0;

        etype = ndev ? RDMA_NETDEV_ATTACH_EVENT : RDMA_NETDEV_DETACH_EVENT;
        rdma_nl_notify_event(ib_dev, port, etype);

        return 0;
}
EXPORT_SYMBOL(ib_device_set_netdev);

/*
 * Drop the netdev affiliation of every port of @ib_dev: unhash the port
 * from ndev_hash, clear pdata->netdev and release the tracked netdev
 * reference. Does nothing if the port data was never allocated.
 */
static void free_netdevs(struct ib_device *ib_dev)
{
        unsigned long flags;
        u32 port;

        if (!ib_dev->port_data)
                return;

        rdma_for_each_port (ib_dev, port) {
                struct ib_port_data *pdata = &ib_dev->port_data[port];
                struct net_device *ndev;

                spin_lock_irqsave(&pdata->netdev_lock, flags);
                ndev = rcu_dereference_protected(
                        pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
                if (ndev) {
                        /* ndev_hash_lock nests inside netdev_lock here. */
                        spin_lock(&ndev_hash_lock);
                        hash_del_rcu(&pdata->ndev_hash_link);
                        spin_unlock(&ndev_hash_lock);

                        /*
                         * If this is the last dev_put there is still a
                         * synchronize_rcu before the netdev is kfreed, so we
                         * can continue to rely on unlocked pointer
                         * comparisons after the put
                         */
                        rcu_assign_pointer(pdata->netdev, NULL);
                        netdev_put(ndev, &pdata->netdev_tracker);
                }
                spin_unlock_irqrestore(&pdata->netdev_lock, flags);
        }
}

/*
 * Return the net_device affiliated with @port of @ib_dev, or NULL if the
 * port is invalid or the port data has not been allocated. When the driver
 * provides no get_netdev() callback, the stored netdev is returned after
 * dev_hold() has been called on it.
 */
struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
                                        u32 port)
{
        struct ib_port_data *pdata;
        struct net_device *ndev;

        if (!rdma_is_port_valid(ib_dev, port) || !ib_dev->port_data)
                return NULL;

        /*
         * New drivers should use ib_device_set_netdev() not the legacy
         * get_netdev().
         */
        if (ib_dev->ops.get_netdev)
                return ib_dev->ops.get_netdev(ib_dev, port);

        pdata = &ib_dev->port_data[port];

        spin_lock(&pdata->netdev_lock);
        ndev = rcu_dereference_protected(
                pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
        dev_hold(ndev);
        spin_unlock(&pdata->netdev_lock);

        return ndev;
}
EXPORT_SYMBOL(ib_device_get_netdev);

/**
 * ib_query_netdev_port - Query the port number of a net_device
 * associated with an ibdev
 * @ibdev: IB device
 * @ndev: Network device
 * @port: Set to the IB port the net_device is connected to
 *
 * Return: 0 if a port of @ibdev is affiliated with @ndev (with @port
 * written), -ENOENT otherwise.
 */
int ib_query_netdev_port(struct ib_device *ibdev, struct net_device *ndev,
                         u32 *port)
{
        u32 port_num;

        rdma_for_each_port(ibdev, port_num) {
                struct net_device *cur = ib_device_get_netdev(ibdev, port_num);
                bool match = (cur == ndev);

                /* The reference was only needed for the comparison. */
                dev_put(cur);
                if (!match)
                        continue;

                *port = port_num;
                return 0;
        }

        return -ENOENT;
}
EXPORT_SYMBOL(ib_query_netdev_port);

/**
 * ib_device_get_by_netdev - Find an IB device associated with a netdev
 * @ndev: netdev to locate
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device that is associated with a netdev via
 * ib_device_set_netdev(). The caller must call ib_device_put() on the
 * returned pointer. Returns NULL when no matching device could be held.
 */
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
                                          enum rdma_driver_id driver_id)
{
        struct ib_device *found = NULL;
        struct ib_port_data *cur;

        rcu_read_lock();
        hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
                                    (uintptr_t)ndev) {
                /* Same hash bucket, different netdev. */
                if (rcu_access_pointer(cur->netdev) != ndev)
                        continue;

                if (driver_id != RDMA_DRIVER_UNKNOWN &&
                    cur->ib_dev->ops.driver_id != driver_id)
                        continue;

                /* Skip devices on which a reference cannot be taken. */
                if (!ib_device_try_get(cur->ib_dev))
                        continue;

                found = cur->ib_dev;
                break;
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(ib_device_get_by_netdev);

/**
 * ib_enum_roce_netdev - enumerate all RoCE ports
 * @ib_dev: IB device we want to query
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Walks every RoCE port of @ib_dev, looks up the net_device affiliated with
 * it and calls @cb for each port on which @filter returns non zero.
 */
void ib_enum_roce_netdev(struct ib_device *ib_dev,
                         roce_netdev_filter filter,
                         void *filter_cookie,
                         roce_netdev_callback cb,
                         void *cookie)
{
        u32 port;

        rdma_for_each_port (ib_dev, port) {
                struct net_device *ndev;

                if (!rdma_protocol_roce(ib_dev, port))
                        continue;

                ndev = ib_device_get_netdev(ib_dev, port);
                if (filter(ib_dev, port, ndev, filter_cookie))
                        cb(ib_dev, port, ndev, cookie);
                dev_put(ndev);
        }
}

/**
 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Runs ib_enum_roce_netdev() on every registered device, with devices_rwsem
 * held for reading, so @cb is called on each RoCE port for which @filter
 * returns non zero.
 */
void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
                              void *filter_cookie,
                              roce_netdev_callback cb,
                              void *cookie)
{
        struct ib_device *ib_dev;
        unsigned long id;

        down_read(&devices_rwsem);
        xa_for_each_marked (&devices, id, ib_dev, DEVICE_REGISTERED) {
                ib_enum_roce_netdev(ib_dev, filter, filter_cookie, cb,
                                    cookie);
        }
        up_read(&devices_rwsem);
}

/*
 * ib_enum_all_devs - enumerate all ib_devices
 * @nldev_cb: Callback to call for each found ib_device
 * @skb: Buffer handed to the callback; its socket selects the net namespace
 * @cb: Netlink callback state handed to the callback
 *
 * Calls nldev_cb() on each registered ib_device that is accessible from the
 * net namespace of @skb's socket, passing a running index of the devices
 * visited so far. Stops at the first non-zero return value and returns it;
 * returns 0 otherwise.
 */
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
                     struct netlink_callback *cb)
{
        struct ib_device *ib_dev;
        unsigned long xa_index;
        unsigned int seen = 0;
        int ret = 0;

        down_read(&devices_rwsem);
        xa_for_each_marked (&devices, xa_index, ib_dev, DEVICE_REGISTERED) {
                if (!rdma_dev_access_netns(ib_dev, sock_net(skb->sk)))
                        continue;

                ret = nldev_cb(ib_dev, skb, cb, seen++);
                if (ret)
                        break;
        }
        up_read(&devices_rwsem);

        return ret;
}

/**
 * ib_query_pkey - Get P_Key table entry
 * @device: Device to query
 * @port_num: Port number to query
 * @index: P_Key table index to query
 * @pkey: Returned P_Key
 *
 * ib_query_pkey() fetches the specified P_Key table entry.
 *
 * Return: the driver's query_pkey() result, -EINVAL if @port_num is not a
 * valid port, or -EOPNOTSUPP if the driver has no query_pkey() callback.
 */
int ib_query_pkey(struct ib_device *device,
                  u32 port_num, u16 index, u16 *pkey)
{
        if (!rdma_is_port_valid(device, port_num))
                return -EINVAL;

        return device->ops.query_pkey ?
                device->ops.query_pkey(device, port_num, index, pkey) :
                -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_query_pkey);

/**
 * ib_modify_device - Change IB device attributes
 * @device: Device to modify
 * @device_modify_mask: Mask of attributes to change
 * @device_modify: New attribute values
 *
 * ib_modify_device() changes a device's attributes as specified by
 * the @device_modify_mask and @device_modify structure.
 *
 * Return: the driver's modify_device() result, or -EOPNOTSUPP if the driver
 * has no modify_device() callback.
 */
int ib_modify_device(struct ib_device *device,
                     int device_modify_mask,
                     struct ib_device_modify *device_modify)
{
        return device->ops.modify_device ?
                device->ops.modify_device(device, device_modify_mask,
                                          device_modify) :
                -EOPNOTSUPP;
}
EXPORT_SYMBOL(ib_modify_device);

/**
 * ib_modify_port - Modifies the attributes for the specified port.
 * @device: The device to modify.
 * @port_num: The number of the port to modify.
 * @port_modify_mask: Mask used to specify which attributes of the port
 *   to change.
 * @port_modify: New attribute values for the port.
 *
 * ib_modify_port() changes a port's attributes as specified by the
 * @port_modify_mask and @port_modify structure.
 *
 * Return: -EINVAL for an invalid port; otherwise the driver's modify_port()
 * result when that callback exists. Without the callback, 0 is returned for
 * a RoCE port when either capability mask has no bit set other than
 * IB_PORT_CM_SUP, and -EOPNOTSUPP in every other case.
 */
int ib_modify_port(struct ib_device *device,
                   u32 port_num, int port_modify_mask,
                   struct ib_port_modify *port_modify)
{
        if (!rdma_is_port_valid(device, port_num))
                return -EINVAL;

        if (device->ops.modify_port)
                return device->ops.modify_port(device, port_num,
                                               port_modify_mask,
                                               port_modify);

        if (!rdma_protocol_roce(device, port_num))
                return -EOPNOTSUPP;

        /* Both masks touch something other than IB_PORT_CM_SUP. */
        if ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) != 0 &&
            (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) != 0)
                return -EOPNOTSUPP;

        return 0;
}
EXPORT_SYMBOL(ib_modify_port);

/**
 * ib_find_gid - Returns the port number and GID table index where
 *   a specified GID value occurs. It searches only IB link layer ports.
 * @device: The device to query.
 * @gid: The GID value to search for.
 * @port_num: The port number of the device where the GID value was found.
 * @index: The index into the GID table where the GID was found.  This
 *   parameter may be NULL.
 *
 * Return: 0 when the GID was found, -ENOENT otherwise.
 */
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
                u32 *port_num, u16 *index)
{
        union ib_gid entry;
        u32 port;
        int i;

        rdma_for_each_port (device, port) {
                if (!rdma_protocol_ib(device, port))
                        continue;

                for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
                     ++i) {
                        /* Entries that cannot be read are skipped. */
                        if (rdma_query_gid(device, port, i, &entry))
                                continue;

                        if (memcmp(&entry, gid, sizeof *gid) != 0)
                                continue;

                        *port_num = port;
                        if (index)
                                *index = i;
                        return 0;
                }
        }

        return -ENOENT;
}
EXPORT_SYMBOL(ib_find_gid);

/**
 * ib_find_pkey - Returns the PKey table index where a specified
 *   PKey value occurs.
 * @device: The device to query.
 * @port_num: The port number of the device to search for the PKey.
 * @pkey: The PKey value to search for.
 * @index: The index into the PKey table where the PKey was found.
 *
 * Only the low 15 bits of the PKey are compared. An entry with bit 15 set
 * (full member) is returned as soon as it is found; otherwise the first
 * matching entry without that bit is used.
 *
 * Return: 0 on a match, -ENOENT when nothing matches, or the error returned
 * by ib_query_pkey().
 */
int ib_find_pkey(struct ib_device *device,
                 u32 port_num, u16 pkey, u16 *index)
{
        int limited_idx = -1;
        u16 entry;
        int err, i;

        for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
             ++i) {
                err = ib_query_pkey(device, port_num, i, &entry);
                if (err)
                        return err;

                if ((pkey & 0x7fff) != (entry & 0x7fff))
                        continue;

                /* A full-member entry wins immediately. */
                if (entry & 0x8000) {
                        *index = i;
                        return 0;
                }

                /* Remember only the first limited-member match. */
                if (limited_idx < 0)
                        limited_idx = i;
        }

        /* No full-member entry: fall back to the limited one, if any. */
        if (limited_idx < 0)
                return -ENOENT;

        *index = limited_idx;
        return 0;
}
EXPORT_SYMBOL(ib_find_pkey);

/**
 * ib_get_net_dev_by_params() - Return the appropriate net_dev
 * for a received CM request
 * @dev:        An RDMA device on which the request has been received.
 * @port:        Port number on the RDMA device.
 * @pkey:        The Pkey the request came on.
 * @gid:        A GID that the net_dev uses to communicate.
 * @addr:        Contains the IP address that the request specified as its
 *                destination.
 *
 * Asks each registered client of @dev that implements
 * get_net_dev_by_params() in turn and returns the first non-NULL answer.
 * Returns NULL if @port is not an IB protocol port or no client answers.
 */
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
                                            u32 port,
                                            u16 pkey,
                                            const union ib_gid *gid,
                                            const struct sockaddr *addr)
{
        struct net_device *found = NULL;
        unsigned long id;
        void *data;

        if (!rdma_protocol_ib(dev, port))
                return NULL;

        /*
         * Holding the read side guarantees that the client will not become
         * unregistered while we are calling get_net_dev_by_params()
         */
        down_read(&dev->client_data_rwsem);
        xan_for_each_marked (&dev->client_data, id, data,
                             CLIENT_DATA_REGISTERED) {
                struct ib_client *client = xa_load(&clients, id);

                if (client && client->get_net_dev_by_params) {
                        found = client->get_net_dev_by_params(dev, port, pkey,
                                                              gid, addr, data);
                        if (found)
                                break;
                }
        }
        up_read(&dev->client_data_rwsem);

        return found;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);

/*
 * ib_set_device_ops - Merge a driver-supplied ops table into a device
 * @dev: Device whose ops table is updated
 * @ops: Table of callbacks and object sizes to merge in
 *
 * Each callback or size that is set in @ops is copied into @dev->ops only
 * if the device does not already have a value for it, so earlier settings
 * are never overwritten. driver_id and owner are overwritten when set in
 * @ops, with a warning if they conflict with a value already present;
 * uverbs_abi_ver is overwritten when non-zero, and
 * uverbs_no_driver_id_binding is OR-ed in.
 */
void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
{
        struct ib_device_ops *dev_ops = &dev->ops;
/* Copy ops->name into ptr->name unless ptr->name is already set. */
#define SET_DEVICE_OP(ptr, name)                                               \
        do {                                                                   \
                if (ops->name)                                                 \
                        if (!((ptr)->name))                                       \
                                (ptr)->name = ops->name;                       \
        } while (0)

/* Same as SET_DEVICE_OP() for the size_<name> members. */
#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)

        if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
                /* Warn if a different, known driver_id was already set. */
                WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
                        dev_ops->driver_id != ops->driver_id);
                dev_ops->driver_id = ops->driver_id;
        }
        if (ops->owner) {
                /* Warn if a different owner was already set. */
                WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
                dev_ops->owner = ops->owner;
        }
        if (ops->uverbs_abi_ver)
                dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;

        dev_ops->uverbs_no_driver_id_binding |=
                ops->uverbs_no_driver_id_binding;

        /* Callbacks, copied only where the device has none yet. */
        SET_DEVICE_OP(dev_ops, add_gid);
        SET_DEVICE_OP(dev_ops, add_sub_dev);
        SET_DEVICE_OP(dev_ops, advise_mr);
        SET_DEVICE_OP(dev_ops, alloc_dm);
        SET_DEVICE_OP(dev_ops, alloc_hw_device_stats);
        SET_DEVICE_OP(dev_ops, alloc_hw_port_stats);
        SET_DEVICE_OP(dev_ops, alloc_mr);
        SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
        SET_DEVICE_OP(dev_ops, alloc_mw);
        SET_DEVICE_OP(dev_ops, alloc_pd);
        SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
        SET_DEVICE_OP(dev_ops, alloc_ucontext);
        SET_DEVICE_OP(dev_ops, alloc_xrcd);
        SET_DEVICE_OP(dev_ops, attach_mcast);
        SET_DEVICE_OP(dev_ops, check_mr_status);
        SET_DEVICE_OP(dev_ops, counter_alloc_stats);
        SET_DEVICE_OP(dev_ops, counter_bind_qp);
        SET_DEVICE_OP(dev_ops, counter_dealloc);
        SET_DEVICE_OP(dev_ops, counter_init);
        SET_DEVICE_OP(dev_ops, counter_unbind_qp);
        SET_DEVICE_OP(dev_ops, counter_update_stats);
        SET_DEVICE_OP(dev_ops, create_ah);
        SET_DEVICE_OP(dev_ops, create_counters);
        SET_DEVICE_OP(dev_ops, create_cq);
        SET_DEVICE_OP(dev_ops, create_flow);
        SET_DEVICE_OP(dev_ops, create_qp);
        SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
        SET_DEVICE_OP(dev_ops, create_srq);
        SET_DEVICE_OP(dev_ops, create_user_ah);
        SET_DEVICE_OP(dev_ops, create_wq);
        SET_DEVICE_OP(dev_ops, dealloc_dm);
        SET_DEVICE_OP(dev_ops, dealloc_driver);
        SET_DEVICE_OP(dev_ops, dealloc_mw);
        SET_DEVICE_OP(dev_ops, dealloc_pd);
        SET_DEVICE_OP(dev_ops, dealloc_ucontext);
        SET_DEVICE_OP(dev_ops, dealloc_xrcd);
        SET_DEVICE_OP(dev_ops, del_gid);
        SET_DEVICE_OP(dev_ops, del_sub_dev);
        SET_DEVICE_OP(dev_ops, dereg_mr);
        SET_DEVICE_OP(dev_ops, destroy_ah);
        SET_DEVICE_OP(dev_ops, destroy_counters);
        SET_DEVICE_OP(dev_ops, destroy_cq);
        SET_DEVICE_OP(dev_ops, destroy_flow);
        SET_DEVICE_OP(dev_ops, destroy_flow_action);
        SET_DEVICE_OP(dev_ops, destroy_qp);
        SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
        SET_DEVICE_OP(dev_ops, destroy_srq);
        SET_DEVICE_OP(dev_ops, destroy_wq);
        SET_DEVICE_OP(dev_ops, device_group);
        SET_DEVICE_OP(dev_ops, detach_mcast);
        SET_DEVICE_OP(dev_ops, disassociate_ucontext);
        SET_DEVICE_OP(dev_ops, drain_rq);
        SET_DEVICE_OP(dev_ops, drain_sq);
        SET_DEVICE_OP(dev_ops, enable_driver);
        SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry);
        SET_DEVICE_OP(dev_ops, fill_res_cq_entry);
        SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw);
        SET_DEVICE_OP(dev_ops, fill_res_mr_entry);
        SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw);
        SET_DEVICE_OP(dev_ops, fill_res_qp_entry);
        SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw);
        SET_DEVICE_OP(dev_ops, fill_res_srq_entry);
        SET_DEVICE_OP(dev_ops, fill_res_srq_entry_raw);
        SET_DEVICE_OP(dev_ops, fill_stat_mr_entry);
        SET_DEVICE_OP(dev_ops, get_dev_fw_str);
        SET_DEVICE_OP(dev_ops, get_dma_mr);
        SET_DEVICE_OP(dev_ops, get_hw_stats);
        SET_DEVICE_OP(dev_ops, get_link_layer);
        SET_DEVICE_OP(dev_ops, get_netdev);
        SET_DEVICE_OP(dev_ops, get_numa_node);
        SET_DEVICE_OP(dev_ops, get_port_immutable);
        SET_DEVICE_OP(dev_ops, get_vector_affinity);
        SET_DEVICE_OP(dev_ops, get_vf_config);
        SET_DEVICE_OP(dev_ops, get_vf_guid);
        SET_DEVICE_OP(dev_ops, get_vf_stats);
        SET_DEVICE_OP(dev_ops, iw_accept);
        SET_DEVICE_OP(dev_ops, iw_add_ref);
        SET_DEVICE_OP(dev_ops, iw_connect);
        SET_DEVICE_OP(dev_ops, iw_create_listen);
        SET_DEVICE_OP(dev_ops, iw_destroy_listen);
        SET_DEVICE_OP(dev_ops, iw_get_qp);
        SET_DEVICE_OP(dev_ops, iw_reject);
        SET_DEVICE_OP(dev_ops, iw_rem_ref);
        SET_DEVICE_OP(dev_ops, map_mr_sg);
        SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
        SET_DEVICE_OP(dev_ops, mmap);
        SET_DEVICE_OP(dev_ops, mmap_free);
        SET_DEVICE_OP(dev_ops, modify_ah);
        SET_DEVICE_OP(dev_ops, modify_cq);
        SET_DEVICE_OP(dev_ops, modify_device);
        SET_DEVICE_OP(dev_ops, modify_hw_stat);
        SET_DEVICE_OP(dev_ops, modify_port);
        SET_DEVICE_OP(dev_ops, modify_qp);
        SET_DEVICE_OP(dev_ops, modify_srq);
        SET_DEVICE_OP(dev_ops, modify_wq);
        SET_DEVICE_OP(dev_ops, peek_cq);
        SET_DEVICE_OP(dev_ops, poll_cq);
        SET_DEVICE_OP(dev_ops, port_groups);
        SET_DEVICE_OP(dev_ops, post_recv);
        SET_DEVICE_OP(dev_ops, post_send);
        SET_DEVICE_OP(dev_ops, post_srq_recv);
        SET_DEVICE_OP(dev_ops, process_mad);
        SET_DEVICE_OP(dev_ops, query_ah);
        SET_DEVICE_OP(dev_ops, query_device);
        SET_DEVICE_OP(dev_ops, query_gid);
        SET_DEVICE_OP(dev_ops, query_pkey);
        SET_DEVICE_OP(dev_ops, query_port);
        SET_DEVICE_OP(dev_ops, query_qp);
        SET_DEVICE_OP(dev_ops, query_srq);
        SET_DEVICE_OP(dev_ops, query_ucontext);
        SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
        SET_DEVICE_OP(dev_ops, read_counters);
        SET_DEVICE_OP(dev_ops, reg_dm_mr);
        SET_DEVICE_OP(dev_ops, reg_user_mr);
        SET_DEVICE_OP(dev_ops, reg_user_mr_dmabuf);
        SET_DEVICE_OP(dev_ops, req_notify_cq);
        SET_DEVICE_OP(dev_ops, rereg_user_mr);
        SET_DEVICE_OP(dev_ops, resize_cq);
        SET_DEVICE_OP(dev_ops, set_vf_guid);
        SET_DEVICE_OP(dev_ops, set_vf_link_state);
        SET_DEVICE_OP(dev_ops, ufile_hw_cleanup);
        SET_DEVICE_OP(dev_ops, report_port_event);

        /* Object sizes (size_<name> members), same copy-if-unset rule. */
        SET_OBJ_SIZE(dev_ops, ib_ah);
        SET_OBJ_SIZE(dev_ops, ib_counters);
        SET_OBJ_SIZE(dev_ops, ib_cq);
        SET_OBJ_SIZE(dev_ops, ib_mw);
        SET_OBJ_SIZE(dev_ops, ib_pd);
        SET_OBJ_SIZE(dev_ops, ib_qp);
        SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
        SET_OBJ_SIZE(dev_ops, ib_srq);
        SET_OBJ_SIZE(dev_ops, ib_ucontext);
        SET_OBJ_SIZE(dev_ops, ib_xrcd);
        SET_OBJ_SIZE(dev_ops, rdma_counter);
}
EXPORT_SYMBOL(ib_set_device_ops);

/**
 * ib_add_sub_device - Create a sub device under a parent IB device
 * @parent: Device that will own the new sub device
 * @type: Type recorded on the new sub device
 * @name: Name handed to the driver's add_sub_dev() callback
 *
 * Asks the parent's driver to create a sub device, records @type and
 * @parent on it and links it onto the parent's sub device list. The
 * reference taken on @parent is kept on success; it is dropped again by
 * ib_del_sub_device_and_put().
 *
 * Return: 0 on success, -EOPNOTSUPP if the driver lacks add_sub_dev() or
 * del_sub_dev(), -EINVAL if no reference on @parent could be taken, or the
 * error encoded in the pointer returned by add_sub_dev().
 */
int ib_add_sub_device(struct ib_device *parent,
                      enum rdma_nl_dev_type type,
                      const char *name)
{
        struct ib_device *sub;

        if (!parent->ops.add_sub_dev || !parent->ops.del_sub_dev)
                return -EOPNOTSUPP;

        if (!ib_device_try_get(parent))
                return -EINVAL;

        sub = parent->ops.add_sub_dev(parent, type, name);
        if (IS_ERR(sub)) {
                ib_device_put(parent);
                return PTR_ERR(sub);
        }

        sub->type = type;
        sub->parent = parent;

        /*
         * list_add_tail(new, head): the sub device's node is the new entry
         * and the parent's subdev_list_head is the list it joins. With the
         * arguments the other way round, adding a second sub device would
         * unlink the first one from the parent's list.
         */
        mutex_lock(&parent->subdev_lock);
        list_add_tail(&sub->subdev_list, &parent->subdev_list_head);
        mutex_unlock(&parent->subdev_lock);

        return 0;
}
EXPORT_SYMBOL(ib_add_sub_device);

/*
 * ib_del_sub_device_and_put - Remove a sub device and drop a reference on it
 * @sub: Sub device to remove
 *
 * Unlinks @sub from its parent's sub device list, drops one reference on
 * @sub, asks the parent's driver to delete it and finally drops one
 * reference on the parent.
 *
 * Return: 0 on success, -EOPNOTSUPP if @sub has no parent.
 */
int ib_del_sub_device_and_put(struct ib_device *sub)
{
        struct ib_device *parent = sub->parent;

        if (!parent)
                return -EOPNOTSUPP;

        mutex_lock(&parent->subdev_lock);
        list_del(&sub->subdev_list);
        mutex_unlock(&parent->subdev_lock);

        /* The reference on @sub is dropped before the driver deletes it. */
        ib_device_put(sub);
        parent->ops.del_sub_dev(sub);
        ib_device_put(parent);

        return 0;
}
EXPORT_SYMBOL(ib_del_sub_device_and_put);

#ifdef CONFIG_INFINIBAND_VIRT_DMA
/*
 * Fill in the DMA fields of the first @nents entries of @sg without any
 * real mapping: the DMA address is the entry's kernel virtual address and
 * the DMA length is the entry's length. @dev is not used. Returns @nents.
 */
int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
{
        struct scatterlist *entry;
        int idx;

        for_each_sg(sg, entry, nents, idx) {
                sg_dma_len(entry) = entry->length;
                sg_dma_address(entry) = (uintptr_t)sg_virt(entry);
        }

        return nents;
}
EXPORT_SYMBOL(ib_dma_virt_map_sg);
#endif /* CONFIG_INFINIBAND_VIRT_DMA */

/*
 * Netlink callback table, indexed by RDMA_NL_LS_OP_* and registered for
 * RDMA_NL_LS in ib_core_init(). Every entry carries RDMA_NL_ADMIN_PERM.
 */
static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
        [RDMA_NL_LS_OP_RESOLVE] = {
                .doit = ib_nl_handle_resolve_resp,
                .flags = RDMA_NL_ADMIN_PERM,
        },
        [RDMA_NL_LS_OP_SET_TIMEOUT] = {
                .doit = ib_nl_handle_set_timeout,
                .flags = RDMA_NL_ADMIN_PERM,
        },
        [RDMA_NL_LS_OP_IP_RESOLVE] = {
                .doit = ib_nl_handle_ip_res_resp,
                .flags = RDMA_NL_ADMIN_PERM,
        },
};

void ib_dispatch_port_state_event(struct ib_device *ibdev, struct net_device *ndev)
{
        enum ib_port_state curr_state;
        struct ib_event ibevent = {};
        u32 port;

        if (ib_query_netdev_port(ibdev, ndev, &port))
                return;

        curr_state = ib_get_curr_port_state(ndev);

        write_lock_irq(&ibdev->cache_lock);
        if (ibdev->port_data[port].cache.last_port_state == curr_state) {
                write_unlock_irq(&ibdev->cache_lock);
                return;
        }
        ibdev->port_data[port].cache.last_port_state = curr_state;
        write_unlock_irq(&ibdev->cache_lock);

        ibevent.event = (curr_state == IB_PORT_DOWN) ?
                                        IB_EVENT_PORT_ERR : IB_EVENT_PORT_ACTIVE;
        ibevent.device = ibdev;
        ibevent.element.port_num = port;
        ib_dispatch_event(&ibevent);
}
EXPORT_SYMBOL(ib_dispatch_port_state_event);

/*
 * Handle a link event of @ndev: find the IB device affiliated with it and
 * either hand the event to the driver's report_port_event() callback or,
 * when the driver has none, dispatch a generic port state event.
 */
static void handle_port_event(struct net_device *ndev, unsigned long event)
{
        struct ib_device *ibdev;

        /* Currently, link events in bonding scenarios are still
         * reported by drivers that support bonding.
         */
        if (netif_is_lag_master(ndev) || netif_is_lag_port(ndev))
                return;

        ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
        if (!ibdev)
                return;

        if (ibdev->ops.report_port_event)
                ibdev->ops.report_port_event(ibdev, ndev, event);
        else
                ib_dispatch_port_state_event(ibdev, ndev);

        /* Drop the reference taken by ib_device_get_by_netdev(). */
        ib_device_put(ibdev);
}

/*
 * Netdevice notifier callback. A rename of a netdev affiliated with an IB
 * device port is forwarded as RDMA_NETDEV_RENAME_EVENT; UP/CHANGE/DOWN
 * events go to handle_port_event(). Always returns NOTIFY_DONE.
 */
static int ib_netdevice_event(struct notifier_block *this,
                              unsigned long event, void *ptr)
{
        struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
        struct ib_device *ibdev;
        u32 port;

        switch (event) {
        case NETDEV_CHANGENAME:
                ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
                if (!ibdev)
                        break;

                /* Notify only if the netdev maps to a port of the device. */
                if (!ib_query_netdev_port(ibdev, ndev, &port))
                        rdma_nl_notify_event(ibdev, port,
                                             RDMA_NETDEV_RENAME_EVENT);

                ib_device_put(ibdev);
                break;

        case NETDEV_UP:
        case NETDEV_CHANGE:
        case NETDEV_DOWN:
                handle_port_event(ndev, event);
                break;

        default:
                break;
        }

        return NOTIFY_DONE;
}

/*
 * Notifier block that routes netdevice events to ib_netdevice_event().
 * NOTE(review): its registration is outside this part of the file.
 */
static struct notifier_block nb_netdevice = {
        .notifier_call = ib_netdevice_event,
};

/*
 * Initialize the IB core: workqueues, the device class, netlink, address
 * resolution, MAD, SA, the LSM notifier, per-net compat devices, RoCE GID
 * management and finally the netdevice notifier.
 *
 * Returns 0 on success or a negative errno.  On failure everything that
 * was set up before the failing step is torn down again through the error
 * labels below, in reverse order of initialization.
 */
static int __init ib_core_init(void)
{
        int ret = -ENOMEM;

        ib_wq = alloc_workqueue("infiniband", 0, 0);
        if (!ib_wq)
                return -ENOMEM;

        ib_unreg_wq = alloc_workqueue("ib-unreg-wq", WQ_UNBOUND,
                                      WQ_UNBOUND_MAX_ACTIVE);
        if (!ib_unreg_wq)
                goto err;

        ib_comp_wq = alloc_workqueue("ib-comp-wq",
                        WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
        if (!ib_comp_wq)
                goto err_unbound;

        ib_comp_unbound_wq =
                alloc_workqueue("ib-comp-unb-wq",
                                WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
                                WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
        if (!ib_comp_unbound_wq)
                goto err_comp;

        ret = class_register(&ib_class);
        if (ret) {
                pr_warn("Couldn't create InfiniBand device class\n");
                goto err_comp_unbound;
        }

        rdma_nl_init();

        ret = addr_init();
        if (ret) {
                pr_warn("Couldn't init IB address resolution\n");
                goto err_ibnl;
        }

        ret = ib_mad_init();
        if (ret) {
                pr_warn("Couldn't init IB MAD\n");
                goto err_addr;
        }

        ret = ib_sa_init();
        if (ret) {
                pr_warn("Couldn't init SA\n");
                goto err_mad;
        }

        ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
        if (ret) {
                pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
                goto err_sa;
        }

        ret = register_pernet_device(&rdma_dev_net_ops);
        if (ret) {
                pr_warn("Couldn't init compat dev. ret %d\n", ret);
                goto err_compat;
        }

        nldev_init();
        rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
        ret = roce_gid_mgmt_init();
        if (ret) {
                pr_warn("Couldn't init RoCE GID management\n");
                goto err_parent;
        }

        /*
         * Do not report success when the notifier could not be installed:
         * port state and rename events would silently never be delivered.
         */
        ret = register_netdevice_notifier(&nb_netdevice);
        if (ret) {
                pr_warn("Couldn't register netdevice notifier. ret %d\n", ret);
                goto err_gid;
        }

        return 0;

err_gid:
        roce_gid_mgmt_cleanup();
err_parent:
        rdma_nl_unregister(RDMA_NL_LS);
        nldev_exit();
        unregister_pernet_device(&rdma_dev_net_ops);
err_compat:
        unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
err_sa:
        ib_sa_cleanup();
err_mad:
        ib_mad_cleanup();
err_addr:
        addr_cleanup();
err_ibnl:
        class_unregister(&ib_class);
err_comp_unbound:
        destroy_workqueue(ib_comp_unbound_wq);
err_comp:
        destroy_workqueue(ib_comp_wq);
err_unbound:
        destroy_workqueue(ib_unreg_wq);
err:
        destroy_workqueue(ib_wq);
        return ret;
}

/*
 * Module exit: undo ib_core_init().  The teardown largely mirrors the
 * initialization sequence in reverse, so the order of the calls below
 * matters and should not be rearranged casually.
 */
static void __exit ib_core_cleanup(void)
{
        /* Stop receiving netdev events before tearing anything else down. */
        unregister_netdevice_notifier(&nb_netdevice);
        roce_gid_mgmt_cleanup();
        rdma_nl_unregister(RDMA_NL_LS);
        nldev_exit();
        unregister_pernet_device(&rdma_dev_net_ops);
        unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
        ib_sa_cleanup();
        ib_mad_cleanup();
        addr_cleanup();
        rdma_nl_exit();
        class_unregister(&ib_class);
        destroy_workqueue(ib_comp_unbound_wq);
        destroy_workqueue(ib_comp_wq);
        /* Make sure that any pending umem accounting work is done. */
        destroy_workqueue(ib_wq);
        destroy_workqueue(ib_unreg_wq);
        /* Complain if any client or device is still registered at unload. */
        WARN_ON(!xa_empty(&clients));
        WARN_ON(!xa_empty(&devices));
}

/*
 * NOTE(review): presumably publishes a module alias for the RDMA_NL_LS
 * netlink client - confirm against the MODULE_ALIAS_RDMA_NETLINK
 * definition.
 */
MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);

/* ib core relies on netdev stack to first register net_ns_type_operations
 * ns kobject type before ib_core initialization.
 */
fs_initcall(ib_core_init);
module_exit(ib_core_cleanup);



















  320 


  319 



  320 


















  319 



























  302 








  302 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_MMAN_H__
#define __ASM_MMAN_H__

#include <uapi/asm/mman.h>

#ifndef BUILD_VDSO
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/types.h>

/*
 * Translate the arm64-specific parts of an mmap/mprotect request into VM
 * flag bits: PROT_BTI and PROT_MTE map to VM_ARM64_BTI and VM_MTE when the
 * system supports the feature, and (with CONFIG_ARCH_HAS_PKEYS and POE
 * support) the low three bits of @pkey map to VM_PKEY_BIT0..2.
 */
static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
        unsigned long pkey)
{
        unsigned long vm_bits = 0;

        if (system_supports_bti() && (prot & PROT_BTI))
                vm_bits |= VM_ARM64_BTI;

        if (system_supports_mte() && (prot & PROT_MTE))
                vm_bits |= VM_MTE;

#ifdef CONFIG_ARCH_HAS_PKEYS
        if (system_supports_poe()) {
                if (pkey & BIT(0))
                        vm_bits |= VM_PKEY_BIT0;
                if (pkey & BIT(1))
                        vm_bits |= VM_PKEY_BIT1;
                if (pkey & BIT(2))
                        vm_bits |= VM_PKEY_BIT2;
        }
#endif

        return vm_bits;
}
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)

/*
 * Decide whether a new mapping may later be given VM_MTE.
 *
 * Returns VM_MTE_ALLOWED for MAP_ANONYMOUS or MAP_HUGETLB requests and for
 * shmem or hugetlbfs backed files, provided the system supports MTE;
 * returns 0 in every other case.
 */
static inline unsigned long arch_calc_vm_flag_bits(struct file *file,
                                                   unsigned long flags)
{
        if (!system_supports_mte())
                return 0;

        /*
         * MTE is restricted to mappings guaranteed to be backed by
         * tags-capable memory. The vm_flags may be overridden by a
         * filesystem supporting MTE (RAM-based).
         */
        if (flags & (MAP_ANONYMOUS | MAP_HUGETLB))
                return VM_MTE_ALLOWED;

        if (shmem_file(file) || is_file_hugepages(file))
                return VM_MTE_ALLOWED;

        return 0;
}
#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags)

/*
 * Check that @prot contains no protection bits beyond the generic ones
 * (read/write/exec/sem) plus PROT_BTI and PROT_MTE, the latter two only
 * when the system supports the respective feature.  @addr is unused.
 */
static inline bool arch_validate_prot(unsigned long prot,
        unsigned long addr __always_unused)
{
        unsigned long allowed = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;

        if (system_supports_bti())
                allowed |= PROT_BTI;

        if (system_supports_mte())
                allowed |= PROT_MTE;

        /* Any bit outside the allowed mask makes the request invalid. */
        return !(prot & ~allowed);
}
#define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)

/*
 * Validate the final vm_flags of a mapping.
 *
 * Rejects VM_MTE without VM_MTE_ALLOWED (on MTE systems) and executable
 * shadow stacks (on GCS systems); a shared shadow stack only triggers a
 * warning.  Returns true when the flags are acceptable.
 */
static inline bool arch_validate_flags(unsigned long vm_flags)
{
        /* VM_MTE is only valid if VM_MTE_ALLOWED has been set previously */
        if (system_supports_mte() &&
            (vm_flags & VM_MTE) && !(vm_flags & VM_MTE_ALLOWED))
                return false;

        if (!system_supports_gcs() || !(vm_flags & VM_SHADOW_STACK))
                return true;

        /* An executable GCS isn't a good idea. */
        if (vm_flags & VM_EXEC)
                return false;

        /* The memory management core should prevent this */
        VM_WARN_ON(vm_flags & VM_SHARED);

        return true;
}
#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)

#endif /* !BUILD_VDSO */

#endif /* ! __ASM_MMAN_H__ */



























































































































































   46 
   46 


   46 







   46 



   46 



   46 



































































































































































   23 









   23 
























































































































   23 







   23 

   23 






























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/truncate.c - code for taking down pages from address_spaces
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 10Sep2002        Andrew Morton
 *                Initial version.
 */

#include <linux/kernel.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include "internal.h"

/*
 * Erase every value (shadow) entry of @mapping in the index range
 * [@start, @max].  Does nothing for shmem and DAX mappings.  Afterwards the
 * inode is put on the LRU if the mapping has become shrinkable.
 */
static void clear_shadow_entries(struct address_space *mapping,
                                 unsigned long start, unsigned long max)
{
        XA_STATE(xas, &mapping->i_pages, start);
        void *entry;

        /* Handled by shmem itself, or for DAX we do nothing. */
        if (shmem_mapping(mapping) || dax_mapping(mapping))
                return;

        xas_set_update(&xas, workingset_update_node);

        spin_lock(&mapping->host->i_lock);
        xas_lock_irq(&xas);

        /* Walk the range and drop only the value entries, leaving folios. */
        xas_for_each(&xas, entry, max) {
                if (!xa_is_value(entry))
                        continue;
                xas_store(&xas, NULL);
        }

        xas_unlock_irq(&xas);

        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);

        spin_unlock(&mapping->host->i_lock);
}

/*
 * Unconditionally remove exceptional (value) entries. Usually called from
 * the truncate path.
 *
 * @mapping: address space the batch was looked up in
 * @fbatch:  batch of folios and value entries; unless the function returns
 *           early, the value entries are stripped from it via
 *           folio_batch_remove_exceptionals()
 * @indices: page cache index of each batch slot, in ascending order as
 *           guaranteed by either find_get_entries() or find_lock_entries()
 *
 * shmem mappings are left untouched (shmem handles its own entries), and a
 * batch without any value entry is returned unchanged.
 */
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
                                struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, indices[0]);
        int nr = folio_batch_count(fbatch);
        struct folio *folio;
        int i, j;

        /* Handled by shmem itself */
        if (shmem_mapping(mapping))
                return;

        /* Find the first value entry in the batch, if there is one. */
        for (j = 0; j < nr; j++)
                if (xa_is_value(fbatch->folios[j]))
                        break;

        /* No value entries at all: nothing to remove. */
        if (j == nr)
                return;

        if (dax_mapping(mapping)) {
                for (i = j; i < nr; i++) {
                        if (xa_is_value(fbatch->folios[i])) {
                                /*
                                 * File systems should already have called
                                 * dax_break_layout_entry() to remove all DAX
                                 * entries while holding a lock to prevent
                                 * establishing new entries. Therefore we
                                 * shouldn't find any here.
                                 */
                                WARN_ON_ONCE(1);

                                /*
                                 * Delete the mapping so truncate_pagecache()
                                 * doesn't loop forever.
                                 */
                                dax_delete_mapping_entry(mapping, indices[i]);
                        }
                }
                goto out;
        }

        /* Start the walk at the first value entry rather than indices[0]. */
        xas_set(&xas, indices[j]);
        xas_set_update(&xas, workingset_update_node);

        spin_lock(&mapping->host->i_lock);
        xas_lock_irq(&xas);

        /* Clear every value entry up to the last index covered by the batch. */
        xas_for_each(&xas, folio, indices[nr-1]) {
                if (xa_is_value(folio))
                        xas_store(&xas, NULL);
        }

        xas_unlock_irq(&xas);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);
out:
        /* Drop the value entries from the batch so callers only see folios. */
        folio_batch_remove_exceptionals(fbatch);
}

/**
 * folio_invalidate - Invalidate part or all of a folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * folio_invalidate() is called when all or part of the folio has become
 * invalidated by a truncate operation.
 *
 * folio_invalidate() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void folio_invalidate(struct folio *folio, size_t offset, size_t length)
{
        const struct address_space_operations *aops = folio->mapping->a_ops;

        /* Nothing to do when the address space has no invalidate hook. */
        if (!aops->invalidate_folio)
                return;

        aops->invalidate_folio(folio, offset, length);
}
EXPORT_SYMBOL_GPL(folio_invalidate);

/*
 * If truncate cannot remove the fs-private metadata from the page, the page
 * becomes orphaned.  It will be left on the LRU and may even be mapped into
 * user pagetables if we're racing with filemap_fault().
 *
 * We need to bail out if page->mapping is no longer equal to the original
 * mapping.  This happens a) when the VM reclaimed the page while we waited on
 * its lock, b) when a concurrent invalidate_mapping_pages got there first and
 * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
 */
/*
 * Prepare @folio for removal from the page cache: unmap it, invalidate it
 * in full when folio_needs_release() says so, then cancel its dirty state.
 * The order of these steps is significant (see the comment below).
 */
static void truncate_cleanup_folio(struct folio *folio)
{
        /* Unmap the folio first if it is currently mapped. */
        if (folio_mapped(folio))
                unmap_mapping_folio(folio);

        /* Invalidate the whole folio, from offset 0 to its full size. */
        if (folio_needs_release(folio))
                folio_invalidate(folio, 0, folio_size(folio));

        /*
         * Some filesystems seem to re-dirty the page even after
         * the VM has canceled the dirty bit (eg ext3 journaling).
         * Hence dirty accounting check is placed after invalidation.
         */
        folio_cancel_dirty(folio);
}

/*
 * Remove @folio from @mapping's page cache after cleaning it up.
 *
 * Returns 0 on success, or -EIO (without touching the folio) when the folio
 * no longer belongs to @mapping.
 */
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
        if (folio->mapping == mapping) {
                truncate_cleanup_folio(folio);
                filemap_remove_folio(folio);
                return 0;
        }

        return -EIO;
}

/*
 * Handle partial folios.  The folio may be entirely within the
 * range if a split has raced with us.  If not, we zero the part of the
 * folio that's within the [start, end] range, and then split the folio if
 * it's large.  The split will discard pages which now lie beyond
 * i_size, and we rely on the caller to discard pages which lie within a
 * newly created hole.
 *
 * @folio: the folio straddling an edge of the truncated range
 * @start: first byte of the range being truncated
 * @end:   last byte of the range being truncated (inclusive)
 *
 * Returns false if splitting failed so the caller can avoid
 * discarding the entire folio which is stubbornly unsplit.
 */
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
        loff_t pos = folio_pos(folio);
        /* Cached up front: the size is needed again after the folio is split. */
        size_t size = folio_size(folio);
        unsigned int offset, length;
        struct page *split_at, *split_at2;

        if (pos < start)
                offset = start - pos;
        else
                offset = 0;
        if (pos + size <= (u64)end)
                length = size - offset;
        else
                length = end + 1 - pos - offset;

        folio_wait_writeback(folio);
        /* The whole folio lies inside the range: just remove it. */
        if (length == size) {
                truncate_inode_folio(folio->mapping, folio);
                return true;
        }

        /*
         * We may be zeroing pages we're about to discard, but it avoids
         * doing a complex calculation here, and then doing the zeroing
         * anyway if the page split fails.
         */
        if (!mapping_inaccessible(folio->mapping))
                folio_zero_range(folio, offset, length);

        if (folio_needs_release(folio))
                folio_invalidate(folio, offset, length);
        if (!folio_test_large(folio))
                return true;

        split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
        if (!try_folio_split(folio, split_at, NULL)) {
                /*
                 * try to split at offset + length to make sure folios within
                 * the range can be dropped, especially to avoid memory waste
                 * for shmem truncate
                 */
                struct folio *folio2;

                /*
                 * The range runs up to the end of the original folio, so
                 * there is nothing to its right to split off; the page at
                 * offset + length would lie beyond the folio.
                 */
                if (offset + length == size)
                        goto no_split;

                split_at2 = folio_page(folio,
                                PAGE_ALIGN_DOWN(offset + length) / PAGE_SIZE);
                folio2 = page_folio(split_at2);

                if (!folio_try_get(folio2))
                        goto no_split;

                if (!folio_test_large(folio2))
                        goto out;

                if (!folio_trylock(folio2))
                        goto out;

                /*
                 * make sure folio2 is large and does not change its mapping.
                 * Its split result does not matter here.
                 */
                if (folio_test_large(folio2) &&
                    folio2->mapping == folio->mapping)
                        try_folio_split(folio2, split_at2, NULL);

                folio_unlock(folio2);
out:
                folio_put(folio2);
no_split:
                return true;
        }
        /* Split failed: a dirty folio must be kept, so report the failure. */
        if (folio_test_dirty(folio))
                return false;
        truncate_inode_folio(folio->mapping, folio);
        return true;
}

/*
 * Used to get rid of pages on hardware memory corruption.
 *
 * Returns -EINVAL when @mapping is NULL, -EIO when the mapping does not
 * belong to a regular file, and otherwise the result of
 * truncate_inode_folio().
 */
int generic_error_remove_folio(struct address_space *mapping,
                struct folio *folio)
{
        if (!mapping)
                return -EINVAL;

        /*
         * Only punch for normal data pages for now.
         * Handling other types like directories would need more auditing.
         */
        if (S_ISREG(mapping->host->i_mode))
                return truncate_inode_folio(mapping, folio);

        return -EIO;
}
EXPORT_SYMBOL(generic_error_remove_folio);

/**
 * mapping_evict_folio() - Remove an unused folio from the page-cache.
 * @mapping: The mapping this folio belongs to.
 * @folio: The folio to remove.
 *
 * Safely remove one folio from the page cache.
 * It only drops clean, unused folios.
 *
 * Context: Folio must be locked.
 * Return: The number of pages successfully removed.
 */
long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
{
        /* The page may have been truncated before it was locked */
        if (!mapping)
                return 0;

        /* Dirty folios and folios under writeback are never evicted here. */
        if (folio_test_dirty(folio) || folio_test_writeback(folio))
                return 0;

        /* The refcount will be elevated if any page in the folio is mapped */
        if (folio_nr_pages(folio) + folio_has_private(folio) + 1 <
                        folio_ref_count(folio))
                return 0;

        /* Only remove the folio once its private data could be released. */
        return filemap_release_folio(folio, 0) ?
                        remove_mapping(mapping, folio) : 0;
}

/**
 * truncate_inode_pages_range - truncate range of pages specified by start & end byte offsets
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 * @lend: offset to which to truncate (inclusive)
 *
 * Truncate the page cache, removing the pages that are between
 * specified offsets (and zeroing out partial pages
 * if lstart or lend + 1 is not page aligned).
 *
 * Truncate takes two passes - the first pass is nonblocking.  It will not
 * block on page locks and it will not block on writeback.  The second pass
 * will wait.  This is to prevent as much IO as possible in the affected region.
 * The first pass will remove most pages, so the search cost of the second pass
 * is low.
 *
 * We pass down the cache-hot hint to the page freeing code.  Even if the
 * mapping is large, it is probably the case that the final pages are the most
 * recently touched, and freeing happens in ascending file offset order.
 *
 * Note that since ->invalidate_folio() accepts range to invalidate
 * truncate_inode_pages_range is able to handle cases where lend + 1 is not
 * page aligned properly.
 */
void truncate_inode_pages_range(struct address_space *mapping,
                                loff_t lstart, loff_t lend)
{
        pgoff_t                start;                /* inclusive */
        pgoff_t                end;                /* exclusive */
        struct folio_batch fbatch;
        pgoff_t                indices[PAGEVEC_SIZE];
        pgoff_t                index;
        int                i;
        struct folio        *folio;
        bool                same_folio;

        if (mapping_empty(mapping))
                return;

        /*
         * 'start' and 'end' always cover the range of pages to be fully
         * truncated. Partial folios at either edge of the range are handled
         * separately by truncate_inode_partial_folio() below.
         * Note that 'end' is exclusive while 'lend' is inclusive.
         */
        start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
        if (lend == -1)
                /*
                 * lend == -1 indicates end-of-file so we have to set 'end'
                 * to the highest possible pgoff_t and since the type is
                 * unsigned we're using -1.
                 */
                end = -1;
        else
                end = (lend + 1) >> PAGE_SHIFT;

        /*
         * First pass: take whatever find_lock_entries() hands back locked
         * and remove it from the page cache in batches.
         */
        folio_batch_init(&fbatch);
        index = start;
        while (index < end && find_lock_entries(mapping, &index, end - 1,
                        &fbatch, indices)) {
                truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        truncate_cleanup_folio(fbatch.folios[i]);
                delete_from_page_cache_batch(mapping, &fbatch);
                for (i = 0; i < folio_batch_count(&fbatch); i++)
                        folio_unlock(fbatch.folios[i]);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        /*
         * Handle the folio containing lstart.  If it could not be split,
         * shrink [start, end) so the second pass leaves that folio alone.
         */
        same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
        folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
        if (!IS_ERR(folio)) {
                /* Refine: does this (possibly large) folio also contain lend? */
                same_folio = lend < folio_pos(folio) + folio_size(folio);
                if (!truncate_inode_partial_folio(folio, lstart, lend)) {
                        start = folio_next_index(folio);
                        if (same_folio)
                                end = folio->index;
                }
                folio_unlock(folio);
                folio_put(folio);
                folio = NULL;
        }

        /* Handle the folio containing lend when it is a different folio. */
        if (!same_folio) {
                folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
                                                FGP_LOCK, 0);
                if (!IS_ERR(folio)) {
                        if (!truncate_inode_partial_folio(folio, lstart, lend))
                                end = folio->index;
                        folio_unlock(folio);
                        folio_put(folio);
                }
        }

        /*
         * Second pass: lock each remaining folio, wait for writeback and
         * remove it.  The scan restarts from 'start' until a lookup from
         * 'start' finds nothing.
         */
        index = start;
        while (index < end) {
                cond_resched();
                if (!find_get_entries(mapping, &index, end - 1, &fbatch,
                                indices)) {
                        /* If all gone from start onwards, we're done */
                        if (index == start)
                                break;
                        /* Otherwise restart to make sure all gone */
                        index = start;
                        continue;
                }

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* We rely upon deletion not changing page->index */

                        /* Value entries are removed in one go after the loop. */
                        if (xa_is_value(folio))
                                continue;

                        folio_lock(folio);
                        VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
                        folio_wait_writeback(folio);
                        truncate_inode_folio(mapping, folio);
                        folio_unlock(folio);
                }
                truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
                folio_batch_release(&fbatch);
        }
}
EXPORT_SYMBOL(truncate_inode_pages_range);

/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Called under (and serialised by) inode->i_rwsem and
 * mapping->invalidate_lock.
 *
 * Note: When this function returns, there can be a page in the process of
 * deletion (inside __filemap_remove_folio()) in the specified range.  Thus
 * mapping->nrpages can be non-zero when this function returns even after
 * truncation of the whole mapping.
 */
void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
{
        /* An end offset of -1 tells the range variant to go to end-of-file. */
        truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
}
EXPORT_SYMBOL(truncate_inode_pages);

/**
 * truncate_inode_pages_final - truncate *all* pages before inode dies
 * @mapping: mapping to truncate
 *
 * Called under (and serialized by) inode->i_rwsem.
 *
 * Filesystems have to use this in the .evict_inode path to inform the
 * VM that this is the final truncate and the inode is going away.
 */
void truncate_inode_pages_final(struct address_space *mapping)
{
        /*
         * Page reclaim can not participate in regular inode lifetime
         * management (can't call iput()) and thus can race with the
         * inode teardown.  Tell it when the address space is exiting,
         * so that it does not install eviction information after the
         * final truncate has begun.
         */
        mapping_set_exiting(mapping);

        if (!mapping_empty(mapping)) {
                /*
                 * As truncation uses a lockless tree lookup, cycle
                 * the tree lock to make sure any ongoing tree
                 * modification that does not see AS_EXITING is
                 * completed before starting the final truncate.
                 */
                xa_lock_irq(&mapping->i_pages);
                xa_unlock_irq(&mapping->i_pages);
        }

        /* Truncate everything, from offset 0 through end-of-file. */
        truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);

/**
 * mapping_try_invalidate - Invalidate all the evictable folios of one inode
 * @mapping: the address_space which holds the folios to invalidate
 * @start: the offset 'from' which to invalidate
 * @end: the offset 'to' which to invalidate (inclusive)
 * @nr_failed: How many folio invalidations failed
 *
 * This function is similar to invalidate_mapping_pages(), except that it
 * returns the number of folios which could not be evicted in @nr_failed.
 */
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        pgoff_t next = start;
        unsigned long invalidated = 0;

        folio_batch_init(&fbatch);
        while (find_lock_entries(mapping, &next, end, &fbatch, indices)) {
                int batch_len = folio_batch_count(&fbatch);
                bool saw_shadow = false;
                int slot;

                for (slot = 0; slot < batch_len; slot++) {
                        struct folio *folio = fbatch.folios[slot];
                        unsigned long evicted;

                        /* We rely upon deletion not changing folio->index */

                        /* Shadow entries count as invalidated; cleared below. */
                        if (xa_is_value(folio)) {
                                saw_shadow = true;
                                invalidated++;
                                continue;
                        }

                        evicted = mapping_evict_folio(mapping, folio);
                        folio_unlock(folio);
                        if (evicted) {
                                invalidated += evicted;
                                continue;
                        }

                        /*
                         * Invalidation is a hint that the folio is no longer
                         * of interest and try to speed up its reclaim.
                         */
                        deactivate_file_folio(folio);
                        /* Likely in the lru cache of a remote CPU */
                        if (nr_failed)
                                ++*nr_failed;
                }

                if (saw_shadow)
                        clear_shadow_entries(mapping, indices[0],
                                             indices[batch_len - 1]);

                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }

        return invalidated;
}

/**
 * invalidate_mapping_pages - Invalidate all clean, unlocked cache of one inode
 * @mapping: the address_space which holds the cache to invalidate
 * @start: the first offset to invalidate
 * @end: the last offset to invalidate (inclusive)
 *
 * Thin wrapper around mapping_try_invalidate() which passes NULL for the
 * failure counter.  Pages that are clean, unmapped and unlocked are removed,
 * as are shadow entries.  It will not block on IO activity.
 *
 * To remove all the pages of one inode regardless of their use and
 * writeback state, use truncate_inode_pages() instead.
 *
 * Return: The number of indices that had their contents invalidated
 */
unsigned long invalidate_mapping_pages(struct address_space *mapping,
                pgoff_t start, pgoff_t end)
{
        unsigned long nr_invalidated;

        nr_invalidated = mapping_try_invalidate(mapping, start, end, NULL);

        return nr_invalidated;
}
EXPORT_SYMBOL(invalidate_mapping_pages);

/*
 * Hand a dirty folio to the mapping's ->launder_folio() hook.
 *
 * Returns 0 without calling the hook when the folio is clean, no longer
 * belongs to @mapping, or the mapping provides no ->launder_folio().
 * Otherwise returns whatever the hook returns.
 */
static int folio_launder(struct address_space *mapping, struct folio *folio)
{
        if (folio_test_dirty(folio) && folio->mapping == mapping &&
            mapping->a_ops->launder_folio != NULL)
                return mapping->a_ops->launder_folio(folio);

        return 0;
}

/*
 * This is like mapping_evict_folio(), except it ignores the folio's
 * refcount.  We do this because invalidate_inode_pages2() needs stronger
 * invalidation guarantees, and cannot afford to leave folios behind because
 * shrink_folio_list() has a temp ref on them, or because they're transiently
 * sitting in the folio_add_lru() caches.
 *
 * The caller must hold the folio lock (asserted below).
 *
 * Returns 1 when the folio was removed from @mapping, the non-zero result of
 * folio_launder() if laundering reported one, and -EBUSY otherwise (the folio
 * no longer belongs to @mapping, filemap_release_folio() refused, or the
 * folio was found dirty once the locks were taken).
 */
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
                           gfp_t gfp)
{
        int ret;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Tear down any remaining page-table mappings of this folio. */
        if (folio_mapped(folio))
                unmap_mapping_folio(folio);
        /* The folio must be completely unmapped from here on. */
        BUG_ON(folio_mapped(folio));

        /* Give the filesystem a chance to launder a dirty folio first. */
        ret = folio_launder(mapping, folio);
        if (ret)
                return ret;
        /* Re-check ownership now that laundering has run. */
        if (folio->mapping != mapping)
                return -EBUSY;
        if (!filemap_release_folio(folio, gfp))
                return -EBUSY;

        /* Lock order: inode i_lock first, then the i_pages xarray lock. */
        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        /* Dirtied again in the meantime: leave it in the page cache. */
        if (folio_test_dirty(folio))
                goto failed;

        BUG_ON(folio_has_private(folio));
        __filemap_remove_folio(folio, NULL);
        xa_unlock_irq(&mapping->i_pages);
        /* Still under i_lock: put the inode on the LRU if it is shrinkable. */
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        filemap_free_folio(mapping, folio);
        return 1;
failed:
        /* Drop both locks in reverse order of acquisition. */
        xa_unlock_irq(&mapping->i_pages);
        spin_unlock(&mapping->host->i_lock);
        return -EBUSY;
}

/**
 * invalidate_inode_pages2_range - remove range of pages from an address_space
 * @mapping: the address_space
 * @start: the page offset 'from' which to invalidate
 * @end: the page offset 'to' which to invalidate (inclusive)
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * The range is walked in batches with find_get_entries().  Each folio is
 * locked, waited on for writeback and then passed to
 * folio_unmap_invalidate().  Value (non-folio) entries are handled
 * separately: DAX entries are invalidated synchronously, and the batch's
 * index range is passed to clear_shadow_entries() afterwards.
 *
 * Return: 0 if the mapping is empty or nothing failed; otherwise a negative
 * errno, which is -EBUSY for a DAX entry that could not be invalidated or
 * the most recent negative value returned by folio_unmap_invalidate().
 */
int invalidate_inode_pages2_range(struct address_space *mapping,
                                  pgoff_t start, pgoff_t end)
{
        pgoff_t indices[PAGEVEC_SIZE];
        struct folio_batch fbatch;
        pgoff_t index;
        int i;
        int ret = 0;
        int ret2 = 0;
        /* Set once the one-off bulk unmap below has been issued. */
        int did_range_unmap = 0;

        if (mapping_empty(mapping))
                return 0;

        folio_batch_init(&fbatch);
        index = start;
        while (find_get_entries(mapping, &index, end, &fbatch, indices)) {
                bool xa_has_values = false;
                int nr = folio_batch_count(&fbatch);

                for (i = 0; i < nr; i++) {
                        struct folio *folio = fbatch.folios[i];

                        /* We rely upon deletion not changing folio->index */

                        /* Value entry rather than a real folio. */
                        if (xa_is_value(folio)) {
                                xa_has_values = true;
                                if (dax_mapping(mapping) &&
                                    !dax_invalidate_mapping_entry_sync(mapping, indices[i]))
                                        ret = -EBUSY;
                                continue;
                        }

                        if (!did_range_unmap && folio_mapped(folio)) {
                                /*
                                 * If folio is mapped, before taking its lock,
                                 * zap the rest of the file in one hit.
                                 */
                                unmap_mapping_pages(mapping, indices[i],
                                                (1 + end - indices[i]), false);
                                did_range_unmap = 1;
                        }

                        folio_lock(folio);
                        /* Folio left this mapping while unlocked: skip it. */
                        if (unlikely(folio->mapping != mapping)) {
                                folio_unlock(folio);
                                continue;
                        }
                        VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio);
                        folio_wait_writeback(folio);
                        /* Only negative results are recorded; later ones overwrite earlier. */
                        ret2 = folio_unmap_invalidate(mapping, folio, GFP_KERNEL);
                        if (ret2 < 0)
                                ret = ret2;
                        folio_unlock(folio);
                }

                /* Clear shadow entries across the index span of this batch. */
                if (xa_has_values)
                        clear_shadow_entries(mapping, indices[0], indices[nr-1]);

                folio_batch_remove_exceptionals(&fbatch);
                folio_batch_release(&fbatch);
                cond_resched();
        }
        /*
         * For DAX we invalidate page tables after invalidating page cache.  We
         * could invalidate page tables while invalidating each entry however
         * that would be expensive. And doing range unmapping before doesn't
         * work as we have no cheap way to find whether page cache entry didn't
         * get remapped later.
         */
        if (dax_mapping(mapping)) {
                unmap_mapping_pages(mapping, start, end - start + 1, false);
        }
        return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

/**
 * invalidate_inode_pages2 - remove all pages from an address_space
 * @mapping: the address_space
 *
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
 * Return: -EBUSY if any pages could not be invalidated.
 */
int invalidate_inode_pages2(struct address_space *mapping)
{
        return invalidate_inode_pages2_range(mapping, 0, -1);
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2);

/**
 * truncate_pagecache - unmap and remove pagecache that has been truncated
 * @inode: inode
 * @newsize: new file size
 *
 * The inode's new i_size must already have been written before this
 * function is called.
 *
 * Callers should typically invoke it before the filesystem releases the
 * resources backing the freed range (eg. deallocates blocks).  That keeps
 * the pagecache logically coherent with the on-disk format, so the
 * filesystem never has to cope with writepage being called for a page whose
 * underlying blocks have already been deallocated.
 */
void truncate_pagecache(struct inode *inode, loff_t newsize)
{
        struct address_space *mapping = inode->i_mapping;
        /* First page-aligned offset at or beyond the new size. */
        loff_t hole_start = round_up(newsize, PAGE_SIZE);

        /*
         * The mapping is unmapped twice.  The first pass is purely an
         * optimisation: it saves truncate_inode_pages() from doing many
         * single-page unmaps.  Between that pass and the end of
         * truncate_inode_pages(), private pages may still be COWed and
         * would survive the truncation, so the second pass is required
         * for correctness.
         */
        unmap_mapping_range(mapping, hole_start, 0, 1);

        truncate_inode_pages(mapping, newsize);

        unmap_mapping_range(mapping, hole_start, 0, 1);
}
EXPORT_SYMBOL(truncate_pagecache);

/**
 * truncate_setsize - update inode and pagecache for a new file size
 * @inode: inode
 * @newsize: new file size
 *
 * Writes @newsize into i_size and then truncates the pagecache to match.
 * When the file grows, pagecache_isize_extended() is called for the span
 * between the old and new size before the truncation step.  It will
 * typically be called from the filesystem's setattr function when
 * ATTR_SIZE is passed in.
 *
 * Must be called with a lock serializing truncates and writes (generally
 * i_rwsem but e.g. xfs uses a different lock) and before all filesystem
 * specific block truncation has been performed.
 */
void truncate_setsize(struct inode *inode, loff_t newsize)
{
        /* Sample the size before it is overwritten below. */
        loff_t prev_size = inode->i_size;

        i_size_write(inode, newsize);

        if (prev_size < newsize)
                pagecache_isize_extended(inode, prev_size, newsize);

        truncate_pagecache(inode, newsize);
}
EXPORT_SYMBOL(truncate_setsize);

/**
 * pagecache_isize_extended - update pagecache after extension of i_size
 * @inode:        inode for which i_size was extended
 * @from:        original inode size
 * @to:                new inode size
 *
 * Handle extension of inode size either caused by extending truncate or
 * by write starting after current i_size.  The folio straddling the old
 * i_size is write-protected so that page_mkwrite() is called on the first
 * write access to it.  The filesystem will update its per-block
 * information before user writes to the page via mmap after the i_size
 * has been changed.
 *
 * The function must be called after i_size is updated so that page fault
 * coming after we unlock the folio will already see the new i_size.
 * The function must be called while we still hold i_rwsem - this not only
 * makes sure i_size is stable but also that userspace cannot observe new
 * i_size value before we are prepared to store mmap writes at new inode size.
 */
void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
{
        int bsize = i_blocksize(inode);
        loff_t block_end;
        struct folio *folio;

        WARN_ON(to > inode->i_size);

        /* No growth at all. */
        if (from >= to)
                return;
        /* Only block sizes smaller than a page need any work here. */
        if (bsize >= PAGE_SIZE)
                return;

        /* Page straddling @from will not have any hole block created? */
        block_end = round_up(from, bsize);
        if (to <= block_end)
                return;
        if ((block_end & (PAGE_SIZE - 1)) == 0)
                return;

        folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
        /* Folio not cached? Nothing to do */
        if (IS_ERR(folio))
                return;

        /*
         * See folio_clear_dirty_for_io() for details why folio_mark_dirty()
         * is needed.
         */
        if (folio_mkclean(folio))
                folio_mark_dirty(folio);

        /*
         * The post-eof range of the folio must be zeroed before it is exposed
         * to the file. Writeback normally does this, but since i_size has been
         * increased we handle it here.
         */
        if (folio_test_dirty(folio)) {
                loff_t pos = folio_pos(folio);
                unsigned int zero_from = from - pos;
                unsigned int zero_to = min_t(unsigned int, to - pos,
                                             folio_size(folio));

                folio_zero_segment(folio, zero_from, zero_to);
        }

        folio_unlock(folio);
        folio_put(folio);
}
EXPORT_SYMBOL(pagecache_isize_extended);

/**
 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
 * @inode: inode
 * @lstart: offset of beginning of hole
 * @lend: offset of last byte of hole
 *
 * Callers should typically invoke this before the filesystem releases the
 * resources backing the freed range (eg. deallocates blocks).  That keeps
 * the pagecache logically coherent with the on-disk format, so the
 * filesystem never has to cope with writepage being called for a page whose
 * underlying blocks have already been deallocated.
 */
void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = inode->i_mapping;
        /*
         * Shrink the byte range inwards to whole pages.  This rounding is
         * currently just for example: unmap_mapping_range expands its hole
         * outwards, whereas we want it to contract the hole inwards.
         * However, existing callers of truncate_pagecache_range are doing
         * their own page rounding first.  Note that unmap_mapping_range
         * allows holelen 0 for all, and we allow lend -1 for end of file.
         */
        loff_t first_byte = round_up(lstart, PAGE_SIZE);
        loff_t last_byte = round_down(1 + lend, PAGE_SIZE) - 1;

        /*
         * Unlike in truncate_pagecache, unmap_mapping_range is called only
         * once (before truncating pagecache), and without "even_cows" flag:
         * hole-punching should not remove private COWed pages from the hole.
         */
        if ((u64)first_byte < (u64)last_byte)
                unmap_mapping_range(mapping, first_byte,
                                    1 + last_byte - first_byte, 0);

        truncate_inode_pages_range(mapping, lstart, lend);
}
EXPORT_SYMBOL(truncate_pagecache_range);
























































































































































































































   68 





















































































































































































































































    2 

















































































































































































































































































































































































    2 



































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Handle detection, reporting and mitigation of Spectre v1, v2, v3a and v4, as
 * detailed at:
 *
 *   https://developer.arm.com/support/arm-security-updates/speculative-processor-vulnerability
 *
 * This code was originally written hastily under an awful lot of stress and so
 * aspects of it are somewhat hacky. Unfortunately, changing anything in here
 * instantly makes me feel ill. Thanks, Jann. Thann.
 *
 * Copyright (C) 2018 ARM Ltd, All Rights Reserved.
 * Copyright (C) 2020 Google LLC
 *
 * "If there's something strange in your neighbourhood, who you gonna call?"
 *
 * Authors: Will Deacon <will@kernel.org> and Marc Zyngier <maz@kernel.org>
 */

#include <linux/arm-smccc.h>
#include <linux/bpf.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/task_stack.h>

#include <asm/debug-monitors.h>
#include <asm/insn.h>
#include <asm/spectre.h>
#include <asm/traps.h>
#include <asm/vectors.h>
#include <asm/virt.h>

/*
 * We try to ensure that the mitigation state can never change as the result of
 * onlining a late CPU.
 */
static void update_mitigation_state(enum mitigation_state *oldp,
                                    enum mitigation_state new)
{
        enum mitigation_state state;

        do {
                state = READ_ONCE(*oldp);
                if (new <= state)
                        break;

                /* Userspace almost certainly can't deal with this. */
                if (WARN_ON(system_capabilities_finalized()))
                        break;
        } while (cmpxchg_relaxed(oldp, state, new) != state);
}

/*
 * Spectre v1.
 *
 * The kernel can't protect userspace for this one: it's each person for
 * themselves. Advertise what we're doing and be done with it.
 *
 * Writes a fixed status line into @buf and returns sprintf()'s result.
 * @dev and @attr are not used.
 */
ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        static const char status[] = "Mitigation: __user pointer sanitization\n";

        return sprintf(buf, "%s", status);
}

/*
 * Spectre v2.
 *
 * This one sucks. A CPU is either:
 *
 * - Mitigated in hardware and advertised by ID_AA64PFR0_EL1.CSV2.
 * - Mitigated in hardware and listed in our "safe list".
 * - Mitigated in software by firmware.
 * - Mitigated in software by a CPU-specific dance in the kernel and a
 *   firmware call at EL2.
 * - Vulnerable.
 *
 * It's not unlikely for different CPUs in a big.LITTLE system to fall into
 * different camps.
 */
/* System-wide Spectre-v2 state; written via update_mitigation_state(). */
static enum mitigation_state spectre_v2_state;

/* Set when "nospectre_v2" appears on the kernel command line. */
static bool __read_mostly __nospectre_v2;

/*
 * early_param handler for "nospectre_v2".  The option's value (@str) is
 * ignored: its mere presence sets __nospectre_v2.  Always returns 0.
 */
static int __init parse_spectre_v2_param(char *str)
{
        __nospectre_v2 = true;
        return 0;
}
early_param("nospectre_v2", parse_spectre_v2_param);

static bool spectre_v2_mitigations_off(void)
{
        bool ret = __nospectre_v2 || cpu_mitigations_off();

        if (ret)
                pr_info_once("spectre-v2 mitigation disabled by command line option\n");

        return ret;
}

/*
 * Map a Spectre-BHB state onto the suffix appended to the Spectre-v2
 * sysfs mitigation string.  Any state other than SPECTRE_UNAFFECTED or
 * SPECTRE_MITIGATED is reported like SPECTRE_VULNERABLE.
 */
static const char *get_bhb_affected_string(enum mitigation_state bhb_state)
{
        if (bhb_state == SPECTRE_UNAFFECTED)
                return "";

        if (bhb_state == SPECTRE_MITIGATED)
                return ", BHB";

        /* SPECTRE_VULNERABLE and anything unexpected. */
        return ", but not BHB";
}

/*
 * True when sysctl_unprivileged_bpf_disabled is zero.  Always false on
 * kernels built without CONFIG_BPF_SYSCALL.
 */
static bool _unprivileged_ebpf_enabled(void)
{
#ifdef CONFIG_BPF_SYSCALL
        return sysctl_unprivileged_bpf_disabled == 0;
#else
        return false;
#endif
}

ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        enum mitigation_state bhb_state = arm64_get_spectre_bhb_state();
        const char *bhb_str = get_bhb_affected_string(bhb_state);
        const char *v2_str = "Branch predictor hardening";

        switch (spectre_v2_state) {
        case SPECTRE_UNAFFECTED:
                if (bhb_state == SPECTRE_UNAFFECTED)
                        return sprintf(buf, "Not affected\n");

                /*
                 * Platforms affected by Spectre-BHB can't report
                 * "Not affected" for Spectre-v2.
                 */
                v2_str = "CSV2";
                fallthrough;
        case SPECTRE_MITIGATED:
                if (bhb_state == SPECTRE_MITIGATED && _unprivileged_ebpf_enabled())
                        return sprintf(buf, "Vulnerable: Unprivileged eBPF enabled\n");

                return sprintf(buf, "Mitigation: %s%s\n", v2_str, bhb_str);
        case SPECTRE_VULNERABLE:
                fallthrough;
        default:
                return sprintf(buf, "Vulnerable\n");
        }
}

/*
 * Decide whether the hardware alone makes this CPU safe from Spectre-v2:
 * either ID_AA64PFR0_EL1.CSV2 is non-zero or the CPU's MIDR is in the safe
 * list.  Returns SPECTRE_UNAFFECTED in those cases and SPECTRE_VULNERABLE
 * otherwise.
 */
static enum mitigation_state spectre_v2_get_cpu_hw_mitigation_state(void)
{
        static const struct midr_range spectre_v2_safe_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
                MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
                MIDR_ALL_VERSIONS(MIDR_HISI_TSV110),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
                { /* sentinel */ }
        };
        u64 id_pfr0 = read_cpuid(ID_AA64PFR0_EL1);

        /*
         * CSV2 set means we're safe; failing that, consult the list of
         * unaffected CPUs.
         */
        if (cpuid_feature_extract_unsigned_field(id_pfr0, ID_AA64PFR0_EL1_CSV2_SHIFT) ||
            is_midr_in_range_list(spectre_v2_safe_list))
                return SPECTRE_UNAFFECTED;

        return SPECTRE_VULNERABLE;
}

/*
 * Query firmware, via ARM_SMCCC_ARCH_FEATURES_FUNC_ID, about
 * ARM_SMCCC_ARCH_WORKAROUND_1 and translate the answer into a mitigation
 * state.  Any reply other than success or "unaffected" (including
 * SMCCC_RET_NOT_SUPPORTED) is treated as SPECTRE_VULNERABLE.
 */
static enum mitigation_state spectre_v2_get_cpu_fw_mitigation_state(void)
{
        struct arm_smccc_res res;
        int fw_ret;

        arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
                             ARM_SMCCC_ARCH_WORKAROUND_1, &res);
        fw_ret = res.a0;

        if (fw_ret == SMCCC_RET_SUCCESS)
                return SPECTRE_MITIGATED;

        if (fw_ret == SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED)
                return SPECTRE_UNAFFECTED;

        return SPECTRE_VULNERABLE;
}

/*
 * Capability match callback: true when neither the hardware check nor the
 * firmware query reports this CPU as unaffected by Spectre-v2.  The firmware
 * is only asked if the hardware check did not already clear the CPU.
 * Warns if called with a scope other than SCOPE_LOCAL_CPU or while
 * preemptible.  @entry is not used.
 */
bool has_spectre_v2(const struct arm64_cpu_capabilities *entry, int scope)
{
        WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());

        return spectre_v2_get_cpu_hw_mitigation_state() != SPECTRE_UNAFFECTED &&
               spectre_v2_get_cpu_fw_mitigation_state() != SPECTRE_UNAFFECTED;
}

/* Accessor for the file-local, system-wide Spectre-v2 mitigation state. */
enum mitigation_state arm64_get_spectre_v2_state(void)
{
        return spectre_v2_state;
}

/* Per-CPU branch-predictor hardening callback and hyp vector slot. */
DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data);

/*
 * Record @fn as this CPU's hardening callback.  When hyp mode is
 * available, also select the HYP_VECTOR_SPECTRE_DIRECT vector slot for
 * this CPU.
 */
static void install_bp_hardening_cb(bp_hardening_cb_t fn)
{
        __this_cpu_write(bp_hardening_data.fn, fn);

        /*
         * Without hyp mode (the original note describes this as the
         * guest case) there is no hyp-vectors work to do.
         */
        if (is_hyp_mode_available())
                __this_cpu_write(bp_hardening_data.slot,
                                 HYP_VECTOR_SPECTRE_DIRECT);
}

/*
 * Invoke ARM_SMCCC_ARCH_WORKAROUND_1 through the SMC conduit, passing NULL
 * for the result structure.
 *
 * Called during entry so must be noinstr
 */
static noinstr void call_smc_arch_workaround_1(void)
{
        arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}

/*
 * Invoke ARM_SMCCC_ARCH_WORKAROUND_1 through the HVC conduit, passing NULL
 * for the result structure.
 *
 * Called during entry so must be noinstr
 */
static noinstr void call_hvc_arch_workaround_1(void)
{
        arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL);
}

/*
 * CPU-specific Spectre-v2 callback returned by
 * spectre_v2_get_sw_mitigation_cb() for Qualcomm Falkor parts.
 *
 * The asm saves x30 (the link register) in a temporary, executes sixteen
 * consecutive "bl . + 4" instructions (each branches-with-link to the very
 * next instruction), then restores x30.
 * NOTE(review): presumably the 16 calls overwrite the CPU's return-address
 * predictor ("link stack") -- confirm against the Falkor documentation.
 *
 * Called during entry so must be noinstr
 */
static noinstr void qcom_link_stack_sanitisation(void)
{
        u64 tmp;

        /* "=&r": early-clobber output, written before the asm has finished. */
        asm volatile("mov        %0, x30                \n"
                     ".rept        16                \n"
                     "bl        . + 4                \n"
                     ".endr                        \n"
                     "mov        x30, %0                \n"
                     : "=&r" (tmp));
}

/*
 * Return the CPU-specific software mitigation callback for the current
 * CPU, or NULL if there is none.  Only the Qualcomm Falkor and Falkor V1
 * models have one.
 */
static bp_hardening_cb_t spectre_v2_get_sw_mitigation_cb(void)
{
        u32 model = read_cpuid_id() & MIDR_CPU_MODEL_MASK;

        if (model == MIDR_QCOM_FALKOR || model == MIDR_QCOM_FALKOR_V1)
                return qcom_link_stack_sanitisation;

        return NULL;
}

static enum mitigation_state spectre_v2_enable_fw_mitigation(void)
{
        bp_hardening_cb_t cb;
        enum mitigation_state state;

        state = spectre_v2_get_cpu_fw_mitigation_state();
        if (state != SPECTRE_MITIGATED)
                return state;

        if (spectre_v2_mitigations_off())
                return SPECTRE_VULNERABLE;

        switch (arm_smccc_1_1_get_conduit()) {
        case SMCCC_CONDUIT_HVC:
                cb = call_hvc_arch_workaround_1;
                break;

        case SMCCC_CONDUIT_SMC:
                cb = call_smc_arch_workaround_1;
                break;

        default:
                return SPECTRE_VULNERABLE;
        }

        /*
         * Prefer a CPU-specific workaround if it exists. Note that we
         * still rely on firmware for the mitigation at EL2.
         */
        cb = spectre_v2_get_sw_mitigation_cb() ?: cb;
        install_bp_hardening_cb(cb);
        return SPECTRE_MITIGATED;
}

void spectre_v2_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
        enum mitigation_state state;

        WARN_ON(preemptible());

        state = spectre_v2_get_cpu_hw_mitigation_state();
        if (state == SPECTRE_VULNERABLE)
                state = spectre_v2_enable_fw_mitigation();

        update_mitigation_state(&spectre_v2_state, state);
}

/*
 * Spectre-v3a.
 *
 * Phew, there's not an awful lot to do here! We just instruct EL2 to use
 * an indirect trampoline for the hyp vectors so that guests can't read
 * VBAR_EL2 to defeat randomisation of the hypervisor VA layout.
 */
/*
 * Capability match callback: true when the current CPU's MIDR is on the
 * Spectre-v3a unsafe list (Cortex-A57 and Cortex-A72, all versions).
 * Warns if called with a scope other than SCOPE_LOCAL_CPU or while
 * preemptible.  @entry is not used.
 */
bool has_spectre_v3a(const struct arm64_cpu_capabilities *entry, int scope)
{
        static const struct midr_range affected_cpus[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
                {},
        };
        bool affected;

        WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());

        affected = is_midr_in_range_list(affected_cpus);
        return affected;
}

/*
 * Per-CPU enable hook for Spectre-v3a: when this CPU has the
 * ARM64_SPECTRE_V3A capability, advance its hyp vector slot by
 * HYP_VECTOR_INDIRECT.
 */
void spectre_v3a_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
        struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);

        if (!this_cpu_has_cap(ARM64_SPECTRE_V3A))
                return;

        data->slot += HYP_VECTOR_INDIRECT;
}

/*
 * Spectre v4.
 *
 * If you thought Spectre v2 was nasty, wait until you see this mess. A CPU is
 * either:
 *
 * - Mitigated in hardware and listed in our "safe list".
 * - Mitigated in hardware via PSTATE.SSBS.
 * - Mitigated in software by firmware (sometimes referred to as SSBD).
 *
 * Wait, that doesn't sound so bad, does it? Keep reading...
 *
 * A major source of headaches is that the software mitigation is enabled both
 * on a per-task basis, but can also be forced on for the kernel, necessitating
 * both context-switch *and* entry/exit hooks. To make it even worse, some CPUs
 * allow EL0 to toggle SSBS directly, which can end up with the prctl() state
 * being stale when re-entering the kernel. The usual big.LITTLE caveats apply,
 * so you can have systems that have both firmware and SSBS mitigations. This
 * means we actually have to reject late onlining of CPUs with mitigations if
 * all of the currently onlined CPUs are safelisted, as the mitigation tends to
 * be opt-in for userspace. Yes, really, the cure is worse than the disease.
 *
 * The only good part is that if the firmware mitigation is present, then it is
 * present for all CPUs, meaning we don't have to worry about late onlining of a
 * vulnerable CPU if one of the boot CPUs is using the firmware mitigation.
 *
 * Give me a VAX-11/780 any day of the week...
 */
/* System-wide Spectre-v4 state, reported through sysfs and the getter below. */
static enum mitigation_state spectre_v4_state;

/* This is the per-cpu state tracking whether we need to talk to firmware */
DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required);

/*
 * Mitigation policy selectable with the "ssbd=" command-line option.
 * DYNAMIC is the first enumerator (value 0), so it is what the
 * zero-initialised __spectre_v4_policy holds when no option is given.
 */
enum spectre_v4_policy {
        SPECTRE_V4_POLICY_MITIGATION_DYNAMIC,
        SPECTRE_V4_POLICY_MITIGATION_ENABLED,
        SPECTRE_V4_POLICY_MITIGATION_DISABLED,
};

/* Currently selected policy; written by parse_spectre_v4_param(). */
static enum spectre_v4_policy __read_mostly __spectre_v4_policy;

/* Table mapping each accepted "ssbd=" value onto its policy. */
static const struct spectre_v4_param {
        const char                *str;
        enum spectre_v4_policy        policy;
} spectre_v4_params[] = {
        { "force-on",        SPECTRE_V4_POLICY_MITIGATION_ENABLED, },
        { "force-off",        SPECTRE_V4_POLICY_MITIGATION_DISABLED, },
        { "kernel",        SPECTRE_V4_POLICY_MITIGATION_DYNAMIC, },
};
/*
 * early_param handler for "ssbd=".  Picks the first table entry whose
 * keyword is a prefix of @str (the comparison is limited to the keyword's
 * length, so trailing characters after the keyword are not checked) and
 * stores its policy.  Returns 0 on a match and -EINVAL for a NULL, empty
 * or unrecognised value.
 */
static int __init parse_spectre_v4_param(char *str)
{
        const struct spectre_v4_param *param;
        const struct spectre_v4_param *end =
                spectre_v4_params + ARRAY_SIZE(spectre_v4_params);

        if (!str || !str[0])
                return -EINVAL;

        for (param = spectre_v4_params; param < end; param++) {
                if (strncmp(str, param->str, strlen(param->str)) == 0) {
                        __spectre_v4_policy = param->policy;
                        return 0;
                }
        }

        return -EINVAL;
}
early_param("ssbd", parse_spectre_v4_param);

/*
 * Because this was all written in a rush by people working in different silos,
 * we've ended up with multiple command line options to control the same thing.
 * Wrap these up in some helpers, which prefer disabling the mitigation if faced
 * with contradictory parameters. The mitigation is always either "off",
 * "dynamic" or "on".
 */
static bool spectre_v4_mitigations_off(void)
{
        bool ret = cpu_mitigations_off() ||
                   __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DISABLED;

        if (ret)
                pr_info_once("spectre-v4 mitigation disabled by command-line option\n");

        return ret;
}

/* Do we need to toggle the mitigation state on entry to/exit from the kernel? */
static bool spectre_v4_mitigations_dynamic(void)
{
        return !spectre_v4_mitigations_off() &&
               __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_DYNAMIC;
}

static bool spectre_v4_mitigations_on(void)
{
        return !spectre_v4_mitigations_off() &&
               __spectre_v4_policy == SPECTRE_V4_POLICY_MITIGATION_ENABLED;
}

ssize_t cpu_show_spec_store_bypass(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        switch (spectre_v4_state) {
        case SPECTRE_UNAFFECTED:
                return sprintf(buf, "Not affected\n");
        case SPECTRE_MITIGATED:
                return sprintf(buf, "Mitigation: Speculative Store Bypass disabled via prctl\n");
        case SPECTRE_VULNERABLE:
                fallthrough;
        default:
                return sprintf(buf, "Vulnerable\n");
        }
}

/* Return the current system-wide Spectre-v4 mitigation state. */
enum mitigation_state arm64_get_spectre_v4_state(void)
{
        return spectre_v4_state;
}

static enum mitigation_state spectre_v4_get_cpu_hw_mitigation_state(void)
{
        static const struct midr_range spectre_v4_safe_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
                MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
                { /* sentinel */ },
        };

        if (is_midr_in_range_list(spectre_v4_safe_list))
                return SPECTRE_UNAFFECTED;

        /* CPU features are detected first */
        if (this_cpu_has_cap(ARM64_SSBS))
                return SPECTRE_MITIGATED;

        return SPECTRE_VULNERABLE;
}

static enum mitigation_state spectre_v4_get_cpu_fw_mitigation_state(void)
{
        int ret;
        struct arm_smccc_res res;

        arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
                             ARM_SMCCC_ARCH_WORKAROUND_2, &res);

        ret = res.a0;
        switch (ret) {
        case SMCCC_RET_SUCCESS:
                return SPECTRE_MITIGATED;
        case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
                fallthrough;
        case SMCCC_RET_NOT_REQUIRED:
                return SPECTRE_UNAFFECTED;
        default:
                fallthrough;
        case SMCCC_RET_NOT_SUPPORTED:
                return SPECTRE_VULNERABLE;
        }
}

bool has_spectre_v4(const struct arm64_cpu_capabilities *cap, int scope)
{
        enum mitigation_state state;

        WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());

        state = spectre_v4_get_cpu_hw_mitigation_state();
        if (state == SPECTRE_VULNERABLE)
                state = spectre_v4_get_cpu_fw_mitigation_state();

        return state != SPECTRE_UNAFFECTED;
}

/*
 * Emulate a trapped "MSR SSBS, #imm" instruction.
 *
 * If @instr matches that encoding (ignoring the immediate bit), copy the
 * immediate into the SSBS bit of the saved PSTATE, skip the instruction and
 * return true. Otherwise leave @regs untouched and return false.
 */
bool try_emulate_el1_ssbs(struct pt_regs *regs, u32 instr)
{
        const u32 imm_bit = 1U << PSTATE_Imm_shift;
        const u32 msr_ssbs_imm0 = 0xd500401f | PSTATE_SSBS;

        if ((instr & ~imm_bit) != msr_ssbs_imm0)
                return false;

        if (instr & imm_bit)
                regs->pstate |= PSR_SSBS_BIT;
        else
                regs->pstate &= ~PSR_SSBS_BIT;

        arm64_skip_faulting_instruction(regs, 4);
        return true;
}

static enum mitigation_state spectre_v4_enable_hw_mitigation(void)
{
        enum mitigation_state state;

        /*
         * If the system is mitigated but this CPU doesn't have SSBS, then
         * we must be on the safelist and there's nothing more to do.
         */
        state = spectre_v4_get_cpu_hw_mitigation_state();
        if (state != SPECTRE_MITIGATED || !this_cpu_has_cap(ARM64_SSBS))
                return state;

        if (spectre_v4_mitigations_off()) {
                sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS);
                set_pstate_ssbs(1);
                return SPECTRE_VULNERABLE;
        }

        /* SCTLR_EL1.DSSBS was initialised to 0 during boot */
        set_pstate_ssbs(0);

        /*
         * SSBS is self-synchronizing and is intended to affect subsequent
         * speculative instructions, but some CPUs can speculate with a stale
         * value of SSBS.
         *
         * Mitigate this with an unconditional speculation barrier, as CPUs
         * could mis-speculate branches and bypass a conditional barrier.
         */
        if (IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386))
                spec_bar();

        return SPECTRE_MITIGATED;
}

/*
 * Patch a branch over the Spectre-v4 mitigation code with a NOP so that
 * we fall through and check whether firmware needs to be called on this CPU.
 */
void __init spectre_v4_patch_fw_mitigation_enable(struct alt_instr *alt,
                                                  __le32 *origptr,
                                                  __le32 *updptr, int nr_inst)
{
        BUG_ON(nr_inst != 1); /* Branch -> NOP */

        /* Leave the branch alone if mitigations are off or SSBS is used */
        if (spectre_v4_mitigations_off() || cpus_have_cap(ARM64_SSBS))
                return;

        /* Only the dynamic policy needs the firmware check */
        if (!spectre_v4_mitigations_dynamic())
                return;

        *updptr = cpu_to_le32(aarch64_insn_gen_nop());
}

/*
 * Patch a NOP in the Spectre-v4 mitigation code with an SMC/HVC instruction
 * to call into firmware to adjust the mitigation state.
 */
void __init smccc_patch_fw_mitigation_conduit(struct alt_instr *alt,
                                               __le32 *origptr,
                                               __le32 *updptr, int nr_inst)
{
        BUG_ON(nr_inst != 1); /* NOP -> HVC/SMC */

        switch (arm_smccc_1_1_get_conduit()) {
        case SMCCC_CONDUIT_HVC:
                *updptr = cpu_to_le32(aarch64_insn_get_hvc_value());
                return;
        case SMCCC_CONDUIT_SMC:
                *updptr = cpu_to_le32(aarch64_insn_get_smc_value());
                return;
        default:
                /* No usable conduit: write nothing */
                return;
        }
}

static enum mitigation_state spectre_v4_enable_fw_mitigation(void)
{
        enum mitigation_state state;

        state = spectre_v4_get_cpu_fw_mitigation_state();
        if (state != SPECTRE_MITIGATED)
                return state;

        if (spectre_v4_mitigations_off()) {
                arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, false, NULL);
                return SPECTRE_VULNERABLE;
        }

        arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_WORKAROUND_2, true, NULL);

        if (spectre_v4_mitigations_dynamic())
                __this_cpu_write(arm64_ssbd_callback_required, 1);

        return SPECTRE_MITIGATED;
}

void spectre_v4_enable_mitigation(const struct arm64_cpu_capabilities *__unused)
{
        enum mitigation_state state;

        WARN_ON(preemptible());

        state = spectre_v4_enable_hw_mitigation();
        if (state == SPECTRE_VULNERABLE)
                state = spectre_v4_enable_fw_mitigation();

        update_mitigation_state(&spectre_v4_state, state);
}

/*
 * Set (@state true) or clear (@state false) the SSBS bit in the saved PSTATE
 * of @regs, using the AArch32 bit position for compat user mode.
 */
static void __update_pstate_ssbs(struct pt_regs *regs, bool state)
{
        const u64 ssbs_bit = compat_user_mode(regs) ? PSR_AA32_SSBS_BIT : PSR_SSBS_BIT;

        if (!state) {
                regs->pstate &= ~ssbs_bit;
                return;
        }

        regs->pstate |= ssbs_bit;
}

/*
 * Update the SSBS bit in @tsk's saved PSTATE to match the current policy:
 *  - mitigations off: SSBS set;
 *  - dynamic policy, not a kernel thread: SSBS is the inverse of TIF_SSBD;
 *  - otherwise: SSBS clear.
 */
void spectre_v4_enable_task_mitigation(struct task_struct *tsk)
{
        struct pt_regs *regs = task_pt_regs(tsk);
        bool ssbs;

        if (spectre_v4_mitigations_off())
                ssbs = true;
        else if (spectre_v4_mitigations_dynamic() && !(tsk->flags & PF_KTHREAD))
                ssbs = !test_tsk_thread_flag(tsk, TIF_SSBD);
        else
                ssbs = false;

        __update_pstate_ssbs(regs, ssbs);
}

/*
 * The Spectre-v4 mitigation can be controlled via a prctl() from userspace.
 * This is interesting because the "speculation disabled" behaviour can be
 * configured so that it is preserved across exec(), which means that the
 * prctl() may be necessary even when PSTATE.SSBS can be toggled directly
 * from userspace.
 */
static void ssbd_prctl_enable_mitigation(struct task_struct *task)
{
        /* Drop any "until execve()" request; callers re-set it if needed */
        task_clear_spec_ssb_noexec(task);
        /* Record that speculation is disabled for this task ... */
        task_set_spec_ssb_disable(task);
        /* ... and set TIF_SSBD, which spectre_v4_enable_task_mitigation() reads */
        set_tsk_thread_flag(task, TIF_SSBD);
}

/*
 * Counterpart of ssbd_prctl_enable_mitigation(): clear the task's noexec and
 * disable flags and its TIF_SSBD thread flag.
 */
static void ssbd_prctl_disable_mitigation(struct task_struct *task)
{
        task_clear_spec_ssb_noexec(task);
        task_clear_spec_ssb_disable(task);
        clear_tsk_thread_flag(task, TIF_SSBD);
}

/*
 * Apply a PR_SPEC_STORE_BYPASS speculation-control request to @task.
 *
 * @ctrl is one of PR_SPEC_ENABLE, PR_SPEC_DISABLE, PR_SPEC_FORCE_DISABLE or
 * PR_SPEC_DISABLE_NOEXEC.
 *
 * Returns 0 on success, -EPERM when the request conflicts with a forced
 * per-task state or a forced command-line policy, and -ERANGE for any other
 * @ctrl value. On success the task's saved PSTATE is brought up to date via
 * spectre_v4_enable_task_mitigation().
 */
static int ssbd_prctl_set(struct task_struct *task, unsigned long ctrl)
{
        switch (ctrl) {
        case PR_SPEC_ENABLE:
                /* Enable speculation: disable mitigation */
                /*
                 * Force disabled speculation prevents it from being
                 * re-enabled.
                 */
                if (task_spec_ssb_force_disable(task))
                        return -EPERM;

                /*
                 * If the mitigation is forced on, then speculation is forced
                 * off and we again prevent it from being re-enabled.
                 */
                if (spectre_v4_mitigations_on())
                        return -EPERM;

                ssbd_prctl_disable_mitigation(task);
                break;
        case PR_SPEC_FORCE_DISABLE:
                /* Force disable speculation: force enable mitigation */
                /*
                 * If the mitigation is forced off, then speculation is forced
                 * on and we prevent it from being disabled.
                 */
                if (spectre_v4_mitigations_off())
                        return -EPERM;

                /* Mark the task first, then share the PR_SPEC_DISABLE path */
                task_set_spec_ssb_force_disable(task);
                fallthrough;
        case PR_SPEC_DISABLE:
                /* Disable speculation: enable mitigation */
                /* Same as PR_SPEC_FORCE_DISABLE */
                if (spectre_v4_mitigations_off())
                        return -EPERM;

                ssbd_prctl_enable_mitigation(task);
                break;
        case PR_SPEC_DISABLE_NOEXEC:
                /* Disable speculation until execve(): enable mitigation */
                /*
                 * If the mitigation state is forced one way or the other, then
                 * we must fail now before we try to toggle it on execve().
                 */
                if (task_spec_ssb_force_disable(task) ||
                    spectre_v4_mitigations_off() ||
                    spectre_v4_mitigations_on()) {
                        return -EPERM;
                }

                /* enable_mitigation() clears noexec, so set it afterwards */
                ssbd_prctl_enable_mitigation(task);
                task_set_spec_ssb_noexec(task);
                break;
        default:
                return -ERANGE;
        }

        spectre_v4_enable_task_mitigation(task);
        return 0;
}

/*
 * Arch hook for setting a speculation control. Only PR_SPEC_STORE_BYPASS is
 * handled here; any other @which yields -ENODEV.
 */
int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which,
                             unsigned long ctrl)
{
        if (which != PR_SPEC_STORE_BYPASS)
                return -ENODEV;

        return ssbd_prctl_set(task, ctrl);
}

static int ssbd_prctl_get(struct task_struct *task)
{
        switch (spectre_v4_state) {
        case SPECTRE_UNAFFECTED:
                return PR_SPEC_NOT_AFFECTED;
        case SPECTRE_MITIGATED:
                if (spectre_v4_mitigations_on())
                        return PR_SPEC_NOT_AFFECTED;

                if (spectre_v4_mitigations_dynamic())
                        break;

                /* Mitigations are disabled, so we're vulnerable. */
                fallthrough;
        case SPECTRE_VULNERABLE:
                fallthrough;
        default:
                return PR_SPEC_ENABLE;
        }

        /* Check the mitigation state for this task */
        if (task_spec_ssb_force_disable(task))
                return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE;

        if (task_spec_ssb_noexec(task))
                return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC;

        if (task_spec_ssb_disable(task))
                return PR_SPEC_PRCTL | PR_SPEC_DISABLE;

        return PR_SPEC_PRCTL | PR_SPEC_ENABLE;
}

/*
 * Arch hook for querying a speculation control. Only PR_SPEC_STORE_BYPASS is
 * handled here; any other @which yields -ENODEV.
 */
int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which)
{
        if (which != PR_SPEC_STORE_BYPASS)
                return -ENODEV;

        return ssbd_prctl_get(task);
}

/*
 * Spectre BHB.
 *
 * A CPU is either:
 * - Mitigated by a branchy loop a CPU specific number of times, and listed
 *   in our "loop mitigated list".
 * - Mitigated in software by the firmware Spectre v2 call.
 * - Has the ClearBHB instruction to perform the mitigation.
 * - Has the 'Exception Clears Branch History Buffer' (ECBHB) feature, so no
 *   software mitigation in the vectors is needed.
 * - Has CSV2.3, so is unaffected.
 */
static enum mitigation_state spectre_bhb_state;

/* Return the current system-wide Spectre-BHB mitigation state. */
enum mitigation_state arm64_get_spectre_bhb_state(void)
{
        return spectre_bhb_state;
}

/*
 * Bit numbers within system_bhb_mitigations. Each bit is set by
 * spectre_bhb_enable_mitigation() when some CPU selects that mitigation, and
 * tested by the alternative-patching callbacks further down.
 */
enum bhb_mitigation_bits {
        /* Branchy-loop vectors (EL1_VECTOR_BHB_LOOP) */
        BHB_LOOP,
        /* Firmware-call vectors (EL1_VECTOR_BHB_FW) */
        BHB_FW,
        /* CPU supports ECBHB; no vector change is made */
        BHB_HW,
        /* ClearBHB instruction vectors (EL1_VECTOR_BHB_CLEAR_INSN) */
        BHB_INSN,
};
/* Bitmap of the mitigations in use, indexed by enum bhb_mitigation_bits */
static unsigned long system_bhb_mitigations;

/*
 * This must be called with SCOPE_LOCAL_CPU for each type of CPU, before any
 * SCOPE_SYSTEM call will give the right answer.
 */
static bool is_spectre_bhb_safe(int scope)
{
        static const struct midr_range spectre_bhb_safe_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A35),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A53),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A55),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A510),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A520),
                MIDR_ALL_VERSIONS(MIDR_BRAHMA_B53),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_2XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_3XX_SILVER),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_SILVER),
                {},
        };
        /* Stays true until a local-scope call sees a CPU not on the list */
        static bool all_safe = true;

        /* Non-local scope: answer from what the local-scope calls recorded */
        if (scope != SCOPE_LOCAL_CPU)
                return all_safe;

        if (!is_midr_in_range_list(spectre_bhb_safe_list)) {
                all_safe = false;
                return false;
        }

        return true;
}

/*
 * Return the number of branchy-loop iterations (k) required to mitigate
 * Spectre-BHB on the calling CPU, or 0 if the CPU is not on any of the
 * loop-mitigated lists below.
 *
 * The lists are checked from the largest k to the smallest. Every list
 * handed to is_midr_in_range_list() must end with an empty sentinel entry,
 * as all the other MIDR lists in this file do; without it the lookup has no
 * terminator and can run past the end of the array.
 */
static u8 spectre_bhb_loop_affected(void)
{
        u8 k = 0;

        static const struct midr_range spectre_bhb_k132_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_X3),
                MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V2),
                {},
        };
        static const struct midr_range spectre_bhb_k38_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A715),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A720),
                {},
        };
        static const struct midr_range spectre_bhb_k32_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A78),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_X1),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_X2),
                MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2),
                MIDR_ALL_VERSIONS(MIDR_NEOVERSE_V1),
                {},
        };
        static const struct midr_range spectre_bhb_k24_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A76),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A76AE),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A77),
                MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1),
                MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD),
                MIDR_ALL_VERSIONS(MIDR_HISI_HIP09),
                {},
        };
        static const struct midr_range spectre_bhb_k11_list[] = {
                MIDR_ALL_VERSIONS(MIDR_AMPERE1),
                {},
        };
        static const struct midr_range spectre_bhb_k8_list[] = {
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A72),
                MIDR_ALL_VERSIONS(MIDR_CORTEX_A57),
                {},
        };

        if (is_midr_in_range_list(spectre_bhb_k132_list))
                k = 132;
        else if (is_midr_in_range_list(spectre_bhb_k38_list))
                k = 38;
        else if (is_midr_in_range_list(spectre_bhb_k32_list))
                k = 32;
        else if (is_midr_in_range_list(spectre_bhb_k24_list))
                k = 24;
        else if (is_midr_in_range_list(spectre_bhb_k11_list))
                k = 11;
        else if (is_midr_in_range_list(spectre_bhb_k8_list))
                k =  8;

        return k;
}

static enum mitigation_state spectre_bhb_get_cpu_fw_mitigation_state(void)
{
        int ret;
        struct arm_smccc_res res;

        arm_smccc_1_1_invoke(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
                             ARM_SMCCC_ARCH_WORKAROUND_3, &res);

        ret = res.a0;
        switch (ret) {
        case SMCCC_RET_SUCCESS:
                return SPECTRE_MITIGATED;
        case SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED:
                return SPECTRE_UNAFFECTED;
        default:
                fallthrough;
        case SMCCC_RET_NOT_SUPPORTED:
                return SPECTRE_VULNERABLE;
        }
}

static bool has_spectre_bhb_fw_mitigation(void)
{
        enum mitigation_state fw_state;
        bool has_smccc = arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_NONE;

        fw_state = spectre_bhb_get_cpu_fw_mitigation_state();
        return has_smccc && fw_state == SPECTRE_MITIGATED;
}

/*
 * True when the ECBHB field of ID_AA64MMFR1_EL1 is non-zero. For
 * SCOPE_LOCAL_CPU the register is read directly on this CPU; for any other
 * scope the sanitised copy is used.
 */
static bool supports_ecbhb(int scope)
{
        const u64 mmfr1 = (scope == SCOPE_LOCAL_CPU) ?
                          read_sysreg_s(SYS_ID_AA64MMFR1_EL1) :
                          read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);

        return cpuid_feature_extract_unsigned_field(mmfr1,
                                                    ID_AA64MMFR1_EL1_ECBHB_SHIFT) != 0;
}

/* Largest loop count seen on any CPU; consumed by spectre_bhb_patch_loop_iter() */
static u8 max_bhb_k;

/*
 * Report whether this CPU is affected by Spectre-BHB. A CPU with CSV2.3 or
 * on the safe list is unaffected; everything else is assumed vulnerable.
 * Must be called with SCOPE_LOCAL_CPU and preemption disabled, otherwise a
 * warning is raised.
 */
bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry,
                             int scope)
{
        WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible());

        if (supports_csv2p3(scope) || is_spectre_bhb_safe(scope))
                return false;

        /*
         * At this point the core isn't known to be "safe", so we assume it
         * is vulnerable. We still need to update `max_bhb_k`, but only if
         * we aren't mitigating with clearbhb.
         */
        if (scope == SCOPE_LOCAL_CPU && !supports_clearbhb(SCOPE_LOCAL_CPU)) {
                const u8 cpu_k = spectre_bhb_loop_affected();

                max_bhb_k = (cpu_k > max_bhb_k) ? cpu_k : max_bhb_k;
        }

        return true;
}

/*
 * Select the exception vectors identified by @slot for the calling CPU:
 * record them in the per-cpu this_cpu_vector and, unless KPTI is in use,
 * install them in VBAR_EL1 immediately.
 */
static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot)
{
        const char *vectors = arm64_get_bp_hardening_vector(slot);

        __this_cpu_write(this_cpu_vector, vectors);

        /*
         * When KPTI is in use, the vectors are switched when exiting to
         * user-space, so VBAR_EL1 is only written here without it.
         */
        if (!cpus_have_cap(ARM64_UNMAP_KERNEL_AT_EL0)) {
                write_sysreg(vectors, vbar_el1);
                isb();
        }
}

/* Set when "nospectre_bhb" is given on the command line */
static bool __read_mostly __nospectre_bhb;
/*
 * Handle the "nospectre_bhb" early parameter. The argument string is
 * ignored: the mere presence of the parameter disables the mitigation.
 * Always returns 0.
 */
static int __init parse_spectre_bhb_param(char *str)
{
        __nospectre_bhb = true;
        return 0;
}
early_param("nospectre_bhb", parse_spectre_bhb_param);

/*
 * Enable the Spectre-BHB mitigation on the calling CPU.
 *
 * Returns without touching any state if this CPU is not affected. Otherwise
 * the first applicable entry of the chain below wins:
 *  - Spectre-v2 is vulnerable, the mitigation is compiled out, or it is
 *    disabled on the command line: nothing is enabled and the CPU is
 *    recorded as vulnerable;
 *  - ECBHB is supported: no vector change is made (BHB_HW);
 *  - ClearBHB is supported: use the ClearBHB vectors (BHB_INSN);
 *  - the CPU is on a loop-mitigated list: use the loop vectors (BHB_LOOP);
 *  - firmware provides the workaround: use the firmware vectors (BHB_FW).
 * If none applies the CPU is recorded as vulnerable. The chosen mitigation
 * is noted in system_bhb_mitigations and the outcome is folded into
 * spectre_bhb_state.
 */
void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry)
{
        bp_hardening_cb_t cpu_cb;
        enum mitigation_state state = SPECTRE_VULNERABLE;
        struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data);

        if (!is_spectre_bhb_affected(entry, SCOPE_LOCAL_CPU))
                return;

        if (arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) {
                /* No point mitigating Spectre-BHB alone. */
        } else if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY)) {
                pr_info_once("spectre-bhb mitigation disabled by compile time option\n");
        } else if (cpu_mitigations_off() || __nospectre_bhb) {
                pr_info_once("spectre-bhb mitigation disabled by command line option\n");
        } else if (supports_ecbhb(SCOPE_LOCAL_CPU)) {
                state = SPECTRE_MITIGATED;
                set_bit(BHB_HW, &system_bhb_mitigations);
        } else if (supports_clearbhb(SCOPE_LOCAL_CPU)) {
                /*
                 * Ensure KVM uses the indirect vector which will have ClearBHB
                 * added.
                 */
                if (!data->slot)
                        data->slot = HYP_VECTOR_INDIRECT;

                this_cpu_set_vectors(EL1_VECTOR_BHB_CLEAR_INSN);
                state = SPECTRE_MITIGATED;
                set_bit(BHB_INSN, &system_bhb_mitigations);
        } else if (spectre_bhb_loop_affected()) {
                /*
                 * Ensure KVM uses the indirect vector which will have the
                 * branchy-loop added. A57/A72-r0 will already have selected
                 * the spectre-indirect vector, which is sufficient for BHB
                 * too.
                 */
                if (!data->slot)
                        data->slot = HYP_VECTOR_INDIRECT;

                this_cpu_set_vectors(EL1_VECTOR_BHB_LOOP);
                state = SPECTRE_MITIGATED;
                set_bit(BHB_LOOP, &system_bhb_mitigations);
        } else if (has_spectre_bhb_fw_mitigation()) {
                /*
                 * Ensure KVM uses one of the spectre bp_hardening
                 * vectors. The indirect vector doesn't include the EL3
                 * call, so needs upgrading to
                 * HYP_VECTOR_SPECTRE_INDIRECT.
                 */
                if (!data->slot || data->slot == HYP_VECTOR_INDIRECT)
                        data->slot += 1;

                this_cpu_set_vectors(EL1_VECTOR_BHB_FW);

                /*
                 * The WA3 call in the vectors supersedes the WA1 call
                 * made during context-switch. Uninstall any firmware
                 * bp_hardening callback.
                 */
                cpu_cb = spectre_v2_get_sw_mitigation_cb();
                if (__this_cpu_read(bp_hardening_data.fn) != cpu_cb)
                        __this_cpu_write(bp_hardening_data.fn, NULL);

                state = SPECTRE_MITIGATED;
                set_bit(BHB_FW, &system_bhb_mitigations);
        }

        update_mitigation_state(&spectre_bhb_state, state);
}

/*
 * Alternative callback: the single instruction is replaced with a NOP when
 * the loop mitigation (BHB_LOOP) is in use, and left untouched otherwise.
 */
void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt,
                                                      __le32 *origptr,
                                                      __le32 *updptr, int nr_inst)
{
        BUG_ON(nr_inst != 1);

        if (!test_bit(BHB_LOOP, &system_bhb_mitigations))
                return;

        *updptr = cpu_to_le32(aarch64_insn_gen_nop());
}

/*
 * Alternative callback: the single instruction is replaced with a NOP when
 * the firmware mitigation (BHB_FW) is in use, and left untouched otherwise.
 */
void noinstr spectre_bhb_patch_fw_mitigation_enabled(struct alt_instr *alt,
                                                     __le32 *origptr,
                                                     __le32 *updptr, int nr_inst)
{
        BUG_ON(nr_inst != 1);

        if (!test_bit(BHB_FW, &system_bhb_mitigations))
                return;

        *updptr = cpu_to_le32(aarch64_insn_gen_nop());
}

/*
 * Alternative callback: rewrite the MOV so that it loads max_bhb_k into the
 * destination register used by the original instruction. Does nothing when
 * CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY is disabled.
 */
void noinstr spectre_bhb_patch_loop_iter(struct alt_instr *alt,
                                   __le32 *origptr, __le32 *updptr, int nr_inst)
{
        u32 mov_insn;
        u8 dst_reg;

        BUG_ON(nr_inst != 1); /* MOV -> MOV */

        if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY))
                return;

        /* Keep the destination register of the instruction being replaced */
        dst_reg = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD,
                                               le32_to_cpu(*origptr));
        mov_insn = aarch64_insn_gen_movewide(dst_reg, max_bhb_k, 0,
                                             AARCH64_INSN_VARIANT_64BIT,
                                             AARCH64_INSN_MOVEWIDE_ZERO);
        *updptr = cpu_to_le32(mov_insn);
}

/*
 * Alternative callback: when the firmware mitigation (BHB_FW) is in use,
 * rewrite the MOV so that it loads ARM_SMCCC_ARCH_WORKAROUND_3 into the
 * destination register of the original instruction. The replacement is
 * generated as a 32-bit ORR of the zero register with that immediate.
 * Nothing is written if the mitigation is compiled out, BHB_FW is not set,
 * or the instruction cannot be generated.
 */
void noinstr spectre_bhb_patch_wa3(struct alt_instr *alt,
                                   __le32 *origptr, __le32 *updptr, int nr_inst)
{
        u32 mov_insn;
        u8 dst_reg;

        BUG_ON(nr_inst != 1); /* MOV -> MOV */

        if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY))
                return;
        if (!test_bit(BHB_FW, &system_bhb_mitigations))
                return;

        dst_reg = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD,
                                               le32_to_cpu(*origptr));

        mov_insn = aarch64_insn_gen_logical_immediate(AARCH64_INSN_LOGIC_ORR,
                                                      AARCH64_INSN_VARIANT_32BIT,
                                                      AARCH64_INSN_REG_ZR, dst_reg,
                                                      ARM_SMCCC_ARCH_WORKAROUND_3);
        if (WARN_ON_ONCE(mov_insn == AARCH64_BREAK_FAULT))
                return;

        *updptr = cpu_to_le32(mov_insn);
}

/*
 * Alternative callback: unless the ClearBHB mitigation (BHB_INSN) is in use,
 * both instructions of the sequence are replaced with NOPs.
 */
void __init spectre_bhb_patch_clearbhb(struct alt_instr *alt,
                                   __le32 *origptr, __le32 *updptr, int nr_inst)
{
        int i;

        BUG_ON(nr_inst != 2);

        if (test_bit(BHB_INSN, &system_bhb_mitigations))
                return;

        for (i = 0; i < 2; i++)
                updptr[i] = cpu_to_le32(aarch64_insn_gen_nop());
}

#ifdef CONFIG_BPF_SYSCALL
#define EBPF_WARN "Unprivileged eBPF is enabled, data leaks possible via Spectre v2 BHB attacks!\n"
/*
 * Called with the new unprivileged-eBPF state. Logs a warning when
 * @new_state is zero, but only if Spectre-v2 is not vulnerable and
 * Spectre-BHB is mitigated; otherwise stays silent.
 */
void unpriv_ebpf_notify(int new_state)
{
        const bool relevant = spectre_v2_state != SPECTRE_VULNERABLE &&
                              spectre_bhb_state == SPECTRE_MITIGATED;

        if (relevant && !new_state)
                pr_err("WARNING: %s", EBPF_WARN);
}
#endif































    3 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_STRUCT_H
#define _LINUX_FS_STRUCT_H

#include <linux/path.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>

/*
 * Filesystem context: a root path and a working-directory path plus a few
 * bookkeeping fields. The helpers below take @lock around reads of @root
 * and @pwd.
 */
struct fs_struct {
        /* NOTE(review): presumably a count of users sharing this struct - confirm */
        int users;
        /* Held by get_fs_root()/get_fs_pwd() while copying root/pwd */
        spinlock_t lock;
        /* NOTE(review): sequence counter, presumably paired with @lock - confirm */
        seqcount_spinlock_t seq;
        /* NOTE(review): presumably the file mode creation mask - confirm */
        int umask;
        /* NOTE(review): presumably set while an exec is in progress - confirm */
        int in_exec;
        /* Root directory and current working directory */
        struct path root, pwd;
} __randomize_layout;

extern struct kmem_cache *fs_cachep;

extern void exit_fs(struct task_struct *);
extern void set_fs_root(struct fs_struct *, const struct path *);
extern void set_fs_pwd(struct fs_struct *, const struct path *);
extern struct fs_struct *copy_fs_struct(struct fs_struct *);
extern void free_fs_struct(struct fs_struct *);
extern int unshare_fs_struct(void);

/*
 * Copy @fs's root path into @root and call path_get() on the copy, all
 * under fs->lock.
 * NOTE(review): the caller presumably has to drop the reference with
 * path_put() - confirm.
 */
static inline void get_fs_root(struct fs_struct *fs, struct path *root)
{
        spin_lock(&fs->lock);
        *root = fs->root;
        path_get(root);
        spin_unlock(&fs->lock);
}

/*
 * Copy @fs's working-directory path into @pwd and call path_get() on the
 * copy, all under fs->lock.
 * NOTE(review): the caller presumably has to drop the reference with
 * path_put() - confirm.
 */
static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
{
        spin_lock(&fs->lock);
        *pwd = fs->pwd;
        path_get(pwd);
        spin_unlock(&fs->lock);
}

extern bool current_chrooted(void);

#endif /* _LINUX_FS_STRUCT_H */


































































































































































































































































































































  491 







































  221 





  215 





  204 

















  306 























































































































   69 

















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* audit.h -- Auditing support
 *
 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 */
#ifndef _LINUX_AUDIT_H_
#define _LINUX_AUDIT_H_

#include <linux/sched.h>
#include <linux/ptrace.h>
#include <linux/audit_arch.h>
#include <uapi/linux/audit.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/fanotify.h>

#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)

/*
 * Variable-length record: a uid, a pid and a trailing context buffer.
 * NOTE(review): presumably describes the sender of a signal - confirm
 * against the users of this struct.
 */
struct audit_sig_info {
        uid_t                uid;
        pid_t                pid;
        /* Flexible array member; its length is determined by the allocator */
        char                ctx[];
};

struct audit_buffer;
struct audit_context;
struct inode;
struct netlink_skb_parms;
struct path;
struct linux_binprm;
struct mq_attr;
struct mqstat;
struct audit_watch;
struct audit_tree;
struct sk_buff;
struct kern_ipc_perm;

/*
 * One audit rule as held by the kernel: list/action/flag words, a syscall
 * bitmask, the array of match fields (with shortcuts to the arch and inode
 * fields), and the watch, tree and exe mark objects the rule may be tied to.
 * The two list_head members link the rule into separate lists; see the
 * per-member comments.
 */
struct audit_krule {
        u32                        pflags;
        u32                        flags;
        u32                        listnr;
        u32                        action;
        u32                        mask[AUDIT_BITMASK_SIZE];
        u32                        buflen; /* for data alloc on list rules */
        u32                        field_count;
        char                        *filterkey; /* ties events to rules */
        struct audit_field        *fields;
        struct audit_field        *arch_f; /* quick access to arch field */
        struct audit_field        *inode_f; /* quick access to an inode field */
        struct audit_watch        *watch;        /* associated watch */
        struct audit_tree        *tree;        /* associated watched tree */
        struct audit_fsnotify_mark        *exe;
        struct list_head        rlist;        /* entry in audit_{watch,tree}.rules list */
        struct list_head        list;        /* for AUDIT_LIST* purposes only */
        u64                        prio;
};

/* Flag to indicate legacy AUDIT_LOGINUID unset usage */
#define AUDIT_LOGINUID_LEGACY                0x1

/*
 * A single match term of an audit rule: a field type, a comparison operator
 * and the value to compare against.  The value lives in an anonymous union
 * and is either a plain u32, a kuid_t, a kgid_t, or an LSM string/rule pair.
 * NOTE(review): which union member is valid presumably follows from 'type'
 * -- confirm against the rule parsing code.
 */
struct audit_field {
        u32                                type;
        union {
                u32                        val;
                kuid_t                        uid;
                kgid_t                        gid;
                struct {
                        char                *lsm_str;
                        void                *lsm_rule;
                };
        };
        u32                                op;
};

/*
 * Index of each NTP parameter tracked in struct audit_ntp_data.  The final
 * enumerator is not a parameter: it is the number of entries and is used to
 * size the vals[] array.
 */
enum audit_ntp_type {
        AUDIT_NTP_OFFSET,
        AUDIT_NTP_FREQ,
        AUDIT_NTP_STATUS,
        AUDIT_NTP_TAI,
        AUDIT_NTP_TICK,
        AUDIT_NTP_ADJUST,

        AUDIT_NTP_NVALS /* count */
};

#ifdef CONFIG_AUDITSYSCALL
/* Before/after values of one NTP parameter. */
struct audit_ntp_val {
        long long oldval, newval;
};

/* One audit_ntp_val slot per enum audit_ntp_type entry. */
struct audit_ntp_data {
        struct audit_ntp_val vals[AUDIT_NTP_NVALS];
};
#else
/* Empty placeholder so the type still exists without CONFIG_AUDITSYSCALL. */
struct audit_ntp_data {};
#endif

/*
 * Operation codes accepted as the 'op' argument of audit_log_nfcfg().
 * NOTE(review): the XT_ and NFT_ prefixes presumably distinguish x_tables
 * from nf_tables operations -- confirm with the netfilter callers.
 * AUDIT_NFT_OP_INVALID is the last enumerator.
 */
enum audit_nfcfgop {
        AUDIT_XT_OP_REGISTER,
        AUDIT_XT_OP_REPLACE,
        AUDIT_XT_OP_UNREGISTER,
        AUDIT_NFT_OP_TABLE_REGISTER,
        AUDIT_NFT_OP_TABLE_UNREGISTER,
        AUDIT_NFT_OP_CHAIN_REGISTER,
        AUDIT_NFT_OP_CHAIN_UNREGISTER,
        AUDIT_NFT_OP_RULE_REGISTER,
        AUDIT_NFT_OP_RULE_UNREGISTER,
        AUDIT_NFT_OP_SET_REGISTER,
        AUDIT_NFT_OP_SET_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_REGISTER,
        AUDIT_NFT_OP_SETELEM_UNREGISTER,
        AUDIT_NFT_OP_GEN_REGISTER,
        AUDIT_NFT_OP_OBJ_REGISTER,
        AUDIT_NFT_OP_OBJ_UNREGISTER,
        AUDIT_NFT_OP_OBJ_RESET,
        AUDIT_NFT_OP_FLOWTABLE_REGISTER,
        AUDIT_NFT_OP_FLOWTABLE_UNREGISTER,
        AUDIT_NFT_OP_SETELEM_RESET,
        AUDIT_NFT_OP_RULE_RESET,
        AUDIT_NFT_OP_INVALID,
};

/*
 * Syscall classification interface.  audit_register_class() is __init, so it
 * may only be called during boot.  The arrays below are declared with
 * unknown bound; their definitions live elsewhere.
 */
extern int __init audit_register_class(int class, unsigned *list);
extern int audit_classify_syscall(int abi, unsigned syscall);
extern int audit_classify_arch(int arch);
/* only for compat system calls */
extern unsigned compat_write_class[];
extern unsigned compat_read_class[];
extern unsigned compat_dir_class[];
extern unsigned compat_chattr_class[];
extern unsigned compat_signal_class[];

/* audit_names->type values */
#define        AUDIT_TYPE_UNKNOWN        0        /* we don't know yet */
#define        AUDIT_TYPE_NORMAL        1        /* a "normal" audit record */
#define        AUDIT_TYPE_PARENT        2        /* a parent audit record */
#define        AUDIT_TYPE_CHILD_DELETE 3        /* a child being deleted */
#define        AUDIT_TYPE_CHILD_CREATE 4        /* a child being created */

/* maximum number of arguments that audit_socketcall can process */
#define AUDITSC_ARGS                6

/* bit values for ->signal->audit_tty */
#define AUDIT_TTY_ENABLE        BIT(0)
#define AUDIT_TTY_LOG_PASSWD        BIT(1)

struct filename;

#define AUDIT_OFF        0
#define AUDIT_ON        1
#define AUDIT_LOCKED        2
#ifdef CONFIG_AUDIT
/* These are defined in audit.c */
                                /* Public API */
/*
 * Record-building interface.  audit_log() and audit_log_format() take a
 * printf-style format; the __printf() annotations let the compiler check the
 * arguments against it.  audit_log_start() returns an audit_buffer that the
 * audit_log_*() helpers below take as their first argument and that is
 * handed to audit_log_end() when the record is complete.
 */
extern __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...);

extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
extern __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...);
extern void                    audit_log_end(struct audit_buffer *ab);
extern bool                    audit_string_contains_control(const char *string,
                                                          size_t len);
/* Length-bounded and string emitters; the caller passes the byte count. */
extern void                    audit_log_n_hex(struct audit_buffer *ab,
                                          const unsigned char *buf,
                                          size_t len);
extern void                    audit_log_n_string(struct audit_buffer *ab,
                                               const char *buf,
                                               size_t n);
extern void                    audit_log_n_untrustedstring(struct audit_buffer *ab,
                                                        const char *string,
                                                        size_t n);
extern void                    audit_log_untrustedstring(struct audit_buffer *ab,
                                                      const char *string);
extern void                    audit_log_d_path(struct audit_buffer *ab,
                                             const char *prefix,
                                             const struct path *path);
extern void                    audit_log_key(struct audit_buffer *ab,
                                          char *key);
/* These two do not take an audit_buffer. */
extern void                    audit_log_path_denied(int type,
                                                  const char *operation);
extern void                    audit_log_lost(const char *message);

extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab);

extern int                    audit_update_lsm_rules(void);

                                /* Private API (for audit.c only) */
extern int audit_rule_change(int type, int seq, void *data, size_t datasz);
extern int audit_list_rules_send(struct sk_buff *request_skb, int seq);

extern int audit_set_loginuid(kuid_t loginuid);

/* Return the login uid stored in @tsk. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        kuid_t loginuid = tsk->loginuid;

        return loginuid;
}

/* Return the audit session id stored in @tsk. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        unsigned int sessionid = tsk->sessionid;

        return sessionid;
}

extern u32 audit_enabled;

extern int audit_signal_info(int sig, struct task_struct *t);

#else /* CONFIG_AUDIT */
/*
 * Without CONFIG_AUDIT the logging API is reduced to inline stubs: the void
 * helpers do nothing, audit_log_start() never hands out a buffer and
 * audit_log_task_context() reports 0.
 */
static inline __printf(4, 5)
void audit_log(struct audit_context *ctx, gfp_t gfp_mask,
               int type, const char *fmt, ...)
{
}

static inline struct audit_buffer *
audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type)
{
        return NULL;
}

static inline __printf(2, 3)
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
}

static inline void audit_log_end(struct audit_buffer *ab)
{
}

static inline void
audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len)
{
}

static inline void
audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n)
{
}

static inline void
audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
                            size_t n)
{
}

static inline void
audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
}

static inline void
audit_log_d_path(struct audit_buffer *ab, const char *prefix,
                 const struct path *path)
{
}

static inline void audit_log_key(struct audit_buffer *ab, char *key)
{
}

static inline void audit_log_path_denied(int type, const char *operation)
{
}

static inline int audit_log_task_context(struct audit_buffer *ab)
{
        return 0;
}

static inline void audit_log_task_info(struct audit_buffer *ab)
{
}

/* Without CONFIG_AUDIT no task has a login uid: always INVALID_UID. */
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
        kuid_t unset = INVALID_UID;

        return unset;
}

/* Without CONFIG_AUDIT no task has a session: always AUDIT_SID_UNSET. */
static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
        unsigned int unset = AUDIT_SID_UNSET;

        return unset;
}

#define audit_enabled AUDIT_OFF

/* Stub used without CONFIG_AUDIT: ignores both arguments and returns 0. */
static inline int
audit_signal_info(int sig, struct task_struct *t)
{
        return 0;
}

#endif /* CONFIG_AUDIT */

/*
 * audit_is_compat(arch): with CONFIG_AUDIT_COMPAT_GENERIC an arch value is
 * treated as compat when its __AUDIT_ARCH_64BIT bit is clear; without that
 * option the macro is the constant false and 'arch' is not evaluated.
 */
#ifdef CONFIG_AUDIT_COMPAT_GENERIC
#define audit_is_compat(arch)  (!((arch) & __AUDIT_ARCH_64BIT))
#else
#define audit_is_compat(arch)  false
#endif

#define AUDIT_INODE_PARENT        1        /* dentry represents the parent */
#define AUDIT_INODE_HIDDEN        2        /* audit record should be hidden */
#define AUDIT_INODE_NOEVAL        4        /* audit record incomplete */

#ifdef CONFIG_AUDITSYSCALL
#include <asm/syscall.h> /* for syscall_get_arch() */

/* These are defined in auditsc.c */
                                /* Public API */
/*
 * Out-of-line entry points.  The __audit_*() functions are normally reached
 * through the inline audit_*() wrappers further down in this header, which
 * test the current task's audit context before calling them.
 */
extern int  audit_alloc(struct task_struct *task);
extern void __audit_free(struct task_struct *task);
extern void __audit_uring_entry(u8 op);
extern void __audit_uring_exit(int success, long code);
extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                                  unsigned long a2, unsigned long a3);
extern void __audit_syscall_exit(int ret_success, long ret_value);
extern struct filename *__audit_reusename(const __user char *uptr);
extern void __audit_getname(struct filename *name);
extern void __audit_inode(struct filename *name, const struct dentry *dentry,
                                unsigned int flags);
extern void __audit_file(const struct file *);
extern void __audit_inode_child(struct inode *parent,
                                const struct dentry *dentry,
                                const unsigned char type);
/* The seccomp hooks have no inline wrapper and are called directly. */
extern void audit_seccomp(unsigned long syscall, long signr, int code);
extern void audit_seccomp_actions_logged(const char *names,
                                         const char *old_names, int res);
extern void __audit_ptrace(struct task_struct *t);

/*
 * Make @ctx the audit context of @task.  The previous pointer is simply
 * overwritten; nothing is released here.
 */
static inline void audit_set_context(struct task_struct *task,
                                     struct audit_context *ctx)
{
        task->audit_context = ctx;
}

/* Return the audit context of the currently running task (may be NULL). */
static inline struct audit_context *audit_context(void)
{
        struct audit_context *ctx = current->audit_context;

        return ctx;
}

static inline bool audit_dummy_context(void)
{
        void *p = audit_context();
        return !p || *(int *)p;
}
/* Call __audit_free() for @task, but only if it carries an audit context. */
static inline void audit_free(struct task_struct *task)
{
        if (likely(!task->audit_context))
                return;

        __audit_free(task);
}
/*
 * io_uring operation entry hook: forwards @op to __audit_uring_entry() when
 * the current task has an audit context and auditing is enabled.
 *
 * The context is deliberately tested before audit_enabled: most Linux
 * systems (as of ~2021) run systemd, which forces audit to be enabled
 * regardless of the user's audit configuration, so the context test is the
 * one that usually short-circuits.
 */
static inline void audit_uring_entry(u8 op)
{
        if (likely(!audit_context() || !audit_enabled))
                return;

        __audit_uring_entry(op);
}
/*
 * io_uring operation exit hook: passes @success and @code on to
 * __audit_uring_exit() when the current task has an audit context.
 */
static inline void audit_uring_exit(int success, long code)
{
        if (likely(!audit_context()))
                return;

        __audit_uring_exit(success, code);
}
/*
 * Syscall entry hook: hands the syscall number @major and its first four
 * arguments to __audit_syscall_entry() when the current task has an audit
 * context; otherwise does nothing.
 */
static inline void audit_syscall_entry(int major, unsigned long a0,
                                       unsigned long a1, unsigned long a2,
                                       unsigned long a3)
{
        if (likely(!audit_context()))
                return;

        __audit_syscall_entry(major, a0, a1, a2, a3);
}
/*
 * Syscall exit hook: when the current task has an audit context, derive the
 * success flag and the return value from @pt_regs and report both through
 * __audit_syscall_exit().
 */
static inline void audit_syscall_exit(void *pt_regs)
{
        int ok;
        long rc;

        if (likely(!audit_context()))
                return;

        ok = is_syscall_success(pt_regs);
        rc = regs_return_value(pt_regs);
        __audit_syscall_exit(ok, rc);
}
/*
 * Return whatever __audit_reusename() yields for the user pointer @name, or
 * NULL without calling it when the current audit context is a dummy.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        if (likely(audit_dummy_context()))
                return NULL;

        return __audit_reusename(name);
}
/* Pass @name to __audit_getname() unless the audit context is a dummy. */
static inline void audit_getname(struct filename *name)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_getname(name);
}
/*
 * Forward @name, @dentry and the AUDIT_INODE_* bits in @aflags to
 * __audit_inode() unless the current audit context is a dummy.
 */
static inline void audit_inode(struct filename *name,
                               const struct dentry *dentry,
                               unsigned int aflags)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_inode(name, dentry, aflags);
}
/* Pass @file to __audit_file() unless the audit context is a dummy. */
static inline void audit_file(struct file *file)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_file(file);
}
/*
 * Same as audit_inode() with the flags fixed to
 * AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN: @dentry is the parent and the
 * resulting record is marked hidden.
 */
static inline void audit_inode_parent_hidden(struct filename *name,
                                             const struct dentry *dentry)
{
        const unsigned int aflags = AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN;

        if (likely(audit_dummy_context()))
                return;

        __audit_inode(name, dentry, aflags);
}
/*
 * Forward a parent inode, the child @dentry and an AUDIT_TYPE_* value in
 * @type to __audit_inode_child() unless the audit context is a dummy.
 */
static inline void audit_inode_child(struct inode *parent,
                                     const struct dentry *dentry,
                                     const unsigned char type)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_inode_child(parent, dentry, type);
}
void audit_core_dumps(long signr);

/* Pass task @t to __audit_ptrace() unless the audit context is a dummy. */
static inline void audit_ptrace(struct task_struct *t)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_ptrace(t);
}

                                /* Private API (for audit.c only) */
/*
 * Out-of-line workers behind the inline audit_*() wrappers defined below.
 * Most wrappers call these only when the current audit context is not a
 * dummy; audit_log_nfcfg() instead keys off audit_enabled.
 */
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
extern int __audit_socketcall(int nargs, unsigned long *args);
extern int __audit_sockaddr(int len, void *addr);
extern void __audit_fd_pair(int fd1, int fd2);
extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr);
extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout);
extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification);
extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat);
extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                  const struct cred *new,
                                  const struct cred *old);
extern void __audit_log_capset(const struct cred *new, const struct cred *old);
extern void __audit_mmap_fd(int fd, int flags);
extern void __audit_openat2_how(struct open_how *how);
extern void __audit_log_kern_module(char *name);
extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar);
extern void __audit_tk_injoffset(struct timespec64 offset);
extern void __audit_ntp_log(const struct audit_ntp_data *ad);
extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                              enum audit_nfcfgop op, gfp_t gfp);

/* Pass @ipcp to __audit_ipc_obj() unless the audit context is a dummy. */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_ipc_obj(ipcp);
}
/*
 * Pass the two descriptors to __audit_fd_pair() unless the audit context is
 * a dummy.
 */
static inline void audit_fd_pair(int fd1, int fd2)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_fd_pair(fd1, fd2);
}
/*
 * Forward the four values @qbytes, @uid, @gid and @mode to
 * __audit_ipc_set_perm() unless the audit context is a dummy.
 */
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
                                      gid_t gid, umode_t mode)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_ipc_set_perm(qbytes, uid, gid, mode);
}
/* Pass @bprm to __audit_bprm() unless the audit context is a dummy. */
static inline void audit_bprm(struct linux_binprm *bprm)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_bprm(bprm);
}
/*
 * Return the result of __audit_socketcall() for the @nargs values in @args,
 * or 0 without calling it when the current audit context is a dummy.
 */
static inline int audit_socketcall(int nargs, unsigned long *args)
{
        if (likely(audit_dummy_context()))
                return 0;

        return __audit_socketcall(nargs, args);
}

/*
 * Compat flavour of audit_socketcall(): widen each 32-bit argument in @args
 * to unsigned long and hand the widened copy to __audit_socketcall().
 *
 * Returns 0 without doing anything when the current audit context is a
 * dummy, -EINVAL when @nargs is larger than the AUDITSC_ARGS slots of the
 * on-stack copy, and otherwise whatever __audit_socketcall() returns.
 */
static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        unsigned long a[AUDITSC_ARGS];
        int i;

        if (audit_dummy_context())
                return 0;

        /*
         * Refuse oversized requests before copying: the loop below would
         * otherwise write past the end of a[].
         */
        if (nargs > AUDITSC_ARGS)
                return -EINVAL;

        for (i = 0; i < nargs; i++)
                a[i] = (unsigned long)args[i];
        return __audit_socketcall(nargs, a);
}

/*
 * Return the result of __audit_sockaddr() for the @len bytes at @addr, or 0
 * without calling it when the current audit context is a dummy.
 */
static inline int audit_sockaddr(int len, void *addr)
{
        if (likely(audit_dummy_context()))
                return 0;

        return __audit_sockaddr(len, addr);
}
/*
 * Forward @oflag, @mode and @attr to __audit_mq_open() unless the audit
 * context is a dummy.
 */
static inline void audit_mq_open(int oflag, umode_t mode,
                                 struct mq_attr *attr)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_open(oflag, mode, attr);
}
/*
 * Forward the descriptor, message length, priority and timeout to
 * __audit_mq_sendrecv() unless the audit context is a dummy.
 */
static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len,
                                     unsigned int msg_prio,
                                     const struct timespec64 *abs_timeout)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout);
}
/*
 * Forward @mqdes and @notification to __audit_mq_notify() unless the audit
 * context is a dummy.
 */
static inline void audit_mq_notify(mqd_t mqdes,
                                   const struct sigevent *notification)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_notify(mqdes, notification);
}
/*
 * Forward @mqdes and @mqstat to __audit_mq_getsetattr() unless the audit
 * context is a dummy.
 */
static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mq_getsetattr(mqdes, mqstat);
}

/*
 * Return the result of __audit_log_bprm_fcaps() for @bprm and the two
 * credential sets, or 0 without calling it when the audit context is a
 * dummy.
 */
static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm,
                                       const struct cred *new,
                                       const struct cred *old)
{
        if (likely(audit_dummy_context()))
                return 0;

        return __audit_log_bprm_fcaps(bprm, new, old);
}

/*
 * Forward the @new and @old credentials to __audit_log_capset() unless the
 * audit context is a dummy.
 */
static inline void audit_log_capset(const struct cred *new,
                                    const struct cred *old)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_log_capset(new, old);
}

/*
 * Forward @fd and @flags to __audit_mmap_fd() unless the audit context is a
 * dummy.
 */
static inline void audit_mmap_fd(int fd, int flags)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_mmap_fd(fd, flags);
}

/* Pass @how to __audit_openat2_how() unless the audit context is a dummy. */
static inline void audit_openat2_how(struct open_how *how)
{
        if (likely(audit_dummy_context()))
                return;

        __audit_openat2_how(how);
}

/*
 * Pass the module @name to __audit_log_kern_module() unless the audit
 * context is a dummy.
 */
static inline void audit_log_kern_module(char *name)
{
        if (audit_dummy_context())
                return;

        __audit_log_kern_module(name);
}

/*
 * Forward @response and @friar to __audit_fanotify() unless the audit
 * context is a dummy.
 */
static inline void audit_fanotify(u32 response,
                                  struct fanotify_response_info_audit_rule *friar)
{
        if (audit_dummy_context())
                return;

        __audit_fanotify(response, friar);
}

static inline void audit_tk_injoffset(struct timespec64 offset)
{
        /* ignore no-op events */
        if (offset.tv_sec == 0 && offset.tv_nsec == 0)
                return;

        if (!audit_dummy_context())
                __audit_tk_injoffset(offset);
}

/* Zero every byte of @ad, clearing all recorded old and new values. */
static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
        memset(ad, 0, sizeof(struct audit_ntp_data));
}

static inline void audit_ntp_set_old(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].oldval = val;
}

static inline void audit_ntp_set_new(struct audit_ntp_data *ad,
                                     enum audit_ntp_type type, long long val)
{
        ad->vals[type].newval = val;
}

/* Pass @ad to __audit_ntp_log() unless the audit context is a dummy. */
static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
        if (audit_dummy_context())
                return;

        __audit_ntp_log(ad);
}

/*
 * Forward a netfilter configuration event to __audit_log_nfcfg().  Unlike
 * the neighbouring wrappers this one is gated on audit_enabled only; the
 * task's audit context is not consulted here.
 */
static inline void audit_log_nfcfg(const char *name, u8 af,
                                   unsigned int nentries,
                                   enum audit_nfcfgop op, gfp_t gfp)
{
        if (!audit_enabled)
                return;

        __audit_log_nfcfg(name, af, nentries, op, gfp);
}

extern int audit_n_rules;
extern int audit_signals;
#else /* CONFIG_AUDITSYSCALL */
/*
 * Without CONFIG_AUDITSYSCALL the task lifecycle and syscall hooks are
 * inline stubs: audit_alloc() reports 0 and the void hooks do nothing.
 */
static inline int audit_alloc(struct task_struct *task)
{
        return 0;
}

static inline void audit_free(struct task_struct *task)
{
}

static inline void audit_uring_entry(u8 op)
{
}

static inline void audit_uring_exit(int success, long code)
{
}

static inline void
audit_syscall_entry(int major, unsigned long a0, unsigned long a1,
                    unsigned long a2, unsigned long a3)
{
}

static inline void audit_syscall_exit(void *pt_regs)
{
}

/* Every context counts as a dummy in this configuration. */
static inline bool audit_dummy_context(void)
{
        return true;
}

static inline void
audit_set_context(struct task_struct *task, struct audit_context *ctx)
{
}

/* There is never a context to return in this configuration. */
static inline struct audit_context *audit_context(void)
{
        return NULL;
}
/*
 * Without CONFIG_AUDITSYSCALL the filename, inode, core dump and seccomp
 * hooks are inline stubs; audit_reusename() always yields NULL.
 */
static inline struct filename *audit_reusename(const __user char *name)
{
        return NULL;
}

static inline void audit_getname(struct filename *name)
{
}

static inline void
audit_inode(struct filename *name, const struct dentry *dentry,
            unsigned int aflags)
{
}

static inline void audit_file(struct file *file)
{
}

static inline void
audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry)
{
}

static inline void
audit_inode_child(struct inode *parent, const struct dentry *dentry,
                  const unsigned char type)
{
}

static inline void audit_core_dumps(long signr)
{
}

static inline void
audit_seccomp(unsigned long syscall, long signr, int code)
{
}

static inline void
audit_seccomp_actions_logged(const char *names, const char *old_names,
                             int res)
{
}
/*
 * Without CONFIG_AUDITSYSCALL the IPC, exec, socket, message queue,
 * capability and mmap hooks are inline stubs.  Those with an int result
 * return 0.
 */
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{
}

static inline void
audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
}

static inline void audit_bprm(struct linux_binprm *bprm)
{
}

static inline int audit_socketcall(int nargs, unsigned long *args)
{
        return 0;
}

static inline int audit_socketcall_compat(int nargs, u32 *args)
{
        return 0;
}

static inline void audit_fd_pair(int fd1, int fd2)
{
}

static inline int audit_sockaddr(int len, void *addr)
{
        return 0;
}

static inline void
audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
}

static inline void
audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
                  const struct timespec64 *abs_timeout)
{
}

static inline void
audit_mq_notify(mqd_t mqdes, const struct sigevent *notification)
{
}

static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
{
}

static inline int
audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new,
                     const struct cred *old)
{
        return 0;
}

static inline void
audit_log_capset(const struct cred *new, const struct cred *old)
{
}

static inline void audit_mmap_fd(int fd, int flags)
{
}

/*
 * Without CONFIG_AUDITSYSCALL the remaining hooks (openat2, module load,
 * fanotify, timekeeping/NTP, ptrace and netfilter config) are inline stubs
 * that do nothing.
 */
static inline void audit_openat2_how(struct open_how *how)
{
}

static inline void audit_log_kern_module(char *name)
{
}

static inline void
audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar)
{
}

static inline void audit_tk_injoffset(struct timespec64 offset)
{
}

static inline void audit_ntp_init(struct audit_ntp_data *ad)
{
}

static inline void
audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type,
                  long long val)
{
}

static inline void
audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type,
                  long long val)
{
}

static inline void audit_ntp_log(const struct audit_ntp_data *ad)
{
}

static inline void audit_ptrace(struct task_struct *t)
{
}

static inline void
audit_log_nfcfg(const char *name, u8 af, unsigned int nentries,
                enum audit_nfcfgop op, gfp_t gfp)
{
}

#define audit_n_rules 0
#define audit_signals 0
#endif /* CONFIG_AUDITSYSCALL */

static inline bool audit_loginuid_set(struct task_struct *tsk)
{
        return uid_valid(audit_get_loginuid(tsk));
}

#endif

















































































    1 
   71 







   72 


























    1 









   85 




   21 









  141 


















   50 






    3 






























   13 



  169 
















































   11 






















    2 






    1 

    1 


















    4 























































  127 
























































   11 










































  127 




   71 
















    3 


   97 








   99 











   95 





   44 












   42 











   34 

    1 



   41 










   86 
   86 




































































































































  166 


  165 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/include/kvm_emulate.h
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#ifndef __ARM64_KVM_EMULATE_H__
#define __ARM64_KVM_EMULATE_H__

#include <linux/bitfield.h>
#include <linux/kvm_host.h>

#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_nested.h>
#include <asm/ptrace.h>
#include <asm/cputype.h>
#include <asm/virt.h>

/*
 * Four vector constants spaced 0x200 apart.
 * NOTE(review): presumably the byte offsets of the four groups in an AArch64
 * exception vector table, selected by where the exception came from --
 * confirm against the Arm ARM and the exception injection code.
 */
#define CURRENT_EL_SP_EL0_VECTOR        0x0
#define CURRENT_EL_SP_ELx_VECTOR        0x200
#define LOWER_EL_AArch64_VECTOR                0x400
#define LOWER_EL_AArch32_VECTOR                0x600

/*
 * Exception kinds, encoded as values 0x80 apart.
 * NOTE(review): presumably each value is the offset of that exception kind
 * inside one vector group -- confirm against the users of this enum.
 */
enum exception_type {
        except_type_sync        = 0,
        except_type_irq                = 0x80,
        except_type_fiq                = 0x100,
        except_type_serror        = 0x180,
};

/*
 * Comma-separated list of { value, "name" } initialisers, one per
 * enum exception_type entry, for use inside a brace-enclosed table.
 * NOTE(review): presumably consumed by tracepoint symbolic printing --
 * confirm at the use site.
 */
#define kvm_exception_type_names                \
        { except_type_sync,        "SYNC"   },        \
        { except_type_irq,        "IRQ"    },        \
        { except_type_fiq,        "FIQ"    },        \
        { except_type_serror,        "SERROR" }

/* AArch32 helpers: condition check and instruction skip. */
bool kvm_condition_valid32(const struct kvm_vcpu *vcpu);
void kvm_skip_instr32(struct kvm_vcpu *vcpu);

/* Exception injection into the guest; the abort variants take an address. */
void kvm_inject_undefined(struct kvm_vcpu *vcpu);
void kvm_inject_vabt(struct kvm_vcpu *vcpu);
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
void kvm_inject_size_fault(struct kvm_vcpu *vcpu);

void kvm_vcpu_wfi(struct kvm_vcpu *vcpu);

/* Nested-virtualisation helpers; the two injectors return an int status. */
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu);
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2);
int kvm_inject_nested_irq(struct kvm_vcpu *vcpu);

/*
 * Inject a nested synchronous exception whose syndrome has the exception
 * class field set to ESR_ELx_EC_SVE and the IL bit set.  The status returned
 * by kvm_inject_nested_sync() is discarded.
 */
static inline void kvm_inject_nested_sve_trap(struct kvm_vcpu *vcpu)
{
        u64 esr = ESR_ELx_IL;

        esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_SVE);
        kvm_inject_nested_sync(vcpu, esr);
}

/*
 * Report whether the vcpu's EL1 is 32-bit.  Code built as part of the
 * VHE/nVHE hypervisor objects derives this from the HCR_RW bit of the
 * vcpu's hcr_el2 value (clear means 32-bit); all other code asks the vcpu
 * feature set for KVM_ARM_VCPU_EL1_32BIT.
 */
static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu)
{
#if defined(__KVM_VHE_HYPERVISOR__) || defined(__KVM_NVHE_HYPERVISOR__)
        return !(vcpu->arch.hcr_el2 & HCR_RW);
#else
        return vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT);
#endif
}

static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
{
        if (!vcpu_has_run_once(vcpu))
                vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;

        /*
         * For non-FWB CPUs, we trap VM ops (HCR_EL2.TVM) until M+C
         * get set in SCTLR_EL1 such that we can detect when the guest
         * MMU gets turned on and do the necessary cache maintenance
         * then.
         */
        if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB))
                vcpu->arch.hcr_el2 |= HCR_TVM;
}

/*
 * Return the address of the vcpu's hcr_el2 field, cast to
 * unsigned long * so callers can read or update it in place.
 */
static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu)
{
        unsigned long *hcr = (unsigned long *)&vcpu->arch.hcr_el2;

        return hcr;
}

static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu)
{
        vcpu->arch.hcr_el2 &= ~HCR_TWE;
        if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) ||
            vcpu->kvm->arch.vgic.nassgireq)
                vcpu->arch.hcr_el2 &= ~HCR_TWI;
        else
                vcpu->arch.hcr_el2 |= HCR_TWI;
}

/* Trap both WFE and WFI: set HCR_TWE and HCR_TWI in the vcpu's hcr_el2. */
static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu)
{
        vcpu->arch.hcr_el2 |= HCR_TWE | HCR_TWI;
}

/* Return the vcpu's stored vsesr_el2 value. */
static inline unsigned long vcpu_get_vsesr(struct kvm_vcpu *vcpu)
{
        unsigned long vsesr = vcpu->arch.vsesr_el2;

        return vsesr;
}

/* Replace the vcpu's stored vsesr_el2 value with @vsesr. */
static inline void
vcpu_set_vsesr(struct kvm_vcpu *vcpu, u64 vsesr)
{
        vcpu->arch.vsesr_el2 = vsesr;
}

/*
 * Return a writable pointer to the saved guest PC inside the vcpu's
 * general-purpose register block.  The cast drops the const of @vcpu.
 */
static __always_inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
{
        unsigned long *pc = (unsigned long *)&vcpu_gp_regs(vcpu)->pc;

        return pc;
}

/*
 * Return a writable pointer to the saved guest PSTATE inside the vcpu's
 * general-purpose register block.  The cast drops the const of @vcpu.
 */
static __always_inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu)
{
        unsigned long *pstate = (unsigned long *)&vcpu_gp_regs(vcpu)->pstate;

        return pstate;
}

/* True when PSR_MODE32_BIT is set in the saved guest PSTATE. */
static __always_inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu)
{
        unsigned long pstate = *vcpu_cpsr(vcpu);

        return (pstate & PSR_MODE32_BIT) != 0;
}

/*
 * Condition check for the trapped instruction: a vcpu not in a 32-bit mode
 * always passes, a vcpu in a 32-bit mode defers to kvm_condition_valid32().
 */
static __always_inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu)
{
        return !vcpu_mode_is_32bit(vcpu) || kvm_condition_valid32(vcpu);
}

/* Set PSR_AA32_T_BIT in the vcpu's saved pstate. */
static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
{
        unsigned long *cpsr = vcpu_cpsr(vcpu);

        *cpsr |= PSR_AA32_T_BIT;
}

/*
 * vcpu_get_reg and vcpu_set_reg must always be given a register number
 * obtained from a read of ESR_EL2. Any other source may yield the wrong
 * result on AArch32, where registers are banked.
 *
 * Register number 31 always reads as zero.
 */
static __always_inline unsigned long vcpu_get_reg(const struct kvm_vcpu *vcpu,
                                         u8 reg_num)
{
        if (reg_num == 31)
                return 0;

        return vcpu_gp_regs(vcpu)->regs[reg_num];
}

/*
 * Write @val to general purpose register @reg_num of the vcpu.
 * Writes to register number 31 are silently discarded.
 */
static __always_inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num,
                                unsigned long val)
{
        if (reg_num == 31)
                return;

        vcpu_gp_regs(vcpu)->regs[reg_num] = val;
}

static inline bool vcpu_is_el2_ctxt(const struct kvm_cpu_context *ctxt)
{
        switch (ctxt->regs.pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) {
        case PSR_MODE_EL2h:
        case PSR_MODE_EL2t:
                return true;
        default:
                return false;
        }
}

/* Apply vcpu_is_el2_ctxt() to the vcpu's own CPU context. */
static inline bool vcpu_is_el2(const struct kvm_vcpu *vcpu)
{
        const struct kvm_cpu_context *ctxt = &vcpu->arch.ctxt;

        return vcpu_is_el2_ctxt(ctxt);
}

static inline bool vcpu_el2_e2h_is_set(const struct kvm_vcpu *vcpu)
{
        return (!cpus_have_final_cap(ARM64_HAS_HCR_NV1) ||
                (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_E2H));
}

/* True when HCR_TGE is set in the HCR_EL2 value held in the vcpu context. */
static inline bool vcpu_el2_tge_is_set(const struct kvm_vcpu *vcpu)
{
        return !!(ctxt_sys_reg(&vcpu->arch.ctxt, HCR_EL2) & HCR_TGE);
}

static inline bool is_hyp_ctxt(const struct kvm_vcpu *vcpu)
{
        bool e2h, tge;
        u64 hcr;

        if (!vcpu_has_nv(vcpu))
                return false;

        hcr = __vcpu_sys_reg(vcpu, HCR_EL2);

        e2h = (hcr & HCR_E2H);
        tge = (hcr & HCR_TGE);

        /*
         * We are in a hypervisor context if the vcpu mode is EL2 or
         * E2H and TGE bits are set. The latter means we are in the user space
         * of the VHE kernel. ARMv8.1 ARM describes this as 'InHost'
         *
         * Note that the HCR_EL2.{E2H,TGE}={0,1} isn't really handled in the
         * rest of the KVM code, and will result in a misbehaving guest.
         */
        return vcpu_is_el2(vcpu) || (e2h && tge) || tge;
}

static inline bool vcpu_is_host_el0(const struct kvm_vcpu *vcpu)
{
        return is_hyp_ctxt(vcpu) && !vcpu_is_el2(vcpu);
}

/*
 * The layout of SPSR for an AArch32 state is different when observed from an
 * AArch64 SPSR_ELx or an AArch32 SPSR_*. This function generates the AArch32
 * view given an AArch64 view.
 *
 * In ARM DDI 0487E.a see:
 *
 * - The AArch64 view (SPSR_EL2) in section C5.2.18, page C5-426
 * - The AArch32 view (SPSR_abt) in section G8.2.126, page G8-6256
 * - The AArch32 view (SPSR_und) in section G8.2.132, page G8-6280
 *
 * Which show the following differences:
 *
 * | Bit | AA64 | AA32 | Notes                       |
 * +-----+------+------+-----------------------------|
 * | 24  | DIT  | J    | J is RES0 in ARMv8          |
 * | 21  | SS   | DIT  | SS doesn't exist in AArch32 |
 *
 * ... and all other bits are (currently) common.
 */
static inline unsigned long host_spsr_to_spsr32(unsigned long spsr)
{
        /* Start from the input with the two overlapping bits (24, 21) cleared. */
        unsigned long spsr32 = spsr & ~(BIT(24) | BIT(21));

        /* Carry DIT over to bit 21 when it is set in the input. */
        if (spsr & PSR_AA32_DIT_BIT)
                spsr32 |= BIT(21);

        return spsr32;
}

/*
 * Tell whether the vcpu's current mode is privileged: in 32bit mode, any
 * mode value above PSR_AA32_MODE_USR; otherwise anything but EL0t.
 */
static inline bool vcpu_mode_priv(const struct kvm_vcpu *vcpu)
{
        const unsigned long cpsr = *vcpu_cpsr(vcpu);
        u32 mode;

        if (!vcpu_mode_is_32bit(vcpu)) {
                mode = cpsr & PSR_MODE_MASK;
                return mode != PSR_MODE_EL0t;
        }

        mode = cpsr & PSR_AA32_MODE_MASK;

        return mode > PSR_AA32_MODE_USR;
}

/* Return the esr_el2 value recorded in the vcpu's fault information. */
static __always_inline u64 kvm_vcpu_get_esr(const struct kvm_vcpu *vcpu)
{
        u64 esr = vcpu->arch.fault.esr_el2;

        return esr;
}

/*
 * Tell whether the vcpu's HCR_EL2 value traps the WFx instruction described
 * by the current ESR: HCR_TWE is checked for WFE, HCR_TWI otherwise.
 * Always false without nested virt or when the vcpu is itself at EL2.
 */
static inline bool guest_hyp_wfx_traps_enabled(const struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);
        u64 trap_bit = (esr & ESR_ELx_WFx_ISS_WFE) ? HCR_TWE : HCR_TWI;
        u64 hcr_el2 = __vcpu_sys_reg(vcpu, HCR_EL2);

        if (!vcpu_has_nv(vcpu) || vcpu_is_el2(vcpu))
                return false;

        return !!(hcr_el2 & trap_bit);
}

/*
 * Return the condition field of the current ESR, or -1 when the ESR does
 * not have ESR_ELx_CV set.
 */
static __always_inline int kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);

        if (!(esr & ESR_ELx_CV))
                return -1;

        return (esr & ESR_ELx_COND_MASK) >> ESR_ELx_COND_SHIFT;
}

/* Return the far_el2 value recorded in the vcpu's fault information. */
static __always_inline unsigned long kvm_vcpu_get_hfar(const struct kvm_vcpu *vcpu)
{
        unsigned long far = vcpu->arch.fault.far_el2;

        return far;
}

/*
 * Derive the faulting IPA from the recorded hpfar_el2 value: the FIPA
 * field shifted left by 12. Returns INVALID_GPA when HPFAR_EL2_NS is
 * clear in the recorded value.
 */
static __always_inline phys_addr_t kvm_vcpu_get_fault_ipa(const struct kvm_vcpu *vcpu)
{
        u64 hpfar = vcpu->arch.fault.hpfar_el2;
        const bool ns_set = hpfar & HPFAR_EL2_NS;

        if (unlikely(!ns_set))
                return INVALID_GPA;

        return FIELD_GET(HPFAR_EL2_FIPA, hpfar) << 12;
}

/* Return the disr_el1 value recorded in the vcpu's fault information. */
static inline u64 kvm_vcpu_get_disr(const struct kvm_vcpu *vcpu)
{
        u64 disr = vcpu->arch.fault.disr_el1;

        return disr;
}

static inline u32 kvm_vcpu_hvc_get_imm(const struct kvm_vcpu *vcpu)
{
        return kvm_vcpu_get_esr(vcpu) & ESR_ELx_xVC_IMM_MASK;
}

/* True when ESR_ELx_ISV is set in the current ESR. */
static __always_inline bool kvm_vcpu_dabt_isvalid(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_ISV) != 0;
}

/* Return the current ESR restricted to its CM, WNR and FSC bits. */
static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu)
{
        const unsigned long kept = ESR_ELx_CM | ESR_ELx_WNR | ESR_ELx_FSC;

        return kvm_vcpu_get_esr(vcpu) & kept;
}

/* True when ESR_ELx_SSE is set in the current ESR. */
static inline bool kvm_vcpu_dabt_issext(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SSE) != 0;
}

/* True when ESR_ELx_SF is set in the current ESR. */
static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SF) != 0;
}

static __always_inline int kvm_vcpu_dabt_get_rd(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT;
}

/* True when ESR_ELx_S1PTW is set in the current ESR. */
static __always_inline bool kvm_vcpu_abt_iss1tw(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_S1PTW) != 0;
}

/*
 * True when ESR_ELx_WNR is set in the current ESR.
 * Always check for S1PTW *before* using this.
 */
static __always_inline bool kvm_vcpu_dabt_iswrite(const struct kvm_vcpu *vcpu)
{
        return !!(kvm_vcpu_get_esr(vcpu) & ESR_ELx_WNR);
}

/* True when ESR_ELx_CM is set in the current ESR. */
static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_CM) != 0;
}

static __always_inline unsigned int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu)
{
        return 1 << ((kvm_vcpu_get_esr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT);
}

/*
 * True when ESR_ELx_IL is set in the current ESR.
 * This one is not specific to Data Abort.
 */
static __always_inline bool kvm_vcpu_trap_il_is32bit(const struct kvm_vcpu *vcpu)
{
        return (kvm_vcpu_get_esr(vcpu) & ESR_ELx_IL) != 0;
}

/* Return the exception class (ESR_ELx_EC) of the current ESR. */
static __always_inline u8 kvm_vcpu_trap_get_class(const struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);

        return ESR_ELx_EC(esr);
}

static inline bool kvm_vcpu_trap_is_iabt(const struct kvm_vcpu *vcpu)
{
        return kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_IABT_LOW;
}

static inline bool kvm_vcpu_trap_is_exec_fault(const struct kvm_vcpu *vcpu)
{
        return kvm_vcpu_trap_is_iabt(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu);
}

/* Return the fault status code (ESR_ELx_FSC bits) of the current ESR. */
static __always_inline u8 kvm_vcpu_trap_get_fault(const struct kvm_vcpu *vcpu)
{
        u64 esr = kvm_vcpu_get_esr(vcpu);

        return esr & ESR_ELx_FSC;
}

static inline
bool kvm_vcpu_trap_is_permission_fault(const struct kvm_vcpu *vcpu)
{
        return esr_fsc_is_permission_fault(kvm_vcpu_get_esr(vcpu));
}

static inline
bool kvm_vcpu_trap_is_translation_fault(const struct kvm_vcpu *vcpu)
{
        return esr_fsc_is_translation_fault(kvm_vcpu_get_esr(vcpu));
}

/*
 * Return the size of the mapping that took a permission fault, computed
 * from the level encoded in the ESR's FSC. BUGs if the current ESR does
 * not describe a permission fault.
 */
static inline
u64 kvm_vcpu_trap_get_perm_fault_granule(const struct kvm_vcpu *vcpu)
{
        unsigned long esr = kvm_vcpu_get_esr(vcpu);
        unsigned long level;

        BUG_ON(!esr_fsc_is_permission_fault(esr));

        level = esr & ESR_ELx_FSC_LEVEL;

        return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level));
}

/*
 * True when the fault status code is one of: EXTABT, SECC, or any of the
 * SEA_TTW / SECC_TTW codes for levels -1 through 3.
 */
static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu)
{
        const u8 fsc = kvm_vcpu_trap_get_fault(vcpu);

        if (fsc == ESR_ELx_FSC_EXTABT || fsc == ESR_ELx_FSC_SECC)
                return true;

        if (fsc >= ESR_ELx_FSC_SEA_TTW(-1) && fsc <= ESR_ELx_FSC_SEA_TTW(3))
                return true;

        if (fsc >= ESR_ELx_FSC_SECC_TTW(-1) && fsc <= ESR_ELx_FSC_SECC_TTW(3))
                return true;

        return false;
}

/* Extract the Rt field (ESR_ELx_SYS64_ISS_RT) from the current ESR. */
static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu)
{
        return ESR_ELx_SYS64_ISS_RT(kvm_vcpu_get_esr(vcpu));
}

/*
 * Tell whether the current abort must be handled as a write.
 *
 * S1PTW aborts count as writes only if they are permission faults,
 * instruction aborts never do, and everything else follows the WNR bit.
 */
static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
{
        /*
         * Only a permission fault on a S1PTW should be considered as a
         * write. Otherwise, page tables backed by a read-only memslot
         * would result in an exception being delivered in the guest.
         *
         * The drawback is that we end up faulting twice if the guest is
         * using any of HW AF/DB: a translation fault to map the page
         * containing the PT (read only at first), then a permission
         * fault to allow the flags to be set.
         */
        if (kvm_vcpu_abt_iss1tw(vcpu))
                return kvm_vcpu_trap_is_permission_fault(vcpu);

        return !kvm_vcpu_trap_is_iabt(vcpu) && kvm_vcpu_dabt_iswrite(vcpu);
}

/* Return the vcpu's MPIDR_EL1 value masked with MPIDR_HWID_BITMASK. */
static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
{
        unsigned long aff = __vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK;

        return aff;
}

/*
 * Switch the vcpu to big-endian: set the E bit in the saved pstate when in
 * 32bit mode, otherwise set SCTLR_ELx_EE in the vcpu's SCTLR_EL1.
 */
static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
{
        u64 sctlr;

        if (vcpu_mode_is_32bit(vcpu)) {
                *vcpu_cpsr(vcpu) |= PSR_AA32_E_BIT;
                return;
        }

        sctlr = vcpu_read_sys_reg(vcpu, SCTLR_EL1);
        vcpu_write_sys_reg(vcpu, sctlr | SCTLR_ELx_EE, SCTLR_EL1);
}

static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu)
{
        if (vcpu_mode_is_32bit(vcpu))
                return !!(*vcpu_cpsr(vcpu) & PSR_AA32_E_BIT);

        if (vcpu_mode_priv(vcpu))
                return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_ELx_EE);
        else
                return !!(vcpu_read_sys_reg(vcpu, SCTLR_EL1) & SCTLR_EL1_E0E);
}

/*
 * Convert @data, a value of @len bytes in the guest's current byte order,
 * to host byte order.
 *
 * The guest endianness is given by kvm_vcpu_is_be(). A @len of 1, 2 or 4
 * truncates @data to that many bytes before conversion; any other @len is
 * handled as a 64bit quantity.
 *
 * Every path through both switches returns, so no fall-through return is
 * needed after them.
 */
static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu,
                                                    unsigned long data,
                                                    unsigned int len)
{
        if (kvm_vcpu_is_be(vcpu)) {
                switch (len) {
                case 1:
                        return data & 0xff;
                case 2:
                        return be16_to_cpu(data & 0xffff);
                case 4:
                        return be32_to_cpu(data & 0xffffffff);
                default:
                        return be64_to_cpu(data);
                }
        } else {
                switch (len) {
                case 1:
                        return data & 0xff;
                case 2:
                        return le16_to_cpu(data & 0xffff);
                case 4:
                        return le32_to_cpu(data & 0xffffffff);
                default:
                        return le64_to_cpu(data);
                }
        }
}

/*
 * Convert @data, a value of @len bytes in host byte order, to the guest's
 * current byte order.
 *
 * The guest endianness is given by kvm_vcpu_is_be(). A @len of 1, 2 or 4
 * truncates @data to that many bytes before conversion; any other @len is
 * handled as a 64bit quantity.
 *
 * Every path through both switches returns, so no fall-through return is
 * needed after them.
 */
static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu,
                                                    unsigned long data,
                                                    unsigned int len)
{
        if (kvm_vcpu_is_be(vcpu)) {
                switch (len) {
                case 1:
                        return data & 0xff;
                case 2:
                        return cpu_to_be16(data & 0xffff);
                case 4:
                        return cpu_to_be32(data & 0xffffffff);
                default:
                        return cpu_to_be64(data);
                }
        } else {
                switch (len) {
                case 1:
                        return data & 0xff;
                case 2:
                        return cpu_to_le16(data & 0xffff);
                case 4:
                        return cpu_to_le32(data & 0xffffffff);
                default:
                        return cpu_to_le64(data);
                }
        }
}

/*
 * Set the INCREMENT_PC flag on the vcpu.
 *
 * Warns if the PENDING_EXCEPTION flag is already set; see
 * kvm_pend_exception() for the converse check.
 */
static __always_inline void kvm_incr_pc(struct kvm_vcpu *vcpu)
{
        WARN_ON(vcpu_get_flag(vcpu, PENDING_EXCEPTION));
        vcpu_set_flag(vcpu, INCREMENT_PC);
}

/*
 * Mark exception flag @e as pending on vcpu @v: sets PENDING_EXCEPTION and
 * then the flag @e itself. Warns if INCREMENT_PC is already set on @v.
 */
#define kvm_pend_exception(v, e)                                        \
        do {                                                                \
                WARN_ON(vcpu_get_flag((v), INCREMENT_PC));                \
                vcpu_set_flag((v), PENDING_EXCEPTION);                        \
                vcpu_set_flag((v), e);                                        \
        } while (0)

/*
 * Build-time check that @r contains either all of @bits or none of them;
 * a partial overlap breaks the build.
 */
#define __build_check_all_or_none(r, bits)                                \
        BUILD_BUG_ON(((r) & (bits)) && ((r) & (bits)) != (bits))

/*
 * Given CPACR_EL1-style @clr/@set masks, compute the CPTR_EL2 bits that
 * must be *cleared* to get the same effect (used by cpacr_clear_set()).
 *
 * FPEN/ZEN/SMEN in @set map to clearing TFP/TZ/TSM, i.e. the polarity is
 * inverted for these. TTA/TAM/TCPAC in @clr map to clearing the
 * corresponding CPTR_EL2 bit.
 */
#define __cpacr_to_cptr_clr(clr, set)                                        \
        ({                                                                \
                u64 cptr = 0;                                                \
                                                                        \
                if ((set) & CPACR_EL1_FPEN)                                \
                        cptr |= CPTR_EL2_TFP;                                \
                if ((set) & CPACR_EL1_ZEN)                                \
                        cptr |= CPTR_EL2_TZ;                                \
                if ((set) & CPACR_EL1_SMEN)                                \
                        cptr |= CPTR_EL2_TSM;                                \
                if ((clr) & CPACR_EL1_TTA)                                \
                        cptr |= CPTR_EL2_TTA;                                \
                if ((clr) & CPTR_EL2_TAM)                                \
                        cptr |= CPTR_EL2_TAM;                                \
                if ((clr) & CPTR_EL2_TCPAC)                                \
                        cptr |= CPTR_EL2_TCPAC;                                \
                                                                        \
                cptr;                                                        \
        })

/*
 * Given CPACR_EL1-style @clr/@set masks, compute the CPTR_EL2 bits that
 * must be *set* to get the same effect (used by cpacr_clear_set()).
 *
 * FPEN/ZEN/SMEN in @clr map to setting TFP/TZ/TSM, i.e. the polarity is
 * inverted for these. TTA/TAM/TCPAC in @set map to setting the
 * corresponding CPTR_EL2 bit.
 */
#define __cpacr_to_cptr_set(clr, set)                                        \
        ({                                                                \
                u64 cptr = 0;                                                \
                                                                        \
                if ((clr) & CPACR_EL1_FPEN)                                \
                        cptr |= CPTR_EL2_TFP;                                \
                if ((clr) & CPACR_EL1_ZEN)                                \
                        cptr |= CPTR_EL2_TZ;                                \
                if ((clr) & CPACR_EL1_SMEN)                                \
                        cptr |= CPTR_EL2_TSM;                                \
                if ((set) & CPACR_EL1_TTA)                                \
                        cptr |= CPTR_EL2_TTA;                                \
                if ((set) & CPTR_EL2_TAM)                                \
                        cptr |= CPTR_EL2_TAM;                                \
                if ((set) & CPTR_EL2_TCPAC)                                \
                        cptr |= CPTR_EL2_TCPAC;                                \
                                                                        \
                cptr;                                                        \
        })

/*
 * Clear the @clr bits and set the @set bits, both expressed in CPACR_EL1
 * format.
 *
 * With VHE or hVHE the masks are applied to cpacr_el1 as-is. Otherwise
 * they are translated with __cpacr_to_cptr_clr()/__cpacr_to_cptr_set()
 * and applied to cptr_el2.
 *
 * Build-time checks reject: any CPTR_VHE_EL2_RES0 bit in @set,
 * CPACR_EL1_E0POE in @clr, and a partially specified FPEN, ZEN or SMEN
 * field in either mask.
 */
#define cpacr_clear_set(clr, set)                                        \
        do {                                                                \
                BUILD_BUG_ON((set) & CPTR_VHE_EL2_RES0);                \
                BUILD_BUG_ON((clr) & CPACR_EL1_E0POE);                        \
                __build_check_all_or_none((clr), CPACR_EL1_FPEN);        \
                __build_check_all_or_none((set), CPACR_EL1_FPEN);        \
                __build_check_all_or_none((clr), CPACR_EL1_ZEN);        \
                __build_check_all_or_none((set), CPACR_EL1_ZEN);        \
                __build_check_all_or_none((clr), CPACR_EL1_SMEN);        \
                __build_check_all_or_none((set), CPACR_EL1_SMEN);        \
                                                                        \
                if (has_vhe() || has_hvhe())                                \
                        sysreg_clear_set(cpacr_el1, clr, set);                \
                else                                                        \
                        sysreg_clear_set(cptr_el2,                        \
                                         __cpacr_to_cptr_clr(clr, set),        \
                                         __cpacr_to_cptr_set(clr, set));\
        } while (0)

/*
 * Return a 'sanitised' view of the vcpu's CPTR_EL2: when E2H isn't set,
 * the value is translated from the nVHE layout to the VHE (CPACR_EL1)
 * layout; otherwise it is returned unchanged.
 */
static inline u64 vcpu_sanitised_cptr_el2(const struct kvm_vcpu *vcpu)
{
        u64 raw = __vcpu_sys_reg(vcpu, CPTR_EL2);

        if (vcpu_el2_e2h_is_set(vcpu))
                return raw;

        return translate_cptr_el2_to_cpacr_el1(raw);
}

/*
 * Decode a two-bit xEN field value @xen into "is the trap enabled?":
 *
 *   0b00, 0b10: always trapped
 *   0b01:       trapped only when TGE is set and the vcpu is not at EL2
 *   0b11:       never trapped
 *
 * Any other value is treated as not trapped.
 */
static inline bool ____cptr_xen_trap_enabled(const struct kvm_vcpu *vcpu,
                                             unsigned int xen)
{
        if (xen == 0b00 || xen == 0b10)
                return true;

        if (xen == 0b01)
                return vcpu_el2_tge_is_set(vcpu) && !vcpu_is_el2(vcpu);

        return false;
}

/*
 * Evaluate whether the CPACR_EL1-format field @xen (e.g. FPEN, ZEN) of the
 * vcpu's sanitised CPTR_EL2 enables the trap. Always false when the vcpu
 * does not have nested virtualisation.
 */
#define __guest_hyp_cptr_xen_trap_enabled(vcpu, xen)                                \
        (!vcpu_has_nv(vcpu) ? false :                                                \
         ____cptr_xen_trap_enabled(vcpu,                                        \
                                   SYS_FIELD_GET(CPACR_EL1, xen,                \
                                                 vcpu_sanitised_cptr_el2(vcpu))))

/* Tell whether the FPEN field of the vcpu's CPTR_EL2 enables the trap. */
static inline bool guest_hyp_fpsimd_traps_enabled(const struct kvm_vcpu *vcpu)
{
        bool trapped = __guest_hyp_cptr_xen_trap_enabled(vcpu, FPEN);

        return trapped;
}

/* Tell whether the ZEN field of the vcpu's CPTR_EL2 enables the trap. */
static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu)
{
        bool trapped = __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN);

        return trapped;
}

/*
 * Compute the vcpu's hcrx_el2 value from the features exposed to the VM.
 * Does nothing when the ARM64_HAS_HCX capability is absent.
 */
static inline void vcpu_set_hcrx(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        if (!cpus_have_final_cap(ARM64_HAS_HCX))
                return;

        /*
         * In general, every HCRX_EL2 bit is gated by a feature. SMPME is
         * the exception and can be set unconditionally, because its
         * effects are not directly observable from the guest.
         */
        vcpu->arch.hcrx_el2 = HCRX_EL2_SMPME;

        if (kvm_has_feat(kvm, ID_AA64ISAR2_EL1, MOPS, IMP))
                vcpu->arch.hcrx_el2 |= (HCRX_EL2_MSCEn | HCRX_EL2_MCE2);

        if (kvm_has_tcr2(kvm))
                vcpu->arch.hcrx_el2 |= HCRX_EL2_TCR2En;

        if (kvm_has_fpmr(kvm))
                vcpu->arch.hcrx_el2 |= HCRX_EL2_EnFPM;
}
#endif /* __ARM64_KVM_EMULATE_H__ */












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 




































    4 
    4 














































    4 
    4 








    4 

















    4 

    4 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *        -  July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-pm.h>
#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/fault-inject.h>
#include <linux/list_sort.h>
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>

#define CREATE_TRACE_POINTS
#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
#include "blk-ioprio.h"

struct dentry *blk_debugfs_root;

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);

static DEFINE_IDA(blk_queue_ida);

/*
 * For queue allocation
 */
static struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

/**
 * blk_queue_flag_set - atomically set a queue flag
 * @flag: flag to be set
 * @q: request queue
 *
 * Sets bit @flag in @q->queue_flags using set_bit().
 */
void blk_queue_flag_set(unsigned int flag, struct request_queue *q)
{
        set_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_set);

/**
 * blk_queue_flag_clear - atomically clear a queue flag
 * @flag: flag to be cleared
 * @q: request queue
 *
 * Clears bit @flag in @q->queue_flags using clear_bit().
 */
void blk_queue_flag_clear(unsigned int flag, struct request_queue *q)
{
        clear_bit(flag, &q->queue_flags);
}
EXPORT_SYMBOL(blk_queue_flag_clear);

/*
 * Table of operation names, indexed by REQ_OP_* value. Each entry holds
 * the name with the REQ_OP_ prefix stripped (e.g. "READ"). Operation
 * values not listed here are left as NULL entries, which blk_op_str()
 * checks for.
 */
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
        REQ_OP_NAME(READ),
        REQ_OP_NAME(WRITE),
        REQ_OP_NAME(FLUSH),
        REQ_OP_NAME(DISCARD),
        REQ_OP_NAME(SECURE_ERASE),
        REQ_OP_NAME(ZONE_RESET),
        REQ_OP_NAME(ZONE_RESET_ALL),
        REQ_OP_NAME(ZONE_OPEN),
        REQ_OP_NAME(ZONE_CLOSE),
        REQ_OP_NAME(ZONE_FINISH),
        REQ_OP_NAME(ZONE_APPEND),
        REQ_OP_NAME(WRITE_ZEROES),
        REQ_OP_NAME(DRV_IN),
        REQ_OP_NAME(DRV_OUT),
};
#undef REQ_OP_NAME

/**
 * blk_op_str - Return the string XXX for a given REQ_OP_XXX.
 * @op: REQ_OP_XXX.
 *
 * Description: Central block layer helper converting a REQ_OP_XXX value
 * into its string form, useful when debugging or tracing a bio or request.
 * Returns the string "UNKNOWN" when @op is outside the name table or has
 * no name registered in it.
 */
inline const char *blk_op_str(enum req_op op)
{
        if (op >= ARRAY_SIZE(blk_op_name) || !blk_op_name[op])
                return "UNKNOWN";

        return blk_op_name[op];
}
EXPORT_SYMBOL_GPL(blk_op_str);

/*
 * Table indexed by blk_status_t value, giving for each status the matching
 * (negative) errno and a short description string. It backs
 * errno_to_blk_status(), blk_status_to_errno() and blk_status_to_str().
 */
static const struct {
        int                errno;
        const char        *name;
} blk_errors[] = {
        [BLK_STS_OK]                = { 0,                "" },
        [BLK_STS_NOTSUPP]        = { -EOPNOTSUPP, "operation not supported" },
        [BLK_STS_TIMEOUT]        = { -ETIMEDOUT,        "timeout" },
        [BLK_STS_NOSPC]                = { -ENOSPC,        "critical space allocation" },
        [BLK_STS_TRANSPORT]        = { -ENOLINK,        "recoverable transport" },
        [BLK_STS_TARGET]        = { -EREMOTEIO,        "critical target" },
        [BLK_STS_RESV_CONFLICT]        = { -EBADE,        "reservation conflict" },
        [BLK_STS_MEDIUM]        = { -ENODATA,        "critical medium" },
        [BLK_STS_PROTECTION]        = { -EILSEQ,        "protection" },
        [BLK_STS_RESOURCE]        = { -ENOMEM,        "kernel resource" },
        [BLK_STS_DEV_RESOURCE]        = { -EBUSY,        "device resource" },
        [BLK_STS_AGAIN]                = { -EAGAIN,        "nonblocking retry" },
        [BLK_STS_OFFLINE]        = { -ENODEV,        "device offline" },

        /* device mapper special case, should not leak out: */
        [BLK_STS_DM_REQUEUE]        = { -EREMCHG, "dm internal retry" },

        /* zone device specific errors */
        [BLK_STS_ZONE_OPEN_RESOURCE]        = { -ETOOMANYREFS, "open zones exceeded" },
        [BLK_STS_ZONE_ACTIVE_RESOURCE]        = { -EOVERFLOW, "active zones exceeded" },

        /* Command duration limit device-side timeout */
        [BLK_STS_DURATION_LIMIT]        = { -ETIME, "duration limit exceeded" },

        [BLK_STS_INVAL]                = { -EINVAL,        "invalid" },

        /* everything else not covered above: */
        [BLK_STS_IOERR]                = { -EIO,        "I/O" },
};

blk_status_t errno_to_blk_status(int errno)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(blk_errors); i++) {
                if (blk_errors[i].errno == errno)
                        return (__force blk_status_t)i;
        }

        return BLK_STS_IOERR;
}
EXPORT_SYMBOL_GPL(errno_to_blk_status);

/*
 * Translate a blk_status_t into the errno stored for it in blk_errors[].
 *
 * A status that indexes past the end of the table triggers a one-time
 * warning and is reported as -EIO.
 */
int blk_status_to_errno(blk_status_t status)
{
        int idx = (__force int)status;

        if (!WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
                return blk_errors[idx].errno;

        /* Status lies beyond the table: fall back to a generic I/O error. */
        return -EIO;
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);

/*
 * Return the name stored in blk_errors[] for @status.
 *
 * A status that indexes past the end of the table triggers a one-time
 * warning and yields the string "<null>".
 */
const char *blk_status_to_str(blk_status_t status)
{
        int idx = (__force int)status;

        if (!WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
                return blk_errors[idx].name;

        return "<null>";
}
EXPORT_SYMBOL_GPL(blk_status_to_str);

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *     The block layer may perform asynchronous callback activity
 *     on a queue, such as calling the unplug function after a timeout.
 *     A block device may call blk_sync_queue to ensure that any
 *     such activity is cancelled, thus allowing it to release resources
 *     that the callbacks might use. The caller must already have made sure
 *     that its ->submit_bio will not re-add plugging prior to calling
 *     this function.
 *
 *     Concretely, this deletes the queue's timeout timer and then cancels
 *     the queue's timeout work item, waiting for each to finish.
 *
 *     This function does not cancel any asynchronous activity arising
 *     out of elevator or throttling code. That would require elevator_exit()
 *     and blkcg_exit_queue() to be called with queue lock initialized.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
        /*
         * Stop the timer first: its callback, blk_rq_timed_out_timer(), is
         * what schedules q->timeout_work.
         */
        timer_delete_sync(&q->timeout);
        cancel_work_sync(&q->timeout_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * blk_set_pm_only - increment pm_only counter
 * @q: request queue pointer
 *
 * The counter is decremented again by blk_clear_pm_only().
 */
void blk_set_pm_only(struct request_queue *q)
{
        atomic_inc(&q->pm_only);
}
EXPORT_SYMBOL_GPL(blk_set_pm_only);

/**
 * blk_clear_pm_only - decrement pm_only counter
 * @q: request queue pointer
 *
 * Undoes one blk_set_pm_only() call.  Warns once if the counter drops below
 * zero.  When the counter reaches zero, every waiter on q->mq_freeze_wq is
 * woken up.
 */
void blk_clear_pm_only(struct request_queue *q)
{
        int pm_only = atomic_dec_return(&q->pm_only);

        WARN_ON_ONCE(pm_only < 0);
        if (!pm_only)
                wake_up_all(&q->mq_freeze_wq);
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);

/*
 * RCU callback queued by blk_free_queue(): releases the usage counter and
 * returns the request_queue to its slab cache.
 */
static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
        struct request_queue *q;

        q = container_of(rcu_head, struct request_queue, rcu_head);
        percpu_ref_exit(&q->q_usage_counter);
        kmem_cache_free(blk_requestq_cachep, q);
}

/*
 * Tear down a request_queue.  Called from blk_put_queue() once the refcount
 * has dropped to zero.
 *
 * Frees the queue statistics, calls blk_mq_release() for mq queues, returns
 * the queue id to blk_queue_ida and unregisters both lockdep keys.  The
 * structure itself is freed later by blk_free_queue_rcu().
 */
static void blk_free_queue(struct request_queue *q)
{
        blk_free_queue_stats(q->stats);
        if (queue_is_mq(q))
                blk_mq_release(q);

        ida_free(&blk_queue_ida, q->id);
        lockdep_unregister_key(&q->io_lock_cls_key);
        lockdep_unregister_key(&q->q_lock_cls_key);
        /* Defer the final free to an RCU callback. */
        call_rcu(&q->rcu_head, blk_free_queue_rcu);
}

/**
 * blk_put_queue - decrement the request_queue refcount
 * @q: the request_queue structure to decrement the refcount for
 *
 * Decrements the refcount of the request_queue and frees the queue when the
 * refcount reaches 0.
 */
void blk_put_queue(struct request_queue *q)
{
        /* Other references remain: nothing more to do. */
        if (!refcount_dec_and_test(&q->refs))
                return;

        blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);

/*
 * Start draining @q: begin a freeze and wake everybody who may be waiting to
 * enter the queue.
 *
 * Returns the value returned by __blk_freeze_queue_start().
 */
bool blk_queue_start_drain(struct request_queue *q)
{
        /*
         * When queue DYING flag is set, we need to block new req
         * entering queue, so we call __blk_freeze_queue_start() to
         * prevent I/O from crossing blk_queue_enter().
         */
        bool freeze = __blk_freeze_queue_start(q, current);
        if (queue_is_mq(q))
                blk_mq_wake_waiters(q);
        /* Make blk_queue_enter() reexamine the DYING flag. */
        wake_up_all(&q->mq_freeze_wq);

        return freeze;
}

/**
 * blk_queue_enter() - try to increase q->q_usage_counter
 * @q: request queue pointer
 * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
 *
 * Return: 0 once the queue has been entered; -EAGAIN if the queue cannot be
 * entered right away and BLK_MQ_REQ_NOWAIT is set; -ENODEV if the queue is
 * found dying after waiting.
 */
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
        const bool pm = flags & BLK_MQ_REQ_PM;

        while (!blk_try_enter_queue(q, pm)) {
                /* Caller asked not to sleep. */
                if (flags & BLK_MQ_REQ_NOWAIT)
                        return -EAGAIN;

                /*
                 * Read side of the barrier pair in blk_freeze_queue_start():
                 * we need to order reading the __PERCPU_REF_DEAD flag of
                 * .q_usage_counter and reading .mq_freeze_depth or the queue
                 * dying flag, otherwise the following wait may never return
                 * if the two reads are reordered.
                 */
                smp_rmb();
                wait_event(q->mq_freeze_wq,
                           (!q->mq_freeze_depth &&
                            blk_pm_resume_queue(pm, q)) ||
                           blk_queue_dying(q));
                if (blk_queue_dying(q))
                        return -ENODEV;
        }

        /* Lockdep annotation only: acquire and immediately release the map. */
        rwsem_acquire_read(&q->q_lockdep_map, 0, 0, _RET_IP_);
        rwsem_release(&q->q_lockdep_map, _RET_IP_);
        return 0;
}

/*
 * Enter @q on behalf of @bio, waiting if necessary.
 *
 * Returns 0 once the queue has been entered.  On failure the bio is completed
 * here: with bio_wouldblock_error() and -EAGAIN when the bio is REQ_NOWAIT
 * and the queue cannot be entered right away, or with bio_io_error() and
 * -ENODEV when the disk has GD_DEAD set.
 */
int __bio_queue_enter(struct request_queue *q, struct bio *bio)
{
        while (!blk_try_enter_queue(q, false)) {
                struct gendisk *disk = bio->bi_bdev->bd_disk;

                if (bio->bi_opf & REQ_NOWAIT) {
                        /* A dead disk takes precedence over "would block". */
                        if (test_bit(GD_DEAD, &disk->state))
                                goto dead;
                        bio_wouldblock_error(bio);
                        return -EAGAIN;
                }

                /*
                 * Read side of the barrier pair in blk_freeze_queue_start():
                 * we need to order reading the __PERCPU_REF_DEAD flag of
                 * .q_usage_counter and reading .mq_freeze_depth or queue
                 * dying flag, otherwise the following wait may never return
                 * if the two reads are reordered.
                 */
                smp_rmb();
                wait_event(q->mq_freeze_wq,
                           (!q->mq_freeze_depth &&
                            blk_pm_resume_queue(false, q)) ||
                           test_bit(GD_DEAD, &disk->state));
                if (test_bit(GD_DEAD, &disk->state))
                        goto dead;
        }

        /* Lockdep annotation only: acquire and immediately release the map. */
        rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
        rwsem_release(&q->io_lockdep_map, _RET_IP_);
        return 0;
dead:
        bio_io_error(bio);
        return -ENODEV;
}

/*
 * Drop one reference on q->q_usage_counter, the counter that
 * blk_queue_enter() tries to increase.
 */
void blk_queue_exit(struct request_queue *q)
{
        percpu_ref_put(&q->q_usage_counter);
}

/*
 * Release callback of q->q_usage_counter (installed by blk_alloc_queue()):
 * wakes up everybody sleeping on the queue's freeze wait queue.
 */
static void blk_queue_usage_counter_release(struct percpu_ref *ref)
{
        struct request_queue *q;

        q = container_of(ref, struct request_queue, q_usage_counter);
        wake_up_all(&q->mq_freeze_wq);
}

/*
 * Callback of the queue's timeout timer: hands the work over to kblockd by
 * scheduling q->timeout_work.
 */
static void blk_rq_timed_out_timer(struct timer_list *t)
{
        struct request_queue *q;

        q = from_timer(q, t, timeout);
        kblockd_schedule_work(&q->timeout_work);
}

/*
 * Work handler with an empty body.  blk_alloc_queue() installs it as the
 * handler of q->timeout_work.
 * NOTE(review): presumably queues that need real timeout handling install
 * their own handler afterwards - confirm.
 */
static void blk_timeout_work(struct work_struct *work)
{
}

/*
 * Allocate and initialise a request_queue.
 *
 * @lim:     queue limits; passed through blk_set_default_limits() and then
 *           copied into q->limits
 * @node_id: node used for the slab allocation, also stored in q->node
 *
 * The queue is returned with a refcount of 1.  On failure an ERR_PTR() is
 * returned and everything allocated so far is released again.
 */
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
        struct request_queue *q;
        int error;

        /* Zeroed allocation: fields not set below start out as 0/NULL. */
        q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
                                  node_id);
        if (!q)
                return ERR_PTR(-ENOMEM);

        q->last_merge = NULL;

        q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
        if (q->id < 0) {
                /* A negative id is the error code from ida_alloc(). */
                error = q->id;
                goto fail_q;
        }

        q->stats = blk_alloc_queue_stats();
        if (!q->stats) {
                error = -ENOMEM;
                goto fail_id;
        }

        error = blk_set_default_limits(lim);
        if (error)
                goto fail_stats;
        q->limits = *lim;

        q->node = node_id;

        atomic_set(&q->nr_active_requests_shared_tags, 0);

        timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
        INIT_WORK(&q->timeout_work, blk_timeout_work);
        INIT_LIST_HEAD(&q->icq_list);

        /* The caller owns the initial reference; see blk_put_queue(). */
        refcount_set(&q->refs, 1);
        mutex_init(&q->debugfs_mutex);
        mutex_init(&q->elevator_lock);
        mutex_init(&q->sysfs_lock);
        mutex_init(&q->limits_lock);
        mutex_init(&q->rq_qos_mutex);
        spin_lock_init(&q->queue_lock);

        init_waitqueue_head(&q->mq_freeze_wq);
        mutex_init(&q->mq_freeze_lock);

        blkg_init_queue(q);

        /*
         * Init percpu_ref in atomic mode so that it's faster to shutdown.
         * See blk_register_queue() for details.
         */
        error = percpu_ref_init(&q->q_usage_counter,
                                blk_queue_usage_counter_release,
                                PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
        if (error)
                goto fail_stats;
        /* No failure path follows, so the lockdep keys need no unwind here. */
        lockdep_register_key(&q->io_lock_cls_key);
        lockdep_register_key(&q->q_lock_cls_key);
        lockdep_init_map(&q->io_lockdep_map, "&q->q_usage_counter(io)",
                         &q->io_lock_cls_key, 0);
        lockdep_init_map(&q->q_lockdep_map, "&q->q_usage_counter(queue)",
                         &q->q_lock_cls_key, 0);

        /* Teach lockdep about lock ordering (reclaim WRT queue freeze lock). */
        fs_reclaim_acquire(GFP_KERNEL);
        rwsem_acquire_read(&q->io_lockdep_map, 0, 0, _RET_IP_);
        rwsem_release(&q->io_lockdep_map, _RET_IP_);
        fs_reclaim_release(GFP_KERNEL);

        q->nr_requests = BLKDEV_DEFAULT_RQ;

        return q;

        /* Unwind in reverse order of allocation. */
fail_stats:
        blk_free_queue_stats(q->stats);
fail_id:
        ida_free(&blk_queue_ida, q->id);
fail_q:
        kmem_cache_free(blk_requestq_cachep, q);
        return ERR_PTR(error);
}

/**
 * blk_get_queue - increment the request_queue refcount
 * @q: the request_queue structure to increment the refcount for
 *
 * Takes an additional reference on @q unless the queue is already dying.
 *
 * Context: Any context.
 *
 * Return: true if a reference was taken, false if the queue is dying.
 */
bool blk_get_queue(struct request_queue *q)
{
        if (likely(!blk_queue_dying(q))) {
                refcount_inc(&q->refs);
                return true;
        }

        return false;
}
EXPORT_SYMBOL(blk_get_queue);

#ifdef CONFIG_FAIL_MAKE_REQUEST

static DECLARE_FAULT_ATTR(fail_make_request);

/*
 * Handler for the "fail_make_request=" boot parameter: passes the option
 * string to setup_fault_attr() for the fail_make_request fault attribute and
 * returns its result.
 */
static int __init setup_fail_make_request(char *str)
{
        return setup_fault_attr(&fail_make_request, str);
}
__setup("fail_make_request=", setup_fail_make_request);

bool should_fail_request(struct block_device *part, unsigned int bytes)
{
        return bdev_test_flag(part, BD_MAKE_IT_FAIL) &&
               should_fail(&fail_make_request, bytes);
}

/*
 * Create the "fail_make_request" debugfs attribute for the fault attribute
 * of the same name.  Returns 0 on success or the error carried by the
 * returned dentry pointer.
 */
static int __init fail_make_request_debugfs(void)
{
        struct dentry *dir;

        dir = fault_create_debugfs_attr("fail_make_request", NULL,
                                        &fail_make_request);
        return PTR_ERR_OR_ZERO(dir);
}

late_initcall(fail_make_request_debugfs);
#endif /* CONFIG_FAIL_MAKE_REQUEST */

/*
 * Warn, once per block device, about a write bio aimed at a read-only
 * device.  The bio itself is not modified or failed here.
 */
static inline void bio_check_ro(struct bio *bio)
{
        /* Only writes to read-only devices are of interest. */
        if (!op_is_write(bio_op(bio)) || !bdev_read_only(bio->bi_bdev))
                return;

        /* A flush that carries no data is let through silently. */
        if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
                return;

        /* Already warned about this device. */
        if (bdev_test_flag(bio->bi_bdev, BD_RO_WARNED))
                return;

        bdev_set_flag(bio->bi_bdev, BD_RO_WARNED);

        /*
         * Using an ioctl to set the underlying disk of raid/dm to read-only
         * will trigger this.
         */
        pr_warn("Trying to write to read-only block-device %pg\n",
                bio->bi_bdev);
}

/*
 * Fault-injection hook for bios: returns -EIO when should_fail_request()
 * says the bio, checked against the whole device, should fail, 0 otherwise.
 */
static noinline int should_fail_bio(struct bio *bio)
{
        return should_fail_request(bdev_whole(bio->bi_bdev),
                                   bio->bi_iter.bi_size) ? -EIO : 0;
}
ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);

/*
 * Check whether this bio extends beyond the end of the device or partition.
 * This may well happen - the kernel calls bread() without checking the size of
 * the device, e.g., when mounting a file system.
 *
 * Returns 0 when the bio fits (a bio without sectors always fits) and -EIO,
 * after a rate-limited log message, when it does not.
 */
static inline int bio_check_eod(struct bio *bio)
{
        sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
        unsigned int nr_sectors = bio_sectors(bio);

        if (!nr_sectors)
                return 0;

        /* Compare against maxsector - nr_sectors only once that is safe. */
        if (nr_sectors <= maxsector &&
            bio->bi_iter.bi_sector <= maxsector - nr_sectors)
                return 0;

        pr_info_ratelimited("%s: attempt to access beyond end of device\n"
                            "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
                            current->comm, bio->bi_bdev, bio->bi_opf,
                            bio->bi_iter.bi_sector, nr_sectors, maxsector);
        return -EIO;
}

/*
 * Remap block n of partition p to block n+start(p) of the disk.
 *
 * Returns -EIO when fault injection asks for the request to fail, otherwise
 * 0 with BIO_REMAPPED set on the bio.  Bios without sectors keep their
 * sector untouched.
 */
static int blk_partition_remap(struct bio *bio)
{
        struct block_device *part = bio->bi_bdev;

        if (unlikely(should_fail_request(part, bio->bi_iter.bi_size)))
                return -EIO;

        if (bio_sectors(bio)) {
                bio->bi_iter.bi_sector += part->bd_start_sect;
                /* The trace event gets the pre-remap sector as last argument. */
                trace_block_bio_remap(bio, part->bd_dev,
                                      bio->bi_iter.bi_sector -
                                      part->bd_start_sect);
        }

        bio_set_flag(bio, BIO_REMAPPED);
        return 0;
}

/*
 * Check write append to a zoned block device.
 *
 * Returns BLK_STS_NOTSUPP when the device is not zoned, BLK_STS_IOERR when
 * the bio does not start at a zone start or exceeds one of the size limits,
 * and BLK_STS_OK otherwise.  On success REQ_NOMERGE is set on the bio.
 */
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
                                                 struct bio *bio)
{
        int nr_sectors = bio_sectors(bio);

        /* Only applicable to zoned block devices */
        if (!bdev_is_zoned(bio->bi_bdev))
                return BLK_STS_NOTSUPP;

        /* The bio sector must point to the start of a sequential zone */
        if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector))
                return BLK_STS_IOERR;

        /*
         * Two size limits apply.  The BIO must not cross zone boundaries,
         * otherwise it would be split and could result in non-contiguous
         * sectors being written in different zones.  It must also be small
         * enough for a zone append so that it will not get split.
         */
        if (nr_sectors > q->limits.chunk_sectors ||
            nr_sectors > q->limits.max_zone_append_sectors)
                return BLK_STS_IOERR;

        bio->bi_opf |= REQ_NOMERGE;
        return BLK_STS_OK;
}

/*
 * Hand a single bio to its driver.
 *
 * Devices without BD_HAS_SUBMIT_BIO go through blk_mq_submit_bio().  For the
 * others the queue is entered first and the disk's ->submit_bio method is
 * called, unless the bio is REQ_POLLED and the queue lacks BLK_FEAT_POLL, in
 * which case the bio is completed with BLK_STS_NOTSUPP.
 */
static void __submit_bio(struct bio *bio)
{
        /* If plug is not used, add new plug here to cache nsecs time. */
        struct blk_plug plug;

        if (unlikely(!blk_crypto_bio_prep(&bio)))
                return;

        blk_start_plug(&plug);

        if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) {
                if (likely(bio_queue_enter(bio) == 0)) {
                        /* Look up the disk before the bio is handed off. */
                        struct gendisk *disk = bio->bi_bdev->bd_disk;

                        if (!(bio->bi_opf & REQ_POLLED) ||
                            (disk->queue->limits.features & BLK_FEAT_POLL)) {
                                disk->fops->submit_bio(bio);
                        } else {
                                bio->bi_status = BLK_STS_NOTSUPP;
                                bio_endio(bio);
                        }
                        blk_queue_exit(disk->queue);
                }
        } else {
                blk_mq_submit_bio(bio);
        }

        blk_finish_plug(&plug);
}

/*
 * The loop in this function may be a bit non-obvious, and so deserves some
 * explanation:
 *
 *  - Before entering the loop, bio->bi_next is NULL (as all callers ensure
 *    that), so we have a list with a single bio.
 *  - We pretend that we have just taken it off a longer list, so we assign
 *    bio_list to a pointer to the bio_list_on_stack, thus initialising the
 *    bio_list of new bios to be added.  ->submit_bio() may indeed add some more
 *    bios through a recursive call to submit_bio_noacct.  If it did, we find a
 *    non-NULL value in bio_list and re-enter the loop from the top.
 *  - In this case we really did just take the bio off the top of the list (no
 *    pretending) and so remove it from bio_list, and call into ->submit_bio()
 *    again.
 *
 * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
 * bio_list_on_stack[1] contains bios that were submitted before the current
 *        ->submit_bio, but that haven't been processed yet.
 */
static void __submit_bio_noacct(struct bio *bio)
{
        struct bio_list bio_list_on_stack[2];

        BUG_ON(bio->bi_next);

        bio_list_init(&bio_list_on_stack[0]);
        /* From here on submit_bio_noacct_nocheck() queues bios on this list. */
        current->bio_list = bio_list_on_stack;

        do {
                /* Remember the queue of the bio being submitted for sorting. */
                struct request_queue *q = bdev_get_queue(bio->bi_bdev);
                struct bio_list lower, same;

                /*
                 * Create a fresh bio_list for all subordinate requests.
                 */
                bio_list_on_stack[1] = bio_list_on_stack[0];
                bio_list_init(&bio_list_on_stack[0]);

                __submit_bio(bio);

                /*
                 * Sort new bios into those for a lower level and those for the
                 * same level.
                 */
                bio_list_init(&lower);
                bio_list_init(&same);
                while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
                        if (q == bdev_get_queue(bio->bi_bdev))
                                bio_list_add(&same, bio);
                        else
                                bio_list_add(&lower, bio);

                /*
                 * Now assemble so we handle the lowest level first.
                 */
                bio_list_merge(&bio_list_on_stack[0], &lower);
                bio_list_merge(&bio_list_on_stack[0], &same);
                bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
        } while ((bio = bio_list_pop(&bio_list_on_stack[0])));

        /* The on-stack lists go out of scope: stop collecting bios. */
        current->bio_list = NULL;
}

/*
 * Submission loop used by submit_bio_noacct_nocheck() for devices without
 * BD_HAS_SUBMIT_BIO.  While it runs, current->bio_list points at an on-stack
 * list; @bio is submitted first, followed by whatever ends up on the first
 * of the two lists, until that list is empty.
 */
static void __submit_bio_noacct_mq(struct bio *bio)
{
        struct bio_list pending[2] = { };

        current->bio_list = pending;

        do {
                __submit_bio(bio);
                bio = bio_list_pop(&pending[0]);
        } while (bio);

        current->bio_list = NULL;
}

/*
 * Submit a bio without running the checks done by submit_bio_noacct().
 *
 * Performs the blk-cgroup start/issue bookkeeping, traces the queueing once
 * per bio and then either defers the bio to the active submission loop of
 * the current task or starts a new loop.
 */
void submit_bio_noacct_nocheck(struct bio *bio)
{
        blk_cgroup_bio_start(bio);
        blkcg_bio_issue_init(bio);

        if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
                trace_block_bio_queue(bio);
                /*
                 * Now that enqueuing has been traced, we need to trace
                 * completion as well.
                 */
                bio_set_flag(bio, BIO_TRACE_COMPLETION);
        }

        /*
         * We only want one ->submit_bio to be active at a time, else stack
         * usage with stacked devices could be a problem.  Use current->bio_list
         * to collect a list of requests submitted by a ->submit_bio method
         * while it is active, and then process them after it returned.
         */
        if (current->bio_list) {
                bio_list_add(&current->bio_list[0], bio);
                return;
        }

        if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
                __submit_bio_noacct(bio);
        else
                __submit_bio_noacct_mq(bio);
}

/*
 * Validate the size of an atomic write bio against the queue's atomic write
 * unit limits: it must not exceed the maximum unit and must be a multiple of
 * the minimum unit.  Returns BLK_STS_INVAL on violation, else BLK_STS_OK.
 */
static blk_status_t blk_validate_atomic_write_op_size(struct request_queue *q,
                                                 struct bio *bio)
{
        unsigned int size = bio->bi_iter.bi_size;

        /* The maximum is checked first; the modulo is only reached after it. */
        if (size > queue_atomic_write_unit_max_bytes(q) ||
            size % queue_atomic_write_unit_min_bytes(q))
                return BLK_STS_INVAL;

        return BLK_STS_OK;
}

/**
 * submit_bio_noacct - re-submit a bio to the block device layer for I/O
 * @bio:  The bio describing the location in memory and on the device.
 *
 * This is a version of submit_bio() that shall only be used for I/O that is
 * resubmitted to lower level drivers by stacking block drivers.  All file
 * systems and other upper level users of the block layer should use
 * submit_bio() instead.
 *
 * The bio is validated first.  A bio that fails validation is completed
 * right here through bio_endio() with bi_status set accordingly and is not
 * passed on.
 */
void submit_bio_noacct(struct bio *bio)
{
        struct block_device *bdev = bio->bi_bdev;
        struct request_queue *q = bdev_get_queue(bdev);
        /* Status used by every "goto end_io" that does not set its own. */
        blk_status_t status = BLK_STS_IOERR;

        might_sleep();

        /*
         * For a REQ_NOWAIT based request, return -EOPNOTSUPP
         * if queue does not support NOWAIT.
         */
        if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
                goto not_supported;

        if (should_fail_bio(bio))
                goto end_io;
        bio_check_ro(bio);
        /* End-of-device check and partition remap happen only once per bio. */
        if (!bio_flagged(bio, BIO_REMAPPED)) {
                if (unlikely(bio_check_eod(bio)))
                        goto end_io;
                if (bdev_is_partition(bdev) &&
                    unlikely(blk_partition_remap(bio)))
                        goto end_io;
        }

        /*
         * Filter flush bio's early so that bio based drivers without flush
         * support don't have to worry about them.
         */
        if (op_is_flush(bio->bi_opf)) {
                if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
                                 bio_op(bio) != REQ_OP_ZONE_APPEND))
                        goto end_io;
                if (!bdev_write_cache(bdev)) {
                        /* No write cache: strip the flush flags. */
                        bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
                        if (!bio_sectors(bio)) {
                                /* Nothing left to do: complete successfully. */
                                status = BLK_STS_OK;
                                goto end_io;
                        }
                }
        }

        /* Per-operation checks against what the device supports. */
        switch (bio_op(bio)) {
        case REQ_OP_READ:
                break;
        case REQ_OP_WRITE:
                if (bio->bi_opf & REQ_ATOMIC) {
                        status = blk_validate_atomic_write_op_size(q, bio);
                        if (status != BLK_STS_OK)
                                goto end_io;
                }
                break;
        case REQ_OP_FLUSH:
                /*
                 * REQ_OP_FLUSH can't be submitted through bios, it is only
                 * synthetized in struct request by the flush state machine.
                 */
                goto not_supported;
        case REQ_OP_DISCARD:
                if (!bdev_max_discard_sectors(bdev))
                        goto not_supported;
                break;
        case REQ_OP_SECURE_ERASE:
                if (!bdev_max_secure_erase_sectors(bdev))
                        goto not_supported;
                break;
        case REQ_OP_ZONE_APPEND:
                status = blk_check_zone_append(q, bio);
                if (status != BLK_STS_OK)
                        goto end_io;
                break;
        case REQ_OP_WRITE_ZEROES:
                if (!q->limits.max_write_zeroes_sectors)
                        goto not_supported;
                break;
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_OPEN:
        case REQ_OP_ZONE_CLOSE:
        case REQ_OP_ZONE_FINISH:
        case REQ_OP_ZONE_RESET_ALL:
                if (!bdev_is_zoned(bio->bi_bdev))
                        goto not_supported;
                break;
        case REQ_OP_DRV_IN:
        case REQ_OP_DRV_OUT:
                /*
                 * Driver private operations are only used with passthrough
                 * requests.
                 */
                fallthrough;
        default:
                goto not_supported;
        }

        /*
         * NOTE(review): a true return from blk_throtl_bio() presumably means
         * the throttling code has taken over the bio - confirm.
         */
        if (blk_throtl_bio(bio))
                return;
        submit_bio_noacct_nocheck(bio);
        return;

not_supported:
        status = BLK_STS_NOTSUPP;
end_io:
        bio->bi_status = status;
        bio_endio(bio);
}
EXPORT_SYMBOL(submit_bio_noacct);

/*
 * Make sure @bio carries an I/O priority before it is submitted, then call
 * blkcg_set_ioprio() on it.
 */
static void bio_set_ioprio(struct bio *bio)
{
        /* Nobody set ioprio so far? Initialize it based on task's nice value */
        if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
                bio->bi_ioprio = get_current_ioprio();
        blkcg_set_ioprio(bio);
}

/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @bio: The &struct bio which describes the I/O
 *
 * submit_bio() is used to submit I/O requests to block devices.  It is passed a
 * fully set up &struct bio that describes the I/O that needs to be done.  The
 * bio will be sent to the device described by the bi_bdev field.
 *
 * Reads and writes are accounted in the PGPGIN/PGPGOUT vm events (reads also
 * in the task's I/O accounting) before the bio is passed on.
 *
 * The success/failure status of the request, along with notification of
 * completion, is delivered asynchronously through the ->bi_end_io() callback
 * in @bio.  The bio must NOT be touched by the caller until ->bi_end_io() has
 * been called.
 */
void submit_bio(struct bio *bio)
{
        if (bio_op(bio) == REQ_OP_WRITE) {
                count_vm_events(PGPGOUT, bio_sectors(bio));
        } else if (bio_op(bio) == REQ_OP_READ) {
                task_io_account_read(bio->bi_iter.bi_size);
                count_vm_events(PGPGIN, bio_sectors(bio));
        }

        bio_set_ioprio(bio);
        submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);

/**
 * bio_poll - poll for BIO completions
 * @bio: bio to poll for
 * @iob: batches of IO
 * @flags: BLK_POLL_* flags that control the behavior
 *
 * Poll for completions on queue associated with the bio. Returns number of
 * completed entries found.
 *
 * Returns 0 without polling when the bio has no block device or no cookie,
 * or when no reference on the queue's usage counter can be obtained.
 *
 * Note: the caller must either be the context that submitted @bio, or
 * be in a RCU critical section to prevent freeing of @bio.
 */
int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
        blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
        struct block_device *bdev;
        struct request_queue *q;
        int ret = 0;

        bdev = READ_ONCE(bio->bi_bdev);
        if (!bdev)
                return 0;

        q = bdev_get_queue(bdev);
        if (cookie == BLK_QC_T_NONE)
                return 0;

        blk_flush_plug(current->plug, false);

        /*
         * We need to be able to enter a frozen queue, similar to how
         * timeouts also need to do that. If that is blocked, then we can
         * have pending IO when a queue freeze is started, and then the
         * wait for the freeze to finish will wait for polled requests to
         * timeout as the poller is prevented from entering the queue and
         * completing them. As long as we prevent new IO from being queued,
         * that should be all that matters.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return 0;
        if (queue_is_mq(q)) {
                ret = blk_mq_poll(q, cookie, iob, flags);
        } else {
                struct gendisk *disk = q->disk;

                /* Poll only if the queue advertises it and a method exists. */
                if ((q->limits.features & BLK_FEAT_POLL) && disk &&
                    disk->fops->poll_bio)
                        ret = disk->fops->poll_bio(bio, iob, flags);
        }
        /* Drop the usage counter reference taken above. */
        blk_queue_exit(q);
        return ret;
}
EXPORT_SYMBOL_GPL(bio_poll);

/*
 * Helper to implement file_operations.iopoll.  Requires the bio to be stored
 * in iocb->private, and cleared before freeing the bio.
 *
 * Returns the result of bio_poll(), or 0 when no bio is stored.
 */
int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
                    unsigned int flags)
{
        struct bio *bio;
        int ret = 0;

        /*
         * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
         * point to a freshly allocated bio at this point.  If that happens
         * we have a few cases to consider:
         *
         *  1) the bio is being initialized and bi_bdev is NULL.  We can
         *     simply do nothing in this case
         *  2) the bio points to a not poll enabled device.  bio_poll will catch
         *     this and return 0
         *  3) the bio points to a poll capable device, including but not
         *     limited to the one that the original bio pointed to.  In this
         *     case we will call into the actual poll method and poll for I/O,
         *     even if we don't need to, but it won't cause harm either.
         *
         * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
         * is still allocated. Because partitions hold a reference to the whole
         * device bdev and thus disk, the disk is also still valid.  Grabbing
         * a reference to the queue in bio_poll() ensures the hctxs and requests
         * are still valid as well.
         */
        rcu_read_lock();
        bio = READ_ONCE(kiocb->private);
        if (bio)
                ret = bio_poll(bio, iob, flags);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL_GPL(iocb_bio_iopoll);

/*
 * Advance the bd_stamp of @part to @now and account the elapsed time as
 * io_ticks.
 *
 * The time is only added when @now is after the stored stamp, this caller
 * wins the cmpxchg on bd_stamp, and either @end is set or the device has I/O
 * in flight.  For a partition the same is then repeated for the whole
 * device.
 */
void update_io_ticks(struct block_device *part, unsigned long now, bool end)
{
        for (;;) {
                unsigned long stamp = READ_ONCE(part->bd_stamp);

                if (unlikely(time_after(now, stamp)) &&
                    likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) &&
                    (end || part_in_flight(part)))
                        __part_stat_add(part, io_ticks, now - stamp);

                if (!bdev_is_partition(part))
                        break;

                part = bdev_whole(part);
        }
}

/*
 * Start I/O accounting on @bdev for an operation of type @op.
 *
 * Under part_stat_lock(), updates the io ticks with @start_time and
 * increments the in-flight counter selected by op_is_write(@op).
 *
 * Returns @start_time unchanged.
 */
unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
                                 unsigned long start_time)
{
        part_stat_lock();
        update_io_ticks(bdev, start_time, false);
        part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
        part_stat_unlock();

        return start_time;
}
EXPORT_SYMBOL(bdev_start_io_acct);

/**
 * bio_start_io_acct - start I/O accounting for bio based drivers
 * @bio:        bio to start account for
 *
 * Returns the start time that should be passed back to bio_end_io_acct().
 */
unsigned long bio_start_io_acct(struct bio *bio)
{
        /* The current jiffies value serves as the start time. */
        return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
}
EXPORT_SYMBOL_GPL(bio_start_io_acct);

/*
 * Finish I/O accounting on @bdev for an operation of type @op.
 *
 * @sectors is the number of sectors to account and @start_time the jiffies
 * value the I/O was started at.  Under part_stat_lock() this updates the io
 * ticks, the I/O count, sector count and duration (in nanoseconds) of the
 * stat group of @op, and decrements the matching in-flight counter.
 */
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
                      unsigned int sectors, unsigned long start_time)
{
        const int group = op_stat_group(op);
        const int rw = op_is_write(op);
        unsigned long now = READ_ONCE(jiffies);

        part_stat_lock();
        update_io_ticks(bdev, now, true);
        part_stat_inc(bdev, ios[group]);
        part_stat_add(bdev, sectors[group], sectors);
        part_stat_add(bdev, nsecs[group], jiffies_to_nsecs(now - start_time));
        part_stat_local_dec(bdev, in_flight[rw]);
        part_stat_unlock();
}
EXPORT_SYMBOL(bdev_end_io_acct);

/*
 * Finish I/O accounting for @bio against @orig_bdev instead of the bio's
 * current bi_bdev.  The operation and sector count are taken from @bio;
 * @start_time is the value obtained when accounting was started.
 */
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
                              struct block_device *orig_bdev)
{
        bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);

/**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked
 *
 * Description:
 *    Check if underlying low-level drivers of a device are busy.
 *    A driver exports its busy state by providing a ->busy callback in its
 *    mq_ops.  Queues that are not blk-mq queues, or whose driver provides no
 *    such callback, are always reported as not busy.
 *
 *    Basically, this function is used only by request stacking drivers
 *    to stop dispatching requests to underlying devices when underlying
 *    devices are busy.  This behavior helps more I/O merging on the queue
 *    of the request stacking driver and prevents I/O throughput regression
 *    on burst I/O load.
 *
 * Return:
 *    0 - Not busy (The request stacking driver should dispatch request)
 *    1 - Busy (The request stacking driver should stop dispatching request)
 */
int blk_lld_busy(struct request_queue *q)
{
        if (!queue_is_mq(q) || !q->mq_ops->busy)
                return 0;

        return q->mq_ops->busy(q);
}
EXPORT_SYMBOL_GPL(blk_lld_busy);

/*
 * Queue @work on kblockd_workqueue.  Returns the result of queue_work().
 */
int kblockd_schedule_work(struct work_struct *work)
{
        return queue_work(kblockd_workqueue, work);
}
EXPORT_SYMBOL(kblockd_schedule_work);

/*
 * Call mod_delayed_work_on() for @dwork on kblockd_workqueue with the given
 * @cpu and @delay, and return its result.
 */
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
                                unsigned long delay)
{
        return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay);
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);

/*
 * Initialise @plug and install it as the current task's plug.
 *
 * @nr_ios is stored in the plug, capped at BLK_MAX_REQUEST_COUNT.  If the
 * task already has a plug installed, @plug is left completely untouched.
 */
void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
{
        struct task_struct *self = current;

        /* Nested plug: the one already installed stays in charge. */
        if (self->plug)
                return;

        /* Empty request lists and callback list. */
        rq_list_init(&plug->mq_list);
        rq_list_init(&plug->cached_rqs);
        INIT_LIST_HEAD(&plug->cb_list);

        /* Counters and flags start from a clean slate. */
        plug->cur_ktime = 0;
        plug->rq_count = 0;
        plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
        plug->has_elevator = false;
        plug->multiple_queues = false;

        /*
         * Publish the plug last, once it is fully initialised.  Store
         * ordering should not be needed here, since a potential preempt
         * will imply a full memory barrier.
         */
        self->plug = plug;
}

/**
 * blk_start_plug - initialize blk_plug and track it inside the task_struct
 * @plug:        The &struct blk_plug that needs to be initialized
 *
 * Description:
 *   blk_start_plug() indicates to the block layer an intent by the caller
 *   to submit multiple I/O requests in a batch.  The block layer may use
 *   this hint to defer submitting I/Os from the caller until blk_finish_plug()
 *   is called.  However, the block layer may choose to submit requests
 *   before a call to blk_finish_plug() if the number of queued I/Os
 *   exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than
 *   %BLK_PLUG_FLUSH_SIZE.  The queued I/Os may also be submitted early if
 *   the task schedules (see below).
 *
 *   Tracking blk_plug inside the task_struct will help with auto-flushing the
 *   pending I/O should the task end up blocking between blk_start_plug() and
 *   blk_finish_plug(). This is important from a performance perspective, but
 *   also ensures that we don't deadlock. For instance, if the task is blocking
 *   for a memory allocation, memory reclaim could end up wanting to free a
 *   page belonging to that request that is currently residing in our private
 *   plug. By flushing the pending I/O when the process goes to sleep, we avoid
 *   this kind of deadlock.
 */
void blk_start_plug(struct blk_plug *plug)
{
        /* A plain plug is one started with an nr_ios value of 1. */
        const unsigned short single_io = 1;

        blk_start_plug_nr_ios(plug, single_io);
}
EXPORT_SYMBOL(blk_start_plug);

/*
 * Run every callback queued on @plug->cb_list, passing @from_schedule through.
 *
 * The list is drained in batches: the pending entries are moved to a private
 * list, each entry is unlinked and then invoked, and plug->cb_list is checked
 * again afterwards in case it was refilled while the batch was running.
 */
static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule)
{
        struct blk_plug_cb *cb;
        LIST_HEAD(batch);

        for (;;) {
                if (list_empty(&plug->cb_list))
                        return;

                list_splice_init(&plug->cb_list, &batch);

                while (!list_empty(&batch)) {
                        cb = list_first_entry(&batch, struct blk_plug_cb,
                                              list);
                        /* Unlink before the call; cb is not touched after. */
                        list_del(&cb->list);
                        cb->callback(cb, from_schedule);
                }
        }
}

/*
 * Find or create the plug callback for the (@unplug, @data) pair on the
 * current task's plug.
 *
 * If a matching callback is already on the plug's cb_list it is returned.
 * Otherwise a zeroed object of @size bytes is allocated with GFP_ATOMIC,
 * filled in and added to the list.  @size must be at least
 * sizeof(struct blk_plug_cb); a smaller value triggers BUG_ON().
 *
 * Returns NULL when the task has no plug or the allocation fails.
 */
struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
                                      int size)
{
        struct blk_plug *plug = current->plug;
        struct blk_plug_cb *cb;

        if (!plug)
                return NULL;

        list_for_each_entry(cb, &plug->cb_list, list) {
                if (cb->callback != unplug || cb->data != data)
                        continue;
                return cb;
        }

        /* Not currently on the callback list */
        BUG_ON(size < sizeof(*cb));
        cb = kzalloc(size, GFP_ATOMIC);
        if (!cb)
                return NULL;

        cb->callback = unplug;
        cb->data = data;
        list_add(&cb->list, &plug->cb_list);

        return cb;
}
EXPORT_SYMBOL(blk_check_plugged);

/*
 * Flush everything held by @plug: run its queued callbacks, flush its
 * request list via blk_mq_flush_plug_list(), free any cached requests, then
 * reset the plug's cur_ktime and clear PF_BLOCK_TS on the current task.
 * @from_schedule is passed through to the callbacks and to the list flush.
 */
void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
        /* Callbacks run first, before the plugged requests are flushed. */
        if (!list_empty(&plug->cb_list))
                flush_plug_callbacks(plug, from_schedule);
        blk_mq_flush_plug_list(plug, from_schedule);
        /*
         * Unconditionally flush out cached requests, even if the unplug
         * event came from schedule. Since we now hold references to the
         * queue for cached requests, we don't want a blocked task holding
         * up a queue freeze/quiesce event.
         */
        if (unlikely(!rq_list_empty(&plug->cached_rqs)))
                blk_mq_free_plug_rqs(plug);

        plug->cur_ktime = 0;
        current->flags &= ~PF_BLOCK_TS;
}

/**
 * blk_finish_plug - mark the end of a batch of submitted I/O
 * @plug:        The &struct blk_plug passed to blk_start_plug()
 *
 * Description:
 * Indicate that a batch of I/O submissions is complete.  This function
 * must be paired with an initial call to blk_start_plug().  The intent
 * is to allow the block layer to optimize I/O submission.  See the
 * documentation for blk_start_plug() for more information.
 */
void blk_finish_plug(struct blk_plug *plug)
{
        /*
         * Only the plug that is actually installed on the task gets flushed.
         * A nested plug was never installed by blk_start_plug_nr_ios(), so
         * finishing it is a no-op.
         */
        if (plug != current->plug)
                return;

        __blk_flush_plug(plug, false);
        current->plug = NULL;
}
EXPORT_SYMBOL(blk_finish_plug);

/*
 * Sleep waiting for I/O, for at most half of the hung-task timeout at a
 * time.  When that computes to zero, sleep with a plain io_schedule()
 * instead.
 */
void blk_io_schedule(void)
{
        /* Prevent hang_check timer from firing at us during very long I/O */
        unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

        if (!timeout) {
                io_schedule();
                return;
        }

        io_schedule_timeout(timeout);
}
EXPORT_SYMBOL_GPL(blk_io_schedule);

/*
 * Block layer init: build-time sanity checks on the request op/flag bit
 * layout, then creation of the kblockd workqueue, the request_queue slab
 * cache and the "block" debugfs directory.  Always returns 0; failure to
 * create kblockd panics instead of returning an error.
 */
int __init blk_dev_init(void)
{
        /* Every op value up to REQ_OP_LAST must fit in REQ_OP_BITS bits. */
        BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
        /* Op bits plus flag bits must fit in request::cmd_flags ... */
        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
                        sizeof_field(struct request, cmd_flags));
        /* ... and in bio::bi_opf. */
        BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
                        sizeof_field(struct bio, bi_opf));

        /* used for unplugging and affects IO latency/throughput - HIGHPRI */
        kblockd_workqueue = alloc_workqueue("kblockd",
                                            WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
        if (!kblockd_workqueue)
                panic("Failed to create kblockd\n");

        /* No NULL check here; SLAB_PANIC presumably panics on failure. */
        blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);

        /* The return value of debugfs_create_dir() is stored unchecked. */
        blk_debugfs_root = debugfs_create_dir("block", NULL);

        return 0;
}




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   34 















  124 









   16 






























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>
#include <asm/processor.h>
#include <linux/thread_info.h>
#include <linux/preempt.h>
#include <linux/cpumask_types.h>

#include <linux/cache.h>
#include <linux/irqflags_types.h>
#include <linux/smp_types.h>
#include <linux/pid_types.h>
#include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
#include <linux/mutex_types.h>
#include <linux/plist_types.h>
#include <linux/hrtimer_types.h>
#include <linux/timer_types.h>
#include <linux/seccomp_types.h>
#include <linux/nodemask_types.h>
#include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
#include <uapi/linux/rseq.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/livepatch_sched.h>
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <asm/kmap_size.h>

/*
 * task_struct member predeclarations (sorted alphabetically).
 *
 * Only pointers to these types are needed here, so forward declarations are
 * enough.  Keep the list sorted when adding entries.
 */
struct audit_context;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_net_context;
struct bpf_run_ctx;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_ctx_data;
struct perf_event_context;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
struct task_struct;
struct user_event_mm;

#include <linux/sched/ext.h>

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->__state
 * is about runnability, while task->exit_state is
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->__state: */
#define TASK_RUNNING                        0x00000000
#define TASK_INTERRUPTIBLE                0x00000001
#define TASK_UNINTERRUPTIBLE                0x00000002
#define __TASK_STOPPED                        0x00000004
#define __TASK_TRACED                        0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD                        0x00000010
#define EXIT_ZOMBIE                        0x00000020
#define EXIT_TRACE                        (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->__state again: */
#define TASK_PARKED                        0x00000040
#define TASK_DEAD                        0x00000080
#define TASK_WAKEKILL                        0x00000100
#define TASK_WAKING                        0x00000200
#define TASK_NOLOAD                        0x00000400
#define TASK_NEW                        0x00000800
#define TASK_RTLOCK_WAIT                0x00001000
#define TASK_FREEZABLE                        0x00002000
/* Evaluates to 0 unless CONFIG_LOCKDEP is enabled: */
#define __TASK_FREEZABLE_UNSAFE               (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
#define TASK_FROZEN                        0x00008000
/* First bit above the highest state bit defined here: */
#define TASK_STATE_MAX                        0x00010000

/* Mask with every bit below TASK_STATE_MAX set: */
#define TASK_ANY                        (TASK_STATE_MAX-1)

/*
 * DO NOT ADD ANY NEW USERS !
 *
 * Equal to plain TASK_FREEZABLE whenever __TASK_FREEZABLE_UNSAFE evaluates
 * to 0.
 */
#define TASK_FREEZABLE_UNSAFE                (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE                        (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED                        (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED                        __TASK_TRACED

/* TASK_UNINTERRUPTIBLE combined with TASK_NOLOAD: */
#define TASK_IDLE                        (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL                        (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/*
 * get_task_state(): mask of the state bits it uses (presumably the states it
 * can report -- confirm against fs/proc/array.c).  TASK_RUNNING is 0, so it
 * contributes no bit to the mask.
 */
#define TASK_REPORT                        (TASK_RUNNING | TASK_INTERRUPTIBLE | \
                                         TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
                                         __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
                                         TASK_PARKED)

#define task_is_running(task)                (READ_ONCE((task)->__state) == TASK_RUNNING)

/*
 * Job-control predicates: each evaluates to 1 when the corresponding
 * JOBCTL_* bit(s) are set in @task->jobctl, and to 0 otherwise.
 *
 * @task is parenthesized in the expansion so that any pointer expression
 * (e.g. "p + 1") can be passed, matching task_is_running().
 */
#define task_is_traced(task)                ((READ_ONCE((task)->jobctl) & JOBCTL_TRACED) != 0)
#define task_is_stopped(task)                ((READ_ONCE((task)->jobctl) & JOBCTL_STOPPED) != 0)
#define task_is_stopped_or_traced(task)        ((READ_ONCE((task)->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
 * Special states are those that do not use the normal wait-loop pattern. See
 * the comment with set_special_state().
 *
 * The macro evaluates to the special-state bits present in @state, so the
 * result is non-zero exactly when @state contains at least one of
 * __TASK_STOPPED, __TASK_TRACED, TASK_PARKED, TASK_DEAD or TASK_FROZEN.
 */
#define is_special_task_state(state)                                        \
        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED |        \
                    TASK_DEAD | TASK_FROZEN))

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
/*
 * Debug bookkeeping for task state changes.  Each helper records the
 * caller's instruction pointer (_THIS_IP_) in current->task_state_change;
 * the normal/special variants also warn once when the requested state
 * belongs to the other class.
 */
# define debug_normal_state_change(state_value) \
        do { \
                WARN_ON_ONCE(is_special_task_state(state_value)); \
                current->task_state_change = _THIS_IP_; \
        } while (0)

# define debug_special_state_change(state_value) \
        do { \
                WARN_ON_ONCE(!is_special_task_state(state_value)); \
                current->task_state_change = _THIS_IP_; \
        } while (0)

/* Stash the previous change site before recording the new one. */
# define debug_rtlock_wait_set_state() \
        do { \
                current->saved_state_change = current->task_state_change; \
                current->task_state_change = _THIS_IP_; \
        } while (0)

/* Put back the change site stashed by debug_rtlock_wait_set_state(). */
# define debug_rtlock_wait_restore_state() \
        do { \
                current->task_state_change = current->saved_state_change; \
        } while (0)

#else
/* Without CONFIG_DEBUG_ATOMIC_SLEEP all four helpers are empty statements. */
# define debug_normal_state_change(state_value)         do { } while (0)
# define debug_special_state_change(state_value)        do { } while (0)
# define debug_rtlock_wait_set_state()                  do { } while (0)
# define debug_rtlock_wait_restore_state()              do { } while (0)
#endif

/*
 * Forward a task state change to __trace_set_current_state(), but only when
 * the sched_set_state_tp tracepoint is enabled.
 */
#define trace_set_current_state(new_state) \
        do { \
                if (tracepoint_enabled(sched_set_state_tp)) { \
                        __trace_set_current_state(new_state); \
                } \
        } while (0)

/*
 * set_current_state() includes a barrier so that the write of current->__state
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
 *   for (;;) {
 *        set_current_state(TASK_UNINTERRUPTIBLE);
 *        if (CONDITION)
 *           break;
 *
 *        schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * If the caller does not need such serialisation (because, for instance, the
 * CONDITION test and condition change and wakeup are under the same lock) then
 * use __set_current_state().
 *
 * The above is typically ordered against the wakeup, which does:
 *
 *   CONDITION = 1;
 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
 *
 * where wake_up_state()/try_to_wake_up() executes a full memory barrier before
 * accessing p->__state.
 *
 * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is,
 * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
 * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
 * However, with slightly different timing the wakeup TASK_RUNNING store can
 * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not
 * a problem either because that will result in one extra go around the loop
 * and our @cond test will save the day.
 *
 * Also see the comments of try_to_wake_up().
 */
/*
 * Barrier-free variant: after the debug and trace hooks, the new state is
 * written to current->__state with a plain WRITE_ONCE().
 */
#define __set_current_state(state_value) \
        do { \
                debug_normal_state_change(state_value); \
                trace_set_current_state(state_value); \
                WRITE_ONCE(current->__state, (state_value)); \
        } while (0)

/*
 * Serialising variant: after the debug and trace hooks, the new state is
 * written to current->__state with smp_store_mb().
 */
#define set_current_state(state_value) \
        do { \
                debug_normal_state_change(state_value); \
                trace_set_current_state(state_value); \
                smp_store_mb(current->__state, (state_value)); \
        } while (0)

/*
 * set_special_state() should be used for those states when the blocking task
 * can not use the regular condition based wait-loop. In that case we must
 * serialize against wakeups such that any possible in-flight TASK_RUNNING
 * stores will not collide with our state change.
 */
/*
 * Write a special state to current->__state while holding current->pi_lock
 * with interrupts saved and disabled.  The debug and trace hooks run inside
 * the locked region, before the store.
 */
#define set_special_state(state_value) \
        do { \
                unsigned long flags; /* may shadow */ \
 \
                raw_spin_lock_irqsave(&current->pi_lock, flags); \
                debug_special_state_change(state_value); \
                trace_set_current_state(state_value); \
                WRITE_ONCE(current->__state, (state_value)); \
                raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
        } while (0)

/*
 * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
 *
 * RT's spin/rwlock substitutions are state preserving. The state of the
 * task when blocking on the lock is saved in task_struct::saved_state and
 * restored after the lock has been acquired.  These operations are
 * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
 * lock related wakeups while the task is blocked on the lock are
 * redirected to operate on task_struct::saved_state to ensure that these
 * are not dropped. On restore task_struct::saved_state is set to
 * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
 *
 * The lock operation looks like this:
 *
 *        current_save_and_set_rtlock_wait_state();
 *        for (;;) {
 *                if (try_lock())
 *                        break;
 *                raw_spin_unlock_irq(&lock->wait_lock);
 *                schedule_rtlock();
 *                raw_spin_lock_irq(&lock->wait_lock);
 *                set_current_state(TASK_RTLOCK_WAIT);
 *        }
 *        current_restore_rtlock_saved_state();
 */
/*
 * Save current->__state in current->saved_state and switch the task to
 * TASK_RTLOCK_WAIT, all under current->pi_lock.  Interrupts must already be
 * disabled (asserted via lockdep).
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro behaves as a single statement, including inside an
 * unbraced if/else.
 */
#define current_save_and_set_rtlock_wait_state()                        \
        do {                                                                \
                lockdep_assert_irqs_disabled();                                \
                raw_spin_lock(&current->pi_lock);                        \
                current->saved_state = current->__state;                \
                debug_rtlock_wait_set_state();                                \
                trace_set_current_state(TASK_RTLOCK_WAIT);                \
                WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);                \
                raw_spin_unlock(&current->pi_lock);                        \
        } while (0)

/*
 * Restore current->__state from current->saved_state and reset saved_state
 * to TASK_RUNNING, all under current->pi_lock.  Interrupts must already be
 * disabled (asserted via lockdep).
 *
 * The expansion deliberately has no trailing semicolon: the caller supplies
 * it, so the macro behaves as a single statement, including inside an
 * unbraced if/else.
 */
#define current_restore_rtlock_saved_state()                                \
        do {                                                                \
                lockdep_assert_irqs_disabled();                                \
                raw_spin_lock(&current->pi_lock);                        \
                debug_rtlock_wait_restore_state();                        \
                trace_set_current_state(current->saved_state);                \
                WRITE_ONCE(current->__state, current->saved_state);        \
                current->saved_state = TASK_RUNNING;                        \
                raw_spin_unlock(&current->pi_lock);                        \
        } while (0)

/* Read the current task's __state with READ_ONCE(). */
#define get_current_state()        READ_ONCE(current->__state)

/*
 * Define the task command name length as enum, then it can be visible to
 * BPF programs.
 */
enum { TASK_COMM_LEN = 16 };

/* Declaration only; the definition is not part of this header. */
extern void sched_tick(void);

/* LONG_MAX: the largest value a 'long timeout' argument below can hold. */
#define        MAX_SCHEDULE_TIMEOUT                LONG_MAX

/*
 * schedule_timeout() and its variants all take a long timeout and return a
 * long.
 * NOTE(review): presumably the variants differ in the task state used while
 * waiting (interruptible, killable, uninterruptible, idle) - confirm
 * against the definitions.
 */
extern long schedule_timeout(long timeout);
extern long schedule_timeout_interruptible(long timeout);
extern long schedule_timeout_killable(long timeout);
extern long schedule_timeout_uninterruptible(long timeout);
extern long schedule_timeout_idle(long timeout);
/* Scheduler entry points; schedule() and preempt_schedule_irq() are asmlinkage. */
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
asmlinkage void preempt_schedule_irq(void);
/* schedule_rtlock() is only declared when CONFIG_PREEMPT_RT is set. */
#ifdef CONFIG_PREEMPT_RT
 extern void schedule_rtlock(void);
#endif

/*
 * I/O wait scheduling helpers.
 *
 * The return value of io_schedule_prepare() must not be ignored
 * (__must_check).
 * NOTE(review): presumably that value is the token to hand back to
 * io_schedule_finish() - confirm against the definitions.
 */
extern int __must_check io_schedule_prepare(void);
extern void io_schedule_finish(int token);
extern long io_schedule_timeout(long timeout);
extern void io_schedule(void);

/*
 * Declares the sched_set_state_tp tracepoint together with an out-of-line
 * wrapper function, so that tracing can be done from this header file.
 * The wrapper takes the state value as an int.
 */
DECLARE_TRACEPOINT(sched_set_state_tp);
extern void __trace_set_current_state(int state_value);

/**
 * struct prev_cputime - snapshot of system and user cputime
 * @utime: time spent in user mode
 * @stime: time spent in system mode
 * @lock: protects the above two fields
 *
 * Stores previous user/system time values such that we can guarantee
 * monotonicity.
 *
 * The structure has no members when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is
 * enabled.
 */
struct prev_cputime {
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
        u64                                utime;
        u64                                stime;
        raw_spinlock_t                        lock;
#endif
};

/* vtime accounting state of a task. */
enum vtime_state {
        VTIME_INACTIVE  = 0,        /* Sleeping, or running on a CPU with VTIME inactive */
        VTIME_IDLE      = 1,        /* Idle */
        VTIME_SYS       = 2,        /* Running in kernelspace on a CPU with VTIME active */
        VTIME_USER      = 3,        /* Running in userspace on a CPU with VTIME active */
        VTIME_GUEST     = 4,        /* Running as a guest on a CPU with VTIME active */
};

/*
 * vtime accounting data of a task.
 * NOTE(review): seqcount presumably guards readers against concurrent
 * updates of the fields below - confirm against the users of this struct.
 */
struct vtime {
        seqcount_t                seqcount;
        unsigned long long        starttime;
        /* One of the enum vtime_state values */
        enum vtime_state        state;
        unsigned int                cpu;
        /* NOTE(review): presumably accumulated user/system/guest time - confirm */
        u64                        utime;
        u64                        stime;
        u64                        gtime;
};

/*
 * Utilization clamp constraints.
 *
 * @UCLAMP_MIN:        Minimum utilization
 * @UCLAMP_MAX:        Maximum utilization
 * @UCLAMP_CNT:        Number of utilization clamp constraints; must stay last
 *                     so that it keeps counting the entries above
 */
enum uclamp_id {
        UCLAMP_MIN,
        UCLAMP_MAX,
        UCLAMP_CNT,
};

#ifdef CONFIG_SMP
/* SMP: the objects and the lock helpers are only declared here. */
extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
extern void sched_domains_mutex_lock(void);
extern void sched_domains_mutex_unlock(void);
#else /* !CONFIG_SMP */
/* !SMP: the lock helpers are empty inline stubs. */
static inline void sched_domains_mutex_lock(void)
{
}

static inline void sched_domains_mutex_unlock(void)
{
}
#endif /* CONFIG_SMP */

/* Scheduling parameter block: holds a single integer priority. */
struct sched_param {
        int sched_priority;
};

/*
 * Scheduling counters and timestamps of a task.
 *
 * The structure has no members unless CONFIG_SCHED_INFO is enabled.
 */
struct sched_info {
#ifdef CONFIG_SCHED_INFO
        /* Cumulative counters: */

        /* # of times we have run on this CPU: */
        unsigned long                        pcount;

        /* Time spent waiting on a runqueue: */
        unsigned long long                run_delay;

        /* Max time spent waiting on a runqueue: */
        unsigned long long                max_run_delay;

        /* Min time spent waiting on a runqueue: */
        unsigned long long                min_run_delay;

        /* Timestamps: */

        /* When did we last run on a CPU? */
        unsigned long long                last_arrival;

        /* When were we last queued to run? */
        unsigned long long                last_queued;

#endif /* CONFIG_SCHED_INFO */
};

/*
 * Integer metrics need fixed point arithmetic; sched/fair has a few of
 * them: load, load_avg, util_avg, freq, and capacity.
 *
 * A basic fixed point range is defined here, and all of these metrics are
 * formalized on top of that basic range.
 */
#define SCHED_FIXEDPOINT_SHIFT                10
#define SCHED_FIXEDPOINT_SCALE                (1L << SCHED_FIXEDPOINT_SHIFT)

/* cpu_capacity calculations get the same, increased resolution */
#define SCHED_CAPACITY_SHIFT                SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE                (1L << SCHED_CAPACITY_SHIFT)

/*
 * Load weight of a scheduling entity.
 * NOTE(review): inv_weight is presumably a cached, precomputed inverse of
 * weight - confirm against the code that fills it in.
 */
struct load_weight {
        unsigned long                        weight;
        u32                                inv_weight;
};

/*
 * The load/runnable/util_avg accumulates an infinite geometric series
 * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
 *
 * [load_avg definition]
 *
 *   load_avg = runnable% * scale_load_down(load)
 *
 * [runnable_avg definition]
 *
 *   runnable_avg = runnable% * SCHED_CAPACITY_SCALE
 *
 * [util_avg definition]
 *
 *   util_avg = running% * SCHED_CAPACITY_SCALE
 *
 * where runnable% is the time ratio that a sched_entity is runnable and
 * running% the time ratio that a sched_entity is running.
 *
 * For cfs_rq, they are the aggregated values of all runnable and blocked
 * sched_entities.
 *
 * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU
 * capacity scaling. The scaling is done through the rq_clock_pelt that is used
 * for computing those signals (see update_rq_clock_pelt())
 *
 * N.B., the above ratios (runnable% and running%) themselves are in the
 * range of [0, 1]. To do fixed point arithmetic, we therefore scale them
 * to as large a range as necessary. This is for example reflected by
 * util_avg's SCHED_CAPACITY_SCALE.
 *
 * [Overflow issue]
 *
 * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
 * with the highest load (=88761), always runnable on a single cfs_rq,
 * and should not overflow as the number already hits PID_MAX_LIMIT.
 *
 * For all other cases (including 32-bit kernels), struct load_weight's
 * weight will overflow first before we do, because:
 *
 *    Max(load_avg) <= Max(load.weight)
 *
 * Then it is the load_weight's responsibility to consider overflow
 * issues.
 */
struct sched_avg {
        u64                                last_update_time;
        /* Running sums behind the averages below: */
        u64                                load_sum;
        u64                                runnable_sum;
        u32                                util_sum;
        u32                                period_contrib;
        /* The load/runnable/util averages: */
        unsigned long                        load_avg;
        unsigned long                        runnable_avg;
        unsigned long                        util_avg;
        /* The MSB of util_est can carry the UTIL_AVG_UNCHANGED flag */
        unsigned int                        util_est;
} ____cacheline_aligned;

/*
 * UTIL_AVG_UNCHANGED synchronizes util_est with util_avg updates: when a
 * task is dequeued, its util_est should not be updated if its util_avg has
 * not been updated in the meantime.
 *
 * The flag is stored in the MSB of util_est at dequeue time. Using the MSB
 * is safe because the max value of util_est for a task is 1024 (PELT
 * util_avg for a task).
 */
#define UTIL_EST_WEIGHT_SHIFT                2
#define UTIL_AVG_UNCHANGED                (1U << 31)

/*
 * Scheduler statistics of a task.
 *
 * The structure has no members unless CONFIG_SCHEDSTATS is enabled;
 * core_forceidle_sum additionally requires CONFIG_SCHED_CORE. The struct is
 * cache line aligned in either case.
 */
struct sched_statistics {
#ifdef CONFIG_SCHEDSTATS
        /* Wait and iowait statistics: */
        u64                                wait_start;
        u64                                wait_max;
        u64                                wait_count;
        u64                                wait_sum;
        u64                                iowait_count;
        u64                                iowait_sum;

        /* Sleep statistics: */
        u64                                sleep_start;
        u64                                sleep_max;
        s64                                sum_sleep_runtime;

        /* Block statistics: */
        u64                                block_start;
        u64                                block_max;
        s64                                sum_block_runtime;

        s64                                exec_max;
        u64                                slice_max;

        /* Migration counters: */
        u64                                nr_migrations_cold;
        u64                                nr_failed_migrations_affine;
        u64                                nr_failed_migrations_running;
        u64                                nr_failed_migrations_hot;
        u64                                nr_forced_migrations;

        /* Wakeup counters: */
        u64                                nr_wakeups;
        u64                                nr_wakeups_sync;
        u64                                nr_wakeups_migrate;
        u64                                nr_wakeups_local;
        u64                                nr_wakeups_remote;
        u64                                nr_wakeups_affine;
        u64                                nr_wakeups_affine_attempts;
        u64                                nr_wakeups_passive;
        u64                                nr_wakeups_idle;

#ifdef CONFIG_SCHED_CORE
        u64                                core_forceidle_sum;
#endif
#endif /* CONFIG_SCHEDSTATS */
} ____cacheline_aligned;

/*
 * Scheduling entity; struct task_struct embeds one as its 'se' member.
 *
 * The group scheduling members exist only with CONFIG_FAIR_GROUP_SCHED and
 * the load average tracking data only with CONFIG_SMP.
 */
struct sched_entity {
        /* For load-balancing: */
        struct load_weight                load;
        struct rb_node                        run_node;
        u64                                deadline;
        u64                                min_vruntime;
        u64                                min_slice;

        struct list_head                group_node;
        /* Four single-byte flags, followed by padding (the "hole") */
        unsigned char                        on_rq;
        unsigned char                        sched_delayed;
        unsigned char                        rel_deadline;
        unsigned char                        custom_slice;
                                        /* hole */

        u64                                exec_start;
        u64                                sum_exec_runtime;
        u64                                prev_sum_exec_runtime;
        u64                                vruntime;
        s64                                vlag;
        u64                                slice;

        u64                                nr_migrations;

#ifdef CONFIG_FAIR_GROUP_SCHED
        int                                depth;
        struct sched_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct cfs_rq                        *cfs_rq;
        /* rq "owned" by this entity/group: */
        struct cfs_rq                        *my_q;
        /* cached value of my_q->h_nr_running */
        unsigned long                        runnable_weight;
#endif

#ifdef CONFIG_SMP
        /*
         * Per entity load average tracking.
         *
         * Put into separate cache line so it does not
         * collide with read-mostly values above.
         */
        struct sched_avg                avg;
#endif
};

/*
 * RT scheduling entity; struct task_struct embeds one as its 'rt' member.
 *
 * The group scheduling members exist only with CONFIG_RT_GROUP_SCHED. The
 * struct is marked __randomize_layout, so code must not rely on the member
 * order.
 */
struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                        timeout;
        unsigned long                        watchdog_stamp;
        unsigned int                        time_slice;
        unsigned short                        on_rq;
        unsigned short                        on_list;

        struct sched_rt_entity                *back;
#ifdef CONFIG_RT_GROUP_SCHED
        struct sched_rt_entity                *parent;
        /* rq on which this entity is (to be) queued: */
        struct rt_rq                        *rt_rq;
        /* rq "owned" by this entity/group: */
        struct rt_rq                        *my_q;
#endif
} __randomize_layout;

/*
 * Callback types stored in struct sched_dl_entity (server_has_tasks and
 * server_pick_task). Both receive the sched_dl_entity; the first returns a
 * bool, the second a task_struct pointer.
 */
typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *);
typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *);

/*
 * Deadline scheduling entity. struct task_struct embeds one as its 'dl'
 * member and also holds a pointer to one as 'dl_server'.
 */
struct sched_dl_entity {
        struct rb_node                        rb_node;

        /*
         * Original scheduling parameters. Copied here from sched_attr
         * during sched_setattr(), they will remain the same until
         * the next sched_setattr().
         */
        u64                                dl_runtime;        /* Maximum runtime for each instance        */
        u64                                dl_deadline;        /* Relative deadline of each instance        */
        u64                                dl_period;        /* Separation of two instances (period) */
        u64                                dl_bw;                /* dl_runtime / dl_period                */
        u64                                dl_density;        /* dl_runtime / dl_deadline                */

        /*
         * Actual scheduling parameters. Initialized with the values above,
         * they are continuously updated during task execution. Note that
         * the remaining runtime could be < 0 in case we are in overrun.
         */
        s64                                runtime;        /* Remaining runtime for this instance        */
        u64                                deadline;        /* Absolute deadline for this instance        */
        unsigned int                        flags;                /* Specifying the scheduler behaviour        */

        /*
         * Some bool flags:
         *
         * @dl_throttled tells if we exhausted the runtime. If so, the
         * task has to wait for a replenishment to be performed at the
         * next firing of dl_timer.
         *
         * @dl_yielded tells if the task gave up the CPU before consuming
         * all its available runtime during the last job.
         *
         * @dl_non_contending tells if the task is inactive while still
         * contributing to the active utilization. In other words, it
         * indicates if the inactive timer has been armed and its handler
         * has not been executed yet. This flag is useful to avoid race
         * conditions between the inactive timer handler and the wakeup
         * code.
         *
         * @dl_overrun tells if the task asked to be informed about runtime
         * overruns.
         *
         * @dl_server tells if this is a server entity.
         *
         * @dl_defer tells if this is a deferred or regular server. For
         * now only the deferred server exists.
         *
         * @dl_defer_armed tells if the deferrable server is waiting
         * for the replenishment timer to activate it.
         *
         * @dl_server_active tells if the dlserver is active (started).
         * The dlserver is started on the first cfs enqueue on an idle
         * runqueue and is stopped when a dequeue results in 0 cfs tasks on
         * the runqueue. In other words, the dlserver is active only when the
         * CPU's runqueue has at least one cfs task.
         *
         * @dl_defer_running tells if the deferrable server is actually
         * running, skipping the defer phase.
         */
        unsigned int                        dl_throttled      : 1;
        unsigned int                        dl_yielded        : 1;
        unsigned int                        dl_non_contending : 1;
        unsigned int                        dl_overrun          : 1;
        unsigned int                        dl_server         : 1;
        unsigned int                        dl_server_active  : 1;
        unsigned int                        dl_defer          : 1;
        unsigned int                        dl_defer_armed          : 1;
        unsigned int                        dl_defer_running  : 1;

        /*
         * Bandwidth enforcement timer. Each -deadline task has its
         * own bandwidth to be enforced, thus we need one timer per task.
         */
        struct hrtimer                        dl_timer;

        /*
         * Inactive timer, responsible for decreasing the active utilization
         * at the "0-lag time". When a -deadline task blocks, it contributes
         * to GRUB's active utilization until the "0-lag time", hence a
         * timer is needed to decrease the active utilization at the correct
         * time.
         */
        struct hrtimer                        inactive_timer;

        /*
         * Bits for DL-server functionality. Also see the comment near
         * dl_server_update().
         *
         * @rq the runqueue this server is for
         *
         * @server_has_tasks() returns true if @server_pick_task() returns
         * a runnable task.
         */
        struct rq                        *rq;
        dl_server_has_tasks_f                server_has_tasks;
        dl_server_pick_f                server_pick_task;

#ifdef CONFIG_RT_MUTEXES
        /*
         * Priority Inheritance. When a DEADLINE scheduling entity is boosted,
         * pi_se points to the donor; otherwise it points to the dl_se it
         * belongs to (the original one/itself).
         */
        struct sched_dl_entity *pi_se;
#endif
};

#ifdef CONFIG_UCLAMP_TASK
/* Number of utilization clamp buckets (shorter alias) */
#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT

/*
 * Utilization clamp for a scheduling entity
 * @value:                clamp value "assigned" to a se
 * @bucket_id:                bucket index corresponding to the "assigned" value
 * @active:                the se is currently refcounted in a rq's bucket
 * @user_defined:        the requested clamp value comes from user-space
 *
 * The bucket_id is the index of the clamp bucket matching the clamp value
 * which is pre-computed and stored to avoid expensive integer divisions from
 * the fast path.
 *
 * The active bit is set whenever a task has got an "effective" value assigned,
 * which can be different from the clamp value "requested" from user-space.
 * This makes it possible to know that a task is refcounted in the rq's bucket
 * corresponding to the "effective" bucket_id.
 *
 * The user_defined bit is set whenever a task has got a task-specific clamp
 * value requested from userspace, i.e. the system defaults apply to this task
 * just as a restriction. This makes it possible to relax default clamps when
 * a less restrictive task-specific value has been requested, thus allowing a
 * "nice" semantic to be implemented. For example, a task running with a 20%
 * default boost can still drop its own boosting to 0%.
 *
 * The bitfield widths of value and bucket_id are derived with bits_per()
 * from SCHED_CAPACITY_SCALE and UCLAMP_BUCKETS respectively.
 */
struct uclamp_se {
        unsigned int value                : bits_per(SCHED_CAPACITY_SCALE);
        unsigned int bucket_id                : bits_per(UCLAMP_BUCKETS);
        unsigned int active                : 1;
        unsigned int user_defined        : 1;
};
#endif /* CONFIG_UCLAMP_TASK */

/*
 * RCU special state of a task: four single-byte flags (b) that share
 * storage with one u32 (s), so all of them can be accessed at once through
 * s.
 */
union rcu_special {
        struct {
                u8                        blocked;
                u8                        need_qs;
                u8                        exp_hint; /* Hint for performance. */
                u8                        need_mb; /* Readers need smp_mb(). */
        } b; /* Bits. */
        u32 s; /* Set of bits. */
};

/* perf task context identifiers. */
enum perf_event_task_context {
        perf_invalid_context        = -1,
        perf_hw_context,                /* 0 */
        perf_sw_context,                /* 1 */
        perf_nr_task_contexts,                /* 2: count of the non-negative entries above */
};

/*
 * Number of contexts in which an event can trigger:
 *      task, softirq, hardirq, NMI.
 */
#define PERF_NR_CONTEXTS        4

/* Node of a singly linked wake queue: holds only the link to the next node. */
struct wake_q_node {
        struct wake_q_node *next;
};

/*
 * Local kmap control data: an index plus an array of KM_MAX_IDX pte values.
 *
 * The structure has no members unless CONFIG_KMAP_LOCAL is enabled.
 */
struct kmap_ctrl {
#ifdef CONFIG_KMAP_LOCAL
        int                                idx;
        pte_t                                pteval[KM_MAX_IDX];
#endif
};

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /*
         * For reasons of header soup (see current_thread_info()), this
         * must be the first element of task_struct.
         */
        struct thread_info                thread_info;
#endif
        unsigned int                        __state;

        /* saved state for "spinlock sleepers" */
        unsigned int                        saved_state;

        /*
         * This begins the randomizable portion of task_struct. Only
         * scheduling-critical items should be added above here.
         */
        randomized_struct_fields_start

        void                                *stack;
        refcount_t                        usage;
        /* Per task flags (PF_*), defined further below: */
        unsigned int                        flags;
        unsigned int                        ptrace;

#ifdef CONFIG_MEM_ALLOC_PROFILING
        struct alloc_tag                *alloc_tag;
#endif

#ifdef CONFIG_SMP
        int                                on_cpu;
        struct __call_single_node        wake_entry;
        unsigned int                        wakee_flips;
        unsigned long                        wakee_flip_decay_ts;
        struct task_struct                *last_wakee;

        /*
         * recent_used_cpu is initially set as the last CPU used by a task
         * that wakes affine another task. Waker/wakee relationships can
         * push tasks around a CPU where each wakeup moves to the next one.
         * Tracking a recently used CPU allows a quick search for a recently
         * used CPU that may be idle.
         */
        int                                recent_used_cpu;
        int                                wake_cpu;
#endif
        int                                on_rq;

        int                                prio;
        int                                static_prio;
        int                                normal_prio;
        unsigned int                        rt_priority;

        struct sched_entity                se;
        struct sched_rt_entity                rt;
        struct sched_dl_entity                dl;
        struct sched_dl_entity                *dl_server;
#ifdef CONFIG_SCHED_CLASS_EXT
        struct sched_ext_entity                scx;
#endif
        const struct sched_class        *sched_class;

#ifdef CONFIG_SCHED_CORE
        struct rb_node                        core_node;
        unsigned long                        core_cookie;
        unsigned int                        core_occupation;
#endif

#ifdef CONFIG_CGROUP_SCHED
        struct task_group                *sched_task_group;
#endif


#ifdef CONFIG_UCLAMP_TASK
        /*
         * Clamp values requested for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp_req[UCLAMP_CNT];
        /*
         * Effective clamp values used for a scheduling entity.
         * Must be updated with task_rq_lock() held.
         */
        struct uclamp_se                uclamp[UCLAMP_CNT];
#endif

        struct sched_statistics         stats;

#ifdef CONFIG_PREEMPT_NOTIFIERS
        /* List of struct preempt_notifier: */
        struct hlist_head                preempt_notifiers;
#endif

#ifdef CONFIG_BLK_DEV_IO_TRACE
        unsigned int                        btrace_seq;
#endif

        unsigned int                        policy;
        unsigned long                        max_allowed_capacity;
        int                                nr_cpus_allowed;
        const cpumask_t                        *cpus_ptr;
        cpumask_t                        *user_cpus_ptr;
        cpumask_t                        cpus_mask;
        void                                *migration_pending;
#ifdef CONFIG_SMP
        unsigned short                        migration_disabled;
#endif
        unsigned short                        migration_flags;

#ifdef CONFIG_PREEMPT_RCU
        int                                rcu_read_lock_nesting;
        union rcu_special                rcu_read_unlock_special;
        struct list_head                rcu_node_entry;
        struct rcu_node                        *rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TASKS_RCU
        unsigned long                        rcu_tasks_nvcsw;
        u8                                rcu_tasks_holdout;
        u8                                rcu_tasks_idx;
        int                                rcu_tasks_idle_cpu;
        struct list_head                rcu_tasks_holdout_list;
        int                                rcu_tasks_exit_cpu;
        struct list_head                rcu_tasks_exit_list;
#endif /* #ifdef CONFIG_TASKS_RCU */

#ifdef CONFIG_TASKS_TRACE_RCU
        int                                trc_reader_nesting;
        int                                trc_ipi_to_cpu;
        union rcu_special                trc_reader_special;
        struct list_head                trc_holdout_list;
        struct list_head                trc_blkd_node;
        int                                trc_blkd_cpu;
#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */

        struct sched_info                sched_info;

        struct list_head                tasks;
#ifdef CONFIG_SMP
        struct plist_node                pushable_tasks;
        struct rb_node                        pushable_dl_tasks;
#endif

        struct mm_struct                *mm;
        struct mm_struct                *active_mm;
        struct address_space                *faults_disabled_mapping;

        int                                exit_state;
        int                                exit_code;
        int                                exit_signal;
        /* The signal sent when the parent dies: */
        int                                pdeath_signal;
        /* JOBCTL_*, siglock protected: */
        unsigned long                        jobctl;

        /* Used for emulating ABI behavior of previous Linux versions: */
        unsigned int                        personality;

        /* Scheduler bits, serialized by scheduler locks: */
        unsigned                        sched_reset_on_fork:1;
        unsigned                        sched_contributes_to_load:1;
        unsigned                        sched_migrated:1;
        unsigned                        sched_task_hot:1;

        /* Force alignment to the next boundary: */
        unsigned                        :0;

        /* Unserialized, strictly 'current' */

        /*
         * This field must not be in the scheduler word above due to wakelist
         * queueing no longer being serialized by p->on_cpu. However:
         *
         * p->XXX = X;                        ttwu()
         * schedule()                          if (p->on_rq && ..) // false
         *   smp_mb__after_spinlock();          if (smp_load_acquire(&p->on_cpu) && //true
         *   deactivate_task()                      ttwu_queue_wakelist())
         *     p->on_rq = 0;                        p->sched_remote_wakeup = Y;
         *
         * guarantees all stores of 'current' are visible before
         * ->sched_remote_wakeup gets used, so it can be in this word.
         */
        unsigned                        sched_remote_wakeup:1;
#ifdef CONFIG_RT_MUTEXES
        unsigned                        sched_rt_mutex:1;
#endif

        /* Bit to tell TOMOYO we're in execve(): */
        unsigned                        in_execve:1;
        unsigned                        in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
        unsigned                        restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG_V1
        unsigned                        in_user_fault:1;
#endif
#ifdef CONFIG_LRU_GEN
        /* whether the LRU algorithm may apply to this access */
        unsigned                        in_lru_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
        unsigned                        brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
        /* disallow userland-initiated cgroup migration */
        unsigned                        no_cgroup_migration:1;
        /* task is frozen/stopped (used by the cgroup freezer) */
        unsigned                        frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
        unsigned                        use_memdelay:1;
#endif
#ifdef CONFIG_PSI
        /* Stalled due to lack of memory */
        unsigned                        in_memstall:1;
#endif
#ifdef CONFIG_PAGE_OWNER
        /* Used by page_owner=on to detect recursion in page tracking. */
        unsigned                        in_page_owner:1;
#endif
#ifdef CONFIG_EVENTFD
        /* Recursion prevention for eventfd_signal() */
        unsigned                        in_eventfd:1;
#endif
#ifdef CONFIG_ARCH_HAS_CPU_PASID
        unsigned                        pasid_activated:1;
#endif
#ifdef CONFIG_X86_BUS_LOCK_DETECT
        unsigned                        reported_split_lock:1;
#endif
#ifdef CONFIG_TASK_DELAY_ACCT
        /* delay due to memory thrashing */
        unsigned                        in_thrashing:1;
#endif
#ifdef CONFIG_PREEMPT_RT
        struct netdev_xmit                net_xmit;
#endif
        unsigned long                        atomic_flags; /* Flags requiring atomic access. */

        struct restart_block                restart_block;

        pid_t                                pid;
        pid_t                                tgid;

#ifdef CONFIG_STACKPROTECTOR
        /* Canary value for the -fstack-protector GCC feature: */
        unsigned long                        stack_canary;
#endif
        /*
         * Pointers to the (original) parent process, youngest child, younger sibling,
         * older sibling, respectively.  (p->father can be replaced with
         * p->real_parent->pid)
         */

        /* Real parent process: */
        struct task_struct __rcu        *real_parent;

        /* Recipient of SIGCHLD, wait4() reports: */
        struct task_struct __rcu        *parent;

        /*
         * Children/sibling form the list of natural children:
         */
        struct list_head                children;
        struct list_head                sibling;
        struct task_struct                *group_leader;

        /*
         * 'ptraced' is the list of tasks this task is using ptrace() on.
         *
         * This includes both natural children and PTRACE_ATTACH targets.
         * 'ptrace_entry' is this task's link on the p->parent->ptraced list.
         */
        struct list_head                ptraced;
        struct list_head                ptrace_entry;

        /* PID/PID hash table linkage. */
        struct pid                        *thread_pid;
        struct hlist_node                pid_links[PIDTYPE_MAX];
        struct list_head                thread_node;

        struct completion                *vfork_done;

        /* CLONE_CHILD_SETTID: */
        int __user                        *set_child_tid;

        /* CLONE_CHILD_CLEARTID: */
        int __user                        *clear_child_tid;

        /* PF_KTHREAD | PF_IO_WORKER */
        void                                *worker_private;

        u64                                utime;
        u64                                stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
        u64                                utimescaled;
        u64                                stimescaled;
#endif
        u64                                gtime;
        struct prev_cputime                prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
        struct vtime                        vtime;
#endif

#ifdef CONFIG_NO_HZ_FULL
        atomic_t                        tick_dep_mask;
#endif
        /* Context switch counts: */
        unsigned long                        nvcsw;
        unsigned long                        nivcsw;

        /* Monotonic time in nsecs: */
        u64                                start_time;

        /* Boot based time in nsecs: */
        u64                                start_boottime;

        /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
        unsigned long                        min_flt;
        unsigned long                        maj_flt;

        /* Empty if CONFIG_POSIX_CPUTIMERS=n */
        struct posix_cputimers                posix_cputimers;

#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
        struct posix_cputimers_work        posix_cputimers_work;
#endif

        /* Process credentials: */

        /* Tracer's credentials at attach: */
        const struct cred __rcu                *ptracer_cred;

        /* Objective and real subjective task credentials (COW): */
        const struct cred __rcu                *real_cred;

        /* Effective (overridable) subjective task credentials (COW): */
        const struct cred __rcu                *cred;

#ifdef CONFIG_KEYS
        /* Cached requested key. */
        struct key                        *cached_requested_key;
#endif

        /*
         * executable name, excluding path.
         *
         * - normally initialized by begin_new_exec()
         * - set it with set_task_comm()
         *   - strscpy_pad() to ensure it is always NUL-terminated and
         *     zero-padded
         *   - task_lock() to ensure the operation is atomic and the name is
         *     fully updated.
         */
        char                                comm[TASK_COMM_LEN];

        struct nameidata                *nameidata;

#ifdef CONFIG_SYSVIPC
        struct sysv_sem                        sysvsem;
        struct sysv_shm                        sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
        unsigned long                        last_switch_count;
        unsigned long                        last_switch_time;
#endif
        /* Filesystem information: */
        struct fs_struct                *fs;

        /* Open file information: */
        struct files_struct                *files;

#ifdef CONFIG_IO_URING
        struct io_uring_task                *io_uring;
#endif

        /* Namespaces: */
        struct nsproxy                        *nsproxy;

        /* Signal handlers: */
        struct signal_struct                *signal;
        struct sighand_struct __rcu                *sighand;
        sigset_t                        blocked;
        sigset_t                        real_blocked;
        /* Restored if set_restore_sigmask() was used: */
        sigset_t                        saved_sigmask;
        struct sigpending                pending;
        unsigned long                        sas_ss_sp;
        size_t                                sas_ss_size;
        unsigned int                        sas_ss_flags;

        struct callback_head                *task_works;

#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
        struct audit_context                *audit_context;
#endif
        kuid_t                                loginuid;
        unsigned int                        sessionid;
#endif
        struct seccomp                        seccomp;
        struct syscall_user_dispatch        syscall_dispatch;

        /* Thread group tracking: */
        u64                                parent_exec_id;
        u64                                self_exec_id;

        /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
        spinlock_t                        alloc_lock;

        /* Protection of the PI data structures: */
        raw_spinlock_t                        pi_lock;

        struct wake_q_node                wake_q;

#ifdef CONFIG_RT_MUTEXES
        /* PI waiters blocked on a rt_mutex held by this task: */
        struct rb_root_cached                pi_waiters;
        /* Updated under owner's pi_lock and rq lock */
        struct task_struct                *pi_top_task;
        /* Deadlock detection and priority inheritance handling: */
        struct rt_mutex_waiter                *pi_blocked_on;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
        /* Mutex deadlock detection: */
        struct mutex_waiter                *blocked_on;
#endif

#ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER
        struct mutex                        *blocker_mutex;
#endif

#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        int                                non_block_count;
#endif

#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                irqtrace;
        unsigned int                        hardirq_threaded;
        u64                                hardirq_chain_key;
        int                                softirqs_enabled;
        int                                softirq_context;
        int                                irq_config;
#endif
#ifdef CONFIG_PREEMPT_RT
        int                                softirq_disable_cnt;
#endif

#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH                        48UL
        u64                                curr_chain_key;
        int                                lockdep_depth;
        unsigned int                        lockdep_recursion;
        struct held_lock                held_locks[MAX_LOCK_DEPTH];
#endif

#if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP)
        unsigned int                        in_ubsan;
#endif

        /* Journalling filesystem info: */
        void                                *journal_info;

        /* Stacked block device info: */
        struct bio_list                        *bio_list;

        /* Stack plugging: */
        struct blk_plug                        *plug;

        /* VM state: */
        struct reclaim_state                *reclaim_state;

        struct io_context                *io_context;

#ifdef CONFIG_COMPACTION
        struct capture_control                *capture_control;
#endif
        /* Ptrace state: */
        unsigned long                        ptrace_message;
        kernel_siginfo_t                *last_siginfo;

        struct task_io_accounting        ioac;
#ifdef CONFIG_PSI
        /* Pressure stall state */
        unsigned int                        psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
        /* Accumulated RSS usage: */
        u64                                acct_rss_mem1;
        /* Accumulated virtual memory usage: */
        u64                                acct_vm_mem1;
        /* stime + utime since last update: */
        u64                                acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
        /* Protected by ->alloc_lock: */
        nodemask_t                        mems_allowed;
        /* Sequence number to catch updates: */
        seqcount_spinlock_t                mems_allowed_seq;
        int                                cpuset_mem_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
        /* Control Group info protected by css_set_lock: */
        struct css_set __rcu                *cgroups;
        /* cg_list protected by css_set_lock and tsk->alloc_lock: */
        struct list_head                cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
        u32                                closid;
        u32                                rmid;
#endif
#ifdef CONFIG_FUTEX
        struct robust_list_head __user        *robust_list;
#ifdef CONFIG_COMPAT
        struct compat_robust_list_head __user *compat_robust_list;
#endif
        struct list_head                pi_state_list;
        struct futex_pi_state                *pi_state_cache;
        struct mutex                        futex_exit_mutex;
        unsigned int                        futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
        u8                                perf_recursion[PERF_NR_CONTEXTS];
        struct perf_event_context        *perf_event_ctxp;
        struct mutex                        perf_event_mutex;
        struct list_head                perf_event_list;
        struct perf_ctx_data __rcu        *perf_ctx_data;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
        unsigned long                        preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
        /* Protected by alloc_lock: */
        struct mempolicy                *mempolicy;
        short                                il_prev;
        u8                                il_weight;
        short                                pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
        int                                numa_scan_seq;
        unsigned int                        numa_scan_period;
        unsigned int                        numa_scan_period_max;
        int                                numa_preferred_nid;
        unsigned long                        numa_migrate_retry;
        /* Migration stamp: */
        u64                                node_stamp;
        u64                                last_task_numa_placement;
        u64                                last_sum_exec_runtime;
        struct callback_head                numa_work;

        /*
         * This pointer is only modified for current in syscall and
         * pagefault context (and for tasks being destroyed), so it can be read
         * from any of the following contexts:
         *  - RCU read-side critical section
         *  - current->numa_group from everywhere
         *  - task's runqueue locked, task not running
         */
        struct numa_group __rcu                *numa_group;

        /*
         * numa_faults is an array split into four regions:
         * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
         * in this precise order.
         *
         * faults_memory: Exponential decaying average of faults on a per-node
         * basis. Scheduling placement decisions are made based on these
         * counts. The values remain static for the duration of a PTE scan.
         * faults_cpu: Track the nodes the process was running on when a NUMA
         * hinting fault was incurred.
         * faults_memory_buffer and faults_cpu_buffer: Record faults per node
         * during the current scan window. When the scan completes, the counts
         * in faults_memory and faults_cpu decay and these values are copied.
         */
        unsigned long                        *numa_faults;
        unsigned long                        total_numa_faults;

        /*
         * numa_faults_locality tracks if faults recorded during the last
         * scan window were remote/local or failed to migrate. The task scan
         * period is adapted based on the locality of the faults with different
         * weights depending on whether they were shared or private faults
         */
        unsigned long                        numa_faults_locality[3];

        unsigned long                        numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_RSEQ
        struct rseq __user *rseq;
        u32 rseq_len;
        u32 rseq_sig;
        /*
         * RmW on rseq_event_mask must be performed atomically
         * with respect to preemption.
         */
        unsigned long rseq_event_mask;
# ifdef CONFIG_DEBUG_RSEQ
        /*
         * This is a place holder to save a copy of the rseq fields for
         * validation of read-only fields. The struct rseq has a
         * variable-length array at the end, so it cannot be used
         * directly. Reserve a size large enough for the known fields.
         */
        char                                rseq_fields[sizeof(struct rseq)];
# endif
#endif

#ifdef CONFIG_SCHED_MM_CID
        int                                mm_cid;                /* Current cid in mm */
        int                                last_mm_cid;        /* Most recent cid in mm */
        int                                migrate_from_cpu;
        int                                mm_cid_active;        /* Whether cid bitmap is active */
        struct callback_head                cid_work;
#endif

        struct tlbflush_unmap_batch        tlb_ubc;

        /* Cache last used pipe for splice(): */
        struct pipe_inode_info                *splice_pipe;

        struct page_frag                task_frag;

#ifdef CONFIG_TASK_DELAY_ACCT
        struct task_delay_info                *delays;
#endif

#ifdef CONFIG_FAULT_INJECTION
        int                                make_it_fail;
        unsigned int                        fail_nth;
#endif
        /*
         * When (nr_dirtied >= nr_dirtied_pause), it's time to call
         * balance_dirty_pages() for a dirty throttling pause:
         */
        int                                nr_dirtied;
        int                                nr_dirtied_pause;
        /* Start of a write-and-pause period: */
        unsigned long                        dirty_paused_when;

#ifdef CONFIG_LATENCYTOP
        int                                latency_record_count;
        struct latency_record                latency_record[LT_SAVECOUNT];
#endif
        /*
         * Time slack values; these are used to round up poll() and
         * select() etc timeout values. These are in nanoseconds.
         */
        u64                                timer_slack_ns;
        u64                                default_timer_slack_ns;

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
        unsigned int                        kasan_depth;
#endif

#ifdef CONFIG_KCSAN
        struct kcsan_ctx                kcsan_ctx;
#ifdef CONFIG_TRACE_IRQFLAGS
        struct irqtrace_events                kcsan_save_irqtrace;
#endif
#ifdef CONFIG_KCSAN_WEAK_MEMORY
        int                                kcsan_stack_depth;
#endif
#endif

#ifdef CONFIG_KMSAN
        struct kmsan_ctx                kmsan_ctx;
#endif

#if IS_ENABLED(CONFIG_KUNIT)
        struct kunit                        *kunit_test;
#endif

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        /* Index of current stored address in ret_stack: */
        int                                curr_ret_stack;
        int                                curr_ret_depth;

        /* Stack of return addresses for return function tracing: */
        unsigned long                        *ret_stack;

        /* Timestamp for last schedule: */
        unsigned long long                ftrace_timestamp;
        unsigned long long                ftrace_sleeptime;

        /*
         * Number of functions that haven't been traced
         * because of depth overrun:
         */
        atomic_t                        trace_overrun;

        /* Pause tracing: */
        atomic_t                        tracing_graph_pause;
#endif

#ifdef CONFIG_TRACING
        /* Bitmask and counter of trace recursion: */
        unsigned long                        trace_recursion;
#endif /* CONFIG_TRACING */

#ifdef CONFIG_KCOV
        /* See kernel/kcov.c for more details. */

        /* Coverage collection mode enabled for this task (0 if disabled): */
        unsigned int                        kcov_mode;

        /* Size of the kcov_area: */
        unsigned int                        kcov_size;

        /* Buffer for coverage collection: */
        void                                *kcov_area;

        /* KCOV descriptor wired with this task or NULL: */
        struct kcov                        *kcov;

        /* KCOV common handle for remote coverage collection: */
        u64                                kcov_handle;

        /* KCOV sequence number: */
        int                                kcov_sequence;

        /* Collect coverage from softirq context: */
        unsigned int                        kcov_softirq;
#endif

#ifdef CONFIG_MEMCG_V1
        struct mem_cgroup                *memcg_in_oom;
#endif

#ifdef CONFIG_MEMCG
        /* Number of pages to reclaim on returning to userland: */
        unsigned int                        memcg_nr_pages_over_high;

        /* Used by memcontrol for targeted memcg charge: */
        struct mem_cgroup                *active_memcg;

        /* Cache for current->cgroups->memcg->objcg lookups: */
        struct obj_cgroup                *objcg;
#endif

#ifdef CONFIG_BLK_CGROUP
        struct gendisk                        *throttle_disk;
#endif

#ifdef CONFIG_UPROBES
        struct uprobe_task                *utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
        unsigned int                        sequential_io;
        unsigned int                        sequential_io_avg;
#endif
        struct kmap_ctrl                kmap_ctrl;
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
        unsigned long                        task_state_change;
# ifdef CONFIG_PREEMPT_RT
        unsigned long                        saved_state_change;
# endif
#endif
        struct rcu_head                        rcu;
        refcount_t                        rcu_users;
        int                                pagefault_disabled;
#ifdef CONFIG_MMU
        struct task_struct                *oom_reaper_list;
        struct timer_list                oom_reaper_timer;
#endif
#ifdef CONFIG_VMAP_STACK
        struct vm_struct                *stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
        /* A live task holds one reference: */
        refcount_t                        stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
        int patch_state;
#endif
#ifdef CONFIG_SECURITY
        /* Used by LSM modules for access restriction: */
        void                                *security;
#endif
#ifdef CONFIG_BPF_SYSCALL
        /* Used by BPF task local storage */
        struct bpf_local_storage __rcu        *bpf_storage;
        /* Used for BPF run context */
        struct bpf_run_ctx                *bpf_ctx;
#endif
        /* Used by BPF for per-TASK xdp storage */
        struct bpf_net_context                *bpf_net_context;

#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
        unsigned long                        lowest_stack;
        unsigned long                        prev_lowest_stack;
#endif

#ifdef CONFIG_X86_MCE
        void __user                        *mce_vaddr;
        __u64                                mce_kflags;
        u64                                mce_addr;
        __u64                                mce_ripv : 1,
                                        mce_whole_page : 1,
                                        __mce_reserved : 62;
        struct callback_head                mce_kill_me;
        int                                mce_count;
#endif

#ifdef CONFIG_KRETPROBES
        struct llist_head               kretprobe_instances;
#endif
#ifdef CONFIG_RETHOOK
        struct llist_head               rethooks;
#endif

#ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH
        /*
         * If L1D flush is supported on mm context switch
         * then we use this callback head to queue kill work
         * to kill tasks that are not running on SMT disabled
         * cores
         */
        struct callback_head                l1d_flush_kill;
#endif

#ifdef CONFIG_RV
        /*
         * Per-task RV monitors. Their number is currently fixed at RV_PER_TASK_MONITORS.
         * If we find justification for more monitors, we can think
         * about adding more or developing a dynamic method. So far,
         * none of these are justified.
         */
        union rv_task_monitor                rv[RV_PER_TASK_MONITORS];
#endif

#ifdef CONFIG_USER_EVENTS
        struct user_event_mm                *user_event_mm;
#endif

        /*
         * New fields for task_struct should be added above here, so that
         * they are included in the randomized portion of task_struct.
         */
        randomized_struct_fields_end

        /* CPU-specific state of this task: */
        struct thread_struct                thread;

        /*
         * WARNING: on x86, 'thread_struct' contains a variable-sized
         * structure.  It *MUST* be at the end of 'task_struct'.
         *
         * Do not put anything below here!
         */
};

/*
 * Extra reporting value one above TASK_REPORT; __task_state_index() uses it
 * as the reported state of a task whose state has all TASK_IDLE bits set.
 */
#define TASK_REPORT_IDLE        (TASK_REPORT + 1)
/* Twice TASK_REPORT_IDLE; build-time checked to be a power of two. */
#define TASK_REPORT_MAX                (TASK_REPORT_IDLE << 1)

/*
 * Map a task's state and exit_state bits to the index used for reporting.
 *
 * The result is fls() of a single reported state value, chosen with the
 * following priority:
 *   1. TASK_RTLOCK_WAIT or TASK_FROZEN set  -> TASK_UNINTERRUPTIBLE
 *   2. all TASK_IDLE bits set               -> TASK_REPORT_IDLE
 *   3. otherwise                            -> (state | exit_state) & TASK_REPORT
 */
static inline unsigned int __task_state_index(unsigned int tsk_state,
                                              unsigned int tsk_exit_state)
{
        unsigned int reported;

        BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);

        if (tsk_state & (TASK_RTLOCK_WAIT | TASK_FROZEN)) {
                /*
                 * This is a lie, but rather than expose a completely new
                 * task state to userspace, make an rtlock waiter look as if
                 * it had gone through a regular rt_mutex_lock() call.
                 * Frozen tasks are reported as uninterruptible as well.
                 */
                reported = TASK_UNINTERRUPTIBLE;
        } else if ((tsk_state & TASK_IDLE) == TASK_IDLE) {
                reported = TASK_REPORT_IDLE;
        } else {
                reported = (tsk_state | tsk_exit_state) & TASK_REPORT;
        }

        return fls(reported);
}

/*
 * Reporting index for @tsk: takes a READ_ONCE() snapshot of tsk->__state and
 * combines it with tsk->exit_state via __task_state_index().
 */
static inline unsigned int task_state_index(struct task_struct *tsk)
{
        unsigned int run_state = READ_ONCE(tsk->__state);

        return __task_state_index(run_state, tsk->exit_state);
}

/*
 * Translate a state index into its one-letter code.
 *
 * @state is used directly as an index into the letter table; no range check
 * is performed here.
 */
static inline char task_index_to_char(unsigned int state)
{
        static const char letters[] = "RSDTtXZPI";

        /* The table length must stay in step with TASK_REPORT_MAX. */
        BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(letters) - 1));

        return letters[state];
}

/* One-letter state code for @tsk: task_state_index() fed to task_index_to_char(). */
static inline char task_state_to_char(struct task_struct *tsk)
{
        unsigned int idx = task_state_index(tsk);

        return task_index_to_char(idx);
}

extern struct pid *cad_pid;

/*
 * Per process flags
 */
#define PF_VCPU                        0x00000001        /* I'm a virtual CPU */
#define PF_IDLE                        0x00000002        /* I am an IDLE thread */
#define PF_EXITING                0x00000004        /* Getting shut down */
#define PF_POSTCOREDUMP                0x00000008        /* Coredumps should ignore this task */
#define PF_IO_WORKER                0x00000010        /* Task is an IO worker */
#define PF_WQ_WORKER                0x00000020        /* I'm a workqueue worker */
#define PF_FORKNOEXEC                0x00000040        /* Forked but didn't exec */
#define PF_MCE_PROCESS                0x00000080      /* Process policy on mce errors */
#define PF_SUPERPRIV                0x00000100        /* Used super-user privileges */
#define PF_DUMPCORE                0x00000200        /* Dumped core */
#define PF_SIGNALED                0x00000400        /* Killed by a signal */
#define PF_MEMALLOC                0x00000800        /* Allocating memory to free memory. See memalloc_noreclaim_save() */
#define PF_NPROC_EXCEEDED        0x00001000        /* set_user() noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH                0x00002000        /* If unset the fpu must be initialized before use */
#define PF_USER_WORKER                0x00004000        /* Kernel thread cloned from userspace thread */
#define PF_NOFREEZE                0x00008000        /* This thread should not be frozen */
#define PF_KCOMPACTD                0x00010000        /* I am kcompactd */
#define PF_KSWAPD                0x00020000        /* I am kswapd */
#define PF_MEMALLOC_NOFS        0x00040000        /* All allocations inherit GFP_NOFS. See memalloc_nofs_save() */
#define PF_MEMALLOC_NOIO        0x00080000        /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */
#define PF_LOCAL_THROTTLE        0x00100000        /* Throttle writes only against the bdi I write to,
                                                 * I am cleaning dirty pages from some other bdi. */
#define PF_KTHREAD                0x00200000        /* I am a kernel thread */
#define PF_RANDOMIZE                0x00400000        /* Randomize virtual address space */
#define PF__HOLE__00800000        0x00800000
#define PF__HOLE__01000000        0x01000000
#define PF__HOLE__02000000        0x02000000
#define PF_NO_SETAFFINITY        0x04000000        /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY                0x08000000      /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN                0x10000000        /* Allocations constrained to zones which allow long term pinning.
                                                 * See memalloc_pin_save() */
#define PF_BLOCK_TS                0x20000000        /* plug has ts that needs updating */
#define PF__HOLE__40000000        0x40000000
#define PF_SUSPEND_TASK                0x80000000      /* This thread called freeze_processes() and should not be frozen */

/*
 * Only the _current_ task can read/write to tsk->flags, but other
 * tasks can access tsk->flags in readonly mode for example
 * with tsk_used_math (like during threaded core dumping).
 * There is however an exception to this rule during ptrace
 * or during fork: the ptracer task is allowed to write to the
 * child->flags of its traced child (same goes for fork, the parent
 * can write to the child->flags), because we're guaranteed the
 * child is not running and in turn not changing child->flags
 * at the same time the parent does it.
 */
/*
 * PF_USED_MATH helpers.
 *
 * The *_stopped_child_* forms act on an explicit task pointer; the short
 * forms act on 'current'. Note that the conditional and copy variants
 * evaluate their 'child' argument twice.
 */
#define clear_stopped_child_used_math(child)                                \
        do {                                                                \
                (child)->flags &= ~PF_USED_MATH;                        \
        } while (0)
#define set_stopped_child_used_math(child)                                \
        do {                                                                \
                (child)->flags |= PF_USED_MATH;                                \
        } while (0)
#define clear_used_math()                        clear_stopped_child_used_math(current)
#define set_used_math()                                set_stopped_child_used_math(current)

/* Clear PF_USED_MATH, then set it again iff 'condition' is true. */
#define conditional_stopped_child_used_math(condition, child)                \
        do {                                                                \
                (child)->flags &= ~PF_USED_MATH;                        \
                (child)->flags |= (condition) ? PF_USED_MATH : 0;        \
        } while (0)

#define conditional_used_math(condition)        conditional_stopped_child_used_math(condition, current)

/* Make the child's PF_USED_MATH bit equal to that of 'current'. */
#define copy_to_stopped_child_used_math(child)                                \
        do {                                                                \
                (child)->flags &= ~PF_USED_MATH;                        \
                (child)->flags |= current->flags & PF_USED_MATH;        \
        } while (0)

/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p)                        ((p)->flags & PF_USED_MATH)
#define used_math()                                tsk_used_math(current)

/*
 * Is 'current' a per-CPU thread?
 *
 * With CONFIG_SMP this requires PF_NO_SETAFFINITY to be set and exactly one
 * allowed CPU; without CONFIG_SMP the answer is always true.
 */
static __always_inline bool is_percpu_thread(void)
{
#ifndef CONFIG_SMP
        return true;
#else
        if (!(current->flags & PF_NO_SETAFFINITY))
                return false;

        return current->nr_cpus_allowed == 1;
#endif
}

/* Per-process atomic flags. */
/*
 * These are bit numbers (not masks): the TASK_PFA_* generated helpers pass
 * them to test_bit()/set_bit()/clear_bit() on task_struct::atomic_flags.
 */
#define PFA_NO_NEW_PRIVS                0        /* May not gain new privileges. */
#define PFA_SPREAD_PAGE                        1        /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB                        2        /* Spread some slab caches over cpuset */
#define PFA_SPEC_SSB_DISABLE                3        /* Speculative Store Bypass disabled */
#define PFA_SPEC_SSB_FORCE_DISABLE        4        /* Speculative Store Bypass force disabled */
#define PFA_SPEC_IB_DISABLE                5        /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE        6        /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC                7        /* Speculative Store Bypass clear on execve() */

/*
 * Accessor generators for the PFA_* bits in task_struct::atomic_flags.
 *
 *   TASK_PFA_TEST(NAME, fn)  defines  bool task_fn(p)        - test_bit()
 *   TASK_PFA_SET(NAME, fn)   defines  void task_set_fn(p)    - set_bit()
 *   TASK_PFA_CLEAR(NAME, fn) defines  void task_clear_fn(p)  - clear_bit()
 *
 * NAME is pasted onto PFA_ to pick the bit number.
 */
#define TASK_PFA_TEST(name, func)                                        \
        static inline bool task_##func(struct task_struct *p)                \
        {                                                                \
                return test_bit(PFA_##name, &p->atomic_flags);                \
        }

#define TASK_PFA_SET(name, func)                                        \
        static inline void task_set_##func(struct task_struct *p)        \
        {                                                                \
                set_bit(PFA_##name, &p->atomic_flags);                        \
        }

#define TASK_PFA_CLEAR(name, func)                                        \
        static inline void task_clear_##func(struct task_struct *p)        \
        {                                                                \
                clear_bit(PFA_##name, &p->atomic_flags);                \
        }

/* no_new_privs: test and set helpers only; no clear helper is generated. */
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)

/* spread_page: test, set and clear helpers. */
TASK_PFA_TEST(SPREAD_PAGE, spread_page)
TASK_PFA_SET(SPREAD_PAGE, spread_page)
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)

/* spread_slab: test, set and clear helpers. */
TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)

/* spec_ssb_disable: test, set and clear helpers. */
TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable)
TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable)

/* spec_ssb_noexec: test, set and clear helpers. */
TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec)
TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec)

/* spec_ssb_force_disable: test and set helpers only; no clear helper is generated. */
TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)
TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable)

/* spec_ib_disable: test, set and clear helpers. */
TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)

/* spec_ib_force_disable: test and set helpers only; no clear helper is generated. */
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)

/*
 * Restore selected bits of current->flags.
 *
 * @orig_flags: previously saved flags value
 * @flags:      mask of the bits to restore
 *
 * Bits in @flags are cleared in current->flags and then set again from
 * @orig_flags; bits outside @flags are left untouched.
 */
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
        unsigned long saved_bits = orig_flags & flags;

        current->flags &= ~flags;
        current->flags |= saved_bits;
}

extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
extern int task_can_attach(struct task_struct *p);
extern int dl_bw_alloc(int cpu, u64 dl_bw);
extern void dl_bw_free(int cpu, u64 dl_bw);
#ifdef CONFIG_SMP

/* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);

/**
 * set_cpus_allowed_ptr - set CPU affinity mask of a task
 * @p: the task
 * @new_mask: CPU affinity mask
 *
 * Return: zero if successful, or a negative error code
 */
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node);
extern void release_user_cpus_ptr(struct task_struct *p);
extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask);
extern void force_compatible_cpus_allowed_ptr(struct task_struct *p);
extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p);
#else
/* !CONFIG_SMP stub: intentionally does nothing. */
static inline void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
}
/*
 * !CONFIG_SMP version: nothing is changed on @p; the mask is only validated.
 * Returns 0 when CPU 0 is set in @new_mask, -EINVAL otherwise.
 */
static inline int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
        /* Opencoded cpumask_test_cpu(0, new_mask) to avoid dependency on cpumask.h */
        return (*cpumask_bits(new_mask) & 1) ? 0 : -EINVAL;
}
/*
 * !CONFIG_SMP version: nothing is copied. Returns -EINVAL if @src has a
 * user_cpus_ptr set, 0 otherwise; @dst and @node are unused.
 */
static inline int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node)
{
        return src->user_cpus_ptr ? -EINVAL : 0;
}
/*
 * !CONFIG_SMP stub: releases nothing, only warns if @p unexpectedly has a
 * user_cpus_ptr set.
 */
static inline void release_user_cpus_ptr(struct task_struct *p)
{
        WARN_ON(p->user_cpus_ptr);
}

/* !CONFIG_SMP stub: performs no check and always returns 0. */
static inline int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
        return 0;
}
#endif

extern int yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);

/**
 * task_nice - nice value of a task
 * @p: the task to query.
 *
 * Converts @p's static priority with PRIO_TO_NICE().
 *
 * Return: The nice value [ -20 ... 0 ... 19 ].
 */
static inline int task_nice(const struct task_struct *p)
{
        int prio = p->static_prio;

        return PRIO_TO_NICE(prio);
}

extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int available_idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
extern void sched_set_fifo(struct task_struct *p);
extern void sched_set_fifo_low(struct task_struct *p);
extern void sched_set_normal(struct task_struct *p, int nice);
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);

/**
 * is_idle_task - check whether a task is an idle task
 * @p: the task to check.
 *
 * A task counts as idle when PF_IDLE is set in its flags.
 *
 * Return: true if @p is an idle task, false otherwise.
 */
static __always_inline bool is_idle_task(const struct task_struct *p)
{
        return (p->flags & PF_IDLE) != 0;
}

extern struct task_struct *curr_task(int cpu);
extern void ia64_set_curr_task(int cpu, struct task_struct *p);

void yield(void);

/*
 * Overlay of a task_struct, (optionally) a thread_info and a stack area of
 * THREAD_SIZE bytes; all members share the same starting address.
 */
union thread_union {
        struct task_struct task;
#ifndef CONFIG_THREAD_INFO_IN_TASK
        /* Only a separate member when thread_info is not kept in the task. */
        struct thread_info thread_info;
#endif
        /* THREAD_SIZE bytes, expressed as an array of longs. */
        unsigned long stack[THREAD_SIZE/sizeof(long)];
};

#ifndef CONFIG_THREAD_INFO_IN_TASK
extern struct thread_info init_thread_info;
#endif

extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)];

/*
 * task_thread_info(task) - pointer to the thread_info of @task.
 *
 * With CONFIG_THREAD_INFO_IN_TASK this is the address of the task's
 * thread_info member; otherwise task->stack is reinterpreted as a
 * struct thread_info pointer.
 */
#ifdef CONFIG_THREAD_INFO_IN_TASK
# define task_thread_info(task)        (&(task)->thread_info)
#else
# define task_thread_info(task)        ((struct thread_info *)(task)->stack)
#endif

/*
 * find a task by one of its numerical ids
 *
 * find_task_by_pid_ns():
 *      finds a task by its pid in the specified namespace
 * find_task_by_vpid():
 *      finds a task by its virtual pid
 *
 * see also find_vpid() etc in include/linux/pid.h
 */

extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns);

/*
 * find a task by its virtual pid and get the task struct
 */
extern struct task_struct *find_get_task_by_vpid(pid_t nr);

extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);

/*
 * kick_process() has a real implementation only on SMP builds; without
 * CONFIG_SMP it is an empty inline stub.
 */
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif

extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
/*
 * set_task_comm - call __set_task_comm() with @exec == false.
 *
 * The BUILD_BUG_ON() rejects, at compile time, any @from whose sizeof()
 * is not exactly TASK_COMM_LEN.
 */
#define set_task_comm(tsk, from) ({                        \
        BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN);        \
        __set_task_comm(tsk, from, false);                \
})

/*
 * get_task_comm - copy @tsk's comm into @buf with strscpy_pad(); the
 * expression evaluates to @buf.
 *
 * - Why not use task_lock()?
 *   User space can randomly change their names anyway, so locking for readers
 *   doesn't make sense. For writers, locking is probably necessary, as a race
 *   condition could lead to long-term mixed results.
 *   The strscpy_pad() in __set_task_comm() can ensure that the task comm is
 *   always NUL-terminated and zero-padded. Therefore the race condition between
 *   reader and writer is not an issue.
 *
 * - BUILD_BUG_ON() can help prevent the buf from being truncated: a @buf
 *   smaller than TASK_COMM_LEN fails the build.
 *   Since the callers don't perform any return value checks, this safeguard is
 *   necessary.
 */
#define get_task_comm(buf, tsk) ({                        \
        BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN);        \
        strscpy_pad(buf, (tsk)->comm);                        \
        buf;                                                \
})

/*
 * scheduler_ipi - on SMP, fold the need-resched flag into the preempt
 * count via preempt_fold_need_resched(); without CONFIG_SMP it is an
 * empty stub.
 */
#ifdef CONFIG_SMP
static __always_inline void scheduler_ipi(void)
{
        /*
         * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
         * TIF_NEED_RESCHED remotely (for the first time) will also send
         * this IPI.
         */
        preempt_fold_need_resched();
}
#else
static inline void scheduler_ipi(void) { }
#endif

extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);

/*
 * Set thread flags in other task's structures.
 * See asm/thread_info.h for TIF_xxxx flags available:
 */
/* Set thread flag @flag in the thread_info of @tsk. */
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        set_ti_thread_flag(ti, flag);
}

/* Clear thread flag @flag in the thread_info of @tsk. */
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        clear_ti_thread_flag(ti, flag);
}

/* Forward @flag and @value to update_ti_thread_flag() for @tsk's thread_info. */
static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag,
                                          bool value)
{
        struct thread_info *ti = task_thread_info(tsk);

        update_ti_thread_flag(ti, flag, value);
}

/*
 * Run test_and_set_ti_thread_flag() on @tsk's thread_info for @flag and
 * hand back its result.
 */
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_and_set_ti_thread_flag(ti, flag);
}

/*
 * Run test_and_clear_ti_thread_flag() on @tsk's thread_info for @flag and
 * hand back its result.
 */
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_and_clear_ti_thread_flag(ti, flag);
}

/* Report the result of test_ti_thread_flag() for @flag on @tsk's thread_info. */
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
        struct thread_info *ti = task_thread_info(tsk);

        return test_ti_thread_flag(ti, flag);
}

/* Set TIF_NEED_RESCHED in the thread_info of @tsk. */
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
        set_ti_thread_flag(task_thread_info(tsk), TIF_NEED_RESCHED);
}

static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
        atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
                           (atomic_long_t *)&task_thread_info(tsk)->flags);
}

/*
 * Test TIF_NEED_RESCHED on @tsk; the result is annotated as unlikely to
 * be set.
 */
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
        int resched = test_tsk_thread_flag(tsk, TIF_NEED_RESCHED);

        return unlikely(resched);
}

/*
 * cond_resched() and cond_resched_lock(): latency reduction via
 * explicit rescheduling in places that are safe. The return
 * value indicates whether a reschedule was done in fact.
 * cond_resched_lock() will drop the spinlock before scheduling.
 *
 * _cond_resched() is defined exactly once below, selected by the
 * preemption configuration.
 */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);

#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)

void sched_dynamic_klp_enable(void);
void sched_dynamic_klp_disable(void);

DECLARE_STATIC_CALL(cond_resched, __cond_resched);

/* PREEMPT_DYNAMIC with static calls: dispatch through the cond_resched static call. */
static __always_inline int _cond_resched(void)
{
        return static_call_mod(cond_resched)();
}

#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)

extern int dynamic_cond_resched(void);

/* PREEMPT_DYNAMIC with static keys: defer to dynamic_cond_resched(). */
static __always_inline int _cond_resched(void)
{
        return dynamic_cond_resched();
}

#else /* !CONFIG_PREEMPTION */

/* No preemption: call klp_sched_try_switch(), then return __cond_resched()'s result. */
static inline int _cond_resched(void)
{
        klp_sched_try_switch();
        return __cond_resched();
}

#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */

#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */

/* Preemptible kernel without PREEMPT_DYNAMIC: call klp_sched_try_switch() and always return 0. */
static inline int _cond_resched(void)
{
        klp_sched_try_switch();
        return 0;
}

#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */

/*
 * cond_resched - run the __might_resched() check for the call site with
 * offsets 0, then evaluate to the result of _cond_resched().
 */
#define cond_resched() ({                        \
        __might_resched(__FILE__, __LINE__, 0);        \
        _cond_resched();                        \
})

extern int __cond_resched_lock(spinlock_t *lock);
extern int __cond_resched_rwlock_read(rwlock_t *lock);
extern int __cond_resched_rwlock_write(rwlock_t *lock);

/*
 * Layout of the offsets value handed to __might_resched() by the
 * cond_resched*lock() macros: the low MIGHT_RESCHED_RCU_SHIFT bits are
 * covered by MIGHT_RESCHED_PREEMPT_MASK, and the RT variant below adds
 * 1 << MIGHT_RESCHED_RCU_SHIFT on top of PREEMPT_LOCK_OFFSET.
 * NOTE(review): presumably the low bits are the expected preempt count and
 * the bits above the expected RCU depth - confirm against __might_resched().
 */
#define MIGHT_RESCHED_RCU_SHIFT                8
#define MIGHT_RESCHED_PREEMPT_MASK        ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1)

#ifndef CONFIG_PREEMPT_RT
/*
 * Non RT kernels have an elevated preempt count due to the held lock,
 * but are not allowed to be inside a RCU read side critical section
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS        PREEMPT_LOCK_OFFSET
#else
/*
 * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in
 * cond_resched*lock() has to take that into account because it checks for
 * preempt_count() and rcu_preempt_depth().
 */
# define PREEMPT_LOCK_RESCHED_OFFSETS        \
        (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT))
#endif

/*
 * cond_resched_lock - run the __might_resched() check with
 * PREEMPT_LOCK_RESCHED_OFFSETS, then evaluate to the result of
 * __cond_resched_lock(@lock).
 */
#define cond_resched_lock(lock) ({                                                \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_lock(lock);                                                \
})

/*
 * cond_resched_rwlock_read - run the __might_resched() check with
 * PREEMPT_LOCK_RESCHED_OFFSETS, then evaluate to the result of
 * __cond_resched_rwlock_read(@lock).
 */
#define cond_resched_rwlock_read(lock) ({                                        \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_rwlock_read(lock);                                        \
})

/*
 * cond_resched_rwlock_write - run the __might_resched() check with
 * PREEMPT_LOCK_RESCHED_OFFSETS, then evaluate to the result of
 * __cond_resched_rwlock_write(@lock).
 */
#define cond_resched_rwlock_write(lock) ({                                        \
        __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS);        \
        __cond_resched_rwlock_write(lock);                                        \
})

/*
 * need_resched - report tif_need_resched(), annotated as unlikely to be
 * true.
 */
static __always_inline bool need_resched(void)
{
        bool pending = tif_need_resched();

        return unlikely(pending);
}

/*
 * Wrappers for p->thread_info->cpu access. No-op on UP.
 */
#ifdef CONFIG_SMP

/* SMP: read the cpu field of @p's thread_info with READ_ONCE(). */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return READ_ONCE(task_thread_info(p)->cpu);
}

extern void set_task_cpu(struct task_struct *p, unsigned int cpu);

#else

/* UP: always reports CPU 0. */
static inline unsigned int task_cpu(const struct task_struct *p)
{
        return 0;
}

/* UP: intentionally empty. */
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}

#endif /* CONFIG_SMP */

static inline bool task_is_runnable(struct task_struct *p)
{
        return p->on_rq && !p->se.sched_delayed;
}

extern bool sched_task_on_rq(struct task_struct *p);
extern unsigned long get_wchan(struct task_struct *p);
extern struct task_struct *cpu_curr_snapshot(int cpu);

#include <linux/spinlock.h>

/*
 * In order to reduce various lock holder preemption latencies provide an
 * interface to see if a vCPU is currently running or not.
 *
 * This allows us to terminate optimistic spin loops and block, analogous to
 * the native optimistic spin heuristic of testing if the lock owner task is
 * running or not.
 */
/*
 * Fallback used when nothing else has defined vcpu_is_preempted: never
 * reports a CPU as preempted.
 */
#ifndef vcpu_is_preempted
static inline bool vcpu_is_preempted(int cpu)
{
        return false;
}
#endif

extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);

/* Default TASK_SIZE_OF(): ignore @tsk and use TASK_SIZE unless already defined. */
#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk)        TASK_SIZE
#endif

#ifdef CONFIG_SMP
static inline bool owner_on_cpu(struct task_struct *owner)
{
        /*
         * As lock holder preemption issue, we both skip spinning if
         * task is not on cpu or its cpu is preempted
         */
        return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner));
}

/* Returns effective CPU energy utilization, as seen by the scheduler */
unsigned long sched_cpu_util(int cpu);
#endif /* CONFIG_SMP */

/*
 * Core-scheduling hooks. Without CONFIG_SCHED_CORE the free/fork hooks are
 * empty stubs and sched_core_idle_cpu() simply returns idle_cpu(@cpu).
 */
#ifdef CONFIG_SCHED_CORE
extern void sched_core_free(struct task_struct *tsk);
extern void sched_core_fork(struct task_struct *p);
extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
                                unsigned long uaddr);
extern int sched_core_idle_cpu(int cpu);
#else
static inline void sched_core_free(struct task_struct *tsk) { }
static inline void sched_core_fork(struct task_struct *p) { }
static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
#endif

extern void sched_set_stop_task(int cpu, struct task_struct *stop);

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * alloc_tag_save - install @tag as current->alloc_tag.
 *
 * Return: the tag that was installed in current->alloc_tag before the call.
 */
static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
{
        struct alloc_tag *prev = current->alloc_tag;

        current->alloc_tag = tag;
        return prev;
}

/*
 * alloc_tag_restore - set current->alloc_tag back to @old.
 *
 * With CONFIG_MEM_ALLOC_PROFILING_DEBUG a warning is emitted first if
 * current->alloc_tag is no longer @tag.
 */
static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
{
#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
        WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
#endif
        current->alloc_tag = old;
}
#else
/*
 * Without CONFIG_MEM_ALLOC_PROFILING: alloc_tag_save() evaluates to NULL
 * and alloc_tag_restore() does nothing.
 */
#define alloc_tag_save(_tag)                        NULL
#define alloc_tag_restore(_tag, _old)                do {} while (0)
#endif

#endif



































    4 









    4 


    4 
    4 


    4 





    4 
    4 














    4 



    4 
    4 
    4 

    4 




























    4 

    4 






































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * lib/plist.c
 *
 * Descending-priority-sorted double-linked list
 *
 * (C) 2002-2003 Intel Corp
 * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
 *
 * 2001-2005 (c) MontaVista Software, Inc.
 * Daniel Walker <dwalker@mvista.com>
 *
 * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
 *
 * Simplifications of the original code by
 * Oleg Nesterov <oleg@tv-sign.ru>
 *
 * Based on simple lists (include/linux/list.h).
 *
 * This file contains the add / del functions which are considered to
 * be too large to inline. See include/linux/plist.h for further
 * information.
 */

#include <linux/bug.h>
#include <linux/plist.h>

#ifdef CONFIG_DEBUG_PLIST

static struct plist_head test_head;

static void plist_check_prev_next(struct list_head *t, struct list_head *p,
                                  struct list_head *n)
{
        WARN(n->prev != p || p->next != n,
                        "top: %p, n: %p, p: %p\n"
                        "prev: %p, n: %p, p: %p\n"
                        "next: %p, n: %p, p: %p\n",
                         t, t->next, t->prev,
                        p, p->next, p->prev,
                        n, n->next, n->prev);
}

/*
 * Walk the circular list that starts at @top and check the prev/next
 * linkage of every adjacent pair, including the pair that closes the
 * circle back to @top.
 */
static void plist_check_list(struct list_head *top)
{
        struct list_head *prev = top, *next = top->next;

        plist_check_prev_next(top, prev, next);
        while (next != top) {
                /*
                 * NOTE(review): the cursor updates go through WRITE_ONCE()
                 * although prev/next are locals; the reason is not visible
                 * here, so keep them unless confirmed unnecessary.
                 */
                WRITE_ONCE(prev, next);
                WRITE_ONCE(next, prev->next);
                plist_check_prev_next(top, prev, next);
        }
}

static void plist_check_head(struct plist_head *head)
{
        if (!plist_head_empty(head))
                plist_check_list(&plist_first(head)->prio_list);
        plist_check_list(&head->node_list);
}

#else
/* Without CONFIG_DEBUG_PLIST the consistency check expands to nothing. */
# define plist_check_head(h)        do { } while (0)
#endif

/**
 * plist_add - add @node to @head
 *
 * @node:        &struct plist_node pointer
 * @head:        &struct plist_head pointer
 *
 * @node is linked into the node_list in front of the first entry whose prio
 * is greater than @node->prio, or at the tail if there is none. It is also
 * linked into the prio_list unless an entry with the same prio was found.
 * Warns if @node is already linked on either list.
 */
void plist_add(struct plist_node *node, struct plist_head *head)
{
        struct plist_node *first, *iter, *prev = NULL, *last, *reverse_iter;
        /* default insertion point: the list head, i.e. append at the tail */
        struct list_head *node_next = &head->node_list;

        plist_check_head(head);
        WARN_ON(!plist_node_empty(node));
        WARN_ON(!list_empty(&node->prio_list));

        if (plist_head_empty(head))
                goto ins_node;

        /*
         * Scan the prio_list from both ends at once: iter moves forward from
         * the first entry, reverse_iter moves backward from the last one.
         */
        first = iter = plist_first(head);
        last = reverse_iter = list_entry(first->prio_list.prev, struct plist_node, prio_list);

        do {
                if (node->prio < iter->prio) {
                        /* iter is the first entry with a greater prio */
                        node_next = &iter->node_list;
                        break;
                } else if (node->prio >= reverse_iter->prio) {
                        /*
                         * reverse_iter is the closest prio_list entry that
                         * is not greater than @node; insert in front of its
                         * prio_list successor.
                         */
                        prev = reverse_iter;
                        iter = list_entry(reverse_iter->prio_list.next,
                                struct plist_node, prio_list);
                        /*
                         * If reverse_iter is the last entry there is nothing
                         * greater: keep node_next at the head (append).
                         */
                        if (likely(reverse_iter != last))
                                node_next = &iter->node_list;
                        break;
                }

                prev = iter;
                iter = list_entry(iter->prio_list.next,
                                struct plist_node, prio_list);
                reverse_iter = list_entry(reverse_iter->prio_list.prev,
                                struct plist_node, prio_list);
        } while (iter != first);

        /* only the first node of a given prio goes onto the prio_list */
        if (!prev || prev->prio != node->prio)
                list_add_tail(&node->prio_list, &iter->prio_list);
ins_node:
        list_add_tail(&node->node_list, node_next);

        plist_check_head(head);
}

/**
 * plist_del - Remove a @node from plist.
 *
 * @node:        &struct plist_node pointer - entry to be removed
 * @head:        &struct plist_head pointer - list head
 *
 * If @node sits on the prio_list, the entry following it on the node_list
 * takes over its prio_list position, provided that entry is not on the
 * prio_list already.
 */
void plist_del(struct plist_node *node, struct plist_head *head)
{
        struct list_head *succ;
        struct plist_node *heir;

        plist_check_head(head);

        /* nodes that are not on the prio_list only need the node_list unlink */
        if (list_empty(&node->prio_list))
                goto unlink;

        succ = node->node_list.next;
        if (succ != &head->node_list) {
                heir = list_entry(succ, struct plist_node, node_list);

                /* add the next plist_node into prio_list */
                if (list_empty(&heir->prio_list))
                        list_add(&heir->prio_list, &node->prio_list);
        }
        list_del_init(&node->prio_list);

unlink:
        list_del_init(&node->node_list);

        plist_check_head(head);
}

/**
 * plist_requeue - Requeue @node at end of same-prio entries.
 *
 * This is essentially an optimized plist_del() followed by
 * plist_add().  It moves an entry already in the plist to
 * after any other same-priority entries.
 *
 * @node:        &struct plist_node pointer - entry to be moved
 * @head:        &struct plist_head pointer - list head
 *
 * BUGs if @head is empty or @node is not queued.
 */
void plist_requeue(struct plist_node *node, struct plist_head *head)
{
        struct plist_node *iter;
        /* default insertion point: the list head, i.e. append at the tail */
        struct list_head *node_next = &head->node_list;

        plist_check_head(head);
        BUG_ON(plist_head_empty(head));
        BUG_ON(plist_node_empty(node));

        /* already the very last entry: nothing to move */
        if (node == plist_last(head))
                return;

        iter = plist_next(node);

        /* the following entry has a different prio: nothing to move */
        if (node->prio != iter->prio)
                return;

        plist_del(node, head);

        /*
         * After plist_del(), iter is the replacement of the node.  If the node
         * was on prio_list, take shortcut to find node_next instead of looping.
         */
        if (!list_empty(&iter->prio_list)) {
                iter = list_entry(iter->prio_list.next, struct plist_node,
                                  prio_list);
                node_next = &iter->node_list;
                goto queue;
        }

        /* otherwise scan forward for the first entry with a different prio */
        plist_for_each_continue(iter, head) {
                if (node->prio != iter->prio) {
                        node_next = &iter->node_list;
                        break;
                }
        }
queue:
        list_add_tail(&node->node_list, node_next);

        plist_check_head(head);
}

#ifdef CONFIG_DEBUG_PLIST
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/module.h>
#include <linux/init.h>

static struct plist_node __initdata test_node[241];

/*
 * plist_test_check - validate test_head against the expected node count.
 *
 * Walks the node_list and BUGs if the number of nodes differs from
 * @nr_expect, if a node sharing its prio with the current prio_list entry
 * is itself on the prio_list, if the prio decreases along the list, or if
 * the prio_list does not chain the first node of each prio in order and
 * wrap back to the first node.
 */
static void __init plist_test_check(int nr_expect)
{
        struct plist_node *first, *prio_pos, *node_pos;

        if (plist_head_empty(&test_head)) {
                BUG_ON(nr_expect != 0);
                return;
        }

        /* prio_pos tracks the prio_list entry of the prio currently walked */
        prio_pos = first = plist_first(&test_head);
        plist_for_each(node_pos, &test_head) {
                /* more nodes than expected: stop, the check below fires */
                if (nr_expect-- < 0)
                        break;
                if (node_pos == first)
                        continue;
                if (node_pos->prio == prio_pos->prio) {
                        BUG_ON(!list_empty(&node_pos->prio_list));
                        continue;
                }

                BUG_ON(prio_pos->prio > node_pos->prio);
                BUG_ON(prio_pos->prio_list.next != &node_pos->prio_list);
                prio_pos = node_pos;
        }

        BUG_ON(nr_expect != 0);
        BUG_ON(prio_pos->prio_list.next != &first->prio_list);
}

/*
 * Requeue @node on test_head and BUG if, afterwards, it is still followed
 * by an entry of the same prio.
 */
static void __init plist_test_requeue(struct plist_node *node)
{
        plist_requeue(node, &test_head);

        if (node == plist_last(&test_head))
                return;

        BUG_ON(node->prio == plist_next(node)->prio);
}

/*
 * plist_test - init-time self test of the plist implementation.
 *
 * Part one performs 1000 pseudo-random add/del operations on test_node[]
 * (each followed by a requeue when the node is still queued), validates the
 * list with plist_test_check() after every step and finally drains the list.
 *
 * Part two times plist_add() for strictly ascending priorities, so that
 * every new node has the highest prio queued so far, and reports the
 * accumulated time with pr_debug(). The nodes are removed again afterwards
 * so that test_head does not keep pointing into the __initdata test_node[]
 * array once the function returns.
 *
 * Return: always 0.
 */
static int  __init plist_test(void)
{
        int nr_expect = 0, i, loop;
        unsigned int r = local_clock();
        ktime_t start, end, time_elapsed = 0;

        printk(KERN_DEBUG "start plist test\n");
        plist_head_init(&test_head);
        for (i = 0; i < ARRAY_SIZE(test_node); i++)
                plist_node_init(test_node + i, 0);

        for (loop = 0; loop < 1000; loop++) {
                /* pick a pseudo-random node: add it if idle, delete it if queued */
                r = r * 193939 % 47629;
                i = r % ARRAY_SIZE(test_node);
                if (plist_node_empty(test_node + i)) {
                        r = r * 193939 % 47629;
                        test_node[i].prio = r % 99;
                        plist_add(test_node + i, &test_head);
                        nr_expect++;
                } else {
                        plist_del(test_node + i, &test_head);
                        nr_expect--;
                }
                plist_test_check(nr_expect);
                if (!plist_node_empty(test_node + i)) {
                        plist_test_requeue(test_node + i);
                        plist_test_check(nr_expect);
                }
        }

        /* drain whatever the random phase left queued */
        for (i = 0; i < ARRAY_SIZE(test_node); i++) {
                if (plist_node_empty(test_node + i))
                        continue;
                plist_del(test_node + i, &test_head);
                nr_expect--;
                plist_test_check(nr_expect);
        }

        printk(KERN_DEBUG "end plist test\n");

        /* Worst case test for plist_add() */
        plist_head_init(&test_head);

        /* node i gets prio i; no separate on-stack table is needed for that */
        for (i = 0; i < ARRAY_SIZE(test_node); i++)
                plist_node_init(test_node + i, i);

        for (i = 0; i < ARRAY_SIZE(test_node); i++) {
                start = ktime_get();
                plist_add(test_node + i, &test_head);
                end = ktime_get();
                time_elapsed += (end - start);
        }

        pr_debug("plist_add worst case test time elapsed %lld\n", time_elapsed);

        /* unlink the __initdata nodes again; not part of the measurement */
        for (i = 0; i < ARRAY_SIZE(test_node); i++)
                plist_del(test_node + i, &test_head);

        return 0;
}

module_init(plist_test);

#endif









































































































  224 








  224 





   27 





   77 



  164 







   89 









   27 



   73 

   73 
























   27 

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   97 
   97 
   96 

   97 






   97 









   97 










   97 







   60 


   97 








  236 
  147 

  151 






  259 

  259 


  259 


  216 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 2009  Red Hat, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/numa_balancing.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/mm_types.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
#include <linux/pfn_t.h>
#include <linux/mman.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
#include <linux/numa.h>
#include <linux/page_owner.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
#include <linux/pagewalk.h>

#include <asm/tlb.h>
#include <asm/pgalloc.h>
#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/thp.h>

/*
 * By default, transparent hugepage support is disabled in order to avoid
 * risking an increased memory footprint for applications that are not
 * guaranteed to benefit from it. When transparent hugepage support is
 * enabled, it is for all mappings, and khugepaged scans all mappings.
 * Defrag is invoked by khugepaged hugepage allocations and by page faults
 * for all hugepage allocations.
 *
 * This is the initial value of the global THP flag word; each bit is
 * indexed by an enum transparent_hugepage_flag value:
 *  - TRANSPARENT_HUGEPAGE_FLAG is set only when
 *    CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is selected;
 *  - TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG is set only when
 *    CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is selected;
 *  - the DEFRAG_REQ_MADV, DEFRAG_KHUGEPAGED and USE_ZERO_PAGE bits are
 *    always set.
 * The sysfs store handlers in this file change individual bits at run time.
 */
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);

/*
 * Shrinker handle plus forward declarations of its count/scan callbacks.
 * NOTE(review): presumably the deferred-split shrinker; the registration
 * and the callback bodies are not in this part of the file.
 */
static struct shrinker *deferred_split_shrinker;
static unsigned long deferred_split_count(struct shrinker *shrink,
                                          struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
                                         struct shrink_control *sc);
/* Tunable, on by default; read and written via the "shrink_underused" sysfs attribute. */
static bool split_underused_thp = true;

/* Reference count protecting huge_zero_folio; 0 means no folio is installed. */
static atomic_t huge_zero_refcount;
/* The shared huge zero folio, or NULL while none is allocated. */
struct folio *huge_zero_folio __read_mostly;
/* PFN of huge_zero_folio; ~0UL while no huge zero folio exists. */
unsigned long huge_zero_pfn __read_mostly = ~0UL;
/*
 * NOTE(review): these look like per-policy (always/madvise/inherit) order
 * bitmasks for anonymous THP; their users are outside this excerpt, so
 * confirm the exact semantics there.
 */
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
/* Init-time only flag (__initdata); its setter is not visible in this excerpt. */
static bool anon_orders_configured __initdata;

/*
 * Report whether a file-backed VMA qualifies for read-only file THP.
 *
 * Requires CONFIG_READ_ONLY_THP_FOR_FS, a backing file, and a regular-file
 * inode that nobody currently has open for writing.
 */
static inline bool file_thp_enabled(struct vm_area_struct *vma)
{
        struct inode *host;

        /* Feature compiled out, or nothing backing the mapping. */
        if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) || !vma->vm_file)
                return false;

        host = file_inode(vma->vm_file);

        /* A writer on the inode rules the mapping out. */
        if (inode_is_open_for_write(host))
                return false;

        return S_ISREG(host->i_mode);
}

/*
 * __thp_vma_allowable_orders - filter the THP orders usable in a VMA
 * @vma:       the VMA being queried
 * @vm_flags:  VM_* flags to evaluate (passed separately from @vma)
 * @tva_flags: context bits; TVA_SMAPS, TVA_IN_PF and TVA_ENFORCE_SYSFS
 *             are the ones examined here
 * @orders:    bitmask of candidate orders requested by the caller
 *
 * The requested orders are first masked by what the VMA type supports
 * (anonymous, special-huge or anything else). The function then returns 0
 * for VMAs without an mm, when THP is disabled by hardware or for this
 * VMA, and for VM_NO_KHUGEPAGED VMAs outside the fault and smaps paths.
 * DAX VMAs are accepted only from the page-fault path.
 *
 * Outside the fault path the candidate orders are walked, starting from
 * the highest, via next_order() until thp_vma_suitable_order() accepts
 * one; shmem files are then delegated to shmem_allowable_huge_orders().
 * Remaining file-backed VMAs are subject to the sysfs policy when
 * TVA_ENFORCE_SYSFS is set, and are accepted only through a ->huge_fault()
 * handler (fault/smaps) or file_thp_enabled() (non-fault/smaps).
 * Anonymous VMAs are rejected for temporary stacks, and for a missing
 * anon_vma unless called from the fault or smaps path.
 *
 * Return: the bitmask of orders still allowed (0 if none), or, for shmem
 * outside the fault path, whatever shmem_allowable_huge_orders() returns.
 */
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
                                         unsigned long vm_flags,
                                         unsigned long tva_flags,
                                         unsigned long orders)
{
        bool smaps = tva_flags & TVA_SMAPS;
        bool in_pf = tva_flags & TVA_IN_PF;
        bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
        unsigned long supported_orders;

        /* Check the intersection of requested and supported orders. */
        if (vma_is_anonymous(vma))
                supported_orders = THP_ORDERS_ALL_ANON;
        else if (vma_is_special_huge(vma))
                supported_orders = THP_ORDERS_ALL_SPECIAL;
        else
                supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

        orders &= supported_orders;
        if (!orders)
                return 0;

        if (!vma->vm_mm)                /* vdso */
                return 0;

        if (thp_disabled_by_hw() || vma_thp_disabled(vma, vm_flags))
                return 0;

        /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
        if (vma_is_dax(vma))
                return in_pf ? orders : 0;

        /*
         * khugepaged special VMA and hugetlb VMA.
         * Must be checked after dax since some dax mappings may have
         * VM_MIXEDMAP set.
         */
        if (!in_pf && !smaps && (vm_flags & VM_NO_KHUGEPAGED))
                return 0;

        /*
         * Check alignment for file vma and size for both file and anon vma by
         * filtering out the unsuitable orders.
         *
         * Skip the check for page fault. Huge fault does the check in fault
         * handlers.
         */
        if (!in_pf) {
                int order = highest_order(orders);
                unsigned long addr;

                while (orders) {
                        /* Probe the last naturally sized chunk before vm_end. */
                        addr = vma->vm_end - (PAGE_SIZE << order);
                        if (thp_vma_suitable_order(vma, addr, order))
                                break;
                        order = next_order(&orders, order);
                }

                if (!orders)
                        return 0;
        }

        /*
         * Enabled via shmem mount options or sysfs settings.
         * Must be done before hugepage flags check since shmem has its
         * own flags.
         */
        if (!in_pf && shmem_file(vma->vm_file))
                return shmem_allowable_huge_orders(file_inode(vma->vm_file),
                                                   vma, vma->vm_pgoff, 0,
                                                   !enforce_sysfs);

        if (!vma_is_anonymous(vma)) {
                /*
                 * Enforce sysfs THP requirements as necessary. Anonymous vmas
                 * were already handled in thp_vma_allowable_orders().
                 */
                if (enforce_sysfs &&
                    (!hugepage_global_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
                                                    !hugepage_global_always())))
                        return 0;

                /*
                 * Trust that ->huge_fault() handlers know what they are doing
                 * in fault path.
                 */
                if (((in_pf || smaps)) && vma->vm_ops->huge_fault)
                        return orders;
                /* Only regular file is valid in collapse path */
                if (((!in_pf || smaps)) && file_thp_enabled(vma))
                        return orders;
                return 0;
        }

        if (vma_is_temporary_stack(vma))
                return 0;

        /*
         * THPeligible bit of smaps should show 1 for proper VMAs even
         * though anon_vma is not initialized yet.
         *
         * Allow page fault since anon_vma may be not initialized until
         * the first page fault.
         */
        if (!vma->anon_vma)
                return (smaps || in_pf) ? orders : 0;

        return orders;
}

/*
 * Take a reference on the huge zero folio, allocating it on first use.
 *
 * Fast path: bump huge_zero_refcount if it is already non-zero. Otherwise
 * allocate a zeroed PMD-order folio and try to publish it in
 * huge_zero_folio with cmpxchg(); if another caller published one first,
 * drop ours and retry the fast path.
 *
 * Returns true when a reference was obtained, false when the allocation
 * failed (THP_ZERO_PAGE_ALLOC_FAILED is counted in that case).
 */
static bool get_huge_zero_page(void)
{
        struct folio *zero_folio;
retry:
        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
                return true;

        zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
                        HPAGE_PMD_ORDER);
        if (!zero_folio) {
                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
                return false;
        }
        /* Ensure zero folio won't have large_rmappable flag set. */
        folio_clear_large_rmappable(zero_folio);
        preempt_disable();
        /* Publish only if no folio is installed; a non-NULL old value means we lost the race. */
        if (cmpxchg(&huge_zero_folio, NULL, zero_folio)) {
                preempt_enable();
                folio_put(zero_folio);
                goto retry;
        }
        WRITE_ONCE(huge_zero_pfn, folio_pfn(zero_folio));

        /* We take additional reference here. It will be put back by shrinker */
        atomic_set(&huge_zero_refcount, 2);
        preempt_enable();
        count_vm_event(THP_ZERO_PAGE_ALLOC);
        return true;
}

static void put_huge_zero_page(void)
{
        /*
         * Counter should never go to zero here. Only shrinker can put
         * last reference.
         */
        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
}

/*
 * Return the huge zero folio on behalf of @mm.
 *
 * The first successful call for an mm takes one reference via
 * get_huge_zero_page() and records that in MMF_HUGE_ZERO_PAGE; later calls
 * just read the current folio pointer. Returns NULL when the reference
 * could not be obtained.
 */
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
{
        if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) {
                if (!get_huge_zero_page())
                        return NULL;

                /* Someone else set the bit meanwhile: give back the extra ref. */
                if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                        put_huge_zero_page();
        }

        return READ_ONCE(huge_zero_folio);
}

/*
 * Release the huge zero folio reference held on behalf of @mm, if the
 * MMF_HUGE_ZERO_PAGE bit shows that one was taken. The bit itself is left
 * untouched.
 */
void mm_put_huge_zero_folio(struct mm_struct *mm)
{
        if (!test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
                return;

        put_huge_zero_page();
}

/*
 * Shrinker count callback: report HPAGE_PMD_NR reclaimable pages when the
 * huge zero folio is held by exactly one reference, otherwise nothing.
 */
static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
                                        struct shrink_control *sc)
{
        /* Anything other than the single remaining reference means it is in use (or absent). */
        if (atomic_read(&huge_zero_refcount) != 1)
                return 0;

        return HPAGE_PMD_NR;
}

static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
                struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
                BUG_ON(zero_folio == NULL);
                WRITE_ONCE(huge_zero_pfn, ~0UL);
                folio_put(zero_folio);
                return HPAGE_PMD_NR;
        }

        return 0;
}

/*
 * Shrinker handle for the huge zero folio.
 * NOTE(review): presumably bound to shrink_huge_zero_page_count()/_scan();
 * the allocation and registration are not in this part of the file.
 */
static struct shrinker *huge_zero_page_shrinker;

#ifdef CONFIG_SYSFS
/*
 * Read handler for the "enabled" attribute: emit the three modes with the
 * active one in brackets. The "always" bit is tested first, so it wins if
 * both mode bits are set.
 */
static ssize_t enabled_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        unsigned long *flags = &transparent_hugepage_flags;

        if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, flags))
                return sysfs_emit(buf, "%s\n", "[always] madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags))
                return sysfs_emit(buf, "%s\n", "always [madvise] never");

        return sysfs_emit(buf, "%s\n", "always madvise [never]");
}

/*
 * Write handler for the "enabled" attribute.
 *
 * Accepts "always", "madvise" or "never" and updates the two mode bits in
 * transparent_hugepage_flags accordingly, then calls
 * start_stop_khugepaged() to act on the new setting.
 *
 * Returns @count on success, -EINVAL for an unrecognised string, or the
 * error returned by start_stop_khugepaged().
 */
static ssize_t enabled_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
{
        unsigned long *flags = &transparent_hugepage_flags;
        ssize_t written = count;
        int err;

        if (sysfs_streq(buf, "always")) {
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags);
                set_bit(TRANSPARENT_HUGEPAGE_FLAG, flags);
        } else if (sysfs_streq(buf, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, flags);
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags);
        } else if (sysfs_streq(buf, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, flags);
        } else {
                return -EINVAL;
        }

        /* khugepaged is only poked for a positive write length. */
        if (written <= 0)
                return written;

        err = start_stop_khugepaged();
        if (err)
                return err;

        return written;
}

/* Read/write attribute "enabled", served by enabled_show()/enabled_store(). */
static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);

/*
 * Generic show helper for a single bit of transparent_hugepage_flags:
 * emits "1\n" when @flag is set and "0\n" when it is clear.
 */
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag)
{
        int is_set = !!test_bit(flag, &transparent_hugepage_flags);

        return sysfs_emit(buf, "%d\n", is_set);
}

/*
 * Generic store helper for a single bit of transparent_hugepage_flags.
 *
 * Parses @buf as a base-10 unsigned integer; 1 sets @flag and 0 clears it.
 *
 * Returns @count on success, the kstrtoul() error if parsing fails, or
 * -EINVAL for any value greater than 1.
 */
ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                 struct kobj_attribute *attr,
                                 const char *buf, size_t count,
                                 enum transparent_hugepage_flag flag)
{
        unsigned long parsed;
        int err = kstrtoul(buf, 10, &parsed);

        if (err < 0)
                return err;

        switch (parsed) {
        case 0:
                clear_bit(flag, &transparent_hugepage_flags);
                break;
        case 1:
                set_bit(flag, &transparent_hugepage_flags);
                break;
        default:
                return -EINVAL;
        }

        return count;
}

/*
 * Read handler for the "defrag" attribute: emit all five modes with the
 * active one in brackets. Bits are tested in the order direct, kswapd,
 * kswapd-or-madvise, madvise; the first one found set decides the output,
 * and "never" is reported when none is set.
 */
static ssize_t defrag_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        unsigned long *flags = &transparent_hugepage_flags;

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags))
                return sysfs_emit(buf, "%s\n",
                                  "[always] defer defer+madvise madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags))
                return sysfs_emit(buf, "%s\n",
                                  "always [defer] defer+madvise madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags))
                return sysfs_emit(buf, "%s\n",
                                  "always defer [defer+madvise] madvise never");

        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags))
                return sysfs_emit(buf, "%s\n",
                                  "always defer defer+madvise [madvise] never");

        return sysfs_emit(buf, "%s\n",
                          "always defer defer+madvise madvise [never]");
}

/*
 * Write handler for the "defrag" attribute.
 *
 * Accepts "always", "defer+madvise", "defer", "madvise" or "never". For
 * every mode the other defrag mode bits are cleared first and the selected
 * bit (none for "never") is set last.
 *
 * Returns @count on success or -EINVAL for an unrecognised string.
 */
static ssize_t defrag_store(struct kobject *kobj,
                            struct kobj_attribute *attr,
                            const char *buf, size_t count)
{
        unsigned long *flags = &transparent_hugepage_flags;

        if (sysfs_streq(buf, "always")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags);
                return count;
        }

        if (sysfs_streq(buf, "defer+madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags);
                return count;
        }

        if (sysfs_streq(buf, "defer")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags);
                return count;
        }

        if (sysfs_streq(buf, "madvise")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags);
                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags);
                return count;
        }

        if (sysfs_streq(buf, "never")) {
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, flags);
                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, flags);
                return count;
        }

        return -EINVAL;
}
/* Read/write attribute "defrag", served by defrag_show()/defrag_store(). */
static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);

/*
 * sysfs show handler for "use_zero_page": hands the request to
 * single_hugepage_flag_show() for TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG
 * and returns that helper's result unchanged.
 */
static ssize_t use_zero_page_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        ssize_t len;

        len = single_hugepage_flag_show(kobj, attr, buf,
                                        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
        return len;
}
/*
 * sysfs store handler for "use_zero_page": forwards the written buffer to
 * single_hugepage_flag_store() for TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG
 * and returns that helper's result unchanged.
 */
static ssize_t use_zero_page_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        ssize_t consumed;

        consumed = single_hugepage_flag_store(kobj, attr, buf, count,
                                              TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
        return consumed;
}
static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);

/*
 * sysfs show handler for "hpage_pmd_size": prints HPAGE_PMD_SIZE as an
 * unsigned decimal number followed by a newline.
 */
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
{
        const unsigned long pmd_size = HPAGE_PMD_SIZE;

        return sysfs_emit(buf, "%lu\n", pmd_size);
}
static struct kobj_attribute hpage_pmd_size_attr =
        __ATTR_RO(hpage_pmd_size);

/*
 * sysfs show handler: prints the current value of split_underused_thp as a
 * decimal integer followed by a newline.
 */
static ssize_t split_underused_thp_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        const int enabled = split_underused_thp;

        return sysfs_emit(buf, "%d\n", enabled);
}

/*
 * sysfs store handler: parses @buf with kstrtobool() straight into
 * split_underused_thp.
 *
 * Returns @count when parsing succeeds, otherwise the negative error code
 * reported by kstrtobool().
 */
static ssize_t split_underused_thp_store(struct kobject *kobj,
                             struct kobj_attribute *attr,
                             const char *buf, size_t count)
{
        int rc;

        rc = kstrtobool(buf, &split_underused_thp);
        if (rc >= 0)
                return count;

        return rc;
}

/*
 * Attribute named "shrink_underused" (mode 0644) wired to
 * split_underused_thp_show() and split_underused_thp_store(). Note that the
 * sysfs file name differs from the C identifier.
 */
static struct kobj_attribute split_underused_thp_attr = __ATTR(
        shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);

/*
 * NULL-terminated list of the top-level attributes registered by
 * hugepage_init_sysfs() on the "transparent_hugepage" kobject. The
 * shmem_enabled entry is only present when CONFIG_SHMEM is set.
 */
static struct attribute *hugepage_attr[] = {
        &enabled_attr.attr,
        &defrag_attr.attr,
        &use_zero_page_attr.attr,
        &hpage_pmd_size_attr.attr,
#ifdef CONFIG_SHMEM
        &shmem_enabled_attr.attr,
#endif
        &split_underused_thp_attr.attr,
        NULL,
};

/* Unnamed group wrapping hugepage_attr[]. */
static const struct attribute_group hugepage_attr_group = {
        .attrs = hugepage_attr,
};

/* Forward declarations; both functions are defined further down. */
static void hugepage_exit_sysfs(struct kobject *hugepage_kobj);
static void thpsize_release(struct kobject *kobj);
/* Held by anon_enabled_store() while it updates the huge_anon_orders_* bitmaps. */
static DEFINE_SPINLOCK(huge_anon_orders_lock);
/* thpsize objects created by hugepage_init_sysfs(); emptied by hugepage_exit_sysfs(). */
static LIST_HEAD(thpsize_list);

/*
 * sysfs show handler for the per-size "enabled" file.
 *
 * Prints the four possible policies with the active one in brackets. The
 * bitmaps are checked in the order always, inherit, madvise; if the order's
 * bit is set in none of them, "never" is reported.
 */
static ssize_t anon_enabled_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
{
        const int order = to_thpsize(kobj)->order;

        if (test_bit(order, &huge_anon_orders_always))
                return sysfs_emit(buf, "%s\n",
                                  "[always] inherit madvise never");

        if (test_bit(order, &huge_anon_orders_inherit))
                return sysfs_emit(buf, "%s\n",
                                  "always [inherit] madvise never");

        if (test_bit(order, &huge_anon_orders_madvise))
                return sysfs_emit(buf, "%s\n",
                                  "always inherit [madvise] never");

        return sysfs_emit(buf, "%s\n", "always inherit madvise [never]");
}

/*
 * sysfs store handler for the per-size "enabled" file.
 *
 * Accepts "always", "inherit", "madvise" or "never" (compared with
 * sysfs_streq()). Under huge_anon_orders_lock the order's bit is cleared in
 * every policy bitmap other than the selected one and then set in the
 * selected bitmap; "never" clears it in all three.
 *
 * Returns -EINVAL for any other input. Otherwise start_stop_khugepaged() is
 * called (when @count is positive) and its error, if any, is returned
 * instead of @count.
 */
static ssize_t anon_enabled_store(struct kobject *kobj,
                                  struct kobj_attribute *attr,
                                  const char *buf, size_t count)
{
        /* Clearing walks this table in order: always, inherit, madvise. */
        unsigned long *const policies[] = {
                &huge_anon_orders_always,
                &huge_anon_orders_inherit,
                &huge_anon_orders_madvise,
        };
        const int order = to_thpsize(kobj)->order;
        unsigned long *chosen;
        ssize_t ret = count;
        size_t i;
        int err;

        if (sysfs_streq(buf, "always"))
                chosen = &huge_anon_orders_always;
        else if (sysfs_streq(buf, "inherit"))
                chosen = &huge_anon_orders_inherit;
        else if (sysfs_streq(buf, "madvise"))
                chosen = &huge_anon_orders_madvise;
        else if (sysfs_streq(buf, "never"))
                chosen = NULL;
        else
                return -EINVAL;

        spin_lock(&huge_anon_orders_lock);
        for (i = 0; i < sizeof(policies) / sizeof(policies[0]); i++) {
                if (policies[i] != chosen)
                        clear_bit(order, policies[i]);
        }
        if (chosen)
                set_bit(order, chosen);
        spin_unlock(&huge_anon_orders_lock);

        if (ret <= 0)
                return ret;

        err = start_stop_khugepaged();
        if (err)
                return err;

        return ret;
}

/* Per-size "enabled" file (mode 0644) backed by anon_enabled_show()/anon_enabled_store(). */
static struct kobj_attribute anon_enabled_attr =
        __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);

/*
 * Control attributes that thpsize_create() adds only for orders contained
 * in THP_ORDERS_ALL_ANON.
 */
static struct attribute *anon_ctrl_attrs[] = {
        &anon_enabled_attr.attr,
        NULL,
};

/* Unnamed group wrapping anon_ctrl_attrs[]. */
static const struct attribute_group anon_ctrl_attr_grp = {
        .attrs = anon_ctrl_attrs,
};

/*
 * Control attributes that thpsize_create() adds only for orders contained
 * in THP_ORDERS_ALL_FILE_DEFAULT. Without CONFIG_SHMEM the list is empty.
 */
static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
        &thpsize_shmem_enabled_attr.attr,
#endif
        NULL,
};

/* Unnamed group wrapping file_ctrl_attrs[]. */
static const struct attribute_group file_ctrl_attr_grp = {
        .attrs = file_ctrl_attrs,
};

/*
 * Control attributes that thpsize_create() adds for every order. The list
 * is currently empty.
 */
static struct attribute *any_ctrl_attrs[] = {
        NULL,
};

/* Unnamed group wrapping any_ctrl_attrs[]. */
static const struct attribute_group any_ctrl_attr_grp = {
        .attrs = any_ctrl_attrs,
};

/*
 * kobject type for thpsize objects: thpsize_release() is the release
 * callback and attribute access goes through kobj_sysfs_ops.
 */
static const struct kobj_type thpsize_ktype = {
        .release = &thpsize_release,
        .sysfs_ops = &kobj_sysfs_ops,
};

/* Zero-initialised per-CPU mTHP counters; summed over all CPUs by sum_mthp_stat(). */
DEFINE_PER_CPU(struct mthp_stat, mthp_stats) = {{{0}}};

static unsigned long sum_mthp_stat(int order, enum mthp_stat_item item)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct mthp_stat *this = &per_cpu(mthp_stats, cpu);

                sum += this->stats[order][item];
        }

        return sum;
}

/*
 * Define a read-only per-size statistics attribute.
 *
 * Generates <_name>_show(), which prints the sum over all CPUs of counter
 * @_index for the order of the thpsize object behind @kobj, and the
 * matching <_name>_attr object. The use site supplies the trailing ';'.
 */
#define DEFINE_MTHP_STAT_ATTR(_name, _index)                            \
static ssize_t _name##_show(struct kobject *kobj,                       \
                            struct kobj_attribute *attr, char *buf)     \
{                                                                       \
        return sysfs_emit(buf, "%lu\n",                                 \
                          sum_mthp_stat(to_thpsize(kobj)->order,        \
                                        _index));                       \
}                                                                       \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * One show function and attribute object per mTHP counter. The shmem
 * counters only exist when CONFIG_SHMEM is set.
 */
DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
DEFINE_MTHP_STAT_ATTR(swpin_fallback_charge, MTHP_STAT_SWPIN_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);

/*
 * Statistics that thpsize_create() adds only for orders contained in
 * THP_ORDERS_ALL_ANON. The swap counters are listed here only when
 * CONFIG_SHMEM is not set; with CONFIG_SHMEM they appear in
 * any_stats_attrs[] instead.
 */
static struct attribute *anon_stats_attrs[] = {
        &anon_fault_alloc_attr.attr,
        &anon_fault_fallback_attr.attr,
        &anon_fault_fallback_charge_attr.attr,
#ifndef CONFIG_SHMEM
        &zswpout_attr.attr,
        &swpin_attr.attr,
        &swpin_fallback_attr.attr,
        &swpin_fallback_charge_attr.attr,
        &swpout_attr.attr,
        &swpout_fallback_attr.attr,
#endif
        &split_deferred_attr.attr,
        &nr_anon_attr.attr,
        &nr_anon_partially_mapped_attr.attr,
        NULL,
};

/*
 * Named group: all three *_stats_attr_grp groups use the name "stats", which
 * is why they are registered through sysfs_add_group().
 */
static struct attribute_group anon_stats_attr_grp = {
        .name = "stats",
        .attrs = anon_stats_attrs,
};

/*
 * Statistics that thpsize_create() adds only for orders contained in
 * THP_ORDERS_ALL_FILE_DEFAULT. Without CONFIG_SHMEM the list is empty.
 */
static struct attribute *file_stats_attrs[] = {
#ifdef CONFIG_SHMEM
        &shmem_alloc_attr.attr,
        &shmem_fallback_attr.attr,
        &shmem_fallback_charge_attr.attr,
#endif
        NULL,
};

/* Named "stats" group wrapping file_stats_attrs[]. */
static struct attribute_group file_stats_attr_grp = {
        .name = "stats",
        .attrs = file_stats_attrs,
};

/*
 * Statistics that thpsize_create() adds for every order. The swap counters
 * are listed here only when CONFIG_SHMEM is set; otherwise they appear in
 * anon_stats_attrs[].
 */
static struct attribute *any_stats_attrs[] = {
#ifdef CONFIG_SHMEM
        &zswpout_attr.attr,
        &swpin_attr.attr,
        &swpin_fallback_attr.attr,
        &swpin_fallback_charge_attr.attr,
        &swpout_attr.attr,
        &swpout_fallback_attr.attr,
#endif
        &split_attr.attr,
        &split_failed_attr.attr,
        NULL,
};

/* Named "stats" group wrapping any_stats_attrs[]. */
static struct attribute_group any_stats_attr_grp = {
        .name = "stats",
        .attrs = any_stats_attrs,
};

/*
 * Add attribute group @grp to @kobj.
 *
 * Returns 0 on success or the error code from sysfs_create_group().
 */
static int sysfs_add_group(struct kobject *kobj,
                           const struct attribute_group *grp)
{
        /*
         * A named group may already have its subdirectory, so try merging
         * into it first. This avoids the warning sysfs_create_group() emits
         * when the directory already exists. Only if the merge fails (or
         * the group is unnamed) is the group created from scratch.
         */
        if (grp->name && !sysfs_merge_group(kobj, grp))
                return 0;

        return sysfs_create_group(kobj, grp);
}

/*
 * Allocate a thpsize object for @order, register it in sysfs under @parent
 * as "hugepages-<size>kB" and attach the control and statistics attribute
 * groups that apply to that order.
 *
 * Returns the new object, or an ERR_PTR() carrying a negative errno. Once
 * kobject_init_and_add() has been called the object is only released via
 * kobject_put(), so thpsize_release() is what frees it.
 */
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
        unsigned long size = (PAGE_SIZE << order) / SZ_1K;
        struct thpsize *thpsize;
        int ret = -ENOMEM;

        thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
        if (!thpsize)
                goto err;

        thpsize->order = order;

        /*
         * Even when kobject_init_and_add() fails the kobject has been
         * initialised and may own an allocated name, so the reference must
         * be dropped with kobject_put() rather than freeing the memory
         * directly; the release callback then frees the thpsize.
         */
        ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
                                   "hugepages-%lukB", size);
        if (ret)
                goto err_put;

        /* Groups that apply to every order. */
        ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
        if (ret)
                goto err_put;

        ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
        if (ret)
                goto err_put;

        /* Groups for orders contained in THP_ORDERS_ALL_ANON. */
        if (BIT(order) & THP_ORDERS_ALL_ANON) {
                ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
                if (ret)
                        goto err_put;

                ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
                if (ret)
                        goto err_put;
        }

        /* Groups for orders contained in THP_ORDERS_ALL_FILE_DEFAULT. */
        if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
                ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
                if (ret)
                        goto err_put;

                ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
                if (ret)
                        goto err_put;
        }

        return thpsize;
err_put:
        kobject_put(&thpsize->kobj);
err:
        return ERR_PTR(ret);
}

/*
 * Release callback of thpsize_ktype: frees the thpsize object that embeds
 * @kobj.
 */
static void thpsize_release(struct kobject *kobj)
{
        struct thpsize *thpsize = to_thpsize(kobj);

        kfree(thpsize);
}

/*
 * Create the "transparent_hugepage" kobject under mm_kobj, register the
 * hugepage and khugepaged attribute groups on it, and create one thpsize
 * child object for each order in
 * THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT.
 *
 * The new kobject is stored in *@hugepage_kobj. Returns 0 on success; on
 * failure whatever was set up so far is torn down again and a negative
 * errno is returned.
 */
static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
        int err;
        struct thpsize *thpsize;
        unsigned long orders;
        int order;

        /*
         * Default to setting PMD-sized THP to inherit the global setting and
         * disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
         * constant so we have to do this here.
         */
        if (!anon_orders_configured)
                huge_anon_orders_inherit = BIT(PMD_ORDER);

        *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
        if (unlikely(!*hugepage_kobj)) {
                pr_err("failed to create transparent hugepage kobject\n");
                return -ENOMEM;
        }

        err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
        if (err) {
                pr_err("failed to register transparent hugepage group\n");
                goto delete_obj;
        }

        /* Note: this failure logs the same message as the one above. */
        err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
        if (err) {
                pr_err("failed to register transparent hugepage group\n");
                goto remove_hp_group;
        }

        /*
         * Create one thpsize object per supported order, starting with the
         * highest. NOTE(review): next_order() presumably drops the current
         * order from 'orders' and returns the next one - confirm against
         * its definition.
         */
        orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
        order = highest_order(orders);
        while (orders) {
                thpsize = thpsize_create(order, *hugepage_kobj);
                if (IS_ERR(thpsize)) {
                        pr_err("failed to create thpsize for order %d\n", order);
                        err = PTR_ERR(thpsize);
                        goto remove_all;
                }
                list_add(&thpsize->node, &thpsize_list);
                order = next_order(&orders, order);
        }

        return 0;

remove_all:
        /* Drops the thpsize objects created so far, both groups and the kobject. */
        hugepage_exit_sysfs(*hugepage_kobj);
        return err;
remove_hp_group:
        sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
delete_obj:
        kobject_put(*hugepage_kobj);
        return err;
}

/*
 * Undo hugepage_init_sysfs(): unlink every thpsize object from
 * thpsize_list and drop its reference, remove the khugepaged and hugepage
 * attribute groups, then drop the reference on @hugepage_kobj.
 */
static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
        struct thpsize *cur, *next;

        /* The _safe iterator is required because entries are unlinked while walking. */
        list_for_each_entry_safe(cur, next, &thpsize_list, node) {
                list_del(&cur->node);
                kobject_put(&cur->kobj);
        }

        sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
        sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);

        kobject_put(hugepage_kobj);
}
#else
/* Stub for builds without CONFIG_SYSFS: nothing to register, always succeeds. */
static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
{
        return 0;
}

/* Stub for builds without CONFIG_SYSFS: nothing to tear down. */
static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
{
}
#endif /* CONFIG_SYSFS */

/*
 * Allocate and register the two THP shrinkers: "thp-zero" for the huge zero
 * page and "thp-deferred_split" for the deferred split queues.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; in the latter
 * case an already allocated zero-page shrinker is freed again.
 */
static int __init thp_shrinker_init(void)
{
        huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
        if (!huge_zero_page_shrinker)
                goto fail;

        deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
                                                 SHRINKER_MEMCG_AWARE |
                                                 SHRINKER_NONSLAB,
                                                 "thp-deferred_split");
        if (!deferred_split_shrinker)
                goto fail_free_zero;

        /* Callbacks are filled in before each shrinker is registered. */
        huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
        huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
        shrinker_register(huge_zero_page_shrinker);

        deferred_split_shrinker->count_objects = deferred_split_count;
        deferred_split_shrinker->scan_objects = deferred_split_scan;
        shrinker_register(deferred_split_shrinker);

        return 0;

fail_free_zero:
        shrinker_free(huge_zero_page_shrinker);
fail:
        return -ENOMEM;
}

/* Free both shrinkers set up by thp_shrinker_init(). */
static void __init thp_shrinker_exit(void)
{
        shrinker_free(huge_zero_page_shrinker);
        shrinker_free(deferred_split_shrinker);
}

/*
 * Subsystem initcall for transparent hugepages.
 *
 * Sets up, in order: the sysfs interface, khugepaged, and the THP
 * shrinkers, then starts or stops khugepaged according to the current
 * settings. Each failure unwinds the steps that already succeeded, in
 * reverse order, and returns the error.
 *
 * Returns -EINVAL (after marking THP as unsupported in
 * transparent_hugepage_flags) when has_transparent_hugepage() is false.
 */
static int __init hugepage_init(void)
{
        int err;
        struct kobject *hugepage_kobj;

        if (!has_transparent_hugepage()) {
                transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;
                return -EINVAL;
        }

        /*
         * hugepages can't be allocated by the buddy allocator
         */
        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER > MAX_PAGE_ORDER);

        err = hugepage_init_sysfs(&hugepage_kobj);
        if (err)
                goto err_sysfs;

        err = khugepaged_init();
        if (err)
                goto err_slab;

        err = thp_shrinker_init();
        if (err)
                goto err_shrinker;

        /*
         * By default disable transparent hugepages on smaller systems,
         * where the extra memory used could hurt more than TLB overhead
         * is likely to save.  The admin can still enable it through /sys.
         */
        /* 512 << (20 - PAGE_SHIFT) is the number of pages in 512 MiB. */
        if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
                transparent_hugepage_flags = 0;
                return 0;
        }

        err = start_stop_khugepaged();
        if (err)
                goto err_khugepaged;

        return 0;
        /* Each label undoes the step that succeeded before the failing one. */
err_khugepaged:
        thp_shrinker_exit();
err_shrinker:
        khugepaged_destroy();
err_slab:
        hugepage_exit_sysfs(hugepage_kobj);
err_sysfs:
        return err;
}
subsys_initcall(hugepage_init);

/*
 * Handler for the "transparent_hugepage=" boot parameter.
 *
 * Accepts "always", "madvise" or "never" and updates
 * TRANSPARENT_HUGEPAGE_FLAG and TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG in
 * transparent_hugepage_flags accordingly.
 *
 * Returns 1 when the value was recognised. For a NULL or unrecognised
 * value a warning is logged and 0 is returned.
 */
static int __init setup_transparent_hugepage(char *str)
{
        if (!str)
                goto unparsed;

        if (strcmp(str, "always") == 0) {
                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
                        &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);
                return 1;
        }

        if (strcmp(str, "madvise") == 0) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);
                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                        &transparent_hugepage_flags);
                return 1;
        }

        if (strcmp(str, "never") == 0) {
                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
                          &transparent_hugepage_flags);
                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
                          &transparent_hugepage_flags);
                return 1;
        }

unparsed:
        pr_warn("transparent_hugepage= cannot parse, ignored\n");
        return 0;
}
__setup("transparent_hugepage=", setup_transparent_hugepage);

/* Scratch copy of the "thp_anon=" argument; strsep() modifies it in place. */
static char str_dup[PAGE_SIZE] __initdata;
/*
 * Handler for the "thp_anon=" boot parameter.
 *
 * The value is a ';'-separated list of "<sizes>:<policy>" entries. <sizes>
 * is a ','-separated list whose items are either a single size or a
 * "<start>-<end>" range; <policy> is one of "always", "madvise", "inherit"
 * or "never". Sizes are translated to orders by get_order_from_str().
 *
 * The three policy bitmaps are updated on local copies and only written
 * back to huge_anon_orders_* once the whole string has parsed, so a
 * malformed string leaves the current settings untouched.
 *
 * Returns 1 on success (and sets anon_orders_configured), 0 after logging a
 * warning on any parse error.
 */
static int __init setup_thp_anon(char *str)
{
        char *token, *range, *policy, *subtoken;
        unsigned long always, inherit, madvise;
        char *start_size, *end_size;
        int start, end, nr;
        char *p;

        /* Reject input that would not fit into str_dup including its NUL. */
        if (!str || strlen(str) + 1 > PAGE_SIZE)
                goto err;
        strscpy(str_dup, str);

        always = huge_anon_orders_always;
        madvise = huge_anon_orders_madvise;
        inherit = huge_anon_orders_inherit;
        p = str_dup;
        while ((token = strsep(&p, ";")) != NULL) {
                /* After this strsep(), 'token' is the text after ':' or NULL. */
                range = strsep(&token, ":");
                policy = token;

                if (!policy)
                        goto err;

                while ((subtoken = strsep(&range, ",")) != NULL) {
                        if (strchr(subtoken, '-')) {
                                start_size = strsep(&subtoken, "-");
                                end_size = subtoken;

                                start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON);
                                end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON);
                        } else {
                                start_size = end_size = subtoken;
                                start = end = get_order_from_str(subtoken,
                                                                 THP_ORDERS_ALL_ANON);
                        }

                        if (start == -EINVAL) {
                                pr_err("invalid size %s in thp_anon boot parameter\n", start_size);
                                goto err;
                        }

                        if (end == -EINVAL) {
                                pr_err("invalid size %s in thp_anon boot parameter\n", end_size);
                                goto err;
                        }

                        /* Any other negative result, or a reversed range, is also rejected. */
                        if (start < 0 || end < 0 || start > end)
                                goto err;

                        /* Set orders start..end in the chosen bitmap, clear them in the others. */
                        nr = end - start + 1;
                        if (!strcmp(policy, "always")) {
                                bitmap_set(&always, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                        } else if (!strcmp(policy, "madvise")) {
                                bitmap_set(&madvise, start, nr);
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&always, start, nr);
                        } else if (!strcmp(policy, "inherit")) {
                                bitmap_set(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                        } else if (!strcmp(policy, "never")) {
                                bitmap_clear(&inherit, start, nr);
                                bitmap_clear(&madvise, start, nr);
                                bitmap_clear(&always, start, nr);
                        } else {
                                pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
                                goto err;
                        }
                }
        }

        /* Everything parsed: commit the new bitmaps. */
        huge_anon_orders_always = always;
        huge_anon_orders_madvise = madvise;
        huge_anon_orders_inherit = inherit;
        anon_orders_configured = true;
        return 1;

err:
        pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
        return 0;
}
__setup("thp_anon=", setup_thp_anon);

/*
 * Return @pmd passed through pmd_mkwrite() if @vma has VM_WRITE set,
 * otherwise return @pmd unchanged. The writable case is the expected one.
 */
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pmd;

        return pmd_mkwrite(pmd, vma);
}

#ifdef CONFIG_MEMCG
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

        if (memcg)
                return &memcg->deferred_split_queue;
        else
                return &pgdat->deferred_split_queue;
}
#else
static inline
struct deferred_split *get_deferred_split_queue(struct folio *folio)
{
        struct pglist_data *pgdat = NODE_DATA(folio_nid(folio));

        return &pgdat->deferred_split_queue;
}
#endif

/*
 * True if @folio is a large folio that is either the huge zero folio or
 * marked large-rmappable; false for anything else.
 */
static inline bool is_transparent_hugepage(const struct folio *folio)
{
        return folio_test_large(folio) &&
               (is_huge_zero_folio(folio) ||
                folio_test_large_rmappable(folio));
}

/*
 * Try to find an unmapped area for a mapping of @len bytes at file offset
 * @off whose address is congruent to @off modulo @size, by searching for
 * an area padded by @size and then shifting the result.
 *
 * Returns the chosen address, or 0 when no aligned placement is attempted
 * or possible; the caller then falls back to an unpadded search.
 * NOTE(review): the masking with (size - 1) assumes @size is a power of
 * two - confirm for all callers.
 */
static unsigned long __thp_get_unmapped_area(struct file *filp,
                unsigned long addr, unsigned long len,
                loff_t off, unsigned long flags, unsigned long size,
                vm_flags_t vm_flags)
{
        loff_t off_end = off + len;
        loff_t off_align = round_up(off, size);
        unsigned long len_pad, ret, off_sub;

        /* Only attempted for native 64-bit callers. */
        if (!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall())
                return 0;

        /* The range must cover at least @size bytes past the aligned offset. */
        if (off_end <= off_align || (off_end - off_align) < size)
                return 0;

        /* Bail out if padding the length, or adding it to @off, overflows. */
        len_pad = len + size;
        if (len_pad < len || (off + len_pad) < off)
                return 0;

        ret = mm_get_unmapped_area_vmflags(current->mm, filp, addr, len_pad,
                                           off >> PAGE_SHIFT, flags, vm_flags);

        /*
         * The failure might be due to length padding. The caller will retry
         * without the padding.
         */
        if (IS_ERR_VALUE(ret))
                return 0;

        /*
         * Do not try to align to THP boundary if allocation at the address
         * hint succeeds.
         */
        if (ret == addr)
                return addr;

        /* Distance to advance so that (ret - off) becomes a multiple of @size. */
        off_sub = (off - ret) & (size - 1);

        /* Already aligned in a top-down layout: use the upper end of the padded area. */
        if (test_bit(MMF_TOPDOWN, &current->mm->flags) && !off_sub)
                return ret + size;

        ret += off_sub;
        return ret;
}

/*
 * Find an unmapped area for a mapping of @len bytes at page offset @pgoff,
 * preferring a PMD_SIZE-aligned placement.
 *
 * The aligned search is tried first; if it yields 0 the regular
 * mm_get_unmapped_area_vmflags() search is used instead and its result is
 * returned as is.
 */
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags,
                vm_flags_t vm_flags)
{
        const loff_t off = (loff_t)pgoff << PAGE_SHIFT;
        unsigned long aligned;

        aligned = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE,
                                          vm_flags);
        if (!aligned)
                return mm_get_unmapped_area_vmflags(current->mm, filp, addr,
                                                    len, pgoff, flags,
                                                    vm_flags);

        return aligned;
}

/*
 * Exported wrapper around thp_get_unmapped_area_vmflags() that passes 0 as
 * the vm_flags argument.
 */
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags)
{
        return thp_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);

/*
 * Allocate, charge and prepare a PMD-order folio for an anonymous fault at
 * @addr in @vma.
 *
 * Returns the folio, zeroed if user_alloc_needs_zeroing() says so and
 * marked uptodate, or NULL if the allocation or the memcg charge failed.
 * Both failure paths bump the fallback counters; a charge failure
 * additionally bumps the *_FALLBACK_CHARGE counters.
 */
static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
                unsigned long addr)
{
        gfp_t gfp = vma_thp_gfp_mask(vma);
        const int order = HPAGE_PMD_ORDER;
        struct folio *folio;

        folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);

        if (unlikely(!folio)) {
                count_vm_event(THP_FAULT_FALLBACK);
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
                return NULL;
        }

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
        if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
                folio_put(folio);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
                count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
                return NULL;
        }
        folio_throttle_swaprate(folio, gfp);

        /*
         * When a folio is not zeroed during allocation (__GFP_ZERO not used)
         * or user folios require special handling, folio_zero_user() is used to
         * make sure that the page corresponding to the faulting address will be
         * hot in the cache after zeroing.
         */
        if (user_alloc_needs_zeroing())
                folio_zero_user(folio, addr);
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * folio_zero_user writes become visible before the set_pmd_at()
         * write.
         */
        __folio_mark_uptodate(folio);
        return folio;
}

/*
 * Map the freshly allocated anonymous @folio at @haddr through @pmd.
 *
 * Builds a dirty huge PMD entry (writable if @vma allows it), adds the
 * folio to the anon rmap and the LRU, installs the entry and updates the
 * MM_ANONPAGES counter and the THP fault-alloc statistics.
 * NOTE(review): the only caller in view holds the PMD lock around this
 * call - confirm that this is a requirement for all callers.
 */
static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
                struct vm_area_struct *vma, unsigned long haddr)
{
        pmd_t entry;

        entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
        folio_add_lru_vma(folio, vma);
        set_pmd_at(vma->vm_mm, haddr, pmd, entry);
        update_mmu_cache_pmd(vma, haddr, pmd);
        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
        count_vm_event(THP_FAULT_ALLOC);
        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}

/*
 * Handle an anonymous fault by allocating and mapping a new PMD-sized
 * folio.
 *
 * Returns VM_FAULT_FALLBACK if no folio could be allocated, VM_FAULT_OOM
 * if the page table to deposit could not be allocated, the result of
 * check_stable_address_space() or handle_userfault() where those apply, and
 * 0 otherwise - including the case where the PMD turned out to be
 * populated already once the lock was taken.
 */
static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        pgtable_t pgtable;
        vm_fault_t ret = 0;

        folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
        if (unlikely(!folio))
                return VM_FAULT_FALLBACK;

        pgtable = pte_alloc_one(vma->vm_mm);
        if (unlikely(!pgtable)) {
                ret = VM_FAULT_OOM;
                goto release;
        }

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_none(*vmf->pmd))) {
                /* PMD is no longer empty: drop our folio and page table, ret stays 0. */
                goto unlock_release;
        } else {
                ret = check_stable_address_space(vma->vm_mm);
                if (ret)
                        goto unlock_release;

                /* Deliver the page fault to userland */
                if (userfaultfd_missing(vma)) {
                        spin_unlock(vmf->ptl);
                        folio_put(folio);
                        pte_free(vma->vm_mm, pgtable);
                        ret = handle_userfault(vmf, VM_UFFD_MISSING);
                        VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        return ret;
                }
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
                mm_inc_nr_ptes(vma->vm_mm);
                deferred_split_folio(folio, false);
                spin_unlock(vmf->ptl);
        }

        return 0;
unlock_release:
        spin_unlock(vmf->ptl);
release:
        /* pgtable is NULL when we arrive here from the pte_alloc_one() failure. */
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
        folio_put(folio);
        return ret;

}

/*
 * always: directly stall for all thp allocations
 * defer: wake kswapd and fail if not immediately available
 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 *                  fail if not immediately available
 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 *            available
 * never: never stall for any thp allocation
 */
gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
{
        const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);

        /* Always do synchronous compaction */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);

        /* Kick kcompactd and fail quickly */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;

        /* Synchronous compaction if madvised, otherwise kick kcompactd */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT |
                        (vma_madvised ? __GFP_DIRECT_RECLAIM :
                                        __GFP_KSWAPD_RECLAIM);

        /* Only do synchronous compaction if madvised */
        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
                return GFP_TRANSHUGE_LIGHT |
                       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);

        return GFP_TRANSHUGE_LIGHT;
}

/*
 * Map @zero_folio at @haddr through @pmd with a huge PMD entry built from
 * the vma's page protection, depositing @pgtable for the mapping and
 * accounting it in @mm.
 *
 * Caller must hold page table lock.
 */
static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
                struct folio *zero_folio)
{
        pmd_t entry = pmd_mkhuge(mk_pmd(&zero_folio->page, vma->vm_page_prot));

        pgtable_trans_huge_deposit(mm, pmd, pgtable);
        set_pmd_at(mm, haddr, pmd, entry);
        mm_inc_nr_ptes(mm);
}

/*
 * Entry point for anonymous faults that may be served with a PMD-sized
 * mapping.
 *
 * Returns VM_FAULT_FALLBACK when the vma is not suitable for a PMD-order
 * mapping at this address or when the huge zero folio is unavailable. Read
 * faults are mapped with the huge zero folio when the mm permits it and
 * transparent_hugepage_use_zero_page() is true; every other fault is
 * handed to __do_huge_pmd_anonymous_page().
 */
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret;

        if (!thp_vma_suitable_order(vma, haddr, PMD_ORDER))
                return VM_FAULT_FALLBACK;
        ret = vmf_anon_prepare(vmf);
        if (ret)
                return ret;
        khugepaged_enter_vma(vma, vma->vm_flags);

        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm) &&
                        transparent_hugepage_use_zero_page()) {
                pgtable_t pgtable;
                struct folio *zero_folio;
                /* Note: shadows the function-level 'ret' declared above. */
                vm_fault_t ret;

                pgtable = pte_alloc_one(vma->vm_mm);
                if (unlikely(!pgtable))
                        return VM_FAULT_OOM;
                zero_folio = mm_get_huge_zero_folio(vma->vm_mm);
                if (unlikely(!zero_folio)) {
                        pte_free(vma->vm_mm, pgtable);
                        count_vm_event(THP_FAULT_FALLBACK);
                        return VM_FAULT_FALLBACK;
                }
                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
                ret = 0;
                /*
                 * Every branch below drops the lock; all but the one that
                 * installs the mapping also free the unused page table.
                 */
                if (pmd_none(*vmf->pmd)) {
                        ret = check_stable_address_space(vma->vm_mm);
                        if (ret) {
                                spin_unlock(vmf->ptl);
                                pte_free(vma->vm_mm, pgtable);
                        } else if (userfaultfd_missing(vma)) {
                                spin_unlock(vmf->ptl);
                                pte_free(vma->vm_mm, pgtable);
                                ret = handle_userfault(vmf, VM_UFFD_MISSING);
                                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
                        } else {
                                set_huge_zero_folio(pgtable, vma->vm_mm, vma,
                                                   haddr, vmf->pmd, zero_folio);
                                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                                spin_unlock(vmf->ptl);
                        }
                } else {
                        /* PMD already populated: nothing to install, ret stays 0. */
                        spin_unlock(vmf->ptl);
                        pte_free(vma->vm_mm, pgtable);
                }
                return ret;
        }

        return __do_huge_pmd_anonymous_page(vmf);
}

/*
 * insert_pfn_pmd - install, or refresh, a huge PMD entry mapping @pfn
 * @vma: VMA the mapping belongs to
 * @addr: PMD-aligned virtual address of the mapping
 * @pmd: PMD slot to populate
 * @pfn: pfn to map
 * @prot: page protection used to build a new entry
 * @write: whether the entry should be made young/dirty (and writable if the
 *         VMA permits it)
 * @pgtable: optional preallocated page table to deposit, or NULL
 *
 * The PMD lock covering @pmd must be held by the caller (lockdep-asserted).
 *
 * If @pmd is already populated no new entry is installed. For a write request
 * whose pfn matches the existing entry, the access flags of that entry are
 * upgraded in place. @pgtable is not consumed on this path; the callers in
 * this file free it when a non-zero value is returned.
 *
 * Return: 0 if a new entry was installed, -EEXIST if @pmd was already
 * populated.
 */
static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
                pgtable_t pgtable)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t entry;

        lockdep_assert_held(pmd_lockptr(mm, pmd));

        if (!pmd_none(*pmd)) {
                if (write) {
                        /*
                         * A different pfn is only expected when the slot
                         * currently holds the huge zero page; anything else
                         * triggers a one-time warning.
                         */
                        if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
                                WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
                                return -EEXIST;
                        }
                        /* Same pfn: upgrade the existing entry in place. */
                        entry = pmd_mkyoung(*pmd);
                        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                        if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
                                update_mmu_cache_pmd(vma, addr, pmd);
                }

                /* Already populated: tell the caller @pgtable was not used. */
                return -EEXIST;
        }

        /*
         * Build a fresh huge entry.
         * NOTE(review): any pfn that is not flagged devmap ends up marked
         * special here - confirm that is intended for every caller.
         */
        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pmd_mkdevmap(entry);
        else
                entry = pmd_mkspecial(entry);
        if (write) {
                entry = pmd_mkyoung(pmd_mkdirty(entry));
                entry = maybe_pmd_mkwrite(entry, vma);
        }

        /* Deposit (and account) the preallocated table before publishing. */
        if (pgtable) {
                pgtable_trans_huge_deposit(mm, pmd, pgtable);
                mm_inc_nr_ptes(mm);
        }

        set_pmd_at(mm, addr, pmd, entry);
        update_mmu_cache_pmd(vma, addr, pmd);
        return 0;
}

/**
 * vmf_insert_pfn_pmd - insert a pmd size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Map @pfn with one PMD-sized entry at the PMD-aligned fault address.
 * See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        const unsigned long haddr = vmf->address & PMD_MASK;
        const unsigned long pfnmap_bits =
                        vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP);
        pgprot_t prot = vma->vm_page_prot;
        pgtable_t deposit = NULL;
        spinlock_t *lock;
        int err;

        /*
         * If we had pmd_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!pfnmap_bits && !pfn_t_devmap(pfn));
        BUG_ON(pfnmap_bits == (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        /* The aligned address must still fall inside the VMA. */
        if (haddr < vma->vm_start || haddr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        /* Preallocate the page table to deposit, where the arch wants one. */
        if (arch_needs_pgtable_deposit()) {
                deposit = pte_alloc_one(vma->vm_mm);
                if (!deposit)
                        return VM_FAULT_OOM;
        }

        track_pfn_insert(vma, &prot, pfn);

        lock = pmd_lock(vma->vm_mm, vmf->pmd);
        err = insert_pfn_pmd(vma, haddr, vmf->pmd, pfn, prot, write, deposit);
        spin_unlock(lock);

        /* The preallocated table was not consumed: give it back. */
        if (err && deposit)
                pte_free(vma->vm_mm, deposit);

        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);

/**
 * vmf_insert_folio_pmd - insert a pmd size folio mapped by a pmd entry
 * @vmf: Structure describing the fault
 * @folio: folio to insert
 * @write: whether it's a write fault
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
                                bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        const unsigned long haddr = vmf->address & PMD_MASK;
        pgtable_t deposit = NULL;
        spinlock_t *lock;
        int err;

        if (haddr < vma->vm_start || haddr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        /* Only folios that exactly fill a PMD can be mapped this way. */
        if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
                return VM_FAULT_SIGBUS;

        if (arch_needs_pgtable_deposit()) {
                deposit = pte_alloc_one(mm);
                if (!deposit)
                        return VM_FAULT_OOM;
        }

        lock = pmd_lock(mm, vmf->pmd);

        /*
         * Reference, rmap and counter are only taken when the slot is empty;
         * a populated slot is handed to insert_pfn_pmd() untouched, which may
         * still upgrade its access flags.
         */
        if (pmd_none(*vmf->pmd)) {
                folio_get(folio);
                folio_add_file_rmap_pmd(folio, &folio->page, vma);
                add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
        }

        err = insert_pfn_pmd(vma, haddr, vmf->pmd,
                             pfn_to_pfn_t(folio_pfn(folio)),
                             vma->vm_page_prot, write, deposit);
        spin_unlock(lock);

        /* The preallocated table was not consumed: give it back. */
        if (err && deposit)
                pte_free(mm, deposit);

        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Return @pud made writable when @vma allows writes (VM_WRITE set),
 * otherwise return it unchanged.
 */
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
{
        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return pud;

        return pud_mkwrite(pud);
}

static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
                pud_t *pud, pfn_t pfn, bool write)
{
        struct mm_struct *mm = vma->vm_mm;
        pgprot_t prot = vma->vm_page_prot;
        pud_t entry;

        if (!pud_none(*pud)) {
                if (write) {
                        if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
                                return;
                        entry = pud_mkyoung(*pud);
                        entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
                        if (pudp_set_access_flags(vma, addr, pud, entry, 1))
                                update_mmu_cache_pud(vma, addr, pud);
                }
                return;
        }

        entry = pud_mkhuge(pfn_t_pud(pfn, prot));
        if (pfn_t_devmap(pfn))
                entry = pud_mkdevmap(entry);
        else
                entry = pud_mkspecial(entry);
        if (write) {
                entry = pud_mkyoung(pud_mkdirty(entry));
                entry = maybe_pud_mkwrite(entry, vma);
        }
        set_pud_at(mm, addr, pud, entry);
        update_mmu_cache_pud(vma, addr, pud);
}

/**
 * vmf_insert_pfn_pud - insert a pud size pfn
 * @vmf: Structure describing the fault
 * @pfn: pfn to insert
 * @write: whether it's a write fault
 *
 * Insert a pud size pfn. See vmf_insert_pfn() for additional info.
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
        unsigned long addr = vmf->address & PUD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        pgprot_t pgprot = vma->vm_page_prot;
        spinlock_t *ptl;

        /*
         * If we had pud_special, we could avoid all these restrictions,
         * but we need to be consistent with PTEs and architectures that
         * can't support a 'special' bit.
         */
        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
                        !pfn_t_devmap(pfn));
        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
                                                (VM_PFNMAP|VM_MIXEDMAP));
        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        track_pfn_insert(vma, &pgprot, pfn);

        ptl = pud_lock(vma->vm_mm, vmf->pud);
        insert_pfn_pud(vma, addr, vmf->pud, pfn, write);
        spin_unlock(ptl);

        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);

/**
 * vmf_insert_folio_pud - insert a pud size folio mapped by a pud entry
 * @vmf: Structure describing the fault
 * @folio: folio to insert
 * @write: whether it's a write fault
 *
 * Return: vm_fault_t value.
 */
vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
                                bool write)
{
        struct vm_area_struct *vma = vmf->vma;
        unsigned long addr = vmf->address & PUD_MASK;
        pud_t *pud = vmf->pud;
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;

        if (addr < vma->vm_start || addr >= vma->vm_end)
                return VM_FAULT_SIGBUS;

        if (WARN_ON_ONCE(folio_order(folio) != PUD_ORDER))
                return VM_FAULT_SIGBUS;

        ptl = pud_lock(mm, pud);

        /*
         * If there is already an entry present we assume the folio is
         * already mapped, hence no need to take another reference. We
         * still call insert_pfn_pud() though in case the mapping needs
         * upgrading to writeable.
         */
        if (pud_none(*vmf->pud)) {
                folio_get(folio);
                folio_add_file_rmap_pud(folio, &folio->page, vma);
                add_mm_counter(mm, mm_counter_file(folio), HPAGE_PUD_NR);
        }
        insert_pfn_pud(vma, addr, vmf->pud, pfn_to_pfn_t(folio_pfn(folio)),
                write);
        spin_unlock(ptl);

        return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Mark the huge PMD at @pmd young, and dirty as well when @write is set.
 * The MMU cache is only updated when the access flags actually changed.
 */
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write)
{
        pmd_t entry = pmd_mkyoung(*pmd);

        if (write)
                entry = pmd_mkdirty(entry);

        if (!pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
                                   pmd, entry, write))
                return;

        update_mmu_cache_pmd(vma, addr, pmd);
}

/*
 * Look up the page backing @addr inside a devmap huge PMD.
 *
 * The PMD lock must be held by the caller. Returns NULL when the entry is
 * not a present devmap PMD or a write was requested on a read-only entry,
 * an ERR_PTR() when the caller did not ask for a reference (-EEXIST), the
 * pagemap lookup failed (-EFAULT) or grabbing the folio failed, and the
 * page otherwise. *@pgmap is updated with the pagemap lookup result.
 */
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long pfn = pmd_pfn(*pmd);
        struct page *page;
        int err;

        assert_spin_locked(pmd_lockptr(mm, pmd));

        if ((flags & FOLL_WRITE) && !pmd_write(*pmd))
                return NULL;

        if (!pmd_present(*pmd) || !pmd_devmap(*pmd))
                return NULL;

        if (flags & FOLL_TOUCH)
                touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);

        /*
         * device mapped pages can only be returned if the
         * caller will manage the page reference count.
         */
        if (!(flags & (FOLL_GET | FOLL_PIN)))
                return ERR_PTR(-EEXIST);

        /* Step from the head pfn to the subpage that covers @addr. */
        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;

        *pgmap = get_dev_pagemap(pfn, *pgmap);
        if (!*pgmap)
                return ERR_PTR(-EFAULT);

        page = pfn_to_page(pfn);
        err = try_grab_folio(page_folio(page), 1, flags);
        if (err)
                return ERR_PTR(err);

        return page;
}

/*
 * copy_huge_pmd - duplicate the huge PMD at @src_pmd into @dst_pmd
 * @dst_mm: destination address space
 * @src_mm: source address space
 * @dst_pmd: destination PMD slot
 * @src_pmd: source PMD slot
 * @addr: virtual address the PMD maps
 * @dst_vma: destination VMA
 * @src_vma: source VMA
 *
 * Special PMDs are copied as-is (marked old). For other entries only
 * anonymous destination VMAs are handled: migration entries are copied as
 * readable migration entries, the huge zero folio gets another reference,
 * and regular anonymous folios are reference-counted and write-protected in
 * both source and destination.
 *
 * Return: 0 on success or when there is nothing to copy, -ENOMEM if the page
 * table to deposit cannot be allocated, -EAGAIN if the source is no longer a
 * huge PMD or if duplicating the anon rmap failed (the source PMD is split
 * before returning in that case).
 */
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
        struct folio *src_folio;
        pmd_t pmd;
        pgtable_t pgtable = NULL;
        int ret = -ENOMEM;

        pmd = pmdp_get_lockless(src_pmd);
        if (unlikely(pmd_present(pmd) && pmd_special(pmd))) {
                dst_ptl = pmd_lock(dst_mm, dst_pmd);
                src_ptl = pmd_lockptr(src_mm, src_pmd);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                /*
                 * No need to recheck the pmd, it can't change with write
                 * mmap lock held here.
                 *
                 * Meanwhile, making sure it's not a CoW VMA with writable
                 * mapping, otherwise it means either the anon page wrongly
                 * applied special bit, or we made the PRIVATE mapping be
                 * able to wrongly write to the backend MMIO.
                 */
                VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
                goto set_pmd;
        }

        /* Skip: a non-anonymous mapping can be refilled on fault. */
        if (!vma_is_anonymous(dst_vma))
                return 0;

        /* Allocate the table to deposit before taking any lock. */
        pgtable = pte_alloc_one(dst_mm);
        if (unlikely(!pgtable))
                goto out;

        /* Destination lock first, then the source lock nested inside it. */
        dst_ptl = pmd_lock(dst_mm, dst_pmd);
        src_ptl = pmd_lockptr(src_mm, src_pmd);
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

        ret = -EAGAIN;
        /* Re-read the source entry now that both locks are held. */
        pmd = *src_pmd;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);

                VM_BUG_ON(!is_pmd_migration_entry(pmd));
                /*
                 * Downgrade a non-readable migration entry to a readable one
                 * in the source, carrying over soft-dirty and uffd-wp bits.
                 */
                if (!is_readable_migration_entry(entry)) {
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*src_pmd))
                                pmd = pmd_swp_mksoft_dirty(pmd);
                        if (pmd_swp_uffd_wp(*src_pmd))
                                pmd = pmd_swp_mkuffd_wp(pmd);
                        set_pmd_at(src_mm, addr, src_pmd, pmd);
                }
                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(dst_mm);
                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
                /* uffd-wp is only kept when the destination VMA uses it. */
                if (!userfaultfd_wp(dst_vma))
                        pmd = pmd_swp_clear_uffd_wp(pmd);
                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
                ret = 0;
                goto out_unlock;
        }
#endif

        /* Not a huge PMD (any more): release the table, return -EAGAIN. */
        if (unlikely(!pmd_trans_huge(pmd))) {
                pte_free(dst_mm, pgtable);
                goto out_unlock;
        }
        /*
         * When page table lock is held, the huge zero pmd should not be
         * under splitting since we don't split the page itself, only pmd to
         * a page table.
         */
        if (is_huge_zero_pmd(pmd)) {
                /*
                 * mm_get_huge_zero_folio() will never allocate a new
                 * folio here, since we already have a zero page to
                 * copy. It just takes a reference.
                 */
                mm_get_huge_zero_folio(dst_mm);
                goto out_zero_page;
        }

        src_page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
        src_folio = page_folio(src_page);

        folio_get(src_folio);
        if (unlikely(folio_try_dup_anon_rmap_pmd(src_folio, src_page, dst_vma, src_vma))) {
                /* Page maybe pinned: split and retry the fault on PTEs. */
                folio_put(src_folio);
                pte_free(dst_mm, pgtable);
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
                __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
                return -EAGAIN;
        }
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
out_zero_page:
        /* Deposit the table, then write-protect both source and copy. */
        mm_inc_nr_ptes(dst_mm);
        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
        if (!userfaultfd_wp(dst_vma))
                pmd = pmd_clear_uffd_wp(pmd);
        pmd = pmd_wrprotect(pmd);
set_pmd:
        /* The copy starts out old in the destination. */
        pmd = pmd_mkold(pmd);
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);

        ret = 0;
out_unlock:
        spin_unlock(src_ptl);
        spin_unlock(dst_ptl);
out:
        return ret;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Mark the huge PUD at @pud young, and dirty as well when @write is set.
 * The MMU cache is only updated when the access flags actually changed.
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write)
{
        pud_t entry = pud_mkyoung(*pud);

        if (write)
                entry = pud_mkdirty(entry);

        if (!pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
                                   pud, entry, write))
                return;

        update_mmu_cache_pud(vma, addr, pud);
}

/*
 * Duplicate the huge PUD at @src_pud into @dst_pud.
 *
 * Returns 0 on success, or -EAGAIN when the source entry is neither a
 * transparent huge PUD nor a devmap PUD.
 */
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma)
{
        spinlock_t *dst_lock, *src_lock;
        int err = -EAGAIN;
        pud_t entry;

        /* Destination lock first, then the source lock nested inside it. */
        dst_lock = pud_lock(dst_mm, dst_pud);
        src_lock = pud_lockptr(src_mm, src_pud);
        spin_lock_nested(src_lock, SINGLE_DEPTH_NESTING);

        entry = *src_pud;
        if (likely(pud_trans_huge(entry) || pud_devmap(entry))) {
                /*
                 * TODO: once we support anonymous pages, use
                 * folio_try_dup_anon_rmap_*() and split if duplicating fails.
                 */
                if (is_cow_mapping(vma->vm_flags) && pud_write(entry)) {
                        pudp_set_wrprotect(src_mm, addr, src_pud);
                        entry = pud_wrprotect(entry);
                }
                entry = pud_mkold(entry);
                set_pud_at(dst_mm, addr, dst_pud, entry);

                err = 0;
        }

        spin_unlock(src_lock);
        spin_unlock(dst_lock);
        return err;
}

/*
 * Under the PUD lock, touch the faulting huge PUD (young, plus dirty for a
 * write fault) provided it still equals @orig_pud.
 */
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
        struct vm_area_struct *vma = vmf->vma;
        const bool is_write = vmf->flags & FAULT_FLAG_WRITE;

        vmf->ptl = pud_lock(vma->vm_mm, vmf->pud);
        if (likely(pud_same(*vmf->pud, orig_pud)))
                touch_pud(vma, vmf->address, vmf->pud, is_write);
        spin_unlock(vmf->ptl);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

void huge_pmd_set_accessed(struct vm_fault *vmf)
{
        bool write = vmf->flags & FAULT_FLAG_WRITE;

        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
                goto unlock;

        touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);

unlock:
        spin_unlock(vmf->ptl);
}

/*
 * do_huge_zero_wp_pmd - replace a huge zero PMD mapping with a new anon folio
 * @vmf: Structure describing the fault
 *
 * Allocates a PMD-sized anonymous folio and, if the PMD still equals
 * vmf->orig_pmd under the PMD lock, clears and flushes the old entry and
 * maps the new folio in its place. The change is bracketed by an MMU
 * notifier invalidation of the PMD-sized range.
 *
 * Return: VM_FAULT_FALLBACK if the folio could not be allocated, the result
 * of check_stable_address_space() if that is non-zero, 0 otherwise
 * (including when the PMD changed and nothing was done).
 */
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
{
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        struct vm_area_struct *vma = vmf->vma;
        struct mmu_notifier_range range;
        struct folio *folio;
        vm_fault_t ret = 0;

        /* Allocate before taking the lock or starting the notifier range. */
        folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
        if (unlikely(!folio))
                return VM_FAULT_FALLBACK;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr,
                                haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        /* The PMD changed since the fault was taken: drop the new folio. */
        if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
                goto release;
        ret = check_stable_address_space(vma->vm_mm);
        if (ret)
                goto release;
        /* Old entry value is not needed; only the clear + flush matter. */
        (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);
        map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
        goto unlock;
release:
        folio_put(folio);
unlock:
        spin_unlock(vmf->ptl);
        mmu_notifier_invalidate_range_end(&range);
        return ret;
}

/*
 * do_huge_pmd_wp_page - handle a write-protect or unshare fault on a huge PMD
 * @vmf: Structure describing the fault
 *
 * For a huge zero PMD, try to replace it with a fresh anonymous folio. For
 * any other entry, try to reuse the mapped folio in place: either it is
 * already marked anon-exclusive, or it can be made exclusive because no
 * additional references exist. If neither works, split the PMD and let the
 * caller fall back.
 *
 * Return: 0 if the fault was handled or the PMD changed underneath us,
 * VM_FAULT_FALLBACK after the PMD was split, or another value returned by
 * do_huge_zero_wp_pmd().
 */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
{
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        pmd_t orig_pmd = vmf->orig_pmd;

        /* Only look up the lock here; it is taken further down. */
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);

        if (is_huge_zero_pmd(orig_pmd)) {
                vm_fault_t ret = do_huge_zero_wp_pmd(vmf);

                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;

                /* Fallback to splitting PMD if THP cannot be allocated */
                goto fallback;
        }

        spin_lock(vmf->ptl);

        /* The PMD changed since the fault was taken: nothing to do here. */
        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }

        page = pmd_page(orig_pmd);
        folio = page_folio(page);
        VM_BUG_ON_PAGE(!PageHead(page), page);

        /* Early check when only holding the PT lock. */
        if (PageAnonExclusive(page))
                goto reuse;

        if (!folio_trylock(folio)) {
                /*
                 * Pin the folio, drop the PT lock, take the folio lock, then
                 * retake the PT lock and revalidate the PMD.
                 */
                folio_get(folio);
                spin_unlock(vmf->ptl);
                folio_lock(folio);
                spin_lock(vmf->ptl);
                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                        spin_unlock(vmf->ptl);
                        folio_unlock(folio);
                        folio_put(folio);
                        return 0;
                }
                folio_put(folio);
        }

        /* Recheck after temporarily dropping the PT lock. */
        if (PageAnonExclusive(page)) {
                folio_unlock(folio);
                goto reuse;
        }

        /*
         * See do_wp_page(): we can only reuse the folio exclusively if
         * there are no additional references. Note that we always drain
         * the LRU cache immediately after adding a THP.
         */
        if (folio_ref_count(folio) >
                        1 + folio_test_swapcache(folio) * folio_nr_pages(folio))
                goto unlock_fallback;
        if (folio_test_swapcache(folio))
                folio_free_swap(folio);
        if (folio_ref_count(folio) == 1) {
                pmd_t entry;

                folio_move_anon_rmap(folio, vma);
                SetPageAnonExclusive(page);
                folio_unlock(folio);
reuse:
                /*
                 * Reached with the PT lock held and the folio lock released.
                 * An unshare fault needs no further change to the entry.
                 */
                if (unlikely(unshare)) {
                        spin_unlock(vmf->ptl);
                        return 0;
                }
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
                spin_unlock(vmf->ptl);
                return 0;
        }

unlock_fallback:
        folio_unlock(folio);
        spin_unlock(vmf->ptl);
fallback:
        /* Reached with no locks held. */
        __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
        return VM_FAULT_FALLBACK;
}

/*
 * Decide whether the huge PMD @pmd mapped in @vma may be made writable
 * directly, i.e. without going through a write fault first.
 */
static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
                                           unsigned long addr, pmd_t pmd)
{
        struct page *page;

        if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE)))
                return false;

        /*
         * Refuse, in this order, for entries that
         *  - are not even readable (NUMA hinting),
         *  - still need write faults for softdirty tracking,
         *  - still need write faults for uffd-wp tracking.
         */
        if (pmd_protnone(pmd) ||
            pmd_needs_soft_dirty_wp(vma, pmd) ||
            userfaultfd_huge_pmd_wp(vma, pmd))
                return false;

        /* Shared mapping: see can_change_pte_writable(). */
        if (vma->vm_flags & VM_SHARED)
                return pmd_dirty(pmd);

        /* Private mapping: see can_change_pte_writable(). */
        page = vm_normal_page_pmd(vma, addr, pmd);
        return page && PageAnon(page) && PageAnonExclusive(page);
}

/*
 * NUMA hinting page fault entry point for trans huge pmds
 *
 * Under the PMD lock, revalidates the PMD against vmf->orig_pmd, asks
 * numa_migrate_check() whether the folio should move, and if so tries to
 * migrate it with the lock dropped. Whenever the folio is not migrated (no
 * folio, no target node, isolation or migration failure) the PMD is restored
 * with the VMA's page protection. Always returns 0.
 */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct folio *folio;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        int nid = NUMA_NO_NODE;
        int target_nid, last_cpupid;
        pmd_t pmd, old_pmd;
        bool writable = false;
        int flags = 0;

        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        old_pmd = pmdp_get(vmf->pmd);

        /* The PMD changed since the fault was taken: nothing to do. */
        if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }

        pmd = pmd_modify(old_pmd, vma->vm_page_prot);

        /*
         * Detect now whether the PMD could be writable; this information
         * is only valid while holding the PT lock.
         */
        writable = pmd_write(pmd);
        if (!writable && vma_wants_manual_pte_write_upgrade(vma) &&
            can_change_pmd_writable(vma, vmf->address, pmd))
                writable = true;

        /* No normal folio behind the PMD: just restore the mapping. */
        folio = vm_normal_folio_pmd(vma, haddr, pmd);
        if (!folio)
                goto out_map;

        nid = folio_nid(folio);

        target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
                                        &last_cpupid);
        if (target_nid == NUMA_NO_NODE)
                goto out_map;
        if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
                flags |= TNF_MIGRATE_FAIL;
                goto out_map;
        }
        /* The folio is isolated and isolation code holds a folio reference. */
        spin_unlock(vmf->ptl);
        /* The writable decision is stale once the PT lock is dropped. */
        writable = false;

        if (!migrate_misplaced_folio(folio, target_nid)) {
                flags |= TNF_MIGRATED;
                nid = target_nid;
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
                return 0;
        }

        /* Migration failed: retake the lock and revalidate the PMD. */
        flags |= TNF_MIGRATE_FAIL;
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
                spin_unlock(vmf->ptl);
                return 0;
        }
out_map:
        /* Restore the PMD */
        pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
        pmd = pmd_mkyoung(pmd);
        if (writable)
                pmd = pmd_mkwrite(pmd, vma);
        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
        spin_unlock(vmf->ptl);

        /*
         * nid is only set once a folio was found, which is also when
         * numa_migrate_check() was given &last_cpupid.
         */
        if (nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags);
        return 0;
}

/*
 * Return true if we do MADV_FREE successfully on entire pmd page.
 * Otherwise, return false.
 *
 * False is returned when the PMD lock cannot be taken, for the huge zero
 * PMD, for non-present entries, for folios that may be mapped shared, when
 * the folio lock cannot be taken without blocking, and when only part of
 * the huge page is covered by [@addr, @next) - in that last case the folio
 * is split first.
 */
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                pmd_t *pmd, unsigned long addr, unsigned long next)
{
        spinlock_t *ptl;
        pmd_t orig_pmd;
        struct folio *folio;
        struct mm_struct *mm = tlb->mm;
        bool ret = false;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                goto out_unlocked;

        orig_pmd = *pmd;
        if (is_huge_zero_pmd(orig_pmd))
                goto out;

        if (unlikely(!pmd_present(orig_pmd))) {
                VM_BUG_ON(thp_migration_supported() &&
                                  !is_pmd_migration_entry(orig_pmd));
                goto out;
        }

        folio = pmd_folio(orig_pmd);
        /*
         * If other processes are mapping this folio, we couldn't discard
         * the folio unless they all do MADV_FREE so let's skip the folio.
         */
        if (folio_maybe_mapped_shared(folio))
                goto out;

        /* Never block on the folio lock while holding the PMD lock. */
        if (!folio_trylock(folio))
                goto out;

        /*
         * If user want to discard part-pages of THP, split it so MADV_FREE
         * will deactivate only them.
         */
        if (next - addr != HPAGE_PMD_SIZE) {
                /* Hold a reference across the split, done without the PMD lock. */
                folio_get(folio);
                spin_unlock(ptl);
                split_folio(folio);
                folio_unlock(folio);
                folio_put(folio);
                goto out_unlocked;
        }

        if (folio_test_dirty(folio))
                folio_clear_dirty(folio);
        folio_unlock(folio);

        /* Rewrite the entry as old and clean if it was young or dirty. */
        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
                pmdp_invalidate(vma, addr, pmd);
                orig_pmd = pmd_mkold(orig_pmd);
                orig_pmd = pmd_mkclean(orig_pmd);

                set_pmd_at(mm, addr, pmd, orig_pmd);
                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        }

        folio_mark_lazyfree(folio);
        ret = true;
out:
        spin_unlock(ptl);
out_unlocked:
        return ret;
}

/*
 * Withdraw the page table deposited for @pmd, free it, and drop the
 * page-table count of @mm accordingly.
 */
static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
{
        pgtable_t deposited = pgtable_trans_huge_withdraw(mm, pmd);

        pte_free(mm, deposited);
        mm_dec_nr_ptes(mm);
}

/*
 * zap_huge_pmd - tear down the huge PMD mapping at @addr
 * @tlb: mmu_gather collecting the TLB flush and the pages to free
 * @vma: VMA the mapping belongs to
 * @pmd: PMD slot to clear
 * @addr: virtual address the PMD maps
 *
 * Clears the entry and, depending on what it mapped, releases the deposited
 * page table, drops the rmap, and adjusts the mm counters.
 *
 * Return: 1 if the PMD was handled here, 0 if the PMD lock could not be
 * taken by __pmd_trans_huge_lock().
 */
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pmd_t *pmd, unsigned long addr)
{
        pmd_t orig_pmd;
        spinlock_t *ptl;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;
        /*
         * For architectures like ppc64 we look at deposited pgtable
         * when calling pmdp_huge_get_and_clear. So do the
         * pgtable_trans_huge_withdraw after finishing pmdp related
         * operations.
         */
        orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
                                                tlb->fullmm);
        arch_check_zapped_pmd(vma, orig_pmd);
        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
                /* Special huge mapping: only a deposited table to release. */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else if (is_huge_zero_pmd(orig_pmd)) {
                if (!vma_is_dax(vma) || arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
        } else {
                struct folio *folio = NULL;
                int flush_needed = 1;

                if (pmd_present(orig_pmd)) {
                        struct page *page = pmd_page(orig_pmd);

                        folio = page_folio(page);
                        folio_remove_rmap_pmd(folio, page, vma);
                        WARN_ON_ONCE(folio_mapcount(folio) < 0);
                        VM_BUG_ON_PAGE(!PageHead(page), page);
                } else if (thp_migration_supported()) {
                        swp_entry_t entry;

                        /* Migration entry: the folio is not queued below. */
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
                        folio = pfn_swap_entry_folio(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");

                /*
                 * NOTE(review): after the WARN_ONCE() branch above folio is
                 * still NULL when it is used below - confirm that branch is
                 * unreachable in practice.
                 */
                if (folio_test_anon(folio)) {
                        zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
                } else {
                        if (arch_needs_pgtable_deposit())
                                zap_deposited_table(tlb->mm, pmd);
                        add_mm_counter(tlb->mm, mm_counter_file(folio),
                                       -HPAGE_PMD_NR);
                }

                spin_unlock(ptl);
                /* Queue the folio on the gather only for a present mapping. */
                if (flush_needed)
                        tlb_remove_page_size(tlb, &folio->page, HPAGE_PMD_SIZE);
        }
        return 1;
}

#ifndef pmd_move_must_withdraw
/*
 * Tell whether moving a huge PMD also requires moving its deposited
 * page table from the old PMD to the new one.
 */
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
                                         spinlock_t *old_pmd_ptl,
                                         struct vm_area_struct *vma)
{
        /*
         * With split pmd lock we also need to move preallocated
         * PTE page table if new_pmd is on different PMD page table.
         */
        if (new_pmd_ptl == old_pmd_ptl)
                return 0;

        /* We also don't deposit and withdraw tables for file pages. */
        return vma_is_anonymous(vma);
}
#endif

/*
 * Return @pmd with its soft-dirty bit set, using the swap-entry variant for
 * migration entries. Without CONFIG_MEM_SOFT_DIRTY, or for an entry that is
 * neither a migration entry nor present, @pmd is returned unchanged.
 */
static pmd_t move_soft_dirty_pmd(pmd_t pmd)
{
#ifdef CONFIG_MEM_SOFT_DIRTY
        if (unlikely(is_pmd_migration_entry(pmd)))
                return pmd_swp_mksoft_dirty(pmd);
        if (pmd_present(pmd))
                return pmd_mksoft_dirty(pmd);
#endif
        return pmd;
}

/*
 * Return @pmd with its uffd-wp bit cleared, picking the present or the
 * swap-entry variant as appropriate; any other entry is returned unchanged.
 */
static pmd_t clear_uffd_wp_pmd(pmd_t pmd)
{
        if (pmd_present(pmd))
                return pmd_clear_uffd_wp(pmd);
        if (is_swap_pmd(pmd))
                return pmd_swp_clear_uffd_wp(pmd);

        return pmd;
}

/*
 * Relocate the huge pmd entry found at @old_pmd (covering @old_addr) to
 * @new_pmd (covering @new_addr) within @vma's mm.
 *
 * Returns true if the entry was moved; false if the destination slot is
 * already populated or the source could not be locked as a huge pmd.
 */
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *old_ptl, *new_ptl;
        bool need_flush;
        pmd_t entry;

        /*
         * The destination pmd shouldn't be established, free_pgtables()
         * should have released it; but move_page_tables() might have already
         * inserted a page table, if racing against shmem/file collapse.
         */
        if (!pmd_none(*new_pmd)) {
                VM_BUG_ON(pmd_trans_huge(*new_pmd));
                return false;
        }

        /*
         * We don't have to worry about the ordering of src and dst
         * ptlocks because exclusive mmap_lock prevents deadlock.
         */
        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
        if (!old_ptl)
                return false;

        new_ptl = pmd_lockptr(mm, new_pmd);
        if (new_ptl != old_ptl)
                spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

        entry = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
        /* The old range is flushed below only if the old entry was present. */
        need_flush = pmd_present(entry);
        VM_BUG_ON(!pmd_none(*new_pmd));

        /* Carry the deposited page table along when required. */
        if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
                pgtable_t pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);

                pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
        }

        entry = move_soft_dirty_pmd(entry);
        if (vma_has_uffd_without_event_remap(vma))
                entry = clear_uffd_wp_pmd(entry);
        set_pmd_at(mm, new_addr, new_pmd, entry);

        if (need_flush)
                flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);

        /* Release in the reverse order of acquisition. */
        if (new_ptl != old_ptl)
                spin_unlock(new_ptl);
        spin_unlock(old_ptl);

        return true;
}

/*
 * change_huge_pmd - update protections and/or uffd-wp state of one huge pmd.
 * @tlb:      mmu_gather that records the page size and any needed TLB flush
 * @vma:      VMA the pmd belongs to
 * @pmd:      pmd entry to update
 * @addr:     virtual address covered by @pmd
 * @newprot:  protection handed to pmd_modify() for present entries
 * @cp_flags: MM_CP_* flags; MM_CP_PROT_NUMA, MM_CP_UFFD_WP,
 *            MM_CP_UFFD_WP_RESOLVE and MM_CP_TRY_CHANGE_WRITABLE are
 *            consulted here
 *
 * Returns
 *  - 0 if PMD could not be locked
 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
 *      or if prot_numa but THP migration is not supported
 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
 *
 * Note that the swap (migration entry) path below also leaves through the
 * "unlock" label with the initial return value of 1, even when it rewrote
 * the entry.
 */
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
        int ret = 1;

        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);

        /* Bail out before taking the lock: nothing is changed in this case. */
        if (prot_numa && !thp_migration_supported())
                return 1;

        /* From here on the pmd lock is held until the "unlock" label. */
        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (is_swap_pmd(*pmd)) {
                /* NOTE: this "entry" shadows the pmd_t "entry" declared above. */
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
                struct folio *folio = pfn_swap_entry_folio(entry);
                pmd_t newpmd;

                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
                if (is_writable_migration_entry(entry)) {
                        /*
                         * A protection check is difficult so
                         * just be safe and disable write
                         */
                        if (folio_test_anon(folio))
                                entry = make_readable_exclusive_migration_entry(swp_offset(entry));
                        else
                                entry = make_readable_migration_entry(swp_offset(entry));
                        newpmd = swp_entry_to_pmd(entry);
                        /* Rebuilding the entry dropped soft-dirty; carry it over. */
                        if (pmd_swp_soft_dirty(*pmd))
                                newpmd = pmd_swp_mksoft_dirty(newpmd);
                } else {
                        newpmd = *pmd;
                }

                /* Apply the requested uffd-wp transition to the swap entry. */
                if (uffd_wp)
                        newpmd = pmd_swp_mkuffd_wp(newpmd);
                else if (uffd_wp_resolve)
                        newpmd = pmd_swp_clear_uffd_wp(newpmd);
                /* Only write the pmd back if something actually changed. */
                if (!pmd_same(*pmd, newpmd))
                        set_pmd_at(mm, addr, pmd, newpmd);
                goto unlock;
        }
#endif

        if (prot_numa) {
                struct folio *folio;
                bool toptier;
                /*
                 * Avoid trapping faults against the zero page. The read-only
                 * data is likely to be read-cached on the local CPU and
                 * local/remote hits to the zero page are not interesting.
                 */
                if (is_huge_zero_pmd(*pmd))
                        goto unlock;

                /* Already marked protnone: nothing more to do. */
                if (pmd_protnone(*pmd))
                        goto unlock;

                folio = pmd_folio(*pmd);
                toptier = node_is_toptier(folio_nid(folio));
                /*
                 * Skip scanning top tier node if normal numa
                 * balancing is disabled
                 */
                if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
                    toptier)
                        goto unlock;

                /* Stamp the folio with the current time in milliseconds. */
                if (folio_use_access_time(folio))
                        folio_xchg_access_time(folio,
                                               jiffies_to_msecs(jiffies));
        }
        /*
         * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
         * which is also under mmap_read_lock(mm):
         *
         *        CPU0:                                CPU1:
         *                                change_huge_pmd(prot_numa=1)
         *                                 pmdp_huge_get_and_clear_notify()
         * madvise_dontneed()
         *  zap_pmd_range()
         *   pmd_trans_huge(*pmd) == 0 (without ptl)
         *   // skip the pmd
         *                                 set_pmd_at();
         *                                 // pmd is re-established
         *
         * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
         * which may break userspace.
         *
         * pmdp_invalidate_ad() is required to make sure we don't miss
         * dirty/young flags set by hardware.
         */
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);

        /* Build the replacement entry from the invalidated one. */
        entry = pmd_modify(oldpmd, newprot);
        if (uffd_wp)
                entry = pmd_mkuffd_wp(entry);
        else if (uffd_wp_resolve)
                /*
                 * Leave the write bit to be handled by PF interrupt
                 * handler, then things like COW could be properly
                 * handled.
                 */
                entry = pmd_clear_uffd_wp(entry);

        /* See change_pte_range(). */
        if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) &&
            can_change_pmd_writable(vma, addr, entry))
                entry = pmd_mkwrite(entry, vma);

        ret = HPAGE_PMD_NR;
        set_pmd_at(mm, addr, pmd, entry);

        /* Queue a TLB flush only when old and new entries require one. */
        if (huge_pmd_needs_flush(oldpmd, entry))
                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
unlock:
        spin_unlock(ptl);
        return ret;
}

/*
 * Apply @newprot to the huge pud at @pudp (covering @addr in @vma).
 *
 * Returns:
 *
 * - 0: if pud leaf changed from under us
 * - 1: if pud can be skipped
 * - HPAGE_PUD_NR: if pud was successfully processed
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pud_t *pudp, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags)
{
        pud_t old_pud, new_pud;
        spinlock_t *ptl;

        tlb_change_page_size(tlb, HPAGE_PUD_SIZE);

        /* NUMA balancing doesn't apply to dax */
        if (cp_flags & MM_CP_PROT_NUMA)
                return 1;

        /*
         * Huge entries on userfault-wp only works with anonymous, while we
         * don't have anonymous PUDs yet.
         */
        if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
                return 1;

        ptl = __pud_trans_huge_lock(pudp, vma);
        if (!ptl)
                return 0;

        /*
         * Can't clear PUD or it can race with concurrent zapping.  See
         * change_huge_pmd().
         */
        old_pud = pudp_invalidate(vma, addr, pudp);
        new_pud = pud_modify(old_pud, newprot);
        set_pud_at(vma->vm_mm, addr, pudp, new_pud);
        tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);

        spin_unlock(ptl);

        return HPAGE_PUD_NR;
}
#endif

#ifdef CONFIG_USERFAULTFD
/*
 * move_pages_huge_pmd - move one huge pmd mapping from @src_addr to @dst_addr.
 * @mm:         mm that owns both pmds
 * @dst_pmd:    destination pmd slot, expected to still equal @dst_pmdval
 * @src_pmd:    source pmd slot
 * @dst_pmdval: value of *@dst_pmd sampled by the caller; must be pmd_none()
 * @dst_vma:    VMA covering @dst_addr (locked by the caller)
 * @src_vma:    VMA covering @src_addr (locked by the caller)
 * @dst_addr:   destination address, must be HPAGE_PMD_SIZE aligned
 * @src_addr:   source address, must be HPAGE_PMD_SIZE aligned
 *
 * The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
 * the caller, but it must return after releasing the page_table_lock.
 * Just move the page from src_pmd to dst_pmd if possible.
 * Return zero if succeeded in moving the page, -EAGAIN if it needs to be
 * repeated by the caller, or other errors in case of failure.
 */
int move_pages_huge_pmd(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
                        struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                        unsigned long dst_addr, unsigned long src_addr)
{
        pmd_t _dst_pmd, src_pmdval;
        struct page *src_page;
        struct folio *src_folio;
        struct anon_vma *src_anon_vma;
        spinlock_t *src_ptl, *dst_ptl;
        pgtable_t src_pgtable;
        struct mmu_notifier_range range;
        int err = 0;

        src_pmdval = *src_pmd;
        src_ptl = pmd_lockptr(mm, src_pmd);

        lockdep_assert_held(src_ptl);
        vma_assert_locked(src_vma);
        vma_assert_locked(dst_vma);

        /* Sanity checks before the operation */
        if (WARN_ON_ONCE(!pmd_none(dst_pmdval)) || WARN_ON_ONCE(src_addr & ~HPAGE_PMD_MASK) ||
            WARN_ON_ONCE(dst_addr & ~HPAGE_PMD_MASK)) {
                spin_unlock(src_ptl);
                return -EINVAL;
        }

        if (!pmd_trans_huge(src_pmdval)) {
                spin_unlock(src_ptl);
                if (is_pmd_migration_entry(src_pmdval)) {
                        /*
                         * Wait on the real page table slot, not on the
                         * on-stack copy: the wait helper takes the pmd lock
                         * through the pointer it is given, so it must point
                         * into the page table.
                         */
                        pmd_migration_entry_wait(mm, src_pmd);
                        return -EAGAIN;
                }
                return -ENOENT;
        }

        src_page = pmd_page(src_pmdval);

        if (!is_huge_zero_pmd(src_pmdval)) {
                if (unlikely(!PageAnonExclusive(src_page))) {
                        spin_unlock(src_ptl);
                        return -EBUSY;
                }

                /* Pin the folio so it stays around once the ptl is dropped. */
                src_folio = page_folio(src_page);
                folio_get(src_folio);
        } else
                src_folio = NULL;

        spin_unlock(src_ptl);

        flush_cache_range(src_vma, src_addr, src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, src_addr,
                                src_addr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        if (src_folio) {
                folio_lock(src_folio);

                /*
                 * split_huge_page walks the anon_vma chain without the page
                 * lock. Serialize against it with the anon_vma lock, the page
                 * lock is not enough.
                 */
                src_anon_vma = folio_get_anon_vma(src_folio);
                if (!src_anon_vma) {
                        err = -EAGAIN;
                        goto unlock_folio;
                }
                anon_vma_lock_write(src_anon_vma);
        } else
                src_anon_vma = NULL;

        dst_ptl = pmd_lockptr(mm, dst_pmd);
        double_pt_lock(src_ptl, dst_ptl);
        /* Both slots must be unchanged since they were sampled. */
        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
                     !pmd_same(*dst_pmd, dst_pmdval))) {
                err = -EAGAIN;
                goto unlock_ptls;
        }
        if (src_folio) {
                if (folio_maybe_dma_pinned(src_folio) ||
                    !PageAnonExclusive(&src_folio->page)) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                if (WARN_ON_ONCE(!folio_test_head(src_folio)) ||
                    WARN_ON_ONCE(!folio_test_anon(src_folio))) {
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                /* Folio got pinned from under us. Put it back and fail the move. */
                if (folio_maybe_dma_pinned(src_folio)) {
                        set_pmd_at(mm, src_addr, src_pmd, src_pmdval);
                        err = -EBUSY;
                        goto unlock_ptls;
                }

                folio_move_anon_rmap(src_folio, dst_vma);
                src_folio->index = linear_page_index(dst_vma, dst_addr);

                _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
                /* Follow mremap() behavior and treat the entry dirty after the move */
                _dst_pmd = pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
        } else {
                /* Huge zero page: no folio to re-home, just remap it. */
                src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
                _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
        }
        set_pmd_at(mm, dst_addr, dst_pmd, _dst_pmd);

        /* The deposited page table follows the mapping to its new slot. */
        src_pgtable = pgtable_trans_huge_withdraw(mm, src_pmd);
        pgtable_trans_huge_deposit(mm, dst_pmd, src_pgtable);
unlock_ptls:
        double_pt_unlock(src_ptl, dst_ptl);
        if (src_anon_vma) {
                anon_vma_unlock_write(src_anon_vma);
                put_anon_vma(src_anon_vma);
        }
unlock_folio:
        /* unblock rmap walks */
        if (src_folio)
                folio_unlock(src_folio);
        mmu_notifier_invalidate_range_end(&range);
        if (src_folio)
                folio_put(src_folio);
        return err;
}
#endif /* CONFIG_USERFAULTFD */

/*
 * Lock @pmd and report whether it is a huge entry (swap pmd, transparent
 * huge pmd or devmap pmd).
 *
 * Returns the page table lock pointer, still held, when it is; the caller
 * is then responsible for unlocking.  Otherwise the lock is dropped and
 * NULL is returned.
 */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
        spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);

        if (unlikely(!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) &&
                     !pmd_devmap(*pmd))) {
                spin_unlock(ptl);
                return NULL;
        }

        return ptl;
}

/*
 * Lock @pud and report whether it is a huge entry (transparent huge pud or
 * devmap pud).
 *
 * Returns the page table lock pointer, still held, when it is; the caller
 * is then responsible for unlocking.  Otherwise the lock is dropped and
 * NULL is returned.
 */
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
        spinlock_t *ptl = pud_lock(vma->vm_mm, pud);

        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud))) {
                spin_unlock(ptl);
                return NULL;
        }

        return ptl;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
/*
 * Tear down the huge pud at @pud (covering @addr in @vma).
 *
 * Returns 0 if the pud could not be locked as a huge pud, 1 once the entry
 * has been cleared.
 */
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 pud_t *pud, unsigned long addr)
{
        struct folio *folio;
        struct page *page;
        spinlock_t *ptl;
        pud_t orig_pud;

        ptl = __pud_trans_huge_lock(pud, vma);
        if (!ptl)
                return 0;

        orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
        arch_check_zapped_pud(vma, orig_pud);
        tlb_remove_pud_tlb_entry(tlb, pud, addr);

        /* Special huge, non-DAX mappings: just drop the lock and finish. */
        if (!vma_is_dax(vma) && vma_is_special_huge(vma)) {
                spin_unlock(ptl);
                /* No zero page support yet */
                return 1;
        }

        /* No support for anonymous PUD pages or migration yet */
        VM_WARN_ON_ONCE(vma_is_anonymous(vma) ||
                        !pud_present(orig_pud));

        page = pud_page(orig_pud);
        folio = page_folio(page);
        folio_remove_rmap_pud(folio, page, vma);
        add_mm_counter(tlb->mm, mm_counter_file(folio), -HPAGE_PUD_NR);

        spin_unlock(ptl);
        /* Hand the page to the mmu_gather only after dropping the lock. */
        tlb_remove_page_size(tlb, page, HPAGE_PUD_SIZE);

        return 1;
}

/*
 * Split (i.e. clear) the huge pud at @pud, which maps @haddr in @vma.
 * @haddr must be HPAGE_PUD_SIZE aligned and the whole huge range must lie
 * inside @vma.  The name indicates the caller already holds the pud lock.
 *
 * For non-DAX VMAs only the entry is cleared and flushed.  For DAX VMAs the
 * dirty/referenced state of the old entry is transferred to the folio, the
 * rmap is removed, the file counter is adjusted and the mapping's folio
 * reference is dropped.
 */
static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
                unsigned long haddr)
{
        struct folio *folio;
        struct page *page;
        pud_t old_pud;

        VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
        VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));

        count_vm_event(THP_SPLIT_PUD);

        old_pud = pudp_huge_clear_flush(vma, haddr, pud);

        if (!vma_is_dax(vma))
                return;

        page = pud_page(old_pud);
        folio = page_folio(page);

        /* Preserve dirty/referenced information recorded in the old entry. */
        if (!folio_test_dirty(folio) && pud_dirty(old_pud))
                folio_mark_dirty(folio);
        if (!folio_test_referenced(folio) && pud_young(old_pud))
                folio_set_referenced(folio);
        folio_remove_rmap_pud(folio, page, vma);
        /*
         * Update the counter before dropping our reference:
         * mm_counter_file() inspects the folio, which must not be touched
         * once the reference is gone.
         */
        add_mm_counter(vma->vm_mm, mm_counter_file(folio),
                -HPAGE_PUD_NR);
        folio_put(folio);
}

/*
 * Split the huge pud covering @address in @vma, if @pud still is one once
 * the pud lock has been taken.  The operation is bracketed by an mmu
 * notifier invalidation of the whole HPAGE_PUD_SIZE aligned range.
 */
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address)
{
        unsigned long haddr = address & HPAGE_PUD_MASK;
        struct mmu_notifier_range range;
        spinlock_t *ptl;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                haddr, haddr + HPAGE_PUD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        ptl = pud_lock(vma->vm_mm, pud);
        /* Re-check under the lock; the entry may have changed meanwhile. */
        if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
                __split_huge_pud_locked(vma, pud, range.start);
        spin_unlock(ptl);

        mmu_notifier_invalidate_range_end(&range);
}
#else
/*
 * Stub used when CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD is not set:
 * there are no huge puds to split, so this is a no-op.
 */
void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address)
{
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */

/*
 * Replace a huge zero page pmd at @pmd (mapping @haddr in @vma) with a page
 * table whose HPAGE_PMD_NR entries are special zero-page ptes.  The uffd-wp
 * marker of the old pmd is propagated to every pte.
 */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
                unsigned long haddr, pmd_t *pmd)
{
        struct mm_struct *mm = vma->vm_mm;
        pmd_t old_pmd, tmp_pmd;
        unsigned long addr = haddr;
        pgtable_t pgtable;
        bool uffd_wp;
        pte_t *pte;
        int i;

        /*
         * Leave pmd empty until pte is filled note that it is fine to delay
         * notification until mmu_notifier_invalidate_range_end() as we are
         * replacing a zero pmd write protected page with a zero pte write
         * protected page.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
        uffd_wp = pmd_uffd_wp(old_pmd);

        /* Build the page table through a private pmd before publishing it. */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &tmp_pmd, pgtable);

        pte = pte_offset_map(&tmp_pmd, haddr);
        VM_BUG_ON(!pte);
        for (i = 0; i < HPAGE_PMD_NR; i++) {
                pte_t entry = pfn_pte(my_zero_pfn(addr), vma->vm_page_prot);

                entry = pte_mkspecial(entry);
                if (uffd_wp)
                        entry = pte_mkuffd_wp(entry);
                VM_BUG_ON(!pte_none(ptep_get(pte)));
                set_pte_at(mm, addr, pte, entry);

                addr += PAGE_SIZE;
                pte++;
        }
        /* pte is one past the last entry here; unmap via the last one. */
        pte_unmap(pte - 1);
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
}

/*
 * Split the huge pmd at @pmd, which maps @haddr in @vma.  The name
 * indicates the caller already holds the pmd lock.
 * @haddr must be HPAGE_PMD_SIZE aligned and the huge range must lie inside
 * @vma; *@pmd must be a migration entry, a transparent huge pmd or a devmap
 * pmd.
 *
 * - Non-anonymous VMAs: the pmd is simply cleared (and accounting fixed up).
 * - Huge zero page: handed to __split_huge_zero_page_pmd().
 * - Anonymous THP / pmd migration entry: the deposited page table is filled
 *   with HPAGE_PMD_NR ptes that carry over the write/young/dirty/soft-dirty/
 *   uffd-wp state, either as migration entries (when @freeze is set or the
 *   pmd already was a migration entry) or as present ptes.
 */
static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long haddr, bool freeze)
{
        struct mm_struct *mm = vma->vm_mm;
        struct folio *folio;
        struct page *page;
        pgtable_t pgtable;
        pmd_t old_pmd, _pmd;
        bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
        bool anon_exclusive = false, dirty = false;
        unsigned long addr;
        pte_t *pte;
        int i;

        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
        VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
                                && !pmd_devmap(*pmd));

        count_vm_event(THP_SPLIT_PMD);

        if (!vma_is_anonymous(vma)) {
                old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd);
                /*
                 * We are going to unmap this huge page. So
                 * just go ahead and zap it
                 */
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(mm, pmd);
                if (!vma_is_dax(vma) && vma_is_special_huge(vma))
                        return;
                if (unlikely(is_pmd_migration_entry(old_pmd))) {
                        swp_entry_t entry;

                        entry = pmd_to_swp_entry(old_pmd);
                        folio = pfn_swap_entry_folio(entry);
                } else if (is_huge_zero_pmd(old_pmd)) {
                        return;
                } else {
                        page = pmd_page(old_pmd);
                        folio = page_folio(page);
                        if (!folio_test_dirty(folio) && pmd_dirty(old_pmd))
                                folio_mark_dirty(folio);
                        if (!folio_test_referenced(folio) && pmd_young(old_pmd))
                                folio_set_referenced(folio);
                        folio_remove_rmap_pmd(folio, page, vma);
                        /*
                         * Update the counter before dropping our reference:
                         * mm_counter_file() inspects the folio, which must
                         * not be touched once the reference is gone.
                         */
                        add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
                        folio_put(folio);
                        return;
                }
                /* Migration entry: no mapping reference to drop here. */
                add_mm_counter(mm, mm_counter_file(folio), -HPAGE_PMD_NR);
                return;
        }

        if (is_huge_zero_pmd(*pmd)) {
                /*
                 * FIXME: Do we want to invalidate secondary mmu by calling
                 * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below
                 * inside __split_huge_pmd() ?
                 *
                 * We are going from a zero huge page write protected to zero
                 * small page also write protected so it does not seems useful
                 * to invalidate secondary mmu at this time.
                 */
                return __split_huge_zero_page_pmd(vma, haddr, pmd);
        }

        pmd_migration = is_pmd_migration_entry(*pmd);
        if (unlikely(pmd_migration)) {
                swp_entry_t entry;

                /* Collect the per-entry state from the migration entry. */
                old_pmd = *pmd;
                entry = pmd_to_swp_entry(old_pmd);
                page = pfn_swap_entry_to_page(entry);
                write = is_writable_migration_entry(entry);
                if (PageAnon(page))
                        anon_exclusive = is_readable_exclusive_migration_entry(entry);
                young = is_migration_entry_young(entry);
                dirty = is_migration_entry_dirty(entry);
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
                uffd_wp = pmd_swp_uffd_wp(old_pmd);
        } else {
                /*
                 * Up to this point the pmd is present and huge and userland has
                 * the whole access to the hugepage during the split (which
                 * happens in place). If we overwrite the pmd with the not-huge
                 * version pointing to the pte here (which of course we could if
                 * all CPUs were bug free), userland could trigger a small page
                 * size TLB miss on the small sized TLB while the hugepage TLB
                 * entry is still established in the huge TLB. Some CPU doesn't
                 * like that. See
                 * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
                 * 383 on page 105. Intel should be safe but is also warns that
                 * it's only safe if the permission and cache attributes of the
                 * two entries loaded in the two TLB is identical (which should
                 * be the case here). But it is generally safer to never allow
                 * small and huge TLB entries for the same virtual address to be
                 * loaded simultaneously. So instead of doing "pmd_populate();
                 * flush_pmd_tlb_range();" we first mark the current pmd
                 * notpresent (atomically because here the pmd_trans_huge must
                 * remain set at all times on the pmd until the split is
                 * complete for this pmd), then we flush the SMP TLB and finally
                 * we write the non-huge version of the pmd entry with
                 * pmd_populate.
                 */
                old_pmd = pmdp_invalidate(vma, haddr, pmd);
                page = pmd_page(old_pmd);
                folio = page_folio(page);
                if (pmd_dirty(old_pmd)) {
                        dirty = true;
                        folio_set_dirty(folio);
                }
                write = pmd_write(old_pmd);
                young = pmd_young(old_pmd);
                soft_dirty = pmd_soft_dirty(old_pmd);
                uffd_wp = pmd_uffd_wp(old_pmd);

                VM_WARN_ON_FOLIO(!folio_ref_count(folio), folio);
                VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

                /*
                 * Without "freeze", we'll simply split the PMD, propagating the
                 * PageAnonExclusive() flag for each PTE by setting it for
                 * each subpage -- no need to (temporarily) clear.
                 *
                 * With "freeze" we want to replace mapped pages by
                 * migration entries right away. This is only possible if we
                 * managed to clear PageAnonExclusive() -- see
                 * set_pmd_migration_entry().
                 *
                 * In case we cannot clear PageAnonExclusive(), split the PMD
                 * only and let try_to_migrate_one() fail later.
                 *
                 * See folio_try_share_anon_rmap_pmd(): invalidate PMD first.
                 */
                anon_exclusive = PageAnonExclusive(page);
                if (freeze && anon_exclusive &&
                    folio_try_share_anon_rmap_pmd(folio, page))
                        freeze = false;
                if (!freeze) {
                        rmap_t rmap_flags = RMAP_NONE;

                        /* One reference per pte mapping about to be created. */
                        folio_ref_add(folio, HPAGE_PMD_NR - 1);
                        if (anon_exclusive)
                                rmap_flags |= RMAP_EXCLUSIVE;
                        folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
                                                 vma, haddr, rmap_flags);
                }
        }

        /*
         * Withdraw the table only after we mark the pmd entry invalid.
         * This's critical for some architectures (Power).
         */
        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
        pmd_populate(mm, &_pmd, pgtable);

        pte = pte_offset_map(&_pmd, haddr);
        VM_BUG_ON(!pte);

        /*
         * Note that NUMA hinting access restrictions are not transferred to
         * avoid any possibility of altering permissions across VMAs.
         */
        if (freeze || pmd_migration) {
                for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
                        pte_t entry;
                        swp_entry_t swp_entry;

                        if (write)
                                swp_entry = make_writable_migration_entry(
                                                        page_to_pfn(page + i));
                        else if (anon_exclusive)
                                swp_entry = make_readable_exclusive_migration_entry(
                                                        page_to_pfn(page + i));
                        else
                                swp_entry = make_readable_migration_entry(
                                                        page_to_pfn(page + i));
                        if (young)
                                swp_entry = make_migration_entry_young(swp_entry);
                        if (dirty)
                                swp_entry = make_migration_entry_dirty(swp_entry);
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
                        if (uffd_wp)
                                entry = pte_swp_mkuffd_wp(entry);

                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));
                        set_pte_at(mm, addr, pte + i, entry);
                }
        } else {
                pte_t entry;

                entry = mk_pte(page, READ_ONCE(vma->vm_page_prot));
                if (write)
                        entry = pte_mkwrite(entry, vma);
                if (!young)
                        entry = pte_mkold(entry);
                /* NOTE: this may set soft-dirty too on some archs */
                if (dirty)
                        entry = pte_mkdirty(entry);
                if (soft_dirty)
                        entry = pte_mksoft_dirty(entry);
                if (uffd_wp)
                        entry = pte_mkuffd_wp(entry);

                for (i = 0; i < HPAGE_PMD_NR; i++)
                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));

                set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR);
        }
        pte_unmap(pte);

        /* The pmd-level mapping is gone unless it was a migration entry. */
        if (!pmd_migration)
                folio_remove_rmap_pmd(folio, page, vma);
        if (freeze)
                put_page(page);

        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
}

/*
 * split_huge_pmd_locked - split a huge pmd if it (still) maps what the
 * caller expects.  The name indicates the caller already holds the pmd lock.
 * @vma:     VMA covering @address
 * @address: HPAGE_PMD_SIZE aligned address mapped by @pmd
 * @pmd:     pmd entry to split
 * @freeze:  replace the mapping by migration entries; requires @folio
 * @folio:   if non-NULL, only split when @pmd maps exactly this (locked)
 *           folio
 *
 * Nothing is done when *@pmd is neither a transparent huge pmd, a devmap
 * pmd nor a pmd migration entry.
 */
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmd, bool freeze, struct folio *folio)
{
        bool pmd_migration = is_pmd_migration_entry(*pmd);

        VM_WARN_ON_ONCE(folio && !folio_test_pmd_mappable(folio));
        VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
        VM_WARN_ON_ONCE(folio && !folio_test_locked(folio));
        VM_BUG_ON(freeze && !folio);

        /*
         * When the caller requests to set up a migration entry, we
         * require a folio to check the PMD against. Otherwise, there
         * is a risk of replacing the wrong folio.
         */
        if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) || pmd_migration) {
                /*
                 * Do not apply pmd_folio() to a migration entry: it is not a
                 * present pmd, so decoding it that way yields a bogus folio
                 * pointer.  A migration entry cannot refer to the caller's
                 * folio anyway, because the caller holds that folio's lock.
                 */
                if (folio && (pmd_migration || folio != pmd_folio(*pmd)))
                        return;
                __split_huge_pmd_locked(vma, pmd, address, freeze);
        }
}

/*
 * Take the pmd lock and split the huge pmd covering @address in @vma.  The
 * split is bracketed by an mmu notifier invalidation of the whole
 * HPAGE_PMD_SIZE aligned range.  @freeze and @folio are passed through to
 * split_huge_pmd_locked().
 */
void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio)
{
        unsigned long haddr = address & HPAGE_PMD_MASK;
        struct mmu_notifier_range range;
        spinlock_t *ptl;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
                                haddr, haddr + HPAGE_PMD_SIZE);
        mmu_notifier_invalidate_range_start(&range);

        ptl = pmd_lock(vma->vm_mm, pmd);
        split_huge_pmd_locked(vma, range.start, pmd, freeze, folio);
        spin_unlock(ptl);

        mmu_notifier_invalidate_range_end(&range);
}

/*
 * Look up the pmd that maps @address in @vma's mm and split it.  Does
 * nothing when no pmd is found.
 */
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct folio *folio)
{
        pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);

        if (pmd)
                __split_huge_pmd(vma, pmd, address, freeze, folio);
}

/*
 * Split the huge pmd around @address when @address would cut through it:
 * i.e. @address is not hpage aligned and the surrounding hpage-sized range
 * lies entirely inside @vma, so it could previously contain a hugepage.
 */
static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned long address)
{
        /* An aligned boundary never cuts through a huge pmd. */
        if (IS_ALIGNED(address, HPAGE_PMD_SIZE))
                return;

        if (!range_in_vma(vma, ALIGN_DOWN(address, HPAGE_PMD_SIZE),
                          ALIGN(address, HPAGE_PMD_SIZE)))
                return;

        split_huge_pmd_address(vma, address, false, NULL);
}

/*
 * Split any huge PMD that would straddle the new boundaries of @vma
 * (@start and @end), and of @next when one is supplied.
 */
void vma_adjust_trans_huge(struct vm_area_struct *vma,
                           unsigned long start,
                           unsigned long end,
                           struct vm_area_struct *next)
{
        /* The new start of @vma is handled first, then its new end. */
        split_huge_pmd_if_needed(vma, start);
        split_huge_pmd_if_needed(vma, end);

        if (!next)
                return;

        /* next->vm_start may be moving up to @end, so check @next as well. */
        split_huge_pmd_if_needed(next, end);
}

/*
 * Remove all mappings of the large @folio ahead of a split.
 *
 * Anonymous folios are replaced by migration entries so that their contents
 * stay reachable; everything else is simply unmapped and can be faulted back
 * in on demand. If that ever changes (perhaps for mlock), remap_page() must
 * be updated to match.
 */
static void unmap_folio(struct folio *folio)
{
        enum ttu_flags ttu_flags;

        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        ttu_flags = TTU_RMAP_LOCKED | TTU_SYNC | TTU_BATCH_FLUSH;
        if (folio_test_pmd_mappable(folio))
                ttu_flags |= TTU_SPLIT_HUGE_PMD;

        if (!folio_test_anon(folio))
                try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
        else
                try_to_migrate(folio, ttu_flags);

        /* TTU_BATCH_FLUSH was requested, so finish the deferred flush now. */
        try_to_unmap_flush();
}

/*
 * Try to discard the PMD mapping at @addr of the anonymous @folio.
 *
 * Returns true when the PMD was cleared and the mapping torn down (rmap
 * removed, deposited page table zapped, MM_ANONPAGES decremented and one
 * folio reference dropped). Returns false, leaving the original PMD value
 * in place, when the folio or PMD turns out to be dirty (unless the VMA is
 * VM_DROPPABLE) or when the folio has references beyond its mappings plus
 * one.
 */
static bool __discard_anon_folio_pmd_locked(struct vm_area_struct *vma,
                                            unsigned long addr, pmd_t *pmdp,
                                            struct folio *folio)
{
        struct mm_struct *mm = vma->vm_mm;
        int ref_count, map_count;
        pmd_t orig_pmd = *pmdp;

        /*
         * Cheap first check before the PMD is touched: a dirty folio in a
         * non-droppable VMA is marked swapbacked again and kept.
         */
        if (pmd_dirty(orig_pmd))
                folio_set_dirty(folio);
        if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
                folio_set_swapbacked(folio);
                return false;
        }

        /* Clear the PMD; the returned value is what gets restored on failure. */
        orig_pmd = pmdp_huge_clear_flush(vma, addr, pmdp);

        /*
         * Syncing against concurrent GUP-fast:
         * - clear PMD; barrier; read refcount
         * - inc refcount; barrier; read PMD
         */
        smp_mb();

        ref_count = folio_ref_count(folio);
        map_count = folio_mapcount(folio);

        /*
         * Order reads for folio refcount and dirty flag
         * (see comments in __remove_mapping()).
         */
        smp_rmb();

        /*
         * If the folio or its PMD is redirtied at this point, or if there
         * are unexpected references, we will give up to discard this folio
         * and remap it.
         *
         * The only folio refs must be one from isolation plus the rmap(s).
         */
        if (pmd_dirty(orig_pmd))
                folio_set_dirty(folio);
        if (folio_test_dirty(folio) && !(vma->vm_flags & VM_DROPPABLE)) {
                folio_set_swapbacked(folio);
                set_pmd_at(mm, addr, pmdp, orig_pmd);
                return false;
        }

        if (ref_count != map_count + 1) {
                set_pmd_at(mm, addr, pmdp, orig_pmd);
                return false;
        }

        /* Point of no return: tear down the mapping and drop its reference. */
        folio_remove_rmap_pmd(folio, pmd_page(orig_pmd), vma);
        zap_deposited_table(mm, pmdp);
        add_mm_counter(mm, MM_ANONPAGES, -HPAGE_PMD_NR);
        if (vma->vm_flags & VM_LOCKED)
                mlock_drain_local();
        folio_put(folio);

        return true;
}

/*
 * Discard the PMD mapping at @addr of @folio.
 *
 * The debug checks below spell out what is expected of the caller: @folio is
 * PMD-mappable, locked, anonymous and not swapbacked, and @addr is aligned
 * to HPAGE_PMD_SIZE.
 *
 * Returns the result of __discard_anon_folio_pmd_locked().
 */
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
                           pmd_t *pmdp, struct folio *folio)
{
        bool discarded;

        VM_WARN_ON_FOLIO(!folio_test_pmd_mappable(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(folio_test_swapbacked(folio), folio);
        VM_WARN_ON_ONCE(!IS_ALIGNED(addr, HPAGE_PMD_SIZE));

        discarded = __discard_anon_folio_pmd_locked(vma, addr, pmdp, folio);

        return discarded;
}

/*
 * Restore the migration entries of the folios that cover @nr pages, starting
 * at @folio and walking forward with folio_next(). @flags is OR-ed with
 * RMP_LOCKED for every remove_migration_ptes() call.
 */
static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
        int done;

        /* If unmap_folio() uses try_to_migrate() on file, remove this check */
        if (!folio_test_anon(folio))
                return;

        remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
        for (done = folio_nr_pages(folio); done < nr;
             done += folio_nr_pages(folio)) {
                folio = folio_next(folio);
                remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
        }
}

/*
 * Queue the after-split @new_folio either on the caller supplied @list or
 * on the LRU next to the original @folio. Must be called with
 * @lruvec->lru_lock held.
 */
static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
                struct lruvec *lruvec, struct list_head *list)
{
        VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
        lockdep_assert_held(&lruvec->lru_lock);

        if (!list) {
                /* head is still on lru (and we have it frozen) */
                VM_WARN_ON(!folio_test_lru(folio));
                if (!folio_test_unevictable(folio))
                        list_add_tail(&new_folio->lru, &folio->lru);
                else
                        new_folio->mlock_count = 0;
                folio_set_lru(new_folio);
                return;
        }

        /* page reclaim is reclaiming a huge page */
        VM_WARN_ON(folio_test_lru(folio));
        folio_get(new_folio);
        list_add_tail(&new_folio->lru, list);
}

/*
 * Racy check whether the huge page can be split.
 *
 * The folio qualifies when its reference count, minus the @caller_pins held
 * by the caller and minus the pins expected from the page cache or swap
 * cache, equals its mapcount. The number of such cache pins is stored in
 * *@pextra_pins when @pextra_pins is not NULL.
 */
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
        int extra_pins = 0;
        int mapcount, refcount;

        /*
         * Additional pins from page cache: every page of a file folio or of
         * an anonymous folio in the swap cache carries one.
         */
        if (!folio_test_anon(folio) || folio_test_swapcache(folio))
                extra_pins = folio_nr_pages(folio);

        if (pextra_pins)
                *pextra_pins = extra_pins;

        mapcount = folio_mapcount(folio);
        refcount = folio_ref_count(folio);

        return mapcount == refcount - extra_pins - caller_pins;
}

/*
 * Split @folio, currently of order @old_order, into folios of order
 * @new_order and copy the @folio metadata to all the resulting folios.
 *
 * Every new head page after the first one receives a selected set of page
 * flags, the mapping, an adjusted index, an adjusted swap entry (for
 * swapcache folios), the young/idle state, the memcg data (CONFIG_MEMCG) and
 * the last cpupid of @folio. Finally @folio itself is either shrunk to
 * @new_order or, for @new_order == 0, stops being a compound page.
 */
static void __split_folio_to_order(struct folio *folio, int old_order,
                int new_order)
{
        long new_nr_pages = 1 << new_order;
        long nr_pages = 1 << old_order;
        long i;

        /*
         * Skip the first new_nr_pages, since the new folio from them have all
         * the flags from the original folio.
         */
        for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
                struct page *new_head = &folio->page + i;

                /*
                 * Careful: new_folio is not a "real" folio before we cleared PageTail.
                 * Don't pass it around before clear_compound_head().
                 */
                struct folio *new_folio = (struct folio *)new_head;

                VM_BUG_ON_PAGE(atomic_read(&new_folio->_mapcount) != -1, new_head);

                /*
                 * Clone page flags before unfreezing refcount.
                 *
                 * After successful get_page_unless_zero() might follow flags change,
                 * for example lock_page() which set PG_waiters.
                 *
                 * Note that for mapped sub-pages of an anonymous THP,
                 * PG_anon_exclusive has been cleared in unmap_folio() and is stored in
                 * the migration entry instead from where remap_page() will restore it.
                 * We can still have PG_anon_exclusive set on effectively unmapped and
                 * unreferenced sub-pages of an anonymous THP: we can simply drop
                 * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
                 */
                new_folio->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
                new_folio->flags |= (folio->flags &
                                ((1L << PG_referenced) |
                                 (1L << PG_swapbacked) |
                                 (1L << PG_swapcache) |
                                 (1L << PG_mlocked) |
                                 (1L << PG_uptodate) |
                                 (1L << PG_active) |
                                 (1L << PG_workingset) |
                                 (1L << PG_locked) |
                                 (1L << PG_unevictable) |
#ifdef CONFIG_ARCH_USES_PG_ARCH_2
                                 (1L << PG_arch_2) |
#endif
#ifdef CONFIG_ARCH_USES_PG_ARCH_3
                                 (1L << PG_arch_3) |
#endif
                                 (1L << PG_dirty) |
                                 LRU_GEN_MASK | LRU_REFS_MASK));

                /* The new folio starts i pages into the original one. */
                new_folio->mapping = folio->mapping;
                new_folio->index = folio->index + i;

                /*
                 * page->private should not be set in tail pages. Fix up and warn once
                 * if private is unexpectedly set.
                 */
                if (unlikely(new_folio->private)) {
                        VM_WARN_ON_ONCE_PAGE(true, new_head);
                        new_folio->private = NULL;
                }

                /* Swap entries advance in step with the page offset. */
                if (folio_test_swapcache(folio))
                        new_folio->swap.val = folio->swap.val + i;

                /* Page flags must be visible before we make the page non-compound. */
                smp_wmb();

                /*
                 * Clear PageTail before unfreezing page refcount.
                 *
                 * After successful get_page_unless_zero() might follow put_page()
                 * which needs correct compound_head().
                 */
                clear_compound_head(new_head);
                if (new_order) {
                        /* The new head becomes a compound page of @new_order. */
                        prep_compound_page(new_head, new_order);
                        folio_set_large_rmappable(new_folio);
                }

                if (folio_test_young(folio))
                        folio_set_young(new_folio);
                if (folio_test_idle(folio))
                        folio_set_idle(new_folio);
#ifdef CONFIG_MEMCG
                new_folio->memcg_data = folio->memcg_data;
#endif

                folio_xchg_last_cpupid(new_folio, folio_last_cpupid(folio));
        }

        /*
         * Finally shrink the original head: it keeps its pages
         * [0, new_nr_pages) either as an order-@new_order folio or as a
         * plain, non-compound page.
         */
        if (new_order)
                folio_set_order(folio, new_order);
        else
                ClearPageCompound(&folio->page);
}

/*
 * It splits an unmapped @folio to lower order smaller folios in two ways.
 * @folio: the to-be-split folio
 * @new_order: the smallest order of the after split folios (since buddy
 *             allocator like split generates folios with orders from @folio's
 *             order - 1 to new_order).
 * @split_at: in buddy allocator like split, the folio containing @split_at
 *            will be split until its order becomes @new_order.
 * @lock_at: the folio containing @lock_at is left locked for caller.
 * @list: the after split folios will be added to @list if it is not NULL,
 *        otherwise to LRU lists.
 * @end: the end of the file @folio maps to. -1 if @folio is anonymous memory.
 * @xas: xa_state pointing to folio->mapping->i_pages and locked by caller
 * @mapping: @folio->mapping
 * @uniform_split: if the split is uniform or not (buddy allocator like split)
 *
 *
 * 1. uniform split: the given @folio into multiple @new_order small folios,
 *    where all small folios have the same order. This is done when
 *    uniform_split is true.
 * 2. buddy allocator like (non-uniform) split: the given @folio is split into
 *    half and one of the half (containing @split_at) is split into half
 *    until the order of the folio containing @split_at becomes @new_order.
 *    This is done when uniform_split is false.
 *
 * The high level flow for these two methods are:
 * 1. uniform split: a single __split_folio_to_order() is called to split the
 *    @folio into @new_order, then we traverse all the resulting folios one by
 *    one in PFN ascending order and perform stats, unfreeze, adding to list,
 *    and file mapping index operations.
 * 2. non-uniform split: in general, folio_order - @new_order calls to
 *    __split_folio_to_order() are made in a for loop to split the @folio
 *    to one lower order at a time. The resulting small folios are processed
 *    like what is done during the traversal in 1, except the one containing
 *    @split_at, which is split in next for loop.
 *
 * After splitting, every after-split folio except the one containing
 * @lock_at is unlocked and passed to free_page_and_swap_cache(), so those
 * folios may be freed if they are not mapped. The folio containing @lock_at
 * is left locked for the caller.
 *
 * The caller is expected to have disabled interrupts: they are re-enabled
 * here with local_irq_enable(), after @mapping->i_pages (when @mapping is
 * set) has been unlocked.
 *
 * For !uniform_split, when -ENOMEM is returned, the original folio might be
 * split. The caller needs to check the input folio.
 */
static int __split_unmapped_folio(struct folio *folio, int new_order,
                struct page *split_at, struct page *lock_at,
                struct list_head *list, pgoff_t end,
                struct xa_state *xas, struct address_space *mapping,
                bool uniform_split)
{
        struct lruvec *lruvec;
        struct address_space *swap_cache = NULL;
        struct folio *origin_folio = folio;
        struct folio *next_folio = folio_next(folio);
        struct folio *new_folio;
        struct folio *next;
        int order = folio_order(folio);
        int split_order;
        int start_order = uniform_split ? new_order : order - 1;
        int nr_dropped = 0;
        int ret = 0;
        bool stop_split = false;

        if (folio_test_swapcache(folio)) {
                VM_BUG_ON(mapping);

                /*
                 * a swapcache folio can only be uniformly split to order-0
                 *
                 * NOTE(review): this return happens without unfreezing
                 * @folio or re-enabling interrupts, unlike every other exit
                 * of this function; callers should reject this combination
                 * before freezing the folio.
                 */
                if (!uniform_split || new_order != 0)
                        return -EINVAL;

                swap_cache = swap_address_space(folio->swap);
                xa_lock(&swap_cache->i_pages);
        }

        /* The original large anon folio goes away; new ones are counted below. */
        if (folio_test_anon(folio))
                mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);

        /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
        lruvec = folio_lruvec_lock(folio);

        folio_clear_has_hwpoisoned(folio);

        /*
         * split to new_order one order at a time. For uniform split,
         * folio is split to new_order directly.
         */
        for (split_order = start_order;
             split_order >= new_order && !stop_split;
             split_order--) {
                int old_order = folio_order(folio);
                struct folio *release;
                struct folio *end_folio = folio_next(folio);

                /* order-1 anonymous folio is not supported */
                if (folio_test_anon(folio) && split_order == 1)
                        continue;
                if (uniform_split && split_order != new_order)
                        continue;

                if (mapping) {
                        /*
                         * uniform split has xas_split_alloc() called before
                         * irq is disabled to allocate enough memory, whereas
                         * non-uniform split can handle ENOMEM.
                         */
                        if (uniform_split)
                                xas_split(xas, folio, old_order);
                        else {
                                xas_set_order(xas, folio->index, split_order);
                                xas_try_split(xas, folio, old_order);
                                if (xas_error(xas)) {
                                        /*
                                         * Give up on further splitting, but
                                         * still run the release loop below
                                         * for the folio as it is now.
                                         */
                                        ret = xas_error(xas);
                                        stop_split = true;
                                        goto after_split;
                                }
                        }
                }

                folio_split_memcg_refs(folio, old_order, split_order);
                split_page_owner(&folio->page, old_order, split_order);
                pgalloc_tag_split(folio, old_order, split_order);

                __split_folio_to_order(folio, old_order, split_order);

after_split:
                /*
                 * Iterate through after-split folios and perform related
                 * operations. But in buddy allocator like split, the folio
                 * containing the specified page is skipped until its order
                 * is new_order, since the folio will be worked on in next
                 * iteration.
                 */
                for (release = folio; release != end_folio; release = next) {
                        next = folio_next(release);
                        /*
                         * for buddy allocator like split, the folio containing
                         * @split_at will be split next and should not be
                         * released, until the folio's order is new_order or
                         * stop_split is set to true by the above
                         * xas_try_split() failure.
                         */
                        if (release == page_folio(split_at)) {
                                folio = release;
                                if (split_order != new_order && !stop_split)
                                        continue;
                        }
                        if (folio_test_anon(release)) {
                                mod_mthp_stat(folio_order(release),
                                                MTHP_STAT_NR_ANON, 1);
                        }

                        /*
                         * origin_folio should be kept frozen until page cache
                         * entries are updated with all the other after-split
                         * folios to prevent others seeing stale page cache
                         * entries.
                         */
                        if (release == origin_folio)
                                continue;

                        /*
                         * One base reference, plus one per page when the
                         * folio sits in the page cache or swap cache.
                         */
                        folio_ref_unfreeze(release, 1 +
                                        ((mapping || swap_cache) ?
                                                folio_nr_pages(release) : 0));

                        lru_add_split_folio(origin_folio, release, lruvec,
                                        list);

                        /* Some pages can be beyond EOF: drop them from cache */
                        if (release->index >= end) {
                                if (shmem_mapping(mapping))
                                        nr_dropped += folio_nr_pages(release);
                                else if (folio_test_clear_dirty(release))
                                        folio_account_cleaned(release,
                                                inode_to_wb(mapping->host));
                                __filemap_remove_folio(release, NULL);
                                folio_put_refs(release, folio_nr_pages(release));
                        } else if (mapping) {
                                __xa_store(&mapping->i_pages,
                                                release->index, release, 0);
                        } else if (swap_cache) {
                                __xa_store(&swap_cache->i_pages,
                                                swap_cache_index(release->swap),
                                                release, 0);
                        }
                }
        }

        /*
         * Unfreeze origin_folio only after all page cache entries, which used
         * to point to it, have been updated with new folios. Otherwise,
         * a parallel folio_try_get() can grab origin_folio and its caller can
         * see stale page cache entries.
         */
        folio_ref_unfreeze(origin_folio, 1 +
                ((mapping || swap_cache) ? folio_nr_pages(origin_folio) : 0));

        unlock_page_lruvec(lruvec);

        if (swap_cache)
                xa_unlock(&swap_cache->i_pages);
        if (mapping)
                xa_unlock(&mapping->i_pages);

        /* Caller disabled irqs, so they are still disabled here */
        local_irq_enable();

        if (nr_dropped)
                shmem_uncharge(mapping->host, nr_dropped);

        /* Walk the whole original range (1 << order pages) to remap it. */
        remap_page(origin_folio, 1 << order,
                        folio_test_anon(origin_folio) ?
                                RMP_USE_SHARED_ZEROPAGE : 0);

        /*
         * Unlock and release every after-split folio in the original range,
         * except the one containing @lock_at, which is left locked for the
         * caller to unlock.
         */
        for (new_folio = origin_folio; new_folio != next_folio; new_folio = next) {
                next = folio_next(new_folio);
                if (new_folio == page_folio(lock_at))
                        continue;

                folio_unlock(new_folio);
                /*
                 * Subpages may be freed if there wasn't any mapping
                 * like if add_to_swap() is running on a lru page that
                 * had its mapping zapped. And freeing these pages
                 * requires taking the lru_lock so we do the put_page
                 * of the tail pages after the split is complete.
                 */
                free_page_and_swap_cache(&new_folio->page);
        }
        return ret;
}

/*
 * non_uniform_split_supported: can @folio be split non-uniformly?
 * @folio: the folio to check
 * @new_order: the requested order of the smallest after-split folio
 * @warns: emit a one-time warning describing why the split is refused
 *
 * Return: true if a buddy allocator like (non-uniform) split of @folio down
 * to @new_order may be attempted, false otherwise.
 */
bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
                bool warns)
{
        if (folio_test_anon(folio)) {
                /* order-1 is not supported for anonymous THP. */
                VM_WARN_ONCE(warns && new_order == 1,
                                "Cannot split to order-1 folio");
                if (new_order == 1)
                        return false;
                /*
                 * Do not return true yet: an anonymous folio can sit in the
                 * swap cache, which must be checked below.
                 */
        } else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
            !mapping_large_folio_support(folio->mapping)) {
                /*
                 * No split if the file system does not support large folio.
                 * Note that we might still have THPs in such mappings due to
                 * CONFIG_READ_ONLY_THP_FOR_FS. But in that case, the mapping
                 * does not actually support large folios properly.
                 */
                VM_WARN_ONCE(warns,
                        "Cannot split file folio to non-0 order");
                return false;
        }

        /*
         * Only swapping a whole PMD-mapped folio is supported. A non-uniform
         * split always produces folios of intermediate orders, so a folio in
         * the swap cache (anonymous or not) can never be split this way.
         */
        if (folio_test_swapcache(folio)) {
                VM_WARN_ONCE(warns,
                        "Cannot split swapcache folio to non-0 order");
                return false;
        }

        return true;
}

/*
 * uniform_split_supported: can @folio be split uniformly to @new_order?
 * @folio: the folio to check
 * @new_order: the requested order of all after-split folios
 * @warns: emit a one-time warning describing why the split is refused
 *
 * See comments in non_uniform_split_supported().
 *
 * Return: true if a uniform split of @folio to @new_order may be attempted,
 * false otherwise. Splitting to order 0 is always reported as supported.
 */
bool uniform_split_supported(struct folio *folio, unsigned int new_order,
                bool warns)
{
        if (folio_test_anon(folio)) {
                /* order-1 is not supported for anonymous THP. */
                VM_WARN_ONCE(warns && new_order == 1,
                                "Cannot split to order-1 folio");
                if (new_order == 1)
                        return false;
                /*
                 * Do not return true yet: an anonymous folio can sit in the
                 * swap cache, which must be checked below.
                 */
        } else if (new_order) {
                if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
                    !mapping_large_folio_support(folio->mapping)) {
                        VM_WARN_ONCE(warns,
                                "Cannot split file folio to non-0 order");
                        return false;
                }
        }

        /* A swapcache folio (anonymous or not) can only go to order 0. */
        if (new_order && folio_test_swapcache(folio)) {
                VM_WARN_ONCE(warns,
                        "Cannot split swapcache folio to non-0 order");
                return false;
        }

        return true;
}

/*
 * __folio_split: split a folio at @split_at to a @new_order folio
 * @folio: folio to split
 * @new_order: the order of the new folio
 * @split_at: a page within the new folio
 * @lock_at: a page within @folio to be left locked to caller
 * @list: after-split folios will be put on it if non NULL
 * @uniform_split: perform uniform split or not (non-uniform split)
 *
 * It calls __split_unmapped_folio() to perform uniform and non-uniform split.
 * It is in charge of checking whether the split is supported or not and
 * preparing @folio for __split_unmapped_folio().
 *
 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
 * split but not to @new_order, the caller needs to check).
 * -EINVAL: @split_at/@lock_at are not in @folio, or @new_order is not a
 *          supported target order for @folio.
 * -EBUSY:  huge zero folio, folio without a mapping (e.g. truncated), folio
 *          under writeback, anon_vma gone, or filemap_release_folio() failed.
 * -EAGAIN: unexpected references, or the folio is no longer in the page
 *          cache.
 */
static int __folio_split(struct folio *folio, unsigned int new_order,
                struct page *split_at, struct page *lock_at,
                struct list_head *list, bool uniform_split)
{
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
        XA_STATE(xas, &folio->mapping->i_pages, folio->index);
        bool is_anon = folio_test_anon(folio);
        struct address_space *mapping = NULL;
        struct anon_vma *anon_vma = NULL;
        int order = folio_order(folio);
        int extra_pins, ret;
        pgoff_t end;
        bool is_hzp;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);

        if (folio != page_folio(split_at) || folio != page_folio(lock_at))
                return -EINVAL;

        if (new_order >= folio_order(folio))
                return -EINVAL;

        /*
         * Reject the huge zero folio before anything looks at
         * folio->mapping, so that it still gets its dedicated warning
         * instead of being caught by the NULL-mapping check below.
         */
        is_hzp = is_huge_zero_folio(folio);
        if (is_hzp) {
                pr_warn_ratelimited("Called split_huge_page for huge zero page\n");
                return -EBUSY;
        }

        /*
         * Folios that just got truncated have no mapping and cannot get
         * split. This must be checked before the *_split_supported()
         * helpers, which may dereference folio->mapping.
         *
         * TODO: this also refuses shmem folios that are in the swap cache,
         * for which mapping is NULL and folio_test_swapcache() is true.
         */
        if (!is_anon && !folio->mapping) {
                ret = -EBUSY;
                goto out;
        }

        if (uniform_split && !uniform_split_supported(folio, new_order, true))
                return -EINVAL;

        if (!uniform_split &&
            !non_uniform_split_supported(folio, new_order, true))
                return -EINVAL;

        if (folio_test_writeback(folio))
                return -EBUSY;

        if (is_anon) {
                /*
                 * The caller does not necessarily hold an mmap_lock that would
                 * prevent the anon_vma disappearing so we first we take a
                 * reference to it and then lock the anon_vma for write. This
                 * is similar to folio_lock_anon_vma_read except the write lock
                 * is taken to serialise against parallel split or collapse
                 * operations.
                 */
                anon_vma = folio_get_anon_vma(folio);
                if (!anon_vma) {
                        ret = -EBUSY;
                        goto out;
                }
                end = -1;
                mapping = NULL;
                anon_vma_lock_write(anon_vma);
        } else {
                unsigned int min_order;
                gfp_t gfp;

                mapping = folio->mapping;

                /* Truncated ? */
                /*
                 * TODO: add support for large shmem folio in swap cache.
                 * When shmem is in swap cache, mapping is NULL and
                 * folio_test_swapcache() is true.
                 */
                if (!mapping) {
                        ret = -EBUSY;
                        goto out;
                }

                min_order = mapping_min_folio_order(folio->mapping);
                if (new_order < min_order) {
                        VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
                                     min_order);
                        ret = -EINVAL;
                        goto out;
                }

                gfp = current_gfp_context(mapping_gfp_mask(mapping) &
                                                        GFP_RECLAIM_MASK);

                if (!filemap_release_folio(folio, gfp)) {
                        ret = -EBUSY;
                        goto out;
                }

                /*
                 * Uniform split preallocates the xarray nodes now, while
                 * sleeping is still allowed; non-uniform split allocates
                 * on the fly and copes with -ENOMEM.
                 */
                if (uniform_split) {
                        xas_set_order(&xas, folio->index, new_order);
                        xas_split_alloc(&xas, folio, folio_order(folio), gfp);
                        if (xas_error(&xas)) {
                                ret = xas_error(&xas);
                                goto out;
                        }
                }

                anon_vma = NULL;
                i_mmap_lock_read(mapping);

                /*
                 *__split_unmapped_folio() may need to trim off pages beyond
                 * EOF: but on 32-bit, i_size_read() takes an irq-unsafe
                 * seqlock, which cannot be nested inside the page tree lock.
                 * So note end now: i_size itself may be changed at any moment,
                 * but folio lock is good enough to serialize the trimming.
                 */
                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (shmem_mapping(mapping))
                        end = shmem_fallocend(mapping->host, end);
        }

        /*
         * Racy check if we can split the page, before unmap_folio() will
         * split PMDs
         */
        if (!can_split_folio(folio, 1, &extra_pins)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        unmap_folio(folio);

        /* block interrupt reentry in xa_lock and spinlock */
        local_irq_disable();
        if (mapping) {
                /*
                 * Check if the folio is present in page cache.
                 * We assume all tail are present too, if folio is there.
                 */
                xas_lock(&xas);
                xas_reset(&xas);
                if (xas_load(&xas) != folio)
                        goto fail;
        }

        /* Prevent deferred_split_scan() touching ->_refcount */
        spin_lock(&ds_queue->split_queue_lock);
        if (folio_ref_freeze(folio, 1 + extra_pins)) {
                if (folio_order(folio) > 1 &&
                    !list_empty(&folio->_deferred_list)) {
                        ds_queue->split_queue_len--;
                        if (folio_test_partially_mapped(folio)) {
                                folio_clear_partially_mapped(folio);
                                mod_mthp_stat(folio_order(folio),
                                              MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                        }
                        /*
                         * Reinitialize page_deferred_list after removing the
                         * page from the split_queue, otherwise a subsequent
                         * split will see list corruption when checking the
                         * page_deferred_list.
                         */
                        list_del_init(&folio->_deferred_list);
                }
                spin_unlock(&ds_queue->split_queue_lock);
                if (mapping) {
                        int nr = folio_nr_pages(folio);

                        /* The folio stops being a PMD-sized THP: fix stats. */
                        if (folio_test_pmd_mappable(folio) &&
                            new_order < HPAGE_PMD_ORDER) {
                                if (folio_test_swapbacked(folio)) {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_SHMEM_THPS, -nr);
                                } else {
                                        __lruvec_stat_mod_folio(folio,
                                                        NR_FILE_THPS, -nr);
                                        filemap_nr_thps_dec(mapping);
                                }
                        }
                }

                /*
                 * __split_unmapped_folio() unlocks the xarray and re-enables
                 * interrupts before it returns.
                 */
                ret = __split_unmapped_folio(folio, new_order,
                                split_at, lock_at, list, end, &xas, mapping,
                                uniform_split);
        } else {
                spin_unlock(&ds_queue->split_queue_lock);
fail:
                if (mapping)
                        xas_unlock(&xas);
                local_irq_enable();
                /* Undo unmap_folio(): put the migration entries back. */
                remap_page(folio, folio_nr_pages(folio), 0);
                ret = -EAGAIN;
        }

out_unlock:
        if (anon_vma) {
                anon_vma_unlock_write(anon_vma);
                put_anon_vma(anon_vma);
        }
        if (mapping)
                i_mmap_unlock_read(mapping);
out:
        xas_destroy(&xas);
        if (order == HPAGE_PMD_ORDER)
                count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
        count_mthp_stat(order, !ret ? MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED);
        return ret;
}

/*
 * This function splits a large folio into smaller folios of order @new_order.
 * @page can point to any page of the large folio to split. The split operation
 * does not change the position of @page.
 *
 * Prerequisites:
 *
 * 1) The caller must hold a reference on the @page's owning folio, also known
 *    as the large folio.
 *
 * 2) The large folio must be locked.
 *
 * 3) The folio must not be pinned. Any unexpected folio references, including
 *    GUP pins, will result in the folio not getting split; instead, the caller
 *    will receive an -EAGAIN.
 *
 * 4) @new_order > 1, usually. Splitting to order-1 anonymous folios is not
 *    supported for non-file-backed folios, because folio->_deferred_list, which
 *    is used by partially mapped folios, is stored in subpage 2, but an order-1
 *    folio only has subpages 0 and 1. File-backed order-1 folios are supported,
 *    since they do not use _deferred_list.
 *
 * After splitting, the caller's folio reference will be transferred to @page,
 * resulting in a raised refcount of @page after this call. The other pages may
 * be freed if they are not mapped.
 *
 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
 *
 * Pages in @new_order will inherit the mapping, flags, and so on from the
 * huge page.
 *
 * Returns 0 if the huge page was split successfully.
 *
 * Returns -EAGAIN if the folio has unexpected reference (e.g., GUP) or if
 * the folio was concurrently removed from the page cache.
 *
 * Returns -EBUSY when trying to split the huge zeropage, if the folio is
 * under writeback, if fs-specific folio metadata cannot currently be
 * released, or if some unexpected race happened (e.g., anon VMA disappeared,
 * truncation).
 *
 * Callers should ensure that the order respects the address space mapping
 * min-order if one is set for non-anonymous folios.
 *
 * Returns -EINVAL when trying to split to an order that is incompatible
 * with the folio. Splitting to order 0 is compatible with all folios.
 */
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                                     unsigned int new_order)
{
        /* @page may be any page of the large folio; look up its owner. */
        struct folio *large = page_folio(page);
        struct page *first_page = &large->page;

        /*
         * Hand off to __folio_split() with the folio's first page and @page
         * as the two page arguments and the final flag set to true
         * (folio_split() passes false there).
         */
        return __folio_split(large, new_order, first_page, page, list, true);
}

/*
 * folio_split: split a folio at @split_at to a @new_order folio
 * @folio: folio to split
 * @new_order: the order of the new folio
 * @split_at: a page within the new folio
 *
 * return: 0: successful, <0 failed (if -ENOMEM is returned, @folio might be
 * split but not to @new_order, the caller needs to check)
 *
 * It has the same prerequisites and returns as
 * split_huge_page_to_list_to_order().
 *
 * Split a folio at @split_at to a new_order folio, leave the
 * remaining subpages of the original folio as large as possible. For example,
 * in the case of splitting an order-9 folio at its third order-3 subpage to
 * an order-3 folio, there are 2^(9-3)=64 order-3 subpages in the order-9 folio.
 * After the split, there will be a group of folios with different orders and
 * the new folio containing @split_at is marked in bracket:
 * [order-4, {order-3}, order-3, order-5, order-6, order-7, order-8].
 *
 * After split, folio is left locked for caller.
 */
int folio_split(struct folio *folio, unsigned int new_order,
                struct page *split_at, struct list_head *list)
{
        /* The folio's own first page is passed as the second page argument. */
        struct page *first_page = &folio->page;

        /* Final flag is false here; the split_huge_page_* wrapper passes true. */
        return __folio_split(folio, new_order, split_at, first_page, list,
                             false);
}

/*
 * Return the smallest order @folio may be split to, or a negative errno.
 *
 * Anonymous folios yield 0. Folios with a mapping yield that mapping's
 * minimum folio order. A non-anonymous folio without a mapping yields -EBUSY
 * (and bumps THP_SPLIT_PAGE_FAILED when the folio is PMD-mappable).
 */
int min_order_for_split(struct folio *folio)
{
        if (folio_test_anon(folio))
                return 0;

        if (folio->mapping)
                return mapping_min_folio_order(folio->mapping);

        /* No mapping to consult: account the failure and report busy. */
        if (folio_test_pmd_mappable(folio))
                count_vm_event(THP_SPLIT_PAGE_FAILED);

        return -EBUSY;
}

/*
 * Split @folio down to the order reported by min_order_for_split(),
 * forwarding @list unchanged. A negative result from min_order_for_split()
 * is returned as-is without attempting the split.
 */
int split_folio_to_list(struct folio *folio, struct list_head *list)
{
        int order = min_order_for_split(folio);

        return order < 0 ? order :
               split_huge_page_to_list_to_order(&folio->page, list, order);
}

/*
 * __folio_unqueue_deferred_split() is not to be called directly:
 * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
 * limits its calls to those folios which may have a _deferred_list for
 * queueing THP splits, and that list is (racily observed to be) non-empty.
 *
 * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
 * zero: because even when split_queue_lock is held, a non-empty _deferred_list
 * might be in use on deferred_split_scan()'s unlocked on-stack list.
 *
 * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
 * therefore important to unqueue deferred split before changing folio memcg.
 */
bool __folio_unqueue_deferred_split(struct folio *folio)
{
        struct deferred_split *queue;
        unsigned long irqflags;
        bool was_queued;

        /* Sanity checks: refcount must be zero, memcg must still be set. */
        WARN_ON_ONCE(folio_ref_count(folio));
        WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));

        queue = get_deferred_split_queue(folio);

        spin_lock_irqsave(&queue->split_queue_lock, irqflags);
        /* Re-check under the lock whether the folio is actually queued. */
        was_queued = !list_empty(&folio->_deferred_list);
        if (was_queued) {
                queue->split_queue_len--;
                /* Drop the partially-mapped flag and its per-order counter. */
                if (folio_test_partially_mapped(folio)) {
                        folio_clear_partially_mapped(folio);
                        mod_mthp_stat(folio_order(folio),
                                      MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                }
                list_del_init(&folio->_deferred_list);
        }
        spin_unlock_irqrestore(&queue->split_queue_lock, irqflags);

        /* Tells the caller whether anything was removed (debug warnings). */
        return was_queued;
}

/*
 * deferred_split_folio - put @folio on its deferred split queue.
 * @folio: folio to queue.
 * @partially_mapped: true if the caller considers the folio partially mapped.
 *
 * partially_mapped=false won't clear PG_partially_mapped folio flag.
 *
 * Nothing is done for folios of order <= 1, for swapcache folios, or when
 * @partially_mapped is false and split_underused_thp is not set.
 */
void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
        struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
        struct mem_cgroup *memcg = folio_memcg(folio);
#endif
        unsigned long flags;

        /*
         * Order 1 folios have no space for a deferred list, but we also
         * won't waste much memory by not adding them to the deferred list.
         */
        if (folio_order(folio) <= 1)
                return;

        /* Not partially mapped: only queue when underused splitting is on. */
        if (!partially_mapped && !split_underused_thp)
                return;

        /*
         * Exclude swapcache: originally to avoid a corrupt deferred split
         * queue. Nowadays that is fully prevented by memcg1_swapout();
         * but if page reclaim is already handling the same folio, it is
         * unnecessary to handle it again in the shrinker, so excluding
         * swapcache here may still be a useful optimization.
         */
        if (folio_test_swapcache(folio))
                return;

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        if (partially_mapped) {
                /* Flag and account the folio only on the first transition. */
                if (!folio_test_partially_mapped(folio)) {
                        folio_set_partially_mapped(folio);
                        if (folio_test_pmd_mappable(folio))
                                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
                        count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
                        mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);

                }
        } else {
                /* partially mapped folios cannot become non-partially mapped */
                VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
        }
        /* Queue the folio only if it is not already on a deferred list. */
        if (list_empty(&folio->_deferred_list)) {
                list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
                ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
                /* Mark the shrinker bit for this memcg on the folio's node. */
                if (memcg)
                        set_shrinker_bit(memcg, folio_nid(folio),
                                         deferred_split_shrinker->id);
#endif
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}

/*
 * Report the current length of the deferred split queue selected by @sc:
 * the memcg's queue when CONFIG_MEMCG is enabled and sc->memcg is set,
 * otherwise the queue of node sc->nid. The length is sampled with
 * READ_ONCE() and no lock is taken.
 */
static unsigned long deferred_split_count(struct shrinker *shrink,
                struct shrink_control *sc)
{
        struct deferred_split *ds_queue;

#ifdef CONFIG_MEMCG
        if (sc->memcg)
                return READ_ONCE(sc->memcg->deferred_split_queue.split_queue_len);
#endif
        ds_queue = &NODE_DATA(sc->nid)->deferred_split_queue;

        return READ_ONCE(ds_queue->split_queue_len);
}

/*
 * Decide whether @folio counts as underused by scanning its pages for
 * all-zero content.
 *
 * Returns true as soon as the number of zero-filled pages exceeds
 * khugepaged_max_ptes_none. Returns false when the check is disabled
 * (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1), when enough non-zero pages
 * have been seen that the threshold cannot matter any more, or when the scan
 * finishes without exceeding the threshold.
 */
static bool thp_underused(struct folio *folio)
{
        int zero_pages = 0, filled_pages = 0;
        int idx;

        if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
                return false;

        for (idx = 0; idx < folio_nr_pages(folio); idx++) {
                void *vaddr = kmap_local_folio(folio, idx * PAGE_SIZE);
                bool all_zero = !memchr_inv(vaddr, 0, PAGE_SIZE);

                /* The mapping is only needed for the content check above. */
                kunmap_local(vaddr);

                if (all_zero) {
                        if (++zero_pages > khugepaged_max_ptes_none)
                                return true;
                } else if (++filled_pages >=
                           HPAGE_PMD_NR - khugepaged_max_ptes_none) {
                        /* Early exit: too many non-zero pages already. */
                        return false;
                }
        }

        return false;
}

/*
 * Shrinker scan callback for the deferred split queue.
 *
 * Moves folios from the selected queue (memcg queue if sc->memcg is set under
 * CONFIG_MEMCG, else the queue of node sc->nid) onto a private list, taking
 * a reference on each, until sc->nr_to_scan reaches zero or the queue is
 * exhausted. Each moved folio that is partially mapped, or that
 * thp_underused() reports as underused, is then trylocked and split with
 * split_folio(). Whatever remains on the private list is spliced back to the
 * tail of the queue.
 *
 * Returns the number of folios split, or SHRINK_STOP if nothing was split
 * and the queue is empty.
 */
static unsigned long deferred_split_scan(struct shrinker *shrink,
                struct shrink_control *sc)
{
        struct pglist_data *pgdata = NODE_DATA(sc->nid);
        struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
        unsigned long flags;
        LIST_HEAD(list);
        struct folio *folio, *next, *prev = NULL;
        int split = 0, removed = 0;

#ifdef CONFIG_MEMCG
        if (sc->memcg)
                ds_queue = &sc->memcg->deferred_split_queue;
#endif

        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
        list_for_each_entry_safe(folio, next, &ds_queue->split_queue,
                                                        _deferred_list) {
                if (folio_try_get(folio)) {
                        list_move(&folio->_deferred_list, &list);
                } else {
                        /* We lost race with folio_put() */
                        if (folio_test_partially_mapped(folio)) {
                                folio_clear_partially_mapped(folio);
                                mod_mthp_stat(folio_order(folio),
                                              MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
                        }
                        list_del_init(&folio->_deferred_list);
                        ds_queue->split_queue_len--;
                }
                /* Every visited folio consumes one unit of the scan budget. */
                if (!--sc->nr_to_scan)
                        break;
        }
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

        /* Second pass, without the queue lock: try to split what we pinned. */
        list_for_each_entry_safe(folio, next, &list, _deferred_list) {
                bool did_split = false;
                bool underused = false;

                /* Folios not partially mapped are only split when underused. */
                if (!folio_test_partially_mapped(folio)) {
                        underused = thp_underused(folio);
                        if (!underused)
                                goto next;
                }
                if (!folio_trylock(folio))
                        goto next;
                /* split_folio() returns 0 on success. */
                if (!split_folio(folio)) {
                        did_split = true;
                        if (underused)
                                count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
                        split++;
                }
                folio_unlock(folio);
next:
                /*
                 * split_folio() removes folio from list on success.
                 * Only add back to the queue if folio is partially mapped.
                 * If thp_underused returns false, or if split_folio fails
                 * in the case it was underused, then consider it used and
                 * don't add it back to split_queue.
                 */
                if (did_split) {
                        ; /* folio already removed from list */
                } else if (!folio_test_partially_mapped(folio)) {
                        list_del_init(&folio->_deferred_list);
                        removed++;
                } else {
                        /*
                         * That unlocked list_del_init() above would be unsafe,
                         * unless its folio is separated from any earlier folios
                         * left on the list (which may be concurrently unqueued)
                         * by one safe folio with refcount still raised.
                         */
                        swap(folio, prev);
                }
                /*
                 * After the swap above, folio is the previously kept folio
                 * (NULL the first time); its reference can now be dropped.
                 */
                if (folio)
                        folio_put(folio);
        }

        /* Return the leftovers and fix up the length for removed folios. */
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        list_splice_tail(&list, &ds_queue->split_queue);
        ds_queue->split_queue_len -= removed;
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);

        /* Drop the reference still held on the last kept folio, if any. */
        if (prev)
                folio_put(prev);

        /*
         * Stop shrinker if we didn't split any page, but the queue is empty.
         * This can happen if pages were freed under us.
         */
        if (!split && list_empty(&ds_queue->split_queue))
                return SHRINK_STOP;
        return split;
}

#ifdef CONFIG_DEBUG_FS
/*
 * Debug helper: walk every PFN of every managed zone and try to split each
 * large, non-hugetlb folio that is on the LRU. The totals of candidates and
 * successful splits are reported via pr_debug().
 */
static void split_huge_pages_all(void)
{
        struct zone *zone;
        struct page *page;
        struct folio *folio;
        unsigned long pfn, max_zone_pfn;
        unsigned long total = 0, split = 0;

        pr_debug("Split all THPs\n");
        for_each_zone(zone) {
                if (!managed_zone(zone))
                        continue;
                max_zone_pfn = zone_end_pfn(zone);
                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
                        int nr_pages;

                        /* Only head (non-tail) pages of online PFNs are considered. */
                        page = pfn_to_online_page(pfn);
                        if (!page || PageTail(page))
                                continue;
                        folio = page_folio(page);
                        if (!folio_try_get(folio))
                                continue;

                        /* Re-check now that a reference is held: the page may have moved to another folio. */
                        if (unlikely(page_folio(page) != folio))
                                goto next;

                        if (zone != folio_zone(folio))
                                goto next;

                        if (!folio_test_large(folio)
                                || folio_test_hugetlb(folio)
                                || !folio_test_lru(folio))
                                goto next;

                        total++;
                        folio_lock(folio);
                        /* Sample the size before splitting so the pfn skip below uses the pre-split size. */
                        nr_pages = folio_nr_pages(folio);
                        if (!split_folio(folio))
                                split++;
                        /* Skip the remaining pages of this folio; the loop adds the final +1. */
                        pfn += nr_pages - 1;
                        folio_unlock(folio);
next:
                        folio_put(folio);
                        cond_resched();
                }
        }

        pr_debug("%lu of %lu THP split\n", split, total);
}

static inline bool vma_not_suitable_for_thp_split(struct vm_area_struct *vma)
{
        return vma_is_special_huge(vma) || (vma->vm_flags & VM_IO) ||
                    is_vm_hugetlb_page(vma);
}

/*
 * Debug helper: try to split the THPs mapped in [@vaddr_start, @vaddr_end)
 * of the process identified by @pid.
 *
 * @pid: pid (in the caller's pid namespace) of the target process.
 * @vaddr_start: start of the range; rounded down to a page boundary.
 * @vaddr_end: end of the range (exclusive); rounded down to a page boundary.
 * @new_order: requested order of the resulting folios. For non-anonymous
 *             folios it is raised to the mapping's minimum folio order.
 * @in_folio_offset: page offset inside each folio to split at using
 *                   folio_split(); if negative or not below the folio's page
 *                   count, split_folio_to_order() is used instead.
 *
 * The walk stops at the first address that is not covered by a VMA.
 *
 * Returns 0 once the walk has run, -ESRCH if no task has @pid, or -EINVAL if
 * the task has no mm. Individual split failures are not reported; only the
 * totals are printed via pr_debug().
 */
static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
                                unsigned long vaddr_end, unsigned int new_order,
                                long in_folio_offset)
{
        int ret = 0;
        struct task_struct *task;
        struct mm_struct *mm;
        unsigned long total = 0, split = 0;
        unsigned long addr;

        vaddr_start &= PAGE_MASK;
        vaddr_end &= PAGE_MASK;

        task = find_get_task_by_vpid(pid);
        if (!task) {
                ret = -ESRCH;
                goto out;
        }

        /* Find the mm_struct */
        mm = get_task_mm(task);
        put_task_struct(task);

        if (!mm) {
                ret = -EINVAL;
                goto out;
        }

        pr_debug("Split huge pages in pid: %d, vaddr: [0x%lx - 0x%lx]\n",
                 pid, vaddr_start, vaddr_end);

        mmap_read_lock(mm);
        /*
         * always increase addr by PAGE_SIZE, since we could have a PTE page
         * table filled with PTE-mapped THPs, each of which is distinct.
         */
        for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
                struct vm_area_struct *vma = vma_lookup(mm, addr);
                struct folio_walk fw;
                struct folio *folio;
                /* Only meaningful for non-anonymous folios; see below. */
                struct address_space *mapping = NULL;
                unsigned int target_order = new_order;

                if (!vma)
                        break;

                /* skip special VMA and hugetlb VMA */
                if (vma_not_suitable_for_thp_split(vma)) {
                        /*
                         * Resume exactly at vma->vm_end. "continue" still
                         * runs the loop's addr += PAGE_SIZE, so step back
                         * one page here; otherwise the first page after this
                         * VMA would never be examined.
                         */
                        addr = vma->vm_end - PAGE_SIZE;
                        continue;
                }

                folio = folio_walk_start(&fw, vma, addr, 0);
                if (!folio)
                        continue;

                if (!is_transparent_hugepage(folio))
                        goto next;

                if (!folio_test_anon(folio)) {
                        /* Remember the mapping to re-validate it under the folio lock. */
                        mapping = folio->mapping;
                        target_order = max(new_order,
                                           mapping_min_folio_order(mapping));
                }

                /* Nothing to do if the folio is already at or below the target. */
                if (target_order >= folio_order(folio))
                        goto next;

                total++;
                /*
                 * For folios with private, split_huge_page_to_list_to_order()
                 * will try to drop it before split and then check if the folio
                 * can be split or not. So skip the check here.
                 */
                if (!folio_test_private(folio) &&
                    !can_split_folio(folio, 0, NULL))
                        goto next;

                if (!folio_trylock(folio))
                        goto next;
                /* Take our own reference before ending the walk. */
                folio_get(folio);
                folio_walk_end(&fw, vma);

                /* The file folio changed mapping since it was sampled: skip. */
                if (!folio_test_anon(folio) && folio->mapping != mapping)
                        goto unlock;

                if (in_folio_offset < 0 ||
                    in_folio_offset >= folio_nr_pages(folio)) {
                        if (!split_folio_to_order(folio, target_order))
                                split++;
                } else {
                        struct page *split_at = folio_page(folio,
                                                           in_folio_offset);
                        if (!folio_split(folio, target_order, split_at, NULL))
                                split++;
                }

unlock:

                folio_unlock(folio);
                folio_put(folio);

                cond_resched();
                continue;
next:
                folio_walk_end(&fw, vma);
                cond_resched();
        }
        mmap_read_unlock(mm);
        mmput(mm);

        pr_debug("%lu of %lu THP split\n", split, total);

out:
        return ret;
}

/*
 * Debug helper: try to split the large folios cached for page offsets
 * [@off_start, @off_end) of the file at @file_path.
 *
 * @new_order is raised to the mapping's minimum folio order. If
 * @in_folio_offset is negative or not below the folio's page count,
 * split_folio_to_order() is used; otherwise folio_split() splits at that page
 * of the folio.
 *
 * Returns 0 once the scan has run, or -EINVAL if the path cannot be turned
 * into a filename or the file cannot be opened. Individual split failures
 * are not reported; only the totals are printed via pr_debug().
 */
static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start,
                                pgoff_t off_end, unsigned int new_order,
                                long in_folio_offset)
{
        struct filename *file;
        struct file *candidate;
        struct address_space *mapping;
        int ret = -EINVAL;
        pgoff_t index;
        int nr_pages = 1;
        unsigned long total = 0, split = 0;
        unsigned int min_order;
        unsigned int target_order;

        file = getname_kernel(file_path);
        if (IS_ERR(file))
                return ret;

        candidate = file_open_name(file, O_RDONLY, 0);
        if (IS_ERR(candidate))
                goto out;

        pr_debug("split file-backed THPs in file: %s, page offset: [0x%lx - 0x%lx]\n",
                 file_path, off_start, off_end);

        mapping = candidate->f_mapping;
        min_order = mapping_min_folio_order(mapping);
        target_order = max(new_order, min_order);

        /*
         * NOTE(review): the step is the folio's page count added to index,
         * not to the folio's first page offset; if index lands inside a
         * large folio the next lookup starts past that folio's end. Confirm
         * this is intended.
         */
        for (index = off_start; index < off_end; index += nr_pages) {
                struct folio *folio = filemap_get_folio(mapping, index);

                /* Default step of one page unless a large folio is found. */
                nr_pages = 1;
                if (IS_ERR(folio))
                        continue;

                if (!folio_test_large(folio))
                        goto next;

                total++;
                /* Sampled before the split so the step covers the pre-split folio. */
                nr_pages = folio_nr_pages(folio);

                if (target_order >= folio_order(folio))
                        goto next;

                if (!folio_trylock(folio))
                        goto next;

                /* The folio no longer belongs to this file's mapping: skip. */
                if (folio->mapping != mapping)
                        goto unlock;

                if (in_folio_offset < 0 || in_folio_offset >= nr_pages) {
                        if (!split_folio_to_order(folio, target_order))
                                split++;
                } else {
                        struct page *split_at = folio_page(folio,
                                                           in_folio_offset);
                        if (!folio_split(folio, target_order, split_at, NULL))
                                split++;
                }

unlock:
                folio_unlock(folio);
next:
                /* Drop the reference obtained from filemap_get_folio(). */
                folio_put(folio);
                cond_resched();
        }

        filp_close(candidate, NULL);
        ret = 0;

        pr_debug("%lu of %lu file-backed THP split\n", split, total);
out:
        putname(file);
        return ret;
}

#define MAX_INPUT_BUF_SZ 255

/*
 * Write handler of the "split_huge_pages" debugfs file.
 *
 * Accepted inputs (at most MAX_INPUT_BUF_SZ - 1 characters are used):
 *   "1"                                          - split all THPs
 *   "<pid>,0x<start>,0x<end>[,<order>[,<off>]]"  - split in a process range
 *   "/<path>,0x<start>,0x<end>[,<order>[,<off>]]" - split in a file range
 *
 * Calls are serialized by a function-local mutex. Returns the length of the
 * parsed input string on success, -EFAULT if the user buffer cannot be
 * copied, -EINVAL on malformed input, the error from
 * mutex_lock_interruptible() if taking the mutex fails, or the error
 * returned by the selected split helper.
 */
static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
                                size_t count, loff_t *ppops)
{
        static DEFINE_MUTEX(split_debug_mutex);
        ssize_t ret;
        /*
         * hold pid, start_vaddr, end_vaddr, new_order or
         * file_path, off_start, off_end, new_order
         */
        char input_buf[MAX_INPUT_BUF_SZ];
        int pid;
        unsigned long vaddr_start, vaddr_end;
        unsigned int new_order = 0;
        long in_folio_offset = -1;

        ret = mutex_lock_interruptible(&split_debug_mutex);
        if (ret)
                return ret;

        ret = -EFAULT;

        memset(input_buf, 0, MAX_INPUT_BUF_SZ);
        if (copy_from_user(input_buf, buf, min_t(size_t, count, MAX_INPUT_BUF_SZ)))
                goto out;

        /* Guarantee termination even when the full buffer was filled. */
        input_buf[MAX_INPUT_BUF_SZ - 1] = '\0';

        /* A leading '/' selects the file-path form. */
        if (input_buf[0] == '/') {
                char *tok;
                char *tok_buf = input_buf;
                char file_path[MAX_INPUT_BUF_SZ];
                pgoff_t off_start = 0, off_end = 0;
                /* Measured before strsep() cuts the buffer at the first ','. */
                size_t input_len = strlen(input_buf);

                tok = strsep(&tok_buf, ",");
                if (tok && tok_buf) {
                        strscpy(file_path, tok);
                } else {
                        ret = -EINVAL;
                        goto out;
                }

                /* NOTE(review): %d is used to fill the unsigned new_order - confirm intended. */
                ret = sscanf(tok_buf, "0x%lx,0x%lx,%d,%ld", &off_start, &off_end,
                                &new_order, &in_folio_offset);
                /* The order and in-folio offset fields are optional. */
                if (ret != 2 && ret != 3 && ret != 4) {
                        ret = -EINVAL;
                        goto out;
                }
                ret = split_huge_pages_in_file(file_path, off_start, off_end,
                                new_order, in_folio_offset);
                if (!ret)
                        ret = input_len;

                goto out;
        }

        ret = sscanf(input_buf, "%d,0x%lx,0x%lx,%d,%ld", &pid, &vaddr_start,
                        &vaddr_end, &new_order, &in_folio_offset);
        /* A lone "1" means: split every THP in the system. */
        if (ret == 1 && pid == 1) {
                split_huge_pages_all();
                ret = strlen(input_buf);
                goto out;
        } else if (ret != 3 && ret != 4 && ret != 5) {
                ret = -EINVAL;
                goto out;
        }

        ret = split_huge_pages_pid(pid, vaddr_start, vaddr_end, new_order,
                        in_folio_offset);
        if (!ret)
                ret = strlen(input_buf);
out:
        mutex_unlock(&split_debug_mutex);
        return ret;

}

/* File operations of the debugfs file: only a write handler is provided. */
static const struct file_operations split_huge_pages_fops = {
        .owner         = THIS_MODULE,
        .write         = split_huge_pages_write,
};

/*
 * Create the "split_huge_pages" debugfs file with mode 0200, backed by
 * split_huge_pages_fops. The result of debugfs_create_file() is not checked;
 * 0 is always returned. Registered as a late initcall.
 */
static int __init split_huge_pages_debugfs(void)
{
        debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
                            &split_huge_pages_fops);
        return 0;
}
late_initcall(split_huge_pages_debugfs);
#endif

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/*
 * Replace the huge PMD that maps @page (as located by @pvmw) with a PMD
 * migration entry.
 *
 * Returns 0 on success, and also 0 without doing anything when @pvmw does
 * not describe a PMD-level mapping (pvmw->pmd set and pvmw->pte clear).
 * Returns -EBUSY when folio_try_share_anon_rmap_pmd() fails for an
 * anon-exclusive page; the original PMD value is restored in that case.
 */
int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                struct page *page)
{
        struct folio *folio = page_folio(page);
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
        bool anon_exclusive;
        pmd_t pmdval;
        swp_entry_t entry;
        pmd_t pmdswp;

        if (!(pvmw->pmd && !pvmw->pte))
                return 0;

        flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
        /* pmdval keeps the old PMD value for the checks below. */
        pmdval = pmdp_invalidate(vma, address, pvmw->pmd);

        /* See folio_try_share_anon_rmap_pmd(): invalidate PMD first. */
        anon_exclusive = folio_test_anon(folio) && PageAnonExclusive(page);
        if (anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) {
                /* Sharing failed: put the original PMD back and give up. */
                set_pmd_at(mm, address, pvmw->pmd, pmdval);
                return -EBUSY;
        }

        if (pmd_dirty(pmdval))
                folio_mark_dirty(folio);
        /* Pick the migration entry type from the old PMD's permissions. */
        if (pmd_write(pmdval))
                entry = make_writable_migration_entry(page_to_pfn(page));
        else if (anon_exclusive)
                entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
        else
                entry = make_readable_migration_entry(page_to_pfn(page));
        /* Carry the young/dirty state over into the migration entry. */
        if (pmd_young(pmdval))
                entry = make_migration_entry_young(entry);
        if (pmd_dirty(pmdval))
                entry = make_migration_entry_dirty(entry);
        pmdswp = swp_entry_to_pmd(entry);
        /* Carry soft-dirty and uffd-wp over into the swap-style PMD. */
        if (pmd_soft_dirty(pmdval))
                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
        if (pmd_uffd_wp(pmdval))
                pmdswp = pmd_swp_mkuffd_wp(pmdswp);
        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
        /* The page is no longer mapped here: drop the rmap and a folio reference. */
        folio_remove_rmap_pmd(folio, page, vma);
        folio_put(folio);
        trace_set_migration_pmd(address, pmd_val(pmdswp));

        return 0;
}

/*
 * Replace the PMD migration entry located by @pvmw with a present huge PMD
 * that maps @new, rebuilding the PMD bits from the state recorded in the
 * migration entry. Does nothing when @pvmw does not describe a PMD-level
 * mapping (pvmw->pmd set and pvmw->pte clear).
 */
void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
{
        struct folio *folio = page_folio(new);
        struct vm_area_struct *vma = pvmw->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address = pvmw->address;
        unsigned long haddr = address & HPAGE_PMD_MASK;
        pmd_t pmde;
        swp_entry_t entry;

        if (!(pvmw->pmd && !pvmw->pte))
                return;

        entry = pmd_to_swp_entry(*pvmw->pmd);
        /* Take a folio reference for the mapping being installed. */
        folio_get(folio);
        pmde = mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot));
        /* Restore soft-dirty, write, uffd-wp, young and dirty state. */
        if (pmd_swp_soft_dirty(*pvmw->pmd))
                pmde = pmd_mksoft_dirty(pmde);
        if (is_writable_migration_entry(entry))
                pmde = pmd_mkwrite(pmde, vma);
        if (pmd_swp_uffd_wp(*pvmw->pmd))
                pmde = pmd_mkuffd_wp(pmde);
        if (!is_migration_entry_young(entry))
                pmde = pmd_mkold(pmde);
        /* NOTE: this may contain setting soft-dirty on some archs */
        if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
                pmde = pmd_mkdirty(pmde);

        if (folio_test_anon(folio)) {
                rmap_t rmap_flags = RMAP_NONE;

                /* Anything but a plain readable entry is mapped exclusive. */
                if (!is_readable_migration_entry(entry))
                        rmap_flags |= RMAP_EXCLUSIVE;

                folio_add_anon_rmap_pmd(folio, new, vma, haddr, rmap_flags);
        } else {
                folio_add_file_rmap_pmd(folio, new, vma);
        }
        /* A writable anon PMD must map an anon-exclusive page. */
        VM_BUG_ON(pmd_write(pmde) && folio_test_anon(folio) && !PageAnonExclusive(new));
        set_pmd_at(mm, haddr, pvmw->pmd, pmde);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache_pmd(vma, address, pvmw->pmd);
        trace_remove_migration_pmd(address, pmd_val(pmde));
}
#endif































































































  170 


  168 



    5 




































































































































































































































  164 
































































































































  163 

  164 
  164 
  164 
































































    8 
    8 












































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic pidhash and scalable, time-bounded PID allocator
 *
 * (C) 2002-2003 Nadia Yvette Chambers, IBM
 * (C) 2004 Nadia Yvette Chambers, Oracle
 * (C) 2002-2004 Ingo Molnar, Red Hat
 *
 * pid-structures are backing objects for tasks sharing a given ID to chain
 * against. There is very little to them aside from hashing them and
 * parking tasks using given ID's on a list.
 *
 * The hash is always changed with the tasklist_lock write-acquired,
 * and the hash is only accessed with the tasklist_lock at least
 * read-acquired, so there's no additional SMP locking needed here.
 *
 * We have a list of bitmap pages, which bitmaps represent the PID space.
 * Allocating and freeing PIDs is completely lockless. The worst-case
 * allocation scenario when all but one out of 1 million PIDs possible are
 * allocated already: the scanning of 32 list entries and at most PAGE_SIZE
 * bytes. The typical fastpath is a single successful setbit. Freeing is O(1).
 *
 * Pid namespaces:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/memblock.h>
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
#include <linux/proc_ns.h>
#include <linux/refcount.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>

/*
 * Statically initialised struct pid carrying number 0 in init_pid_ns.
 * It starts out with a single reference and with the listed task list
 * heads empty.
 */
struct pid init_struct_pid = {
        .level          = 0,
        .count          = REFCOUNT_INIT(1),
        .tasks          = {
                { .first = NULL },
                { .first = NULL },
                { .first = NULL },
        },
        .numbers        = {
                {
                        .nr     = 0,
                        .ns     = &init_pid_ns,
                },
        },
};

/*
 * Lower and upper bound accepted for the "pid_max" sysctl: pid_table wires
 * them up as extra1/extra2 for proc_dointvec_minmax.  pid_max_min may be
 * raised at boot by pid_idr_init().
 */
static int pid_max_min = RESERVED_PIDS + 1;
static int pid_max_max = PID_MAX_LIMIT;

/*
 * The initial pid namespace.
 *
 * It sits at level 0, uses init_task as its child reaper and belongs to
 * init_user_ns.  PID numbers are handed out from the embedded IDR, and
 * pid_max starts at PID_MAX_DEFAULT (pid_idr_init() adjusts it at boot).
 * pid_allocated starts as PIDNS_ADDING, the flag alloc_pid() requires in
 * order to publish a new pid.
 */
struct pid_namespace init_pid_ns = {
        .level = 0,
        .pid_max = PID_MAX_DEFAULT,
        .pid_allocated = PIDNS_ADDING,
        .idr = IDR_INIT(init_pid_ns.idr),
        .child_reaper = &init_task,
        .user_ns = &init_user_ns,
        .ns.count = REFCOUNT_INIT(2),
        .ns.inum = PROC_PID_INIT_INO,
#ifdef CONFIG_PID_NS
        .ns.ops = &pidns_operations,
#endif
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
        .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
};
EXPORT_SYMBOL_GPL(init_pid_ns);

/*
 * pidmap_lock is held around the IDR updates and the pid_allocated
 * accounting performed in this file.  pidmap_lock_seq is a sequence counter
 * associated with that lock; it is not used elsewhere in this file.
 */
static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);

void put_pid(struct pid *pid)
{
        struct pid_namespace *ns;

        if (!pid)
                return;

        ns = pid->numbers[pid->level].ns;
        if (refcount_dec_and_test(&pid->count)) {
                kmem_cache_free(ns->pid_cachep, pid);
                put_pid_ns(ns);
        }
}
EXPORT_SYMBOL_GPL(put_pid);

/* RCU callback: drop a reference on the struct pid that embeds @rhp. */
static void delayed_put_pid(struct rcu_head *rhp)
{
        put_pid(container_of(rhp, struct pid, rcu));
}

/*
 * Give back every number @pid holds (one per namespace level) and queue an
 * RCU callback that drops a reference on the structure itself.
 *
 * Must not be called with tasklist_lock held.
 */
void free_pid(struct pid *pid)
{
        int level;

        lockdep_assert_not_held(&tasklist_lock);

        spin_lock(&pidmap_lock);
        for (level = 0; level <= pid->level; level++) {
                struct upid *slot = &pid->numbers[level];
                struct pid_namespace *ns = slot->ns;

                ns->pid_allocated--;
                switch (ns->pid_allocated) {
                case PIDNS_ADDING:
                        /*
                         * The first process of the namespace failed to
                         * fork: no reaper is expected, reset the counter.
                         */
                        WARN_ON(ns->child_reaper);
                        ns->pid_allocated = 0;
                        break;
                case 1:
                case 2:
                        /*
                         * Only the reaper is left in the namespace.  It may
                         * be sleeping in zap_pid_ns_processes(), so wake it.
                         */
                        wake_up_process(ns->child_reaper);
                        break;
                }

                idr_remove(&ns->idr, slot->nr);
        }
        pidfs_remove_pid(pid);
        spin_unlock(&pidmap_lock);

        call_rcu(&pid->rcu, delayed_put_pid);
}

/*
 * Free every non-NULL entry of @pids, an array of PIDTYPE_MAX pointers,
 * walking from the highest pid type down to the lowest.
 */
void free_pids(struct pid **pids)
{
        int type;

        /* Each free_pid() takes pidmap_lock on its own; this can be batched. */
        for (type = PIDTYPE_MAX - 1; type >= 0; type--) {
                if (pids[type])
                        free_pid(pids[type]);
        }
}

/**
 * alloc_pid() - Allocate a struct pid numbered in @ns and all its ancestors
 * @ns:           most nested pid namespace the new pid belongs to
 * @set_tid:      array of requested pid numbers, most nested namespace
 *                first; only read while @set_tid_size is non-zero
 * @set_tid_size: number of entries in @set_tid, at most @ns->level + 1
 *
 * Walks from @ns up through ->parent and reserves one number per level in
 * that namespace's IDR, storing NULL so lookups cannot see the pid yet.
 * Once every level succeeded, and only if @ns still has PIDNS_ADDING set,
 * the pid is published in all IDRs and each namespace's pid_allocated
 * counter is bumped.
 *
 * Return: the new pid with its reference count set to 1, or an ERR_PTR():
 * -EINVAL for an oversized @set_tid_size or an unacceptable requested
 * number, -EPERM when checkpoint_restore_ns_capable() refuses the request,
 * -EEXIST when a requested number is already taken, -EAGAIN when the IDR
 * reports -ENOSPC for an automatically chosen number, and -ENOMEM when an
 * allocation fails or @ns no longer accepts new pids.
 */
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                      size_t set_tid_size)
{
        struct pid *pid;
        enum pid_type type;
        int i, nr;
        struct pid_namespace *tmp;
        struct upid *upid;
        int retval = -ENOMEM;

        /*
         * set_tid_size contains the size of the set_tid array. Starting at
         * the most nested currently active PID namespace it tells alloc_pid()
         * which PID to set for a process in that most nested PID namespace
         * up to set_tid_size PID namespaces. It does not have to set the PID
         * for a process in all nested PID namespaces but set_tid_size must
         * never be greater than the current ns->level + 1.
         */
        if (set_tid_size > ns->level + 1)
                return ERR_PTR(-EINVAL);

        pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
        if (!pid)
                return ERR_PTR(retval);

        tmp = ns;
        pid->level = ns->level;

        /*
         * Reserve a number at every level, innermost namespace first.  On
         * failure 'i' is the level that could not be allocated, which is
         * what the unwinding at out_free relies on.
         */
        for (i = ns->level; i >= 0; i--) {
                int tid = 0;
                int pid_max = READ_ONCE(tmp->pid_max);

                if (set_tid_size) {
                        tid = set_tid[ns->level - i];

                        retval = -EINVAL;
                        if (tid < 1 || tid >= pid_max)
                                goto out_free;
                        /*
                         * Also fail if a PID != 1 is requested and
                         * no PID 1 exists.
                         */
                        if (tid != 1 && !tmp->child_reaper)
                                goto out_free;
                        retval = -EPERM;
                        if (!checkpoint_restore_ns_capable(tmp->user_ns))
                                goto out_free;
                        set_tid_size--;
                }

                idr_preload(GFP_KERNEL);
                spin_lock(&pidmap_lock);

                if (tid) {
                        nr = idr_alloc(&tmp->idr, NULL, tid,
                                       tid + 1, GFP_ATOMIC);
                        /*
                         * If ENOSPC is returned it means that the PID is
                         * already in use. Return EEXIST in that case.
                         */
                        if (nr == -ENOSPC)
                                nr = -EEXIST;
                } else {
                        int pid_min = 1;
                        /*
                         * init really needs pid 1, but after reaching the
                         * maximum wrap back to RESERVED_PIDS
                         */
                        if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
                                pid_min = RESERVED_PIDS;

                        /*
                         * Store a null pointer so find_pid_ns does not find
                         * a partially initialized PID (see below).
                         */
                        nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
                                              pid_max, GFP_ATOMIC);
                }
                spin_unlock(&pidmap_lock);
                idr_preload_end();

                if (nr < 0) {
                        retval = (nr == -ENOSPC) ? -EAGAIN : nr;
                        goto out_free;
                }

                pid->numbers[i].nr = nr;
                pid->numbers[i].ns = tmp;
                tmp = tmp->parent;
        }

        /*
         * ENOMEM is not the most obvious choice especially for the case
         * where the child subreaper has already exited and the pid
         * namespace denies the creation of any new processes. But ENOMEM
         * is what we have exposed to userspace for a long time and it is
         * documented behavior for pid namespaces. So we can't easily
         * change it even if there were an error code better suited.
         */
        retval = -ENOMEM;

        get_pid_ns(ns);
        refcount_set(&pid->count, 1);
        spin_lock_init(&pid->lock);
        for (type = 0; type < PIDTYPE_MAX; ++type)
                INIT_HLIST_HEAD(&pid->tasks[type]);

        init_waitqueue_head(&pid->wait_pidfd);
        INIT_HLIST_HEAD(&pid->inodes);

        upid = pid->numbers + ns->level;
        idr_preload(GFP_KERNEL);
        spin_lock(&pidmap_lock);
        /* The namespace stopped accepting new pids: fail with -ENOMEM. */
        if (!(ns->pid_allocated & PIDNS_ADDING))
                goto out_unlock;
        pidfs_add_pid(pid);
        for ( ; upid >= pid->numbers; --upid) {
                /* Make the PID visible to find_pid_ns. */
                idr_replace(&upid->ns->idr, pid, upid->nr);
                upid->ns->pid_allocated++;
        }
        spin_unlock(&pidmap_lock);
        idr_preload_end();

        return pid;

out_unlock:
        spin_unlock(&pidmap_lock);
        idr_preload_end();
        put_pid_ns(ns);

out_free:
        /*
         * Release the numbers of levels i + 1 .. ns->level, i.e. the ones
         * reserved so far.  When arriving via out_unlock the loop above ran
         * to completion (i == -1), so every level is released.
         */
        spin_lock(&pidmap_lock);
        while (++i <= ns->level) {
                upid = pid->numbers + i;
                idr_remove(&upid->ns->idr, upid->nr);
        }

        /* On failure to allocate the first pid, reset the state */
        if (ns->pid_allocated == PIDNS_ADDING)
                idr_set_cursor(&ns->idr, 0);

        spin_unlock(&pidmap_lock);

        kmem_cache_free(ns->pid_cachep, pid);
        return ERR_PTR(retval);
}

/*
 * Clear PIDNS_ADDING in @ns under pidmap_lock.  alloc_pid() checks that
 * flag under the same lock before publishing a pid, so later allocations
 * in @ns fail.
 */
void disable_pid_allocation(struct pid_namespace *ns)
{
        spin_lock(&pidmap_lock);
        ns->pid_allocated &= ~PIDNS_ADDING;
        spin_unlock(&pidmap_lock);
}

/*
 * Look up number @nr in the IDR of @ns.  Returns whatever is stored there:
 * alloc_pid() stores NULL until a pid is fully set up, so a number that is
 * reserved but not yet published yields NULL as well.  No reference is
 * taken on the result.
 */
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
        return idr_find(&ns->idr, nr);
}
EXPORT_SYMBOL_GPL(find_pid_ns);

/* find_pid_ns() for the pid namespace the current task is active in. */
struct pid *find_vpid(int nr)
{
        struct pid_namespace *active_ns = task_active_pid_ns(current);

        return find_pid_ns(nr, active_ns);
}
EXPORT_SYMBOL_GPL(find_vpid);

/*
 * Return the address of the struct pid pointer @task uses for @type:
 * PIDTYPE_PID lives in the task itself, every other type in the shared
 * signal struct.
 */
static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return &task->thread_pid;

        return &task->signal->pids[type];
}

/*
 * Link @task onto the @type task list of the pid it currently points at
 * for that type.
 *
 * The caller must hold tasklist_lock for writing.
 */
void attach_pid(struct task_struct *task, enum pid_type type)
{
        struct pid **slot;

        lockdep_assert_held_write(&tasklist_lock);

        slot = task_pid_ptr(task, type);
        hlist_add_head_rcu(&task->pid_links[type], &(*slot)->tasks[type]);
}

/*
 * Unhook @task from the pid it currently uses for @type and install @new
 * in its place.  If the old pid is left without a task of any type, it is
 * handed back to the caller through @pids[@type].
 *
 * The caller must hold tasklist_lock for writing.
 */
static void __change_pid(struct pid **pids, struct task_struct *task,
                         enum pid_type type, struct pid *new)
{
        struct pid **slot;
        struct pid *old;
        int t;

        lockdep_assert_held_write(&tasklist_lock);

        slot = task_pid_ptr(task, type);
        old = *slot;

        hlist_del_rcu(&task->pid_links[type]);
        *slot = new;

        if (type == PIDTYPE_PID) {
                WARN_ON_ONCE(pid_has_task(old, PIDTYPE_PID));
                wake_up_all(&old->wait_pidfd);
        }

        /* Still used by some task?  Then there is nothing to hand back. */
        for (t = PIDTYPE_MAX - 1; t >= 0; t--) {
                if (pid_has_task(old, t))
                        return;
        }

        WARN_ON(pids[type]);
        pids[type] = old;
}

/*
 * Detach @task from its @type pid, leaving the task's pointer NULL.  See
 * __change_pid() for when the old pid is stored in @pids[@type] and for
 * the locking requirement.
 */
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type type)
{
        __change_pid(pids, task, type, NULL);
}

/*
 * Switch @task's @type pid to @pid: the old pid is unhooked through
 * __change_pid() and the task is then linked onto @pid's task list.
 */
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type type,
                struct pid *pid)
{
        __change_pid(pids, task, type, pid);
        attach_pid(task, type);
}

/*
 * Swap the thread pids of @left and @right: the PIDTYPE_PID task lists of
 * both pids, the tasks' thread_pid pointers and their cached ->pid values
 * are all exchanged.
 *
 * The caller must hold tasklist_lock for writing.
 */
void exchange_tids(struct task_struct *left, struct task_struct *right)
{
        struct pid *left_pid = left->thread_pid;
        struct pid *right_pid = right->thread_pid;

        lockdep_assert_held_write(&tasklist_lock);

        /* The tid lists hold a single entry each: swap the list heads. */
        hlists_swap_heads_rcu(&left_pid->tasks[PIDTYPE_PID],
                              &right_pid->tasks[PIDTYPE_PID]);

        /* Point each task at the other one's pid. */
        rcu_assign_pointer(left->thread_pid, right_pid);
        rcu_assign_pointer(right->thread_pid, left_pid);

        /* Keep the cached numeric values in sync. */
        WRITE_ONCE(left->pid, pid_nr(right_pid));
        WRITE_ONCE(right->pid, pid_nr(left_pid));
}

/*
 * Replace @old by @new on the @type task list in a single step, as an
 * optimization of attach_pid(new) followed by detach_pid(old).  Not meant
 * for PIDTYPE_PID.
 *
 * The caller must hold tasklist_lock for writing.
 */
void transfer_pid(struct task_struct *old, struct task_struct *new,
                  enum pid_type type)
{
        WARN_ON_ONCE(type == PIDTYPE_PID);
        lockdep_assert_held_write(&tasklist_lock);

        hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]);
}

/*
 * Return the first task on @pid's @type list, or NULL when @pid is NULL or
 * the list is empty.  No reference is taken on the task; the list head is
 * read with rcu_dereference_check(), which also accepts tasklist_lock.
 */
struct task_struct *pid_task(struct pid *pid, enum pid_type type)
{
        struct hlist_node *node;

        if (!pid)
                return NULL;

        node = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
                                     lockdep_tasklist_lock_is_held());
        if (!node)
                return NULL;

        return hlist_entry(node, struct task_struct, pid_links[(type)]);
}
EXPORT_SYMBOL(pid_task);

/*
 * Find the task whose thread pid has number @nr in @ns, or NULL.
 *
 * Must be called under rcu_read_lock(); no reference is taken.
 */
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
        struct pid *pid;

        RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
                         "find_task_by_pid_ns() needs rcu_read_lock() protection");

        pid = find_pid_ns(nr, ns);
        return pid_task(pid, PIDTYPE_PID);
}

/*
 * find_task_by_pid_ns() for the pid namespace the current task is active
 * in.  The same rcu_read_lock() requirement applies.
 */
struct task_struct *find_task_by_vpid(pid_t vnr)
{
        return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

/*
 * Like find_task_by_vpid(), but takes the RCU read lock itself and returns
 * the task with a reference held (get_task_struct()).  Returns NULL when
 * no task is found.
 */
struct task_struct *find_get_task_by_vpid(pid_t nr)
{
        struct task_struct *found;

        rcu_read_lock();
        found = find_task_by_vpid(nr);
        if (found != NULL)
                get_task_struct(found);
        rcu_read_unlock();

        return found;
}

/*
 * Return the pid @task uses for @type after passing it through get_pid().
 * The pointer is sampled inside an RCU read-side critical section.
 */
struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
        struct pid **slot;
        struct pid *pid;

        rcu_read_lock();
        slot = task_pid_ptr(task, type);
        pid = get_pid(rcu_dereference(*slot));
        rcu_read_unlock();

        return pid;
}
EXPORT_SYMBOL_GPL(get_task_pid);

struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
        struct task_struct *result;
        rcu_read_lock();
        result = pid_task(pid, type);
        if (result)
                get_task_struct(result);
        rcu_read_unlock();
        return result;
}
EXPORT_SYMBOL_GPL(get_pid_task);

/*
 * Look up number @nr in the current task's active pid namespace and pass
 * the result through get_pid(), all under rcu_read_lock().
 */
struct pid *find_get_pid(pid_t nr)
{
        struct pid *found;

        rcu_read_lock();
        found = find_vpid(nr);
        found = get_pid(found);
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(find_get_pid);

/**
 * pid_nr_ns() - Look up the number of a pid as seen from a namespace
 * @pid: pid to translate; may be NULL
 * @ns:  pid namespace to translate into; may be NULL
 *
 * A struct pid carries one (number, namespace) pair per level up to its own
 * level.  The pair at @ns->level is only used when it really belongs to
 * @ns.
 *
 * Return: the number of @pid in @ns, or 0 when @pid or @ns is NULL, when
 * @ns is nested deeper than @pid, or when the pair at @ns->level belongs to
 * a different namespace.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
{
        struct upid *upid;
        pid_t nr = 0;

        /*
         * Callers such as pid_vnr() pass task_active_pid_ns(current) without
         * checking it.  Treat a NULL namespace like "not visible" instead of
         * dereferencing it.
         */
        if (pid && ns && ns->level <= pid->level) {
                upid = &pid->numbers[ns->level];
                if (upid->ns == ns)
                        nr = upid->nr;
        }
        return nr;
}
EXPORT_SYMBOL_GPL(pid_nr_ns);

/* pid_nr_ns() for the pid namespace the current task is active in. */
pid_t pid_vnr(struct pid *pid)
{
        return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

/**
 * __task_pid_nr_ns() - Number of one of a task's pids as seen from a namespace
 * @task: task whose pid is wanted
 * @type: which of the task's pids to use
 * @ns:   namespace to translate into; NULL selects the namespace the
 *        current task is active in
 *
 * The task's pid pointer is sampled under rcu_read_lock().
 *
 * Return: the number in the chosen namespace, or 0 when pid_nr_ns() finds
 * none or when no namespace could be determined.
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
                        struct pid_namespace *ns)
{
        pid_t nr = 0;

        rcu_read_lock();
        if (!ns)
                ns = task_active_pid_ns(current);
        /*
         * The fallback may itself yield NULL (presumably when current no
         * longer has a pid attached - confirm against ns_of_pid()).  Skip
         * the lookup then rather than handing a NULL namespace on.
         */
        if (ns)
                nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns);
        rcu_read_unlock();

        return nr;
}
EXPORT_SYMBOL(__task_pid_nr_ns);

/*
 * Return the pid namespace of @tsk's pid, i.e. ns_of_pid(task_pid(tsk)).
 * NOTE(review): presumably NULL when the task has no pid attached any
 * more - confirm against ns_of_pid().
 */
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
        return ns_of_pid(task_pid(tsk));
}
EXPORT_SYMBOL_GPL(task_active_pid_ns);

/*
 * Used by proc to find the first pid that is greater than or equal to nr.
 *
 * If there is a pid at nr this function is exactly the same as find_pid_ns.
 *
 * The lookup is done with idr_get_next() on a local copy of @nr, so the
 * number that was actually found is not reported back to the caller.
 */
struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
{
        return idr_get_next(&ns->idr, &nr);
}
EXPORT_SYMBOL_GPL(find_ge_pid);

/*
 * Resolve file descriptor @fd to the struct pid behind it.
 *
 * On success the pid is passed through get_pid() and the file's f_flags
 * are stored in *@flags.  On failure *@flags is left untouched and an
 * ERR_PTR() is returned: -EBADF when @fd does not name an open file,
 * otherwise whatever pidfd_pid() reported.
 */
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
{
        CLASS(fd, f)(fd);
        struct pid *pid;

        if (fd_empty(f))
                return ERR_PTR(-EBADF);

        pid = pidfd_pid(fd_file(f));
        if (IS_ERR(pid))
                return pid;

        get_pid(pid);
        *flags = fd_file(f)->f_flags;
        return pid;
}

/**
 * pidfd_get_task() - Get the task associated with a pidfd
 *
 * @pidfd: pidfd for which to get the task
 * @flags: flags associated with this pidfd
 *
 * Return the task associated with @pidfd. The function takes a reference on
 * the returned task. The caller is responsible for releasing that reference.
 *
 * Return: On success, the task_struct associated with the pidfd.
 *           On error, a negative errno number will be returned.
 */
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
{
        unsigned int f_flags = 0;
        struct pid *pid;
        struct task_struct *task;
        enum pid_type type;

        switch (pidfd) {
        case  PIDFD_SELF_THREAD:
                type = PIDTYPE_PID;
                pid = get_task_pid(current, type);
                break;
        case  PIDFD_SELF_THREAD_GROUP:
                type = PIDTYPE_TGID;
                pid = get_task_pid(current, type);
                break;
        default:
                pid = pidfd_get_pid(pidfd, &f_flags);
                if (IS_ERR(pid))
                        return ERR_CAST(pid);
                type = PIDTYPE_TGID;
                break;
        }

        task = get_pid_task(pid, type);
        put_pid(pid);
        if (!task)
                return ERR_PTR(-ESRCH);

        *flags = f_flags;
        return task;
}

/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid:   struct pid that the pidfd will reference
 * @flags: flags to pass
 *
 * Creates a new pid file descriptor with the O_CLOEXEC flag set: the
 * descriptor and file are obtained from pidfd_prepare() and, on success,
 * the file is installed into the caller's fd table.
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * This symbol should not be explicitly exported to loadable modules.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid, unsigned int flags)
{
        struct file *file;
        int fd;

        fd = pidfd_prepare(pid, flags, &file);
        if (fd >= 0)
                fd_install(fd, file);

        return fd;
}

/**
 * sys_pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass; only PIDFD_NONBLOCK and PIDFD_THREAD are accepted
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the task identified by @pid. Without PIDFD_THREAD flag the target task
 * must be a thread-group leader.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned: -EINVAL for
 *         unknown flags or a non-positive @pid, -ESRCH when @pid is not
 *         found, otherwise the error from pidfd_create().
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
        struct pid *target;
        int pidfd;

        if ((flags & ~(PIDFD_NONBLOCK | PIDFD_THREAD)) || pid <= 0)
                return -EINVAL;

        target = find_get_pid(pid);
        if (target == NULL)
                return -ESRCH;

        pidfd = pidfd_create(target, flags);
        put_pid(target);

        return pidfd;
}

#ifdef CONFIG_SYSCTL
/*
 * ->lookup callback of pid_table_root: select the sysctl set embedded in
 * the pid namespace the current task is active in.  @root is unused.
 */
static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
{
        return &task_active_pid_ns(current)->set;
}

/*
 * Returns non-zero only when @set is the sysctl set of the pid namespace
 * the current task is active in.
 */
static int set_is_seen(struct ctl_table_set *set)
{
        return &task_active_pid_ns(current)->set == set;
}

/*
 * ->permissions callback of pid_table_root.
 *
 * Picks one permission triplet from @table->mode according to who is
 * asking, judged against the user namespace owning the pid namespace:
 * the owner bits for callers with CAP_SYS_ADMIN there or whose euid maps
 * to its uid 0, the group bits for members of its gid 0, and otherwise
 * only the S_IROTH bit.  The chosen bits are replicated into the owner,
 * group and other positions of the returned mode.
 */
static int pid_table_root_permissions(struct ctl_table_header *head,
                                      const struct ctl_table *table)
{
        struct pid_namespace *pidns =
                container_of(head->set, struct pid_namespace, set);
        const int mode = table->mode;
        int bits;

        if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
            uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
                bits = (mode & S_IRWXU) >> 6;
        else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
                bits = (mode & S_IRWXG) >> 3;
        else
                bits = mode & S_IROTH;

        return (bits << 6) | (bits << 3) | bits;
}

/*
 * ->set_ownership callback of pid_table_root.
 *
 * Reports uid 0 / gid 0 of the user namespace owning the pid namespace as
 * owner of the sysctl entries.  An id that has no valid mapping leaves the
 * corresponding output untouched.
 */
static void pid_table_root_set_ownership(struct ctl_table_header *head,
                                         kuid_t *uid, kgid_t *gid)
{
        struct pid_namespace *pidns =
                container_of(head->set, struct pid_namespace, set);
        kuid_t root_uid = make_kuid(pidns->user_ns, 0);
        kgid_t root_gid = make_kgid(pidns->user_ns, 0);

        if (uid_valid(root_uid))
                *uid = root_uid;

        if (gid_valid(root_gid))
                *gid = root_gid;
}

/*
 * Sysctl root for the per pid namespace tables; register_pidns_sysctls()
 * passes it to setup_sysctl_set().
 */
static struct ctl_table_root pid_table_root = {
        .lookup                = pid_table_root_lookup,
        .permissions        = pid_table_root_permissions,
        .set_ownership        = pid_table_root_set_ownership,
};

/*
 * Template for the "pid_max" sysctl.  register_pidns_sysctls() duplicates
 * it for each namespace and redirects .data to that namespace's pid_max;
 * writes are range checked against pid_max_min / pid_max_max.
 */
static const struct ctl_table pid_table[] = {
        {
                .procname        = "pid_max",
                .data                = &init_pid_ns.pid_max,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = &pid_max_min,
                .extra2                = &pid_max_max,
        },
};
#endif

/**
 * register_pidns_sysctls() - Register the "kernel" sysctls of a pid namespace
 * @pidns: namespace to register the table for
 *
 * Sets up the namespace's sysctl set, duplicates pid_table so that its
 * "pid_max" entry points at @pidns->pid_max, clamps that value and
 * registers the copy under "kernel".  Without CONFIG_SYSCTL this is a
 * no-op.
 *
 * Return: 0 on success, -ENOMEM when the table copy cannot be allocated or
 * the registration fails.  The sysctl set is retired again on every error
 * path.
 */
int register_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table *tbl;

        setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);

        tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
        if (!tbl) {
                /* Pair setup_sysctl_set() like the failure path below. */
                retire_sysctl_set(&pidns->set);
                return -ENOMEM;
        }
        tbl->data = &pidns->pid_max;
        pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
                             PIDS_PER_CPU_DEFAULT * num_possible_cpus()));

        pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
                                                 ARRAY_SIZE(pid_table));
        if (!pidns->sysctls) {
                kfree(tbl);
                retire_sysctl_set(&pidns->set);
                return -ENOMEM;
        }
#endif
        return 0;
}

/*
 * Undo register_pidns_sysctls(): unregister the table, retire the sysctl
 * set and free the duplicated table.  Without CONFIG_SYSCTL this is a
 * no-op.
 */
void unregister_pidns_sysctls(struct pid_namespace *pidns)
{
#ifdef CONFIG_SYSCTL
        /* Fetch the table pointer before the header is unregistered. */
        const struct ctl_table *table = pidns->sysctls->ctl_table_arg;

        unregister_sysctl_table(pidns->sysctls);
        retire_sysctl_set(&pidns->set);
        kfree(table);
#endif
}

/*
 * Boot-time setup of the initial pid namespace: scale pid_max and its
 * minimum by the number of possible CPUs, initialise the IDR and create
 * the "pid" slab cache sized for a struct pid with a single numbers[]
 * entry.
 */
void __init pid_idr_init(void)
{
        /* Verify no one has done anything silly: */
        BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);

        /* bump default and minimum pid_max based on number of cpus */
        init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
                                  PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
        pid_max_min = max_t(int, pid_max_min,
                                PIDS_PER_CPU_MIN * num_possible_cpus());
        pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);

        idr_init(&init_pid_ns.idr);

        /* SLAB_PANIC is passed, so no NULL check on the result here. */
        init_pid_ns.pid_cachep = kmem_cache_create("pid",
                        struct_size_t(struct pid, numbers, 1),
                        __alignof__(struct pid),
                        SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT,
                        NULL);
}

/*
 * Initcall registering the sysctls of the initial pid namespace.  A
 * registration failure is treated as fatal (BUG_ON).  Always returns 0.
 */
static __init int pid_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
        /* "kernel" directory will have already been initialized. */
        BUG_ON(register_pidns_sysctls(&init_pid_ns));
#endif
        return 0;
}
subsys_initcall(pid_namespace_sysctl_init);

/*
 * Fetch file descriptor @fd from @task's file table on behalf of the
 * caller.
 *
 * The lookup happens with @task's exec_update_lock held for reading and
 * only if the caller passes the PTRACE_MODE_ATTACH_REALCREDS check.
 *
 * Returns the file as obtained from fget_task(), or an ERR_PTR(): the
 * error from down_read_killable(), -EPERM when ptrace access is denied,
 * -ESRCH when no file was found and @task is exiting, -EBADF when no file
 * was found otherwise.
 */
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
        struct file *file;
        int err;

        err = down_read_killable(&task->signal->exec_update_lock);
        if (err)
                return ERR_PTR(err);

        if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS))
                file = ERR_PTR(-EPERM);
        else
                file = fget_task(task, fd);

        up_read(&task->signal->exec_update_lock);

        if (file)
                return file;

        /*
         * No file was found.  The target thread may be exiting, in which
         * case it is in one of these states:
         * 1. before exit_signals(): a real fd is still returned
         * 2. before exit_files() takes the task_lock(): a real fd is still
         *    returned
         * 3. after exit_files() released the task_lock(): ->files is NULL
         *    and PF_EXITING is set, since exit_signals() set it earlier.
         * State 3 would naturally read as EBADF, but it really means ESRCH
         * because the task is exiting and has freed its files struct, so
         * report it as such.
         */
        if (task->flags & PF_EXITING)
                return ERR_PTR(-ESRCH);

        return ERR_PTR(-EBADF);
}

/*
 * Duplicate descriptor @fd of the thread identified by @pid into the
 * calling process.
 *
 * Returns the new O_CLOEXEC descriptor from receive_fd(), or a negative
 * errno: -ESRCH when @pid has no PIDTYPE_PID task, otherwise the error
 * from __pidfd_fget() or receive_fd().
 */
static int pidfd_getfd(struct pid *pid, int fd)
{
        struct task_struct *target;
        struct file *file;
        int new_fd;

        target = get_pid_task(pid, PIDTYPE_PID);
        if (target == NULL)
                return -ESRCH;

        /* The task reference is only needed while fetching the file. */
        file = __pidfd_fget(target, fd);
        put_task_struct(target);
        if (IS_ERR(file))
                return PTR_ERR(file);

        new_fd = receive_fd(file, NULL, O_CLOEXEC);
        fput(file);

        return new_fd;
}

/**
 * sys_pidfd_getfd() - Get a file descriptor from another process
 *
 * @pidfd:        the pidfd file descriptor of the process
 * @fd:                the file descriptor number to get
 * @flags:        flags on how to get the fd (reserved, must be 0)
 *
 * This syscall gets a copy of a file descriptor from another process
 * based on the pidfd, and file descriptor number. It requires that
 * the calling process has the ability to ptrace the process represented
 * by the pidfd. The process which is having its file descriptor copied
 * is otherwise unaffected.
 *
 * Return: On success, a cloexec file descriptor is returned.
 *         On error, a negative errno number will be returned: -EINVAL for
 *         non-zero @flags, -EBADF when @pidfd is not an open descriptor,
 *         otherwise the error from pidfd_pid() or pidfd_getfd().
 */
SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd,
                unsigned int, flags)
{
        struct pid *target;

        /* No flags are defined yet, so insist on zero. */
        if (flags != 0)
                return -EINVAL;

        CLASS(fd, f)(pidfd);
        if (fd_empty(f))
                return -EBADF;

        target = pidfd_pid(fd_file(f));
        if (IS_ERR(target))
                return PTR_ERR(target);

        return pidfd_getfd(target, fd);
}
















































































































































































































































































































































































































































































    6 



















    6 





























    5 
    5 







    5 
















    7 











    6 







    5 






    5 





    2 



    5 

    5 



    4 
    4 







    4 








    6 



    6 


    6 



    4 



    4 









    4 












    6 







    6 



    4 








    6 

















    6 




    6 
















    4 




    4 










































    6 
    6 







    6 







































    6 
    6 

    6 



    4 
    4 




    6 







    6 











    8 








































































    4 
    4 






























    6 





    4 

    6 

    2 





















    4 


    4 




    4 


    4 
    4 
















    8 


















    8 





























    8 












































    8 

























    8 



    8 






    8 



    8 









    8 











    6 
    7 








    8 
































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
// SPDX-License-Identifier: GPL-2.0-only
/*
 * ARMv8 PMUv3 Performance Events handling code.
 *
 * Copyright (C) 2012 ARM Limited
 * Author: Will Deacon <will.deacon@arm.com>
 *
 * This code is based heavily on the ARMv7 perf event code.
 */

#include <asm/irq_regs.h>
#include <asm/perf_event.h>
#include <asm/virt.h>

#include <clocksource/arm_arch_timer.h>

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/clocksource.h>
#include <linux/of.h>
#include <linux/perf/arm_pmu.h>
#include <linux/perf/arm_pmuv3.h>
#include <linux/platform_device.h>
#include <linux/sched_clock.h>
#include <linux/smp.h>
#include <linux/nmi.h>

/*
 * ARMv8 Cortex-A53 specific event types.
 * Used by armv8_a53_perf_cache_map below.
 */
#define ARMV8_A53_PERFCTR_PREF_LINEFILL                                0xC2

/*
 * ARMv8 Cavium ThunderX specific event types.
 * Used by armv8_thunder_perf_cache_map below for the L1D store-miss and
 * the L1D/L1I prefetch access/miss slots.
 */
#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST                        0xE9
#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS                0xEA
#define ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS                0xEB
#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS                0xEC
#define ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS                0xED

/*
 * ARMv8 Architectural defined events, not all of these may
 * be supported on any given implementation. Unsupported events will
 * be disabled at run-time based on the PMCEID registers.
 *
 * The table is indexed by the generic PERF_COUNT_HW_* id and holds the
 * PMUv3 event number to program for it. PERF_MAP_ALL_UNSUPPORTED must stay
 * the first initializer: the designated entries after it override
 * individual slots (presumably it marks every slot unsupported; the macro
 * is defined outside this file).
 */
static const unsigned armv8_pmuv3_perf_map[PERF_COUNT_HW_MAX] = {
        PERF_MAP_ALL_UNSUPPORTED,
        [PERF_COUNT_HW_CPU_CYCLES]                = ARMV8_PMUV3_PERFCTR_CPU_CYCLES,
        [PERF_COUNT_HW_INSTRUCTIONS]                = ARMV8_PMUV3_PERFCTR_INST_RETIRED,
        [PERF_COUNT_HW_CACHE_REFERENCES]        = ARMV8_PMUV3_PERFCTR_L1D_CACHE,
        [PERF_COUNT_HW_CACHE_MISSES]                = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,
        [PERF_COUNT_HW_BRANCH_MISSES]                = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
        [PERF_COUNT_HW_BUS_CYCLES]                = ARMV8_PMUV3_PERFCTR_BUS_CYCLES,
        [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]        = ARMV8_PMUV3_PERFCTR_STALL_FRONTEND,
        [PERF_COUNT_HW_STALLED_CYCLES_BACKEND]        = ARMV8_PMUV3_PERFCTR_STALL_BACKEND,
};

/*
 * Generic PMUv3 mapping from perf cache events, indexed as
 * [cache][operation][result], to architected PMUv3 event numbers.
 * Only read accesses/misses are mapped here; every slot not listed keeps
 * the value given by PERF_CACHE_MAP_ALL_UNSUPPORTED, which therefore has to
 * remain the first initializer.
 */
static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
                                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                                [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
        PERF_CACHE_MAP_ALL_UNSUPPORTED,

        [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_PMUV3_PERFCTR_L1D_CACHE,
        [C(L1D)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,

        [C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_PMUV3_PERFCTR_L1I_CACHE,
        [C(L1I)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL,

        [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL,
        [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_PMUV3_PERFCTR_L1D_TLB,

        [C(ITLB)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL,
        [C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_PMUV3_PERFCTR_L1I_TLB,

        [C(LL)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD,
        [C(LL)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_PMUV3_PERFCTR_LL_CACHE_RD,

        [C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_PMUV3_PERFCTR_BR_PRED,
        [C(BPU)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
};

/*
 * Cortex-A53 cache event mapping: the L1D prefetch-miss slot uses the
 * A53-specific PREF_LINEFILL event and the NODE read/write accesses use the
 * IMPDEF bus access events. All other slots keep the value set by
 * PERF_CACHE_MAP_ALL_UNSUPPORTED (which must stay first).
 */
static const unsigned armv8_a53_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
                                              [PERF_COUNT_HW_CACHE_OP_MAX]
                                              [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
        PERF_CACHE_MAP_ALL_UNSUPPORTED,

        [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_A53_PERFCTR_PREF_LINEFILL,

        [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
        [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
};

/*
 * Cortex-A57 cache event mapping: separate read/write IMPDEF events for L1D
 * accesses and refills, DTLB refills, and NODE (bus) accesses. All other
 * slots keep the value set by PERF_CACHE_MAP_ALL_UNSUPPORTED (which must
 * stay first).
 */
static const unsigned armv8_a57_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
                                              [PERF_COUNT_HW_CACHE_OP_MAX]
                                              [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
        PERF_CACHE_MAP_ALL_UNSUPPORTED,

        [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
        [C(L1D)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
        [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
        [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,

        [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
        [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,

        [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
        [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
};

/*
 * Cortex-A73 cache event mapping: only the L1D read and write access slots
 * are mapped (to the IMPDEF L1D_CACHE_RD/WR events). All other slots keep
 * the value set by PERF_CACHE_MAP_ALL_UNSUPPORTED (which must stay first).
 */
static const unsigned armv8_a73_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
                                              [PERF_COUNT_HW_CACHE_OP_MAX]
                                              [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
        PERF_CACHE_MAP_ALL_UNSUPPORTED,

        [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
        [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
};

/*
 * Cavium ThunderX cache event mapping. L1D read/write accesses and the read
 * miss use IMPDEF events; the L1D write miss and the L1D/L1I prefetch
 * access/miss slots use the ThunderX-specific events defined at the top of
 * this file. DTLB read/write accesses and refills use IMPDEF events. All
 * other slots keep the value set by PERF_CACHE_MAP_ALL_UNSUPPORTED (which
 * must stay first).
 */
static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
                                                   [PERF_COUNT_HW_CACHE_OP_MAX]
                                                   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
        PERF_CACHE_MAP_ALL_UNSUPPORTED,

        [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
        [C(L1D)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
        [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
        [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]        = ARMV8_THUNDER_PERFCTR_L1D_CACHE_MISS_ST,
        [C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_ACCESS,
        [C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1D_CACHE_PREF_MISS,

        [C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_ACCESS,
        [C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)] = ARMV8_THUNDER_PERFCTR_L1I_CACHE_PREF_MISS,

        [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
        [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
        [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
        [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,
};

/*
 * Vulcan cache event mapping: IMPDEF read/write events for L1D accesses and
 * refills, DTLB accesses and refills, and NODE (bus) accesses. All other
 * slots keep the value set by PERF_CACHE_MAP_ALL_UNSUPPORTED (which must
 * stay first).
 */
static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
                                              [PERF_COUNT_HW_CACHE_OP_MAX]
                                              [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
        PERF_CACHE_MAP_ALL_UNSUPPORTED,

        [C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD,
        [C(L1D)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_RD,
        [C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR,
        [C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_WR,

        [C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_RD,
        [C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_WR,
        [C(DTLB)][C(OP_READ)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_RD,
        [C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]        = ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_WR,

        [C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]        = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_RD,
        [C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_WR,
};

/*
 * sysfs show() callback shared by every entry created with
 * ARMV8_EVENT_ATTR(): prints the event number stored in the enclosing
 * perf_pmu_events_attr as "event=0x<id>\n".
 *
 * @dev:  device owning the attribute (unused here)
 * @attr: device attribute embedded in a struct perf_pmu_events_attr
 * @page: sysfs output buffer
 *
 * Returns the number of bytes written to @page.
 *
 * sysfs_emit() is used instead of a raw sprintf() so the output is bounded
 * to the sysfs buffer, consistent with the other show() callbacks in this
 * file.
 */
static ssize_t
armv8pmu_events_sysfs_show(struct device *dev,
                           struct device_attribute *attr, char *page)
{
        struct perf_pmu_events_attr *pmu_attr;

        pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);

        return sysfs_emit(page, "event=0x%04llx\n", pmu_attr->id);
}

/*
 * Build one sysfs event attribute called @name whose id is @config and whose
 * show() callback is armv8pmu_events_sysfs_show(). PMU_EVENT_ATTR_ID is
 * defined outside this file; the results are stored directly in a
 * struct attribute * array below.
 */
#define ARMV8_EVENT_ATTR(name, config)                                                \
        PMU_EVENT_ATTR_ID(name, armv8pmu_events_sysfs_show, config)

/*
 * NULL-terminated list of the named events offered through the "events"
 * attribute group. Each entry pairs a sysfs name with an event number;
 * whether a given entry is actually shown for a PMU is decided by
 * armv8pmu_event_attr_is_visible().
 */
static struct attribute *armv8_pmuv3_event_attrs[] = {
        /*
         * Don't expose the sw_incr event in /sys. It's not usable as writes to
         * PMSWINC_EL0 will trap as PMUSERENR.{SW,EN}=={0,0} and event rotation
         * means we don't have a fixed event<->counter relationship regardless.
         */
        ARMV8_EVENT_ATTR(l1i_cache_refill, ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL),
        ARMV8_EVENT_ATTR(l1i_tlb_refill, ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL),
        ARMV8_EVENT_ATTR(l1d_cache_refill, ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL),
        ARMV8_EVENT_ATTR(l1d_cache, ARMV8_PMUV3_PERFCTR_L1D_CACHE),
        ARMV8_EVENT_ATTR(l1d_tlb_refill, ARMV8_PMUV3_PERFCTR_L1D_TLB_REFILL),
        ARMV8_EVENT_ATTR(ld_retired, ARMV8_PMUV3_PERFCTR_LD_RETIRED),
        ARMV8_EVENT_ATTR(st_retired, ARMV8_PMUV3_PERFCTR_ST_RETIRED),
        ARMV8_EVENT_ATTR(inst_retired, ARMV8_PMUV3_PERFCTR_INST_RETIRED),
        ARMV8_EVENT_ATTR(exc_taken, ARMV8_PMUV3_PERFCTR_EXC_TAKEN),
        ARMV8_EVENT_ATTR(exc_return, ARMV8_PMUV3_PERFCTR_EXC_RETURN),
        ARMV8_EVENT_ATTR(cid_write_retired, ARMV8_PMUV3_PERFCTR_CID_WRITE_RETIRED),
        ARMV8_EVENT_ATTR(pc_write_retired, ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED),
        ARMV8_EVENT_ATTR(br_immed_retired, ARMV8_PMUV3_PERFCTR_BR_IMMED_RETIRED),
        ARMV8_EVENT_ATTR(br_return_retired, ARMV8_PMUV3_PERFCTR_BR_RETURN_RETIRED),
        ARMV8_EVENT_ATTR(unaligned_ldst_retired, ARMV8_PMUV3_PERFCTR_UNALIGNED_LDST_RETIRED),
        ARMV8_EVENT_ATTR(br_mis_pred, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED),
        ARMV8_EVENT_ATTR(cpu_cycles, ARMV8_PMUV3_PERFCTR_CPU_CYCLES),
        ARMV8_EVENT_ATTR(br_pred, ARMV8_PMUV3_PERFCTR_BR_PRED),
        ARMV8_EVENT_ATTR(mem_access, ARMV8_PMUV3_PERFCTR_MEM_ACCESS),
        ARMV8_EVENT_ATTR(l1i_cache, ARMV8_PMUV3_PERFCTR_L1I_CACHE),
        ARMV8_EVENT_ATTR(l1d_cache_wb, ARMV8_PMUV3_PERFCTR_L1D_CACHE_WB),
        ARMV8_EVENT_ATTR(l2d_cache, ARMV8_PMUV3_PERFCTR_L2D_CACHE),
        ARMV8_EVENT_ATTR(l2d_cache_refill, ARMV8_PMUV3_PERFCTR_L2D_CACHE_REFILL),
        ARMV8_EVENT_ATTR(l2d_cache_wb, ARMV8_PMUV3_PERFCTR_L2D_CACHE_WB),
        ARMV8_EVENT_ATTR(bus_access, ARMV8_PMUV3_PERFCTR_BUS_ACCESS),
        ARMV8_EVENT_ATTR(memory_error, ARMV8_PMUV3_PERFCTR_MEMORY_ERROR),
        ARMV8_EVENT_ATTR(inst_spec, ARMV8_PMUV3_PERFCTR_INST_SPEC),
        ARMV8_EVENT_ATTR(ttbr_write_retired, ARMV8_PMUV3_PERFCTR_TTBR_WRITE_RETIRED),
        ARMV8_EVENT_ATTR(bus_cycles, ARMV8_PMUV3_PERFCTR_BUS_CYCLES),
        /* Don't expose the chain event in /sys, since it's useless in isolation */
        ARMV8_EVENT_ATTR(l1d_cache_allocate, ARMV8_PMUV3_PERFCTR_L1D_CACHE_ALLOCATE),
        ARMV8_EVENT_ATTR(l2d_cache_allocate, ARMV8_PMUV3_PERFCTR_L2D_CACHE_ALLOCATE),
        ARMV8_EVENT_ATTR(br_retired, ARMV8_PMUV3_PERFCTR_BR_RETIRED),
        ARMV8_EVENT_ATTR(br_mis_pred_retired, ARMV8_PMUV3_PERFCTR_BR_MIS_PRED_RETIRED),
        ARMV8_EVENT_ATTR(stall_frontend, ARMV8_PMUV3_PERFCTR_STALL_FRONTEND),
        ARMV8_EVENT_ATTR(stall_backend, ARMV8_PMUV3_PERFCTR_STALL_BACKEND),
        ARMV8_EVENT_ATTR(l1d_tlb, ARMV8_PMUV3_PERFCTR_L1D_TLB),
        ARMV8_EVENT_ATTR(l1i_tlb, ARMV8_PMUV3_PERFCTR_L1I_TLB),
        ARMV8_EVENT_ATTR(l2i_cache, ARMV8_PMUV3_PERFCTR_L2I_CACHE),
        ARMV8_EVENT_ATTR(l2i_cache_refill, ARMV8_PMUV3_PERFCTR_L2I_CACHE_REFILL),
        ARMV8_EVENT_ATTR(l3d_cache_allocate, ARMV8_PMUV3_PERFCTR_L3D_CACHE_ALLOCATE),
        ARMV8_EVENT_ATTR(l3d_cache_refill, ARMV8_PMUV3_PERFCTR_L3D_CACHE_REFILL),
        ARMV8_EVENT_ATTR(l3d_cache, ARMV8_PMUV3_PERFCTR_L3D_CACHE),
        ARMV8_EVENT_ATTR(l3d_cache_wb, ARMV8_PMUV3_PERFCTR_L3D_CACHE_WB),
        ARMV8_EVENT_ATTR(l2d_tlb_refill, ARMV8_PMUV3_PERFCTR_L2D_TLB_REFILL),
        ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL),
        ARMV8_EVENT_ATTR(l2d_tlb, ARMV8_PMUV3_PERFCTR_L2D_TLB),
        ARMV8_EVENT_ATTR(l2i_tlb, ARMV8_PMUV3_PERFCTR_L2I_TLB),
        ARMV8_EVENT_ATTR(remote_access, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS),
        ARMV8_EVENT_ATTR(ll_cache, ARMV8_PMUV3_PERFCTR_LL_CACHE),
        ARMV8_EVENT_ATTR(ll_cache_miss, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS),
        ARMV8_EVENT_ATTR(dtlb_walk, ARMV8_PMUV3_PERFCTR_DTLB_WALK),
        ARMV8_EVENT_ATTR(itlb_walk, ARMV8_PMUV3_PERFCTR_ITLB_WALK),
        ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD),
        ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD),
        ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD),
        ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD),
        ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED),
        ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC),
        ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL),
        ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND),
        ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND),
        ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT),
        ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP),
        ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED),
        ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE),
        ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION),
        ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES),
        ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM),
        ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS),
        ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD),
        ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS),
        ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD),
        ARMV8_EVENT_ATTR(trb_wrap, ARMV8_PMUV3_PERFCTR_TRB_WRAP),
        ARMV8_EVENT_ATTR(trb_trig, ARMV8_PMUV3_PERFCTR_TRB_TRIG),
        ARMV8_EVENT_ATTR(trcextout0, ARMV8_PMUV3_PERFCTR_TRCEXTOUT0),
        ARMV8_EVENT_ATTR(trcextout1, ARMV8_PMUV3_PERFCTR_TRCEXTOUT1),
        ARMV8_EVENT_ATTR(trcextout2, ARMV8_PMUV3_PERFCTR_TRCEXTOUT2),
        ARMV8_EVENT_ATTR(trcextout3, ARMV8_PMUV3_PERFCTR_TRCEXTOUT3),
        ARMV8_EVENT_ATTR(cti_trigout4, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT4),
        ARMV8_EVENT_ATTR(cti_trigout5, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT5),
        ARMV8_EVENT_ATTR(cti_trigout6, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT6),
        ARMV8_EVENT_ATTR(cti_trigout7, ARMV8_PMUV3_PERFCTR_CTI_TRIGOUT7),
        ARMV8_EVENT_ATTR(ldst_align_lat, ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT),
        ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT),
        ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT),
        ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED),
        ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD),
        ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR),
        NULL,
};

/*
 * is_visible() callback for the "events" attribute group.
 *
 * An event attribute keeps its mode when its id is below
 * ARMV8_PMUV3_MAX_COMMON_EVENTS and set in the PMU's pmceid_bitmap, or when
 * its id, rebased by ARMV8_PMUV3_EXT_COMMON_EVENT_BASE, is below that same
 * limit and set in pmceid_ext_bitmap. In every other case 0 is returned.
 */
static umode_t
armv8pmu_event_attr_is_visible(struct kobject *kobj,
                               struct attribute *attr, int unused)
{
        struct perf_pmu_events_attr *ev_attr =
                container_of(attr, struct perf_pmu_events_attr, attr.attr);
        struct pmu *perf_pmu = dev_get_drvdata(kobj_to_dev(kobj));
        struct arm_pmu *cpu_pmu = container_of(perf_pmu, struct arm_pmu, pmu);
        umode_t mode = 0;

        if (ev_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
            test_bit(ev_attr->id, cpu_pmu->pmceid_bitmap)) {
                /* Advertised in the base common-event range. */
                mode = attr->mode;
        } else if (ev_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) {
                /* Extended range: the bitmap is indexed from the base. */
                u64 ext_id = ev_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE;

                if (ext_id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
                    test_bit(ext_id, cpu_pmu->pmceid_ext_bitmap))
                        mode = attr->mode;
        }

        return mode;
}

/*
 * "events" sysfs group: lists armv8_pmuv3_event_attrs, with each entry's
 * visibility filtered by armv8pmu_event_attr_is_visible().
 */
static const struct attribute_group armv8_pmuv3_events_attr_group = {
        .name = "events",
        .attrs = armv8_pmuv3_event_attrs,
        .is_visible = armv8pmu_event_attr_is_visible,
};

/*
 * User ABI
 *
 * Layout of the user-visible fields inside perf_event_attr. For a field
 * <f>, ATTR_CFG_FLD_<f>_CFG names the attr member that holds it and
 * ATTR_CFG_FLD_<f>_LO / _HI give its lowest and highest bit (single-bit
 * fields have LO == HI). The fields are read back with
 * ATTR_CFG_GET_FLD(attr, <f>).
 */
#define ATTR_CFG_FLD_event_CFG                config
#define ATTR_CFG_FLD_event_LO                0
#define ATTR_CFG_FLD_event_HI                15
#define ATTR_CFG_FLD_long_CFG                config1
#define ATTR_CFG_FLD_long_LO                0
#define ATTR_CFG_FLD_long_HI                0
#define ATTR_CFG_FLD_rdpmc_CFG                config1
#define ATTR_CFG_FLD_rdpmc_LO                1
#define ATTR_CFG_FLD_rdpmc_HI                1
#define ATTR_CFG_FLD_threshold_count_CFG        config1 /* PMEVTYPER.TC[0] */
#define ATTR_CFG_FLD_threshold_count_LO                2
#define ATTR_CFG_FLD_threshold_count_HI                2
#define ATTR_CFG_FLD_threshold_compare_CFG        config1 /* PMEVTYPER.TC[2:1] */
#define ATTR_CFG_FLD_threshold_compare_LO        3
#define ATTR_CFG_FLD_threshold_compare_HI        4
#define ATTR_CFG_FLD_threshold_CFG                config1 /* PMEVTYPER.TH */
#define ATTR_CFG_FLD_threshold_LO                5
#define ATTR_CFG_FLD_threshold_HI                16

/*
 * One format attribute per field above; these are the format_attr_<f>
 * objects referenced from armv8_pmuv3_format_attrs.
 */
GEN_PMU_FORMAT_ATTR(event);
GEN_PMU_FORMAT_ATTR(long);
GEN_PMU_FORMAT_ATTR(rdpmc);
GEN_PMU_FORMAT_ATTR(threshold_count);
GEN_PMU_FORMAT_ATTR(threshold_compare);
GEN_PMU_FORMAT_ATTR(threshold);

/*
 * NOTE(review): presumably the backing store for a sysctl that gates
 * userspace counter access; its readers/writers are outside this part of
 * the file - confirm before relying on this description.
 */
static int sysctl_perf_user_access __read_mostly;

/* True when the event's "long" format field (config1 bit 0) is set. */
static bool armv8pmu_event_is_64bit(struct perf_event *event)
{
        struct perf_event_attr *ev_attr = &event->attr;

        return ATTR_CFG_GET_FLD(ev_attr, long);
}

/* True when the event's "rdpmc" format field (config1 bit 1) is set. */
static bool armv8pmu_event_want_user_access(struct perf_event *event)
{
        struct perf_event_attr *ev_attr = &event->attr;

        return ATTR_CFG_GET_FLD(ev_attr, rdpmc);
}

/* Extract the "threshold" format field (config1 bits 16:5) from @attr. */
static u32 armv8pmu_event_get_threshold(struct perf_event_attr *attr)
{
        u32 th = ATTR_CFG_GET_FLD(attr, threshold);

        return th;
}

/*
 * Rebuild the full threshold control value from the two fields the perf
 * interface exposes separately.
 *
 * The Arm ARM does not label the sub-fields explicitly, but the count bit
 * is always the lowest bit of the control field and the comparison occupies
 * the two bits above it, so the result is (threshold_compare << 1) |
 * threshold_count.
 */
static u8 armv8pmu_event_threshold_control(struct perf_event_attr *attr)
{
        u8 count_bit = ATTR_CFG_GET_FLD(attr, threshold_count);
        u8 compare_bits = ATTR_CFG_GET_FLD(attr, threshold_compare);
        u8 tc = compare_bits << 1;

        tc |= count_bit;

        return tc;
}

/*
 * NULL-terminated list of the format attributes generated by the
 * GEN_PMU_FORMAT_ATTR() invocations earlier in this file; exposed through
 * the "format" attribute group.
 */
static struct attribute *armv8_pmuv3_format_attrs[] = {
        &format_attr_event.attr,
        &format_attr_long.attr,
        &format_attr_rdpmc.attr,
        &format_attr_threshold.attr,
        &format_attr_threshold_compare.attr,
        &format_attr_threshold_count.attr,
        NULL,
};

/* "format" sysfs group listing armv8_pmuv3_format_attrs (no visibility filter). */
static const struct attribute_group armv8_pmuv3_format_attr_group = {
        .name = "format",
        .attrs = armv8_pmuv3_format_attrs,
};

/*
 * sysfs show() for the "slots" capability: prints the ARMV8_PMU_SLOTS field
 * of the PMU's cached reg_pmmir value as "0x%08x".
 */
static ssize_t slots_show(struct device *dev, struct device_attribute *attr,
                          char *page)
{
        struct pmu *perf_pmu = dev_get_drvdata(dev);
        struct arm_pmu *cpu_pmu;
        u32 nr_slots;

        cpu_pmu = container_of(perf_pmu, struct arm_pmu, pmu);
        nr_slots = FIELD_GET(ARMV8_PMU_SLOTS, cpu_pmu->reg_pmmir);

        return sysfs_emit(page, "0x%08x\n", nr_slots);
}

static DEVICE_ATTR_RO(slots);

/*
 * sysfs show() for the "bus_slots" capability: prints the
 * ARMV8_PMU_BUS_SLOTS field of the PMU's cached reg_pmmir value as
 * "0x%08x".
 */
static ssize_t bus_slots_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct pmu *perf_pmu = dev_get_drvdata(dev);
        struct arm_pmu *cpu_pmu;
        u32 nr_bus_slots;

        cpu_pmu = container_of(perf_pmu, struct arm_pmu, pmu);
        nr_bus_slots = FIELD_GET(ARMV8_PMU_BUS_SLOTS, cpu_pmu->reg_pmmir);

        return sysfs_emit(page, "0x%08x\n", nr_bus_slots);
}

static DEVICE_ATTR_RO(bus_slots);

/*
 * sysfs show() for the "bus_width" capability.
 *
 * The ARMV8_PMU_BUS_WIDTH field of reg_pmmir is encoded as
 * Log2(number of bytes), plus one. Encodings 3..12 are decoded to a byte
 * count; any other encoding is reported as 0. Output format is "0x%08x".
 */
static ssize_t bus_width_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct pmu *perf_pmu = dev_get_drvdata(dev);
        struct arm_pmu *cpu_pmu = container_of(perf_pmu, struct arm_pmu, pmu);
        u32 encoded = FIELD_GET(ARMV8_PMU_BUS_WIDTH, cpu_pmu->reg_pmmir);
        u32 bytes = (encoded > 2 && encoded < 13) ? 1 << (encoded - 1) : 0;

        return sysfs_emit(page, "0x%08x\n", bytes);
}

static DEVICE_ATTR_RO(bus_width);

/*
 * Largest threshold value that can be programmed for events on @cpu_pmu.
 *
 * On CONFIG_ARM builds this is always 0: PMMIR.THWIDTH is readable and
 * non-zero on aarch32, but the threshold lives in the upper 32 bits of
 * PMEVTYPER and so could never be written there.
 *
 * Otherwise the limit for PMEVTYPER<n>_EL0.TH is (2 ^ PMMIR.THWIDTH) - 1.
 */
static u32 threshold_max(struct arm_pmu *cpu_pmu)
{
        u32 thwidth;

        if (IS_ENABLED(CONFIG_ARM))
                return 0;

        thwidth = FIELD_GET(ARMV8_PMU_THWIDTH, cpu_pmu->reg_pmmir);

        return (1 << thwidth) - 1;
}

static ssize_t threshold_max_show(struct device *dev,
                                  struct device_attribute *attr, char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        struct arm_pmu *cpu_pmu = container_of(pmu, struct arm_pmu, pmu);

        return sysfs_emit(page, "0x%08x\n", threshold_max(cpu_pmu));
}

static DEVICE_ATTR_RO(threshold_max);

/*
 * NULL-terminated list of the read-only capability attributes declared
 * above with DEVICE_ATTR_RO(); exposed through the "caps" attribute group.
 */
static struct attribute *armv8_pmuv3_caps_attrs[] = {
        &dev_attr_slots.attr,
        &dev_attr_bus_slots.attr,
        &dev_attr_bus_width.attr,
        &dev_attr_threshold_max.attr,
        NULL,
};

/* "caps" sysfs group listing armv8_pmuv3_caps_attrs (no visibility filter). */
static const struct attribute_group armv8_pmuv3_caps_attr_group = {
        .name = "caps",
        .attrs = armv8_pmuv3_caps_attrs,
};

/*
 * We unconditionally enable ARMv8.5-PMU long event counter support
 * (64-bit events) where supported. Indicate if this arm_pmu has long
 * event counter support.
 *
 * On AArch32, long counters make no sense (you can't access the top
 * bits), so we only enable this on AArch64.
 */
static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu)
{
        return (IS_ENABLED(CONFIG_ARM64) && is_pmuv3p5(cpu_pmu->pmuver));
}

/* True when PERF_EVENT_FLAG_USER_READ_CNT is set in the event's hw.flags. */
static bool armv8pmu_event_has_user_read(struct perf_event *event)
{
        return (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT) != 0;
}

/*
 * We must chain two programmable counters for 64 bit events,
 * except when we have allocated the 64bit cycle counter (for CPU
 * cycles event) or when user space counter access is enabled.
 */
static bool armv8pmu_event_is_chained(struct perf_event *event)
{
        int idx = event->hw.idx;
        struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);

        return !armv8pmu_event_has_user_read(event) &&
               armv8pmu_event_is_64bit(event) &&
               !armv8pmu_has_long_event(cpu_pmu) &&
               (idx < ARMV8_PMU_MAX_GENERAL_COUNTERS);
}

/*
 * ARMv8 low level PMU access
 */

/* Return the current PMCR value as obtained from read_pmcr(). */
static u64 armv8pmu_pmcr_read(void)
{
        u64 pmcr = read_pmcr();

        return pmcr;
}

/*
 * Write @val to PMCR. Bits outside ARMV8_PMU_PMCR_MASK are dropped, and an
 * isb() is issued before the register write.
 */
static void armv8pmu_pmcr_write(u64 val)
{
        u64 pmcr = val & ARMV8_PMU_PMCR_MASK;

        isb();
        write_pmcr(pmcr);
}

/* Return 1 if any bit of ARMV8_PMU_OVERFLOWED_MASK is set in @pmovsr, else 0. */
static int armv8pmu_has_overflowed(u64 pmovsr)
{
        return (pmovsr & ARMV8_PMU_OVERFLOWED_MASK) != 0;
}

/* Return 1 if the bit for counter @idx is set in the overflow flags @pmnc, else 0. */
static int armv8pmu_counter_has_overflowed(u64 pmnc, int idx)
{
        return (pmnc & BIT(idx)) ? 1 : 0;
}

/* Read event counter @idx through read_pmevcntrn(). */
static u64 armv8pmu_read_evcntr(int idx)
{
        u64 count = read_pmevcntrn(idx);

        return count;
}

/*
 * Read the programmable counter(s) backing @event.
 *
 * For a chained event, counter hw.idx supplies the upper 32 bits and
 * counter hw.idx - 1 the lower 32 bits; the upper counter is read first.
 * Otherwise the value of counter hw.idx is returned as-is.
 */
static u64 armv8pmu_read_hw_counter(struct perf_event *event)
{
        const int idx = event->hw.idx;
        u64 count = armv8pmu_read_evcntr(idx);
        u64 low;

        if (!armv8pmu_event_is_chained(event))
                return count;

        low = armv8pmu_read_evcntr(idx - 1);

        return (count << 32) | low;
}

/*
 * The cycle counter is always 64 bits wide, and with ARMV8_PMU_PMCR_LP set
 * the event counters are 64 bits wide as well. Unless the user asked for a
 * long counter (attr.config1), we still want an interrupt on 32-bit
 * overflow, which is achieved by biasing the counter value.
 *
 * A bias is therefore needed for events that are not 64-bit and that either
 * run on a PMU with long event counters or use an index at or above
 * ARMV8_PMU_MAX_GENERAL_COUNTERS.
 */
static bool armv8pmu_event_needs_bias(struct perf_event *event)
{
        struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
        int idx = event->hw.idx;

        return !armv8pmu_event_is_64bit(event) &&
               (armv8pmu_has_long_event(cpu_pmu) ||
                idx >= ARMV8_PMU_MAX_GENERAL_COUNTERS);
}

/*
 * Apply the 32-bit overflow bias: when @event needs it, set bits 63:32 of
 * @value; otherwise return @value unchanged.
 */
static u64 armv8pmu_bias_long_counter(struct perf_event *event, u64 value)
{
        if (!armv8pmu_event_needs_bias(event))
                return value;

        return value | GENMASK_ULL(63, 32);
}

/*
 * Remove the 32-bit overflow bias: when @event needs it, clear bits 63:32
 * of @value; otherwise return @value unchanged.
 */
static u64 armv8pmu_unbias_long_counter(struct perf_event *event, u64 value)
{
        if (!armv8pmu_event_needs_bias(event))
                return value;

        return value & ~GENMASK_ULL(63, 32);
}

/*
 * Read the current count for @event with any overflow bias removed.
 *
 * The source depends on the allocated index: ARMV8_PMU_CYCLE_IDX reads
 * through read_pmccntr(), ARMV8_PMU_INSTR_IDX through read_pmicntr(), and
 * every other index through armv8pmu_read_hw_counter().
 */
static u64 armv8pmu_read_counter(struct perf_event *event)
{
        u64 raw;

        switch (event->hw.idx) {
        case ARMV8_PMU_CYCLE_IDX:
                raw = read_pmccntr();
                break;
        case ARMV8_PMU_INSTR_IDX:
                raw = read_pmicntr();
                break;
        default:
                raw = armv8pmu_read_hw_counter(event);
                break;
        }

        return armv8pmu_unbias_long_counter(event, raw);
}

/* Write @value to event counter @idx through write_pmevcntrn(). */
static void armv8pmu_write_evcntr(int idx, u64 value)
{
        write_pmevcntrn(idx, value);
}

/*
 * Program the programmable counter(s) backing @event with @value.
 *
 * For a chained event the upper 32 bits go to counter hw.idx and the lower
 * 32 bits to counter hw.idx - 1, written in that order. Otherwise @value is
 * written to counter hw.idx unchanged.
 */
static void armv8pmu_write_hw_counter(struct perf_event *event,
                                             u64 value)
{
        const int idx = event->hw.idx;

        if (!armv8pmu_event_is_chained(event)) {
                armv8pmu_write_evcntr(idx, value);
                return;
        }

        armv8pmu_write_evcntr(idx, upper_32_bits(value));
        armv8pmu_write_evcntr(idx - 1, lower_32_bits(value));
}

/*
 * Set the count for @event to @value, applying the overflow bias first.
 *
 * The destination depends on the allocated index: ARMV8_PMU_CYCLE_IDX
 * writes through write_pmccntr(), ARMV8_PMU_INSTR_IDX through
 * write_pmicntr(), and every other index through
 * armv8pmu_write_hw_counter().
 */
static void armv8pmu_write_counter(struct perf_event *event, u64 value)
{
        u64 biased = armv8pmu_bias_long_counter(event, value);

        switch (event->hw.idx) {
        case ARMV8_PMU_CYCLE_IDX:
                write_pmccntr(biased);
                break;
        case ARMV8_PMU_INSTR_IDX:
                write_pmicntr(biased);
                break;
        default:
                armv8pmu_write_hw_counter(event, biased);
                break;
        }
}

/*
 * Program the event type register of counter @idx with @val.
 *
 * Only the event number and the INCLUDE_EL2 / EXCLUDE_EL0 / EXCLUDE_EL1
 * filter bits are passed through; on CONFIG_ARM64 builds the threshold
 * control (TC) and threshold (TH) fields are passed through as well.
 * Everything else in @val is masked off before the write.
 */
static void armv8pmu_write_evtype(int idx, unsigned long val)
{
        unsigned long allowed = ARMV8_PMU_EVTYPE_EVENT;

        allowed |= ARMV8_PMU_INCLUDE_EL2;
        allowed |= ARMV8_PMU_EXCLUDE_EL0 | ARMV8_PMU_EXCLUDE_EL1;

        if (IS_ENABLED(CONFIG_ARM64))
                allowed |= ARMV8_PMU_EVTYPE_TC | ARMV8_PMU_EVTYPE_TH;

        write_pmevtypern(idx, val & allowed);
}

/*
 * Program the event selection / filter register(s) for @event from
 * hw.config_base.
 *
 * Chained events use two counters: the low one (hw.idx - 1) counts the
 * event of interest, and the high one (hw.idx) is programmed with the CHAIN
 * event code with filters set to count at all ELs.
 *
 * Non-chained events write a single register, selected by index: the cycle
 * counter filter for ARMV8_PMU_CYCLE_IDX, the instruction counter filter
 * for ARMV8_PMU_INSTR_IDX, and the event type register otherwise.
 */
static void armv8pmu_write_event_type(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        int idx = hwc->idx;

        if (armv8pmu_event_is_chained(event)) {
                u32 chain_evt = ARMV8_PMUV3_PERFCTR_CHAIN |
                                ARMV8_PMU_INCLUDE_EL2;

                armv8pmu_write_evtype(idx - 1, hwc->config_base);
                armv8pmu_write_evtype(idx, chain_evt);
                return;
        }

        switch (idx) {
        case ARMV8_PMU_CYCLE_IDX:
                write_pmccfiltr(hwc->config_base);
                break;
        case ARMV8_PMU_INSTR_IDX:
                write_pmicfiltr(hwc->config_base);
                break;
        default:
                armv8pmu_write_evtype(idx, hwc->config_base);
                break;
        }
}

/*
 * Build the counter bitmask for @event: the bit for hw.idx, plus the bit
 * for hw.idx - 1 when the event is chained.
 */
static u64 armv8pmu_event_cnten_mask(struct perf_event *event)
{
        const int idx = event->hw.idx;
        u64 cnten = BIT(idx);

        if (!armv8pmu_event_is_chained(event))
                return cnten;

        return cnten | BIT(idx - 1);
}

/* Enable the counters selected by @mask via write_pmcntenset(). */
static void armv8pmu_enable_counter(u64 mask)
{
        /*
         * Make sure event configuration register writes are visible before we
         * enable the counter.
         */
        isb();
        write_pmcntenset(mask);
}

/*
 * Enable the counter(s) used by @event.
 *
 * The counter mask is always handed to KVM through kvm_set_pmu_events().
 * The hardware enable is skipped when kvm_pmu_counter_deferred() says so,
 * because guest counters are enabled by the hypervisor switch code.
 */
static void armv8pmu_enable_event_counter(struct perf_event *event)
{
        struct perf_event_attr *ev_attr = &event->attr;
        u64 cnten = armv8pmu_event_cnten_mask(event);

        kvm_set_pmu_events(cnten, ev_attr);

        if (kvm_pmu_counter_deferred(ev_attr))
                return;

        armv8pmu_enable_counter(cnten);
}

/* Disable the counters selected by @mask via write_pmcntenclr(). */
static void armv8pmu_disable_counter(u64 mask)
{
        write_pmcntenclr(mask);
        /*
         * Make sure the effects of disabling the counter are visible before we
         * start configuring the event.
         */
        isb();
}

/*
 * Disable the counter(s) used by @event.
 *
 * The counter mask is always cleared on the KVM side through
 * kvm_clr_pmu_events(). The hardware disable is skipped when
 * kvm_pmu_counter_deferred() says so, because guest counters are disabled
 * by the hypervisor switch code.
 */
static void armv8pmu_disable_event_counter(struct perf_event *event)
{
        struct perf_event_attr *ev_attr = &event->attr;
        u64 cnten = armv8pmu_event_cnten_mask(event);

        kvm_clr_pmu_events(cnten);

        if (kvm_pmu_counter_deferred(ev_attr))
                return;

        armv8pmu_disable_counter(cnten);
}

/* Enable interrupts for the counters in @mask via write_pmintenset(). */
static void armv8pmu_enable_intens(u64 mask)
{
        write_pmintenset(mask);
}

static void armv8pmu_enable_event_irq(struct perf_event *event)
{
        armv8pmu_enable_intens(BIT(event->hw.idx));
}

/*
 * Disable interrupts for the counters in @mask via write_pmintenclr(),
 * then clear the same counters' overflow flags via write_pmovsclr().
 * Each of the two register writes is followed by an isb().
 */
static void armv8pmu_disable_intens(u64 mask)
{
        write_pmintenclr(mask);
        isb();
        /* Clear the overflow flag in case an interrupt is pending. */
        write_pmovsclr(mask);
        isb();
}

static void armv8pmu_disable_event_irq(struct perf_event *event)
{
        armv8pmu_disable_intens(BIT(event->hw.idx));
}

/*
 * Fetch the overflow flags and clear them in one go.
 *
 * The value read with read_pmovsclr() is restricted to
 * ARMV8_PMU_OVERFLOWED_MASK, written back to clear exactly those flags,
 * and returned to the caller.
 */
static u64 armv8pmu_getreset_flags(void)
{
        u64 overflowed = read_pmovsclr() & ARMV8_PMU_OVERFLOWED_MASK;

        /* Writing the set bits back clears them */
        write_pmovsclr(overflowed);

        return overflowed;
}

/*
 * Set the host EL0 value of PMUSERENR_EL0 to @val. Must be called with
 * interrupts disabled.
 */
static void update_pmuserenr(u64 val)
{
        lockdep_assert_irqs_disabled();

        /*
         * The register may currently hold the guest's value. In that case
         * kvm_set_pmuserenr() returns true and KVM keeps track of @val so
         * it can restore it before returning to the host EL0; only when it
         * returns false do we write the register ourselves.
         */
        if (!kvm_set_pmuserenr(val))
                write_pmuserenr(val);
}

/* Revoke userspace PMU access by setting the PMUSERENR value to 0. */
static void armv8pmu_disable_user_access(void)
{
        update_pmuserenr(0);
}

/*
 * Turn on userspace access to the PMU on the current CPU.
 *
 * When is_pmuv3p9() holds, write_pmuacr() is given a mask of the in-use
 * counters whose events allow user reads. Otherwise every implemented but
 * unused counter is zeroed first. Finally the ER, CR and UEN bits are set
 * through update_pmuserenr().
 */
static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
{
        struct pmu_hw_events *pcpu = this_cpu_ptr(cpu_pmu->hw_events);
        int idx;

        if (!is_pmuv3p9(cpu_pmu->pmuver)) {
                /* Clear any unused counters to avoid leaking their contents */
                for_each_andnot_bit(idx, cpu_pmu->cntr_mask, pcpu->used_mask,
                                    ARMPMU_MAX_HWEVENTS) {
                        if (idx == ARMV8_PMU_CYCLE_IDX)
                                write_pmccntr(0);
                        else if (idx == ARMV8_PMU_INSTR_IDX)
                                write_pmicntr(0);
                        else
                                armv8pmu_write_evcntr(idx, 0);
                }
        } else {
                u64 user_readable = 0;

                for_each_set_bit(idx, pcpu->used_mask, ARMPMU_MAX_HWEVENTS) {
                        if (armv8pmu_event_has_user_read(pcpu->events[idx]))
                                user_readable |= BIT(idx);
                }
                write_pmuacr(user_readable);
        }

        update_pmuserenr(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_UEN);
}

/*
 * Set up and start @event: write its event type, enable its interrupt,
 * then enable its counter(s), in that order.
 */
static void armv8pmu_enable_event(struct perf_event *event)
{
        armv8pmu_write_event_type(event);
        armv8pmu_enable_event_irq(event);
        armv8pmu_enable_event_counter(event);
}

/*
 * Stop @event: disable its counter(s) first, then disable its interrupt
 * (the reverse of the enable order used by armv8pmu_enable_event()).
 */
static void armv8pmu_disable_event(struct perf_event *event)
{
        armv8pmu_disable_event_counter(event);
        armv8pmu_disable_event_irq(event);
}

static void armv8pmu_start(struct arm_pmu *cpu_pmu)
{
        struct perf_event_context *ctx;
        int nr_user = 0;

        ctx = perf_cpu_task_ctx();
        if (ctx)
                nr_user = ctx->nr_user;

        if (sysctl_perf_user_access && nr_user)
                armv8pmu_enable_user_access(cpu_pmu);
        else
                armv8pmu_disable_user_access();

        kvm_vcpu_pmu_resync_el0();

        /* Enable all counters */
        armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
}

static void armv8pmu_stop(struct arm_pmu *cpu_pmu)
{
        /* Disable all counters */
        armv8pmu_pmcr_write(armv8pmu_pmcr_read() & ~ARMV8_PMU_PMCR_E);
}

/*
 * PMU interrupt handler.
 *
 * Reads and clears the overflow flags, and returns IRQ_NONE if none of
 * them indicates an overflow. Otherwise the PMU is stopped, every counter
 * in cpu_pmu->cntr_mask that has an event and an overflow flag set is
 * updated, re-armed and reported through perf_event_overflow(), and the
 * PMU is started again before returning IRQ_HANDLED.
 */
static irqreturn_t armv8pmu_handle_irq(struct arm_pmu *cpu_pmu)
{
        u64 pmovsr;
        struct perf_sample_data data;
        struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
        struct pt_regs *regs;
        int idx;

        /*
         * Get and reset the IRQ flags
         */
        pmovsr = armv8pmu_getreset_flags();

        /*
         * Did an overflow occur?
         */
        if (!armv8pmu_has_overflowed(pmovsr))
                return IRQ_NONE;

        /*
         * Handle the counter(s) overflow(s)
         */
        regs = get_irq_regs();

        /*
         * Stop the PMU while processing the counter overflows
         * to prevent skews in group events.
         */
        armv8pmu_stop(cpu_pmu);
        for_each_set_bit(idx, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS) {
                struct perf_event *event = cpuc->events[idx];
                struct hw_perf_event *hwc;

                /* Ignore if we don't have an event. */
                if (!event)
                        continue;

                /*
                 * We have a single interrupt for all counters. Check that
                 * each counter has overflowed before we process it.
                 */
                if (!armv8pmu_counter_has_overflowed(pmovsr, idx))
                        continue;

                hwc = &event->hw;
                /* Fold the counter value into the event before re-arming it. */
                armpmu_event_update(event);
                perf_sample_data_init(&data, 0, hwc->last_period);
                /* Skip reporting when armpmu_event_set_period() returns 0. */
                if (!armpmu_event_set_period(event))
                        continue;

                /*
                 * Perf event overflow will queue the processing of the event as
                 * an irq_work which will be taken care of in the handling of
                 * IPI_IRQ_WORK.
                 */
                /* A non-zero return from perf_event_overflow() disables the event. */
                if (perf_event_overflow(event, &data, regs))
                        cpu_pmu->disable(event);
        }
        armv8pmu_start(cpu_pmu);

        return IRQ_HANDLED;
}

/*
 * Claim the first free general-purpose counter.
 *
 * Walks the implemented counters below ARMV8_PMU_MAX_GENERAL_COUNTERS and
 * atomically marks the first one not yet set in cpuc->used_mask. Returns
 * its index, or -EAGAIN when all of them are taken.
 */
static int armv8pmu_get_single_idx(struct pmu_hw_events *cpuc,
                                    struct arm_pmu *cpu_pmu)
{
        int candidate;

        for_each_set_bit(candidate, cpu_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS) {
                /* Already in use: keep looking */
                if (test_and_set_bit(candidate, cpuc->used_mask))
                        continue;

                return candidate;
        }

        return -EAGAIN;
}

/*
 * Claim a pair of adjacent general-purpose counters for a chained event.
 *
 * A chain occupies two consecutive counters whose lower index is even, so
 * only odd indices are considered and each is paired with the counter
 * just below it. Returns the odd (upper) index with both bits set in
 * cpuc->used_mask, or -EAGAIN when no such pair is free.
 */
static int armv8pmu_get_chain_idx(struct pmu_hw_events *cpuc,
                                   struct arm_pmu *cpu_pmu)
{
        int upper;

        for_each_set_bit(upper, cpu_pmu->cntr_mask, ARMV8_PMU_MAX_GENERAL_COUNTERS) {
                /* The upper half of a pair is always an odd counter */
                if ((upper & 0x1) == 0)
                        continue;

                /* Odd counter already taken */
                if (test_and_set_bit(upper, cpuc->used_mask))
                        continue;

                /* Pair is complete if the preceding even counter is free too */
                if (!test_and_set_bit(upper - 1, cpuc->used_mask))
                        return upper;

                /* Even counter is busy: give the odd one back */
                clear_bit(upper, cpuc->used_mask);
        }

        return -EAGAIN;
}

/*
 * Pick a hardware counter for @event and mark it used in cpuc->used_mask.
 *
 * Preference order:
 *  1. the dedicated cycle counter, for CPU_CYCLES events without a
 *     threshold;
 *  2. the dedicated instruction counter, for INST_RETIRED events without
 *     a threshold that do not want user access, when that counter is
 *     implemented;
 *  3. a general-purpose counter (a pair of them for chained events).
 *
 * Returns the counter index, or -EAGAIN when nothing suitable is free.
 */
static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
                                  struct perf_event *event)
{
        struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
        unsigned long evtype = event->hw.config_base & ARMV8_PMU_EVTYPE_EVENT;

        /* Always prefer to place a cycle counter into the cycle counter. */
        if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES &&
            !armv8pmu_event_get_threshold(&event->attr)) {
                if (!test_and_set_bit(ARMV8_PMU_CYCLE_IDX, cpuc->used_mask))
                        return ARMV8_PMU_CYCLE_IDX;

                /*
                 * The cycle counter is busy. A 64-bit user-access event
                 * cannot go anywhere else without long event support.
                 */
                if (armv8pmu_event_is_64bit(event) &&
                    armv8pmu_event_want_user_access(event) &&
                    !armv8pmu_has_long_event(cpu_pmu))
                        return -EAGAIN;
        }

        /*
         * Always prefer to place an instruction counter into the instruction
         * counter, but don't expose the instruction counter to userspace
         * access as userspace may not know how to handle it.
         */
        if (evtype == ARMV8_PMUV3_PERFCTR_INST_RETIRED &&
            !armv8pmu_event_get_threshold(&event->attr) &&
            test_bit(ARMV8_PMU_INSTR_IDX, cpu_pmu->cntr_mask) &&
            !armv8pmu_event_want_user_access(event) &&
            !test_and_set_bit(ARMV8_PMU_INSTR_IDX, cpuc->used_mask))
                return ARMV8_PMU_INSTR_IDX;

        /* Otherwise fall back to the general-purpose event counters */
        return armv8pmu_event_is_chained(event) ?
                armv8pmu_get_chain_idx(cpuc, cpu_pmu) :
                armv8pmu_get_single_idx(cpuc, cpu_pmu);
}

/*
 * Release the counter held by @event in cpuc->used_mask; a chained event
 * also releases the counter directly below it.
 */
static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
                                     struct perf_event *event)
{
        const int top = event->hw.idx;

        clear_bit(top, cpuc->used_mask);

        if (!armv8pmu_event_is_chained(event))
                return;

        clear_bit(top - 1, cpuc->used_mask);
}

/*
 * Report the counter index userspace should use to read @event.
 *
 * Returns event->hw.idx + 1 when the perf_user_access sysctl is enabled
 * and the event allows user reads, and 0 otherwise.
 */
static int armv8pmu_user_event_idx(struct perf_event *event)
{
        if (sysctl_perf_user_access && armv8pmu_event_has_user_read(event))
                return event->hw.idx + 1;

        return 0;
}

/*
 * Translate the exclusion and threshold settings in @attr into event-type
 * filter bits and store them in @event->config_base.
 *
 * Returns 0 on success, -EOPNOTSUPP when exclude_idle is requested, or
 * -EINVAL when the requested threshold exceeds what the PMU supports.
 * On failure @event->config_base is left untouched.
 */
static int armv8pmu_set_event_filter(struct hw_perf_event *event,
                                     struct perf_event_attr *attr)
{
        struct perf_event *owner = container_of(attr, struct perf_event,
                                                attr);
        struct arm_pmu *cpu_pmu = to_arm_pmu(owner->pmu);
        unsigned long filter = 0;
        u32 th;

        if (attr->exclude_idle) {
                pr_debug("ARM performance counters do not support mode exclusion\n");
                return -EOPNOTSUPP;
        }

        if (!is_kernel_in_hyp_mode()) {
                if (!(attr->exclude_hv || attr->exclude_host))
                        filter |= ARMV8_PMU_INCLUDE_EL2;
        } else {
                /*
                 * If we're running in hyp mode, then we *are* the
                 * hypervisor. Therefore we ignore exclude_hv in this
                 * configuration, since there's no hypervisor to sample
                 * anyway. This is consistent with other architectures
                 * (x86 and Power).
                 */
                if (!(attr->exclude_kernel || attr->exclude_host))
                        filter |= ARMV8_PMU_INCLUDE_EL2;
                if (attr->exclude_guest)
                        filter |= ARMV8_PMU_EXCLUDE_EL1;
                if (attr->exclude_host)
                        filter |= ARMV8_PMU_EXCLUDE_EL0;
        }

        /* Filter out !VHE kernels and guest kernels */
        if (attr->exclude_kernel)
                filter |= ARMV8_PMU_EXCLUDE_EL1;

        if (attr->exclude_user)
                filter |= ARMV8_PMU_EXCLUDE_EL0;

        /*
         * If FEAT_PMUv3_TH isn't implemented, then THWIDTH (threshold_max)
         * will be 0, so any non-zero threshold is rejected here.
         */
        th = armv8pmu_event_get_threshold(attr);
        if (th > threshold_max(cpu_pmu)) {
                pr_debug("PMU event threshold exceeds max value\n");
                return -EINVAL;
        }

        if (th) {
                filter |= FIELD_PREP(ARMV8_PMU_EVTYPE_TH, th);
                filter |= FIELD_PREP(ARMV8_PMU_EVTYPE_TC,
                                     armv8pmu_event_threshold_control(attr));
        }

        /* config_base is what the event type is later constructed from */
        event->config_base = filter;

        return 0;
}

/*
 * Bring the PMU described by @info (a struct arm_pmu *) to a known state:
 * all counters and their interrupts disabled, KVM's view of the counters
 * cleared, and PMCR programmed with the reset bits.
 */
static void armv8pmu_reset(void *info)
{
        struct arm_pmu *cpu_pmu = info;
        /*
         * Initialize & Reset PMNC. Request overflow interrupt for
         * 64 bit cycle counter but cheat in armv8pmu_write_counter().
         */
        u64 pmcr = ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C | ARMV8_PMU_PMCR_LC;
        u64 all_counters;

        bitmap_to_arr64(&all_counters, cpu_pmu->cntr_mask, ARMPMU_MAX_HWEVENTS);

        /* The counter and interrupt enable registers are unknown at reset. */
        armv8pmu_disable_counter(all_counters);
        armv8pmu_disable_intens(all_counters);

        /* Clear the counters we flip at guest entry/exit */
        kvm_clr_pmu_events(all_counters);

        /* Enable long event counter support where available */
        if (armv8pmu_has_long_event(cpu_pmu))
                pmcr |= ARMV8_PMU_PMCR_LP;

        armv8pmu_pmcr_write(pmcr);
}

static int __armv8_pmuv3_map_event_id(struct arm_pmu *armpmu,
                                      struct perf_event *event)
{
        if (event->attr.type == PERF_TYPE_HARDWARE &&
            event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {

                if (test_bit(ARMV8_PMUV3_PERFCTR_BR_RETIRED,
                             armpmu->pmceid_bitmap))
                        return ARMV8_PMUV3_PERFCTR_BR_RETIRED;

                if (test_bit(ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED,
                             armpmu->pmceid_bitmap))
                        return ARMV8_PMUV3_PERFCTR_PC_WRITE_RETIRED;

                return HW_OP_UNSUPPORTED;
        }

        return armpmu_map_event(event, &armv8_pmuv3_perf_map,
                                &armv8_pmuv3_perf_cache_map,
                                ARMV8_PMU_EVTYPE_EVENT);
}

/*
 * Common event-mapping routine shared by the per-CPU map_event callbacks.
 *
 * @event:           the perf event to map
 * @extra_event_map: optional CPU-specific hardware event table (may be NULL)
 * @extra_cache_map: optional CPU-specific cache event table (may be NULL)
 *
 * Besides computing the event number, this sets ARMPMU_EVT_64BIT and
 * PERF_EVENT_FLAG_USER_READ_CNT in event->hw.flags where applicable.
 *
 * Returns the event number from the common PMUv3 mapping when it is
 * advertised in pmceid_bitmap, otherwise the result of armpmu_map_event()
 * on the extra tables. Returns -EINVAL for CHAIN events and for user-access
 * events that are not attached to a task, and -EOPNOTSUPP for 64-bit
 * user-access events that the PMU cannot hold in a single counter.
 */
static int __armv8_pmuv3_map_event(struct perf_event *event,
                                   const unsigned (*extra_event_map)
                                                  [PERF_COUNT_HW_MAX],
                                   const unsigned (*extra_cache_map)
                                                  [PERF_COUNT_HW_CACHE_MAX]
                                                  [PERF_COUNT_HW_CACHE_OP_MAX]
                                                  [PERF_COUNT_HW_CACHE_RESULT_MAX])
{
        int hw_event_id;
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);

        hw_event_id = __armv8_pmuv3_map_event_id(armpmu, event);

        /*
         * CHAIN events only work when paired with an adjacent counter, and it
         * never makes sense for a user to open one in isolation, as they'll be
         * rotated arbitrarily.
         */
        if (hw_event_id == ARMV8_PMUV3_PERFCTR_CHAIN)
                return -EINVAL;

        if (armv8pmu_event_is_64bit(event))
                event->hw.flags |= ARMPMU_EVT_64BIT;

        /*
         * User events must be allocated into a single counter, and so
         * must not be chained.
         *
         * Most 64-bit events require long counter support, but 64-bit
         * CPU_CYCLES events can be placed into the dedicated cycle
         * counter when this is free.
         */
        if (armv8pmu_event_want_user_access(event)) {
                /* User access is only granted to per-task events. */
                if (!(event->attach_state & PERF_ATTACH_TASK))
                        return -EINVAL;
                if (armv8pmu_event_is_64bit(event) &&
                    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
                    !armv8pmu_has_long_event(armpmu))
                        return -EOPNOTSUPP;

                event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
        }

        /* Only expose micro/arch events supported by this PMU */
        if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
            && test_bit(hw_event_id, armpmu->pmceid_bitmap)) {
                return hw_event_id;
        }

        /* Not a supported common event: try the CPU-specific tables. */
        return armpmu_map_event(event, extra_event_map, extra_cache_map,
                                ARMV8_PMU_EVTYPE_EVENT);
}

/* map_event callback using only the common PMUv3 tables (no extra maps). */
static int armv8_pmuv3_map_event(struct perf_event *event)
{
        return __armv8_pmuv3_map_event(event, NULL, NULL);
}

/* map_event callback that adds armv8_a53_perf_cache_map as the extra cache map. */
static int armv8_a53_map_event(struct perf_event *event)
{
        return __armv8_pmuv3_map_event(event, NULL, &armv8_a53_perf_cache_map);
}

/* map_event callback that adds armv8_a57_perf_cache_map as the extra cache map. */
static int armv8_a57_map_event(struct perf_event *event)
{
        return __armv8_pmuv3_map_event(event, NULL, &armv8_a57_perf_cache_map);
}

/* map_event callback that adds armv8_a73_perf_cache_map as the extra cache map. */
static int armv8_a73_map_event(struct perf_event *event)
{
        return __armv8_pmuv3_map_event(event, NULL, &armv8_a73_perf_cache_map);
}

/* map_event callback that adds armv8_thunder_perf_cache_map as the extra cache map. */
static int armv8_thunder_map_event(struct perf_event *event)
{
        return __armv8_pmuv3_map_event(event, NULL,
                                       &armv8_thunder_perf_cache_map);
}

/* map_event callback that adds armv8_vulcan_perf_cache_map as the extra cache map. */
static int armv8_vulcan_map_event(struct perf_event *event)
{
        return __armv8_pmuv3_map_event(event, NULL,
                                       &armv8_vulcan_perf_cache_map);
}

/*
 * Argument block handed to __armv8pmu_probe_pmu() through its void
 * pointer parameter.
 */
struct armv8pmu_probe_info {
        /* PMU descriptor that the probe fills in */
        struct arm_pmu *pmu;
        /* Set to true by the probe when a PMUv3 implementation is found */
        bool present;
};

/*
 * Probe the PMU of the CPU this function runs on and record what it finds
 * in the struct arm_pmu referenced by @info (a struct armv8pmu_probe_info *).
 *
 * If no PMUv3 is implemented, returns without touching anything, leaving
 * probe->present as the caller set it. Otherwise records the PMU version,
 * the set of implemented counters, the supported common-event bitmaps and
 * the PMMIR value.
 */
static void __armv8pmu_probe_pmu(void *info)
{
        struct armv8pmu_probe_info *probe = info;
        struct arm_pmu *cpu_pmu = probe->pmu;
        u64 pmceid_raw[2];
        u32 pmceid[2];
        int pmuver;

        pmuver = read_pmuver();
        if (!pmuv3_implemented(pmuver))
                return;

        cpu_pmu->pmuver = pmuver;
        probe->present = true;

        /* Read the nb of CNTx counters supported from PMNC */
        bitmap_set(cpu_pmu->cntr_mask,
                   0, FIELD_GET(ARMV8_PMU_PMCR_N, armv8pmu_pmcr_read()));

        /* Add the CPU cycles counter */
        set_bit(ARMV8_PMU_CYCLE_IDX, cpu_pmu->cntr_mask);

        /* Add the CPU instructions counter */
        if (pmuv3_has_icntr())
                set_bit(ARMV8_PMU_INSTR_IDX, cpu_pmu->cntr_mask);

        /* Low 32 bits of each PMCEID register feed pmceid_bitmap. */
        pmceid[0] = pmceid_raw[0] = read_pmceid0();
        pmceid[1] = pmceid_raw[1] = read_pmceid1();

        bitmap_from_arr32(cpu_pmu->pmceid_bitmap,
                             pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);

        /* High 32 bits of each PMCEID register feed pmceid_ext_bitmap. */
        pmceid[0] = pmceid_raw[0] >> 32;
        pmceid[1] = pmceid_raw[1] >> 32;

        bitmap_from_arr32(cpu_pmu->pmceid_ext_bitmap,
                             pmceid, ARMV8_PMUV3_MAX_COMMON_EVENTS);

        /* store PMMIR register for sysfs */
        if (is_pmuv3p4(pmuver))
                cpu_pmu->reg_pmmir = read_pmmir();
        else
                cpu_pmu->reg_pmmir = 0;
}

/*
 * Probe the PMU by running __armv8pmu_probe_pmu() on one of the CPUs in
 * cpu_pmu->supported_cpus and waiting for it to finish.
 *
 * Returns 0 when a PMUv3 was found, -ENODEV when it was not, or the error
 * from smp_call_function_any().
 */
static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
{
        struct armv8pmu_probe_info probe = { .pmu = cpu_pmu, .present = false };
        int err;

        err = smp_call_function_any(&cpu_pmu->supported_cpus,
                                    __armv8pmu_probe_pmu, &probe, 1);
        if (err)
                return err;

        if (!probe.present)
                return -ENODEV;

        return 0;
}

/*
 * Cross-CPU callback wrapper around armv8pmu_disable_user_access(); the
 * argument is ignored.
 */
static void armv8pmu_disable_user_access_ipi(void *unused)
{
        armv8pmu_disable_user_access();
}

/*
 * sysctl handler for perf_user_access.
 *
 * The value itself is handled by proc_dointvec_minmax(). When a write
 * succeeds and leaves the sysctl at 0, userspace PMU access is revoked on
 * every CPU before returning.
 */
static int armv8pmu_proc_user_access_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        /* Nothing more to do for reads, or when access stays enabled */
        if (!write || sysctl_perf_user_access)
                return 0;

        on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1);

        return 0;
}

/*
 * sysctl table exposing sysctl_perf_user_access as "perf_user_access"
 * (mode 0644), with SYSCTL_ZERO and SYSCTL_ONE as its extra1/extra2 limits.
 * Registered under "kernel" by armv8_pmu_register_sysctl_table().
 */
static const struct ctl_table armv8_pmu_sysctl_table[] = {
        {
                .procname       = "perf_user_access",
                .data                = &sysctl_perf_user_access,
                .maxlen                = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler        = armv8pmu_proc_user_access_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register armv8_pmu_sysctl_table under "kernel", at most once no matter
 * how many times this is called.
 */
static void armv8_pmu_register_sysctl_table(void)
{
        /* 0 until the first caller claims registration */
        static u32 tbl_registered;

        /* cmpxchg returns the old value: non-zero means someone beat us */
        if (cmpxchg_relaxed(&tbl_registered, 0, 1))
                return;

        register_sysctl("kernel", armv8_pmu_sysctl_table);
}

static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
                          int (*map_event)(struct perf_event *event))
{
        int ret = armv8pmu_probe_pmu(cpu_pmu);
        if (ret)
                return ret;

        cpu_pmu->handle_irq                = armv8pmu_handle_irq;
        cpu_pmu->enable                        = armv8pmu_enable_event;
        cpu_pmu->disable                = armv8pmu_disable_event;
        cpu_pmu->read_counter                = armv8pmu_read_counter;
        cpu_pmu->write_counter                = armv8pmu_write_counter;
        cpu_pmu->get_event_idx                = armv8pmu_get_event_idx;
        cpu_pmu->clear_event_idx        = armv8pmu_clear_event_idx;
        cpu_pmu->start                        = armv8pmu_start;
        cpu_pmu->stop                        = armv8pmu_stop;
        cpu_pmu->reset                        = armv8pmu_reset;
        cpu_pmu->set_event_filter        = armv8pmu_set_event_filter;

        cpu_pmu->pmu.event_idx                = armv8pmu_user_event_idx;

        cpu_pmu->name                        = name;
        cpu_pmu->map_event                = map_event;
        cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group;
        cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group;
        cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = &armv8_pmuv3_caps_attr_group;
        armv8_pmu_register_sysctl_table();
        return 0;
}

/*
 * Define <name>_pmu_init(), which initialises a PMU named "<name>" using
 * the default armv8_pmuv3_map_event mapper.
 */
#define PMUV3_INIT_SIMPLE(name)                                                \
static int name##_pmu_init(struct arm_pmu *cpu_pmu)                        \
{                                                                        \
        return armv8_pmu_init(cpu_pmu, #name, armv8_pmuv3_map_event);        \
}

/*
 * Define <name>_pmu_init(), which initialises a PMU named "<name>" using
 * the caller-supplied @map_event mapper.
 */
#define PMUV3_INIT_MAP_EVENT(name, map_event)                                \
static int name##_pmu_init(struct arm_pmu *cpu_pmu)                        \
{                                                                        \
        return armv8_pmu_init(cpu_pmu, #name, map_event);                \
}

/*
 * Instantiate one <name>_pmu_init() function per supported PMU. Each macro
 * argument becomes both the function-name prefix and the PMU name string.
 */
PMUV3_INIT_SIMPLE(armv8_pmuv3)

PMUV3_INIT_SIMPLE(armv8_cortex_a34)
PMUV3_INIT_SIMPLE(armv8_cortex_a55)
PMUV3_INIT_SIMPLE(armv8_cortex_a65)
PMUV3_INIT_SIMPLE(armv8_cortex_a75)
PMUV3_INIT_SIMPLE(armv8_cortex_a76)
PMUV3_INIT_SIMPLE(armv8_cortex_a77)
PMUV3_INIT_SIMPLE(armv8_cortex_a78)
PMUV3_INIT_SIMPLE(armv9_cortex_a510)
PMUV3_INIT_SIMPLE(armv9_cortex_a520)
PMUV3_INIT_SIMPLE(armv9_cortex_a710)
PMUV3_INIT_SIMPLE(armv9_cortex_a715)
PMUV3_INIT_SIMPLE(armv9_cortex_a720)
PMUV3_INIT_SIMPLE(armv9_cortex_a725)
PMUV3_INIT_SIMPLE(armv8_cortex_x1)
PMUV3_INIT_SIMPLE(armv9_cortex_x2)
PMUV3_INIT_SIMPLE(armv9_cortex_x3)
PMUV3_INIT_SIMPLE(armv9_cortex_x4)
PMUV3_INIT_SIMPLE(armv9_cortex_x925)
PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
PMUV3_INIT_SIMPLE(armv9_neoverse_n2)
PMUV3_INIT_SIMPLE(armv9_neoverse_n3)
PMUV3_INIT_SIMPLE(armv8_neoverse_v1)
PMUV3_INIT_SIMPLE(armv8_neoverse_v2)
PMUV3_INIT_SIMPLE(armv8_neoverse_v3)
PMUV3_INIT_SIMPLE(armv8_neoverse_v3ae)
PMUV3_INIT_SIMPLE(armv8_rainier)

PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
PMUV3_INIT_SIMPLE(armv8_nvidia_denver)

PMUV3_INIT_SIMPLE(armv8_samsung_mongoose)

/* PMUs that need a CPU-specific event mapper */
PMUV3_INIT_MAP_EVENT(armv8_cortex_a35, armv8_a53_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cortex_a53, armv8_a53_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cortex_a57, armv8_a57_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cortex_a72, armv8_a57_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cortex_a73, armv8_a73_map_event)
PMUV3_INIT_MAP_EVENT(armv8_cavium_thunder, armv8_thunder_map_event)
PMUV3_INIT_MAP_EVENT(armv8_brcm_vulcan, armv8_vulcan_map_event)

/*
 * Device-tree match table: maps each "compatible" string to the
 * <name>_pmu_init() function for that PMU (stored in .data). The empty
 * entry terminates the table.
 */
static const struct of_device_id armv8_pmu_of_device_ids[] = {
        {.compatible = "arm,armv8-pmuv3",        .data = armv8_pmuv3_pmu_init},
        {.compatible = "arm,cortex-a34-pmu",        .data = armv8_cortex_a34_pmu_init},
        {.compatible = "arm,cortex-a35-pmu",        .data = armv8_cortex_a35_pmu_init},
        {.compatible = "arm,cortex-a53-pmu",        .data = armv8_cortex_a53_pmu_init},
        {.compatible = "arm,cortex-a55-pmu",        .data = armv8_cortex_a55_pmu_init},
        {.compatible = "arm,cortex-a57-pmu",        .data = armv8_cortex_a57_pmu_init},
        {.compatible = "arm,cortex-a65-pmu",        .data = armv8_cortex_a65_pmu_init},
        {.compatible = "arm,cortex-a72-pmu",        .data = armv8_cortex_a72_pmu_init},
        {.compatible = "arm,cortex-a73-pmu",        .data = armv8_cortex_a73_pmu_init},
        {.compatible = "arm,cortex-a75-pmu",        .data = armv8_cortex_a75_pmu_init},
        {.compatible = "arm,cortex-a76-pmu",        .data = armv8_cortex_a76_pmu_init},
        {.compatible = "arm,cortex-a77-pmu",        .data = armv8_cortex_a77_pmu_init},
        {.compatible = "arm,cortex-a78-pmu",        .data = armv8_cortex_a78_pmu_init},
        {.compatible = "arm,cortex-a510-pmu",        .data = armv9_cortex_a510_pmu_init},
        {.compatible = "arm,cortex-a520-pmu",        .data = armv9_cortex_a520_pmu_init},
        {.compatible = "arm,cortex-a710-pmu",        .data = armv9_cortex_a710_pmu_init},
        {.compatible = "arm,cortex-a715-pmu",        .data = armv9_cortex_a715_pmu_init},
        {.compatible = "arm,cortex-a720-pmu",        .data = armv9_cortex_a720_pmu_init},
        {.compatible = "arm,cortex-a725-pmu",        .data = armv9_cortex_a725_pmu_init},
        {.compatible = "arm,cortex-x1-pmu",        .data = armv8_cortex_x1_pmu_init},
        {.compatible = "arm,cortex-x2-pmu",        .data = armv9_cortex_x2_pmu_init},
        {.compatible = "arm,cortex-x3-pmu",        .data = armv9_cortex_x3_pmu_init},
        {.compatible = "arm,cortex-x4-pmu",        .data = armv9_cortex_x4_pmu_init},
        {.compatible = "arm,cortex-x925-pmu",        .data = armv9_cortex_x925_pmu_init},
        {.compatible = "arm,neoverse-e1-pmu",        .data = armv8_neoverse_e1_pmu_init},
        {.compatible = "arm,neoverse-n1-pmu",        .data = armv8_neoverse_n1_pmu_init},
        {.compatible = "arm,neoverse-n2-pmu",        .data = armv9_neoverse_n2_pmu_init},
        {.compatible = "arm,neoverse-n3-pmu",        .data = armv9_neoverse_n3_pmu_init},
        {.compatible = "arm,neoverse-v1-pmu",        .data = armv8_neoverse_v1_pmu_init},
        {.compatible = "arm,neoverse-v2-pmu",        .data = armv8_neoverse_v2_pmu_init},
        {.compatible = "arm,neoverse-v3-pmu",        .data = armv8_neoverse_v3_pmu_init},
        {.compatible = "arm,neoverse-v3ae-pmu",        .data = armv8_neoverse_v3ae_pmu_init},
        {.compatible = "arm,rainier-pmu",        .data = armv8_rainier_pmu_init},
        {.compatible = "cavium,thunder-pmu",        .data = armv8_cavium_thunder_pmu_init},
        {.compatible = "brcm,vulcan-pmu",        .data = armv8_brcm_vulcan_pmu_init},
        {.compatible = "nvidia,carmel-pmu",        .data = armv8_nvidia_carmel_pmu_init},
        {.compatible = "nvidia,denver-pmu",        .data = armv8_nvidia_denver_pmu_init},
        {.compatible = "samsung,mongoose-pmu",        .data = armv8_samsung_mongoose_pmu_init},
        {},
};

/*
 * Platform driver probe callback: hands @pdev and the device-tree match
 * table to arm_pmu_device_probe() and returns its result.
 */
static int armv8_pmu_device_probe(struct platform_device *pdev)
{
        return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL);
}

/*
 * Platform driver for device-tree described PMUs. Matching uses
 * armv8_pmu_of_device_ids; suppress_bind_attrs is set.
 */
static struct platform_driver armv8_pmu_driver = {
        .driver                = {
                .name        = ARMV8_PMU_PDEV_NAME,
                .of_match_table = armv8_pmu_of_device_ids,
                .suppress_bind_attrs = true,
        },
        .probe                = armv8_pmu_device_probe,
};

/*
 * Driver entry point, run as a device initcall.
 *
 * With ACPI disabled the platform driver is registered; otherwise the PMU
 * is probed through ACPI using the generic armv8_pmuv3 init function. On
 * success the lockup detector is asked to retry its initialisation.
 *
 * Returns 0 on success or the error from the registration/probe call.
 */
static int __init armv8_pmu_driver_init(void)
{
        int err = acpi_disabled ?
                platform_driver_register(&armv8_pmu_driver) :
                arm_pmu_acpi_probe(armv8_pmuv3_pmu_init);

        if (err)
                return err;

        lockup_detector_retry_init();

        return 0;
}
device_initcall(armv8_pmu_driver_init)

/*
 * Fill in the architecture-specific fields of the perf mmap page.
 *
 * @event:  event the page belongs to
 * @userpg: user-visible mmap page to update
 * @now:    current time, used to compute time_offset
 *
 * Sets cap_user_rdpmc (and pmc_width when it is set) from the event's
 * user-read capability, then publishes the sched_clock conversion
 * parameters. If sched_clock is not backed by arch_timer_read_counter the
 * function returns early, leaving cap_user_time, cap_user_time_zero and
 * cap_user_time_short at 0.
 */
void arch_perf_update_userpage(struct perf_event *event,
                               struct perf_event_mmap_page *userpg, u64 now)
{
        struct clock_read_data *rd;
        unsigned int seq;
        u64 ns;

        /* Start with the time capabilities cleared; set at the end on success. */
        userpg->cap_user_time = 0;
        userpg->cap_user_time_zero = 0;
        userpg->cap_user_time_short = 0;
        userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event);

        if (userpg->cap_user_rdpmc) {
                if (event->hw.flags & ARMPMU_EVT_64BIT)
                        userpg->pmc_width = 64;
                else
                        userpg->pmc_width = 32;
        }

        /* Re-read the clock data until sched_clock_read_retry() reports no retry. */
        do {
                rd = sched_clock_read_begin(&seq);

                if (rd->read_sched_clock != arch_timer_read_counter)
                        return;

                userpg->time_mult = rd->mult;
                userpg->time_shift = rd->shift;
                userpg->time_zero = rd->epoch_ns;
                userpg->time_cycles = rd->epoch_cyc;
                userpg->time_mask = rd->sched_clock_mask;

                /*
                 * Subtract the cycle base, such that software that
                 * doesn't know about cap_user_time_short still 'works'
                 * assuming no wraps.
                 */
                ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
                userpg->time_zero -= ns;

        } while (sched_clock_read_retry(seq));

        userpg->time_offset = userpg->time_zero - now;

        /*
         * time_shift is not expected to be greater than 31 due to
         * the original published conversion algorithm shifting a
         * 32-bit value (now specifies a 64-bit value) - refer
         * perf_event_mmap_page documentation in perf_event.h.
         */
        if (userpg->time_shift == 32) {
                userpg->time_shift = 31;
                userpg->time_mult >>= 1;
        }

        /*
         * Internal timekeeping for enabled/running/stopped times
         * is always computed with the sched_clock.
         */
        userpg->cap_user_time = 1;
        userpg->cap_user_time_zero = 1;
        userpg->cap_user_time_short = 1;
}




































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 











































































































































































































































































































    3 






    3 


    3 







    3 







    3 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
 * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
 */

#include "devl_internal.h"

/*
 * Mask of all capability bits below bit index __DEVLINK_PORT_FN_ATTR_CAPS_MAX.
 * Used by the netlink policy to bound the function capabilities bitfield.
 */
#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
        (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)

/*
 * Netlink validation policy for port function attributes, indexed by
 * attribute type.
 */
static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
        /* Hardware address: binary payload; no length limit is set here. */
        [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
        /* State: u8 restricted to the INACTIVE..ACTIVE range. */
        [DEVLINK_PORT_FN_ATTR_STATE] =
                NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
                                 DEVLINK_PORT_FN_STATE_ACTIVE),
        /* Capabilities: bitfield32 limited to DEVLINK_PORT_FN_CAPS_VALID_MASK. */
        [DEVLINK_PORT_FN_ATTR_CAPS] =
                NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
        /* Maximum IO EQs: plain u32. */
        [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 },
};

/* Warn (once) when @devlink_port is not marked as registered. */
#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port)                                \
        WARN_ON_ONCE(!(devlink_port)->registered)
/* Warn (once) when @devlink_port is already marked as registered. */
#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port)                        \
        WARN_ON_ONCE((devlink_port)->registered)

/*
 * Return the port stored under @port_index in @devlink's ports xarray,
 * or NULL when no entry is stored at that index.
 */
struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
                                               unsigned int port_index)
{
        struct devlink_port *port;

        port = xa_load(&devlink->ports, port_index);
        return port;
}

/*
 * Resolve the port named by DEVLINK_ATTR_PORT_INDEX in @attrs.
 *
 * Returns the port on success, ERR_PTR(-EINVAL) when the index attribute is
 * missing, or ERR_PTR(-ENODEV) when no port exists at that index.
 */
struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
                                                 struct nlattr **attrs)
{
        struct nlattr *index_attr = attrs[DEVLINK_ATTR_PORT_INDEX];
        struct devlink_port *port;

        if (!index_attr)
                return ERR_PTR(-EINVAL);

        port = devlink_port_get_by_index(devlink, nla_get_u32(index_attr));
        if (port)
                return port;

        return ERR_PTR(-ENODEV);
}

/*
 * Same as devlink_port_get_from_attrs(), taking the attribute table from
 * the generic netlink request @info.
 */
struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
                                                struct genl_info *info)
{
        struct nlattr **attrs = info->attrs;

        return devlink_port_get_from_attrs(devlink, attrs);
}

/*
 * Record capability bit(s) @cap in @caps: always set them in the selector,
 * and additionally in the value when @is_enable is true.
 */
static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
                                     u32 cap, bool is_enable)
{
        u32 value_bits = is_enable ? cap : 0;

        caps->selector |= cap;
        caps->value |= value_bits;
}

static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port,
                                     struct nla_bitfield32 *caps,
                                     struct netlink_ext_ack *extack)
{
        bool is_enable;
        int err;

        if (!devlink_port->ops->port_fn_roce_get)
                return 0;

        err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable,
                                                  extack);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
                return err;
        }

        devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
        return 0;
}

static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port,
                                           struct nla_bitfield32 *caps,
                                           struct netlink_ext_ack *extack)
{
        bool is_enable;
        int err;

        if (!devlink_port->ops->port_fn_migratable_get ||
            devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
                return 0;

        err = devlink_port->ops->port_fn_migratable_get(devlink_port,
                                                        &is_enable, extack);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
                return err;
        }

        devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable);
        return 0;
}

static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port,
                                             struct nla_bitfield32 *caps,
                                             struct netlink_ext_ack *extack)
{
        bool is_enable;
        int err;

        if (!devlink_port->ops->port_fn_ipsec_crypto_get ||
            devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
                return 0;

        err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
                return err;
        }

        devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable);
        return 0;
}

static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port,
                                             struct nla_bitfield32 *caps,
                                             struct netlink_ext_ack *extack)
{
        bool is_enable;
        int err;

        if (!devlink_port->ops->port_fn_ipsec_packet_get ||
            devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
                return 0;

        err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
                return err;
        }

        devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable);
        return 0;
}

/*
 * Collect every function capability the driver reports and, if at least one
 * was reported, emit them as a single DEVLINK_PORT_FN_ATTR_CAPS bitfield32.
 *
 * @msg_updated is set to true only when the attribute was added to @msg.
 * Returns 0 on success (including "nothing to report") or the first error
 * from a capability getter or from nla_put_bitfield32().
 */
static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port,
                                     struct sk_buff *msg,
                                     struct netlink_ext_ack *extack,
                                     bool *msg_updated)
{
        struct nla_bitfield32 caps = {};
        int ret;

        /* Gather capabilities one by one; stop at the first failure. */
        ret = devlink_port_fn_roce_fill(devlink_port, &caps, extack);
        if (!ret)
                ret = devlink_port_fn_migratable_fill(devlink_port, &caps,
                                                      extack);
        if (!ret)
                ret = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps,
                                                        extack);
        if (!ret)
                ret = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps,
                                                        extack);
        if (ret)
                return ret;

        /* No capability was reported: emit nothing. */
        if (caps.selector == 0)
                return 0;

        ret = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
                                 caps.selector);
        if (!ret)
                *msg_updated = true;
        return ret;
}

/*
 * Emit DEVLINK_PORT_FN_ATTR_MAX_IO_EQS with the value reported by the driver.
 *
 * A missing port_fn_max_io_eqs_get callback or an -EOPNOTSUPP result emits
 * nothing and returns 0. @msg_updated is set to true only when the
 * attribute was added to @msg.
 */
static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port,
                                           struct sk_buff *msg,
                                           struct netlink_ext_ack *extack,
                                           bool *msg_updated)
{
        u32 eqs;
        int ret;

        if (!port->ops->port_fn_max_io_eqs_get)
                return 0;

        ret = port->ops->port_fn_max_io_eqs_get(port, &eqs, extack);
        if (ret == -EOPNOTSUPP)
                return 0;
        if (ret)
                return ret;

        ret = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, eqs);
        if (!ret)
                *msg_updated = true;
        return ret;
}

/*
 * Add the attributes identifying @devlink_port to @msg: the devlink handle
 * followed by DEVLINK_ATTR_PORT_INDEX. Returns -EMSGSIZE if either put fails.
 */
int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
{
        if (devlink_nl_put_handle(msg, devlink_port->devlink) ||
            nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
                return -EMSGSIZE;

        return 0;
}

size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port)
{
        struct devlink *devlink = devlink_port->devlink;

        return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */
             + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */
             + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */
}

/*
 * Add the static port attributes (lanes, splittable, flavour and the
 * flavour-specific identifiers) of @devlink_port to @msg.
 *
 * Nothing is added when the port's attrs_set flag is clear.
 * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
 */
static int devlink_nl_port_attrs_put(struct sk_buff *msg,
                                     struct devlink_port *devlink_port)
{
        struct devlink_port_attrs *attrs = &devlink_port->attrs;

        if (!devlink_port->attrs_set)
                return 0;
        /* The lane count is only reported when it is non-zero. */
        if (attrs->lanes) {
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes))
                        return -EMSGSIZE;
        }
        if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable))
                return -EMSGSIZE;
        if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
                return -EMSGSIZE;
        /* Flavour-specific identification attributes. */
        switch (devlink_port->attrs.flavour) {
        case DEVLINK_PORT_FLAVOUR_PCI_PF:
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
                                attrs->pci_pf.controller) ||
                    nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
                        return -EMSGSIZE;
                if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
                        return -EMSGSIZE;
                break;
        case DEVLINK_PORT_FLAVOUR_PCI_VF:
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
                                attrs->pci_vf.controller) ||
                    nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
                    nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
                        return -EMSGSIZE;
                if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
                        return -EMSGSIZE;
                break;
        case DEVLINK_PORT_FLAVOUR_PCI_SF:
                /* Unlike PF/VF, no DEVLINK_ATTR_PORT_EXTERNAL is emitted for SFs. */
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
                                attrs->pci_sf.controller) ||
                    nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
                                attrs->pci_sf.pf) ||
                    nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
                                attrs->pci_sf.sf))
                        return -EMSGSIZE;
                break;
        case DEVLINK_PORT_FLAVOUR_PHYSICAL:
        case DEVLINK_PORT_FLAVOUR_CPU:
        case DEVLINK_PORT_FLAVOUR_DSA:
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
                                attrs->phys.port_number))
                        return -EMSGSIZE;
                /* Split information is only reported for split ports. */
                if (!attrs->split)
                        return 0;
                /* The split group is reported as the physical port number. */
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
                                attrs->phys.port_number))
                        return -EMSGSIZE;
                if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
                                attrs->phys.split_subport_number))
                        return -EMSGSIZE;
                break;
        default:
                /* Other flavours carry no additional attributes. */
                break;
        }
        return 0;
}

/*
 * Emit DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR with the hardware address the
 * driver reports for the port function.
 *
 * A missing port_fn_hw_addr_get callback or an -EOPNOTSUPP result emits
 * nothing and returns 0. The length reported by the driver is validated
 * against the size of the local buffer before it is copied into @msg;
 * an out-of-range length is treated as a driver bug and fails with -EINVAL.
 * @msg_updated is set to true only when the attribute was added to @msg.
 */
static int devlink_port_fn_hw_addr_fill(struct devlink_port *port,
                                        struct sk_buff *msg,
                                        struct netlink_ext_ack *extack,
                                        bool *msg_updated)
{
        u8 hw_addr[MAX_ADDR_LEN];
        int hw_addr_len = 0;
        int err;

        if (!port->ops->port_fn_hw_addr_get)
                return 0;

        err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len,
                                             extack);
        if (err) {
                if (err == -EOPNOTSUPP)
                        return 0;
                return err;
        }
        /*
         * hw_addr is a fixed-size stack buffer: never let a bogus length
         * from the driver make nla_put() read past it.
         */
        if (hw_addr_len < 0 || hw_addr_len > MAX_ADDR_LEN) {
                WARN_ON_ONCE(1);
                NL_SET_ERR_MSG(extack, "Invalid hardware address length read from driver");
                return -EINVAL;
        }
        err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
        if (err)
                return err;
        *msg_updated = true;
        return 0;
}

static bool
devlink_port_fn_state_valid(enum devlink_port_fn_state state)
{
        return state == DEVLINK_PORT_FN_STATE_INACTIVE ||
               state == DEVLINK_PORT_FN_STATE_ACTIVE;
}

static bool
devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
{
        return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED ||
               opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
}

/*
 * Emit DEVLINK_PORT_FN_ATTR_STATE and DEVLINK_PORT_FN_ATTR_OPSTATE with the
 * values reported by the driver.
 *
 * A missing port_fn_state_get callback or an -EOPNOTSUPP result emits
 * nothing and returns 0. Values outside the known enums are treated as a
 * driver bug: a one-time warning is raised and -EINVAL is returned.
 * @msg_updated is set to true only when both attributes were added to @msg.
 */
static int devlink_port_fn_state_fill(struct devlink_port *port,
                                      struct sk_buff *msg,
                                      struct netlink_ext_ack *extack,
                                      bool *msg_updated)
{
        enum devlink_port_fn_opstate fn_opstate;
        enum devlink_port_fn_state fn_state;
        int ret;

        if (!port->ops->port_fn_state_get)
                return 0;

        ret = port->ops->port_fn_state_get(port, &fn_state, &fn_opstate,
                                           extack);
        if (ret == -EOPNOTSUPP)
                return 0;
        if (ret)
                return ret;

        /* Do not forward values to userspace that the driver made up. */
        if (!devlink_port_fn_state_valid(fn_state)) {
                WARN_ON_ONCE(1);
                NL_SET_ERR_MSG(extack, "Invalid state read from driver");
                return -EINVAL;
        }
        if (!devlink_port_fn_opstate_valid(fn_opstate)) {
                WARN_ON_ONCE(1);
                NL_SET_ERR_MSG(extack, "Invalid operational state read from driver");
                return -EINVAL;
        }

        if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, fn_state))
                return -EMSGSIZE;
        if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, fn_opstate))
                return -EMSGSIZE;

        *msg_updated = true;
        return 0;
}

/*
 * Forward a migratable capability change to the driver. The caller must
 * have verified that the port_fn_migratable_set callback exists.
 */
static int
devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable,
                        struct netlink_ext_ack *extack)
{
        const struct devlink_port_ops *ops = devlink_port->ops;

        return ops->port_fn_migratable_set(devlink_port, enable, extack);
}

/*
 * Forward a RoCE capability change to the driver. The caller must have
 * verified that the port_fn_roce_set callback exists.
 */
static int
devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
                         struct netlink_ext_ack *extack)
{
        const struct devlink_port_ops *ops = devlink_port->ops;

        return ops->port_fn_roce_set(devlink_port, enable, extack);
}

/*
 * Forward an IPsec crypto capability change to the driver. The caller must
 * have verified that the port_fn_ipsec_crypto_set callback exists.
 */
static int
devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable,
                                 struct netlink_ext_ack *extack)
{
        const struct devlink_port_ops *ops = devlink_port->ops;

        return ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack);
}

/*
 * Forward an IPsec packet capability change to the driver. The caller must
 * have verified that the port_fn_ipsec_packet_set callback exists.
 */
static int
devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable,
                                 struct netlink_ext_ack *extack)
{
        const struct devlink_port_ops *ops = devlink_port->ops;

        return ops->port_fn_ipsec_packet_set(devlink_port, enable, extack);
}

static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
                                    const struct nlattr *attr,
                                    struct netlink_ext_ack *extack)
{
        struct nla_bitfield32 caps;
        u32 caps_value;
        int err;

        caps = nla_get_bitfield32(attr);
        caps_value = caps.value & caps.selector;
        if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
                err = devlink_port_fn_roce_set(devlink_port,
                                               caps_value & DEVLINK_PORT_FN_CAP_ROCE,
                                               extack);
                if (err)
                        return err;
        }
        if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
                err = devlink_port_fn_mig_set(devlink_port, caps_value &
                                              DEVLINK_PORT_FN_CAP_MIGRATABLE,
                                              extack);
                if (err)
                        return err;
        }
        if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) {
                err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value &
                                                       DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO,
                                                       extack);
                if (err)
                        return err;
        }
        if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) {
                err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value &
                                                       DEVLINK_PORT_FN_CAP_IPSEC_PACKET,
                                                       extack);
                if (err)
                        return err;
        }
        return 0;
}

static int
devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port,
                               const struct nlattr *attr,
                               struct netlink_ext_ack *extack)
{
        u32 max_io_eqs;

        max_io_eqs = nla_get_u32(attr);
        return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port,
                                                         max_io_eqs, extack);
}

/*
 * Build the DEVLINK_ATTR_PORT_FUNCTION nest for @port in @msg.
 *
 * The nest is filled, in order, with the hardware address, capabilities,
 * state, max IO EQs and the related devlink handle. It is cancelled when
 * any step fails or when no step added an attribute; otherwise it is closed.
 * Returns 0 on success or the first error encountered.
 */
static int
devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
                                   struct netlink_ext_ack *extack)
{
        struct nlattr *nest;
        bool updated = false;
        int ret;

        nest = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION);
        if (!nest)
                return -EMSGSIZE;

        /* Each step runs only if all previous ones succeeded. */
        ret = devlink_port_fn_hw_addr_fill(port, msg, extack, &updated);
        if (!ret)
                ret = devlink_port_fn_caps_fill(port, msg, extack, &updated);
        if (!ret)
                ret = devlink_port_fn_state_fill(port, msg, extack, &updated);
        if (!ret)
                ret = devlink_port_fn_max_io_eqs_fill(port, msg, extack,
                                                      &updated);
        if (!ret)
                ret = devlink_rel_devlink_handle_put(msg, port->devlink,
                                                     port->rel_index,
                                                     DEVLINK_PORT_FN_ATTR_DEVLINK,
                                                     &updated);

        /* Drop the nest on failure or when it would be empty. */
        if (!ret && updated)
                nla_nest_end(msg, nest);
        else
                nla_nest_cancel(msg, nest);
        return ret;
}

/*
 * Build one complete generic netlink message of type @cmd describing
 * @devlink_port in @msg.
 *
 * Returns 0 on success. On any failure the partially built message is
 * cancelled and -EMSGSIZE is returned; this includes failures reported by
 * devlink_nl_port_function_attrs_put(), whose own error code is not
 * propagated.
 */
static int devlink_nl_port_fill(struct sk_buff *msg,
                                struct devlink_port *devlink_port,
                                enum devlink_command cmd, u32 portid, u32 seq,
                                int flags, struct netlink_ext_ack *extack)
{
        struct devlink *devlink = devlink_port->devlink;
        void *hdr;

        hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
        if (!hdr)
                return -EMSGSIZE;

        if (devlink_nl_put_handle(msg, devlink))
                goto nla_put_failure;
        if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
                goto nla_put_failure;

        /* The type, desired type and type-specific data are read under type_lock. */
        spin_lock_bh(&devlink_port->type_lock);
        if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
                goto nla_put_failure_type_locked;
        /* The desired type is only reported once it has been set. */
        if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
            nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
                        devlink_port->desired_type))
                goto nla_put_failure_type_locked;
        if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
                /* Netdev ifindex and name are reported only when a netdev is attached. */
                if (devlink_port->type_eth.netdev &&
                    (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
                                 devlink_port->type_eth.ifindex) ||
                     nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
                                    devlink_port->type_eth.ifname)))
                        goto nla_put_failure_type_locked;
        }
        if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
                struct ib_device *ibdev = devlink_port->type_ib.ibdev;

                if (ibdev &&
                    nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
                                   ibdev->name))
                        goto nla_put_failure_type_locked;
        }
        spin_unlock_bh(&devlink_port->type_lock);
        /* The remaining attributes are added without holding type_lock. */
        if (devlink_nl_port_attrs_put(msg, devlink_port))
                goto nla_put_failure;
        if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
                goto nla_put_failure;
        if (devlink_port->linecard &&
            nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX,
                        devlink_linecard_index(devlink_port->linecard)))
                goto nla_put_failure;

        genlmsg_end(msg, hdr);
        return 0;

/* Failure while type_lock is held: release it before cancelling. */
nla_put_failure_type_locked:
        spin_unlock_bh(&devlink_port->type_lock);
nla_put_failure:
        genlmsg_cancel(msg, hdr);
        return -EMSGSIZE;
}

/*
 * Send a port notification of type @cmd (DEVLINK_CMD_PORT_NEW or
 * DEVLINK_CMD_PORT_DEL; anything else triggers a warning) for @devlink_port.
 *
 * Nothing is sent when the devlink instance is not registered, when
 * devlink_nl_notify_need() says no notification is needed, or when the
 * message cannot be allocated or filled.
 */
static void devlink_port_notify(struct devlink_port *devlink_port,
                                enum devlink_command cmd)
{
        struct devlink *devlink = devlink_port->devlink;
        struct devlink_obj_desc desc;
        struct sk_buff *skb;

        WARN_ON(!(cmd == DEVLINK_CMD_PORT_NEW || cmd == DEVLINK_CMD_PORT_DEL));

        if (!__devl_is_registered(devlink))
                return;
        if (!devlink_nl_notify_need(devlink))
                return;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return;

        if (devlink_nl_port_fill(skb, devlink_port, cmd, 0, 0, 0, NULL)) {
                nlmsg_free(skb);
                return;
        }

        devlink_nl_obj_desc_init(&desc, devlink);
        devlink_nl_obj_desc_port_set(&desc, devlink_port);
        devlink_nl_notify_send_desc(devlink, skb, &desc);
}

static void devlink_ports_notify(struct devlink *devlink,
                                 enum devlink_command cmd)
{
        struct devlink_port *devlink_port;
        unsigned long port_index;

        xa_for_each(&devlink->ports, port_index, devlink_port)
                devlink_port_notify(devlink_port, cmd);
}

void devlink_ports_notify_register(struct devlink *devlink)
{
        devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW);
}

void devlink_ports_notify_unregister(struct devlink *devlink)
{
        devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL);
}

/*
 * Handle a port get request: reply with a DEVLINK_CMD_PORT_NEW message
 * describing the port found in info->user_ptr[1].
 *
 * Returns -ENOMEM when the reply cannot be allocated, the fill error when
 * the reply cannot be built, or the result of genlmsg_reply().
 */
int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct devlink_port *port = info->user_ptr[1];
        struct sk_buff *reply;
        int ret;

        reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        ret = devlink_nl_port_fill(reply, port, DEVLINK_CMD_PORT_NEW,
                                   info->snd_portid, info->snd_seq, 0,
                                   info->extack);
        if (!ret)
                return genlmsg_reply(reply, info);

        nlmsg_free(reply);
        return ret;
}

/*
 * Dump the ports of one devlink instance into @msg, starting at the index
 * saved in the dump state.
 *
 * When a port cannot be filled, its index is saved in the dump state and
 * the error is returned. Returns 0 when every remaining port was dumped.
 */
static int
devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
                             struct netlink_callback *cb, int flags)
{
        struct devlink_nl_dump_state *state = devlink_dump_state(cb);
        struct devlink_port *port;
        unsigned long index;
        int ret;

        xa_for_each_start(&devlink->ports, index, port, state->idx) {
                ret = devlink_nl_port_fill(msg, port, DEVLINK_CMD_PORT_NEW,
                                           NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, flags,
                                           cb->extack);
                if (!ret)
                        continue;

                /* Remember which port failed before bailing out. */
                state->idx = index;
                return ret;
        }

        return 0;
}

/* Port dump entry point: run the generic devlink dump with the per-instance port dumper. */
int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        int ret;

        ret = devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one);
        return ret;
}

static int devlink_port_type_set(struct devlink_port *devlink_port,
                                 enum devlink_port_type port_type)

{
        int err;

        if (!devlink_port->ops->port_type_set)
                return -EOPNOTSUPP;

        if (port_type == devlink_port->type)
                return 0;

        err = devlink_port->ops->port_type_set(devlink_port, port_type);
        if (err)
                return err;

        devlink_port->desired_type = port_type;
        devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
        return 0;
}

static int devlink_port_function_hw_addr_set(struct devlink_port *port,
                                             const struct nlattr *attr,
                                             struct netlink_ext_ack *extack)
{
        const u8 *hw_addr;
        int hw_addr_len;

        hw_addr = nla_data(attr);
        hw_addr_len = nla_len(attr);
        if (hw_addr_len > MAX_ADDR_LEN) {
                NL_SET_ERR_MSG(extack, "Port function hardware address too long");
                return -EINVAL;
        }
        if (port->type == DEVLINK_PORT_TYPE_ETH) {
                if (hw_addr_len != ETH_ALEN) {
                        NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device");
                        return -EINVAL;
                }
                if (!is_unicast_ether_addr(hw_addr)) {
                        NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported");
                        return -EINVAL;
                }
        }

        return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len,
                                              extack);
}

static int devlink_port_fn_state_set(struct devlink_port *port,
                                     const struct nlattr *attr,
                                     struct netlink_ext_ack *extack)
{
        enum devlink_port_fn_state state;

        state = nla_get_u8(attr);
        return port->ops->port_fn_state_set(port, state, extack);
}

static int devlink_port_function_validate(struct devlink_port *devlink_port,
                                          struct nlattr **tb,
                                          struct netlink_ext_ack *extack)
{
        const struct devlink_port_ops *ops = devlink_port->ops;
        struct nlattr *attr;

        if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
            !ops->port_fn_hw_addr_set) {
                NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
                                    "Port doesn't support function attributes");
                return -EOPNOTSUPP;
        }
        if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) {
                NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE],
                                    "Function does not support state setting");
                return -EOPNOTSUPP;
        }
        attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
        if (attr) {
                struct nla_bitfield32 caps;

                caps = nla_get_bitfield32(attr);
                if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
                    !ops->port_fn_roce_set) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Port doesn't support RoCE function attribute");
                        return -EOPNOTSUPP;
                }
                if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
                        if (!ops->port_fn_migratable_set) {
                                NL_SET_ERR_MSG_ATTR(extack, attr,
                                                    "Port doesn't support migratable function attribute");
                                return -EOPNOTSUPP;
                        }
                        if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
                                NL_SET_ERR_MSG_ATTR(extack, attr,
                                                    "migratable function attribute supported for VFs only");
                                return -EOPNOTSUPP;
                        }
                }
                if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) {
                        if (!ops->port_fn_ipsec_crypto_set) {
                                NL_SET_ERR_MSG_ATTR(extack, attr,
                                                    "Port doesn't support ipsec_crypto function attribute");
                                return -EOPNOTSUPP;
                        }
                        if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
                                NL_SET_ERR_MSG_ATTR(extack, attr,
                                                    "ipsec_crypto function attribute supported for VFs only");
                                return -EOPNOTSUPP;
                        }
                }
                if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) {
                        if (!ops->port_fn_ipsec_packet_set) {
                                NL_SET_ERR_MSG_ATTR(extack, attr,
                                                    "Port doesn't support ipsec_packet function attribute");
                                return -EOPNOTSUPP;
                        }
                        if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
                                NL_SET_ERR_MSG_ATTR(extack, attr,
                                                    "ipsec_packet function attribute supported for VFs only");
                                return -EOPNOTSUPP;
                        }
                }
        }
        if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] &&
            !ops->port_fn_max_io_eqs_set) {
                NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS],
                                    "Function does not support max_io_eqs setting");
                return -EOPNOTSUPP;
        }
        return 0;
}

/* Parse the nested port function attribute @attr and apply every attribute
 * it carries to @port.
 *
 * The nested attributes are parsed against devlink_function_nl_policy and
 * checked by devlink_port_function_validate() before anything is applied.
 * Attributes are then applied in the order hardware address, capabilities,
 * max_io_eqs, state; the first failure is returned immediately. When all
 * steps succeed devlink_port_notify() is called with DEVLINK_CMD_PORT_NEW
 * and 0 is returned.
 */
static int devlink_port_function_set(struct devlink_port *port,
                                     const struct nlattr *attr,
                                     struct netlink_ext_ack *extack)
{
        struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
        int err;

        err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr,
                               devlink_function_nl_policy, extack);
        if (err < 0) {
                NL_SET_ERR_MSG(extack, "Fail to parse port function attributes");
                return err;
        }

        err = devlink_port_function_validate(port, tb, extack);
        if (err)
                return err;

        if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR]) {
                err = devlink_port_function_hw_addr_set(port,
                                                        tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
                                                        extack);
                if (err)
                        return err;
        }

        if (tb[DEVLINK_PORT_FN_ATTR_CAPS]) {
                err = devlink_port_fn_caps_set(port,
                                               tb[DEVLINK_PORT_FN_ATTR_CAPS],
                                               extack);
                if (err)
                        return err;
        }

        if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS]) {
                err = devlink_port_fn_max_io_eqs_set(port,
                                                     tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS],
                                                     extack);
                if (err)
                        return err;
        }

        /* The state must stay the last attribute applied: when it is sent
         * together with other function attributes, those have to be in
         * place before the state is activated.
         */
        if (tb[DEVLINK_PORT_FN_ATTR_STATE]) {
                err = devlink_port_fn_state_set(port,
                                                tb[DEVLINK_PORT_FN_ATTR_STATE],
                                                extack);
                if (err)
                        return err;
        }

        devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
        return 0;
}

/* Netlink handler for the port set command.
 *
 * The port is taken from info->user_ptr[1]. When DEVLINK_ATTR_PORT_TYPE is
 * present the port type is changed first; when DEVLINK_ATTR_PORT_FUNCTION is
 * present the nested function attributes are applied afterwards. Returns 0
 * on success or the error of the first step that fails.
 */
int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct devlink_port *devlink_port = info->user_ptr[1];
        struct nlattr *type_attr = info->attrs[DEVLINK_ATTR_PORT_TYPE];
        struct nlattr *fn_attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
        int err;

        if (type_attr) {
                err = devlink_port_type_set(devlink_port,
                                            nla_get_u16(type_attr));
                if (err)
                        return err;
        }

        if (!fn_attr)
                return 0;

        return devlink_port_function_set(devlink_port, fn_attr, info->extack);
}

/* Netlink handler for the port split command.
 *
 * Requires DEVLINK_ATTR_PORT_SPLIT_COUNT and a port_split callback in the
 * port ops. The port must be marked splittable, and the requested count must
 * be a power of two, at least 2 and no larger than the port's lane count.
 * Returns -EINVAL or -EOPNOTSUPP when a check fails, otherwise the result of
 * the port_split callback.
 */
int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct devlink_port *devlink_port = info->user_ptr[1];
        struct devlink *devlink = info->user_ptr[0];
        u32 count;

        if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT))
                return -EINVAL;
        if (!devlink_port->ops->port_split)
                return -EOPNOTSUPP;

        if (!devlink_port->attrs.splittable) {
                /* Ports that are already split get a more specific message. */
                if (!devlink_port->attrs.split)
                        NL_SET_ERR_MSG(info->extack, "Port cannot be split");
                else
                        NL_SET_ERR_MSG(info->extack, "Port cannot be split further");
                return -EINVAL;
        }

        count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
        if (count < 2 || count > devlink_port->attrs.lanes ||
            !is_power_of_2(count)) {
                NL_SET_ERR_MSG(info->extack, "Invalid split count");
                return -EINVAL;
        }

        return devlink_port->ops->port_split(devlink, devlink_port, count,
                                             info->extack);
}

/* Netlink handler for the port unsplit command: forwards the request to the
 * port's port_unsplit callback and returns its result, or -EOPNOTSUPP when
 * the port ops do not provide that callback.
 */
int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct devlink *devlink = info->user_ptr[0];
        struct devlink_port *devlink_port = info->user_ptr[1];

        if (devlink_port->ops->port_unsplit)
                return devlink_port->ops->port_unsplit(devlink, devlink_port,
                                                       info->extack);

        return -EOPNOTSUPP;
}

int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct netlink_ext_ack *extack = info->extack;
        struct devlink_port_new_attrs new_attrs = {};
        struct devlink *devlink = info->user_ptr[0];
        struct devlink_port *devlink_port;
        struct sk_buff *msg;
        int err;

        if (!devlink->ops->port_new)
                return -EOPNOTSUPP;

        if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] ||
            !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
                NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified");
                return -EINVAL;
        }
        new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]);
        new_attrs.pfnum =
                nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);

        if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
                /* Port index of the new port being created by driver. */
                new_attrs.port_index =
                        nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
                new_attrs.port_index_valid = true;
        }
        if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) {
                new_attrs.controller =
                        nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]);
                new_attrs.controller_valid = true;
        }
        if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF &&
            info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) {
                new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]);
                new_attrs.sfnum_valid = true;
        }

        err = devlink->ops->port_new(devlink, &new_attrs,
                                     extack, &devlink_port);
        if (err)
                return err;

        msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!msg) {
                err = -ENOMEM;
                goto err_out_port_del;
        }
        err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
                                   info->snd_portid, info->snd_seq, 0, NULL);
        if (WARN_ON_ONCE(err))
                goto err_out_msg_free;
        err = genlmsg_reply(msg, info);
        if (err)
                goto err_out_port_del;
        return 0;

err_out_msg_free:
        nlmsg_free(msg);
err_out_port_del:
        devlink_port->ops->port_del(devlink, devlink_port, NULL);
        return err;
}

/* Netlink handler for the port del command: forwards the request to the
 * port's port_del callback and returns its result, or -EOPNOTSUPP when the
 * port ops do not provide that callback.
 */
int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct devlink *devlink = info->user_ptr[0];
        struct devlink_port *devlink_port = info->user_ptr[1];

        if (devlink_port->ops->port_del)
                return devlink_port->ops->port_del(devlink, devlink_port,
                                                   info->extack);

        return -EOPNOTSUPP;
}

/* Delayed-work handler for devlink_port->type_warn_dw.
 *
 * Recovers the owning devlink_port from @work and emits a warning on the
 * devlink device saying that no port type was set. The work is queued by
 * devlink_port_type_warn_schedule().
 */
static void devlink_port_type_warn(struct work_struct *work)
{
        struct devlink_port *port = container_of(to_delayed_work(work),
                                                 struct devlink_port,
                                                 type_warn_dw);

        /* Terminate the message with a newline like the other dev_warn()
         * calls in this file, so the log line is explicitly ended.
         */
        dev_warn(port->devlink->dev, "Type was not set for devlink port.\n");
}

/* Tell whether the "type was not set" warning applies to @devlink_port.
 * Ports with the CPU, DSA or UNUSED flavour are exempt.
 */
static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
{
        return !(devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_CPU ||
                 devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_DSA ||
                 devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_UNUSED);
}

/* Delay handed to schedule_delayed_work() before the "type was not set"
 * warning work runs: HZ * 3600, i.e. 3600 seconds assuming HZ is the number
 * of jiffies per second.
 */
#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)

/* Queue the delayed work that warns when the driver has not set a port type
 * within DEVLINK_PORT_TYPE_WARN_TIMEOUT. Nothing is queued for ports that
 * devlink_port_type_should_warn() exempts.
 */
static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
{
        if (devlink_port_type_should_warn(devlink_port))
                schedule_delayed_work(&devlink_port->type_warn_dw,
                                      DEVLINK_PORT_TYPE_WARN_TIMEOUT);
}

/* Cancel the pending "type was not set" warning work via
 * cancel_delayed_work_sync(). Ports exempted by
 * devlink_port_type_should_warn() are left untouched.
 */
static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
{
        if (devlink_port_type_should_warn(devlink_port))
                cancel_delayed_work_sync(&devlink_port->type_warn_dw);
}

/**
 * devlink_port_init() - Init devlink port
 *
 * @devlink: devlink
 * @devlink_port: devlink port
 *
 * Set up the state needed by functions that may be called before the
 * devlink port is registered: the back-pointer to @devlink and the port's
 * region list. Calling this function is optional; drivers that do not use
 * such functions can skip it. Calling it again on an already initialized
 * port does nothing.
 */
void devlink_port_init(struct devlink *devlink,
                       struct devlink_port *devlink_port)
{
        if (!devlink_port->initialized) {
                devlink_port->devlink = devlink;
                INIT_LIST_HEAD(&devlink_port->region_list);
                devlink_port->initialized = true;
        }
}
EXPORT_SYMBOL_GPL(devlink_port_init);

/**
 * devlink_port_fini() - Deinitialize devlink port
 *
 * @devlink_port: devlink port
 *
 * Deinitialize essential stuff that is in use for functions
 * that may be called after devlink port unregistration.
 * Call to this function is optional and not needed
 * in case the driver does not use such functions.
 *
 * Currently this only warns if the port's region list is not empty.
 */
void devlink_port_fini(struct devlink_port *devlink_port)
{
        /* All port regions are expected to be gone by now. */
        WARN_ON(!list_empty(&devlink_port->region_list));
}
EXPORT_SYMBOL_GPL(devlink_port_fini);

/* Empty ops table installed by devl_port_register_with_ops() when the caller
 * passes NULL ops, so devlink_port->ops is never NULL after registration.
 */
static const struct devlink_port_ops devlink_port_dummy_ops = {};

/**
 * devl_port_register_with_ops() - Register devlink port
 *
 * @devlink: devlink
 * @devlink_port: devlink port
 * @port_index: driver-specific numerical identifier of the port
 * @ops: port ops, may be NULL in which case an empty ops table is used
 *
 * Register devlink port with provided port index. User can use
 * any indexing, even hw-related one. devlink_port structure
 * is convenient to be embedded inside user driver private structure.
 * Note that the caller should take care of zeroing the devlink_port
 * structure.
 *
 * The devlink instance lock must be held by the caller.
 *
 * Return: 0 on success, or the error returned by xa_insert() when the port
 * cannot be stored under @port_index.
 */
int devl_port_register_with_ops(struct devlink *devlink,
                                struct devlink_port *devlink_port,
                                unsigned int port_index,
                                const struct devlink_port_ops *ops)
{
        int ret;

        devl_assert_locked(devlink);
        ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);

        devlink_port_init(devlink, devlink_port);
        devlink_port->registered = true;
        devlink_port->index = port_index;

        /* Fall back to the empty ops table so ops is never NULL. */
        if (!ops)
                ops = &devlink_port_dummy_ops;
        devlink_port->ops = ops;

        spin_lock_init(&devlink_port->type_lock);
        INIT_LIST_HEAD(&devlink_port->reporter_list);

        ret = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL);
        if (ret) {
                /* Insertion failed, so the port is not registered. */
                devlink_port->registered = false;
                return ret;
        }

        INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
        devlink_port_type_warn_schedule(devlink_port);
        devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);

        return 0;
}
EXPORT_SYMBOL_GPL(devl_port_register_with_ops);

/**
 * devlink_port_register_with_ops() - Register devlink port
 *
 * @devlink: devlink
 * @devlink_port: devlink port
 * @port_index: driver-specific numerical identifier of the port
 * @ops: port ops
 *
 * Locking wrapper around devl_port_register_with_ops(). Register devlink
 * port with provided port index. User can use any indexing, even hw-related
 * one. devlink_port structure is convenient to be embedded inside user
 * driver private structure. Note that the caller should take care of
 * zeroing the devlink_port structure.
 *
 * Context: Takes and releases the devlink instance lock through
 *          devl_lock() / devl_unlock().
 *
 * Return: the result of devl_port_register_with_ops().
 */
int devlink_port_register_with_ops(struct devlink *devlink,
                                   struct devlink_port *devlink_port,
                                   unsigned int port_index,
                                   const struct devlink_port_ops *ops)
{
        int ret;

        devl_lock(devlink);
        ret = devl_port_register_with_ops(devlink, devlink_port, port_index,
                                          ops);
        devl_unlock(devlink);

        return ret;
}
EXPORT_SYMBOL_GPL(devlink_port_register_with_ops);

/**
 * devl_port_unregister() - Unregister devlink port
 *
 * @devlink_port: devlink port
 *
 * The caller must hold the lock of the devlink instance the port belongs
 * to. Warns if the port type is still set or if the port's reporter list is
 * not empty.
 */
void devl_port_unregister(struct devlink_port *devlink_port)
{
        lockdep_assert_held(&devlink_port->devlink->lock);
        /* The type is expected to have been cleared before unregistering. */
        WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET);

        devlink_port_type_warn_cancel(devlink_port);
        devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
        /* Drop the port from the instance's index -> port xarray. */
        xa_erase(&devlink_port->devlink->ports, devlink_port->index);
        WARN_ON(!list_empty(&devlink_port->reporter_list));
        devlink_port->registered = false;
}
EXPORT_SYMBOL_GPL(devl_port_unregister);

/**
 * devlink_port_unregister() - Unregister devlink port
 *
 * @devlink_port: devlink port
 *
 * Locking wrapper around devl_port_unregister().
 *
 * Context: Takes and releases the devlink instance lock through
 *          devl_lock() / devl_unlock().
 */
void devlink_port_unregister(struct devlink_port *devlink_port)
{
        /* Cache the instance pointer before the port is unregistered. */
        struct devlink *owner = devlink_port->devlink;

        devl_lock(owner);
        devl_port_unregister(devlink_port);
        devl_unlock(owner);
}
EXPORT_SYMBOL_GPL(devlink_port_unregister);

/* Sanity-check the ndos of a netdev that is linked to a devlink port.
 *
 * A driver that registers a devlink port should set the port attributes so
 * that the compat functions are used instead of its own ndos. Some drivers
 * share one set of ndos between netdevs with and without a devlink port;
 * for those, ndo_get_phys_port_name and ndo_get_port_parent_id (when
 * defined) must return -EOPNOTSUPP here. Warn when they do not.
 */
static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
                                            struct net_device *netdev)
{
        const struct net_device_ops *ops = netdev->netdev_ops;
        struct netdev_phys_item_id ppid;
        char name[IFNAMSIZ];
        int ret;

        if (ops->ndo_get_phys_port_name) {
                ret = ops->ndo_get_phys_port_name(netdev, name, sizeof(name));
                WARN_ON(ret != -EOPNOTSUPP);
        }

        if (ops->ndo_get_port_parent_id) {
                ret = ops->ndo_get_port_parent_id(netdev, &ppid);
                WARN_ON(ret != -EOPNOTSUPP);
        }
}

/* Set the type of a registered port together with its type-specific device.
 *
 * @type_dev is stored as the netdev for DEVLINK_PORT_TYPE_ETH and as the
 * ibdev for DEVLINK_PORT_TYPE_IB; it is ignored for other types. Setting
 * DEVLINK_PORT_TYPE_NOTSET re-arms the "type was not set" warning, any other
 * type cancels it. The type fields are updated under type_lock and
 * devlink_port_notify() is called with DEVLINK_CMD_PORT_NEW afterwards.
 */
static void __devlink_port_type_set(struct devlink_port *devlink_port,
                                    enum devlink_port_type type,
                                    void *type_dev)
{
        struct net_device *netdev = type_dev;

        ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);

        if (type != DEVLINK_PORT_TYPE_NOTSET) {
                devlink_port_type_warn_cancel(devlink_port);
                if (netdev && type == DEVLINK_PORT_TYPE_ETH)
                        devlink_port_type_netdev_checks(devlink_port, netdev);
        } else {
                devlink_port_type_warn_schedule(devlink_port);
        }

        spin_lock_bh(&devlink_port->type_lock);
        devlink_port->type = type;
        if (type == DEVLINK_PORT_TYPE_ETH) {
                devlink_port->type_eth.netdev = netdev;
                if (netdev) {
                        ASSERT_RTNL();
                        devlink_port->type_eth.ifindex = netdev->ifindex;
                        /* Equal buffer sizes make the strcpy() below safe. */
                        BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) !=
                                     sizeof(netdev->name));
                        strcpy(devlink_port->type_eth.ifname, netdev->name);
                }
        } else if (type == DEVLINK_PORT_TYPE_IB) {
                devlink_port->type_ib.ibdev = type_dev;
        }
        spin_unlock_bh(&devlink_port->type_lock);

        devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}

/**
 *        devlink_port_type_eth_set - Set port type to Ethernet
 *
 *        @devlink_port: devlink port
 *
 *        If driver is calling this, most likely it is doing something wrong.
 *        The type is set without a netdev reference, and a warning is
 *        always logged on the devlink device.
 */
void devlink_port_type_eth_set(struct devlink_port *devlink_port)
{
        dev_warn(devlink_port->devlink->dev,
                 "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
                 devlink_port->index);
        /* NULL: no netdev is associated with the Ethernet type here. */
        __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL);
}
EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);

/**
 *        devlink_port_type_ib_set - Set port type to InfiniBand
 *
 *        @devlink_port: devlink port
 *        @ibdev: related IB device
 *
 *        Thin wrapper around __devlink_port_type_set() that passes
 *        DEVLINK_PORT_TYPE_IB and @ibdev as the type device.
 */
void devlink_port_type_ib_set(struct devlink_port *devlink_port,
                              struct ib_device *ibdev)
{
        __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev);
}
EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);

/**
 *        devlink_port_type_clear - Clear port type
 *
 *        @devlink_port: devlink port
 *
 *        If driver is calling this for clearing Ethernet type, most likely
 *        it is doing something wrong. A warning is logged in that case
 *        before the type is reset to DEVLINK_PORT_TYPE_NOTSET.
 */
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
        /* Only the Ethernet type triggers the warning. */
        if (devlink_port->type == DEVLINK_PORT_TYPE_ETH)
                dev_warn(devlink_port->devlink->dev,
                         "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n",
                         devlink_port->index);
        __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);

/* Netdevice notifier callback that keeps the devlink port type and netdev
 * reference in sync with the lifecycle of the netdev linked to the port.
 *
 * Netdevs without a devlink_port are ignored. Always returns NOTIFY_OK.
 */
int devlink_port_netdevice_event(struct notifier_block *nb,
                                 unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct devlink_port *devlink_port = netdev->devlink_port;

        if (!devlink_port)
                return NOTIFY_OK;

        switch (event) {
        case NETDEV_POST_INIT:
                /* Set the type but not netdev pointer. It is going to be set
                 * later on by NETDEV_REGISTER event. Happens once during
                 * netdevice register
                 */
                __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
                                        NULL);
                break;
        case NETDEV_PRE_UNINIT:
                /* Clear the type and the netdev pointer. Happens once during
                 * netdevice unregister.
                 */
                __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET,
                                        NULL);
                break;
        case NETDEV_REGISTER:
        case NETDEV_CHANGENAME:
        case NETDEV_UNREGISTER:
                /* These events also happen during a net namespace change, so
                 * only act on them when the netdev is in the devlink
                 * instance's namespace.
                 */
                if (devlink_net(devlink_port->devlink) != dev_net(netdev))
                        break;
                /* Keep the previously set type. REGISTER and CHANGENAME set
                 * the netdev pointer as it appears in this namespace;
                 * UNREGISTER clears it because the netdev is leaving for
                 * another namespace or going away.
                 */
                __devlink_port_type_set(devlink_port, devlink_port->type,
                                        event == NETDEV_UNREGISTER ? NULL : netdev);
                break;
        }

        return NOTIFY_OK;
}

/* Common part of the port attribute setters.
 *
 * Marks the port attributes as set, stores @flavour, and derives
 * switch_port from whether attrs.switch_id has a non-zero length. An
 * over-long switch id triggers a warning and is clamped to
 * MAX_PHYS_ITEM_ID_LEN. Always returns 0.
 */
static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
                                    enum devlink_port_flavour flavour)
{
        struct devlink_port_attrs *attrs = &devlink_port->attrs;

        devlink_port->attrs_set = true;
        attrs->flavour = flavour;

        if (!attrs->switch_id.id_len) {
                devlink_port->switch_port = false;
                return 0;
        }

        devlink_port->switch_port = true;
        if (WARN_ON(attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN))
                attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN;

        return 0;
}

/**
 * devlink_port_attrs_set() - Set port attributes
 *
 * @devlink_port: devlink port
 * @attrs: devlink port attrs
 *
 * Copies @attrs into the port. Must be called before the port is
 * registered. Warns when @attrs marks the port as both splittable and
 * split.
 */
void devlink_port_attrs_set(struct devlink_port *devlink_port,
                            struct devlink_port_attrs *attrs)
{
        ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);

        devlink_port->attrs = *attrs;
        if (__devlink_port_attrs_set(devlink_port, attrs->flavour))
                return;

        WARN_ON(attrs->splittable && attrs->split);
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);

/**
 * devlink_port_attrs_pci_pf_set() - Set PCI PF port attributes
 *
 * @devlink_port: devlink port
 * @controller: associated controller number for the devlink port instance
 * @pf: associated PCI function number for the devlink port instance
 * @external: indicates if the port is for an external controller
 *
 * Sets the port flavour to DEVLINK_PORT_FLAVOUR_PCI_PF and records the PF
 * details. Must be called before the port is registered.
 */
void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
                                   u16 pf, bool external)
{
        ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);

        if (__devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF))
                return;

        devlink_port->attrs.pci_pf.controller = controller;
        devlink_port->attrs.pci_pf.pf = pf;
        devlink_port->attrs.pci_pf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);

/**
 * devlink_port_attrs_pci_vf_set() - Set PCI VF port attributes
 *
 * @devlink_port: devlink port
 * @controller: associated controller number for the devlink port instance
 * @pf: associated PCI function number for the devlink port instance
 * @vf: associated PCI VF number of a PF for the devlink port instance;
 *      VF number starts from 0 for the first PCI virtual function
 * @external: indicates if the port is for an external controller
 *
 * Sets the port flavour to DEVLINK_PORT_FLAVOUR_PCI_VF and records the VF
 * details. Must be called before the port is registered.
 */
void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
                                   u16 pf, u16 vf, bool external)
{
        ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);

        if (__devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF))
                return;

        devlink_port->attrs.pci_vf.controller = controller;
        devlink_port->attrs.pci_vf.pf = pf;
        devlink_port->attrs.pci_vf.vf = vf;
        devlink_port->attrs.pci_vf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);

/**
 * devlink_port_attrs_pci_sf_set() - Set PCI SF port attributes
 *
 * @devlink_port: devlink port
 * @controller: associated controller number for the devlink port instance
 * @pf: associated PCI function number for the devlink port instance
 * @sf: associated SF number of a PF for the devlink port instance
 * @external: indicates if the port is for an external controller
 *
 * Sets the port flavour to DEVLINK_PORT_FLAVOUR_PCI_SF and records the SF
 * details. Must be called before the port is registered.
 */
void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
                                   u16 pf, u32 sf, bool external)
{
        ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);

        if (__devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF))
                return;

        devlink_port->attrs.pci_sf.controller = controller;
        devlink_port->attrs.pci_sf.pf = pf;
        devlink_port->attrs.pci_sf.sf = sf;
        devlink_port->attrs.pci_sf.external = external;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);

/* Relation notify callback passed to devlink_rel_nested_in_add(): looks up
 * the port by @port_index and, if it exists, calls devlink_port_notify()
 * with DEVLINK_CMD_PORT_NEW for it.
 */
static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index)
{
        struct devlink_port *devlink_port =
                devlink_port_get_by_index(devlink, port_index);

        if (devlink_port)
                devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
}

/* Relation cleanup callback passed to devlink_rel_nested_in_add(): resets
 * the port's rel_index to 0, but only when the port still exists and still
 * refers to @rel_index.
 */
static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index,
                                        u32 rel_index)
{
        struct devlink_port *devlink_port =
                devlink_port_get_by_index(devlink, port_index);

        if (!devlink_port)
                return;
        if (devlink_port->rel_index != rel_index)
                return;

        devlink_port->rel_index = 0;
}

/**
 * devl_port_fn_devlink_set - Attach peer devlink
 *                              instance to port function.
 * @devlink_port: devlink port
 * @fn_devlink: devlink instance to attach
 *
 * The port must already be registered. Only ports with the PCI SF flavour
 * that are not marked external are accepted.
 *
 * Return: -EINVAL (with a warning) for any other port, otherwise the result
 * of devlink_rel_nested_in_add().
 */
int devl_port_fn_devlink_set(struct devlink_port *devlink_port,
                             struct devlink *fn_devlink)
{
        ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);

        if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF ||
                    devlink_port->attrs.pci_sf.external))
                return -EINVAL;

        /* The relation index is stored in devlink_port->rel_index; the two
         * callbacks are given the devlink index and port index passed here.
         */
        return devlink_rel_nested_in_add(&devlink_port->rel_index,
                                         devlink_port->devlink->index,
                                         devlink_port->index,
                                         devlink_port_rel_notify_cb,
                                         devlink_port_rel_cleanup_cb,
                                         fn_devlink);
}
EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set);

/**
 *        devlink_port_linecard_set - Link port with a linecard
 *
 *        @devlink_port: devlink port
 *        @linecard: devlink linecard
 *
 *        Stores @linecard in the port. Must be called before the port is
 *        registered.
 */
void devlink_port_linecard_set(struct devlink_port *devlink_port,
                               struct devlink_linecard *linecard)
{
        ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);

        devlink_port->linecard = linecard;
}
EXPORT_SYMBOL_GPL(devlink_port_linecard_set);

/* Build the physical port name of @devlink_port into @name, using at most
 * @len bytes, from the port attributes.
 *
 * Name layout per flavour, as produced by the format strings below
 * (parts in [] are optional):
 *   physical: [l<linecard index>]p<port number>[s<split subport number>]
 *   PCI PF:   [c<controller>]pf<pf>
 *   PCI VF:   [c<controller>]pf<pf>vf<vf>
 *   PCI SF:   [c<controller>]pf<pf>sf<sf>
 * The "c<controller>" prefix is only emitted when the function is marked
 * external; the "l<index>" prefix only when the port has a linecard; the
 * "s<n>" suffix only when the port is split.
 *
 * Return: 0 on success, -EOPNOTSUPP if the port attributes are not set or
 * the flavour is virtual, -EINVAL if the name does not fit into @len or the
 * flavour is CPU, DSA or unused.
 */
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
                                             char *name, size_t len)
{
        struct devlink_port_attrs *attrs = &devlink_port->attrs;
        int n = 0;

        if (!devlink_port->attrs_set)
                return -EOPNOTSUPP;

        switch (attrs->flavour) {
        case DEVLINK_PORT_FLAVOUR_PHYSICAL:
                /* n accumulates the length written so far; each further
                 * part is only appended while the name is not yet truncated
                 */
                if (devlink_port->linecard)
                        n = snprintf(name, len, "l%u",
                                     devlink_linecard_index(devlink_port->linecard));
                if (n < len)
                        n += snprintf(name + n, len - n, "p%u",
                                      attrs->phys.port_number);
                if (n < len && attrs->split)
                        n += snprintf(name + n, len - n, "s%u",
                                      attrs->phys.split_subport_number);
                break;
        case DEVLINK_PORT_FLAVOUR_CPU:
        case DEVLINK_PORT_FLAVOUR_DSA:
        case DEVLINK_PORT_FLAVOUR_UNUSED:
                /* CPU and DSA ports do not have an associated netdevice,
                 * so this case should never happen.
                 */
                WARN_ON(1);
                return -EINVAL;
        case DEVLINK_PORT_FLAVOUR_PCI_PF:
                if (attrs->pci_pf.external) {
                        n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
                        if (n >= len)
                                return -EINVAL;
                        /* continue behind the controller prefix so that n
                         * below is relative to the remaining space
                         */
                        len -= n;
                        name += n;
                }
                n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
                break;
        case DEVLINK_PORT_FLAVOUR_PCI_VF:
                if (attrs->pci_vf.external) {
                        n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
                        if (n >= len)
                                return -EINVAL;
                        /* continue behind the controller prefix */
                        len -= n;
                        name += n;
                }
                n = snprintf(name, len, "pf%uvf%u",
                             attrs->pci_vf.pf, attrs->pci_vf.vf);
                break;
        case DEVLINK_PORT_FLAVOUR_PCI_SF:
                if (attrs->pci_sf.external) {
                        n = snprintf(name, len, "c%u", attrs->pci_sf.controller);
                        if (n >= len)
                                return -EINVAL;
                        /* continue behind the controller prefix */
                        len -= n;
                        name += n;
                }
                n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
                             attrs->pci_sf.sf);
                break;
        case DEVLINK_PORT_FLAVOUR_VIRTUAL:
                return -EOPNOTSUPP;
        }

        /* the generated name (or its last part) did not fit */
        if (n >= len)
                return -EINVAL;

        return 0;
}

/* Compat helper: write the physical port name of the devlink port attached
 * to @dev into @name (at most @len bytes).
 *
 * Return: -EOPNOTSUPP if @dev has no devlink port, otherwise the result of
 * __devlink_port_phys_port_name_get().
 */
int devlink_compat_phys_port_name_get(struct net_device *dev,
                                      char *name, size_t len)
{
        struct devlink_port *port;

        /* With the RTNL mutex held the devlink_port instance cannot go
         * away underneath us. Only permanent values are read, so no
         * devlink lock is needed.
         */
        ASSERT_RTNL();

        port = dev->devlink_port;
        if (port)
                return __devlink_port_phys_port_name_get(port, name, len);

        return -EOPNOTSUPP;
}

/* Compat helper: copy the switch ID of the devlink port attached to @dev
 * into @ppid.
 *
 * Return: 0 on success, -EOPNOTSUPP if @dev has no devlink port or the port
 * is not a switch port.
 */
int devlink_compat_switch_id_get(struct net_device *dev,
                                 struct netdev_phys_item_id *ppid)
{
        /* The caller has to hold the RTNL mutex or a reference to dev, so
         * the devlink_port instance cannot go away underneath us. Only
         * permanent values are read, so no devlink lock is needed.
         */
        struct devlink_port *port = dev->devlink_port;

        if (!port)
                return -EOPNOTSUPP;
        if (!port->switch_port)
                return -EOPNOTSUPP;

        memcpy(ppid, &port->attrs.switch_id, sizeof(*ppid));
        return 0;
}

















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 






















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN
 *
 * This implementation does not provide ISO-TP specific return values to the
 * userspace.
 *
 * - RX path timeout of data reception leads to -ETIMEDOUT
 * - RX path SN mismatch leads to -EILSEQ
 * - RX path data reception with wrong padding leads to -EBADMSG
 * - TX path flowcontrol reception timeout leads to -ECOMM
 * - TX path flowcontrol reception overflow leads to -EMSGSIZE
 * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG
 * - when a transfer (tx) is on the run the next write() blocks until it's done
 * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent
 * - as we have static buffers the check whether the PDU fits into the buffer
 *   is done at FF reception time (no support for sending 'wait frames')
 *
 * Copyright (c) 2020 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/hrtimer.h>
#include <linux/wait.h>
#include <linux/uio.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/socket.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/isotp.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/net_namespace.h>

MODULE_DESCRIPTION("PF_CAN ISO 15765-2 transport protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>");
MODULE_ALIAS("can-proto-6");

/* sockaddr_can size needed to cover the can_addr.tp member
 * NOTE(review): relies on CAN_REQUIRED_SIZE(), which is defined outside
 * this file - confirm its exact semantics there.
 */
#define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)

/* CAN ID mask built from @id: all extended (EFF) identifier bits if @id has
 * CAN_EFF_FLAG set, otherwise all standard (SFF) identifier bits; in both
 * cases the EFF and RTR flag bits are part of the mask as well.
 */
#define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \
                         (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
                         (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))

/* Since ISO 15765-2:2016 the CAN isotp protocol supports more than 4095
 * bytes per ISO PDU as the FF_DL can take full 32 bit values (4 Gbyte).
 * We would need some good concept to handle this between user space and
 * kernel space. For now set the static buffer to about 8 kbyte to be
 * able to test this new functionality.
 */
#define DEFAULT_MAX_PDU_SIZE 8300

/* maximum PDU size before the ISO 15765-2:2016 extension was 4095 bytes */
#define MAX_12BIT_PDU_SIZE 4095

/* limit the isotp pdu size from the optional module parameter to 1MByte
 * NOTE(review): the value is 1025 * 1024 bytes, i.e. slightly more than
 * 1 MByte - confirm that the additional kbyte is intended.
 */
#define MAX_PDU_SIZE (1025 * 1024U)

/* module parameter (mode 0444), defaults to DEFAULT_MAX_PDU_SIZE */
static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE;
module_param(max_pdu_size, uint, 0444);
MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default "
                 __stringify(DEFAULT_MAX_PDU_SIZE) ")");

/* N_PCI type values in bits 7-4 of N_PCI bytes */
#define N_PCI_SF 0x00        /* single frame */
#define N_PCI_FF 0x10        /* first frame */
#define N_PCI_CF 0x20        /* consecutive frame */
#define N_PCI_FC 0x30        /* flow control */

/* sizes in bytes of the PCI (protocol control information) variants */
#define N_PCI_SZ 1        /* size of the PCI byte #1 */
#define SF_PCI_SZ4 1        /* size of SingleFrame PCI including 4 bit SF_DL */
#define SF_PCI_SZ8 2        /* size of SingleFrame PCI including 8 bit SF_DL */
#define FF_PCI_SZ12 2        /* size of FirstFrame PCI including 12 bit FF_DL */
#define FF_PCI_SZ32 6        /* size of FirstFrame PCI including 32 bit FF_DL */
#define FC_CONTENT_SZ 3        /* flow control content size in bytes (FS/BS/STmin) */

/* any of the padding check options (length and/or content) */
#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA)
/* all broadcast mode options (SF and CF broadcast) */
#define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST)

/* Flow Status given in FC frame (lower nibble of the FC N_PCI byte) */
#define ISOTP_FC_CTS 0                /* clear to send */
#define ISOTP_FC_WT 1                /* wait */
#define ISOTP_FC_OVFLW 2        /* overflow */

/* timeouts in seconds, used as the seconds argument of ktime_set() */
#define ISOTP_FC_TIMEOUT 1        /* 1 sec */
#define ISOTP_ECHO_TIMEOUT 2        /* 2 secs */

/* states of the rx and tx state machines (struct tpcon.state) */
enum {
        /* no transfer in progress; also the state after errors/timeouts */
        ISOTP_IDLE = 0,
        /* tx: waiting for the first FC frame, which always provides the
         * communication parameters (BS/STmin)
         */
        ISOTP_WAIT_FIRST_FC,
        /* tx: waiting for a (further) FC frame */
        ISOTP_WAIT_FC,
        /* rx: waiting for data frames, guarded by the rx timer */
        ISOTP_WAIT_DATA,
        /* tx: sending consecutive frames after a 'clear to send' FC */
        ISOTP_SENDING,
        /* NOTE(review): not used in the code visible here; presumably set
         * while the socket is being released - confirm at its users
         */
        ISOTP_SHUTDOWN,
};

/* per direction (rx/tx) transfer state of an isotp socket */
struct tpcon {
        /* PDU buffer in use; the FF reception code may replace it with a
         * kmalloc()ed buffer of max_pdu_size bytes
         */
        u8 *buf;
        /* size of the buffer buf points to */
        unsigned int buflen;
        /* length of the PDU (rx: the FF_DL taken from the first frame) */
        unsigned int len;
        /* current position in buf (rx: reset to 0 when a first frame
         * starts a new PDU)
         */
        unsigned int idx;
        /* one of the ISOTP_* state values */
        u32 state;
        /* blocksize counter (reset when a FC frame is sent/received) */
        u8 bs;
        /* presumably the consecutive frame sequence number - TODO confirm,
         * it is not accessed in the code visible here
         */
        u8 sn;
        /* link layer data length (rx: padlen() of the first frame length) */
        u8 ll_dl;
        /* static buffer of DEFAULT_MAX_PDU_SIZE bytes; presumably what buf
         * points to initially - TODO confirm at socket init
         */
        u8 sbuf[DEFAULT_MAX_PDU_SIZE];
};

/* isotp socket; sk has to stay the first member as isotp_sk() converts a
 * struct sock pointer to a struct isotp_sock pointer with a plain cast
 */
struct isotp_sock {
        struct sock sk;
        int bound;
        /* interface index used to look up the CAN device for sending */
        int ifindex;
        /* CAN ID used for frames sent by this socket (e.g. FC frames) */
        canid_t txid;
        canid_t rxid;
        /* time gap between consecutive frames on tx: frame_txtime plus the
         * STmin from the FC frame or the forced tx STmin
         */
        ktime_t tx_gap;
        /* rx timestamp of the last CF, for rx stmin enforcement */
        ktime_t lastrxcf_tstamp;
        /* rxtimer: rx data timeout watchdog; txtimer: FC/echo timeout on tx;
         * NOTE(review): txfrtimer is not used in the code visible here
         */
        struct hrtimer rxtimer, txtimer, txfrtimer;
        /* socket options (flags, ext_address, tx/rx padding content, ...) */
        struct can_isotp_options opt;
        /* rxfc: BS/STmin we send in our FC frames;
         * txfc: BS/STmin received in FC frames from the peer
         */
        struct can_isotp_fc_options rxfc, txfc;
        /* link layer options (mtu, tx_flags) */
        struct can_isotp_ll_options ll;
        /* CAN frame transmission time, added to tx_gap as nanoseconds */
        u32 frame_txtime;
        /* forced tx STmin, added to tx_gap as nanoseconds when
         * CAN_ISOTP_FORCE_TXSTMIN is set
         */
        u32 force_tx_stmin;
        u32 force_rx_stmin;
        u32 cfecho; /* consecutive frame echo tag */
        /* receive and transmit transfer state */
        struct tpcon rx, tx;
        struct list_head notifier;
        /* woken up when a tx job ends */
        wait_queue_head_t wait;
        spinlock_t rx_lock; /* protect single thread state machine */
};

/* Global notifier bookkeeping.
 * NOTE(review): the users are outside the code visible here; presumably
 * the list links the isotp_sock.notifier members, the lock protects the
 * list and isotp_busy_notifier marks the socket currently being notified -
 * confirm at the users.
 */
static LIST_HEAD(isotp_notifier_list);
static DEFINE_SPINLOCK(isotp_notifier_lock);
static struct isotp_sock *isotp_busy_notifier;

/* convert a struct sock pointer into the isotp socket it is embedded in;
 * a plain cast, so struct sock must be the first member of isotp_sock
 */
static inline struct isotp_sock *isotp_sk(const struct sock *sk)
{
        struct isotp_sock *so = (struct isotp_sock *)sk;

        return so;
}

/* return the broadcast mode bits (ISOTP_ALL_BC_FLAGS) set in the socket
 * option flags, 0 if no broadcast mode is enabled
 */
static u32 isotp_bc_flags(struct isotp_sock *so)
{
        u32 flags = so->opt.flags;

        return flags & ISOTP_ALL_BC_FLAGS;
}

/* true when no broadcast mode is enabled, in which case the rx_id has to
 * be registered for FC frame reception
 */
static bool isotp_register_rxid(struct isotp_sock *so)
{
        return !isotp_bc_flags(so);
}

/* rx watchdog: when the timer fires while the receiver still waits for
 * data frames, report ETIMEDOUT on the socket and put the rx state machine
 * back to idle. The timer is never restarted from here.
 */
static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer)
{
        struct isotp_sock *so;
        struct sock *sk;

        so = container_of(hrtimer, struct isotp_sock, rxtimer);

        /* nothing to do unless a reception is pending */
        if (so->rx.state != ISOTP_WAIT_DATA)
                return HRTIMER_NORESTART;

        /* the expected data frames did not arrive in time:
         * report 'connection timed out'
         */
        sk = &so->sk;
        sk->sk_err = ETIMEDOUT;
        if (!sock_flag(sk, SOCK_DEAD))
                sk_error_report(sk);

        /* back to idle */
        so->rx.state = ISOTP_IDLE;

        return HRTIMER_NORESTART;
}

/* Build and send a flow control (FC) frame with the given @flowstatus and
 * (re)start the rx timeout watchdog.
 *
 * @sk: isotp socket
 * @ae: index of the N_PCI byte in the frame data; when non-zero, data[0]
 *      carries the extended address
 * @flowstatus: flow status value placed in the lower bits of the N_PCI byte
 *
 * Return: 1 if the skb allocation or the device lookup fails, 0 otherwise.
 * A can_send() error is only logged (once) and does not change the return
 * value; the rx state is reset and the watchdog is started in that case too.
 */
static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
{
        struct net_device *dev;
        struct sk_buff *nskb;
        struct canfd_frame *ncf;
        struct isotp_sock *so = isotp_sk(sk);
        int can_send_ret;

        nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any());
        if (!nskb)
                return 1;

        /* holds a device reference until dev_put() below */
        dev = dev_get_by_index(sock_net(sk), so->ifindex);
        if (!dev) {
                kfree_skb(nskb);
                return 1;
        }

        can_skb_reserve(nskb);
        can_skb_prv(nskb)->ifindex = dev->ifindex;
        can_skb_prv(nskb)->skbcnt = 0;

        nskb->dev = dev;
        can_skb_set_owner(nskb, sk);
        ncf = (struct canfd_frame *)nskb->data;
        /* frame content starts zeroed, ll.mtu bytes long */
        skb_put_zero(nskb, so->ll.mtu);

        /* create & send flow control reply */
        ncf->can_id = so->txid;

        if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
                /* padded FC frames always use the full CAN_MAX_DLEN */
                memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN);
                ncf->len = CAN_MAX_DLEN;
        } else {
                ncf->len = ae + FC_CONTENT_SZ;
        }

        /* FC content: flow status, blocksize and STmin */
        ncf->data[ae] = N_PCI_FC | flowstatus;
        ncf->data[ae + 1] = so->rxfc.bs;
        ncf->data[ae + 2] = so->rxfc.stmin;

        if (ae)
                ncf->data[0] = so->opt.ext_address;

        ncf->flags = so->ll.tx_flags;

        can_send_ret = can_send(nskb, 1);
        if (can_send_ret)
                pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
                               __func__, ERR_PTR(can_send_ret));

        dev_put(dev);

        /* reset blocksize counter */
        so->rx.bs = 0;

        /* reset last CF frame rx timestamp for rx stmin enforcement */
        so->lastrxcf_tstamp = ktime_set(0, 0);

        /* start rx timeout watchdog */
        hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
                      HRTIMER_MODE_REL_SOFT);
        return 0;
}

/* Queue @skb to the receive queue of @sk. The skb control buffer is filled
 * with a zeroed sockaddr_can carrying AF_CAN and the ifindex of the skb's
 * device. The skb is freed when queueing fails.
 */
static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk)
{
        struct sockaddr_can *from;

        /* the address is stored in the skb control buffer */
        BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can));

        from = (struct sockaddr_can *)skb->cb;
        memset(from, 0, sizeof(*from));
        from->can_ifindex = skb->dev->ifindex;
        from->can_family = AF_CAN;

        if (sock_queue_rcv_skb(sk, skb) >= 0)
                return;

        /* not queued => drop it */
        kfree_skb(skb);
}

/* Round a data length up to the padded frame length: lengths up to 8 give
 * 8, larger ones give the smallest of 12, 16, 20, 24, 32 and 48 that is not
 * below @datalen, and everything above 48 gives 64.
 */
static u8 padlen(u8 datalen)
{
        /* padded lengths below 64 in ascending order */
        static const u8 steps[] = { 8, 12, 16, 20, 24, 32, 48 };
        unsigned int i;

        for (i = 0; i < sizeof(steps) / sizeof(steps[0]); i++) {
                if (datalen <= steps[i])
                        return steps[i];
        }

        return 64;
}

/* check for length optimization and return 1/true when the check fails */
/* Check for length optimization: returns 1/true when the frame length does
 * not match the length expected for padding starting at @start_index, and
 * 0 when it matches.
 */
static int check_optimized(struct canfd_frame *cf, int start_index)
{
        int expected;

        if (cf->len <= CAN_MAX_DLEN) {
                /* Up to CAN_DL 8 the optimal length equals start_index, as
                 * this is where the padding would begin. E.g. padding that
                 * would start at cf.data[7] requires cf->len to be 7.
                 * Note: the data[] index is zero based.
                 */
                expected = start_index;
        } else {
                /* In the non-linear DLC range the minimal next possible
                 * CAN_DL has to be used. Strictly this would compare
                 * padlen(cf->len) with padlen(start_index), but cf->len can
                 * only be one of the discrete values 12, .., 64 here, so
                 * padlen(cf->len) always equals cf->len.
                 */
                expected = padlen(start_index);
        }

        return cf->len != expected;
}

/* check padding and return 1/true when the check fails */
/* Check the padding of a received frame according to the socket options;
 * returns 1/true when the check fails and 0 when the frame is accepted.
 * @start_index is the first padding byte in cf->data[], @content the
 * expected padding byte.
 */
static int check_pad(struct isotp_sock *so, struct canfd_frame *cf,
                     int start_index, u8 content)
{
        int pos;

        if (!(so->opt.flags & CAN_ISOTP_RX_PADDING)) {
                /* Without an RX_PADDING value only the optimized frame
                 * length can be verified. If that check is not requested
                 * either, there is no valid test => ignore the frame.
                 */
                if (!(so->opt.flags & CAN_ISOTP_CHK_PAD_LEN))
                        return 1;

                return check_optimized(cf, start_index);
        }

        /* a correctly padded CAN frame has a length that padlen() keeps */
        if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) {
                if (cf->len != padlen(cf->len))
                        return 1;
        }

        if (!(so->opt.flags & CAN_ISOTP_CHK_PAD_DATA))
                return 0;

        /* every padding byte has to carry the expected content */
        for (pos = start_index; pos < cf->len; pos++) {
                if (cf->data[pos] != content)
                        return 1;
        }

        return 0;
}

static void isotp_send_cframe(struct isotp_sock *so);

/* Handle a received flow control (FC) frame for the tx job in progress.
 *
 * @so: isotp socket
 * @cf: received CAN frame
 * @ae: index of the N_PCI byte in cf->data[]
 *
 * FC frames are ignored unless the tx state machine is waiting for one.
 * Depending on the flow status the transmission continues (clear to send),
 * the FC timeout is restarted (wait) or the tx job is stopped (overflow,
 * which is also reported as EMSGSIZE, and unknown flow status values).
 *
 * Return: 1 if the frame is malformed (too short or failing the padding
 * check; reported as EBADMSG and the tx job is stopped), 0 in all other
 * cases.
 */
static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
{
        struct sock *sk = &so->sk;

        /* not waiting for a FC frame => ignore it */
        if (so->tx.state != ISOTP_WAIT_FC &&
            so->tx.state != ISOTP_WAIT_FIRST_FC)
                return 0;

        /* the FC frame arrived: stop the running tx timer */
        hrtimer_cancel(&so->txtimer);

        if ((cf->len < ae + FC_CONTENT_SZ) ||
            ((so->opt.flags & ISOTP_CHECK_PADDING) &&
             check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) {
                /* malformed PDU - report 'not a data message' */
                sk->sk_err = EBADMSG;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);

                /* stop this tx job and wake up waiters */
                so->tx.state = ISOTP_IDLE;
                wake_up_interruptible(&so->wait);
                return 1;
        }

        /* get static/dynamic communication params from first/every FC frame */
        if (so->tx.state == ISOTP_WAIT_FIRST_FC ||
            so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) {
                so->txfc.bs = cf->data[ae + 1];
                so->txfc.stmin = cf->data[ae + 2];

                /* fix wrong STmin values according spec: only 0x00 - 0x7F
                 * and 0xF1 - 0xF9 are kept, everything else becomes 0x7F
                 */
                if (so->txfc.stmin > 0x7F &&
                    (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9))
                        so->txfc.stmin = 0x7F;

                so->tx_gap = ktime_set(0, 0);
                /* add transmission time for CAN frame N_As */
                so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime);
                /* add waiting time for consecutive frames N_Cs:
                 * the forced value if configured, otherwise STmin in
                 * units of 1 ms (0x00 - 0x7F) or 100 us (0xF1 - 0xF9)
                 */
                if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
                        so->tx_gap = ktime_add_ns(so->tx_gap,
                                                  so->force_tx_stmin);
                else if (so->txfc.stmin < 0x80)
                        so->tx_gap = ktime_add_ns(so->tx_gap,
                                                  so->txfc.stmin * 1000000);
                else
                        so->tx_gap = ktime_add_ns(so->tx_gap,
                                                  (so->txfc.stmin - 0xF0)
                                                  * 100000);
                so->tx.state = ISOTP_WAIT_FC;
        }

        /* the flow status is in the lower nibble of the N_PCI byte */
        switch (cf->data[ae] & 0x0F) {
        case ISOTP_FC_CTS:
                so->tx.bs = 0;
                so->tx.state = ISOTP_SENDING;
                /* send CF frame and enable echo timeout handling */
                hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
                              HRTIMER_MODE_REL_SOFT);
                isotp_send_cframe(so);
                break;

        case ISOTP_FC_WT:
                /* start timer to wait for next FC frame */
                hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
                              HRTIMER_MODE_REL_SOFT);
                break;

        case ISOTP_FC_OVFLW:
                /* overflow on receiver side - report 'message too long' */
                sk->sk_err = EMSGSIZE;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);
                fallthrough;

        default:
                /* stop this tx job */
                so->tx.state = ISOTP_IDLE;
                wake_up_interruptible(&so->wait);
        }
        return 0;
}

/* Handle a received single frame (SF): @len data bytes starting behind the
 * @pcilen PCI bytes of @cf are copied into a new skb, which inherits the
 * timestamp and device of @skb and is queued to the socket.
 *
 * Any reception in progress is aborted first (rx timer cancelled, rx state
 * set to idle).
 *
 * Return: 0 if the PDU was handed to isotp_rcv_skb(), 1 if the frame is
 * rejected (bad length, failed padding check - reported as EBADMSG - or
 * skb allocation failure).
 */
static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen,
                        struct sk_buff *skb, int len)
{
        struct isotp_sock *so = isotp_sk(sk);
        struct sk_buff *pdu;
        void *dst;

        hrtimer_cancel(&so->rxtimer);
        so->rx.state = ISOTP_IDLE;

        /* the data length must be non-zero and fit behind the PCI bytes */
        if (len == 0)
                return 1;
        if (len > cf->len - pcilen)
                return 1;

        if (so->opt.flags & ISOTP_CHECK_PADDING) {
                if (check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) {
                        /* malformed PDU - report 'not a data message' */
                        sk->sk_err = EBADMSG;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);
                        return 1;
                }
        }

        pdu = alloc_skb(len, gfp_any());
        if (!pdu)
                return 1;

        dst = skb_put(pdu, len);
        memcpy(dst, &cf->data[pcilen], len);

        pdu->tstamp = skb->tstamp;
        pdu->dev = skb->dev;
        isotp_rcv_skb(pdu, sk);

        return 0;
}

static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae)
{
        struct isotp_sock *so = isotp_sk(sk);
        int i;
        int off;
        int ff_pci_sz;

        hrtimer_cancel(&so->rxtimer);
        so->rx.state = ISOTP_IDLE;

        /* get the used sender LL_DL from the (first) CAN frame data length */
        so->rx.ll_dl = padlen(cf->len);

        /* the first frame has to use the entire frame up to LL_DL length */
        if (cf->len != so->rx.ll_dl)
                return 1;

        /* get the FF_DL */
        so->rx.len = (cf->data[ae] & 0x0F) << 8;
        so->rx.len += cf->data[ae + 1];

        /* Check for FF_DL escape sequence supporting 32 bit PDU length */
        if (so->rx.len) {
                ff_pci_sz = FF_PCI_SZ12;
        } else {
                /* FF_DL = 0 => get real length from next 4 bytes */
                so->rx.len = cf->data[ae + 2] << 24;
                so->rx.len += cf->data[ae + 3] << 16;
                so->rx.len += cf->data[ae + 4] << 8;
                so->rx.len += cf->data[ae + 5];
                ff_pci_sz = FF_PCI_SZ32;
        }

        /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
        off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;

        if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl)
                return 1;

        /* PDU size > default => try max_pdu_size */
        if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) {
                u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC);

                if (newbuf) {
                        so->rx.buf = newbuf;
                        so->rx.buflen = max_pdu_size;
                }
        }

        if (so->rx.len > so->rx.buflen) {
                /* send FC frame with overflow status */
                isotp_send_fc(sk, ae, ISOTP_FC_OVFLW);
                return 1;
        }

        /* copy the first received data bytes */
        so->rx.idx = 0;
        for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++)
                so->rx.buf[so->rx.idx++] = cf->data[i];

        /* initial setup for this pdu reception */
        so->rx.sn = 1;
        so->rx.state = ISOTP_WAIT_DATA;

        /* no creation of flow control frames */
        if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
                return 0;

        /* send our first FC frame */
        isotp_send_fc(sk, ae, ISOTP_FC_CTS);
        return 0;
}

static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
                        struct sk_buff *skb)
{
        struct isotp_sock *so = isotp_sk(sk);
        struct sk_buff *nskb;
        int i;

        if (so->rx.state != ISOTP_WAIT_DATA)
                return 0;

        /* drop if timestamp gap is less than force_rx_stmin nano secs */
        if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) {
                if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) <
                    so->force_rx_stmin)
                        return 0;

                so->lastrxcf_tstamp = skb->tstamp;
        }

        hrtimer_cancel(&so->rxtimer);

        /* CFs are never longer than the FF */
        if (cf->len > so->rx.ll_dl)
                return 1;

        /* CFs have usually the LL_DL length */
        if (cf->len < so->rx.ll_dl) {
                /* this is only allowed for the last CF */
                if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ)
                        return 1;
        }

        if ((cf->data[ae] & 0x0F) != so->rx.sn) {
                /* wrong sn detected - report 'illegal byte sequence' */
                sk->sk_err = EILSEQ;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);

                /* reset rx state */
                so->rx.state = ISOTP_IDLE;
                return 1;
        }
        so->rx.sn++;
        so->rx.sn %= 16;

        for (i = ae + N_PCI_SZ; i < cf->len; i++) {
                so->rx.buf[so->rx.idx++] = cf->data[i];
                if (so->rx.idx >= so->rx.len)
                        break;
        }

        if (so->rx.idx >= so->rx.len) {
                /* we are done */
                so->rx.state = ISOTP_IDLE;

                if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
                    check_pad(so, cf, i + 1, so->opt.rxpad_content)) {
                        /* malformed PDU - report 'not a data message' */
                        sk->sk_err = EBADMSG;
                        if (!sock_flag(sk, SOCK_DEAD))
                                sk_error_report(sk);
                        return 1;
                }

                nskb = alloc_skb(so->rx.len, gfp_any());
                if (!nskb)
                        return 1;

                memcpy(skb_put(nskb, so->rx.len), so->rx.buf,
                       so->rx.len);

                nskb->tstamp = skb->tstamp;
                nskb->dev = skb->dev;
                isotp_rcv_skb(nskb, sk);
                return 0;
        }

        /* perform blocksize handling, if enabled */
        if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) {
                /* start rx timeout watchdog */
                hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
                              HRTIMER_MODE_REL_SOFT);
                return 0;
        }

        /* no creation of flow control frames */
        if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
                return 0;

        /* we reached the specified blocksize so->rxfc.bs */
        isotp_send_fc(sk, ae, ISOTP_FC_CTS);
        return 0;
}

/* Receive callback registered for the rx CAN ID of the socket in @data.
 *
 * Frames whose skb length differs from the configured MTU or - with extended
 * addressing enabled - whose address byte does not match are ignored. The
 * remaining frames are dispatched by their N_PCI type to the FC, SF, FF and
 * CF handlers while so->rx_lock is held.
 */
static void isotp_rcv(struct sk_buff *skb, void *data)
{
        struct sock *sk = (struct sock *)data;
        struct isotp_sock *so = isotp_sk(sk);
        int ae = !!(so->opt.flags & CAN_ISOTP_EXTEND_ADDR);
        struct canfd_frame *cf;
        u8 n_pci_type, sf_dl;

        /* Strictly receive only frames with the configured MTU size
         * => clear separation of CAN2.0 / CAN FD transport channels
         */
        if (skb->len != so->ll.mtu)
                return;

        cf = (struct canfd_frame *)skb->data;

        /* if enabled: check reception of my configured extended address */
        if (ae && cf->data[0] != so->opt.rx_ext_address)
                return;

        n_pci_type = cf->data[ae] & 0xF0;

        /* Make sure the state changes and data structures stay consistent at
         * CAN frame reception time. This locking is not needed in real world
         * use cases but the inconsistency can be triggered with syzkaller.
         */
        spin_lock(&so->rx_lock);

        /* check rx/tx path half duplex expectations */
        if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) {
                if (n_pci_type == N_PCI_FC) {
                        /* no flow control while we are receiving */
                        if (so->rx.state != ISOTP_IDLE)
                                goto out_unlock;
                } else if (so->tx.state != ISOTP_IDLE) {
                        /* no data frames while we are transmitting */
                        goto out_unlock;
                }
        }

        switch (n_pci_type) {
        case N_PCI_FC:
                /* tx path: flow control frame containing the FC parameters */
                isotp_rcv_fc(so, cf, ae);
                break;

        case N_PCI_FF:
                /* rx path: first frame */
                isotp_rcv_ff(sk, cf, ae);
                break;

        case N_PCI_CF:
                /* rx path: consecutive frame */
                isotp_rcv_cf(sk, cf, ae, skb);
                break;

        case N_PCI_SF:
                /* rx path: single frame
                 *
                 * As we do not have a rx.ll_dl configuration, we can only test
                 * if the CAN frames payload length matches the LL_DL == 8
                 * requirements - no matter if it's CAN 2.0 or CAN FD
                 */

                /* get the SF_DL from the N_PCI byte */
                sf_dl = cf->data[ae] & 0x0F;

                if (cf->len <= CAN_MAX_DLEN) {
                        isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl);
                        break;
                }

                /* We have a frame with CAN_DL greater than 8: this is only
                 * valid for CAN FD frames carrying the SF_DL == 0 ESC value.
                 *
                 * If so take care of the increased SF PCI size (SF_PCI_SZ8)
                 * to point to the message content behind the extended SF PCI
                 * info and get the real SF_DL length value from the formerly
                 * first data byte.
                 */
                if (can_is_canfd_skb(skb) && sf_dl == 0)
                        isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb,
                                     cf->data[SF_PCI_SZ4 + ae]);
                break;
        }

out_unlock:
        spin_unlock(&so->rx_lock);
}

static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so,
                                 int ae, int off)
{
        int pcilen = N_PCI_SZ + ae + off;
        int space = so->tx.ll_dl - pcilen;
        int num = min_t(int, so->tx.len - so->tx.idx, space);
        int i;

        cf->can_id = so->txid;
        cf->len = num + pcilen;

        if (num < space) {
                if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
                        /* user requested padding */
                        cf->len = padlen(cf->len);
                        memset(cf->data, so->opt.txpad_content, cf->len);
                } else if (cf->len > CAN_MAX_DLEN) {
                        /* mandatory padding for CAN FD frames */
                        cf->len = padlen(cf->len);
                        memset(cf->data, CAN_ISOTP_DEFAULT_PAD_CONTENT,
                               cf->len);
                }
        }

        for (i = 0; i < num; i++)
                cf->data[pcilen + i] = so->tx.buf[so->tx.idx++];

        if (ae)
                cf->data[0] = so->opt.ext_address;
}

/* Build and send the next consecutive frame (CF) of the current tx PDU.
 *
 * The frame is sent with local echo enabled and its first four data bytes
 * are stored in so->cfecho as the tag isotp_rcv_echo() compares against.
 * When the network device cannot be found or no skb can be allocated the
 * function returns without sending anything.
 */
static void isotp_send_cframe(struct isotp_sock *so)
{
        struct sock *sk = &so->sk;
        int ae = !!(so->opt.flags & CAN_ISOTP_EXTEND_ADDR);
        struct net_device *dev;
        struct canfd_frame *cf;
        struct sk_buff *skb;
        int rc;

        dev = dev_get_by_index(sock_net(sk), so->ifindex);
        if (!dev)
                return;

        skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC);
        if (!skb)
                goto out_put_dev;

        can_skb_reserve(skb);
        can_skb_prv(skb)->ifindex = dev->ifindex;
        can_skb_prv(skb)->skbcnt = 0;

        cf = (struct canfd_frame *)skb->data;
        skb_put_zero(skb, so->ll.mtu);

        /* create consecutive frame */
        isotp_fill_dataframe(cf, so, ae, 0);

        /* place consecutive frame N_PCI in appropriate index and move on
         * to the next 4 bit sequence number
         */
        cf->data[ae] = N_PCI_CF | so->tx.sn;
        so->tx.sn = (so->tx.sn + 1) % 16;
        so->tx.bs++;

        cf->flags = so->ll.tx_flags;

        skb->dev = dev;
        can_skb_set_owner(skb, sk);

        /* cfecho should have been zero'ed by init/isotp_rcv_echo() */
        if (so->cfecho)
                pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho);

        /* set consecutive frame echo tag */
        so->cfecho = *(u32 *)cf->data;

        /* send frame with local echo enabled */
        rc = can_send(skb, 1);
        if (!rc)
                goto out_put_dev;

        pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
                       __func__, ERR_PTR(rc));
        if (rc == -ENOBUFS)
                pr_notice_once("can-isotp: tx queue is full\n");

out_put_dev:
        dev_put(dev);
}

static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so,
                                int ae)
{
        int i;
        int ff_pci_sz;

        cf->can_id = so->txid;
        cf->len = so->tx.ll_dl;
        if (ae)
                cf->data[0] = so->opt.ext_address;

        /* create N_PCI bytes with 12/32 bit FF_DL data length */
        if (so->tx.len > MAX_12BIT_PDU_SIZE) {
                /* use 32 bit FF_DL notation */
                cf->data[ae] = N_PCI_FF;
                cf->data[ae + 1] = 0;
                cf->data[ae + 2] = (u8)(so->tx.len >> 24) & 0xFFU;
                cf->data[ae + 3] = (u8)(so->tx.len >> 16) & 0xFFU;
                cf->data[ae + 4] = (u8)(so->tx.len >> 8) & 0xFFU;
                cf->data[ae + 5] = (u8)so->tx.len & 0xFFU;
                ff_pci_sz = FF_PCI_SZ32;
        } else {
                /* use 12 bit FF_DL notation */
                cf->data[ae] = (u8)(so->tx.len >> 8) | N_PCI_FF;
                cf->data[ae + 1] = (u8)so->tx.len & 0xFFU;
                ff_pci_sz = FF_PCI_SZ12;
        }

        /* add first data bytes depending on ae */
        for (i = ae + ff_pci_sz; i < so->tx.ll_dl; i++)
                cf->data[i] = so->tx.buf[so->tx.idx++];

        so->tx.sn = 1;
}

/* Receive callback for local echo frames on the tx CAN ID of the socket.
 *
 * Only echoes of frames sent by this socket whose first four data bytes match
 * the stored so->cfecho tag are processed. The echo stops the tx timeout and
 * drives the tx state machine: finish the PDU, wait for the next flow
 * control frame, send the next consecutive frame immediately or arm the
 * txfrtimer to send it after the tx gap.
 */
static void isotp_rcv_echo(struct sk_buff *skb, void *data)
{
        struct sock *sk = (struct sock *)data;
        struct isotp_sock *so = isotp_sk(sk);
        struct canfd_frame *cf = (struct canfd_frame *)skb->data;

        /* only handle my own local echo CF/SF skb's (no FF!) */
        if (skb->sk != sk)
                return;
        if (so->cfecho != *(u32 *)cf->data)
                return;

        /* cancel local echo timeout */
        hrtimer_cancel(&so->txtimer);

        /* local echo skb with consecutive frame has been consumed */
        so->cfecho = 0;

        if (so->tx.idx >= so->tx.len) {
                /* we are done */
                so->tx.state = ISOTP_IDLE;
                wake_up_interruptible(&so->wait);
        } else if (so->txfc.bs && so->tx.bs >= so->txfc.bs) {
                /* stop and wait for FC with timeout */
                so->tx.state = ISOTP_WAIT_FC;
                hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
                              HRTIMER_MODE_REL_SOFT);
        } else if (!so->tx_gap) {
                /* no gap between data frames needed => use burst mode:
                 * enable echo timeout handling and send right away
                 */
                hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
                              HRTIMER_MODE_REL_SOFT);
                isotp_send_cframe(so);
        } else {
                /* start timer to send next consecutive frame with correct
                 * delay
                 */
                hrtimer_start(&so->txfrtimer, so->tx_gap,
                              HRTIMER_MODE_REL_SOFT);
        }
}

/* Expiry handler of the tx timeout timer (so->txtimer).
 *
 * When a transmission is in progress the missing flow control or echo frame
 * is reported as ECOMM on the socket, the tx state is reset to idle and
 * waiters on so->wait are woken up. The timer is never restarted.
 */
static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
{
        struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
                                             txtimer);
        struct sock *sk = &so->sk;

        /* only handle timeouts outside of IDLE and SHUTDOWN state */
        if (so->tx.state != ISOTP_IDLE && so->tx.state != ISOTP_SHUTDOWN) {
                /* we did not get any flow control or echo frame in time:
                 * report 'communication error on send'
                 */
                sk->sk_err = ECOMM;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);

                /* reset tx state */
                so->tx.state = ISOTP_IDLE;
                wake_up_interruptible(&so->wait);
        }

        return HRTIMER_NORESTART;
}

/* Expiry handler of the tx frame gap timer (so->txfrtimer).
 *
 * Arms the echo timeout and sends the next consecutive frame, provided the
 * socket is still in ISOTP_SENDING state and no echo is outstanding. The
 * timer is never restarted from here.
 */
static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer)
{
        struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
                                             txfrtimer);

        /* start echo timeout handling and cover below protocol error */
        hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
                      HRTIMER_MODE_REL_SOFT);

        /* cfecho should be consumed by isotp_rcv_echo() here */
        if (so->tx.state != ISOTP_SENDING || so->cfecho)
                return HRTIMER_NORESTART;

        isotp_send_cframe(so);

        return HRTIMER_NORESTART;
}

/* Send one ISO-TP PDU of @size bytes taken from @msg.
 *
 * The tx state machine is claimed by switching so->tx.state from ISOTP_IDLE
 * to ISOTP_SENDING. While another transmission is in progress the caller
 * sleeps on so->wait, unless MSG_DONTWAIT is set. The data is copied into the
 * tx buffer and the first CAN frame is sent from here: a single frame when
 * the data fits, a first frame otherwise. With CAN_ISOTP_WAIT_TX_DONE set the
 * function additionally waits until the tx state is idle again.
 *
 * Returns @size on success or a negative error code:
 *   -EADDRNOTAVAIL  socket not bound or in shutdown state
 *   -EAGAIN         transmission in progress and MSG_DONTWAIT set
 *   -EINVAL         @size is zero, exceeds the tx buffer or does not fit into
 *                   a single frame in SF broadcast mode
 *   -ENXIO          the bound network device does not exist
 * as well as the errors of wait_event_interruptible(), memcpy_from_msg(),
 * sock_alloc_send_skb(), can_send() and sock_error().
 */
static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
        struct sock *sk = sock->sk;
        struct isotp_sock *so = isotp_sk(sk);
        struct sk_buff *skb;
        struct net_device *dev;
        struct canfd_frame *cf;
        int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
        int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
        s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT;
        int off;
        int err;

        if (!so->bound || so->tx.state == ISOTP_SHUTDOWN)
                return -EADDRNOTAVAIL;

        /* claim the tx path: only the caller that switches the state from
         * IDLE to SENDING may continue, everybody else waits or bails out
         */
        while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) {
                /* we do not support multiple buffers - for now */
                if (msg->msg_flags & MSG_DONTWAIT)
                        return -EAGAIN;

                if (so->tx.state == ISOTP_SHUTDOWN)
                        return -EADDRNOTAVAIL;

                /* wait for complete transmission of current pdu */
                err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
                if (err)
                        goto err_event_drop;
        }

        /* from here on the tx state is owned by this caller, so every error
         * exit has to hand it back via err_out_drop
         */

        /* PDU size > default => try max_pdu_size */
        if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) {
                u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL);

                /* on allocation failure the current buffer stays in use and
                 * the size check below rejects the PDU
                 */
                if (newbuf) {
                        so->tx.buf = newbuf;
                        so->tx.buflen = max_pdu_size;
                }
        }

        if (!size || size > so->tx.buflen) {
                err = -EINVAL;
                goto err_out_drop;
        }

        /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
        off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;

        /* does the given data fit into a single frame for SF_BROADCAST? */
        if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) &&
            (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) {
                err = -EINVAL;
                goto err_out_drop;
        }

        err = memcpy_from_msg(so->tx.buf, msg, size);
        if (err < 0)
                goto err_out_drop;

        dev = dev_get_by_index(sock_net(sk), so->ifindex);
        if (!dev) {
                err = -ENXIO;
                goto err_out_drop;
        }

        /* on failure err has been set by sock_alloc_send_skb() */
        skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv),
                                  msg->msg_flags & MSG_DONTWAIT, &err);
        if (!skb) {
                dev_put(dev);
                goto err_out_drop;
        }

        can_skb_reserve(skb);
        can_skb_prv(skb)->ifindex = dev->ifindex;
        can_skb_prv(skb)->skbcnt = 0;

        so->tx.len = size;
        so->tx.idx = 0;

        cf = (struct canfd_frame *)skb->data;
        skb_put_zero(skb, so->ll.mtu);

        /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */
        if (so->cfecho)
                pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho);

        /* check for single frame transmission depending on TX_DL */
        if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
                /* The message size generally fits into a SingleFrame - good.
                 *
                 * SF_DL ESC offset optimization:
                 *
                 * When TX_DL is greater 8 but the message would still fit
                 * into a 8 byte CAN frame, we can omit the offset.
                 * This prevents a protocol caused length extension from
                 * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling.
                 */
                if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae)
                        off = 0;

                isotp_fill_dataframe(cf, so, ae, off);

                /* place single frame N_PCI w/o length in appropriate index */
                cf->data[ae] = N_PCI_SF;

                /* place SF_DL size value depending on the SF_DL ESC offset */
                if (off)
                        cf->data[SF_PCI_SZ4 + ae] = size;
                else
                        cf->data[ae] |= size;

                /* set CF echo tag for isotp_rcv_echo() (SF-mode) */
                so->cfecho = *(u32 *)cf->data;
        } else {
                /* send first frame */

                isotp_create_fframe(cf, so, ae);

                if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) {
                        /* set timer for FC-less operation (STmin = 0) */
                        if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
                                so->tx_gap = ktime_set(0, so->force_tx_stmin);
                        else
                                so->tx_gap = ktime_set(0, so->frame_txtime);

                        /* disable wait for FCs due to activated block size */
                        so->txfc.bs = 0;

                        /* set CF echo tag for isotp_rcv_echo() (CF-mode) */
                        so->cfecho = *(u32 *)cf->data;
                } else {
                        /* standard flow control check */
                        so->tx.state = ISOTP_WAIT_FIRST_FC;

                        /* start timeout for FC */
                        hrtimer_sec = ISOTP_FC_TIMEOUT;

                        /* no CF echo tag for isotp_rcv_echo() (FF-mode) */
                        so->cfecho = 0;
                }
        }

        /* arm the timeout before the frame is handed to the CAN core:
         * echo timeout by default, FC timeout when waiting for the first FC
         */
        hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
                      HRTIMER_MODE_REL_SOFT);

        /* send the first or only CAN frame */
        cf->flags = so->ll.tx_flags;

        skb->dev = dev;
        skb->sk = sk;
        err = can_send(skb, 1);
        dev_put(dev);
        if (err) {
                pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
                               __func__, ERR_PTR(err));

                /* no transmission -> no timeout monitoring */
                hrtimer_cancel(&so->txtimer);

                /* reset consecutive frame echo tag */
                so->cfecho = 0;

                goto err_out_drop;
        }

        if (wait_tx_done) {
                /* wait for complete transmission of current pdu */
                err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
                if (err)
                        goto err_event_drop;

                /* the tx state is idle here, so a pending socket error can
                 * be returned without touching the state again
                 */
                err = sock_error(sk);
                if (err)
                        return err;
        }

        return size;

err_event_drop:
        /* got signal: force tx state machine to be idle */
        so->tx.state = ISOTP_IDLE;
        hrtimer_cancel(&so->txfrtimer);
        hrtimer_cancel(&so->txtimer);
err_out_drop:
        /* drop this PDU and unlock a potential wait queue */
        so->tx.state = ISOTP_IDLE;
        wake_up_interruptible(&so->wait);

        return err;
}

/* Receive one PDU from the socket receive queue into @msg.
 *
 * At most @size bytes are copied; when the PDU is longer MSG_TRUNC is set in
 * msg->msg_flags. If a name buffer is provided it is filled from skb->cb.
 *
 * Returns the number of copied bytes - or the full PDU length when MSG_TRUNC
 * was passed in @flags - or a negative error code: -EINVAL for unsupported
 * flags, -EADDRNOTAVAIL for an unbound socket, otherwise the error of
 * skb_recv_datagram() or memcpy_to_msg().
 */
static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                         int flags)
{
        struct sock *sk = sock->sk;
        struct isotp_sock *so = isotp_sk(sk);
        struct sk_buff *skb;
        int ret = 0;

        if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT))
                return -EINVAL;

        if (!so->bound)
                return -EADDRNOTAVAIL;

        skb = skb_recv_datagram(sk, flags, &ret);
        if (!skb)
                return ret;

        /* limit the copy to the PDU length or flag the truncation */
        if (size >= skb->len)
                size = skb->len;
        else
                msg->msg_flags |= MSG_TRUNC;

        ret = memcpy_to_msg(msg, skb->data, size);
        if (ret >= 0) {
                sock_recv_cmsgs(msg, sk, skb);

                if (msg->msg_name) {
                        __sockaddr_check_size(ISOTP_MIN_NAMELEN);
                        msg->msg_namelen = ISOTP_MIN_NAMELEN;
                        memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
                }

                /* set length of return value */
                if (flags & MSG_TRUNC)
                        ret = skb->len;
                else
                        ret = size;
        }

        skb_free_datagram(sk, skb);

        return ret;
}

/* Release an ISO-TP socket.
 *
 * Waits for a running transmission to finish (unless interrupted by a
 * signal), puts the tx state machine into shutdown state, removes the socket
 * from the notifier list, unregisters the CAN receive filters of a bound
 * socket, cancels all timers, frees dynamically allocated rx/tx buffers and
 * finally orphans and drops the sock.
 *
 * Always returns 0.
 */
static int isotp_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct isotp_sock *so;
        struct net *net;

        if (!sk)
                return 0;

        so = isotp_sk(sk);
        net = sock_net(sk);

        /* wait for complete transmission of current pdu:
         * loop until the state could be switched from IDLE to SHUTDOWN or
         * the wait got interrupted
         */
        while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 &&
               cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE)
                ;

        /* force state machines to be idle also when a signal occurred */
        so->tx.state = ISOTP_SHUTDOWN;
        so->rx.state = ISOTP_IDLE;

        /* do not unlink the socket while isotp_busy_notifier points to it;
         * drop the lock and sleep for one jiffy between the checks
         */
        spin_lock(&isotp_notifier_lock);
        while (isotp_busy_notifier == so) {
                spin_unlock(&isotp_notifier_lock);
                schedule_timeout_uninterruptible(1);
                spin_lock(&isotp_notifier_lock);
        }
        list_del(&so->notifier);
        spin_unlock(&isotp_notifier_lock);

        lock_sock(sk);

        /* remove current filters & unregister */
        if (so->bound) {
                if (so->ifindex) {
                        struct net_device *dev;

                        dev = dev_get_by_index(net, so->ifindex);
                        if (dev) {
                                /* the rx filter only exists when it has been
                                 * registered at bind time
                                 */
                                if (isotp_register_rxid(so))
                                        can_rx_unregister(net, dev, so->rxid,
                                                          SINGLE_MASK(so->rxid),
                                                          isotp_rcv, sk);

                                can_rx_unregister(net, dev, so->txid,
                                                  SINGLE_MASK(so->txid),
                                                  isotp_rcv_echo, sk);
                                dev_put(dev);
                                /* NOTE(review): presumably lets receive
                                 * callbacks that are still running finish
                                 * before timers and buffers are torn down -
                                 * confirm against the CAN core
                                 */
                                synchronize_rcu();
                        }
                }
        }

        hrtimer_cancel(&so->txfrtimer);
        hrtimer_cancel(&so->txtimer);
        hrtimer_cancel(&so->rxtimer);

        so->ifindex = 0;
        so->bound = 0;

        /* only buffers that replaced the embedded sbuf were kmalloc'ed */
        if (so->rx.buf != so->rx.sbuf)
                kfree(so->rx.buf);

        if (so->tx.buf != so->tx.sbuf)
                kfree(so->tx.buf);

        sock_orphan(sk);
        sock->sk = NULL;

        release_sock(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        sock_put(sk);

        return 0;
}

/* Bind the socket to a CAN interface and a tx/rx CAN ID pair.
 *
 * The CAN IDs are checked for bits outside of their SFF/EFF range, the
 * interface is looked up and validated and the receive filters for incoming
 * frames (only when isotp_register_rxid() says so) and for the local echo of
 * sent frames are registered. A socket can only be bound once.
 *
 * Returns 0 on success or a negative error code:
 *   -EINVAL         address too short, wrong family, invalid CAN ID,
 *                   socket already bound or device MTU too small
 *   -ENODEV         no interface index given, unknown or non-CAN device
 *   -EADDRNOTAVAIL  identical rx and tx CAN ID while the rx ID is registered
 * When the interface is down the bind succeeds and ENETDOWN is reported as
 * socket error.
 */
static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
{
        struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
        struct sock *sk = sock->sk;
        struct isotp_sock *so = isotp_sk(sk);
        struct net *net = sock_net(sk);
        struct net_device *dev;
        canid_t tx_id = addr->can_addr.tp.tx_id;
        canid_t rx_id = addr->can_addr.tp.rx_id;
        int notify_enetdown = 0;
        int err = 0;

        if (len < ISOTP_MIN_NAMELEN)
                return -EINVAL;

        if (addr->can_family != AF_CAN)
                return -EINVAL;

        /* sanitize tx CAN identifier */
        tx_id &= (tx_id & CAN_EFF_FLAG) ? (CAN_EFF_FLAG | CAN_EFF_MASK) :
                                          CAN_SFF_MASK;

        /* give feedback on wrong CAN-ID value */
        if (tx_id != addr->can_addr.tp.tx_id)
                return -EINVAL;

        /* sanitize rx CAN identifier (if needed) */
        if (isotp_register_rxid(so)) {
                rx_id &= (rx_id & CAN_EFF_FLAG) ?
                         (CAN_EFF_FLAG | CAN_EFF_MASK) : CAN_SFF_MASK;

                /* give feedback on wrong CAN-ID value */
                if (rx_id != addr->can_addr.tp.rx_id)
                        return -EINVAL;
        }

        if (!addr->can_ifindex)
                return -ENODEV;

        lock_sock(sk);

        if (so->bound) {
                err = -EINVAL;
                goto out;
        }

        /* ensure different CAN IDs when the rx_id is to be registered */
        if (isotp_register_rxid(so) && rx_id == tx_id) {
                err = -EADDRNOTAVAIL;
                goto out;
        }

        dev = dev_get_by_index(net, addr->can_ifindex);
        if (!dev) {
                err = -ENODEV;
                goto out;
        }

        /* only CAN devices that can carry the configured MTU are accepted */
        if (dev->type != ARPHRD_CAN) {
                err = -ENODEV;
                goto out_put_dev;
        }

        if (dev->mtu < so->ll.mtu) {
                err = -EINVAL;
                goto out_put_dev;
        }

        /* binding to an interface that is down works but gets reported */
        if (!(dev->flags & IFF_UP))
                notify_enetdown = 1;

        if (isotp_register_rxid(so))
                can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id),
                                isotp_rcv, sk, "isotp", sk);

        /* no consecutive frame echo skb in flight */
        so->cfecho = 0;

        /* register for echo skb's */
        can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id),
                        isotp_rcv_echo, sk, "isotpe", sk);

        /* switch to new settings */
        so->ifindex = dev->ifindex;
        so->rxid = rx_id;
        so->txid = tx_id;
        so->bound = 1;

out_put_dev:
        dev_put(dev);
out:
        release_sock(sk);

        if (notify_enetdown) {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk_error_report(sk);
        }

        return err;
}

/* Report the local address of the socket.
 *
 * Fills the first ISOTP_MIN_NAMELEN bytes of @uaddr with the CAN address
 * family, the interface index and the rx/tx CAN IDs stored in the socket.
 * Peer names are not supported.
 *
 * Returns ISOTP_MIN_NAMELEN, or -EOPNOTSUPP when @peer is set.
 */
static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
{
        struct isotp_sock *so = isotp_sk(sock->sk);
        struct sockaddr_can *addr;

        if (peer)
                return -EOPNOTSUPP;

        addr = (struct sockaddr_can *)uaddr;

        /* start from a clean address, then fill in what we know */
        memset(addr, 0, ISOTP_MIN_NAMELEN);
        addr->can_addr.tp.tx_id = so->txid;
        addr->can_addr.tp.rx_id = so->rxid;
        addr->can_ifindex = so->ifindex;
        addr->can_family = AF_CAN;

        return ISOTP_MIN_NAMELEN;
}

/* Apply a SOL_CAN_ISOTP socket option; called by isotp_setsockopt() with the
 * socket lock held.
 *
 * Options can only be changed as long as the socket is not bound.
 *
 * Returns 0 on success or a negative error code:
 *   -EISCONN      socket is already bound
 *   -EINVAL       wrong option length, invalid link layer settings or both
 *                 broadcast flags set (the options are stored nevertheless,
 *                 with CAN_ISOTP_CF_BROADCAST cleared)
 *   -EFAULT       option value could not be copied
 *   -ENOPROTOOPT  unknown option name
 */
static int isotp_setsockopt_locked(struct socket *sock, int level, int optname,
                            sockptr_t optval, unsigned int optlen)
{
        struct isotp_sock *so = isotp_sk(sock->sk);
        struct can_isotp_ll_options ll;
        int ret = 0;

        if (so->bound)
                return -EISCONN;

        switch (optname) {
        case CAN_ISOTP_OPTS:
                if (optlen != sizeof(struct can_isotp_options))
                        return -EINVAL;

                if (copy_from_sockptr(&so->opt, optval, optlen))
                        return -EFAULT;

                /* no separate rx_ext_address is given => use ext_address */
                if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
                        so->opt.rx_ext_address = so->opt.ext_address;

                /* these broadcast flags are not allowed together */
                if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) {
                        /* CAN_ISOTP_SF_BROADCAST is prioritized */
                        so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST;

                        /* give user feedback on wrong config attempt */
                        ret = -EINVAL;
                }

                /* check for frame_txtime changes (0 => no changes) */
                if (so->opt.frame_txtime)
                        so->frame_txtime =
                                (so->opt.frame_txtime ==
                                 CAN_ISOTP_FRAME_TXTIME_ZERO) ?
                                0 : so->opt.frame_txtime;

                return ret;

        case CAN_ISOTP_RECV_FC:
                if (optlen != sizeof(struct can_isotp_fc_options))
                        return -EINVAL;

                if (copy_from_sockptr(&so->rxfc, optval, optlen))
                        return -EFAULT;

                return 0;

        case CAN_ISOTP_TX_STMIN:
                if (optlen != sizeof(u32))
                        return -EINVAL;

                if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen))
                        return -EFAULT;

                return 0;

        case CAN_ISOTP_RX_STMIN:
                if (optlen != sizeof(u32))
                        return -EINVAL;

                if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen))
                        return -EFAULT;

                return 0;

        case CAN_ISOTP_LL_OPTS:
                if (optlen != sizeof(struct can_isotp_ll_options))
                        return -EINVAL;

                /* validate a local copy before touching the socket */
                if (copy_from_sockptr(&ll, optval, optlen))
                        return -EFAULT;

                /* check for correct ISO 11898-1 DLC data length */
                if (ll.tx_dl != padlen(ll.tx_dl))
                        return -EINVAL;

                if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU)
                        return -EINVAL;

                /* CAN_MTU allows neither tx_dl > CAN_MAX_DLEN nor tx_flags */
                if (ll.mtu == CAN_MTU &&
                    (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0))
                        return -EINVAL;

                memcpy(&so->ll, &ll, sizeof(ll));

                /* set ll_dl for tx path to similar place as for rx */
                so->tx.ll_dl = ll.tx_dl;

                return 0;

        default:
                return -ENOPROTOOPT;
        }
}

static int isotp_setsockopt(struct socket *sock, int level, int optname,
                            sockptr_t optval, unsigned int optlen)
{
        struct sock *sk;
        int err;

        /* only options of the ISO-TP level are handled here */
        if (level != SOL_CAN_ISOTP)
                return -EINVAL;

        sk = sock->sk;

        /* the option handler runs with the socket lock held */
        lock_sock(sk);
        err = isotp_setsockopt_locked(sock, level, optname, optval, optlen);
        release_sock(sk);

        return err;
}

static int isotp_getsockopt(struct socket *sock, int level, int optname,
                            char __user *optval, int __user *optlen)
{
        struct isotp_sock *so = isotp_sk(sock->sk);
        const void *src;
        int avail;
        int len;

        if (level != SOL_CAN_ISOTP)
                return -EINVAL;
        if (get_user(len, optlen))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* pick the option storage and its full size */
        switch (optname) {
        case CAN_ISOTP_OPTS:
                src = &so->opt;
                avail = sizeof(struct can_isotp_options);
                break;

        case CAN_ISOTP_RECV_FC:
                src = &so->rxfc;
                avail = sizeof(struct can_isotp_fc_options);
                break;

        case CAN_ISOTP_TX_STMIN:
                src = &so->force_tx_stmin;
                avail = sizeof(u32);
                break;

        case CAN_ISOTP_RX_STMIN:
                src = &so->force_rx_stmin;
                avail = sizeof(u32);
                break;

        case CAN_ISOTP_LL_OPTS:
                src = &so->ll;
                avail = sizeof(struct can_isotp_ll_options);
                break;

        default:
                return -ENOPROTOOPT;
        }

        /* never copy more than the caller's buffer or the option holds */
        len = min_t(int, len, avail);

        /* report the copied length first, then the option data itself */
        if (put_user(len, optlen) || copy_to_user(optval, src, len))
                return -EFAULT;

        return 0;
}

static void isotp_notify(struct isotp_sock *so, unsigned long msg,
                         struct net_device *dev)
{
        struct sock *sk = &so->sk;
        struct net *net = dev_net(dev);
        int err;

        /* ignore devices of other namespaces and devices we are not bound to */
        if (!net_eq(net, sock_net(sk)) || so->ifindex != dev->ifindex)
                return;

        switch (msg) {
        case NETDEV_UNREGISTER:
                lock_sock(sk);
                if (so->bound) {
                        /* drop the receive filters installed for this socket */
                        if (isotp_register_rxid(so))
                                can_rx_unregister(net, dev, so->rxid,
                                                  SINGLE_MASK(so->rxid),
                                                  isotp_rcv, sk);

                        can_rx_unregister(net, dev, so->txid,
                                          SINGLE_MASK(so->txid),
                                          isotp_rcv_echo, sk);
                }

                /* the socket is detached from the device from now on */
                so->ifindex = 0;
                so->bound  = 0;
                release_sock(sk);

                err = ENODEV;
                break;

        case NETDEV_DOWN:
                err = ENETDOWN;
                break;

        default:
                return;
        }

        /* record the error and report it unless the socket is already dead */
        sk->sk_err = err;
        if (!sock_flag(sk, SOCK_DEAD))
                sk_error_report(sk);
}

/*
 * Netdevice notifier callback: forwards NETDEV_UNREGISTER and NETDEV_DOWN
 * events of CAN devices to every socket on isotp_notifier_list by calling
 * isotp_notify() for each entry.
 *
 * Always returns NOTIFY_DONE.
 */
static int isotp_notifier(struct notifier_block *nb, unsigned long msg,
                          void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        /* only CAN devices, and only the two events isotp_notify() handles */
        if (dev->type != ARPHRD_CAN)
                return NOTIFY_DONE;
        if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
                return NOTIFY_DONE;
        if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */
                return NOTIFY_DONE;

        /*
         * isotp_busy_notifier doubles as the list cursor: it is non-NULL for
         * the whole walk and is reset to NULL only once the walk is done,
         * which is what the reentrancy check above relies on.
         */
        spin_lock(&isotp_notifier_lock);
        list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) {
                /* isotp_notify() takes lock_sock(); call it without the spinlock */
                spin_unlock(&isotp_notifier_lock);
                isotp_notify(isotp_busy_notifier, msg, dev);
                spin_lock(&isotp_notifier_lock);
        }
        isotp_busy_notifier = NULL;
        spin_unlock(&isotp_notifier_lock);
        return NOTIFY_DONE;
}

static int isotp_init(struct sock *sk)
{
        struct isotp_sock *so = isotp_sk(sk);

        /* a fresh socket is not attached to any interface */
        so->bound = 0;
        so->ifindex = 0;

        /* both directions start idle and use the buffers embedded in the socket */
        so->rx.state = ISOTP_IDLE;
        so->rx.buf = so->rx.sbuf;
        so->rx.buflen = ARRAY_SIZE(so->rx.sbuf);

        so->tx.state = ISOTP_IDLE;
        so->tx.buf = so->tx.sbuf;
        so->tx.buflen = ARRAY_SIZE(so->tx.sbuf);

        /* general protocol options */
        so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS;
        so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
        so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
        so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
        so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
        so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
        so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;

        /* flow control parameters used for reception */
        so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
        so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
        so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;

        /* link layer settings; the tx path keeps its own copy of tx_dl */
        so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU;
        so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS;
        so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL;
        so->tx.ll_dl = so->ll.tx_dl;

        /* timers, wait queue and rx lock */
        hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL_SOFT);
        hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL_SOFT);
        hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC,
                      HRTIMER_MODE_REL_SOFT);

        init_waitqueue_head(&so->wait);
        spin_lock_init(&so->rx_lock);

        /* last step: put the socket on the list walked by the notifier */
        spin_lock(&isotp_notifier_lock);
        list_add_tail(&so->notifier, &isotp_notifier_list);
        spin_unlock(&isotp_notifier_lock);

        return 0;
}

static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
        struct isotp_sock *so = isotp_sk(sock->sk);
        __poll_t events;

        events = datagram_poll(file, sock, wait);
        poll_wait(file, &so->wait, wait);

        /* nothing to correct when writability was not reported at all */
        if (!(events & EPOLLWRNORM))
                return events;

        /* an idle transmitter really is writable */
        if (so->tx.state == ISOTP_IDLE)
                return events;

        /* a transfer is still in progress: hide the write readiness */
        return events & ~(EPOLLOUT | EPOLLWRNORM);
}

/*
 * ioctl handler of the ISO-TP socket: no command is handled here, every
 * request is answered with -ENOIOCTLCMD regardless of @cmd and @arg.
 */
static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
                                  unsigned long arg)
{
        /* no ioctls for socket layer -> hand it down to NIC layer */
        return -ENOIOCTLCMD;
}

/*
 * Socket operations of CAN_ISOTP sockets. Operations without an isotp_*
 * implementation point at the generic sock_no_*() helpers.
 */
static const struct proto_ops isotp_ops = {
        .family = PF_CAN,
        .release = isotp_release,
        .bind = isotp_bind,
        .connect = sock_no_connect,
        .socketpair = sock_no_socketpair,
        .accept = sock_no_accept,
        .getname = isotp_getname,
        .poll = isotp_poll,
        .ioctl = isotp_sock_no_ioctlcmd,
        .gettstamp = sock_gettstamp,
        .listen = sock_no_listen,
        .shutdown = sock_no_shutdown,
        .setsockopt = isotp_setsockopt,
        .getsockopt = isotp_getsockopt,
        .sendmsg = isotp_sendmsg,
        .recvmsg = isotp_recvmsg,
        .mmap = sock_no_mmap,
};

/*
 * Protocol descriptor: each socket object has the size of struct isotp_sock
 * and is set up by isotp_init().
 */
static struct proto isotp_proto __read_mostly = {
        .name = "CAN_ISOTP",
        .owner = THIS_MODULE,
        .obj_size = sizeof(struct isotp_sock),
        .init = isotp_init,
};

/*
 * CAN protocol registration record: ties SOCK_DGRAM / CAN_ISOTP to the
 * socket operations (isotp_ops) and the protocol descriptor (isotp_proto).
 * Registered in isotp_module_init() and removed in isotp_module_exit().
 */
static const struct can_proto isotp_can_proto = {
        .type = SOCK_DGRAM,
        .protocol = CAN_ISOTP,
        .ops = &isotp_ops,
        .prot = &isotp_proto,
};

/* Notifier block that routes netdevice events to isotp_notifier(). */
static struct notifier_block canisotp_notifier = {
        .notifier_call = isotp_notifier
};

/*
 * Module initialisation: clamp the max_pdu_size setting, then register the
 * netdevice notifier and the CAN_ISOTP protocol.
 *
 * Returns 0 on success or a negative error code. On failure nothing is left
 * registered.
 */
static __init int isotp_module_init(void)
{
        int err;

        /* keep max_pdu_size within [MAX_12BIT_PDU_SIZE, MAX_PDU_SIZE] */
        max_pdu_size = max_t(unsigned int, max_pdu_size, MAX_12BIT_PDU_SIZE);
        max_pdu_size = min_t(unsigned int, max_pdu_size, MAX_PDU_SIZE);

        pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size);

        /*
         * Register the notifier first and check the result, so that a
         * failure is reported to the caller instead of being ignored. With
         * no protocol registered yet no socket exists, so the notifier has
         * nothing to walk until the registration below succeeds.
         */
        err = register_netdevice_notifier(&canisotp_notifier);
        if (err)
                return err;

        err = can_proto_register(&isotp_can_proto);
        if (err < 0) {
                pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(err));
                /* undo the notifier registration on the error path */
                unregister_netdevice_notifier(&canisotp_notifier);
        }

        return err;
}

/*
 * Module teardown: unregister the CAN_ISOTP protocol and then the netdevice
 * notifier.
 */
static __exit void isotp_module_exit(void)
{
        can_proto_unregister(&isotp_can_proto);
        unregister_netdevice_notifier(&canisotp_notifier);
}

/* entry points used when the module is loaded and unloaded */
module_init(isotp_module_init);
module_exit(isotp_module_exit);

















































































































































































   63 











  318 
   24 
































































































































































































































  256 


    1 
























































   57 



  220 
  221 
  220 
  220 











  610 




















































































































































































































































  192 

















  321 

























































































































































































































































  231 
   16 















  120 
   31 






























































    4 












    8 
    4 















































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
 *
 * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H

#include <linux/fs.h>
#include <linux/khugepaged.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/pagewalk.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include <linux/tracepoint-defs.h>

/* Internal core VMA manipulation functions. */
#include "vma.h"

struct folio_batch;

/*
 * Maintains state across a page table move. The operation assumes both source
 * and destination VMAs already exist and are specified by the user.
 *
 * Partial moves are permitted, but the old and new ranges must both reside
 * within a VMA.
 *
 * mmap lock must be held in write and VMA write locks must be held on any VMA
 * that is visible.
 *
 * Use the PAGETABLE_MOVE() macro to initialise this struct.
 *
 * The old_addr and new_addr fields are updated as the page table move is
 * executed.
 *
 * NOTE: The page table move is effected by reading from [old_addr, old_end),
 * and old_addr may be updated for better page table alignment, so len_in
 * represents the length of the range being copied as specified by the user.
 */
struct pagetable_move_control {
        struct vm_area_struct *old; /* Source VMA. */
        struct vm_area_struct *new; /* Destination VMA. */
        unsigned long old_addr; /* Address from which the move begins. */
        unsigned long old_end; /* Exclusive address at which old range ends. */
        unsigned long new_addr; /* Address to move page tables to. */
        unsigned long len_in; /* Bytes to remap specified by user. */

        /* PAGETABLE_MOVE() does not set the flags below, so they start out false. */
        bool need_rmap_locks; /* Do rmap locks need to be taken? */
        bool for_stack; /* Is this an early temp stack being moved? */
};

/*
 * Declare and initialise a struct pagetable_move_control called @name.
 *
 * old_end is derived as old_addr_ + len_; len_ is also stored in len_in.
 * Members not listed here (need_rmap_locks, for_stack) are left
 * zero-initialised.
 */
#define PAGETABLE_MOVE(name, old_, new_, old_addr_, new_addr_, len_)        \
        struct pagetable_move_control name = {                                \
                .old = old_,                                                \
                .new = new_,                                                \
                .old_addr = old_addr_,                                        \
                .old_end = (old_addr_) + (len_),                        \
                .new_addr = new_addr_,                                        \
                .len_in = len_,                                                \
        }

/*
 * The set of flags that only affect watermark checking and reclaim
 * behaviour. This is used by the MM to obey the caller constraints
 * about IO, FS and watermark checking while ignoring placement
 * hints such as HIGHMEM usage.
 */
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
                        __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
                        __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
                        __GFP_NOLOCKDEP)

/*
 * The GFP flags allowed during early boot: every valid GFP bit except
 * __GFP_RECLAIM, __GFP_IO and __GFP_FS.
 */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))

/* Control allocation cpuset and node placement constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/*
 * Do not use these with a slab allocator: __GFP_DMA32, __GFP_HIGHMEM and
 * any bit outside __GFP_BITS_MASK.
 */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/*
 * Different from WARN_ON_ONCE(), no warning will be issued
 * when we specify __GFP_NOWARN. In that case the "warned once" state is
 * left untouched as well.
 *
 * @cond: condition to test; evaluated exactly once.
 * @gfp:  GFP flags to check for __GFP_NOWARN. It is parenthesised in the
 *        expansion so that a compound expression such as "a | b" is masked
 *        as a whole rather than only its last operand.
 *
 * Evaluates to the truth value of @cond, whether or not a warning was
 * issued.
 */
#define WARN_ON_ONCE_GFP(cond, gfp)        ({                                \
        static bool __section(".data..once") __warned;                        \
        int __ret_warn_once = !!(cond);                                        \
                                                                        \
        if (unlikely(!((gfp) & __GFP_NOWARN) && __ret_warn_once && !__warned)) { \
                __warned = true;                                        \
                WARN_ON(1);                                                \
        }                                                                \
        unlikely(__ret_warn_once);                                        \
})

void page_writeback_init(void);

/*
 * If a 16GB hugetlb folio were mapped by PTEs of all of its 4kB pages,
 * its nr_pages_mapped would be 0x400000: choose the ENTIRELY_MAPPED bit
 * above that range, instead of 2*(PMD_SIZE/PAGE_SIZE).  Hugetlb currently
 * leaves nr_pages_mapped at 0, but avoid surprise if it participates later.
 */
#define ENTIRELY_MAPPED                0x800000        /* single bit: bit 23 */
#define FOLIO_PAGES_MAPPED        (ENTIRELY_MAPPED - 1)        /* mask of all bits below it: 0x7fffff */

/*
 * Flags passed to __show_mem() and show_free_areas() to suppress output in
 * various contexts.
 */
#define SHOW_MEM_FILTER_NODES                (0x0001u)        /* disallowed nodes */

/*
 * How many individual pages have an elevated _mapcount.  Excludes
 * the folio's entire_mapcount.
 *
 * Don't use this function outside of debugging code.
 */
static inline int folio_nr_pages_mapped(const struct folio *folio)
{
        int raw;

        /* no answer is available with CONFIG_NO_PAGE_MAPCOUNT */
        if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT))
                return -1;

        /* keep only the bits below ENTIRELY_MAPPED */
        raw = atomic_read(&folio->_nr_pages_mapped);
        return raw & FOLIO_PAGES_MAPPED;
}

/*
 * Retrieve the first entry of a folio based on a provided entry within the
 * folio. We cannot rely on folio->swap as there is no guarantee that it has
 * been initialized. Used for calling arch_swap_restore()
 */
static inline swp_entry_t folio_swap(swp_entry_t entry,
                const struct folio *folio)
{
        /* round the entry value down to a multiple of the folio's page count */
        return (swp_entry_t){
                .val = ALIGN_DOWN(entry.val, folio_nr_pages(folio)),
        };
}

static inline void *folio_raw_mapping(const struct folio *folio)
{
        /* strip the PAGE_MAPPING_FLAGS bits from the mapping pointer */
        return (void *)((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS);
}

/*
 * This is a file-backed mapping, and is about to be memory mapped - invoke its
 * mmap hook and safely handle error conditions. On error, VMA hooks will be
 * mutated.
 *
 * @file: File which backs the mapping.
 * @vma:  VMA which we are mapping.
 *
 * Returns: 0 if success, error otherwise.
 */
static inline int mmap_file(struct file *file, struct vm_area_struct *vma)
{
        int err;

        err = call_mmap(file, vma);
        if (unlikely(err)) {
                /*
                 * The file's mmap() hook failed. The mapping is in an
                 * inconsistent state and we must not invoke any further
                 * hooks on it, so install the dummy operations.
                 */
                vma->vm_ops = &vma_dummy_vm_ops;
        }

        return err;
}

/*
 * If the VMA has a close hook then close it, and since closing it might leave
 * it in an inconsistent state which makes the use of any hooks suspect, clear
 * them down by installing dummy empty hooks.
 */
static inline void vma_close(struct vm_area_struct *vma)
{
        /* nothing to do without a close hook */
        if (!vma->vm_ops || !vma->vm_ops->close)
                return;

        vma->vm_ops->close(vma);

        /*
         * After close() the mapping is in an inconsistent state; swap in the
         * dummy operations so that no further hook can be invoked on it.
         */
        vma->vm_ops = &vma_dummy_vm_ops;
}

#ifdef CONFIG_MMU

/* Flags for folio_pte_batch(): single-bit values that are tested with '&'. */
typedef int __bitwise fpb_t;

/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */
#define FPB_IGNORE_DIRTY                ((__force fpb_t)BIT(0))

/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */
#define FPB_IGNORE_SOFT_DIRTY                ((__force fpb_t)BIT(1))

static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags)
{
        pte_t masked = pte;

        /* optional: drop the dirty and soft-dirty bits when asked to */
        if (flags & FPB_IGNORE_DIRTY)
                masked = pte_mkclean(masked);
        if (likely(flags & FPB_IGNORE_SOFT_DIRTY))
                masked = pte_clear_soft_dirty(masked);

        /* unconditional: mark old, then write-protect */
        masked = pte_mkold(masked);
        return pte_wrprotect(masked);
}

/**
 * folio_pte_batch - detect a PTE batch for a large folio
 * @folio: The large folio to detect a PTE batch for.
 * @addr: The user virtual address the first page is mapped at.
 * @start_ptep: Page table pointer for the first entry.
 * @pte: Page table entry for the first page.
 * @max_nr: The maximum number of table entries to consider.
 * @flags: Flags to modify the PTE batch semantics.
 * @any_writable: Optional pointer to indicate whether any entry except the
 *                  first one is writable.
 * @any_young: Optional pointer to indicate whether any entry except the
 *                  first one is young.
 * @any_dirty: Optional pointer to indicate whether any entry except the
 *                  first one is dirty.
 *
 * Detect a PTE batch: consecutive (present) PTEs that map consecutive
 * pages of the same large folio.
 *
 * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN,
 * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and
 * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY).
 *
 * start_ptep must map any page of the folio. max_nr must be at least one and
 * must be limited by the caller so scanning cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
 */
static inline int folio_pte_batch(struct folio *folio, unsigned long addr,
                pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags,
                bool *any_writable, bool *any_young, bool *any_dirty)
{
        /* first PFN past the folio; @addr is not referenced in this body */
        unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio);
        const pte_t *end_ptep = start_ptep + max_nr;
        pte_t expected_pte, *ptep;
        /* each of these is only assigned and read when its any_* pointer is set */
        bool writable, young, dirty;
        int nr;

        /* the optional outputs start out false and are only ever OR-ed into */
        if (any_writable)
                *any_writable = false;
        if (any_young)
                *any_young = false;
        if (any_dirty)
                *any_dirty = false;

        VM_WARN_ON_FOLIO(!pte_present(pte), folio);
        VM_WARN_ON_FOLIO(!folio_test_large(folio) || max_nr < 1, folio);
        VM_WARN_ON_FOLIO(page_folio(pfn_to_page(pte_pfn(pte))) != folio, folio);

        /*
         * The first entry, plus whatever pte_batch_hint() reports for it, is
         * taken without comparison. expected_pte is what the next entry must
         * look like: the first PTE advanced by nr PFNs, ignored bits cleared.
         */
        nr = pte_batch_hint(start_ptep, pte);
        expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags);
        ptep = start_ptep + nr;

        while (ptep < end_ptep) {
                pte = ptep_get(ptep);
                /* sample the bits of interest before they are cleared below */
                if (any_writable)
                        writable = !!pte_write(pte);
                if (any_young)
                        young = !!pte_young(pte);
                if (any_dirty)
                        dirty = !!pte_dirty(pte);
                pte = __pte_batch_clear_ignored(pte, flags);

                if (!pte_same(pte, expected_pte))
                        break;

                /*
                 * Stop immediately once we reached the end of the folio. In
                 * corner cases the next PFN might fall into a different
                 * folio.
                 */
                if (pte_pfn(pte) >= folio_end_pfn)
                        break;

                /* the entry belongs to the batch: fold its bits into the outputs */
                if (any_writable)
                        *any_writable |= writable;
                if (any_young)
                        *any_young |= young;
                if (any_dirty)
                        *any_dirty |= dirty;

                nr = pte_batch_hint(ptep, pte);
                expected_pte = pte_advance_pfn(expected_pte, nr);
                ptep += nr;
        }

        /* ptep advances in steps of the batch hint, so cap the count at max_nr */
        return min(ptep - start_ptep, max_nr);
}

/**
 * pte_move_swp_offset - Move the swap entry offset field of a swap pte
 *         forward or backward by delta
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *         non_swap_entry() must be false.
 * @delta: The direction and the offset we are moving; forward if delta
 *         is positive; backward if delta is negative
 *
 * Moves the swap offset, while maintaining all other fields, including
 * swap type, and any swp pte bits. The resulting pte is returned.
 */
static inline pte_t pte_move_swp_offset(pte_t pte, long delta)
{
        const swp_entry_t old = pte_to_swp_entry(pte);
        swp_entry_t moved;
        pte_t result;

        /* same swap type, offset shifted by delta */
        moved = __swp_entry(swp_type(old), (swp_offset(old) + delta));
        result = __swp_entry_to_pte(moved);

        /* carry the swap pte bits of the original over to the new pte */
        if (pte_swp_soft_dirty(pte))
                result = pte_swp_mksoft_dirty(result);
        if (pte_swp_exclusive(pte))
                result = pte_swp_mkexclusive(result);
        if (pte_swp_uffd_wp(pte))
                result = pte_swp_mkuffd_wp(result);

        return result;
}


/**
 * pte_next_swp_offset - Increment the swap entry offset field of a swap pte.
 * @pte: The initial pte state; is_swap_pte(pte) must be true and
 *         non_swap_entry() must be false.
 *
 * Increments the swap offset, while maintaining all other fields, including
 * swap type, and any swp pte bits. The resulting pte is returned.
 */
static inline pte_t pte_next_swp_offset(pte_t pte)
{
        /* "next" is simply a move by +1 */
        return pte_move_swp_offset(pte, 1);
}

/**
 * swap_pte_batch - detect a PTE batch for a set of contiguous swap entries
 * @start_ptep: Page table pointer for the first entry.
 * @max_nr: The maximum number of table entries to consider.
 * @pte: Page table entry for the first entry.
 *
 * Detect a batch of contiguous swap entries: consecutive (non-present) PTEs
 * containing swap entries all with consecutive offsets and targeting the same
 * swap type, all with matching swp pte bits.
 *
 * max_nr must be at least one and must be limited by the caller so scanning
 * cannot exceed a single page table.
 *
 * Return: the number of table entries in the batch.
 */
static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
        pte_t want = pte_next_swp_offset(pte);
        const pte_t *limit = start_ptep + max_nr;
        swp_entry_t first = pte_to_swp_entry(pte);
        unsigned short memcg_id;
        pte_t *cur;

        VM_WARN_ON(max_nr < 1);
        VM_WARN_ON(!is_swap_pte(pte));
        VM_WARN_ON(non_swap_entry(first));

        /* every entry of the batch must share the cgroup id of the first one */
        memcg_id = lookup_swap_cgroup_id(first);

        /* the first entry is part of the batch by definition; scan from the second */
        for (cur = start_ptep + 1; cur < limit; cur++) {
                pte_t found = ptep_get(cur);

                if (!pte_same(found, want))
                        break;
                if (lookup_swap_cgroup_id(pte_to_swp_entry(found)) != memcg_id)
                        break;

                want = pte_next_swp_offset(want);
        }

        return cur - start_ptep;
}
#endif /* CONFIG_MMU */

void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
                                                int nr_throttled);
/*
 * Forward @folio to __acct_reclaim_writeback(), but only when the folio's
 * node currently has a non-zero nr_writeback_throttled count.
 */
static inline void acct_reclaim_writeback(struct folio *folio)
{
        pg_data_t *node = folio_pgdat(folio);
        int throttled = atomic_read(&node->nr_writeback_throttled);

        if (!throttled)
                return;

        __acct_reclaim_writeback(node, folio, throttled);
}

/*
 * Wake waiters on the node's VMSCAN_THROTTLE_ISOLATED reclaim wait queue,
 * skipping the wake-up call when the queue has no active waiters.
 */
static inline void wake_throttle_isolated(pg_data_t *pgdat)
{
        wait_queue_head_t *isolated_wq =
                &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];

        if (!waitqueue_active(isolated_wq))
                return;

        wake_up(isolated_wq);
}

vm_fault_t __vmf_anon_prepare(struct vm_fault *vmf);
/*
 * Run __vmf_anon_prepare() and, if it asks for a retry, call vma_end_read()
 * on the faulting vma before handing the result back to the caller.
 */
static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
        const vm_fault_t status = __vmf_anon_prepare(vmf);

        if (unlikely(status & VM_FAULT_RETRY))
                vma_end_read(vmf->vma);

        return status;
}

vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void deactivate_file_folio(struct folio *folio);
void folio_activate(struct folio *folio);

void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *start_vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);

struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
                             struct vm_area_struct *vma,
                             unsigned long addr, unsigned long end,
                             struct zap_details *details);
int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
                           gfp_t gfp);

void page_cache_ra_order(struct readahead_control *, struct file_ra_state *,
                unsigned int order);
void force_page_cache_ra(struct readahead_control *, unsigned long nr);
/*
 * Convenience wrapper around force_page_cache_ra(): declares a
 * readahead_control named ractl with DEFINE_READAHEAD() from @file,
 * &@file->f_ra, @mapping and @index, then passes @nr_to_read on as the
 * nr argument of force_page_cache_ra().
 */
static inline void force_page_cache_readahead(struct address_space *mapping,
                struct file *file, pgoff_t index, unsigned long nr_to_read)
{
        DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, index);
        force_page_cache_ra(&ractl, nr_to_read);
}

unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
                loff_t end);
long mapping_evict_folio(struct address_space *mapping, struct folio *folio);
unsigned long mapping_try_invalidate(struct address_space *mapping,
                pgoff_t start, pgoff_t end, unsigned long *nr_failed);

/**
 * folio_evictable - Test whether a folio is evictable.
 * @folio: The folio to test.
 *
 * Test whether @folio is evictable -- i.e., should be placed on
 * active/inactive lists vs unevictable list.
 *
 * Reasons folio might not be evictable:
 * 1. folio's mapping marked unevictable
 * 2. One of the pages in the folio is part of an mlocked VMA
 */
static inline bool folio_evictable(struct folio *folio)
{
        bool ret;

        /* Prevent address_space of inode and swap cache from being freed */
        rcu_read_lock();
        ret = !mapping_unevictable(folio_mapping(folio)) &&
                        !folio_test_mlocked(folio);
        rcu_read_unlock();
        return ret;
}

/*
 * Turn a non-refcounted page (->_refcount == 0) into refcounted with
 * a count of one.
 */
static inline void set_page_refcounted(struct page *page)
{
        VM_BUG_ON_PAGE(PageTail(page), page);
        VM_BUG_ON_PAGE(page_ref_count(page), page);
        set_page_count(page, 1);
}

/*
 * Return true if a folio needs ->release_folio() calling upon it.
 */
static inline bool folio_needs_release(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        return folio_has_private(folio) ||
                (mapping && mapping_release_always(mapping));
}

extern unsigned long highest_memmap_pfn;

/*
 * Maximum number of reclaim retries without progress before the OOM
 * killer is considered the only way forward.
 */
#define MAX_RECLAIM_RETRIES 16

/*
 * in mm/vmscan.c:
 */
bool folio_isolate_lru(struct folio *folio);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);

/*
 * in mm/rmap.c:
 */
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);

/*
 * in mm/page_alloc.c
 */
#define K(x) ((x) << (PAGE_SHIFT-10))

extern char * const zone_names[MAX_NR_ZONES];

/* perform sanity checks on struct pages being allocated or freed */
DECLARE_STATIC_KEY_MAYBE(CONFIG_DEBUG_VM, check_pages_enabled);

extern int min_free_kbytes;
extern int defrag_mode;

void setup_per_zone_wmarks(void);
void calculate_min_free_kbytes(void);
int __meminit init_per_zone_wmark_min(void);
void page_alloc_sysctl_init(void);

/*
 * Structure for holding the mostly immutable allocation parameters passed
 * between functions involved in allocations, including the alloc_pages*
 * family of functions.
 *
 * nodemask, migratetype and highest_zoneidx are initialized only once in
 * __alloc_pages() and then never change.
 *
 * zonelist, preferred_zoneref and highest_zoneidx are set first in
 * __alloc_pages() for the fast path, and might be later changed
 * in __alloc_pages_slowpath(). All other functions pass the whole structure
 * by a const pointer.
 */
struct alloc_context {
        struct zonelist *zonelist;
        nodemask_t *nodemask;
        struct zoneref *preferred_zoneref;
        int migratetype;

        /*
         * highest_zoneidx represents the highest usable zone index of
         * the allocation request. Due to the nature of the zone,
         * memory in zones lower than highest_zoneidx will be
         * protected by lowmem_reserve[highest_zoneidx].
         *
         * highest_zoneidx is also used by reclaim/compaction to limit
         * the target zone, since zones higher than this index cannot be
         * used for this allocation request.
         */
        enum zone_type highest_zoneidx;
        bool spread_dirty_pages;
};

/*
 * This function returns the order of a free page in the buddy system. In
 * general, page_zone(page)->lock must be held by the caller to prevent the
 * page from being allocated in parallel and returning garbage as the order.
 * If a caller does not hold page_zone(page)->lock, it must guarantee that the
 * page cannot be allocated or merged in parallel. Alternatively, it must
 * handle invalid values gracefully, and use buddy_order_unsafe() below.
 */
static inline unsigned int buddy_order(struct page *page)
{
        /* PageBuddy() must be checked by the caller */
        return page_private(page);
}

/*
 * Like buddy_order(), but for callers who cannot afford to hold the zone lock.
 * PageBuddy() should be checked first by the caller to minimize race window,
 * and invalid values must be handled gracefully.
 *
 * READ_ONCE is used so that if the caller assigns the result into a local
 * variable and e.g. tests it for valid range before using, the compiler cannot
 * decide to remove the variable and inline the page_private(page) multiple
 * times, potentially observing different values in the tests and the actual
 * use of the result.
 */
#define buddy_order_unsafe(page)        READ_ONCE(page_private(page))

/*
 * Check whether @buddy is free and can be coalesced with @page at @order.
 * A page and its buddy can be merged if
 * (a) the buddy is not in a hole (check before calling!) &&
 * (b) the buddy is in the buddy system (or is a guard page) &&
 * (c) a page and its buddy have the same order &&
 * (d) a page and its buddy are in the same zone.
 *
 * Membership in the buddy system is recorded with PageBuddy.
 * Setting, clearing, and testing PageBuddy is serialized by zone->lock.
 *
 * The order of a page is recorded in page_private(page).
 */
static inline bool page_is_buddy(struct page *page, struct page *buddy,
                                 unsigned int order)
{
        /* (b): neither a guard page nor on a free list, so nothing to merge */
        if (!(page_is_guard(buddy) || PageBuddy(buddy)))
                return false;

        /* (c): orders have to match exactly */
        if (buddy_order(buddy) != order)
                return false;

        /*
         * (d): the zone comparison comes last so that zone/node ids are not
         * computed for pages that could never merge anyway.
         */
        if (page_zone_id(page) != page_zone_id(buddy))
                return false;

        VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);

        return true;
}

/*
 * Locate the struct page for both the matching buddy in our
 * pair (buddy1) and the combined O(n+1) page they form (page).
 *
 * 1) Any buddy B1 will have an order O twin B2 which satisfies
 * the following equation:
 *     B2 = B1 ^ (1 << O)
 * For example, if the starting buddy (buddy2) is #8 its order
 * 1 buddy is #10:
 *     B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
 *
 * 2) Any buddy B will have an order O+1 parent P which
 * satisfies the following equation:
 *     P = B & ~(1 << O)
 *
 * Assumption: *_mem_map is contiguous at least up to MAX_PAGE_ORDER
 *
 * @page_pfn: pfn of the page whose buddy is wanted.
 * @order: order of the page; must be smaller than the bit width of
 *         unsigned long.
 *
 * The order bit is built as an unsigned long so that the shift happens in
 * the same type as the pfn. A plain int shift would overflow at order 31
 * and sign-extend into the upper pfn bits when widened.
 *
 * Return: the pfn of the order-@order buddy of @page_pfn.
 */
static inline unsigned long
__find_buddy_pfn(unsigned long page_pfn, unsigned int order)
{
        return page_pfn ^ (1UL << order);
}

/*
 * Find the buddy of @page and validate it.
 * @page: The input page
 * @pfn: The pfn of the page, it saves a call to page_to_pfn() when the
 *       function is used in the performance-critical __free_one_page().
 * @order: The order of the page
 * @buddy_pfn: Optional output pointer for the buddy pfn (may be NULL); it
 *             also saves a call to page_to_pfn().
 *
 * The candidate buddy may turn out not to be PageBuddy, to lie outside
 * @page's zone, or to have a different order than @page, so it is validated
 * with page_is_buddy() before being returned. *@buddy_pfn is written even
 * when validation fails.
 *
 * Return: the found buddy page or NULL if not found.
 */
static inline struct page *find_buddy_page_pfn(struct page *page,
                        unsigned long pfn, unsigned int order, unsigned long *buddy_pfn)
{
        unsigned long candidate_pfn = __find_buddy_pfn(pfn, order);
        struct page *candidate = page + (candidate_pfn - pfn);

        if (buddy_pfn)
                *buddy_pfn = candidate_pfn;

        return page_is_buddy(page, candidate, order) ? candidate : NULL;
}

extern struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone);

/*
 * Return the struct page for @start_pfn. When @zone is flagged contiguous
 * the pfn is converted directly; otherwise the lookup is delegated to
 * __pageblock_pfn_to_page() with the full [@start_pfn, @end_pfn) range.
 */
static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
                                unsigned long end_pfn, struct zone *zone)
{
        if (!zone->contiguous)
                return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);

        return pfn_to_page(start_pfn);
}

void set_zone_contiguous(struct zone *zone);
bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
                           unsigned long nr_pages);

static inline void clear_zone_contiguous(struct zone *zone)
{
        zone->contiguous = false;
}

extern int __isolate_free_page(struct page *page, unsigned int order);
extern void __putback_isolated_page(struct page *page, unsigned int order,
                                    int mt);
extern void memblock_free_pages(struct page *page, unsigned long pfn,
                                        unsigned int order);
extern void __free_pages_core(struct page *page, unsigned int order,
                enum meminit_context context);

/*
 * Record @order in a large folio.
 *
 * This will have no effect, other than possibly generating a warning, if the
 * caller passes in a non-large folio. An @order of zero is rejected in the
 * same way.
 */
static inline void folio_set_order(struct folio *folio, unsigned int order)
{
        if (WARN_ON_ONCE(!order || !folio_test_large(folio)))
                return;

        /* Replace the low 8 bits of _flags_1 with the order. */
        folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order;
#ifdef NR_PAGES_IN_LARGE_FOLIO
        /* Keep _nr_pages equal to 2^order where that field exists. */
        folio->_nr_pages = 1U << order;
#endif
}

bool __folio_unqueue_deferred_split(struct folio *folio);
/*
 * Take @folio off the deferred split list if it is queued there.
 * Returns false without doing anything for folios of order <= 1, folios
 * that are not large-rmappable, or folios whose _deferred_list is empty;
 * otherwise returns the result of __folio_unqueue_deferred_split().
 */
static inline bool folio_unqueue_deferred_split(struct folio *folio)
{
        if (!(folio_order(folio) > 1 && folio_test_large_rmappable(folio)))
                return false;

        /*
         * At this point, there is no one trying to add the folio to
         * deferred_list. If folio is not in deferred_list, it's safe
         * to check without acquiring the split_queue_lock.
         */
        if (!data_race(list_empty(&folio->_deferred_list)))
                return __folio_unqueue_deferred_split(folio);

        return false;
}

/*
 * View @page as a folio and, when it is a non-NULL large folio, flag it as
 * large-rmappable. The (possibly NULL) folio pointer is returned either way.
 */
static inline struct folio *page_rmappable_folio(struct page *page)
{
        struct folio *folio = (struct folio *)page;

        if (!folio || !folio_test_large(folio))
                return folio;

        folio_set_large_rmappable(folio);
        return folio;
}

/*
 * Initialise the folio-level bookkeeping kept in the head page of a compound
 * page of the given @order: the order itself, the mapcount/pincount fields
 * and, for orders above 1, the deferred split list head.
 */
static inline void prep_compound_head(struct page *page, unsigned int order)
{
        struct folio *folio = (struct folio *)page;

        folio_set_order(folio, order);
        /* NOTE(review): -1 presumably encodes "no mappings" - confirm. */
        atomic_set(&folio->_large_mapcount, -1);
        if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                atomic_set(&folio->_nr_pages_mapped, 0);
        if (IS_ENABLED(CONFIG_MM_ID)) {
                folio->_mm_ids = 0;
                folio->_mm_id_mapcount[0] = -1;
                folio->_mm_id_mapcount[1] = -1;
        }
        /* Skipped only for order <= 1 on !CONFIG_64BIT builds. */
        if (IS_ENABLED(CONFIG_64BIT) || order > 1) {
                atomic_set(&folio->_pincount, 0);
                atomic_set(&folio->_entire_mapcount, -1);
        }
        /* The deferred list head is only initialised for order > 1. */
        if (order > 1)
                INIT_LIST_HEAD(&folio->_deferred_list);
}

/*
 * Set up the tail page at index @tail_idx of the compound page starting at
 * @head: mark its mapping as TAIL_MAPPING, link it to @head and zero its
 * private field.
 */
static inline void prep_compound_tail(struct page *head, int tail_idx)
{
        struct page *tail = &head[tail_idx];

        tail->mapping = TAIL_MAPPING;
        set_compound_head(tail, head);
        set_page_private(tail, 0);
}

void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags);
extern bool free_pages_prepare(struct page *page, unsigned int order);

extern int user_min_free_kbytes;

struct page *__alloc_frozen_pages_noprof(gfp_t, unsigned int order, int nid,
                nodemask_t *);
#define __alloc_frozen_pages(...) \
        alloc_hooks(__alloc_frozen_pages_noprof(__VA_ARGS__))
void free_frozen_pages(struct page *page, unsigned int order);
void free_unref_folios(struct folio_batch *fbatch);

#ifdef CONFIG_NUMA
struct page *alloc_frozen_pages_noprof(gfp_t, unsigned int order);
#else
static inline struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order)
{
        return __alloc_frozen_pages_noprof(gfp, order, numa_node_id(), NULL);
}
#endif

#define alloc_frozen_pages(...) \
        alloc_hooks(alloc_frozen_pages_noprof(__VA_ARGS__))

extern void zone_pcp_reset(struct zone *zone);
extern void zone_pcp_disable(struct zone *zone);
extern void zone_pcp_enable(struct zone *zone);
extern void zone_pcp_init(struct zone *zone);

extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
                          phys_addr_t min_addr,
                          int nid, bool exact_nid);

void memmap_init_range(unsigned long, int, unsigned long, unsigned long,
                unsigned long, enum meminit_context, struct vmem_altmap *, int);

#if defined CONFIG_COMPACTION || defined CONFIG_CMA

/*
 * in mm/compaction.c
 */
/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
        struct list_head freepages[NR_PAGE_ORDERS];        /* List of free pages to migrate to */
        struct list_head migratepages;        /* List of pages being migrated */
        unsigned int nr_freepages;        /* Number of isolated free pages */
        unsigned int nr_migratepages;        /* Number of pages to migrate */
        unsigned long free_pfn;                /* isolate_freepages search base */
        /*
         * Acts as an in/out parameter to page isolation for migration.
         * isolate_migratepages uses it as a search base.
         * isolate_migratepages_block will update the value to the next pfn
         * after the last isolated one.
         */
        unsigned long migrate_pfn;
        unsigned long fast_start_pfn;        /* a pfn to start linear scan from */
        struct zone *zone;
        unsigned long total_migrate_scanned;
        unsigned long total_free_scanned;
        unsigned short fast_search_fail;/* failures to use free list searches */
        short search_order;                /* order to start a fast search at */
        const gfp_t gfp_mask;                /* gfp mask of a direct compactor */
        int order;                        /* order a direct compactor needs */
        int migratetype;                /* migratetype of direct compactor */
        const unsigned int alloc_flags;        /* alloc flags of a direct compactor */
        const int highest_zoneidx;        /* zone index of a direct compactor */
        enum migrate_mode mode;                /* Async or sync migration mode */
        bool ignore_skip_hint;                /* Scan blocks even if marked skip */
        bool no_set_skip_hint;                /* Don't mark blocks for skipping */
        bool ignore_block_suitable;        /* Scan blocks considered unsuitable */
        bool direct_compaction;                /* False from kcompactd or /proc/... */
        bool proactive_compaction;        /* kcompactd proactive compaction */
        bool whole_zone;                /* Whole zone should/has been scanned */
        bool contended;                        /* Signal lock contention */
        bool finish_pageblock;                /* Scan the remainder of a pageblock. Used
                                         * when there are potentially transient
                                         * isolation or migration failures to
                                         * ensure forward progress.
                                         */
        bool alloc_contig;                /* alloc_contig_range allocation */
};

/*
 * Used in direct compaction when a page should be taken from the freelists
 * immediately when one is created during the free path.
 */
struct capture_control {
        struct compact_control *cc;
        struct page *page;
};

unsigned long
isolate_freepages_range(struct compact_control *cc,
                        unsigned long start_pfn, unsigned long end_pfn);
int
isolate_migratepages_range(struct compact_control *cc,
                           unsigned long low_pfn, unsigned long end_pfn);

/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void init_cma_reserved_pageblock(struct page *page);

#endif /* CONFIG_COMPACTION || CONFIG_CMA */

struct cma;

#ifdef CONFIG_CMA
void *cma_reserve_early(struct cma *cma, unsigned long size);
void init_cma_pageblock(struct page *page);
#else
static inline void *cma_reserve_early(struct cma *cma, unsigned long size)
{
        return NULL;
}
static inline void init_cma_pageblock(struct page *page)
{
}
#endif


int find_suitable_fallback(struct free_area *area, unsigned int order,
                        int migratetype, bool claim_only, bool *claim_block);

static inline bool free_area_empty(struct free_area *area, int migratetype)
{
        return list_empty(&area->free_list[migratetype]);
}

/* mm/util.c */
struct anon_vma *folio_anon_vma(const struct folio *folio);

#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, int *locked);
extern long faultin_page_range(struct mm_struct *mm, unsigned long start,
                unsigned long end, bool write, int *locked);
extern bool mlock_future_ok(struct mm_struct *mm, unsigned long flags,
                               unsigned long bytes);

/*
 * NOTE: This function can't tell whether the folio is "fully mapped" in the
 * range.
 * "Fully mapped" means all the pages of the folio are associated with the
 * page table of the range, whereas this function only checks whether the
 * folio's address range lies within [start, end). The caller needs to check
 * the page table itself if it cares about the page table association.
 *
 * Typical usage (like mlock or madvise) is:
 * The caller knows at least 1 page of the folio is associated with the page
 * table of the VMA and that the range [start, end) intersects the VMA range.
 * The caller wants to know whether the folio is fully associated with the
 * range. It calls this function first to check whether the folio is in the
 * range, then checks the page table to know whether the folio is fully
 * mapped to the range.
 */
static inline bool
folio_within_range(struct folio *folio, struct vm_area_struct *vma,
                unsigned long start, unsigned long end)
{
        pgoff_t folio_index, folio_addr;
        unsigned long vma_pglen = vma_pages(vma);

        VM_WARN_ON_FOLIO(folio_test_ksm(folio), folio);
        if (start > end)
                return false;

        /* Clamp the requested range to the vma. */
        if (vma->vm_start > start)
                start = vma->vm_start;
        if (vma->vm_end < end)
                end = vma->vm_end;

        /* The folio's first page must fall inside the vma's pgoff window. */
        folio_index = folio_pgoff(folio);
        if (!in_range(folio_index, vma->vm_pgoff, vma_pglen))
                return false;

        folio_addr = vma->vm_start +
                        ((folio_index - vma->vm_pgoff) << PAGE_SHIFT);

        /* In range iff it starts at/after start and the whole folio fits. */
        return folio_addr >= start && end - folio_addr >= folio_size(folio);
}

static inline bool
folio_within_vma(struct folio *folio, struct vm_area_struct *vma)
{
        return folio_within_range(folio, vma, vma->vm_start, vma->vm_end);
}

/*
 * mlock_vma_folio() and munlock_vma_folio():
 * should be called with vma's mmap_lock held for read or write,
 * under page table lock for the pte/pmd being added or removed.
 *
 * mlock is usually called at the end of folio_add_*_rmap_*(), munlock at
 * the end of folio_remove_rmap_*(); but new anon folios are managed by
 * folio_add_lru_vma() calling mlock_new_folio().
 */
void mlock_folio(struct folio *folio);
static inline void mlock_vma_folio(struct folio *folio,
                                struct vm_area_struct *vma)
{
        /*
         * The VM_SPECIAL check here serves two purposes.
         * 1) VM_IO check prevents migration from double-counting during mlock.
         * 2) Although mmap_region() and mlock_fixup() take care that VM_LOCKED
         *    is never left set on a VM_SPECIAL vma, there is an interval while
         *    file->f_op->mmap() is using vm_insert_page(s), when VM_LOCKED may
         *    still be set while VM_SPECIAL bits are added: so ignore it then.
         */
        if (unlikely((vma->vm_flags & (VM_LOCKED|VM_SPECIAL)) == VM_LOCKED))
                mlock_folio(folio);
}

void munlock_folio(struct folio *folio);
static inline void munlock_vma_folio(struct folio *folio,
                                        struct vm_area_struct *vma)
{
        /*
         * munlock if the function is called. Ideally, we should only
         * do munlock if any page of folio is unmapped from VMA and
         * cause folio not fully mapped to VMA.
         *
         * But it's not easy to confirm that's the situation. So we
         * always munlock the folio and page reclaim will correct it
         * if it's wrong.
         */
        if (unlikely(vma->vm_flags & VM_LOCKED))
                munlock_folio(folio);
}

void mlock_new_folio(struct folio *folio);
bool need_mlock_drain(int cpu);
void mlock_drain_local(void);
void mlock_drain_remote(int cpu);

extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);

/**
 * vma_address - Find the virtual address a page range is mapped at
 * @vma: The vma which maps this object.
 * @pgoff: The page offset within its object.
 * @nr_pages: The number of pages to consider.
 *
 * Return: the first address within @vma at which any page of the range
 * appears, or -EFAULT if no page of the range is mapped by @vma.
 */
static inline unsigned long vma_address(const struct vm_area_struct *vma,
                pgoff_t pgoff, unsigned long nr_pages)
{
        unsigned long address;

        if (pgoff < vma->vm_pgoff) {
                /*
                 * The range starts before the vma. Having established
                 * pgoff < vm_pgoff avoids the possibility of wrap to 0 on
                 * 32-bit in the test below.
                 */
                if (pgoff + nr_pages - 1 >= vma->vm_pgoff)
                        return vma->vm_start;
                return -EFAULT;
        }

        address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Check for address beyond vma (or wrapped through 0?) */
        if (address < vma->vm_start || address >= vma->vm_end)
                return -EFAULT;

        return address;
}

/*
 * Compute the user virtual address just past the walked range, i.e. the
 * first address at which none of the range will be found in the vma.
 * Assumes that vma_address() already returned a good starting address.
 */
static inline unsigned long vma_address_end(struct page_vma_mapped_walk *pvmw)
{
        struct vm_area_struct *vma = pvmw->vma;
        pgoff_t end_pgoff;
        unsigned long end_addr;

        /* Common case, plus ->pgoff is invalid for KSM */
        if (pvmw->nr_pages == 1)
                return pvmw->address + PAGE_SIZE;

        end_pgoff = pvmw->pgoff + pvmw->nr_pages;
        end_addr = vma->vm_start + ((end_pgoff - vma->vm_pgoff) << PAGE_SHIFT);

        /* Still inside the vma (and not wrapped through 0): use it as is */
        if (end_addr >= vma->vm_start && end_addr <= vma->vm_end)
                return end_addr;

        /* Otherwise clamp to the end of the vma */
        return vma->vm_end;
}

static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
                                                    struct file *fpin)
{
        int flags = vmf->flags;

        if (fpin)
                return fpin;

        /*
         * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
         * anything, so we only pin the file and drop the mmap_lock if only
         * FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
         */
        if (fault_flag_allow_retry_first(flags) &&
            !(flags & FAULT_FLAG_RETRY_NOWAIT)) {
                fpin = get_file(vmf->vma->vm_file);
                release_fault_lock(vmf);
        }
        return fpin;
}
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void mlock_new_folio(struct folio *folio) { }
static inline bool need_mlock_drain(int cpu) { return false; }
static inline void mlock_drain_local(void) { }
static inline void mlock_drain_remote(int cpu) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
{
}
#endif /* !CONFIG_MMU */

/* Memory initialisation debug and verification */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
DECLARE_STATIC_KEY_TRUE(deferred_pages);

bool __init deferred_grow_zone(struct zone *zone, unsigned int order);
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

enum mminit_level {
        MMINIT_WARNING,
        MMINIT_VERIFY,
        MMINIT_TRACE
};

#ifdef CONFIG_DEBUG_MEMORY_INIT

extern int mminit_loglevel;

#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
        if (level < mminit_loglevel) { \
                if (level <= MMINIT_WARNING) \
                        pr_warn("mminit::" prefix " " fmt, ##arg);        \
                else \
                        printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
        } \
} while (0)

extern void mminit_verify_pageflags_layout(void);
extern void mminit_verify_zonelist(void);
#else

static inline void mminit_dprintk(enum mminit_level level,
                                const char *prefix, const char *fmt, ...)
{
}

static inline void mminit_verify_pageflags_layout(void)
{
}

static inline void mminit_verify_zonelist(void)
{
}
#endif /* CONFIG_DEBUG_MEMORY_INIT */

#define NODE_RECLAIM_NOSCAN        -2
#define NODE_RECLAIM_FULL        -1
#define NODE_RECLAIM_SOME        0
#define NODE_RECLAIM_SUCCESS        1

#ifdef CONFIG_NUMA
extern int node_reclaim_mode;

extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
extern int find_next_best_node(int node, nodemask_t *used_node_mask);
#else
#define node_reclaim_mode 0

static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
                                unsigned int order)
{
        return NODE_RECLAIM_NOSCAN;
}
static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        return NUMA_NO_NODE;
}
#endif

static inline bool node_reclaim_enabled(void)
{
        /* Is any node_reclaim_mode bit set? */
        return node_reclaim_mode & (RECLAIM_ZONE|RECLAIM_WRITE|RECLAIM_UNMAP);
}

/*
 * mm/memory-failure.c
 */
#ifdef CONFIG_MEMORY_FAILURE
int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);

extern u32 hwpoison_filter_dev_major;
extern u32 hwpoison_filter_dev_minor;
extern u64 hwpoison_filter_flags_mask;
extern u64 hwpoison_filter_flags_value;
extern u64 hwpoison_filter_memcg;
extern u32 hwpoison_filter_enable;
#define MAGIC_HWPOISON        0x48575053U        /* HWPS */
void SetPageHWPoisonTakenOff(struct page *page);
void ClearPageHWPoisonTakenOff(struct page *page);
bool take_page_off_buddy(struct page *page);
bool put_page_back_buddy(struct page *page);
struct task_struct *task_early_kill(struct task_struct *tsk, int force_early);
void add_to_kill_ksm(struct task_struct *tsk, const struct page *p,
                     struct vm_area_struct *vma, struct list_head *to_kill,
                     unsigned long ksm_addr);
unsigned long page_mapped_in_vma(const struct page *page,
                struct vm_area_struct *vma);

#else
static inline int unmap_poisoned_folio(struct folio *folio, unsigned long pfn, bool must_kill)
{
        return -EBUSY;
}
#endif

extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
        unsigned long, unsigned long,
        unsigned long, unsigned long);

extern void set_pageblock_order(void);
struct folio *alloc_migrate_folio(struct folio *src, unsigned long private);
unsigned long reclaim_pages(struct list_head *folio_list);
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
                                            struct list_head *folio_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN                WMARK_MIN
#define ALLOC_WMARK_LOW                WMARK_LOW
#define ALLOC_WMARK_HIGH        WMARK_HIGH
#define ALLOC_NO_WATERMARKS        0x04 /* don't check watermarks at all */

/* Mask to get the watermark bits */
#define ALLOC_WMARK_MASK        (ALLOC_NO_WATERMARKS-1)

/*
 * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
 * cannot assume a reduced access to memory reserves is sufficient for
 * !MMU
 */
#ifdef CONFIG_MMU
#define ALLOC_OOM                0x08
#else
#define ALLOC_OOM                ALLOC_NO_WATERMARKS
#endif

#define ALLOC_NON_BLOCK                 0x10 /* Caller cannot block. Allow access
                                       * to 25% of the min watermark or
                                       * 62.5% if __GFP_HIGH is set.
                                       */
#define ALLOC_MIN_RESERVE         0x20 /* __GFP_HIGH set. Allow access to 50%
                                       * of the min watermark.
                                       */
#define ALLOC_CPUSET                 0x40 /* check for correct cpuset */
#define ALLOC_CMA                 0x80 /* allow allocations from CMA areas */
#ifdef CONFIG_ZONE_DMA32
#define ALLOC_NOFRAGMENT        0x100 /* avoid mixing pageblock types */
#else
#define ALLOC_NOFRAGMENT          0x0
#endif
#define ALLOC_HIGHATOMIC        0x200 /* Allows access to MIGRATE_HIGHATOMIC */
#define ALLOC_TRYLOCK                0x400 /* Only use spin_trylock in allocation path */
#define ALLOC_KSWAPD                0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */

/* Flags that allow allocations below the min watermark. */
#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)

enum ttu_flags;
struct tlbflush_unmap_batch;


/*
 * only for MM internal work items which do not depend on
 * any allocations or locks which might depend on allocations
 */
extern struct workqueue_struct *mm_percpu_wq;

#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
void try_to_unmap_flush(void);
void try_to_unmap_flush_dirty(void);
void flush_tlb_batched_pending(struct mm_struct *mm);
#else
/* No-op variant used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is unset. */
static inline void try_to_unmap_flush(void)
{
}
/* No-op variant used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is unset. */
static inline void try_to_unmap_flush_dirty(void)
{
}
/*
 * No-op variant used when CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH is unset;
 * @mm is ignored.
 */
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
{
}
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */

extern const struct trace_print_flags pageflag_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];

/* Tell whether @migratetype is exactly MIGRATE_HIGHATOMIC. */
static inline bool is_migrate_highatomic(enum migratetype migratetype)
{
        const bool highatomic = (migratetype == MIGRATE_HIGHATOMIC);

        return highatomic;
}

void setup_zone_pageset(struct zone *zone);

/*
 * Bundle of parameters describing a migration target.
 * NOTE(review): apart from @nid, the field roles below are inferred from
 * names and types only -- confirm against the users of this struct.
 */
struct migration_target_control {
        int nid;                /* preferred node id */
        nodemask_t *nmask;      /* presumably the candidate node set -- TODO confirm */
        gfp_t gfp_mask;         /* presumably GFP flags for the target allocation -- TODO confirm */
        enum migrate_reason reason;     /* migration reason, see enum migrate_reason */
};

/*
 * mm/filemap.c
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size);

/*
 * mm/vmalloc.c
 */
#ifdef CONFIG_MMU
void __init vmalloc_init(void);
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift);
unsigned int get_vm_area_page_order(struct vm_struct *vm);
#else
/* !CONFIG_MMU stub: nothing to initialise. */
static inline void vmalloc_init(void)
{
}

/*
 * !CONFIG_MMU stub: ignores all arguments and always fails with -EINVAL.
 */
static inline
int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end,
                pgprot_t prot, struct page **pages, unsigned int page_shift)
{
        return -EINVAL;
}
#endif

int __must_check __vmap_pages_range_noflush(unsigned long addr,
                               unsigned long end, pgprot_t prot,
                               struct page **pages, unsigned int page_shift);

void vunmap_range_noflush(unsigned long start, unsigned long end);

void __vunmap_range_noflush(unsigned long start, unsigned long end);

int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
                      unsigned long addr, int *flags, bool writable,
                      int *last_cpupid);

void free_zone_device_folio(struct folio *folio);
int migrate_device_coherent_folio(struct folio *folio);

struct vm_struct *__get_vm_area_node(unsigned long size,
                                     unsigned long align, unsigned long shift,
                                     unsigned long flags, unsigned long start,
                                     unsigned long end, int node, gfp_t gfp_mask,
                                     const void *caller);

/*
 * mm/gup.c
 */
int __must_check try_grab_folio(struct folio *folio, int refs,
                                unsigned int flags);

/*
 * mm/huge_memory.c
 */
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
               pud_t *pud, bool write);
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
               pmd_t *pmd, bool write);

/*
 * Convert a size string (memparse() syntax, so mem suffixes are accepted)
 * into an order. Handy for kernel parameters.
 *
 * Returns the order on success, or -EINVAL when the parsed size is not a
 * power of two or when the bit for the resulting order is not set in
 * @valid_orders.
 */
static inline int get_order_from_str(const char *size_str,
                                     unsigned long valid_orders)
{
        char *after;
        unsigned long bytes = memparse(size_str, &after);
        int ord;

        /* Only an exact power of two is accepted. */
        if (!is_power_of_2(bytes))
                return -EINVAL;

        ord = get_order(bytes);

        /* The order must be one the caller declared valid. */
        return (BIT(ord) & ~valid_orders) ? -EINVAL : ord;
}

/*
 * GUP flag bits 16..22. INTERNAL_GUP_FLAGS below is the OR of every flag
 * in this enum.
 * NOTE(review): presumably these are reserved for mm-internal callers and
 * kept disjoint from the public FOLL_* bits -- confirm.
 */
enum {
        /* mark page accessed */
        FOLL_TOUCH = 1 << 16,
        /* a retry, previous pass started an IO */
        FOLL_TRIED = 1 << 17,
        /* we are working on non-current tsk/mm */
        FOLL_REMOTE = 1 << 18,
        /* pages must be released via unpin_user_page */
        FOLL_PIN = 1 << 19,
        /* gup_fast: prevent fall-back to slow gup */
        FOLL_FAST_ONLY = 1 << 20,
        /* allow unlocking the mmap lock */
        FOLL_UNLOCKABLE = 1 << 21,
        /* VMA lookup+checks compatible with MADV_POPULATE_(READ|WRITE) */
        FOLL_MADV_POPULATE = 1 << 22,
};

/* Mask covering all of the flags defined in the enum above. */
#define INTERNAL_GUP_FLAGS (FOLL_TOUCH | FOLL_TRIED | FOLL_REMOTE | FOLL_PIN | \
                            FOLL_FAST_ONLY | FOLL_UNLOCKABLE | \
                            FOLL_MADV_POPULATE)

/*
 * Indicates for which pages that are write-protected in the page table,
 * whether GUP has to trigger unsharing via FAULT_FLAG_UNSHARE such that the
 * GUP pin will remain consistent with the pages mapped into the page tables
 * of the MM.
 *
 * Temporary unmapping of PageAnonExclusive() pages or clearing of
 * PageAnonExclusive() has to protect against concurrent GUP:
 * * Ordinary GUP: Using the PT lock
 * * GUP-fast and fork(): mm->write_protect_seq
 * * GUP-fast and KSM or temporary unmapping (swap, migration): see
 *    folio_try_share_anon_rmap_*()
 *
 * Must be called with the (sub)page that's actually referenced via the
 * page table entry, which might not necessarily be the head page for a
 * PTE-mapped THP.
 *
 * If the vma is NULL, we're coming from the GUP-fast path and might have
 * to fallback to the slow path just to lookup the vma.
 */
static inline bool gup_must_unshare(struct vm_area_struct *vma,
                                    unsigned int flags, struct page *page)
{
        const unsigned int pin_mode = flags & (FOLL_WRITE | FOLL_PIN);

        /*
         * Only a read-only pin (FOLL_PIN without FOLL_WRITE) can require
         * unsharing. FOLL_WRITE needs no handling here: the page table
         * entry then has to be writable, and a writable entry referencing
         * (part of) an anonymous folio requires that part to be marked
         * exclusive.
         */
        if (pin_mode != FOLL_PIN)
                return false;

        /*
         * PageAnon(page) stays stable until the page is actually getting
         * freed.
         */
        if (PageAnon(page)) {
                /* Pairs with a memory barrier in folio_try_share_anon_rmap_*(). */
                if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                        smp_rmb();

                /*
                 * KSM pages cannot be exclusive and, consequently, cannot
                 * get pinned.
                 */
                return !PageAnonExclusive(page);
        }

        /*
         * Not anonymous: only R/O long-term pinning matters. R/O
         * short-term pinning does not have the semantics to observe
         * successive changes through the process page tables.
         */
        if (!(flags & FOLL_LONGTERM))
                return false;

        /* The vma is needed to decide; without it report "must unshare". */
        if (!vma)
                return true;

        /*
         * Only writable private ("COW") mappings matter, where COW has to
         * be broken early.
         */
        return is_cow_mapping(vma->vm_flags);
}

extern bool mirrored_kernelcore;
bool memblock_has_mirror(void);
void memblock_free_all(void);

/*
 * Store a new [@start, @end) range and page offset @pgoff into @vma.
 * Only vm_start, vm_end and vm_pgoff are written.
 */
static __always_inline void vma_set_range(struct vm_area_struct *vma,
                                          unsigned long start, unsigned long end,
                                          pgoff_t pgoff)
{
        vma->vm_pgoff = pgoff;
        vma->vm_start = start;
        vma->vm_end = end;
}

/*
 * Report whether soft-dirty tracking is active for @vma.
 *
 * Soft-dirty is special in that tracking is enabled while VM_SOFTDIRTY is
 * NOT set in the vma flags.
 */
static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
{
        /*
         * The config test has to come first: without soft-dirty compiled
         * in, VM_SOFTDIRTY is defined as 0x0 and the flag test on its own
         * would be constantly true.
         */
        return IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
               !(vma->vm_flags & VM_SOFTDIRTY);
}

/*
 * True when soft-dirty tracking is enabled for @vma and @pmd does not have
 * its soft-dirty bit set.
 */
static inline bool pmd_needs_soft_dirty_wp(struct vm_area_struct *vma, pmd_t pmd)
{
        if (!vma_soft_dirty_enabled(vma))
                return false;

        return !pmd_soft_dirty(pmd);
}

/*
 * True when soft-dirty tracking is enabled for @vma and @pte does not have
 * its soft-dirty bit set.
 */
static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte)
{
        if (!vma_soft_dirty_enabled(vma))
                return false;

        return !pte_soft_dirty(pte);
}

void __meminit __init_single_page(struct page *page, unsigned long pfn,
                                unsigned long zone, int nid);
void __meminit __init_page_from_nid(unsigned long pfn, int nid);

/* shrinker related functions */
unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg,
                          int priority);

#ifdef CONFIG_SHRINKER_DEBUG
/*
 * Format a name from @fmt/@ap with kvasprintf_const(GFP_KERNEL) and store
 * it in @shrinker->name.
 *
 * Returns 0 on success, -ENOMEM if no name could be allocated.
 */
static inline __printf(2, 0) int shrinker_debugfs_name_alloc(
                        struct shrinker *shrinker, const char *fmt, va_list ap)
{
        shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
        if (!shrinker->name)
                return -ENOMEM;

        return 0;
}

/* Release @shrinker->name via kfree_const() and reset the pointer to NULL. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
        const char *name = shrinker->name;

        kfree_const(name);
        shrinker->name = NULL;
}

extern int shrinker_debugfs_add(struct shrinker *shrinker);
extern struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                              int *debugfs_id);
extern void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                    int debugfs_id);
#else /* CONFIG_SHRINKER_DEBUG */
/* !CONFIG_SHRINKER_DEBUG stub: does nothing and reports success. */
static inline int shrinker_debugfs_add(struct shrinker *shrinker)
{
        return 0;
}
/*
 * !CONFIG_SHRINKER_DEBUG stub: no name is allocated; @fmt and @ap are
 * ignored and success is reported.
 */
static inline int shrinker_debugfs_name_alloc(struct shrinker *shrinker,
                                              const char *fmt, va_list ap)
{
        return 0;
}
/* !CONFIG_SHRINKER_DEBUG stub: nothing to free. */
static inline void shrinker_debugfs_name_free(struct shrinker *shrinker)
{
}
/*
 * !CONFIG_SHRINKER_DEBUG stub: stores -1 in *@debugfs_id and returns NULL,
 * as there is no debugfs entry to detach.
 */
static inline struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker,
                                                     int *debugfs_id)
{
        *debugfs_id = -1;
        return NULL;
}
/* !CONFIG_SHRINKER_DEBUG stub: nothing to remove. */
static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
                                           int debugfs_id)
{
}
#endif /* CONFIG_SHRINKER_DEBUG */

/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
/*
 * For a @mapping that is neither DAX nor shmem, install
 * workingset_update_node() as the update callback of @xas and shadow_nodes
 * as its list_lru. For DAX and shmem mappings @xas is left untouched.
 * Note that @mapping is evaluated twice.
 */
#define mapping_set_update(xas, mapping) do {                        \
        if (!dax_mapping(mapping) && !shmem_mapping(mapping)) {        \
                xas_set_update(xas, workingset_update_node);        \
                xas_set_lru(xas, &shadow_nodes);                \
        }                                                        \
} while (0)

/* mremap.c */
unsigned long move_page_tables(struct pagetable_move_control *pmc);

#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
void unaccepted_cleanup_work(struct work_struct *work);
#else /* CONFIG_UNACCEPTED_MEMORY */
/* !CONFIG_UNACCEPTED_MEMORY stub: nothing to accept, @page is ignored. */
static inline void accept_page(struct page *page)
{
}
#endif /* CONFIG_UNACCEPTED_MEMORY */

/* pagewalk.c */
int walk_page_range_mm(struct mm_struct *mm, unsigned long start,
                unsigned long end, const struct mm_walk_ops *ops,
                void *private);

/* pt_reclaim.c */
bool try_get_and_clear_pmd(struct mm_struct *mm, pmd_t *pmd, pmd_t *pmdval);
void free_pte(struct mm_struct *mm, unsigned long addr, struct mmu_gather *tlb,
              pmd_t pmdval);
void try_to_free_pte(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
                     struct mmu_gather *tlb);

#ifdef CONFIG_PT_RECLAIM
bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
                           struct zap_details *details);
#else
/*
 * !CONFIG_PT_RECLAIM stub: always reports false; all arguments are ignored.
 */
static inline bool reclaim_pt_is_enabled(unsigned long start, unsigned long end,
                                         struct zap_details *details)
{
        return false;
}
#endif /* CONFIG_PT_RECLAIM */


#endif        /* __MM_INTERNAL_H */

open /syzkaller/managers/ci-qemu-native-arm64-kvm/kernel/security/selinux/flask.h: no such file or directory















































































































































































































































































































































































    3 




    3 


    3 





































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/dma-mapping.h>

#include <net/addrconf.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/rdma_netlink.h>
#include <linux/kthread.h>

#include "siw.h"
#include "siw_verbs.h"

MODULE_AUTHOR("Bernard Metzler");
MODULE_DESCRIPTION("Software iWARP Driver");
MODULE_LICENSE("Dual BSD/GPL");

/* transmit from user buffer, if possible */
const bool zcopy_tx = true;

/* Restrict usage of GSO, if hardware peer iwarp is unable to process
 * large packets. try_gso = true lets siw try to use local GSO,
 * if peer agrees.  Not using GSO severely limits siw maximum tx bandwidth.
 */
const bool try_gso;

/* Attach siw also with loopback devices */
const bool loopback_enabled = true;

/* We try to negotiate CRC on, if true */
const bool mpa_crc_required;

/* MPA CRC on/off enforced */
const bool mpa_crc_strict;

/* Control TCP_NODELAY socket option */
const bool siw_tcp_nagle;

/* Select MPA version to be used during connection setup */
u_char mpa_version = MPA_REVISION_2;

/* Selects MPA P2P mode (additional handshake during connection
 * setup), if true.
 */
const bool peer_to_peer;

struct task_struct *siw_tx_thread[NR_CPUS];

static int siw_device_register(struct siw_device *sdev, const char *name)
{
        struct ib_device *base_dev = &sdev->base_dev;
        static int dev_id = 1;
        int rv;

        sdev->vendor_part_id = dev_id++;

        rv = ib_register_device(base_dev, name, NULL);
        if (rv) {
                pr_warn("siw: device registration error %d\n", rv);
                return rv;
        }

        siw_dbg(base_dev, "HWaddr=%pM\n", sdev->raw_gid);
        return 0;
}

/* Destroy the QP and memory xarrays of the siw device behind @base_dev. */
static void siw_device_cleanup(struct ib_device *base_dev)
{
        struct siw_device *siw = to_siw_dev(base_dev);
        struct xarray *qps = &siw->qp_xa;
        struct xarray *mems = &siw->mem_xa;

        xa_destroy(qps);
        xa_destroy(mems);
}

/*
 * Decide whether siw may attach to @netdev, based on its ARP hardware
 * type. Returns 1 if the device qualifies, 0 otherwise.
 */
static int siw_dev_qualified(struct net_device *netdev)
{
        /*
         * Further hardware types (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) can
         * be added as extra cases - <linux/if_arp.h> lists the type
         * identifiers.
         */
        switch (netdev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_IEEE802:
        case ARPHRD_NONE:
                return 1;
        case ARPHRD_LOOPBACK:
                /* Loopback qualifies only if enabled. */
                return loopback_enabled ? 1 : 0;
        default:
                return 0;
        }
}

/*
 * Per-CPU usage counter: incremented by siw_get_tx_cpu() for the CPU it
 * selects and decremented again by siw_put_tx_cpu().
 */
static DEFINE_PER_CPU(atomic_t, siw_use_cnt);

/*
 * CPU topology cache built by siw_init_cpulist() and torn down by
 * siw_destroy_cpulist().
 */
static struct {
        /* Array of num_nodes cpumask pointers, indexed by NUMA node id. */
        struct cpumask **tx_valid_cpus;
        /* Number of entries in tx_valid_cpus; reset to 0 on init failure. */
        int num_nodes;
} siw_cpu_info;

/*
 * Free the first @number per-node cpumasks, then the pointer array itself,
 * and reset siw_cpu_info.tx_valid_cpus to NULL.
 */
static void siw_destroy_cpulist(int number)
{
        int node;

        for (node = 0; node < number; node++)
                kfree(siw_cpu_info.tx_valid_cpus[node]);

        kfree(siw_cpu_info.tx_valid_cpus);
        siw_cpu_info.tx_valid_cpus = NULL;
}

/*
 * Clear the TX thread table and build one cpumask per NUMA node id,
 * holding every possible CPU that belongs to that node.
 *
 * Returns 0 on success. On allocation failure everything allocated so far
 * is released, siw_cpu_info.num_nodes is reset to 0 and -ENOMEM is
 * returned.
 */
static int siw_init_cpulist(void)
{
        int nodes = nr_node_ids;
        int idx;

        memset(siw_tx_thread, 0, sizeof(siw_tx_thread));

        siw_cpu_info.num_nodes = nodes;
        siw_cpu_info.tx_valid_cpus =
                kcalloc(nodes, sizeof(struct cpumask *), GFP_KERNEL);
        if (!siw_cpu_info.tx_valid_cpus) {
                siw_cpu_info.num_nodes = 0;
                return -ENOMEM;
        }

        for (idx = 0; idx < siw_cpu_info.num_nodes; idx++) {
                struct cpumask *mask;

                mask = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
                siw_cpu_info.tx_valid_cpus[idx] = mask;
                if (!mask) {
                        /* Only the masks before @idx were allocated. */
                        siw_cpu_info.num_nodes = 0;
                        siw_destroy_cpulist(idx);
                        return -ENOMEM;
                }
                cpumask_clear(mask);
        }

        /* Sort every possible CPU into the mask of its node. */
        for_each_possible_cpu(idx)
                cpumask_set_cpu(idx, siw_cpu_info.tx_valid_cpus[cpu_to_node(idx)]);

        return 0;
}

/*
 * Pick the CPU with the fewest active QPs among the CPUs of the NUMA node
 * of the TX interface, and account one more user on it.
 *
 * Falls back to the online CPUs when the device has no node (node < 0) or
 * the node has no CPU. CPUs without a TX thread are never chosen.
 *
 * Returns the selected CPU, or -1 if none was found.
 */
int siw_get_tx_cpu(struct siw_device *sdev)
{
        int node = sdev->numa_node;
        const struct cpumask *mask;
        int best_cpu = -1, least_used, weight, cpu, seen;

        mask = (node < 0) ? cpu_online_mask : siw_cpu_info.tx_valid_cpus[node];

        weight = cpumask_weight(mask);
        if (!weight) {
                /* This NUMA node has no CPU at all. */
                mask = cpu_online_mask;
                weight = cpumask_weight(mask);
        }

        if (weight) {
                least_used = SIW_MAX_QP;
                cpu = cpumask_first(mask);

                for (seen = 0; seen < weight; seen++) {
                        /* Cores lacking a TX thread are not candidates. */
                        if (siw_tx_thread[cpu]) {
                                int load = atomic_read(&per_cpu(siw_use_cnt, cpu));

                                if (load <= least_used) {
                                        best_cpu = cpu;
                                        least_used = load;
                                }
                        }
                        cpu = cpumask_next(cpu, mask);
                }
                siw_dbg(&sdev->base_dev,
                        "tx cpu %d, node %d, %d qp's\n", best_cpu, node,
                        least_used);
        }

        if (best_cpu < 0) {
                pr_warn("siw: no tx cpu found\n");
                return best_cpu;
        }

        atomic_inc(&per_cpu(siw_use_cnt, best_cpu));
        return best_cpu;
}

void siw_put_tx_cpu(int cpu)
{
        atomic_dec(&per_cpu(siw_use_cnt, cpu));
}

/*
 * Look up the QP with number @id on @base_dev and return its ib_qp, or
 * NULL if there is no such QP.
 */
static struct ib_qp *siw_get_base_qp(struct ib_device *base_dev, int id)
{
        struct siw_qp *found = siw_qp_id2obj(to_siw_dev(base_dev), id);

        if (!found)
                return NULL;

        /* siw_qp_id2obj() took a reference on the object; drop it again. */
        siw_qp_put(found);

        return &found->base_qp;
}

/*
 * Verbs and iWARP connection-management callbacks of the siw driver,
 * installed on each device via ib_set_device_ops() in siw_device_create().
 * The INIT_RDMA_OBJ_SIZE() entries at the end tie each core object type to
 * the siw structure that embeds it.
 */
static const struct ib_device_ops siw_device_ops = {
        .owner = THIS_MODULE,
        .uverbs_abi_ver = SIW_ABI_VERSION,
        .driver_id = RDMA_DRIVER_SIW,

        .alloc_mr = siw_alloc_mr,
        .alloc_pd = siw_alloc_pd,
        .alloc_ucontext = siw_alloc_ucontext,
        .create_cq = siw_create_cq,
        .create_qp = siw_create_qp,
        .create_srq = siw_create_srq,
        .dealloc_driver = siw_device_cleanup,
        .dealloc_pd = siw_dealloc_pd,
        .dealloc_ucontext = siw_dealloc_ucontext,
        .dereg_mr = siw_dereg_mr,
        .destroy_cq = siw_destroy_cq,
        .destroy_qp = siw_destroy_qp,
        .destroy_srq = siw_destroy_srq,
        .get_dma_mr = siw_get_dma_mr,
        .get_port_immutable = siw_get_port_immutable,
        .iw_accept = siw_accept,
        .iw_add_ref = siw_qp_get_ref,
        .iw_connect = siw_connect,
        .iw_create_listen = siw_create_listen,
        .iw_destroy_listen = siw_destroy_listen,
        .iw_get_qp = siw_get_base_qp,
        .iw_reject = siw_reject,
        .iw_rem_ref = siw_qp_put_ref,
        .map_mr_sg = siw_map_mr_sg,
        .mmap = siw_mmap,
        .mmap_free = siw_mmap_free,
        .modify_qp = siw_verbs_modify_qp,
        .modify_srq = siw_modify_srq,
        .poll_cq = siw_poll_cq,
        .post_recv = siw_post_receive,
        .post_send = siw_post_send,
        .post_srq_recv = siw_post_srq_recv,
        .query_device = siw_query_device,
        .query_gid = siw_query_gid,
        .query_port = siw_query_port,
        .query_qp = siw_query_qp,
        .query_srq = siw_query_srq,
        .req_notify_cq = siw_req_notify_cq,
        .reg_user_mr = siw_reg_user_mr,

        INIT_RDMA_OBJ_SIZE(ib_cq, siw_cq, base_cq),
        INIT_RDMA_OBJ_SIZE(ib_pd, siw_pd, base_pd),
        INIT_RDMA_OBJ_SIZE(ib_qp, siw_qp, base_qp),
        INIT_RDMA_OBJ_SIZE(ib_srq, siw_srq, base_srq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, siw_ucontext, base_ucontext),
};

/*
 * Allocate and initialise (but do not register) a siw device on top of
 * @netdev.
 *
 * The raw GID is taken from the netdev hardware address (at most ETH_ALEN
 * bytes) or generated randomly if the netdev has none; the node GUID is
 * derived from it. The device gets the siw verbs ops, is bound to @netdev
 * as port 1, and has its attribute limits, lists, counters and lock set
 * up.
 *
 * Returns the new device, or NULL if allocation or binding the netdev
 * failed (in the latter case the device is deallocated again).
 */
static struct siw_device *siw_device_create(struct net_device *netdev)
{
        struct siw_device *sdev = NULL;
        struct ib_device *base_dev;
        int rv;

        sdev = ib_alloc_device(siw_device, base_dev);
        if (!sdev)
                return NULL;

        base_dev = &sdev->base_dev;

        if (netdev->addr_len) {
                /* Never copy more than ETH_ALEN bytes of the HW address. */
                memcpy(sdev->raw_gid, netdev->dev_addr,
                       min_t(unsigned int, netdev->addr_len, ETH_ALEN));
        } else {
                /*
                 * This device does not have a HW address, but
                 * connection management requires a unique gid.
                 */
                eth_random_addr(sdev->raw_gid);
        }
        addrconf_addr_eui48((u8 *)&base_dev->node_guid, sdev->raw_gid);

        base_dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND);

        base_dev->node_type = RDMA_NODE_RNIC;
        memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON,
               sizeof(SIW_NODE_DESC_COMMON));

        /*
         * Current model (one-to-one device association):
         * One Softiwarp device per net_device or, equivalently,
         * per physical port.
         */
        base_dev->phys_port_cnt = 1;
        base_dev->num_comp_vectors = num_possible_cpus();

        xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
        xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);

        ib_set_device_ops(base_dev, &siw_device_ops);
        /* Associate @netdev with port 1, the only port of this device. */
        rv = ib_device_set_netdev(base_dev, netdev, 1);
        if (rv)
                goto error;

        memcpy(base_dev->iw_ifname, netdev->name,
               sizeof(base_dev->iw_ifname));

        /* Disable TCP port mapping */
        base_dev->iw_driver_flags = IW_F_NO_PORT_MAP;

        /* Advertised resource limits, all taken from the SIW_MAX_* constants. */
        sdev->attrs.max_qp = SIW_MAX_QP;
        sdev->attrs.max_qp_wr = SIW_MAX_QP_WR;
        sdev->attrs.max_ord = SIW_MAX_ORD_QP;
        sdev->attrs.max_ird = SIW_MAX_IRD_QP;
        sdev->attrs.max_sge = SIW_MAX_SGE;
        sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD;
        sdev->attrs.max_cq = SIW_MAX_CQ;
        sdev->attrs.max_cqe = SIW_MAX_CQE;
        sdev->attrs.max_mr = SIW_MAX_MR;
        sdev->attrs.max_pd = SIW_MAX_PD;
        sdev->attrs.max_mw = SIW_MAX_MW;
        sdev->attrs.max_srq = SIW_MAX_SRQ;
        sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
        sdev->attrs.max_srq_sge = SIW_MAX_SGE;

        INIT_LIST_HEAD(&sdev->cep_list);
        INIT_LIST_HEAD(&sdev->qp_list);

        /* All resource usage counters start at zero. */
        atomic_set(&sdev->num_ctx, 0);
        atomic_set(&sdev->num_srq, 0);
        atomic_set(&sdev->num_qp, 0);
        atomic_set(&sdev->num_cq, 0);
        atomic_set(&sdev->num_mr, 0);
        atomic_set(&sdev->num_pd, 0);

        /* Remembered for the TX CPU selection in siw_get_tx_cpu(). */
        sdev->numa_node = dev_to_node(&netdev->dev);
        spin_lock_init(&sdev->lock);

        return sdev;
error:
        ib_dealloc_device(base_dev);

        return NULL;
}

/*
 * Netdevice notifier callback. Events for netdevs without an attached siw
 * device are ignored; for the others, unregistration queues removal of the
 * siw device and an address change raises IB_EVENT_LID_CHANGE on port 1.
 *
 * Always returns NOTIFY_OK.
 */
static int siw_netdev_event(struct notifier_block *nb, unsigned long event,
                            void *arg)
{
        struct net_device *ndev = netdev_notifier_info_to_dev(arg);
        struct ib_device *ibdev;
        struct siw_device *siw;

        dev_dbg(&ndev->dev, "siw: event %lu\n", event);

        /* Takes a reference on the ib_device, released below. */
        ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_SIW);
        if (!ibdev)
                return NOTIFY_OK;

        siw = to_siw_dev(ibdev);

        if (event == NETDEV_REGISTER) {
                /*
                 * Devices are registered through rdma netlink commands
                 * only, so a valid siw device should never be seen here.
                 */
                siw_dbg(ibdev, "unexpected NETDEV_REGISTER event\n");
        } else if (event == NETDEV_UNREGISTER) {
                ib_unregister_device_queued(&siw->base_dev);
        } else if (event == NETDEV_CHANGEADDR) {
                siw_port_event(siw, 1, IB_EVENT_LID_CHANGE);
        }
        /* Any other event is deliberately left unhandled. */

        ib_device_put(&siw->base_dev);

        return NOTIFY_OK;
}

/*
 * Netdevice notifier block; registered in siw_init_module() and removed
 * again in siw_exit_module().
 */
static struct notifier_block siw_netdev_nb = {
        .notifier_call = siw_netdev_event,
};

/*
 * rdma link "newlink" handler: create a siw device named @basedev_name on
 * top of @netdev and register it.
 *
 * Returns 0 on success, -EINVAL if the netdev type does not qualify,
 * -EEXIST if a siw device is already attached to @netdev, -ENOMEM if the
 * device could not be created, or the registration error.
 */
static int siw_newlink(const char *basedev_name, struct net_device *netdev)
{
        struct ib_device *existing;
        struct siw_device *sdev;
        int rv;

        if (!siw_dev_qualified(netdev))
                return -EINVAL;

        existing = ib_device_get_by_netdev(netdev, RDMA_DRIVER_SIW);
        if (existing) {
                /* Only probing for presence; drop the reference again. */
                ib_device_put(existing);
                return -EEXIST;
        }

        sdev = siw_device_create(netdev);
        if (!sdev)
                return -ENOMEM;

        dev_dbg(&netdev->dev, "siw: new device\n");
        ib_mark_name_assigned_by_user(&sdev->base_dev);

        rv = siw_device_register(sdev, basedev_name);
        if (rv)
                ib_dealloc_device(&sdev->base_dev);

        return rv;
}

/*
 * rdma link operations for link type "siw"; new links are created by
 * siw_newlink().
 */
static struct rdma_link_ops siw_link_ops = {
        .type = "siw",
        .newlink = siw_newlink,
};

/*
 * siw_init_module - Initialize Softiwarp module and register with netdev
 *                   subsystem.
 *
 * Steps, in order: check SENDPAGE_THRESH against SIW_MAX_INLINE, build the
 * per-node CPU masks, initialise the connection manager, start the TX
 * threads, register the netdevice notifier and finally the "siw" rdma
 * link ops.
 *
 * Returns 0 on success or a negative errno. Every failure funnels through
 * the single out_error label, which runs the complete teardown no matter
 * how far initialisation got.
 * NOTE(review): this relies on the teardown helpers being safe to call
 * when their init counterpart never ran -- confirm.
 */
static __init int siw_init_module(void)
{
        int rv;

        if (SENDPAGE_THRESH < SIW_MAX_INLINE) {
                /* %u takes an unsigned argument, so cast accordingly. */
                pr_info("siw: sendpage threshold too small: %u\n",
                        (unsigned int)SENDPAGE_THRESH);
                rv = -EINVAL;
                goto out_error;
        }
        rv = siw_init_cpulist();
        if (rv)
                goto out_error;

        rv = siw_cm_init();
        if (rv)
                goto out_error;

        /* A zero result means that not a single TX thread was started. */
        if (!siw_create_tx_threads()) {
                pr_info("siw: Could not start any TX thread\n");
                rv = -ENOMEM;
                goto out_error;
        }

        rv = register_netdevice_notifier(&siw_netdev_nb);
        if (rv)
                goto out_error;

        rdma_link_register(&siw_link_ops);

        pr_info("SoftiWARP attached\n");
        return 0;

out_error:
        siw_stop_tx_threads();

        pr_info("SoftIWARP attach failed. Error: %d\n", rv);

        siw_cm_exit();
        siw_destroy_cpulist(siw_cpu_info.num_nodes);

        return rv;
}

/*
 * Module exit: stop the TX threads, remove the netdevice notifier and the
 * rdma link ops, unregister all siw devices, shut down the connection
 * manager and free the per-node CPU masks.
 */
static void __exit siw_exit_module(void)
{
        siw_stop_tx_threads();

        unregister_netdevice_notifier(&siw_netdev_nb);
        rdma_link_unregister(&siw_link_ops);
        /* Unregisters every device that belongs to the siw driver id. */
        ib_unregister_driver(RDMA_DRIVER_SIW);

        siw_cm_exit();

        siw_destroy_cpulist(siw_cpu_info.num_nodes);

        pr_info("SoftiWARP detached\n");
}

module_init(siw_init_module);
module_exit(siw_exit_module);

MODULE_ALIAS_RDMA_LINK("siw");












































































































































































































































































































































































































































































































































































































































































































































































































































  317 



  318 




















  352 



  351 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/userfaultfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *  Copyright (C) 2008-2009 Red Hat, Inc.
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 *  Some part derived from fs/eventfd.c (anon inode setup) and
 *  mm/ksm.c (mm hashing).
 */

#include <linux/list.h>
#include <linux/hashtable.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/file.h>
#include <linux/bug.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/userfaultfd_k.h>
#include <linux/mempolicy.h>
#include <linux/ioctl.h>
#include <linux/security.h>
#include <linux/hugetlb.h>
#include <linux/swapops.h>
#include <linux/miscdevice.h>
#include <linux/uio.h>

/*
 * Backing storage for the "unprivileged_userfaultfd" sysctl entry that
 * vm_userfaultfd_table declares when CONFIG_SYSCTL is enabled.  Being a
 * static int without initializer, it starts out as 0.
 */
static int sysctl_unprivileged_userfaultfd __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * Sysctl table exposing sysctl_unprivileged_userfaultfd under the name
 * "unprivileged_userfaultfd" with mode 0644.  The handler is
 * proc_dointvec_minmax with extra1/extra2 set to SYSCTL_ZERO and
 * SYSCTL_ONE as the lower and upper bounds.
 */
static const struct ctl_table vm_userfaultfd_table[] = {
        {
                .procname        = "unprivileged_userfaultfd",
                .data                = &sysctl_unprivileged_userfaultfd,
                .maxlen                = sizeof(sysctl_unprivileged_userfaultfd),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};
#endif

/*
 * Slab cache for struct userfaultfd_ctx objects: contexts are obtained
 * from it with kmem_cache_alloc() and given back with kmem_cache_free().
 * NOTE(review): the code that creates the cache is not in this part of
 * the file.
 */
static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;

/*
 * One entry per userfaultfd context encountered while duplicating vmas on
 * fork; entries are created and looked up by dup_userfaultfd().
 */
struct userfaultfd_fork_ctx {
        /* context found on the vma being duplicated */
        struct userfaultfd_ctx *orig;
        /* newly allocated context that replaces @orig on the vma */
        struct userfaultfd_ctx *new;
        /* link in the list of fork contexts passed to dup_userfaultfd() */
        struct list_head list;
};

/*
 * Associates a userfaultfd context with an address range and a list link.
 * NOTE(review): the users of this struct are outside this part of the
 * file; presumably start/end delimit a range that is being unmapped -
 * confirm against those users.
 */
struct userfaultfd_unmap_ctx {
        struct userfaultfd_ctx *ctx;
        unsigned long start;
        unsigned long end;
        struct list_head list;
};

/*
 * Bookkeeping for one task waiting on a userfaultfd context.  It is queued
 * on one of the context's wait queue heads through @wq.
 */
struct userfaultfd_wait_queue {
        /* message describing the page fault or event being waited on */
        struct uffd_msg msg;
        /* entry linked into the context's wait queue head */
        wait_queue_entry_t wq;
        /* context the waiter belongs to */
        struct userfaultfd_ctx *ctx;
        /* set to true by userfaultfd_wake_function() before the wakeup */
        bool waken;
};

/*
 * Wake key consumed by userfaultfd_wake_function(): a waiter matches when
 * its fault address lies in [start, start + len).  len == 0 matches every
 * waiter.
 */
struct userfaultfd_wake_range {
        unsigned long start;
        unsigned long len;
};

/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED                (1u << 31)

/* True when the internal UFFD_FEATURE_INITIALIZED bit is set in @ctx. */
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
        return (ctx->features & UFFD_FEATURE_INITIALIZED) != 0;
}

/* True when @ctx is non-NULL and has UFFD_FEATURE_WP_ASYNC enabled. */
static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
{
        if (!ctx)
                return false;

        return (ctx->features & UFFD_FEATURE_WP_ASYNC) != 0;
}

/*
 * Tell whether the uffd context attached to @vma, if any, has
 * UFFD_FEATURE_WP_UNPOPULATED enabled.  The answer is only meaningful for
 * an anonymous vma on which userfaultfd_wp() is true.
 */
bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
{
        struct userfaultfd_ctx *uffd_ctx = vma->vm_userfaultfd_ctx.ctx;

        /* No context on the vma means the feature cannot be enabled. */
        return uffd_ctx &&
               (uffd_ctx->features & UFFD_FEATURE_WP_UNPOPULATED);
}

/*
 * Wake callback installed on fault waiters by handle_userfault().
 *
 * @wq:         wait queue entry embedded in a struct userfaultfd_wait_queue
 * @mode:       task state mask handed to wake_up_state()
 * @wake_flags: not used by this callback
 * @key:        pointer to a struct userfaultfd_wake_range
 *
 * A waiter is skipped (return 0) when the range length is non-zero and the
 * waiter's fault address is outside [start, start + len).  Otherwise the
 * waiter is flagged as waken and its task is woken up.  The return value
 * is the one of wake_up_state(); the entry is unlinked from its queue only
 * when that value is non-zero.
 */
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
                                     int wake_flags, void *key)
{
        struct userfaultfd_wake_range *range = key;
        int ret;
        struct userfaultfd_wait_queue *uwq;
        unsigned long start, len;

        uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
        ret = 0;
        /* len == 0 means wake all */
        start = range->start;
        len = range->len;
        /* Skip waiters whose fault address is outside the requested range. */
        if (len && (start > uwq->msg.arg.pagefault.address ||
                    start + len <= uwq->msg.arg.pagefault.address))
                goto out;
        WRITE_ONCE(uwq->waken, true);
        /*
         * The Program-Order guarantees provided by the scheduler
         * ensure uwq->waken is visible before the task is woken.
         */
        ret = wake_up_state(wq->private, mode);
        if (ret) {
                /*
                 * Wake only once, autoremove behavior.
                 *
                 * After the effect of list_del_init is visible to the other
                 * CPUs, the waitqueue may disappear from under us, see the
                 * !list_empty_careful() in handle_userfault().
                 *
                 * try_to_wake_up() has an implicit smp_mb(), and the
                 * wq->private is read before calling the extern function
                 * "wake_up_state" (which in turns calls try_to_wake_up).
                 */
                list_del_init(&wq->entry);
        }
out:
        return ret;
}

/**
 * userfaultfd_ctx_get - Acquires a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to the userfaultfd context.
 *
 * Increments @ctx->refcount.  The reference is dropped again with
 * userfaultfd_ctx_put().
 */
static void userfaultfd_ctx_get(struct userfaultfd_ctx *ctx)
{
        refcount_inc(&ctx->refcount);
}

/**
 * userfaultfd_ctx_put - Drops a reference to the internal userfaultfd
 * context.
 * @ctx: [in] Pointer to userfaultfd context.
 *
 * The caller must own a reference on @ctx.  When the last reference is
 * dropped, the four wait queue heads of the context are asserted to be
 * unlocked and without waiters, the reference on @ctx->mm is dropped with
 * mmdrop() and @ctx is freed back to userfaultfd_ctx_cachep.
 */
static void userfaultfd_ctx_put(struct userfaultfd_ctx *ctx)
{
        /* Other references remain: nothing to tear down yet. */
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        /* Nobody may still be using any of the wait queue heads. */
        VM_BUG_ON(spin_is_locked(&ctx->fault_pending_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fault_pending_wqh));

        VM_BUG_ON(spin_is_locked(&ctx->fault_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fault_wqh));

        VM_BUG_ON(spin_is_locked(&ctx->event_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->event_wqh));

        VM_BUG_ON(spin_is_locked(&ctx->fd_wqh.lock));
        VM_BUG_ON(waitqueue_active(&ctx->fd_wqh));

        mmdrop(ctx->mm);
        kmem_cache_free(userfaultfd_ctx_cachep, ctx);
}

/* Clear every byte of @msg, padding included. */
static inline void msg_init(struct uffd_msg *msg)
{
        /* struct uffd_msg is required to be exactly 32 bytes. */
        BUILD_BUG_ON(sizeof(*msg) != 32);

        /*
         * memset is mandatory here: it also zeroes the paddings, which
         * would otherwise leak kernel data to userland.
         */
        memset(msg, 0, sizeof(*msg));
}

/*
 * Build the UFFD_EVENT_PAGEFAULT message for a fault.
 *
 * @address:      fault address reported by default
 * @real_address: address reported instead when @features contains
 *                UFFD_FEATURE_EXACT_ADDRESS
 * @flags:        fault flags; only FAULT_FLAG_WRITE is looked at
 * @reason:       VM_UFFD_* reason of the userfault
 * @features:     features enabled on the userfaultfd context
 *
 * Returns the fully initialized message by value.
 */
static inline struct uffd_msg userfault_msg(unsigned long address,
                                            unsigned long real_address,
                                            unsigned int flags,
                                            unsigned long reason,
                                            unsigned int features)
{
        struct uffd_msg msg;

        /* Start from an all-zero message so no padding is left unset. */
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;

        if (features & UFFD_FEATURE_EXACT_ADDRESS)
                msg.arg.pagefault.address = real_address;
        else
                msg.arg.pagefault.address = address;

        /* Access type: a write fault when set, a read fault otherwise. */
        if (flags & FAULT_FLAG_WRITE)
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;

        /*
         * Why the userfault occurred: WP means a write protect fault,
         * MINOR means a minor fault, and neither of the two means a
         * MISSING fault.
         */
        if (reason & VM_UFFD_WP)
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
        if (reason & VM_UFFD_MINOR)
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;

        /* Report the faulting thread only when that was asked for. */
        if (features & UFFD_FEATURE_THREAD_ID)
                msg.arg.pagefault.feat.ptid = task_pid_vnr(current);

        return msg;
}

#ifdef CONFIG_HUGETLB_PAGE
/*
 * hugetlb counterpart of userfaultfd_must_wait(): decide whether the
 * faulting task still has to wait, looking at the huge pte that covers
 * vmf->address.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
                                              struct vm_fault *vmf,
                                              unsigned long reason)
{
        struct vm_area_struct *vma = vmf->vma;
        pte_t *ptep, pte;

        assert_fault_locked(vmf);

        /* No page table entry for the address at all: must wait. */
        ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma));
        if (!ptep)
                return true;

        pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep);

        /*
         * Lockless access: we're in a wait_event so it's ok if it
         * changes under us.  PTE markers are treated like none ptes.
         */
        if (huge_pte_none_mostly(pte))
                return true;

        /* A populated entry only needs a wait for an unresolved WP fault. */
        return !huge_pte_write(pte) && (reason & VM_UFFD_WP);
}
#else
/*
 * Stub used when CONFIG_HUGETLB_PAGE is not set: it ignores its arguments
 * and always reports that no wait is needed.
 */
static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
                                              struct vm_fault *vmf,
                                              unsigned long reason)
{
        return false;        /* should never get here */
}
#endif /* CONFIG_HUGETLB_PAGE */

/*
 * Verify the pagetables are still not ok after having registered into
 * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any
 * userfault that has already been resolved, if userfaultfd_read_iter and
 * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different
 * threads.
 */
static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
                                         struct vm_fault *vmf,
                                         unsigned long reason)
{
        struct mm_struct *mm = ctx->mm;
        unsigned long address = vmf->address;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd, _pmd;
        pte_t *pte;
        pte_t ptent;
        /* Returned value: true means the faulting task still has to wait. */
        bool ret = true;

        assert_fault_locked(vmf);

        /*
         * Walk the page tables of ctx->mm down to the pmd covering the
         * faulting address.  A level that is not present leaves ret set
         * to true.
         */
        pgd = pgd_offset(mm, address);
        if (!pgd_present(*pgd))
                goto out;
        p4d = p4d_offset(pgd, address);
        if (!p4d_present(*p4d))
                goto out;
        pud = pud_offset(p4d, address);
        if (!pud_present(*pud))
                goto out;
        pmd = pmd_offset(pud, address);
again:
        /* Work on a snapshot of the pmd taken without holding a lock. */
        _pmd = pmdp_get_lockless(pmd);
        if (pmd_none(_pmd))
                goto out;

        /* The pmd is populated: only the checks below can ask for a wait. */
        ret = false;
        if (!pmd_present(_pmd) || pmd_devmap(_pmd))
                goto out;

        if (pmd_trans_huge(_pmd)) {
                /* Huge pmd: wait only for a WP fault on a non-writable pmd. */
                if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
                        ret = true;
                goto out;
        }

        pte = pte_offset_map(pmd, address);
        if (!pte) {
                /*
                 * The pte table could not be mapped: take a new pmd
                 * snapshot and redo the checks, with ret back to true in
                 * case the pmd now reads as none.
                 */
                ret = true;
                goto again;
        }
        /*
         * Lockless access: we're in a wait_event so it's ok if it
         * changes under us.  PTE markers should be handled the same as none
         * ptes here.
         */
        ptent = ptep_get(pte);
        if (pte_none_mostly(ptent))
                ret = true;
        if (!pte_write(ptent) && (reason & VM_UFFD_WP))
                ret = true;
        pte_unmap(pte);

out:
        return ret;
}

/*
 * Map fault @flags to the task state used while waiting for the userfault:
 * FAULT_FLAG_INTERRUPTIBLE takes precedence over FAULT_FLAG_KILLABLE, and
 * TASK_UNINTERRUPTIBLE is used when neither flag is set.
 */
static inline unsigned int userfaultfd_get_blocking_state(unsigned int flags)
{
        unsigned int state = TASK_UNINTERRUPTIBLE;

        if (flags & FAULT_FLAG_INTERRUPTIBLE)
                state = TASK_INTERRUPTIBLE;
        else if (flags & FAULT_FLAG_KILLABLE)
                state = TASK_KILLABLE;

        return state;
}

/*
 * handle_userfault - hand a page fault over to userfaultfd and wait for it
 * @vmf:    fault descriptor; the context is taken from vmf->vma
 * @reason: why the fault is a userfault; exactly one __VM_UFFD_FLAGS bit
 *
 * Returns VM_FAULT_SIGBUS when the fault is not handed over: the task is
 * exiting or dumping core, the vma has no context, UFFD_FEATURE_SIGBUS is
 * enabled, the fault is not a user fault on a UFFD_USER_MODE_ONLY context,
 * or FAULT_FLAG_ALLOW_RETRY is missing.  Returns VM_FAULT_RETRY in every
 * other case.
 *
 * The locking rules involved in returning VM_FAULT_RETRY depending on
 * FAULT_FLAG_ALLOW_RETRY, FAULT_FLAG_RETRY_NOWAIT and
 * FAULT_FLAG_KILLABLE are not straightforward. The "Caution"
 * recommendation in __lock_page_or_retry is not an understatement.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set, the mmap_lock must be released
 * before returning VM_FAULT_RETRY only if FAULT_FLAG_RETRY_NOWAIT is
 * not set.
 *
 * If FAULT_FLAG_ALLOW_RETRY is set but FAULT_FLAG_KILLABLE is not
 * set, VM_FAULT_RETRY can still be returned if and only if there are
 * fatal_signal_pending()s, and the mmap_lock must be released before
 * returning it.
 */
vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue uwq;
        vm_fault_t ret = VM_FAULT_SIGBUS;
        bool must_wait;
        unsigned int blocking_state;

        /*
         * We don't do userfault handling for the final child pid update
         * and when coredumping (faults triggered by get_dump_page()).
         */
        if (current->flags & (PF_EXITING|PF_DUMPCORE))
                goto out;

        assert_fault_locked(vmf);

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx)
                goto out;

        BUG_ON(ctx->mm != mm);

        /* Any unrecognized flag is a bug. */
        VM_BUG_ON(reason & ~__VM_UFFD_FLAGS);
        /* 0 or > 1 flags set is a bug; we expect exactly 1. */
        VM_BUG_ON(!reason || (reason & (reason - 1)));

        if (ctx->features & UFFD_FEATURE_SIGBUS)
                goto out;
        if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
                goto out;

        /*
         * Check that we can return VM_FAULT_RETRY.
         *
         * NOTE: it should become possible to return VM_FAULT_RETRY
         * even if FAULT_FLAG_TRIED is set without leading to gup()
         * -EBUSY failures, if the userfaultfd is to be extended for
         * VM_UFFD_WP tracking and we intend to arm the userfault
         * without first stopping userland access to the memory. For
         * VM_UFFD_MISSING userfaults this is enough for now.
         */
        if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) {
                /*
                 * Validate the invariant that nowait must allow retry
                 * to be sure not to return SIGBUS erroneously on
                 * nowait invocations.
                 */
                BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT);
#ifdef CONFIG_DEBUG_VM
                if (printk_ratelimit()) {
                        printk(KERN_WARNING
                               "FAULT_FLAG_ALLOW_RETRY missing %x\n",
                               vmf->flags);
                        dump_stack();
                }
#endif
                goto out;
        }

        /*
         * Handle nowait, not much to do other than tell it to retry
         * and wait.
         */
        ret = VM_FAULT_RETRY;
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                goto out;

        if (unlikely(READ_ONCE(ctx->released))) {
                /*
                 * If a concurrent release is detected, do not return
                 * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always
                 * return VM_FAULT_RETRY with lock released proactively.
                 *
                 * If we were to return VM_FAULT_SIGBUS here, the non
                 * cooperative manager would be instead forced to
                 * always call UFFDIO_UNREGISTER before it can safely
                 * close the uffd, to avoid involuntary SIGBUS triggered.
                 *
                 * If we were to return VM_FAULT_NOPAGE, it would work for
                 * the fault path, in which the lock will be released
                 * later.  However for GUP, faultin_page() does nothing
                 * special on NOPAGE, so GUP would spin retrying without
                 * releasing the mmap read lock, causing possible livelock.
                 *
                 * Here only VM_FAULT_RETRY would make sure the mmap lock
                 * be released immediately, so that the thread concurrently
                 * releasing the userfault would always make progress.
                 */
                release_fault_lock(vmf);
                goto out;
        }

        /* take the reference before dropping the mmap_lock */
        userfaultfd_ctx_get(ctx);

        /*
         * Prepare the on-stack waiter: wakeups go through
         * userfaultfd_wake_function() and target the current task.
         */
        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
        uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags,
                                reason, ctx->features);
        uwq.ctx = ctx;
        uwq.waken = false;

        blocking_state = userfaultfd_get_blocking_state(vmf->flags);

        /*
         * Take the vma lock now, in order to safely call
         * userfaultfd_huge_must_wait() later. Since acquiring the
         * (sleepable) vma lock can modify the current task state, that
         * must be before explicitly calling set_current_state().
         */
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_lock_read(vma);

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->fault_pending_wqh, &uwq.wq);
        /*
         * The smp_mb() after __set_current_state prevents the reads
         * following the spin_unlock to happen before the list_add in
         * __add_wait_queue.
         */
        set_current_state(blocking_state);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /*
         * Now that the waiter is queued, check the page tables again to
         * find out whether the fault still needs to wait.
         */
        if (!is_vm_hugetlb_page(vma))
                must_wait = userfaultfd_must_wait(ctx, vmf, reason);
        else
                must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason);
        if (is_vm_hugetlb_page(vma))
                hugetlb_vma_unlock_read(vma);
        release_fault_lock(vmf);

        /*
         * Notify the waiters on fd_wqh with EPOLLIN, then sleep in the
         * task state set above until woken up.
         */
        if (likely(must_wait && !READ_ONCE(ctx->released))) {
                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();
        }

        __set_current_state(TASK_RUNNING);

        /*
         * Here we race with the list_del; list_add in
         * userfaultfd_ctx_read(), however because we don't ever run
         * list_del_init() to refile across the two lists, the prev
         * and next pointers will never point to self. list_add also
         * would never let any of the two pointers to point to
         * self. So list_empty_careful won't risk to see both pointers
         * pointing to self at any time during the list refile. The
         * only case where list_del_init() is called is the full
         * removal in the wake function and there we don't re-list_add
         * and it's fine not to block on the spinlock. The uwq on this
         * kernel stack can be released after the list_del_init.
         */
        if (!list_empty_careful(&uwq.wq.entry)) {
                spin_lock_irq(&ctx->fault_pending_wqh.lock);
                /*
                 * No need of list_del_init(), the uwq on the stack
                 * will be freed shortly anyway.
                 */
                list_del(&uwq.wq.entry);
                spin_unlock_irq(&ctx->fault_pending_wqh.lock);
        }

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
        userfaultfd_ctx_put(ctx);

out:
        return ret;
}

/*
 * Queue @ewq, whose message was filled in by the caller, on
 * @ctx->event_wqh and sleep in TASK_KILLABLE state until one of these
 * happens:
 *  - the event is completed, i.e. ewq->msg.event has been reset to 0 (see
 *    userfaultfd_event_complete());
 *  - @ctx is released or a fatal signal is pending, in which case @ewq is
 *    removed from the queue here.
 *
 * When a UFFD_EVENT_FORK wait is aborted, the new context carried in the
 * message is released and its reference is dropped.
 *
 * On every path, including the PF_EXITING early exit, @ctx->mmap_changing
 * is decremented and one reference on @ctx is dropped before returning.
 */
static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
                                              struct userfaultfd_wait_queue *ewq)
{
        struct userfaultfd_ctx *release_new_ctx;

        if (WARN_ON_ONCE(current->flags & PF_EXITING))
                goto out;

        ewq->ctx = ctx;
        init_waitqueue_entry(&ewq->wq, current);
        release_new_ctx = NULL;

        spin_lock_irq(&ctx->event_wqh.lock);
        /*
         * After the __add_wait_queue the uwq is visible to userland
         * through poll/read().
         */
        __add_wait_queue(&ctx->event_wqh, &ewq->wq);
        for (;;) {
                set_current_state(TASK_KILLABLE);
                /* The event was completed: the waiter is done. */
                if (ewq->msg.event == 0)
                        break;
                if (READ_ONCE(ctx->released) ||
                    fatal_signal_pending(current)) {
                        /*
                         * &ewq->wq may be queued in fork_event, but
                         * __remove_wait_queue ignores the head
                         * parameter. It would be a problem if it
                         * didn't.
                         */
                        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
                        if (ewq->msg.event == UFFD_EVENT_FORK) {
                                struct userfaultfd_ctx *new;

                                /* reserved1 carries the new context pointer */
                                new = (struct userfaultfd_ctx *)
                                        (unsigned long)
                                        ewq->msg.arg.reserved.reserved1;
                                release_new_ctx = new;
                        }
                        break;
                }

                /* Drop the lock while notifying fd_wqh waiters and sleeping. */
                spin_unlock_irq(&ctx->event_wqh.lock);

                wake_up_poll(&ctx->fd_wqh, EPOLLIN);
                schedule();

                spin_lock_irq(&ctx->event_wqh.lock);
        }
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&ctx->event_wqh.lock);

        if (release_new_ctx) {
                userfaultfd_release_new(release_new_ctx);
                userfaultfd_ctx_put(release_new_ctx);
        }

        /*
         * ctx may go away after this if the userfault pseudo fd is
         * already released.
         */
out:
        atomic_dec(&ctx->mmap_changing);
        VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0);
        userfaultfd_ctx_put(ctx);
}

/*
 * Mark the event carried by @ewq as completed by resetting msg.event to 0,
 * wake up the waiters on @ctx->event_wqh and unlink @ewq from that queue.
 *
 * NOTE(review): wake_up_locked() and __remove_wait_queue() do not take the
 * wait queue lock themselves, so the caller presumably holds
 * ctx->event_wqh.lock - confirm against the callers.
 */
static void userfaultfd_event_complete(struct userfaultfd_ctx *ctx,
                                       struct userfaultfd_wait_queue *ewq)
{
        ewq->msg.event = 0;
        wake_up_locked(&ctx->event_wqh);
        __remove_wait_queue(&ctx->event_wqh, &ewq->wq);
}

int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
{
        struct userfaultfd_ctx *ctx = NULL, *octx;
        struct userfaultfd_fork_ctx *fctx;

        octx = vma->vm_userfaultfd_ctx.ctx;
        if (!octx)
                return 0;

        if (!(octx->features & UFFD_FEATURE_EVENT_FORK)) {
                userfaultfd_reset_ctx(vma);
                return 0;
        }

        list_for_each_entry(fctx, fcs, list)
                if (fctx->orig == octx) {
                        ctx = fctx->new;
                        break;
                }

        if (!ctx) {
                fctx = kmalloc(sizeof(*fctx), GFP_KERNEL);
                if (!fctx)
                        return -ENOMEM;

                ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
                if (!ctx) {
                        kfree(fctx);
                        return -ENOMEM;
                }

                refcount_set(&ctx->refcount, 1);
                ctx->flags = octx->flags;
                ctx->features = octx->features;
                ctx->released = false;
                init_rwsem(&ctx->map_changing_lock);
                atomic_set(&ctx->mmap_changing, 0);
                ctx->mm = vma->vm_mm;
                mmgrab(ctx->mm);

                userfaultfd_ctx_get(octx);
                down_write(&octx->map_changing_lock);
                atomic_inc(&octx->mmap_changing);
                up_write(&octx->map_changing_lock);
                fctx->orig = octx;
                fctx->new = ctx;
                list_add_tail(&fctx->list, fcs);
        }

        vma->vm_userfaultfd_ctx.ctx = ctx;
        return 0;
}

static void dup_fctx(struct userfaultfd_fork_ctx *fctx)
{
        struct userfaultfd_ctx *ctx = fctx->orig;
        struct userfaultfd_wait_queue ewq;

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_FORK;
        ewq.msg.arg.reserved.reserved1 = (unsigned long)fctx->new;

        userfaultfd_event_wait_completion(ctx, &ewq);
}

void dup_userfaultfd_complete(struct list_head *fcs)
{
        struct userfaultfd_fork_ctx *fctx, *n;

        list_for_each_entry_safe(fctx, n, fcs, list) {
                dup_fctx(fctx);
                list_del(&fctx->list);
                kfree(fctx);
        }
}

void dup_userfaultfd_fail(struct list_head *fcs)
{
        struct userfaultfd_fork_ctx *fctx, *n;

        /*
         * An error has occurred on fork, we will tear memory down, but have
         * allocated memory for fctx's and raised reference counts for both the
         * original and child contexts (and on the mm for each as a result).
         *
         * These would ordinarily be taken care of by a user handling the event,
         * but we are no longer doing so, so manually clean up here.
         *
         * mm tear down will take care of cleaning up VMA contexts.
         */
        list_for_each_entry_safe(fctx, n, fcs, list) {
                struct userfaultfd_ctx *octx = fctx->orig;
                struct userfaultfd_ctx *ctx = fctx->new;

                atomic_dec(&octx->mmap_changing);
                VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
                userfaultfd_ctx_put(octx);
                userfaultfd_ctx_put(ctx);

                list_del(&fctx->list);
                kfree(fctx);
        }
}

/*
 * Prepare remap-event delivery for @vma.
 *
 * If the VMA's context has UFFD_FEATURE_EVENT_REMAP enabled, the context is
 * stored in @vm_ctx, a reference is taken on it and its mmap_changing counter
 * is raised. Without the feature the VMA's uffd context is dropped instead.
 * A VMA without a context is left untouched.
 */
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
                             struct vm_userfaultfd_ctx *vm_ctx)
{
        struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;

        if (!uffd)
                return;

        if (!(uffd->features & UFFD_FEATURE_EVENT_REMAP)) {
                /* Remap events not requested: drop the uffd context. */
                userfaultfd_reset_ctx(vma);
                return;
        }

        vm_ctx->ctx = uffd;
        userfaultfd_ctx_get(uffd);

        down_write(&uffd->map_changing_lock);
        atomic_inc(&uffd->mmap_changing);
        up_write(&uffd->map_changing_lock);
}

void mremap_userfaultfd_complete(struct vm_userfaultfd_ctx *vm_ctx,
                                 unsigned long from, unsigned long to,
                                 unsigned long len)
{
        struct userfaultfd_ctx *ctx = vm_ctx->ctx;
        struct userfaultfd_wait_queue ewq;

        if (!ctx)
                return;

        if (to & ~PAGE_MASK) {
                userfaultfd_ctx_put(ctx);
                return;
        }

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_REMAP;
        ewq.msg.arg.remap.from = from;
        ewq.msg.arg.remap.to = to;
        ewq.msg.arg.remap.len = len;

        userfaultfd_event_wait_completion(ctx, &ewq);
}

bool userfaultfd_remove(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct userfaultfd_ctx *ctx;
        struct userfaultfd_wait_queue ewq;

        ctx = vma->vm_userfaultfd_ctx.ctx;
        if (!ctx || !(ctx->features & UFFD_FEATURE_EVENT_REMOVE))
                return true;

        userfaultfd_ctx_get(ctx);
        down_write(&ctx->map_changing_lock);
        atomic_inc(&ctx->mmap_changing);
        up_write(&ctx->map_changing_lock);
        mmap_read_unlock(mm);

        msg_init(&ewq.msg);

        ewq.msg.event = UFFD_EVENT_REMOVE;
        ewq.msg.arg.remove.start = start;
        ewq.msg.arg.remove.end = end;

        userfaultfd_event_wait_completion(ctx, &ewq);

        return false;
}

static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
                          unsigned long start, unsigned long end)
{
        struct userfaultfd_unmap_ctx *unmap_ctx;

        list_for_each_entry(unmap_ctx, unmaps, list)
                if (unmap_ctx->ctx == ctx && unmap_ctx->start == start &&
                    unmap_ctx->end == end)
                        return true;

        return false;
}

/*
 * Record a pending unmap event for [@start, @end) of @vma on @unmaps.
 *
 * Nothing is recorded when the VMA has no context, when the context did not
 * enable UFFD_FEATURE_EVENT_UNMAP, or when an identical entry is already on
 * the list. A new entry holds a reference on the context and raises its
 * mmap_changing counter.
 *
 * Returns 0 on success (including the "nothing to do" cases) or -ENOMEM.
 */
int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, struct list_head *unmaps)
{
        struct userfaultfd_ctx *uffd = vma->vm_userfaultfd_ctx.ctx;
        struct userfaultfd_unmap_ctx *entry;

        if (!uffd)
                return 0;
        if (!(uffd->features & UFFD_FEATURE_EVENT_UNMAP))
                return 0;
        if (has_unmap_ctx(uffd, unmaps, start, end))
                return 0;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        userfaultfd_ctx_get(uffd);

        down_write(&uffd->map_changing_lock);
        atomic_inc(&uffd->mmap_changing);
        up_write(&uffd->map_changing_lock);

        entry->ctx = uffd;
        entry->start = start;
        entry->end = end;
        list_add_tail(&entry->list, unmaps);

        return 0;
}

void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf)
{
        struct userfaultfd_unmap_ctx *ctx, *n;
        struct userfaultfd_wait_queue ewq;

        list_for_each_entry_safe(ctx, n, uf, list) {
                msg_init(&ewq.msg);

                ewq.msg.event = UFFD_EVENT_UNMAP;
                ewq.msg.arg.remove.start = ctx->start;
                ewq.msg.arg.remove.end = ctx->end;

                userfaultfd_event_wait_completion(ctx->ctx, &ewq);

                list_del(&ctx->list);
                kfree(ctx);
        }
}

static int userfaultfd_release(struct inode *inode, struct file *file)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        struct mm_struct *mm = ctx->mm;
        /* len == 0 means wake all */
        struct userfaultfd_wake_range range = { .len = 0, };

        WRITE_ONCE(ctx->released, true);

        userfaultfd_release_all(mm, ctx);

        /*
         * After no new page faults can wait on this fault_*wqh, flush
         * the last page faults that may have been already waiting on
         * the fault_*wqh.
         */
        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
        __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /* Flush pending events that may still wait on event_wqh */
        wake_up_all(&ctx->event_wqh);

        wake_up_poll(&ctx->fd_wqh, EPOLLHUP);
        userfaultfd_ctx_put(ctx);
        return 0;
}

/*
 * Return the entry at the tail of @wqh, or NULL if the queue is empty.
 *
 * The lock of @wqh must be held by the caller.
 */
static inline struct userfaultfd_wait_queue *find_userfault_in(
                wait_queue_head_t *wqh)
{
        wait_queue_entry_t *last;

        lockdep_assert_held(&wqh->lock);

        if (!waitqueue_active(wqh))
                return NULL;

        /* Taking the last entry gives readers FIFO ordering of userfaults. */
        last = list_last_entry(&wqh->head, wait_queue_entry_t, entry);
        return container_of(last, struct userfaultfd_wait_queue, wq);
}

/*
 * Return the next pending page fault of @ctx (tail of fault_pending_wqh), or
 * NULL if none is pending. ctx->fault_pending_wqh.lock must be held.
 */
static inline struct userfaultfd_wait_queue *find_userfault(
                struct userfaultfd_ctx *ctx)
{
        return find_userfault_in(&ctx->fault_pending_wqh);
}

/*
 * Return the next queued event of @ctx (tail of event_wqh), or NULL if no
 * event is queued. ctx->event_wqh.lock must be held.
 */
static inline struct userfaultfd_wait_queue *find_userfault_evt(
                struct userfaultfd_ctx *ctx)
{
        return find_userfault_in(&ctx->event_wqh);
}

/*
 * ->poll handler of the userfaultfd file.
 *
 * Returns EPOLLERR if the context is not initialized yet or the file was not
 * opened O_NONBLOCK, EPOLLIN if a page fault or an event is waiting to be
 * read, and 0 otherwise.
 */
static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
{
        struct userfaultfd_ctx *ctx = file->private_data;

        poll_wait(file, &ctx->fd_wqh, wait);

        if (!userfaultfd_is_initialized(ctx))
                return EPOLLERR;

        /*
         * poll() cannot promise that a later read will not block, since a
         * userfault may be woken before it is read. Blocking files are
         * therefore refused.
         */
        if (unlikely(!(file->f_flags & O_NONBLOCK)))
                return EPOLLERR;

        /*
         * The queues are inspected without taking their locks. The last
         * thing __pollwait does is add_wait_queue(), but its spin_unlock
         * would let the waitqueue_active() reads below be hoisted above the
         * list_add performed inside that critical section. A full barrier
         * orders the list_add write before those reads.
         */
        smp_mb();
        if (waitqueue_active(&ctx->fault_pending_wqh) ||
            waitqueue_active(&ctx->event_wqh))
                return EPOLLIN;

        return 0;
}

/* Forward declaration: resolve_userfault_fork() references the fops by address. */
static const struct file_operations userfaultfd_fops;

/*
 * Turn the context @new into a file descriptor and store that descriptor in
 * the fork message @msg.
 *
 * The descriptor is created read-only plus whichever of the
 * UFFD_SHARED_FCNTL_FLAGS bits are set in new->flags.
 *
 * Returns 0 on success or the negative value returned by
 * anon_inode_create_getfd(); @msg is left untouched on failure.
 */
static int resolve_userfault_fork(struct userfaultfd_ctx *new,
                                  struct inode *inode,
                                  struct uffd_msg *msg)
{
        int open_flags = O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS);
        int ufd;

        ufd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
                                      open_flags, inode);
        if (ufd < 0)
                return ufd;

        /* Wipe reserved1, where dup_fctx() stashed the context pointer. */
        msg->arg.reserved.reserved1 = 0;
        msg->arg.fork.ufd = ufd;
        return 0;
}

/*
 * Fetch a single message for a reader of the userfaultfd.
 *
 * Pending page faults are served first: the fault found on
 * fault_pending_wqh is refiled onto fault_wqh and its message copied out.
 * When no fault is pending, the next entry of event_wqh is returned instead.
 * Non-fork events are completed right away. A UFFD_EVENT_FORK is parked on a
 * local list so the sleeping work in resolve_userfault_fork() can run with
 * the locks dropped, and is completed afterwards.
 *
 * @ctx:     context being read
 * @no_wait: when non-zero, do not sleep if nothing is queued
 * @msg:     output, filled whenever a fault or event was picked up
 * @inode:   handed through to resolve_userfault_fork()
 *
 * Returns 0 with *msg filled, -ERESTARTSYS if nothing is queued and a signal
 * is pending, -EAGAIN if nothing is queued and @no_wait is set, or the error
 * returned by resolve_userfault_fork().
 */
static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait,
                                    struct uffd_msg *msg, struct inode *inode)
{
        ssize_t ret;
        DECLARE_WAITQUEUE(wait, current);
        struct userfaultfd_wait_queue *uwq;
        /*
         * Handling fork event requires sleeping operations, so
         * we drop the event_wqh lock, then do these ops, then
         * lock it back and wake up the waiter. While the lock is
         * dropped the ewq may go away so we keep track of it
         * carefully.
         */
        LIST_HEAD(fork_event);
        struct userfaultfd_ctx *fork_nctx = NULL;

        /* always take the fd_wqh lock before the fault_pending_wqh lock */
        spin_lock_irq(&ctx->fd_wqh.lock);
        __add_wait_queue(&ctx->fd_wqh, &wait);
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                spin_lock(&ctx->fault_pending_wqh.lock);
                uwq = find_userfault(ctx);
                if (uwq) {
                        /*
                         * Use a seqcount to repeat the lockless check
                         * in wake_userfault() to avoid missing
                         * wakeups because during the refile both
                         * waitqueue could become empty if this is the
                         * only userfault.
                         */
                        write_seqcount_begin(&ctx->refile_seq);

                        /*
                         * The fault_pending_wqh.lock prevents the uwq
                         * to disappear from under us.
                         *
                         * Refile this userfault from
                         * fault_pending_wqh to fault_wqh, it's not
                         * pending anymore after we read it.
                         *
                         * Use list_del() by hand (as
                         * userfaultfd_wake_function also uses
                         * list_del_init() by hand) to be sure nobody
                         * changes __remove_wait_queue() to use
                         * list_del_init() in turn breaking the
                         * !list_empty_careful() check in
                         * handle_userfault(). The uwq->wq.head list
                         * must never be empty at any time during the
                         * refile, or the waitqueue could disappear
                         * from under us. The "wait_queue_head_t"
                         * parameter of __remove_wait_queue() is unused
                         * anyway.
                         */
                        list_del(&uwq->wq.entry);
                        add_wait_queue(&ctx->fault_wqh, &uwq->wq);

                        write_seqcount_end(&ctx->refile_seq);

                        /* careful to always initialize msg if ret == 0 */
                        *msg = uwq->msg;
                        spin_unlock(&ctx->fault_pending_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->fault_pending_wqh.lock);

                /* No page fault pending: look for a queued event instead. */
                spin_lock(&ctx->event_wqh.lock);
                uwq = find_userfault_evt(ctx);
                if (uwq) {
                        *msg = uwq->msg;

                        if (uwq->msg.event == UFFD_EVENT_FORK) {
                                /* reserved1 carries the new context pointer. */
                                fork_nctx = (struct userfaultfd_ctx *)
                                        (unsigned long)
                                        uwq->msg.arg.reserved.reserved1;
                                /* Park the event on the local list for now. */
                                list_move(&uwq->wq.entry, &fork_event);
                                /*
                                 * fork_nctx can be freed as soon as
                                 * we drop the lock, unless we take a
                                 * reference on it.
                                 */
                                userfaultfd_ctx_get(fork_nctx);
                                spin_unlock(&ctx->event_wqh.lock);
                                ret = 0;
                                break;
                        }

                        userfaultfd_event_complete(ctx, uwq);
                        spin_unlock(&ctx->event_wqh.lock);
                        ret = 0;
                        break;
                }
                spin_unlock(&ctx->event_wqh.lock);

                /* Nothing queued at all: bail out or go to sleep. */
                if (signal_pending(current)) {
                        ret = -ERESTARTSYS;
                        break;
                }
                if (no_wait) {
                        ret = -EAGAIN;
                        break;
                }
                spin_unlock_irq(&ctx->fd_wqh.lock);
                schedule();
                spin_lock_irq(&ctx->fd_wqh.lock);
        }
        __remove_wait_queue(&ctx->fd_wqh, &wait);
        __set_current_state(TASK_RUNNING);
        spin_unlock_irq(&ctx->fd_wqh.lock);

        /* Second half of fork-event handling, run without any lock held. */
        if (!ret && msg->event == UFFD_EVENT_FORK) {
                ret = resolve_userfault_fork(fork_nctx, inode, msg);
                spin_lock_irq(&ctx->event_wqh.lock);
                if (!list_empty(&fork_event)) {
                        /*
                         * The fork thread didn't abort, so we can
                         * drop the temporary refcount.
                         */
                        userfaultfd_ctx_put(fork_nctx);

                        uwq = list_first_entry(&fork_event,
                                               typeof(*uwq),
                                               wq.entry);
                        /*
                         * If fork_event list wasn't empty and in turn
                         * the event wasn't already released by fork
                         * (the event is allocated on fork kernel
                         * stack), put the event back to its place in
                         * the event_wq. fork_event head will be freed
                         * as soon as we return so the event cannot
                         * stay queued there no matter the current
                         * "ret" value.
                         */
                        list_del(&uwq->wq.entry);
                        __add_wait_queue(&ctx->event_wqh, &uwq->wq);

                        /*
                         * Leave the event in the waitqueue and report
                         * error to userland if we failed to resolve
                         * the userfault fork.
                         */
                        if (likely(!ret))
                                userfaultfd_event_complete(ctx, uwq);
                } else {
                        /*
                         * Here the fork thread aborted and the
                         * refcount from the fork thread on fork_nctx
                         * has already been released. We still hold
                         * the reference we took before releasing the
                         * lock above. If resolve_userfault_fork
                         * failed we've to drop it because the
                         * fork_nctx has to be freed in such case. If
                         * it succeeded we'll hold it because the new
                         * uffd references it.
                         */
                        if (ret)
                                userfaultfd_ctx_put(fork_nctx);
                }
                spin_unlock_irq(&ctx->event_wqh.lock);
        }

        return ret;
}

/*
 * ->read_iter handler of the userfaultfd file.
 *
 * Copies as many whole uffd_msg records into @to as are available and fit.
 * Only the wait for the very first record may block (and only if neither
 * O_NONBLOCK nor IOCB_NOWAIT is in effect); later records are fetched
 * without waiting.
 *
 * Returns the number of bytes copied when at least one record was
 * delivered. Otherwise returns -EINVAL (context not initialized, or buffer
 * smaller than one record), -EFAULT (copy to the iterator failed) or the
 * error from userfaultfd_ctx_read().
 */
static ssize_t userfaultfd_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct userfaultfd_ctx *ctx = file->private_data;
        struct inode *inode = file_inode(file);
        struct uffd_msg msg;
        ssize_t copied = 0;
        ssize_t err;
        bool no_wait;

        if (!userfaultfd_is_initialized(ctx))
                return -EINVAL;

        no_wait = (file->f_flags & O_NONBLOCK) ||
                  (iocb->ki_flags & IOCB_NOWAIT);

        while (iov_iter_count(to) >= sizeof(msg)) {
                err = userfaultfd_ctx_read(ctx, no_wait, &msg, inode);
                if (err < 0)
                        return copied ? copied : err;

                if (!copy_to_iter_full(&msg, sizeof(msg), to))
                        return copied ? copied : -EFAULT;

                copied += sizeof(msg);

                /* More than one record may be read, but never block again. */
                no_wait = true;
        }

        /* No room left for another record. */
        return copied ? copied : -EINVAL;
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
                             struct userfaultfd_wake_range *range)
{
        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        /* wake all in the range and autoremove */
        if (waitqueue_active(&ctx->fault_pending_wqh))
                __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
                                     range);
        if (waitqueue_active(&ctx->fault_wqh))
                __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range);
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);
}

/*
 * Wake the userfaults of @ctx within @range, taking the queue lock only when
 * a lockless look at the queues says somebody is waiting.
 */
static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
                                           struct userfaultfd_wake_range *range)
{
        bool have_waiters;
        unsigned int snapshot;

        /*
         * Make sure the CPU does not move the waitqueue_active() reads ahead
         * of the page table update. Releasing the PT lock or calling
         * mmap_read_unlock(mm) only has release semantics, which would still
         * allow that reordering, hence the explicit full barrier.
         */
        smp_mb();

        /*
         * Address space changes are very frequent even when no userfault
         * exists yet, so peek at the queues locklessly first. The check is
         * repeated whenever a reader refiled a fault in the meantime.
         */
        do {
                snapshot = read_seqcount_begin(&ctx->refile_seq);
                have_waiters = waitqueue_active(&ctx->fault_pending_wqh) ||
                               waitqueue_active(&ctx->fault_wqh);
                cond_resched();
        } while (read_seqcount_retry(&ctx->refile_seq, snapshot));

        if (!have_waiters)
                return;

        __wake_userfault(ctx, range);
}

/*
 * Sanity-check a user supplied range whose start need not be page aligned.
 *
 * The range is accepted when @len is a non-zero multiple of the page size,
 * @start lies in [mmap_min_addr, mm->task_size), and @start + @len neither
 * exceeds mm->task_size nor wraps around.
 *
 * Returns 0 if the range is acceptable, -EINVAL otherwise.
 */
static __always_inline int validate_unaligned_range(
        struct mm_struct *mm, __u64 start, __u64 len)
{
        __u64 limit = mm->task_size;

        /* Length: non-zero and a whole number of pages. */
        if (!len || (len & ~PAGE_MASK))
                return -EINVAL;

        /* Start: inside the usable part of the address space. */
        if (start < mmap_min_addr || start >= limit)
                return -EINVAL;

        /* End: within the task size and no wrap-around. */
        if (len > limit - start || start + len <= start)
                return -EINVAL;

        return 0;
}

/*
 * Same checks as validate_unaligned_range(), with the extra requirement that
 * @start is page aligned. Returns 0 or -EINVAL.
 */
static __always_inline int validate_range(struct mm_struct *mm,
                                          __u64 start, __u64 len)
{
        return (start & ~PAGE_MASK) ?
                -EINVAL : validate_unaligned_range(mm, start, len);
}

/*
 * Register an address range with the userfaultfd @ctx.
 *
 * @arg is the user pointer to a struct uffdio_register. The requested mode
 * bits are translated into VM_UFFD_* flags, the range is validated, and
 * every VMA intersecting the range is checked under the mmap write lock
 * before userfaultfd_register_range() is called. On success the set of
 * ioctls usable on the range is written back to the user's ->ioctls field.
 *
 * Returns 0 on success, or:
 *   -EFAULT  copying the argument from/to user space failed
 *   -EINVAL  bad mode, bad range, no VMA in the range, an incompatible VMA,
 *            or a hugetlb VMA with a misaligned start/end
 *   -ENOMEM  the mm could not be pinned (mmget_not_zero() failed)
 *   -EPERM   a VMA in the range lacks VM_MAYWRITE
 *   -EBUSY   a VMA in the range belongs to a different userfaultfd
 * or whatever userfaultfd_register_range() returns.
 */
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *cur;
        int ret;
        struct uffdio_register uffdio_register;
        struct uffdio_register __user *user_uffdio_register;
        unsigned long vm_flags;
        bool found;
        bool basic_ioctls;
        unsigned long start, end;
        struct vma_iterator vmi;
        bool wp_async = userfaultfd_wp_async_ctx(ctx);

        user_uffdio_register = (struct uffdio_register __user *) arg;

        ret = -EFAULT;
        /*
         * The final __u64 of the structure is not copied in (presumably the
         * output-only ->ioctls field written at the end - confirm layout).
         */
        if (copy_from_user(&uffdio_register, user_uffdio_register,
                           sizeof(uffdio_register)-sizeof(__u64)))
                goto out;

        ret = -EINVAL;
        if (!uffdio_register.mode)
                goto out;
        if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES)
                goto out;
        /* Translate the requested modes into VMA flags. */
        vm_flags = 0;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING)
                vm_flags |= VM_UFFD_MISSING;
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) {
                /* WP mode is rejected when the architecture lacks support. */
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
                goto out;
#endif
                vm_flags |= VM_UFFD_WP;
        }
        if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
                /* MINOR mode is rejected when the architecture lacks support. */
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
                goto out;
#endif
                vm_flags |= VM_UFFD_MINOR;
        }

        ret = validate_range(mm, uffdio_register.range.start,
                             uffdio_register.range.len);
        if (ret)
                goto out;

        start = uffdio_register.range.start;
        end = start + uffdio_register.range.len;

        ret = -ENOMEM;
        if (!mmget_not_zero(mm))
                goto out;

        ret = -EINVAL;
        mmap_write_lock(mm);
        vma_iter_init(&vmi, mm, start);
        vma = vma_find(&vmi, end);
        if (!vma)
                goto out_unlock;

        /*
         * If the first vma contains huge pages, make sure start address
         * is aligned to huge page size.
         */
        if (is_vm_hugetlb_page(vma)) {
                unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

                if (start & (vma_hpagesize - 1))
                        goto out_unlock;
        }

        /*
         * Search for not compatible vmas.
         */
        found = false;
        basic_ioctls = false;
        cur = vma;
        do {
                cond_resched();

                /* A VMA has a uffd context iff it has uffd flags set. */
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & __VM_UFFD_FLAGS));

                /* check not compatible vmas */
                ret = -EINVAL;
                if (!vma_can_userfault(cur, vm_flags, wp_async))
                        goto out_unlock;

                /*
                 * UFFDIO_COPY will fill file holes even without
                 * PROT_WRITE. This check enforces that if this is a
                 * MAP_SHARED, the process has write permission to the backing
                 * file. If VM_MAYWRITE is set it also enforces that on a
                 * MAP_SHARED vma: there is no F_WRITE_SEAL and no further
                 * F_WRITE_SEAL can be taken until the vma is destroyed.
                 */
                ret = -EPERM;
                if (unlikely(!(cur->vm_flags & VM_MAYWRITE)))
                        goto out_unlock;

                /*
                 * If this vma contains ending address, and huge pages
                 * check alignment.
                 */
                if (is_vm_hugetlb_page(cur) && end <= cur->vm_end &&
                    end > cur->vm_start) {
                        unsigned long vma_hpagesize = vma_kernel_pagesize(cur);

                        ret = -EINVAL;

                        if (end & (vma_hpagesize - 1))
                                goto out_unlock;
                }
                /*
                 * NOTE(review): VMAs without VM_MAYWRITE were already
                 * rejected by the -EPERM check above, so this condition
                 * looks unreachable.
                 */
                if ((vm_flags & VM_UFFD_WP) && !(cur->vm_flags & VM_MAYWRITE))
                        goto out_unlock;

                /*
                 * Check that this vma isn't already owned by a
                 * different userfaultfd. We can't allow more than one
                 * userfaultfd to own a single vma simultaneously or we
                 * wouldn't know which one to deliver the userfaults to.
                 */
                ret = -EBUSY;
                if (cur->vm_userfaultfd_ctx.ctx &&
                    cur->vm_userfaultfd_ctx.ctx != ctx)
                        goto out_unlock;

                /*
                 * Note vmas containing huge pages
                 */
                if (is_vm_hugetlb_page(cur))
                        basic_ioctls = true;

                found = true;
        } for_each_vma_range(vmi, cur, end);
        BUG_ON(!found);

        /* All VMAs passed the checks: apply the registration. */
        ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
                                         wp_async);

out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
        if (!ret) {
                __u64 ioctls_out;

                /* A hugetlb VMA in the range limits the set to the basic one. */
                ioctls_out = basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC :
                    UFFD_API_RANGE_IOCTLS;

                /*
                 * Declare the WP ioctl only if the WP mode is
                 * specified and all checks passed with the range
                 */
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);

                /* CONTINUE ioctl is only supported for MINOR ranges. */
                if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
                        ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);

                /*
                 * Now that we scanned all vmas we can already tell
                 * userland which ioctls methods are guaranteed to
                 * succeed on this range.
                 */
                if (put_user(ioctls_out, &user_uffdio_register->ioctls))
                        ret = -EFAULT;
        }
out:
        return ret;
}

/*
 * Unregister an address range from userfaultfd tracking.
 *
 * @arg is the user pointer to a struct uffdio_range. After validating the
 * range, all VMAs intersecting it are checked under the mmap write lock.
 * Each VMA that has a userfaultfd context then gets its pending "missing"
 * faults woken and its registration cleared via userfaultfd_clear_vma().
 * VMAs without a context are skipped.
 *
 * Returns 0 on success, or:
 *   -EFAULT  copying the argument from user space failed
 *   -EINVAL  bad range, no VMA in the range, an incompatible VMA, or a
 *            hugetlb first VMA with a misaligned start
 *   -ENOMEM  the mm could not be pinned (mmget_not_zero() failed)
 * or the error encoded in the pointer returned by userfaultfd_clear_vma().
 */
static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                                  unsigned long arg)
{
        struct mm_struct *mm = ctx->mm;
        struct vm_area_struct *vma, *prev, *cur;
        int ret;
        struct uffdio_range uffdio_unregister;
        bool found;
        unsigned long start, end, vma_end;
        const void __user *buf = (void __user *)arg;
        struct vma_iterator vmi;
        bool wp_async = userfaultfd_wp_async_ctx(ctx);

        ret = -EFAULT;
        if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
                goto out;

        ret = validate_range(mm, uffdio_unregister.start,
                             uffdio_unregister.len);
        if (ret)
                goto out;

        start = uffdio_unregister.start;
        end = start + uffdio_unregister.len;

        ret = -ENOMEM;
        if (!mmget_not_zero(mm))
                goto out;

        mmap_write_lock(mm);
        ret = -EINVAL;
        vma_iter_init(&vmi, mm, start);
        vma = vma_find(&vmi, end);
        if (!vma)
                goto out_unlock;

        /*
         * If the first vma contains huge pages, make sure start address
         * is aligned to huge page size.
         */
        if (is_vm_hugetlb_page(vma)) {
                unsigned long vma_hpagesize = vma_kernel_pagesize(vma);

                if (start & (vma_hpagesize - 1))
                        goto out_unlock;
        }

        /*
         * Search for not compatible vmas.
         */
        found = false;
        cur = vma;
        do {
                cond_resched();

                /* A VMA has a uffd context iff it has uffd flags set. */
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
                       !!(cur->vm_flags & __VM_UFFD_FLAGS));

                /*
                 * Check not compatible vmas, not strictly required
                 * here as not compatible vmas cannot have an
                 * userfaultfd_ctx registered on them, but this
                 * provides for more strict behavior to notice
                 * unregistration errors.
                 */
                if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
                        goto out_unlock;

                found = true;
        } for_each_vma_range(vmi, cur, end);
        BUG_ON(!found);

        /* Rewind the iterator for the second pass, which does the work. */
        vma_iter_set(&vmi, start);
        prev = vma_prev(&vmi);
        /* The range starts inside the first VMA: that VMA is its own prev. */
        if (vma->vm_start < start)
                prev = vma;

        ret = 0;
        for_each_vma_range(vmi, vma, end) {
                cond_resched();

                BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async));

                /*
                 * Nothing to do: this vma has no userfaultfd context
                 * attached, so there is nothing to unregister.
                 */
                if (!vma->vm_userfaultfd_ctx.ctx)
                        goto skip;

                WARN_ON(!(vma->vm_flags & VM_MAYWRITE));

                /* Clamp the working range to this VMA. */
                if (vma->vm_start > start)
                        start = vma->vm_start;
                vma_end = min(end, vma->vm_end);

                if (userfaultfd_missing(vma)) {
                        /*
                         * Wake any concurrent pending userfault while
                         * we unregister, so they will not hang
                         * permanently and it avoids userland to call
                         * UFFDIO_WAKE explicitly.
                         */
                        struct userfaultfd_wake_range range;
                        range.start = start;
                        range.len = vma_end - start;
                        wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
                }

                vma = userfaultfd_clear_vma(&vmi, prev, vma,
                                            start, vma_end);
                if (IS_ERR(vma)) {
                        ret = PTR_ERR(vma);
                        break;
                }

        skip:
                prev = vma;
                start = vma->vm_end;
        }

out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
out:
        return ret;
}

/*
 * userfaultfd_wake may be used in combination with the
 * UFFDIO_*_MODE_DONTWAKE to wakeup userfaults in batches.
 */
static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        int ret;
        struct uffdio_range uffdio_wake;
        struct userfaultfd_wake_range range;
        const void __user *buf = (void __user *)arg;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
                goto out;

        ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
        if (ret)
                goto out;

        range.start = uffdio_wake.start;
        range.len = uffdio_wake.len;

        /*
         * len == 0 means wake all and we don't want to wake all here,
         * so check it again to be sure.
         */
        VM_BUG_ON(!range.len);

        wake_userfault(ctx, &range);
        ret = 0;

out:
        return ret;
}

static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        __s64 ret;
        struct uffdio_copy uffdio_copy;
        struct uffdio_copy __user *user_uffdio_copy;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;

        user_uffdio_copy = (struct uffdio_copy __user *) arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_copy, user_uffdio_copy,
                           /* don't copy "copy" last field */
                           sizeof(uffdio_copy)-sizeof(__s64)))
                goto out;

        ret = validate_unaligned_range(ctx->mm, uffdio_copy.src,
                                       uffdio_copy.len);
        if (ret)
                goto out;
        ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
                goto out;
        if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
                flags |= MFILL_ATOMIC_WP;
        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
                                        uffdio_copy.len, flags);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }
        if (unlikely(put_user(ret, &user_uffdio_copy->copy)))
                return -EFAULT;
        if (ret < 0)
                goto out;
        BUG_ON(!ret);
        /* len == 0 would wake all */
        range.len = ret;
        if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
                range.start = uffdio_copy.dst;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
        return ret;
}

static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
                                unsigned long arg)
{
        __s64 ret;
        struct uffdio_zeropage uffdio_zeropage;
        struct uffdio_zeropage __user *user_uffdio_zeropage;
        struct userfaultfd_wake_range range;

        user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
                           /* don't copy "zeropage" last field */
                           sizeof(uffdio_zeropage)-sizeof(__s64)))
                goto out;

        ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
                             uffdio_zeropage.range.len);
        if (ret)
                goto out;
        ret = -EINVAL;
        if (uffdio_zeropage.mode & ~UFFDIO_ZEROPAGE_MODE_DONTWAKE)
                goto out;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
                                           uffdio_zeropage.range.len);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }
        if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
                return -EFAULT;
        if (ret < 0)
                goto out;
        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
                range.start = uffdio_zeropage.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
out:
        return ret;
}

static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
                                    unsigned long arg)
{
        int ret;
        struct uffdio_writeprotect uffdio_wp;
        struct uffdio_writeprotect __user *user_uffdio_wp;
        struct userfaultfd_wake_range range;
        bool mode_wp, mode_dontwake;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        user_uffdio_wp = (struct uffdio_writeprotect __user *) arg;

        if (copy_from_user(&uffdio_wp, user_uffdio_wp,
                           sizeof(struct uffdio_writeprotect)))
                return -EFAULT;

        ret = validate_range(ctx->mm, uffdio_wp.range.start,
                             uffdio_wp.range.len);
        if (ret)
                return ret;

        if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
                               UFFDIO_WRITEPROTECT_MODE_WP))
                return -EINVAL;

        mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
        mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;

        if (mode_wp && mode_dontwake)
                return -EINVAL;

        if (mmget_not_zero(ctx->mm)) {
                ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
                                          uffdio_wp.range.len, mode_wp);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (ret)
                return ret;

        if (!mode_wp && !mode_dontwake) {
                range.start = uffdio_wp.range.start;
                range.len = uffdio_wp.range.len;
                wake_userfault(ctx, &range);
        }
        return ret;
}

static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_continue uffdio_continue;
        struct uffdio_continue __user *user_uffdio_continue;
        struct userfaultfd_wake_range range;
        uffd_flags_t flags = 0;

        user_uffdio_continue = (struct uffdio_continue __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_continue, user_uffdio_continue,
                           /* don't copy the output fields */
                           sizeof(uffdio_continue) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_continue.range.start,
                             uffdio_continue.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
                                     UFFDIO_CONTINUE_MODE_WP))
                goto out;
        if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
                flags |= MFILL_ATOMIC_WP;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
                                            uffdio_continue.range.len, flags);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
                range.start = uffdio_continue.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;

out:
        return ret;
}

static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg)
{
        __s64 ret;
        struct uffdio_poison uffdio_poison;
        struct uffdio_poison __user *user_uffdio_poison;
        struct userfaultfd_wake_range range;

        user_uffdio_poison = (struct uffdio_poison __user *)arg;

        ret = -EAGAIN;
        if (atomic_read(&ctx->mmap_changing))
                goto out;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_poison, user_uffdio_poison,
                           /* don't copy the output fields */
                           sizeof(uffdio_poison) - (sizeof(__s64))))
                goto out;

        ret = validate_range(ctx->mm, uffdio_poison.range.start,
                             uffdio_poison.range.len);
        if (ret)
                goto out;

        ret = -EINVAL;
        if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE)
                goto out;

        if (mmget_not_zero(ctx->mm)) {
                ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
                                          uffdio_poison.range.len, 0);
                mmput(ctx->mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_poison->updated)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        BUG_ON(!ret);
        range.len = ret;
        if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) {
                range.start = uffdio_poison.range.start;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_poison.range.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * Evaluate userfaultfd_wp_async_ctx() on the userfaultfd context
 * attached to @vma and return its result.
 */
bool userfaultfd_wp_async(struct vm_area_struct *vma)
{
        struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;

        return userfaultfd_wp_async_ctx(ctx);
}

/*
 * Translate the feature bits requested by userland into the value kept
 * in the context.  The bit positions currently coincide, so the user
 * value is only narrowed to unsigned int; UFFD_FEATURE_INITIALIZED is
 * added on top to mark the features as enabled.
 */
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
        unsigned int ctx_bits = (unsigned int)user_features;

        ctx_bits |= UFFD_FEATURE_INITIALIZED;
        return ctx_bits;
}

static int userfaultfd_move(struct userfaultfd_ctx *ctx,
                            unsigned long arg)
{
        __s64 ret;
        struct uffdio_move uffdio_move;
        struct uffdio_move __user *user_uffdio_move;
        struct userfaultfd_wake_range range;
        struct mm_struct *mm = ctx->mm;

        user_uffdio_move = (struct uffdio_move __user *) arg;

        if (atomic_read(&ctx->mmap_changing))
                return -EAGAIN;

        if (copy_from_user(&uffdio_move, user_uffdio_move,
                           /* don't copy "move" last field */
                           sizeof(uffdio_move)-sizeof(__s64)))
                return -EFAULT;

        /* Do not allow cross-mm moves. */
        if (mm != current->mm)
                return -EINVAL;

        ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
        if (ret)
                return ret;

        ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
        if (ret)
                return ret;

        if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
                                  UFFDIO_MOVE_MODE_DONTWAKE))
                return -EINVAL;

        if (mmget_not_zero(mm)) {
                ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
                                 uffdio_move.len, uffdio_move.mode);
                mmput(mm);
        } else {
                return -ESRCH;
        }

        if (unlikely(put_user(ret, &user_uffdio_move->move)))
                return -EFAULT;
        if (ret < 0)
                goto out;

        /* len == 0 would wake all */
        VM_WARN_ON(!ret);
        range.len = ret;
        if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
                range.start = uffdio_move.dst;
                wake_userfault(ctx, &range);
        }
        ret = range.len == uffdio_move.len ? 0 : -EAGAIN;

out:
        return ret;
}

/*
 * userland asks for a certain API version and we return which bits
 * and ioctl commands are implemented in this kernel for such API
 * version or -EINVAL if unknown.
 */
static int userfaultfd_api(struct userfaultfd_ctx *ctx,
                           unsigned long arg)
{
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
        unsigned int ctx_features;
        int ret;
        __u64 features;

        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
        features = uffdio_api.features;
        ret = -EINVAL;
        if (uffdio_api.api != UFFD_API)
                goto err_out;
        ret = -EPERM;
        if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
                goto err_out;

        /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */
        if (features & UFFD_FEATURE_WP_ASYNC)
                features |= UFFD_FEATURE_WP_UNPOPULATED;

        /* report all available features and ioctls to userland */
        uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
        uffdio_api.features &=
                ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
#endif
#ifndef CONFIG_PTE_MARKER_UFFD_WP
        uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
        uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
        uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
#endif

        ret = -EINVAL;
        if (features & ~uffdio_api.features)
                goto err_out;

        uffdio_api.ioctls = UFFD_API_IOCTLS;
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;

        /* only enable the requested features for this uffd context */
        ctx_features = uffd_ctx_features(features);
        ret = -EINVAL;
        if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
                goto err_out;

        ret = 0;
out:
        return ret;
err_out:
        memset(&uffdio_api, 0, sizeof(uffdio_api));
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                ret = -EFAULT;
        goto out;
}

/*
 * ioctl entry point of a userfaultfd file: dispatch @cmd to its handler.
 *
 * Every command other than UFFDIO_API is refused with -EINVAL until
 * userfaultfd_is_initialized() reports the context as initialized.
 * Unknown commands also yield -EINVAL.
 */
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
                              unsigned long arg)
{
        struct userfaultfd_ctx *ctx = file->private_data;
        int rc;

        if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
                return -EINVAL;

        switch (cmd) {
        case UFFDIO_API:
                rc = userfaultfd_api(ctx, arg);
                break;
        case UFFDIO_REGISTER:
                rc = userfaultfd_register(ctx, arg);
                break;
        case UFFDIO_UNREGISTER:
                rc = userfaultfd_unregister(ctx, arg);
                break;
        case UFFDIO_WAKE:
                rc = userfaultfd_wake(ctx, arg);
                break;
        case UFFDIO_COPY:
                rc = userfaultfd_copy(ctx, arg);
                break;
        case UFFDIO_ZEROPAGE:
                rc = userfaultfd_zeropage(ctx, arg);
                break;
        case UFFDIO_MOVE:
                rc = userfaultfd_move(ctx, arg);
                break;
        case UFFDIO_WRITEPROTECT:
                rc = userfaultfd_writeprotect(ctx, arg);
                break;
        case UFFDIO_CONTINUE:
                rc = userfaultfd_continue(ctx, arg);
                break;
        case UFFDIO_POISON:
                rc = userfaultfd_poison(ctx, arg);
                break;
        default:
                rc = -EINVAL;
                break;
        }

        return rc;
}

#ifdef CONFIG_PROC_FS
static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct userfaultfd_ctx *ctx = f->private_data;
        wait_queue_entry_t *wq;
        unsigned long pending = 0, total = 0;

        spin_lock_irq(&ctx->fault_pending_wqh.lock);
        list_for_each_entry(wq, &ctx->fault_pending_wqh.head, entry) {
                pending++;
                total++;
        }
        list_for_each_entry(wq, &ctx->fault_wqh.head, entry) {
                total++;
        }
        spin_unlock_irq(&ctx->fault_pending_wqh.lock);

        /*
         * If more protocols will be added, there will be all shown
         * separated by a space. Like this:
         *        protocols: aa:... bb:...
         */
        seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
                   pending, total, UFFD_API, ctx->features,
                   UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif

/* File operations installed on every userfaultfd file. */
static const struct file_operations userfaultfd_fops = {
        .unlocked_ioctl = userfaultfd_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
        .read_iter      = userfaultfd_read_iter,
        .poll           = userfaultfd_poll,
        .llseek         = noop_llseek,
        .release        = userfaultfd_release,
#ifdef CONFIG_PROC_FS
        /* the fdinfo hook only exists when procfs is built in */
        .show_fdinfo    = userfaultfd_show_fdinfo,
#endif
};

/*
 * Constructor passed to kmem_cache_create() for the userfaultfd context
 * cache: set up the four wait queue heads and the refile seqcount, which
 * is associated with the fault_pending_wqh lock.
 */
static void init_once_userfaultfd_ctx(void *mem)
{
        struct userfaultfd_ctx *uffd = mem;

        init_waitqueue_head(&uffd->fault_pending_wqh);
        init_waitqueue_head(&uffd->fault_wqh);
        init_waitqueue_head(&uffd->event_wqh);
        init_waitqueue_head(&uffd->fd_wqh);
        seqcount_spinlock_init(&uffd->refile_seq,
                               &uffd->fault_pending_wqh.lock);
}

/*
 * Allocate a userfaultfd context bound to the current mm and install a
 * new file descriptor for it.
 *
 * Returns the new file descriptor, -EINVAL on unknown @flags, -ENOMEM if
 * the context cannot be allocated, or the error from reserving the
 * descriptor / creating the file.
 */
static int new_userfaultfd(int flags)
{
        const int fcntl_flags = flags & UFFD_SHARED_FCNTL_FLAGS;
        struct userfaultfd_ctx *uffd;
        struct file *filp;
        int fd;

        BUG_ON(!current->mm);

        /* Check the UFFD_* constants for consistency.  */
        BUILD_BUG_ON(UFFD_USER_MODE_ONLY & UFFD_SHARED_FCNTL_FLAGS);
        BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~(UFFD_SHARED_FCNTL_FLAGS | UFFD_USER_MODE_ONLY))
                return -EINVAL;

        uffd = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL);
        if (!uffd)
                return -ENOMEM;

        /* fields not covered by the cache constructor */
        refcount_set(&uffd->refcount, 1);
        uffd->flags = flags;
        uffd->features = 0;
        uffd->released = false;
        init_rwsem(&uffd->map_changing_lock);
        atomic_set(&uffd->mmap_changing, 0);
        uffd->mm = current->mm;

        fd = get_unused_fd_flags(fcntl_flags);
        if (fd < 0)
                goto err_free_ctx;

        /* Create a new inode so that the LSM can block the creation.  */
        filp = anon_inode_create_getfile("[userfaultfd]", &userfaultfd_fops,
                                         uffd, O_RDONLY | fcntl_flags, NULL);
        if (IS_ERR(filp)) {
                /* give the reserved descriptor back, report the file error */
                put_unused_fd(fd);
                fd = PTR_ERR(filp);
                goto err_free_ctx;
        }

        /* prevent the mm struct to be freed */
        mmgrab(uffd->mm);
        filp->f_mode |= FMODE_NOWAIT;
        fd_install(fd, filp);
        return fd;

err_free_ctx:
        kmem_cache_free(userfaultfd_ctx_cachep, uffd);
        return fd;
}

/*
 * Decide whether the caller may create a userfaultfd with @flags.
 *
 * The checks are evaluated in order and stop at the first that grants
 * access:
 *  - userspace-only page fault handling is always allowed;
 *  - a userfaultfd able to handle kernel faults is always allowed for
 *    callers holding CAP_SYS_PTRACE;
 *  - otherwise the sysctl_unprivileged_userfaultfd setting decides.
 */
static inline bool userfaultfd_syscall_allowed(int flags)
{
        return (flags & UFFD_USER_MODE_ONLY) ||
               capable(CAP_SYS_PTRACE) ||
               sysctl_unprivileged_userfaultfd;
}

/*
 * userfaultfd(2): create a new userfaultfd, or fail with -EPERM when
 * userfaultfd_syscall_allowed() refuses the requested @flags.
 */
SYSCALL_DEFINE1(userfaultfd, int, flags)
{
        if (userfaultfd_syscall_allowed(flags))
                return new_userfaultfd(flags);

        return -EPERM;
}

/*
 * ioctl handler of the userfaultfd misc device.  USERFAULTFD_IOC_NEW is
 * the only command understood: its argument is passed as the flags of
 * new_userfaultfd().  Anything else yields -EINVAL.
 */
static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
{
        if (cmd == USERFAULTFD_IOC_NEW)
                return new_userfaultfd(flags);

        return -EINVAL;
}

/*
 * File operations of the userfaultfd misc device: the same handler
 * serves both the native and the compat ioctl path.
 */
static const struct file_operations userfaultfd_dev_fops = {
        .owner = THIS_MODULE,
        .llseek = noop_llseek,
        .unlocked_ioctl = userfaultfd_dev_ioctl,
        .compat_ioctl = userfaultfd_dev_ioctl,
};

/* Misc device named "userfaultfd", registered with a dynamic minor. */
static struct miscdevice userfaultfd_misc = {
        .name = "userfaultfd",
        .minor = MISC_DYNAMIC_MINOR,
        .fops = &userfaultfd_dev_fops
};

/*
 * Boot time setup: register the misc device, create the slab cache for
 * userfaultfd contexts and, with CONFIG_SYSCTL, register the "vm" sysctl
 * table.
 *
 * Returns 0 on success or the error from misc_register().  The cache
 * pointer is not checked here; the cache is created with SLAB_PANIC.
 */
static int __init userfaultfd_init(void)
{
        int err = misc_register(&userfaultfd_misc);

        if (err)
                return err;

        userfaultfd_ctx_cachep =
                kmem_cache_create("userfaultfd_ctx_cache",
                                  sizeof(struct userfaultfd_ctx),
                                  0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                  init_once_userfaultfd_ctx);
#ifdef CONFIG_SYSCTL
        register_sysctl_init("vm", vm_userfaultfd_table);
#endif
        return 0;
}
__initcall(userfaultfd_init);








































































































































































































































































































































































































































































































































































































































































































































































































    8 





































































































































    8 









    8 









































    8 
    8 




















































   35 





   35 













































  497 
  499 









  171 

  171 


   49 


  122 

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2005-2010 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 * Kylene Hall <kjhall@us.ibm.com>
 *
 * File: evm_main.c
 *        implements evm_inode_setxattr, evm_inode_post_setxattr,
 *        evm_inode_removexattr, evm_verifyxattr, and evm_inode_set_acl.
 */

#define pr_fmt(fmt) "EVM: "fmt

#include <linux/init.h>
#include <linux/audit.h>
#include <linux/xattr.h>
#include <linux/integrity.h>
#include <linux/evm.h>
#include <linux/magic.h>
#include <linux/posix_acl_xattr.h>
#include <linux/lsm_hooks.h>

#include <crypto/hash.h>
#include <crypto/hash_info.h>
#include <crypto/utils.h>
#include "evm.h"

/*
 * EVM initialization state.  Used as a bitmask in this file: it is tested
 * against EVM_KEY_MASK, EVM_INIT_HMAC, EVM_SETUP_COMPLETE and
 * EVM_INIT_X509.
 */
int evm_initialized;

/*
 * Human readable names of the integrity states.
 * NOTE(review): presumably indexed by enum integrity_status, so the order
 * of the strings must follow the enum - confirm against its definition.
 */
static const char * const integrity_status_msg[] = {
        "pass", "pass_immutable", "fail", "fail_immutable", "no_label",
        "no_xattrs", "unknown"
};
/*
 * HMAC attribute flags; evm_init_config() sets EVM_ATTR_FSUUID here when
 * CONFIG_EVM_ATTR_FSUUID is enabled.
 */
int evm_hmac_attrs;

/*
 * Default set of xattr names known to EVM.  Each entry records whether it
 * is enabled, which depends on the kernel configuration (the capability
 * xattr is always enabled).  evm_init_config() appends the entries, in
 * this order, to evm_config_xattrnames.
 */
static struct xattr_list evm_config_default_xattrnames[] = {
        {
         .name = XATTR_NAME_SELINUX,
         .enabled = IS_ENABLED(CONFIG_SECURITY_SELINUX)
        },
        {
         .name = XATTR_NAME_SMACK,
         .enabled = IS_ENABLED(CONFIG_SECURITY_SMACK)
        },
        {
         .name = XATTR_NAME_SMACKEXEC,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_SMACKTRANSMUTE,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_SMACKMMAP,
         .enabled = IS_ENABLED(CONFIG_EVM_EXTRA_SMACK_XATTRS)
        },
        {
         .name = XATTR_NAME_APPARMOR,
         .enabled = IS_ENABLED(CONFIG_SECURITY_APPARMOR)
        },
        {
         .name = XATTR_NAME_IMA,
         .enabled = IS_ENABLED(CONFIG_IMA_APPRAISE)
        },
        {
         .name = XATTR_NAME_CAPS,
         .enabled = true
        },
};

/* List of xattr names; filled from the table above by evm_init_config(). */
LIST_HEAD(evm_config_xattrnames);

/* Set to 1 when "evm=fix" is given on the kernel command line. */
static int evm_fixmode __ro_after_init;

/*
 * Parse the value of the "evm=" boot parameter.  A value starting with
 * "fix" turns on evm_fixmode; anything else is reported as invalid.
 * Always returns 1.
 */
static int __init evm_set_fixmode(char *str)
{
        if (strncmp(str, "fix", 3) != 0)
                pr_err("invalid \"%s\" mode", str);
        else
                evm_fixmode = 1;

        return 1;
}
__setup("evm=", evm_set_fixmode);

/*
 * Log the default xattr names (flagging the disabled ones) while adding
 * each of them, in table order, to evm_config_xattrnames.  Then set
 * EVM_ATTR_FSUUID in the HMAC attributes if configured and log the
 * resulting value.
 */
static void __init evm_init_config(void)
{
        const int nr_xattrs = ARRAY_SIZE(evm_config_default_xattrnames);
        int idx;

        pr_info("Initialising EVM extended attributes:\n");
        for (idx = 0; idx < nr_xattrs; idx++) {
                struct xattr_list *entry = &evm_config_default_xattrnames[idx];

                pr_info("%s%s\n", entry->name,
                        entry->enabled ? "" : " (disabled)");
                /* disabled entries are added to the list as well */
                list_add_tail(&entry->list, &evm_config_xattrnames);
        }

#ifdef CONFIG_EVM_ATTR_FSUUID
        evm_hmac_attrs |= EVM_ATTR_FSUUID;
#endif
        pr_info("HMAC attrs: 0x%x\n", evm_hmac_attrs);
}

/* True when any bit of EVM_KEY_MASK is set in evm_initialized. */
static bool evm_key_loaded(void)
{
        return (evm_initialized & EVM_KEY_MASK) != 0;
}

/*
 * Tell whether verification errors can safely be ignored, judging by the
 * ability of EVM to calculate HMACs.  When the HMAC key is not loaded
 * and, because the EVM_SETUP_COMPLETE initialization flag is set, cannot
 * be loaded later either, allowing an operation on attrs/xattrs found
 * invalid will not make them valid.
 *
 * Returns true only if EVM_INIT_HMAC is clear and EVM_SETUP_COMPLETE is
 * set.
 */
static bool evm_hmac_disabled(void)
{
        return !(evm_initialized & EVM_INIT_HMAC) &&
               (evm_initialized & EVM_SETUP_COMPLETE);
}

/*
 * Count how many of the xattrs listed in evm_config_xattrnames exist on
 * @dentry.  Every list entry is probed, whatever its "enabled" state.
 *
 * Returns the number of xattrs found, -EOPNOTSUPP if the inode has no
 * xattr support, or the first lookup error other than -ENODATA.
 */
static int evm_find_protected_xattrs(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        struct xattr_list *entry;
        int found = 0;
        int rc;

        if (!(inode->i_opflags & IOP_XATTR))
                return -EOPNOTSUPP;

        list_for_each_entry_lockless(entry, &evm_config_xattrnames, list) {
                /* size-only query: no buffer is passed */
                rc = __vfs_getxattr(dentry, inode, entry->name, NULL, 0);
                if (rc == -ENODATA)
                        continue;
                if (rc < 0)
                        return rc;
                found++;
        }

        return found;
}

/*
 * Return 1 if the superblock behind @dentry carries the
 * SB_I_EVM_HMAC_UNSUPPORTED flag, 0 otherwise.  The filesystem type name
 * is logged once in the former case.
 */
static int is_unsupported_hmac_fs(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (!(inode->i_sb->s_iflags & SB_I_EVM_HMAC_UNSUPPORTED))
                return 0;

        pr_info_once("%s not supported\n", inode->i_sb->s_type->name);
        return 1;
}

/*
 * evm_verify_hmac - calculate and compare the HMAC with the EVM xattr
 *
 * Compute the HMAC on the dentry's protected set of extended attributes
 * and compare it against the stored security.evm xattr.
 *
 * For performance:
 * - use the previously retrieved xattr value and length to calculate the
 *   HMAC.
 * - cache the verification result in the iint, when available.
 *
 * Returns integrity status
 */
static enum integrity_status evm_verify_hmac(struct dentry *dentry,
                                             const char *xattr_name,
                                             char *xattr_value,
                                             size_t xattr_value_len)
{
        struct evm_ima_xattr_data *xattr_data = NULL;
        struct signature_v2_hdr *hdr;
        enum integrity_status evm_status = INTEGRITY_PASS;
        struct evm_digest digest;
        struct inode *inode = d_backing_inode(dentry);
        struct evm_iint_cache *iint = evm_iint_inode(inode);
        int rc, xattr_len, evm_immutable = 0;

        if (iint && (iint->evm_status == INTEGRITY_PASS ||
                     iint->evm_status == INTEGRITY_PASS_IMMUTABLE))
                return iint->evm_status;

        /*
         * On unsupported filesystems without EVM_INIT_X509 enabled, skip
         * signature verification.
         */
        if (!(evm_initialized & EVM_INIT_X509) &&
            is_unsupported_hmac_fs(dentry))
                return INTEGRITY_UNKNOWN;

        /* if status is not PASS, try to check again - against -ENOMEM */

        /* first need to know the sig type */
        rc = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_EVM,
                                (char **)&xattr_data, 0, GFP_NOFS);
        if (rc <= 0) {
                evm_status = INTEGRITY_FAIL;
                if (rc == -ENODATA) {
                        rc = evm_find_protected_xattrs(dentry);
                        if (rc > 0)
                                evm_status = INTEGRITY_NOLABEL;
                        else if (rc == 0)
                                evm_status = INTEGRITY_NOXATTRS; /* new file */
                } else if (rc == -EOPNOTSUPP) {
                        evm_status = INTEGRITY_UNKNOWN;
                }
                goto out;
        }

        xattr_len = rc;

        /* check value type */
        switch (xattr_data->type) {
        case EVM_XATTR_HMAC:
                if (xattr_len != sizeof(struct evm_xattr)) {
                        evm_status = INTEGRITY_FAIL;
                        goto out;
                }

                digest.hdr.algo = HASH_ALGO_SHA1;
                rc = evm_calc_hmac(dentry, xattr_name, xattr_value,
                                   xattr_value_len, &digest, iint);
                if (rc)
                        break;
                rc = crypto_memneq(xattr_data->data, digest.digest,
                                   SHA1_DIGEST_SIZE);
                if (rc)
                        rc = -EINVAL;
                break;
        case EVM_XATTR_PORTABLE_DIGSIG:
                evm_immutable = 1;
                fallthrough;
        case EVM_IMA_XATTR_DIGSIG:
                /* accept xattr with non-empty signature field */
                if (xattr_len <= sizeof(struct signature_v2_hdr)) {
                        evm_status = INTEGRITY_FAIL;
                        goto out;
                }

                hdr = (struct signature_v2_hdr *)xattr_data;
                digest.hdr.algo = hdr->hash_algo;
                rc = evm_calc_hash(dentry, xattr_name, xattr_value,
                                   xattr_value_len, xattr_data->type, &digest,
                                   iint);
                if (rc)
                        break;
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_EVM,
                                        (const char *)xattr_data, xattr_len,
                                        digest.digest, digest.hdr.length);
                if (!rc) {
                        if (xattr_data->type == EVM_XATTR_PORTABLE_DIGSIG) {
                                if (iint)
                                        iint->flags |= EVM_IMMUTABLE_DIGSIG;
                                evm_status = INTEGRITY_PASS_IMMUTABLE;
                        } else if (!IS_RDONLY(inode) &&
                                   !(inode->i_sb->s_readonly_remount) &&
                                   !IS_IMMUTABLE(inode) &&
                                   !is_unsupported_hmac_fs(dentry)) {
                                evm_update_evmxattr(dentry, xattr_name,
                                                    xattr_value,
                                                    xattr_value_len);
                        }
                }
                break;
        default:
                rc = -EINVAL;
                break;
        }

        if (rc) {
                if (rc == -ENODATA)
                        evm_status = INTEGRITY_NOXATTRS;
                else if (evm_immutable)
                        evm_status = INTEGRITY_FAIL_IMMUTABLE;
                else
                        evm_status = INTEGRITY_FAIL;
        }
        pr_debug("digest: (%d) [%*phN]\n", digest.hdr.length, digest.hdr.length,
                  digest.digest);
out:
        if (iint)
                iint->evm_status = evm_status;
        kfree(xattr_data);
        return evm_status;
}

/*
 * evm_protected_xattr_common - look up a name in the configured xattr list
 * @req_xattr_name: xattr name to look for
 * @all_xattrs: when false, list entries whose ->enabled is clear are skipped
 *
 * An entry matches when either its full name equals @req_xattr_name, or the
 * first strlen(@req_xattr_name) bytes of the entry name past
 * XATTR_SECURITY_PREFIX_LEN compare equal to @req_xattr_name.
 *
 * Returns 1 if a matching entry is found, 0 otherwise.
 */
static int evm_protected_xattr_common(const char *req_xattr_name,
                                      bool all_xattrs)
{
        struct xattr_list *xattr;
        int namelen = strlen(req_xattr_name);

        list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) {
                if (!all_xattrs && !xattr->enabled)
                        continue;

                /* Same length and same bytes as the full configured name. */
                if (strlen(xattr->name) == namelen &&
                    !strncmp(req_xattr_name, xattr->name, namelen))
                        return 1;

                /* Compare against the configured name without its prefix. */
                if (!strncmp(req_xattr_name,
                             xattr->name + XATTR_SECURITY_PREFIX_LEN,
                             namelen))
                        return 1;
        }

        return 0;
}

/*
 * evm_protected_xattr - check a name against the enabled protected xattrs
 * @req_xattr_name: xattr name to look for
 *
 * Returns the result of evm_protected_xattr_common() with all_xattrs false,
 * i.e. list entries that are not enabled are ignored.
 */
int evm_protected_xattr(const char *req_xattr_name)
{
        const bool all_xattrs = false;

        return evm_protected_xattr_common(req_xattr_name, all_xattrs);
}

/*
 * evm_protected_xattr_if_enabled - check a name against every configured xattr
 * @req_xattr_name: xattr name to look for
 *
 * Returns the result of evm_protected_xattr_common() with all_xattrs true,
 * i.e. list entries are considered regardless of their ->enabled state.
 */
int evm_protected_xattr_if_enabled(const char *req_xattr_name)
{
        const bool all_xattrs = true;

        return evm_protected_xattr_common(req_xattr_name, all_xattrs);
}

/**
 * evm_read_protected_xattrs - read EVM protected xattr names, lengths, values
 * @dentry: dentry of the read xattrs
 * @buffer: buffer xattr names, lengths or values are copied to
 * @buffer_size: size of buffer
 * @type: n: names, l: lengths, v: values
 * @canonical_fmt: data format (true: little endian, false: native format)
 *
 * Read protected xattr names (separated by |), lengths (u32) or values for a
 * given dentry and return the total size of copied data. If buffer is NULL,
 * just return the total size.
 *
 * When @buffer is supplied, nothing is written beyond @buffer_size: if the
 * data no longer fits (for instance because the xattrs changed after the
 * caller sized the buffer with a NULL-buffer call), -ERANGE is returned.
 *
 * Returns the total size on success, a negative value on error.
 */
int evm_read_protected_xattrs(struct dentry *dentry, u8 *buffer,
                              int buffer_size, char type, bool canonical_fmt)
{
        struct xattr_list *xattr;
        int rc, size, total_size = 0;

        list_for_each_entry_lockless(xattr, &evm_config_xattrnames, list) {
                /* Size-only probe: tells whether the xattr exists. */
                rc = __vfs_getxattr(dentry, d_backing_inode(dentry),
                                    xattr->name, NULL, 0);
                if (rc == -ENODATA)
                        continue;
                if (rc < 0)
                        return rc;

                switch (type) {
                case 'n':
                        size = strlen(xattr->name) + 1;
                        if (buffer) {
                                if (size > buffer_size - total_size)
                                        return -ERANGE;

                                /*
                                 * Turn the previous name's NUL terminator
                                 * into the '|' separator.
                                 */
                                if (total_size)
                                        *(buffer + total_size - 1) = '|';

                                memcpy(buffer + total_size, xattr->name, size);
                        }
                        break;
                case 'l':
                        size = sizeof(u32);
                        if (buffer) {
                                if (size > buffer_size - total_size)
                                        return -ERANGE;

                                if (canonical_fmt)
                                        rc = (__force int)cpu_to_le32(rc);

                                *(u32 *)(buffer + total_size) = rc;
                        }
                        break;
                case 'v':
                        size = rc;
                        if (buffer) {
                                /*
                                 * Also keeps the remaining size passed below
                                 * from ever going negative.
                                 */
                                if (size > buffer_size - total_size)
                                        return -ERANGE;

                                rc = __vfs_getxattr(dentry,
                                        d_backing_inode(dentry), xattr->name,
                                        buffer + total_size,
                                        buffer_size - total_size);
                                if (rc < 0)
                                        return rc;
                        }
                        break;
                default:
                        return -EINVAL;
                }

                total_size += size;
        }

        return total_size;
}

/**
 * evm_verifyxattr - verify the integrity of the requested xattr
 * @dentry: object of the verify xattr
 * @xattr_name: requested xattr
 * @xattr_value: requested xattr value
 * @xattr_value_len: requested xattr value length
 *
 * Calculate the HMAC for the given dentry and verify it against the stored
 * security.evm xattr. For performance, the xattr value and length previously
 * retrieved by the caller are used to calculate the HMAC.
 *
 * INTEGRITY_UNKNOWN is returned without any verification when no EVM key is
 * loaded or when @xattr_name is not an EVM protected xattr.
 *
 * Returns the xattr integrity status.
 *
 * This function requires the caller to lock the inode's i_mutex before it
 * is executed.
 */
enum integrity_status evm_verifyxattr(struct dentry *dentry,
                                      const char *xattr_name,
                                      void *xattr_value, size_t xattr_value_len)
{
        if (evm_key_loaded() && evm_protected_xattr(xattr_name))
                return evm_verify_hmac(dentry, xattr_name, xattr_value,
                                       xattr_value_len);

        return INTEGRITY_UNKNOWN;
}
EXPORT_SYMBOL_GPL(evm_verifyxattr);

/*
 * evm_verify_current_integrity - verify the dentry's metadata integrity
 * @dentry: pointer to the affected dentry
 *
 * Verify and return the dentry's metadata integrity. INTEGRITY_PASS is
 * returned without verification when no EVM key is loaded, when the backing
 * inode is not a regular file, or in 'fix' mode.
 */
static enum integrity_status evm_verify_current_integrity(struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);

        if (evm_key_loaded() && S_ISREG(inode->i_mode) && !evm_fixmode)
                return evm_verify_hmac(dentry, NULL, NULL, 0);

        return INTEGRITY_PASS;
}

/*
 * evm_xattr_change - check if passed xattr value differs from current value
 * @idmap: idmap of the mount (not used for the lookup, which is done with
 *         nop_mnt_idmap)
 * @dentry: pointer to the affected dentry
 * @xattr_name: requested xattr
 * @xattr_value: requested xattr value
 * @xattr_value_len: requested xattr value length
 *
 * Read the current value of @xattr_name and compare it, length and content,
 * with the passed one. A value that cannot be read is treated as different.
 *
 * Returns 1 if passed xattr value differs from current value, 0 otherwise.
 */
static int evm_xattr_change(struct mnt_idmap *idmap,
                            struct dentry *dentry, const char *xattr_name,
                            const void *xattr_value, size_t xattr_value_len)
{
        char *cur_value = NULL;
        int differs = 1;
        int len;

        len = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, xattr_name,
                                 &cur_value, 0, GFP_NOFS);
        if (len >= 0 && len == xattr_value_len)
                differs = !!memcmp(xattr_value, cur_value, len);

        kfree(cur_value);
        return differs;
}

/*
 * evm_protect_xattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: name of the xattr being set or removed
 * @xattr_value: new xattr value (NULL on removal)
 * @xattr_value_len: length of @xattr_value (0 on removal)
 *
 * Prevent security.evm from being modified or removed without the
 * necessary permissions or when the existing value is invalid.
 *
 * The posix xattr acls are 'system' prefixed, which normally would not
 * affect security.evm.  An interesting side effect of writing posix xattr
 * acls is their modifying of the i_mode, which is included in security.evm.
 * For posix xattr acls only, permit security.evm, even if it currently
 * doesn't exist, to be updated unless the EVM signature is immutable.
 *
 * Returns 0 if the operation is permitted, -EPERM otherwise.
 */
static int evm_protect_xattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, const char *xattr_name,
                             const void *xattr_value, size_t xattr_value_len)
{
        enum integrity_status evm_status;

        if (strcmp(xattr_name, XATTR_NAME_EVM) == 0) {
                /* security.evm itself: privileged, supported filesystems only */
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
                if (is_unsupported_hmac_fs(dentry))
                        return -EPERM;
        } else if (!evm_protected_xattr(xattr_name)) {
                /* Unprotected xattr: only posix acls are of interest here. */
                if (!posix_xattr_acl(xattr_name))
                        return 0;
                if (is_unsupported_hmac_fs(dentry))
                        return 0;

                evm_status = evm_verify_current_integrity(dentry);
                if ((evm_status == INTEGRITY_PASS) ||
                    (evm_status == INTEGRITY_NOXATTRS))
                        return 0;
                /* Skip the NOXATTRS handling below, it cannot apply. */
                goto out;
        } else if (is_unsupported_hmac_fs(dentry))
                return 0;

        evm_status = evm_verify_current_integrity(dentry);
        if (evm_status == INTEGRITY_NOXATTRS) {
                struct evm_iint_cache *iint;

                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;

                /* exception for files still flagged EVM_NEW_FILE */
                iint = evm_iint_inode(d_backing_inode(dentry));
                if (iint && (iint->flags & EVM_NEW_FILE))
                        return 0;

                /* exception for pseudo filesystems */
                if (dentry->d_sb->s_magic == TMPFS_MAGIC
                    || dentry->d_sb->s_magic == SYSFS_MAGIC)
                        return 0;

                integrity_audit_msg(AUDIT_INTEGRITY_METADATA,
                                    dentry->d_inode, dentry->d_name.name,
                                    "update_metadata",
                                    integrity_status_msg[evm_status],
                                    -EPERM, 0);
        }
out:
        /* Exception if the HMAC is not going to be calculated. */
        if (evm_hmac_disabled() && (evm_status == INTEGRITY_NOLABEL ||
            evm_status == INTEGRITY_UNKNOWN))
                return 0;

        /*
         * Writing other xattrs is safe for portable signatures, as portable
         * signatures are immutable and can never be updated.
         */
        if (evm_status == INTEGRITY_FAIL_IMMUTABLE)
                return 0;

        /* A valid immutable signature only tolerates a no-op write. */
        if (evm_status == INTEGRITY_PASS_IMMUTABLE &&
            !evm_xattr_change(idmap, dentry, xattr_name, xattr_value,
                              xattr_value_len))
                return 0;

        if (evm_status != INTEGRITY_PASS &&
            evm_status != INTEGRITY_PASS_IMMUTABLE)
                integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                                    dentry->d_name.name, "appraise_metadata",
                                    integrity_status_msg[evm_status],
                                    -EPERM, 0);
        return evm_status == INTEGRITY_PASS ? 0 : -EPERM;
}

/**
 * evm_inode_setxattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: length of the new extended attribute value
 * @flags: flags to pass into filesystem operations
 *
 * Before allowing the 'security.evm' protected xattr to be updated,
 * verify the existing value is valid.  As only the kernel should have
 * access to the EVM encrypted key needed to calculate the HMAC, prevent
 * userspace from writing an HMAC value: only signature types are accepted
 * for 'security.evm'.  Writing 'security.evm' requires CAP_SYS_ADMIN
 * privileges.
 *
 * Return: zero if the update is permitted, -EINVAL for an empty
 * 'security.evm' value, -EPERM otherwise.
 */
static int evm_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                              const char *xattr_name, const void *xattr_value,
                              size_t xattr_value_len, int flags)
{
        const struct evm_ima_xattr_data *xattr_data = xattr_value;

        /*
         * Policy permits modification of the protected xattrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        if (!strcmp(xattr_name, XATTR_NAME_EVM)) {
                if (xattr_value_len == 0)
                        return -EINVAL;

                switch (xattr_data->type) {
                case EVM_IMA_XATTR_DIGSIG:
                case EVM_XATTR_PORTABLE_DIGSIG:
                        break;
                default:
                        return -EPERM;
                }
        }

        return evm_protect_xattr(idmap, dentry, xattr_name, xattr_value,
                                 xattr_value_len);
}

/**
 * evm_inode_removexattr - protect the EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Removing 'security.evm' requires CAP_SYS_ADMIN privileges and that
 * the current value is valid.
 *
 * Return: zero if the removal is permitted, -EPERM otherwise.
 */
static int evm_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 const char *xattr_name)
{
        /*
         * Unless policy permits modification of the protected xattrs even
         * though there's no HMAC key loaded, apply the regular protection.
         */
        if (!(evm_initialized & EVM_ALLOW_METADATA_WRITES))
                return evm_protect_xattr(idmap, dentry, xattr_name, NULL, 0);

        return 0;
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * evm_inode_set_acl_change - check whether setting a posix acl changes i_mode
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @name: name of the posix acl (unused)
 * @kacl: pointer to the posix acls, NULL on removal
 *
 * Returns 1 if @kacl is NULL, if posix_acl_update_mode() fails, or if the
 * mode it computes differs from the inode's current i_mode; 0 otherwise.
 */
static int evm_inode_set_acl_change(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *name,
                                    struct posix_acl *kacl)
{
        struct inode *inode = d_backing_inode(dentry);
        umode_t mode;

        if (!kacl)
                return 1;

        if (posix_acl_update_mode(idmap, inode, &mode, &kacl))
                return 1;

        return inode->i_mode != mode;
}
#else
/* Without CONFIG_FS_POSIX_ACL no change is ever reported. */
static inline int evm_inode_set_acl_change(struct mnt_idmap *idmap,
                                           struct dentry *dentry,
                                           const char *name,
                                           struct posix_acl *kacl)
{
        return 0;
}
#endif

/**
 * evm_inode_set_acl - protect the EVM extended attribute from posix acls
 * @idmap: idmap of the idmapped mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 * @kacl: pointer to the posix acls
 *
 * Prevent modifying posix acls causing the EVM HMAC to be re-calculated
 * and 'security.evm' xattr updated, unless the existing 'security.evm' is
 * valid.
 *
 * Return: zero on success, -EPERM on failure.
 */
static int evm_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                             const char *acl_name, struct posix_acl *kacl)
{
        enum integrity_status evm_status;

        /*
         * Policy permits modification of the protected xattrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        evm_status = evm_verify_current_integrity(dentry);

        switch (evm_status) {
        case INTEGRITY_PASS:
        case INTEGRITY_NOXATTRS:
                return 0;
        case INTEGRITY_FAIL_IMMUTABLE:
                /*
                 * Writing other xattrs is safe for portable signatures, as
                 * portable signatures are immutable and can never be updated.
                 */
                return 0;
        case INTEGRITY_NOLABEL:
        case INTEGRITY_UNKNOWN:
                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;
                break;
        case INTEGRITY_PASS_IMMUTABLE:
                /* Only an acl that leaves the mode untouched is allowed. */
                if (!evm_inode_set_acl_change(idmap, dentry, acl_name, kacl))
                        return 0;
                /* Denied without an audit record. */
                return -EPERM;
        default:
                break;
        }

        integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                            dentry->d_name.name, "appraise_metadata",
                            integrity_status_msg[evm_status],
                            -EPERM, 0);
        return -EPERM;
}

/**
 * evm_inode_remove_acl - Protect the EVM extended attribute from posix acls
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 *
 * Prevent removing posix acls causing the EVM HMAC to be re-calculated
 * and 'security.evm' xattr updated, unless the existing 'security.evm' is
 * valid. Removal is handled as setting a NULL acl.
 *
 * Return: zero on success, -EPERM on failure.
 */
static int evm_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                                const char *acl_name)
{
        struct posix_acl *no_acl = NULL;

        return evm_inode_set_acl(idmap, dentry, acl_name, no_acl);
}

/*
 * evm_reset_status - forget the cached EVM verification result of an inode
 * @inode: inode whose cached status is reset
 *
 * Sets the cached evm_status to INTEGRITY_UNKNOWN; does nothing when the
 * inode has no EVM iint.
 */
static void evm_reset_status(struct inode *inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (!iint)
                return;

        iint->evm_status = INTEGRITY_UNKNOWN;
}

/**
 * evm_metadata_changed: Detect changes to the metadata
 * @inode: a file's inode
 * @metadata_inode: metadata inode
 *
 * On a stacked filesystem detect whether the metadata has changed. If this is
 * the case reset the evm_status associated with the inode that represents the
 * file.
 *
 * Return: true if @inode has an EVM iint and either @metadata_inode is not
 * IS_I_VERSION or its attributes differ from the ones recorded in the iint;
 * false otherwise.
 */
bool evm_metadata_changed(struct inode *inode, struct inode *metadata_inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (!iint)
                return false;

        if (IS_I_VERSION(metadata_inode) &&
            !integrity_inode_attrs_changed(&iint->metadata_inode,
                                           metadata_inode))
                return false;

        iint->evm_status = INTEGRITY_UNKNOWN;
        return true;
}

/**
 * evm_revalidate_status - report whether EVM status re-validation is necessary
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Report whether callers of evm_verifyxattr() should re-validate the
 * EVM status. With an EVM key loaded this is the case for a NULL name, for
 * EVM protected xattrs, for posix acls and for 'security.evm' itself.
 *
 * Return true if re-validation is necessary, false otherwise.
 */
bool evm_revalidate_status(const char *xattr_name)
{
        if (!evm_key_loaded())
                return false;

        /* evm_inode_post_setattr() passes NULL */
        if (!xattr_name)
                return true;

        return evm_protected_xattr(xattr_name) ||
               posix_xattr_acl(xattr_name) ||
               !strcmp(xattr_name, XATTR_NAME_EVM);
}

/**
 * evm_inode_post_setxattr - update 'security.evm' to reflect the changes
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 * @xattr_value: pointer to the new extended attribute value
 * @xattr_value_len: length of the new extended attribute value
 * @flags: flags to pass into filesystem operations
 *
 * Reset the cached EVM status and update the HMAC stored in 'security.evm'
 * to reflect the change. The HMAC is left alone when 'security.evm' itself
 * was written, when EVM_INIT_HMAC is not set, or on filesystems flagged as
 * not supporting EVM HMACs.
 *
 * No need to take the i_mutex lock here, as this function is called from
 * __vfs_setxattr_noperm().  The caller of which has taken the inode's
 * i_mutex lock.
 */
static void evm_inode_post_setxattr(struct dentry *dentry,
                                    const char *xattr_name,
                                    const void *xattr_value,
                                    size_t xattr_value_len,
                                    int flags)
{
        if (!evm_revalidate_status(xattr_name))
                return;

        evm_reset_status(dentry->d_inode);

        if (strcmp(xattr_name, XATTR_NAME_EVM) &&
            (evm_initialized & EVM_INIT_HMAC) &&
            !is_unsupported_hmac_fs(dentry))
                evm_update_evmxattr(dentry, xattr_name, xattr_value,
                                    xattr_value_len);
}

/**
 * evm_inode_post_set_acl - Update the EVM extended attribute from posix acls
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 * @kacl: pointer to the posix acls
 *
 * Update the 'security.evm' xattr with the EVM HMAC re-calculated after setting
 * posix acls. The acl is handled as an xattr named @acl_name with no value.
 */
static void evm_inode_post_set_acl(struct dentry *dentry, const char *acl_name,
                                   struct posix_acl *kacl)
{
        evm_inode_post_setxattr(dentry, acl_name, NULL, 0, 0);
}

/**
 * evm_inode_post_removexattr - update 'security.evm' after removing the xattr
 * @dentry: pointer to the affected dentry
 * @xattr_name: pointer to the affected extended attribute name
 *
 * Reset the cached EVM status and update the HMAC stored in 'security.evm'
 * to reflect removal of the xattr. The HMAC is left alone when 'security.evm'
 * itself was removed, when EVM_INIT_HMAC is not set, or on filesystems
 * flagged as not supporting EVM HMACs.
 *
 * No need to take the i_mutex lock here, as this function is called from
 * vfs_removexattr() which takes the i_mutex.
 */
static void evm_inode_post_removexattr(struct dentry *dentry,
                                       const char *xattr_name)
{
        if (!evm_revalidate_status(xattr_name))
                return;

        evm_reset_status(dentry->d_inode);

        if (!strcmp(xattr_name, XATTR_NAME_EVM))
                return;

        if (!(evm_initialized & EVM_INIT_HMAC))
                return;

        /*
         * Same guard as the post-setxattr and post-setattr paths: never
         * write an HMAC on a filesystem that does not support it.
         */
        if (is_unsupported_hmac_fs(dentry))
                return;

        evm_update_evmxattr(dentry, xattr_name, NULL, 0);
}

/**
 * evm_inode_post_remove_acl - Update the EVM extended attribute from posix acls
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @acl_name: name of the posix acl
 *
 * Update the 'security.evm' xattr with the EVM HMAC re-calculated after
 * removing posix acls. The acl is handled as a removed xattr named
 * @acl_name; @idmap is not used.
 */
static inline void evm_inode_post_remove_acl(struct mnt_idmap *idmap,
                                             struct dentry *dentry,
                                             const char *acl_name)
{
        const char *xattr_name = acl_name;

        evm_inode_post_removexattr(dentry, xattr_name);
}

/*
 * evm_attr_change - check whether a setattr request changes owner or mode
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @attr: iattr structure containing the new file attributes
 *
 * Returns 1 if the uid or gid needs an update, or if ATTR_MODE is requested
 * with a mode different from the inode's current i_mode; 0 otherwise.
 */
static int evm_attr_change(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *attr)
{
        struct inode *inode = d_backing_inode(dentry);

        if (i_uid_needs_update(idmap, attr, inode))
                return 1;

        if (i_gid_needs_update(idmap, attr, inode))
                return 1;

        if ((attr->ia_valid & ATTR_MODE) && attr->ia_mode != inode->i_mode)
                return 1;

        return 0;
}

/**
 * evm_inode_setattr - prevent updating an invalid EVM extended attribute
 * @idmap: idmap of the mount
 * @dentry: pointer to the affected dentry
 * @attr: iattr structure containing the new file attributes
 *
 * Permit update of file attributes when files have a valid EVM signature,
 * except in the case of them having an immutable portable signature. Only
 * requests touching the mode, uid or gid are checked.
 *
 * Return: zero if the update is permitted, -EPERM otherwise.
 */
static int evm_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                             struct iattr *attr)
{
        enum integrity_status evm_status;

        /*
         * Policy permits modification of the protected attrs even though
         * there's no HMAC key loaded
         */
        if (evm_initialized & EVM_ALLOW_METADATA_WRITES)
                return 0;

        if (is_unsupported_hmac_fs(dentry))
                return 0;

        if (!(attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
                return 0;

        evm_status = evm_verify_current_integrity(dentry);

        switch (evm_status) {
        case INTEGRITY_PASS:
        case INTEGRITY_NOXATTRS:
                return 0;
        case INTEGRITY_FAIL_IMMUTABLE:
                /*
                 * Writing attrs is safe for portable signatures, as portable
                 * signatures are immutable and can never be updated.
                 */
                return 0;
        case INTEGRITY_NOLABEL:
        case INTEGRITY_UNKNOWN:
                /* Exception if the HMAC is not going to be calculated. */
                if (evm_hmac_disabled())
                        return 0;
                break;
        case INTEGRITY_PASS_IMMUTABLE:
                /* A request that changes nothing is harmless. */
                if (!evm_attr_change(idmap, dentry, attr))
                        return 0;
                break;
        default:
                break;
        }

        integrity_audit_msg(AUDIT_INTEGRITY_METADATA, d_backing_inode(dentry),
                            dentry->d_name.name, "appraise_metadata",
                            integrity_status_msg[evm_status], -EPERM, 0);
        return -EPERM;
}

/**
 * evm_inode_post_setattr - update 'security.evm' after modifying metadata
 * @idmap: idmap of the idmapped mount
 * @dentry: pointer to the affected dentry
 * @ia_valid: for the UID and GID status
 *
 * Reset the cached EVM status and, for now, update the HMAC stored in
 * 'security.evm' to reflect mode/UID/GID changes. The HMAC is left alone
 * when EVM_INIT_HMAC is not set or on filesystems flagged as not supporting
 * EVM HMACs.
 *
 * This function is called from notify_change(), which expects the caller
 * to lock the inode's i_mutex.
 */
static void evm_inode_post_setattr(struct mnt_idmap *idmap,
                                   struct dentry *dentry, int ia_valid)
{
        if (!evm_revalidate_status(NULL))
                return;

        evm_reset_status(dentry->d_inode);

        if ((evm_initialized & EVM_INIT_HMAC) &&
            !is_unsupported_hmac_fs(dentry) &&
            (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)))
                evm_update_evmxattr(dentry, NULL, NULL, 0);
}

/*
 * evm_inode_copy_up_xattr - decide whether security.evm may be copied up
 * @src: dentry the xattr is read from
 * @name: name of the xattr about to be copied up
 *
 * Only portable signatures are allowed to be copied up; HMACs and any other
 * security.evm type are discarded.
 *
 * Returns -EOPNOTSUPP if @name is not security.evm, -EPERM if security.evm
 * cannot be read, is empty or is too short to hold a type, 0 to allow the
 * copy-up, -ECANCELED to discard the xattr.
 */
static int evm_inode_copy_up_xattr(struct dentry *src, const char *name)
{
        struct evm_ima_xattr_data *xattr_data = NULL;
        int rc;

        if (strcmp(name, XATTR_NAME_EVM) != 0)
                return -EOPNOTSUPP;

        /* first need to know the sig type */
        rc = vfs_getxattr_alloc(&nop_mnt_idmap, src, XATTR_NAME_EVM,
                                (char **)&xattr_data, 0, GFP_NOFS);
        if (rc <= 0) {
                rc = -EPERM;
                goto out;
        }

        if (rc < offsetof(struct evm_ima_xattr_data, type) +
                 sizeof(xattr_data->type)) {
                rc = -EPERM;
                goto out;
        }

        switch (xattr_data->type) {
        case EVM_XATTR_PORTABLE_DIGSIG:
                rc = 0; /* allow copy-up */
                break;
        case EVM_XATTR_HMAC:
        case EVM_IMA_XATTR_DIGSIG:
        default:
                rc = -ECANCELED; /* discard */
        }

out:
        /*
         * Every exit taken after the read goes through here so that the
         * buffer is never leaked; kfree(NULL) is a no-op.
         */
        kfree(xattr_data);
        return rc;
}

/*
 * evm_inode_init_security - initializes security.evm HMAC value
 * @inode: inode the HMAC is initialized for
 * @dir: parent directory inode (unused)
 * @qstr: name of the new object (unused)
 * @xattrs: NULL-name terminated array of the xattrs set so far
 * @xattr_count: number of filled slots in @xattrs
 *
 * Nothing is done unless EVM_INIT_HMAC is set, @xattrs is supplied and at
 * least one of the xattrs is EVM protected. Otherwise an HMAC xattr is
 * allocated and stored in the next free slot of @xattrs.
 *
 * Returns 0 on success or when no security.evm is needed, -ENOMEM if the
 * allocation fails, or the negative value returned by evm_init_hmac().
 */
int evm_inode_init_security(struct inode *inode, struct inode *dir,
                            const struct qstr *qstr, struct xattr *xattrs,
                            int *xattr_count)
{
        struct evm_xattr *hmac_xattr;
        struct xattr *cur, *slot;
        bool need_evm = false;
        int rc;

        if (!xattrs || !(evm_initialized & EVM_INIT_HMAC))
                return 0;

        /*
         * security_inode_init_security() makes sure that the xattrs array is
         * contiguous, there is enough space for security.evm, and that there is
         * a terminator at the end of the array.
         *
         * The walk always runs up to the terminator, as its position is
         * checked against the returned slot below.
         */
        cur = xattrs;
        while (cur->name) {
                if (evm_protected_xattr(cur->name))
                        need_evm = true;
                cur++;
        }

        /* EVM xattr not needed. */
        if (!need_evm)
                return 0;

        slot = lsm_get_xattr_slot(xattrs, xattr_count);
        /*
         * Array terminator (xattr name = NULL) must be the first non-filled
         * xattr slot.
         */
        WARN_ONCE(slot != cur,
                  "%s: xattrs terminator is not the first non-filled slot\n",
                  __func__);

        hmac_xattr = kzalloc(sizeof(*hmac_xattr), GFP_NOFS);
        if (!hmac_xattr)
                return -ENOMEM;

        hmac_xattr->data.type = EVM_XATTR_HMAC;
        rc = evm_init_hmac(inode, xattrs, hmac_xattr->digest);
        if (rc < 0) {
                kfree(hmac_xattr);
                return rc;
        }

        slot->name = XATTR_EVM_SUFFIX;
        slot->value = hmac_xattr;
        slot->value_len = sizeof(*hmac_xattr);
        return 0;
}
EXPORT_SYMBOL_GPL(evm_inode_init_security);

/* Put the per-inode EVM cache into its initial state; always returns 0. */
static int evm_inode_alloc_security(struct inode *inode)
{
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        /* Called by security_inode_alloc(), it cannot be NULL. */
        iint->evm_status = INTEGRITY_UNKNOWN;
        iint->flags = 0UL;

        return 0;
}

/*
 * Clear EVM_NEW_FILE when a regular file opened for writing is released and
 * the inode's write count reads 1.
 */
static void evm_file_release(struct file *file)
{
        struct inode *inode = file_inode(file);
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        /* Only writable opens of regular files are of interest. */
        if (!(file->f_mode & FMODE_WRITE) || !S_ISREG(inode->i_mode))
                return;

        if (!iint || !(iint->flags & EVM_NEW_FILE))
                return;

        if (atomic_read(&inode->i_writecount) != 1)
                return;

        iint->flags &= ~EVM_NEW_FILE;
}

/* Flag a regular file's EVM cache with EVM_NEW_FILE after mknod. */
static void evm_post_path_mknod(struct mnt_idmap *idmap, struct dentry *dentry)
{
        struct inode *inode = d_backing_inode(dentry);
        struct evm_iint_cache *iint = evm_iint_inode(inode);

        if (S_ISREG(inode->i_mode) && iint)
                iint->flags |= EVM_NEW_FILE;
}

#ifdef CONFIG_EVM_LOAD_X509
/*
 * Load the X.509 certificate at CONFIG_EVM_X509_PATH onto the EVM keyring and
 * record EVM_INIT_X509 in evm_initialized when loading reports success (0).
 */
void __init evm_load_x509(void)
{
        if (integrity_load_x509(INTEGRITY_KEYRING_EVM, CONFIG_EVM_X509_PATH))
                return;

        evm_initialized |= EVM_INIT_X509;
}
#endif

/*
 * Late EVM initialisation: set up the configured xattr list, the EVM keyring
 * and the securityfs entries.  On any failure the entries of
 * evm_config_xattrnames are unlinked again and the error is returned.
 */
static int __init init_evm(void)
{
        struct list_head *cur, *tmp;
        int error;

        evm_init_config();

        error = integrity_init_keyring(INTEGRITY_KEYRING_EVM);
        if (!error) {
                error = evm_init_secfs();
                if (error < 0)
                        pr_info("Error registering secfs\n");
        }

        if (error && !list_empty(&evm_config_xattrnames)) {
                list_for_each_safe(cur, tmp, &evm_config_xattrnames)
                        list_del(cur);
        }

        return error;
}

/*
 * LSM hooks implemented by EVM.  The table is handed to security_add_hooks()
 * by init_evm_lsm().
 */
static struct security_hook_list evm_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(inode_setattr, evm_inode_setattr),
        LSM_HOOK_INIT(inode_post_setattr, evm_inode_post_setattr),
        LSM_HOOK_INIT(inode_copy_up_xattr, evm_inode_copy_up_xattr),
        LSM_HOOK_INIT(inode_setxattr, evm_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, evm_inode_post_setxattr),
        LSM_HOOK_INIT(inode_set_acl, evm_inode_set_acl),
        LSM_HOOK_INIT(inode_post_set_acl, evm_inode_post_set_acl),
        LSM_HOOK_INIT(inode_remove_acl, evm_inode_remove_acl),
        LSM_HOOK_INIT(inode_post_remove_acl, evm_inode_post_remove_acl),
        LSM_HOOK_INIT(inode_removexattr, evm_inode_removexattr),
        LSM_HOOK_INIT(inode_post_removexattr, evm_inode_post_removexattr),
        LSM_HOOK_INIT(inode_init_security, evm_inode_init_security),
        LSM_HOOK_INIT(inode_alloc_security, evm_inode_alloc_security),
        LSM_HOOK_INIT(file_release, evm_file_release),
        LSM_HOOK_INIT(path_post_mknod, evm_post_path_mknod),
};

/* LSM identity passed to security_add_hooks() along with evm_hooks. */
static const struct lsm_id evm_lsmid = {
        .name = "evm",
        .id = LSM_ID_EVM,
};

/* LSM init callback (see DEFINE_LSM(evm)): registers evm_hooks; returns 0. */
static int __init init_evm_lsm(void)
{
        security_add_hooks(evm_hooks, ARRAY_SIZE(evm_hooks), &evm_lsmid);
        return 0;
}

/*
 * Blob requirements EVM declares to the LSM framework: one
 * struct evm_iint_cache per inode and one xattr slot.
 * NOTE(review): the xattr slot is presumably the one
 * evm_inode_init_security() fills via lsm_get_xattr_slot() - confirm.
 */
struct lsm_blob_sizes evm_blob_sizes __ro_after_init = {
        .lbs_inode = sizeof(struct evm_iint_cache),
        .lbs_xattr_count = 1,
};

/*
 * LSM descriptor for EVM: init_evm_lsm() is the init callback,
 * evm_blob_sizes describes the blobs, and the order is LSM_ORDER_LAST.
 */
DEFINE_LSM(evm) = {
        .name = "evm",
        .init = init_evm_lsm,
        .order = LSM_ORDER_LAST,
        .blobs = &evm_blob_sizes,
};

late_initcall(init_evm);















    6 




    5 




   87 
   88 

   88 

























    1 



    1 



















    1 




    1 
























    1 













    1 












































    1 






























   25 






   25 
















   12 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
// SPDX-License-Identifier: GPL-2.0-only
/*
 * KVM dirty ring implementation
 *
 * Copyright 2019 Red Hat, Inc.
 */
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/vmalloc.h>
#include <linux/kvm_dirty_ring.h>
#include <trace/events/kvm.h>
#include "kvm_mm.h"

/*
 * Weak default: contributes 0 extra reserved entries (see
 * kvm_dirty_ring_get_rsvd_entries()).  Being __weak, a non-weak definition
 * elsewhere replaces it.
 */
int __weak kvm_cpu_dirty_log_size(void)
{
        return 0;
}

/*
 * Number of ring entries held in reserve: the fixed
 * KVM_DIRTY_RING_RSVD_ENTRIES plus whatever kvm_cpu_dirty_log_size() reports.
 */
u32 kvm_dirty_ring_get_rsvd_entries(void)
{
        u32 rsvd = KVM_DIRTY_RING_RSVD_ENTRIES;

        rsvd += kvm_cpu_dirty_log_size();
        return rsvd;
}

bool kvm_use_dirty_bitmap(struct kvm *kvm)
{
        lockdep_assert_held(&kvm->slots_lock);

        return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap;
}

#ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
/*
 * Default used when CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP is not set:
 * always answers false.
 */
bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm)
{
        return false;
}
#endif

/* Distance between the producer (dirty) and consumer (reset) indices. */
static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring)
{
        u32 dirty = READ_ONCE(ring->dirty_index);
        u32 reset = READ_ONCE(ring->reset_index);

        return dirty - reset;
}

static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring)
{
        return kvm_dirty_ring_used(ring) >= ring->soft_limit;
}

static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring)
{
        return kvm_dirty_ring_used(ring) >= ring->size;
}

/*
 * Re-enable dirty logging for the pages selected by @mask, relative to
 * @offset, in the memslot encoded by @slot (address-space id in the upper
 * 16 bits, slot id in the lower 16).  Silently does nothing when @mask is
 * empty, the ids are out of range, the memslot does not exist, or the highest
 * selected page lies outside the memslot.
 */
static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask)
{
        struct kvm_memory_slot *memslot;
        int as_id, id;

        if (!mask)
                return;

        id = (u16)slot;
        as_id = slot >> 16;

        if (as_id >= kvm_arch_nr_memslot_as_ids(kvm))
                return;
        if (id >= KVM_USER_MEM_SLOTS)
                return;

        memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
        if (!memslot)
                return;

        /* The highest set bit of the mask must still fall inside the slot. */
        if (offset + __fls(mask) >= memslot->npages)
                return;

        KVM_MMU_LOCK(kvm);
        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask);
        KVM_MMU_UNLOCK(kvm);
}

/*
 * Allocate @size zeroed bytes for the ring's entry array and initialise the
 * bookkeeping fields.  @size is in bytes; the entry count is derived from it.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size)
{
        u32 nr_entries = size / sizeof(struct kvm_dirty_gfn);

        ring->dirty_gfns = vzalloc(size);
        if (!ring->dirty_gfns)
                return -ENOMEM;

        ring->index = index;
        ring->reset_index = 0;
        ring->dirty_index = 0;
        ring->size = nr_entries;
        ring->soft_limit = nr_entries - kvm_dirty_ring_get_rsvd_entries();

        return 0;
}

/* Clear all flags of @gfn with release semantics. */
static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn)
{
        smp_store_release(&gfn->flags, 0);
}

/*
 * Mark @gfn as dirty.  This is a plain store; any ordering against the
 * entry's payload is the caller's job (kvm_dirty_ring_push() issues smp_wmb()
 * first).
 */
static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn)
{
        gfn->flags = KVM_DIRTY_GFN_F_DIRTY;
}

static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn)
{
        return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET;
}

/*
 * Consume ring entries starting at ring->reset_index for as long as they
 * carry KVM_DIRTY_GFN_F_RESET, invalidating each one and re-enabling dirty
 * logging for the pages they describe.
 *
 * Consecutive entries in the same slot whose offsets fit inside one
 * BITS_PER_LONG-wide window are coalesced into a single (offset, mask) pair,
 * so kvm_reset_dirty_gfn() is called once per window instead of once per
 * entry.
 *
 * Returns the number of entries consumed.
 */
int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring)
{
        u32 cur_slot, next_slot;
        u64 cur_offset, next_offset;
        unsigned long mask;
        int count = 0;
        struct kvm_dirty_gfn *entry;
        bool first_round = true;

        /* This is only needed to make compilers happy */
        cur_slot = cur_offset = mask = 0;

        while (true) {
                /* The index is masked with size - 1 to wrap around the ring. */
                entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)];

                /* Stop at the first entry that is not flagged for reset. */
                if (!kvm_dirty_gfn_harvested(entry))
                        break;

                next_slot = READ_ONCE(entry->slot);
                next_offset = READ_ONCE(entry->offset);

                /* Update the flags to reflect that this GFN is reset */
                kvm_dirty_gfn_set_invalid(entry);

                ring->reset_index++;
                count++;
                /*
                 * Try to coalesce the reset operations when the guest is
                 * scanning pages in the same slot.
                 */
                if (!first_round && next_slot == cur_slot) {
                        s64 delta = next_offset - cur_offset;

                        /* Forward visit inside the current window: add a bit. */
                        if (delta >= 0 && delta < BITS_PER_LONG) {
                                mask |= 1ull << delta;
                                continue;
                        }

                        /* Backwards visit, careful about overflows!  */
                        if (delta > -BITS_PER_LONG && delta < 0 &&
                            (mask << -delta >> -delta) == mask) {
                                /*
                                 * Shifting lost no bits, so the window can be
                                 * rebased to start at the new, lower offset.
                                 */
                                cur_offset = next_offset;
                                mask = (mask << -delta) | 1;
                                continue;
                        }
                }
                /*
                 * Flush the accumulated window (a no-op while mask is still 0
                 * on the first round) and start a new one at this entry.
                 */
                kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);
                cur_slot = next_slot;
                cur_offset = next_offset;
                mask = 1;
                first_round = false;
        }

        /* Flush whatever window is still pending. */
        kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask);

        /*
         * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared
         * by the VCPU thread next time when it enters the guest.
         */

        trace_kvm_dirty_ring_reset(ring);

        return count;
}

/*
 * Append a (@slot, @offset) entry to @vcpu's dirty ring, mark it dirty and
 * advance dirty_index.  Raises KVM_REQ_DIRTY_RING_SOFT_FULL on the vCPU once
 * the ring has reached its soft limit.
 */
void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset)
{
        struct kvm_dirty_ring *ring = &vcpu->dirty_ring;
        struct kvm_dirty_gfn *entry;

        /* It should never get full */
        WARN_ON_ONCE(kvm_dirty_ring_full(ring));

        /* The index is masked with size - 1 to wrap around the ring. */
        entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)];

        entry->slot = slot;
        entry->offset = offset;
        /*
         * Make sure the data is filled in before we publish this to
         * the userspace program.  There's no paired kernel-side reader.
         */
        smp_wmb();
        kvm_dirty_gfn_set_dirtied(entry);
        ring->dirty_index++;
        trace_kvm_dirty_ring_push(ring, slot, offset);

        if (kvm_dirty_ring_soft_full(ring))
                kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu);
}

/*
 * Handle a pending KVM_REQ_DIRTY_RING_SOFT_FULL request on @vcpu.
 *
 * Returns true, with exit_reason set to KVM_EXIT_DIRTY_RING_FULL, when the
 * request was pending and the ring is still soft full; false otherwise.
 */
bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu)
{
        if (!kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu))
                return false;

        if (!kvm_dirty_ring_soft_full(&vcpu->dirty_ring))
                return false;

        /*
         * The VCPU isn't runnable when the dirty ring becomes soft full.
         * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent
         * the VCPU from running until the dirty pages are harvested and
         * the dirty ring is reset by userspace.
         */
        kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu);
        vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL;
        trace_kvm_dirty_ring_exit(vcpu);

        return true;
}

/* Return the page backing the ring's entry array at page index @offset. */
struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset)
{
        void *addr = (void *)ring->dirty_gfns + offset * PAGE_SIZE;

        return vmalloc_to_page(addr);
}

/* Release the ring's entry array and clear the pointer to it. */
void kvm_dirty_ring_free(struct kvm_dirty_ring *ring)
{
        vfree(ring->dirty_gfns);
        ring->dirty_gfns = NULL;
}




































 1521 






  316 







  123 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 Huawei Ltd.
 * Author: Jiang Liu <liuj97@gmail.com>
 *
 * Based on arch/arm/include/asm/jump_label.h
 */
#ifndef __ASM_JUMP_LABEL_H
#define __ASM_JUMP_LABEL_H

#ifndef __ASSEMBLY__

#include <linux/types.h>
#include <asm/insn.h>

/* NOTE(review): presumably advertises batched jump-label patching - confirm. */
#define HAVE_JUMP_LABEL_BATCH
/* Size of the instruction at a jump-label site: one AArch64 instruction. */
#define JUMP_LABEL_NOP_SIZE                AARCH64_INSN_SIZE

/*
 * Assembly fragment that appends one record to the __jump_table section:
 * two .long values holding the offsets of local label "1" and of @label
 * relative to the record, followed by a .quad holding the relative offset
 * of @key.  The enclosing asm must define label "1" before this fragment.
 */
#define JUMP_TABLE_ENTRY(key, label)                        \
        ".pushsection        __jump_table, \"aw\"\n\t"        \
        ".align                3\n\t"                                \
        ".long                1b - ., " label " - .\n\t"        \
        ".quad                " key " - .\n\t"                \
        ".popsection\n\t"

/*
 * A nop at local label "1" plus the matching __jump_table record for
 * (@key, @label).
 *
 * This macro is also expanded on the Rust side.
 */
#define ARCH_STATIC_BRANCH_ASM(key, label)                \
        "1:        nop\n\t"                                \
        JUMP_TABLE_ENTRY(key, label)

/*
 * Static-branch site whose instruction is a nop as emitted.
 *
 * Returns false when execution falls through the nop and true when control
 * reaches l_yes.
 * NOTE(review): the nop is presumably rewritten at run time into a branch to
 * l_yes through the __jump_table record - confirm against the patching code.
 */
static __always_inline bool arch_static_branch(struct static_key * const key,
                                               const bool branch)
{
        /* Key address plus @branch (0 or 1) as a byte offset. */
        char *k = &((char *)key)[branch];

        asm goto(
                ARCH_STATIC_BRANCH_ASM("%c0", "%l[l_yes]")
                :  :  "i"(k) :  : l_yes
                );

        return false;
l_yes:
        return true;
}

/*
 * Static-branch site whose instruction is an unconditional branch to l_yes
 * as emitted, so the function returns true unless the site is altered.
 * NOTE(review): the branch is presumably rewritten at run time into a nop
 * through the __jump_table record - confirm against the patching code.
 */
static __always_inline bool arch_static_branch_jump(struct static_key * const key,
                                                    const bool branch)
{
        /* Key address plus @branch (0 or 1) as a byte offset. */
        char *k = &((char *)key)[branch];

        asm goto(
                "1:        b                %l[l_yes]                \n\t"
                JUMP_TABLE_ENTRY("%c0", "%l[l_yes]")
                :  :  "i"(k) :  : l_yes
                );
        return false;
l_yes:
        return true;
}

#endif  /* __ASSEMBLY__ */
#endif        /* __ASM_JUMP_LABEL_H */











































































































   14 










   14 





















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
/* SPDX-License-Identifier: GPL-2.0
 *
 *        Network memory
 *
 *        Author:        Mina Almasry <almasrymina@google.com>
 */

#ifndef _NET_NETMEM_H
#define _NET_NETMEM_H

#include <linux/mm.h>
#include <net/net_debug.h>

/* net_iov */

DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers);

/*  We overload the LSB of the struct page pointer to indicate whether it's
 *  a page or net_iov.
 */
#define NET_IOV 0x01UL

/*
 * Descriptor for network memory that is not backed by a struct page.
 * pp_magic, pp, dma_addr and pp_ref_count must sit at the same offsets as
 * the fields of the same name in struct page; NET_IOV_ASSERT_OFFSET checks
 * this at build time.
 */
struct net_iov {
        /* NOTE(review): presumably only shifts pp_magic to its struct page offset */
        unsigned long __unused_padding;
        unsigned long pp_magic;
        struct page_pool *pp;
        /* Area this net_iov belongs to; see net_iov_owner(). */
        struct net_iov_area *owner;
        unsigned long dma_addr;
        atomic_long_t pp_ref_count;
};

/* A contiguous array of net_iovs; net_iov_idx() indexes into @niovs. */
struct net_iov_area {
        /* Array of net_iovs for this area. */
        struct net_iov *niovs;
        /* Number of elements in @niovs. */
        size_t num_niovs;

        /* Offset into the dma-buf where this chunk starts.  */
        unsigned long base_virtual;
};

/* These fields in struct page are used by the page_pool and net stack:
 *
 *        struct {
 *                unsigned long pp_magic;
 *                struct page_pool *pp;
 *                unsigned long _pp_mapping_pad;
 *                unsigned long dma_addr;
 *                atomic_long_t pp_ref_count;
 *        };
 *
 * We mirror the page_pool fields here so the page_pool can access these fields
 * without worrying whether the underlying fields belong to a page or net_iov.
 *
 * The non-net stack fields of struct page are private to the mm stack and must
 * never be mirrored to net_iov.
 */
/*
 * Build-time check that field @pg of struct page and field @iov of
 * struct net_iov sit at the same offset.  The macro is only needed for the
 * four checks below and is undefined right after them.
 */
#define NET_IOV_ASSERT_OFFSET(pg, iov)             \
        static_assert(offsetof(struct page, pg) == \
                      offsetof(struct net_iov, iov))
NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic);
NET_IOV_ASSERT_OFFSET(pp, pp);
NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr);
NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count);
#undef NET_IOV_ASSERT_OFFSET

/* Return the net_iov_area that @niov belongs to. */
static inline struct net_iov_area *net_iov_owner(const struct net_iov *niov)
{
        return niov->owner;
}

static inline unsigned int net_iov_idx(const struct net_iov *niov)
{
        return niov - net_iov_owner(niov)->niovs;
}

/* netmem */

/**
 * typedef netmem_ref - a nonexistent type marking a reference to generic
 * network memory.
 *
 * A netmem_ref refers either to a struct page or, when its least significant
 * bit (NET_IOV) is set, to a struct net_iov. This abstraction exists so that
 * support for further memory types can be added.
 *
 * Use the supplied helpers to obtain the underlying memory pointer and fields.
 */
typedef unsigned long __bitwise netmem_ref;

static inline bool netmem_is_net_iov(const netmem_ref netmem)
{
        return (__force unsigned long)netmem & NET_IOV;
}

/**
 * __netmem_to_page - unsafely get pointer to the &page backing @netmem
 * @netmem: netmem reference to convert
 *
 * Unsafe version of netmem_to_page(). When @netmem is always page-backed,
 * e.g. when it's a header buffer, performs faster and generates smaller
 * object code (no check for the LSB, no WARN). When @netmem points to IOV,
 * provokes undefined behaviour.
 *
 * Return: pointer to the &page (garbage if @netmem is not page-backed).
 */
static inline struct page *__netmem_to_page(netmem_ref netmem)
{
        struct page *page = (__force struct page *)netmem;

        return page;
}

/* Checked conversion to struct page: warns once and returns NULL when
 * @netmem refers to a net_iov instead of a page.
 */
static inline struct page *netmem_to_page(netmem_ref netmem)
{
        bool is_iov = netmem_is_net_iov(netmem);

        if (WARN_ON_ONCE(is_iov))
                return NULL;

        return __netmem_to_page(netmem);
}

/* Checked conversion to struct net_iov: returns NULL (with a debug-only
 * warning) when @netmem does not carry the NET_IOV bit.
 */
static inline struct net_iov *netmem_to_net_iov(netmem_ref netmem)
{
        if (!netmem_is_net_iov(netmem)) {
                DEBUG_NET_WARN_ON_ONCE(true);
                return NULL;
        }

        /* Strip the tag bit to recover the pointer. */
        return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV);
}

static inline netmem_ref net_iov_to_netmem(struct net_iov *niov)
{
        return (__force netmem_ref)((unsigned long)niov | NET_IOV);
}

/* Wrap @page into a netmem_ref; the pointer value is used unchanged. */
static inline netmem_ref page_to_netmem(struct page *page)
{
        return (__force netmem_ref)page;
}

/**
 * virt_to_netmem - convert virtual memory pointer to a netmem reference
 * @data: host memory pointer to convert
 *
 * Return: netmem reference to the &page backing this virtual address.
 */
static inline netmem_ref virt_to_netmem(const void *data)
{
        struct page *page = virt_to_page(data);

        return page_to_netmem(page);
}

/* Non-page_pool reference count of @netmem. */
static inline int netmem_ref_count(netmem_ref netmem)
{
        /* The non-pp refcount of net_iov is always 1. On net_iov, we only
         * support pp refcounting which uses the pp_ref_count field.
         */
        return netmem_is_net_iov(netmem) ?
                1 : page_ref_count(netmem_to_page(netmem));
}

/* PFN of the page behind @netmem, or 0 when @netmem refers to a net_iov. */
static inline unsigned long netmem_pfn_trace(netmem_ref netmem)
{
        return netmem_is_net_iov(netmem) ?
                0 : page_to_pfn(netmem_to_page(netmem));
}

/* Strip the NET_IOV bit from @netmem and return the result typed as a
 * struct net_iov pointer, without checking what @netmem actually refers to.
 */
static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem)
{
        unsigned long raw = (__force unsigned long)netmem;

        return (struct net_iov *)(raw & ~NET_IOV);
}

/**
 * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem
 * @netmem: netmem reference to get the pointer from
 *
 * Unsafe version of netmem_get_pp(). When @netmem is always page-backed,
 * e.g. when it's a header buffer, performs faster and generates smaller
 * object code (avoids clearing the LSB). When @netmem points to IOV,
 * provokes invalid memory access.
 *
 * Return: pointer to the &page_pool (garbage if @netmem is not page-backed).
 */
static inline struct page_pool *__netmem_get_pp(netmem_ref netmem)
{
        struct page *page = __netmem_to_page(netmem);

        return page->pp;
}

/* Read the pp field through the layout shared by struct page and net_iov. */
static inline struct page_pool *netmem_get_pp(netmem_ref netmem)
{
        struct net_iov *niov = __netmem_clear_lsb(netmem);

        return niov->pp;
}

/* Address of the pp_ref_count field, via the layout shared by struct page
 * and net_iov.
 */
static inline atomic_long_t *netmem_get_pp_ref_count_ref(netmem_ref netmem)
{
        struct net_iov *niov = __netmem_clear_lsb(netmem);

        return &niov->pp_ref_count;
}

/* True when @netmem is a net_iov, or a page located on node @pref_nid. */
static inline bool netmem_is_pref_nid(netmem_ref netmem, int pref_nid)
{
        /* NUMA node preference only makes sense if we're allocating
         * system memory. Memory providers (which give us net_iovs)
         * choose for us.
         */
        return netmem_is_net_iov(netmem) ||
               page_to_nid(netmem_to_page(netmem)) == pref_nid;
}

/* Reference to the compound head of a page-backed @netmem; a net_iov is
 * returned unchanged.
 */
static inline netmem_ref netmem_compound_head(netmem_ref netmem)
{
        struct page *head;

        /* niov are never compounded */
        if (netmem_is_net_iov(netmem))
                return netmem;

        head = compound_head(netmem_to_page(netmem));
        return page_to_netmem(head);
}

/**
 * __netmem_address - unsafely get pointer to the memory backing @netmem
 * @netmem: netmem reference to get the pointer for
 *
 * Unsafe version of netmem_address(). When @netmem is always page-backed,
 * e.g. when it's a header buffer, performs faster and generates smaller
 * object code (no check for the LSB). When @netmem points to IOV, provokes
 * undefined behaviour.
 *
 * Return: pointer to the memory (garbage if @netmem is not page-backed).
 */
static inline void *__netmem_address(netmem_ref netmem)
{
        struct page *page = __netmem_to_page(netmem);

        return page_address(page);
}

/* Address of the memory behind a page-backed @netmem; NULL for a net_iov. */
static inline void *netmem_address(netmem_ref netmem)
{
        return netmem_is_net_iov(netmem) ? NULL : __netmem_address(netmem);
}

/**
 * netmem_is_pfmemalloc - check if @netmem was allocated under memory pressure
 * @netmem: netmem reference to check
 *
 * Return: true if @netmem is page-backed and the page was allocated under
 * memory pressure, false otherwise.
 */
static inline bool netmem_is_pfmemalloc(netmem_ref netmem)
{
        return !netmem_is_net_iov(netmem) &&
               page_is_pfmemalloc(netmem_to_page(netmem));
}

/* Read the dma_addr field through the layout shared by struct page and
 * net_iov.
 */
static inline unsigned long netmem_get_dma_addr(netmem_ref netmem)
{
        struct net_iov *niov = __netmem_clear_lsb(netmem);

        return niov->dma_addr;
}

#endif /* _NET_NETMEM_H */























































































































































































































































































































































































































































































































































































































































































    8 




    8 



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 IBM Corporation
 *
 * Author:
 * Mimi Zohar <zohar@us.ibm.com>
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/magic.h>
#include <linux/ima.h>
#include <linux/evm.h>
#include <linux/fsverity.h>
#include <keys/system_keyring.h>
#include <uapi/linux/fsverity.h>

#include "ima.h"

#ifdef CONFIG_IMA_APPRAISE_BOOTPARAM
static char *ima_appraise_cmdline_default __initdata;
core_param(ima_appraise, ima_appraise_cmdline_default, charp, 0);

/*
 * Apply the "ima_appraise=" boot parameter, if one was given.
 *
 * The value is matched by prefix against "off", "log", "fix" and "enforce",
 * in that order; anything else is reported and leaves the state unchanged.
 * With secure boot enabled the global ima_appraise is never modified.
 */
void __init ima_appraise_parse_cmdline(void)
{
        static const struct {
                const char *name;
                int state;
        } modes[] = {
                { "off",     0 },
                { "log",     IMA_APPRAISE_LOG },
                { "fix",     IMA_APPRAISE_FIX },
                { "enforce", IMA_APPRAISE_ENFORCE },
        };
        const char *str = ima_appraise_cmdline_default;
        bool sb_state = arch_ima_get_secureboot();
        int appraisal_state = ima_appraise;
        size_t i;

        if (!str)
                return;

        for (i = 0; i < ARRAY_SIZE(modes); i++) {
                if (!strncmp(str, modes[i].name, strlen(modes[i].name))) {
                        appraisal_state = modes[i].state;
                        break;
                }
        }
        if (i == ARRAY_SIZE(modes))
                pr_err("invalid \"%s\" appraise option", str);

        if (!sb_state) {
                ima_appraise = appraisal_state;
                return;
        }

        /* If appraisal state was changed, but secure boot is enabled,
         * keep its default */
        if (!(appraisal_state & IMA_APPRAISE_ENFORCE))
                pr_info("Secure boot enabled: ignoring ima_appraise=%s option",
                        str);
}
#endif

/*
 * is_ima_appraise_enabled - report whether appraisal is being enforced
 *
 * True only when IMA_APPRAISE_ENFORCE is set in ima_appraise, i.e. not in
 * the ima_appraise="fix" or "log" modes.
 */
bool is_ima_appraise_enabled(void)
{
        return (ima_appraise & IMA_APPRAISE_ENFORCE) != 0;
}

/*
 * ima_must_appraise - ask the policy whether @inode must be appraised/hashed
 *
 * Returns 0 straight away when appraisal is off (ima_appraise == 0);
 * otherwise returns the result of ima_match_policy() for the current task's
 * credentials, restricted to the IMA_APPRAISE | IMA_HASH actions.
 *
 * Return 1 to appraise or hash
 */
int ima_must_appraise(struct mnt_idmap *idmap, struct inode *inode,
                      int mask, enum ima_hooks func)
{
        const int actions = IMA_APPRAISE | IMA_HASH;
        struct lsm_prop prop;

        if (!ima_appraise)
                return 0;

        security_current_getlsmprop_subj(&prop);

        return ima_match_policy(idmap, inode, current_cred(), &prop,
                                func, mask, actions, NULL,
                                NULL, NULL, NULL);
}

static int ima_fix_xattr(struct dentry *dentry, struct ima_iint_cache *iint)
{
        int rc, offset;
        u8 algo = iint->ima_hash->algo;

        if (algo <= HASH_ALGO_SHA1) {
                offset = 1;
                iint->ima_hash->xattr.sha1.type = IMA_XATTR_DIGEST;
        } else {
                offset = 0;
                iint->ima_hash->xattr.ng.type = IMA_XATTR_DIGEST_NG;
                iint->ima_hash->xattr.ng.algo = algo;
        }
        rc = __vfs_setxattr_noperm(&nop_mnt_idmap, dentry, XATTR_NAME_IMA,
                                   &iint->ima_hash->xattr.data[offset],
                                   (sizeof(iint->ima_hash->xattr) - offset) +
                                   iint->ima_hash->length, 0);
        return rc;
}

/*
 * Return the cached appraisal status that belongs to hook @func.  Hooks
 * without a dedicated field share ima_read_status.
 */
enum integrity_status ima_get_cache_status(struct ima_iint_cache *iint,
                                           enum ima_hooks func)
{
        if (func == MMAP_CHECK || func == MMAP_CHECK_REQPROT)
                return iint->ima_mmap_status;

        if (func == BPRM_CHECK)
                return iint->ima_bprm_status;

        if (func == CREDS_CHECK)
                return iint->ima_creds_status;

        if (func == FILE_CHECK || func == POST_SETATTR)
                return iint->ima_file_status;

        /* MODULE_CHECK ... MAX_CHECK - 1 and anything else */
        return iint->ima_read_status;
}

/*
 * Store @status in the cached appraisal field that belongs to hook @func.
 * Hooks without a dedicated field share ima_read_status.
 */
static void ima_set_cache_status(struct ima_iint_cache *iint,
                                 enum ima_hooks func,
                                 enum integrity_status status)
{
        if (func == MMAP_CHECK || func == MMAP_CHECK_REQPROT)
                iint->ima_mmap_status = status;
        else if (func == BPRM_CHECK)
                iint->ima_bprm_status = status;
        else if (func == CREDS_CHECK)
                iint->ima_creds_status = status;
        else if (func == FILE_CHECK || func == POST_SETATTR)
                iint->ima_file_status = status;
        else    /* MODULE_CHECK ... MAX_CHECK - 1 and anything else */
                iint->ima_read_status = status;
}

/*
 * Record in @iint->flags that hook @func has been appraised: the
 * hook-specific *_APPRAISED bit plus the generic IMA_APPRAISED bit.
 */
static void ima_cache_flags(struct ima_iint_cache *iint, enum ima_hooks func)
{
        unsigned long hook_flag;

        switch (func) {
        case MMAP_CHECK:
        case MMAP_CHECK_REQPROT:
                hook_flag = IMA_MMAP_APPRAISED;
                break;
        case BPRM_CHECK:
                hook_flag = IMA_BPRM_APPRAISED;
                break;
        case CREDS_CHECK:
                hook_flag = IMA_CREDS_APPRAISED;
                break;
        case FILE_CHECK:
        case POST_SETATTR:
                hook_flag = IMA_FILE_APPRAISED;
                break;
        case MODULE_CHECK ... MAX_CHECK - 1:
        default:
                hook_flag = IMA_READ_APPRAISED;
                break;
        }

        iint->flags |= (hook_flag | IMA_APPRAISED);
}

enum hash_algo ima_get_hash_algo(const struct evm_ima_xattr_data *xattr_value,
                                 int xattr_len)
{
        struct signature_v2_hdr *sig;
        enum hash_algo ret;

        if (!xattr_value || xattr_len < 2)
                /* return default hash algo */
                return ima_hash_algo;

        switch (xattr_value->type) {
        case IMA_VERITY_DIGSIG:
                sig = (typeof(sig))xattr_value;
                if (sig->version != 3 || xattr_len <= sizeof(*sig) ||
                    sig->hash_algo >= HASH_ALGO__LAST)
                        return ima_hash_algo;
                return sig->hash_algo;
        case EVM_IMA_XATTR_DIGSIG:
                sig = (typeof(sig))xattr_value;
                if (sig->version != 2 || xattr_len <= sizeof(*sig)
                    || sig->hash_algo >= HASH_ALGO__LAST)
                        return ima_hash_algo;
                return sig->hash_algo;
        case IMA_XATTR_DIGEST_NG:
                /* first byte contains algorithm id */
                ret = xattr_value->data[0];
                if (ret < HASH_ALGO__LAST)
                        return ret;
                break;
        case IMA_XATTR_DIGEST:
                /* this is for backward compatibility */
                if (xattr_len == 21) {
                        unsigned int zero = 0;
                        if (!memcmp(&xattr_value->data[16], &zero, 4))
                                return HASH_ALGO_MD5;
                        else
                                return HASH_ALGO_SHA1;
                } else if (xattr_len == 17)
                        return HASH_ALGO_MD5;
                break;
        }

        /* return default hash algo */
        return ima_hash_algo;
}

/*
 * ima_read_xattr - read the XATTR_NAME_IMA xattr of @dentry
 * @dentry: dentry whose xattr is read
 * @xattr_value: in/out buffer pointer handed to vfs_getxattr_alloc()
 * @xattr_len: size passed through to vfs_getxattr_alloc()
 *
 * Returns the value of vfs_getxattr_alloc(), except that -EOPNOTSUPP is
 * reported as 0.
 */
int ima_read_xattr(struct dentry *dentry,
                   struct evm_ima_xattr_data **xattr_value, int xattr_len)
{
        int len = vfs_getxattr_alloc(&nop_mnt_idmap, dentry, XATTR_NAME_IMA,
                                     (char **)xattr_value, xattr_len,
                                     GFP_NOFS);

        return len == -EOPNOTSUPP ? 0 : len;
}

/*
 * calc_file_id_hash - calculate the hash of the ima_file_id struct data
 * @type: xattr type [enum evm_ima_xattr_type]
 * @algo: hash algorithm [enum hash_algo]
 * @digest: pointer to the digest to be hashed
 * @hash: (out) pointer to the hash
 *
 * IMA signature version 3 disambiguates the data that is signed by
 * indirectly signing the hash of the ima_file_id structure data.
 *
 * Signing the ima_file_id struct is currently only supported for
 * IMA_VERITY_DIGSIG type xattrs.
 *
 * Return 0 on success, error code otherwise.
 */
static int calc_file_id_hash(enum evm_ima_xattr_type type,
                             enum hash_algo algo, const u8 *digest,
                             struct ima_digest_data *hash)
{
        struct ima_file_id file_id = {
                .hash_type = IMA_VERITY_DIGSIG, .hash_algorithm = algo};
        unsigned int unused = HASH_MAX_DIGESTSIZE - hash_digest_size[algo];

        if (type != IMA_VERITY_DIGSIG)
                return -EINVAL;

        memcpy(file_id.hash, digest, hash_digest_size[algo]);

        hash->algo = algo;
        hash->length = hash_digest_size[algo];

        return ima_calc_buffer_hash(&file_id, sizeof(file_id) - unused, hash);
}

/*
 * xattr_verify - verify xattr digest or signature
 * @func: hook being appraised (selects the platform keyring fallback)
 * @iint: per-inode IMA cache entry holding the collected file hash
 * @xattr_value: security.ima contents; must not be NULL
 * @xattr_len: number of valid bytes in @xattr_value
 * @status: (in/out) integrity status; updated with the verification result
 * @cause: (out) set to a short reason string on failure
 *
 * Verify whether the hash or signature matches the file contents.
 *
 * Return 0 on success, error code otherwise.
 */
static int xattr_verify(enum ima_hooks func, struct ima_iint_cache *iint,
                        struct evm_ima_xattr_data *xattr_value, int xattr_len,
                        enum integrity_status *status, const char **cause)
{
        struct ima_max_digest_data hash;
        struct signature_v2_hdr *sig;
        int rc = -EINVAL, hash_start = 0;
        int digest_len;
        int mask;

        switch (xattr_value->type) {
        case IMA_XATTR_DIGEST_NG:
                /* first byte contains algorithm id */
                hash_start = 1;
                fallthrough;
        case IMA_XATTR_DIGEST:
                if (*status != INTEGRITY_PASS_IMMUTABLE) {
                        if (iint->flags & IMA_DIGSIG_REQUIRED) {
                                if (iint->flags & IMA_VERITY_REQUIRED)
                                        *cause = "verity-signature-required";
                                else
                                        *cause = "IMA-signature-required";
                                *status = INTEGRITY_FAIL;
                                break;
                        }
                        clear_bit(IMA_DIGSIG, &iint->atomic_flags);
                } else {
                        set_bit(IMA_DIGSIG, &iint->atomic_flags);
                }

                /*
                 * Number of digest bytes present after the type byte and
                 * the optional algorithm byte.  Keep the arithmetic signed:
                 * mixing in the size_t from sizeof() would make a too-short
                 * xattr wrap around to a huge value and pass the check,
                 * letting memcmp() read past the end of the xattr.
                 */
                digest_len = xattr_len - (int)sizeof(xattr_value->type) -
                             hash_start;
                if (digest_len >= 0 && digest_len >= iint->ima_hash->length)
                        /*
                         * xattr length may be longer. md5 hash in previous
                         * version occupied 20 bytes in xattr, instead of 16
                         */
                        rc = memcmp(&xattr_value->data[hash_start],
                                    iint->ima_hash->digest,
                                    iint->ima_hash->length);
                else
                        rc = -EINVAL;
                if (rc) {
                        *cause = "invalid-hash";
                        *status = INTEGRITY_FAIL;
                        break;
                }
                *status = INTEGRITY_PASS;
                break;
        case EVM_IMA_XATTR_DIGSIG:
                set_bit(IMA_DIGSIG, &iint->atomic_flags);

                /* A plain IMA signature cannot satisfy a verity requirement. */
                mask = IMA_DIGSIG_REQUIRED | IMA_VERITY_REQUIRED;
                if ((iint->flags & mask) == mask) {
                        *cause = "verity-signature-required";
                        *status = INTEGRITY_FAIL;
                        break;
                }

                sig = (typeof(sig))xattr_value;
                if (sig->version >= 3) {
                        *cause = "invalid-signature-version";
                        *status = INTEGRITY_FAIL;
                        break;
                }
                rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
                                             (const char *)xattr_value,
                                             xattr_len,
                                             iint->ima_hash->digest,
                                             iint->ima_hash->length);
                if (rc == -EOPNOTSUPP) {
                        *status = INTEGRITY_UNKNOWN;
                        break;
                }
                /* kexec kernel images get a second try on the platform keyring */
                if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && rc &&
                    func == KEXEC_KERNEL_CHECK)
                        rc = integrity_digsig_verify(INTEGRITY_KEYRING_PLATFORM,
                                                     (const char *)xattr_value,
                                                     xattr_len,
                                                     iint->ima_hash->digest,
                                                     iint->ima_hash->length);
                if (rc) {
                        *cause = "invalid-signature";
                        *status = INTEGRITY_FAIL;
                } else {
                        *status = INTEGRITY_PASS;
                }
                break;
        case IMA_VERITY_DIGSIG:
                set_bit(IMA_DIGSIG, &iint->atomic_flags);

                /* A verity signature is refused when a plain IMA one is required. */
                if (iint->flags & IMA_DIGSIG_REQUIRED) {
                        if (!(iint->flags & IMA_VERITY_REQUIRED)) {
                                *cause = "IMA-signature-required";
                                *status = INTEGRITY_FAIL;
                                break;
                        }
                }

                sig = (typeof(sig))xattr_value;
                if (sig->version != 3) {
                        *cause = "invalid-signature-version";
                        *status = INTEGRITY_FAIL;
                        break;
                }

                /* The signature covers the hash of the ima_file_id data. */
                rc = calc_file_id_hash(IMA_VERITY_DIGSIG, iint->ima_hash->algo,
                                       iint->ima_hash->digest,
                                       container_of(&hash.hdr,
                                               struct ima_digest_data, hdr));
                if (rc) {
                        *cause = "sigv3-hashing-error";
                        *status = INTEGRITY_FAIL;
                        break;
                }

                rc = integrity_digsig_verify(INTEGRITY_KEYRING_IMA,
                                             (const char *)xattr_value,
                                             xattr_len, hash.digest,
                                             hash.hdr.length);
                if (rc) {
                        *cause = "invalid-verity-signature";
                        *status = INTEGRITY_FAIL;
                } else {
                        *status = INTEGRITY_PASS;
                }

                break;
        default:
                *status = INTEGRITY_UNKNOWN;
                *cause = "unknown-ima-data";
                break;
        }

        return rc;
}

/*
 * modsig_verify - verify modsig signature
 * @func: hook being appraised
 * @modsig: appended signature to verify
 * @status: (out) INTEGRITY_PASS or INTEGRITY_FAIL
 * @cause: (out) set to "invalid-signature" on failure
 *
 * Verify whether the signature matches the file contents.  The IMA keyring
 * is tried first; for KEXEC_KERNEL_CHECK the platform keyring is tried as
 * well when CONFIG_INTEGRITY_PLATFORM_KEYRING is enabled.
 *
 * Return 0 on success, error code otherwise.
 */
static int modsig_verify(enum ima_hooks func, const struct modsig *modsig,
                         enum integrity_status *status, const char **cause)
{
        int err = integrity_modsig_verify(INTEGRITY_KEYRING_IMA, modsig);

        if (IS_ENABLED(CONFIG_INTEGRITY_PLATFORM_KEYRING) && err &&
            func == KEXEC_KERNEL_CHECK)
                err = integrity_modsig_verify(INTEGRITY_KEYRING_PLATFORM,
                                              modsig);

        if (!err) {
                *status = INTEGRITY_PASS;
                return 0;
        }

        *cause = "invalid-signature";
        *status = INTEGRITY_FAIL;
        return err;
}

/*
 * ima_check_blacklist - determine if the binary is blacklisted.
 * @iint: per-inode IMA cache entry of the file
 * @modsig: appended signature, or NULL
 * @pcr: PCR passed on to process_buffer_measurement()
 *
 * Nothing is checked unless IMA_CHECK_BLACKLIST is set in @iint->flags.
 * With an allowed modsig the modsig digest is looked up; otherwise, when a
 * signature is required and a file hash has been collected, the file hash
 * is looked up.
 *
 * Add the hash of the blacklisted binary to the measurement list, based
 * on policy.
 *
 * Returns -EPERM if the hash is blacklisted.
 */
int ima_check_blacklist(struct ima_iint_cache *iint,
                        const struct modsig *modsig, int pcr)
{
        enum hash_algo hash_algo;
        const u8 *digest = NULL;
        u32 digestsize = 0;
        int rc = 0;

        if (!(iint->flags & IMA_CHECK_BLACKLIST))
                return 0;

        if (iint->flags & IMA_MODSIG_ALLOWED && modsig) {
                ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize);

                rc = is_binary_blacklisted(digest, digestsize);
        } else if (iint->flags & IMA_DIGSIG_REQUIRED && iint->ima_hash) {
                /*
                 * Remember which digest was looked up so that the
                 * measurement below records it instead of an empty buffer.
                 */
                digest = iint->ima_hash->digest;
                digestsize = iint->ima_hash->length;

                rc = is_binary_blacklisted(digest, digestsize);
        }

        if ((rc == -EPERM) && (iint->flags & IMA_MEASURE))
                process_buffer_measurement(&nop_mnt_idmap, NULL, digest, digestsize,
                                           "blacklisted-hash", NONE,
                                           pcr, NULL, false, NULL, 0);

        return rc;
}

/*
 * is_bprm_creds_for_exec - report whether a BPRM_CHECK is a "check only" exec
 * @func: hook being appraised
 * @file: file being appraised
 *
 * Returns false for every hook other than BPRM_CHECK.  For BPRM_CHECK it
 * returns the is_check member of a struct linux_binprm located via
 * container_of().
 *
 * NOTE(review): @file is passed by value, so &file is the address of this
 * function's own parameter, not of the 'file' member embedded in a
 * struct linux_binprm.  container_of() therefore computes a pointer
 * relative to that local address and bprm->is_check is read from there.
 * This looks incorrect; fixing it needs the bprm (or its is_check flag) to
 * be handed in by the caller, which changes the interface - please confirm.
 */
static bool is_bprm_creds_for_exec(enum ima_hooks func, struct file *file)
{
        struct linux_binprm *bprm;

        if (func == BPRM_CHECK) {
                bprm = container_of(&file, struct linux_binprm, file);
                return bprm->is_check;
        }
        return false;
}

/*
 * ima_appraise_measurement - appraise file measurement
 * @func: hook being appraised; selects the cached status/flag that is updated
 * @iint: per-inode IMA cache entry holding the collected file hash
 * @file: file being appraised
 * @filename: name used in audit messages
 * @xattr_value: security.ima contents, or NULL if none was read
 * @xattr_len: length of @xattr_value, or the (<= 0) result of reading it
 * @modsig: appended signature, or NULL
 *
 * Call evm_verifyxattr() to verify the integrity of 'security.ima'.
 * Assuming success, compare the xattr hash with the collected measurement.
 * A modsig is tried when it is allowed by policy and there is no xattr, the
 * xattr is a plain IMA_XATTR_DIGEST_NG hash, or the xattr signature's key is
 * not known (-ENOKEY).
 *
 * Return: the resulting enum integrity_status value (INTEGRITY_PASS on
 * success).  Except for the early INTEGRITY_UNKNOWN return, the value is
 * also stored in @iint through ima_set_cache_status().
 */
int ima_appraise_measurement(enum ima_hooks func, struct ima_iint_cache *iint,
                             struct file *file, const unsigned char *filename,
                             struct evm_ima_xattr_data *xattr_value,
                             int xattr_len, const struct modsig *modsig)
{
        static const char op[] = "appraise_data";
        int audit_msgno = AUDIT_INTEGRITY_DATA;
        const char *cause = "unknown";
        struct dentry *dentry = file_dentry(file);
        struct inode *inode = d_backing_inode(dentry);
        enum integrity_status status = INTEGRITY_UNKNOWN;
        int rc = xattr_len;
        bool try_modsig = iint->flags & IMA_MODSIG_ALLOWED && modsig;

        /* If not appraising a modsig, we need an xattr. */
        if (!(inode->i_opflags & IOP_XATTR) && !try_modsig)
                return INTEGRITY_UNKNOWN;

        /*
         * Unlike any of the other LSM hooks where the kernel enforces file
         * integrity, enforcing file integrity for the bprm_creds_for_exec()
         * LSM hook with the AT_EXECVE_CHECK flag is left up to the discretion
         * of the script interpreter(userspace). Differentiate kernel and
         * userspace enforced integrity audit messages.
         */
        if (is_bprm_creds_for_exec(func, file))
                audit_msgno = AUDIT_INTEGRITY_USERSPACE;

        /* If reading the xattr failed and there's no modsig, error out. */
        if (rc <= 0 && !try_modsig) {
                /* Any error other than "no such xattr" is reported as is. */
                if (rc && rc != -ENODATA)
                        goto out;

                if (iint->flags & IMA_DIGSIG_REQUIRED) {
                        if (iint->flags & IMA_VERITY_REQUIRED)
                                cause = "verity-signature-required";
                        else
                                cause = "IMA-signature-required";
                } else {
                        cause = "missing-hash";
                }

                status = INTEGRITY_NOLABEL;
                if (file->f_mode & FMODE_CREATED)
                        iint->flags |= IMA_NEW_FILE;
                /*
                 * A new file without an xattr passes unless a signature is
                 * required; with a required signature only an empty new
                 * file passes.
                 */
                if ((iint->flags & IMA_NEW_FILE) &&
                    (!(iint->flags & IMA_DIGSIG_REQUIRED) ||
                     (inode->i_size == 0)))
                        status = INTEGRITY_PASS;
                goto out;
        }

        /* A negative read result is passed to EVM as a zero-length value. */
        status = evm_verifyxattr(dentry, XATTR_NAME_IMA, xattr_value,
                                 rc < 0 ? 0 : rc);
        switch (status) {
        case INTEGRITY_PASS:
        case INTEGRITY_PASS_IMMUTABLE:
        case INTEGRITY_UNKNOWN:
                break;
        case INTEGRITY_NOXATTRS:        /* No EVM protected xattrs. */
                /* It's fine not to have xattrs when using a modsig. */
                if (try_modsig)
                        break;
                fallthrough;
        case INTEGRITY_NOLABEL:                /* No security.evm xattr. */
                cause = "missing-HMAC";
                goto out;
        case INTEGRITY_FAIL_IMMUTABLE:
                set_bit(IMA_DIGSIG, &iint->atomic_flags);
                cause = "invalid-fail-immutable";
                goto out;
        case INTEGRITY_FAIL:                /* Invalid HMAC/signature. */
                cause = "invalid-HMAC";
                goto out;
        default:
                WARN_ONCE(true, "Unexpected integrity status %d\n", status);
        }

        if (xattr_value)
                rc = xattr_verify(func, iint, xattr_value, xattr_len, &status,
                                  &cause);

        /*
         * If we have a modsig and either no imasig or the imasig's key isn't
         * known, then try verifying the modsig.
         */
        if (try_modsig &&
            (!xattr_value || xattr_value->type == IMA_XATTR_DIGEST_NG ||
             rc == -ENOKEY))
                rc = modsig_verify(func, modsig, &status, &cause);

out:
        /*
         * File signatures on some filesystems can not be properly verified.
         * When such filesystems are mounted by an untrusted mounter or on a
         * system not willing to accept such a risk, fail the file signature
         * verification.
         */
        if ((inode->i_sb->s_iflags & SB_I_IMA_UNVERIFIABLE_SIGNATURE) &&
            ((inode->i_sb->s_iflags & SB_I_UNTRUSTED_MOUNTER) ||
             (iint->flags & IMA_FAIL_UNVERIFIABLE_SIGS))) {
                status = INTEGRITY_FAIL;
                cause = "unverifiable-signature";
                integrity_audit_msg(audit_msgno, inode, filename,
                                    op, cause, rc, 0);
        } else if (status != INTEGRITY_PASS) {
                /* Fix mode, but don't replace file signatures. */
                if ((ima_appraise & IMA_APPRAISE_FIX) && !try_modsig &&
                    (!xattr_value ||
                     xattr_value->type != EVM_IMA_XATTR_DIGSIG)) {
                        if (!ima_fix_xattr(dentry, iint))
                                status = INTEGRITY_PASS;
                }

                /*
                 * Permit new files with file/EVM portable signatures, but
                 * without data.
                 */
                if (inode->i_size == 0 && iint->flags & IMA_NEW_FILE &&
                    test_bit(IMA_DIGSIG, &iint->atomic_flags)) {
                        status = INTEGRITY_PASS;
                }

                /*
                 * The audit record is emitted even if the status was
                 * just changed to INTEGRITY_PASS above.
                 */
                integrity_audit_msg(audit_msgno, inode, filename,
                                    op, cause, rc, 0);
        } else {
                /* Passed outright: record the hook as appraised. */
                ima_cache_flags(iint, func);
        }

        ima_set_cache_status(iint, func, status);
        return status;
}

/*
 * ima_update_xattr - update 'security.ima' hash value
 */
void ima_update_xattr(struct ima_iint_cache *iint, struct file *file)
{
        struct dentry *dentry = file_dentry(file);
        int rc = 0;

        /* do not collect and update hash for digital signatures */
        if (test_bit(IMA_DIGSIG, &iint->atomic_flags))
                return;

        if ((iint->ima_file_status != INTEGRITY_PASS) &&
            !(iint->flags & IMA_HASH))
                return;

        rc = ima_collect_measurement(iint, file, NULL, 0, ima_hash_algo, NULL);
        if (rc < 0)
                return;

        inode_lock(file_inode(file));
        ima_fix_xattr(dentry, iint);
        inode_unlock(file_inode(file));
}

/**
 * ima_inode_post_setattr - reflect file metadata changes
 * @idmap:  idmap of the mount the inode was found from
 * @dentry: pointer to the affected dentry
 * @ia_valid: for the UID and GID status
 *
 * Changes to a dentry's metadata might result in needing to appraise.
 * When the inode has a cached iint, IMA_CHANGE_ATTR is set on it, and
 * IMA_UPDATE_XATTR is cleared if policy no longer asks for appraisal.
 *
 * This function is called from notify_change(), which expects the caller
 * to lock the inode's i_mutex.
 */
static void ima_inode_post_setattr(struct mnt_idmap *idmap,
                                   struct dentry *dentry, int ia_valid)
{
        struct inode *inode = d_backing_inode(dentry);
        struct ima_iint_cache *iint;
        int must_appraise;

        if (!(ima_policy_flag & IMA_APPRAISE))
                return;

        /* only regular files on xattr-capable inodes are of interest */
        if (!S_ISREG(inode->i_mode) || !(inode->i_opflags & IOP_XATTR))
                return;

        must_appraise = ima_must_appraise(idmap, inode, MAY_ACCESS,
                                          POST_SETATTR);
        iint = ima_iint_find(inode);
        if (!iint)
                return;

        set_bit(IMA_CHANGE_ATTR, &iint->atomic_flags);
        if (!must_appraise)
                clear_bit(IMA_UPDATE_XATTR, &iint->atomic_flags);
}

/*
 * ima_protect_xattr - protect 'security.ima'
 * @dentry: unused
 * @xattr_name: name of the xattr being modified
 * @xattr_value: unused
 * @xattr_value_len: unused
 *
 * Ensure that not just anyone can modify or remove 'security.ima'.
 *
 * Returns 0 if @xattr_name is not XATTR_NAME_IMA, 1 if it is and the
 * caller has CAP_SYS_ADMIN, and -EPERM if it is and the caller does not.
 */
static int ima_protect_xattr(struct dentry *dentry, const char *xattr_name,
                             const void *xattr_value, size_t xattr_value_len)
{
        if (strcmp(xattr_name, XATTR_NAME_IMA) != 0)
                return 0;

        return capable(CAP_SYS_ADMIN) ? 1 : -EPERM;
}

/*
 * ima_reset_appraise_flags - invalidate cached appraisal state of an inode
 * @inode: inode whose xattrs changed
 * @digsig: non-zero if the inode now carries a digital signature
 *
 * Does nothing unless appraisal is enabled in policy, @inode is a regular
 * file and it already has an iint.  Otherwise clears measured_pcrs, sets
 * IMA_CHANGE_XATTR and makes IMA_DIGSIG reflect @digsig.
 */
static void ima_reset_appraise_flags(struct inode *inode, int digsig)
{
        struct ima_iint_cache *iint;

        if (!(ima_policy_flag & IMA_APPRAISE))
                return;
        if (!S_ISREG(inode->i_mode))
                return;

        iint = ima_iint_find(inode);
        if (iint) {
                iint->measured_pcrs = 0;
                set_bit(IMA_CHANGE_XATTR, &iint->atomic_flags);

                if (!digsig)
                        clear_bit(IMA_DIGSIG, &iint->atomic_flags);
                else
                        set_bit(IMA_DIGSIG, &iint->atomic_flags);
        }
}

/**
 * validate_hash_algo() - Block setxattr with unsupported hash algorithms
 * @dentry: object of the setxattr()
 * @xattr_value: userland supplied xattr value
 * @xattr_value_len: length of xattr_value
 *
 * The xattr value is mapped to its hash algorithm.  When the policy lists
 * allowed setxattr algorithms, the algorithm must be one of them; otherwise
 * it must be the default IMA algorithm or one built in the kernel.
 *
 * Emit an audit message when the algorithm is rejected.
 *
 * Return: 0 on success, -EACCES otherwise (also when the buffer for the
 * audit path cannot be allocated).
 */
static int validate_hash_algo(struct dentry *dentry,
                              const struct evm_ima_xattr_data *xattr_value,
                              size_t xattr_value_len)
{
        enum hash_algo algo = ima_get_hash_algo(xattr_value, xattr_value_len);
        unsigned int policy_mask =
                atomic_read(&ima_setxattr_allowed_hash_algorithms);
        const char *errmsg;
        char *pathbuf;
        char *path;

        if (policy_mask) {
                /* success if the algorithm is allowed in the ima policy */
                if (policy_mask & (1U << algo))
                        return 0;

                /*
                 * We use a different audit message when the hash algorithm
                 * is denied by a policy rule, instead of not being built
                 * in the kernel image
                 */
                errmsg = "denied-hash-algorithm";
        } else {
                /* allow any xattr using an algorithm built in the kernel */
                if (likely(algo == ima_hash_algo) ||
                    crypto_has_alg(hash_algo_name[algo], 0, 0))
                        return 0;

                errmsg = "unavailable-hash-algorithm";
        }

        pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!pathbuf)
                return -EACCES;

        /*
         * NOTE(review): the result of dentry_path() is handed to the audit
         * call unchecked - confirm it cannot be an error pointer here.
         */
        path = dentry_path(dentry, pathbuf, PATH_MAX);
        integrity_audit_msg(AUDIT_INTEGRITY_DATA, d_inode(dentry), path,
                            "set_data", errmsg, -EACCES, 0);
        kfree(pathbuf);

        return -EACCES;
}

/*
 * ima_inode_setxattr - LSM hook run before an xattr is set
 * @idmap: unused
 * @dentry: dentry the xattr is set on
 * @xattr_name: name of the xattr
 * @xattr_value: new value
 * @xattr_value_len: length of @xattr_value
 * @flags: unused
 *
 * For 'security.ima' the value must be non-empty, have a known type and use
 * an acceptable hash algorithm.  When 'security.ima' is written, or EVM
 * reports that @xattr_name affects its status, the cached appraisal flags
 * of the inode are reset.
 *
 * Returns 0 if the change is allowed, a negative error code otherwise.
 */
static int ima_inode_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                              const char *xattr_name, const void *xattr_value,
                              size_t xattr_value_len, int flags)
{
        const struct evm_ima_xattr_data *xvalue = xattr_value;
        int rc = ima_protect_xattr(dentry, xattr_name, xattr_value,
                                   xattr_value_len);
        const bool ima_xattr = (rc == 1);
        int digsig = 0;

        if (ima_xattr) {
                int err;

                if (!xattr_value_len || (xvalue->type >= IMA_XATTR_LAST))
                        return -EINVAL;

                err = validate_hash_algo(dentry, xvalue, xattr_value_len);
                if (err)
                        return err;

                digsig = (xvalue->type == EVM_IMA_XATTR_DIGSIG);
                /* a permitted security.ima write is reported as success */
                rc = 0;
        } else if (!strcmp(xattr_name, XATTR_NAME_EVM) && xattr_value_len > 0) {
                digsig = (xvalue->type == EVM_XATTR_PORTABLE_DIGSIG);
        }

        if (ima_xattr || evm_revalidate_status(xattr_name))
                ima_reset_appraise_flags(d_backing_inode(dentry), digsig);

        return rc;
}

/*
 * ima_inode_set_acl - LSM hook run before a POSIX ACL is set
 *
 * Resets the cached appraisal flags of the inode (marking it as not
 * signed) when EVM reports that @acl_name affects its status.
 * Always returns 0.
 */
static int ima_inode_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                             const char *acl_name, struct posix_acl *kacl)
{
        if (!evm_revalidate_status(acl_name))
                return 0;

        ima_reset_appraise_flags(d_backing_inode(dentry), 0);
        return 0;
}

static int ima_inode_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 const char *xattr_name)
{
        int result;

        result = ima_protect_xattr(dentry, xattr_name, NULL, 0);
        if (result == 1 || evm_revalidate_status(xattr_name)) {
                ima_reset_appraise_flags(d_backing_inode(dentry), 0);
                if (result == 1)
                        result = 0;
        }
        return result;
}

/*
 * ima_inode_remove_acl - LSM hook run before a POSIX ACL is removed
 *
 * Handled exactly like setting an ACL, with no ACL value (NULL).
 */
static int ima_inode_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                                const char *acl_name)
{
        return ima_inode_set_acl(idmap, dentry, acl_name, NULL);
}

/*
 * LSM hooks implemented by the IMA appraisal code: attribute, xattr and
 * ACL changes.  Registered by init_ima_appraise_lsm().
 */
static struct security_hook_list ima_appraise_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(inode_post_setattr, ima_inode_post_setattr),
        LSM_HOOK_INIT(inode_setxattr, ima_inode_setxattr),
        LSM_HOOK_INIT(inode_set_acl, ima_inode_set_acl),
        LSM_HOOK_INIT(inode_removexattr, ima_inode_removexattr),
        LSM_HOOK_INIT(inode_remove_acl, ima_inode_remove_acl),
};

/*
 * init_ima_appraise_lsm - register the IMA appraisal LSM hooks
 * @lsmid: LSM identity the hooks are registered under
 */
void __init init_ima_appraise_lsm(const struct lsm_id *lsmid)
{
        security_add_hooks(ima_appraise_hooks, ARRAY_SIZE(ima_appraise_hooks),
                           lsmid);
}































































  345 
  346 






  346 










  149 
  149 


  149 



















  461 
  462 














  460 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_RWSEM_H
#define _LINUX_PERCPU_RWSEM_H

#include <linux/atomic.h>
#include <linux/percpu.h>
#include <linux/rcuwait.h>
#include <linux/wait.h>
#include <linux/rcu_sync.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>

/*
 * Per-CPU reader/writer semaphore.
 *
 * Readers normally only touch their CPU's slot of @read_count; they leave
 * that fast path whenever rcu_sync_is_idle(&rss) is false.
 */
struct percpu_rw_semaphore {
        /* readers use the per-CPU fast path while this is idle */
        struct rcu_sync                rss;
        /* per-CPU reader count, incremented on down_read, decremented on up_read */
        unsigned int __percpu        *read_count;
        /* woken by percpu_up_read() on its slow path */
        struct rcuwait                writer;
        /* wait queue; its users are not in this header (presumably blocked lockers) */
        wait_queue_head_t        waiters;
        /* non-zero is reported as "write locked" by percpu_is_write_locked() */
        atomic_t                block;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        /* lockdep tracking, only present with CONFIG_DEBUG_LOCK_ALLOC */
        struct lockdep_map        dep_map;
#endif
};

/*
 * Initializer fragment for the lockdep map; expands to nothing when
 * CONFIG_DEBUG_LOCK_ALLOC is off.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)        .dep_map = { .name = #lockname },
#else
#define __PERCPU_RWSEM_DEP_MAP_INIT(lockname)
#endif

/*
 * Statically define a percpu_rw_semaphore called @name together with its
 * backing per-CPU reader counter (__percpu_rwsem_rc_<name>, always static).
 * @is_static is pasted in front of the semaphore definition, so it is
 * either "static" or empty.
 */
#define __DEFINE_PERCPU_RWSEM(name, is_static)                                \
static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name);                \
is_static struct percpu_rw_semaphore name = {                                \
        .rss = __RCU_SYNC_INITIALIZER(name.rss),                        \
        .read_count = &__percpu_rwsem_rc_##name,                        \
        .writer = __RCUWAIT_INITIALIZER(name.writer),                        \
        .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters),                \
        .block = ATOMIC_INIT(0),                                        \
        __PERCPU_RWSEM_DEP_MAP_INIT(name)                                \
}

/* Define a semaphore with external linkage / with internal linkage. */
#define DEFINE_PERCPU_RWSEM(name)                \
        __DEFINE_PERCPU_RWSEM(name, /* not static */)
#define DEFINE_STATIC_PERCPU_RWSEM(name)        \
        __DEFINE_PERCPU_RWSEM(name, static)

extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool);

/*
 * percpu_down_read - acquire @sem for reading
 *
 * May sleep (see might_sleep()).  The fast path only increments this CPU's
 * read_count; when the rcu_sync state is not idle the out-of-line
 * __percpu_down_read() is used instead.
 */
static inline void percpu_down_read(struct percpu_rw_semaphore *sem)
{
        might_sleep();

        rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);

        preempt_disable();
        /*
         * We are in an RCU-sched read-side critical section, so the writer
         * cannot both change sem->state from readers_fast and start checking
         * counters while we are here. So if we see !sem->state, we know that
         * the writer won't be checking until we're past the preempt_enable()
         * and that once the synchronize_rcu() is done, the writer will see
         * anything we did within this RCU-sched read-side critical section.
         *
         * NOTE(review): the structure has no 'state' member; this wording
         * presumably refers to the rcu_sync state tested just below.
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                __percpu_down_read(sem, false); /* Unconditional memory barrier */
        /*
         * The preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */
        preempt_enable();
}

/*
 * percpu_down_read_trylock - try to acquire @sem for reading
 *
 * Same fast path as percpu_down_read(); on the slow path the result of
 * __percpu_down_read(sem, true) decides.  Lockdep is only told about the
 * acquisition when it succeeded.
 *
 * Returns true if the read lock was taken.
 */
static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
        bool ret = true;

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss)))
                this_cpu_inc(*sem->read_count);
        else
                ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */
        preempt_enable();
        /*
         * The barrier() from preempt_enable() prevents the compiler from
         * bleeding the critical section out.
         */

        if (ret)
                rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);

        return ret;
}

/*
 * percpu_up_read - release a read lock on @sem
 *
 * The fast path only decrements this CPU's read_count.  When the rcu_sync
 * state is not idle, a full barrier is issued before the decrement and the
 * writer waiting on sem->writer is woken afterwards.
 */
static inline void percpu_up_read(struct percpu_rw_semaphore *sem)
{
        rwsem_release(&sem->dep_map, _RET_IP_);

        preempt_disable();
        /*
         * Same as in percpu_down_read().
         */
        if (likely(rcu_sync_is_idle(&sem->rss))) {
                this_cpu_dec(*sem->read_count);
        } else {
                /*
                 * slowpath; reader will only ever wake a single blocked
                 * writer.
                 */
                smp_mb(); /* B matches C */
                /*
                 * In other words, if they see our decrement (presumably to
                 * aggregate zero, as that is the only time it matters) they
                 * will also see our critical section.
                 */
                this_cpu_dec(*sem->read_count);
                rcuwait_wake_up(&sem->writer);
        }
        preempt_enable();
}

extern bool percpu_is_read_locked(struct percpu_rw_semaphore *);
extern void percpu_down_write(struct percpu_rw_semaphore *);
extern void percpu_up_write(struct percpu_rw_semaphore *);

/*
 * Scope-based guards: 'percpu_read' pairs percpu_down_read() with
 * percpu_up_read(), its '_try' variant uses percpu_down_read_trylock(),
 * and 'percpu_write' pairs percpu_down_write() with percpu_up_write().
 */
DEFINE_GUARD(percpu_read, struct percpu_rw_semaphore *,
             percpu_down_read(_T), percpu_up_read(_T))
DEFINE_GUARD_COND(percpu_read, _try, percpu_down_read_trylock(_T))

DEFINE_GUARD(percpu_write, struct percpu_rw_semaphore *,
             percpu_down_write(_T), percpu_up_write(_T))

/* Returns true while sem->block is non-zero. */
static inline bool percpu_is_write_locked(struct percpu_rw_semaphore *sem)
{
        const int blocked = atomic_read(&sem->block);

        return blocked != 0;
}

extern int __percpu_init_rwsem(struct percpu_rw_semaphore *,
                                const char *, struct lock_class_key *);

extern void percpu_free_rwsem(struct percpu_rw_semaphore *);

/*
 * Initialize @sem at run time.  Each expansion gets its own static
 * lock_class_key, and the stringified @sem expression is passed as the name.
 * Evaluates to the return value of __percpu_init_rwsem().
 */
#define percpu_init_rwsem(sem)                                        \
({                                                                \
        static struct lock_class_key rwsem_key;                        \
        __percpu_init_rwsem(sem, #sem, &rwsem_key);                \
})

/* Lockdep helpers: query / assert that @sem is held. */
#define percpu_rwsem_is_held(sem)        lockdep_is_held(sem)
#define percpu_rwsem_assert_held(sem)        lockdep_assert_held(sem)

/*
 * Tell lockdep that @sem is released, attributing it to @ip.  Only the
 * lockdep map is touched; the semaphore itself is not changed here.
 */
static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
                                        unsigned long ip)
{
        lock_release(&sem->dep_map, ip);
}

/*
 * Tell lockdep that @sem is acquired, attributing it to @ip; @read is
 * forwarded as the read/write indicator.  Only the lockdep map is touched;
 * the semaphore itself is not changed here.
 * NOTE(review): the remaining constant arguments (0, 1, 1, NULL) follow
 * lock_acquire()'s parameter list - check their meaning there.
 */
static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
                                        bool read, unsigned long ip)
{
        lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip);
}

#endif














   33 







































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Authors:        Thiébaud Weksteen <tweek@google.com>
 *                Peter Enderborg <Peter.Enderborg@sony.com>
 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM avc

#if !defined(_TRACE_SELINUX_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SELINUX_H

#include <linux/tracepoint.h>

/*
 * selinux_audited - trace event in the "avc" trace system.
 *
 * Records the requested, denied, audited and result fields of @sad together
 * with copies of the @scontext, @tcontext and @tclass strings, and prints
 * them as one "key=value" line (the three masks in hex, result as a signed
 * decimal).
 */
TRACE_EVENT(selinux_audited,

        TP_PROTO(struct selinux_audit_data *sad,
                char *scontext,
                char *tcontext,
                const char *tclass
        ),

        TP_ARGS(sad, scontext, tcontext, tclass),

        TP_STRUCT__entry(
                __field(u32, requested)
                __field(u32, denied)
                __field(u32, audited)
                __field(int, result)
                __string(scontext, scontext)
                __string(tcontext, tcontext)
                __string(tclass, tclass)
        ),

        TP_fast_assign(
                __entry->requested        = sad->requested;
                __entry->denied                = sad->denied;
                __entry->audited        = sad->audited;
                __entry->result                = sad->result;
                __assign_str(tcontext);
                __assign_str(scontext);
                __assign_str(tclass);
        ),

        TP_printk("requested=0x%x denied=0x%x audited=0x%x result=%d scontext=%s tcontext=%s tclass=%s",
                __entry->requested, __entry->denied, __entry->audited, __entry->result,
                __get_str(scontext), __get_str(tcontext), __get_str(tclass)
        )
);

#endif

/* This part must be outside protection */
#include <trace/define_trace.h>
























    3 
    3 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
 *
 *        2011-2014 Arvid Brodin, arvid.brodin@alten.se
 *
 * include file for HSR and PRP.
 */

#ifndef __HSR_SLAVE_H
#define __HSR_SLAVE_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include "hsr_main.h"

/*
 * Port management API, implemented outside this header.
 *
 * hsr_port_exists() is what the inline getters below consult before reading
 * dev->rx_handler_data as a struct hsr_port pointer.
 */
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
                 enum hsr_port_type pt, struct netlink_ext_ack *extack);
void hsr_del_port(struct hsr_port *port);
bool hsr_port_exists(const struct net_device *dev);

/*
 * Fetch the hsr_port attached to @dev.
 *
 * ASSERT_RTNL() is evaluated first, so callers are expected to hold RTNL.
 * Returns NULL when hsr_port_exists(@dev) is false; otherwise returns
 * dev->rx_handler_data as read through rtnl_dereference().
 */
static inline struct hsr_port *hsr_port_get_rtnl(const struct net_device *dev)
{
        ASSERT_RTNL();

        if (!hsr_port_exists(dev))
                return NULL;

        return rtnl_dereference(dev->rx_handler_data);
}

/*
 * Fetch the hsr_port attached to @dev, reading the pointer through
 * rcu_dereference().
 *
 * Returns NULL when hsr_port_exists(@dev) is false.
 * NOTE(review): presumably the caller must be inside an RCU read-side
 * critical section, as rcu_dereference() users normally are -- confirm.
 */
static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev)
{
        if (!hsr_port_exists(dev))
                return NULL;

        return rcu_dereference(dev->rx_handler_data);
}

bool hsr_invalid_dan_ingress_frame(__be16 protocol);

#endif /* __HSR_SLAVE_H */























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  317 









  317 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/sort.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/zswap.h>
#include <linux/plist.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
#include "internal.h"
#include "swap.h"

/*
 * Forward declarations of file-local (static) helpers, so that code placed
 * ahead of their definitions can call them.
 */
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static void swap_entry_range_free(struct swap_info_struct *si,
                                  struct swap_cluster_info *ci,
                                  swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
                             unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                              unsigned long offset);
static inline void unlock_cluster(struct swap_cluster_info *ci);

static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;
unsigned long swapfile_maximum_size;
#ifdef CONFIG_MIGRATION
bool swap_migration_ad_supported;
#endif        /* CONFIG_MIGRATION */

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
static PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by folio_alloc_swap() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but folio_alloc_swap() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

static struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

/*
 * Per-CPU swap cluster record (one instance per CPU, defined just below).
 *
 * @si:     array of SWAP_NR_ORDERS swap_info_struct pointers.
 * @offset: array of SWAP_NR_ORDERS offsets, parallel to @si.
 * @lock:   local_lock_t belonging to this per-CPU record.
 *
 * NOTE(review): both arrays are presumably indexed by allocation order and
 * @lock presumably serialises access to them on the local CPU -- confirm
 * against the users of this structure.
 */
struct percpu_swap_cluster {
        struct swap_info_struct *si[SWAP_NR_ORDERS];
        unsigned long offset[SWAP_NR_ORDERS];
        local_lock_t lock;
};

/*
 * The initialisers "{ NULL }" and "{ SWAP_ENTRY_INVALID }" name only element
 * 0; the remaining array elements are implicitly zero-initialised.
 * NOTE(review): that yields SWAP_ENTRY_INVALID in every slot only if the
 * constant is 0 -- confirm.
 */
static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
        .si = { NULL },
        .offset = { SWAP_ENTRY_INVALID },
        .lock = INIT_LOCAL_LOCK(),
};

/*
 * Look up the swap_info_struct registered for swap type @type.
 *
 * Returns NULL when @type lies outside [0, MAX_SWAPFILES). Otherwise returns
 * the current content of swap_info[@type], sampled once with READ_ONCE();
 * that slot may itself hold NULL.
 */
static struct swap_info_struct *swap_type_to_swap_info(int type)
{
        /*
         * @type is a signed int: a negative value would index in front of
         * swap_info[], so reject it along with too-large values.
         */
        if (type < 0 || type >= MAX_SWAPFILES)
                return NULL;

        return READ_ONCE(swap_info[type]); /* rcu_dereference() */
}

/*
 * Return swap_map byte @ent with the SWAP_HAS_CACHE bit cleared.
 * All other bits are passed through untouched.
 */
static inline unsigned char swap_count(unsigned char ent)
{
        /* may include COUNT_CONTINUED flag */
        unsigned char without_cache = ent & ~SWAP_HAS_CACHE;

        return without_cache;
}

/*
 * Use the second highest bit of inuse_pages counter as the indicator
 * if one swap device is on the available plist, so the atomic can
 * still be updated arithmetically while having special data embedded.
 *
 * inuse_pages counter is the only thing indicating if a device should
 * be on avail_lists or not (except swapon / swapoff). By embedding the
 * off-list bit in the atomic counter, updates no longer need any lock
 * to check the list status.
 *
 * This bit will be set if the device is not on the plist and not
 * usable, will be cleared if the device is on the plist.
 */
#define SWAP_USAGE_OFFLIST_BIT (1UL << (BITS_PER_TYPE(atomic_t) - 2))
#define SWAP_USAGE_COUNTER_MASK (~SWAP_USAGE_OFFLIST_BIT)
/*
 * Return @si->inuse_pages with SWAP_USAGE_OFFLIST_BIT masked out, i.e. only
 * the counter part of the atomic without the embedded off-list flag.
 */
static long swap_usage_in_pages(struct swap_info_struct *si)
{
        long raw = atomic_long_read(&si->inuse_pages);

        return raw & SWAP_USAGE_COUNTER_MASK;
}

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY                0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED                0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL                0x4

/*
 * Check whether each of the @nr_pages swap_map bytes starting at @offset is
 * exactly SWAP_HAS_CACHE (cache flag set, nothing else).
 *
 * Returns false at the first byte that differs, true otherwise. Every
 * inspected byte is expected to have SWAP_HAS_CACHE set (VM_BUG_ON
 * otherwise). The first byte is always inspected, whatever @nr_pages is.
 */
static bool swap_only_has_cache(struct swap_info_struct *si,
                              unsigned long offset, int nr_pages)
{
        unsigned char *slot = si->swap_map + offset;
        unsigned char *const end = slot + nr_pages;

        for (;;) {
                VM_BUG_ON(!(*slot & SWAP_HAS_CACHE));
                if (*slot != SWAP_HAS_CACHE)
                        return false;
                /* advance; stop once the end of the range is reached */
                if (++slot >= end)
                        break;
        }

        return true;
}

/*
 * Return true if the first slot at @offset has a swap count of exactly 1
 * (ignoring SWAP_HAS_CACHE) and every other slot in the @nr_pages range
 * holds the same swap_map byte. On success *has_cache reports whether that
 * byte has SWAP_HAS_CACHE set; on failure *has_cache is left untouched.
 */
static bool swap_is_last_map(struct swap_info_struct *si,
                unsigned long offset, int nr_pages, bool *has_cache)
{
        unsigned char *slots = si->swap_map + offset;
        unsigned char first = slots[0];
        int i;

        if (swap_count(first) != 1)
                return false;

        for (i = 1; i < nr_pages; i++) {
                if (slots[i] != first)
                        return false;
        }

        *has_cache = !!(first & SWAP_HAS_CACHE);
        return true;
}

/*
 * Try to drop the swap cache folio that covers @offset on @si.
 *
 * @flags is a mask of TTRS_* values selecting when reclaim is wanted.
 *
 * Returns number of pages in the folio that backs the swap entry. If positive,
 * the folio was reclaimed. If negative, the folio was not reclaimed. If 0, no
 * folio was associated with the swap entry.
 */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
                                 unsigned long offset, unsigned long flags)
{
        /*
         * The entry used for the swap cache lookup. It must stay the one the
         * caller asked about, so that a retry does not search for whatever a
         * stale folio happened to point at.
         */
        swp_entry_t entry = swp_entry(si->type, offset);
        struct address_space *address_space = swap_address_space(entry);
        struct swap_cluster_info *ci;
        swp_entry_t folio_entry;
        struct folio *folio;
        int ret, nr_pages;
        bool need_reclaim;

again:
        folio = filemap_get_folio(address_space, swap_cache_index(entry));
        if (IS_ERR(folio))
                return 0;

        nr_pages = folio_nr_pages(folio);
        ret = -nr_pages;

        /*
         * When this function is called from scan_swap_map_slots() and it's
         * called by vmscan.c at reclaiming folios. So we hold a folio lock
         * here. We have to use trylock for avoiding deadlock. This is a special
         * case and you should use folio_free_swap() with explicit folio_lock()
         * in usual operations.
         */
        if (!folio_trylock(folio))
                goto out;

        /*
         * Offset could point to the middle of a large folio, or folio
         * may no longer point to the expected offset before it's locked.
         * In the latter case drop it and look the original entry up again.
         */
        folio_entry = folio->swap;
        if (offset < swp_offset(folio_entry) ||
            offset >= swp_offset(folio_entry) + nr_pages) {
                folio_unlock(folio);
                folio_put(folio);
                goto again;
        }
        /* From here on work with the first slot of the folio. */
        offset = swp_offset(folio_entry);

        need_reclaim = ((flags & TTRS_ANYWAY) ||
                        ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
                        ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
        if (!need_reclaim || !folio_swapcache_freeable(folio))
                goto out_unlock;

        /*
         * It's safe to delete the folio from swap cache only if the folio's
         * swap_map is HAS_CACHE only, which means the slots have no page table
         * reference or pending writeback, and can't be allocated to others.
         */
        ci = lock_cluster(si, offset);
        need_reclaim = swap_only_has_cache(si, offset, nr_pages);
        unlock_cluster(ci);
        if (!need_reclaim)
                goto out_unlock;

        delete_from_swap_cache(folio);
        folio_set_dirty(folio);
        ret = nr_pages;
out_unlock:
        folio_unlock(folio);
out:
        folio_put(folio);
        return ret;
}

/* Return the swap_extent at the leftmost node of sis->swap_extent_root. */
static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
        return rb_entry(rb_first(&sis->swap_extent_root),
                        struct swap_extent, rb_node);
}

/* Return the extent following @se in tree order, or NULL if @se is last. */
static inline struct swap_extent *next_se(struct swap_extent *se)
{
        struct rb_node *succ = rb_next(&se->rb_node);

        if (!succ)
                return NULL;

        return rb_entry(succ, struct swap_extent, rb_node);
}

/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
        struct swap_extent *se;
        sector_t start_block;
        sector_t nr_blocks;
        int err = 0;

        /* Do not discard the swap header page! */
        se = first_se(si);
        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
        if (nr_blocks) {
                err = blkdev_issue_discard(si->bdev, start_block,
                                nr_blocks, GFP_KERNEL);
                if (err)
                        return err;
                cond_resched();
        }

        for (se = next_se(se); se; se = next_se(se)) {
                start_block = se->start_block << (PAGE_SHIFT - 9);
                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

                err = blkdev_issue_discard(si->bdev, start_block,
                                nr_blocks, GFP_KERNEL);
                if (err)
                        break;

                cond_resched();
        }
        return err;                /* That will often be -EOPNOTSUPP */
}

/*
 * Find the swap extent whose page range contains @offset by walking the
 * extent rbtree of @sis. A missing extent is treated as a fatal bug.
 */
static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
        struct rb_node *node = sis->swap_extent_root.rb_node;

        while (node) {
                struct swap_extent *se = rb_entry(node, struct swap_extent,
                                                  rb_node);

                if (offset < se->start_page) {
                        node = node->rb_left;
                        continue;
                }
                if (offset >= se->start_page + se->nr_pages) {
                        node = node->rb_right;
                        continue;
                }
                return se;
        }
        /* It *must* be present */
        BUG();
}

/*
 * Translate the swap entry of @folio into a sector number: locate the extent
 * holding the entry's offset, take the matching block inside it, and convert
 * from page-sized blocks to 512-byte units.
 */
sector_t swap_folio_sector(struct folio *folio)
{
        struct swap_info_struct *sis = swp_swap_info(folio->swap);
        pgoff_t offset = swp_offset(folio->swap);
        struct swap_extent *se = offset_to_swap_extent(sis, offset);
        sector_t sector = se->start_block + (offset - se->start_page);

        return sector << (PAGE_SHIFT - 9);
}

/*
 * During swap allocation, tell the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 *
 * The range [start_page, start_page + nr_pages) is walked extent by extent;
 * the walk stops at the first discard that reports an error.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
                                 pgoff_t start_page, pgoff_t nr_pages)
{
        struct swap_extent *se;

        for (se = offset_to_swap_extent(si, start_page); nr_pages;
             se = next_se(se)) {
                pgoff_t skip = start_page - se->start_page;
                sector_t first_block = se->start_block + skip;
                sector_t count = se->nr_pages - skip;

                /* Never go past what the caller asked for. */
                if (count > nr_pages)
                        count = nr_pages;
                start_page += count;
                nr_pages -= count;

                /* Convert page-sized blocks to 512-byte units. */
                first_block <<= PAGE_SHIFT - 9;
                count <<= PAGE_SHIFT - 9;
                if (blkdev_issue_discard(si->bdev, first_block,
                                        count, GFP_NOIO))
                        break;
        }
}

#ifdef CONFIG_THP_SWAP
/* Number of swap slots per cluster. */
#define SWAPFILE_CLUSTER        HPAGE_PMD_NR

#define swap_entry_order(order)        (order)
#else
/* Number of swap slots per cluster. */
#define SWAPFILE_CLUSTER        256

/*
 * Define swap_entry_order() as a constant so that the compiler can
 * optimize out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_order(order)        0
#endif
#define LATENCY_LIMIT                256

/* A cluster is empty when its usage counter is zero. */
static inline bool cluster_is_empty(struct swap_cluster_info *info)
{
        return !info->count;
}

static inline bool cluster_is_discard(struct swap_cluster_info *info)
{
        return info->flags == CLUSTER_FLAG_DISCARD;
}

/*
 * Decide whether @ci may serve an allocation of @order. Clusters whose flags
 * are above CLUSTER_FLAG_USABLE never qualify. Order 0 can use any remaining
 * cluster; a higher order needs an empty cluster or one already dedicated to
 * the same order.
 */
static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
{
        if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
                return false;

        return !order || cluster_is_empty(ci) || order == ci->order;
}

static inline unsigned int cluster_index(struct swap_info_struct *si,
                                         struct swap_cluster_info *ci)
{
        return ci - si->cluster_info;
}

/* Return the cluster descriptor that covers swap slot @offset. */
static inline struct swap_cluster_info *offset_to_cluster(struct swap_info_struct *si,
                                                          unsigned long offset)
{
        unsigned long idx = offset / SWAPFILE_CLUSTER;

        return si->cluster_info + idx;
}

/* Return the swap offset of the first slot belonging to cluster @ci. */
static inline unsigned int cluster_offset(struct swap_info_struct *si,
                                          struct swap_cluster_info *ci)
{
        unsigned int idx = cluster_index(si, ci);

        return idx * SWAPFILE_CLUSTER;
}

/*
 * Take the spinlock of the cluster covering @offset and return that cluster,
 * so the caller can later release it with unlock_cluster().
 */
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                     unsigned long offset)
{
        struct swap_cluster_info *ci = offset_to_cluster(si, offset);

        spin_lock(&ci->lock);
        return ci;
}

/* Release the spinlock of cluster @ci, e.g. one returned by lock_cluster(). */
static inline void unlock_cluster(struct swap_cluster_info *ci)
{
        spin_unlock(&ci->lock);
}

/*
 * Put @ci at the tail of @list and record @new_flags as its state.
 *
 * The caller must hold ci->lock; si->lock is taken here only around the list
 * manipulation. A cluster whose flags are CLUSTER_FLAG_NONE is treated as
 * not being on any list and is added; any other cluster is moved from the
 * list it is currently on. The per-order frag_cluster_nr counter is adjusted
 * when the cluster leaves or enters the CLUSTER_FLAG_FRAG state.
 */
static void move_cluster(struct swap_info_struct *si,
                         struct swap_cluster_info *ci, struct list_head *list,
                         enum swap_cluster_flags new_flags)
{
        /* Moving a cluster to the state it is already in is unexpected. */
        VM_WARN_ON(ci->flags == new_flags);

        /* ci->flags must be wide enough to represent every flag value. */
        BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
        lockdep_assert_held(&ci->lock);

        spin_lock(&si->lock);
        if (ci->flags == CLUSTER_FLAG_NONE)
                list_add_tail(&ci->list, list);
        else
                list_move_tail(&ci->list, list);
        spin_unlock(&si->lock);

        /* The old flag is still in place here; it is replaced last. */
        if (ci->flags == CLUSTER_FLAG_FRAG)
                atomic_long_dec(&si->frag_cluster_nr[ci->order]);
        else if (new_flags == CLUSTER_FLAG_FRAG)
                atomic_long_inc(&si->frag_cluster_nr[ci->order]);
        ci->flags = new_flags;
}

/*
 * Add a cluster to the discard list and schedule the discard work for it.
 * Must not be called for a cluster flagged CLUSTER_FLAG_FREE.
 */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
                struct swap_cluster_info *ci)
{
        VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
        move_cluster(si, ci, &si->discard_clusters, CLUSTER_FLAG_DISCARD);
        schedule_work(&si->discard_work);
}

/*
 * Move @ci to the free cluster list and reset its order to 0.
 * The caller must hold ci->lock.
 */
static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
        lockdep_assert_held(&ci->lock);
        move_cluster(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
        ci->order = 0;
}

/*
 * Isolate and lock the first cluster on a list whose lock is not contended,
 * and clear its flag before it is taken off-list. The cluster flag must be
 * in sync with the list status, so cluster updaters can always know the
 * cluster list status without touching si lock.
 *
 * Note it's possible that all clusters on a list are contended, so
 * this returns NULL for a non-empty list. NULL is also returned when the
 * device does not have SWP_WRITEOK set.
 *
 * On success the returned cluster has its ci->lock held.
 */
static struct swap_cluster_info *isolate_lock_cluster(
                struct swap_info_struct *si, struct list_head *list)
{
        struct swap_cluster_info *ci, *ret = NULL;

        spin_lock(&si->lock);

        if (unlikely(!(si->flags & SWP_WRITEOK)))
                goto out;

        list_for_each_entry(ci, list, list) {
                /* Skip clusters somebody else is working on. */
                if (!spin_trylock(&ci->lock))
                        continue;

                /* We may only isolate and clear flags of following lists */
                VM_BUG_ON(!ci->flags);
                VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
                          ci->flags != CLUSTER_FLAG_FULL);

                list_del(&ci->list);
                ci->flags = CLUSTER_FLAG_NONE;
                ret = ci;
                break;
        }
out:
        spin_unlock(&si->lock);

        return ret;
}

/*
 * Actually perform the discards. After a cluster discard is finished, the
 * cluster is added to the free cluster list. Discard clusters are a bit
 * special as they don't participate in allocation or reclaim, so clusters
 * marked as CLUSTER_FLAG_DISCARD must remain off-list or on the discard list.
 *
 * Returns true if at least one cluster was discarded and freed.
 */
static bool swap_do_scheduled_discard(struct swap_info_struct *si)
{
        struct swap_cluster_info *ci;
        bool ret = false;
        unsigned int idx;

        spin_lock(&si->lock);
        while (!list_empty(&si->discard_clusters)) {
                ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
                /*
                 * Delete the cluster from list to prepare for discard, but keep
                 * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
                 * pointing to it, or ran into by relocate_cluster.
                 */
                list_del(&ci->list);
                idx = cluster_index(si, ci);
                /* si->lock is not held across the discard itself. */
                spin_unlock(&si->lock);
                discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
                                SWAPFILE_CLUSTER);

                spin_lock(&ci->lock);
                /*
                 * Discard is done, clear its flags as it's off-list, then
                 * return the cluster to allocation list.
                 */
                ci->flags = CLUSTER_FLAG_NONE;
                __free_cluster(si, ci);
                spin_unlock(&ci->lock);
                ret = true;
                spin_lock(&si->lock);
        }
        spin_unlock(&si->lock);
        return ret;
}

/*
 * Work handler for si->discard_work: run the scheduled cluster discards of
 * the swap device that embeds @work.
 */
static void swap_discard_work(struct work_struct *work)
{
        struct swap_info_struct *si = container_of(work,
                                                   struct swap_info_struct,
                                                   discard_work);

        swap_do_scheduled_discard(si);
}

/*
 * percpu_ref callback for si->users: signal si->comp on the swap device
 * that embeds @ref.
 */
static void swap_users_ref_free(struct percpu_ref *ref)
{
        struct swap_info_struct *si = container_of(ref,
                                                   struct swap_info_struct,
                                                   users);

        complete(&si->comp);
}

/*
 * Must be called after freeing if ci->count == 0, moves the cluster to free
 * or discard list.
 */
static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
        VM_BUG_ON(ci->count != 0);
        VM_BUG_ON(ci->flags == CLUSTER_FLAG_FREE);
        lockdep_assert_held(&ci->lock);

        /*
         * If the swap is discardable, prepare discard the cluster
         * instead of free it immediately. The cluster will be freed
         * after discard.
         */
        if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
            (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
                swap_cluster_schedule_discard(si, ci);
                return;
        }

        __free_cluster(si, ci);
}

/*
 * Must be called after freeing if ci->count != 0, with ci->lock held. Makes
 * sure the cluster sits on the nonfull list of its order.
 */
static void partial_free_cluster(struct swap_info_struct *si,
                                 struct swap_cluster_info *ci)
{
        VM_BUG_ON(!ci->count || ci->count == SWAPFILE_CLUSTER);
        lockdep_assert_held(&ci->lock);

        /* Already flagged as nonfull: nothing to move. */
        if (ci->flags == CLUSTER_FLAG_NONFULL)
                return;

        move_cluster(si, ci, &si->nonfull_clusters[ci->order],
                     CLUSTER_FLAG_NONFULL);
}

/*
 * Must be called after allocation, with ci->lock held. Puts the cluster on
 * the list matching its usage: free/discard when empty, frag when partially
 * used, full when every slot is taken.
 * Note: allocation doesn't acquire si lock, and may drop the ci lock for
 * reclaim, so the cluster could be anywhere when this is called.
 */
static void relocate_cluster(struct swap_info_struct *si,
                             struct swap_cluster_info *ci)
{
        lockdep_assert_held(&ci->lock);

        /* Discard cluster must remain off-list or on discard list */
        if (cluster_is_discard(ci))
                return;

        if (!ci->count) {
                if (ci->flags != CLUSTER_FLAG_FREE)
                        free_cluster(si, ci);
                return;
        }

        if (ci->count != SWAPFILE_CLUSTER) {
                if (ci->flags != CLUSTER_FLAG_FRAG)
                        move_cluster(si, ci, &si->frag_clusters[ci->order],
                                     CLUSTER_FLAG_FRAG);
                return;
        }

        if (ci->flags != CLUSTER_FLAG_FULL)
                move_cluster(si, ci, &si->full_clusters, CLUSTER_FLAG_FULL);
}

/*
 * Account one more used slot in the cluster that contains page_nr. The
 * cluster is not added to the free cluster list; only its usage counter is
 * increased by 1. Only used for initialization.
 */
static void inc_cluster_info_page(struct swap_info_struct *si,
        struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
        struct swap_cluster_info *ci = &cluster_info[page_nr / SWAPFILE_CLUSTER];

        ci->count++;

        /* The counter cannot exceed the cluster size, and no flag is set. */
        VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
        VM_BUG_ON(ci->flags);
}

/*
 * Try to reclaim the swap-cache-only slots in [start, end).
 *
 * Entered with ci->lock held; the lock is dropped while reclaiming and taken
 * again before returning. Returns true only if every slot of the range reads
 * as zero after the lock has been reacquired.
 */
static bool cluster_reclaim_range(struct swap_info_struct *si,
                                  struct swap_cluster_info *ci,
                                  unsigned long start, unsigned long end)
{
        unsigned char *map = si->swap_map;
        unsigned long offset = start;
        int nr_reclaim;

        spin_unlock(&ci->lock);
        do {
                switch (READ_ONCE(map[offset])) {
                case 0:
                        /* Already free, nothing to reclaim. */
                        offset++;
                        break;
                case SWAP_HAS_CACHE:
                        /* Only the swap cache holds this slot: try to drop it. */
                        nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                        if (nr_reclaim > 0)
                                offset += nr_reclaim;
                        else
                                goto out;
                        break;
                default:
                        /* The slot is in use, the range cannot be emptied. */
                        goto out;
                }
        } while (offset < end);
out:
        spin_lock(&ci->lock);
        /*
         * Recheck the range no matter whether reclaim succeeded or not, the
         * slots could have been freed while we were not holding the lock.
         */
        for (offset = start; offset < end; offset++)
                if (READ_ONCE(map[offset]))
                        return false;

        return true;
}

/*
 * Check whether the @nr_pages slots starting at @start can be allocated.
 *
 * An empty cluster always qualifies. Otherwise every slot must be free, or be
 * held by the swap cache only while vm_swap_full() is true; in that case
 * *need_reclaim is set to true. *need_reclaim is never cleared here.
 */
static bool cluster_scan_range(struct swap_info_struct *si,
                               struct swap_cluster_info *ci,
                               unsigned long start, unsigned int nr_pages,
                               bool *need_reclaim)
{
        unsigned char *map = si->swap_map;
        unsigned long end = start + nr_pages;
        unsigned long offset;

        if (cluster_is_empty(ci))
                return true;

        for (offset = start; offset < end; offset++) {
                unsigned char ent = READ_ONCE(map[offset]);

                if (!ent)
                        continue;
                /* Anything but a cache-only slot blocks the range. */
                if (ent != SWAP_HAS_CACHE)
                        return false;
                if (!vm_swap_full())
                        return false;
                *need_reclaim = true;
        }

        return true;
}

/*
 * Claim 1 << @order slots starting at @start in cluster @ci by writing @usage
 * into their swap_map bytes and updating the usage accounting. The caller
 * must hold ci->lock. Returns false, leaving everything untouched, when the
 * device does not have SWP_WRITEOK set.
 */
static bool cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
                                unsigned int start, unsigned char usage,
                                unsigned int order)
{
        unsigned int nr_pages = 1 << order;

        lockdep_assert_held(&ci->lock);

        if (!(si->flags & SWP_WRITEOK))
                return false;

        /*
         * The first allocation in a cluster makes the
         * cluster exclusive to this order
         */
        if (ci->count == 0)
                ci->order = order;

        memset(&si->swap_map[start], usage, nr_pages);
        swap_range_alloc(si, nr_pages);
        ci->count += nr_pages;

        return true;
}

/*
 * Try to use a new cluster for the current CPU and allocate from it.
 *
 * Scans cluster @ci from @offset in steps of 1 << @order slots and allocates
 * the first suitable range with @usage. Entered with ci->lock held; the lock
 * is released before returning. Whatever the outcome, the cluster is
 * relocated to the list matching its usage and the next scan position is
 * recorded: per CPU for SWP_SOLIDSTATE devices, in si->global_cluster
 * otherwise.
 *
 * Returns the allocated offset, or SWAP_ENTRY_INVALID if nothing was
 * allocated.
 */
static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
                                            struct swap_cluster_info *ci,
                                            unsigned long offset,
                                            unsigned int order,
                                            unsigned char usage)
{
        unsigned int next = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
        unsigned long start = ALIGN_DOWN(offset, SWAPFILE_CLUSTER);
        unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
        unsigned int nr_pages = 1 << order;
        bool need_reclaim, ret;

        lockdep_assert_held(&ci->lock);

        /* Bail out early if the cluster cannot hold nr_pages more slots. */
        if (end < nr_pages || ci->count + nr_pages > SWAPFILE_CLUSTER)
                goto out;

        /* From here on, end is the last offset a range may start at. */
        for (end -= nr_pages; offset <= end; offset += nr_pages) {
                need_reclaim = false;
                if (!cluster_scan_range(si, ci, offset, nr_pages, &need_reclaim))
                        continue;
                if (need_reclaim) {
                        ret = cluster_reclaim_range(si, ci, offset, offset + nr_pages);
                        /*
                         * Reclaim drops ci->lock and cluster could be used
                         * by another order. Not checking flag as off-list
                         * cluster has no flag set, and change of list
                         * won't cause fragmentation.
                         */
                        if (!cluster_is_usable(ci, order))
                                goto out;
                        if (cluster_is_empty(ci))
                                offset = start;
                        /* Reclaim failed but cluster is usable, try next */
                        if (!ret)
                                continue;
                }
                if (!cluster_alloc_range(si, ci, offset, usage, order))
                        break;
                found = offset;
                offset += nr_pages;
                /* Remember where to continue if the cluster still has room. */
                if (ci->count < SWAPFILE_CLUSTER && offset <= end)
                        next = offset;
                break;
        }
out:
        relocate_cluster(si, ci);
        unlock_cluster(ci);
        if (si->flags & SWP_SOLIDSTATE) {
                this_cpu_write(percpu_swap_cluster.offset[order], next);
                this_cpu_write(percpu_swap_cluster.si[order], si);
        } else {
                si->global_cluster->next[order] = next;
        }
        return found;
}

/*
 * Walk clusters taken from si->full_clusters and try to reclaim every slot
 * that is held by the swap cache only.
 *
 * Without @force a single cluster is scanned. With @force the budget is the
 * number of in-use pages divided by the cluster size. At least one cluster is
 * processed whenever one can be isolated.
 */
static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
{
        long to_scan = 1;
        unsigned long offset, end;
        struct swap_cluster_info *ci;
        unsigned char *map = si->swap_map;
        int nr_reclaim;

        if (force)
                to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;

        while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
                offset = cluster_offset(si, ci);
                end = min(si->max, offset + SWAPFILE_CLUSTER);
                to_scan--;

                while (offset < end) {
                        if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
                                /* ci->lock is dropped around the reclaim attempt. */
                                spin_unlock(&ci->lock);
                                nr_reclaim = __try_to_reclaim_swap(si, offset,
                                                                   TTRS_ANYWAY);
                                spin_lock(&ci->lock);
                                /*
                                 * A non-zero result means a folio was found,
                                 * reclaimed or not: skip its size in slots.
                                 */
                                if (nr_reclaim) {
                                        offset += abs(nr_reclaim);
                                        continue;
                                }
                        }
                        offset++;
                }

                /* in case no swap cache is reclaimed */
                if (ci->flags == CLUSTER_FLAG_NONE)
                        relocate_cluster(si, ci);

                unlock_cluster(ci);
                if (to_scan <= 0)
                        break;
        }
}

/*
 * Work handler for si->reclaim_work: run a forced reclaim pass over the full
 * clusters of the swap device that embeds @work.
 */
static void swap_reclaim_work(struct work_struct *work)
{
        struct swap_info_struct *si = container_of(work,
                                                   struct swap_info_struct,
                                                   reclaim_work);

        swap_reclaim_full_clusters(si, true);
}

/*
 * Try to allocate swap entries with specified order and try set a new
 * cluster for current CPU too.
 *
 * Candidate clusters are tried in this sequence: the device-wide "next"
 * cluster (non-SWP_SOLIDSTATE devices only), free clusters, nonfull and
 * then frag clusters of the same order, clusters waiting for discard and,
 * for order 0 only, frag and nonfull clusters of higher orders.
 *
 * Returns the allocated offset, or SWAP_ENTRY_INVALID (treated as "not
 * found" by the checks below) if nothing could be allocated.
 */
static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
                                              unsigned char usage)
{
        struct swap_cluster_info *ci;
        unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;

        /*
         * Swapfile is not block device so unable
         * to allocate large entries.
         */
        if (order && !(si->flags & SWP_BLKDEV))
                return 0;

        if (!(si->flags & SWP_SOLIDSTATE)) {
                /* Serialize HDD SWAP allocation for each device. */
                spin_lock(&si->global_cluster_lock);
                offset = si->global_cluster->next[order];
                if (offset == SWAP_ENTRY_INVALID)
                        goto new_cluster;

                ci = lock_cluster(si, offset);
                /* Cluster could have been used by another order */
                if (cluster_is_usable(ci, order)) {
                        if (cluster_is_empty(ci))
                                offset = cluster_offset(si, ci);
                        /* alloc_swap_scan_cluster() releases ci->lock. */
                        found = alloc_swap_scan_cluster(si, ci, offset,
                                                        order, usage);
                } else {
                        unlock_cluster(ci);
                }
                if (found)
                        goto done;
        }

new_cluster:
        ci = isolate_lock_cluster(si, &si->free_clusters);
        if (ci) {
                found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
                                                order, usage);
                if (found)
                        goto done;
        }

        /* Try reclaim from full clusters if free clusters list is drained */
        if (vm_swap_full())
                swap_reclaim_full_clusters(si, false);

        if (order < PMD_ORDER) {
                unsigned int frags = 0, frags_existing;

                while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[order]))) {
                        found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
                                                        order, usage);
                        if (found)
                                goto done;
                        /* Clusters failed to allocate are moved to frag_clusters */
                        frags++;
                }

                /*
                 * Only scan as many frag clusters as existed beyond the ones
                 * counted above.
                 */
                frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
                while (frags < frags_existing &&
                       (ci = isolate_lock_cluster(si, &si->frag_clusters[order]))) {
                        atomic_long_dec(&si->frag_cluster_nr[order]);
                        /*
                         * Rotate the frag list to iterate, they were all
                         * failing high order allocation or moved here due to
                         * per-CPU usage, but they could contain newly released
                         * reclaimable (eg. lazy-freed swap cache) slots.
                         */
                        found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
                                                        order, usage);
                        if (found)
                                goto done;
                        frags++;
                }
        }

        /*
         * We don't have free cluster but have some clusters in
         * discarding, do discard now and reclaim them, then
         * retry from the free cluster list.
         */
        if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
                goto new_cluster;

        if (order)
                goto done;

        /* Order 0 stealing from higher order */
        for (int o = 1; o < SWAP_NR_ORDERS; o++) {
                /*
                 * Clusters here have at least one usable slots and can't fail order 0
                 * allocation, but reclaim may drop ci->lock and race with another user.
                 */
                while ((ci = isolate_lock_cluster(si, &si->frag_clusters[o]))) {
                        atomic_long_dec(&si->frag_cluster_nr[o]);
                        found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
                                                        0, usage);
                        if (found)
                                goto done;
                }

                while ((ci = isolate_lock_cluster(si, &si->nonfull_clusters[o]))) {
                        found = alloc_swap_scan_cluster(si, ci, cluster_offset(si, ci),
                                                        0, usage);
                        if (found)
                                goto done;
                }
        }
done:
        /* Pairs with the spin_lock() taken on the non-SWP_SOLIDSTATE path. */
        if (!(si->flags & SWP_SOLIDSTATE))
                spin_unlock(&si->global_cluster_lock);
        return found;
}

/*
 * Take @si off the per-node available plists and mark it off-list.
 * SWAP_USAGE_OFFLIST_BIT can only be set by this helper.
 *
 * @swapoff: true when called for swapoff (si->lock must be held); the device
 *           is then removed unconditionally and SWP_WRITEOK is cleared.
 *           Otherwise the device is removed only if it is exactly full and
 *           still on-list.
 */
static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
{
        int nid;
        unsigned long pages;

        spin_lock(&swap_avail_lock);

        if (swapoff) {
                /*
                 * Forcefully remove it. Clear the SWP_WRITEOK flags for
                 * swapoff here so it's synchronized by both si->lock and
                 * swap_avail_lock, to ensure the result can be seen by
                 * add_to_avail_list.
                 */
                lockdep_assert_held(&si->lock);
                si->flags &= ~SWP_WRITEOK;
                atomic_long_or(SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);
        } else {
                /*
                 * If not called by swapoff, take it off-list only if it's
                 * full and SWAP_USAGE_OFFLIST_BIT is not set (strictly
                 * si->inuse_pages == pages), any concurrent slot freeing,
                 * or device already removed from plist by someone else
                 * will make this return false.
                 */
                pages = si->pages;
                if (!atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
                                             pages | SWAP_USAGE_OFFLIST_BIT))
                        goto skip;
        }

        for_each_node(nid)
                plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);

skip:
        spin_unlock(&swap_avail_lock);
}

/*
 * Put @si back on the per-node available plists if it is currently marked
 * off-list. SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper.
 *
 * @swapon: true when called for swapon (si->lock must be held); SWP_WRITEOK
 *          is then set. Otherwise nothing is done unless SWP_WRITEOK is
 *          already set.
 */
static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
{
        int nid;
        long val;
        unsigned long pages;

        spin_lock(&swap_avail_lock);

        /* Corresponding to SWP_WRITEOK clearing in del_from_avail_list */
        if (swapon) {
                lockdep_assert_held(&si->lock);
                si->flags |= SWP_WRITEOK;
        } else {
                if (!(READ_ONCE(si->flags) & SWP_WRITEOK))
                        goto skip;
        }

        /* Off-list bit not set: nothing to add back. */
        if (!(atomic_long_read(&si->inuse_pages) & SWAP_USAGE_OFFLIST_BIT))
                goto skip;

        /* val is the counter value from before the off-list bit was cleared. */
        val = atomic_long_fetch_and_relaxed(~SWAP_USAGE_OFFLIST_BIT, &si->inuse_pages);

        /*
         * When device is full and device is on the plist, only one updater will
         * see (inuse_pages == si->pages) and will call del_from_avail_list. If
         * that updater happen to be here, just skip adding.
         */
        pages = si->pages;
        if (val == pages) {
                /* Just like the cmpxchg in del_from_avail_list */
                if (atomic_long_try_cmpxchg(&si->inuse_pages, &pages,
                                            pages | SWAP_USAGE_OFFLIST_BIT))
                        goto skip;
        }

        for_each_node(nid)
                plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);

skip:
        spin_unlock(&swap_avail_lock);
}

/*
 * swap_usage_add / swap_usage_sub of each slot are serialized by ci->lock
 * within each cluster, so the total contribution to the global counter should
 * always be positive and cannot exceed the total number of usable slots.
 *
 * Adds @nr_entries to si->inuse_pages. Returns true if the counter reached
 * si->pages, in which case removal from the available plist is attempted.
 */
static bool swap_usage_add(struct swap_info_struct *si, unsigned int nr_entries)
{
        long val = atomic_long_add_return_relaxed(nr_entries, &si->inuse_pages);

        if (!unlikely(val == si->pages))
                return false;

        /*
         * The device is full and SWAP_USAGE_OFFLIST_BIT is not set:
         * remove it from the plist.
         */
        del_from_avail_list(si, false);
        return true;
}

/*
 * Drop @nr_entries slots from the in-use counter of @si and, if the device
 * is currently marked off-list, hand it to add_to_avail_list().
 */
static void swap_usage_sub(struct swap_info_struct *si, unsigned int nr_entries)
{
        long inuse;

        inuse = atomic_long_sub_return_relaxed(nr_entries, &si->inuse_pages);

        /* Nothing more to do while the off-list marker is clear. */
        if (likely(!(inuse & SWAP_USAGE_OFFLIST_BIT)))
                return;

        /*
         * SWAP_USAGE_OFFLIST_BIT is set and slots were just released, so
         * the device is no longer full: add it to the plist.
         */
        add_to_avail_list(si, false);
}

/*
 * Account @nr_entries freshly allocated slots on @si. If that filled the
 * device and vm_swap_full() reports true, queue si->reclaim_work.
 */
static void swap_range_alloc(struct swap_info_struct *si,
                             unsigned int nr_entries)
{
        /* vm_swap_full() is only consulted when the device just filled up. */
        if (swap_usage_add(si, nr_entries) && vm_swap_full())
                schedule_work(&si->reclaim_work);
}

/*
 * Tear down per-slot state for the @nr_entries slots starting at @offset on
 * @si, then return them to the global and per-device usage counters.
 */
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
                            unsigned int nr_entries)
{
        const unsigned long first = offset;
        const unsigned long last = offset + nr_entries - 1;
        void (*notify)(struct block_device *, unsigned long) = NULL;
        unsigned long pos;
        unsigned int idx;

        /*
         * Use atomic clear_bit operations only on zeromap instead of non-atomic
         * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes.
         */
        for (idx = 0; idx < nr_entries; idx++) {
                clear_bit(first + idx, si->zeromap);
                zswap_invalidate(swp_entry(si->type, first + idx));
        }

        /* Only block-device backed swap may provide a free-notify hook. */
        if (si->flags & SWP_BLKDEV)
                notify = si->bdev->bd_disk->fops->swap_slot_free_notify;

        for (pos = first; pos <= last; pos++) {
                arch_swap_invalidate_page(si->type, pos);
                if (notify)
                        notify(si->bdev, pos);
        }
        clear_shadow_from_swap_cache(si->type, first, last);

        /*
         * Make sure that try_to_unuse() observes si->inuse_pages reaching 0
         * only after the above cleanups are done.
         */
        smp_wmb();
        atomic_long_add(nr_entries, &nr_swap_pages);
        swap_usage_sub(si, nr_entries);
}

/*
 * Try to take a live reference on si->users. Returns true if the reference
 * was obtained (the caller must later drop it), false otherwise.
 */
static bool get_swap_device_info(struct swap_info_struct *si)
{
        bool live = percpu_ref_tryget_live(&si->users);

        /*
         * Guarantee the si->users are checked before accessing other
         * fields of swap_info_struct, and si->flags (SWP_WRITEOK) is
         * up to date.
         *
         * Paired with the spin_unlock() after setup_swap_info() in
         * enable_swap_info(), and smp_wmb() in swapoff.
         */
        if (live)
                smp_rmb();

        return live;
}

/*
 * Fast path: try to get swap entries of the specified order from the
 * current CPU's swap entry pool (a cluster).
 *
 * On success *entry is filled in and true is returned; otherwise *entry is
 * left untouched and false is returned.
 */
static bool swap_alloc_fast(swp_entry_t *entry,
                            int order)
{
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        unsigned int cached_off, slot = SWAP_ENTRY_INVALID;

        /*
         * Once allocated, swap_info_struct will never be completely freed,
         * so checking it's liveness by get_swap_device_info is enough.
         */
        si = this_cpu_read(percpu_swap_cluster.si[order]);
        cached_off = this_cpu_read(percpu_swap_cluster.offset[order]);
        if (!si || !cached_off || !get_swap_device_info(si))
                return false;

        ci = lock_cluster(si, cached_off);
        if (!cluster_is_usable(ci, order)) {
                unlock_cluster(ci);
                goto out_put;
        }

        /* An empty cluster is scanned from its first slot. */
        if (cluster_is_empty(ci))
                cached_off = cluster_offset(si, ci);

        /*
         * NOTE(review): no unlock_cluster() follows on this path, so
         * alloc_swap_scan_cluster() presumably drops the cluster lock
         * itself - confirm against its definition.
         */
        slot = alloc_swap_scan_cluster(si, ci, cached_off, order, SWAP_HAS_CACHE);
        if (slot)
                *entry = swp_entry(si->type, slot);

out_put:
        put_swap_device(si);
        return slot != 0;
}

/*
 * Slow path: rotate the device and switch to a new cluster.
 *
 * Walks the available-device plist of the current NUMA node and asks each
 * device in turn for an entry of the given order. Returns true with *entry
 * filled in on success. Returns false when the list is exhausted, or as soon
 * as one live device fails to satisfy a non-zero @order request.
 */
static bool swap_alloc_slow(swp_entry_t *entry,
                            int order)
{
        int node;
        unsigned long offset;
        struct swap_info_struct *si, *next;

        node = numa_node_id();
        spin_lock(&swap_avail_lock);
start_over:
        plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
                /* Rotate the device and switch to a new cluster */
                plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
                /* The allocation below runs without swap_avail_lock held. */
                spin_unlock(&swap_avail_lock);
                if (get_swap_device_info(si)) {
                        offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
                        put_swap_device(si);
                        if (offset) {
                                *entry = swp_entry(si->type, offset);
                                return true;
                        }
                        /* Large (order > 0) requests do not try further devices. */
                        if (order)
                                return false;
                }

                spin_lock(&swap_avail_lock);
                /*
                 * if we got here, it's likely that si was almost full before,
                 * and since scan_swap_map_slots() can drop the si->lock,
                 * multiple callers probably all tried to get a page from the
                 * same si and it filled up before we could get one; or, the si
                 * filled up between us dropping swap_avail_lock and taking
                 * si->lock. Since we dropped the swap_avail_lock, the
                 * swap_avail_head list may have been modified; so if next is
                 * still in the swap_avail_head list then try it, otherwise
                 * start over if we have not gotten any slots.
                 *
                 * NOTE(review): this function calls cluster_alloc_swap_entry()
                 * and never takes si->lock itself; the references to
                 * scan_swap_map_slots() and si->lock above look historical -
                 * confirm and refresh the wording.
                 */
                if (plist_node_empty(&next->avail_lists[node]))
                        goto start_over;
        }
        spin_unlock(&swap_avail_lock);
        return false;
}

/**
 * folio_alloc_swap - allocate swap space for a folio
 * @folio: folio we want to move to swap
 * @gfp: gfp mask for shadow nodes
 *
 * Allocate swap space for the folio and add the folio to the
 * swap cache.
 *
 * Context: Caller needs to hold the folio lock.
 * Return: 0 if the folio was added to the swap cache; -EINVAL if a large
 * folio is passed while huge page swap is unavailable or the folio exceeds
 * SWAPFILE_CLUSTER; -ENOMEM if no swap entry could be allocated, the memcg
 * swap charge failed, or adding to the swap cache failed.
 */
int folio_alloc_swap(struct folio *folio, gfp_t gfp)
{
        const unsigned int order = folio_order(folio);
        const unsigned int nr_pages = 1 << order;
        swp_entry_t entry = {};

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);

        /*
         * Should not even be attempting large allocations when huge
         * page swap is disabled. Warn and fail the allocation.
         */
        if (order) {
                if (!IS_ENABLED(CONFIG_THP_SWAP) || nr_pages > SWAPFILE_CLUSTER) {
                        VM_WARN_ON_ONCE(1);
                        return -EINVAL;
                }
        }

        /* Try the per-CPU cluster first, then fall back to device rotation. */
        local_lock(&percpu_swap_cluster.lock);
        if (!swap_alloc_fast(&entry, order))
                swap_alloc_slow(&entry, order);
        local_unlock(&percpu_swap_cluster.lock);

        /* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
        if (mem_cgroup_try_charge_swap(folio, entry))
                goto out_free;

        /* Neither path produced an entry. */
        if (!entry.val)
                return -ENOMEM;

        /*
         * XArray node allocations from PF_MEMALLOC contexts could
         * completely exhaust the page allocator. __GFP_NOMEMALLOC
         * stops emergency reserves from being allocated.
         *
         * TODO: this could cause a theoretical memory reclaim
         * deadlock in the swap out path.
         */
        if (add_to_swap_cache(folio, entry, gfp | __GFP_NOMEMALLOC, NULL))
                goto out_free;

        atomic_long_sub(nr_pages, &nr_swap_pages);
        return 0;

out_free:
        /* put_swap_folio() ignores a zero entry via _swap_info_get(). */
        put_swap_folio(folio, entry);
        return -ENOMEM;
}

/*
 * Look up the swap_info_struct for @entry and sanity-check the entry.
 *
 * Returns the device when the entry is non-zero, the device exists and has
 * SWP_USED set, the offset is below si->max and the swap_map byte is
 * non-zero. Otherwise returns NULL; every failure except a zero entry is
 * reported with pr_err(). No reference is taken on the device.
 */
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
        struct swap_info_struct *si;
        const char *why;

        /* A zero entry is silently rejected. */
        if (!entry.val)
                return NULL;

        si = swp_swap_info(entry);
        if (!si)
                why = Bad_file;
        else if (data_race(!(si->flags & SWP_USED)))
                why = Unused_file;
        else if (swp_offset(entry) >= si->max)
                why = Bad_offset;
        else if (data_race(!si->swap_map[swp_offset(entry)]))
                why = Unused_offset;
        else
                return si;

        pr_err("%s: %s%08lx\n", __func__, why, entry.val);
        return NULL;
}

/*
 * Drop one reference of kind @usage from the swap_map byte at @offset.
 *
 * @usage == SWAP_HAS_CACHE clears the cache flag; any other value releases
 * one map count (or the whole SWAP_MAP_SHMEM marker). Returns the resulting
 * count-plus-cache value. When that value is zero the map byte is stored as
 * SWAP_HAS_CACHE rather than 0, which is the state swap_entry_range_free()
 * asserts on before it clears the slot.
 */
static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
                                              unsigned long offset,
                                              unsigned char usage)
{
        const unsigned char raw = si->swap_map[offset];
        unsigned char cache_bit = raw & SWAP_HAS_CACHE;
        unsigned char refs = raw & ~SWAP_HAS_CACHE;
        unsigned char remaining;

        if (usage == SWAP_HAS_CACHE) {
                VM_BUG_ON(!cache_bit);
                cache_bit = 0;
        } else if (refs == SWAP_MAP_SHMEM) {
                /*
                 * Or we could insist on shmem.c using a special
                 * swap_shmem_free() and free_shmem_swap_and_cache()...
                 */
                refs = 0;
        } else if ((refs & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
                if (refs != COUNT_CONTINUED)
                        refs--;
                else if (swap_count_continued(si, offset, refs))
                        refs = SWAP_MAP_MAX | COUNT_CONTINUED;
                else
                        refs = SWAP_MAP_MAX;
        }

        remaining = refs | cache_bit;
        /* Never publish 0 here: a fully released slot stays SWAP_HAS_CACHE. */
        WRITE_ONCE(si->swap_map[offset], remaining ? remaining : SWAP_HAS_CACHE);

        return remaining;
}

/*
 * When we get a swap entry, if there aren't some other ways to
 * prevent swapoff, such as the folio in swap cache is locked, RCU
 * reader side is locked, etc., the swap entry may become invalid
 * because of swapoff.  Then, we need to enclose all swap related
 * functions with get_swap_device() and put_swap_device(), unless the
 * swap functions call get/put_swap_device() by themselves.
 *
 * RCU reader side lock (including any spinlock) is sufficient to
 * prevent swapoff, because synchronize_rcu() is called in swapoff()
 * before freeing data structures.
 *
 * Check whether the swap entry is valid in the swap device.  If so,
 * return a pointer to the swap_info_struct and keep the swap entry valid
 * by preventing the swap device from being swapped off until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * percpu_ref_tryget_live() in get_swap_device() or after the
 * percpu_ref_put() in put_swap_device() if there isn't any other way
 * to prevent swapoff.  The caller must be prepared for that.  For
 * example, the following situation is possible.
 *
 *   CPU1                                CPU2
 *   do_swap_page()
 *     ...                                swapoff+swapon
 *     __read_swap_cache_async()
 *       swapcache_prepare()
 *         __swap_duplicate()
 *           // check swap_map
 *     // verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map need to be checked before
 * changing partly because the specified swap entry may be for another
 * swap device which has been swapoff.  And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified not
 * changed with the page table locked to check whether the swap device
 * has been swapoff or swapoff+swapon.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        struct swap_info_struct *si;

        /* A zero entry is rejected without a message. */
        if (!entry.val)
                return NULL;

        si = swp_swap_info(entry);
        if (!si) {
                pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
                return NULL;
        }

        /* Could not take a live reference on the device. */
        if (!get_swap_device_info(si))
                return NULL;

        if (swp_offset(entry) >= si->max) {
                pr_err("%s: %s%08lx\n", __func__, Bad_offset, entry.val);
                /* Drop the reference taken just above before bailing out. */
                percpu_ref_put(&si->users);
                return NULL;
        }

        return si;
}

/*
 * Release one map reference on @entry under the cluster lock and free the
 * slot if nothing is left. Returns the value reported by
 * __swap_entry_free_locked().
 */
static unsigned char __swap_entry_free(struct swap_info_struct *si,
                                       swp_entry_t entry)
{
        const unsigned long offset = swp_offset(entry);
        struct swap_cluster_info *ci = lock_cluster(si, offset);
        unsigned char remaining;

        remaining = __swap_entry_free_locked(si, offset, 1);
        if (remaining == 0)
                swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
        unlock_cluster(ci);

        return remaining;
}

/*
 * Release one map reference on each of the @nr entries starting at @entry.
 *
 * A batched path handles the case where swap_is_last_map() accepts the whole
 * range inside one cluster; everything else is released entry by entry.
 * Returns true if any entry ended up with only SWAP_HAS_CACHE left.
 */
static bool __swap_entries_free(struct swap_info_struct *si,
                swp_entry_t entry, int nr)
{
        const unsigned long start = swp_offset(entry);
        const unsigned int type = swp_type(entry);
        struct swap_cluster_info *ci;
        bool cache_left = false;
        int i;

        /* Batching only pays off for more than one entry. */
        if (nr <= 1)
                goto one_by_one;
        if (swap_count(data_race(si->swap_map[start])) != 1)
                goto one_by_one;
        /* cross into another cluster */
        if (nr > SWAPFILE_CLUSTER - start % SWAPFILE_CLUSTER)
                goto one_by_one;

        ci = lock_cluster(si, start);
        if (!swap_is_last_map(si, start, nr, &cache_left)) {
                unlock_cluster(ci);
                goto one_by_one;
        }

        /* Whole range qualifies: leave only the cache marker in every slot. */
        for (i = 0; i < nr; i++)
                WRITE_ONCE(si->swap_map[start + i], SWAP_HAS_CACHE);
        if (!cache_left)
                swap_entry_range_free(si, ci, entry, nr);
        unlock_cluster(ci);

        return cache_left;

one_by_one:
        for (i = 0; i < nr; i++) {
                /* An already-zero map byte here is unexpected. */
                if (!data_race(si->swap_map[start + i])) {
                        WARN_ON_ONCE(1);
                        continue;
                }
                if (__swap_entry_free(si, swp_entry(type, start + i)) ==
                    SWAP_HAS_CACHE)
                        cache_left = true;
        }
        return cache_left;
}

/*
 * Drop the last HAS_CACHE flag of swap entries; the caller has to
 * ensure all entries belong to the same cgroup.
 *
 * Every slot in the range must currently hold exactly SWAP_HAS_CACHE and the
 * range must stay inside cluster @ci.
 */
static void swap_entry_range_free(struct swap_info_struct *si,
                                  struct swap_cluster_info *ci,
                                  swp_entry_t entry, unsigned int nr_pages)
{
        const unsigned long offset = swp_offset(entry);
        unsigned char *slots = si->swap_map + offset;
        unsigned int idx = 0;

        /* It should never free entries across different clusters */
        VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1));
        VM_BUG_ON(cluster_is_empty(ci));
        VM_BUG_ON(ci->count < nr_pages);

        ci->count -= nr_pages;

        /* Runs at least once, matching the do/while contract. */
        do {
                VM_BUG_ON(slots[idx] != SWAP_HAS_CACHE);
                slots[idx] = 0;
        } while (++idx < nr_pages);

        mem_cgroup_uncharge_swap(entry, nr_pages);
        swap_range_free(si, offset, nr_pages);

        /* Requeue the cluster depending on whether anything is still used. */
        if (ci->count)
                partial_free_cluster(si, ci);
        else
                free_cluster(si, ci);
}

/*
 * Release one @usage reference on each of @nr_pages consecutive slots
 * starting at @offset, holding the cluster lock of @offset for the whole
 * range. Slots that drop to zero are freed individually.
 */
static void cluster_swap_free_nr(struct swap_info_struct *si,
                unsigned long offset, int nr_pages,
                unsigned char usage)
{
        const unsigned long end = offset + nr_pages;
        struct swap_cluster_info *ci = lock_cluster(si, offset);
        unsigned long pos = offset;

        do {
                unsigned char left = __swap_entry_free_locked(si, pos, usage);

                if (!left)
                        swap_entry_range_free(si, ci, swp_entry(si->type, pos), 1);
        } while (++pos < end);

        unlock_cluster(ci);
}

/*
 * Release one map reference on @nr_pages consecutive swap entries starting
 * at @entry.
 *
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free_nr(swp_entry_t entry, int nr_pages)
{
        unsigned long offset = swp_offset(entry);
        struct swap_info_struct *sis = _swap_info_get(entry);
        int nr;

        if (!sis)
                return;

        /* Process the range one cluster-bounded chunk at a time. */
        for (; nr_pages; nr_pages -= nr, offset += nr) {
                nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
                cluster_swap_free_nr(sis, offset, nr, 1);
        }
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 *
 * Clears the SWAP_HAS_CACHE reference on every slot backing @folio and
 * frees the slots that end up unused.
 */
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
        const unsigned long offset = swp_offset(entry);
        const int size = 1 << swap_entry_order(folio_order(folio));
        struct swap_cluster_info *ci;
        struct swap_info_struct *si;
        int i;

        si = _swap_info_get(entry);
        if (!si)
                return;

        ci = lock_cluster(si, offset);

        /* Whole range held only by the cache: free it in one go. */
        if (swap_only_has_cache(si, offset, size)) {
                swap_entry_range_free(si, ci, entry, size);
                goto unlock;
        }

        /* Otherwise drop the cache reference slot by slot. */
        for (i = 0; i < size; i++, entry.val++) {
                if (__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE))
                        continue;
                swap_entry_range_free(si, ci, entry, 1);
        }

unlock:
        unlock_cluster(ci);
}

/*
 * Return swap_count() of the swap_map byte for @entry. The map is read
 * without taking the cluster lock and the device pointer is not checked.
 */
int __swap_count(swp_entry_t entry)
{
        struct swap_info_struct *si = swp_swap_info(entry);

        return swap_count(si->swap_map[swp_offset(entry)]);
}

/*
 * Is @entry still referenced by a swap count?
 *
 * Reads the swap_map byte under the cluster lock and returns true when
 * swap_count() of it is non-zero. Only zero versus non-zero is reported, so
 * a continued count (COUNT_CONTINUED) also yields true.
 */
bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
        const pgoff_t offset = swp_offset(entry);
        struct swap_cluster_info *ci;
        bool swapped;

        ci = lock_cluster(si, offset);
        swapped = swap_count(si->swap_map[offset]) != 0;
        unlock_cluster(ci);

        return swapped;
}

/*
 * How many references to @entry are currently swapped out?
 * This considers COUNT_CONTINUED so it returns exact answer.
 *
 * Returns 0 when _swap_info_get() rejects the entry. The swap_map and its
 * continuation pages are read with the cluster lock held.
 */
int swp_swapcount(swp_entry_t entry)
{
        int count, tmp_count, n;
        struct swap_info_struct *si;
        struct swap_cluster_info *ci;
        struct page *page;
        pgoff_t offset;
        unsigned char *map;

        si = _swap_info_get(entry);
        if (!si)
                return 0;

        offset = swp_offset(entry);

        ci = lock_cluster(si, offset);

        count = swap_count(si->swap_map[offset]);
        /* No continuation: the in-map count is already exact. */
        if (!(count & COUNT_CONTINUED))
                goto out;

        count &= ~COUNT_CONTINUED;
        /* Weight of the first continuation byte relative to the map byte. */
        n = SWAP_MAP_MAX + 1;

        page = vmalloc_to_page(si->swap_map + offset);
        /* Keep only the in-page part of offset; it indexes each page below. */
        offset &= ~PAGE_MASK;
        VM_BUG_ON(page_private(page) != SWP_CONTINUED);

        /*
         * Walk the continuation pages chained through page->lru, adding
         * each byte scaled by its positional weight, until a byte without
         * COUNT_CONTINUED is reached.
         */
        do {
                page = list_next_entry(page, lru);
                map = kmap_local_page(page);
                tmp_count = map[offset];
                kunmap_local(map);

                count += (tmp_count & ~COUNT_CONTINUED) * n;
                n *= (SWAP_CONT_MAX + 1);
        } while (tmp_count & COUNT_CONTINUED);
out:
        unlock_cluster(ci);
        return count;
}

/*
 * Report whether any slot of the 2^@order aligned range containing @entry
 * has a non-zero swap_count(). For @order 0 only the slot of @entry itself
 * is examined. The check runs under the cluster lock.
 */
static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
                                         swp_entry_t entry, int order)
{
        unsigned char *map = si->swap_map;
        const unsigned int nr_pages = 1 << order;
        const unsigned long roffset = swp_offset(entry);
        const unsigned long first = round_down(roffset, nr_pages);
        struct swap_cluster_info *ci;
        bool swapped = false;
        int i;

        ci = lock_cluster(si, first);
        if (nr_pages == 1) {
                swapped = swap_count(map[roffset]) != 0;
        } else {
                /* Stop at the first slot that still carries a count. */
                for (i = 0; i < nr_pages && !swapped; i++)
                        swapped = swap_count(map[first + i]) != 0;
        }
        unlock_cluster(ci);

        return swapped;
}

/*
 * Does any swap slot backing @folio still carry a swap count?
 * Returns false when folio->swap does not pass _swap_info_get().
 */
static bool folio_swapped(struct folio *folio)
{
        const swp_entry_t entry = folio->swap;
        struct swap_info_struct *si;

        si = _swap_info_get(entry);
        if (!si)
                return false;

        /* Only large folios with THP swap enabled need the range check. */
        if (IS_ENABLED(CONFIG_THP_SWAP) && unlikely(folio_test_large(folio)))
                return swap_page_trans_huge_swapped(si, entry,
                                                    folio_order(folio));

        return swap_entry_swapped(si, entry);
}

/*
 * Can the swap cache of this locked @folio be dropped right now?
 * False if the folio is not in the swap cache, is under writeback, or
 * storage is suspended for hibernation.
 */
static bool folio_swapcache_freeable(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (!folio_test_swapcache(folio) || folio_test_writeback(folio))
                return false;

        /*
         * Once hibernation has begun to create its image of memory,
         * there's a danger that one of the calls to folio_free_swap()
         * - most probably a call from __try_to_reclaim_swap() while
         * hibernation is allocating its own swap pages for the image,
         * but conceivably even a call from memory reclaim - will free
         * the swap from a folio which has already been recorded in the
         * image as a clean swapcache folio, and then reuse its swap for
         * another page of the image.  On waking from hibernation, the
         * original folio might be freed under memory pressure, then
         * later read back in from swap, now with the wrong data.
         *
         * Hibernation suspends storage while it is writing the image
         * to disk so check that here.
         */
        return !pm_suspended_storage();
}

/**
 * folio_free_swap() - Free the swap space used for this folio.
 * @folio: The folio to remove.
 *
 * If swap is getting full, or if there are no more mappings of this folio,
 * then call folio_free_swap to free its swap space.
 *
 * The folio is removed from the swap cache and marked dirty, but only when
 * it is freeable and none of its swap slots is still referenced.
 *
 * Return: true if we were able to release the swap space.
 */
bool folio_free_swap(struct folio *folio)
{
        /* Freeability is checked first; the swap-count check follows. */
        if (!folio_swapcache_freeable(folio) || folio_swapped(folio))
                return false;

        delete_from_swap_cache(folio);
        folio_set_dirty(folio);
        return true;
}

/**
 * free_swap_and_cache_nr() - Release reference on range of swap entries and
 *                            reclaim their cache if no more references remain.
 * @entry: First entry of range.
 * @nr: Number of entries in range.
 *
 * For each swap entry in the contiguous range, release a reference. If any swap
 * entries become free, try to reclaim their underlying folios, if present. The
 * offset range is defined by [entry.offset, entry.offset + nr).
 */
void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
        const unsigned long start_offset = swp_offset(entry);
        const unsigned long end_offset = start_offset + nr;
        struct swap_info_struct *si = get_swap_device(entry);
        unsigned long offset;
        int step;

        if (!si)
                return;

        if (WARN_ON(end_offset > si->max))
                goto out;

        /*
         * First free all entries in the range. Skip the reclaim pass below
         * if none of the entries was left with only its cache reference.
         */
        if (!__swap_entries_free(si, entry, nr))
                goto out;

        /*
         * Now go back over the range trying to reclaim the swap cache. This is
         * more efficient for large folios because we will only try to reclaim
         * the swap once per folio in the common case. If we do
         * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
         * latter will get a reference and lock the folio for every individual
         * page but will only succeed once the swap slot for every subpage is
         * zero.
         */
        for (offset = start_offset; offset < end_offset; offset += step) {
                step = 1;
                if (READ_ONCE(si->swap_map[offset]) != SWAP_HAS_CACHE)
                        continue;

                /*
                 * Folios are always naturally aligned in swap so
                 * advance forward to the next boundary. Zero means no
                 * folio was found for the swap entry, so advance by 1
                 * in this case. Negative value means folio was found
                 * but could not be reclaimed. Here we can still advance
                 * to the next boundary.
                 */
                step = __try_to_reclaim_swap(si, offset,
                                             TTRS_UNMAPPED | TTRS_FULL);
                if (step == 0)
                        step = 1;
                else if (step < 0)
                        step = -step;
                step = ALIGN(offset + 1, step) - offset;
        }

out:
        put_swap_device(si);
}

#ifdef CONFIG_HIBERNATION

/*
 * Allocate a single swap slot on the device with index @type.
 *
 * Returns the new entry, or an entry whose val is 0 if the type is unknown,
 * the device cannot be referenced, it lacks SWP_WRITEOK, or no slot could
 * be allocated.
 */
swp_entry_t get_swap_page_of_type(int type)
{
        struct swap_info_struct *si = swap_type_to_swap_info(type);
        swp_entry_t entry = {0};
        unsigned long offset;

        if (!si)
                return entry;

        /* This is called for allocating swap entry, not cache */
        if (!get_swap_device_info(si))
                return entry;

        if (si->flags & SWP_WRITEOK) {
                offset = cluster_alloc_swap_entry(si, 0, 1);
                if (offset) {
                        entry = swp_entry(si->type, offset);
                        atomic_long_dec(&nr_swap_pages);
                }
        }
        put_swap_device(si);

        return entry;
}

/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 *
 * Returns the matching swap type, -1 if @device is 0, or -ENODEV if no
 * SWP_WRITEOK swap area on @device has its first extent starting at @offset.
 */
int swap_type_of(dev_t device, sector_t offset)
{
        int found = -ENODEV;
        int type;

        if (!device)
                return -1;

        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];

                if (!(sis->flags & SWP_WRITEOK))
                        continue;
                if (device != sis->bdev->bd_dev)
                        continue;

                /* Same device: the header block must match as well. */
                if (first_se(sis)->start_block == offset) {
                        found = type;
                        break;
                }
        }
        spin_unlock(&swap_lock);

        return found;
}

/*
 * Find the lowest-numbered swap type that has SWP_WRITEOK set.
 *
 * On success the device number of that swap area is stored in *@device and
 * the type is returned; otherwise -ENODEV is returned and *@device is left
 * untouched.
 */
int find_first_swap(dev_t *device)
{
        int found = -ENODEV;
        int type;

        spin_lock(&swap_lock);
        for (type = 0; type < nr_swapfiles; type++) {
                struct swap_info_struct *sis = swap_info[type];

                if (sis->flags & SWP_WRITEOK) {
                        *device = sis->bdev->bd_dev;
                        found = type;
                        break;
                }
        }
        spin_unlock(&swap_lock);

        return found;
}

/*
 * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
 * corresponding to given index in swap_info (swap type).
 *
 * Returns 0 when the type is unknown or the device lacks SWP_WRITEOK.
 */
sector_t swapdev_block(int type, pgoff_t offset)
{
        struct swap_info_struct *si = swap_type_to_swap_info(type);
        struct swap_extent *se;

        if (!si)
                return 0;
        if (!(si->flags & SWP_WRITEOK))
                return 0;

        /* Translate the page offset through the extent that covers it. */
        se = offset_to_swap_extent(si, offset);
        return se->start_block + (offset - se->start_page);
}

/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 *
 * Returns 0 when @type is out of range or the device lacks SWP_WRITEOK.
 */
unsigned int count_swap_pages(int type, int free)
{
        struct swap_info_struct *sis;
        unsigned int n = 0;

        spin_lock(&swap_lock);
        if ((unsigned int)type >= nr_swapfiles)
                goto unlock;

        sis = swap_info[type];
        spin_lock(&sis->lock);
        if (sis->flags & SWP_WRITEOK) {
                n = sis->pages;
                /* Subtract the in-use pages when free space is requested. */
                if (free)
                        n -= swap_usage_in_pages(sis);
        }
        spin_unlock(&sis->lock);

unlock:
        spin_unlock(&swap_lock);
        return n;
}
#endif /* CONFIG_HIBERNATION */

/*
 * Compare @pte with @swp_pte after passing @pte through
 * pte_swp_clear_flags(). Returns the result of pte_same().
 */
static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
{
        pte_t cleared = pte_swp_clear_flags(pte);

        return pte_same(cleared, swp_pte);
}

/*
 * Replace the swap PTE for @entry at @addr with a mapping of @folio.
 *
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 *
 * Returns 1 when the PTE was switched to a present mapping of the folio,
 * 0 when the PTE no longer matched @entry or was replaced by a poison
 * marker entry, and -ENOMEM when ksm_might_need_to_copy() returned NULL.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long addr, swp_entry_t entry, struct folio *folio)
{
        struct page *page;
        struct folio *swapcache;
        spinlock_t *ptl;
        pte_t *pte, new_pte, old_pte;
        bool hwpoisoned = false;
        int ret = 1;

        /* Remember the swapcache folio; KSM may hand back a different one. */
        swapcache = folio;
        folio = ksm_might_need_to_copy(folio, vma, addr);
        if (unlikely(!folio))
                return -ENOMEM;
        else if (unlikely(folio == ERR_PTR(-EHWPOISON))) {
                hwpoisoned = true;
                folio = swapcache;
        }

        page = folio_file_page(folio, swp_offset(entry));
        if (PageHWPoison(page))
                hwpoisoned = true;

        /* Bail out if the PTE is gone or no longer holds this swap entry. */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
                                                swp_entry_to_pte(entry)))) {
                ret = 0;
                goto out;
        }

        old_pte = ptep_get(pte);

        /*
         * Poisoned or not-uptodate data must not be mapped: install a
         * marker entry in place of the swap entry instead.
         */
        if (unlikely(hwpoisoned || !folio_test_uptodate(folio))) {
                swp_entry_t swp_entry;

                dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
                if (hwpoisoned) {
                        swp_entry = make_hwpoison_entry(page);
                } else {
                        swp_entry = make_poisoned_swp_entry();
                }
                new_pte = swp_entry_to_pte(swp_entry);
                ret = 0;
                goto setpte;
        }

        /*
         * Some architectures may have to restore extra metadata to the page
         * when reading from swap. This metadata may be indexed by swap entry
         * so this must be called before swap_free().
         */
        arch_swap_restore(folio_swap(entry, folio), folio);

        dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
        inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
        /* Extra folio reference for the mapping installed below. */
        folio_get(folio);
        if (folio == swapcache) {
                rmap_t rmap_flags = RMAP_NONE;

                /*
                 * See do_swap_page(): writeback would be problematic.
                 * However, we do a folio_wait_writeback() just before this
                 * call and have the folio locked.
                 */
                VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
                if (pte_swp_exclusive(old_pte))
                        rmap_flags |= RMAP_EXCLUSIVE;
                /*
                 * We currently only expect small !anon folios, which are either
                 * fully exclusive or fully shared. If we ever get large folios
                 * here, we have to be careful.
                 */
                if (!folio_test_anon(folio)) {
                        VM_WARN_ON_ONCE(folio_test_large(folio));
                        VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
                        folio_add_new_anon_rmap(folio, vma, addr, rmap_flags);
                } else {
                        folio_add_anon_rmap_pte(folio, page, vma, addr, rmap_flags);
                }
        } else { /* ksm created a completely new copy */
                folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
                folio_add_lru_vma(folio, vma);
        }
        /* Build the present PTE, carrying over soft-dirty and uffd-wp bits. */
        new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
        if (pte_swp_soft_dirty(old_pte))
                new_pte = pte_mksoft_dirty(new_pte);
        if (pte_swp_uffd_wp(old_pte))
                new_pte = pte_mkuffd_wp(new_pte);
setpte:
        /* Install the new PTE, then drop this PTE's reference on the entry. */
        set_pte_at(vma->vm_mm, addr, pte, new_pte);
        swap_free(entry);
out:
        if (pte)
                pte_unmap_unlock(pte, ptl);
        /* A folio that differs from the swapcache one is unlocked and put here. */
        if (folio != swapcache) {
                folio_unlock(folio);
                folio_put(folio);
        }
        return ret;
}

/*
 * Walk the ptes under @pmd that cover [addr, end).  For every swap pte that
 * refers to swap area @type, obtain the backing folio (from the swap cache,
 * or by reading it in) and pass it to unuse_pte().
 *
 * Returns 0 once the range has been scanned, -ENOMEM when no folio could be
 * obtained for an entry whose swap_map count is still non-zero and not
 * SWAP_MAP_BAD, or the negative value returned by unuse_pte().
 */
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
                        unsigned int type)
{
        pte_t *pte = NULL;
        struct swap_info_struct *si;

        si = swap_info[type];
        do {
                struct folio *folio;
                unsigned long offset;
                unsigned char swp_count;
                swp_entry_t entry;
                int ret;
                pte_t ptent;

                /*
                 * pte is NULL on the first pass and after each pte_unmap()
                 * below, in which case it is (re)mapped for the current
                 * addr; otherwise simply step to the next entry.
                 */
                if (!pte++) {
                        pte = pte_offset_map(pmd, addr);
                        /* Nothing to map here: stop and report success. */
                        if (!pte)
                                break;
                }

                ptent = ptep_get_lockless(pte);

                if (!is_swap_pte(ptent))
                        continue;

                /* Only entries belonging to the area being removed matter. */
                entry = pte_to_swp_entry(ptent);
                if (swp_type(entry) != type)
                        continue;

                offset = swp_offset(entry);
                /*
                 * Drop the mapping before fetching the folio.
                 * NOTE(review): presumably because the lookup/readahead
                 * below may sleep - confirm.
                 */
                pte_unmap(pte);
                pte = NULL;

                folio = swap_cache_get_folio(entry, vma, addr);
                if (!folio) {
                        struct vm_fault vmf = {
                                .vma = vma,
                                .address = addr,
                                .real_address = addr,
                                .pmd = pmd,
                        };

                        folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
                                                &vmf);
                }
                if (!folio) {
                        /*
                         * No folio: skip the entry if its count has dropped
                         * to zero or it is marked bad, otherwise fail.
                         */
                        swp_count = READ_ONCE(si->swap_map[offset]);
                        if (swp_count == 0 || swp_count == SWAP_MAP_BAD)
                                continue;
                        return -ENOMEM;
                }

                /* unuse_pte() is called with the folio locked and clean of writeback. */
                folio_lock(folio);
                folio_wait_writeback(folio);
                ret = unuse_pte(vma, pmd, addr, entry, folio);
                if (ret < 0) {
                        folio_unlock(folio);
                        folio_put(folio);
                        return ret;
                }

                folio_free_swap(folio);
                folio_unlock(folio);
                folio_put(folio);
        } while (addr += PAGE_SIZE, addr != end);

        /* Release the mapping left over from the last scanned entry. */
        if (pte)
                pte_unmap(pte);
        return 0;
}

/*
 * Visit every pmd slot under @pud that covers [addr, end) and scan the
 * ptes below each one for swap entries of @type.
 *
 * Returns 0 when the whole range was scanned, or the first non-zero value
 * returned by unuse_pte_range().
 */
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        pmd_t *pmd = pmd_offset(pud, addr);
        unsigned long boundary;

        for (;;) {
                int err;

                /* Yield before each pte-level scan. */
                cond_resched();
                boundary = pmd_addr_end(addr, end);
                err = unuse_pte_range(vma, pmd, addr, boundary, type);
                if (err)
                        return err;
                if (boundary == end)
                        break;
                pmd++;
                addr = boundary;
        }
        return 0;
}

/*
 * Visit every pud slot under @p4d that covers [addr, end); slots for which
 * pud_none_or_clear_bad() is true are skipped, the rest are descended into.
 *
 * Returns 0 when the whole range was scanned, or the first non-zero value
 * returned by unuse_pmd_range().
 */
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        pud_t *pud = pud_offset(p4d, addr);
        unsigned long boundary;

        for (;;) {
                boundary = pud_addr_end(addr, end);
                if (!pud_none_or_clear_bad(pud)) {
                        int err = unuse_pmd_range(vma, pud, addr, boundary,
                                                  type);

                        if (err)
                                return err;
                }
                if (boundary == end)
                        break;
                pud++;
                addr = boundary;
        }
        return 0;
}

/*
 * Visit every p4d slot under @pgd that covers [addr, end); slots for which
 * p4d_none_or_clear_bad() is true are skipped, the rest are descended into.
 *
 * Returns 0 when the whole range was scanned, or the first non-zero value
 * returned by unuse_pud_range().
 */
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
                                unsigned int type)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        unsigned long boundary;

        for (;;) {
                boundary = p4d_addr_end(addr, end);
                if (!p4d_none_or_clear_bad(p4d)) {
                        int err = unuse_pud_range(vma, p4d, addr, boundary,
                                                  type);

                        if (err)
                                return err;
                }
                if (boundary == end)
                        break;
                p4d++;
                addr = boundary;
        }
        return 0;
}

/*
 * Scan the page tables of @vma, from vm_start to vm_end, for swap entries
 * of @type.  pgd slots for which pgd_none_or_clear_bad() is true are
 * skipped.
 *
 * Returns 0 when the whole vma was scanned, or the first non-zero value
 * returned by unuse_p4d_range().
 */
static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;
        unsigned long boundary;
        pgd_t *pgd = pgd_offset(vma->vm_mm, addr);

        for (;;) {
                boundary = pgd_addr_end(addr, end);
                if (!pgd_none_or_clear_bad(pgd)) {
                        int err = unuse_p4d_range(vma, pgd, addr, boundary,
                                                  type);

                        if (err)
                                return err;
                }
                if (boundary == end)
                        break;
                pgd++;
                addr = boundary;
        }
        return 0;
}

/*
 * Scan every vma of @mm that has an anon_vma and is not a hugetlb mapping
 * for swap entries of @type, holding the mmap read lock throughout.
 *
 * Returns 0 on success, or the first non-zero value from unuse_vma().
 */
static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
        struct vm_area_struct *vma;
        int err = 0;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_read_lock(mm);
        for_each_vma(vmi, vma) {
                bool scan = vma->anon_vma && !is_vm_hugetlb_page(vma);

                if (scan) {
                        err = unuse_vma(vma, type);
                        if (err)
                                break;
                }

                /* Reschedule point after every vma, scanned or not. */
                cond_resched();
        }
        mmap_read_unlock(mm);

        return err;
}

/*
 * Starting just after @prev, look through swap_map for the next entry that
 * is still in use (non-zero count that is not SWAP_MAP_BAD).
 *
 * Returns the offset of that entry, or 0 if the scan reaches the end of
 * the map without finding one.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                        unsigned int prev)
{
        unsigned int offset;

        /*
         * swap_lock is not taken: the map is only being inspected, not
         * modified, so a false hit does no harm, and sys_swapoff() has
         * already stopped new allocations from this area (while holding
         * swap_lock).
         */
        for (offset = prev + 1; offset < si->max; offset++) {
                unsigned char count = READ_ONCE(si->swap_map[offset]);

                if (count && swap_count(count) != SWAP_MAP_BAD)
                        break;
                /* Keep latency bounded on long runs of free entries. */
                if (!(offset % LATENCY_LIMIT))
                        cond_resched();
        }

        return offset == si->max ? 0 : offset;
}

/*
 * Try to empty swap area @type.  Each pass runs shmem_unuse(), then
 * unuse_mm() on every mm found on init_mm.mmlist, then folio_free_swap()
 * on folios still found in the swap cache for in-use entries.  Passes are
 * repeated until swap_usage_in_pages() reports zero.
 *
 * Returns 0 on success, -EINTR if a signal is pending while swap is still
 * in use, or the error returned by shmem_unuse() or unuse_mm().
 */
static int try_to_unuse(unsigned int type)
{
        struct mm_struct *prev_mm;
        struct mm_struct *mm;
        struct list_head *p;
        int retval = 0;
        struct swap_info_struct *si = swap_info[type];
        struct folio *folio;
        swp_entry_t entry;
        unsigned int i;

        /* Nothing in use: skip straight to the final barrier. */
        if (!swap_usage_in_pages(si))
                goto success;

retry:
        retval = shmem_unuse(type);
        if (retval)
                return retval;

        /*
         * prev_mm always holds a reference to the mm visited last, which
         * is only dropped after the next mm has been pinned.
         * NOTE(review): presumably this keeps the list position valid
         * while mmlist_lock is released - confirm.
         */
        prev_mm = &init_mm;
        mmget(prev_mm);

        spin_lock(&mmlist_lock);
        p = &init_mm.mmlist;
        while (swap_usage_in_pages(si) &&
               !signal_pending(current) &&
               (p = p->next) != &init_mm.mmlist) {

                mm = list_entry(p, struct mm_struct, mmlist);
                /* Skip an mm whose user count has already reached zero. */
                if (!mmget_not_zero(mm))
                        continue;
                spin_unlock(&mmlist_lock);
                mmput(prev_mm);
                prev_mm = mm;
                retval = unuse_mm(mm, type);
                if (retval) {
                        mmput(prev_mm);
                        return retval;
                }

                /*
                 * Make sure that we aren't completely killing
                 * interactive performance.
                 */
                cond_resched();
                spin_lock(&mmlist_lock);
        }
        spin_unlock(&mmlist_lock);

        mmput(prev_mm);

        /* Second phase: free swap held only by folios in the swap cache. */
        i = 0;
        while (swap_usage_in_pages(si) &&
               !signal_pending(current) &&
               (i = find_next_to_unuse(si, i)) != 0) {

                entry = swp_entry(type, i);
                folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
                if (IS_ERR(folio))
                        continue;

                /*
                 * It is conceivable that a racing task removed this folio from
                 * swap cache just before we acquired the page lock. The folio
                 * might even be back in swap cache on another swap area. But
                 * that is okay, folio_free_swap() only removes stale folios.
                 */
                folio_lock(folio);
                folio_wait_writeback(folio);
                folio_free_swap(folio);
                folio_unlock(folio);
                folio_put(folio);
        }

        /*
         * Let's check again to see if there are still swap entries in the map.
         * If yes, we need to retry the unuse logic again.
         * Under global memory pressure, swap entries can be reinserted back
         * into process space after the mmlist loop above passes over them.
         *
         * Limit the number of retries? No: when mmget_not_zero()
         * above fails, that mm is likely to be freeing swap from
         * exit_mmap(), which proceeds at its own independent pace;
         * and even shmem_writepage() could have been preempted after
         * folio_alloc_swap(), temporarily hiding that swap.  It's easy
         * and robust (though cpu-intensive) just to keep retrying.
         */
        if (swap_usage_in_pages(si)) {
                if (!signal_pending(current))
                        goto retry;
                return -EINTR;
        }

success:
        /*
         * Make sure that further cleanups after try_to_unuse() returns happen
         * after swap_range_free() reduces si->inuse_pages to 0.
         */
        smp_mb();
        return 0;
}

/*
 * After a successful try_to_unuse, if no swap is now in use, we know
 * we can empty the mmlist.  swap_lock must be held on entry and exit.
 * Note that mmlist_lock nests inside swap_lock, and an mm must be
 * added to the mmlist just after page_duplicate - before would be racy.
 */
static void drain_mmlist(void)
{
        struct list_head *p, *next;
        unsigned int type;

        for (type = 0; type < nr_swapfiles; type++)
                if (swap_usage_in_pages(swap_info[type]))
                        return;
        spin_lock(&mmlist_lock);
        list_for_each_safe(p, next, &init_mm.mmlist)
                list_del_init(p);
        spin_unlock(&mmlist_lock);
}

/*
 * Free all of a swapdev's extent information: every node of the extent
 * rbtree is erased and freed, and if the area was marked SWP_ACTIVATED the
 * flag is cleared and the mapping's ->swap_deactivate() hook, when present,
 * is invoked.
 */
static void destroy_swap_extents(struct swap_info_struct *sis)
{
        struct address_space *mapping;
        struct file *swap_file;

        /* Keep removing the root node until the tree is empty. */
        while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
                struct rb_node *node = sis->swap_extent_root.rb_node;

                rb_erase(node, &sis->swap_extent_root);
                kfree(rb_entry(node, struct swap_extent, rb_node));
        }

        if (!(sis->flags & SWP_ACTIVATED))
                return;

        swap_file = sis->swap_file;
        mapping = swap_file->f_mapping;

        sis->flags &= ~SWP_ACTIVATED;
        if (mapping->a_ops->swap_deactivate)
                mapping->a_ops->swap_deactivate(swap_file);
}

/*
 * Add a block range (and the corresponding page range) into this swapdev's
 * extent tree.
 *
 * Callers are expected to add ranges in ascending page order; the new range
 * must start exactly where the current last extent ends (BUG otherwise).
 *
 * Returns 0 if the range was merged into the last extent, 1 if a new extent
 * was inserted, or -ENOMEM if the new extent could not be allocated.
 */
int
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block)
{
        struct rb_node **slot = &sis->swap_extent_root.rb_node;
        struct rb_node *last = NULL;
        struct swap_extent *new;

        /*
         * Ascending call order means the new node always belongs at the
         * far right of the tree.
         */
        for (; *slot; slot = &last->rb_right)
                last = *slot;

        if (last) {
                struct swap_extent *tail;

                tail = rb_entry(last, struct swap_extent, rb_node);
                BUG_ON(tail->start_page + tail->nr_pages != start_page);
                /* Blocks are contiguous too: just grow the last extent. */
                if (tail->start_block + tail->nr_pages == start_block) {
                        tail->nr_pages += nr_pages;
                        return 0;
                }
        }

        /* Could not merge, so a fresh extent is needed. */
        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        new->start_page = start_page;
        new->nr_pages = nr_pages;
        new->start_block = start_block;

        rb_link_node(&new->rb_node, last, slot);
        rb_insert_color(&new->rb_node, &sis->swap_extent_root);
        return 1;
}
EXPORT_SYMBOL_GPL(add_swap_extent);

/*
 * A `swap extent' is a simple thing which maps a contiguous range of pages
 * onto a contiguous range of disk blocks.  A rbtree of swap extents is
 * built at swapon time and is then used at swap_writepage/swap_read_folio
 * time for locating where on disk a page belongs.
 *
 * If the swapfile is an S_ISBLK block device, a single extent is installed.
 * This is done so that the main operating code can treat S_ISBLK and S_ISREG
 * swap files identically.
 *
 * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
 * extent rbtree operates in PAGE_SIZE disk blocks.  Both S_ISREG and S_ISBLK
 * swapfiles are handled *identically* after swapon time.
 *
 * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
 * and will parse them into a rbtree, in PAGE_SIZE chunks.  If some stray
 * blocks are found which do not fall within the PAGE_SIZE alignment
 * requirements, they are simply tossed out - we will never use those blocks
 * for swapping.
 *
 * For all swap devices we set S_SWAPFILE across the life of the swapon.  This
 * prevents users from writing to the swap device, which will corrupt memory.
 *
 * The amount of disk space which a single swap extent represents varies.
 * Typically it is in the 1-4 megabyte range.  So we can have hundreds of
 * extents in the rbtree. - akpm.
 */
static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
{
        struct file *swap_file = sis->swap_file;
        struct address_space *mapping = swap_file->f_mapping;
        int err;

        /* Block device: one extent covering the whole area. */
        if (S_ISBLK(mapping->host->i_mode)) {
                err = add_swap_extent(sis, 0, sis->max, 0);
                *span = sis->pages;
                return err;
        }

        /* No ->swap_activate() hook: use generic_swapfile_activate(). */
        if (!mapping->a_ops->swap_activate)
                return generic_swapfile_activate(sis, swap_file, span);

        err = mapping->a_ops->swap_activate(sis, swap_file, span);
        if (err < 0)
                return err;

        sis->flags |= SWP_ACTIVATED;

        /* The sio pool is only initialised for SWP_FS_OPS areas. */
        if (!(sis->flags & SWP_FS_OPS))
                return err;
        if (sio_pool_init() == 0)
                return err;

        /* Pool setup failed: undo the activation. */
        destroy_swap_extents(sis);
        return -ENOMEM;
}

/*
 * Return the node_id of the disk behind @si.  si->bdev is used when set,
 * otherwise the block device of the swap file's superblock; NUMA_NO_NODE
 * is returned when neither yields a block device.
 */
static int swap_node(struct swap_info_struct *si)
{
        struct block_device *bdev = si->bdev;

        if (!bdev)
                bdev = si->swap_file->f_inode->i_sb->s_bdev;
        if (!bdev)
                return NUMA_NO_NODE;

        return bdev->bd_disk->node_id;
}

/*
 * Record the priority and the map pointers on @si.
 *
 * A non-negative @prio is used as given; a negative one requests an
 * automatically assigned priority, taken by decrementing least_priority.
 */
static void setup_swap_info(struct swap_info_struct *si, int prio,
                            unsigned char *swap_map,
                            struct swap_cluster_info *cluster_info,
                            unsigned long *zeromap)
{
        int nid;

        si->prio = (prio >= 0) ? prio : --least_priority;

        /*
         * plists sort low-to-high whereas swap priority is high-to-low,
         * hence the negation.
         */
        si->list.prio = -si->prio;

        for_each_node(nid) {
                /*
                 * With an auto-assigned (negative) priority, the node the
                 * device lives on gets plist prio 1 instead.
                 */
                if (si->prio < 0 && swap_node(si) == nid)
                        si->avail_lists[nid].prio = 1;
                else
                        si->avail_lists[nid].prio = -si->prio;
        }

        si->swap_map = swap_map;
        si->cluster_info = cluster_info;
        si->zeromap = zeromap;
}

/*
 * Add @si's pages to the global swap counters and put it on the active and
 * available lists.  swap_lock must be held by the caller (asserted below).
 */
static void _enable_swap_info(struct swap_info_struct *si)
{
        atomic_long_add(si->pages, &nr_swap_pages);
        total_swap_pages += si->pages;

        assert_spin_locked(&swap_lock);
        /*
         * both lists are plists, and thus priority ordered.
         * swap_active_head needs to be priority ordered for swapoff(),
         * which on removal of any swap_info_struct with an auto-assigned
         * (i.e. negative) priority increments the auto-assigned priority
         * of any lower-priority swap_info_structs.
         * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
         * which allocates swap pages from the highest available priority
         * swap_info_struct.
         */
        plist_add(&si->list, &swap_active_head);

        /* Add back to available list */
        add_to_avail_list(si, true);
}

/*
 * Bring a newly set up swap area into service: store the priority and map
 * pointers on @si, resurrect its percpu reference, then account its pages
 * and add it to the swap lists.
 *
 * swap_lock and si->lock are taken twice; the reference is resurrected
 * between the two locked sections, with neither lock held.
 */
static void enable_swap_info(struct swap_info_struct *si, int prio,
                                unsigned char *swap_map,
                                struct swap_cluster_info *cluster_info,
                                unsigned long *zeromap)
{
        spin_lock(&swap_lock);
        spin_lock(&si->lock);
        setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
        spin_unlock(&si->lock);
        spin_unlock(&swap_lock);
        /*
         * Finished initializing swap device, now it's safe to reference it.
         */
        percpu_ref_resurrect(&si->users);
        spin_lock(&swap_lock);
        spin_lock(&si->lock);
        _enable_swap_info(si);
        spin_unlock(&si->lock);
        spin_unlock(&swap_lock);
}

/*
 * Put @si back into service using the priority and map pointers it already
 * carries.  Unlike enable_swap_info(), everything happens in one locked
 * section and the percpu reference is not touched.
 *
 * Note that si->prio is passed through setup_swap_info(), so a negative
 * (auto-assigned) priority is replaced by a new value taken from
 * least_priority.
 */
static void reinsert_swap_info(struct swap_info_struct *si)
{
        spin_lock(&swap_lock);
        spin_lock(&si->lock);
        setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
        _enable_swap_info(si);
        spin_unlock(&si->lock);
        spin_unlock(&swap_lock);
}

/*
 * Called after clearing SWP_WRITEOK (BUG if it is still set).  Every
 * cluster lock of the area is taken and released once, which ensures
 * cluster_alloc_range sees the updated flags, so there will be no more
 * allocations.
 */
static void wait_for_allocation(struct swap_info_struct *si)
{
        unsigned long limit = ALIGN(si->max, SWAPFILE_CLUSTER);
        unsigned long pos = 0;
        struct swap_cluster_info *ci;

        BUG_ON(si->flags & SWP_WRITEOK);

        while (pos < limit) {
                ci = lock_cluster(si, pos);
                unlock_cluster(ci);
                pos += SWAPFILE_CLUSTER;
        }
}

/*
 * Called after swap device's reference count is dead, so
 * neither scan nor allocation will use it.
 *
 * Clears every per-cpu swap cluster cache slot that still points at @si.
 */
static void flush_percpu_swap_cluster(struct swap_info_struct *si)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct swap_info_struct **slots;
                int order = 0;

                slots = per_cpu_ptr(percpu_swap_cluster.si, cpu);
                /*
                 * si->users is dead, so no new user can start pointing at
                 * this device; only existing cached pointers need to be
                 * invalidated.
                 */
                while (order < SWAP_NR_ORDERS) {
                        cmpxchg(&slots[order], si, NULL);
                        order++;
                }
        }
}


/*
 * swapoff: stop using the swap area backed by @specialfile.
 *
 * Looks up the active, SWP_WRITEOK swap_info_struct whose file mapping
 * matches the opened path, removes it from the available and active lists,
 * empties it with try_to_unuse(), then tears down its per-area state and
 * clears its flags so the slot can be reused by swapon.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, the error from
 * getname() or file_open_name(), -EINVAL if no matching area is active,
 * -ENOMEM if security_vm_enough_memory_mm() rejects the request, or the
 * error from try_to_unuse() (in which case the area is re-inserted).
 */
SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
        struct swap_info_struct *p = NULL;
        unsigned char *swap_map;
        unsigned long *zeromap;
        struct swap_cluster_info *cluster_info;
        struct file *swap_file, *victim;
        struct address_space *mapping;
        struct inode *inode;
        struct filename *pathname;
        int err, found = 0;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        BUG_ON(!current->mm);

        pathname = getname(specialfile);
        if (IS_ERR(pathname))
                return PTR_ERR(pathname);

        victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
        err = PTR_ERR(victim);
        if (IS_ERR(victim))
                goto out;

        /* Areas are matched by address_space, not by file pointer. */
        mapping = victim->f_mapping;
        spin_lock(&swap_lock);
        plist_for_each_entry(p, &swap_active_head, list) {
                if (p->flags & SWP_WRITEOK) {
                        if (p->swap_file->f_mapping == mapping) {
                                found = 1;
                                break;
                        }
                }
        }
        if (!found) {
                err = -EINVAL;
                spin_unlock(&swap_lock);
                goto out_dput;
        }
        /*
         * A zero return means the check passed; the vm_unacct_memory()
         * call follows immediately on that path.
         * NOTE(review): presumably it undoes a charge made by the check
         * itself - confirm.
         */
        if (!security_vm_enough_memory_mm(current->mm, p->pages))
                vm_unacct_memory(p->pages);
        else {
                err = -ENOMEM;
                spin_unlock(&swap_lock);
                goto out_dput;
        }
        spin_lock(&p->lock);
        del_from_avail_list(p, true);
        if (p->prio < 0) {
                struct swap_info_struct *si = p;
                int nid;

                /*
                 * Removing an auto-prioritised area: raise the priority of
                 * every area after it on the active list by one, leaving
                 * per-node plist prios that are exactly 1 untouched.
                 */
                plist_for_each_entry_continue(si, &swap_active_head, list) {
                        si->prio++;
                        si->list.prio--;
                        for_each_node(nid) {
                                if (si->avail_lists[nid].prio != 1)
                                        si->avail_lists[nid].prio--;
                        }
                }
                least_priority++;
        }
        plist_del(&p->list, &swap_active_head);
        atomic_long_sub(p->pages, &nr_swap_pages);
        total_swap_pages -= p->pages;
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);

        wait_for_allocation(p);

        /* The oom-origin mark is held only for the duration of try_to_unuse(). */
        set_current_oom_origin();
        err = try_to_unuse(p->type);
        clear_current_oom_origin();

        if (err) {
                /* re-insert swap space back into swap_list */
                reinsert_swap_info(p);
                goto out_dput;
        }

        /*
         * Wait for swap operations protected by get/put_swap_device()
         * to complete.  Because of synchronize_rcu() here, all swap
         * operations protected by RCU reader side lock (including any
         * spinlock) will be waited too.  This makes it easy to
         * prevent folio_test_swapcache() and the following swap cache
         * operations from racing with swapoff.
         */
        percpu_ref_kill(&p->users);
        synchronize_rcu();
        wait_for_completion(&p->comp);

        flush_work(&p->discard_work);
        flush_work(&p->reclaim_work);
        flush_percpu_swap_cluster(p);

        destroy_swap_extents(p);
        if (p->flags & SWP_CONTINUED)
                free_swap_count_continuations(p);

        if (!p->bdev || !bdev_nonrot(p->bdev))
                atomic_dec(&nr_rotate_swap);

        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
        spin_lock(&p->lock);
        drain_mmlist();

        /*
         * Detach the file and the maps from the struct while locked; they
         * are released further down, after the spinlocks are dropped.
         */
        swap_file = p->swap_file;
        p->swap_file = NULL;
        p->max = 0;
        swap_map = p->swap_map;
        p->swap_map = NULL;
        zeromap = p->zeromap;
        p->zeromap = NULL;
        cluster_info = p->cluster_info;
        p->cluster_info = NULL;
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
        arch_swap_invalidate_area(p->type);
        zswap_swapoff(p->type);
        mutex_unlock(&swapon_mutex);
        kfree(p->global_cluster);
        p->global_cluster = NULL;
        vfree(swap_map);
        kvfree(zeromap);
        kvfree(cluster_info);
        /* Destroy swap account information */
        swap_cgroup_swapoff(p->type);
        exit_swap_address_space(p->type);

        inode = mapping->host;

        /* The inode no longer backs a swap area. */
        inode_lock(inode);
        inode->i_flags &= ~S_SWAPFILE;
        inode_unlock(inode);
        filp_close(swap_file, NULL);

        /*
         * Clear the SWP_USED flag after all resources are freed so that swapon
         * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
         * not hold p->lock after we cleared its SWP_WRITEOK.
         */
        spin_lock(&swap_lock);
        p->flags = 0;
        spin_unlock(&swap_lock);

        err = 0;
        /* Bump the poll event and wake pollers waiting on proc_poll_wait. */
        atomic_inc(&proc_poll_event);
        wake_up_interruptible(&proc_poll_wait);

out_dput:
        filp_close(victim, NULL);
out:
        putname(pathname);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 * poll handler for the swaps proc file.  The file is always reported as
 * readable; when the global proc_poll_event counter differs from the value
 * cached in the seq_file, the cache is refreshed and EPOLLERR | EPOLLPRI
 * are reported as well.
 */
static __poll_t swaps_poll(struct file *file, poll_table *wait)
{
        struct seq_file *seq = file->private_data;

        poll_wait(file, &proc_poll_wait, wait);

        if (seq->poll_event == atomic_read(&proc_poll_event))
                return EPOLLIN | EPOLLRDNORM;

        seq->poll_event = atomic_read(&proc_poll_event);
        return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
}

/*
 * seq_file iterator: ->start.
 *
 * Takes swapon_mutex, which stays held on return (swap_stop() drops it).
 * Position 0 yields SEQ_START_TOKEN; position N > 0 yields the N-th swap
 * area that is SWP_USED and has a swap_map, or NULL if there is none.
 */
static void *swap_start(struct seq_file *swap, loff_t *pos)
{
        struct swap_info_struct *si;
        loff_t remaining = *pos;
        int type;

        mutex_lock(&swapon_mutex);

        if (remaining == 0)
                return SEQ_START_TOKEN;

        for (type = 0; (si = swap_type_to_swap_info(type)) != NULL; type++) {
                if (!(si->flags & SWP_USED))
                        continue;
                if (!si->swap_map)
                        continue;
                if (--remaining == 0)
                        return si;
        }

        return NULL;
}

/*
 * seq_file iterator: ->next.
 *
 * Advances *pos and returns the first swap area after @v (or from type 0
 * when @v is SEQ_START_TOKEN) that is SWP_USED and has a swap_map, or NULL
 * when there are no more.
 */
static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
{
        struct swap_info_struct *si = v;
        int type = 0;

        if (v != SEQ_START_TOKEN)
                type = si->type + 1;

        ++(*pos);

        while ((si = swap_type_to_swap_info(type)) != NULL) {
                if ((si->flags & SWP_USED) && si->swap_map)
                        return si;
                type++;
        }

        return NULL;
}

/* seq_file iterator: ->stop.  Drops swapon_mutex, taken by swap_start(). */
static void swap_stop(struct seq_file *swap, void *v)
{
        mutex_unlock(&swapon_mutex);
}

/*
 * seq_file iterator: ->show.
 *
 * Emits the column header for SEQ_START_TOKEN, otherwise one line for the
 * swap area @v: path, "partition" or "file", size, usage and priority.
 * Always returns 0.
 */
static int swap_show(struct seq_file *swap, void *v)
{
        struct swap_info_struct *si = v;
        unsigned long bytes, inuse;
        struct file *file;
        const char *kind;
        int len, pad;

        if (v == SEQ_START_TOKEN) {
                seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
                return 0;
        }

        bytes = K(si->pages);
        inuse = K(swap_usage_in_pages(si));

        file = si->swap_file;
        len = seq_file_path(swap, file, " \t\n\\");

        /* Pad the path out to 40 columns, with at least one space. */
        pad = len < 40 ? 40 - len : 1;
        kind = S_ISBLK(file_inode(file)->i_mode) ? "partition" : "file\t";

        /* Values shorter than eight digits get an extra tab for alignment. */
        seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
                        pad, " ",
                        kind,
                        bytes, bytes < 10000000 ? "\t" : "",
                        inuse, inuse < 10000000 ? "\t" : "",
                        si->prio);
        return 0;
}

/* seq_file callbacks that iterate over and print the swap areas. */
static const struct seq_operations swaps_op = {
        .start  = swap_start,
        .next   = swap_next,
        .stop   = swap_stop,
        .show   = swap_show,
};

/*
 * Open handler for the swaps proc file: sets up the seq_file and snapshots
 * the current proc_poll_event value for later comparison in swaps_poll().
 *
 * Returns 0 on success or the error from seq_open().
 */
static int swaps_open(struct inode *inode, struct file *file)
{
        int err = seq_open(file, &swaps_op);

        if (!err) {
                struct seq_file *seq = file->private_data;

                seq->poll_event = atomic_read(&proc_poll_event);
        }

        return err;
}

/* File operations for the swaps proc entry: seq_file based, with poll. */
static const struct proc_ops swaps_proc_ops = {
        .proc_flags     = PROC_ENTRY_PERMANENT,
        .proc_open      = swaps_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
        .proc_poll      = swaps_poll,
};

/*
 * Create the "swaps" proc entry (NULL parent) backed by swaps_proc_ops.
 * The result of proc_create() is not checked; this always returns 0.
 */
static int __init procswaps_init(void)
{
        proc_create("swaps", 0, NULL, &swaps_proc_ops);
        return 0;
}
__initcall(procswaps_init);
#endif /* CONFIG_PROC_FS */

#ifdef MAX_SWAPFILES_CHECK
/*
 * Run MAX_SWAPFILES_CHECK() once as a late initcall.  Only built when that
 * macro is defined.  Always returns 0.
 */
static int __init max_swapfiles_check(void)
{
        MAX_SWAPFILES_CHECK();
        return 0;
}
late_initcall(max_swapfiles_check);
#endif

/*
 * Obtain a swap_info_struct for a new swap area: either publish a freshly
 * allocated one in the next swap_info[] slot, or recycle an existing slot
 * whose SWP_USED flag is clear (the fresh allocation is then discarded).
 *
 * Returns the struct with flags set to SWP_USED, ERR_PTR(-ENOMEM) if the
 * allocation or percpu_ref_init() fails, or ERR_PTR(-EPERM) when no slot
 * below MAX_SWAPFILES is available.
 */
static struct swap_info_struct *alloc_swap_info(void)
{
        struct swap_info_struct *p;
        struct swap_info_struct *defer = NULL;
        unsigned int type;
        int i;

        /* Allocate up front, before swap_lock is taken. */
        p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
        if (!p)
                return ERR_PTR(-ENOMEM);

        if (percpu_ref_init(&p->users, swap_users_ref_free,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL)) {
                kvfree(p);
                return ERR_PTR(-ENOMEM);
        }

        spin_lock(&swap_lock);
        /* Look for the first slot that is not marked SWP_USED. */
        for (type = 0; type < nr_swapfiles; type++) {
                if (!(swap_info[type]->flags & SWP_USED))
                        break;
        }
        if (type >= MAX_SWAPFILES) {
                spin_unlock(&swap_lock);
                percpu_ref_exit(&p->users);
                kvfree(p);
                return ERR_PTR(-EPERM);
        }
        if (type >= nr_swapfiles) {
                p->type = type;
                /*
                 * Publish the swap_info_struct after initializing it.
                 * Note that kvzalloc() above zeroes all its fields.
                 */
                smp_store_release(&swap_info[type], p); /* rcu_assign_pointer() */
                nr_swapfiles++;
        } else {
                /* Reuse the old struct; free the new one after unlocking. */
                defer = p;
                p = swap_info[type];
                /*
                 * Do not memset this entry: a racing procfs swap_next()
                 * would be relying on p->type to remain valid.
                 */
        }
        p->swap_extent_root = RB_ROOT;
        plist_node_init(&p->list, 0);
        for_each_node(i)
                plist_node_init(&p->avail_lists[i], 0);
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
        if (defer) {
                percpu_ref_exit(&defer->users);
                kvfree(defer);
        }
        spin_lock_init(&p->lock);
        spin_lock_init(&p->cont_lock);
        atomic_long_set(&p->inuse_pages, SWAP_USAGE_OFFLIST_BIT);
        init_completion(&p->comp);

        return p;
}

/*
 * Record the block device behind the swap file on @si.
 *
 * For a block device inode, si->bdev is set from the inode and SWP_BLKDEV
 * is flagged; zoned devices are refused.  For a regular file, si->bdev is
 * the block device of the file's superblock.  Other inode types leave @si
 * untouched.
 *
 * Returns 0 on success or -EINVAL for a zoned block device.
 */
static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
{
        if (!S_ISBLK(inode->i_mode)) {
                if (S_ISREG(inode->i_mode))
                        si->bdev = inode->i_sb->s_bdev;
                return 0;
        }

        si->bdev = I_BDEV(inode);

        /*
         * Zones of a zoned block device may only be written sequentially,
         * which makes such devices unsuitable for swapping.
         */
        if (bdev_is_zoned(si->bdev))
                return -EINVAL;

        si->flags |= SWP_BLKDEV;
        return 0;
}


/*
 * Find out how many pages are allowed for a single swap device. There
 * are two limiting factors:
 * 1) the number of bits for the swap offset in the swp_entry_t type, and
 * 2) the number of bits in the swap pte, as defined by the different
 * architectures.
 *
 * In order to find the largest possible bit mask, a swap entry with
 * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
 * decoded to a swp_entry_t again, and finally the swap offset is
 * extracted.
 *
 * This will mask all the bits from the initial ~0UL mask that can't
 * be encoded in either the swp_entry_t or the architecture definition
 * of a swap pte.
 */
unsigned long generic_max_swapfile_size(void)
{
        return swp_offset(pte_to_swp_entry(
                        swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
}

/*
 * Can be overridden by an architecture for additional checks.
 * This weak default simply returns generic_max_swapfile_size().
 */
__weak unsigned long arch_max_swapfile_size(void)
{
        return generic_max_swapfile_size();
}

/*
 * Validate the on-disk swap header and work out how many pages to use.
 *
 * @si:          swap device being set up (not used by this function).
 * @swap_header: mapped first page of the swap area; byte-swapped in place
 *               when it was written with the opposite endianness.
 * @inode:       inode of the swap file or device, used for size checks.
 *
 * Returns the number of pages to use (header page included), or 0 when
 * the header is unusable.
 */
static unsigned long read_swap_header(struct swap_info_struct *si,
                                        union swap_header *swap_header,
                                        struct inode *inode)
{
        unsigned long limit = swapfile_maximum_size;
        unsigned long file_pages;
        unsigned long last;
        int idx;

        if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10) != 0) {
                pr_err("Unable to find swap-space signature\n");
                return 0;
        }

        /*
         * A version that only reads as 1 after byte swapping means the
         * header has the other endianness: swap every 32-bit field we use.
         */
        if (swab32(swap_header->info.version) == 1) {
                swab32s(&swap_header->info.version);
                swab32s(&swap_header->info.last_page);
                swab32s(&swap_header->info.nr_badpages);
                /* Bound the count before walking the bad page array. */
                if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
                        return 0;
                for (idx = 0; idx < swap_header->info.nr_badpages; idx++)
                        swab32s(&swap_header->info.badpages[idx]);
        }

        /* Only sub-version 1 of the header is understood. */
        if (swap_header->info.version != 1) {
                pr_warn("Unable to handle swap header version %d\n",
                        swap_header->info.version);
                return 0;
        }

        last = swap_header->info.last_page;
        if (last == 0) {
                pr_warn("Empty swap-file\n");
                return 0;
        }

        if (last > limit)
                pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
                        K(limit), K(last));

        if (limit > last) {
                limit = last + 1;
                /* The result is stored in an unsigned int: avoid wrapping to 0. */
                if ((unsigned int)limit == 0)
                        limit = UINT_MAX;
        }
        if (limit == 0)
                return 0;

        /* A non-zero file size must cover everything the header claims. */
        file_pages = i_size_read(inode) >> PAGE_SHIFT;
        if (file_pages != 0 && limit > file_pages) {
                pr_warn("Swap area shorter than signature indicates\n");
                return 0;
        }

        /*
         * Bad pages are rejected outright for regular files, and the list
         * may never exceed MAX_SWAP_BADPAGES entries.
         */
        if (swap_header->info.nr_badpages != 0 &&
            (S_ISREG(inode->i_mode) ||
             swap_header->info.nr_badpages > MAX_SWAP_BADPAGES))
                return 0;

        return limit;
}

/*
 * Mark the header page and all listed bad pages in @swap_map, publish the
 * size in @si and build the extent list.
 *
 * @si:          swap device being set up; ->max and ->pages are written.
 * @swap_header: validated swap header supplying the bad page list.
 * @swap_map:    per-page usage map with at least @maxpages entries.
 * @maxpages:    number of pages to use, header page included.
 * @span:        passed through to setup_swap_extents().
 *
 * Returns the value of setup_swap_extents() (negative on error), or
 * -EINVAL when a bad page number is out of range or no usable page is left.
 */
static int setup_swap_map_and_extents(struct swap_info_struct *si,
                                        union swap_header *swap_header,
                                        unsigned char *swap_map,
                                        unsigned long maxpages,
                                        sector_t *span)
{
        /* Page 0 holds the header and is never available for swapping. */
        unsigned int usable = maxpages - 1;
        unsigned long idx;
        int nr_extents;

        for (idx = 0; idx < swap_header->info.nr_badpages; idx++) {
                unsigned int bad = swap_header->info.badpages[idx];

                if (!bad || bad > swap_header->info.last_page)
                        return -EINVAL;
                /* Entries beyond the pages in use need no marking. */
                if (bad >= maxpages)
                        continue;
                swap_map[bad] = SWAP_MAP_BAD;
                usable--;
        }

        if (!usable)
                goto empty;

        swap_map[0] = SWAP_MAP_BAD;
        si->max = maxpages;
        si->pages = usable;

        nr_extents = setup_swap_extents(si, span);
        if (nr_extents < 0)
                return nr_extents;

        /* Re-check using the page count left in si after extent setup. */
        if (!si->pages)
                goto empty;

        return nr_extents;

empty:
        pr_warn("Empty swap-file\n");
        return -EINVAL;
}

/* Number of struct swap_cluster_info needed to cover L1_CACHE_BYTES, rounded up. */
#define SWAP_CLUSTER_INFO_COLS                                                \
        DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
/* Number of clusters needed to cover SWAP_ADDRESS_SPACE_PAGES pages, rounded up. */
#define SWAP_CLUSTER_SPACE_COLS                                                \
        DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
/*
 * Column stride used by setup_clusters() when it queues clusters on the
 * free/nonfull lists: the larger of the two values above.
 */
#define SWAP_CLUSTER_COLS                                                \
        max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)

/*
 * Allocate and initialise the per-cluster bookkeeping of a swap device.
 *
 * @si:          swap device being set up.  Its cluster lists are
 *               initialised here, and for devices without SWP_SOLIDSTATE
 *               ->global_cluster is allocated as well.
 * @swap_header: validated swap header supplying the bad page list.
 * @maxpages:    number of pages in use, header page included.
 *
 * Returns the new cluster_info array, or ERR_PTR(-ENOMEM) on allocation
 * failure.  On failure a ->global_cluster allocated here is NOT freed by
 * this function; the caller's error path is expected to release it.
 */
static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
                                                union swap_header *swap_header,
                                                unsigned long maxpages)
{
        unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
        struct swap_cluster_info *cluster_info;
        unsigned long i, j, idx;
        int err = -ENOMEM;

        cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
        if (!cluster_info)
                goto err;

        for (i = 0; i < nr_clusters; i++)
                spin_lock_init(&cluster_info[i].lock);

        if (!(si->flags & SWP_SOLIDSTATE)) {
                si->global_cluster = kmalloc(sizeof(*si->global_cluster),
                                     GFP_KERNEL);
                if (!si->global_cluster)
                        goto err_free;
                for (i = 0; i < SWAP_NR_ORDERS; i++)
                        si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
                spin_lock_init(&si->global_cluster_lock);
        }

        /*
         * Mark unusable pages as unavailable. The clusters aren't
         * marked free yet, so no list operations are involved yet.
         *
         * See setup_swap_map_and_extents(): header page, bad pages,
         * and the EOF part of the last cluster.
         */
        inc_cluster_info_page(si, cluster_info, 0);
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                unsigned int page_nr = swap_header->info.badpages[i];

                /*
                 * Bad pages are only validated against last_page, while
                 * maxpages may have been clamped below it.  An entry at
                 * or beyond maxpages has no cluster in cluster_info, so
                 * accounting it would write past the end of the array.
                 * setup_swap_map_and_extents() ignores such entries too.
                 */
                if (page_nr >= maxpages)
                        continue;
                inc_cluster_info_page(si, cluster_info, page_nr);
        }
        for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
                inc_cluster_info_page(si, cluster_info, i);

        INIT_LIST_HEAD(&si->free_clusters);
        INIT_LIST_HEAD(&si->full_clusters);
        INIT_LIST_HEAD(&si->discard_clusters);

        for (i = 0; i < SWAP_NR_ORDERS; i++) {
                INIT_LIST_HEAD(&si->nonfull_clusters[i]);
                INIT_LIST_HEAD(&si->frag_clusters[i]);
                atomic_long_set(&si->frag_cluster_nr[i], 0);
        }

        /*
         * Reduce false cache line sharing between cluster_info and
         * sharing same address space: clusters are queued column by
         * column, so list neighbours are SWAP_CLUSTER_COLS entries apart
         * in the array.
         */
        for (j = 0; j < SWAP_CLUSTER_COLS; j++) {
                for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
                        struct swap_cluster_info *ci;
                        idx = i * SWAP_CLUSTER_COLS + j;
                        ci = cluster_info + idx;
                        /* The last row may be only partially populated. */
                        if (idx >= nr_clusters)
                                continue;
                        /* Clusters with unusable pages start out non-full. */
                        if (ci->count) {
                                ci->flags = CLUSTER_FLAG_NONFULL;
                                list_add_tail(&ci->list, &si->nonfull_clusters[0]);
                                continue;
                        }
                        ci->flags = CLUSTER_FLAG_FREE;
                        list_add_tail(&ci->list, &si->free_clusters);
                }
        }

        return cluster_info;

err_free:
        kvfree(cluster_info);
err:
        return ERR_PTR(err);
}

/*
 * swapon system call: activate the swap area found in @specialfile.
 *
 * @specialfile: user-space pointer to the path of the swap file or device.
 * @swap_flags:  SWAP_FLAG_* bits.  Bits outside SWAP_FLAGS_VALID are
 *               rejected with -EINVAL.
 *
 * The caller needs CAP_SYS_ADMIN (-EPERM otherwise), and swap_avail_heads
 * must have been allocated (-ENOMEM otherwise).
 *
 * Outline: allocate a swap_info_struct, open the file, take the inode
 * lock, read and validate the header page, allocate swap_map / zeromap /
 * cluster_info, derive device flags, optionally discard the area, then
 * publish everything with enable_swap_info() under swapon_mutex.
 *
 * Returns 0 on success or a negative errno.  On failure the labels at the
 * bottom undo the steps already taken.  The labels fall through into each
 * other, so a jump target must be the label matching the last step that
 * succeeded.
 */
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
        struct swap_info_struct *si;
        struct filename *name;
        struct file *swap_file = NULL;
        struct address_space *mapping;
        struct dentry *dentry;
        int prio;
        int error;
        union swap_header *swap_header;
        int nr_extents;
        sector_t span;
        unsigned long maxpages;
        unsigned char *swap_map = NULL;
        unsigned long *zeromap = NULL;
        struct swap_cluster_info *cluster_info = NULL;
        struct folio *folio = NULL;
        struct inode *inode = NULL;
        bool inced_nr_rotate_swap = false;

        if (swap_flags & ~SWAP_FLAGS_VALID)
                return -EINVAL;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (!swap_avail_heads)
                return -ENOMEM;

        si = alloc_swap_info();
        if (IS_ERR(si))
                return PTR_ERR(si);

        INIT_WORK(&si->discard_work, swap_discard_work);
        INIT_WORK(&si->reclaim_work, swap_reclaim_work);

        name = getname(specialfile);
        if (IS_ERR(name)) {
                error = PTR_ERR(name);
                /* Keep the exit path from calling putname() on an error pointer. */
                name = NULL;
                goto bad_swap;
        }
        swap_file = file_open_name(name, O_RDWR | O_LARGEFILE | O_EXCL, 0);
        if (IS_ERR(swap_file)) {
                error = PTR_ERR(swap_file);
                /* Keep bad_swap from calling filp_close() on an error pointer. */
                swap_file = NULL;
                goto bad_swap;
        }

        si->swap_file = swap_file;
        mapping = swap_file->f_mapping;
        dentry = swap_file->f_path.dentry;
        inode = mapping->host;

        error = claim_swapfile(si, inode);
        if (unlikely(error))
                goto bad_swap;

        /* From here on failures must go through bad_swap_unlock_inode. */
        inode_lock(inode);
        if (d_unlinked(dentry) || cant_mount(dentry)) {
                error = -ENOENT;
                goto bad_swap_unlock_inode;
        }
        /* The inode is already in use as swap. */
        if (IS_SWAPFILE(inode)) {
                error = -EBUSY;
                goto bad_swap_unlock_inode;
        }

        /*
         * Read the swap header.
         */
        if (!mapping->a_ops->read_folio) {
                error = -EINVAL;
                goto bad_swap_unlock_inode;
        }
        folio = read_mapping_folio(mapping, 0, swap_file);
        if (IS_ERR(folio)) {
                error = PTR_ERR(folio);
                goto bad_swap_unlock_inode;
        }
        /* Stays mapped until folio_release_kmap() at the out label. */
        swap_header = kmap_local_folio(folio, 0);

        /* read_swap_header() returns 0 for any header it cannot use. */
        maxpages = read_swap_header(si, swap_header, inode);
        if (unlikely(!maxpages)) {
                error = -EINVAL;
                goto bad_swap_unlock_inode;
        }

        /* OK, set up the swap map and apply the bad block list */
        swap_map = vzalloc(maxpages);
        if (!swap_map) {
                error = -ENOMEM;
                goto bad_swap_unlock_inode;
        }

        error = swap_cgroup_swapon(si->type, maxpages);
        if (error)
                goto bad_swap_unlock_inode;

        nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
                                                maxpages, &span);
        if (unlikely(nr_extents < 0)) {
                error = nr_extents;
                goto bad_swap_unlock_inode;
        }

        /*
         * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
         * be above MAX_PAGE_ORDER in case of a large swap file.
         */
        zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
                                    GFP_KERNEL | __GFP_ZERO);
        if (!zeromap) {
                error = -ENOMEM;
                goto bad_swap_unlock_inode;
        }

        if (si->bdev && bdev_stable_writes(si->bdev))
                si->flags |= SWP_STABLE_WRITES;

        if (si->bdev && bdev_synchronous(si->bdev))
                si->flags |= SWP_SYNCHRONOUS_IO;

        if (si->bdev && bdev_nonrot(si->bdev)) {
                si->flags |= SWP_SOLIDSTATE;
        } else {
                /* Remember the increment so the error path can undo it. */
                atomic_inc(&nr_rotate_swap);
                inced_nr_rotate_swap = true;
        }

        /* Must run after SWP_SOLIDSTATE is decided: setup_clusters() tests it. */
        cluster_info = setup_clusters(si, swap_header, maxpages);
        if (IS_ERR(cluster_info)) {
                error = PTR_ERR(cluster_info);
                /* Keep bad_swap from calling kvfree() on an error pointer. */
                cluster_info = NULL;
                goto bad_swap_unlock_inode;
        }

        if ((swap_flags & SWAP_FLAG_DISCARD) &&
            si->bdev && bdev_max_discard_sectors(si->bdev)) {
                /*
                 * When discard is enabled for swap with no particular
                 * policy flagged, we set all swap discard flags here in
                 * order to sustain backward compatibility with older
                 * swapon(8) releases.
                 */
                si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
                             SWP_PAGE_DISCARD);

                /*
                 * By flagging sys_swapon, a sysadmin can tell us to
                 * either do single-time area discards only, or to just
                 * perform discards for released swap page-clusters.
                 * Now it's time to adjust the si->flags accordingly.
                 */
                if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
                        si->flags &= ~SWP_PAGE_DISCARD;
                else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
                        si->flags &= ~SWP_AREA_DISCARD;

                /* issue a swapon-time discard if it's still required */
                if (si->flags & SWP_AREA_DISCARD) {
                        /* A failed discard is only logged; swapon continues. */
                        int err = discard_swap(si);
                        if (unlikely(err))
                                pr_err("swapon: discard_swap(%p): %d\n",
                                        si, err);
                }
        }

        error = init_swap_address_space(si->type, maxpages);
        if (error)
                goto bad_swap_unlock_inode;

        error = zswap_swapon(si->type, maxpages);
        if (error)
                goto free_swap_address_space;

        /*
         * Flush any pending IO and dirty mappings before we start using this
         * swap device.
         */
        inode->i_flags |= S_SWAPFILE;
        error = inode_drain_writes(inode);
        if (error) {
                inode->i_flags &= ~S_SWAPFILE;
                goto free_swap_zswap;
        }

        mutex_lock(&swapon_mutex);
        /* Default priority is -1 unless SWAP_FLAG_PREFER supplies one. */
        prio = -1;
        if (swap_flags & SWAP_FLAG_PREFER)
                prio = swap_flags & SWAP_FLAG_PRIO_MASK;
        enable_swap_info(si, prio, swap_map, cluster_info, zeromap);

        pr_info("Adding %uk swap on %s.  Priority:%d extents:%d across:%lluk %s%s%s%s\n",
                K(si->pages), name->name, si->prio, nr_extents,
                K((unsigned long long)span),
                (si->flags & SWP_SOLIDSTATE) ? "SS" : "",
                (si->flags & SWP_DISCARDABLE) ? "D" : "",
                (si->flags & SWP_AREA_DISCARD) ? "s" : "",
                (si->flags & SWP_PAGE_DISCARD) ? "c" : "");

        mutex_unlock(&swapon_mutex);
        atomic_inc(&proc_poll_event);
        wake_up_interruptible(&proc_poll_wait);

        /* Success: skip the unwind labels; inode is unlocked at out. */
        error = 0;
        goto out;
free_swap_zswap:
        zswap_swapoff(si->type);
free_swap_address_space:
        exit_swap_address_space(si->type);
bad_swap_unlock_inode:
        inode_unlock(inode);
bad_swap:
        kfree(si->global_cluster);
        si->global_cluster = NULL;
        /*
         * The inode is not locked at this point (either never locked or
         * unlocked just above), so clear the pointer to keep the common
         * exit code below from unlocking it.
         */
        inode = NULL;
        destroy_swap_extents(si);
        swap_cgroup_swapoff(si->type);
        spin_lock(&swap_lock);
        si->swap_file = NULL;
        si->flags = 0;
        spin_unlock(&swap_lock);
        vfree(swap_map);
        kvfree(zeromap);
        kvfree(cluster_info);
        if (inced_nr_rotate_swap)
                atomic_dec(&nr_rotate_swap);
        if (swap_file)
                filp_close(swap_file, NULL);
out:
        /* folio is NULL if never read and an error pointer if the read failed. */
        if (!IS_ERR_OR_NULL(folio))
                folio_release_kmap(folio, swap_header);
        if (name)
                putname(name);
        if (inode)
                inode_unlock(inode);
        return error;
}

/*
 * Fill the swap totals of @val.
 *
 * Devices that are SWP_USED but no longer SWP_WRITEOK have their in-use
 * pages added on top of the global counters, for both ->freeswap and
 * ->totalswap.  Everything is sampled under swap_lock.
 */
void si_swapinfo(struct sysinfo *val)
{
        unsigned long pending = 0;
        unsigned int idx;

        spin_lock(&swap_lock);

        for (idx = 0; idx < nr_swapfiles; idx++) {
                struct swap_info_struct *si = swap_info[idx];

                if (!(si->flags & SWP_USED))
                        continue;
                if (si->flags & SWP_WRITEOK)
                        continue;

                pending += swap_usage_in_pages(si);
        }

        val->totalswap = total_swap_pages + pending;
        val->freeswap = atomic_long_read(&nr_swap_pages) + pending;

        spin_unlock(&swap_lock);
}

/*
 * Check that @nr consecutive swap entries starting at @entry are valid and
 * take a reference of kind @usage on each of them.
 *
 * @usage is SWAP_HAS_CACHE to claim the swap cache bit; any other value is
 * added to the swap map count.
 *
 * The work is done in two passes under the cluster lock: the first pass
 * only validates, the second only updates, so a validation failure leaves
 * every entry untouched.
 *
 * Return values:
 * - 0        success
 * - -EINVAL  no swap device for @entry, or a count is above SWAP_MAP_MAX
 * - -ENOENT  an entry is SWAP_MAP_BAD, or has neither count nor cache bit
 * - -EEXIST  the cache bit was requested but is already set
 * - -ENOMEM  a count reference needs a swap count continuation
 */
static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
{
        struct swap_info_struct *si = swp_swap_info(entry);
        struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned char count, has_cache;
        int err = 0;
        int i;

        if (WARN_ON_ONCE(!si)) {
                pr_err("%s%08lx\n", Bad_file, entry.val);
                return -EINVAL;
        }

        offset = swp_offset(entry);
        VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
        VM_WARN_ON(usage == 1 && nr > 1);
        ci = lock_cluster(si, offset);

        /* Pass 1: validate every entry, stopping at the first problem. */
        for (i = 0; i < nr; i++) {
                count = si->swap_map[offset + i];

                /*
                 * swapin_readahead() does not check that a swap entry is
                 * valid, so SWAP_MAP_BAD may show up here.  Check it with
                 * the lock held.
                 */
                if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
                        err = -ENOENT;
                        break;
                }

                has_cache = count & SWAP_HAS_CACHE;
                count &= ~SWAP_HAS_CACHE;

                if (!has_cache && !count)
                        err = -ENOENT;
                else if (usage == SWAP_HAS_CACHE)
                        err = has_cache ? -EEXIST : 0;
                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
                        err = -EINVAL;

                if (err)
                        break;
        }
        if (err)
                goto unlock_out;

        /* Pass 2: apply the reference to every entry. */
        for (i = 0; i < nr; i++) {
                unsigned long pos = offset + i;

                count = si->swap_map[pos];
                has_cache = count & SWAP_HAS_CACHE;
                count &= ~SWAP_HAS_CACHE;

                if (usage == SWAP_HAS_CACHE) {
                        has_cache = SWAP_HAS_CACHE;
                } else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) {
                        count += usage;
                } else if (swap_count_continued(si, pos, count)) {
                        count = COUNT_CONTINUED;
                } else {
                        /*
                         * Nothing to roll back: usage == 1 implies
                         * nr == 1, so no earlier entry was modified.
                         */
                        err = -ENOMEM;
                        break;
                }

                WRITE_ONCE(si->swap_map[pos], count | has_cache);
        }

unlock_out:
        unlock_cluster(ci);
        return err;
}

/*
 * Tag @nr swap entries starting at @entry as belonging to shmem/tmpfs by
 * applying SWAP_MAP_SHMEM to them; this helps swapoff, since the reference
 * count of such entries is never incremented.  The result of
 * __swap_duplicate() is not reported to the caller.
 */
void swap_shmem_alloc(swp_entry_t entry, int nr)
{
        (void)__swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
}

/*
 * Add one reference to the swap entry @entry.
 *
 * Whenever __swap_duplicate() reports -ENOMEM, a swap count continuation
 * is allocated atomically and the increment is retried.
 *
 * Returns 0 on success, or the error of add_swap_count_continuation() when
 * the continuation could not be set up.  Any other __swap_duplicate()
 * failure (-EINVAL or -ENOENT, e.g. from a corrupted page table entry) is
 * deliberately reported as 0 as well.
 */
int swap_duplicate(swp_entry_t entry)
{
        int err;

        for (;;) {
                if (__swap_duplicate(entry, 1, 1) != -ENOMEM)
                        return 0;

                err = add_swap_count_continuation(entry, GFP_ATOMIC);
                if (err)
                        return err;
        }
}

/*
 * Claim the swap cache bit of @nr existing swap entries.
 *
 * @entry: first of the @nr consecutive swap entries.
 *
 * Returns 0 on success or a negative error code; -EEXIST means a swap
 * cache already exists.  Unlike swap_duplicate(), every error from
 * __swap_duplicate() is passed back to the caller.
 */
int swapcache_prepare(swp_entry_t entry, int nr)
{
        int err = __swap_duplicate(entry, SWAP_HAS_CACHE, nr);

        return err;
}

/*
 * Release the SWAP_HAS_CACHE reference on @nr swap entries of @si,
 * starting at the offset encoded in @entry.
 */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
        cluster_swap_free_nr(si, swp_offset(entry), nr, SWAP_HAS_CACHE);
}

/* Look up the swap device that the type field of @entry refers to. */
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        unsigned int type = swp_type(entry);

        return swap_type_to_swap_info(type);
}

/*
 * out-of-line methods to avoid include hell.
 */
struct address_space *swapcache_mapping(struct folio *folio)
{
        return swp_swap_info(folio->swap)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(swapcache_mapping);

/* Return the swap cache index of @folio's swap entry. */
pgoff_t __folio_swap_cache_index(struct folio *folio)
{
        pgoff_t index = swap_cache_index(folio->swap);

        return index;
}
EXPORT_SYMBOL_GPL(__folio_swap_cache_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 *
 * Returns 0 when a continuation page was attached or turned out not to be
 * needed (including when the swap device is gone), and -ENOMEM when a page
 * was needed but could not be allocated.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
        struct swap_info_struct *si;
        struct swap_cluster_info *ci;
        struct page *head;
        struct page *page;
        struct page *list_page;
        pgoff_t offset;
        unsigned char count;
        int ret = 0;

        /*
         * When debugging, it's easier to use __GFP_ZERO here; but it's better
         * for latency not to zero a page while GFP_ATOMIC and holding locks.
         *
         * The page is allocated before any lock is taken.  A failed
         * allocation is only reported further down, once it is known that
         * the page is actually required.
         */
        page = alloc_page(gfp_mask | __GFP_HIGHMEM);

        si = get_swap_device(entry);
        if (!si) {
                /*
                 * An acceptable race has occurred since the failing
                 * __swap_duplicate(): the swap device may be swapoff
                 */
                goto outer;
        }

        offset = swp_offset(entry);

        ci = lock_cluster(si, offset);

        count = swap_count(si->swap_map[offset]);

        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
                /*
                 * The higher the swap count, the more likely it is that tasks
                 * will race to add swap count continuation: we need to avoid
                 * over-provisioning.
                 */
                goto out;
        }

        if (!page) {
                ret = -ENOMEM;
                goto out;
        }

        /* Head of the continuation list is the swap_map page of this entry. */
        head = vmalloc_to_page(si->swap_map + offset);
        /* From here on, offset is the byte position inside a page. */
        offset &= ~PAGE_MASK;

        spin_lock(&si->cont_lock);
        /*
         * Page allocation does not initialize the page's lru field,
         * but it does always reset its private field.
         */
        if (!page_private(head)) {
                BUG_ON(count & COUNT_CONTINUED);
                INIT_LIST_HEAD(&head->lru);
                set_page_private(head, SWP_CONTINUED);
                si->flags |= SWP_CONTINUED;
        }

        /*
         * Walk the existing continuation pages.  count starts as the
         * swap_map value and is then replaced by each page's byte in turn,
         * so on every iteration it holds the previous level's value.
         */
        list_for_each_entry(list_page, &head->lru, lru) {
                unsigned char *map;

                /*
                 * If the previous map said no continuation, but we've found
                 * a continuation page, free our allocation and use this one.
                 */
                if (!(count & COUNT_CONTINUED))
                        goto out_unlock_cont;

                map = kmap_local_page(list_page) + offset;
                count = *map;
                kunmap_local(map);

                /*
                 * If this continuation count now has some space in it,
                 * free our allocation and use this one.
                 */
                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
                        goto out_unlock_cont;
        }

        /* Every existing level is full: append the new page. */
        list_add_tail(&page->lru, &head->lru);
        page = NULL;                        /* now it's attached, don't free it */
out_unlock_cont:
        spin_unlock(&si->cont_lock);
out:
        unlock_cluster(ci);
        put_swap_device(si);
outer:
        /* Free the preallocated page if it was not attached. */
        if (page)
                __free_page(page);
        return ret;
}

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or caller of __swap_entry_free_locked()
 * holds cluster lock.
 *
 * @si:     swap device owning the entry.
 * @offset: offset of the entry in si->swap_map.
 * @count:  current swap_map count of the entry without SWAP_HAS_CACHE.  It
 *          selects the operation: SWAP_MAP_MAX or
 *          (SWAP_MAP_MAX | COUNT_CONTINUED) increments; anything else must
 *          be exactly COUNT_CONTINUED and decrements.
 *
 * Incrementing: returns true when the carry was stored, false when a new
 * continuation page has to be added first.
 * Decrementing: returns true when the continuation still holds a count
 * after the borrow, false when it is exhausted.
 */
static bool swap_count_continued(struct swap_info_struct *si,
                                 pgoff_t offset, unsigned char count)
{
        struct page *head;
        struct page *page;
        unsigned char *map;
        bool ret;

        head = vmalloc_to_page(si->swap_map + offset);
        if (page_private(head) != SWP_CONTINUED) {
                BUG_ON(count & COUNT_CONTINUED);
                return false;                /* need to add count continuation */
        }

        spin_lock(&si->cont_lock);
        /* Byte position of this entry inside each continuation page. */
        offset &= ~PAGE_MASK;
        /* Start at the first continuation page, the lowest "digit". */
        page = list_next_entry(head, lru);
        map = kmap_local_page(page) + offset;

        if (count == SWAP_MAP_MAX)        /* initial increment from swap_map */
                goto init_map;                /* jump over SWAP_CONT_MAX checks */

        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
                /*
                 * Think of how you add 1 to 999
                 */
                /* Skip digits that are full and have a further page above. */
                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_local_page(page) + offset;
                }
                /* Highest digit in use is full as well: needs one more page. */
                if (*map == SWAP_CONT_MAX) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        if (page == head) {
                                ret = false;        /* add count continuation */
                                goto out;
                        }
                        map = kmap_local_page(page) + offset;
init_map:                *map = 0;                /* we didn't zero the page */
                }
                *map += 1;
                kunmap_local(map);
                /* Walk back down, resetting each lower digit to COUNT_CONTINUED. */
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_local_page(page) + offset;
                        *map = COUNT_CONTINUED;
                        kunmap_local(map);
                }
                ret = true;                        /* incremented */

        } else {                                /* decrementing */
                /*
                 * Think of how you subtract 1 from 1000
                 */
                BUG_ON(count != COUNT_CONTINUED);
                /* Skip digits that are zero; a non-zero digit must exist above. */
                while (*map == COUNT_CONTINUED) {
                        kunmap_local(map);
                        page = list_next_entry(page, lru);
                        BUG_ON(page == head);
                        map = kmap_local_page(page) + offset;
                }
                BUG_ON(*map == 0);
                *map -= 1;
                /*
                 * If that digit dropped to zero, the page directly below it
                 * is written without COUNT_CONTINUED.
                 */
                if (*map == 0)
                        count = 0;
                kunmap_local(map);
                /* Walk back down, refilling each lower digit to SWAP_CONT_MAX. */
                while ((page = list_prev_entry(page, lru)) != head) {
                        map = kmap_local_page(page) + offset;
                        *map = SWAP_CONT_MAX | count;
                        count = COUNT_CONTINUED;
                        kunmap_local(map);
                }
                /* False only when the first continuation page itself hit zero. */
                ret = count == COUNT_CONTINUED;
        }
out:
        spin_unlock(&si->cont_lock);
        return ret;
}

/*
 * free_swap_count_continuations - release every continuation page attached
 * to the swap_map of @si.  Used by swapoff once the swap_map is quiesced
 * and before it is vfree'd.
 */
static void free_swap_count_continuations(struct swap_info_struct *si)
{
        pgoff_t offset = 0;

        /* One swap_map page covers PAGE_SIZE entries, so step page by page. */
        while (offset < si->max) {
                struct page *head = vmalloc_to_page(si->swap_map + offset);
                struct page *page, *next;

                offset += PAGE_SIZE;

                /* A zero private field means no continuation was ever added. */
                if (!page_private(head))
                        continue;

                list_for_each_entry_safe(page, next, &head->lru, lru) {
                        list_del(&page->lru);
                        __free_page(page);
                }
        }
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
static bool __has_usable_swap(void)
{
        return !plist_head_empty(&swap_active_head);
}

/*
 * Schedule a block cgroup throttle on the first available swap device of
 * @folio's node that has a block device.
 *
 * Nothing is done when @gfp lacks __GFP_IO, when no swap is active, when
 * the block cgroup is not congested, or when the current task already has
 * a throttle scheduled.
 */
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        struct swap_info_struct *si, *next;
        int nid = folio_nid(folio);

        if (!(gfp & __GFP_IO) || !__has_usable_swap() ||
            !blk_cgroup_congested())
                return;

        /*
         * A throttle is already scheduled for this task, so avoid taking
         * the global swap lock.
         */
        if (current->throttle_disk)
                return;

        spin_lock(&swap_avail_lock);
        plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
                                  avail_lists[nid]) {
                if (!si->bdev)
                        continue;

                blkcg_schedule_throttle(si->bdev->bd_disk, true);
                break;
        }
        spin_unlock(&swap_avail_lock);
}
#endif

/*
 * Boot-time setup: allocate one swap_avail_heads list head per node id and
 * record the maximum swap file size.
 *
 * Returns 0 on success, or -ENOMEM when the list heads cannot be
 * allocated; swap_avail_heads then stays NULL.
 */
static int __init swapfile_init(void)
{
        int nid;

        swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
                                         GFP_KERNEL);
        if (swap_avail_heads == NULL)
                goto no_heads;

        for_each_node(nid)
                plist_head_init(&swap_avail_heads[nid]);

        swapfile_maximum_size = arch_max_swapfile_size();

#ifdef CONFIG_MIGRATION
        /* Only set when the offset field has room for SWP_MIG_TOTAL_BITS. */
        if (swapfile_maximum_size >= (1UL << SWP_MIG_TOTAL_BITS))
                swap_migration_ad_supported = true;
#endif        /* CONFIG_MIGRATION */

        return 0;

no_heads:
        pr_emerg("Not enough memory for swap heads, swap is disabled\n");
        return -ENOMEM;
}
subsys_initcall(swapfile_init);





















  323 



    1 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// SPDX-License-Identifier: GPL-2.0-only
/*
 * AArch64-specific system calls implementation
 *
 * Copyright (C) 2012 ARM Ltd.
 * Author: Catalin Marinas <catalin.marinas@arm.com>
 */

#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/syscalls.h>

#include <asm/cpufeature.h>
#include <asm/syscall.h>

/*
 * mmap() entry point: @off is a byte offset and must be page aligned; it is
 * converted to a page offset before being handed to ksys_mmap_pgoff().
 * Returns -EINVAL for an unaligned offset.
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, off)
{
        unsigned long pgoff;

        if (offset_in_page(off))
                return -EINVAL;

        pgoff = off >> PAGE_SHIFT;

        return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

/*
 * personality() entry point: refuse PER_LINUX32 when the system does not
 * support 32-bit EL0, otherwise defer to ksys_personality().
 */
SYSCALL_DEFINE1(arm64_personality, unsigned int, personality)
{
        bool wants_linux32 = personality(personality) == PER_LINUX32;

        /* The capability is only queried when PER_LINUX32 was requested. */
        if (wants_linux32 && !system_supports_32bit_el0())
                return -EINVAL;

        return ksys_personality(personality);
}

asmlinkage long sys_ni_syscall(void);

/*
 * pt_regs-style wrapper around sys_ni_syscall(); the register state is
 * ignored. Used below as the default entry of sys_call_table.
 */
asmlinkage long __arm64_sys_ni_syscall(const struct pt_regs *__unused)
{
        return sys_ni_syscall();
}

/*
 * Wrappers to pass the pt_regs argument.
 *
 * Table entries that name sys_personality are redirected to the
 * arm64_personality syscall defined above (presumably the table header refers
 * to the generic name; confirm against asm/syscall_table_64.h).
 */
#define __arm64_sys_personality                __arm64_sys_arm64_personality

/* Only the native handler is used here; the compat argument is dropped. */
#define __SYSCALL_WITH_COMPAT(nr, native, compat)  __SYSCALL(nr, native)

/* First pass over the table header: emit a prototype for every handler. */
#undef __SYSCALL
#define __SYSCALL(nr, sym)        asmlinkage long __arm64_##sym(const struct pt_regs *);
#include <asm/syscall_table_64.h>

/* Second pass: emit one designated initializer per syscall number. */
#undef __SYSCALL
#define __SYSCALL(nr, sym)        [nr] = __arm64_##sym,

/*
 * Every slot first defaults to __arm64_sys_ni_syscall; the initializers
 * generated from the table header then override the listed entries.
 */
const syscall_fn_t sys_call_table[__NR_syscalls] = {
        [0 ... __NR_syscalls - 1] = __arm64_sys_ni_syscall,
#include <asm/syscall_table_64.h>
};















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 






























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 */

/* Devmap's primary use is as a backend map for XDP BPF helper call
 * bpf_redirect_map(). Because XDP is mostly concerned with performance we
 * spent some effort to ensure the datapath with redirect maps does not use
 * any locking. This is a quick note on the details.
 *
 * We have three possible paths to get into the devmap control plane: bpf
 * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
 * will invoke an update, delete, or lookup operation. To ensure updates and
 * deletes appear atomic from the datapath side xchg() is used to modify the
 * netdev_map array. Then because the datapath does a lookup into the netdev_map
 * array (read-only) from an RCU critical section we use call_rcu() to wait for
 * an rcu grace period before free'ing the old data structures. This ensures the
 * datapath always has a valid copy. However, the datapath does a "flush"
 * operation that pushes any pending packets in the driver outside the RCU
 * critical section. Each bpf_dtab_netdev tracks these pending operations using
 * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
 * this list is empty, indicating outstanding flush operations have completed.
 *
 * BPF syscalls may race with BPF program calls on any of the update, delete
 * or lookup operations. As noted above the xchg() operation also keeps the
 * netdev_map consistent in this case. From the devmap side BPF programs
 * calling into these operations are the same as multiple user space threads
 * making system calls.
 *
 * Finally, any of the above may race with a netdev_unregister notifier. The
 * unregister notifier must search for net devices in the map structure that
 * contain a reference to the net device and remove them. This is a two step
 * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
 * check to see if the ifindex is the same as the net_device being removed.
 * When removing the dev a cmpxchg() is used to ensure the correct dev is
 * removed, in the case of a concurrent update or delete operation it is
 * possible that the initially referenced dev is no longer in the map. As the
 * notifier hook walks the map we know that new dev references can not be
 * added by the user because core infrastructure ensures dev_get_by_index()
 * calls will fail at this point.
 *
 * The devmap_hash type is a map type which interprets keys as ifindexes and
 * indexes these using a hashmap. This allows maps that use ifindex as key to be
 * densely packed instead of having holes in the lookup array for unused
 * ifindexes. The setup and packet enqueue/send code is shared between the two
 * types of devmap; only the lookup and insertion is different.
 */
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>
#include <linux/btf_ids.h>

#define DEV_CREATE_FLAG_MASK \
        (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

/* Per-CPU batch of XDP frames waiting to be transmitted on one device. */
struct xdp_dev_bulk_queue {
        struct xdp_frame *q[DEV_MAP_BULK_SIZE]; /* queued frames, q[0..count-1] */
        struct list_head flush_node; /* links the queue on a flush list while it is pending */
        struct net_device *dev; /* device whose ndo_xdp_xmit() is used in bq_xmit_all() */
        struct net_device *dev_rx; /* ingress device of the current batch; NULL when idle */
        struct bpf_prog *xdp_prog; /* optional program run on the batch before transmit */
        unsigned int count; /* number of valid entries in q[] */
};

/* One devmap entry: a target net_device plus an optional XDP program. */
struct bpf_dtab_netdev {
        struct net_device *dev; /* must be first member, due to tracepoint */
        struct hlist_node index_hlist; /* bucket chain linkage (DEVMAP_HASH) */
        struct bpf_prog *xdp_prog; /* optional program; released with bpf_prog_put() */
        struct rcu_head rcu; /* NOTE(review): presumably for deferred freeing; user is outside this chunk */
        unsigned int idx; /* key this entry is matched against in the hash lookup */
        struct bpf_devmap_val val; /* NOTE(review): looks like the user-visible value; confirm */
};

/* Backing object for both DEVMAP and DEVMAP_HASH map types. */
struct bpf_dtab {
        struct bpf_map map; /* embedded generic map; container_of() recovers the dtab */
        struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
        struct list_head list; /* membership in dev_map_list, under dev_map_lock */

        /* these are only used for DEVMAP_HASH type maps */
        struct hlist_head *dev_index_head; /* array of n_buckets bucket heads */
        spinlock_t index_lock;
        unsigned int items;
        u32 n_buckets; /* power of two, see dev_map_init_map() */
};

static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);

/*
 * Allocate and initialise an array of @entries empty hash bucket heads on
 * @numa_node.
 *
 * Returns the array, or NULL if the allocation fails.
 */
static struct hlist_head *dev_map_create_hash(unsigned int entries,
                                              int numa_node)
{
        struct hlist_head *hash;
        /*
         * Unsigned on purpose: @entries may be as large as 1U << 31, and a
         * signed index would overflow (undefined behaviour) on its final
         * increment.
         */
        unsigned int i;

        hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
        if (!hash)
                return NULL;

        for (i = 0; i < entries; i++)
                INIT_HLIST_HEAD(&hash[i]);

        return hash;
}

static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
                                                    int idx)
{
        return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
}

/*
 * Validate map creation attributes.
 *
 * Two value sizes are supported:
 *   4 bytes: ifindex
 *   8 bytes: ifindex + prog fd
 *
 * Returns 0 if the attributes are acceptable, -EINVAL otherwise.
 */
static int dev_map_alloc_check(union bpf_attr *attr)
{
        u32 valsize = attr->value_size;

        if (attr->max_entries == 0 || attr->key_size != 4)
                return -EINVAL;

        if (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
            valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd))
                return -EINVAL;

        if (attr->map_flags & ~DEV_CREATE_FLAG_MASK)
                return -EINVAL;

        /* Hash table size must be power of 2; roundup_pow_of_two()
         * can overflow into UB on 32-bit arches
         */
        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH &&
            attr->max_entries > 1UL << 31)
                return -EINVAL;

        return 0;
}

static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
{
        /* Lookup returns a pointer straight to dev->ifindex, so make sure the
         * verifier prevents writes from the BPF side
         */
        attr->map_flags |= BPF_F_RDONLY_PROG;
        bpf_map_init_from_attr(&dtab->map, attr);

        if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                /* Hash table size must be power of 2 */
                dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
                dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
                                                           dtab->map.numa_node);
                if (!dtab->dev_index_head)
                        return -ENOMEM;

                spin_lock_init(&dtab->index_lock);
        } else {
                dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
                                                      sizeof(struct bpf_dtab_netdev *),
                                                      dtab->map.numa_node);
                if (!dtab->netdev_map)
                        return -ENOMEM;
        }

        return 0;
}

/*
 * Allocate a devmap, initialise it from @attr and publish it on
 * dev_map_list.
 *
 * Returns the embedded bpf_map, or an ERR_PTR() on failure.
 */
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
        struct bpf_dtab *dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
        int ret;

        if (!dtab)
                return ERR_PTR(-ENOMEM);

        ret = dev_map_init_map(dtab, attr);
        if (ret)
                goto free_dtab;

        spin_lock(&dev_map_lock);
        list_add_tail_rcu(&dtab->list, &dev_map_list);
        spin_unlock(&dev_map_lock);

        return &dtab->map;

free_dtab:
        bpf_map_area_free(dtab);
        return ERR_PTR(ret);
}

/*
 * Tear down a devmap: unlink it from dev_map_list, wait for readers and
 * pending RCU callbacks, then release every remaining entry (program
 * reference, device reference, entry memory) and the map itself.
 */
static void dev_map_free(struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 i;

        /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
         * so the programs (can be more than one that used this map) were
         * disconnected from events. The following synchronize_rcu() guarantees
         * both rcu read critical sections complete and waits for
         * preempt-disable regions (NAPI being the relevant context here) so we
         * are certain there will be no further reads against the netdev_map and
         * all flush operations are complete. Flush operations can only be done
         * from NAPI context for this reason.
         */

        spin_lock(&dev_map_lock);
        list_del_rcu(&dtab->list);
        spin_unlock(&dev_map_lock);

        /* bpf_redirect_info->map is assigned in __bpf_xdp_redirect_map()
         * during NAPI callback and cleared after the XDP redirect. There is no
         * explicit RCU read section which protects bpf_redirect_info->map but
         * local_bh_disable() also marks the beginning of an RCU section. This
         * makes the complete softirq callback RCU protected. Thus after the
         * following synchronize_rcu() there is no bpf_redirect_info->map == map
         * assignment.
         */
        synchronize_rcu();

        /* Make sure prior __dev_map_entry_free() have completed. */
        rcu_barrier();

        if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                /* Walk every bucket and release each chained entry. */
                for (i = 0; i < dtab->n_buckets; i++) {
                        struct bpf_dtab_netdev *dev;
                        struct hlist_head *head;
                        struct hlist_node *next;

                        head = dev_map_index_hash(dtab, i);

                        hlist_for_each_entry_safe(dev, next, head, index_hlist) {
                                hlist_del_rcu(&dev->index_hlist);
                                if (dev->xdp_prog)
                                        bpf_prog_put(dev->xdp_prog);
                                dev_put(dev->dev);
                                kfree(dev);
                        }
                }

                bpf_map_area_free(dtab->dev_index_head);
        } else {
                /* Array type: release every populated slot. */
                for (i = 0; i < dtab->map.max_entries; i++) {
                        struct bpf_dtab_netdev *dev;

                        dev = rcu_dereference_raw(dtab->netdev_map[i]);
                        if (!dev)
                                continue;

                        if (dev->xdp_prog)
                                bpf_prog_put(dev->xdp_prog);
                        dev_put(dev->dev);
                        kfree(dev);
                }

                bpf_map_area_free(dtab->netdev_map);
        }

        bpf_map_area_free(dtab);
}

static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 index = key ? *(u32 *)key : U32_MAX;
        u32 *next = next_key;

        if (index >= dtab->map.max_entries) {
                *next = 0;
                return 0;
        }

        if (index == dtab->map.max_entries - 1)
                return -ENOENT;
        *next = index + 1;
        return 0;
}

/* Look up the entry whose idx equals @key in a DEVMAP_HASH map.
 *
 * Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The lockdep condition
 * handed to the list iterator additionally accepts callers that hold
 * dtab->index_lock.
 *
 * Returns the matching entry or NULL.
 */
static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct hlist_head *bucket = dev_map_index_hash(dtab, key);
        struct bpf_dtab_netdev *entry;

        hlist_for_each_entry_rcu(entry, bucket, index_hlist,
                                 lockdep_is_held(&dtab->index_lock)) {
                if (entry->idx == key)
                        return entry;
        }

        return NULL;
}

/*
 * Key iteration for DEVMAP_HASH maps.
 *
 * With a NULL @key, or a key that is not present, iteration starts at the
 * first entry of the first non-empty bucket. Otherwise the successor is the
 * next entry in the same bucket chain or, if @key was last in its chain, the
 * first entry of the next non-empty bucket.
 *
 * Returns 0 with *@next_key set, or -ENOENT when no further entry exists.
 */
static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
                                    void *next_key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u32 idx, *next = next_key;
        struct bpf_dtab_netdev *dev, *next_dev;
        struct hlist_head *head;
        int i = 0;

        if (!key)
                goto find_first;

        idx = *(u32 *)key;

        dev = __dev_map_hash_lookup_elem(map, idx);
        if (!dev)
                goto find_first;

        /* Try the next entry chained in the same bucket first. */
        next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
                                    struct bpf_dtab_netdev, index_hlist);

        if (next_dev) {
                *next = next_dev->idx;
                return 0;
        }

        /* End of this chain: continue the scan from the following bucket. */
        i = idx & (dtab->n_buckets - 1);
        i++;

 find_first:
        /* Return the head entry of the first non-empty bucket at or after i. */
        for (; i < dtab->n_buckets; i++) {
                head = dev_map_index_hash(dtab, i);

                next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
                                            struct bpf_dtab_netdev,
                                            index_hlist);
                if (next_dev) {
                        *next = next_dev->idx;
                        return 0;
                }
        }

        return -ENOENT;
}

/*
 * Run @xdp_prog on each of the @n frames in @frames.
 *
 * Frames for which the program returns XDP_PASS (and whose frame metadata can
 * be updated from the buff) are kept and compacted to the front of @frames;
 * all other frames are returned via xdp_return_frame_rx_napi().
 *
 * Returns the number of frames left at the front of @frames.
 */
static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
                                struct xdp_frame **frames, int n,
                                struct net_device *tx_dev,
                                struct net_device *rx_dev)
{
        struct xdp_txq_info txq = { .dev = tx_dev };
        struct xdp_rxq_info rxq = { .dev = rx_dev };
        struct xdp_buff xdp;
        int i, nframes = 0;

        for (i = 0; i < n; i++) {
                struct xdp_frame *xdpf = frames[i];
                u32 act;
                int err;

                xdp_convert_frame_to_buff(xdpf, &xdp);
                xdp.txq = &txq;
                xdp.rxq = &rxq;

                act = bpf_prog_run_xdp(xdp_prog, &xdp);
                switch (act) {
                case XDP_PASS:
                        /* Propagate any changes the program made to the buff. */
                        err = xdp_update_frame_from_buff(&xdp, xdpf);
                        if (unlikely(err < 0))
                                xdp_return_frame_rx_napi(xdpf);
                        else
                                frames[nframes++] = xdpf;
                        break;
                default:
                        /* Unknown action: warn, then trace and drop like XDP_ABORTED. */
                        bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
                        fallthrough;
                case XDP_ABORTED:
                        trace_xdp_exception(tx_dev, xdp_prog, act);
                        fallthrough;
                case XDP_DROP:
                        xdp_return_frame_rx_napi(xdpf);
                        break;
                }
        }
        return nframes; /* sent frames count */
}

/*
 * Transmit everything queued in @bq on bq->dev.
 *
 * If the queue has an XDP program attached it is run first and may drop
 * frames. Frames the driver does not accept are returned here. The queue is
 * left empty and the result is reported through the xdp_devmap_xmit
 * tracepoint.
 */
static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
{
        struct net_device *dev = bq->dev;
        unsigned int cnt = bq->count;
        int sent = 0, err = 0;
        int to_send = cnt;
        int i;

        if (unlikely(!cnt))
                return;

        for (i = 0; i < cnt; i++) {
                struct xdp_frame *xdpf = bq->q[i];

                prefetch(xdpf);
        }

        if (bq->xdp_prog) {
                /* The program may drop frames; survivors sit at the front of q[]. */
                to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx);
                if (!to_send)
                        goto out;
        }

        sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
        if (sent < 0) {
                /* If ndo_xdp_xmit fails with an errno, no frames have
                 * been xmit'ed.
                 */
                err = sent;
                sent = 0;
        }

        /* If not all frames have been transmitted, it is our
         * responsibility to free them
         */
        for (i = sent; unlikely(i < to_send); i++)
                xdp_return_frame_rx_napi(bq->q[i]);

out:
        bq->count = 0;
        /* cnt - sent counts both program drops and frames the driver refused. */
        trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
}

/* Flush every bulk queue on @flush_list: transmit its frames with
 * XDP_XMIT_FLUSH, clear its per-batch state and unlink it from the list.
 *
 * __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
 * driver before returning from its napi->poll() routine. See the comment above
 * xdp_do_flush() in filter.c.
 */
void __dev_flush(struct list_head *flush_list)
{
        struct xdp_dev_bulk_queue *cur, *nxt;

        list_for_each_entry_safe(cur, nxt, flush_list, flush_node) {
                bq_xmit_all(cur, XDP_XMIT_FLUSH);

                /* A NULL dev_rx marks the queue as idle for bq_enqueue(). */
                cur->xdp_prog = NULL;
                cur->dev_rx = NULL;
                __list_del_clearprev(&cur->flush_node);
        }
}

/* Array-type lookup: return the entry stored at index @key, or NULL if the
 * key is out of range or the slot is empty.
 *
 * Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
 * by local_bh_disable() (from XDP calls inside NAPI). The
 * rcu_read_lock_bh_held() below makes lockdep accept both.
 */
static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
        struct bpf_dtab *dtab;

        if (key >= map->max_entries)
                return NULL;

        dtab = container_of(map, struct bpf_dtab, map);

        return rcu_dereference_check(dtab->netdev_map[key],
                                     rcu_read_lock_bh_held());
}

/* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
 * variable access, and map elements stick around. See comment above
 * xdp_do_flush() in filter.c.
 */
static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                       struct net_device *dev_rx, struct bpf_prog *xdp_prog)
{
        struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);

        if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
                bq_xmit_all(bq, 0);

        /* Ingress dev_rx will be the same for all xdp_frame's in
         * bulk_queue, because bq stored per-CPU and must be flushed
         * from net_device drivers NAPI func end.
         *
         * Do the same with xdp_prog and flush_list since these fields
         * are only ever modified together.
         */
        if (!bq->dev_rx) {
                struct list_head *flush_list = bpf_net_ctx_get_dev_flush_list();

                bq->dev_rx = dev_rx;
                bq->xdp_prog = xdp_prog;
                list_add(&bq->flush_node, flush_list);
        }

        bq->q[bq->count++] = xdpf;
}

/*
 * Check that @dev can transmit @xdpf and, if so, queue it.
 *
 * Returns -EOPNOTSUPP if the device lacks ndo_xdp_xmit support (or
 * scatter-gather support for a fragmented frame), the error from
 * xdp_ok_fwd_dev() if forwarding is not allowed, or 0 once queued.
 */
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                                struct net_device *dev_rx,
                                struct bpf_prog *xdp_prog)
{
        int ret;

        if (!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
                return -EOPNOTSUPP;

        if (unlikely(!(dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
                     xdp_frame_has_frags(xdpf)))
                return -EOPNOTSUPP;

        ret = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
        if (likely(!ret))
                bq_enqueue(dev, xdpf, dev_rx, xdp_prog);

        return ret;
}

/*
 * Generic (skb) XDP path: run the program attached to @dst, if any, on @skb.
 *
 * Returns the program's verdict, or XDP_PASS when no program is attached.
 * For any verdict other than XDP_PASS the skb has already been freed here.
 */
static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
{
        struct xdp_txq_info txq = { .dev = dst->dev };
        struct xdp_buff xdp;
        u32 act;

        if (!dst->xdp_prog)
                return XDP_PASS;

        /* Pull mac_len bytes before the run; pushed back below on XDP_PASS. */
        __skb_pull(skb, skb->mac_len);
        xdp.txq = &txq;

        act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
        switch (act) {
        case XDP_PASS:
                __skb_push(skb, skb->mac_len);
                break;
        default:
                /* Unknown action: warn, then trace and drop like XDP_ABORTED. */
                bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception(dst->dev, dst->xdp_prog, act);
                fallthrough;
        case XDP_DROP:
                kfree_skb(skb);
                break;
        }

        return act;
}

/*
 * Queue @xdpf for transmission on @dev with no devmap program attached.
 * Returns 0 on success or the negative errno from __xdp_enqueue().
 */
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
                    struct net_device *dev_rx)
{
        return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}

/*
 * Queue @xdpf for transmission on the device of devmap entry @dst, using
 * the entry's XDP program (which may be NULL).
 * Returns 0 on success or the negative errno from __xdp_enqueue().
 */
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
                    struct net_device *dev_rx)
{
        return __xdp_enqueue(dst->dev, xdpf, dev_rx, dst->xdp_prog);
}

static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
        if (!obj)
                return false;

        if (!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT))
                return false;

        if (unlikely(!(obj->dev->xdp_features & NETDEV_XDP_ACT_NDO_XMIT_SG) &&
                     xdp_frame_has_frags(xdpf)))
                return false;

        if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
                return false;

        return true;
}

static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
                                 struct net_device *dev_rx,
                                 struct xdp_frame *xdpf)
{
        struct xdp_frame *nxdpf;

        nxdpf = xdpf_clone(xdpf);
        if (!nxdpf)
                return -ENOMEM;

        bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);

        return 0;
}

/*
 * Return true if @ifindex appears among the first @num_excluded entries of
 * @excluded. The list is scanned from the last entry towards the first.
 */
static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
{
        int pos;

        for (pos = num_excluded - 1; pos >= 0; pos--) {
                if (excluded[pos] == ifindex)
                        return true;
        }

        return false;
}

/* Get ifindex of each upper device. 'indexes' must be able to hold at
 * least MAX_NEST_DEV elements.
 *
 * A device may have more immediate upper devices than that, so the walk
 * stops once MAX_NEST_DEV entries have been stored rather than writing past
 * the end of the caller's buffer. Uppers beyond the limit are not reported.
 *
 * Returns the number of ifindexes added (at most MAX_NEST_DEV).
 */
static int get_upper_ifindexes(struct net_device *dev, int *indexes)
{
        struct net_device *upper;
        struct list_head *iter;
        int n = 0;

        netdev_for_each_upper_dev_rcu(dev, upper, iter) {
                /* Never store more than the documented buffer capacity. */
                if (n >= MAX_NEST_DEV)
                        break;
                indexes[n++] = upper->ifindex;
        }
        return n;
}

/*
 * Queue @xdpf on every valid destination in @map.
 *
 * With @exclude_ingress set, the ingress device @dev_rx and its upper devices
 * are skipped. Each destination except the last one found receives a clone;
 * the last one consumes the original frame. If no destination qualifies the
 * frame is returned.
 *
 * Returns 0 on success, or the error from dev_map_enqueue_clone(); in the
 * error case the original @xdpf has not been consumed here.
 */
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
                          struct bpf_map *map, bool exclude_ingress)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dst, *last_dst = NULL;
        int excluded_devices[1+MAX_NEST_DEV];
        struct hlist_head *head;
        int num_excluded = 0;
        unsigned int i;
        int err;

        if (exclude_ingress) {
                /* Upper devices first, then the ingress device itself. */
                num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
                excluded_devices[num_excluded++] = dev_rx->ifindex;
        }

        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                for (i = 0; i < map->max_entries; i++) {
                        dst = rcu_dereference_check(dtab->netdev_map[i],
                                                    rcu_read_lock_bh_held());
                        if (!is_valid_dst(dst, xdpf))
                                continue;

                        if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
                                continue;

                        /* we only need n-1 clones; last_dst enqueued below */
                        if (!last_dst) {
                                last_dst = dst;
                                continue;
                        }

                        /* Send a clone to the previous match, remember this one. */
                        err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
                        if (err)
                                return err;

                        last_dst = dst;
                }
        } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
                for (i = 0; i < dtab->n_buckets; i++) {
                        head = dev_map_index_hash(dtab, i);
                        hlist_for_each_entry_rcu(dst, head, index_hlist,
                                                 lockdep_is_held(&dtab->index_lock)) {
                                if (!is_valid_dst(dst, xdpf))
                                        continue;

                                if (is_ifindex_excluded(excluded_devices, num_excluded,
                                                        dst->dev->ifindex))
                                        continue;

                                /* we only need n-1 clones; last_dst enqueued below */
                                if (!last_dst) {
                                        last_dst = dst;
                                        continue;
                                }

                                /* Send a clone to the previous match, remember this one. */
                                err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
                                if (err)
                                        return err;

                                last_dst = dst;
                        }
                }
        }

        /* consume the last copy of the frame */
        if (last_dst)
                bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
        else
                xdp_return_frame_rx_napi(xdpf); /* dtab is empty */

        return 0;
}

/*
 * Generic (skb) XDP redirect to devmap entry @dst.
 *
 * Returns the error from xdp_ok_fwd_dev() if the skb may not be forwarded
 * to the target device; the skb is then left untouched. Otherwise returns 0,
 * whether the skb was transmitted or dropped by the entry's program.
 */
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
                             const struct bpf_prog *xdp_prog)
{
        int err = xdp_ok_fwd_dev(dst->dev, skb->len);

        if (unlikely(err))
                return err;

        /* Redirect has already succeeded semantically at this point, so we just
         * return 0 even if packet is dropped. Helper below takes care of
         * freeing skb.
         */
        if (dev_map_bpf_prog_run_skb(skb, dst) == XDP_PASS) {
                skb->dev = dst->dev;
                generic_xdp_tx(skb, xdp_prog);
        }

        return 0;
}

/*
 * Redirect a clone of @skb to @dst, leaving @skb itself with the caller.
 *
 * Returns -ENOMEM if the clone cannot be allocated, otherwise whatever
 * dev_map_generic_redirect() returns. When that call fails the clone is
 * released here.
 */
static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
                                  struct sk_buff *skb,
                                  const struct bpf_prog *xdp_prog)
{
        struct sk_buff *copy = skb_clone(skb, GFP_ATOMIC);
        int ret;

        if (!copy)
                return -ENOMEM;

        ret = dev_map_generic_redirect(dst, copy, xdp_prog);
        if (unlikely(ret))
                consume_skb(copy);

        return ret;
}

/*
 * Broadcast @skb to every entry of @map on the generic XDP path.
 *
 * @dev:             ingress device
 * @skb:             packet to send; consumed on the success paths
 * @xdp_prog:        program passed through to generic_xdp_tx()
 * @map:             devmap, either array or hash flavour
 * @exclude_ingress: when true, @dev and the ifindexes collected by
 *                   get_upper_ifindexes() are skipped
 *
 * For n eligible destinations n-1 clones are sent and the original skb
 * goes to the last one. With no eligible destination the skb is simply
 * consumed. Returns 0, or the first error hit while cloning/redirecting,
 * in which case the walk stops immediately.
 */
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
                           const struct bpf_prog *xdp_prog,
                           struct bpf_map *map, bool exclude_ingress)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *cur, *pending = NULL;
        int excluded_devices[1+MAX_NEST_DEV];
        struct hlist_head *bucket;
        struct hlist_node *tmp;
        int num_excluded = 0;
        unsigned int idx;
        int ret;

        if (exclude_ingress) {
                num_excluded = get_upper_ifindexes(dev, excluded_devices);
                excluded_devices[num_excluded++] = dev->ifindex;
        }

        if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
                for (idx = 0; idx < map->max_entries; idx++) {
                        cur = rcu_dereference_check(dtab->netdev_map[idx],
                                                    rcu_read_lock_bh_held());
                        if (!cur)
                                continue;

                        if (is_ifindex_excluded(excluded_devices, num_excluded,
                                                cur->dev->ifindex))
                                continue;

                        /* The previously found destination gets a clone;
                         * the newest one is held back so that the final
                         * destination can receive the original skb.
                         */
                        if (pending) {
                                ret = dev_map_redirect_clone(pending, skb, xdp_prog);
                                if (ret)
                                        return ret;
                        }

                        pending = cur;
                }
        } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
                for (idx = 0; idx < dtab->n_buckets; idx++) {
                        bucket = dev_map_index_hash(dtab, idx);
                        hlist_for_each_entry_safe(cur, tmp, bucket, index_hlist) {
                                if (is_ifindex_excluded(excluded_devices, num_excluded,
                                                        cur->dev->ifindex))
                                        continue;

                                /* same n-1 clones scheme as above */
                                if (pending) {
                                        ret = dev_map_redirect_clone(pending, skb, xdp_prog);
                                        if (ret)
                                                return ret;
                                }

                                pending = cur;
                        }
                }
        }

        /* nothing eligible (e.g. dtab is empty): just drop our reference */
        if (!pending) {
                consume_skb(skb);
                return 0;
        }

        /* the last destination consumes the original skb */
        return dev_map_generic_redirect(pending, skb, xdp_prog);
}

static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);

        return obj ? &obj->val : NULL;
}

static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
                                                                *(u32 *)key);
        return obj ? &obj->val : NULL;
}

/*
 * RCU callback releasing one devmap entry: drops the reference on the
 * attached XDP program (if any) and on the net_device, then frees the
 * entry itself.
 */
static void __dev_map_entry_free(struct rcu_head *rcu)
{
        struct bpf_dtab_netdev *entry =
                container_of(rcu, struct bpf_dtab_netdev, rcu);
        struct bpf_prog *prog = entry->xdp_prog;

        if (prog)
                bpf_prog_put(prog);
        dev_put(entry->dev);
        kfree(entry);
}

/*
 * Clear slot *@key of an array-type devmap.
 *
 * Returns -EINVAL when the index is outside the map, 0 otherwise (also
 * when the slot was already empty). A removed entry is freed through
 * call_rcu() and the item counter is decremented.
 */
static long dev_map_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *victim;
        u32 slot = *(u32 *)key;

        if (slot >= map->max_entries)
                return -EINVAL;

        /* atomically take whatever is in the slot and leave NULL behind */
        victim = unrcu_pointer(xchg(&dtab->netdev_map[slot], NULL));
        if (!victim)
                return 0;

        call_rcu(&victim->rcu, __dev_map_entry_free);
        atomic_dec((atomic_t *)&dtab->items);

        return 0;
}

/*
 * Remove the entry keyed by *@key from a hash-type devmap.
 *
 * Lookup and unlink happen under dtab->index_lock; the entry itself is
 * freed through call_rcu(). Returns 0 on success and -ENOENT when no
 * entry with that key exists.
 */
static long dev_map_hash_delete_elem(struct bpf_map *map, void *key)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *victim;
        unsigned long irq_flags;
        u32 idx = *(u32 *)key;
        int ret;

        spin_lock_irqsave(&dtab->index_lock, irq_flags);

        victim = __dev_map_hash_lookup_elem(map, idx);
        if (!victim) {
                ret = -ENOENT;
        } else {
                dtab->items--;
                hlist_del_init_rcu(&victim->index_hlist);
                call_rcu(&victim->rcu, __dev_map_entry_free);
                ret = 0;
        }

        spin_unlock_irqrestore(&dtab->index_lock, irq_flags);

        return ret;
}

static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
                                                    struct bpf_dtab *dtab,
                                                    struct bpf_devmap_val *val,
                                                    unsigned int idx)
{
        struct bpf_prog *prog = NULL;
        struct bpf_dtab_netdev *dev;

        dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
                                   GFP_NOWAIT | __GFP_NOWARN,
                                   dtab->map.numa_node);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        dev->dev = dev_get_by_index(net, val->ifindex);
        if (!dev->dev)
                goto err_out;

        if (val->bpf_prog.fd > 0) {
                prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
                                             BPF_PROG_TYPE_XDP, false);
                if (IS_ERR(prog))
                        goto err_put_dev;
                if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
                    !bpf_prog_map_compatible(&dtab->map, prog))
                        goto err_put_prog;
        }

        dev->idx = idx;
        if (prog) {
                dev->xdp_prog = prog;
                dev->val.bpf_prog.id = prog->aux->id;
        } else {
                dev->xdp_prog = NULL;
                dev->val.bpf_prog.id = 0;
        }
        dev->val.ifindex = val->ifindex;

        return dev;
err_put_prog:
        bpf_prog_put(prog);
err_put_dev:
        dev_put(dev->dev);
err_out:
        kfree(dev);
        return ERR_PTR(-EINVAL);
}

/*
 * Install, replace or clear slot *@key of an array-type devmap.
 *
 * @net:       namespace used to resolve the ifindex found in @value
 * @map:       the devmap (embedded in a struct bpf_dtab)
 * @key:       points to the u32 slot index
 * @value:     points to map->value_size bytes of struct bpf_devmap_val
 * @map_flags: update flags; anything above BPF_EXIST is invalid and
 *             BPF_NOEXIST is always refused
 *
 * A value whose ifindex is 0 clears the slot (a program fd must not be
 * given in that case); any other ifindex installs a freshly allocated
 * entry. A previous occupant of the slot is freed via call_rcu().
 *
 * dtab->items is kept equal to the number of populated slots: it only
 * changes when the slot goes from empty to populated or the other way
 * round, so clearing a slot can no longer inflate the counter.
 *
 * Returns 0 on success, -EINVAL for bad flags or for a program fd combined
 * with ifindex 0, -E2BIG for an out-of-range index, -EEXIST for
 * BPF_NOEXIST, or the error from __dev_map_alloc_node().
 */
static long __dev_map_update_elem(struct net *net, struct bpf_map *map,
                                  void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *dev, *old_dev;
        struct bpf_devmap_val val = {};
        u32 i = *(u32 *)key;

        if (unlikely(map_flags > BPF_EXIST))
                return -EINVAL;
        if (unlikely(i >= dtab->map.max_entries))
                return -E2BIG;
        if (unlikely(map_flags == BPF_NOEXIST))
                return -EEXIST;

        /* already verified value_size <= sizeof val */
        memcpy(&val, value, map->value_size);

        if (!val.ifindex) {
                dev = NULL;
                /* can not specify fd if ifindex is 0 */
                if (val.bpf_prog.fd > 0)
                        return -EINVAL;
        } else {
                dev = __dev_map_alloc_node(net, dtab, &val, i);
                if (IS_ERR(dev))
                        return PTR_ERR(dev);
        }

        /* Use call_rcu() here to ensure rcu critical sections have completed
         * Remembering the driver side flush operation will happen before the
         * net device is removed.
         */
        old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
        if (old_dev)
                call_rcu(&old_dev->rcu, __dev_map_entry_free);

        /* Only an empty<->populated transition changes the item count;
         * replacing an entry or clearing an already empty slot does not.
         */
        if (dev && !old_dev)
                atomic_inc((atomic_t *)&dtab->items);
        else if (!dev && old_dev)
                atomic_dec((atomic_t *)&dtab->items);

        return 0;
}

/*
 * map_update_elem callback for the array-type devmap: resolves ifindexes
 * in the calling task's network namespace.
 */
static long dev_map_update_elem(struct bpf_map *map, void *key, void *value,
                                u64 map_flags)
{
        struct net *net = current->nsproxy->net_ns;

        return __dev_map_update_elem(net, map, key, value, map_flags);
}

/*
 * Insert or replace the entry keyed by *@key in a hash-type devmap.
 *
 * @net:       namespace used to resolve the ifindex found in @value
 * @map:       the devmap (embedded in a struct bpf_dtab)
 * @key:       points to the u32 key
 * @value:     points to map->value_size bytes of struct bpf_devmap_val
 * @map_flags: update flags; values above BPF_EXIST are invalid
 *
 * Lookup, allocation and list manipulation all happen under
 * dtab->index_lock. A replaced entry is freed via call_rcu() after the
 * lock has been dropped.
 *
 * Returns 0 on success, -EINVAL for bad flags or ifindex 0, -EEXIST when
 * BPF_NOEXIST is set and the key is present, -E2BIG when a new key would
 * exceed max_entries, or the error from __dev_map_alloc_node().
 */
static long __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
                                       void *key, void *value, u64 map_flags)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        struct bpf_dtab_netdev *fresh, *stale;
        struct bpf_devmap_val val = {};
        unsigned long irq_flags;
        u32 idx = *(u32 *)key;
        int err = -EEXIST;

        /* already verified value_size <= sizeof val */
        memcpy(&val, value, map->value_size);

        if (unlikely(map_flags > BPF_EXIST))
                return -EINVAL;
        if (unlikely(!val.ifindex))
                return -EINVAL;

        spin_lock_irqsave(&dtab->index_lock, irq_flags);

        stale = __dev_map_hash_lookup_elem(map, idx);
        if (stale && (map_flags & BPF_NOEXIST))
                goto out_unlock;        /* err is still -EEXIST */

        fresh = __dev_map_alloc_node(net, dtab, &val, idx);
        if (IS_ERR(fresh)) {
                err = PTR_ERR(fresh);
                goto out_unlock;
        }

        /* a brand new key must still fit into the map */
        if (!stale && dtab->items >= dtab->map.max_entries) {
                spin_unlock_irqrestore(&dtab->index_lock, irq_flags);
                call_rcu(&fresh->rcu, __dev_map_entry_free);
                return -E2BIG;
        }

        if (stale)
                hlist_del_rcu(&stale->index_hlist);
        else
                dtab->items++;

        hlist_add_head_rcu(&fresh->index_hlist,
                           dev_map_index_hash(dtab, idx));
        spin_unlock_irqrestore(&dtab->index_lock, irq_flags);

        if (stale)
                call_rcu(&stale->rcu, __dev_map_entry_free);

        return 0;

out_unlock:
        spin_unlock_irqrestore(&dtab->index_lock, irq_flags);
        return err;
}

/*
 * map_update_elem callback for the hash-type devmap: resolves ifindexes
 * in the calling task's network namespace.
 */
static long dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
                                     u64 map_flags)
{
        struct net *net = current->nsproxy->net_ns;

        return __dev_map_hash_update_elem(net, map, key, value, map_flags);
}

/*
 * map_redirect callback for the array-type devmap. Passes
 * BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS as the flag mask and
 * __dev_map_lookup_elem() as the lookup function to
 * __bpf_xdp_redirect_map().
 */
static long dev_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
        const u64 allowed = BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS;

        return __bpf_xdp_redirect_map(map, ifindex, flags, allowed,
                                      __dev_map_lookup_elem);
}

/*
 * map_redirect callback for the hash-type devmap. Passes
 * BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS as the flag mask and
 * __dev_map_hash_lookup_elem() as the lookup function to
 * __bpf_xdp_redirect_map().
 */
static long dev_hash_map_redirect(struct bpf_map *map, u64 ifindex, u64 flags)
{
        const u64 allowed = BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS;

        return __bpf_xdp_redirect_map(map, ifindex, flags, allowed,
                                      __dev_map_hash_lookup_elem);
}

/*
 * Report the memory footprint of a devmap: the bpf_dtab itself, the
 * lookup table (hash buckets or the netdev_map pointer array, depending
 * on the map type) and one bpf_dtab_netdev per counted item.
 */
static u64 dev_map_mem_usage(const struct bpf_map *map)
{
        struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
        u64 table_bytes, entry_bytes;

        if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
                table_bytes = (u64)dtab->n_buckets * sizeof(struct hlist_head);
        else
                table_bytes = (u64)map->max_entries * sizeof(struct bpf_dtab_netdev *);

        entry_bytes = atomic_read((atomic_t *)&dtab->items) *
                         (u64)sizeof(struct bpf_dtab_netdev);

        return sizeof(struct bpf_dtab) + table_bytes + entry_bytes;
}

/* BTF id of struct bpf_dtab; referenced by both ops tables via .map_btf_id */
BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)

/*
 * Map operations for the devmap flavour whose callbacks index
 * dtab->netdev_map[] directly. Allocation, freeing and memory accounting
 * callbacks are the same ones used by dev_map_hash_ops; lookup, update,
 * delete, next-key and redirect are the array specific variants.
 */
const struct bpf_map_ops dev_map_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = dev_map_alloc_check,
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_get_next_key,
        .map_lookup_elem = dev_map_lookup_elem,
        .map_update_elem = dev_map_update_elem,
        .map_delete_elem = dev_map_delete_elem,
        .map_check_btf = map_check_no_btf,
        .map_mem_usage = dev_map_mem_usage,
        .map_btf_id = &dev_map_btf_ids[0],
        .map_redirect = dev_map_redirect,
};

/*
 * Map operations for the hash flavour of the devmap. Allocation, freeing
 * and memory accounting callbacks are the same ones used by dev_map_ops;
 * lookup, update, delete, next-key and redirect are the hash specific
 * variants.
 */
const struct bpf_map_ops dev_map_hash_ops = {
        .map_meta_equal = bpf_map_meta_equal,
        .map_alloc_check = dev_map_alloc_check,
        .map_alloc = dev_map_alloc,
        .map_free = dev_map_free,
        .map_get_next_key = dev_map_hash_get_next_key,
        .map_lookup_elem = dev_map_hash_lookup_elem,
        .map_update_elem = dev_map_hash_update_elem,
        .map_delete_elem = dev_map_hash_delete_elem,
        .map_check_btf = map_check_no_btf,
        .map_mem_usage = dev_map_mem_usage,
        .map_btf_id = &dev_map_btf_ids[0],
        .map_redirect = dev_hash_map_redirect,
};

/*
 * Drop every entry of the hash-type devmap @dtab that refers to @netdev.
 *
 * All buckets are walked under dtab->index_lock; matching entries are
 * unlinked, uncounted and queued for freeing via call_rcu().
 */
static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
                                       struct net_device *netdev)
{
        struct bpf_dtab_netdev *entry;
        struct hlist_head *bucket;
        struct hlist_node *tmp;
        unsigned long irq_flags;
        u32 b;

        spin_lock_irqsave(&dtab->index_lock, irq_flags);

        for (b = 0; b < dtab->n_buckets; b++) {
                bucket = dev_map_index_hash(dtab, b);

                /* _safe variant: entries are unlinked while iterating */
                hlist_for_each_entry_safe(entry, tmp, bucket, index_hlist) {
                        if (entry->dev == netdev) {
                                dtab->items--;
                                hlist_del_rcu(&entry->index_hlist);
                                call_rcu(&entry->rcu, __dev_map_entry_free);
                        }
                }
        }

        spin_unlock_irqrestore(&dtab->index_lock, irq_flags);
}

/*
 * Netdevice notifier callback for devmaps.
 *
 * NETDEV_REGISTER:   allocate the per-CPU xdp_bulkq for devices that
 *                    implement ndo_xdp_xmit and do not have one yet;
 *                    returns NOTIFY_BAD if that allocation fails.
 * NETDEV_UNREGISTER: remove every devmap entry that points at the device.
 * Any other event is ignored. Returns NOTIFY_OK unless stated otherwise.
 */
static int dev_map_notification(struct notifier_block *notifier,
                                ulong event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
        struct bpf_dtab *dtab;
        int i, cpu;

        if (event == NETDEV_REGISTER) {
                if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
                        return NOTIFY_OK;

                /* will be freed in free_netdev() */
                netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
                if (!netdev->xdp_bulkq)
                        return NOTIFY_BAD;

                for_each_possible_cpu(cpu)
                        per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;

                return NOTIFY_OK;
        }

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_OK;

        /* This rcu_read_lock/unlock pair is needed because
         * dev_map_list is an RCU list AND to ensure a delete
         * operation does not free a netdev_map entry while we
         * are comparing it against the netdev being unregistered.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(dtab, &dev_map_list, list) {
                if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
                        dev_map_hash_remove_netdev(dtab, netdev);
                        continue;
                }

                for (i = 0; i < dtab->map.max_entries; i++) {
                        struct bpf_dtab_netdev *cur, *prev;

                        cur = rcu_dereference(dtab->netdev_map[i]);
                        if (!cur || cur->dev != netdev)
                                continue;

                        /* only free the entry if we are the ones who
                         * actually cleared the slot
                         */
                        prev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(cur), NULL));
                        if (prev != cur)
                                continue;

                        call_rcu(&cur->rcu,
                                 __dev_map_entry_free);
                        atomic_dec((atomic_t *)&dtab->items);
                }
        }
        rcu_read_unlock();

        return NOTIFY_OK;
}

/* Notifier block routing netdevice events to dev_map_notification() */
static struct notifier_block dev_map_notifier = {
        .notifier_call = dev_map_notification,
};

/*
 * Subsystem init: check at build time that the tracepoint shadow struct
 * matches struct bpf_dtab_netdev, then hook devmaps into netdevice
 * notifications.
 *
 * Returns the result of register_netdevice_notifier(), so a failed
 * registration is reported to the initcall machinery instead of being
 * silently turned into success.
 */
static int __init dev_map_init(void)
{
        /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
        BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
                     offsetof(struct _bpf_dtab_netdev, dev));

        return register_netdevice_notifier(&dev_map_notifier);
}

subsys_initcall(dev_map_init);


































































































































































































































































































































































































































































































   79 
















  828 




























































































































































































































 1272 















   78 
























 1263 




  221 




   79 









   79 









  854 










  320 










  209 




  191 




  614 




  410 









  166 
















































   87 






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
 */

#ifndef __ASM_CPUFEATURE_H
#define __ASM_CPUFEATURE_H

#include <asm/alternative-macros.h>
#include <asm/cpucaps.h>
#include <asm/cputype.h>
#include <asm/hwcap.h>
#include <asm/sysreg.h>

#define MAX_CPU_FEATURES        192
/* Map a short feature name to its KERNEL_HWCAP_<name> identifier */
#define cpu_feature(x)                KERNEL_HWCAP_ ## x

/*
 * NOTE(review): the values step by 4, which looks like bit offsets of
 * 4-bit fields in a software-defined override register - confirm against
 * the users of these constants.
 */
#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR        0
#define ARM64_SW_FEATURE_OVERRIDE_HVHE                4
#define ARM64_SW_FEATURE_OVERRIDE_RODATA_OFF        8

#ifndef __ASSEMBLY__

#include <linux/bug.h>
#include <linux/jump_label.h>
#include <linux/kernel.h>
#include <linux/cpumask.h>

/*
 * CPU feature register tracking
 *
 * The safe value of a CPUID feature field is dependent on the implications
 * of the values assigned to it by the architecture. Based on the relationship
 * between the values, the features are classified into 3 basic types -
 * LOWER_SAFE, HIGHER_SAFE and EXACT - plus HIGHER_OR_ZERO_SAFE, a variant of
 * HIGHER_SAFE in which 0 is treated as the biggest value.
 *
 * The lowest value of all the CPUs is chosen for LOWER_SAFE and highest
 * for HIGHER_SAFE. It is expected that all CPUs have the same value for
 * a field when EXACT is specified, failing which, the safe value specified
 * in the table is chosen.
 */

/* How the system-wide safe value of a feature field is chosen */
enum ftr_type {
        FTR_EXACT,                        /* Use a predefined safe value */
        FTR_LOWER_SAFE,                        /* Smaller value is safe */
        FTR_HIGHER_SAFE,                /* Bigger value is safe */
        FTR_HIGHER_OR_ZERO_SAFE,        /* Bigger value is safe, but 0 is biggest */
};

/* Values for the 'strict' member of struct arm64_ftr_bits */
#define FTR_STRICT        true        /* SANITY check strict matching required */
#define FTR_NONSTRICT        false        /* SANITY check ignored */

/* Values for the 'sign' member of struct arm64_ftr_bits */
#define FTR_SIGNED        true        /* Value should be treated as signed */
#define FTR_UNSIGNED        false        /* Value should be treated as unsigned */

/* Values for the 'visible' member of struct arm64_ftr_bits */
#define FTR_VISIBLE        true        /* Feature visible to the user space */
#define FTR_HIDDEN        false        /* Feature is hidden from the user */

/* FTR_VISIBLE when the given Kconfig option is enabled, FTR_HIDDEN otherwise */
#define FTR_VISIBLE_IF_IS_ENABLED(config)                \
        (IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN)

/* Description of a single feature field of a CPU ID register */
struct arm64_ftr_bits {
        bool                sign;        /* Value is signed ? */
        bool                visible;        /* FTR_VISIBLE or FTR_HIDDEN */
        bool                strict;        /* CPU Sanity check: strict matching required ? */
        enum ftr_type        type;        /* how the safe value is picked */
        u8                shift;        /* presumably bit offset of the field - TODO confirm */
        u8                width;        /* presumably field width in bits - TODO confirm */
        s64                safe_val; /* safe value for FTR_EXACT features */
};

/*
 * Describe the early feature override to the core override code:
 *
 * @val                        Values that are to be merged into the final
 *                        sanitised value of the register. Only the bitfields
 *                        set to 1 in @mask are valid
 * @mask                Mask of the features that are overridden by @val
 *
 * A @mask field set to full-1 indicates that the corresponding field
 * in @val is a valid override.
 *
 * A @mask field set to full-0 with the corresponding @val field set
 * to full-0 denotes that this field has no override
 *
 * A @mask field set to full-0 with the corresponding @val field set
 * to full-1 denotes that this field has an invalid override.
 */
struct arm64_ftr_override {
        u64                val;        /* override values; valid only where @mask is set */
        u64                mask;        /* which fields of @val are overridden */
};

/*
 * struct arm64_ftr_reg - Feature register
 * @name                Name of the register
 * @strict_mask                Bits which should match across all CPUs for sanity.
 * @user_mask                NOTE(review): presumably the bits exposed to user
 *                        space - confirm against the code that fills it in
 * @sys_val                Safe value across the CPUs (system view)
 * @user_val                NOTE(review): presumably the value reported for the
 *                        bits not covered by @user_mask - confirm
 * @override                Early override (see struct arm64_ftr_override)
 * @ftr_bits                Descriptions of the register's feature fields
 */
struct arm64_ftr_reg {
        const char                        *name;
        u64                                strict_mask;
        u64                                user_mask;
        u64                                sys_val;
        u64                                user_val;
        struct arm64_ftr_override        *override;
        const struct arm64_ftr_bits        *ftr_bits;
};

/* Defined elsewhere; going by the name, the feature register entry for CTR_EL0 */
extern struct arm64_ftr_reg arm64_ftr_reg_ctrel0;

/*
 * CPU capabilities:
 *
 * We use arm64_cpu_capabilities to represent system features, errata work
 * arounds (both used internally by kernel and tracked in system_cpucaps) and
 * ELF HWCAPs (which are exposed to user).
 *
 * To support systems with heterogeneous CPUs, we need to make sure that we
 * detect the capabilities correctly on the system and take appropriate
 * measures to ensure there are no incompatibilities.
 *
 * This comment tries to explain how we treat the capabilities.
 * Each capability has the following list of attributes :
 *
 * 1) Scope of Detection : The system detects a given capability by
 *    performing some checks at runtime. This could be, e.g, checking the
 *    value of a field in CPU ID feature register or checking the cpu
 *    model. The capability provides a call back ( @matches() ) to
 *    perform the check. Scope defines how the checks should be performed.
 *    There are three cases:
 *
 *     a) SCOPE_LOCAL_CPU: check all the CPUs and "detect" if at least one
 *        matches. This implies, we have to run the check on all the
 *        booting CPUs, until the system decides that state of the
 *        capability is finalised. (See section 2 below)
 *                Or
 *     b) SCOPE_SYSTEM: check all the CPUs and "detect" if all the CPUs
 *        match. This implies, we run the check only once, when the
 *        system decides to finalise the state of the capability. If the
 *        capability relies on a field in one of the CPU ID feature
 *        registers, we use the sanitised value of the register from the
 *        CPU feature infrastructure to make the decision.
 *                Or
 *     c) SCOPE_BOOT_CPU: Check only on the primary boot CPU to detect the
 *        feature. This category is for features that are "finalised"
 *        (or used) by the kernel very early even before the SMP cpus
 *        are brought up.
 *
 *    The process of detection is usually denoted by "update" capability
 *    state in the code.
 *
 * 2) Finalise the state : The kernel should finalise the state of a
 *    capability at some point during its execution and take necessary
 *    actions if any. Usually, this is done, after all the boot-time
 *    enabled CPUs are brought up by the kernel, so that it can make
 *    better decision based on the available set of CPUs. However, there
 *    are some special cases, where the action is taken during the early
 *    boot by the primary boot CPU. (e.g, running the kernel at EL2 with
 *    Virtualisation Host Extensions). The kernel usually disallows any
 *    changes to the state of a capability once it finalises the capability
 *    and takes any action, as it may be impossible to execute the actions
 *    safely. A CPU brought up after a capability is "finalised" is
 *    referred to as "Late CPU" w.r.t the capability. e.g, all secondary
 *    CPUs are treated "late CPUs" for capabilities determined by the boot
 *    CPU.
 *
 *    At the moment there are two passes of finalising the capabilities.
 *      a) Boot CPU scope capabilities - Finalised by primary boot CPU via
 *         setup_boot_cpu_capabilities().
 *      b) Everything except (a) - Run via setup_system_capabilities().
 *
 * 3) Verification: When a CPU is brought online (e.g, by user or by the
 *    kernel), the kernel should make sure that it is safe to use the CPU,
 *    by verifying that the CPU is compliant with the state of the
 *    capabilities finalised already. This happens via :
 *
 *        secondary_start_kernel()-> check_local_cpu_capabilities()
 *
 *    As explained in (2) above, capabilities could be finalised at
 *    different points in the execution. Each newly booted CPU is verified
 *    against the capabilities that have been finalised by the time it
 *    boots.
 *
 *        a) SCOPE_BOOT_CPU : All CPUs are verified against the capability
 *        except for the primary boot CPU.
 *
 *        b) SCOPE_LOCAL_CPU, SCOPE_SYSTEM: All CPUs hotplugged on by the
 *        user after the kernel boot are verified against the capability.
 *
 *    If there is a conflict, the kernel takes an action, based on the
 *    severity (e.g, a CPU could be prevented from booting or cause a
 *    kernel panic). The CPU is allowed to "affect" the state of the
 *    capability, if it has not been finalised already. See section 5
 *    for more details on conflicts.
 *
 * 4) Action: As mentioned in (2), the kernel can take an action for each
 *    detected capability, on all CPUs on the system. Appropriate actions
 *    include, turning on an architectural feature, modifying the control
 *    registers (e.g, SCTLR, TCR etc.) or patching the kernel via
 *    alternatives. The kernel patching is batched and performed at later
 *    point. The actions are always initiated only after the capability
 *    is finalised. This is usually denoted by "enabling" the capability.
 *    The actions are initiated as follows :
 *        a) Action is triggered on all online CPUs, after the capability is
 *        finalised, invoked within the stop_machine() context from
 *        enable_cpu_capabilities().
 *
 *        b) Any late CPU, brought up after (1), the action is triggered via:
 *
 *          check_local_cpu_capabilities() -> verify_local_cpu_capabilities()
 *
 * 5) Conflicts: Based on the state of the capability on a late CPU vs.
 *    the system state, we could have the following combinations :
 *
 *                x-----------------------------x
 *                | Type  | System   | Late CPU |
 *                |-----------------------------|
 *                |  a    |   y      |    n     |
 *                |-----------------------------|
 *                |  b    |   n      |    y     |
 *                x-----------------------------x
 *
 *     Two separate flag bits are defined to indicate whether each kind of
 *     conflict can be allowed:
 *                ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU - Case(a) is allowed
 *                ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU - Case(b) is allowed
 *
 *     Case (a) is not permitted for a capability that the system requires
 *     all CPUs to have in order for the capability to be enabled. This is
 *     typical for capabilities that represent enhanced functionality.
 *
 *     Case (b) is not permitted for a capability that must be enabled
 *     during boot if any CPU in the system requires it in order to run
 *     safely. This is typical for erratum work arounds that cannot be
 *     enabled after the corresponding capability is finalised.
 *
 *     In some non-typical cases either both (a) and (b), or neither,
 *     should be permitted. This can be described by including neither
 *     or both flags in the capability's type field.
 *
 *     In case of a conflict, the CPU is prevented from booting. If the
 *     ARM64_CPUCAP_PANIC_ON_CONFLICT flag is specified for the capability,
 *     then a kernel panic is triggered.
 */


/*
 * Decide how the capability is detected.
 * On any local CPU vs System wide vs the primary boot CPU
 */
#define ARM64_CPUCAP_SCOPE_LOCAL_CPU                ((u16)BIT(0))
#define ARM64_CPUCAP_SCOPE_SYSTEM                ((u16)BIT(1))
/*
 * The capability is detected on the Boot CPU and is used by the kernel
 * during early boot, i.e. the capability should be "detected" and
 * "enabled" as early as possible on all booting CPUs.
 */
#define ARM64_CPUCAP_SCOPE_BOOT_CPU                ((u16)BIT(2))
/* All three scope bits (bits 0-2) */
#define ARM64_CPUCAP_SCOPE_MASK                        \
        (ARM64_CPUCAP_SCOPE_SYSTEM        |        \
         ARM64_CPUCAP_SCOPE_LOCAL_CPU        |        \
         ARM64_CPUCAP_SCOPE_BOOT_CPU)

/* Short aliases for the scope bits */
#define SCOPE_SYSTEM                                ARM64_CPUCAP_SCOPE_SYSTEM
#define SCOPE_LOCAL_CPU                                ARM64_CPUCAP_SCOPE_LOCAL_CPU
#define SCOPE_BOOT_CPU                                ARM64_CPUCAP_SCOPE_BOOT_CPU
#define SCOPE_ALL                                ARM64_CPUCAP_SCOPE_MASK

/*
 * Late-CPU conflict flags. They are OR-ed with a scope bit to build the
 * composite capability type values.
 *
 * Is it permitted for a late CPU to have this capability when system
 * hasn't already enabled it ? (conflict case (b))
 */
#define ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU        ((u16)BIT(4))
/*
 * Is it safe for a late CPU to miss this capability when system has it ?
 * (conflict case (a))
 */
#define ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU        ((u16)BIT(5))
/* Panic when a conflict is detected */
#define ARM64_CPUCAP_PANIC_ON_CONFLICT                ((u16)BIT(6))

/*
 * CPU errata workarounds that need to be enabled at boot time if one or
 * more CPUs in the system requires it. When one of these capabilities
 * has been enabled, it is safe to allow any CPU to boot that doesn't
 * require the workaround. However, it is not safe if a "late" CPU
 * requires a workaround and the system hasn't enabled it already.
 *
 * Local-CPU scope; only a late CPU missing the capability is tolerated.
 */
#define ARM64_CPUCAP_LOCAL_CPU_ERRATUM                \
        (ARM64_CPUCAP_SCOPE_LOCAL_CPU | ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU)
/*
 * CPU feature detected at boot time based on system-wide value of a
 * feature. It is safe for a late CPU to have this feature even though
 * the system hasn't enabled it, although the feature will not be used
 * by Linux in this case. If the system has enabled this feature already,
 * then every late CPU must have it.
 *
 * System scope; only a late CPU having an extra capability is tolerated.
 */
#define ARM64_CPUCAP_SYSTEM_FEATURE        \
        (ARM64_CPUCAP_SCOPE_SYSTEM | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU)
/*
 * CPU feature detected at boot time based on feature of one or more CPUs.
 * All possible conflicts for a late CPU are ignored.
 * NOTE: this means that a late CPU with the feature will *not* cause the
 * capability to be advertised by cpus_have_*cap()!
 *
 * Local-CPU scope with both late-CPU flags set.
 */
#define ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE                \
        (ARM64_CPUCAP_SCOPE_LOCAL_CPU                |        \
         ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU        |        \
         ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU)

/*
 * CPU feature detected at boot time, on one or more CPUs. A late CPU
 * is not allowed to have the capability when the system doesn't have it.
 * It is Ok for a late CPU to miss the feature.
 *
 * Expands to the same bit combination as ARM64_CPUCAP_LOCAL_CPU_ERRATUM.
 */
#define ARM64_CPUCAP_BOOT_RESTRICTED_CPU_LOCAL_FEATURE        \
        (ARM64_CPUCAP_SCOPE_LOCAL_CPU                |        \
         ARM64_CPUCAP_OPTIONAL_FOR_LATE_CPU)

/*
 * CPU feature used early in the boot based on the boot CPU. All secondary
 * CPUs must match the state of the capability as detected by the boot CPU. In
 * case of a conflict, a kernel panic is triggered.
 *
 * Boot-CPU scope; neither late-CPU flag is set, and conflicts panic.
 */
#define ARM64_CPUCAP_STRICT_BOOT_CPU_FEATURE                \
        (ARM64_CPUCAP_SCOPE_BOOT_CPU | ARM64_CPUCAP_PANIC_ON_CONFLICT)

/*
 * CPU feature used early in the boot based on the boot CPU. It is safe for a
 * late CPU to have this feature even though the boot CPU hasn't enabled it,
 * although the feature will not be used by Linux in this case. If the boot CPU
 * has enabled this feature already, then every late CPU must have it.
 *
 * Boot-CPU scope; only a late CPU having an extra capability is tolerated.
 */
#define ARM64_CPUCAP_BOOT_CPU_FEATURE                  \
        (ARM64_CPUCAP_SCOPE_BOOT_CPU | ARM64_CPUCAP_PERMITTED_FOR_LATE_CPU)

/*
 * Descriptor for one CPU capability (feature or erratum workaround): how it
 * is identified, how it is detected and what to do once it is detected.
 */
struct arm64_cpu_capabilities {
        /* NOTE(review): presumably a human-readable description - confirm */
        const char *desc;
        /* Capability number; NOTE(review): presumably an ARM64_* cpucap index */
        u16 capability;
        /* Scope bit(s) plus late-CPU/conflict flags (ARM64_CPUCAP_*) */
        u16 type;
        /*
         * Detection callback: returns true when the capability described by
         * @caps is present for the given @scope.
         */
        bool (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
        /*
         * Take the appropriate actions to configure this capability
         * for this CPU. If the capability is detected by the kernel
         * this will be called on all the CPUs in the system,
         * including the hotplugged CPUs, regardless of whether the
         * capability is available on that specific CPU. This is
         * useful for some capabilities (e.g, working around CPU
         * errata), where all the CPUs must take some action (e.g,
         * changing system control/configuration). Thus, if an action
         * is required only if the CPU has the capability, then the
         * routine must check it before taking any action.
         */
        void (*cpu_enable)(const struct arm64_cpu_capabilities *cap);
        /* Match data; which member is meaningful depends on the matcher used. */
        union {
                struct {        /* To be used for erratum handling only */
                        struct midr_range midr_range;
                        const struct arm64_midr_revidr {
                                u32 midr_rv;                /* revision/variant */
                                u32 revidr_mask;
                        } * const fixed_revs;
                };

                /* NOTE(review): presumably a terminated array of MIDR ranges - confirm */
                const struct midr_range *midr_range_list;
                struct {        /* Feature register checking */
                        u32 sys_reg;
                        u8 field_pos;
                        u8 field_width;
                        u8 min_field_value;
                        u8 max_field_value;
                        u8 hwcap_type;
                        bool sign;
                        unsigned long hwcap;
                };
        };

        /*
         * An optional list of "matches/cpu_enable" pair for the same
         * "capability" of the same "type" as described by the parent.
         * Only matches(), cpu_enable() and fields relevant to these
         * methods are significant in the list. The cpu_enable is
         * invoked only if the corresponding entry "matches()".
         * However, if a cpu_enable() method is associated
         * with multiple matches(), care should be taken that either
         * the match criteria are mutually exclusive, or that the
         * method is robust against being called multiple times.
         *
         * The list is terminated by an entry whose matches() is NULL
         * (see cpucap_multi_entry_cap_matches()).
         */
        const struct arm64_cpu_capabilities *match_list;
        /* NOTE(review): presumably the CPUs on which this capability applies - confirm */
        const struct cpumask *cpus;
};

static inline int cpucap_default_scope(const struct arm64_cpu_capabilities *cap)
{
        return cap->type & ARM64_CPUCAP_SCOPE_MASK;
}

/*
 * Generic matcher for a capability that is described by several
 * (matches, cpu_enable) pairs sharing one capability bit.
 *
 * Walks @entry->match_list, which must be terminated by an element whose
 * matches() callback is NULL, and returns true as soon as one element
 * reports a match for @scope. Returns false when none does.
 */
static inline bool
cpucap_multi_entry_cap_matches(const struct arm64_cpu_capabilities *entry,
                               int scope)
{
        const struct arm64_cpu_capabilities *cur = entry->match_list;

        while (cur->matches) {
                if (cur->matches(cur, scope))
                        return true;
                cur++;
        }

        return false;
}

/*
 * Compile-time test: true only in translation units built with
 * __KVM_VHE_HYPERVISOR__ defined.
 */
static __always_inline bool is_vhe_hyp_code(void)
{
        /* Only defined for code run in VHE hyp context */
        return __is_defined(__KVM_VHE_HYPERVISOR__);
}

/*
 * Compile-time test: true only in translation units built with
 * __KVM_NVHE_HYPERVISOR__ defined.
 */
static __always_inline bool is_nvhe_hyp_code(void)
{
        /* Only defined for code run in NVHE hyp context */
        return __is_defined(__KVM_NVHE_HYPERVISOR__);
}

/* True when built as either VHE or NVHE hypervisor code. */
static __always_inline bool is_hyp_code(void)
{
        return is_vhe_hyp_code() || is_nvhe_hyp_code();
}

/*
 * Bitmap with one bit per capability (ARM64_NCAPS bits). It is the bitmap
 * tested by cpus_have_cap() and walked by for_each_available_cap().
 */
extern DECLARE_BITMAP(system_cpucaps, ARM64_NCAPS);

/*
 * Second capability bitmap of the same size.
 * NOTE(review): presumably the capabilities detected on the boot CPU - confirm.
 */
extern DECLARE_BITMAP(boot_cpucaps, ARM64_NCAPS);

/* Iterate @cap over every bit that is set in system_cpucaps. */
#define for_each_available_cap(cap)                \
        for_each_set_bit(cap, system_cpucaps, ARM64_NCAPS)

/*
 * Capability / feature / ELF hwcap accessors. Only declared here; the
 * definitions live outside this header.
 */
bool this_cpu_has_cap(unsigned int cap);
void cpu_set_feature(unsigned int num);
bool cpu_have_feature(unsigned int num);
unsigned long cpu_get_elf_hwcap(void);
unsigned long cpu_get_elf_hwcap2(void);
unsigned long cpu_get_elf_hwcap3(void);

/* Convenience wrappers that translate @name with cpu_feature() first. */
#define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name))
#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name))

/*
 * True once the ARM64_ALWAYS_BOOT capability tests as set, which this header
 * uses as the marker that boot capabilities are final.
 */
static __always_inline bool boot_capabilities_finalized(void)
{
        return alternative_has_cap_likely(ARM64_ALWAYS_BOOT);
}

/*
 * True once the ARM64_ALWAYS_SYSTEM capability tests as set, which this
 * header uses as the marker that system capabilities are final.
 */
static __always_inline bool system_capabilities_finalized(void)
{
        return alternative_has_cap_likely(ARM64_ALWAYS_SYSTEM);
}

/*
 * Test for a capability with a runtime check.
 *
 * Before the capability is detected, this returns false.
 *
 * @num: capability number to look up in system_cpucaps.
 *
 * Returns true when the bit for @num is set in system_cpucaps, false
 * otherwise (including for out-of-range or impossible capabilities).
 */
static __always_inline bool cpus_have_cap(unsigned int num)
{
        /*
         * For a compile-time constant @num that cpucap_is_possible() rejects,
         * answer without looking at the bitmap.
         */
        if (__builtin_constant_p(num) && !cpucap_is_possible(num))
                return false;
        /* Never index past the end of the bitmap. */
        if (num >= ARM64_NCAPS)
                return false;
        return arch_test_bit(num, system_cpucaps);
}

/*
 * Test a capability without a runtime check, for use once the boot
 * capabilities are final.
 *
 * Calling this before boot capabilities are finalized is a bug and ends in
 * BUG(). After they are finalized, the test is patched so that no runtime
 * check is needed.
 *
 * @num must be a compile-time constant.
 */
static __always_inline bool cpus_have_final_boot_cap(int num)
{
        if (!boot_capabilities_finalized())
                BUG();

        return alternative_has_cap_unlikely(num);
}

/*
 * Test a capability without a runtime check, for use once the system
 * capabilities are final.
 *
 * Calling this before system capabilities are finalized is a bug and ends in
 * BUG(). After they are finalized, the test is patched so that no runtime
 * check is needed.
 *
 * @num must be a compile-time constant.
 */
static __always_inline bool cpus_have_final_cap(int num)
{
        if (!system_capabilities_finalized())
                BUG();

        return alternative_has_cap_unlikely(num);
}

/*
 * Extract the @width-bit field starting at bit @field of @features and
 * return it as a signed value.
 */
static inline int __attribute_const__
cpuid_feature_extract_signed_field_width(u64 features, int field, int width)
{
        /* Shift left so the field's most significant bit lands in bit 63. */
        const s64 top_aligned = (s64)(features << (64 - width - field));

        /* Shifting the signed value back down extends the field's top bit. */
        return top_aligned >> (64 - width);
}

/* Signed extraction of the 4-bit field that starts at bit @field of @features. */
static inline int __attribute_const__
cpuid_feature_extract_signed_field(u64 features, int field)
{
        return cpuid_feature_extract_signed_field_width(features, field, 4);
}

/*
 * Extract the @width-bit field starting at bit @field of @features and
 * return it as an unsigned value.
 */
static __always_inline unsigned int __attribute_const__
cpuid_feature_extract_unsigned_field_width(u64 features, int field, int width)
{
        /* Discard everything above the field by pushing it to the top... */
        const u64 top_aligned = features << (64 - width - field);

        /* ...then bring it down to bit 0, filling with zeroes. */
        return top_aligned >> (64 - width);
}

/* Unsigned extraction of the 4-bit field that starts at bit @field of @features. */
static __always_inline unsigned int __attribute_const__
cpuid_feature_extract_unsigned_field(u64 features, int field)
{
        return cpuid_feature_extract_unsigned_field_width(features, field, 4);
}

/*
 * Build the bit mask covering the field described by @ftrp: bits
 * @ftrp->shift up to @ftrp->shift + @ftrp->width - 1, inclusive.
 */
static inline u64 arm64_ftr_mask(const struct arm64_ftr_bits *ftrp)
{
        return (u64)GENMASK(ftrp->shift + ftrp->width - 1, ftrp->shift);
}

/*
 * Combine @reg->user_val with the bits of @reg->sys_val that are selected by
 * @reg->user_mask.
 */
static inline u64 arm64_ftr_reg_user_value(const struct arm64_ftr_reg *reg)
{
        return (reg->sys_val & reg->user_mask) | reg->user_val;
}

/*
 * Extract the @width-bit field at bit @field of @features, sign-extended
 * when @sign is true and zero-extended otherwise.
 *
 * A @width of zero triggers a one-time warning and is treated as 4.
 */
static inline int __attribute_const__
cpuid_feature_extract_field_width(u64 features, int field, int width, bool sign)
{
        if (WARN_ON_ONCE(!width))
                width = 4;

        if (sign)
                return cpuid_feature_extract_signed_field_width(features, field, width);

        return cpuid_feature_extract_unsigned_field_width(features, field, width);
}

/*
 * Extract the 4-bit field at bit @field of @features; @sign selects signed
 * or unsigned interpretation.
 */
static inline int __attribute_const__
cpuid_feature_extract_field(u64 features, int field, bool sign)
{
        return cpuid_feature_extract_field_width(features, field, 4, sign);
}

/*
 * Extract from @val the field described by @ftrp (its shift, width and
 * signedness) and return it widened to s64.
 */
static inline s64 arm64_ftr_value(const struct arm64_ftr_bits *ftrp, u64 val)
{
        return (s64)cpuid_feature_extract_field_width(val, ftrp->shift, ftrp->width, ftrp->sign);
}

/*
 * True when either the BIGEND or the BIGENDEL0 field of the given
 * ID_AA64MMFR0_EL1 value reads 0x1.
 */
static inline bool id_aa64mmfr0_mixed_endian_el0(u64 mmfr0)
{
        const unsigned int bigend =
                cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGEND_SHIFT);
        const unsigned int bigend_el0 =
                cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_EL1_BIGENDEL0_SHIFT);

        return bigend == 0x1 || bigend_el0 == 0x1;
}

/* True when the EL1 field of @pfr0 equals ID_AA64PFR0_EL1_EL1_AARCH32. */
static inline bool id_aa64pfr0_32bit_el1(u64 pfr0)
{
        return cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL1_SHIFT) ==
               ID_AA64PFR0_EL1_EL1_AARCH32;
}

/* True when the EL0 field of @pfr0 equals ID_AA64PFR0_EL1_EL0_AARCH32. */
static inline bool id_aa64pfr0_32bit_el0(u64 pfr0)
{
        return cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_EL0_SHIFT) ==
               ID_AA64PFR0_EL1_EL0_AARCH32;
}

/* True when the SVE field of @pfr0 is non-zero. */
static inline bool id_aa64pfr0_sve(u64 pfr0)
{
        return cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SVE_SHIFT) != 0;
}

/* True when the SME field of @pfr1 is non-zero. */
static inline bool id_aa64pfr1_sme(u64 pfr1)
{
        return cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_SME_SHIFT) != 0;
}

/* True when the MPAM field of @pfr0 is non-zero. */
static inline bool id_aa64pfr0_mpam(u64 pfr0)
{
        return cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_MPAM_SHIFT) != 0;
}

/* True when the MTE field of @pfr1 is at least ID_AA64PFR1_EL1_MTE_MTE2. */
static inline bool id_aa64pfr1_mte(u64 pfr1)
{
        const u32 mte = cpuid_feature_extract_unsigned_field(pfr1, ID_AA64PFR1_EL1_MTE_SHIFT);

        return !(mte < ID_AA64PFR1_EL1_MTE_MTE2);
}

/* Init-time (__init) feature setup entry points; defined outside this header. */
void __init setup_boot_cpu_features(void);
void __init setup_system_features(void);
void __init setup_user_features(void);

/* Named as step 4(b) of the capability overview: the late-CPU check path. */
void check_local_cpu_capabilities(void);

/*
 * ID register readers keyed by system register encoding.
 * read_sanitised_ftr_reg() is used in this header for system-wide checks.
 */
u64 read_sanitised_ftr_reg(u32 id);
u64 __read_sysreg_by_encoding(u32 sys_id);

static inline bool cpu_supports_mixed_endian_el0(void)
{
        return id_aa64mmfr0_mixed_endian_el0(read_cpuid(ID_AA64MMFR0_EL1));
}


/*
 * True when the CSV2 field of ID_AA64PFR0_EL1 equals 3.
 *
 * For SCOPE_LOCAL_CPU the register is read directly with read_sysreg_s();
 * for any other @scope the sanitised value is used.
 */
static inline bool supports_csv2p3(int scope)
{
        u8 csv2;
        u64 pfr0 = (scope == SCOPE_LOCAL_CPU) ?
                        read_sysreg_s(SYS_ID_AA64PFR0_EL1) :
                        read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);

        csv2 = cpuid_feature_extract_unsigned_field(pfr0,
                                                    ID_AA64PFR0_EL1_CSV2_SHIFT);

        return csv2 == 3;
}

/*
 * True when the CLRBHB field of ID_AA64ISAR2_EL1 is non-zero.
 *
 * For SCOPE_LOCAL_CPU the register is read directly with read_sysreg_s();
 * for any other @scope the sanitised value is used.
 */
static inline bool supports_clearbhb(int scope)
{
        u64 isar2 = (scope == SCOPE_LOCAL_CPU) ?
                        read_sysreg_s(SYS_ID_AA64ISAR2_EL1) :
                        read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1);

        return cpuid_feature_extract_unsigned_field(isar2,
                                                    ID_AA64ISAR2_EL1_CLRBHB_SHIFT);
}

/* CPU masks related to 32-bit EL0 support; defined outside this header. */
const struct cpumask *system_32bit_el0_cpumask(void);
const struct cpumask *fallback_32bit_el0_cpumask(void);
/* Static key, false by default; consulted by system_supports_32bit_el0(). */
DECLARE_STATIC_KEY_FALSE(arm64_mismatched_32bit_el0);

static inline bool system_supports_32bit_el0(void)
{
        u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);

        return static_branch_unlikely(&arm64_mismatched_32bit_el0) ||
               id_aa64pfr0_32bit_el0(pfr0);
}

/*
 * True when the TGRAN4 field of the sanitised ID_AA64MMFR0_EL1 lies within
 * [ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN, ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX].
 */
static inline bool system_supports_4kb_granule(void)
{
        const u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        const u32 tgran = cpuid_feature_extract_unsigned_field(mmfr0,
                                                ID_AA64MMFR0_EL1_TGRAN4_SHIFT);

        return tgran >= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MIN &&
               tgran <= ID_AA64MMFR0_EL1_TGRAN4_SUPPORTED_MAX;
}

/*
 * True when the TGRAN64 field of the sanitised ID_AA64MMFR0_EL1 lies within
 * [ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN, ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX].
 */
static inline bool system_supports_64kb_granule(void)
{
        const u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        const u32 tgran = cpuid_feature_extract_unsigned_field(mmfr0,
                                                ID_AA64MMFR0_EL1_TGRAN64_SHIFT);

        return tgran >= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MIN &&
               tgran <= ID_AA64MMFR0_EL1_TGRAN64_SUPPORTED_MAX;
}

/*
 * True when the TGRAN16 field of the sanitised ID_AA64MMFR0_EL1 lies within
 * [ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN, ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX].
 */
static inline bool system_supports_16kb_granule(void)
{
        const u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        const u32 tgran = cpuid_feature_extract_unsigned_field(mmfr0,
                                                ID_AA64MMFR0_EL1_TGRAN16_SHIFT);

        return tgran >= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MIN &&
               tgran <= ID_AA64MMFR0_EL1_TGRAN16_SUPPORTED_MAX;
}

static inline bool system_supports_mixed_endian_el0(void)
{
        return id_aa64mmfr0_mixed_endian_el0(read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1));
}

/* True when the BIGEND field of the sanitised ID_AA64MMFR0_EL1 reads 0x1. */
static inline bool system_supports_mixed_endian(void)
{
        const u64 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        const u32 bigend = cpuid_feature_extract_unsigned_field(mmfr0,
                                                ID_AA64MMFR0_EL1_BIGEND_SHIFT);

        return bigend == 0x1;
}

/* Tests the ARM64_HAS_FPSIMD capability (likely-true variant). */
static __always_inline bool system_supports_fpsimd(void)
{
        return alternative_has_cap_likely(ARM64_HAS_FPSIMD);
}

/* Tests the ARM64_HAS_PAN capability. */
static inline bool system_uses_hw_pan(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_PAN);
}

static inline bool system_uses_ttbr0_pan(void)
{
        return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) &&
                !system_uses_hw_pan();
}

/* Tests the ARM64_SVE capability. */
static __always_inline bool system_supports_sve(void)
{
        return alternative_has_cap_unlikely(ARM64_SVE);
}

/* Tests the ARM64_SME capability. */
static __always_inline bool system_supports_sme(void)
{
        return alternative_has_cap_unlikely(ARM64_SME);
}

/* Tests the ARM64_SME2 capability. */
static __always_inline bool system_supports_sme2(void)
{
        return alternative_has_cap_unlikely(ARM64_SME2);
}

/* Tests the ARM64_SME_FA64 capability. */
static __always_inline bool system_supports_fa64(void)
{
        return alternative_has_cap_unlikely(ARM64_SME_FA64);
}

/* TPIDR2 support is reported exactly when SME support is reported. */
static __always_inline bool system_supports_tpidr2(void)
{
        return system_supports_sme();
}

/* Tests the ARM64_HAS_FPMR capability. */
static __always_inline bool system_supports_fpmr(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_FPMR);
}

/* Tests the ARM64_HAS_CNP capability. */
static __always_inline bool system_supports_cnp(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_CNP);
}

/*
 * Tests the ARM64_HAS_ADDRESS_AUTH capability via cpus_have_final_boot_cap(),
 * so calling this before boot capabilities are finalized hits BUG().
 */
static inline bool system_supports_address_auth(void)
{
        return cpus_have_final_boot_cap(ARM64_HAS_ADDRESS_AUTH);
}

/* Tests the ARM64_HAS_GENERIC_AUTH capability. */
static inline bool system_supports_generic_auth(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_GENERIC_AUTH);
}

static inline bool system_has_full_ptr_auth(void)
{
        return system_supports_address_auth() && system_supports_generic_auth();
}

/* Tests the ARM64_HAS_GIC_PRIO_MASKING capability. */
static __always_inline bool system_uses_irq_prio_masking(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_GIC_PRIO_MASKING);
}

/* Tests the ARM64_MTE capability. */
static inline bool system_supports_mte(void)
{
        return alternative_has_cap_unlikely(ARM64_MTE);
}

static inline bool system_has_prio_mask_debugging(void)
{
        return IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING) &&
               system_uses_irq_prio_masking();
}

/*
 * Tests the ARM64_BTI capability via cpus_have_final_cap(), so calling this
 * before system capabilities are finalized hits BUG().
 */
static inline bool system_supports_bti(void)
{
        return cpus_have_final_cap(ARM64_BTI);
}

static inline bool system_supports_bti_kernel(void)
{
        return IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) &&
                cpus_have_final_boot_cap(ARM64_BTI);
}

/* Tests the ARM64_HAS_TLB_RANGE capability. */
static inline bool system_supports_tlb_range(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_TLB_RANGE);
}

/*
 * Tests the ARM64_HAS_LPA2 capability via cpus_have_final_cap(), so calling
 * this before system capabilities are finalized hits BUG().
 */
static inline bool system_supports_lpa2(void)
{
        return cpus_have_final_cap(ARM64_HAS_LPA2);
}

/* Tests the ARM64_HAS_S1POE capability. */
static inline bool system_supports_poe(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_S1POE);
}

/* Tests the ARM64_HAS_GCS capability. */
static inline bool system_supports_gcs(void)
{
        return alternative_has_cap_unlikely(ARM64_HAS_GCS);
}

/*
 * Tests the ARM64_HAFT capability via cpus_have_final_cap(), so calling this
 * before system capabilities are finalized hits BUG().
 */
static inline bool system_supports_haft(void)
{
        return cpus_have_final_cap(ARM64_HAFT);
}

/* Tests the ARM64_MPAM capability. */
static __always_inline bool system_supports_mpam(void)
{
        return alternative_has_cap_unlikely(ARM64_MPAM);
}

/* Tests the ARM64_MPAM_HCR capability. */
static __always_inline bool system_supports_mpam_hcr(void)
{
        return alternative_has_cap_unlikely(ARM64_MPAM_HCR);
}

/*
 * Tests the ARM64_HAS_PMUV3 capability via cpus_have_final_cap(), so calling
 * this before system capabilities are finalized hits BUG().
 */
static inline bool system_supports_pmuv3(void)
{
        return cpus_have_final_cap(ARM64_HAS_PMUV3);
}

/*
 * MRS emulation entry points; defined outside this header.
 * NOTE(review): @rt is presumably the destination register index and @isn the
 * raw instruction word - confirm against the definitions.
 */
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
bool try_emulate_mrs(struct pt_regs *regs, u32 isn);

/*
 * Translate an ID_AA64MMFR0_EL1.PARANGE encoding into the corresponding
 * number of physical address bits. Unknown encodings map to
 * CONFIG_ARM64_PA_BITS.
 */
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
{
        switch (parange) {
        case ID_AA64MMFR0_EL1_PARANGE_32:
                return 32;
        case ID_AA64MMFR0_EL1_PARANGE_36:
                return 36;
        case ID_AA64MMFR0_EL1_PARANGE_40:
                return 40;
        case ID_AA64MMFR0_EL1_PARANGE_42:
                return 42;
        case ID_AA64MMFR0_EL1_PARANGE_44:
                return 44;
        case ID_AA64MMFR0_EL1_PARANGE_48:
                return 48;
        case ID_AA64MMFR0_EL1_PARANGE_52:
                return 52;
        default:
                /*
                 * A future PE could use a value unknown to the kernel.
                 * However, by the "D10.1.4 Principles of the ID scheme
                 * for fields in ID registers", ARM DDI 0487C.a, any new
                 * value is guaranteed to be higher than what we know already.
                 * As a safe limit, we return the limit supported by the kernel.
                 */
                return CONFIG_ARM64_PA_BITS;
        }
}

/*
 * Check whether hardware update of the Access flag is supported.
 *
 * Always false without CONFIG_ARM64_HW_AFDBM; otherwise true when the HAFDBS
 * field of the sanitised ID_AA64MMFR1_EL1 is non-zero.
 */
static inline bool cpu_has_hw_af(void)
{
        u64 mmfr1;
        unsigned int hafdbs;

        if (!IS_ENABLED(CONFIG_ARM64_HW_AFDBM))
                return false;

        /*
         * Use cached version to avoid emulated msr operation on KVM
         * guests.
         */
        mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
        hafdbs = cpuid_feature_extract_unsigned_field(mmfr1,
                                                ID_AA64MMFR1_EL1_HAFDBS_SHIFT);

        return hafdbs != 0;
}

/*
 * True when the PAN field of ID_AA64MMFR1_EL1, as returned by read_cpuid(),
 * is non-zero.
 */
static inline bool cpu_has_pan(void)
{
        const u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
        const unsigned int pan = cpuid_feature_extract_unsigned_field(mmfr1,
                                                ID_AA64MMFR1_EL1_PAN_SHIFT);

        return pan != 0;
}

#ifdef CONFIG_ARM64_AMU_EXTN
/* Check whether the cpu supports the Activity Monitors Unit (AMU) */
extern bool cpu_has_amu_feat(int cpu);
#else
/* Without CONFIG_ARM64_AMU_EXTN no CPU is ever reported as having the AMU. */
static inline bool cpu_has_amu_feat(int cpu)
{
        return false;
}
#endif

/*
 * Get a cpu that supports the Activity Monitors Unit (AMU).
 * Declared unconditionally; the definition lives outside this header.
 */
extern int get_cpu_with_amu_feat(void);

/*
 * Return the VMID width described by the VMIDBits field of the given
 * ID_AA64MMFR1_EL1 value: 16 when the field equals
 * ID_AA64MMFR1_EL1_VMIDBits_16, otherwise 8.
 */
static inline unsigned int get_vmid_bits(u64 mmfr1)
{
        const int field = cpuid_feature_extract_unsigned_field(mmfr1,
                                                ID_AA64MMFR1_EL1_VMIDBits_SHIFT);

        /*
         * Anything other than the 16-bit encoding, including any reserved
         * value fetched from the system register, yields the default of 8.
         */
        return (field == ID_AA64MMFR1_EL1_VMIDBits_16) ? 16 : 8;
}

/* Feature-register helpers; defined outside this header. */
s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur);
struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id);

/*
 * Per-ID-register overrides. Each carries a value and a mask, which
 * arm64_apply_feature_override() applies to a raw register value.
 */
extern struct arm64_ftr_override id_aa64mmfr0_override;
extern struct arm64_ftr_override id_aa64mmfr1_override;
extern struct arm64_ftr_override id_aa64mmfr2_override;
extern struct arm64_ftr_override id_aa64pfr0_override;
extern struct arm64_ftr_override id_aa64pfr1_override;
extern struct arm64_ftr_override id_aa64zfr0_override;
extern struct arm64_ftr_override id_aa64smfr0_override;
extern struct arm64_ftr_override id_aa64isar1_override;
extern struct arm64_ftr_override id_aa64isar2_override;

/* Override for software ("pseudo") features with no backing ID register. */
extern struct arm64_ftr_override arm64_sw_feature_override;

/*
 * Apply an ID register override to one field and return the resulting field.
 *
 * @val:      raw ID register value the override is applied to
 * @feat:     bit position (shift) of the field within the register
 * @width:    width of the field in bits
 * @override: override value/mask pair for the register
 *
 * Returns the unsigned value of the @width-bit field at @feat after a valid
 * override has been applied, or the field's value from @val when the
 * override for this field is invalid.
 */
static inline
u64 arm64_apply_feature_override(u64 val, int feat, int width,
                                 const struct arm64_ftr_override *override)
{
        u64 oval = override->val;

        /*
         * When it encounters an invalid override (e.g., an override that
         * cannot be honoured due to a missing CPU feature), the early idreg
         * override code will set the mask to 0x0 and the value to non-zero for
         * the field in question. In order to determine whether the override is
         * valid or not for the field we are interested in, we first need to
         * disregard bits belonging to other fields.
         */
        oval &= GENMASK_ULL(feat + width - 1, feat);

        /*
         * The override is valid if all value bits are accounted for in the
         * mask. If so, replace the masked bits with the override value.
         */
        if (oval == (oval & override->mask)) {
                val &= ~override->mask;
                val |= oval;
        }

        /*
         * Extract the field from the updated value, honouring @width so that
         * the extraction covers the same bits the override was masked with.
         */
        return cpuid_feature_extract_unsigned_field_width(val, feat, width);
}

/*
 * True when the 4-bit software feature field at bit @feat of
 * arm64_sw_feature_override has a valid, non-zero override.
 */
static inline bool arm64_test_sw_feature_override(int feat)
{
        /*
         * Software features are pseudo CPU features that have no underlying
         * CPUID system register value to apply the override to.
         */
        return arm64_apply_feature_override(0, feat, 4,
                                            &arm64_sw_feature_override);
}

/*
 * True when the ARM64_SW_FEATURE_OVERRIDE_NOKASLR software feature override
 * is set.
 */
static inline bool kaslr_disabled_cmdline(void)
{
        return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOKASLR);
}

/* Declared here; defined outside this header. */
u32 get_kvm_ipa_limit(void);
void dump_cpu_features(void);

static inline bool cpu_has_bti(void)
{
        if (!IS_ENABLED(CONFIG_ARM64_BTI))
                return false;

        return arm64_apply_feature_override(read_cpuid(ID_AA64PFR1_EL1),
                                            ID_AA64PFR1_EL1_BT_SHIFT, 4,
                                            &id_aa64pfr1_override);
}

/*
 * Always false without CONFIG_ARM64_PTR_AUTH. Otherwise true when any of the
 * APA or API fields of ID_AA64ISAR1_EL1, or the APA3 field of
 * ID_AA64ISAR2_EL1, is non-zero after the matching override is applied.
 */
static inline bool cpu_has_pac(void)
{
        u64 isar1, isar2;

        if (!IS_ENABLED(CONFIG_ARM64_PTR_AUTH))
                return false;

        isar1 = read_cpuid(ID_AA64ISAR1_EL1);
        isar2 = read_cpuid(ID_AA64ISAR2_EL1);

        /* Fields are tested in this order; the first non-zero one decides. */
        return arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_APA_SHIFT, 4,
                                            &id_aa64isar1_override) ||
               arm64_apply_feature_override(isar1, ID_AA64ISAR1_EL1_API_SHIFT, 4,
                                            &id_aa64isar1_override) ||
               arm64_apply_feature_override(isar2, ID_AA64ISAR2_EL1_APA3_SHIFT, 4,
                                            &id_aa64isar2_override);
}

/*
 * True when the VARange field of ID_AA64MMFR2_EL1 is non-zero after the
 * bits selected by id_aa64mmfr2_override.mask have been replaced with
 * id_aa64mmfr2_override.val.
 */
static inline bool cpu_has_lva(void)
{
        u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);

        mmfr2 = (mmfr2 & ~id_aa64mmfr2_override.mask) | id_aa64mmfr2_override.val;

        return cpuid_feature_extract_unsigned_field(mmfr2,
                                                    ID_AA64MMFR2_EL1_VARange_SHIFT);
}

/*
 * Always false without CONFIG_ARM64_LPA2. Otherwise true when the TGRAN
 * field of id_aa64mmfr0_el1, read as a signed value after
 * id_aa64mmfr0_override has been merged in, is at least
 * ID_AA64MMFR0_EL1_TGRAN_LPA2.
 */
static inline bool cpu_has_lpa2(void)
{
#ifdef CONFIG_ARM64_LPA2
        u64 mmfr0;
        int feat;

        mmfr0 = read_sysreg(id_aa64mmfr0_el1);
        /* Replace the overridden bits with the override value. */
        mmfr0 &= ~id_aa64mmfr0_override.mask;
        mmfr0 |= id_aa64mmfr0_override.val;
        /* Signed extraction, so the comparison below is a signed one. */
        feat = cpuid_feature_extract_signed_field(mmfr0,
                                                  ID_AA64MMFR0_EL1_TGRAN_SHIFT);

        return feat >= ID_AA64MMFR0_EL1_TGRAN_LPA2;
#else
        return false;
#endif
}

#endif /* __ASSEMBLY__ */

#endif































































































































































































































































































































   23 

























  450 


















   14 























  311 
    3 



















    2 














    2 









    2 













































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_DCACHE_H
#define __LINUX_DCACHE_H

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/math.h>
#include <linux/rculist.h>
#include <linux/rculist_bl.h>
#include <linux/spinlock.h>
#include <linux/seqlock.h>
#include <linux/cache.h>
#include <linux/rcupdate.h>
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>

struct path;
struct file;
struct vfsmount;

/*
 * linux/include/linux/dcache.h
 *
 * Dirent cache data structures
 *
 * (C) Copyright 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/* A dentry is a root when it is its own parent. Note: evaluates @x twice. */
#define IS_ROOT(x) ((x) == (x)->d_parent)

/*
 * The hash is always the low bits of hash_len.
 *
 * HASH_LEN_DECLARE declares the two u32 members in the byte order that makes
 * this true when they are overlaid on a u64.
 *
 * bytemask_from_count(cnt) builds an unsigned long mask covering cnt bytes:
 * the low-order bytes on little-endian, the high-order bytes on big-endian.
 * cnt must be smaller than sizeof(unsigned long), since shifting by the full
 * width of the type is undefined.
 */
#ifdef __LITTLE_ENDIAN
 #define HASH_LEN_DECLARE u32 hash; u32 len
 #define bytemask_from_count(cnt)        (~(~0ul << (cnt)*8))
#else
 #define HASH_LEN_DECLARE u32 len; u32 hash
 #define bytemask_from_count(cnt)        (~(~0ul >> (cnt)*8))
#endif

/*
 * "quick string" -- eases parameter passing, but more importantly
 * saves "metadata" about the string (ie length and the hash).
 *
 * hash comes first so it snuggles against d_parent in the
 * dentry.
 */
struct qstr {
        union {
                struct {
                        /* u32 hash and u32 len, ordered by endianness */
                        HASH_LEN_DECLARE;
                };
                /* hash and len viewed as one 64-bit value */
                u64 hash_len;
        };
        /* the string bytes themselves */
        const unsigned char *name;
};

/*
 * QSTR_INIT(n, l): initializer for a struct qstr with name @n and length @l;
 * the hash is not given a value here.
 * QSTR(n): compound literal built from a NUL-terminated string, with the
 * length taken from strlen(). Note: @n is evaluated twice.
 */
#define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n))

/*
 * Shared constant names, defined outside this header.
 * NOTE(review): presumably "", "/" and ".." respectively - confirm.
 */
extern const struct qstr empty_name;
extern const struct qstr slash_name;
extern const struct qstr dotdot_name;

/*
 * Try to keep struct dentry aligned on 64 byte cachelines (this will
 * give reasonable cacheline footprint with larger lines without the
 * large memory footprint increase).
 *
 * DNAME_INLINE_WORDS is the size of the inline name buffer in unsigned
 * longs. NOTE(review): the byte counts in the trailing comments presumably
 * give the resulting total size of struct dentry, not the size of the inline
 * buffer - confirm.
 */
#ifdef CONFIG_64BIT
# define DNAME_INLINE_WORDS 5 /* 192 bytes */
#else
# ifdef CONFIG_SMP
#  define DNAME_INLINE_WORDS 9 /* 128 bytes */
# else
#  define DNAME_INLINE_WORDS 11 /* 128 bytes */
# endif
#endif

/* Size of the inline name buffer in bytes. */
#define DNAME_INLINE_LEN (DNAME_INLINE_WORDS*sizeof(unsigned long))

/*
 * Inline storage for short names, accessible either as bytes or as
 * unsigned long words; both views cover the same DNAME_INLINE_LEN bytes.
 */
union shortname_store {
        unsigned char string[DNAME_INLINE_LEN];
        unsigned long words[DNAME_INLINE_WORDS];
};

/* Shorthands: the dentry lock lives in d_lockref, the inline name in d_shortname. */
#define d_lock        d_lockref.lock
#define d_iname d_shortname.string

/*
 * struct dentry - one entry of the directory-entry cache.
 *
 * The fields are grouped by access pattern (see the section comments
 * below); the field order is part of the cacheline layout and must not be
 * changed casually.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;                /* protected by d_lock */
        seqcount_spinlock_t d_seq;        /* per dentry seqlock */
        struct hlist_bl_node d_hash;        /* lookup hash list */
        struct dentry *d_parent;        /* parent directory */
        struct qstr d_name;                /* name, with cached hash and length */
        struct inode *d_inode;                /* Where the name belongs to - NULL is
                                         * negative */
        union shortname_store d_shortname;        /* inline short-name storage */
        /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */

        /* Ref lookup also touches following */
        const struct dentry_operations *d_op;
        struct super_block *d_sb;        /* The root of the dentry tree */
        unsigned long d_time;                /* used by d_revalidate */
        void *d_fsdata;                        /* fs-specific data */
        /* --- cacheline 2 boundary (128 bytes) --- */
        struct lockref d_lockref;        /* per-dentry lock and refcount
                                         * keep separate from RCU lookup area if
                                         * possible!
                                         */

        union {
                struct list_head d_lru;                /* LRU list */
                wait_queue_head_t *d_wait;        /* in-lookup ones only */
        };
        struct hlist_node d_sib;        /* child of parent list */
        struct hlist_head d_children;        /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;        /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;        /* only for in-lookup ones */
                 struct rcu_head d_rcu;
        } d_u;
};

/*
 * Nesting subclasses for the dentry->d_lock spinlock:
 *
 * DENTRY_D_LOCK_NORMAL (0): normal; implicitly used by plain spin_lock() APIs.
 * DENTRY_D_LOCK_NESTED (1): nested
 */
enum dentry_d_lock_class {
        DENTRY_D_LOCK_NORMAL = 0,
        DENTRY_D_LOCK_NESTED = 1
};

/* Selector passed to d_real(): which kind of real dentry is wanted. */
enum d_real_type {
        D_REAL_DATA = 0,        /* real dentry for data */
        D_REAL_METADATA = 1,        /* real dentry for metadata */
};

/*
 * struct dentry_operations - table of per-dentry callbacks, reached through
 * dentry->d_op.
 *
 * The DCACHE_OP_* bits in d_flags presumably mirror which of these
 * callbacks are set (d_real() tests DCACHE_OP_REAL before calling
 * ->d_real) -- TODO confirm for the other callbacks.
 */
struct dentry_operations {
        int (*d_revalidate)(struct inode *, const struct qstr *,
                            struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, enum d_real_type type);
        bool (*d_unalias_trylock)(const struct dentry *);
        void (*d_unalias_unlock)(const struct dentry *);
} ____cacheline_aligned;

/*
 * Locking rules for dentry_operations callbacks are to be found in
 * Documentation/filesystems/locking.rst. Keep it updated!
 *
 * Further descriptions are found in Documentation/filesystems/vfs.rst.
 * Keep it updated too!
 */

/*
 * d_flags entries
 *
 * Bit flags stored in dentry->d_flags.  Bits 19..21 are not independent
 * flags: together they form a 3-bit entry-type field (mask
 * DCACHE_ENTRY_TYPE) holding exactly one of the DCACHE_*_TYPE values.
 */
enum dentry_flags {
        DCACHE_OP_HASH                        = BIT(0),
        DCACHE_OP_COMPARE                = BIT(1),
        DCACHE_OP_REVALIDATE                = BIT(2),
        DCACHE_OP_DELETE                = BIT(3),
        DCACHE_OP_PRUNE                        = BIT(4),
        /*
         * This dentry is possibly not currently connected to the dcache tree,
         * in which case its parent will either be itself, or will have this
         * flag as well.  nfsd will not use a dentry with this bit set, but will
         * first endeavour to clear the bit either by discovering that it is
         * connected, or by performing lookup operations.  Any filesystem which
         * supports nfsd_operations MUST have a lookup function which, if it
         * finds a directory inode with a DCACHE_DISCONNECTED dentry, will
         * d_move that dentry into place and return that dentry rather than the
         * passed one, typically using d_splice_alias.
         */
        DCACHE_DISCONNECTED                = BIT(5),
        DCACHE_REFERENCED                = BIT(6),        /* Recently used, don't discard. */
        DCACHE_DONTCACHE                = BIT(7),        /* Purge from memory on final dput() */
        DCACHE_CANT_MOUNT                = BIT(8),
        DCACHE_GENOCIDE                        = BIT(9),
        DCACHE_SHRINK_LIST                = BIT(10),
        DCACHE_OP_WEAK_REVALIDATE        = BIT(11),
        /*
         * this dentry has been "silly renamed" and has to be deleted on the
         * last dput()
         */
        DCACHE_NFSFS_RENAMED                = BIT(12),
        DCACHE_FSNOTIFY_PARENT_WATCHED        = BIT(13),        /* Parent inode is watched by some fsnotify listener */
        DCACHE_DENTRY_KILLED                = BIT(14),
        DCACHE_MOUNTED                        = BIT(15),        /* is a mountpoint */
        DCACHE_NEED_AUTOMOUNT                = BIT(16),        /* handle automount on this dir */
        DCACHE_MANAGE_TRANSIT                = BIT(17),        /* manage transit from this dirent */
        DCACHE_LRU_LIST                        = BIT(18),
        DCACHE_ENTRY_TYPE                = (7 << 19),        /* bits 19..21 are for storing type: */
        DCACHE_MISS_TYPE                = (0 << 19),        /* Negative dentry */
        DCACHE_WHITEOUT_TYPE                = (1 << 19),        /* Whiteout dentry (stop pathwalk) */
        DCACHE_DIRECTORY_TYPE                = (2 << 19),        /* Normal directory */
        DCACHE_AUTODIR_TYPE                = (3 << 19),        /* Lookupless directory (presumed automount) */
        DCACHE_REGULAR_TYPE                = (4 << 19),        /* Regular file type */
        DCACHE_SPECIAL_TYPE                = (5 << 19),        /* Other file type */
        DCACHE_SYMLINK_TYPE                = (6 << 19),        /* Symlink */
        DCACHE_NOKEY_NAME                = BIT(22),        /* Encrypted name encoded without key */
        DCACHE_OP_REAL                        = BIT(23),
        DCACHE_PAR_LOOKUP                = BIT(24),        /* being looked up (with parent locked shared) */
        DCACHE_DENTRY_CURSOR                = BIT(25),
        DCACHE_NORCU                        = BIT(26),        /* No RCU delay for freeing */
};

/*
 * Mask of the d_flags bits tested by d_managed(): mountpoint, automount
 * and managed-transit.
 */
#define DCACHE_MANAGED_DENTRY \
        (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT)

extern seqlock_t rename_lock;

/*
 * These are the low-level FS interfaces to the dcache..
 */
extern void d_instantiate(struct dentry *, struct inode *);
extern void d_instantiate_new(struct dentry *, struct inode *);
extern void __d_drop(struct dentry *dentry);
extern void d_drop(struct dentry *dentry);
extern void d_delete(struct dentry *);
extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op);

/* allocate/de-allocate */
extern struct dentry * d_alloc(struct dentry *, const struct qstr *);
extern struct dentry * d_alloc_anon(struct super_block *);
extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *,
                                        wait_queue_head_t *);
extern struct dentry * d_splice_alias(struct inode *, struct dentry *);
extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *);
extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                        const struct qstr *name);
extern struct dentry *d_find_any_alias(struct inode *inode);
extern struct dentry * d_obtain_alias(struct inode *);
extern struct dentry * d_obtain_root(struct inode *);
extern void shrink_dcache_sb(struct super_block *);
extern void shrink_dcache_parent(struct dentry *);
extern void d_invalidate(struct dentry *);

/* only used at mount-time */
extern struct dentry * d_make_root(struct inode *);

extern void d_mark_tmpfile(struct file *, struct inode *);
extern void d_tmpfile(struct file *, struct inode *);

extern struct dentry *d_find_alias(struct inode *);
extern void d_prune_aliases(struct inode *);

extern struct dentry *d_find_alias_rcu(struct inode *);

/* test whether we have any submounts in a subdir tree */
extern int path_has_submounts(const struct path *);

/*
 * This adds the entry to the hash queues.
 */
extern void d_rehash(struct dentry *);
 
extern void d_add(struct dentry *, struct inode *);

/* used for rename() and baskets */
extern void d_move(struct dentry *, struct dentry *);
extern void d_exchange(struct dentry *, struct dentry *);
extern struct dentry *d_ancestor(struct dentry *, struct dentry *);

extern struct dentry *d_lookup(const struct dentry *, const struct qstr *);
extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *);

static inline unsigned d_count(const struct dentry *dentry)
{
        return dentry->d_lockref.count;
}

ino_t d_parent_ino(struct dentry *dentry);

/*
 * helper function for dentry_operations.d_dname() members
 */
extern __printf(3, 4)
char *dynamic_dname(char *, int, const char *, ...);

extern char *__d_path(const struct path *, const struct path *, char *, int);
extern char *d_absolute_path(const struct path *, char *, int);
extern char *d_path(const struct path *, char *, int);
extern char *dentry_path_raw(const struct dentry *, char *, int);
extern char *dentry_path(const struct dentry *, char *, int);

/* Allocation counts.. */

/**
 * dget_dlock - take a reference on a dentry whose lock is already held
 * @dentry: dentry to get a reference to
 *
 * Bumps the reference count of a live dentry by one and hands the same
 * dentry back.  The caller must hold @dentry->d_lock and is responsible
 * for making sure the dentry is alive.  Many conditions are sufficient to
 * guarantee that; e.g. anything with non-negative refcount is alive, so is
 * anything hashed, anything positive, anyone's parent, etc.
 */
static inline struct dentry *dget_dlock(struct dentry *dentry)
{
        /* Direct increment: the caller's d_lock serializes the update. */
        dentry->d_lockref.count += 1;

        return dentry;
}


/**
 * dget - clone a reference to a dentry
 * @dentry: dentry to get a reference to, or %NULL
 *
 * For a non-%NULL @dentry the reference count is incremented; in either
 * case @dentry is returned.  A dentry is not destroyed while it has
 * references, whereas one without references can disappear for any number
 * of reasons, starting with memory pressure.  This primitive is therefore
 * meant for duplicating an existing reference; using it on something with
 * zero refcount is a bug.
 *
 * NOTE: it will spin if @dentry->d_lock is held.  From the deadlock
 * avoidance point of view it is equivalent to spin_lock()/increment
 * refcount/spin_unlock(), so calling it under @dentry->d_lock is
 * always a bug; so is calling it under ->d_lock on any of its descendants.
 */
static inline struct dentry *dget(struct dentry *dentry)
{
        if (!dentry)
                return NULL;

        lockref_get(&dentry->d_lockref);
        return dentry;
}

extern struct dentry *dget_parent(struct dentry *dentry);

/**
 * d_unhashed - check whether a dentry is currently not hashed
 * @dentry: entry to check
 *
 * Returns the result of hlist_bl_unhashed() on the dentry's d_hash node,
 * i.e. true if the dentry passed is not currently hashed.
 */
static inline int d_unhashed(const struct dentry *dentry)
{
        const struct hlist_bl_node *node = &dentry->d_hash;

        return hlist_bl_unhashed(node);
}

/*
 * d_unlinked - true (1) when @dentry is unhashed and is not a root
 * (i.e. not its own parent); 0 otherwise.
 */
static inline int d_unlinked(const struct dentry *dentry)
{
        if (!d_unhashed(dentry))
                return 0;

        return !IS_ROOT(dentry);
}

/*
 * cant_mount - test the DCACHE_CANT_MOUNT flag of @dentry.
 *
 * Returns the masked flag value: non-zero when the flag is set, 0 otherwise.
 */
static inline int cant_mount(const struct dentry *dentry)
{
        const unsigned int flags = dentry->d_flags;

        return flags & DCACHE_CANT_MOUNT;
}

/*
 * dont_mount - set DCACHE_CANT_MOUNT on @dentry.
 *
 * d_flags is updated with @dentry->d_lock held.
 */
static inline void dont_mount(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        dentry->d_flags = dentry->d_flags | DCACHE_CANT_MOUNT;
        spin_unlock(&dentry->d_lock);
}

extern void __d_lookup_unhash_wake(struct dentry *dentry);

/*
 * d_in_lookup - test the DCACHE_PAR_LOOKUP flag of @dentry.
 *
 * Returns the masked flag value: non-zero while the flag is set, 0 otherwise.
 */
static inline int d_in_lookup(const struct dentry *dentry)
{
        const unsigned int flags = dentry->d_flags;

        return flags & DCACHE_PAR_LOOKUP;
}

/*
 * d_lookup_done - finish an in-progress lookup on @dentry, if any.
 *
 * When the dentry is still marked in-lookup (the unlikely case) the
 * out-of-line __d_lookup_unhash_wake() is called; otherwise nothing is done.
 */
static inline void d_lookup_done(struct dentry *dentry)
{
        const int in_lookup = d_in_lookup(dentry);

        if (unlikely(in_lookup))
                __d_lookup_unhash_wake(dentry);
}

extern void dput(struct dentry *);

/*
 * d_managed - true when any of the DCACHE_MANAGED_DENTRY bits is set in
 * @dentry's d_flags.
 */
static inline bool d_managed(const struct dentry *dentry)
{
        const unsigned int managed = dentry->d_flags & DCACHE_MANAGED_DENTRY;

        return managed != 0;
}

/* d_mountpoint - true when DCACHE_MOUNTED is set in @dentry's d_flags. */
static inline bool d_mountpoint(const struct dentry *dentry)
{
        const unsigned int mounted = dentry->d_flags & DCACHE_MOUNTED;

        return mounted != 0;
}

/*
 * Directory cache entry type accessor functions.
 */
/*
 * __d_entry_type - extract the entry-type field (DCACHE_ENTRY_TYPE mask)
 * from @dentry's d_flags.  The result is one of the DCACHE_*_TYPE values.
 */
static inline unsigned __d_entry_type(const struct dentry *dentry)
{
        const unsigned int flags = dentry->d_flags;

        return flags & DCACHE_ENTRY_TYPE;
}

/* d_is_miss - true when @dentry's entry type is DCACHE_MISS_TYPE. */
static inline bool d_is_miss(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_MISS_TYPE;
}

/* d_is_whiteout - true when @dentry's entry type is DCACHE_WHITEOUT_TYPE. */
static inline bool d_is_whiteout(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_WHITEOUT_TYPE;
}

/* d_can_lookup - true when @dentry's entry type is DCACHE_DIRECTORY_TYPE. */
static inline bool d_can_lookup(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_DIRECTORY_TYPE;
}

/* d_is_autodir - true when @dentry's entry type is DCACHE_AUTODIR_TYPE. */
static inline bool d_is_autodir(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_AUTODIR_TYPE;
}

/*
 * d_is_dir - true when @dentry is either kind of directory: a normal
 * directory (d_can_lookup()) or an autodir (d_is_autodir()).
 */
static inline bool d_is_dir(const struct dentry *dentry)
{
        if (d_can_lookup(dentry))
                return true;

        return d_is_autodir(dentry);
}

/* d_is_symlink - true when @dentry's entry type is DCACHE_SYMLINK_TYPE. */
static inline bool d_is_symlink(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SYMLINK_TYPE;
}

/* d_is_reg - true when @dentry's entry type is DCACHE_REGULAR_TYPE. */
static inline bool d_is_reg(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_REGULAR_TYPE;
}

/* d_is_special - true when @dentry's entry type is DCACHE_SPECIAL_TYPE. */
static inline bool d_is_special(const struct dentry *dentry)
{
        const unsigned type = __d_entry_type(dentry);

        return type == DCACHE_SPECIAL_TYPE;
}

/*
 * d_is_file - true when @dentry is a regular file (d_is_reg()) or an
 * "other file type" entry (d_is_special()).
 */
static inline bool d_is_file(const struct dentry *dentry)
{
        if (d_is_reg(dentry))
                return true;

        return d_is_special(dentry);
}

/*
 * d_is_negative - true when @dentry's entry type is the miss type.
 *
 * Only the type field of d_flags is consulted (via d_is_miss()).
 */
static inline bool d_is_negative(const struct dentry *dentry)
{
        // TODO: check d_is_whiteout(dentry) also.
        const bool miss = d_is_miss(dentry);

        return miss;
}

/*
 * d_flags_negative - same test as d_is_negative(), but applied to a
 * d_flags value the caller already has in hand.
 */
static inline bool d_flags_negative(unsigned flags)
{
        const unsigned type = flags & DCACHE_ENTRY_TYPE;

        return type == DCACHE_MISS_TYPE;
}

/* d_is_positive - logical negation of d_is_negative(). */
static inline bool d_is_positive(const struct dentry *dentry)
{
        if (d_is_negative(dentry))
                return false;

        return true;
}

/**
 * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if ->d_inode is NULL, i.e. the dentry represents either an
 * absent name or a name that doesn't map to an inode.  The dentry could
 * represent a true miss, a whiteout that isn't represented by a 0,0 chardev
 * or a fallthrough marker in an opaque directory.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.  (3) The dentry may have something attached to ->d_lower and the
 * type field of the flags may be set to something other than miss or whiteout.
 */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
        const struct inode *inode = dentry->d_inode;

        return !inode;
}

/**
 * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs)
 * @dentry: The dentry in question
 *
 * Returns true if ->d_inode is not NULL, i.e. the dentry represents a name
 * that maps to an inode.  The dentry might still represent a whiteout if
 * that is represented on medium as a 0,0 chardev.
 *
 * Note!  (1) This should be used *only* by a filesystem to examine its own
 * dentries.  It should not be used to look at some other filesystem's
 * dentries.  (2) It should also be used in combination with d_inode() to get
 * the inode.
 */
static inline bool d_really_is_positive(const struct dentry *dentry)
{
        const struct inode *inode = dentry->d_inode;

        return inode ? true : false;
}

/*
 * simple_positive - 1 when @dentry has an inode attached and is hashed,
 * 0 otherwise.
 */
static inline int simple_positive(const struct dentry *dentry)
{
        if (!d_really_is_positive(dentry))
                return 0;

        return !d_unhashed(dentry);
}

unsigned long vfs_pressure_ratio(unsigned long val);

/**
 * d_inode - Get the actual inode of this dentry
 * @dentry: The dentry to query
 *
 * Returns ->d_inode of @dentry.  This is the helper normal filesystems
 * should use to reach their own inodes through their own dentries,
 * ignoring any layering superimposed upon them.
 */
static inline struct inode *d_inode(const struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        return inode;
}

/**
 * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE()
 * @dentry: The dentry to query
 *
 * Same as d_inode(), except that ->d_inode is loaded through READ_ONCE().
 * This is the helper normal filesystems should use to reach their own
 * inodes through their own dentries, ignoring any layering superimposed
 * upon them.
 */
static inline struct inode *d_inode_rcu(const struct dentry *dentry)
{
        struct inode *inode = READ_ONCE(dentry->d_inode);

        return inode;
}

/**
 * d_backing_inode - Get upper or lower inode we should be using
 * @upper: The upper layer
 *
 * Returns the inode that would be used if this dentry were opened as a
 * file.  That inode may be on the upper dentry or on a lower dentry pinned
 * by the upper; as written here it is simply @upper->d_inode.
 *
 * Normal filesystems should not use this to access their own inodes.
 */
static inline struct inode *d_backing_inode(const struct dentry *upper)
{
        return upper->d_inode;
}

/**
 * d_real - Return the real dentry
 * @dentry: the dentry to query
 * @type: the type of real dentry (data or metadata)
 *
 * If dentry is on a union/overlay, then return the underlying, real dentry.
 * Otherwise return the dentry itself.
 *
 * See also: Documentation/filesystems/vfs.rst
 */
static inline struct dentry *d_real(struct dentry *dentry, enum d_real_type type)
{
        if (unlikely(dentry->d_flags & DCACHE_OP_REAL))
                return dentry->d_op->d_real(dentry, type);
        else
                return dentry;
}

/**
 * d_real_inode - Return the real inode hosting the data
 * @dentry: The dentry to query
 *
 * If dentry is on a union/overlay, then return the underlying, real inode.
 * Otherwise return d_inode().
 */
static inline struct inode *d_real_inode(const struct dentry *dentry)
{
        /* d_real() takes a non-const dentry, hence the cast. */
        struct dentry *real = d_real((struct dentry *) dentry, D_REAL_DATA);

        return d_inode(real);
}

/*
 * struct name_snapshot - a dentry name captured by
 * take_dentry_name_snapshot() and dropped with
 * release_dentry_name_snapshot().
 */
struct name_snapshot {
        struct qstr name;        /* the captured name */
        union shortname_store inline_name;        /* presumably backs name.name for short names -- TODO confirm */
};
void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *);
void release_dentry_name_snapshot(struct name_snapshot *);

/*
 * d_first_child - first dentry on @dentry's d_children list, or NULL when
 * the list is empty.  Children are linked through their d_sib member.
 */
static inline struct dentry *d_first_child(const struct dentry *dentry)
{
        struct hlist_node *first = dentry->d_children.first;

        return hlist_entry_safe(first, struct dentry, d_sib);
}

/*
 * d_next_sibling - dentry following @dentry on its d_sib list, or NULL when
 * @dentry is the last one.
 */
static inline struct dentry *d_next_sibling(const struct dentry *dentry)
{
        struct hlist_node *next = dentry->d_sib.next;

        return hlist_entry_safe(next, struct dentry, d_sib);
}

#endif        /* __LINUX_DCACHE_H */




































































































































































































































    3 



    3 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
// SPDX-License-Identifier: GPL-2.0
/*
 * Supplementary group IDs
 */
#include <linux/cred.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/sort.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>

/*
 * groups_alloc - allocate a group_info with room for @gidsetsize gids.
 *
 * On success the usage count is set to 1 and ngroups to @gidsetsize; the
 * gid array itself is not initialised here.  Returns NULL if the
 * allocation fails.
 */
struct group_info *groups_alloc(int gidsetsize)
{
        struct group_info *gi;

        gi = kvmalloc(struct_size(gi, gid, gidsetsize), GFP_KERNEL_ACCOUNT);
        if (gi) {
                gi->ngroups = gidsetsize;
                refcount_set(&gi->usage, 1);
        }

        return gi;
}

EXPORT_SYMBOL(groups_alloc);

/*
 * groups_free - release a group_info obtained from groups_alloc().
 *
 * Frees the memory with kvfree(), matching the kvmalloc() in
 * groups_alloc(); the usage count is not examined here.
 */
void groups_free(struct group_info *group_info)
{
        kvfree(group_info);
}

EXPORT_SYMBOL(groups_free);

/*
 * Copy every gid of @group_info out to the user-space array @grouplist,
 * translating each one into the caller's user namespace with
 * from_kgid_munged().  Returns 0 on success or -EFAULT as soon as a store
 * to user space fails.
 */
static int groups_to_user(gid_t __user *grouplist,
                          const struct group_info *group_info)
{
        struct user_namespace *ns = current_user_ns();
        const unsigned int ngroups = group_info->ngroups;
        unsigned int idx;

        for (idx = 0; idx < ngroups; idx++) {
                gid_t gid = from_kgid_munged(ns, group_info->gid[idx]);

                if (put_user(gid, grouplist + idx))
                        return -EFAULT;
        }

        return 0;
}

/*
 * Fill an already allocated @group_info from the user-space array
 * @grouplist.  Exactly group_info->ngroups entries are read; each is mapped
 * into a kgid_t through the caller's user namespace.
 *
 * Returns 0 on success, -EFAULT if a read from user space fails, or
 * -EINVAL if a gid has no valid mapping.  Entries stored before a failure
 * are left in place.
 */
static int groups_from_user(struct group_info *group_info,
                            gid_t __user *grouplist)
{
        struct user_namespace *ns = current_user_ns();
        const unsigned int ngroups = group_info->ngroups;
        unsigned int idx;

        for (idx = 0; idx < ngroups; idx++) {
                gid_t raw;
                kgid_t mapped;

                if (get_user(raw, grouplist + idx))
                        return -EFAULT;

                mapped = make_kgid(ns, raw);
                if (!gid_valid(mapped))
                        return -EINVAL;

                group_info->gid[idx] = mapped;
        }

        return 0;
}

/*
 * sort() comparison callback for kgid_t elements: the result is
 * gid_gt(a, b) minus gid_lt(a, b), i.e. positive, zero or negative.
 */
static int gid_cmp(const void *_a, const void *_b)
{
        const kgid_t lhs = *(const kgid_t *)_a;
        const kgid_t rhs = *(const kgid_t *)_b;
        const int greater = gid_gt(lhs, rhs);
        const int less = gid_lt(lhs, rhs);

        return greater - less;
}

/*
 * groups_sort - sort the gid array of @group_info in place using gid_cmp()
 * ordering (no custom swap function is supplied to sort()).
 */
void groups_sort(struct group_info *group_info)
{
        sort(group_info->gid, group_info->ngroups,
             sizeof(group_info->gid[0]), gid_cmp, NULL);
}
EXPORT_SYMBOL(groups_sort);

/*
 * Binary search for @grp in the gid array of @group_info, which is
 * expected to be sorted (see groups_sort()).  Returns 1 when found, 0 when
 * not found or when @group_info is NULL.
 */
int groups_search(const struct group_info *group_info, kgid_t grp)
{
        unsigned int lo = 0;
        unsigned int hi;

        if (!group_info)
                return 0;

        /* Search the half-open interval [lo, hi). */
        for (hi = group_info->ngroups; lo < hi; ) {
                const unsigned int mid = (lo + hi) / 2;

                if (gid_gt(grp, group_info->gid[mid])) {
                        lo = mid + 1;
                        continue;
                }
                if (gid_lt(grp, group_info->gid[mid])) {
                        hi = mid;
                        continue;
                }
                return 1;
        }

        return 0;
}

/**
 * set_groups - Change a group subscription in a set of credentials
 * @new: The newly prepared set of credentials to alter
 * @group_info: The group list to install
 *
 * Takes a reference on @group_info, installs it in @new and drops the
 * reference @new held on its previous group list.
 */
void set_groups(struct cred *new, struct group_info *group_info)
{
        struct group_info *old = new->group_info;

        /*
         * Take the new reference before dropping the old one: if
         * @group_info is the list already installed in @new, putting first
         * could release the last reference and free the object we are
         * about to reference.
         */
        get_group_info(group_info);
        new->group_info = group_info;
        put_group_info(old);
}

EXPORT_SYMBOL(set_groups);

/**
 * set_current_groups - Change current's group subscription
 * @group_info: The group list to impose
 *
 * Prepares a new set of credentials, installs @group_info in it and asks
 * the security layer (security_task_fix_setgroups()) to validate the
 * change.  If that succeeds the credentials are committed to the current
 * task; otherwise they are aborted.
 *
 * Returns -ENOMEM if no credentials could be prepared, the negative value
 * reported by the security hook, or the result of commit_creds().
 */
int set_current_groups(struct group_info *group_info)
{
        const struct cred *old;
        struct cred *new = prepare_creds();
        int retval;

        if (!new)
                return -ENOMEM;

        old = current_cred();
        set_groups(new, group_info);

        retval = security_task_fix_setgroups(new, old);
        if (retval < 0) {
                /* Rejected: throw the prepared credentials away. */
                abort_creds(new);
                return retval;
        }

        return commit_creds(new);
}

EXPORT_SYMBOL(set_current_groups);

/*
 * getgroups(2): report the supplementary group list of the caller.
 *
 * With @gidsetsize == 0 only the number of groups is returned and
 * @grouplist is not touched.  Otherwise the list is copied to @grouplist
 * and its length returned.  Errors: -EINVAL for a negative @gidsetsize or
 * one smaller than the number of groups, -EFAULT if the copy to user space
 * fails.
 */
SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist)
{
        const struct cred *cred = current_cred();
        int ngroups;

        if (gidsetsize < 0)
                return -EINVAL;

        /* no need to grab task_lock here; it cannot change */
        ngroups = cred->group_info->ngroups;
        if (!gidsetsize)
                return ngroups;

        if (ngroups > gidsetsize)
                return -EINVAL;
        if (groups_to_user(grouplist, cred->group_info))
                return -EFAULT;

        return ngroups;
}

/*
 * may_setgroups - may the caller change its supplementary groups?
 *
 * Requires both CAP_SETGID in the caller's user namespace (checked with
 * ns_capable_setid()) and userns_may_setgroups() for that namespace.
 */
bool may_setgroups(void)
{
        struct user_namespace *ns = current_user_ns();

        if (!ns_capable_setid(ns, CAP_SETGID))
                return false;

        return userns_may_setgroups(ns);
}

/*
 *        SMP: Our groups are copy-on-write. We can set them safely
 *        without another task interfering.
 *
 * setgroups(2): replace the caller's supplementary group list with the
 * @gidsetsize gids read from @grouplist.  Errors: -EPERM if the caller may
 * not set groups, -EINVAL if @gidsetsize exceeds NGROUPS_MAX (negative
 * values included, via the unsigned compare) or a gid cannot be mapped,
 * -ENOMEM on allocation failure, -EFAULT on a bad user pointer.
 */

SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
{
        struct group_info *group_info;
        int retval;

        if (!may_setgroups())
                return -EPERM;
        if ((unsigned)gidsetsize > NGROUPS_MAX)
                return -EINVAL;

        group_info = groups_alloc(gidsetsize);
        if (!group_info)
                return -ENOMEM;

        retval = groups_from_user(group_info, grouplist);
        if (!retval) {
                groups_sort(group_info);
                retval = set_current_groups(group_info);
        }

        /* Drop the reference obtained from groups_alloc() on every path. */
        put_group_info(group_info);

        return retval;
}

/*
 * Check whether @grp is the caller's fsgid or one of its supplementary
 * groups.  Returns 1 if so, 0 otherwise.
 */
int in_group_p(kgid_t grp)
{
        const struct cred *cred = current_cred();

        if (gid_eq(grp, cred->fsgid))
                return 1;

        return groups_search(cred->group_info, grp);
}

EXPORT_SYMBOL(in_group_p);

/*
 * Check whether @grp is the caller's egid or one of its supplementary
 * groups.  Returns 1 if so, 0 otherwise.
 */
int in_egroup_p(kgid_t grp)
{
        const struct cred *cred = current_cred();

        if (gid_eq(grp, cred->egid))
                return 1;

        return groups_search(cred->group_info, grp);
}

EXPORT_SYMBOL(in_egroup_p);






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 





































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2005 Voltaire Inc.  All rights reserved.
 * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
 * Copyright (c) 1999-2019, Mellanox Technologies, Inc. All rights reserved.
 * Copyright (c) 2005-2006 Intel Corporation.  All rights reserved.
 */

#include <linux/completion.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/mutex.h>
#include <linux/random.h>
#include <linux/rbtree.h>
#include <linux/igmp.h>
#include <linux/xarray.h>
#include <linux/inetdevice.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/route.h>

#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netevent.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/ip_fib.h>
#include <net/ip6_route.h>

#include <rdma/rdma_cm.h>
#include <rdma/rdma_cm_ib.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib.h>
#include <rdma/ib_cache.h>
#include <rdma/ib_cm.h>
#include <rdma/ib_sa.h>
#include <rdma/iw_cm.h>

#include "core_priv.h"
#include "cma_priv.h"
#include "cma_trace.h"

/* Kernel module metadata: author, one-line description, and licence tag. */
MODULE_AUTHOR("Sean Hefty");
MODULE_DESCRIPTION("Generic RDMA CM Agent");
MODULE_LICENSE("Dual BSD/GPL");

/*
 * Connection-manager tunables. Their consumers are further down the file
 * (outside this view), so the notes below are reviewer assumptions.
 *
 * NOTE(review): CMA_CM_RESPONSE_TIMEOUT and CMA_IBOE_PACKET_LIFETIME look
 * like IB-style encoded (exponent) time values rather than plain seconds or
 * milliseconds - confirm against the IB CM call sites before changing them.
 */
#define CMA_CM_RESPONSE_TIMEOUT 20
/* Presumably the retry count handed to the IB CM for REQ/DREQ - TODO confirm. */
#define CMA_MAX_CM_RETRIES 15
/* IB_CM_MRA_FLAG_DELAY combined with the value 24 in the low bits. */
#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
#define CMA_IBOE_PACKET_LIFETIME 16
/* RoCE GID type this file prefers; the selection logic is not visible here. */
#define CMA_PREFERRED_ROCE_GID_TYPE IB_GID_TYPE_ROCE_UDP_ENCAP

/*
 * Human-readable names for RDMA CM events, indexed by the
 * RDMA_CM_EVENT_* value. Looked up by rdma_event_msg(), which falls back
 * to "unrecognized event" for indices beyond the table or for slots that
 * the designated initializers left NULL.
 *
 * NOTE(review): the "route resolved " entry carries a trailing space. It is
 * runtime text, so it is left exactly as-is here.
 */
static const char * const cma_events[] = {
        [RDMA_CM_EVENT_ADDR_RESOLVED]         = "address resolved",
        [RDMA_CM_EVENT_ADDR_ERROR]         = "address error",
        [RDMA_CM_EVENT_ROUTE_RESOLVED]         = "route resolved ",
        [RDMA_CM_EVENT_ROUTE_ERROR]         = "route error",
        [RDMA_CM_EVENT_CONNECT_REQUEST]         = "connect request",
        [RDMA_CM_EVENT_CONNECT_RESPONSE] = "connect response",
        [RDMA_CM_EVENT_CONNECT_ERROR]         = "connect error",
        [RDMA_CM_EVENT_UNREACHABLE]         = "unreachable",
        [RDMA_CM_EVENT_REJECTED]         = "rejected",
        [RDMA_CM_EVENT_ESTABLISHED]         = "established",
        [RDMA_CM_EVENT_DISCONNECTED]         = "disconnected",
        [RDMA_CM_EVENT_DEVICE_REMOVAL]         = "device removal",
        [RDMA_CM_EVENT_MULTICAST_JOIN]         = "multicast join",
        [RDMA_CM_EVENT_MULTICAST_ERROR]         = "multicast error",
        [RDMA_CM_EVENT_ADDR_CHANGE]         = "address change",
        [RDMA_CM_EVENT_TIMEWAIT_EXIT]         = "timewait exit",
};

/*
 * Forward declarations for helpers whose definitions appear later in the
 * file (not visible in this part of the source).
 */
static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
                              enum ib_gid_type gid_type);

static void cma_netevent_work_handler(struct work_struct *_work);

/**
 * rdma_event_msg() - map an RDMA CM event code to a printable string.
 * @event: event code to describe
 *
 * Returns the matching cma_events[] entry, or "unrecognized event" when
 * @event falls outside the table or its slot was never populated.
 */
const char *__attribute_const__ rdma_event_msg(enum rdma_cm_event_type event)
{
        size_t slot = event;
        const char *msg = NULL;

        /* The enum value is used as a raw array index, so bound it first. */
        if (slot < ARRAY_SIZE(cma_events))
                msg = cma_events[slot];

        if (!msg)
                return "unrecognized event";

        return msg;
}
EXPORT_SYMBOL(rdma_event_msg);

/**
 * rdma_reject_msg() - describe a connection reject reason as text.
 * @id: communication identifier whose device/port selects the transport
 * @reason: transport-specific reject reason code
 *
 * IB and RoCE ports are delegated to ibcm_reject_msg(), iWARP ports to
 * iwcm_reject_msg(). Any other port type triggers a one-time warning and
 * yields "unrecognized transport".
 */
const char *__attribute_const__ rdma_reject_msg(struct rdma_cm_id *id,
                                                int reason)
{
        const char *msg;

        if (rdma_ib_or_roce(id->device, id->port_num)) {
                msg = ibcm_reject_msg(reason);
        } else if (rdma_protocol_iwarp(id->device, id->port_num)) {
                msg = iwcm_reject_msg(reason);
        } else {
                /* Neither transport family matched: flag it once. */
                WARN_ON_ONCE(1);
                msg = "unrecognized transport";
        }

        return msg;
}
EXPORT_SYMBOL(rdma_reject_msg);

/**
 * rdma_is_consumer_reject - tell whether a REJECT came from the remote
 *                           consumer rather than from the transport.
 * @id: Communication identifier that received the REJECT event.
 * @reason: Value returned in the REJECT event status field.
 *
 * On IB/RoCE ports the consumer reject code is IB_CM_REJ_CONSUMER_DEFINED;
 * on iWARP ports it is -ECONNREFUSED. A port that is neither produces a
 * one-time warning and is reported as "not a consumer reject".
 */
static bool rdma_is_consumer_reject(struct rdma_cm_id *id, int reason)
{
        bool by_consumer = false;

        if (rdma_ib_or_roce(id->device, id->port_num))
                by_consumer = (reason == IB_CM_REJ_CONSUMER_DEFINED);
        else if (rdma_protocol_iwarp(id->device, id->port_num))
                by_consumer = (reason == -ECONNREFUSED);
        else
                WARN_ON_ONCE(1);

        return by_consumer;
}

/**
 * rdma_consumer_reject_data() - fetch the private data of a consumer reject.
 * @id: communication identifier that received the REJECT event
 * @ev: the REJECT event; its status field holds the reject reason
 * @data_len: output, set to the private data length (0 when none)
 *
 * Returns the event's connection private data when the reject was issued
 * by the remote consumer, otherwise NULL with *@data_len set to 0.
 */
const void *rdma_consumer_reject_data(struct rdma_cm_id *id,
                                      struct rdma_cm_event *ev, u8 *data_len)
{
        /* Transport-generated rejects carry no consumer payload. */
        if (!rdma_is_consumer_reject(id, ev->status)) {
                *data_len = 0;
                return NULL;
        }

        *data_len = ev->param.conn.private_data_len;
        return ev->param.conn.private_data;
}
EXPORT_SYMBOL(rdma_consumer_reject_data);

/**
 * rdma_iw_cm_id() - return the iw_cm_id pointer for this cm_id.
 * @id: Communication Identifier
 *
 * Only devices whose node type is RDMA_NODE_RNIC have an iWARP CM id;
 * for every other node type NULL is returned.
 */
struct iw_cm_id *rdma_iw_cm_id(struct rdma_cm_id *id)
{
        if (id->device->node_type != RDMA_NODE_RNIC)
                return NULL;

        /* @id is embedded in rdma_id_private; step out to the container. */
        return container_of(id, struct rdma_id_private, id)->cm_id.iw;
}
EXPORT_SYMBOL(rdma_iw_cm_id);

/**
 * rdma_res_to_id() - return the rdma_cm_id pointer for this restrack.
 * @res: rdma resource tracking entry pointer
 *
 * @res is treated as the 'res' member of a struct rdma_id_private; the
 * address of that same object's 'id' member is returned.
 */
struct rdma_cm_id *rdma_res_to_id(struct rdma_restrack_entry *res)
{
        struct rdma_id_private *owner;

        owner = container_of(res, struct rdma_id_private, res);
        return &owner->id;
}
EXPORT_SYMBOL(rdma_res_to_id);

/* Device add/remove callbacks; defined later in the file. */
static int cma_add_one(struct ib_device *device);
static void cma_remove_one(struct ib_device *device, void *client_data);

/*
 * ib_client descriptor named "cma" that wires the two callbacks above to
 * device add and remove. Where it is registered is not visible in this
 * part of the file.
 */
static struct ib_client cma_client = {
        .name   = "cma",
        .add    = cma_add_one,
        .remove = cma_remove_one
};

/*
 * File-scope state. Most users of these objects are further down the
 * file, so the notes marked "presumably" are assumptions to confirm.
 */
static struct ib_sa_client sa_client;
/* Presumably the list of known struct cma_device (via ->list) - confirm. */
static LIST_HEAD(dev_list);
static LIST_HEAD(listen_any_list);
/* NOTE(review): which of the lists above this mutex guards is not visible here. */
static DEFINE_MUTEX(lock);
static struct rb_root id_table = RB_ROOT;
/* Serialize operations of id_table tree */
static DEFINE_SPINLOCK(id_table_lock);
static struct workqueue_struct *cma_wq;
/* Key handed to net_generic() by cma_pernet() to find our per-netns data. */
static unsigned int cma_pernet_id;

/*
 * Per-network-namespace data: one xarray per RDMA port space. The array for
 * a given port space is selected by cma_pernet_xa(); cma_ps_alloc() stores
 * struct rdma_bind_list pointers in it, indexed by port number.
 */
struct cma_pernet {
        struct xarray tcp_ps;
        struct xarray udp_ps;
        struct xarray ipoib_ps;
        struct xarray ib_ps;
};

/* Look up this module's per-namespace data for @net via net_generic(). */
static struct cma_pernet *cma_pernet(struct net *net)
{
        struct cma_pernet *pernet = net_generic(net, cma_pernet_id);

        return pernet;
}

/*
 * Map a port space to the matching xarray of the namespace's cma_pernet.
 * Returns NULL for a port space that has no table.
 */
static
struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
{
        struct cma_pernet *tables = cma_pernet(net);
        struct xarray *xa = NULL;

        switch (ps) {
        case RDMA_PS_TCP:
                xa = &tables->tcp_ps;
                break;
        case RDMA_PS_UDP:
                xa = &tables->udp_ps;
                break;
        case RDMA_PS_IPOIB:
                xa = &tables->ipoib_ps;
                break;
        case RDMA_PS_IB:
                xa = &tables->ib_ps;
                break;
        default:
                break;
        }

        return xa;
}

/*
 * Node of the id_table red-black tree. id_list collects every
 * rdma_id_private (linked through id_list_entry) that compares equal under
 * compare_netdev_and_ip(); rb_node links the entry into id_table.
 */
struct id_table_entry {
        struct list_head id_list;
        struct rb_node rb_node;
};

/* Per-ib_device state kept by this file; entries are linked on dev_list. */
struct cma_device {
        /* Linkage on the global dev_list. */
        struct list_head        list;
        struct ib_device        *device;
        /* Completed by cma_dev_put() when refcount drops to zero. */
        struct completion        comp;
        refcount_t refcount;
        /* rdma_id_private entries attached to this device (via device_item). */
        struct list_head        id_list;
        /* Per-port arrays, indexed by port - rdma_start_port(device). */
        enum ib_gid_type        *default_gid_type;
        u8                        *default_roce_tos;
};

/*
 * Entry stored in a per-namespace port-space xarray by cma_ps_alloc().
 * NOTE(review): owners presumably lists the ids bound to this port, and port
 * presumably equals the xarray index - confirm against the bind code.
 */
struct rdma_bind_list {
        enum rdma_ucm_port_space ps;
        struct hlist_head        owners;
        unsigned short                port;
};

/*
 * Store @bind_list at index @snum in the xarray for port space @ps of @net.
 * Returns the result of xa_insert().
 */
static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
                        struct rdma_bind_list *bind_list, int snum)
{
        return xa_insert(cma_pernet_xa(net, ps), snum, bind_list, GFP_KERNEL);
}

/* Return the entry stored at index @snum for port space @ps of @net. */
static struct rdma_bind_list *cma_ps_find(struct net *net,
                                          enum rdma_ucm_port_space ps, int snum)
{
        return xa_load(cma_pernet_xa(net, ps), snum);
}

/* Erase the entry stored at index @snum for port space @ps of @net. */
static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,
                          int snum)
{
        xa_erase(cma_pernet_xa(net, ps), snum);
}

/*
 * NOTE(review): presumably bit indices into a per-id options bitmask -
 * confirm against the users of CMA_OPTION_AFONLY.
 */
enum {
        CMA_OPTION_AFONLY,
};

void cma_dev_get(struct cma_device *cma_dev)
{
        refcount_inc(&cma_dev->refcount);
}

/*
 * Drop a reference on @cma_dev; when the count reaches zero, signal
 * cma_dev->comp.
 */
void cma_dev_put(struct cma_device *cma_dev)
{
        if (!refcount_dec_and_test(&cma_dev->refcount))
                return;

        complete(&cma_dev->comp);
}

/*
 * Walk dev_list under the global lock and return the first cma_device whose
 * ib_device is accepted by @filter (called with @cookie). A reference is
 * taken on the returned device before the lock is dropped. Returns NULL when
 * no device matches.
 */
struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter,
                                             void *cookie)
{
        struct cma_device *cur;
        struct cma_device *match = NULL;

        mutex_lock(&lock);
        list_for_each_entry(cur, &dev_list, list) {
                if (!filter(cur->device, cookie))
                        continue;

                match = cur;
                cma_dev_get(match);
                break;
        }
        mutex_unlock(&lock);

        return match;
}

/*
 * Return the default GID type recorded for @port of @cma_dev, or -EINVAL
 * when @port is not a valid port of the device.
 */
int cma_get_default_gid_type(struct cma_device *cma_dev,
                             u32 port)
{
        struct ib_device *ibdev = cma_dev->device;

        if (!rdma_is_port_valid(ibdev, port))
                return -EINVAL;

        /* The per-port array is indexed relative to the first port. */
        return cma_dev->default_gid_type[port - rdma_start_port(ibdev)];
}

/*
 * Record @default_gid_type as the default GID type for @port of @cma_dev.
 *
 * IB_GID_TYPE_IB is replaced by IB_GID_TYPE_ROCE on ports for which
 * rdma_protocol_roce_eth_encap() is true. Returns 0 on success, or -EINVAL
 * when the port is invalid or the (possibly replaced) type is not in the
 * port's supported GID type mask.
 */
int cma_set_default_gid_type(struct cma_device *cma_dev,
                             u32 port,
                             enum ib_gid_type default_gid_type)
{
        struct ib_device *ibdev = cma_dev->device;
        unsigned long gid_mask;

        if (!rdma_is_port_valid(ibdev, port))
                return -EINVAL;

        if (default_gid_type == IB_GID_TYPE_IB) {
                if (rdma_protocol_roce_eth_encap(ibdev, port))
                        default_gid_type = IB_GID_TYPE_ROCE;
        }

        gid_mask = roce_gid_type_mask_support(ibdev, port);
        if ((gid_mask & (1 << default_gid_type)) == 0)
                return -EINVAL;

        cma_dev->default_gid_type[port - rdma_start_port(ibdev)] =
                default_gid_type;
        return 0;
}

/*
 * Return the default RoCE ToS recorded for @port of @cma_dev, or -EINVAL
 * when @port is not a valid port of the device.
 */
int cma_get_default_roce_tos(struct cma_device *cma_dev, u32 port)
{
        struct ib_device *ibdev = cma_dev->device;

        if (!rdma_is_port_valid(ibdev, port))
                return -EINVAL;

        return cma_dev->default_roce_tos[port - rdma_start_port(ibdev)];
}

/*
 * Record @default_roce_tos as the default RoCE ToS for @port of @cma_dev.
 * Returns 0 on success, or -EINVAL when @port is not a valid port.
 */
int cma_set_default_roce_tos(struct cma_device *cma_dev, u32 port,
                             u8 default_roce_tos)
{
        struct ib_device *ibdev = cma_dev->device;

        if (!rdma_is_port_valid(ibdev, port))
                return -EINVAL;

        cma_dev->default_roce_tos[port - rdma_start_port(ibdev)] =
                default_roce_tos;
        return 0;
}
/* Return the ib_device that @cma_dev wraps. */
struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev)
{
        struct ib_device *ibdev = cma_dev->device;

        return ibdev;
}

/*
 * Device removal can occur at any time, so we need extra handling to
 * serialize notifying the user of device removal with other callbacks.
 * We do this by disabling removal notification while a callback is in
 * progress, and reporting it after the callback completes.
 */

/*
 * State for one multicast join that belongs to id_priv.
 * NOTE(review): the union presumably holds the SA multicast handle for IB
 * joins and the deferred work plus event for IBoE (RoCE) joins, and list is
 * presumably linked on rdma_id_private.mc_list - confirm against the join
 * and leave paths.
 */
struct cma_multicast {
        struct rdma_id_private *id_priv;
        union {
                struct ib_sa_multicast *sa_mc;
                struct {
                        struct work_struct work;
                        struct rdma_cm_event event;
                } iboe_join;
        };
        struct list_head        list;
        void                        *context;
        struct sockaddr_storage        addr;
        u8                        join_state;
};

/*
 * Deferred work item carrying an id, a state pair and an event.
 * NOTE(review): presumably the handler moves id from old_state to new_state
 * and then delivers event - confirm against the work handler.
 */
struct cma_work {
        struct work_struct        work;
        struct rdma_id_private        *id;
        enum rdma_cm_state        old_state;
        enum rdma_cm_state        new_state;
        struct rdma_cm_event        event;
};

/*
 * Address slot used in struct cma_hdr: either a full IPv6 address, or an
 * IPv4 address preceded by three 32-bit padding words.
 */
union cma_ip_addr {
        struct in6_addr ip6;
        struct {
                __be32 pad[3];
                __be32 addr;
        } ip4;
};

/*
 * CMA header: version fields, a port, and source and destination addresses.
 * The IP version lives in the upper nibble of ip_version; use
 * cma_get_ip_ver() and cma_set_ip_ver() to access it.
 */
struct cma_hdr {
        u8 cma_version;
        u8 ip_version;        /* IP version: 7:4 */
        __be16 port;
        union cma_ip_addr src_addr;
        union cma_ip_addr dst_addr;
};

/* NOTE(review): presumably the value carried in cma_hdr.cma_version - confirm. */
#define CMA_VERSION 0x00

/*
 * Information describing an incoming request. cma_ib_acquire_dev() reads
 * device and port from it to pick the device and port the new id binds to.
 * NOTE(review): the remaining fields are presumably filled from the received
 * CM event before listener lookup - confirm against the request handler.
 */
struct cma_req_info {
        struct sockaddr_storage listen_addr_storage;
        struct sockaddr_storage src_addr_storage;
        struct ib_device *device;
        union ib_gid local_gid;
        __be64 service_id;
        int port;
        bool has_gid;
        u16 pkey;
};

/*
 * Compare-and-exchange on id_priv->state: if the current state equals @comp,
 * replace it with @exch. Returns non-zero when the exchange happened and 0
 * when the state did not match.
 */
static int cma_comp_exch(struct rdma_id_private *id_priv,
                         enum rdma_cm_state comp, enum rdma_cm_state exch)
{
        unsigned long irq_flags;
        int matched;

        /*
         * The FSM uses a funny double locking where state is protected by both
         * the handler_mutex and the spinlock. State is not allowed to change
         * to/from a handler_mutex protected value without also holding
         * handler_mutex.
         */
        if (comp == RDMA_CM_CONNECT || exch == RDMA_CM_CONNECT)
                lockdep_assert_held(&id_priv->handler_mutex);

        spin_lock_irqsave(&id_priv->lock, irq_flags);
        matched = (id_priv->state == comp);
        if (matched)
                id_priv->state = exch;
        spin_unlock_irqrestore(&id_priv->lock, irq_flags);

        return matched;
}

/* Extract the IP version stored in the upper nibble of hdr->ip_version. */
static inline u8 cma_get_ip_ver(const struct cma_hdr *hdr)
{
        u8 raw = hdr->ip_version;

        return raw >> 4;
}

/*
 * Store @ip_ver in the upper nibble of hdr->ip_version while keeping the
 * lower nibble unchanged.
 */
static void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
{
        u8 low_nibble = hdr->ip_version & 0xF;

        hdr->ip_version = (ip_ver << 4) | low_nibble;
}

/* Return the id's source address storage viewed as a struct sockaddr. */
static struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
{
        void *src = &id_priv->id.route.addr.src_addr;

        return src;
}

/* Return the id's destination address storage viewed as a struct sockaddr. */
static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
{
        void *dst = &id_priv->id.route.addr.dst_addr;

        return dst;
}

/*
 * Join (@join true) or leave the IPv4 multicast group taken from the last
 * four bytes of @mgid on @ndev, under the RTNL lock.
 *
 * Returns 0 when @ndev has an in_device, -ENODEV when @ndev is NULL or has
 * none.
 */
static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join)
{
        struct in_device *in_dev;
        __be32 group;

        if (!ndev)
                return -ENODEV;

        rtnl_lock();
        in_dev = __in_dev_get_rtnl(ndev);
        if (in_dev) {
                group = *(__be32 *)(mgid->raw + 12);
                if (join)
                        ip_mc_inc_group(in_dev, group);
                else
                        ip_mc_dec_group(in_dev, group);
        }
        rtnl_unlock();

        return in_dev ? 0 : -ENODEV;
}

/*
 * Three-way comparison of the key (ifindex_a, sa) against the key of entry_b.
 * The key of entry_b is taken from the first rdma_id_private on its id_list:
 * that id's bound_dev_if and its destination address.
 *
 * Ordering is by interface index first, then by address family, then by the
 * IPv4 or IPv6 address. Returns a negative value, zero or a positive value;
 * zero can only come from the AF_INET and AF_INET6 address comparisons, any
 * other equal family falls through to -1.
 */
static int compare_netdev_and_ip(int ifindex_a, struct sockaddr *sa,
                                 struct id_table_entry *entry_b)
{
        struct rdma_id_private *id_priv = list_first_entry(
                &entry_b->id_list, struct rdma_id_private, id_list_entry);
        int ifindex_b = id_priv->id.route.addr.dev_addr.bound_dev_if;
        struct sockaddr *sb = cma_dst_addr(id_priv);

        if (ifindex_a != ifindex_b)
                return (ifindex_a > ifindex_b) ? 1 : -1;

        if (sa->sa_family != sb->sa_family)
                return sa->sa_family - sb->sa_family;

        /*
         * NOTE(review): the __builtin_object_size() checks look like a guard
         * against reading past *sa when the compiler knows its size - confirm
         * the intent before changing them.
         */
        if (sa->sa_family == AF_INET &&
            __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in)) {
                return memcmp(&((struct sockaddr_in *)sa)->sin_addr,
                              &((struct sockaddr_in *)sb)->sin_addr,
                              sizeof(((struct sockaddr_in *)sa)->sin_addr));
        }

        if (sa->sa_family == AF_INET6 &&
            __builtin_object_size(sa, 0) >= sizeof(struct sockaddr_in6)) {
                return ipv6_addr_cmp(&((struct sockaddr_in6 *)sa)->sin6_addr,
                                     &((struct sockaddr_in6 *)sb)->sin6_addr);
        }

        /* Same family, but neither branch above handled it. */
        return -1;
}

/*
 * Insert @node_id_priv into id_table, keyed by its bound interface index and
 * destination address. If an entry with an equal key exists, the id is
 * appended to that entry's id_list; otherwise a new entry is linked into the
 * tree.
 *
 * Returns 0 on success or -ENOMEM when the entry cannot be allocated.
 */
static int cma_add_id_to_tree(struct rdma_id_private *node_id_priv)
{
        struct rb_node **link, *parent = NULL;
        struct id_table_entry *cur, *fresh;
        unsigned long irq_flags;
        int cmp;

        /* Allocate before taking the spinlock; freed again if not needed. */
        fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);
        if (!fresh)
                return -ENOMEM;

        spin_lock_irqsave(&id_table_lock, irq_flags);
        for (link = &id_table.rb_node; *link; ) {
                cur = container_of(*link, struct id_table_entry, rb_node);
                cmp = compare_netdev_and_ip(
                        node_id_priv->id.route.addr.dev_addr.bound_dev_if,
                        cma_dst_addr(node_id_priv), cur);

                parent = *link;
                if (cmp < 0) {
                        link = &(*link)->rb_left;
                        continue;
                }
                if (cmp > 0) {
                        link = &(*link)->rb_right;
                        continue;
                }

                /* Equal key: share the existing entry. */
                list_add_tail(&node_id_priv->id_list_entry, &cur->id_list);
                kfree(fresh);
                goto unlock;
        }

        INIT_LIST_HEAD(&fresh->id_list);
        list_add_tail(&node_id_priv->id_list_entry, &fresh->id_list);

        rb_link_node(&fresh->rb_node, parent, link);
        rb_insert_color(&fresh->rb_node, &id_table);

unlock:
        spin_unlock_irqrestore(&id_table_lock, irq_flags);
        return 0;
}

/*
 * Search @root for the entry whose key equals (@ifindex, @sa) according to
 * compare_netdev_and_ip(). Returns the entry, or NULL when there is none.
 */
static struct id_table_entry *
node_from_ndev_ip(struct rb_root *root, int ifindex, struct sockaddr *sa)
{
        struct rb_node *cur = root->rb_node;

        while (cur) {
                struct id_table_entry *entry =
                        container_of(cur, struct id_table_entry, rb_node);
                int cmp = compare_netdev_and_ip(ifindex, sa, entry);

                if (!cmp)
                        return entry;

                cur = (cmp < 0) ? cur->rb_left : cur->rb_right;
        }

        return NULL;
}

/*
 * Unlink @id_priv from its id_table entry, if it is on one. The entry is
 * erased from the tree and freed once its id_list becomes empty.
 */
static void cma_remove_id_from_tree(struct rdma_id_private *id_priv)
{
        struct id_table_entry *entry;
        unsigned long irq_flags;

        spin_lock_irqsave(&id_table_lock, irq_flags);
        if (!list_empty(&id_priv->id_list_entry)) {
                entry = node_from_ndev_ip(
                        &id_table,
                        id_priv->id.route.addr.dev_addr.bound_dev_if,
                        cma_dst_addr(id_priv));
                if (entry) {
                        list_del_init(&id_priv->id_list_entry);
                        if (list_empty(&entry->id_list)) {
                                rb_erase(&entry->rb_node, &id_table);
                                kfree(entry);
                        }
                }
        }
        spin_unlock_irqrestore(&id_table_lock, irq_flags);
}

static void _cma_attach_to_dev(struct rdma_id_private *id_priv,
                               struct cma_device *cma_dev)
{
        cma_dev_get(cma_dev);
        id_priv->cma_dev = cma_dev;
        id_priv->id.device = cma_dev->device;
        id_priv->id.route.addr.dev_addr.transport =
                rdma_node_get_transport(cma_dev->device->node_type);
        list_add_tail(&id_priv->device_item, &cma_dev->id_list);

        trace_cm_id_attach(id_priv, cma_dev->device);
}

static void cma_attach_to_dev(struct rdma_id_private *id_priv,
                              struct cma_device *cma_dev)
{
        _cma_attach_to_dev(id_priv, cma_dev);
        id_priv->gid_type =
                cma_dev->default_gid_type[id_priv->id.port_num -
                                          rdma_start_port(cma_dev->device)];
}

static void cma_release_dev(struct rdma_id_private *id_priv)
{
        mutex_lock(&lock);
        list_del_init(&id_priv->device_item);
        cma_dev_put(id_priv->cma_dev);
        id_priv->cma_dev = NULL;
        id_priv->id.device = NULL;
        if (id_priv->id.route.addr.dev_addr.sgid_attr) {
                rdma_put_gid_attr(id_priv->id.route.addr.dev_addr.sgid_attr);
                id_priv->id.route.addr.dev_addr.sgid_attr = NULL;
        }
        mutex_unlock(&lock);
}

static inline unsigned short cma_family(struct rdma_id_private *id_priv)
{
        return id_priv->id.route.addr.src_addr.ss_family;
}

/*
 * Set id_priv->qkey to the default for the id's port space.
 *
 * UDP and IB port spaces use RDMA_UDP_QKEY. IPoIB takes the qkey from the
 * multicast member record looked up for the device address's MGID. Other
 * port spaces leave the qkey untouched.
 *
 * Returns 0, or the error from ib_sa_get_mcmember_rec() for IPoIB.
 */
static int cma_set_default_qkey(struct rdma_id_private *id_priv)
{
        struct ib_sa_mcmember_rec rec;
        int err;

        switch (id_priv->id.ps) {
        case RDMA_PS_UDP:
        case RDMA_PS_IB:
                id_priv->qkey = RDMA_UDP_QKEY;
                return 0;
        case RDMA_PS_IPOIB:
                ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
                err = ib_sa_get_mcmember_rec(id_priv->id.device,
                                             id_priv->id.port_num, &rec.mgid,
                                             &rec);
                if (err)
                        return err;

                id_priv->qkey = be32_to_cpu(rec.qkey);
                return 0;
        default:
                return 0;
        }
}

/*
 * Set the id's qkey to @qkey. Returns -EINVAL when @qkey is zero or when a
 * different non-zero qkey is already set; returns 0 otherwise.
 */
static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
{
        if (!qkey)
                return -EINVAL;

        /* An already configured qkey may only be re-set to the same value. */
        if (id_priv->qkey && id_priv->qkey != qkey)
                return -EINVAL;

        id_priv->qkey = qkey;
        return 0;
}

/*
 * Fill @dev_addr from an AF_IB address: mark the device type as InfiniBand
 * and copy the source GID and pkey from @sib.
 */
static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
{
        union ib_gid *sgid = (union ib_gid *)&sib->sib_addr;

        dev_addr->dev_type = ARPHRD_INFINIBAND;
        rdma_addr_set_sgid(dev_addr, sgid);
        ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
}

/*
 * Translate @addr into @dev_addr. AF_IB addresses are handled locally by
 * cma_translate_ib() and always succeed; every other family is passed to
 * rdma_translate_ip(), whose result is returned.
 */
static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
{
        if (addr->sa_family == AF_IB) {
                cma_translate_ib((struct sockaddr_ib *)addr, dev_addr);
                return 0;
        }

        return rdma_translate_ip(addr, dev_addr);
}

/*
 * Check whether @port of @device can serve the address already resolved into
 * id_priv's dev_addr and, if so, look up the matching source GID attribute.
 *
 * Returns the ib_gid_attr obtained from rdma_get_gid_attr() (iWARP ports) or
 * rdma_find_gid_by_port() (all other ports), or an ERR_PTR. ERR_PTR(-ENODEV)
 * is the result whenever one of the precondition checks fails.
 */
static const struct ib_gid_attr *
cma_validate_port(struct ib_device *device, u32 port,
                  enum ib_gid_type gid_type,
                  union ib_gid *gid,
                  struct rdma_id_private *id_priv)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        const struct ib_gid_attr *sgid_attr = ERR_PTR(-ENODEV);
        int bound_if_index = dev_addr->bound_dev_if;
        int dev_type = dev_addr->dev_type;
        struct net_device *ndev = NULL;
        struct net_device *pdev = NULL;

        /* The device must be accessible from the id's network namespace. */
        if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
                goto out;

        /* An InfiniBand address needs an IB port, and vice versa. */
        if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
                goto out;

        if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port))
                goto out;

        /*
         * For drivers that do not associate more than one net device with
         * their gid tables, such as iWARP drivers, it is sufficient to
         * return the first table entry.
         *
         * Other driver classes might be included in the future.
         */
        if (rdma_protocol_iwarp(device, port)) {
                sgid_attr = rdma_get_gid_attr(device, port, 0);
                if (IS_ERR(sgid_attr))
                        goto out;

                rcu_read_lock();
                ndev = rcu_dereference(sgid_attr->ndev);
                /*
                 * If the GID's netdev is not the bound interface, accept the
                 * pair when one of them is a VLAN device whose real device is
                 * the other one; bound_if_index is adjusted so the final
                 * comparison below succeeds in that case.
                 */
                if (ndev->ifindex != bound_if_index) {
                        pdev = dev_get_by_index_rcu(dev_addr->net, bound_if_index);
                        if (pdev) {
                                if (is_vlan_dev(pdev)) {
                                        pdev = vlan_dev_real_dev(pdev);
                                        if (ndev->ifindex == pdev->ifindex)
                                                bound_if_index = pdev->ifindex;
                                }
                                if (is_vlan_dev(ndev)) {
                                        pdev = vlan_dev_real_dev(ndev);
                                        if (bound_if_index == pdev->ifindex)
                                                bound_if_index = ndev->ifindex;
                                }
                        }
                }
                /* Wrong namespace or interface: give the attribute back. */
                if (!net_eq(dev_net(ndev), dev_addr->net) ||
                    ndev->ifindex != bound_if_index) {
                        rdma_put_gid_attr(sgid_attr);
                        sgid_attr = ERR_PTR(-ENODEV);
                }
                rcu_read_unlock();
                goto out;
        }

        /*
         * For a RXE device, it should work with TUN device and normal ethernet
         * devices. Use driver_id to check if a device is a RXE device or not.
         * ARPHDR_NONE means a TUN device.
         */
        if (device->ops.driver_id == RDMA_DRIVER_RXE) {
                if ((dev_type == ARPHRD_NONE || dev_type == ARPHRD_ETHER)
                        && rdma_protocol_roce(device, port)) {
                        ndev = dev_get_by_index(dev_addr->net, bound_if_index);
                        if (!ndev)
                                goto out;
                }
        } else {
                if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) {
                        ndev = dev_get_by_index(dev_addr->net, bound_if_index);
                        if (!ndev)
                                goto out;
                } else {
                        /* Not an Ethernet RoCE lookup: search for an IB GID. */
                        gid_type = IB_GID_TYPE_IB;
                }
        }

        sgid_attr = rdma_find_gid_by_port(device, gid, gid_type, port, ndev);
        /*
         * NOTE(review): ndev is still NULL here when no netdev lookup was
         * done above, so this relies on dev_put() accepting NULL - confirm.
         */
        dev_put(ndev);
out:
        return sgid_attr;
}

static void cma_bind_sgid_attr(struct rdma_id_private *id_priv,
                               const struct ib_gid_attr *sgid_attr)
{
        WARN_ON(id_priv->id.route.addr.dev_addr.sgid_attr);
        id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr;
}

/**
 * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute
 * based on source ip address.
 * @id_priv:        cm_id which should be bound to cma device
 *
 * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute
 * based on source IP address. It returns 0 on success or error code otherwise.
 * It is applicable to active and passive side cm_id.
 *
 * Every port of every registered device is tried, under the global lock,
 * until cma_validate_port() accepts one. Returns -EINVAL for an IPoIB id
 * whose device type is not InfiniBand and -ENODEV when no port matches.
 */
static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        const struct ib_gid_attr *sgid_attr;
        union ib_gid gid, iboe_gid, *gidp;
        struct cma_device *cma_dev;
        enum ib_gid_type gid_type;
        int ret = -ENODEV;
        u32 port;

        if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
            id_priv->id.ps == RDMA_PS_IPOIB)
                return -EINVAL;

        /* RoCE ports are matched by the GID derived from the source IP ... */
        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
                    &iboe_gid);

        /* ... all other ports by the GID stored in the device address. */
        memcpy(&gid, dev_addr->src_dev_addr +
               rdma_addr_gid_offset(dev_addr), sizeof(gid));

        mutex_lock(&lock);
        list_for_each_entry(cma_dev, &dev_list, list) {
                rdma_for_each_port (cma_dev->device, port) {
                        gidp = rdma_protocol_roce(cma_dev->device, port) ?
                               &iboe_gid : &gid;
                        /*
                         * Index relative to the device's first port, the same
                         * way cma_attach_to_dev() and the default_gid_type
                         * accessors do, instead of assuming ports start at 1.
                         */
                        gid_type = cma_dev->default_gid_type[
                                port - rdma_start_port(cma_dev->device)];
                        sgid_attr = cma_validate_port(cma_dev->device, port,
                                                      gid_type, gidp, id_priv);
                        if (!IS_ERR(sgid_attr)) {
                                id_priv->id.port_num = port;
                                cma_bind_sgid_attr(id_priv, sgid_attr);
                                cma_attach_to_dev(id_priv, cma_dev);
                                ret = 0;
                                goto out;
                        }
                }
        }
out:
        mutex_unlock(&lock);
        return ret;
}

/**
 * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute
 * @id_priv:                cm id to bind to cma device
 * @listen_id_priv:        listener cm id to match against
 * @req:                Pointer to req structure containing incoming
 *                        request information
 * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when
 * rdma device matches for listen_id and incoming request. It also verifies
 * that a GID table entry is present for the source address.
 * Returns 0 on success, or returns error code otherwise.
 */
static int cma_ib_acquire_dev(struct rdma_id_private *id_priv,
                              const struct rdma_id_private *listen_id_priv,
                              struct cma_req_info *req)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        const struct ib_gid_attr *sgid_attr;
        enum ib_gid_type gid_type;
        union ib_gid gid;

        if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
            id_priv->id.ps == RDMA_PS_IPOIB)
                return -EINVAL;

        /* RoCE matches on the IP-derived GID, IB on the device address GID. */
        if (rdma_protocol_roce(req->device, req->port))
                rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
                            &gid);
        else
                memcpy(&gid, dev_addr->src_dev_addr +
                       rdma_addr_gid_offset(dev_addr), sizeof(gid));

        /*
         * Index relative to the listener device's first port, the same way
         * cma_attach_to_dev() and the default_gid_type accessors do, instead
         * of assuming ports start at 1.
         */
        gid_type = listen_id_priv->cma_dev->default_gid_type[
                req->port - rdma_start_port(listen_id_priv->cma_dev->device)];
        sgid_attr = cma_validate_port(req->device, req->port,
                                      gid_type, &gid, id_priv);
        if (IS_ERR(sgid_attr))
                return PTR_ERR(sgid_attr);

        id_priv->id.port_num = req->port;
        cma_bind_sgid_attr(id_priv, sgid_attr);
        /* Need to acquire lock to protect against reader
         * of cma_dev->id_list such as cma_netdev_callback() and
         * cma_process_remove().
         */
        mutex_lock(&lock);
        cma_attach_to_dev(id_priv, listen_id_priv->cma_dev);
        mutex_unlock(&lock);
        rdma_restrack_add(&id_priv->res);
        return 0;
}

/*
 * Bind @id_priv to a device, port and source GID attribute for a request
 * that arrived on @listen_id_priv.
 *
 * The listener's own device and port are tried first, using the listener's
 * gid_type. If they do not validate, every other port of every registered
 * device is tried with that device's default GID type. On success the id is
 * attached to the chosen device and added to restrack.
 *
 * Returns 0 on success, -EINVAL for an IPoIB id whose device type is not
 * InfiniBand, or -ENODEV when no port validates.
 */
static int cma_iw_acquire_dev(struct rdma_id_private *id_priv,
                              const struct rdma_id_private *listen_id_priv)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        const struct ib_gid_attr *sgid_attr;
        struct cma_device *cma_dev;
        enum ib_gid_type gid_type;
        int ret = -ENODEV;
        union ib_gid gid;
        u32 port;

        if (dev_addr->dev_type != ARPHRD_INFINIBAND &&
            id_priv->id.ps == RDMA_PS_IPOIB)
                return -EINVAL;

        memcpy(&gid, dev_addr->src_dev_addr +
               rdma_addr_gid_offset(dev_addr), sizeof(gid));

        mutex_lock(&lock);

        /* First choice: the device and port the listener is bound to. */
        cma_dev = listen_id_priv->cma_dev;
        port = listen_id_priv->id.port_num;
        gid_type = listen_id_priv->gid_type;
        sgid_attr = cma_validate_port(cma_dev->device, port,
                                      gid_type, &gid, id_priv);
        if (!IS_ERR(sgid_attr)) {
                id_priv->id.port_num = port;
                cma_bind_sgid_attr(id_priv, sgid_attr);
                ret = 0;
                goto out;
        }

        list_for_each_entry(cma_dev, &dev_list, list) {
                rdma_for_each_port (cma_dev->device, port) {
                        /* The listener's own port was already tried above. */
                        if (listen_id_priv->cma_dev == cma_dev &&
                            listen_id_priv->id.port_num == port)
                                continue;

                        /*
                         * Index relative to the device's first port, the same
                         * way cma_attach_to_dev() and the default_gid_type
                         * accessors do, instead of assuming ports start at 1.
                         */
                        gid_type = cma_dev->default_gid_type[
                                port - rdma_start_port(cma_dev->device)];
                        sgid_attr = cma_validate_port(cma_dev->device, port,
                                                      gid_type, &gid, id_priv);
                        if (!IS_ERR(sgid_attr)) {
                                id_priv->id.port_num = port;
                                cma_bind_sgid_attr(id_priv, sgid_attr);
                                ret = 0;
                                goto out;
                        }
                }
        }

out:
        /* cma_dev still points at the device whose port validated. */
        if (!ret) {
                cma_attach_to_dev(id_priv, cma_dev);
                rdma_restrack_add(&id_priv->res);
        }

        mutex_unlock(&lock);
        return ret;
}

/*
 * Select the source IB device and address to reach the destination IB address.
 *
 * Every AF_IB capable port that has the destination's pkey cached and a
 * readable port state is scanned, GID by GID. The first GID that either
 * equals the destination GID, or shares its subnet prefix on an active port,
 * is chosen. On success the id is attached to that device and port, added to
 * restrack, and the chosen GID is written into the id's source address.
 *
 * Returns 0 on success or -ENODEV when no GID qualifies.
 */
static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
{
        struct cma_device *cma_dev, *cur_dev;
        struct sockaddr_ib *addr;
        union ib_gid gid, sgid, *dgid;
        unsigned int p;
        u16 pkey, index;
        enum ib_port_state port_state;
        int ret;
        int i;

        cma_dev = NULL;
        addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
        dgid = (union ib_gid *) &addr->sib_addr;
        pkey = ntohs(addr->sib_pkey);

        mutex_lock(&lock);
        list_for_each_entry(cur_dev, &dev_list, list) {
                rdma_for_each_port (cur_dev->device, p) {
                        if (!rdma_cap_af_ib(cur_dev->device, p))
                                continue;

                        /* Only the lookup result matters; index is not used. */
                        if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
                                continue;

                        if (ib_get_cached_port_state(cur_dev->device, p, &port_state))
                                continue;

                        for (i = 0; i < cur_dev->device->port_data[p].immutable.gid_tbl_len;
                             ++i) {
                                ret = rdma_query_gid(cur_dev->device, p, i,
                                                     &gid);
                                if (ret)
                                        continue;

                                /* Exact match with the destination GID. */
                                if (!memcmp(&gid, dgid, sizeof(gid))) {
                                        cma_dev = cur_dev;
                                        sgid = gid;
                                        id_priv->id.port_num = p;
                                        goto found;
                                }

                                /*
                                 * Same subnet on an active port. cma_dev is
                                 * only ever assigned right before a jump to
                                 * found, so !cma_dev always holds here.
                                 */
                                if (!cma_dev && (gid.global.subnet_prefix ==
                                    dgid->global.subnet_prefix) &&
                                    port_state == IB_PORT_ACTIVE) {
                                        cma_dev = cur_dev;
                                        sgid = gid;
                                        id_priv->id.port_num = p;
                                        goto found;
                                }
                        }
                }
        }
        mutex_unlock(&lock);
        return -ENODEV;

found:
        /* Still holding lock: attach before anyone can remove the device. */
        cma_attach_to_dev(id_priv, cma_dev);
        rdma_restrack_add(&id_priv->res);
        mutex_unlock(&lock);
        /* Publish the selected GID as the id's source address. */
        addr = (struct sockaddr_ib *)cma_src_addr(id_priv);
        memcpy(&addr->sib_addr, &sgid, sizeof(sgid));
        cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
        return 0;
}

static void cma_id_get(struct rdma_id_private *id_priv)
{
        refcount_inc(&id_priv->refcount);
}

/*
 * Drop a reference on @id_priv; when the count reaches zero, signal
 * id_priv->comp.
 */
static void cma_id_put(struct rdma_id_private *id_priv)
{
        if (!refcount_dec_and_test(&id_priv->refcount))
                return;

        complete(&id_priv->comp);
}

/*
 * Allocate and initialise a private cm id in the RDMA_CM_IDLE state with a
 * single reference and a reference on @net. The id is prepared for restrack
 * and, when @parent is given, takes its restrack name from the parent.
 *
 * Returns the new id or ERR_PTR(-ENOMEM).
 */
static struct rdma_id_private *
__rdma_create_id(struct net *net, rdma_cm_event_handler event_handler,
                 void *context, enum rdma_ucm_port_space ps,
                 enum ib_qp_type qp_type, const struct rdma_id_private *parent)
{
        struct rdma_id_private *priv;
        struct rdma_cm_id *cm_id;

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
                return ERR_PTR(-ENOMEM);

        /* Caller-supplied identity of the public cm id. */
        cm_id = &priv->id;
        cm_id->context = context;
        cm_id->event_handler = event_handler;
        cm_id->ps = ps;
        cm_id->qp_type = qp_type;

        /* Private state: idle, nothing overridden yet. */
        priv->state = RDMA_CM_IDLE;
        priv->tos_set = false;
        priv->timeout_set = false;
        priv->min_rnr_timer_set = false;
        priv->gid_type = IB_GID_TYPE_IB;

        /* Locks, completion and the initial reference. */
        spin_lock_init(&priv->lock);
        mutex_init(&priv->qp_mutex);
        mutex_init(&priv->handler_mutex);
        init_completion(&priv->comp);
        refcount_set(&priv->refcount, 1);

        INIT_LIST_HEAD(&priv->device_item);
        INIT_LIST_HEAD(&priv->id_list_entry);
        INIT_LIST_HEAD(&priv->listen_list);
        INIT_LIST_HEAD(&priv->mc_list);

        /* Random starting sequence number, limited to its low 24 bits. */
        get_random_bytes(&priv->seq_num, sizeof(priv->seq_num));
        priv->seq_num &= 0x00ffffff;

        cm_id->route.addr.dev_addr.net = get_net(net);
        INIT_WORK(&cm_id->net_work, cma_netevent_work_handler);

        rdma_restrack_new(&priv->res, RDMA_RESTRACK_CM_ID);
        if (parent)
                rdma_restrack_parent_name(&priv->res, &parent->res);

        return priv;
}

/*
 * Create a cm id without a parent and set its restrack name to @caller.
 * Returns the public rdma_cm_id or the ERR_PTR from __rdma_create_id().
 */
struct rdma_cm_id *
__rdma_create_kernel_id(struct net *net, rdma_cm_event_handler event_handler,
                        void *context, enum rdma_ucm_port_space ps,
                        enum ib_qp_type qp_type, const char *caller)
{
        struct rdma_id_private *priv =
                __rdma_create_id(net, event_handler, context, ps, qp_type,
                                 NULL);

        if (IS_ERR(priv))
                return ERR_CAST(priv);

        rdma_restrack_set_name(&priv->res, caller);
        return &priv->id;
}
EXPORT_SYMBOL(__rdma_create_kernel_id);

/*
 * Create a cm id in the calling task's network namespace, without a parent,
 * and with its restrack name set from a NULL caller string.
 * Returns the public rdma_cm_id or the ERR_PTR from __rdma_create_id().
 */
struct rdma_cm_id *rdma_create_user_id(rdma_cm_event_handler event_handler,
                                       void *context,
                                       enum rdma_ucm_port_space ps,
                                       enum ib_qp_type qp_type)
{
        struct rdma_id_private *priv =
                __rdma_create_id(current->nsproxy->net_ns, event_handler,
                                 context, ps, qp_type, NULL);

        if (IS_ERR(priv))
                return ERR_CAST(priv);

        rdma_restrack_set_name(&priv->res, NULL);
        return &priv->id;
}
EXPORT_SYMBOL(rdma_create_user_id);

static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
{
        struct ib_qp_attr qp_attr;
        int qp_attr_mask, ret;

        qp_attr.qp_state = IB_QPS_INIT;
        ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
        if (ret)
                return ret;

        ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
        if (ret)
                return ret;

        qp_attr.qp_state = IB_QPS_RTR;
        ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
        if (ret)
                return ret;

        qp_attr.qp_state = IB_QPS_RTS;
        qp_attr.sq_psn = 0;
        ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);

        return ret;
}

static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
{
        struct ib_qp_attr qp_attr;
        int qp_attr_mask, ret;

        qp_attr.qp_state = IB_QPS_INIT;
        ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
        if (ret)
                return ret;

        return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
}

/*
 * Create a QP on @pd for @id and bring it to its initial state.
 *
 * @qp_init_attr->port_num is overwritten with the port of @id before the
 * QP is created.  UD IDs are initialised with cma_init_ud_qp(), all others
 * with cma_init_conn_qp().  On success the QP, its number and whether it
 * uses an SRQ are recorded on the ID.  Every exit emits a
 * trace_cm_qp_create() event carrying the result code.
 *
 * Return: 0 on success; -EINVAL when @pd and @id refer to different
 * devices; otherwise the error from ib_create_qp() or the init helper (the
 * QP is destroyed again in the latter case).
 */
int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
                   struct ib_qp_init_attr *qp_init_attr)
{
        struct rdma_id_private *id_priv =
                container_of(id, struct rdma_id_private, id);
        struct ib_qp *new_qp;
        int err;

        if (pd->device != id->device) {
                err = -EINVAL;
                goto trace_failure;
        }

        qp_init_attr->port_num = id->port_num;
        new_qp = ib_create_qp(pd, qp_init_attr);
        if (IS_ERR(new_qp)) {
                err = PTR_ERR(new_qp);
                goto trace_failure;
        }

        err = (id->qp_type == IB_QPT_UD) ?
                      cma_init_ud_qp(id_priv, new_qp) :
                      cma_init_conn_qp(id_priv, new_qp);
        if (err) {
                ib_destroy_qp(new_qp);
                goto trace_failure;
        }

        id->qp = new_qp;
        id_priv->qp_num = new_qp->qp_num;
        id_priv->srq = (new_qp->srq != NULL);
        trace_cm_qp_create(id_priv, pd, qp_init_attr, 0);
        return 0;

trace_failure:
        trace_cm_qp_create(id_priv, pd, qp_init_attr, err);
        return err;
}
EXPORT_SYMBOL(rdma_create_qp);

/*
 * Destroy the QP attached to @id and clear the ID's QP pointer.
 *
 * The destroy and the pointer reset happen under qp_mutex, the same mutex
 * the cma_modify_qp_*() helpers take before looking at id.qp.  A
 * trace_cm_qp_destroy() event is emitted before the mutex is taken.
 */
void rdma_destroy_qp(struct rdma_cm_id *id)
{
        struct rdma_id_private *priv =
                container_of(id, struct rdma_id_private, id);

        trace_cm_qp_destroy(priv);

        mutex_lock(&priv->qp_mutex);
        ib_destroy_qp(priv->id.qp);
        priv->id.qp = NULL;
        mutex_unlock(&priv->qp_mutex);
}
EXPORT_SYMBOL(rdma_destroy_qp);

static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
                             struct rdma_conn_param *conn_param)
{
        struct ib_qp_attr qp_attr;
        int qp_attr_mask, ret;

        mutex_lock(&id_priv->qp_mutex);
        if (!id_priv->id.qp) {
                ret = 0;
                goto out;
        }

        /* Need to update QP attributes from default values. */
        qp_attr.qp_state = IB_QPS_INIT;
        ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
        if (ret)
                goto out;

        ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
        if (ret)
                goto out;

        qp_attr.qp_state = IB_QPS_RTR;
        ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
        if (ret)
                goto out;

        BUG_ON(id_priv->cma_dev->device != id_priv->id.device);

        if (conn_param)
                qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
        ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
out:
        mutex_unlock(&id_priv->qp_mutex);
        return ret;
}

static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
                             struct rdma_conn_param *conn_param)
{
        struct ib_qp_attr qp_attr;
        int qp_attr_mask, ret;

        mutex_lock(&id_priv->qp_mutex);
        if (!id_priv->id.qp) {
                ret = 0;
                goto out;
        }

        qp_attr.qp_state = IB_QPS_RTS;
        ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
        if (ret)
                goto out;

        if (conn_param)
                qp_attr.max_rd_atomic = conn_param->initiator_depth;
        ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
out:
        mutex_unlock(&id_priv->qp_mutex);
        return ret;
}

static int cma_modify_qp_err(struct rdma_id_private *id_priv)
{
        struct ib_qp_attr qp_attr;
        int ret;

        mutex_lock(&id_priv->qp_mutex);
        if (!id_priv->id.qp) {
                ret = 0;
                goto out;
        }

        qp_attr.qp_state = IB_QPS_ERR;
        ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
out:
        mutex_unlock(&id_priv->qp_mutex);
        return ret;
}

static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
                               struct ib_qp_attr *qp_attr, int *qp_attr_mask)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        int ret;
        u16 pkey;

        if (rdma_cap_eth_ah(id_priv->id.device, id_priv->id.port_num))
                pkey = 0xffff;
        else
                pkey = ib_addr_get_pkey(dev_addr);

        ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
                                  pkey, &qp_attr->pkey_index);
        if (ret)
                return ret;

        qp_attr->port_num = id_priv->id.port_num;
        *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;

        if (id_priv->id.qp_type == IB_QPT_UD) {
                ret = cma_set_default_qkey(id_priv);
                if (ret)
                        return ret;

                qp_attr->qkey = id_priv->qkey;
                *qp_attr_mask |= IB_QP_QKEY;
        } else {
                qp_attr->qp_access_flags = 0;
                *qp_attr_mask |= IB_QP_ACCESS_FLAGS;
        }
        return 0;
}

/*
 * rdma_init_qp_attr - compute QP attributes for a state transition
 * @id:           CM ID the QP belongs to
 * @qp_attr:      in: qp_state selects the target state;
 *                out: attributes for that transition
 * @qp_attr_mask: out: IB_QP_* bits naming the valid fields of @qp_attr
 *
 * On ports with IB CM capability the attributes come from
 * cma_ib_init_qp_attr() when the ID has no IB cm_id yet or is a UD ID, and
 * from ib_cm_init_qp_attr() otherwise; for an RTR request the receive PSN
 * is set to the ID's seq_num.  On ports with iWARP CM capability the
 * attributes come from iw_cm_init_qp_attr() (or are just zeroed access
 * flags when no iw cm_id exists) and the port number is always added.
 *
 * After a successful fill, a timeout / min RNR timer configured on the ID
 * overrides the computed value whenever the corresponding mask bit is set.
 *
 * Return: 0 on success, -ENOSYS when the port has neither CM capability,
 * or the error returned by the attribute helper.
 */
int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
                       int *qp_attr_mask)
{
        struct rdma_id_private *id_priv;
        int ret = 0;

        id_priv = container_of(id, struct rdma_id_private, id);
        if (rdma_cap_ib_cm(id->device, id->port_num)) {
                if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
                        ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
                else
                        ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
                                                 qp_attr_mask);

                if (qp_attr->qp_state == IB_QPS_RTR)
                        qp_attr->rq_psn = id_priv->seq_num;
        } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
                if (!id_priv->cm_id.iw) {
                        qp_attr->qp_access_flags = 0;
                        *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
                } else
                        ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
                                                 qp_attr_mask);
                qp_attr->port_num = id_priv->id.port_num;
                *qp_attr_mask |= IB_QP_PORT;
        } else {
                ret = -ENOSYS;
        }

        /*
         * On failure *qp_attr_mask is not guaranteed to have been written:
         * the -ENOSYS branch never touches it and cma_ib_init_qp_attr()
         * bails out before setting it when the pkey lookup fails.  Callers
         * in this file pass an uninitialised mask, so do not test its bits
         * unless the fill succeeded.
         */
        if (ret)
                return ret;

        if ((*qp_attr_mask & IB_QP_TIMEOUT) && id_priv->timeout_set)
                qp_attr->timeout = id_priv->timeout;

        if ((*qp_attr_mask & IB_QP_MIN_RNR_TIMER) && id_priv->min_rnr_timer_set)
                qp_attr->min_rnr_timer = id_priv->min_rnr_timer;

        return 0;
}
EXPORT_SYMBOL(rdma_init_qp_attr);

/*
 * Report whether @addr is the "zero" address of its family, as decided by
 * ipv4_is_zeronet(), ipv6_addr_any() or ib_addr_any().  Any other address
 * family yields false.
 */
static inline bool cma_zero_addr(const struct sockaddr *addr)
{
        if (addr->sa_family == AF_INET)
                return ipv4_is_zeronet(
                        ((struct sockaddr_in *)addr)->sin_addr.s_addr);

        if (addr->sa_family == AF_INET6)
                return ipv6_addr_any(
                        &((struct sockaddr_in6 *)addr)->sin6_addr);

        if (addr->sa_family == AF_IB)
                return ib_addr_any(&((struct sockaddr_ib *)addr)->sib_addr);

        return false;
}

/*
 * Report whether @addr is a loopback address of its family, as decided by
 * ipv4_is_loopback(), ipv6_addr_loopback() or ib_addr_loopback().  Any
 * other address family yields false.
 */
static inline bool cma_loopback_addr(const struct sockaddr *addr)
{
        if (addr->sa_family == AF_INET)
                return ipv4_is_loopback(
                        ((struct sockaddr_in *)addr)->sin_addr.s_addr);

        if (addr->sa_family == AF_INET6)
                return ipv6_addr_loopback(
                        &((struct sockaddr_in6 *)addr)->sin6_addr);

        if (addr->sa_family == AF_IB)
                return ib_addr_loopback(
                        &((struct sockaddr_ib *)addr)->sib_addr);

        return false;
}

static inline bool cma_any_addr(const struct sockaddr *addr)
{
        return cma_zero_addr(addr) || cma_loopback_addr(addr);
}

/*
 * Compare two socket addresses (address part only, not the port).
 *
 * Return: 0 when they match; -1 when the families differ; otherwise a
 * non-zero value.  IPv6 link-local addresses additionally have to agree on
 * scope_id.  Families other than AF_INET / AF_INET6 are compared as AF_IB
 * addresses through ib_addr_cmp().
 */
static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)
{
        struct sockaddr_in6 *src6, *dst6;

        if (src->sa_family != dst->sa_family)
                return -1;

        if (src->sa_family == AF_INET)
                return ((struct sockaddr_in *)src)->sin_addr.s_addr !=
                       ((struct sockaddr_in *)dst)->sin_addr.s_addr;

        if (src->sa_family != AF_INET6)
                return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
                                   &((struct sockaddr_ib *) dst)->sib_addr);

        src6 = (struct sockaddr_in6 *)src;
        dst6 = (struct sockaddr_in6 *)dst;

        if (ipv6_addr_cmp(&src6->sin6_addr, &dst6->sin6_addr))
                return 1;

        /* Only link-local addresses need their scope_ids compared. */
        if (!(ipv6_addr_type(&dst6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
                return 0;

        return src6->sin6_scope_id != dst6->sin6_scope_id;
}

static __be16 cma_port(const struct sockaddr *addr)
{
        struct sockaddr_ib *sib;

        switch (addr->sa_family) {
        case AF_INET:
                return ((struct sockaddr_in *) addr)->sin_port;
        case AF_INET6:
                return ((struct sockaddr_in6 *) addr)->sin6_port;
        case AF_IB:
                sib = (struct sockaddr_ib *) addr;
                return htons((u16) (be64_to_cpu(sib->sib_sid) &
                                    be64_to_cpu(sib->sib_sid_mask)));
        default:
                return 0;
        }
}

/* Non-zero when cma_port() reports port 0 for @addr. */
static inline int cma_any_port(const struct sockaddr *addr)
{
        return cma_port(addr) == 0;
}

/*
 * Fill AF_IB socket addresses for an incoming request.
 *
 * @src_addr (if non-NULL) is taken from @path when one is supplied and
 * from the listener's own source address otherwise; its service ID mask is
 * always set to all ones.  @dst_addr (if non-NULL) gets the AF_IB family
 * and, only when @path is supplied, the pkey, flow label and destination
 * GID of @path.
 */
static void cma_save_ib_info(struct sockaddr *src_addr,
                             struct sockaddr *dst_addr,
                             const struct rdma_cm_id *listen_id,
                             const struct sa_path_rec *path)
{
        struct sockaddr_ib *listener, *sib;

        listener = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;

        if (src_addr) {
                sib = (struct sockaddr_ib *)src_addr;
                sib->sib_family = AF_IB;
                if (!path) {
                        sib->sib_pkey = listener->sib_pkey;
                        sib->sib_flowinfo = listener->sib_flowinfo;
                        sib->sib_addr = listener->sib_addr;
                        sib->sib_sid = listener->sib_sid;
                        sib->sib_scope_id = listener->sib_scope_id;
                } else {
                        sib->sib_pkey = path->pkey;
                        sib->sib_flowinfo = path->flow_label;
                        memcpy(&sib->sib_addr, &path->sgid, 16);
                        sib->sib_sid = path->service_id;
                        sib->sib_scope_id = 0;
                }
                sib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
        }

        if (!dst_addr)
                return;

        sib = (struct sockaddr_ib *)dst_addr;
        sib->sib_family = AF_IB;
        if (path) {
                sib->sib_pkey = path->pkey;
                sib->sib_flowinfo = path->flow_label;
                memcpy(&sib->sib_addr, &path->dgid, 16);
        }
}

/*
 * Fill IPv4 socket addresses from a CMA header.
 *
 * The local (@src_addr) side is the header's destination address combined
 * with @local_port; the remote (@dst_addr) side is the header's source
 * address and port.  Either output may be NULL and is then skipped.
 */
static void cma_save_ip4_info(struct sockaddr_in *src_addr,
                              struct sockaddr_in *dst_addr,
                              struct cma_hdr *hdr,
                              __be16 local_port)
{
        if (src_addr) {
                const struct sockaddr_in local = {
                        .sin_family = AF_INET,
                        .sin_port = local_port,
                        .sin_addr.s_addr = hdr->dst_addr.ip4.addr,
                };

                *src_addr = local;
        }

        if (dst_addr) {
                const struct sockaddr_in remote = {
                        .sin_family = AF_INET,
                        .sin_port = hdr->port,
                        .sin_addr.s_addr = hdr->src_addr.ip4.addr,
                };

                *dst_addr = remote;
        }
}

/*
 * Fill IPv6 socket addresses from a CMA header.
 *
 * The local (@src_addr) side is the header's destination address combined
 * with @local_port; the remote (@dst_addr) side is the header's source
 * address and port.  Either output may be NULL and is then skipped.
 */
static void cma_save_ip6_info(struct sockaddr_in6 *src_addr,
                              struct sockaddr_in6 *dst_addr,
                              struct cma_hdr *hdr,
                              __be16 local_port)
{
        if (src_addr) {
                const struct sockaddr_in6 local = {
                        .sin6_family = AF_INET6,
                        .sin6_port = local_port,
                        .sin6_addr = hdr->dst_addr.ip6,
                };

                *src_addr = local;
        }

        if (dst_addr) {
                const struct sockaddr_in6 remote = {
                        .sin6_family = AF_INET6,
                        .sin6_port = hdr->port,
                        .sin6_addr = hdr->src_addr.ip6,
                };

                *dst_addr = remote;
        }
}

/* Port number: the low 16 bits of @service_id, in CPU byte order. */
static u16 cma_port_from_service_id(__be64 service_id)
{
        const u64 sid = be64_to_cpu(service_id);

        return (u16)sid;
}

/*
 * Decode the CMA header carried in the private data of @ib_event into IP
 * socket addresses.  The local port is derived from @service_id.
 *
 * Return: 0 on success, -EINVAL when the header's CMA version does not
 * match CMA_VERSION, -EAFNOSUPPORT when the header's IP version is neither
 * 4 nor 6.
 */
static int cma_save_ip_info(struct sockaddr *src_addr,
                            struct sockaddr *dst_addr,
                            const struct ib_cm_event *ib_event,
                            __be64 service_id)
{
        struct cma_hdr *hdr = ib_event->private_data;
        __be16 local_port;
        int ip_ver;

        if (hdr->cma_version != CMA_VERSION)
                return -EINVAL;

        local_port = htons(cma_port_from_service_id(service_id));
        ip_ver = cma_get_ip_ver(hdr);

        if (ip_ver == 4) {
                cma_save_ip4_info((struct sockaddr_in *)src_addr,
                                  (struct sockaddr_in *)dst_addr, hdr,
                                  local_port);
                return 0;
        }

        if (ip_ver == 6) {
                cma_save_ip6_info((struct sockaddr_in6 *)src_addr,
                                  (struct sockaddr_in6 *)dst_addr, hdr,
                                  local_port);
                return 0;
        }

        return -EAFNOSUPPORT;
}

/*
 * Record the addresses of an incoming request.
 *
 * AF_IB requests are filled by cma_save_ib_info(): from the primary path
 * for a REQ, from the listener alone for a SIDR REQ.  Any other AF_IB
 * event stores nothing but still reports success.  All other families are
 * decoded from the CMA header by cma_save_ip_info().
 *
 * Return: 0 for AF_IB, otherwise the result of cma_save_ip_info().
 */
static int cma_save_net_info(struct sockaddr *src_addr,
                             struct sockaddr *dst_addr,
                             const struct rdma_cm_id *listen_id,
                             const struct ib_cm_event *ib_event,
                             sa_family_t sa_family, __be64 service_id)
{
        if (sa_family != AF_IB)
                return cma_save_ip_info(src_addr, dst_addr, ib_event,
                                        service_id);

        switch (ib_event->event) {
        case IB_CM_REQ_RECEIVED:
                cma_save_ib_info(src_addr, dst_addr, listen_id,
                                 ib_event->param.req_rcvd.primary_path);
                break;
        case IB_CM_SIDR_REQ_RECEIVED:
                cma_save_ib_info(src_addr, dst_addr, listen_id, NULL);
                break;
        default:
                break;
        }

        return 0;
}

/*
 * Capture device, port, service ID, P_Key and (for REQs) the local GID of
 * an incoming REQ or SIDR REQ into @req.
 *
 * A rate-limited warning is logged when the P_Key taken from the request
 * differs from the BTH P_Key reported with the event.
 *
 * Return: 0 on success, -EINVAL for any other event type.
 */
static int cma_save_req_info(const struct ib_cm_event *ib_event,
                             struct cma_req_info *req)
{
        if (ib_event->event == IB_CM_REQ_RECEIVED) {
                const struct ib_cm_req_event_param *rp =
                        &ib_event->param.req_rcvd;

                req->device = rp->listen_id->device;
                req->port = rp->port;
                memcpy(&req->local_gid, &rp->primary_path->sgid,
                       sizeof(req->local_gid));
                req->has_gid = true;
                req->service_id = rp->primary_path->service_id;
                req->pkey = be16_to_cpu(rp->primary_path->pkey);
                if (req->pkey != rp->bth_pkey)
                        pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and primary path P_Key (0x%x)\n"
                                            "RDMA CMA: in the future this may cause the request to be dropped\n",
                                            rp->bth_pkey, req->pkey);
                return 0;
        }

        if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
                const struct ib_cm_sidr_req_event_param *sp =
                        &ib_event->param.sidr_req_rcvd;

                req->device = sp->listen_id->device;
                req->port = sp->port;
                req->has_gid = false;
                req->service_id = sp->service_id;
                req->pkey = sp->pkey;
                if (req->pkey != sp->bth_pkey)
                        pr_warn_ratelimited("RDMA CMA: got different BTH P_Key (0x%x) and SIDR request payload P_Key (0x%x)\n"
                                            "RDMA CMA: in the future this may cause the request to be dropped\n",
                                            sp->bth_pkey, req->pkey);
                return 0;
        }

        return -EINVAL;
}

/*
 * Check that an IPv4 flow from @src_addr to @dst_addr with @net_dev as
 * output interface resolves, via a FIB lookup in @net_dev's namespace, to
 * @net_dev itself.
 *
 * A multicast source, or a limited-broadcast, zeronet or loopback address
 * on either side, is rejected before the lookup.
 */
static bool validate_ipv4_net_dev(struct net_device *net_dev,
                                  const struct sockaddr_in *dst_addr,
                                  const struct sockaddr_in *src_addr)
{
        const __be32 saddr = src_addr->sin_addr.s_addr;
        const __be32 daddr = dst_addr->sin_addr.s_addr;
        struct fib_result res;
        struct flowi4 fl4;
        bool matched;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            ipv4_is_zeronet(saddr) || ipv4_is_loopback(saddr))
                return false;

        if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
            ipv4_is_loopback(daddr))
                return false;

        memset(&fl4, 0, sizeof(fl4));
        fl4.flowi4_oif = net_dev->ifindex;
        fl4.daddr = daddr;
        fl4.saddr = saddr;

        rcu_read_lock();
        matched = !fib_lookup(dev_net(net_dev), &fl4, &res, 0) &&
                  FIB_RES_DEV(res) == net_dev;
        rcu_read_unlock();

        return matched;
}

/*
 * Check that the IPv6 route looked up for @dst_addr / @src_addr on
 * @net_dev's ifindex belongs to @net_dev.  The lookup is strict when the
 * destination is link-local.  Always false when IPv6 is not enabled.
 */
static bool validate_ipv6_net_dev(struct net_device *net_dev,
                                  const struct sockaddr_in6 *dst_addr,
                                  const struct sockaddr_in6 *src_addr)
{
#if IS_ENABLED(CONFIG_IPV6)
        struct rt6_info *rt;
        bool same_dev;
        int strict;

        strict = ipv6_addr_type(&dst_addr->sin6_addr) & IPV6_ADDR_LINKLOCAL;
        rt = rt6_lookup(dev_net(net_dev), &dst_addr->sin6_addr,
                        &src_addr->sin6_addr, net_dev->ifindex, NULL, strict);
        if (!rt)
                return false;

        same_dev = (rt->rt6i_idev->dev == net_dev);
        ip6_rt_put(rt);

        return same_dev;
#else
        return false;
#endif
}

/*
 * Dispatch to the IPv4 or IPv6 netdevice validation according to the
 * family of @daddr.  Both addresses must share that family; anything else
 * (including families other than AF_INET / AF_INET6) is rejected.
 */
static bool validate_net_dev(struct net_device *net_dev,
                             const struct sockaddr *daddr,
                             const struct sockaddr *saddr)
{
        if (daddr->sa_family == AF_INET) {
                if (saddr->sa_family != AF_INET)
                        return false;
                return validate_ipv4_net_dev(net_dev,
                                             (const struct sockaddr_in *)daddr,
                                             (const struct sockaddr_in *)saddr);
        }

        if (daddr->sa_family == AF_INET6) {
                if (saddr->sa_family != AF_INET6)
                        return false;
                return validate_ipv6_net_dev(net_dev,
                                             (const struct sockaddr_in6 *)daddr,
                                             (const struct sockaddr_in6 *)saddr);
        }

        return false;
}

/*
 * Resolve the netdevice behind the source GID attribute of a REQ or SIDR
 * REQ event.
 *
 * Return: the netdevice with a reference taken by dev_hold(), or NULL when
 * the event is of another type, carries no GID attribute, or the netdevice
 * lookup reports an error.
 */
static struct net_device *
roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
{
        const struct ib_gid_attr *attr;
        struct net_device *ndev;

        switch (ib_event->event) {
        case IB_CM_REQ_RECEIVED:
                attr = ib_event->param.req_rcvd.ppath_sgid_attr;
                break;
        case IB_CM_SIDR_REQ_RECEIVED:
                attr = ib_event->param.sidr_req_rcvd.sgid_attr;
                break;
        default:
                return NULL;
        }

        if (!attr)
                return NULL;

        /* The ndev pointer is only stable under RCU until it is held. */
        rcu_read_lock();
        ndev = rdma_read_gid_attr_ndev_rcu(attr);
        if (!IS_ERR(ndev))
                dev_hold(ndev);
        else
                ndev = NULL;
        rcu_read_unlock();

        return ndev;
}

/*
 * Decode the request's IP addresses into @req and find the netdevice the
 * request arrived on.
 *
 * RoCE ports use the GID attribute carried by the event; other ports go
 * through ib_get_net_dev_by_params() with the request's pkey, optional GID
 * and listen address.
 *
 * Return: the netdevice, the error from cma_save_ip_info() as an ERR_PTR,
 * or ERR_PTR(-ENODEV) when no netdevice was found.
 */
static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event,
                                          struct cma_req_info *req)
{
        struct sockaddr *listen_addr =
                        (struct sockaddr *)&req->listen_addr_storage;
        struct sockaddr *src_addr = (struct sockaddr *)&req->src_addr_storage;
        struct net_device *ndev;
        int err;

        err = cma_save_ip_info(listen_addr, src_addr, ib_event,
                               req->service_id);
        if (err)
                return ERR_PTR(err);

        if (!rdma_protocol_roce(req->device, req->port)) {
                const union ib_gid *gid =
                        req->has_gid ? &req->local_gid : NULL;

                ndev = ib_get_net_dev_by_params(req->device, req->port,
                                                req->pkey, gid, listen_addr);
        } else {
                ndev = roce_get_net_dev_by_cm_event(ib_event);
        }

        return ndev ? ndev : ERR_PTR(-ENODEV);
}

/* Port space: bits 16..31 of @service_id (CPU byte order). */
static enum rdma_ucm_port_space rdma_ps_from_service_id(__be64 service_id)
{
        const u64 sid = be64_to_cpu(service_id);

        return (sid >> 16) & 0xffff;
}

static bool cma_match_private_data(struct rdma_id_private *id_priv,
                                   const struct cma_hdr *hdr)
{
        struct sockaddr *addr = cma_src_addr(id_priv);
        __be32 ip4_addr;
        struct in6_addr ip6_addr;

        if (cma_any_addr(addr) && !id_priv->afonly)
                return true;

        switch (addr->sa_family) {
        case AF_INET:
                ip4_addr = ((struct sockaddr_in *)addr)->sin_addr.s_addr;
                if (cma_get_ip_ver(hdr) != 4)
                        return false;
                if (!cma_any_addr(addr) &&
                    hdr->dst_addr.ip4.addr != ip4_addr)
                        return false;
                break;
        case AF_INET6:
                ip6_addr = ((struct sockaddr_in6 *)addr)->sin6_addr;
                if (cma_get_ip_ver(hdr) != 6)
                        return false;
                if (!cma_any_addr(addr) &&
                    memcmp(&hdr->dst_addr.ip6, &ip6_addr, sizeof(ip6_addr)))
                        return false;
                break;
        case AF_IB:
                return true;
        default:
                return false;
        }

        return true;
}

/*
 * True when the ID's port speaks RoCE.  An ID without a port number is
 * checked against the device's first port.
 */
static bool cma_protocol_roce(const struct rdma_cm_id *id)
{
        u32 port = id->port_num;

        if (!port)
                port = rdma_start_port(id->device);

        return rdma_protocol_roce(id->device, port);
}

static bool cma_is_req_ipv6_ll(const struct cma_req_info *req)
{
        const struct sockaddr *daddr =
                        (const struct sockaddr *)&req->listen_addr_storage;
        const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr;

        /* Returns true if the req is for IPv6 link local */
        return (daddr->sa_family == AF_INET6 &&
                (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL));
}

/*
 * Decide whether the listener @id may receive a request that arrived on
 * @net_dev (NULL for AF_IB requests) and is described by @req.
 */
static bool cma_match_net_dev(const struct rdma_cm_id *id,
                              const struct net_device *net_dev,
                              const struct cma_req_info *req)
{
        const struct rdma_addr *addr = &id->route.addr;

        if (!net_dev) {
                /*
                 * This request is an AF_IB request: the listener must be
                 * AF_IB too and, if bound to a port, on the request's port.
                 */
                if (id->port_num && id->port_num != req->port)
                        return false;
                return addr->src_addr.ss_family == AF_IB;
        }

        /*
         * If the request is not for IPv6 link local, allow matching the
         * request to any netdevice of the single- or multi-port rdma device.
         */
        if (!cma_is_req_ipv6_ll(req))
                return true;

        /*
         * Net namespaces must match, and if the listener is listening on a
         * specific netdevice then that netdevice must match as well.
         */
        if (!net_eq(dev_net(net_dev), addr->dev_addr.net))
                return false;

        return !!addr->dev_addr.bound_dev_if ==
               (addr->dev_addr.bound_dev_if == net_dev->ifindex);
}

/*
 * Search the owners of @bind_list for a listener that accepts the request.
 *
 * For every owner whose address matches the request's private data, the
 * owner itself is tried first and then each ID on its listen_list; a
 * candidate qualifies when it sits on the same device as @cm_id and
 * passes cma_match_net_dev().  Must be called with the global lock held.
 *
 * Return: the matching listener, or ERR_PTR(-EINVAL) when @bind_list is
 * NULL or nothing matches.
 */
static struct rdma_id_private *cma_find_listener(
                const struct rdma_bind_list *bind_list,
                const struct ib_cm_id *cm_id,
                const struct ib_cm_event *ib_event,
                const struct cma_req_info *req,
                const struct net_device *net_dev)
{
        struct rdma_id_private *owner, *per_dev;

        lockdep_assert_held(&lock);

        if (!bind_list)
                return ERR_PTR(-EINVAL);

        hlist_for_each_entry(owner, &bind_list->owners, node) {
                if (!cma_match_private_data(owner, ib_event->private_data))
                        continue;

                if (owner->id.device == cm_id->device &&
                    cma_match_net_dev(&owner->id, net_dev, req))
                        return owner;

                list_for_each_entry(per_dev, &owner->listen_list,
                                    listen_item) {
                        if (per_dev->id.device == cm_id->device &&
                            cma_match_net_dev(&per_dev->id, net_dev, req))
                                return per_dev;
                }
        }

        return ERR_PTR(-EINVAL);
}

/*
 * Find the listening ID that should handle an incoming IB CM REQ or
 * SIDR REQ.
 *
 * @cm_id:    IB CM ID the event was delivered on
 * @ib_event: the CM event
 * @req:      filled with the decoded request parameters
 * @net_dev:  out: netdevice the request arrived on, or NULL
 *
 * Steps: decode the request into @req, resolve the netdevice, then, under
 * the global lock and the RCU read lock, validate the netdevice and look
 * up the bind list and listener for the port space and port encoded in
 * the service ID.
 *
 * Return: the listener on success (any netdevice reference is then left in
 * *@net_dev), or an ERR_PTR.  When the lookup fails after the netdevice
 * was resolved, the reference is dropped and *@net_dev is reset to NULL.
 * NOTE: on the early return for a cma_get_net_dev() error other than
 * -EAFNOSUPPORT, *@net_dev still holds that ERR_PTR value.
 */
static struct rdma_id_private *
cma_ib_id_from_event(struct ib_cm_id *cm_id,
                     const struct ib_cm_event *ib_event,
                     struct cma_req_info *req,
                     struct net_device **net_dev)
{
        struct rdma_bind_list *bind_list;
        struct rdma_id_private *id_priv;
        int err;

        err = cma_save_req_info(ib_event, req);
        if (err)
                return ERR_PTR(err);

        *net_dev = cma_get_net_dev(ib_event, req);
        if (IS_ERR(*net_dev)) {
                if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) {
                        /* Assuming the protocol is AF_IB */
                        *net_dev = NULL;
                } else {
                        return ERR_CAST(*net_dev);
                }
        }

        mutex_lock(&lock);
        /*
         * Net namespace might be getting deleted while route lookup,
         * cm_id lookup is in progress. Therefore, perform netdevice
         * validation, cm_id lookup under rcu lock.
         * RCU lock along with netdevice state check, synchronizes with
         * netdevice migrating to different net namespace and also avoids
         * case where net namespace doesn't get deleted while lookup is in
         * progress.
         * If the device state is not IFF_UP, its properties such as ifindex
         * and nd_net cannot be trusted to remain valid without rcu lock.
         * net/core/dev.c change_net_namespace() ensures to synchronize with
         * ongoing operations on net device after device is closed using
         * synchronize_net().
         */
        rcu_read_lock();
        if (*net_dev) {
                /*
                 * If netdevice is down, it is likely that it is administratively
                 * down or it might be migrating to different namespace.
                 * In that case avoid further processing, as the net namespace
                 * or ifindex may change.
                 */
                if (((*net_dev)->flags & IFF_UP) == 0) {
                        id_priv = ERR_PTR(-EHOSTUNREACH);
                        goto err;
                }

                /* The addresses decoded into @req must route via this netdevice. */
                if (!validate_net_dev(*net_dev,
                                 (struct sockaddr *)&req->src_addr_storage,
                                 (struct sockaddr *)&req->listen_addr_storage)) {
                        id_priv = ERR_PTR(-EHOSTUNREACH);
                        goto err;
                }
        }

        /* Without a netdevice (AF_IB) the lookup is done in init_net. */
        bind_list = cma_ps_find(*net_dev ? dev_net(*net_dev) : &init_net,
                                rdma_ps_from_service_id(req->service_id),
                                cma_port_from_service_id(req->service_id));
        id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev);
err:
        rcu_read_unlock();
        mutex_unlock(&lock);
        /* On failure, give back the netdevice reference and clear the output. */
        if (IS_ERR(id_priv) && *net_dev) {
                dev_put(*net_dev);
                *net_dev = NULL;
        }
        return id_priv;
}

/*
 * Offset of the user's private data: 0 for AF_IB IDs, otherwise the size
 * of the CMA header.
 */
static inline u8 cma_user_data_offset(struct rdma_id_private *id_priv)
{
        if (cma_family(id_priv) == AF_IB)
                return 0;

        return sizeof(struct cma_hdr);
}

/*
 * Cancel the ID's outstanding SA query, if there is one.  Nothing is done
 * on ports without IB SA capability.
 */
static void cma_cancel_route(struct rdma_id_private *id_priv)
{
        if (!rdma_cap_ib_sa(id_priv->id.device, id_priv->id.port_num))
                return;

        if (id_priv->query)
                ib_sa_cancel_query(id_priv->query_id, id_priv->query);
}

/*
 * Tear down every per-device listen ID hanging off @id_priv's listen_list.
 *
 * Must be called with the global lock held.  The lock is dropped around
 * each rdma_destroy_id() call and re-taken afterwards, so it is held again
 * on return; the list is re-examined from its head on every iteration.
 */
static void _cma_cancel_listens(struct rdma_id_private *id_priv)
{
        struct rdma_id_private *dev_id_priv;

        lockdep_assert_held(&lock);

        /*
         * Remove from listen_any_list to prevent added devices from spawning
         * additional listen requests.
         */
        list_del_init(&id_priv->listen_any_item);

        while (!list_empty(&id_priv->listen_list)) {
                dev_id_priv =
                        list_first_entry(&id_priv->listen_list,
                                         struct rdma_id_private, listen_item);
                /* sync with device removal to avoid duplicate destruction */
                list_del_init(&dev_id_priv->device_item);
                list_del_init(&dev_id_priv->listen_item);
                /* The entry is unlinked; destroy it without holding the lock. */
                mutex_unlock(&lock);

                rdma_destroy_id(&dev_id_priv->id);
                mutex_lock(&lock);
        }
}

/*
 * Locked wrapper around _cma_cancel_listens(): takes the global lock,
 * cancels all per-device listens of @id_priv and releases the lock.
 */
static void cma_cancel_listens(struct rdma_id_private *id_priv)
{
        mutex_lock(&lock);
        _cma_cancel_listens(id_priv);
        mutex_unlock(&lock);
}

/*
 * Cancel whatever asynchronous operation corresponds to @state:
 * address resolution, route query, or wildcard listens that are not bound
 * to a device.  Other states need no cancellation.
 */
static void cma_cancel_operation(struct rdma_id_private *id_priv,
                                 enum rdma_cm_state state)
{
        if (state == RDMA_CM_ADDR_QUERY) {
                /*
                 * Deciding on rdma_addr_cancel() by state is sufficient:
                 * only RDMA_CM_ADDR_QUERY has a work that could still
                 * execute.  The addr_handler work may still be exiting
                 * outside this state, but because of its interaction with
                 * the handler_mutex it is guaranteed not to touch id_priv
                 * while exiting.
                 */
                rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
                return;
        }

        if (state == RDMA_CM_ROUTE_QUERY) {
                cma_cancel_route(id_priv);
                return;
        }

        if (state == RDMA_CM_LISTEN &&
            cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
                cma_cancel_listens(id_priv);
}

/*
 * Detach @id_priv from its bind list, if it has one.  When it was the last
 * owner, the bind list is removed from the port space of the ID's network
 * namespace and freed.  The unlink and the emptiness check are done under
 * the global lock.
 */
static void cma_release_port(struct rdma_id_private *id_priv)
{
        struct net *net = id_priv->id.route.addr.dev_addr.net;
        struct rdma_bind_list *bl = id_priv->bind_list;

        if (!bl)
                return;

        mutex_lock(&lock);
        hlist_del(&id_priv->node);
        if (hlist_empty(&bl->owners)) {
                cma_ps_remove(net, bl->ps, bl->port);
                kfree(bl);
        }
        mutex_unlock(&lock);
}

/*
 * Tear down one multicast membership of @id_priv and free @mc.
 *
 * On ports with IB multicast capability the SA multicast object is freed.
 * On RoCE ports, cma_igmp_send() is called for the group with a false last
 * argument (unless the join state is send-only or no bound netdev is found),
 * and the pending iboe join work is cancelled synchronously.
 */
static void destroy_mc(struct rdma_id_private *id_priv,
                       struct cma_multicast *mc)
{
        struct rdma_dev_addr *dev_addr;
        struct net_device *ndev = NULL;
        bool send_only;

        send_only = (mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN));

        if (rdma_cap_ib_mcast(id_priv->id.device, id_priv->id.port_num))
                ib_sa_free_multicast(mc->sa_mc);

        if (!rdma_protocol_roce(id_priv->id.device, id_priv->id.port_num))
                goto free_mc;

        dev_addr = &id_priv->id.route.addr.dev_addr;
        if (dev_addr->bound_dev_if)
                ndev = dev_get_by_index(dev_addr->net,
                                        dev_addr->bound_dev_if);

        if (ndev && !send_only) {
                enum ib_gid_type gid_type;
                union ib_gid mgid;

                /* Default GID type of the port, indexed from the first port. */
                gid_type = id_priv->cma_dev->default_gid_type
                                   [id_priv->id.port_num -
                                    rdma_start_port(
                                            id_priv->cma_dev->device)];
                cma_iboe_set_mgid((struct sockaddr *)&mc->addr, &mgid,
                                  gid_type);
                cma_igmp_send(ndev, &mgid, false);
        }
        /* Called unconditionally, exactly as before; ndev may be NULL here. */
        dev_put(ndev);

        cancel_work_sync(&mc->iboe_join.work);

free_mc:
        kfree(mc);
}

/*
 * Drain @id_priv's multicast list: unlink each entry and hand it to
 * destroy_mc(), which frees it.
 */
static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
{
        struct cma_multicast *entry;

        for (;;) {
                if (list_empty(&id_priv->mc_list))
                        break;

                entry = list_first_entry(&id_priv->mc_list,
                                         struct cma_multicast, list);
                list_del(&entry->list);
                destroy_mc(id_priv, entry);
        }
}

/*
 * Release everything an ID owns and free it.
 *
 * @state is forwarded to cma_cancel_operation(); destroy_id_handler_unlock()
 * passes the state the ID held before it was marked RDMA_CM_DESTROYING.
 * @id_priv is freed on return and must not be touched afterwards.
 */
static void _destroy_id(struct rdma_id_private *id_priv,
                        enum rdma_cm_state state)
{
        cma_cancel_operation(id_priv, state);

        rdma_restrack_del(&id_priv->res);
        cma_remove_id_from_tree(id_priv);
        if (id_priv->cma_dev) {
                /* Destroy the underlying IB or iWARP CM id, if one is set. */
                if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
                        if (id_priv->cm_id.ib)
                                ib_destroy_cm_id(id_priv->cm_id.ib);
                } else if (rdma_cap_iw_cm(id_priv->id.device, 1)) {
                        if (id_priv->cm_id.iw)
                                iw_destroy_cm_id(id_priv->cm_id.iw);
                }
                cma_leave_mc_groups(id_priv);
                cma_release_dev(id_priv);
        }

        cma_release_port(id_priv);
        /*
         * Drop a reference and wait on id_priv->comp before freeing.
         * NOTE(review): presumably comp is completed once the last reference
         * is put -- confirm against cma_id_put().
         */
        cma_id_put(id_priv);
        wait_for_completion(&id_priv->comp);

        /*
         * Internal per-device listen IDs (see cma_listen_on_dev()) hold a
         * reference on the ID stored in id.context; release it here.
         */
        if (id_priv->internal_id)
                cma_id_put(id_priv->id.context);

        kfree(id_priv->id.route.path_rec);
        kfree(id_priv->id.route.path_rec_inbound);
        kfree(id_priv->id.route.path_rec_outbound);

        put_net(id_priv->id.route.addr.dev_addr.net);
        kfree(id_priv);
}

/*
 * Destroy an ID from within the handler_mutex. This ensures that no other
 * handlers can start running concurrently.
 *
 * Must be called with id_priv->handler_mutex held. The mutex is released
 * here, after the state has been switched to RDMA_CM_DESTROYING, and the ID
 * is then torn down by _destroy_id(), which frees @id_priv. The caller must
 * not touch @id_priv after this returns.
 */
static void destroy_id_handler_unlock(struct rdma_id_private *id_priv)
        __releases(&id_priv->handler_mutex)
{
        enum rdma_cm_state state;
        unsigned long flags;

        trace_cm_id_destroy(id_priv);

        /*
         * Setting the state to destroyed under the handler mutex provides a
         * fence against calling handler callbacks. If this is invoked due to
         * the failure of a handler callback then it guarantees that no future
         * handlers will be called.
         */
        lockdep_assert_held(&id_priv->handler_mutex);
        spin_lock_irqsave(&id_priv->lock, flags);
        /* Remember the prior state so the right operation gets cancelled. */
        state = id_priv->state;
        id_priv->state = RDMA_CM_DESTROYING;
        spin_unlock_irqrestore(&id_priv->lock, flags);
        mutex_unlock(&id_priv->handler_mutex);
        _destroy_id(id_priv, state);
}

/**
 * rdma_destroy_id() - Destroy an RDMA communication identifier.
 * @id: Identifier to destroy.
 *
 * Takes the ID's handler_mutex and hands off to
 * destroy_id_handler_unlock(), which releases the mutex and frees the
 * private structure that embeds @id.
 */
void rdma_destroy_id(struct rdma_cm_id *id)
{
        struct rdma_id_private *priv;

        priv = container_of(id, struct rdma_id_private, id);

        mutex_lock(&priv->handler_mutex);
        destroy_id_handler_unlock(priv);
}
EXPORT_SYMBOL(rdma_destroy_id);

/*
 * Handle a received REP on an ID: call cma_modify_qp_rtr() and then
 * cma_modify_qp_rts() on it, and send the RTU. If any step fails,
 * cma_modify_qp_err() is called, a consumer-defined REJ is sent and the
 * failing step's error code is returned.
 *
 * Return: 0 on success, otherwise the error of the first failing step.
 */
static int cma_rep_recv(struct rdma_id_private *id_priv)
{
        int err;

        err = cma_modify_qp_rtr(id_priv, NULL);
        if (!err)
                err = cma_modify_qp_rts(id_priv, NULL);
        if (!err) {
                trace_cm_send_rtu(id_priv);
                err = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
        }
        if (!err)
                return 0;

        /* Something failed: reject the connection. */
        pr_debug_ratelimited("RDMA CM: CONNECT_ERROR: failed to handle reply. status %d\n", err);
        cma_modify_qp_err(id_priv);
        trace_cm_send_rej(id_priv);
        ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
                       NULL, 0, NULL, 0);
        return err;
}

/*
 * Fill the connection parameters and ECE fields of @event from a received
 * REP. @private_data is stored as-is with the fixed REP private data size.
 */
static void cma_set_rep_event_data(struct rdma_cm_event *event,
                                   const struct ib_cm_rep_event_param *rep_data,
                                   void *private_data)
{
        /* ECE attributes. */
        event->ece.attr_mod = rep_data->ece.attr_mod;
        event->ece.vendor_id = rep_data->ece.vendor_id;

        /* Connection parameters carried by the REP. */
        event->param.conn.qp_num = rep_data->remote_qpn;
        event->param.conn.srq = rep_data->srq;
        event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
        event->param.conn.flow_control = rep_data->flow_control;
        event->param.conn.initiator_depth = rep_data->initiator_depth;
        event->param.conn.responder_resources = rep_data->responder_resources;

        /* Private data: pointer is passed through, length is the REP maximum. */
        event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
        event->param.conn.private_data = private_data;
}

/*
 * Deliver @event to the ID's event_handler callback, bracketed by trace
 * points. The caller must hold id_priv->handler_mutex.
 *
 * Return: whatever the event handler returned.
 */
static int cma_cm_event_handler(struct rdma_id_private *id_priv,
                                struct rdma_cm_event *event)
{
        int rc;

        lockdep_assert_held(&id_priv->handler_mutex);

        trace_cm_event_handler(id_priv, event);
        rc = id_priv->id.event_handler(&id_priv->id, event);
        trace_cm_event_done(id_priv, event, rc);

        return rc;
}

/*
 * IB CM callback for a connection ID (installed as cm_id->cm_handler by
 * cma_ib_req_handler()). Translates @ib_event into an rdma_cm_event and
 * delivers it through cma_cm_event_handler() under handler_mutex.
 *
 * Return: 0, or the event handler's non-zero return value, in which case
 * the ID has been destroyed via destroy_id_handler_unlock().
 */
static int cma_ib_handler(struct ib_cm_id *cm_id,
                          const struct ib_cm_event *ib_event)
{
        struct rdma_id_private *id_priv = cm_id->context;
        struct rdma_cm_event event = {};
        enum rdma_cm_state state;
        int ret;

        mutex_lock(&id_priv->handler_mutex);
        state = READ_ONCE(id_priv->state);
        /*
         * TIMEWAIT_EXIT is only processed in RDMA_CM_DISCONNECT; every other
         * event is only processed in RDMA_CM_CONNECT. Anything else is
         * dropped.
         */
        if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
             state != RDMA_CM_CONNECT) ||
            (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
             state != RDMA_CM_DISCONNECT))
                goto out;

        switch (ib_event->event) {
        case IB_CM_REQ_ERROR:
        case IB_CM_REP_ERROR:
                event.event = RDMA_CM_EVENT_UNREACHABLE;
                event.status = -ETIMEDOUT;
                break;
        case IB_CM_REP_RECEIVED:
                if (state == RDMA_CM_CONNECT &&
                    (id_priv->id.qp_type != IB_QPT_UD)) {
                        trace_cm_send_mra(id_priv);
                        ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
                }
                /*
                 * With a QP attached, cma_rep_recv() handles the reply and
                 * the result is reported; without one only the response is
                 * reported.
                 */
                if (id_priv->id.qp) {
                        event.status = cma_rep_recv(id_priv);
                        event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
                                                     RDMA_CM_EVENT_ESTABLISHED;
                } else {
                        event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
                }
                cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
                                       ib_event->private_data);
                break;
        case IB_CM_RTU_RECEIVED:
        case IB_CM_USER_ESTABLISHED:
                event.event = RDMA_CM_EVENT_ESTABLISHED;
                break;
        case IB_CM_DREQ_ERROR:
                event.status = -ETIMEDOUT;
                fallthrough;
        case IB_CM_DREQ_RECEIVED:
        case IB_CM_DREP_RECEIVED:
                /* Report the disconnect only if CONNECT -> DISCONNECT succeeds. */
                if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
                                   RDMA_CM_DISCONNECT))
                        goto out;
                event.event = RDMA_CM_EVENT_DISCONNECTED;
                break;
        case IB_CM_TIMEWAIT_EXIT:
                event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
                break;
        case IB_CM_MRA_RECEIVED:
                /* ignore event */
                goto out;
        case IB_CM_REJ_RECEIVED:
                pr_debug_ratelimited("RDMA CM: REJECTED: %s\n", rdma_reject_msg(&id_priv->id,
                                                                                ib_event->param.rej_rcvd.reason));
                cma_modify_qp_err(id_priv);
                event.status = ib_event->param.rej_rcvd.reason;
                event.event = RDMA_CM_EVENT_REJECTED;
                event.param.conn.private_data = ib_event->private_data;
                event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
                break;
        default:
                pr_err("RDMA CMA: unexpected IB CM event: %d\n",
                       ib_event->event);
                goto out;
        }

        ret = cma_cm_event_handler(id_priv, &event);
        if (ret) {
                /* Destroy the CM ID by returning a non-zero value. */
                /*
                 * Clear cm_id.ib first so that _destroy_id() does not call
                 * ib_destroy_cm_id() on it as well.
                 */
                id_priv->cm_id.ib = NULL;
                destroy_id_handler_unlock(id_priv);
                return ret;
        }
out:
        mutex_unlock(&id_priv->handler_mutex);
        return 0;
}

/*
 * Create the ID for an incoming connection request received on @listen_id.
 *
 * The new ID inherits the listener's net namespace, event handler, context
 * and port space. Its addresses are filled in by cma_save_net_info(), the
 * primary (and, if present, alternate) path records are copied from the
 * request, and the device address is derived from @net_dev when one was
 * supplied or from the path record / source address otherwise.
 *
 * Return: the new ID in state RDMA_CM_CONNECT, or NULL on any failure.
 */
static struct rdma_id_private *
cma_ib_new_conn_id(const struct rdma_cm_id *listen_id,
                   const struct ib_cm_event *ib_event,
                   struct net_device *net_dev)
{
        const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
        struct sa_path_rec *path = ib_event->param.req_rcvd.primary_path;
        const __be64 service_id = path->service_id;
        struct rdma_id_private *listen_id_priv;
        struct rdma_id_private *id_priv;
        struct rdma_cm_id *id;
        struct rdma_route *rt;
        int ret;

        listen_id_priv = container_of(listen_id, struct rdma_id_private, id);
        id_priv = __rdma_create_id(listen_id->route.addr.dev_addr.net,
                                   listen_id->event_handler, listen_id->context,
                                   listen_id->ps,
                                   ib_event->param.req_rcvd.qp_type,
                                   listen_id_priv);
        if (IS_ERR(id_priv))
                return NULL;

        id = &id_priv->id;
        rt = &id->route;
        if (cma_save_net_info((struct sockaddr *)&rt->addr.src_addr,
                              (struct sockaddr *)&rt->addr.dst_addr,
                              listen_id, ib_event, ss_family, service_id))
                goto destroy;

        /* One slot for the primary path, a second if an alternate exists. */
        rt->num_pri_alt_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
        rt->path_rec = kmalloc_array(rt->num_pri_alt_paths,
                                     sizeof(*rt->path_rec), GFP_KERNEL);
        if (!rt->path_rec)
                goto destroy;

        rt->path_rec[0] = *path;
        if (rt->num_pri_alt_paths == 2)
                rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;

        if (net_dev) {
                rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev);
        } else if (!cma_protocol_roce(listen_id) &&
                   cma_any_addr(cma_src_addr(id_priv))) {
                /* No netdev and an "any" source: take SGID/pkey from the path. */
                rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
                rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
                ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
        } else if (!cma_any_addr(cma_src_addr(id_priv))) {
                ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
                if (ret)
                        goto destroy;
        }
        rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);

        id_priv->state = RDMA_CM_CONNECT;
        return id_priv;

destroy:
        /* rdma_destroy_id() also frees rt->path_rec via _destroy_id(). */
        rdma_destroy_id(id);
        return NULL;
}

/*
 * Create the ID for an incoming SIDR request received on @listen_id.
 *
 * The new ID is created with qp type IB_QPT_UD and inherits the listener's
 * net namespace, event handler, context and port space. Addresses are
 * filled in by cma_save_net_info(); the device address comes from @net_dev
 * when supplied, otherwise from translating a non-"any" source address.
 *
 * Return: the new ID in state RDMA_CM_CONNECT, or NULL on any failure.
 */
static struct rdma_id_private *
cma_ib_new_udp_id(const struct rdma_cm_id *listen_id,
                  const struct ib_cm_event *ib_event,
                  struct net_device *net_dev)
{
        const sa_family_t ss_family = listen_id->route.addr.src_addr.ss_family;
        struct net *net = listen_id->route.addr.dev_addr.net;
        const struct rdma_id_private *listen_id_priv;
        struct rdma_id_private *id_priv;
        struct rdma_cm_id *id;
        int ret;

        listen_id_priv = container_of(listen_id, struct rdma_id_private, id);
        id_priv = __rdma_create_id(net, listen_id->event_handler,
                                   listen_id->context, listen_id->ps, IB_QPT_UD,
                                   listen_id_priv);
        if (IS_ERR(id_priv))
                return NULL;

        id = &id_priv->id;
        if (cma_save_net_info((struct sockaddr *)&id->route.addr.src_addr,
                              (struct sockaddr *)&id->route.addr.dst_addr,
                              listen_id, ib_event, ss_family,
                              ib_event->param.sidr_req_rcvd.service_id))
                goto destroy;

        if (net_dev) {
                rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev);
        } else if (!cma_any_addr(cma_src_addr(id_priv))) {
                ret = cma_translate_addr(cma_src_addr(id_priv),
                                         &id->route.addr.dev_addr);
                if (ret)
                        goto destroy;
        }

        id_priv->state = RDMA_CM_CONNECT;
        return id_priv;

destroy:
        rdma_destroy_id(id);
        return NULL;
}

/*
 * Fill the connection parameters and ECE fields of @event from a received
 * REQ. The first @offset bytes of @private_data are skipped, and the
 * reported length is the fixed REQ private data size minus @offset.
 */
static void cma_set_req_event_data(struct rdma_cm_event *event,
                                   const struct ib_cm_req_event_param *req_data,
                                   void *private_data, int offset)
{
        /* ECE attributes. */
        event->ece.attr_mod = req_data->ece.attr_mod;
        event->ece.vendor_id = req_data->ece.vendor_id;

        /* Connection parameters carried by the REQ. */
        event->param.conn.qp_num = req_data->remote_qpn;
        event->param.conn.srq = req_data->srq;
        event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
        event->param.conn.retry_count = req_data->retry_count;
        event->param.conn.flow_control = req_data->flow_control;
        event->param.conn.initiator_depth = req_data->initiator_depth;
        event->param.conn.responder_resources = req_data->responder_resources;

        /* Private data as seen by the consumer, past the first @offset bytes. */
        event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
        event->param.conn.private_data = private_data + offset;
}

/*
 * Check that an incoming request matches the QP type of listener @id.
 *
 * Return: non-zero if the listener has no QP type set, if a REQ carries the
 * listener's QP type, or if a SIDR REQ arrives on a UD listener; 0 otherwise.
 */
static int cma_ib_check_req_qp_type(const struct rdma_cm_id *id,
                                    const struct ib_cm_event *ib_event)
{
        /* A listener without a QP type accepts anything. */
        if (!id->qp_type)
                return 1;

        if (ib_event->event == IB_CM_REQ_RECEIVED)
                return ib_event->param.req_rcvd.qp_type == id->qp_type;

        if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
                return id->qp_type == IB_QPT_UD;

        return 0;
}

/*
 * IB CM handler registered for listening cm_ids (see cma_ib_listen()).
 * Creates a new ID for an incoming REQ or SIDR REQ, binds it to a device and
 * reports RDMA_CM_EVENT_CONNECT_REQUEST through cma_cm_event_handler() on
 * the new ID.
 *
 * Locking: the listener's handler_mutex is taken first, then the new ID's
 * handler_mutex nested inside it.
 *
 * Return: 0 on success, otherwise an error code (including the event
 * handler's non-zero return, after which the new ID has been destroyed).
 */
static int cma_ib_req_handler(struct ib_cm_id *cm_id,
                              const struct ib_cm_event *ib_event)
{
        struct rdma_id_private *listen_id, *conn_id = NULL;
        struct rdma_cm_event event = {};
        struct cma_req_info req = {};
        struct net_device *net_dev;
        u8 offset;
        int ret;

        listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev);
        if (IS_ERR(listen_id))
                return PTR_ERR(listen_id);

        trace_cm_req_handler(listen_id, ib_event->event);
        if (!cma_ib_check_req_qp_type(&listen_id->id, ib_event)) {
                ret = -EINVAL;
                goto net_dev_put;
        }

        mutex_lock(&listen_id->handler_mutex);
        /* The listener may have left the LISTEN state in the meantime. */
        if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN) {
                ret = -ECONNABORTED;
                goto err_unlock;
        }

        offset = cma_user_data_offset(listen_id);
        event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
        if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
                conn_id = cma_ib_new_udp_id(&listen_id->id, ib_event, net_dev);
                event.param.ud.private_data = ib_event->private_data + offset;
                event.param.ud.private_data_len =
                                IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
        } else {
                conn_id = cma_ib_new_conn_id(&listen_id->id, ib_event, net_dev);
                cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
                                       ib_event->private_data, offset);
        }
        /* Both constructors return NULL for every kind of failure. */
        if (!conn_id) {
                ret = -ENOMEM;
                goto err_unlock;
        }

        mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
        ret = cma_ib_acquire_dev(conn_id, listen_id, &req);
        if (ret) {
                /* Releases conn_id->handler_mutex and frees conn_id. */
                destroy_id_handler_unlock(conn_id);
                goto err_unlock;
        }

        /* From here on CM events for this cm_id go to the new ID. */
        conn_id->cm_id.ib = cm_id;
        cm_id->context = conn_id;
        cm_id->cm_handler = cma_ib_handler;

        ret = cma_cm_event_handler(conn_id, &event);
        if (ret) {
                /* Destroy the CM ID by returning a non-zero value. */
                /*
                 * Clear cm_id.ib so _destroy_id() does not destroy the cm_id
                 * too. The listener mutex is dropped here, so jump past
                 * err_unlock.
                 */
                conn_id->cm_id.ib = NULL;
                mutex_unlock(&listen_id->handler_mutex);
                destroy_id_handler_unlock(conn_id);
                goto net_dev_put;
        }

        if (READ_ONCE(conn_id->state) == RDMA_CM_CONNECT &&
            conn_id->id.qp_type != IB_QPT_UD) {
                trace_cm_send_mra(cm_id->context);
                ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
        }
        mutex_unlock(&conn_id->handler_mutex);

err_unlock:
        mutex_unlock(&listen_id->handler_mutex);

net_dev_put:
        dev_put(net_dev);

        return ret;
}

__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
{
        if (addr->sa_family == AF_IB)
                return ((struct sockaddr_ib *) addr)->sib_sid;

        return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr)));
}
EXPORT_SYMBOL(rdma_get_service_id);

/**
 * rdma_read_gids() - Report the source and destination GIDs of a cm_id.
 * @cm_id: Communication identifier to query.
 * @sgid: Optional output for the source GID; may be NULL.
 * @dgid: Optional output for the destination GID; may be NULL.
 *
 * If @cm_id has no device the requested GIDs are zeroed. On RoCE ports the
 * GIDs are derived from the source/destination addresses with
 * rdma_ip2gid(); on other ports they are read from the device address.
 */
void rdma_read_gids(struct rdma_cm_id *cm_id, union ib_gid *sgid,
                    union ib_gid *dgid)
{
        struct rdma_addr *addr = &cm_id->route.addr;

        /* No device: report all-zero GIDs. */
        if (!cm_id->device) {
                if (sgid)
                        memset(sgid, 0, sizeof(*sgid));
                if (dgid)
                        memset(dgid, 0, sizeof(*dgid));
                return;
        }

        /* Non-RoCE port: GIDs are kept in the device address. */
        if (!rdma_protocol_roce(cm_id->device, cm_id->port_num)) {
                if (sgid)
                        rdma_addr_get_sgid(&addr->dev_addr, sgid);
                if (dgid)
                        rdma_addr_get_dgid(&addr->dev_addr, dgid);
                return;
        }

        /* RoCE port: GIDs are derived from the addresses. */
        if (sgid)
                rdma_ip2gid((struct sockaddr *)&addr->src_addr, sgid);
        if (dgid)
                rdma_ip2gid((struct sockaddr *)&addr->dst_addr, dgid);
}
EXPORT_SYMBOL(rdma_read_gids);

/*
 * iWARP CM callback for a connection ID (installed as cm_id->cm_handler by
 * iw_conn_req_handler()). Translates @iw_event into an rdma_cm_event and
 * delivers it through cma_cm_event_handler() under handler_mutex. Events
 * are ignored unless the ID is in RDMA_CM_CONNECT.
 *
 * Return: 0, or the event handler's non-zero return value, in which case
 * the ID has been destroyed via destroy_id_handler_unlock().
 */
static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
{
        struct rdma_id_private *id_priv = iw_id->context;
        struct rdma_cm_event event = {};
        int ret = 0;
        struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
        struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;

        mutex_lock(&id_priv->handler_mutex);
        if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT)
                goto out;

        switch (iw_event->event) {
        case IW_CM_EVENT_CLOSE:
                event.event = RDMA_CM_EVENT_DISCONNECTED;
                break;
        case IW_CM_EVENT_CONNECT_REPLY:
                /* Record the addresses reported by the iWARP CM. */
                memcpy(cma_src_addr(id_priv), laddr,
                       rdma_addr_size(laddr));
                memcpy(cma_dst_addr(id_priv), raddr,
                       rdma_addr_size(raddr));
                /* Map the reply status onto an RDMA CM event type. */
                switch (iw_event->status) {
                case 0:
                        event.event = RDMA_CM_EVENT_ESTABLISHED;
                        event.param.conn.initiator_depth = iw_event->ird;
                        event.param.conn.responder_resources = iw_event->ord;
                        break;
                case -ECONNRESET:
                case -ECONNREFUSED:
                        event.event = RDMA_CM_EVENT_REJECTED;
                        break;
                case -ETIMEDOUT:
                        event.event = RDMA_CM_EVENT_UNREACHABLE;
                        break;
                default:
                        event.event = RDMA_CM_EVENT_CONNECT_ERROR;
                        break;
                }
                break;
        case IW_CM_EVENT_ESTABLISHED:
                event.event = RDMA_CM_EVENT_ESTABLISHED;
                event.param.conn.initiator_depth = iw_event->ird;
                event.param.conn.responder_resources = iw_event->ord;
                break;
        default:
                goto out;
        }

        event.status = iw_event->status;
        event.param.conn.private_data = iw_event->private_data;
        event.param.conn.private_data_len = iw_event->private_data_len;
        ret = cma_cm_event_handler(id_priv, &event);
        if (ret) {
                /* Destroy the CM ID by returning a non-zero value. */
                /*
                 * Clear cm_id.iw first so that _destroy_id() does not call
                 * iw_destroy_cm_id() on it as well.
                 */
                id_priv->cm_id.iw = NULL;
                destroy_id_handler_unlock(id_priv);
                return ret;
        }

out:
        mutex_unlock(&id_priv->handler_mutex);
        return ret;
}

/*
 * iWARP CM handler registered for listening cm_ids (see cma_iw_listen()).
 * Creates a new ID for the incoming connection request, binds it to a
 * device and reports RDMA_CM_EVENT_CONNECT_REQUEST through
 * cma_cm_event_handler() on the new ID.
 *
 * Locking: the listener's handler_mutex is taken first, then the new ID's
 * handler_mutex nested inside it. On the failure paths that destroy the new
 * ID, the listener's mutex is released before destroy_id_handler_unlock().
 *
 * Return: 0 on success, -ECONNABORTED if the listener is no longer in
 * RDMA_CM_LISTEN, -ENOMEM if the new ID cannot be created, or the error
 * from address translation, device acquisition or the event handler.
 */
static int iw_conn_req_handler(struct iw_cm_id *cm_id,
                               struct iw_cm_event *iw_event)
{
        struct rdma_id_private *listen_id, *conn_id;
        struct rdma_cm_event event = {};
        int ret = -ECONNABORTED;
        struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
        struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;

        event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
        event.param.conn.private_data = iw_event->private_data;
        event.param.conn.private_data_len = iw_event->private_data_len;
        event.param.conn.initiator_depth = iw_event->ird;
        event.param.conn.responder_resources = iw_event->ord;

        listen_id = cm_id->context;

        mutex_lock(&listen_id->handler_mutex);
        /* ret is still -ECONNABORTED here. */
        if (READ_ONCE(listen_id->state) != RDMA_CM_LISTEN)
                goto out;

        /* Create a new RDMA id for the new IW CM ID */
        conn_id = __rdma_create_id(listen_id->id.route.addr.dev_addr.net,
                                   listen_id->id.event_handler,
                                   listen_id->id.context, RDMA_PS_TCP,
                                   IB_QPT_RC, listen_id);
        if (IS_ERR(conn_id)) {
                ret = -ENOMEM;
                goto out;
        }
        mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
        conn_id->state = RDMA_CM_CONNECT;

        ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr);
        if (ret) {
                mutex_unlock(&listen_id->handler_mutex);
                destroy_id_handler_unlock(conn_id);
                return ret;
        }

        ret = cma_iw_acquire_dev(conn_id, listen_id);
        if (ret) {
                mutex_unlock(&listen_id->handler_mutex);
                destroy_id_handler_unlock(conn_id);
                return ret;
        }

        /* From here on CM events for this cm_id go to the new ID. */
        conn_id->cm_id.iw = cm_id;
        cm_id->context = conn_id;
        cm_id->cm_handler = cma_iw_handler;

        memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
        memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));

        ret = cma_cm_event_handler(conn_id, &event);
        if (ret) {
                /* User wants to destroy the CM ID */
                /* Clear cm_id.iw so _destroy_id() leaves the iw cm_id alone. */
                conn_id->cm_id.iw = NULL;
                mutex_unlock(&listen_id->handler_mutex);
                destroy_id_handler_unlock(conn_id);
                return ret;
        }

        mutex_unlock(&conn_id->handler_mutex);

out:
        mutex_unlock(&listen_id->handler_mutex);
        return ret;
}

/*
 * Start listening on the IB CM for @id_priv: insert a listen for the
 * service ID derived from the ID's source address, with
 * cma_ib_req_handler() as the handler, and remember the resulting cm_id.
 *
 * Return: 0 on success or the error from ib_cm_insert_listen().
 */
static int cma_ib_listen(struct rdma_id_private *id_priv)
{
        struct ib_cm_id *listen_cm_id;
        __be64 svc_id;

        svc_id = rdma_get_service_id(&id_priv->id, cma_src_addr(id_priv));

        listen_cm_id = ib_cm_insert_listen(id_priv->id.device,
                                           cma_ib_req_handler, svc_id);
        if (IS_ERR(listen_cm_id))
                return PTR_ERR(listen_cm_id);

        id_priv->cm_id.ib = listen_cm_id;
        return 0;
}

/*
 * Start listening on the iWARP CM for @id_priv: create an iw cm_id with
 * iw_conn_req_handler() as the handler, copy the TOS settings (under
 * qp_mutex), afonly flag and source address to it, and call iw_cm_listen()
 * with @backlog. The cm_id is destroyed again if listening fails.
 *
 * Return: 0 on success or the error from iw_create_cm_id()/iw_cm_listen().
 */
static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
{
        struct iw_cm_id *iw_id;
        int ret;

        iw_id = iw_create_cm_id(id_priv->id.device,
                                iw_conn_req_handler,
                                id_priv);
        if (IS_ERR(iw_id))
                return PTR_ERR(iw_id);

        mutex_lock(&id_priv->qp_mutex);
        iw_id->tos = id_priv->tos;
        iw_id->tos_set = id_priv->tos_set;
        mutex_unlock(&id_priv->qp_mutex);
        iw_id->afonly = id_priv->afonly;
        id_priv->cm_id.iw = iw_id;

        memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
               rdma_addr_size(cma_src_addr(id_priv)));

        ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
        if (!ret)
                return 0;

        /* Listening failed: undo the cm_id creation. */
        iw_destroy_cm_id(id_priv->cm_id.iw);
        id_priv->cm_id.iw = NULL;
        return ret;
}

/*
 * Event handler given to the internal per-device listen IDs created by
 * cma_listen_on_dev(); their context is the id_priv that requested the
 * listen. IDs created from such a listener inherit this handler and context
 * (see cma_ib_new_conn_id()), so here @id is re-pointed at the owner's
 * context and event handler before the event is forwarded to that handler.
 *
 * Return: -1 for RDMA_CM_EVENT_DEVICE_REMOVAL, otherwise the owner's event
 * handler's return value.
 */
static int cma_listen_handler(struct rdma_cm_id *id,
                              struct rdma_cm_event *event)
{
        struct rdma_id_private *owner = id->context;

        /* Listening IDs are always destroyed on removal */
        if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL)
                return -1;

        id->context = owner->id.context;
        id->event_handler = owner->id.event_handler;

        trace_cm_event_handler(owner, event);
        return owner->id.event_handler(id, event);
}

/*
 * Create an internal listening ID on @cma_dev that mirrors @id_priv (same
 * port space, qp type, source address, afonly and TOS settings) and start
 * listening on it with @id_priv's backlog. On success the new ID is linked
 * into @id_priv's listen_list.
 *
 * Must be called with the file's `lock` mutex held. If rdma_listen() fails,
 * the new ID is handed back through @to_destroy and the caller must destroy
 * it after releasing `lock`; in all other cases *@to_destroy is NULL.
 *
 * Return: 0 on success, or when the device is skipped because the address
 * family is AF_IB and the device has no IB CM capability; otherwise an
 * error code.
 */
static int cma_listen_on_dev(struct rdma_id_private *id_priv,
                             struct cma_device *cma_dev,
                             struct rdma_id_private **to_destroy)
{
        struct rdma_id_private *dev_id_priv;
        struct net *net = id_priv->id.route.addr.dev_addr.net;
        int ret;

        lockdep_assert_held(&lock);

        *to_destroy = NULL;
        if (cma_family(id_priv) == AF_IB && !rdma_cap_ib_cm(cma_dev->device, 1))
                return 0;

        /* @id_priv is passed as the new ID's context; see cma_listen_handler(). */
        dev_id_priv =
                __rdma_create_id(net, cma_listen_handler, id_priv,
                                 id_priv->id.ps, id_priv->id.qp_type, id_priv);
        if (IS_ERR(dev_id_priv))
                return PTR_ERR(dev_id_priv);

        dev_id_priv->state = RDMA_CM_ADDR_BOUND;
        memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
               rdma_addr_size(cma_src_addr(id_priv)));

        _cma_attach_to_dev(dev_id_priv, cma_dev);
        rdma_restrack_add(&dev_id_priv->res);
        /*
         * The internal ID holds a reference on @id_priv; _destroy_id() drops
         * it again for IDs with internal_id set.
         */
        cma_id_get(id_priv);
        dev_id_priv->internal_id = 1;
        dev_id_priv->afonly = id_priv->afonly;
        mutex_lock(&id_priv->qp_mutex);
        dev_id_priv->tos_set = id_priv->tos_set;
        dev_id_priv->tos = id_priv->tos;
        mutex_unlock(&id_priv->qp_mutex);

        ret = rdma_listen(&dev_id_priv->id, id_priv->backlog);
        if (ret)
                goto err_listen;
        list_add_tail(&dev_id_priv->listen_item, &id_priv->listen_list);
        return 0;
err_listen:
        /* Caller must destroy this after releasing lock */
        *to_destroy = dev_id_priv;
        dev_warn(&cma_dev->device->dev, "RDMA CMA: %s, error %d\n", __func__, ret);
        return ret;
}

/*
 * Listen on every device in dev_list on behalf of @id_priv. The ID is added
 * to listen_any_list and cma_listen_on_dev() is called for each device, all
 * under the file's `lock` mutex.
 *
 * If any device fails, the listens created so far are cancelled with
 * _cma_cancel_listens() and the ID that failed to listen (if one was
 * created) is destroyed after `lock` has been released.
 *
 * Return: 0 on success, otherwise the error from cma_listen_on_dev().
 */
static int cma_listen_on_all(struct rdma_id_private *id_priv)
{
        struct rdma_id_private *to_destroy;
        struct cma_device *cma_dev;
        int ret;

        mutex_lock(&lock);
        list_add_tail(&id_priv->listen_any_item, &listen_any_list);
        list_for_each_entry(cma_dev, &dev_list, list) {
                ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy);
                if (ret) {
                        /* Prevent racing with cma_process_remove() */
                        if (to_destroy)
                                list_del_init(&to_destroy->device_item);
                        goto err_listen;
                }
        }
        mutex_unlock(&lock);
        return 0;

err_listen:
        _cma_cancel_listens(id_priv);
        mutex_unlock(&lock);
        /* rdma_destroy_id() is only called once `lock` has been dropped. */
        if (to_destroy)
                rdma_destroy_id(&to_destroy->id);
        return ret;
}

/**
 * rdma_set_service_type() - Set the type of service for a connection
 *                           identifier.
 * @id: Communication identifier to configure.
 * @tos: Type of service value; only the low 8 bits are kept.
 *
 * The value is stored, and marked as explicitly set, under the ID's
 * qp_mutex.
 */
void rdma_set_service_type(struct rdma_cm_id *id, int tos)
{
        struct rdma_id_private *id_priv =
                container_of(id, struct rdma_id_private, id);

        mutex_lock(&id_priv->qp_mutex);
        id_priv->tos_set = true;
        id_priv->tos = (u8) tos;
        mutex_unlock(&id_priv->qp_mutex);
}
EXPORT_SYMBOL(rdma_set_service_type);

/**
 * rdma_set_ack_timeout() - Set the ack timeout of QP associated
 *                          with a connection identifier.
 * @id: Communication identifier whose QP ack timeout is to be set.
 * @timeout: Ack timeout to set a QP, expressed as 4.096 * 2^(timeout) usec.
 *
 * This function should be called before rdma_connect() on the active side,
 * and before rdma_accept() on the passive side. It applies to the primary
 * path only. The timeout affects the local side of the QP; it is not
 * negotiated with the remote side, and zero disables the timer. If it is
 * set before rdma_resolve_route(), the value is also used to determine
 * PacketLifeTime for RoCE.
 *
 * Return: 0 for success, -EINVAL if the QP type is neither IB_QPT_RC nor
 * IB_QPT_XRC_INI.
 */
int rdma_set_ack_timeout(struct rdma_cm_id *id, u8 timeout)
{
        struct rdma_id_private *id_priv;

        switch (id->qp_type) {
        case IB_QPT_RC:
        case IB_QPT_XRC_INI:
                break;
        default:
                return -EINVAL;
        }

        id_priv = container_of(id, struct rdma_id_private, id);

        mutex_lock(&id_priv->qp_mutex);
        id_priv->timeout_set = true;
        id_priv->timeout = timeout;
        mutex_unlock(&id_priv->qp_mutex);

        return 0;
}
EXPORT_SYMBOL(rdma_set_ack_timeout);

/**
 * rdma_set_min_rnr_timer() - Set the minimum RNR Retry timer of the
 *                            QP associated with a connection identifier.
 * @id: Communication identifier whose QP RNR timer is to be set.
 * @min_rnr_timer: 5-bit value encoded as Table 45: "Encoding for RNR NAK
 *                 Timer Field" in the IBTA specification.
 *
 * This function should be called before rdma_connect() on the active
 * side, and before rdma_accept() on the passive side. The timer value
 * will be associated with the local QP. When it receives a send it is
 * not ready to handle, typically if the receive queue is empty, an RNR
 * Retry NAK is returned to the requester with the min_rnr_timer
 * encoded. The requester will then wait at least the time specified
 * in the NAK before retrying. The default is zero, which translates
 * to a minimum RNR Timer value of 655 ms.
 *
 * Return: 0 for success, -EINVAL if @min_rnr_timer does not fit in five
 * bits or the QP type is neither IB_QPT_RC nor IB_QPT_XRC_TGT (the latter
 * also triggers a WARN_ON).
 */
int rdma_set_min_rnr_timer(struct rdma_cm_id *id, u8 min_rnr_timer)
{
        struct rdma_id_private *id_priv;

        /* Only the low five bits may be set. */
        if (min_rnr_timer > 0x1f)
                return -EINVAL;

        if (WARN_ON(id->qp_type != IB_QPT_RC && id->qp_type != IB_QPT_XRC_TGT))
                return -EINVAL;

        id_priv = container_of(id, struct rdma_id_private, id);

        mutex_lock(&id_priv->qp_mutex);
        id_priv->min_rnr_timer_set = true;
        id_priv->min_rnr_timer = min_rnr_timer;
        mutex_unlock(&id_priv->qp_mutex);

        return 0;
}
EXPORT_SYMBOL(rdma_set_min_rnr_timer);

static int route_set_path_rec_inbound(struct cma_work *work,
                                      struct sa_path_rec *path_rec)
{
        struct rdma_route *route = &work->id->id.route;

        if (!route->path_rec_inbound) {
                route->path_rec_inbound =
                        kzalloc(sizeof(*route->path_rec_inbound), GFP_KERNEL);
                if (!route->path_rec_inbound)
                        return -ENOMEM;
        }

        *route->path_rec_inbound = *path_rec;
        return 0;
}

static int route_set_path_rec_outbound(struct cma_work *work,
                                       struct sa_path_rec *path_rec)
{
        struct rdma_route *route = &work->id->id.route;

        if (!route->path_rec_outbound) {
                route->path_rec_outbound =
                        kzalloc(sizeof(*route->path_rec_outbound), GFP_KERNEL);
                if (!route->path_rec_outbound)
                        return -ENOMEM;
        }

        *route->path_rec_outbound = *path_rec;
        return 0;
}

/*
 * Completion callback passed to ib_sa_path_rec_get() by
 * cma_query_ib_route().
 *
 * @status:   0 on success, otherwise the error reported for the query.
 * @path_rec: array of @num_prs returned path records.
 * @num_prs:  number of entries in @path_rec.
 * @context:  the struct cma_work that was handed to the query.
 *
 * On success every record is stored in the route according to its flags
 * (primary/GMP, inbound or outbound) and the work item is queued as
 * prepared by the submitter.  On any failure the work item is rewritten to
 * move the id from ROUTE_QUERY back to ADDR_RESOLVED and to report
 * RDMA_CM_EVENT_ROUTE_ERROR with the failing status, then queued.
 */
static void cma_query_handler(int status, struct sa_path_rec *path_rec,
                              unsigned int num_prs, void *context)
{
        struct cma_work *work = context;
        struct rdma_route *route;
        /* Same type as num_prs, so the loop bound compares like with like. */
        unsigned int i;

        route = &work->id->id.route;

        if (status)
                goto fail;

        for (i = 0; i < num_prs; i++) {
                /* No flags, or the GMP flag: this is the primary record. */
                if (!path_rec[i].flags || (path_rec[i].flags & IB_PATH_GMP))
                        *route->path_rec = path_rec[i];
                else if (path_rec[i].flags & IB_PATH_INBOUND)
                        status = route_set_path_rec_inbound(work, &path_rec[i]);
                else if (path_rec[i].flags & IB_PATH_OUTBOUND)
                        status = route_set_path_rec_outbound(work,
                                                             &path_rec[i]);
                else
                        status = -EINVAL;

                if (status)
                        goto fail;
        }

        route->num_pri_alt_paths = 1;
        queue_work(cma_wq, &work->work);
        return;

fail:
        /* Turn the queued work into a ROUTE_ERROR notification. */
        work->old_state = RDMA_CM_ROUTE_QUERY;
        work->new_state = RDMA_CM_ADDR_RESOLVED;
        work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
        work->event.status = status;
        pr_debug_ratelimited("RDMA CM: ROUTE_ERROR: failed to query path. status %d\n",
                             status);
        queue_work(cma_wq, &work->work);
}

static int cma_query_ib_route(struct rdma_id_private *id_priv,
                              unsigned long timeout_ms, struct cma_work *work)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        struct sa_path_rec path_rec;
        ib_sa_comp_mask comp_mask;
        struct sockaddr_in6 *sin6;
        struct sockaddr_ib *sib;

        memset(&path_rec, 0, sizeof path_rec);

        if (rdma_cap_opa_ah(id_priv->id.device, id_priv->id.port_num))
                path_rec.rec_type = SA_PATH_REC_TYPE_OPA;
        else
                path_rec.rec_type = SA_PATH_REC_TYPE_IB;
        rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
        rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
        path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
        path_rec.numb_path = 1;
        path_rec.reversible = 1;
        path_rec.service_id = rdma_get_service_id(&id_priv->id,
                                                  cma_dst_addr(id_priv));

        comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
                    IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
                    IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;

        switch (cma_family(id_priv)) {
        case AF_INET:
                path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
                comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
                break;
        case AF_INET6:
                sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
                path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
                comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
                break;
        case AF_IB:
                sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
                path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
                comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
                break;
        }

        id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
                                               id_priv->id.port_num, &path_rec,
                                               comp_mask, timeout_ms,
                                               GFP_KERNEL, cma_query_handler,
                                               work, &id_priv->query);

        return (id_priv->query_id < 0) ? id_priv->query_id : 0;
}

/*
 * Work handler for the iboe_join work embedded in a struct cma_multicast:
 * delivers the stored event to the id's event handler unless the id is
 * being destroyed or its device removed, then releases the AH attribute
 * carried by a MULTICAST_JOIN event.
 */
static void cma_iboe_join_work_handler(struct work_struct *work)
{
        struct cma_multicast *mc =
                container_of(work, struct cma_multicast, iboe_join.work);
        struct rdma_id_private *id_priv = mc->id_priv;
        struct rdma_cm_event *event = &mc->iboe_join.event;

        mutex_lock(&id_priv->handler_mutex);
        if (READ_ONCE(id_priv->state) != RDMA_CM_DESTROYING &&
            READ_ONCE(id_priv->state) != RDMA_CM_DEVICE_REMOVAL) {
                /* A non-zero handler result is only warned about here. */
                WARN_ON(cma_cm_event_handler(id_priv, event));
        }
        mutex_unlock(&id_priv->handler_mutex);

        if (event->event == RDMA_CM_EVENT_MULTICAST_JOIN)
                rdma_destroy_ah_attr(&event->param.ud.ah_attr);
}

/*
 * Workqueue callback for a queued struct cma_work.  Under handler_mutex it
 * optionally performs the old_state -> new_state transition recorded in
 * the work item, delivers work->event through cma_cm_event_handler(), and
 * finally frees the work item.  The id reference held on behalf of the
 * work (see the cma_id_get() calls made by the code that queues it) is
 * dropped on every path.
 */
static void cma_work_handler(struct work_struct *_work)
{
        struct cma_work *work = container_of(_work, struct cma_work, work);
        struct rdma_id_private *id_priv = work->id;

        mutex_lock(&id_priv->handler_mutex);
        /* Nothing is delivered once the id is being destroyed or removed. */
        if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
            READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
                goto out_unlock;
        /* A work item with both states zero carries no state transition. */
        if (work->old_state != 0 || work->new_state != 0) {
                /* Skip the event if the id is no longer in old_state. */
                if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
                        goto out_unlock;
        }

        if (cma_cm_event_handler(id_priv, &work->event)) {
                /*
                 * Non-zero from the event handler: drop the work's
                 * reference and destroy the id.  The explicit
                 * mutex_unlock() below is skipped on this path;
                 * presumably destroy_id_handler_unlock() releases
                 * handler_mutex itself (it is defined outside this
                 * section -- confirm there).
                 */
                cma_id_put(id_priv);
                destroy_id_handler_unlock(id_priv);
                goto out_free;
        }

out_unlock:
        mutex_unlock(&id_priv->handler_mutex);
        cma_id_put(id_priv);
out_free:
        /* A MULTICAST_JOIN event carries an AH attribute to release. */
        if (work->event.event == RDMA_CM_EVENT_MULTICAST_JOIN)
                rdma_destroy_ah_attr(&work->event.param.ud.ah_attr);
        kfree(work);
}

/*
 * Prepare @work so that, when run by cma_work_handler(), it moves @id_priv
 * from ROUTE_QUERY to ROUTE_RESOLVED and reports ROUTE_RESOLVED.  The work
 * is only initialised here; queueing it is left to the caller.
 */
static void cma_init_resolve_route_work(struct cma_work *work,
                                        struct rdma_id_private *id_priv)
{
        INIT_WORK(&work->work, cma_work_handler);
        work->id = id_priv;
        work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
        work->new_state = RDMA_CM_ROUTE_RESOLVED;
        work->old_state = RDMA_CM_ROUTE_QUERY;
}

/*
 * Fill in @work to move @id_priv from ADDR_QUERY to ADDR_RESOLVED with an
 * ADDR_RESOLVED event, and queue it on cma_wq.
 */
static void enqueue_resolve_addr_work(struct cma_work *work,
                                      struct rdma_id_private *id_priv)
{
        /* This reference is dropped by cma_id_put() in cma_work_handler */
        cma_id_get(id_priv);

        INIT_WORK(&work->work, cma_work_handler);
        work->id = id_priv;
        work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
        work->new_state = RDMA_CM_ADDR_RESOLVED;
        work->old_state = RDMA_CM_ADDR_QUERY;

        queue_work(cma_wq, &work->work);
}

/*
 * Start route resolution through an SA path query: allocate the work item
 * and (if not already present) the primary path record, then submit the
 * query.  On failure the path record and the work item are freed and the
 * error is returned.
 */
static int cma_resolve_ib_route(struct rdma_id_private *id_priv,
                                unsigned long timeout_ms)
{
        struct rdma_route *rt = &id_priv->id.route;
        struct cma_work *route_work;
        int err;

        route_work = kzalloc(sizeof(*route_work), GFP_KERNEL);
        if (route_work == NULL)
                return -ENOMEM;

        cma_init_resolve_route_work(route_work, id_priv);

        if (rt->path_rec == NULL) {
                rt->path_rec = kmalloc(sizeof(*rt->path_rec), GFP_KERNEL);
                if (rt->path_rec == NULL) {
                        err = -ENOMEM;
                        goto free_work;
                }
        }

        err = cma_query_ib_route(id_priv, timeout_ms, route_work);
        if (err == 0)
                return 0;

        kfree(rt->path_rec);
        rt->path_rec = NULL;
free_work:
        kfree(route_work);
        return err;
}

static enum ib_gid_type cma_route_gid_type(enum rdma_network_type network_type,
                                           unsigned long supported_gids,
                                           enum ib_gid_type default_gid)
{
        if ((network_type == RDMA_NETWORK_IPV4 ||
             network_type == RDMA_NETWORK_IPV6) &&
            test_bit(IB_GID_TYPE_ROCE_UDP_ENCAP, &supported_gids))
                return IB_GID_TYPE_ROCE_UDP_ENCAP;

        return default_gid;
}

/*
 * cma_iboe_set_path_rec_l2_fields() is helper function which sets
 * path record type based on GID type.
 * It also sets up other L2 fields which includes destination mac address
 * netdev ifindex, of the path record.
 * It returns the netdev of the bound interface for this path record entry.
 */
static struct net_device *
cma_iboe_set_path_rec_l2_fields(struct rdma_id_private *id_priv)
{
        struct rdma_route *route = &id_priv->id.route;
        enum ib_gid_type gid_type = IB_GID_TYPE_ROCE;
        struct rdma_addr *addr = &route->addr;
        unsigned long supported_gids;
        struct net_device *ndev;

        if (!addr->dev_addr.bound_dev_if)
                return NULL;

        ndev = dev_get_by_index(addr->dev_addr.net,
                                addr->dev_addr.bound_dev_if);
        if (!ndev)
                return NULL;

        supported_gids = roce_gid_type_mask_support(id_priv->id.device,
                                                    id_priv->id.port_num);
        gid_type = cma_route_gid_type(addr->dev_addr.network,
                                      supported_gids,
                                      id_priv->gid_type);
        /* Use the hint from IP Stack to select GID Type */
        if (gid_type < ib_network_to_gid_type(addr->dev_addr.network))
                gid_type = ib_network_to_gid_type(addr->dev_addr.network);
        route->path_rec->rec_type = sa_conv_gid_to_pathrec_type(gid_type);

        route->path_rec->roce.route_resolved = true;
        sa_path_set_dmac(route->path_rec, addr->dev_addr.dst_dev_addr);
        return ndev;
}

/*
 * Install a caller-provided path record on @id.  The id must be in
 * ADDR_RESOLVED and is moved to ROUTE_RESOLVED; on any failure the state
 * is moved back.  For RoCE ports the L2 fields of the duplicated record
 * are filled in as well.
 *
 * Returns 0 on success, -EINVAL if the id is in the wrong state, -ENOMEM
 * if the record cannot be duplicated, or -ENODEV if no bound netdev is
 * found on a RoCE port.
 */
int rdma_set_ib_path(struct rdma_cm_id *id,
                     struct sa_path_rec *path_rec)
{
        struct rdma_id_private *priv =
                container_of(id, struct rdma_id_private, id);
        struct net_device *netdev;
        int rc;

        if (!cma_comp_exch(priv, RDMA_CM_ADDR_RESOLVED,
                           RDMA_CM_ROUTE_RESOLVED))
                return -EINVAL;

        id->route.path_rec = kmemdup(path_rec, sizeof(*path_rec), GFP_KERNEL);
        if (id->route.path_rec == NULL) {
                rc = -ENOMEM;
                goto revert_state;
        }

        if (rdma_protocol_roce(id->device, id->port_num)) {
                netdev = cma_iboe_set_path_rec_l2_fields(priv);
                if (netdev == NULL) {
                        rc = -ENODEV;
                        goto free_path;
                }
                /* The netdev was only needed for the lookup itself. */
                dev_put(netdev);
        }

        id->route.num_pri_alt_paths = 1;
        return 0;

free_path:
        kfree(id->route.path_rec);
        id->route.path_rec = NULL;
revert_state:
        cma_comp_exch(priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
        return rc;
}
EXPORT_SYMBOL(rdma_set_ib_path);

static int cma_resolve_iw_route(struct rdma_id_private *id_priv)
{
        struct cma_work *work;

        work = kzalloc(sizeof *work, GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        cma_init_resolve_route_work(work, id_priv);
        queue_work(cma_wq, &work->work);
        return 0;
}

/*
 * Map @prio to a traffic class for a VLAN netdev.  If the underlying real
 * device has traffic classes configured (num_tc != 0) its prio->tc map is
 * used; otherwise the VLAN priority bits of the egress QoS mask are
 * returned.
 */
static int get_vlan_ndev_tc(struct net_device *vlan_ndev, int prio)
{
        struct net_device *real_dev = vlan_dev_real_dev(vlan_ndev);

        if (!real_dev->num_tc)
                return (vlan_dev_get_egress_qos_mask(vlan_ndev, prio) &
                        VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;

        return netdev_get_prio_tc_map(real_dev, prio);
}

/*
 * Scratch state passed (via netdev_nested_priv.data) from iboe_tos_to_sl()
 * to the get_lower_vlan_dev_tc() callback while walking lower devices.
 */
struct iboe_prio_tc_map {
        int input_prio;         /* priority to translate; set by the caller */
        int output_tc;          /* traffic class chosen by the callback */
        bool found;             /* set to true once the callback has run */
};

/*
 * Callback for netdev_walk_all_lower_dev_rcu(): translate map->input_prio
 * into a traffic class using the visited lower device and record it in the
 * struct iboe_prio_tc_map carried by @priv.
 */
static int get_lower_vlan_dev_tc(struct net_device *dev,
                                 struct netdev_nested_priv *priv)
{
        struct iboe_prio_tc_map *map = (struct iboe_prio_tc_map *)priv->data;
        int tc = 0;

        if (is_vlan_dev(dev))
                tc = get_vlan_ndev_tc(dev, map->input_prio);
        else if (dev->num_tc)
                tc = netdev_get_prio_tc_map(dev, map->input_prio);

        map->output_tc = tc;
        map->found = true;

        /* Only the first level device is of interest, so always return 1
         * to stop the walk from descending to further levels.
         */
        return 1;
}

static int iboe_tos_to_sl(struct net_device *ndev, int tos)
{
        struct iboe_prio_tc_map prio_tc_map = {};
        int prio = rt_tos2priority(tos);
        struct netdev_nested_priv priv;

        /* If VLAN device, get it directly from the VLAN netdev */
        if (is_vlan_dev(ndev))
                return get_vlan_ndev_tc(ndev, prio);

        prio_tc_map.input_prio = prio;
        priv.data = (void *)&prio_tc_map;
        rcu_read_lock();
        netdev_walk_all_lower_dev_rcu(ndev,
                                      get_lower_vlan_dev_tc,
                                      &priv);
        rcu_read_unlock();
        /* If map is found from lower device, use it; Otherwise
         * continue with the current netdevice to get priority to tc map.
         */
        if (prio_tc_map.found)
                return prio_tc_map.output_tc;
        else if (ndev->num_tc)
                return netdev_get_prio_tc_map(ndev, prio);
        else
                return 0;
}

static __be32 cma_get_roce_udp_flow_label(struct rdma_id_private *id_priv)
{
        struct sockaddr_in6 *addr6;
        u16 dport, sport;
        u32 hash, fl;

        addr6 = (struct sockaddr_in6 *)cma_src_addr(id_priv);
        fl = be32_to_cpu(addr6->sin6_flowinfo) & IB_GRH_FLOWLABEL_MASK;
        if ((cma_family(id_priv) != AF_INET6) || !fl) {
                dport = be16_to_cpu(cma_port(cma_dst_addr(id_priv)));
                sport = be16_to_cpu(cma_port(cma_src_addr(id_priv)));
                hash = (u32)sport * 31 + dport;
                fl = hash & IB_GRH_FLOWLABEL_MASK;
        }

        return cpu_to_be32(fl);
}

/*
 * RoCE route resolution.  No SA query is issued: the primary path record
 * is allocated and filled in locally from the resolved addresses, the
 * bound netdev and the id's ToS/timeout settings, and a ROUTE_RESOLVED
 * work item is queued.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -ENODEV when
 * cma_iboe_set_path_rec_l2_fields() returns no netdev, or -EINVAL when
 * the MTU derived from the netdev is zero.  On failure the path record
 * and the work item are freed.
 */
static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
{
        struct rdma_route *route = &id_priv->id.route;
        struct rdma_addr *addr = &route->addr;
        struct cma_work *work;
        int ret;
        struct net_device *ndev;

        /* Per-port default ToS, indexed relative to the device's first port */
        u8 default_roce_tos = id_priv->cma_dev->default_roce_tos[id_priv->id.port_num -
                                        rdma_start_port(id_priv->cma_dev->device)];
        u8 tos;

        /* A ToS explicitly set on the id takes precedence over the default */
        mutex_lock(&id_priv->qp_mutex);
        tos = id_priv->tos_set ? id_priv->tos : default_roce_tos;
        mutex_unlock(&id_priv->qp_mutex);

        work = kzalloc(sizeof *work, GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
        if (!route->path_rec) {
                ret = -ENOMEM;
                goto err1;
        }

        route->num_pri_alt_paths = 1;

        /* On success ndev is released by the dev_put() further down */
        ndev = cma_iboe_set_path_rec_l2_fields(id_priv);
        if (!ndev) {
                ret = -ENODEV;
                goto err2;
        }

        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
                    &route->path_rec->sgid);
        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
                    &route->path_rec->dgid);

        if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB)
                /* TODO: get the hoplimit from the inet/inet6 device */
                route->path_rec->hop_limit = addr->dev_addr.hoplimit;
        else
                route->path_rec->hop_limit = 1;
        route->path_rec->reversible = 1;
        route->path_rec->pkey = cpu_to_be16(0xffff);
        route->path_rec->mtu_selector = IB_SA_EQ;
        route->path_rec->sl = iboe_tos_to_sl(ndev, tos);
        route->path_rec->traffic_class = tos;
        route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
        route->path_rec->rate_selector = IB_SA_EQ;
        route->path_rec->rate = IB_RATE_PORT_CURRENT;
        /* Last use of ndev: nothing below this point touches it */
        dev_put(ndev);
        route->path_rec->packet_life_time_selector = IB_SA_EQ;
        /* In case ACK timeout is set, use this value to calculate
         * PacketLifeTime.  As per IBTA 12.7.34,
         * local ACK timeout = (2 * PacketLifeTime + Local CA's ACK delay).
         * Assuming a negligible local ACK delay, we can use
         * PacketLifeTime = local ACK timeout/2
         * as a reasonable approximation for RoCE networks.
         */
        mutex_lock(&id_priv->qp_mutex);
        if (id_priv->timeout_set && id_priv->timeout)
                route->path_rec->packet_life_time = id_priv->timeout - 1;
        else
                route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
        mutex_unlock(&id_priv->qp_mutex);

        /* A zero MTU from iboe_get_mtu() is treated as an error */
        if (!route->path_rec->mtu) {
                ret = -EINVAL;
                goto err2;
        }

        if (rdma_protocol_roce_udp_encap(id_priv->id.device,
                                         id_priv->id.port_num))
                route->path_rec->flow_label =
                        cma_get_roce_udp_flow_label(id_priv);

        cma_init_resolve_route_work(work, id_priv);
        queue_work(cma_wq, &work->work);

        return 0;

err2:
        /* Undo the path record setup done above */
        kfree(route->path_rec);
        route->path_rec = NULL;
        route->num_pri_alt_paths = 0;
err1:
        kfree(work);
        return ret;
}

/*
 * Start route resolution for @id.  The id must be in ADDR_RESOLVED and is
 * moved to ROUTE_QUERY; the transport-specific helper (IB SA query, RoCE,
 * or iWARP) is then invoked.  A reference on the id is taken before the
 * helper runs and is dropped again here if the helper fails, in which
 * case the state is moved back to ADDR_RESOLVED.
 *
 * Returns 0 on success, -EINVAL for a zero @timeout_ms or a wrong state,
 * -ENOSYS when the port supports none of the three transports, or the
 * helper's error code.
 */
int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms)
{
        struct rdma_id_private *priv;
        int rc;

        if (timeout_ms == 0)
                return -EINVAL;

        priv = container_of(id, struct rdma_id_private, id);
        if (!cma_comp_exch(priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
                return -EINVAL;

        cma_id_get(priv);

        if (rdma_cap_ib_sa(id->device, id->port_num)) {
                rc = cma_resolve_ib_route(priv, timeout_ms);
        } else if (rdma_protocol_roce(id->device, id->port_num)) {
                rc = cma_resolve_iboe_route(priv);
                if (rc == 0)
                        cma_add_id_to_tree(priv);
        } else if (rdma_protocol_iwarp(id->device, id->port_num)) {
                rc = cma_resolve_iw_route(priv);
        } else {
                rc = -ENOSYS;
        }

        if (rc == 0)
                return 0;

        /* Failure: restore the state and give back the reference. */
        cma_comp_exch(priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
        cma_id_put(priv);
        return rc;
}
EXPORT_SYMBOL(rdma_resolve_route);

/*
 * Overwrite the address part of @addr with the loopback address of its
 * family.  Any family other than AF_INET and AF_INET6 is handled as a
 * struct sockaddr_ib.
 */
static void cma_set_loopback(struct sockaddr *addr)
{
        if (addr->sa_family == AF_INET) {
                struct sockaddr_in *sin = (struct sockaddr_in *) addr;

                sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
        } else if (addr->sa_family == AF_INET6) {
                struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;

                ipv6_addr_set(&sin6->sin6_addr, 0, 0, 0, htonl(1));
        } else {
                struct sockaddr_ib *sib = (struct sockaddr_ib *) addr;

                ib_addr_set(&sib->sib_addr, 0, 0, 0, htonl(1));
        }
}

/*
 * Bind @id_priv to a local device/port for loopback use.  Under the global
 * 'lock' the device list is scanned for a port whose cached state is
 * IB_PORT_ACTIVE; if none is found, port 1 of the first eligible device is
 * used.  The port's GID index 0 and pkey index 0 become the id's source
 * GID and pkey, the id is attached to the device, and its source address
 * is set to loopback.
 *
 * Returns 0 on success, -ENODEV if no eligible device exists, or the error
 * from rdma_query_gid()/ib_get_cached_pkey().
 */
static int cma_bind_loopback(struct rdma_id_private *id_priv)
{
        struct cma_device *cma_dev, *cur_dev;
        union ib_gid gid;
        enum ib_port_state port_state;
        unsigned int p;
        u16 pkey;
        int ret;

        cma_dev = NULL;
        mutex_lock(&lock);
        list_for_each_entry(cur_dev, &dev_list, list) {
                /*
                 * For AF_IB ids, skip devices without IB CM capability.
                 * NOTE(review): the capability is checked on port 1 only.
                 */
                if (cma_family(id_priv) == AF_IB &&
                    !rdma_cap_ib_cm(cur_dev->device, 1))
                        continue;

                /* Remember the first eligible device as the fallback */
                if (!cma_dev)
                        cma_dev = cur_dev;

                rdma_for_each_port (cur_dev->device, p) {
                        if (!ib_get_cached_port_state(cur_dev->device, p, &port_state) &&
                            port_state == IB_PORT_ACTIVE) {
                                /* Active port found: p keeps its value */
                                cma_dev = cur_dev;
                                goto port_found;
                        }
                }
        }

        if (!cma_dev) {
                ret = -ENODEV;
                goto out;
        }

        /* No active port anywhere: fall back to port 1 of the first device */
        p = 1;

port_found:
        ret = rdma_query_gid(cma_dev->device, p, 0, &gid);
        if (ret)
                goto out;

        ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
        if (ret)
                goto out;

        id_priv->id.route.addr.dev_addr.dev_type =
                (rdma_protocol_ib(cma_dev->device, p)) ?
                ARPHRD_INFINIBAND : ARPHRD_ETHER;

        rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
        ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
        id_priv->id.port_num = p;
        cma_attach_to_dev(id_priv, cma_dev);
        rdma_restrack_add(&id_priv->res);
        cma_set_loopback(cma_src_addr(id_priv));
out:
        mutex_unlock(&lock);
        return ret;
}

/*
 * Address-resolution completion callback.
 *
 * @status:   0 on success, otherwise the resolution error.
 * @src_addr: resolved source address, copied into the id's source address.
 * @dev_addr: not referenced in this function.
 * @context:  the struct rdma_id_private the resolution was started for.
 *
 * Runs under handler_mutex.  The id is moved from ADDR_QUERY to
 * ADDR_RESOLVED; if it was not in ADDR_QUERY nothing else happens.  On
 * success an ADDR_RESOLVED event is delivered.  On failure (of the
 * resolution itself or of acquiring a device) the previous source address
 * is restored, the state is moved to ADDR_BOUND and an ADDR_ERROR event
 * carrying the status is delivered.
 */
static void addr_handler(int status, struct sockaddr *src_addr,
                         struct rdma_dev_addr *dev_addr, void *context)
{
        struct rdma_id_private *id_priv = context;
        struct rdma_cm_event event = {};
        struct sockaddr *addr;
        struct sockaddr_storage old_addr;

        mutex_lock(&id_priv->handler_mutex);
        if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
                           RDMA_CM_ADDR_RESOLVED))
                goto out;

        /*
         * Store the previous src address, so that if we fail to acquire
         * matching rdma device, old address can be restored back, which helps
         * to cancel the cma listen operation correctly.
         */
        addr = cma_src_addr(id_priv);
        memcpy(&old_addr, addr, rdma_addr_size(addr));
        memcpy(addr, src_addr, rdma_addr_size(src_addr));
        /* Resolution succeeded but the id has no device yet: acquire one */
        if (!status && !id_priv->cma_dev) {
                status = cma_acquire_dev_by_src_ip(id_priv);
                if (status)
                        pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. status %d\n",
                                             status);
                /* Called whether or not the device was acquired */
                rdma_restrack_add(&id_priv->res);
        } else if (status) {
                pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to resolve IP. status %d\n", status);
        }

        if (status) {
                /* Put the saved source address back before reporting */
                memcpy(addr, &old_addr,
                       rdma_addr_size((struct sockaddr *)&old_addr));
                if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
                                   RDMA_CM_ADDR_BOUND))
                        goto out;
                event.event = RDMA_CM_EVENT_ADDR_ERROR;
                event.status = status;
        } else
                event.event = RDMA_CM_EVENT_ADDR_RESOLVED;

        if (cma_cm_event_handler(id_priv, &event)) {
                /*
                 * Non-zero from the event handler: destroy the id.  No
                 * mutex_unlock() follows on this path; presumably
                 * destroy_id_handler_unlock() releases handler_mutex
                 * (defined outside this section -- confirm there).
                 */
                destroy_id_handler_unlock(id_priv);
                return;
        }
out:
        mutex_unlock(&id_priv->handler_mutex);
}

static int cma_resolve_loopback(struct rdma_id_private *id_priv)
{
        struct cma_work *work;
        union ib_gid gid;
        int ret;

        work = kzalloc(sizeof *work, GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        if (!id_priv->cma_dev) {
                ret = cma_bind_loopback(id_priv);
                if (ret)
                        goto err;
        }

        rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
        rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);

        enqueue_resolve_addr_work(work, id_priv);
        return 0;
err:
        kfree(work);
        return ret;
}

static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
{
        struct cma_work *work;
        int ret;

        work = kzalloc(sizeof *work, GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        if (!id_priv->cma_dev) {
                ret = cma_resolve_ib_dev(id_priv);
                if (ret)
                        goto err;
        }

        rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
                &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));

        enqueue_resolve_addr_work(work, id_priv);
        return 0;
err:
        kfree(work);
        return ret;
}

int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
{
        struct rdma_id_private *id_priv;
        unsigned long flags;
        int ret;

        id_priv = container_of(id, struct rdma_id_private, id);
        spin_lock_irqsave(&id_priv->lock, flags);
        if ((reuse && id_priv->state != RDMA_CM_LISTEN) ||
            id_priv->state == RDMA_CM_IDLE) {
                id_priv->reuseaddr = reuse;
                ret = 0;
        } else {
                ret = -EINVAL;
        }
        spin_unlock_irqrestore(&id_priv->lock, flags);
        return ret;
}
EXPORT_SYMBOL(rdma_set_reuseaddr);

/*
 * Set the afonly option of @id and mark it as explicitly configured in
 * the options bitmap.  Only allowed while the id is idle or address-bound;
 * otherwise -EINVAL is returned and nothing is modified.
 */
int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
{
        struct rdma_id_private *priv =
                container_of(id, struct rdma_id_private, id);
        unsigned long irq_flags;
        int rc = -EINVAL;

        spin_lock_irqsave(&priv->lock, irq_flags);
        switch (priv->state) {
        case RDMA_CM_IDLE:
        case RDMA_CM_ADDR_BOUND:
                priv->options |= (1 << CMA_OPTION_AFONLY);
                priv->afonly = afonly;
                rc = 0;
                break;
        default:
                break;
        }
        spin_unlock_irqrestore(&priv->lock, irq_flags);

        return rc;
}
EXPORT_SYMBOL(rdma_set_afonly);

/*
 * Attach @id_priv to @bind_list: write the list's port into the id's
 * source address (for AF_IB, into the low bits of the service id, with the
 * service id mask set to all ones) and add the id to the list's owners.
 * The caller must hold 'lock'.
 */
static void cma_bind_port(struct rdma_bind_list *bind_list,
                          struct rdma_id_private *id_priv)
{
        struct sockaddr *src;
        __be16 be_port;

        lockdep_assert_held(&lock);

        src = cma_src_addr(id_priv);
        be_port = htons(bind_list->port);

        if (src->sa_family == AF_INET) {
                ((struct sockaddr_in *) src)->sin_port = be_port;
        } else if (src->sa_family == AF_INET6) {
                ((struct sockaddr_in6 *) src)->sin6_port = be_port;
        } else if (src->sa_family == AF_IB) {
                struct sockaddr_ib *sib = (struct sockaddr_ib *) src;
                u64 sid = be64_to_cpu(sib->sib_sid);
                u64 mask = be64_to_cpu(sib->sib_sid_mask);

                sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(be_port));
                sib->sib_sid_mask = cpu_to_be64(~0ULL);
        }

        id_priv->bind_list = bind_list;
        hlist_add_head(&id_priv->node, &bind_list->owners);
}

static int cma_alloc_port(enum rdma_ucm_port_space ps,
                          struct rdma_id_private *id_priv, unsigned short snum)
{
        struct rdma_bind_list *bind_list;
        int ret;

        lockdep_assert_held(&lock);

        bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
        if (!bind_list)
                return -ENOMEM;

        ret = cma_ps_alloc(id_priv->id.route.addr.dev_addr.net, ps, bind_list,
                           snum);
        if (ret < 0)
                goto err;

        bind_list->ps = ps;
        bind_list->port = snum;
        cma_bind_port(bind_list, id_priv);
        return 0;
err:
        kfree(bind_list);
        return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
}

/*
 * Check whether @id_priv can share the port of @bind_list with the ids
 * already bound to it.  Another owner is compatible when its destination
 * port, source address or destination address provably differs from ours
 * (wildcards never count as different).  The caller must hold 'lock'.
 *
 * Returns 0 if every other owner is compatible, -EADDRNOTAVAIL otherwise.
 */
static int cma_port_is_unique(struct rdma_bind_list *bind_list,
                              struct rdma_id_private *id_priv)
{
        struct sockaddr *my_dst = cma_dst_addr(id_priv);
        struct sockaddr *my_src = cma_src_addr(id_priv);
        __be16 my_dport = cma_port(my_dst);
        struct rdma_id_private *other;

        lockdep_assert_held(&lock);

        hlist_for_each_entry(other, &bind_list->owners, node) {
                struct sockaddr *other_dst, *other_src;

                if (other == id_priv)
                        continue;

                other_dst = cma_dst_addr(other);
                other_src = cma_src_addr(other);

                /* both destination ports are specific and they differ */
                if (!cma_any_port(my_dst) && !cma_any_port(other_dst) &&
                    my_dport != cma_port(other_dst))
                        continue;

                /* both source addresses are specific and they differ */
                if (!cma_any_addr(my_src) && !cma_any_addr(other_src) &&
                    cma_addr_cmp(my_src, other_src))
                        continue;

                /* both destination addresses are specific and they differ */
                if (!cma_any_addr(my_dst) && !cma_any_addr(other_dst) &&
                    cma_addr_cmp(my_dst, other_dst))
                        continue;

                return -EADDRNOTAVAIL;
        }

        return 0;
}

/*
 * Bind @id_priv to any free port of port space @ps within the local port
 * range of its network namespace.  The search starts at a random port in
 * the range and walks upwards, wrapping to the low end, for at most one
 * pass over the range.  The caller must hold 'lock'.
 *
 * Returns 0 on success, -EADDRNOTAVAIL if no port in the range could be
 * used, or any other error returned while trying a port.
 */
static int cma_alloc_any_port(enum rdma_ucm_port_space ps,
                              struct rdma_id_private *id_priv)
{
        /* Shared across calls; updated only after a successful bind */
        static unsigned int last_used_port;
        int low, high, remaining;
        unsigned int rover;
        struct net *net = id_priv->id.route.addr.dev_addr.net;

        lockdep_assert_held(&lock);

        inet_get_local_port_range(net, &low, &high);
        remaining = (high - low) + 1;
        /* Random starting point within [low, high] */
        rover = get_random_u32_inclusive(low, remaining + low - 1);
retry:
        /* The most recently handed-out port is skipped */
        if (last_used_port != rover) {
                struct rdma_bind_list *bind_list;
                int ret;

                bind_list = cma_ps_find(net, ps, (unsigned short)rover);

                if (!bind_list) {
                        /* Port unused so far: create its bind list */
                        ret = cma_alloc_port(ps, id_priv, rover);
                } else {
                        /* Port in use: share it only if we stay unique */
                        ret = cma_port_is_unique(bind_list, id_priv);
                        if (!ret)
                                cma_bind_port(bind_list, id_priv);
                }
                /*
                 * Remember previously used port number in order to avoid
                 * re-using same port immediately after it is closed.
                 */
                if (!ret)
                        last_used_port = rover;
                /* Only -EADDRNOTAVAIL moves on to the next candidate */
                if (ret != -EADDRNOTAVAIL)
                        return ret;
        }
        if (--remaining) {
                rover++;
                /* Wrap around to the low end of the range */
                if ((rover < low) || (rover > high))
                        rover = low;
                goto retry;
        }
        return -EADDRNOTAVAIL;
}

/*
 * Check that the requested port is available.  This is used both when
 * binding to a specific port and when starting to listen on a port that is
 * already bound.  In the second case @id_priv may already be on
 * @bind_list; it is skipped, but every other owner must still be checked
 * before listening is allowed.  The caller must hold 'lock'.
 *
 * Returns 0 if there is no conflict, -EADDRNOTAVAIL if either side uses a
 * wildcard address, or -EADDRINUSE if the source addresses are equal.
 */
static int cma_check_port(struct rdma_bind_list *bind_list,
                          struct rdma_id_private *id_priv, uint8_t reuseaddr)
{
        struct sockaddr *wanted = cma_src_addr(id_priv);
        struct rdma_id_private *other;

        lockdep_assert_held(&lock);

        hlist_for_each_entry(other, &bind_list->owners, node) {
                struct sockaddr *taken;

                /* Ourselves, or both sides agreed to address reuse */
                if (other == id_priv || (reuseaddr && other->reuseaddr))
                        continue;

                taken = cma_src_addr(other);

                /* Both are family-only and the families differ */
                if (id_priv->afonly && other->afonly &&
                    wanted->sa_family != taken->sa_family)
                        continue;

                if (cma_any_addr(wanted) || cma_any_addr(taken))
                        return -EADDRNOTAVAIL;

                if (cma_addr_cmp(wanted, taken) == 0)
                        return -EADDRINUSE;
        }

        return 0;
}

/*
 * Bind id_priv to the specific port number carried in its source address.
 *
 * Ports below PROT_SOCK need CAP_NET_BIND_SERVICE (-EACCES otherwise).  If no
 * bind list exists yet for the port one is allocated; otherwise the ID joins
 * the existing list once cma_check_port() approves.  Caller holds "lock".
 */
static int cma_use_port(enum rdma_ucm_port_space ps,
                        struct rdma_id_private *id_priv)
{
        struct rdma_bind_list *bl;
        unsigned short port;
        int err;

        lockdep_assert_held(&lock);

        port = ntohs(cma_port(cma_src_addr(id_priv)));
        if (port < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
                return -EACCES;

        bl = cma_ps_find(id_priv->id.route.addr.dev_addr.net, ps, port);
        if (!bl)
                return cma_alloc_port(ps, id_priv, port);

        err = cma_check_port(bl, id_priv, id_priv->reuseaddr);
        if (err)
                return err;

        cma_bind_port(bl, id_priv);
        return 0;
}

static enum rdma_ucm_port_space
cma_select_inet_ps(struct rdma_id_private *id_priv)
{
        switch (id_priv->id.ps) {
        case RDMA_PS_TCP:
        case RDMA_PS_UDP:
        case RDMA_PS_IPOIB:
        case RDMA_PS_IB:
                return id_priv->id.ps;
        default:

                return 0;
        }
}

static enum rdma_ucm_port_space
cma_select_ib_ps(struct rdma_id_private *id_priv)
{
        enum rdma_ucm_port_space ps = 0;
        struct sockaddr_ib *sib;
        u64 sid_ps, mask, sid;

        sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
        mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
        sid = be64_to_cpu(sib->sib_sid) & mask;

        if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
                sid_ps = RDMA_IB_IP_PS_IB;
                ps = RDMA_PS_IB;
        } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
                   (sid == (RDMA_IB_IP_PS_TCP & mask))) {
                sid_ps = RDMA_IB_IP_PS_TCP;
                ps = RDMA_PS_TCP;
        } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
                   (sid == (RDMA_IB_IP_PS_UDP & mask))) {
                sid_ps = RDMA_IB_IP_PS_UDP;
                ps = RDMA_PS_UDP;
        }

        if (ps) {
                sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib)));
                sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
                                                be64_to_cpu(sib->sib_sid_mask));
        }
        return ps;
}

/*
 * Reserve a port for id_priv.
 *
 * Picks the port space according to the address family, then, under "lock",
 * either allocates any free port (when the source address carries a wildcard
 * port) or claims the requested one.  Returns -EPROTONOSUPPORT when no port
 * space applies, otherwise the result of the allocation.
 */
static int cma_get_port(struct rdma_id_private *id_priv)
{
        enum rdma_ucm_port_space ps;
        int ret;

        ps = (cma_family(id_priv) == AF_IB) ? cma_select_ib_ps(id_priv) :
                                              cma_select_inet_ps(id_priv);
        if (!ps)
                return -EPROTONOSUPPORT;

        mutex_lock(&lock);
        ret = cma_any_port(cma_src_addr(id_priv)) ?
                      cma_alloc_any_port(ps, id_priv) :
                      cma_use_port(ps, id_priv);
        mutex_unlock(&lock);

        return ret;
}

/*
 * For an IPv6 link-local address, require a scope id and record it as the
 * bound interface of dev_addr.  Returns -EINVAL when a link-local address has
 * no scope id, and 0 in every other case (including !CONFIG_IPV6).
 */
static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
                               struct sockaddr *addr)
{
#if IS_ENABLED(CONFIG_IPV6)
        if (addr->sa_family == AF_INET6) {
                struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;

                if (ipv6_addr_type(&in6->sin6_addr) & IPV6_ADDR_LINKLOCAL) {
                        if (!in6->sin6_scope_id)
                                return -EINVAL;

                        dev_addr->bound_dev_if = in6->sin6_scope_id;
                }
        }
#endif
        return 0;
}

/**
 * rdma_listen - Put an rdma_cm_id into the listening state.
 * @id: Identifier to listen on.
 * @backlog: Stored in id_priv->backlog; also handed to cma_iw_listen() on the
 *   iWARP path.
 *
 * An ID that is not already in RDMA_CM_ADDR_BOUND is first bound to the IPv4
 * wildcard address through rdma_bind_addr().
 *
 * Return: 0 on success or a negative errno.  Failures while entering the
 * listen state (the implicit bind, or the state exchange that follows it)
 * return directly; every later failure goes through the err label, which
 * clears the backlog and moves the state back to RDMA_CM_ADDR_BOUND.
 */
int rdma_listen(struct rdma_cm_id *id, int backlog)
{
        struct rdma_id_private *id_priv =
                container_of(id, struct rdma_id_private, id);
        int ret;

        if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN)) {
                struct sockaddr_in any_in = {
                        .sin_family = AF_INET,
                        .sin_addr.s_addr = htonl(INADDR_ANY),
                };

                /* For a well behaved ULP state will be RDMA_CM_IDLE */
                ret = rdma_bind_addr(id, (struct sockaddr *)&any_in);
                if (ret)
                        return ret;
                /* The bind above left us in ADDR_BOUND, so retry the switch. */
                if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
                                           RDMA_CM_LISTEN)))
                        return -EINVAL;
        }

        /*
         * Once the ID reaches RDMA_CM_LISTEN it is not allowed to be reusable
         * any more, and has to be unique in the bind list.
         */
        if (id_priv->reuseaddr) {
                mutex_lock(&lock);
                /* Re-check the port with reuse disabled for this ID. */
                ret = cma_check_port(id_priv->bind_list, id_priv, 0);
                if (!ret)
                        id_priv->reuseaddr = 0;
                mutex_unlock(&lock);
                if (ret)
                        goto err;
        }

        id_priv->backlog = backlog;
        if (id_priv->cma_dev) {
                /*
                 * Bound to a specific device: use its CM flavour.  The
                 * capability is probed on port 1.
                 */
                if (rdma_cap_ib_cm(id->device, 1)) {
                        ret = cma_ib_listen(id_priv);
                        if (ret)
                                goto err;
                } else if (rdma_cap_iw_cm(id->device, 1)) {
                        ret = cma_iw_listen(id_priv, backlog);
                        if (ret)
                                goto err;
                } else {
                        ret = -ENOSYS;
                        goto err;
                }
        } else {
                /* No device attached to the ID: listen on all of them. */
                ret = cma_listen_on_all(id_priv);
                if (ret)
                        goto err;
        }

        return 0;
err:
        id_priv->backlog = 0;
        /*
         * All the failure paths that lead here will not allow the req_handler's
         * to have run.
         */
        cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
        return ret;
}
EXPORT_SYMBOL(rdma_listen);

/*
 * Bind id_priv to the source address "addr" and record "daddr" as its
 * destination address, moving the ID from RDMA_CM_IDLE to RDMA_CM_ADDR_BOUND.
 *
 * Returns 0 on success.  Returns -EAFNOSUPPORT for families other than
 * AF_INET, AF_INET6 and AF_IB, -EINVAL when the ID is not in RDMA_CM_IDLE, or
 * the error of whichever step failed.  On any failure after the state change
 * the state is moved back to RDMA_CM_IDLE.
 */
static int rdma_bind_addr_dst(struct rdma_id_private *id_priv,
                              struct sockaddr *addr, const struct sockaddr *daddr)
{
        struct sockaddr *id_daddr;
        int ret;

        if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
            addr->sa_family != AF_IB)
                return -EAFNOSUPPORT;

        if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
                return -EINVAL;

        ret = cma_check_linklocal(&id_priv->id.route.addr.dev_addr, addr);
        if (ret)
                goto err1;

        memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
        /* A concrete (non-wildcard) address also selects the device. */
        if (!cma_any_addr(addr)) {
                ret = cma_translate_addr(addr, &id_priv->id.route.addr.dev_addr);
                if (ret)
                        goto err1;

                ret = cma_acquire_dev_by_src_ip(id_priv);
                if (ret)
                        goto err1;
        }

        /*
         * Unless the CMA_OPTION_AFONLY bit is set in options, derive afonly
         * from the family: always set for IPv4, and taken from the
         * namespace's bindv6only sysctl for IPv6.
         */
        if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
                if (addr->sa_family == AF_INET)
                        id_priv->afonly = 1;
#if IS_ENABLED(CONFIG_IPV6)
                else if (addr->sa_family == AF_INET6) {
                        struct net *net = id_priv->id.route.addr.dev_addr.net;

                        id_priv->afonly = net->ipv6.sysctl.bindv6only;
                }
#endif
        }
        /*
         * Store the destination unless the caller passed the ID's own
         * destination slot.  Note the copy length is the size of the source
         * address, and the destination family is then forced to match it.
         */
        id_daddr = cma_dst_addr(id_priv);
        if (daddr != id_daddr)
                memcpy(id_daddr, daddr, rdma_addr_size(addr));
        id_daddr->sa_family = addr->sa_family;

        ret = cma_get_port(id_priv);
        if (ret)
                goto err2;

        if (!cma_any_addr(addr))
                rdma_restrack_add(&id_priv->res);
        return 0;
err2:
        /* Only a non-wildcard bind acquired a device above. */
        if (id_priv->cma_dev)
                cma_release_dev(id_priv);
err1:
        cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
        return ret;
}

static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
                         const struct sockaddr *dst_addr)
{
        struct rdma_id_private *id_priv =
                container_of(id, struct rdma_id_private, id);
        struct sockaddr_storage zero_sock = {};

        if (src_addr && src_addr->sa_family)
                return rdma_bind_addr_dst(id_priv, src_addr, dst_addr);

        /*
         * When the src_addr is not specified, automatically supply an any addr
         */
        zero_sock.ss_family = dst_addr->sa_family;
        if (IS_ENABLED(CONFIG_IPV6) && dst_addr->sa_family == AF_INET6) {
                struct sockaddr_in6 *src_addr6 =
                        (struct sockaddr_in6 *)&zero_sock;
                struct sockaddr_in6 *dst_addr6 =
                        (struct sockaddr_in6 *)dst_addr;

                src_addr6->sin6_scope_id = dst_addr6->sin6_scope_id;
                if (ipv6_addr_type(&dst_addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)
                        id->route.addr.dev_addr.bound_dev_if =
                                dst_addr6->sin6_scope_id;
        } else if (dst_addr->sa_family == AF_IB) {
                ((struct sockaddr_ib *)&zero_sock)->sib_pkey =
                        ((struct sockaddr_ib *)dst_addr)->sib_pkey;
        }
        return rdma_bind_addr_dst(id_priv, (struct sockaddr *)&zero_sock, dst_addr);
}

/*
 * Make sure the ID has a bound source address and leave it in
 * RDMA_CM_ADDR_QUERY, ready for address resolution.
 *
 * The current state is what tells us which calls the ULP made earlier: an ID
 * that is already bound is not bound again (src_addr is ignored and only the
 * destination is recorded), anything else is bound here first.
 *
 * Returns -EINVAL, with the state put back to RDMA_CM_ADDR_BOUND, when the
 * bound family does not match the destination family.
 */
static int resolve_prepare_src(struct rdma_id_private *id_priv,
                               struct sockaddr *src_addr,
                               const struct sockaddr *dst_addr)
{
        int ret;

        if (cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) {
                /* Already bound: just remember where we are going. */
                memcpy(cma_dst_addr(id_priv), dst_addr,
                       rdma_addr_size(dst_addr));
        } else {
                /* For a well behaved ULP state will be RDMA_CM_IDLE */
                ret = cma_bind_addr(&id_priv->id, src_addr, dst_addr);
                if (ret)
                        return ret;

                if (WARN_ON(!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND,
                                           RDMA_CM_ADDR_QUERY)))
                        return -EINVAL;
        }

        if (cma_family(id_priv) == dst_addr->sa_family)
                return 0;

        cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
        return -EINVAL;
}

/**
 * rdma_resolve_addr - Start resolving the destination address of an ID.
 * @id: Identifier to resolve.
 * @src_addr: Optional source address; ignored when the ID is already bound.
 * @dst_addr: Destination address to resolve.
 * @timeout_ms: Passed through to rdma_resolve_ip().
 *
 * Wildcard destinations are handled as loopback, AF_IB destinations by
 * cma_resolve_ib_addr(), and everything else by rdma_resolve_ip() with
 * addr_handler() as the callback.
 *
 * Return: 0 on success or a negative errno.  If starting the resolution
 * fails, the state is moved back from RDMA_CM_ADDR_QUERY to
 * RDMA_CM_ADDR_BOUND.
 */
int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
                      const struct sockaddr *dst_addr, unsigned long timeout_ms)
{
        struct rdma_id_private *id_priv;
        int ret;

        id_priv = container_of(id, struct rdma_id_private, id);

        ret = resolve_prepare_src(id_priv, src_addr, dst_addr);
        if (ret)
                return ret;

        if (cma_any_addr(dst_addr)) {
                ret = cma_resolve_loopback(id_priv);
        } else if (dst_addr->sa_family == AF_IB) {
                ret = cma_resolve_ib_addr(id_priv);
        } else {
                /*
                 * The state machine can fall back to RDMA_CM_ADDR_BOUND after
                 * rdma_resolve_ip() was called, e.g. via the error path of
                 * addr_handler().  The earlier request then has to be
                 * cancelled before a new one is issued.  Cancelling is
                 * somewhat slow and this path is rare, so we only remember
                 * that a request was ever issued.  The flag never clears,
                 * because this is the only cancel and it sits right in front
                 * of rdma_resolve_ip().
                 */
                if (!id_priv->used_resolve_ip)
                        id_priv->used_resolve_ip = 1;
                else
                        rdma_addr_cancel(&id->route.addr.dev_addr);

                ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr,
                                      &id->route.addr.dev_addr,
                                      timeout_ms, addr_handler,
                                      false, id_priv);
        }

        if (!ret)
                return 0;

        cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
        return ret;
}
EXPORT_SYMBOL(rdma_resolve_addr);

/**
 * rdma_bind_addr - Bind an rdma_cm_id to a local source address.
 * @id: Identifier to bind.
 * @addr: Local address to bind to.
 *
 * Thin wrapper around rdma_bind_addr_dst() that passes the ID's own
 * destination address slot as the destination.
 *
 * Return: whatever rdma_bind_addr_dst() returns.
 */
int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
{
        struct rdma_id_private *id_priv;

        id_priv = container_of(id, struct rdma_id_private, id);
        return rdma_bind_addr_dst(id_priv, addr, cma_dst_addr(id_priv));
}
EXPORT_SYMBOL(rdma_bind_addr);

/*
 * Fill the CMA header at "hdr" from the ID's source and destination
 * addresses.  The version is always written; IP version, addresses and source
 * port are written only for AF_INET and AF_INET6 IDs.  Always returns 0.
 */
static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
{
        struct cma_hdr *out = hdr;

        out->cma_version = CMA_VERSION;

        switch (cma_family(id_priv)) {
        case AF_INET: {
                struct sockaddr_in *s4 =
                        (struct sockaddr_in *)cma_src_addr(id_priv);
                struct sockaddr_in *d4 =
                        (struct sockaddr_in *)cma_dst_addr(id_priv);

                cma_set_ip_ver(out, 4);
                out->src_addr.ip4.addr = s4->sin_addr.s_addr;
                out->dst_addr.ip4.addr = d4->sin_addr.s_addr;
                out->port = s4->sin_port;
                break;
        }
        case AF_INET6: {
                struct sockaddr_in6 *s6 =
                        (struct sockaddr_in6 *)cma_src_addr(id_priv);
                struct sockaddr_in6 *d6 =
                        (struct sockaddr_in6 *)cma_dst_addr(id_priv);

                cma_set_ip_ver(out, 6);
                out->src_addr.ip6 = s6->sin6_addr;
                out->dst_addr.ip6 = d6->sin6_addr;
                out->port = s6->sin6_port;
                break;
        }
        default:
                break;
        }
        return 0;
}

/*
 * IB CM callback for the cm_id created by cma_resolve_ib_udp().
 *
 * Translates SIDR request errors and SIDR replies into RDMA CM events and
 * hands them to cma_cm_event_handler().  Events are ignored unless the ID is
 * in RDMA_CM_CONNECT.  Runs with id_priv->handler_mutex taken here.
 *
 * Returns 0 normally.  If the event handler returns non-zero, the ID is
 * destroyed and that value is returned.
 */
static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
                                const struct ib_cm_event *ib_event)
{
        struct rdma_id_private *id_priv = cm_id->context;
        struct rdma_cm_event event = {};
        const struct ib_cm_sidr_rep_event_param *rep =
                                &ib_event->param.sidr_rep_rcvd;
        int ret;

        mutex_lock(&id_priv->handler_mutex);
        if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT)
                goto out;

        switch (ib_event->event) {
        case IB_CM_SIDR_REQ_ERROR:
                event.event = RDMA_CM_EVENT_UNREACHABLE;
                event.status = -ETIMEDOUT;
                break;
        case IB_CM_SIDR_REP_RECEIVED:
                /* Private data is exposed even when the reply is a failure. */
                event.param.ud.private_data = ib_event->private_data;
                event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
                if (rep->status != IB_SIDR_SUCCESS) {
                        event.event = RDMA_CM_EVENT_UNREACHABLE;
                        event.status = ib_event->param.sidr_rep_rcvd.status;
                        pr_debug_ratelimited("RDMA CM: UNREACHABLE: bad SIDR reply. status %d\n",
                                             event.status);
                        break;
                }
                ret = cma_set_qkey(id_priv, rep->qkey);
                if (ret) {
                        pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to set qkey. status %d\n", ret);
                        event.event = RDMA_CM_EVENT_ADDR_ERROR;
                        event.status = ret;
                        break;
                }
                /* Successful reply: build the address handle attributes. */
                ib_init_ah_attr_from_path(id_priv->id.device,
                                          id_priv->id.port_num,
                                          id_priv->id.route.path_rec,
                                          &event.param.ud.ah_attr,
                                          rep->sgid_attr);
                event.param.ud.qp_num = rep->qpn;
                event.param.ud.qkey = rep->qkey;
                event.event = RDMA_CM_EVENT_ESTABLISHED;
                event.status = 0;
                break;
        default:
                pr_err("RDMA CMA: unexpected IB CM event: %d\n",
                       ib_event->event);
                goto out;
        }

        ret = cma_cm_event_handler(id_priv, &event);

        /*
         * Called for every delivered event; event was zero-initialised, so
         * ah_attr is only populated on the ESTABLISHED path.
         */
        rdma_destroy_ah_attr(&event.param.ud.ah_attr);
        if (ret) {
                /* Destroy the CM ID by returning a non-zero value. */
                id_priv->cm_id.ib = NULL;
                /* NOTE(review): presumably drops handler_mutex - confirm. */
                destroy_id_handler_unlock(id_priv);
                return ret;
        }
out:
        mutex_unlock(&id_priv->handler_mutex);
        return 0;
}

/*
 * Send a SIDR request for an unconnected (UD) ID on an IB CM device.
 *
 * The request's private data is the CMA header area (cma_user_data_offset()
 * bytes) followed by the caller's private data.  A new ib_cm_id with
 * cma_sidr_rep_handler() as callback is created and stored in
 * id_priv->cm_id.ib; it is destroyed again if sending fails.
 *
 * Returns 0 on success, -EINVAL if the combined private data length
 * overflows, -ENOMEM on allocation failure, or the error from creating the
 * cm_id or sending the request.
 */
static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
                              struct rdma_conn_param *conn_param)
{
        struct ib_cm_sidr_req_param req;
        struct ib_cm_id        *id;
        void *private_data;
        u8 offset;
        int ret;

        memset(&req, 0, sizeof req);
        offset = cma_user_data_offset(id_priv);
        /* Total length = header area + user data; reject if it does not fit. */
        if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len))
                return -EINVAL;

        if (req.private_data_len) {
                private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
                if (!private_data)
                        return -ENOMEM;
        } else {
                private_data = NULL;
        }

        /* User data goes after the header area. */
        if (conn_param->private_data && conn_param->private_data_len)
                memcpy(private_data + offset, conn_param->private_data,
                       conn_param->private_data_len);

        if (private_data) {
                ret = cma_format_hdr(private_data, id_priv);
                if (ret)
                        goto out;
                req.private_data = private_data;
        }

        id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
                             id_priv);
        if (IS_ERR(id)) {
                ret = PTR_ERR(id);
                goto out;
        }
        id_priv->cm_id.ib = id;

        req.path = id_priv->id.route.path_rec;
        req.sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
        req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
        req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
        req.max_cm_retries = CMA_MAX_CM_RETRIES;

        trace_cm_send_sidr_req(id_priv);
        ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
        if (ret) {
                ib_destroy_cm_id(id_priv->cm_id.ib);
                id_priv->cm_id.ib = NULL;
        }
out:
        /* The buffer is freed on success and failure alike. */
        kfree(private_data);
        return ret;
}

/*
 * Send a connection request (REQ) for a connected ID on an IB CM device.
 *
 * The request's private data is the CMA header area (cma_user_data_offset()
 * bytes) followed by the caller's private data.  A new ib_cm_id with
 * cma_ib_handler() as callback is created and stored in id_priv->cm_id.ib;
 * it is destroyed again if anything after its creation fails.
 *
 * Returns 0 on success, -EINVAL if the combined private data length
 * overflows, -ENOMEM on allocation failure, or the error from creating the
 * cm_id, formatting the header or sending the request.
 */
static int cma_connect_ib(struct rdma_id_private *id_priv,
                          struct rdma_conn_param *conn_param)
{
        struct ib_cm_req_param req;
        struct rdma_route *route;
        void *private_data;
        struct ib_cm_id        *id;
        u8 offset;
        int ret;

        memset(&req, 0, sizeof req);
        offset = cma_user_data_offset(id_priv);
        /* Total length = header area + user data; reject if it does not fit. */
        if (check_add_overflow(offset, conn_param->private_data_len, &req.private_data_len))
                return -EINVAL;

        if (req.private_data_len) {
                private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
                if (!private_data)
                        return -ENOMEM;
        } else {
                private_data = NULL;
        }

        /* User data goes after the header area. */
        if (conn_param->private_data && conn_param->private_data_len)
                memcpy(private_data + offset, conn_param->private_data,
                       conn_param->private_data_len);

        id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
        if (IS_ERR(id)) {
                ret = PTR_ERR(id);
                goto out;
        }
        id_priv->cm_id.ib = id;

        route = &id_priv->id.route;
        if (private_data) {
                ret = cma_format_hdr(private_data, id_priv);
                if (ret)
                        goto out;
                req.private_data = private_data;
        }

        req.primary_path = &route->path_rec[0];
        req.primary_path_inbound = route->path_rec_inbound;
        req.primary_path_outbound = route->path_rec_outbound;
        if (route->num_pri_alt_paths == 2)
                req.alternate_path = &route->path_rec[1];

        req.ppath_sgid_attr = id_priv->id.route.addr.dev_addr.sgid_attr;
        /* Alternate path SGID attribute currently unsupported */
        req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
        req.qp_num = id_priv->qp_num;
        req.qp_type = id_priv->id.qp_type;
        req.starting_psn = id_priv->seq_num;
        req.responder_resources = conn_param->responder_resources;
        req.initiator_depth = conn_param->initiator_depth;
        req.flow_control = conn_param->flow_control;
        /* Both retry counts are capped at 7. */
        req.retry_count = min_t(u8, 7, conn_param->retry_count);
        req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
        req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
        req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
        req.max_cm_retries = CMA_MAX_CM_RETRIES;
        req.srq = id_priv->srq ? 1 : 0;
        req.ece.vendor_id = id_priv->ece.vendor_id;
        req.ece.attr_mod = id_priv->ece.attr_mod;

        trace_cm_send_req(id_priv);
        ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
out:
        /*
         * Shared exit: tear the cm_id down on failure, but only if it was
         * actually created (id holds an error pointer when creation failed).
         */
        if (ret && !IS_ERR(id)) {
                ib_destroy_cm_id(id);
                id_priv->cm_id.ib = NULL;
        }

        kfree(private_data);
        return ret;
}

/*
 * Start an active connection on an iWARP device.
 *
 * Creates an iw_cm_id with cma_iw_handler() as callback, copies the ID's ToS
 * settings (under qp_mutex) and its source/destination addresses into it,
 * moves the QP to RTR and calls iw_cm_connect().  conn_param may be NULL, in
 * which case zeroed connection parameters with the ID's QP number are used.
 *
 * Returns 0 on success.  On failure after the iw_cm_id was created it is
 * destroyed again and id_priv->cm_id.iw is cleared.
 */
static int cma_connect_iw(struct rdma_id_private *id_priv,
                          struct rdma_conn_param *conn_param)
{
        struct iw_cm_conn_param iw_param;
        struct iw_cm_id *iw_id;
        int ret;

        iw_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
        if (IS_ERR(iw_id))
                return PTR_ERR(iw_id);

        mutex_lock(&id_priv->qp_mutex);
        iw_id->tos = id_priv->tos;
        iw_id->tos_set = id_priv->tos_set;
        mutex_unlock(&id_priv->qp_mutex);

        id_priv->cm_id.iw = iw_id;

        memcpy(&iw_id->local_addr, cma_src_addr(id_priv),
               rdma_addr_size(cma_src_addr(id_priv)));
        memcpy(&iw_id->remote_addr, cma_dst_addr(id_priv),
               rdma_addr_size(cma_dst_addr(id_priv)));

        ret = cma_modify_qp_rtr(id_priv, conn_param);
        if (ret)
                goto destroy;

        if (!conn_param) {
                memset(&iw_param, 0, sizeof iw_param);
                iw_param.qpn = id_priv->qp_num;
        } else {
                iw_param.ord = conn_param->initiator_depth;
                iw_param.ird = conn_param->responder_resources;
                iw_param.private_data = conn_param->private_data;
                iw_param.private_data_len = conn_param->private_data_len;
                /* Prefer the QP attached to the ID over the caller's number. */
                if (id_priv->id.qp)
                        iw_param.qpn = id_priv->qp_num;
                else
                        iw_param.qpn = conn_param->qp_num;
        }

        ret = iw_cm_connect(iw_id, &iw_param);
        if (!ret)
                return 0;

destroy:
        iw_destroy_cm_id(iw_id);
        id_priv->cm_id.iw = NULL;
        return ret;
}

/**
 * rdma_connect_locked - Initiate an active connection request.
 * @id: Connection identifier to connect.
 * @conn_param: Connection information used for connected QPs.  Required on
 *   IB CM devices.  On iWARP devices it may be NULL, in which case the QP
 *   number already stored in the ID is used.
 *
 * Same as rdma_connect() but can only be called from the
 * RDMA_CM_EVENT_ROUTE_RESOLVED handler callback.
 *
 * Return: 0 on success.  -EINVAL if the ID is not in RDMA_CM_ROUTE_RESOLVED
 * or if @conn_param is NULL on an IB CM device, -ENOSYS if the device has
 * neither an IB nor an iWARP CM, or the error from the transport specific
 * connect.  On failure after the state change the ID is moved back to
 * RDMA_CM_ROUTE_RESOLVED.
 */
int rdma_connect_locked(struct rdma_cm_id *id,
                        struct rdma_conn_param *conn_param)
{
        struct rdma_id_private *id_priv =
                container_of(id, struct rdma_id_private, id);
        int ret;

        if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
                return -EINVAL;

        /*
         * Without a QP attached to the ID the caller supplies the QP number.
         * conn_param may legitimately be NULL on the iWARP path, so do not
         * dereference it blindly (rdma_accept() applies the same guard).
         */
        if (!id->qp && conn_param) {
                id_priv->qp_num = conn_param->qp_num;
                id_priv->srq = conn_param->srq;
        }

        if (rdma_cap_ib_cm(id->device, id->port_num)) {
                /* Both IB helpers dereference conn_param unconditionally. */
                if (!conn_param)
                        ret = -EINVAL;
                else if (id->qp_type == IB_QPT_UD)
                        ret = cma_resolve_ib_udp(id_priv, conn_param);
                else
                        ret = cma_connect_ib(id_priv, conn_param);
        } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
                ret = cma_connect_iw(id_priv, conn_param);
        } else {
                ret = -ENOSYS;
        }
        if (ret)
                goto err_state;
        return 0;
err_state:
        cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
        return ret;
}
EXPORT_SYMBOL(rdma_connect_locked);

/**
 * rdma_connect - Initiate an active connection request.
 * @id: Connection identifier to connect.
 * @conn_param: Connection information used for connected QPs.
 *
 * A route must already have been resolved for the rdma_cm_id, i.e.
 * rdma_resolve_route must have been called before this routine.
 *
 * Depending on the rdma_cm_id's port space this either connects to a remote
 * QP or, for unconnected rdma_cm_id's, obtains the remote QP information.
 *
 * This is rdma_connect_locked() run with the ID's handler_mutex held.
 */
int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
        struct rdma_id_private *id_priv;
        int rc;

        id_priv = container_of(id, struct rdma_id_private, id);

        mutex_lock(&id_priv->handler_mutex);
        rc = rdma_connect_locked(id, conn_param);
        mutex_unlock(&id_priv->handler_mutex);

        return rc;
}
EXPORT_SYMBOL(rdma_connect);

/**
 * rdma_connect_ece - Initiate an active connection request with ECE data.
 * @id: Connection identifier to connect.
 * @conn_param: Connection information used for connected QPs.
 * @ece: ECE parameters
 *
 * Stores the ECE vendor id and attribute modifier in the ID and then behaves
 * like rdma_connect(); see there for details.
 */
int rdma_connect_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
                     struct rdma_ucm_ece *ece)
{
        struct rdma_id_private *id_priv;

        id_priv = container_of(id, struct rdma_id_private, id);
        id_priv->ece.vendor_id = ece->vendor_id;
        id_priv->ece.attr_mod = ece->attr_mod;

        return rdma_connect(id, conn_param);
}
EXPORT_SYMBOL(rdma_connect_ece);

/*
 * Accept a connection request on an IB CM device: move the QP to RTR and then
 * RTS, and send a REP built from conn_param and the ID's QP number, starting
 * PSN, SRQ flag and ECE data.  Returns the first error encountered, or the
 * result of ib_send_cm_rep().
 */
static int cma_accept_ib(struct rdma_id_private *id_priv,
                         struct rdma_conn_param *conn_param)
{
        struct ib_cm_rep_param rep;
        int err;

        err = cma_modify_qp_rtr(id_priv, conn_param);
        if (err)
                return err;

        err = cma_modify_qp_rts(id_priv, conn_param);
        if (err)
                return err;

        memset(&rep, 0, sizeof(rep));

        /* Taken from the ID itself. */
        rep.qp_num = id_priv->qp_num;
        rep.starting_psn = id_priv->seq_num;
        rep.srq = !!id_priv->srq;
        rep.ece.vendor_id = id_priv->ece.vendor_id;
        rep.ece.attr_mod = id_priv->ece.attr_mod;

        /* Taken from the caller's connection parameters. */
        rep.private_data = conn_param->private_data;
        rep.private_data_len = conn_param->private_data_len;
        rep.responder_resources = conn_param->responder_resources;
        rep.initiator_depth = conn_param->initiator_depth;
        rep.flow_control = conn_param->flow_control;
        rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);

        rep.failover_accepted = 0;

        trace_cm_send_rep(id_priv);
        return ib_send_cm_rep(id_priv->cm_id.ib, &rep);
}

/*
 * Accept a connection request on an iWARP device.  conn_param is mandatory
 * (-EINVAL otherwise).  Moves the QP to RTR and calls iw_cm_accept() with
 * parameters taken from conn_param; the QP number comes from the ID when a QP
 * is attached to it, and from conn_param otherwise.
 */
static int cma_accept_iw(struct rdma_id_private *id_priv,
                  struct rdma_conn_param *conn_param)
{
        struct iw_cm_conn_param iw_param;
        int err;

        if (!conn_param)
                return -EINVAL;

        err = cma_modify_qp_rtr(id_priv, conn_param);
        if (err)
                return err;

        iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
        iw_param.ord = conn_param->initiator_depth;
        iw_param.ird = conn_param->responder_resources;
        iw_param.private_data = conn_param->private_data;
        iw_param.private_data_len = conn_param->private_data_len;

        return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
}

/*
 * Send a SIDR reply with the given status and private data.
 *
 * For IB_SIDR_SUCCESS the ID's qkey is set first (to "qkey" when non-zero,
 * otherwise to the default) and the reply also carries the ID's QP number,
 * qkey and ECE data.  Returns the qkey setup error, if any, or the result of
 * ib_send_cm_sidr_rep().
 */
static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
                             enum ib_cm_sidr_status status, u32 qkey,
                             const void *private_data, int private_data_len)
{
        struct ib_cm_sidr_rep_param rep;
        int err;

        memset(&rep, 0, sizeof(rep));
        rep.status = status;

        if (status == IB_SIDR_SUCCESS) {
                err = qkey ? cma_set_qkey(id_priv, qkey) :
                             cma_set_default_qkey(id_priv);
                if (err)
                        return err;

                rep.qp_num = id_priv->qp_num;
                rep.qkey = id_priv->qkey;
                rep.ece.vendor_id = id_priv->ece.vendor_id;
                rep.ece.attr_mod = id_priv->ece.attr_mod;
        }

        rep.private_data = private_data;
        rep.private_data_len = private_data_len;

        trace_cm_send_sidr_rep(id_priv);
        return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
}

/**
 * rdma_accept - Called to accept a connection request or response.
 * @id: Connection identifier associated with the request.
 * @conn_param: Information needed to establish the connection.  Must be
 *   supplied when accepting a connection request and must be NULL when
 *   accepting a connection response.
 *
 * Normally only the listening side calls this, to accept an incoming request.
 * The active side has to call it as well when the user performs the QP
 * transitions on their own.
 *
 * If an error occurs, the QP associated with the id is moved to the error
 * state, so that previously posted receive buffers are flushed, and a reject
 * message is sent to the remote side.
 *
 * Intended for kernel ULPs; must be called from under the handler callback
 * (the handler_mutex has to be held).
 *
 * Return: 0 on success, -EINVAL when the ID is not in RDMA_CM_CONNECT,
 * -ENOSYS when the device has neither an IB nor an iWARP CM, or the error of
 * the transport specific accept.
 */
int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
{
        struct rdma_id_private *id_priv;
        int ret;

        id_priv = container_of(id, struct rdma_id_private, id);

        lockdep_assert_held(&id_priv->handler_mutex);

        if (READ_ONCE(id_priv->state) != RDMA_CM_CONNECT)
                return -EINVAL;

        /* No QP on the ID: the caller tells us which QP and SRQ to use. */
        if (conn_param && !id->qp) {
                id_priv->qp_num = conn_param->qp_num;
                id_priv->srq = conn_param->srq;
        }

        if (rdma_cap_ib_cm(id->device, id->port_num)) {
                if (id->qp_type != IB_QPT_UD)
                        ret = conn_param ? cma_accept_ib(id_priv, conn_param) :
                                           cma_rep_recv(id_priv);
                else if (conn_param)
                        ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
                                                conn_param->qkey,
                                                conn_param->private_data,
                                                conn_param->private_data_len);
                else
                        ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
                                                0, NULL, 0);
        } else if (rdma_cap_iw_cm(id->device, id->port_num)) {
                ret = cma_accept_iw(id_priv, conn_param);
        } else {
                ret = -ENOSYS;
        }

        if (!ret)
                return 0;

        /* Failure: flush the QP and tell the peer we are rejecting. */
        cma_modify_qp_err(id_priv);
        rdma_reject(id, NULL, 0, IB_CM_REJ_CONSUMER_DEFINED);
        return ret;
}
EXPORT_SYMBOL(rdma_accept);

/**
 * rdma_accept_ece - Accept a connection request or response with ECE data.
 * @id: Connection identifier associated with the request.
 * @conn_param: Information needed to establish the connection.
 * @ece: ECE parameters
 *
 * Stores the ECE vendor id and attribute modifier in the ID and then behaves
 * like rdma_accept().
 */
int rdma_accept_ece(struct rdma_cm_id *id, struct rdma_conn_param *conn_param,
                    struct rdma_ucm_ece *ece)
{
        struct rdma_id_private *id_priv;

        id_priv = container_of(id, struct rdma_id_private, id);
        id_priv->ece.vendor_id = ece->vendor_id;
        id_priv->ece.attr_mod = ece->attr_mod;

        return rdma_accept(id, conn_param);
}
EXPORT_SYMBOL(rdma_accept_ece);

/**
 * rdma_lock_handler - Acquire the handler mutex of a communication identifier.
 * @id: Identifier embedded in the rdma_id_private whose handler_mutex is taken.
 *
 * Release the mutex again with rdma_unlock_handler().
 */
void rdma_lock_handler(struct rdma_cm_id *id)
{
        struct rdma_id_private *priv;

        priv = container_of(id, struct rdma_id_private, id);
        mutex_lock(&priv->handler_mutex);
}
EXPORT_SYMBOL(rdma_lock_handler);

/**
 * rdma_unlock_handler - Release the handler mutex of a communication identifier.
 * @id: Identifier embedded in the rdma_id_private whose handler_mutex is
 *      released.
 *
 * Counterpart of rdma_lock_handler().
 */
void rdma_unlock_handler(struct rdma_cm_id *id)
{
        struct rdma_id_private *priv;

        priv = container_of(id, struct rdma_id_private, id);
        mutex_unlock(&priv->handler_mutex);
}
EXPORT_SYMBOL(rdma_unlock_handler);

/**
 * rdma_notify - Forward an asynchronous QP event to the IB CM.
 * @id: Connection identifier the event relates to.
 * @event: Event to pass on to ib_cm_notify().
 *
 * Return: -EINVAL when no CM id is attached, 0 for devices whose node type
 * is not RDMA_NODE_IB_CA, otherwise the result of ib_cm_notify().
 */
int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
{
        struct rdma_id_private *priv =
                container_of(id, struct rdma_id_private, id);

        if (!priv->cm_id.ib)
                return -EINVAL;

        /* Only IB channel adapters are notified; every other type is a no-op. */
        if (id->device->node_type != RDMA_NODE_IB_CA)
                return 0;

        return ib_cm_notify(priv->cm_id.ib, event);
}
EXPORT_SYMBOL(rdma_notify);

/**
 * rdma_reject - Reject a connection or datagram service lookup request.
 * @id: Connection identifier associated with the request.
 * @private_data: Optional private data sent along with the rejection.
 * @private_data_len: Length of @private_data in bytes.
 * @reason: Reject reason; only used on the IB CM connected (non-UD) path.
 *
 * Return: -EINVAL when no CM id is attached, -ENOSYS when the port has
 * neither IB CM nor iWARP CM capability, otherwise the result of the
 * transport specific reject call.
 */
int rdma_reject(struct rdma_cm_id *id, const void *private_data,
                u8 private_data_len, u8 reason)
{
        struct rdma_id_private *priv =
                container_of(id, struct rdma_id_private, id);

        if (!priv->cm_id.ib)
                return -EINVAL;

        if (rdma_cap_ib_cm(id->device, id->port_num)) {
                /* UD ids answer with a SIDR reply instead of a REJ. */
                if (id->qp_type == IB_QPT_UD)
                        return cma_send_sidr_rep(priv, IB_SIDR_REJECT, 0,
                                                 private_data,
                                                 private_data_len);

                trace_cm_send_rej(priv);
                return ib_send_cm_rej(priv->cm_id.ib, reason, NULL, 0,
                                      private_data, private_data_len);
        }

        if (rdma_cap_iw_cm(id->device, id->port_num))
                return iw_cm_reject(priv->cm_id.iw, private_data,
                                    private_data_len);

        return -ENOSYS;
}
EXPORT_SYMBOL(rdma_reject);

/**
 * rdma_disconnect - Disconnect the connection associated with @id.
 * @id: Connection identifier to disconnect.
 *
 * On IB CM ports the QP is first moved to the error state, then a DREQ is
 * attempted and, if that fails, a DREP. The outcome of those two sends is
 * only traced; it is not reflected in the return value.
 *
 * Return: -EINVAL when no CM id is attached or the port has neither IB CM
 * nor iWARP CM capability; the error from cma_modify_qp_err() on IB; the
 * result of iw_cm_disconnect() on iWARP; 0 otherwise.
 */
int rdma_disconnect(struct rdma_cm_id *id)
{
        struct rdma_id_private *priv =
                container_of(id, struct rdma_id_private, id);
        int err;

        if (!priv->cm_id.ib)
                return -EINVAL;

        if (!rdma_cap_ib_cm(id->device, id->port_num)) {
                if (rdma_cap_iw_cm(id->device, id->port_num))
                        return iw_cm_disconnect(priv->cm_id.iw, 0);
                return -EINVAL;
        }

        err = cma_modify_qp_err(priv);
        if (err)
                return err;

        /* Initiate or respond to a disconnect. */
        trace_cm_disconnect(priv);
        if (!ib_send_cm_dreq(priv->cm_id.ib, NULL, 0)) {
                trace_cm_sent_dreq(priv);
        } else if (!ib_send_cm_drep(priv->cm_id.ib, NULL, 0)) {
                trace_cm_sent_drep(priv);
        }

        return 0;
}
EXPORT_SYMBOL(rdma_disconnect);

static void cma_make_mc_event(int status, struct rdma_id_private *id_priv,
                              struct ib_sa_multicast *multicast,
                              struct rdma_cm_event *event,
                              struct cma_multicast *mc)
{
        struct rdma_dev_addr *dev_addr;
        enum ib_gid_type gid_type;
        struct net_device *ndev;

        if (status)
                pr_debug_ratelimited("RDMA CM: MULTICAST_ERROR: failed to join multicast. status %d\n",
                                     status);

        event->status = status;
        event->param.ud.private_data = mc->context;
        if (status) {
                event->event = RDMA_CM_EVENT_MULTICAST_ERROR;
                return;
        }

        dev_addr = &id_priv->id.route.addr.dev_addr;
        ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
        gid_type =
                id_priv->cma_dev
                        ->default_gid_type[id_priv->id.port_num -
                                           rdma_start_port(
                                                   id_priv->cma_dev->device)];

        event->event = RDMA_CM_EVENT_MULTICAST_JOIN;
        if (ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num,
                                     &multicast->rec, ndev, gid_type,
                                     &event->param.ud.ah_attr)) {
                event->event = RDMA_CM_EVENT_MULTICAST_ERROR;
                goto out;
        }

        event->param.ud.qp_num = 0xFFFFFF;
        event->param.ud.qkey = id_priv->qkey;

out:
        dev_put(ndev);
}

/*
 * Callback handed to ib_sa_join_multicast(); @multicast->context is the
 * struct cma_multicast describing the join.
 *
 * Under the id's handler mutex, and unless the id is being destroyed or its
 * device removed, record the group's qkey, build the multicast event and
 * deliver it to the id's event handler. A non-zero result from either step
 * only triggers a WARN; the function itself always returns 0.
 */
static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
{
        struct cma_multicast *mc = multicast->context;
        struct rdma_id_private *id_priv = mc->id_priv;
        struct rdma_cm_event event = {};
        int ret;

        mutex_lock(&id_priv->handler_mutex);
        if (READ_ONCE(id_priv->state) != RDMA_CM_DEVICE_REMOVAL &&
            READ_ONCE(id_priv->state) != RDMA_CM_DESTROYING) {
                ret = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
                if (!ret) {
                        cma_make_mc_event(status, id_priv, multicast, &event,
                                          mc);
                        ret = cma_cm_event_handler(id_priv, &event);
                }
                rdma_destroy_ah_attr(&event.param.ud.ah_attr);
                WARN_ON(ret);
        }
        mutex_unlock(&id_priv->handler_mutex);

        return 0;
}

/*
 * Derive the multicast GID for @addr and store it in @mgid.
 *
 * - a wildcard address yields an all-zero MGID;
 * - an AF_IB address, or an AF_INET6 address whose first 32 bits match
 *   0xFF10A01B under the mask 0xFFF0FFFF, is copied verbatim;
 * - any other AF_INET6 address, and every remaining family (treated as
 *   IPv4), is mapped through ipv6_ib_mc_map() / ip_ib_mc_map() using the
 *   device broadcast address; byte 7 of the mapping is set to 0x01 for
 *   RDMA_PS_UDP, and the MGID is taken from offset 4 of the mapping.
 */
static void cma_set_mgid(struct rdma_id_private *id_priv,
                         struct sockaddr *addr, union ib_gid *mgid)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
        struct sockaddr_in *sin = (struct sockaddr_in *)addr;
        unsigned char mc_map[MAX_ADDR_LEN];

        if (cma_any_addr(addr)) {
                memset(mgid, 0, sizeof(*mgid));
                return;
        }

        switch (addr->sa_family) {
        case AF_IB:
                memcpy(mgid, &((struct sockaddr_ib *)addr)->sib_addr,
                       sizeof(*mgid));
                return;
        case AF_INET6:
                if ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
                    0xFF10A01B) {
                        /* IPv6 address is an SA assigned MGID. */
                        memcpy(mgid, &sin6->sin6_addr, sizeof(*mgid));
                        return;
                }
                ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
                break;
        default:
                ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
                break;
        }

        if (id_priv->id.ps == RDMA_PS_UDP)
                mc_map[7] = 0x01;        /* Use RDMA CM signature */
        *mgid = *(union ib_gid *)(mc_map + 4);
}

static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
                                 struct cma_multicast *mc)
{
        struct ib_sa_mcmember_rec rec;
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        ib_sa_comp_mask comp_mask;
        int ret;

        ib_addr_get_mgid(dev_addr, &rec.mgid);
        ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
                                     &rec.mgid, &rec);
        if (ret)
                return ret;

        if (!id_priv->qkey) {
                ret = cma_set_default_qkey(id_priv);
                if (ret)
                        return ret;
        }

        cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
        rec.qkey = cpu_to_be32(id_priv->qkey);
        rdma_addr_get_sgid(dev_addr, &rec.port_gid);
        rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
        rec.join_state = mc->join_state;

        comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
                    IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
                    IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
                    IB_SA_MCMEMBER_REC_FLOW_LABEL |
                    IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;

        if (id_priv->id.ps == RDMA_PS_IPOIB)
                comp_mask |= IB_SA_MCMEMBER_REC_RATE |
                             IB_SA_MCMEMBER_REC_RATE_SELECTOR |
                             IB_SA_MCMEMBER_REC_MTU_SELECTOR |
                             IB_SA_MCMEMBER_REC_MTU |
                             IB_SA_MCMEMBER_REC_HOP_LIMIT;

        mc->sa_mc = ib_sa_join_multicast(&sa_client, id_priv->id.device,
                                         id_priv->id.port_num, &rec, comp_mask,
                                         GFP_KERNEL, cma_ib_mc_handler, mc);
        return PTR_ERR_OR_ZERO(mc->sa_mc);
}

/*
 * Build the multicast GID used on an IBoE port for @addr.
 *
 * A wildcard address gives an all-zero MGID and an AF_INET6 address is
 * copied as is. Any other family is treated as IPv4: the four address bytes
 * land in raw[12..15], raw[10..11] are 0xff and raw[2..9] are zero. The
 * leading two bytes are 0x00 0x00 for IB_GID_TYPE_ROCE_UDP_ENCAP and
 * 0xff 0x0e for every other GID type.
 */
static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid,
                              enum ib_gid_type gid_type)
{
        struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
        struct sockaddr_in *sin = (struct sockaddr_in *)addr;

        if (cma_any_addr(addr)) {
                memset(mgid, 0, sizeof(*mgid));
                return;
        }

        if (addr->sa_family == AF_INET6) {
                memcpy(mgid, &sin6->sin6_addr, sizeof(*mgid));
                return;
        }

        /* Bytes 0..9 start out as zero; the prefix is patched in below. */
        memset(mgid->raw, 0, 10);
        if (gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) {
                mgid->raw[0] = 0xff;
                mgid->raw[1] = 0x0e;
        }
        mgid->raw[10] = 0xff;
        mgid->raw[11] = 0xff;
        memcpy(&mgid->raw[12], &sin->sin_addr.s_addr,
               sizeof(sin->sin_addr.s_addr));
}

/*
 * Join a multicast group on an IBoE port.
 *
 * No SA query is issued here: a member record is assembled locally in a
 * stack ib_sa_multicast, the resulting event is pre-built into
 * mc->iboe_join.event, and its delivery is deferred to
 * cma_iboe_join_work_handler() via cma_wq.
 *
 * Return: 0 once the work is queued; -EINVAL for a zero address or when
 * iboe_get_mtu() yields 0; -ENODEV when no bound net device can be found;
 * -ENOTSUPP for a non-AF_INET address with IB_GID_TYPE_ROCE_UDP_ENCAP;
 * otherwise the error from cma_igmp_send().
 */
static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
                                   struct cma_multicast *mc)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        int err = 0;
        struct sockaddr *addr = (struct sockaddr *)&mc->addr;
        struct net_device *ndev = NULL;
        struct ib_sa_multicast ib = {};
        enum ib_gid_type gid_type;
        bool send_only;

        /* True only when the send-only full-member bit is the sole bit set. */
        send_only = mc->join_state == BIT(SENDONLY_FULLMEMBER_JOIN);

        if (cma_zero_addr(addr))
                return -EINVAL;

        /* default_gid_type[] is indexed relative to the device's first port. */
        gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num -
                   rdma_start_port(id_priv->cma_dev->device)];
        cma_iboe_set_mgid(addr, &ib.rec.mgid, gid_type);

        ib.rec.pkey = cpu_to_be16(0xffff);
        /* ndev stays NULL, and the join fails, when the id is not bound. */
        if (dev_addr->bound_dev_if)
                ndev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if);
        if (!ndev)
                return -ENODEV;

        ib.rec.rate = IB_RATE_PORT_CURRENT;
        ib.rec.hop_limit = 1;
        ib.rec.mtu = iboe_get_mtu(ndev->mtu);

        if (addr->sa_family == AF_INET) {
                if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) {
                        ib.rec.hop_limit = IPV6_DEFAULT_HOPLIMIT;
                        /* Send-only joins skip the IGMP step. */
                        if (!send_only) {
                                err = cma_igmp_send(ndev, &ib.rec.mgid,
                                                    true);
                        }
                }
        } else {
                if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP)
                        err = -ENOTSUPP;
        }
        /* ndev is not used past this point; release the reference. */
        dev_put(ndev);
        if (err || !ib.rec.mtu)
                return err ?: -EINVAL;

        /*
         * NOTE(review): the return value of cma_set_default_qkey() is ignored
         * here, whereas cma_join_ib_multicast() propagates it - confirm that
         * this is intentional for IBoE.
         */
        if (!id_priv->qkey)
                cma_set_default_qkey(id_priv);

        rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
                    &ib.rec.port_gid);
        INIT_WORK(&mc->iboe_join.work, cma_iboe_join_work_handler);
        /* Build the event now, while the stack copy of the record is live. */
        cma_make_mc_event(0, id_priv, &ib, &mc->iboe_join.event, mc);
        queue_work(cma_wq, &mc->iboe_join.work);
        return 0;
}

/**
 * rdma_join_multicast - Join the multicast group identified by @addr.
 * @id: Communication identifier; must have a device, be in state
 *      RDMA_CM_ADDR_BOUND or RDMA_CM_ADDR_RESOLVED, be of QP type IB_QPT_UD
 *      and must not own a kernel QP.
 * @addr: Multicast address; rdma_addr_size(@addr) bytes are copied.
 * @join_state: Requested join state, stored with the membership.
 * @context: Caller context, stored with the membership.
 *
 * On success the new membership is linked onto the id's mc_list.
 *
 * Return: 0 on success, -EINVAL for an invalid id, -ENOMEM on allocation
 * failure, -ENOSYS when the port is neither RoCE nor IB multicast capable,
 * otherwise the error from the transport specific join.
 */
int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
                        u8 join_state, void *context)
{
        struct rdma_id_private *id_priv =
                container_of(id, struct rdma_id_private, id);
        struct cma_multicast *mc;
        int ret;

        /* Not supported for kernel QPs */
        if (WARN_ON(id->qp))
                return -EINVAL;

        /* ULP is calling this wrong. */
        if (!id->device)
                return -EINVAL;
        if (READ_ONCE(id_priv->state) != RDMA_CM_ADDR_BOUND &&
            READ_ONCE(id_priv->state) != RDMA_CM_ADDR_RESOLVED)
                return -EINVAL;

        if (id_priv->id.qp_type != IB_QPT_UD)
                return -EINVAL;

        mc = kzalloc(sizeof(*mc), GFP_KERNEL);
        if (!mc)
                return -ENOMEM;

        mc->id_priv = id_priv;
        mc->context = context;
        mc->join_state = join_state;
        memcpy(&mc->addr, addr, rdma_addr_size(addr));

        if (rdma_protocol_roce(id->device, id->port_num))
                ret = cma_iboe_join_multicast(id_priv, mc);
        else if (rdma_cap_ib_mcast(id->device, id->port_num))
                ret = cma_join_ib_multicast(id_priv, mc);
        else
                ret = -ENOSYS;

        if (ret) {
                kfree(mc);
                return ret;
        }

        spin_lock(&id_priv->lock);
        list_add(&mc->list, &id_priv->mc_list);
        spin_unlock(&id_priv->lock);

        return 0;
}
EXPORT_SYMBOL(rdma_join_multicast);

void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
{
        struct rdma_id_private *id_priv;
        struct cma_multicast *mc;

        id_priv = container_of(id, struct rdma_id_private, id);
        spin_lock_irq(&id_priv->lock);
        list_for_each_entry(mc, &id_priv->mc_list, list) {
                if (memcmp(&mc->addr, addr, rdma_addr_size(addr)) != 0)
                        continue;
                list_del(&mc->list);
                spin_unlock_irq(&id_priv->lock);

                WARN_ON(id_priv->cma_dev->device != id->device);
                destroy_mc(id_priv, mc);
                return;
        }
        spin_unlock_irq(&id_priv->lock);
}
EXPORT_SYMBOL(rdma_leave_multicast);

/*
 * Queue an RDMA_CM_EVENT_ADDR_CHANGE for @id_priv if it is bound to @ndev
 * (same ifindex and network namespace) and the hardware address it recorded
 * differs from the one @ndev carries now.
 *
 * A reference on @id_priv is taken before the work is queued.
 *
 * Return: 0 when nothing had to be done or the work was queued, -ENOMEM if
 * the work item could not be allocated.
 */
static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
{
        struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
        struct cma_work *work;

        if (dev_addr->bound_dev_if != ndev->ifindex)
                return 0;
        if (!net_eq(dev_net(ndev), dev_addr->net))
                return 0;
        if (!memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len))
                return 0;

        pr_info("RDMA CM addr change for ndev %s used by id %p\n",
                ndev->name, &id_priv->id);

        work = kzalloc(sizeof(*work), GFP_KERNEL);
        if (!work)
                return -ENOMEM;

        work->id = id_priv;
        work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
        INIT_WORK(&work->work, cma_work_handler);
        cma_id_get(id_priv);
        queue_work(cma_wq, &work->work);

        return 0;
}

static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
                               void *ptr)
{
        struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
        struct cma_device *cma_dev;
        struct rdma_id_private *id_priv;
        int ret = NOTIFY_DONE;

        if (event != NETDEV_BONDING_FAILOVER)
                return NOTIFY_DONE;

        if (!netif_is_bond_master(ndev))
                return NOTIFY_DONE;

        mutex_lock(&lock);
        list_for_each_entry(cma_dev, &dev_list, list)
                list_for_each_entry(id_priv, &cma_dev->id_list, device_item) {
                        ret = cma_netdev_change(ndev, id_priv);
                        if (ret)
                                goto out;
                }

out:
        mutex_unlock(&lock);
        return ret;
}

/*
 * Work handler for the net_work item embedded in a struct rdma_cm_id.
 *
 * Delivers RDMA_CM_EVENT_UNREACHABLE with status -ETIMEDOUT to the id's
 * event handler, unless the id is already being destroyed or its device is
 * being removed. Every path drops one reference on the id; presumably this
 * is the one cma_netevent_callback() takes before queueing the work.
 */
static void cma_netevent_work_handler(struct work_struct *_work)
{
        struct rdma_id_private *id_priv =
                container_of(_work, struct rdma_id_private, id.net_work);
        struct rdma_cm_event event = {};

        mutex_lock(&id_priv->handler_mutex);

        if (READ_ONCE(id_priv->state) == RDMA_CM_DESTROYING ||
            READ_ONCE(id_priv->state) == RDMA_CM_DEVICE_REMOVAL)
                goto out_unlock;

        event.event = RDMA_CM_EVENT_UNREACHABLE;
        event.status = -ETIMEDOUT;

        /* A non-zero return from the handler means the id is destroyed here. */
        if (cma_cm_event_handler(id_priv, &event)) {
                /* Presumably a static-checker lock annotation only - confirm. */
                __acquire(&id_priv->handler_mutex);
                id_priv->cm_id.ib = NULL;
                cma_id_put(id_priv);
                /*
                 * Judging by its name this also releases handler_mutex, which
                 * would explain why there is no mutex_unlock() on this path -
                 * confirm against its definition.
                 */
                destroy_id_handler_unlock(id_priv);
                return;
        }

out_unlock:
        mutex_unlock(&id_priv->handler_mutex);
        cma_id_put(id_priv);
}

/*
 * Netevent notifier callback.
 *
 * On NETEVENT_NEIGH_UPDATE the neighbour's IP address (IPv4 or IPv6) and
 * interface index are looked up in id_table. For every id found whose
 * recorded destination hardware address differs from the neighbour's
 * current one, the id's net_work item is queued on cma_wq with a reference
 * held on the id. Everything runs under id_table_lock with interrupts
 * disabled.
 *
 * Return: always NOTIFY_DONE.
 */
static int cma_netevent_callback(struct notifier_block *self,
                                 unsigned long event, void *ctx)
{
        struct id_table_entry *ips_node = NULL;
        struct rdma_id_private *current_id;
        struct neighbour *neigh = ctx;
        unsigned long flags;

        if (event != NETEVENT_NEIGH_UPDATE)
                return NOTIFY_DONE;

        spin_lock_irqsave(&id_table_lock, flags);
        if (neigh->tbl->family == AF_INET6) {
                struct sockaddr_in6 neigh_sock_6;

                neigh_sock_6.sin6_family = AF_INET6;
                neigh_sock_6.sin6_addr = *(struct in6_addr *)neigh->primary_key;
                ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
                                             (struct sockaddr *)&neigh_sock_6);
        } else if (neigh->tbl->family == AF_INET) {
                struct sockaddr_in neigh_sock_4;

                neigh_sock_4.sin_family = AF_INET;
                neigh_sock_4.sin_addr.s_addr = *(__be32 *)(neigh->primary_key);
                ips_node = node_from_ndev_ip(&id_table, neigh->dev->ifindex,
                                             (struct sockaddr *)&neigh_sock_4);
        } else
                goto out;

        if (!ips_node)
                goto out;

        list_for_each_entry(current_id, &ips_node->id_list, id_list_entry) {
                /* Hardware address unchanged: nothing to report for this id. */
                if (!memcmp(current_id->id.route.addr.dev_addr.dst_dev_addr,
                           neigh->ha, ETH_ALEN))
                        continue;
                /*
                 * The reference belongs to the queued work and is dropped by
                 * cma_netevent_work_handler(). queue_work() returns false
                 * when the item is already pending; no additional run will
                 * consume this reference then, so give it back right away
                 * instead of leaking it.
                 */
                cma_id_get(current_id);
                if (!queue_work(cma_wq, &current_id->id.net_work))
                        cma_id_put(current_id);
        }
out:
        spin_unlock_irqrestore(&id_table_lock, flags);
        return NOTIFY_DONE;
}

/* Netdevice notifier block; events are handled by cma_netdev_callback(). */
static struct notifier_block cma_nb = {
        .notifier_call = cma_netdev_callback,
};

/* Netevent notifier block; events are handled by cma_netevent_callback(). */
static struct notifier_block cma_netevent_cb = {
        .notifier_call = cma_netevent_callback,
};

/*
 * Move @id_priv to RDMA_CM_DEVICE_REMOVAL and deliver
 * RDMA_CM_EVENT_DEVICE_REMOVAL to its event handler.
 *
 * One reference on @id_priv is dropped on every path, so the caller must
 * hold one when calling (cma_process_remove() takes it right before the
 * call). If the id is already being destroyed or removed, nothing else is
 * done.
 */
static void cma_send_device_removal_put(struct rdma_id_private *id_priv)
{
        struct rdma_cm_event event = { .event = RDMA_CM_EVENT_DEVICE_REMOVAL };
        enum rdma_cm_state state;
        unsigned long flags;

        mutex_lock(&id_priv->handler_mutex);
        /* Record that we want to remove the device */
        spin_lock_irqsave(&id_priv->lock, flags);
        /* Keep the previous state; it is passed to the teardown calls below. */
        state = id_priv->state;
        if (state == RDMA_CM_DESTROYING || state == RDMA_CM_DEVICE_REMOVAL) {
                spin_unlock_irqrestore(&id_priv->lock, flags);
                mutex_unlock(&id_priv->handler_mutex);
                cma_id_put(id_priv);
                return;
        }
        id_priv->state = RDMA_CM_DEVICE_REMOVAL;
        spin_unlock_irqrestore(&id_priv->lock, flags);

        /* A non-zero return from the handler means the id is destroyed here. */
        if (cma_cm_event_handler(id_priv, &event)) {
                /*
                 * At this point the ULP promises it won't call
                 * rdma_destroy_id() concurrently
                 */
                cma_id_put(id_priv);
                mutex_unlock(&id_priv->handler_mutex);
                trace_cm_id_destroy(id_priv);
                _destroy_id(id_priv, state);
                return;
        }
        mutex_unlock(&id_priv->handler_mutex);

        /*
         * If this races with destroy then the thread that first assigns state
         * to a destroying does the cancel.
         */
        cma_cancel_operation(id_priv, state);
        cma_id_put(id_priv);
}

/*
 * Detach every id from @cma_dev and notify each of the device removal, then
 * drop a reference on @cma_dev and block until cma_dev->comp completes.
 *
 * The global 'lock' protects the id list but is released around each
 * notification, so the list is re-examined from its head on every iteration.
 */
static void cma_process_remove(struct cma_device *cma_dev)
{
        mutex_lock(&lock);
        while (!list_empty(&cma_dev->id_list)) {
                struct rdma_id_private *id_priv = list_first_entry(
                        &cma_dev->id_list, struct rdma_id_private, device_item);

                /* Unlink from both lists so the loop cannot see this id again. */
                list_del_init(&id_priv->listen_item);
                list_del_init(&id_priv->device_item);
                /* This reference is dropped by cma_send_device_removal_put(). */
                cma_id_get(id_priv);
                mutex_unlock(&lock);

                cma_send_device_removal_put(id_priv);

                mutex_lock(&lock);
        }
        mutex_unlock(&lock);

        /*
         * Presumably comp is completed once the last cma_dev reference is
         * gone - confirm against cma_dev_put().
         */
        cma_dev_put(cma_dev);
        wait_for_completion(&cma_dev->comp);
}

/*
 * Report whether at least one port of @device has IB CM or iWARP CM
 * capability.
 */
static bool cma_supported(struct ib_device *device)
{
        u32 port;

        rdma_for_each_port(device, port) {
                if (!rdma_cap_ib_cm(device, port) &&
                    !rdma_cap_iw_cm(device, port))
                        continue;
                return true;
        }

        return false;
}

/*
 * Set up the cma state for a newly added @device.
 *
 * Allocates a cma_device with per-port default GID type and RoCE ToS tables,
 * stores it as client data for cma_client, links it onto dev_list and makes
 * every id on listen_any_list listen on the new device.
 *
 * Return: 0 on success, -EOPNOTSUPP when no port has CM capability, -ENOMEM
 * on allocation failure, otherwise the error from cma_listen_on_dev().
 */
static int cma_add_one(struct ib_device *device)
{
        struct rdma_id_private *to_destroy;
        struct cma_device *cma_dev;
        struct rdma_id_private *id_priv;
        unsigned long supported_gids = 0;
        int ret;
        u32 i;

        if (!cma_supported(device))
                return -EOPNOTSUPP;

        cma_dev = kmalloc(sizeof(*cma_dev), GFP_KERNEL);
        if (!cma_dev)
                return -ENOMEM;

        cma_dev->device = device;
        cma_dev->default_gid_type = kcalloc(device->phys_port_cnt,
                                            sizeof(*cma_dev->default_gid_type),
                                            GFP_KERNEL);
        if (!cma_dev->default_gid_type) {
                ret = -ENOMEM;
                goto free_cma_dev;
        }

        cma_dev->default_roce_tos = kcalloc(device->phys_port_cnt,
                                            sizeof(*cma_dev->default_roce_tos),
                                            GFP_KERNEL);
        if (!cma_dev->default_roce_tos) {
                ret = -ENOMEM;
                goto free_gid_type;
        }

        /*
         * Per port: use CMA_PREFERRED_ROCE_GID_TYPE when the port supports
         * it, otherwise the lowest supported GID type. Both tables are
         * indexed relative to the device's first port.
         */
        rdma_for_each_port (device, i) {
                supported_gids = roce_gid_type_mask_support(device, i);
                WARN_ON(!supported_gids);
                if (supported_gids & (1 << CMA_PREFERRED_ROCE_GID_TYPE))
                        cma_dev->default_gid_type[i - rdma_start_port(device)] =
                                CMA_PREFERRED_ROCE_GID_TYPE;
                else
                        cma_dev->default_gid_type[i - rdma_start_port(device)] =
                                find_first_bit(&supported_gids, BITS_PER_LONG);
                cma_dev->default_roce_tos[i - rdma_start_port(device)] = 0;
        }

        init_completion(&cma_dev->comp);
        refcount_set(&cma_dev->refcount, 1);
        INIT_LIST_HEAD(&cma_dev->id_list);
        ib_set_client_data(device, &cma_client, cma_dev);

        mutex_lock(&lock);
        list_add_tail(&cma_dev->list, &dev_list);
        list_for_each_entry(id_priv, &listen_any_list, listen_any_item) {
                ret = cma_listen_on_dev(id_priv, cma_dev, &to_destroy);
                if (ret)
                        goto free_listen;
        }
        mutex_unlock(&lock);

        trace_cm_add_one(device);
        return 0;

free_listen:
        /* Still holding 'lock' here: unlink the device before dropping it. */
        list_del(&cma_dev->list);
        mutex_unlock(&lock);

        /* cma_process_remove() will delete to_destroy */
        cma_process_remove(cma_dev);
        kfree(cma_dev->default_roce_tos);
free_gid_type:
        kfree(cma_dev->default_gid_type);

free_cma_dev:
        kfree(cma_dev);
        return ret;
}

/*
 * Tear down the cma state of @device. @client_data is the cma_device that
 * cma_add_one() registered through ib_set_client_data().
 */
static void cma_remove_one(struct ib_device *device, void *client_data)
{
        struct cma_device *cma_dev = client_data;

        trace_cm_remove_one(device);

        /* Take the device off dev_list before its ids are processed. */
        mutex_lock(&lock);
        list_del(&cma_dev->list);
        mutex_unlock(&lock);

        /* Returns only after cma_dev->comp has completed. */
        cma_process_remove(cma_dev);
        kfree(cma_dev->default_roce_tos);
        kfree(cma_dev->default_gid_type);
        kfree(cma_dev);
}

/*
 * Per network namespace init: set up the four port-space xarrays (TCP, UDP,
 * IPoIB, IB). Always returns 0.
 */
static int cma_init_net(struct net *net)
{
        struct cma_pernet *pn;

        pn = cma_pernet(net);
        xa_init(&pn->tcp_ps);
        xa_init(&pn->udp_ps);
        xa_init(&pn->ipoib_ps);
        xa_init(&pn->ib_ps);

        return 0;
}

/*
 * Per network namespace exit: warn for each port-space xarray that still
 * holds entries.
 */
static void cma_exit_net(struct net *net)
{
        struct cma_pernet *pn;

        pn = cma_pernet(net);
        WARN_ON(!xa_empty(&pn->tcp_ps));
        WARN_ON(!xa_empty(&pn->udp_ps));
        WARN_ON(!xa_empty(&pn->ipoib_ps));
        WARN_ON(!xa_empty(&pn->ib_ps));
}

/* Per network namespace hooks and storage size for struct cma_pernet. */
static struct pernet_operations cma_pernet_operations = {
        .id = &cma_pernet_id,
        .size = sizeof(struct cma_pernet),
        .init = cma_init_net,
        .exit = cma_exit_net,
};

/*
 * Module init: create the ordered "rdma_cm" workqueue, then register the
 * pernet operations, the SA client, the netdevice and netevent notifiers,
 * the IB client and finally configfs support.
 *
 * Return: 0 on success; -ENOMEM if the workqueue cannot be allocated;
 * otherwise the error of the failing registration, after everything set up
 * before it has been undone in reverse order.
 */
static int __init cma_init(void)
{
        int ret;

        /*
         * There is a rare lock ordering dependency in cma_netdev_callback()
         * that only happens when bonding is enabled. Teach lockdep that rtnl
         * must never be nested under lock so it can find these without having
         * to test with bonding.
         */
        if (IS_ENABLED(CONFIG_LOCKDEP)) {
                rtnl_lock();
                mutex_lock(&lock);
                mutex_unlock(&lock);
                rtnl_unlock();
        }

        cma_wq = alloc_ordered_workqueue("rdma_cm", WQ_MEM_RECLAIM);
        if (!cma_wq)
                return -ENOMEM;

        ret = register_pernet_subsys(&cma_pernet_operations);
        if (ret)
                goto err_wq;

        /*
         * NOTE(review): the results of the two notifier registrations below
         * are not checked - confirm that this is acceptable.
         */
        ib_sa_register_client(&sa_client);
        register_netdevice_notifier(&cma_nb);
        register_netevent_notifier(&cma_netevent_cb);

        ret = ib_register_client(&cma_client);
        if (ret)
                goto err;

        ret = cma_configfs_init();
        if (ret)
                goto err_ib;

        return 0;

        /* Unwind in reverse order of setup. */
err_ib:
        ib_unregister_client(&cma_client);
err:
        unregister_netevent_notifier(&cma_netevent_cb);
        unregister_netdevice_notifier(&cma_nb);
        ib_sa_unregister_client(&sa_client);
        unregister_pernet_subsys(&cma_pernet_operations);
err_wq:
        destroy_workqueue(cma_wq);
        return ret;
}

/*
 * Module exit: undo everything cma_init() set up, in reverse order, and
 * destroy the workqueue last.
 */
static void __exit cma_cleanup(void)
{
        cma_configfs_exit();
        ib_unregister_client(&cma_client);
        unregister_netevent_notifier(&cma_netevent_cb);
        unregister_netdevice_notifier(&cma_nb);
        ib_sa_unregister_client(&sa_client);
        unregister_pernet_subsys(&cma_pernet_operations);
        destroy_workqueue(cma_wq);
}

/* Module entry and exit points. */
module_init(cma_init);
module_exit(cma_cleanup);











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  265 













  265 




  265 






















  265 











  265 
















  265 

  265 

  265 





  265 



  265 





  265 







  265 








  265 

















































































































































































































































































































































































































































































































  265 























































































































































































































































































































































































































































































































































































































































































































































































  265 





  265 






  265 




  265 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
// SPDX-License-Identifier: GPL-2.0
/*
 * drivers/base/core.c - core driver model code (device registration, etc)
 *
 * Copyright (c) 2002-3 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 * Copyright (c) 2006 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2006 Novell, Inc.
 */

#include <linux/acpi.h>
#include <linux/blkdev.h>
#include <linux/cleanup.h>
#include <linux/cpufreq.h>
#include <linux/device.h>
#include <linux/dma-map-ops.h> /* for dma_default_coherent */
#include <linux/err.h>
#include <linux/fwnode.h>
#include <linux/init.h>
#include <linux/kdev_t.h>
#include <linux/kstrtox.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/of.h>
#include <linux/of_device.h>
#include <linux/pm_runtime.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/string_helpers.h>
#include <linux/swiotlb.h>
#include <linux/sysfs.h>

#include "base.h"
#include "physical_location.h"
#include "power/power.h"

/* Device links support. */
static LIST_HEAD(deferred_sync);
static unsigned int defer_sync_state_count = 1;
/* Serializes creation and deletion of fwnode links (see fwnode_link_add()). */
static DEFINE_MUTEX(fwnode_link_lock);
/* Forward declaration; this static helper is defined elsewhere in this file. */
static bool fw_devlink_is_permissive(void);
/* Forward declaration; this static helper is defined elsewhere in this file. */
static void __fw_devlink_link_to_consumers(struct device *dev);
static bool fw_devlink_drv_reg_done;
static bool fw_devlink_best_effort;
static struct workqueue_struct *device_link_wq;

/**
 * __fwnode_link_add - Record that one fwnode supplies a resource to another.
 * @con: Consumer end of the link.
 * @sup: Supplier end of the link.
 * @flags: Link flags.
 *
 * A fwnode link between @con and @sup captures the fact that the firmware
 * describes @sup as providing a resource to @con.
 *
 * Once the device objects corresponding to @con and @sup have been created,
 * the driver core uses the fwnode link to create a device link between them
 * and then deletes the fwnode link automatically.
 *
 * Links are not reference counted. If @con is already linked to @sup, no
 * second link is created; @flags are merged into the existing link instead.
 *
 * This function takes no lock itself; fwnode_link_add() calls it with
 * fwnode_link_lock held.
 *
 * Return: 0 on success (including when the link already existed), -ENOMEM if
 * a new link could not be allocated.
 */
static int __fwnode_link_add(struct fwnode_handle *con,
                             struct fwnode_handle *sup, u8 flags)
{
        struct fwnode_link *link;

        /* A duplicate request only widens the flags of the existing link. */
        list_for_each_entry(link, &sup->consumers, s_hook) {
                if (link->consumer != con)
                        continue;

                link->flags |= flags;
                return 0;
        }

        link = kzalloc(sizeof(*link), GFP_KERNEL);
        if (!link)
                return -ENOMEM;

        link->consumer = con;
        link->supplier = sup;
        link->flags = flags;
        INIT_LIST_HEAD(&link->c_hook);
        INIT_LIST_HEAD(&link->s_hook);

        /* Hook the link into the supplier's and the consumer's lists. */
        list_add(&link->s_hook, &sup->consumers);
        list_add(&link->c_hook, &con->suppliers);
        pr_debug("%pfwf Linked as a fwnode consumer to %pfwf\n",
                 con, sup);

        return 0;
}

/**
 * fwnode_link_add - Create a link between two fwnode_handles, with locking.
 * @con: Consumer end of the link.
 * @sup: Supplier end of the link.
 * @flags: Link flags.
 *
 * Locked wrapper around __fwnode_link_add(): fwnode_link_lock is held for the
 * duration of the call.
 *
 * Return: the value returned by __fwnode_link_add().
 */
int fwnode_link_add(struct fwnode_handle *con, struct fwnode_handle *sup,
                    u8 flags)
{
        /* Scope-based guard: the mutex is dropped when the function returns. */
        guard(mutex)(&fwnode_link_lock);

        return __fwnode_link_add(con, sup, flags);
}

/**
 * __fwnode_link_del - Unlink and free a fwnode link.
 * @link: the fwnode_link to be deleted
 *
 * Removes @link from both lists it is hooked on and frees it; @link must not
 * be used after this call.
 *
 * The caller must hold fwnode_link_lock.
 */
static void __fwnode_link_del(struct fwnode_link *link)
{
        pr_debug("%pfwf Dropping the fwnode link to %pfwf\n",
                 link->consumer, link->supplier);

        /* c_hook sits on the consumer's list, s_hook on the supplier's. */
        list_del(&link->c_hook);
        list_del(&link->s_hook);

        kfree(link);
}

/**
 * __fwnode_link_cycle - Flag a fwnode link as belonging to a dependency cycle.
 * @link: the fwnode_link to be marked
 *
 * Sets FWLINK_FLAG_CYCLE on @link and emits a debug message naming both ends
 * of the link.
 *
 * The caller must hold fwnode_link_lock.
 */
static void __fwnode_link_cycle(struct fwnode_link *link)
{
        link->flags |= FWLINK_FLAG_CYCLE;

        pr_debug("%pfwf: cycle: depends on %pfwf\n",
                 link->consumer, link->supplier);
}

/**
 * fwnode_links_purge_suppliers - Delete all supplier links of fwnode_handle.
 * @fwnode: fwnode whose supplier links need to be deleted
 *
 * Deletes all supplier links connecting directly to @fwnode.
 */
static void fwnode_links_purge_suppliers(struct fwnode_handle *fwnode)
{
        struct fwnode_link *link, *tmp;

        guard(mutex)(&fwnode_link_lock);

        list_for_each_entry_safe(link, tmp, &fwnode->suppliers, c_hook)
                __fwnode_link_del(link);
}

/**
 * fwnode_links_purge_consumers - Delete all consumer links of fwnode_handle.
 * @fwnode: fwnode whose consumer links need to be deleted
 *
 * Deletes all consumer links connecting directly to @fwnode.
 */
static void fwnode_links_purge_consumers(struct fwnode_handle *fwnode)
{
        struct fwnode_link *link, *tmp;

        guard(mutex)(&fwnode_link_lock);

        list_for_each_entry_safe(link, tmp, &fwnode->consumers, s_hook)
                __fwnode_link_del(link);
}

/**
 * fwnode_links_purge - Delete all links connected to a fwnode_handle.
 * @fwnode: fwnode whose links need to be deleted
 *
 * Deletes all links connecting directly to a fwnode: first the links to its
 * suppliers, then the links to its consumers. Each helper acquires
 * fwnode_link_lock on its own, so the lock is not held across both passes.
 */
void fwnode_links_purge(struct fwnode_handle *fwnode)
{
        fwnode_links_purge_suppliers(fwnode);
        fwnode_links_purge_consumers(fwnode);
}

/**
 * fw_devlink_purge_absent_suppliers - Drop consumer links of device-less fwnodes.
 * @fwnode: root of the fwnode subtree to process
 *
 * If @fwnode has no struct device attached, flag it with
 * FWNODE_FLAG_NOT_DEVICE, delete all of its consumer links and do the same,
 * recursively, for each of its available child nodes. A fwnode that does have
 * a device is left untouched and its children are not visited.
 */
void fw_devlink_purge_absent_suppliers(struct fwnode_handle *fwnode)
{
        struct fwnode_handle *node;

        /* A fwnode that already has a device keeps its consumer links. */
        if (fwnode->dev)
                return;

        fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
        fwnode_links_purge_consumers(fwnode);

        /* Descend into the available children and apply the same rule. */
        fwnode_for_each_available_child_node(fwnode, node)
                fw_devlink_purge_absent_suppliers(node);
}
EXPORT_SYMBOL_GPL(fw_devlink_purge_absent_suppliers);

/**
 * __fwnode_links_move_consumers - Re-home the consumers of one fwnode.
 * @from: move consumers away from this fwnode
 * @to: move consumers to this fwnode
 *
 * For every consumer link of @from, request an equivalent link (same consumer,
 * same flags) with @to as supplier, then delete the original link.
 *
 * The result of __fwnode_link_add() is not checked: if allocating the new link
 * fails, the original link is still deleted.
 *
 * This function takes no lock itself, while __fwnode_link_del() requires
 * fwnode_link_lock, so the caller must hold it.
 */
static void __fwnode_links_move_consumers(struct fwnode_handle *from,
                                          struct fwnode_handle *to)
{
        struct fwnode_link *pos, *next;

        /* _safe iteration: the current entry is freed inside the loop. */
        list_for_each_entry_safe(pos, next, &from->consumers, s_hook) {
                __fwnode_link_add(pos->consumer, to, pos->flags);
                __fwnode_link_del(pos);
        }
}

/**
 * __fw_devlink_pickup_dangling_consumers - Pick up dangling consumers
 * @fwnode: fwnode from which to pick up dangling consumers
 * @new_sup: fwnode of new supplier
 *
 * If the @fwnode has a corresponding struct device and the device supports
 * probing (that is, added to a bus), then we want to let fw_devlink create
 * MANAGED device links to this device, so leave @fwnode and its descendant's
 * fwnode links alone.
 *
 * Otherwise, move its consumers to the new supplier @new_sup.
 */
static void __fw_devlink_pickup_dangling_consumers(struct fwnode_handle *fwnode,
                                                   struct fwnode_handle *new_sup)
{
        struct fwnode_handle *child;

        if (fwnode->dev && fwnode->dev->bus)
                return;

        fwnode->flags |= FWNODE_FLAG_NOT_DEVICE;
        __fwnode_links_move_consumers(fwnode, new_sup);

        fwnode_for_each_available_child_node(fwnode, child)
                __fw_devlink_pickup_dangling_consumers(child, new_sup);
}

/*
 * Device links locking: device_links_lock is the mutex taken by
 * device_links_write_lock(), device_links_srcu is the SRCU domain used by
 * device_links_read_lock() and waited on by device_link_synchronize_removal().
 */
static DEFINE_MUTEX(device_links_lock);
DEFINE_STATIC_SRCU(device_links_srcu);

/* Acquire the device links mutex; pair with device_links_write_unlock(). */
static inline void device_links_write_lock(void)
{
        mutex_lock(&device_links_lock);
}

/* Release the device links mutex taken by device_links_write_lock(). */
static inline void device_links_write_unlock(void)
{
        mutex_unlock(&device_links_lock);
}

/*
 * Enter an SRCU read-side critical section for device links.  The returned
 * index must be handed back to device_links_read_unlock().
 */
int device_links_read_lock(void) __acquires(&device_links_srcu)
{
        return srcu_read_lock(&device_links_srcu);
}

/*
 * Leave the SRCU read-side critical section entered by
 * device_links_read_lock(); @idx is the value that call returned.
 */
void device_links_read_unlock(int idx) __releases(&device_links_srcu)
{
        srcu_read_unlock(&device_links_srcu, idx);
}

/* Report the result of srcu_read_lock_held() for the device links SRCU. */
int device_links_read_lock_held(void)
{
        return srcu_read_lock_held(&device_links_srcu);
}

/*
 * Wait for an SRCU grace period on device_links_srcu, so that readers which
 * entered via device_links_read_lock() before this call have finished.
 */
static void device_link_synchronize_removal(void)
{
        synchronize_srcu(&device_links_srcu);
}

/*
 * Unlink @link from both lists it sits on (via its s_node and c_node hooks)
 * using the RCU-aware list deletion primitive.
 */
static void device_link_remove_from_lists(struct device_link *link)
{
        list_del_rcu(&link->s_node);
        list_del_rcu(&link->c_node);
}

/*
 * Walk up the ->parent chain starting at the parent of @target and report
 * whether @dev is found on it.  @target itself is not compared against @dev.
 */
static bool device_is_ancestor(struct device *dev, struct device *target)
{
        struct device *up;

        for (up = target->parent; up; up = up->parent) {
                if (up == dev)
                        return true;
        }

        return false;
}

/* Flags that are disregarded when testing for a SYNC_STATE_ONLY link. */
#define DL_MARKER_FLAGS                (DL_FLAG_INFERRED | \
                                 DL_FLAG_CYCLE | \
                                 DL_FLAG_MANAGED)

/*
 * Return true if, once the marker flags have been masked out, the only flag
 * left in @flags is DL_FLAG_SYNC_STATE_ONLY.
 */
static inline bool device_link_flag_is_sync_state_only(u32 flags)
{
        u32 significant = flags & ~DL_MARKER_FLAGS;

        return significant == DL_FLAG_SYNC_STATE_ONLY;
}

/**
 * device_is_dependent - Check if one device depends on another one
 * @dev: Device to check dependencies for.
 * @target: Device to check against.
 *
 * Walk the dependency graph below @dev (its children and its consumers,
 * recursively) looking for @target.  Consumers linked only through a
 * SYNC_STATE_ONLY link are not followed.
 *
 * Return: 1 if @target is @dev itself, a descendant of @dev or is reachable
 * from @dev through the walk described above; 0 otherwise.
 */
static int device_is_dependent(struct device *dev, void *target)
{
        struct device_link *dl;
        int ret;

        /*
         * A target that is still being initialized may be absent from its
         * parent's list of children, so the walk over children below could
         * miss it.  Checking the ancestors of the target covers that case.
         */
        if (dev == target || device_is_ancestor(dev, target))
                return 1;

        ret = device_for_each_child(dev, target, device_is_dependent);
        if (ret)
                return ret;

        list_for_each_entry(dl, &dev->links.consumers, s_node) {
                if (device_link_flag_is_sync_state_only(dl->flags))
                        continue;

                if (dl->consumer == target)
                        return 1;

                ret = device_is_dependent(dl->consumer, target);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Derive the initial status of @link from the current links.status of its
 * @supplier and @consumer devices.
 */
static void device_link_init_status(struct device_link *link,
                                    struct device *consumer,
                                    struct device *supplier)
{
        int consumer_state = consumer->links.status;

        switch (supplier->links.status) {
        case DL_DEV_PROBING:
                /*
                 * A consumer driver may link to a supplier that is still
                 * probing, provided it knows the supplier is functional
                 * already (e.g. it has just obtained resources from it).
                 */
                if (consumer_state == DL_DEV_PROBING)
                        link->status = DL_STATE_CONSUMER_PROBE;
                else
                        link->status = DL_STATE_DORMANT;
                break;
        case DL_DEV_DRIVER_BOUND:
                if (consumer_state == DL_DEV_PROBING)
                        link->status = DL_STATE_CONSUMER_PROBE;
                else if (consumer_state == DL_DEV_DRIVER_BOUND)
                        link->status = DL_STATE_ACTIVE;
                else
                        link->status = DL_STATE_AVAILABLE;
                break;
        case DL_DEV_UNBINDING:
                link->status = DL_STATE_SUPPLIER_UNBIND;
                break;
        default:
                link->status = DL_STATE_DORMANT;
                break;
        }
}

/*
 * Move @dev, then all of its children, then all of its consumers (other than
 * SYNC_STATE_ONLY ones) to the ends of the device lists, recursively.  The
 * signature matches the device_for_each_child() callback; @not_used is
 * ignored and 0 is always returned.
 */
static int device_reorder_to_tail(struct device *dev, void *not_used)
{
        struct device_link *dl;

        /*
         * A device that is not registered yet gets placed at the ends of the
         * lists when it is registered, so there is nothing to move for it.
         */
        if (device_is_registered(dev))
                devices_kset_move_last(dev);

        if (device_pm_initialized(dev))
                device_pm_move_last(dev);

        device_for_each_child(dev, NULL, device_reorder_to_tail);

        list_for_each_entry(dl, &dev->links.consumers, s_node) {
                if (!device_link_flag_is_sync_state_only(dl->flags))
                        device_reorder_to_tail(dl->consumer, NULL);
        }

        return 0;
}

/**
 * device_pm_move_to_tail - Move set of devices to the end of device lists
 * @dev: Device to move
 *
 * Wrapper around device_reorder_to_tail() that takes the device links read
 * lock and the device PM lock around the call.
 *
 * @dev, all of its children and all of its consumers are moved, recursively,
 * to the ends of the device_kset and dpm_list.
 */
void device_pm_move_to_tail(struct device *dev)
{
        int srcu_idx = device_links_read_lock();

        device_pm_lock();
        device_reorder_to_tail(dev, NULL);
        device_pm_unlock();

        device_links_read_unlock(srcu_idx);
}

#define to_devlink(dev)        container_of((dev), struct device_link, link_dev)

static ssize_t status_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        const char *output;

        switch (to_devlink(dev)->status) {
        case DL_STATE_NONE:
                output = "not tracked";
                break;
        case DL_STATE_DORMANT:
                output = "dormant";
                break;
        case DL_STATE_AVAILABLE:
                output = "available";
                break;
        case DL_STATE_CONSUMER_PROBE:
                output = "consumer probing";
                break;
        case DL_STATE_ACTIVE:
                output = "active";
                break;
        case DL_STATE_SUPPLIER_UNBIND:
                output = "supplier unbinding";
                break;
        default:
                output = "unknown";
                break;
        }

        return sysfs_emit(buf, "%s\n", output);
}
static DEVICE_ATTR_RO(status);

/*
 * sysfs "auto_remove_on" attribute: which AUTOREMOVE flag, if any, is set on
 * the link.  DL_FLAG_AUTOREMOVE_SUPPLIER is tested first.
 */
static ssize_t auto_remove_on_show(struct device *dev,
                                   struct device_attribute *attr, char *buf)
{
        struct device_link *link = to_devlink(dev);
        const char *when = "never";

        if (link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
                when = "supplier unbind";
        else if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER)
                when = "consumer unbind";

        return sysfs_emit(buf, "%s\n", when);
}
static DEVICE_ATTR_RO(auto_remove_on);

static ssize_t runtime_pm_show(struct device *dev,
                               struct device_attribute *attr, char *buf)
{
        struct device_link *link = to_devlink(dev);

        return sysfs_emit(buf, "%d\n", !!(link->flags & DL_FLAG_PM_RUNTIME));
}
static DEVICE_ATTR_RO(runtime_pm);

static ssize_t sync_state_only_show(struct device *dev,
                                    struct device_attribute *attr, char *buf)
{
        struct device_link *link = to_devlink(dev);

        return sysfs_emit(buf, "%d\n",
                          !!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
}
static DEVICE_ATTR_RO(sync_state_only);

/*
 * Read-only sysfs attributes of a device link device.  ATTRIBUTE_GROUPS()
 * turns this array into devlink_groups, which devlink_class references.
 */
static struct attribute *devlink_attrs[] = {
        &dev_attr_status.attr,
        &dev_attr_auto_remove_on.attr,
        &dev_attr_runtime_pm.attr,
        &dev_attr_sync_state_only.attr,
        NULL,
};
ATTRIBUTE_GROUPS(devlink);

/*
 * Work function that finishes the release of a device link: waits for SRCU
 * readers, settles the supplier's runtime PM state, drops the references to
 * the consumer and supplier devices and frees the link.
 */
static void device_link_release_fn(struct work_struct *work)
{
        struct device_link *dl = container_of(work, struct device_link, rm_work);
        struct device *sup = dl->supplier;

        /* No reader may still be looking at the link past this point. */
        device_link_synchronize_removal();

        pm_runtime_release_supplier(dl);

        /*
         * supplier_preactivated being set means that the link was dropped
         * between the pm_runtime_get_suppliers() and
         * pm_runtime_put_suppliers() calls in __driver_probe_device().  The
         * reference taken by pm_runtime_get_suppliers() on the supplier's
         * PM-runtime usage counter has to be dropped here then.
         */
        if (dl->supplier_preactivated)
                pm_runtime_put_noidle(sup);

        pm_request_idle(sup);

        put_device(dl->consumer);
        put_device(sup);
        kfree(dl);
}

/*
 * Release callback of the devlink class: hand the actual teardown of the
 * link over to device_link_release_fn() running from device_link_wq.
 */
static void devlink_dev_release(struct device *dev)
{
        struct device_link *dl = to_devlink(dev);

        INIT_WORK(&dl->rm_work, device_link_release_fn);

        /*
         * The work may take a while: device_link_release_fn() waits for SRCU
         * synchronization and the consumer or supplier device may end up
         * being deleted while it runs.  Use the dedicated workqueue for it.
         */
        queue_work(device_link_wq, &dl->rm_work);
}

/**
 * device_link_wait_removal - Wait for ongoing devlink removal jobs to terminate
 *
 * Flushes device_link_wq, the workqueue on which devlink_dev_release() queues
 * the link release work.
 */
void device_link_wait_removal(void)
{
        /*
         * devlink removal jobs are queued in the dedicated work queue.
         * To be sure that all removal jobs are terminated, ensure that any
         * scheduled work has run to completion.
         */
        flush_workqueue(device_link_wq);
}
EXPORT_SYMBOL_GPL(device_link_wait_removal);

/*
 * Class of the devices embedded in device links (link_dev).  It provides the
 * sysfs attribute groups defined above and the release callback that queues
 * the link teardown work.
 */
static const struct class devlink_class = {
        .name = "devlink",
        .dev_groups = devlink_groups,
        .dev_release = devlink_dev_release,
};

/*
 * Create the four sysfs symlinks that go with a device link:
 *  - "supplier" and "consumer" in the link device's directory, pointing at
 *    the supplier and consumer devices;
 *  - "consumer:<bus>:<name>" in the supplier's directory and
 *    "supplier:<bus>:<name>" in the consumer's directory, both pointing at
 *    the link device.
 *
 * Returns 0 on success.  On failure, the symlinks created so far are removed
 * and the error from sysfs_create_link(), or -ENOMEM if building a symlink
 * name failed, is returned.
 */
static int devlink_add_symlinks(struct device *dev)
{
        /* Both name buffers are released by kfree() when they go out of scope. */
        char *buf_con __free(kfree) = NULL, *buf_sup __free(kfree) = NULL;
        int ret;
        struct device_link *link = to_devlink(dev);
        struct device *sup = link->supplier;
        struct device *con = link->consumer;

        ret = sysfs_create_link(&link->link_dev.kobj, &sup->kobj, "supplier");
        if (ret)
                goto out;

        ret = sysfs_create_link(&link->link_dev.kobj, &con->kobj, "consumer");
        if (ret)
                goto err_con;

        buf_con = kasprintf(GFP_KERNEL, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
        if (!buf_con) {
                ret = -ENOMEM;
                goto err_con_dev;
        }

        ret = sysfs_create_link(&sup->kobj, &link->link_dev.kobj, buf_con);
        if (ret)
                goto err_con_dev;

        buf_sup = kasprintf(GFP_KERNEL, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup));
        if (!buf_sup) {
                ret = -ENOMEM;
                goto err_sup_dev;
        }

        ret = sysfs_create_link(&con->kobj, &link->link_dev.kobj, buf_sup);
        if (ret)
                goto err_sup_dev;

        /* Success: skip the unwinding below (ret is 0 here). */
        goto out;

        /* Unwind in the reverse order of creation. */
err_sup_dev:
        sysfs_remove_link(&sup->kobj, buf_con);
err_con_dev:
        sysfs_remove_link(&link->link_dev.kobj, "consumer");
err_con:
        sysfs_remove_link(&link->link_dev.kobj, "supplier");
out:
        return ret;
}

static void devlink_remove_symlinks(struct device *dev)
{
        char *buf_con __free(kfree) = NULL, *buf_sup __free(kfree) = NULL;
        struct device_link *link = to_devlink(dev);
        struct device *sup = link->supplier;
        struct device *con = link->consumer;

        sysfs_remove_link(&link->link_dev.kobj, "consumer");
        sysfs_remove_link(&link->link_dev.kobj, "supplier");

        if (device_is_registered(con)) {
                buf_sup = kasprintf(GFP_KERNEL, "supplier:%s:%s", dev_bus_name(sup), dev_name(sup));
                if (!buf_sup)
                        goto out;
                sysfs_remove_link(&con->kobj, buf_sup);
        }

        buf_con = kasprintf(GFP_KERNEL, "consumer:%s:%s", dev_bus_name(con), dev_name(con));
        if (!buf_con)
                goto out;
        sysfs_remove_link(&sup->kobj, buf_con);

        return;

out:
        WARN(1, "Unable to properly free device link symlinks!\n");
}

/*
 * Class interface hooking the symlink creation and removal helpers above into
 * the addition and removal of devlink class devices.
 */
static struct class_interface devlink_class_intf = {
        .class = &devlink_class,
        .add_dev = devlink_add_symlinks,
        .remove_dev = devlink_remove_symlinks,
};

/*
 * Register the devlink class and its class interface.  If registering the
 * interface fails, the class is unregistered again.  Returns 0 on success or
 * the error code of the registration that failed.
 */
static int __init devlink_class_init(void)
{
        int err;

        err = class_register(&devlink_class);
        if (err)
                return err;

        err = class_interface_register(&devlink_class_intf);
        if (!err)
                return 0;

        class_unregister(&devlink_class);
        return err;
}
postcore_initcall(devlink_class_init);

/*
 * Flags that device_link_add() refuses to combine with DL_FLAG_STATELESS,
 * i.e. flags that only make sense for managed links.
 */
#define DL_MANAGED_LINK_FLAGS (DL_FLAG_AUTOREMOVE_CONSUMER | \
                               DL_FLAG_AUTOREMOVE_SUPPLIER | \
                               DL_FLAG_AUTOPROBE_CONSUMER  | \
                               DL_FLAG_SYNC_STATE_ONLY | \
                               DL_FLAG_INFERRED | \
                               DL_FLAG_CYCLE)

/* All flags device_link_add() accepts; any other bit makes it return NULL. */
#define DL_ADD_VALID_FLAGS (DL_MANAGED_LINK_FLAGS | DL_FLAG_STATELESS | \
                            DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE)

/**
 * device_link_add - Create a link between two devices.
 * @consumer: Consumer end of the link.
 * @supplier: Supplier end of the link.
 * @flags: Link flags.
 *
 * Return: On success, a device_link struct will be returned.
 *         On error or invalid flag settings, NULL will be returned.
 *
 * The caller is responsible for the proper synchronization of the link creation
 * with runtime PM.  First, setting the DL_FLAG_PM_RUNTIME flag will cause the
 * runtime PM framework to take the link into account.  Second, if the
 * DL_FLAG_RPM_ACTIVE flag is set in addition to it, the supplier devices will
 * be forced into the active meta state and reference-counted upon the creation
 * of the link.  If DL_FLAG_PM_RUNTIME is not set, DL_FLAG_RPM_ACTIVE will be
 * ignored.
 *
 * If DL_FLAG_STATELESS is set in @flags, the caller of this function is
 * expected to release the link returned by it directly with the help of either
 * device_link_del() or device_link_remove().
 *
 * If that flag is not set, however, the caller of this function is handing the
 * management of the link over to the driver core entirely and its return value
 * can only be used to check whether or not the link is present.  In that case,
 * the DL_FLAG_AUTOREMOVE_CONSUMER and DL_FLAG_AUTOREMOVE_SUPPLIER device link
 * flags can be used to indicate to the driver core when the link can be safely
 * deleted.  Namely, setting one of them in @flags indicates to the driver core
 * that the link is not going to be used (by the given caller of this function)
 * after unbinding the consumer or supplier driver, respectively, from its
 * device, so the link can be deleted at that point.  If none of them is set,
 * the link will be maintained until one of the devices pointed to by it (either
 * the consumer or the supplier) is unregistered.
 *
 * Also, if DL_FLAG_STATELESS, DL_FLAG_AUTOREMOVE_CONSUMER and
 * DL_FLAG_AUTOREMOVE_SUPPLIER are not set in @flags (that is, a persistent
 * managed device link is being added), the DL_FLAG_AUTOPROBE_CONSUMER flag can
 * be used to request the driver core to automatically probe for a consumer
 * driver after successfully binding a driver to the supplier device.
 *
 * The combination of DL_FLAG_STATELESS and one of DL_FLAG_AUTOREMOVE_CONSUMER,
 * DL_FLAG_AUTOREMOVE_SUPPLIER, or DL_FLAG_AUTOPROBE_CONSUMER set in @flags at
 * the same time is invalid and will cause NULL to be returned upfront.
 * However, if a device link between the given @consumer and @supplier pair
 * exists already when this function is called for them, the existing link will
 * be returned regardless of its current type and status (the link's flags may
 * be modified then).  The caller of this function is then expected to treat
 * the link as though it has just been created, so (in particular) if
 * DL_FLAG_STATELESS was passed in @flags, the link needs to be released
 * explicitly when not needed any more (as stated above).
 *
 * A side effect of the link creation is re-ordering of dpm_list and the
 * devices_kset list by moving the consumer device and all devices depending
 * on it to the ends of these lists (that does not happen to devices that have
 * not been registered when this function is called).
 *
 * The supplier device is required to be registered when this function is called
 * and NULL will be returned if that is not the case.  The consumer device need
 * not be registered, however.
 */
struct device_link *device_link_add(struct device *consumer,
                                    struct device *supplier, u32 flags)
{
        struct device_link *link;

        /* Reject missing/identical endpoints and invalid flag combinations. */
        if (!consumer || !supplier || consumer == supplier ||
            flags & ~DL_ADD_VALID_FLAGS ||
            (flags & DL_FLAG_STATELESS && flags & DL_MANAGED_LINK_FLAGS) ||
            (flags & DL_FLAG_AUTOPROBE_CONSUMER &&
             flags & (DL_FLAG_AUTOREMOVE_CONSUMER |
                      DL_FLAG_AUTOREMOVE_SUPPLIER)))
                return NULL;

        /*
         * DL_FLAG_SYNC_STATE_ONLY may only be combined with the marker flags.
         * Check that before the supplier is resumed below: returning after
         * pm_runtime_get_sync() without a matching put would leak a runtime
         * PM usage count on the supplier.  DL_FLAG_MANAGED is a marker flag,
         * so it does not matter that it has not been added to @flags yet.
         */
        if (flags & DL_FLAG_SYNC_STATE_ONLY &&
            !device_link_flag_is_sync_state_only(flags))
                return NULL;

        if (flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) {
                if (pm_runtime_get_sync(supplier) < 0) {
                        pm_runtime_put_noidle(supplier);
                        return NULL;
                }
        }

        if (!(flags & DL_FLAG_STATELESS))
                flags |= DL_FLAG_MANAGED;

        device_links_write_lock();
        device_pm_lock();

        /*
         * If the supplier has not been fully registered yet or there is a
         * reverse (non-SYNC_STATE_ONLY) dependency between the consumer and
         * the supplier already in the graph, return NULL. If the link is a
         * SYNC_STATE_ONLY link, we don't check for reverse dependencies
         * because it only affects sync_state() callbacks.
         */
        if (!device_pm_initialized(supplier)
            || (!(flags & DL_FLAG_SYNC_STATE_ONLY) &&
                  device_is_dependent(consumer, supplier))) {
                link = NULL;
                goto out;
        }

        /*
         * SYNC_STATE_ONLY links are useless once a consumer device has probed.
         * So, only create it if the consumer hasn't probed yet.
         */
        if (flags & DL_FLAG_SYNC_STATE_ONLY &&
            consumer->links.status != DL_DEV_NO_DRIVER &&
            consumer->links.status != DL_DEV_PROBING) {
                link = NULL;
                goto out;
        }

        /*
         * DL_FLAG_AUTOREMOVE_SUPPLIER indicates that the link will be needed
         * longer than for DL_FLAG_AUTOREMOVE_CONSUMER and setting them both
         * together doesn't make sense, so prefer DL_FLAG_AUTOREMOVE_SUPPLIER.
         */
        if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER)
                flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;

        /* Reuse (and possibly upgrade) an existing link between the pair. */
        list_for_each_entry(link, &supplier->links.consumers, s_node) {
                if (link->consumer != consumer)
                        continue;

                if (link->flags & DL_FLAG_INFERRED &&
                    !(flags & DL_FLAG_INFERRED))
                        link->flags &= ~DL_FLAG_INFERRED;

                if (flags & DL_FLAG_PM_RUNTIME) {
                        if (!(link->flags & DL_FLAG_PM_RUNTIME)) {
                                pm_runtime_new_link(consumer);
                                link->flags |= DL_FLAG_PM_RUNTIME;
                        }
                        if (flags & DL_FLAG_RPM_ACTIVE)
                                refcount_inc(&link->rpm_active);
                }

                if (flags & DL_FLAG_STATELESS) {
                        kref_get(&link->kref);
                        if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
                            !(link->flags & DL_FLAG_STATELESS)) {
                                link->flags |= DL_FLAG_STATELESS;
                                goto reorder;
                        } else {
                                link->flags |= DL_FLAG_STATELESS;
                                goto out;
                        }
                }

                /*
                 * If the life time of the link following from the new flags is
                 * longer than indicated by the flags of the existing link,
                 * update the existing link to stay around longer.
                 */
                if (flags & DL_FLAG_AUTOREMOVE_SUPPLIER) {
                        if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
                                link->flags &= ~DL_FLAG_AUTOREMOVE_CONSUMER;
                                link->flags |= DL_FLAG_AUTOREMOVE_SUPPLIER;
                        }
                } else if (!(flags & DL_FLAG_AUTOREMOVE_CONSUMER)) {
                        link->flags &= ~(DL_FLAG_AUTOREMOVE_CONSUMER |
                                         DL_FLAG_AUTOREMOVE_SUPPLIER);
                }
                if (!(link->flags & DL_FLAG_MANAGED)) {
                        kref_get(&link->kref);
                        link->flags |= DL_FLAG_MANAGED;
                        device_link_init_status(link, consumer, supplier);
                }
                if (link->flags & DL_FLAG_SYNC_STATE_ONLY &&
                    !(flags & DL_FLAG_SYNC_STATE_ONLY)) {
                        link->flags &= ~DL_FLAG_SYNC_STATE_ONLY;
                        goto reorder;
                }

                goto out;
        }

        /* No existing link: allocate and set up a new one. */
        link = kzalloc(sizeof(*link), GFP_KERNEL);
        if (!link)
                goto out;

        refcount_set(&link->rpm_active, 1);

        get_device(supplier);
        link->supplier = supplier;
        INIT_LIST_HEAD(&link->s_node);
        get_device(consumer);
        link->consumer = consumer;
        INIT_LIST_HEAD(&link->c_node);
        link->flags = flags;
        kref_init(&link->kref);

        link->link_dev.class = &devlink_class;
        device_set_pm_not_required(&link->link_dev);
        dev_set_name(&link->link_dev, "%s:%s--%s:%s",
                     dev_bus_name(supplier), dev_name(supplier),
                     dev_bus_name(consumer), dev_name(consumer));
        if (device_register(&link->link_dev)) {
                /*
                 * Dropping the reference to link_dev is what disposes of the
                 * link here (through the class release callback), so it must
                 * not be freed directly.
                 */
                put_device(&link->link_dev);
                link = NULL;
                goto out;
        }

        if (flags & DL_FLAG_PM_RUNTIME) {
                if (flags & DL_FLAG_RPM_ACTIVE)
                        refcount_inc(&link->rpm_active);

                pm_runtime_new_link(consumer);
        }

        /* Determine the initial link state. */
        if (flags & DL_FLAG_STATELESS)
                link->status = DL_STATE_NONE;
        else
                device_link_init_status(link, consumer, supplier);

        /*
         * Some callers expect the link creation during consumer driver probe to
         * resume the supplier even without DL_FLAG_RPM_ACTIVE.
         */
        if (link->status == DL_STATE_CONSUMER_PROBE &&
            flags & DL_FLAG_PM_RUNTIME)
                pm_runtime_resume(supplier);

        list_add_tail_rcu(&link->s_node, &supplier->links.consumers);
        list_add_tail_rcu(&link->c_node, &consumer->links.suppliers);

        if (flags & DL_FLAG_SYNC_STATE_ONLY) {
                dev_dbg(consumer,
                        "Linked as a sync state only consumer to %s\n",
                        dev_name(supplier));
                goto out;
        }

reorder:
        /*
         * Move the consumer and all of the devices depending on it to the end
         * of dpm_list and the devices_kset list.
         *
         * It is necessary to hold dpm_list locked throughout all that or else
         * we may end up suspending with a wrong ordering of it.
         */
        device_reorder_to_tail(consumer, NULL);

        dev_dbg(consumer, "Linked as a consumer to %s\n", dev_name(supplier));

out:
        device_pm_unlock();
        device_links_write_unlock();

        /* On failure, drop the usage count taken by pm_runtime_get_sync(). */
        if ((flags & DL_FLAG_PM_RUNTIME && flags & DL_FLAG_RPM_ACTIVE) && !link)
                pm_runtime_put(supplier);

        return link;
}
EXPORT_SYMBOL_GPL(device_link_add);

/*
 * kref release callback for a device link: drop its runtime PM bookkeeping,
 * take it off the supplier's and consumer's lists and unregister the link
 * device.
 */
static void __device_link_del(struct kref *kref)
{
        struct device_link *dl = container_of(kref, struct device_link, kref);

        dev_dbg(dl->consumer, "Dropping the link to %s\n",
                dev_name(dl->supplier));

        pm_runtime_drop_link(dl);

        device_link_remove_from_lists(dl);
        device_unregister(&dl->link_dev);
}

/*
 * Drop a reference to @link on behalf of a device_link_del() or
 * device_link_remove() caller.
 *
 * A stateless link has its kref put.  A managed link is deleted outright if
 * its consumer is not registered; otherwise the request is refused with a
 * warning.
 */
static void device_link_put_kref(struct device_link *link)
{
        if (link->flags & DL_FLAG_STATELESS) {
                kref_put(&link->kref, __device_link_del);
                return;
        }

        if (!device_is_registered(link->consumer)) {
                __device_link_del(&link->kref);
                return;
        }

        WARN(1, "Unable to drop a managed device link reference\n");
}

/**
 * device_link_del - Delete a stateless link between two devices.
 * @link: Device link to delete.
 *
 * The caller must ensure proper synchronization of this function with runtime
 * PM.  If the link was added multiple times, it needs to be deleted as often.
 * Care is required for hotplugged devices:  Their links are purged on removal
 * and calling device_link_del() is then no longer allowed.
 *
 * The reference is dropped with the device links write lock held.
 */
void device_link_del(struct device_link *link)
{
        device_links_write_lock();
        device_link_put_kref(link);
        device_links_write_unlock();
}
EXPORT_SYMBOL_GPL(device_link_del);

/**
 * device_link_remove - Delete a stateless link between two devices.
 * @consumer: Consumer end of the link.
 * @supplier: Supplier end of the link.
 *
 * Looks up the link from @supplier to @consumer on the supplier's list of
 * consumers and, if there is one, drops a reference to it.  Nothing is done
 * if no such link exists.
 *
 * The caller must ensure proper synchronization of this function with runtime
 * PM.
 */
void device_link_remove(void *consumer, struct device *supplier)
{
        struct device_link *dl;

        if (WARN_ON(consumer == supplier))
                return;

        device_links_write_lock();

        list_for_each_entry(dl, &supplier->links.consumers, s_node) {
                if (dl->consumer != consumer)
                        continue;

                /* The link may be gone after this; stop iterating. */
                device_link_put_kref(dl);
                break;
        }

        device_links_write_unlock();
}
EXPORT_SYMBOL_GPL(device_link_remove);

/*
 * Take every supplier link of @dev that is in the "consumer probe" state back
 * out of it: to "available" if the supplier has a driver bound, to "dormant"
 * otherwise.  The latter is only expected for SYNC_STATE_ONLY links and a
 * warning is emitted for any other link.
 */
static void device_links_missing_supplier(struct device *dev)
{
        struct device_link *dl;

        list_for_each_entry(dl, &dev->links.suppliers, c_node) {
                if (dl->status != DL_STATE_CONSUMER_PROBE)
                        continue;

                if (dl->supplier->links.status != DL_DEV_DRIVER_BOUND) {
                        WARN_ON(!(dl->flags & DL_FLAG_SYNC_STATE_ONLY));
                        WRITE_ONCE(dl->status, DL_STATE_DORMANT);
                } else {
                        WRITE_ONCE(dl->status, DL_STATE_AVAILABLE);
                }
        }
}

static bool dev_is_best_effort(struct device *dev)
{
        return (fw_devlink_best_effort && dev->can_match) ||
                (dev->fwnode && (dev->fwnode->flags & FWNODE_FLAG_BEST_EFFORT));
}

static struct fwnode_handle *fwnode_links_check_suppliers(
                                                struct fwnode_handle *fwnode)
{
        struct fwnode_link *link;

        if (!fwnode || fw_devlink_is_permissive())
                return NULL;

        list_for_each_entry(link, &fwnode->suppliers, c_hook)
                if (!(link->flags &
                      (FWLINK_FLAG_CYCLE | FWLINK_FLAG_IGNORE)))
                        return link->supplier;

        return NULL;
}

/**
 * device_links_check_suppliers - Check presence of supplier drivers.
 * @dev: Consumer device.
 *
 * Check links from this device to any suppliers.  Walk the list of the device's
 * links to suppliers and see if all of them are available.  If not, simply
 * return -EPROBE_DEFER.
 *
 * We need to guarantee that the supplier will not go away after the check has
 * been positive here.  It only can go away in __device_release_driver() and
 * that function checks the device's links to consumers.  This means we need to
 * mark the link as "consumer probe in progress" to make the supplier removal
 * wait for us to complete (or bad things may happen).
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 *
 * Return: 0 if nothing stands in the way of probing @dev, the result of
 * dev_err_probe() called with -EPROBE_DEFER if a supplier is missing, or
 * -EAGAIN if a supplier is missing but @dev is handled on a best-effort basis.
 */
int device_links_check_suppliers(struct device *dev)
{
        struct device_link *link;
        int ret = 0, fwnode_ret = 0;
        struct fwnode_handle *sup_fw;

        /*
         * Device waiting for supplier to become available is not allowed to
         * probe.
         */
        scoped_guard(mutex, &fwnode_link_lock) {
                sup_fw = fwnode_links_check_suppliers(dev->fwnode);
                if (sup_fw) {
                        /*
                         * Best-effort devices only record -EAGAIN here and go
                         * on to the device link checks below; all others
                         * defer right away.
                         */
                        if (dev_is_best_effort(dev))
                                fwnode_ret = -EAGAIN;
                        else
                                return dev_err_probe(dev, -EPROBE_DEFER,
                                                     "wait for supplier %pfwf\n", sup_fw);
                }
        }

        device_links_write_lock();

        list_for_each_entry(link, &dev->links.suppliers, c_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                if (link->status != DL_STATE_AVAILABLE &&
                    !(link->flags & DL_FLAG_SYNC_STATE_ONLY)) {

                        /*
                         * For a best-effort device, an inferred link to a
                         * supplier that does not have can_match set does not
                         * block probing; note -EAGAIN and keep going.
                         */
                        if (dev_is_best_effort(dev) &&
                            link->flags & DL_FLAG_INFERRED &&
                            !link->supplier->can_match) {
                                ret = -EAGAIN;
                                continue;
                        }

                        /* Undo the "consumer probe" marking done so far. */
                        device_links_missing_supplier(dev);
                        ret = dev_err_probe(dev, -EPROBE_DEFER,
                                            "supplier %s not ready\n", dev_name(link->supplier));
                        break;
                }
                WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE);
        }
        /* Set on every path that reaches this point, including deferral. */
        dev->links.status = DL_DEV_PROBING;

        device_links_write_unlock();

        /* A result from the device link walk takes precedence. */
        return ret ? ret : fwnode_ret;
}

/**
 * __device_links_queue_sync_state - Queue a device for sync_state() callback
 * @dev: Device to call sync_state() on
 * @list: List head to queue the @dev on
 *
 * Puts @dev on @list so that its sync_state() callback can be invoked later,
 * when the device links write lock is no longer held.  That lets the
 * sync_state() execution flow use device links APIs.  This function itself
 * must be called with device_links_write_lock() held.
 *
 * Nothing is queued if @dev has no sync_state(), has had its state synced
 * already, or still has a managed consumer link that is not active.
 *
 * A reference to @dev is taken with get_device() so that it cannot be freed
 * while it is on the list.
 *
 * The caller must therefore call device_links_flush_sync_list() as soon as it
 * has released device_links_write_lock(), both to have sync_state() invoked
 * in a timely fashion and to have put_device() called on the device.
 */
static void __device_links_queue_sync_state(struct device *dev,
                                            struct list_head *list)
{
        struct device_link *dl;

        if (!dev_has_sync_state(dev) || dev->state_synced)
                return;

        list_for_each_entry(dl, &dev->links.consumers, s_node) {
                if ((dl->flags & DL_FLAG_MANAGED) &&
                    dl->status != DL_STATE_ACTIVE)
                        return;
        }

        /*
         * Mark the device as synced right away, so that it cannot be queued
         * a second time if more consumers are added to it and probed before
         * the list gets flushed.
         */
        dev->state_synced = true;

        if (WARN_ON(!list_empty(&dev->links.defer_sync)))
                return;

        get_device(dev);
        list_add_tail(&dev->links.defer_sync, list);
}

/**
 * device_links_flush_sync_list - Call sync_state() on a list of devices
 * @list: List of devices to call sync_state() on
 * @dont_lock_dev: Device for which lock is already held by the caller
 *
 * Counterpart of __device_links_queue_sync_state(): walks @list, removes each
 * queued device from it, calls sync_state() on the device and then drops the
 * device reference with put_device().
 *
 * Each device is locked around its sync_state() call, except @dont_lock_dev,
 * which is useful when this function is called from a context where that
 * device's lock is already held.
 */
static void device_links_flush_sync_list(struct list_head *list,
                                         struct device *dont_lock_dev)
{
        struct device *dev, *next;

        list_for_each_entry_safe(dev, next, list, links.defer_sync) {
                const bool need_lock = dev != dont_lock_dev;

                list_del_init(&dev->links.defer_sync);

                if (need_lock)
                        device_lock(dev);

                dev_sync_state(dev);

                if (need_lock)
                        device_unlock(dev);

                /* Drop the reference that kept the device on the list. */
                put_device(dev);
        }
}

/*
 * Bump the sync_state() pause count. While the count is non-zero,
 * device_links_driver_bound() defers suppliers onto the deferred_sync list
 * instead of queueing them for an immediate sync_state() call.
 */
void device_links_supplier_sync_state_pause(void)
{
        device_links_write_lock();
        ++defer_sync_state_count;
        device_links_write_unlock();
}

/*
 * Drop one sync_state() pause. When the last pause goes away, every device
 * parked on the deferred_sync list is re-evaluated and, if eligible, gets its
 * sync_state() called after the device links write lock has been released.
 */
void device_links_supplier_sync_state_resume(void)
{
        struct device *dev, *next;
        LIST_HEAD(sync_list);

        device_links_write_lock();

        if (!defer_sync_state_count) {
                WARN(true, "Unmatched sync_state pause/resume!");
        } else if (--defer_sync_state_count == 0) {
                list_for_each_entry_safe(dev, next, &deferred_sync,
                                         links.defer_sync) {
                        /*
                         * links.defer_sync is the list node for both
                         * deferred_sync and sync_list, so the device has to
                         * leave the former before it can join the latter.
                         */
                        list_del_init(&dev->links.defer_sync);
                        __device_links_queue_sync_state(dev, &sync_list);
                }
        }

        device_links_write_unlock();

        /* sync_list is simply empty if nothing was queued above. */
        device_links_flush_sync_list(&sync_list, NULL);
}

/*
 * Late initcall that issues one device_links_supplier_sync_state_resume().
 * NOTE(review): presumably this balances an initial pause count established
 * where defer_sync_state_count is defined - confirm against its definition.
 */
static int sync_state_resume_initcall(void)
{
        device_links_supplier_sync_state_resume();
        return 0;
}
late_initcall(sync_state_resume_initcall);

/*
 * Park @sup on the deferred_sync list, unless it is already on a sync list or
 * has no sync_state() to call.
 */
static void __device_links_supplier_defer_sync(struct device *sup)
{
        if (!list_empty(&sup->links.defer_sync))
                return;

        if (!dev_has_sync_state(sup))
                return;

        list_add_tail(&sup->links.defer_sync, &deferred_sync);
}

/*
 * Stop treating @link as a managed link: clear DL_FLAG_MANAGED, reset the
 * status to DL_STATE_NONE and drop one reference on the link.
 *
 * The kref_put() may invoke __device_link_del(), so callers must not rely on
 * @link still being usable after this returns.
 */
static void device_link_drop_managed(struct device_link *link)
{
        link->flags &= ~DL_FLAG_MANAGED;
        WRITE_ONCE(link->status, DL_STATE_NONE);
        kref_put(&link->kref, __device_link_del);
}

static ssize_t waiting_for_supplier_show(struct device *dev,
                                         struct device_attribute *attr,
                                         char *buf)
{
        bool val;

        device_lock(dev);
        scoped_guard(mutex, &fwnode_link_lock)
                val = !!fwnode_links_check_suppliers(dev->fwnode);
        device_unlock(dev);
        return sysfs_emit(buf, "%u\n", val);
}
static DEVICE_ATTR_RO(waiting_for_supplier);

/**
 * device_links_force_bind - Prepares device to be force bound
 * @dev: Consumer device.
 *
 * device_bind_driver() force binds a device to a driver without calling any
 * driver probe functions. So the consumer really isn't going to wait for any
 * supplier before it's bound to the driver. We still want the device link
 * states to be sensible when this happens.
 *
 * In preparation for device_bind_driver(), this function goes through each
 * supplier device links and checks if the supplier is bound. If it is, then
 * the device link status is set to CONSUMER_PROBE. Otherwise, the device link
 * is dropped. Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_force_bind(struct device *dev)
{
        struct device_link *link, *next;

        device_links_write_lock();

        list_for_each_entry_safe(link, next, &dev->links.suppliers, c_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /*
                 * Links to available suppliers proceed as in a normal probe;
                 * any other managed link is dropped since the consumer is not
                 * going to wait for that supplier.
                 */
                if (link->status == DL_STATE_AVAILABLE) {
                        WRITE_ONCE(link->status, DL_STATE_CONSUMER_PROBE);
                } else {
                        device_link_drop_managed(link);
                }
        }
        dev->links.status = DL_DEV_PROBING;

        device_links_write_unlock();
}

/**
 * device_links_driver_bound - Update device links after probing its driver.
 * @dev: Device to update the links for.
 *
 * The probe has been successful, so update links from this device to any
 * consumers by changing their status to "available".
 *
 * Also change the status of @dev's links to suppliers to "active".
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_driver_bound(struct device *dev)
{
        struct device_link *link, *ln;
        /* Devices whose sync_state() must run once the write lock is dropped. */
        LIST_HEAD(sync_list);

        /*
         * If a device binds successfully, it's expected to have created all
         * the device links it needs to or make new device links as it needs
         * them. So, fw_devlink no longer needs to create device links to any
         * of the device's suppliers.
         *
         * Also, if a child firmware node of this bound device is not added as a
         * device by now, assume it is never going to be added. Make this bound
         * device the fallback supplier to the dangling consumers of the child
         * firmware node because this bound device is probably implementing the
         * child firmware node functionality and we don't want the dangling
         * consumers to defer probe indefinitely waiting for a device for the
         * child firmware node.
         */
        if (dev->fwnode && dev->fwnode->dev == dev) {
                struct fwnode_handle *child;

                fwnode_links_purge_suppliers(dev->fwnode);

                /* Scope-based guard: fwnode_link_lock is held to the end of this block. */
                guard(mutex)(&fwnode_link_lock);

                fwnode_for_each_available_child_node(dev->fwnode, child)
                        __fw_devlink_pickup_dangling_consumers(child,
                                                               dev->fwnode);
                __fw_devlink_link_to_consumers(dev);
        }
        device_remove_file(dev, &dev_attr_waiting_for_supplier);

        device_links_write_lock();

        /* First pass: links in which @dev is the supplier. */
        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /*
                 * Links created during consumer probe may be in the "consumer
                 * probe" state to start with if the supplier is still probing
                 * when they are created and they may become "active" if the
                 * consumer probe returns first.  Skip them here.
                 */
                if (link->status == DL_STATE_CONSUMER_PROBE ||
                    link->status == DL_STATE_ACTIVE)
                        continue;

                WARN_ON(link->status != DL_STATE_DORMANT);
                WRITE_ONCE(link->status, DL_STATE_AVAILABLE);

                /* The supplier is now available, so let the consumer retry probing. */
                if (link->flags & DL_FLAG_AUTOPROBE_CONSUMER)
                        driver_deferred_probe_add(link->consumer);
        }

        /* Queue or defer @dev's own sync_state(), depending on the pause count. */
        if (defer_sync_state_count)
                __device_links_supplier_defer_sync(dev);
        else
                __device_links_queue_sync_state(dev, &sync_list);

        /* Second pass: links in which @dev is the consumer. */
        list_for_each_entry_safe(link, ln, &dev->links.suppliers, c_node) {
                struct device *supplier;

                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /* Cache the supplier: the link may be dropped below. */
                supplier = link->supplier;
                if (link->flags & DL_FLAG_SYNC_STATE_ONLY) {
                        /*
                         * When DL_FLAG_SYNC_STATE_ONLY is set, it means no
                         * other DL_MANAGED_LINK_FLAGS have been set. So, it's
                         * safe to drop the managed link completely.
                         */
                        device_link_drop_managed(link);
                } else if (dev_is_best_effort(dev) &&
                           link->flags & DL_FLAG_INFERRED &&
                           link->status != DL_STATE_CONSUMER_PROBE &&
                           !link->supplier->can_match) {
                        /*
                         * When dev_is_best_effort() is true, we ignore device
                         * links to suppliers that don't have a driver.  If the
                         * consumer device still managed to probe, there's no
                         * point in maintaining a device link in a weird state
                         * (consumer probed before supplier). So delete it.
                         */
                        device_link_drop_managed(link);
                } else {
                        WARN_ON(link->status != DL_STATE_CONSUMER_PROBE);
                        WRITE_ONCE(link->status, DL_STATE_ACTIVE);
                }

                /*
                 * This needs to be done even for the deleted
                 * DL_FLAG_SYNC_STATE_ONLY device link in case it was the last
                 * device link that was preventing the supplier from getting a
                 * sync_state() call.
                 */
                if (defer_sync_state_count)
                        __device_links_supplier_defer_sync(supplier);
                else
                        __device_links_queue_sync_state(supplier, &sync_list);
        }

        dev->links.status = DL_DEV_DRIVER_BOUND;

        device_links_write_unlock();

        /* @dev is passed as dont_lock_dev, so the flush does not take its device lock. */
        device_links_flush_sync_list(&sync_list, dev);
}

/**
 * __device_links_no_driver - Update links of a device without a driver.
 * @dev: Device without a driver.
 *
 * Drop the managed links from this device to its suppliers that have
 * DL_FLAG_AUTOREMOVE_CONSUMER set.
 *
 * The remaining (persistent) links stay around. Those in the "consumer probe"
 * or "active" state are moved to "available" if the supplier is bound to a
 * driver and to "dormant" otherwise; links in any other state (for example
 * "supplier unbind in progress") are left untouched.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
static void __device_links_no_driver(struct device *dev)
{
        struct device_link *link, *prev;

        list_for_each_entry_safe_reverse(link, prev, &dev->links.suppliers, c_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                if (link->flags & DL_FLAG_AUTOREMOVE_CONSUMER) {
                        device_link_drop_managed(link);
                        continue;
                }

                /* Only links tied to this device's probe/bound state change. */
                if (!(link->status == DL_STATE_CONSUMER_PROBE ||
                      link->status == DL_STATE_ACTIVE))
                        continue;

                if (link->supplier->links.status == DL_DEV_DRIVER_BOUND) {
                        WRITE_ONCE(link->status, DL_STATE_AVAILABLE);
                        continue;
                }

                /* Supplier not bound: only expected for SYNC_STATE_ONLY links. */
                WARN_ON(!(link->flags & DL_FLAG_SYNC_STATE_ONLY));
                WRITE_ONCE(link->status, DL_STATE_DORMANT);
        }

        dev->links.status = DL_DEV_NO_DRIVER;
}

/**
 * device_links_no_driver - Update links after failing driver probe.
 * @dev: Device whose driver has just failed to probe.
 *
 * Clean up leftover links to consumers for @dev and invoke
 * %__device_links_no_driver() to update links to suppliers for it as
 * appropriate.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_no_driver(struct device *dev)
{
        struct device_link *link;

        device_links_write_lock();

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                switch (link->status) {
                case DL_STATE_CONSUMER_PROBE:
                case DL_STATE_ACTIVE:
                        /*
                         * The probe of @dev has failed, so a link in one of
                         * these states must have been added by a probing
                         * consumer while @dev itself was still probing. The
                         * relationship is valid but not functionally
                         * meaningful, hence "dormant".
                         */
                        WRITE_ONCE(link->status, DL_STATE_DORMANT);
                        break;
                default:
                        break;
                }
        }

        /* Now take care of the links from @dev to its suppliers. */
        __device_links_no_driver(dev);

        device_links_write_unlock();
}

/**
 * device_links_driver_cleanup - Update links after driver removal.
 * @dev: Device whose driver has just gone away.
 *
 * Update links to consumers for @dev by changing their status to "dormant" and
 * invoke %__device_links_no_driver() to update links to suppliers for it as
 * appropriate.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_driver_cleanup(struct device *dev)
{
        struct device_link *link, *ln;

        device_links_write_lock();

        list_for_each_entry_safe(link, ln, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                WARN_ON(link->flags & DL_FLAG_AUTOREMOVE_CONSUMER);
                WARN_ON(link->status != DL_STATE_SUPPLIER_UNBIND);

                /*
                 * autoremove the links between this @dev and its consumer
                 * devices that are not active, i.e. where the link state
                 * has moved to DL_STATE_SUPPLIER_UNBIND.
                 *
                 * device_link_drop_managed() may drop the last reference to
                 * the link, so the link must not be touched afterwards. If
                 * another reference keeps it alive, it is no longer a managed
                 * link and keeps the DL_STATE_NONE status that
                 * device_link_drop_managed() has just given it.
                 */
                if (link->status == DL_STATE_SUPPLIER_UNBIND &&
                    link->flags & DL_FLAG_AUTOREMOVE_SUPPLIER) {
                        device_link_drop_managed(link);
                        continue;
                }

                WRITE_ONCE(link->status, DL_STATE_DORMANT);
        }

        /* The driver is gone, so @dev must not stay queued for sync_state(). */
        list_del_init(&dev->links.defer_sync);
        __device_links_no_driver(dev);

        device_links_write_unlock();
}

/**
 * device_links_busy - Check if there are any busy links to consumers.
 * @dev: Device to check.
 *
 * Check each consumer of the device and return 'true' if its link's status
 * is one of "consumer probe" or "active" (meaning that the given consumer is
 * probing right now or its driver is present).  Otherwise, change the link
 * state to "supplier unbind" to prevent the consumer from being probed
 * successfully going forward.
 *
 * Return 'false' if there are no probing or active consumers.
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
bool device_links_busy(struct device *dev)
{
        struct device_link *link;
        bool busy = false;

        device_links_write_lock();

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;

                /* A probing or bound consumer makes the supplier busy. */
                busy = link->status == DL_STATE_CONSUMER_PROBE ||
                       link->status == DL_STATE_ACTIVE;
                if (busy)
                        break;

                /* Otherwise keep this consumer from probing successfully. */
                WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND);
        }

        dev->links.status = DL_DEV_UNBINDING;

        device_links_write_unlock();

        return busy;
}

/**
 * device_links_unbind_consumers - Force unbind consumers of the given device.
 * @dev: Device to unbind the consumers of.
 *
 * Walk the list of links to consumers for @dev and if any of them is in the
 * "consumer probe" state, wait for all device probes in progress to complete
 * and start over.
 *
 * If that's not the case, change the status of the link to "supplier unbind"
 * and check if the link was in the "active" state.  If so, force the consumer
 * driver to unbind and start over (the consumer will not re-probe as we have
 * changed the state of the link already).
 *
 * Links without the DL_FLAG_MANAGED flag set are ignored.
 */
void device_links_unbind_consumers(struct device *dev)
{
        struct device_link *link;

restart:
        device_links_write_lock();

        list_for_each_entry(link, &dev->links.consumers, s_node) {
                enum device_link_state prev_status;
                struct device *consumer;

                if (!(link->flags & DL_FLAG_MANAGED))
                        continue;
                if (link->flags & DL_FLAG_SYNC_STATE_ONLY)
                        continue;

                prev_status = link->status;
                if (prev_status == DL_STATE_CONSUMER_PROBE) {
                        /*
                         * A consumer is probing right now: let all probes in
                         * progress finish, then walk the list again.
                         */
                        device_links_write_unlock();

                        wait_for_device_probe();
                        goto restart;
                }

                WRITE_ONCE(link->status, DL_STATE_SUPPLIER_UNBIND);
                if (prev_status != DL_STATE_ACTIVE)
                        continue;

                /*
                 * The consumer is bound: force its driver to unbind with the
                 * lock dropped, holding a reference on the consumer across
                 * the call, and then start over.
                 */
                consumer = link->consumer;
                get_device(consumer);

                device_links_write_unlock();

                device_release_driver_internal(consumer, NULL,
                                               consumer->parent);
                put_device(consumer);
                goto restart;
        }

        device_links_write_unlock();
}

/**
 * device_links_purge - Delete existing links to other devices.
 * @dev: Target device.
 */
static void device_links_purge(struct device *dev)
{
        struct device_link *link, *tmp;

        /* Devices belonging to devlink_class are left alone. */
        if (dev->class == &devlink_class)
                return;

        device_links_write_lock();

        /* Delete whatever links remain from this device to its suppliers ... */
        list_for_each_entry_safe_reverse(link, tmp, &dev->links.suppliers, c_node) {
                WARN_ON(link->status == DL_STATE_ACTIVE);
                __device_link_del(&link->kref);
        }

        /* ... and whatever links remain from this device to its consumers. */
        list_for_each_entry_safe_reverse(link, tmp, &dev->links.consumers, s_node) {
                WARN_ON(link->status != DL_STATE_DORMANT &&
                        link->status != DL_STATE_NONE);
                __device_link_del(&link->kref);
        }

        device_links_write_unlock();
}

/*
 * Device link flag sets selected by the "fw_devlink" early parameter (see
 * fw_devlink_setup()):
 *   permissive - inferred links that are DL_FLAG_SYNC_STATE_ONLY
 *   on         - inferred links with DL_FLAG_AUTOPROBE_CONSUMER
 *   rpm        - same as "on", plus DL_FLAG_PM_RUNTIME
 * "off" is represented by a flags value of 0.
 */
#define FW_DEVLINK_FLAGS_PERMISSIVE        (DL_FLAG_INFERRED | \
                                         DL_FLAG_SYNC_STATE_ONLY)
#define FW_DEVLINK_FLAGS_ON                (DL_FLAG_INFERRED | \
                                         DL_FLAG_AUTOPROBE_CONSUMER)
#define FW_DEVLINK_FLAGS_RPM                (FW_DEVLINK_FLAGS_ON | \
                                         DL_FLAG_PM_RUNTIME)

/* Currently selected flag set; "rpm" unless overridden on the command line. */
static u32 fw_devlink_flags = FW_DEVLINK_FLAGS_RPM;
/*
 * Parse the "fw_devlink" early parameter.
 *
 * Accepted values are "off", "permissive", "on" and "rpm". A missing or
 * unrecognised value leaves fw_devlink_flags untouched and is reported with
 * -EINVAL, matching what fw_devlink_sync_state_setup() does for
 * "fw_devlink.sync_state".
 *
 * Return: 0 on success, -EINVAL if @arg is NULL or not a known value.
 */
static int __init fw_devlink_setup(char *arg)
{
        if (!arg)
                return -EINVAL;

        if (strcmp(arg, "off") == 0)
                fw_devlink_flags = 0;
        else if (strcmp(arg, "permissive") == 0)
                fw_devlink_flags = FW_DEVLINK_FLAGS_PERMISSIVE;
        else if (strcmp(arg, "on") == 0)
                fw_devlink_flags = FW_DEVLINK_FLAGS_ON;
        else if (strcmp(arg, "rpm") == 0)
                fw_devlink_flags = FW_DEVLINK_FLAGS_RPM;
        else
                return -EINVAL;

        return 0;
}
early_param("fw_devlink", fw_devlink_setup);

/* Set from the "fw_devlink.strict" early parameter; see fw_devlink_is_strict(). */
static bool fw_devlink_strict;
/* Parse "fw_devlink.strict" as a boolean; returns the kstrtobool() result. */
static int __init fw_devlink_strict_setup(char *arg)
{
        return kstrtobool(arg, &fw_devlink_strict);
}
early_param("fw_devlink.strict", fw_devlink_strict_setup);

/*
 * Values of fw_devlink_sync_state, consumed by fw_devlink_dev_sync_state():
 *   STRICT  - only warn about suppliers whose sync_state() is still pending
 *   TIMEOUT - force sync_state() on those suppliers
 */
#define FW_DEVLINK_SYNC_STATE_STRICT        0
#define FW_DEVLINK_SYNC_STATE_TIMEOUT        1

/*
 * Default mode: TIMEOUT when CONFIG_FW_DEVLINK_SYNC_STATE_TIMEOUT is defined,
 * otherwise zero-initialised, i.e. STRICT.
 */
#ifndef CONFIG_FW_DEVLINK_SYNC_STATE_TIMEOUT
static int fw_devlink_sync_state;
#else
static int fw_devlink_sync_state = FW_DEVLINK_SYNC_STATE_TIMEOUT;
#endif

/*
 * Parse the "fw_devlink.sync_state" early parameter ("strict" or "timeout").
 *
 * Return: 0 on success, -EINVAL if @arg is NULL or not a known value.
 */
static int __init fw_devlink_sync_state_setup(char *arg)
{
        int mode;

        if (!arg)
                return -EINVAL;

        if (strcmp(arg, "strict") == 0)
                mode = FW_DEVLINK_SYNC_STATE_STRICT;
        else if (strcmp(arg, "timeout") == 0)
                mode = FW_DEVLINK_SYNC_STATE_TIMEOUT;
        else
                return -EINVAL;

        fw_devlink_sync_state = mode;
        return 0;
}
early_param("fw_devlink.sync_state", fw_devlink_sync_state_setup);

/*
 * Pick the device link flags for a fwnode link: links marked
 * FWLINK_FLAG_CYCLE always get the permissive flags plus DL_FLAG_CYCLE, all
 * others get the globally configured fw_devlink_flags.
 */
static inline u32 fw_devlink_get_flags(u8 fwlink_flags)
{
        const bool in_cycle = fwlink_flags & FWLINK_FLAG_CYCLE;

        return in_cycle ? (FW_DEVLINK_FLAGS_PERMISSIVE | DL_FLAG_CYCLE)
                        : fw_devlink_flags;
}

/* True when the configured flags are exactly FW_DEVLINK_FLAGS_PERMISSIVE. */
static bool fw_devlink_is_permissive(void)
{
        return fw_devlink_flags == FW_DEVLINK_FLAGS_PERMISSIVE;
}

bool fw_devlink_is_strict(void)
{
        return fw_devlink_strict && !fw_devlink_is_permissive();
}

/*
 * Invoke the fwnode's add_links operation at most once per fwnode;
 * FWNODE_FLAG_LINKS_ADDED records that it has been done.
 */
static void fw_devlink_parse_fwnode(struct fwnode_handle *fwnode)
{
        if (!(fwnode->flags & FWNODE_FLAG_LINKS_ADDED)) {
                fwnode_call_int_op(fwnode, add_links);
                fwnode->flags |= FWNODE_FLAG_LINKS_ADDED;
        }
}

/* Parse @fwnode and then, recursively, each of its available child nodes. */
static void fw_devlink_parse_fwtree(struct fwnode_handle *fwnode)
{
        struct fwnode_handle *child;

        fw_devlink_parse_fwnode(fwnode);

        for (child = fwnode_get_next_available_child_node(fwnode, NULL);
             child;
             child = fwnode_get_next_available_child_node(fwnode, child))
                fw_devlink_parse_fwtree(child);
}

/*
 * Downgrade an inferred device link to a managed, permissive
 * (SYNC_STATE_ONLY) one. Links that were not inferred, or that are already
 * sync-state-only, are left as they are.
 */
static void fw_devlink_relax_link(struct device_link *link)
{
        if (!(link->flags & DL_FLAG_INFERRED) ||
            device_link_flag_is_sync_state_only(link->flags))
                return;

        /* Drop the runtime PM side of the link before rewriting its flags. */
        pm_runtime_drop_link(link);
        link->flags = DL_FLAG_MANAGED | FW_DEVLINK_FLAGS_PERMISSIVE;
        dev_dbg(link->consumer, "Relaxing link with %s\n",
                dev_name(link->supplier));
}

/*
 * class_for_each_device() callback: relax the link if its supplier has
 * can_match unset. Always returns 0.
 */
static int fw_devlink_no_driver(struct device *dev, void *data)
{
        struct device_link *link = to_devlink(dev);

        if (link->supplier->can_match)
                return 0;

        fw_devlink_relax_link(link);
        return 0;
}

/*
 * Record that driver registration is done (fw_devlink_drv_reg_done) and, under
 * the device links write lock, run fw_devlink_no_driver() on every device of
 * devlink_class so that links to suppliers with can_match unset get relaxed.
 */
void fw_devlink_drivers_done(void)
{
        fw_devlink_drv_reg_done = true;
        device_links_write_lock();
        class_for_each_device(&devlink_class, NULL, NULL,
                              fw_devlink_no_driver);
        device_links_write_unlock();
}

/*
 * class_for_each_device() callback used by fw_devlink_probing_done().
 *
 * @data is the list head on which suppliers are queued for a forced
 * sync_state(). For a managed, non-active link whose supplier has a
 * sync_state() that is not yet marked as done, either warn that the call is
 * still pending (strict mode) or queue the supplier (timeout mode).
 * Always returns 0.
 */
static int fw_devlink_dev_sync_state(struct device *dev, void *data)
{
        struct device_link *link = to_devlink(dev);
        struct device *sup = link->supplier;
        struct list_head *sync_list = data;

        if (!(link->flags & DL_FLAG_MANAGED))
                return 0;
        if (link->status == DL_STATE_ACTIVE)
                return 0;
        if (sup->state_synced || !dev_has_sync_state(sup))
                return 0;

        if (fw_devlink_sync_state == FW_DEVLINK_SYNC_STATE_STRICT) {
                dev_warn(sup, "sync_state() pending due to %s\n",
                         dev_name(link->consumer));
                return 0;
        }

        /* Only queue suppliers that are not already on some sync list. */
        if (list_empty(&sup->links.defer_sync)) {
                dev_warn(sup, "Timed out. Forcing sync_state()\n");
                sup->state_synced = true;
                get_device(sup);
                list_add_tail(&sup->links.defer_sync, sync_list);
        }

        return 0;
}

void fw_devlink_probing_done(void)
{
        LIST_HEAD(sync_list);

        device_links_write_lock();
        class_for_each_device(&devlink_class, NULL, &sync_list,
                              fw_devlink_dev_sync_state);
        device_links_write_unlock();
        device_links_flush_sync_list(&sync_list, NULL);
}

/**
 * wait_for_init_devices_probe - Try to probe any device needed for init
 *
 * Some devices might need to be probed and bound successfully before the kernel
 * boot sequence can finish and move on to init/userspace. For example, a
 * network interface might need to be bound to be able to mount a NFS rootfs.
 *
 * With fw_devlink=on by default, some of these devices might be blocked from
 * probing because they are waiting on an optional supplier that doesn't have a
 * driver. While fw_devlink will eventually identify such devices and unblock
 * the probing automatically, it might be too late by the time it unblocks the
 * probing of devices. For example, the IPv4 autoconfig might time out before
 * fw_devlink unblocks probing of the network interface.
 *
 * This function is available to temporarily try and probe all devices that have
 * a driver even if some of their suppliers haven't been added or don't have
 * drivers.
 *
 * The drivers can then decide which of the suppliers are optional vs mandatory
 * and probe the device if possible. By the time this function returns, all such
 * "best effort" probes are guaranteed to be completed. If a device successfully
 * probes in this mode, we delete all fw_devlink discovered dependencies of that
 * device where the supplier hasn't yet probed successfully because they have to
 * be optional dependencies.
 *
 * Any devices that didn't successfully probe go back to being treated as if
 * this function was never called.
 *
 * This also means that some devices that aren't needed for init and could have
 * waited for their optional supplier to probe (when the supplier's module is
 * loaded later on) would end up probing prematurely with limited functionality.
 * So call this function only when boot would fail without it.
 */
void __init wait_for_init_devices_probe(void)
{
        /* Nothing to do when fw_devlink is off or already permissive. */
        if (!fw_devlink_flags)
                return;
        if (fw_devlink_is_permissive())
                return;

        /*
         * Let every probe that is already in flight finish first, so that
         * "best effort" only applies to devices that cannot probe otherwise.
         */
        wait_for_device_probe();

        pr_info("Trying to probe devices needed for running init ...\n");
        fw_devlink_best_effort = true;
        driver_deferred_probe_trigger();

        /*
         * All "best effort" probes must be complete before normal
         * enforcement is restored.
         */
        wait_for_device_probe();
        fw_devlink_best_effort = false;
}

static void fw_devlink_unblock_consumers(struct device *dev)
{
        struct device_link *link;

        if (!fw_devlink_flags || fw_devlink_is_permissive())
                return;

        device_links_write_lock();
        list_for_each_entry(link, &dev->links.consumers, s_node)
                fw_devlink_relax_link(link);
        device_links_write_unlock();
}

/*
 * Evaluates to get_device() applied to the fwnode's ->dev pointer; callers in
 * this file balance it with put_device().
 */
#define get_dev_from_fwnode(fwnode)        get_device((fwnode)->dev)

static bool fwnode_init_without_drv(struct fwnode_handle *fwnode)
{
        struct device *dev;
        bool ret;

        if (!(fwnode->flags & FWNODE_FLAG_INITIALIZED))
                return false;

        dev = get_dev_from_fwnode(fwnode);
        ret = !dev || dev->links.status == DL_DEV_NO_DRIVER;
        put_device(dev);

        return ret;
}

static bool fwnode_ancestor_init_without_drv(struct fwnode_handle *fwnode)
{
        struct fwnode_handle *parent;

        fwnode_for_each_parent_node(fwnode, parent) {
                if (fwnode_init_without_drv(parent)) {
                        fwnode_handle_put(parent);
                        return true;
                }
        }

        return false;
}

/**
 * fwnode_is_ancestor_of - Test if @ancestor is ancestor of @child
 * @ancestor: Firmware which is tested for being an ancestor
 * @child: Firmware which is tested for being the child
 *
 * A node is considered an ancestor of itself too.
 *
 * Return: true if @ancestor is an ancestor of @child. Otherwise, returns false.
 */
static bool fwnode_is_ancestor_of(const struct fwnode_handle *ancestor,
                                  const struct fwnode_handle *child)
{
        struct fwnode_handle *parent;
        bool found = false;

        if (IS_ERR_OR_NULL(ancestor))
                return false;

        /* A node counts as its own ancestor. */
        if (child == ancestor)
                return true;

        fwnode_for_each_parent_node(child, parent) {
                if (parent != ancestor)
                        continue;

                /* Leaving the walk early: release the current parent. */
                fwnode_handle_put(parent);
                found = true;
                break;
        }

        return found;
}

/**
 * fwnode_get_next_parent_dev - Find device of closest ancestor fwnode
 * @fwnode: firmware node
 *
 * Given a firmware node (@fwnode), this function finds its closest ancestor
 * firmware node that has a corresponding struct device and returns that struct
 * device.
 *
 * The caller is responsible for calling put_device() on the returned device
 * pointer.
 *
 * Return: a pointer to the device of the @fwnode's closest ancestor.
 */
static struct device *fwnode_get_next_parent_dev(const struct fwnode_handle *fwnode)
{
        struct fwnode_handle *parent;
        struct device *dev;

        fwnode_for_each_parent_node(fwnode, parent) {
                dev = get_dev_from_fwnode(parent);
                if (dev) {
                        fwnode_handle_put(parent);
                        return dev;
                }
        }
        return NULL;
}

/**
 * __fw_devlink_relax_cycles - Relax and mark dependency cycles.
 * @con_handle: Potential consumer device fwnode.
 * @sup_handle: Potential supplier's fwnode.
 *
 * Needs to be called with fwnode_lock and device link lock held.
 *
 * Check if @sup_handle or any of its ancestors or suppliers direct/indirectly
 * depend on @con. This function can detect multiple cycles between @sup_handle
 * and @con. When such dependency cycles are found, convert all device links
 * created solely by fw_devlink into SYNC_STATE_ONLY device links. Also, mark
 * all fwnode links in the cycle with FWLINK_FLAG_CYCLE so that when they are
 * converted into a device link in the future, they are created as
 * SYNC_STATE_ONLY device links. This is the equivalent of doing
 * fw_devlink=permissive just between the devices in the cycle. We need to do
 * this because, at this point, fw_devlink can't tell which of these
 * dependencies is not a real dependency.
 *
 * Return true if one or more cycles were found. Otherwise, return false.
 */
static bool __fw_devlink_relax_cycles(struct fwnode_handle *con_handle,
                                 struct fwnode_handle *sup_handle)
{
        struct device *sup_dev = NULL, *par_dev = NULL, *con_dev = NULL;
        struct fwnode_link *link;
        struct device_link *dev_link;
        bool ret = false;

        if (!sup_handle)
                return false;

        /*
         * We aren't trying to find all cycles. Just a cycle between con and
         * sup_handle.
         */
        if (sup_handle->flags & FWNODE_FLAG_VISITED)
                return false;

        sup_handle->flags |= FWNODE_FLAG_VISITED;

        /* Termination condition. */
        if (sup_handle == con_handle) {
                pr_debug("----- cycle: start -----\n");
                ret = true;
                goto out;
        }

        sup_dev = get_dev_from_fwnode(sup_handle);
        con_dev = get_dev_from_fwnode(con_handle);
        /*
         * If sup_dev is bound to a driver and @con hasn't started binding to a
         * driver, sup_dev can't be a consumer of @con. So, no need to check
         * further.
         */
        if (sup_dev && sup_dev->links.status ==  DL_DEV_DRIVER_BOUND &&
            con_dev && con_dev->links.status == DL_DEV_NO_DRIVER) {
                ret = false;
                goto out;
        }

        list_for_each_entry(link, &sup_handle->suppliers, c_hook) {
                if (link->flags & FWLINK_FLAG_IGNORE)
                        continue;

                if (__fw_devlink_relax_cycles(con_handle, link->supplier)) {
                        __fwnode_link_cycle(link);
                        ret = true;
                }
        }

        /*
         * Give priority to device parent over fwnode parent to account for any
         * quirks in how fwnodes are converted to devices.
         */
        if (sup_dev)
                par_dev = get_device(sup_dev->parent);
        else
                par_dev = fwnode_get_next_parent_dev(sup_handle);

        if (par_dev && __fw_devlink_relax_cycles(con_handle, par_dev->fwnode)) {
                pr_debug("%pfwf: cycle: child of %pfwf\n", sup_handle,
                         par_dev->fwnode);
                ret = true;
        }

        if (!sup_dev)
                goto out;

        list_for_each_entry(dev_link, &sup_dev->links.suppliers, c_node) {
                /*
                 * Ignore a SYNC_STATE_ONLY flag only if it wasn't marked as
                 * such due to a cycle.
                 */
                if (device_link_flag_is_sync_state_only(dev_link->flags) &&
                    !(dev_link->flags & DL_FLAG_CYCLE))
                        continue;

                if (__fw_devlink_relax_cycles(con_handle,
                                              dev_link->supplier->fwnode)) {
                        pr_debug("%pfwf: cycle: depends on %pfwf\n", sup_handle,
                                 dev_link->supplier->fwnode);
                        fw_devlink_relax_link(dev_link);
                        dev_link->flags |= DL_FLAG_CYCLE;
                        ret = true;
                }
        }

out:
        sup_handle->flags &= ~FWNODE_FLAG_VISITED;
        put_device(sup_dev);
        put_device(con_dev);
        put_device(par_dev);
        return ret;
}

/**
 * fw_devlink_create_devlink - Create a device link from a consumer to fwnode
 * @con: consumer device for the device link
 * @sup_handle: fwnode handle of supplier
 * @link: fwnode link that's being converted to a device link
 *
 * This function will try to create a device link between the consumer device
 * @con and the supplier device represented by @sup_handle.
 *
 * The supplier has to be provided as a fwnode because incorrect cycles in
 * fwnode links can sometimes cause the supplier device to never be created.
 * This function detects such cases and returns an error if it cannot create a
 * device link from the consumer to a missing supplier.
 *
 * A fwnode link flagged FWLINK_FLAG_IGNORE is skipped and reported as success.
 *
 * Returns,
 * 0 on successfully creating a device link
 * -EINVAL if the device link cannot be created as expected
 * -EAGAIN if the device link cannot be created right now, but it may be
 *  possible to do that in the future
 */
static int fw_devlink_create_devlink(struct device *con,
                                     struct fwnode_handle *sup_handle,
                                     struct fwnode_link *link)
{
        struct device *sup_dev;
        int ret = 0;
        u32 flags;

        if (link->flags & FWLINK_FLAG_IGNORE)
                return 0;

        /*
         * In some cases, a device P might also be a supplier to its child node
         * C. However, this would defer the probe of C until the probe of P
         * completes successfully. This is perfectly fine in the device driver
         * model. device_add() doesn't guarantee probe completion of the device
         * by the time it returns.
         *
         * However, there are a few drivers that assume C will finish probing
         * as soon as it's added and before P finishes probing. So, we provide
         * a flag to let fw_devlink know not to delay the probe of C until the
         * probe of P completes successfully.
         *
         * When such a flag is set, we can't create device links where P is the
         * supplier of C as that would delay the probe of C.
         *
         * sup_dev is not assigned yet, so return directly instead of going
         * through the out label.
         */
        if (sup_handle->flags & FWNODE_FLAG_NEEDS_CHILD_BOUND_ON_ADD &&
            fwnode_is_ancestor_of(sup_handle, con->fwnode))
                return -EINVAL;

        /*
         * Don't try to optimize by not calling the cycle detection logic under
         * certain conditions. There's always some corner case that won't get
         * detected.
         */
        device_links_write_lock();
        if (__fw_devlink_relax_cycles(link->consumer, sup_handle)) {
                __fwnode_link_cycle(link);
                pr_debug("----- cycle: end -----\n");
                pr_info("%pfwf: Fixed dependency cycle(s) with %pfwf\n",
                        link->consumer, sup_handle);
        }
        device_links_write_unlock();

        /*
         * Use the flags derived from the fwnode link only when @con is the
         * link's own consumer; otherwise fall back to the permissive flags.
         */
        if (con->fwnode == link->consumer)
                flags = fw_devlink_get_flags(link->flags);
        else
                flags = FW_DEVLINK_FLAGS_PERMISSIVE;

        /*
         * A supplier fwnode flagged as not being a device is represented by
         * its closest ancestor that has a struct device.
         */
        if (sup_handle->flags & FWNODE_FLAG_NOT_DEVICE)
                sup_dev = fwnode_get_next_parent_dev(sup_handle);
        else
                sup_dev = get_dev_from_fwnode(sup_handle);

        if (sup_dev) {
                /*
                 * If it's one of those drivers that don't actually bind to
                 * their device using driver core, then don't wait on this
                 * supplier device indefinitely.
                 */
                if (sup_dev->links.status == DL_DEV_NO_DRIVER &&
                    sup_handle->flags & FWNODE_FLAG_INITIALIZED) {
                        dev_dbg(con,
                                "Not linking %pfwf - dev might never probe\n",
                                sup_handle);
                        ret = -EINVAL;
                        goto out;
                }

                /* A device is never linked to itself; that case leaves ret at 0. */
                if (con != sup_dev && !device_link_add(con, sup_dev, flags)) {
                        dev_err(con, "Failed to create device link (0x%x) with supplier %s for %pfwf\n",
                                flags, dev_name(sup_dev), link->consumer);
                        ret = -EINVAL;
                }

                goto out;
        }

        /*
         * Supplier or supplier's ancestor already initialized without a struct
         * device or being probed by a driver.
         *
         * sup_dev is NULL on this path, so there is no reference to drop.
         */
        if (fwnode_init_without_drv(sup_handle) ||
            fwnode_ancestor_init_without_drv(sup_handle)) {
                dev_dbg(con, "Not linking %pfwf - might never become dev\n",
                        sup_handle);
                return -EINVAL;
        }

        /* No supplier device yet, but one may still show up later. */
        ret = -EAGAIN;
out:
        put_device(sup_dev);
        return ret;
}

/**
 * __fw_devlink_link_to_consumers - Create device links to consumers of a device
 * @dev: Device that needs to be linked to its consumers
 *
 * This function looks at all the consumer fwnodes of @dev and creates device
 * links between the consumer device and @dev (supplier).
 *
 * If the consumer device has not been added yet, then this function creates a
 * SYNC_STATE_ONLY link between @dev (supplier) and the closest ancestor device
 * of the consumer fwnode. This is necessary to make sure @dev doesn't get a
 * sync_state() callback before the real consumer device gets to be added and
 * then probed.
 *
 * Once device links are created from the real consumer to @dev (supplier), the
 * fwnode links are deleted.
 */
static void __fw_devlink_link_to_consumers(struct device *dev)
{
        struct fwnode_handle *sup_fwnode = dev->fwnode;
        struct fwnode_link *cur, *next;

        list_for_each_entry_safe(cur, next, &sup_fwnode->consumers, s_hook) {
                struct device *consumer = get_dev_from_fwnode(cur->consumer);
                bool is_proxy = false;
                int err;

                /*
                 * If consumer device is not available yet, make a "proxy"
                 * SYNC_STATE_ONLY link from the consumer's parent device to
                 * the supplier device. This is necessary to make sure the
                 * supplier doesn't get a sync_state() callback before the real
                 * consumer can create a device link to the supplier.
                 *
                 * This proxy link step is needed to handle the case where the
                 * consumer's parent device is added before the supplier.
                 */
                if (!consumer) {
                        consumer = fwnode_get_next_parent_dev(cur->consumer);
                        if (!consumer)
                                continue;

                        /*
                         * However, if the consumer's parent device is also the
                         * parent of the supplier, don't create a
                         * consumer-supplier link from the parent to its child
                         * device. Such a dependency is impossible.
                         */
                        if (fwnode_is_ancestor_of(consumer->fwnode, sup_fwnode)) {
                                put_device(consumer);
                                continue;
                        }

                        is_proxy = true;
                }

                err = fw_devlink_create_devlink(consumer, sup_fwnode, cur);
                put_device(consumer);

                /*
                 * Keep the fwnode link when it was only used for a proxy link
                 * or when the device link may still be created later.
                 */
                if (!is_proxy && err != -EAGAIN)
                        __fwnode_link_del(cur);
        }
}

/**
 * __fw_devlink_link_to_suppliers - Create device links to suppliers of a device
 * @dev: The consumer device that needs to be linked to its suppliers
 * @fwnode: Root of the fwnode tree that is used to create device links
 *
 * This function looks at all the supplier fwnodes of fwnode tree rooted at
 * @fwnode and creates device links between @dev (consumer) and all the
 * supplier devices of the entire fwnode tree at @fwnode.
 *
 * The function creates normal (non-SYNC_STATE_ONLY) device links between @dev
 * and the real suppliers of @dev. Once these device links are created, the
 * fwnode links are deleted.
 *
 * In addition, it also looks at all the suppliers of the entire fwnode tree
 * because some of the child devices of @dev that have not been added yet
 * (because @dev hasn't probed) might already have their suppliers added to
 * driver core. So, this function creates SYNC_STATE_ONLY device links between
 * @dev (consumer) and these suppliers to make sure they don't execute their
 * sync_state() callbacks before these child devices have a chance to create
 * their device links. The fwnode links that correspond to the child devices
 * aren't deleted because they are needed later to create the device links
 * between the real consumer and supplier devices.
 */
static void __fw_devlink_link_to_suppliers(struct device *dev,
                                           struct fwnode_handle *fwnode)
{
        const bool is_own = (dev->fwnode == fwnode);
        struct fwnode_link *cur, *next;
        struct fwnode_handle *child;

        list_for_each_entry_safe(cur, next, &fwnode->suppliers, c_hook) {
                int err = fw_devlink_create_devlink(dev, cur->supplier, cur);

                /*
                 * Only links that belong to @dev itself are consumed, and
                 * only once no retry is pending.
                 */
                if (is_own && err != -EAGAIN)
                        __fwnode_link_del(cur);
        }

        /*
         * Make "proxy" SYNC_STATE_ONLY device links to represent the needs of
         * all the descendants. This proxy link step is needed to handle the
         * case where the supplier is added before the consumer's parent device
         * (@dev).
         */
        for (child = fwnode_get_next_available_child_node(fwnode, NULL);
             child;
             child = fwnode_get_next_available_child_node(fwnode, child))
                __fw_devlink_link_to_suppliers(dev, child);
}

/*
 * Parse the firmware tree of @dev and then, under fwnode_link_lock, create
 * device links first towards its consumers and then towards its suppliers.
 * Does nothing when fw_devlink_flags is zero.
 */
static void fw_devlink_link_device(struct device *dev)
{
        if (fw_devlink_flags == 0)
                return;

        fw_devlink_parse_fwtree(dev->fwnode);

        /* Held until the function returns. */
        guard(mutex)(&fwnode_link_lock);

        __fw_devlink_link_to_consumers(dev);
        __fw_devlink_link_to_suppliers(dev, dev->fwnode);
}

/* Device links support end. */

/*
 * NOTE(review): presumably the kobject for /sys/dev, the parent of the two
 * kobjects below - confirm at the site where it is initialized.
 */
static struct kobject *dev_kobj;

/* /sys/dev/char */
static struct kobject *sysfs_dev_char_kobj;

/* /sys/dev/block */
static struct kobject *sysfs_dev_block_kobj;

/*
 * Taken by lock_device_hotplug() and, via trylock, by
 * lock_device_hotplug_sysfs(); released by unlock_device_hotplug().
 */
static DEFINE_MUTEX(device_hotplug_lock);

/* Acquire device_hotplug_lock with mutex_lock(). */
void lock_device_hotplug(void)
{
        mutex_lock(&device_hotplug_lock);
}

/* Release device_hotplug_lock. */
void unlock_device_hotplug(void)
{
        mutex_unlock(&device_hotplug_lock);
}

/*
 * Try to take device_hotplug_lock without blocking on it.
 *
 * Returns 0 with the lock held. If the lock is contended, sleeps briefly and
 * returns the value of restart_syscall() without holding the lock.
 */
int lock_device_hotplug_sysfs(void)
{
        if (!mutex_trylock(&device_hotplug_lock)) {
                /* Avoid busy looping (5 ms of sleep should do). */
                msleep(5);
                return restart_syscall();
        }

        return 0;
}

/*
 * Returns non-zero unless @dev's type is part_type. Without CONFIG_BLOCK the
 * answer is always 1.
 */
#ifndef CONFIG_BLOCK
static inline int device_is_not_partition(struct device *dev)
{
        return 1;
}
#else
static inline int device_is_not_partition(struct device *dev)
{
        return dev->type != &part_type;
}
#endif

/* Invoke the ACPI and software-node notify hooks for @dev, in that order. */
static void device_platform_notify(struct device *dev)
{
        acpi_device_notify(dev);

        software_node_notify(dev);
}

/*
 * Invoke the removal hooks for @dev: software node first, then ACPI, i.e. the
 * reverse of the order used by device_platform_notify().
 */
static void device_platform_notify_remove(struct device *dev)
{
        software_node_notify_remove(dev);

        acpi_device_notify_remove(dev);
}

/**
 * dev_driver_string - Return a device's driver name, if at all possible
 * @dev: struct device to get the name of
 *
 * Will return the device's driver's name if it is bound to a device.  If
 * the device is not bound to a driver, it will return the name of the bus
 * it is attached to.  If it is not attached to a bus either, an empty
 * string will be returned.
 */
const char *dev_driver_string(const struct device *dev)
{
        /*
         * dev->driver can change to NULL underneath us because of unbinding,
         * so take a single snapshot of it.  dev->bus and dev->class should
         * never change once they are set, so they don't need special care.
         */
        struct device_driver *bound = READ_ONCE(dev->driver);

        if (bound)
                return bound->name;

        return dev_bus_name(dev);
}
EXPORT_SYMBOL(dev_driver_string);

/* Map an embedded struct attribute back to its enclosing struct device_attribute. */
#define to_dev_attr(_attr) container_of(_attr, struct device_attribute, attr)

/*
 * sysfs show entry point for device attributes: forwards to the attribute's
 * own show() callback. Returns -EIO when the attribute has no show()
 * callback. A result of PAGE_SIZE or more is logged but still returned.
 */
static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr,
                             char *buf)
{
        struct device_attribute *da = to_dev_attr(attr);
        ssize_t len;

        if (da->show)
                len = da->show(kobj_to_dev(kobj), da, buf);
        else
                len = -EIO;

        if (len >= (ssize_t)PAGE_SIZE) {
                printk("dev_attr_show: %pS returned bad count\n",
                                da->show);
        }

        return len;
}

/*
 * sysfs store entry point for device attributes: forwards to the attribute's
 * own store() callback, or returns -EIO when there is none.
 */
static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr,
                              const char *buf, size_t count)
{
        struct device_attribute *da = to_dev_attr(attr);

        if (!da->store)
                return -EIO;

        return da->store(kobj_to_dev(kobj), da, buf, count);
}

/*
 * sysfs operations for device kobjects: reads go through dev_attr_show() and
 * writes through dev_attr_store().
 */
static const struct sysfs_ops dev_sysfs_ops = {
        .show        = dev_attr_show,
        .store        = dev_attr_store,
};

/* Map an embedded struct device_attribute back to its enclosing struct dev_ext_attribute. */
#define to_ext_attr(x) container_of(x, struct dev_ext_attribute, attr)

/*
 * Parse @buf with kstrtoul() (base auto-detected) and store the result in the
 * unsigned long that the extended attribute's ->var points to.
 *
 * Returns @size on success or the kstrtoul() error code.
 */
ssize_t device_store_ulong(struct device *dev,
                           struct device_attribute *attr,
                           const char *buf, size_t size)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        unsigned long parsed;
        int err;

        err = kstrtoul(buf, 0, &parsed);
        if (err)
                return err;

        *(unsigned long *)(ea->var) = parsed;

        /* Always return full write size even if we didn't consume all */
        return size;
}
EXPORT_SYMBOL_GPL(device_store_ulong);

/*
 * Emit the unsigned long that the extended attribute's ->var points to, in
 * hexadecimal (no "0x" prefix), followed by a newline.
 */
ssize_t device_show_ulong(struct device *dev,
                          struct device_attribute *attr,
                          char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        unsigned long value = *(unsigned long *)(ea->var);

        return sysfs_emit(buf, "%lx\n", value);
}
EXPORT_SYMBOL_GPL(device_show_ulong);

/*
 * Parse @buf with kstrtol() (base auto-detected) and store the result in the
 * int that the extended attribute's ->var points to.
 *
 * Returns @size on success, the kstrtol() error code on a parse failure, or
 * -EINVAL when the value does not fit in an int.
 */
ssize_t device_store_int(struct device *dev,
                         struct device_attribute *attr,
                         const char *buf, size_t size)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        long parsed;
        int err;

        err = kstrtol(buf, 0, &parsed);
        if (err)
                return err;

        if (parsed < INT_MIN || parsed > INT_MAX)
                return -EINVAL;

        *(int *)(ea->var) = parsed;

        /* Always return full write size even if we didn't consume all */
        return size;
}
EXPORT_SYMBOL_GPL(device_store_int);

/*
 * Emit the int that the extended attribute's ->var points to, in decimal,
 * followed by a newline.
 */
ssize_t device_show_int(struct device *dev,
                        struct device_attribute *attr,
                        char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        int value = *(int *)(ea->var);

        return sysfs_emit(buf, "%d\n", value);
}
EXPORT_SYMBOL_GPL(device_show_int);

/*
 * Parse @buf with kstrtobool() directly into the location that the extended
 * attribute's ->var points to.
 *
 * Returns @size on success or -EINVAL when parsing fails.
 */
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
                          const char *buf, size_t size)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        int err = kstrtobool(buf, ea->var);

        if (err >= 0)
                return size;

        return -EINVAL;
}
EXPORT_SYMBOL_GPL(device_store_bool);

ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
                         char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);

        return sysfs_emit(buf, "%d\n", *(bool *)(ea->var));
}
EXPORT_SYMBOL_GPL(device_show_bool);

/*
 * Emit the string that the extended attribute's ->var points to, followed by
 * a newline.
 */
ssize_t device_show_string(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        struct dev_ext_attribute *ea = to_ext_attr(attr);
        char *text = (char *)ea->var;

        return sysfs_emit(buf, "%s\n", text);
}
EXPORT_SYMBOL_GPL(device_show_string);

/**
 * device_release - free device structure.
 * @kobj: device's kobject.
 *
 * This is called once the reference count for the object
 * reaches 0. We forward the call to the device's release
 * method, which should handle actually freeing the structure.
 *
 * The release method is looked up in this order: the device's own
 * ->release, then its type's ->release, then its class's ->dev_release.
 * If none is set, a warning is emitted instead.
 */
static void device_release(struct kobject *kobj)
{
        struct device *dev = kobj_to_dev(kobj);
        /*
         * Read dev->p up front: the release callback below is expected to
         * free the structure, so @dev is not touched again after it runs.
         */
        struct device_private *p = dev->p;

        /*
         * Some platform devices are driven without driver attached
         * and managed resources may have been acquired.  Make sure
         * all resources are released.
         *
         * Drivers still can add resources into device after device
         * is deleted but alive, so release devres here to avoid
         * possible memory leak.
         */
        devres_release_all(dev);

        kfree(dev->dma_range_map);

        if (dev->release)
                dev->release(dev);
        else if (dev->type && dev->type->release)
                dev->type->release(dev);
        else if (dev->class && dev->class->dev_release)
                dev->class->dev_release(dev);
        else
                WARN(1, KERN_ERR "Device '%s' does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                        dev_name(dev));
        /* Free the private data using the pointer saved before release. */
        kfree(p);
}

/*
 * Return the namespace reported by the device's class, or NULL when the
 * device has no class or the class has no namespace() callback.
 */
static const void *device_namespace(const struct kobject *kobj)
{
        const struct device *dev = kobj_to_dev(kobj);

        if (!dev->class || !dev->class->namespace)
                return NULL;

        return dev->class->namespace(dev);
}

/*
 * Let the device's class fill in @uid and @gid through its get_ownership()
 * callback. Both are left untouched when there is no class or no callback.
 */
static void device_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct device *dev = kobj_to_dev(kobj);

        if (!dev->class || !dev->class->get_ownership)
                return;

        dev->class->get_ownership(dev, uid, gid);
}

/*
 * kobj_type for device kobjects, wiring up the release, sysfs, namespace and
 * ownership callbacks defined above. dev_uevent_filter() compares a kobject's
 * ktype against this object to recognise devices.
 */
static const struct kobj_type device_ktype = {
        .release        = device_release,
        .sysfs_ops        = &dev_sysfs_ops,
        .namespace        = device_namespace,
        .get_ownership        = device_get_ownership,
};


/*
 * uevent filter: returns 1 only for kobjects whose ktype is device_ktype and
 * whose device has a bus or a class; returns 0 for everything else.
 */
static int dev_uevent_filter(const struct kobject *kobj)
{
        const struct device *dev;

        if (get_ktype(kobj) != &device_ktype)
                return 0;

        dev = kobj_to_dev(kobj);

        return (dev->bus || dev->class) ? 1 : 0;
}

/*
 * Subsystem name for a device's uevents: the bus name if the device has a
 * bus, otherwise the class name, otherwise NULL.
 */
static const char *dev_uevent_name(const struct kobject *kobj)
{
        const struct device *dev = kobj_to_dev(kobj);

        if (dev->bus)
                return dev->bus->name;

        return dev->class ? dev->class->name : NULL;
}

/*
 * Try filling "DRIVER=<name>" uevent variable for a device. Because this
 * function may race with binding and unbinding the device from a driver,
 * we need to be careful. Binding is generally safe, at worst we miss the
 * fact that the device is already bound to a driver (but the driver
 * information that is delivered through uevents is best-effort, it may
 * become obsolete as soon as it is generated anyways). Unbinding is more
 * risky as driver pointer is transitioning to NULL, so READ_ONCE() should
 * be used to make sure we are dealing with the same pointer, and to
 * ensure that driver structure is not going to disappear from under us
 * we take bus' drivers klist lock. The assumption is that only a registered
 * driver can be bound to a device, and that to unregister a driver the bus
 * code will take the same lock.
 */
static void dev_driver_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
        struct subsys_private *sp = bus_to_subsys(dev->bus);

        /* No subsystem for this device's bus: nothing to report. */
        if (!sp)
                return;

        scoped_guard(spinlock, &sp->klist_drivers.k_lock) {
                struct device_driver *bound = READ_ONCE(dev->driver);

                if (bound)
                        add_uevent_var(env, "DRIVER=%s", bound->name);
        }

        subsys_put(sp);
}

/*
 * Fill @env with the uevent variables of the device behind @kobj: device node
 * properties (when the device has a major number), DEVTYPE, DRIVER, common DT
 * information, and finally whatever the bus, class and device type uevent()
 * callbacks add.
 *
 * The three callbacks are all invoked when present; the value returned is the
 * result of the last one that ran, or 0 if none ran.
 */
static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
{
        const struct device *dev = kobj_to_dev(kobj);
        int err = 0;

        /* add device node properties if present */
        if (MAJOR(dev->devt)) {
                kgid_t gid = GLOBAL_ROOT_GID;
                kuid_t uid = GLOBAL_ROOT_UID;
                umode_t mode = 0;
                const char *to_free;
                const char *devnode;

                add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt));
                add_uevent_var(env, "MINOR=%u", MINOR(dev->devt));

                devnode = device_get_devnode(dev, &mode, &uid, &gid, &to_free);
                if (devnode) {
                        add_uevent_var(env, "DEVNAME=%s", devnode);
                        if (mode)
                                add_uevent_var(env, "DEVMODE=%#o", mode & 0777);
                        if (!uid_eq(uid, GLOBAL_ROOT_UID))
                                add_uevent_var(env, "DEVUID=%u",
                                               from_kuid(&init_user_ns, uid));
                        if (!gid_eq(gid, GLOBAL_ROOT_GID))
                                add_uevent_var(env, "DEVGID=%u",
                                               from_kgid(&init_user_ns, gid));
                        kfree(to_free);
                }
        }

        if (dev->type && dev->type->name)
                add_uevent_var(env, "DEVTYPE=%s", dev->type->name);

        /* Add "DRIVER=%s" variable if the device is bound to a driver */
        dev_driver_uevent(dev, env);

        /* Add common DT information about the device */
        of_device_uevent(dev, env);

        /* have the bus specific function add its stuff */
        if (dev->bus && dev->bus->uevent) {
                err = dev->bus->uevent(dev, env);
                if (err)
                        pr_debug("device: '%s': %s: bus uevent() returned %d\n",
                                 dev_name(dev), __func__, err);
        }

        /* have the class specific function add its stuff */
        if (dev->class && dev->class->dev_uevent) {
                err = dev->class->dev_uevent(dev, env);
                if (err)
                        pr_debug("device: '%s': %s: class uevent() returned %d\n",
                                 dev_name(dev), __func__, err);
        }

        /* have the device type specific function add its stuff */
        if (dev->type && dev->type->uevent) {
                err = dev->type->uevent(dev, env);
                if (err)
                        pr_debug("device: '%s': %s: dev_type uevent() returned %d\n",
                                 dev_name(dev), __func__, err);
        }

        return err;
}

/*
 * kset uevent operations for devices: dev_uevent_filter() decides whether an
 * event is emitted, dev_uevent_name() supplies the subsystem name and
 * dev_uevent() adds the environment variables.
 */
static const struct kset_uevent_ops device_uevent_ops = {
        .filter =        dev_uevent_filter,
        .name =                dev_uevent_name,
        .uevent =        dev_uevent,
};

/*
 * Show the uevent environment of @dev, one "KEY=value" per line.
 *
 * Returns the number of bytes emitted. That is 0 when the device belongs to
 * no kset, the kset has no uevent() operation, the filter rejects the device,
 * or the uevent() operation itself fails. Returns -ENOMEM when the
 * environment buffer cannot be allocated.
 */
static ssize_t uevent_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        struct kobject *top = &dev->kobj;
        struct kobj_uevent_env *env;
        struct kset *kset;
        int len = 0;
        int i;

        /* search the kset, the device belongs to */
        while (!top->kset && top->parent)
                top = top->parent;

        kset = top->kset;
        if (!kset)
                return 0;

        if (!kset->uevent_ops || !kset->uevent_ops->uevent)
                return 0;

        /* respect filter */
        if (kset->uevent_ops->filter &&
            !kset->uevent_ops->filter(&dev->kobj))
                return 0;

        env = kzalloc(sizeof(*env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;

        /* let the kset specific function add its keys */
        if (!kset->uevent_ops->uevent(&dev->kobj, env)) {
                /* copy keys to file */
                for (i = 0; i < env->envp_idx; i++)
                        len += sysfs_emit_at(buf, len, "%s\n", env->envp[i]);
        }

        kfree(env);
        return len;
}

/*
 * Writing to the "uevent" attribute asks kobject_synth_uevent() to send a
 * synthetic uevent described by @buf. Returns @count on success; on failure
 * the error is logged and returned.
 */
static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        int err = kobject_synth_uevent(&dev->kobj, buf, count);

        if (!err)
                return count;

        dev_err(dev, "uevent: failed to send synthetic uevent: %d\n", err);
        return err;
}
static DEVICE_ATTR_RW(uevent);

/*
 * Report whether @dev is online: emits "1" when dev->offline is clear and "0"
 * when it is set. The flag is sampled with the device lock held.
 */
static ssize_t online_show(struct device *dev, struct device_attribute *attr,
                           char *buf)
{
        bool is_online;

        device_lock(dev);
        is_online = !dev->offline;
        device_unlock(dev);

        return sysfs_emit(buf, "%u\n", is_online);
}

/*
 * Bring @dev online or offline according to the boolean parsed from @buf.
 *
 * The operation runs with the device hotplug lock held, taken through
 * lock_device_hotplug_sysfs(); its return value is passed back unchanged when
 * the lock could not be taken. Returns @count on success, or the negative
 * error from parsing or from device_online()/device_offline().
 */
static ssize_t online_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        bool want_online;
        int err;

        err = kstrtobool(buf, &want_online);
        if (err < 0)
                return err;

        err = lock_device_hotplug_sysfs();
        if (err)
                return err;

        if (want_online)
                err = device_online(dev);
        else
                err = device_offline(dev);

        unlock_device_hotplug();

        if (err < 0)
                return err;

        return count;
}
static DEVICE_ATTR_RW(online);

/*
 * Emit "removable", "fixed" or "unknown" depending on dev->removable; any
 * value other than DEVICE_REMOVABLE and DEVICE_FIXED is reported as "unknown".
 */
static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        switch (dev->removable) {
        case DEVICE_REMOVABLE:
                return sysfs_emit(buf, "%s\n", "removable");
        case DEVICE_FIXED:
                return sysfs_emit(buf, "%s\n", "fixed");
        default:
                return sysfs_emit(buf, "%s\n", "unknown");
        }
}
static DEVICE_ATTR_RO(removable);

/*
 * Create the sysfs attribute groups in @groups on @dev's kobject.
 * Returns the result of sysfs_create_groups().
 */
int device_add_groups(struct device *dev, const struct attribute_group **groups)
{
        return sysfs_create_groups(&dev->kobj, groups);
}
EXPORT_SYMBOL_GPL(device_add_groups);

/* Remove the sysfs attribute groups in @groups from @dev's kobject. */
void device_remove_groups(struct device *dev,
                          const struct attribute_group **groups)
{
        sysfs_remove_groups(&dev->kobj, groups);
}
EXPORT_SYMBOL_GPL(device_remove_groups);

/*
 * devres payload for managed attribute groups. devm_device_add_group() stores
 * the group in @group and devm_attr_group_remove() reads it back.
 * NOTE(review): no user of @groups is visible in this part of the file -
 * confirm whether it is still needed.
 */
union device_attr_group_devres {
        const struct attribute_group *group;
        const struct attribute_group **groups;
};

static void devm_attr_group_remove(struct device *dev, void *res)
{
        union device_attr_group_devres *devres = res;
        const struct attribute_group *group = devres->group;

        dev_dbg(dev, "%s: removing group %p\n", __func__, group);
        sysfs_remove_group(&dev->kobj, group);
}

/**
 * devm_device_add_group - given a device, create a managed attribute group
 * @dev:        The device to create the group for
 * @grp:        The attribute group to create
 *
 * This function creates a group for the first time.  It will explicitly
 * warn and error if any of the attribute files being created already exist.
 *
 * The group is recorded as a managed resource of @dev, with
 * devm_attr_group_remove() as its release callback.
 *
 * Returns 0 on success or error code on failure.
 */
int devm_device_add_group(struct device *dev, const struct attribute_group *grp)
{
        union device_attr_group_devres *entry;
        int err;

        entry = devres_alloc(devm_attr_group_remove, sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        err = sysfs_create_group(&dev->kobj, grp);
        if (!err) {
                entry->group = grp;
                devres_add(dev, entry);
                return 0;
        }

        /* Group creation failed: the tracking entry was never registered. */
        devres_free(entry);
        return err;
}
EXPORT_SYMBOL_GPL(devm_device_add_group);

static int device_add_attrs(struct device *dev)
{
        const struct class *class = dev->class;
        const struct device_type *type = dev->type;
        int error;

        if (class) {
                error = device_add_groups(dev, class->dev_groups);
                if (error)
                        return error;
        }

        if (type) {
                error = device_add_groups(dev, type->groups);
                if (error)
                        goto err_remove_class_groups;
        }

        error = device_add_groups(dev, dev->groups);
        if (error)
                goto err_remove_type_groups;

        if (device_supports_offline(dev) && !dev->offline_disabled) {
                error = device_create_file(dev, &dev_attr_online);
                if (error)
                        goto err_remove_dev_groups;
        }

        if (fw_devlink_flags && !fw_devlink_is_permissive() && dev->fwnode) {
                error = device_create_file(dev, &dev_attr_waiting_for_supplier);
                if (error)
                        goto err_remove_dev_online;
        }

        if (dev_removable_is_valid(dev)) {
                error = device_create_file(dev, &dev_attr_removable);
                if (error)
                        goto err_remove_dev_waiting_for_supplier;
        }

        if (dev_add_physical_location(dev)) {
                error = device_add_group(dev,
                        &dev_attr_physical_location_group);
                if (error)
                        goto err_remove_dev_removable;
        }

        return 0;

 err_remove_dev_removable:
        device_remove_file(dev, &dev_attr_removable);
 err_remove_dev_waiting_for_supplier:
        device_remove_file(dev, &dev_attr_waiting_for_supplier);
 err_remove_dev_online:
        device_remove_file(dev, &dev_attr_online);
 err_remove_dev_groups:
        device_remove_groups(dev, dev->groups);
 err_remove_type_groups:
        if (type)
                device_remove_groups(dev, type->groups);
 err_remove_class_groups:
        if (class)
                device_remove_groups(dev, class->dev_groups);

        return error;
}

static void device_remove_attrs(struct device *dev)
{
        const struct class *class = dev->class;
        const struct device_type *type = dev->type;

        if (dev->physical_location) {
                device_remove_group(dev, &dev_attr_physical_location_group);
                kfree(dev->physical_location);
        }

        device_remove_file(dev, &dev_attr_removable);
        device_remove_file(dev, &dev_attr_waiting_for_supplier);
        device_remove_file(dev, &dev_attr_online);
        device_remove_groups(dev, dev->groups);

        if (type)
                device_remove_groups(dev, type->groups);

        if (class)
                device_remove_groups(dev, class->dev_groups);
}

static ssize_t dev_show(struct device *dev, struct device_attribute *attr,
                        char *buf)
{
        return print_dev_t(buf, dev->devt);
}
static DEVICE_ATTR_RO(dev);

/*
 * /sys/devices/
 *
 * device_initialize() assigns this kset to every device's kobject, and the
 * devices_kset_move_*() helpers reorder entries on its list under
 * devices_kset->list_lock.
 */
struct kset *devices_kset;

/**
 * devices_kset_move_before - Move device in the devices_kset's list.
 * @deva: Device to move.
 * @devb: Device @deva should come before.
 *
 * Does nothing when devices_kset is not set.
 */
static void devices_kset_move_before(struct device *deva, struct device *devb)
{
        struct list_head *moved = &deva->kobj.entry;
        struct list_head *anchor = &devb->kobj.entry;

        if (!devices_kset)
                return;

        pr_debug("devices_kset: Moving %s before %s\n",
                 dev_name(deva), dev_name(devb));

        spin_lock(&devices_kset->list_lock);
        list_move_tail(moved, anchor);
        spin_unlock(&devices_kset->list_lock);
}

/**
 * devices_kset_move_after - Move device in the devices_kset's list.
 * @deva: Device to move
 * @devb: Device @deva should come after.
 *
 * Does nothing when devices_kset is not set.
 */
static void devices_kset_move_after(struct device *deva, struct device *devb)
{
        struct list_head *moved = &deva->kobj.entry;
        struct list_head *anchor = &devb->kobj.entry;

        if (!devices_kset)
                return;

        pr_debug("devices_kset: Moving %s after %s\n",
                 dev_name(deva), dev_name(devb));

        spin_lock(&devices_kset->list_lock);
        list_move(moved, anchor);
        spin_unlock(&devices_kset->list_lock);
}

/**
 * devices_kset_move_last - move the device to the end of devices_kset's list.
 * @dev: device to move
 *
 * Does nothing when devices_kset is not set.
 */
void devices_kset_move_last(struct device *dev)
{
        struct list_head *moved = &dev->kobj.entry;

        if (!devices_kset)
                return;

        pr_debug("devices_kset: Moving %s to end of list\n", dev_name(dev));

        spin_lock(&devices_kset->list_lock);
        list_move_tail(moved, &devices_kset->list);
        spin_unlock(&devices_kset->list_lock);
}

/**
 * device_create_file - create sysfs attribute file for device.
 * @dev: device.
 * @attr: device attribute descriptor.
 *
 * Warns when the attribute's mode grants write access without a store
 * method, or read access without a show method.
 *
 * Returns 0 when @dev is NULL, otherwise the result of
 * sysfs_create_file().
 */
int device_create_file(struct device *dev,
                       const struct device_attribute *attr)
{
        if (!dev)
                return 0;

        WARN(((attr->attr.mode & S_IWUGO) && !attr->store),
                "Attribute %s: write permission without 'store'\n",
                attr->attr.name);
        WARN(((attr->attr.mode & S_IRUGO) && !attr->show),
                "Attribute %s: read permission without 'show'\n",
                attr->attr.name);

        return sysfs_create_file(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_create_file);

/**
 * device_remove_file - remove sysfs attribute file.
 * @dev: device.
 * @attr: device attribute descriptor.
 *
 * A NULL @dev is silently ignored.
 */
void device_remove_file(struct device *dev,
                        const struct device_attribute *attr)
{
        if (!dev)
                return;

        sysfs_remove_file(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_remove_file);

/**
 * device_remove_file_self - remove sysfs attribute file from its own method.
 * @dev: device.
 * @attr: device attribute descriptor.
 *
 * See kernfs_remove_self() for details.
 *
 * Returns false when @dev is NULL, otherwise the result of
 * sysfs_remove_file_self().
 */
bool device_remove_file_self(struct device *dev,
                             const struct device_attribute *attr)
{
        if (!dev)
                return false;

        return sysfs_remove_file_self(&dev->kobj, &attr->attr);
}
EXPORT_SYMBOL_GPL(device_remove_file_self);

/**
 * device_create_bin_file - create sysfs binary attribute file for device.
 * @dev: device.
 * @attr: device binary attribute descriptor.
 *
 * Returns -EINVAL when @dev is NULL, otherwise the result of
 * sysfs_create_bin_file().
 */
int device_create_bin_file(struct device *dev,
                           const struct bin_attribute *attr)
{
        if (!dev)
                return -EINVAL;

        return sysfs_create_bin_file(&dev->kobj, attr);
}
EXPORT_SYMBOL_GPL(device_create_bin_file);

/**
 * device_remove_bin_file - remove sysfs binary attribute file
 * @dev: device.
 * @attr: device binary attribute descriptor.
 *
 * A NULL @dev is silently ignored.
 */
void device_remove_bin_file(struct device *dev,
                            const struct bin_attribute *attr)
{
        if (!dev)
                return;

        sysfs_remove_bin_file(&dev->kobj, attr);
}
EXPORT_SYMBOL_GPL(device_remove_bin_file);

/*
 * klist "get" callback for a device's klist_children (see
 * device_private_init()): takes a reference on the device owning node @n.
 */
static void klist_children_get(struct klist_node *n)
{
        struct device_private *priv = to_device_private_parent(n);

        get_device(priv->device);
}

/*
 * klist "put" callback for a device's klist_children (see
 * device_private_init()): drops a reference on the device owning node @n.
 */
static void klist_children_put(struct klist_node *n)
{
        struct device_private *priv = to_device_private_parent(n);

        put_device(priv->device);
}

/**
 * device_initialize - init device structure.
 * @dev: device.
 *
 * This prepares the device for use by other layers by initializing
 * its fields.
 * It is the first half of device_register(), if called by
 * that function, though it can also be called separately, so one
 * may use @dev's fields. In particular, get_device()/put_device()
 * may be used for reference counting of @dev after calling this
 * function.
 *
 * All fields in @dev must be initialized by the caller to 0, except
 * for those explicitly set to some other value.  The simplest
 * approach is to use kzalloc() to allocate the structure containing
 * @dev.
 *
 * NOTE: Use put_device() to give up your reference instead of freeing
 * @dev directly once you have called this function.
 */
void device_initialize(struct device *dev)
{
        /* every device's kobject is a member of devices_kset */
        dev->kobj.kset = devices_kset;
        kobject_init(&dev->kobj, &device_ktype);
        INIT_LIST_HEAD(&dev->dma_pools);
        mutex_init(&dev->mutex);
        lockdep_set_novalidate_class(&dev->mutex);
        /* managed-resource (devres) bookkeeping */
        spin_lock_init(&dev->devres_lock);
        INIT_LIST_HEAD(&dev->devres_head);
        device_pm_init(dev);
        /* no NUMA node yet; device_add() may copy the parent's node later */
        set_dev_node(dev, NUMA_NO_NODE);
        /* device-link lists start empty and the link state is "no driver" */
        INIT_LIST_HEAD(&dev->links.consumers);
        INIT_LIST_HEAD(&dev->links.suppliers);
        INIT_LIST_HEAD(&dev->links.defer_sync);
        dev->links.status = DL_DEV_NO_DRIVER;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        /* only set on configurations that select one of the options above */
        dev->dma_coherent = dma_default_coherent;
#endif
        swiotlb_dev_init(dev);
}
EXPORT_SYMBOL_GPL(device_initialize);

/*
 * virtual_device_parent - return the "virtual" kobject below devices_kset
 *
 * The kobject is created on first use and cached in a function-local
 * static.  If creation fails NULL is returned and creation is attempted
 * again on the next call.
 */
struct kobject *virtual_device_parent(void)
{
        static struct kobject *virtual_dir = NULL;

        if (virtual_dir)
                return virtual_dir;

        virtual_dir = kobject_create_and_add("virtual", &devices_kset->kobj);
        return virtual_dir;
}

/*
 * A per-class "glue" directory: a kobject created by
 * class_dir_create_and_add() and named after the class it belongs to.
 */
struct class_dir {
        struct kobject kobj;            /* embedded kobject, see to_class_dir() */
        const struct class *class;      /* class this directory was created for */
};

/* Map an embedded kobject back to its containing struct class_dir. */
#define to_class_dir(obj) container_of(obj, struct class_dir, kobj)

/* kobj_type release hook: frees the class_dir that embeds @kobj. */
static void class_dir_release(struct kobject *kobj)
{
        kfree(to_class_dir(kobj));
}

static const
struct kobj_ns_type_operations *class_dir_child_ns_type(const struct kobject *kobj)
{
        const struct class_dir *dir = to_class_dir(kobj);
        return dir->class->ns_type;
}

/*
 * kobj_type for class glue directories: class_dir_release() frees the
 * containing class_dir and class_dir_child_ns_type() forwards the class's
 * ns_type.
 */
static const struct kobj_type class_dir_ktype = {
        .release        = class_dir_release,
        .sysfs_ops        = &kobj_sysfs_ops,
        .child_ns_type        = class_dir_child_ns_type
};

/*
 * class_dir_create_and_add - create a glue directory for a class
 * @sp: subsys_private of the class
 * @parent_kobj: kobject the directory is added under
 *
 * The new kobject is named after sp->class and joins the sp->glue_dirs
 * kset.
 *
 * Returns the new kobject, ERR_PTR(-ENOMEM) on allocation failure, or an
 * ERR_PTR() carrying the kobject_add() error.
 */
static struct kobject *class_dir_create_and_add(struct subsys_private *sp,
                                                struct kobject *parent_kobj)
{
        struct class_dir *cd = kzalloc(sizeof(*cd), GFP_KERNEL);
        int err;

        if (!cd)
                return ERR_PTR(-ENOMEM);

        cd->class = sp->class;
        kobject_init(&cd->kobj, &class_dir_ktype);

        cd->kobj.kset = &sp->glue_dirs;

        err = kobject_add(&cd->kobj, parent_kobj, "%s", sp->class->name);
        if (err >= 0)
                return &cd->kobj;

        /* kobject_init() succeeded, so the object is released via its kref */
        kobject_put(&cd->kobj);
        return ERR_PTR(err);
}

/*
 * Held by get_device_parent() while it looks up or creates a class glue
 * directory, and by cleanup_glue_dir() while it deletes/releases one.
 */
static DEFINE_MUTEX(gdp_mutex);

/*
 * get_device_parent - pick the kobject @dev will be added under
 * @dev: device being added
 * @parent: @dev's parent device, or NULL
 *
 * When @dev's class resolves to a subsys_private, the result is either
 * @parent's own kobject (parent has a class and the device's class has no
 * ns_type) or a per-class "glue" directory below the chosen parent kobject.
 * The glue directory is looked up in sp->glue_dirs and, if found, returned
 * with an extra reference from kobject_get(); otherwise a new one is
 * created with class_dir_create_and_add(), whose ERR_PTR() result is
 * passed through on failure.
 *
 * Without a class, a parentless device on a bus is placed under the bus's
 * dev_root when there is one; otherwise @parent's kobject, or NULL when
 * there is no parent, is returned.
 */
static struct kobject *get_device_parent(struct device *dev,
                                         struct device *parent)
{
        struct subsys_private *sp = class_to_subsys(dev->class);
        struct kobject *kobj = NULL;

        if (sp) {
                struct kobject *parent_kobj;
                struct kobject *k;

                /*
                 * If we have no parent, we live in "virtual".
                 * Class-devices with a non class-device as parent, live
                 * in a "glue" directory to prevent namespace collisions.
                 */
                if (parent == NULL)
                        parent_kobj = virtual_device_parent();
                else if (parent->class && !dev->class->ns_type) {
                        subsys_put(sp);
                        return &parent->kobj;
                } else {
                        parent_kobj = &parent->kobj;
                }

                /* serialize lookup/creation against cleanup_glue_dir() */
                mutex_lock(&gdp_mutex);

                /* find our class-directory at the parent and reference it */
                spin_lock(&sp->glue_dirs.list_lock);
                list_for_each_entry(k, &sp->glue_dirs.list, entry)
                        if (k->parent == parent_kobj) {
                                kobj = kobject_get(k);
                                break;
                        }
                spin_unlock(&sp->glue_dirs.list_lock);
                if (kobj) {
                        mutex_unlock(&gdp_mutex);
                        subsys_put(sp);
                        return kobj;
                }

                /* or create a new class-directory at the parent device */
                k = class_dir_create_and_add(sp, parent_kobj);
                /* do not emit an uevent for this simple "glue" directory */
                mutex_unlock(&gdp_mutex);
                subsys_put(sp);
                return k;
        }

        /* subsystems can specify a default root directory for their devices */
        if (!parent && dev->bus) {
                struct device *dev_root = bus_get_dev_root(dev->bus);

                if (dev_root) {
                        /*
                         * NOTE(review): the dev_root reference is dropped
                         * before its kobject pointer is returned; presumably
                         * the bus keeps dev_root alive for the caller -
                         * confirm.
                         */
                        kobj = &dev_root->kobj;
                        put_device(dev_root);
                        return kobj;
                }
        }

        if (parent)
                return &parent->kobj;
        return NULL;
}

/*
 * live_in_glue_dir - test whether @kobj is a glue directory of @dev's class
 * @kobj: candidate kobject, may be NULL
 * @dev: device whose class is consulted
 *
 * Returns true only when @kobj is non-NULL, @dev has a class that resolves
 * to a subsys_private, and @kobj's kset is that subsystem's glue_dirs.
 */
static inline bool live_in_glue_dir(struct kobject *kobj,
                                    struct device *dev)
{
        struct subsys_private *sp;
        bool in_glue;

        if (!kobj || !dev->class)
                return false;

        sp = class_to_subsys(dev->class);
        if (!sp)
                return false;

        in_glue = (kobj->kset == &sp->glue_dirs);

        subsys_put(sp);
        return in_glue;
}

/*
 * Return the parent of @dev's kobject; callers hand the result to
 * cleanup_glue_dir(), which checks whether it really is a glue directory.
 */
static inline struct kobject *get_glue_dir(struct device *dev)
{
        struct kobject *parent_kobj = dev->kobj.parent;

        return parent_kobj;
}

/**
 * kobject_has_children - Returns whether a kobject has children.
 * @kobj: the object to test
 *
 * This will return whether a kobject has other kobjects as children.
 *
 * It does NOT account for the presence of attribute files, only sub
 * directories. It also assumes there is no concurrent addition or
 * removal of such children, and thus relies on external locking.
 */
static inline bool kobject_has_children(struct kobject *kobj)
{
        WARN_ON_ONCE(kref_read(&kobj->kref) == 0);

        return kobj->sd && kobj->sd->dir.subdirs;
}

/*
 * cleanup_glue_dir - drop @dev's reference on its glue directory
 * @dev: device that lived (or was about to live) in @glue_dir
 * @glue_dir: candidate glue directory, may be NULL
 *
 * Does nothing unless @glue_dir is a glue directory of @dev's class.
 *
 * This must be the last cleanup step for the directory: the kobject's
 * .release handler has to run while the global gdp_mutex is held.
 */
static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
{
        unsigned int ref;

        /* see if we live in a "glue" directory */
        if (!live_in_glue_dir(glue_dir, dev))
                return;

        mutex_lock(&gdp_mutex);
        /*
         * There is a race condition between removing glue directory
         * and adding a new device under the glue directory.
         *
         * CPU1:                                         CPU2:
         *
         * device_add()
         *   get_device_parent()
         *     class_dir_create_and_add()
         *       kobject_add_internal()
         *         create_dir()    // create glue_dir
         *
         *                                               device_add()
         *                                                 get_device_parent()
         *                                                   kobject_get() // get glue_dir
         *
         * device_del()
         *   cleanup_glue_dir()
         *     kobject_del(glue_dir)
         *
         *                                               kobject_add()
         *                                                 kobject_add_internal()
         *                                                   create_dir() // in glue_dir
         *                                                     sysfs_create_dir_ns()
         *                                                       kernfs_create_dir_ns(sd)
         *
         *       sysfs_remove_dir() // glue_dir->sd=NULL
         *       sysfs_put()        // free glue_dir->sd
         *
         *                                                         // sd is freed
         *                                                         kernfs_new_node(sd)
         *                                                           kernfs_get(glue_dir)
         *                                                           kernfs_add_one()
         *                                                           kernfs_put()
         *
         * If CPU2 adds a new device under the glue dir before CPU1 removes
         * the last child device from it, the glue_dir kobject reference
         * count is raised to 2 by kobject_get(k), and CPU2 has already
         * called kernfs_create_dir_ns(). Meanwhile CPU1 calls
         * sysfs_remove_dir() and sysfs_put(), which frees glue_dir->sd.
         *
         * CPU2 then sees a stale "empty" but still potentially used glue
         * dir in kernfs_new_node().
         *
         * To avoid this, CPU1 must release the kernfs_node for glue_dir
         * only when the refcount of the glue_dir kobject is 1.
         */
        ref = kref_read(&glue_dir->kref);
        /* delete only when childless and ours is the sole reference */
        if (!kobject_has_children(glue_dir) && !--ref)
                kobject_del(glue_dir);
        kobject_put(glue_dir);
        mutex_unlock(&gdp_mutex);
}

static int device_add_class_symlinks(struct device *dev)
{
        struct device_node *of_node = dev_of_node(dev);
        struct subsys_private *sp;
        int error;

        if (of_node) {
                error = sysfs_create_link(&dev->kobj, of_node_kobj(of_node), "of_node");
                if (error)
                        dev_warn(dev, "Error %d creating of_node link\n",error);
                /* An error here doesn't warrant bringing down the device */
        }

        sp = class_to_subsys(dev->class);
        if (!sp)
                return 0;

        error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem");
        if (error)
                goto out_devnode;

        if (dev->parent && device_is_not_partition(dev)) {
                error = sysfs_create_link(&dev->kobj, &dev->parent->kobj,
                                          "device");
                if (error)
                        goto out_subsys;
        }

        /* link in the class directory pointing to the device */
        error = sysfs_create_link(&sp->subsys.kobj, &dev->kobj, dev_name(dev));
        if (error)
                goto out_device;
        goto exit;

out_device:
        sysfs_remove_link(&dev->kobj, "device");
out_subsys:
        sysfs_remove_link(&dev->kobj, "subsystem");
out_devnode:
        sysfs_remove_link(&dev->kobj, "of_node");
exit:
        subsys_put(sp);
        return error;
}

/*
 * device_remove_class_symlinks - undo device_add_class_symlinks()
 * @dev: device being removed
 *
 * Removes the "of_node" link when the device has an OF node and, if the
 * device's class resolves to a subsys_private, the "device", "subsystem"
 * and class-directory links as well.
 */
static void device_remove_class_symlinks(struct device *dev)
{
        struct subsys_private *sp = class_to_subsys(dev->class);
        struct kobject *kobj = &dev->kobj;

        if (dev_of_node(dev))
                sysfs_remove_link(kobj, "of_node");

        if (!sp)
                return;

        if (dev->parent && device_is_not_partition(dev))
                sysfs_remove_link(kobj, "device");

        sysfs_remove_link(kobj, "subsystem");
        sysfs_delete_link(&sp->subsys.kobj, kobj, dev_name(dev));

        subsys_put(sp);
}

/**
 * dev_set_name - set a device name
 * @dev: device
 * @fmt: format string for the device's name
 *
 * Returns the result of kobject_set_name_vargs() on @dev's kobject.
 */
int dev_set_name(struct device *dev, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = kobject_set_name_vargs(&dev->kobj, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL_GPL(dev_set_name);

/* pick the /sys/dev/ directory for the device: block or char */
static struct kobject *device_to_dev_kobj(struct device *dev)
{
        return is_blockdev(dev) ? sysfs_dev_block_kobj : sysfs_dev_char_kobj;
}

/*
 * Create the link, named after the formatted dev_t, that points at @dev
 * from the directory chosen by device_to_dev_kobj().
 *
 * Returns 0 when no such directory exists, otherwise the result of
 * sysfs_create_link().
 */
static int device_create_sys_dev_entry(struct device *dev)
{
        struct kobject *dir = device_to_dev_kobj(dev);
        char devt_str[15];

        if (!dir)
                return 0;

        format_dev_t(devt_str, dev->devt);
        return sysfs_create_link(dir, &dev->kobj, devt_str);
}

/*
 * Remove the dev_t-named link created by device_create_sys_dev_entry();
 * nothing is done when device_to_dev_kobj() yields no directory.
 */
static void device_remove_sys_dev_entry(struct device *dev)
{
        struct kobject *dir = device_to_dev_kobj(dev);
        char devt_str[15];

        if (!dir)
                return;

        format_dev_t(devt_str, dev->devt);
        sysfs_remove_link(dir, devt_str);
}

static int device_private_init(struct device *dev)
{
        dev->p = kzalloc(sizeof(*dev->p), GFP_KERNEL);
        if (!dev->p)
                return -ENOMEM;
        dev->p->device = dev;
        klist_init(&dev->p->klist_children, klist_children_get,
                   klist_children_put);
        INIT_LIST_HEAD(&dev->p->deferred_probe);
        return 0;
}

/**
 * device_add - add device to device hierarchy.
 * @dev: device.
 *
 * This is part 2 of device_register(), though may be called
 * separately _iff_ device_initialize() has been called separately.
 *
 * This adds @dev to the kobject hierarchy via kobject_add(), adds it
 * to the global and sibling lists for the device, then
 * adds it to the other relevant subsystems of the driver model.
 *
 * Do not call this routine or device_register() more than once for
 * any device structure.  The driver model core is not designed to work
 * with devices that get unregistered and then spring back to life.
 * (Among other things, it's very hard to guarantee that all references
 * to the previous incarnation of @dev have been dropped.)  Allocate
 * and register a fresh new struct device instead.
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up your
 * reference instead.
 *
 * Rule of thumb is: if device_add() succeeds, you should call
 * device_del() when you want to get rid of it. If device_add() has
 * *not* succeeded, use *only* put_device() to drop the reference
 * count.
 *
 * Returns 0 on success, -EINVAL when @dev is NULL or no name can be
 * determined, or the error code of the step that failed.
 */
int device_add(struct device *dev)
{
        struct subsys_private *sp;
        struct device *parent;
        struct kobject *kobj;
        struct class_interface *class_intf;
        int error = -EINVAL;
        struct kobject *glue_dir = NULL;

        /* hold a reference for the duration of the call; dropped at done: */
        dev = get_device(dev);
        if (!dev)
                goto done;

        if (!dev->p) {
                error = device_private_init(dev);
                if (error)
                        goto done;
        }

        /*
         * for statically allocated devices, which should all be converted
         * some day, we need to initialize the name. We prevent reading back
         * the name, and force the use of dev_name()
         */
        if (dev->init_name) {
                error = dev_set_name(dev, "%s", dev->init_name);
                dev->init_name = NULL;
        }

        /*
         * The result of the dev_set_name() call above is not checked on
         * its own: whether a name is now set decides the outcome.
         */
        if (dev_name(dev))
                error = 0;
        /* subsystems can specify simple device enumeration */
        else if (dev->bus && dev->bus->dev_name)
                error = dev_set_name(dev, "%s%u", dev->bus->dev_name, dev->id);
        else
                error = -EINVAL;
        if (error)
                goto name_error;

        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

        /* reference on the parent is kept on success and dropped by device_del() */
        parent = get_device(dev->parent);
        kobj = get_device_parent(dev, parent);
        if (IS_ERR(kobj)) {
                error = PTR_ERR(kobj);
                goto parent_error;
        }
        if (kobj)
                dev->kobj.parent = kobj;

        /* use parent numa_node */
        if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
                set_dev_node(dev, dev_to_node(parent));

        /* first, register with generic layer. */
        /* we require the name to be set before, and pass NULL */
        error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
        if (error) {
                /*
                 * Hand the parent chosen above to cleanup_glue_dir()
                 * directly instead of re-reading dev->kobj.parent.
                 */
                glue_dir = kobj;
                goto Error;
        }

        /* notify platform of device entry */
        device_platform_notify(dev);

        error = device_create_file(dev, &dev_attr_uevent);
        if (error)
                goto attrError;

        error = device_add_class_symlinks(dev);
        if (error)
                goto SymlinkError;
        error = device_add_attrs(dev);
        if (error)
                goto AttrsError;
        error = bus_add_device(dev);
        if (error)
                goto BusError;
        error = dpm_sysfs_add(dev);
        if (error)
                goto DPMError;
        device_pm_add(dev);

        /* only devices with a non-zero major get "dev" and a /sys/dev entry */
        if (MAJOR(dev->devt)) {
                error = device_create_file(dev, &dev_attr_dev);
                if (error)
                        goto DevAttrError;

                error = device_create_sys_dev_entry(dev);
                if (error)
                        goto SysEntryError;

                devtmpfs_create_node(dev);
        }

        /* Notify clients of device addition.  This call must come
         * after dpm_sysfs_add() and before kobject_uevent().
         */
        bus_notify(dev, BUS_NOTIFY_ADD_DEVICE);
        kobject_uevent(&dev->kobj, KOBJ_ADD);

        /*
         * Check if any of the other devices (consumers) have been waiting for
         * this device (supplier) to be added so that they can create a device
         * link to it.
         *
         * This needs to happen after device_pm_add() because device_link_add()
         * requires the supplier be registered before it's called.
         *
         * But this also needs to happen before bus_probe_device() to make sure
         * waiting consumers can link to it before the driver is bound to the
         * device and the driver sync_state callback is called for this device.
         */
        if (dev->fwnode && !dev->fwnode->dev) {
                dev->fwnode->dev = dev;
                fw_devlink_link_device(dev);
        }

        bus_probe_device(dev);

        /*
         * If all driver registration is done and a newly added device doesn't
         * match with any driver, don't block its consumers from probing in
         * case the consumer device is able to operate without this supplier.
         */
        if (dev->fwnode && fw_devlink_drv_reg_done && !dev->can_match)
                fw_devlink_unblock_consumers(dev);

        if (parent)
                klist_add_tail(&dev->p->knode_parent,
                               &parent->p->klist_children);

        sp = class_to_subsys(dev->class);
        if (sp) {
                mutex_lock(&sp->mutex);
                /* tie the class to the device */
                klist_add_tail(&dev->p->knode_class, &sp->klist_devices);

                /* notify any interfaces that the device is here */
                list_for_each_entry(class_intf, &sp->interfaces, node)
                        if (class_intf->add_dev)
                                class_intf->add_dev(dev);
                mutex_unlock(&sp->mutex);
                subsys_put(sp);
        }
done:
        /* common exit for success and failure: drop the reference taken on entry */
        put_device(dev);
        return error;
        /*
         * Error unwinding: each label undoes the setup steps that had
         * completed before the failing one, in reverse order, and finally
         * jumps back to done.
         */
 SysEntryError:
        if (MAJOR(dev->devt))
                device_remove_file(dev, &dev_attr_dev);
 DevAttrError:
        device_pm_remove(dev);
        dpm_sysfs_remove(dev);
 DPMError:
        device_set_driver(dev, NULL);
        bus_remove_device(dev);
 BusError:
        device_remove_attrs(dev);
 AttrsError:
        device_remove_class_symlinks(dev);
 SymlinkError:
        device_remove_file(dev, &dev_attr_uevent);
 attrError:
        device_platform_notify_remove(dev);
        kobject_uevent(&dev->kobj, KOBJ_REMOVE);
        /* read the parent before kobject_del(), as device_del() does */
        glue_dir = get_glue_dir(dev);
        kobject_del(&dev->kobj);
 Error:
        cleanup_glue_dir(dev, glue_dir);
parent_error:
        put_device(parent);
name_error:
        /* dev->p is freed here whether or not this call allocated it */
        kfree(dev->p);
        dev->p = NULL;
        goto done;
}
EXPORT_SYMBOL_GPL(device_add);

/**
 * device_register - register a device with the system.
 * @dev: pointer to the device structure
 *
 * Registration is two steps performed back to back: device_initialize()
 * followed by device_add(). Calling the two helpers separately is only
 * warranted when there is a clearly defined need to use and refcount the
 * device before it is added to the hierarchy.
 *
 * For more information, see the kerneldoc for device_initialize()
 * and device_add().
 *
 * NOTE: _Never_ directly free @dev after calling this function, even
 * if it returned an error! Always use put_device() to give up the
 * reference initialized in this function instead.
 *
 * Returns the result of device_add().
 */
int device_register(struct device *dev)
{
        int ret;

        device_initialize(dev);
        ret = device_add(dev);

        return ret;
}
EXPORT_SYMBOL_GPL(device_register);

/**
 * get_device - increment reference count for device.
 * @dev: device.
 *
 * Forwards to kobject_get() on the embedded kobject and converts the
 * result back to a device. A NULL @dev is tolerated and yields NULL.
 */
struct device *get_device(struct device *dev)
{
        if (!dev)
                return NULL;

        return kobj_to_dev(kobject_get(&dev->kobj));
}
EXPORT_SYMBOL_GPL(get_device);

/**
 * put_device - decrement reference count.
 * @dev: device in question.
 *
 * A NULL @dev is silently ignored.
 */
void put_device(struct device *dev)
{
        if (!dev)
                return;

        kobject_put(&dev->kobj);
}
EXPORT_SYMBOL_GPL(put_device);

/*
 * kill_device - mark a device as dead
 * @dev: device, with its device lock held by the caller
 *
 * Returns true if this call set the "dead" flag, false if the device was
 * already marked dead.
 */
bool kill_device(struct device *dev)
{
        /*
         * The device lock is required so that updating "dead" stays
         * consistent with the other bitfields near it, and so that no
         * asynchronous probe routine can run while the bus/class/sysfs
         * are being torn out from underneath the device.
         */
        device_lock_assert(dev);

        if (!dev->p->dead) {
                dev->p->dead = true;
                return true;
        }

        return false;
}
EXPORT_SYMBOL_GPL(kill_device);

/**
 * device_del - delete device from system.
 * @dev: device.
 *
 * This is the first part of the device unregistration
 * sequence. This removes the device from the lists we control
 * from here, has it removed from the other driver model
 * subsystems it was added to in device_add(), and removes it
 * from the kobject hierarchy.
 *
 * NOTE: this should be called manually _iff_ device_add() was
 * also called manually.
 */
void device_del(struct device *dev)
{
        struct subsys_private *sp;
        struct device *parent = dev->parent;
        struct kobject *glue_dir = NULL;
        struct class_interface *class_intf;
        unsigned int noio_flag;

        /* mark the device dead under the device lock, see kill_device() */
        device_lock(dev);
        kill_device(dev);
        device_unlock(dev);

        /* undo the fwnode back-pointer set in device_add(), if it is ours */
        if (dev->fwnode && dev->fwnode->dev == dev)
                dev->fwnode->dev = NULL;

        /* Notify clients of device removal.  This call must come
         * before dpm_sysfs_remove().
         */
        /* the noio scope opened here is closed by memalloc_noio_restore() below */
        noio_flag = memalloc_noio_save();
        bus_notify(dev, BUS_NOTIFY_DEL_DEVICE);

        dpm_sysfs_remove(dev);
        if (parent)
                klist_del(&dev->p->knode_parent);
        if (MAJOR(dev->devt)) {
                devtmpfs_delete_node(dev);
                device_remove_sys_dev_entry(dev);
                device_remove_file(dev, &dev_attr_dev);
        }

        sp = class_to_subsys(dev->class);
        if (sp) {
                device_remove_class_symlinks(dev);

                mutex_lock(&sp->mutex);
                /* notify any interfaces that the device is now gone */
                list_for_each_entry(class_intf, &sp->interfaces, node)
                        if (class_intf->remove_dev)
                                class_intf->remove_dev(dev);
                /* remove the device from the class list */
                klist_del(&dev->p->knode_class);
                mutex_unlock(&sp->mutex);
                subsys_put(sp);
        }
        device_remove_file(dev, &dev_attr_uevent);
        device_remove_attrs(dev);
        bus_remove_device(dev);
        device_pm_remove(dev);
        driver_deferred_probe_del(dev);
        device_platform_notify_remove(dev);
        device_links_purge(dev);

        /*
         * If a device does not have a driver attached, we need to clean
         * up any managed resources. We do this in device_release(), but
         * it's never called (and we leak the device) if a managed
         * resource holds a reference to the device. So release all
         * managed resources here, like we do in driver_detach(). We
         * still need to do so again in device_release() in case someone
         * adds a new resource after this point, though.
         */
        devres_release_all(dev);

        bus_notify(dev, BUS_NOTIFY_REMOVED_DEVICE);
        kobject_uevent(&dev->kobj, KOBJ_REMOVE);
        /* remember the parent kobject before kobject_del() detaches us */
        glue_dir = get_glue_dir(dev);
        kobject_del(&dev->kobj);
        cleanup_glue_dir(dev, glue_dir);
        memalloc_noio_restore(noio_flag);
        /* drop the parent reference taken by device_add() */
        put_device(parent);
}
EXPORT_SYMBOL_GPL(device_del);

/**
 * device_unregister - unregister device from system.
 * @dev: device going away.
 *
 * Mirrors device_register() by working in two parts: device_del()
 * removes the device from all the subsystems, then put_device() drops
 * a reference. If that was the final reference the device is cleaned up
 * via device_release(); otherwise the structure sticks around until the
 * last reference to it is dropped.
 */
void device_unregister(struct device *dev)
{
        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);

        /* part one: detach from the driver model and kobject hierarchy */
        device_del(dev);

        /* part two: give up the reference */
        put_device(dev);
}
EXPORT_SYMBOL_GPL(device_unregister);

static struct device *prev_device(struct klist_iter *i)
{
        struct klist_node *n = klist_prev(i);
        struct device *dev = NULL;
        struct device_private *p;

        if (n) {
                p = to_device_private_parent(n);
                dev = p->device;
        }
        return dev;
}

static struct device *next_device(struct klist_iter *i)
{
        struct klist_node *n = klist_next(i);
        struct device *dev = NULL;
        struct device_private *p;

        if (n) {
                p = to_device_private_parent(n);
                dev = p->device;
        }
        return dev;
}

/**
 * device_get_devnode - path of device node file
 * @dev: device
 * @mode: returned file access mode
 * @uid: returned file owner
 * @gid: returned file group
 * @tmp: possibly allocated string
 *
 * Return the relative path of a possible device node.
 * Non-default names may need to allocate memory to compose
 * the name. That memory is returned in @tmp and needs to be
 * freed by the caller.
 *
 * The device type's devnode() hook is asked first, then the class's
 * hook. If neither supplies a name, the device name is used, with any
 * '!' replaced by '/' in a freshly allocated copy. NULL is returned
 * when that copy cannot be allocated.
 */
const char *device_get_devnode(const struct device *dev,
                               umode_t *mode, kuid_t *uid, kgid_t *gid,
                               const char **tmp)
{
        char *s;

        *tmp = NULL;

        /* a name provided by the device type wins */
        if (dev->type && dev->type->devnode) {
                *tmp = dev->type->devnode(dev, mode, uid, gid);
                if (*tmp)
                        return *tmp;
        }

        /* otherwise give the class a chance */
        if (dev->class && dev->class->devnode) {
                *tmp = dev->class->devnode(dev, mode);
                if (*tmp)
                        return *tmp;
        }

        /* no '!' in the name: hand it back directly, *tmp stays NULL */
        if (!strchr(dev_name(dev), '!'))
                return dev_name(dev);

        /* build a copy in which every '!' becomes '/' */
        s = kstrdup_and_replace(dev_name(dev), '!', '/', GFP_KERNEL);
        if (s)
                *tmp = s;
        return s;
}

/**
 * device_for_each_child - device child iterator.
 * @parent: parent struct device.
 * @fn: function to be called for each device.
 * @data: data for the callback.
 *
 * Walk the children of @parent and invoke @fn on each one, handing
 * it @data.
 *
 * The result of @fn is inspected after every call. The first non-zero
 * result ends the walk and becomes the return value. 0 is returned
 * when @parent is NULL or has no private data.
 */
int device_for_each_child(struct device *parent, void *data,
                          device_iter_t fn)
{
        struct klist_iter iter;
        struct device *child;
        int ret = 0;

        if (!parent || !parent->p)
                return 0;

        klist_iter_init(&parent->p->klist_children, &iter);
        for (;;) {
                child = next_device(&iter);
                if (!child)
                        break;

                ret = fn(child, data);
                if (ret)
                        break;
        }
        klist_iter_exit(&iter);

        return ret;
}
EXPORT_SYMBOL_GPL(device_for_each_child);

/**
 * device_for_each_child_reverse - device child iterator in reversed order.
 * @parent: parent struct device.
 * @fn: function to be called for each device.
 * @data: data for the callback.
 *
 * Iterate over @parent's child devices, and call @fn for each,
 * passing it @data.
 *
 * We check the return of @fn each time. If it returns anything
 * other than 0, we break out and return that value.
 *
 * Return: 0 when @parent is NULL, has no private data, or every call
 * to @fn returned 0; otherwise the first non-zero value from @fn.
 */
int device_for_each_child_reverse(struct device *parent, void *data,
                                  device_iter_t fn)
{
        struct klist_iter i;
        struct device *child;
        int error = 0;

        if (!parent || !parent->p)
                return 0;

        klist_iter_init(&parent->p->klist_children, &i);
        /*
         * Test @error before stepping, as device_for_each_child() does,
         * so the iterator is not moved once more after @fn asked to stop.
         */
        while (!error && (child = prev_device(&i)))
                error = fn(child, data);
        klist_iter_exit(&i);
        return error;
}
EXPORT_SYMBOL_GPL(device_for_each_child_reverse);

/**
 * device_for_each_child_reverse_from - device child iterator in reversed order.
 * @parent: parent struct device.
 * @from: optional starting point in child list
 * @fn: function to be called for each device.
 * @data: data for the callback.
 *
 * Iterate over @parent's child devices, starting at @from, and call @fn
 * for each, passing it @data. This helper is identical to
 * device_for_each_child_reverse() when @from is NULL.
 *
 * @fn is checked each iteration. If it returns anything other than 0,
 * iteration stops and that value is returned to the caller of
 * device_for_each_child_reverse_from().
 *
 * Return: 0 when @parent is NULL, has no private data, or every call
 * to @fn returned 0; otherwise the first non-zero value from @fn.
 */
int device_for_each_child_reverse_from(struct device *parent,
                                       struct device *from, void *data,
                                       device_iter_t fn)
{
        struct klist_iter i;
        struct device *child;
        int error = 0;

        if (!parent || !parent->p)
                return 0;

        /* a NULL node makes the iterator start without a set position */
        klist_iter_init_node(&parent->p->klist_children, &i,
                             (from ? &from->p->knode_parent : NULL));
        /*
         * Test @error before stepping, as device_for_each_child() does,
         * so the iterator is not moved once more after @fn asked to stop.
         */
        while (!error && (child = prev_device(&i)))
                error = fn(child, data);
        klist_iter_exit(&i);
        return error;
}
EXPORT_SYMBOL_GPL(device_for_each_child_reverse_from);

/**
 * device_find_child - device iterator for locating a particular device.
 * @parent: parent struct device
 * @match: Callback function to check device
 * @data: Data to pass to match function
 *
 * Works like device_for_each_child(), except that the device picked by
 * the @match callback is returned with a reference held, so it can be
 * used afterwards.
 *
 * @match returns 0 for a device that does not match and non-zero for
 * one that does. The walk ends at the first match. NULL is returned
 * when nothing matches, or when @parent is NULL or has no private data.
 *
 * NOTE: you will need to drop the reference with put_device() after use.
 */
struct device *device_find_child(struct device *parent, const void *data,
                                 device_match_t match)
{
        struct klist_iter iter;
        struct device *child;

        if (!parent || !parent->p)
                return NULL;

        klist_iter_init(&parent->p->klist_children, &iter);
        for (child = next_device(&iter); child; child = next_device(&iter)) {
                if (!match(child, data))
                        continue;

                /* found it: pin the device for the caller and stop */
                get_device(child);
                break;
        }
        klist_iter_exit(&iter);

        return child;
}
EXPORT_SYMBOL_GPL(device_find_child);

/*
 * Set up the driver core's global objects: the "devices" kset, the "dev"
 * kobject with its "block" and "char" children, and the device link
 * workqueue. Returns 0 on success; on any failure everything created so
 * far is released again and -ENOMEM is returned.
 */
int __init devices_init(void)
{
        devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
        if (!devices_kset)
                return -ENOMEM;

        dev_kobj = kobject_create_and_add("dev", NULL);
        if (!dev_kobj)
                goto err_unregister_kset;

        sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
        if (!sysfs_dev_block_kobj)
                goto err_put_dev;

        sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
        if (!sysfs_dev_char_kobj)
                goto err_put_block;

        device_link_wq = alloc_workqueue("device_link_wq", 0, 0);
        if (!device_link_wq)
                goto err_put_char;

        return 0;

        /* unwind in reverse order of creation */
err_put_char:
        kobject_put(sysfs_dev_char_kobj);
err_put_block:
        kobject_put(sysfs_dev_block_kobj);
err_put_dev:
        kobject_put(dev_kobj);
err_unregister_kset:
        kset_unregister(devices_kset);
        return -ENOMEM;
}

/*
 * Recursively verify that @dev and everything below it is offline.
 * Returns -EBUSY for the first device found that supports offlining
 * but is not offline, 0 otherwise. @not_used is ignored.
 */
static int device_check_offline(struct device *dev, void *not_used)
{
        int ret = device_for_each_child(dev, NULL, device_check_offline);

        if (ret)
                return ret;

        if (device_supports_offline(dev) && !dev->offline)
                return -EBUSY;

        return 0;
}

/**
 * device_offline - Prepare the device for hot-removal.
 * @dev: Device to be put offline.
 *
 * Run the .offline() callback of the device's bus type, if there is one,
 * so that the device can be hot-removed afterwards. Once that succeeded,
 * the device must not be used until it has either been removed or been
 * brought back by the bus type's .online() callback.
 *
 * Call under device_hotplug_lock.
 *
 * Return: -EPERM if offlining is disabled for @dev, the error from
 * device_check_offline() if a child is not offline, 1 if @dev is already
 * offline, 0 if @dev does not support offlining, otherwise the result
 * of the bus type's .offline() callback.
 */
int device_offline(struct device *dev)
{
        int ret;

        if (dev->offline_disabled)
                return -EPERM;

        /* every child has to be offline before the device itself */
        ret = device_for_each_child(dev, NULL, device_check_offline);
        if (ret)
                return ret;

        device_lock(dev);
        if (!device_supports_offline(dev))
                goto unlock;

        if (dev->offline) {
                ret = 1;
                goto unlock;
        }

        ret = dev->bus->offline(dev);
        if (!ret) {
                kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
                dev->offline = true;
        }
unlock:
        device_unlock(dev);

        return ret;
}

/**
 * device_online - Put the device back online after successful device_offline().
 * @dev: Device to be put back online.
 *
 * When device_offline() succeeded for @dev and the device has not been
 * removed since, run the bus type's .online() callback to signal that
 * the device may be used again.
 *
 * Call under device_hotplug_lock.
 *
 * Return: 1 if @dev is not offline, 0 if @dev does not support
 * offlining, otherwise the result of the bus type's .online() callback.
 */
int device_online(struct device *dev)
{
        int ret = 0;

        device_lock(dev);
        if (!device_supports_offline(dev))
                goto unlock;

        if (!dev->offline) {
                ret = 1;
                goto unlock;
        }

        ret = dev->bus->online(dev);
        if (!ret) {
                kobject_uevent(&dev->kobj, KOBJ_ONLINE);
                dev->offline = false;
        }
unlock:
        device_unlock(dev);

        return ret;
}

/*
 * A root device: a struct device bundled with the module it was
 * registered for.
 */
struct root_device {
        /* embedded device; to_root_device() maps it back to this struct */
        struct device dev;
        /* set by __root_device_register() once the "module" link exists */
        struct module *owner;
};

/* Map an embedded struct device back to its enclosing struct root_device. */
static inline struct root_device *to_root_device(struct device *d)
{
        return container_of(d, struct root_device, dev);
}

/* release() callback of root devices: free the enclosing root_device. */
static void root_device_release(struct device *dev)
{
        struct root_device *root = to_root_device(dev);

        kfree(root);
}

/**
 * __root_device_register - allocate and register a root device
 * @name: root device name
 * @owner: owner module of the root device, usually THIS_MODULE
 *
 * This function allocates a root device and registers it
 * using device_register(). In order to free the returned
 * device, use root_device_unregister().
 *
 * Root devices are dummy devices which allow other devices
 * to be grouped under /sys/devices. Use this function to
 * allocate a root device and then use it as the parent of
 * any device which should appear under /sys/devices/{name}
 *
 * The /sys/devices/{name} directory will also contain a
 * 'module' symlink which points to the @owner directory
 * in sysfs.
 *
 * Returns &struct device pointer on success, or ERR_PTR() on error.
 *
 * Note: You probably want to use root_device_register().
 */
struct device *__root_device_register(const char *name, struct module *owner)
{
        struct root_device *root;
        int err = -ENOMEM;

        root = kzalloc(sizeof(struct root_device), GFP_KERNEL);
        if (!root)
                return ERR_PTR(err);

        err = dev_set_name(&root->dev, "%s", name);
        if (err) {
                kfree(root);
                return ERR_PTR(err);
        }

        root->dev.release = root_device_release;

        err = device_register(&root->dev);
        if (err) {
                put_device(&root->dev);
                return ERR_PTR(err);
        }

#ifdef CONFIG_MODULES        /* gotta find a "cleaner" way to do this */
        if (owner) {
                struct module_kobject *mk = &owner->mkobj;

                err = sysfs_create_link(&root->dev.kobj, &mk->kobj, "module");
                if (err) {
                        device_unregister(&root->dev);
                        return ERR_PTR(err);
                }
                root->owner = owner;
        }
#endif

        return &root->dev;
}
EXPORT_SYMBOL_GPL(__root_device_register);

/**
 * root_device_unregister - unregister and free a root device
 * @dev: device going away
 *
 * This function unregisters and cleans up a device that was created by
 * root_device_register().
 */
void root_device_unregister(struct device *dev)
{
        struct root_device *root = to_root_device(dev);

        if (root->owner)
                sysfs_remove_link(&root->dev.kobj, "module");

        device_unregister(dev);
}
EXPORT_SYMBOL_GPL(root_device_unregister);


/*
 * release() callback installed by device_create_groups_vargs(): logs the
 * release and frees the struct device that was allocated there.
 */
static void device_create_release(struct device *dev)
{
        pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
        kfree(dev);
}

/*
 * Common worker behind device_create() and device_create_with_groups():
 * allocate a struct device, fill it in from the arguments, name it from
 * @fmt/@args and add it. Returns the new device, or ERR_PTR(-ENODEV) for
 * an invalid @class, ERR_PTR(-ENOMEM) when allocation fails, or the
 * error from naming or adding the device.
 */
static __printf(6, 0) struct device *
device_create_groups_vargs(const struct class *class, struct device *parent,
                           dev_t devt, void *drvdata,
                           const struct attribute_group **groups,
                           const char *fmt, va_list args)
{
        struct device *dev = NULL;
        int retval;

        if (IS_ERR_OR_NULL(class)) {
                retval = -ENODEV;
                goto error;
        }

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev) {
                retval = -ENOMEM;
                goto error;
        }

        device_initialize(dev);
        dev->release = device_create_release;
        dev->class = class;
        dev->parent = parent;
        dev->devt = devt;
        dev->groups = groups;
        dev_set_drvdata(dev, drvdata);

        retval = kobject_set_name_vargs(&dev->kobj, fmt, args);
        if (!retval)
                retval = device_add(dev);
        if (retval)
                goto error;

        return dev;

error:
        /* dev is still NULL here when nothing was allocated */
        put_device(dev);
        return ERR_PTR(retval);
}

/**
 * device_create - creates a device and registers it with sysfs
 * @class: pointer to the struct class that this device should be registered to
 * @parent: pointer to the parent struct device of this new device, if any
 * @devt: the dev_t for the char device to be added
 * @drvdata: the data to be added to the device for callbacks
 * @fmt: string for the device's name
 *
 * Intended for char device classes. A struct device is created in
 * sysfs and registered to the given class.
 *
 * Unless the dev_t is 0,0, a "dev" file showing the dev_t of the device
 * is created as well.
 * When a parent struct device is passed in, the new struct device
 * becomes a child of that device in sysfs.
 * The call returns the pointer to the struct device, which can be used
 * to create any further sysfs files that are needed.
 *
 * Returns &struct device pointer on success, or ERR_PTR() on error.
 */
struct device *device_create(const struct class *class, struct device *parent,
                             dev_t devt, void *drvdata, const char *fmt, ...)
{
        struct device *created;
        va_list ap;

        va_start(ap, fmt);
        /* no extra attribute groups for the plain variant */
        created = device_create_groups_vargs(class, parent, devt, drvdata,
                                             NULL, fmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL_GPL(device_create);

/**
 * device_create_with_groups - creates a device and registers it with sysfs
 * @class: pointer to the struct class that this device should be registered to
 * @parent: pointer to the parent struct device of this new device, if any
 * @devt: the dev_t for the char device to be added
 * @drvdata: the data to be added to the device for callbacks
 * @groups: NULL-terminated list of attribute groups to be created
 * @fmt: string for the device's name
 *
 * Intended for char device classes. A struct device is created in
 * sysfs and registered to the given class. The attributes listed in
 * @groups are created automatically along with it.
 *
 * Unless the dev_t is 0,0, a "dev" file showing the dev_t of the device
 * is created as well.
 * When a parent struct device is passed in, the new struct device
 * becomes a child of that device in sysfs.
 * The call returns the pointer to the struct device, which can be used
 * to create any further sysfs files that are needed.
 *
 * Returns &struct device pointer on success, or ERR_PTR() on error.
 */
struct device *device_create_with_groups(const struct class *class,
                                         struct device *parent, dev_t devt,
                                         void *drvdata,
                                         const struct attribute_group **groups,
                                         const char *fmt, ...)
{
        struct device *created;
        va_list ap;

        va_start(ap, fmt);
        created = device_create_groups_vargs(class, parent, devt, drvdata,
                                             groups, fmt, ap);
        va_end(ap);

        return created;
}
EXPORT_SYMBOL_GPL(device_create_with_groups);

/**
 * device_destroy - removes a device that was created with device_create()
 * @class: pointer to the struct class that this device was registered with
 * @devt: the dev_t of the device that was previously registered
 *
 * Looks the device up by @devt within @class and, if it is found,
 * unregisters and cleans up that device. Nothing happens when no such
 * device exists.
 */
void device_destroy(const struct class *class, dev_t devt)
{
        struct device *dev = class_find_device_by_devt(class, devt);

        if (!dev)
                return;

        /* presumably balances the reference obtained by the lookup */
        put_device(dev);
        device_unregister(dev);
}
EXPORT_SYMBOL_GPL(device_destroy);

/**
 * device_rename - renames a device
 * @dev: the pointer to the struct device to be renamed
 * @new_name: the new name of the device
 *
 * It is the responsibility of the caller to provide mutual
 * exclusion between two different calls of device_rename
 * on the same device to ensure that new_name is valid and
 * won't conflict with other devices.
 *
 * Note: given that some subsystems (networking and infiniband) use this
 * function, with no immediate plans for this to change, we cannot assume or
 * require that this function not be called at all.
 *
 * However, if you're writing new code, do not call this function. The following
 * text from Kay Sievers offers some insight:
 *
 * Renaming devices is racy at many levels, symlinks and other stuff are not
 * replaced atomically, and you get a "move" uevent, but it's not easy to
 * connect the event to the old and new device. Device nodes are not renamed at
 * all, there isn't even support for that in the kernel now.
 *
 * In the meantime, during renaming, your target name might be taken by another
 * driver, creating conflicts. Or the old name is taken directly after you
 * renamed it -- then you get events for the same DEVPATH, before you even see
 * the "move" event. It's just a mess, and nothing new should ever rely on
 * kernel device renaming. Besides that, it's not even implemented now for
 * other things than (driver-core wise very simple) network devices.
 *
 * Make up a "real" name in the driver before you register anything, or add
 * some other attributes for userspace to find the device, or use udev to add
 * symlinks -- but never rename kernel devices later, it's a complete mess. We
 * don't even want to get into that and try to implement the missing pieces in
 * the core. We really have other pieces to fix in the driver core mess. :)
 */
int device_rename(struct device *dev, const char *new_name)
{
        struct subsys_private *sp = NULL;
        struct kobject *kobj = &dev->kobj;
        char *old_device_name = NULL;
        int error;
        bool is_link_renamed = false;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        dev_dbg(dev, "renaming to %s\n", new_name);

        old_device_name = kstrdup(dev_name(dev), GFP_KERNEL);
        if (!old_device_name) {
                error = -ENOMEM;
                goto out;
        }

        if (dev->class) {
                sp = class_to_subsys(dev->class);

                if (!sp) {
                        error = -EINVAL;
                        goto out;
                }

                error = sysfs_rename_link_ns(&sp->subsys.kobj, kobj, old_device_name,
                                             new_name, kobject_namespace(kobj));
                if (error)
                        goto out;

                is_link_renamed = true;
        }

        error = kobject_rename(kobj, new_name);
out:
        if (error && is_link_renamed)
                sysfs_rename_link_ns(&sp->subsys.kobj, kobj, new_name,
                                     old_device_name, kobject_namespace(kobj));
        subsys_put(sp);

        put_device(dev);

        kfree(old_device_name);

        return error;
}
EXPORT_SYMBOL_GPL(device_rename);

/*
 * Update the "device" symlink of @dev for a change of parent: remove it
 * when there was an @old_parent, and create one pointing at @new_parent
 * when there is one. Returns 0, or the error from creating the link.
 */
static int device_move_class_links(struct device *dev,
                                   struct device *old_parent,
                                   struct device *new_parent)
{
        if (old_parent)
                sysfs_remove_link(&dev->kobj, "device");

        if (!new_parent)
                return 0;

        return sysfs_create_link(&dev->kobj, &new_parent->kobj, "device");
}

/**
 * device_move - moves a device to a new parent
 * @dev: the pointer to the struct device to be moved
 * @new_parent: the new parent of the device (can be NULL)
 * @dpm_order: how to reorder the dpm_list
 *
 * Return: 0 on success, -EINVAL if no reference to @dev could be taken,
 * or the error reported by get_device_parent(), kobject_move() or
 * device_move_class_links().
 */
int device_move(struct device *dev, struct device *new_parent,
                enum dpm_order dpm_order)
{
        int error;
        struct device *old_parent;
        struct kobject *new_parent_kobj;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        device_pm_lock();
        /*
         * This reference on the new parent is kept on success and dropped
         * again on every failure path below.
         */
        new_parent = get_device(new_parent);
        new_parent_kobj = get_device_parent(dev, new_parent);
        if (IS_ERR(new_parent_kobj)) {
                error = PTR_ERR(new_parent_kobj);
                put_device(new_parent);
                goto out;
        }

        pr_debug("device: '%s': %s: moving to '%s'\n", dev_name(dev),
                 __func__, new_parent ? dev_name(new_parent) : "<NULL>");
        error = kobject_move(&dev->kobj, new_parent_kobj);
        if (error) {
                cleanup_glue_dir(dev, new_parent_kobj);
                put_device(new_parent);
                goto out;
        }
        /* the kobject has moved: now switch parent pointer and child lists */
        old_parent = dev->parent;
        dev->parent = new_parent;
        if (old_parent)
                klist_remove(&dev->p->knode_parent);
        if (new_parent) {
                klist_add_tail(&dev->p->knode_parent,
                               &new_parent->p->klist_children);
                set_dev_node(dev, dev_to_node(new_parent));
        }

        if (dev->class) {
                error = device_move_class_links(dev, old_parent, new_parent);
                if (error) {
                        /* We ignore errors on cleanup since we're hosed anyway... */
                        device_move_class_links(dev, new_parent, old_parent);
                        /*
                         * Try to move the kobject back; the parent pointer
                         * and child lists are only restored if that works.
                         * NOTE(review): old_parent is NULL-checked further
                         * down, so it may be NULL here - confirm that
                         * &old_parent->kobj is intended in that case.
                         */
                        if (!kobject_move(&dev->kobj, &old_parent->kobj)) {
                                if (new_parent)
                                        klist_remove(&dev->p->knode_parent);
                                dev->parent = old_parent;
                                if (old_parent) {
                                        klist_add_tail(&dev->p->knode_parent,
                                                       &old_parent->p->klist_children);
                                        set_dev_node(dev, dev_to_node(old_parent));
                                }
                        }
                        cleanup_glue_dir(dev, new_parent_kobj);
                        put_device(new_parent);
                        goto out;
                }
        }
        /* apply the requested ordering to the PM list and to devices_kset */
        switch (dpm_order) {
        case DPM_ORDER_NONE:
                break;
        case DPM_ORDER_DEV_AFTER_PARENT:
                device_pm_move_after(dev, new_parent);
                devices_kset_move_after(dev, new_parent);
                break;
        case DPM_ORDER_PARENT_BEFORE_DEV:
                device_pm_move_before(new_parent, dev);
                devices_kset_move_before(new_parent, dev);
                break;
        case DPM_ORDER_DEV_LAST:
                device_pm_move_last(dev);
                devices_kset_move_last(dev);
                break;
        }

        /* success: the old parent is no longer referenced by @dev */
        put_device(old_parent);
out:
        device_pm_unlock();
        put_device(dev);
        return error;
}
EXPORT_SYMBOL_GPL(device_move);

/*
 * Hand the attribute groups of @dev over to @kuid/@kgid: the groups of
 * its class, of its device type and of the device itself, followed by
 * the "online" attribute when the device can be offlined. Stops at, and
 * returns, the first error; returns 0 when everything succeeded.
 */
static int device_attrs_change_owner(struct device *dev, kuid_t kuid,
                                     kgid_t kgid)
{
        const struct device_type *type = dev->type;
        const struct class *class = dev->class;
        struct kobject *kobj = &dev->kobj;
        int error;

        /* groups contributed by the device class */
        if (class) {
                error = sysfs_groups_change_owner(kobj, class->dev_groups,
                                                  kuid, kgid);
                if (error)
                        return error;
        }

        /* groups contributed by the device type */
        if (type) {
                error = sysfs_groups_change_owner(kobj, type->groups,
                                                  kuid, kgid);
                if (error)
                        return error;
        }

        /* groups that belong to the device itself */
        error = sysfs_groups_change_owner(kobj, dev->groups, kuid, kgid);
        if (error)
                return error;

        if (!device_supports_offline(dev) || dev->offline_disabled)
                return 0;

        /* finally the online attribute of the device */
        return sysfs_file_change_owner(kobj, dev_attr_online.attr.name,
                                       kuid, kgid);
}

/**
 * device_change_owner - change the owner of an existing device.
 * @dev: device.
 * @kuid: new owner's kuid
 * @kgid: new owner's kgid
 *
 * Transfers @dev and the sysfs entries that belong to it to @kuid/@kgid.
 * The steps taken closely mirror how @dev was added via driver core.
 *
 * Returns 0 on success or error code on failure.
 */
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
{
        struct kobject *kobj = &dev->kobj;
        struct subsys_private *sp;
        int error;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        /*
         * Start with the kobject itself, together with the default
         * attributes and groups of its ktype.
         */
        error = sysfs_change_owner(kobj, kuid, kgid);

        /*
         * The uevent file was created in a separate step when @dev got
         * added, so it is handled in a separate step here as well.
         */
        if (!error)
                error = sysfs_file_change_owner(kobj,
                                                dev_attr_uevent.attr.name,
                                                kuid, kgid);

        /*
         * Next the groups of the device, of its class and of its
         * device type.
         */
        if (!error)
                error = device_attrs_change_owner(dev, kuid, kgid);

        if (!error)
                error = dpm_sysfs_change_owner(dev, kuid, kgid);

        if (error)
                goto out;

        /*
         * Last is the symlink in the class directory that points at the
         * directory entry of @dev, so that the link shows the same
         * permissions as its target.
         */
        sp = class_to_subsys(dev->class);
        if (!sp) {
                error = -EINVAL;
                goto out;
        }
        error = sysfs_link_change_owner(&sp->subsys.kobj, kobj, dev_name(dev),
                                        kuid, kgid);
        subsys_put(sp);

out:
        put_device(dev);
        return error;
}
EXPORT_SYMBOL_GPL(device_change_owner);

/**
 * device_shutdown - call ->shutdown() on each device to shutdown.
 *
 * Waits for device probing to finish, blocks further probing and suspends
 * cpufreq. Then it repeatedly takes the last device off devices_kset's
 * list and runs its shutdown callbacks until the list is empty.
 */
void device_shutdown(void)
{
        struct device *dev, *parent;

        wait_for_device_probe();
        device_block_probing();

        cpufreq_suspend();

        spin_lock(&devices_kset->list_lock);
        /*
         * Walk the devices list backward, shutting down each in turn.
         * Beware that device unplug events may also start pulling
         * devices offline, even as the system is shutting down.
         */
        while (!list_empty(&devices_kset->list)) {
                dev = list_entry(devices_kset->list.prev, struct device,
                                kobj.entry);

                /*
                 * hold reference count of device's parent to
                 * prevent it from being freed because parent's
                 * lock is to be held
                 */
                parent = get_device(dev->parent);
                get_device(dev);
                /*
                 * Make sure the device is off the kset list, in the
                 * event that dev->*->shutdown() doesn't remove it.
                 */
                list_del_init(&dev->kobj.entry);
                /* the callbacks below run without the list lock held */
                spin_unlock(&devices_kset->list_lock);

                /* hold lock to avoid race with probe/release */
                if (parent)
                        device_lock(parent);
                device_lock(dev);

                /* Don't allow any more runtime suspends */
                pm_runtime_get_noresume(dev);
                pm_runtime_barrier(dev);

                /* the class hook, if any, runs before bus/driver shutdown */
                if (dev->class && dev->class->shutdown_pre) {
                        if (initcall_debug)
                                dev_info(dev, "shutdown_pre\n");
                        dev->class->shutdown_pre(dev);
                }
                /* a bus shutdown takes precedence over the driver's own */
                if (dev->bus && dev->bus->shutdown) {
                        if (initcall_debug)
                                dev_info(dev, "shutdown\n");
                        dev->bus->shutdown(dev);
                } else if (dev->driver && dev->driver->shutdown) {
                        if (initcall_debug)
                                dev_info(dev, "shutdown\n");
                        dev->driver->shutdown(dev);
                }

                /* unlock in the reverse order of locking */
                device_unlock(dev);
                if (parent)
                        device_unlock(parent);

                /* drop the references taken at the top of the loop */
                put_device(dev);
                put_device(parent);

                /* retake the list lock before looking at the list again */
                spin_lock(&devices_kset->list_lock);
        }
        spin_unlock(&devices_kset->list_lock);
}

/*
 * Device logging functions
 */

#ifdef CONFIG_PRINTK
/*
 * Fill @dev_info for @dev: the subsystem name (class name if the device
 * has a class, else bus name) and a device identifier. @dev_info is left
 * zeroed when the device has neither a class nor a bus.
 */
static void
set_dev_info(const struct device *dev, struct dev_printk_info *dev_info)
{
        const char *subsys;

        memset(dev_info, 0, sizeof(*dev_info));

        if (!dev->class && !dev->bus)
                return;

        subsys = dev->class ? dev->class->name : dev->bus->name;
        strscpy(dev_info->subsystem, subsys);

        /*
         * Add device identifier DEVICE=:
         *   b12:8         block dev_t
         *   c127:3        char dev_t
         *   n8            netdev ifindex
         *   +sound:card0  subsystem:devname
         */
        if (MAJOR(dev->devt)) {
                char c = strcmp(subsys, "block") == 0 ? 'b' : 'c';

                snprintf(dev_info->device, sizeof(dev_info->device),
                         "%c%u:%u", c, MAJOR(dev->devt), MINOR(dev->devt));
                return;
        }

        if (strcmp(subsys, "net") == 0) {
                struct net_device *ndev = to_net_dev(dev);

                snprintf(dev_info->device, sizeof(dev_info->device),
                         "n%u", ndev->ifindex);
                return;
        }

        snprintf(dev_info->device, sizeof(dev_info->device),
                 "+%s:%s", subsys, dev_name(dev));
}

int dev_vprintk_emit(int level, const struct device *dev,
                     const char *fmt, va_list args)
{
        struct dev_printk_info dev_info;

        set_dev_info(dev, &dev_info);

        return vprintk_emit(0, level, &dev_info, fmt, args);
}
EXPORT_SYMBOL(dev_vprintk_emit);

/*
 * Variadic front end to dev_vprintk_emit(): collects the arguments into a
 * va_list, forwards them, and returns the callee's result.
 */
int dev_printk_emit(int level, const struct device *dev, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = dev_vprintk_emit(level, dev, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL(dev_printk_emit);

/*
 * Common back end of the dev_*() logging functions.
 *
 * With a device, the message is prefixed with the driver string and device
 * name and sent through dev_printk_emit(); the numeric log level is read
 * from the second character of the @level prefix string.  Without a device,
 * the message goes to plain printk() with a "(NULL device *)" marker.
 */
static void __dev_printk(const char *level, const struct device *dev,
                        struct va_format *vaf)
{
        if (!dev) {
                printk("%s(NULL device *): %pV", level, vaf);
                return;
        }

        dev_printk_emit(level[1] - '0', dev, "%s %s: %pV",
                        dev_driver_string(dev), dev_name(dev), vaf);
}

void _dev_printk(const char *level, const struct device *dev,
                 const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);

        vaf.fmt = fmt;
        vaf.va = &args;

        __dev_printk(level, dev, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(_dev_printk);

#define define_dev_printk_level(func, kern_level)                \
void func(const struct device *dev, const char *fmt, ...)        \
{                                                                \
        struct va_format vaf;                                        \
        va_list args;                                                \
                                                                \
        va_start(args, fmt);                                        \
                                                                \
        vaf.fmt = fmt;                                                \
        vaf.va = &args;                                                \
                                                                \
        __dev_printk(kern_level, dev, &vaf);                        \
                                                                \
        va_end(args);                                                \
}                                                                \
EXPORT_SYMBOL(func);

define_dev_printk_level(_dev_emerg, KERN_EMERG);
define_dev_printk_level(_dev_alert, KERN_ALERT);
define_dev_printk_level(_dev_crit, KERN_CRIT);
define_dev_printk_level(_dev_err, KERN_ERR);
define_dev_printk_level(_dev_warn, KERN_WARNING);
define_dev_printk_level(_dev_notice, KERN_NOTICE);
define_dev_printk_level(_dev_info, KERN_INFO);

#endif

/*
 * Shared implementation of dev_err_probe() and dev_warn_probe().
 *
 * -EPROBE_DEFER: record the deferred probe reason and log at debug level.
 * -ENOMEM:       log nothing.
 * anything else: log via dev_err() if @fatal, via dev_warn() otherwise.
 */
static void __dev_probe_failed(const struct device *dev, int err, bool fatal,
                               const char *fmt, va_list vargsp)
{
        struct va_format vaf;
        va_list ap;

        /*
         * va_list may be an array type (it is a one-element array of a
         * structure on x86_64, and possibly elsewhere).  As a function
         * parameter such a type decays to a pointer, so &vargsp would have
         * type T** instead of the T(*)[1] that vaf.va expects.  Taking the
         * address of a local copy sidesteps the mismatch.
         */
        va_copy(ap, vargsp);

        vaf.fmt = fmt;
        vaf.va = &ap;

        if (err == -EPROBE_DEFER) {
                device_set_deferred_probe_reason(dev, &vaf);
                dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
        } else if (err != -ENOMEM) {
                /* Fatal final failures are errors, the rest are warnings */
                if (fatal)
                        dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
                else
                        dev_warn(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
        }
        /* -ENOMEM stays silent: there's already enough output for it */

        va_end(ap);
}

/**
 * dev_err_probe - probe error check and log helper
 * @dev: the pointer to the struct device
 * @err: error value to test
 * @fmt: printf-style format string
 * @...: arguments as specified in the format string
 *
 * Captures the error-reporting pattern that probe functions tend to
 * open-code: log a message whose severity depends on whether @err is
 * -EPROBE_DEFER (debug level) or not (error level), then pass the error
 * code back up.  For -EPROBE_DEFER the deferred probe reason is recorded as
 * well, so that it can be inspected later through the devices_deferred
 * debugfs attribute.
 *
 * The helper stands in for this sequence::
 *
 *         if (err != -EPROBE_DEFER)
 *                 dev_err(dev, ...);
 *         else
 *                 dev_dbg(dev, ...);
 *         return err;
 *
 * which becomes::
 *
 *         return dev_err_probe(dev, err, ...);
 *
 * It is perfectly fine to use this helper in a probe function even when
 * @err can never be -EPROBE_DEFER.  Compared to a plain dev_err() it gives a
 * uniform message format in which the error code is printed symbolically
 * (e.g. "EAGAIN" rather than a bare number), and because the error code is
 * returned, error paths can be written more compactly.
 *
 * Returns @err.
 */
int dev_err_probe(const struct device *dev, int err, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        /* fatal == true: anything but -EPROBE_DEFER is logged via dev_err() */
        __dev_probe_failed(dev, err, true, fmt, ap);
        va_end(ap);

        return err;
}
EXPORT_SYMBOL_GPL(dev_err_probe);

/**
 * dev_warn_probe - probe error check and log helper
 * @dev: the pointer to the struct device
 * @err: error value to test
 * @fmt: printf-style format string
 * @...: arguments as specified in the format string
 *
 * Captures the error-reporting pattern that probe functions tend to
 * open-code: log a message whose severity depends on whether @err is
 * -EPROBE_DEFER (debug level) or not (warning level), then pass the error
 * code back up.  For -EPROBE_DEFER the deferred probe reason is recorded as
 * well, so that it can be inspected later through the devices_deferred
 * debugfs attribute.
 *
 * The helper stands in for this sequence::
 *
 *         if (err != -EPROBE_DEFER)
 *                 dev_warn(dev, ...);
 *         else
 *                 dev_dbg(dev, ...);
 *         return err;
 *
 * which becomes::
 *
 *         return dev_warn_probe(dev, err, ...);
 *
 * It is perfectly fine to use this helper in a probe function even when
 * @err can never be -EPROBE_DEFER.  Compared to a plain dev_warn() it gives
 * a uniform message format in which the error code is printed symbolically
 * (e.g. "EAGAIN" rather than a bare number), and because the error code is
 * returned, error paths can be written more compactly.
 *
 * Returns @err.
 */
int dev_warn_probe(const struct device *dev, int err, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        /* fatal == false: anything but -EPROBE_DEFER is logged via dev_warn() */
        __dev_probe_failed(dev, err, false, fmt, ap);
        va_end(ap);

        return err;
}
EXPORT_SYMBOL_GPL(dev_warn_probe);

/*
 * A firmware node counts as primary when it exists and its ->secondary link
 * is not an error pointer.
 */
static inline bool fwnode_is_primary(struct fwnode_handle *fwnode)
{
        if (!fwnode)
                return false;

        return !IS_ERR(fwnode->secondary);
}

/**
 * set_primary_fwnode - Change the primary firmware node of a given device.
 * @dev: Device to handle.
 * @fwnode: New primary firmware node of the device.
 *
 * Make @fwnode the firmware node of @dev while keeping any secondary
 * firmware node the device already has.  Passing NULL removes the current
 * primary node and leaves only the secondary one (if any).
 *
 * Valid fwnode cases are:
 *  - primary --> secondary --> -ENODEV
 *  - primary --> NULL
 *  - secondary --> -ENODEV
 *  - NULL
 */
void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode)
{
        struct fwnode_handle *cur = dev->fwnode;
        struct device *parent = dev->parent;

        if (!fwnode) {
                /* Removal: fall back to the secondary node, if there is one */
                if (!fwnode_is_primary(cur)) {
                        dev->fwnode = NULL;
                        return;
                }

                dev->fwnode = cur->secondary;

                /* A primary shared with the parent keeps its secondary link */
                if (parent && cur == parent->fwnode)
                        return;

                /* Clear cur->secondary so cur remains a primary fwnode */
                cur->secondary = NULL;
                return;
        }

        /* Installation: carry the existing secondary node over to @fwnode */
        if (fwnode_is_primary(cur))
                cur = cur->secondary;

        if (cur) {
                WARN_ON(fwnode->secondary);
                fwnode->secondary = cur;
        }
        dev->fwnode = fwnode;
}
EXPORT_SYMBOL_GPL(set_primary_fwnode);

/**
 * set_secondary_fwnode - Change the secondary firmware node of a given device.
 * @dev: Device to handle.
 * @fwnode: New secondary firmware node of the device.
 *
 * When the device has a primary firmware node, @fwnode is hooked up as that
 * node's secondary.  Otherwise @fwnode becomes the device's firmware node
 * itself.  A non-NULL @fwnode has its own secondary pointer set to
 * ERR_PTR(-ENODEV) first.
 */
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode)
{
        struct fwnode_handle *primary = dev->fwnode;

        if (fwnode)
                fwnode->secondary = ERR_PTR(-ENODEV);

        if (fwnode_is_primary(primary))
                primary->secondary = fwnode;
        else
                dev->fwnode = fwnode;
}
EXPORT_SYMBOL_GPL(set_secondary_fwnode);

/**
 * device_remove_of_node - Remove an of_node from a device
 * @dev: device whose device tree node is being removed
 *
 * Drops the reference on @dev->of_node and clears the pointer.  If the
 * device's fwnode is the handle of that same node, it is cleared as well.
 * Does nothing when @dev is NULL or has no of_node.
 */
void device_remove_of_node(struct device *dev)
{
        dev = get_device(dev);
        if (!dev)
                return;

        if (dev->of_node) {
                /* Only drop the fwnode if it refers to the node going away */
                if (dev->fwnode == of_fwnode_handle(dev->of_node))
                        dev->fwnode = NULL;

                of_node_put(dev->of_node);
                dev->of_node = NULL;
        }

        put_device(dev);
}
EXPORT_SYMBOL_GPL(device_remove_of_node);

/**
 * device_add_of_node - Add an of_node to an existing device
 * @dev: device whose device tree node is being added
 * @of_node: of_node to add
 *
 * Takes a reference on @of_node and stores it in @dev.  The device's fwnode
 * is set to the node's handle only if the device has no fwnode yet.  An
 * of_node that is already present is never replaced.
 *
 * Return: 0 on success, -EINVAL if @of_node or @dev is NULL, or -EBUSY if
 * @dev already has an of_node.
 */
int device_add_of_node(struct device *dev, struct device_node *of_node)
{
        int ret = 0;

        if (!of_node)
                return -EINVAL;

        dev = get_device(dev);
        if (!dev)
                return -EINVAL;

        if (dev->of_node) {
                dev_err(dev, "Cannot replace node %pOF with %pOF\n",
                        dev->of_node, of_node);
                ret = -EBUSY;
        } else {
                dev->of_node = of_node_get(of_node);

                if (!dev->fwnode)
                        dev->fwnode = of_fwnode_handle(of_node);
        }

        put_device(dev);
        return ret;
}
EXPORT_SYMBOL_GPL(device_add_of_node);

/**
 * device_set_of_node_from_dev - reuse device-tree node of another device
 * @dev: device whose device-tree node is being set
 * @dev2: device whose device-tree node is being reused
 *
 * The reference held on the previous node of @dev is dropped first; then a
 * new reference is taken on the node of @dev2, which is installed in @dev
 * and flagged as reused.
 */
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2)
{
        struct device_node *old_node = dev->of_node;

        of_node_put(old_node);
        dev->of_node = of_node_get(dev2->of_node);
        dev->of_node_reused = true;
}
EXPORT_SYMBOL_GPL(device_set_of_node_from_dev);

/*
 * Point @dev at @fwnode, and set its of_node to whatever to_of_node() yields
 * for that firmware node.
 */
void device_set_node(struct device *dev, struct fwnode_handle *fwnode)
{
        dev->of_node = to_of_node(fwnode);
        dev->fwnode = fwnode;
}
EXPORT_SYMBOL_GPL(device_set_node);

/* Match helper: compares the name of @dev against @name with sysfs_streq(). */
int device_match_name(struct device *dev, const void *name)
{
        const char *wanted = name;

        return sysfs_streq(dev_name(dev), wanted);
}
EXPORT_SYMBOL_GPL(device_match_name);

/* Match helper: 1 when the type pointer of @dev is exactly @type, else 0. */
int device_match_type(struct device *dev, const void *type)
{
        if (dev->type == type)
                return 1;

        return 0;
}
EXPORT_SYMBOL_GPL(device_match_type);

/*
 * Match helper: 1 when @np is non-NULL and is the of_node of @dev, else 0.
 * A NULL @np never matches, even if the device has no of_node either.
 */
int device_match_of_node(struct device *dev, const void *np)
{
        if (!np)
                return 0;

        return dev->of_node == np;
}
EXPORT_SYMBOL_GPL(device_match_of_node);

/*
 * Match helper: 1 when @fwnode is non-NULL and equals dev_fwnode(@dev),
 * else 0.  A NULL @fwnode never matches.
 */
int device_match_fwnode(struct device *dev, const void *fwnode)
{
        if (!fwnode)
                return 0;

        return dev_fwnode(dev) == fwnode;
}
EXPORT_SYMBOL_GPL(device_match_fwnode);

int device_match_devt(struct device *dev, const void *pdevt)
{
        return dev->devt == *(dev_t *)pdevt;
}
EXPORT_SYMBOL_GPL(device_match_devt);

/*
 * Match helper: 1 when @adev is non-NULL and equals ACPI_COMPANION(@dev),
 * else 0.  A NULL @adev never matches.
 */
int device_match_acpi_dev(struct device *dev, const void *adev)
{
        if (!adev)
                return 0;

        return ACPI_COMPANION(dev) == adev;
}
EXPORT_SYMBOL(device_match_acpi_dev);

/*
 * Match helper: 1 when @handle is non-NULL and equals ACPI_HANDLE(@dev),
 * else 0.  A NULL @handle never matches.
 */
int device_match_acpi_handle(struct device *dev, const void *handle)
{
        if (!handle)
                return 0;

        return ACPI_HANDLE(dev) == handle;
}
EXPORT_SYMBOL(device_match_acpi_handle);

/* Match helper that accepts every device: both arguments are ignored. */
int device_match_any(struct device *dev, const void *unused)
{
        (void)dev;
        (void)unused;

        return 1;
}
EXPORT_SYMBOL_GPL(device_match_any);


























































































































































































   14 









   14 



















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/crypto/hooks.c
 *
 * Encryption hooks for higher-level filesystem operations.
 */

#include "fscrypt_private.h"

/**
 * fscrypt_file_open() - prepare to open a possibly-encrypted regular file
 * @inode: the inode being opened
 * @filp: the struct file being set up
 *
 * Raw access to the encrypted contents of a regular file is not supported,
 * so an encrypted regular file can currently be opened only when its
 * encryption key is available.  The key is therefore set up here (unless
 * that already happened), and an error is returned if it is unavailable.
 *
 * In addition, when the parent directory (as given by the path through which
 * the file is opened) is encrypted, the inode being opened must use the same
 * encryption policy.  This is part of enforcing that every file in an
 * encrypted directory tree shares one policy, which protects against certain
 * types of offline attacks.  The check applies to *unencrypted* files too,
 * because an unencrypted file is not allowed in an encrypted directory.
 *
 * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
 */
int fscrypt_file_open(struct inode *inode, struct file *filp)
{
        struct dentry *dentry, *parent;
        struct inode *parent_inode;
        bool parent_unencrypted;
        int err;

        err = fscrypt_require_key(inode);
        if (err)
                return err;

        dentry = file_dentry(filp);

        /*
         * The real policy comparison needs a reference to the parent dentry,
         * and taking one is expensive on multi-core systems.  This function
         * also runs for unencrypted files, so first do a cheap RCU-mode check
         * for an unencrypted parent directory: in that case the child may be
         * unencrypted or encrypted with any policy.  The full check below is
         * reached only when the parent is actually encrypted.
         */
        rcu_read_lock();
        parent = READ_ONCE(dentry->d_parent);
        parent_inode = d_inode_rcu(parent);
        parent_unencrypted = parent_inode != NULL &&
                             !IS_ENCRYPTED(parent_inode);
        rcu_read_unlock();

        if (parent_unencrypted)
                return 0;

        parent = dget_parent(dentry);
        if (!fscrypt_has_permitted_context(d_inode(parent), inode)) {
                fscrypt_warn(inode,
                             "Inconsistent encryption context (parent directory: %lu)",
                             d_inode(parent)->i_ino);
                err = -EPERM;
        }
        dput(parent);

        return err;
}
EXPORT_SYMBOL_GPL(fscrypt_file_open);

/*
 * Validate linking @inode into directory @dir under @dentry.
 *
 * Returns -ENOKEY if @dentry is a no-key name, -EXDEV if
 * fscrypt_has_permitted_context() rejects the @dir/@inode pair, 0 otherwise.
 */
int __fscrypt_prepare_link(struct inode *inode, struct inode *dir,
                           struct dentry *dentry)
{
        /*
         * A dentry that is not a no-key name implies that the directory
         * inode's key is available, so that needs no separate check.
         */
        if (fscrypt_is_nokey_name(dentry))
                return -ENOKEY;

        return fscrypt_has_permitted_context(dir, inode) ? 0 : -EXDEV;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_link);

/*
 * Validate a rename of @old_dentry in @old_dir to @new_dentry in @new_dir.
 *
 * Returns -ENOKEY if either dentry is a no-key name.  For a rename across
 * directories, returns -EXDEV if the moved inode is not permitted in an
 * encrypted destination directory; with RENAME_EXCHANGE the same check is
 * applied in the opposite direction.  Returns 0 otherwise.
 */
int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry,
                             unsigned int flags)
{
        /*
         * Dentries that are not no-key names imply that the directory inodes'
         * keys are available, so those need no separate check.
         */
        if (fscrypt_is_nokey_name(old_dentry) ||
            fscrypt_is_nokey_name(new_dentry))
                return -ENOKEY;

        /* Nothing changes directory, so there is no context to compare */
        if (old_dir == new_dir)
                return 0;

        if (IS_ENCRYPTED(new_dir) &&
            !fscrypt_has_permitted_context(new_dir, d_inode(old_dentry)))
                return -EXDEV;

        if (!(flags & RENAME_EXCHANGE))
                return 0;

        /* On exchange, the target inode moves into old_dir as well */
        if (IS_ENCRYPTED(old_dir) &&
            !fscrypt_has_permitted_context(old_dir, d_inode(new_dentry)))
                return -EXDEV;

        return 0;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename);

/*
 * Set up @fname for looking up @dentry in @dir, then prepare the dentry
 * according to whether the name is a no-key name.
 *
 * The dentry is prepared both on success and when the filename setup
 * reports -ENOENT; any other error is returned without touching the dentry.
 * The result of fscrypt_setup_filename() is returned in every case.
 */
int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
                             struct fscrypt_name *fname)
{
        int err = fscrypt_setup_filename(dir, &dentry->d_name, 1, fname);

        if (err == 0 || err == -ENOENT)
                fscrypt_prepare_dentry(dentry, fname->is_nokey_name);

        return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);

/**
 * fscrypt_prepare_lookup_partial() - prepare lookup without filename setup
 * @dir: the encrypted directory being searched
 * @dentry: the dentry being looked up in @dir
 *
 * Intended for the ->lookup and ->atomic_open methods of filesystems that do
 * their own filename encryption and no-key name encoding, and so cannot use
 * fscrypt_prepare_lookup().  As with fscrypt_prepare_lookup(), an attempt is
 * made to set up the directory's encryption key, and DCACHE_NOKEY_NAME is
 * set on the dentry when the key is unavailable.  Unlike it, no
 * struct fscrypt_name is set up for the filename.
 *
 * Return: 0 on success; -errno on error.  Note that the encryption key being
 *           unavailable is not considered an error.  It is also not an error if
 *           the encryption policy is unsupported by this kernel; that is treated
 *           like the key being unavailable, so that files can still be deleted.
 */
int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
{
        int err = fscrypt_get_encryption_info(dir, true);
        bool nokey = false;

        /* The key state is only consulted when the setup itself succeeded */
        if (!err)
                nokey = !fscrypt_has_encryption_key(dir);

        fscrypt_prepare_dentry(dentry, nokey);

        return err;
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);

/*
 * Prepare directory @dir for readdir by requesting its encryption info.
 * Returns the result of fscrypt_get_encryption_info().
 */
int __fscrypt_prepare_readdir(struct inode *dir)
{
        int err = fscrypt_get_encryption_info(dir, true);

        return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_readdir);

/*
 * Prepare for an attribute change on @dentry.  Only a size change
 * (ATTR_SIZE) requires the inode's key; every other change returns 0
 * without further checks.
 */
int __fscrypt_prepare_setattr(struct dentry *dentry, struct iattr *attr)
{
        if (!(attr->ia_valid & ATTR_SIZE))
                return 0;

        return fscrypt_require_key(d_inode(dentry));
}
EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr);

/**
 * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS
 * @inode: the inode on which flags are being changed
 * @oldflags: the old flags
 * @flags: the new flags
 *
 * Only the case of FS_CASEFOLD_FL being newly set on an encrypted inode
 * needs any work here; every other flags change returns 0 immediately.
 *
 * The caller should be holding i_rwsem for write.
 *
 * Return: 0 on success; -errno if the flags change isn't allowed or if
 *           another error occurs.
 */
int fscrypt_prepare_setflags(struct inode *inode,
                             unsigned int oldflags, unsigned int flags)
{
        struct fscrypt_inode_info *ci;
        struct fscrypt_master_key *mk;
        int err;

        if (!IS_ENCRYPTED(inode) || !(flags & ~oldflags & FS_CASEFOLD_FL))
                return 0;

        /*
         * Setting the CASEFOLD flag on an encrypted directory requires
         * deriving the secret key used for the dirhash, which is possible
         * only with a v2 encryption policy.
         */
        err = fscrypt_require_key(inode);
        if (err)
                return err;

        ci = inode->i_crypt_info;
        if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
                return -EINVAL;

        mk = ci->ci_master_key;
        down_read(&mk->mk_sem);
        err = mk->mk_present ? fscrypt_derive_dirhash_key(ci, mk) : -ENOKEY;
        up_read(&mk->mk_sem);

        return err;
}

/**
 * fscrypt_prepare_symlink() - prepare to create a possibly-encrypted symlink
 * @dir: directory in which the symlink is being created
 * @target: plaintext symlink target
 * @len: length of @target excluding null terminator
 * @max_len: space the filesystem has available to store the symlink target
 * @disk_link: (out) the on-disk symlink target being prepared
 *
 * This function computes the size the symlink target will require on-disk,
 * stores it in @disk_link->len, and validates it against @max_len.  An
 * encrypted symlink may be longer than the original.
 *
 * Additionally, @disk_link->name is set to @target if the symlink will be
 * unencrypted, but left NULL if the symlink will be encrypted.  For encrypted
 * symlinks, the filesystem must call fscrypt_encrypt_symlink() to create the
 * on-disk target later.  (The reason for the two-step process is that some
 * filesystems need to know the size of the symlink target before creating the
 * inode, e.g. to determine whether it will be a "fast" or "slow" symlink.)
 *
 * Return: 0 on success, -ENAMETOOLONG if the symlink target is too long,
 * -ENOKEY if the encryption key is missing, or another -errno code if a problem
 * occurred while setting up the encryption key.
 */
int fscrypt_prepare_symlink(struct inode *dir, const char *target,
                            unsigned int len, unsigned int max_len,
                            struct fscrypt_str *disk_link)
{
        /* Fixed on-disk overhead: length prefix plus the counted terminator */
        const unsigned int overhead = sizeof(struct fscrypt_symlink_data) + 1;
        const union fscrypt_policy *policy;

        /*
         * To calculate the size of the encrypted symlink target we need to know
         * the amount of NUL padding, which is determined by the flags set in
         * the encryption policy which will be inherited from the directory.
         */
        policy = fscrypt_policy_to_inherit(dir);
        if (policy == NULL) {
                /* Not encrypted */
                disk_link->name = (unsigned char *)target;
                disk_link->len = len + 1;
                if (disk_link->len > max_len)
                        return -ENAMETOOLONG;
                return 0;
        }
        if (IS_ERR(policy))
                return PTR_ERR(policy);

        /*
         * If the filesystem cannot even store the fixed overhead, no
         * encrypted target fits.  Reject this explicitly: the unsigned
         * subtraction below would otherwise wrap around and present a huge
         * limit to the size helper, so that an over-long result could be
         * accepted.
         */
        if (max_len < overhead)
                return -ENAMETOOLONG;

        /*
         * Calculate the size of the encrypted symlink and verify it won't
         * exceed max_len.  Note that for historical reasons, encrypted symlink
         * targets are prefixed with the ciphertext length, despite this
         * actually being redundant with i_size.  This decreases by 2 bytes the
         * longest symlink target we can accept.
         *
         * We could recover 1 byte by not counting a null terminator, but
         * counting it (even though it is meaningless for ciphertext) is simpler
         * for now since filesystems will assume it is there and subtract it.
         */
        if (!__fscrypt_fname_encrypted_size(policy, len, max_len - overhead,
                                            &disk_link->len))
                return -ENAMETOOLONG;
        disk_link->len += overhead;

        disk_link->name = NULL;
        return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_prepare_symlink);

/*
 * Encrypt the symlink target @target (of length @len) for the new symlink
 * @inode and build the on-disk form described by @disk_link.
 *
 * If @disk_link->name is already set, it is used as the output buffer;
 * otherwise a buffer of @disk_link->len bytes is allocated here and handed
 * to the caller through @disk_link->name on success.  On success the
 * plaintext target is also cached in @inode->i_link.
 *
 * Returns 0 on success, -ENOKEY if the inode has no encryption key,
 * -ENOMEM on allocation failure, or the error from fscrypt_fname_encrypt().
 */
int __fscrypt_encrypt_symlink(struct inode *inode, const char *target,
                              unsigned int len, struct fscrypt_str *disk_link)
{
        struct qstr iname = QSTR_INIT(target, len);
        /* True when the output buffer is allocated (and owned) here */
        const bool own_buf = !disk_link->name;
        struct fscrypt_symlink_data *sd;
        unsigned int ciphertext_len;
        int err;

        /*
         * The new symlink inode's encryption key should already have been
         * set up by fscrypt_prepare_new_inode().  It is not done here since
         * we may be inside a filesystem transaction at this point.
         */
        if (WARN_ON_ONCE(!fscrypt_has_encryption_key(inode)))
                return -ENOKEY;

        if (own_buf) {
                sd = kmalloc(disk_link->len, GFP_NOFS);
                if (!sd)
                        return -ENOMEM;
        } else {
                /* filesystem-provided buffer */
                sd = (struct fscrypt_symlink_data *)disk_link->name;
        }

        ciphertext_len = disk_link->len - sizeof(*sd) - 1;
        sd->len = cpu_to_le16(ciphertext_len);

        err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path,
                                    ciphertext_len);
        if (err)
                goto out_free;

        /*
         * A null terminator is meaningless for ciphertext, but it is counted
         * in the length, so initialize it in case the filesystem writes it
         * out.
         */
        sd->encrypted_path[ciphertext_len] = '\0';

        /* Cache the plaintext symlink target for later use by get_link() */
        inode->i_link = kmemdup(target, len + 1, GFP_NOFS);
        if (!inode->i_link) {
                err = -ENOMEM;
                goto out_free;
        }

        if (own_buf)
                disk_link->name = (unsigned char *)sd;
        return 0;

out_free:
        /* Never free a buffer that the filesystem provided */
        if (own_buf)
                kfree(sd);
        return err;
}
EXPORT_SYMBOL_GPL(__fscrypt_encrypt_symlink);

/**
 * fscrypt_get_symlink() - get the target of an encrypted symlink
 * @inode: the symlink inode
 * @caddr: the on-disk contents of the symlink
 * @max_size: size of @caddr buffer
 * @done: if successful, will be set up to free the returned target if needed
 *
 * If the symlink's encryption key is available, we decrypt its target.
 * Otherwise, we encode its target for presentation.
 *
 * On-disk contents that fail the length or content checks below are rejected
 * with -EUCLEAN.
 *
 * This may sleep, so the filesystem must have dropped out of RCU mode already.
 *
 * Return: the presentable symlink target or an ERR_PTR()
 */
const char *fscrypt_get_symlink(struct inode *inode, const void *caddr,
                                unsigned int max_size,
                                struct delayed_call *done)
{
        const struct fscrypt_symlink_data *sd;
        struct fscrypt_str cstr, pstr;
        bool has_key;
        int err;

        /* This is for encrypted symlinks only */
        if (WARN_ON_ONCE(!IS_ENCRYPTED(inode)))
                return ERR_PTR(-EINVAL);

        /* If the decrypted target is already cached, just return it. */
        pstr.name = READ_ONCE(inode->i_link);
        if (pstr.name)
                return pstr.name;

        /*
         * Try to set up the symlink's encryption key, but we can continue
         * regardless of whether the key is available or not.
         */
        err = fscrypt_get_encryption_info(inode, false);
        if (err)
                return ERR_PTR(err);
        has_key = fscrypt_has_encryption_key(inode);

        /*
         * For historical reasons, encrypted symlink targets are prefixed with
         * the ciphertext length, even though this is redundant with i_size.
         */

        /* The buffer must hold the length prefix plus at least one byte */
        if (max_size < sizeof(*sd) + 1)
                return ERR_PTR(-EUCLEAN);
        sd = caddr;
        cstr.name = (unsigned char *)sd->encrypted_path;
        cstr.len = le16_to_cpu(sd->len);

        /* An empty ciphertext is not accepted */
        if (cstr.len == 0)
                return ERR_PTR(-EUCLEAN);

        /* The stored length must not claim more than the buffer holds */
        if (cstr.len + sizeof(*sd) > max_size)
                return ERR_PTR(-EUCLEAN);

        err = fscrypt_fname_alloc_buffer(cstr.len, &pstr);
        if (err)
                return ERR_PTR(err);

        err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr);
        if (err)
                goto err_kfree;

        /* A target that begins with a NUL byte is rejected */
        err = -EUCLEAN;
        if (pstr.name[0] == '\0')
                goto err_kfree;

        pstr.name[pstr.len] = '\0';

        /*
         * Cache decrypted symlink targets in i_link for later use.  Don't cache
         * symlink targets encoded without the key, since those become outdated
         * once the key is added.  This pairs with the READ_ONCE() above and in
         * the VFS path lookup code.
         *
         * Whenever the buffer is not installed in i_link (no key, or i_link
         * turned out to be set already), it is handed to the caller's
         * delayed call, to be released with kfree_link.
         */
        if (!has_key ||
            cmpxchg_release(&inode->i_link, NULL, pstr.name) != NULL)
                set_delayed_call(done, kfree_link, pstr.name);

        return pstr.name;

err_kfree:
        kfree(pstr.name);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fscrypt_get_symlink);

/**
 * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks
 * @path: the path for the encrypted symlink being queried
 * @stat: the struct being filled with the symlink's attributes
 *
 * Override st_size of encrypted symlinks to be the length of the decrypted
 * symlink target (or the no-key encoded symlink target, if the key is
 * unavailable) rather than the length of the encrypted symlink target.  This is
 * necessary for st_size to match the symlink target that userspace actually
 * sees.  POSIX requires this, and some userspace programs depend on it.
 *
 * This requires reading the symlink target from disk if needed, setting up the
 * inode's encryption key if possible, and then decrypting or encoding the
 * symlink target.  This makes lstat() more heavyweight than is normally the
 * case.  However, decrypted symlink targets will be cached in ->i_link, so
 * usually the symlink won't have to be read and decrypted again later if/when
 * it is actually followed, readlink() is called, or lstat() is called again.
 *
 * Return: 0 on success, -errno on failure
 */
int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = d_inode(dentry);
        DEFINE_DELAYED_CALL(done);
        /*
         * Obtain the target exactly as the VFS would for path resolution and
         * readlink(): use the cached ->i_link when present, otherwise ask the
         * filesystem's ->get_link().  Either way the string is what userspace
         * sees (decrypted, or no-key encoded).
         */
        const char *target = READ_ONCE(inode->i_link);

        if (!target) {
                target = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(target))
                        return PTR_ERR(target);
        }

        /* Measure the target before running any cleanup queued in @done. */
        stat->size = strlen(target);
        do_delayed_call(&done);
        return 0;
}
EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr);



































































    4 
  470 




   13 






































































































































































































































































































































































   16 















































































































  506 

  504 





  382 



















































  169 





    7 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

#ifdef CONFIG_NUMA
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
#endif

/*
 * Set of unsigned counters describing a reclaim pass.  What each field counts
 * is only suggested by its name here -- TODO confirm against the reclaim code.
 * nr_activate holds one counter per ANON_AND_FILE slot.
 */
struct reclaim_stat {
        unsigned int nr_dirty;
        unsigned int nr_unqueued_dirty;
        unsigned int nr_congested;
        unsigned int nr_writeback;
        unsigned int nr_immediate;
        unsigned int nr_pageout;
        unsigned int nr_activate[ANON_AND_FILE];
        unsigned int nr_ref_keep;
        unsigned int nr_unmap_fail;
        unsigned int nr_lazyfree_fail;
        unsigned int nr_demoted;
};

/* Stat data for system wide items */
enum vm_stat_item {
        NR_DIRTY_THRESHOLD,
        NR_DIRTY_BG_THRESHOLD,
        NR_MEMMAP_PAGES,        /* page metadata allocated through buddy allocator */
        NR_MEMMAP_BOOT_PAGES,        /* page metadata allocated through boot allocator */
        NR_VM_STAT_ITEMS,        /* number of items above; not a statistic itself */
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

/* One unsigned long counter for each of the NR_VM_EVENT_ITEMS event items. */
struct vm_event_state {
        unsigned long event[NR_VM_EVENT_ITEMS];
};

DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 *
 * __count_vm_event() increments the counter for @item in the per-CPU
 * vm_event_states by one.
 */
static inline void __count_vm_event(enum vm_event_item item)
{
        raw_cpu_inc(vm_event_states.event[item]);
}

/*
 * Increment the per-CPU counter for @item by one, using the this_cpu_inc()
 * form rather than the raw_cpu_inc() form used by __count_vm_event().
 */
static inline void count_vm_event(enum vm_event_item item)
{
        this_cpu_inc(vm_event_states.event[item]);
}

/*
 * Add @delta to the per-CPU counter for @item using raw_cpu_add(); see the
 * note on __count_vm_event() about why the raw form is acceptable here.
 */
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
        raw_cpu_add(vm_event_states.event[item], delta);
}

/* Add @delta to the per-CPU counter for @item using this_cpu_add(). */
static inline void count_vm_events(enum vm_event_item item, long delta)
{
        this_cpu_add(vm_event_states.event[item], delta);
}

extern void all_vm_events(unsigned long *);

extern void vm_events_fold_cpu(int cpu);

#else

/*
 * CONFIG_VM_EVENT_COUNTERS is off: every counting entry point is an empty
 * inline so that callers compile unchanged and generate no code.
 */
static inline void count_vm_event(enum vm_event_item item) { }
static inline void count_vm_events(enum vm_event_item item, long delta) { }
static inline void __count_vm_event(enum vm_event_item item) { }
static inline void __count_vm_events(enum vm_event_item item, long delta) { }
static inline void all_vm_events(unsigned long *ret) { }
static inline void vm_events_fold_cpu(int cpu) { }

#endif /* CONFIG_VM_EVENT_COUNTERS */

/*
 * Config-gated event counters.  Each wrapper forwards to count_vm_event() or
 * count_vm_events() when its option is enabled and expands to an empty
 * statement otherwise.  The disabled two-argument forms still evaluate (y)
 * and discard the result.
 */
#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)     count_vm_event(x)
#define count_vm_numa_events(x, y) count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)           count_vm_event(x)
#define count_vm_tlb_events(x, y)  count_vm_events(x, y)
#else
#define count_vm_tlb_event(x)     do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

#ifdef CONFIG_PER_VMA_LOCK_STATS
#define count_vm_vma_lock_event(x) count_vm_event(x)
#else
#define count_vm_vma_lock_event(x) do {} while (0)
#endif

/*
 * Add @delta to the per-zone variant of event @item selected by zone index
 * @zid.  The event is found by offsetting from item##_NORMAL by
 * (zid - ZONE_NORMAL), so the item##_<ZONE> enumerators must be laid out in
 * zone order.  @zid is parenthesized so that an expression argument (for
 * example a conditional) keeps its intended precedence.
 */
#define __count_zid_vm_events(item, zid, delta) \
        __count_vm_events(item##_NORMAL - ZONE_NORMAL + (zid), delta)

/*
 * Zone and node-based page accounting with per cpu differentials.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
extern atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS];

#ifdef CONFIG_NUMA
/*
 * Add @x to NUMA event @item in two places: the counter kept in @zone and
 * the system-wide vm_numa_event[] array.
 */
static inline void zone_numa_event_add(long x, struct zone *zone,
                                enum numa_stat_item item)
{
        atomic_long_add(x, &zone->vm_numa_event[item]);
        atomic_long_add(x, &vm_numa_event[item]);
}

/* Return the current value of @zone's counter for NUMA event @item. */
static inline unsigned long zone_numa_event_state(struct zone *zone,
                                        enum numa_stat_item item)
{
        return atomic_long_read(&zone->vm_numa_event[item]);
}

/* Return the current system-wide value for NUMA event @item. */
static inline unsigned long
global_numa_event_state(enum numa_stat_item item)
{
        return atomic_long_read(&vm_numa_event[item]);
}
#endif /* CONFIG_NUMA */

/*
 * Add @x to zone statistic @item in both @zone's own counter and the global
 * vm_zone_stat[] array.
 */
static inline void zone_page_state_add(long x, struct zone *zone,
                                 enum zone_stat_item item)
{
        atomic_long_add(x, &zone->vm_stat[item]);
        atomic_long_add(x, &vm_zone_stat[item]);
}

/*
 * Add @x to node statistic @item in both @pgdat's own counter and the global
 * vm_node_stat[] array.
 */
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
                                 enum node_stat_item item)
{
        atomic_long_add(x, &pgdat->vm_stat[item]);
        atomic_long_add(x, &vm_node_stat[item]);
}

/*
 * Read the system-wide value of zone statistic @item.  On SMP builds a
 * negative reading is reported as zero.
 */
static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
        long total = atomic_long_read(&vm_zone_stat[item]);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Read the system-wide value of node statistic @item as stored in
 * vm_node_stat[].  On SMP builds a negative reading is reported as zero.
 */
static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
        long total = atomic_long_read(&vm_node_stat[item]);

#ifdef CONFIG_SMP
        if (total < 0)
                return 0;
#endif
        return total;
}

/*
 * Same as global_node_page_state_pages(), but warns once if @item is one
 * that vmstat_item_in_bytes() reports as byte-denominated.
 */
static inline unsigned long global_node_page_state(enum node_stat_item item)
{
        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

        return global_node_page_state_pages(item);
}

/*
 * Read @zone's counter for statistic @item.  On SMP builds a negative
 * reading is reported as zero.
 */
static inline unsigned long zone_page_state(struct zone *zone,
                                        enum zone_stat_item item)
{
        long count = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        if (count < 0)
                return 0;
#endif
        return count;
}

/*
 * More accurate version that also considers the currently pending
 * deltas. For that we need to loop over all cpus to find the current
 * deltas. There is no synchronization so the result cannot be
 * exactly accurate either.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,
                                        enum zone_stat_item item)
{
        long total = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
        int cpu;

        /* Fold in the vm_stat_diff entry of every online CPU for this item. */
        for_each_online_cpu(cpu) {
                struct per_cpu_zonestat *pzstats =
                        per_cpu_ptr(zone->per_cpu_zonestats, cpu);

                total += pzstats->vm_stat_diff[item];
        }

        /* The unsynchronized sum may come out negative; report zero then. */
        if (total < 0)
                return 0;
#endif
        return total;
}

#ifdef CONFIG_NUMA
/* See __count_vm_event comment on why raw_cpu_inc is used. */
/* Increment @zone's per-CPU counter for NUMA event @item by one. */
static inline void
__count_numa_event(struct zone *zone, enum numa_stat_item item)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        raw_cpu_inc(pzstats->vm_numa_event[item]);
}

/* Add @delta to @zone's per-CPU counter for NUMA event @item. */
static inline void
__count_numa_events(struct zone *zone, enum numa_stat_item item, long delta)
{
        struct per_cpu_zonestat __percpu *pzstats = zone->per_cpu_zonestats;

        raw_cpu_add(pzstats->vm_numa_event[item], delta);
}

extern unsigned long sum_zone_node_page_state(int node,
                                              enum zone_stat_item item);
extern unsigned long sum_zone_numa_event_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
                                           enum node_stat_item item);
extern void fold_vm_numa_events(void);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
static inline void fold_vm_numa_events(void)
{
}
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

void drain_zonestat(struct zone *zone, struct per_cpu_zonestat *);

int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
                                int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */
/*
 * !CONFIG_SMP version: apply @delta to zone statistic @item straight away
 * via zone_page_state_add().
 */
static inline void __mod_zone_page_state(struct zone *zone,
                        enum zone_stat_item item, long delta)
{
        zone_page_state_add(delta, zone, item);
}

/*
 * !CONFIG_SMP version: apply @delta to node statistic @item straight away.
 *
 * @delta is a long, matching both the CONFIG_SMP prototype and
 * node_page_state_add(), so that long-valued arguments from callers are not
 * truncated on the way in.
 */
static inline void __mod_node_page_state(struct pglist_data *pgdat,
                        enum node_stat_item item, long delta)
{
        if (vmstat_item_in_bytes(item)) {
                /*
                 * Only cgroups use subpage accounting right now; at
                 * the global level, these items still change in
                 * multiples of whole pages. Store them as pages
                 * internally to keep the per-cpu counters compact.
                 */
                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
                delta >>= PAGE_SHIFT;
        }

        node_page_state_add(delta, pgdat, item);
}

/*
 * !CONFIG_SMP increment/decrement helpers.  Each one adjusts the per-zone
 * (or per-node) counter for @item by one and then makes the same adjustment
 * to the matching global array.
 */
static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_inc(&zone->vm_stat[item]);
        atomic_long_inc(&vm_zone_stat[item]);
}

static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_inc(&pgdat->vm_stat[item]);
        atomic_long_inc(&vm_node_stat[item]);
}

static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
        atomic_long_dec(&zone->vm_stat[item]);
        atomic_long_dec(&vm_zone_stat[item]);
}

static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
        atomic_long_dec(&pgdat->vm_stat[item]);
        atomic_long_dec(&vm_node_stat[item]);
}

/*
 * !CONFIG_SMP page-based wrappers: look up the zone (page_zone()) or node
 * (page_pgdat()) that @page belongs to and forward to the matching
 * __inc/__dec state helper.
 */
static inline void __inc_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        __inc_zone_state(page_zone(page), item);
}

static inline void __inc_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        __inc_node_state(page_pgdat(page), item);
}


static inline void __dec_zone_page_state(struct page *page,
                        enum zone_stat_item item)
{
        __dec_zone_state(page_zone(page), item);
}

static inline void __dec_node_page_state(struct page *page,
                        enum node_stat_item item)
{
        __dec_node_state(page_pgdat(page), item);
}


/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

/*
 * No per-cpu thresholds exist without CONFIG_SMP.  Expand to a single empty
 * statement so the macro behaves like a function call in every context,
 * including an unbraced if/else.
 */
#define set_pgdat_percpu_threshold(pgdat, callback) do { } while (0)

/* !CONFIG_SMP: these entry points have nothing to do and are empty inlines. */
static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }

static inline void drain_zonestat(struct zone *zone,
                        struct per_cpu_zonestat *pzstats) { }
#endif                /* CONFIG_SMP */

/*
 * Folio front ends for the zone statistics.  The *_mod_* forms adjust @item
 * by @nr for the zone holding @folio; the *_add_* and *_sub_* forms adjust it
 * by plus or minus the folio's page count.  The double-underscore variants go
 * through __mod_zone_page_state(), the others through mod_zone_page_state().
 */
static inline void __zone_stat_mod_folio(struct folio *folio,
                enum zone_stat_item item, long nr)
{
        struct zone *zone = folio_zone(folio);

        __mod_zone_page_state(zone, item, nr);
}

static inline void __zone_stat_add_folio(struct folio *folio,
                enum zone_stat_item item)
{
        __zone_stat_mod_folio(folio, item, folio_nr_pages(folio));
}

static inline void __zone_stat_sub_folio(struct folio *folio,
                enum zone_stat_item item)
{
        __zone_stat_mod_folio(folio, item, -folio_nr_pages(folio));
}

static inline void zone_stat_mod_folio(struct folio *folio,
                enum zone_stat_item item, long nr)
{
        struct zone *zone = folio_zone(folio);

        mod_zone_page_state(zone, item, nr);
}

static inline void zone_stat_add_folio(struct folio *folio,
                enum zone_stat_item item)
{
        zone_stat_mod_folio(folio, item, folio_nr_pages(folio));
}

static inline void zone_stat_sub_folio(struct folio *folio,
                enum zone_stat_item item)
{
        zone_stat_mod_folio(folio, item, -folio_nr_pages(folio));
}

/*
 * Folio front ends for the node statistics.  The *_mod_* forms adjust @item
 * by @nr for the node holding @folio; the *_add_* and *_sub_* forms adjust it
 * by plus or minus the folio's page count.  The double-underscore variants go
 * through __mod_node_page_state(), the others through mod_node_page_state().
 */
static inline void __node_stat_mod_folio(struct folio *folio,
                enum node_stat_item item, long nr)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        __mod_node_page_state(pgdat, item, nr);
}

static inline void __node_stat_add_folio(struct folio *folio,
                enum node_stat_item item)
{
        __node_stat_mod_folio(folio, item, folio_nr_pages(folio));
}

static inline void __node_stat_sub_folio(struct folio *folio,
                enum node_stat_item item)
{
        __node_stat_mod_folio(folio, item, -folio_nr_pages(folio));
}

static inline void node_stat_mod_folio(struct folio *folio,
                enum node_stat_item item, long nr)
{
        struct pglist_data *pgdat = folio_pgdat(folio);

        mod_node_page_state(pgdat, item, nr);
}

static inline void node_stat_add_folio(struct folio *folio,
                enum node_stat_item item)
{
        node_stat_mod_folio(folio, item, folio_nr_pages(folio));
}

static inline void node_stat_sub_folio(struct folio *folio,
                enum node_stat_item item)
{
        node_stat_mod_folio(folio, item, -folio_nr_pages(folio));
}

extern const char * const vmstat_text[];

/*
 * Return the vmstat_text[] entry for zone statistic @item.  Zone items are
 * indexed from the start of the array.
 */
static inline const char *zone_stat_name(enum zone_stat_item item)
{
        return vmstat_text[item];
}

#ifdef CONFIG_NUMA
/*
 * Return the vmstat_text[] entry for NUMA event @item.  NUMA event names are
 * indexed after the NR_VM_ZONE_STAT_ITEMS zone entries.
 */
static inline const char *numa_stat_name(enum numa_stat_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           item];
}
#endif /* CONFIG_NUMA */

/*
 * Return the vmstat_text[] entry for node statistic @item.  Node names are
 * indexed after the zone entries and the NUMA event entries.
 */
static inline const char *node_stat_name(enum node_stat_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           NR_VM_NUMA_EVENT_ITEMS +
                           item];
}

/*
 * Return the name of LRU list @lru: the node statistic name for
 * NR_LRU_BASE + @lru with its first three characters dropped.
 */
static inline const char *lru_list_name(enum lru_list lru)
{
        const char *stat_name = node_stat_name(NR_LRU_BASE + lru);

        return stat_name + 3; // skip "nr_"
}

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
/*
 * Return the vmstat_text[] entry for VM event @item.  Event names are indexed
 * last: after the zone, NUMA event, node and vm_stat_item entries.
 */
static inline const char *vm_event_name(enum vm_event_item item)
{
        return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
                           NR_VM_NUMA_EVENT_ITEMS +
                           NR_VM_NODE_STAT_ITEMS +
                           NR_VM_STAT_ITEMS +
                           item];
}
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */

#ifdef CONFIG_MEMCG

void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                        int val);

/*
 * Wrapper around __mod_lruvec_state() that runs it with local interrupts
 * disabled, restoring the previous interrupt state afterwards.
 */
static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __mod_lruvec_state(lruvec, idx, val);
        local_irq_restore(flags);
}

void __lruvec_stat_mod_folio(struct folio *folio,
                             enum node_stat_item idx, int val);

/*
 * Wrapper around __lruvec_stat_mod_folio() that runs it with local
 * interrupts disabled, restoring the previous interrupt state afterwards.
 */
static inline void lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        unsigned long flags;

        local_irq_save(flags);
        __lruvec_stat_mod_folio(folio, idx, val);
        local_irq_restore(flags);
}

/* Page front end: convert @page to its folio and adjust @idx by @val. */
static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        struct folio *folio = page_folio(page);

        lruvec_stat_mod_folio(folio, idx, val);
}

#else

/*
 * !CONFIG_MEMCG versions: the lruvec, folio and page helpers all reduce to
 * node-level accounting on the node that owns the object.
 */
static inline void __mod_lruvec_state(struct lruvec *lruvec,
                                      enum node_stat_item idx, int val)
{
        __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

static inline void mod_lruvec_state(struct lruvec *lruvec,
                                    enum node_stat_item idx, int val)
{
        mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
}

static inline void __lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        __mod_node_page_state(folio_pgdat(folio), idx, val);
}

static inline void lruvec_stat_mod_folio(struct folio *folio,
                                         enum node_stat_item idx, int val)
{
        mod_node_page_state(folio_pgdat(folio), idx, val);
}

static inline void mod_lruvec_page_state(struct page *page,
                                         enum node_stat_item idx, int val)
{
        mod_node_page_state(page_pgdat(page), idx, val);
}

#endif /* CONFIG_MEMCG */

/*
 * Add or subtract a whole folio's page count to/from lruvec statistic @idx.
 * The double-underscore variants call __lruvec_stat_mod_folio(), the others
 * call lruvec_stat_mod_folio().
 */
static inline void __lruvec_stat_add_folio(struct folio *folio,
                                           enum node_stat_item idx)
{
        long nr_pages = folio_nr_pages(folio);

        __lruvec_stat_mod_folio(folio, idx, nr_pages);
}

static inline void __lruvec_stat_sub_folio(struct folio *folio,
                                           enum node_stat_item idx)
{
        long nr_pages = folio_nr_pages(folio);

        __lruvec_stat_mod_folio(folio, idx, -nr_pages);
}

static inline void lruvec_stat_add_folio(struct folio *folio,
                                         enum node_stat_item idx)
{
        long nr_pages = folio_nr_pages(folio);

        lruvec_stat_mod_folio(folio, idx, nr_pages);
}

static inline void lruvec_stat_sub_folio(struct folio *folio,
                                         enum node_stat_item idx)
{
        long nr_pages = folio_nr_pages(folio);

        lruvec_stat_mod_folio(folio, idx, -nr_pages);
}

void memmap_boot_pages_add(long delta);
void memmap_pages_add(long delta);
#endif /* _LINUX_VMSTAT_H */































































































































































































   35 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
/*
 * Linux Security Module interfaces
 *
 * Copyright (C) 2001 WireX Communications, Inc <chris@wirex.com>
 * Copyright (C) 2001 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (C) 2001 Networks Associates Technology, Inc <ssmalley@nai.com>
 * Copyright (C) 2001 James Morris <jmorris@intercode.com.au>
 * Copyright (C) 2001 Silicon Graphics, Inc. (Trust Technology Group)
 * Copyright (C) 2015 Intel Corporation.
 * Copyright (C) 2015 Casey Schaufler <casey@schaufler-ca.com>
 * Copyright (C) 2016 Mellanox Technologies
 *
 *        This program is free software; you can redistribute it and/or modify
 *        it under the terms of the GNU General Public License as published by
 *        the Free Software Foundation; either version 2 of the License, or
 *        (at your option) any later version.
 *
 *        Due to this file being licensed under the GPL there is controversy over
 *        whether this permits you to write a module that #includes this file
 *        without placing your module under the GPL.  Please consult a lawyer for
 *        advice before doing this.
 *
 */

#ifndef __LINUX_LSM_HOOKS_H
#define __LINUX_LSM_HOOKS_H

#include <uapi/linux/lsm.h>
#include <linux/security.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <linux/xattr.h>
#include <linux/static_call.h>
#include <linux/unroll.h>
#include <linux/jump_label.h>
#include <linux/lsm_count.h>

/*
 * Union holding one callback pointer.  LSM_HOOK is expanded over
 * lsm_hook_defs.h to give one correctly-typed function-pointer member per
 * hook; lsm_func_addr presumably gives type-agnostic access to whichever
 * pointer is stored -- TODO confirm against the users.
 */
union security_list_options {
        #define LSM_HOOK(RET, DEFAULT, NAME, ...) RET (*NAME)(__VA_ARGS__);
        #include "lsm_hook_defs.h"
        #undef LSM_HOOK
        void *lsm_func_addr;
};

/*
 * struct lsm_static_call - one static call slot of an LSM hook
 * @key: static call key as defined by STATIC_CALL_KEY
 * @trampoline: static call trampoline as defined by STATIC_CALL_TRAMP
 * @hl: The security_hook_list as initialized by the owning LSM.
 * @active: Enabled when the static call has an LSM hook associated.
 */
struct lsm_static_call {
        struct static_call_key *key;
        void *trampoline;
        struct security_hook_list *hl;
        /* this needs to be true or false based on what the key defaults to */
        struct static_key_false *active;
} __randomize_layout;

/*
 * Table of the static calls for each LSM hook.
 * Once the LSMs are initialized, their callbacks will be copied to these
 * tables such that the calls are filled backwards (from last to first).
 * This way, we can jump directly to the first used static call, and execute
 * all of them after. This essentially makes the entry point
 * dynamic to adapt the number of static calls to the number of callbacks.
 *
 * Expanding LSM_HOOK over lsm_hook_defs.h gives one member per hook, each an
 * array of MAX_LSM_COUNT struct lsm_static_call slots.
 */
struct lsm_static_calls_table {
        #define LSM_HOOK(RET, DEFAULT, NAME, ...) \
                struct lsm_static_call NAME[MAX_LSM_COUNT];
        #include <linux/lsm_hook_defs.h>
        #undef LSM_HOOK
} __packed __randomize_layout;

/**
 * struct lsm_id - Identify a Linux Security Module.
 * @name: name of the LSM, must be approved by the LSM maintainers
 * @id: LSM ID number from uapi/linux/lsm.h
 *
 * Contains the information that identifies the LSM.
 */
struct lsm_id {
        const char *name;
        u64 id;
};

/*
 * Security module hook list structure.
 * For use with generic list macros for common operations.
 *
 * struct security_hook_list - One hook callback supplied by an LSM.
 * @scalls: The beginning of the array of static calls assigned to this hook.
 * @hook: The callback for the hook.
 * @lsmid: The identification (name and ID) of the lsm that owns this hook.
 */
struct security_hook_list {
        struct lsm_static_call *scalls;
        union security_list_options hook;
        const struct lsm_id *lsmid;
} __randomize_layout;

/*
 * Security blob size or offset data.
 *
 * Except for lbs_xattr_count (see its comment), each member presumably
 * refers to the blob attached to the kind of object named by its suffix --
 * TODO confirm against the LSM framework.
 */
struct lsm_blob_sizes {
        int lbs_cred;
        int lbs_file;
        int lbs_ib;
        int lbs_inode;
        int lbs_sock;
        int lbs_superblock;
        int lbs_ipc;
        int lbs_key;
        int lbs_msg_msg;
        int lbs_perf_event;
        int lbs_task;
        int lbs_xattr_count; /* number of xattr slots in new_xattrs array */
        int lbs_tun_dev;
        int lbs_bdev;
};

/*
 * LSM_RET_VOID is used as the default value in LSM_HOOK definitions for void
 * LSM hooks (in include/linux/lsm_hook_defs.h).
 */
#define LSM_RET_VOID ((void) 0)

/*
 * Initializing a security_hook_list structure takes
 * up a lot of space in a source file. This macro takes
 * care of the common case and reduces the amount of
 * text involved.
 */
#define LSM_HOOK_INIT(NAME, HOOK)                        \
        {                                                \
                .scalls = static_calls_table.NAME,        \
                .hook = { .NAME = HOOK }                \
        }

extern void security_add_hooks(struct security_hook_list *hooks, int count,
                               const struct lsm_id *lsmid);

#define LSM_FLAG_LEGACY_MAJOR        BIT(0)
#define LSM_FLAG_EXCLUSIVE        BIT(1)

/* Ordering class of an LSM; stored in the order field of struct lsm_info. */
enum lsm_order {
        LSM_ORDER_FIRST = -1,        /* This is only for capabilities. */
        LSM_ORDER_MUTABLE = 0,
        LSM_ORDER_LAST = 1,        /* This is only for integrity. */
};

/*
 * Description of one LSM.  The DEFINE_LSM() and DEFINE_EARLY_LSM() macros
 * declare instances of this structure in dedicated linker sections.
 */
struct lsm_info {
        const char *name;        /* Required. */
        enum lsm_order order;        /* Optional: default is LSM_ORDER_MUTABLE */
        unsigned long flags;        /* Optional: flags describing LSM */
        int *enabled;                /* Optional: controlled by CONFIG_LSM */
        int (*init)(void);        /* Required. */
        struct lsm_blob_sizes *blobs; /* Optional: for blob sharing. */
};

/*
 * Declare a static struct lsm_info for @lsm, placed in the ".lsm_info.init"
 * section (DEFINE_LSM) or the ".early_lsm_info.init" section
 * (DEFINE_EARLY_LSM) and aligned to sizeof(unsigned long).  The macros only
 * emit the declarator, so the user is expected to follow them with an
 * initializer.
 */
#define DEFINE_LSM(lsm)                                                        \
        static struct lsm_info __lsm_##lsm                                \
                __used __section(".lsm_info.init")                        \
                __aligned(sizeof(unsigned long))

#define DEFINE_EARLY_LSM(lsm)                                                \
        static struct lsm_info __early_lsm_##lsm                        \
                __used __section(".early_lsm_info.init")                \
                __aligned(sizeof(unsigned long))

/* DO NOT tamper with these variables outside of the LSM framework */
extern char *lsm_names;
extern struct lsm_static_calls_table static_calls_table __ro_after_init;
extern struct lsm_info __start_lsm_info[], __end_lsm_info[];
extern struct lsm_info __start_early_lsm_info[], __end_early_lsm_info[];

/**
 * lsm_get_xattr_slot - Return the next available slot and increment the index
 * @xattrs: array storing LSM-provided xattrs
 * @xattr_count: number of already stored xattrs (updated)
 *
 * Retrieve the first available slot in the @xattrs array to fill with an xattr,
 * and increment @xattr_count.
 *
 * Return: The slot to fill in @xattrs if non-NULL, NULL otherwise.
 */
static inline struct xattr *lsm_get_xattr_slot(struct xattr *xattrs,
                                               int *xattr_count)
{
        struct xattr *slot;

        /* No array supplied: nothing to hand out and the count is untouched. */
        if (unlikely(!xattrs))
                return NULL;

        /* Claim the slot at the current count, then advance the count. */
        slot = xattrs + *xattr_count;
        ++*xattr_count;
        return slot;
}

#endif /* ! __LINUX_LSM_HOOKS_H */















































































































































































  194 



  189 





















































































































































































  194 

  194 


  153 

































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
/*
 * memfd_create system call and file sealing support
 *
 * Code was originally included in shmem.c, and broken out to facilitate
 * use by hugetlbfs as well as tmpfs.
 *
 * This file is released under the GPL.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/khugepaged.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
#include <uapi/linux/memfd.h>

/*
 * We need a tag: a new tag would expand every xa_node by 8 bytes,
 * so reuse a tag which we firmly believe is never set or cleared on tmpfs
 * or hugetlbfs because they are memory only filesystems.
 */
#define MEMFD_TAG_PINNED        PAGECACHE_TAG_TOWRITE
/*
 * Index of the final rescan in memfd_wait_for_pins(). Scans 1..LAST_SCAN
 * each sleep for (HZ << scan) / 200 jiffies first, i.e. 10 + 20 + 40 + 80 ms
 * in total when HZ divides evenly.
 */
#define LAST_SCAN               4       /* about 150ms max */

/*
 * Report whether @folio carries references beyond its baseline: true when
 * the reference count minus the map count differs from the number of pages
 * in the folio.
 */
static bool memfd_folio_has_extra_refs(struct folio *folio)
{
        int unmapped_refs = folio_ref_count(folio) - folio_mapcount(folio);

        return unmapped_refs != folio_nr_pages(folio);
}

/*
 * Walk every entry reachable through @xas and mark those folios that
 * currently hold extra references with MEMFD_TAG_PINNED. Value entries are
 * skipped. The xarray lock is dropped and the CPU yielded after every
 * XA_CHECK_SCHED entries so the walk does not hog the lock.
 */
static void memfd_tag_pins(struct xa_state *xas)
{
        struct folio *folio;
        int visited = 0;

        lru_add_drain();

        xas_lock_irq(xas);
        xas_for_each(xas, folio, ULONG_MAX) {
                if (!xa_is_value(folio) && memfd_folio_has_extra_refs(folio))
                        xas_set_mark(xas, MEMFD_TAG_PINNED);

                if (++visited >= XA_CHECK_SCHED) {
                        /* Batch done: pause the walk and let others run. */
                        visited = 0;

                        xas_pause(xas);
                        xas_unlock_irq(xas);
                        cond_resched();
                        xas_lock_irq(xas);
                }
        }
        xas_unlock_irq(xas);
}

/*
 * Helper for the memfd pinning code in GUP (gup.c): allocate a folio for
 * @memfd at page index @idx when the caller (memfd_pin_folios()) could not
 * find one in the page cache of the mapping.
 *
 * For hugetlb-backed files a folio is allocated and inserted into the page
 * cache; for everything else the request is handed to shmem_read_folio().
 * Returns the folio or an ERR_PTR().
 */
struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
{
#ifdef CONFIG_HUGETLB_PAGE
        if (is_file_hugepages(memfd)) {
                struct hstate *h = hstate_file(memfd);
                struct folio *folio;
                gfp_t gfp_mask;
                int err;

                /*
                 * A DMA driver is the most likely consumer of this folio, so
                 * zone constraints apply to the allocation. The folio will
                 * also stay pinned indefinitely, hence it is not expected to
                 * be migrated away: drop HIGHMEM and MOVABLE.
                 */
                gfp_mask = htlb_alloc_mask(h) & ~(__GFP_HIGHMEM | __GFP_MOVABLE);

                /* Convert the base-page index into a huge-page index. */
                idx >>= huge_page_order(h);

                folio = alloc_hugetlb_folio_reserve(h, numa_node_id(), NULL,
                                                    gfp_mask);
                if (!folio)
                        return ERR_PTR(-ENOMEM);

                err = hugetlb_add_to_page_cache(folio, memfd->f_mapping, idx);
                if (err) {
                        folio_put(folio);
                        return ERR_PTR(err);
                }

                folio_unlock(folio);
                return folio;
        }
#endif
        return shmem_read_folio(memfd->f_mapping, idx);
}

/*
 * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
 * via get_user_pages(), drivers might have some pending I/O without any active
 * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all folios
 * and see whether it has an elevated ref-count. If so, we tag them and wait for
 * them to be dropped.
 * The caller must guarantee that no new user will acquire writable references
 * to those folios to avoid races.
 *
 * Returns 0 when no folio is found with extra references on the final scan
 * (or nothing is tagged any more), and -EBUSY when a tagged folio still has
 * extra references on the final scan.
 */
static int memfd_wait_for_pins(struct address_space *mapping)
{
        XA_STATE(xas, &mapping->i_pages, 0);
        struct folio *folio;
        int error, scan;

        /* First pass: tag every folio that currently has extra references. */
        memfd_tag_pins(&xas);

        error = 0;
        for (scan = 0; scan <= LAST_SCAN; scan++) {
                int latency = 0;

                /* Presumably false once no entry carries the tag any more. */
                if (!xas_marked(&xas, MEMFD_TAG_PINNED))
                        break;

                /*
                 * Scan 0 only drains the LRU caches; later scans sleep with
                 * an exponentially growing timeout first. A non-zero return
                 * from the killable sleep skips straight to the final scan.
                 */
                if (!scan)
                        lru_add_drain_all();
                else if (schedule_timeout_killable((HZ << scan) / 200))
                        scan = LAST_SCAN;

                /* Restart the walk from index 0 for this scan. */
                xas_set(&xas, 0);
                xas_lock_irq(&xas);
                xas_for_each_marked(&xas, folio, ULONG_MAX, MEMFD_TAG_PINNED) {
                        bool clear = true;

                        if (!xa_is_value(folio) &&
                            memfd_folio_has_extra_refs(folio)) {
                                /*
                                 * On the last scan, we clean up all those tags
                                 * we inserted; but make a note that we still
                                 * found folios pinned.
                                 */
                                if (scan == LAST_SCAN)
                                        error = -EBUSY;
                                else
                                        clear = false;
                        }
                        if (clear)
                                xas_clear_mark(&xas, MEMFD_TAG_PINNED);

                        /* Keep going until a batch of XA_CHECK_SCHED is done. */
                        if (++latency < XA_CHECK_SCHED)
                                continue;
                        latency = 0;

                        /* Drop the lock and yield between batches. */
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
                xas_unlock_irq(&xas);
        }

        return error;
}

/*
 * Locate the seal word for @file: the shmem inode's seals for shmem files,
 * the hugetlbfs inode's seals for hugetlb files (when CONFIG_HUGETLBFS is
 * set), and NULL for any other kind of file.
 */
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
        unsigned int *seals = NULL;

        if (shmem_file(file))
                seals = &SHMEM_I(file_inode(file))->seals;
#ifdef CONFIG_HUGETLBFS
        else if (is_file_hugepages(file))
                seals = &HUGETLBFS_I(file_inode(file))->seals;
#endif

        return seals;
}

/*
 * Every seal bit accepted by memfd_add_seals(); a request containing any
 * other bit is rejected with -EINVAL.
 */
#define F_ALL_SEALS (F_SEAL_SEAL | \
                     F_SEAL_EXEC | \
                     F_SEAL_SHRINK | \
                     F_SEAL_GROW | \
                     F_SEAL_WRITE | \
                     F_SEAL_FUTURE_WRITE)

/*
 * Add @seals to the seal set of @file.
 *
 * Returns 0 on success, -EPERM if @file was not opened for writing or if
 * F_SEAL_SEAL is already set, -EINVAL if @seals contains bits outside
 * F_ALL_SEALS or @file has no seal word, or the error reported by
 * mapping_deny_writable() / memfd_wait_for_pins() when F_SEAL_WRITE is
 * newly requested. The seal word is updated under the inode lock.
 */
static int memfd_add_seals(struct file *file, unsigned int seals)
{
        struct inode *inode = file_inode(file);
        unsigned int *file_seals;
        int error;

        /*
         * SEALING
         * Sealing allows multiple parties to share a tmpfs or hugetlbfs file
         * but restrict access to a specific subset of file operations. Seals
         * can only be added, but never removed. This way, mutually untrusted
         * parties can share common memory regions with a well-defined policy.
         * A malicious peer can thus never perform unwanted operations on a
         * shared object.
         *
         * Seals are only supported on special tmpfs or hugetlbfs files and
         * always affect the whole underlying inode. Once a seal is set, it
         * may prevent some kinds of access to the file. Currently, the
         * following seals are defined:
         *   SEAL_SEAL: Prevent further seals from being set on this file
         *   SEAL_SHRINK: Prevent the file from shrinking
         *   SEAL_GROW: Prevent the file from growing
         *   SEAL_WRITE: Prevent write access to the file
         *   SEAL_EXEC: Prevent modification of the exec bits in the file mode
         *   SEAL_FUTURE_WRITE: Treated like SEAL_WRITE when checking new
         *                      mappings; setting it here does not run the
         *                      pending-writer checks done for SEAL_WRITE
         *
         * As we don't require any trust relationship between two parties, we
         * must prevent seals from being removed. Therefore, sealing a file
         * only adds a given set of seals to the file, it never touches
         * existing seals. Furthermore, the "setting seals"-operation can be
         * sealed itself, which basically prevents any further seal from being
         * added.
         *
         * Semantics of sealing are only defined on volatile files. Only
         * anonymous tmpfs and hugetlbfs files support sealing. More
         * importantly, seals are never written to disk. Therefore, there's
         * no plan to support it on other file types.
         */

        if (!(file->f_mode & FMODE_WRITE))
                return -EPERM;
        if (seals & ~(unsigned int)F_ALL_SEALS)
                return -EINVAL;

        inode_lock(inode);

        file_seals = memfd_file_seals_ptr(file);
        if (!file_seals) {
                error = -EINVAL;
                goto unlock;
        }

        if (*file_seals & F_SEAL_SEAL) {
                error = -EPERM;
                goto unlock;
        }

        /*
         * Only when SEAL_WRITE is newly added: deny writable mappings, then
         * wait for pinned folios. If the wait fails, writable mappings are
         * allowed again so the file is left as it was.
         */
        if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) {
                error = mapping_deny_writable(file->f_mapping);
                if (error)
                        goto unlock;

                error = memfd_wait_for_pins(file->f_mapping);
                if (error) {
                        mapping_allow_writable(file->f_mapping);
                        goto unlock;
                }
        }

        /*
         * SEAL_EXEC implies SEAL_WRITE, making W^X from the start.
         * This only applies when the inode currently has an exec bit set.
         * NOTE(review): the write seals implied here are added after the
         * SEAL_WRITE checks above, so those checks do not run for them —
         * confirm this is intended.
         */
        if (seals & F_SEAL_EXEC && inode->i_mode & 0111)
                seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE;

        *file_seals |= seals;
        error = 0;

unlock:
        inode_unlock(inode);
        return error;
}

/*
 * Return the current seal set of @file, or -EINVAL when the file has no
 * seal word.
 */
static int memfd_get_seals(struct file *file)
{
        unsigned int *seals = memfd_file_seals_ptr(file);

        if (!seals)
                return -EINVAL;

        return *seals;
}

/*
 * Dispatch the sealing fcntl commands: F_ADD_SEALS adds @arg to the seals
 * of @file, F_GET_SEALS returns the current seals. Any other @cmd yields
 * -EINVAL.
 */
long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
        switch (cmd) {
        case F_ADD_SEALS:
                return memfd_add_seals(file, arg);
        case F_GET_SEALS:
                return memfd_get_seals(file);
        default:
                return -EINVAL;
        }
}

/*
 * Names built by alloc_name() are the fixed prefix followed by the user
 * supplied string; the user part may be at most MFD_NAME_MAX_LEN bytes so
 * the result (plus terminator) fits in NAME_MAX + 1 bytes.
 */
#define MFD_NAME_PREFIX "memfd:"
#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)

/* Flags accepted by memfd_create(), not counting the huge page size encoding. */
#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC)

/*
 * Apply the pid namespace's memfd_noexec scope to *@flags.
 *
 * When the caller chose neither MFD_EXEC nor MFD_NOEXEC_SEAL, one of them is
 * filled in according to the scope. When the scope is at least
 * MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED and MFD_NOEXEC_SEAL is not set, the
 * request is logged (rate limited) and refused with -EACCES.
 *
 * Returns 0 otherwise; without CONFIG_SYSCTL this is a no-op returning 0.
 */
static int check_sysctl_memfd_noexec(unsigned int *flags)
{
#ifdef CONFIG_SYSCTL
        int scope = pidns_memfd_noexec_scope(task_active_pid_ns(current));

        /* No explicit exec policy requested: pick the default for this scope. */
        if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL)))
                *flags |= (scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) ?
                          MFD_NOEXEC_SEAL : MFD_EXEC;

        if (scope >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED &&
            !(*flags & MFD_NOEXEC_SEAL)) {
                pr_err_ratelimited(
                        "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n",
                        current->comm, task_pid_nr(current), scope);
                return -EACCES;
        }
#endif
        return 0;
}

/* True when @seals contains F_SEAL_WRITE or F_SEAL_FUTURE_WRITE. */
static inline bool is_write_sealed(unsigned int seals)
{
        const unsigned int write_seals = F_SEAL_WRITE | F_SEAL_FUTURE_WRITE;

        return (seals & write_seals) != 0;
}

/*
 * Validate the VM flags of a new mapping of a write-sealed file.
 *
 * Private mappings pass unchanged. A shared mapping that is writable is
 * refused with -EPERM; a shared read-only mapping is accepted but loses
 * VM_MAYWRITE in *@vm_flags_ptr.
 */
static int check_write_seal(unsigned long *vm_flags_ptr)
{
        unsigned long requested = *vm_flags_ptr;

        /* Writability does not matter for a private mapping. */
        if (!(requested & VM_SHARED))
                return 0;

        /* No new PROT_WRITE + MAP_SHARED mappings while write seals are set. */
        if (requested & VM_WRITE)
                return -EPERM;

        /*
         * Shared and read-only: make sure mprotect() cannot turn it into a
         * writable mapping later on.
         */
        *vm_flags_ptr &= ~VM_MAYWRITE;

        return 0;
}

/*
 * Check a new mapping of @file against its seals. Files without a seal word
 * or without a write seal always pass (0); otherwise the result of
 * check_write_seal() is returned, which may also adjust *@vm_flags_ptr.
 */
int memfd_check_seals_mmap(struct file *file, unsigned long *vm_flags_ptr)
{
        unsigned int *seals_ptr = memfd_file_seals_ptr(file);

        if (!seals_ptr || !is_write_sealed(*seals_ptr))
                return 0;

        return check_write_seal(vm_flags_ptr);
}

/*
 * Validate the memfd_create() flags in *@flags_ptr.
 *
 * Unknown bits are rejected with -EINVAL; the huge page size encoding bits
 * are only tolerated together with MFD_HUGETLB. MFD_EXEC and
 * MFD_NOEXEC_SEAL are mutually exclusive. Valid flags are then passed to
 * check_sysctl_memfd_noexec(), whose result is returned.
 */
static int sanitize_flags(unsigned int *flags_ptr)
{
        const unsigned int exec_conflict = MFD_EXEC | MFD_NOEXEC_SEAL;
        unsigned int flags = *flags_ptr;
        unsigned int allowed = MFD_ALL_FLAGS;

        /* Allow huge page size encoding in flags. */
        if (flags & MFD_HUGETLB)
                allowed |= MFD_HUGE_MASK << MFD_HUGE_SHIFT;

        if (flags & ~allowed)
                return -EINVAL;

        /* Invalid if both EXEC and NOEXEC_SEAL are set. */
        if ((flags & exec_conflict) == exec_conflict)
                return -EINVAL;

        return check_sysctl_memfd_noexec(flags_ptr);
}

/*
 * Build the kernel-side name for a new memfd: MFD_NAME_PREFIX followed by
 * the string copied from user space at @uname.
 *
 * Returns the kmalloc()ed name, or ERR_PTR(-ENOMEM) when allocation fails,
 * ERR_PTR(-EFAULT) when the user copy fails, or ERR_PTR(-EINVAL) when the
 * user string is longer than MFD_NAME_MAX_LEN.
 */
static char *alloc_name(const char __user *uname)
{
        char *name;
        long copied;
        int error;

        name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
        if (!name)
                return ERR_PTR(-ENOMEM);

        strcpy(name, MFD_NAME_PREFIX);

        /* returned length does not include terminating zero */
        copied = strncpy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, MFD_NAME_MAX_LEN + 1);
        if (copied < 0)
                error = -EFAULT;
        else if (copied > MFD_NAME_MAX_LEN)
                error = -EINVAL;
        else
                return name;

        kfree(name);
        return ERR_PTR(error);
}

static struct file *alloc_file(const char *name, unsigned int flags)
{
        unsigned int *file_seals;
        struct file *file;

        if (flags & MFD_HUGETLB) {
                file = hugetlb_file_setup(name, 0, VM_NORESERVE,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MFD_HUGE_SHIFT) &
                                        MFD_HUGE_MASK);
        } else {
                file = shmem_file_setup(name, 0, VM_NORESERVE);
        }
        if (IS_ERR(file))
                return file;
        file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
        file->f_flags |= O_LARGEFILE;

        if (flags & MFD_NOEXEC_SEAL) {
                struct inode *inode = file_inode(file);

                inode->i_mode &= ~0111;
                file_seals = memfd_file_seals_ptr(file);
                if (file_seals) {
                        *file_seals &= ~F_SEAL_SEAL;
                        *file_seals |= F_SEAL_EXEC;
                }
        } else if (flags & MFD_ALLOW_SEALING) {
                /* MFD_EXEC and MFD_ALLOW_SEALING are set */
                file_seals = memfd_file_seals_ptr(file);
                if (file_seals)
                        *file_seals &= ~F_SEAL_SEAL;
        }

        return file;
}

/*
 * memfd_create() system call: validate @flags, build the file name from
 * @uname, reserve a file descriptor (O_CLOEXEC when MFD_CLOEXEC is set),
 * create the backing file and install it.
 *
 * Returns the new descriptor, or the negative error from whichever step
 * failed. The temporary name buffer is freed on every path.
 */
SYSCALL_DEFINE2(memfd_create,
                const char __user *, uname,
                unsigned int, flags)
{
        struct file *file;
        char *name;
        int fd, ret;

        ret = sanitize_flags(&flags);
        if (ret < 0)
                return ret;

        name = alloc_name(uname);
        if (IS_ERR(name))
                return PTR_ERR(name);

        fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
        ret = fd;
        if (fd < 0)
                goto out_free_name;

        file = alloc_file(name, flags);
        if (IS_ERR(file)) {
                ret = PTR_ERR(file);
                put_unused_fd(fd);
                goto out_free_name;
        }

        /* On success ret still holds the descriptor number. */
        fd_install(fd, file);

out_free_name:
        kfree(name);
        return ret;
}
























   34 






   39 










    8 





   10 



  212 

  212 











    7 





    5 








    1 

    1 





    1 

















    6 


    1 











    5 



    5 






    5 


    5 


    5 
    4 















    3 





    1 


















    1 







    1 





    2 








    1 
    1 








    1 













    1 


















   40 













   37 



    3 







    1 


    1 






   34 

    7 


   34 












    1 

    1 

















    1 
    1 






    1 


    1 







    1 






























   38 










   38 











   38 











   38 
























































   36 




    7 







   36 

















   36 




    7 







   36 























































    1 

    1 







































































































































































   58 


















   35 








   35 












    6 



























   31 








    7 











   33 

   33 













































    1 



    1 






    2 





    1 








   34 




    1 

















   33 


    1 




    4 



    4 


    4 

    1 


    4 




   37 



   38 








   33 














    9 








    4 





    3 









    2 


    1 





    2 


    3 
















    1 












    1 


































    2 











    1 

    1 


    1 











    1 








   14 










    5 










    3 
    1 



    1 

    2 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VGICv3 MMIO handling functions
 */

#include <linux/bitfield.h>
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/interrupt.h>
#include <kvm/iodev.h>
#include <kvm/arm_vgic.h>

#include <asm/kvm_emulate.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>

#include "vgic.h"
#include "vgic-mmio.h"

/*
 * Return @num bytes of @data, starting @offset bytes from the least
 * significant end.
 */
unsigned long extract_bytes(u64 data, unsigned int offset,
                            unsigned int num)
{
        u64 shifted = data >> (offset * 8);
        u64 mask = GENMASK_ULL(num * 8 - 1, 0);

        return shifted & mask;
}

/*
 * Replace part of the 64-bit value @reg with @val and return the result.
 * The write starts at bit 0 or bit 32, chosen by bit 2 of @offset, and
 * covers @len bytes; this allows updating either half or the whole thing.
 */
u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
                     unsigned long val)
{
        int lo = (offset & 4) * 8;
        int hi = lo + 8 * len - 1;
        u64 field = val & GENMASK_ULL(len * 8 - 1, 0);

        return (reg & ~GENMASK_ULL(hi, lo)) | (field << lo);
}

/* True when @kvm uses the VGICv3 model and its distributor has an ITS. */
bool vgic_has_its(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;

        return dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 && dist->has_its;
}

/*
 * True when the host has GICv4.1, or has GICv4 and @kvm has an ITS.
 */
bool vgic_supports_direct_msis(struct kvm *kvm)
{
        if (kvm_vgic_global_state.has_gicv4_1)
                return true;

        return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm);
}

/*
 * The Revision field in the IIDR has the following meaning:
 *
 * Revision 2: Interrupt groups are guest-configurable and signaled using
 *             their configured groups.
 */

/*
 * Read handler for the miscellaneous distributor registers; the register is
 * selected by (addr & 0x0c): GICD_CTLR, GICD_TYPER, GICD_TYPER2 or
 * GICD_IIDR. The value is assembled from the distributor state of the
 * vcpu's VM and the global vgic state.
 */
static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
                                            gpa_t addr, unsigned int len)
{
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        u32 reg = 0;

        switch (addr & 0x0c) {
        case GICD_CTLR:
                /* ARE_NS and DS are always reported as set. */
                reg = GICD_CTLR_ARE_NS | GICD_CTLR_DS;
                if (dist->enabled)
                        reg |= GICD_CTLR_ENABLE_SS_G1;
                if (dist->nassgireq)
                        reg |= GICD_CTLR_nASSGIreq;
                break;
        case GICD_TYPER:
                /* Low field: number of 32-interrupt blocks, minus one. */
                reg = dist->nr_spis + VGIC_NR_PRIVATE_IRQS;
                reg = (reg >> 5) - 1;
                if (vgic_has_its(vcpu->kvm))
                        reg |= ((INTERRUPT_ID_BITS_ITS - 1) << 19) |
                               GICD_TYPER_LPIS;
                else
                        reg |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
                break;
        case GICD_TYPER2:
                if (kvm_vgic_global_state.has_gicv4_1 && gic_cpuif_has_vsgi())
                        reg = GICD_TYPER2_nASSGIcap;
                break;
        case GICD_IIDR:
                reg = (IMPLEMENTER_ARM << GICD_IIDR_IMPLEMENTER_SHIFT) |
                      (dist->implementation_rev << GICD_IIDR_REVISION_SHIFT) |
                      (PRODUCT_ID_KVM << GICD_IIDR_PRODUCT_ID_SHIFT);
                break;
        default:
                /* reg is still 0 here. */
                break;
        }

        return reg;
}

/*
 * Write handler for the miscellaneous distributor registers, selected by
 * (addr & 0x0c). Only GICD_CTLR has any effect; writes to GICD_TYPER,
 * GICD_TYPER2 and GICD_IIDR are ignored.
 *
 * A GICD_CTLR write updates the distributor enable bit and, subject to the
 * checks below, nASSGIreq, all under the VM's config_lock.
 */
static void vgic_mmio_write_v3_misc(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len,
                                    unsigned long val)
{
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;

        switch (addr & 0x0c) {
        case GICD_CTLR: {
                bool was_enabled, is_hwsgi;

                mutex_lock(&vcpu->kvm->arch.config_lock);

                /* Snapshot the old state so transitions can be detected. */
                was_enabled = dist->enabled;
                is_hwsgi = dist->nassgireq;

                dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;

                /* Not a GICv4.1? No HW SGIs */
                if (!kvm_vgic_global_state.has_gicv4_1 || !gic_cpuif_has_vsgi())
                        val &= ~GICD_CTLR_nASSGIreq;

                /* Dist stays enabled? nASSGIreq is RO */
                if (was_enabled && dist->enabled) {
                        val &= ~GICD_CTLR_nASSGIreq;
                        val |= FIELD_PREP(GICD_CTLR_nASSGIreq, is_hwsgi);
                }

                /* Switching HW SGIs? */
                dist->nassgireq = val & GICD_CTLR_nASSGIreq;
                if (is_hwsgi != dist->nassgireq)
                        vgic_v4_configure_vsgis(vcpu->kvm);

                /*
                 * On GICv4.1 any change of the enable bit makes all vcpus
                 * process KVM_REQ_RELOAD_GICv4; otherwise vcpus are only
                 * kicked when the distributor goes from disabled to enabled.
                 */
                if (kvm_vgic_global_state.has_gicv4_1 &&
                    was_enabled != dist->enabled)
                        kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_RELOAD_GICv4);
                else if (!was_enabled && dist->enabled)
                        vgic_kick_vcpus(vcpu->kvm);

                mutex_unlock(&vcpu->kvm->arch.config_lock);
                break;
        }
        case GICD_TYPER:
        case GICD_TYPER2:
        case GICD_IIDR:
                /* This is at best for documentation purposes... */
                return;
        }
}

/*
 * Userspace write handler for the miscellaneous distributor registers,
 * selected by (addr & 0x0c).
 *
 * GICD_TYPER2: the written value must equal what KVM reports.
 * GICD_IIDR:   every field except Revision must equal what KVM reports.
 *              The Revision written by userspace becomes the emulated
 *              implementation revision, provided KVM supports it.
 * GICD_CTLR:   the enable and nASSGIreq state is stored directly, without
 *              the side effects of the guest write path.
 * Any other register is passed to vgic_mmio_write_v3_misc().
 *
 * Returns 0 on success, -EINVAL when the written value is not acceptable.
 */
static int vgic_mmio_uaccess_write_v3_misc(struct kvm_vcpu *vcpu,
                                           gpa_t addr, unsigned int len,
                                           unsigned long val)
{
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        u32 reg;

        switch (addr & 0x0c) {
        case GICD_TYPER2:
                if (val != vgic_mmio_read_v3_misc(vcpu, addr, len))
                        return -EINVAL;
                return 0;
        case GICD_IIDR:
                reg = vgic_mmio_read_v3_misc(vcpu, addr, len);
                if ((reg ^ val) & ~GICD_IIDR_REVISION_MASK)
                        return -EINVAL;

                /*
                 * Take the revision from the value userspace wrote, not from
                 * the value we currently report: otherwise the write could
                 * never change implementation_rev and an unsupported
                 * revision would be silently accepted.
                 */
                reg = FIELD_GET(GICD_IIDR_REVISION_MASK, val);
                switch (reg) {
                case KVM_VGIC_IMP_REV_2:
                case KVM_VGIC_IMP_REV_3:
                        dist->implementation_rev = reg;
                        return 0;
                default:
                        return -EINVAL;
                }
        case GICD_CTLR:
                /* Not a GICv4.1? No HW SGIs */
                if (!kvm_vgic_global_state.has_gicv4_1)
                        val &= ~GICD_CTLR_nASSGIreq;

                dist->enabled = val & GICD_CTLR_ENABLE_SS_G1;
                dist->nassgireq = val & GICD_CTLR_nASSGIreq;
                return 0;
        }

        vgic_mmio_write_v3_misc(vcpu, addr, len, val);
        return 0;
}

/*
 * Read handler for GICD_IROUTER<n>. Returns the requested bytes of the
 * interrupt's stored mpidr for an access to the lower word, and 0 for the
 * upper word or when the interrupt cannot be looked up.
 */
static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
                                            gpa_t addr, unsigned int len)
{
        int intid = VGIC_ADDR_TO_INTID(addr, 64);
        struct kvm *kvm = vcpu->kvm;
        struct vgic_irq *irq;
        unsigned long ret;

        irq = vgic_get_irq(kvm, intid);
        if (!irq)
                return 0;

        /* The upper word is RAZ for us. */
        if (addr & 4)
                ret = 0;
        else
                ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);

        vgic_put_irq(kvm, irq);
        return ret;
}

/*
 * Write handler for GICD_IROUTER<n>. A write to the lower word stores the
 * low 24 bits of @val as the interrupt's mpidr and re-resolves its target
 * vcpu, both under the interrupt's irq_lock. Writes to the upper word, or
 * for an interrupt that cannot be looked up, are ignored.
 */
static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
                                    gpa_t addr, unsigned int len,
                                    unsigned long val)
{
        int intid = VGIC_ADDR_TO_INTID(addr, 64);
        struct kvm *kvm = vcpu->kvm;
        struct vgic_irq *irq;
        unsigned long flags;

        /* The upper word is WI for us since we don't implement Aff3. */
        if (addr & 4)
                return;

        irq = vgic_get_irq(kvm, intid);
        if (!irq)
                return;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        /* We only care about and preserve Aff0, Aff1 and Aff2. */
        irq->mpidr = val & GENMASK(23, 0);
        irq->target_vcpu = kvm_mpidr_to_vcpu(kvm, irq->mpidr);
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        vgic_put_irq(kvm, irq);
}

/*
 * True when the vcpu's redistributor control word is exactly
 * GICR_CTLR_ENABLE_LPIS.
 */
bool vgic_lpis_enabled(struct kvm_vcpu *vcpu)
{
        return atomic_read(&vcpu->arch.vgic_cpu.ctlr) == GICR_CTLR_ENABLE_LPIS;
}

/*
 * Read handler for GICR_CTLR: the vcpu's stored control word, with the IR
 * and CES bits added when the implementation revision is at least
 * KVM_VGIC_IMP_REV_3.
 */
static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
                                             gpa_t addr, unsigned int len)
{
        unsigned long ctlr = atomic_read(&vcpu->arch.vgic_cpu.ctlr);

        if (vgic_get_implementation_rev(vcpu) < KVM_VGIC_IMP_REV_3)
                return ctlr;

        return ctlr | GICR_CTLR_IR | GICR_CTLR_CES;
}

/*
 * Write handler for GICR_CTLR. Ignored when the VM has no ITS. Otherwise
 * only the EnableLPIs bit of @val is looked at, and the stored control word
 * moves between 0, GICR_CTLR_ENABLE_LPIS and GICR_CTLR_RWP using atomic
 * compare-and-exchange, so a write that does not find the expected current
 * value does nothing.
 */
static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
                                     gpa_t addr, unsigned int len,
                                     unsigned long val)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        u32 ctlr;

        if (!vgic_has_its(vcpu->kvm))
                return;

        if (!(val & GICR_CTLR_ENABLE_LPIS)) {
                /*
                 * Don't disable if RWP is set, as there is already an
                 * ongoing disable. Funky guest...
                 */
                ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr,
                                              GICR_CTLR_ENABLE_LPIS,
                                              GICR_CTLR_RWP);
                if (ctlr != GICR_CTLR_ENABLE_LPIS)
                        return;

                /* RWP stays set while pending LPIs and caches are dropped. */
                vgic_flush_pending_lpis(vcpu);
                vgic_its_invalidate_all_caches(vcpu->kvm);
                atomic_set_release(&vgic_cpu->ctlr, 0);
        } else {
                /* Enable only from the fully disabled state (0). */
                ctlr = atomic_cmpxchg_acquire(&vgic_cpu->ctlr, 0,
                                              GICR_CTLR_ENABLE_LPIS);
                if (ctlr != 0)
                        return;

                vgic_enable_lpis(vcpu);
        }
}

/*
 * Decide whether this vcpu's redistributor should advertise itself as the
 * last one (GICR_TYPER.Last) of a contiguous run of redistributors.
 *
 * Returns false when the vcpu has no redistributor region attached.
 */
static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_redist_region *rdreg = vgic_cpu->rdreg;
        struct vgic_redist_region *other;
        gpa_t end;

        if (!rdreg)
                return false;

        /* Another redistributor has been registered after us in this region. */
        if (vgic_cpu->rdreg_index < rdreg->free_index - 1)
                return false;

        /*
         * Unless we occupy the final slot of a region with a known size,
         * there is nothing further to look at.
         */
        if (!rdreg->count || vgic_cpu->rdreg_index != (rdreg->count - 1))
                return true;

        /*
         * The rdist is the last one of its region: it is only "last" if no
         * populated region starts right where this one ends.
         */
        end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
        list_for_each_entry(other, &vcpu->kvm->arch.vgic.rd_regions, list) {
                if (other->base == end && other->free_index > 0)
                        return false;
        }

        return true;
}

/*
 * MMIO read handler for GICR_TYPER.
 *
 * Builds the 64-bit register from the vcpu's MPIDR affinity (bits [63:32]),
 * its vcpu_id (16 bits starting at bit 8), the PLPIS bit when an ITS is
 * present and the Last bit, then returns the bytes selected by @addr/@len.
 */
static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
                                              gpa_t addr, unsigned int len)
{
        unsigned long aff = kvm_vcpu_get_mpidr_aff(vcpu);
        int vcpu_id = vcpu->vcpu_id;
        u64 typer = (u64)(aff & GENMASK(23, 0)) << 32;

        typer |= ((vcpu_id & 0xffff) << 8);

        if (vgic_has_its(vcpu->kvm))
                typer |= GICR_TYPER_PLPIS;

        if (vgic_mmio_vcpu_rdist_is_last(vcpu))
                typer |= GICR_TYPER_LAST;

        return extract_bytes(typer, addr & 7, len);
}

/*
 * MMIO read handler for GICR_IIDR: a constant made of the KVM product ID
 * (bits [31:24]) and the ARM implementer code (low bits).
 */
static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
                                             gpa_t addr, unsigned int len)
{
        unsigned long iidr = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);

        return iidr;
}

/*
 * MMIO read handler for the ID register block. Only the PIDR2 offset
 * returns a value; every other offset in the block reads as zero.
 */
static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
                                              gpa_t addr, unsigned int len)
{
        /* report a GICv3 compliant implementation */
        if ((addr & 0xffff) == GICD_PIDR2)
                return 0x3b;

        return 0;
}

/*
 * Userspace write to a pending register: bits set in @val are written through
 * the set-pending path, then the complement of @val is written through the
 * clear-pending path. Stops at, and returns, the first error.
 */
static int vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu,
                                         gpa_t addr, unsigned int len,
                                         unsigned long val)
{
        int err = vgic_uaccess_write_spending(vcpu, addr, len, val);

        if (!err)
                err = vgic_uaccess_write_cpending(vcpu, addr, len, ~val);

        return err;
}

/*
 * Outer shareable is not wanted: rewrite it to inner shareable and pass
 * every other shareability value through unchanged.
 */
u64 vgic_sanitise_shareability(u64 field)
{
        if (field == GIC_BASER_OuterShareable)
                return GIC_BASER_InnerShareable;

        return field;
}

/*
 * Inner non-cacheable encodings (nCnB, nC) are replaced by RaWb; any other
 * inner cacheability value is returned as is.
 */
u64 vgic_sanitise_inner_cacheability(u64 field)
{
        if (field == GIC_BASER_CACHE_nCnB || field == GIC_BASER_CACHE_nC)
                return GIC_BASER_CACHE_RaWb;

        return field;
}

/*
 * Only "same as inner" and non-cacheable are accepted for the outer
 * cacheability; everything else is forced to "same as inner".
 */
u64 vgic_sanitise_outer_cacheability(u64 field)
{
        if (field == GIC_BASER_CACHE_SameAsInner ||
            field == GIC_BASER_CACHE_nC)
                return field;

        return GIC_BASER_CACHE_SameAsInner;
}

/*
 * Run one bit-field of @reg through @sanitise_fn.
 *
 * The field selected by @field_mask is shifted down by @field_shift, handed
 * to @sanitise_fn, shifted back up and merged with the untouched bits of
 * @reg. Returns the resulting register value.
 */
u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
                        u64 (*sanitise_fn)(u64))
{
        u64 raw = (reg & field_mask) >> field_shift;
        u64 clean = sanitise_fn(raw) << field_shift;

        return (reg & ~field_mask) | clean;
}

/*
 * Bits that the sanitising helpers below always clear from a value written
 * to GICR_PROPBASER / GICR_PENDBASER (named RES0 here).
 */
#define PROPBASER_RES0_MASK                                                \
        (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
#define PENDBASER_RES0_MASK                                                \
        (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) |        \
         GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))

/*
 * Sanitise a GICR_PENDBASER value: normalise the shareability and the
 * inner/outer cacheability fields, then clear the PENDBASER_RES0_MASK bits.
 */
static u64 vgic_sanitise_pendbaser(u64 reg)
{
        u64 out;

        out = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
                                  GICR_PENDBASER_SHAREABILITY_SHIFT,
                                  vgic_sanitise_shareability);
        out = vgic_sanitise_field(out, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
                                  GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_inner_cacheability);
        out = vgic_sanitise_field(out, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
                                  GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_outer_cacheability);

        return out & ~PENDBASER_RES0_MASK;
}

/*
 * Sanitise a GICR_PROPBASER value: normalise the shareability and the
 * inner/outer cacheability fields, then clear the PROPBASER_RES0_MASK bits.
 */
static u64 vgic_sanitise_propbaser(u64 reg)
{
        u64 out;

        out = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
                                  GICR_PROPBASER_SHAREABILITY_SHIFT,
                                  vgic_sanitise_shareability);
        out = vgic_sanitise_field(out, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
                                  GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_inner_cacheability);
        out = vgic_sanitise_field(out, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
                                  GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
                                  vgic_sanitise_outer_cacheability);

        return out & ~PROPBASER_RES0_MASK;
}

/*
 * MMIO read handler for GICR_PROPBASER. The value is stored once per VM in
 * the distributor state; return the bytes selected by @addr/@len.
 */
static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
                                             gpa_t addr, unsigned int len)
{
        u64 propbaser = vcpu->kvm->arch.vgic.propbaser;

        return extract_bytes(propbaser, addr & 7, len);
}

/*
 * MMIO write handler for GICR_PROPBASER.
 *
 * The write is ignored while LPIs are enabled. Otherwise the (possibly
 * partial) write is merged into the VM-wide value, sanitised, and published
 * with a cmpxchg retry loop so concurrent writers cannot lose an update.
 */
static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
                                     gpa_t addr, unsigned int len,
                                     unsigned long val)
{
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        u64 seen, next;

        /* Storing a value with LPIs already enabled is undefined */
        if (vgic_lpis_enabled(vcpu))
                return;

        for (;;) {
                seen = READ_ONCE(dist->propbaser);
                next = update_64bit_reg(seen, addr & 4, len, val);
                next = vgic_sanitise_propbaser(next);

                /* Done once nobody changed the register under our feet. */
                if (cmpxchg64(&dist->propbaser, seen, next) == seen)
                        break;
        }
}

/*
 * MMIO read handler for GICR_PENDBASER: the per-vcpu value with the PTZ bit
 * masked out, reduced to the bytes selected by @addr/@len.
 */
static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
                                             gpa_t addr, unsigned int len)
{
        u64 pendbaser = vcpu->arch.vgic_cpu.pendbaser;

        return extract_bytes(pendbaser & ~GICR_PENDBASER_PTZ, addr & 7, len);
}

/*
 * MMIO write handler for GICR_PENDBASER.
 *
 * The write is ignored while LPIs are enabled. Otherwise the (possibly
 * partial) write is merged into the per-vcpu value, sanitised, and published
 * with a cmpxchg retry loop so concurrent writers cannot lose an update.
 */
static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
                                     gpa_t addr, unsigned int len,
                                     unsigned long val)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        u64 seen, next;

        /* Storing a value with LPIs already enabled is undefined */
        if (vgic_lpis_enabled(vcpu))
                return;

        for (;;) {
                seen = READ_ONCE(vgic_cpu->pendbaser);
                next = update_64bit_reg(seen, addr & 4, len, val);
                next = vgic_sanitise_pendbaser(next);

                /* Done once nobody changed the register under our feet. */
                if (cmpxchg64(&vgic_cpu->pendbaser, seen, next) == seen)
                        break;
        }
}

/*
 * MMIO read handler for GICR_SYNCR: reads as 1 while the busy counter of
 * this vcpu's redistributor is non-zero, 0 otherwise.
 */
static unsigned long vgic_mmio_read_sync(struct kvm_vcpu *vcpu,
                                         gpa_t addr, unsigned int len)
{
        int busy = atomic_read(&vcpu->arch.vgic_cpu.syncr_busy);

        return busy != 0;
}

/*
 * Adjust the redistributor busy counter reported through GICR_SYNCR.
 *
 * @busy == true increments the counter and then issues a barrier;
 * @busy == false issues a barrier and then decrements it, so the work done
 * between the two calls is bracketed by the counter updates.
 */
static void vgic_set_rdist_busy(struct kvm_vcpu *vcpu, bool busy)
{
        if (!busy) {
                smp_mb__before_atomic();
                atomic_dec(&vcpu->arch.vgic_cpu.syncr_busy);
                return;
        }

        atomic_inc(&vcpu->arch.vgic_cpu.syncr_busy);
        smp_mb__after_atomic();
}

/*
 * MMIO write handler for GICR_INVLPIR: invalidate the cached configuration
 * of the single LPI whose INTID is in the low 32 bits of @val. The
 * redistributor is flagged busy for the duration of the invalidation.
 */
static void vgic_mmio_write_invlpi(struct kvm_vcpu *vcpu,
                                   gpa_t addr, unsigned int len,
                                   unsigned long val)
{
        struct vgic_irq *lpi;
        u32 intid;

        /*
         * A write that only touches the upper 32bit half of the register
         * is dropped on the floor, as that half is only for vPEs (which
         * we don't support for obvious reasons).
         */
        if (addr & 4)
                return;

        /* Likewise discard the access if LPIs are not enabled. */
        if (!vgic_lpis_enabled(vcpu))
                return;

        intid = lower_32_bits(val);
        if (intid < VGIC_MIN_LPI)
                return;

        vgic_set_rdist_busy(vcpu, true);

        lpi = vgic_get_irq(vcpu->kvm, intid);
        if (lpi) {
                vgic_its_inv_lpi(vcpu->kvm, lpi);
                vgic_put_irq(vcpu->kvm, lpi);
        }

        vgic_set_rdist_busy(vcpu, false);
}

/*
 * MMIO write handler for GICR_INVALLR: invalidate all LPI configuration
 * cached for this vcpu. The redistributor is flagged busy meanwhile.
 */
static void vgic_mmio_write_invall(struct kvm_vcpu *vcpu,
                                   gpa_t addr, unsigned int len,
                                   unsigned long val)
{
        /* Writes to the upper 32bit half of the register are ignored. */
        if (addr & 4)
                return;

        /* So is any access made while LPIs are not enabled. */
        if (!vgic_lpis_enabled(vcpu))
                return;

        vgic_set_rdist_busy(vcpu, true);
        vgic_its_invall(vcpu);
        vgic_set_rdist_busy(vcpu, false);
}

/*
 * The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
 * redistributors, while SPIs are covered by registers in the distributor
 * block. Trying to set private IRQs in this block gets ignored.
 * We take some special care here to fix the calculation of the register
 * offset.
 *
 * The macro expands to TWO consecutive table entries:
 *  - the first covers the VGIC_NR_PRIVATE_IRQS private interrupts at @off
 *    and is wired to the read-as-zero / write-ignore handlers;
 *  - the second starts right after it, covers the remaining interrupts up
 *    to 1024 and uses the supplied handlers.
 *
 * @off: register offset of the first (private) interrupt
 * @rd:  guest read handler for the second entry
 * @wr:  guest write handler for the second entry
 * @ur:  userspace read handler for the second entry (may be NULL)
 * @uw:  userspace write handler for the second entry (may be NULL)
 * @bpi: number of register bits used per interrupt
 * @acc: allowed access widths (VGIC_ACCESS_* flags)
 */
#define REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(off, rd, wr, ur, uw, bpi, acc) \
        {                                                                \
                .reg_offset = off,                                        \
                .bits_per_irq = bpi,                                        \
                .len = (bpi * VGIC_NR_PRIVATE_IRQS) / 8,                \
                .access_flags = acc,                                        \
                .read = vgic_mmio_read_raz,                                \
                .write = vgic_mmio_write_wi,                                \
        }, {                                                                \
                .reg_offset = off + (bpi * VGIC_NR_PRIVATE_IRQS) / 8,        \
                .bits_per_irq = bpi,                                        \
                .len = (bpi * (1024 - VGIC_NR_PRIVATE_IRQS)) / 8,        \
                .access_flags = acc,                                        \
                .read = rd,                                                \
                .write = wr,                                                \
                .uaccess_read = ur,                                        \
                .uaccess_write = uw,                                        \
        }

/*
 * Register map of the emulated GICv3 distributor frame.
 *
 * Each entry binds a register offset (and length) to its guest MMIO
 * read/write handlers, optional userspace-access handlers, and the access
 * widths that are accepted. Per-IRQ registers use the _SHARED descriptor,
 * which makes the part covering private interrupts RAZ/WI.
 */
static const struct vgic_register_region vgic_v3_dist_registers[] = {
        REGISTER_DESC_WITH_LENGTH_UACCESS(GICD_CTLR,
                vgic_mmio_read_v3_misc, vgic_mmio_write_v3_misc,
                NULL, vgic_mmio_uaccess_write_v3_misc,
                16, VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICD_STATUSR,
                vgic_mmio_read_rao, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGROUPR,
                vgic_mmio_read_group, vgic_mmio_write_group, NULL, NULL, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISENABLER,
                vgic_mmio_read_enable, vgic_mmio_write_senable,
                NULL, vgic_uaccess_write_senable, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICENABLER,
                vgic_mmio_read_enable, vgic_mmio_write_cenable,
               NULL, vgic_uaccess_write_cenable, 1,
                VGIC_ACCESS_32bit),
        /* Userspace writes to ISPENDR set and clear bits in one go ... */
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISPENDR,
                vgic_mmio_read_pending, vgic_mmio_write_spending,
                vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 1,
                VGIC_ACCESS_32bit),
        /* ... so ICPENDR is RAZ/WI for userspace accesses. */
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICPENDR,
                vgic_mmio_read_pending, vgic_mmio_write_cpending,
                vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
                vgic_mmio_read_active, vgic_mmio_write_sactive,
                vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
                vgic_mmio_read_active, vgic_mmio_write_cactive,
                vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
                1, VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
                vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
                8, VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
        /* ITARGETSR is emulated as RAZ/WI. */
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ITARGETSR,
                vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 8,
                VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICFGR,
                vgic_mmio_read_config, vgic_mmio_write_config, NULL, NULL, 2,
                VGIC_ACCESS_32bit),
        /* IGRPMODR is emulated as RAZ/WI. */
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IGRPMODR,
                vgic_mmio_read_raz, vgic_mmio_write_wi, NULL, NULL, 1,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IROUTER,
                vgic_mmio_read_irouter, vgic_mmio_write_irouter, NULL, NULL, 64,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICD_IDREGS,
                vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
                VGIC_ACCESS_32bit),
};

/*
 * Register map of one emulated GICv3 redistributor.
 *
 * The table spans two 64K frames: the RD_base registers come first, the
 * SGI_base registers (private interrupt state) are placed at an offset of
 * SZ_64K. Each entry binds an offset and length to its guest MMIO handlers,
 * optional userspace-access handlers, and the accepted access widths.
 */
static const struct vgic_register_region vgic_v3_rd_registers[] = {
        /* RD_base registers */
        REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
                vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_STATUSR,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
                vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
                vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
                NULL, vgic_mmio_uaccess_write_wi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
                vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
                vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        /* The invalidation registers are write-only: reads return zero. */
        REGISTER_DESC_WITH_LENGTH(GICR_INVLPIR,
                vgic_mmio_read_raz, vgic_mmio_write_invlpi, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_INVALLR,
                vgic_mmio_read_raz, vgic_mmio_write_invall, 8,
                VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_SYNCR,
                vgic_mmio_read_sync, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
                vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
                VGIC_ACCESS_32bit),
        /* SGI_base registers */
        REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGROUPR0,
                vgic_mmio_read_group, vgic_mmio_write_group, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISENABLER0,
                vgic_mmio_read_enable, vgic_mmio_write_senable,
                NULL, vgic_uaccess_write_senable, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICENABLER0,
                vgic_mmio_read_enable, vgic_mmio_write_cenable,
                NULL, vgic_uaccess_write_cenable, 4,
                VGIC_ACCESS_32bit),
        /* Userspace writes to ISPENDR0 set and clear bits in one go ... */
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISPENDR0,
                vgic_mmio_read_pending, vgic_mmio_write_spending,
                vgic_uaccess_read_pending, vgic_v3_uaccess_write_pending, 4,
                VGIC_ACCESS_32bit),
        /* ... so ICPENDR0 is RAZ/WI for userspace accesses. */
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICPENDR0,
                vgic_mmio_read_pending, vgic_mmio_write_cpending,
                vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
                vgic_mmio_read_active, vgic_mmio_write_sactive,
                vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
                vgic_mmio_read_active, vgic_mmio_write_cactive,
                vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
                vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
                VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
        REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_ICFGR0,
                vgic_mmio_read_config, vgic_mmio_write_config, 8,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IGRPMODR0,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
        REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_NSACR,
                vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
                VGIC_ACCESS_32bit),
};

/*
 * Prepare @dev as the GICv3 distributor I/O device: attach the distributor
 * register table and the vgic I/O ops.
 *
 * Returns the size of the distributor frame (SZ_64K).
 */
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
{
        dev->nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
        dev->regions = vgic_v3_dist_registers;

        kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);

        return SZ_64K;
}

/**
 * vgic_register_redist_iodev - register a single redist iodev
 * @vcpu:    The VCPU to which the redistributor belongs
 *
 * Register a KVM iodev for this VCPU's redistributor using the address
 * provided.
 *
 * The caller must hold kvm->slots_lock. kvm->arch.config_lock is taken
 * internally and dropped again before the device is added to the MMIO bus.
 *
 * The call succeeds without doing anything when the redistributor already
 * has a base address, or when vgic_v3_rdist_free_slot() finds no region to
 * place it in.
 *
 * Return 0 on success, -ERRNO otherwise.
 */
int vgic_register_redist_iodev(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        struct vgic_dist *vgic = &kvm->arch.vgic;
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
        struct vgic_redist_region *rdreg;
        gpa_t rd_base;
        int ret = 0;

        lockdep_assert_held(&kvm->slots_lock);
        mutex_lock(&kvm->arch.config_lock);

        /* A base address is already set for this redistributor: nothing to do. */
        if (!IS_VGIC_ADDR_UNDEF(vgic_cpu->rd_iodev.base_addr))
                goto out_unlock;

        /*
         * We may be creating VCPUs before having set the base address for the
         * redistributor region, in which case we will come back to this
         * function for all VCPUs when the base address is set.  Just return
         * without doing any work for now.
         */
        rdreg = vgic_v3_rdist_free_slot(&vgic->rd_regions);
        if (!rdreg)
                goto out_unlock;

        if (!vgic_v3_check_base(kvm)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        /* Bind the vcpu to the next free slot of the region. */
        vgic_cpu->rdreg = rdreg;
        vgic_cpu->rdreg_index = rdreg->free_index;

        /* Slots are KVM_VGIC_V3_REDIST_SIZE apart, starting at the region base. */
        rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE;

        kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
        rd_dev->base_addr = rd_base;
        rd_dev->iodev_type = IODEV_REDIST;
        rd_dev->regions = vgic_v3_rd_registers;
        rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
        rd_dev->redist_vcpu = vcpu;

        /* config_lock is released before touching the I/O bus. */
        mutex_unlock(&kvm->arch.config_lock);

        /* The device covers two 64K frames (RD_base and SGI_base). */
        ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, rd_base,
                                      2 * SZ_64K, &rd_dev->dev);
        if (ret)
                return ret;

        /* Protected by slots_lock */
        rdreg->free_index++;
        return 0;

out_unlock:
        mutex_unlock(&kvm->arch.config_lock);
        return ret;
}

/* Remove this vcpu's redistributor I/O device from the VM's MMIO bus. */
void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu)
{
        kvm_io_bus_unregister_dev(vcpu->kvm, KVM_MMIO_BUS,
                                  &vcpu->arch.vgic_cpu.rd_iodev.dev);
}

/*
 * Register the redistributor I/O device of every vcpu of @kvm.
 *
 * The caller must hold kvm->slots_lock. On the first failure, the devices
 * of all vcpus that precede the failing one are unregistered again and the
 * error is returned; returns 0 when every registration succeeded.
 */
static int vgic_register_all_redist_iodevs(struct kvm *kvm)
{
        struct kvm_vcpu *vcpu;
        unsigned long c;
        int ret = 0;
        int i;

        lockdep_assert_held(&kvm->slots_lock);

        kvm_for_each_vcpu(c, vcpu, kvm) {
                ret = vgic_register_redist_iodev(vcpu);
                if (ret)
                        break;
        }

        if (!ret)
                return 0;

        /* The current c failed, so iterate over the previous ones. */
        for (i = 0; i < c; i++)
                vgic_unregister_redist_iodev(kvm_get_vcpu(kvm, i));

        return ret;
}

/**
 * vgic_v3_alloc_redist_region - Allocate a new redistributor region
 *
 * Performs various checks before inserting the rdist region in the list.
 * Those tests depend on whether the size of the rdist region is known
 * (ie. count != 0). The list is sorted by rdist region index.
 *
 * @kvm: kvm handle
 * @index: redist region index
 * @base: base of the new rdist region
 * @count: number of redistributors the region is made of (0 in the old style
 * single region, whose size is induced from the number of vcpus)
 *
 * Return 0 on success, < 0 otherwise: -EINVAL for an address wrap-around, an
 * unexpected @index, mixing the legacy and the counted style, or an overlap
 * with the distributor or another redistributor region; -EEXIST when a
 * legacy region is requested while a region is already registered; -ENOMEM
 * if the allocation fails; or the error returned by vgic_check_iorange().
 */
static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index,
                                       gpa_t base, uint32_t count)
{
        struct vgic_dist *d = &kvm->arch.vgic;
        struct vgic_redist_region *rdreg;
        struct list_head *rd_regions = &d->rd_regions;
        int nr_vcpus = atomic_read(&kvm->online_vcpus);
        /* Legacy regions (count == 0) are sized by the number of online vcpus. */
        size_t size = count ? count * KVM_VGIC_V3_REDIST_SIZE
                            : nr_vcpus * KVM_VGIC_V3_REDIST_SIZE;
        int ret;

        /* cross the end of memory ? */
        if (base + size < base)
                return -EINVAL;

        if (list_empty(rd_regions)) {
                /* The very first region must have index 0. */
                if (index != 0)
                        return -EINVAL;
        } else {
                rdreg = list_last_entry(rd_regions,
                                        struct vgic_redist_region, list);

                /* Don't mix single region and discrete redist regions */
                if (!count && rdreg->count)
                        return -EINVAL;

                /* A legacy region cannot be added on top of an existing one. */
                if (!count)
                        return -EEXIST;

                /* Indexes must be handed out consecutively. */
                if (index != rdreg->index + 1)
                        return -EINVAL;
        }

        /*
         * For legacy single-region redistributor regions (!count),
         * check that the redistributor region does not overlap with the
         * distributor's address space.
         */
        if (!count && !IS_VGIC_ADDR_UNDEF(d->vgic_dist_base) &&
                vgic_dist_overlap(kvm, base, size))
                return -EINVAL;

        /* collision with any other rdist region? */
        if (vgic_v3_rdist_overlap(kvm, base, size))
                return -EINVAL;

        rdreg = kzalloc(sizeof(*rdreg), GFP_KERNEL_ACCOUNT);
        if (!rdreg)
                return -ENOMEM;

        /* The new region has no address yet when the range check runs. */
        rdreg->base = VGIC_ADDR_UNDEF;

        ret = vgic_check_iorange(kvm, rdreg->base, base, SZ_64K, size);
        if (ret)
                goto free;

        rdreg->base = base;
        rdreg->count = count;
        rdreg->free_index = 0;
        rdreg->index = index;

        /* Appending keeps the list ordered by region index. */
        list_add_tail(&rdreg->list, rd_regions);
        return 0;
free:
        kfree(rdreg);
        return ret;
}

/*
 * Drop a redistributor region: detach every vcpu that still points at
 * @rdreg, unlink the region from its list and free it.
 *
 * The caller must hold kvm->arch.config_lock.
 */
void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg)
{
        struct kvm_vcpu *vcpu;
        unsigned long c;

        lockdep_assert_held(&kvm->arch.config_lock);

        /* Garbage collect the region */
        kvm_for_each_vcpu(c, vcpu, kvm) {
                struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;

                if (vgic_cpu->rdreg != rdreg)
                        continue;

                vgic_cpu->rdreg = NULL;
        }

        list_del(&rdreg->list);
        kfree(rdreg);
}

/*
 * Create redistributor region @index at @addr holding @count redistributors
 * and register the redistributor I/O devices of the existing vcpus.
 *
 * If the registration fails, the freshly created region is looked up by
 * index and freed again. Returns 0 on success or a negative error code.
 */
int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count)
{
        struct vgic_redist_region *rdreg;
        int ret;

        mutex_lock(&kvm->arch.config_lock);
        ret = vgic_v3_alloc_redist_region(kvm, index, addr, count);
        mutex_unlock(&kvm->arch.config_lock);
        if (ret)
                return ret;

        /*
         * Register iodevs for each existing VCPU.  Adding more VCPUs
         * afterwards will register the iodevs when needed.
         */
        ret = vgic_register_all_redist_iodevs(kvm);
        if (!ret)
                return 0;

        /* Roll back: the region must not outlive the failed registration. */
        mutex_lock(&kvm->arch.config_lock);
        rdreg = vgic_v3_rdist_region_from_index(kvm, index);
        vgic_v3_free_redist_region(kvm, rdreg);
        mutex_unlock(&kvm->arch.config_lock);

        return ret;
}

/*
 * Check whether the register described by a device attribute exists.
 *
 * Distributor and redistributor attributes are looked up in the matching
 * register table; CPU system register attributes are delegated to
 * vgic_v3_has_cpu_sysregs_attr(). Returns 0 when the register is known,
 * -ENXIO for an unknown group, a misaligned offset or an unknown register,
 * or the error reported while parsing the attribute.
 */
int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
{
        struct vgic_reg_attr reg_attr;
        struct vgic_io_device iodev;
        int ret;

        ret = vgic_v3_parse_attr(dev, attr, &reg_attr);
        if (ret)
                return ret;

        switch (attr->group) {
        case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
                iodev.regions = vgic_v3_dist_registers;
                iodev.nr_regions = ARRAY_SIZE(vgic_v3_dist_registers);
                break;
        case KVM_DEV_ARM_VGIC_GRP_REDIST_REGS:
                iodev.regions = vgic_v3_rd_registers;
                iodev.nr_regions = ARRAY_SIZE(vgic_v3_rd_registers);
                break;
        case KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS:
                return vgic_v3_has_cpu_sysregs_attr(reg_attr.vcpu, attr);
        default:
                return -ENXIO;
        }

        /* The lookup below works on offsets relative to the frame start. */
        iodev.base_addr = 0;

        /* We only support aligned 32-bit accesses. */
        if (reg_attr.addr & 3)
                return -ENXIO;

        if (!vgic_get_mmio_region(reg_attr.vcpu, &iodev, reg_attr.addr,
                                  sizeof(u32)))
                return -ENXIO;

        return 0;
}

/*
 * The ICC_SGI* registers encode the affinity differently from the MPIDR,
 * so provide a wrapper to use the existing defines to isolate a certain
 * affinity level.
 *
 * @reg:   value written to the SGI register
 * @level: affinity level to extract; pasted into the names of the
 *         ICC_SGI1R_AFFINITY_<level>_MASK/_SHIFT defines
 *
 * Evaluates to the selected affinity field moved to the position given by
 * MPIDR_LEVEL_SHIFT(@level).
 */
#define SGI_AFFINITY_LEVEL(reg, level) \
        ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
        >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))

/*
 * Make SGI @sgi pending on @vcpu, subject to the group filter.
 *
 * A Group1 SGI is only delivered when @allow_group1 is set. A software SGI
 * is latched pending and queued; an SGI backed by a host interrupt is made
 * pending through the host irqchip instead.
 */
static void vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, u32 sgi, bool allow_group1)
{
        struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi);
        unsigned long flags;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);

        /*
         * An access targeting Group0 SGIs can only generate
         * those, while an access targeting Group1 SGIs can
         * generate interrupts of either group.
         */
        if (irq->group && !allow_group1) {
                /* Filtered out: nothing to deliver. */
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
        } else if (irq->hw) {
                /* HW SGI? Ask the GIC to inject it */
                int err;

                err = irq_set_irqchip_state(irq->host_irq,
                                            IRQCHIP_STATE_PENDING,
                                            true);
                WARN_RATELIMIT(err, "IRQ %d", irq->host_irq);
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
        } else {
                /* vgic_queue_irq_unlock() drops irq_lock for us. */
                irq->pending_latch = true;
                vgic_queue_irq_unlock(vcpu->kvm, irq, flags);
        }

        vgic_put_irq(vcpu->kvm, irq);
}

/**
 * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
 * @vcpu: The VCPU requesting a SGI
 * @reg: The value written into ICC_{ASGI1,SGI0,SGI1}R by that VCPU
 * @allow_group1: Does the sysreg access allow generation of G1 SGIs
 *
 * With GICv3 (and ARE=1), a CPU raises an SGI by writing to a system
 * register; that write traps in sys_regs.c and ends up here.
 * The written value carries the SGI number, the upper three affinity levels
 * of the targets and a bitmask of 16 Aff0 values.
 *
 * When the interrupt routing mode bit is clear, every Aff0 bit set in the
 * target list is combined with the given Aff{3,2,1} and the matching VCPU,
 * if there is one, is signalled.
 *
 * When the bit is set, all VCPUs except the calling one are signalled.
 */
void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvm_vcpu *c_vcpu;
        unsigned long target_cpus;
        unsigned long c;
        u64 mpidr;
        u32 sgi, aff0;

        sgi = FIELD_GET(ICC_SGI1R_SGI_ID_MASK, reg);

        /* Broadcast */
        if (unlikely(reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT))) {
                kvm_for_each_vcpu(c, c_vcpu, kvm) {
                        /* Don't signal the calling VCPU */
                        if (c_vcpu != vcpu)
                                vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1);
                }

                return;
        }

        /* We iterate over affinities to find the corresponding vcpus */
        mpidr = SGI_AFFINITY_LEVEL(reg, 3) |
                SGI_AFFINITY_LEVEL(reg, 2) |
                SGI_AFFINITY_LEVEL(reg, 1);
        target_cpus = FIELD_GET(ICC_SGI1R_TARGET_LIST_MASK, reg);

        for_each_set_bit(aff0, &target_cpus, hweight_long(ICC_SGI1R_TARGET_LIST_MASK)) {
                c_vcpu = kvm_mpidr_to_vcpu(kvm, mpidr | aff0);
                if (!c_vcpu)
                        continue;

                vgic_v3_queue_sgi(c_vcpu, sgi, allow_group1);
        }
}

/*
 * Userspace access to a distributor register at @offset.
 *
 * Uses a temporary I/O device that only carries the distributor register
 * table. Returns whatever vgic_uaccess() returns.
 */
int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                         int offset, u32 *val)
{
        struct vgic_io_device dist_dev = {
                .nr_regions = ARRAY_SIZE(vgic_v3_dist_registers),
                .regions = vgic_v3_dist_registers,
        };

        return vgic_uaccess(vcpu, &dist_dev, is_write, offset, val);
}

/*
 * Userspace access to a redistributor register at @offset.
 *
 * Uses a temporary I/O device that only carries the redistributor register
 * table. Returns whatever vgic_uaccess() returns.
 */
int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                           int offset, u32 *val)
{
        struct vgic_io_device redist_dev = {
                .nr_regions = ARRAY_SIZE(vgic_v3_rd_registers),
                .regions = vgic_v3_rd_registers,
        };

        return vgic_uaccess(vcpu, &redist_dev, is_write, offset, val);
}

/*
 * Userspace access to the line level of a block of interrupts.
 *
 * @intid must be a multiple of 32, otherwise -EINVAL is returned. On a
 * write, *@val is applied starting at @intid; on a read, *@val receives the
 * current line level information. Returns 0 on success.
 */
int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                                    u32 intid, u32 *val)
{
        if (intid % 32)
                return -EINVAL;

        if (!is_write) {
                *val = vgic_read_irq_line_level_info(vcpu, intid);
                return 0;
        }

        vgic_write_irq_line_level_info(vcpu, intid, *val);
        return 0;
}


























  239 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM capability

#if !defined(_TRACE_CAPABILITY_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CAPABILITY_H

#include <linux/cred.h>
#include <linux/tracepoint.h>
#include <linux/user_namespace.h>

/**
 * cap_capable - called after it has been determined whether a task has a
 * particular effective capability
 *
 * @cred: The credentials used
 * @target_ns: The user namespace of the resource being accessed
 * @capable_ns: The user namespace in which the credential provides the
 *              capability to access the targeted resource.
 *              This is recorded as NULL if @ret is not 0.
 * @cap: The capability to check for
 * @ret: The return value of the check: 0 if the capability is held,
 *       negative if it is not
 *
 * Allows tracing of calls to cap_capable in commoncap.c. The event records
 * the raw pointer values of @cred, @target_ns and @capable_ns together with
 * @cap and @ret.
 */
TRACE_EVENT(cap_capable,

        TP_PROTO(const struct cred *cred, struct user_namespace *target_ns,
                const struct user_namespace *capable_ns, int cap, int ret),

        TP_ARGS(cred, target_ns, capable_ns, cap, ret),

        TP_STRUCT__entry(
                __field(const struct cred *, cred)
                __field(struct user_namespace *, target_ns)
                __field(const struct user_namespace *, capable_ns)
                __field(int, cap)
                __field(int, ret)
        ),

        TP_fast_assign(
                __entry->cred       = cred;
                __entry->target_ns    = target_ns;
                /* capable_ns is only kept when the check succeeded (ret == 0) */
                __entry->capable_ns = ret == 0 ? capable_ns : NULL;
                __entry->cap        = cap;
                __entry->ret        = ret;
        ),

        TP_printk("cred %p, target_ns %p, capable_ns %p, cap %d, ret %d",
                __entry->cred, __entry->target_ns, __entry->capable_ns, __entry->cap,
                __entry->ret)
);

#endif /* _TRACE_CAPABILITY_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
























































































































































































































   17 



   17 








































  412 





  303 







































































































  150 






  149 



















































   34 











    1 






























    1 









    1 











  304 



































  155 
















  409 

















   23 


























  185 






  345 



  413 




    8 




  410 

  304 



   39 


  155 







    6 






  156 









   19 
  143 

  147 








   16 








  154 





   12 
   12 





  211 










  211 






    4 






  212 



  304 








  142 













   33 





    6 

























   27 












  408 


  407 
  407 

    1 

































































































































  411 



  409 































































































  244 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995  Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>

#include <asm/acpi.h>
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/efi.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>

/*
 * Descriptor for one fault type, looked up from the ESR via
 * esr_to_fault_info() / esr_to_debug_fault_info().
 */
struct fault_info {
        /* Handler for this fault; receives the fault address, ESR and regs. */
        int        (*fn)(unsigned long far, unsigned long esr,
                      struct pt_regs *regs);
        /* Signal number passed to the signal-delivery helpers. */
        int        sig;
        /* si_code passed alongside sig. */
        int        code;
        /* Human-readable fault description used in diagnostics. */
        const char *name;
};

static const struct fault_info fault_info[];
static struct fault_info debug_fault_info[];

/* Look up the fault_info[] entry selected by the FSC bits of @esr. */
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
        return &fault_info[esr & ESR_ELx_FSC];
}

/* Look up the debug_fault_info[] entry selected by DBG_ESR_EVT(@esr). */
static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
{
        return &debug_fault_info[DBG_ESR_EVT(esr)];
}

/*
 * Print the data-abort specific fields of a fault syndrome at alert level.
 *
 * @esr: the exception syndrome value to decode.
 *
 * The access description fields (access size, SSE, SRT, SF, AR) are printed
 * only when ESR_ELx_ISV is set; otherwise the raw ISS and ISS2 values are
 * dumped instead. The remaining ISS/ISS2 flag fields are always printed.
 */
static void data_abort_decode(unsigned long esr)
{
        unsigned long iss2 = ESR_ELx_ISS2(esr);

        pr_alert("Data abort info:\n");

        if (esr & ESR_ELx_ISV) {
                pr_alert("  Access size = %u byte(s)\n",
                         1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
                pr_alert("  SSE = %lu, SRT = %lu\n",
                         (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
                         (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
                pr_alert("  SF = %lu, AR = %lu\n",
                         (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
                         (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
        } else {
                pr_alert("  ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
                         esr & ESR_ELx_ISS_MASK, iss2);
        }

        pr_alert("  CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
                 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
                 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
                 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
                 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);

        /*
         * iss2 is unsigned, so the GCS field is printed with an unsigned
         * conversion like its siblings.
         */
        pr_alert("  GCS = %lu, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
                 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
                 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
                 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
                 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
}

static void mem_abort_decode(unsigned long esr)
{
        pr_alert("Mem abort info:\n");

        pr_alert("  ESR = 0x%016lx\n", esr);
        pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
                 ESR_ELx_EC(esr), esr_get_class_string(esr),
                 (esr & ESR_ELx_IL) ? 32 : 16);
        pr_alert("  SET = %lu, FnV = %lu\n",
                 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
                 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
        pr_alert("  EA = %lu, S1PTW = %lu\n",
                 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
                 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
        pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
                 esr_to_fault_info(esr)->name);

        if (esr_is_data_abort(esr))
                data_abort_decode(esr);
}

/*
 * Return the physical address of @mm's top-level page table. The pgd of
 * init_mm (either init_pg_dir or swapper_pg_dir) is translated with
 * __pa_symbol(); every other mm goes through virt_to_phys().
 */
static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
        if (mm != &init_mm)
                return (unsigned long)virt_to_phys(mm->pgd);

        return __pa_symbol(mm->pgd);
}

/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 *
 * TTBR0 addresses are looked up in current->active_mm (and rejected with a
 * message if that is init_mm); TTBR1 addresses are looked up in init_mm.
 * Addresses in neither range only produce a message. The entry at each
 * level is printed on one line, stopping at the first level that is none or
 * bad.
 */
static void show_pte(unsigned long addr)
{
        struct mm_struct *mm;
        pgd_t *pgdp;
        pgd_t pgd;

        if (is_ttbr0_addr(addr)) {
                /* TTBR0 */
                mm = current->active_mm;
                if (mm == &init_mm) {
                        pr_alert("[%016lx] user address but active_mm is swapper\n",
                                 addr);
                        return;
                }
        } else if (is_ttbr1_addr(addr)) {
                /* TTBR1 */
                mm = &init_mm;
        } else {
                pr_alert("[%016lx] address between user and kernel address ranges\n",
                         addr);
                return;
        }

        pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
                 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
                 vabits_actual, mm_to_pgd_phys(mm));
        pgdp = pgd_offset(mm, addr);
        pgd = READ_ONCE(*pgdp);
        pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

        /* Single-pass loop: 'break' ends the walk at the first unusable level. */
        do {
                p4d_t *p4dp, p4d;
                pud_t *pudp, pud;
                pmd_t *pmdp, pmd;
                pte_t *ptep, pte;

                if (pgd_none(pgd) || pgd_bad(pgd))
                        break;

                p4dp = p4d_offset(pgdp, addr);
                p4d = READ_ONCE(*p4dp);
                pr_cont(", p4d=%016llx", p4d_val(p4d));
                if (p4d_none(p4d) || p4d_bad(p4d))
                        break;

                pudp = pud_offset(p4dp, addr);
                pud = READ_ONCE(*pudp);
                pr_cont(", pud=%016llx", pud_val(pud));
                if (pud_none(pud) || pud_bad(pud))
                        break;

                pmdp = pmd_offset(pudp, addr);
                pmd = READ_ONCE(*pmdp);
                pr_cont(", pmd=%016llx", pmd_val(pmd));
                if (pmd_none(pmd) || pmd_bad(pmd))
                        break;

                /* The PTE mapping may be unavailable; skip the last level then. */
                ptep = pte_offset_map(pmdp, addr);
                if (!ptep)
                        break;

                pte = __ptep_get(ptep);
                pr_cont(", pte=%016llx", pte_val(pte));
                pte_unmap(ptep);
        } while(0);

        /* Terminate the line built up with pr_cont() above. */
        pr_cont("\n");
}

/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like __set_ptes(), the PTE is never changed from no-exec to exec here.
 *
 * @vma:     VMA passed to flush_tlb_page() when @dirty is set
 * @address: virtual address passed to flush_tlb_page() when @dirty is set
 * @ptep:    the page table entry to update
 * @entry:   PTE value supplying the new access flags / write permission
 * @dirty:   if non-zero, the TLB entry for @address is flushed afterwards
 *
 * Returns whether or not the PTE actually changed: 0 if *ptep already equals
 * @entry, 1 otherwise.
 */
int __ptep_set_access_flags(struct vm_area_struct *vma,
                            unsigned long address, pte_t *ptep,
                            pte_t entry, int dirty)
{
        pteval_t old_pteval, pteval;
        pte_t pte = __ptep_get(ptep);

        if (pte_same(pte, entry))
                return 0;

        /* only preserve the access flags and write permission */
        pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

        /*
         * Setting the flags must be done atomically to avoid racing with the
         * hardware update of the access/dirty state. The PTE_RDONLY bit must
         * be set to the most permissive (lowest value) of *ptep and entry
         * (calculated as: a & b == ~(~a | ~b)).
         */
        pte_val(entry) ^= PTE_RDONLY;
        pteval = pte_val(pte);
        do {
                old_pteval = pteval;
                /* Invert PTE_RDONLY, OR in the new bits, then invert it back. */
                pteval ^= PTE_RDONLY;
                pteval |= pte_val(entry);
                pteval ^= PTE_RDONLY;
                /*
                 * cmpxchg hands back the value it found in *ptep; if that is
                 * not what we expected, retry starting from that value.
                 */
                pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
        } while (pteval != old_pteval);

        /* Invalidate a stale read-only entry */
        if (dirty)
                flush_tlb_page(vma, address);
        return 1;
}

static bool is_el1_instruction_abort(unsigned long esr)
{
        return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}

static bool is_el1_data_abort(unsigned long esr)
{
        return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}

/*
 * Decide whether an EL1 data or instruction abort should be treated as a
 * permission fault.
 *
 * A genuine permission fault status always qualifies. In addition, when
 * system_uses_ttbr0_pan() is true, a translation fault on a TTBR0 address
 * taken with PSR_PAN_BIT set in the saved pstate is treated the same way.
 */
static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
                                           struct pt_regs *regs)
{
        if (!(is_el1_data_abort(esr) || is_el1_instruction_abort(esr)))
                return false;

        if (esr_fsc_is_permission_fault(esr))
                return true;

        if (!is_ttbr0_addr(addr) || !system_uses_ttbr0_pan())
                return false;

        if (!esr_fsc_is_translation_fault(esr))
                return false;

        return (regs->pstate & PSR_PAN_BIT) != 0;
}

/*
 * Check whether an EL1 data-abort translation fault on @addr still
 * reproduces.
 *
 * Only EL1 data aborts with a translation fault status are considered. The
 * address is re-translated with an "at s1e1r" instruction and the result is
 * read back from PAR, with local interrupts disabled around the sequence.
 * The fault is reported as spurious (true) if the translation now succeeds
 * or fails with something other than a translation fault.
 */
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
                                                        unsigned long esr,
                                                        struct pt_regs *regs)
{
        unsigned long flags;
        u64 par, dfsc;

        if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr))
                return false;

        local_irq_save(flags);
        asm volatile("at s1e1r, %0" :: "r" (addr));
        isb();
        par = read_sysreg_par();
        local_irq_restore(flags);

        /*
         * If we now have a valid translation, treat the translation fault as
         * spurious.
         */
        if (!(par & SYS_PAR_EL1_F))
                return true;

        /*
         * If we got a different type of fault from the AT instruction,
         * treat the translation fault as spurious.
         */
        dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
        return !esr_fsc_is_translation_fault(dfsc);
}

/*
 * Report an unhandled kernel fault and kill the current task.
 *
 * @msg:  short description of the fault, inserted into the alert message
 * @addr: faulting virtual address
 * @esr:  exception syndrome, decoded into the log and passed to die()
 * @regs: register state at the time of the fault
 *
 * Logs the fault, the decoded syndrome and the page table entries for
 * @addr, calls die("Oops", ...) and finally make_task_dead(SIGKILL).
 */
static void die_kernel_fault(const char *msg, unsigned long addr,
                             unsigned long esr, struct pt_regs *regs)
{
        bust_spinlocks(1);

        pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
                 addr);

        kasan_non_canonical_hook(addr);

        mem_abort_decode(esr);

        show_pte(addr);
        die("Oops", regs, esr);
        bust_spinlocks(0);
        make_task_dead(SIGKILL);
}

#ifdef CONFIG_KASAN_HW_TAGS
/*
 * Forward a tag fault to KASAN. The access size is reported as 0 because
 * the SAS bits aren't set for all faults reported in EL1, so the real size
 * cannot be determined.
 */
static void report_tag_fault(unsigned long addr, unsigned long esr,
                             struct pt_regs *regs)
{
        kasan_report((void *)addr, 0, (esr & ESR_ELx_WNR) != 0, regs->pc);
}
#else
/* Without CONFIG_KASAN_HW_TAGS tag faults aren't enabled: nothing to report. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
                                    struct pt_regs *regs)
{
}
#endif

/*
 * Recover from a kernel tag check fault: report it through
 * report_tag_fault(), then set the SCTLR_EL1 TCF field to NONE on this CPU
 * so that execution can continue.
 */
static void do_tag_recovery(unsigned long addr, unsigned long esr,
                           struct pt_regs *regs)
{

        report_tag_fault(addr, esr, regs);

        /*
         * Disable MTE Tag Checking on the local CPU for the current EL.
         * It will be done lazily on the other CPUs when they will hit a
         * tag fault.
         */
        sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
                         SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
        isb();
}

/*
 * True when @esr describes an EL1 data abort whose fault status code is
 * ESR_ELx_FSC_MTE.
 */
static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
        return is_el1_data_abort(esr) &&
               (esr & ESR_ELx_FSC) == ESR_ELx_FSC_MTE;
}

/*
 * Handle a fault taken in kernel context.
 *
 * Recovery is attempted in order: exception table fixup (not for
 * instruction aborts), spurious translation fault detection, MTE tag fault
 * recovery, KFENCE (for non-permission faults at or above PAGE_SIZE), and
 * the EFI runtime fixup. If none of them claims the fault,
 * die_kernel_fault() is called with a message describing the fault.
 */
static void __do_kernel_fault(unsigned long addr, unsigned long esr,
                              struct pt_regs *regs)
{
        const char *msg;

        /*
         * Are we prepared to handle this kernel fault?
         * We are almost certainly not prepared to handle instruction faults.
         */
        if (!is_el1_instruction_abort(esr) && fixup_exception(regs, esr))
                return;

        /* Warn (rate-limited) about, and otherwise ignore, spurious faults. */
        if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
            "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
                return;

        if (is_el1_mte_sync_tag_check_fault(esr)) {
                do_tag_recovery(addr, esr, regs);

                return;
        }

        /* Pick the description used by the fixup/oops paths below. */
        if (is_el1_permission_fault(addr, esr, regs)) {
                if (esr & ESR_ELx_WNR)
                        msg = "write to read-only memory";
                else if (is_el1_instruction_abort(esr))
                        msg = "execute from non-executable memory";
                else
                        msg = "read from unreadable memory";
        } else if (addr < PAGE_SIZE) {
                msg = "NULL pointer dereference";
        } else {
                /* Give KFENCE a chance to claim translation faults. */
                if (esr_fsc_is_translation_fault(esr) &&
                    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
                        return;

                msg = "paging request";
        }

        if (efi_runtime_fixup_exception(regs, msg))
                return;

        die_kernel_fault(msg, addr, esr, regs);
}

/*
 * Record the fault address and syndrome in current->thread for later
 * reporting.
 *
 * For TTBR0 addresses the ESR is stored unmodified. For any other address
 * the ESR is sanitized first: from userspace's point of view, kernel-only
 * mappings don't exist at all, so such faults are reported as level 0
 * translation faults.
 *
 * (This is not quite how "no mapping there at all" behaves: an alignment
 * fault not caused by the memory type would take precedence over a
 * translation fault for a real access to empty space. The two kinds of
 * alignment fault can't easily be told apart, so this wrinkle is ignored
 * and the translation fault is reported.)
 */
static void set_thread_esr(unsigned long address, unsigned long esr)
{
        current->thread.fault_address = address;

        if (is_ttbr0_addr(address)) {
                current->thread.fault_code = esr;
                return;
        }

        switch (ESR_ELx_EC(esr)) {
        case ESR_ELx_EC_DABT_LOW:
                /*
                 * The retained bits only describe the faulting instruction,
                 * which userspace knows already. Bits that are
                 * architecturally RES0 are cleared explicitly in case they
                 * gain a meaning in future. The ESR is always reported as
                 * if the fault was taken to EL1, so ISV and the bits in
                 * ISS[23:14] are clear. (In fact it always will be a fault
                 * to EL1.)
                 */
                esr = (esr & (ESR_ELx_EC_MASK | ESR_ELx_IL |
                              ESR_ELx_CM | ESR_ELx_WNR)) |
                      ESR_ELx_FSC_FAULT;
                break;
        case ESR_ELx_EC_IABT_LOW:
                /*
                 * Claim a level 0 translation fault. All other bits are
                 * architecturally RES0 for faults reported with that DFSC
                 * value, so they are cleared.
                 */
                esr = (esr & (ESR_ELx_EC_MASK | ESR_ELx_IL)) |
                      ESR_ELx_FSC_FAULT;
                break;
        default:
                /*
                 * This should never happen (entry.S only brings us into
                 * this code for insn and data aborts from a lower exception
                 * level). Fail safe by not providing an ESR context record
                 * at all.
                 */
                WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
                esr = 0;
                break;
        }

        current->thread.fault_code = esr;
}

/*
 * Handle a fault on an address that cannot be resolved.
 *
 * In kernel mode there is no context to handle the fault with, so it goes
 * to __do_kernel_fault(). In user mode the syndrome is recorded and the
 * signal described by the matching fault_info entry is delivered.
 */
static void do_bad_area(unsigned long far, unsigned long esr,
                        struct pt_regs *regs)
{
        unsigned long addr = untagged_addr(far);
        const struct fault_info *inf;

        if (!user_mode(regs)) {
                __do_kernel_fault(addr, esr, regs);
                return;
        }

        inf = esr_to_fault_info(esr);
        set_thread_esr(addr, esr);
        arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
}

/*
 * Decide whether a fault should be attributed to a protection key.
 *
 * Always false when system_supports_poe() is false. Otherwise true for a
 * permission fault with the Overlay bit set in ISS2, or when
 * arch_vma_access_permitted() rejects the access type described by
 * @mm_flags for @vma.
 */
static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
                        unsigned int mm_flags)
{
        if (!system_supports_poe())
                return false;

        if (esr_fsc_is_permission_fault(esr) &&
            (ESR_ELx_ISS2(esr) & ESR_ELx_Overlay))
                return true;

        if (arch_vma_access_permitted(vma,
                        mm_flags & FAULT_FLAG_WRITE,
                        mm_flags & FAULT_FLAG_INSTRUCTION,
                        false))
                return false;

        return true;
}

/* True when @esr is a data abort with the GCS bit set in ISS2. */
static bool is_gcs_fault(unsigned long esr)
{
        return esr_is_data_abort(esr) &&
               (ESR_ELx_ISS2(esr) & ESR_ELx_GCS) != 0;
}

static bool is_el0_instruction_abort(unsigned long esr)
{
        return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}

/*
 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
        return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}

/*
 * Check a fault against the GCS rules for @vma.
 *
 * Always false when system_supports_gcs() is false. Otherwise:
 *  - a GCS fault is invalid unless @vma has VM_SHADOW_STACK set, since GCS
 *    accesses must be performed on a GCS page;
 *  - a non-GCS fault on a VM_SHADOW_STACK vma is invalid if it is a data
 *    abort write, since only GCS operations can write to a GCS page.
 */
static bool is_invalid_gcs_access(struct vm_area_struct *vma, u64 esr)
{
        if (!system_supports_gcs())
                return false;

        if (unlikely(is_gcs_fault(esr)))
                return !(vma->vm_flags & VM_SHADOW_STACK);

        if (likely(!(vma->vm_flags & VM_SHADOW_STACK)))
                return false;

        return esr_is_data_abort(esr) && is_write_abort(esr);
}

/*
 * Main page fault handler.
 *
 * @far:  fault address as reported (possibly tagged); used for signal info
 * @esr:  exception syndrome of the fault
 * @regs: register state at the time of the fault
 *
 * Works out which VMA permissions the access needs, then tries to resolve
 * the fault first under the per-VMA lock (user-mode faults only) and, if
 * that is not possible, under the mmap lock. Faults that cannot be resolved
 * are turned into a signal for user mode or passed to __do_kernel_fault()
 * for kernel mode. Always returns 0.
 */
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
                                   struct pt_regs *regs)
{
        const struct fault_info *inf;
        struct mm_struct *mm = current->mm;
        vm_fault_t fault;
        unsigned long vm_flags;
        unsigned int mm_flags = FAULT_FLAG_DEFAULT;
        unsigned long addr = untagged_addr(far);
        struct vm_area_struct *vma;
        int si_code;
        int pkey = -1;

        if (kprobe_page_fault(regs, esr))
                return 0;

        /*
         * If we're in an interrupt or have no user context, we must not take
         * the fault.
         */
        if (faulthandler_disabled() || !mm)
                goto no_context;

        if (user_mode(regs))
                mm_flags |= FAULT_FLAG_USER;

        /*
         * vm_flags tells us what bits we must have in vma->vm_flags
         * for the fault to be benign; the checks below test
         * vma->vm_flags & vm_flags and fail the fault if the
         * intersection is empty.
         */
        if (is_el0_instruction_abort(esr)) {
                /* It was exec fault */
                vm_flags = VM_EXEC;
                mm_flags |= FAULT_FLAG_INSTRUCTION;
        } else if (is_gcs_fault(esr)) {
                /*
                 * The GCS permission on a page implies both read and
                 * write so always handle any GCS fault as a write fault,
                 * we need to trigger CoW even for GCS reads.
                 */
                vm_flags = VM_WRITE;
                mm_flags |= FAULT_FLAG_WRITE;
        } else if (is_write_abort(esr)) {
                /* It was write fault */
                vm_flags = VM_WRITE;
                mm_flags |= FAULT_FLAG_WRITE;
        } else {
                /* It was read fault */
                vm_flags = VM_READ;
                /* Write implies read */
                vm_flags |= VM_WRITE;
                /* If EPAN is absent then exec implies read */
                if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
                        vm_flags |= VM_EXEC;
        }

        /* Kernel permission faults on user addresses are fatal unless expected. */
        if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
                if (is_el1_instruction_abort(esr))
                        die_kernel_fault("execution of user memory",
                                         addr, esr, regs);

                if (!insn_may_access_user(regs->pc, esr))
                        die_kernel_fault("access to user memory outside uaccess routines",
                                         addr, esr, regs);
        }

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

        /* The per-VMA lock fast path is only attempted for user-mode faults. */
        if (!(mm_flags & FAULT_FLAG_USER))
                goto lock_mmap;

        vma = lock_vma_under_rcu(mm, addr);
        if (!vma)
                goto lock_mmap;

        /*
         * NOTE(review): unlike the two error exits below, this one does not
         * call count_vm_vma_lock_event(VMA_LOCK_SUCCESS) - confirm whether
         * that is intentional.
         */
        if (is_invalid_gcs_access(vma, esr)) {
                vma_end_read(vma);
                fault = 0;
                si_code = SEGV_ACCERR;
                goto bad_area;
        }

        if (!(vma->vm_flags & vm_flags)) {
                vma_end_read(vma);
                fault = 0;
                si_code = SEGV_ACCERR;
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                goto bad_area;
        }

        if (fault_from_pkey(esr, vma, mm_flags)) {
                pkey = vma_pkey(vma);
                vma_end_read(vma);
                fault = 0;
                si_code = SEGV_PKUERR;
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                goto bad_area;
        }

        fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
        /* The VMA lock is only dropped here when neither RETRY nor COMPLETED is set. */
        if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
                vma_end_read(vma);

        if (!(fault & VM_FAULT_RETRY)) {
                count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
                goto done;
        }
        count_vm_vma_lock_event(VMA_LOCK_RETRY);
        if (fault & VM_FAULT_MAJOR)
                mm_flags |= FAULT_FLAG_TRIED;

        /* Quick path to respond to signals */
        if (fault_signal_pending(fault, regs)) {
                if (!user_mode(regs))
                        goto no_context;
                return 0;
        }
lock_mmap:

retry:
        /*
         * Slow path under the mmap lock.
         * NOTE(review): no is_invalid_gcs_access() check is made on this
         * path, unlike the per-VMA-lock path above - confirm whether that is
         * intentional.
         */
        vma = lock_mm_and_find_vma(mm, addr, regs);
        if (unlikely(!vma)) {
                fault = 0;
                si_code = SEGV_MAPERR;
                goto bad_area;
        }

        if (!(vma->vm_flags & vm_flags)) {
                mmap_read_unlock(mm);
                fault = 0;
                si_code = SEGV_ACCERR;
                goto bad_area;
        }

        if (fault_from_pkey(esr, vma, mm_flags)) {
                pkey = vma_pkey(vma);
                mmap_read_unlock(mm);
                fault = 0;
                si_code = SEGV_PKUERR;
                goto bad_area;
        }

        fault = handle_mm_fault(vma, addr, mm_flags, regs);

        /* Quick path to respond to signals */
        if (fault_signal_pending(fault, regs)) {
                if (!user_mode(regs))
                        goto no_context;
                return 0;
        }

        /* The fault is fully completed (including releasing mmap lock) */
        if (fault & VM_FAULT_COMPLETED)
                return 0;

        if (fault & VM_FAULT_RETRY) {
                mm_flags |= FAULT_FLAG_TRIED;
                goto retry;
        }
        mmap_read_unlock(mm);

done:
        /* Handle the "normal" (no error) case first. */
        if (likely(!(fault & VM_FAULT_ERROR)))
                return 0;

        /* handle_mm_fault() reported an error: default the signal code. */
        si_code = SEGV_MAPERR;
bad_area:
        /*
         * If we are in kernel mode at this point, we have no context to
         * handle this fault with.
         */
        if (!user_mode(regs))
                goto no_context;

        if (fault & VM_FAULT_OOM) {
                /*
                 * We ran out of memory, call the OOM killer, and return to
                 * userspace (which will retry the fault, or kill us if we got
                 * oom-killed).
                 */
                pagefault_out_of_memory();
                return 0;
        }

        inf = esr_to_fault_info(esr);
        set_thread_esr(addr, esr);
        if (fault & VM_FAULT_SIGBUS) {
                /*
                 * We had some memory, but were unable to successfully fix up
                 * this page fault.
                 */
                arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
        } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
                unsigned int lsb;

                /* Report the poisoned granule: a page, or the huge page size. */
                lsb = PAGE_SHIFT;
                if (fault & VM_FAULT_HWPOISON_LARGE)
                        lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

                arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
        } else {
                /*
                 * Something tried to access memory that is outside the
                 * memory map, or in a way the mapping does not permit.
                 *
                 * The pkey value that we return to userspace can be different
                 * from the pkey that caused the fault.
                 *
                 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
                 * 2. T1   : set POR_EL0 to deny access to pkey=4, touches, page
                 * 3. T1   : faults...
                 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
                 * 5. T1   : enters fault handler, takes mmap_lock, etc...
                 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
                 *             faulted on a pte with its pkey=4.
                 */
                if (si_code == SEGV_PKUERR)
                        arm64_force_sig_fault_pkey(far, inf->name, pkey);
                else
                        arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
        }

        return 0;

no_context:
        __do_kernel_fault(addr, esr, regs);
        return 0;
}

/*
 * Handle a translation fault.
 *
 * The tag is stripped from @far only to classify the address: addresses that
 * is_ttbr0_addr() accepts are forwarded to do_page_fault(), everything else
 * is reported through do_bad_area(). Both handlers receive the original @far.
 *
 * Returns the result of do_page_fault() for TTBR0 addresses, 0 otherwise.
 */
static int __kprobes do_translation_fault(unsigned long far,
                                          unsigned long esr,
                                          struct pt_regs *regs)
{
        if (!is_ttbr0_addr(untagged_addr(far))) {
                do_bad_area(far, esr, regs);
                return 0;
        }

        return do_page_fault(far, esr, regs);
}

/*
 * Handle an alignment fault.
 *
 * With CONFIG_COMPAT_ALIGNMENT_FIXUPS enabled, a fault taken from compat user
 * mode is handed to do_compat_alignment_fixup() and its result is returned.
 * Every other alignment fault is reported through do_bad_area() and 0 is
 * returned.
 */
static int do_alignment_fault(unsigned long far, unsigned long esr,
                              struct pt_regs *regs)
{
        if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS)) {
                if (compat_user_mode(regs))
                        return do_compat_alignment_fixup(far, regs);
        }

        do_bad_area(far, esr, regs);
        return 0;
}

/*
 * Placeholder handler for fault codes without dedicated handling. It ignores
 * its arguments and always returns non-zero, which callers such as
 * do_mem_abort() treat as "not handled".
 */
static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
        /* "fault" */
        return 1;
}

/*
 * Handle a synchronous external abort.
 *
 * A user-mode abort that apei_claim_sea() claims needs nothing further here.
 * Otherwise the fault is reported through arm64_notify_die() using the signal
 * and si_code from the matching fault_info entry. The reported address is 0
 * when ESR_ELx_FnV is set, and the untagged @far otherwise.
 *
 * Always returns 0.
 */
static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
        const struct fault_info *inf = esr_to_fault_info(esr);
        unsigned long siaddr = 0;

        /*
         * APEI claimed this as a firmware-first notification.
         * Some processing deferred to task_work before ret_to_user().
         */
        if (user_mode(regs) && apei_claim_sea(regs) == 0)
                return 0;

        /*
         * The architecture specifies that the tag bits of FAR_EL1 are
         * UNKNOWN for synchronous external aborts, so strip them before
         * userspace gets to see the address. With FnV set the address is
         * not reported at all and siaddr stays 0.
         */
        if (!(esr & ESR_ELx_FnV))
                siaddr = untagged_addr(far);

        arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

        return 0;
}

/*
 * Handle a synchronous tag check fault by reporting it through do_bad_area().
 *
 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag
 * check faults. The reported address is therefore rebuilt from two parts:
 * the bits selected by MTE_TAG_MASK come from @far as delivered, and all
 * remaining bits come from the untagged address.
 *
 * Always returns 0.
 */
static int do_tag_check_fault(unsigned long far, unsigned long esr,
                              struct pt_regs *regs)
{
        unsigned long tag_bits = far & MTE_TAG_MASK;
        unsigned long addr_bits = __untagged_addr(far) & ~MTE_TAG_MASK;

        do_bad_area(addr_bits | tag_bits, esr, regs);
        return 0;
}

/*
 * Fault handler table. Each entry holds, in order: the handler function, the
 * signal and si_code used when the fault is reported, and a human-readable
 * description.
 *
 * The table is positional: the "unknown N" placeholders sit at index N, so
 * entries must never be inserted, removed or reordered.
 *
 * NOTE(review): presumably indexed by the fault status code taken from the
 * ESR via esr_to_fault_info() - confirm against that helper.
 */
static const struct fault_info fault_info[] = {
        { do_bad,                SIGKILL, SI_KERNEL,        "ttbr address size fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "level 1 address size fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "level 2 address size fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "level 3 address size fault"        },
        { do_translation_fault,        SIGSEGV, SEGV_MAPERR,        "level 0 translation fault"        },
        { do_translation_fault,        SIGSEGV, SEGV_MAPERR,        "level 1 translation fault"        },
        { do_translation_fault,        SIGSEGV, SEGV_MAPERR,        "level 2 translation fault"        },
        { do_translation_fault,        SIGSEGV, SEGV_MAPERR,        "level 3 translation fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 0 access flag fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 1 access flag fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 2 access flag fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 3 access flag fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 0 permission fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 1 permission fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 2 permission fault"        },
        { do_page_fault,        SIGSEGV, SEGV_ACCERR,        "level 3 permission fault"        },
        { do_sea,                SIGBUS,  BUS_OBJERR,        "synchronous external abort"        },
        { do_tag_check_fault,        SIGSEGV, SEGV_MTESERR,        "synchronous tag check fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 18"                        },
        { do_sea,                SIGKILL, SI_KERNEL,        "level -1 (translation table walk)"        },
        { do_sea,                SIGKILL, SI_KERNEL,        "level 0 (translation table walk)"        },
        { do_sea,                SIGKILL, SI_KERNEL,        "level 1 (translation table walk)"        },
        { do_sea,                SIGKILL, SI_KERNEL,        "level 2 (translation table walk)"        },
        { do_sea,                SIGKILL, SI_KERNEL,        "level 3 (translation table walk)"        },
        { do_sea,                SIGBUS,  BUS_OBJERR,        "synchronous parity or ECC error" },        // Reserved when RAS is implemented
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 25"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 26"                        },
        { do_sea,                SIGKILL, SI_KERNEL,        "level -1 synchronous parity error (translation table walk)"        },        // Reserved when RAS is implemented
        { do_sea,                SIGKILL, SI_KERNEL,        "level 0 synchronous parity error (translation table walk)"        },        // Reserved when RAS is implemented
        { do_sea,                SIGKILL, SI_KERNEL,        "level 1 synchronous parity error (translation table walk)"        },        // Reserved when RAS is implemented
        { do_sea,                SIGKILL, SI_KERNEL,        "level 2 synchronous parity error (translation table walk)"        },        // Reserved when RAS is implemented
        { do_sea,                SIGKILL, SI_KERNEL,        "level 3 synchronous parity error (translation table walk)"        },        // Reserved when RAS is implemented
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 32"                        },
        { do_alignment_fault,        SIGBUS,  BUS_ADRALN,        "alignment fault"                },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 34"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 35"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 36"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 37"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 38"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 39"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 40"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "level -1 address size fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 42"                        },
        { do_translation_fault,        SIGSEGV, SEGV_MAPERR,        "level -1 translation fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 44"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 45"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 46"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 47"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "TLB conflict abort"                },
        { do_bad,                SIGKILL, SI_KERNEL,        "Unsupported atomic hardware update fault"        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 50"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 51"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "implementation fault (lockdown abort)" },
        { do_bad,                SIGBUS,  BUS_OBJERR,        "implementation fault (unsupported exclusive)" },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 54"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 55"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 56"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 57"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 58"                         },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 59"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 60"                        },
        { do_bad,                SIGKILL, SI_KERNEL,        "section domain fault"                },
        { do_bad,                SIGKILL, SI_KERNEL,        "page domain fault"                },
        { do_bad,                SIGKILL, SI_KERNEL,        "unknown 63"                        },
};

/*
 * Entry point for memory aborts.
 *
 * Looks up the fault_info entry for @esr and runs its handler with the
 * original (tagged) @far. A handler returning 0 has dealt with the fault.
 * Otherwise a fault taken outside user mode goes to die_kernel_fault(), and
 * a user-mode fault is reported through arm64_notify_die() with the entry's
 * signal and si_code.
 */
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
        const struct fault_info *entry = esr_to_fault_info(esr);
        unsigned long untagged = untagged_addr(far);

        if (entry->fn(far, esr, regs) == 0)
                return;

        if (!user_mode(regs))
                die_kernel_fault(entry->name, untagged, esr, regs);

        /*
         * This is an unrecognized fault type, and its tag bits may have
         * been defined as UNKNOWN. Hence only the untagged address is
         * exposed to the signal handler.
         */
        arm64_notify_die(entry->name, regs, entry->sig, entry->code,
                         untagged, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);

/*
 * Report an SP/PC alignment exception through arm64_notify_die() as
 * SIGBUS/BUS_ADRALN at @addr.
 */
void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
        arm64_notify_die("SP/PC alignment exception",
                         regs, SIGBUS, BUS_ADRALN, addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);

/*
 * Debug exception handler table; entries are positional and can be replaced
 * at init time through hook_debug_fault_code().
 *
 * Marked __refdata because early_brk64 is __init, but the reference to it is
 * clobbered at arch_initcall time.
 * See traps.c and debug-monitors.c:debug_traps_init().
 */
static struct fault_info __refdata debug_fault_info[] = {
        { .fn = do_bad,      .sig = SIGTRAP, .code = TRAP_HWBKPT, .name = "hardware breakpoint" },
        { .fn = do_bad,      .sig = SIGTRAP, .code = TRAP_HWBKPT, .name = "hardware single-step" },
        { .fn = do_bad,      .sig = SIGTRAP, .code = TRAP_HWBKPT, .name = "hardware watchpoint" },
        { .fn = do_bad,      .sig = SIGKILL, .code = SI_KERNEL,   .name = "unknown 3" },
        { .fn = do_bad,      .sig = SIGTRAP, .code = TRAP_BRKPT,  .name = "aarch32 BKPT" },
        { .fn = do_bad,      .sig = SIGKILL, .code = SI_KERNEL,   .name = "aarch32 vector catch" },
        { .fn = early_brk64, .sig = SIGTRAP, .code = TRAP_BRKPT,  .name = "aarch64 BRK" },
        { .fn = do_bad,      .sig = SIGKILL, .code = SI_KERNEL,   .name = "unknown 7" },
};

/*
 * Install a handler in slot @nr of debug_fault_info[].
 *
 * @nr:   index of the slot to overwrite; BUG()s when out of range
 * @fn:   handler to call for this debug exception
 * @sig:  signal stored in the slot
 * @code: si_code stored in the slot
 * @name: description stored in the slot
 */
void __init hook_debug_fault_code(int nr,
                                  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
                                  int sig, int code, const char *name)
{
        struct fault_info *slot;

        BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));

        slot = &debug_fault_info[nr];
        slot->fn = fn;
        slot->sig = sig;
        slot->code = code;
        slot->name = name;
}

/*
 * Debug exceptions run with interrupts disabled, yet preemption is disabled
 * explicitly as well. This has two effects: scheduling by accident while in
 * exception context becomes much less likely, and if it somehow happens
 * anyway, a warning is forced.
 */
static void debug_exception_enter(struct pt_regs *regs)
{
        preempt_disable();

        /* This code is a bit fragile.  Test it. */
        RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
}
NOKPROBE_SYMBOL(debug_exception_enter);

/*
 * Counterpart of debug_exception_enter(): re-enable preemption using the
 * no-resched variant.
 */
static void debug_exception_exit(struct pt_regs *regs)
{
        preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(debug_exception_exit);

/*
 * Entry point for debug exceptions.
 *
 * Runs the handler registered in debug_fault_info[] for @esr between
 * debug_exception_enter() and debug_exception_exit(). If the exception came
 * from user mode with a PC that is not a TTBR0 address, branch predictor
 * hardening is applied first. A non-zero handler result is reported through
 * arm64_notify_die() at the faulting PC.
 */
void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
                        struct pt_regs *regs)
{
        const struct fault_info *entry = esr_to_debug_fault_info(esr);
        unsigned long pc = instruction_pointer(regs);

        debug_exception_enter(regs);

        if (user_mode(regs)) {
                if (!is_ttbr0_addr(pc))
                        arm64_apply_bp_hardening();
        }

        if (entry->fn(addr_if_watchpoint, esr, regs) != 0)
                arm64_notify_die(entry->name, regs, entry->sig, entry->code,
                                 pc, esr);

        debug_exception_exit(regs);
}
NOKPROBE_SYMBOL(do_debug_exception);

/*
 * Used during anonymous page fault handling.
 *
 * Allocates an order-0 folio for @vaddr in @vma with
 * GFP_HIGHUSER_MOVABLE | __GFP_ZERO, adding __GFP_ZEROTAGS when the VMA has
 * VM_MTE set. Returns whatever vma_alloc_folio() returns.
 */
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
                                                unsigned long vaddr)
{
        gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

        if (!(vma->vm_flags & VM_MTE))
                return vma_alloc_folio(gfp, 0, vma, vaddr);

        /*
         * The page is mapped with PROT_MTE: initialise the tags at the point
         * of allocation and page zeroing, which is usually faster than
         * separate DC ZVA and STGM.
         */
        return vma_alloc_folio(gfp | __GFP_ZEROTAGS, 0, vma, vaddr);
}

void tag_clear_highpage(struct page *page)
{
        /* Newly allocated page, shouldn't have been tagged yet */
        WARN_ON_ONCE(!try_page_mte_tagging(page));
        mte_zero_clear_page_tags(page_address(page));
        set_page_mte_tagged(page);
}












































































































 1306 











  209 
  209 



































 1396 
 1374 

  124 
 1401 





















    1 
   74 


    1 
   74 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_FILE_REF_H
#define _LINUX_FILE_REF_H

#include <linux/atomic.h>
#include <linux/preempt.h>
#include <linux/types.h>

/*
 * file_ref is a reference count implementation specifically for use by
 * files. It takes inspiration from rcuref but differs in key aspects
 * such as support for SLAB_TYPESAFE_BY_RCU type caches.
 *
 * FILE_REF_ONEREF                FILE_REF_MAXREF
 * 0x0000000000000000UL      0x7FFFFFFFFFFFFFFFUL
 * <-------------------valid ------------------->
 *
 *                       FILE_REF_SATURATED
 * 0x8000000000000000UL 0xA000000000000000UL 0xBFFFFFFFFFFFFFFFUL
 * <-----------------------saturation zone---------------------->
 *
 * FILE_REF_RELEASED                   FILE_REF_DEAD
 * 0xC000000000000000UL         0xE000000000000000UL
 * <-------------------dead zone------------------->
 *
 * FILE_REF_NOREF
 * 0xFFFFFFFFFFFFFFFFUL
 */

/*
 * Boundary values of the counter zones. The CONFIG_64BIT variant uses 64-bit
 * constants; the other variant uses the same layout in 32 bits.
 *
 * The stored value is one less than the number of references
 * (file_ref_init() stores cnt - 1), so FILE_REF_ONEREF is 0 and
 * FILE_REF_NOREF is the all-ones value reached by decrementing from 0.
 */
#ifdef CONFIG_64BIT
#define FILE_REF_ONEREF                0x0000000000000000UL
#define FILE_REF_MAXREF                0x7FFFFFFFFFFFFFFFUL
#define FILE_REF_SATURATED        0xA000000000000000UL
#define FILE_REF_RELEASED        0xC000000000000000UL
#define FILE_REF_DEAD                0xE000000000000000UL
#define FILE_REF_NOREF                0xFFFFFFFFFFFFFFFFUL
#else
#define FILE_REF_ONEREF                0x00000000U
#define FILE_REF_MAXREF                0x7FFFFFFFU
#define FILE_REF_SATURATED        0xA0000000U
#define FILE_REF_RELEASED        0xC0000000U
#define FILE_REF_DEAD                0xE0000000U
#define FILE_REF_NOREF                0xFFFFFFFFU
#endif

/*
 * Reference counter container: wraps an atomic64_t when CONFIG_64BIT is set
 * and an atomic_t otherwise. The helpers in this header access it through the
 * atomic_long_*() operations.
 */
typedef struct {
#ifdef CONFIG_64BIT
        atomic64_t refcnt;
#else
        atomic_t refcnt;
#endif
} file_ref_t;

/**
 * file_ref_init - Set up a file reference count
 * @ref: Pointer to the reference count
 * @cnt: Number of references to start with, typically '1'
 *
 * The counter stores one less than the number of references, so @cnt == 1
 * is stored as FILE_REF_ONEREF (0).
 */
static inline void file_ref_init(file_ref_t *ref, unsigned long cnt)
{
        const unsigned long stored = cnt - 1;

        atomic_long_set(&ref->refcnt, stored);
}

/*
 * Out-of-line slow path of file_ref_put(): called with the already
 * decremented counter value @cnt when that value is negative.
 */
bool __file_ref_put(file_ref_t *ref, unsigned long cnt);

/**
 * file_ref_get - Try to take one reference on a file
 * @ref: Pointer to the reference count
 *
 * Behaves like atomic_inc_not_zero() but saturates at FILE_REF_MAXREF.
 *
 * Provides full memory ordering.
 *
 * Return: True if a reference was successfully acquired. False if the
 *         attempt failed, which happens when the last reference has
 *         already been put.
 */
static __always_inline __must_check bool file_ref_get(file_ref_t *ref)
{
        /*
         * The increment is unconditional and fully ordered; the saturation
         * and dead zones provide enough tolerance to absorb it.
         *
         * When the result is negative, the file in question may already
         * have been freed and immediately reused, because files are
         * SLAB_TYPESAFE_BY_RCU. The counter must therefore not be rewritten
         * here (e.g. reset to the middle of the dead zone): doing so risks
         * marking someone else's file as dead behind their back.
         *
         * A careful variant would be possible:
         *
         * cnt = atomic_long_inc_return();
         * if (likely(cnt >= 0))
         *        return true;
         *
         * followed by something like:
         *
         * if (cnt >= FILE_REF_RELEASED)
         *        atomic_long_try_cmpxchg(&ref->refcnt, &cnt, FILE_REF_DEAD),
         *
         * to move the value back to the middle of the dead zone. It is not
         * needed in practice: going from FILE_REF_DEAD to FILE_REF_ONEREF
         * would take 2305843009213693952 (2^61) file_ref_get() calls to
         * resurrect such a dead file.
         */
        if (atomic_long_add_negative(1, &ref->refcnt))
                return false;

        return true;
}

/**
 * file_ref_inc - Take an additional reference on a file
 * @ref: Pointer to the reference count
 *
 * For callers that already hold a reference. The counter is incremented
 * with relaxed ordering; a one-time warning is emitted if the value seen
 * before the increment was negative, i.e. the caller held no reference.
 */
static __always_inline void file_ref_inc(file_ref_t *ref)
{
        long before = atomic_long_fetch_inc_relaxed(&ref->refcnt);

        WARN_ONCE(before < 0, "file_ref_inc() on a released file reference");
}

/**
 * file_ref_put -- Drop one file reference
 * @ref:        Pointer to the reference count
 *
 * Provides release memory ordering, so that prior loads and stores are
 * done before the drop, and acquire ordering on success, so that free()
 * must come after.
 *
 * Return: True if this was the last reference and no future references are
 *         possible; the caller may then safely release the object protected
 *         by the reference counter.
 *         False if active references remain or the put() raced with a
 *         concurrent get()/put() pair; the caller must not release the
 *         protected object.
 */
static __always_inline __must_check bool file_ref_put(file_ref_t *ref)
{
        long remaining;

        /*
         * Files are SLAB_TYPESAFE_BY_RCU, so file_ref_put() does not risk a
         * UAF when a file is recycled. It is still vulnerable to UAFs caused
         * by freeing the whole slab page once it becomes unused. Preventing
         * file_ref_put() from being preempted protects against this.
         */
        guard(preempt)();

        /*
         * Decrement unconditionally; the saturation and dead zones provide
         * enough tolerance for this. A negative result means either the
         * last reference was dropped or the counter is inside the
         * saturation or dead zone, and the slow path sorts that out.
         */
        remaining = atomic_long_dec_return(&ref->refcnt);
        if (remaining < 0)
                return __file_ref_put(ref, remaining);

        return false;
}

/**
 * file_ref_put_close - drop a reference that is expected to be the last one
 * @ref:        Pointer to the reference count
 *
 * Semantically equivalent to file_ref_put(). The trade-off is lower
 * performance when other CPUs are modifying the refcount as well, in
 * exchange for higher performance when this really is the last reference.
 *
 * file_ref_put() needs 2 atomics for the last reference: one to drop it and
 * another to move the counter to FILE_REF_DEAD. This routine does both in a
 * single cmpxchg from FILE_REF_ONEREF to FILE_REF_DEAD, but has to pre-read
 * the variable to do so, which decreases scalability.
 *
 * Use with close() et al, stick to file_ref_put() by default.
 *
 * Return: Same as file_ref_put().
 */
static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref)
{
        long seen = atomic_long_read(&ref->refcnt);

        /* Fast path: exactly one reference left and nobody raced with us. */
        if (likely(seen == FILE_REF_ONEREF) &&
            likely(atomic_long_try_cmpxchg(&ref->refcnt, &seen, FILE_REF_DEAD)))
                return true;

        /* Anything else goes through the regular put. */
        return file_ref_put(ref);
}

/**
 * file_ref_read - Read the number of file references
 * @ref: Pointer to the reference count
 *
 * Return: The number of held references (0 ... N). Counter values at or
 *         above FILE_REF_RELEASED read as 0; any other value reads as the
 *         stored value plus one.
 */
static inline unsigned long file_ref_read(file_ref_t *ref)
{
        unsigned long raw = atomic_long_read(&ref->refcnt);

        /* Anything at or above FILE_REF_RELEASED lies in the DEAD zone. */
        if (raw >= FILE_REF_RELEASED)
                return 0;

        return raw + 1;
}

/*
 * __file_ref_read_raw - Fetch the value stored in ref->refcnt as is
 * @ref: Pointer to the reference count
 *
 * Return: The raw counter value, without the zone handling and +1
 * adjustment that file_ref_read() applies.
 *
 * This is a hack for file_needs_f_pos_lock(); you probably want
 * file_ref_read() instead.
 */
static inline unsigned long __file_ref_read_raw(file_ref_t *ref)
{
        unsigned long raw = atomic_long_read(&ref->refcnt);

        return raw;
}

#endif














































































































































































































































































































































































































































































































































































































































































































   22 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  NOHZ implementation for low and high resolution timers
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 */
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/hrtimer.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/percpu.h>
#include <linux/nmi.h>
#include <linux/profile.h>
#include <linux/sched/signal.h>
#include <linux/sched/clock.h>
#include <linux/sched/stat.h>
#include <linux/sched/nohz.h>
#include <linux/sched/loadavg.h>
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>

#include <asm/irq_regs.h>

#include "tick-internal.h"

#include <trace/events/timer.h>

/*
 * Per-CPU nohz control structure. One 'struct tick_sched' instance exists
 * per CPU; tick_get_tick_sched() hands out the instance of a given CPU.
 */
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);

/*
 * Return a pointer to the nohz control structure of @cpu.
 *
 * @cpu: CPU whose 'tick_cpu_sched' instance is requested
 */
struct tick_sched *tick_get_tick_sched(int cpu)
{
        return &per_cpu(tick_cpu_sched, cpu);
}

/*
 * The time when the last jiffy update happened. Write access must hold
 * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a
 * consistent view of jiffies and last_jiffies_update.
 *
 * A value of 0 means that the jiffies update has not been started yet,
 * see tick_init_jiffy_update().
 */
static ktime_t last_jiffies_update;

/*
 * Advance jiffies_64 so that it catches up with @now.
 *
 * @now: current time, compared against 'tick_next_period'
 *
 * Returns early when @now is still before 'tick_next_period'. Otherwise
 * jiffies_64 is advanced by the number of elapsed tick periods,
 * 'last_jiffies_update' and 'tick_next_period' are moved forward
 * accordingly, and calc_global_load() and update_wall_time() are invoked.
 *
 * Must be called with interrupts disabled !
 */
static void tick_do_update_jiffies64(ktime_t now)
{
        /* At least one tick has elapsed once the checks below have failed */
        unsigned long ticks = 1;
        ktime_t delta, nextp;

        /*
         * 64-bit can do a quick check without holding the jiffies lock and
         * without looking at the sequence count. The smp_load_acquire()
         * pairs with the update done later in this function.
         *
         * 32-bit cannot do that because the store of 'tick_next_period'
         * consists of two 32-bit stores, and the first store could be
         * moved by the CPU to a random point in the future.
         */
        if (IS_ENABLED(CONFIG_64BIT)) {
                if (ktime_before(now, smp_load_acquire(&tick_next_period)))
                        return;
        } else {
                unsigned int seq;

                /*
                 * Avoid contention on 'jiffies_lock' and protect the quick
                 * check with the sequence count.
                 */
                do {
                        seq = read_seqcount_begin(&jiffies_seq);
                        nextp = tick_next_period;
                } while (read_seqcount_retry(&jiffies_seq, seq));

                if (ktime_before(now, nextp))
                        return;
        }

        /* Quick check failed, i.e. update is required. */
        raw_spin_lock(&jiffies_lock);
        /*
         * Re-evaluate with the lock held. Another CPU might have done the
         * update already.
         */
        if (ktime_before(now, tick_next_period)) {
                raw_spin_unlock(&jiffies_lock);
                return;
        }

        write_seqcount_begin(&jiffies_seq);

        /* Time that elapsed beyond the point where the next jiffy was due */
        delta = ktime_sub(now, tick_next_period);
        if (unlikely(delta >= TICK_NSEC)) {
                /* Slow path for long idle sleep times */
                s64 incr = TICK_NSEC;

                /* One tick plus every full period contained in 'delta' */
                ticks += ktime_divns(delta, incr);

                last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                                   incr * ticks);
        } else {
                /* Less than one extra period elapsed: account a single tick */
                last_jiffies_update = ktime_add_ns(last_jiffies_update,
                                                   TICK_NSEC);
        }

        /* Advance jiffies to complete the 'jiffies_seq' protected job */
        jiffies_64 += ticks;

        /* Keep the tick_next_period variable up to date */
        nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC);

        if (IS_ENABLED(CONFIG_64BIT)) {
                /*
                 * Pairs with smp_load_acquire() in the lockless quick
                 * check above, and ensures that the update to 'jiffies_64' is
                 * not reordered vs. the store to 'tick_next_period', neither
                 * by the compiler nor by the CPU.
                 */
                smp_store_release(&tick_next_period, nextp);
        } else {
                /*
                 * A plain store is good enough on 32-bit, as the quick check
                 * above is protected by the sequence count.
                 */
                tick_next_period = nextp;
        }

        /*
         * Release the sequence count. calc_global_load() below is not
         * protected by it, but 'jiffies_lock' needs to be held to prevent
         * concurrent invocations.
         */
        write_seqcount_end(&jiffies_seq);

        calc_global_load();

        raw_spin_unlock(&jiffies_lock);
        /* Invoked after 'jiffies_lock' has been dropped */
        update_wall_time();
}

/*
 * Initialize (on first use) and retrieve the time of the last jiffies
 * update.
 *
 * On the first invocation, i.e. while 'last_jiffies_update' is still 0,
 * 'tick_next_period' is rounded up to a multiple of TICK_NSEC and used as
 * the initial value of 'last_jiffies_update'.
 *
 * Return: the current value of 'last_jiffies_update'
 */
static ktime_t tick_init_jiffy_update(void)
{
        ktime_t period;

        /* Writers of 'last_jiffies_update' hold both the lock and the seqcount */
        raw_spin_lock(&jiffies_lock);
        write_seqcount_begin(&jiffies_seq);

        /* Have we started the jiffies update yet ? */
        if (last_jiffies_update == 0) {
                u32 rem;

                /*
                 * Ensure that the tick is aligned to a multiple of
                 * TICK_NSEC.
                 */
                div_u64_rem(tick_next_period, TICK_NSEC, &rem);
                if (rem)
                        tick_next_period += TICK_NSEC - rem;

                last_jiffies_update = tick_next_period;
        }
        period = last_jiffies_update;

        write_seqcount_end(&jiffies_seq);
        raw_spin_unlock(&jiffies_lock);

        return period;
}

/*
 * Test whether any bit of @flag is set in @ts->flags.
 *
 * Return: 1 if at least one of the requested bits is set, 0 otherwise
 */
static inline int tick_sched_flag_test(struct tick_sched *ts,
                                       unsigned long flag)
{
        return (ts->flags & flag) ? 1 : 0;
}

/*
 * Set the bits of @flag in @ts->flags. The update is a plain
 * read-modify-write, hence the caller must have interrupts disabled.
 */
static inline void tick_sched_flag_set(struct tick_sched *ts,
                                       unsigned long flag)
{
        lockdep_assert_irqs_disabled();
        ts->flags = ts->flags | flag;
}

/*
 * Clear the bits of @flag in @ts->flags. The update is a plain
 * read-modify-write, hence the caller must have interrupts disabled.
 */
static inline void tick_sched_flag_clear(struct tick_sched *ts,
                                         unsigned long flag)
{
        lockdep_assert_irqs_disabled();
        ts->flags = ts->flags & ~flag;
}

/*
 * Number of consecutive ticks on which a CPU may observe an unchanged
 * jiffies value before it forces a jiffies update on its own.
 */
#define MAX_STALLED_JIFFIES 5

/*
 * Timekeeping part of the tick: take over the do_timer duty if nobody holds
 * it, update jiffies when this CPU is in charge (or when jiffies appear to
 * be stalled) and record that a tick arrived while the CPU is inside idle.
 *
 * @ts:  tick_sched instance the tick belongs to
 * @now: current time, handed to tick_do_update_jiffies64()
 */
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
        int tick_cpu, cpu = smp_processor_id();

        /*
         * Check if the do_timer duty was dropped. We don't care about
         * concurrency: This happens only when the CPU in charge went
         * into a long sleep. If two CPUs happen to assign themselves to
         * this duty, then the jiffies update is still serialized by
         * 'jiffies_lock'.
         *
         * If nohz_full is enabled, this should not happen because the
         * 'tick_do_timer_cpu' CPU never relinquishes.
         */
        tick_cpu = READ_ONCE(tick_do_timer_cpu);

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && unlikely(tick_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
                WARN_ON_ONCE(tick_nohz_full_running);
#endif
                WRITE_ONCE(tick_do_timer_cpu, cpu);
                tick_cpu = cpu;
        }

        /* Check if jiffies need an update */
        if (tick_cpu == cpu)
                tick_do_update_jiffies64(now);

        /*
         * If the jiffies update stalled for too long (timekeeper in stop_machine()
         * or VMEXIT'ed for several msecs), force an update.
         */
        if (ts->last_tick_jiffies != jiffies) {
                /* jiffies moved since the previous tick: restart the stall count */
                ts->stalled_jiffies = 0;
                ts->last_tick_jiffies = READ_ONCE(jiffies);
        } else {
                if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
                        tick_do_update_jiffies64(now);
                        ts->stalled_jiffies = 0;
                        ts->last_tick_jiffies = READ_ONCE(jiffies);
                }
        }

        /* Note that a tick was handled while TS_FLAG_INIDLE is set */
        if (tick_sched_flag_test(ts, TS_FLAG_INIDLE))
                ts->got_idle_tick = 1;
}

/*
 * Per-tick process accounting and profiling.
 *
 * @ts:   tick_sched instance the tick belongs to
 * @regs: interrupted register state, used to tell user from kernel mode
 */
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
        int user_tick;

        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) &&
            tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                /*
                 * The tick is stopped, so this CPU might not schedule for
                 * a really long time (e.g. a completely idle SMP system
                 * waiting on the login prompt): touch the watchdog.
                 */
                touch_softlockup_watchdog_sched();

                /*
                 * Advance the "start of idle" jiffy stamp so that the idle
                 * accounting adjustment done when going busy again does not
                 * account too many ticks.
                 */
                if (is_idle_task(current))
                        ts->idle_jiffies++;

                /*
                 * The tick may have fired too early relative to its expected
                 * expiration. Forget the cached deadline so that the next
                 * clock reprogramming to the same deadline is not bypassed.
                 */
                ts->next_tick = 0;
        }

        user_tick = user_mode(regs);
        update_process_times(user_tick);
        profile_tick(CPU_PROFILING);
}

/*
 * hrtimer callback of the sched tick. The timer is rearmed for the next
 * period unless the tick has been stopped, in which case reprogramming is
 * left to the idle code or to IRQ exit.
 *
 * Called with interrupts disabled.
 */
static enum hrtimer_restart tick_nohz_handler(struct hrtimer *timer)
{
        struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer);
        struct pt_regs *regs = get_irq_regs();
        ktime_t now = ktime_get();

        tick_sched_do_timer(ts, now);

        /*
         * Without a valid 'regs' pointer we are not in IRQ context, so the
         * per-tick handling must be skipped. Only drop the cached deadline.
         */
        if (!regs)
                ts->next_tick = 0;
        else
                tick_sched_handle(ts, regs);

        /*
         * In dynticks mode, tick reprogram is deferred:
         * - to the idle task if in dynticks-idle
         * - to IRQ exit if in full-dynticks.
         */
        if (unlikely(tick_sched_flag_test(ts, TS_FLAG_STOPPED)))
                return HRTIMER_NORESTART;

        /* Regular case: move the expiry forward by one tick period and rearm */
        hrtimer_forward(timer, now, TICK_NSEC);
        return HRTIMER_RESTART;
}

#ifdef CONFIG_NO_HZ_FULL
/* CPUs running in full dynticks mode; filled in by tick_nohz_full_setup() */
cpumask_var_t tick_nohz_full_mask;
EXPORT_SYMBOL_GPL(tick_nohz_full_mask);
/*
 * Set by tick_nohz_full_setup(); cleared again by tick_nohz_init() when the
 * architecture cannot raise IRQ work interrupts.
 */
bool tick_nohz_full_running;
EXPORT_SYMBOL_GPL(tick_nohz_full_running);
/* Global tick dependency mask, see tick_nohz_dep_set() and tick_nohz_dep_clear() */
static atomic_t tick_dep_mask;

/*
 * Check whether a tick dependency mask has any known dependency bit set.
 *
 * @dep: dependency mask to inspect
 *
 * The bits are examined in a fixed priority order and only the first match
 * is reported through the tick_stop trace event.
 *
 * Return: true if a dependency was found (the tick must be kept), else false
 */
static bool check_tick_dependency(atomic_t *dep)
{
        int val = atomic_read(dep);
        int hit;

        if (val & TICK_DEP_MASK_POSIX_TIMER)
                hit = TICK_DEP_MASK_POSIX_TIMER;
        else if (val & TICK_DEP_MASK_PERF_EVENTS)
                hit = TICK_DEP_MASK_PERF_EVENTS;
        else if (val & TICK_DEP_MASK_SCHED)
                hit = TICK_DEP_MASK_SCHED;
        else if (val & TICK_DEP_MASK_CLOCK_UNSTABLE)
                hit = TICK_DEP_MASK_CLOCK_UNSTABLE;
        else if (val & TICK_DEP_MASK_RCU)
                hit = TICK_DEP_MASK_RCU;
        else if (val & TICK_DEP_MASK_RCU_EXP)
                hit = TICK_DEP_MASK_RCU_EXP;
        else
                return false;

        /* Report which dependency prevented the tick from being stopped */
        trace_tick_stop(0, hit);
        return true;
}

/*
 * Decide whether the tick may be stopped on a full dynticks CPU.
 *
 * @cpu: CPU to check
 * @ts:  tick_sched instance whose per-CPU dependency mask is checked
 *
 * Must be called with interrupts disabled.
 *
 * Return: false if @cpu is offline or any tick dependency is set, else true
 */
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
        lockdep_assert_irqs_disabled();

        if (unlikely(!cpu_online(cpu)))
                return false;

        /*
         * Dependencies are checked in the order global, per-CPU, per-task,
         * per-process. The evaluation stops at the first one that is set.
         */
        return !(check_tick_dependency(&tick_dep_mask) ||
                 check_tick_dependency(&ts->tick_dep_mask) ||
                 check_tick_dependency(&current->tick_dep_mask) ||
                 check_tick_dependency(&current->signal->tick_dep_mask));
}

/*
 * IRQ work callback used to kick a full dynticks CPU. The callback itself
 * has nothing to do.
 *
 * @work: the IRQ work item (unused)
 */
static void nohz_full_kick_func(struct irq_work *work)
{
        /* Empty, the tick restart happens on tick_nohz_irq_exit() */
}

/* Per-CPU IRQ work item queued by tick_nohz_full_kick() and tick_nohz_full_kick_cpu() */
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) =
        IRQ_WORK_INIT_HARD(nohz_full_kick_func);

/*
 * Kick the current CPU, provided it is a full dynticks CPU, so that it
 * re-evaluates its dependency on the tick and restarts it if necessary.
 *
 * Unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), this
 * kick is NMI safe.
 */
static void tick_nohz_full_kick(void)
{
        if (tick_nohz_full_cpu(smp_processor_id()))
                irq_work_queue(this_cpu_ptr(&nohz_full_kick_work));
}

/*
 * Kick @cpu, provided it is a full dynticks CPU, so that it re-evaluates
 * its dependency on the tick and restarts it if necessary.
 *
 * @cpu: CPU to kick; nothing happens if it is not a full dynticks CPU
 */
void tick_nohz_full_kick_cpu(int cpu)
{
        if (tick_nohz_full_cpu(cpu))
                irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}

/*
 * Kick the CPU a task is assigned to so that the CPU re-evaluates its tick
 * dependencies. Nothing is done when the task is not on a runqueue or when
 * its CPU is offline.
 *
 * @tsk: task whose CPU should be kicked
 */
static void tick_nohz_kick_task(struct task_struct *tsk)
{
        int cpu;

        /*
         * If the task is not running, run_posix_cpu_timers()
         * has nothing to elapse, and an IPI can then be optimized out.
         *
         * activate_task()                      STORE p->tick_dep_mask
         *   STORE p->on_rq
         * __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
         *   LOCK rq->lock                      LOAD p->on_rq
         *   smp_mb__after_spin_lock()
         *   tick_nohz_task_switch()
         *     LOAD p->tick_dep_mask
         *
         * XXX given a task picks up the dependency on schedule(), should we
         * only care about tasks that are currently on the CPU instead of all
         * that are on the runqueue?
         *
         * That is, does this want to be: task_on_cpu() / task_curr()?
         */
        if (!sched_task_on_rq(tsk))
                return;

        /*
         * If the task concurrently migrates to another CPU,
         * we guarantee it sees the new tick dependency upon
         * schedule.
         *
         * set_task_cpu(p, cpu);
         *   STORE p->cpu = @cpu
         * __schedule() (switch to task 'p')
         *   LOCK rq->lock
         *   smp_mb__after_spin_lock()          STORE p->tick_dep_mask
         *   tick_nohz_task_switch()            smp_mb() (atomic_fetch_or())
         *      LOAD p->tick_dep_mask           LOAD p->cpu
         */
        cpu = task_cpu(tsk);

        /* The online check and the kick are done with preemption disabled */
        preempt_disable();
        if (cpu_online(cpu))
                tick_nohz_full_kick_cpu(cpu);
        preempt_enable();
}

/*
 * Kick every online full dynticks CPU so that each of them re-evaluates
 * its dependency on the tick and restarts it if necessary. Does nothing
 * when full dynticks is not running.
 */
static void tick_nohz_full_kick_all(void)
{
        int cpu;

        if (tick_nohz_full_running) {
                preempt_disable();
                for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
                        tick_nohz_full_kick_cpu(cpu);
                preempt_enable();
        }
}

/*
 * Set dependency @bit in @dep and kick all full dynticks CPUs if the mask
 * was completely empty before.
 *
 * @dep: dependency mask to update
 * @bit: dependency bit number to set
 */
static void tick_nohz_dep_set_all(atomic_t *dep,
                                  enum tick_dep_bits bit)
{
        /* A previously non-empty mask means the tick is already being kept */
        if (atomic_fetch_or(BIT(bit), dep) == 0)
                tick_nohz_full_kick_all();
}

/*
 * Set a global tick dependency. Used by perf events that rely on freq and
 * unstable clocks.
 *
 * @bit: dependency bit number to set in the global mask
 *
 * Full dynticks CPUs are kicked by tick_nohz_dep_set_all() when the global
 * mask was empty before.
 */
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
        tick_nohz_dep_set_all(&tick_dep_mask, bit);
}

/*
 * Clear a global tick dependency. Only the bit in the global mask is
 * cleared; no CPU is kicked.
 *
 * @bit: dependency bit number to clear
 */
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &tick_dep_mask);
}

/*
 * Set a per-CPU tick dependency. Used by the scheduler and by perf events
 * in order to manage event-throttling.
 *
 * @cpu: CPU whose dependency mask is updated
 * @bit: dependency bit number to set
 *
 * The CPU is kicked only when its mask was empty before.
 */
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        /* Mask was already non-empty: the tick is being kept, no kick needed */
        if (atomic_fetch_or(BIT(bit), &ts->tick_dep_mask))
                return;

        preempt_disable();
        if (cpu == smp_processor_id()) {
                /* Perf needs local kick that is NMI safe */
                tick_nohz_full_kick();
        } else if (!WARN_ON_ONCE(in_nmi())) {
                /* Remote IRQ work not NMI-safe */
                tick_nohz_full_kick_cpu(cpu);
        }
        preempt_enable();
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_cpu);

/*
 * Clear a per-CPU tick dependency. Only the bit in the CPU's mask is
 * cleared; the CPU is not kicked.
 *
 * @cpu: CPU whose dependency mask is updated
 * @bit: dependency bit number to clear
 */
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        atomic_andnot(BIT(bit), &ts->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);

/*
 * Set a per-task tick dependency. RCU needs this, and so do posix CPU
 * timers in order to elapse per task timers.
 *
 * @tsk: task whose dependency mask is updated
 * @bit: dependency bit number to set
 *
 * The task's CPU is kicked only when the task's mask was empty before.
 */
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
        int prev = atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask);

        if (prev)
                return;

        tick_nohz_kick_task(tsk);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

/*
 * Clear a per-task tick dependency. Only the bit in the task's mask is
 * cleared; no CPU is kicked.
 *
 * @tsk: task whose dependency mask is updated
 * @bit: dependency bit number to clear
 */
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &tsk->tick_dep_mask);
}
EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_task);

/*
 * Set a per-taskgroup tick dependency. Posix CPU timers need this in order
 * to elapse per process timers.
 *
 * @tsk: a task of the process; the mask lives in @tsk->signal
 * @bit: dependency bit number to set
 *
 * When the mask was empty before, every thread of the process is kicked;
 * that walk expects @tsk->sighand->siglock to be held.
 */
void tick_nohz_dep_set_signal(struct task_struct *tsk,
                              enum tick_dep_bits bit)
{
        struct signal_struct *sig = tsk->signal;
        struct task_struct *t;

        /* Mask was already non-empty: the threads were kicked before */
        if (atomic_fetch_or(BIT(bit), &sig->tick_dep_mask))
                return;

        lockdep_assert_held(&tsk->sighand->siglock);
        __for_each_thread(sig, t)
                tick_nohz_kick_task(t);
}

/*
 * Clear a per-taskgroup tick dependency. Only the bit in the mask of @sig
 * is cleared; no CPU is kicked.
 *
 * @sig: signal struct whose dependency mask is updated
 * @bit: dependency bit number to clear
 */
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
        atomic_andnot(BIT(bit), &sig->tick_dep_mask);
}

/*
 * Re-evaluate the need for the tick as the current task is switched in.
 * The new task might need the tick due to per task/process properties:
 * perf events, posix CPU timers, ...
 *
 * Only relevant on a full dynticks CPU whose tick is currently stopped.
 */
void __tick_nohz_task_switch(void)
{
        struct tick_sched *ts;

        if (!tick_nohz_full_cpu(smp_processor_id()))
                return;

        ts = this_cpu_ptr(&tick_cpu_sched);
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        /* Kick ourselves if the task or its process carries a dependency */
        if (atomic_read(&current->tick_dep_mask) ||
            atomic_read(&current->signal->tick_dep_mask))
                tick_nohz_full_kick();
}

/*
 * Get the boot-time nohz CPU list from the kernel parameters.
 *
 * @cpumask: CPUs requested to run in full dynticks mode
 *
 * Allocates 'tick_nohz_full_mask' from bootmem, copies @cpumask into it and
 * marks full dynticks as running.
 */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
        alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
        cpumask_copy(tick_nohz_full_mask, cpumask);
        tick_nohz_full_running = true;
}

/*
 * Tell whether @cpu may be taken offline.
 *
 * The 'tick_do_timer_cpu' CPU handles housekeeping duty (unbound timers,
 * workqueues, timekeeping, ...) on behalf of full dynticks CPUs. It must
 * remain online when nohz full is enabled.
 *
 * Return: false if nohz full is running and @cpu is the
 * 'tick_do_timer_cpu' CPU, true otherwise
 */
bool tick_nohz_cpu_hotpluggable(unsigned int cpu)
{
        return !(tick_nohz_full_running && READ_ONCE(tick_do_timer_cpu) == cpu);
}

/*
 * CPU hotplug teardown callback registered by tick_nohz_init().
 *
 * Return: 0 if @cpu may go offline, -EBUSY if it must stay online
 */
static int tick_nohz_cpu_down(unsigned int cpu)
{
        if (!tick_nohz_cpu_hotpluggable(cpu))
                return -EBUSY;

        return 0;
}

/*
 * Boot-time initialization of full dynticks mode. Does nothing unless a
 * nohz_full CPU mask has been set up by tick_nohz_full_setup().
 */
void __init tick_nohz_init(void)
{
        int cpu, ret;

        if (!tick_nohz_full_running)
                return;

        /*
         * Full dynticks relies on IRQ work to drive the tick rescheduling
         * from safe locking contexts. IRQ work in turn must be able to raise
         * its own interrupt, otherwise it would depend on the tick itself.
         * Give up on full dynticks if the architecture can't do that.
         */
        if (!arch_irq_work_has_interrupt()) {
                pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support IRQ work self-IPIs\n");
                cpumask_clear(tick_nohz_full_mask);
                tick_nohz_full_running = false;
                return;
        }

        /*
         * With this configuration the current (boot) CPU is taken out of
         * the nohz_full range so that it can do the timekeeping.
         */
        if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
            !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
                cpu = smp_processor_id();

                if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
                        pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n",
                                cpu);
                        cpumask_clear_cpu(cpu, tick_nohz_full_mask);
                }
        }

        /* Enable context tracking on every remaining full dynticks CPU */
        for_each_cpu(cpu, tick_nohz_full_mask)
                ct_cpu_track_user(cpu);

        /* Lets tick_nohz_cpu_down() veto offlining of the timekeeping CPU */
        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                        "kernel/nohz:predown", NULL,
                                        tick_nohz_cpu_down);
        WARN_ON(ret < 0);

        pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
                cpumask_pr_args(tick_nohz_full_mask));
}
#endif /* #ifdef CONFIG_NO_HZ_FULL */

/*
 * NOHZ - aka dynamic tick functionality
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * NO HZ enabled ? Defaults to true and can be overridden with the "nohz="
 * kernel command line parameter, see setup_tick_nohz().
 */
bool tick_nohz_enabled __read_mostly  = true;
/* Non-zero when NOHZ is in use; get_cpu_sleep_time_us() returns -1 while it is zero */
unsigned long tick_nohz_active  __read_mostly;
/*
 * Enable / Disable tickless mode via the "nohz=" kernel parameter.
 *
 * @str: parameter value, parsed as a boolean into 'tick_nohz_enabled'
 *
 * Return: 1 if @str was parsed successfully, 0 otherwise
 */
static int __init setup_tick_nohz(char *str)
{
        int err = kstrtobool(str, &tick_nohz_enabled);

        return err == 0;
}

__setup("nohz=", setup_tick_nohz);

bool tick_nohz_tick_stopped(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

bool tick_nohz_tick_stopped_cpu(int cpu)
{
        struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);

        return tick_sched_flag_test(ts, TS_FLAG_STOPPED);
}

/**
 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
 * @now: current ktime_t
 *
 * Called from interrupt entry when the CPU was idle
 *
 * In case the sched_tick was stopped on this CPU, we have to check if jiffies
 * must be updated. Otherwise an interrupt handler could use a stale jiffy
 * value. We do this unconditionally on any CPU, as we don't know whether the
 * CPU, which has the update task assigned, is in a long sleep.
 */
static void tick_nohz_update_jiffies(ktime_t now)
{
        unsigned long flags;

        /* Remember when this CPU was woken from idle */
        __this_cpu_write(tick_cpu_sched.idle_waketime, now);

        /* The jiffies update runs with interrupts disabled */
        local_irq_save(flags);
        tick_do_update_jiffies64(now);
        local_irq_restore(flags);

        touch_softlockup_watchdog_sched();
}

/*
 * End the current idle period: account the time since idle entry to either
 * the iowait or the idle sleep time and clear TS_FLAG_IDLE_ACTIVE.
 *
 * @ts:  tick_sched instance to update
 * @now: timestamp that terminates the idle period
 *
 * Warns once and does nothing if no idle period is active.
 */
static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now)
{
        ktime_t slept, *bucket;

        if (WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE)))
                return;

        slept = ktime_sub(now, ts->idle_entrytime);

        /* Readers in get_cpu_sleep_time_us() retry across this section */
        write_seqcount_begin(&ts->idle_sleeptime_seq);

        /* Time slept while tasks wait for I/O is accounted separately */
        bucket = nr_iowait_cpu(smp_processor_id()) > 0 ?
                        &ts->iowait_sleeptime : &ts->idle_sleeptime;
        *bucket = ktime_add(*bucket, slept);

        ts->idle_entrytime = now;
        tick_sched_flag_clear(ts, TS_FLAG_IDLE_ACTIVE);
        write_seqcount_end(&ts->idle_sleeptime_seq);

        sched_clock_idle_wakeup_event();
}

/*
 * Begin an idle period: record the entry timestamp and set
 * TS_FLAG_IDLE_ACTIVE.
 *
 * @ts: tick_sched instance to update
 */
static void tick_nohz_start_idle(struct tick_sched *ts)
{
        /* Timestamp and flag change together under the sleeptime seqcount */
        write_seqcount_begin(&ts->idle_sleeptime_seq);
        ts->idle_entrytime = ktime_get();
        tick_sched_flag_set(ts, TS_FLAG_IDLE_ACTIVE);
        write_seqcount_end(&ts->idle_sleeptime_seq);

        sched_clock_idle_sleep_event();
}

/*
 * Common helper to read an accumulated sleep time in microseconds.
 *
 * @ts:               tick_sched instance to read from
 * @sleeptime:        accumulated sleep time counter to report
 * @compute_delta:    if true and an idle period is active, add the time
 *                    elapsed since idle entry
 * @last_update_time: if not NULL, receives the current time in microseconds
 *
 * Return: -1 if NOHZ is not active, else the sleep time in microseconds
 */
static u64 get_cpu_sleep_time_us(struct tick_sched *ts, ktime_t *sleeptime,
                                 bool compute_delta, u64 *last_update_time)
{
        ktime_t now, idle;
        unsigned int seq;

        if (!tick_nohz_active)
                return -1;

        now = ktime_get();
        if (last_update_time)
                *last_update_time = ktime_to_us(now);

        /* Retry until a snapshot is read that no writer interfered with */
        do {
                seq = read_seqcount_begin(&ts->idle_sleeptime_seq);

                idle = *sleeptime;
                if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE) && compute_delta)
                        idle = ktime_add(idle, ktime_sub(now, ts->idle_entrytime));
        } while (read_seqcount_retry(&ts->idle_sleeptime_seq, seq));

        return ktime_to_us(idle);
}

/**
 * get_cpu_idle_time_us - get the total idle time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative idle time (since boot) for a given
 * CPU, in microseconds. Note that this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total idle time of the @cpu
 */
u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
{
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        /* An ongoing idle period counts as idle only without iowait tasks */
        bool compute_delta = !nr_iowait_cpu(cpu);

        return get_cpu_sleep_time_us(ts, &ts->idle_sleeptime,
                                     compute_delta, last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);

/**
 * get_cpu_iowait_time_us - get the total iowait time of a CPU
 * @cpu: CPU number to query
 * @last_update_time: variable to store update time in. Do not update
 * counters if NULL.
 *
 * Return the cumulative iowait time (since boot) for a given
 * CPU, in microseconds. Note this is partially broken due to
 * the counter of iowait tasks that can be remotely updated without
 * any synchronization. Therefore it is possible to observe backward
 * values within two consecutive reads.
 *
 * This time is measured via accounting rather than sampling,
 * and is as accurate as ktime_get() is.
 *
 * Return: -1 if NOHZ is not enabled, else total iowait time of @cpu
 */
u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
{
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        /* An ongoing idle period counts as iowait only with iowait tasks */
        bool compute_delta = nr_iowait_cpu(cpu);

        return get_cpu_sleep_time_us(ts, &ts->iowait_sleeptime,
                                     compute_delta, last_update_time);
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);

/*
 * Restart the sched tick timer, starting from the last tick and forwarded
 * to the first period boundary after @now.
 *
 * @ts:  tick_sched instance whose timer is restarted
 * @now: current time
 */
static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
{
        struct hrtimer *timer = &ts->sched_timer;

        hrtimer_cancel(timer);
        hrtimer_set_expires(timer, ts->last_tick);

        /* Forward the time to expire in the future */
        hrtimer_forward(timer, now, TICK_NSEC);

        /* Without TS_FLAG_HIGHRES the clock event is programmed directly */
        if (!tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                tick_program_event(hrtimer_get_expires(timer), 1);
        else
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);

        /*
         * Reset to make sure the next tick stop doesn't get fooled by past
         * cached clock deadline.
         */
        ts->next_tick = 0;
}

/*
 * Tell whether the timer softirq is pending on the current CPU.
 *
 * Return: true if the TIMER_SOFTIRQ bit is set in local_timers_pending()
 */
static inline bool local_timer_softirq_pending(void)
{
        return !!(local_timers_pending() & BIT(TIMER_SOFTIRQ));
}

/*
 * Read jiffies and the time when jiffies were updated last
 */
u64 get_jiffies_update(unsigned long *basej)
{
        unsigned long basejiff;
        unsigned int seq;
        u64 basemono;

        do {
                seq = read_seqcount_begin(&jiffies_seq);
                basemono = last_jiffies_update;
                basejiff = jiffies;
        } while (read_seqcount_retry(&jiffies_seq, seq));
        *basej = basejiff;
        return basemono;
}

/**
 * tick_nohz_next_event() - return the clock monotonic based next event
 * @ts:         pointer to tick_sched struct
 * @cpu:        CPU number
 *
 * The jiffies snapshot used for the calculation is cached in
 * @ts->last_jiffies and @ts->timer_expires_base, and the result is stored
 * in @ts->timer_expires.
 *
 * Return:
 * *%0          - When the next event is a maximum of TICK_NSEC in the future
 *                and the tick is not stopped yet
 * *%next_event - Next event based on clock monotonic
 */
static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
{
        u64 basemono, next_tick, delta, expires;
        unsigned long basejiff;
        int tick_cpu;

        /* Snapshot jiffies and the matching update time, cache both in @ts */
        basemono = get_jiffies_update(&basejiff);
        ts->last_jiffies = basejiff;
        ts->timer_expires_base = basemono;

        /*
         * Keep the periodic tick, when RCU, architecture or irq_work
         * requests it.
         * Aside of that, check whether the local timer softirq is
         * pending. If so, its a bad idea to call get_next_timer_interrupt(),
         * because there is an already expired timer, so it will request
         * immediate expiry, which rearms the hardware timer with a
         * minimal delta, which brings us back to this place
         * immediately. Lather, rinse and repeat...
         */
        if (rcu_needs_cpu() || arch_needs_cpu() ||
            irq_work_needs_cpu() || local_timer_softirq_pending()) {
                next_tick = basemono + TICK_NSEC;
        } else {
                /*
                 * Get the next pending timer. If high resolution
                 * timers are enabled this only takes the timer wheel
                 * timers into account. If high resolution timers are
                 * disabled this also looks at the next expiring
                 * hrtimer.
                 */
                next_tick = get_next_timer_interrupt(basejiff, basemono);
                ts->next_timer = next_tick;
        }

        /* Make sure next_tick is never before basemono! */
        if (WARN_ON_ONCE(basemono > next_tick))
                next_tick = basemono;

        /*
         * If the tick is due in the next period, keep it ticking or
         * force prod the timer.
         */
        delta = next_tick - basemono;
        if (delta <= (u64)TICK_NSEC) {
                /*
                 * We've not stopped the tick yet, and there's a timer in the
                 * next period, so no point in stopping it either, bail.
                 */
                if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                        ts->timer_expires = 0;
                        goto out;
                }
        }

        /*
         * If this CPU is the one which had the do_timer() duty last, we limit
         * the sleep time to the timekeeping 'max_deferment' value.
         * Otherwise we can sleep as long as we want.
         */
        delta = timekeeping_max_deferment();
        tick_cpu = READ_ONCE(tick_do_timer_cpu);
        if (tick_cpu != cpu &&
            (tick_cpu != TICK_DO_TIMER_NONE || !tick_sched_flag_test(ts, TS_FLAG_DO_TIMER_LAST)))
                delta = KTIME_MAX;

        /* Calculate the next expiry time, clamped to KTIME_MAX */
        if (delta < (KTIME_MAX - basemono))
                expires = basemono + delta;
        else
                expires = KTIME_MAX;

        /* The earlier of the deferment limit and the next timer wins */
        ts->timer_expires = min_t(u64, expires, next_tick);

out:
        return ts->timer_expires;
}

/*
 * tick_nohz_stop_tick - defer or stop the tick using the precomputed expiry
 * @ts:  tick_sched state to operate on
 * @cpu: CPU number, compared against tick_do_timer_cpu
 *
 * Consumes the values cached in @ts (timer_expires_base, timer_expires and
 * last_jiffies) and then:
 *  - tries to mark the timer base idle and returns early if that fails,
 *  - gives up the do_timer() duty if this CPU currently holds it,
 *  - skips reprogramming when the tick is already stopped with the same expiry,
 *  - on the first stop records ts->last_tick and sets TS_FLAG_STOPPED,
 *  - finally cancels/parks the tick (KTIME_MAX) or programs it to the expiry.
 */
static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        unsigned long basejiff = ts->last_jiffies;
        u64 basemono = ts->timer_expires_base;
        /* In/out for timer_base_try_to_set_idle(); starts as "tick already stopped". */
        bool timer_idle = tick_sched_flag_test(ts, TS_FLAG_STOPPED);
        int tick_cpu;
        u64 expires;

        /* Make sure we won't be trying to stop it twice in a row. */
        ts->timer_expires_base = 0;

        /*
         * Now the tick should be stopped definitely - so the timer base needs
         * to be marked idle as well to not miss a newly queued timer.
         */
        expires = timer_base_try_to_set_idle(basejiff, basemono, &timer_idle);
        if (expires > ts->timer_expires) {
                /*
                 * This path could only happen when the first timer was removed
                 * between calculating the possible sleep length and now (when
                 * high resolution mode is not active, timer could also be a
                 * hrtimer).
                 *
                 * We have to stick to the original calculated expiry value to
                 * not stop the tick for too long with a shallow C-state (which
                 * was programmed by cpuidle because of an early next expiration
                 * value).
                 */
                expires = ts->timer_expires;
        }

        /* If the timer base is not idle, retain the not yet stopped tick. */
        if (!timer_idle)
                return;

        /*
         * If this CPU is the one which updates jiffies, then give up
         * the assignment and let it be taken by the CPU which runs
         * the tick timer next, which might be this CPU as well. If we
         * don't drop this here, the jiffies might be stale and
         * do_timer() never gets invoked. Keep track of the fact that it
         * was the one which had the do_timer() duty last.
         */
        tick_cpu = READ_ONCE(tick_do_timer_cpu);
        if (tick_cpu == cpu) {
                WRITE_ONCE(tick_do_timer_cpu, TICK_DO_TIMER_NONE);
                tick_sched_flag_set(ts, TS_FLAG_DO_TIMER_LAST);
        } else if (tick_cpu != TICK_DO_TIMER_NONE) {
                tick_sched_flag_clear(ts, TS_FLAG_DO_TIMER_LAST);
        }

        /* Skip reprogram of event if it's not changed */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED) && (expires == ts->next_tick)) {
                /* Sanity check: make sure clockevent is actually programmed */
                if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
                        return;

                /* Bookkeeping disagrees with the timer: warn once, then fall through and reprogram. */
                WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
                          "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
                          dev->next_event, hrtimer_active(&ts->sched_timer),
                          hrtimer_get_expires(&ts->sched_timer));
        }

        /*
         * tick_nohz_stop_tick() can be called several times before
         * tick_nohz_restart_sched_tick() is called. This happens when
         * interrupts arrive which do not cause a reschedule. In the first
         * call we save the current tick time, so we can restart the
         * scheduler tick in tick_nohz_restart_sched_tick().
         */
        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED)) {
                calc_load_nohz_start();
                quiet_vmstat();

                ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
                tick_sched_flag_set(ts, TS_FLAG_STOPPED);
                trace_tick_stop(1, TICK_DEP_MASK_NONE);
        }

        /* Remember what is being programmed; compared against on the next call. */
        ts->next_tick = expires;

        /*
         * If the expiration time == KTIME_MAX, then we simply stop
         * the tick timer.
         */
        if (unlikely(expires == KTIME_MAX)) {
                if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                        hrtimer_cancel(&ts->sched_timer);
                else
                        tick_program_event(KTIME_MAX, 1);
                return;
        }

        /* Highres: (re)arm the hrtimer. Otherwise: record the expiry and program the clockevent directly. */
        if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES)) {
                hrtimer_start(&ts->sched_timer, expires,
                              HRTIMER_MODE_ABS_PINNED_HARD);
        } else {
                hrtimer_set_expires(&ts->sched_timer, expires);
                tick_program_event(expires, 1);
        }
}

/*
 * Keep the tick as it is: only invalidate the cached expiry base, so that a
 * later tick_nohz_idle_stop_tick() does not reuse a stale ts->timer_expires.
 */
static void tick_nohz_retain_tick(struct tick_sched *ts)
{
        ts->timer_expires_base = 0;
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Re-evaluate the next event for a nohz_full CPU. A zero result from
 * tick_nohz_next_event() means the tick has to be kept; any other value
 * lets tick_nohz_stop_tick() stop or re-program it.
 */
static void tick_nohz_full_stop_tick(struct tick_sched *ts, int cpu)
{
        if (!tick_nohz_next_event(ts, cpu)) {
                tick_nohz_retain_tick(ts);
                return;
        }

        tick_nohz_stop_tick(ts, cpu);
}
#endif /* CONFIG_NO_HZ_FULL */

/*
 * tick_nohz_restart_sched_tick - undo a previous tick stop
 * @ts:  tick_sched state of the CPU whose tick is restarted
 * @now: timestamp used for the jiffies update and for restarting the tick
 *
 * Clears TS_FLAG_STOPPED. The callers visible in this file only invoke it
 * after having observed that flag as set.
 */
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
        /* Update jiffies first */
        tick_do_update_jiffies64(now);

        /*
         * Clear the timer idle flag, so we avoid IPIs on remote queueing and
         * the clock forward checks in the enqueue path:
         */
        timer_clear_idle();

        /* Counterpart of calc_load_nohz_start() done when the tick was stopped. */
        calc_load_nohz_stop();
        touch_softlockup_watchdog_sched();

        /* Cancel the scheduled timer and restore the tick: */
        tick_sched_flag_clear(ts, TS_FLAG_STOPPED);
        tick_nohz_restart(ts, now);
}

/*
 * Re-evaluate the tick of a nohz_full CPU: stop/re-program it when that is
 * permitted, otherwise restart it if it is currently stopped. Compiles to an
 * empty function without CONFIG_NO_HZ_FULL.
 */
static void __tick_nohz_full_update_tick(struct tick_sched *ts,
                                         ktime_t now)
{
#ifdef CONFIG_NO_HZ_FULL
        int this_cpu = smp_processor_id();

        if (can_stop_full_tick(this_cpu, ts)) {
                tick_nohz_full_stop_tick(ts, this_cpu);
                return;
        }

        /* The tick is needed: bring it back only if it was stopped. */
        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                tick_nohz_restart_sched_tick(ts, now);
#endif
}

/*
 * Update the tick of the current CPU, but only if it is a nohz_full CPU and
 * NOHZ mode has been activated for it (TS_FLAG_NOHZ).
 */
static void tick_nohz_full_update_tick(struct tick_sched *ts)
{
        if (tick_nohz_full_cpu(smp_processor_id()) &&
            tick_sched_flag_test(ts, TS_FLAG_NOHZ))
                __tick_nohz_full_update_tick(ts, ktime_get());
}

/*
 * A pending softirq outside an IRQ (or softirq disabled section) context
 * should be waiting for ksoftirqd to handle it. Therefore we shouldn't
 * reach this code due to the need_resched() early check in can_stop_idle_tick().
 *
 * However if we are between CPUHP_AP_SMPBOOT_THREADS and CPU_TEARDOWN_CPU on the
 * cpu_down() process, softirqs can still be raised while ksoftirqd is parked,
 * triggering the code below, since wakep_softirqd() is ignored.
 *
 */
static bool report_idle_softirq(void)
{
        /* Number of warnings printed so far; capped at 10. */
        static int ratelimit;
        unsigned int softirqs = local_softirq_pending();

        /* Common case: nothing pending, nothing to report. */
        if (likely(!softirqs))
                return false;

        /*
         * Some softirqs claim to be safe against hotplug and ksoftirqd
         * parking: ignore those while this CPU is not active.
         */
        if (!cpu_active(smp_processor_id()))
                softirqs &= ~SOFTIRQ_HOTPLUG_SAFE_MASK;
        if (!softirqs)
                return false;

        /*
         * Stay quiet once the warning budget is exhausted. On RT, softirq
         * handling may be waiting on some lock, which is not an error either.
         */
        if (ratelimit >= 10 || local_bh_blocked())
                return false;

        pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n",
                softirqs);
        ratelimit++;

        return true;
}

/*
 * Decide whether the idle tick may be stopped on @cpu.
 *
 * Returns false when NOHZ is not active for @ts, a reschedule is pending,
 * a pending softirq was reported, or - with nohz_full enabled - when @cpu
 * holds the timekeeping duty or nobody holds it.
 */
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
        int tick_cpu;

        WARN_ON_ONCE(cpu_is_offline(cpu));

        if (unlikely(!tick_sched_flag_test(ts, TS_FLAG_NOHZ)))
                return false;

        if (need_resched())
                return false;

        if (unlikely(report_idle_softirq()))
                return false;

        /* Without full dynticks there are no further restrictions. */
        if (!tick_nohz_full_enabled())
                return true;

        tick_cpu = READ_ONCE(tick_do_timer_cpu);

        /*
         * Keep the tick alive to guarantee timekeeping progression
         * if there are full dynticks CPUs around
         */
        if (tick_cpu == cpu)
                return false;

        /* Should not happen for nohz-full */
        if (WARN_ON_ONCE(tick_cpu == TICK_DO_TIMER_NONE))
                return false;

        return true;
}

/**
 * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 */
void tick_nohz_idle_stop_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        int cpu = smp_processor_id();
        ktime_t expires;
        int was_stopped;

        if (ts->timer_expires_base) {
                /*
                 * tick_nohz_get_sleep_length() already ran
                 * tick_nohz_next_event(), so the expiry is known.
                 */
                expires = ts->timer_expires;
        } else {
                if (!can_stop_idle_tick(cpu, ts))
                        return;
                expires = tick_nohz_next_event(ts, cpu);
        }

        ts->idle_calls++;

        /* No usable expiry: leave the tick running. */
        if (expires <= 0LL) {
                tick_nohz_retain_tick(ts);
                return;
        }

        was_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

        tick_nohz_stop_tick(ts, cpu);

        ts->idle_sleeps++;
        ts->idle_expires = expires;

        /* Only a not-stopped -> stopped transition needs the bookkeeping below. */
        if (was_stopped || !tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        ts->idle_jiffies = ts->last_jiffies;
        nohz_balance_enter_idle(cpu);
}

void tick_nohz_idle_retain_tick(void)
{
        tick_nohz_retain_tick(this_cpu_ptr(&tick_cpu_sched));
}

/**
 * tick_nohz_idle_enter - prepare for entering idle on the current CPU
 *
 * Called when we start the idle loop.
 */
void tick_nohz_idle_enter(void)
{
        struct tick_sched *ts;

        /* Must be entered with interrupts enabled; they are disabled only locally below. */
        lockdep_assert_irqs_enabled();

        local_irq_disable();

        /* Fetch the per-CPU state only after interrupts are off. */
        ts = this_cpu_ptr(&tick_cpu_sched);

        /* A cached expiry base left over at idle entry is unexpected. */
        WARN_ON_ONCE(ts->timer_expires_base);

        /* Mark the CPU as being in the idle loop and start idle accounting. */
        tick_sched_flag_set(ts, TS_FLAG_INIDLE);
        tick_nohz_start_idle(ts);

        local_irq_enable();
}

/**
 * tick_nohz_irq_exit - Notify the tick about IRQ exit
 *
 * A timer may have been added/modified/deleted either by the current IRQ,
 * or by another place using this IRQ as a notification. This IRQ may have
 * also updated the RCU callback list. These events may require a
 * re-evaluation of the next tick. Depending on the context:
 *
 * 1) If the CPU is idle and no resched is pending, just proceed with idle
 *    time accounting. The next tick will be re-evaluated on the next idle
 *    loop iteration.
 *
 * 2) If the CPU is nohz_full:
 *
 *    2.1) If there is any tick dependency, restart the tick if stopped.
 *
 *    2.2) If there is no tick dependency, (re-)evaluate the next tick and
 *         stop/update it accordingly.
 */
void tick_nohz_irq_exit(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* Not in the idle loop: only nohz_full CPUs need a re-evaluation. */
        if (!tick_sched_flag_test(ts, TS_FLAG_INIDLE)) {
                tick_nohz_full_update_tick(ts);
                return;
        }

        /* Idle: just proceed with idle time accounting. */
        tick_nohz_start_idle(ts);
}

/**
 * tick_nohz_idle_got_tick - Check whether or not the tick handler has run
 *
 * Return: %true if the tick handler has run, otherwise %false
 */
bool tick_nohz_idle_got_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        if (!ts->got_idle_tick)
                return false;

        /* Consume the indication so that it is reported only once. */
        ts->got_idle_tick = 0;
        return true;
}

/**
 * tick_nohz_get_next_hrtimer - return the next expiration time for the hrtimer
 * or the tick, whichever expires first. Note that, if the tick has been
 * stopped, it returns the next hrtimer.
 *
 * Called from power state control code with interrupts disabled
 *
 * Return: the next expiration time
 */
ktime_t tick_nohz_get_next_hrtimer(void)
{
        return __this_cpu_read(tick_cpu_device.evtdev)->next_event;
}

/**
 * tick_nohz_get_sleep_length - return the expected length of the current sleep
 * @delta_next: duration until the next event if the tick cannot be stopped
 *
 * Called from power state control code with interrupts disabled.
 *
 * The return value of this function and/or the value returned by it through the
 * @delta_next pointer can be negative which must be taken into account by its
 * callers.
 *
 * Return: the expected length of the current sleep
 */
ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
{
        struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        int cpu = smp_processor_id();
        /*
         * The idle entry time is expected to be a sufficient approximation of
         * the current time at this point.
         */
        ktime_t idle_entry = ts->idle_entrytime;
        ktime_t wakeup;

        WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));

        /* Distance to the currently programmed event, reported unconditionally. */
        *delta_next = ktime_sub(dev->next_event, idle_entry);

        if (can_stop_idle_tick(cpu, ts)) {
                wakeup = tick_nohz_next_event(ts, cpu);
                if (wakeup) {
                        /*
                         * If the next highres timer to expire is earlier than
                         * 'wakeup', the idle governor needs to know that.
                         */
                        u64 next_hrtimer = hrtimer_next_event_without(&ts->sched_timer);

                        wakeup = min_t(u64, wakeup, next_hrtimer);
                        return ktime_sub(wakeup, idle_entry);
                }
        }

        /* The tick cannot be stopped: the programmed event bounds the sleep. */
        return *delta_next;
}

/**
 * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value
 * for a particular CPU.
 * @cpu: target CPU number
 *
 * Called from the schedutil frequency scaling governor in scheduler context.
 *
 * Return: the current idle calls counter value for @cpu
 */
unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
{
        struct tick_sched *ts = tick_get_tick_sched(cpu);

        return ts->idle_calls;
}

/*
 * Record the idle exit time and, unless vtime accounting is enabled on this
 * CPU, account the jiffies that elapsed since the tick was stopped as idle.
 */
static void tick_nohz_account_idle_time(struct tick_sched *ts,
                                        ktime_t now)
{
        unsigned long slept;

        ts->idle_exittime = now;

        if (vtime_accounting_enabled_this_cpu())
                return;

        /*
         * We stopped the tick in idle. update_process_times() would miss the
         * time we slept, as it does only a 1 tick accounting.
         * Enforce that this is accounted to idle !
         */
        slept = jiffies - ts->idle_jiffies;

        /*
         * We might be one off. Do not randomly account a huge number of ticks!
         */
        if (!slept || slept >= LONG_MAX)
                return;

        account_idle_ticks(slept);
}

/*
 * Restart the tick of the current CPU if it is stopped and account the time
 * it was stopped as idle. Does nothing when the tick is running.
 */
void tick_nohz_idle_restart_tick(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        ktime_t now;

        if (!tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                return;

        /* One timestamp is shared by the restart and the accounting. */
        now = ktime_get();
        tick_nohz_restart_sched_tick(ts, now);
        tick_nohz_account_idle_time(ts, now);
}

/*
 * Idle-exit tick update: an ordinary CPU gets its tick restarted, a nohz_full
 * CPU gets the tick re-evaluated. Idle time is accounted in both cases.
 */
static void tick_nohz_idle_update_tick(struct tick_sched *ts, ktime_t now)
{
        if (!tick_nohz_full_cpu(smp_processor_id()))
                tick_nohz_restart_sched_tick(ts, now);
        else
                __tick_nohz_full_update_tick(ts, now);

        tick_nohz_account_idle_time(ts, now);
}

/**
 * tick_nohz_idle_exit - Update the tick upon idle task exit
 *
 * When the idle task exits, update the tick depending on the
 * following situations:
 *
 * 1) If the CPU is not in nohz_full mode (most cases), then
 *    restart the tick.
 *
 * 2) If the CPU is in nohz_full mode (corner case):
 *   2.1) If the tick can be kept stopped (no tick dependencies)
 *        then re-evaluate the next tick and try to keep it stopped
 *        as long as possible.
 *   2.2) If the tick has dependencies, restart the tick.
 *
 */
void tick_nohz_idle_exit(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
        bool idle_active, tick_stopped;
        ktime_t now;

        /* Everything below runs with interrupts off; re-enabled at the end. */
        local_irq_disable();

        /* Must pair with tick_nohz_idle_enter(); no cached expiry base may be left over. */
        WARN_ON_ONCE(!tick_sched_flag_test(ts, TS_FLAG_INIDLE));
        WARN_ON_ONCE(ts->timer_expires_base);

        /* Leave the idle state and sample both flags once, before acting on them. */
        tick_sched_flag_clear(ts, TS_FLAG_INIDLE);
        idle_active = tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE);
        tick_stopped = tick_sched_flag_test(ts, TS_FLAG_STOPPED);

        /* 'now' is only initialised - and only read - when one of the flags was set. */
        if (idle_active || tick_stopped)
                now = ktime_get();

        if (idle_active)
                tick_nohz_stop_idle(ts, now);

        if (tick_stopped)
                tick_nohz_idle_update_tick(ts, now);

        local_irq_enable();
}

/*
 * In low-resolution mode, the tick handler must be implemented directly
 * at the clockevent level. hrtimer can't be used instead, because its
 * infrastructure actually relies on the tick itself as a backend in
 * low-resolution mode (see hrtimer_run_queues()).
 */
static void tick_nohz_lowres_handler(struct clock_event_device *dev)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* Nothing is programmed until the handler asks for a restart. */
        dev->next_event = KTIME_MAX;

        if (unlikely(tick_nohz_handler(&ts->sched_timer) != HRTIMER_RESTART))
                return;

        /* Re-arm the clockevent for the expiry the handler left in sched_timer. */
        tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}

/*
 * Flag @ts as running in NOHZ mode, provided NOHZ is enabled at all. The
 * first activation also triggers timers_update_nohz().
 */
static inline void tick_nohz_activate(struct tick_sched *ts)
{
        if (tick_nohz_enabled) {
                tick_sched_flag_set(ts, TS_FLAG_NOHZ);

                /* One update is enough */
                if (!test_and_set_bit(0, &tick_nohz_active))
                        timers_update_nohz();
        }
}

/**
 * tick_nohz_switch_to_nohz - switch to NOHZ mode
 */
static void tick_nohz_switch_to_nohz(void)
{
        /*
         * Nothing to do when NOHZ is disabled, or when switching to oneshot
         * mode with the low-resolution handler returned non-zero.
         */
        if (!tick_nohz_enabled || tick_switch_to_oneshot(tick_nohz_lowres_handler))
                return;

        /*
         * Recycle the hrtimer in 'ts', so we can share the
         * highres code.
         */
        tick_setup_sched_timer(false);
}

/*
 * IRQ entry hook: if the tick is stopped or idle accounting is active, end
 * the idle accounting period and refresh jiffies as needed.
 */
static inline void tick_nohz_irq_enter(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        if (tick_sched_flag_test(ts, TS_FLAG_STOPPED | TS_FLAG_IDLE_ACTIVE)) {
                ktime_t now = ktime_get();

                if (tick_sched_flag_test(ts, TS_FLAG_IDLE_ACTIVE))
                        tick_nohz_stop_idle(ts, now);

                /*
                 * If all CPUs are idle we may need to update a stale jiffies
                 * value. Note nohz_full is a special case: a timekeeper is
                 * guaranteed to stay alive but it might be busy looping with
                 * interrupts disabled in some rare case (typically stop
                 * machine). So we must make sure we have a last resort.
                 */
                if (tick_sched_flag_test(ts, TS_FLAG_STOPPED))
                        tick_nohz_update_jiffies(now);
        }
}

#else

/* !CONFIG_NO_HZ_COMMON: the NOHZ hooks used below compile down to empty stubs. */
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_nohz_irq_enter(void) { }
static inline void tick_nohz_activate(struct tick_sched *ts) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * Called from irq_enter() to notify about the possible interruption of idle()
 */
void tick_irq_enter(void)
{
        tick_check_oneshot_broadcast_this_cpu();
        /* Empty stub when CONFIG_NO_HZ_COMMON is not set. */
        tick_nohz_irq_enter();
}

/*
 * Set from the "skew_tick" early parameter. A non-zero value makes
 * tick_setup_sched_timer() offset the per-CPU tick.
 */
static int sched_skew_tick;

/*
 * Early-param handler for "skew_tick": lets get_option() store the parsed
 * value in sched_skew_tick. The result of get_option() is not checked and
 * the handler always returns 0.
 */
static int __init skew_tick(char *str)
{
        get_option(&str, &sched_skew_tick);

        return 0;
}
early_param("skew_tick", skew_tick);

/**
 * tick_setup_sched_timer - setup the tick emulation timer
 * @hrtimer: whether to use the hrtimer or not
 */
void tick_setup_sched_timer(bool hrtimer)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* Emulate tick processing via per-CPU hrtimers: */
        hrtimer_setup(&ts->sched_timer, tick_nohz_handler, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

        /* Highres mode needs both the config option and the caller's request. */
        if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
                tick_sched_flag_set(ts, TS_FLAG_HIGHRES);

        /* Get the next period (per-CPU) */
        hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());

        /* Offset the tick to avert 'jiffies_lock' contention. */
        if (sched_skew_tick) {
                /* Spread the CPUs over half a tick period: (TICK_NSEC / 2 / nr_cpus) * cpu. */
                u64 offset = TICK_NSEC >> 1;
                do_div(offset, num_possible_cpus());
                offset *= smp_processor_id();
                hrtimer_add_expires_ns(&ts->sched_timer, offset);
        }

        /* Move the expiry forward in TICK_NSEC steps relative to the current time. */
        hrtimer_forward_now(&ts->sched_timer, TICK_NSEC);
        /* Highres: arm the hrtimer. Otherwise: program the clockevent with the computed expiry. */
        if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS) && hrtimer)
                hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
        else
                tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
        tick_nohz_activate(ts);
}

/*
 * Shut down the tick and make sure the CPU won't try to retake the timekeeping
 * duty before disabling IRQs in idle for the last time.
 */
void tick_sched_timer_dying(int cpu)
{
        struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
        ktime_t saved_idle_sleeptime;
        ktime_t saved_iowait_sleeptime;
        unsigned long saved_idle_calls;
        unsigned long saved_idle_sleeps;

        /* This must happen before hrtimers are migrated! */
        if (tick_sched_flag_test(ts, TS_FLAG_HIGHRES))
                hrtimer_cancel(&ts->sched_timer);

        /* Snapshot the statistics that have to survive the wipe below. */
        saved_idle_sleeptime = ts->idle_sleeptime;
        saved_iowait_sleeptime = ts->iowait_sleeptime;
        saved_idle_calls = ts->idle_calls;
        saved_idle_sleeps = ts->idle_sleeps;

        /* Reset the complete per-CPU state ... */
        memset(ts, 0, sizeof(*ts));

        /* ... and put the preserved counters back. */
        ts->idle_sleeptime = saved_idle_sleeptime;
        ts->iowait_sleeptime = saved_iowait_sleeptime;
        ts->idle_calls = saved_idle_calls;
        ts->idle_sleeps = saved_idle_sleeps;
}

/*
 * Async notification about clocksource changes
 */
void tick_clock_notify(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks);
}

/*
 * Async notification about clock event changes
 */
void tick_oneshot_notify(void)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        set_bit(0, &ts->check_clocks);
}

/*
 * Check if a change happened, which makes oneshot possible.
 *
 * Called cyclically from the hrtimer softirq (driven by the timer
 * softirq). 'allow_nohz' signals that we can switch into low-res NOHZ
 * mode, because high resolution timers are disabled (either compile
 * or runtime). Called with interrupts disabled.
 */
int tick_check_oneshot_change(int allow_nohz)
{
        struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

        /* No pending notification for this CPU: nothing changed. */
        if (!test_and_clear_bit(0, &ts->check_clocks))
                return 0;

        /*
         * Already in NOHZ mode, or oneshot operation is not possible because
         * timekeeping or the clock event device does not allow it.
         */
        if (tick_sched_flag_test(ts, TS_FLAG_NOHZ) ||
            !(timekeeping_valid_for_hres() && tick_is_oneshot_available()))
                return 0;

        /* Low-res NOHZ is permitted: do the switch here and report 0. */
        if (allow_nohz) {
                tick_nohz_switch_to_nohz();
                return 0;
        }

        /* Oneshot is possible but NOHZ switching is not allowed: report 1. */
        return 1;
}


































  462 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * RCU-based infrastructure for lightweight reader-writer locking
 *
 * Copyright (c) 2015, Red Hat, Inc.
 *
 * Author: Oleg Nesterov <oleg@redhat.com>
 */

#ifndef _LINUX_RCU_SYNC_H_
#define _LINUX_RCU_SYNC_H_

#include <linux/wait.h>
#include <linux/rcupdate.h>

/* Structure to mediate between updaters and fastpath-using readers.  */
struct rcu_sync {
        /* 0 means idle (GP_IDLE): rcu_sync_is_idle() then returns true. */
        int                        gp_state;
        /* NOTE(review): presumably counts active updaters - confirm against the implementation. */
        int                        gp_count;
        /* Wait queue; its waiters and wakers are not visible in this header. */
        wait_queue_head_t        gp_wait;

        /* RCU callback head; not set by __RCU_SYNC_INITIALIZER(). */
        struct rcu_head                cb_head;
};

/**
 * rcu_sync_is_idle() - Are readers permitted to use their fastpaths?
 * @rsp: Pointer to rcu_sync structure to use for synchronization
 *
 * Returns true if readers are permitted to use their fastpaths.  Must be
 * invoked within some flavor of RCU read-side critical section.
 */
static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
{
        /* Complain when called outside of any RCU read-side critical section. */
        RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
                         "suspicious rcu_sync_is_idle() usage");

        /* A gp_state of zero is GP_IDLE. */
        return READ_ONCE(rsp->gp_state) == 0;
}

/*
 * Out-of-line rcu_sync operations, defined outside this header. Each takes
 * the rcu_sync instance to operate on.
 * NOTE(review): judging by the names, init/dtor set up and tear down an
 * instance and enter/exit bracket an updater section - confirm against the
 * implementation.
 */
extern void rcu_sync_init(struct rcu_sync *);
extern void rcu_sync_enter(struct rcu_sync *);
extern void rcu_sync_exit(struct rcu_sync *);
extern void rcu_sync_dtor(struct rcu_sync *);

/*
 * Static initializer for a struct rcu_sync called @name: gp_state and
 * gp_count start at 0 and gp_wait is initialized as a wait queue head.
 * cb_head is not named here and is therefore zero-initialized.
 */
#define __RCU_SYNC_INITIALIZER(name) {                                        \
                .gp_state = 0,                                                \
                .gp_count = 0,                                                \
                .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait),        \
        }

/* Define a struct rcu_sync variable @name, initialized with __RCU_SYNC_INITIALIZER(). */
#define        DEFINE_RCU_SYNC(name)        \
        struct rcu_sync name = __RCU_SYNC_INITIALIZER(name)

#endif /* _LINUX_RCU_SYNC_H_ */












































































































































  163 



















   68 






  166 





  166 




  126 




  126 






    8 




















    8 








    8 



    8 

    8 












    8 

































    8 








    8 


    8 


    8 
    8 




























   68 












   68 







   68 
   68 




















  140 





  143 



  119 










  140 






  140 






  141 
  140 








  142 









   96 
   95 



    1 









  134 





  134 

  134 













































   24 



   24 









    6 




























































































  206 
  205 































   25 




   31 





   31 



   31 




   31 

  206 


































   31 







    2 

   31 





   31 

   31 


   31 












   25 







   31 





  206 










  206 






























  206 
   31 






   31 














   31 





   31 




  206 











  206 




  206 







   16 
  205 
















































   24 





   24 




























































   12 


    7 






   14 


   14 


















   24 


   24 
    4 





















    1 





   23 






  246 
















    2 
















  244 














  244 


























   24 






















  248 




































































    2 























  246 



















    2 
    2 
    2 
    2 






    2 


















   24 
    9 











   24 

   24 




   24 
   24 


   24 

















   24 

    1 

   24 




   24 

   24 
   24 








   24 
   24 



   69 
  114 














   34 
   34 
   24 

















   26 











































   72 
   10 


   20 











   73 




   59 

   57 












   15 






   20 



















   82 
   82 



   71 
   20 


   20 
    3 

   20 
   10 

















   82 









   72 
   20 


    5 
   74 







































   82 





   82 









   82 
    6 





















































   10 

   65 
   69 
    2 
    1 




   69 
    1 






    5 










   80 
   80 




    7 
   10 

   72 

   67 
   70 






   75 





   10 







    5 
    2 


























   81 










   18 


















   18 





































   71 











   10 












    5 












    2 






   83 
































   75 





   18 


   84 








    3 


    2 












   71 

   10 

    5 

    2 












   80 






   78 


   78 
    8 








   94 








   94 


    1 






    1 


    1 


    1 
    2 



   88 



    1 























    4 





   10 





   74 






   73 





    8 
    1 



    8 

    7 





   78 





















   75 





    5 





















   95 
    1 


   94 





















































































    1 




    2 


    2 
    1 
    1 








    1 


















    1 













    1 


    1 


























    5 




    5 
























    1 




    2 


    1 


   12 
    8 
    2 






    2 
    2 
    2 






    4 



    1 




    1 


    1 







    1 





    2 

    1 







   16 




   16 

















































































































































































































  155 
  186 





































   42 
   42 

   41 





































  142 





  135 


    1 





    5 





    8 





    3 




    8 
    8 



















  156 
  132 


  142 




























  119 

  121 




   21 
  131 



  135 



  135 
  135 




  121 



  121 









  135 

  117 
  126 



























  140 







  130 
   17 






























  129 








  117 



  114 

    1 





  124 





























































  139 









  140 


  129 







    7 












    7 





    7 














  130 
  119 









  140 














  140 



















































































































   50 






    7 


   48 







   49 
   50 















   50 





   47 

   11 

































































   12 






    3 


    9 
    6 

    3 





   12 
   12 
















   11 





    3 

    9 


















































































































































































  122 
  123 


  121 


    2 


  115 






    9 





    2 


    2 














  162 
  160 








    4 



  125 
  121 


    2 













    1 




























    2 



    1 





    2 




    2 









    2 








    2 


    2 

    1 









    1 
































    1 






    2 













    2 
    2 






    2 





    2 















    1 












    1 




    1 













    1 





    1 






    1 













   26 


    1 










    2 











   24 





   25 
























































































































































































   49 








   94 




   90 






    6 

    4 





   90 









   52 



   52 
   52 








    2 


















   69 









































   73 













    1 










    4 


















   72 







   71 


    1 





    6 





























   69 






























    1 



    1 














    1 














    1 

















    1 



























































  503 




















  509 






    2 


    2 







  503 

















    1 









    8 




    7 


  163 





















    1 














    1 


    1 









    1 














    1 









    1 








    2 

    3 






    1 


    1 











    2 

    4 










    1 



    1 







    2 








    1 














    1 



    1 

















  357 




















































    1 


    1 












    1 


    6 


  179 




  186 


    1 




  135 

   32 

   27 

    2 







   11 



   10 

    1 



















































   55 






    1 




    2 















    6 



   49 


   49 


   49 














   49 




  105 
   17 
























    1 



    1 








    1 




    2 



    7 





















   77 










    1 




    1 


    1 




    1 




    1 



    1 








    3 




    1 




    1 


    1 

    1 












    5 


    5 


    5 









   24 
    7 







    1 

    1 




    3 

















    1 


    6 




    2 









    1 
    1 








    9 





























    1 














    1 















  436 




    2 

  118 

   73 





    1 

   23 







    7 




   89 













   96 



   95 










    1 

    5 







    1 

   16 








    1 

    4 






    2 

   11 







    2 

   24 






    1 

   40 







    1 

    7 









    2 




   16 



    7 
    3 














    2 






    1 

    3 



   13 
    1 

























    2 



    8 



   50 






   33 


    3 


    1 














   87 















































































































  248 
    2 
















  246 













  339 




    1 




  248 


   66 


    1 










   24 

































































































































  246 




  246 











































   24 












































    5 
   12 






   24 


    5 










   32 







    4 
   71 










   59 
   59 




















   42 



   90 



   98 


   96 
   11 


    1 
   48 
   48 











   99 









   99 


   98 







































   44 
    7 


    2 

   43 











   46 









   46 


   46 





  102 




  102 

  102 


















   48 
   78 





  102 










   16 



   16 

   16 
    3 


    5 
   10 





























   11 






    4 






    5 



    4 
    4 





    5 















































































































































































































  265 







   24 
  246 
















  246 


   24 



    3 





  261 















































  161 




































  220 




  223 


































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine (KVM) Hypervisor
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/suspend.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "kvm_mm.h"
#include "vfio.h"

#include <trace/events/ipi.h>

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

#include <linux/kvm_dirty_ring.h>


/*
 * Worst case buffer size needed for holding an integer.  Assuming a 32-bit
 * int printed in decimal, the longest string is "-2147483648" (11 characters)
 * plus the terminating NUL.
 */
#define ITOA_MAX_LEN 12

/* Module metadata: author, one-line description and license. */
MODULE_AUTHOR("Qumranet");
MODULE_DESCRIPTION("Kernel-based Virtual Machine (KVM) Hypervisor");
MODULE_LICENSE("GPL");

/*
 * Halt-polling tunables.  Each one is exposed as a module parameter with mode
 * 0644 and exported (GPL-only) so that other modules can reference it.
 */

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default halves per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink = 2;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Allow direct access (from KVM or the CPU) without MMU notifier protection
 * to unpinned pages.
 *
 * Defaults to false (zero-initialized static storage).  The parameter is
 * registered with mode 0444, i.e. it is read-only once the module is loaded.
 */
static bool allow_unsafe_mappings;
module_param(allow_unsafe_mappings, bool, 0444);

/*
 * Ordering of locks:
 *
 *        kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

/*
 * Global mutex and global list head.
 * NOTE(review): presumably kvm_lock guards vm_list -- confirm against users.
 */
DEFINE_MUTEX(kvm_lock);
LIST_HEAD(vm_list);

/* Slab cache that vCPU structures are returned to in kvm_vcpu_destroy(). */
static struct kmem_cache *kvm_vcpu_cache;

/* Preempt notifier ops installed on every vCPU by kvm_vcpu_init(). */
static __read_mostly struct preempt_ops kvm_preempt_ops;
/* Per-CPU pointer to the loaded vCPU; set in vcpu_load(), cleared in vcpu_put(). */
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

/* Debugfs directory handle for KVM; populated elsewhere in this file. */
static struct dentry *kvm_debugfs_dir;

/* Forward declaration; the definition appears later in this file. */
static const struct file_operations stat_fops_per_vm;

/* Forward declaration of the vCPU ioctl handler defined later in this file. */
static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                                  unsigned long arg);
/* Expands to a designated initializer that installs @c as .compat_ioctl. */
#define KVM_COMPAT(c)        .compat_ioctl        = (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
/* Second line of defense: every compat ioctl fails with -EINVAL. */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                unsigned long arg) { return -EINVAL; }

/* First line of defense: opening from a compat task fails with -ENODEV. */
static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
        return is_compat_task() ? -ENODEV : 0;
}
/*
 * Without compat support the argument @c is ignored; the macro installs the
 * rejecting ioctl handler and also sets .open.
 */
#define KVM_COMPAT(c)        .compat_ioctl        = kvm_no_compat_ioctl,        \
                        .open                = kvm_no_compat_open
#endif
/* Forward declarations for helpers defined later in this file. */
static int kvm_enable_virtualization(void);
static void kvm_disable_virtualization(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

/* Event types accepted by kvm_uevent_notify_change(). */
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
/* NOTE(review): presumably total-created and currently-live VM counters -- confirm. */
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

/*
 * Per-CPU scratch cpumask used to collect the pCPUs to kick in
 * kvm_make_vcpus_request_mask() and kvm_make_all_cpus_request().
 */
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);

/*
 * Default (weak) no-op implementation; an architecture can override it by
 * providing a non-weak definition.  Called after guest memory has been
 * zapped, see kvm_flush_shadow_all() and the invalidate_range_start notifier.
 */
__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
{
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 *
 * Between get_cpu() and put_cpu() (i.e. with preemption disabled) this
 * records @vcpu as the current CPU's running vCPU, registers the vCPU's
 * preempt notifier and then invokes the arch load hook with the CPU number.
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
        int cpu = get_cpu();

        __this_cpu_write(kvm_running_vcpu, vcpu);
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);
        put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

/*
 * Counterpart of vcpu_load(): with preemption disabled, invokes the arch put
 * hook, unregisters the vCPU's preempt notifier and clears this CPU's
 * running-vCPU pointer, i.e. undoes vcpu_load() in reverse order.
 */
void vcpu_put(struct kvm_vcpu *vcpu)
{
        preempt_disable();
        kvm_arch_vcpu_put(vcpu);
        preempt_notifier_unregister(&vcpu->preempt_notifier);
        __this_cpu_write(kvm_running_vcpu, NULL);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/*
 * Decide whether delivering @req to @vcpu requires sending an IPI to the
 * pCPU the vCPU is running on.
 *
 * TODO: merge with kvm_arch_vcpu_should_kick
 */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
        int mode = kvm_vcpu_exiting_guest_mode(vcpu);

        /*
         * Without KVM_REQUEST_WAIT only a running VCPU needs to be kicked,
         * otherwise there is nothing to do.
         */
        if (!(req & KVM_REQUEST_WAIT))
                return mode == IN_GUEST_MODE;

        /*
         * The requester waits, so the VCPU must also reenable interrupts and
         * get out of READING_SHADOW_PAGE_TABLES mode: kick it unless it is
         * already outside the guest.
         */
        return mode != OUTSIDE_GUEST_MODE;
}

/*
 * Intentionally empty IPI callback handed to smp_call_function_many() by
 * kvm_kick_many_cpus(); the argument is unused.
 */
static void ack_kick(void *_completed)
{
}

/*
 * Send the (empty) kick IPI to every CPU in @cpus, optionally waiting for the
 * callbacks to complete.  Returns true if at least one CPU was targeted and
 * false if @cpus was empty, in which case no IPI is sent.
 */
static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
        bool kicked = !cpumask_empty(cpus);

        if (kicked)
                smp_call_function_many(cpus, ack_kick, NULL, wait);

        return kicked;
}

/*
 * Post @req on @vcpu and, if the vCPU has to be interrupted to notice it,
 * add the vCPU's pCPU to the kick mask @tmp.
 *
 * @current_cpu is the caller's CPU; it is never added to @tmp.
 */
static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
                                  struct cpumask *tmp, int current_cpu)
{
        int target;

        /* Requests flagged NO_ACTION are not recorded on the vCPU. */
        if (likely(!(req & KVM_REQUEST_NO_ACTION)))
                __kvm_make_request(req, vcpu);

        /* A successful wakeup means no IPI is needed. */
        if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
                return;

        /*
         * Note, the vCPU could get migrated to a different pCPU at any point
         * after kvm_request_needs_ipi(), which could result in sending an IPI
         * to the previous pCPU.  But, that's OK because the purpose of the IPI
         * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
         * satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
         * after this point is also OK, as the requirement is only that KVM wait
         * for vCPUs that were reading SPTEs _before_ any changes were
         * finalized. See kvm_vcpu_kick() for more details on handling requests.
         */
        if (!kvm_request_needs_ipi(vcpu, req))
                return;

        target = READ_ONCE(vcpu->cpu);
        if (target == -1 || target == current_cpu)
                return;

        __cpumask_set_cpu(target, tmp);
}

/*
 * Make request @req on every vCPU whose index is set in @vcpu_bitmap (indices
 * without a vCPU are skipped) and kick the pCPUs that need an IPI.
 *
 * Runs between get_cpu() and put_cpu() because it uses this CPU's
 * cpu_kick_mask as scratch space.  Returns true if any IPI was sent.
 */
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
                                 unsigned long *vcpu_bitmap)
{
        struct cpumask *mask;
        struct kvm_vcpu *vcpu;
        int idx, self_cpu;
        bool kicked;

        self_cpu = get_cpu();

        mask = this_cpu_cpumask_var_ptr(cpu_kick_mask);
        cpumask_clear(mask);

        for_each_set_bit(idx, vcpu_bitmap, KVM_MAX_VCPUS) {
                vcpu = kvm_get_vcpu(kvm, idx);
                if (vcpu)
                        kvm_make_vcpu_request(vcpu, req, mask, self_cpu);
        }

        kicked = kvm_kick_many_cpus(mask, !!(req & KVM_REQUEST_WAIT));
        put_cpu();

        return kicked;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
        struct kvm_vcpu *vcpu;
        struct cpumask *cpus;
        unsigned long i;
        bool called;
        int me;

        me = get_cpu();

        cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
        cpumask_clear(cpus);

        kvm_for_each_vcpu(i, vcpu, kvm)
                kvm_make_vcpu_request(vcpu, req, cpus, me);

        called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT));
        put_cpu();

        return called;
}
EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);

/*
 * Flush the TLBs of all vCPUs of @kvm.
 *
 * The arch hook is tried first; a zero return from it counts as a completed
 * flush.  Otherwise KVM_REQ_TLB_FLUSH is made on all vCPUs, and the flush is
 * counted only if that request resulted in at least one kick.
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
        kvm->stat.generic.remote_tlb_flush_requests++;

        /*
         * We want to publish modifications to the page tables before reading
         * mode. Pairs with a memory barrier in arch-specific code.
         * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
         * and smp_mb in walk_shadow_page_lockless_begin/end.
         * - powerpc: smp_mb in kvmppc_prepare_to_enter.
         *
         * There is already an smp_mb__after_atomic() before
         * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
         * barrier here.
         */
        if (kvm_arch_flush_remote_tlbs(kvm) &&
            !kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
                return;

        kvm->stat.generic.remote_tlb_flush++;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);

/*
 * Flush remote TLBs for @nr_pages guest pages starting at @gfn, using the
 * architecture's range-based flush when it succeeds (returns zero).
 */
void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
{
        /*
         * Fall back to flushing entire TLBs if the architecture's range-based
         * TLB invalidation is unsupported or can't be performed for whatever
         * reason, which the arch hook reports with a non-zero return.
         */
        if (kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
                kvm_flush_remote_tlbs(kvm);
}

/*
 * Flush remote TLBs for the whole guest frame range covered by @memslot
 * (base_gfn, npages).  The caller must hold kvm->slots_lock.
 */
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
                                   const struct kvm_memory_slot *memslot)
{
        /*
         * All current use cases for flushing the TLBs for a specific memslot
         * are related to dirty logging, and many do the TLB flush out of
         * mmu_lock. The interaction between the various operations on memslot
         * must be serialized by slots_lock to ensure the TLB flush from one
         * operation is observed by any other operation on the same memslot.
         */
        lockdep_assert_held(&kvm->slots_lock);
        kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}

/*
 * Ask the architecture to flush all shadow state for @kvm, then notify it
 * that guest memory has been reclaimed.
 */
static void kvm_flush_shadow_all(struct kvm *kvm)
{
        kvm_arch_flush_shadow_all(kvm);
        kvm_arch_guest_memory_reclaimed(kvm);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
/*
 * Allocate one object for @mc.
 *
 * The cache's gfp_zero flags are always OR-ed into @gfp_flags.  Objects come
 * from mc->kmem_cache when one is set; otherwise a full page is allocated
 * and, if mc->init_value is non-zero, filled with that 64-bit value.
 *
 * Returns the object, or NULL if the allocation failed.
 */
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
                                               gfp_t gfp_flags)
{
        gfp_t flags = gfp_flags | mc->gfp_zero;
        void *obj;

        if (mc->kmem_cache)
                return kmem_cache_alloc(mc->kmem_cache, flags);

        obj = (void *)__get_free_page(flags);
        if (!obj || !mc->init_value)
                return obj;

        memset64(obj, mc->init_value, PAGE_SIZE / sizeof(u64));
        return obj;
}

/*
 * Ensure @mc holds at least @min objects, filling it up to @capacity.
 *
 * The backing pointer array is allocated on first use with room for
 * @capacity entries; later calls must pass the same @capacity.
 *
 * Returns 0 if the cache holds at least @min objects on exit, -ENOMEM if an
 * allocation failed before @min was reached, and -EIO on misuse (zero
 * capacity on first use, an init_value combined with a kmem_cache or
 * gfp_zero, or a capacity that differs from the one used earlier).
 */
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
        gfp_t gfp = GFP_KERNEL_ACCOUNT;
        void *obj;

        /* Already enough objects cached, nothing to do. */
        if (mc->nobjs >= min)
                return 0;

        /* A cache-specific GFP mask, when set, replaces the default. */
        if (mc->gfp_custom)
                gfp = mc->gfp_custom;

        if (unlikely(!mc->objects)) {
                if (WARN_ON_ONCE(!capacity))
                        return -EIO;

                /*
                 * Custom init values can be used only for page allocations,
                 * and obviously conflict with __GFP_ZERO.
                 */
                if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
                        return -EIO;

                mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
                if (!mc->objects)
                        return -ENOMEM;

                mc->capacity = capacity;
        }

        /* It is illegal to request a different capacity across topups. */
        if (WARN_ON_ONCE(mc->capacity != capacity))
                return -EIO;

        while (mc->nobjs < mc->capacity) {
                obj = mmu_memory_cache_alloc_obj(mc, gfp);
                if (obj) {
                        mc->objects[mc->nobjs++] = obj;
                        continue;
                }

                /* A partial fill is fine as long as @min is satisfied. */
                if (mc->nobjs >= min)
                        return 0;
                return -ENOMEM;
        }

        return 0;
}

/*
 * Top up @mc to at least @min objects using the architecture's default
 * capacity, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE.  Return values are those of
 * __kvm_mmu_topup_memory_cache().
 */
int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
        return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}

/* Return the number of objects currently held in @mc. */
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
        return mc->nobjs;
}

/*
 * Release every object still held in @mc, then free the pointer array and
 * reset the cache so that a later topup allocates it again.
 *
 * Objects are returned to mc->kmem_cache when one is set, and freed as whole
 * pages otherwise, mirroring mmu_memory_cache_alloc_obj().
 */
void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
        void *obj;

        while (mc->nobjs) {
                obj = mc->objects[--mc->nobjs];

                if (mc->kmem_cache)
                        kmem_cache_free(mc->kmem_cache, obj);
                else
                        free_page((unsigned long)obj);
        }

        kvfree(mc->objects);

        mc->capacity = 0;
        mc->objects = NULL;
}

/*
 * Pop one object from @mc.
 *
 * An empty cache triggers a warning and falls back to an atomic, accounted
 * allocation.  The function never returns NULL: a failed fallback allocation
 * hits BUG_ON().
 */
void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
        void *obj;

        if (!WARN_ON(!mc->nobjs))
                obj = mc->objects[--mc->nobjs];
        else
                obj = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);

        BUG_ON(!obj);
        return obj;
}
#endif

/*
 * Initialize the architecture-independent fields of a new vCPU.
 *
 * @vcpu: vCPU structure to initialize
 * @kvm:  VM that owns the vCPU
 * @id:   vCPU id, stored in vcpu->vcpu_id
 */
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
        mutex_init(&vcpu->mutex);
        /* -1 means "not on any pCPU", see kvm_make_vcpu_request(). */
        vcpu->cpu = -1;
        vcpu->kvm = kvm;
        vcpu->vcpu_id = id;
        vcpu->pid = NULL;
        rwlock_init(&vcpu->pid_lock);
#ifndef __KVM_HAVE_ARCH_WQP
        rcuwait_init(&vcpu->wait);
#endif
        kvm_async_pf_vcpu_init(vcpu);

        kvm_vcpu_set_in_spin_loop(vcpu, false);
        kvm_vcpu_set_dy_eligible(vcpu, false);
        vcpu->preempted = false;
        vcpu->ready = false;
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
        vcpu->last_used_slot = NULL;

        /*
         * Fill the stats id string for the vcpu: "kvm-<pid of the creating
         * task>/vcpu-<id>".
         * NOTE(review): @id is unsigned but is formatted with %d.
         */
        snprintf(vcpu->stats_id, sizeof(vcpu->stats_id), "kvm-%d/vcpu-%d",
                 task_pid_nr(current), id);
}

/*
 * Tear down and free @vcpu: arch state first, then the dirty ring, the pid
 * reference, the run page and finally the vCPU structure itself.
 */
static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        kvm_arch_vcpu_destroy(vcpu);
        kvm_dirty_ring_free(&vcpu->dirty_ring);

        /*
         * No need for rcu_read_lock as VCPU_RUN is the only place that changes
         * the vcpu->pid pointer, and at destruction time all file descriptors
         * are already gone.
         */
        put_pid(vcpu->pid);

        free_page((unsigned long)vcpu->run);
        /* @vcpu must not be touched after this point. */
        kmem_cache_free(kvm_vcpu_cache, vcpu);
}

/*
 * Destroy every vCPU of @kvm, erase each one from kvm->vcpu_array and reset
 * the online vCPU count to zero.
 */
void kvm_destroy_vcpus(struct kvm *kvm)
{
        unsigned long i;
        struct kvm_vcpu *vcpu;

        kvm_for_each_vcpu(i, vcpu, kvm) {
                kvm_vcpu_destroy(vcpu);
                xa_erase(&kvm->vcpu_array, i);

                /*
                 * Assert that the vCPU isn't visible in any way, to ensure KVM
                 * doesn't trigger a use-after-free if destroying vCPUs results
                 * in VM-wide request, e.g. to flush remote TLBs when tearing
                 * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires.
                 */
                WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i));
        }

        atomic_set(&kvm->online_vcpus, 0);
}
EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);

#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
/* Map an MMU notifier back to the struct kvm that embeds it. */
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
        return container_of(mn, struct kvm, mmu_notifier);
}

/*
 * Per-memslot handler invoked by kvm_handle_hva_range() for each gfn range;
 * the boolean results are OR-ed together by the caller.
 */
typedef bool (*gfn_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range);

/*
 * Callback invoked by kvm_handle_hva_range() right after mmu_lock has been
 * taken for the first intersecting memslot.
 */
typedef void (*on_lock_fn_t)(struct kvm *kvm);

/* Describes one request to kvm_handle_hva_range(). */
struct kvm_mmu_notifier_range {
        /*
         * 64-bit addresses, as KVM notifiers can operate on host virtual
         * addresses (unsigned long) and guest physical addresses (64-bit).
         */
        /* Range is [start, end): end is exclusive and must be > start. */
        u64 start;
        u64 end;
        /* Opaque argument copied into each struct kvm_gfn_range. */
        union kvm_mmu_notifier_arg arg;
        /* Per-memslot handler; may be kvm_null_fn only if on_lock is set. */
        gfn_handler_t handler;
        /* Called once after mmu_lock is taken; must be kvm_null_fn if lockless. */
        on_lock_fn_t on_lock;
        /* Flush remote TLBs if any handler invocation returned true. */
        bool flush_on_ret;
        /* Passed through to the handler via struct kvm_gfn_range. */
        bool may_block;
        /* Walk memslots without taking mmu_lock. */
        bool lockless;
};

/*
 * The inner-most helper returns a tuple containing the return value from the
 * arch- and action-specific handler, plus a flag indicating whether or not at
 * least one memslot was found, i.e. if the handler found guest memory.
 *
 * Note, most notifiers are averse to booleans, so even though KVM tracks the
 * return from arch code as a bool, outer helpers will cast it to an int. :-(
 */
typedef struct kvm_mmu_notifier_return {
        /* OR of the values returned by the handler across all memslots. */
        bool ret;
        /* True if at least one memslot intersected the requested range. */
        bool found_memslot;
} kvm_mn_ret_t;

/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)

/*
 * Iterate over each memslot intersecting [start, last] (inclusive) range.
 *
 * @node:  struct interval_tree_node * cursor, assigned on every iteration
 * @slots: struct kvm_memslots * whose hva_tree is walked
 * @start: first HVA of the range
 * @last:  last HVA of the range (inclusive)
 *
 * Note that @node, @start and @last are evaluated more than once, so they
 * must not have side effects.
 */
#define kvm_for_each_memslot_in_hva_range(node, slots, start, last)             \
        for ((node) = interval_tree_iter_first(&(slots)->hva_tree, start, last); \
             (node);                                                             \
             (node) = interval_tree_iter_next(node, start, last))

/*
 * Invoke range->handler on every memslot, in every address space, that
 * intersects the host virtual address range [range->start, range->end).
 *
 * Unless range->lockless is set, mmu_lock is taken lazily when the first
 * intersecting memslot is found, range->on_lock (if any) is called once right
 * after taking it, and the lock is dropped before returning.  If no memslot
 * intersects the range, neither the lock nor any callback is touched.
 *
 * The memslot walk runs inside an SRCU read-side critical section on
 * kvm->srcu.
 *
 * Returns the OR of the handler results plus whether any memslot was found.
 * Invalid requests (empty range, no handler and no on_lock, or on_lock
 * combined with lockless) warn once and return {false, false}.
 */
static __always_inline kvm_mn_ret_t kvm_handle_hva_range(struct kvm *kvm,
                                                         const struct kvm_mmu_notifier_range *range)
{
        struct kvm_mmu_notifier_return r = {
                .ret = false,
                .found_memslot = false,
        };
        struct kvm_gfn_range gfn_range;
        struct kvm_memory_slot *slot;
        struct kvm_memslots *slots;
        int i, idx;

        if (WARN_ON_ONCE(range->end <= range->start))
                return r;

        /* A null handler is allowed if and only if on_lock() is provided. */
        if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
                         IS_KVM_NULL_FN(range->handler)))
                return r;

        /* on_lock will never be called for lockless walks */
        if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
                return r;

        idx = srcu_read_lock(&kvm->srcu);

        for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                struct interval_tree_node *node;

                slots = __kvm_memslots(kvm, i);
                /* The iterator takes an inclusive last address, hence end - 1. */
                kvm_for_each_memslot_in_hva_range(node, slots,
                                                  range->start, range->end - 1) {
                        unsigned long hva_start, hva_end;

                        /* Clamp the requested range to this memslot's HVA span. */
                        slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
                        hva_start = max_t(unsigned long, range->start, slot->userspace_addr);
                        hva_end = min_t(unsigned long, range->end,
                                        slot->userspace_addr + (slot->npages << PAGE_SHIFT));

                        /*
                         * To optimize for the likely case where the address
                         * range is covered by zero or one memslots, don't
                         * bother making these conditional (to avoid writes on
                         * the second or later invocation of the handler).
                         */
                        gfn_range.arg = range->arg;
                        gfn_range.may_block = range->may_block;
                        /*
                         * HVA-based notifications aren't relevant to private
                         * mappings as they don't have a userspace mapping.
                         */
                        gfn_range.attr_filter = KVM_FILTER_SHARED;

                        /*
                         * {gfn(page) | page intersects with [hva_start, hva_end)} =
                         * {gfn_start, gfn_start+1, ..., gfn_end-1}.
                         */
                        gfn_range.start = hva_to_gfn_memslot(hva_start, slot);
                        gfn_range.end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, slot);
                        gfn_range.slot = slot;
                        gfn_range.lockless = range->lockless;

                        /* First hit: take mmu_lock (if needed) and run on_lock once. */
                        if (!r.found_memslot) {
                                r.found_memslot = true;
                                if (!range->lockless) {
                                        KVM_MMU_LOCK(kvm);
                                        if (!IS_KVM_NULL_FN(range->on_lock))
                                                range->on_lock(kvm);

                                        /* Only on_lock was wanted, stop walking. */
                                        if (IS_KVM_NULL_FN(range->handler))
                                                goto mmu_unlock;
                                }
                        }
                        r.ret |= range->handler(kvm, &gfn_range);
                }
        }

        /* For non-lockless walks that found a memslot, this runs under mmu_lock. */
        if (range->flush_on_ret && r.ret)
                kvm_flush_remote_tlbs(kvm);

mmu_unlock:
        /* The lock was only taken if a memslot was found on a locked walk. */
        if (r.found_memslot && !range->lockless)
                KVM_MMU_UNLOCK(kvm);

        srcu_read_unlock(&kvm->srcu, idx);

        return r;
}

/*
 * Common helper for the aging notifiers: run @handler over the memslots
 * intersecting the HVA range [@start, @end), optionally flushing remote TLBs
 * if the handler reported true.
 *
 * The walk never blocks, has no on_lock callback, and is lockless when
 * CONFIG_KVM_MMU_LOCKLESS_AGING is enabled.  Returns the OR-ed handler
 * result as an int.
 */
static __always_inline int kvm_age_hva_range(struct mmu_notifier *mn,
                                                unsigned long start,
                                                unsigned long end,
                                                gfn_handler_t handler,
                                                bool flush_on_ret)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        const struct kvm_mmu_notifier_range range = {
                .start                = start,
                .end                = end,
                .handler        = handler,
                .on_lock        = (void *)kvm_null_fn,
                .flush_on_ret        = flush_on_ret,
                .may_block        = false,
                .lockless        = IS_ENABLED(CONFIG_KVM_MMU_LOCKLESS_AGING),
        };

        return kvm_handle_hva_range(kvm, &range).ret;
}

/* Same as kvm_age_hva_range(), but never flushes remote TLBs. */
static __always_inline int kvm_age_hva_range_no_flush(struct mmu_notifier *mn,
                                                      unsigned long start,
                                                      unsigned long end,
                                                      gfn_handler_t handler)
{
        return kvm_age_hva_range(mn, start, end, handler, false);
}

/*
 * Mark the start of an MMU invalidation.  Must be called with mmu_lock held
 * for write.
 *
 * Bumps the in-progress count and, if this is the only invalidation in
 * flight, resets the tracked gfn range to "empty" (INVALID_GPA) so that
 * kvm_mmu_invalidate_range_add() starts from scratch.
 */
void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
        lockdep_assert_held_write(&kvm->mmu_lock);
        /*
         * The count increase must become visible at unlock time as no
         * spte can be established without taking the mmu_lock and
         * count is also read inside the mmu_lock critical section.
         */
        kvm->mmu_invalidate_in_progress++;

        if (likely(kvm->mmu_invalidate_in_progress == 1)) {
                kvm->mmu_invalidate_range_start = INVALID_GPA;
                kvm->mmu_invalidate_range_end = INVALID_GPA;
        }
}

/*
 * Record that the gfn range [@start, @end) is being invalidated.  Must be
 * called with mmu_lock held for write and with an invalidation in progress,
 * i.e. after kvm_mmu_invalidate_begin().
 */
void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        WARN_ON_ONCE(!kvm->mmu_invalidate_in_progress);

        /* Nothing is tracked yet: the new range becomes the tracked range. */
        if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
                kvm->mmu_invalidate_range_start = start;
                kvm->mmu_invalidate_range_end = end;
                return;
        }

        /*
         * Fully tracking multiple concurrent ranges has diminishing
         * returns. Keep things simple and just find the minimal range
         * which includes the current and new ranges. As there won't be
         * enough information to subtract a range after its invalidate
         * completes, any ranges invalidated concurrently will
         * accumulate and persist until all outstanding invalidates
         * complete.
         */
        kvm->mmu_invalidate_range_start =
                min(kvm->mmu_invalidate_range_start, start);
        kvm->mmu_invalidate_range_end =
                max(kvm->mmu_invalidate_range_end, end);
}

/*
 * Handler used by the invalidate_range_start notifier: add @range to the
 * tracked invalidation range, then unmap it.  Returns the result of
 * kvm_unmap_gfn_range().
 */
bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
        return kvm_unmap_gfn_range(kvm, range);
}

/*
 * MMU notifier callback: the primary MMU is about to invalidate the host
 * virtual range [range->start, range->end).
 *
 * Bumps mn_active_invalidate_count, invalidates the pfn caches, then unmaps
 * the affected gfn ranges under mmu_lock (flushing remote TLBs if anything
 * was unmapped).  The walk is not lockless: .lockless is left out of the
 * initializer and is therefore false.  Always returns 0.
 */
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        const struct kvm_mmu_notifier_range hva_range = {
                .start                = range->start,
                .end                = range->end,
                .handler        = kvm_mmu_unmap_gfn_range,
                .on_lock        = kvm_mmu_invalidate_begin,
                .flush_on_ret        = true,
                .may_block        = mmu_notifier_range_blockable(range),
        };

        trace_kvm_unmap_hva_range(range->start, range->end);

        /*
         * Prevent memslot modification between range_start() and range_end()
         * so that conditionally locking provides the same result in both
         * functions.  Without that guarantee, the mmu_invalidate_in_progress
         * adjustments will be imbalanced.
         *
         * Pairs with the decrement in range_end().
         */
        spin_lock(&kvm->mn_invalidate_lock);
        kvm->mn_active_invalidate_count++;
        spin_unlock(&kvm->mn_invalidate_lock);

        /*
         * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
         * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
         * each cache's lock.  There are relatively few caches in existence at
         * any given time, and the caches themselves can check for hva overlap,
         * i.e. don't need to rely on memslot overlap checks for performance.
         * Because this runs without holding mmu_lock, the pfn caches must use
         * mn_active_invalidate_count (see above) instead of
         * mmu_invalidate_in_progress.
         */
        gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);

        /*
         * If one or more memslots were found and thus zapped, notify arch code
         * that guest memory has been reclaimed.  This needs to be done *after*
         * dropping mmu_lock, as x86's reclaim path is slooooow.
         */
        if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
                kvm_arch_guest_memory_reclaimed(kvm);

        return 0;
}

/*
 * Mark the end of an MMU invalidation.  Must be called with mmu_lock held
 * for write.  Bumps the invalidation sequence number and then drops the
 * in-progress count, in that order (see the barrier comments below).
 */
void kvm_mmu_invalidate_end(struct kvm *kvm)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        /*
         * This sequence increase will notify the kvm page fault that
         * the page that is going to be mapped in the spte could have
         * been freed.
         */
        kvm->mmu_invalidate_seq++;
        smp_wmb();
        /*
         * The above sequence increase must be visible before the
         * below count decrease, which is ensured by the smp_wmb above
         * in conjunction with the smp_rmb in mmu_invalidate_retry().
         */
        kvm->mmu_invalidate_in_progress--;
        /* A negative count means begin()/end() calls were unbalanced. */
        KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);

        /*
         * Assert that at least one range was added between start() and end().
         * Not adding a range isn't fatal, but it is a KVM bug.
         */
        WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}

/*
 * MMU notifier callback: the primary MMU has finished invalidating
 * [range->start, range->end).
 *
 * Runs kvm_mmu_invalidate_end() under mmu_lock if the range intersects any
 * memslot (there is no per-memslot handler), then drops
 * mn_active_invalidate_count and wakes the waiter on
 * mn_memslots_update_rcuwait once the count reaches zero.
 */
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
                                        const struct mmu_notifier_range *range)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        const struct kvm_mmu_notifier_range hva_range = {
                .start                = range->start,
                .end                = range->end,
                .handler        = (void *)kvm_null_fn,
                .on_lock        = kvm_mmu_invalidate_end,
                .flush_on_ret        = false,
                .may_block        = mmu_notifier_range_blockable(range),
        };
        bool wake;

        kvm_handle_hva_range(kvm, &hva_range);

        /* Pairs with the increment in range_start(). */
        spin_lock(&kvm->mn_invalidate_lock);
        /* Warn, and skip the decrement, if the count is already zero. */
        if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
                --kvm->mn_active_invalidate_count;
        wake = !kvm->mn_active_invalidate_count;
        spin_unlock(&kvm->mn_invalidate_lock);

        /*
         * There can only be one waiter, since the wait happens under
         * slots_lock.
         */
        if (wake)
                rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
}

/*
 * MMU notifier callback: age the HVA range [@start, @end) with kvm_age_gfn.
 * Remote TLBs are flushed on a true result unless
 * CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG is enabled.  @mm is unused.
 */
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
                                              struct mm_struct *mm,
                                              unsigned long start,
                                              unsigned long end)
{
        trace_kvm_age_hva(start, end);

        return kvm_age_hva_range(mn, start, end, kvm_age_gfn,
                                 !IS_ENABLED(CONFIG_KVM_ELIDE_TLB_FLUSH_IF_YOUNG));
}

/*
 * MMU notifier callback: age the HVA range [@start, @end) with kvm_age_gfn
 * without ever flushing remote TLBs.  @mm is unused.
 */
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end)
{
        trace_kvm_age_hva(start, end);

        /*
         * Even though we do not flush TLB, this will still adversely
         * affect performance on pre-Haswell Intel EPT, where there is
         * no EPT Access Bit to clear so that we have to tear down EPT
         * tables instead. If we find this unacceptable, we can always
         * add a parameter to kvm_age_hva so that it effectively doesn't
         * do anything on clear_young.
         *
         * Also note that currently we never issue secondary TLB flushes
         * from clear_young, leaving this job up to the regular system
         * cadence. If we find this inaccurate, we might come up with a
         * more sophisticated heuristic later.
         */
        return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}

/*
 * MMU notifier callback: run kvm_test_age_gfn on the single host virtual
 * address @address, expressed as the one-byte range [@address, @address + 1).
 * No TLB flush is performed.  @mm is unused.
 */
static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
                                       struct mm_struct *mm,
                                       unsigned long address)
{
        trace_kvm_test_age_hva(address);

        return kvm_age_hva_range_no_flush(mn, address, address + 1,
                                          kvm_test_age_gfn);
}

/*
 * MMU notifier release callback: flush all shadow state for the VM inside an
 * SRCU read-side critical section on kvm->srcu.  @mm is unused.
 */
static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
                                     struct mm_struct *mm)
{
        struct kvm *kvm = mmu_notifier_to_kvm(mn);
        int idx;

        idx = srcu_read_lock(&kvm->srcu);
        kvm_flush_shadow_all(kvm);
        srcu_read_unlock(&kvm->srcu, idx);
}

/* MMU notifier callbacks installed on each VM by kvm_init_mmu_notifier(). */
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
        .invalidate_range_start        = kvm_mmu_notifier_invalidate_range_start,
        .invalidate_range_end        = kvm_mmu_notifier_invalidate_range_end,
        .clear_flush_young        = kvm_mmu_notifier_clear_flush_young,
        .clear_young                = kvm_mmu_notifier_clear_young,
        .test_young                = kvm_mmu_notifier_test_young,
        .release                = kvm_mmu_notifier_release,
};

/*
 * Install KVM's notifier ops on @kvm and register the notifier with the
 * calling task's mm.  Returns the result of mmu_notifier_register().
 */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
        return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !CONFIG_KVM_GENERIC_MMU_NOTIFIER */

/* Stub for builds without the generic MMU notifier: always succeeds. */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
        return 0;
}

#endif /* CONFIG_KVM_GENERIC_MMU_NOTIFIER */

#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
/*
 * PM notifier callback installed by kvm_init_pm_notifier().
 *
 * @bl is the notifier block embedded in struct kvm as ->pm_notifier; the
 * owning VM is recovered from it and @state is forwarded to
 * kvm_arch_pm_notifier(), whose return value is passed back to the caller.
 * The third argument is ignored.
 */
static int kvm_pm_notifier_call(struct notifier_block *bl,
                                unsigned long state,
                                void *unused)
{
        return kvm_arch_pm_notifier(container_of(bl, struct kvm, pm_notifier),
                                    state);
}

static void kvm_init_pm_notifier(struct kvm *kvm)
{
        kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
        /* Suspend KVM before we suspend ftrace, RCU, etc. */
        kvm->pm_notifier.priority = INT_MAX;
        register_pm_notifier(&kvm->pm_notifier);
}

/* Unregister the PM notifier that kvm_init_pm_notifier() registered. */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
        struct notifier_block *nb = &kvm->pm_notifier;

        unregister_pm_notifier(nb);
}
#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
/* Stub for !CONFIG_HAVE_KVM_PM_NOTIFIER: there is no PM notifier to set up. */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}

/* Stub for !CONFIG_HAVE_KVM_PM_NOTIFIER: there is no PM notifier to remove. */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */

/*
 * Free @memslot's dirty bitmap, if it has one, and clear the pointer so the
 * slot no longer references the freed memory.
 */
static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
        if (memslot->dirty_bitmap) {
                vfree(memslot->dirty_bitmap);
                memslot->dirty_bitmap = NULL;
        }
}

/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
        if (slot->flags & KVM_MEM_GUEST_MEMFD)
                kvm_gmem_unbind(slot);

        kvm_destroy_dirty_bitmap(slot);

        kvm_arch_free_memslot(kvm, slot);

        kfree(slot);
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
        struct hlist_node *idnode;
        struct kvm_memory_slot *memslot;
        int bkt;

        /*
         * The same memslot objects live in both active and inactive sets,
         * arbitrarily free using index '1' so the second invocation of this
         * function isn't operating over a structure with dangling pointers
         * (even though this function isn't actually touching them).
         */
        if (!slots->node_idx)
                return;

        hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[1])
                kvm_free_memslot(kvm, memslot);
}

/*
 * Pick the debugfs file mode for a stat descriptor: instantaneous stats are
 * read-only (0444), every other stat type (cumulative, peak, or anything
 * unrecognized) is also owner-writable (0644).
 */
static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
{
        if ((pdesc->desc.flags & KVM_STATS_TYPE_MASK) == KVM_STATS_TYPE_INSTANT)
                return 0444;

        return 0644;
}


/*
 * Remove the VM's debugfs directory and free the per-stat bookkeeping that
 * kvm_create_vm_debugfs() allocated.  Does nothing if the directory was never
 * created (debugfs_dentry still holds an error pointer).
 */
static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
        int nr_entries, i;

        if (IS_ERR(kvm->debugfs_dentry))
                return;

        debugfs_remove_recursive(kvm->debugfs_dentry);

        if (!kvm->debugfs_stat_data)
                return;

        /* One entry per VM stat descriptor plus one per vCPU stat descriptor. */
        nr_entries = kvm_vm_stats_header.num_desc +
                     kvm_vcpu_stats_header.num_desc;

        for (i = 0; i < nr_entries; i++)
                kfree(kvm->debugfs_stat_data[i]);

        kfree(kvm->debugfs_stat_data);
}

/*
 * Create the per-VM debugfs directory, named "<pid>-<fdname>" under
 * kvm_debugfs_dir, and populate it with one file per VM stat descriptor and
 * one per vCPU stat descriptor, followed by any arch-specific entries.
 *
 * Returns 0 on success.  0 is also returned, without creating anything, when
 * debugfs is not initialized, when a directory of the same name already
 * exists, or when the directory cannot be created.  Returns -ENOMEM if an
 * allocation for the stat bookkeeping fails; in that case everything created
 * so far is torn down via kvm_destroy_vm_debugfs().
 */
static int kvm_create_vm_debugfs(struct kvm *kvm, const char *fdname)
{
        static DEFINE_MUTEX(kvm_debugfs_lock);
        struct dentry *dent;
        char dir_name[ITOA_MAX_LEN * 2];
        struct kvm_stat_data *stat_data;
        const struct _kvm_stats_desc *pdesc;
        int i, ret = -ENOMEM;
        int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
                                      kvm_vcpu_stats_header.num_desc;

        if (!debugfs_initialized())
                return 0;

        snprintf(dir_name, sizeof(dir_name), "%d-%s", task_pid_nr(current), fdname);

        /*
         * Hold the function-local mutex across the lookup and the create so
         * that, with respect to other callers of this function, checking for
         * a duplicate and creating the directory happen as one step.
         */
        mutex_lock(&kvm_debugfs_lock);
        dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
        if (dent) {
                pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
                dput(dent);
                mutex_unlock(&kvm_debugfs_lock);
                return 0;
        }
        dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
        mutex_unlock(&kvm_debugfs_lock);
        if (IS_ERR(dent))
                return 0;

        kvm->debugfs_dentry = dent;

        /* Zeroed array, so the error path can kfree() unfilled entries. */
        kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
                                         sizeof(*kvm->debugfs_stat_data),
                                         GFP_KERNEL_ACCOUNT);
        if (!kvm->debugfs_stat_data)
                goto out_err;

        /* VM-scoped stats occupy the first num_desc entries of the array. */
        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
                pdesc = &kvm_vm_stats_desc[i];
                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                if (!stat_data)
                        goto out_err;

                stat_data->kvm = kvm;
                stat_data->desc = pdesc;
                stat_data->kind = KVM_STAT_VM;
                kvm->debugfs_stat_data[i] = stat_data;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);
        }

        /* vCPU-scoped stats are stored right after the VM-scoped ones. */
        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
                pdesc = &kvm_vcpu_stats_desc[i];
                stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
                if (!stat_data)
                        goto out_err;

                stat_data->kvm = kvm;
                stat_data->desc = pdesc;
                stat_data->kind = KVM_STAT_VCPU;
                kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                    kvm->debugfs_dentry, stat_data,
                                    &stat_fops_per_vm);
        }

        kvm_arch_create_vm_debugfs(kvm);
        return 0;
out_err:
        /* Removes the directory and frees whatever stat data was allocated. */
        kvm_destroy_vm_debugfs(kvm);
        return ret;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 *
 * This is the default, empty implementation; it is declared __weak so that an
 * architecture can supply its own definition.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

/*
 * Called after the per-VM debugfs directory has been created.  At that point
 * kvm->debugfs_dentry is already set up, so arch-specific debugfs entries can
 * be created under it.  Cleanup is done automatically by the recursive removal
 * in kvm_destroy_vm_debugfs(), so a per-arch destroy interface is not needed.
 *
 * This is the default, empty implementation; it is declared __weak so that an
 * architecture can supply its own definition.
 */
void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
}

/*
 * Allocate and initialize a new VM on behalf of the calling task.
 *
 * @type is passed through to kvm_arch_init_vm(); @fdname is used to name the
 * VM's debugfs directory.
 *
 * On success the returned VM holds a reference on current->mm (mmgrab()), has
 * users_count set to 1, and has been added to vm_list.  On failure every step
 * completed so far is unwound in reverse order and an ERR_PTR() carrying the
 * error code is returned.
 */
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
        struct kvm *kvm = kvm_arch_alloc_vm();
        struct kvm_memslots *slots;
        int r, i, j;

        if (!kvm)
                return ERR_PTR(-ENOMEM);

        KVM_MMU_LOCK_INIT(kvm);
        /* Paired with mmdrop() in kvm_destroy_vm() or in the error path below. */
        mmgrab(current->mm);
        kvm->mm = current->mm;
        kvm_eventfd_init(kvm);
        mutex_init(&kvm->lock);
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
        mutex_init(&kvm->slots_arch_lock);
        spin_lock_init(&kvm->mn_invalidate_lock);
        rcuwait_init(&kvm->mn_memslots_update_rcuwait);
        xa_init(&kvm->vcpu_array);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        xa_init(&kvm->mem_attr_array);
#endif

        INIT_LIST_HEAD(&kvm->gpc_list);
        spin_lock_init(&kvm->gpc_lock);

        INIT_LIST_HEAD(&kvm->devices);
        kvm->max_vcpus = KVM_MAX_VCPUS;

        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

        /*
         * Force subsequent debugfs file creations to fail if the VM directory
         * is not created (by kvm_create_vm_debugfs()).
         */
        kvm->debugfs_dentry = ERR_PTR(-ENOENT);

        snprintf(kvm->stats_id, sizeof(kvm->stats_id), "kvm-%d",
                 task_pid_nr(current));

        r = -ENOMEM;
        if (init_srcu_struct(&kvm->srcu))
                goto out_err_no_srcu;
        if (init_srcu_struct(&kvm->irq_srcu))
                goto out_err_no_irq_srcu;

        r = kvm_init_irq_routing(kvm);
        if (r)
                goto out_err_no_irq_routing;

        refcount_set(&kvm->users_count, 1);

        /*
         * Each address space has two memslot sets, distinguished by node_idx.
         * Set 0 is published as the initial active set.
         */
        for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                for (j = 0; j < 2; j++) {
                        slots = &kvm->__memslots[i][j];

                        atomic_long_set(&slots->last_used_slot, (unsigned long)NULL);
                        slots->hva_tree = RB_ROOT_CACHED;
                        slots->gfn_tree = RB_ROOT;
                        hash_init(slots->id_hash);
                        slots->node_idx = j;

                        /* Generations must be different for each address space. */
                        slots->generation = i;
                }

                rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][0]);
        }

        r = -ENOMEM;
        for (i = 0; i < KVM_NR_BUSES; i++) {
                rcu_assign_pointer(kvm->buses[i],
                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                if (!kvm->buses[i])
                        goto out_err_no_arch_destroy_vm;
        }

        r = kvm_arch_init_vm(kvm, type);
        if (r)
                goto out_err_no_arch_destroy_vm;

        r = kvm_enable_virtualization();
        if (r)
                goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
        INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

        r = kvm_init_mmu_notifier(kvm);
        if (r)
                goto out_err_no_mmu_notifier;

        r = kvm_coalesced_mmio_init(kvm);
        if (r < 0)
                goto out_no_coalesced_mmio;

        r = kvm_create_vm_debugfs(kvm, fdname);
        if (r)
                goto out_err_no_debugfs;

        /* From here on nothing can fail; make the VM visible on vm_list. */
        mutex_lock(&kvm_lock);
        list_add(&kvm->vm_list, &vm_list);
        mutex_unlock(&kvm_lock);

        preempt_notifier_inc();
        kvm_init_pm_notifier(kvm);

        return kvm;

        /*
         * Error unwinding.  Each label undoes the steps that had completed
         * before the corresponding failure, in reverse order of setup.
         */
out_err_no_debugfs:
        kvm_coalesced_mmio_free(kvm);
out_no_coalesced_mmio:
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
        if (kvm->mmu_notifier.ops)
                mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
        kvm_disable_virtualization();
out_err_no_disable:
        kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
        /* Warn if dropping the initial reference does not bring the count to zero. */
        WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
        /* kfree(NULL) is fine, so buses that were never allocated are harmless. */
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm_get_bus(kvm, i));
        kvm_free_irq_routing(kvm);
out_err_no_irq_routing:
        cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
        cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
        kvm_arch_free_vm(kvm);
        /* @kvm has been handed to kvm_arch_free_vm(); use current->mm, not kvm->mm. */
        mmdrop(current->mm);
        return ERR_PTR(r);
}

/*
 * Unlink every device from the VM's device list and invoke its ->destroy()
 * callback.
 */
static void kvm_destroy_devices(struct kvm *kvm)
{
        struct kvm_device *device, *next;

        /*
         * kvm->lock is not taken here: nobody else holds a reference to the
         * struct kvm at this point, so nobody else can reach the devices
         * list.
         *
         * The list is generally managed as an rculist, but list_del() is used
         * on purpose.  Should a KVM bug introduce a reader that is not backed
         * by a reference on the kvm struct, the hope is that it would trip
         * over the poisoned forward pointer rather than suffer a
         * use-after-free, even though this cannot be guaranteed.
         */
        list_for_each_entry_safe(device, next, &kvm->devices, vm_node) {
                list_del(&device->vm_node);
                device->ops->destroy(device);
        }
}

/*
 * Tear down a VM.  Called from kvm_put_kvm() once users_count has dropped to
 * zero.
 *
 * Undoes what kvm_create_vm() set up: the PM notifier, debugfs entries,
 * vm_list membership, IRQ routing, I/O buses, coalesced MMIO, the MMU
 * notifier, arch state, devices, memslots, SRCU structures, and finally the
 * VM structure itself and the mm reference.
 */
static void kvm_destroy_vm(struct kvm *kvm)
{
        int i;
        /*
         * Cache the mm: @kvm is handed to kvm_arch_free_vm() before the final
         * mmdrop() below.
         */
        struct mm_struct *mm = kvm->mm;

        kvm_destroy_pm_notifier(kvm);
        kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
        kvm_destroy_vm_debugfs(kvm);
        mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        mutex_unlock(&kvm_lock);
        kvm_arch_pre_destroy_vm(kvm);

        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

                if (bus)
                        kvm_io_bus_destroy(bus);
                kvm->buses[i] = NULL;
        }
        kvm_coalesced_mmio_free(kvm);
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
        /*
         * At this point, pending calls to invalidate_range_start()
         * have completed but no more MMU notifiers will run, so
         * mn_active_invalidate_count may remain unbalanced.
         * No threads can be waiting in kvm_swap_active_memslots() as the
         * last reference on KVM has been dropped, but freeing
         * memslots would deadlock without this manual intervention.
         *
         * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
         * notifier between a start() and end(), then there shouldn't be any
         * in-progress invalidations.
         */
        WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
        if (kvm->mn_active_invalidate_count)
                kvm->mn_active_invalidate_count = 0;
        else
                WARN_ON(kvm->mmu_invalidate_in_progress);
#else
        kvm_flush_shadow_all(kvm);
#endif
        kvm_arch_destroy_vm(kvm);
        kvm_destroy_devices(kvm);
        /*
         * Both sets are passed in, but kvm_free_memslots() only frees through
         * the set with a non-zero node_idx, so each memslot is freed once.
         */
        for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
                kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
        }
        cleanup_srcu_struct(&kvm->irq_srcu);
        cleanup_srcu_struct(&kvm->srcu);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        xa_destroy(&kvm->mem_attr_array);
#endif
        kvm_arch_free_vm(kvm);
        /* Balance preempt_notifier_inc() and kvm_enable_virtualization() from creation. */
        preempt_notifier_dec();
        kvm_disable_virtualization();
        mmdrop(mm);
}

/*
 * Take an additional reference on @kvm.  The increment is unconditional; see
 * kvm_get_kvm_safe() for the variant that refuses to take a reference once
 * the count has already reached zero.
 */
void kvm_get_kvm(struct kvm *kvm)
{
        refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/*
 * Safe version of kvm_get_kvm() for callers that cannot rule out that the VM
 * is already being destroyed: a reference is taken only if users_count is
 * still non-zero.  Returns true if a reference was taken, false otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
        return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);

/*
 * Drop a reference on @kvm.  When this was the last reference the VM is
 * destroyed before returning, so the caller must not touch @kvm afterwards.
 */
void kvm_put_kvm(struct kvm *kvm)
{
        if (!refcount_dec_and_test(&kvm->users_count))
                return;

        kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 *
 * The VM is never destroyed here; a count that does reach zero only triggers
 * a warning.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
        WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

/*
 * Release handler for a file whose private_data is the VM.  Releases the VM's
 * irqfds and then drops a reference on the VM, which destroys it if that was
 * the last one.  Always returns 0.
 *
 * NOTE(review): the reference dropped here is presumably the one owned by the
 * file; confirm against the code that installs the VM fd.
 */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
        struct kvm *kvm = filp->private_data;

        kvm_irqfd_release(kvm);

        kvm_put_kvm(kvm);
        return 0;
}

/*
 * Allocate a zeroed dirty bitmap for @memslot.  Returns 0 on success and
 * -ENOMEM if the allocation fails.
 *
 * The allocation is twice as large as the actual dirty bitmap size; see
 * kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
        memslot->dirty_bitmap = __vcalloc(2, kvm_dirty_bitmap_bytes(memslot),
                                          GFP_KERNEL_ACCOUNT);

        return memslot->dirty_bitmap ? 0 : -ENOMEM;
}

static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
        struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
        int node_idx_inactive = active->node_idx ^ 1;

        return &kvm->__memslots[as_id][node_idx_inactive];
}

/*
 * Helper to get the address space ID when one of the memslot pointers may be
 * NULL.  Also acts as a sanity check: it warns if both pointers are NULL (and
 * then returns 0), and warns if both are non-NULL but their address space IDs
 * differ (and then returns @a's ID).
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
                                  struct kvm_memory_slot *b)
{
        if (a && b) {
                WARN_ON_ONCE(a->as_id != b->as_id);
                return a->as_id;
        }

        if (WARN_ON_ONCE(!a && !b))
                return 0;

        /* Exactly one of the two is non-NULL here. */
        return a ? a->as_id : b->as_id;
}

/*
 * Insert @slot into @slots' gfn tree, keyed by base_gfn.  Hitting an existing
 * slot with the same base_gfn is treated as a fatal bug.
 */
static void kvm_insert_gfn_node(struct kvm_memslots *slots,
                                struct kvm_memory_slot *slot)
{
        struct rb_root *gfn_tree = &slots->gfn_tree;
        struct rb_node **link = &gfn_tree->rb_node;
        struct rb_node *parent = NULL;
        int idx = slots->node_idx;

        /* Walk down to the empty link where @slot belongs. */
        while (*link) {
                struct kvm_memory_slot *cur;

                parent = *link;
                cur = container_of(parent, struct kvm_memory_slot, gfn_node[idx]);

                if (slot->base_gfn < cur->base_gfn)
                        link = &parent->rb_left;
                else if (slot->base_gfn > cur->base_gfn)
                        link = &parent->rb_right;
                else
                        BUG();
        }

        rb_link_node(&slot->gfn_node[idx], parent, link);
        rb_insert_color(&slot->gfn_node[idx], gfn_tree);
}

/* Remove @slot from @slots' gfn tree, using the node that belongs to this set. */
static void kvm_erase_gfn_node(struct kvm_memslots *slots,
                               struct kvm_memory_slot *slot)
{
        int idx = slots->node_idx;

        rb_erase(&slot->gfn_node[idx], &slots->gfn_tree);
}

/*
 * Swap @old for @new in @slots' gfn tree in place.  Both slots are expected
 * to have the same base_gfn; a mismatch triggers a one-time warning but the
 * replacement is still performed.
 */
static void kvm_replace_gfn_node(struct kvm_memslots *slots,
                                 struct kvm_memory_slot *old,
                                 struct kvm_memory_slot *new)
{
        struct rb_node *victim = &old->gfn_node[slots->node_idx];
        struct rb_node *replacement = &new->gfn_node[slots->node_idx];

        WARN_ON_ONCE(old->base_gfn != new->base_gfn);

        rb_replace_node(victim, replacement, &slots->gfn_tree);
}

/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * If @new is non-NULL, its userspace_addr and npages must already be valid:
 * the hva_node range for the inactive set is derived from them here.
 *
 * Only the inactive set's id hash, hva interval tree, gfn tree and
 * last_used_slot are touched; the active set is left alone.
 */
static void kvm_replace_memslot(struct kvm *kvm,
                                struct kvm_memory_slot *old,
                                struct kvm_memory_slot *new)
{
        int as_id = kvm_memslots_get_as_id(old, new);
        struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
        int idx = slots->node_idx;

        if (old) {
                hash_del(&old->id_node[idx]);
                interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);

                /* Don't leave last_used_slot pointing at the slot being removed. */
                if ((long)old == atomic_long_read(&slots->last_used_slot))
                        atomic_long_set(&slots->last_used_slot, (long)new);

                /* Pure removal: drop @old from the gfn tree and stop. */
                if (!new) {
                        kvm_erase_gfn_node(slots, old);
                        return;
                }
        }

        /*
         * Initialize @new's hva range.  Do this even when replacing an @old
         * slot, kvm_copy_memslot() deliberately does not touch node data.
         */
        new->hva_node[idx].start = new->userspace_addr;
        new->hva_node[idx].last = new->userspace_addr +
                                  (new->npages << PAGE_SHIFT) - 1;

        /*
         * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
         * hva_node needs to be swapped with remove+insert even though hva can't
         * change when replacing an existing slot.
         */
        hash_add(slots->id_hash, &new->id_node[idx], new->id);
        interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);

        /*
         * If the memslot gfn is unchanged, rb_replace_node() can be used to
         * switch the node in the gfn tree instead of removing the old and
         * inserting the new as two separate operations. Replacement is a
         * single O(1) operation versus two O(log(n)) operations for
         * remove+insert.
         */
        if (old && old->base_gfn == new->base_gfn) {
                kvm_replace_gfn_node(slots, old, new);
        } else {
                if (old)
                        kvm_erase_gfn_node(slots, old);
                kvm_insert_gfn_node(slots, new);
        }
}

/*
 * Memslot flags that do not access any of the extra space of struct
 * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_V1_FLAGS is the
 * mask of exactly these flags.
 */
#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
        (KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)

static int check_memory_region_flags(struct kvm *kvm,
                                     const struct kvm_userspace_memory_region2 *mem)
{
        u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

        if (kvm_arch_has_private_mem(kvm))
                valid_flags |= KVM_MEM_GUEST_MEMFD;

        /* Dirty logging private memory is not currently supported. */
        if (mem->flags & KVM_MEM_GUEST_MEMFD)
                valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

        /*
         * GUEST_MEMFD is incompatible with read-only memslots, as writes to
         * read-only memslots have emulated MMIO, not page fault, semantics,
         * and KVM doesn't allow emulated MMIO for private memory.
         */
        if (kvm_arch_has_readonly_mem(kvm) &&
            !(mem->flags & KVM_MEM_GUEST_MEMFD))
                valid_flags |= KVM_MEM_READONLY;

        if (mem->flags & ~valid_flags)
                return -EINVAL;

        return 0;
}

/*
 * Make the currently inactive memslot set of address space @as_id the active
 * one and bump its generation.
 *
 * Must be called with kvm->slots_arch_lock held; the lock is RELEASED here,
 * before waiting for SRCU readers, and is not reacquired.  May sleep, both
 * while waiting for in-progress MMU notifier invalidations to finish and in
 * synchronize_srcu_expedited().
 */
static void kvm_swap_active_memslots(struct kvm *kvm, int as_id)
{
        struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);

        /* Grab the generation from the active memslots. */
        u64 gen = __kvm_memslots(kvm, as_id)->generation;

        WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
        slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

        /*
         * Do not store the new memslots while there are invalidations in
         * progress, otherwise the locking in invalidate_range_start and
         * invalidate_range_end will be unbalanced.
         */
        spin_lock(&kvm->mn_invalidate_lock);
        prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
        while (kvm->mn_active_invalidate_count) {
                /* Sleep with the spinlock dropped, recheck the count with it held. */
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&kvm->mn_invalidate_lock);
                schedule();
                spin_lock(&kvm->mn_invalidate_lock);
        }
        finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
        /* Publish the new active set while still holding mn_invalidate_lock. */
        rcu_assign_pointer(kvm->memslots[as_id], slots);
        spin_unlock(&kvm->mn_invalidate_lock);

        /*
         * Acquired in kvm_set_memslot. Must be released before synchronize
         * SRCU below in order to avoid deadlock with another thread
         * acquiring the slots_arch_lock in an srcu critical section.
         */
        mutex_unlock(&kvm->slots_arch_lock);

        synchronize_srcu_expedited(&kvm->srcu);

        /*
         * Increment the new memslot generation a second time, dropping the
         * update in-progress flag and incrementing the generation based on
         * the number of address spaces.  This provides a unique and easily
         * identifiable generation number while the memslots are in flux.
         */
        gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

        /*
         * Generations must be unique even across address spaces.  We do not need
         * a global counter for that, instead the generation space is evenly split
         * across address spaces.  For example, with two address spaces, address
         * space 0 will use generations 0, 2, 4, ... while address space 1 will
         * use generations 1, 3, 5, ...
         */
        gen += kvm_arch_nr_memslot_as_ids(kvm);

        /* The arch hook sees the final generation before it is stored in @slots. */
        kvm_arch_memslots_updated(kvm, gen);

        slots->generation = gen;
}

/*
 * Prepare for a memslot change: settle @new's dirty bitmap and then let the
 * architecture prepare via kvm_arch_prepare_memory_region().
 *
 * Returns 0 on success, the error from kvm_alloc_dirty_bitmap() if a bitmap
 * could not be allocated, or the error from the arch hook.  A bitmap that was
 * freshly allocated here is freed again if the arch hook fails.
 *
 * @new is not dereferenced when @change is KVM_MR_DELETE.
 */
static int kvm_prepare_memory_region(struct kvm *kvm,
                                     const struct kvm_memory_slot *old,
                                     struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
{
        int r;

        /*
         * If dirty logging is disabled, nullify the bitmap; the old bitmap
         * will be freed on "commit".  If logging is enabled in both old and
         * new, reuse the existing bitmap.  If logging is enabled only in the
         * new and KVM isn't using a ring buffer, allocate and initialize a
         * new bitmap.
         */
        if (change != KVM_MR_DELETE) {
                if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
                        new->dirty_bitmap = NULL;
                else if (old && old->dirty_bitmap)
                        new->dirty_bitmap = old->dirty_bitmap;
                else if (kvm_use_dirty_bitmap(kvm)) {
                        r = kvm_alloc_dirty_bitmap(new);
                        if (r)
                                return r;

                        if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                                bitmap_set(new->dirty_bitmap, 0, new->npages);
                }
        }

        r = kvm_arch_prepare_memory_region(kvm, old, new, change);

        /*
         * Free the bitmap on failure if it was allocated above.  A bitmap
         * inherited from @old is recognized by @old having one and is left
         * alone.
         */
        if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
                kvm_destroy_dirty_bitmap(new);

        return r;
}

/*
 * Commit a memslot change: update the VM-wide accounting, invoke the arch
 * commit hook, and then release whatever parts of @old are no longer needed.
 *
 * @old may be NULL (e.g. for KVM_MR_CREATE) and @new may be NULL (e.g. for
 * KVM_MR_DELETE); each is only dereferenced for change types where it is
 * expected to be valid.  Any @change value other than CREATE, DELETE, MOVE or
 * FLAGS_ONLY is treated as a fatal bug.
 */
static void kvm_commit_memory_region(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
                                     const struct kvm_memory_slot *new,
                                     enum kvm_mr_change change)
{
        int old_flags = old ? old->flags : 0;
        int new_flags = new ? new->flags : 0;
        /*
         * Update the total number of memslot pages before calling the arch
         * hook so that architectures can consume the result directly.
         */
        if (change == KVM_MR_DELETE)
                kvm->nr_memslot_pages -= old->npages;
        else if (change == KVM_MR_CREATE)
                kvm->nr_memslot_pages += new->npages;

        /*
         * Track how many memslots have dirty logging enabled; adjust the
         * count only when the flag actually toggles.  The local is named
         * 'delta' so that it does not shadow the @change parameter, which is
         * used again immediately below.
         */
        if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
                int delta = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? 1 : -1;

                atomic_set(&kvm->nr_memslots_dirty_logging,
                           atomic_read(&kvm->nr_memslots_dirty_logging) + delta);
        }

        kvm_arch_commit_memory_region(kvm, old, new, change);

        switch (change) {
        case KVM_MR_CREATE:
                /* Nothing more to do. */
                break;
        case KVM_MR_DELETE:
                /* Free the old memslot and all its metadata. */
                kvm_free_memslot(kvm, old);
                break;
        case KVM_MR_MOVE:
        case KVM_MR_FLAGS_ONLY:
                /*
                 * Free the dirty bitmap as needed; the below check encompasses
                 * both the flags and whether a ring buffer is being used.
                 */
                if (old->dirty_bitmap && !new->dirty_bitmap)
                        kvm_destroy_dirty_bitmap(old);

                /*
                 * The final quirk.  Free the detached, old slot, but only its
                 * memory, not any metadata.  Metadata, including arch specific
                 * data, may be reused by @new.
                 */
                kfree(old);
                break;
        default:
                BUG();
        }
}

/*
 * Activate @new, which the caller must already have installed in the
 * inactive slots: swap the active and inactive sets, then apply the same
 * @old -> @new replacement to the set that has just become inactive, where
 * @old is no longer reachable and can be modified safely.
 *
 * With NULL @old this simply adds @new to @active (while swapping the sets).
 * With NULL @new this simply removes @old from @active and frees it
 * (while also swapping the sets).
 */
static void kvm_activate_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *old,
                                 struct kvm_memory_slot *new)
{
        kvm_swap_active_memslots(kvm, kvm_memslots_get_as_id(old, new));

        /* Propagate the new memslot to the now inactive memslots. */
        kvm_replace_memslot(kvm, old, new);
}

/*
 * Copy the payload of @src into @dest: identity, guest and host placement,
 * flags, dirty bitmap pointer and arch data.  The tree and hash node members
 * are not touched.
 */
static void kvm_copy_memslot(struct kvm_memory_slot *dest,
                             const struct kvm_memory_slot *src)
{
        /* Identity. */
        dest->id = src->id;
        dest->as_id = src->as_id;

        /* Placement and size. */
        dest->base_gfn = src->base_gfn;
        dest->npages = src->npages;
        dest->userspace_addr = src->userspace_addr;

        /* State. */
        dest->flags = src->flags;
        dest->dirty_bitmap = src->dirty_bitmap;
        dest->arch = src->arch;
}

/*
 * Make an INVALID copy of @old the active version of the slot and flush the
 * mappings for it.
 *
 * @invalid_slot is caller-provided storage that receives the copy of @old
 * with KVM_MEMSLOT_INVALID set.
 *
 * Must be called with kvm->slots_arch_lock held.  The lock is dropped inside
 * kvm_swap_active_memslots() and reacquired before returning, so it is held
 * again on return.
 */
static void kvm_invalidate_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *invalid_slot)
{
        /*
         * Mark the current slot INVALID.  As with all memslot modifications,
         * this must be done on an unreachable slot to avoid modifying the
         * current slot in the active tree.
         */
        kvm_copy_memslot(invalid_slot, old);
        invalid_slot->flags |= KVM_MEMSLOT_INVALID;
        kvm_replace_memslot(kvm, old, invalid_slot);

        /*
         * Activate the slot that is now marked INVALID, but don't propagate
         * the slot to the now inactive slots. The slot is either going to be
         * deleted or recreated as a new slot.
         */
        kvm_swap_active_memslots(kvm, old->as_id);

        /*
         * From this point no new shadow pages pointing to a deleted, or moved,
         * memslot will be created.  Validation of sp->gfn happens in:
         *        - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
         *        - kvm_is_visible_gfn (mmu_check_root)
         */
        kvm_arch_flush_shadow_memslot(kvm, old);
        kvm_arch_guest_memory_reclaimed(kvm);

        /* Was released by kvm_swap_active_memslots(), reacquire. */
        mutex_lock(&kvm->slots_arch_lock);

        /*
         * Copy the arch-specific field of the newly-installed slot back to the
         * old slot as the arch data could have changed between releasing
         * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
         * above.  Writers are required to retrieve memslots *after* acquiring
         * slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
         */
        old->arch = invalid_slot->arch;
}

/*
 * Add a brand new memslot: install @new in the inactive set, then swap the
 * sets and install it in the other set as well (via kvm_activate_memslot()).
 */
static void kvm_create_memslot(struct kvm *kvm,
                               struct kvm_memory_slot *new)
{
        /* Add the new memslot to the inactive set and activate. */
        kvm_replace_memslot(kvm, NULL, new);
        kvm_activate_memslot(kvm, NULL, new);
}

/*
 * Remove a memslot from both memslot sets.  @old is removed from the inactive
 * set; @invalid_slot, the INVALID stand-in for @old, is the entry removed
 * from the other set after the swap.
 */
static void kvm_delete_memslot(struct kvm *kvm,
                               struct kvm_memory_slot *old,
                               struct kvm_memory_slot *invalid_slot)
{
        /*
         * Remove the old memslot (in the inactive memslots) by passing NULL as
         * the "new" slot, and for the invalid version in the active slots.
         */
        kvm_replace_memslot(kvm, old, NULL);
        kvm_activate_memslot(kvm, invalid_slot, NULL);
}

/*
 * Move a memslot: @new takes the place of @old in the inactive set and, after
 * the sets are swapped, the place of @invalid_slot (the INVALID stand-in for
 * @old) in the other set.
 */
static void kvm_move_memslot(struct kvm *kvm,
                             struct kvm_memory_slot *old,
                             struct kvm_memory_slot *new,
                             struct kvm_memory_slot *invalid_slot)
{
        /*
         * Replace the old memslot in the inactive slots, and then swap slots
         * and replace the current INVALID with the new as well.
         */
        kvm_replace_memslot(kvm, old, new);
        kvm_activate_memslot(kvm, invalid_slot, new);
}

/*
 * Apply a flags-only change: @new replaces @old in the inactive set, the sets
 * are swapped, and @new then replaces @old in the other set too.  No INVALID
 * intermediate slot is involved.
 */
static void kvm_update_flags_memslot(struct kvm *kvm,
                                     struct kvm_memory_slot *old,
                                     struct kvm_memory_slot *new)
{
        /*
         * Similar to the MOVE case, but the slot doesn't need to be zapped as
         * an intermediate step. Instead, the old memslot is simply replaced
         * with a new, updated copy in both memslot sets.
         */
        kvm_replace_memslot(kvm, old, new);
        kvm_activate_memslot(kvm, old, new);
}

/*
 * Apply a single memslot change (CREATE, DELETE, MOVE or FLAGS_ONLY).
 *
 * @old:    the current slot; not used by the CREATE helper
 * @new:    the replacement slot; not used by the DELETE helper
 * @change: which kind of update to perform
 *
 * Acquires kvm->slots_arch_lock.  Returns 0 on success, -ENOMEM if the
 * temporary INVALID slot needed for DELETE/MOVE cannot be allocated, or the
 * error returned by kvm_prepare_memory_region().
 */
static int kvm_set_memslot(struct kvm *kvm,
                           struct kvm_memory_slot *old,
                           struct kvm_memory_slot *new,
                           enum kvm_mr_change change)
{
        struct kvm_memory_slot *invalid_slot;
        int r;

        /*
         * Released in kvm_swap_active_memslots().
         *
         * Must be held from before the current memslots are copied until after
         * the new memslots are installed with rcu_assign_pointer, then
         * released before the synchronize srcu in kvm_swap_active_memslots().
         *
         * When modifying memslots outside of the slots_lock, must be held
         * before reading the pointer to the current memslots until after all
         * changes to those memslots are complete.
         *
         * These rules ensure that installing new memslots does not lose
         * changes made to the previous memslots.
         */
        mutex_lock(&kvm->slots_arch_lock);

        /*
         * Invalidate the old slot if it's being deleted or moved.  This is
         * done prior to actually deleting/moving the memslot to allow vCPUs to
         * continue running by ensuring there are no mappings or shadow pages
         * for the memslot when it is deleted/moved.  Without pre-invalidation
         * (and without a lock), a window would exist between effecting the
         * delete/move and committing the changes in arch code where KVM or a
         * guest could access a non-existent memslot.
         *
         * Modifications are done on a temporary, unreachable slot.  The old
         * slot needs to be preserved in case a later step fails and the
         * invalidation needs to be reverted.
         */
        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
                if (!invalid_slot) {
                        /* Nothing has been touched yet, just drop the lock. */
                        mutex_unlock(&kvm->slots_arch_lock);
                        return -ENOMEM;
                }
                kvm_invalidate_memslot(kvm, old, invalid_slot);
        }

        r = kvm_prepare_memory_region(kvm, old, new, change);
        if (r) {
                /*
                 * For DELETE/MOVE, revert the above INVALID change.  No
                 * modifications required since the original slot was preserved
                 * in the inactive slots.  Changing the active memslots also
                 * releases slots_arch_lock, which is why only the other
                 * changes unlock explicitly here.
                 */
                if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
                        kvm_activate_memslot(kvm, invalid_slot, old);
                        kfree(invalid_slot);
                } else {
                        mutex_unlock(&kvm->slots_arch_lock);
                }
                return r;
        }

        /*
         * For DELETE and MOVE, the temporary invalid_slot is now active as the
         * INVALID version of the old slot, and is what gets replaced in the
         * active set by the helpers below.  For CREATE, there is no old slot.
         * For DELETE and FLAGS_ONLY, the old slot is detached but otherwise
         * preserved.
         */
        if (change == KVM_MR_CREATE)
                kvm_create_memslot(kvm, new);
        else if (change == KVM_MR_DELETE)
                kvm_delete_memslot(kvm, old, invalid_slot);
        else if (change == KVM_MR_MOVE)
                kvm_move_memslot(kvm, old, new, invalid_slot);
        else if (change == KVM_MR_FLAGS_ONLY)
                kvm_update_flags_memslot(kvm, old, new);
        else
                BUG();

        /* Free the temporary INVALID slot used for DELETE and MOVE. */
        if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
                kfree(invalid_slot);

        /*
         * No need to refresh new->arch, changes after dropping slots_arch_lock
         * will directly hit the final, active memslot.  Architectures are
         * responsible for knowing that new->arch may be stale.
         */
        kvm_commit_memory_region(kvm, old, new, change);

        return 0;
}

/*
 * Returns true if any memslot in @slots other than the one identified by @id
 * is visited when iterating over the gfn range described by @start and @end.
 */
static bool kvm_check_memslot_overlap(struct kvm_memslots *slots, int id,
                                      gfn_t start, gfn_t end)
{
        struct kvm_memslot_iter iter;

        kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
                /* A slot is allowed to overlap its own previous placement. */
                if (iter.slot->id == id)
                        continue;

                return true;
        }

        return false;
}

/*
 * Create, modify (move or change flags of) or delete the memslot described by
 * @mem.  A zero memory_size requests deletion.
 *
 * The caller must hold kvm->slots_lock.
 *
 * Returns 0 on success or if there is nothing to change, -EINVAL for
 * malformed or disallowed requests, -EEXIST if the new placement overlaps a
 * different memslot, -ENOMEM if the new slot cannot be allocated, -EIO if the
 * page accounting is inconsistent on delete, or the error returned by
 * check_memory_region_flags(), kvm_gmem_bind() or kvm_set_memslot().
 */
static int kvm_set_memory_region(struct kvm *kvm,
                                 const struct kvm_userspace_memory_region2 *mem)
{
        struct kvm_memory_slot *old, *new;
        struct kvm_memslots *slots;
        enum kvm_mr_change change;
        unsigned long npages;
        gfn_t base_gfn;
        int as_id, id;
        int r;

        lockdep_assert_held(&kvm->slots_lock);

        r = check_memory_region_flags(kvm, mem);
        if (r)
                return r;

        /* mem->slot packs the address space id above a 16-bit slot id. */
        as_id = mem->slot >> 16;
        id = (u16)mem->slot;

        /* General sanity checks */
        if ((mem->memory_size & (PAGE_SIZE - 1)) ||
            (mem->memory_size != (unsigned long)mem->memory_size))
                return -EINVAL;
        if (mem->guest_phys_addr & (PAGE_SIZE - 1))
                return -EINVAL;
        /* We can read the guest memory with __xxx_user() later on. */
        if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
            (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
             !access_ok((void __user *)(unsigned long)mem->userspace_addr,
                        mem->memory_size))
                return -EINVAL;
        /* The guest_memfd offset must be page aligned and must not wrap. */
        if (mem->flags & KVM_MEM_GUEST_MEMFD &&
            (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
             mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
                return -EINVAL;
        if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
                return -EINVAL;
        /* The guest physical range must not wrap either. */
        if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
                return -EINVAL;

        /*
         * The size of userspace-defined memory regions is restricted in order
         * to play nice with dirty bitmap operations, which are indexed with an
         * "unsigned int".  KVM's internal memory regions don't support dirty
         * logging, and so are exempt.
         */
        if (id < KVM_USER_MEM_SLOTS &&
            (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
                return -EINVAL;

        slots = __kvm_memslots(kvm, as_id);

        /*
         * Note, the old memslot (and the pointer itself!) may be invalidated
         * and/or destroyed by kvm_set_memslot().
         */
        old = id_to_memslot(slots, id);

        /* A size of zero means "delete"; there must be a slot to delete. */
        if (!mem->memory_size) {
                if (!old || !old->npages)
                        return -EINVAL;

                if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
                        return -EIO;

                return kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE);
        }

        base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
        npages = (mem->memory_size >> PAGE_SHIFT);

        if (!old || !old->npages) {
                change = KVM_MR_CREATE;

                /*
                 * To simplify KVM internals, the total number of pages across
                 * all memslots must fit in an unsigned long.
                 */
                if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
                        return -EINVAL;
        } else { /* Modify an existing slot. */
                /* Private memslots are immutable, they can only be deleted. */
                if (mem->flags & KVM_MEM_GUEST_MEMFD)
                        return -EINVAL;
                /*
                 * The backing address, the size and the read-only flag of an
                 * existing slot cannot be changed.
                 */
                if ((mem->userspace_addr != old->userspace_addr) ||
                    (npages != old->npages) ||
                    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
                        return -EINVAL;

                if (base_gfn != old->base_gfn)
                        change = KVM_MR_MOVE;
                else if (mem->flags != old->flags)
                        change = KVM_MR_FLAGS_ONLY;
                else /* Nothing to change. */
                        return 0;
        }

        /* Only CREATE and MOVE place the slot at a new guest address. */
        if ((change == KVM_MR_CREATE || change == KVM_MR_MOVE) &&
            kvm_check_memslot_overlap(slots, id, base_gfn, base_gfn + npages))
                return -EEXIST;

        /* Allocate a slot that will persist in the memslot. */
        new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
        if (!new)
                return -ENOMEM;

        new->as_id = as_id;
        new->id = id;
        new->base_gfn = base_gfn;
        new->npages = npages;
        new->flags = mem->flags;
        new->userspace_addr = mem->userspace_addr;
        if (mem->flags & KVM_MEM_GUEST_MEMFD) {
                r = kvm_gmem_bind(kvm, new, mem->guest_memfd, mem->guest_memfd_offset);
                if (r)
                        goto out;
        }

        r = kvm_set_memslot(kvm, old, new, change);
        if (r)
                goto out_unbind;

        return 0;

        /* Error unwinding: undo the guest_memfd binding, then free the slot. */
out_unbind:
        if (mem->flags & KVM_MEM_GUEST_MEMFD)
                kvm_gmem_unbind(new);
out:
        kfree(new);
        return r;
}

/*
 * Set up a KVM-internal memslot.  Internal slots must use an id at or above
 * KVM_USER_MEM_SLOTS and must not carry any flags; violating either rule
 * triggers a one-time warning and fails with -EINVAL.  Otherwise the request
 * is handed to kvm_set_memory_region() and its result is returned.
 */
int kvm_set_internal_memslot(struct kvm *kvm,
                             const struct kvm_userspace_memory_region2 *mem)
{
        if (WARN_ON_ONCE(mem->slot < KVM_USER_MEM_SLOTS) ||
            WARN_ON_ONCE(mem->flags))
                return -EINVAL;

        return kvm_set_memory_region(kvm, mem);
}
EXPORT_SYMBOL_GPL(kvm_set_internal_memslot);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
                                          struct kvm_userspace_memory_region2 *mem)
{
        if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
                return -EINVAL;

        guard(mutex)(&kvm->slots_lock);
        return kvm_set_memory_region(kvm, mem);
}

#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - copy a memslot's dirty bitmap to userspace
 * @kvm:        pointer to kvm instance
 * @log:        slot id and address to which we copy the log
 * @is_dirty:        set to '1' if any dirty pages were found
 * @memslot:        set to the associated memslot, always valid on success
 *
 * Return: 0 on success, -ENXIO if the dirty bitmap is not in use, -EINVAL
 * for an out-of-range address space or slot id, -ENOENT if the slot does not
 * exist or has no dirty bitmap, -EFAULT if the copy to userspace fails.
 */
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
                      int *is_dirty, struct kvm_memory_slot **memslot)
{
        struct kvm_memory_slot *slot;
        unsigned long nr_bytes, nr_words;
        unsigned long found = 0;
        int as_id, id, i;

        /* Dirty ring tracking may be exclusive to dirty log tracking */
        if (!kvm_use_dirty_bitmap(kvm))
                return -ENXIO;

        *memslot = NULL;
        *is_dirty = 0;

        as_id = log->slot >> 16;
        id = (u16)log->slot;
        if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;

        /* The lookup result is published to the caller even on -ENOENT. */
        slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
        *memslot = slot;
        if (!slot || !slot->dirty_bitmap)
                return -ENOENT;

        kvm_arch_sync_dirty_log(kvm, slot);

        nr_bytes = kvm_dirty_bitmap_bytes(slot);
        nr_words = nr_bytes / sizeof(long);

        /* Scan only until the first non-zero word is seen. */
        for (i = 0; i < nr_words; ++i) {
                found = slot->dirty_bitmap[i];
                if (found)
                        break;
        }

        if (copy_to_user(log->dirty_bitmap, slot->dirty_bitmap, nr_bytes))
                return -EFAULT;

        if (found)
                *is_dirty = 1;

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *        and reenable dirty page tracking for the corresponding pages.
 * @kvm:        pointer to kvm instance
 * @log:        slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently. So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Flush TLB's if needed.
 *
 * Return: 0 on success, -ENXIO if the dirty bitmap is not in use, -EINVAL
 * for an out-of-range address space or slot id, -ENOENT if the slot does not
 * exist or has no dirty bitmap, -EFAULT if the copy to userspace fails.
 */
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int i, as_id, id;
        unsigned long n;
        unsigned long *dirty_bitmap;
        unsigned long *dirty_bitmap_buffer;
        bool flush;

        /* Dirty ring tracking may be exclusive to dirty log tracking */
        if (!kvm_use_dirty_bitmap(kvm))
                return -ENXIO;

        /* log->slot packs the address space id above a 16-bit slot id. */
        as_id = log->slot >> 16;
        id = (u16)log->slot;
        if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;

        slots = __kvm_memslots(kvm, as_id);
        memslot = id_to_memslot(slots, id);
        if (!memslot || !memslot->dirty_bitmap)
                return -ENOENT;

        dirty_bitmap = memslot->dirty_bitmap;

        kvm_arch_sync_dirty_log(kvm, memslot);

        n = kvm_dirty_bitmap_bytes(memslot);
        flush = false;
        if (kvm->manual_dirty_log_protect) {
                /*
                 * With manual protection the live bitmap is copied out as-is
                 * and "flush" stays false, because no flush is needed until
                 * KVM_CLEAR_DIRTY_LOG.  There is some code duplication between
                 * this function and kvm_get_dirty_log, but hopefully all
                 * architecture transition to kvm_get_dirty_log_protect and
                 * kvm_get_dirty_log can be eliminated.
                 */
                dirty_bitmap_buffer = dirty_bitmap;
        } else {
                /* Snapshot into the second bitmap, starting from all-clear. */
                dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
                memset(dirty_bitmap_buffer, 0, n);

                KVM_MMU_LOCK(kvm);
                for (i = 0; i < n / sizeof(long); i++) {
                        unsigned long mask;
                        gfn_t offset;

                        if (!dirty_bitmap[i])
                                continue;

                        flush = true;
                        /* Atomically fetch and clear one word of dirty bits. */
                        mask = xchg(&dirty_bitmap[i], 0);
                        dirty_bitmap_buffer[i] = mask;

                        /* Page offset within the slot of bit 0 of this word. */
                        offset = i * BITS_PER_LONG;
                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                offset, mask);
                }
                KVM_MMU_UNLOCK(kvm);
        }

        if (flush)
                kvm_flush_remote_tlbs_memslot(kvm, memslot);

        if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
                return -EFAULT;
        return 0;
}


/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm: kvm instance
 * @log: slot id and address to which we copy the log
 *
 * Steps 1-4 below provide general overview of dirty page logging. See
 * kvm_get_dirty_log_protect() function description for additional details.
 *
 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 * always flush the TLB (step 4) even if previous step failed  and the dirty
 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
 * does not preclude user space subsequent dirty log read. Flushing TLB ensures
 * writes will be marked dirty for next log read.
 *
 *   1. Take a snapshot of the bit and clear it if needed.
 *   2. Write protect the corresponding page.
 *   3. Copy the snapshot to the userspace.
 *   4. Flush TLB's if needed.
 */
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
                                      struct kvm_dirty_log *log)
{
        int r;

        mutex_lock(&kvm->slots_lock);

        r = kvm_get_dirty_log_protect(kvm, log);

        mutex_unlock(&kvm->slots_lock);
        return r;
}

/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *        and reenable dirty page tracking for the corresponding pages.
 * @kvm:        pointer to kvm instance
 * @log:        slot id and address from which to fetch the bitmap of dirty pages
 *
 * Return: 0 on success, -ENXIO if the dirty bitmap is not in use, -EINVAL
 * for an out-of-range slot or a misaligned/out-of-bounds page range, -ENOENT
 * if the slot does not exist or has no dirty bitmap, -EFAULT if the bitmap
 * cannot be copied from userspace.
 */
static int kvm_clear_dirty_log_protect(struct kvm *kvm,
                                       struct kvm_clear_dirty_log *log)
{
        struct kvm_memslots *slots;
        struct kvm_memory_slot *memslot;
        int as_id, id;
        gfn_t offset;
        unsigned long i, n;
        unsigned long *dirty_bitmap;
        unsigned long *dirty_bitmap_buffer;
        bool flush;

        /* Dirty ring tracking may be exclusive to dirty log tracking */
        if (!kvm_use_dirty_bitmap(kvm))
                return -ENXIO;

        /* log->slot packs the address space id above a 16-bit slot id. */
        as_id = log->slot >> 16;
        id = (u16)log->slot;
        if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
                return -EINVAL;

        /* The first page must be a multiple of 64. */
        if (log->first_page & 63)
                return -EINVAL;

        slots = __kvm_memslots(kvm, as_id);
        memslot = id_to_memslot(slots, id);
        if (!memslot || !memslot->dirty_bitmap)
                return -ENOENT;

        dirty_bitmap = memslot->dirty_bitmap;

        /* Bytes to fetch from userspace: num_pages bits, in whole longs. */
        n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;

        /*
         * The range must lie within the memslot, and num_pages must be a
         * multiple of 64 unless the range extends to the end of the memslot.
         */
        if (log->first_page > memslot->npages ||
            log->num_pages > memslot->npages - log->first_page ||
            (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
            return -EINVAL;

        kvm_arch_sync_dirty_log(kvm, memslot);

        flush = false;
        dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
        if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
                return -EFAULT;

        KVM_MMU_LOCK(kvm);
        /*
         * From here on "n" is reused as the number of longs left to process,
         * "i" indexes the memslot's dirty bitmap and "offset" is the page
         * offset within the memslot of bit 0 of the current long.
         */
        for (offset = log->first_page, i = offset / BITS_PER_LONG,
                 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
             i++, offset += BITS_PER_LONG) {
                unsigned long mask = *dirty_bitmap_buffer++;
                atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
                if (!mask)
                        continue;

                /* Clear the requested bits; keep only those that were set. */
                mask &= atomic_long_fetch_andnot(mask, p);

                /*
                 * mask contains the bits that really have been cleared.  This
                 * never includes any bits beyond the length of the memslot (if
                 * the length is not aligned to 64 pages), therefore it is not
                 * a problem if userspace sets them in log->dirty_bitmap.
                 */
                if (mask) {
                        flush = true;
                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                offset, mask);
                }
        }
        KVM_MMU_UNLOCK(kvm);

        if (flush)
                kvm_flush_remote_tlbs_memslot(kvm, memslot);

        return 0;
}

static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
                                        struct kvm_clear_dirty_log *log)
{
        int r;

        mutex_lock(&kvm->slots_lock);

        r = kvm_clear_dirty_log_protect(kvm, log);

        mutex_unlock(&kvm->slots_lock);
        return r;
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/*
 * Returns the mask of memory attributes that may be set.  A NULL @kvm, or a
 * VM for which kvm_arch_has_private_mem() is true, yields
 * KVM_MEMORY_ATTRIBUTE_PRIVATE; any other VM yields no attributes at all.
 */
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
        if (kvm && !kvm_arch_has_private_mem(kvm))
                return 0;

        return KVM_MEMORY_ATTRIBUTE_PRIVATE;
}

/*
 * Returns true if _all_ gfns in the range [@start, @end) have attributes
 * such that the bits in @mask match @attrs.
 *
 * @mask is first restricted to the attributes supported for @kvm; if @attrs
 * contains any bit outside the restricted mask, the result is false.
 */
bool kvm_range_has_memory_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
                                     unsigned long mask, unsigned long attrs)
{
        XA_STATE(xas, &kvm->mem_attr_array, start);
        unsigned long index;
        void *entry;

        mask &= kvm_supported_mem_attributes(kvm);
        if (attrs & ~mask)
                return false;

        /* Single-gfn range: a direct lookup, no xarray walk. */
        if (end == start + 1)
                return (kvm_get_memory_attributes(kvm, start) & mask) == attrs;

        /* The xarray walk below runs inside an RCU read-side section. */
        guard(rcu)();
        /* No attributes wanted: true only if xas_find() comes back empty. */
        if (!attrs)
                return !xas_find(&xas, end - 1);

        for (index = start; index < end; index++) {
                /* Advance the cursor, repeating while xas_retry() asks to. */
                do {
                        entry = xas_next(&xas);
                } while (xas_retry(&xas, entry));

                /*
                 * Fail if the cursor is not positioned at @index or the value
                 * stored there does not match @attrs under @mask.
                 */
                if (xas.xa_index != index ||
                    (xa_to_value(entry) & mask) != attrs)
                        return false;
        }

        return true;
}

/*
 * Invoke range->handler on the part of [range->start, range->end) covered by
 * each memslot, across all address spaces.
 *
 * mmu_lock is taken lazily, when the first overlapping memslot is found, and
 * range->on_lock (if provided) is called once right after it is taken.  If
 * range->flush_on_ret is set and any handler returned true, remote TLBs are
 * flushed before mmu_lock is dropped.  If no memslot overlaps the range,
 * mmu_lock is never taken and neither callback is invoked.
 */
static __always_inline void kvm_handle_gfn_range(struct kvm *kvm,
                                                 struct kvm_mmu_notifier_range *range)
{
        struct kvm_gfn_range gfn_range;
        struct kvm_memory_slot *slot;
        struct kvm_memslots *slots;
        struct kvm_memslot_iter iter;
        bool found_memslot = false;
        bool ret = false;
        int i;

        gfn_range.arg = range->arg;
        gfn_range.may_block = range->may_block;

        /*
         * If/when KVM supports more attributes beyond private vs. shared, this
         * _could_ set KVM_FILTER_{SHARED,PRIVATE} appropriately if the entire target
         * range already has the desired private vs. shared state (it's unclear
         * if that is a net win).  For now, KVM reaches this point if and only
         * if the private flag is being toggled, i.e. all mappings are in play.
         */

        for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
                slots = __kvm_memslots(kvm, i);

                kvm_for_each_memslot_in_gfn_range(&iter, slots, range->start, range->end) {
                        slot = iter.slot;
                        gfn_range.slot = slot;

                        /* Clamp the requested range to this memslot. */
                        gfn_range.start = max(range->start, slot->base_gfn);
                        gfn_range.end = min(range->end, slot->base_gfn + slot->npages);
                        if (gfn_range.start >= gfn_range.end)
                                continue;

                        /* First hit: take mmu_lock and run on_lock once. */
                        if (!found_memslot) {
                                found_memslot = true;
                                KVM_MMU_LOCK(kvm);
                                if (!IS_KVM_NULL_FN(range->on_lock))
                                        range->on_lock(kvm);
                        }

                        ret |= range->handler(kvm, &gfn_range);
                }
        }

        if (range->flush_on_ret && ret)
                kvm_flush_remote_tlbs(kvm);

        if (found_memslot)
                KVM_MMU_UNLOCK(kvm);
}

/*
 * Handler run on each gfn range before new memory attributes are stored:
 * records the range as being invalidated, then gives the arch code its
 * pre-set hook.  Returns the arch hook's result.
 */
static bool kvm_pre_set_memory_attributes(struct kvm *kvm,
                                          struct kvm_gfn_range *range)
{
        /*
         * Unconditionally add the range to the invalidation set, regardless of
         * whether or not the arch callback actually needs to zap SPTEs.  E.g.
         * if KVM supports RWX attributes in the future and the attributes are
         * going from R=>RW, zapping isn't strictly necessary.  Unconditionally
         * adding the range allows KVM to require that MMU invalidations add at
         * least one range between begin() and end(), e.g. allows KVM to detect
         * bugs where the add() is missed.  Relaxing the rule *might* be safe,
         * but it's not obvious that allowing new mappings while the attributes
         * are in flux is desirable or worth the complexity.
         */
        kvm_mmu_invalidate_range_add(kvm, range->start, range->end);

        return kvm_arch_pre_set_memory_attributes(kvm, range);
}

/*
 * Set @attributes for the gfn range [@start, @end).
 *
 * Runs with kvm->slots_lock held.  Storage for every gfn is reserved up
 * front so that the stores themselves are not expected to fail; if a
 * reservation fails, its error is returned before any attribute is stored.
 * The stores are bracketed by a "pre" pass (MMU invalidation begin + arch
 * pre-set hook, with a TLB flush if any hook asks for one) and a "post" pass
 * (arch post-set hook + MMU invalidation end).
 *
 * Returns 0 on success, including when the whole range already has the
 * requested attributes, or a negative errno.
 */
static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
                                     unsigned long attributes)
{
        struct kvm_mmu_notifier_range pre_set_range = {
                .start = start,
                .end = end,
                .arg.attributes = attributes,
                .handler = kvm_pre_set_memory_attributes,
                .on_lock = kvm_mmu_invalidate_begin,
                .flush_on_ret = true,
                .may_block = true,
        };
        struct kvm_mmu_notifier_range post_set_range = {
                .start = start,
                .end = end,
                .arg.attributes = attributes,
                .handler = kvm_arch_post_set_memory_attributes,
                .on_lock = kvm_mmu_invalidate_end,
                .may_block = true,
        };
        unsigned long i;
        void *entry;
        int r = 0;

        /* Clearing all attributes is done by storing NULL. */
        entry = attributes ? xa_mk_value(attributes) : NULL;

        mutex_lock(&kvm->slots_lock);

        /* Nothing to do if the entire range has the desired attributes. */
        if (kvm_range_has_memory_attributes(kvm, start, end, ~0, attributes))
                goto out_unlock;

        /*
         * Reserve memory ahead of time to avoid having to deal with failures
         * partway through setting the new attributes.
         */
        for (i = start; i < end; i++) {
                r = xa_reserve(&kvm->mem_attr_array, i, GFP_KERNEL_ACCOUNT);
                if (r)
                        goto out_unlock;

                /*
                 * The range size is controlled by userspace and can be very
                 * large; give the scheduler a chance to run between entries.
                 */
                cond_resched();
        }

        kvm_handle_gfn_range(kvm, &pre_set_range);

        for (i = start; i < end; i++) {
                r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
                                    GFP_KERNEL_ACCOUNT));
                KVM_BUG_ON(r, kvm);

                /* Same reasoning as for the reservation loop above. */
                cond_resched();
        }

        kvm_handle_gfn_range(kvm, &post_set_range);

out_unlock:
        mutex_unlock(&kvm->slots_lock);

        return r;
}
/*
 * Validate a userspace memory-attributes request and convert its byte range
 * into a gfn range for kvm_vm_set_mem_attributes().  Any malformed request
 * (non-zero flags, unsupported attribute bits, empty or wrapping range,
 * address or size not page aligned) fails with -EINVAL.
 */
static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
                                           struct kvm_memory_attributes *attrs)
{
        gfn_t first_gfn, end_gfn;

        /*
         * xarray tracks data using "unsigned long", and as a result so does
         * KVM.  For simplicity, supports generic attributes only on 64-bit
         * architectures.
         */
        BUILD_BUG_ON(sizeof(attrs->attributes) != sizeof(unsigned long));

        /* flags is currently not used. */
        if (attrs->flags ||
            (attrs->attributes & ~kvm_supported_mem_attributes(kvm)))
                return -EINVAL;

        /* Reject empty ranges and ranges whose end wraps around. */
        if (attrs->size == 0 || attrs->address + attrs->size < attrs->address)
                return -EINVAL;

        if (!PAGE_ALIGNED(attrs->address) || !PAGE_ALIGNED(attrs->size))
                return -EINVAL;

        first_gfn = attrs->address >> PAGE_SHIFT;
        end_gfn = (attrs->address + attrs->size) >> PAGE_SHIFT;

        return kvm_vm_set_mem_attributes(kvm, first_gfn, end_gfn,
                                         attrs->attributes);
}
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */

/* Look up @gfn in the memslots returned by kvm_memslots() for @kvm. */
struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memslots *slots = kvm_memslots(kvm);

        return __gfn_to_memslot(slots, gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

/*
 * Find the memslot containing @gfn for @vcpu, consulting the vCPU's cached
 * last-used slot first.  Returns NULL if no memslot contains @gfn.
 */
struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
        u64 gen = slots->generation;
        struct kvm_memory_slot *slot;

        /*
         * Drop the cached slot whenever the generation has changed.  This
         * also protects against using a memslot from a different address
         * space, since different address spaces have different generation
         * numbers.
         */
        if (unlikely(gen != vcpu->last_used_slot_gen)) {
                vcpu->last_used_slot = NULL;
                vcpu->last_used_slot_gen = gen;
        }

        slot = try_get_memslot(vcpu->last_used_slot, gfn);
        if (!slot) {
                /*
                 * Cache miss, search all memslots.  search_memslots() is used
                 * on purpose instead of __gfn_to_memslot() so that the VM-wide
                 * last_used_slot in kvm_memslots is not thrashed.
                 */
                slot = search_memslots(slots, gfn, false);
                if (slot)
                        vcpu->last_used_slot = slot;
        }

        return slot;
}

/* Returns kvm_is_visible_memslot() for the memslot that @gfn resolves to. */
bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
        return kvm_is_visible_memslot(gfn_to_memslot(kvm, gfn));
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

/* Per-vCPU variant: resolves @gfn through kvm_vcpu_gfn_to_memslot(). */
bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        return kvm_is_visible_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn));
}
EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);

/*
 * Returns the kernel page size of the VMA that find_vma() reports for the
 * host virtual address of @gfn.  Falls back to PAGE_SIZE if @gfn has no valid
 * hva or if no VMA is found.
 */
unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        unsigned long size = PAGE_SIZE;
        struct vm_area_struct *vma;
        unsigned long addr;

        addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return PAGE_SIZE;

        mmap_read_lock(current->mm);

        vma = find_vma(current->mm, addr);
        if (vma)
                size = vma_kernel_pagesize(vma);

        mmap_read_unlock(current->mm);

        return size;
}

/* True if @slot has KVM_MEM_READONLY set in its flags. */
static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
{
        return (slot->flags & KVM_MEM_READONLY) != 0;
}

/*
 * Translate @gfn to a host virtual address using @slot.
 *
 * Returns KVM_HVA_ERR_BAD if @slot is NULL or flagged KVM_MEMSLOT_INVALID, and
 * KVM_HVA_ERR_RO_BAD if @write is requested on a read-only slot.  Otherwise,
 * if @nr_pages is non-NULL it receives the number of pages from @gfn to the
 * end of the slot, and the translated address is returned.
 */
static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
                                       gfn_t *nr_pages, bool write)
{
        if (!slot)
                return KVM_HVA_ERR_BAD;

        if (slot->flags & KVM_MEMSLOT_INVALID)
                return KVM_HVA_ERR_BAD;

        if (write && memslot_is_readonly(slot))
                return KVM_HVA_ERR_RO_BAD;

        if (nr_pages) {
                gfn_t pages_before = gfn - slot->base_gfn;

                *nr_pages = slot->npages - pages_before;
        }

        return __gfn_to_hva_memslot(slot, gfn);
}

/* Write-access flavour of __gfn_to_hva_many(). */
static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                     gfn_t *nr_pages)
{
        unsigned long hva;

        hva = __gfn_to_hva_many(slot, gfn, nr_pages, true);

        return hva;
}

/*
 * Translate @gfn to a host virtual address for write access through @slot,
 * without reporting the number of remaining pages.
 */
unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
                                        gfn_t gfn)
{
        return __gfn_to_hva_many(slot, gfn, NULL, true);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

/* Resolve @gfn to its memslot via gfn_to_memslot(), then to an hva. */
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

        return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

/* Per-vCPU variant: resolves @gfn through kvm_vcpu_gfn_to_memslot(). */
unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

        return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Translate @gfn to an hva for read access and optionally report whether the
 * backing slot is writable.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: if not NULL and the hva is valid, set to true when @slot is not
 * read-only and to false when it is; left untouched otherwise
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
                                      gfn_t gfn, bool *writable)
{
        unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

        if (kvm_is_error_hva(hva) || !writable)
                return hva;

        *writable = !memslot_is_readonly(slot);

        return hva;
}

/* gfn_to_hva_memslot_prot() on the memslot that gfn_to_memslot() finds. */
unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
        return gfn_to_hva_memslot_prot(gfn_to_memslot(kvm, gfn), gfn, writable);
}

/* Per-vCPU variant: resolves @gfn through kvm_vcpu_gfn_to_memslot(). */
unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
        return gfn_to_hva_memslot_prot(kvm_vcpu_gfn_to_memslot(vcpu, gfn),
                                       gfn, writable);
}

static bool kvm_is_ad_tracked_page(struct page *page)
{
        /*
         * Per page-flags.h, pages tagged PG_reserved "should in general not be
         * touched (e.g. set dirty) except by its owner".
         */
        return !PageReserved(page);
}

/* Mark @page dirty, unless it is excluded by kvm_is_ad_tracked_page(). */
static void kvm_set_page_dirty(struct page *page)
{
        if (!kvm_is_ad_tracked_page(page))
                return;

        SetPageDirty(page);
}

/* Mark @page accessed, unless it is a page whose A/D state KVM must not touch. */
static void kvm_set_page_accessed(struct page *page)
{
        if (!kvm_is_ad_tracked_page(page))
                return;

        mark_page_accessed(page);
}

/*
 * Mark @page accessed and drop a reference to it.  A NULL @page is ignored.
 */
void kvm_release_page_clean(struct page *page)
{
        if (page) {
                kvm_set_page_accessed(page);
                put_page(page);
        }
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

/*
 * Mark @page dirty, then release it exactly as kvm_release_page_clean() would.
 * A NULL @page is ignored.
 */
void kvm_release_page_dirty(struct page *page)
{
        if (page) {
                kvm_set_page_dirty(page);
                kvm_release_page_clean(page);
        }
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

/*
 * Publish the result of a successful lookup through @kfp and return the pfn.
 *
 * Exactly one of @page and @map is expected to be non-NULL (a warning fires
 * otherwise).  The pfn is taken from @map when one is supplied, else derived
 * from @page.  @page, possibly NULL, is always stored in
 * *kfp->refcounted_page, and @writable is reported via kfp->map_writable when
 * the caller provided that output.
 */
static kvm_pfn_t kvm_resolve_pfn(struct kvm_follow_pfn *kfp, struct page *page,
                                 struct follow_pfnmap_args *map, bool writable)
{
        kvm_pfn_t pfn;

        WARN_ON_ONCE(!!page == !!map);

        if (kfp->map_writable)
                *kfp->map_writable = writable;

        if (!map)
                pfn = page_to_pfn(page);
        else
                pfn = map->pfn;

        *kfp->refcounted_page = page;

        return pfn;
}

/*
 * The fast path to get a writable pfn for kfp->hva.  On success the pfn is
 * stored in @pfn and true is returned; otherwise false is returned and @pfn
 * is left untouched.
 */
static bool hva_to_pfn_fast(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
{
        struct page *page;
        bool got_page;

        /*
         * Try the fast-only path when the caller wants to pin/get the page for
         * writing.  If the caller only wants to read the page, KVM must go
         * down the full, slow path in order to avoid racing an operation that
         * breaks Copy-on-Write (CoW), e.g. so that KVM doesn't end up pointing
         * at the old, read-only page while mm/ points at a new, writable page.
         */
        if (!(kfp->flags & FOLL_WRITE) && !kfp->map_writable)
                return false;

        if (kfp->pin)
                got_page = pin_user_pages_fast(kfp->hva, 1, FOLL_WRITE, &page) == 1;
        else
                got_page = get_user_page_fast_only(kfp->hva, FOLL_WRITE, &page);

        if (!got_page)
                return false;

        *pfn = kvm_resolve_pfn(kfp, page, NULL, true);
        return true;
}

/*
 * The slow path to get the pfn of the specified host virtual address.
 *
 * Pins (kfp->pin) or takes a reference to the single page backing kfp->hva
 * using the "unlocked" GUP variants.  On success the pfn is stored in @pfn,
 * the result is published through kvm_resolve_pfn(), and 1 is returned.  Any
 * other value returned by the GUP call (e.g. -errno) is passed back to the
 * caller unchanged and @pfn is left untouched.
 */
static int hva_to_pfn_slow(struct kvm_follow_pfn *kfp, kvm_pfn_t *pfn)
{
        /*
         * When a VCPU accesses a page that is not mapped into the secondary
         * MMU, we lookup the page using GUP to map it, so the guest VCPU can
         * make progress. We always want to honor NUMA hinting faults in that
         * case, because GUP usage corresponds to memory accesses from the VCPU.
         * Otherwise, we'd not trigger NUMA hinting faults once a page is
         * mapped into the secondary MMU and gets accessed by a VCPU.
         *
         * Note that get_user_page_fast_only() and FOLL_WRITE for now
         * implicitly honor NUMA hinting faults and don't need this flag.
         */
        unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT | kfp->flags;
        struct page *page, *wpage;
        int npages;

        if (kfp->pin)
                npages = pin_user_pages_unlocked(kfp->hva, 1, &page, flags);
        else
                npages = get_user_pages_unlocked(kfp->hva, 1, &page, flags);
        if (npages != 1)
                return npages;

        /*
         * Pinning is mutually exclusive with opportunistically mapping a read
         * fault as writable, as KVM should never pin pages when mapping memory
         * into the guest (pinning is only for direct accesses from KVM).
         */
        if (WARN_ON_ONCE(kfp->map_writable && kfp->pin))
                goto out;

        /*
         * Map read fault as writable if possible: if a writable page can be
         * obtained via the fast-only path, swap it in for the page acquired
         * above and report the mapping as writable.
         */
        if (!(flags & FOLL_WRITE) && kfp->map_writable &&
            get_user_page_fast_only(kfp->hva, FOLL_WRITE, &wpage)) {
                put_page(page);
                page = wpage;
                flags |= FOLL_WRITE;
        }

out:
        *pfn = kvm_resolve_pfn(kfp, page, NULL, flags & FOLL_WRITE);
        return npages;
}

/*
 * Check that @vma permits the access: it must always be readable, and must
 * additionally be writable when @write_fault is set.
 */
static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
        if (unlikely(!(vma->vm_flags & VM_READ)))
                return false;

        /* Write permission only matters when servicing a write fault. */
        if (!write_fault)
                return true;

        if (unlikely(!(vma->vm_flags & VM_WRITE)))
                return false;

        return true;
}

/*
 * Resolve kfp->hva to a pfn for a VMA that the caller identified as VM_IO or
 * VM_PFNMAP, using the follow_pfnmap_{start,end}() API.
 *
 * Returns -EINVAL if pinning was requested and unsafe mappings are disallowed,
 * -EAGAIN if fixup_user_fault() reported 'unlocked', or whatever non-zero
 * value fixup_user_fault()/follow_pfnmap_start() failed with.  Otherwise the
 * value of the last successful follow_pfnmap_start() is returned and @p_pfn
 * is set to either the resolved pfn or, for a write fault on a non-writable
 * mapping, KVM_PFN_ERR_RO_FAULT.
 */
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
                               struct kvm_follow_pfn *kfp, kvm_pfn_t *p_pfn)
{
        struct follow_pfnmap_args args = { .vma = vma, .address = kfp->hva };
        bool write_fault = kfp->flags & FOLL_WRITE;
        int r;

        /*
         * Remapped memory cannot be pinned in any meaningful sense.  Bail if
         * the caller wants to pin the page, i.e. access the page outside of
         * MMU notifier protection, and unsafe mappings are disallowed.
         */
        if (kfp->pin && !allow_unsafe_mappings)
                return -EINVAL;

        r = follow_pfnmap_start(&args);
        if (r) {
                /*
                 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
                 * not call the fault handler, so do it here.
                 */
                bool unlocked = false;
                r = fixup_user_fault(current->mm, kfp->hva,
                                     (write_fault ? FAULT_FLAG_WRITE : 0),
                                     &unlocked);
                if (unlocked)
                        return -EAGAIN;
                if (r)
                        return r;

                /* The fault was handled, retry the lookup once. */
                r = follow_pfnmap_start(&args);
                if (r)
                        return r;
        }

        /* A write was requested but the mapping is not writable. */
        if (write_fault && !args.writable) {
                *p_pfn = KVM_PFN_ERR_RO_FAULT;
                goto out;
        }

        *p_pfn = kvm_resolve_pfn(kfp, NULL, &args, args.writable);
out:
        follow_pfnmap_end(&args);
        return r;
}

/*
 * Translate kfp->hva into a host pfn, or into one of the KVM_PFN_ERR_* values
 * on failure.  May sleep.  kfp->refcounted_page must be non-NULL.
 *
 * The fast GUP path is tried first, then the slow path.  A slow-path result
 * of -EINTR or -EAGAIN is reported as KVM_PFN_ERR_SIGPENDING and -EHWPOISON
 * as KVM_PFN_ERR_HWPOISON.  For any other failure the VMA covering the hva is
 * inspected under mmap_read_lock(): VM_IO/VM_PFNMAP mappings are resolved via
 * hva_to_pfn_remapped(); for any other VMA, KVM_PFN_ERR_NEEDS_IO is returned
 * if the caller passed FOLL_NOWAIT and the VMA permits the access, and
 * KVM_PFN_ERR_FAULT otherwise.
 */
kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp)
{
        struct vm_area_struct *vma;
        kvm_pfn_t pfn;
        int npages, r;

        might_sleep();

        if (WARN_ON_ONCE(!kfp->refcounted_page))
                return KVM_PFN_ERR_FAULT;

        if (hva_to_pfn_fast(kfp, &pfn))
                return pfn;

        npages = hva_to_pfn_slow(kfp, &pfn);
        if (npages == 1)
                return pfn;
        if (npages == -EINTR || npages == -EAGAIN)
                return KVM_PFN_ERR_SIGPENDING;
        if (npages == -EHWPOISON)
                return KVM_PFN_ERR_HWPOISON;

        mmap_read_lock(current->mm);
retry:
        vma = vma_lookup(current->mm, kfp->hva);

        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
                r = hva_to_pfn_remapped(vma, kfp, &pfn);
                /* -EAGAIN asks for the VMA lookup to be redone. */
                if (r == -EAGAIN)
                        goto retry;
                if (r < 0)
                        pfn = KVM_PFN_ERR_FAULT;
        } else {
                if ((kfp->flags & FOLL_NOWAIT) &&
                    vma_is_valid(vma, kfp->flags & FOLL_WRITE))
                        pfn = KVM_PFN_ERR_NEEDS_IO;
                else
                        pfn = KVM_PFN_ERR_FAULT;
        }
        mmap_read_unlock(current->mm);
        return pfn;
}

/*
 * Translate kfp->gfn to an hva via kfp->slot, then resolve that hva to a pfn.
 *
 * Returns KVM_PFN_ERR_RO_FAULT if the hva lookup yields KVM_HVA_ERR_RO_BAD,
 * KVM_PFN_NOSLOT for any other error hva, and the result of hva_to_pfn()
 * otherwise.
 */
static kvm_pfn_t kvm_follow_pfn(struct kvm_follow_pfn *kfp)
{
        kfp->hva = __gfn_to_hva_many(kfp->slot, kfp->gfn, NULL,
                                     kfp->flags & FOLL_WRITE);

        if (kfp->hva == KVM_HVA_ERR_RO_BAD)
                return KVM_PFN_ERR_RO_FAULT;
        if (kvm_is_error_hva(kfp->hva))
                return KVM_PFN_NOSLOT;

        /*
         * A read-only memslot can never be mapped writable: report that to
         * the caller now and stop tracking the output further down.
         */
        if (kfp->map_writable && memslot_is_readonly(kfp->slot)) {
                *kfp->map_writable = false;
                kfp->map_writable = NULL;
        }

        return hva_to_pfn(kfp);
}

/*
 * Resolve @gfn in @slot to a pfn using GUP flags @foll.
 *
 * Both @writable and @refcounted_page are mandatory outputs; if either is
 * NULL a warning fires and KVM_PFN_ERR_FAULT is returned.  They are reset to
 * false/NULL before the lookup is attempted.
 */
kvm_pfn_t __kvm_faultin_pfn(const struct kvm_memory_slot *slot, gfn_t gfn,
                            unsigned int foll, bool *writable,
                            struct page **refcounted_page)
{
        struct kvm_follow_pfn kfp;

        if (WARN_ON_ONCE(!writable || !refcounted_page))
                return KVM_PFN_ERR_FAULT;

        *writable = false;
        *refcounted_page = NULL;

        kfp = (struct kvm_follow_pfn) {
                .slot = slot,
                .gfn = gfn,
                .flags = foll,
                .map_writable = writable,
                .refcounted_page = refcounted_page,
        };

        return kvm_follow_pfn(&kfp);
}
EXPORT_SYMBOL_GPL(__kvm_faultin_pfn);

/*
 * Grab up to @nr_pages writable pages starting at @gfn using the fast-only
 * GUP path.  Returns -1 if @gfn has no valid hva in @slot, 0 if fewer than
 * @nr_pages pages are available from @gfn, and otherwise the result of
 * get_user_pages_fast_only().
 */
int kvm_prefetch_pages(struct kvm_memory_slot *slot, gfn_t gfn,
                       struct page **pages, int nr_pages)
{
        gfn_t nr_avail = 0;
        unsigned long hva = gfn_to_hva_many(slot, gfn, &nr_avail);

        if (kvm_is_error_hva(hva))
                return -1;

        if (nr_avail < nr_pages)
                return 0;

        return get_user_pages_fast_only(hva, nr_pages, FOLL_WRITE, pages);
}
EXPORT_SYMBOL_GPL(kvm_prefetch_pages);

/*
 * Don't use this API unless you are absolutely, positively certain that KVM
 * needs to get a struct page, e.g. to pin the page for firmware DMA.
 *
 * Returns the refcounted page backing @gfn, or NULL if the lookup produced
 * none; the pfn itself is discarded.
 *
 * FIXME: Users of this API likely need to FOLL_PIN the page, not just elevate
 *          its refcount.
 */
struct page *__gfn_to_page(struct kvm *kvm, gfn_t gfn, bool write)
{
        struct page *page = NULL;
        struct kvm_follow_pfn kfp = {
                .slot = gfn_to_memslot(kvm, gfn),
                .gfn = gfn,
                .flags = write ? FOLL_WRITE : 0,
                .refcounted_page = &page,
        };

        /* Only the page matters to callers, the pfn is intentionally dropped. */
        (void)kvm_follow_pfn(&kfp);

        return page;
}
EXPORT_SYMBOL_GPL(__gfn_to_page);

/*
 * Map the guest page at @gfn into the host kernel and describe the mapping in
 * @map so that it can later be released with kvm_vcpu_unmap().
 *
 * @vcpu:     vCPU on whose behalf the mapping is created; the memslot is
 *            looked up through the vCPU's view of guest memory so that the
 *            mapping agrees with the dirty logging done by kvm_vcpu_unmap(),
 *            which also resolves the gfn via the vCPU.
 * @gfn:      guest frame number to map
 * @map:      output descriptor; fully reinitialized before the lookup
 * @writable: request write access (FOLL_WRITE) and record it in @map
 *
 * The backing page, if any, is pinned and stored in map->pinned_page.  Pfns
 * with a struct page are mapped with kmap(); other pfns are mapped with
 * memremap() when CONFIG_HAS_IOMEM is enabled and are otherwise left unmapped.
 *
 * Returns 0 on success, -EINVAL if @gfn could not be resolved to a pfn, and
 * -EFAULT if no host virtual address could be established.
 */
int __kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
                   bool writable)
{
        struct kvm_follow_pfn kfp = {
                /*
                 * Use the vCPU-specific memslots, not the VM-wide default, to
                 * stay consistent with kvm_vcpu_mark_page_dirty() on unmap.
                 */
                .slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn),
                .gfn = gfn,
                .flags = writable ? FOLL_WRITE : 0,
                .refcounted_page = &map->pinned_page,
                .pin = true,
        };

        map->pinned_page = NULL;
        map->page = NULL;
        map->hva = NULL;
        map->gfn = gfn;
        map->writable = writable;

        map->pfn = kvm_follow_pfn(&kfp);
        if (is_error_noslot_pfn(map->pfn))
                return -EINVAL;

        if (pfn_valid(map->pfn)) {
                map->page = pfn_to_page(map->pfn);
                map->hva = kmap(map->page);
#ifdef CONFIG_HAS_IOMEM
        } else {
                map->hva = memremap(pfn_to_hpa(map->pfn), PAGE_SIZE, MEMREMAP_WB);
#endif
        }

        /* hva stays NULL if neither mapping method applied or it failed. */
        return map->hva ? 0 : -EFAULT;
}
EXPORT_SYMBOL_GPL(__kvm_vcpu_map);

/*
 * Tear down a mapping described by @map.  A @map without an hva is treated as
 * not mapped and ignored.  For writable mappings the gfn is marked dirty on
 * behalf of @vcpu; if a page was pinned it is marked dirty (when writable)
 * and accessed before being unpinned.  @map's hva, page and pinned_page are
 * cleared on return.
 */
void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map)
{
        if (!map->hva)
                return;

        /* Undo kmap() for page-backed mappings, memremap() otherwise. */
        if (map->page)
                kunmap(map->page);
#ifdef CONFIG_HAS_IOMEM
        else
                memunmap(map->hva);
#endif

        if (map->writable)
                kvm_vcpu_mark_page_dirty(vcpu, map->gfn);

        if (map->pinned_page) {
                if (map->writable)
                        kvm_set_page_dirty(map->pinned_page);
                kvm_set_page_accessed(map->pinned_page);
                unpin_user_page(map->pinned_page);
        }

        map->hva = NULL;
        map->page = NULL;
        map->pinned_page = NULL;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);

/*
 * Return how many of the remaining @len bytes can be handled within the
 * current page when starting at @offset, i.e. @len clamped to the space left
 * in the page.
 */
static int next_segment(unsigned long len, int offset)
{
        return len > PAGE_SIZE - offset ? PAGE_SIZE - offset : len;
}

/*
 * Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data.
 * The access must not cross a page boundary.  Returns 0 on success, -EFAULT
 * on any failure.
 */
static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
                                 void *data, int offset, int len)
{
        unsigned long hva;

        if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
                return -EFAULT;

        hva = gfn_to_hva_memslot_prot(slot, gfn, NULL);
        if (kvm_is_error_hva(hva))
                return -EFAULT;

        if (__copy_from_user(data, (void __user *)hva + offset, len))
                return -EFAULT;

        return 0;
}

/* Read from a single guest page, locating the memslot through @kvm. */
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
                        int len)
{
        return __kvm_read_guest_page(gfn_to_memslot(kvm, gfn), gfn, data,
                                     offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

/* Read from a single guest page, locating the memslot through @vcpu. */
int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data,
                             int offset, int len)
{
        return __kvm_read_guest_page(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
                                     data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);

/*
 * Copy @len bytes of guest memory starting at @gpa into @data, one page-sized
 * segment at a time.  Returns 0 on success or the first negative error
 * reported by kvm_read_guest_page().
 */
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg, ret;

        for (;;) {
                seg = next_segment(len, offset);
                if (seg == 0)
                        break;

                ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
                if (ret < 0)
                        return ret;

                /* Only the first segment can start mid-page. */
                offset = 0;
                len -= seg;
                data += seg;
                ++gfn;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

/*
 * Copy @len bytes of guest memory starting at @gpa into @data on behalf of
 * @vcpu, one page-sized segment at a time.  Returns 0 on success or the first
 * negative error reported by kvm_vcpu_read_guest_page().
 */
int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg, ret;

        for (;;) {
                seg = next_segment(len, offset);
                if (seg == 0)
                        break;

                ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
                if (ret < 0)
                        return ret;

                /* Only the first segment can start mid-page. */
                offset = 0;
                len -= seg;
                data += seg;
                ++gfn;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);

/*
 * Copy @len bytes from guest memory at '(@gfn * PAGE_SIZE) + @offset' to @data
 * with page faults disabled around the copy.  The access must not cross a
 * page boundary.  Returns 0 on success, -EFAULT on any failure.
 */
static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
                                   void *data, int offset, unsigned long len)
{
        unsigned long hva;
        int left;

        if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
                return -EFAULT;

        hva = gfn_to_hva_memslot_prot(slot, gfn, NULL);
        if (kvm_is_error_hva(hva))
                return -EFAULT;

        pagefault_disable();
        left = __copy_from_user_inatomic(data, (void __user *)hva + offset, len);
        pagefault_enable();

        return left ? -EFAULT : 0;
}

/*
 * Read @len bytes at @gpa into @data with page faults disabled, using @vcpu's
 * memslots.  The range must lie within a single page.
 */
int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
                               void *data, unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t gfn = gpa >> PAGE_SHIFT;

        return __kvm_read_guest_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
                                       data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);

/*
 * Copy @len bytes from @data into guest memory at
 * '(@gfn * PAGE_SIZE) + @offset' and mark the page dirty in @memslot.  The
 * access must not cross a page boundary.  Returns 0 on success, -EFAULT on
 * any failure.
 */
static int __kvm_write_guest_page(struct kvm *kvm,
                                  struct kvm_memory_slot *memslot, gfn_t gfn,
                                  const void *data, int offset, int len)
{
        unsigned long hva;

        if (WARN_ON_ONCE(offset + len > PAGE_SIZE))
                return -EFAULT;

        hva = gfn_to_hva_memslot(memslot, gfn);
        if (kvm_is_error_hva(hva))
                return -EFAULT;

        if (__copy_to_user((void __user *)hva + offset, data, len))
                return -EFAULT;

        mark_page_dirty_in_slot(kvm, memslot, gfn);
        return 0;
}

/* Write to a single guest page, locating the memslot through @kvm. */
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
                         const void *data, int offset, int len)
{
        return __kvm_write_guest_page(kvm, gfn_to_memslot(kvm, gfn), gfn, data,
                                      offset, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

/* Write to a single guest page, locating the memslot through @vcpu. */
int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
                              const void *data, int offset, int len)
{
        return __kvm_write_guest_page(vcpu->kvm,
                                      kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn,
                                      data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);

/*
 * Copy @len bytes from @data into guest memory starting at @gpa, one
 * page-sized segment at a time.  Returns 0 on success or the first negative
 * error reported by kvm_write_guest_page().
 */
int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
                    unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg, ret;

        for (;;) {
                seg = next_segment(len, offset);
                if (seg == 0)
                        break;

                ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
                if (ret < 0)
                        return ret;

                /* Only the first segment can start mid-page. */
                offset = 0;
                len -= seg;
                data += seg;
                ++gfn;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest);

/*
 * Copy @len bytes from @data into guest memory starting at @gpa on behalf of
 * @vcpu, one page-sized segment at a time.  Returns 0 on success or the first
 * negative error reported by kvm_vcpu_write_guest_page().
 */
int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
                         unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg, ret;

        for (;;) {
                seg = next_segment(len, offset);
                if (seg == 0)
                        break;

                ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
                if (ret < 0)
                        return ret;

                /* Only the first segment can start mid-page. */
                offset = 0;
                len -= seg;
                data += seg;
                ++gfn;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);

/*
 * (Re)initialize @ghc to describe the guest physical range starting at @gpa
 * and spanning @len bytes, using the memslots in @slots.
 *
 * ghc->generation is always refreshed from @slots, even on failure.  Returns
 * -EINVAL (with ghc->hva set to KVM_HVA_ERR_BAD) if the computed end gfn
 * precedes the start gfn, -EFAULT if any gfn in the range lacks a valid hva,
 * and 0 otherwise.  When the range fits in a single page, ghc->hva points at
 * @gpa itself and ghc->memslot stays valid; for multi-page ranges
 * ghc->memslot is cleared so that users fall back to the uncached slow path.
 */
static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
                                       struct gfn_to_hva_cache *ghc,
                                       gpa_t gpa, unsigned long len)
{
        int offset = offset_in_page(gpa);
        gfn_t start_gfn = gpa >> PAGE_SHIFT;
        gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
        gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
        gfn_t nr_pages_avail;

        /* Update ghc->generation before performing any error checks. */
        ghc->generation = slots->generation;

        if (start_gfn > end_gfn) {
                ghc->hva = KVM_HVA_ERR_BAD;
                return -EINVAL;
        }

        /*
         * If the requested region crosses two memslots, we still
         * verify that the entire region is valid here.
         */
        for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
                ghc->memslot = __gfn_to_memslot(slots, start_gfn);
                ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
                                           &nr_pages_avail);
                if (kvm_is_error_hva(ghc->hva))
                        return -EFAULT;
        }

        /* Use the slow path for cross page reads and writes. */
        if (nr_pages_needed == 1)
                ghc->hva += offset;
        else
                ghc->memslot = NULL;

        ghc->gpa = gpa;
        ghc->len = len;
        return 0;
}

/* Initialize @ghc for [@gpa, @gpa + @len) against @kvm's current memslots. */
int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                              gpa_t gpa, unsigned long len)
{
        return __kvm_gfn_to_hva_cache_init(kvm_memslots(kvm), ghc, gpa, len);
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);

/*
 * Write @len bytes from @data to the guest range cached in @ghc, starting
 * @offset bytes into that range.
 *
 * The cache is re-initialized first if the memslot generation changed.
 * Returns -EINVAL if the access exceeds the cached length, -EFAULT if the
 * cache cannot be refreshed, holds an error hva, or the copy fails, and
 * otherwise 0 (or the result of kvm_write_guest() when the cache has no
 * memslot and the uncached path is used).
 */
int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                                  void *data, unsigned int offset,
                                  unsigned long len)
{
        struct kvm_memslots *slots = kvm_memslots(kvm);
        gpa_t gpa = ghc->gpa + offset;

        if (WARN_ON_ONCE(len + offset > ghc->len))
                return -EINVAL;

        if (slots->generation != ghc->generation &&
            __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
                return -EFAULT;

        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;

        /* No cached memslot: take the uncached path. */
        if (unlikely(!ghc->memslot))
                return kvm_write_guest(kvm, gpa, data, len);

        if (__copy_to_user((void __user *)ghc->hva + offset, data, len))
                return -EFAULT;

        mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT);
        return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);

/* Write @len bytes from @data to the start of the range cached in @ghc. */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                           void *data, unsigned long len)
{
        int ret = kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);

        return ret;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);

/*
 * Read @len bytes into @data from the guest range cached in @ghc, starting
 * @offset bytes into that range.
 *
 * The cache is re-initialized first if the memslot generation changed.
 * Returns -EINVAL if the access exceeds the cached length, -EFAULT if the
 * cache cannot be refreshed, holds an error hva, or the copy fails, and
 * otherwise 0 (or the result of kvm_read_guest() when the cache has no
 * memslot and the uncached path is used).
 */
int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                                 void *data, unsigned int offset,
                                 unsigned long len)
{
        struct kvm_memslots *slots = kvm_memslots(kvm);
        gpa_t gpa = ghc->gpa + offset;

        if (WARN_ON_ONCE(len + offset > ghc->len))
                return -EINVAL;

        if (slots->generation != ghc->generation &&
            __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
                return -EFAULT;

        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;

        /* No cached memslot: take the uncached path. */
        if (unlikely(!ghc->memslot))
                return kvm_read_guest(kvm, gpa, data, len);

        if (__copy_from_user(data, (void __user *)ghc->hva + offset, len))
                return -EFAULT;

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);

/* Read @len bytes into @data from the start of the range cached in @ghc. */
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
                          void *data, unsigned long len)
{
        int ret = kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);

        return ret;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);

/*
 * Zero @len bytes of guest memory starting at @gpa by writing from the
 * kernel's zero page, one page-sized segment at a time.  Returns 0 on success
 * or the first negative error reported by kvm_write_guest_page().
 */
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
        const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
        int offset = offset_in_page(gpa);
        gfn_t gfn = gpa >> PAGE_SHIFT;
        int seg, ret;

        for (;;) {
                seg = next_segment(len, offset);
                if (seg == 0)
                        break;

                ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, seg);
                if (ret < 0)
                        return ret;

                /* Only the first segment can start mid-page. */
                offset = 0;
                len -= seg;
                ++gfn;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

/*
 * Record @gfn as dirty for @memslot.  Nothing is recorded if @memslot is NULL
 * or does not have dirty tracking enabled.  When the VM has a dirty ring and
 * a vCPU is running on this task, the entry is pushed onto that vCPU's ring;
 * otherwise the memslot's dirty bitmap, if present, is updated.
 */
void mark_page_dirty_in_slot(struct kvm *kvm,
                             const struct kvm_memory_slot *memslot,
                             gfn_t gfn)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
        unsigned long rel_gfn;
        u32 slot;

#ifdef CONFIG_HAVE_KVM_DIRTY_RING
        if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
                return;

        WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
#endif

        if (!memslot || !kvm_slot_dirty_track_enabled(memslot))
                return;

        rel_gfn = gfn - memslot->base_gfn;
        slot = (memslot->as_id << 16) | memslot->id;

        if (kvm->dirty_ring_size && vcpu)
                kvm_dirty_ring_push(vcpu, slot, rel_gfn);
        else if (memslot->dirty_bitmap)
                set_bit_le(rel_gfn, memslot->dirty_bitmap);
}
EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);

/* Mark @gfn dirty in whichever of @kvm's memslots contains it. */
void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
        mark_page_dirty_in_slot(kvm, gfn_to_memslot(kvm, gfn), gfn);
}
EXPORT_SYMBOL_GPL(mark_page_dirty);

/* Mark @gfn dirty in whichever of @vcpu's memslots contains it. */
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        mark_page_dirty_in_slot(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, gfn),
                                gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);

/*
 * Install @vcpu's signal mask on the current task if one is active, saving
 * the previous mask in current->real_blocked.
 */
void kvm_sigset_activate(struct kvm_vcpu *vcpu)
{
        /*
         * This does a lockless modification of ->real_blocked, which is fine
         * because, only current can change ->real_blocked and all readers of
         * ->real_blocked don't care as long ->real_blocked is always a subset
         * of ->blocked.
         */
        if (vcpu->sigset_active)
                sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
}

/*
 * Undo kvm_sigset_activate(): restore the mask saved in current->real_blocked
 * and then empty the saved copy.  Does nothing if no vCPU sigset is active.
 */
void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
{
        if (vcpu->sigset_active) {
                sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
                sigemptyset(&current->real_blocked);
        }
}

/*
 * Grow @vcpu's halt-polling window by the halt_poll_ns_grow factor, starting
 * from at least halt_poll_ns_grow_start.  A grow factor of zero leaves the
 * window unchanged.  The old and new values are always traced.
 */
static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
{
        const unsigned int old = vcpu->halt_poll_ns;
        unsigned int grow_start = READ_ONCE(halt_poll_ns_grow_start);
        unsigned int grow = READ_ONCE(halt_poll_ns_grow);
        unsigned int val = old;

        if (grow) {
                val *= grow;
                if (val < grow_start)
                        val = grow_start;

                vcpu->halt_poll_ns = val;
        }

        trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
}

/*
 * Shrink @vcpu's halt-polling window by the halt_poll_ns_shrink divisor.  A
 * divisor of zero, or a result below halt_poll_ns_grow_start, resets the
 * window to zero.  The old and new values are traced.
 */
static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
{
        const unsigned int old = vcpu->halt_poll_ns;
        unsigned int shrink = READ_ONCE(halt_poll_ns_shrink);
        unsigned int grow_start = READ_ONCE(halt_poll_ns_grow_start);
        unsigned int val;

        val = shrink ? old / shrink : 0;
        if (val < grow_start)
                val = 0;

        vcpu->halt_poll_ns = val;
        trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
}

static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
{
        int ret = -EINTR;
        int idx = srcu_read_lock(&vcpu->kvm->srcu);

        if (kvm_arch_vcpu_runnable(vcpu))
                goto out;
        if (kvm_cpu_has_pending_timer(vcpu))
                goto out;
        if (signal_pending(current))
                goto out;
        if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
                goto out;

        ret = 0;
out:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        return ret;
}

/*
 * Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
 * pending.  This is mostly used when halting a vCPU, but may also be used
 * directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
 *
 * Returns true if the task called schedule() at least once, false if a wake
 * condition was already present on the first check.
 */
bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
        struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
        bool waited = false;

        /* Flag the vCPU as blocking in its stats for the duration of the wait. */
        vcpu->stat.generic.blocking = 1;

        preempt_disable();
        kvm_arch_vcpu_blocking(vcpu);
        prepare_to_rcuwait(wait);
        preempt_enable();

        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);

                /* Stop waiting as soon as any wake condition is observed. */
                if (kvm_vcpu_check_block(vcpu) < 0)
                        break;

                waited = true;
                schedule();
        }

        preempt_disable();
        finish_rcuwait(wait);
        kvm_arch_vcpu_unblocking(vcpu);
        preempt_enable();

        vcpu->stat.generic.blocking = 0;

        return waited;
}

/*
 * Account one halt-polling attempt that ran from @start to @end.  The elapsed
 * time is added to either the success or the failure totals and histogram
 * depending on @success; successful polls that ended on an invalid wakeup are
 * counted separately as well.
 */
static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
                                          ktime_t end, bool success)
{
        struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
        u64 poll_ns = ktime_to_ns(ktime_sub(end, start));

        ++stats->halt_attempted_poll;

        if (!success) {
                stats->halt_poll_fail_ns += poll_ns;
                KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
                return;
        }

        ++stats->halt_successful_poll;

        if (!vcpu_valid_wakeup(vcpu))
                ++stats->halt_poll_invalid;

        stats->halt_poll_success_ns += poll_ns;
        KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
}

/*
 * Return the upper bound for @vcpu's halt-polling window: the per-VM
 * kvm->max_halt_poll_ns if the VM overrides it (kvm->override_halt_poll_ns),
 * else the global halt_poll_ns.
 */
static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        if (kvm->override_halt_poll_ns) {
                /*
                 * Ensure kvm->max_halt_poll_ns is not read before
                 * kvm->override_halt_poll_ns.
                 *
                 * Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
                 */
                smp_rmb();
                return READ_ONCE(kvm->max_halt_poll_ns);
        }

        return READ_ONCE(halt_poll_ns);
}

/*
 * Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc...  If halt
 * polling is enabled, busy wait for a short time before blocking to avoid the
 * expensive block+unblock sequence if a wake event arrives soon after the vCPU
 * is halted.
 *
 * After the vCPU wakes, the halt statistics are updated and the per-vCPU
 * polling window (vcpu->halt_poll_ns) is grown, shrunk or reset based on how
 * long the vCPU was halted relative to the current maximum.
 */
void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
{
        unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
        bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
        ktime_t start, cur, poll_end;
        bool waited = false;
        bool do_halt_poll;
        u64 halt_ns;

        /* Clamp the polling window to the current maximum. */
        if (vcpu->halt_poll_ns > max_halt_poll_ns)
                vcpu->halt_poll_ns = max_halt_poll_ns;

        do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;

        start = cur = poll_end = ktime_get();
        if (do_halt_poll) {
                ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);

                do {
                        /* A wake condition arrived while polling, skip blocking. */
                        if (kvm_vcpu_check_block(vcpu) < 0)
                                goto out;
                        cpu_relax();
                        poll_end = cur = ktime_get();
                } while (kvm_vcpu_can_poll(cur, stop));
        }

        waited = kvm_vcpu_block(vcpu);

        cur = ktime_get();
        if (waited) {
                /* Time spent blocked, i.e. excluding the polling phase. */
                vcpu->stat.generic.halt_wait_ns +=
                        ktime_to_ns(cur) - ktime_to_ns(poll_end);
                KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
                                ktime_to_ns(cur) - ktime_to_ns(poll_end));
        }
out:
        /* The total time the vCPU was "halted", including polling time. */
        halt_ns = ktime_to_ns(cur) - ktime_to_ns(start);

        /*
         * Note, halt-polling is considered successful so long as the vCPU was
         * never actually scheduled out, i.e. even if the wake event arrived
         * after the halt-polling loop itself, but before the full wait.
         */
        if (do_halt_poll)
                update_halt_poll_stats(vcpu, start, poll_end, !waited);

        if (halt_poll_allowed) {
                /* Recompute the max halt poll time in case it changed. */
                max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);

                if (!vcpu_valid_wakeup(vcpu)) {
                        shrink_halt_poll_ns(vcpu);
                } else if (max_halt_poll_ns) {
                        /* the halt fit within the polling window, leave it alone */
                        if (halt_ns <= vcpu->halt_poll_ns)
                                ;
                        /* we had a long block, shrink polling */
                        else if (vcpu->halt_poll_ns &&
                                 halt_ns > max_halt_poll_ns)
                                shrink_halt_poll_ns(vcpu);
                        /* we had a short halt and our poll time is too small */
                        else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
                                 halt_ns < max_halt_poll_ns)
                                grow_halt_poll_ns(vcpu);
                } else {
                        /* A zero maximum resets the per-vCPU window to zero. */
                        vcpu->halt_poll_ns = 0;
                }
        }

        trace_kvm_vcpu_wakeup(halt_ns, waited, vcpu_valid_wakeup(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_vcpu_halt);

/*
 * Try to wake @vcpu.  If __kvm_vcpu_wake_up() reports success, the vCPU is
 * flagged ready and the halt_wakeup stat is bumped.  Returns whether a wakeup
 * happened.
 */
bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
{
        if (!__kvm_vcpu_wake_up(vcpu))
                return false;

        WRITE_ONCE(vcpu->ready, true);
        ++vcpu->stat.generic.halt_wakeup;

        return true;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);

#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 *
 * If waking the vCPU succeeds nothing else is done.  Otherwise, when called
 * for the vCPU running on this CPU, the vCPU's mode is moved from
 * IN_GUEST_MODE to EXITING_GUEST_MODE; for any other vCPU a reschedule IPI is
 * sent to the CPU it was last seen on, provided the arch code says a kick is
 * warranted and that CPU is a different, valid, online CPU.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
        int me, cpu;

        /* A successful wakeup is all the kick that is needed. */
        if (kvm_vcpu_wake_up(vcpu))
                return;

        me = get_cpu();
        /*
         * The only state change done outside the vcpu mutex is IN_GUEST_MODE
         * to EXITING_GUEST_MODE.  Therefore the moderately expensive "should
         * kick" check does not need atomic operations if kvm_vcpu_kick is used
         * within the vCPU thread itself.
         */
        if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
                if (vcpu->mode == IN_GUEST_MODE)
                        WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
                goto out;
        }

        /*
         * Note, the vCPU could get migrated to a different pCPU at any point
         * after kvm_arch_vcpu_should_kick(), which could result in sending an
         * IPI to the previous pCPU.  But, that's ok because the purpose of the
         * IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
         * vCPU also requires it to leave IN_GUEST_MODE.
         */
        if (kvm_arch_vcpu_should_kick(vcpu)) {
                cpu = READ_ONCE(vcpu->cpu);
                /* Never IPI ourselves, an out-of-range CPU id, or an offline CPU. */
                if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
                        smp_send_reschedule(cpu);
        }
out:
        put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
#endif /* !CONFIG_S390 */

/*
 * Attempt a directed yield to the task recorded in @target->pid.
 *
 * Returns 0 when the pid lock cannot be taken immediately or when no task
 * can be resolved from the pid; otherwise returns the result of yield_to().
 */
int kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
        struct task_struct *tsk;
        int yielded;

        /* Never block on the lock, simply give up if it is contended. */
        if (!read_trylock(&target->pid_lock))
                return 0;

        /* Take a task reference while the pid is stable under the lock. */
        tsk = target->pid ? get_pid_task(target->pid, PIDTYPE_PID) : NULL;
        read_unlock(&target->pid_lock);

        if (!tsk)
                return 0;

        yielded = yield_to(tsk, 1);
        put_task_struct(tsk);

        return yielded;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);

/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * Most eligible candidate to yield is decided by following heuristics:
 *
 *  (a) VCPU which has not done pl-exit or cpu relax intercepted recently
 *  (preempted lock holder), indicated by @in_spin_loop.
 *  Set at the beginning and cleared at the end of interception/PLE handler.
 *
 *  (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
 *  chance last time (mostly it has become eligible now since we have probably
 *  yielded to lockholder in last iteration. This is done by toggling
 *  @dy_eligible each time a VCPU checked for eligibility.)
 *
 *  Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
 *  to preempted lock-holder could result in wrong VCPU selection and CPU
 *  burning. Giving priority for a potential lock-holder increases lock
 *  progress.
 *
 *  Since the algorithm is based on heuristics, accessing another VCPU's data
 *  without locking does no harm. It may result in trying to yield to the same
 *  VCPU, failing, and continuing with the next VCPU and so on.
 */
static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
        bool was_eligible;

        /* A vCPU that is not itself spinning is always a candidate. */
        if (!vcpu->spin_loop.in_spin_loop)
                return true;

        /*
         * The vCPU is spinning too: report its current eligibility and flip
         * the flag so that the answer alternates between consecutive checks.
         */
        was_eligible = vcpu->spin_loop.dy_eligible;
        kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);

        return was_eligible;
#else
        return true;
#endif
}

/*
 * Unlike kvm_arch_vcpu_runnable, this function is called outside
 * a vcpu_load/vcpu_put pair.  However, for most architectures
 * kvm_arch_vcpu_runnable does not require vcpu_load.
 *
 * This is the default (__weak) implementation, which simply forwards to
 * kvm_arch_vcpu_runnable(); an architecture may provide its own definition.
 */
bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
        return kvm_arch_vcpu_runnable(vcpu);
}

/*
 * Report whether @vcpu is runnable for the purposes of directed yield: either
 * the architecture says so, or (with CONFIG_KVM_ASYNC_PF) the vCPU has
 * entries on its async page fault "done" list.
 */
static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
{
        bool runnable = kvm_arch_dy_runnable(vcpu);

#ifdef CONFIG_KVM_ASYNC_PF
        /* Only consult the async #PF list if the arch check said "no". */
        runnable = runnable || !list_empty_careful(&vcpu->async_pf.done);
#endif

        return runnable;
}

/*
 * By default, simply query the target vCPU's current mode when checking if a
 * vCPU was preempted in kernel mode.  All architectures except x86 (or more
 * specifically, except VMX) allow querying whether or not a vCPU is in kernel
 * mode even if the vCPU is NOT loaded, i.e. using kvm_arch_vcpu_in_kernel()
 * directly for cross-vCPU checks is functionally correct and accurate.
 *
 * Declared __weak so that an architecture can supply its own implementation.
 */
bool __weak kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
{
        return kvm_arch_vcpu_in_kernel(vcpu);
}

/*
 * Default (__weak) implementation of the "does @vcpu have a pending
 * interrupt" query used by kvm_vcpu_on_spin() when choosing a yield target.
 * Always reports false; an architecture may provide its own definition.
 */
bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
        return false;
}

/*
 * Called when vCPU @me is spinning; try to yield the pCPU to another vCPU of
 * the same VM that is more likely to make forward progress.
 *
 * @me:                   the spinning vCPU.
 * @yield_to_kernel_mode: if true, skip preempted candidates that neither have
 *                        a pending interrupt nor were preempted in kernel mode.
 *
 * Nothing is done for VMs with fewer than two online vCPUs.  The scan stops
 * after the first successful yield, or once kvm_vcpu_yield_to() has returned
 * a negative value three times.
 */
void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
{
        int nr_vcpus, start, i, idx, yielded;
        struct kvm *kvm = me->kvm;
        struct kvm_vcpu *vcpu;
        int try = 3;

        nr_vcpus = atomic_read(&kvm->online_vcpus);
        if (nr_vcpus < 2)
                return;

        /* Pairs with the smp_wmb() in kvm_vm_ioctl_create_vcpu(). */
        smp_rmb();

        kvm_vcpu_set_in_spin_loop(me, true);

        /*
         * The current vCPU ("me") is spinning in kernel mode, i.e. is likely
         * waiting for a resource to become available.  Attempt to yield to a
         * vCPU that is runnable, but not currently running, e.g. because the
         * vCPU was preempted by a higher priority task.  With luck, the vCPU
         * that was preempted is holding a lock or some other resource that the
         * current vCPU is waiting to acquire, and yielding to the other vCPU
         * will allow it to make forward progress and release the lock (or kick
         * the spinning vCPU, etc).
         *
         * Since KVM has no insight into what exactly the guest is doing,
         * approximate a round-robin selection by iterating over all vCPUs,
         * starting at the last boosted vCPU.  I.e. if N=kvm->last_boosted_vcpu,
         * iterate over vCPU[N+1]..vCPU[N-1], wrapping as needed.
         *
         * Note, this is inherently racy, e.g. if multiple vCPUs are spinning,
         * they may all try to yield to the same vCPU(s).  But as above, this
         * is all best effort due to KVM's lack of visibility into the guest.
         */
        start = READ_ONCE(kvm->last_boosted_vcpu) + 1;
        for (i = 0; i < nr_vcpus; i++) {
                idx = (start + i) % nr_vcpus;
                if (idx == me->vcpu_idx)
                        continue;

                vcpu = xa_load(&kvm->vcpu_array, idx);
                if (!READ_ONCE(vcpu->ready))
                        continue;
                if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
                        continue;

                /*
                 * Treat the target vCPU as being in-kernel if it has a pending
                 * interrupt, as the vCPU trying to yield may be spinning
                 * waiting on IPI delivery, i.e. the target vCPU is in-kernel
                 * for the purposes of directed yield.
                 */
                if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
                    !kvm_arch_dy_has_pending_interrupt(vcpu) &&
                    !kvm_arch_vcpu_preempted_in_kernel(vcpu))
                        continue;

                if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
                        continue;

                yielded = kvm_vcpu_yield_to(vcpu);
                if (yielded > 0) {
                        /*
                         * Record the index of the vCPU that was boosted, NOT
                         * the loop counter: "i" is relative to "start", and
                         * the next scan begins at last_boosted_vcpu + 1.
                         */
                        WRITE_ONCE(kvm->last_boosted_vcpu, idx);
                        break;
                } else if (yielded < 0 && !--try) {
                        break;
                }
        }
        kvm_vcpu_set_in_spin_loop(me, false);

        /* Ensure vcpu is not eligible during next spinloop */
        kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

/*
 * Report whether page offset @pgoff of a vCPU mapping falls inside the dirty
 * ring window, i.e. within dirty_ring_size / PAGE_SIZE pages starting at
 * KVM_DIRTY_LOG_PAGE_OFFSET.  Always false without CONFIG_HAVE_KVM_DIRTY_RING.
 */
static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff)
{
#ifdef CONFIG_HAVE_KVM_DIRTY_RING
        if (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET)
                return false;

        return pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
                       kvm->dirty_ring_size / PAGE_SIZE;
#else
        return false;
#endif
}

/*
 * Page fault handler for mappings of a vCPU file descriptor.
 *
 * Selects the backing page from the faulting page offset: offset 0 is the
 * vCPU's run structure, followed (depending on configuration) by the PIO data
 * page, the coalesced MMIO ring and the dirty ring pages.  Any other offset
 * is handed to kvm_arch_vcpu_fault().
 *
 * Returns 0 with vmf->page set (and a page reference taken) for the offsets
 * handled here, otherwise whatever kvm_arch_vcpu_fault() returns.
 */
static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
{
        struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
        struct page *page;

        if (vmf->pgoff == 0)
                page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef CONFIG_KVM_MMIO
        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
        else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff))
                page = kvm_dirty_ring_get_page(
                    &vcpu->dirty_ring,
                    vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
        else
                return kvm_arch_vcpu_fault(vcpu, vmf);
        /* Take a reference on the page before handing it to the fault core. */
        get_page(page);
        vmf->page = page;
        return 0;
}

/* VMA operations installed by kvm_vcpu_mmap(); faults go to kvm_vcpu_fault(). */
static const struct vm_operations_struct kvm_vcpu_vm_ops = {
        .fault = kvm_vcpu_fault,
};

/*
 * mmap() handler for a vCPU file descriptor.
 *
 * A mapping whose first or last page lies in the dirty ring window must be
 * shared and non-executable, otherwise -EINVAL is returned.  On success the
 * VMA is wired to kvm_vcpu_vm_ops and 0 is returned.
 */
static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct kvm_vcpu *vcpu = file->private_data;
        unsigned long last_pgoff = vma->vm_pgoff + vma_pages(vma) - 1;

        if (kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) ||
            kvm_page_in_dirty_ring(vcpu->kvm, last_pgoff)) {
                if ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))
                        return -EINVAL;
        }

        vma->vm_ops = &kvm_vcpu_vm_ops;

        return 0;
}

/*
 * Release handler for a vCPU file descriptor: drops the VM reference that
 * kvm_vm_ioctl_create_vcpu() took via kvm_get_kvm() before creating the fd.
 * Always returns 0.
 */
static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
        struct kvm_vcpu *vcpu = filp->private_data;

        kvm_put_kvm(vcpu->kvm);
        return 0;
}

/*
 * File operations backing the vCPU file descriptor handed out by
 * create_vcpu_fd().  KVM_COMPAT() is defined elsewhere; presumably it wires
 * kvm_vcpu_compat_ioctl() as the compat ioctl handler.
 */
static struct file_operations kvm_vcpu_fops = {
        .release        = kvm_vcpu_release,
        .unlocked_ioctl = kvm_vcpu_ioctl,
        .mmap           = kvm_vcpu_mmap,
        .llseek                = noop_llseek,
        KVM_COMPAT(kvm_vcpu_compat_ioctl),
};

/*
 * Allocates an anonymous-inode file descriptor for the vcpu, named
 * "kvm-vcpu:<vcpu_id>" and backed by kvm_vcpu_fops.
 *
 * Returns whatever anon_inode_getfd() returns.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
        char name[8 + 1 + ITOA_MAX_LEN + 1];
        int fd;

        snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id);

        fd = anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC);

        return fd;
}

#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
/*
 * debugfs "get" callback for the per-vCPU "pid" file.
 *
 * @data: the struct kvm_vcpu the file was created for.
 * @val:  receives pid_nr() of vcpu->pid, sampled under the pid read lock.
 *
 * Always returns 0.
 */
static int vcpu_get_pid(void *data, u64 *val)
{
        struct kvm_vcpu *vcpu = data;

        read_lock(&vcpu->pid_lock);
        *val = pid_nr(vcpu->pid);
        read_unlock(&vcpu->pid_lock);
        return 0;
}

/* Read-only attribute (no setter) that prints the value as "%llu\n". */
DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");

/*
 * Create the "vcpu<id>" debugfs directory under the VM's debugfs directory,
 * populate it with the read-only "pid" file and let the architecture add its
 * own entries.  Does nothing if debugfs is not initialized.
 */
static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
{
        char dir_name[ITOA_MAX_LEN * 2];
        struct dentry *vcpu_dir;

        if (debugfs_initialized()) {
                snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id);

                vcpu_dir = debugfs_create_dir(dir_name,
                                              vcpu->kvm->debugfs_dentry);
                debugfs_create_file("pid", 0444, vcpu_dir, vcpu,
                                    &vcpu_get_pid_fops);

                kvm_arch_create_vcpu_debugfs(vcpu, vcpu_dir);
        }
}
#endif

/*
 * Create one vCPU with the userspace-chosen @id for @kvm.
 *
 * On success the vCPU has been inserted into kvm->vcpu_array, counted in
 * kvm->online_vcpus, and the return value is the new vCPU file descriptor
 * (the result of create_vcpu_fd()).
 *
 * Returns a negative errno on failure:
 *   -EINVAL  @id is >= KVM_MAX_VCPU_IDS, or kvm->max_vcpus is already reached
 *   -ENOMEM  the vCPU structure or its run page could not be allocated
 *   -EEXIST  a vCPU with @id already exists
 *   other    propagated from the arch hooks, kvm_dirty_ring_alloc(),
 *            xa_insert() or create_vcpu_fd()
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id)
{
        int r;
        struct kvm_vcpu *vcpu;
        struct page *page;

        /*
         * KVM tracks vCPU IDs as 'int', be kind to userspace and reject
         * too-large values instead of silently truncating.
         *
         * Ensure KVM_MAX_VCPU_IDS isn't pushed above INT_MAX without first
         * changing the storage type (at the very least, IDs should be tracked
         * as unsigned ints).
         */
        BUILD_BUG_ON(KVM_MAX_VCPU_IDS > INT_MAX);
        if (id >= KVM_MAX_VCPU_IDS)
                return -EINVAL;

        /*
         * Reserve a slot in created_vcpus under kvm->lock, then drop the lock
         * for the allocations and arch setup below.
         */
        mutex_lock(&kvm->lock);
        if (kvm->created_vcpus >= kvm->max_vcpus) {
                mutex_unlock(&kvm->lock);
                return -EINVAL;
        }

        r = kvm_arch_vcpu_precreate(kvm, id);
        if (r) {
                mutex_unlock(&kvm->lock);
                return r;
        }

        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);

        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vcpu) {
                r = -ENOMEM;
                goto vcpu_decrement;
        }

        /* struct kvm_run lives in a single zeroed page. */
        BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
        page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!page) {
                r = -ENOMEM;
                goto vcpu_free;
        }
        vcpu->run = page_address(page);

        kvm_vcpu_init(vcpu, kvm, id);

        r = kvm_arch_vcpu_create(vcpu);
        if (r)
                goto vcpu_free_run_page;

        if (kvm->dirty_ring_size) {
                r = kvm_dirty_ring_alloc(&vcpu->dirty_ring,
                                         id, kvm->dirty_ring_size);
                if (r)
                        goto arch_vcpu_destroy;
        }

        mutex_lock(&kvm->lock);

        /* The ID uniqueness check must be done under kvm->lock. */
        if (kvm_get_vcpu_by_id(kvm, id)) {
                r = -EEXIST;
                goto unlock_vcpu_destroy;
        }

        /* The new vCPU takes the next free index in vcpu_array. */
        vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
        r = xa_insert(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, GFP_KERNEL_ACCOUNT);
        WARN_ON_ONCE(r == -EBUSY);
        if (r)
                goto unlock_vcpu_destroy;

        /*
         * Now it's all set up, let userspace reach it.  Grab the vCPU's mutex
         * so that userspace can't invoke vCPU ioctl()s until the vCPU is fully
         * visible (per online_vcpus), e.g. so that KVM doesn't get tricked
         * into a NULL-pointer dereference because KVM thinks the _current_
         * vCPU doesn't exist.  As a bonus, taking vcpu->mutex ensures lockdep
         * knows it's taken *inside* kvm->lock.
         */
        mutex_lock(&vcpu->mutex);
        /* Reference dropped by kvm_vcpu_release() when the fd is closed. */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0)
                goto kvm_put_xa_erase;

        /*
         * Pairs with smp_rmb() in kvm_get_vcpu.  Store the vcpu
         * pointer before kvm->online_vcpu's incremented value.
         */
        smp_wmb();
        atomic_inc(&kvm->online_vcpus);
        mutex_unlock(&vcpu->mutex);

        mutex_unlock(&kvm->lock);
        kvm_arch_vcpu_postcreate(vcpu);
        kvm_create_vcpu_debugfs(vcpu);
        return r;

        /* Error unwinding: each label undoes one setup step, in reverse order. */
kvm_put_xa_erase:
        mutex_unlock(&vcpu->mutex);
        kvm_put_kvm_no_destroy(kvm);
        xa_erase(&kvm->vcpu_array, vcpu->vcpu_idx);
unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
        kvm_dirty_ring_free(&vcpu->dirty_ring);
arch_vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
vcpu_free_run_page:
        free_page((unsigned long)vcpu->run);
vcpu_free:
        kmem_cache_free(kvm_vcpu_cache, vcpu);
vcpu_decrement:
        /* Give back the slot reserved in created_vcpus. */
        mutex_lock(&kvm->lock);
        kvm->created_vcpus--;
        mutex_unlock(&kvm->lock);
        return r;
}

/*
 * Install or clear the signal mask stored in @vcpu.
 *
 * A NULL @sigset deactivates the mask.  Otherwise SIGKILL and SIGSTOP are
 * removed from @sigset (which is modified in place) before it is copied into
 * the vCPU and activated.  Always returns 0.
 */
static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
        if (!sigset) {
                vcpu->sigset_active = 0;
                return 0;
        }

        sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
        vcpu->sigset_active = 1;
        vcpu->sigset = *sigset;

        return 0;
}

/*
 * read() handler for the vCPU stats file descriptor.  Forwards to
 * kvm_stats_read() with the vCPU's stats id, the vCPU stats header and
 * descriptors, and the vCPU's stat block; returns its result.
 */
static ssize_t kvm_vcpu_stats_read(struct file *file, char __user *user_buffer,
                              size_t size, loff_t *offset)
{
        struct kvm_vcpu *vcpu = file->private_data;

        return kvm_stats_read(vcpu->stats_id, &kvm_vcpu_stats_header,
                        &kvm_vcpu_stats_desc[0], &vcpu->stat,
                        sizeof(vcpu->stat), user_buffer, size, offset);
}

/*
 * Release handler for the vCPU stats file descriptor: drops the VM reference
 * taken in kvm_vcpu_ioctl_get_stats_fd().  Always returns 0.
 */
static int kvm_vcpu_stats_release(struct inode *inode, struct file *file)
{
        struct kvm_vcpu *vcpu = file->private_data;

        kvm_put_kvm(vcpu->kvm);
        return 0;
}

/* File operations for the fd created by kvm_vcpu_ioctl_get_stats_fd(). */
static const struct file_operations kvm_vcpu_stats_fops = {
        .owner = THIS_MODULE,
        .read = kvm_vcpu_stats_read,
        .release = kvm_vcpu_stats_release,
        .llseek = noop_llseek,
};

/*
 * Create a read-only stats file descriptor for @vcpu, named
 * "kvm-vcpu-stats:<vcpu_id>".
 *
 * On success a VM reference is taken (released by kvm_vcpu_stats_release())
 * and the new fd is returned.  On failure the reserved fd number is given
 * back and a negative errno is returned.
 */
static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
{
        char name[15 + ITOA_MAX_LEN + 1];
        struct file *stats_file;
        int fd;

        snprintf(name, sizeof(name), "kvm-vcpu-stats:%d", vcpu->vcpu_id);

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0)
                return fd;

        stats_file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu,
                                              O_RDONLY, FMODE_PREAD);
        if (!IS_ERR(stats_file)) {
                kvm_get_kvm(vcpu->kvm);
                fd_install(fd, stats_file);
                return fd;
        }

        put_unused_fd(fd);

        return PTR_ERR(stats_file);
}

#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
/*
 * Pre-fault the guest physical range described by @range for @vcpu.
 *
 * @range is updated in place as progress is made: gpa is advanced and size is
 * reduced by the amount each kvm_arch_vcpu_pre_fault_memory() call reports.
 *
 * Returns -EINVAL for non-zero flags, an unaligned gpa/size, or an empty or
 * wrapping range.  Otherwise returns 0 if any progress was made, or the
 * result of the first (failed) step if none was.
 */
static int kvm_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu,
                                     struct kvm_pre_fault_memory *range)
{
        u64 requested;
        long r;
        int srcu_idx;

        if (range->flags)
                return -EINVAL;

        if (!PAGE_ALIGNED(range->gpa) || !PAGE_ALIGNED(range->size))
                return -EINVAL;

        /* Catches both a zero-sized range and one that wraps around. */
        if (range->gpa + range->size <= range->gpa)
                return -EINVAL;

        vcpu_load(vcpu);
        srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

        requested = range->size;
        for (;;) {
                if (signal_pending(current)) {
                        r = -EINTR;
                        break;
                }

                r = kvm_arch_vcpu_pre_fault_memory(vcpu, range);

                /* Zero progress and -EIO are unexpected; warn and stop. */
                if (WARN_ON_ONCE(r == 0 || r == -EIO))
                        break;

                if (r < 0)
                        break;

                range->size -= r;
                range->gpa += r;
                cond_resched();

                if (!range->size)
                        break;
        }

        srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
        vcpu_put(vcpu);

        /* Return success if at least one page was mapped successfully.  */
        if (requested != range->size)
                return 0;

        return r;
}
#endif

/*
 * Make sure @vcpu is fully online before a vCPU ioctl() is processed.
 *
 * Returns 0 once the vCPU is online, -EINTR if the wait on the vCPU's mutex
 * was killed, or -EIO (with a one-time warning) if the vCPU still cannot be
 * looked up after the wait.
 */
static int kvm_wait_for_vcpu_online(struct kvm_vcpu *vcpu)
{
        /*
         * In practice, this happy path will always be taken, as a well-behaved
         * VMM will never invoke a vCPU ioctl() before KVM_CREATE_VCPU returns.
         */
        if (likely(vcpu->vcpu_idx < atomic_read(&vcpu->kvm->online_vcpus)))
                return 0;

        /*
         * kvm_vm_ioctl_create_vcpu() holds the vCPU's mutex until the vCPU is
         * fully online, so taking and immediately dropping the mutex waits
         * for vCPU creation to complete.
         */
        if (mutex_lock_killable(&vcpu->mutex))
                return -EINTR;

        mutex_unlock(&vcpu->mutex);

        return WARN_ON_ONCE(!kvm_get_vcpu(vcpu->kvm, vcpu->vcpu_idx)) ? -EIO : 0;
}

/*
 * ioctl() entry point for vCPU file descriptors.
 *
 * Generic vCPU ioctls are handled in the switch below with vcpu->mutex held;
 * anything not recognized is forwarded to kvm_arch_vcpu_ioctl().  Ioctls the
 * architecture declares asynchronous are handled before the mutex is taken.
 *
 * Returns the result of the selected handler, or:
 *   -EIO     the caller's mm is not the VM's mm, or the VM is marked dead
 *   -EINVAL  the ioctl type is not KVMIO
 *   -EINTR   waiting for vcpu->mutex was killed
 *
 * @fpu and @kvm_sregs are function-scoped so that the common exit path can
 * free them regardless of how the corresponding case was left.
 */
static long kvm_vcpu_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;
        struct kvm_fpu *fpu = NULL;
        struct kvm_sregs *kvm_sregs = NULL;

        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
                return -EIO;

        if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
                return -EINVAL;

        /*
         * Wait for the vCPU to be online before handling the ioctl(), as KVM
         * assumes the vCPU is reachable via vcpu_array, i.e. may dereference
         * a NULL pointer if userspace invokes an ioctl() before KVM is ready.
         */
        r = kvm_wait_for_vcpu_online(vcpu);
        if (r)
                return r;

        /*
         * Some architectures have vcpu ioctls that are asynchronous to vcpu
         * execution; mutex_lock() would break them.
         */
        r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
        if (r != -ENOIOCTLCMD)
                return r;

        if (mutex_lock_killable(&vcpu->mutex))
                return -EINTR;
        switch (ioctl) {
        case KVM_RUN: {
                struct pid *oldpid;
                /* KVM_RUN takes no argument. */
                r = -EINVAL;
                if (arg)
                        goto out;

                /*
                 * Note, vcpu->pid is primarily protected by vcpu->mutex. The
                 * dedicated r/w lock allows other tasks, e.g. other vCPUs, to
                 * read vcpu->pid while this vCPU is in KVM_RUN, e.g. to yield
                 * directly to this vCPU
                 */
                oldpid = vcpu->pid;
                if (unlikely(oldpid != task_pid(current))) {
                        /* The thread running this VCPU changed. */
                        struct pid *newpid;

                        r = kvm_arch_vcpu_run_pid_change(vcpu);
                        if (r)
                                break;

                        newpid = get_task_pid(current, PIDTYPE_PID);
                        write_lock(&vcpu->pid_lock);
                        vcpu->pid = newpid;
                        write_unlock(&vcpu->pid_lock);

                        put_pid(oldpid);
                }
                /* wants_to_run is only set for the duration of the run call. */
                vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe);
                r = kvm_arch_vcpu_ioctl_run(vcpu);
                vcpu->wants_to_run = false;

                trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
                break;
        }
        case KVM_GET_REGS: {
                struct kvm_regs *kvm_regs;

                r = -ENOMEM;
                kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
                if (!kvm_regs)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
                if (r)
                        goto out_free1;
                r = -EFAULT;
                if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
                        goto out_free1;
                r = 0;
out_free1:
                kfree(kvm_regs);
                break;
        }
        case KVM_SET_REGS: {
                struct kvm_regs *kvm_regs;

                kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
                if (IS_ERR(kvm_regs)) {
                        r = PTR_ERR(kvm_regs);
                        goto out;
                }
                r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
                kfree(kvm_regs);
                break;
        }
        case KVM_GET_SREGS: {
                /* kvm_sregs is freed on the common exit path. */
                kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
                r = -ENOMEM;
                if (!kvm_sregs)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_SREGS: {
                kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
                if (IS_ERR(kvm_sregs)) {
                        r = PTR_ERR(kvm_sregs);
                        /* Don't hand an ERR_PTR to kfree() at "out". */
                        kvm_sregs = NULL;
                        goto out;
                }
                r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
                break;
        }
        case KVM_GET_MP_STATE: {
                struct kvm_mp_state mp_state;

                r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, &mp_state, sizeof(mp_state)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_MP_STATE: {
                struct kvm_mp_state mp_state;

                r = -EFAULT;
                if (copy_from_user(&mp_state, argp, sizeof(mp_state)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
                break;
        }
        case KVM_TRANSLATE: {
                struct kvm_translation tr;

                /* The structure is both input and output: copy in, then out. */
                r = -EFAULT;
                if (copy_from_user(&tr, argp, sizeof(tr)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, &tr, sizeof(tr)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_GUEST_DEBUG: {
                struct kvm_guest_debug dbg;

                r = -EFAULT;
                if (copy_from_user(&dbg, argp, sizeof(dbg)))
                        goto out;
                r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
                break;
        }
        case KVM_SET_SIGNAL_MASK: {
                struct kvm_signal_mask __user *sigmask_arg = argp;
                struct kvm_signal_mask kvm_sigmask;
                sigset_t sigset, *p;

                /* A NULL argument clears the mask (p stays NULL). */
                p = NULL;
                if (argp) {
                        r = -EFAULT;
                        if (copy_from_user(&kvm_sigmask, argp,
                                           sizeof(kvm_sigmask)))
                                goto out;
                        r = -EINVAL;
                        if (kvm_sigmask.len != sizeof(sigset))
                                goto out;
                        r = -EFAULT;
                        if (copy_from_user(&sigset, sigmask_arg->sigset,
                                           sizeof(sigset)))
                                goto out;
                        p = &sigset;
                }
                r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
                break;
        }
        case KVM_GET_FPU: {
                /* fpu is freed on the common exit path. */
                fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
                r = -ENOMEM;
                if (!fpu)
                        goto out;
                r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
                if (r)
                        goto out;
                r = -EFAULT;
                if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
                        goto out;
                r = 0;
                break;
        }
        case KVM_SET_FPU: {
                fpu = memdup_user(argp, sizeof(*fpu));
                if (IS_ERR(fpu)) {
                        r = PTR_ERR(fpu);
                        /* Don't hand an ERR_PTR to kfree() at "out". */
                        fpu = NULL;
                        goto out;
                }
                r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
                break;
        }
        case KVM_GET_STATS_FD: {
                r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
                break;
        }
#ifdef CONFIG_KVM_GENERIC_PRE_FAULT_MEMORY
        case KVM_PRE_FAULT_MEMORY: {
                struct kvm_pre_fault_memory range;

                r = -EFAULT;
                if (copy_from_user(&range, argp, sizeof(range)))
                        break;
                r = kvm_vcpu_pre_fault_memory(vcpu, &range);
                /* Pass back leftover range. */
                if (copy_to_user(argp, &range, sizeof(range)))
                        r = -EFAULT;
                break;
        }
#endif
        default:
                r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
        }
out:
        /* Common exit: drop the mutex and free any buffers still owned here. */
        mutex_unlock(&vcpu->mutex);
        kfree(fpu);
        kfree(kvm_sregs);
        return r;
}

#ifdef CONFIG_KVM_COMPAT
/*
 * Compat ioctl() entry point for vCPU file descriptors.
 *
 * Only KVM_SET_SIGNAL_MASK is handled here, because the signal set has to be
 * read in its compat layout; every other ioctl is forwarded unchanged to
 * kvm_vcpu_ioctl().
 *
 * Returns -EIO if the caller's mm is not the VM's mm or the VM is dead,
 * -EFAULT/-EINVAL for a bad signal mask argument, otherwise the handler's
 * result.
 */
static long kvm_vcpu_compat_ioctl(struct file *filp,
                                  unsigned int ioctl, unsigned long arg)
{
        struct kvm_vcpu *vcpu = filp->private_data;
        void __user *argp = compat_ptr(arg);
        struct kvm_signal_mask __user *sigmask_arg = argp;
        struct kvm_signal_mask kvm_sigmask;
        sigset_t sigset;
        int r;

        if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead)
                return -EIO;

        if (ioctl != KVM_SET_SIGNAL_MASK) {
                r = kvm_vcpu_ioctl(filp, ioctl, arg);
                return r;
        }

        /* A NULL argument clears the signal mask. */
        if (!argp)
                return kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);

        if (copy_from_user(&kvm_sigmask, argp, sizeof(kvm_sigmask)))
                return -EFAULT;

        if (kvm_sigmask.len != sizeof(compat_sigset_t))
                return -EINVAL;

        if (get_compat_sigset(&sigset,
                              (compat_sigset_t __user *)sigmask_arg->sigset))
                return -EFAULT;

        return kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
}
#endif

/*
 * mmap() handler for device file descriptors.  Delegates to the device's
 * mmap op when one is provided, otherwise fails with -ENODEV.
 */
static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma)
{
        struct kvm_device *dev = filp->private_data;

        if (!dev->ops->mmap)
                return -ENODEV;

        return dev->ops->mmap(dev, vma);
}

static int kvm_device_ioctl_attr(struct kvm_device *dev,
                                 int (*accessor)(struct kvm_device *dev,
                                                 struct kvm_device_attr *attr),
                                 unsigned long arg)
{
        struct kvm_device_attr attr;

        if (!accessor)
                return -EPERM;

        if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
                return -EFAULT;

        return accessor(dev, &attr);
}

/*
 * ioctl() entry point for device file descriptors.
 *
 * The three attribute ioctls are routed through kvm_device_ioctl_attr() with
 * the matching device op; anything else goes to the device's own ioctl op if
 * it has one.
 *
 * Returns -EIO if the caller's mm is not the VM's mm or the VM is dead,
 * -ENOTTY for an unknown ioctl on a device without an ioctl op, otherwise the
 * handler's result.
 */
static long kvm_device_ioctl(struct file *filp, unsigned int ioctl,
                             unsigned long arg)
{
        struct kvm_device *dev = filp->private_data;

        if (dev->kvm->mm != current->mm || dev->kvm->vm_dead)
                return -EIO;

        if (ioctl == KVM_SET_DEVICE_ATTR)
                return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg);

        if (ioctl == KVM_GET_DEVICE_ATTR)
                return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg);

        if (ioctl == KVM_HAS_DEVICE_ATTR)
                return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg);

        if (!dev->ops->ioctl)
                return -ENOTTY;

        return dev->ops->ioctl(dev, ioctl, arg);
}

/*
 * Release handler for device file descriptors.
 *
 * If the device provides a release op, the device is unlinked from the VM's
 * device list under kvm->lock, and synchronize_rcu() is called before the
 * release op runs.  The VM reference taken when the fd was created is dropped
 * in all cases.  Always returns 0.
 */
static int kvm_device_release(struct inode *inode, struct file *filp)
{
        struct kvm_device *dev = filp->private_data;
        /* Cache the VM pointer; dev is not touched after ops->release(). */
        struct kvm *kvm = dev->kvm;

        if (dev->ops->release) {
                mutex_lock(&kvm->lock);
                list_del_rcu(&dev->vm_node);
                synchronize_rcu();
                dev->ops->release(dev);
                mutex_unlock(&kvm->lock);
        }

        kvm_put_kvm(kvm);
        return 0;
}

/*
 * File operations for device file descriptors created by
 * kvm_ioctl_create_device().  Also used by kvm_device_from_filp() to recognize
 * device files.  KVM_COMPAT() is defined elsewhere; presumably it installs
 * kvm_device_ioctl() as the compat ioctl handler as well.
 */
static struct file_operations kvm_device_fops = {
        .unlocked_ioctl = kvm_device_ioctl,
        .release = kvm_device_release,
        KVM_COMPAT(kvm_device_ioctl),
        .mmap = kvm_device_mmap,
};

/*
 * Return the kvm_device behind @filp, or NULL if @filp is not a KVM device
 * file (i.e. its file operations are not kvm_device_fops).
 */
struct kvm_device *kvm_device_from_filp(struct file *filp)
{
        return filp->f_op == &kvm_device_fops ? filp->private_data : NULL;
}

/*
 * Device ops indexed by device type.  A NULL entry means no ops are
 * registered for that type.  Entries are added by kvm_register_device_ops()
 * and cleared by kvm_unregister_device_ops(); the MPIC entries are populated
 * statically when CONFIG_KVM_MPIC is enabled.
 */
static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
#ifdef CONFIG_KVM_MPIC
        [KVM_DEV_TYPE_FSL_MPIC_20]        = &kvm_mpic_ops,
        [KVM_DEV_TYPE_FSL_MPIC_42]        = &kvm_mpic_ops,
#endif
};

/*
 * Register @ops as the device ops for device type @type.
 *
 * Returns 0 on success, -ENOSPC if @type is outside the ops table, or -EEXIST
 * if ops are already registered for @type.
 */
int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
{
        if (type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENOSPC;

        if (kvm_device_ops_table[type])
                return -EEXIST;

        kvm_device_ops_table[type] = ops;

        return 0;
}

/*
 * Remove the device ops registered for device type @type, if any.
 *
 * An out-of-range @type is ignored.  kvm_register_device_ops() refuses such
 * types, so nothing can be registered there, and indexing the table with one
 * would access memory outside the array.
 */
void kvm_unregister_device_ops(u32 type)
{
        if (type >= ARRAY_SIZE(kvm_device_ops_table))
                return;

        kvm_device_ops_table[type] = NULL;
}

/*
 * Create an in-kernel device for @kvm as described by @cd.
 *
 * With KVM_CREATE_DEVICE_TEST set in cd->flags, only checks that ops are
 * registered for cd->type and returns 0 without creating anything.
 * Otherwise allocates the device, runs ops->create() under kvm->lock, links
 * the device into kvm->devices, and exposes it through a new anonymous-inode
 * fd that is stored in cd->fd.
 *
 * Returns 0 on success, -ENODEV for an unknown or unregistered type,
 * -ENOMEM on allocation failure, or the negative error from ops->create()
 * or anon_inode_getfd().
 */
static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
{
        const struct kvm_device_ops *ops;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int type;
        int ret;

        if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENODEV;

        /* Bounds were checked above; also clamp the index for speculation. */
        type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
        ops = kvm_device_ops_table[type];
        if (ops == NULL)
                return -ENODEV;

        if (test)
                return 0;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
        if (!dev)
                return -ENOMEM;

        dev->ops = ops;
        dev->kvm = kvm;

        mutex_lock(&kvm->lock);
        ret = ops->create(dev, type);
        if (ret < 0) {
                mutex_unlock(&kvm->lock);
                kfree(dev);
                return ret;
        }
        list_add_rcu(&dev->vm_node, &kvm->devices);
        mutex_unlock(&kvm->lock);

        /* Optional hook, invoked after kvm->lock has been dropped. */
        if (ops->init)
                ops->init(dev);

        /* Reference taken for the device fd; kvm_device_release() drops it. */
        kvm_get_kvm(kvm);
        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
        if (ret < 0) {
                /*
                 * No fd was created: undo the reference and the list
                 * insertion, then let the device type tear the device down.
                 * NOTE(review): dev is not kfree()d here; presumably
                 * ->release()/->destroy() own freeing it -- confirm.
                 */
                kvm_put_kvm_no_destroy(kvm);
                mutex_lock(&kvm->lock);
                list_del_rcu(&dev->vm_node);
                synchronize_rcu();
                if (ops->release)
                        ops->release(dev);
                mutex_unlock(&kvm->lock);
                if (ops->destroy)
                        ops->destroy(dev);
                return ret;
        }

        cd->fd = ret;
        return 0;
}

/*
 * Answer KVM_CHECK_EXTENSION for capabilities handled by generic code.
 *
 * @kvm may be NULL: kvm_dev_ioctl() passes NULL when the query arrives on
 * the /dev/kvm fd rather than on a VM fd.  Returns 1 (or a capability
 * specific value) for capabilities handled here, 0 for the dirty-ring
 * capabilities whose config option is not enabled, and defers everything
 * else to kvm_vm_ioctl_check_extension().
 */
static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
{
        switch (arg) {
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_USER_MEMORY2:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
        case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
        case KVM_CAP_SIGNAL_MSI:
#endif
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        case KVM_CAP_IRQFD:
#endif
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
        case KVM_CAP_CHECK_EXTENSION_VM:
        case KVM_CAP_ENABLE_CAP_VM:
        case KVM_CAP_HALT_POLL:
                return 1;
#ifdef CONFIG_KVM_MMIO
        case KVM_CAP_COALESCED_MMIO:
                return KVM_COALESCED_MMIO_PAGE_OFFSET;
        case KVM_CAP_COALESCED_PIO:
                return 1;
#endif
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
                return KVM_DIRTY_LOG_MANUAL_CAPS;
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
#endif
#if KVM_MAX_NR_ADDRESS_SPACES > 1
        case KVM_CAP_MULTI_ADDRESS_SPACE:
                /* Per-VM answer when a VM is given, global maximum otherwise. */
                if (kvm)
                        return kvm_arch_nr_memslot_as_ids(kvm);
                return KVM_MAX_NR_ADDRESS_SPACES;
#endif
        case KVM_CAP_NR_MEMSLOTS:
                return KVM_USER_MEM_SLOTS;
        case KVM_CAP_DIRTY_LOG_RING:
                /* Reports the maximum ring size in bytes, or 0 if unsupported. */
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
                return 0;
#endif
        case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
                return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
#else
                return 0;
#endif
#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
        case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
#endif
        case KVM_CAP_BINARY_STATS_FD:
        case KVM_CAP_SYSTEM_EVENT_DATA:
        case KVM_CAP_DEVICE_CTRL:
                return 1;
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        case KVM_CAP_MEMORY_ATTRIBUTES:
                return kvm_supported_mem_attributes(kvm);
#endif
#ifdef CONFIG_KVM_PRIVATE_MEM
        case KVM_CAP_GUEST_MEMFD:
                return !kvm || kvm_arch_has_private_mem(kvm);
#endif
        default:
                break;
        }
        /* Not a generic capability: let the architecture code decide. */
        return kvm_vm_ioctl_check_extension(kvm, arg);
}

/*
 * Enable the dirty ring for @kvm with a per-ring size of @size bytes.
 *
 * @size must be a non-zero power of two, at least one page, large enough to
 * hold the reserved entries, and no larger than KVM_DIRTY_RING_MAX_ENTRIES
 * entries.  The size can be set only once and only before any vCPU has been
 * created; both conditions are checked under kvm->lock so that concurrent
 * callers cannot each pass the "not yet set" test and overwrite one another.
 *
 * Returns 0 on success, -E2BIG if @size exceeds the maximum, and -EINVAL
 * for every other rejection.
 */
static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
{
        int r;

        if (!KVM_DIRTY_LOG_PAGE_OFFSET)
                return -EINVAL;

        /* the size should be power of 2 */
        if (!size || (size & (size - 1)))
                return -EINVAL;

        /* Should be bigger to keep the reserved entries, or a page */
        if (size < kvm_dirty_ring_get_rsvd_entries() *
            sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE)
                return -EINVAL;

        if (size > KVM_DIRTY_RING_MAX_ENTRIES *
            sizeof(struct kvm_dirty_gfn))
                return -E2BIG;

        mutex_lock(&kvm->lock);

        if (kvm->dirty_ring_size) {
                /* We only allow it to set once */
                r = -EINVAL;
        } else if (kvm->created_vcpus) {
                /* We don't allow to change this value after vcpu created */
                r = -EINVAL;
        } else {
                kvm->dirty_ring_size = size;
                r = 0;
        }

        mutex_unlock(&kvm->lock);
        return r;
}

static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
{
        unsigned long i;
        struct kvm_vcpu *vcpu;
        int cleared = 0;

        if (!kvm->dirty_ring_size)
                return -EINVAL;

        mutex_lock(&kvm->slots_lock);

        kvm_for_each_vcpu(i, vcpu, kvm)
                cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring);

        mutex_unlock(&kvm->slots_lock);

        if (cleared)
                kvm_flush_remote_tlbs(kvm);

        return cleared;
}

/*
 * Weak default for enabling a VM capability that generic code does not
 * handle (see the default case of kvm_vm_ioctl_enable_cap_generic()).
 * Rejects everything with -EINVAL; being weak, it can be replaced by a
 * strong definition elsewhere.
 */
int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
                                                  struct kvm_enable_cap *cap)
{
        return -EINVAL;
}

/*
 * Return true if no address space of @kvm contains any memslot.
 *
 * The caller must hold kvm->slots_lock (asserted via lockdep).
 */
bool kvm_are_all_memslots_empty(struct kvm *kvm)
{
        int as_id = 0;

        lockdep_assert_held(&kvm->slots_lock);

        while (as_id < kvm_arch_nr_memslot_as_ids(kvm)) {
                if (!kvm_memslots_empty(__kvm_memslots(kvm, as_id)))
                        return false;
                as_id++;
        }

        return true;
}
EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);

/*
 * Handle KVM_ENABLE_CAP on a VM fd for capabilities implemented by generic
 * code; anything not listed here is passed to kvm_vm_ioctl_enable_cap().
 *
 * Returns 0 on success or a negative errno (-EINVAL for invalid flags or
 * arguments, or whatever the delegated helper returns).
 */
static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
                                           struct kvm_enable_cap *cap)
{
        switch (cap->cap) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
                u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;

                /* The remaining option bits are only valid together with ENABLE. */
                if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
                        allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;

                if (cap->flags || (cap->args[0] & ~allowed_options))
                        return -EINVAL;
                kvm->manual_dirty_log_protect = cap->args[0];
                return 0;
        }
#endif
        case KVM_CAP_HALT_POLL: {
                /* args[0] must fit in an unsigned int. */
                if (cap->flags || cap->args[0] != (unsigned int)cap->args[0])
                        return -EINVAL;

                kvm->max_halt_poll_ns = cap->args[0];

                /*
                 * Ensure kvm->override_halt_poll_ns does not become visible
                 * before kvm->max_halt_poll_ns.
                 *
                 * Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
                 */
                smp_wmb();
                kvm->override_halt_poll_ns = true;

                return 0;
        }
        case KVM_CAP_DIRTY_LOG_RING:
        case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
                /* Refuse ring flavours that check-extension reports as 0. */
                if (!kvm_vm_ioctl_check_extension_generic(kvm, cap->cap))
                        return -EINVAL;

                return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]);
        case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
                int r = -EINVAL;

                /* Requires config support and an already-enabled dirty ring. */
                if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) ||
                    !kvm->dirty_ring_size || cap->flags)
                        return r;

                mutex_lock(&kvm->slots_lock);

                /*
                 * For simplicity, allow enabling ring+bitmap if and only if
                 * there are no memslots, e.g. to ensure all memslots allocate
                 * a bitmap after the capability is enabled.
                 */
                if (kvm_are_all_memslots_empty(kvm)) {
                        kvm->dirty_ring_with_bitmap = true;
                        r = 0;
                }

                mutex_unlock(&kvm->slots_lock);

                return r;
        }
        default:
                return kvm_vm_ioctl_enable_cap(kvm, cap);
        }
}

/*
 * ->read() handler for the VM binary stats fd: forwards to kvm_stats_read()
 * with the VM's stats id, the VM stats header/descriptors and kvm->stat.
 */
static ssize_t kvm_vm_stats_read(struct file *file, char __user *user_buffer,
                              size_t size, loff_t *offset)
{
        struct kvm *vm = file->private_data;

        return kvm_stats_read(vm->stats_id, &kvm_vm_stats_header,
                              &kvm_vm_stats_desc[0], &vm->stat,
                              sizeof(vm->stat), user_buffer, size, offset);
}

/*
 * ->release() handler for the VM stats fd: drops the VM reference that
 * kvm_vm_ioctl_get_stats_fd() took when the fd was created.
 */
static int kvm_vm_stats_release(struct inode *inode, struct file *file)
{
        kvm_put_kvm(file->private_data);

        return 0;
}

/* File operations for the fd returned by KVM_GET_STATS_FD on a VM. */
static const struct file_operations kvm_vm_stats_fops = {
        .owner = THIS_MODULE,
        .read = kvm_vm_stats_read,
        .release = kvm_vm_stats_release,
        .llseek = noop_llseek,
};

/*
 * Create a read-only file descriptor exposing the VM's binary statistics.
 *
 * On success a reference on @kvm is taken on behalf of the new file (it is
 * dropped in kvm_vm_stats_release()) and the fd number is returned.  On
 * failure the reserved fd is released and a negative errno is returned.
 */
static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
{
        struct file *stats_file;
        int stats_fd = get_unused_fd_flags(O_CLOEXEC);

        if (stats_fd < 0)
                return stats_fd;

        stats_file = anon_inode_getfile_fmode("kvm-vm-stats", &kvm_vm_stats_fops,
                                              kvm, O_RDONLY, FMODE_PREAD);
        if (!IS_ERR(stats_file)) {
                kvm_get_kvm(kvm);
                fd_install(stats_fd, stats_file);
                return stats_fd;
        }

        put_unused_fd(stats_fd);
        return PTR_ERR(stats_file);
}

/*
 * Build-time assertion that @field has the same offset and the same size in
 * struct kvm_userspace_memory_region and struct kvm_userspace_memory_region2.
 */
#define SANITY_CHECK_MEM_REGION_FIELD(field)                                        \
do {                                                                                \
        BUILD_BUG_ON(offsetof(struct kvm_userspace_memory_region, field) !=                \
                     offsetof(struct kvm_userspace_memory_region2, field));        \
        BUILD_BUG_ON(sizeof_field(struct kvm_userspace_memory_region, field) !=                \
                     sizeof_field(struct kvm_userspace_memory_region2, field));        \
} while (0)

/*
 * ioctl dispatcher for VM file descriptors.
 *
 * Rejects the call with -EIO when the caller's mm is not the mm that owns
 * the VM or when the VM has been marked dead.  Each handled ioctl copies its
 * argument structure from user space (-EFAULT on failure), calls the
 * matching helper and returns the helper's result.  ioctls not handled here
 * are forwarded to kvm_arch_vm_ioctl().
 */
static long kvm_vm_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        void __user *argp = (void __user *)arg;
        int r;

        if (kvm->mm != current->mm || kvm->vm_dead)
                return -EIO;
        switch (ioctl) {
        case KVM_CREATE_VCPU:
                r = kvm_vm_ioctl_create_vcpu(kvm, arg);
                break;
        case KVM_ENABLE_CAP: {
                struct kvm_enable_cap cap;

                r = -EFAULT;
                if (copy_from_user(&cap, argp, sizeof(cap)))
                        goto out;
                r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap);
                break;
        }
        case KVM_SET_USER_MEMORY_REGION2:
        case KVM_SET_USER_MEMORY_REGION: {
                struct kvm_userspace_memory_region2 mem;
                unsigned long size;

                /* Both ioctls share one handler; only the copied size differs. */
                if (ioctl == KVM_SET_USER_MEMORY_REGION) {
                        /*
                         * Fields beyond struct kvm_userspace_memory_region shouldn't be
                         * accessed, but avoid leaking kernel memory in case of a bug.
                         */
                        memset(&mem, 0, sizeof(mem));
                        size = sizeof(struct kvm_userspace_memory_region);
                } else {
                        size = sizeof(struct kvm_userspace_memory_region2);
                }

                /* Ensure the common parts of the two structs are identical. */
                SANITY_CHECK_MEM_REGION_FIELD(slot);
                SANITY_CHECK_MEM_REGION_FIELD(flags);
                SANITY_CHECK_MEM_REGION_FIELD(guest_phys_addr);
                SANITY_CHECK_MEM_REGION_FIELD(memory_size);
                SANITY_CHECK_MEM_REGION_FIELD(userspace_addr);

                r = -EFAULT;
                if (copy_from_user(&mem, argp, size))
                        goto out;

                /* The v1 ioctl accepts only the v1 flag set. */
                r = -EINVAL;
                if (ioctl == KVM_SET_USER_MEMORY_REGION &&
                    (mem.flags & ~KVM_SET_USER_MEMORY_REGION_V1_FLAGS))
                        goto out;

                r = kvm_vm_ioctl_set_memory_region(kvm, &mem);
                break;
        }
        case KVM_GET_DIRTY_LOG: {
                struct kvm_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
        }
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CLEAR_DIRTY_LOG: {
                struct kvm_clear_dirty_log log;

                r = -EFAULT;
                if (copy_from_user(&log, argp, sizeof(log)))
                        goto out;
                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
                break;
        }
#endif
#ifdef CONFIG_KVM_MMIO
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
                break;
        }
        case KVM_UNREGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;

                r = -EFAULT;
                if (copy_from_user(&zone, argp, sizeof(zone)))
                        goto out;
                r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
                break;
        }
#endif
        case KVM_IRQFD: {
                struct kvm_irqfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_irqfd(kvm, &data);
                break;
        }
        case KVM_IOEVENTFD: {
                struct kvm_ioeventfd data;

                r = -EFAULT;
                if (copy_from_user(&data, argp, sizeof(data)))
                        goto out;
                r = kvm_ioeventfd(kvm, &data);
                break;
        }
#ifdef CONFIG_HAVE_KVM_MSI
        case KVM_SIGNAL_MSI: {
                struct kvm_msi msi;

                r = -EFAULT;
                if (copy_from_user(&msi, argp, sizeof(msi)))
                        goto out;
                r = kvm_send_userspace_msi(kvm, &msi);
                break;
        }
#endif
#ifdef __KVM_HAVE_IRQ_LINE
        case KVM_IRQ_LINE_STATUS:
        case KVM_IRQ_LINE: {
                struct kvm_irq_level irq_event;

                r = -EFAULT;
                if (copy_from_user(&irq_event, argp, sizeof(irq_event)))
                        goto out;

                r = kvm_vm_ioctl_irq_line(kvm, &irq_event,
                                        ioctl == KVM_IRQ_LINE_STATUS);
                if (r)
                        goto out;

                /* Only the _STATUS variant copies the structure back out. */
                r = -EFAULT;
                if (ioctl == KVM_IRQ_LINE_STATUS) {
                        if (copy_to_user(argp, &irq_event, sizeof(irq_event)))
                                goto out;
                }

                r = 0;
                break;
        }
#endif
#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
                struct kvm_irq_routing __user *urouting;
                struct kvm_irq_routing_entry *entries = NULL;

                r = -EFAULT;
                if (copy_from_user(&routing, argp, sizeof(routing)))
                        goto out;
                r = -EINVAL;
                if (!kvm_arch_can_set_irq_routing(kvm))
                        goto out;
                if (routing.nr > KVM_MAX_IRQ_ROUTES)
                        goto out;
                if (routing.flags)
                        goto out;
                /* entries stays NULL when the table is empty. */
                if (routing.nr) {
                        urouting = argp;
                        entries = vmemdup_array_user(urouting->entries,
                                                     routing.nr, sizeof(*entries));
                        if (IS_ERR(entries)) {
                                r = PTR_ERR(entries);
                                goto out;
                        }
                }
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
                kvfree(entries);
                break;
        }
#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
        case KVM_SET_MEMORY_ATTRIBUTES: {
                struct kvm_memory_attributes attrs;

                r = -EFAULT;
                if (copy_from_user(&attrs, argp, sizeof(attrs)))
                        goto out;

                r = kvm_vm_ioctl_set_mem_attributes(kvm, &attrs);
                break;
        }
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
        case KVM_CREATE_DEVICE: {
                struct kvm_create_device cd;

                r = -EFAULT;
                if (copy_from_user(&cd, argp, sizeof(cd)))
                        goto out;

                r = kvm_ioctl_create_device(kvm, &cd);
                if (r)
                        goto out;

                /* Copy the structure back so user space sees cd.fd. */
                r = -EFAULT;
                if (copy_to_user(argp, &cd, sizeof(cd)))
                        goto out;

                r = 0;
                break;
        }
        case KVM_CHECK_EXTENSION:
                r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
                break;
        case KVM_RESET_DIRTY_RINGS:
                r = kvm_vm_ioctl_reset_dirty_pages(kvm);
                break;
        case KVM_GET_STATS_FD:
                r = kvm_vm_ioctl_get_stats_fd(kvm);
                break;
#ifdef CONFIG_KVM_PRIVATE_MEM
        case KVM_CREATE_GUEST_MEMFD: {
                struct kvm_create_guest_memfd guest_memfd;

                r = -EFAULT;
                if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
                        goto out;

                r = kvm_gmem_create(kvm, &guest_memfd);
                break;
        }
#endif
        default:
                r = kvm_arch_vm_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}

#ifdef CONFIG_KVM_COMPAT
/*
 * Layout of the KVM_GET_DIRTY_LOG argument as read from compat callers in
 * kvm_vm_compat_ioctl(); the bitmap pointer is a compat_uptr_t that is
 * converted with compat_ptr() before use.
 */
struct compat_kvm_dirty_log {
        __u32 slot;
        __u32 padding1;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};

/*
 * Layout of the KVM_CLEAR_DIRTY_LOG argument as read from compat callers in
 * kvm_vm_compat_ioctl(); the bitmap pointer is a compat_uptr_t that is
 * converted with compat_ptr() before use.
 */
struct compat_kvm_clear_dirty_log {
        __u32 slot;
        __u32 num_pages;
        __u64 first_page;
        union {
                compat_uptr_t dirty_bitmap; /* one bit per page */
                __u64 padding2;
        };
};

/*
 * Weak default for the arch compat VM ioctl hook.  Returning -ENOTTY tells
 * kvm_vm_compat_ioctl() to continue with the generic compat handling.
 */
long __weak kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
                                     unsigned long arg)
{
        return -ENOTTY;
}

static long kvm_vm_compat_ioctl(struct file *filp,
                           unsigned int ioctl, unsigned long arg)
{
        struct kvm *kvm = filp->private_data;
        int r;

        if (kvm->mm != current->mm || kvm->vm_dead)
                return -EIO;

        r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
        if (r != -ENOTTY)
                return r;

        switch (ioctl) {
#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
        case KVM_CLEAR_DIRTY_LOG: {
                struct compat_kvm_clear_dirty_log compat_log;
                struct kvm_clear_dirty_log log;

                if (copy_from_user(&compat_log, (void __user *)arg,
                                   sizeof(compat_log)))
                        return -EFAULT;
                log.slot         = compat_log.slot;
                log.num_pages         = compat_log.num_pages;
                log.first_page         = compat_log.first_page;
                log.padding2         = compat_log.padding2;
                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

                r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
                break;
        }
#endif
        case KVM_GET_DIRTY_LOG: {
                struct compat_kvm_dirty_log compat_log;
                struct kvm_dirty_log log;

                if (copy_from_user(&compat_log, (void __user *)arg,
                                   sizeof(compat_log)))
                        return -EFAULT;
                log.slot         = compat_log.slot;
                log.padding1         = compat_log.padding1;
                log.padding2         = compat_log.padding2;
                log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
        }
        default:
                r = kvm_vm_ioctl(filp, ioctl, arg);
        }
        return r;
}
#endif

/*
 * File operations for VM file descriptors.  The address of this table is
 * what file_is_kvm() compares against to identify a VM file.
 * KVM_COMPAT() is defined elsewhere; presumably it supplies the compat
 * ioctl hook -- confirm against its definition.
 */
static struct file_operations kvm_vm_fops = {
        .release        = kvm_vm_release,
        .unlocked_ioctl = kvm_vm_ioctl,
        .llseek                = noop_llseek,
        KVM_COMPAT(kvm_vm_compat_ioctl),
};

bool file_is_kvm(struct file *file)
{
        return file && file->f_op == &kvm_vm_fops;
}
EXPORT_SYMBOL_GPL(file_is_kvm);

/*
 * Create a new VM of the given @type and return a file descriptor for it.
 *
 * The fd number is reserved first because its decimal form is passed to
 * kvm_create_vm() as the fd name.  Returns the new fd on success or a
 * negative errno; on failure the reserved fd (and the VM, if it was already
 * created) is released.
 */
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
        char fdname[ITOA_MAX_LEN + 1];
        struct file *vm_file;
        struct kvm *vm;
        int vm_fd, err;

        vm_fd = get_unused_fd_flags(O_CLOEXEC);
        if (vm_fd < 0)
                return vm_fd;

        snprintf(fdname, sizeof(fdname), "%d", vm_fd);

        vm = kvm_create_vm(type, fdname);
        if (IS_ERR(vm)) {
                err = PTR_ERR(vm);
                goto err_put_fd;
        }

        vm_file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, vm, O_RDWR);
        if (IS_ERR(vm_file)) {
                err = PTR_ERR(vm_file);
                kvm_put_kvm(vm);
                goto err_put_fd;
        }

        /*
         * From here on the file owns the VM reference: file->f_op is set and
         * ->release() is kvm_vm_release(), so the final fput(file) performs
         * the kvm_put_kvm().  Do not call kvm_put_kvm() past this point.
         */
        kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, vm);

        fd_install(vm_fd, vm_file);
        return vm_fd;

err_put_fd:
        put_unused_fd(vm_fd);
        return err;
}

/*
 * ioctl dispatcher for the /dev/kvm character device.
 *
 * KVM_GET_API_VERSION and KVM_GET_VCPU_MMAP_SIZE require @arg to be zero and
 * fail with -EINVAL otherwise.  ioctls not handled here are forwarded to
 * kvm_arch_dev_ioctl().
 */
static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
        int r;

        switch (ioctl) {
        case KVM_GET_API_VERSION:
                if (arg)
                        return -EINVAL;
                return KVM_API_VERSION;
        case KVM_CREATE_VM:
                return kvm_dev_ioctl_create_vm(arg);
        case KVM_CHECK_EXTENSION:
                /* No VM in this context, hence the NULL kvm. */
                return kvm_vm_ioctl_check_extension_generic(NULL, arg);
        case KVM_GET_VCPU_MMAP_SIZE:
                if (arg)
                        return -EINVAL;
                r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
                r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef CONFIG_KVM_MMIO
                r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
                return r;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
        }
}

/*
 * File operations for the "kvm" misc device (see kvm_dev).
 * KVM_COMPAT() is defined elsewhere; presumably it supplies the compat
 * ioctl hook -- confirm against its definition.
 */
static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .llseek                = noop_llseek,
        KVM_COMPAT(kvm_dev_ioctl),
};

static struct miscdevice kvm_dev = {
        KVM_MINOR,
        "kvm",
        &kvm_chardev_ops,
};

#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
/*
 * Read-only (0444) module parameter, true by default.  Consulted by
 * kvm_init_virtualization() and kvm_uninit_virtualization().
 */
static bool enable_virt_at_load = true;
module_param(enable_virt_at_load, bool, 0444);

/* Set to true by kvm_shutdown() before virtualization is disabled on all CPUs. */
__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

/* Per-CPU flag: has virtualization been enabled on this CPU by KVM? */
static DEFINE_PER_CPU(bool, virtualization_enabled);
/* kvm_usage_count is modified only with kvm_usage_lock held. */
static DEFINE_MUTEX(kvm_usage_lock);
static int kvm_usage_count;

/*
 * Weak no-op default; called by kvm_enable_virtualization() when the usage
 * count goes from zero to one.
 */
__weak void kvm_arch_enable_virtualization(void)
{

}

/*
 * Weak no-op default; called when the last user goes away or when enabling
 * fails part-way through.
 */
__weak void kvm_arch_disable_virtualization(void)
{

}

/*
 * Enable hardware virtualization on the current CPU.
 *
 * Does nothing if this CPU is already marked enabled.  Returns 0 on success
 * and -EIO (after logging) if the arch hook reports a failure.
 */
static int kvm_enable_virtualization_cpu(void)
{
        int err;

        if (__this_cpu_read(virtualization_enabled))
                return 0;

        err = kvm_arch_enable_virtualization_cpu();
        if (!err) {
                __this_cpu_write(virtualization_enabled, true);
                return 0;
        }

        pr_info("kvm: enabling virtualization on CPU%d failed\n",
                raw_smp_processor_id());
        return -EIO;
}

/*
 * CPU hotplug "online" callback (registered in kvm_enable_virtualization()).
 * @cpu is unused; the work is done on the CPU executing this callback.
 */
static int kvm_online_cpu(unsigned int cpu)
{
        /*
         * Abort the CPU online process if hardware virtualization cannot
         * be enabled. Otherwise running VMs would encounter unrecoverable
         * errors when scheduled to this CPU.
         */
        return kvm_enable_virtualization_cpu();
}

/*
 * Disable hardware virtualization on the current CPU if it is marked as
 * enabled.  @ign is ignored; it is there so the function can be used as an
 * on_each_cpu() callback.
 */
static void kvm_disable_virtualization_cpu(void *ign)
{
        if (__this_cpu_read(virtualization_enabled)) {
                kvm_arch_disable_virtualization_cpu();
                __this_cpu_write(virtualization_enabled, false);
        }
}

/*
 * CPU hotplug "offline" callback: disables virtualization on the CPU running
 * the callback.  @cpu is unused.  Always returns 0.
 */
static int kvm_offline_cpu(unsigned int cpu)
{
        kvm_disable_virtualization_cpu(NULL);
        return 0;
}

/* syscore ->shutdown hook (see kvm_syscore_ops). */
static void kvm_shutdown(void)
{
        /*
         * Disable hardware virtualization and set kvm_rebooting to indicate
         * that KVM has asynchronously disabled hardware virtualization, i.e.
         * that relevant errors and exceptions aren't entirely unexpected.
         * Some flavors of hardware virtualization need to be disabled before
         * transferring control to firmware (to perform shutdown/reboot), e.g.
         * on x86, virtualization can block INIT interrupts, which are used by
         * firmware to pull APs back under firmware control.  Note, this path
         * is used for both shutdown and reboot scenarios, i.e. neither name is
         * 100% comprehensive.
         */
        pr_info("kvm: exiting hardware virtualization\n");
        /* The flag is set before any CPU has virtualization disabled. */
        kvm_rebooting = true;
        on_each_cpu(kvm_disable_virtualization_cpu, NULL, 1);
}

/*
 * syscore ->suspend hook (see kvm_syscore_ops): disables virtualization on
 * the CPU it runs on.  Always returns 0.
 */
static int kvm_suspend(void)
{
        /*
         * Secondary CPUs and CPU hotplug are disabled across the suspend/resume
         * callbacks, i.e. no need to acquire kvm_usage_lock to ensure the usage
         * count is stable.  Assert that kvm_usage_lock is not held to ensure
         * the system isn't suspended while KVM is enabling hardware.  Hardware
         * enabling can be preempted, but the task cannot be frozen until it has
         * dropped all locks (userspace tasks are frozen via a fake signal).
         */
        lockdep_assert_not_held(&kvm_usage_lock);
        lockdep_assert_irqs_disabled();

        kvm_disable_virtualization_cpu(NULL);
        return 0;
}

/*
 * syscore ->resume hook (see kvm_syscore_ops): re-enables virtualization on
 * the CPU it runs on.  There is no return value to propagate, so a failure
 * only triggers a one-time warning.
 */
static void kvm_resume(void)
{
        lockdep_assert_not_held(&kvm_usage_lock);
        lockdep_assert_irqs_disabled();

        WARN_ON_ONCE(kvm_enable_virtualization_cpu());
}

/*
 * Suspend/resume/shutdown hooks; registered by kvm_enable_virtualization()
 * and unregistered by kvm_disable_virtualization().
 */
static struct syscore_ops kvm_syscore_ops = {
        .suspend = kvm_suspend,
        .resume = kvm_resume,
        .shutdown = kvm_shutdown,
};

/*
 * Take a usage reference on hardware virtualization.
 *
 * Runs with kvm_usage_lock held for the whole function (scope-based guard).
 * Only the first user (count going from 0 to 1) does real work: the arch
 * hook, the CPU hotplug state and the syscore ops are set up in that order.
 * On any failure everything set up so far is undone and the usage count is
 * restored.
 *
 * Returns 0 on success, the error from cpuhp_setup_state(), or -EBUSY if the
 * system is halting, powering off or restarting.
 */
static int kvm_enable_virtualization(void)
{
        int r;

        guard(mutex)(&kvm_usage_lock);

        /* Already enabled by an earlier user: only the count changes. */
        if (kvm_usage_count++)
                return 0;

        kvm_arch_enable_virtualization();

        r = cpuhp_setup_state(CPUHP_AP_KVM_ONLINE, "kvm/cpu:online",
                              kvm_online_cpu, kvm_offline_cpu);
        if (r)
                goto err_cpuhp;

        register_syscore_ops(&kvm_syscore_ops);

        /*
         * Undo virtualization enabling and bail if the system is going down.
         * If userspace initiated a forced reboot, e.g. reboot -f, then it's
         * possible for an in-flight operation to enable virtualization after
         * syscore_shutdown() is called, i.e. without kvm_shutdown() being
         * invoked.  Note, this relies on system_state being set _before_
         * kvm_shutdown(), e.g. to ensure either kvm_shutdown() is invoked
         * or this CPU observes the impending shutdown.  Which is why KVM uses
         * a syscore ops hook instead of registering a dedicated reboot
         * notifier (the latter runs before system_state is updated).
         */
        if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF ||
            system_state == SYSTEM_RESTART) {
                r = -EBUSY;
                goto err_rebooting;
        }

        return 0;

        /* Unwind in the reverse order of setup. */
err_rebooting:
        unregister_syscore_ops(&kvm_syscore_ops);
        cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
err_cpuhp:
        kvm_arch_disable_virtualization();
        --kvm_usage_count;
        return r;
}

/*
 * Drop a usage reference on hardware virtualization.  When the last user
 * goes away, tear down in the reverse order of kvm_enable_virtualization():
 * syscore ops, CPU hotplug state, then the arch hook.  Runs entirely under
 * kvm_usage_lock.
 */
static void kvm_disable_virtualization(void)
{
        guard(mutex)(&kvm_usage_lock);

        kvm_usage_count--;
        if (kvm_usage_count != 0)
                return;

        unregister_syscore_ops(&kvm_syscore_ops);
        cpuhp_remove_state(CPUHP_AP_KVM_ONLINE);
        kvm_arch_disable_virtualization();
}

/*
 * Enable virtualization right away when the enable_virt_at_load module
 * parameter is set; otherwise do nothing and report success.
 */
static int kvm_init_virtualization(void)
{
        if (!enable_virt_at_load)
                return 0;

        return kvm_enable_virtualization();
}

/*
 * Counterpart of kvm_init_virtualization(): drops the usage reference only
 * when enable_virt_at_load is set.
 */
static void kvm_uninit_virtualization(void)
{
        if (!enable_virt_at_load)
                return;

        kvm_disable_virtualization();
}
#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: always succeeds. */
static int kvm_enable_virtualization(void)
{
        return 0;
}

/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: always succeeds. */
static int kvm_init_virtualization(void)
{
        return 0;
}

/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: nothing to do. */
static void kvm_disable_virtualization(void)
{

}

/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: nothing to do. */
static void kvm_uninit_virtualization(void)
{

}
#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */

/* Invoke @dev's destructor callback if its ops provide one. */
static void kvm_iodevice_destructor(struct kvm_io_device *dev)
{
        if (!dev->ops->destructor)
                return;

        dev->ops->destructor(dev);
}

/*
 * Run the destructor of every device registered on @bus, then free the bus
 * itself.
 */
static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
        int idx = 0;

        while (idx < bus->dev_count) {
                kvm_iodevice_destructor(bus->range[idx].dev);
                idx++;
        }
        kfree(bus);
}

/*
 * Three-way comparison of two I/O ranges.
 *
 * Returns -1 if @r1 starts below @r2.  Otherwise, when @r2 has a zero
 * length the start addresses are compared for an exact match; when @r2 has a
 * non-zero length the end addresses are compared, so that any @r1 lying
 * within @r2 compares equal.  Any order is acceptable for overlapping
 * ranges, because kvm_io_bus_get_first_dev ensures all of them are
 * processed.  Returns 1 if @r1 lies beyond @r2 and 0 on a match.
 */
static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
                                 const struct kvm_io_range *r2)
{
        gpa_t start1 = r1->addr;
        gpa_t start2 = r2->addr;
        gpa_t limit1, limit2;

        if (start1 < start2)
                return -1;

        if (r2->len) {
                limit1 = start1 + r1->len;
                limit2 = start2 + r2->len;
        } else {
                limit1 = start1;
                limit2 = start2;
        }

        return limit1 > limit2 ? 1 : 0;
}

/* void-pointer adapter around kvm_io_bus_cmp(), used as the bsearch() comparator. */
static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
        const struct kvm_io_range *lhs = p1;
        const struct kvm_io_range *rhs = p2;

        return kvm_io_bus_cmp(lhs, rhs);
}

/*
 * Find the lowest index in bus->range whose entry matches [@addr, @len).
 *
 * bsearch() may land on any matching entry, so walk backwards while the
 * preceding entry still compares equal.  Returns the index, or -ENOENT when
 * nothing matches.
 */
static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
                             gpa_t addr, int len)
{
        struct kvm_io_range key = {
                .addr = addr,
                .len = len,
        };
        struct kvm_io_range *hit;
        int first;

        hit = bsearch(&key, bus->range, bus->dev_count,
                      sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
        if (!hit)
                return -ENOENT;

        for (first = hit - bus->range; first > 0; first--) {
                if (kvm_io_bus_cmp(&key, &bus->range[first - 1]) != 0)
                        break;
        }

        return first;
}

/*
 * Offer a write of @range to each matching device on @bus, in index order,
 * until one accepts it (kvm_iodevice_write() returns 0).
 *
 * Returns the index of the accepting device, or -EOPNOTSUPP if no device
 * matches or none accepts the write.
 */
static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                              struct kvm_io_range *range, const void *val)
{
        int idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);

        if (idx < 0)
                return -EOPNOTSUPP;

        for (; idx < bus->dev_count; idx++) {
                if (kvm_io_bus_cmp(range, &bus->range[idx]) != 0)
                        break;

                if (kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
                                       range->len, val) == 0)
                        return idx;
        }

        return -EOPNOTSUPP;
}

/*
 * kvm_io_bus_write - write @len bytes from @val to @addr on bus @bus_idx.
 *
 * Returns 0 if some device accepted the write, -ENOMEM if the bus pointer is
 * NULL, or the negative error from __kvm_io_bus_write().
 *
 * NOTE(review): the historical comment here said "called under
 * kvm->slots_lock", but the bus is fetched with srcu_dereference() against
 * kvm->srcu -- confirm which protection callers actually hold.
 */
int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                     int len, const void *val)
{
        struct kvm_io_range range = {
                .addr = addr,
                .len = len,
        };
        struct kvm_io_bus *bus;
        int idx;

        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
        if (!bus)
                return -ENOMEM;

        idx = __kvm_io_bus_write(vcpu, bus, &range, val);
        if (idx < 0)
                return idx;

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_write);

/*
 * kvm_io_bus_write_cookie - like kvm_io_bus_write(), with an index hint.
 *
 * @cookie is tried first as an index into the bus; if it is out of range,
 * does not match the access, or the device refuses the write, the regular
 * search is used instead.
 *
 * Returns the index of the device that handled the write (usable as the next
 * cookie), -ENOMEM when the bus pointer is NULL, or -EOPNOTSUPP.
 *
 * NOTE(review): the previous comment here said "called under
 * kvm->slots_lock", while the body fetches the bus with srcu_dereference()
 * against kvm->srcu -- confirm the real locking rule with the callers.
 */
int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
                            gpa_t addr, int len, const void *val, long cookie)
{
        struct kvm_io_range range = {
                .addr = addr,
                .len = len,
        };
        struct kvm_io_bus *bus;

        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
        if (!bus)
                return -ENOMEM;

        /* Fast path: the device the cookie points at, if it still matches. */
        if (cookie >= 0 && cookie < bus->dev_count) {
                struct kvm_io_range *hint = &bus->range[cookie];

                if (kvm_io_bus_cmp(&range, hint) == 0 &&
                    !kvm_iodevice_write(vcpu, hint->dev, addr, len, val))
                        return cookie;
        }

        /*
         * The cookie was stale or garbage: search the bus and hand back the
         * correct cookie value.
         */
        return __kvm_io_bus_write(vcpu, bus, &range, val);
}

/*
 * Offer a read to every device on @bus whose range matches @range, starting
 * from the first matching entry.
 *
 * Returns the index of the first device whose kvm_iodevice_read() returns 0,
 * or -EOPNOTSUPP when nothing matches or no matching device accepts it.
 */
static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
                             struct kvm_io_range *range, void *val)
{
        int i = kvm_io_bus_get_first_dev(bus, range->addr, range->len);

        if (i < 0)
                return -EOPNOTSUPP;

        for (; i < bus->dev_count; i++) {
                struct kvm_io_range *cand = &bus->range[i];

                /* Matching entries are contiguous; stop at the first miss. */
                if (kvm_io_bus_cmp(range, cand) != 0)
                        break;

                if (kvm_iodevice_read(vcpu, cand->dev, range->addr,
                                      range->len, val) == 0)
                        return i;
        }

        return -EOPNOTSUPP;
}

/*
 * kvm_io_bus_read - dispatch a read of @len bytes at @addr on bus @bus_idx.
 *
 * Returns 0 when a device handled the read, -ENOMEM when the bus pointer is
 * NULL, or the negative error from __kvm_io_bus_read().
 *
 * NOTE(review): the previous comment here said "called under
 * kvm->slots_lock", while the body fetches the bus with srcu_dereference()
 * against kvm->srcu -- confirm the real locking rule with the callers.
 */
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val)
{
        struct kvm_io_range range = {
                .addr = addr,
                .len = len,
        };
        struct kvm_io_bus *bus;
        int idx;

        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
        if (!bus)
                return -ENOMEM;

        /* A non-negative result is a device index; callers only want 0. */
        idx = __kvm_io_bus_read(vcpu, bus, &range, val);
        if (idx < 0)
                return idx;

        return 0;
}

/*
 * kvm_io_bus_register_dev - add @dev to bus @bus_idx for [@addr, @addr + @len).
 *
 * The bus is never modified in place: a copy with room for one more range is
 * built, the new range is inserted at its sorted position, the copy is
 * published and the old bus is freed once SRCU readers have drained.
 *
 * Must be called with kvm->slots_lock held (asserted via lockdep).
 *
 * Returns 0 on success, -ENOMEM if there is no bus or the allocation fails,
 * -ENOSPC if the non-ioeventfd device count has reached NR_IOBUS_DEVS.
 */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev)
{
        int i;
        struct kvm_io_bus *new_bus, *bus;
        struct kvm_io_range range;

        lockdep_assert_held(&kvm->slots_lock);

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return -ENOMEM;

        /* exclude ioeventfd which is limited by maximum fd */
        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
                          GFP_KERNEL_ACCOUNT);
        if (!new_bus)
                return -ENOMEM;

        range = (struct kvm_io_range) {
                .addr = addr,
                .len = len,
                .dev = dev,
        };

        /* Insertion point: first existing entry that sorts after the new one. */
        for (i = 0; i < bus->dev_count; i++)
                if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
                        break;

        /* Copy the header plus entries [0, i), insert, then copy the tail. */
        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
        new_bus->dev_count++;
        new_bus->range[i] = range;
        memcpy(new_bus->range + i + 1, bus->range + i,
                (bus->dev_count - i) * sizeof(struct kvm_io_range));
        /* Publish the new bus and wait for SRCU readers before freeing the old. */
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);

        return 0;
}

/*
 * kvm_io_bus_unregister_dev - remove @dev from bus @bus_idx.
 *
 * A copy of the bus without the first range that references @dev is built
 * and published in place of the old one. If that copy cannot be allocated, a
 * NULL bus is published instead and the whole old bus is destroyed.
 *
 * Must be called with kvm->slots_lock held (asserted via lockdep).
 *
 * Returns 0 on success, or when there is no bus or @dev is not on it;
 * -ENOMEM when the shrunk bus could not be allocated.
 */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                              struct kvm_io_device *dev)
{
        int i;
        struct kvm_io_bus *new_bus, *bus;

        lockdep_assert_held(&kvm->slots_lock);

        bus = kvm_get_bus(kvm, bus_idx);
        if (!bus)
                return 0;

        /* Find the first range that belongs to @dev. */
        for (i = 0; i < bus->dev_count; i++) {
                if (bus->range[i].dev == dev) {
                        break;
                }
        }

        /* Not registered on this bus: nothing to do. */
        if (i == bus->dev_count)
                return 0;

        new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
                          GFP_KERNEL_ACCOUNT);
        if (new_bus) {
                /* Header plus entries [0, i), then everything after entry i. */
                memcpy(new_bus, bus, struct_size(bus, range, i));
                new_bus->dev_count--;
                memcpy(new_bus->range + i, bus->range + i + 1,
                                flex_array_size(new_bus, range, new_bus->dev_count - i));
        }

        /* new_bus may be NULL here; that is published deliberately. */
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);

        /*
         * If NULL bus is installed, destroy the old bus, including all the
         * attached devices. Otherwise, destroy the caller's device only.
         */
        if (!new_bus) {
                pr_err("kvm: failed to shrink bus, removing it completely\n");
                kvm_io_bus_destroy(bus);
                return -ENOMEM;
        }

        kvm_iodevice_destructor(dev);
        kfree(bus);
        return 0;
}

/*
 * kvm_io_bus_get_dev - look up the device registered at @addr on @bus_idx.
 *
 * The lookup uses a one-byte access at @addr and runs inside an SRCU
 * read-side section on kvm->srcu.
 *
 * Returns the first matching device, or NULL when the bus is absent or no
 * range matches.
 */
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr)
{
        struct kvm_io_device *found = NULL;
        struct kvm_io_bus *bus;
        int srcu_idx, pos;

        srcu_idx = srcu_read_lock(&kvm->srcu);

        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
        if (bus) {
                pos = kvm_io_bus_get_first_dev(bus, addr, 1);
                if (pos >= 0)
                        found = bus->range[pos].dev;
        }

        srcu_read_unlock(&kvm->srcu, srcu_idx);

        return found;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);

/*
 * Common open handler for the per-VM debugfs stat files.
 *
 * Takes a reference on the VM for the lifetime of the open file and sets up
 * the simple-attribute state. @set is only wired in when the stat's debugfs
 * mode has a write bit (0222) set.
 *
 * Returns 0 on success, -ENOENT when no VM reference could be taken, or the
 * error from simple_attr_open() (the VM reference is dropped again).
 */
static int kvm_debugfs_open(struct inode *inode, struct file *file,
                           int (*get)(void *, u64 *), int (*set)(void *, u64),
                           const char *fmt)
{
        struct kvm_stat_data *stat_data = inode->i_private;
        int (*setter)(void *, u64) = NULL;
        int err;

        /*
         * The debugfs files are a reference to the kvm struct which is
         * still valid when kvm_destroy_vm is called.  kvm_get_kvm_safe
         * avoids the race between open and the removal of the debugfs
         * directory.
         */
        if (!kvm_get_kvm_safe(stat_data->kvm))
                return -ENOENT;

        if (kvm_stats_debugfs_mode(stat_data->desc) & 0222)
                setter = set;

        err = simple_attr_open(inode, file, get, setter, fmt);
        if (err)
                kvm_put_kvm(stat_data->kvm);

        return err;
}

/*
 * Release handler paired with kvm_debugfs_open(): tears down the
 * simple-attribute state and drops the VM reference taken at open time.
 * Always returns 0.
 */
static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
        struct kvm_stat_data *sd = inode->i_private;

        simple_attr_release(inode, file);
        kvm_put_kvm(sd->kvm);

        return 0;
}

/*
 * Read the u64 located @offset bytes into kvm->stat and store it in *@val.
 * Always returns 0.
 */
static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
        u64 *slot = (void *)&kvm->stat + offset;

        *val = *slot;

        return 0;
}

/*
 * Zero the u64 located @offset bytes into kvm->stat. Always returns 0.
 */
static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
        u64 *slot = (void *)&kvm->stat + offset;

        *slot = 0;

        return 0;
}

/*
 * Sum, over every vCPU of @kvm, the u64 located @offset bytes into
 * vcpu->stat, and store the total in *@val. Always returns 0.
 */
static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
        struct kvm_vcpu *vcpu;
        unsigned long idx;

        *val = 0;

        kvm_for_each_vcpu(idx, vcpu, kvm) {
                u64 *slot = (void *)&vcpu->stat + offset;

                *val += *slot;
        }

        return 0;
}

/*
 * Zero, for every vCPU of @kvm, the u64 located @offset bytes into
 * vcpu->stat. Always returns 0.
 */
static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
        struct kvm_vcpu *vcpu;
        unsigned long idx;

        kvm_for_each_vcpu(idx, vcpu, kvm) {
                u64 *slot = (void *)&vcpu->stat + offset;

                *slot = 0;
        }

        return 0;
}

/*
 * simple-attribute getter for a per-VM debugfs stat file.
 *
 * @data is the file's struct kvm_stat_data; its kind selects between the
 * VM-wide value and the sum over all vCPUs.
 *
 * Returns the helper's result, or -EFAULT for an unrecognised kind.
 */
static int kvm_stat_data_get(void *data, u64 *val)
{
        struct kvm_stat_data *sd = data;

        switch (sd->kind) {
        case KVM_STAT_VM:
                return kvm_get_stat_per_vm(sd->kvm,
                                           sd->desc->desc.offset, val);
        case KVM_STAT_VCPU:
                return kvm_get_stat_per_vcpu(sd->kvm,
                                             sd->desc->desc.offset, val);
        default:
                return -EFAULT;
        }
}

/*
 * simple-attribute setter for a per-VM debugfs stat file.
 *
 * Only writing 0 is accepted, and it clears the stat; its kind selects
 * between the VM-wide value and the per-vCPU values.
 *
 * Returns -EINVAL for a non-zero @val, -EFAULT for an unrecognised kind,
 * otherwise the helper's result.
 */
static int kvm_stat_data_clear(void *data, u64 val)
{
        struct kvm_stat_data *sd = data;

        if (val)
                return -EINVAL;

        switch (sd->kind) {
        case KVM_STAT_VM:
                return kvm_clear_stat_per_vm(sd->kvm,
                                             sd->desc->desc.offset);
        case KVM_STAT_VCPU:
                return kvm_clear_stat_per_vcpu(sd->kvm,
                                               sd->desc->desc.offset);
        default:
                return -EFAULT;
        }
}

/*
 * open() handler for the per-VM stat files: binds the get/clear callbacks
 * and the "%llu\n" output format through kvm_debugfs_open().
 */
static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
        int ret;

        /* Presumably a compile-time check that the format fits a u64. */
        __simple_attr_check_format("%llu\n", 0ull);

        ret = kvm_debugfs_open(inode, file, kvm_stat_data_get,
                               kvm_stat_data_clear, "%llu\n");

        return ret;
}

/*
 * File operations for the per-VM debugfs stat files: open/release use the
 * KVM wrappers above, read/write use the generic simple-attribute helpers.
 */
static const struct file_operations stat_fops_per_vm = {
        .owner = THIS_MODULE,
        .open = kvm_stat_data_open,
        .read = simple_attr_read,
        .write = simple_attr_write,
        .release = kvm_debugfs_release,
};

/*
 * Getter for a global VM stat file: adds up, across every VM on vm_list,
 * the VM-wide stat whose offset is encoded in @_offset. The list is walked
 * under kvm_lock. Always returns 0.
 */
static int vm_stat_get(void *_offset, u64 *val)
{
        unsigned int offset = (long)_offset;
        struct kvm *vm;
        u64 one_vm;

        *val = 0;

        mutex_lock(&kvm_lock);
        list_for_each_entry(vm, &vm_list, vm_list) {
                kvm_get_stat_per_vm(vm, offset, &one_vm);
                *val += one_vm;
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

/*
 * Setter for a global VM stat file: writing 0 clears, in every VM on
 * vm_list, the VM-wide stat whose offset is encoded in @_offset. The list is
 * walked under kvm_lock.
 *
 * Returns 0, or -EINVAL when @val is not 0.
 */
static int vm_stat_clear(void *_offset, u64 val)
{
        unsigned int offset = (long)_offset;
        struct kvm *vm;

        if (val != 0)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(vm, &vm_list, vm_list)
                kvm_clear_stat_per_vm(vm, offset);
        mutex_unlock(&kvm_lock);

        return 0;
}

/*
 * File operations for the global VM stat files. The first variant pairs
 * vm_stat_get() with vm_stat_clear(); the read-only variant has no setter.
 * Both format the value as "%llu\n".
 */
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");

/*
 * Getter for a global vCPU stat file: adds up, across every VM on vm_list,
 * the per-vCPU sum of the stat whose offset is encoded in @_offset. The list
 * is walked under kvm_lock. Always returns 0.
 */
static int vcpu_stat_get(void *_offset, u64 *val)
{
        unsigned int offset = (long)_offset;
        struct kvm *vm;
        u64 one_vm;

        *val = 0;

        mutex_lock(&kvm_lock);
        list_for_each_entry(vm, &vm_list, vm_list) {
                kvm_get_stat_per_vcpu(vm, offset, &one_vm);
                *val += one_vm;
        }
        mutex_unlock(&kvm_lock);

        return 0;
}

/*
 * Setter for a global vCPU stat file: writing 0 clears, for every vCPU of
 * every VM on vm_list, the stat whose offset is encoded in @_offset. The
 * list is walked under kvm_lock.
 *
 * Returns 0, or -EINVAL when @val is not 0.
 */
static int vcpu_stat_clear(void *_offset, u64 val)
{
        unsigned int offset = (long)_offset;
        struct kvm *vm;

        if (val != 0)
                return -EINVAL;

        mutex_lock(&kvm_lock);
        list_for_each_entry(vm, &vm_list, vm_list)
                kvm_clear_stat_per_vcpu(vm, offset);
        mutex_unlock(&kvm_lock);

        return 0;
}

/*
 * File operations for the global vCPU stat files. The first variant pairs
 * vcpu_stat_get() with vcpu_stat_clear(); the read-only variant has no
 * setter. Both format the value as "%llu\n".
 */
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
                        "%llu\n");
DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");

/*
 * Emit a KOBJ_CHANGE uevent on the KVM misc device describing a VM
 * creation or destruction.
 *
 * The global created/active counters are updated under kvm_lock. The event
 * carries CREATED, COUNT, EVENT (for create/destroy), PID and, when the VM
 * has a debugfs directory and its path can be resolved, STATS_PATH.
 *
 * Does nothing when the misc device is not registered, @kvm is NULL, or the
 * environment buffer cannot be allocated (the counters are still updated in
 * that last case).
 */
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
        struct kobj_uevent_env *env;
        unsigned long long created, active;

        if (!kvm_dev.this_device || !kvm)
                return;

        /* Update the counters and snapshot them under the lock. */
        mutex_lock(&kvm_lock);
        if (type == KVM_EVENT_CREATE_VM) {
                kvm_createvm_count++;
                kvm_active_vms++;
        } else if (type == KVM_EVENT_DESTROY_VM) {
                kvm_active_vms--;
        }
        created = kvm_createvm_count;
        active = kvm_active_vms;
        mutex_unlock(&kvm_lock);

        env = kzalloc(sizeof(*env), GFP_KERNEL);
        if (!env)
                return;

        add_uevent_var(env, "CREATED=%llu", created);
        add_uevent_var(env, "COUNT=%llu", active);

        if (type == KVM_EVENT_CREATE_VM) {
                add_uevent_var(env, "EVENT=create");
                /* Remember the creator so the destroy event reports the same PID. */
                kvm->userspace_pid = task_pid_nr(current);
        } else if (type == KVM_EVENT_DESTROY_VM) {
                add_uevent_var(env, "EVENT=destroy");
        }
        add_uevent_var(env, "PID=%d", kvm->userspace_pid);

        if (!IS_ERR(kvm->debugfs_dentry)) {
                /* Scratch buffer for the path; best effort, skipped on failure. */
                char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL);

                if (p) {
                        tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
                        if (!IS_ERR(tmp))
                                add_uevent_var(env, "STATS_PATH=%s", tmp);
                        kfree(p);
                }
        }
        /* no need for checks, since we are adding at most only 5 keys */
        env->envp[env->envp_idx++] = NULL;
        kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
        kfree(env);
}

/*
 * Create the top-level "kvm" debugfs directory and one file per VM and vCPU
 * stat descriptor in it. Each file's private data is the stat's offset, and
 * the writable fops are used only when the stat's debugfs mode has a write
 * bit (0222) set.
 */
static void kvm_init_debug(void)
{
        const struct file_operations *fops;
        const struct _kvm_stats_desc *pdesc;
        int i;

        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

        /* VM-scope stats. */
        for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
                pdesc = &kvm_vm_stats_desc[i];
                fops = (kvm_stats_debugfs_mode(pdesc) & 0222) ?
                        &vm_stat_fops : &vm_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                kvm_debugfs_dir,
                                (void *)(long)pdesc->desc.offset, fops);
        }

        /* vCPU-scope stats. */
        for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
                pdesc = &kvm_vcpu_stats_desc[i];
                fops = (kvm_stats_debugfs_mode(pdesc) & 0222) ?
                        &vcpu_stat_fops : &vcpu_stat_readonly_fops;
                debugfs_create_file(pdesc->name, kvm_stats_debugfs_mode(pdesc),
                                kvm_debugfs_dir,
                                (void *)(long)pdesc->desc.offset, fops);
        }
}

/* Map an embedded preempt notifier back to the vCPU that contains it. */
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        struct kvm_vcpu *owner;

        owner = container_of(pn, struct kvm_vcpu, preempt_notifier);

        return owner;
}

/*
 * Preempt-notifier "sched in" callback (installed in kvm_preempt_ops by
 * kvm_init()): the task owning this vCPU is being scheduled on @cpu.
 *
 * Clears the preempted/ready flags, records the vCPU as the one running on
 * this CPU, reloads arch state, and only then clears scheduled_out.
 */
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        WRITE_ONCE(vcpu->preempted, false);
        WRITE_ONCE(vcpu->ready, false);

        __this_cpu_write(kvm_running_vcpu, vcpu);
        kvm_arch_vcpu_load(vcpu, cpu);

        WRITE_ONCE(vcpu->scheduled_out, false);
}

/*
 * Preempt-notifier "sched out" callback (installed in kvm_preempt_ops by
 * kvm_init()): the task owning this vCPU is being scheduled out.
 *
 * Marks the vCPU as scheduled out, flags it preempted/ready when the task is
 * still runnable and the vCPU wants to run, saves arch state, and clears
 * this CPU's running-vCPU pointer.
 */
static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        WRITE_ONCE(vcpu->scheduled_out, true);

        /* Still runnable means the task was preempted, not blocked. */
        if (task_is_runnable(current) && vcpu->wants_to_run) {
                WRITE_ONCE(vcpu->preempted, true);
                WRITE_ONCE(vcpu->ready, true);
        }
        kvm_arch_vcpu_put(vcpu);
        __this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * Preemption only needs to be disabled around the per-CPU read itself. The
 * resolved pointer stays meaningful after preemption is re-enabled: even if
 * the current thread migrates to another CPU, a later read of the per-CPU
 * variable gives the same value, because the preempt notifier handlers
 * update it on every schedule in/out.
 *
 * Return: the value of this CPU's kvm_running_vcpu slot.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
        struct kvm_vcpu *running;

        preempt_disable();
        running = __this_cpu_read(kvm_running_vcpu);
        preempt_enable();

        return running;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 *
 * Return: the per-CPU address of kvm_running_vcpu; it must be resolved with
 * the per-CPU accessors before being dereferenced.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
        return &kvm_running_vcpu;
}

#ifdef CONFIG_GUEST_PERF_EVENTS
/*
 * perf callback: report the guest state of the vCPU running on this CPU.
 *
 * Returns 0 when kvm_arch_pmi_in_guest() says the PMI did not hit a guest,
 * otherwise PERF_GUEST_ACTIVE, with PERF_GUEST_USER added when the vCPU is
 * not in guest kernel mode.
 */
static unsigned int kvm_guest_state(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

        if (!kvm_arch_pmi_in_guest(vcpu))
                return 0;

        if (kvm_arch_vcpu_in_kernel(vcpu))
                return PERF_GUEST_ACTIVE;

        return PERF_GUEST_ACTIVE | PERF_GUEST_USER;
}

/*
 * perf callback: return the guest instruction pointer of the vCPU running on
 * this CPU, or 0 (with a one-time warning) when the PMI did not hit a guest.
 */
static unsigned long kvm_guest_get_ip(void)
{
        struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

        /* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
        if (!WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
                return kvm_arch_vcpu_get_ip(vcpu);

        return 0;
}

/* Guest-info callbacks handed to perf by kvm_register_perf_callbacks(). */
static struct perf_guest_info_callbacks kvm_guest_cbs = {
        .get_ip = kvm_guest_get_ip,
        .state = kvm_guest_state,
        /* Filled in by kvm_register_perf_callbacks(). */
        .handle_intel_pt_intr = NULL,
};

/*
 * Register KVM's guest-info callbacks with perf.
 *
 * @pt_intr_handler is stored as the handle_intel_pt_intr callback before the
 * registration, so it is in place by the time perf can see the structure.
 */
void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void))
{
        kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
        perf_register_guest_info_callbacks(&kvm_guest_cbs);
}
/* Remove KVM's guest-info callbacks from perf. */
void kvm_unregister_perf_callbacks(void)
{
        perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
}
#endif

int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
{
        int r;
        int cpu;

        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        if (!vcpu_align)
                vcpu_align = __alignof__(struct kvm_vcpu);
        kvm_vcpu_cache =
                kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
                                           SLAB_ACCOUNT,
                                           offsetof(struct kvm_vcpu, arch),
                                           offsetofend(struct kvm_vcpu, stats_id)
                                           - offsetof(struct kvm_vcpu, arch),
                                           NULL);
        if (!kvm_vcpu_cache)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                if (!alloc_cpumask_var_node(&per_cpu(cpu_kick_mask, cpu),
                                            GFP_KERNEL, cpu_to_node(cpu))) {
                        r = -ENOMEM;
                        goto err_cpu_kick_mask;
                }
        }

        r = kvm_irqfd_init();
        if (r)
                goto err_irqfd;

        r = kvm_async_pf_init();
        if (r)
                goto err_async_pf;

        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;
        kvm_device_fops.owner = module;

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        kvm_init_debug();

        r = kvm_vfio_ops_init();
        if (WARN_ON_ONCE(r))
                goto err_vfio;

        kvm_gmem_init(module);

        r = kvm_init_virtualization();
        if (r)
                goto err_virt;

        /*
         * Registration _must_ be the very last thing done, as this exposes
         * /dev/kvm to userspace, i.e. all infrastructure must be setup!
         */
        r = misc_register(&kvm_dev);
        if (r) {
                pr_err("kvm: misc device register failed\n");
                goto err_register;
        }

        return 0;

err_register:
        kvm_uninit_virtualization();
err_virt:
        kvm_vfio_ops_exit();
err_vfio:
        kvm_async_pf_deinit();
err_async_pf:
        kvm_irqfd_exit();
err_irqfd:
err_cpu_kick_mask:
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
        return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

/*
 * kvm_exit - tear down what kvm_init() set up.
 *
 * Deregisters /dev/kvm, disables virtualization, removes the debugfs tree,
 * frees the per-CPU kick masks and the vCPU cache, then shuts down the VFIO,
 * async page fault and irqfd support.
 */
void kvm_exit(void)
{
        int cpu;

        /*
         * Note, unregistering /dev/kvm doesn't strictly need to come first,
         * fops_get(), a.k.a. try_module_get(), prevents acquiring references
         * to KVM while the module is being stopped.
         */
        misc_deregister(&kvm_dev);

        kvm_uninit_virtualization();

        debugfs_remove_recursive(kvm_debugfs_dir);
        for_each_possible_cpu(cpu)
                free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
        kmem_cache_destroy(kvm_vcpu_cache);
        kvm_vfio_ops_exit();
        kvm_async_pf_deinit();
        kvm_irqfd_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);










  153 

























































   86 
























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM hugetlbfs

#if !defined(_TRACE_HUGETLBFS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_HUGETLBFS_H

#include <linux/tracepoint.h>

/*
 * hugetlbfs_alloc_inode - trace event for a newly allocated hugetlbfs inode.
 *
 * Records the superblock device, the new inode number, the inode number of
 * @dir (0 when @dir is NULL) and @mode stored as a 16-bit value.
 */
TRACE_EVENT(hugetlbfs_alloc_inode,

        TP_PROTO(struct inode *inode, struct inode *dir, int mode),

        TP_ARGS(inode, dir, mode),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(ino_t,                dir)
                __field(__u16,                mode)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->dir                = dir ? dir->i_ino : 0;
                __entry->mode                = mode;
        ),

        TP_printk("dev %d,%d ino %lu dir %lu mode 0%o",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long) __entry->ino,
                (unsigned long) __entry->dir, __entry->mode)
);

/*
 * hugetlbfs__inode - event class shared by the inode lifetime events.
 *
 * Snapshots the inode's device, number, mode, size, link count and block
 * count, plus the seals value read from HUGETLBFS_I(inode).
 */
DECLARE_EVENT_CLASS(hugetlbfs__inode,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(__u16,                mode)
                __field(loff_t,                size)
                __field(unsigned int,        nlink)
                __field(unsigned int,        seals)
                __field(blkcnt_t,        blocks)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->mode                = inode->i_mode;
                __entry->size                = inode->i_size;
                __entry->nlink                = inode->i_nlink;
                __entry->seals                = HUGETLBFS_I(inode)->seals;
                __entry->blocks                = inode->i_blocks;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o size %lld nlink %u seals %u blocks %llu",
                MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
                __entry->mode, __entry->size, __entry->nlink, __entry->seals,
                (unsigned long long)__entry->blocks)
);

/* hugetlbfs_evict_inode - instance of the hugetlbfs__inode class. */
DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_evict_inode,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/* hugetlbfs_free_inode - instance of the hugetlbfs__inode class. */
DEFINE_EVENT(hugetlbfs__inode, hugetlbfs_free_inode,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * hugetlbfs_setattr - trace event for an attribute change on an inode.
 *
 * Records the inode's device and number, the dentry name and its length, the
 * requested ia_valid/ia_mode/ia_size from @attr, and the inode size at the
 * time of the event (old_size).
 */
TRACE_EVENT(hugetlbfs_setattr,

        TP_PROTO(struct inode *inode, struct dentry *dentry,
                struct iattr *attr),

        TP_ARGS(inode, dentry, attr),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(unsigned int,        d_len)
                __string(d_name,        dentry->d_name.name)
                __field(unsigned int,        ia_valid)
                __field(unsigned int,        ia_mode)
                __field(loff_t,                old_size)
                __field(loff_t,                ia_size)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->d_len                = dentry->d_name.len;
                __assign_str(d_name);
                __entry->ia_valid        = attr->ia_valid;
                __entry->ia_mode        = attr->ia_mode;
                __entry->old_size        = inode->i_size;
                __entry->ia_size        = attr->ia_size;
        ),

        TP_printk("dev %d,%d ino %lu name %.*s valid %#x mode 0%o old_size %lld size %lld",
                MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino,
                __entry->d_len, __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
                __entry->old_size, __entry->ia_size)
);

/*
 * hugetlbfs_fallocate - trace event for an fallocate call on an inode.
 *
 * Records the inode's device and number, the fallocate @mode, the requested
 * @offset and @len, the inode size at the time of the event, and @ret.
 *
 * offset, len and size are loff_t and are printed with the signed "%lld"
 * conversion, so they are cast to long long to match it.
 */
TRACE_EVENT(hugetlbfs_fallocate,

        TP_PROTO(struct inode *inode, int mode,
                loff_t offset, loff_t len, int ret),

        TP_ARGS(inode, mode, offset, len, ret),

        TP_STRUCT__entry(
                __field(dev_t,                dev)
                __field(ino_t,                ino)
                __field(int,                mode)
                __field(loff_t,                offset)
                __field(loff_t,                len)
                __field(loff_t,                size)
                __field(int,                ret)
        ),

        TP_fast_assign(
                __entry->dev                = inode->i_sb->s_dev;
                __entry->ino                = inode->i_ino;
                __entry->mode                = mode;
                __entry->offset                = offset;
                __entry->len                = len;
                __entry->size                = inode->i_size;
                __entry->ret                = ret;
        ),

        TP_printk("dev %d,%d ino %lu mode 0%o offset %lld len %lld size %lld ret %d",
                MAJOR(__entry->dev), MINOR(__entry->dev),
                (unsigned long)__entry->ino, __entry->mode,
                (long long)__entry->offset,
                (long long)__entry->len,
                (long long)__entry->size,
                __entry->ret)
);

#endif /* _TRACE_HUGETLBFS_H */

 /* This part must be outside protection */
#include <trace/define_trace.h>












































































































  157 









  157 






























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CONTEXT_TRACKING_STATE_H
#define _LINUX_CONTEXT_TRACKING_STATE_H

#include <linux/percpu.h>
#include <linux/static_key.h>
#include <linux/context_tracking_irq.h>

/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
#define CT_NESTING_IRQ_NONIDLE        ((LONG_MAX / 2) + 1)

/*
 * Context tracking states. The non-negative states below CT_STATE_MAX are
 * the values selected by CT_STATE_MASK (CT_STATE_MAX - 1); CT_STATE_MAX
 * itself is reused as the CT_RCU_WATCHING value.
 */
enum ctx_state {
        CT_STATE_DISABLED        = -1,        /* returned by ct_state() if unknown */
        CT_STATE_KERNEL                = 0,
        CT_STATE_IDLE                = 1,
        CT_STATE_USER                = 2,
        CT_STATE_GUEST                = 3,
        CT_STATE_MAX                = 4,
};

/* Odd value for watching, else even. */
#define CT_RCU_WATCHING CT_STATE_MAX

#define CT_STATE_MASK (CT_STATE_MAX - 1)
#define CT_RCU_WATCHING_MASK (~CT_STATE_MASK)

/*
 * Per-CPU context tracking data; each group of fields exists only when the
 * corresponding config option is enabled.
 */
struct context_tracking {
#ifdef CONFIG_CONTEXT_TRACKING_USER
        /*
         * When active is false, probes are unset in order
         * to minimize overhead: TIF flags are cleared
         * and calls to user_enter/exit are ignored. This
         * may be further optimized using static keys.
         */
        bool active;
        int recursion;
#endif
#ifdef CONFIG_CONTEXT_TRACKING
        /*
         * Combined word: the bits under CT_STATE_MASK are read as the
         * ctx_state, the remaining bits (CT_RCU_WATCHING_MASK) as the RCU
         * watching value.
         */
        atomic_t state;
#endif
#ifdef CONFIG_CONTEXT_TRACKING_IDLE
        long nesting;                /* Track process nesting level. */
        long nmi_nesting;        /* Track irq/NMI nesting level. */
#endif
};

#ifdef CONFIG_CONTEXT_TRACKING
DECLARE_PER_CPU(struct context_tracking, context_tracking);
#endif

#ifdef CONFIG_CONTEXT_TRACKING_USER
/* Return the ctx_state bits of this CPU's context tracking state word. */
static __always_inline int __ct_state(void)
{
        int word = raw_atomic_read(this_cpu_ptr(&context_tracking.state));

        return word & CT_STATE_MASK;
}
#endif

#ifdef CONFIG_CONTEXT_TRACKING_IDLE
/* Return the RCU-watching bits of this CPU's context tracking state word. */
static __always_inline int ct_rcu_watching(void)
{
        int word = atomic_read(this_cpu_ptr(&context_tracking.state));

        return word & CT_RCU_WATCHING_MASK;
}

/* Return the RCU-watching bits of @cpu's context tracking state word. */
static __always_inline int ct_rcu_watching_cpu(int cpu)
{
        int word = atomic_read(per_cpu_ptr(&context_tracking.state, cpu));

        return word & CT_RCU_WATCHING_MASK;
}

/*
 * Same as ct_rcu_watching_cpu(), but the state word is loaded with acquire
 * ordering.
 */
static __always_inline int ct_rcu_watching_cpu_acquire(int cpu)
{
        int word;

        word = atomic_read_acquire(per_cpu_ptr(&context_tracking.state, cpu));

        return word & CT_RCU_WATCHING_MASK;
}

/* Return this CPU's process nesting level. */
static __always_inline long ct_nesting(void)
{
        long depth = __this_cpu_read(context_tracking.nesting);

        return depth;
}

/* Return @cpu's process nesting level. */
static __always_inline long ct_nesting_cpu(int cpu)
{
        return per_cpu(context_tracking.nesting, cpu);
}

/* Return this CPU's irq/NMI nesting level. */
static __always_inline long ct_nmi_nesting(void)
{
        long depth = __this_cpu_read(context_tracking.nmi_nesting);

        return depth;
}

/* Return @cpu's irq/NMI nesting level. */
static __always_inline long ct_nmi_nesting_cpu(int cpu)
{
        return per_cpu(context_tracking.nmi_nesting, cpu);
}
#endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */

#ifdef CONFIG_CONTEXT_TRACKING_USER
extern struct static_key_false context_tracking_key;

/*
 * Report whether the context_tracking_key static branch is enabled; the
 * branch is laid out so that "disabled" is the expected case.
 */
static __always_inline bool context_tracking_enabled(void)
{
        return static_branch_unlikely(&context_tracking_key);
}

/*
 * Report whether context tracking is enabled globally and marked active on
 * @cpu.
 */
static __always_inline bool context_tracking_enabled_cpu(int cpu)
{
        if (!context_tracking_enabled())
                return false;

        return per_cpu(context_tracking.active, cpu);
}

/*
 * Report whether context tracking is enabled globally and marked active on
 * the current CPU.
 */
static __always_inline bool context_tracking_enabled_this_cpu(void)
{
        if (!context_tracking_enabled())
                return false;

        return __this_cpu_read(context_tracking.active);
}

/**
 * ct_state() - return the current context tracking state if known
 *
 * Returns the current cpu's context tracking state if context tracking
 * is enabled.  If context tracking is disabled, returns
 * CT_STATE_DISABLED.  This should be used primarily for debugging.
 *
 * Preemption is disabled around the per-CPU read.
 */
static __always_inline int ct_state(void)
{
        int state = CT_STATE_DISABLED;

        if (context_tracking_enabled()) {
                preempt_disable();
                state = __ct_state();
                preempt_enable();
        }

        return state;
}

#else
/* !CONFIG_CONTEXT_TRACKING_USER: every "enabled" query reports false. */
static __always_inline bool context_tracking_enabled(void)
{
        return false;
}

static __always_inline bool context_tracking_enabled_cpu(int cpu)
{
        return false;
}

static __always_inline bool context_tracking_enabled_this_cpu(void)
{
        return false;
}
#endif /* CONFIG_CONTEXT_TRACKING_USER */

#endif









































































































































































































































































  165 



  165 




















  165 






































  165 

  165 













  165 







































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  145 





  147 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  175 




  177 







  176 















  177 
  176 


  177 



















   17 






  165 










  164 

















  165 











































  165 
































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002        Andrew Morton
 *                Split out of fs/inode.c
 *                Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/tracepoint.h>
#include <linux/device.h>
#include <linux/memcontrol.h>
#include "internal.h"

/*
 * Minimal write chunk size: 4MB, expressed as a number of pages
 * (4MB in bytes shifted down by PAGE_SHIFT).
 */
#define MIN_WRITEBACK_PAGES        ((4UL << 20) >> PAGE_SHIFT)

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control.
 *
 * Work items are linked onto a wb's work_list by wb_queue_work() and
 * retired through finish_writeback_work().
 */
struct wb_writeback_work {
        long nr_pages;
        struct super_block *sb;
        enum writeback_sync_modes sync_mode;
        unsigned int tagged_writepages:1;
        unsigned int for_kupdate:1;
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
        unsigned int auto_free:1;        /* free on completion, see finish_writeback_work() */
        enum wb_reason reason;                /* why was writeback initiated? */

        struct list_head list;                /* pending work list */
        struct wb_completion *done;        /* set if the caller waits */
};

/*
 * If an inode is constantly having its pages dirtied, but then the
 * updates stop dirtytime_expire_interval seconds in the past, it's
 * possible for the worst case time between when an inode has its
 * timestamps updated and when they finally get written out to be two
 * dirtytime_expire_intervals.  We set the default to 12 hours (in
 * seconds), which means most of the time inodes will have their
 * timestamps written to disk after 12 hours, but in the worst case a
 * few inodes might not have their timestamps updated for 24 hours.
 */
static unsigned int dirtytime_expire_interval = 12 * 60 * 60;

/* Map an i_io_list list node back to the struct inode that embeds it. */
static inline struct inode *wb_inode(struct list_head *head)
{
        struct inode *inode = list_entry(head, struct inode, i_io_list);

        return inode;
}

/*
 * Include the creation of the trace points after defining the
 * wb_writeback_work structure and inline functions so that the definition
 * remains local to this file.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/writeback.h>

EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);

/*
 * Flag @wb as having dirty IO.
 *
 * If wb_has_dirty_io() already reports true, nothing is changed and
 * %false is returned.  Otherwise WB_has_dirty_io is set in @wb->state,
 * @wb->avg_write_bandwidth is added to the bdi's tot_write_bandwidth and
 * %true is returned.
 */
static bool wb_io_lists_populated(struct bdi_writeback *wb)
{
        if (wb_has_dirty_io(wb))
                return false;

        set_bit(WB_has_dirty_io, &wb->state);
        WARN_ON_ONCE(!wb->avg_write_bandwidth);
        atomic_long_add(wb->avg_write_bandwidth,
                        &wb->bdi->tot_write_bandwidth);

        return true;
}

/*
 * Clear @wb's dirty-IO flag once b_dirty, b_io and b_more_io are all
 * empty, and take @wb->avg_write_bandwidth back out of the bdi's
 * tot_write_bandwidth.  Warns once if that total would go negative.
 */
static void wb_io_lists_depopulated(struct bdi_writeback *wb)
{
        long total;

        if (!wb_has_dirty_io(wb))
                return;

        /* an inode left on any of these lists keeps the flag set */
        if (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
            !list_empty(&wb->b_more_io))
                return;

        clear_bit(WB_has_dirty_io, &wb->state);
        total = atomic_long_sub_return(wb->avg_write_bandwidth,
                                       &wb->bdi->tot_write_bandwidth);
        WARN_ON_ONCE(total < 0);
}

/**
 * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list
 * @inode: inode to be moved
 * @wb: target bdi_writeback
 * @head: one of @wb->b_{dirty|io|more_io|dirty_time}
 *
 * Move @inode->i_io_list to @head of @wb and update %WB_has_dirty_io.
 * Both @wb->list_lock and @inode->i_lock must be held.
 *
 * Returns %true if @inode is the first occupant of the !dirty_time IO
 * lists; otherwise, %false.
 */
static bool inode_io_list_move_locked(struct inode *inode,
                                      struct bdi_writeback *wb,
                                      struct list_head *head)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        list_move(&inode->i_io_list, head);

        /* dirty_time doesn't count as dirty_io until expiration */
        if (head == &wb->b_dirty_time) {
                wb_io_lists_depopulated(wb);
                return false;
        }

        return wb_io_lists_populated(wb);
}

static void wb_wakeup(struct bdi_writeback *wb)
{
        spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        spin_unlock_irq(&wb->work_lock);
}

/*
 * This function is used when the first inode for this wb is marked dirty.
 * It arranges for the corresponding bdi thread to be woken up so that it
 * takes care of the periodic background write-out of dirty inodes.  Since
 * the write-out would start only 'dirty_writeback_interval' centisecs from
 * now anyway, the wake-up is simply scheduled for later.
 *
 * We wouldn't bother with the delay, but this function is on the fast path
 * (used by '__mark_inode_dirty()'), so delaying the wake-up saves a few
 * context switches.
 *
 * Flush work that is already scheduled for an earlier time must not be
 * postponed, hence queue_delayed_work().
 */
static void wb_wakeup_delayed(struct bdi_writeback *wb)
{
        unsigned long delay = msecs_to_jiffies(dirty_writeback_interval * 10);

        spin_lock_irq(&wb->work_lock);
        if (test_bit(WB_registered, &wb->state))
                queue_delayed_work(bdi_wq, &wb->dwork, delay);
        spin_unlock_irq(&wb->work_lock);
}

/*
 * Retire @work: free it if it was marked auto_free and, when a completion
 * is attached, drop one count from it, waking all waiters on the final
 * drop.
 */
static void finish_writeback_work(struct wb_writeback_work *work)
{
        /* fetch ->done first, @work may be freed right below */
        struct wb_completion *completion = work->done;
        wait_queue_head_t *waiters;

        if (work->auto_free)
                kfree(work);

        if (!completion)
                return;

        /* @completion can't be accessed after the following dec */
        waiters = completion->waitq;
        if (atomic_dec_and_test(&completion->cnt))
                wake_up_all(waiters);
}

/*
 * Hand @work over to @wb.
 *
 * A completion attached to @work gets its count bumped first.  Then,
 * under @wb->work_lock: if @wb is registered, @work is appended to its
 * work_list and the delayed work is kicked to run immediately; if not,
 * @work is finished on the spot via finish_writeback_work().
 */
static void wb_queue_work(struct bdi_writeback *wb,
                          struct wb_writeback_work *work)
{
        trace_writeback_queue(wb, work);

        if (work->done)
                atomic_inc(&work->done->cnt);

        spin_lock_irq(&wb->work_lock);

        if (!test_bit(WB_registered, &wb->state)) {
                finish_writeback_work(work);
        } else {
                list_add_tail(&work->list, &wb->work_list);
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        }

        spin_unlock_irq(&wb->work_lock);
}

/**
 * wb_wait_for_completion - wait for completion of bdi_writeback_works
 * @done: target wb_completion
 *
 * Wait for one or more work items that were issued with their ->done
 * field set to @done, which should have been initialized with
 * DEFINE_WB_COMPLETION().  The initial count is dropped first, then the
 * caller sleeps on @done->waitq until the count reaches zero.  Work items
 * which are waited upon aren't freed automatically on completion.
 */
void wb_wait_for_completion(struct wb_completion *done)
{
        /* put down the initial count */
        atomic_dec(&done->cnt);

        wait_event(*done->waitq, atomic_read(&done->cnt) == 0);
}

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Parameters for foreign inode detection, see wbc_detach_inode() to see
 * how they're used.
 *
 * These parameters are inherently heuristic as the detection target
 * itself is fuzzy.  All we want to do is to detach an inode from the
 * current owner if it's being written to by some other cgroups too much.
 *
 * The current cgroup writeback is built on the assumption that multiple
 * cgroups writing to the same inode concurrently is very rare and a mode
 * of operation which isn't well supported.  As such, the goal is not
 * taking too long when a different cgroup takes over an inode while
 * avoiding too aggressive flip-flops from occasional foreign writes.
 *
 * We record, very roughly, 2s worth of IO time history and if more than
 * half of that is foreign, trigger the switch.  The recording is quantized
 * to 16 slots.  To avoid tiny writes from swinging the decision too much,
 * writes smaller than 1/8 of avg size are ignored.
 */
#define WB_FRN_TIME_SHIFT        13        /* 1s = 2^13, up to 8 secs w/ 16bit */
#define WB_FRN_TIME_AVG_SHIFT        3        /* avg = avg * 7/8 + new * 1/8 */
#define WB_FRN_TIME_CUT_DIV        8        /* ignore rounds < avg / 8 */
#define WB_FRN_TIME_PERIOD        (2 * (1 << WB_FRN_TIME_SHIFT))        /* 2s */

#define WB_FRN_HIST_SLOTS        16        /* inode->i_wb_frn_history is 16bit */
#define WB_FRN_HIST_UNIT        (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
                                        /* each slot's duration is 2s / 16 */
#define WB_FRN_HIST_THR_SLOTS        (WB_FRN_HIST_SLOTS / 2)
                                        /* if foreign slots >= 8, switch */
#define WB_FRN_HIST_MAX_SLOTS        (WB_FRN_HIST_THR_SLOTS / 2 + 1)
                                        /* one round can affect up to 5 slots */
#define WB_FRN_MAX_IN_FLIGHT        1024        /* don't queue too many concurrently */

/*
 * Maximum inodes per isw.  A specific value has been chosen to make
 * struct inode_switch_wbs_context fit into 1024 bytes kmalloc.
 *
 * In other words: whatever remains of 1024 bytes after the fixed part of
 * the struct, counted in struct inode pointers.
 */
#define WB_MAX_INODES_PER_ISW  ((1024UL - sizeof(struct inode_switch_wbs_context)) \
                                / sizeof(struct inode *))

static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
static struct workqueue_struct *isw_wq;

/*
 * Attach a bdi_writeback to @inode if it does not have one yet.
 *
 * With cgroup writeback enabled for @inode, the wb is looked up (or
 * created) from a memcg css: the one of @folio when @folio is given,
 * otherwise the one of the current task.  If that yields nothing, or
 * cgroup writeback is not enabled for @inode, the bdi's embedded wb is
 * used instead.
 */
void __inode_attach_wb(struct inode *inode, struct folio *folio)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *wb = NULL;

        if (inode_cgwb_enabled(inode)) {
                struct cgroup_subsys_state *memcg_css;

                if (folio) {
                        memcg_css = mem_cgroup_css_from_folio(folio);
                } else {
                        /* must pin memcg_css, see wb_get_create() */
                        memcg_css = task_get_css(current, memory_cgrp_id);
                }

                wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);

                /* release the pin taken by task_get_css() above */
                if (!folio)
                        css_put(memcg_css);
        }

        if (!wb)
                wb = &bdi->wb;

        /*
         * Several instances of this function may race to update the same
         * inode.  cmpxchg() picks the winner; a loser finds ->i_wb already
         * set and puts the wb it brought along.
         */
        if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
                wb_put(wb);
}

/**
 * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
 * @inode: inode of interest with i_lock held
 * @wb: target bdi_writeback
 *
 * Remove the inode from wb's io lists and, if necessary, put it onto the
 * b_attached list.  Only inodes attached to cgwb's are kept on this list;
 * for the bdi's embedded wb the inode is simply unlinked.  I_SYNC_QUEUED
 * is cleared either way.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
                                        struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        inode->i_state &= ~I_SYNC_QUEUED;

        if (wb == &wb->bdi->wb)
                list_del_init(&inode->i_io_list);
        else
                list_move(&inode->i_io_list, &wb->b_attached);

        wb_io_lists_depopulated(wb);
}

/**
 * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
 * @inode: inode of interest with i_lock held
 *
 * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
 * held on entry and is released on return.  The returned wb is guaranteed
 * to stay @inode's associated wb until its list_lock is released.
 *
 * The lookup is retried until the wb sampled under i_lock is still
 * @inode->i_wb once that wb's list_lock has been taken.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        while (true) {
                struct bdi_writeback *wb = inode_to_wb(inode);

                /*
                 * inode_to_wb() association is protected by both
                 * @inode->i_lock and @wb->list_lock but list_lock nests
                 * outside i_lock.  Drop i_lock and verify that the
                 * association hasn't changed after acquiring list_lock.
                 *
                 * The reference taken here covers the window in which
                 * neither lock is held; it is put again on both exits.
                 */
                wb_get(wb);
                spin_unlock(&inode->i_lock);
                spin_lock(&wb->list_lock);

                /* i_wb may have changed inbetween, can't use inode_to_wb() */
                if (likely(wb == inode->i_wb)) {
                        wb_put(wb);        /* @inode already has ref */
                        return wb;
                }

                /*
                 * The association changed: drop list_lock and the
                 * temporary reference, retake i_lock and start over.
                 */
                spin_unlock(&wb->list_lock);
                wb_put(wb);
                cpu_relax();
                spin_lock(&inode->i_lock);
        }
}

/**
 * inode_to_wb_and_lock_list - determine an inode's wb and lock it
 * @inode: inode of interest
 *
 * Takes @inode->i_lock and then defers to
 * locked_inode_to_wb_and_lock_list(), i.e. the same operation for callers
 * that do not hold @inode->i_lock on entry.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *locked_wb;

        spin_lock(&inode->i_lock);
        locked_wb = locked_inode_to_wb_and_lock_list(inode);

        return locked_wb;
}

/*
 * Context for one wb switch operation, which may cover several inodes.
 * WB_MAX_INODES_PER_ISW is derived from the size of this struct.
 */
struct inode_switch_wbs_context {
        struct rcu_work                work;

        /*
         * Multiple inodes can be switched at once.  The switching procedure
         * consists of two parts, separated by a RCU grace period.  To make
         * sure that the second part is executed for each inode gone through
         * the first part, all inode pointers are placed into a NULL-terminated
         * array embedded into struct inode_switch_wbs_context.  Otherwise
         * an inode could be left in a non-consistent state.
         */
        struct bdi_writeback        *new_wb;
        struct inode                *inodes[];        /* NULL-terminated, see above */
};

/* Take @bdi->wb_switch_rwsem for writing. */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        down_write(&bdi->wb_switch_rwsem);
}

/* Release the write side of @bdi->wb_switch_rwsem. */
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
        up_write(&bdi->wb_switch_rwsem);
}

/*
 * inode_do_switch_wbs - move a single inode from @old_wb to @new_wb
 * @inode: inode being switched
 * @old_wb: wb @inode's stats are currently accounted to
 * @new_wb: wb to associate @inode with
 *
 * Called from inode_switch_wbs_work_fn() with both wbs' list_lock held.
 * Takes @inode->i_lock and the i_pages lock, transfers the WB_RECLAIMABLE
 * and WB_WRITEBACK stats and the writeback_inodes count from @old_wb to
 * @new_wb, points @inode->i_wb at @new_wb and, if @inode sits on an IO
 * list, moves it onto the corresponding list of @new_wb.  I_WB_SWITCH is
 * cleared on every path, including when the switch is skipped.
 *
 * Returns %true if the association was switched, %false if it was skipped
 * because @inode is being freed.
 */
static bool inode_do_switch_wbs(struct inode *inode,
                                struct bdi_writeback *old_wb,
                                struct bdi_writeback *new_wb)
{
        struct address_space *mapping = inode->i_mapping;
        XA_STATE(xas, &mapping->i_pages, 0);
        struct folio *folio;
        bool switched = false;

        spin_lock(&inode->i_lock);
        xa_lock_irq(&mapping->i_pages);

        /*
         * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction
         * path owns the inode and we shouldn't modify ->i_io_list.
         */
        if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE)))
                goto skip_switch;

        trace_inode_switch_wbs(inode, old_wb, new_wb);

        /*
         * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
         * to possibly dirty folios while PAGECACHE_TAG_WRITEBACK points to
         * folios actually under writeback.
         */
        xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_DIRTY) {
                if (folio_test_dirty(folio)) {
                        long nr = folio_nr_pages(folio);
                        wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
                        wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
                }
        }

        /* rewind the iterator for the second pass over the mapping */
        xas_set(&xas, 0);
        xas_for_each_marked(&xas, folio, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) {
                long nr = folio_nr_pages(folio);
                WARN_ON_ONCE(!folio_test_writeback(folio));
                wb_stat_mod(old_wb, WB_WRITEBACK, -nr);
                wb_stat_mod(new_wb, WB_WRITEBACK, nr);
        }

        /* an inode with folios under writeback counts as a writeback inode */
        if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
                atomic_dec(&old_wb->writeback_inodes);
                atomic_inc(&new_wb->writeback_inodes);
        }

        /*
         * Reference for @inode->i_wb.  The caller drops one @old_wb
         * reference for each inode reported as switched.
         */
        wb_get(new_wb);

        /*
         * Transfer to @new_wb's IO list if necessary.  If the @inode is dirty,
         * the specific list @inode was on is ignored and the @inode is put on
         * ->b_dirty which is always correct including from ->b_dirty_time.
         * The transfer preserves @inode->dirtied_when ordering.  If the @inode
         * was clean, it means it was on the b_attached list, so move it onto
         * the b_attached list of @new_wb.
         */
        if (!list_empty(&inode->i_io_list)) {
                inode->i_wb = new_wb;

                if (inode->i_state & I_DIRTY_ALL) {
                        struct inode *pos;

                        /*
                         * Stop at the first entry which isn't newer than
                         * @inode and insert right before it.  If the loop
                         * runs to completion, @pos's i_io_list is the list
                         * head itself and @inode goes to the tail.
                         */
                        list_for_each_entry(pos, &new_wb->b_dirty, i_io_list)
                                if (time_after_eq(inode->dirtied_when,
                                                  pos->dirtied_when))
                                        break;
                        inode_io_list_move_locked(inode, new_wb,
                                                  pos->i_io_list.prev);
                } else {
                        inode_cgwb_move_to_attached(inode, new_wb);
                }
        } else {
                inode->i_wb = new_wb;
        }

        /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
        inode->i_wb_frn_winner = 0;
        inode->i_wb_frn_avg_time = 0;
        inode->i_wb_frn_history = 0;
        switched = true;
skip_switch:
        /*
         * Paired with load_acquire in unlocked_inode_to_wb_begin() and
         * ensures that the new wb is visible if they see !I_WB_SWITCH.
         */
        smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);

        xa_unlock_irq(&mapping->i_pages);
        spin_unlock(&inode->i_lock);

        return switched;
}

/*
 * inode_switch_wbs_work_fn - second half of a wb switch
 * @work: the work_struct embedded in an inode_switch_wbs_context's rcu_work
 *
 * Queued through queue_rcu_work() by inode_switch_wbs() and
 * cleanup_offline_cgwb().  Switches every inode in the context's
 * NULL-terminated array from the wb of the first inode to @isw->new_wb,
 * then releases everything the scheduling side pinned: the inode
 * references, the new_wb reference, the context itself and the
 * isw_nr_in_flight count.
 */
static void inode_switch_wbs_work_fn(struct work_struct *work)
{
        struct inode_switch_wbs_context *isw =
                container_of(to_rcu_work(work), struct inode_switch_wbs_context, work);
        struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]);
        /* all inodes in the array are expected to share this wb, see below */
        struct bdi_writeback *old_wb = isw->inodes[0]->i_wb;
        struct bdi_writeback *new_wb = isw->new_wb;
        unsigned long nr_switched = 0;
        struct inode **inodep;

        /*
         * If @inode switches cgwb membership while sync_inodes_sb() is
         * being issued, sync_inodes_sb() might miss it.  Synchronize.
         */
        down_read(&bdi->wb_switch_rwsem);

        /*
         * By the time control reaches here, RCU grace period has passed
         * since I_WB_SWITCH assertion and all wb stat update transactions
         * between unlocked_inode_to_wb_begin/end() are guaranteed to be
         * synchronizing against the i_pages lock.
         *
         * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock
         * gives us exclusion against all wb related operations on @inode
         * including IO list manipulations and stat updates.
         */
        /* take the two list_locks in ascending address order */
        if (old_wb < new_wb) {
                spin_lock(&old_wb->list_lock);
                spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(&new_wb->list_lock);
                spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
        }

        for (inodep = isw->inodes; *inodep; inodep++) {
                WARN_ON_ONCE((*inodep)->i_wb != old_wb);
                if (inode_do_switch_wbs(*inodep, old_wb, new_wb))
                        nr_switched++;
        }

        spin_unlock(&new_wb->list_lock);
        spin_unlock(&old_wb->list_lock);

        up_read(&bdi->wb_switch_rwsem);

        if (nr_switched) {
                wb_wakeup(new_wb);
                /* one @old_wb reference per inode which moved to @new_wb */
                wb_put_many(old_wb, nr_switched);
        }

        /* pairs with __iget() in inode_prepare_wbs_switch() */
        for (inodep = isw->inodes; *inodep; inodep++)
                iput(*inodep);
        /* drop the new_wb reference held on behalf of the context */
        wb_put(new_wb);
        kfree(isw);
        /* pairs with the atomic_inc() done when the switch was scheduled */
        atomic_dec(&isw_nr_in_flight);
}

/*
 * inode_prepare_wbs_switch - first half of a wb switch for one inode
 * @inode: inode to prepare
 * @new_wb: wb the inode is about to be switched to
 *
 * Marks @inode with I_WB_SWITCH and grabs an inode reference so that the
 * second half of the switch can run later.  Returns %true if @inode was
 * prepared, %false if it must be left alone: it is a DAX inode, its
 * superblock is no longer active, it is already being switched or freed,
 * or it is already associated with @new_wb.
 */
static bool inode_prepare_wbs_switch(struct inode *inode,
                                     struct bdi_writeback *new_wb)
{
        bool eligible;

        /*
         * Paired with smp_mb() in cgroup_writeback_umount().
         * The isw_nr_in_flight increment done by the caller has to be
         * ordered before the SB_ACTIVE check and the inode grab below.
         * Otherwise cgroup_writeback_umount() could observe
         * isw_nr_in_flight as 0 and not flush isw_wq.
         */
        smp_mb();

        if (IS_DAX(inode))
                return false;

        /* while holding I_WB_SWITCH, no one else can update the association */
        spin_lock(&inode->i_lock);
        eligible = (inode->i_sb->s_flags & SB_ACTIVE) &&
                   !(inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE)) &&
                   inode_to_wb(inode) != new_wb;
        if (eligible) {
                inode->i_state |= I_WB_SWITCH;
                __iget(inode);
        }
        spin_unlock(&inode->i_lock);

        return eligible;
}

/**
 * inode_switch_wbs - change the wb association of an inode
 * @inode: target inode
 * @new_wb_id: ID of the new wb
 *
 * Schedule a switch of @inode's wb association to the wb identified by
 * @new_wb_id.  The switch itself happens asynchronously in
 * inode_switch_wbs_work_fn().  Any failure along the way (allocation,
 * lookup of the target, @inode not being eligible) silently abandons the
 * attempt.
 */
static void inode_switch_wbs(struct inode *inode, int new_wb_id)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);
        struct bdi_writeback *target_wb = NULL;
        struct inode_switch_wbs_context *isw;
        struct cgroup_subsys_state *css;

        /*
         * Nothing to do if a switch seems to be in progress already, and
         * avoid queueing a new one if too many are already in flight.
         */
        if ((inode->i_state & I_WB_SWITCH) ||
            atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT)
                return;

        /* room for @inode plus the terminating NULL entry */
        isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC);
        if (!isw)
                return;

        atomic_inc(&isw_nr_in_flight);

        /* look up the memcg css behind @new_wb_id and pin it */
        rcu_read_lock();
        css = css_from_id(new_wb_id, &memory_cgrp_subsys);
        if (css && !css_tryget(css))
                css = NULL;
        rcu_read_unlock();
        if (!css)
                goto out_free;

        /* pin the target wb; the css reference is dropped right after */
        target_wb = wb_get_create(bdi, css, GFP_ATOMIC);
        css_put(css);
        if (!target_wb)
                goto out_free;
        isw->new_wb = target_wb;

        if (!inode_prepare_wbs_switch(inode, target_wb))
                goto out_free;

        isw->inodes[0] = inode;

        /*
         * In addition to synchronizing among switchers, I_WB_SWITCH tells
         * the RCU protected stat update paths to grab the i_page
         * lock so that stat transfer can synchronize against them.
         * Let's continue after I_WB_SWITCH is guaranteed to be visible.
         */
        INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
        queue_rcu_work(isw_wq, &isw->work);
        return;

out_free:
        /* undo whatever was set up before the failure */
        atomic_dec(&isw_nr_in_flight);
        if (target_wb)
                wb_put(target_wb);
        kfree(isw);
}

/*
 * isw_prepare_wbs_switch - collect inodes from an IO list for switching
 * @isw: switch context being filled
 * @list: list of inodes linked through ->i_io_list
 * @nr: in/out index of the next free slot in @isw->inodes
 *
 * Runs inode_prepare_wbs_switch() on each inode on @list and records the
 * ones which were successfully prepared in @isw->inodes.  Returns %true as
 * soon as *@nr reaches WB_MAX_INODES_PER_ISW - 1, %false if the whole list
 * was walked without getting there.
 */
static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw,
                                   struct list_head *list, int *nr)
{
        struct inode *candidate;

        list_for_each_entry(candidate, list, i_io_list) {
                if (inode_prepare_wbs_switch(candidate, isw->new_wb)) {
                        isw->inodes[(*nr)++] = candidate;

                        if (*nr >= WB_MAX_INODES_PER_ISW - 1)
                                return true;
                }
        }

        return false;
}

/**
 * cleanup_offline_cgwb - detach associated inodes
 * @wb: target wb
 *
 * Schedule a switch of the inodes attached to @wb to the wb of the nearest
 * ancestor memcg for which one can be obtained, falling back to the bdi's
 * embedded wb, so that the dying @wb can eventually be released.  Returns
 * %true if not all inodes could be handled in this pass and the function
 * has to be called again.
 */
bool cleanup_offline_cgwb(struct bdi_writeback *wb)
{
        struct inode_switch_wbs_context *isw;
        struct cgroup_subsys_state *ancestor;
        bool restart;
        int nr = 0;

        isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW),
                      GFP_KERNEL);
        if (!isw)
                return false;

        atomic_inc(&isw_nr_in_flight);

        /* walk up the memcg hierarchy until a wb can be obtained */
        ancestor = wb->memcg_css->parent;
        while (ancestor) {
                isw->new_wb = wb_get_create(wb->bdi, ancestor, GFP_KERNEL);
                if (isw->new_wb)
                        break;
                ancestor = ancestor->parent;
        }
        if (unlikely(!isw->new_wb))
                isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */

        /*
         * In addition to the inodes that have completed writeback, also switch
         * cgwbs for those inodes only with dirty timestamps. Otherwise, those
         * inodes won't be written back for a long time when lazytime is
         * enabled, and thus pinning the dying cgwbs. It won't break the
         * bandwidth restrictions, as writeback of inode metadata is not
         * accounted for.
         */
        spin_lock(&wb->list_lock);
        restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr) ||
                  isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr);
        spin_unlock(&wb->list_lock);

        if (nr) {
                /*
                 * In addition to synchronizing among switchers, I_WB_SWITCH
                 * tells the RCU protected stat update paths to grab the
                 * i_page lock so that stat transfer can synchronize against
                 * them.  Let's continue after I_WB_SWITCH is guaranteed to
                 * be visible.
                 */
                INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn);
                queue_rcu_work(isw_wq, &isw->work);
        } else {
                /* no attached inodes? undo the setup and bail out */
                atomic_dec(&isw_nr_in_flight);
                wb_put(isw->new_wb);
                kfree(isw);
        }

        return restart;
}

/**
 * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * Called with @inode->i_lock held right before @inode is written back
 * under the control of @wbc.  Stores @inode's wb and the foreign-detection
 * bookkeeping in @wbc, takes a reference on the wb and releases the i_lock.
 * wbc_detach_inode() should be called once the writeback attempt is over.
 * If cgroup writeback is not enabled for @inode, only the i_lock is
 * released.
 */
static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                struct inode *inode)
        __releases(&inode->i_lock)
{
        struct bdi_writeback *wb;

        if (!inode_cgwb_enabled(inode)) {
                spin_unlock(&inode->i_lock);
                return;
        }

        wb = inode_to_wb(inode);

        wbc->wb = wb;
        wbc->inode = inode;

        /* start a fresh round of foreign writer detection */
        wbc->wb_id = wb->memcg_css->id;
        wbc->wb_lcand_id = inode->i_wb_frn_winner;
        wbc->wb_tcand_id = 0;
        wbc->wb_bytes = 0;
        wbc->wb_lcand_bytes = 0;
        wbc->wb_tcand_bytes = 0;

        /* dropped again in wbc_detach_inode() */
        wb_get(wb);
        spin_unlock(&inode->i_lock);

        /*
         * A dying wb indicates that either the blkcg associated with the
         * memcg changed or the associated memcg is dying.  In the first
         * case, a replacement wb should already be available and we should
         * refresh the wb immediately.  In the second case, trying to
         * refresh will keep failing.
         */
        if (unlikely(wb_dying(wb) && !css_is_dying(wb->memcg_css)))
                inode_switch_wbs(inode, wbc->wb_id);
}

/**
 * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
 * @wbc: writeback_control of interest
 * @inode: target inode
 *
 * This function is to be used by __filemap_fdatawrite_range(), which is an
 * alternative entry point into writeback code, and first ensures @inode is
 * associated with a bdi_writeback and attaches it to @wbc.
 *
 * @inode->i_lock is taken here and released again by
 * wbc_attach_and_unlock_inode() before returning.
 */
void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                struct inode *inode)
{
        spin_lock(&inode->i_lock);
        /* NOTE(review): NULL presumably means "no specific folio" - confirm */
        inode_attach_wb(inode, NULL);
        wbc_attach_and_unlock_inode(wbc, inode);
}
EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);

/**
 * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
 * @wbc: writeback_control of the just finished writeback
 *
 * To be called after a writeback attempt of an inode finishes and undoes
 * wbc_attach_and_unlock_inode().  Can be called under any context.
 *
 * As concurrent write sharing of an inode is expected to be very rare and
 * memcg only tracks page ownership on first-use basis severely confining
 * the usefulness of such sharing, cgroup writeback tracks ownership
 * per-inode.  While the support for concurrent write sharing of an inode
 * is deemed unnecessary, an inode being written to by different cgroups at
 * different points in time is a lot more common, and, more importantly,
 * charging only by first-use can too readily lead to grossly incorrect
 * behaviors (single foreign page can lead to gigabytes of writeback to be
 * incorrectly attributed).
 *
 * To resolve this issue, cgroup writeback detects the majority dirtier of
 * an inode and transfers the ownership to it.  To avoid unnecessary
 * oscillation, the detection mechanism keeps track of history and gives
 * out the switch verdict only if the foreign usage pattern is stable over
 * a certain amount of time and/or writeback attempts.
 *
 * On each writeback attempt, @wbc tries to detect the majority writer
 * using Boyer-Moore majority vote algorithm.  In addition to the byte
 * count from the majority voting, it also counts the bytes written for the
 * current wb and the last round's winner wb (max of last round's current
 * wb, the winner from two rounds ago, and the last round's majority
 * candidate).  Keeping track of the historical winner helps the algorithm
 * to semi-reliably detect the most active writer even when it's not the
 * absolute majority.
 *
 * Once the winner of the round is determined, whether the winner is
 * foreign or not and how much IO time the round consumed is recorded in
 * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
 * over a certain threshold, the switch verdict is given.
 */
void wbc_detach_inode(struct writeback_control *wbc)
{
        struct bdi_writeback *wb = wbc->wb;
        struct inode *inode = wbc->inode;
        unsigned long avg_time, max_bytes, max_time;
        u16 history;
        int max_id;

        /* @wbc isn't attached to a wb, nothing to undo */
        if (!wb)
                return;

        history = inode->i_wb_frn_history;
        avg_time = inode->i_wb_frn_avg_time;

        /*
         * pick the winner of this round; on ties the current wb beats both
         * candidates and the last round's winner beats the majority
         * candidate
         */
        if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
            wbc->wb_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_id;
                max_bytes = wbc->wb_bytes;
        } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
                max_id = wbc->wb_lcand_id;
                max_bytes = wbc->wb_lcand_bytes;
        } else {
                max_id = wbc->wb_tcand_id;
                max_bytes = wbc->wb_tcand_bytes;
        }

        /*
         * Calculate the amount of IO time the winner consumed and fold it
         * into the running average kept per inode.  If the consumed IO
         * time is lower than avg / WB_FRN_TIME_CUT_DIV, ignore it for
         * deciding whether to switch or not.  This is to prevent one-off
         * small dirtiers from skewing the verdict.
         */
        max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
                                wb->avg_write_bandwidth);
        if (avg_time)
                avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
                            (avg_time >> WB_FRN_TIME_AVG_SHIFT);
        else
                avg_time = max_time;        /* immediate catch up on first run */

        if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
                int slots;

                /*
                 * The switch verdict is reached if foreign wb's consume
                 * more than a certain proportion of IO time in a
                 * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
                 * history mask where each bit represents one sixteenth of
                 * the period.  Determine the number of slots to shift into
                 * history from @max_time.
                 */
                slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
                            (unsigned long)WB_FRN_HIST_MAX_SLOTS);
                history <<= slots;
                /* the newly shifted-in slots are set iff the winner is foreign */
                if (wbc->wb_id != max_id)
                        history |= (1U << slots) - 1;

                if (history)
                        trace_inode_foreign_history(inode, wbc, history);

                /*
                 * Switch if the current wb isn't the consistent winner.
                 * If there are multiple closely competing dirtiers, the
                 * inode may switch across them repeatedly over time, which
                 * is okay.  The main goal is avoiding keeping an inode on
                 * the wrong wb for an extended period of time.
                 */
                if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
                        inode_switch_wbs(inode, max_id);
        }

        /*
         * Multiple instances of this function may race to update the
         * following fields but we don't mind occasional inaccuracies.
         */
        inode->i_wb_frn_winner = max_id;
        /* the stored average is capped at U16_MAX */
        inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
        inode->i_wb_frn_history = history;

        /* pairs with wb_get() in wbc_attach_and_unlock_inode() */
        wb_put(wbc->wb);
        wbc->wb = NULL;
}
EXPORT_SYMBOL_GPL(wbc_detach_inode);

/**
 * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
 * @wbc: writeback_control of the writeback in progress
 * @folio: folio being written out
 * @bytes: number of bytes being written out
 *
 * @bytes from @folio are about to be written out during the writeback
 * controlled by @wbc.  Credit them to the memcg owning @folio so that
 * wbc_detach_inode() can later decide whether the inode is mostly dirtied
 * by a foreign cgroup.
 */
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
                              size_t bytes)
{
        struct cgroup_subsys_state *owner_css;
        int owner_id;

        /*
         * pageout() path doesn't attach @wbc to the inode being written
         * out.  This is intentional as we don't want the function to block
         * behind a slow cgroup.  Ultimately, we want pageout() to kick off
         * regular writeback instead of writing things out itself.
         */
        if (!wbc->wb || wbc->no_cgroup_owner)
                return;

        owner_css = mem_cgroup_css_from_folio(folio);
        /* dead cgroups shouldn't contribute to inode ownership arbitration */
        if (!(owner_css->flags & CSS_ONLINE))
                return;

        owner_id = owner_css->id;

        /* written by the cgroup the inode currently belongs to */
        if (owner_id == wbc->wb_id) {
                wbc->wb_bytes += bytes;
                return;
        }

        /* written by the winner of the previous round */
        if (owner_id == wbc->wb_lcand_id)
                wbc->wb_lcand_bytes += bytes;

        /*
         * Boyer-Moore majority vote algorithm: an exhausted candidate is
         * replaced by the current owner, a matching owner adds to the
         * candidate's count and any other owner subtracts from it.
         */
        if (wbc->wb_tcand_bytes == 0)
                wbc->wb_tcand_id = owner_id;

        if (owner_id != wbc->wb_tcand_id) {
                wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
                return;
        }

        wbc->wb_tcand_bytes += bytes;
}
EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);

/**
 * wb_split_bdi_pages - split nr_pages to write according to bandwidth
 * @wb: target bdi_writeback to split @nr_pages to
 * @nr_pages: number of pages to write for the whole bdi
 *
 * Return @wb's share of @nr_pages, scaled by the ratio of @wb's average
 * write bandwidth to the total write bandwidth recorded on @wb->bdi and
 * rounded up.  %LONG_MAX is passed through untouched.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        const unsigned long wb_bw = wb->avg_write_bandwidth;
        const unsigned long bdi_bw =
                atomic_long_read(&wb->bdi->tot_write_bandwidth);

        if (nr_pages == LONG_MAX)
                return LONG_MAX;

        /*
         * This may be called on clean wb's and proportional distribution
         * may not make sense, just use the original @nr_pages in those
         * cases.  In general, we wanna err on the side of writing more.
         */
        if (bdi_bw && wb_bw < bdi_bw)
                return DIV_ROUND_UP_ULL((u64)nr_pages * wb_bw, bdi_bw);

        return nr_pages;
}

/**
 * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
 * @bdi: target backing_dev_info
 * @base_work: wb_writeback_work to issue
 * @skip_if_busy: skip wb's which already have writeback in progress
 *
 * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
 * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
 * distributed to the busy wbs according to each wb's proportion in the
 * total active write bandwidth of @bdi.
 *
 * Each wb normally gets its own allocated copy of @base_work.  If that
 * allocation fails, an on-stack copy is queued instead and waited for
 * before the iteration continues, which is why this function may sleep.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        struct bdi_writeback *last_wb = NULL;
        /*
         * Start from the list head dressed up as an entry so that the
         * _continue iterator below begins with the first real wb.
         */
        struct bdi_writeback *wb = list_entry(&bdi->wb_list,
                                              struct bdi_writeback, bdi_node);

        might_sleep();
restart:
        rcu_read_lock();
        list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION(fallback_work_done, bdi);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;

                /*
                 * The iteration has moved past the wb pinned by the
                 * previous fallback round, release it.
                 */
                if (last_wb) {
                        wb_put(last_wb);
                        last_wb = NULL;
                }

                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
                     list_empty(&wb->b_dirty_time)))
                        continue;
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;

                nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);

                /* can't sleep under rcu_read_lock(), hence GFP_ATOMIC */
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
                if (work) {
                        *work = *base_work;
                        work->nr_pages = nr_pages;
                        work->auto_free = 1;
                        wb_queue_work(wb, work);
                        continue;
                }

                /*
                 * If wb_tryget fails, the wb has been shutdown, skip it.
                 *
                 * Pin @wb so that it stays on @bdi->wb_list.  This allows
                 * continuing iteration from @wb after dropping and
                 * regrabbing rcu read lock.
                 */
                if (!wb_tryget(wb))
                        continue;

                /* alloc failed, execute synchronously using on-stack fallback */
                work = &fallback_work;
                *work = *base_work;
                work->nr_pages = nr_pages;
                work->auto_free = 0;
                work->done = &fallback_work_done;

                wb_queue_work(wb, work);
                last_wb = wb;

                rcu_read_unlock();
                wb_wait_for_completion(&fallback_work_done);
                goto restart;
        }
        rcu_read_unlock();

        /* the last fallback wb may have been the final entry on the list */
        if (last_wb)
                wb_put(last_wb);
}

/**
 * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs
 * @bdi_id: target bdi id
 * @memcg_id: target memcg css id
 * @reason: reason why some writeback work initiated
 * @done: target wb_completion
 *
 * Queue a WB_SYNC_NONE flush on the bdi_writeback identified by @bdi_id
 * and @memcg_id.
 *
 * Returns 0 if the work was queued, -ENOENT if the bdi, the memcg or the
 * wb could not be found, and -ENOMEM if the work item could not be
 * allocated.
 */
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done)
{
        struct cgroup_subsys_state *css;
        struct wb_writeback_work *work;
        struct backing_dev_info *bdi;
        struct bdi_writeback *wb;
        unsigned long nr_dirty;
        int ret = -ENOENT;        /* result of every failed lookup below */

        /* lookup bdi and memcg */
        bdi = bdi_get_by_id(bdi_id);
        if (!bdi)
                return -ENOENT;

        rcu_read_lock();
        css = css_from_id(memcg_id, &memory_cgrp_subsys);
        if (css && !css_tryget(css))
                css = NULL;
        rcu_read_unlock();
        if (!css)
                goto out_bdi_put;

        /*
         * And find the associated wb.  If the wb isn't there already
         * there's nothing to flush, don't create one.
         */
        wb = wb_get_lookup(bdi, css);
        if (!wb)
                goto out_css_put;

        /*
         * The caller is attempting to write out most of
         * the currently dirty pages.  Let's take the current dirty page
         * count and inflate it by 25% which should be large enough to
         * flush out most dirty pages while avoiding getting livelocked by
         * concurrent dirtiers.
         *
         * BTW the memcg stats are flushed periodically and this is best-effort
         * estimation, so some potential error is ok.
         */
        nr_dirty = memcg_page_state(mem_cgroup_from_css(css), NR_FILE_DIRTY);
        nr_dirty = nr_dirty * 10 / 8;

        /* issue the writeback work */
        work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
        if (!work) {
                ret = -ENOMEM;
                goto out_wb_put;
        }

        work->nr_pages = nr_dirty;
        work->sync_mode = WB_SYNC_NONE;
        work->range_cyclic = 1;
        work->reason = reason;
        work->done = done;
        work->auto_free = 1;
        wb_queue_work(wb, work);
        ret = 0;

out_wb_put:
        wb_put(wb);
out_css_put:
        css_put(css);
out_bdi_put:
        bdi_put(bdi);
        return ret;
}

/**
 * cgroup_writeback_umount - flush inode wb switches for umount
 * @sb: target super_block
 *
 * Called when a super_block is about to be destroyed to flush in-flight
 * inode wb switches.  A switch first waits for an RCU grace period and is
 * then executed from a workqueue, so both stages have to be flushed, in
 * that order, for all previously scheduled switches to be finished.  As
 * wb switches are rare and waiting on RCU can take a while, the flushing
 * is done only if switches are actually in flight.
 */
void cgroup_writeback_umount(struct super_block *sb)
{
        if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK))
                return;

        /*
         * SB_ACTIVE should be reliably cleared before checking
         * isw_nr_in_flight, see generic_shutdown_super().
         */
        smp_mb();

        if (!atomic_read(&isw_nr_in_flight))
                return;

        /*
         * Use rcu_barrier() to wait for all pending callbacks to
         * ensure that all in-flight wb switches are in the workqueue.
         */
        rcu_barrier();
        flush_workqueue(isw_wq);
}

/*
 * Allocate isw_wq, the workqueue on which inode wb switches are executed.
 * Returns 0 on success and -ENOMEM if the allocation fails.
 */
static int __init cgroup_writeback_init(void)
{
        isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);

        return isw_wq ? 0 : -ENOMEM;
}
fs_initcall(cgroup_writeback_init);

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * !CONFIG_CGROUP_WRITEBACK stubs: both helpers have empty bodies, so
 * callers can invoke them unconditionally.
 */
static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

/*
 * !CONFIG_CGROUP_WRITEBACK version: there is no attached list to move to,
 * so @inode is only taken off whatever IO list it is on.  I_SYNC_QUEUED is
 * cleared and wb_io_lists_depopulated() is called on @wb.
 *
 * The caller must hold both @wb->list_lock and @inode->i_lock, and @inode
 * is not expected to have I_FREEING set.
 */
static void inode_cgwb_move_to_attached(struct inode *inode,
                                        struct bdi_writeback *wb)
{
        assert_spin_locked(&wb->list_lock);
        assert_spin_locked(&inode->i_lock);
        WARN_ON_ONCE(inode->i_state & I_FREEING);

        inode->i_state &= ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
        wb_io_lists_depopulated(wb);
}

/*
 * !CONFIG_CGROUP_WRITEBACK version.  Entered with @inode->i_lock held:
 * looks up @inode's wb, drops the i_lock, takes the wb's list_lock and
 * returns the wb with that lock held.  Unlike the cgroup writeback
 * version, there is no re-check of the association after the lock is
 * taken.
 */
static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
        __releases(&inode->i_lock)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_unlock(&inode->i_lock);
        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK version: look up @inode's wb and return it with
 * its list_lock held.  @inode->i_lock is not taken at all here.
 */
static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
        __acquires(&wb->list_lock)
{
        struct bdi_writeback *wb = inode_to_wb(inode);

        spin_lock(&wb->list_lock);
        return wb;
}

/*
 * !CONFIG_CGROUP_WRITEBACK version: nothing is split, @nr_pages is
 * returned unchanged and @wb is unused.
 */
static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
{
        return nr_pages;
}

/*
 * !CONFIG_CGROUP_WRITEBACK version: @base_work is queued as is on @bdi's
 * embedded wb, with auto_free cleared, instead of being split up.  Nothing
 * is queued if @skip_if_busy is set and that wb already has writeback in
 * progress.
 */
static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
{
        might_sleep();

        if (skip_if_busy && writeback_in_progress(&bdi->wb))
                return;

        base_work->auto_free = 0;
        wb_queue_work(&bdi->wb, base_work);
}

/*
 * !CONFIG_CGROUP_WRITEBACK version: nothing is recorded in @wbc, only
 * @inode->i_lock, which the caller holds, is released.
 */
static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
                                               struct inode *inode)
        __releases(&inode->i_lock)
{
        spin_unlock(&inode->i_lock);
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Estimate the number of dirty pages: the global NR_FILE_DIRTY count plus
 * the number of potentially dirty inodes, because each inode write can
 * dirty pagecache in the underlying blockdev.
 */
static unsigned long get_nr_dirty_pages(void)
{
        unsigned long nr_dirty = global_node_page_state(NR_FILE_DIRTY);

        nr_dirty += get_nr_dirty_inodes();

        return nr_dirty;
}

/*
 * Request writeback of everything dirty on @wb, recording @reason.
 *
 * Nothing happens when @wb has no dirty IO, or when a start-all request is
 * already flagged in wb->state.
 */
static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
{
        if (!wb_has_dirty_io(wb))
                return;

        /*
         * Every caller wants all dirty pages written.  Some of them (vmscan
         * for instance) may call in at a very high rate, which would
         * otherwise allocate piles of work items and keep the flusher busy
         * just fetching them.  The WB_start_all bit limits us to a single
         * pending/inflight request: test it cheaply first, then claim it
         * atomically.
         */
        if (test_bit(WB_start_all, &wb->state))
                return;
        if (test_and_set_bit(WB_start_all, &wb->state))
                return;

        wb->start_all_reason = reason;
        wb_wakeup(wb);
}

/**
 * wb_start_background_writeback - start background writeback
 * @wb: bdi_writeback to write from
 *
 * Description:
 *   This makes sure WB_SYNC_NONE background writeback happens. When
 *   this function returns, it is only guaranteed that for given wb
 *   some IO is happening if we are over background dirty threshold.
 *   Caller need not hold sb s_umount semaphore.
 */
void wb_start_background_writeback(struct bdi_writeback *wb)
{
        /*
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(wb);
        wb_wakeup(wb);
}

/*
 * Detach @inode from the writeback list it currently sits on.
 *
 * Takes the list_lock of the inode's wb first and inode->i_lock second,
 * clears I_SYNC_QUEUED, unlinks i_io_list, and releases the locks in the
 * reverse order.
 */
void inode_io_list_del(struct inode *inode)
{
        struct bdi_writeback *wb = inode_to_wb_and_lock_list(inode);

        spin_lock(&inode->i_lock);

        inode->i_state = inode->i_state & ~I_SYNC_QUEUED;
        list_del_init(&inode->i_io_list);
        wb_io_lists_depopulated(wb);

        spin_unlock(&inode->i_lock);
        spin_unlock(&wb->list_lock);
}
EXPORT_SYMBOL(inode_io_list_del);

/*
 * Record @inode as under writeback on its superblock by linking i_wb_list
 * onto sb->s_inodes_wb.
 *
 * An unlocked list_empty() test filters out inodes that are already linked;
 * the test is repeated under s_inode_wblist_lock (taken with IRQs saved and
 * disabled) before the list is modified.
 */
void sb_mark_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long irq_flags;

        /* Already on the list: nothing to do, skip the lock. */
        if (!list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, irq_flags);
        if (list_empty(&inode->i_wb_list)) {
                list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb);
                trace_sb_mark_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, irq_flags);
}

/*
 * Stop tracking @inode as under writeback on its superblock by unlinking
 * i_wb_list.
 *
 * An unlocked list_empty() test filters out inodes that are not linked; the
 * test is repeated under s_inode_wblist_lock (taken with IRQs saved and
 * disabled) before the list is modified.
 */
void sb_clear_inode_writeback(struct inode *inode)
{
        struct super_block *sb = inode->i_sb;
        unsigned long irq_flags;

        /* Not on the list: nothing to do, skip the lock. */
        if (list_empty(&inode->i_wb_list))
                return;

        spin_lock_irqsave(&sb->s_inode_wblist_lock, irq_flags);
        if (!list_empty(&inode->i_wb_list)) {
                list_del_init(&inode->i_wb_list);
                trace_sb_clear_inode_writeback(inode);
        }
        spin_unlock_irqrestore(&sb->s_inode_wblist_lock, irq_flags);
}

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb)
{
        assert_spin_locked(&inode->i_lock);

        inode->i_state &= ~I_SYNC_QUEUED;
        /*
         * When the inode is being freed just don't bother with dirty list
         * tracking. Flush worker will ignore this inode anyway and it will
         * trigger assertions in inode_io_list_move_locked().
         */
        if (inode->i_state & I_FREEING) {
                list_del_init(&inode->i_io_list);
                wb_io_lists_depopulated(wb);
                return;
        }
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;

                tail = wb_inode(wb->b_dirty.next);
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
        inode_io_list_move_locked(inode, wb, &wb->b_dirty);
}

/*
 * Convenience wrapper around redirty_tail_locked() for callers that do not
 * hold inode->i_lock: takes the lock, redirties @inode on @wb, drops the lock.
 */
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
        spin_lock(&inode->i_lock);
        redirty_tail_locked(inode, wb);
        spin_unlock(&inode->i_lock);
}

/*
 * Requeue @inode on wb->b_more_io so that it is re-scanned after the b_io
 * list is exhausted.
 */
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
        inode_io_list_move_locked(inode, wb, &wb->b_more_io);
}

/*
 * Finish a writeback pass on @inode: clear I_SYNC, offer the inode to the LRU
 * and wake anybody waiting on the __I_SYNC bit.  Must be called with
 * inode->i_lock held.
 */
static void inode_sync_complete(struct inode *inode)
{
        assert_spin_locked(&inode->i_lock);

        inode->i_state &= ~I_SYNC;
        /* If inode is clean and unused, put it into LRU now... */
        inode_add_lru(inode);
        /* Called with inode->i_lock which ensures memory ordering. */
        inode_wake_up_bit(inode, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
        bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
        /*
         * For inodes being constantly redirtied, dirtied_when can get stuck.
         * It _appears_ to be in the future, but is actually in distant past.
         * This test is necessary to prevent such wrapped-around relative times
         * from permanently stopping the whole bdi writeback.
         */
        ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
        return ret;
}

/*
 * Transfer expired inodes from @delaying_queue to @dispatch_queue and return
 * the number of inodes moved.
 *
 * Inodes are taken from the tail of @delaying_queue for as long as they were
 * not dirtied after @dirtied_before.  Each one gets I_SYNC_QUEUED set under
 * its i_lock.  If the collected inodes span more than one superblock (inodes
 * of the blockdev superblock are left out of that test), they are handed
 * over grouped by superblock; otherwise the batch is spliced over as is.
 */
static int move_expired_inodes(struct list_head *delaying_queue,
                               struct list_head *dispatch_queue,
                               unsigned long dirtied_before)
{
        LIST_HEAD(tmp);
        struct super_block *sb = NULL;
        struct list_head *cur, *saved;
        struct inode *inode;
        int mixed_sbs = 0;
        int moved = 0;

        while (!list_empty(delaying_queue)) {
                inode = wb_inode(delaying_queue->prev);
                if (inode_dirtied_after(inode, dirtied_before))
                        break;

                spin_lock(&inode->i_lock);
                list_move(&inode->i_io_list, &tmp);
                moved++;
                inode->i_state |= I_SYNC_QUEUED;
                spin_unlock(&inode->i_lock);

                /* blockdev inodes do not take part in superblock tracking */
                if (!sb_is_blkdev_sb(inode->i_sb)) {
                        if (sb && sb != inode->i_sb)
                                mixed_sbs = 1;
                        sb = inode->i_sb;
                }
        }

        /* Only one sb seen: splice the batch over in one go and finish. */
        if (!mixed_sbs) {
                list_splice(&tmp, dispatch_queue);
                return moved;
        }

        /*
         * The i_io_list entries travel from 'tmp' to 'dispatch_queue' without
         * inode->i_lock being taken; that would be pointless overhead.  The
         * inodes already carry I_SYNC_QUEUED, so writeback list handling is
         * fully under our control.
         *
         * Each pass picks the sb of the last inode left on 'tmp' and moves
         * every inode of that sb.
         */
        while (!list_empty(&tmp)) {
                sb = wb_inode(tmp.prev)->i_sb;
                list_for_each_prev_safe(cur, saved, &tmp) {
                        inode = wb_inode(cur);
                        if (inode->i_sb == sb)
                                list_move(&inode->i_io_list, dispatch_queue);
                }
        }

        return moved;
}

/*
 * Refill wb->b_io: splice b_more_io onto it, then add the expired inodes of
 * b_dirty and b_dirty_time, eldest first.  Requires wb->list_lock.
 *
 * Before
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    gf         edc     BA
 * After
 *         newly dirtied     b_dirty    b_io    b_more_io
 *         =============>    g          fBAedc
 *                                           |
 *                                           +--> dequeue for IO
 *
 * b_dirty is filtered with @dirtied_before.  b_dirty_time uses the same
 * cut-off for sync work, and "now - dirtytime_expire_interval * HZ"
 * otherwise.
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work,
                     unsigned long dirtied_before)
{
        unsigned long time_expire_jif;
        int moved;

        assert_spin_locked(&wb->list_lock);

        list_splice_init(&wb->b_more_io, &wb->b_io);
        moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before);

        if (work->for_sync)
                time_expire_jif = dirtied_before;
        else
                time_expire_jif = jiffies - dirtytime_expire_interval * HZ;

        moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                     time_expire_jif);
        if (moved)
                wb_io_lists_populated(wb);

        trace_writeback_queue_io(wb, work, dirtied_before, moved);
}

/*
 * Invoke the superblock's ->write_inode() method for @inode, with trace
 * points around the call.
 *
 * Returns the method's result, or 0 when the superblock provides no
 * ->write_inode() or the inode is a bad inode.
 */
static int write_inode(struct inode *inode, struct writeback_control *wbc)
{
        int ret;

        if (!inode->i_sb->s_op->write_inode || is_bad_inode(inode))
                return 0;

        trace_writeback_write_inode_start(inode, wbc);
        ret = inode->i_sb->s_op->write_inode(inode, wbc);
        trace_writeback_write_inode(inode, wbc);

        return ret;
}

/*
 * Wait for writeback on an inode to complete. Called with i_lock held.
 * Caller must make sure inode cannot go away when we drop i_lock.
 *
 * i_lock is released around every schedule() call and is held again when the
 * function returns.
 */
void inode_wait_for_writeback(struct inode *inode)
{
        struct wait_bit_queue_entry wqe;
        struct wait_queue_head *wq_head;

        assert_spin_locked(&inode->i_lock);

        /* Nothing to wait for if no writeback is running on the inode. */
        if (!(inode->i_state & I_SYNC))
                return;

        wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
        for (;;) {
                /* Register on the waitqueue before re-testing I_SYNC. */
                prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
                /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
                if (!(inode->i_state & I_SYNC))
                        break;
                /* Sleep without i_lock; retake it before looking again. */
                spin_unlock(&inode->i_lock);
                schedule();
                spin_lock(&inode->i_lock);
        }
        finish_wait(wq_head, &wqe.wq_entry);
}

/*
 * Sleep until I_SYNC is cleared. This function must be called with i_lock
 * held and drops it. It is aimed for callers not holding any inode reference
 * so once i_lock is dropped, inode can go away.
 *
 * Unlike inode_wait_for_writeback() this sleeps at most once and does not
 * re-check I_SYNC (nor retake i_lock) after waking up.
 */
static void inode_sleep_on_writeback(struct inode *inode)
        __releases(inode->i_lock)
{
        struct wait_bit_queue_entry wqe;
        struct wait_queue_head *wq_head;
        bool sleep;

        assert_spin_locked(&inode->i_lock);

        wq_head = inode_bit_waitqueue(&wqe, inode, __I_SYNC);
        /* Register on the waitqueue before sampling I_SYNC. */
        prepare_to_wait_event(wq_head, &wqe.wq_entry, TASK_UNINTERRUPTIBLE);
        /* Checking I_SYNC with inode->i_lock guarantees memory ordering. */
        sleep = !!(inode->i_state & I_SYNC);
        /* The inode is not touched again once i_lock has been dropped. */
        spin_unlock(&inode->i_lock);
        if (sleep)
                schedule();
        finish_wait(wq_head, &wqe.wq_entry);
}

/*
 * Pick the writeback list @inode belongs on after a writeback attempt, based
 * on its current state and on what @wbc reports about the attempt.  This is
 * where livelock prevention and fairness between inodes are implemented.
 *
 * Only the flusher thread may call this: nobody else processes all inodes on
 * the writeback lists, and requeueing inodes behind the flusher's back can
 * have unexpected consequences.
 *
 * Inodes marked I_FREEING are left untouched.
 */
static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                          struct writeback_control *wbc,
                          unsigned long dirtied_before)
{
        if (inode->i_state & I_FREEING)
                return;

        /*
         * Sync livelock prevention: every inode is tagged and synced in one
         * go.  If it is still dirty it gets redirty_tail()'ed further down;
         * refresh the dirty time so it is not enqueued and synced again.
         */
        if (inode->i_state & I_DIRTY) {
                if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                        inode->dirtied_when = jiffies;
        }

        /*
         * Pages were skipped, so writeback is not making progress due to
         * locked buffers.  Leave this inode alone for now.  Skipped pages on
         * a clean inode are odd but do happen with some filesystems, so that
         * case is handled gracefully as well.
         */
        if (wbc->pages_skipped) {
                if (!(inode->i_state & I_DIRTY_ALL))
                        inode_cgwb_move_to_attached(inode, wb);
                else
                        redirty_tail_locked(inode, wb);
                return;
        }

        /*
         * Not all pages were written back.  nfs_writepages() sometimes bales
         * out without doing anything.
         */
        if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
                if (wbc->nr_to_write > 0 ||
                    inode_dirtied_after(inode, dirtied_before)) {
                        /*
                         * Something other than congestion blocked the
                         * writeback.  Delay the inode for a while so we do
                         * not spin on the CPU (100% iowait) retrying a dirty
                         * page/inode that cannot be written right now.
                         */
                        redirty_tail_locked(inode, wb);
                } else {
                        /* Slice used up. Queue for next turn. */
                        requeue_io(inode, wb);
                }
                return;
        }

        /*
         * Filesystems may dirty the inode while writeback is going on, e.g.
         * delayed allocation during submission or metadata updates after
         * data IO completion.
         */
        if (inode->i_state & I_DIRTY) {
                redirty_tail_locked(inode, wb);
                return;
        }

        /* Only timestamps are dirty: park the inode on b_dirty_time. */
        if (inode->i_state & I_DIRTY_TIME) {
                inode->dirtied_when = jiffies;
                inode_io_list_move_locked(inode, wb, &wb->b_dirty_time);
                inode->i_state &= ~I_SYNC_QUEUED;
                return;
        }

        /* The inode is clean. Remove from writeback lists. */
        inode_cgwb_move_to_attached(inode, wb);
}

/*
 * Write out an inode and its dirty pages (or some of its dirty pages, depending
 * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state.
 *
 * This doesn't remove the inode from the writeback list it is on, except
 * potentially to move it from b_dirty_time to b_dirty due to timestamp
 * expiration.  The caller is otherwise responsible for writeback list handling.
 *
 * The caller is also responsible for setting the I_SYNC flag beforehand and
 * calling inode_sync_complete() to clear it afterwards.
 *
 * Returns the first non-zero result among do_writepages(),
 * filemap_fdatawait() (only called for WB_SYNC_ALL without for_sync) and
 * write_inode(), or 0 if none of them failed.
 */
static int
__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
        struct address_space *mapping = inode->i_mapping;
        /* Snapshot of the budget on entry; only handed to the trace points. */
        long nr_to_write = wbc->nr_to_write;
        unsigned dirty;
        int ret;

        WARN_ON(!(inode->i_state & I_SYNC));

        trace_writeback_single_inode_start(inode, wbc, nr_to_write);

        ret = do_writepages(mapping, wbc);

        /*
         * Make sure to wait on the data before writing out the metadata.
         * This is important for filesystems that modify metadata on data
         * I/O completion. We don't do it for sync(2) writeback because it has a
         * separate, external IO completion path and ->sync_fs for guaranteeing
         * inode metadata is written back correctly.
         */
        if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) {
                int err = filemap_fdatawait(mapping);
                /* Keep the earlier error from do_writepages() if there was one. */
                if (ret == 0)
                        ret = err;
        }

        /*
         * If the inode has dirty timestamps and we need to write them, call
         * mark_inode_dirty_sync() to notify the filesystem about it and to
         * change I_DIRTY_TIME into I_DIRTY_SYNC.
         */
        if ((inode->i_state & I_DIRTY_TIME) &&
            (wbc->sync_mode == WB_SYNC_ALL ||
             time_after(jiffies, inode->dirtied_time_when +
                        dirtytime_expire_interval * HZ))) {
                trace_writeback_lazytime(inode);
                mark_inode_dirty_sync(inode);
        }

        /*
         * Get and clear the dirty flags from i_state.  This needs to be done
         * after calling writepages because some filesystems may redirty the
         * inode during writepages due to delalloc.  It also needs to be done
         * after handling timestamp expiration, as that may dirty the inode too.
         */
        spin_lock(&inode->i_lock);
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~dirty;

        /*
         * Paired with smp_mb() in __mark_inode_dirty().  This allows
         * __mark_inode_dirty() to test i_state without grabbing i_lock -
         * either they see the I_DIRTY bits cleared or we see the dirtied
         * inode.
         *
         * I_DIRTY_PAGES is always cleared together above even if @mapping
         * still has dirty pages.  The flag is reinstated after smp_mb() if
         * necessary.  This guarantees that either __mark_inode_dirty()
         * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
         */
        smp_mb();

        if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                inode->i_state |= I_DIRTY_PAGES;
        else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
                /*
                 * No dirty pages are left in the mapping: drop the netfs
                 * pinning flag and report that through @wbc.
                 */
                if (!(inode->i_state & I_DIRTY_PAGES)) {
                        inode->i_state &= ~I_PINNING_NETFS_WB;
                        wbc->unpinned_netfs_wb = true;
                        dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
                }
        }

        spin_unlock(&inode->i_lock);

        /* Don't write the inode if only I_DIRTY_PAGES was set */
        if (dirty & ~I_DIRTY_PAGES) {
                int err = write_inode(inode, wbc);
                /* As above, the first error encountered wins. */
                if (ret == 0)
                        ret = err;
        }
        /* The flag is only meant to be seen by the write_inode() call above. */
        wbc->unpinned_netfs_wb = false;
        trace_writeback_single_inode(inode, wbc, nr_to_write);
        return ret;
}

/*
 * Write out an inode's dirty data and metadata on-demand, i.e. separately from
 * the regular batched writeback done by the flusher threads in
 * writeback_sb_inodes().  @wbc controls various aspects of the write, such as
 * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE).
 *
 * To prevent the inode from going away, either the caller must have a reference
 * to the inode, or the inode must have I_WILL_FREE or I_FREEING set.
 *
 * Returns 0 when no writeback was attempted (the inode was already under
 * writeback for a non-WB_SYNC_ALL request, or there was nothing to write or
 * wait for), otherwise the result of __writeback_single_inode().
 */
static int writeback_single_inode(struct inode *inode,
                                  struct writeback_control *wbc)
{
        struct bdi_writeback *wb;
        int ret = 0;

        spin_lock(&inode->i_lock);
        /* Sanity-check the lifetime rules stated above. */
        if (!atomic_read(&inode->i_count))
                WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
        else
                WARN_ON(inode->i_state & I_WILL_FREE);

        if (inode->i_state & I_SYNC) {
                /*
                 * Writeback is already running on the inode.  For WB_SYNC_NONE,
                 * that's enough and we can just return.  For WB_SYNC_ALL, we
                 * must wait for the existing writeback to complete, then do
                 * writeback again if there's anything left.
                 */
                if (wbc->sync_mode != WB_SYNC_ALL)
                        goto out;
                inode_wait_for_writeback(inode);
        }
        WARN_ON(inode->i_state & I_SYNC);
        /*
         * If the inode is already fully clean, then there's nothing to do.
         *
         * For data-integrity syncs we also need to check whether any pages are
         * still under writeback, e.g. due to prior WB_SYNC_NONE writeback.  If
         * there are any such pages, we'll need to wait for them.
         */
        if (!(inode->i_state & I_DIRTY_ALL) &&
            (wbc->sync_mode != WB_SYNC_ALL ||
             !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
                goto out;
        /* Claim the inode for writeback; this call also drops i_lock. */
        inode->i_state |= I_SYNC;
        wbc_attach_and_unlock_inode(wbc, inode);

        ret = __writeback_single_inode(inode, wbc);

        wbc_detach_inode(wbc);

        /* Lock order: the wb's list_lock first, then i_lock. */
        wb = inode_to_wb_and_lock_list(inode);
        spin_lock(&inode->i_lock);
        /*
         * If the inode is freeing, its i_io_list shouldn't be updated
         * as it can be finally deleted at this moment.
         */
        if (!(inode->i_state & I_FREEING)) {
                /*
                 * If the inode is now fully clean, then it can be safely
                 * removed from its writeback list (if any). Otherwise the
                 * flusher threads are responsible for the writeback lists.
                 */
                if (!(inode->i_state & I_DIRTY_ALL))
                        inode_cgwb_move_to_attached(inode, wb);
                else if (!(inode->i_state & I_SYNC_QUEUED)) {
                        if ((inode->i_state & I_DIRTY))
                                redirty_tail_locked(inode, wb);
                        else if (inode->i_state & I_DIRTY_TIME) {
                                inode->dirtied_when = jiffies;
                                inode_io_list_move_locked(inode,
                                                          wb,
                                                          &wb->b_dirty_time);
                        }
                }
        }

        /* list_lock goes first; I_SYNC is cleared while i_lock is still held. */
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
out:
        spin_unlock(&inode->i_lock);
        return ret;
}

/*
 * Compute how many pages a single inode may write in one go for @work.
 *
 * Returns LONG_MAX for WB_SYNC_ALL or tagged_writepages work.  Otherwise the
 * value is the minimum of half of wb->avg_write_bandwidth, the global dirty
 * limit divided by DIRTY_SCOPE, and work->nr_pages; MIN_WRITEBACK_PAGES is
 * then added and the sum rounded down to a multiple of MIN_WRITEBACK_PAGES.
 */
static long writeback_chunk_size(struct bdi_writeback *wb,
                                 struct wb_writeback_work *work)
{
        long pages;

        /*
         * WB_SYNC_ALL avoids livelocks by syncing dirty inodes/pages in one
         * big loop.  Using wbc.nr_to_write=LONG_MAX here means
         * writeback_inodes_wb() does not have to be entered more than once.
         *
         * The intended call sequence for WB_SYNC_ALL writeback is:
         *
         *      wb_writeback()
         *          writeback_sb_inodes()       <== called only once
         *              write_cache_pages()     <== called once for each inode
         *                   (quickly) tag currently dirty pages
         *                   (maybe slowly) sync all tagged pages
         */
        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
                return LONG_MAX;

        pages = min(wb->avg_write_bandwidth / 2,
                    global_wb_domain.dirty_limit / DIRTY_SCOPE);
        pages = min(pages, work->nr_pages);

        return round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES);
}

/*
 * Write a portion of b_io inodes which belong to @sb.
 *
 * Return the number of pages and/or inodes written.
 *
 * NOTE! This is called with wb->list_lock held, and will
 * unlock and relock that for each inode it ends up doing
 * IO for.
 *
 * The loop ends when b_io is empty, when an inode of a different superblock
 * is found at the tail of b_io (unless work->sb is set, in which case such
 * inodes are redirtied and skipped), or, once some progress was made, when
 * more than HZ / 10 jiffies have passed or work->nr_pages is used up.
 */
static long writeback_sb_inodes(struct super_block *sb,
                                struct bdi_writeback *wb,
                                struct wb_writeback_work *work)
{
        /* Per-inode control block, seeded from @work and covering the whole file. */
        struct writeback_control wbc = {
                .sync_mode                = work->sync_mode,
                .tagged_writepages        = work->tagged_writepages,
                .for_kupdate                = work->for_kupdate,
                .for_background                = work->for_background,
                .for_sync                = work->for_sync,
                .range_cyclic                = work->range_cyclic,
                .range_start                = 0,
                .range_end                = LLONG_MAX,
        };
        unsigned long start_time = jiffies;
        long write_chunk;
        long total_wrote = 0;  /* count both pages and inodes */
        unsigned long dirtied_before = jiffies;

        /*
         * kupdate work only considers inodes older than the expire interval.
         * NOTE(review): the "* 10" presumably converts centiseconds to
         * milliseconds - confirm against dirty_expire_interval's definition.
         */
        if (work->for_kupdate)
                dirtied_before = jiffies -
                        msecs_to_jiffies(dirty_expire_interval * 10);

        while (!list_empty(&wb->b_io)) {
                /* Inodes are consumed from the tail of b_io. */
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct bdi_writeback *tmp_wb;
                long wrote;

                if (inode->i_sb != sb) {
                        if (work->sb) {
                                /*
                                 * We only want to write back data for this
                                 * superblock, move all inodes not belonging
                                 * to it back onto the dirty list.
                                 */
                                redirty_tail(inode, wb);
                                continue;
                        }

                        /*
                         * The inode belongs to a different superblock.
                         * Bounce back to the caller to unpin this and
                         * pin the next superblock.
                         */
                        break;
                }

                /*
                 * Don't bother with new inodes or inodes being freed, first
                 * kind does not need periodic writeout yet, and for the latter
                 * kind writeout is handled by the freer.
                 */
                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
                        redirty_tail_locked(inode, wb);
                        spin_unlock(&inode->i_lock);
                        continue;
                }
                if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
                        /*
                         * If this inode is locked for writeback and we are not
                         * doing writeback-for-data-integrity, move it to
                         * b_more_io so that writeback can proceed with the
                         * other inodes on s_io.
                         *
                         * We'll have another go at writing back this inode
                         * when we completed a full scan of b_io.
                         */
                        requeue_io(inode, wb);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_sb_inodes_requeue(inode);
                        continue;
                }
                /* From here on only i_lock is held until list_lock is retaken. */
                spin_unlock(&wb->list_lock);

                /*
                 * We already requeued the inode if it had I_SYNC set and we
                 * are doing WB_SYNC_NONE writeback. So this catches only the
                 * WB_SYNC_ALL case.
                 */
                if (inode->i_state & I_SYNC) {
                        /* Wait for I_SYNC. This function drops i_lock... */
                        inode_sleep_on_writeback(inode);
                        /* Inode may be gone, start again */
                        spin_lock(&wb->list_lock);
                        continue;
                }
                /* Claim the inode; the call below also releases i_lock. */
                inode->i_state |= I_SYNC;
                wbc_attach_and_unlock_inode(&wbc, inode);

                write_chunk = writeback_chunk_size(wb, work);
                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;

                /*
                 * We use I_SYNC to pin the inode in memory. While it is set
                 * evict_inode() will wait so the inode cannot be freed.
                 */
                __writeback_single_inode(inode, &wbc);

                wbc_detach_inode(&wbc);
                /* Charge everything consumed from the chunk to the work budget. */
                work->nr_pages -= write_chunk - wbc.nr_to_write;
                /* Skipped pages do not count as progress; never go negative. */
                wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
                wrote = wrote < 0 ? 0 : wrote;
                total_wrote += wrote;

                if (need_resched()) {
                        /*
                         * We're trying to balance between building up a nice
                         * long list of IOs to improve our merge rate, and
                         * getting those IOs out quickly for anyone throttling
                         * in balance_dirty_pages().  cond_resched() doesn't
                         * unplug, so get our IOs out the door before we
                         * give up the CPU.
                         */
                        blk_flush_plug(current->plug, false);
                        cond_resched();
                }

                /*
                 * Requeue @inode if still dirty.  Be careful as @inode may
                 * have been switched to another wb in the meantime.
                 */
                tmp_wb = inode_to_wb_and_lock_list(inode);
                spin_lock(&inode->i_lock);
                /* An inode that ended up fully clean counts as one unit written. */
                if (!(inode->i_state & I_DIRTY_ALL))
                        total_wrote++;
                requeue_inode(inode, tmp_wb, &wbc, dirtied_before);
                inode_sync_complete(inode);
                spin_unlock(&inode->i_lock);

                /* Get back to holding the list_lock of the wb we were called with. */
                if (unlikely(tmp_wb != wb)) {
                        spin_unlock(&tmp_wb->list_lock);
                        spin_lock(&wb->list_lock);
                }

                /*
                 * bail out to wb_writeback() often enough to check
                 * background threshold and other termination conditions.
                 */
                if (total_wrote) {
                        if (time_is_before_jiffies(start_time + HZ / 10UL))
                                break;
                        if (work->nr_pages <= 0)
                                break;
                }
        }
        return total_wrote;
}

/*
 * Work through wb->b_io one superblock at a time.
 *
 * The superblock of the inode at the tail of b_io is try-locked (shared) and
 * handed to writeback_sb_inodes().  Once something has been written, the
 * loop stops when more than HZ / 10 jiffies have elapsed since entry or when
 * work->nr_pages is used up.  Inodes that were not written stay on b_io.
 *
 * Returns the accumulated result of the writeback_sb_inodes() calls.
 */
static long __writeback_inodes_wb(struct bdi_writeback *wb,
                                  struct wb_writeback_work *work)
{
        unsigned long start_time = jiffies;
        long wrote = 0;

        while (!list_empty(&wb->b_io)) {
                struct inode *inode = wb_inode(wb->b_io.prev);
                struct super_block *sb = inode->i_sb;

                /*
                 * super_trylock_shared() can keep failing while somebody else
                 * holds s_umount.  Use redirty_tail() rather than
                 * requeue_io() so that we do not busy-retry the inode/sb.
                 */
                if (!super_trylock_shared(sb)) {
                        redirty_tail(inode, wb);
                        continue;
                }

                wrote += writeback_sb_inodes(sb, wb, work);
                up_read(&sb->s_umount);

                /* same tests as at the end of writeback_sb_inodes() */
                if (!wrote)
                        continue;
                if (time_is_before_jiffies(start_time + HZ / 10UL) ||
                    work->nr_pages <= 0)
                        break;
        }

        return wrote;
}

/*
 * Run one WB_SYNC_NONE, range-cyclic writeback pass over @wb with a budget of
 * @nr_pages pages, tagged with @reason.
 *
 * Returns the part of the budget that was consumed, i.e. @nr_pages minus what
 * is left in the on-stack work item afterwards.
 */
static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
                                enum wb_reason reason)
{
        struct wb_writeback_work work = {
                .nr_pages        = nr_pages,
                .sync_mode        = WB_SYNC_NONE,
                .range_cyclic        = 1,
                .reason                = reason,
        };
        struct blk_plug plug;

        blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
        /* Refill b_io only once the previous batch has been used up. */
        if (list_empty(&wb->b_io))
                queue_io(wb, &work, jiffies);
        __writeback_inodes_wb(wb, &work);
        spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);

        return nr_pages - work.nr_pages;
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * dirtied_before takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 *
 * @wb:   the writeback context whose dirty lists are processed
 * @work: the work description; work->nr_pages is the remaining page budget
 *
 * The whole loop runs inside a block plug.  Returns the difference between
 * work->nr_pages on entry and on exit.
 */
static long wb_writeback(struct bdi_writeback *wb,
                         struct wb_writeback_work *work)
{
        long nr_pages = work->nr_pages;
        unsigned long dirtied_before = jiffies;
        struct inode *inode;
        long progress;
        struct blk_plug plug;
        /* Set once this invocation has refilled b_io through queue_io(). */
        bool queued = false;

        blk_start_plug(&plug);
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
                 */
                if (work->nr_pages <= 0)
                        break;

                /*
                 * Background writeout and kupdate-style writeback may
                 * run forever. Stop them if there is other work to do
                 * so that e.g. sync can proceed. They'll be restarted
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
                    !list_empty(&wb->work_list))
                        break;

                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
                if (work->for_background && !wb_over_bg_thresh(wb))
                        break;


                spin_lock(&wb->list_lock);

                trace_writeback_start(wb, work);
                if (list_empty(&wb->b_io)) {
                        /*
                         * Kupdate and background works are special and we want
                         * to include all inodes that need writing. Livelock
                         * avoidance is handled by these works yielding to any
                         * other work so we are safe.
                         */
                        if (work->for_kupdate) {
                                /*
                                 * dirty_expire_interval * 10 is handed to
                                 * msecs_to_jiffies(), i.e. the tunable is
                                 * treated as a count of 10ms units here.
                                 */
                                dirtied_before = jiffies -
                                        msecs_to_jiffies(dirty_expire_interval *
                                                         10);
                        } else if (work->for_background)
                                dirtied_before = jiffies;

                        queue_io(wb, work, dirtied_before);
                        queued = true;
                }
                /* A work item bound to one superblock only touches that sb. */
                if (work->sb)
                        progress = writeback_sb_inodes(work->sb, wb, work);
                else
                        progress = __writeback_inodes_wb(wb, work);
                trace_writeback_written(wb, work);

                /*
                 * Did we write something? Try for more
                 *
                 * Dirty inodes are moved to b_io for writeback in batches.
                 * The completion of the current batch does not necessarily
                 * mean the overall work is done. So we keep looping as long
                 * as we made some progress on cleaning pages or inodes.
                 *
                 * We also loop again when this invocation has not yet
                 * refilled b_io itself (!queued).
                 */
                if (progress || !queued) {
                        spin_unlock(&wb->list_lock);
                        continue;
                }

                /*
                 * No more inodes for IO, bail
                 */
                if (list_empty(&wb->b_more_io)) {
                        spin_unlock(&wb->list_lock);
                        break;
                }

                /*
                 * Nothing written. Wait for some inode to
                 * become available for writeback. Otherwise
                 * we'll just busyloop.
                 */
                trace_writeback_wait(wb, work);
                inode = wb_inode(wb->b_more_io.prev);
                /*
                 * Take i_lock before releasing list_lock so the two critical
                 * sections overlap while we hand over to the inode.
                 */
                spin_lock(&inode->i_lock);
                spin_unlock(&wb->list_lock);
                /* This function drops i_lock... */
                inode_sleep_on_writeback(inode);
        }
        blk_finish_plug(&plug);

        return nr_pages - work->nr_pages;
}

/*
 * Detach and return the work item at the head of @wb's work_list, or NULL if
 * the list is empty.  The list is manipulated under wb->work_lock with
 * interrupts disabled.
 */
static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
{
        struct wb_writeback_work *head = NULL;

        spin_lock_irq(&wb->work_lock);
        if (list_empty(&wb->work_list))
                goto unlock;

        head = list_entry(wb->work_list.next, struct wb_writeback_work, list);
        list_del_init(&head->list);
unlock:
        spin_unlock_irq(&wb->work_lock);
        return head;
}

static long wb_check_background_flush(struct bdi_writeback *wb)
{
        if (wb_over_bg_thresh(wb)) {

                struct wb_writeback_work work = {
                        .nr_pages        = LONG_MAX,
                        .sync_mode        = WB_SYNC_NONE,
                        .for_background        = 1,
                        .range_cyclic        = 1,
                        .reason                = WB_REASON_BACKGROUND,
                };

                return wb_writeback(wb, &work);
        }

        return 0;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
        unsigned long expired;
        long nr_pages;

        /*
         * When set to zero, disable periodic writeback
         */
        if (!dirty_writeback_interval)
                return 0;

        expired = wb->last_old_flush +
                        msecs_to_jiffies(dirty_writeback_interval * 10);
        if (time_before(jiffies, expired))
                return 0;

        wb->last_old_flush = jiffies;
        nr_pages = get_nr_dirty_pages();

        if (nr_pages) {
                struct wb_writeback_work work = {
                        .nr_pages        = nr_pages,
                        .sync_mode        = WB_SYNC_NONE,
                        .for_kupdate        = 1,
                        .range_cyclic        = 1,
                        .reason                = WB_REASON_PERIODIC,
                };

                return wb_writeback(wb, &work);
        }

        return 0;
}

static long wb_check_start_all(struct bdi_writeback *wb)
{
        long nr_pages;

        if (!test_bit(WB_start_all, &wb->state))
                return 0;

        nr_pages = get_nr_dirty_pages();
        if (nr_pages) {
                struct wb_writeback_work work = {
                        .nr_pages        = wb_split_bdi_pages(wb, nr_pages),
                        .sync_mode        = WB_SYNC_NONE,
                        .range_cyclic        = 1,
                        .reason                = wb->start_all_reason,
                };

                nr_pages = wb_writeback(wb, &work);
        }

        clear_bit(WB_start_all, &wb->state);
        return nr_pages;
}


/*
 * Drain @wb's work list and then run the implicit checks, in this order:
 * flush-everything request, periodic (kupdate-style) flush, background
 * flush.  WB_writeback_running is set in wb->state for the duration.
 *
 * Returns the sum of what every step reported.
 */
static long wb_do_writeback(struct bdi_writeback *wb)
{
        struct wb_writeback_work *item;
        long total = 0;

        set_bit(WB_writeback_running, &wb->state);

        /* Explicitly queued work items first. */
        for (item = get_next_work_item(wb); item != NULL;
             item = get_next_work_item(wb)) {
                trace_writeback_exec(wb, item);
                total += wb_writeback(wb, item);
                finish_writeback_work(item);
        }

        /* Flush-everything request. */
        total += wb_check_start_all(wb);

        /* Periodic writeback, kupdated() style. */
        total += wb_check_old_data_flush(wb);

        /* Background writeback. */
        total += wb_check_background_flush(wb);

        clear_bit(WB_writeback_running, &wb->state);

        return total;
}

/*
 * Work function behind bdi_writeback::dwork: writes back dirty data for the
 * device backed by this bdi, and re-arms itself either immediately (more
 * queued work) or delayed (dirty IO left and periodic writeback enabled).
 */
void wb_workfn(struct work_struct *work)
{
        struct delayed_work *dw = to_delayed_work(work);
        struct bdi_writeback *wb = container_of(dw, struct bdi_writeback,
                                                dwork);
        long nr_written;

        set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));

        if (unlikely(current_is_workqueue_rescuer() &&
                     test_bit(WB_registered, &wb->state))) {
                /*
                 * bdi_wq is short of workers and we are on the emergency
                 * worker.  Do a bounded amount of work (1024 pages) so we
                 * do not hog it.
                 */
                nr_written = writeback_inodes_wb(wb, 1024,
                                                 WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(nr_written);
        } else {
                /*
                 * Normal path: keep going until work_list is empty.  This
                 * branch is also used on the rescuer when @wb is no longer
                 * registered, because work_list still has to be drained.
                 */
                do {
                        nr_written = wb_do_writeback(wb);
                        trace_writeback_pages_written(nr_written);
                } while (!list_empty(&wb->work_list));
        }

        if (!list_empty(&wb->work_list)) {
                wb_wakeup(wb);
                return;
        }

        if (wb_has_dirty_io(wb) && dirty_writeback_interval)
                wb_wakeup_delayed(wb);
}

/*
 * Ask every wb of @bdi to start writeback for @reason, provided the bdi has
 * dirty IO at all.  bdi->wb_list is walked with list_for_each_entry_rcu();
 * the callers in this file wrap the call in rcu_read_lock().
 */
static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                         enum wb_reason reason)
{
        struct bdi_writeback *cur;

        if (bdi_has_dirty_io(bdi)) {
                list_for_each_entry_rcu(cur, &bdi->wb_list, bdi_node)
                        wb_start_writeback(cur, reason);
        }
}

/*
 * Start writeback of dirty pages on a single @bdi for @reason.  Thin wrapper
 * that supplies the RCU read-side critical section around
 * __wakeup_flusher_threads_bdi().
 */
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason)
{
        rcu_read_lock();
        __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wakeup the flusher threads to start writeback of all currently dirty pages
 */
void wakeup_flusher_threads(enum wb_reason reason)
{
        struct backing_dev_info *bdi;

        /*
         * If we are expecting writeback progress we must submit plugged IO.
         */
        blk_flush_plug(current->plug, true);

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                __wakeup_flusher_threads_bdi(bdi, reason);
        rcu_read_unlock();
}

/*
 * Wake up bdi's periodically to make sure dirtytime inodes get
 * written back periodically.  We deliberately do *not* check the
 * b_dirtytime list in wb_has_dirty_io(), since this would cause the
 * kernel to be constantly waking up once there are any dirtytime
 * inodes on the system.  So instead we define a separate delayed work
 * function which gets called much more rarely.  (By default, only
 * once every 12 hours.)
 *
 * If there is any other write activity going on in the file system,
 * this function won't be necessary.  But if the only thing that has
 * happened on the file system is a dirtytime inode caused by an atime
 * update, we need this infrastructure below to make sure that inode
 * eventually gets pushed out to disk.
 *
 * The work re-arms itself after every run.  The sysctl that controls
 * dirtytime_expire_interval accepts 0, and re-arming with a zero delay
 * would make the work run back-to-back forever, so a zero interval means
 * "do not re-arm".  The sysctl handler kicks the work again when the
 * value is changed.
 */
static void wakeup_dirtytime_writeback(struct work_struct *w);
static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);

static void wakeup_dirtytime_writeback(struct work_struct *w)
{
        struct backing_dev_info *bdi;

        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;

                /* Only wake wbs that actually hold dirtytime inodes. */
                list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        if (!list_empty(&wb->b_dirty_time))
                                wb_wakeup(wb);
        }
        rcu_read_unlock();

        /* Interval 0: periodic dirtytime writeback is off, do not re-arm. */
        if (dirtytime_expire_interval)
                schedule_delayed_work(&dirtytime_work,
                                      dirtytime_expire_interval * HZ);
}

/*
 * sysctl handler for dirtytime_expire_seconds.
 *
 * Parsing and range checking are delegated to proc_dointvec_minmax().  After
 * a successful write the delayed work is brought in line with the new value:
 *  - non-zero: run dirtytime_work right away, which also re-arms it with the
 *    new interval;
 *  - zero: cancel dirtytime_work, so that it does not run (and reschedule
 *    itself with a zero delay) while the feature is meant to be off.
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int dirtytime_interval_handler(const struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret || !write)
                return ret;

        if (dirtytime_expire_interval)
                mod_delayed_work(system_wq, &dirtytime_work, 0);
        else
                cancel_delayed_work_sync(&dirtytime_work);

        return ret;
}

/*
 * sysctl entries owned by this file; registered under "vm" by
 * start_dirtytime_writeback().
 *
 * dirtytime_expire_seconds exposes dirtytime_expire_interval.  Accesses go
 * through dirtytime_interval_handler(), which uses proc_dointvec_minmax();
 * extra1 = SYSCTL_ZERO is the lower bound handed to it, so 0 is an accepted
 * value.  No upper bound (extra2) is set here.
 */
static const struct ctl_table vm_fs_writeback_table[] = {
        {
                .procname        = "dirtytime_expire_seconds",
                .data                = &dirtytime_expire_interval,
                .maxlen                = sizeof(dirtytime_expire_interval),
                .mode                = 0644,
                .proc_handler        = dirtytime_interval_handler,
                .extra1                = SYSCTL_ZERO,
        },
};

/*
 * Boot-time setup for periodic dirtytime writeback: arm the delayed work for
 * the first time and register the "vm" sysctl table of this file.
 *
 * The work is only armed for a non-zero interval; a zero delay would make it
 * fire immediately instead of leaving the feature off.  Always returns 0.
 */
static int __init start_dirtytime_writeback(void)
{
        if (dirtytime_expire_interval)
                schedule_delayed_work(&dirtytime_work,
                                      dirtytime_expire_interval * HZ);
        register_sysctl_init("vm", vm_fs_writeback_table);
        return 0;
}
__initcall(start_dirtytime_writeback);

/**
 * __mark_inode_dirty -        internal function to mark an inode dirty
 *
 * @inode: inode to mark
 * @flags: what kind of dirty, e.g. I_DIRTY_SYNC.  This can be a combination of
 *           multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined
 *           with I_DIRTY_PAGES.
 *
 * Mark an inode as dirty.  We notify the filesystem, then update the inode's
 * dirty flags.  Then, if needed we add the inode to the appropriate dirty list.
 *
 * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync()
 * instead of calling this directly.
 *
 * CAREFUL!  We only add the inode to the dirty list if it is hashed or if it
 * refers to a blockdev.  Unhashed inodes will never be added to the dirty list
 * even if they are later hashed, as they will have been marked dirty already.
 *
 * In short, ensure you hash any inodes _before_ you start marking them dirty.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
        struct super_block *sb = inode->i_sb;
        int dirtytime = 0;
        /* Non-NULL only while this function holds wb->list_lock. */
        struct bdi_writeback *wb = NULL;

        trace_writeback_mark_inode_dirty(inode, flags);

        if (flags & I_DIRTY_INODE) {
                /*
                 * Inode timestamp update will piggyback on this dirtying.
                 * We tell ->dirty_inode callback that timestamps need to
                 * be updated by setting I_DIRTY_TIME in flags.
                 */
                if (inode->i_state & I_DIRTY_TIME) {
                        /* Lockless test above, re-checked under i_lock. */
                        spin_lock(&inode->i_lock);
                        if (inode->i_state & I_DIRTY_TIME) {
                                inode->i_state &= ~I_DIRTY_TIME;
                                flags |= I_DIRTY_TIME;
                        }
                        spin_unlock(&inode->i_lock);
                }

                /*
                 * Notify the filesystem about the inode being dirtied, so that
                 * (if needed) it can update on-disk fields and journal the
                 * inode.  This is only needed when the inode itself is being
                 * dirtied now.  I.e. it's only needed for I_DIRTY_INODE, not
                 * for just I_DIRTY_PAGES or I_DIRTY_TIME.
                 */
                trace_writeback_dirty_inode_start(inode, flags);
                if (sb->s_op->dirty_inode)
                        sb->s_op->dirty_inode(inode,
                                flags & (I_DIRTY_INODE | I_DIRTY_TIME));
                trace_writeback_dirty_inode(inode, flags);

                /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */
                flags &= ~I_DIRTY_TIME;
        } else {
                /*
                 * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing.
                 * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME
                 * in one call to __mark_inode_dirty().)
                 */
                dirtytime = flags & I_DIRTY_TIME;
                WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME);
        }

        /*
         * Paired with smp_mb() in __writeback_single_inode() for the
         * following lockless i_state test.  See there for details.
         */
        smp_mb();

        /* Fast path: every requested flag is already set, nothing to do. */
        if ((inode->i_state & flags) == flags)
                return;

        spin_lock(&inode->i_lock);
        if ((inode->i_state & flags) != flags) {
                /* Dirty state before this call; decides list placement below. */
                const int was_dirty = inode->i_state & I_DIRTY;

                inode_attach_wb(inode, NULL);

                inode->i_state |= flags;

                /*
                 * Grab inode's wb early because it requires dropping i_lock and we
                 * need to make sure following checks happen atomically with dirty
                 * list handling so that we don't move inodes under flush worker's
                 * hands.
                 *
                 * After the call wb->list_lock is held (it is released on
                 * every exit path below) and i_lock has to be re-taken.
                 */
                if (!was_dirty) {
                        wb = locked_inode_to_wb_and_lock_list(inode);
                        spin_lock(&inode->i_lock);
                }

                /*
                 * If the inode is queued for writeback by flush worker, just
                 * update its dirty state. Once the flush worker is done with
                 * the inode it will place it on the appropriate superblock
                 * list, based upon its state.
                 */
                if (inode->i_state & I_SYNC_QUEUED)
                        goto out_unlock;

                /*
                 * Only add valid (hashed) inodes to the superblock's
                 * dirty list.  Add blockdev inodes as well.
                 */
                if (!S_ISBLK(inode->i_mode)) {
                        if (inode_unhashed(inode))
                                goto out_unlock;
                }
                if (inode->i_state & I_FREEING)
                        goto out_unlock;

                /*
                 * If the inode was already on b_dirty/b_io/b_more_io, don't
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
                        struct list_head *dirty_list;
                        bool wakeup_bdi = false;

                        inode->dirtied_when = jiffies;
                        if (dirtytime)
                                inode->dirtied_time_when = jiffies;

                        /*
                         * Any I_DIRTY bit puts the inode on b_dirty; an inode
                         * without one (timestamp-only dirtying) goes to
                         * b_dirty_time.
                         */
                        if (inode->i_state & I_DIRTY)
                                dirty_list = &wb->b_dirty;
                        else
                                dirty_list = &wb->b_dirty_time;

                        wakeup_bdi = inode_io_list_move_locked(inode, wb,
                                                               dirty_list);

                        spin_unlock(&wb->list_lock);
                        spin_unlock(&inode->i_lock);
                        trace_writeback_dirty_inode_enqueue(inode);

                        /*
                         * If this is the first dirty inode for this bdi,
                         * we have to wake-up the corresponding bdi thread
                         * to make sure background write-back happens
                         * later.
                         */
                        if (wakeup_bdi &&
                            (wb->bdi->capabilities & BDI_CAP_WRITEBACK))
                                wb_wakeup_delayed(wb);
                        return;
                }
        }
out_unlock:
        /* list_lock is only held when wb was looked up above. */
        if (wb)
                spin_unlock(&wb->list_lock);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * Wait for writeback to finish on the inodes of @sb that are on the
 * superblock's writeback list (sb->s_inodes_wb).
 *
 * The @s_sync_lock is used to serialise concurrent sync operations
 * to avoid lock contention problems with concurrent wait_sb_inodes() calls.
 * Concurrent callers will block on the s_sync_lock rather than doing contending
 * walks. The queueing maintains sync(2) required behaviour as all the IO that
 * has been issued up to the time this function is entered is guaranteed to be
 * completed by the time we have gained the lock and waited for all IO that is
 * in progress regardless of the order callers are granted the lock.
 */
static void wait_sb_inodes(struct super_block *sb)
{
        LIST_HEAD(sync_list);

        /*
         * We need to be protected against the filesystem going from
         * r/o to r/w or vice versa.
         */
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        mutex_lock(&sb->s_sync_lock);

        /*
         * Splice the writeback list onto a temporary list to avoid waiting on
         * inodes that have started writeback after this point.
         *
         * Use rcu_read_lock() to keep the inodes around until we have a
         * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as
         * the local list because inodes can be dropped from either by writeback
         * completion.
         */
        rcu_read_lock();
        spin_lock_irq(&sb->s_inode_wblist_lock);
        list_splice_init(&sb->s_inodes_wb, &sync_list);

        /*
         * Data integrity sync. Must wait for all pages under writeback, because
         * there may have been pages dirtied before our sync call, but which had
         * writeout started before we write it out.  In which case, the inode
         * may not be on the dirty list, but we still have to wait for that
         * writeout.
         *
         * Loop invariant: rcu_read_lock() and s_inode_wblist_lock are both
         * held whenever the loop condition is evaluated.
         */
        while (!list_empty(&sync_list)) {
                struct inode *inode = list_first_entry(&sync_list, struct inode,
                                                       i_wb_list);
                struct address_space *mapping = inode->i_mapping;

                /*
                 * Move each inode back to the wb list before we drop the lock
                 * to preserve consistency between i_wb_list and the mapping
                 * writeback tag. Writeback completion is responsible to remove
                 * the inode from either list once the writeback tag is cleared.
                 */
                list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb);

                /*
                 * The mapping can appear untagged while still on-list since we
                 * do not have the mapping lock. Skip it here, wb completion
                 * will remove it.
                 */
                if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                        continue;

                spin_unlock_irq(&sb->s_inode_wblist_lock);

                spin_lock(&inode->i_lock);
                if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
                        spin_unlock(&inode->i_lock);

                        /*
                         * Skip this inode.  RCU was never dropped on this
                         * path, so only the list lock has to be re-taken.
                         */
                        spin_lock_irq(&sb->s_inode_wblist_lock);
                        continue;
                }
                /* Pin the inode; only then is it safe to leave RCU. */
                __iget(inode);
                spin_unlock(&inode->i_lock);
                rcu_read_unlock();

                /*
                 * We keep the error status of individual mapping so that
                 * applications can catch the writeback error using fsync(2).
                 * See filemap_fdatawait_keep_errors() for details.
                 */
                filemap_fdatawait_keep_errors(mapping);

                cond_resched();

                iput(inode);

                /* Restore the loop invariant before the next iteration. */
                rcu_read_lock();
                spin_lock_irq(&sb->s_inode_wblist_lock);
        }
        spin_unlock_irq(&sb->s_inode_wblist_lock);
        rcu_read_unlock();
        mutex_unlock(&sb->s_sync_lock);
}

/*
 * Queue WB_SYNC_NONE, tagged-writepages writeback of up to @nr pages of @sb
 * on the wbs of its bdi and wait until the queued work has completed.
 *
 * Nothing is done when the bdi has no dirty IO or is noop_backing_dev_info.
 * @skip_if_busy is passed straight through to bdi_split_work_to_wbs().
 * Warns if sb->s_umount is not held.
 */
static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
                                     enum wb_reason reason, bool skip_if_busy)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(completion, bdi);
        struct wb_writeback_work wbw = {
                .reason                        = reason,
                .nr_pages                = nr,
                .done                        = &completion,
                .tagged_writepages        = 1,
                .sync_mode                = WB_SYNC_NONE,
                .sb                        = sb,
        };

        if (!bdi_has_dirty_io(bdi))
                return;
        if (bdi == &noop_backing_dev_info)
                return;

        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        bdi_split_work_to_wbs(sb->s_bdi, &wbw, skip_if_busy);
        wb_wait_for_completion(&completion);
}

/**
 * writeback_inodes_sb_nr -        writeback dirty inodes from given super_block
 * @sb: the superblock
 * @nr: the number of pages to write
 * @reason: reason why some writeback work initiated
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO.
 */
void writeback_inodes_sb_nr(struct super_block *sb,
                            unsigned long nr,
                            enum wb_reason reason)
{
        __writeback_inodes_sb_nr(sb, nr, reason, false);
}
EXPORT_SYMBOL(writeback_inodes_sb_nr);

/**
 * writeback_inodes_sb - write back dirty inodes of a super_block
 * @sb: the superblock
 * @reason: reason why this writeback was initiated
 *
 * Same as writeback_inodes_sb_nr() with the page budget taken from
 * get_nr_dirty_pages().  There is no guarantee about how many pages (if any)
 * get written, and IO that was submitted is not waited for.
 */
void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        unsigned long nr_dirty = get_nr_dirty_pages();

        writeback_inodes_sb_nr(sb, nr_dirty, reason);
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * try_to_writeback_inodes_sb - try to start writeback if none underway
 * @sb: the superblock
 * @reason: reason why this writeback was initiated
 *
 * Takes sb->s_umount for reading with a trylock and silently gives up when
 * that fails.  Otherwise calls __writeback_inodes_sb_nr() with skip_if_busy
 * set and a budget of get_nr_dirty_pages() pages.
 */
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
{
        if (down_read_trylock(&sb->s_umount)) {
                __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason,
                                         true);
                up_read(&sb->s_umount);
        }
}
EXPORT_SYMBOL(try_to_writeback_inodes_sb);

/**
 * sync_inodes_sb        -        sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
        struct backing_dev_info *bdi = sb->s_bdi;
        DEFINE_WB_COMPLETION(done, bdi);
        struct wb_writeback_work work = {
                .sb                = sb,
                .sync_mode        = WB_SYNC_ALL,
                .nr_pages        = LONG_MAX,
                .range_cyclic        = 0,
                .done                = &done,
                .reason                = WB_REASON_SYNC,
                .for_sync        = 1,
        };

        /*
         * Can't skip on !bdi_has_dirty() because we should wait for !dirty
         * inodes under writeback and I_DIRTY_TIME inodes ignored by
         * bdi_has_dirty() need to be written out too.
         */
        if (bdi == &noop_backing_dev_info)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));

        /* protect against inode wb switch, see inode_switch_wbs_work_fn() */
        bdi_down_write_wb_switch_rwsem(bdi);
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(&done);
        bdi_up_write_wb_switch_rwsem(bdi);

        wait_sb_inodes(sb);
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: non-zero for a synchronous (WB_SYNC_ALL) write, zero for
 *        WB_SYNC_NONE
 *
 * Commits a dirty inode to disk right away, covering the whole file range.
 * When the inode's mapping cannot do writeback, the page budget is set to 0
 * so only the inode itself is handled.  Primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 *
 * Return: the result of writeback_single_inode().
 */
int write_inode_now(struct inode *inode, int sync)
{
        struct writeback_control wbc = {
                .range_end = LLONG_MAX,
                .range_start = 0,
                .nr_to_write = LONG_MAX,
        };

        if (sync)
                wbc.sync_mode = WB_SYNC_ALL;
        else
                wbc.sync_mode = WB_SYNC_NONE;

        if (!mapping_can_writeback(inode->i_mapping))
                wbc.nr_to_write = 0;

        might_sleep();
        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode_metadata - write an inode to disk
 * @inode: the inode to sync
 * @wait: non-zero to wait for I/O to complete (WB_SYNC_ALL), zero for
 *        WB_SYNC_NONE
 *
 * Writes the inode to disk and adjusts its dirty state after completion.
 * The page budget is 0, so only the inode itself is written, no associated
 * data or other metadata.
 *
 * Return: the result of writeback_single_inode().
 */
int sync_inode_metadata(struct inode *inode, int wait)
{
        struct writeback_control wbc = {
                .nr_to_write = 0, /* metadata-only */
        };

        if (wait)
                wbc.sync_mode = WB_SYNC_ALL;
        else
                wbc.sync_mode = WB_SYNC_NONE;

        return writeback_single_inode(inode, &wbc);
}
EXPORT_SYMBOL(sync_inode_metadata);










































  246 












































































  136 















  135 
























































































































































































































































  213 




  121 
  134 






  121 
  135 














































































































































  246 

















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2020 Google LLC
 * Author: Will Deacon <will@kernel.org>
 */

#ifndef __ARM64_KVM_PGTABLE_H__
#define __ARM64_KVM_PGTABLE_H__

#include <linux/bits.h>
#include <linux/kvm_host.h>
#include <linux/types.h>

/*
 * Bounds of the page-table level numbering used by this header. Levels are
 * passed around as s8 (see e.g. kvm_granule_shift()), which can hold -1.
 */
#define KVM_PGTABLE_FIRST_LEVEL                -1
#define KVM_PGTABLE_LAST_LEVEL                3

/*
 * The largest supported block sizes for KVM (no 52-bit PA support):
 *  - 4K (level 1):        1GB
 *  - 16K (level 2):        32MB
 *  - 64K (level 2):        512MB
 */
#ifdef CONFIG_ARM64_4K_PAGES
#define KVM_PGTABLE_MIN_BLOCK_LEVEL        1
#else
#define KVM_PGTABLE_MIN_BLOCK_LEVEL        2
#endif

/* KVM's LPA2 check simply forwards to system_supports_lpa2(). */
#define kvm_lpa2_is_enabled()                system_supports_lpa2()

/*
 * Return the largest ID_AA64MMFR0_EL1.PARANGE encoding KVM will use: the
 * 52-bit encoding when LPA2 is enabled, or when the kernel is built with
 * CONFIG_ARM64_PA_BITS_52 and PAGE_SHIFT is 16; the 48-bit encoding
 * otherwise.
 */
static inline u64 kvm_get_parange_max(void)
{
        if (kvm_lpa2_is_enabled())
                return ID_AA64MMFR0_EL1_PARANGE_52;

        if (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && PAGE_SHIFT == 16)
                return ID_AA64MMFR0_EL1_PARANGE_52;

        return ID_AA64MMFR0_EL1_PARANGE_48;
}

/*
 * Extract the PARANGE field from @mmfr0 (an ID_AA64MMFR0_EL1 value) and
 * clamp it to the maximum reported by kvm_get_parange_max().
 */
static inline u64 kvm_get_parange(u64 mmfr0)
{
        u64 limit = kvm_get_parange_max();
        u64 field = cpuid_feature_extract_unsigned_field(mmfr0,
                                ID_AA64MMFR0_EL1_PARANGE_SHIFT);

        return field > limit ? limit : field;
}

/* A page-table entry is handled as a raw 64-bit value. */
typedef u64 kvm_pte_t;

/* Bit 0: tested by kvm_pte_valid(). */
#define KVM_PTE_VALID                        BIT(0)

/*
 * Output address fields, as decoded by kvm_pte_to_phys():
 *  - without LPA2: PA bits [47:PAGE_SHIFT] are stored in place; when
 *    PAGE_SHIFT is 16, PA bits [51:48] are stored in PTE bits [15:12].
 *  - with LPA2: PA bits [49:PAGE_SHIFT] are stored in place and PA bits
 *    [51:50] are stored in PTE bits [9:8].
 */
#define KVM_PTE_ADDR_MASK                GENMASK(47, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_48                GENMASK(15, 12)
#define KVM_PTE_ADDR_MASK_LPA2                GENMASK(49, PAGE_SHIFT)
#define KVM_PTE_ADDR_51_50_LPA2                GENMASK(9, 8)

/* All-ones value; presumably a "no physical address" sentinel - TODO confirm. */
#define KVM_PHYS_INVALID                (-1ULL)

/*
 * Bit 1 holds the entry type. PAGE and TABLE share the value 1, so they are
 * presumably told apart by the level of the entry - TODO confirm.
 */
#define KVM_PTE_TYPE                        BIT(1)
#define KVM_PTE_TYPE_BLOCK                0
#define KVM_PTE_TYPE_PAGE                1
#define KVM_PTE_TYPE_TABLE                1

/* Lower attribute field of a leaf entry: bits [11:2]. */
#define KVM_PTE_LEAF_ATTR_LO                GENMASK(11, 2)

/*
 * Stage-1 (S1) sub-fields of the lower attributes. The AP_RO/AP_RW encodings
 * are not constants: they are selected at run time depending on whether the
 * ARM64_KVM_HVHE capability is set.
 */
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX        GENMASK(4, 2)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP        GENMASK(7, 6)
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO                \
        ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; })
#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW                \
        ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; })
#define KVM_PTE_LEAF_ATTR_LO_S1_SH        GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS        3
#define KVM_PTE_LEAF_ATTR_LO_S1_AF        BIT(10)

/* Stage-2 (S2) sub-fields of the lower attributes. */
#define KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR        GENMASK(5, 2)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R        BIT(6)
#define KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W        BIT(7)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH        GENMASK(9, 8)
#define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS        3
#define KVM_PTE_LEAF_ATTR_LO_S2_AF        BIT(10)

/* Upper attribute field of a leaf entry: bits [63:50]. */
#define KVM_PTE_LEAF_ATTR_HI                GENMASK(63, 50)

/* Four software bits, [58:55]; same positions as KVM_PGTABLE_PROT_SW0..SW3. */
#define KVM_PTE_LEAF_ATTR_HI_SW                GENMASK(58, 55)

/* The stage-1 and stage-2 XN bits share bit 54. */
#define KVM_PTE_LEAF_ATTR_HI_S1_XN        BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S2_XN        BIT(54)

#define KVM_PTE_LEAF_ATTR_HI_S1_GP        BIT(50)

/* All stage-2 permission bits: read, write and XN. */
#define KVM_PTE_LEAF_ATTR_S2_PERMS        (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \
                                         KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \
                                         KVM_PTE_LEAF_ATTR_HI_S2_XN)

/*
 * Owner annotation for invalid entries: an 8-bit field in bits [9:2].
 * See kvm_pgtable_stage2_set_owner(), which takes a u8 owner id.
 */
#define KVM_INVALID_PTE_OWNER_MASK        GENMASK(9, 2)
#define KVM_MAX_OWNER_ID                1

/*
 * Used to indicate a pte for which a 'break-before-make' sequence is in
 * progress.
 */
#define KVM_INVALID_PTE_LOCKED                BIT(10)

/* Report whether @pte has the KVM_PTE_VALID bit set. */
static inline bool kvm_pte_valid(kvm_pte_t pte)
{
        return (pte & KVM_PTE_VALID) != 0;
}

/*
 * Decode the output address held in @pte.
 *
 * With LPA2 enabled, PA bits [49:PAGE_SHIFT] are taken in place and PA bits
 * [51:50] come from the KVM_PTE_ADDR_51_50_LPA2 field. Otherwise PA bits
 * [47:PAGE_SHIFT] are taken in place and, when PAGE_SHIFT is 16, PA bits
 * [51:48] come from the KVM_PTE_ADDR_51_48 field.
 */
static inline u64 kvm_pte_to_phys(kvm_pte_t pte)
{
        if (kvm_lpa2_is_enabled())
                return (pte & KVM_PTE_ADDR_MASK_LPA2) |
                       (FIELD_GET(KVM_PTE_ADDR_51_50_LPA2, pte) << 50);

        if (PAGE_SHIFT != 16)
                return pte & KVM_PTE_ADDR_MASK;

        return (pte & KVM_PTE_ADDR_MASK) |
               (FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48);
}

/*
 * Encode physical address @pa into the address fields of a PTE; the inverse
 * of kvm_pte_to_phys(). Only address bits are produced: no valid, type or
 * attribute bits are set here.
 */
static inline kvm_pte_t kvm_phys_to_pte(u64 pa)
{
        kvm_pte_t pte;
        u64 high;

        if (kvm_lpa2_is_enabled()) {
                /* PA[51:50] are relocated into KVM_PTE_ADDR_51_50_LPA2. */
                high = (pa & GENMASK(51, 50)) >> 50;
                pte = pa & KVM_PTE_ADDR_MASK_LPA2;
                pte |= FIELD_PREP(KVM_PTE_ADDR_51_50_LPA2, high);
                return pte;
        }

        pte = pa & KVM_PTE_ADDR_MASK;
        if (PAGE_SHIFT == 16) {
                /* PA[51:48] are relocated into KVM_PTE_ADDR_51_48. */
                high = (pa & GENMASK(51, 48)) >> 48;
                pte |= FIELD_PREP(KVM_PTE_ADDR_51_48, high);
        }

        return pte;
}

/* Convert the output address encoded in @pte into a page frame number. */
static inline kvm_pfn_t kvm_pte_to_pfn(kvm_pte_t pte)
{
        u64 phys = kvm_pte_to_phys(pte);

        return __phys_to_pfn(phys);
}

/*
 * Return the address shift covered by one entry at page-table level @level,
 * as given by ARM64_HW_PGTABLE_LEVEL_SHIFT().
 */
static inline u64 kvm_granule_shift(s8 level)
{
        /* This relies on KVM_PGTABLE_LAST_LEVEL being 3. */
        u64 shift = ARM64_HW_PGTABLE_LEVEL_SHIFT(level);

        return shift;
}

/* Return the number of bytes mapped by a single entry at level @level. */
static inline u64 kvm_granule_size(s8 level)
{
        u64 shift = kvm_granule_shift(level);

        return BIT(shift);
}

/*
 * Report whether @level is at or beyond KVM_PGTABLE_MIN_BLOCK_LEVEL, i.e.
 * not shallower than the first level at which KVM uses block mappings.
 */
static inline bool kvm_level_supports_block_mapping(s8 level)
{
        return !(level < KVM_PGTABLE_MIN_BLOCK_LEVEL);
}

/*
 * Build a bitmap of supported mapping sizes: one bit, at position
 * kvm_granule_shift(level), for every level from
 * KVM_PGTABLE_MIN_BLOCK_LEVEL up to and including KVM_PGTABLE_LAST_LEVEL.
 */
static inline u32 kvm_supported_block_sizes(void)
{
        u32 sizes = 0;
        s8 lvl;

        for (lvl = KVM_PGTABLE_MIN_BLOCK_LEVEL; lvl <= KVM_PGTABLE_LAST_LEVEL; lvl++)
                sizes |= BIT(kvm_granule_shift(lvl));

        return sizes;
}

/*
 * Report whether @size is one of the sizes advertised by
 * kvm_supported_block_sizes().
 */
static inline bool kvm_is_block_size_supported(u64 size)
{
        /* IS_ALIGNED(size, size) serves as the power-of-two test here. */
        if (!IS_ALIGNED(size, size))
                return false;

        return (size & kvm_supported_block_sizes()) != 0;
}

/**
 * struct kvm_pgtable_mm_ops - Memory management callbacks.
 * @zalloc_page:                Allocate a single zeroed memory page.
 *                                The @arg parameter can be used by the walker
 *                                to pass a memcache. The initial refcount of
 *                                the page is 1.
 * @zalloc_pages_exact:                Allocate an exact number of zeroed memory pages.
 *                                The @size parameter is in bytes, and is rounded
 *                                up to the next page boundary. The resulting
 *                                allocation is physically contiguous.
 * @free_pages_exact:                Free an exact number of memory pages previously
 *                                allocated by zalloc_pages_exact.
 * @free_unlinked_table:        Free an unlinked paging structure by unlinking and
 *                                dropping references.
 * @get_page:                        Increment the refcount on a page.
 * @put_page:                        Decrement the refcount on a page. When the
 *                                refcount reaches 0 the page is automatically
 *                                freed.
 * @page_count:                        Return the refcount of a page.
 * @phys_to_virt:                Convert a physical address into a virtual
 *                                address mapped in the current context.
 * @virt_to_phys:                Convert a virtual address mapped in the current
 *                                context into a physical address.
 * @dcache_clean_inval_poc:        Clean and invalidate the data cache to the PoC
 *                                for the specified memory address range.
 * @icache_inval_pou:                Invalidate the instruction cache to the PoU
 *                                for the specified memory address range.
 */
struct kvm_pgtable_mm_ops {
        void*                (*zalloc_page)(void *arg);
        void*                (*zalloc_pages_exact)(size_t size);
        void                (*free_pages_exact)(void *addr, size_t size);
        void                (*free_unlinked_table)(void *addr, s8 level);
        void                (*get_page)(void *addr);
        void                (*put_page)(void *addr);
        int                (*page_count)(void *addr);
        void*                (*phys_to_virt)(phys_addr_t phys);
        phys_addr_t        (*virt_to_phys)(void *addr);
        void                (*dcache_clean_inval_poc)(void *addr, size_t size);
        void                (*icache_inval_pou)(void *addr, size_t size);
};

/**
 * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags.
 * @KVM_PGTABLE_S2_NOFWB:        Don't enforce Normal-WB even if the CPUs have
 *                                ARM64_HAS_STAGE2_FWB.
 * @KVM_PGTABLE_S2_IDMAP:        Only use identity mappings.
 *
 * These are passed as the @flags argument of __kvm_pgtable_stage2_init();
 * kvm_pgtable_stage2_init() passes 0, i.e. none of them.
 */
enum kvm_pgtable_stage2_flags {
        KVM_PGTABLE_S2_NOFWB                        = BIT(0),
        KVM_PGTABLE_S2_IDMAP                        = BIT(1),
};

/**
 * enum kvm_pgtable_prot - Page-table permissions and attributes.
 * @KVM_PGTABLE_PROT_X:                Execute permission.
 * @KVM_PGTABLE_PROT_W:                Write permission.
 * @KVM_PGTABLE_PROT_R:                Read permission.
 * @KVM_PGTABLE_PROT_DEVICE:        Device attributes.
 * @KVM_PGTABLE_PROT_NORMAL_NC:        Normal noncacheable attributes.
 * @KVM_PGTABLE_PROT_SW0:        Software bit 0.
 * @KVM_PGTABLE_PROT_SW1:        Software bit 1.
 * @KVM_PGTABLE_PROT_SW2:        Software bit 2.
 * @KVM_PGTABLE_PROT_SW3:        Software bit 3.
 *
 * The SW0-SW3 values occupy bits 55-58, the same bit positions covered by
 * KVM_PTE_LEAF_ATTR_HI_SW (GENMASK(58, 55)).
 */
enum kvm_pgtable_prot {
        KVM_PGTABLE_PROT_X                        = BIT(0),
        KVM_PGTABLE_PROT_W                        = BIT(1),
        KVM_PGTABLE_PROT_R                        = BIT(2),

        KVM_PGTABLE_PROT_DEVICE                        = BIT(3),
        KVM_PGTABLE_PROT_NORMAL_NC                = BIT(4),

        KVM_PGTABLE_PROT_SW0                        = BIT(55),
        KVM_PGTABLE_PROT_SW1                        = BIT(56),
        KVM_PGTABLE_PROT_SW2                        = BIT(57),
        KVM_PGTABLE_PROT_SW3                        = BIT(58),
};

/* Common combinations of the basic read/write/execute permissions. */
#define KVM_PGTABLE_PROT_RW        (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W)
#define KVM_PGTABLE_PROT_RWX        (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X)

/*
 * NOTE(review): going by the names, these are the permissions used for pKVM
 * host memory (RWX) and host MMIO (RW) mappings - confirm against the users.
 */
#define PKVM_HOST_MEM_PROT        KVM_PGTABLE_PROT_RWX
#define PKVM_HOST_MMIO_PROT        KVM_PGTABLE_PROT_RW

/*
 * PAGE_HYP* permission sets: read-write, read-execute, read-only, and
 * read-write with device attributes.
 */
#define PAGE_HYP                KVM_PGTABLE_PROT_RW
#define PAGE_HYP_EXEC                (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X)
#define PAGE_HYP_RO                (KVM_PGTABLE_PROT_R)
#define PAGE_HYP_DEVICE                (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE)

/*
 * Type of the force_pte_cb member of struct kvm_pgtable and of the
 * @force_pte_cb argument of __kvm_pgtable_stage2_init(): returns true if
 * page level mappings must be used instead of block mappings.
 * NOTE(review): @addr/@end presumably delimit the range being mapped and
 * @prot its requested permissions - confirm against the callers.
 */
typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end,
                                           enum kvm_pgtable_prot prot);

/**
 * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk.
 * @KVM_PGTABLE_WALK_LEAF:                Visit leaf entries, including invalid
 *                                        entries.
 * @KVM_PGTABLE_WALK_TABLE_PRE:                Visit table entries before their
 *                                        children.
 * @KVM_PGTABLE_WALK_TABLE_POST:        Visit table entries after their
 *                                        children.
 * @KVM_PGTABLE_WALK_SHARED:                Indicates the page-tables may be shared
 *                                        with other software walkers.
 * @KVM_PGTABLE_WALK_HANDLE_FAULT:        Indicates the page-table walk was
 *                                        invoked from a fault handler.
 * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI:        Visit and update table entries
 *                                        without Break-before-make's
 *                                        TLB invalidation.
 * @KVM_PGTABLE_WALK_SKIP_CMO:                Visit and update table entries
 *                                        without Cache maintenance
 *                                        operations required.
 *
 * KVM_PGTABLE_WALK_SHARED also drives kvm_pgtable_walk_begin() and
 * kvm_pgtable_walk_end(): in hyp builds a shared walk is refused with -EPERM,
 * elsewhere it is bracketed by rcu_read_lock()/rcu_read_unlock().
 */
enum kvm_pgtable_walk_flags {
        KVM_PGTABLE_WALK_LEAF                        = BIT(0),
        KVM_PGTABLE_WALK_TABLE_PRE                = BIT(1),
        KVM_PGTABLE_WALK_TABLE_POST                = BIT(2),
        KVM_PGTABLE_WALK_SHARED                        = BIT(3),
        KVM_PGTABLE_WALK_HANDLE_FAULT                = BIT(4),
        KVM_PGTABLE_WALK_SKIP_BBM_TLBI                = BIT(5),
        KVM_PGTABLE_WALK_SKIP_CMO                = BIT(6),
};

/*
 * struct kvm_pgtable_visit_ctx - Context handed to a visitor callback
 * (see kvm_pgtable_visitor_fn_t).
 *
 * NOTE(review): the per-field notes below are inferred from the field names
 * and types only; confirm them against the walker implementation.
 *
 * ptep:   pointer to a page-table entry, presumably the one being visited.
 * old:    a page-table entry value, presumably the value read from ptep
 *         before the callback was invoked.
 * arg:    opaque pointer, presumably the walker's arg.
 * mm_ops: memory management callbacks.
 * start:  presumably the first address of the walk.
 * addr:   presumably the address currently being visited.
 * end:    presumably the end address of the walk.
 * level:  page-table level, presumably that of the visited entry.
 * flags:  walk flags; kvm_pgtable_walk_shared() tests
 *         KVM_PGTABLE_WALK_SHARED in this field.
 */
struct kvm_pgtable_visit_ctx {
        kvm_pte_t                                *ptep;
        kvm_pte_t                                old;
        void                                        *arg;
        struct kvm_pgtable_mm_ops                *mm_ops;
        u64                                        start;
        u64                                        addr;
        u64                                        end;
        s8                                        level;
        enum kvm_pgtable_walk_flags                flags;
};

/*
 * Type of the callback stored in struct kvm_pgtable_walker::cb. As described
 * for kvm_pgtable_walk(), a negative return value terminates the walk with
 * that error code.
 * NOTE(review): @visit presumably identifies which kind of visit (leaf,
 * table-pre, table-post) triggered this call - confirm.
 */
typedef int (*kvm_pgtable_visitor_fn_t)(const struct kvm_pgtable_visit_ctx *ctx,
                                        enum kvm_pgtable_walk_flags visit);

/* Report whether the walk described by @ctx has KVM_PGTABLE_WALK_SHARED set. */
static inline bool kvm_pgtable_walk_shared(const struct kvm_pgtable_visit_ctx *ctx)
{
        return (ctx->flags & KVM_PGTABLE_WALK_SHARED) != 0;
}

/**
 * struct kvm_pgtable_walker - Hook into a page-table walk.
 * @cb:                Callback function to invoke during the walk.
 * @arg:        Argument passed to the callback function.
 * @flags:        Bitwise-OR of flags to identify the entry types on which to
 *                invoke the callback function.
 *
 * All members are const-qualified, so a walker has to be fully described by
 * its initialiser.
 */
struct kvm_pgtable_walker {
        const kvm_pgtable_visitor_fn_t                cb;
        void * const                                arg;
        const enum kvm_pgtable_walk_flags        flags;
};

/*
 * RCU cannot be used in a non-kernel context such as the hyp. As such, page
 * table walkers used in hyp do not call into RCU and instead use other
 * synchronization mechanisms (such as a spinlock).
 */
#if defined(__KVM_NVHE_HYPERVISOR__) || defined(__KVM_VHE_HYPERVISOR__)

/* In hyp builds a PTE reference is a plain pointer to a kvm_pte_t. */
typedef kvm_pte_t *kvm_pteref_t;

/*
 * Turn a PTE reference into a usable pointer. In this configuration
 * kvm_pteref_t is already a plain kvm_pte_t pointer, so @pteref is returned
 * unchanged and @walker is not consulted.
 */
static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
                                                kvm_pteref_t pteref)
{
        kvm_pte_t *ptep = pteref;

        return ptep;
}

/*
 * Called before a page-table walk starts (hyp variant).
 *
 * Return: -EPERM if @walker requests a shared walk, 0 otherwise.
 */
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
{
        /*
         * Due to the lack of RCU (or a similar protection scheme), only
         * non-shared table walkers are allowed in the hypervisor.
         */
        return (walker->flags & KVM_PGTABLE_WALK_SHARED) ? -EPERM : 0;
}

/* Hyp variant: nothing to undo, as kvm_pgtable_walk_begin() takes no lock here. */
static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker) {}

/* Hyp variant: there is no RCU state to query, so this always reports true. */
static inline bool kvm_pgtable_walk_lock_held(void)
{
        return true;
}

#else

/*
 * Outside hyp builds a PTE reference carries the __rcu annotation and is
 * converted to a plain pointer by kvm_dereference_pteref().
 */
typedef kvm_pte_t __rcu *kvm_pteref_t;

/*
 * Turn an __rcu-annotated PTE reference into a plain pointer using
 * rcu_dereference_check(). The extra condition handed to the check is true
 * when @walker is not a shared walker.
 */
static inline kvm_pte_t *kvm_dereference_pteref(struct kvm_pgtable_walker *walker,
                                                kvm_pteref_t pteref)
{
        bool not_shared = !(walker->flags & KVM_PGTABLE_WALK_SHARED);

        return rcu_dereference_check(pteref, not_shared);
}

/*
 * Called before a page-table walk starts. Shared walks enter an RCU read-side
 * critical section; other walks do nothing here.
 *
 * Return: always 0.
 */
static inline int kvm_pgtable_walk_begin(struct kvm_pgtable_walker *walker)
{
        if (!(walker->flags & KVM_PGTABLE_WALK_SHARED))
                return 0;

        rcu_read_lock();
        return 0;
}

/*
 * Called after a page-table walk finishes. Leaves the RCU read-side critical
 * section that kvm_pgtable_walk_begin() entered for a shared walk.
 */
static inline void kvm_pgtable_walk_end(struct kvm_pgtable_walker *walker)
{
        if (!(walker->flags & KVM_PGTABLE_WALK_SHARED))
                return;

        rcu_read_unlock();
}

/* Report whether rcu_read_lock_held() indicates an RCU read-side section. */
static inline bool kvm_pgtable_walk_lock_held(void)
{
        bool held = rcu_read_lock_held();

        return held;
}

#endif

/**
 * struct kvm_pgtable - KVM page-table.
 * @pkvm_mappings:        Cached rb-tree root. It shares storage (anonymous union)
 *                        with every field below except @mmu, so only one of
 *                        the two representations can be in use at a time.
 *                        NOTE(review): presumably used when the page-table is
 *                        managed by pKVM - confirm.
 * @ia_bits:                Maximum input address size, in bits.
 * @start_level:        Level at which the page-table walk starts.
 * @pgd:                Pointer to the first top-level entry of the page-table.
 * @mm_ops:                Memory management callbacks.
 * @mmu:                Stage-2 KVM MMU struct. Unused for stage-1 page-tables.
 * @flags:                Stage-2 page-table flags.
 * @force_pte_cb:        Function that returns true if page level mappings must
 *                        be used instead of block mappings.
 */
struct kvm_pgtable {
        union {
                struct rb_root_cached                                pkvm_mappings;
                struct {
                        u32                                        ia_bits;
                        s8                                        start_level;
                        kvm_pteref_t                                pgd;
                        struct kvm_pgtable_mm_ops                *mm_ops;

                        /* Stage-2 only */
                        enum kvm_pgtable_stage2_flags                flags;
                        kvm_pgtable_force_pte_cb_t                force_pte_cb;
                };
        };
        struct kvm_s2_mmu                                        *mmu;
};

/**
 * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table.
 * @pgt:        Uninitialised page-table structure to initialise.
 * @va_bits:        Maximum virtual address bits.
 * @mm_ops:        Memory management callbacks.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits,
                         struct kvm_pgtable_mm_ops *mm_ops);

/**
 * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_hyp_init().
 *
 * The page-table is assumed to be unreachable by any hardware walkers prior
 * to freeing and therefore no TLB invalidation is performed.
 */
void kvm_pgtable_hyp_destroy(struct kvm_pgtable *pgt);

/**
 * kvm_pgtable_hyp_map() - Install a mapping in a hypervisor stage-1 page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_hyp_init().
 * @addr:        Virtual address at which to place the mapping.
 * @size:        Size of the mapping.
 * @phys:        Physical address of the memory to map.
 * @prot:        Permissions and attributes for the mapping.
 *
 * The offset of @addr within a page is ignored, @size is rounded-up to
 * the next page boundary and @phys is rounded-down to the previous page
 * boundary.
 *
 * If device attributes are not explicitly requested in @prot, then the
 * mapping will be normal, cacheable. Attempts to install a new mapping
 * for a virtual address that is already mapped will be rejected with an
 * error and a WARN().
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys,
                        enum kvm_pgtable_prot prot);

/**
 * kvm_pgtable_hyp_unmap() - Remove a mapping from a hypervisor stage-1 page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_hyp_init().
 * @addr:        Virtual address from which to remove the mapping.
 * @size:        Size of the mapping.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * TLB invalidation is performed for each page-table entry cleared during the
 * unmapping operation and the reference count for the page-table page
 * containing the cleared entry is decremented, with unreferenced pages being
 * freed. The unmapping operation will stop early if it encounters either an
 * invalid page-table entry or a valid block mapping which maps beyond the range
 * being unmapped.
 *
 * Return: Number of bytes unmapped, which may be 0.
 */
u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_get_vtcr() - Helper to construct VTCR_EL2
 * @mmfr0:        Sanitized value of SYS_ID_AA64MMFR0_EL1 register.
 * @mmfr1:        Sanitized value of SYS_ID_AA64MMFR1_EL1 register.
 * @phys_shift:        Value to set in VTCR_EL2.T0SZ.
 *
 * The VTCR value is common across all the physical CPUs on the system.
 * We use system wide sanitised values to fill in different fields,
 * except for Hardware Management of Access Flags. HA Flag is set
 * unconditionally on all CPUs, as it is safe to run with or without
 * the feature and the bit is RES0 on CPUs that don't support it.
 *
 * Return: VTCR_EL2 value
 */
u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift);

/**
 * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD
 * @vtcr:        Content of the VTCR register.
 *
 * Return: the size (in bytes) of the stage-2 PGD
 */
size_t kvm_pgtable_stage2_pgd_size(u64 vtcr);

/**
 * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table.
 * @pgt:        Uninitialised page-table structure to initialise.
 * @mmu:        S2 MMU context for this S2 translation
 * @mm_ops:        Memory management callbacks.
 * @flags:        Stage-2 configuration flags.
 * @force_pte_cb: Function that returns true if page level mappings must
 *                be used instead of block mappings.
 *
 * Return: 0 on success, negative error code on failure.
 */
int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                              struct kvm_pgtable_mm_ops *mm_ops,
                              enum kvm_pgtable_stage2_flags flags,
                              kvm_pgtable_force_pte_cb_t force_pte_cb);

/**
 * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table with
 *                             default settings.
 * @pgt:        Uninitialised page-table structure to initialise.
 * @mmu:        S2 MMU context for this S2 translation
 * @mm_ops:        Memory management callbacks.
 *
 * Convenience wrapper around __kvm_pgtable_stage2_init() that passes no
 * stage-2 flags and no force_pte callback.
 *
 * Return: 0 on success, negative error code on failure.
 */
static inline int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu,
                                          struct kvm_pgtable_mm_ops *mm_ops)
{
        int ret = __kvm_pgtable_stage2_init(pgt, mmu, mm_ops, 0, NULL);

        return ret;
}

/**
 * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 *
 * The page-table is assumed to be unreachable by any hardware walkers prior
 * to freeing and therefore no TLB invalidation is performed.
 */
void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt);

/**
 * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure.
 * @mm_ops:        Memory management callbacks.
 * @pgtable:        Unlinked stage-2 paging structure to be freed.
 * @level:        Level of the stage-2 paging structure to be freed.
 *
 * The page-table is assumed to be unreachable by any hardware walkers prior to
 * freeing and therefore no TLB invalidation is performed.
 */
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level);

/**
 * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @phys:        Physical address of the memory to map.
 * @level:        Starting level of the stage-2 paging structure to be created.
 * @prot:        Permissions and attributes for the mapping.
 * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
 *                page-table pages.
 * @force_pte:  Force mappings to PAGE_SIZE granularity.
 *
 * Returns an unlinked page-table tree.  This new page-table tree is
 * not reachable (i.e., it is unlinked) from the root pgd and it's
 * therefore unreachable by the hardware page-table walker. No TLB
 * invalidation or CMOs are performed.
 *
 * If device attributes are not explicitly requested in @prot, then the
 * mapping will be normal, cacheable.
 *
 * Return: The fully populated (unlinked) stage-2 paging structure, or
 * an ERR_PTR(error) on failure.
 */
kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt,
                                              u64 phys, s8 level,
                                              enum kvm_pgtable_prot prot,
                                              void *mc, bool force_pte);

/**
 * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address at which to place the mapping.
 * @size:        Size of the mapping.
 * @phys:        Physical address of the memory to map.
 * @prot:        Permissions and attributes for the mapping.
 * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
 *                page-table pages.
 * @flags:        Flags to control the page-table walk (ex. a shared walk)
 *
 * The offset of @addr within a page is ignored, @size is rounded-up to
 * the next page boundary and @phys is rounded-down to the previous page
 * boundary.
 *
 * If device attributes are not explicitly requested in @prot, then the
 * mapping will be normal, cacheable.
 *
 * Note that the update of a valid leaf PTE in this function will be aborted,
 * if it's trying to recreate the exact same mapping or only change the access
 * permissions. Instead, the vCPU will exit one more time from guest if still
 * needed and then go through the path of relaxing permissions.
 *
 * Note that this function will both coalesce existing table entries and split
 * existing block mappings, relying on page-faults to fault back areas outside
 * of the new mapping lazily.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
                           u64 phys, enum kvm_pgtable_prot prot,
                           void *mc, enum kvm_pgtable_walk_flags flags);

/**
 * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to
 *                                    track ownership.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Base intermediate physical address to annotate.
 * @size:        Size of the annotated range.
 * @mc:                Cache of pre-allocated and zeroed memory from which to allocate
 *                page-table pages.
 * @owner_id:        Unique identifier for the owner of the page.
 *
 * By default, all page-tables are owned by identifier 0. This function can be
 * used to mark portions of the IPA space as owned by other entities. When a
 * stage 2 is used with identity-mappings, these annotations allow to use the
 * page-table data structure as a simple rmap.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size,
                                 void *mc, u8 owner_id);

/**
 * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address from which to remove the mapping.
 * @size:        Size of the mapping.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * TLB invalidation is performed for each page-table entry cleared during the
 * unmapping operation and the reference count for the page-table page
 * containing the cleared entry is decremented, with unreferenced pages being
 * freed. Unmapping a cacheable page will ensure that it is clean to the PoC if
 * FWB is not supported by the CPU.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range
 *                                  without TLB invalidation.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address from which to write-protect.
 * @size:        Size of the range.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * Note that it is the caller's responsibility to invalidate the TLB after
 * calling this function to ensure that the updated permissions are visible
 * to the CPUs.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address to identify the page-table entry.
 * @flags:        Flags to control the page-table walk (ex. a shared walk)
 *
 * The offset of @addr within a page is ignored.
 *
 * If there is a valid, leaf page-table entry used to translate @addr, then
 * set the access flag in that entry.
 */
void kvm_pgtable_stage2_mkyoung(struct kvm_pgtable *pgt, u64 addr,
                                enum kvm_pgtable_walk_flags flags);

/**
 * kvm_pgtable_stage2_test_clear_young() - Test and optionally clear the access
 *                                           flag in a page-table entry.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address to identify the page-table entry.
 * @size:        Size of the address range to visit.
 * @mkold:        True if the access flag should be cleared.
 *
 * The offset of @addr within a page is ignored.
 *
 * Tests and conditionally clears the access flag for every valid, leaf
 * page-table entry used to translate the range [@addr, @addr + @size).
 *
 * Note that it is the caller's responsibility to invalidate the TLB after
 * calling this function to ensure that the updated permissions are visible
 * to the CPUs.
 *
 * Return: True if any of the visited PTEs had the access flag set.
 */
bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
                                         u64 size, bool mkold);

/**
 * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a
 *                                      page-table entry.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address to identify the page-table entry.
 * @prot:        Additional permissions to grant for the mapping.
 * @flags:        Flags to control the page-table walk (ex. a shared walk)
 *
 * The offset of @addr within a page is ignored.
 *
 * If there is a valid, leaf page-table entry used to translate @addr, then
 * relax the permissions in that entry according to the read, write and
 * execute permissions specified by @prot. No permissions are removed, and
 * TLB invalidation is performed after updating the entry. Software bits cannot
 * be set or cleared using kvm_pgtable_stage2_relax_perms().
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
                                   enum kvm_pgtable_prot prot,
                                   enum kvm_pgtable_walk_flags flags);

/**
 * kvm_pgtable_stage2_flush() - Clean and invalidate data cache to Point
 *                              of Coherency for guest stage-2 address
 *                              range.
 * @pgt:        Page-table structure initialised by kvm_pgtable_stage2_init*().
 * @addr:        Intermediate physical address from which to flush.
 * @size:        Size of the range.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);

/**
 * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing
 *                                to PAGE_SIZE guest pages.
 * @pgt:         Page-table structure initialised by kvm_pgtable_stage2_init().
 * @addr:         Intermediate physical address from which to split.
 * @size:         Size of the range.
 * @mc:                 Cache of pre-allocated and zeroed memory from which to allocate
 *                 page-table pages.
 *
 * The function tries to split any level 1 or 2 entry that overlaps
 * with the input range (given by @addr and @size).
 *
 * Return: 0 on success, negative error code on failure. Note that
 * kvm_pgtable_stage2_split() is best effort: it tries to break as many
 * blocks in the input range as allowed by the capacity of @mc.
 */
int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size,
                             struct kvm_mmu_memory_cache *mc);

/**
 * kvm_pgtable_walk() - Walk a page-table.
 * @pgt:        Page-table structure initialised by kvm_pgtable_*_init().
 * @addr:       Input address for the start of the walk.
 * @size:       Size of the range to walk.
 * @walker:     Walker description: the callback to invoke and the flags
 *              selecting which entries are visited.
 *
 * The offset of @addr within a page is ignored and @size is rounded-up to
 * the next page boundary.
 *
 * The walker will walk the page-table entries corresponding to the input
 * address range specified, visiting entries according to the walker flags.
 * Invalid entries are treated as leaf entries. The visited page table entry is
 * reloaded after invoking the walker callback, allowing the walker to descend
 * into a newly installed table.
 *
 * Returning a negative error code from the walker callback function will
 * terminate the walk immediately with the same error code.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size,
                     struct kvm_pgtable_walker *walker);

/**
 * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry
 *                          with its level.
 * @pgt:        Page-table structure initialised by kvm_pgtable_*_init()
 *              or a similar initialiser.
 * @addr:       Input address for the start of the walk.
 * @ptep:       Pointer to storage for the retrieved PTE.
 * @level:      Pointer to storage for the level of the retrieved PTE.
 *
 * The offset of @addr within a page is ignored.
 *
 * The walker will walk the page-table entries corresponding to the input
 * address specified, retrieving the leaf entry corresponding to this address.
 * Invalid entries are treated as leaf entries.
 *
 * Return: 0 on success, negative error code on failure.
 */
int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr,
                         kvm_pte_t *ptep, s8 *level);

/**
 * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a
 *                                 stage-2 Page-Table Entry.
 * @pte:        Stage-2 page-table entry to decode.
 *
 * Return: protection attributes of @pte in the enum kvm_pgtable_prot format.
 */
enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte);

/**
 * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1
 *                              Page-Table Entry.
 * @pte:        Stage-1 page-table entry to decode.
 *
 * Return: protection attributes of @pte in the enum kvm_pgtable_prot format.
 */
enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte);

/**
 * kvm_tlb_flush_vmid_range() - Invalidate/flush a range of TLB entries
 * @mmu:        Stage-2 KVM MMU struct
 * @addr:       The base intermediate physical address from which to invalidate
 * @size:       Size of the range from the base to invalidate
 */
void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
                                phys_addr_t addr, size_t size);
#endif        /* __ARM64_KVM_PGTABLE_H__ */



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 


















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
// SPDX-License-Identifier: GPL-2.0
/*
 *  ext4.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/include/linux/minix_fs.h
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#ifndef _EXT4_H
#define _EXT4_H

#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
#include <uapi/linux/ext4.h>

#include <linux/fscrypt.h>
#include <linux/fsverity.h>

#include <linux/compiler.h>

/*
 * The fourth extended filesystem constants/structures
 */

/*
 * With AGGRESSIVE_CHECK the allocator runs consistency checks over its
 * structures. These checks slow things down a lot.
 *
 * NOTE(review): only AGGRESSIVE_CHECK__ (with trailing underscores) is
 * defined here, so AGGRESSIVE_CHECK itself is presumably left undefined and
 * the checks stay off unless the suffix is removed -- confirm against the
 * code that tests the macro.
 */
#define AGGRESSIVE_CHECK__

/*
 * With DOUBLE_CHECK defined, mballoc creates persistent in-core bitmaps,
 * maintains them and uses them to check for double allocations.
 *
 * NOTE(review): same trailing-underscore convention as AGGRESSIVE_CHECK__
 * above, i.e. presumably disabled as written -- confirm.
 */
#define DOUBLE_CHECK__

/*
 * Define EXT4FS_DEBUG to produce debug messages. It is explicitly undefined
 * here; change this line to a #define to enable them.
 */
#undef EXT4FS_DEBUG

/*
 * Debug code
 *
 * ext4_debug(fmt, args...) - printf-style debug message helper.
 *
 * With EXT4FS_DEBUG defined, each use issues two printk(KERN_DEBUG ...)
 * calls: first a prefix carrying __FILE__, __LINE__ and __func__, then the
 * caller's message. The format is pasted directly after KERN_DEBUG, so it
 * must be a string literal.
 *
 * Without EXT4FS_DEBUG the macro expands to no_printk() with the same
 * arguments (no_printk() is defined elsewhere; presumably it only
 * type-checks the format and arguments without printing -- confirm).
 */
#ifdef EXT4FS_DEBUG
#define ext4_debug(f, a...)                                                \
        do {                                                                \
                printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:",        \
                        __FILE__, __LINE__, __func__);                        \
                printk(KERN_DEBUG f, ## a);                                \
        } while (0)
#else
#define ext4_debug(fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

 /*
  * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
  */
#define EXT_DEBUG__

/*
 * Dynamic printk for controlled extents debugging.
 *
 * ext_debug(ino, fmt, args...)
 *   @ino: pointer to the inode the message is about; its i_sb->s_id and
 *         i_ino are printed in the message prefix.
 *   @fmt: format string literal (it is concatenated onto the prefix).
 *
 * With CONFIG_EXT4_DEBUG the message is emitted through pr_debug() and is
 * prefixed with the current task's comm and pid, the superblock id, the
 * inode number and the source location. @ino is parenthesized in the
 * expansion so that any pointer-valued expression (e.g. &ei->vfs_inode)
 * can be passed safely; note that it appears twice in the expansion, so it
 * should not have side effects.
 *
 * Without CONFIG_EXT4_DEBUG the macro expands to no_printk() and @ino is
 * not evaluated at all.
 */
#ifdef CONFIG_EXT4_DEBUG
#define ext_debug(ino, fmt, ...)                                        \
        pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt,        \
                 current->comm, task_pid_nr(current),                        \
                 (ino)->i_sb->s_id, (ino)->i_ino, __FILE__, __LINE__,        \
                 __func__, ##__VA_ARGS__)
#else
#define ext_debug(ino, fmt, ...)        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * ASSERT() - stop via BUG() when a condition that must hold is false.
 *
 * On failure a KERN_EMERG message naming the calling function, the source
 * file and line, and the text of the failed expression is printed before
 * BUG() is invoked. The expression is evaluated exactly once.
 */
#define ASSERT(cond)                                                        \
do {                                                                        \
        if (unlikely(!(cond))) {                                        \
                printk(KERN_EMERG "Assertion failure in %s() at %s:%d: '%s'\n", \
                       __func__, __FILE__, __LINE__, #cond);                \
                BUG();                                                        \
        }                                                                \
} while (0)

/* Data type for a block offset within a block group (signed int). */
typedef int ext4_grpblk_t;

/* Data type for filesystem-wide (physical) block numbers. */
typedef unsigned long long ext4_fsblk_t;

/* Data type for file logical block numbers (32 bits wide). */
typedef __u32 ext4_lblk_t;

/* Data type for block group numbers. */
typedef unsigned int ext4_group_t;

/*
 * Direction selector for a shift operation.
 * NOTE(review): the users are outside this view; presumably this selects
 * which way extents are shifted -- confirm against the callers.
 */
enum SHIFT_DIRECTION {
        SHIFT_LEFT  = 0,
        SHIFT_RIGHT = 1,
};

/*
 * For each criteria, mballoc has a slightly different way of finding
 * the required blocks and, usually, the higher the criteria the slower
 * the allocation.  We start at lower criteria and keep falling back to
 * higher ones if we are not able to find any blocks.  Lower (earlier)
 * criteria are faster.
 *
 * The enumerators are numbered implicitly from 0 in declaration order,
 * so EXT4_MB_NUM_CRS (the last one) equals the number of criteria.
 */
enum criteria {
        /*
         * Used when number of blocks needed is a power of 2. This
         * doesn't trigger any disk IO except prefetch and is the
         * fastest criteria.
         */
        CR_POWER2_ALIGNED,

        /*
         * Tries to lookup in-memory data structures to find the most
         * suitable group that satisfies goal request. No disk IO
         * except block prefetch.
         */
        CR_GOAL_LEN_FAST,

        /*
         * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal
         * length to the best available length for faster allocation.
         */
        CR_BEST_AVAIL_LEN,

        /*
         * Reads each block group sequentially, performing disk IO if
         * necessary, to find a suitable block group. Tries to
         * allocate goal length but might trim the request if nothing
         * is found after enough tries.
         */
        CR_GOAL_LEN_SLOW,

        /*
         * Finds the first free set of blocks and allocates
         * those. This is only used in rare cases when
         * CR_GOAL_LEN_SLOW also fails to allocate anything.
         */
        CR_ANY_FREE,

        /*
         * Number of criteria defined.
         */
        EXT4_MB_NUM_CRS
};

/*
 * Flags used in mballoc's allocation_context flags field.
 *
 * Also used to show what's going on for debugging purposes when the
 * flag field is exported via the tracepoint interface.
 *
 * Each flag occupies its own bit, so flags can be OR-ed together.
 */

/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE                0x0001
/* blocks already reserved */
#define EXT4_MB_HINT_RESERVED                0x0002
/* metadata is being allocated */
#define EXT4_MB_HINT_METADATA                0x0004
/* first blocks in the file */
#define EXT4_MB_HINT_FIRST                0x0008
/* search for the best chunk */
#define EXT4_MB_HINT_BEST                0x0010
/* data is being allocated */
#define EXT4_MB_HINT_DATA                0x0020
/* don't preallocate (for tails) */
#define EXT4_MB_HINT_NOPREALLOC                0x0040
/* allocate for locality group */
#define EXT4_MB_HINT_GROUP_ALLOC        0x0080
/* allocate goal blocks or none */
#define EXT4_MB_HINT_GOAL_ONLY                0x0100
/* goal is meaningful */
#define EXT4_MB_HINT_TRY_GOAL                0x0200
/* blocks already pre-reserved by delayed allocation */
#define EXT4_MB_DELALLOC_RESERVED        0x0400
/* We are doing stream allocation */
#define EXT4_MB_STREAM_ALLOC                0x0800
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS                0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED                0x2000
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK                0x4000
/*
 * Large fragment size list lookup succeeded at least once for
 * CR_POWER2_ALIGNED
 */
#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED                0x8000
/*
 * Avg fragment size rb tree lookup succeeded at least once for
 * CR_GOAL_LEN_FAST
 */
#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED                0x00010000
/*
 * Avg fragment size rb tree lookup succeeded at least once for
 * CR_BEST_AVAIL_LEN
 */
#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED                0x00020000

/*
 * Parameters of one block allocation request: the inode and logical
 * position being allocated for, the wanted length, a physical goal hint,
 * and the nearest already-allocated neighbours on either side (both their
 * logical and physical block numbers).
 */
struct ext4_allocation_request {
        /* target inode for block we're allocating */
        struct inode *inode;
        /* how many blocks we want to allocate */
        unsigned int len;
        /* logical block in target inode */
        ext4_lblk_t logical;
        /* the closest logical allocated block to the left */
        ext4_lblk_t lleft;
        /* the closest logical allocated block to the right */
        ext4_lblk_t lright;
        /* phys. target (a hint) */
        ext4_fsblk_t goal;
        /* phys. block for the closest logical allocated block to the left */
        ext4_fsblk_t pleft;
        /* phys. block for the closest logical allocated block to the right */
        ext4_fsblk_t pright;
        /* flags. see above EXT4_MB_HINT_* */
        unsigned int flags;
};

/*
 * Logical to physical block mapping, used by ext4_map_blocks()
 *
 * This structure is used to pass requests into ext4_map_blocks() as
 * well as to store the information returned by ext4_map_blocks().  It
 * takes less room on the stack than a struct buffer_head.
 *
 * Each EXT4_MAP_* flag is the bit for the like-named BH_* bit number, and
 * EXT4_MAP_FLAGS is the union of all EXT4_MAP_* flags defined here.
 */
#define EXT4_MAP_NEW                BIT(BH_New)
#define EXT4_MAP_MAPPED                BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN        BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY        BIT(BH_Boundary)
#define EXT4_MAP_DELAYED        BIT(BH_Delay)
#define EXT4_MAP_FLAGS                (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
                                 EXT4_MAP_DELAYED)

/*
 * NOTE(review): the per-field notes below are inferred from the field names
 * and types only -- confirm against ext4_map_blocks().
 */
struct ext4_map_blocks {
        ext4_fsblk_t m_pblk;        /* physical block number */
        ext4_lblk_t m_lblk;        /* logical block number */
        unsigned int m_len;        /* length of the mapping, presumably in blocks */
        unsigned int m_flags;        /* presumably a combination of EXT4_MAP_* */
};

/*
 * Block validity checking, system zone rbtree.
 */
struct ext4_system_blocks {
        struct rb_root root;        /* root of the system zone rbtree */
        struct rcu_head rcu;        /* RCU head; presumably used for deferred freeing -- confirm */
};

/*
 * Flags for ext4_io_end->flag
 */
#define EXT4_IO_END_UNWRITTEN        0x0001
#define EXT4_IO_END_FAILED        0x0002

/* Mask covering both of the io_end flags above. */
#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED)

/*
 * One (offset, size) extent record; records are chained through 'list'
 * (struct ext4_io_end keeps a list of ext4_io_end_vec in its list_vec field).
 */
struct ext4_io_end_vec {
        struct list_head list;                /* list of io_end_vec */
        loff_t offset;                        /* offset in the file */
        ssize_t size;                        /* size of the extent */
};

/*
 * For converting unwritten extents on a work queue. 'handle' is used for
 * buffered writeback.
 */
typedef struct ext4_io_end {
        struct list_head        list;                /* per-file finished IO list */
        handle_t                *handle;        /* handle reserved for extent
                                                 * conversion */
        struct inode                *inode;                /* file being written to */
        struct bio                *bio;                /* Linked list of completed
                                                 * bios covering the extent */
        unsigned int                flag;                /* EXT4_IO_END_* flags (unwritten or not, ...) */
        refcount_t                count;                /* reference counter */
        struct list_head        list_vec;        /* list of ext4_io_end_vec */
} ext4_io_end_t;

/*
 * State carried while submitting I/O.
 * NOTE(review): the field roles below are inferred from names and types
 * only -- confirm against the users of this structure.
 */
struct ext4_io_submit {
        struct writeback_control *io_wbc;        /* writeback control for this submission */
        struct bio                *io_bio;        /* presumably the bio currently being built */
        ext4_io_end_t                *io_end;        /* io_end associated with the submission */
        sector_t                io_next_block;        /* presumably the next expected block -- confirm */
};

/*
 * Special inode numbers (1 through 8 are given fixed roles here)
 */
#define        EXT4_BAD_INO                 1        /* Bad blocks inode */
#define EXT4_ROOT_INO                 2        /* Root inode */
#define EXT4_USR_QUOTA_INO         3        /* User quota inode */
#define EXT4_GRP_QUOTA_INO         4        /* Group quota inode */
#define EXT4_BOOT_LOADER_INO         5        /* Boot loader inode */
#define EXT4_UNDEL_DIR_INO         6        /* Undelete directory inode */
#define EXT4_RESIZE_INO                 7        /* Reserved group descriptors inode */
#define EXT4_JOURNAL_INO         8        /* Journal inode */

/* First non-reserved inode for old ext4 filesystems */
#define EXT4_GOOD_OLD_FIRST_INO        11

/*
 * Maximal count of links to a file
 */
#define EXT4_LINK_MAX                65000

/*
 * Macro-instructions used to manage several block sizes
 *
 * In kernel builds (__KERNEL__) the argument s is dereferenced for
 * s_blocksize / s_blocksize_bits or handed to EXT4_SB(); otherwise the
 * values are derived from the s_log_block_size, s_rev_level, s_inode_size
 * and s_first_ino fields of s.
 *
 * EXT4_CLUSTER_BITS() and EXT4_ADDR_PER_BLOCK_BITS() are only defined for
 * kernel builds.
 */
#define EXT4_MIN_BLOCK_SIZE                1024
#define        EXT4_MAX_BLOCK_SIZE                65536
/* log2 of the minimum (1024) and maximum (65536) block sizes above */
#define EXT4_MIN_BLOCK_LOG_SIZE                10
#define EXT4_MAX_BLOCK_LOG_SIZE                16
#define EXT4_MAX_CLUSTER_LOG_SIZE        30
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE(s)                ((s)->s_blocksize)
#else
# define EXT4_BLOCK_SIZE(s)                (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size)
#endif
/* Number of 32-bit (__u32) block addresses that fit in one block */
#define        EXT4_ADDR_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / sizeof(__u32))
/* Block size shifted left by the per-filesystem s_cluster_bits */
#define EXT4_CLUSTER_SIZE(s)                (EXT4_BLOCK_SIZE(s) << \
                                         EXT4_SB(s)->s_cluster_bits)
#ifdef __KERNEL__
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_blocksize_bits)
# define EXT4_CLUSTER_BITS(s)                (EXT4_SB(s)->s_cluster_bits)
#else
# define EXT4_BLOCK_SIZE_BITS(s)        ((s)->s_log_block_size + 10)
#endif
#ifdef __KERNEL__
#define        EXT4_ADDR_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_addr_per_block_bits)
#define EXT4_INODE_SIZE(s)                (EXT4_SB(s)->s_inode_size)
#define EXT4_FIRST_INO(s)                (EXT4_SB(s)->s_first_ino)
#else
/* Old-revision filesystems use the fixed GOOD_OLD values instead of the fields */
#define EXT4_INODE_SIZE(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_INODE_SIZE : \
                                 (s)->s_inode_size)
#define EXT4_FIRST_INO(s)        (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \
                                 EXT4_GOOD_OLD_FIRST_INO : \
                                 (s)->s_first_ino)
#endif
/* Round @size up to a multiple of the block size (1 << @blkbits). */
#define EXT4_BLOCK_ALIGN(size, blkbits)                ALIGN((size), (1 << (blkbits)))
/*
 * Number of blocks needed to cover the byte range that starts at @offset
 * and is @size bytes long: the block-aligned end of the range, in blocks,
 * minus the block index of @offset.
 *
 * Every argument is parenthesized so that expressions built from
 * low-precedence operators (?:, |, &) are treated as a unit.
 * Note that @offset and @blkbits are evaluated more than once.
 */
#define EXT4_MAX_BLOCKS(size, offset, blkbits) \
        ((EXT4_BLOCK_ALIGN((size) + (offset), (blkbits)) >> (blkbits)) - \
         ((offset) >> (blkbits)))
/*
 * Convert byte @offset to a logical block number of @inode, rounding the
 * offset up to the inode's block size first.
 */
#define EXT4_B_TO_LBLK(inode, offset) \
        (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits)

/*
 * Block <-> cluster conversion helpers.
 * NOTE(review): these appear to assume s_cluster_ratio == 1 << s_cluster_bits
 * (i.e. a power of two) -- confirm where those fields are initialised.
 */
/* Translate a block number to a cluster number */
#define EXT4_B2C(sbi, blk)        ((blk) >> (sbi)->s_cluster_bits)
/* Translate a cluster number to a block number */
#define EXT4_C2B(sbi, cluster)        ((cluster) << (sbi)->s_cluster_bits)
/* Translate # of blks to # of clusters (adds ratio - 1 before shifting) */
#define EXT4_NUM_B2C(sbi, blks)        (((blks) + (sbi)->s_cluster_ratio - 1) >> \
                                 (sbi)->s_cluster_bits)
/* Mask out the low bits to get the starting block of the cluster */
#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &                                \
                                  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &                                \
                                  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
/* Fill in the low bits to get the last block of the cluster */
#define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) |                                \
                                    ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1))
/* Get the cluster offset (the low bits of a physical/logical block number) */
#define EXT4_PBLK_COFF(s, pblk) ((pblk) &                                \
                                 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
#define EXT4_LBLK_COFF(s, lblk) ((lblk) &                                \
                                 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))

/*
 * Structure of a block group descriptor
 *
 * Fields with a _lo/_hi suffix come in pairs; the _hi member holds the
 * most significant part ("MSB") of the same quantity.
 */
struct ext4_group_desc
{
        __le32        bg_block_bitmap_lo;        /* Blocks bitmap block */
        __le32        bg_inode_bitmap_lo;        /* Inodes bitmap block */
        __le32        bg_inode_table_lo;        /* Inodes table block */
        __le16        bg_free_blocks_count_lo;/* Free blocks count */
        __le16        bg_free_inodes_count_lo;/* Free inodes count */
        __le16        bg_used_dirs_count_lo;        /* Directories count */
        __le16        bg_flags;                /* EXT4_BG_flags (INODE_UNINIT, etc) */
        __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
        __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
        __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
        __le16  bg_itable_unused_lo;        /* Unused inodes count */
        __le16  bg_checksum;                /* crc16(sb_uuid+group+desc) */
        __le32        bg_block_bitmap_hi;        /* Blocks bitmap block MSB */
        __le32        bg_inode_bitmap_hi;        /* Inodes bitmap block MSB */
        __le32        bg_inode_table_hi;        /* Inodes table block MSB */
        __le16        bg_free_blocks_count_hi;/* Free blocks count MSB */
        __le16        bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16        bg_used_dirs_count_hi;        /* Directories count MSB */
        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
        __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
        __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
        __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
        __u32   bg_reserved;
};

/* Byte offset of the first byte after bg_inode_bitmap_csum_hi */
#define EXT4_BG_INODE_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
         sizeof(__le16))
/* Byte offset of the first byte after bg_block_bitmap_csum_hi */
#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END        \
        (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
         sizeof(__le16))

/*
 * Structure of a flex block group info
 *
 * All three members are atomic counters; free_clusters is 64 bits wide.
 */

struct flex_groups {
        atomic64_t        free_clusters;
        atomic_t        free_inodes;
        atomic_t        used_dirs;
};

/* Flag bits; the bg_flags field of struct ext4_group_desc refers to EXT4_BG_ flags */
#define EXT4_BG_INODE_UNINIT        0x0001 /* Inode table/bitmap not in use */
#define EXT4_BG_BLOCK_UNINIT        0x0002 /* Block bitmap not in use */
#define EXT4_BG_INODE_ZEROED        0x0004 /* On-disk itable initialized to zero */

/*
 * Macro-instructions used to manage group descriptors
 *
 * Kernel builds read the cached per-filesystem values through EXT4_SB();
 * non-kernel builds read the fields of s directly and compute
 * EXT4_DESC_PER_BLOCK() by division.  EXT4_CLUSTERS_PER_GROUP() and
 * EXT4_DESC_PER_BLOCK_BITS() are only defined for kernel builds.
 */
#define EXT4_MIN_DESC_SIZE                32
#define EXT4_MIN_DESC_SIZE_64BIT        64
/* The largest descriptor size equals the smallest block size */
#define        EXT4_MAX_DESC_SIZE                EXT4_MIN_BLOCK_SIZE
#define EXT4_DESC_SIZE(s)                (EXT4_SB(s)->s_desc_size)
#ifdef __KERNEL__
# define EXT4_BLOCKS_PER_GROUP(s)        (EXT4_SB(s)->s_blocks_per_group)
# define EXT4_CLUSTERS_PER_GROUP(s)        (EXT4_SB(s)->s_clusters_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_SB(s)->s_desc_per_block)
# define EXT4_INODES_PER_GROUP(s)        (EXT4_SB(s)->s_inodes_per_group)
# define EXT4_DESC_PER_BLOCK_BITS(s)        (EXT4_SB(s)->s_desc_per_block_bits)
#else
# define EXT4_BLOCKS_PER_GROUP(s)        ((s)->s_blocks_per_group)
# define EXT4_DESC_PER_BLOCK(s)                (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s))
# define EXT4_INODES_PER_GROUP(s)        ((s)->s_inodes_per_group)
#endif

/*
 * Constants relative to the data blocks
 *
 * EXT4_IND_BLOCK, EXT4_DIND_BLOCK and EXT4_TIND_BLOCK are the indices 12, 13
 * and 14 that follow the EXT4_NDIR_BLOCKS leading slots; EXT4_N_BLOCKS (15)
 * is the length of the i_block[] array in struct ext4_inode.
 */
#define        EXT4_NDIR_BLOCKS                12
#define        EXT4_IND_BLOCK                        EXT4_NDIR_BLOCKS
#define        EXT4_DIND_BLOCK                        (EXT4_IND_BLOCK + 1)
#define        EXT4_TIND_BLOCK                        (EXT4_DIND_BLOCK + 1)
#define        EXT4_N_BLOCKS                        (EXT4_TIND_BLOCK + 1)

/*
 * Inode flags
 *
 * Each EXT4_xxx_FL value is a single bit; the matching bit number is
 * EXT4_INODE_xxx in the enum further down, and ext4_check_flag_values()
 * verifies at build time that the two agree.
 */
#define        EXT4_SECRM_FL                        0x00000001 /* Secure deletion */
#define        EXT4_UNRM_FL                        0x00000002 /* Undelete */
#define        EXT4_COMPR_FL                        0x00000004 /* Compress file */
#define EXT4_SYNC_FL                        0x00000008 /* Synchronous updates */
#define EXT4_IMMUTABLE_FL                0x00000010 /* Immutable file */
#define EXT4_APPEND_FL                        0x00000020 /* writes to file may only append */
#define EXT4_NODUMP_FL                        0x00000040 /* do not dump file */
#define EXT4_NOATIME_FL                        0x00000080 /* do not update atime */
/* Reserved for compression usage... */
#define EXT4_DIRTY_FL                        0x00000100
#define EXT4_COMPRBLK_FL                0x00000200 /* One or more compressed clusters */
#define EXT4_NOCOMPR_FL                        0x00000400 /* Don't compress */
        /* nb: was previously EXT2_ECOMPR_FL */
#define EXT4_ENCRYPT_FL                        0x00000800 /* encrypted file */
/* End compression flags --- maybe not all used */
#define EXT4_INDEX_FL                        0x00001000 /* hash-indexed directory */
#define EXT4_IMAGIC_FL                        0x00002000 /* AFS directory */
#define EXT4_JOURNAL_DATA_FL                0x00004000 /* file data should be journaled */
#define EXT4_NOTAIL_FL                        0x00008000 /* file tail should not be merged */
#define EXT4_DIRSYNC_FL                        0x00010000 /* dirsync behaviour (directories only) */
#define EXT4_TOPDIR_FL                        0x00020000 /* Top of directory hierarchies*/
#define EXT4_HUGE_FILE_FL               0x00040000 /* Set to each huge file */
#define EXT4_EXTENTS_FL                        0x00080000 /* Inode uses extents */
#define EXT4_VERITY_FL                        0x00100000 /* Verity protected inode */
#define EXT4_EA_INODE_FL                0x00200000 /* Inode used for large EA */
/* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */

#define EXT4_DAX_FL                        0x02000000 /* Inode is DAX */

#define EXT4_INLINE_DATA_FL                0x10000000 /* Inode has inline data. */
#define EXT4_PROJINHERIT_FL                0x20000000 /* Create with parents projid */
#define EXT4_CASEFOLD_FL                0x40000000 /* Casefolded directory */
#define EXT4_RESERVED_FL                0x80000000 /* reserved for ext4 lib */

/* User modifiable flags */
#define EXT4_FL_USER_MODIFIABLE                (EXT4_SECRM_FL | \
                                         EXT4_UNRM_FL | \
                                         EXT4_COMPR_FL | \
                                         EXT4_SYNC_FL | \
                                         EXT4_IMMUTABLE_FL | \
                                         EXT4_APPEND_FL | \
                                         EXT4_NODUMP_FL | \
                                         EXT4_NOATIME_FL | \
                                         EXT4_JOURNAL_DATA_FL | \
                                         EXT4_NOTAIL_FL | \
                                         EXT4_DIRSYNC_FL | \
                                         EXT4_TOPDIR_FL | \
                                         EXT4_EXTENTS_FL | \
                                         0x00400000 /* EXT4_EOFBLOCKS_FL */ | \
                                         EXT4_DAX_FL | \
                                         EXT4_PROJINHERIT_FL | \
                                         EXT4_CASEFOLD_FL)

/* User visible flags: everything user modifiable plus the read-only ones below */
#define EXT4_FL_USER_VISIBLE                (EXT4_FL_USER_MODIFIABLE | \
                                         EXT4_DIRTY_FL | \
                                         EXT4_COMPRBLK_FL | \
                                         EXT4_NOCOMPR_FL | \
                                         EXT4_ENCRYPT_FL | \
                                         EXT4_INDEX_FL | \
                                         EXT4_VERITY_FL | \
                                         EXT4_INLINE_DATA_FL)

/* Flags that should be inherited by new inodes from their parent. */
#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
                           EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
                           EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\
                           EXT4_DAX_FL)

/* Flags that are appropriate for regular files (all but dir-specific ones). */
#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\
                           EXT4_PROJINHERIT_FL))

/* Flags that are appropriate for non-directories/regular files. */
#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)

/* The only flags that should be swapped */
#define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL)

/* Flags which are mutually exclusive to DAX */
#define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\
                           EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL)

/*
 * Mask out flags that are inappropriate for the given type of inode.
 *
 * @mode:  file mode; only its file type is examined
 * @flags: candidate EXT4_xxx_FL bits
 *
 * Directories keep every flag, regular files keep the bits in
 * EXT4_REG_FLMASK, and every other file type keeps only the bits in
 * EXT4_OTHER_FLMASK.  Returns the filtered flag word.
 */
static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
{
        if (S_ISDIR(mode))
                return flags;
        else if (S_ISREG(mode))
                return flags & EXT4_REG_FLMASK;
        else
                return flags & EXT4_OTHER_FLMASK;
}

/*
 * Inode flags used for atomic set/get
 *
 * These are bit numbers: EXT4_INODE_xxx == n corresponds to
 * EXT4_xxx_FL == (1U << n).
 */
enum {
        EXT4_INODE_SECRM        = 0,        /* Secure deletion */
        EXT4_INODE_UNRM                = 1,        /* Undelete */
        EXT4_INODE_COMPR        = 2,        /* Compress file */
        EXT4_INODE_SYNC                = 3,        /* Synchronous updates */
        EXT4_INODE_IMMUTABLE        = 4,        /* Immutable file */
        EXT4_INODE_APPEND        = 5,        /* writes to file may only append */
        EXT4_INODE_NODUMP        = 6,        /* do not dump file */
        EXT4_INODE_NOATIME        = 7,        /* do not update atime */
/* Reserved for compression usage... */
        EXT4_INODE_DIRTY        = 8,
        EXT4_INODE_COMPRBLK        = 9,        /* One or more compressed clusters */
        EXT4_INODE_NOCOMPR        = 10,        /* Don't compress */
        EXT4_INODE_ENCRYPT        = 11,        /* Encrypted file */
/* End compression flags --- maybe not all used */
        EXT4_INODE_INDEX        = 12,        /* hash-indexed directory */
        EXT4_INODE_IMAGIC        = 13,        /* AFS directory */
        EXT4_INODE_JOURNAL_DATA        = 14,        /* file data should be journaled */
        EXT4_INODE_NOTAIL        = 15,        /* file tail should not be merged */
        EXT4_INODE_DIRSYNC        = 16,        /* dirsync behaviour (directories only) */
        EXT4_INODE_TOPDIR        = 17,        /* Top of directory hierarchies*/
        EXT4_INODE_HUGE_FILE        = 18,        /* Set to each huge file */
        EXT4_INODE_EXTENTS        = 19,        /* Inode uses extents */
        EXT4_INODE_VERITY        = 20,        /* Verity protected inode */
        EXT4_INODE_EA_INODE        = 21,        /* Inode used for large EA */
/* 22 was formerly EXT4_INODE_EOFBLOCKS */
        EXT4_INODE_DAX                = 25,        /* Inode is DAX */
        EXT4_INODE_INLINE_DATA        = 28,        /* Data in inode. */
        EXT4_INODE_PROJINHERIT        = 29,        /* Create with parents projid */
        EXT4_INODE_CASEFOLD        = 30,        /* Casefolded directory */
        EXT4_INODE_RESERVED        = 31,        /* reserved for ext4 lib */
};

/*
 * Since it's pretty easy to mix up bit numbers and hex values, we use a
 * build-time check to make sure that EXT4_XXX_FL is consistent with respect to
 * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost
 * any extra space in the compiled kernel image, otherwise, the build will fail.
 * It's important that these values are the same, since we are using
 * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent
 * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk
 * values found in ext2, ext3 and ext4 filesystems, and of course the values
 * defined in e2fsprogs.
 *
 * It's not paranoia if the Murphy's Law really *is* out to get you.  :-)
 */
/* True when EXT4_<FLAG>_FL is exactly bit number EXT4_INODE_<FLAG>. */
#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
/* Break the build when the two definitions of FLAG disagree. */
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))

/*
 * Build-time consistency check: one CHECK_FLAG_VALUE() per entry of the
 * EXT4_INODE_xxx enum, in enum order.  Has no run-time effect.
 */
static inline void ext4_check_flag_values(void)
{
        CHECK_FLAG_VALUE(SECRM);
        CHECK_FLAG_VALUE(UNRM);
        CHECK_FLAG_VALUE(COMPR);
        CHECK_FLAG_VALUE(SYNC);
        CHECK_FLAG_VALUE(IMMUTABLE);
        CHECK_FLAG_VALUE(APPEND);
        CHECK_FLAG_VALUE(NODUMP);
        CHECK_FLAG_VALUE(NOATIME);
        CHECK_FLAG_VALUE(DIRTY);
        CHECK_FLAG_VALUE(COMPRBLK);
        CHECK_FLAG_VALUE(NOCOMPR);
        CHECK_FLAG_VALUE(ENCRYPT);
        CHECK_FLAG_VALUE(INDEX);
        CHECK_FLAG_VALUE(IMAGIC);
        CHECK_FLAG_VALUE(JOURNAL_DATA);
        CHECK_FLAG_VALUE(NOTAIL);
        CHECK_FLAG_VALUE(DIRSYNC);
        CHECK_FLAG_VALUE(TOPDIR);
        CHECK_FLAG_VALUE(HUGE_FILE);
        CHECK_FLAG_VALUE(EXTENTS);
        CHECK_FLAG_VALUE(VERITY);
        CHECK_FLAG_VALUE(EA_INODE);
        CHECK_FLAG_VALUE(DAX);
        CHECK_FLAG_VALUE(INLINE_DATA);
        CHECK_FLAG_VALUE(PROJINHERIT);
        CHECK_FLAG_VALUE(CASEFOLD);
        CHECK_FLAG_VALUE(RESERVED);
}

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * Layout of the new-group argument as seen through the 32-bit compat
 * interface; the 64-bit members use compat_u64.  Only built for kernels
 * with CONFIG_COMPAT.
 */
struct compat_ext4_new_group_input {
        u32 group;
        compat_u64 block_bitmap;
        compat_u64 inode_bitmap;
        compat_u64 inode_table;
        u32 blocks_count;
        u16 reserved_blocks;
        u16 unused;
};
#endif

/* The struct ext4_new_group_input in kernel space, with free_clusters_count */
struct ext4_new_group_data {
        __u32 group;
        __u64 block_bitmap;
        __u64 inode_bitmap;
        __u64 inode_table;
        __u32 blocks_count;
        __u16 reserved_blocks;
        __u16 mdata_blocks;
        __u32 free_clusters_count;
};

/* Indexes used to index group tables in ext4_new_group_data */
enum {
        BLOCK_BITMAP = 0,        /* block bitmap */
        INODE_BITMAP,                /* inode bitmap */
        INODE_TABLE,                /* inode tables */
        GROUP_TABLE_COUNT,        /* number of indexes above (3) */
};

/*
 * Flags used by ext4_map_blocks()
 *
 * Bit 0x0080 is not assigned in this list.
 */
        /* Allocate any needed blocks and/or convert an unwritten
           extent to be an initialized extent */
#define EXT4_GET_BLOCKS_CREATE                        0x0001
        /* Request the creation of an unwritten extent */
#define EXT4_GET_BLOCKS_UNWRIT_EXT                0x0002
#define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT        (EXT4_GET_BLOCKS_UNWRIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path
         * finally doing the actual allocation of delayed blocks */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
        /* Caller is from the direct IO path: request creation of an
         * unwritten extent if not allocated, or split the unwritten
         * extent if blocks have been preallocated already */
#define EXT4_GET_BLOCKS_PRE_IO                        0x0008
#define EXT4_GET_BLOCKS_CONVERT                        0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT                (EXT4_GET_BLOCKS_PRE_IO|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT                (EXT4_GET_BLOCKS_CONVERT|\
                                         EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
        /* Eventual metadata allocation (due to growing extent tree)
         * should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL                0x0020
        /* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE                0x0040
        /* Convert written extents to unwritten */
#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN        0x0100
        /* Write zeros to newly created written extents */
#define EXT4_GET_BLOCKS_ZERO                        0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO                (EXT4_GET_BLOCKS_CREATE |\
                                        EXT4_GET_BLOCKS_ZERO)
        /* Caller will submit data before dropping transaction handle. This
         * allows jbd2 to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT                0x0400
        /* Caller is in atomic context, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT                0x0800

/*
 * The bit position of these flags must not overlap with any of the
 * EXT4_GET_BLOCKS_*.  They are used by ext4_find_extent(),
 * read_extent_tree_block(), ext4_split_extent_at(),
 * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 * EXT4_EX_NOCACHE is used to indicate that we shouldn't be
 * caching the extents when reading from the extent tree while a
 * truncate or punch hole operation is in progress.
 */
#define EXT4_EX_NOCACHE                                0x40000000
#define EXT4_EX_FORCE_CACHE                        0x20000000
#define EXT4_EX_NOFAIL                                0x10000000

/*
 * Flags used by ext4_free_blocks
 *
 * Each flag is a distinct single bit (0x0001 through 0x0040).
 */
#define EXT4_FREE_BLOCKS_METADATA                0x0001
#define EXT4_FREE_BLOCKS_FORGET                        0x0002
#define EXT4_FREE_BLOCKS_VALIDATED                0x0004
#define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE                0x0008
#define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER        0x0010
#define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER        0x0020
#define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER      0x0040

#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
 * ioctl commands in 32 bit emulation
 *
 * All use the 'f' ioctl type; EXT4_IOC32_GROUP_ADD carries a
 * struct compat_ext4_new_group_input, and the *_OLD variants alias the
 * generic FS_IOC32_* numbers.
 */
#define EXT4_IOC32_GETVERSION                _IOR('f', 3, int)
#define EXT4_IOC32_SETVERSION                _IOW('f', 4, int)
#define EXT4_IOC32_GETRSVSZ                _IOR('f', 5, int)
#define EXT4_IOC32_SETRSVSZ                _IOW('f', 6, int)
#define EXT4_IOC32_GROUP_EXTEND                _IOW('f', 7, unsigned int)
#define EXT4_IOC32_GROUP_ADD                _IOW('f', 8, struct compat_ext4_new_group_input)
#define EXT4_IOC32_GETVERSION_OLD        FS_IOC32_GETVERSION
#define EXT4_IOC32_SETVERSION_OLD        FS_IOC32_SETVERSION
#endif

/* Max physical block we can address w/o extents (largest 32-bit value) */
#define EXT4_MAX_BLOCK_FILE_PHYS        0xFFFFFFFF

/* Max logical block we can support (one less than the largest 32-bit value) */
#define EXT4_MAX_LOGICAL_BLOCK                0xFFFFFFFE

/*
 * Structure of an inode on the disk
 *
 * osd1 and osd2 are OS-dependent unions with linux, hurd and masix
 * variants.  The members from i_extra_isize onwards are the extended
 * fields whose presence is tested with EXT4_FITS_IN_INODE().
 */
struct ext4_inode {
        __le16        i_mode;                /* File mode */
        __le16        i_uid;                /* Low 16 bits of Owner Uid */
        __le32        i_size_lo;        /* Size in bytes */
        __le32        i_atime;        /* Access time */
        __le32        i_ctime;        /* Inode Change time */
        __le32        i_mtime;        /* Modification time */
        __le32        i_dtime;        /* Deletion Time */
        __le16        i_gid;                /* Low 16 bits of Group Id */
        __le16        i_links_count;        /* Links count */
        __le32        i_blocks_lo;        /* Blocks count */
        __le32        i_flags;        /* File flags */
        union {
                struct {
                        __le32  l_i_version;
                } linux1;
                struct {
                        __u32  h_i_translator;
                } hurd1;
                struct {
                        __u32  m_i_reserved1;
                } masix1;
        } osd1;                                /* OS dependent 1 */
        __le32        i_block[EXT4_N_BLOCKS];/* Pointers to blocks */
        __le32        i_generation;        /* File version (for NFS) */
        __le32        i_file_acl_lo;        /* File ACL */
        __le32        i_size_high;        /* presumably the high 32 bits of the size -- confirm */
        __le32        i_obso_faddr;        /* Obsoleted fragment address */
        union {
                struct {
                        __le16        l_i_blocks_high; /* were l_i_reserved1 */
                        __le16        l_i_file_acl_high;
                        __le16        l_i_uid_high;        /* these 2 fields */
                        __le16        l_i_gid_high;        /* were reserved2[0] */
                        __le16        l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
                        __le16        l_i_reserved;
                } linux2;
                struct {
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __u16        h_i_mode_high;
                        __u16        h_i_uid_high;
                        __u16        h_i_gid_high;
                        __u32        h_i_author;
                } hurd2;
                struct {
                        /* note: carries the h_ prefix although it sits in the masix variant */
                        __le16        h_i_reserved1;        /* Obsoleted fragment number/size which are removed in ext4 */
                        __le16        m_i_file_acl_high;
                        __u32        m_i_reserved2[2];
                } masix2;
        } osd2;                                /* OS dependent 2 */
        __le16        i_extra_isize;
        __le16        i_checksum_hi;        /* crc32c(uuid+inum+inode) BE */
        __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
        __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
        __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
        __le32  i_crtime;       /* File Creation time */
        __le32  i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
        __le32  i_version_hi;        /* high 32 bits for 64-bit version */
        __le32        i_projid;        /* Project ID */
};

/*
 * Layout of the *_extra timestamp words: the low EXT4_EPOCH_BITS bits hold
 * the epoch, the remaining high bits hold the nanoseconds.
 */
#define EXT4_EPOCH_BITS 2
/* Mask selecting the epoch bits (the low two bits, value 3) */
#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1)
/* Mask selecting every bit above the epoch bits */
#define EXT4_NSEC_MASK  (~0UL << EXT4_EPOCH_BITS)

/*
 * Extended fields will fit into an inode if the filesystem was formatted
 * with large inodes (-I 256 or larger) and there are not currently any EAs
 * consuming all of the available space. For new inodes we always reserve
 * enough space for the kernel's known extended fields, but for inodes
 * created with an old kernel this might not have been the case. None of
 * the extended inode fields is critical for correct filesystem operation.
 * This macro checks if a certain field fits in the inode. Note that
 * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize
 *
 * @ext4_inode: pointer to the raw inode; only its type is used
 * @einode:     object providing i_extra_isize
 * @field:      member of *@ext4_inode to test
 *
 * Evaluates to true when the end of @field (offset + size) lies within
 * EXT4_GOOD_OLD_INODE_SIZE + (einode)->i_extra_isize bytes.
 */
#define EXT4_FITS_IN_INODE(ext4_inode, einode, field)        \
        ((offsetof(typeof(*(ext4_inode)), field) +        \
          sizeof((ext4_inode)->field))                        \
        <= (EXT4_GOOD_OLD_INODE_SIZE +                        \
            (einode)->i_extra_isize))

/*
 * We use an encoding that preserves the times for extra epoch "00":
 *
 * extra  msb of                         adjust for signed
 * epoch  32-bit                         32-bit tv_sec to
 * bits   time    decoded 64-bit tv_sec  64-bit tv_sec      valid time range
 * 0 0    1    -0x80000000..-0x00000001  0x000000000 1901-12-13..1969-12-31
 * 0 0    0    0x000000000..0x07fffffff  0x000000000 1970-01-01..2038-01-19
 * 0 1    1    0x080000000..0x0ffffffff  0x100000000 2038-01-19..2106-02-07
 * 0 1    0    0x100000000..0x17fffffff  0x100000000 2106-02-07..2174-02-25
 * 1 0    1    0x180000000..0x1ffffffff  0x200000000 2174-02-25..2242-03-16
 * 1 0    0    0x200000000..0x27fffffff  0x200000000 2242-03-16..2310-04-04
 * 1 1    1    0x280000000..0x2ffffffff  0x300000000 2310-04-04..2378-04-22
 * 1 1    0    0x300000000..0x37fffffff  0x300000000 2378-04-22..2446-05-10
 *
 * Note that previous versions of the kernel on 64-bit systems would
 * incorrectly use extra epoch bits 1,1 for dates between 1901 and
 * 1970.  e2fsck will correct this, assuming that it is run on the
 * affected filesystem before 2242.
 */

/*
 * Pack the "extra" on-disk timestamp word for @ts: the epoch bits go in
 * the low EXT4_EPOCH_BITS bits and tv_nsec is shifted above them.  The
 * result is returned in little-endian form.
 */
static inline __le32 ext4_encode_extra_time(struct timespec64 ts)
{
        u32 epoch_bits;
        u32 nsec_bits = ts.tv_nsec << EXT4_EPOCH_BITS;

        /* tv_sec minus its sign-extended low 32 bits leaves only bits >= 32 */
        epoch_bits = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;
        return cpu_to_le32(epoch_bits | nsec_bits);
}

/*
 * Rebuild a timespec64 from the two on-disk words of a timestamp.
 *
 * @base:  little-endian 32-bit seconds, interpreted as signed
 * @extra: little-endian word holding the epoch bits (EXT4_EPOCH_MASK) and
 *         the nanoseconds (EXT4_NSEC_MASK, shifted by EXT4_EPOCH_BITS)
 */
static inline struct timespec64 ext4_decode_extra_time(__le32 base,
                                                       __le32 extra)
{
        u32 extra_cpu = le32_to_cpu(extra);
        u32 epoch = extra_cpu & EXT4_EPOCH_MASK;
        struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) };

        /* Non-zero epoch bits are the rare case; they extend tv_sec past 32 bits. */
        if (unlikely(epoch))
                ts.tv_sec += (u64)epoch << 32;

        ts.tv_nsec = (extra_cpu & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS;
        return ts;
}

/*
 * Store timestamp @ts into the raw inode member @xtime.
 *
 * When the matching @xtime_extra member fits in the inode, the low 32 bits
 * of tv_sec go to @xtime and the epoch/nanosecond word goes to @xtime_extra.
 * Otherwise only @xtime is written, with tv_sec clamped to the signed 32-bit
 * range.  The clamp is done in s64: clamping in int32_t would truncate
 * tv_sec before the comparison and let out-of-range values wrap.
 *
 * Note: @ts is evaluated more than once.
 */
#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts)                        \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {        \
                (raw_inode)->xtime = cpu_to_le32((ts).tv_sec);                        \
                (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts);        \
        } else {                                                                \
                (raw_inode)->xtime = cpu_to_le32(                                \
                        clamp_t(s64, (ts).tv_sec, S32_MIN, S32_MAX));                \
        }                                                                        \
} while (0)

/*
 * Write one of the VFS inode's timestamps into the raw inode.  Each wrapper
 * reads the value with the matching inode_get_{a,m,c}time() accessor and
 * hands it to EXT4_INODE_SET_XTIME_VAL for the corresponding raw field.
 */
#define EXT4_INODE_SET_ATIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode))

#define EXT4_INODE_SET_MTIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode))

#define EXT4_INODE_SET_CTIME(inode, raw_inode)                                                \
        EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode))

/*
 * Store (einode)->xtime into the raw inode, but only when the raw member
 * @xtime itself fits in the inode.  Wrapped in do { } while (0) so the macro
 * is a single statement and cannot capture an "else" that follows its use.
 */
#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode)                                \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                        \
                EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode),                \
                                         raw_inode, (einode)->xtime);                \
} while (0)

/*
 * Evaluate to the struct timespec64 stored in raw inode member @xtime.
 *
 * When @xtime_extra fits in the inode, both words are decoded with
 * ext4_decode_extra_time().  Otherwise only tv_sec is set, from the signed
 * 32-bit seconds value; the remaining members of the compound literal are
 * left zero-initialized.
 */
#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode)                        \
        (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ?        \
                ext4_decode_extra_time((raw_inode)->xtime,                                \
                                       (raw_inode)->xtime ## _extra) :                \
                (struct timespec64) {                                                \
                        .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime)        \
                })

/*
 * Load one timestamp from the raw inode into the VFS inode.  Each wrapper
 * decodes the raw field with EXT4_INODE_GET_XTIME_VAL and passes the result
 * to the matching inode_set_{a,m,c}time_to_ts() setter.
 */
#define EXT4_INODE_GET_ATIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_atime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode));                \
} while (0)

#define EXT4_INODE_GET_MTIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_mtime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode));                \
} while (0)

#define EXT4_INODE_GET_CTIME(inode, raw_inode)                                        \
do {                                                                                \
        inode_set_ctime_to_ts(inode,                                                \
                EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode));                \
} while (0)

/*
 * Load raw inode member @xtime into (einode)->xtime.  When the raw member
 * does not fit in the inode, (einode)->xtime is reset to zero instead.
 * @einode is parenthesized at every use so pointer expressions are safe.
 */
#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode)                                \
do {                                                                                \
        if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime))                         \
                (einode)->xtime =                                                \
                        EXT4_INODE_GET_XTIME_VAL(xtime, &((einode)->vfs_inode),        \
                                                 raw_inode);                        \
        else                                                                        \
                (einode)->xtime = (struct timespec64){0, 0};                        \
} while (0)

/*
 * Short names for members nested inside the osd1/osd2 parts of the raw
 * inode.  Which aliases exist depends on the build target: the Linux set
 * when __KERNEL__ or __linux__ is defined, otherwise the Hurd (__GNU__) or
 * Masix (__masix__) set.  i_disk_version is defined unconditionally.
 */
#define i_disk_version osd1.linux1.l_i_version

#if defined(__KERNEL__) || defined(__linux__)
#define i_reserved1        osd1.linux1.l_i_reserved1
#define i_file_acl_high        osd2.linux2.l_i_file_acl_high
#define i_blocks_high        osd2.linux2.l_i_blocks_high
/* The low halves are plain top-level members, not osd fields. */
#define i_uid_low        i_uid
#define i_gid_low        i_gid
#define i_uid_high        osd2.linux2.l_i_uid_high
#define i_gid_high        osd2.linux2.l_i_gid_high
#define i_checksum_lo        osd2.linux2.l_i_checksum_lo

#elif defined(__GNU__)

#define i_translator        osd1.hurd1.h_i_translator
#define i_uid_high        osd2.hurd2.h_i_uid_high
#define i_gid_high        osd2.hurd2.h_i_gid_high
#define i_author        osd2.hurd2.h_i_author

#elif defined(__masix__)

#define i_reserved1        osd1.masix1.m_i_reserved1
#define i_file_acl_high        osd2.masix2.m_i_file_acl_high
#define i_reserved2        osd2.masix2.m_i_reserved2

#endif /* defined(__KERNEL__) || defined(__linux__) */

#include "extents_status.h"
#include "fast_commit.h"

/*
 * Lock subclasses for i_data_sem in the ext4_inode_info structure.
 *
 * These are needed to avoid lockdep false positives when we need to
 * allocate blocks to the quota inode during ext4_map_blocks(), while
 * holding i_data_sem for a normal (non-quota) inode.  Since we don't
 * do quota tracking for the quota inode, this avoids deadlock (as
 * well as infinite recursion, since it isn't turtles all the way
 * down...)
 *
 *  I_DATA_SEM_NORMAL - Used for most inodes
 *  I_DATA_SEM_OTHER  - Used by move_extent.c for the second normal inode
 *                          where the second inode has larger inode number
 *                          than the first
 *  I_DATA_SEM_QUOTA  - Used for quota inodes only
 *  I_DATA_SEM_EA     - Used for ea_inodes only
 */
/* Lock subclass values for i_data_sem, numbered explicitly. */
enum {
        I_DATA_SEM_NORMAL = 0,        /* most inodes */
        I_DATA_SEM_OTHER  = 1,        /* second normal inode of a pair */
        I_DATA_SEM_QUOTA  = 2,        /* quota inodes only */
        I_DATA_SEM_EA     = 3,        /* ea_inodes only */
};


/*
 * fourth extended file system inode data in memory
 */
struct ext4_inode_info {
        __le32        i_data[15];        /* unconverted */
        __u32        i_dtime;
        ext4_fsblk_t        i_file_acl;

        /*
         * i_block_group is the number of the block group which contains
         * this file's inode.  Constant across the lifetime of the inode,
         * it is used for making block allocation decisions - we try to
         * place a file's data blocks near its inode block, and new inodes
         * near to their parent directory's inode.
         */
        ext4_group_t        i_block_group;
        ext4_lblk_t        i_dir_start_lookup;
        /* Separate field only when long is narrower than 64 bits. */
#if (BITS_PER_LONG < 64)
        unsigned long        i_state_flags;                /* Dynamic state flags */
#endif
        unsigned long        i_flags;

        /*
         * Extended attributes can be read independently of the main file
         * data. Taking i_rwsem even when reading would cause contention
         * between readers of EAs and writers of regular file data, so
         * instead we synchronize on xattr_sem when reading or changing
         * EAs.
         */
        struct rw_semaphore xattr_sem;

        /*
         * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise
         * i_orphan is used.
         */
        union {
                struct list_head i_orphan;        /* unlinked but open inodes */
                unsigned int i_orphan_idx;        /* Index in orphan file */
        };

        /* Fast commit related info */

        /* For tracking dentry create updates */
        struct list_head i_fc_dilist;
        struct list_head i_fc_list;        /*
                                         * inodes that need fast commit
                                         * protected by sbi->s_fc_lock.
                                         */

        /* Start of lblk range that needs to be committed in this fast commit */
        ext4_lblk_t i_fc_lblk_start;

        /*
         * End of lblk range that needs to be committed in this fast commit.
         * NOTE(review): the field name suggests a length rather than an end
         * offset - confirm against the users of this field.
         */
        ext4_lblk_t i_fc_lblk_len;

        /* Number of ongoing updates on this inode */
        atomic_t  i_fc_updates;

        spinlock_t i_raw_lock;        /* protects updates to the raw inode */

        /* Fast commit wait queue for this inode */
        wait_queue_head_t i_fc_wait;

        /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
        struct mutex i_fc_lock;

        /*
         * i_disksize keeps track of what the inode size is ON DISK, not
         * in memory.  During truncate, i_size is set to the new size by
         * the VFS prior to calling ext4_truncate(), but the filesystem won't
         * set i_disksize to 0 until the truncate is actually under way.
         *
         * The intent is that i_disksize always represents the blocks which
         * are used by this file.  This allows recovery to restart truncate
         * on orphans if we crash during truncate.  We actually write i_disksize
         * into the on-disk inode when writing inodes out, instead of i_size.
         *
         * The only time when i_disksize and i_size may be different is when
         * a truncate is in progress.  The only things which change i_disksize
         * are ext4_get_block (growth) and ext4_truncate (shrinkth).
         */
        loff_t        i_disksize;

        /*
         * i_data_sem is for serialising ext4_truncate() against
         * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
         * data tree are chopped off during truncate. We can't do that in
         * ext4 because whenever we perform intermediate commits during
         * truncate, the inode and all the metadata blocks *must* be in a
         * consistent state which allows truncation of the orphans to restart
         * during recovery.  Hence we must fix the get_block-vs-truncate race
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
        /* Embedded VFS inode; the xtime macros reach it as (einode)->vfs_inode. */
        struct inode vfs_inode;
        struct jbd2_inode *jinode;

        /*
         * File creation time. Its function is same as that of
         * struct timespec64 i_{a,c,m}time in the generic inode.
         */
        struct timespec64 i_crtime;

        /* mballoc */
        atomic_t i_prealloc_active;

        /* allocation reservation info for delalloc */
        /* In case of bigalloc, this refers to clusters rather than blocks */
        unsigned int i_reserved_data_blocks;
        struct rb_root i_prealloc_node;
        rwlock_t i_prealloc_lock;

        /* extents status tree */
        struct ext4_es_tree i_es_tree;
        rwlock_t i_es_lock;
        struct list_head i_es_list;
        unsigned int i_es_all_nr;        /* protected by i_es_lock */
        unsigned int i_es_shk_nr;        /* protected by i_es_lock */
        ext4_lblk_t i_es_shrink_lblk;        /* Offset where we start searching for
                                           extents to shrink. Protected by
                                           i_es_lock  */

        /* ialloc */
        ext4_group_t        i_last_alloc_group;

        /* pending cluster reservations for bigalloc file systems */
        struct ext4_pending_tree i_pending_tree;

        /*
         * on-disk additional length; EXT4_FITS_IN_INODE adds it to
         * EXT4_GOOD_OLD_INODE_SIZE to decide whether a raw field is present
         */
        __u16 i_extra_isize;

        /* Indicate the inline data space. */
        u16 i_inline_off;
        u16 i_inline_size;

#ifdef CONFIG_QUOTA
        /* quota space reservation, managed internally by quota code */
        qsize_t i_reserved_quota;
#endif
        spinlock_t i_block_reservation_lock;

        /* Lock protecting lists below */
        spinlock_t i_completed_io_lock;
        /*
         * Completed IOs that need unwritten extents handling and have
         * transaction reserved
         */
        struct list_head i_rsv_conversion_list;
        struct work_struct i_rsv_conversion_work;

        /*
         * Transactions that contain inode's metadata needed to complete
         * fsync and fdatasync, respectively.
         */
        tid_t i_sync_tid;
        tid_t i_datasync_tid;

#ifdef CONFIG_QUOTA
        struct dquot __rcu *i_dquot[MAXQUOTAS];
#endif

        /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
        __u32 i_csum_seed;

        kprojid_t i_projid;
};

/*
 * File system states
 */
/* Each state is a distinct bit, so several can be set at once. */
#define        EXT4_VALID_FS                        0x0001        /* Unmounted cleanly */
#define        EXT4_ERROR_FS                        0x0002        /* Errors detected */
#define        EXT4_ORPHAN_FS                        0x0004        /* Orphans being recovered */
#define EXT4_FC_REPLAY                        0x0020        /* Fast commit replay ongoing */

/*
 * Misc. filesystem flags
 */
/* Bit values; note these keep the EXT2_ prefix. */
#define EXT2_FLAGS_SIGNED_HASH                0x0001  /* Signed dirhash in use */
#define EXT2_FLAGS_UNSIGNED_HASH        0x0002  /* Unsigned dirhash in use */
#define EXT2_FLAGS_TEST_FILESYS                0x0004        /* to test development code */

/*
 * Mount flags set via mount options or defaults
 */
/*
 * Bits for s_mount_opt; the set_opt()/clear_opt()/test_opt() macros build
 * these names by pasting EXT4_MOUNT_ in front of the option name.
 */
#define EXT4_MOUNT_NO_MBCACHE                0x00001 /* Do not use mbcache */
#define EXT4_MOUNT_GRPID                0x00004        /* Create files with directory's group */
#define EXT4_MOUNT_DEBUG                0x00008        /* Some debugging messages */
#define EXT4_MOUNT_ERRORS_CONT                0x00010        /* Continue on errors */
#define EXT4_MOUNT_ERRORS_RO                0x00020        /* Remount fs ro on errors */
#define EXT4_MOUNT_ERRORS_PANIC                0x00040        /* Panic on errors */
/* Union of the three EXT4_MOUNT_ERRORS_* bits above */
#define EXT4_MOUNT_ERRORS_MASK                0x00070
#define EXT4_MOUNT_MINIX_DF                0x00080        /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD                0x00100        /* Don't use existing journal*/
/* Without CONFIG_FS_DAX the flag is 0, so testing it is always false. */
#ifdef CONFIG_FS_DAX
#define EXT4_MOUNT_DAX_ALWAYS                0x00200        /* Direct Access */
#else
#define EXT4_MOUNT_DAX_ALWAYS                0
#endif
/*
 * The data mode is a two-bit field: DATA_FLAGS is its mask, and
 * WRITEBACK_DATA has the same value as the mask (both bits set).
 */
#define EXT4_MOUNT_DATA_FLAGS                0x00C00        /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA                0x00400        /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA                0x00800        /* Flush data before commit */
#define EXT4_MOUNT_WRITEBACK_DATA        0x00C00        /* No data ordering */
#define EXT4_MOUNT_UPDATE_JOURNAL        0x01000        /* Update the journal format */
#define EXT4_MOUNT_NO_UID32                0x02000  /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER                0x04000        /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL                0x08000        /* POSIX Access Control Lists */
#define EXT4_MOUNT_NO_AUTO_DA_ALLOC        0x10000        /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER                0x20000 /* Use block barriers */
#define EXT4_MOUNT_QUOTA                0x40000 /* Some quota option set */
#define EXT4_MOUNT_USRQUOTA                0x80000 /* "old" user quota,
                                                 * enable enforcement for hidden
                                                 * quota files */
#define EXT4_MOUNT_GRPQUOTA                0x100000 /* "old" group quota, enable
                                                  * enforcement for hidden quota
                                                  * files */
#define EXT4_MOUNT_PRJQUOTA                0x200000 /* Enable project quota
                                                  * enforcement */
#define EXT4_MOUNT_DIOREAD_NOLOCK        0x400000 /* Enable support for dio read nolocking */
#define EXT4_MOUNT_JOURNAL_CHECKSUM        0x800000 /* Journal checksums */
#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
#define EXT4_MOUNT_WARN_ON_ERROR        0x2000000 /* Trigger WARN_ON on error */
#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000
#define EXT4_MOUNT_DELALLOC                0x8000000 /* Delalloc support */
#define EXT4_MOUNT_DATA_ERR_ABORT        0x10000000 /* Abort on file data write */
#define EXT4_MOUNT_BLOCK_VALIDITY        0x20000000 /* Block validity checking */
#define EXT4_MOUNT_DISCARD                0x40000000 /* Issue DISCARD requests */
/* Highest bit of a 32-bit word: this is the last flag that fits. */
#define EXT4_MOUNT_INIT_INODE_TABLE        0x80000000 /* Initialize uninitialized itables */

/*
 * Mount flags set either automatically (could not be set by mount option)
 * based on per file system feature or property or in special cases such as
 * distinguishing between explicit mount option definition and default.
 */
/*
 * Bits for s_mount_opt2; the set_opt2()/clear_opt2()/test_opt2() macros
 * build these names by pasting EXT4_MOUNT2_ in front of the option name.
 */
#define EXT4_MOUNT2_EXPLICIT_DELALLOC        0x00000001 /* User explicitly
                                                      specified delalloc */
#define EXT4_MOUNT2_STD_GROUP_SIZE        0x00000002 /* We have standard group
                                                      size of blocksize * 8
                                                      blocks */
#define EXT4_MOUNT2_HURD_COMPAT                0x00000004 /* Support HURD-castrated
                                                      file systems */
#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM        0x00000008 /* User explicitly
                                                specified journal checksum */

#define EXT4_MOUNT2_JOURNAL_FAST_COMMIT        0x00000010 /* Journal fast commit */
#define EXT4_MOUNT2_DAX_NEVER                0x00000020 /* Do not allow Direct Access */
#define EXT4_MOUNT2_DAX_INODE                0x00000040 /* For printing options only */
#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN        0x00000080 /* Optimize group
                                                    * scanning in mballoc
                                                    */
#define EXT4_MOUNT2_ABORT                0x00000100 /* Abort filesystem */

/*
 * Mount-option helpers.  @opt is the option name without its prefix:
 * the unsuffixed macros paste it onto EXT4_MOUNT_ and operate on
 * EXT4_SB(sb)->s_mount_opt, the "2" variants paste it onto EXT4_MOUNT2_
 * and operate on EXT4_SB(sb)->s_mount_opt2.
 *
 * Every expansion is fully parenthesized, so the set/clear forms can be
 * used safely inside larger expressions (for example as an operand of ?:)
 * as well as in plain statements.
 */
#define clear_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt &= \
                                                ~EXT4_MOUNT_##opt)
#define set_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt |= \
                                                EXT4_MOUNT_##opt)
#define test_opt(sb, opt)                (EXT4_SB(sb)->s_mount_opt & \
                                         EXT4_MOUNT_##opt)

#define clear_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 &= \
                                                ~EXT4_MOUNT2_##opt)
#define set_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 |= \
                                                EXT4_MOUNT2_##opt)
#define test_opt2(sb, opt)                (EXT4_SB(sb)->s_mount_opt2 & \
                                         EXT4_MOUNT2_##opt)

/*
 * ext4 names for the little-endian ("_le") bitmap helpers.  The set/clear
 * and test-and-set/clear aliases map to the double-underscore variants.
 */
#define ext4_test_and_set_bit                __test_and_set_bit_le
#define ext4_set_bit                        __set_bit_le
#define ext4_test_and_clear_bit                __test_and_clear_bit_le
#define ext4_clear_bit                        __clear_bit_le
#define ext4_test_bit                        test_bit_le
#define ext4_find_next_zero_bit                find_next_zero_bit_le
#define ext4_find_next_bit                find_next_bit_le

/*
 * Defined outside this header.
 * NOTE(review): presumably sets @len bits starting at bit @cur in the
 * bitmap @bm - confirm against the definition.
 */
extern void mb_set_bits(void *bm, int cur, int len);

/*
 * Maximal mount counts between two filesystem checks
 */
#define EXT4_DFL_MAX_MNT_COUNT                20        /* Allow 20 mounts */
#define EXT4_DFL_CHECKINTERVAL                0        /* Don't use interval check */

/*
 * Behaviour when detecting errors
 */
#define EXT4_ERRORS_CONTINUE                1        /* Continue execution */
#define EXT4_ERRORS_RO                        2        /* Remount fs read-only */
#define EXT4_ERRORS_PANIC                3        /* Panic */
/* Behaviour used when none is specified: continue */
#define EXT4_ERRORS_DEFAULT                EXT4_ERRORS_CONTINUE

/* Metadata checksum algorithm codes */
#define EXT4_CRC32C_CHKSUM                1

/* Size in bytes of the superblock's s_volume_name array */
#define EXT4_LABEL_MAX                        16

/*
 * Structure of the super block
 */
/*
 * The slash-star hex markers in the left margin (00, 10, ... 150) annotate
 * the byte offset of the member that follows them.
 */
struct ext4_super_block {
/*00*/        __le32        s_inodes_count;                /* Inodes count */
        __le32        s_blocks_count_lo;        /* Blocks count */
        __le32        s_r_blocks_count_lo;        /* Reserved blocks count */
        __le32        s_free_blocks_count_lo;        /* Free blocks count */
/*10*/        __le32        s_free_inodes_count;        /* Free inodes count */
        __le32        s_first_data_block;        /* First Data Block */
        __le32        s_log_block_size;        /* Block size */
        __le32        s_log_cluster_size;        /* Allocation cluster size */
/*20*/        __le32        s_blocks_per_group;        /* # Blocks per group */
        __le32        s_clusters_per_group;        /* # Clusters per group */
        __le32        s_inodes_per_group;        /* # Inodes per group */
        __le32        s_mtime;                /* Mount time */
/*30*/        __le32        s_wtime;                /* Write time */
        __le16        s_mnt_count;                /* Mount count */
        __le16        s_max_mnt_count;        /* Maximal mount count */
        __le16        s_magic;                /* Magic signature */
        __le16        s_state;                /* File system state */
        __le16        s_errors;                /* Behaviour when detecting errors */
        __le16        s_minor_rev_level;        /* minor revision level */
/*40*/        __le32        s_lastcheck;                /* time of last check */
        __le32        s_checkinterval;        /* max. time between checks */
        __le32        s_creator_os;                /* OS */
        __le32        s_rev_level;                /* Revision level */
/*50*/        __le16        s_def_resuid;                /* Default uid for reserved blocks */
        __le16        s_def_resgid;                /* Default gid for reserved blocks */
        /*
         * These fields are for EXT4_DYNAMIC_REV superblocks only.
         *
         * Note: the difference between the compatible feature set and
         * the incompatible feature set is that if there is a bit set
         * in the incompatible feature set that the kernel doesn't
         * know about, it should refuse to mount the filesystem.
         *
         * e2fsck's requirements are more strict; if it doesn't know
         * about a feature in either the compatible or incompatible
         * feature set, it must abort and not try to meddle with
         * things it doesn't understand...
         */
        __le32        s_first_ino;                /* First non-reserved inode */
        __le16  s_inode_size;                /* size of inode structure */
        __le16        s_block_group_nr;        /* block group # of this superblock */
        __le32        s_feature_compat;        /* compatible feature set */
/*60*/        __le32        s_feature_incompat;        /* incompatible feature set */
        __le32        s_feature_ro_compat;        /* readonly-compatible feature set */
/*68*/        __u8        s_uuid[16];                /* 128-bit uuid for volume */
/*78*/        char        s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */
/*88*/        char        s_last_mounted[64] __nonstring;        /* directory where last mounted */
/*C8*/        __le32        s_algorithm_usage_bitmap; /* For compression */
        /*
         * Performance hints.  Directory preallocation should only
         * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
         */
        __u8        s_prealloc_blocks;        /* Nr of blocks to try to preallocate*/
        __u8        s_prealloc_dir_blocks;        /* Nr to preallocate for dirs */
        __le16        s_reserved_gdt_blocks;        /* Per group desc for online growth */
        /*
         * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set.
         */
/*D0*/        __u8        s_journal_uuid[16];        /* uuid of journal superblock */
/*E0*/        __le32        s_journal_inum;                /* inode number of journal file */
        __le32        s_journal_dev;                /* device number of journal file */
        __le32        s_last_orphan;                /* start of list of inodes to delete */
        __le32        s_hash_seed[4];                /* HTREE hash seed */
        __u8        s_def_hash_version;        /* Default hash version to use */
        __u8        s_jnl_backup_type;
        __le16  s_desc_size;                /* size of group descriptor */
/*100*/        __le32        s_default_mount_opts;
        __le32        s_first_meta_bg;        /* First metablock block group */
        __le32        s_mkfs_time;                /* When the filesystem was created */
        __le32        s_jnl_blocks[17];        /* Backup of the journal inode */
        /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */
/*150*/        __le32        s_blocks_count_hi;        /* Blocks count */
        __le32        s_r_blocks_count_hi;        /* Reserved blocks count */
        __le32        s_free_blocks_count_hi;        /* Free blocks count */
        __le16        s_min_extra_isize;        /* All inodes have at least # bytes */
        __le16        s_want_extra_isize;         /* New inodes should reserve # bytes */
        __le32        s_flags;                /* Miscellaneous flags */
        __le16  s_raid_stride;                /* RAID stride */
        __le16  s_mmp_update_interval;  /* # seconds to wait in MMP checking */
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8        s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8        s_checksum_type;        /* metadata checksum algorithm used */
        __u8        s_encryption_level;        /* versioning level for encryption */
        __u8        s_reserved_pad;                /* Padding to next 32bits */
        __le64        s_kbytes_written;        /* nr of lifetime kilobytes written */
        __le32        s_snapshot_inum;        /* Inode number of active snapshot */
        __le32        s_snapshot_id;                /* sequential ID of active snapshot */
        __le64        s_snapshot_r_blocks_count; /* reserved blocks for active
                                              snapshot's future use */
        __le32        s_snapshot_list;        /* inode number of the head of the
                                           on-disk snapshot list */
        /* Byte offset of the first error-tracking member (s_error_count) */
#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count)
        __le32        s_error_count;                /* number of fs errors */
        __le32        s_first_error_time;        /* first time an error happened */
        __le32        s_first_error_ino;        /* inode involved in first error */
        __le64        s_first_error_block;        /* block involved of first error */
        __u8        s_first_error_func[32] __nonstring;        /* function where the error happened */
        __le32        s_first_error_line;        /* line number where error happened */
        __le32        s_last_error_time;        /* most recent time of an error */
        __le32        s_last_error_ino;        /* inode involved in last error */
        __le32        s_last_error_line;        /* line number where error happened */
        __le64        s_last_error_block;        /* block involved of last error */
        __u8        s_last_error_func[32] __nonstring;        /* function where the error happened */
        /* Byte offset just past the error-tracking members (start of s_mount_opts) */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
        __u8        s_mount_opts[64];
        __le32        s_usr_quota_inum;        /* inode for tracking user quota */
        __le32        s_grp_quota_inum;        /* inode for tracking group quota */
        __le32        s_overhead_clusters;        /* overhead blocks/clusters in fs */
        __le32        s_backup_bgs[2];        /* groups with sparse_super2 SBs */
        __u8        s_encrypt_algos[4];        /* Encryption algorithms in use  */
        __u8        s_encrypt_pw_salt[16];        /* Salt used for string2key algorithm */
        __le32        s_lpf_ino;                /* Location of the lost+found inode */
        __le32        s_prj_quota_inum;        /* inode for tracking project quota */
        __le32        s_checksum_seed;        /* crc32c(uuid) if csum_seed set */
        /*
         * NOTE(review): the one-byte *_hi members below look like upper bits
         * extending the 32-bit time fields of the same name, and the
         * *_errcode members like codes paired with the first/last error
         * records - confirm against the code that reads them.
         */
        __u8        s_wtime_hi;
        __u8        s_mtime_hi;
        __u8        s_mkfs_time_hi;
        __u8        s_lastcheck_hi;
        __u8        s_first_error_time_hi;
        __u8        s_last_error_time_hi;
        __u8        s_first_error_errcode;
        __u8    s_last_error_errcode;
        __le16  s_encoding;                /* Filename charset encoding */
        __le16  s_encoding_flags;        /* Filename charset encoding flags */
        __le32  s_orphan_file_inum;        /* Inode for tracking orphan inodes */
        __le32        s_reserved[94];                /* Padding to the end of the block */
        __le32        s_checksum;                /* crc32c(superblock) */
};

/*
 * Size in bytes of the superblock span from s_error_count up to, but not
 * including, s_mount_opts.
 */
#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)

#ifdef __KERNEL__

/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3

/* NOTE(review): presumably a value for the superblock's s_encoding field - confirm */
#define EXT4_ENC_UTF8_12_1        1

/* Types of ext4 journal triggers */
enum ext4_journal_trigger_type {
        EXT4_JTR_ORPHAN_FILE = 0,
        /*
         * Sentinel: keep this as the final enumerator, since its value is
         * also used as the number of real trigger types.
         */
        EXT4_JTR_NONE
};

/* Number of real trigger types, i.e. everything before EXT4_JTR_NONE. */
#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE

/*
 * A jbd2 buffer trigger bundled with the super block it belongs to.
 * EXT4_TRIGGER() maps a pointer to tr_triggers back to this structure.
 */
struct ext4_journal_trigger {
        struct jbd2_buffer_trigger_type tr_triggers;        /* embedded jbd2 trigger */
        struct super_block *sb;
};

/*
 * Given a pointer to the tr_triggers member embedded in a
 * struct ext4_journal_trigger, return the enclosing structure.
 */
static inline struct ext4_journal_trigger *EXT4_TRIGGER(
                                struct jbd2_buffer_trigger_type *trigger)
{
        struct ext4_journal_trigger *owner;

        owner = container_of(trigger, struct ext4_journal_trigger,
                             tr_triggers);
        return owner;
}

/* NOTE(review): presumably the value expected in ob_magic below - confirm */
#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04

/*
 * Structure at the tail of orphan block.  Its size is subtracted from the
 * block size by ext4_inodes_per_orphan_block().
 */
struct ext4_orphan_block_tail {
        __le32 ob_magic;
        __le32 ob_checksum;
};

/*
 * Number of u32-sized entries that fit in one block of @sb after the
 * struct ext4_orphan_block_tail has been set aside.
 */
static inline int ext4_inodes_per_orphan_block(struct super_block *sb)
{
        size_t tail_bytes = sizeof(struct ext4_orphan_block_tail);
        size_t entry_bytes = sizeof(u32);

        return (sb->s_blocksize - tail_bytes) / entry_bytes;
}

/* In-memory bookkeeping for one block of the orphan file. */
struct ext4_orphan_block {
        atomic_t ob_free_entries;        /* Number of free orphan entries in block */
        struct buffer_head *ob_bh;        /* Buffer for orphan block */
};

/*
 * Info about orphan file.
 */
struct ext4_orphan_info {
        int of_blocks;                        /* Number of orphan blocks in a file */
        __u32 of_csum_seed;                /* Checksum seed for orphan file */
        /* NOTE(review): presumably of_blocks entries long - confirm at the allocation site */
        struct ext4_orphan_block *of_binfo;        /* Array with info about orphan
                                                 * file blocks */
};

/*
 * fourth extended-fs super-block data in memory
 *
 * EXT4_SB() returns a pointer to this structure from sb->s_fs_info.  The
 * members are grouped by subsystem (geometry, journaling, buddy allocator,
 * statistics, error tracking, fast commit); each group is introduced by a
 * short comment below.
 */
struct ext4_sb_info {
        unsigned long s_desc_size;        /* Size of a group descriptor in bytes */
        unsigned long s_inodes_per_block;/* Number of inodes per block */
        unsigned long s_blocks_per_group;/* Number of blocks in a group */
        unsigned long s_clusters_per_group; /* Number of clusters in a group */
        unsigned long s_inodes_per_group;/* Number of inodes in a group */
        unsigned long s_itb_per_group;        /* Number of inode table blocks per group */
        unsigned long s_gdb_count;        /* Number of group descriptor blocks */
        unsigned long s_desc_per_block;        /* Number of group descriptors per block */
        ext4_group_t s_groups_count;        /* Number of groups in the fs */
        ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */
        unsigned long s_overhead;  /* # of fs overhead clusters */
        unsigned int s_cluster_ratio;        /* Number of blocks per cluster */
        unsigned int s_cluster_bits;        /* log2 of s_cluster_ratio */
        loff_t s_bitmap_maxbytes;        /* max bytes for bitmap files */
        struct buffer_head * s_sbh;        /* Buffer containing the super block */
        struct ext4_super_block *s_es;        /* Pointer to the super block in the buffer */
        /* Array of bh's for the block group descriptors */
        struct buffer_head * __rcu *s_group_desc;
        unsigned int s_mount_opt;
        unsigned int s_mount_opt2;
        /* Bit numbers come from the run-time mount flags enum (EXT4_MF_*) */
        unsigned long s_mount_flags;
        unsigned int s_def_mount_opt;
        unsigned int s_def_mount_opt2;
        ext4_fsblk_t s_sb_block;
        atomic64_t s_resv_clusters;
        kuid_t s_resuid;
        kgid_t s_resgid;
        unsigned short s_mount_state;
        unsigned short s_pad;
        int s_addr_per_block_bits;
        int s_desc_per_block_bits;
        int s_inode_size;
        int s_first_ino;
        unsigned int s_inode_readahead_blks;
        unsigned int s_inode_goal;
        u32 s_hash_seed[4];
        int s_def_hash_version;
        int s_hash_unsigned;        /* 3 if hash should be unsigned, 0 if not */
        struct percpu_counter s_freeclusters_counter;
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyclusters_counter;
        struct percpu_counter s_sra_exceeded_retry_limit;
        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
        struct completion s_kobj_unregister;
        struct super_block *s_sb;
        struct buffer_head *s_mmp_bh;

        /* Journaling */
        struct journal_s *s_journal;
        unsigned long s_ext4_flags;                /* Ext4 superblock flags */
        struct mutex s_orphan_lock;        /* Protects on disk list changes */
        struct list_head s_orphan;        /* List of orphaned inodes in on disk
                                           list */
        struct ext4_orphan_info s_orphan_info;
        unsigned long s_commit_interval;
        u32 s_max_batch_time;
        u32 s_min_batch_time;
        struct file *s_journal_bdev_file;
#ifdef CONFIG_QUOTA
        /* Names of quota files with journalled quota */
        char __rcu *s_qf_names[EXT4_MAXQUOTAS];
        int s_jquota_fmt;                        /* Format of quota to use */
#endif
        unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
        struct ext4_system_blocks __rcu *s_system_blks;

#ifdef EXTENTS_STATS
        /* ext4 extents stats */
        unsigned long s_ext_min;
        unsigned long s_ext_max;
        unsigned long s_depth_max;
        spinlock_t s_ext_stats_lock;
        unsigned long s_ext_blocks;
        unsigned long s_ext_extents;
#endif

        /* for buddy allocator */
        struct ext4_group_info ** __rcu *s_group_info;
        struct inode *s_buddy_cache;
        spinlock_t s_md_lock;
        unsigned short *s_mb_offsets;
        unsigned int *s_mb_maxs;
        unsigned int s_group_info_size;
        unsigned int s_mb_free_pending;
        struct list_head s_freed_data_list[2];        /* List of blocks to be freed
                                                   after commit completed */
        struct list_head s_discard_list;
        struct work_struct s_discard_work;
        atomic_t s_retry_alloc_pending;
        struct list_head *s_mb_avg_fragment_size;
        rwlock_t *s_mb_avg_fragment_size_locks;
        struct list_head *s_mb_largest_free_orders;
        rwlock_t *s_mb_largest_free_orders_locks;

        /* tunables */
        unsigned long s_stripe;
        unsigned int s_mb_max_linear_groups;
        unsigned int s_mb_stream_request;
        unsigned int s_mb_max_to_scan;
        unsigned int s_mb_min_to_scan;
        unsigned int s_mb_stats;
        unsigned int s_mb_order2_reqs;
        unsigned int s_mb_group_prealloc;
        unsigned int s_max_dir_size_kb;
        /* where last allocation was done - for stream allocation */
        unsigned long s_mb_last_group;
        unsigned long s_mb_last_start;
        unsigned int s_mb_prefetch;
        unsigned int s_mb_prefetch_limit;
        unsigned int s_mb_best_avail_max_trim_order;
        unsigned int s_sb_update_sec;
        unsigned int s_sb_update_kb;

        /* stats for buddy allocator */
        atomic_t s_bal_reqs;        /* number of reqs with len > 1 */
        atomic_t s_bal_success;        /* we found long enough chunks */
        atomic_t s_bal_allocated;        /* in blocks */
        atomic_t s_bal_ex_scanned;        /* total extents scanned */
        atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS];        /* total extents scanned */
        atomic_t s_bal_groups_scanned;        /* number of groups scanned */
        atomic_t s_bal_goals;        /* goal hits */
        atomic_t s_bal_len_goals;        /* len goal hits */
        atomic_t s_bal_breaks;        /* too long searches */
        atomic_t s_bal_2orders;        /* 2^order hits */
        atomic_t s_bal_p2_aligned_bad_suggestions;
        atomic_t s_bal_goal_fast_bad_suggestions;
        atomic_t s_bal_best_avail_bad_suggestions;
        atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
        atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
        atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS];                /* cX loop didn't find blocks */
        atomic_t s_mb_buddies_generated;        /* number of buddies generated */
        atomic64_t s_mb_generation_time;
        atomic_t s_mb_lost_chunks;
        atomic_t s_mb_preallocated;
        atomic_t s_mb_discarded;
        atomic_t s_lock_busy;

        /* locality groups */
        struct ext4_locality_group __percpu *s_locality_groups;

        /* for write statistics */
        unsigned long s_sectors_written_start;
        u64 s_kbytes_written;

        /* the size of zero-out chunk */
        unsigned int s_extent_max_zeroout_kb;

        unsigned int s_log_groups_per_flex;
        struct flex_groups * __rcu *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;

        /* workqueue for reserved extent conversions (buffered io) */
        struct workqueue_struct *rsv_conversion_wq;

        /* timer for periodic error stats printing */
        struct timer_list s_err_report;

        /* Lazy inode table initialization info */
        struct ext4_li_request *s_li_request;
        /* Wait multiplier for lazy initialization thread */
        unsigned int s_li_wait_mult;

        /* Kernel thread for multiple mount protection */
        struct task_struct *s_mmp_tsk;

        /* record the last minlen when FITRIM is called. */
        unsigned long s_last_trim_minblks;

        /* Precomputed FS UUID checksum for seeding other checksums */
        __u32 s_csum_seed;

        /* Reclaim extents from extent status tree */
        struct shrinker *s_es_shrinker;
        struct list_head s_es_list;        /* List of inodes with reclaimable extents */
        long s_es_nr_inode;
        struct ext4_es_stats s_es_stats;
        struct mb_cache *s_ea_block_cache;
        struct mb_cache *s_ea_inode_cache;
        spinlock_t s_es_lock ____cacheline_aligned_in_smp;

        /*
         * Journal triggers for checksum computation; one slot per
         * enum ext4_journal_trigger_type value preceding EXT4_JTR_NONE.
         */
        struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT];

        /* Ratelimit ext4 messages. */
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
        atomic_t s_warning_count;
        atomic_t s_msg_count;

        /* Encryption policy for '-o test_dummy_encryption' */
        struct fscrypt_dummy_policy s_dummy_enc_policy;

        /*
         * Barrier between writepages ops and changing any inode's JOURNAL_DATA
         * or EXTENTS flag or between writepages ops and changing DELALLOC or
         * DIOREAD_NOLOCK mount options on remount.
         */
        struct percpu_rw_semaphore s_writepages_rwsem;
        struct dax_device *s_daxdev;
        u64 s_dax_part_off;
#ifdef CONFIG_EXT4_DEBUG
        /* Pending EXT4_SIM_* code; ext4_simulate_fail() zeroes it on a match */
        unsigned long s_simulate_fail;
#endif
        /* Record the errseq of the backing block device */
        errseq_t s_bdev_wb_err;
        spinlock_t s_bdev_wb_lock;

        /* Information about errors that happened during this mount */
        spinlock_t s_error_lock;
        int s_add_error_count;
        int s_first_error_code;
        __u32 s_first_error_line;
        __u32 s_first_error_ino;
        __u64 s_first_error_block;
        const char *s_first_error_func;
        time64_t s_first_error_time;
        int s_last_error_code;
        __u32 s_last_error_line;
        __u32 s_last_error_ino;
        __u64 s_last_error_block;
        const char *s_last_error_func;
        time64_t s_last_error_time;
        /*
         * If we are in a context where we cannot update the on-disk
         * superblock, we queue the work here.  This is used to update
         * the error information in the superblock, and for periodic
         * updates of the superblock called from the commit callback
         * function.
         */
        struct work_struct s_sb_upd_work;

        /* Atomic write unit values in bytes */
        unsigned int s_awu_min;
        unsigned int s_awu_max;

        /* Ext4 fast commit sub transaction ID */
        atomic_t s_fc_subtid;

        /*
         * After commit starts, the main queue gets locked, and the further
         * updates get added in the staging queue.
         */
#define FC_Q_MAIN        0
#define FC_Q_STAGING        1
        struct list_head s_fc_q[2];        /* Inodes staged for fast commit
                                         * that have data changes in them.
                                         */
        struct list_head s_fc_dentry_q[2];        /* directory entry updates */
        unsigned int s_fc_bytes;
        /*
         * Main fast commit lock. This lock protects accesses to the
         * following fields:
         * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
         */
        spinlock_t s_fc_lock;
        struct buffer_head *s_fc_bh;
        struct ext4_fc_stats s_fc_stats;
        tid_t s_fc_ineligible_tid;
#ifdef CONFIG_EXT4_DEBUG
        int s_fc_debug_max_replay;
#endif
        struct ext4_fc_replay_state s_fc_replay_state;
};

/* Fetch the ext4 private info hanging off a VFS super block (sb->s_fs_info) */
static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
{
        struct ext4_sb_info *sbi = sb->s_fs_info;

        return sbi;
}
/*
 * Convert a VFS inode into the ext4_inode_info that embeds it as its
 * vfs_inode member.
 */
static inline struct ext4_inode_info *EXT4_I(struct inode *inode)
{
        struct ext4_inode_info *ei;

        ei = container_of(inode, struct ext4_inode_info, vfs_inode);
        return ei;
}

static inline int ext4_writepages_down_read(struct super_block *sb)
{
        percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem);
        return memalloc_nofs_save();
}

static inline void ext4_writepages_up_read(struct super_block *sb, int ctx)
{
        memalloc_nofs_restore(ctx);
        percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem);
}

static inline int ext4_writepages_down_write(struct super_block *sb)
{
        percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem);
        return memalloc_nofs_save();
}

static inline void ext4_writepages_up_write(struct super_block *sb, int ctx)
{
        memalloc_nofs_restore(ctx);
        percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem);
}

/*
 * Return 1 if @ino is an acceptable inode number for @sb, 0 otherwise.
 * Accepted values are EXT4_ROOT_INO, or anything from EXT4_FIRST_INO(sb)
 * up to and including the superblock's s_inodes_count.
 */
static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
{
        if (ino == EXT4_ROOT_INO)
                return 1;
        if (!(ino >= EXT4_FIRST_INO(sb)))
                return 0;
        return ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count);
}

/*
 * Returns: sbi->field[index]
 * Used to access an array element from the following sbi fields which require
 * rcu protection to avoid dereferencing an invalid pointer due to reassignment
 * - s_group_desc
 * - s_group_info
 * - s_flex_groups
 *
 * Only the array lookup itself runs between rcu_read_lock() and
 * rcu_read_unlock(): the element is copied into a local and that copy is the
 * value of the expression, so the caller uses it after the unlock.
 */
#define sbi_array_rcu_deref(sbi, field, index)                                   \
({                                                                           \
        typeof(*((sbi)->field)) _v;                                           \
        rcu_read_lock();                                                   \
        _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index];           \
        rcu_read_unlock();                                                   \
        _v;                                                                   \
})

/*
 * run-time mount flags
 *
 * Bit numbers within ext4_sb_info::s_mount_flags; use
 * ext4_set_mount_flag(), ext4_clear_mount_flag() and ext4_test_mount_flag().
 */
enum {
        EXT4_MF_MNTDIR_SAMPLED,
        EXT4_MF_FC_INELIGIBLE,        /* Fast commit ineligible */
        EXT4_MF_JOURNAL_DESTROY        /* Journal is in process of destroying */
};

static inline void ext4_set_mount_flag(struct super_block *sb, int bit)
{
        set_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline void ext4_clear_mount_flag(struct super_block *sb, int bit)
{
        clear_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}

static inline int ext4_test_mount_flag(struct super_block *sb, int bit)
{
        return test_bit(bit, &EXT4_SB(sb)->s_mount_flags);
}


/*
 * Simulate_fail codes
 *
 * Values compared against ext4_sb_info::s_simulate_fail by
 * ext4_simulate_fail().  ext4_simulate_fail() resets that field to 0 after
 * a match, so 0 is deliberately not one of the codes.
 */
#define EXT4_SIM_BBITMAP_EIO        1
#define EXT4_SIM_BBITMAP_CRC        2
#define EXT4_SIM_IBITMAP_EIO        3
#define EXT4_SIM_IBITMAP_CRC        4
#define EXT4_SIM_INODE_EIO        5
#define EXT4_SIM_INODE_CRC        6
#define EXT4_SIM_DIRBLOCK_EIO        7
#define EXT4_SIM_DIRBLOCK_CRC        8

/*
 * Report whether failure @code is currently armed for @sb.
 *
 * With CONFIG_EXT4_DEBUG, returns true when s_simulate_fail equals @code and
 * resets s_simulate_fail to 0 in that case, so a given arming fires once.
 * Without CONFIG_EXT4_DEBUG this always returns false.
 */
static inline bool ext4_simulate_fail(struct super_block *sb,
                                     unsigned long code)
{
        bool triggered = false;
#ifdef CONFIG_EXT4_DEBUG
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        triggered = unlikely(sbi->s_simulate_fail == code);
        if (triggered)
                sbi->s_simulate_fail = 0;
#endif
        return triggered;
}

/*
 * Error number codes for s_{first,last}_error_errno
 *
 * Linux errno numbers are architecture specific, so we need to translate
 * them into something which is architecture independent.   We don't define
 * codes for all errno's; just the ones which are most likely to be the cause
 * of an ext4_error() call.
 *
 * The codes listed here start at 1; this list assigns no meaning to 0.
 */
#define EXT4_ERR_UNKNOWN         1
#define EXT4_ERR_EIO                 2
#define EXT4_ERR_ENOMEM                 3
#define EXT4_ERR_EFSBADCRC         4
#define EXT4_ERR_EFSCORRUPTED         5
#define EXT4_ERR_ENOSPC                 6
#define EXT4_ERR_ENOKEY                 7
#define EXT4_ERR_EROFS                 8
#define EXT4_ERR_EFBIG                 9
#define EXT4_ERR_EEXIST                10
#define EXT4_ERR_ERANGE                11
#define EXT4_ERR_EOVERFLOW        12
#define EXT4_ERR_EBUSY                13
#define EXT4_ERR_ENOTDIR        14
#define EXT4_ERR_ENOTEMPTY        15
#define EXT4_ERR_ESHUTDOWN        16
#define EXT4_ERR_EFAULT                17

/*
 * Inode dynamic state flags
 *
 * Bit numbers for ext4_test_inode_state(), ext4_set_inode_state() and
 * ext4_clear_inode_state().
 */
enum {
        EXT4_STATE_NEW,                        /* inode is newly created */
        EXT4_STATE_XATTR,                /* has in-inode xattrs */
        EXT4_STATE_NO_EXPAND,                /* No space for expansion */
        EXT4_STATE_DA_ALLOC_CLOSE,        /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,                /* Inode is migrating */
        EXT4_STATE_NEWENTRY,                /* File just added to dir */
        EXT4_STATE_MAY_INLINE_DATA,        /* may have in-inode data */
        EXT4_STATE_EXT_PRECACHED,        /* extents have been precached */
        EXT4_STATE_LUSTRE_EA_INODE,        /* Lustre-style ea_inode */
        EXT4_STATE_VERITY_IN_PROGRESS,        /* building fs-verity Merkle tree */
        EXT4_STATE_FC_COMMITTING,        /* Fast commit ongoing */
        EXT4_STATE_ORPHAN_FILE,                /* Inode orphaned in orphan file */
};

/*
 * Generate ext4_test_inode_<name>(), ext4_set_inode_<name>() and
 * ext4_clear_inode_<name>().  Each one applies test_bit()/set_bit()/
 * clear_bit() to bit number (bit + offset) of EXT4_I(inode)->i_<field>.
 */
#define EXT4_INODE_BIT_FNS(name, field, offset)                                \
static inline int ext4_test_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        return test_bit(bit + (offset), &EXT4_I(inode)->i_##field);        \
}                                                                        \
static inline void ext4_set_inode_##name(struct inode *inode, int bit)        \
{                                                                        \
        set_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}                                                                        \
static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
{                                                                        \
        clear_bit(bit + (offset), &EXT4_I(inode)->i_##field);                \
}

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate.
 *
 * The bodies come from the EXT4_INODE_BIT_FNS() expansion below and operate
 * on i_flags with no bit offset. */
static inline int ext4_test_inode_flag(struct inode *inode, int bit);
static inline void ext4_set_inode_flag(struct inode *inode, int bit);
static inline void ext4_clear_inode_flag(struct inode *inode, int bit);
EXT4_INODE_BIT_FNS(flag, flags, 0)

/* Add these declarations here only so that these functions can be
 * found by name.  Otherwise, they are very hard to locate.
 *
 * Where the state bits are stored depends on BITS_PER_LONG:
 *  - below 64: in the separate i_state_flags word, starting at bit 0;
 *  - otherwise: in i_flags, starting at bit 32. */
static inline int ext4_test_inode_state(struct inode *inode, int bit);
static inline void ext4_set_inode_state(struct inode *inode, int bit);
static inline void ext4_clear_inode_state(struct inode *inode, int bit);
#if (BITS_PER_LONG < 64)
EXT4_INODE_BIT_FNS(state, state_flags, 0)

/* Reset every dynamic state bit of @ei by zeroing i_state_flags */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        (ei)->i_state_flags = 0;
}
#else
EXT4_INODE_BIT_FNS(state, flags, 32)

/* Intentionally empty in this configuration: state bits share i_flags */
static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
{
        /* We depend on the fact that callers will set i_flags */
}
#endif
#else
/* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
 * macros from user land.  Outside __KERNEL__, EXT4_SB() is therefore the
 * identity: it expands to its argument. */
#define EXT4_SB(sb)        (sb)
#endif

static inline bool ext4_verity_in_progress(struct inode *inode)
{
        return IS_ENABLED(CONFIG_FS_VERITY) &&
               ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
}

/*
 * Expands to the i_dtime member of @inode's ext4_inode_info.
 * NOTE(review): the name suggests i_dtime doubles as the link to the next
 * orphan inode -- confirm against the orphan-list code.
 */
#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime

/*
 * Codes for operating systems
 * (NOTE(review): presumably the creator-OS value stored in the superblock --
 * confirm against the superblock field that uses them)
 */
#define EXT4_OS_LINUX                0
#define EXT4_OS_HURD                1
#define EXT4_OS_MASIX                2
#define EXT4_OS_FREEBSD                3
#define EXT4_OS_LITES                4

/*
 * Revision levels
 */
#define EXT4_GOOD_OLD_REV        0        /* The good old (original) format */
#define EXT4_DYNAMIC_REV        1        /* V2 format w/ dynamic inode sizes */

/* Highest revision level this code supports */
#define EXT4_MAX_SUPP_REV        EXT4_DYNAMIC_REV

/* NOTE(review): presumably the fixed inode size of EXT4_GOOD_OLD_REV -- confirm */
#define EXT4_GOOD_OLD_INODE_SIZE 128

/*
 * Timestamp limits.  The EXTRA maximum is S32_MIN + 2^34 - 1, i.e. a range of
 * 2^34 values starting at S32_MIN; the NON_EXTRA range is plain s32.
 */
#define EXT4_EXTRA_TIMESTAMP_MAX        (((s64)1 << 34) - 1  + S32_MIN)
#define EXT4_NON_EXTRA_TIMESTAMP_MAX        S32_MAX
#define EXT4_TIMESTAMP_MIN                S32_MIN

/*
 * Feature set definitions
 *
 * The EXT4_FEATURE_COMPAT_* bits are tested against the superblock's
 * s_feature_compat word by the helpers that EXT4_FEATURE_COMPAT_FUNCS()
 * generates.
 */

#define EXT4_FEATURE_COMPAT_DIR_PREALLOC        0x0001
#define EXT4_FEATURE_COMPAT_IMAGIC_INODES        0x0002
#define EXT4_FEATURE_COMPAT_HAS_JOURNAL                0x0004
#define EXT4_FEATURE_COMPAT_EXT_ATTR                0x0008
#define EXT4_FEATURE_COMPAT_RESIZE_INODE        0x0010
#define EXT4_FEATURE_COMPAT_DIR_INDEX                0x0020
#define EXT4_FEATURE_COMPAT_SPARSE_SUPER2        0x0200
/*
 * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes
 * incompatible only if fast commit blocks are present in the FS. Since we
 * clear the journal (and thus the fast commit blocks), we don't mark FS as
 * incompatible. We also have a JBD2 incompat feature, which gets set when
 * there are fast commit blocks present in the journal.
 */
#define EXT4_FEATURE_COMPAT_FAST_COMMIT                0x0400
#define EXT4_FEATURE_COMPAT_STABLE_INODES        0x0800
#define EXT4_FEATURE_COMPAT_ORPHAN_FILE                0x1000        /* Orphan file exists */

/*
 * Read-only-compatible feature bits, tested against the superblock's
 * s_feature_ro_compat word by the EXT4_FEATURE_RO_COMPAT_FUNCS() helpers.
 */
#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER        0x0001
#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE        0x0002
#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR        0x0004
#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE        0x0008
#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM                0x0010
#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK        0x0020
#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE        0x0040
#define EXT4_FEATURE_RO_COMPAT_QUOTA                0x0100
#define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
/*
 * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
 * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
 * all other data structures' checksums.  However, the METADATA_CSUM and
 * GDT_CSUM bits are mutually exclusive.
 */
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM        0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY                0x1000
#define EXT4_FEATURE_RO_COMPAT_PROJECT                0x2000
#define EXT4_FEATURE_RO_COMPAT_VERITY                0x8000
#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT        0x10000 /* Orphan file may be
                                                           non-empty */

/*
 * Incompatible feature bits, tested against the superblock's
 * s_feature_incompat word by the EXT4_FEATURE_INCOMPAT_FUNCS() helpers.
 */
#define EXT4_FEATURE_INCOMPAT_COMPRESSION        0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE                0x0002
#define EXT4_FEATURE_INCOMPAT_RECOVER                0x0004 /* Needs recovery */
#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV        0x0008 /* Journal device */
#define EXT4_FEATURE_INCOMPAT_META_BG                0x0010
#define EXT4_FEATURE_INCOMPAT_EXTENTS                0x0040 /* extents support */
#define EXT4_FEATURE_INCOMPAT_64BIT                0x0080
#define EXT4_FEATURE_INCOMPAT_MMP               0x0100
#define EXT4_FEATURE_INCOMPAT_FLEX_BG                0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE                0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA                0x1000 /* data in dirent */
#define EXT4_FEATURE_INCOMPAT_CSUM_SEED                0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR                0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA        0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT                0x10000
#define EXT4_FEATURE_INCOMPAT_CASEFOLD                0x20000

/* Called by every generated ext4_set_feature_*() before it sets its bit */
extern void ext4_update_dynamic_rev(struct super_block *sb);

/*
 * Generate ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>() for EXT4_FEATURE_COMPAT_<flagname>.  They
 * operate on the little-endian s_feature_compat word of the superblock
 * reached through EXT4_SB(sb)->s_es; the mask is converted with
 * cpu_to_le32() rather than converting the stored word.
 */
#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_compat |= \
                cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
}

/*
 * Generate ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>() for EXT4_FEATURE_RO_COMPAT_<flagname>.  They
 * operate on the s_feature_ro_compat word of the superblock; the set variant
 * calls ext4_update_dynamic_rev() first.
 */
#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
                cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
                ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
}

/*
 * Generate ext4_has_feature_<name>(), ext4_set_feature_<name>() and
 * ext4_clear_feature_<name>() for EXT4_FEATURE_INCOMPAT_<flagname>.  They
 * operate on the s_feature_incompat word of the superblock; the set variant
 * calls ext4_update_dynamic_rev() first.
 */
#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline bool ext4_has_feature_##name(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
} \
static inline void ext4_set_feature_##name(struct super_block *sb) \
{ \
        ext4_update_dynamic_rev(sb); \
        EXT4_SB(sb)->s_es->s_feature_incompat |= \
                cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
} \
static inline void ext4_clear_feature_##name(struct super_block *sb) \
{ \
        EXT4_SB(sb)->s_es->s_feature_incompat &= \
                ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
}

/*
 * Instantiate ext4_{has,set,clear}_feature_<name>() for each feature bit.
 * The first argument is the function-name suffix, the second the suffix of
 * the matching EXT4_FEATURE_*_ constant.  Note that a few names differ from
 * their flag: journal/HAS_JOURNAL, xattr/EXT_ATTR and
 * journal_needs_recovery/RECOVER.
 */
EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc,                DIR_PREALLOC)
EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes,        IMAGIC_INODES)
EXT4_FEATURE_COMPAT_FUNCS(journal,                HAS_JOURNAL)
EXT4_FEATURE_COMPAT_FUNCS(xattr,                EXT_ATTR)
EXT4_FEATURE_COMPAT_FUNCS(resize_inode,                RESIZE_INODE)
EXT4_FEATURE_COMPAT_FUNCS(dir_index,                DIR_INDEX)
EXT4_FEATURE_COMPAT_FUNCS(sparse_super2,        SPARSE_SUPER2)
EXT4_FEATURE_COMPAT_FUNCS(fast_commit,                FAST_COMMIT)
EXT4_FEATURE_COMPAT_FUNCS(stable_inodes,        STABLE_INODES)
EXT4_FEATURE_COMPAT_FUNCS(orphan_file,                ORPHAN_FILE)

EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super,        SPARSE_SUPER)
EXT4_FEATURE_RO_COMPAT_FUNCS(large_file,        LARGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir,                BTREE_DIR)
EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file,                HUGE_FILE)
EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum,                GDT_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink,                DIR_NLINK)
EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize,        EXTRA_ISIZE)
EXT4_FEATURE_RO_COMPAT_FUNCS(quota,                QUOTA)
EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc,                BIGALLOC)
EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum,        METADATA_CSUM)
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly,                READONLY)
EXT4_FEATURE_RO_COMPAT_FUNCS(project,                PROJECT)
EXT4_FEATURE_RO_COMPAT_FUNCS(verity,                VERITY)
EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present,        ORPHAN_PRESENT)

EXT4_FEATURE_INCOMPAT_FUNCS(compression,        COMPRESSION)
EXT4_FEATURE_INCOMPAT_FUNCS(filetype,                FILETYPE)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery,        RECOVER)
EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev,        JOURNAL_DEV)
EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg,                META_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(extents,                EXTENTS)
EXT4_FEATURE_INCOMPAT_FUNCS(64bit,                64BIT)
EXT4_FEATURE_INCOMPAT_FUNCS(mmp,                MMP)
EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg,                FLEX_BG)
EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode,                EA_INODE)
EXT4_FEATURE_INCOMPAT_FUNCS(dirdata,                DIRDATA)
EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed,                CSUM_SEED)
EXT4_FEATURE_INCOMPAT_FUNCS(largedir,                LARGEDIR)
EXT4_FEATURE_INCOMPAT_FUNCS(inline_data,        INLINE_DATA)
EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,                ENCRYPT)
EXT4_FEATURE_INCOMPAT_FUNCS(casefold,                CASEFOLD)

/*
 * Supported-feature masks for the ext2, ext3 and ext4 personalities.
 * EXTN_FEATURE_FUNCS() builds the ext4_has_unknown_ext<N>_*_features()
 * helpers from the complement of these masks, so any bit missing here is
 * reported as unknown for that personality.
 */
#define EXT2_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT2_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

#define EXT3_FEATURE_COMPAT_SUPP        EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT3_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG)
#define EXT3_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR)

#define EXT4_FEATURE_COMPAT_SUPP        (EXT4_FEATURE_COMPAT_EXT_ATTR| \
                                         EXT4_FEATURE_COMPAT_ORPHAN_FILE)
#define EXT4_FEATURE_INCOMPAT_SUPP        (EXT4_FEATURE_INCOMPAT_FILETYPE| \
                                         EXT4_FEATURE_INCOMPAT_RECOVER| \
                                         EXT4_FEATURE_INCOMPAT_META_BG| \
                                         EXT4_FEATURE_INCOMPAT_EXTENTS| \
                                         EXT4_FEATURE_INCOMPAT_64BIT| \
                                         EXT4_FEATURE_INCOMPAT_FLEX_BG| \
                                         EXT4_FEATURE_INCOMPAT_EA_INODE| \
                                         EXT4_FEATURE_INCOMPAT_MMP | \
                                         EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
                                         EXT4_FEATURE_INCOMPAT_ENCRYPT | \
                                         EXT4_FEATURE_INCOMPAT_CASEFOLD | \
                                         EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
                                         EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP        (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
                                         EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
                                         EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
                                         EXT4_FEATURE_RO_COMPAT_QUOTA |\
                                         EXT4_FEATURE_RO_COMPAT_PROJECT |\
                                         EXT4_FEATURE_RO_COMPAT_VERITY |\
                                         EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT)

/*
 * Generate, for personality ext<ver>, three predicates that return true when
 * the superblock has a compat / ro_compat / incompat bit set that is not in
 * the corresponding EXT<ver>_FEATURE_*_SUPP mask.
 */
#define EXTN_FEATURE_FUNCS(ver) \
static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
                cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
} \
static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
{ \
        return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
                cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
}

/* One set of predicates each for the ext2, ext3 and ext4 masks */
EXTN_FEATURE_FUNCS(2)
EXTN_FEATURE_FUNCS(3)
EXTN_FEATURE_FUNCS(4)

static inline bool ext4_has_compat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
}
static inline bool ext4_has_ro_compat_features(struct super_block *sb)
{
        return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
}
/* Report whether any bit is set in the superblock's s_feature_incompat word. */
static inline bool ext4_has_incompat_features(struct super_block *sb)
{
        if (EXT4_SB(sb)->s_es->s_feature_incompat)
                return true;
        return false;
}

/*
 * Prototype only; the definition is not part of this header.
 * NOTE(review): presumably checks the superblock feature words against the
 * supported-feature masks, with @readonly describing the mount mode --
 * confirm against the definition.
 */
extern int ext4_feature_set_ok(struct super_block *sb, int readonly);

/*
 * Superblock flags
 *
 * The enumerators are bit numbers, not masks: the helpers below pass them
 * to test_bit() on EXT4_SB(sb)->s_ext4_flags.
 */
enum {
        EXT4_FLAGS_RESIZING,        /* Avoid superblock update and resize race */
        EXT4_FLAGS_SHUTDOWN,        /* Prevent access to the file system */
        EXT4_FLAGS_BDEV_IS_DAX,        /* Current block device supports DAX */
        EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */
};

/*
 * Test the EXT4_FLAGS_SHUTDOWN bit in EXT4_SB(sb)->s_ext4_flags and return
 * the result of test_bit().
 */
static inline int ext4_forced_shutdown(struct super_block *sb)
{
        return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags);
}

/*
 * Test the EXT4_FLAGS_EMERGENCY_RO bit in EXT4_SB(sb)->s_ext4_flags and
 * return the result of test_bit().
 */
static inline int ext4_emergency_ro(struct super_block *sb)
{
        return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags);
}

/*
 * Translate the emergency flags of @sb into an errno-style result:
 * -EIO if the filesystem has been shut down, otherwise -EROFS if it is in
 * emergency read-only mode, otherwise 0.  Shutdown is checked first and
 * therefore takes precedence when both flags are set.
 */
static inline int ext4_emergency_state(struct super_block *sb)
{
        int err = 0;

        if (unlikely(ext4_forced_shutdown(sb)))
                err = -EIO;
        else if (unlikely(ext4_emergency_ro(sb)))
                err = -EROFS;

        return err;
}

/*
 * Default values for user and/or group using reserved blocks
 */
#define        EXT4_DEF_RESUID                0
#define        EXT4_DEF_RESGID                0

/*
 * Default project ID
 */
#define        EXT4_DEF_PROJID                0

/* Default inode readahead size; the _BLKS suffix suggests the unit is blocks. */
#define EXT4_DEF_INODE_READAHEAD_BLKS        32

/*
 * Default mount options
 *
 * All entries are single-bit flags except the journal mode, which is a
 * two-bit field: EXT4_DEFM_JMODE (0x0060) is the mask covering the
 * EXT4_DEFM_JMODE_DATA/ORDERED/WBACK values, and EXT4_DEFM_JMODE_WBACK
 * has the same value as the mask itself.
 */
#define EXT4_DEFM_DEBUG                0x0001
#define EXT4_DEFM_BSDGROUPS        0x0002
#define EXT4_DEFM_XATTR_USER        0x0004
#define EXT4_DEFM_ACL                0x0008
#define EXT4_DEFM_UID16                0x0010
#define EXT4_DEFM_JMODE                0x0060
#define EXT4_DEFM_JMODE_DATA        0x0020
#define EXT4_DEFM_JMODE_ORDERED        0x0040
#define EXT4_DEFM_JMODE_WBACK        0x0060
#define EXT4_DEFM_NOBARRIER        0x0100
#define EXT4_DEFM_BLOCK_VALIDITY 0x0200
#define EXT4_DEFM_DISCARD        0x0400
#define EXT4_DEFM_NODELALLOC        0x0800

/*
 * Default journal batch times
 *
 * The "15ms" annotation on the value 15000 implies microseconds.
 */
#define EXT4_DEF_MIN_BATCH_TIME        0
#define EXT4_DEF_MAX_BATCH_TIME        15000 /* 15ms */

/*
 * Default values for superblock update
 */
#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */
#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */


/*
 * Minimum number of groups in a flexgroup before we separate out
 * directories into the first block group of a flexgroup
 */
#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME        4

/*
 * Structure of a directory entry
 *
 * EXT4_NAME_LEN is the size of the name[] array in both directory entry
 * structures below.
 */
#define EXT4_NAME_LEN 255
/*
 * Base length of the ext4 directory entry excluding the name length
 *
 * NOTE(review): this is derived from sizeof(), so it includes any padding
 * the compiler adds after name[]; it may therefore differ from the literal
 * 8 header bytes used by ext4_dir_rec_len() and EXT4_DIRENT_HASHES().
 * Confirm that users of this macro expect that.
 */
#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)

/*
 * Directory entry layout with a 16-bit name_len.  struct ext4_dir_entry_2
 * below uses the same first six bytes but splits the two name_len bytes
 * into an 8-bit name_len and an 8-bit file_type.
 */
struct ext4_dir_entry {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __le16        name_len;                /* Name length */
        char        name[EXT4_NAME_LEN];        /* File name */
};


/*
 * Encrypted Casefolded entries require saving the hash on disk. This structure
 * follows ext4_dir_entry_2's name[name_len] at the next 4 byte aligned
 * boundary.
 */
struct ext4_dir_entry_hash {
        __le32 hash;                /* read via EXT4_DIRENT_HASH() */
        __le32 minor_hash;        /* read via EXT4_DIRENT_MINOR_HASH() */
};

/*
 * The new version of the directory entry.  Since EXT4 structures are
 * stored in intel byte order, and the name_len field could never be
 * bigger than 255 chars, it's safe to reclaim the extra byte for the
 * file_type field.
 *
 * The fixed fields ahead of name[] occupy 4 + 2 + 1 + 1 = 8 bytes; that
 * is the literal 8 used by EXT4_DIRENT_HASHES() and ext4_dir_rec_len().
 */
struct ext4_dir_entry_2 {
        __le32        inode;                        /* Inode number */
        __le16        rec_len;                /* Directory entry length */
        __u8        name_len;                /* Name length */
        __u8        file_type;                /* See file type macros EXT4_FT_* below */
        char        name[EXT4_NAME_LEN];        /* File name */
};

/*
 * Access the hashes at the end of ext4_dir_entry_2
 *
 * EXT4_DIRENT_HASHES() yields a struct ext4_dir_entry_hash pointer located
 * at (8 + name_len) bytes past @entry, rounded up to a multiple of
 * EXT4_DIR_PAD; 8 is the size of the fixed fields before name[].
 * @entry is evaluated twice, so it must not have side effects.
 * The macro only computes an address; it does not check that the entry
 * really carries a hash (see ext4_hash_in_dirent()).
 *
 * EXT4_DIRENT_HASH() and EXT4_DIRENT_MINOR_HASH() return the two fields
 * converted with le32_to_cpu().
 */
#define EXT4_DIRENT_HASHES(entry) \
        ((struct ext4_dir_entry_hash *) \
                (((void *)(entry)) + \
                ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND)))
#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash)
#define EXT4_DIRENT_MINOR_HASH(entry) \
                le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash)

/*
 * True when @inode is both casefolded and encrypted.  ext4_dir_rec_len()
 * uses this to decide whether a directory entry needs room for a
 * struct ext4_dir_entry_hash.
 */
static inline bool ext4_hash_in_dirent(const struct inode *inode)
{
        if (!IS_CASEFOLDED(inode))
                return false;
        return IS_ENCRYPTED(inode);
}

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.
 *
 * The first four fields have the same sizes as the fixed fields of
 * struct ext4_dir_entry_2 (4 + 2 + 1 + 1 bytes); together with the 4-byte
 * checksum the structure is 12 bytes, matching the det_rec_len note.
 */
struct ext4_dir_entry_tail {
        __le32        det_reserved_zero1;        /* Pretend to be unused */
        __le16        det_rec_len;                /* 12 */
        __u8        det_reserved_zero2;        /* Zero name length */
        __u8        det_reserved_ft;        /* 0xDE, fake file type */
        __le32        det_checksum;                /* crc32c(uuid+inum+dirblock) */
};

/*
 * Address of the struct ext4_dir_entry_tail occupying the last
 * sizeof(struct ext4_dir_entry_tail) bytes of a @blocksize-byte buffer
 * starting at @block.  Pure pointer arithmetic; nothing is validated.
 */
#define EXT4_DIRENT_TAIL(block, blocksize) \
        ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
                                        ((blocksize) - \
                                         sizeof(struct ext4_dir_entry_tail))))

/*
 * Ext4 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 *
 * The values 0..7 are also the indices of ext4_filetype_table[].
 */
#define EXT4_FT_UNKNOWN                0
#define EXT4_FT_REG_FILE        1
#define EXT4_FT_DIR                2
#define EXT4_FT_CHRDEV                3
#define EXT4_FT_BLKDEV                4
#define EXT4_FT_FIFO                5
#define EXT4_FT_SOCK                6
#define EXT4_FT_SYMLINK                7

/* One past the highest file type; get_dtype() uses it as the upper bound. */
#define EXT4_FT_MAX                8

/* Same 0xDE value as noted on ext4_dir_entry_tail.det_reserved_ft. */
#define EXT4_FT_DIR_CSUM        0xDE

/*
 * EXT4_DIR_PAD defines the directory entries boundaries
 *
 * NOTE: It must be a multiple of 4
 *
 * The rounding expressions in this file have the form
 * (x + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND, which rounds up correctly only
 * when EXT4_DIR_PAD is a power of two.
 */
#define EXT4_DIR_PAD                        4
#define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
/* 65535: the largest value a 16-bit rec_len field can hold. */
#define EXT4_MAX_REC_LEN                ((1<<16)-1)

/*
 * Space needed for a directory entry whose name is @name_len bytes long:
 * the 8 fixed header bytes plus the name, rounded up to EXT4_DIR_PAD.
 *
 * Directories that are casefolded and encrypted also store the hash after
 * the name, so sizeof(struct ext4_dir_entry_hash) is added before rounding
 * when ext4_hash_in_dirent(@dir) is true.  For all entries related to '.'
 * or '..' you should pass NULL for @dir, as those entries do not use the
 * extra fields.
 */
static inline unsigned int ext4_dir_rec_len(__u8 name_len,
                                                const struct inode *dir)
{
        int unpadded = 8 + name_len;

        if (dir != NULL && ext4_hash_in_dirent(dir))
                unpadded += sizeof(struct ext4_dir_entry_hash);

        return (unpadded + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND;
}

/*
 * If we ever get support for fs block sizes > page_size, we'll need
 * to remove the #if statements in the next two functions...
 */
/* Decode the 16-bit on-disk rec_len @dlen into a length in bytes. */
static inline unsigned int
ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
{
        /* Bring the stored value into CPU byte order first. */
        unsigned len = le16_to_cpu(dlen);

#if (PAGE_SIZE >= 65536)
        /*
         * EXT4_MAX_REC_LEN (0xFFFF) and 0 both stand for "the whole block";
         * these are the two sentinels ext4_rec_len_to_disk() writes when
         * len == blocksize.
         */
        if (len == EXT4_MAX_REC_LEN || len == 0)
                return blocksize;
        /*
         * Otherwise the low two bits hold bits 16-17 of the real length
         * (real lengths are multiples of 4, so those bits are free).
         */
        return (len & 65532) | ((len & 3) << 16);
#else
        /* With PAGE_SIZE below 64KiB the stored value is the length itself. */
        return len;
#endif
}

/*
 * Encode a record length in bytes as the 16-bit on-disk rec_len; this is
 * the counterpart of ext4_rec_len_from_disk().
 */
static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
{
        /*
         * BUG on a length larger than the block, a block size above
         * 1 << 18 bytes, or a length that is not a multiple of 4.
         */
        BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3));
#if (PAGE_SIZE >= 65536)
        /* Anything below 65536 fits in 16 bits unchanged. */
        if (len < 65536)
                return cpu_to_le16(len);
        /*
         * A record covering the whole block is stored as a sentinel:
         * EXT4_MAX_REC_LEN for a 65536-byte block, 0 for larger blocks.
         */
        if (len == blocksize) {
                if (blocksize == 65536)
                        return cpu_to_le16(EXT4_MAX_REC_LEN);
                else
                        return cpu_to_le16(0);
        }
        /* Otherwise move bits 16-17 of len into the unused low two bits. */
        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
#else
        return cpu_to_le16(len);
#endif
}

/*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
 */

/*
 * is_dx(): true when the filesystem has the dir_index feature and @dir has
 * the EXT4_INODE_INDEX inode flag set.
 */
#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
                    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
/*
 * EXT4_DIR_LINK_MAX(): true when @dir's i_nlink has reached EXT4_LINK_MAX,
 * unless the filesystem has the dir_nlink feature and @dir is indexed.
 * The whole condition is wrapped in unlikely().
 */
#define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \
                    !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir)))
/* EXT4_DIR_LINK_EMPTY(): true when @dir's i_nlink is 2 or 1. */
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)

/*
 * Legal values for the dx_root hash_version field:
 *
 * Values 3..5 are the _UNSIGNED counterparts of 0..2 (each is the base
 * value plus 3).  DX_HASH_LAST aliases the highest value defined here.
 */

#define DX_HASH_LEGACY                        0
#define DX_HASH_HALF_MD4                1
#define DX_HASH_TEA                        2
#define DX_HASH_LEGACY_UNSIGNED                3
#define DX_HASH_HALF_MD4_UNSIGNED        4
#define DX_HASH_TEA_UNSIGNED                5
#define DX_HASH_SIPHASH                        6
#define DX_HASH_LAST                         DX_HASH_SIPHASH

/*
 * Feed @length bytes starting at @address into crc32c(), seeded with @crc,
 * and return the resulting checksum.  @sbi is accepted for the callers'
 * benefit but is not used by this body.
 */
static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
                              const void *address, unsigned int length)
{
        u32 csum = crc32c(crc, address, length);

        return csum;
}

#ifdef __KERNEL__

/*
 * hash info structure used by the directory hash
 *
 * A pointer to this structure is the last parameter of ext4fs_dirhash()
 * and it is embedded as hinfo in struct ext4_filename.
 */
struct dx_hash_info
{
        u32                hash;                /* NOTE(review): presumably the major hash -- confirm */
        u32                minor_hash;
        int                hash_version;        /* presumably one of the DX_HASH_* values -- confirm */
        u32                *seed;                /* number of words pointed to is not visible here */
};


/*
 * 32 and 64 bit signed EOF for dx directories
 *
 * These evaluate to 0x7fffffff and 0x7fffffffffffffff, i.e. the largest
 * positive values of signed 32-bit and 64-bit integers.
 */
#define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
#define EXT4_HTREE_EOF_64BIT   ((1ULL << (64 - 1)) - 1)


/*
 * Control parameters used by ext4_htree_next_block
 */
#define HASH_NB_ALWAYS                1

/*
 * A filename as handled by the lookup and directory code.  Which members
 * exist depends on the kernel configuration, so code touching crypto_buf
 * or cf_name must sit under the same conditionals.
 */
struct ext4_filename {
        const struct qstr *usr_fname;        /* accessed through fname_usr_name() */
        struct fscrypt_str disk_name;        /* accessed through fname_name()/fname_len() */
        struct dx_hash_info hinfo;
#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_str crypto_buf;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        struct qstr cf_name;                /* .name is kfree()d by ext4_fname_free_ci_filename() */
#endif
};

/*
 * Accessors for struct ext4_filename (@p is a pointer to one):
 * fname_name()/fname_len() read the disk_name member, fname_usr_name()
 * reads the name of the qstr that usr_fname points to.
 */
#define fname_name(p) ((p)->disk_name.name)
#define fname_usr_name(p) ((p)->usr_fname->name)
#define fname_len(p)  ((p)->disk_name.len)

/*
 * Describe an inode's exact location on disk and in memory
 */
struct ext4_iloc
{
        struct buffer_head *bh;                /* buffer whose b_data holds the raw inode */
        unsigned long offset;                /* byte offset added to bh->b_data by ext4_raw_inode() */
        ext4_group_t block_group;
};

static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc)
{
        return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset);
}

/*
 * An inode counts as a quota file when IS_NOQUOTA() holds for it and it
 * does not carry the EXT4_EA_INODE_FL flag.
 */
static inline bool ext4_is_quota_file(struct inode *inode)
{
        if (!IS_NOQUOTA(inode))
                return false;
        if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
                return false;
        return true;
}

/*
 * This structure is stuffed into the struct file's private_data field
 * for directories.  It is where we put information so that we can do
 * readdir operations in hash tree order.
 *
 * NOTE(review): the readdir code that uses these members is not visible
 * here, so the per-field notes are limited to what the types show.
 * Instances are released with ext4_htree_free_dir_info().
 */
struct dir_private_info {
        struct rb_root        root;                /* red-black tree root */
        struct rb_node        *curr_node;
        struct fname        *extra_fname;
        loff_t                last_pos;
        __u32                curr_hash;
        __u32                curr_minor_hash;
        __u32                next_hash;
        u64                cookie;
        bool                initialized;
};

/*
 * First block number of group @group_no: the superblock's
 * s_first_data_block plus @group_no times the number of blocks per group.
 * The multiplication is carried out in ext4_fsblk_t.
 */
static inline ext4_fsblk_t
ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
{
        ext4_fsblk_t blocks_per_group = EXT4_BLOCKS_PER_GROUP(sb);
        ext4_fsblk_t group_offset = group_no * blocks_per_group;

        return group_offset +
                le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
}

/*
 * Special error return code only used by dx_probe() and its callers.
 *
 * Defined relative to MAX_ERRNO as -(MAX_ERRNO - 1).
 */
#define ERR_BAD_DX_DIR        (-(MAX_ERRNO - 1))

/*
 * htree levels for ext4
 *
 * ext4_dir_htree_level() returns EXT4_HTREE_LEVEL when the largedir
 * feature is present and EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
#define        EXT4_HTREE_LEVEL_COMPAT        2
#define        EXT4_HTREE_LEVEL        3

/*
 * Number of htree levels for @sb: EXT4_HTREE_LEVEL when the largedir
 * feature is enabled, EXT4_HTREE_LEVEL_COMPAT otherwise.
 */
static inline int ext4_dir_htree_level(struct super_block *sb)
{
        if (ext4_has_feature_largedir(sb))
                return EXT4_HTREE_LEVEL;

        return EXT4_HTREE_LEVEL_COMPAT;
}

/*
 * Timeout and state flag for lazy initialization inode thread.
 *
 * EXT4_LAZYINIT_QUIT and EXT4_LAZYINIT_RUNNING are distinct single-bit
 * values.  NOTE(review): presumably stored in ext4_lazy_init.li_state --
 * confirm against the users.
 */
#define EXT4_DEF_LI_WAIT_MULT                        10
#define EXT4_DEF_LI_MAX_START_DELAY                5
#define EXT4_LAZYINIT_QUIT                        0x0001
#define EXT4_LAZYINIT_RUNNING                        0x0002

/*
 * Lazy inode table initialization info
 */
struct ext4_lazy_init {
        unsigned long                li_state;        /* presumably EXT4_LAZYINIT_* bits -- confirm */
        struct list_head        li_request_list; /* presumably links ext4_li_request.lr_request -- confirm */
        struct mutex                li_list_mtx;        /* presumably guards li_request_list -- confirm */
};

/* Mode of a lazy-init request; stored in ext4_li_request.lr_mode. */
enum ext4_li_mode {
        EXT4_LI_MODE_PREFETCH_BBITMAP,
        EXT4_LI_MODE_ITABLE,
};

/*
 * One lazy-init request.
 * NOTE(review): the code that schedules these is not visible here; field
 * notes are limited to what the declarations show.
 */
struct ext4_li_request {
        struct super_block        *lr_super;        /* filesystem the request belongs to */
        enum ext4_li_mode        lr_mode;
        ext4_group_t                lr_first_not_zeroed;
        ext4_group_t                lr_next_group;
        struct list_head        lr_request;        /* list linkage */
        unsigned long                lr_next_sched;
        unsigned long                lr_timeout;
};

/*
 * A kobject paired with a completion.
 * NOTE(review): from the member name, f_kobj_unregister is presumably
 * completed when the kobject is released -- confirm against the sysfs code.
 */
struct ext4_features {
        struct kobject f_kobj;
        struct completion f_kobj_unregister;
};

/*
 * This structure will be used for multiple mount protection. It will be
 * written into the block number saved in the s_mmp_block field in the
 * superblock. Programs that check MMP should assume that if
 * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe
 * to use the filesystem, regardless of how old the timestamp is.
 *
 * All four constants end in the bytes 0x4D 0x4D 0x50 ("MMP") or one less;
 * EXT4_MMP_SEQ_MAX is exactly EXT4_MMP_SEQ_FSCK - 1.
 */
#define EXT4_MMP_MAGIC     0x004D4D50U /* ASCII for MMP */
#define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */
#define EXT4_MMP_SEQ_FSCK  0xE24D4D50U /* mmp_seq value when being fscked */
#define EXT4_MMP_SEQ_MAX   0xE24D4D4FU /* maximum valid mmp_seq value */

/*
 * Layout of the MMP block.  The declared members add up to 1024 bytes
 * (4 + 4 + 8 + 64 + 32 + 2 + 2 + 226 * 4 + 4), with the checksum in the
 * last four bytes.
 */
struct mmp_struct {
        __le32        mmp_magic;                /* Magic number for MMP */
        __le32        mmp_seq;                /* Sequence no. updated periodically */

        /*
         * mmp_time, mmp_nodename & mmp_bdevname are only used for information
         * purposes and do not affect the correctness of the algorithm
         */
        __le64        mmp_time;                /* Time last updated */
        char        mmp_nodename[64];        /* Node which last updated MMP block */
        char        mmp_bdevname[32];        /* Bdev which last updated MMP block */

        /*
         * mmp_check_interval is used to verify if the MMP block has been
         * updated on the block device. The value is updated based on the
         * maximum time to write the MMP block during an update cycle.
         */
        __le16        mmp_check_interval;

        __le16        mmp_pad1;                /* padding */
        __le32        mmp_pad2[226];                /* padding up to the checksum */
        __le32        mmp_checksum;                /* crc32c(uuid+mmp_block) */
};

/*
 * arguments passed to the mmp thread
 *
 * Bundles the two pointers the thread needs into a single object.
 */
struct mmpd_data {
        struct buffer_head *bh; /* bh from initial read_mmp_block() */
        struct super_block *sb;  /* super block of the fs */
};

/*
 * Check interval multiplier
 * The MMP block is written every update interval and initially checked every
 * update interval x the multiplier (the value is then adapted based on the
 * write latency). The reason is that writes can be delayed under load and we
 * don't want readers to incorrectly assume that the filesystem is no longer
 * in use.
 *
 * The UL suffix makes this (and the two limits below) unsigned long
 * constants.
 */
#define EXT4_MMP_CHECK_MULT                2UL

/*
 * Minimum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MIN_CHECK_INTERVAL        5UL

/*
 * Maximum interval for MMP checking in seconds.
 */
#define EXT4_MMP_MAX_CHECK_INTERVAL        300UL

/*
 * Function prototypes
 */

/*
 * Ok, these declarations are also in <linux/kernel.h> but none of the
 * ext4 source programs needs to include it so they are duplicated here.
 *
 * NORET_TYPE expands to nothing (just an empty comment), ATTRIB_NORET to
 * the noreturn function attribute, and NORET_AND to the token sequence
 * "noreturn," for use inside an attribute list.
 */
# define NORET_TYPE        /**/
# define ATTRIB_NORET        __attribute__((noreturn))
# define NORET_AND        noreturn,

/* bitmap.c */
/*
 * Only the signatures are visible here.
 * NOTE(review): judging by the names, ext4_count_free() counts free bits
 * in the first @numchars bytes of @bitmap, and the _csum_set() and
 * _csum_verify() pairs store and check the inode/block bitmap checksum for
 * the group descriptor @gdp using the bitmap buffer @bh.  The meaning of
 * the int returned by the verify functions is not visible here -- confirm
 * in bitmap.c.
 */
extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
void ext4_inode_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh);
int ext4_inode_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh);
void ext4_block_bitmap_csum_set(struct super_block *sb,
                                struct ext4_group_desc *gdp,
                                struct buffer_head *bh);
int ext4_block_bitmap_csum_verify(struct super_block *sb,
                                  struct ext4_group_desc *gdp,
                                  struct buffer_head *bh);

/* balloc.c */
/*
 * Block-allocation helpers; only the signatures are visible here.
 * Functions with out-parameters: ext4_get_group_no_and_offset() writes
 * through @blockgrpp/@offsetp, ext4_new_meta_blocks() through
 * @count/@errp, ext4_get_group_desc() through @bh and
 * ext4_should_retry_alloc() takes @retries by pointer.
 * NOTE(review): whether any of these pointers may be NULL is not visible
 * here -- check the definitions before relying on it.
 */
extern void ext4_get_group_no_and_offset(struct super_block *sb,
                                         ext4_fsblk_t blocknr,
                                         ext4_group_t *blockgrpp,
                                         ext4_grpblk_t *offsetp);
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
                                          ext4_fsblk_t block);

extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
                        ext4_group_t group);
extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
                                         ext4_fsblk_t goal,
                                         unsigned int flags,
                                         unsigned long *count,
                                         int *errp);
extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
                                    s64 nclusters, unsigned int flags);
extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *);
extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
                                                   ext4_group_t group);
extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);

/* Block-bitmap readers: a no-wait variant, a wait step, and a combined read. */
extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
                                                ext4_group_t block_group,
                                                bool ignore_locked);
extern int ext4_wait_block_bitmap(struct super_block *sb,
                                  ext4_group_t block_group,
                                  struct buffer_head *bh);
extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
                                                  ext4_group_t block_group);
extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
                                              ext4_group_t block_group,
                                              struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);

/*
 * Case-insensitive filename helpers.  With CONFIG_UNICODE the setup
 * function is defined out of line and the free helper releases
 * fname->cf_name.name; without it both collapse to no-op stubs so that
 * callers need no conditionals of their own.
 */
#if IS_ENABLED(CONFIG_UNICODE)
/*
 * NOTE(review): presumably fills in fname->cf_name, which the helper
 * below frees -- confirm against the definition.
 */
extern int ext4_fname_setup_ci_filename(struct inode *dir,
                                        const struct qstr *iname,
                                        struct ext4_filename *fname);

/* Free the case-folded name and clear the pointer so a second call is harmless. */
static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
{
        kfree(fname->cf_name.name);
        fname->cf_name.name = NULL;
}
#else
/* Stub: nothing to set up; always reports success (0). */
static inline int ext4_fname_setup_ci_filename(struct inode *dir,
                                               const struct qstr *iname,
                                               struct ext4_filename *fname)
{
        return 0;
}

/* Stub: struct ext4_filename has no cf_name member in this configuration. */
static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname)
{
}
#endif

/* ext4 encryption related stuff goes here (crypto.c) */
#ifdef CONFIG_FS_ENCRYPTION
/*
 * Declarations used when CONFIG_FS_ENCRYPTION is enabled.  The #else
 * branch below supplies inline stand-ins with the same signatures for
 * the four functions (there is no stand-in for ext4_cryptops).
 */
extern const struct fscrypt_operations ext4_cryptops;

int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname,
                              int lookup, struct ext4_filename *fname);

int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry,
                              struct ext4_filename *fname);

void ext4_fname_free_filename(struct ext4_filename *fname);

int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg);

#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
                                            const struct qstr *iname,
                                            int lookup,
                                            struct ext4_filename *fname)
{
        fname->usr_fname = iname;
        fname->disk_name.name = (unsigned char *) iname->name;
        fname->disk_name.len = iname->len;

        return ext4_fname_setup_ci_filename(dir, iname, fname);
}

/*
 * !CONFIG_FS_ENCRYPTION variant: forwards to ext4_fname_setup_filename()
 * with the dentry's name and lookup set to 1, returning its result.
 */
static inline int ext4_fname_prepare_lookup(struct inode *dir,
                                            struct dentry *dentry,
                                            struct ext4_filename *fname)
{
        return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname);
}

/*
 * !CONFIG_FS_ENCRYPTION variant: only the case-folded name can need
 * freeing, so this just calls ext4_fname_free_ci_filename().
 */
static inline void ext4_fname_free_filename(struct ext4_filename *fname)
{
        ext4_fname_free_ci_filename(fname);
}

/*
 * !CONFIG_FS_ENCRYPTION variant: always fails with -EOPNOTSUPP; neither
 * argument is touched.
 */
static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp,
                                                   void __user *arg)
{
        return -EOPNOTSUPP;
}
#endif /* !CONFIG_FS_ENCRYPTION */

/* dir.c */
/*
 * __ext4_check_dir_entry() takes the calling function's name and line
 * number as its first two arguments.  Use the ext4_check_dir_entry()
 * wrapper below, which supplies __func__ and __LINE__ and wraps the
 * result in unlikely().
 */
extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *,
                                  struct file *,
                                  struct ext4_dir_entry_2 *,
                                  struct buffer_head *, char *, int,
                                  unsigned int);
#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \
        unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \
                                (de), (bh), (buf), (size), (offset)))
/* Remaining directory helpers; only the signatures are visible here. */
extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
                                __u32 minor_hash,
                                struct ext4_dir_entry_2 *dirent,
                                struct fscrypt_str *ent_name);
extern void ext4_htree_free_dir_info(struct dir_private_info *p);
extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh,
                             void *buf, int buf_size,
                             struct ext4_filename *fname,
                             struct ext4_dir_entry_2 **dest_de);
void ext4_insert_dentry(struct inode *dir, struct inode *inode,
                        struct ext4_dir_entry_2 *de,
                        int buf_size,
                        struct ext4_filename *fname);
/*
 * Drop a stale EXT4_INODE_INDEX flag: if the filesystem does not have the
 * dir_index feature but @inode still carries the flag, clear it.  A
 * one-time warning is emitted when that happens on a filesystem with
 * metadata checksums.
 */
static inline void ext4_update_dx_flag(struct inode *inode)
{
        if (ext4_has_feature_dir_index(inode->i_sb))
                return;
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INDEX))
                return;

        /* ext4_iget() should have caught this... */
        WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
        ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
/*
 * DT_* value for each on-disk file type, indexed by the EXT4_FT_* code
 * (EXT4_FT_UNKNOWN == 0 through EXT4_FT_SYMLINK == 7).  The table has
 * eight entries, matching EXT4_FT_MAX; get_dtype() is the accessor.
 */
static const unsigned char ext4_filetype_table[] = {
        DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
};

/*
 * Map an on-disk EXT4_FT_* directory file type to the DT_* value stored
 * in ext4_filetype_table[].
 *
 * Returns DT_UNKNOWN when the filesystem lacks the filetype feature or
 * when @filetype is outside [0, EXT4_FT_MAX).  @filetype is a signed int,
 * so the lower bound is checked explicitly: a negative value must not be
 * used as a table index.
 */
static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
        if (!ext4_has_feature_filetype(sb))
                return DT_UNKNOWN;
        if (filetype < 0 || filetype >= EXT4_FT_MAX)
                return DT_UNKNOWN;

        return ext4_filetype_table[filetype];
}
/* Prototype only; the definition is not visible here. */
extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
                             void *buf, int buf_size);

/* fsync.c */
/*
 * NOTE(review): the parameter list (file, two loff_t, int) looks like the
 * file_operations fsync callback signature -- confirm at the user.
 */
extern int ext4_sync_file(struct file *, loff_t, loff_t, int);

/* hash.c */
/*
 * Takes the name as a pointer/length pair and a struct dx_hash_info.
 * NOTE(review): presumably reads hash_version/seed from @hinfo and stores
 * hash/minor_hash back into it -- confirm in hash.c.
 */
extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
                          struct dx_hash_info *hinfo);

/* ialloc.c */
/*
 * __ext4_new_inode() is normally reached through the ext4_new_inode() and
 * ext4_new_inode_start_handle() wrapper macros, which fix several of its
 * eleven arguments.
 */
extern int ext4_mark_inode_used(struct super_block *sb, int ino);
extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *,
                                      struct inode *, umode_t,
                                      const struct qstr *qstr, __u32 goal,
                                      uid_t *owner, __u32 i_flags,
                                      int handle_type, unsigned int line_no,
                                      int nblocks);

/*
 * Convenience wrappers around __ext4_new_inode().
 *
 * ext4_new_inode() forwards the caller's @handle and @i_flags, passes
 * &nop_mnt_idmap as the idmap and 0 for handle_type, line_no and nblocks.
 *
 * ext4_new_inode_start_handle() forwards @idmap, passes a NULL handle and
 * 0 for i_flags, and supplies @type, __LINE__ and @nblocks as
 * handle_type, line_no and nblocks.
 *
 * Every macro argument is parenthesised in the expansion.
 */
#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags)          \
        __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr),      \
                         (goal), (owner), (i_flags), 0, 0, 0)
#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \
                                    type, nblocks)                    \
        __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \
                         0, (type), __LINE__, (nblocks))


/*
 * Remaining inode-allocation helpers; only the signatures are visible
 * here, so semantics must be checked against the definitions.
 */
extern void ext4_free_inode(handle_t *, struct inode *);
extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
extern unsigned long ext4_count_free_inodes(struct super_block *);
extern unsigned long ext4_count_dirs(struct super_block *);
extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
extern int ext4_init_inode_table(struct super_block *sb,
                                 ext4_group_t group, int barrier);
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);

/* fast_commit.c */
/*
 * Fast-commit interface; only the signatures are visible here.
 * The ext4_fc_track_unlink/link/create() functions each have a
 * double-underscore variant that additionally takes the inode explicitly.
 * NOTE(review): presumably the plain versions derive the inode from the
 * dentry -- confirm in fast_commit.c.
 */
int ext4_fc_info_show(struct seq_file *seq, void *v);
void ext4_fc_init(struct super_block *sb, journal_t *journal);
void ext4_fc_init_inode(struct inode *inode);
void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start,
                         ext4_lblk_t end);
void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
        struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
                            struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
void ext4_fc_start_update(struct inode *inode);
void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
int ext4_fc_commit(journal_t *journal, tid_t commit_tid);
/* Init/destroy pair for the fast-commit dentry cache; the init is __init. */
int __init ext4_fc_init_dentry_cache(void);
void ext4_fc_destroy_dentry_cache(void);
int ext4_fc_record_regions(struct super_block *sb, int ino,
                           ext4_lblk_t lblk, ext4_fsblk_t pblk,
                           int len, int replay);

/* mballoc.c */
/*
 * Multiblock-allocator interface; only the declarations are visible here.
 * Init/teardown pairs: ext4_mb_init()/ext4_mb_release() take a super
 * block, ext4_init_mballoc()/ext4_exit_mballoc() take no arguments (the
 * former is __init).
 */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
                                     ext4_group_t group,
                                     unsigned int nr, int *cnt);
extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
                                  unsigned int nr);

extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
                             struct buffer_head *bh, ext4_fsblk_t block,
                             unsigned long count, int flags);
extern int ext4_mb_alloc_groupinfo(struct super_block *sb,
                                   ext4_group_t ngroups);
extern int ext4_mb_add_groupinfo(struct super_block *sb,
                ext4_group_t i, struct ext4_group_desc *desc);
extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
                                ext4_fsblk_t block, unsigned long count);
extern int ext4_trim_fs(struct super_block *, struct fstrim_range *);
extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid);
extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block,
                            int len, bool state);
/*
 * True when allocation criterion @cr is CR_GOAL_LEN_SLOW or any criterion
 * ordered after it in enum criteria.
 */
static inline bool ext4_mb_cr_expensive(enum criteria cr)
{
        if (cr < CR_GOAL_LEN_SLOW)
                return false;
        return true;
}

/* inode.c */
/*
 * Inode and block-mapping helpers; only the signatures are visible here.
 * ext4_walk_page_buffers() takes a callback @fn that receives the handle,
 * the inode and one buffer_head; do_journal_get_write_access() has exactly
 * that callback signature.
 */
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
                         struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
                     bool wait, struct buffer_head **bhs);
int ext4_get_block_unwritten(struct inode *inode, sector_t iblock,
                             struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create);
int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                           struct buffer_head *bh, int create);
int ext4_walk_page_buffers(handle_t *handle,
                           struct inode *inode,
                           struct buffer_head *head,
                           unsigned from,
                           unsigned to,
                           int *partial,
                           int (*fn)(handle_t *handle, struct inode *inode,
                                     struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
                                struct buffer_head *bh);
/*
 * NOTE(review): the users of these two values are not visible here; from
 * the names they look like mode/result codes for the delayed-allocation
 * write path -- confirm before reusing them.
 */
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA         2

/*
 * Flags for __ext4_iget() and ext4_iget().  Apart from EXT4_IGET_NORMAL
 * (0) every value is a distinct single bit, so they can be OR-ed together.
 */
typedef enum {
        EXT4_IGET_NORMAL =        0,
        EXT4_IGET_SPECIAL =        0x0001, /* OK to iget a system inode */
        EXT4_IGET_HANDLE =         0x0002,        /* Inode # is from a handle */
        EXT4_IGET_BAD =                0x0004, /* Allow to iget a bad inode */
        EXT4_IGET_EA_INODE =        0x0008        /* Inode should contain an EA value */
} ext4_iget_flags;

/*
 * __ext4_iget() takes the calling function's name and line number as its
 * last two arguments; the ext4_iget() macro fills them in with __func__
 * and __LINE__ of the call site.
 */
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
                                 ext4_iget_flags flags, const char *function,
                                 unsigned int line);

#define ext4_iget(sb, ino, flags) \
        __ext4_iget((sb), (ino), (flags), __func__, __LINE__)

/*
 * Remaining inode.c entry points; only the signatures are visible here.
 * NOTE(review): several of these (write_inode, setattr, getattr,
 * evict_inode, dirty_inode, page_mkwrite) have parameter lists that look
 * like VFS/MM callback signatures -- confirm at the operation tables.
 */
extern int  ext4_write_inode(struct inode *, struct writeback_control *);
extern int  ext4_setattr(struct mnt_idmap *, struct dentry *,
                         struct iattr *);
extern u32  ext4_dio_alignment(struct inode *inode);
extern int  ext4_getattr(struct mnt_idmap *, const struct path *,
                         struct kstat *, u32, unsigned int);
extern void ext4_evict_inode(struct inode *);
extern void ext4_clear_inode(struct inode *);
extern int  ext4_file_getattr(struct mnt_idmap *, const struct path *,
                              struct kstat *, u32, unsigned int);
extern void ext4_dirty_inode(struct inode *, int);
extern int ext4_change_inode_journal_flag(struct inode *, int);
extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino,
                          struct ext4_iloc *iloc);
extern int ext4_inode_attach_jinode(struct inode *inode);
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
extern int ext4_truncate_page_cache_block_range(struct inode *inode,
                                                loff_t start, loff_t end);
extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
extern void ext4_da_release_space(struct inode *inode, int to_free);
extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
                              ext4_fsblk_t pblk, ext4_lblk_t len);

/* indirect.c */
/*
 * The ext4_ind_ prefix and the indirect.c section marker indicate these
 * are the indirect-block-mapped counterparts of the block mapping and
 * truncate paths; only the signatures are visible here.
 */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
                                struct ext4_map_blocks *map, int flags);
extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
extern void ext4_ind_truncate(handle_t *, struct inode *inode);
extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode,
                                 ext4_lblk_t start, ext4_lblk_t end);

/* ioctl.c */
/*
 * ioctl entry points and related helpers; only the signatures are visible
 * here.  ext4_ioctl() and ext4_compat_ioctl() share one parameter list
 * (file, command, argument).
 */
extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
int ext4_fileattr_set(struct mnt_idmap *idmap,
                      struct dentry *dentry, struct fileattr *fa);
int ext4_fileattr_get(struct dentry *dentry, struct fileattr *fa);
extern void ext4_reset_inode_seed(struct inode *inode);
int ext4_update_overhead(struct super_block *sb, bool force);
int ext4_force_shutdown(struct super_block *sb, u32 flags);

/*
 * migrate.c
 *
 * NOTE(review): presumably convert an inode to (ext4_ext_migrate) or
 * from (ext4_ind_migrate) the extent-mapped format -- confirm in
 * migrate.c.
 */
extern int ext4_ext_migrate(struct inode *);
extern int ext4_ind_migrate(struct inode *inode);

/*
 * namei.c
 *
 * Directory block helpers that are also used outside namei.c.
 */
extern int ext4_init_new_dir(handle_t *handle, struct inode *dir,
                             struct inode *inode);
extern int ext4_dirblock_csum_verify(struct inode *inode,
                                     struct buffer_head *bh);
/* *next_hash is an output parameter. */
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
                                __u32 start_minor_hash, __u32 *next_hash);
/*
 * Operates on the buf_size bytes at search_buf.  NOTE(review): the
 * matching entry is presumably returned through *res_dir -- confirm.
 */
extern int ext4_search_dir(struct buffer_head *bh,
                           char *search_buf,
                           int buf_size,
                           struct inode *dir,
                           struct ext4_filename *fname,
                           unsigned int offset,
                           struct ext4_dir_entry_2 **res_dir);
/* de_del is the entry to delete; entry_buf/buf_size describe the buffer. */
extern int ext4_generic_delete_entry(struct inode *dir,
                                     struct ext4_dir_entry_2 *de_del,
                                     struct buffer_head *bh,
                                     void *entry_buf,
                                     int buf_size,
                                     int csum_size);
extern bool ext4_empty_dir(struct inode *inode);

/*
 * resize.c
 *
 * Online resize entry points.  n_blocks_count is expressed as an
 * ext4_fsblk_t block count.
 */
extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
                                struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
                                struct ext4_super_block *es,
                                ext4_fsblk_t n_blocks_count);
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/*
 * three, five and seven are in/out cursors.  NOTE(review): presumably
 * they walk the powers of 3, 5 and 7 that hold backup groups -- confirm.
 */
extern unsigned int ext4_list_backups(struct super_block *sb,
                                      unsigned int *three, unsigned int *five,
                                      unsigned int *seven);

/*
 * super.c
 *
 * Buffer-head read helpers, superblock checksum helpers and a few
 * mount-time utilities.
 */
extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
                                         sector_t block, blk_opf_t op_flags);
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
                                                   sector_t block);
/*
 * end_io is the completion callback.  NOTE(review): simu_fail presumably
 * asks for a simulated read failure (fault injection) -- confirm.
 */
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
                                bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
                        bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
/* Returns the checksum as a little-endian (__le32) value. */
extern __le32 ext4_superblock_csum(struct super_block *sb,
                                   struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
                                    ext4_group_t ngroup);
/* nbuf is a caller-supplied scratch buffer declared as 16 bytes. */
extern const char *ext4_decode_error(struct super_block *sb, int errno,
                                     char nbuf[16]);
extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
                                             ext4_group_t block_group,
                                             unsigned int flags);
extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb,
                                              ext4_group_t block_group);

/*
 * Low-level reporting functions.  Call them through the wrapper macros
 * defined below, which fill in the function name and line number.
 *
 * __printf(m, n) declares parameter m to be a printf-style format string
 * whose variadic arguments start at parameter n, so the compiler can
 * check the format against its arguments.
 */
extern __printf(7, 8)
void __ext4_error(struct super_block *, const char *, unsigned int, bool,
                  int, __u64, const char *, ...);
extern __printf(6, 7)
void __ext4_error_inode(struct inode *, const char *, unsigned int,
                        ext4_fsblk_t, int, const char *, ...);
extern __printf(5, 6)
void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
                     const char *, ...);
/* Not variadic: takes function name, line number and an error code. */
extern void __ext4_std_error(struct super_block *, const char *,
                             unsigned int, int);
extern __printf(4, 5)
void __ext4_warning(struct super_block *, const char *, unsigned int,
                    const char *, ...);
extern __printf(4, 5)
void __ext4_warning_inode(const struct inode *inode, const char *function,
                          unsigned int line, const char *fmt, ...);
extern __printf(3, 4)
void __ext4_msg(struct super_block *, const char *, const char *, ...);
extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
                           const char *, unsigned int, const char *);
/* Unlike the others, function name and line come first here. */
extern __printf(7, 8)
void __ext4_grp_locked_error(const char *, unsigned int,
                             struct super_block *, ext4_group_t,
                             unsigned long, ext4_fsblk_t,
                             const char *, ...);

/*
 * Call-site wrappers: each one captures __func__ and __LINE__ where it
 * is expanded.
 */

/* Report an inode error with block 0 (and, via ext4_error_inode, error 0). */
#define EXT4_ERROR_INODE(inode, fmt, a...) \
        ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a)

/* As EXT4_ERROR_INODE, but with an explicit error code; block is 0. */
#define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...)                        \
        __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a)

/* Report an inode error with both an explicit block and error code. */
#define ext4_error_inode_block(inode, block, err, fmt, a...)                \
        __ext4_error_inode((inode), __func__, __LINE__, (block), (err),        \
                           (fmt), ## a)

/* Report an error against an open file and the given block. */
#define EXT4_ERROR_FILE(file, block, fmt, a...)                                \
        ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a)

/*
 * Calls __ext4_error() with its bool argument set to true, whereas
 * ext4_error() and ext4_error_err() pass false.
 */
#define ext4_abort(sb, err, fmt, a...)                                        \
        __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a)

#ifdef CONFIG_PRINTK

#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
        __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
        __ext4_error_inode((inode), (func), (line), (block),                 \
                           (err), (fmt), ##__VA_ARGS__)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
        __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
#define ext4_error(sb, fmt, ...)                                        \
        __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt),        \
                ##__VA_ARGS__)
#define ext4_error_err(sb, err, fmt, ...)                                \
        __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt),        \
                ##__VA_ARGS__)
#define ext4_warning(sb, fmt, ...)                                        \
        __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_warning_inode(inode, fmt, ...)                                \
        __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__)
#define ext4_msg(sb, level, fmt, ...)                                \
        __ext4_msg(sb, level, fmt, ##__VA_ARGS__)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
        __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
                                fmt, ##__VA_ARGS__)

#else

#define ext4_error_inode(inode, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, 0, " ");                \
} while (0)
#define ext4_error_inode_err(inode, func, line, block, err, fmt, ...)        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_inode(inode, "", 0, block, err, " ");                \
} while (0)
#define ext4_error_file(file, func, line, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error_file(file, "", 0, block, " ");                        \
} while (0)
#define ext4_error(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, false, 0, 0, " ");                        \
} while (0)
#define ext4_error_err(sb, err, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_error(sb, "", 0, false, err, 0, " ");                        \
} while (0)
#define ext4_warning(sb, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning(sb, "", 0, " ");                                        \
} while (0)
#define ext4_warning_inode(inode, fmt, ...)                                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_warning_inode(inode, "", 0, " ");                        \
} while (0)
#define ext4_msg(sb, level, fmt, ...)                                        \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                        \
        __ext4_msg(sb, "", " ");                                        \
} while (0)
#define dump_mmp_msg(sb, mmp, msg)                                        \
        __dump_mmp_msg(sb, mmp, "", 0, "")
#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)                \
do {                                                                        \
        no_printk(fmt, ##__VA_ARGS__);                                \
        __ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");        \
} while (0)

#endif

/*
 * Getters and setters for fields of an on-disk group descriptor.
 * Locations are returned/taken as ext4_fsblk_t, counters as __u32.
 * NOTE(review): presumably these hide the lo/hi split of the on-disk
 * fields -- confirm against the definitions.
 */
extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern ext4_fsblk_t ext4_inode_table(struct super_block *sb,
                                     struct ext4_group_desc *bg);
extern __u32 ext4_free_group_clusters(struct super_block *sb,
                                      struct ext4_group_desc *bg);
extern __u32 ext4_free_inodes_count(struct super_block *sb,
                                 struct ext4_group_desc *bg);
extern __u32 ext4_used_dirs_count(struct super_block *sb,
                                struct ext4_group_desc *bg);
extern __u32 ext4_itable_unused_count(struct super_block *sb,
                                   struct ext4_group_desc *bg);
extern void ext4_block_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_bitmap_set(struct super_block *sb,
                                  struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_inode_table_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, ext4_fsblk_t blk);
extern void ext4_free_group_clusters_set(struct super_block *sb,
                                         struct ext4_group_desc *bg,
                                         __u32 count);
extern void ext4_free_inodes_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
/* Group descriptor checksum: verify / (re)compute for the given group. */
extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                       struct ext4_group_desc *gdp);
extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
                                     struct ext4_group_desc *gdp);
extern int ext4_register_li_request(struct super_block *sb,
                                    ext4_group_t first_not_zeroed);

/*
 * Returns 1 when the filesystem has either the gdt_csum or the
 * metadata_csum feature, 0 otherwise.
 */
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
        if (ext4_has_feature_gdt_csum(sb))
                return 1;

        return ext4_has_feature_metadata_csum(sb) ? 1 : 0;
}

/*
 * Read a superblock quantity stored as the little-endian 32-bit pair
 * name##_lo / name##_hi.  The high half is used only when the superblock
 * has EXT4_FEATURE_INCOMPAT_64BIT set; otherwise the result is just the
 * low 32 bits.
 *
 * @es may be any pointer expression: every use is parenthesized.  It is
 * still evaluated more than once, so it must not have side effects.
 */
#define ext4_read_incompat_64bit_val(es, name) \
        (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \
                ? (ext4_fsblk_t)le32_to_cpu((es)->name##_hi) << 32 : 0) | \
                le32_to_cpu((es)->name##_lo))

/* Value of the superblock's s_blocks_count_{lo,hi} pair. */
static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nr_blocks;

        nr_blocks = ext4_read_incompat_64bit_val(es, s_blocks_count);
        return nr_blocks;
}

/* Value of the superblock's s_r_blocks_count_{lo,hi} pair. */
static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nr_reserved;

        nr_reserved = ext4_read_incompat_64bit_val(es, s_r_blocks_count);
        return nr_reserved;
}

/* Value of the superblock's s_free_blocks_count_{lo,hi} pair. */
static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es)
{
        ext4_fsblk_t nr_free;

        nr_free = ext4_read_incompat_64bit_val(es, s_free_blocks_count);
        return nr_free;
}

/*
 * Store @blk into the superblock's s_blocks_count pair: low 32 bits in
 * _lo, the bits above them in _hi, both little-endian.
 */
static inline void ext4_blocks_count_set(struct ext4_super_block *es,
                                         ext4_fsblk_t blk)
{
        const u32 low = (u32)blk;
        const u32 high = blk >> 32;

        es->s_blocks_count_lo = cpu_to_le32(low);
        es->s_blocks_count_hi = cpu_to_le32(high);
}

/*
 * Store @blk into the superblock's s_free_blocks_count pair: low 32 bits
 * in _lo, the bits above them in _hi, both little-endian.
 */
static inline void ext4_free_blocks_count_set(struct ext4_super_block *es,
                                              ext4_fsblk_t blk)
{
        const u32 low = (u32)blk;
        const u32 high = blk >> 32;

        es->s_free_blocks_count_lo = cpu_to_le32(low);
        es->s_free_blocks_count_hi = cpu_to_le32(high);
}

/*
 * Store @blk into the superblock's s_r_blocks_count pair: low 32 bits in
 * _lo, the bits above them in _hi, both little-endian.
 */
static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
                                           ext4_fsblk_t blk)
{
        const u32 low = (u32)blk;
        const u32 high = blk >> 32;

        es->s_r_blocks_count_lo = cpu_to_le32(low);
        es->s_r_blocks_count_hi = cpu_to_le32(high);
}

/*
 * Size recorded in an on-disk inode.  i_size_high contributes the upper
 * 32 bits only for regular files, or for any inode when the filesystem
 * has the largedir feature; otherwise only i_size_lo is used.
 */
static inline loff_t ext4_isize(struct super_block *sb,
                                struct ext4_inode *raw_inode)
{
        loff_t size = le32_to_cpu(raw_inode->i_size_lo);

        if (ext4_has_feature_largedir(sb) ||
            S_ISREG(le16_to_cpu(raw_inode->i_mode)))
                size |= (loff_t)le32_to_cpu(raw_inode->i_size_high) << 32;

        return size;
}

/*
 * Store @i_size into an on-disk inode: low 32 bits in i_size_lo, the
 * bits above them in i_size_high, both little-endian.
 */
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
{
        const u32 low = i_size;
        const u32 high = i_size >> 32;

        raw_inode->i_size_lo = cpu_to_le32(low);
        raw_inode->i_size_high = cpu_to_le32(high);
}

/*
 * Reading s_groups_count requires using smp_rmb() afterwards.  See
 * the locking protocol documented in the comments of ext4_group_add()
 * in resize.c
 */
static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
{
        ext4_group_t        ngroups = EXT4_SB(sb)->s_groups_count;

        smp_rmb();
        return ngroups;
}

/*
 * Map a block group number to its flex group: the group number shifted
 * right by s_log_groups_per_flex.
 */
static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                             ext4_group_t block_group)
{
        ext4_group_t flex_group = block_group;

        flex_group >>= sbi->s_log_groups_per_flex;
        return flex_group;
}

/*
 * Number of block groups per flex group: 2^s_log_groups_per_flex.
 *
 * The shift is done on an unsigned constant so that a log value of 31
 * yields 0x80000000 instead of shifting into the sign bit of an int,
 * which is undefined behaviour in C.
 */
static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
{
        return 1U << sbi->s_log_groups_per_flex;
}

/*
 * Report @errno through __ext4_std_error() if it is non-zero, tagging
 * the report with the caller's function name and line number.
 *
 * @errno is evaluated exactly once, so an expression with side effects
 * (for instance a function call) is safe to pass.
 */
#define ext4_std_error(sb, errno)                                \
do {                                                                \
        int __ext4_std_errno = (errno);                                \
                                                                \
        if (__ext4_std_errno)                                        \
                __ext4_std_error((sb), __func__, __LINE__,        \
                                 __ext4_std_errno);                \
} while (0)

#ifdef CONFIG_SMP
/*
 * Each CPU can accumulate up to percpu_counter_batch clusters in its
 * local counter, so we need to make sure there are more free clusters
 * than percpu_counter_batch * nr_cpu_ids.  The watermark is four times
 * that product to leave some additional margin.
 */
#define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids))
#else
/* NOTE(review): presumably no per-CPU slack is needed without SMP. */
#define EXT4_FREECLUSTERS_WATERMARK 0
#endif

/*
 * Raise i_disksize to @newsize if it is currently smaller; it is never
 * lowered here.  The update is made under i_data_sem held for write.
 * The caller must hold i_rwsem to avoid races with truncate; for regular
 * files a missing lock triggers a one-time warning.
 */
static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode));

        down_write(&ei->i_data_sem);
        if (ei->i_disksize < newsize)
                WRITE_ONCE(ei->i_disksize, newsize);
        up_write(&ei->i_data_sem);
}

/*
 * Grow i_size and/or i_disksize to @newsize where they are smaller.
 * Requires i_rwsem to avoid races with truncate.
 *
 * Returns a bit mask: bit 0 (value 1) is set when i_size was updated,
 * bit 1 (value 2) when the i_disksize update was requested.
 */
static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
{
        int mask = 0;

        if (inode->i_size < newsize) {
                i_size_write(inode, newsize);
                mask |= 1;
        }

        if (EXT4_I(inode)->i_disksize < newsize) {
                ext4_update_i_disksize(inode, newsize);
                mask |= 2;
        }

        return mask;
}

/* offset and len are loff_t values describing the range to be punched. */
int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
                                      loff_t len);

/*
 * In-memory per-block-group information.  bb_state is the word the
 * EXT4_MB_GRP_* macros below operate on, using the
 * EXT4_GROUP_INFO_*_BIT bit numbers.
 */
struct ext4_group_info {
        unsigned long   bb_state;
#ifdef AGGRESSIVE_CHECK
        unsigned long        bb_check_counter;
#endif
        struct rb_root  bb_free_root;
        ext4_grpblk_t        bb_first_free;        /* first free block */
        ext4_grpblk_t        bb_free;        /* total free blocks */
        ext4_grpblk_t        bb_fragments;        /* nr of freespace fragments */
        int                bb_avg_fragment_size_order;        /* order of average
                                                           fragment in BG */
        ext4_grpblk_t        bb_largest_free_order;/* order of largest frag in BG */
        ext4_group_t        bb_group;        /* Group number */
        struct          list_head bb_prealloc_list;
#ifdef DOUBLE_CHECK
        void            *bb_bitmap;
#endif
        struct rw_semaphore alloc_sem;
        struct list_head bb_avg_fragment_size_node;
        struct list_head bb_largest_free_order_node;
        /* Flexible array member: must stay last; sized at allocation time. */
        ext4_grpblk_t        bb_counters[];        /* Nr of free power-of-two-block
                                         * regions, index is order.
                                         * bb_counters[3] = 5 means
                                         * 5 free 8-block regions. */
};

/*
 * Bit numbers within bb_state.  The two *_CORRUPT constants without the
 * _BIT suffix are the corresponding bit masks.
 */
#define EXT4_GROUP_INFO_NEED_INIT_BIT                0
#define EXT4_GROUP_INFO_WAS_TRIMMED_BIT                1
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT        2
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT        3
#define EXT4_GROUP_INFO_BBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_IBITMAP_CORRUPT                \
        (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT)
#define EXT4_GROUP_INFO_BBITMAP_READ_BIT        4

/* Test helpers; grp is a pointer to a structure with a bb_state member. */
#define EXT4_MB_GRP_NEED_INIT(grp)        \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)        \
        (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))

/* Test / set / clear the WAS_TRIMMED bit; test-and-set the BBITMAP_READ bit. */
#define EXT4_MB_GRP_WAS_TRIMMED(grp)        \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_SET_TRIMMED(grp)        \
        (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_CLEAR_TRIMMED(grp)        \
        (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
#define EXT4_MB_GRP_TEST_AND_SET_READ(grp)        \
        (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state)))

/*
 * Bounds for the s_lock_busy contention counter: ext4_lock_group() stops
 * incrementing it at EXT4_MAX_CONTENTION, and ext4_fs_is_busy() reports
 * "busy" once it exceeds EXT4_CONTENTION_THRESHOLD.
 */
#define EXT4_MAX_CONTENTION                8
#define EXT4_CONTENTION_THRESHOLD        2

/*
 * Return the spinlock that bgl_lock_ptr() selects for @group out of the
 * superblock's s_blockgroup_lock.
 */
static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
                                              ext4_group_t group)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        return bgl_lock_ptr(sbi->s_blockgroup_lock, group);
}

/*
 * Returns true if the filesystem is busy enough that attempts to
 * access the block group locks have run into contention, i.e. the
 * s_lock_busy counter is above EXT4_CONTENTION_THRESHOLD.
 */
static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi)
{
        int contention = atomic_read(&sbi->s_lock_busy);

        return contention > EXT4_CONTENTION_THRESHOLD;
}

/*
 * Take the spinlock of block group @group, keeping the s_lock_busy
 * contention counter up to date along the way.
 */
static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        if (spin_trylock(lock)) {
                /*
                 * Uncontended: we got the lock straight away, so decay
                 * the contention counter (never below zero).
                 */
                atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0);
                return;
        }

        /*
         * Contended: bump the counter (saturating at
         * EXT4_MAX_CONTENTION) and then wait for the lock.
         */
        atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION);
        spin_lock(lock);
}

/* Release the spinlock of block group @group. */
static inline void ext4_unlock_group(struct super_block *sb,
                                        ext4_group_t group)
{
        spinlock_t *lock = ext4_group_lock_ptr(sb, group);

        spin_unlock(lock);
}

#ifdef CONFIG_QUOTA
static inline bool ext4_quota_capable(struct super_block *sb)
{
        return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb));
}

static inline bool ext4_is_quota_journalled(struct super_block *sb)
{
        struct ext4_sb_info *sbi = EXT4_SB(sb);

        return (ext4_has_feature_quota(sb) ||
                sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]);
}
/* Only declared when CONFIG_QUOTA is enabled. */
int ext4_enable_quotas(struct super_block *sb);
#endif

/*
 * Block validity checking
 *
 * Both wrappers call ext4_check_blockref() with the caller's function
 * name and line number, a pointer to __le32 values and an unsigned
 * count (presumably the number of entries to check -- confirm).
 */

/* Check the contents of buffer @bh: EXT4_ADDR_PER_BLOCK() entries. */
#define ext4_check_indirect_blockref(inode, bh)                                \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            (__le32 *)(bh)->b_data,                        \
                            EXT4_ADDR_PER_BLOCK((inode)->i_sb))

/* Check the first EXT4_NDIR_BLOCKS entries of the inode's i_data array. */
#define ext4_ind_check_inode(inode)                                        \
        ext4_check_blockref(__func__, __LINE__, inode,                        \
                            EXT4_I(inode)->i_data,                        \
                            EXT4_NDIR_BLOCKS)

/*
 * Inodes and files operations
 *
 * Operation tables and entry points, grouped by the source file the
 * section comments attribute them to.
 */

/* dir.c */
extern const struct file_operations ext4_dir_operations;

/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
/* Has the (file, offset, whence) shape of a file_operations llseek handler. */
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);

/*
 * inline.c
 *
 * Inline-data support.  Several functions report through an int
 * *has_inline_data (or *has_inline) output parameter; NOTE(review):
 * presumably it tells the caller whether the inode really had inline
 * data so it can fall back to the block-based path -- confirm.
 */
extern int ext4_get_max_inline_size(struct inode *inode);
extern int ext4_find_inline_data_nolock(struct inode *inode);
extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode);

int ext4_readpage_inline(struct inode *inode, struct folio *folio);
/* The folio is handed back through *foliop. */
extern int ext4_try_to_write_inline_data(struct address_space *mapping,
                                         struct inode *inode,
                                         loff_t pos, unsigned len,
                                         struct folio **foliop);
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
                               unsigned copied, struct folio *folio);
/* NOTE(review): da presumably selects the delayed-allocation variant. */
extern int ext4_generic_write_inline_data(struct address_space *mapping,
                                          struct inode *inode,
                                          loff_t pos, unsigned len,
                                          struct folio **foliop,
                                          void **fsdata, bool da);
extern int ext4_try_add_inline_entry(handle_t *handle,
                                     struct ext4_filename *fname,
                                     struct inode *dir, struct inode *inode);
extern int ext4_try_create_inline_dir(handle_t *handle,
                                      struct inode *parent,
                                      struct inode *inode);
extern int ext4_read_inline_dir(struct file *filp,
                                struct dir_context *ctx,
                                int *has_inline_data);
extern int ext4_inlinedir_to_tree(struct file *dir_file,
                                  struct inode *dir, ext4_lblk_t block,
                                  struct dx_hash_info *hinfo,
                                  __u32 start_hash, __u32 start_minor_hash,
                                  int *has_inline_data);
extern struct buffer_head *ext4_find_inline_entry(struct inode *dir,
                                        struct ext4_filename *fname,
                                        struct ext4_dir_entry_2 **res_dir,
                                        int *has_inline_data);
extern int ext4_delete_inline_entry(handle_t *handle,
                                    struct inode *dir,
                                    struct ext4_dir_entry_2 *de_del,
                                    struct buffer_head *bh,
                                    int *has_inline_data);
extern bool empty_inline_dir(struct inode *dir, int *has_inline_data);
/* *parent_de and *retval are output parameters. */
extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode,
                                        struct ext4_dir_entry_2 **parent_de,
                                        int *retval);
extern void *ext4_read_inline_link(struct inode *inode);

/* Forward declaration: only pointers to struct iomap are used here. */
struct iomap;
extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap);

extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline);

extern int ext4_convert_inline_data(struct inode *inode);

/*
 * Returns 1 when the inode has the EXT4_INODE_INLINE_DATA flag set and
 * a non-zero i_inline_off, 0 otherwise.
 */
static inline int ext4_has_inline_data(struct inode *inode)
{
        if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA))
                return 0;

        return EXT4_I(inode)->i_inline_off != 0;
}

/*
 * namei.c
 *
 * Directory inode operation tables and directory-entry helpers.
 */
extern const struct inode_operations ext4_dir_inode_operations;
extern const struct inode_operations ext4_special_inode_operations;
extern struct dentry *ext4_get_parent(struct dentry *child);
/*
 * NOTE(review): presumably writes the "." and ".." entries starting at
 * @de and returns a pointer past them -- confirm in namei.c.
 */
extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
                                 struct ext4_dir_entry_2 *de,
                                 int blocksize, int csum_size,
                                 unsigned int parent_ino, int dotdot_real_len);
extern void ext4_initialize_dirent_tail(struct buffer_head *bh,
                                        unsigned int blocksize);
extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
                                      struct buffer_head *bh);
/* Double-underscore variants of unlink/link taking explicit inodes. */
extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name,
                         struct inode *inode, struct dentry *dentry);
extern int __ext4_link(struct inode *dir, struct inode *inode,
                       struct dentry *dentry);

/*
 * Lookup table from the file-type bits of a mode to the EXT4_FT_* code
 * stored in a directory entry.  The index is (mode & S_IFMT) >> S_SHIFT;
 * slots without an initializer are implicitly zero.
 */
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
        [S_IFREG >> S_SHIFT]        = EXT4_FT_REG_FILE,
        [S_IFDIR >> S_SHIFT]        = EXT4_FT_DIR,
        [S_IFCHR >> S_SHIFT]        = EXT4_FT_CHRDEV,
        [S_IFBLK >> S_SHIFT]        = EXT4_FT_BLKDEV,
        [S_IFIFO >> S_SHIFT]        = EXT4_FT_FIFO,
        [S_IFSOCK >> S_SHIFT]        = EXT4_FT_SOCK,
        [S_IFLNK >> S_SHIFT]        = EXT4_FT_SYMLINK,
};

/*
 * Set de->file_type from the file-type bits of @mode via
 * ext4_type_by_mode[].  Does nothing unless the filesystem has the
 * filetype feature.
 */
static inline void ext4_set_de_type(struct super_block *sb,
                                struct ext4_dir_entry_2 *de,
                                umode_t mode)
{
        if (!ext4_has_feature_filetype(sb))
                return;

        de->file_type = ext4_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
}

/*
 * readpages.c
 *
 * ext4_mpage_readpages() accepts both a readahead control and a folio;
 * NOTE(review): presumably exactly one of them is used per call --
 * confirm.  The init function is __init, so it is for initialization
 * time only.
 */
extern int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct folio *folio);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);

/*
 * symlink.c
 *
 * Three inode_operations tables for symlinks: encrypted, regular and
 * "fast".  NOTE(review): "fast" presumably means the target is stored
 * inside the inode -- confirm in symlink.c.
 */
extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;

/*
 * sysfs.c
 *
 * Per-superblock register/unregister plus the global init/exit pair
 * (ext4_init_sysfs() is __init).
 */
extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi);
extern int ext4_register_sysfs(struct super_block *sb);
extern void ext4_unregister_sysfs(struct super_block *sb);
extern int __init ext4_init_sysfs(void);
extern void ext4_exit_sysfs(void);

/*
 * block_validity
 *
 * System-zone setup/teardown and block range validation.  The range
 * checkers take a start block and a block count.
 */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
extern int __init ext4_init_system_zone(void);
extern void ext4_exit_system_zone(void);
extern int ext4_inode_block_valid(struct inode *inode,
                                  ext4_fsblk_t start_blk,
                                  unsigned int count);
/*
 * Called through the ext4_check_indirect_blockref() and
 * ext4_ind_check_inode() macros, which supply function name and line.
 */
extern int ext4_check_blockref(const char *, unsigned int,
                               struct inode *, __le32 *, unsigned int);
extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode,
                                ext4_fsblk_t start_blk, unsigned int count);


/* extents.c */
/* Forward declarations: only pointers to these types are used below. */
struct ext4_ext_path;
struct ext4_extent;

/*
 * Maximum number of logical blocks in a file; ext4_extent's ee_block is
 * __le32.
 */
#define EXT_MAX_BLOCKS        0xffffffff

extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode);
extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
                               struct ext4_map_blocks *map, int flags);
extern int ext4_ext_truncate(handle_t *, struct inode *);
/* start/end are logical block numbers (ext4_lblk_t). */
extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
                                 ext4_lblk_t end);
extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
                          loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
                                          loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
                                             ext4_io_end_t *io_end);
/* Same argument list as ext4_ext_map_blocks() and ext4_ind_map_blocks(). */
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
                                                   int num,
                                                   struct ext4_ext_path *path);
/*
 * The two functions below take a path and return a path.
 * NOTE(review): failure is presumably reported as an ERR_PTR-encoded
 * pointer -- confirm in extents.c before testing against NULL.
 */
extern struct ext4_ext_path *ext4_ext_insert_extent(
                                handle_t *handle, struct inode *inode,
                                struct ext4_ext_path *path,
                                struct ext4_extent *newext, int gb_flags);
extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t,
                                              struct ext4_ext_path *,
                                              int flags);
extern void ext4_free_ext_path(struct ext4_ext_path *);
extern int ext4_ext_check_inode(struct inode *inode);
extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
extern int ext4_get_es_cache(struct inode *inode,
                             struct fiemap_extent_info *fieinfo,
                             __u64 start, __u64 len);
extern int ext4_ext_precache(struct inode *inode);
/* *err is an output parameter in addition to the int return value. */
extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
                                struct inode *inode2, ext4_lblk_t lblk1,
                             ext4_lblk_t lblk2,  ext4_lblk_t count,
                             int mark_unwritten,int *err);
extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu);
extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode,
                                       int check_cred, int restart_cred,
                                       int revoke_cred);
/* The ext4_ext_replay_* helpers take no journal handle. */
extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end);
extern int ext4_ext_replay_set_iblocks(struct inode *inode);
extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start,
                int len, int unwritten, ext4_fsblk_t pblk);
extern int ext4_ext_clear_bb(struct inode *inode);


/*
 * move_extent.c
 *
 * The double_down/double_up helpers operate on the data semaphores of
 * two inodes at once.  ext4_move_extents() reports progress through
 * *moved_len.
 */
extern void ext4_double_down_write_data_sem(struct inode *first,
                                            struct inode *second);
extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
                                          struct inode *donor_inode);
extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
                             __u64 start_orig, __u64 start_donor,
                             __u64 len, __u64 *moved_len);

/*
 * page-io.c
 *
 * I/O completion (ext4_io_end_t) handling and write submission.
 * NOTE(review): ext4_get_io_end()/ext4_put_io_end() look like a
 * reference-count pair -- confirm in page-io.c.
 */
extern int __init ext4_init_pageio(void);
extern void ext4_exit_pageio(void);
extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
extern int ext4_put_io_end(ext4_io_end_t *io_end);
extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
extern void ext4_io_submit_init(struct ext4_io_submit *io,
                                struct writeback_control *wbc);
/* Has the signature of a workqueue callback (takes a work_struct). */
extern void ext4_end_io_rsv_work(struct work_struct *work);
extern void ext4_io_submit(struct ext4_io_submit *io);
/* Note: the folio parameter is named "page" in this prototype. */
int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page,
                size_t len);
extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end);
extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end);

/* mmp.c */
extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);

/* mmp.c */
extern void ext4_stop_mmpd(struct ext4_sb_info *sbi);

/* verity.c */
extern const struct fsverity_operations ext4_verityops;

/* orphan.c */
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern void ext4_orphan_cleanup(struct super_block *sb,
                                struct ext4_super_block *es);
extern void ext4_release_orphan_info(struct super_block *sb);
extern int ext4_init_orphan_info(struct super_block *sb);
extern int ext4_orphan_file_empty(struct super_block *sb);
extern void ext4_orphan_file_block_trigger(
                                struct jbd2_buffer_trigger_type *triggers,
                                struct buffer_head *bh,
                                void *data, size_t size);

/*
 * Buffer-head state bit used to test whether block and inode bitmaps are
 * properly initialized. With uninit_bg, reading the block from disk is not
 * enough to mark the bitmap uptodate; the bitmap also needs to be zeroed
 * out. The bit is an alias for BH_JBDPrivateStart.
 */
#define BH_BITMAP_UPTODATE BH_JBDPrivateStart

/*
 * Return 1 when @bh is uptodate as a buffer and also carries the
 * BH_BITMAP_UPTODATE state bit, 0 otherwise.
 */
static inline int bitmap_uptodate(struct buffer_head *bh)
{
        if (!buffer_uptodate(bh))
                return 0;

        return test_bit(BH_BITMAP_UPTODATE, &bh->b_state) ? 1 : 0;
}
/* Set the BH_BITMAP_UPTODATE state bit on @bh. */
static inline void set_bitmap_uptodate(struct buffer_head *bh)
{
        set_bit(BH_BITMAP_UPTODATE, &bh->b_state);
}

extern int ext4_resize_begin(struct super_block *sb);
extern int ext4_resize_end(struct super_block *sb, bool update_backups);

/*
 * Set EXT4_IO_END_UNWRITTEN in @io_end->flag. The field is only written
 * when the bit is not already set.
 */
static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end)
{
        if (io_end->flag & EXT4_IO_END_UNWRITTEN)
                return;

        io_end->flag |= EXT4_IO_END_UNWRITTEN;
}

/*
 * Clear EXT4_IO_END_UNWRITTEN in @io_end->flag. The field is only written
 * when the bit is currently set.
 */
static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
{
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN))
                return;

        io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
}

extern const struct iomap_ops ext4_iomap_ops;
extern const struct iomap_ops ext4_iomap_overwrite_ops;
extern const struct iomap_ops ext4_iomap_report_ops;

/*
 * Report whether @bh can be treated as uptodate.
 *
 * Side effect: a buffer that has the write error flag is marked uptodate
 * before the check.
 */
static inline int ext4_buffer_uptodate(struct buffer_head *bh)
{
        const int write_failed = buffer_write_io_error(bh);

        /*
         * The write error flag means an earlier attempt to write this
         * block out failed.  Re-reading the block is not wanted in that
         * case, because the read may succeed and hand back the old
         * on-disk data; mark the buffer uptodate instead.
         */
        if (write_failed)
                set_buffer_uptodate(bh);

        return buffer_uptodate(bh);
}

/*
 * Return true when @inode is a regular file and its superblock info has a
 * positive s_awu_min.
 */
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
        if (!S_ISREG(inode->i_mode))
                return false;

        return EXT4_SB(inode->i_sb)->s_awu_min > 0;
}

extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
                                  loff_t pos, unsigned len,
                                  get_block_t *get_block);
#endif        /* __KERNEL__ */

#define EFSBADCRC        EBADMSG                /* Bad CRC detected */
#define EFSCORRUPTED        EUCLEAN                /* Filesystem is corrupted */

#endif        /* _EXT4_H */






























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_NULLS_H
#define _LINUX_RCULIST_NULLS_H

#ifdef __KERNEL__

/*
 * RCU-protected list version
 */
#include <linux/list_nulls.h>
#include <linux/rcupdate.h>

/**
 * hlist_nulls_del_init_rcu - deletes entry from hash list with re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on the node returns true after this. That is
 * useful for RCU based lockfree read traversal when the writer side must
 * know whether the list entry is still hashed or already unhashed.
 *
 * In particular, it means that the forward pointers cannot be poisoned,
 * since they may still be used for walking the hash list; only the pprev
 * pointer is zeroed, so that hlist_nulls_unhashed() returns true afterwards.
 *
 * A node that is already unhashed is left untouched.
 *
 * The caller must take whatever precautions are necessary (such as
 * holding appropriate locks) to avoid racing with another
 * list-mutation primitive, such as hlist_nulls_add_head_rcu() or
 * hlist_nulls_del_rcu(), running on this same list.  However, it is
 * perfectly legal to run concurrently with the _rcu list-traversal
 * primitives, such as hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
{
        if (hlist_nulls_unhashed(n))
                return;

        __hlist_nulls_del(n);
        WRITE_ONCE(n->pprev, NULL);
}

/**
 * hlist_nulls_first_rcu - returns the first element of the hash list.
 * @head: the head of the list.
 *
 * Expands to an lvalue: the address of @head->first is cast to
 * "struct hlist_nulls_node __rcu __force **" and dereferenced, so the
 * result can be handed to rcu_dereference_raw() or rcu_assign_pointer().
 */
#define hlist_nulls_first_rcu(head) \
        (*((struct hlist_nulls_node __rcu __force **)&(head)->first))

/**
 * hlist_nulls_next_rcu - returns the element of the list after @node.
 * @node: element of the list.
 *
 * Expands to an lvalue: the address of @node->next is cast to
 * "struct hlist_nulls_node __rcu __force **" and dereferenced, so the
 * result can be handed to rcu_dereference_raw() or rcu_assign_pointer().
 */
#define hlist_nulls_next_rcu(node) \
        (*((struct hlist_nulls_node __rcu __force **)&(node)->next))

/**
 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_nulls_unhashed() on the entry does not return true after this;
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu().
 */
static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n)
{
        __hlist_nulls_del(n);
        /* Only the back pointer is poisoned here. */
        WRITE_ONCE(n->pprev, LIST_POISON2);
}

/**
 * hlist_nulls_add_head_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the head of the specified hlist_nulls,
 * while permitting racing traversals.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
                                        struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *first = h->first;

        /* Fill in both links of @n before it becomes reachable from @h. */
        WRITE_ONCE(n->next, first);
        WRITE_ONCE(n->pprev, &h->first);
        /* Publish @n as the new first element. */
        rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
        /* A real old first node (not a nulls marker) now hangs off @n. */
        if (!is_a_nulls(first))
                WRITE_ONCE(first->pprev, &n->next);
}

/**
 * hlist_nulls_add_tail_rcu
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Adds the specified element to the tail of the specified hlist_nulls,
 * while permitting racing traversals.  The chain is walked from @h->first
 * to find the last real node; if there is none, the element is added with
 * hlist_nulls_add_head_rcu() instead.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_nulls_add_head_rcu()
 * or hlist_nulls_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n,
                                            struct hlist_nulls_head *h)
{
        struct hlist_nulls_node *i, *last = NULL;

        /* Note: write side code, so rcu accessors are not needed. */
        for (i = h->first; !is_a_nulls(i); i = i->next)
                last = i;

        if (last) {
                /* @n inherits the nulls marker that terminated the chain. */
                WRITE_ONCE(n->next, last->next);
                /*
                 * Marked store, like every other ->pprev update in this
                 * file (add_head, del, del_init).
                 */
                WRITE_ONCE(n->pprev, &last->next);
                /* Publish @n after both of its links are set. */
                rcu_assign_pointer(hlist_nulls_next_rcu(last), n);
        } else {
                hlist_nulls_add_head_rcu(n, h);
        }
}

/*
 * Make @n look like the only member of a list: next becomes the nulls
 * marker for NULL and pprev points at the node's own next field.
 * After that hlist_nulls_del will work.
 */
static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
{
        struct hlist_nulls_node **self_link = &n->next;

        n->next = (struct hlist_nulls_node *)NULLS_MARKER(NULL);
        n->pprev = self_link;
}

/**
 * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * The loop stops when @pos is a nulls marker; @pos still holds that marker
 * after the loop.
 *
 * The barrier() is needed to make sure compiler doesn't cache first element [1],
 * as this loop can be restarted [2]
 * [1] Documentation/memory-barriers.txt around line 1533
 * [2] Documentation/RCU/rculist_nulls.rst around line 146
 */
#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)                        \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))

/**
 * hlist_nulls_for_each_entry_safe -
 *   iterate over list of given type safe against removal of list entry
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_nulls_node to use as a loop cursor.
 * @head:        the head of the list.
 * @member:        the name of the hlist_nulls_node within the struct.
 *
 * @pos is advanced to the following node inside the loop condition, before
 * the loop body runs, so during the body @pos refers to the node after
 * @tpos rather than to @tpos itself.
 */
#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member)                \
        for (({barrier();}),                                                        \
             pos = rcu_dereference_raw(hlist_nulls_first_rcu(head));                \
                (!is_a_nulls(pos)) &&                                                \
                ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member);        \
                   pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });)
#endif
#endif





































































































   46 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/pagevec.h
 *
 * In many places it is efficient to batch an operation up against multiple
 * folios.  A folio_batch is a container which is used for that.
 */

#ifndef _LINUX_PAGEVEC_H
#define _LINUX_PAGEVEC_H

#include <linux/types.h>

/* 31 pointers + header align the folio_batch structure to a power of two */
#define PAGEVEC_SIZE        31

struct folio;

/**
 * struct folio_batch - A collection of folios.
 * @nr: Number of folios currently stored in @folios.
 * @i: Index of the next folio that folio_batch_next() will return.
 * @percpu_pvec_drained: Set to false by folio_batch_init(); not otherwise
 *        used in this header (NOTE(review): meaning is defined by its
 *        users - confirm there).
 * @folios: Storage for up to PAGEVEC_SIZE folio pointers.
 *
 * The folio_batch is used to amortise the cost of retrieving and
 * operating on a set of folios.  The order of folios in the batch may be
 * significant (eg delete_from_page_cache_batch()).  Some users of the
 * folio_batch store "exceptional" entries in it which can be removed
 * by calling folio_batch_remove_exceptionals().
 */
struct folio_batch {
        unsigned char nr;
        unsigned char i;
        bool percpu_pvec_drained;
        struct folio *folios[PAGEVEC_SIZE];
};

/**
 * folio_batch_init() - Initialise a batch of folios
 * @fbatch: The folio batch.
 *
 * After this call the batch holds zero folios, its read cursor is at the
 * start and percpu_pvec_drained is false.
 */
static inline void folio_batch_init(struct folio_batch *fbatch)
{
        fbatch->percpu_pvec_drained = false;
        fbatch->i = 0;
        fbatch->nr = 0;
}

/**
 * folio_batch_reinit() - Empty a batch of folios for reuse
 * @fbatch: The folio batch.
 *
 * Resets the folio count and the read cursor to zero.  Unlike
 * folio_batch_init(), percpu_pvec_drained is left unchanged.
 */
static inline void folio_batch_reinit(struct folio_batch *fbatch)
{
        fbatch->i = 0;
        fbatch->nr = 0;
}

/**
 * folio_batch_count() - Number of folios held in a batch
 * @fbatch: The folio batch.
 *
 * Return: The number of folios currently stored in the batch.
 */
static inline unsigned int folio_batch_count(struct folio_batch *fbatch)
{
        unsigned int count = fbatch->nr;

        return count;
}

/**
 * folio_batch_space() - Free slots left in a batch
 * @fbatch: The folio batch.
 *
 * Return: PAGEVEC_SIZE minus the number of folios already stored.
 */
static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
{
        unsigned int free_slots = PAGEVEC_SIZE - fbatch->nr;

        return free_slots;
}

/**
 * folio_batch_add() - Add a folio to a batch.
 * @fbatch: The folio batch.
 * @folio: The folio to add.
 *
 * The folio is stored in the slot after the last used one and the count is
 * incremented.  No capacity check is made here.
 * The batch must have previously been initialised using folio_batch_init().
 *
 * Return: The number of slots still available.
 */
static inline unsigned folio_batch_add(struct folio_batch *fbatch,
                struct folio *folio)
{
        unsigned char slot = fbatch->nr;

        fbatch->folios[slot] = folio;
        fbatch->nr = slot + 1;

        return folio_batch_space(fbatch);
}

/**
 * folio_batch_next - Return the next folio to process.
 * @fbatch: The folio batch being processed.
 *
 * Use this function to implement a queue of folios.  Each successful call
 * advances the batch's read cursor by one.
 *
 * Return: The next folio in the queue, or NULL if the queue is empty.
 */
static inline struct folio *folio_batch_next(struct folio_batch *fbatch)
{
        if (fbatch->i != fbatch->nr)
                return fbatch->folios[fbatch->i++];

        return NULL;
}

void __folio_batch_release(struct folio_batch *pvec);

/**
 * folio_batch_release() - Release the folios held in a batch
 * @fbatch: The folio batch.
 *
 * Calls __folio_batch_release() unless the batch is empty.
 */
static inline void folio_batch_release(struct folio_batch *fbatch)
{
        if (!folio_batch_count(fbatch))
                return;

        __folio_batch_release(fbatch);
}

void folio_batch_remove_exceptionals(struct folio_batch *fbatch);
#endif /* _LINUX_PAGEVEC_H */

























































 1273 




    2 
 1255 














   44 
   24 
   69 


 1198 

   69 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Wrapper functions for accessing the file_struct fd array.
 */

#ifndef __LINUX_FILE_H
#define __LINUX_FILE_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/posix_types.h>
#include <linux/errno.h>
#include <linux/cleanup.h>
#include <linux/err.h>

struct file;

extern void fput(struct file *);

struct file_operations;
struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
struct path;
extern struct file *alloc_file_pseudo(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_pseudo_noaccount(struct inode *, struct vfsmount *,
        const char *, int flags, const struct file_operations *);
extern struct file *alloc_file_clone(struct file *, int flags,
        const struct file_operations *);

/*
 * Either a reference to a struct file plus flags (cloned vs. borrowed,
 * pos locked), with the flags stored in the lower bits of the value,
 * or empty (represented by 0).
 *
 * Use fd_file() to get the pointer and fd_empty() to test for the empty
 * value rather than reading @word directly.
 */
struct fd {
        unsigned long word;
};
/* Flag bits kept in the low bits of struct fd's word. */
#define FDPUT_FPUT       1        /* fdput() must call fput() on the file */
#define FDPUT_POS_UNLOCK 2        /* fdput_pos() must call __f_unlock_pos() */

/* The struct file pointer held in @f, with both flag bits masked off. */
#define fd_file(f) ((struct file *)((f).word & ~(FDPUT_FPUT|FDPUT_POS_UNLOCK)))
/* Return true when @f is the empty value, i.e. its word is 0. */
static inline bool fd_empty(struct fd f)
{
        return unlikely(f.word == 0);
}

/* The empty struct fd (word == 0); fd_empty() is true for it. */
#define EMPTY_FD (struct fd){0}
static inline struct fd BORROWED_FD(struct file *f)
{
        return (struct fd){(unsigned long)f};
}
static inline struct fd CLONED_FD(struct file *f)
{
        return (struct fd){(unsigned long)f | FDPUT_FPUT};
}

/*
 * Release @fd: calls fput() on the file when FDPUT_FPUT is set in the
 * word, and does nothing otherwise.
 */
static inline void fdput(struct fd fd)
{
        if (!(fd.word & FDPUT_FPUT))
                return;

        fput(fd_file(fd));
}

extern struct file *fget(unsigned int fd);
extern struct file *fget_raw(unsigned int fd);
extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern struct file *fget_task_next(struct task_struct *task, unsigned int *fd);
extern void __f_unlock_pos(struct file *);

struct fd fdget(unsigned int fd);
struct fd fdget_raw(unsigned int fd);
struct fd fdget_pos(unsigned int fd);

/*
 * Release @f: when FDPUT_POS_UNLOCK is set, __f_unlock_pos() is called on
 * the file first; fdput() is then called in either case.
 */
static inline void fdput_pos(struct fd f)
{
        const bool pos_locked = f.word & FDPUT_POS_UNLOCK;

        if (pos_locked)
                __f_unlock_pos(fd_file(f));

        fdput(f);
}

/*
 * Classes wrapping struct fd: each pairs an fdget*() initializer taking
 * "int fd" with the matching fdput*() release expression.
 * NOTE(review): the generated helpers come from DEFINE_CLASS() in
 * <linux/cleanup.h> - confirm the exact expansion there.
 */
DEFINE_CLASS(fd, struct fd, fdput(_T), fdget(fd), int fd)
DEFINE_CLASS(fd_raw, struct fd, fdput(_T), fdget_raw(fd), int fd)
DEFINE_CLASS(fd_pos, struct fd, fdput_pos(_T), fdget_pos(fd), int fd)

extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
extern void set_close_on_exec(unsigned int fd, int flag);
extern bool get_close_on_exec(unsigned int fd);
extern int __get_unused_fd_flags(unsigned flags, unsigned long nofile);
extern int get_unused_fd_flags(unsigned flags);
extern void put_unused_fd(unsigned int fd);

/*
 * get_unused_fd: initialized with get_unused_fd_flags(flags); the release
 * expression calls put_unused_fd() only when the held value is >= 0.
 * fput: release expression for a struct file pointer; fput() is called
 * only when the pointer is neither NULL nor an error pointer.
 */
DEFINE_CLASS(get_unused_fd, int, if (_T >= 0) put_unused_fd(_T),
             get_unused_fd_flags(flags), unsigned flags)
DEFINE_FREE(fput, struct file *, if (!IS_ERR_OR_NULL(_T)) fput(_T))

/*
 * take_fd() will take care to set @fd to -EBADF, ensuring that
 * CLASS(get_unused_fd) won't call put_unused_fd(). This makes it
 * easier to rely on CLASS(get_unused_fd):
 *
 * struct file *f;
 *
 * CLASS(get_unused_fd, fd)(O_CLOEXEC);
 * if (fd < 0)
 *         return fd;
 *
 * f = dentry_open(&path, O_RDONLY, current_cred());
 * if (IS_ERR(f))
 *         return PTR_ERR(f);
 *
 * fd_install(fd, f);
 * return take_fd(fd);
 *
 * NOTE(review): presumably evaluates to the value @fd held before it was
 * overwritten (see __get_and_null()) - confirm.
 */
#define take_fd(fd) __get_and_null(fd, -EBADF)

extern void fd_install(unsigned int fd, struct file *file);

int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags);

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags);

extern void flush_delayed_fput(void);
extern void __fput_sync(struct file *);

extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;

#endif /* __LINUX_FILE_H */









































































































































































































































































  419 





















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _LINUX_MEMBLOCK_H
#define _LINUX_MEMBLOCK_H

/*
 * Logical memory blocks.
 *
 * Copyright (C) 2001 Peter Bergner, IBM Corp.
 */

#include <linux/init.h>
#include <linux/mm.h>
#include <asm/dma.h>

extern unsigned long max_low_pfn;
extern unsigned long min_low_pfn;

/*
 * highest page
 */
extern unsigned long max_pfn;
/*
 * highest possible page
 */
extern unsigned long long max_possible_pfn;

/**
 * enum memblock_flags - definition of memory region attributes
 * @MEMBLOCK_NONE: no special request
 * @MEMBLOCK_HOTPLUG: memory region indicated in the firmware-provided memory
 * map during early boot as hot(un)pluggable system RAM (e.g., memory range
 * that might get hotunplugged later). With "movable_node" set on the kernel
 * commandline, try keeping this memory region hotunpluggable. Does not apply
 * to memblocks added ("hotplugged") after early boot.
 * @MEMBLOCK_MIRROR: mirrored region
 * @MEMBLOCK_NOMAP: don't add to kernel direct mapping and treat as
 * reserved in the memory map; refer to memblock_mark_nomap() description
 * for further details
 * @MEMBLOCK_DRIVER_MANAGED: memory region that is always detected and added
 * via a driver, and never indicated in the firmware-provided memory map as
 * system RAM. This corresponds to IORESOURCE_SYSRAM_DRIVER_MANAGED in the
 * kernel resource tree.
 * @MEMBLOCK_RSRV_NOINIT: memory region for which struct pages are
 * not initialized (only for reserved regions).
 */
enum memblock_flags {
        MEMBLOCK_NONE           = 0,            /* no attribute bits set */
        MEMBLOCK_HOTPLUG        = 1 << 0,       /* hotpluggable region */
        MEMBLOCK_MIRROR         = 1 << 1,       /* mirrored region */
        MEMBLOCK_NOMAP          = 1 << 2,       /* kept out of the kernel direct mapping */
        MEMBLOCK_DRIVER_MANAGED = 1 << 3,       /* always detected via a driver */
        MEMBLOCK_RSRV_NOINIT    = 1 << 4,       /* struct pages are not initialized */
};

/**
 * struct memblock_region - represents a memory region
 * @base: base address of the region
 * @size: size of the region
 * @flags: memory region attributes (see &enum memblock_flags)
 * @nid: NUMA node id; the field only exists when CONFIG_NUMA is enabled
 */
struct memblock_region {
        phys_addr_t base;
        phys_addr_t size;
        enum memblock_flags flags;
#ifdef CONFIG_NUMA
        int nid;
#endif
};

/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions (&struct memblock_region entries)
 * @name: the memory type symbolic name
 */
struct memblock_type {
        unsigned long cnt;
        unsigned long max;
        phys_addr_t total_size;
        struct memblock_region *regions;
        char *name;
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 *
 * A single instance named "memblock" is declared extern right after this
 * definition.
 */
struct memblock {
        bool bottom_up;  /* is bottom up direction? */
        phys_addr_t current_limit;
        struct memblock_type memory;
        struct memblock_type reserved;
};

extern struct memblock memblock;

/*
 * Without CONFIG_ARCH_KEEP_MEMBLOCK the memblock section annotations map
 * to __meminit/__meminitdata and memblock_discard() is a real function.
 * With it, the annotations expand to nothing and memblock_discard() is an
 * empty inline stub.
 */
#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
#define __init_memblock __meminit
#define __initdata_memblock __meminitdata
void memblock_discard(void);
#else
#define __init_memblock
#define __initdata_memblock
static inline void memblock_discard(void) {}
#endif

void memblock_allow_resize(void);
int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid,
                      enum memblock_flags flags);
int memblock_add(phys_addr_t base, phys_addr_t size);
int memblock_remove(phys_addr_t base, phys_addr_t size);
int memblock_phys_free(phys_addr_t base, phys_addr_t size);
int memblock_reserve(phys_addr_t base, phys_addr_t size);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
int memblock_physmem_add(phys_addr_t base, phys_addr_t size);
#endif
void memblock_trim_memory(phys_addr_t align);
unsigned long memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
                                     phys_addr_t base2, phys_addr_t size2);
bool memblock_overlaps_region(struct memblock_type *type,
                              phys_addr_t base, phys_addr_t size);
bool memblock_validate_numa_coverage(unsigned long threshold_bytes);
int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size);
int memblock_mark_mirror(phys_addr_t base, phys_addr_t size);
int memblock_mark_nomap(phys_addr_t base, phys_addr_t size);
int memblock_clear_nomap(phys_addr_t base, phys_addr_t size);
int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size);

void memblock_free(void *ptr, size_t size);
void reset_all_zones_managed_pages(void);

/* Low level functions */
void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
                      struct memblock_type *type_a,
                      struct memblock_type *type_b, phys_addr_t *out_start,
                      phys_addr_t *out_end, int *out_nid);

void __next_mem_range_rev(u64 *idx, int nid, enum memblock_flags flags,
                          struct memblock_type *type_a,
                          struct memblock_type *type_b, phys_addr_t *out_start,
                          phys_addr_t *out_end, int *out_nid);

void memblock_free_late(phys_addr_t base, phys_addr_t size);

#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
/*
 * Step the physmem iterator: forwards to __next_mem_range() with the
 * "physmem" memblock type as the type to walk and @type as the type to
 * exclude, using NUMA_NO_NODE, MEMBLOCK_NONE and no nid output.
 */
static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
                                        phys_addr_t *out_start,
                                        phys_addr_t *out_end)
{
        extern struct memblock_type physmem;
        struct memblock_type *walked = &physmem;

        __next_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE,
                         walked, type,
                         out_start, out_end, NULL);
}

/**
 * for_each_physmem_range - iterate through physmem areas not included in type.
 * @i: u64 used as loop variable
 * @type: ptr to memblock_type which excludes from the iteration, can be %NULL
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 *
 * The loop ends when @i equals (u64)ULLONG_MAX.
 */
#define for_each_physmem_range(i, type, p_start, p_end)                        \
        for (i = 0, __next_physmem_range(&i, type, p_start, p_end);        \
             i != (u64)ULLONG_MAX;                                        \
             __next_physmem_range(&i, type, p_start, p_end))
#endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */

/**
 * __for_each_mem_range - iterate through memblock areas from type_a and not
 * included in type_b. Or just type_a if type_b is NULL.
 * @i: u64 used as loop variable
 * @type_a: ptr to memblock_type to iterate
 * @type_b: ptr to memblock_type which excludes from the iteration
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * @i starts at 0 and the loop ends when it equals (u64)ULLONG_MAX.
 */
#define __for_each_mem_range(i, type_a, type_b, nid, flags,                \
                           p_start, p_end, p_nid)                        \
        for (i = 0, __next_mem_range(&i, nid, flags, type_a, type_b,        \
                                     p_start, p_end, p_nid);                \
             i != (u64)ULLONG_MAX;                                        \
             __next_mem_range(&i, nid, flags, type_a, type_b,                \
                              p_start, p_end, p_nid))

/**
 * __for_each_mem_range_rev - reverse iterate through memblock areas from
 * type_a and not included in type_b. Or just type_a if type_b is NULL.
 * @i: u64 used as loop variable
 * @type_a: ptr to memblock_type to iterate
 * @type_b: ptr to memblock_type which excludes from the iteration
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * @i is seeded with (u64)ULLONG_MAX before the first step, and the same
 * value marks the end of the iteration.
 */
#define __for_each_mem_range_rev(i, type_a, type_b, nid, flags,                \
                                 p_start, p_end, p_nid)                        \
        for (i = (u64)ULLONG_MAX,                                        \
                     __next_mem_range_rev(&i, nid, flags, type_a, type_b, \
                                          p_start, p_end, p_nid);        \
             i != (u64)ULLONG_MAX;                                        \
             __next_mem_range_rev(&i, nid, flags, type_a, type_b,        \
                                  p_start, p_end, p_nid))

/**
 * for_each_mem_range - iterate through memory areas.
 * @i: u64 used as loop variable
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 *
 * Walks &memblock.memory with no excluded type, for all nodes, passing
 * MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED as the flags argument; the
 * node id of each range is not reported.
 */
#define for_each_mem_range(i, p_start, p_end) \
        __for_each_mem_range(i, &memblock.memory, NULL, NUMA_NO_NODE,        \
                             MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED, \
                             p_start, p_end, NULL)

/**
 * for_each_mem_range_rev - reverse iterate through memory areas.
 * @i: u64 used as loop variable
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 *
 * Reverse counterpart of for_each_mem_range(): walks &memblock.memory with
 * no excluded type, for all nodes, passing
 * MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED as the flags argument.
 */
#define for_each_mem_range_rev(i, p_start, p_end)                        \
        __for_each_mem_range_rev(i, &memblock.memory, NULL, NUMA_NO_NODE, \
                                 MEMBLOCK_HOTPLUG | MEMBLOCK_DRIVER_MANAGED,\
                                 p_start, p_end, NULL)

/**
 * for_each_reserved_mem_range - iterate over all reserved memblock areas
 * @i: u64 used as loop variable
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 *
 * Walks over reserved areas of memblock. Available as soon as memblock
 * is initialized.
 *
 * Expands to __for_each_mem_range() over &memblock.reserved with no
 * exclusion type (%NULL), %NUMA_NO_NODE as the node selector and
 * %MEMBLOCK_NONE as the flags argument. The node id of each range is not
 * reported (%NULL is passed for it).
 */
#define for_each_reserved_mem_range(i, p_start, p_end)                        \
        __for_each_mem_range(i, &memblock.reserved, NULL, NUMA_NO_NODE,        \
                             MEMBLOCK_NONE, p_start, p_end, NULL)

/* True when region @m has the MEMBLOCK_HOTPLUG bit set in its flags. */
static inline bool memblock_is_hotpluggable(struct memblock_region *m)
{
        return (m->flags & MEMBLOCK_HOTPLUG) != 0;
}

/* True when region @m has the MEMBLOCK_MIRROR bit set in its flags. */
static inline bool memblock_is_mirror(struct memblock_region *m)
{
        return (m->flags & MEMBLOCK_MIRROR) != 0;
}

/* True when region @m has the MEMBLOCK_NOMAP bit set in its flags. */
static inline bool memblock_is_nomap(struct memblock_region *m)
{
        return (m->flags & MEMBLOCK_NOMAP) != 0;
}

/* True when region @m has the MEMBLOCK_RSRV_NOINIT bit set in its flags. */
static inline bool memblock_is_reserved_noinit(struct memblock_region *m)
{
        return (m->flags & MEMBLOCK_RSRV_NOINIT) != 0;
}

/* True when region @m has the MEMBLOCK_DRIVER_MANAGED bit set in its flags. */
static inline bool memblock_is_driver_managed(struct memblock_region *m)
{
        return (m->flags & MEMBLOCK_DRIVER_MANAGED) != 0;
}

int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn,
                            unsigned long  *end_pfn);
void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,
                          unsigned long *out_end_pfn, int *out_nid);

/**
 * for_each_mem_pfn_range - early memory pfn range iterator
 * @i: an integer used as loop variable
 * @nid: node selector, %MAX_NUMNODES for all nodes
 * @p_start: ptr to ulong for start pfn of the range, can be %NULL
 * @p_end: ptr to ulong for end pfn of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * Walks over configured memory ranges.
 */
#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid)                \
        for ((i) = -1, /* -1: iteration not started */                        \
             __next_mem_pfn_range(&(i), nid, p_start, p_end, p_nid);        \
             (i) >= 0; /* a negative index ends the walk */                \
             __next_mem_pfn_range(&(i), nid, p_start, p_end, p_nid))

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
                                  unsigned long *out_spfn,
                                  unsigned long *out_epfn);

/**
 * for_each_free_mem_pfn_range_in_zone_from - iterate through zone specific
 * free memblock areas from a given point
 * @i: u64 used as loop variable
 * @zone: zone in which all of the memory blocks reside
 * @p_start: ptr to ulong for start pfn of the range, can be %NULL
 * @p_end: ptr to ulong for end pfn of the range, can be %NULL
 *
 * Walks over free (memory && !reserved) areas of memblock in a specific
 * zone, continuing from current position. Available as soon as memblock is
 * initialized.
 *
 * The loop has no initialisation clause: @i must already hold the position
 * to continue from. Iteration stops once @i equals %U64_MAX.
 */
#define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
        for (; i != U64_MAX;                                          \
             __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))

#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */

/**
 * for_each_free_mem_range - iterate through free memblock areas
 * @i: u64 used as loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * Walks over free (memory && !reserved) areas of memblock.  Available as
 * soon as memblock is initialized.
 *
 * Expands to __for_each_mem_range() with &memblock.memory as the iterated
 * type and &memblock.reserved as the excluded type; all other arguments
 * are forwarded unchanged.
 */
#define for_each_free_mem_range(i, nid, flags, p_start, p_end, p_nid)        \
        __for_each_mem_range(i, &memblock.memory, &memblock.reserved,        \
                             nid, flags, p_start, p_end, p_nid)

/**
 * for_each_free_mem_range_reverse - rev-iterate through free memblock areas
 * @i: u64 used as loop variable
 * @nid: node selector, %NUMA_NO_NODE for all nodes
 * @flags: pick from blocks based on memory attributes
 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
 * @p_nid: ptr to int for nid of the range, can be %NULL
 *
 * Walks over free (memory && !reserved) areas of memblock in reverse
 * order.  Available as soon as memblock is initialized.
 *
 * Expands to __for_each_mem_range_rev() with &memblock.memory as the
 * iterated type and &memblock.reserved as the excluded type; all other
 * arguments are forwarded unchanged.
 */
#define for_each_free_mem_range_reverse(i, nid, flags, p_start, p_end,        \
                                        p_nid)                                \
        __for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
                                 nid, flags, p_start, p_end, p_nid)

int memblock_set_node(phys_addr_t base, phys_addr_t size,
                      struct memblock_type *type, int nid);

#ifdef CONFIG_NUMA
/* Store @nid as the node id of region @r. */
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
        r->nid = nid;
}

/* Return the node id stored in region @r. */
static inline int memblock_get_region_node(const struct memblock_region *r)
{
        return r->nid;
}
#else
/* Without CONFIG_NUMA this is a no-op; @r and @nid are ignored. */
static inline void memblock_set_region_node(struct memblock_region *r, int nid)
{
}

/* Without CONFIG_NUMA every region reports node 0. */
static inline int memblock_get_region_node(const struct memblock_region *r)
{
        return 0;
}
#endif /* CONFIG_NUMA */

/* Flags for memblock allocation APIs */
#define MEMBLOCK_ALLOC_ANYWHERE        (~(phys_addr_t)0)
#define MEMBLOCK_ALLOC_ACCESSIBLE        0
/*
 *  MEMBLOCK_ALLOC_NOLEAKTRACE avoids kmemleak tracing. It implies
 *  MEMBLOCK_ALLOC_ACCESSIBLE
 */
#define MEMBLOCK_ALLOC_NOLEAKTRACE        1

/* We are using top down, so it is safe to use 0 here */
#define MEMBLOCK_LOW_LIMIT 0

#ifndef ARCH_LOW_ADDRESS_LIMIT
#define ARCH_LOW_ADDRESS_LIMIT  0xffffffffUL
#endif

phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
                                      phys_addr_t start, phys_addr_t end);
phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
                                      phys_addr_t align, phys_addr_t start,
                                      phys_addr_t end, int nid, bool exact_nid);
phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);

/*
 * Allocate through memblock_phys_alloc_range() with a start of 0 and
 * MEMBLOCK_ALLOC_ACCESSIBLE as the end argument.
 */
static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
                                                       phys_addr_t align)
{
        const phys_addr_t start = 0;
        const phys_addr_t end = MEMBLOCK_ALLOC_ACCESSIBLE;

        return memblock_phys_alloc_range(size, align, start, end);
}

void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
                                 phys_addr_t min_addr, phys_addr_t max_addr,
                                 int nid);
void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
                                 phys_addr_t min_addr, phys_addr_t max_addr,
                                 int nid);
void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
                             phys_addr_t min_addr, phys_addr_t max_addr,
                             int nid);

/*
 * Allocate through memblock_alloc_try_nid() with MEMBLOCK_LOW_LIMIT as the
 * minimum address, MEMBLOCK_ALLOC_ACCESSIBLE as the maximum address and
 * NUMA_NO_NODE as the node.
 */
static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align)
{
        const phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
        const phys_addr_t max_addr = MEMBLOCK_ALLOC_ACCESSIBLE;

        return memblock_alloc_try_nid(size, align, min_addr, max_addr,
                                      NUMA_NO_NODE);
}

/*
 * Declared here, defined elsewhere. @func receives the name of the
 * requesting function (see the wrapper macro below).
 * NOTE(review): presumably panics when the allocation fails, as the name
 * suggests -- confirm against the definition.
 */
void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
                                const char *func);

/* Calls __memblock_alloc_or_panic() with the caller's __func__ as @func. */
#define memblock_alloc_or_panic(size, align)    \
         __memblock_alloc_or_panic(size, align, __func__)

/*
 * Allocate through memblock_alloc_try_nid_raw() with MEMBLOCK_LOW_LIMIT as
 * the minimum address, MEMBLOCK_ALLOC_ACCESSIBLE as the maximum address and
 * NUMA_NO_NODE as the node.
 */
static inline void *memblock_alloc_raw(phys_addr_t size,
                                       phys_addr_t align)
{
        const phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
        const phys_addr_t max_addr = MEMBLOCK_ALLOC_ACCESSIBLE;

        return memblock_alloc_try_nid_raw(size, align, min_addr, max_addr,
                                          NUMA_NO_NODE);
}

/*
 * Allocate through memblock_alloc_try_nid() starting the search at the
 * caller-supplied @min_addr, with MEMBLOCK_ALLOC_ACCESSIBLE as the maximum
 * address and NUMA_NO_NODE as the node.
 */
static inline void *memblock_alloc_from(phys_addr_t size,
                                        phys_addr_t align,
                                        phys_addr_t min_addr)
{
        const phys_addr_t max_addr = MEMBLOCK_ALLOC_ACCESSIBLE;

        return memblock_alloc_try_nid(size, align, min_addr, max_addr,
                                      NUMA_NO_NODE);
}

/*
 * Allocate through memblock_alloc_try_nid() with MEMBLOCK_LOW_LIMIT as the
 * minimum address, ARCH_LOW_ADDRESS_LIMIT as the maximum address and
 * NUMA_NO_NODE as the node.
 */
static inline void *memblock_alloc_low(phys_addr_t size,
                                       phys_addr_t align)
{
        const phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
        const phys_addr_t max_addr = ARCH_LOW_ADDRESS_LIMIT;

        return memblock_alloc_try_nid(size, align, min_addr, max_addr,
                                      NUMA_NO_NODE);
}

/*
 * Allocate through memblock_alloc_try_nid() on the caller-supplied @nid,
 * with MEMBLOCK_LOW_LIMIT as the minimum address and
 * MEMBLOCK_ALLOC_ACCESSIBLE as the maximum address.
 */
static inline void *memblock_alloc_node(phys_addr_t size,
                                        phys_addr_t align, int nid)
{
        const phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
        const phys_addr_t max_addr = MEMBLOCK_ALLOC_ACCESSIBLE;

        return memblock_alloc_try_nid(size, align, min_addr, max_addr, nid);
}

/*
 * Set the allocation direction to bottom-up or top-down.
 * @enable: true selects bottom-up, false selects top-down.
 *
 * Stores @enable in memblock.bottom_up.
 */
static inline __init_memblock void memblock_set_bottom_up(bool enable)
{
        memblock.bottom_up = enable;
}

/*
 * Check whether the allocation direction is bottom-up.
 *
 * Returns memblock.bottom_up. When it is true, memblock allocates memory
 * in the bottom-up direction.
 */
static inline __init_memblock bool memblock_bottom_up(void)
{
        return memblock.bottom_up;
}

phys_addr_t memblock_phys_mem_size(void);
phys_addr_t memblock_reserved_size(void);
unsigned long memblock_estimated_nr_free_pages(void);
phys_addr_t memblock_start_of_DRAM(void);
phys_addr_t memblock_end_of_DRAM(void);
void memblock_enforce_memory_limit(phys_addr_t memory_limit);
void memblock_cap_memory_range(phys_addr_t base, phys_addr_t size);
void memblock_mem_limit_remove_map(phys_addr_t limit);
bool memblock_is_memory(phys_addr_t addr);
bool memblock_is_map_memory(phys_addr_t addr);
bool memblock_is_region_memory(phys_addr_t base, phys_addr_t size);
bool memblock_is_reserved(phys_addr_t addr);
bool memblock_is_region_reserved(phys_addr_t base, phys_addr_t size);

void memblock_dump_all(void);

/**
 * memblock_set_current_limit - Set the current allocation limit to allow
 *                         limiting allocations to what is currently
 *                         accessible during boot
 * @limit: New limit value (physical address)
 */
void memblock_set_current_limit(phys_addr_t limit);


phys_addr_t memblock_get_current_limit(void);

/*
 * pfn conversion functions
 *
 * While the memory MEMBLOCKs should always be page aligned, the reserved
 * MEMBLOCKs may not be. These accessors attempt to provide a very clear
 * idea of what they return for such non-aligned MEMBLOCKs.
 */

/**
 * memblock_region_memory_base_pfn - get the lowest pfn of the memory region
 * @reg: memblock_region structure
 *
 * Computed as PFN_UP() of @reg->base; the reserved-region counterpart,
 * memblock_region_reserved_base_pfn(), uses PFN_DOWN() instead.
 *
 * Return: the lowest pfn intersecting with the memory region
 */
static inline unsigned long memblock_region_memory_base_pfn(const struct memblock_region *reg)
{
        return PFN_UP(reg->base);
}

/**
 * memblock_region_memory_end_pfn - get the end pfn of the memory region
 * @reg: memblock_region structure
 *
 * Computed as PFN_DOWN() of @reg->base + @reg->size; the reserved-region
 * counterpart, memblock_region_reserved_end_pfn(), uses PFN_UP() instead.
 *
 * Return: the end_pfn of the memory region
 */
static inline unsigned long memblock_region_memory_end_pfn(const struct memblock_region *reg)
{
        return PFN_DOWN(reg->base + reg->size);
}

/**
 * memblock_region_reserved_base_pfn - get the lowest pfn of the reserved region
 * @reg: memblock_region structure
 *
 * Computed as PFN_DOWN() of @reg->base; the memory-region counterpart,
 * memblock_region_memory_base_pfn(), uses PFN_UP() instead.
 *
 * Return: the lowest pfn intersecting with the reserved region
 */
static inline unsigned long memblock_region_reserved_base_pfn(const struct memblock_region *reg)
{
        return PFN_DOWN(reg->base);
}

/**
 * memblock_region_reserved_end_pfn - get the end pfn of the reserved region
 * @reg: memblock_region structure
 *
 * Computed as PFN_UP() of @reg->base + @reg->size; the memory-region
 * counterpart, memblock_region_memory_end_pfn(), uses PFN_DOWN() instead.
 *
 * Return: the end_pfn of the reserved region
 */
static inline unsigned long memblock_region_reserved_end_pfn(const struct memblock_region *reg)
{
        return PFN_UP(reg->base + reg->size);
}

/**
 * for_each_mem_region - iterate over memory regions
 * @region: loop variable; a pointer lvalue that is assigned from
 *          memblock.memory.regions and advanced one entry per iteration
 *
 * Walks memblock.memory.regions from the first entry up to, but not
 * including, entry memblock.memory.cnt. The bound is re-read on every
 * iteration. @region is parenthesized in the expansion so that an lvalue
 * expression such as *pp advances the pointer it designates rather than
 * being mis-parsed as *(pp++).
 */
#define for_each_mem_region(region)                                        \
        for ((region) = memblock.memory.regions;                        \
             (region) < (memblock.memory.regions + memblock.memory.cnt); \
             (region)++)

/**
 * for_each_reserved_mem_region - iterate over reserved memory regions
 * @region: loop variable; a pointer lvalue that is assigned from
 *          memblock.reserved.regions and advanced one entry per iteration
 *
 * Walks memblock.reserved.regions from the first entry up to, but not
 * including, entry memblock.reserved.cnt. The bound is re-read on every
 * iteration. @region is parenthesized in the expansion so that an lvalue
 * expression such as *pp advances the pointer it designates rather than
 * being mis-parsed as *(pp++).
 */
#define for_each_reserved_mem_region(region)                                \
        for ((region) = memblock.reserved.regions;                        \
             (region) < (memblock.reserved.regions + memblock.reserved.cnt); \
             (region)++)

extern void *alloc_large_system_hash(const char *tablename,
                                     unsigned long bucketsize,
                                     unsigned long numentries,
                                     int scale,
                                     int flags,
                                     unsigned int *_hash_shift,
                                     unsigned int *_hash_mask,
                                     unsigned long low_limit,
                                     unsigned long high_limit);

#define HASH_EARLY        0x00000001        /* Allocating during early boot? */
#define HASH_ZERO        0x00000002        /* Zero allocated hash table */

/* Only NUMA needs hash distribution. 64bit NUMA architectures have
 * sufficient vmalloc space.
 *
 * With CONFIG_NUMA, hashdist is an int variable defined elsewhere and
 * HASHDIST_DEFAULT evaluates to IS_ENABLED(CONFIG_64BIT). Without
 * CONFIG_NUMA, hashdist is the constant 0 and HASHDIST_DEFAULT is not
 * defined here.
 */
#ifdef CONFIG_NUMA
#define HASHDIST_DEFAULT IS_ENABLED(CONFIG_64BIT)
extern int hashdist;                /* Distribute hashes across NUMA nodes? */
#else
#define hashdist (0)
#endif

#ifdef CONFIG_MEMTEST
/* Implemented elsewhere when CONFIG_MEMTEST is enabled. */
void early_memtest(phys_addr_t start, phys_addr_t end);
void memtest_report_meminfo(struct seq_file *m);
#else
/* CONFIG_MEMTEST disabled: both entry points are empty inline stubs. */
static inline void early_memtest(phys_addr_t start, phys_addr_t end) { }
static inline void memtest_report_meminfo(struct seq_file *m) { }
#endif


#endif /* _LINUX_MEMBLOCK_H */




























































































































































































































































  108 




  108 





























































  109 

  705 











  109 




  109 




























































































  202 








  200 






















































  201 




  202 







  202 
























  199 



  201 















   41 




   41 















   41 









   14 












   29 



   13 
   16 














  203 

  200 
  203 












   41 






   41 
   39 
   30 



   29 
   29 






































































































































  213 

    3 




  213 

  213 



















































  172 




  172 





















    3 




    3 


    3 


    3 































































































































































































































































































































































































































































































































































































  363 



  362 










  300 
  300 











  108 







































  576 
  298 


  363 







  677 

  677 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>                /* init_rootfs */
#include <linux/fs_struct.h>        /* get_fs_root et.al. */
#include <linux/fsnotify.h>        /* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;

/*
 * Mask/shift pairs consumed by m_hash() and mp_hash() when folding a
 * pointer-derived key into an index of mount_hashtable and
 * mountpoint_hashtable respectively.
 */
static unsigned int m_hash_mask __ro_after_init;
static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

static __initdata unsigned long mhash_entries;

/*
 * Handler for the "mhash_entries=" boot parameter: parse the argument with
 * simple_strtoul() (base auto-detected) and store it in mhash_entries.
 *
 * Returns 1 when an argument string was supplied, 0 when @str is NULL.
 */
static int __init set_mhash_entries(char *str)
{
        if (str) {
                mhash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mhash_entries=", set_mhash_entries);

static __initdata unsigned long mphash_entries;

/*
 * Handler for the "mphash_entries=" boot parameter: parse the argument with
 * simple_strtoul() (base auto-detected) and store it in mphash_entries.
 *
 * Returns 1 when an argument string was supplied, 0 when @str is NULL.
 */
static int __init set_mphash_entries(char *str)
{
        if (str) {
                mphash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("mphash_entries=", set_mphash_entries);

/* NOTE(review): the users of this counter are outside this part of the file. */
static u64 event;
/* Allocator for mnt_id values; used by mnt_alloc_id() and mnt_free_id(). */
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
/* Allocator for peer group IDs; used by mnt_alloc_group_id(). */
static DEFINE_IDA(mnt_group_ida);

/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
/* Source of mnt_id_unique values; pre-incremented in mnt_alloc_id() under the mnt_id_xa lock. */
static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;

/* Hash tables indexed by m_hash() and mp_hash() respectively. */
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
/* Slab cache from which alloc_vfsmnt() allocates struct mount. */
static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);        /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
/* Write-locked around mnt_ns_tree/mnt_ns_list updates; lookup_mnt_ns() reads it as a sequence counter. */
static DEFINE_SEQLOCK(mnt_ns_tree_lock);

#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */

/*
 * Kernel-internal flag bits stored in the kflags member of struct mount_kattr.
 *
 * NOTE(review): the code that interprets these bits is not in this part of
 * the file. The names suggest "apply to the whole mount tree" and "replace an
 * existing idmapping" - confirm against the users.
 */
enum mount_kattr_flags_t {
        MOUNT_KATTR_RECURSE                = (1 << 0),
        MOUNT_KATTR_IDMAP_REPLACE        = (1 << 1),
};

/*
 * Bundle of mount attribute change parameters.
 *
 * NOTE(review): this part of the file only declares the structure. Judging
 * by the field names it carries attribute bits to set and to clear, a
 * propagation setting, path lookup flags, the internal MOUNT_KATTR_* flags
 * and the user namespace / idmapping to apply - confirm against the code
 * that fills it in.
 */
struct mount_kattr {
        unsigned int attr_set;
        unsigned int attr_clr;
        unsigned int propagation;
        unsigned int lookup_flags;
        enum mount_kattr_flags_t kflags;        /* MOUNT_KATTR_* bits */
        struct user_namespace *mnt_userns;
        struct mnt_idmap *mnt_idmap;
};

/*
 * Kobject for /sys/fs, exported to GPL modules. It is only declared here;
 * the assignment happens elsewhere in the file.
 */
struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, i.e. during mountpoint lookups or walking back
 * up the tree.
 *
 * It should be taken for write in all cases where the vfsmount
 * tree or hash is modified or when a vfsmount structure is modified.
 * lock_mount_hash() and unlock_mount_hash() are the write-side helpers.
 */
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);

/*
 * Convert an rbtree node embedded in a mount namespace (its mnt_ns_tree_node
 * member) back to the containing struct mnt_namespace. A NULL @node yields
 * NULL.
 */
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
        return node ? rb_entry(node, struct mnt_namespace, mnt_ns_tree_node) : NULL;
}

static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
        struct mnt_namespace *ns_a = node_to_mnt_ns(a);
        struct mnt_namespace *ns_b = node_to_mnt_ns(b);
        u64 seq_a = ns_a->seq;
        u64 seq_b = ns_b->seq;

        if (seq_a < seq_b)
                return -1;
        if (seq_a > seq_b)
                return 1;
        return 0;
}

/* Take mnt_ns_tree_lock for writing before modifying mnt_ns_tree or mnt_ns_list. */
static inline void mnt_ns_tree_write_lock(void)
{
        write_seqlock(&mnt_ns_tree_lock);
}

/* Release the write side of mnt_ns_tree_lock taken by mnt_ns_tree_write_lock(). */
static inline void mnt_ns_tree_write_unlock(void)
{
        write_sequnlock(&mnt_ns_tree_lock);
}

/*
 * Insert @ns into the global mount namespace rbtree (ordered by mnt_ns_cmp())
 * and link it into mnt_ns_list, both under the mnt_ns_tree write lock.
 */
static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
        struct rb_node *clash, *pred;

        mnt_ns_tree_write_lock();
        clash = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
        /*
         * Link the namespace after its in-order predecessor in the tree
         * when there is one, otherwise directly after the list head.
         */
        pred = rb_prev(&ns->mnt_ns_tree_node);
        if (pred)
                list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(pred)->mnt_ns_list);
        else
                list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
        mnt_ns_tree_write_unlock();

        /* A non-NULL result from the insertion is unexpected; warn once. */
        WARN_ON_ONCE(clash);
}

/*
 * Drop one passive reference on @ns. When that was the last one, notify
 * fsnotify of the deletion, release the user namespace reference and free
 * the structure.
 */
static void mnt_ns_release(struct mnt_namespace *ns)
{
        /* keep alive for {list,stat}mount() */
        if (!refcount_dec_and_test(&ns->passive))
                return;

        fsnotify_mntns_delete(ns);
        put_user_ns(ns->user_ns);
        kfree(ns);
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))

/*
 * RCU callback: recover the namespace from its embedded mnt_ns_rcu head and
 * drop a passive reference on it.
 */
static void mnt_ns_release_rcu(struct rcu_head *rcu)
{
        struct mnt_namespace *ns;

        ns = container_of(rcu, struct mnt_namespace, mnt_ns_rcu);
        mnt_ns_release(ns);
}

/*
 * Take @ns out of the global namespace tree and list (skipped for anonymous
 * namespaces) and queue mnt_ns_release() for it through call_rcu().
 */
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
        /* remove from global mount namespace list */
        if (!is_anon_ns(ns)) {
                mnt_ns_tree_write_lock();
                rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
                list_bidir_del_rcu(&ns->mnt_ns_list);
                mnt_ns_tree_write_unlock();
        }

        /* Defer the release so lockless lookups still walking the tree stay safe. */
        call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}

static int mnt_ns_find(const void *key, const struct rb_node *node)
{
        const u64 mnt_ns_id = *(u64 *)key;
        const struct mnt_namespace *ns = node_to_mnt_ns(node);

        if (mnt_ns_id < ns->seq)
                return -1;
        if (mnt_ns_id > ns->seq)
                return 1;
        return 0;
}

/*
 * Lookup a mount namespace by id and take a passive reference count. Taking a
 * passive reference means the mount namespace can be emptied if e.g., the last
 * task holding an active reference exits. To access the mounts of the
 * namespace the @namespace_sem must first be acquired. If the namespace has
 * already shut down before acquiring @namespace_sem, {list,stat}mount() will
 * see that the mount rbtree of the namespace is empty.
 *
 * Note the lookup is lockless protected by a sequence counter. We only
 * need to guard against false negatives as false positives aren't
 * possible. So if we didn't find a mount namespace and the sequence
 * counter has changed we need to retry. If the sequence counter is
 * still the same we know the search actually failed.
 *
 * Returns the namespace whose seq equals @mnt_ns_id with its passive
 * reference count incremented (dropped again with mnt_ns_release()), or
 * NULL if no such namespace is in mnt_ns_tree.
 */
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
        struct mnt_namespace *ns;
        struct rb_node *node;
        unsigned int seq;

        /* RCU read-side section covering the tree walk and the refcount bump. */
        guard(rcu)();
        do {
                seq = read_seqbegin(&mnt_ns_tree_lock);
                node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
                /* A hit is always genuine, so skip the sequence recheck. */
                if (node)
                        break;
        } while (read_seqretry(&mnt_ns_tree_lock, seq));

        if (!node)
                return NULL;

        /*
         * The last reference count is put with RCU delay so we can
         * unconditionally acquire a reference here.
         */
        ns = node_to_mnt_ns(node);
        refcount_inc(&ns->passive);
        return ns;
}

/* Take mount_lock (the vfsmount lock) for writing. */
static inline void lock_mount_hash(void)
{
        write_seqlock(&mount_lock);
}

/* Release the write side of mount_lock taken by lock_mount_hash(). */
static inline void unlock_mount_hash(void)
{
        write_sequnlock(&mount_lock);
}

/*
 * Select the mount_hashtable bucket for the (@mnt, @dentry) pair. Both
 * pointers are scaled down by L1_CACHE_BYTES and summed, the sum is mixed
 * with itself shifted right by m_hash_shift, and the result is masked with
 * m_hash_mask.
 */
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
        unsigned long key;

        key = (unsigned long)mnt / L1_CACHE_BYTES;
        key += (unsigned long)dentry / L1_CACHE_BYTES;
        key += key >> m_hash_shift;
        return mount_hashtable + (key & m_hash_mask);
}

/*
 * Select the mountpoint_hashtable bucket for @dentry. The pointer is scaled
 * down by L1_CACHE_BYTES, mixed with itself shifted right by mp_hash_shift,
 * and masked with mp_hash_mask.
 */
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
        unsigned long key;

        key = (unsigned long)dentry / L1_CACHE_BYTES;
        key += key >> mp_hash_shift;
        return mountpoint_hashtable + (key & mp_hash_mask);
}

/*
 * Assign identifiers to @mnt: a mnt_id in the range 1..INT_MAX allocated
 * from mnt_id_xa and, on success, the next value of mnt_id_ctr as
 * mnt_id_unique. Both steps run with the mnt_id_xa lock held.
 *
 * Returns 0 on success or the error returned by __xa_alloc().
 */
static int mnt_alloc_id(struct mount *mnt)
{
        int err;

        xa_lock(&mnt_id_xa);
        err = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
        if (err == 0)
                mnt->mnt_id_unique = ++mnt_id_ctr;
        xa_unlock(&mnt_id_xa);

        return err;
}

/* Remove @mnt's mnt_id entry from mnt_id_xa, undoing mnt_alloc_id(). */
static void mnt_free_id(struct mount *mnt)
{
        xa_erase(&mnt_id_xa, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID (>= 1) from mnt_group_ida and store it in
 * @mnt->mnt_group_id.
 *
 * Returns 0 on success or the negative error from ida_alloc_min(), in which
 * case @mnt is left untouched.
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
        int id;

        id = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
        if (id >= 0) {
                mnt->mnt_group_id = id;
                return 0;
        }
        return id;
}

/*
 * Release a peer group ID: return @mnt's group ID to mnt_group_ida and
 * reset the field to 0.
 */
void mnt_release_group_id(struct mount *mnt)
{
        ida_free(&mnt_group_ida, mnt->mnt_group_id);
        mnt->mnt_group_id = 0;
}

/*
 * Add @n to the reference count of @mnt.
 *
 * vfsmount lock must be held for read
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
        /* SMP: only the current CPU's slice of the per-CPU counter is touched. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
        /* UP: plain field, updated with preemption disabled. */
        preempt_disable();
        mnt->mnt_count += n;
        preempt_enable();
#endif
}

/*
 * Return the reference count of @mnt. On SMP this is the sum of the per-CPU
 * mnt_count values over all possible CPUs; otherwise it is the plain field.
 *
 * vfsmount lock must be held for write
 */
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
        int total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;

        return total;
#else
        return mnt->mnt_count;
#endif
}

/*
 * Allocate and initialise a struct mount.
 *
 * @name: device name to duplicate into mnt_devname, or NULL to skip that step
 *
 * The mount comes zeroed from mnt_cache, gets its IDs via mnt_alloc_id(),
 * starts with a reference count of 1 and has its list heads, hash nodes and
 * rbtree node initialised; its idmapping is set to nop_mnt_idmap.
 *
 * Returns the new mount, or NULL if any allocation step fails (everything
 * acquired up to that point is released again).
 */
static struct mount *alloc_vfsmnt(const char *name)
{
        struct mount *mnt;

        mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
        if (!mnt)
                return NULL;

        if (mnt_alloc_id(mnt))
                goto out_free_cache;

        if (name) {
                mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT);
                if (!mnt->mnt_devname)
                        goto out_free_id;
        }

#ifdef CONFIG_SMP
        mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
        if (!mnt->mnt_pcp)
                goto out_free_devname;

        /* Initial reference, accounted on the current CPU. */
        this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
        mnt->mnt_count = 1;
        mnt->mnt_writers = 0;
#endif

        INIT_HLIST_NODE(&mnt->mnt_hash);
        INIT_LIST_HEAD(&mnt->mnt_child);
        INIT_LIST_HEAD(&mnt->mnt_mounts);
        INIT_LIST_HEAD(&mnt->mnt_list);
        INIT_LIST_HEAD(&mnt->mnt_expire);
        INIT_LIST_HEAD(&mnt->mnt_share);
        INIT_LIST_HEAD(&mnt->mnt_slave_list);
        INIT_LIST_HEAD(&mnt->mnt_slave);
        INIT_HLIST_NODE(&mnt->mnt_mp_list);
        INIT_LIST_HEAD(&mnt->mnt_umounting);
        INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
        RB_CLEAR_NODE(&mnt->mnt_node);
        mnt->mnt.mnt_idmap = &nop_mnt_idmap;

        return mnt;

        /* Error unwinding, in reverse order of acquisition. */
#ifdef CONFIG_SMP
out_free_devname:
        kfree_const(mnt->mnt_devname);
#endif
out_free_id:
        mnt_free_id(mnt);
out_free_cache:
        kmem_cache_free(mnt_cache, mnt);
        return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 */
bool __mnt_is_readonly(struct vfsmount *mnt)
{
        return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

/* Add one to the writer count of @mnt (per-CPU counter on SMP builds). */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_add(mnt->mnt_pcp->mnt_writers, 1);
#else
        mnt->mnt_writers += 1;
#endif
}

/* Subtract one from the writer count of @mnt (per-CPU counter on SMP builds). */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        this_cpu_sub(mnt->mnt_pcp->mnt_writers, 1);
#else
        mnt->mnt_writers -= 1;
#endif
}

/* Return the writer count of @mnt, summed over all possible CPUs on SMP. */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
        unsigned int total = 0;
        int i;

        for_each_possible_cpu(i)
                total += per_cpu_ptr(mnt->mnt_pcp, i)->mnt_writers;

        return total;
#else
        return mnt->mnt_writers;
#endif
}

/*
 * Report whether @mnt has to be treated as read-only right now.
 *
 * Returns 1 while the superblock's s_readonly_remount is set; otherwise
 * returns the result of __mnt_is_readonly() (mount or superblock flagged
 * read-only).
 */
static int mnt_is_readonly(struct vfsmount *mnt)
{
        /* s_readonly_remount set: answer "read-only" without consulting flags. */
        if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
                return 1;
        /*
         * The barrier pairs with the barrier in sb_start_ro_state_change()
         * making sure if we don't see s_readonly_remount set yet, we also will
         * not see any superblock / mount flag changes done by remount.
         * It also pairs with the barrier in sb_end_ro_state_change()
         * assuring that if we see s_readonly_remount already cleared, we will
         * see the values of superblock / mount flags updated by remount.
         */
        smp_rmb();
        return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mnt is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, mnt_put_write_access() must be
 * called. This is effectively a refcount.
 */
int mnt_get_write_access(struct vfsmount *m)
{
        struct mount *mnt = real_mount(m);
        int ret = 0;

        /* Stay on this CPU while the per-CPU writer count is manipulated. */
        preempt_disable();
        mnt_inc_writers(mnt);
        /*
         * The store to mnt_inc_writers must be visible before we pass
         * MNT_WRITE_HOLD loop below, so that the slowpath can see our
         * incremented count after it has set MNT_WRITE_HOLD.
         */
        smp_mb();
        might_lock(&mount_lock.lock);
        /* Wait for whoever set MNT_WRITE_HOLD to clear it again. */
        while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
                if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
                        cpu_relax();
                } else {
                        /*
                         * This prevents priority inversion, if the task
                         * setting MNT_WRITE_HOLD got preempted on a remote
                         * CPU, and it prevents livelock if the task setting
                         * MNT_WRITE_HOLD has a lower priority and is bound to
                         * the same CPU as the task that is spinning here.
                         */
                        preempt_enable();
                        lock_mount_hash();
                        unlock_mount_hash();
                        preempt_disable();
                }
        }
        /*
         * The barrier pairs with the barrier sb_start_ro_state_change() making
         * sure that if we see MNT_WRITE_HOLD cleared, we will also see
         * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
         * mnt_is_readonly() and bail in case we are racing with remount
         * read-only.
         */
        smp_rmb();
        if (mnt_is_readonly(m)) {
                /* Read-only after all: undo the increment done above. */
                mnt_dec_writers(mnt);
                ret = -EROFS;
        }
        preempt_enable();

        /* 0 with the writer count held, or -EROFS with it released. */
        return ret;
}
EXPORT_SYMBOL_GPL(mnt_get_write_access);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 */
int mnt_want_write(struct vfsmount *m)
{
        int err;

        /* Freeze protection first, then the mount-level write access. */
        sb_start_write(m->mnt_sb);
        err = mnt_get_write_access(m);
        if (!err)
                return 0;

        /* No write access granted: give the freeze protection back. */
        sb_end_write(m->mnt_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_get_write_access, but if @file is already open for write it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the check for emergency r/o remounts.  This must be
 * paired with mnt_put_write_access_file.
 */
int mnt_get_write_access_file(struct file *file)
{
        /* Not opened as a writer: take regular write access on the mount. */
        if (!(file->f_mode & FMODE_WRITER))
                return mnt_get_write_access(file->f_path.mnt);

        /*
         * The file already counts as a writer, so no new reference is taken.
         * The superblock may still have turned read-only underneath writable
         * fds, e.g. due to a fs error with errors=remount-ro.
         */
        return __mnt_is_readonly(file->f_path.mnt) ? -EROFS : 0;
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount to take a write on
 *
 * This is like mnt_want_write, but if the file is already open for writing it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the freeze protection and the check for emergency r/o
 * remounts.  This must be paired with mnt_drop_write_file.
 */
int mnt_want_write_file(struct file *file)
{
        int err;

        sb_start_write(file_inode(file)->i_sb);
        err = mnt_get_write_access_file(file);
        if (!err)
                return 0;

        /* Write access refused: release the freeze protection again. */
        sb_end_write(file_inode(file)->i_sb);
        return err;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        /* The writer count is per-CPU on SMP; keep preemption off around it. */
        preempt_disable();
        mnt_dec_writers(m);
        preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
 */
void mnt_drop_write(struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;

        /* Release in the reverse order of mnt_want_write(). */
        mnt_put_write_access(mnt);
        sb_end_write(sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/*
 * Counterpart of mnt_get_write_access_file(): drop the mount write access
 * unless @file is an FMODE_WRITER file, for which none was taken.
 */
void mnt_put_write_access_file(struct file *file)
{
        if (file->f_mode & FMODE_WRITER)
                return;

        mnt_put_write_access(file->f_path.mnt);
}

/*
 * Counterpart of mnt_want_write_file(): give up the mount write access
 * (if one was taken) and then end the superblock write section.
 */
void mnt_drop_write_file(struct file *file)
{
        struct super_block *sb = file_inode(file)->i_sb;

        mnt_put_write_access_file(file);
        sb_end_write(sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully, callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Context: This function expects lock_mount_hash() to be held serializing
 *          setting MNT_WRITE_HOLD.
 * Return: On success 0 is returned.
 *           On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt)
{
        /*
         * Note that MNT_WRITE_HOLD is left set on both return paths below;
         * this function never clears it itself.
         */
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
         * should be visible before we do.
         */
        smp_mb();

        /*
         * With writers on hold, if this value is zero, then there are
         * definitely no active writers (although held writers may subsequently
         * increment the count, they'll have to wait, and decrement it after
         * seeing MNT_READONLY).
         *
         * It is OK to have counter incremented on one CPU and decremented on
         * another: the sum will add up correctly. The danger would be when we
         * sum up each counter, if we read a counter before it is incremented,
         * but then read another CPU's count which it has been subsequently
         * decremented from -- we would see more decrements than we should.
         * MNT_WRITE_HOLD protects against this scenario, because
         * mnt_want_write first increments count, then smp_mb, then spins on
         * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
         * we're counting up here.
         */
        if (mnt_get_writers(mnt) > 0)
                return -EBUSY;

        return 0;
}

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a successful call to
 * mnt_hold_writers().
 *
 * Context: This function expects lock_mount_hash() to be held.
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
        /*
         * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
         * that become unheld will see MNT_READONLY.
         */
        smp_wmb();
        /* Clearing the flag lets spinners in mnt_get_write_access() proceed. */
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
}

/*
 * Try to mark @mnt read-only.  Returns 0 on success, or the error from
 * mnt_hold_writers() (-EBUSY) when the mount still has active writers.
 */
static int mnt_make_readonly(struct mount *mnt)
{
        int err = mnt_hold_writers(mnt);

        /* Only flip the flag when no writer was found. */
        if (err == 0)
                mnt->mnt.mnt_flags |= MNT_READONLY;

        /* The hold is released on the success and the failure path alike. */
        mnt_unhold_writers(mnt);
        return err;
}

/*
 * Prepare superblock @sb for a read-only remount.
 *
 * Under lock_mount_hash(), every mount on sb->s_mounts that is not already
 * MNT_READONLY is put on write hold.  If any of them has active writers, or
 * sb->s_remove_count is non-zero, the attempt fails with -EBUSY.  On success
 * sb_start_ro_state_change() is called.  In either case MNT_WRITE_HOLD is
 * cleared from all mounts again before the lock is dropped.
 *
 * Returns 0 on success, -EBUSY otherwise.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
        struct mount *mnt;
        int err = 0;

        /* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;

        lock_mount_hash();
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                /* Mounts that are already read-only need no hold. */
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        err = mnt_hold_writers(mnt);
                        if (err)
                                break;
                }
        }
        /* The recheck announced above, now with writers held off. */
        if (!err && atomic_long_read(&sb->s_remove_count))
                err = -EBUSY;

        if (!err)
                sb_start_ro_state_change(sb);
        /* Release every hold taken above, on success and on failure. */
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
        unlock_mount_hash();

        return err;
}

/*
 * Release what a struct mount owns (idmap reference, device name, per-CPU
 * counters on SMP) and return the structure itself to mnt_cache.
 */
static void free_vfsmnt(struct mount *mnt)
{
        struct mnt_idmap *idmap = mnt_idmap(&mnt->mnt);

        mnt_idmap_put(idmap);
        kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
        free_percpu(mnt->mnt_pcp);
#endif
        kmem_cache_free(mnt_cache, mnt);
}

/* RCU callback: free the mount that embeds @head as its mnt_rcu member. */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        free_vfsmnt(mnt);
}

/* call under rcu_read_lock */
/*
 * Try to turn @bastard, found under mount_lock sequence @seq, into a
 * properly referenced mount.
 *
 * Return values as produced below:
 *   0  - success; if @bastard is non-NULL its count has been incremented
 *   1  - failure, no reference is held on return
 *  -1  - failure, but the count is still incremented; the caller has to
 *        mntput() it
 */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        struct mount *mnt;
        /* mount_lock already changed since @seq was sampled: give up. */
        if (read_seqretry(&mount_lock, seq))
                return 1;
        if (bastard == NULL)
                return 0;
        mnt = real_mount(bastard);
        mnt_add_count(mnt, 1);
        smp_mb();                        // see mntput_no_expire()
        /* Still no change after taking the reference: it is good. */
        if (likely(!read_seqretry(&mount_lock, seq)))
                return 0;
        /* MNT_SYNC_UMOUNT: back the increment out directly, without the lock. */
        if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
                mnt_add_count(mnt, -1);
                return 1;
        }
        lock_mount_hash();
        /* MNT_DOOMED: undo the increment under the lock. */
        if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
                mnt_add_count(mnt, -1);
                unlock_mount_hash();
                return 1;
        }
        unlock_mount_hash();
        /* caller will mntput() */
        return -1;
}

/* call under rcu_read_lock */
/*
 * Wrapper around __legitimize_mnt() that reports plain success/failure.
 * When the helper fails while still holding a reference (negative return),
 * that reference is dropped here with the RCU read lock temporarily released.
 */
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
        int rc = __legitimize_mnt(bastard, seq);

        if (likely(rc == 0))
                return true;

        if (unlikely(rc < 0)) {
                rcu_read_unlock();
                mntput(bastard);
                rcu_read_lock();
        }
        return false;
}

/**
 * __lookup_mnt - find first child mount
 * @mnt:        parent mount
 * @dentry:        mountpoint
 *
 * If @mnt has a child mount @c mounted @dentry find and return it.
 *
 * Note that the child mount @c need not be unique. There are cases
 * where shadow mounts are created. For example, during mount
 * propagation when a source mount @mnt whose root got overmounted by a
 * mount @o after path lookup but before @namespace_sem could be
 * acquired gets copied and propagated. So @mnt gets copied including
 * @o. When @mnt is propagated to a destination mount @d that already
 * has another mount @n mounted at the same mountpoint then the source
 * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
 * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
 * on @dentry.
 *
 * Return: The first child of @mnt mounted @dentry or NULL.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
        struct hlist_head *head = m_hash(mnt, dentry);
        struct mount *p;

        hlist_for_each_entry_rcu(p, head, mnt_hash)
                if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
                        return p;
        return NULL;
}

/*
 * lookup_mnt - Return the first child mount mounted at path
 *
 * "First" means first mounted chronologically.  If you create the
 * following mounts:
 *
 * mount /dev/sda1 /mnt
 * mount /dev/sda2 /mnt
 * mount /dev/sda3 /mnt
 *
 * Then lookup_mnt() on the base /mnt dentry in the root mount will
 * return successively the root dentry and vfsmount of /dev/sda1, then
 * /dev/sda2, then /dev/sda3, then NULL.
 *
 * lookup_mnt takes a reference to the found vfsmount.
 */
struct vfsmount *lookup_mnt(const struct path *path)
{
        struct vfsmount *found;
        struct mount *child;
        unsigned seq;

        rcu_read_lock();
        /* Repeat the lookup until the result can be legitimized. */
        do {
                seq = read_seqbegin(&mount_lock);
                child = __lookup_mnt(path->mnt, path->dentry);
                if (child)
                        found = &child->mnt;
                else
                        found = NULL;
        } while (!legitimize_mnt(found, seq));
        rcu_read_unlock();

        return found;
}

/*
 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
 *                         current mount namespace.
 *
 * The common case is dentries are not mountpoints at all and that
 * test is handled inline.  For the slow case when we are actually
 * dealing with a mountpoint of some kind, walk through all of the
 * mounts in the current mount namespace and test to see if the dentry
 * is a mountpoint.
 *
 * The mount_hashtable is not usable in the context because we
 * need to identify all mounts that may be in the current mount
 * namespace not just a mount that happens to have some specified
 * parent mount.
 */
bool __is_local_mountpoint(struct dentry *dentry)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        struct mount *cur, *next;
        bool found = false;

        down_read(&namespace_sem);
        /* Stop at the first mount in the namespace that sits on @dentry. */
        rbtree_postorder_for_each_entry_safe(cur, next, &ns->mounts, mnt_node) {
                if (cur->mnt_mountpoint == dentry) {
                        found = true;
                        break;
                }
        }
        up_read(&namespace_sem);

        return found;
}

/*
 * Find the mountpoint recorded for @dentry in the mountpoint hash.
 * On a hit its m_count is incremented and it is returned; NULL otherwise.
 */
static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
{
        struct hlist_head *bucket = mp_hash(dentry);
        struct mountpoint *cur;

        hlist_for_each_entry(cur, bucket, m_hash) {
                if (cur->m_dentry != dentry)
                        continue;
                cur->m_count++;
                return cur;
        }
        return NULL;
}

/*
 * Get the struct mountpoint for @dentry, creating one if none exists yet.
 *
 * Returns a mountpoint with a reference accounted in m_count (an existing
 * one found by lookup_mountpoint(), or a new one with m_count == 1), or an
 * ERR_PTR(): -ENOENT for an unlinked dentry that is already a mountpoint,
 * -ENOMEM if allocation fails, or the error reported by d_set_mounted().
 */
static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
        struct mountpoint *mp, *new = NULL;
        int ret;

        if (d_mountpoint(dentry)) {
                /* might be worth a WARN_ON() */
                if (d_unlinked(dentry))
                        return ERR_PTR(-ENOENT);
                /* Also the retry target when d_set_mounted() reports -EBUSY. */
mountpoint:
                read_seqlock_excl(&mount_lock);
                mp = lookup_mountpoint(dentry);
                read_sequnlock_excl(&mount_lock);
                if (mp)
                        goto done;
        }

        /* On a retry pass the earlier allocation is reused. */
        if (!new)
                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);


        /* Exactly one process may set d_mounted */
        ret = d_set_mounted(dentry);

        /* Someone else set d_mounted? */
        if (ret == -EBUSY)
                goto mountpoint;

        /* The dentry is not available as a mountpoint? */
        mp = ERR_PTR(ret);
        if (ret)
                goto done;

        /* Add the new mountpoint to the hash table */
        read_seqlock_excl(&mount_lock);
        new->m_dentry = dget(dentry);
        new->m_count = 1;
        hlist_add_head(&new->m_hash, mp_hash(dentry));
        INIT_HLIST_HEAD(&new->m_list);
        read_sequnlock_excl(&mount_lock);

        /* Ownership moved to the hash; make the kfree() below a no-op. */
        mp = new;
        new = NULL;
done:
        /* Frees an allocation that ended up unused (NULL is fine here). */
        kfree(new);
        return mp;
}

/*
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 */
static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
{
        struct dentry *dentry;

        /* Other users remain: nothing more to do. */
        if (--mp->m_count)
                return;

        /* Last user gone: clear the dentry's mounted flag and free @mp. */
        dentry = mp->m_dentry;
        BUG_ON(!hlist_empty(&mp->m_list));
        spin_lock(&dentry->d_lock);
        dentry->d_flags &= ~DCACHE_MOUNTED;
        spin_unlock(&dentry->d_lock);
        dput_to_list(dentry, list);
        hlist_del(&mp->m_hash);
        kfree(mp);
}

/* called with namespace_lock and vfsmount lock */
static void put_mountpoint(struct mountpoint *mp)
{
        /* Dentries released here are collected on ex_mountpoints. */
        struct list_head *disposal = &ex_mountpoints;

        __put_mountpoint(mp, disposal);
}

/* Non-zero iff @mnt belongs to the calling task's mount namespace. */
static inline int check_mnt(struct mount *mnt)
{
        struct mnt_namespace *cur_ns = current->nsproxy->mnt_ns;

        return mnt->mnt_ns == cur_ns;
}

static inline bool check_anonymous_mnt(struct mount *mnt)
{
        u64 seq;

        if (!is_anon_ns(mnt->mnt_ns))
                return false;

        seq = mnt->mnt_ns->seq_origin;
        return !seq || (seq == current->nsproxy->mnt_ns->seq);
}

/*
 * vfsmount lock must be held for write
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
        if (!ns)
                return;

        /* Advance the global event counter, stamp @ns, wake its pollers. */
        ns->event = ++event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
        /* Nothing to do without a namespace or when it is already current. */
        if (!ns || ns->event == event)
                return;

        /* Stamp @ns with the current event value (no increment) and wake. */
        ns->event = event;
        wake_up_interruptible(&ns->poll);
}

/*
 * vfsmount lock must be held for write
 */
static struct mountpoint *unhash_mnt(struct mount *mnt)
{
        struct mountpoint *old_mp = mnt->mnt_mp;

        /* Make the mount its own parent, sitting on its own root dentry. */
        mnt->mnt_parent = mnt;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;

        /* Unlink from the child list, the mount hash and the mountpoint list. */
        list_del_init(&mnt->mnt_child);
        hlist_del_init_rcu(&mnt->mnt_hash);
        hlist_del_init(&mnt->mnt_mp_list);

        /* Hand the former mountpoint to the caller, who must put it. */
        mnt->mnt_mp = NULL;
        return old_mp;
}

/*
 * vfsmount lock must be held for write
 */
static void umount_mnt(struct mount *mnt)
{
        /* Detach @mnt, then drop the mountpoint it was attached to. */
        struct mountpoint *mp = unhash_mnt(mnt);

        put_mountpoint(mp);
}

/*
 * vfsmount lock must be held for write
 */
/*
 * Record that @child_mnt is mounted on @mnt at mountpoint @mp.
 *
 * Takes a reference on both @mp (m_count) and @mnt (mount count), points
 * @child_mnt's parent/mountpoint fields at them and links @child_mnt into
 * @mp's list of mounts.  This function does not add @child_mnt to the mount
 * hash or to @mnt's child list.
 */
void mnt_set_mountpoint(struct mount *mnt,
                        struct mountpoint *mp,
                        struct mount *child_mnt)
{
        mp->m_count++;
        mnt_add_count(mnt, 1);        /* essentially, that's mntget */
        child_mnt->mnt_mountpoint = mp->m_dentry;
        child_mnt->mnt_parent = mnt;
        child_mnt->mnt_mp = mp;
        hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/**
 * mnt_set_mountpoint_beneath - mount a mount beneath another one
 *
 * @new_parent: the source mount
 * @top_mnt:    the mount beneath which @new_parent is mounted
 * @new_mp:     the new mountpoint of @top_mnt on @new_parent
 *
 * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
 * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
 * @new_mp. And mount @new_parent on the old parent and old
 * mountpoint of @top_mnt.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void mnt_set_mountpoint_beneath(struct mount *new_parent,
                                       struct mount *top_mnt,
                                       struct mountpoint *new_mp)
{
        /* Remember where @top_mnt sits before it gets moved. */
        struct mountpoint *orig_mp = top_mnt->mnt_mp;
        struct mount *orig_parent = top_mnt->mnt_parent;

        /* @new_parent goes where @top_mnt used to be ... */
        mnt_set_mountpoint(orig_parent, orig_mp, new_parent);
        /* ... and @top_mnt is re-parented onto @new_parent at @new_mp. */
        mnt_change_mountpoint(new_parent, new_mp, top_mnt);
}


/*
 * Make @mnt findable: insert it into the mount hash chain for
 * (@parent, @mnt->mnt_mountpoint) and append it to @parent's child list.
 */
static void __attach_mnt(struct mount *mnt, struct mount *parent)
{
        struct hlist_head *bucket = m_hash(&parent->mnt, mnt->mnt_mountpoint);

        hlist_add_head_rcu(&mnt->mnt_hash, bucket);
        list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @parent:  the parent
 * @mnt:     the new mount
 * @mp:      the new mountpoint
 * @beneath: whether to mount @mnt beneath or on top of @parent
 *
 * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
 * to @parent's child mount list and to @mount_hashtable.
 *
 * If @beneath is true, remove @mnt from its current parent and
 * mountpoint and mount it on @mp on @parent, and mount @parent on the
 * old parent and old mountpoint of @mnt. Finally, attach @parent to
 * @mount_hashtable and @parent->mnt_parent->mnt_mounts.
 *
 * Note, when __attach_mnt() is called @mnt->mnt_parent already points
 * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
                       struct mountpoint *mp, bool beneath)
{
        if (!beneath)
                mnt_set_mountpoint(parent, mp, mnt);
        else
                mnt_set_mountpoint_beneath(mnt, parent, mp);

        /*
         * Attach via @mnt->mnt_parent rather than @parent: after a
         * mount-beneath, @mnt hangs off @parent's old parent, so the two
         * are not the same mount.
         */
        __attach_mnt(mnt, mnt->mnt_parent);
}

/*
 * Move @mnt from its current parent/mountpoint to mountpoint @mp on
 * @parent, releasing the references held through the old attachment.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
        struct mount *prev_parent = mnt->mnt_parent;
        struct mountpoint *prev_mp = mnt->mnt_mp;

        /* Unlink @mnt from its current child list, mountpoint and hash. */
        list_del_init(&mnt->mnt_child);
        hlist_del_init(&mnt->mnt_mp_list);
        hlist_del_init_rcu(&mnt->mnt_hash);

        attach_mnt(mnt, parent, mp, false);

        /* Drop the mountpoint and parent references of the old location. */
        put_mountpoint(prev_mp);
        mnt_add_count(prev_parent, -1);
}

/* Map an rbtree node to the mount embedding it; NULL maps to NULL. */
static inline struct mount *node_to_mount(struct rb_node *node)
{
        if (!node)
                return NULL;

        return rb_entry(node, struct mount, mnt_node);
}

/*
 * Insert @mnt into @ns's rbtree of mounts, ordered by mnt_id_unique, and
 * keep the namespace's cached first/last node pointers up to date.
 */
static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
        struct rb_node **slot = &ns->mounts.rb_node;
        struct rb_node *above = NULL;
        bool is_leftmost = true, is_rightmost = true;

        WARN_ON(mnt_ns_attached(mnt));
        mnt->mnt_ns = ns;

        /* Descend to the insertion slot, noting which way we ever turned. */
        while (*slot) {
                above = *slot;
                if (mnt->mnt_id_unique < node_to_mount(above)->mnt_id_unique) {
                        slot = &above->rb_left;
                        is_rightmost = false;
                } else {
                        slot = &above->rb_right;
                        is_leftmost = false;
                }
        }

        /* Never went left => new last node; never went right => new first. */
        if (is_rightmost)
                ns->mnt_last_node = &mnt->mnt_node;
        if (is_leftmost)
                ns->mnt_first_node = &mnt->mnt_node;

        rb_link_node(&mnt->mnt_node, above, slot);
        rb_insert_color(&mnt->mnt_node, &ns->mounts);

        mnt_notify_add(mnt);
}

/*
 * vfsmount lock must be held for write
 */
static void commit_tree(struct mount *mnt)
{
        struct mount *parent = mnt->mnt_parent;
        struct mount *m;
        LIST_HEAD(head);
        struct mnt_namespace *n = parent->mnt_ns;

        BUG_ON(parent == mnt);

        list_add_tail(&head, &mnt->mnt_list);
        while (!list_empty(&head)) {
                m = list_first_entry(&head, typeof(*m), mnt_list);
                list_del(&m->mnt_list);

                mnt_add_to_ns(n, m);
        }
        n->nr_mounts += n->pending_mounts;
        n->pending_mounts = 0;

        __attach_mnt(mnt, parent);
        touch_mnt_namespace(n);
}

/*
 * Return the mount following @p in a walk of the tree rooted at @root:
 * @p's first child if it has one, otherwise the next sibling of @p or of
 * the nearest ancestor that has one.  NULL once the walk climbs to @root.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
        struct list_head *next = p->mnt_mounts.next;

        /* @p has children: descend to the first one. */
        if (next != &p->mnt_mounts)
                return list_entry(next, struct mount, mnt_child);

        /* Otherwise climb until some mount on the way has a next sibling. */
        for (;;) {
                if (p == root)
                        return NULL;
                next = p->mnt_child.next;
                if (next != &p->mnt_parent->mnt_mounts)
                        return list_entry(next, struct mount, mnt_child);
                p = p->mnt_parent;
        }
}

/*
 * Starting at @p, repeatedly step to the last child until a mount without
 * children is reached, and return that mount (@p itself if it has none).
 */
static struct mount *skip_mnt_tree(struct mount *p)
{
        struct list_head *last;

        for (last = p->mnt_mounts.prev; last != &p->mnt_mounts;
             last = p->mnt_mounts.prev)
                p = list_entry(last, struct mount, mnt_child);

        return p;
}

/**
 * vfs_create_mount - Create a mount for a configured superblock
 * @fc: The configuration context with the superblock attached
 *
 * Create a mount to an already configured superblock.  If necessary, the
 * caller should invoke vfs_get_tree() before calling this.
 *
 * Note that this does not attach the mount to anything.
 */
struct vfsmount *vfs_create_mount(struct fs_context *fc)
{
        struct mount *mnt;

        if (!fc->root)
                return ERR_PTR(-EINVAL);

        mnt = alloc_vfsmnt(fc->source ?: "none");
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (fc->sb_flags & SB_KERNMOUNT)
                mnt->mnt.mnt_flags = MNT_INTERNAL;

        atomic_inc(&fc->root->d_sb->s_active);
        mnt->mnt.mnt_sb                = fc->root->d_sb;
        mnt->mnt.mnt_root        = dget(fc->root);
        mnt->mnt_mountpoint        = mnt->mnt.mnt_root;
        mnt->mnt_parent                = mnt;

        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
        unlock_mount_hash();
        return &mnt->mnt;
}
EXPORT_SYMBOL(vfs_create_mount);

/*
 * Obtain the tree for @fc via vfs_get_tree() and create a mount for it.
 * Returns the new mount or an ERR_PTR() carrying the vfs_get_tree() error.
 */
struct vfsmount *fc_mount(struct fs_context *fc)
{
        int err = vfs_get_tree(fc);

        if (err)
                return ERR_PTR(err);

        /* Release s_umount on the new root's superblock before mounting. */
        up_write(&fc->root->d_sb->s_umount);
        return vfs_create_mount(fc);
}
EXPORT_SYMBOL(fc_mount);

/*
 * Create a mount of filesystem @type: build a fs_context, feed it @name as
 * the "source" parameter (if given) and @data as monolithic mount data, then
 * mount it with fc_mount().  Returns the mount or an ERR_PTR().
 */
struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                int flags, const char *name,
                                void *data)
{
        struct vfsmount *result;
        struct fs_context *fc;
        int err = 0;

        if (!type)
                return ERR_PTR(-EINVAL);

        fc = fs_context_for_mount(type, flags);
        if (IS_ERR(fc))
                return ERR_CAST(fc);

        /* Each step only runs if everything before it succeeded. */
        if (name)
                err = vfs_parse_fs_string(fc, "source", name, strlen(name));
        if (!err)
                err = parse_monolithic_mount_data(fc, data);

        result = err ? ERR_PTR(err) : fc_mount(fc);

        /* The context is dropped whether or not mounting worked. */
        put_fs_context(fc);
        return result;
}
EXPORT_SYMBOL_GPL(vfs_kern_mount);

struct vfsmount *
vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
             const char *name, void *data)
{
        const struct user_namespace *owner = mountpoint->d_sb->s_user_ns;

        /*
         * Unprivileged mounts with submounts are not supported until it is
         * worked out how to pass the user namespace through from the parent
         * mount to the submount.
         */
        if (owner != &init_user_ns)
                return ERR_PTR(-EPERM);

        return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
}
EXPORT_SYMBOL_GPL(vfs_submount);

/*
 * Create a new mount of the same superblock as @old, rooted at @root.
 *
 * @flag is a mask of CL_* bits that selects how the copy relates to @old
 * with respect to peer groups, master/slave links and expiry, as handled
 * branch by branch below.
 *
 * Returns the new, unattached mount (it is its own parent), or ERR_PTR()
 * with -ENOMEM or the error from mnt_alloc_group_id().
 */
static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                        int flag)
{
        struct super_block *sb = old->mnt.mnt_sb;
        struct mount *mnt;
        int err;

        mnt = alloc_vfsmnt(old->mnt_devname);
        if (!mnt)
                return ERR_PTR(-ENOMEM);

        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
                mnt->mnt_group_id = old->mnt_group_id;

        /* A shared copy needs a group id; allocate one if none was inherited. */
        if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
                err = mnt_alloc_group_id(mnt);
                if (err)
                        goto out_free;
        }

        /* Inherit the flags, minus the ones masked off for the copy. */
        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);

        atomic_inc(&sb->s_active);
        mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));

        mnt->mnt.mnt_sb = sb;
        mnt->mnt.mnt_root = dget(root);
        /* Unattached for now: own parent, mountpoint == own root. */
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
        lock_mount_hash();
        list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
        unlock_mount_hash();

        if ((flag & CL_SLAVE) ||
            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
                /* The copy becomes a slave of @old. */
                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
                mnt->mnt_master = old;
                CLEAR_MNT_SHARED(mnt);
        } else if (!(flag & CL_PRIVATE)) {
                /* Join @old's peer list and/or slave list; share its master. */
                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
                        list_add(&mnt->mnt_share, &old->mnt_share);
                if (IS_MNT_SLAVE(old))
                        list_add(&mnt->mnt_slave, &old->mnt_slave);
                mnt->mnt_master = old->mnt_master;
        } else {
                /* CL_PRIVATE: no sharing relationship at all. */
                CLEAR_MNT_SHARED(mnt);
        }
        if (flag & CL_MAKE_SHARED)
                set_mnt_shared(mnt);

        /* stick the duplicate mount on the same expiry list
         * as the original if that was on one */
        if (flag & CL_EXPIRE) {
                if (!list_empty(&old->mnt_expire))
                        list_add(&mnt->mnt_expire, &old->mnt_expire);
        }

        return mnt;

 out_free:
        /* Only reached before any superblock/dentry reference was taken. */
        mnt_free_id(mnt);
        free_vfsmnt(mnt);
        return ERR_PTR(err);
}

/*
 * Final teardown of a mount: release what the mount still holds and hand
 * the structure to call_rcu() so that delayed_free_vfsmnt() runs on it
 * later.
 */
static void cleanup_mnt(struct mount *mnt)
{
        struct hlist_node *next;
        struct mount *child;

        /*
         * A non-zero writer count here most likely means that a
         * mnt_want/drop_write() pair got unbalanced somewhere, in which
         * case the filesystem was probably unable to do r/w->r/o
         * transitions.  The locking around the mnt_count decrement
         * provides the barriers that make mnt_get_writers() safe here.
         */
        WARN_ON(mnt_get_writers(mnt));

        if (unlikely(mnt->mnt_pins.first))
                mnt_pin_kill(mnt);

        /* Drop every child that mntput_no_expire() parked on this mount. */
        hlist_for_each_entry_safe(child, next, &mnt->mnt_stuck_children, mnt_umount) {
                hlist_del(&child->mnt_umount);
                mntput(&child->mnt);
        }

        fsnotify_vfsmount_delete(&mnt->mnt);
        dput(mnt->mnt.mnt_root);
        deactivate_super(mnt->mnt.mnt_sb);
        mnt_free_id(mnt);
        call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
}

/*
 * Callback form of cleanup_mnt(): map the callback head embedded in
 * struct mount back to the mount and tear it down.
 */
static void __cleanup_mnt(struct rcu_head *head)
{
        struct mount *mnt = container_of(head, struct mount, mnt_rcu);

        cleanup_mnt(mnt);
}

static LLIST_HEAD(delayed_mntput_list);
static void delayed_mntput(struct work_struct *unused)
{
        struct llist_node *node = llist_del_all(&delayed_mntput_list);
        struct mount *m, *t;

        llist_for_each_entry_safe(m, t, node, mnt_llist)
                cleanup_mnt(m);
}
static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);

/*
 * Drop one reference to @mnt without touching its expiry mark.
 *
 * If ->mnt_ns is observed non-NULL the count is simply decremented.
 * Otherwise the decrement is done under the mount hash lock; when the count
 * reaches zero (and the mount is not already MNT_DOOMED) the mount is marked
 * MNT_DOOMED, unhooked from its superblock's list, its remaining children
 * are moved to ->mnt_stuck_children, and cleanup_mnt() is arranged for:
 * synchronously for MNT_INTERNAL mounts, otherwise via task work or the
 * delayed_mntput work item.
 */
static void mntput_no_expire(struct mount *mnt)
{
        LIST_HEAD(list);
        int count;

        rcu_read_lock();
        if (likely(READ_ONCE(mnt->mnt_ns))) {
                /*
                 * Since we don't do lock_mount_hash() here,
                 * ->mnt_ns can change under us.  However, if it's
                 * non-NULL, then there's a reference that won't
                 * be dropped until after an RCU delay done after
                 * turning ->mnt_ns NULL.  So if we observe it
                 * non-NULL under rcu_read_lock(), the reference
                 * we are dropping is not the final one.
                 */
                mnt_add_count(mnt, -1);
                rcu_read_unlock();
                return;
        }
        lock_mount_hash();
        /*
         * make sure that if __legitimize_mnt() has not seen us grab
         * mount_lock, we'll see their refcount increment here.
         */
        smp_mb();
        mnt_add_count(mnt, -1);
        count = mnt_get_count(mnt);
        if (count != 0) {
                /* other references remain; a negative count is a bug */
                WARN_ON(count < 0);
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
                /* already marked MNT_DOOMED, nothing left to do here */
                rcu_read_unlock();
                unlock_mount_hash();
                return;
        }
        mnt->mnt.mnt_flags |= MNT_DOOMED;
        rcu_read_unlock();

        list_del(&mnt->mnt_instance);

        if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                struct mount *p, *tmp;
                /*
                 * Unhash each remaining child and park it on
                 * ->mnt_stuck_children; cleanup_mnt() does the mntput().
                 */
                list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
                        __put_mountpoint(unhash_mnt(p), &list);
                        hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                }
        }
        unlock_mount_hash();
        /* dentries collected by __put_mountpoint() are released unlocked */
        shrink_dentry_list(&list);

        if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
                struct task_struct *task = current;
                if (likely(!(task->flags & PF_KTHREAD))) {
                        /* not a kernel thread: try to run cleanup as task work */
                        init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
                        if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
                                return;
                }
                /*
                 * Fall back to the delayed work item; it is scheduled only
                 * when llist_add() returns true.
                 */
                if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
                        schedule_delayed_work(&delayed_mntput_work, 1);
                return;
        }
        /* MNT_INTERNAL mounts are cleaned up synchronously */
        cleanup_mnt(mnt);
}

/*
 * Drop a reference to @mnt (NULL is allowed and ignored), clearing the
 * expiry mark on the way.
 */
void mntput(struct vfsmount *mnt)
{
        struct mount *m;

        if (!mnt)
                return;

        m = real_mount(mnt);
        /* test before writing: avoid cacheline pingpong */
        if (unlikely(m->mnt_expiry_mark))
                WRITE_ONCE(m->mnt_expiry_mark, 0);
        mntput_no_expire(m);
}
EXPORT_SYMBOL(mntput);

/*
 * Take an extra reference on @mnt, if it is non-NULL, and hand the same
 * pointer back to the caller.
 */
struct vfsmount *mntget(struct vfsmount *mnt)
{
        if (!mnt)
                return mnt;

        mnt_add_count(real_mount(mnt), 1);
        return mnt;
}
EXPORT_SYMBOL(mntget);

/*
 * Make a mount point inaccessible to new lookups by clearing its
 * ->mnt_ns.  There may still be current users, so the caller MUST WAIT
 * for an RCU grace period before destroying the mount point.
 * A NULL @mnt is ignored.
 */
void mnt_make_shortterm(struct vfsmount *mnt)
{
        if (!mnt)
                return;

        real_mount(mnt)->mnt_ns = NULL;
}

/**
 * path_is_mountpoint() - Check if path is a mount in the current namespace.
 * @path: path to check
 *
 *  d_mountpoint() alone can only reliably tell that a dentry is not
 *  mounted in any namespace; that common case is answered right away.
 *  It does not know that several mounts, possibly in different
 *  namespaces, may use the same dentry, so for a positive answer the
 *  whole path (mount + dentry) is checked instead of the dentry alone.
 *
 *  Return: true if @path is a mountpoint, false otherwise.
 */
bool path_is_mountpoint(const struct path *path)
{
        bool mounted;
        unsigned seq;

        if (!d_mountpoint(path->dentry))
                return false;

        rcu_read_lock();
        /* redo the lookup for as long as mount_lock changed under us */
        for (;;) {
                seq = read_seqbegin(&mount_lock);
                mounted = __path_is_mountpoint(path);
                if (!read_seqretry(&mount_lock, seq))
                        break;
        }
        rcu_read_unlock();

        return mounted;
}
EXPORT_SYMBOL(path_is_mountpoint);

/*
 * Make a CL_PRIVATE clone of the mount at @path, rooted at @path->dentry,
 * and flag it MNT_INTERNAL.  Returns the new vfsmount or an ERR_PTR().
 */
struct vfsmount *mnt_clone_internal(const struct path *path)
{
        struct mount *clone;

        clone = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
        if (IS_ERR(clone))
                return ERR_CAST(clone);

        clone->mnt.mnt_flags |= MNT_INTERNAL;
        return &clone->mnt;
}

/*
 * Returns the mount in @ns whose unique id equals @mnt_id or, failing that,
 * the mount with the smallest unique id greater than @mnt_id.  Returns NULL
 * when no such mount exists.
 */
static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
        struct mount *best = NULL;
        struct rb_node *cur;

        for (cur = ns->mounts.rb_node; cur; ) {
                struct mount *cand = node_to_mount(cur);

                if (cand->mnt_id_unique < mnt_id) {
                        /* too small: only the right subtree can qualify */
                        cur = cur->rb_right;
                        continue;
                }

                best = cand;
                if (cand->mnt_id_unique == mnt_id)
                        break;
                /* look for something closer to @mnt_id on the left */
                cur = cur->rb_left;
        }
        return best;
}

/*
 * Returns the mount in @ns whose unique id equals @mnt_id or, failing that,
 * the mount with the greatest unique id smaller than @mnt_id.  Returns NULL
 * when no such mount exists.
 */
static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
{
        struct mount *best = NULL;
        struct rb_node *cur;

        for (cur = ns->mounts.rb_node; cur; ) {
                struct mount *cand = node_to_mount(cur);

                if (cand->mnt_id_unique > mnt_id) {
                        /* too big: only the left subtree can qualify */
                        cur = cur->rb_left;
                        continue;
                }

                best = cand;
                if (cand->mnt_id_unique == mnt_id)
                        break;
                /* look for something closer to @mnt_id on the right */
                cur = cur->rb_right;
        }
        return best;
}

#ifdef CONFIG_PROC_FS

/*
 * seq_file iterator; it is defined here because it needs access to
 * namespace_sem.  ->start takes the semaphore for reading (m_stop() drops
 * it) and positions the walk at the first mount whose unique id is >= *pos.
 */
static void *m_start(struct seq_file *m, loff_t *pos)
{
        struct proc_mounts *pm = m->private;

        down_read(&namespace_sem);
        return mnt_find_id_at(pm->ns, *pos);
}

/*
 * Advance to the rb-tree successor of @v.  *pos becomes the unique id of
 * that successor, or is just bumped by one when the walk is over.
 */
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
        struct mount *cur = v;
        struct rb_node *succ = rb_next(&cur->mnt_node);
        struct mount *next;

        if (!succ) {
                ++*pos;
                return NULL;
        }

        next = node_to_mount(succ);
        *pos = next->mnt_id_unique;
        return next;
}

/* seq_file ->stop: drop the read lock on namespace_sem taken by m_start(). */
static void m_stop(struct seq_file *m, void *v)
{
        up_read(&namespace_sem);
}

/* seq_file ->show: forward the mount to the ->show() hook of proc_mounts. */
static int m_show(struct seq_file *m, void *v)
{
        struct proc_mounts *pm = m->private;
        struct mount *mnt = v;

        return pm->show(m, &mnt->mnt);
}

/*
 * seq_file operations that walk the mounts of proc_mounts->ns in order of
 * unique mount id, holding namespace_sem for reading between start and stop.
 */
const struct seq_operations mounts_op = {
        .start        = m_start,
        .next        = m_next,
        .stop        = m_stop,
        .show        = m_show,
};

#endif  /* CONFIG_PROC_FS */

/**
 * may_umount_tree - check if a mount tree is busy
 * @m: root of mount tree
 *
 * This is called to check if a tree of mounts has any
 * open files, pwds, chroots or sub mounts that are
 * busy.
 *
 * Every mount in the tree is allowed two references; the tree is
 * reported busy when the sum of the actual counts exceeds that.
 *
 * Return: 1 if the tree is not busy, 0 if it is.
 */
int may_umount_tree(struct vfsmount *m)
{
        struct mount *root = real_mount(m);
        struct mount *p;
        int refs_seen = 0;
        int refs_allowed = 0;

        BUG_ON(!m);

        /* write lock needed for mnt_get_count */
        lock_mount_hash();
        p = root;
        while (p) {
                refs_seen += mnt_get_count(p);
                refs_allowed += 2;
                p = next_mnt(p, root);
        }
        unlock_mount_hash();

        return refs_seen > refs_allowed ? 0 : 1;
}

EXPORT_SYMBOL(may_umount_tree);

/**
 * may_umount - check if a mount point is busy
 * @mnt: root of mount
 *
 * This is called to check if a mount point has any
 * open files, pwds, chroots or sub mounts. If the
 * mount has sub mounts this will return busy
 * regardless of whether the sub mounts are busy.
 *
 * Doesn't take quota and stuff into account. IOW, in some cases it will
 * give false negatives. The main reason why it's here is that we need
 * a non-destructive way to look for easily umountable filesystems.
 *
 * Return: 1 if the mount is not busy, 0 if it is.
 */
int may_umount(struct vfsmount *mnt)
{
        int busy;

        down_read(&namespace_sem);
        lock_mount_hash();
        busy = propagate_mount_busy(real_mount(mnt), 2);
        unlock_mount_hash();
        up_read(&namespace_sem);

        return busy ? 0 : 1;
}

EXPORT_SYMBOL(may_umount);

#ifdef CONFIG_FSNOTIFY
/*
 * Send the fsnotify event(s) matching the change from ->prev_ns to
 * ->mnt_ns (attach, detach, move within one namespace, or detach from one
 * and attach to another), then record ->mnt_ns as the new ->prev_ns.
 */
static void mnt_notify(struct mount *p)
{
        struct mnt_namespace *was = p->prev_ns;
        struct mnt_namespace *now = p->mnt_ns;

        if (!was && now) {
                fsnotify_mnt_attach(now, &p->mnt);
        } else if (was && !now) {
                fsnotify_mnt_detach(was, &p->mnt);
        } else if (was == now) {
                fsnotify_mnt_move(now, &p->mnt);
        } else {
                fsnotify_mnt_detach(was, &p->mnt);
                fsnotify_mnt_attach(now, &p->mnt);
        }

        p->prev_ns = now;
}

/*
 * Drain notify_list: send a notification for each queued mount (those
 * that were added, reparented, detached, or that remain connected after
 * an unmount) and take it off the list.
 */
static void notify_mnt_list(void)
{
        struct mount *mnt, *next;

        list_for_each_entry_safe(mnt, next, &notify_list, to_notify) {
                mnt_notify(mnt);
                list_del_init(&mnt->to_notify);
        }
}

static bool need_notify_mnt_list(void)
{
        return !list_empty(&notify_list);
}
#else
/* !CONFIG_FSNOTIFY stub: there are no mount notifications to send. */
static void notify_mnt_list(void)
{
}

/* !CONFIG_FSNOTIFY stub: nothing is ever queued for notification. */
static bool need_notify_mnt_list(void)
{
        return false;
}
#endif

/*
 * Release namespace_sem (held for write) and finish the work that was
 * queued while it was held:
 *  - send pending mount notifications (with the semaphore downgraded to
 *    read),
 *  - release the dentries collected on ex_mountpoints,
 *  - after an expedited RCU grace period, drop the references of the mounts
 *    collected on 'unmounted'.
 */
static void namespace_unlock(void)
{
        struct hlist_head head;
        struct hlist_node *p;
        struct mount *m;
        LIST_HEAD(list);

        /* take private copies of the global lists while still locked */
        hlist_move_list(&unmounted, &head);
        list_splice_init(&ex_mountpoints, &list);

        if (need_notify_mnt_list()) {
                /*
                 * No point blocking out concurrent readers while notifications
                 * are sent. This will also allow statmount()/listmount() to run
                 * concurrently.
                 */
                downgrade_write(&namespace_sem);
                notify_mnt_list();
                up_read(&namespace_sem);
        } else {
                up_write(&namespace_sem);
        }

        shrink_dentry_list(&list);

        if (likely(hlist_empty(&head)))
                return;

        /* wait for a grace period before dropping the mount references */
        synchronize_rcu_expedited();

        hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
                hlist_del(&m->mnt_umount);
                mntput(&m->mnt);
        }
}

/* Take namespace_sem for writing; namespace_unlock() is the counterpart. */
static inline void namespace_lock(void)
{
        down_write(&namespace_sem);
}

DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())

/* Flags controlling how umount_tree() takes a mount tree apart. */
enum umount_tree_flags {
        /* mounts get MNT_SYNC_UMOUNT and are always disconnected */
        UMOUNT_SYNC = 1,
        /* also run propagate_mount_unlock() and propagate_umount() */
        UMOUNT_PROPAGATE = 2,
        /* keep a mount connected to its parent if that parent is MNT_UMOUNT */
        UMOUNT_CONNECTED = 4,
};

/*
 * Decide whether @mnt has to be disconnected from its parent while it is
 * being unmounted with the flags in @how.  Returns true to disconnect.
 */
static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
{
        /*
         * Leaving mounts connected is only valid for lazy umounts, and a
         * mount without a parent has nothing to stay connected to.
         */
        if ((how & UMOUNT_SYNC) || !mnt_has_parent(mnt))
                return true;

        /*
         * The reference counting rules change when mounts are unmounted
         * and connected, so an umounted mount may not stay connected to
         * a parent that is still mounted.
         */
        if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
                return true;

        /*
         * Stay connected if that was requested or if the mount is locked;
         * disconnect by default.
         */
        return !((how & UMOUNT_CONNECTED) || IS_MNT_LOCKED(mnt));
}

/*
 * Unmount the tree rooted at @mnt.  @how is a mask of UMOUNT_* flags.
 *
 * Every mount of the tree (plus, with UMOUNT_PROPAGATE, whatever
 * propagate_umount() adds) is marked MNT_UMOUNT, removed from its
 * namespace, made private and queued for notification.  Mounts that get
 * disconnected are put on 'unmounted'; the others are re-added to their
 * parent's ->mnt_mounts.
 *
 * mount_lock must be held
 * namespace_sem must be held for write
 */
static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
        LIST_HEAD(tmp_list);
        struct mount *p;

        if (how & UMOUNT_PROPAGATE)
                propagate_mount_unlock(mnt);

        /* Gather the mounts to umount */
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                p->mnt.mnt_flags |= MNT_UMOUNT;
                if (mnt_ns_attached(p))
                        move_from_ns(p, &tmp_list);
                else
                        list_move(&p->mnt_list, &tmp_list);
        }

        /* Hide the mounts from mnt_mounts */
        list_for_each_entry(p, &tmp_list, mnt_list) {
                list_del_init(&p->mnt_child);
        }

        /* Add propagated mounts to the tmp_list */
        if (how & UMOUNT_PROPAGATE)
                propagate_umount(&tmp_list);

        /* Process the gathered mounts one at a time until none are left */
        while (!list_empty(&tmp_list)) {
                struct mnt_namespace *ns;
                bool disconnect;
                p = list_first_entry(&tmp_list, struct mount, mnt_list);
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                ns = p->mnt_ns;
                if (ns) {
                        ns->nr_mounts--;
                        __touch_mnt_namespace(ns);
                }
                p->mnt_ns = NULL;
                if (how & UMOUNT_SYNC)
                        p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;

                disconnect = disconnect_mount(p, how);
                if (mnt_has_parent(p)) {
                        /* one reference on the parent goes away either way */
                        mnt_add_count(p->mnt_parent, -1);
                        if (!disconnect) {
                                /* Don't forget about p */
                                list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
                        } else {
                                umount_mnt(p);
                        }
                }
                change_mnt_propagation(p, MS_PRIVATE);
                /* namespace_unlock() does the mntput() for these */
                if (disconnect)
                        hlist_add_head(&p->mnt_umount, &unmounted);

                /*
                 * At this point p->mnt_ns is NULL, notification will be queued
                 * only if
                 *
                 *  - p->prev_ns is non-NULL *and*
                 *  - p->prev_ns->n_fsnotify_marks is non-NULL
                 *
                 * This will preclude queuing the mount if this is a cleanup
                 * after a failed copy_tree() or destruction of an anonymous
                 * namespace, etc.
                 */
                mnt_notify_add(p);
        }
}

static void shrink_submounts(struct mount *mnt);

/*
 * "Unmounting" the root: reconfigure @sb read-only unless it already is.
 * Runs with sb->s_umount held for write.  Returns 0 or a negative errno.
 */
static int do_umount_root(struct super_block *sb)
{
        struct fs_context *fc;
        int err = 0;

        down_write(&sb->s_umount);
        if (sb_rdonly(sb))
                goto unlock;

        fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY);
        if (IS_ERR(fc)) {
                err = PTR_ERR(fc);
                goto unlock;
        }

        err = parse_monolithic_mount_data(fc, NULL);
        if (!err)
                err = reconfigure_super(fc);
        put_fs_context(fc);

unlock:
        up_write(&sb->s_umount);
        return err;
}

/*
 * Do the actual work of unmounting @mnt according to @flags (MNT_EXPIRE,
 * MNT_FORCE, MNT_DETACH).
 *
 * Returns 0 on success, or a negative errno: whatever security_sb_umount()
 * or do_umount_root() report, -EINVAL for a bad flag combination or a
 * locked mount, -EBUSY if the mount is in use, -EAGAIN when MNT_EXPIRE only
 * managed to set the expiry mark, -EPERM when the caller may not remount
 * the root read-only.
 */
static int do_umount(struct mount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt.mnt_sb;
        int retval;

        retval = security_sb_umount(&mnt->mnt, flags);
        if (retval)
                return retval;

        /*
         * Allow userspace to request a mountpoint be expired rather than
         * unmounting unconditionally. Unmount only happens if:
         *  (1) the mark is already set (the mark is cleared by mntput())
         *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
         */
        if (flags & MNT_EXPIRE) {
                if (&mnt->mnt == current->fs->root.mnt ||
                    flags & (MNT_FORCE | MNT_DETACH))
                        return -EINVAL;

                /*
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
                lock_mount_hash();
                if (mnt_get_count(mnt) != 2) {
                        unlock_mount_hash();
                        return -EBUSY;
                }
                unlock_mount_hash();

                /* first request only sets the mark; a repeat goes on to unmount */
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
        }

        /*
         * If we may have to abort operations to get out of this
         * mount, and they will themselves hold resources we must
         * allow the fs to do things. In the Unix tradition of
         * 'Gee thats tricky lets do it in userspace' the umount_begin
         * might fail to complete on the first run through as other tasks
         * must return, and the like. Thats for the mount program to worry
         * about for the moment.
         */

        if (flags & MNT_FORCE && sb->s_op->umount_begin) {
                sb->s_op->umount_begin(sb);
        }

        /*
         * No sense to grab the lock for this test, but test itself looks
         * somewhat bogus. Suggestions for better replacement?
         * Ho-hum... In principle, we might treat that as umount + switch
         * to rootfs. GC would eventually take care of the old vfsmount.
         * Actually it makes sense, especially if rootfs would contain a
         * /reboot - static binary that would close all descriptors and
         * call reboot(9). Then init(8) could umount root and exec /reboot.
         */
        if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
                /*
                 * Special case for "unmounting" root ...
                 * we just try to remount it readonly.
                 */
                if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                        return -EPERM;
                return do_umount_root(sb);
        }

        namespace_lock();
        lock_mount_hash();

        /* Recheck MNT_LOCKED with the locks held */
        retval = -EINVAL;
        if (mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        event++;
        if (flags & MNT_DETACH) {
                /* lazy umount: detach the tree without checking for users */
                if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
                        umount_tree(mnt, UMOUNT_PROPAGATE);
                retval = 0;
        } else {
                /* regular umount: fail with -EBUSY if the mount is in use */
                shrink_submounts(mnt);
                retval = -EBUSY;
                if (!propagate_mount_busy(mnt, 2)) {
                        if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                        retval = 0;
                }
        }
out:
        unlock_mount_hash();
        namespace_unlock();
        return retval;
}

/*
 * __detach_mounts - lazily unmount all mounts on the specified dentry
 *
 * During unlink, rmdir, and d_drop it is possible to lose the path
 * to an existing mountpoint, and wind up leaking the mount.
 * detach_mounts allows lazily unmounting those mounts instead of
 * leaking them.
 *
 * The caller may hold dentry->d_inode->i_mutex.
 */
void __detach_mounts(struct dentry *dentry)
{
        struct mountpoint *mp;

        namespace_lock();
        lock_mount_hash();

        mp = lookup_mountpoint(dentry);
        if (mp) {
                event++;
                while (!hlist_empty(&mp->m_list)) {
                        struct mount *victim;

                        victim = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
                        if (victim->mnt.mnt_flags & MNT_UMOUNT) {
                                /* already MNT_UMOUNT: just detach and queue it */
                                umount_mnt(victim);
                                hlist_add_head(&victim->mnt_umount, &unmounted);
                        } else {
                                umount_tree(victim, UMOUNT_CONNECTED);
                        }
                }
                put_mountpoint(mp);
        }

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Is the caller allowed to modify his namespace?  That takes
 * CAP_SYS_ADMIN in the user namespace of the caller's mount namespace.
 */
bool may_mount(void)
{
        struct user_namespace *owner = current->nsproxy->mnt_ns->user_ns;

        return ns_capable(owner, CAP_SYS_ADMIN);
}

/*
 * Print, at most once, a warning that the "mand" mount option is
 * deprecated and ignored.
 */
static void warn_mandlock(void)
{
        pr_warn_once("=======================================================\n"
                     "WARNING: The mand mount option has been deprecated and\n"
                     "         is ignored by this kernel. Remove the mand\n"
                     "         option from the mount to silence this warning.\n"
                     "=======================================================\n");
}

/*
 * Permission and sanity checks done before do_umount().
 * Returns 0 if the umount may go ahead, -EPERM or -EINVAL otherwise.
 */
static int can_umount(const struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->dentry->d_sb;

        if (!may_mount())
                return -EPERM;

        /*
         * The path must pass path_mounted() and check_mnt(), and the mount
         * must not be locked.  The MNT_LOCKED test is only optimistic here;
         * do_umount() repeats it with the locks held.
         */
        if (!path_mounted(path) || !check_mnt(mnt) ||
            (mnt->mnt.mnt_flags & MNT_LOCKED))
                return -EINVAL;

        /* MNT_FORCE needs CAP_SYS_ADMIN over the superblock's user namespace */
        if ((flags & MNT_FORCE) && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        return 0;
}

// Unmount what is mounted at @path; the references held by @path are
// dropped here whether or not the umount succeeds.
// caller is responsible for flags being sane
int path_umount(struct path *path, int flags)
{
        struct mount *mnt = real_mount(path->mnt);
        int err = can_umount(path, flags);

        if (err == 0)
                err = do_umount(mnt, flags);

        /* we mustn't call path_put() as that would clear mnt_expiry_mark */
        dput(path->dentry);
        mntput_no_expire(mnt);

        return err;
}

/*
 * Common body of the umount syscalls: validate @flags, look up the
 * user-supplied path @name and pass the result to path_umount().
 */
static int ksys_umount(char __user *name, int flags)
{
        const int known_flags = MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW;
        int lookup_flags;
        struct path path;
        int err;

        // basic validity checks done first
        if (flags & ~known_flags)
                return -EINVAL;

        lookup_flags = LOOKUP_MOUNTPOINT;
        if (!(flags & UMOUNT_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;

        err = user_path_at(AT_FDCWD, name, lookup_flags, &path);
        if (err)
                return err;

        return path_umount(&path, flags);
}

/* umount syscall taking a path and flags: thin wrapper around ksys_umount(). */
SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
        return ksys_umount(name, flags);
}

#ifdef __ARCH_WANT_SYS_OLDUMOUNT

/*
 *        The 2.0 compatible umount. No flags: this is ksys_umount() called
 *        with flags == 0.
 */
SYSCALL_DEFINE1(oldumount, char __user *, name)
{
        return ksys_umount(name, 0);
}

#endif

/*
 * Is @dentry a proxy for a mount namespace, i.e. an ns dentry whose
 * namespace uses mntns_operations?
 */
static bool is_mnt_ns_file(struct dentry *dentry)
{
        if (dentry->d_op == &ns_dentry_operations) {
                struct ns_common *ns = d_inode(dentry)->i_private;

                return ns->ops == &mntns_operations;
        }

        /* not a namespace dentry at all */
        return false;
}

/* Return the ns_common embedded in the mount namespace @mnt. */
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
{
        return &mnt->ns;
}

/*
 * Starting from @mntns, walk mnt_ns_list (backwards if @previous is set,
 * forwards otherwise) and return the first mount namespace for which the
 * caller has CAP_SYS_ADMIN in its user namespace and on which a reference
 * could be taken.  Returns ERR_PTR(-ENOENT) once the list head is reached.
 */
struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
        guard(rcu)();

        for (;;) {
                struct list_head *pos;

                if (previous)
                        pos = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
                else
                        pos = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));

                if (list_is_head(pos, &mnt_ns_list))
                        return ERR_PTR(-ENOENT);

                mntns = list_entry_rcu(pos, struct mnt_namespace, mnt_ns_list);

                /*
                 * The last passive reference count is put with RCU delay,
                 * so accessing the mount namespace is not just safe but
                 * all relevant members are still valid for the capability
                 * check.
                 *
                 * We are persisting the mount namespace, so an active
                 * reference is needed as well; the namespace might already
                 * be on its deathbed, in which case it is skipped.
                 */
                if (ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN) &&
                    refcount_inc_not_zero(&mntns->ns.count))
                        return mntns;
        }
}

/*
 * Return the mount namespace that @dentry stands for, or NULL if @dentry
 * is not a mount namespace file.
 */
struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
{
        if (is_mnt_ns_file(dentry))
                return to_mnt_ns(get_proc_ns(dentry->d_inode));

        return NULL;
}

/*
 * Could bind mounting the mount namespace inode behind @dentry cause a
 * mount namespace loop?  That is the case when @dentry refers to a mount
 * namespace whose ->seq is not greater than that of the caller's own
 * mount namespace.
 */
static bool mnt_ns_loop(struct dentry *dentry)
{
        struct mnt_namespace *target = mnt_ns_from_dentry(dentry);

        /* not a mount namespace file: nothing to loop through */
        if (!target)
                return false;

        return target->seq <= current->nsproxy->mnt_ns->seq;
}

/*
 * Clone the mount tree below @src_root, with @dentry as the root of the
 * copy.  @flag is a mask of CL_* flags and is passed on to clone_mnt().
 *
 * Only children of @src_root mounted at or below @dentry are copied.
 * Unless CL_COPY_UNBINDABLE is set, an unbindable @src_root is rejected
 * with -EINVAL, unbindable subtrees are skipped, and an unbindable mount
 * that is also MNT_LOCKED fails the copy with -EPERM.  Unless
 * CL_COPY_MNT_NS_FILE is set, mount namespace files are rejected (for
 * @dentry, -EINVAL) or skipped (for subtrees).
 *
 * Returns the root of the new tree or an ERR_PTR(); on failure the part
 * copied so far is torn down with umount_tree().
 */
struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
                                        int flag)
{
        struct mount *res, *src_parent, *src_root_child, *src_mnt,
                *dst_parent, *dst_mnt;

        if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
                return ERR_PTR(-EINVAL);

        if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
                return ERR_PTR(-EINVAL);

        res = dst_mnt = clone_mnt(src_root, dentry, flag);
        if (IS_ERR(dst_mnt))
                return dst_mnt;

        src_parent = src_root;
        dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;

        list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
                /* ignore children mounted outside of @dentry */
                if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
                        continue;

                for (src_mnt = src_root_child; src_mnt;
                    src_mnt = next_mnt(src_mnt, src_root_child)) {
                        if (!(flag & CL_COPY_UNBINDABLE) &&
                            IS_MNT_UNBINDABLE(src_mnt)) {
                                if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
                                        /* Both unbindable and locked. */
                                        dst_mnt = ERR_PTR(-EPERM);
                                        goto out;
                                } else {
                                        src_mnt = skip_mnt_tree(src_mnt);
                                        continue;
                                }
                        }
                        if (!(flag & CL_COPY_MNT_NS_FILE) &&
                            is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
                                src_mnt = skip_mnt_tree(src_mnt);
                                continue;
                        }
                        /*
                         * Climb the source and destination cursors in
                         * lockstep until the source one is src_mnt's parent.
                         */
                        while (src_parent != src_mnt->mnt_parent) {
                                src_parent = src_parent->mnt_parent;
                                dst_mnt = dst_mnt->mnt_parent;
                        }

                        src_parent = src_mnt;
                        dst_parent = dst_mnt;
                        dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
                        if (IS_ERR(dst_mnt))
                                goto out;
                        /* hook the new clone into the copy */
                        lock_mount_hash();
                        list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
                        attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
                        unlock_mount_hash();
                }
        }
        return res;

out:
        /* undo the partial copy; dst_mnt holds the error to return */
        if (res) {
                lock_mount_hash();
                umount_tree(res, UMOUNT_SYNC);
                unlock_mount_hash();
        }
        return dst_mnt;
}

/*
 * Make a private copy (CL_COPY_ALL | CL_PRIVATE) of the mount tree at
 * @path.  Fails with -EINVAL if the mount does not pass check_mnt().
 * Caller should check returned pointer for errors.
 */
struct vfsmount *collect_mounts(const struct path *path)
{
        struct mount *src = real_mount(path->mnt);
        struct mount *tree;

        namespace_lock();
        if (check_mnt(src))
                tree = copy_tree(src, path->dentry, CL_COPY_ALL | CL_PRIVATE);
        else
                tree = ERR_PTR(-EINVAL);
        namespace_unlock();

        if (IS_ERR(tree))
                return ERR_CAST(tree);

        return &tree->mnt;
}

static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);

/*
 * Does a mount whose namespace is @mnt_ns have to be dissolved?  Only
 * mounts that sit in an anonymous mount namespace qualify.
 */
static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
{
        /*
         * No namespace: this mount belonged to an anonymous mount
         * namespace but was moved to a non-anonymous mount namespace
         * and then unmounted.
         */
        if (unlikely(!mnt_ns))
                return false;

        if (is_anon_ns(mnt_ns))
                return true;

        /*
         * This mount belongs to a non-anonymous mount namespace and such
         * a mount can never transition to an anonymous mount namespace
         * again.  A detached mount either belongs to an anonymous or to
         * a non-anonymous mount namespace; it should never belong to
         * something purely internal.
         */
        VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
        return false;
}

/*
 * Tear down the detached mount tree rooted at @mnt if it lives in an
 * anonymous mount namespace, then free that namespace.  Judging by the
 * name and the comments below this runs when the file referring to the
 * mount is closed (TODO confirm against callers).
 *
 * Nothing is done if must_dissolve() says no, or if the mount has a parent.
 */
void dissolve_on_fput(struct vfsmount *mnt)
{
        struct mnt_namespace *ns;
        struct mount *m = real_mount(mnt);

        /* cheap check without namespace_sem first */
        scoped_guard(rcu) {
                if (!must_dissolve(READ_ONCE(m->mnt_ns)))
                        return;
        }

        scoped_guard(namespace_lock, &namespace_sem) {
                /* repeat the check now that namespace_sem is held */
                ns = m->mnt_ns;
                if (!must_dissolve(ns))
                        return;

                /*
                 * After must_dissolve() we know that this is a detached
                 * mount in an anonymous mount namespace.
                 *
                 * Now when mnt_has_parent() reports that this mount
                 * tree has a parent, we know that this anonymous mount
                 * tree has been moved to another anonymous mount
                 * namespace.
                 *
                 * So when closing this file we cannot unmount the mount
                 * tree. This will be done when the file referring to
                 * the root of the anonymous mount namespace will be
                 * closed (It could already be closed but it would sync
                 * on @namespace_sem and wait for us to finish.).
                 */
                if (mnt_has_parent(m))
                        return;

                lock_mount_hash();
                umount_tree(m, UMOUNT_CONNECTED);
                unlock_mount_hash();
        }

        /* Make sure we notice when we leak mounts. */
        VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
        free_mnt_ns(ns);
}

/*
 * Unmount the tree rooted at @mnt (umount_tree() with no flags), holding
 * namespace_sem and the mount hash lock for the duration of the call.
 */
void drop_collected_mounts(struct vfsmount *mnt)
{
        struct mount *root = real_mount(mnt);

        namespace_lock();
        lock_mount_hash();

        umount_tree(root, 0);

        unlock_mount_hash();
        namespace_unlock();
}

/*
 * Return true if @mnt has a direct child that is flagged MNT_LOCKED and
 * whose mountpoint passes the is_subdir() test against @dentry.
 */
bool has_locked_children(struct mount *mnt, struct dentry *dentry)
{
        struct mount *kid;

        list_for_each_entry(kid, &mnt->mnt_mounts, mnt_child) {
                if (is_subdir(kid->mnt_mountpoint, dentry) &&
                    (kid->mnt.mnt_flags & MNT_LOCKED))
                        return true;
        }

        return false;
}

/*
 * Walk @subtree and make sure none of its mounts refers to an earlier or the
 * same mount namespace.  Such a reference would pin a mount namespace in a
 * way the mount-cycle checking code does not see, which would allow cycles
 * to be created.
 *
 * Returns true when the subtree is free of such references.
 */
static bool check_for_nsfs_mounts(struct mount *subtree)
{
        struct mount *cur;
        bool clean = true;

        lock_mount_hash();
        for (cur = subtree; cur; cur = next_mnt(cur, subtree)) {
                if (mnt_ns_loop(cur->mnt.mnt_root)) {
                        clean = false;
                        break;
                }
        }
        unlock_mount_hash();

        return clean;
}

/**
 * clone_private_mount - create a private clone of a path
 * @path: path to clone
 *
 * Produces a new vfsmount that is a clone of @path.  The clone is not
 * attached anywhere in the namespace and is private, i.e. changes to the
 * mount it was cloned from are not propagated into it.
 *
 * The caller is expected to have called may_mount() or done the equivalent.
 *
 * Every rejected request, including a failure of the clone itself, is
 * reported as ERR_PTR(-EINVAL).
 *
 * Release with mntput().
 */
struct vfsmount *clone_private_mount(const struct path *path)
{
        struct mount *src = real_mount(path->mnt);
        struct mount *copy;

        guard(rwsem_read)(&namespace_sem);

        if (IS_MNT_UNBINDABLE(src))
                return ERR_PTR(-EINVAL);

        if (!mnt_has_parent(src)) {
                /*
                 * A parentless mount must be mounted, must not be something
                 * purely kernel internal (i.e. it has to sit in an anonymous
                 * namespace) and must not let us create mount namespace
                 * loops.
                 */
                if (!is_mounted(&src->mnt) ||
                    !is_anon_ns(src->mnt_ns) ||
                    !check_for_nsfs_mounts(src))
                        return ERR_PTR(-EINVAL);
        } else if (!check_mnt(src)) {
                return ERR_PTR(-EINVAL);
        }

        if (has_locked_children(src, path->dentry))
                return ERR_PTR(-EINVAL);

        copy = clone_mnt(src, path->dentry, CL_PRIVATE);
        if (IS_ERR(copy))
                return ERR_PTR(-EINVAL);

        /* Longterm mount to be removed by kern_unmount*() */
        copy->mnt_ns = MNT_NS_INTERNAL;
        return &copy->mnt;
}
EXPORT_SYMBOL_GPL(clone_private_mount);

/*
 * Invoke @f on @root and then on every mount linked on @root's ->mnt_list,
 * passing @arg through unchanged.  The walk stops at the first non-zero
 * return value from @f, which is handed back to the caller; 0 is returned
 * when every call returned 0.
 */
int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
                   struct vfsmount *root)
{
        struct mount *pos;
        int rc;

        rc = f(root, arg);
        if (rc)
                return rc;

        list_for_each_entry(pos, &real_mount(root)->mnt_list, mnt_list) {
                rc = f(&pos->mnt, arg);
                if (rc)
                        break;
        }

        return rc;
}

/*
 * For every mount in the tree rooted at @mnt, lock the atime setting and
 * each of the readonly/nodev/nosuid/noexec restrictions that is currently
 * set, and mark the mount MNT_LOCKED unless it sits on an expiry list.
 */
static void lock_mnt_tree(struct mount *mnt)
{
        struct mount *cur;

        for (cur = mnt; cur; cur = next_mnt(cur, mnt)) {
                const int old = cur->mnt.mnt_flags;
                /* Don't allow unprivileged users to change mount flags */
                int locks = MNT_LOCK_ATIME;

                if (old & MNT_READONLY)
                        locks |= MNT_LOCK_READONLY;
                if (old & MNT_NODEV)
                        locks |= MNT_LOCK_NODEV;
                if (old & MNT_NOSUID)
                        locks |= MNT_LOCK_NOSUID;
                if (old & MNT_NOEXEC)
                        locks |= MNT_LOCK_NOEXEC;

                /* Don't allow unprivileged users to reveal what is under a mount */
                if (list_empty(&cur->mnt_expire))
                        locks |= MNT_LOCKED;

                cur->mnt.mnt_flags = old | locks;
        }
}

/*
 * Walk the tree rooted at @mnt, stopping when @end is reached (pass NULL to
 * cover the whole tree), and release the group id of every mount that has
 * one but is not shared.
 */
static void cleanup_group_ids(struct mount *mnt, struct mount *end)
{
        struct mount *cur = mnt;

        while (cur != end) {
                if (cur->mnt_group_id && !IS_MNT_SHARED(cur))
                        mnt_release_group_id(cur);
                cur = next_mnt(cur, mnt);
        }
}

/*
 * Allocate a group id for @mnt - and, when @recurse is set, for every mount
 * in the tree below it - wherever a mount has no group id yet and is not
 * shared.
 *
 * On an allocation failure the mounts visited before the failing one are
 * passed to cleanup_group_ids() and the error is returned; 0 on success.
 */
static int invent_group_ids(struct mount *mnt, bool recurse)
{
        struct mount *cur = mnt;

        while (cur) {
                if (!cur->mnt_group_id && !IS_MNT_SHARED(cur)) {
                        int err = mnt_alloc_group_id(cur);

                        if (err) {
                                cleanup_group_ids(mnt, cur);
                                return err;
                        }
                }

                if (!recurse)
                        break;
                cur = next_mnt(cur, mnt);
        }

        return 0;
}

/*
 * Check whether the tree rooted at @mnt still fits into @ns and, if it does,
 * reserve room for it by adding its size to @ns->pending_mounts.
 *
 * The budget is sysctl_mount_max minus the mounts already in @ns minus the
 * mounts already pending.  Returns 0 on success and -ENOSPC when the budget
 * is exhausted or too small for the tree.
 */
int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
{
        unsigned int limit = READ_ONCE(sysctl_mount_max);
        unsigned int room;
        unsigned int wanted = 0;
        struct mount *cur;

        if (ns->nr_mounts >= limit)
                return -ENOSPC;
        room = limit - ns->nr_mounts;

        if (ns->pending_mounts >= room)
                return -ENOSPC;
        room -= ns->pending_mounts;

        for (cur = mnt; cur; cur = next_mnt(cur, mnt))
                wanted++;

        if (wanted > room)
                return -ENOSPC;

        ns->pending_mounts += wanted;
        return 0;
}

/* Flags that modify how attach_recursive_mnt() attaches a source mount tree. */
enum mnt_tree_flags_t {
        /* The source tree is being moved rather than newly attached. */
        MNT_TREE_MOVE = BIT(0),
        /* Attach the source beneath the top mount instead of on top of it. */
        MNT_TREE_BENEATH = BIT(1),
        /* NOTE(review): no user visible in this part of the file - confirm meaning. */
        MNT_TREE_PROPAGATION = BIT(2),
};

/**
 * attach_recursive_mnt - attach a source mount tree
 * @source_mnt: mount tree to be attached
 * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath
 * @dest_mp:    the mountpoint @source_mnt will be mounted at
 * @flags:      modify how @source_mnt is supposed to be attached
 *
 *  NOTE: the table below explains the semantics when a source mount
 *  of a given type is attached to a destination mount of a given type.
 * ---------------------------------------------------------------------------
 * |         BIND MOUNT OPERATION                                            |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
 * ***************************************************************************
 * A bind operation clones the source mount and mounts the clone on the
 * destination mount.
 *
 * (++)  the cloned mount is propagated to all the mounts in the propagation
 *          tree of the destination mount and the cloned mount is added to
 *          the peer group of the source mount.
 * (+)   the cloned mount is created under the destination mount and is marked
 *       as shared. The cloned mount is added to the peer group of the source
 *       mount.
 * (+++) the mount is propagated to all the mounts in the propagation tree
 *       of the destination mount and the cloned mount is made slave
 *       of the same master as that of the source mount. The cloned mount
 *       is marked as 'shared and slave'.
 * (*)   the cloned mount is made a slave of the same master as that of the
 *          source mount.
 *
 * ---------------------------------------------------------------------------
 * |                         MOVE MOUNT OPERATION                                 |
 * |**************************************************************************
 * | source-->| shared        |       private  |       slave    | unbindable |
 * | dest     |               |                |                |            |
 * |   |      |               |                |                |            |
 * |   v      |               |                |                |            |
 * |**************************************************************************
 * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
 * |          |               |                |                |            |
 * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
 * ***************************************************************************
 *
 * (+)  the mount is moved to the destination. And is then propagated to
 *         all the mounts in the propagation tree of the destination mount.
 * (+*)  the mount is moved to the destination.
 * (+++)  the mount is moved to the destination and is then propagated to
 *         all the mounts belonging to the destination mount's propagation tree.
 *         the mount is marked as 'shared and slave'.
 * (*)        the mount continues to be a slave at the new location.
 *
 * If the source mount is a tree, the operations explained above are
 * applied to each mount in the tree.
 * Must be called without spinlocks held, since this function can sleep
 * in allocations.
 *
 * Context: The function expects namespace_lock() to be held.
 * Return: If @source_mnt was successfully attached 0 is returned.
 *         Otherwise a negative error code is returned.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
                                struct mount *top_mnt,
                                struct mountpoint *dest_mp,
                                enum mnt_tree_flags_t flags)
{
        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
        HLIST_HEAD(tree_list);
        struct mnt_namespace *ns = top_mnt->mnt_ns;
        struct mountpoint *smp;
        struct mount *child, *dest_mnt, *p;
        struct hlist_node *n;
        int err = 0;
        bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;

        /*
         * Preallocate a mountpoint in case the new mounts need to be
         * mounted beneath mounts on the same mountpoint.
         */
        smp = get_mountpoint(source_mnt->mnt.mnt_root);
        if (IS_ERR(smp))
                return PTR_ERR(smp);

        /* Is there space to add these mounts to the mount namespace? */
        if (!moving) {
                err = count_mounts(ns, source_mnt);
                if (err)
                        goto out;
        }

        /*
         * The mount whose sharedness drives propagation: the parent of
         * @top_mnt when mounting beneath, @top_mnt itself otherwise.
         */
        if (beneath)
                dest_mnt = top_mnt->mnt_parent;
        else
                dest_mnt = top_mnt;

        if (IS_MNT_SHARED(dest_mnt)) {
                err = invent_group_ids(source_mnt, true);
                if (err)
                        goto out;
                err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
        }
        /*
         * The lock is taken before looking at err because out_cleanup_ids
         * runs umount_tree() and then drops the mount hash lock itself.
         */
        lock_mount_hash();
        if (err)
                goto out_cleanup_ids;

        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                        set_mnt_shared(p);
        }

        if (moving) {
                /* When moving beneath, attach at the preallocated mountpoint. */
                if (beneath)
                        dest_mp = smp;
                unhash_mnt(source_mnt);
                attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
                mnt_notify_add(source_mnt);
                touch_mnt_namespace(source_mnt->mnt_ns);
        } else {
                if (source_mnt->mnt_ns) {
                        LIST_HEAD(head);

                        /* move from anon - the caller will destroy */
                        for (p = source_mnt; p; p = next_mnt(p, source_mnt))
                                move_from_ns(p, &head);
                        list_del_init(&head);
                }
                if (beneath)
                        mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
                else
                        mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
                commit_tree(source_mnt);
        }

        /*
         * Commit the mounts that propagate_mnt() queued on @tree_list
         * (linked through ->mnt_hash).
         */
        hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                struct mount *q;
                hlist_del_init(&child->mnt_hash);
                q = __lookup_mnt(&child->mnt_parent->mnt,
                                 child->mnt_mountpoint);
                /* Something is already mounted there: re-point it to @smp. */
                if (q)
                        mnt_change_mountpoint(child, smp, q);
                /* Notice when we are propagating across user namespaces */
                if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                        lock_mnt_tree(child);
                /* The root of the propagated tree itself is never left locked. */
                child->mnt.mnt_flags &= ~MNT_LOCKED;
                commit_tree(child);
        }
        put_mountpoint(smp);
        unlock_mount_hash();

        return 0;

 out_cleanup_ids:
        /* Entered with the mount hash lock held: undo what propagation queued. */
        while (!hlist_empty(&tree_list)) {
                child = hlist_entry(tree_list.first, struct mount, mnt_hash);
                child->mnt_parent->mnt_ns->pending_mounts = 0;
                umount_tree(child, UMOUNT_SYNC);
        }
        unlock_mount_hash();
        cleanup_group_ids(source_mnt, NULL);
 out:
        /* Forget any reservation count_mounts() made in @ns. */
        ns->pending_mounts = 0;

        /*
         * The mount hash lock is not held here, so mount_lock is taken just
         * for dropping the preallocated mountpoint.
         */
        read_seqlock_excl(&mount_lock);
        put_mountpoint(smp);
        read_sequnlock_excl(&mount_lock);

        return err;
}

/**
 * do_lock_mount - lock mount and mountpoint
 * @path:    target path
 * @beneath: whether the intention is to mount beneath @path
 *
 * Follow the mount stack on @path until the top mount @mnt is found. If
 * the initial @path->{mnt,dentry} is a mountpoint, look up the first
 * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
 * until nothing is stacked on top of it anymore.
 *
 * Acquire the inode_lock() on the top mount's ->mnt_root to protect
 * against concurrent removal of the new mountpoint from another mount
 * namespace.
 *
 * If @beneath is requested, the inode_lock() on @mnt's mountpoint
 * @mp on @mnt->mnt_parent must be acquired. This protects against a
 * concurrent unlink of @mp->mnt_dentry from another mount namespace
 * where @mnt doesn't have a child mount mounted @mp. A concurrent
 * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
 * on top of it for @beneath.
 *
 * In addition, @beneath needs to make sure that @mnt hasn't been
 * unmounted or moved from its current mountpoint in between dropping
 * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
 * being unmounted would be detected later by e.g., calling
 * check_mnt(mnt) in the function it's called from. For the @beneath
 * case however, it's useful to detect it directly in do_lock_mount().
 * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
 * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
 * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
 *
 * Return: Either the target mountpoint on the top mount or the top
 *         mount's mountpoint.
 */
static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry;
        struct mountpoint *mp = ERR_PTR(-ENOENT);
        struct path under = {};

        for (;;) {
                struct mount *m = real_mount(mnt);

                if (beneath) {
                        /*
                         * Release whatever a previous iteration pinned, then
                         * pin the current parent mount and mountpoint while
                         * holding mount_lock.
                         */
                        path_put(&under);
                        read_seqlock_excl(&mount_lock);
                        under.mnt = mntget(&m->mnt_parent->mnt);
                        under.dentry = dget(m->mnt_mountpoint);
                        read_sequnlock_excl(&mount_lock);
                        dentry = under.dentry;
                } else {
                        dentry = path->dentry;
                }

                /* Lock order: the inode lock first, then namespace_sem. */
                inode_lock(dentry->d_inode);
                namespace_lock();

                if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
                        break;                // not to be mounted on

                /*
                 * Parent or mountpoint no longer match what was pinned
                 * before the locks were taken: drop the locks and retry.
                 */
                if (beneath && unlikely(m->mnt_mountpoint != dentry ||
                                        &m->mnt_parent->mnt != under.mnt)) {
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
                        continue;        // got moved
                }

                mnt = lookup_mnt(path);
                if (unlikely(mnt)) {
                        /*
                         * Something is mounted on @path: step onto the root
                         * of that mount and start over.
                         */
                        namespace_unlock();
                        inode_unlock(dentry->d_inode);
                        path_put(path);
                        path->mnt = mnt;
                        path->dentry = dget(mnt->mnt_root);
                        continue;        // got overmounted
                }
                mp = get_mountpoint(dentry);
                if (IS_ERR(mp))
                        break;
                if (beneath) {
                        /*
                         * @under duplicates the references that will stay
                         * at least until namespace_unlock(), so the path_put()
                         * below is safe (and OK to do under namespace_lock -
                         * we are not dropping the final references here).
                         */
                        path_put(&under);
                }
                /* Success: the inode lock and namespace_sem stay held. */
                return mp;
        }
        /* Failure: both locks were taken in the loop body; release them. */
        namespace_unlock();
        inode_unlock(dentry->d_inode);
        if (beneath)
                path_put(&under);
        return mp;
}

static inline struct mountpoint *lock_mount(struct path *path)
{
        return do_lock_mount(path, false);
}

/*
 * Release what a successful do_lock_mount() left held for @where: the inode
 * lock of the mountpoint's dentry, the mountpoint reference (dropped under
 * mount_lock) and finally namespace_sem.
 */
static void unlock_mount(struct mountpoint *where)
{
        struct inode *inode = where->m_dentry->d_inode;

        inode_unlock(inode);

        read_seqlock_excl(&mount_lock);
        put_mountpoint(where);
        read_sequnlock_excl(&mount_lock);

        namespace_unlock();
}

static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
{
        if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
                return -EINVAL;

        if (d_is_dir(mp->m_dentry) !=
              d_is_dir(mnt->mnt.mnt_root))
                return -ENOTDIR;

        return attach_recursive_mnt(mnt, p, mp, 0);
}

/*
 * Sanity check the flags to change_mnt_propagation.
 *
 * MS_REC and MS_SILENT are ignored.  What remains must be exactly one of
 * MS_SHARED, MS_PRIVATE, MS_SLAVE and MS_UNBINDABLE; that flag is returned.
 * Anything else yields 0.
 */
static int flags_to_propagation_type(int ms_flags)
{
        const int propagation = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE;
        int type = ms_flags & ~(MS_REC | MS_SILENT);

        /* No foreign flags, and only a single propagation flag. */
        if ((type & ~propagation) || !is_power_of_2(type))
                return 0;

        return type;
}

/*
 * Recursively change the propagation type of a mount.
 *
 * @path must refer to the root of a mount, and that mount must belong to
 * the caller's mount namespace; @ms_flags must carry exactly one propagation
 * flag (optionally with MS_REC and MS_SILENT).  With MS_REC the change is
 * applied to the whole tree below the mount.
 *
 * Returns 0 on success, -EINVAL for a bad path, bad flags or a mount that is
 * not in the caller's namespace, or the error from group id allocation.
 */
static int do_change_type(struct path *path, int ms_flags)
{
        struct mount *m;
        struct mount *mnt = real_mount(path->mnt);
        int recurse = ms_flags & MS_REC;
        int type;
        int err = 0;

        if (!path_mounted(path))
                return -EINVAL;

        type = flags_to_propagation_type(ms_flags);
        if (!type)
                return -EINVAL;

        namespace_lock();

        /*
         * Refuse to touch mounts that are unmounted or live in somebody
         * else's mount namespace, as the other mount(2) operations do.
         * Checked under namespace_sem so the answer cannot change under us.
         */
        if (!check_mnt(mnt)) {
                err = -EINVAL;
                goto out_unlock;
        }

        /* Becoming shared needs peer group ids; allocate them up front. */
        if (type == MS_SHARED) {
                err = invent_group_ids(mnt, recurse);
                if (err)
                        goto out_unlock;
        }

        lock_mount_hash();
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
        unlock_mount_hash();

 out_unlock:
        namespace_unlock();
        return err;
}

/* may_copy_tree() - check if a mount tree can be copied
 * @path: path to the mount tree to be copied
 *
 * This helper checks if the caller may copy the mount tree starting
 * from @path->mnt. The caller may copy the mount tree under the
 * following circumstances:
 *
 * (1) The caller is located in the mount namespace of the mount tree.
 *     This also implies that the mount does not belong to an anonymous
 *     mount namespace.
 * (2) The caller tries to copy an nsfs mount referring to a mount
 *     namespace, i.e., the caller is trying to copy a mount namespace
 *     entry from nsfs.
 * (3) The caller tries to copy a pidfs mount referring to a pidfd.
 * (4) The caller is trying to copy a mount tree that belongs to an
 *     anonymous mount namespace.
 *
 *     For that to be safe, this helper enforces that the origin mount
 *     namespace the anonymous mount namespace was created from is the
 *     same as the caller's mount namespace by comparing the sequence
 *     numbers.
 *
 *     This is not strictly necessary. The current semantics of the new
 *     mount api enforce that the caller must be located in the same
 *     mount namespace as the mount tree it interacts with. Using the
 *     origin sequence number preserves these semantics even for
 *     anonymous mount namespaces. However, one could envision extending
 *     the api to directly operate across mount namespace if needed.
 *
 *     The ownership of a non-anonymous mount namespace such as the
 *     caller's cannot change.
 *     => We know that the caller's mount namespace is stable.
 *
 *     If the origin sequence number of the anonymous mount namespace is
 *     the same as the sequence number of the caller's mount namespace.
 *     => The owning namespaces are the same.
 *
 *     ==> The earlier capability check on the owning namespace of the
 *         caller's mount namespace ensures that the caller has the
 *         ability to copy the mount tree.
 *
 * Returns true if the mount tree can be copied, false otherwise.
 */
static inline bool may_copy_tree(struct path *path)
{
        struct mount *m = real_mount(path->mnt);
        const struct dentry_operations *ops;

        /* The mount sits in the caller's own mount namespace. */
        if (check_mnt(m))
                return true;

        /* nsfs and pidfs entries may always be copied. */
        ops = path->dentry->d_op;
        if (ops == &ns_dentry_operations || ops == &pidfs_dentry_operations)
                return true;

        /* What is left must be mounted and pass the anonymous-mount check. */
        return is_mounted(path->mnt) && check_anonymous_mnt(m);
}


/*
 * Create the copy used for a bind mount of @old_path: the whole tree when
 * @recurse is set, a single mount otherwise.
 *
 * ERR_PTR(-EINVAL) is returned when the source mount is unbindable, when
 * may_copy_tree() refuses, or - for a non-recursive copy only - when the
 * source has locked children under @old_path->dentry.  Errors from the copy
 * itself are passed through.  On success MNT_LOCKED is cleared on the root
 * of the copy.
 */
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
        struct mount *src = real_mount(old_path->mnt);
        struct mount *copy;

        if (IS_MNT_UNBINDABLE(src))
                return ERR_PTR(-EINVAL);

        if (!may_copy_tree(old_path))
                return ERR_PTR(-EINVAL);

        if (recurse) {
                copy = copy_tree(src, old_path->dentry, CL_COPY_MNT_NS_FILE);
        } else {
                if (has_locked_children(src, old_path->dentry))
                        return ERR_PTR(-EINVAL);
                copy = clone_mnt(src, old_path->dentry, 0);
        }

        if (IS_ERR(copy))
                return copy;

        copy->mnt.mnt_flags &= ~MNT_LOCKED;
        return copy;
}

/*
 * do loopback mount.
 */
static int do_loopback(struct path *path, const char *old_name,
                                int recurse)
{
        struct path old_path;
        struct mount *mnt = NULL, *parent;
        struct mountpoint *mp;
        int err;
        if (!old_name || !*old_name)
                return -EINVAL;
        err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
        if (err)
                return err;

        err = -EINVAL;
        if (mnt_ns_loop(old_path.dentry))
                goto out;

        mp = lock_mount(path);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto out;
        }

        parent = real_mount(path->mnt);
        if (!check_mnt(parent))
                goto out2;

        mnt = __do_loopback(&old_path, recurse);
        if (IS_ERR(mnt)) {
                err = PTR_ERR(mnt);
                goto out2;
        }

        err = graft_tree(mnt, parent, mp);
        if (err) {
                lock_mount_hash();
                umount_tree(mnt, UMOUNT_SYNC);
                unlock_mount_hash();
        }
out2:
        unlock_mount(mp);
out:
        path_put(&old_path);
        return err;
}

/*
 * Copy the mount at @path (the whole tree when @recursive) into a newly
 * allocated mount namespace and open an O_PATH file on the root of the copy.
 *
 * On return @path->mnt refers to the copy rather than to the original mount,
 * except when allocating the namespace or making the copy failed.
 *
 * Returns the opened file or an ERR_PTR().
 */
static struct file *open_detached_copy(struct path *path, bool recursive)
{
        struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
        struct user_namespace *user_ns = mnt_ns->user_ns;
        struct mount *mnt, *p;
        struct file *file;

        /* NOTE(review): the "true" presumably requests an anonymous namespace - confirm. */
        ns = alloc_mnt_ns(user_ns, true);
        if (IS_ERR(ns))
                return ERR_CAST(ns);

        namespace_lock();

        /*
         * Record the sequence number of the source mount namespace.
         * This needs to hold namespace_sem to ensure that the mount
         * doesn't get attached.
         */
        if (is_mounted(path->mnt)) {
                src_mnt_ns = real_mount(path->mnt)->mnt_ns;
                /* An anonymous source hands on the origin it recorded itself. */
                if (is_anon_ns(src_mnt_ns))
                        ns->seq_origin = src_mnt_ns->seq_origin;
                else
                        ns->seq_origin = src_mnt_ns->seq;
        }

        mnt = __do_loopback(path, recursive);
        if (IS_ERR(mnt)) {
                namespace_unlock();
                free_mnt_ns(ns);
                return ERR_CAST(mnt);
        }

        /* Add every mount of the copy to @ns and account for it. */
        lock_mount_hash();
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                mnt_add_to_ns(ns, p);
                ns->nr_mounts++;
        }
        ns->root = mnt;
        /* Extra reference on the root of the copy, taken alongside ns->root. */
        mntget(&mnt->mnt);
        unlock_mount_hash();
        namespace_unlock();

        /*
         * Make @path refer to the copy: drop the reference held on the
         * original mount and substitute the root of the new tree.
         */
        mntput(path->mnt);
        path->mnt = &mnt->mnt;
        file = dentry_open(path, O_PATH, current_cred());
        if (IS_ERR(file))
                /* No file was opened for the tree, so dissolve it right here. */
                dissolve_on_fput(path->mnt);
        else
                /* NOTE(review): presumably makes the final fput dissolve the tree - confirm. */
                file->f_mode |= FMODE_NEED_UNMOUNT;
        return file;
}

/*
 * Back end of open_tree(2): validate @flags, resolve @filename relative to
 * @dfd and open an O_PATH file on the result - either on the path itself or,
 * with OPEN_TREE_CLONE, on a detached copy of the mount (tree).
 *
 * Returns the file or an ERR_PTR(): -EINVAL for unknown flags or
 * AT_RECURSIVE without OPEN_TREE_CLONE, -EPERM when a detached copy is
 * requested but may_mount() refuses, or the error of the lookup/open.
 */
static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
        struct path path __free(path_put) = {};
        const bool detached = flags & OPEN_TREE_CLONE;
        int lookup_flags = 0;
        int ret;

        BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

        if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
                      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
                      OPEN_TREE_CLOEXEC))
                return ERR_PTR(-EINVAL);

        /* AT_RECURSIVE is only accepted together with OPEN_TREE_CLONE. */
        if ((flags & AT_RECURSIVE) && !detached)
                return ERR_PTR(-EINVAL);

        /* Automounts and symlinks are followed unless asked otherwise. */
        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        if (detached && !may_mount())
                return ERR_PTR(-EPERM);

        ret = user_path_at(dfd, filename, lookup_flags, &path);
        if (unlikely(ret))
                return ERR_PTR(ret);

        if (!detached)
                return dentry_open(&path, O_PATH, current_cred());

        return open_detached_copy(&path, flags & AT_RECURSIVE);
}

/*
 * open_tree(2): open the file via vfs_open_tree() and install it in a new
 * file descriptor, which is returned.  If no descriptor can be allocated the
 * file is released again by the __free(fput) cleanup and the error is
 * returned.
 */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
        struct file *file __free(fput) = vfs_open_tree(dfd, filename, flags);
        int fd;

        if (IS_ERR(file))
                return PTR_ERR(file);

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd >= 0)
                fd_install(fd, no_free_ptr(file));

        return fd;
}

/*
 * Don't allow locked mount flags to be cleared.
 *
 * Returns true when @mnt_flags keeps every restriction that is locked on
 * @mnt: a locked readonly/nodev/nosuid/noexec flag must still be set, and
 * with a locked atime the atime bits must be unchanged.
 *
 * The MNT_LOCK flags can never be cleared once they are set, so no locks
 * need to be held while testing them here.
 */
static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
{
        const unsigned int cur = mnt->mnt.mnt_flags;
        const bool ro_ok = !(cur & MNT_LOCK_READONLY) ||
                           (mnt_flags & MNT_READONLY);
        const bool nodev_ok = !(cur & MNT_LOCK_NODEV) ||
                              (mnt_flags & MNT_NODEV);
        const bool nosuid_ok = !(cur & MNT_LOCK_NOSUID) ||
                               (mnt_flags & MNT_NOSUID);
        const bool noexec_ok = !(cur & MNT_LOCK_NOEXEC) ||
                               (mnt_flags & MNT_NOEXEC);
        const bool atime_ok = !(cur & MNT_LOCK_ATIME) ||
                              ((cur & MNT_ATIME_MASK) ==
                               (mnt_flags & MNT_ATIME_MASK));

        return ro_ok && nodev_ok && nosuid_ok && noexec_ok && atime_ok;
}

/*
 * Bring the read-only state of @mnt in line with MNT_READONLY in @mnt_flags.
 *
 * Nothing is done when the state already matches.  Going read-only is left
 * to mnt_make_readonly(), whose result is returned; going read-write only
 * clears MNT_READONLY and returns 0.
 */
static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
{
        bool want_ro = (mnt_flags & MNT_READONLY);

        if (want_ro == __mnt_is_readonly(&mnt->mnt))
                return 0;

        if (!want_ro) {
                mnt->mnt.mnt_flags &= ~MNT_READONLY;
                return 0;
        }

        return mnt_make_readonly(mnt);
}

/*
 * Install @mnt_flags on @mnt, carrying over every currently set flag that
 * lies outside MNT_USER_SETTABLE_MASK, then touch the mount's namespace.
 */
static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
{
        unsigned int preserved = mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;

        mnt->mnt.mnt_flags = mnt_flags | preserved;
        touch_mnt_namespace(mnt->mnt_ns);
}

/*
 * Print a warning when a writable mount sits on a filesystem whose maximum
 * timestamp (s_time_max) lies less than TIME_UPTIME_SEC_MAX seconds ahead of
 * the current time.  SB_I_TS_EXPIRY_WARNED is set afterwards so the warning
 * is not repeated for the same superblock.
 */
static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
{
        struct super_block *sb = mnt->mnt_sb;
        char *page, *where;

        if (__mnt_is_readonly(mnt))
                return;
        if (sb->s_iflags & SB_I_TS_EXPIRY_WARNED)
                return;
        if (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX <= sb->s_time_max)
                return;

        /* Best effort: fall back to a placeholder if the path is unavailable. */
        page = (char *)__get_free_page(GFP_KERNEL);
        where = page ? d_path(mountpoint, page, PAGE_SIZE) : ERR_PTR(-ENOMEM);
        if (IS_ERR(where))
                where = "(unknown)";

        pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
                sb->s_type->name,
                is_mounted(mnt) ? "remounted" : "mounted",
                where, &sb->s_time_max,
                (unsigned long long)sb->s_time_max);

        sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
        if (page)
                free_page((unsigned long)page);
}

/*
 * Reconfigure the mount itself, leaving the superblock it refers to alone.
 * This is what MS_REMOUNT|MS_BIND to mount(2) ends up in.
 *
 * Returns -EINVAL unless @path is the root of a mount in the caller's
 * namespace, -EPERM when @mnt_flags would drop a locked flag, otherwise the
 * result of changing the read-only state.
 */
static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        int err;

        if (!check_mnt(mnt) || !path_mounted(path))
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        /*
         * The superblock is only inspected for being read-only, never
         * modified, so a read lock on s_umount is enough.
         */
        down_read(&sb->s_umount);
        lock_mount_hash();
        err = change_mount_ro_state(mnt, mnt_flags);
        if (!err)
                set_mount_attributes(mnt, mnt_flags);
        unlock_mount_hash();
        up_read(&sb->s_umount);

        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        return err;
}

/*
 * Change filesystem flags.  @path has to be the physical root of the
 * filesystem; if a non-root directory was mounted somewhere, a remount
 * through it is refused - tough luck.
 *
 * Returns -EINVAL unless @path is the root of a mount in the caller's
 * namespace, -EPERM when a locked mount flag would be dropped or the caller
 * lacks CAP_SYS_ADMIN over the superblock's user namespace, otherwise the
 * result of parsing @data and reconfiguring the superblock.
 */
static int do_remount(struct path *path, int ms_flags, int sb_flags,
                      int mnt_flags, void *data)
{
        struct mount *mnt = real_mount(path->mnt);
        struct super_block *sb = path->mnt->mnt_sb;
        struct fs_context *fc;
        int err;

        if (!check_mnt(mnt) || !path_mounted(path))
                return -EINVAL;

        if (!can_change_locked_flags(mnt, mnt_flags))
                return -EPERM;

        fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Tell the filesystem that this remount request was issued through
         * the legacy mount system call.
         */
        fc->oldapi = true;

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto done;

        down_write(&sb->s_umount);
        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
                err = -EPERM;
        } else {
                err = reconfigure_super(fc);
                if (!err) {
                        lock_mount_hash();
                        set_mount_attributes(mnt, mnt_flags);
                        unlock_mount_hash();
                }
        }
        up_write(&sb->s_umount);

done:
        /* Runs on parse failure as well, as does dropping the context. */
        mnt_warn_timestamp_expiry(path, &mnt->mnt);

        put_fs_context(fc);
        return err;
}

/*
 * Walk the mount tree rooted at @mnt with next_mnt() and report whether
 * any mount in it is unbindable.  Returns 1 if so, 0 otherwise.
 */
static inline int tree_contains_unbindable(struct mount *mnt)
{
        struct mount *cur = mnt;

        while (cur) {
                if (IS_MNT_UNBINDABLE(cur))
                        return 1;
                cur = next_mnt(cur, mnt);
        }
        return 0;
}

/*
 * Make the mount at @to_path join the propagation (sharing) group of the
 * mount at @from_path.  All checks and list updates happen with
 * namespace_lock() held.
 *
 * Returns 0 on success, -EPERM if the caller lacks CAP_SYS_ADMIN over
 * either mount's namespace, and -EINVAL if any other precondition fails.
 */
static int do_set_group(struct path *from_path, struct path *to_path)
{
        struct mount *from = real_mount(from_path->mnt);
        struct mount *to = real_mount(to_path->mnt);
        int err;

        namespace_lock();

        /* Both mounts have to be mounted. */
        err = -EINVAL;
        if (!is_mounted(&from->mnt) || !is_mounted(&to->mnt))
                goto out;

        /* The caller must be allowed to modify both mount namespaces. */
        err = -EPERM;
        if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
                goto out;

        /* Everything below fails with -EINVAL. */
        err = -EINVAL;

        /* Both paths have to be mount roots. */
        if (!path_mounted(from_path) || !path_mounted(to_path))
                goto out;

        /* Sharing groups may only be set across the same superblock. */
        if (from->mnt.mnt_sb != to->mnt.mnt_sb)
                goto out;

        /* The root of @from must be at least as wide as the root of @to. */
        if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
                goto out;

        /* @from must not have locked children in place of @to's root. */
        if (has_locked_children(from, to->mnt.mnt_root))
                goto out;

        /* The target has to be private ... */
        if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
                goto out;

        /* ... and the source must not be. */
        if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
                goto out;

        if (IS_MNT_SLAVE(from)) {
                /* Become a slave of the same master as @from. */
                struct mount *master = from->mnt_master;

                list_add(&to->mnt_slave, &master->mnt_slave_list);
                to->mnt_master = master;
        }

        if (IS_MNT_SHARED(from)) {
                /* Join @from's peer group. */
                to->mnt_group_id = from->mnt_group_id;
                list_add(&to->mnt_share, &from->mnt_share);
                lock_mount_hash();
                set_mnt_shared(to);
                unlock_mount_hash();
        }

        err = 0;
out:
        namespace_unlock();
        return err;
}

/**
 * path_overmounted - check if path is overmounted
 * @path: path to check
 *
 * A path is overmounted when some mount sits on top of @path->mnt with
 * @path->dentry as its mountpoint.  The lookup is done under
 * rcu_read_lock().
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: true if @path is overmounted, false otherwise.
 */
static inline bool path_overmounted(const struct path *path)
{
        bool covered = false;

        rcu_read_lock();
        if (unlikely(__lookup_mnt(path->mnt, path->dentry)))
                covered = true;
        rcu_read_unlock();

        return covered;
}

/**
 * can_move_mount_beneath - check that we can mount beneath the top mount
 * @from: mount to mount beneath
 * @to:   mount under which to mount
 * @mp:   mountpoint of @to
 *
 * The following conditions are verified, in this order:
 *
 * - @to->dentry is the root of a mount that has a parent and is not
 *   locked, i.e. a mount under which another mount can be placed.
 * - Nothing has been mounted on top of @from before @namespace_sem was
 *   grabbed, to avoid creating pointless shadow mounts.
 * - The target is neither the mount of the caller's current root nor a
 *   mount sitting directly on the rootfs of the caller's namespace.
 * - @to's mount is not @from's mount or one of its ancestors.
 * - The propagation relationship between the source mount, the parent
 *   mount and the top mount would not lead to nonsensical mount trees.
 *
 * Context: This function expects namespace_lock() to be held.
 * Return: On success 0, and on error a negative error code is returned.
 */
static int can_move_mount_beneath(const struct path *from,
                                  const struct path *to,
                                  const struct mountpoint *mp)
{
        struct mount *src = real_mount(from->mnt);
        struct mount *top = real_mount(to->mnt);
        struct mount *top_parent = top->mnt_parent;
        struct mount *ancestor;

        if (!mnt_has_parent(top) || !path_mounted(to) || IS_MNT_LOCKED(top))
                return -EINVAL;

        /* Avoid creating shadow mounts during mount propagation. */
        if (path_overmounted(from))
                return -EINVAL;

        /*
         * Mounting beneath the rootfs only makes sense when the
         * semantics of pivot_root(".", ".") are used.
         */
        if (&top->mnt == current->fs->root.mnt ||
            top_parent == current->nsproxy->mnt_ns->root)
                return -EINVAL;

        /* Walk up from @src; @top must not show up on the way. */
        ancestor = src;
        while (mnt_has_parent(ancestor)) {
                if (ancestor == top)
                        return -EINVAL;
                ancestor = ancestor->mnt_parent;
        }

        /*
         * First check: if the parent mount propagates to the child mount,
         * @src would be mounted on @top->mnt_parent and a copy @c of @src
         * would then be propagated on top of @top, which defeats the whole
         * purpose of mounting beneath another mount.
         *
         * Second check: if @top->mnt_parent propagates to @src, a copy @c
         * of @src would be propagated on top of @src.  Afterwards @src
         * would be mounted on top of @top->mnt_parent and @top would be
         * unmounted from its parent and remounted on @src.  But since @c
         * is already mounted on @src, @top would ultimately end up on top
         * of @c, leaving @src covered by its own copy.  That, too, defeats
         * the purpose of mounting @src beneath @top.
         */
        if (propagation_would_overmount(top_parent, top, mp) ||
            propagation_would_overmount(top_parent, src, mp))
                return -EINVAL;

        return 0;
}

/* may_use_mount() - check if a mount tree can be used
 * @mnt: mount to be used
 *
 * Decide whether the caller may use the mount tree starting at @mnt.
 * That is the case in either of two situations:
 *
 * (1) The caller is located in the mount namespace of the mount tree,
 *     which also implies that the mount does not belong to an anonymous
 *     mount namespace.
 * (2) The mount tree belongs to an anonymous mount namespace.
 *
 *     To make that safe, the origin mount namespace from which the
 *     anonymous mount namespace was created must be the caller's mount
 *     namespace; this is established by comparing sequence numbers.
 *
 *     The ownership of a non-anonymous mount namespace such as the
 *     caller's cannot change.
 *     => The caller's mount namespace is stable.
 *
 *     If the origin sequence number of the anonymous mount namespace
 *     equals the sequence number of the caller's mount namespace.
 *     => The owning namespaces are the same.
 *
 *     ==> The earlier capability check on the owning namespace of the
 *         caller's mount namespace guarantees that the caller is able
 *         to use the mount tree.
 *
 * Returns true if the mount tree can be used, false otherwise.
 */
static inline bool may_use_mount(struct mount *mnt)
{
        /* Case (1): the mount passes check_mnt(). */
        if (check_mnt(mnt))
                return true;

        /*
         * Case (2).  First make sure that no one unmounted the target
         * path or somehow got hold of something purely kernel internal;
         * only then is the anonymous namespace check performed.
         */
        return is_mounted(&mnt->mnt) && check_anonymous_mnt(mnt);
}

static int do_move_mount(struct path *old_path,
                         struct path *new_path, enum mnt_tree_flags_t flags)
{
        struct mnt_namespace *ns;
        struct mount *p;
        struct mount *old;
        struct mount *parent;
        struct mountpoint *mp, *old_mp;
        int err;
        bool attached, beneath = flags & MNT_TREE_BENEATH;

        mp = do_lock_mount(new_path, beneath);
        if (IS_ERR(mp))
                return PTR_ERR(mp);

        old = real_mount(old_path->mnt);
        p = real_mount(new_path->mnt);
        parent = old->mnt_parent;
        attached = mnt_has_parent(old);
        if (attached)
                flags |= MNT_TREE_MOVE;
        old_mp = old->mnt_mp;
        ns = old->mnt_ns;

        err = -EINVAL;
        if (!may_use_mount(p))
                goto out;

        /* The thing moved must be mounted... */
        if (!is_mounted(&old->mnt))
                goto out;

        /* ... and either ours or the root of anon namespace */
        if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
                goto out;

        if (is_anon_ns(ns)) {
                /*
                 * Ending up with two files referring to the root of the
                 * same anonymous mount namespace would cause an error
                 * as this would mean trying to move the same mount
                 * twice into the mount tree which would be rejected
                 * later. But be explicit about it right here.
                 */
                if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
                        goto out;

                /*
                 * If this is an anonymous mount tree ensure that mount
                 * propagation can detect mounts that were just
                 * propagated to the target mount tree so we don't
                 * propagate onto them.
                 */
                ns->mntns_flags |= MNTNS_PROPAGATING;
        } else if (is_anon_ns(p->mnt_ns)) {
                /*
                 * Don't allow moving an attached mount tree to an
                 * anonymous mount tree.
                 */
                goto out;
        }

        if (old->mnt.mnt_flags & MNT_LOCKED)
                goto out;

        if (!path_mounted(old_path))
                goto out;

        if (d_is_dir(new_path->dentry) !=
            d_is_dir(old_path->dentry))
                goto out;
        /*
         * Don't move a mount residing in a shared parent.
         */
        if (attached && IS_MNT_SHARED(parent))
                goto out;

        if (beneath) {
                err = can_move_mount_beneath(old_path, new_path, mp);
                if (err)
                        goto out;

                err = -EINVAL;
                p = p->mnt_parent;
                flags |= MNT_TREE_BENEATH;
        }

        /*
         * Don't move a mount tree containing unbindable mounts to a destination
         * mount which is shared.
         */
        if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
                goto out;
        err = -ELOOP;
        if (!check_for_nsfs_mounts(old))
                goto out;
        for (; mnt_has_parent(p); p = p->mnt_parent)
                if (p == old)
                        goto out;

        err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
        if (err)
                goto out;

        if (is_anon_ns(ns))
                ns->mntns_flags &= ~MNTNS_PROPAGATING;

        /* if the mount is moved, it should no longer be expire
         * automatically */
        list_del_init(&old->mnt_expire);
        if (attached)
                put_mountpoint(old_mp);
out:
        unlock_mount(mp);
        if (!err) {
                if (attached) {
                        mntput_no_expire(parent);
                } else {
                        /* Make sure we notice when we leak mounts. */
                        VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
                        free_mnt_ns(ns);
                }
        }
        return err;
}

static int do_move_mount_old(struct path *path, const char *old_name)
{
        struct path old_path;
        int err;

        if (!old_name || !*old_name)
                return -EINVAL;

        err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
        if (err)
                return err;

        err = do_move_mount(&old_path, path, 0);
        path_put(&old_path);
        return err;
}

/*
 * Add the mount @newmnt into a namespace's mount tree at @path / @mp.
 *
 * Internal flags are stripped from @mnt_flags before they are stored in
 * the new mount.  Returns -EINVAL or -EBUSY when one of the sanity checks
 * fails, otherwise the result of graft_tree().
 */
static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
                        const struct path *path, int mnt_flags)
{
        struct mount *parent = real_mount(path->mnt);

        mnt_flags &= ~MNT_INTERNAL_FLAGS;

        if (unlikely(!check_mnt(parent))) {
                /*
                 * A parent failing check_mnt() is acceptable only for
                 * automounts done in a private ns, and even then the
                 * mountpoint had better still be alive.
                 */
                if (!(mnt_flags & MNT_SHRINKABLE) || !parent->mnt_ns)
                        return -EINVAL;
        }

        /* Mounting the same filesystem on the same mount point is refused. */
        if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
                return -EBUSY;

        /* The root of the new mount must not be a symlink. */
        if (d_is_symlink(newmnt->mnt.mnt_root))
                return -EINVAL;

        newmnt->mnt.mnt_flags = mnt_flags;

        return graft_tree(newmnt, parent, mp);
}

static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);

/*
 * Create a new mount from the superblock configuration in @fc and request
 * that it be added to the namespace tree at @mountpoint.
 *
 * This function releases sb->s_umount (directly, or via fc_drop_locked()
 * on the early error path), so the caller is expected to hold it.
 *
 * Returns the result of do_add_mount() on the normal path, or a negative
 * error code if a security/permission check, mount creation, or locking
 * the mountpoint fails.
 */
static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
                           unsigned int mnt_flags)
{
        struct super_block *sb = fc->root->d_sb;
        struct vfsmount *new_mnt;
        struct mountpoint *mp;
        int err;

        err = security_sb_kern_mount(sb);
        if (err == 0 && mount_too_revealing(sb, &mnt_flags))
                err = -EPERM;

        if (unlikely(err)) {
                fc_drop_locked(fc);
                return err;
        }

        up_write(&sb->s_umount);

        new_mnt = vfs_create_mount(fc);
        if (IS_ERR(new_mnt))
                return PTR_ERR(new_mnt);

        mnt_warn_timestamp_expiry(mountpoint, new_mnt);

        mp = lock_mount(mountpoint);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto drop_mnt;
        }

        err = do_add_mount(real_mount(new_mnt), mp, mountpoint, mnt_flags);
        unlock_mount(mp);
        if (err >= 0)
                return err;

drop_mnt:
        /* The mount was not attached; give up our reference to it. */
        mntput(new_mnt);
        return err;
}

/*
 * Create a new mount for userspace and request that it be added into the
 * namespace's tree.
 *
 * @fstype may carry a ".subtype" suffix when the filesystem type sets
 * FS_HAS_SUBTYPE; an empty subtype after the dot is rejected.
 *
 * Returns 0 on success, -EINVAL for a missing @fstype or empty subtype,
 * -ENODEV for an unknown filesystem type, -EPERM if mount_capable()
 * refuses, or the error from any of the setup steps.
 */
static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
                        int mnt_flags, const char *name, void *data)
{
        struct file_system_type *type;
        struct fs_context *fc;
        const char *subtype = NULL;
        int err = 0;

        if (!fstype)
                return -EINVAL;

        type = get_fs_type(fstype);
        if (!type)
                return -ENODEV;

        if (type->fs_flags & FS_HAS_SUBTYPE) {
                const char *dot = strchr(fstype, '.');

                if (dot) {
                        if (dot[1] == '\0') {
                                put_filesystem(type);
                                return -EINVAL;
                        }
                        subtype = dot + 1;
                }
        }

        fc = fs_context_for_mount(type, sb_flags);
        put_filesystem(type);
        if (IS_ERR(fc))
                return PTR_ERR(fc);

        /*
         * Let the filesystem know this mount request originates from
         * the legacy mount system call.
         */
        fc->oldapi = true;

        if (subtype) {
                err = vfs_parse_fs_string(fc, "subtype",
                                          subtype, strlen(subtype));
                if (err)
                        goto out;
        }

        if (name) {
                err = vfs_parse_fs_string(fc, "source", name, strlen(name));
                if (err)
                        goto out;
        }

        err = parse_monolithic_mount_data(fc, data);
        if (err)
                goto out;

        if (!mount_capable(fc)) {
                err = -EPERM;
                goto out;
        }

        err = vfs_get_tree(fc);
        if (err)
                goto out;

        err = do_new_mount_fc(fc, path, mnt_flags);
out:
        put_fs_context(fc);
        return err;
}

/*
 * finish_automount - attach an automounted mount at @path
 * @m:    the new mount; may be NULL or an ERR_PTR
 * @path: location where @m is to be mounted
 *
 * Returns 0 if @m is NULL, if the mount was added, or if @path was found
 * to be overmounted already (in which case @m is quietly discarded).
 * Returns PTR_ERR(@m) if @m is an error pointer, -ELOOP if @m would be
 * mounted on its own root, -ENOENT if cant_mount() rejects the dentry, or
 * the error from get_mountpoint() / do_add_mount().
 *
 * On success one reference to @m is dropped; on every discard path two
 * references are dropped and @m is taken off any expiration list.
 */
int finish_automount(struct vfsmount *m, const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct mountpoint *mp;
        struct mount *mnt;
        int err;

        if (!m)
                return 0;
        if (IS_ERR(m))
                return PTR_ERR(m);

        mnt = real_mount(m);
        /* The new mount record should have at least 2 refs to prevent it being
         * expired before we get a chance to add it
         */
        BUG_ON(mnt_get_count(mnt) < 2);

        /* Refuse to mount something on its own root. */
        if (m->mnt_sb == path->mnt->mnt_sb &&
            m->mnt_root == dentry) {
                err = -ELOOP;
                goto discard;
        }

        /*
         * we don't want to use lock_mount() - in this case finding something
         * that overmounts our mountpoint means "quietly drop what we've
         * got", not "try to mount it on top".
         */
        inode_lock(dentry->d_inode);
        namespace_lock();
        if (unlikely(cant_mount(dentry))) {
                err = -ENOENT;
                goto discard_locked;
        }
        if (path_overmounted(path)) {
                err = 0;
                goto discard_locked;
        }
        mp = get_mountpoint(dentry);
        if (IS_ERR(mp)) {
                err = PTR_ERR(mp);
                goto discard_locked;
        }

        err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
        /*
         * NOTE(review): no explicit namespace_unlock()/inode_unlock() follows
         * on this path, so unlock_mount() presumably releases both locks
         * taken above - confirm against its definition.
         */
        unlock_mount(mp);
        if (unlikely(err))
                goto discard;
        mntput(m);
        return 0;

discard_locked:
        /* Reached with both the namespace lock and the inode lock held. */
        namespace_unlock();
        inode_unlock(dentry->d_inode);
discard:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&mnt->mnt_expire)) {
                namespace_lock();
                list_del_init(&mnt->mnt_expire);
                namespace_unlock();
        }
        /* Drop two references to the discarded mount. */
        mntput(m);
        mntput(m);
        return err;
}

/**
 * mnt_set_expiry - Put a mount on an expiration list
 * @mnt: The mount to list.
 * @expiry_list: The list to add the mount to.
 */
void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
{
        namespace_lock();

        list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);

        namespace_unlock();
}
EXPORT_SYMBOL(mnt_set_expiry);

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * mountpoints that aren't in use and haven't been touched since last we came
 * here
 */
void mark_mounts_for_expiry(struct list_head *mounts)
{
        struct mount *mnt, *next;
        LIST_HEAD(graveyard);

        if (list_empty(mounts))
                return;

        namespace_lock();
        lock_mount_hash();

        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
         * - only referenced by its parent vfsmount
         * - still marked for expiry (marked on the last call here; marks are
         *   cleared by mntput())
         */
        list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
                if (!xchg(&mnt->mnt_expiry_mark, 1) ||
                        propagate_mount_busy(mnt, 1))
                        continue;
                list_move(&mnt->mnt_expire, &graveyard);
        }
        while (!list_empty(&graveyard)) {
                mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
        }
        unlock_mount_hash();
        namespace_unlock();
}

EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);

/*
 * Ripoff of 'select_parent()'
 *
 * search the list of submounts for a given mountpoint, and move any
 * shrinkable submounts to the 'graveyard' list.
 *
 * The walk is iterative: it descends into shrinkable children that have
 * children of their own and climbs back up via mnt_parent.  Children
 * without MNT_SHRINKABLE are skipped and not descended into.  Only
 * shrinkable mounts without children for which propagate_mount_busy()
 * returns zero are moved.
 *
 * Returns the number of mounts moved to @graveyard.
 */
static int select_submounts(struct mount *parent, struct list_head *graveyard)
{
        struct mount *this_parent = parent;
        struct list_head *next;
        int found = 0;

repeat:
        /* Start scanning the children of this_parent from the beginning. */
        next = this_parent->mnt_mounts.next;
resume:
        while (next != &this_parent->mnt_mounts) {
                struct list_head *tmp = next;
                struct mount *mnt = list_entry(tmp, struct mount, mnt_child);

                /* Advance before mnt is examined. */
                next = tmp->next;
                if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
                        continue;
                /*
                 * Descend a level if the mnt_mounts list is non-empty.
                 */
                if (!list_empty(&mnt->mnt_mounts)) {
                        this_parent = mnt;
                        goto repeat;
                }

                if (!propagate_mount_busy(mnt, 1)) {
                        list_move_tail(&mnt->mnt_expire, graveyard);
                        found++;
                }
        }
        /*
         * All done at this level ... ascend and resume the search
         */
        if (this_parent != parent) {
                /* Continue with the sibling following this_parent. */
                next = this_parent->mnt_child.next;
                this_parent = this_parent->mnt_parent;
                goto resume;
        }
        return found;
}

/*
 * process a list of expirable mountpoints with the intent of discarding any
 * submounts of a specific parent mountpoint
 *
 * mount_lock must be held for write
 */
static void shrink_submounts(struct mount *mnt)
{
        LIST_HEAD(graveyard);
        struct mount *m;

        /* extract submounts of 'mountpoint' from the expiration list */
        while (select_submounts(mnt, &graveyard)) {
                while (!list_empty(&graveyard)) {
                        m = list_first_entry(&graveyard, struct mount,
                                                mnt_expire);
                        touch_mnt_namespace(m->mnt_ns);
                        umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                }
        }
}

/*
 * Copy up to PAGE_SIZE bytes of mount options from user space into a
 * freshly kmalloc()ed buffer.
 *
 * Returns NULL if @data is NULL, ERR_PTR(-ENOMEM) if the allocation
 * fails, ERR_PTR(-EFAULT) if not a single byte could be copied, and the
 * buffer otherwise (a partial copy is accepted).  The caller owns the
 * returned buffer.
 */
static void *copy_mount_options(const void __user * data)
{
        const char __user *src = data;
        char *page;
        unsigned remaining, pos;

        if (!data)
                return NULL;

        page = kmalloc(PAGE_SIZE, GFP_KERNEL);
        if (!page)
                return ERR_PTR(-ENOMEM);

        remaining = copy_from_user(page, data, PAGE_SIZE);

        /*
         * copy_from_user() is not exact on every architecture, so retry
         * whatever it left over one byte at a time and stop at the first
         * byte that really faults.
         */
        for (pos = PAGE_SIZE - remaining; remaining; pos++, remaining--) {
                char c;

                if (get_user(c, src + pos))
                        break;
                page[pos] = c;
        }

        /* Nothing at all was readable. */
        if (remaining == PAGE_SIZE) {
                kfree(page);
                return ERR_PTR(-EFAULT);
        }

        return page;
}

/*
 * Duplicate a user-space string of at most PATH_MAX bytes with
 * strndup_user().  A NULL @data yields NULL; otherwise the result of
 * strndup_user() is returned unchanged.
 */
static char *copy_mount_string(const void __user *data)
{
        if (!data)
                return NULL;

        return strndup_user(data, PATH_MAX);
}

/*
 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
 * be given to the mount() call (ie: read-only, no-dev, no-suid etc).
 *
 * data is a (void *) that can point to any structure up to
 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
 * information (or be NULL).
 *
 * Pre-0.97 versions of mount() didn't have a flags word.
 * When the flags word was introduced its top half was required
 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
 * Therefore, if this magic number is present, it carries no information
 * and must be discarded.
 *
 * After the security and permission checks the MS_* flags are split into
 * per-mount flags (MNT_*) and superblock flags (SB_*), and the request is
 * dispatched, in this order of precedence, to: mount reconfiguration
 * (MS_REMOUNT|MS_BIND), remount, bind mount, propagation type change,
 * move, or creation of a new mount.
 *
 * Returns -EINVAL if MS_NOUSER is set, the error from security_sb_mount(),
 * -EPERM if may_mount() refuses, or the result of the selected operation.
 */
int path_mount(const char *dev_name, struct path *path,
                const char *type_page, unsigned long flags, void *data_page)
{
        unsigned int mnt_flags = 0, sb_flags;
        int ret;

        /* Discard magic */
        if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
                flags &= ~MS_MGC_MSK;

        /* Basic sanity checks */
        /* Force NUL termination of the option page at its last byte. */
        if (data_page)
                ((char *)data_page)[PAGE_SIZE - 1] = 0;

        if (flags & MS_NOUSER)
                return -EINVAL;

        ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
        if (ret)
                return ret;
        if (!may_mount())
                return -EPERM;
        if (flags & SB_MANDLOCK)
                warn_mandlock();

        /* Default to relatime unless overridden */
        if (!(flags & MS_NOATIME))
                mnt_flags |= MNT_RELATIME;

        /* Separate the per-mountpoint flags */
        if (flags & MS_NOSUID)
                mnt_flags |= MNT_NOSUID;
        if (flags & MS_NODEV)
                mnt_flags |= MNT_NODEV;
        if (flags & MS_NOEXEC)
                mnt_flags |= MNT_NOEXEC;
        if (flags & MS_NOATIME)
                mnt_flags |= MNT_NOATIME;
        if (flags & MS_NODIRATIME)
                mnt_flags |= MNT_NODIRATIME;
        /* Evaluated after the atime bits above, so strictatime wins. */
        if (flags & MS_STRICTATIME)
                mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
        if (flags & MS_RDONLY)
                mnt_flags |= MNT_READONLY;
        if (flags & MS_NOSYMFOLLOW)
                mnt_flags |= MNT_NOSYMFOLLOW;

        /* The default atime for remount is preservation */
        if ((flags & MS_REMOUNT) &&
            ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
                       MS_STRICTATIME)) == 0)) {
                /* No atime option given: keep the mount's current atime bits. */
                mnt_flags &= ~MNT_ATIME_MASK;
                mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
        }

        /* Flags that are passed on as superblock flags. */
        sb_flags = flags & (SB_RDONLY |
                            SB_SYNCHRONOUS |
                            SB_MANDLOCK |
                            SB_DIRSYNC |
                            SB_SILENT |
                            SB_POSIXACL |
                            SB_LAZYTIME |
                            SB_I_VERSION);

        /* Dispatch; the first matching case wins. */
        if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
                return do_reconfigure_mnt(path, mnt_flags);
        if (flags & MS_REMOUNT)
                return do_remount(path, flags, sb_flags, mnt_flags, data_page);
        if (flags & MS_BIND)
                return do_loopback(path, dev_name, flags & MS_REC);
        if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
                return do_change_type(path, flags);
        if (flags & MS_MOVE)
                return do_move_mount_old(path, dev_name);

        return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
                            data_page);
}

/*
 * Resolve the user-supplied @dir_name (relative to the current working
 * directory, following symlinks) and hand the mount request over to
 * path_mount().  Returns the lookup error or the result of path_mount().
 */
int do_mount(const char *dev_name, const char __user *dir_name,
                const char *type_page, unsigned long flags, void *data_page)
{
        struct path target;
        int err;

        err = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &target);
        if (err)
                return err;

        err = path_mount(dev_name, &target, type_page, flags, data_page);
        path_put(&target);

        return err;
}

/*
 * Charge one mount namespace (UCOUNT_MNT_NAMESPACES) to the current
 * effective uid in @ns and return whatever inc_ucount() returns.
 */
static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
{
        struct ucounts *charged;

        charged = inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
        return charged;
}

static void dec_mnt_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
}

static void free_mnt_ns(struct mnt_namespace *ns)
{
        if (!is_anon_ns(ns))
                ns_free_inum(&ns->ns);
        dec_mnt_namespaces(ns->ucounts);
        mnt_ns_tree_remove(ns);
}

/*
 * Assign a sequence number so we can detect when we attempt to bind
 * mount a reference to an older mount namespace into the current
 * mount namespace, preventing reference counting loops.  Even at an
 * unrealistic 10GHz increment rate a 64bit number takes roughly 58
 * years to wrap (2^64 / 10^10 seconds); at any realistic rate of
 * namespace creation it effectively never will, so we can ignore the
 * possibility.
 */
static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);

/*
 * Allocate and initialise a mount namespace owned by @user_ns.
 *
 * A non-anonymous namespace (@anon false) gets an inode number and a
 * fresh sequence number; an anonymous one gets neither, so its seq stays
 * at the zero left by kzalloc().
 *
 * Returns the new namespace, or ERR_PTR(-ENOSPC) if the ucount charge is
 * refused, ERR_PTR(-ENOMEM) if the allocation fails, or the ERR_PTR-encoded
 * error from ns_alloc_inum().
 */
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
{
        struct mnt_namespace *new_ns;
        struct ucounts *ucounts;
        int ret;

        ucounts = inc_mnt_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        new_ns = kzalloc(sizeof(*new_ns), GFP_KERNEL_ACCOUNT);
        if (!new_ns) {
                ret = -ENOMEM;
                goto err_uncharge;
        }

        if (!anon) {
                ret = ns_alloc_inum(&new_ns->ns);
                if (ret)
                        goto err_free;
                new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
        }

        new_ns->ns.ops = &mntns_operations;
        refcount_set(&new_ns->ns.count, 1);
        refcount_set(&new_ns->passive, 1);
        new_ns->mounts = RB_ROOT;
        INIT_LIST_HEAD(&new_ns->mnt_ns_list);
        RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
        init_waitqueue_head(&new_ns->poll);
        new_ns->user_ns = get_user_ns(user_ns);
        new_ns->ucounts = ucounts;

        return new_ns;

err_free:
        kfree(new_ns);
err_uncharge:
        dec_mnt_namespaces(ucounts);
        return ERR_PTR(ret);
}

/*
 * copy_mnt_ns - share or duplicate a mount namespace
 * @flags:   clone flags; only CLONE_NEWNS is examined here
 * @ns:      the namespace to share or copy (must not be NULL)
 * @user_ns: user namespace that will own a newly created namespace
 * @new_fs:  optional fs_struct whose root/pwd mounts are switched over to
 *           the corresponding mounts of the copy
 *
 * Without CLONE_NEWNS a reference on @ns is taken and @ns itself is
 * returned.  Otherwise a new namespace is allocated, the mount tree rooted
 * at @ns->root is copied into it under namespace_lock(), and the new
 * namespace is handed to mnt_ns_tree_add() before being returned.
 *
 * Returns the namespace to use or an ERR_PTR() on failure.
 */
__latent_entropy
struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
                struct user_namespace *user_ns, struct fs_struct *new_fs)
{
        struct mnt_namespace *new_ns;
        struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
        struct mount *p, *q;
        struct mount *old;
        struct mount *new;
        int copy_flags;

        BUG_ON(!ns);

        /* Common case: no new namespace requested, just share the old one. */
        if (likely(!(flags & CLONE_NEWNS))) {
                get_mnt_ns(ns);
                return ns;
        }

        old = ns->root;

        new_ns = alloc_mnt_ns(user_ns, false);
        if (IS_ERR(new_ns))
                return new_ns;

        namespace_lock();
        /* First pass: copy the tree topology */
        copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
        /* A copy owned by a different user namespace gets CL_SHARED_TO_SLAVE. */
        if (user_ns != ns->user_ns)
                copy_flags |= CL_SHARED_TO_SLAVE;
        new = copy_tree(old, old->mnt.mnt_root, copy_flags);
        if (IS_ERR(new)) {
                /* Undo what alloc_mnt_ns() set up for the non-anonymous ns. */
                namespace_unlock();
                ns_free_inum(&new_ns->ns);
                dec_mnt_namespaces(new_ns->ucounts);
                mnt_ns_release(new_ns);
                return ERR_CAST(new);
        }
        /* Crossing into another user namespace: lock the copied tree. */
        if (user_ns != ns->user_ns) {
                lock_mount_hash();
                lock_mnt_tree(new);
                unlock_mount_hash();
        }
        new_ns->root = new;

        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
         * as belonging to new namespace.  We have already acquired a private
         * fs_struct, so tsk->fs->lock is not needed.
         */
        p = old;
        q = new;
        /* Walk the old tree (p) and its copy (q) in lock-step. */
        while (p) {
                mnt_add_to_ns(new_ns, q);
                new_ns->nr_mounts++;
                if (new_fs) {
                        /*
                         * Repoint root/pwd at the copy; the old mounts are
                         * remembered so they can be mntput() after unlocking.
                         */
                        if (&p->mnt == new_fs->root.mnt) {
                                new_fs->root.mnt = mntget(&q->mnt);
                                rootmnt = &p->mnt;
                        }
                        if (&p->mnt == new_fs->pwd.mnt) {
                                new_fs->pwd.mnt = mntget(&q->mnt);
                                pwdmnt = &p->mnt;
                        }
                }
                p = next_mnt(p, old);
                q = next_mnt(q, new);
                if (!q)
                        break;
                /*
                 * The copy may lack subtrees that exist in the original
                 * (e.g. a skipped mntns binding); advance p past them until
                 * both cursors refer to the same mount root again.
                 */
                while (p->mnt.mnt_root != q->mnt.mnt_root)
                        p = next_mnt(skip_mnt_tree(p), old);
        }
        namespace_unlock();

        /* Drop the mounts that new_fs pointed at before the switch above. */
        if (rootmnt)
                mntput(rootmnt);
        if (pwdmnt)
                mntput(pwdmnt);

        mnt_ns_tree_add(new_ns);
        return new_ns;
}

/*
 * mount_subtree - look up @name inside mount @m and return that dentry
 * @m:    mount to search
 * @name: path to resolve, starting at the root of @m
 *
 * @m is placed in a temporary anonymous mount namespace for the duration
 * of the lookup; that namespace is put again before returning.  If the
 * namespace cannot be allocated, mntput(@m) is called before the error is
 * returned.
 *
 * On success the mount reference from the lookup is traded for an active
 * reference on the superblock, ->s_umount is taken for writing and the
 * resulting dentry is returned.  On failure an ERR_PTR() is returned.
 */
struct dentry *mount_subtree(struct vfsmount *m, const char *name)
{
        struct mount *mnt = real_mount(m);
        struct mnt_namespace *ns;
        struct super_block *s;
        struct path path;
        int err;

        ns = alloc_mnt_ns(&init_user_ns, true);
        if (IS_ERR(ns)) {
                mntput(m);
                return ERR_CAST(ns);
        }
        /* Make @m the sole mount of the temporary namespace. */
        ns->root = mnt;
        ns->nr_mounts++;
        mnt_add_to_ns(ns, mnt);

        err = vfs_path_lookup(m->mnt_root, m,
                        name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);

        /* The temporary namespace is no longer needed, success or not. */
        put_mnt_ns(ns);

        if (err)
                return ERR_PTR(err);

        /* trade a vfsmount reference for active sb one */
        s = path.mnt->mnt_sb;
        atomic_inc(&s->s_active);
        mntput(path.mnt);
        /* lock the sucker */
        down_write(&s->s_umount);
        /* ... and return the root of (sub)tree on it */
        return path.dentry;
}
EXPORT_SYMBOL(mount_subtree);

/*
 * mount(2): copy the filesystem type, the device name and the option data
 * in from userspace, hand everything to do_mount() and free the kernel
 * copies again.  @dir_name is passed through to do_mount() as a user
 * pointer.  Returns do_mount()'s result, or the error of whichever copy
 * failed first.
 */
SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
                char __user *, type, unsigned long, flags, void __user *, data)
{
        char *fstype;
        char *source;
        void *opts;
        int err;

        fstype = copy_mount_string(type);
        if (IS_ERR(fstype)) {
                err = PTR_ERR(fstype);
                return err;
        }

        source = copy_mount_string(dev_name);
        if (IS_ERR(source)) {
                err = PTR_ERR(source);
                goto free_type;
        }

        opts = copy_mount_options(data);
        if (IS_ERR(opts)) {
                err = PTR_ERR(opts);
                goto free_source;
        }

        err = do_mount(source, dir_name, fstype, flags, opts);

        /* Release the copies in reverse order of acquisition. */
        kfree(opts);
free_source:
        kfree(source);
free_type:
        kfree(fstype);
        return err;
}

/* MOUNT_ATTR_* bits that fsmount() accepts; others yield -EINVAL there. */
#define FSMOUNT_VALID_FLAGS                                                    \
        (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
         MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
         MOUNT_ATTR_NOSYMFOLLOW)

/* Bits allowed in attr_set/attr_clr of struct mount_attr: as above plus idmapping. */
#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)

/* Propagation types that may be requested through struct mount_attr. */
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
        (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)

/*
 * Translate the simple on/off MOUNT_ATTR_* bits in @attr_flags into their
 * MNT_* counterparts.  The atime selection (MOUNT_ATTR__ATIME) and
 * MOUNT_ATTR_IDMAP are not handled here; callers deal with them
 * separately.
 */
static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
{
        unsigned int mnt_flags;

        mnt_flags  = (attr_flags & MOUNT_ATTR_RDONLY)      ? MNT_READONLY    : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NOSUID)      ? MNT_NOSUID      : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NODEV)       ? MNT_NODEV       : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NOEXEC)      ? MNT_NOEXEC      : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NODIRATIME)  ? MNT_NODIRATIME  : 0;
        mnt_flags |= (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) ? MNT_NOSYMFOLLOW : 0;

        return mnt_flags;
}

/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 *
 * @fs_fd:      file descriptor that must refer to a filesystem context
 * @flags:      only FSMOUNT_CLOEXEC is accepted
 * @attr_flags: MOUNT_ATTR_* bits from FSMOUNT_VALID_FLAGS
 *
 * Returns the new file descriptor on success or a negative errno.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
                unsigned int, attr_flags)
{
        struct mnt_namespace *ns;
        struct fs_context *fc;
        struct file *file;
        struct path newmount;
        struct mount *mnt;
        unsigned int mnt_flags = 0;
        long ret;

        if (!may_mount())
                return -EPERM;

        if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
                return -EINVAL;

        if (attr_flags & ~FSMOUNT_VALID_FLAGS)
                return -EINVAL;

        mnt_flags = attr_flags_to_mnt_flags(attr_flags);

        /* The atime setting is an enumeration, not independent bits. */
        switch (attr_flags & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                break;
        case MOUNT_ATTR_NOATIME:
                mnt_flags |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                mnt_flags |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        CLASS(fd, f)(fs_fd);
        if (fd_empty(f))
                return -EBADF;

        /* Only file descriptors backed by fscontext_fops are acceptable. */
        if (fd_file(f)->f_op != &fscontext_fops)
                return -EINVAL;

        fc = fd_file(f)->private_data;

        ret = mutex_lock_interruptible(&fc->uapi_mutex);
        if (ret < 0)
                return ret;

        /* There must be a valid superblock or we can't mount it */
        ret = -EINVAL;
        if (!fc->root)
                goto err_unlock;

        ret = -EPERM;
        if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
                pr_warn("VFS: Mount too revealing\n");
                goto err_unlock;
        }

        ret = -EBUSY;
        if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
                goto err_unlock;

        if (fc->sb_flags & SB_MANDLOCK)
                warn_mandlock();

        newmount.mnt = vfs_create_mount(fc);
        if (IS_ERR(newmount.mnt)) {
                ret = PTR_ERR(newmount.mnt);
                goto err_unlock;
        }
        newmount.dentry = dget(fc->root);
        newmount.mnt->mnt_flags = mnt_flags;

        /* We've done the mount bit - now move the file context into more or
         * less the same state as if we'd done an fspick().  We don't want to
         * do any memory allocation or anything like that at this point as we
         * don't want to have to handle any errors incurred.
         */
        vfs_clean_context(fc);

        /* The new mount lives alone in an anonymous namespace for now. */
        ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
        if (IS_ERR(ns)) {
                ret = PTR_ERR(ns);
                goto err_path;
        }
        mnt = real_mount(newmount.mnt);
        ns->root = mnt;
        ns->nr_mounts = 1;
        mnt_add_to_ns(ns, mnt);
        /*
         * Extra mount reference; the path_put() at err_path below runs on
         * the success path too and drops one reference on newmount.
         */
        mntget(newmount.mnt);

        /* Attach to an apparent O_PATH fd with a note that we need to unmount
         * it, not just simply put it.
         */
        file = dentry_open(&newmount, O_PATH, fc->cred);
        if (IS_ERR(file)) {
                dissolve_on_fput(newmount.mnt);
                ret = PTR_ERR(file);
                goto err_path;
        }
        file->f_mode |= FMODE_NEED_UNMOUNT;

        ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
        if (ret >= 0)
                fd_install(ret, file);
        else
                fput(file);

        /* Success falls through: the labels below are shared cleanup. */
err_path:
        path_put(&newmount);
err_unlock:
        mutex_unlock(&fc->uapi_mutex);
        return ret;
}

/*
 * Common tail of move_mount(): ask the security layer for permission and
 * then either copy the propagation group (MNT_TREE_PROPAGATION set in
 * @mflags) or actually move the mount from @from_path to @to_path.
 * Returns 0 or a negative errno.
 */
static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
                                 enum mnt_tree_flags_t mflags)
{
        int err = security_move_mount(from_path, to_path);

        if (err)
                return err;

        return (mflags & MNT_TREE_PROPAGATION) ?
                do_set_group(from_path, to_path) :
                do_move_mount(from_path, to_path, mflags);
}

/*
 * Move a mount from one place to another.  In combination with
 * fsopen()/fsmount() this is used to install a new mount and in combination
 * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
 * a mount subtree.
 *
 * Note the flags value is a combination of MOVE_MOUNT_* flags.  The
 * MOVE_MOUNT_F_* bits control how the source path is looked up and the
 * MOVE_MOUNT_T_* bits control the lookup of the destination, so the two
 * sets of lookup flags are kept in separate variables.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE5(move_mount,
                int, from_dfd, const char __user *, from_pathname,
                int, to_dfd, const char __user *, to_pathname,
                unsigned int, flags)
{
        struct path to_path __free(path_put) = {};
        struct path from_path __free(path_put) = {};
        struct filename *to_name __free(putname) = NULL;
        struct filename *from_name __free(putname) = NULL;
        unsigned int from_lflags = 0, to_lflags = 0, uflags;
        enum mnt_tree_flags_t mflags = 0;
        int ret = 0;

        if (!may_mount())
                return -EPERM;

        if (flags & ~MOVE_MOUNT__MASK)
                return -EINVAL;

        /* Moving beneath and copying a propagation group are exclusive. */
        if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
            (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
                return -EINVAL;

        if (flags & MOVE_MOUNT_SET_GROUP)
                mflags |= MNT_TREE_PROPAGATION;
        if (flags & MOVE_MOUNT_BENEATH)
                mflags |= MNT_TREE_BENEATH;

        /* Lookup behaviour for the source path. */
        if (flags & MOVE_MOUNT_F_SYMLINKS)
                from_lflags |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
                from_lflags |= LOOKUP_AUTOMOUNT;
        uflags = 0;
        if (flags & MOVE_MOUNT_F_EMPTY_PATH)
                uflags = AT_EMPTY_PATH;
        from_name = getname_maybe_null(from_pathname, uflags);
        if (IS_ERR(from_name))
                return PTR_ERR(from_name);

        /* Lookup behaviour for the destination path. */
        if (flags & MOVE_MOUNT_T_SYMLINKS)
                to_lflags |= LOOKUP_FOLLOW;
        if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
                to_lflags |= LOOKUP_AUTOMOUNT;
        uflags = 0;
        if (flags & MOVE_MOUNT_T_EMPTY_PATH)
                uflags = AT_EMPTY_PATH;
        to_name = getname_maybe_null(to_pathname, uflags);
        if (IS_ERR(to_name))
                return PTR_ERR(to_name);

        if (!to_name && to_dfd >= 0) {
                /* No destination name: operate on the path of @to_dfd itself. */
                CLASS(fd_raw, f_to)(to_dfd);
                if (fd_empty(f_to))
                        return -EBADF;

                to_path = fd_file(f_to)->f_path;
                path_get(&to_path);
        } else {
                ret = filename_lookup(to_dfd, to_name, to_lflags, &to_path, NULL);
                if (ret)
                        return ret;
        }

        if (!from_name && from_dfd >= 0) {
                /* No source name: operate on the path of @from_dfd itself. */
                CLASS(fd_raw, f_from)(from_dfd);
                if (fd_empty(f_from))
                        return -EBADF;

                return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
        }

        /*
         * The source must be resolved with the MOVE_MOUNT_F_* derived flags,
         * not with the ones computed for the destination.
         */
        ret = filename_lookup(from_dfd, from_name, from_lflags, &from_path, NULL);
        if (ret)
                return ret;

        return vfs_move_mount(&from_path, &to_path, mflags);
}

/*
 * Return true if path is reachable from root
 *
 * namespace_sem or mount_lock is held
 */
bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
                         const struct path *root)
{
        while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
                dentry = mnt->mnt_mountpoint;
                mnt = mnt->mnt_parent;
        }
        return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
}

/*
 * Return true if @path1 is reachable from @path2.  This is
 * is_path_reachable() evaluated with mount_lock held exclusively.
 */
bool path_is_under(const struct path *path1, const struct path *path2)
{
        struct mount *start = real_mount(path1->mnt);
        bool under;

        read_seqlock_excl(&mount_lock);
        under = is_path_reachable(start, path1->dentry, path2);
        read_sequnlock_excl(&mount_lock);

        return under;
}
EXPORT_SYMBOL(path_is_under);

/*
 * pivot_root Semantics:
 * Moves the root file system of the current process to the directory put_old,
 * makes new_root as the new root file system of the current process, and sets
 * root/cwd of all processes which had them on the current root to new_root.
 *
 * Restrictions:
 * The new_root and put_old must be directories, and must not be on the
 * same file system as the current process root. The put_old must be
 * underneath new_root, i.e. adding a non-zero number of /.. to the string
 * pointed to by put_old must yield the same directory as new_root. No other
 * file system may be mounted on put_old. After all, new_root is a mountpoint.
 *
 * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
 * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
 * in this situation.
 *
 * Notes:
 *  - we don't move root/cwd if they are not at the root (reason: if something
 *    cared enough to change them, it's probably wrong to force them elsewhere)
 *  - it's okay to pick a root that isn't the root of a file system, e.g.
 *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
 *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
 *    first.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                const char __user *, put_old)
{
        struct path new, old, root;
        struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
        struct mountpoint *old_mp, *root_mp;
        int error;

        if (!may_mount())
                return -EPERM;

        /* Both arguments must resolve to directories; symlinks are followed. */
        error = user_path_at(AT_FDCWD, new_root,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
        if (error)
                goto out0;

        error = user_path_at(AT_FDCWD, put_old,
                             LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
        if (error)
                goto out1;

        error = security_sb_pivotroot(&old, &new);
        if (error)
                goto out2;

        get_fs_root(current->fs, &root);
        old_mp = lock_mount(&old);
        error = PTR_ERR(old_mp);
        if (IS_ERR(old_mp))
                goto out3;

        error = -EINVAL;
        new_mnt = real_mount(new.mnt);
        root_mnt = real_mount(root.mnt);
        old_mnt = real_mount(old.mnt);
        /* Remember the current parents before any mount is re-attached. */
        ex_parent = new_mnt->mnt_parent;
        root_parent = root_mnt->mnt_parent;
        /* None of the mounts being modified may be shared. */
        if (IS_MNT_SHARED(old_mnt) ||
                IS_MNT_SHARED(ex_parent) ||
                IS_MNT_SHARED(root_parent))
                goto out4;
        if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
                goto out4;
        if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
                goto out4;
        error = -ENOENT;
        if (d_unlinked(new.dentry))
                goto out4;
        error = -EBUSY;
        if (new_mnt == root_mnt || old_mnt == root_mnt)
                goto out4; /* loop, on the same file system  */
        error = -EINVAL;
        if (!path_mounted(&root))
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(root_mnt))
                goto out4; /* not attached */
        if (!path_mounted(&new))
                goto out4; /* not a mountpoint */
        if (!mnt_has_parent(new_mnt))
                goto out4; /* not attached */
        /* make sure we can reach put_old from new_root */
        if (!is_path_reachable(old_mnt, old.dentry, &new))
                goto out4;
        /* make certain new is below the root */
        if (!is_path_reachable(new_mnt, new.dentry, &root))
                goto out4;
        /* All checks passed; from here on the operation cannot fail. */
        lock_mount_hash();
        umount_mnt(new_mnt);
        root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
        /* If the old root was locked, the lock moves to the new root. */
        if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
                new_mnt->mnt.mnt_flags |= MNT_LOCKED;
                root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
        }
        /* mount old root on put_old */
        attach_mnt(root_mnt, old_mnt, old_mp, false);
        /* mount new_root on / */
        attach_mnt(new_mnt, root_parent, root_mp, false);
        mnt_add_count(root_parent, -1);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
        /* A moved mount should not expire automatically */
        list_del_init(&new_mnt->mnt_expire);
        put_mountpoint(root_mp);
        unlock_mount_hash();
        mnt_notify_add(root_mnt);
        mnt_notify_add(new_mnt);
        /* Switch root/cwd references from the old root to the new one. */
        chroot_fs_refs(&root, &new);
        error = 0;
out4:
        unlock_mount(old_mp);
        /*
         * NOTE(review): on success only; presumably this drops the reference
         * new_mnt held on its former parent - confirm.
         */
        if (!error)
                mntput_no_expire(ex_parent);
out3:
        path_put(&root);
out2:
        path_put(&old);
out1:
        path_put(&new);
out0:
        return error;
}

/*
 * Compute the mount flags @mnt would have after applying @kattr: the
 * current flags with ->attr_clr removed and ->attr_set added.  When a bit
 * is present in both, setting wins.  Nothing is modified.
 */
static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
{
        unsigned int current_flags = mnt->mnt.mnt_flags;

        return (current_flags & ~kattr->attr_clr) | kattr->attr_set;
}

static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
        struct vfsmount *m = &mnt->mnt;
        struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;

        if (!kattr->mnt_idmap)
                return 0;

        /*
         * Creating an idmapped mount with the filesystem wide idmapping
         * doesn't make sense so block that. We don't allow mushy semantics.
         */
        if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
                return -EINVAL;

        /*
         * We only allow an mount to change it's idmapping if it has
         * never been accessible to userspace.
         */
        if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
                return -EPERM;

        /* The underlying filesystem doesn't support idmapped mounts yet. */
        if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
                return -EINVAL;

        /* The filesystem has turned off idmapped mounts. */
        if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
                return -EINVAL;

        /* We're not controlling the superblock. */
        if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Mount has already been visible in the filesystem hierarchy. */
        if (!is_anon_ns(mnt->mnt_ns))
                return -EINVAL;

        return 0;
}

/**
 * mnt_allow_writers() - check whether the attribute change allows writers
 * @kattr: the new mount attributes
 * @mnt: the mount to which @kattr will be applied
 *
 * Check whether thew new mount attributes in @kattr allow concurrent writers.
 *
 * Return: true if writers need to be held, false if not
 */
static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
                                     const struct mount *mnt)
{
        return (!(kattr->attr_set & MNT_READONLY) ||
                (mnt->mnt.mnt_flags & MNT_READONLY)) &&
               !kattr->mnt_idmap;
}

/*
 * First phase of a mount attribute change: verify that @kattr may be
 * applied to @mnt and, with MOUNT_KATTR_RECURSE, to every mount visited by
 * next_mnt() below it.  Writers are held on each mount for which
 * mnt_allow_writers() says so.
 *
 * Returns 0 if the change can be committed.  On error every writer hold
 * taken so far is released again and a negative errno is returned.
 */
static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
{
        struct mount *m;
        int err;

        for (m = mnt; m; m = next_mnt(m, mnt)) {
                /* Would the resulting flags change something that is locked? */
                if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
                        err = -EPERM;
                        break;
                }

                err = can_idmap_mount(kattr, m);
                if (err)
                        break;

                if (!mnt_allow_writers(kattr, m)) {
                        err = mnt_hold_writers(m);
                        if (err)
                                break;
                }

                /* Non-recursive request: the first mount is all there is. */
                if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
                        return 0;
        }

        /* On a failed walk, m is the mount at which the check failed. */
        if (err) {
                struct mount *p;

                /*
                 * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will
                 * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all
                 * mounts and needs to take care to include the first mount.
                 */
                for (p = mnt; p; p = next_mnt(p, mnt)) {
                        /* If we had to hold writers unblock them. */
                        if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
                                mnt_unhold_writers(p);

                        /*
                         * We're done once the first mount we changed got
                         * MNT_WRITE_HOLD unset.
                         */
                        if (p == m)
                                break;
                }
        }
        return err;
}

/*
 * Install the idmapping from @kattr on @mnt, if one was requested.  A new
 * reference on the requested idmapping is published with release
 * semantics and the reference on the previously installed one is dropped.
 */
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
        struct mnt_idmap *old_idmap;

        if (!kattr->mnt_idmap)
                return;

        /* Remember the current idmapping so it can be put after the switch. */
        old_idmap = mnt_idmap(&mnt->mnt);

        /* Pairs with smp_load_acquire() in mnt_idmap(). */
        smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
        mnt_idmap_put(old_idmap);
}

/*
 * Second phase of a mount attribute change: apply @kattr to @mnt and, with
 * MOUNT_KATTR_RECURSE, to every mount visited by next_mnt() below it.
 * For each mount the idmapping is switched, the recalculated flags are
 * stored, held writers are released and the propagation type is changed
 * if one was requested.  Finally the mount namespace of @mnt is touched.
 */
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
{
        struct mount *m;

        for (m = mnt; m; m = next_mnt(m, mnt)) {
                unsigned int flags;

                do_idmap_mount(kattr, m);
                flags = recalc_flags(kattr, m);
                WRITE_ONCE(m->mnt.mnt_flags, flags);

                /* If we had to hold writers unblock them. */
                if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt_unhold_writers(m);

                if (kattr->propagation)
                        change_mnt_propagation(m, kattr->propagation);
                /* Non-recursive request: stop after the first mount. */
                if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
                        break;
        }
        touch_mnt_namespace(mnt->mnt_ns);
}

/*
 * Apply the attribute change described by @kattr to the mount at @path.
 * @path must refer to the root of a mount.
 *
 * If a user namespace was supplied, an idmapping is allocated for it and
 * stored in @kattr->mnt_idmap; it is not released here, not even on
 * failure, so the caller is expected to call finish_mount_kattr()
 * afterwards in either case.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
{
        struct mount *mnt = real_mount(path->mnt);
        int err = 0;

        if (!path_mounted(path))
                return -EINVAL;

        if (kattr->mnt_userns) {
                struct mnt_idmap *mnt_idmap;

                mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
                if (IS_ERR(mnt_idmap))
                        return PTR_ERR(mnt_idmap);
                kattr->mnt_idmap = mnt_idmap;
        }

        if (kattr->propagation) {
                /*
                 * Only take namespace_lock() if we're actually changing
                 * propagation.
                 */
                namespace_lock();
                /* Making mounts shared requires group ids to exist first. */
                if (kattr->propagation == MS_SHARED) {
                        err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
                        if (err) {
                                namespace_unlock();
                                return err;
                        }
                }
        }

        err = -EINVAL;
        lock_mount_hash();

        /* Ensure that this isn't anything purely vfs internal. */
        if (!is_mounted(&mnt->mnt))
                goto out;

        /*
         * If this is an attached mount make sure it's located in the callers
         * mount namespace. If it's not don't let the caller interact with it.
         *
         * If this mount doesn't have a parent it's most often simply a
         * detached mount with an anonymous mount namespace. IOW, something
         * that's simply not attached yet. But there are apparently also users
         * that do change mount properties on the rootfs itself. That obviously
         * neither has a parent nor is it a detached mount so we cannot
         * unconditionally check for detached mounts.
         */
        if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
                goto out;

        /*
         * First, we get the mount tree in a shape where we can change mount
         * properties without failure. If we succeeded to do so we commit all
         * changes and if we failed we clean up.
         */
        err = mount_setattr_prepare(kattr, mnt);
        if (!err)
                mount_setattr_commit(kattr, mnt);

out:
        unlock_mount_hash();

        if (kattr->propagation) {
                /* Propagation did not change: release group ids again. */
                if (err)
                        cleanup_group_ids(mnt, NULL);
                namespace_unlock();
        }

        return err;
}

/*
 * Translate the MOUNT_ATTR_IDMAP part of a userspace request into @kattr.
 * @usize is accepted but not used by this function.
 *
 * Nothing is done unless MOUNT_ATTR_IDMAP appears in @attr->attr_set or
 * @attr->attr_clr.  Clearing is only permitted with
 * MOUNT_KATTR_IDMAP_REPLACE and, when not combined with setting, selects
 * nop_mnt_idmap.  Otherwise @attr->userns_fd must refer to a user
 * namespace other than init_user_ns over which the caller has
 * CAP_SYS_ADMIN; a reference to it is stored in @kattr->mnt_userns.
 *
 * Returns 0 on success or a negative errno.
 */
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
                                struct mount_kattr *kattr)
{
        struct ns_common *ns;
        struct user_namespace *mnt_userns;

        if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
                return 0;

        if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
                /*
                 * We can only remove an idmapping if it's never been
                 * exposed to userspace.
                 */
                if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
                        return -EINVAL;

                /*
                 * Removal of idmappings is equivalent to setting
                 * nop_mnt_idmap.
                 */
                if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
                        kattr->mnt_idmap = &nop_mnt_idmap;
                        return 0;
                }
        }

        /* Reject values that cannot be a valid file descriptor number. */
        if (attr->userns_fd > INT_MAX)
                return -EINVAL;

        CLASS(fd, f)(attr->userns_fd);
        if (fd_empty(f))
                return -EBADF;

        /* The descriptor has to be a namespace file for a user namespace. */
        if (!proc_ns_file(fd_file(f)))
                return -EINVAL;

        ns = get_proc_ns(file_inode(fd_file(f)));
        if (ns->ops->type != CLONE_NEWUSER)
                return -EINVAL;

        /*
         * The initial idmapping cannot be used to create an idmapped
         * mount. We use the initial idmapping as an indicator of a mount
         * that is not idmapped. It can simply be passed into helpers that
         * are aware of idmapped mounts as a convenient shortcut. A user
         * can just create a dedicated identity mapping to achieve the same
         * result.
         */
        mnt_userns = container_of(ns, struct user_namespace, ns);
        if (mnt_userns == &init_user_ns)
                return -EPERM;

        /* We're not controlling the target namespace. */
        if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
                return -EPERM;

        kattr->mnt_userns = get_user_ns(mnt_userns);
        return 0;
}

/*
 * Validate the userspace request in @attr and convert it into the kernel
 * representation @kattr: propagation type, flags to set, flags to clear
 * and, via build_mount_idmapped(), the idmapping.
 *
 * Returns 0 on success or a negative errno.  @kattr may already have been
 * partially filled in when an error is returned.
 */
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
                             struct mount_kattr *kattr)
{
        /* At most one of the known propagation types may be requested. */
        if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
                return -EINVAL;
        if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
                return -EINVAL;
        kattr->propagation = attr->propagation;

        if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
                return -EINVAL;

        kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
        kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);

        /*
         * The MOUNT_ATTR_<atime> values form an enumeration rather than a
         * bitmap.  Switching to another atime mode therefore requires the
         * complete MOUNT_ATTR__ATIME mask in @attr_clr together with the
         * new mode in @attr_set.  Without the mask in @attr_clr no atime
         * bits are allowed in @attr_set, and a partial mask is rejected.
         */
        if (!(attr->attr_clr & MOUNT_ATTR__ATIME)) {
                if (attr->attr_set & MOUNT_ATTR__ATIME)
                        return -EINVAL;
                return build_mount_idmapped(attr, usize, kattr);
        }

        if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
                return -EINVAL;

        /* The atime modes exclude each other: drop whatever was set. */
        kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;

        switch (attr->attr_set & MOUNT_ATTR__ATIME) {
        case MOUNT_ATTR_STRICTATIME:
                /* Strict atime is the absence of both MNT_* bits. */
                break;
        case MOUNT_ATTR_NOATIME:
                kattr->attr_set |= MNT_NOATIME;
                break;
        case MOUNT_ATTR_RELATIME:
                kattr->attr_set |= MNT_RELATIME;
                break;
        default:
                return -EINVAL;
        }

        return build_mount_idmapped(attr, usize, kattr);
}

/*
 * Release the references held in @kattr: the user namespace (the pointer
 * is reset to NULL afterwards) and the idmapping (the pointer is left as
 * it is).
 */
static void finish_mount_kattr(struct mount_kattr *kattr)
{
        if (kattr->mnt_userns) {
                put_user_ns(kattr->mnt_userns);
                kattr->mnt_userns = NULL;
        }

        if (kattr->mnt_idmap)
                mnt_idmap_put(kattr->mnt_idmap);
}

/*
 * Copy a struct mount_attr of @usize bytes in from @uattr and turn it into
 * @kattr.
 *
 * Returns 1 if @kattr describes a change that should be applied, 0 if the
 * request is a no-op (nothing to set, clear or propagate) and a negative
 * errno on failure: -E2BIG or -EINVAL for an unacceptable @usize, -EPERM
 * if the caller may not mount, or whatever copying or validating the
 * attributes reported.
 */
static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize,
                               struct mount_kattr *kattr)
{
        struct mount_attr attr;
        int err;

        BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
                return -EINVAL;

        if (!may_mount())
                return -EPERM;

        err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
        if (err)
                return err;

        /* An all-zero request changes nothing; tell the caller to skip it. */
        if (!(attr.attr_set | attr.attr_clr | attr.propagation))
                return 0;

        err = build_mount_kattr(&attr, usize, kattr);

        return err < 0 ? err : 1;
}

/*
 * mount_setattr(2): change the attributes of the mount found at
 * (@dfd, @path), optionally for the whole subtree (AT_RECURSIVE).
 *
 * Symlinks are followed and automounts triggered by default;
 * AT_SYMLINK_NOFOLLOW and AT_NO_AUTOMOUNT switch that off and
 * AT_EMPTY_PATH permits an empty @path.  Returns 0 on success (including a
 * request that changes nothing) or a negative errno.
 */
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
                unsigned int, flags, struct mount_attr __user *, uattr,
                size_t, usize)
{
        unsigned int lookup_flags = 0;
        struct mount_kattr kattr;
        struct path target;
        int err;

        if (flags & ~(AT_EMPTY_PATH | AT_RECURSIVE |
                      AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT))
                return -EINVAL;

        if (!(flags & AT_NO_AUTOMOUNT))
                lookup_flags |= LOOKUP_AUTOMOUNT;
        if (!(flags & AT_SYMLINK_NOFOLLOW))
                lookup_flags |= LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                lookup_flags |= LOOKUP_EMPTY;

        kattr = (struct mount_kattr) {
                .lookup_flags        = lookup_flags,
                .kflags                = (flags & AT_RECURSIVE) ? MOUNT_KATTR_RECURSE : 0,
        };

        /* Zero means "nothing to do", negative is an error: both end here. */
        err = wants_mount_setattr(uattr, usize, &kattr);
        if (err <= 0)
                return err;

        err = user_path_at(dfd, path, kattr.lookup_flags, &target);
        if (err) {
                finish_mount_kattr(&kattr);
                return err;
        }

        err = do_mount_setattr(&target, &kattr);
        path_put(&target);
        finish_mount_kattr(&kattr);

        return err;
}

/*
 * open_tree_attr(2): open a mount tree like open_tree() and, if @uattr is
 * given, apply the mount attributes it describes to the result before a
 * file descriptor is handed out.
 *
 * @usize must be zero when @uattr is NULL.  Returns the new file
 * descriptor on success or a negative errno.
 */
SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
                unsigned, flags, struct mount_attr __user *, uattr,
                size_t, usize)
{
        struct file __free(fput) *file = NULL;
        int fd;

        if (!uattr && usize)
                return -EINVAL;

        file = vfs_open_tree(dfd, filename, flags);
        if (IS_ERR(file))
                return PTR_ERR(file);

        if (uattr) {
                int ret;
                struct mount_kattr kattr = {};

                /* An existing idmapping may be replaced on this path. */
                kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
                if (flags & AT_RECURSIVE)
                        kattr.kflags |= MOUNT_KATTR_RECURSE;

                ret = wants_mount_setattr(uattr, usize, &kattr);
                if (ret > 0) {
                        ret = do_mount_setattr(&file->f_path, &kattr);
                        /*
                         * Release the user namespace and idmapping
                         * references whether or not the change succeeded;
                         * returning early on failure would leak them.
                         */
                        finish_mount_kattr(&kattr);
                }
                if (ret)
                        return ret;
        }

        fd = get_unused_fd_flags(flags & O_CLOEXEC);
        if (fd < 0)
                return fd;

        /* The descriptor table now owns the file; disarm the auto-fput. */
        fd_install(fd, no_free_ptr(file));
        return fd;
}

int show_path(struct seq_file *m, struct dentry *root)
{
        if (root->d_sb->s_op->show_path)
                return root->d_sb->s_op->show_path(m, root);

        seq_dentry(m, root, " \t\n\\");
        return 0;
}

/*
 * Find the mount whose unique id is exactly @id in @ns.
 *
 * mnt_find_id_at() may hand back a mount with a different id, so the
 * result is only accepted on an exact match.  Returns NULL otherwise.
 */
static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
{
        struct mount *found = mnt_find_id_at(ns, id);

        if (found && found->mnt_id_unique == id)
                return &found->mnt;

        return NULL;
}

/*
 * Kernel-side working state for one statmount() request.
 */
struct kstatmount {
        /* User buffer the reply is copied to, and its size in bytes. */
        struct statmount __user *buf;
        size_t bufsize;
        /* Mount being described; looked up in do_statmount(). */
        struct vfsmount *mnt;
        /* Idmapping of @mnt, taken from mnt_idmap() in do_statmount(). */
        struct mnt_idmap *idmap;
        /* STATMOUNT_* bits requested by the caller (mnt_id_req.param). */
        u64 mask;
        /* Root that the mount point is reported relative to. */
        struct path root;
        /* Accumulates the string data that follows the fixed reply. */
        struct seq_file seq;

        /* Must be last -- ends in a flexible-array member. */
        struct statmount sm;
};

/*
 * Translate the MNT_* flags of @mnt into the MOUNT_ATTR_* representation.
 *
 * The mount flags are sampled once with READ_ONCE().  Exactly one of the
 * three access-time policies is reported, with NOATIME taking precedence
 * over RELATIME and STRICTATIME being the fallback.
 */
static u64 mnt_to_attr_flags(struct vfsmount *mnt)
{
        unsigned int flags = READ_ONCE(mnt->mnt_flags);
        u64 attr = 0;

        attr |= (flags & MNT_READONLY) ? MOUNT_ATTR_RDONLY : 0;
        attr |= (flags & MNT_NOSUID) ? MOUNT_ATTR_NOSUID : 0;
        attr |= (flags & MNT_NODEV) ? MOUNT_ATTR_NODEV : 0;
        attr |= (flags & MNT_NOEXEC) ? MOUNT_ATTR_NOEXEC : 0;
        attr |= (flags & MNT_NODIRATIME) ? MOUNT_ATTR_NODIRATIME : 0;
        attr |= (flags & MNT_NOSYMFOLLOW) ? MOUNT_ATTR_NOSYMFOLLOW : 0;

        if (flags & MNT_NOATIME) {
                attr |= MOUNT_ATTR_NOATIME;
        } else if (flags & MNT_RELATIME) {
                attr |= MOUNT_ATTR_RELATIME;
        } else {
                attr |= MOUNT_ATTR_STRICTATIME;
        }

        if (is_idmapped_mnt(mnt))
                attr |= MOUNT_ATTR_IDMAP;

        return attr;
}

/*
 * Describe the propagation type of @m as a combination of MS_SHARED,
 * MS_SLAVE and MS_UNBINDABLE.  A mount that is none of those is reported
 * as MS_PRIVATE.
 */
static u64 mnt_to_propagation_flags(struct mount *m)
{
        u64 flags = 0;

        if (IS_MNT_SHARED(m))
                flags |= MS_SHARED;
        if (IS_MNT_SLAVE(m))
                flags |= MS_SLAVE;
        if (IS_MNT_UNBINDABLE(m))
                flags |= MS_UNBINDABLE;

        return flags ? flags : MS_PRIVATE;
}

static void statmount_sb_basic(struct kstatmount *s)
{
        struct super_block *sb = s->mnt->mnt_sb;

        s->sm.mask |= STATMOUNT_SB_BASIC;
        s->sm.sb_dev_major = MAJOR(sb->s_dev);
        s->sm.sb_dev_minor = MINOR(sb->s_dev);
        s->sm.sb_magic = sb->s_magic;
        s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
}

static void statmount_mnt_basic(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);

        s->sm.mask |= STATMOUNT_MNT_BASIC;
        s->sm.mnt_id = m->mnt_id_unique;
        s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
        s->sm.mnt_id_old = m->mnt_id;
        s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
        s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
        s->sm.mnt_propagation = mnt_to_propagation_flags(m);
        s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
        s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
}

/*
 * Report STATMOUNT_PROPAGATE_FROM.  The flag is always raised; the value
 * is only filled in for slave mounts, using get_dominating_id() relative
 * to the caller's fs root.
 */
static void statmount_propagate_from(struct kstatmount *s)
{
        struct mount *m = real_mount(s->mnt);

        s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
        if (!IS_MNT_SLAVE(m))
                return;

        s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
}

/*
 * Append the root of the mount (as produced by show_path()) to @seq.
 *
 * Returns 0 on success, the error from show_path(), or -EAGAIN when @seq
 * overflowed and the caller should retry with a larger buffer.
 */
static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
{
        size_t mark = seq->count;
        int len;
        int err;

        err = show_path(seq, s->mnt->mnt_root);
        if (err)
                return err;

        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        /*
         * show_path() hands back an escaped string.  Terminate what was
         * written, rewind to where it started and commit only the length
         * of the unescaped result.  Not escaping in the first place would
         * be nicer, but that is a pretty invasive change.
         */
        seq->buf[seq->count] = '\0';
        seq->count = mark;
        len = string_unescape_inplace(seq->buf + mark, UNESCAPE_OCTAL);
        seq_commit(seq, len);
        return 0;
}

/*
 * Append the mount point of s->mnt, relative to s->root, to @seq.
 * SEQ_SKIP from seq_path_root() is not treated as an error.
 */
static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
{
        struct path mnt_path = {
                .mnt    = s->mnt,
                .dentry = s->mnt->mnt_root,
        };
        int err;

        err = seq_path_root(seq, &mnt_path, &s->root, "");
        if (err == SEQ_SKIP)
                return 0;

        return err;
}

/* Append the filesystem type name of s->mnt to @seq.  Always returns 0. */
static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
{
        seq_puts(seq, s->mnt->mnt_sb->s_type->name);
        return 0;
}

/* Append the filesystem subtype to @seq; emits nothing if there is none. */
static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
{
        const char *subtype = s->mnt->mnt_sb->s_subtype;

        if (!subtype)
                return;

        seq_puts(seq, subtype);
}

/*
 * Append the mount source to @seq.
 *
 * A filesystem ->show_devname() hook is preferred and its output is
 * unescaped in place; without the hook the mount's recorded device name
 * is used, if any.  Returns 0, the hook's error, or -EAGAIN when @seq
 * overflowed.
 */
static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
{
        struct super_block *sb = s->mnt->mnt_sb;
        struct mount *r = real_mount(s->mnt);
        size_t mark;
        int err;

        if (!sb->s_op->show_devname) {
                if (r->mnt_devname)
                        seq_puts(seq, r->mnt_devname);
                return 0;
        }

        mark = seq->count;
        err = sb->s_op->show_devname(seq, s->mnt->mnt_root);
        if (err)
                return err;

        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        /* Terminate, rewind and commit only the unescaped length. */
        seq->buf[seq->count] = '\0';
        seq->count = mark;
        seq_commit(seq, string_unescape_inplace(seq->buf + mark, UNESCAPE_OCTAL));
        return 0;
}

static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
        s->sm.mask |= STATMOUNT_MNT_NS_ID;
        s->sm.mnt_ns_id = ns->seq;
}

/*
 * Append the mount options (security options followed by the filesystem's
 * own ->show_options() output) to @seq as one comma-separated string.
 *
 * Returns 0, the error of either producer, or -EAGAIN when @seq
 * overflowed.
 */
static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
{
        struct super_block *sb = s->mnt->mnt_sb;
        size_t mark = seq->count;
        char *opts;
        int err;

        err = security_sb_show_options(seq, sb);
        if (err)
                return err;

        if (sb->s_op->show_options) {
                err = sb->s_op->show_options(seq, s->mnt->mnt_root);
                if (err)
                        return err;
        }

        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        /* nothing was written: leave the buffer untouched */
        if (seq->count == mark)
                return 0;

        /* drop the leading comma by shifting the rest down one byte */
        opts = seq->buf + mark;
        memmove(opts, opts + 1, seq->count - mark - 1);
        seq->count--;

        return 0;
}

/*
 * Turn the comma-separated option text that was appended to @seq starting
 * at offset @start into a sequence of NUL-terminated, unescaped strings,
 * rewriting the buffer in place.
 *
 * The text is expected to begin with a comma, which is skipped.  On return
 * seq->count points just past the last option's characters, i.e. the final
 * terminator is not counted.
 *
 * Returns the number of options, 0 if there was nothing after the leading
 * comma (seq->count is reset to @start), -EAGAIN if @seq overflowed, or
 * -EOVERFLOW if the option count would reach INT_MAX.
 */
static inline int statmount_opt_process(struct seq_file *seq, size_t start)
{
        char *buf_end, *opt_end, *src, *dst;
        int count = 0;

        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        buf_end = seq->buf + seq->count;
        dst = seq->buf + start;
        src = dst + 1;        /* skip initial comma */

        /* Nothing but (at most) the comma was written: emit nothing. */
        if (src >= buf_end) {
                seq->count = start;
                return 0;
        }

        /* Terminate the text so strchrnul() stops at buf_end at the latest. */
        *buf_end = '\0';
        for (; src < buf_end; src = opt_end + 1) {
                /* Cut the current option off at the next comma (or the end). */
                opt_end = strchrnul(src, ',');
                *opt_end = '\0';
                /* Unescape towards the front; +1 steps over the terminator. */
                dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1;
                if (WARN_ON_ONCE(++count == INT_MAX))
                        return -EOVERFLOW;
        }
        /* dst is one past the last terminator; do not count that terminator. */
        seq->count = dst - 1 - seq->buf;
        return count;
}

/*
 * Append the filesystem's mount options to @seq as an array of
 * NUL-separated strings and record how many there are in sm.opt_num.
 * Emits nothing when the filesystem has no ->show_options().
 */
static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
{
        struct super_block *sb = s->mnt->mnt_sb;
        size_t mark = seq->count;
        int nr;

        if (!sb->s_op->show_options)
                return 0;

        nr = sb->s_op->show_options(seq, s->mnt->mnt_root);
        if (nr)
                return nr;

        nr = statmount_opt_process(seq, mark);
        if (nr >= 0) {
                s->sm.opt_num = nr;
                return 0;
        }

        return nr;
}

/*
 * Append the security mount options to @seq as an array of NUL-separated
 * strings and record how many there are in sm.opt_sec_num.
 */
static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
{
        size_t mark = seq->count;
        int nr;

        nr = security_sb_show_options(seq, s->mnt->mnt_sb);
        if (nr)
                return nr;

        nr = statmount_opt_process(seq, mark);
        if (nr >= 0) {
                s->sm.opt_sec_num = nr;
                return 0;
        }

        return nr;
}

/*
 * Emit the uid mappings of the mount's idmapping into @seq and store the
 * number of mappings in sm.mnt_uidmap_num.  Returns 0 or a negative error
 * from statmount_mnt_idmap().
 */
static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
{
        int nr = statmount_mnt_idmap(s->idmap, seq, true);

        if (nr < 0)
                return nr;

        s->sm.mnt_uidmap_num = nr;

        /*
         * For an idmapped mount the flag is raised even when no mapping
         * could be emitted, so that userspace can tell "not idmapped"
         * apart from "idmapped, but none of the mappings are valid in
         * the caller's idmapping".
         */
        if (is_valid_mnt_idmap(s->idmap))
                s->sm.mask |= STATMOUNT_MNT_UIDMAP;

        return 0;
}

/*
 * Emit the gid mappings of the mount's idmapping into @seq and store the
 * number of mappings in sm.mnt_gidmap_num.  Returns 0 or a negative error
 * from statmount_mnt_idmap().
 */
static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
{
        int nr = statmount_mnt_idmap(s->idmap, seq, false);

        if (nr < 0)
                return nr;

        s->sm.mnt_gidmap_num = nr;

        /*
         * For an idmapped mount the flag is raised even when no mapping
         * could be emitted, so that userspace can tell "not idmapped"
         * apart from "idmapped, but none of the mappings are valid in
         * the caller's idmapping".
         */
        if (is_valid_mnt_idmap(s->idmap))
                s->sm.mask |= STATMOUNT_MNT_GIDMAP;

        return 0;
}

/*
 * statmount_string() - emit one string-valued statmount field
 * @s:    request state; strings are appended to s->seq
 * @flag: the single STATMOUNT_* string flag to produce
 *
 * Appends the string for @flag to the seq buffer.  If something was
 * emitted, the string is NUL-terminated, its offset within the string area
 * is stored in the matching statmount field and @flag is raised in sm.mask.
 * The very first byte of the string area is reserved as an empty string so
 * that offsets left at zero refer to "".
 *
 * Return: 0 on success (including "nothing to emit"), -EAGAIN if the seq
 * buffer overflowed and the caller should retry with a larger one,
 * -EOVERFLOW if the reply would not fit into the user buffer, -EINVAL for
 * an unknown @flag, or the error of the individual producer.
 */
static int statmount_string(struct kstatmount *s, u64 flag)
{
        int ret = 0;
        size_t kbufsize;
        struct seq_file *seq = &s->seq;
        struct statmount *sm = &s->sm;
        u32 start, *offp;

        /* Reserve an empty string at the beginning for any unset offsets */
        if (!seq->count)
                seq_putc(seq, 0);

        start = seq->count;

        switch (flag) {
        case STATMOUNT_FS_TYPE:
                offp = &sm->fs_type;
                ret = statmount_fs_type(s, seq);
                break;
        case STATMOUNT_MNT_ROOT:
                offp = &sm->mnt_root;
                ret = statmount_mnt_root(s, seq);
                break;
        case STATMOUNT_MNT_POINT:
                offp = &sm->mnt_point;
                ret = statmount_mnt_point(s, seq);
                break;
        case STATMOUNT_MNT_OPTS:
                offp = &sm->mnt_opts;
                ret = statmount_mnt_opts(s, seq);
                break;
        case STATMOUNT_OPT_ARRAY:
                offp = &sm->opt_array;
                ret = statmount_opt_array(s, seq);
                break;
        case STATMOUNT_OPT_SEC_ARRAY:
                offp = &sm->opt_sec_array;
                ret = statmount_opt_sec_array(s, seq);
                break;
        case STATMOUNT_FS_SUBTYPE:
                offp = &sm->fs_subtype;
                statmount_fs_subtype(s, seq);
                break;
        case STATMOUNT_SB_SOURCE:
                offp = &sm->sb_source;
                ret = statmount_sb_source(s, seq);
                break;
        case STATMOUNT_MNT_UIDMAP:
                /*
                 * The offset is recorded up front because the helper may
                 * raise the flag without emitting any mapping.  offp must
                 * still be set: the common tail below stores through it
                 * whenever something was emitted.
                 */
                offp = &sm->mnt_uidmap;
                sm->mnt_uidmap = start;
                ret = statmount_mnt_uidmap(s, seq);
                break;
        case STATMOUNT_MNT_GIDMAP:
                /* Same as the uidmap case above. */
                offp = &sm->mnt_gidmap;
                sm->mnt_gidmap = start;
                ret = statmount_mnt_gidmap(s, seq);
                break;
        default:
                WARN_ON_ONCE(true);
                return -EINVAL;
        }

        /*
         * If nothing was emitted, return to avoid setting the flag
         * and terminating the buffer.
         */
        if (seq->count == start)
                return ret;
        /* The fixed part plus all strings so far must fit the user buffer. */
        if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
                return -EOVERFLOW;
        if (kbufsize >= s->bufsize)
                return -EOVERFLOW;

        /* signal a retry */
        if (unlikely(seq_has_overflowed(seq)))
                return -EAGAIN;

        if (ret)
                return ret;

        seq->buf[seq->count++] = '\0';
        sm->mask |= flag;
        *offp = start;
        return 0;
}

static int copy_statmount_to_user(struct kstatmount *s)
{
        struct statmount *sm = &s->sm;
        struct seq_file *seq = &s->seq;
        char __user *str = ((char __user *)s->buf) + sizeof(*sm);
        size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));

        if (seq->count && copy_to_user(str, seq->buf, seq->count))
                return -EFAULT;

        /* Return the number of bytes copied to the buffer */
        sm->size = copysize + seq->count;
        if (copy_to_user(s->buf, sm, copysize))
                return -EFAULT;

        return 0;
}

/*
 * Step from @curr to its neighbour in the rbtree it is linked into via
 * mnt_node: the previous node when @reverse is set, the next one otherwise.
 * The node is converted with node_to_mount().
 */
static struct mount *listmnt_next(struct mount *curr, bool reverse)
{
        struct rb_node *neighbour = reverse ? rb_prev(&curr->mnt_node)
                                            : rb_next(&curr->mnt_node);

        return node_to_mount(neighbour);
}

/*
 * Pick the root against which mounts in @ns are resolved and take a
 * reference on it in @root.
 *
 * For the caller's own namespace this is simply the caller's fs root.
 * For a foreign namespace it is the first mount found whose parent is the
 * namespace root.  Must be called with namespace_sem held.
 *
 * Returns 0 on success or -ENOENT if no such mount exists.
 */
static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
{
        struct mount *ns_root, *pos;

        rwsem_assert_held(&namespace_sem);

        /* Our own namespace: get_fs_root() is all that is needed. */
        if (ns == current->nsproxy->mnt_ns) {
                get_fs_root(current->fs, root);
                return 0;
        }

        /* The namespace may have been emptied already. */
        if (mnt_ns_empty(ns))
                return -ENOENT;

        /* Walk forward from the namespace root to its first child. */
        ns_root = ns->root;
        pos = listmnt_next(ns_root, false);
        while (pos && pos->mnt_parent != ns_root)
                pos = listmnt_next(pos, false);

        if (!pos)
                return -ENOENT;

        root->mnt = mntget(&pos->mnt);
        root->dentry = dget(root->mnt->mnt_root);
        return 0;
}

/*
 * All STATMOUNT_* flags that do_statmount() can return.  This is what is
 * reported in statmount.supported_mask and what the returned mask is
 * checked against.
 *
 * This must be updated whenever a new flag is added.
 */
#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
                             STATMOUNT_MNT_BASIC | \
                             STATMOUNT_PROPAGATE_FROM | \
                             STATMOUNT_MNT_ROOT | \
                             STATMOUNT_MNT_POINT | \
                             STATMOUNT_FS_TYPE | \
                             STATMOUNT_MNT_NS_ID | \
                             STATMOUNT_MNT_OPTS | \
                             STATMOUNT_FS_SUBTYPE | \
                             STATMOUNT_SB_SOURCE | \
                             STATMOUNT_OPT_ARRAY | \
                             STATMOUNT_OPT_SEC_ARRAY | \
                             STATMOUNT_SUPPORTED_MASK | \
                             STATMOUNT_MNT_UIDMAP | \
                             STATMOUNT_MNT_GIDMAP)

/*
 * do_statmount() - gather everything requested in s->mask for one mount
 * @s:         request state; results go to s->sm and s->seq
 * @mnt_id:    unique id of the mount to describe
 * @mnt_ns_id: namespace id from the request (non-zero if one was given)
 * @ns:        mount namespace to look the mount up in
 *
 * Return: 0 on success, -ENOENT if the namespace is empty or the mount is
 * not found, -EPERM if the mount is not reachable from the chosen root and
 * the caller lacks CAP_SYS_ADMIN over the namespace, or the error of the
 * security hook / one of the string producers.
 */
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
                        struct mnt_namespace *ns)
{
        /* String-valued fields, in the order they are emitted. */
        static const u64 string_fields[] = {
                STATMOUNT_FS_TYPE,
                STATMOUNT_MNT_ROOT,
                STATMOUNT_MNT_POINT,
                STATMOUNT_MNT_OPTS,
                STATMOUNT_OPT_ARRAY,
                STATMOUNT_OPT_SEC_ARRAY,
                STATMOUNT_FS_SUBTYPE,
                STATMOUNT_SB_SOURCE,
                STATMOUNT_MNT_UIDMAP,
                STATMOUNT_MNT_GIDMAP,
        };
        struct path root __free(path_put) = {};
        struct mount *m;
        size_t i;
        int err;

        /* Has the namespace already been emptied? */
        if (mnt_ns_id && mnt_ns_empty(ns))
                return -ENOENT;

        s->mnt = lookup_mnt_in_ns(mnt_id, ns);
        if (!s->mnt)
                return -ENOENT;

        err = grab_requested_root(ns, &root);
        if (err)
                return err;

        /*
         * Use the noaudit capability check: this only decides which
         * mounts are shown and must not generate audit denials.
         */
        m = real_mount(s->mnt);
        if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
            !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        err = security_sb_statfs(s->mnt->mnt_root);
        if (err)
                return err;

        s->root = root;
        s->idmap = mnt_idmap(s->mnt);

        /* Fixed-size fields cannot fail. */
        if (s->mask & STATMOUNT_SB_BASIC)
                statmount_sb_basic(s);
        if (s->mask & STATMOUNT_MNT_BASIC)
                statmount_mnt_basic(s);
        if (s->mask & STATMOUNT_PROPAGATE_FROM)
                statmount_propagate_from(s);

        /* The first failing string aborts the whole request. */
        for (i = 0; i < sizeof(string_fields) / sizeof(string_fields[0]); i++) {
                if (!(s->mask & string_fields[i]))
                        continue;

                err = statmount_string(s, string_fields[i]);
                if (err)
                        return err;
        }

        if (s->mask & STATMOUNT_MNT_NS_ID)
                statmount_mnt_ns_id(s, ns);

        if (s->mask & STATMOUNT_SUPPORTED_MASK) {
                s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
                s->sm.supported_mask = STATMOUNT_SUPPORTED;
        }

        /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
        WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);

        return 0;
}

/*
 * Decide whether statmount() should be retried with a larger seq buffer.
 *
 * Only -EAGAIN triggers a retry.  *seq_size is doubled; the retry is
 * refused if the doubling overflows or the new size exceeds MAX_RW_COUNT.
 */
static inline bool retry_statmount(const long ret, size_t *seq_size)
{
        if (likely(ret != -EAGAIN))
                return false;

        if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
                return false;

        return likely(*seq_size <= MAX_RW_COUNT);
}

/* Request flags whose result is string data and therefore needs a seq buffer. */
#define STATMOUNT_STRING_REQ (STATMOUNT_FS_TYPE | \
                              STATMOUNT_MNT_ROOT | \
                              STATMOUNT_MNT_POINT | \
                              STATMOUNT_MNT_OPTS | \
                              STATMOUNT_OPT_ARRAY | \
                              STATMOUNT_OPT_SEC_ARRAY | \
                              STATMOUNT_FS_SUBTYPE | \
                              STATMOUNT_SB_SOURCE | \
                              STATMOUNT_MNT_UIDMAP | \
                              STATMOUNT_MNT_GIDMAP)

/*
 * Reset @ks for one statmount() attempt.
 *
 * The user buffer is checked with access_ok(), @ks is zeroed and filled
 * from the request, and - only if a string field was requested - a seq
 * buffer of @seq_size bytes is allocated.
 *
 * Returns 0, -EFAULT for an inaccessible buffer, -EOVERFLOW if strings
 * were requested but the buffer is exactly sizeof(struct statmount), or
 * -ENOMEM.
 */
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
                              struct statmount __user *buf, size_t bufsize,
                              size_t seq_size)
{
        if (!access_ok(buf, bufsize))
                return -EFAULT;

        memset(ks, 0, sizeof(*ks));
        ks->buf = buf;
        ks->bufsize = bufsize;
        ks->mask = kreq->param;

        /* No strings wanted, no seq buffer needed. */
        if (!(ks->mask & STATMOUNT_STRING_REQ))
                return 0;

        if (bufsize == sizeof(ks->sm))
                return -EOVERFLOW;

        ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
        if (!ks->seq.buf)
                return -ENOMEM;

        ks->seq.size = seq_size;
        return 0;
}

/*
 * Copy a struct mnt_id_req from userspace into @kreq and validate it.
 *
 * The size announced by userspace must lie between MNT_ID_REQ_SIZE_VER0
 * and PAGE_SIZE.  The spare field must be zero and the mount id must be a
 * valid unique id.
 *
 * Returns 0, -EFAULT, -E2BIG, -EINVAL or the error of
 * copy_struct_from_user().
 */
static int copy_mnt_id_req(const struct mnt_id_req __user *req,
                           struct mnt_id_req *kreq)
{
        size_t usize;
        int err;

        BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);

        if (get_user(usize, &req->size))
                return -EFAULT;

        if (unlikely(usize > PAGE_SIZE))
                return -E2BIG;
        if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
                return -EINVAL;

        memset(kreq, 0, sizeof(*kreq));
        err = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
        if (err)
                return err;

        if (kreq->spare != 0)
                return -EINVAL;

        /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
        if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
                return -EINVAL;

        return 0;
}

/*
 * Resolve the mount namespace a request refers to.
 *
 * A namespace id is looked up with lookup_mnt_ns() and that result is
 * returned as is.  Otherwise the namespace comes from the file descriptor
 * in @kreq->spare, or is the caller's own namespace when neither is given;
 * in both of these cases a passive reference is taken before returning.
 *
 * Specifying both an id and a file descriptor yields ERR_PTR(-EINVAL); a
 * bad descriptor yields ERR_PTR(-EBADF) and one that is not a mount
 * namespace file ERR_PTR(-EINVAL).
 */
static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
{
        struct mnt_namespace *mnt_ns;

        if (kreq->mnt_ns_id) {
                /* id and fd are mutually exclusive */
                if (kreq->spare)
                        return ERR_PTR(-EINVAL);

                return lookup_mnt_ns(kreq->mnt_ns_id);
        }

        if (!kreq->spare) {
                mnt_ns = current->nsproxy->mnt_ns;
        } else {
                struct ns_common *ns;

                CLASS(fd, f)(kreq->spare);
                if (fd_empty(f))
                        return ERR_PTR(-EBADF);

                if (!proc_ns_file(fd_file(f)))
                        return ERR_PTR(-EINVAL);

                ns = get_proc_ns(file_inode(fd_file(f)));
                if (ns->ops->type != CLONE_NEWNS)
                        return ERR_PTR(-EINVAL);

                mnt_ns = to_mnt_ns(ns);
        }

        refcount_inc(&mnt_ns->passive);
        return mnt_ns;
}

/*
 * statmount() - return information about a single mount
 * @req:     which mount (and optionally which mount namespace) to describe
 *           and which STATMOUNT_* fields are wanted
 * @buf:     user buffer for struct statmount followed by string data
 * @bufsize: size of @buf in bytes
 * @flags:   must be zero
 *
 * Return: 0 on success or a negative errno.  A foreign mount namespace
 * selected by id that the caller has no CAP_SYS_ADMIN over is reported as
 * -ENOENT, the same as a namespace that does not exist.
 */
SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
                struct statmount __user *, buf, size_t, bufsize,
                unsigned int, flags)
{
        struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
        struct kstatmount *ks __free(kfree) = NULL;
        struct mnt_id_req kreq;
        /*
         * Initial size of the string buffer; it is doubled by
         * retry_statmount() whenever the strings do not fit.
         */
        size_t seq_size = 3 * PATH_MAX;
        int ret;

        if (flags)
                return -EINVAL;

        ret = copy_mnt_id_req(req, &kreq);
        if (ret)
                return ret;

        ns = grab_requested_mnt_ns(&kreq);
        if (IS_ERR(ns)) {
                /*
                 * grab_requested_mnt_ns() can return an error pointer as
                 * well as NULL.  Clear @ns before returning so that the
                 * __free(mnt_ns_release) handler never sees the error
                 * pointer.
                 */
                ret = PTR_ERR(ns);
                ns = NULL;
                return ret;
        }
        if (!ns)
                return -ENOENT;

        if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
            !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
                return -ENOENT;

        ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
        if (!ks)
                return -ENOMEM;

retry:
        ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
        if (ret)
                return ret;

        scoped_guard(rwsem_read, &namespace_sem)
                ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);

        /* Copying to userspace happens after namespace_sem is dropped. */
        if (!ret)
                ret = copy_statmount_to_user(ks);
        kvfree(ks->seq.buf);
        if (retry_statmount(ret, &seq_size))
                goto retry;
        return ret;
}

/*
 * do_listmount() - collect the ids of mounts below a given mount
 * @ns:            mount namespace to walk
 * @mnt_parent_id: unique id of the mount to list below, or LSMT_ROOT
 * @last_mnt_id:   id to continue after (0 to start from the beginning)
 * @mnt_ids:       kernel array receiving the unique mount ids
 * @nr_mnt_ids:    capacity of @mnt_ids
 * @reverse:       walk the namespace backwards
 *
 * Must be called with namespace_sem held.
 *
 * Return: number of ids stored, or a negative errno (-ENOENT for an
 * unknown parent or missing root, -EPERM if the parent is not reachable
 * and the caller lacks CAP_SYS_ADMIN over the namespace, or the error of
 * the security hook).
 */
static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
                            u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
                            bool reverse)
{
        struct path root __free(path_put) = {};
        struct path orig;
        struct mount *pos;
        ssize_t nr;

        rwsem_assert_held(&namespace_sem);

        nr = grab_requested_root(ns, &root);
        if (nr)
                return nr;

        if (mnt_parent_id != LSMT_ROOT) {
                orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
                if (!orig.mnt)
                        return -ENOENT;
                orig.dentry = orig.mnt->mnt_root;
        } else {
                orig = root;
        }

        /*
         * Use the noaudit capability check: this only decides which
         * mounts are shown and must not generate audit denials.
         */
        if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
            !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        nr = security_sb_statfs(orig.dentry);
        if (nr)
                return nr;

        /* Where to start: either end of the namespace, or next to last_mnt_id. */
        if (last_mnt_id) {
                pos = reverse ? mnt_find_id_at_reverse(ns, last_mnt_id - 1)
                              : mnt_find_id_at(ns, last_mnt_id + 1);
        } else {
                pos = node_to_mount(reverse ? ns->mnt_last_node
                                            : ns->mnt_first_node);
        }

        for (nr = 0; pos && nr_mnt_ids; pos = listmnt_next(pos, reverse)) {
                /* Report everything reachable from orig except orig itself. */
                if (pos->mnt_id_unique != mnt_parent_id &&
                    is_path_reachable(pos, pos->mnt.mnt_root, &orig)) {
                        *mnt_ids++ = pos->mnt_id_unique;
                        nr_mnt_ids--;
                        nr++;
                }
        }

        return nr;
}

/*
 * listmount() - list the unique ids of mounts below a mount
 * @req:        parent mount (mnt_id), optional mount namespace and, in
 *              param, the last mount id seen by a previous call
 * @mnt_ids:    user array receiving the ids
 * @nr_mnt_ids: capacity of @mnt_ids
 * @flags:      0 or LISTMOUNT_REVERSE
 *
 * Return: number of ids written to @mnt_ids, or a negative errno.
 */
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
                u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
        u64 *kmnt_ids __free(kvfree) = NULL;
        const size_t maxcount = 1000000;
        struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
        struct mnt_id_req kreq;
        u64 last_mnt_id;
        ssize_t ret;

        if (flags & ~LISTMOUNT_REVERSE)
                return -EINVAL;

        /*
         * If the mount namespace really has more than 1 million mounts the
         * caller must iterate over the mount namespace (and reconsider their
         * system design...).
         */
        if (unlikely(nr_mnt_ids > maxcount))
                return -EOVERFLOW;

        if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
                return -EFAULT;

        ret = copy_mnt_id_req(req, &kreq);
        if (ret)
                return ret;

        last_mnt_id = kreq.param;
        /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
        if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
                return -EINVAL;

        kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
                                  GFP_KERNEL_ACCOUNT);
        if (!kmnt_ids)
                return -ENOMEM;

        ns = grab_requested_mnt_ns(&kreq);
        if (IS_ERR(ns)) {
                /*
                 * grab_requested_mnt_ns() can return an error pointer as
                 * well as NULL.  Clear @ns before returning so that the
                 * __free(mnt_ns_release) handler never sees the error
                 * pointer.
                 */
                ret = PTR_ERR(ns);
                ns = NULL;
                return ret;
        }
        if (!ns)
                return -ENOENT;

        if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
            !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
                return -ENOENT;

        /* Collect into the kernel array under the lock, copy out afterwards. */
        scoped_guard(rwsem_read, &namespace_sem)
                ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
                                   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
        if (ret <= 0)
                return ret;

        if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
                return -EFAULT;

        return ret;
}

/*
 * Build the initial mount tree at boot: mount rootfs, allocate the initial
 * mount namespace with that mount as its root, install the namespace on
 * init_task and make the rootfs root both cwd and root of the current
 * task.  Failure to mount rootfs or to allocate the namespace panics.
 */
static void __init init_mount_tree(void)
{
        struct vfsmount *mnt;
        struct mount *m;
        struct mnt_namespace *ns;
        struct path root;

        mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
        if (IS_ERR(mnt))
                panic("Can't create rootfs");

        ns = alloc_mnt_ns(&init_user_ns, false);
        if (IS_ERR(ns))
                panic("Can't allocate initial namespace");
        /* rootfs is the root of, and the only mount in, the new namespace */
        m = real_mount(mnt);
        ns->root = m;
        ns->nr_mounts = 1;
        mnt_add_to_ns(ns, m);
        init_task.nsproxy->mnt_ns = ns;
        get_mnt_ns(ns);

        root.mnt = mnt;
        root.dentry = mnt->mnt_root;
        mnt->mnt_flags |= MNT_LOCKED;

        set_fs_pwd(current->fs, &root);
        set_fs_root(current->fs, &root);

        mnt_ns_tree_add(ns);
}

/*
 * Boot-time initialisation of the mount layer: create the slab cache for
 * struct mount and the mount / mountpoint hash tables, then bring up
 * kernfs, sysfs, the "fs" kobject, shmem and rootfs, and finally build the
 * initial mount tree.
 *
 * Failing to allocate either hash table panics; sysfs_init() and kobject
 * creation failures are only logged.
 */
void __init mnt_init(void)
{
        int err;

        mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

        mount_hashtable = alloc_large_system_hash("Mount-cache",
                                sizeof(struct hlist_head),
                                mhash_entries, 19,
                                HASH_ZERO,
                                &m_hash_shift, &m_hash_mask, 0, 0);
        mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
                                sizeof(struct hlist_head),
                                mphash_entries, 19,
                                HASH_ZERO,
                                &mp_hash_shift, &mp_hash_mask, 0, 0);

        if (!mount_hashtable || !mountpoint_hashtable)
                panic("Failed to allocate mount hash table\n");

        kernfs_init();

        /* Not fatal: warn and carry on. */
        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
                        __func__, err);
        fs_kobj = kobject_create_and_add("fs", NULL);
        if (!fs_kobj)
                printk(KERN_WARNING "%s: kobj create error\n", __func__);
        shmem_init();
        init_rootfs();
        init_mount_tree();
}

/*
 * Drop a reference on @ns.  When the last reference goes away the mounts
 * collected under the namespace root are dropped and the namespace is
 * freed.
 */
void put_mnt_ns(struct mnt_namespace *ns)
{
        if (refcount_dec_and_test(&ns->ns.count)) {
                drop_collected_mounts(&ns->root->mnt);
                free_mnt_ns(ns);
        }
}

/*
 * Create a kernel-internal mount of @type (SB_KERNMOUNT, named after the
 * filesystem type).  On success the mount is marked as belonging to
 * MNT_NS_INTERNAL.  Returns the mount or the ERR_PTR() from
 * vfs_kern_mount().
 */
struct vfsmount *kern_mount(struct file_system_type *type)
{
        struct vfsmount *mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);

        if (IS_ERR(mnt))
                return mnt;

        /*
         * This is a longterm mount: it is not released until it is
         * unmounted, before the filesystem is unregistered.
         */
        real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
        return mnt;
}
EXPORT_SYMBOL_GPL(kern_mount);

/*
 * Release a long term kernel mount so that its mount point can be
 * released.  An ERR_PTR() @mnt is ignored.
 */
void kern_unmount(struct vfsmount *mnt)
{
        if (IS_ERR(mnt))
                return;

        mnt_make_shortterm(mnt);
        synchronize_rcu();        /* yecchhh... */
        mntput(mnt);
}
EXPORT_SYMBOL(kern_unmount);

/*
 * Release @num long term kernel mounts.  All of them are made short term
 * first, so that a single expedited RCU grace period covers the whole
 * array before the references are dropped.
 */
void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
{
        unsigned int idx;

        for (idx = 0; idx < num; idx++) {
                mnt_make_shortterm(mnt[idx]);
        }

        synchronize_rcu_expedited();

        for (idx = 0; idx < num; idx++) {
                mntput(mnt[idx]);
        }
}
EXPORT_SYMBOL(kern_unmount_array);

/* Report the result of check_mnt() for the mount behind @mnt. */
bool our_mnt(struct vfsmount *mnt)
{
        struct mount *m = real_mount(mnt);

        return check_mnt(m);
}

/*
 * Does the current process have a non-standard root?
 *
 * The topmost mount stacked on the root of the current mount namespace is
 * compared with the process's fs root; any difference counts as chrooted.
 */
bool current_chrooted(void)
{
        struct path ns_root, fs_root;
        bool same;

        /* Start at the namespace root ... */
        ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
        ns_root.dentry = ns_root.mnt->mnt_root;
        path_get(&ns_root);

        /* ... and descend through whatever is mounted on top of it. */
        while (d_mountpoint(ns_root.dentry)) {
                if (!follow_down_one(&ns_root))
                        break;
        }

        get_fs_root(current->fs, &fs_root);
        same = path_equal(&fs_root, &ns_root);

        path_put(&fs_root);
        path_put(&ns_root);

        return !same;
}

/*
 * Check whether @ns already contains a fully visible mount of the same
 * filesystem type as @sb whose flags are at least as permissive as the
 * proposed *@new_mnt_flags.
 *
 * On a match the locked read-only / atime attributes of the existing
 * mount are ORed into *@new_mnt_flags and true is returned.  The walk is
 * done with namespace_sem held for reading.
 */
static bool mnt_already_visible(struct mnt_namespace *ns,
                                const struct super_block *sb,
                                int *new_mnt_flags)
{
        int new_flags = *new_mnt_flags;
        struct mount *mnt, *n;
        bool visible = false;

        down_read(&namespace_sem);
        rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
                struct mount *child;
                int mnt_flags;

                /* Only mounts of the same filesystem type are of interest. */
                if (mnt->mnt.mnt_sb->s_type != sb->s_type)
                        continue;

                /* This mount is not fully visible if its root directory
                 * is not the root directory of the filesystem.
                 */
                if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
                        continue;

                /* A local view of the mount flags */
                mnt_flags = mnt->mnt.mnt_flags;

                /* Don't miss readonly hidden in the superblock flags */
                if (sb_rdonly(mnt->mnt.mnt_sb))
                        mnt_flags |= MNT_LOCK_READONLY;

                /* Verify the mount flags are equal to or more permissive
                 * than the proposed new mount.
                 */
                if ((mnt_flags & MNT_LOCK_READONLY) &&
                    !(new_flags & MNT_READONLY))
                        continue;
                if ((mnt_flags & MNT_LOCK_ATIME) &&
                    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
                        continue;

                /* This mount is not fully visible if there are any
                 * locked child mounts that cover anything except for
                 * empty directories.
                 */
                list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
                        struct inode *inode = child->mnt_mountpoint->d_inode;
                        /* Only worry about locked mounts */
                        if (!(child->mnt.mnt_flags & MNT_LOCKED))
                                continue;
                        /* Is the directory permanently empty? */
                        if (!is_empty_dir_inode(inode))
                                goto next;
                }
                /* Preserve the locked attributes */
                *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
                                               MNT_LOCK_ATIME);
                visible = true;
                goto found;
        /* Reached from the child loop: skip to the next candidate mount. */
        next:        ;
        }
found:
        up_read(&namespace_sem);
        return visible;
}

/*
 * Decide whether mounting @sb in the caller's mount namespace has to be
 * refused because it would reveal more than is already visible there.
 *
 * Returns false when the namespace is owned by init_user_ns, or when the
 * superblock does not carry SB_I_USERNS_VISIBLE.  Returns true (after a
 * one-time warning) when such a superblock lacks SB_I_NOEXEC or SB_I_NODEV.
 * Otherwise the result is the negation of mnt_already_visible(), which may
 * also add locked flags to *@new_mnt_flags.
 */
static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
{
        struct mnt_namespace *ns = current->nsproxy->mnt_ns;
        const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
        unsigned long iflags;
        bool seen;

        /* Namespaces owned by the initial user namespace are not restricted */
        if (ns->user_ns == &init_user_ns)
                return false;

        /* Only filesystems flagged SB_I_USERNS_VISIBLE can be too revealing */
        iflags = sb->s_iflags;
        if ((iflags & SB_I_USERNS_VISIBLE) == 0)
                return false;

        /* Such filesystems are expected to be both noexec and nodev */
        if ((iflags & required_iflags) != required_iflags) {
                WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
                          required_iflags);
                return true;
        }

        seen = mnt_already_visible(ns, sb, new_mnt_flags);
        return !seen;
}

/*
 * Report whether suid/sgid bits, file capabilities and security labels found
 * on @mnt may be honoured by the caller.
 *
 * Foreign mounts (reached via fchdir or through /proc symlinks) are always
 * treated as if they were nosuid, so that a namespace never trusts
 * potentially unsafe attributes that originate in another namespace.
 */
bool mnt_may_suid(struct vfsmount *mnt)
{
        /* Explicitly mounted nosuid */
        if (mnt->mnt_flags & MNT_NOSUID)
                return false;

        /* Must pass check_mnt() for the current task */
        if (!check_mnt(real_mount(mnt)))
                return false;

        /* The caller must be in the superblock's user namespace */
        return current_in_userns(mnt->mnt_sb->s_user_ns);
}

/*
 * Grab a reference to @task's mount namespace.
 *
 * task->nsproxy is sampled under task_lock().  Returns the ns_common member
 * of the mount namespace after taking a reference with get_mnt_ns(), or NULL
 * when the task has no nsproxy.
 */
static struct ns_common *mntns_get(struct task_struct *task)
{
        struct nsproxy *proxy;
        struct ns_common *result = NULL;

        task_lock(task);
        proxy = task->nsproxy;
        if (!proxy)
                goto unlock;

        result = &proxy->mnt_ns->ns;
        get_mnt_ns(to_mnt_ns(result));
unlock:
        task_unlock(task);

        return result;
}

/* Drop a mount namespace reference, given its ns_common handle. */
static void mntns_put(struct ns_common *ns)
{
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns);

        put_mnt_ns(mnt_ns);
}

/*
 * Switch the nsproxy in @nsset over to the mount namespace behind @ns and
 * reset the fs_struct's root and pwd to the root of that namespace.
 *
 * Returns 0 on success, -EPERM if a capability check fails, -EINVAL for an
 * anonymous namespace or when the fs_struct does not have exactly one user,
 * or the error reported by vfs_path_lookup().
 */
static int mntns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct fs_struct *fs = nsset->fs;
        struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
        struct user_namespace *user_ns = nsset->cred->user_ns;
        struct path root;
        int err;

        /*
         * Needs CAP_SYS_ADMIN over the user namespace owning the target
         * mount namespace, plus CAP_SYS_CHROOT and CAP_SYS_ADMIN in the
         * user namespace of the nsset credentials.
         */
        if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
            !ns_capable(user_ns, CAP_SYS_CHROOT) ||
            !ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Anonymous mount namespaces cannot be installed */
        if (is_anon_ns(mnt_ns))
                return -EINVAL;

        /* root/pwd are rewritten below, so the fs_struct must not be shared */
        if (fs->users != 1)
                return -EINVAL;

        /* Take the new reference before publishing the namespace in nsproxy */
        get_mnt_ns(mnt_ns);
        old_mnt_ns = nsproxy->mnt_ns;
        nsproxy->mnt_ns = mnt_ns;

        /* Find the root */
        err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
                                "/", LOOKUP_DOWN, &root);
        if (err) {
                /* revert to old namespace */
                nsproxy->mnt_ns = old_mnt_ns;
                /* and give back the reference taken above */
                put_mnt_ns(mnt_ns);
                return err;
        }

        /* nsproxy no longer points at the old namespace */
        put_mnt_ns(old_mnt_ns);

        /* Update the pwd and root */
        set_fs_pwd(fs, &root);
        set_fs_root(fs, &root);

        /* Release the path filled in by vfs_path_lookup() */
        path_put(&root);
        return 0;
}

static struct user_namespace *mntns_owner(struct ns_common *ns)
{
        return to_mnt_ns(ns)->user_ns;
}

/*
 * proc namespace operations for mount namespaces ("mnt", CLONE_NEWNS),
 * wiring up the mntns_get/put/install/owner helpers defined above.
 */
const struct proc_ns_operations mntns_operations = {
        .name                = "mnt",
        .type                = CLONE_NEWNS,
        .get                = mntns_get,
        .put                = mntns_put,
        .install        = mntns_install,
        .owner                = mntns_owner,
};

#ifdef CONFIG_SYSCTL
/*
 * Sysctl table exposing sysctl_mount_max as "mount-max" (mode 0644).
 * It is registered under the "fs" directory by init_fs_namespace_sysctls().
 */
static const struct ctl_table fs_namespace_sysctls[] = {
        {
                .procname        = "mount-max",
                .data                = &sysctl_mount_max,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                /* NOTE(review): presumably the lowest accepted value (1) - confirm */
                .extra1                = SYSCTL_ONE,
        },
};

/*
 * Register fs_namespace_sysctls under the "fs" sysctl directory.
 * Run as an fs_initcall; always returns 0.
 */
static int __init init_fs_namespace_sysctls(void)
{
        register_sysctl_init("fs", fs_namespace_sysctls);
        return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */















































































































































































































































   82 































   25 
   29 


   29 

   25 
    4 


   25 


   29 
   29 
   25 
































































































    1 


























































































































































































































































    4 
    4 
    4 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_H
#define _LINUX_HIGHMEM_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/cacheflush.h>
#include <linux/kmsan.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>

#include "highmem-internal.h"

/**
 * kmap - Map a page for long term usage
 * @page:        Pointer to the page to be mapped
 *
 * Returns: The virtual address of the mapping
 *
 * Can only be invoked from preemptible task context because on 32bit
 * systems with CONFIG_HIGHMEM enabled this function might sleep.
 *
 * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area
 * this returns the virtual address of the direct kernel mapping.
 *
 * The returned virtual address is globally visible and valid up to the
 * point where it is unmapped via kunmap(). The pointer can be handed to
 * other contexts.
 *
 * For highmem pages on 32bit systems this can be slow as the mapping space
 * is limited and protected by a global lock. In case that there is no
 * mapping slot available the function blocks until a slot is released via
 * kunmap().
 */
static inline void *kmap(struct page *page);

/**
 * kunmap - Unmap the virtual address mapped by kmap()
 * @page:        Pointer to the page which was mapped by kmap()
 *
 * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of
 * pages in the low memory area.
 */
static inline void kunmap(struct page *page);

/**
 * kmap_to_page - Get the page for a kmap'ed address
 * @addr:        The address to look up
 *
 * Returns: The page which is mapped to @addr.
 */
static inline struct page *kmap_to_page(void *addr);

/**
 * kmap_flush_unused - Flush all unused kmap mappings in order to
 *                       remove stray mappings
 */
static inline void kmap_flush_unused(void);

/**
 * kmap_local_page - Map a page for temporary usage
 * @page: Pointer to the page to be mapped
 *
 * Returns: The virtual address of the mapping
 *
 * Can be invoked from any context, including interrupts.
 *
 * Requires careful handling when nesting multiple mappings because the map
 * management is stack based. The unmap has to be in the reverse order of
 * the map operation::
 *
 *   addr1 = kmap_local_page(page1);
 *   addr2 = kmap_local_page(page2);
 *   ...
 *   kunmap_local(addr2);
 *   kunmap_local(addr1);
 *
 * Unmapping addr1 before addr2 is invalid and causes malfunction.
 *
 * Contrary to kmap() mappings the mapping is only valid in the context of
 * the caller and cannot be handed to other contexts.
 *
 * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
 * virtual address of the direct mapping. Only real highmem pages are
 * temporarily mapped.
 *
 * While kmap_local_page() is significantly faster than kmap() for the highmem
 * case it comes with restrictions about the pointer validity.
 *
 * On HIGHMEM enabled systems mapping a highmem page has the side effect of
 * disabling migration in order to keep the virtual address stable across
 * preemption. No caller of kmap_local_page() can rely on this side effect.
 */
static inline void *kmap_local_page(struct page *page);

/**
 * kmap_local_folio - Map a page in this folio for temporary usage
 * @folio: The folio containing the page.
 * @offset: The byte offset within the folio which identifies the page.
 *
 * Requires careful handling when nesting multiple mappings because the map
 * management is stack based. The unmap has to be in the reverse order of
 * the map operation::
 *
 *   addr1 = kmap_local_folio(folio1, offset1);
 *   addr2 = kmap_local_folio(folio2, offset2);
 *   ...
 *   kunmap_local(addr2);
 *   kunmap_local(addr1);
 *
 * Unmapping addr1 before addr2 is invalid and causes malfunction.
 *
 * Contrary to kmap() mappings the mapping is only valid in the context of
 * the caller and cannot be handed to other contexts.
 *
 * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the
 * virtual address of the direct mapping. Only real highmem pages are
 * temporarily mapped.
 *
 * While it is significantly faster than kmap() for the highmem case it
 * comes with restrictions about the pointer validity.
 *
 * On HIGHMEM enabled systems mapping a highmem page has the side effect of
 * disabling migration in order to keep the virtual address stable across
 * preemption. No caller of kmap_local_folio() can rely on this side effect.
 *
 * Context: Can be invoked from any context.
 * Return: The virtual address of @offset.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset);

/**
 * kmap_atomic - Atomically map a page for temporary usage - Deprecated!
 * @page:        Pointer to the page to be mapped
 *
 * Returns: The virtual address of the mapping
 *
 * In fact a wrapper around kmap_local_page() which also disables pagefaults
 * and, depending on PREEMPT_RT configuration, also CPU migration and
 * preemption. Therefore users should not count on the latter two side effects.
 *
 * Mappings should always be released by kunmap_atomic().
 *
 * Do not use in new code. Use kmap_local_page() instead.
 *
 * It is used in atomic context when code wants to access the contents of a
 * page that might be allocated from high memory (see __GFP_HIGHMEM), for
 * example a page in the pagecache.  The API has two functions, and they
 * can be used in a manner similar to the following::
 *
 *   // Find the page of interest.
 *   struct page *page = find_get_page(mapping, offset);
 *
 *   // Gain access to the contents of that page.
 *   void *vaddr = kmap_atomic(page);
 *
 *   // Do something to the contents of that page.
 *   memset(vaddr, 0, PAGE_SIZE);
 *
 *   // Unmap that page.
 *   kunmap_atomic(vaddr);
 *
 * Note that the kunmap_atomic() call takes the result of the kmap_atomic()
 * call, not the argument.
 *
 * If you need to map two pages because you want to copy from one page to
 * another you need to keep the kmap_atomic calls strictly nested, like::
 *
 *   vaddr1 = kmap_atomic(page1);
 *   vaddr2 = kmap_atomic(page2);
 *
 *   memcpy(vaddr1, vaddr2, PAGE_SIZE);
 *
 *   kunmap_atomic(vaddr2);
 *   kunmap_atomic(vaddr1);
 */
static inline void *kmap_atomic(struct page *page);

/* Highmem related interfaces for management code */
static inline unsigned long nr_free_highpages(void);
static inline unsigned long totalhigh_pages(void);

#ifndef ARCH_HAS_FLUSH_ANON_PAGE
/*
 * Fallback used when the architecture does not define
 * ARCH_HAS_FLUSH_ANON_PAGE: intentionally does nothing.
 */
static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr)
{
}
#endif

#ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
/*
 * Fallback used when the architecture does not define
 * ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE: intentionally does nothing.
 */
static inline void flush_kernel_vmap_range(void *vaddr, int size)
{
}
/*
 * Fallback used when the architecture does not define
 * ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE: intentionally does nothing.
 */
static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
{
}
#endif

/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
#ifndef clear_user_highpage
/*
 * Clear @page for use at user virtual address @vaddr: map it locally, pass
 * the mapping to clear_user_page(), then drop the mapping again.
 */
static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
{
        void *kaddr;

        kaddr = kmap_local_page(page);
        clear_user_page(kaddr, vaddr, page);
        kunmap_local(kaddr);
}
#endif

#ifndef vma_alloc_zeroed_movable_folio
/**
 * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA.
 * @vma: The VMA the page is to be allocated for.
 * @vaddr: The virtual address the page will be inserted into.
 *
 * Allocates an order-0 folio with GFP_HIGHUSER_MOVABLE that is suitable for
 * inserting into @vma at @vaddr, so it may come from highmem or the movable
 * zone.  The page is cleared here only when user_alloc_needs_zeroing() says
 * so.  An architecture may provide its own implementation.
 *
 * Return: A folio containing one allocated and zeroed page or NULL if
 * we are out of memory.
 */
static inline
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
                                   unsigned long vaddr)
{
        struct folio *folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);

        if (!folio)
                return NULL;

        if (user_alloc_needs_zeroing())
                clear_user_highpage(&folio->page, vaddr);

        return folio;
}
#endif

/* Zero @page through a temporary local mapping. */
static inline void clear_highpage(struct page *page)
{
        void *vaddr;

        vaddr = kmap_local_page(page);
        clear_page(vaddr);
        kunmap_local(vaddr);
}

/*
 * Zero @page through a temporary local mapping, passing the mapped address
 * through kasan_reset_tag() before handing it to clear_page().
 */
static inline void clear_highpage_kasan_tagged(struct page *page)
{
        void *vaddr;

        vaddr = kmap_local_page(page);
        clear_page(kasan_reset_tag(vaddr));
        kunmap_local(vaddr);
}

#ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE

/*
 * Fallback used when the architecture does not define
 * __HAVE_ARCH_TAG_CLEAR_HIGHPAGE: intentionally does nothing.
 */
static inline void tag_clear_highpage(struct page *page)
{
}

#endif

/*
 * If we pass in a base or tail page, we can zero up to PAGE_SIZE.
 * If we pass in a head page, we can zero up to the size of the compound page.
 */
#ifdef CONFIG_HIGHMEM
void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
                unsigned start2, unsigned end2);
#else
/*
 * !CONFIG_HIGHMEM variant: zero the byte ranges [start1, end1) and
 * [start2, end2) of @page through a single local mapping, then flush the
 * dcache for each of the compound_nr() pages starting at @page.  An empty or
 * inverted range is skipped.  Both end offsets must lie within
 * page_size(page).
 */
static inline void zero_user_segments(struct page *page,
                unsigned start1, unsigned end1,
                unsigned start2, unsigned end2)
{
        void *base = kmap_local_page(page);
        unsigned int idx = 0;

        BUG_ON(end1 > page_size(page) || end2 > page_size(page));

        if (start1 < end1)
                memset(base + start1, 0, end1 - start1);

        if (start2 < end2)
                memset(base + start2, 0, end2 - start2);

        kunmap_local(base);

        while (idx < compound_nr(page)) {
                flush_dcache_page(page + idx);
                idx++;
        }
}
#endif

/*
 * Zero the single byte range [start, end) of @page.  Implemented with
 * zero_user_segments(), passing an empty second range.
 */
static inline void zero_user_segment(struct page *page,
        unsigned start, unsigned end)
{
        zero_user_segments(page, start, end, 0, 0);
}

/*
 * Zero @size bytes of @page starting at byte offset @start.  Implemented
 * with zero_user_segments(), passing an empty second range.
 */
static inline void zero_user(struct page *page,
        unsigned start, unsigned size)
{
        const unsigned end = start + size;

        zero_user_segments(page, start, end, 0, 0);
}

#ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE

/*
 * Generic copy of page @from into page @to for user address @vaddr.
 *
 * Both pages are mapped locally (source first), copied with
 * copy_user_page(), the destination is unpoisoned for KMSAN, and the
 * mappings are dropped in reverse order.  @vma is not used by this generic
 * implementation.
 */
static inline void copy_user_highpage(struct page *to, struct page *from,
        unsigned long vaddr, struct vm_area_struct *vma)
{
        char *src = kmap_local_page(from);
        char *dst = kmap_local_page(to);

        copy_user_page(dst, src, vaddr, to);
        kmsan_unpoison_memory(page_address(to), PAGE_SIZE);

        /* kmap_local mappings are stack based: release the newest first */
        kunmap_local(dst);
        kunmap_local(src);
}

#endif

#ifndef __HAVE_ARCH_COPY_HIGHPAGE

/*
 * Generic copy of page @from into page @to.
 *
 * Both pages are mapped locally (source first), copied with copy_page(),
 * kmsan_copy_page_meta() is called for the pair, and the mappings are
 * dropped in reverse order.
 */
static inline void copy_highpage(struct page *to, struct page *from)
{
        char *src = kmap_local_page(from);
        char *dst = kmap_local_page(to);

        copy_page(dst, src);
        kmsan_copy_page_meta(to, from);

        /* kmap_local mappings are stack based: release the newest first */
        kunmap_local(dst);
        kunmap_local(src);
}

#endif

#ifdef copy_mc_to_kernel
/*
 * If architecture supports machine check exception handling, define the
 * #MC versions of copy_user_highpage and copy_highpage. They copy a memory
 * page with #MC in source page (@from) handled, and return the number
 * of bytes not copied if there was a #MC, otherwise 0 for success.
 */
/*
 * #MC-aware counterpart of copy_user_highpage(): copy PAGE_SIZE bytes from
 * @from to @to with copy_mc_to_kernel().
 *
 * On a complete copy the destination is unpoisoned for KMSAN.  If bytes
 * were left uncopied, the source pfn is queued with memory_failure_queue()
 * after both mappings have been released.  @vaddr and @vma are not used.
 *
 * Returns the value reported by copy_mc_to_kernel() (0 on success).
 */
static inline int copy_mc_user_highpage(struct page *to, struct page *from,
                                        unsigned long vaddr, struct vm_area_struct *vma)
{
        char *src = kmap_local_page(from);
        char *dst = kmap_local_page(to);
        unsigned long left = copy_mc_to_kernel(dst, src, PAGE_SIZE);

        if (left == 0)
                kmsan_unpoison_memory(page_address(to), PAGE_SIZE);

        /* kmap_local mappings are stack based: release the newest first */
        kunmap_local(dst);
        kunmap_local(src);

        if (left != 0)
                memory_failure_queue(page_to_pfn(from), 0);

        return left;
}

/*
 * #MC-aware counterpart of copy_highpage(): copy PAGE_SIZE bytes from @from
 * to @to with copy_mc_to_kernel().
 *
 * On a complete copy kmsan_copy_page_meta() is called for the pair.  If
 * bytes were left uncopied, the source pfn is queued with
 * memory_failure_queue() after both mappings have been released.
 *
 * Returns the value reported by copy_mc_to_kernel() (0 on success).
 */
static inline int copy_mc_highpage(struct page *to, struct page *from)
{
        char *src = kmap_local_page(from);
        char *dst = kmap_local_page(to);
        unsigned long left = copy_mc_to_kernel(dst, src, PAGE_SIZE);

        if (left == 0)
                kmsan_copy_page_meta(to, from);

        /* kmap_local mappings are stack based: release the newest first */
        kunmap_local(dst);
        kunmap_local(src);

        if (left != 0)
                memory_failure_queue(page_to_pfn(from), 0);

        return left;
}
#else
/*
 * Fallback when copy_mc_to_kernel is not defined: plain copy_user_highpage()
 * that always reports success (0 bytes left uncopied).
 */
static inline int copy_mc_user_highpage(struct page *to, struct page *from,
                                        unsigned long vaddr, struct vm_area_struct *vma)
{
        copy_user_highpage(to, from, vaddr, vma);
        return 0;
}

/*
 * Fallback when copy_mc_to_kernel is not defined: plain copy_highpage()
 * that always reports success (0 bytes left uncopied).
 */
static inline int copy_mc_highpage(struct page *to, struct page *from)
{
        copy_highpage(to, from);
        return 0;
}
#endif

/*
 * Copy @len bytes from offset @src_off of @src_page to offset @dst_off of
 * @dst_page using temporary local mappings.  Neither range may extend past
 * PAGE_SIZE.
 */
static inline void memcpy_page(struct page *dst_page, size_t dst_off,
                               struct page *src_page, size_t src_off,
                               size_t len)
{
        char *to, *from;

        to = kmap_local_page(dst_page);
        from = kmap_local_page(src_page);

        VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE);
        memcpy(to + dst_off, from + src_off, len);

        /* kmap_local mappings are stack based: release the newest first */
        kunmap_local(from);
        kunmap_local(to);
}

/*
 * Fill @len bytes of @page, starting at byte @offset, with @val through a
 * temporary local mapping.  The range may not extend past PAGE_SIZE.
 */
static inline void memset_page(struct page *page, size_t offset, int val,
                               size_t len)
{
        char *base;

        base = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memset(base + offset, val, len);
        kunmap_local(base);
}

/*
 * Copy @len bytes out of @page, starting at byte @offset, into @to through
 * a temporary local mapping.  The range may not extend past PAGE_SIZE.
 */
static inline void memcpy_from_page(char *to, struct page *page,
                                    size_t offset, size_t len)
{
        char *base;

        base = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memcpy(to, base + offset, len);
        kunmap_local(base);
}

/*
 * Copy @len bytes from @from into @page at byte @offset through a temporary
 * local mapping, then flush the dcache for the page.  The range may not
 * extend past PAGE_SIZE.
 */
static inline void memcpy_to_page(struct page *page, size_t offset,
                                  const char *from, size_t len)
{
        char *base;

        base = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memcpy(base + offset, from, len);
        flush_dcache_page(page);
        kunmap_local(base);
}

/*
 * Zero @len bytes of @page starting at byte @offset through a temporary
 * local mapping, then flush the dcache for the page.  The range may not
 * extend past PAGE_SIZE.
 */
static inline void memzero_page(struct page *page, size_t offset, size_t len)
{
        char *base;

        base = kmap_local_page(page);

        VM_BUG_ON(offset + len > PAGE_SIZE);
        memset(base + offset, 0, len);
        flush_dcache_page(page);
        kunmap_local(base);
}

/**
 * memcpy_from_folio - Copy a range of bytes from a folio.
 * @to: The memory to copy to.
 * @folio: The folio to read from.
 * @offset: The first byte in the folio to read.
 * @len: The number of bytes to copy.
 *
 * The range must lie within the folio.  For a highmem folio the copy is
 * split at page boundaries and the folio is remapped for every piece;
 * otherwise everything is copied through a single mapping.
 */
static inline void memcpy_from_folio(char *to, struct folio *folio,
                size_t offset, size_t len)
{
        VM_BUG_ON(offset + len > folio_size(folio));

        do {
                const char *src = kmap_local_folio(folio, offset);
                size_t step = len;

                if (folio_test_highmem(folio)) {
                        /* A mapping only covers the page holding @offset */
                        size_t room = PAGE_SIZE - offset_in_page(offset);

                        if (step > room)
                                step = room;
                }
                memcpy(to, src, step);
                kunmap_local(src);

                len -= step;
                offset += step;
                to += step;
        } while (len > 0);
}

/**
 * memcpy_to_folio - Copy a range of bytes to a folio.
 * @folio: The folio to write to.
 * @offset: The first byte in the folio to store to.
 * @from: The memory to copy from.
 * @len: The number of bytes to copy.
 *
 * The range must lie within the folio.  For a highmem folio the copy is
 * split at page boundaries and the folio is remapped for every piece;
 * otherwise everything is copied through a single mapping.  The dcache is
 * flushed for the whole folio once the copy is complete.
 */
static inline void memcpy_to_folio(struct folio *folio, size_t offset,
                const char *from, size_t len)
{
        VM_BUG_ON(offset + len > folio_size(folio));

        do {
                char *dst = kmap_local_folio(folio, offset);
                size_t step = len;

                if (folio_test_highmem(folio)) {
                        /* A mapping only covers the page holding @offset */
                        size_t room = PAGE_SIZE - offset_in_page(offset);

                        if (step > room)
                                step = room;
                }
                memcpy(dst, from, step);
                kunmap_local(dst);

                len -= step;
                offset += step;
                from += step;
        } while (len > 0);

        flush_dcache_folio(folio);
}

/**
 * folio_zero_tail - Zero the tail of a folio.
 * @folio: The folio to zero.
 * @offset: The byte offset in the folio to start zeroing at.
 * @kaddr: The address the folio is currently mapped to.  Zeroing starts at
 *         this address, so it is expected to be the mapping of @offset.
 *
 * If you have already used kmap_local_folio() to map a folio, written
 * some data to it and now need to zero the end of the folio (and flush
 * the dcache), you can use this function.  If you do not have the
 * folio kmapped (eg the folio has been partially populated by DMA),
 * use folio_zero_range() or folio_zero_segment() instead.
 *
 * For a highmem folio the incoming mapping may be dropped and replaced by
 * mappings of later pages; the caller must unmap the returned address, not
 * the one it passed in.
 *
 * Return: An address which can be passed to kunmap_local().
 */
static inline __must_check void *folio_zero_tail(struct folio *folio,
                size_t offset, void *kaddr)
{
        /* Bytes from @offset to the end of the folio */
        size_t len = folio_size(folio) - offset;

        if (folio_test_highmem(folio)) {
                /* Bytes left in the page that contains @offset */
                size_t max = PAGE_SIZE - offset_in_page(offset);

                /*
                 * Zero the remainder of the current page, then move the
                 * mapping to the next page, for as long as more than the
                 * current page's remainder is still to be zeroed.
                 */
                while (len > max) {
                        memset(kaddr, 0, max);
                        kunmap_local(kaddr);
                        len -= max;
                        offset += max;
                        /* After the first step @offset is page aligned */
                        max = PAGE_SIZE;
                        kaddr = kmap_local_folio(folio, offset);
                }
        }

        /* Zero what is left through the mapping that is still live */
        memset(kaddr, 0, len);
        flush_dcache_folio(folio);

        return kaddr;
}

/**
 * folio_fill_tail - Copy some data to a folio and pad with zeroes.
 * @folio: The destination folio.
 * @offset: The offset into @folio at which to start copying.
 * @from: The data to copy.
 * @len: How many bytes of data to copy.
 *
 * This function is most useful for filesystems which support inline data.
 * When they want to copy data from the inode into the page cache, this
 * function does everything for them.  It supports large folios even on
 * HIGHMEM configurations.
 *
 * The folio is mapped and unmapped internally; everything after the copied
 * data, up to the end of the folio, is zeroed by folio_zero_tail().
 */
static inline void folio_fill_tail(struct folio *folio, size_t offset,
                const char *from, size_t len)
{
        char *to = kmap_local_folio(folio, offset);

        VM_BUG_ON(offset + len > folio_size(folio));

        if (folio_test_highmem(folio)) {
                /* Bytes left in the page that contains @offset */
                size_t max = PAGE_SIZE - offset_in_page(offset);

                /*
                 * Fill the remainder of the current page, then move the
                 * mapping to the next page, for as long as more than the
                 * current page's remainder is still to be copied.
                 */
                while (len > max) {
                        memcpy(to, from, max);
                        kunmap_local(to);
                        len -= max;
                        from += max;
                        offset += max;
                        /* After the first step @offset is page aligned */
                        max = PAGE_SIZE;
                        to = kmap_local_folio(folio, offset);
                }
        }

        /* Copy the final piece, then zero from its end to the folio's end */
        memcpy(to, from, len);
        /* folio_zero_tail() may remap; unmap whatever address it hands back */
        to = folio_zero_tail(folio, offset + len, to + len);
        kunmap_local(to);
}

/**
 * memcpy_from_file_folio - Copy some bytes from a file folio.
 * @to: The destination buffer.
 * @folio: The folio to copy from.
 * @pos: The position in the file.
 * @len: The maximum number of bytes to copy.
 *
 * Copy up to @len bytes from this folio, starting at the folio offset that
 * corresponds to @pos.  A highmem folio limits the copy to the end of the
 * page containing that offset; otherwise the limit is the end of the folio.
 *
 * Return: The number of bytes copied from the folio.
 */
static inline size_t memcpy_from_file_folio(char *to, struct folio *folio,
                loff_t pos, size_t len)
{
        size_t offset = offset_in_folio(folio, pos);
        char *src = kmap_local_folio(folio, offset);
        size_t limit;

        if (folio_test_highmem(folio))
                limit = PAGE_SIZE - offset_in_page(offset);
        else
                limit = folio_size(folio) - offset;

        if (len > limit)
                len = limit;

        memcpy(to, src, len);
        kunmap_local(src);

        return len;
}

/**
 * folio_zero_segments() - Zero two byte ranges in a folio.
 * @folio: The folio to write to.
 * @start1: The first byte to zero.
 * @xend1: One more than the last byte in the first range.
 * @start2: The first byte to zero in the second range.
 * @xend2: One more than the last byte in the second range.
 *
 * Thin wrapper around zero_user_segments() on the folio's first page.
 */
static inline void folio_zero_segments(struct folio *folio,
                size_t start1, size_t xend1, size_t start2, size_t xend2)
{
        /* Note: zero_user_segments() takes its offsets as unsigned */
        zero_user_segments(&folio->page, start1, xend1, start2, xend2);
}

/**
 * folio_zero_segment() - Zero a byte range in a folio.
 * @folio: The folio to write to.
 * @start: The first byte to zero.
 * @xend: One more than the last byte to zero.
 *
 * Thin wrapper around zero_user_segments() on the folio's first page, with
 * an empty second range.
 */
static inline void folio_zero_segment(struct folio *folio,
                size_t start, size_t xend)
{
        /* Note: zero_user_segments() takes its offsets as unsigned */
        zero_user_segments(&folio->page, start, xend, 0, 0);
}

/**
 * folio_zero_range() - Zero a byte range in a folio.
 * @folio: The folio to write to.
 * @start: The first byte to zero.
 * @length: The number of bytes to zero.
 *
 * Converts (@start, @length) into a half-open range and hands it to
 * zero_user_segments() on the folio's first page, with an empty second
 * range.
 */
static inline void folio_zero_range(struct folio *folio,
                size_t start, size_t length)
{
        const size_t xend = start + length;

        zero_user_segments(&folio->page, start, xend, 0, 0);
}

/**
 * folio_release_kmap - Unmap a folio and drop a refcount.
 * @folio: The folio to release.
 * @addr: The address previously returned by a call to kmap_local_folio().
 *
 * It is common, eg in directory handling to kmap a folio.  This function
 * unmaps the folio and drops the refcount that was being held to keep the
 * folio alive while we accessed it.
 */
static inline void folio_release_kmap(struct folio *folio, void *addr)
{
        /* Unmap first, while the reference is still held */
        kunmap_local(addr);
        folio_put(folio);
}

/*
 * Page-based wrapper around folio_release_kmap(): unmap @addr and drop a
 * reference on the folio that @page belongs to.
 */
static inline void unmap_and_put_page(struct page *page, void *addr)
{
        struct folio *folio = page_folio(page);

        folio_release_kmap(folio, addr);
}

#endif /* _LINUX_HIGHMEM_H */









































































































    7 


  180 

  180 




































































  180 








  180 
  180 










    7 









    7 






















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Implementation of the access vector table type.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/* Updated: Frank Mayer <mayerf@tresys.com> and
 *          Karl MacMillan <kmacmillan@tresys.com>
 *          Added conditional policy language extensions
 *          Copyright (C) 2003 Tresys Technology, LLC
 *
 * Updated: Yuichi Nakamura <ynakam@hitachisoft.jp>
 *          Tuned number of hash slots for avtab to reduce memory usage
 */

#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include "avtab.h"
#include "policydb.h"

static struct kmem_cache *avtab_node_cachep __ro_after_init;
static struct kmem_cache *avtab_xperms_cachep __ro_after_init;

/*
 * Map an avtab key to a hash bucket index.
 *
 * The target class, target type and source type are folded, in that order,
 * into a 32-bit state which is then finalized and masked with @mask. The
 * constants and mixing steps are based on MurmurHash3, written by Austin
 * Appleby and placed in the public domain.
 */
static inline u32 avtab_hash(const struct avtab_key *keyp, u32 mask)
{
        static const u32 c1 = 0xcc9e2d51;
        static const u32 c2 = 0x1b873593;
        static const u32 r1 = 15;
        static const u32 r2 = 13;
        static const u32 m = 5;
        static const u32 n = 0xe6546b64;

        /* Key fields in the order they are folded into the hash. */
        const u32 words[] = {
                keyp->target_class,
                keyp->target_type,
                keyp->source_type,
        };
        u32 hash = 0;
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(words); idx++) {
                u32 k = words[idx];

                /* Scramble the input word. */
                k *= c1;
                k = (k << r1) | (k >> (32 - r1));
                k *= c2;

                /* Fold it into the running state. */
                hash ^= k;
                hash = (hash << r2) | (hash >> (32 - r2));
                hash = hash * m + n;
        }

        /* Final mixing of the accumulated state. */
        hash ^= hash >> 16;
        hash *= 0x85ebca6b;
        hash ^= hash >> 13;
        hash *= 0xc2b2ae35;
        hash ^= hash >> 16;

        return hash & mask;
}

/*
 * Allocate a node for @key/@datum and link it in at *@dst.
 *
 * The new node takes the place of whatever *@dst pointed to, which becomes
 * its successor. For extended-permission keys the xperms payload is copied
 * into a separately allocated object. Increments h->nel on success.
 *
 * Returns the new node, or NULL if either allocation fails.
 */
static struct avtab_node *avtab_insert_node(struct avtab *h,
                                            struct avtab_node **dst,
                                            const struct avtab_key *key,
                                            const struct avtab_datum *datum)
{
        struct avtab_node *node;

        node = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL);
        if (!node)
                return NULL;
        node->key = *key;

        if (!(key->specified & AVTAB_XPERMS)) {
                node->datum.u.data = datum->u.data;
        } else {
                struct avtab_extended_perms *copy;

                copy = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL);
                if (!copy) {
                        /* Do not leak the half-built node. */
                        kmem_cache_free(avtab_node_cachep, node);
                        return NULL;
                }
                *copy = *datum->u.xperms;
                node->datum.u.xperms = copy;
        }

        /* Splice in front of the current occupant of *dst. */
        node->next = *dst;
        *dst = node;

        h->nel++;
        return node;
}

/*
 * Order two avtab keys by (source_type, target_type, target_class).
 *
 * Returns -1 if @key1 sorts before @key2 and 1 if it sorts after. When all
 * three fields are equal, returns 0 only if @key1's specified bits (with the
 * enabled flags masked off) overlap @key2's; otherwise returns 1.
 */
static int avtab_node_cmp(const struct avtab_key *key1,
                          const struct avtab_key *key2)
{
        u16 wanted = key1->specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD);

        if (key1->source_type != key2->source_type)
                return key1->source_type < key2->source_type ? -1 : 1;

        if (key1->target_type != key2->target_type)
                return key1->target_type < key2->target_type ? -1 : 1;

        if (key1->target_class != key2->target_class)
                return key1->target_class < key2->target_class ? -1 : 1;

        /* Same triple: a match requires a common specifier bit. */
        return (wanted & key2->specified) ? 0 : 1;
}

/*
 * Insert @key/@datum into @h, keeping each bucket chain in key order.
 *
 * Returns 0 on success, -EINVAL if the table is missing, unallocated or
 * already holds U32_MAX entries, -EEXIST if a matching non-xperms entry is
 * already present, and -ENOMEM if the node cannot be allocated.
 */
static int avtab_insert(struct avtab *h, const struct avtab_key *key,
                        const struct avtab_datum *datum)
{
        struct avtab_node **link;
        int cmp;

        if (!h || !h->nslot || h->nel == U32_MAX)
                return -EINVAL;

        /* Walk the chain through the link that points at each node. */
        link = &h->htable[avtab_hash(key, h->mask)];
        while (*link) {
                cmp = avtab_node_cmp(key, &(*link)->key);
                /* extended perms may not be unique */
                if (cmp == 0 && !(key->specified & AVTAB_XPERMS))
                        return -EEXIST;
                if (cmp <= 0)
                        break;
                link = &(*link)->next;
        }

        if (!avtab_insert_node(h, link, key, datum))
                return -ENOMEM;

        return 0;
}

/* Unlike avtab_insert(), this function allows multiple insertions of the
 * same key/specified mask into the table, as needed by the conditional
 * avtab. It returns a pointer to the inserted node, or NULL if the table is
 * missing, unallocated, full, or the node could not be allocated.
 */
struct avtab_node *avtab_insert_nonunique(struct avtab *h,
                                          const struct avtab_key *key,
                                          const struct avtab_datum *datum)
{
        struct avtab_node **link;

        if (!h || !h->nslot || h->nel == U32_MAX)
                return NULL;

        /* Stop at the first node that does not sort before the new key. */
        link = &h->htable[avtab_hash(key, h->mask)];
        while (*link && avtab_node_cmp(key, &(*link)->key) > 0)
                link = &(*link)->next;

        return avtab_insert_node(h, link, key, datum);
}

/* Look up the first node in @h matching @key. Returns the node pointer, or
 * NULL if there is none; the result can be used in conjunction with
 * avtab_search_node_next() to find further matches.
 */
struct avtab_node *avtab_search_node(struct avtab *h,
                                     const struct avtab_key *key)
{
        struct avtab_node *node;

        if (!h || !h->nslot)
                return NULL;

        node = h->htable[avtab_hash(key, h->mask)];
        while (node) {
                int cmp = avtab_node_cmp(key, &node->key);

                if (cmp == 0)
                        return node;
                /* The key sorts before this node, so stop looking. */
                if (cmp < 0)
                        return NULL;
                node = node->next;
        }
        return NULL;
}

/*
 * Continue a search after @node: return the next node in the same chain
 * whose key has the same source/target/class as @node and matches
 * @specified, or NULL if there is none (or @node is NULL).
 */
struct avtab_node *avtab_search_node_next(struct avtab_node *node,
                                          u16 specified)
{
        struct avtab_key wanted;
        struct avtab_node *next;

        if (!node)
                return NULL;

        /* Search with the node's own key, but the caller's specifier. */
        wanted = node->key;
        wanted.specified = specified;

        next = node->next;
        while (next) {
                int cmp = avtab_node_cmp(&wanted, &next->key);

                if (cmp == 0)
                        return next;
                if (cmp < 0)
                        return NULL;
                next = next->next;
        }
        return NULL;
}

/*
 * Free every node of @h (including xperms payloads), release the bucket
 * array and reset the table to its empty state. A NULL @h is ignored.
 */
void avtab_destroy(struct avtab *h)
{
        u32 slot;

        if (!h)
                return;

        for (slot = 0; slot < h->nslot; slot++) {
                struct avtab_node *node = h->htable[slot];

                while (node) {
                        /* Fetch the successor before the node is freed. */
                        struct avtab_node *next = node->next;

                        if (node->key.specified & AVTAB_XPERMS)
                                kmem_cache_free(avtab_xperms_cachep,
                                                node->datum.u.xperms);
                        kmem_cache_free(avtab_node_cachep, node);
                        node = next;
                }
        }

        kvfree(h->htable);
        h->htable = NULL;
        h->nel = 0;
        h->nslot = 0;
        h->mask = 0;
}

/* Put @h into the empty state: no bucket array, no slots, no entries. */
void avtab_init(struct avtab *h)
{
        h->nel = 0;
        h->mask = 0;
        h->nslot = 0;
        h->htable = NULL;
}

/*
 * Allocate a zeroed bucket array of @nslot entries for @h and record the
 * slot count and hash mask (@nslot - 1). A request for zero slots is a
 * successful no-op.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int avtab_alloc_common(struct avtab *h, u32 nslot)
{
        if (nslot == 0)
                return 0;

        h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL);
        if (h->htable == NULL)
                return -ENOMEM;

        h->mask = nslot - 1;
        h->nslot = nslot;
        return 0;
}

/*
 * Size and allocate the hash table of @h for an expected @nrules entries.
 *
 * The slot count is the largest power of two not above @nrules / 2 (2 when
 * @nrules is 3 or less), capped at MAX_AVTAB_HASH_BUCKETS. Nothing is
 * allocated when @nrules is 0.
 *
 * Returns 0 on success or the error from avtab_alloc_common().
 */
int avtab_alloc(struct avtab *h, u32 nrules)
{
        int rc;
        u32 nslot = 0;

        if (nrules != 0) {
                nslot = nrules > 3 ? rounddown_pow_of_two(nrules / 2) : 2;
                if (nslot > MAX_AVTAB_HASH_BUCKETS)
                        nslot = MAX_AVTAB_HASH_BUCKETS;

                rc = avtab_alloc_common(h, nslot);
                if (rc)
                        return rc;
        }

        /* Both values are u32; print them as unsigned. */
        pr_debug("SELinux: %u avtab hash slots, %u rules.\n", nslot, nrules);
        return 0;
}

int avtab_alloc_dup(struct avtab *new, const struct avtab *orig)
{
        return avtab_alloc_common(new, orig->nslot);
}

#ifdef CONFIG_SECURITY_SELINUX_DEBUG
/*
 * Emit a debug line describing how well the entries of @h are spread over
 * its buckets: entry count, used/total buckets, longest chain length and
 * the sum of squared chain lengths. @tag prefixes the message.
 */
void avtab_hash_eval(struct avtab *h, const char *tag)
{
        u32 i, chain_len, slots_used, max_chain_len;
        unsigned long long chain2_len_sum;
        struct avtab_node *cur;

        slots_used = 0;
        max_chain_len = 0;
        chain2_len_sum = 0;
        for (i = 0; i < h->nslot; i++) {
                cur = h->htable[i];
                if (cur) {
                        slots_used++;
                        chain_len = 0;
                        while (cur) {
                                chain_len++;
                                cur = cur->next;
                        }

                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                        /* Widen before squaring so the product cannot wrap. */
                        chain2_len_sum +=
                                (unsigned long long)chain_len * chain_len;
                }
        }

        /* All counters are u32; print them as unsigned. */
        pr_debug("SELinux: %s:  %u entries and %u/%u buckets used, "
                 "longest chain length %u, sum of chain length^2 %llu\n",
                 tag, h->nel, slots_used, h->nslot, max_chain_len,
                 chain2_len_sum);
}
#endif /* CONFIG_SECURITY_SELINUX_DEBUG */

/*
 * Specifier flags in the order avtab_read_item() checks them when parsing
 * an old-format (pre-POLICYDB_VERSION_AVTAB) entry: for each flag set in
 * the entry, one datum word is consumed from the input in this order.
 */
/* clang-format off */
static const uint16_t spec_order[] = {
        AVTAB_ALLOWED,
        AVTAB_AUDITDENY,
        AVTAB_AUDITALLOW,
        AVTAB_TRANSITION,
        AVTAB_CHANGE,
        AVTAB_MEMBER,
        AVTAB_XPERMS_ALLOWED,
        AVTAB_XPERMS_AUDITALLOW,
        AVTAB_XPERMS_DONTAUDIT
};
/* clang-format on */

/*
 * Read one access vector table entry from @fp and pass it to @insertf.
 *
 * @a:           table handed through to @insertf
 * @fp:          policy file positioned at the start of the entry
 * @pol:         policy database; supplies the policy version and is used to
 *               validate type and class values of new-format entries
 * @insertf:     callback that stores a parsed key/datum pair
 * @p:           opaque argument forwarded to @insertf
 * @conditional: true when the entry belongs to a conditional policy
 *
 * Policies older than POLICYDB_VERSION_AVTAB use a format in which a single
 * entry may carry several specifiers; one key/datum pair is passed to
 * @insertf for each of them. Newer policies carry exactly one specifier per
 * entry.
 *
 * Returns 0 on success, -EINVAL for a malformed entry, or the error
 * returned by next_entry() or @insertf.
 */
int avtab_read_item(struct avtab *a, struct policy_file *fp, struct policydb *pol,
                    int (*insertf)(struct avtab *a, const struct avtab_key *k,
                                   const struct avtab_datum *d, void *p),
                    void *p, bool conditional)
{
        __le16 buf16[4];
        u16 enabled;
        u32 items, items2, val, i;
        struct avtab_key key;
        struct avtab_datum datum;
        struct avtab_extended_perms xperms;
        __le32 buf32[ARRAY_SIZE(xperms.perms.p)];
        int rc;
        unsigned int set, vers = pol->policyvers;

        memset(&key, 0, sizeof(struct avtab_key));
        memset(&datum, 0, sizeof(struct avtab_datum));

        if (vers < POLICYDB_VERSION_AVTAB) {
                /* Old format: a word count followed by that many words. */
                rc = next_entry(buf32, fp, sizeof(u32));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                items2 = le32_to_cpu(buf32[0]);
                if (items2 > ARRAY_SIZE(buf32)) {
                        pr_err("SELinux: avtab: entry overflow\n");
                        return -EINVAL;
                }
                rc = next_entry(buf32, fp, sizeof(u32) * items2);
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                /*
                 * Source type, target type, target class and the flags word
                 * are always present; without them the reads below would
                 * consume words that were never filled in.
                 */
                if (items2 < 4) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return -EINVAL;
                }
                items = 0;

                /* Each key field must fit in 16 bits. */
                val = le32_to_cpu(buf32[items++]);
                key.source_type = (u16)val;
                if (key.source_type != val) {
                        pr_err("SELinux: avtab: truncated source type\n");
                        return -EINVAL;
                }
                val = le32_to_cpu(buf32[items++]);
                key.target_type = (u16)val;
                if (key.target_type != val) {
                        pr_err("SELinux: avtab: truncated target type\n");
                        return -EINVAL;
                }
                val = le32_to_cpu(buf32[items++]);
                key.target_class = (u16)val;
                if (key.target_class != val) {
                        pr_err("SELinux: avtab: truncated target class\n");
                        return -EINVAL;
                }

                val = le32_to_cpu(buf32[items++]);
                enabled = (val & AVTAB_ENABLED_OLD) ? AVTAB_ENABLED : 0;

                if (!(val & (AVTAB_AV | AVTAB_TYPE))) {
                        pr_err("SELinux: avtab: null entry\n");
                        return -EINVAL;
                }
                if ((val & AVTAB_AV) && (val & AVTAB_TYPE)) {
                        pr_err("SELinux: avtab: entry has both access vectors and types\n");
                        return -EINVAL;
                }
                if (val & AVTAB_XPERMS) {
                        pr_err("SELinux: avtab: entry has extended permissions\n");
                        return -EINVAL;
                }

                /* One datum word, and one insertion, per specifier set. */
                for (i = 0; i < ARRAY_SIZE(spec_order); i++) {
                        if (val & spec_order[i]) {
                                /* Never use a word the entry did not carry. */
                                if (items >= items2) {
                                        pr_err("SELinux: avtab: truncated entry\n");
                                        return -EINVAL;
                                }
                                key.specified = spec_order[i] | enabled;
                                datum.u.data = le32_to_cpu(buf32[items++]);
                                rc = insertf(a, &key, &datum, p);
                                if (rc)
                                        return rc;
                        }
                }

                if (items != items2) {
                        pr_err("SELinux: avtab: entry only had %u items, expected %u\n",
                               items2, items);
                        return -EINVAL;
                }
                return 0;
        }

        /* New format: four 16-bit key fields followed by the datum. */
        rc = next_entry(buf16, fp, sizeof(u16) * 4);
        if (rc) {
                pr_err("SELinux: avtab: truncated entry\n");
                return rc;
        }

        items = 0;
        key.source_type = le16_to_cpu(buf16[items++]);
        key.target_type = le16_to_cpu(buf16[items++]);
        key.target_class = le16_to_cpu(buf16[items++]);
        key.specified = le16_to_cpu(buf16[items++]);

        if (!policydb_type_isvalid(pol, key.source_type) ||
            !policydb_type_isvalid(pol, key.target_type) ||
            !policydb_class_isvalid(pol, key.target_class)) {
                pr_err("SELinux: avtab: invalid type or class\n");
                return -EINVAL;
        }

        /* Exactly one specifier bit must be set. */
        set = hweight16(key.specified & (AVTAB_XPERMS | AVTAB_TYPE | AVTAB_AV));
        if (!set) {
                pr_err("SELinux:  avtab:  no specifier\n");
                return -EINVAL;
        }
        if (set > 1) {
                pr_err("SELinux:  avtab:  more than one specifier\n");
                return -EINVAL;
        }

        if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) &&
            (key.specified & AVTAB_XPERMS)) {
                pr_err("SELinux:  avtab:  policy version %u does not "
                       "support extended permissions rules and one "
                       "was specified\n",
                       vers);
                return -EINVAL;
        } else if ((vers < POLICYDB_VERSION_COND_XPERMS) &&
                   (key.specified & AVTAB_XPERMS) && conditional) {
                pr_err("SELinux:  avtab:  policy version %u does not "
                       "support extended permissions rules in conditional "
                       "policies and one was specified\n",
                       vers);
                return -EINVAL;
        } else if (key.specified & AVTAB_XPERMS) {
                /* xperms datum: specified byte, driver byte, permission words. */
                memset(&xperms, 0, sizeof(struct avtab_extended_perms));
                rc = next_entry(&xperms.specified, fp, sizeof(u8));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                rc = next_entry(&xperms.driver, fp, sizeof(u8));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                rc = next_entry(buf32, fp,
                                sizeof(u32) * ARRAY_SIZE(xperms.perms.p));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++)
                        xperms.perms.p[i] = le32_to_cpu(buf32[i]);
                /* Points at a stack object, valid only during insertf(). */
                datum.u.xperms = &xperms;
        } else {
                rc = next_entry(buf32, fp, sizeof(u32));
                if (rc) {
                        pr_err("SELinux: avtab: truncated entry\n");
                        return rc;
                }
                datum.u.data = le32_to_cpu(*buf32);
        }
        /* For type rules the datum is itself a type and must be valid. */
        if ((key.specified & AVTAB_TYPE) &&
            !policydb_type_isvalid(pol, datum.u.data)) {
                pr_err("SELinux: avtab: invalid type\n");
                return -EINVAL;
        }
        return insertf(a, &key, &datum, p);
}

/*
 * Insertion callback for avtab_read_item(): stores the pair with
 * avtab_insert(). The opaque argument @p is not used.
 */
static int avtab_insertf(struct avtab *a, const struct avtab_key *k,
                         const struct avtab_datum *d, void *p)
{
        int rc = avtab_insert(a, k, d);

        return rc;
}

/*
 * Read a whole access vector table from @fp into @a.
 *
 * The table starts with a 32-bit little-endian entry count, which must be
 * non-zero, followed by that many entries. On any failure the partially
 * built table is destroyed.
 *
 * Returns 0 on success or a negative error code.
 */
int avtab_read(struct avtab *a, struct policy_file *fp, struct policydb *pol)
{
        __le32 raw_nel;
        u32 nel, idx;
        int rc;

        rc = next_entry(&raw_nel, fp, sizeof(u32));
        if (rc < 0) {
                pr_err("SELinux: avtab: truncated table\n");
                goto err_destroy;
        }

        nel = le32_to_cpu(raw_nel);
        if (nel == 0) {
                pr_err("SELinux: avtab: table is empty\n");
                rc = -EINVAL;
                goto err_destroy;
        }

        rc = avtab_alloc(a, nel);
        if (rc)
                goto err_destroy;

        for (idx = 0; idx < nel; idx++) {
                rc = avtab_read_item(a, fp, pol, avtab_insertf, NULL, false);
                if (!rc)
                        continue;

                if (rc == -ENOMEM)
                        pr_err("SELinux: avtab: out of memory\n");
                else if (rc == -EEXIST)
                        pr_err("SELinux: avtab: duplicate entry\n");
                goto err_destroy;
        }

        return 0;

err_destroy:
        avtab_destroy(a);
        return rc;
}

int avtab_write_item(struct policydb *p, const struct avtab_node *cur, struct policy_file *fp)
{
        __le16 buf16[4];
        __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)];
        int rc;
        unsigned int i;

        buf16[0] = cpu_to_le16(cur->key.source_type);
        buf16[1] = cpu_to_le16(cur->key.target_type);
        buf16[2] = cpu_to_le16(cur->key.target_class);
        buf16[3] = cpu_to_le16(cur->key.specified);
        rc = put_entry(buf16, sizeof(u16), 4, fp);
        if (rc)
                return rc;

        if (cur->key.specified & AVTAB_XPERMS) {
                rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1,
                               fp);
                if (rc)
                        return rc;
                rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp);
                if (rc)
                        return rc;
                for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++)
                        buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]);
                rc = put_entry(buf32, sizeof(u32),
                               ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp);
        } else {
                buf32[0] = cpu_to_le32(cur->datum.u.data);
                rc = put_entry(buf32, sizeof(u32), 1, fp);
        }
        if (rc)
                return rc;
        return 0;
}

/*
 * Write the table @a to @fp: the entry count as a 32-bit little-endian
 * word, then every node, bucket by bucket, via avtab_write_item().
 *
 * Returns 0 on success or the first error encountered.
 */
int avtab_write(struct policydb *p, struct avtab *a, struct policy_file *fp)
{
        __le32 nel = cpu_to_le32(a->nel);
        struct avtab_node *node;
        u32 slot;
        int rc;

        rc = put_entry(&nel, sizeof(u32), 1, fp);
        if (rc)
                return rc;

        for (slot = 0; slot < a->nslot; slot++) {
                node = a->htable[slot];
                while (node) {
                        rc = avtab_write_item(p, node, fp);
                        if (rc)
                                return rc;
                        node = node->next;
                }
        }

        return 0;
}

/*
 * Create the slab caches used for avtab nodes and for extended-permission
 * payloads. Both are created with SLAB_PANIC, so no error is returned here
 * (presumably the kernel panics if creation fails - confirm against the
 * slab documentation).
 */
void __init avtab_cache_init(void)
{
        avtab_node_cachep = KMEM_CACHE(avtab_node, SLAB_PANIC);
        avtab_xperms_cachep = KMEM_CACHE(avtab_extended_perms, SLAB_PANIC);
}
















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __FS_NOTIFY_FSNOTIFY_H_
#define __FS_NOTIFY_FSNOTIFY_H_

#include <linux/list.h>
#include <linux/fsnotify.h>
#include <linux/srcu.h>
#include <linux/types.h>

#include "../mount.h"

/*
 * fsnotify_connp_t is the pointer type embedded in objects to which a
 * mark connector can be attached.
 */
typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t;

/* Return the object attached to @conn, typed as an inode. */
static inline struct inode *fsnotify_conn_inode(
                                struct fsnotify_mark_connector *conn)
{
        struct inode *inode = conn->obj;

        return inode;
}

/* Return the mount that real_mount() yields for the object attached to @conn. */
static inline struct mount *fsnotify_conn_mount(
                                struct fsnotify_mark_connector *conn)
{
        struct vfsmount *mnt = conn->obj;

        return real_mount(mnt);
}

/* Return the object attached to @conn, typed as a super block. */
static inline struct super_block *fsnotify_conn_sb(
                                struct fsnotify_mark_connector *conn)
{
        struct super_block *sb = conn->obj;

        return sb;
}

/* Return the object attached to @conn, typed as a mount namespace. */
static inline struct mnt_namespace *fsnotify_conn_mntns(
                                struct fsnotify_mark_connector *conn)
{
        struct mnt_namespace *mntns = conn->obj;

        return mntns;
}

/*
 * Return the super block associated with @obj, interpreting @obj according
 * to @obj_type (inode, vfsmount or super block). Returns NULL for any other
 * object type.
 */
static inline struct super_block *fsnotify_object_sb(void *obj,
                        enum fsnotify_obj_type obj_type)
{
        if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
                struct inode *inode = obj;

                return inode->i_sb;
        }
        if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
                struct vfsmount *mnt = obj;

                return mnt->mnt_sb;
        }
        if (obj_type == FSNOTIFY_OBJ_TYPE_SB)
                return obj;

        return NULL;
}

/* Return the super block of the object @conn is attached to, or NULL. */
static inline struct super_block *fsnotify_connector_sb(
                                struct fsnotify_mark_connector *conn)
{
        void *obj = conn->obj;

        return fsnotify_object_sb(obj, conn->type);
}

/*
 * Return the address of the marks connector pointer of @sb, or NULL when
 * @sb has no fsnotify info.
 */
static inline fsnotify_connp_t *fsnotify_sb_marks(struct super_block *sb)
{
        struct fsnotify_sb_info *info = fsnotify_sb_info(sb);

        if (!info)
                return NULL;

        return &info->sb_marks;
}

/* destroy all events sitting in this group's notification queue */
extern void fsnotify_flush_notify(struct fsnotify_group *group);

/* protects reads of inode and vfsmount marks list */
extern struct srcu_struct fsnotify_mark_srcu;

/* compare two groups for sorting of marks lists */
extern int fsnotify_compare_groups(struct fsnotify_group *a,
                                   struct fsnotify_group *b);

/* Destroy all marks attached to an object via connector */
extern void fsnotify_destroy_marks(fsnotify_connp_t *connp);
/* run the list of all marks associated with inode and destroy them */
static inline void fsnotify_clear_marks_by_inode(struct inode *inode)
{
        fsnotify_destroy_marks(&inode->i_fsnotify_marks);
}
/* run the list of all marks associated with vfsmount and destroy them */
static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt)
{
        fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks);
}
/* run the list of all marks associated with sb and destroy them */
static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
{
        fsnotify_destroy_marks(fsnotify_sb_marks(sb));
}

static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns)
{
        fsnotify_destroy_marks(&mntns->n_fsnotify_marks);
}

/*
 * update the dentry->d_flags of all of inode's children to indicate if inode cares
 * about events that happen to its children.
 */
extern void fsnotify_set_children_dentry_flags(struct inode *inode);

extern struct kmem_cache *fsnotify_mark_connector_cachep;

#endif        /* __FS_NOTIFY_FSNOTIFY_H_ */












  595 

















  598 










  599 

  597 

  595 
  600 


















  208 







  208 

  209 

  208 
  208 































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Access kernel or user memory without faulting.
 */
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <asm/tlb.h>

/*
 * Weak default for the check that copy_from_kernel_nofault() and
 * strncpy_from_kernel_nofault() make before reading @size bytes at
 * @unsafe_src. This default allows every range.
 */
bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size)
{
        return true;
}

/*
 * Copy from @src to @dst in units of @type for as long as at least
 * sizeof(type) bytes remain in @len. @dst, @src and @len are modified in
 * place, so callers must pass lvalues. @err_label is handed to
 * __get_kernel_nofault() (presumably the label branched to when the access
 * faults - confirm against the arch definition).
 *
 * The below only uses kmsan_check_memory() to ensure uninitialized kernel
 * memory isn't leaked.
 */
#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label)        \
        while (len >= sizeof(type)) {                                        \
                __get_kernel_nofault(dst, src, type, err_label);        \
                kmsan_check_memory(src, sizeof(type));                        \
                dst += sizeof(type);                                        \
                src += sizeof(type);                                        \
                len -= sizeof(type);                                        \
        }

/*
 * copy_from_kernel_nofault(): copy @size bytes from @src to @dst with page
 * faults disabled.
 *
 * Returns 0 on success, -ERANGE when copy_from_kernel_nofault_allowed()
 * rejects the source range, and -EFAULT when the copy exits through the
 * Efault label.
 */
long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
        unsigned long align = 0;

        /*
         * Without efficient unaligned access, the low address bits of both
         * pointers decide which access widths may be used below.
         */
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                align = (unsigned long)dst | (unsigned long)src;

        if (!copy_from_kernel_nofault_allowed(src, size))
                return -ERANGE;

        pagefault_disable();
        /* Widest permitted unit first; each loop leaves a smaller remainder. */
        if (!(align & 7))
                copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
        if (!(align & 3))
                copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
        if (!(align & 1))
                copy_from_kernel_nofault_loop(dst, src, size, u16, Efault);
        copy_from_kernel_nofault_loop(dst, src, size, u8, Efault);
        pagefault_enable();
        return 0;
Efault:
        pagefault_enable();
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);

/*
 * Copy from @src to @dst in units of @type for as long as at least
 * sizeof(type) bytes remain in @len, calling instrument_write() on each
 * destination unit. @dst, @src and @len are modified in place, so callers
 * must pass lvalues. @err_label is handed to __put_kernel_nofault()
 * (presumably the label branched to when the access faults - confirm
 * against the arch definition).
 */
#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label)        \
        while (len >= sizeof(type)) {                                        \
                __put_kernel_nofault(dst, src, type, err_label);        \
                instrument_write(dst, sizeof(type));                        \
                dst += sizeof(type);                                        \
                src += sizeof(type);                                        \
                len -= sizeof(type);                                        \
        }

/*
 * copy_to_kernel_nofault(): copy @size bytes from @src to @dst with page
 * faults disabled.
 *
 * Returns 0 on success and -EFAULT when the copy exits through the Efault
 * label.
 */
long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
        unsigned long align = 0;

        /*
         * Without efficient unaligned access, the low address bits of both
         * pointers decide which access widths may be used below.
         */
        if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
                align = (unsigned long)dst | (unsigned long)src;

        pagefault_disable();
        /* Widest permitted unit first; each loop leaves a smaller remainder. */
        if (!(align & 7))
                copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
        if (!(align & 3))
                copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
        if (!(align & 1))
                copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
        copy_to_kernel_nofault_loop(dst, src, size, u8, Efault);
        pagefault_enable();
        return 0;
Efault:
        pagefault_enable();
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(copy_to_kernel_nofault);

/*
 * strncpy_from_kernel_nofault(): copy a NUL-terminated string from
 * @unsafe_addr into @dst with page faults disabled.
 *
 * At most @count bytes are written. The last byte written is always set to
 * NUL, so a string that does not fit is truncated.
 *
 * Returns the number of bytes written including the trailing NUL, 0 when
 * @count is not positive, -ERANGE when copy_from_kernel_nofault_allowed()
 * rejects the source range, and -EFAULT when the copy exits through the
 * Efault label (the bytes copied so far are then NUL-terminated).
 */
long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
        const void *src = unsafe_addr;

        if (unlikely(count <= 0))
                return 0;
        if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
                return -ERANGE;

        pagefault_disable();
        /* Byte-wise copy; stop after a NUL or once @count bytes were copied. */
        do {
                __get_kernel_nofault(dst, src, u8, Efault);
                dst++;
                src++;
        } while (dst[-1] && src - unsafe_addr < count);
        pagefault_enable();

        /* Terminate in place; this overwrites the last byte on truncation. */
        dst[-1] = '\0';
        return src - unsafe_addr;
Efault:
        pagefault_enable();
        /* dst points just past the bytes copied so far. */
        dst[0] = '\0';
        return -EFAULT;
}

/**
 * copy_from_user_nofault(): safely attempt to read from a user-space location
 * @dst: pointer to the buffer that shall take the data
 * @src: address to read from. This must be a user address.
 * @size: size of the data chunk
 *
 * Reads @size bytes from user address @src into @dst with page faults
 * disabled. Returns 0 on success, or -EFAULT if the range fails the access
 * check, user access is not permitted in the current (NMI) context, or any
 * part of the copy could not be completed.
 */
long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
{
        long left;

        if (!__access_ok(src, size) || !nmi_uaccess_okay())
                return -EFAULT;

        pagefault_disable();
        left = __copy_from_user_inatomic(dst, src, size);
        pagefault_enable();

        return left ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(copy_from_user_nofault);

/**
 * copy_to_user_nofault(): safely attempt to write to a user-space location
 * @dst: address to write to
 * @src: pointer to the data that shall be written
 * @size: size of the data chunk
 *
 * Writes @size bytes from @src to user address @dst with page faults
 * disabled. Returns 0 on success, or -EFAULT if the range fails the access
 * check or any part of the copy could not be completed.
 */
long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
{
        long left;

        if (!access_ok(dst, size))
                return -EFAULT;

        pagefault_disable();
        left = __copy_to_user_inatomic(dst, src, size);
        pagefault_enable();

        return left ? -EFAULT : 0;
}
EXPORT_SYMBOL_GPL(copy_to_user_nofault);

/**
 * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
 *                                address.
 * @dst:   Destination address, in kernel space.  This buffer must be at
 *         least @count bytes long.
 * @unsafe_addr: Unsafe user address.
 * @count: Maximum number of bytes to copy, including the trailing NUL.
 *
 * Copies a NUL-terminated string from an unsafe user address into a kernel
 * buffer with page faults disabled.
 *
 * On success, returns the length of the string INCLUDING the trailing NUL.
 * Returns 0 when @count is not positive.
 *
 * If access fails, returns -EFAULT (some data may have been copied
 * and the trailing NUL added).
 *
 * If @count is smaller than the length of the string, copies @count-1 bytes,
 * sets the last byte of @dst buffer to NUL and returns @count.
 */
long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
                              long count)
{
        long copied;

        if (unlikely(count <= 0))
                return 0;

        pagefault_disable();
        copied = strncpy_from_user(dst, unsafe_addr, count);
        pagefault_enable();

        /* Buffer filled without a NUL: terminate and report @count. */
        if (copied >= count) {
                dst[count - 1] = '\0';
                return count;
        }

        /* Account for the trailing NUL in the returned length. */
        if (copied > 0)
                return copied + 1;

        return copied;
}

/**
 * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
 * @unsafe_addr: The string to measure.
 * @count: Maximum count (including NUL)
 *
 * Get the size of a NUL-terminated string in user space without pagefault.
 *
 * Returns the size of the string INCLUDING the terminating NUL.
 *
 * If the string is too long, returns a number larger than @count. User
 * has to check the return value against "> count".
 * On exception (or invalid count), returns 0.
 *
 * Unlike strnlen_user, this can be used from IRQ handler etc. because
 * it disables pagefaults.
 */
long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
        /* Keep the full width of the result; an int would truncate it. */
        long ret;

        pagefault_disable();
        ret = strnlen_user(unsafe_addr, count);
        pagefault_enable();

        return ret;
}

/*
 * Unconditionally emit a kernel warning of the form
 * "Buffer overflow detected (<size> < <count>)!".
 *
 * @size:  printed first; presumably the size of the buffer involved
 *         (TODO confirm against callers).
 * @count: printed second; presumably the number of bytes requested
 *         (TODO confirm against callers).
 */
void __copy_overflow(int size, unsigned long count)
{
        WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
}
EXPORT_SYMBOL(__copy_overflow);
















































































































































































    4 































    4 




    4 























    4 




    4 








    4 








    4 























    4 




    4 




































    4 

    4 



























































    4 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/readpage.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2015, Google, Inc.
 *
 * This was originally taken from fs/mpage.c
 *
 * The ext4_mpage_readpages() function here is intended to
 * replace mpage_readahead() in the general case, not just for
 * encrypted files.  It has some limitations (see below), where it
 * will fall back to block_read_full_folio(), but these limitations
 * should only be hit when page_size != block_size.
 *
 * This will allow us to attach a callback function to support ext4
 * encryption.
 *
 * If anything unusual happens, such as:
 *
 * - encountering a page which has buffers
 * - encountering a page which has a non-hole after a hole
 * - encountering a page with non-contiguous blocks
 *
 * then this code just gives up and calls the buffer_head-based read function.
 * It does handle a page which has holes at the end - that is a common case:
 * the end-of-file on blocksize < PAGE_SIZE setups.
 *
 */

#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/kdev_t.h>
#include <linux/gfp.h>
#include <linux/bio.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/prefetch.h>
#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>

#include "ext4.h"

#define NUM_PREALLOC_POST_READ_CTXS        128

static struct kmem_cache *bio_post_read_ctx_cache;
static mempool_t *bio_post_read_ctx_pool;

/*
 * Postprocessing steps for read bios.
 *
 * The values are used both as the running step counter
 * (bio_post_read_ctx.cur_step) and as bit positions in
 * bio_post_read_ctx.enabled_steps (1 << STEP_*).
 */
enum bio_post_read_step {
        /* Value cur_step is reset to before post-read processing starts. */
        STEP_INITIAL = 0,
        /* Decrypt the bio's contents. */
        STEP_DECRYPT,
        /* Verify the bio's contents with fs-verity; must be the last step. */
        STEP_VERITY,
        /* One past the last real step. */
        STEP_MAX,
};

/*
 * Per-bio state for post-read processing.  Allocated from
 * bio_post_read_ctx_pool and attached to the bio through bio->bi_private.
 */
struct bio_post_read_ctx {
        /* The bio this context belongs to. */
        struct bio *bio;
        /* Work item used to run decrypt_work() or verity_work(). */
        struct work_struct work;
        /* Step currently being processed (enum bio_post_read_step). */
        unsigned int cur_step;
        /* Bitmask of (1 << STEP_*) for the steps this bio needs. */
        unsigned int enabled_steps;
};

static void __read_end_io(struct bio *bio)
{
        struct folio_iter fi;

        bio_for_each_folio_all(fi, bio)
                folio_end_read(fi.folio, bio->bi_status == 0);
        if (bio->bi_private)
                mempool_free(bio->bi_private, bio_post_read_ctx_pool);
        bio_put(bio);
}

static void bio_post_read_processing(struct bio_post_read_ctx *ctx);

/*
 * Work handler for the decryption step: decrypt the bio, then either move
 * on to the next post-read step or, if decryption failed, finish the bio.
 */
static void decrypt_work(struct work_struct *work)
{
        struct bio_post_read_ctx *post_ctx;
        struct bio *target;

        post_ctx = container_of(work, struct bio_post_read_ctx, work);
        target = post_ctx->bio;

        if (!fscrypt_decrypt_bio(target)) {
                __read_end_io(target);
                return;
        }

        bio_post_read_processing(post_ctx);
}

/*
 * Work handler for the verity step: release the post-read context, run
 * fs-verity verification on the bio, then finish the bio.
 */
static void verity_work(struct work_struct *work)
{
        struct bio_post_read_ctx *ctx =
                container_of(work, struct bio_post_read_ctx, work);
        /* Fetch the bio before ctx is freed below. */
        struct bio *bio = ctx->bio;

        /*
         * fsverity_verify_bio() may call readahead() again, and although verity
         * will be disabled for that, decryption may still be needed, causing
         * another bio_post_read_ctx to be allocated.  So to guarantee that
         * mempool_alloc() never deadlocks we must free the current ctx first.
         * This is safe because verity is the last post-read step.
         */
        BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX);
        mempool_free(ctx, bio_post_read_ctx_pool);
        /* Clear bi_private so __read_end_io() does not free ctx again. */
        bio->bi_private = NULL;

        fsverity_verify_bio(bio);

        __read_end_io(bio);
}

/*
 * Advance @ctx to its next post-read step.  An enabled step is handed to
 * its workqueue and this function returns; disabled steps are skipped.
 * Once no step is left the bio is finished.
 */
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
{
        unsigned int step = ++ctx->cur_step;

        /*
         * Decryption and verity run on separate workqueues: verity may have
         * to read metadata pages that themselves need decryption, and that
         * must not recurse into the same workqueue.
         */
        if (step == STEP_DECRYPT) {
                if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
                        INIT_WORK(&ctx->work, decrypt_work);
                        fscrypt_enqueue_decrypt_work(&ctx->work);
                        return;
                }
                /* Decryption is not enabled: go straight to verity. */
                step = ++ctx->cur_step;
        }

        if (step == STEP_VERITY) {
                if (ctx->enabled_steps & (1 << STEP_VERITY)) {
                        INIT_WORK(&ctx->work, verity_work);
                        fsverity_enqueue_verify_work(&ctx->work);
                        return;
                }
                ctx->cur_step++;
        }

        /* Nothing (more) to do for this bio. */
        __read_end_io(ctx->bio);
}

static bool bio_post_read_required(struct bio *bio)
{
        return bio->bi_private && !bio->bi_status;
}

/*
 * I/O completion handler for multipage BIOs.
 *
 * The mpage code never puts partial pages into a BIO (except for end-of-file).
 * If a page does not map to a contiguous run of blocks then it simply falls
 * back to block_read_full_folio().
 *
 * Why is this?  If a page's completion depends on a number of different BIOs
 * which can complete in any order (or at the same time) then determining the
 * status of that page is hard.  See end_buffer_async_read() for the details.
 * There is no point in duplicating all that complexity.
 */
static void mpage_end_io(struct bio *bio)
{
        struct bio_post_read_ctx *post_ctx;

        /* No post-read work (or the read failed): finish immediately. */
        if (!bio_post_read_required(bio)) {
                __read_end_io(bio);
                return;
        }

        /* Restart the step counter and kick off the first enabled step. */
        post_ctx = bio->bi_private;
        post_ctx->cur_step = STEP_INITIAL;
        bio_post_read_processing(post_ctx);
}

/*
 * Tell whether the page at index @idx of @inode needs fs-verity
 * verification: verity must be active on the inode and the page must start
 * below i_size rounded up to a whole page.
 */
static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx)
{
        if (!fsverity_active(inode))
                return false;

        return idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

/*
 * Work out which post-read steps @bio needs for @inode (decryption and/or
 * verity, the latter judged on @first_idx) and, if there are any, attach a
 * freshly allocated context describing them to bio->bi_private.
 */
static void ext4_set_bio_post_read_ctx(struct bio *bio,
                                       const struct inode *inode,
                                       pgoff_t first_idx)
{
        struct bio_post_read_ctx *post_ctx;
        unsigned int steps = 0;

        if (fscrypt_inode_uses_fs_layer_crypto(inode))
                steps |= 1 << STEP_DECRYPT;

        if (ext4_need_verity(inode, first_idx))
                steps |= 1 << STEP_VERITY;

        /* Plain read: leave bi_private untouched. */
        if (!steps)
                return;

        /* Due to the mempool, this never fails. */
        post_ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
        post_ctx->bio = bio;
        post_ctx->enabled_steps = steps;
        bio->bi_private = post_ctx;
}

static inline loff_t ext4_readpage_limit(struct inode *inode)
{
        if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
                return inode->i_sb->s_maxbytes;

        return i_size_read(inode);
}

/*
 * Read one folio, or every folio of a readahead batch, of @inode.
 *
 * @inode: inode whose data is read.
 * @rac:   readahead control; when non-NULL, readahead_count(rac) folios are
 *         taken from it with readahead_folio() and @folio is ignored.
 * @folio: the single folio to read when @rac is NULL.
 *
 * Each folio is mapped with ext4_map_blocks() (reusing the previous mapping
 * where it still covers the folio) and added to a read bio as long as its
 * blocks are physically contiguous.  Folios that already have buffers, that
 * have a mapped block after a hole, or whose blocks are not contiguous take
 * the "confused" path and are read through block_read_full_folio().
 *
 * Always returns 0; a mapping failure only zeroes and unlocks the affected
 * folio.
 */
int ext4_mpage_readpages(struct inode *inode,
                struct readahead_control *rac, struct folio *folio)
{
        struct bio *bio = NULL;
        sector_t last_block_in_bio = 0;

        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
        const unsigned blocksize = 1 << blkbits;
        sector_t next_block;
        sector_t block_in_file;
        sector_t last_block;
        sector_t last_block_in_file;
        sector_t first_block;
        unsigned page_block;
        struct block_device *bdev = inode->i_sb->s_bdev;
        int length;
        unsigned relative_block = 0;
        struct ext4_map_blocks map;
        unsigned int nr_pages = rac ? readahead_count(rac) : 1;

        /* Start with an empty mapping so nothing is reused on the first pass. */
        map.m_pblk = 0;
        map.m_lblk = 0;
        map.m_len = 0;
        map.m_flags = 0;

        for (; nr_pages; nr_pages--) {
                int fully_mapped = 1;
                /* blocks_per_page doubles as "no hole found in this folio". */
                unsigned first_hole = blocks_per_page;

                if (rac)
                        folio = readahead_folio(rac);
                prefetchw(&folio->flags);

                /* Folios that already have buffers go the buffer_head way. */
                if (folio_buffers(folio))
                        goto confused;

                /* First file block covered by this folio. */
                block_in_file = next_block =
                        (sector_t)folio->index << (PAGE_SHIFT - blkbits);
                /*
                 * nr_pages counts the folios still to be processed (this one
                 * included), so last_block is the end of the remaining range,
                 * clamped below to the read limit of the inode.
                 */
                last_block = block_in_file + nr_pages * blocks_per_page;
                last_block_in_file = (ext4_readpage_limit(inode) +
                                      blocksize - 1) >> blkbits;
                if (last_block > last_block_in_file)
                        last_block = last_block_in_file;
                page_block = 0;

                /*
                 * Map blocks using the previous result first.
                 */
                if ((map.m_flags & EXT4_MAP_MAPPED) &&
                    block_in_file > map.m_lblk &&
                    block_in_file < (map.m_lblk + map.m_len)) {
                        unsigned map_offset = block_in_file - map.m_lblk;
                        unsigned last = map.m_len - map_offset;

                        first_block = map.m_pblk + map_offset;
                        /* Consume blocks until the mapping or the folio ends. */
                        for (relative_block = 0; ; relative_block++) {
                                if (relative_block == last) {
                                        /* needed? */
                                        map.m_flags &= ~EXT4_MAP_MAPPED;
                                        break;
                                }
                                if (page_block == blocks_per_page)
                                        break;
                                page_block++;
                                block_in_file++;
                        }
                }

                /*
                 * Then do more ext4_map_blocks() calls until we are
                 * done with this folio.
                 */
                while (page_block < blocks_per_page) {
                        if (block_in_file < last_block) {
                                map.m_lblk = block_in_file;
                                map.m_len = last_block - block_in_file;

                                if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
                                /*
                                 * Error exit, also reached from the verity
                                 * failure below: zero the whole folio and
                                 * unlock it.
                                 */
                                set_error_page:
                                        folio_zero_segment(folio, 0,
                                                          folio_size(folio));
                                        folio_unlock(folio);
                                        goto next_page;
                                }
                        }
                        /* Unmapped block: record the first hole and move on. */
                        if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
                                fully_mapped = 0;
                                if (first_hole == blocks_per_page)
                                        first_hole = page_block;
                                page_block++;
                                block_in_file++;
                                continue;
                        }
                        if (first_hole != blocks_per_page)
                                goto confused;                /* hole -> non-hole */

                        /* Contiguous blocks? */
                        if (!page_block)
                                first_block = map.m_pblk;
                        else if (first_block + page_block != map.m_pblk)
                                goto confused;
                        /* Consume blocks until the mapping or the folio ends. */
                        for (relative_block = 0; ; relative_block++) {
                                if (relative_block == map.m_len) {
                                        /* needed? */
                                        map.m_flags &= ~EXT4_MAP_MAPPED;
                                        break;
                                } else if (page_block == blocks_per_page)
                                        break;
                                page_block++;
                                block_in_file++;
                        }
                }
                if (first_hole != blocks_per_page) {
                        /* Zero everything from the first hole to the folio end. */
                        folio_zero_segment(folio, first_hole << blkbits,
                                          folio_size(folio));
                        if (first_hole == 0) {
                                /* Nothing mapped at all: no I/O is issued. */
                                if (ext4_need_verity(inode, folio->index) &&
                                    !fsverity_verify_folio(folio))
                                        goto set_error_page;
                                folio_end_read(folio, true);
                                continue;
                        }
                } else if (fully_mapped) {
                        folio_set_mappedtodisk(folio);
                }

                /*
                 * This folio will go to BIO.  Do we need to send this
                 * BIO off first?
                 */
                if (bio && (last_block_in_bio != first_block - 1 ||
                            !fscrypt_mergeable_bio(bio, inode, next_block))) {
                submit_and_realloc:
                        submit_bio(bio);
                        bio = NULL;
                }
                if (bio == NULL) {
                        /*
                         * bio_alloc will _always_ be able to allocate a bio if
                         * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset().
                         */
                        bio = bio_alloc(bdev, bio_max_segs(nr_pages),
                                        REQ_OP_READ, GFP_KERNEL);
                        fscrypt_set_bio_crypt_ctx(bio, inode, next_block,
                                                  GFP_KERNEL);
                        ext4_set_bio_post_read_ctx(bio, inode, folio->index);
                        bio->bi_iter.bi_sector = first_block << (blkbits - 9);
                        bio->bi_end_io = mpage_end_io;
                        if (rac)
                                bio->bi_opf |= REQ_RAHEAD;
                }

                /* Only the part of the folio before the first hole is read. */
                length = first_hole << blkbits;
                if (!bio_add_folio(bio, folio, length, 0))
                        goto submit_and_realloc;

                /*
                 * Submit right away when the mapping was used up and is
                 * flagged EXT4_MAP_BOUNDARY, or when the folio ends in a hole;
                 * otherwise remember the last block for the contiguity check
                 * of the next folio.
                 */
                if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
                     (relative_block == map.m_len)) ||
                    (first_hole != blocks_per_page)) {
                        submit_bio(bio);
                        bio = NULL;
                } else
                        last_block_in_bio = first_block + blocks_per_page - 1;
                continue;
        /* Fallback: flush the pending bio and use the buffer_head reader. */
        confused:
                if (bio) {
                        submit_bio(bio);
                        bio = NULL;
                }
                if (!folio_test_uptodate(folio))
                        block_read_full_folio(folio, ext4_get_block);
                else
                        folio_unlock(folio);
next_page:
                ; /* A label shall be followed by a statement until C23 */
        }
        /* Send off whatever is still queued. */
        if (bio)
                submit_bio(bio);
        return 0;
}

/*
 * Create the slab cache and the mempool that back bio_post_read_ctx
 * allocations.
 *
 * Returns 0 on success or -ENOMEM if either allocation fails; the cache is
 * destroyed again when the pool cannot be created.
 */
int __init ext4_init_post_read_processing(void)
{
        bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT);
        if (!bio_post_read_ctx_cache)
                return -ENOMEM;

        bio_post_read_ctx_pool =
                mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
                                         bio_post_read_ctx_cache);
        if (bio_post_read_ctx_pool)
                return 0;

        /* Pool creation failed: undo the cache creation. */
        kmem_cache_destroy(bio_post_read_ctx_cache);
        return -ENOMEM;
}

/*
 * Tear down what ext4_init_post_read_processing() set up: the mempool is
 * destroyed first, then the slab cache it was created on.
 */
void ext4_exit_post_read_processing(void)
{
        mempool_destroy(bio_post_read_ctx_pool);
        kmem_cache_destroy(bio_post_read_ctx_cache);
}














   58 

































  960 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/bitops.h>
#include <asm/types.h>

/**
 * hweightN - returns the hamming weight of a N-bit word
 * @x: the word to weigh
 *
 * The Hamming Weight of a number is the total number of bits set in it.
 */

/*
 * Software population count of a 32-bit word: adjacent bit fields are
 * summed in parallel (2-bit, 4-bit, then 8-bit fields) and the per-byte
 * counts are finally added together, by multiplication when the
 * architecture has a fast multiplier and by shifts otherwise.
 */
unsigned int __sw_hweight32(unsigned int w)
{
        const unsigned int mask_pairs = 0x55555555;
        const unsigned int mask_nibbles = 0x33333333;
        const unsigned int mask_bytes = 0x0f0f0f0f;
        unsigned int sum;

        /* Each 2-bit field now holds the count of its two bits. */
        sum = w - ((w >> 1) & mask_pairs);
        /* Each 4-bit field holds the count of its four bits. */
        sum = (sum & mask_nibbles) + ((sum >> 2) & mask_nibbles);
        /* Each byte holds the count of its eight bits. */
        sum = (sum + (sum >> 4)) & mask_bytes;

#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
        /* The multiply accumulates all four byte counts in the top byte. */
        return (sum * 0x01010101) >> 24;
#else
        sum += sum >> 8;
        sum += sum >> 16;
        return sum & 0xff;
#endif
}
EXPORT_SYMBOL(__sw_hweight32);

/*
 * Software population count of a 16-bit word, using the same parallel
 * field-summing scheme as __sw_hweight32() with 16-bit masks.
 */
unsigned int __sw_hweight16(unsigned int w)
{
        unsigned int sum;

        sum = w - ((w >> 1) & 0x5555);                  /* 2-bit counts */
        sum = (sum & 0x3333) + ((sum >> 2) & 0x3333);   /* 4-bit counts */
        sum = (sum + (sum >> 4)) & 0x0f0f;              /* per-byte counts */
        sum += sum >> 8;                                /* add both bytes */

        return sum & 0xff;
}
EXPORT_SYMBOL(__sw_hweight16);

/*
 * Software population count of an 8-bit word, using the same parallel
 * field-summing scheme as __sw_hweight32() with 8-bit masks.
 */
unsigned int __sw_hweight8(unsigned int w)
{
        unsigned int sum;

        sum = w - ((w >> 1) & 0x55);                /* 2-bit counts */
        sum = (sum & 0x33) + ((sum >> 2) & 0x33);   /* 4-bit counts */
        sum += sum >> 4;                            /* add both nibbles */

        return sum & 0x0f;
}
EXPORT_SYMBOL(__sw_hweight8);

/*
 * Software population count of a 64-bit word.  With 32-bit longs the two
 * halves are weighed separately by __sw_hweight32(); with 64-bit longs the
 * parallel field-summing scheme is applied directly to the 64-bit value.
 */
unsigned long __sw_hweight64(__u64 w)
{
#if BITS_PER_LONG == 32
        const unsigned int upper = (unsigned int)(w >> 32);
        const unsigned int lower = (unsigned int)w;

        return __sw_hweight32(upper) + __sw_hweight32(lower);
#elif BITS_PER_LONG == 64
        __u64 sum = w - ((w >> 1) & 0x5555555555555555ul);

        sum = (sum & 0x3333333333333333ul) + ((sum >> 2) & 0x3333333333333333ul);
        sum = (sum + (sum >> 4)) & 0x0f0f0f0f0f0f0f0ful;
#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER
        /* The multiply accumulates all eight byte counts in the top byte. */
        return (sum * 0x0101010101010101ul) >> 56;
#else
        sum += sum >> 8;
        sum += sum >> 16;
        sum += sum >> 32;
        return sum & 0x00000000000000fful;
#endif
#endif
}
EXPORT_SYMBOL(__sw_hweight64);






































































































































































































   32 




































   18 












    8 



























    8 



    8 







    8 






















































































































































   33 















   33 












   32 

   33 


   32 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
// SPDX-License-Identifier: GPL-2.0-only
/*
 * common LSM auditing functions
 *
 * Based on code written for SELinux by :
 *                        Stephen Smalley
 *                         James Morris <jmorris@redhat.com>
 * Author : Etienne Basset, <etienne.basset@ensta.org>
 */

#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <net/sock.h>
#include <linux/un.h>
#include <net/af_unix.h>
#include <linux/audit.h>
#include <linux/ipv6.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/dccp.h>
#include <linux/sctp.h>
#include <linux/lsm_audit.h>
#include <linux/security.h>

/**
 * ipv4_skb_to_auditdata : fill auditdata from skb
 * @skb : the skb
 * @ad : the audit data to fill
 * @proto : the layer 4 protocol
 *
 * return  0 on success
 */
int ipv4_skb_to_auditdata(struct sk_buff *skb,
                struct common_audit_data *ad, u8 *proto)
{
        struct iphdr *iph = ip_hdr(skb);
        __be16 src_port, dst_port;

        /* The addresses are always available from the IP header. */
        ad->u.net->v4info.saddr = iph->saddr;
        ad->u.net->v4info.daddr = iph->daddr;

        if (proto)
                *proto = iph->protocol;

        /* A non-initial fragment is not inspected for ports. */
        if (ntohs(iph->frag_off) & IP_OFFSET)
                return 0;

        /* Pick the port pair out of the layer 4 header. */
        switch (iph->protocol) {
        case IPPROTO_TCP: {
                struct tcphdr *tcp = tcp_hdr(skb);

                src_port = tcp->source;
                dst_port = tcp->dest;
                break;
        }
        case IPPROTO_UDP: {
                struct udphdr *udp = udp_hdr(skb);

                src_port = udp->source;
                dst_port = udp->dest;
                break;
        }
        case IPPROTO_DCCP: {
                struct dccp_hdr *dccp = dccp_hdr(skb);

                src_port = dccp->dccph_sport;
                dst_port = dccp->dccph_dport;
                break;
        }
        case IPPROTO_SCTP: {
                struct sctphdr *sctp = sctp_hdr(skb);

                src_port = sctp->source;
                dst_port = sctp->dest;
                break;
        }
        default:
                /* Unhandled protocol: ports are left untouched. */
                return -EINVAL;
        }

        ad->u.net->sport = src_port;
        ad->u.net->dport = dst_port;
        return 0;
}
#if IS_ENABLED(CONFIG_IPV6)
/**
 * ipv6_skb_to_auditdata : fill auditdata from skb
 * @skb : the skb
 * @ad : the audit data to fill
 * @proto : the layer 4 protocol
 *
 * return  0 on success
 */
int ipv6_skb_to_auditdata(struct sk_buff *skb,
                struct common_audit_data *ad, u8 *proto)
{
        struct ipv6hdr *hdr6 = ipv6_hdr(skb);
        __be16 frag_off;
        u8 l4proto;
        int l4off;

        ad->u.net->v6info.saddr = hdr6->saddr;
        ad->u.net->v6info.daddr = hdr6->daddr;

        /*
         * Extension headers may sit between the IPv6 header and the
         * transport header; skip over them.
         */
        l4off = skb_network_offset(skb);
        l4off += sizeof(*hdr6);
        l4proto = hdr6->nexthdr;
        l4off = ipv6_skip_exthdr(skb, l4off, &l4proto, &frag_off);
        if (l4off < 0)
                return 0;

        if (proto)
                *proto = l4proto;

        /*
         * For the handled protocols a transport header that cannot be
         * obtained is not an error: the ports are simply left untouched.
         */
        switch (l4proto) {
        case IPPROTO_TCP: {
                struct tcphdr tcp_buf, *tcp;

                tcp = skb_header_pointer(skb, l4off, sizeof(tcp_buf), &tcp_buf);
                if (tcp != NULL) {
                        ad->u.net->sport = tcp->source;
                        ad->u.net->dport = tcp->dest;
                }
                break;
        }
        case IPPROTO_UDP: {
                struct udphdr udp_buf, *udp;

                udp = skb_header_pointer(skb, l4off, sizeof(udp_buf), &udp_buf);
                if (udp != NULL) {
                        ad->u.net->sport = udp->source;
                        ad->u.net->dport = udp->dest;
                }
                break;
        }
        case IPPROTO_DCCP: {
                struct dccp_hdr dccp_buf, *dccp;

                dccp = skb_header_pointer(skb, l4off, sizeof(dccp_buf), &dccp_buf);
                if (dccp != NULL) {
                        ad->u.net->sport = dccp->dccph_sport;
                        ad->u.net->dport = dccp->dccph_dport;
                }
                break;
        }
        case IPPROTO_SCTP: {
                struct sctphdr sctp_buf, *sctp;

                sctp = skb_header_pointer(skb, l4off, sizeof(sctp_buf), &sctp_buf);
                if (sctp != NULL) {
                        ad->u.net->sport = sctp->source;
                        ad->u.net->dport = sctp->dest;
                }
                break;
        }
        default:
                return -EINVAL;
        }

        return 0;
}
#endif


/*
 * Append " <name1>=<IPv6 address>" to @ab unless @addr is the unspecified
 * address, and " <name2>=<port>" (host byte order) unless @port is zero.
 */
static inline void print_ipv6_addr(struct audit_buffer *ab,
                                   const struct in6_addr *addr, __be16 port,
                                   const char *name1, const char *name2)
{
        const bool addr_is_set = !ipv6_addr_any(addr);

        if (addr_is_set)
                audit_log_format(ab, " %s=%pI6c", name1, addr);

        if (port != 0)
                audit_log_format(ab, " %s=%d", name2, ntohs(port));
}

/*
 * Append " <name1>=<IPv4 address>" to @ab unless @addr is zero, and
 * " <name2>=<port>" (host byte order) unless @port is zero.
 */
static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr,
                                   __be16 port, const char *name1, const char *name2)
{
        if (addr != 0)
                audit_log_format(ab, " %s=%pI4", name1, &addr);

        if (port != 0)
                audit_log_format(ab, " %s=%d", name2, ntohs(port));
}

/**
 * audit_log_lsm_data - helper to log common LSM audit data
 * @ab : the audit buffer
 * @a : common audit data
 */
void audit_log_lsm_data(struct audit_buffer *ab,
                        const struct common_audit_data *a)
{
        /*
         * To keep stack sizes in check force programmers to notice if they
         * start making this union too large!  See struct lsm_network_audit
         * as an example of how to deal with large data.
         */
        BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2);

        switch (a->type) {
        case LSM_AUDIT_DATA_NONE:
                return;
        case LSM_AUDIT_DATA_IPC:
                audit_log_format(ab, " ipc_key=%d ", a->u.ipc_id);
                break;
        case LSM_AUDIT_DATA_CAP:
                audit_log_format(ab, " capability=%d ", a->u.cap);
                break;
        case LSM_AUDIT_DATA_PATH: {
                struct inode *inode;

                audit_log_d_path(ab, " path=", &a->u.path);

                inode = d_backing_inode(a->u.path.dentry);
                if (inode) {
                        audit_log_format(ab, " dev=");
                        audit_log_untrustedstring(ab, inode->i_sb->s_id);
                        audit_log_format(ab, " ino=%lu", inode->i_ino);
                }
                break;
        }
        case LSM_AUDIT_DATA_FILE: {
                struct inode *inode;

                audit_log_d_path(ab, " path=", &a->u.file->f_path);

                inode = file_inode(a->u.file);
                if (inode) {
                        audit_log_format(ab, " dev=");
                        audit_log_untrustedstring(ab, inode->i_sb->s_id);
                        audit_log_format(ab, " ino=%lu", inode->i_ino);
                }
                break;
        }
        case LSM_AUDIT_DATA_IOCTL_OP: {
                struct inode *inode;

                audit_log_d_path(ab, " path=", &a->u.op->path);

                inode = a->u.op->path.dentry->d_inode;
                if (inode) {
                        audit_log_format(ab, " dev=");
                        audit_log_untrustedstring(ab, inode->i_sb->s_id);
                        audit_log_format(ab, " ino=%lu", inode->i_ino);
                }

                audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd);
                break;
        }
        case LSM_AUDIT_DATA_DENTRY: {
                struct inode *inode;

                audit_log_format(ab, " name=");
                spin_lock(&a->u.dentry->d_lock);
                audit_log_untrustedstring(ab, a->u.dentry->d_name.name);
                spin_unlock(&a->u.dentry->d_lock);

                inode = d_backing_inode(a->u.dentry);
                if (inode) {
                        audit_log_format(ab, " dev=");
                        audit_log_untrustedstring(ab, inode->i_sb->s_id);
                        audit_log_format(ab, " ino=%lu", inode->i_ino);
                }
                break;
        }
        case LSM_AUDIT_DATA_INODE: {
                struct dentry *dentry;
                struct inode *inode;

                rcu_read_lock();
                inode = a->u.inode;
                dentry = d_find_alias_rcu(inode);
                if (dentry) {
                        audit_log_format(ab, " name=");
                        spin_lock(&dentry->d_lock);
                        audit_log_untrustedstring(ab, dentry->d_name.name);
                        spin_unlock(&dentry->d_lock);
                }
                audit_log_format(ab, " dev=");
                audit_log_untrustedstring(ab, inode->i_sb->s_id);
                audit_log_format(ab, " ino=%lu", inode->i_ino);
                rcu_read_unlock();
                break;
        }
        case LSM_AUDIT_DATA_TASK: {
                struct task_struct *tsk = a->u.tsk;
                if (tsk) {
                        pid_t pid = task_tgid_nr(tsk);
                        if (pid) {
                                char tskcomm[sizeof(tsk->comm)];
                                audit_log_format(ab, " opid=%d ocomm=", pid);
                                audit_log_untrustedstring(ab,
                                    get_task_comm(tskcomm, tsk));
                        }
                }
                break;
        }
        case LSM_AUDIT_DATA_NET:
                if (a->u.net->sk) {
                        const struct sock *sk = a->u.net->sk;
                        const struct unix_sock *u;
                        struct unix_address *addr;
                        int len = 0;
                        char *p = NULL;

                        switch (sk->sk_family) {
                        case AF_INET: {
                                const struct inet_sock *inet = inet_sk(sk);

                                print_ipv4_addr(ab, inet->inet_rcv_saddr,
                                                inet->inet_sport,
                                                "laddr", "lport");
                                print_ipv4_addr(ab, inet->inet_daddr,
                                                inet->inet_dport,
                                                "faddr", "fport");
                                break;
                        }
#if IS_ENABLED(CONFIG_IPV6)
                        case AF_INET6: {
                                const struct inet_sock *inet = inet_sk(sk);

                                print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr,
                                                inet->inet_sport,
                                                "laddr", "lport");
                                print_ipv6_addr(ab, &sk->sk_v6_daddr,
                                                inet->inet_dport,
                                                "faddr", "fport");
                                break;
                        }
#endif
                        case AF_UNIX:
                                u = unix_sk(sk);
                                addr = smp_load_acquire(&u->addr);
                                if (!addr)
                                        break;
                                if (u->path.dentry) {
                                        audit_log_d_path(ab, " path=", &u->path);
                                        break;
                                }
                                len = addr->len-sizeof(short);
                                p = &addr->name->sun_path[0];
                                audit_log_format(ab, " path=");
                                if (*p)
                                        audit_log_untrustedstring(ab, p);
                                else
                                        audit_log_n_hex(ab, p, len);
                                break;
                        }
                }

                switch (a->u.net->family) {
                case AF_INET:
                        print_ipv4_addr(ab, a->u.net->v4info.saddr,
                                        a->u.net->sport,
                                        "saddr", "src");
                        print_ipv4_addr(ab, a->u.net->v4info.daddr,
                                        a->u.net->dport,
                                        "daddr", "dest");
                        break;
                case AF_INET6:
                        print_ipv6_addr(ab, &a->u.net->v6info.saddr,
                                        a->u.net->sport,
                                        "saddr", "src");
                        print_ipv6_addr(ab, &a->u.net->v6info.daddr,
                                        a->u.net->dport,
                                        "daddr", "dest");
                        break;
                }
                if (a->u.net->netif > 0) {
                        struct net_device *dev;

                        /* NOTE: we always use init's namespace */
                        dev = dev_get_by_index(&init_net, a->u.net->netif);
                        if (dev) {
                                audit_log_format(ab, " netif=%s", dev->name);
                                dev_put(dev);
                        }
                }
                break;
#ifdef CONFIG_KEYS
        case LSM_AUDIT_DATA_KEY:
                audit_log_format(ab, " key_serial=%u", a->u.key_struct.key);
                if (a->u.key_struct.key_desc) {
                        audit_log_format(ab, " key_desc=");
                        audit_log_untrustedstring(ab, a->u.key_struct.key_desc);
                }
                break;
#endif
        case LSM_AUDIT_DATA_KMOD:
                audit_log_format(ab, " kmod=");
                audit_log_untrustedstring(ab, a->u.kmod_name);
                break;
        case LSM_AUDIT_DATA_IBPKEY: {
                struct in6_addr sbn_pfx;

                memset(&sbn_pfx.s6_addr, 0,
                       sizeof(sbn_pfx.s6_addr));
                memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix,
                       sizeof(a->u.ibpkey->subnet_prefix));
                audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c",
                                 a->u.ibpkey->pkey, &sbn_pfx);
                break;
        }
        case LSM_AUDIT_DATA_IBENDPORT:
                audit_log_format(ab, " device=%s port_num=%u",
                                 a->u.ibendport->dev_name,
                                 a->u.ibendport->port);
                break;
        case LSM_AUDIT_DATA_LOCKDOWN:
                audit_log_format(ab, " lockdown_reason=\"%s\"",
                                 lockdown_reasons[a->u.reason]);
                break;
        case LSM_AUDIT_DATA_ANONINODE:
                audit_log_format(ab, " anonclass=%s", a->u.anonclass);
                break;
        case LSM_AUDIT_DATA_NLMSGTYPE:
                audit_log_format(ab, " nl-msgtype=%hu", a->u.nlmsg_type);
                break;
        } /* switch (a->type) */
}

/**
 * dump_common_audit_data - log the fields shared by every LSM audit record
 * @ab: the audit buffer being filled in
 * @a: common audit data describing the event
 *
 * Emits " pid=" with task_tgid_nr(current) and " comm=" with the command
 * name of the current task, then calls audit_log_lsm_data() to append the
 * fields selected by @a.
 */
static void dump_common_audit_data(struct audit_buffer *ab,
                                   const struct common_audit_data *a)
{
        char name[sizeof(current->comm)];
        const char *task_name;

        audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));

        /* Snapshot the command name into a local buffer before logging it. */
        task_name = get_task_comm(name, current);
        audit_log_untrustedstring(ab, task_name);

        audit_log_lsm_data(ab, a);
}

/**
 * common_lsm_audit - generic LSM auditing function
 * @a: auxiliary audit data; nothing is logged when this is NULL
 * @pre_audit: optional LSM-specific callback run before the common fields
 * @post_audit: optional LSM-specific callback run after the common fields
 *
 * Starts an AUDIT_AVC record, lets @pre_audit add its fields, dumps the
 * common security information, lets @post_audit add its fields and then
 * ends the record. Returns silently if no audit buffer could be started.
 */
void common_lsm_audit(struct common_audit_data *a,
        void (*pre_audit)(struct audit_buffer *, void *),
        void (*post_audit)(struct audit_buffer *, void *))
{
        struct audit_buffer *buf;

        if (!a)
                return;

        /* GFP_ATOMIC is used so that starting the record cannot sleep */
        buf = audit_log_start(audit_context(), GFP_ATOMIC | __GFP_NOWARN,
                              AUDIT_AVC);
        if (!buf)
                return;

        if (pre_audit != NULL)
                pre_audit(buf, a);
        dump_common_audit_data(buf, a);
        if (post_audit != NULL)
                post_audit(buf, a);

        audit_log_end(buf);
}






































































































































   99 














































  668 




  667 



  329 





















  551 

















  156 




  156 



   16 




  156 






































  566 






  567 





















   26 




  191 

















  191 






























  567 
















  567 





  742 





  351 



  726 



























  561 




  739 

















  674 




   57 




   57 















   57 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Variant of atomic_t specialized for reference counts.
 *
 * The interface matches the atomic_t interface (to aid in porting) but only
 * provides the few functions one should use for reference counting.
 *
 * Saturation semantics
 * ====================
 *
 * refcount_t differs from atomic_t in that the counter saturates at
 * REFCOUNT_SATURATED and will not move once there. This avoids wrapping the
 * counter and causing 'spurious' use-after-free issues. In order to avoid the
 * cost associated with introducing cmpxchg() loops into all of the saturating
 * operations, we temporarily allow the counter to take on an unchecked value
 * and then explicitly set it to REFCOUNT_SATURATED on detecting that underflow
 * or overflow has occurred. Although this is racy when multiple threads
 * access the refcount concurrently, by placing REFCOUNT_SATURATED roughly
 * equidistant from 0 and INT_MAX we minimise the scope for error:
 *
 *                                    INT_MAX     REFCOUNT_SATURATED   UINT_MAX
 *   0                          (0x7fff_ffff)    (0xc000_0000)    (0xffff_ffff)
 *   +--------------------------------+----------------+----------------+
 *                                     <---------- bad value! ---------->
 *
 * (in a signed view of the world, the "bad value" range corresponds to
 * a negative counter value).
 *
 * As an example, consider a refcount_inc() operation that causes the counter
 * to overflow:
 *
 *         int old = atomic_fetch_add_relaxed(r);
 *        // old is INT_MAX, refcount now INT_MIN (0x8000_0000)
 *        if (old < 0)
 *                atomic_set(r, REFCOUNT_SATURATED);
 *
 * If another thread also performs a refcount_inc() operation between the two
 * atomic operations, then the count will continue to edge closer to 0. If it
 * reaches a value of 1 before /any/ of the threads reset it to the saturated
 * value, then a concurrent refcount_dec_and_test() may erroneously free the
 * underlying object.
 * Linux limits the maximum number of tasks to PID_MAX_LIMIT, which is currently
 * 0x400000 (and can't easily be raised in the future beyond FUTEX_TID_MASK).
 * With the current PID limit, if no batched refcounting operations are used and
 * the attacker can't repeatedly trigger kernel oopses in the middle of refcount
 * operations, this makes it impossible for a saturated refcount to leave the
 * saturation range, even if it is possible for multiple uses of the same
 * refcount to nest in the context of a single task:
 *
 *     (UINT_MAX+1-REFCOUNT_SATURATED) / PID_MAX_LIMIT =
 *     0x40000000 / 0x400000 = 0x100 = 256
 *
 * If hundreds of references are added/removed with a single refcounting
 * operation, it may potentially be possible to leave the saturation range; but
 * given the precise timing details involved with the round-robin scheduling of
 * each thread manipulating the refcount and the need to hit the race multiple
 * times in succession, there doesn't appear to be a practical avenue of attack
 * even if using refcount_add() operations with larger increments.
 *
 * Memory ordering
 * ===============
 *
 * Memory ordering rules are slightly relaxed wrt regular atomic_t functions
 * and provide only what is strictly required for refcounts.
 *
 * The increments are fully relaxed; these will not provide ordering. The
 * rationale is that whatever is used to obtain the object we're increasing the
 * reference count on will provide the ordering. For locked data structures,
 * it's the lock acquire; for RCU/lockless data structures it's the dependent
 * load.
 *
 * Do note that inc_not_zero() provides a control dependency which will order
 * future stores against the inc, this ensures we'll never modify the object
 * if we did not in fact acquire a reference.
 *
 * The decrements will provide release order, such that all the prior loads and
 * stores will be issued before, it also provides a control dependency, which
 * will order us against the subsequent free().
 *
 * The control dependency is against the load of the cmpxchg (ll/sc) that
 * succeeded. This means the stores aren't fully ordered, but this is fine
 * because the 1->0 transition indicates no concurrency.
 *
 * Note that the allocator is responsible for ordering things between free()
 * and alloc().
 *
 * The decrements dec_and_test() and sub_and_test() also provide acquire
 * ordering on success.
 *
 * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide
 * acquire and release ordering for cases when the memory occupied by the
 * object might be reused to store another object. This is important for the
 * cases where secondary validation is required to detect such reuse, e.g.
 * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after
 * the refcount is taken, hence acquire order is necessary. Similarly, when the
 * object is initialized, all stores to its attributes should be visible before
 * the refcount is set, otherwise a stale attribute value might be used by
 * another task which succeeds in taking a refcount to the new object.
 */

#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/compiler.h>
#include <linux/limits.h>
#include <linux/refcount_types.h>
#include <linux/spinlock_types.h>

struct mutex;

/* Static initializer for a refcount_t whose counter starts at @n. */
#define REFCOUNT_INIT(n)        { .refs = ATOMIC_INIT(n), }
/* INT_MAX; values above it fall in the "bad value" range described at the top of this file. */
#define REFCOUNT_MAX                INT_MAX
/* Value the counter is set to once underflow or overflow has been detected. */
#define REFCOUNT_SATURATED        (INT_MIN / 2)

/*
 * Reason codes handed to refcount_warn_saturate() by the inline helpers in
 * this header.
 */
enum refcount_saturation_type {
        REFCOUNT_ADD_NOT_ZERO_OVF,        /* add-unless-zero: old or new value negative */
        REFCOUNT_ADD_OVF,                /* add: old or new value negative */
        REFCOUNT_ADD_UAF,                /* add: old value was 0 */
        REFCOUNT_SUB_UAF,                /* sub-and-test: old value <= 0 or result negative */
        REFCOUNT_DEC_LEAK,                /* dec: old value was <= 1 */
};

/*
 * Out-of-line slow path called when one of the helpers below detects a bad
 * counter value; @t says which check fired. Defined outside this header.
 */
void refcount_warn_saturate(refcount_t *r, enum refcount_saturation_type t);

/**
 * refcount_set - store a new value in a refcount
 * @r: the refcount to modify
 * @n: the value to store
 *
 * Thin wrapper around atomic_set() on the embedded counter. Use
 * refcount_set_release() when release ordering is needed.
 */
static inline void refcount_set(refcount_t *r, int n)
{
        atomic_set(&(r->refs), n);
}

/**
 * refcount_set_release - store a new value in a refcount with release ordering
 * @r: the refcount to modify
 * @n: the value to store
 *
 * Intended for objects whose memory may be reused to hold another object,
 * e.g. with SLAB_TYPESAFE_BY_RCU.
 *
 * The store has release semantics: memory operations issued before it are
 * ordered against it. All updates to the object are therefore visible once
 * the refcount is set, and stale values left by the object that previously
 * occupied this memory have been overwritten.
 *
 * Call this only after the new object is fully initialized. From this point
 * on the object must be treated as visible to other tasks, even if it has
 * not yet been added to the collection normally used to find it: a task that
 * discovered the previous occupant of this memory may, after the reuse,
 * succeed in taking a reference on the new object and start using it.
 */
static inline void refcount_set_release(refcount_t *r, int n)
{
        atomic_set_release(&(r->refs), n);
}

/**
 * refcount_read - read the current value of a refcount
 * @r: the refcount to read
 *
 * Return: the value obtained with atomic_read(), as an unsigned int
 */
static inline unsigned int refcount_read(const refcount_t *r)
{
        return (unsigned int)atomic_read(&r->refs);
}

/**
 * __refcount_add_not_zero - add to a refcount unless it is 0, reporting the old value
 * @i: the value to add to the refcount
 * @r: the refcount
 * @oldp: if non-NULL, receives the counter value seen before the addition
 *
 * Uses a relaxed compare-and-exchange loop. Calls refcount_warn_saturate()
 * with REFCOUNT_ADD_NOT_ZERO_OVF when either the old value or old + @i is
 * negative.
 *
 * Return: false if the refcount was 0 (nothing is added), true otherwise
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero(int i, refcount_t *r, int *oldp)
{
        int old = refcount_read(r);

        /*
         * Retry until the exchange succeeds or the counter is seen to be 0.
         * NOTE(review): the re-test of 'old' relies on
         * atomic_try_cmpxchg_relaxed() storing the current counter value in
         * 'old' when it fails -- confirm against the atomic API docs.
         */
        do {
                if (!old)
                        break;
        } while (!atomic_try_cmpxchg_relaxed(&r->refs, &old, old + i));

        if (oldp)
                *oldp = old;

        /* A negative old or new value lies in the "bad value" range. */
        if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        /* Converted to bool: true for any non-zero old value. */
        return old;
}

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Saturates at REFCOUNT_SATURATED and WARNs.
 *
 * No memory ordering is provided; the caller is assumed to have guaranteed
 * that the object memory is stable (RCU, etc.). A control dependency is
 * provided, which orders future stores. See the comment at the top of this
 * file.
 *
 * Not recommended for ordinary reference counting, where references are
 * taken and released one at a time; use refcount_inc() or one of its
 * variants for that.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        bool added = __refcount_add_not_zero(i, r, NULL);

        return added;
}

/**
 * __refcount_add_not_zero_limited_acquire - bounded add-unless-zero with acquire ordering
 * @i: the value to add to the refcount
 * @r: the refcount
 * @oldp: if non-NULL, receives the counter value seen before the addition
 * @limit: the addition is refused when @i > @limit - old
 *
 * Uses an acquire compare-and-exchange loop. When the limit check refuses the
 * addition the counter is left unmodified, @oldp (if set) still receives the
 * observed value, and no warning is issued. Otherwise
 * refcount_warn_saturate() is called with REFCOUNT_ADD_NOT_ZERO_OVF when the
 * old value or old + @i is negative.
 *
 * Return: false if the refcount was 0 or the limit check refused the
 * addition, true otherwise
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp,
                                             int limit)
{
        int old = refcount_read(r);

        /*
         * Both checks are repeated on every retry.
         * NOTE(review): this relies on atomic_try_cmpxchg_acquire() storing
         * the current counter value in 'old' when it fails -- confirm against
         * the atomic API docs.
         */
        do {
                if (!old)
                        break;

                /* Refuse the addition; report what was observed. */
                if (i > limit - old) {
                        if (oldp)
                                *oldp = old;
                        return false;
                }
        } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i));

        if (oldp)
                *oldp = old;

        /* A negative old or new value lies in the "bad value" range. */
        if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF);

        /* Converted to bool: true for any non-zero old value. */
        return old;
}

/*
 * Increment-by-one form of __refcount_add_not_zero_limited_acquire(); @oldp
 * and @limit are passed through unchanged.
 */
static inline __must_check bool
__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit)
{
        bool taken = __refcount_add_not_zero_limited_acquire(1, r, oldp, limit);

        return taken;
}

/*
 * Add-unless-zero with acquire ordering: the limited variant called with a
 * limit of INT_MAX. @oldp, if non-NULL, receives the previous counter value.
 */
static inline __must_check __signed_wrap
bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp)
{
        bool added = __refcount_add_not_zero_limited_acquire(i, r, oldp,
                                                             INT_MAX);

        return added;
}

/**
 * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0
 *
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Saturates at REFCOUNT_SATURATED and WARNs.
 *
 * Intended for objects whose memory may be reused to hold another object,
 * e.g. with SLAB_TYPESAFE_BY_RCU.
 *
 * Acquire memory ordering is provided on success; the caller is assumed to
 * have guaranteed that the object memory is stable (RCU, etc.). A control
 * dependency is provided as well, which orders future stores. See the
 * comment at the top of this file.
 *
 * Not recommended for ordinary reference counting, where references are
 * taken and released one at a time; use refcount_inc_not_zero_acquire() for
 * that.
 *
 * Return: false if the passed refcount is 0, true otherwise
 */
static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        bool added = __refcount_add_not_zero_acquire(i, r, NULL);

        return added;
}

/**
 * __refcount_add - add a value to a refcount, reporting the old value
 * @i: the value to add to the refcount
 * @r: the refcount
 * @oldp: if non-NULL, receives the counter value seen before the addition
 *
 * The addition is an unconditional relaxed fetch-and-add; the checks run on
 * the value it returns. refcount_warn_saturate() is called with
 * REFCOUNT_ADD_UAF when the old value was 0, or with REFCOUNT_ADD_OVF when
 * the old value or old + @i is negative.
 */
static inline __signed_wrap
void __refcount_add(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_add_relaxed(i, &r->refs);

        if (oldp)
                *oldp = old;

        /* Adding to a count of 0 is reported as a possible use-after-free. */
        if (unlikely(!old))
                refcount_warn_saturate(r, REFCOUNT_ADD_UAF);
        else if (unlikely(old < 0 || old + i < 0))
                refcount_warn_saturate(r, REFCOUNT_ADD_OVF);
}

/**
 * refcount_add - add a value to a refcount
 * @i: the value to add to the refcount
 * @r: the refcount
 *
 * Like atomic_add(), except that it saturates at REFCOUNT_SATURATED and
 * WARNs.
 *
 * No memory ordering is provided; the caller is assumed to have guaranteed
 * that the object memory is stable (RCU, etc.). A control dependency is
 * provided, which orders future stores. See the comment at the top of this
 * file.
 *
 * Not recommended for ordinary reference counting, where references are
 * taken and released one at a time; use refcount_inc() or one of its
 * variants for that.
 */
static inline void refcount_add(int i, refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        int *const no_old = NULL;

        __refcount_add(i, r, no_old);
}

/*
 * Increment-by-one form of __refcount_add_not_zero(); @oldp, if non-NULL,
 * receives the previous counter value.
 */
static inline __must_check bool __refcount_inc_not_zero(refcount_t *r, int *oldp)
{
        bool taken = __refcount_add_not_zero(1, r, oldp);

        return taken;
}

/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount to increment
 *
 * Like atomic_inc_not_zero(), except that it saturates at
 * REFCOUNT_SATURATED and WARNs.
 *
 * No memory ordering is provided; the caller is assumed to have guaranteed
 * that the object memory is stable (RCU, etc.). A control dependency is
 * provided, which orders future stores. See the comment at the top of this
 * file.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        bool taken = __refcount_inc_not_zero(r, NULL);

        return taken;
}

/*
 * Increment-by-one form of __refcount_add_not_zero_acquire(); @oldp, if
 * non-NULL, receives the previous counter value.
 */
static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp)
{
        bool taken = __refcount_add_not_zero_acquire(1, r, oldp);

        return taken;
}

/**
 * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0
 * @r: the refcount to increment
 *
 * Behaves like refcount_inc_not_zero(), with acquire memory ordering added
 * on success.
 *
 * Intended for objects whose memory may be reused to hold another object,
 * e.g. with SLAB_TYPESAFE_BY_RCU.
 *
 * The caller is assumed to have guaranteed that the object memory is stable
 * (RCU, etc.). A control dependency is provided as well, which orders future
 * stores. See the comment at the top of this file.
 *
 * Return: true if the increment was successful, false otherwise
 */
static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        bool taken = __refcount_inc_not_zero_acquire(r, NULL);

        return taken;
}

/*
 * Increment-by-one form of __refcount_add(); @oldp, if non-NULL, receives
 * the previous counter value.
 */
static inline void __refcount_inc(refcount_t *r, int *oldp)
{
        const int step = 1;

        __refcount_add(step, r, oldp);
}

/**
 * refcount_inc - increment a refcount
 * @r: the refcount to increment
 *
 * Like atomic_inc(), except that it saturates at REFCOUNT_SATURATED and
 * WARNs.
 *
 * No memory ordering is provided; the caller is assumed to already hold a
 * reference on the object.
 *
 * WARNs if the refcount is 0, since that indicates a possible use-after-free.
 */
static inline void refcount_inc(refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        int *const no_old = NULL;

        __refcount_inc(r, no_old);
}

/**
 * __refcount_sub_and_test - subtract from a refcount and test for 0, reporting the old value
 * @i: amount to subtract from the refcount
 * @r: the refcount
 * @oldp: if non-NULL, receives the counter value seen before the subtraction
 *
 * The subtraction is an unconditional fetch-and-sub with release ordering.
 * When the old value was positive and equal to @i (the count dropped to 0),
 * acquire ordering is added before returning true. Otherwise
 * refcount_warn_saturate() is called with REFCOUNT_SUB_UAF if the old value
 * was <= 0 or old - @i is negative.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check __signed_wrap
bool __refcount_sub_and_test(int i, refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(i, &r->refs);

        if (oldp)
                *oldp = old;

        /* Count dropped to 0: upgrade to acquire ordering on this path. */
        if (old > 0 && old == i) {
                smp_acquire__after_ctrl_dep();
                return true;
        }

        /* Old value or result lies outside the valid range. */
        if (unlikely(old <= 0 || old - i < 0))
                refcount_warn_saturate(r, REFCOUNT_SUB_UAF);

        return false;
}

/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: amount to subtract from the refcount
 * @r: the refcount
 *
 * Like atomic_dec_and_test(), except that on underflow it WARNs, returns
 * false and ultimately leaks, and that it fails to decrement once saturated
 * at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the subtraction; on success acquire ordering is provided so that
 * free() must come after.
 *
 * Not recommended for ordinary reference counting, where references are
 * taken and released one at a time; use refcount_dec() or one of its
 * variants for that.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_sub_and_test(int i, refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        bool hit_zero = __refcount_sub_and_test(i, r, NULL);

        return hit_zero;
}

/*
 * Decrement-by-one form of __refcount_sub_and_test(); @oldp, if non-NULL,
 * receives the previous counter value.
 */
static inline __must_check bool __refcount_dec_and_test(refcount_t *r, int *oldp)
{
        bool hit_zero = __refcount_sub_and_test(1, r, oldp);

        return hit_zero;
}

/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Like atomic_dec_and_test(), except that it WARNs on underflow and fails to
 * decrement once saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the decrement; on success acquire ordering is provided so that
 * free() must come after.
 *
 * Return: true if the resulting refcount is 0, false otherwise
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        bool hit_zero = __refcount_dec_and_test(r, NULL);

        return hit_zero;
}

/**
 * __refcount_dec - decrement a refcount, reporting the old value
 * @r: the refcount
 * @oldp: if non-NULL, receives the counter value seen before the decrement
 *
 * The decrement is an unconditional fetch-and-sub of 1 with release
 * ordering. refcount_warn_saturate() is called with REFCOUNT_DEC_LEAK when
 * the old value was <= 1, i.e. when the decrement reached 0 or went below it.
 */
static inline void __refcount_dec(refcount_t *r, int *oldp)
{
        int old = atomic_fetch_sub_release(1, &r->refs);

        if (oldp)
                *oldp = old;

        /* This helper has no way to report reaching 0 to its caller. */
        if (unlikely(old <= 1))
                refcount_warn_saturate(r, REFCOUNT_DEC_LEAK);
}

/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Like atomic_dec(), except that it WARNs on underflow and fails to
 * decrement once saturated at REFCOUNT_SATURATED.
 *
 * Release memory ordering is provided, so prior loads and stores are done
 * before the decrement.
 */
static inline void refcount_dec(refcount_t *r)
{
        /* The caller has no use for the previous counter value. */
        int *const no_old = NULL;

        __refcount_dec(r, no_old);
}

/*
 * Out-of-line helpers: declared here, defined outside this header. All
 * return values must be checked (__must_check). The *_lock variants are
 * annotated __cond_acquires(lock).
 * NOTE(review): the precise semantics of each helper are not visible in this
 * header -- see their definitions.
 */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) __cond_acquires(lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
                                                       spinlock_t *lock,
                                                       unsigned long *flags) __cond_acquires(lock);
#endif /* _LINUX_REFCOUNT_H */

































































































  196 



   34 
   34 













  124 




 1257 




   16 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
/* SPDX-License-Identifier: GPL-2.0 */
/* thread_info.h: common low-level thread information accessors
 *
 * Copyright (C) 2002  David Howells (dhowells@redhat.com)
 * - Incorporating suggestions made by Linus Torvalds
 */

#ifndef _LINUX_THREAD_INFO_H
#define _LINUX_THREAD_INFO_H

#include <linux/types.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/restart_block.h>
#include <linux/errno.h>

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
#define current_thread_info() ((struct thread_info *)current)
#endif

#include <linux/bitops.h>

/*
 * For per-arch arch_within_stack_frames() implementations, defined in
 * asm/thread_info.h.
 *
 * Only the first two values are explicit; GOOD_FRAME and GOOD_STACK take the
 * next enumerator values (1 and 2).
 */
enum {
        BAD_STACK = -1,
        NOT_STACK = 0,
        GOOD_FRAME,
        GOOD_STACK,
};

#ifdef CONFIG_GENERIC_ENTRY
/*
 * Bit numbers used with the syscall_work word of struct thread_info (see the
 * *_syscall_work() macros later in this file).
 */
enum syscall_work_bit {
        SYSCALL_WORK_BIT_SECCOMP,
        SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT,
        SYSCALL_WORK_BIT_SYSCALL_TRACE,
        SYSCALL_WORK_BIT_SYSCALL_EMU,
        SYSCALL_WORK_BIT_SYSCALL_AUDIT,
        SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH,
        SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP,
};

/* Single-bit masks corresponding to the bit numbers above. */
#define SYSCALL_WORK_SECCOMP                BIT(SYSCALL_WORK_BIT_SECCOMP)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT)
#define SYSCALL_WORK_SYSCALL_TRACE        BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE)
#define SYSCALL_WORK_SYSCALL_EMU        BIT(SYSCALL_WORK_BIT_SYSCALL_EMU)
#define SYSCALL_WORK_SYSCALL_AUDIT        BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT)
#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH)
#define SYSCALL_WORK_SYSCALL_EXIT_TRAP        BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP)
#endif

#include <asm/thread_info.h>

/*
 * Fallback when TIF_NEED_RESCHED_LAZY has not been defined by this point:
 * alias the lazy flag (and its mask) to TIF_NEED_RESCHED. Having
 * CONFIG_ARCH_HAS_PREEMPT_LAZY set without the flag being defined is a
 * build error.
 */
#ifndef TIF_NEED_RESCHED_LAZY
#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
#error Inconsistent PREEMPT_LAZY
#endif
#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
#endif

#ifdef __KERNEL__

/* Default no-op, used when no arch_set_restart_data() macro is already defined. */
#ifndef arch_set_restart_data
#define arch_set_restart_data(restart) do { } while (0)
#endif

/*
 * set_restart_fn - install a restart callback in a restart block
 * @restart: the restart block to update
 * @fn: function stored in @restart->fn
 *
 * Stores @fn, then calls arch_set_restart_data() on @restart.
 *
 * Return: always -ERESTART_RESTARTBLOCK
 */
static inline long set_restart_fn(struct restart_block *restart,
                                        long (*fn)(struct restart_block *))
{
        const long ret = -ERESTART_RESTARTBLOCK;

        restart->fn = fn;
        arch_set_restart_data(restart);

        return ret;
}

/* THREAD_ALIGN defaults to THREAD_SIZE unless it has already been defined. */
#ifndef THREAD_ALIGN
#define THREAD_ALIGN        THREAD_SIZE
#endif

/* GFP flags: GFP_KERNEL_ACCOUNT allocation, zero-filled (__GFP_ZERO). */
#define THREADINFO_GFP                (GFP_KERNEL_ACCOUNT | __GFP_ZERO)

/*
 * flag set/clear/test wrappers
 * - pass TIF_xxxx constants to these functions
 */

/* Set bit @flag in @ti->flags using set_bit(). */
static inline void set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        set_bit(flag, word);
}

/* Clear bit @flag in @ti->flags using clear_bit(). */
static inline void clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        clear_bit(flag, word);
}

/* Set bit @flag in @ti->flags when @value is true, clear it otherwise. */
static inline void update_ti_thread_flag(struct thread_info *ti, int flag,
                                         bool value)
{
        if (!value) {
                clear_ti_thread_flag(ti, flag);
                return;
        }

        set_ti_thread_flag(ti, flag);
}

/*
 * Set bit @flag in @ti->flags and return the result of test_and_set_bit().
 */
static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_and_set_bit(flag, word);
}

/*
 * Clear bit @flag in @ti->flags and return the result of
 * test_and_clear_bit().
 */
static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_and_clear_bit(flag, word);
}

/* Return the result of test_bit() for bit @flag in @ti->flags. */
static inline int test_ti_thread_flag(struct thread_info *ti, int flag)
{
        unsigned long *word = (unsigned long *)&ti->flags;

        return test_bit(flag, word);
}

/*
 * Read the whole flags word of @ti with READ_ONCE().
 *
 * Must stay __always_inline: this may be called from noinstr code, and an
 * out-of-line copy could be instrumented inadvertently.
 */
static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti)
{
        unsigned long snapshot = READ_ONCE(ti->flags);

        return snapshot;
}

/*
 * Convenience forms of the *_ti_thread_flag() helpers and of
 * read_ti_thread_flags() that operate on current_thread_info().
 */
#define set_thread_flag(flag) \
        set_ti_thread_flag(current_thread_info(), flag)
#define clear_thread_flag(flag) \
        clear_ti_thread_flag(current_thread_info(), flag)
#define update_thread_flag(flag, value) \
        update_ti_thread_flag(current_thread_info(), flag, value)
#define test_and_set_thread_flag(flag) \
        test_and_set_ti_thread_flag(current_thread_info(), flag)
#define test_and_clear_thread_flag(flag) \
        test_and_clear_ti_thread_flag(current_thread_info(), flag)
#define test_thread_flag(flag) \
        test_ti_thread_flag(current_thread_info(), flag)
#define read_thread_flags() \
        read_ti_thread_flags(current_thread_info())

/* Same read, but on the thread_info of task @t (via task_thread_info()). */
#define read_task_thread_flags(t) \
        read_ti_thread_flags(task_thread_info(t))

/*
 * Syscall work accessors.  @fl is the bare flag name; the macros paste the
 * prefix on.
 *
 * With CONFIG_GENERIC_ENTRY the bits are SYSCALL_WORK_BIT_<fl> in the
 * thread_info syscall_work word and are accessed with the plain bitops.
 * Otherwise they are TIF_<fl> bits in thread_info flags, accessed through the
 * *_ti_thread_flag() helpers.
 *
 * The first three macros of each set act on current_thread_info(), the
 * *_task_* variants on task_thread_info(t).
 */
#ifdef CONFIG_GENERIC_ENTRY
#define set_syscall_work(fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define test_syscall_work(fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
#define clear_syscall_work(fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)

#define set_task_syscall_work(t, fl) \
        set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define test_task_syscall_work(t, fl) \
        test_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
#define clear_task_syscall_work(t, fl) \
        clear_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)

#else /* CONFIG_GENERIC_ENTRY */

#define set_syscall_work(fl)                                                \
        set_ti_thread_flag(current_thread_info(), TIF_##fl)
#define test_syscall_work(fl) \
        test_ti_thread_flag(current_thread_info(), TIF_##fl)
#define clear_syscall_work(fl) \
        clear_ti_thread_flag(current_thread_info(), TIF_##fl)

#define set_task_syscall_work(t, fl) \
        set_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define test_task_syscall_work(t, fl) \
        test_ti_thread_flag(task_thread_info(t), TIF_##fl)
#define clear_task_syscall_work(t, fl) \
        clear_ti_thread_flag(task_thread_info(t), TIF_##fl)
#endif /* !CONFIG_GENERIC_ENTRY */

/*
 * tif_test_bit() - test bit @bit in the current task's thread_info flags.
 *
 * If the instrumented non-atomic bitops header has been included, the test is
 * done with arch_test_bit(); otherwise plain test_bit() is used.
 * NOTE(review): presumably this sidesteps the instrumented test_bit() wrapper
 * for noinstr callers - confirm against the bitops headers.
 */
#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

static __always_inline bool tif_test_bit(int bit)
{
        unsigned long *flags_word =
                (unsigned long *)(&current_thread_info()->flags);

        return arch_test_bit(bit, flags_word);
}

#else

static __always_inline bool tif_test_bit(int bit)
{
        unsigned long *flags_word =
                (unsigned long *)(&current_thread_info()->flags);

        return test_bit(bit, flags_word);
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */

/* True when TIF_NEED_RESCHED is set in the current task's thread_info flags. */
static __always_inline bool tif_need_resched(void)
{
        const bool resched_pending = tif_test_bit(TIF_NEED_RESCHED);

        return resched_pending;
}

/*
 * Fallback used when CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES is not set: it
 * ignores all of its arguments and always returns 0.
 * NOTE(review): the meaning of the 0 return is defined by the arch
 * implementations and their callers, which are not visible here.
 */
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
                                           const void * const stackend,
                                           const void *obj, unsigned long len)
{
        return 0;
}
#endif

/*
 * Empty default, provided only when arch_setup_new_exec has not already been
 * defined as a macro before this point.
 */
#ifndef arch_setup_new_exec
static inline void arch_setup_new_exec(void) { }
#endif

/*
 * Arch hooks: declarations only, the definitions are not visible in this part
 * of the header.
 * NOTE(review): going by the names these cover task_struct cache setup,
 * release and duplication - confirm against the implementations.
 */
void arch_task_cache_init(void); /* for CONFIG_SH */
void arch_release_task_struct(struct task_struct *tsk);
int arch_dup_task_struct(struct task_struct *dst,
                                struct task_struct *src);

#endif        /* __KERNEL__ */

#endif /* _LINUX_THREAD_INFO_H */



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 


















































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is a module which is used for queueing packets and communicating with
 * userspace via nfnetlink.
 *
 * (C) 2005 by Harald Welte <laforge@netfilter.org>
 * (C) 2007 by Patrick McHardy <kaber@trash.net>
 *
 * Based on the old ipv4-only ip_queue.c:
 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
 * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/proc_fs.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <linux/cgroup-defs.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
#include <net/netns/generic.h>

#include <linux/atomic.h>

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
#include "../bridge/br_private.h"
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif

/* Initial queue_maxlen of a new instance, assigned in instance_create(). */
#define NFQNL_QMAX_DEFAULT 1024

/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
 * includes the header length. Thus, the maximum packet length that we
 * support is 65531 bytes. We send truncated packets if the specified length
 * is larger than that.  Userspace can check for presence of NFQA_CAP_LEN
 * attribute to detect truncation.
 *
 * This is also the initial copy_range of a new instance, assigned in
 * instance_create().
 */
#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)

/*
 * State of one queue.  Instances are created by instance_create(), linked into
 * the per-netns instance_table[] and freed from an RCU callback
 * (instance_destroy_rcu()).
 */
struct nfqnl_instance {
        struct hlist_node hlist;                /* node in an instance_table[] bucket */
        /* used by __instance_destroy() to defer the free through call_rcu() */
        struct rcu_head rcu;

        /* netlink port id handed to instance_create() */
        u32 peer_portid;
        /* starts at NFQNL_QMAX_DEFAULT */
        unsigned int queue_maxlen;
        /* starts at NFQNL_MAX_COPY_RANGE; caps the payload copied in COPY_PACKET mode */
        unsigned int copy_range;
        /* NOTE(review): presumably drop counters; their updates are not visible in this chunk */
        unsigned int queue_dropped;
        unsigned int queue_user_dropped;


        u_int16_t queue_num;                        /* number of this queue */
        /* starts as NFQNL_COPY_NONE */
        u_int8_t copy_mode;
        u_int32_t flags;                        /* Set using NFQA_CFG_FLAGS */
/*
 * Following fields are dirtied for each queued packet,
 * keep them in same cache line if possible.
 */
        /* taken (_bh) by find_dequeue_entry() and nfqnl_flush() around queue_list */
        spinlock_t        lock        ____cacheline_aligned_in_smp;
        /* entry count, maintained by __enqueue_entry()/__dequeue_entry() */
        unsigned int        queue_total;
        unsigned int        id_sequence;                /* 'sequence' of pkt ids */
        struct list_head queue_list;                /* packets in queue */
};

/*
 * Match callback for nfqnl_flush(): receives a queued entry plus the opaque
 * data word given to nfqnl_flush(); a non-zero return selects the entry.
 */
typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);

/* Key passed to net_generic() by nfnl_queue_pernet(). */
static unsigned int nfnl_queue_net_id __read_mostly;

/* Number of hash buckets; instance_hashfn() reduces queue numbers modulo this. */
#define INSTANCE_BUCKETS        16
/* Per-network-namespace state, looked up with nfnl_queue_pernet(). */
struct nfnl_queue_net {
        /* held while instance_create()/instance_destroy() modify instance_table */
        spinlock_t instances_lock;
        /* RCU hlists of nfqnl_instance, indexed by instance_hashfn(queue_num) */
        struct hlist_head instance_table[INSTANCE_BUCKETS];
};

/* Look up this module's per-netns state for @net through net_generic(). */
static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net)
{
        struct nfnl_queue_net *pernet = net_generic(net, nfnl_queue_net_id);

        return pernet;
}

/*
 * Map @queue_num to an instance_table[] index: fold the high byte onto the
 * low bits with XOR, then reduce modulo INSTANCE_BUCKETS.
 */
static inline u_int8_t instance_hashfn(u_int16_t queue_num)
{
        const u_int16_t folded = (queue_num >> 8) ^ queue_num;

        return folded % INSTANCE_BUCKETS;
}

/*
 * Find the instance for @queue_num in @q's hash table.
 *
 * Walks the bucket with the RCU list iterator; instance_create() calls this
 * with instances_lock held.  Returns the instance, or NULL if none matches.
 */
static struct nfqnl_instance *
instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
        struct hlist_head *bucket;
        struct nfqnl_instance *inst;

        bucket = &q->instance_table[instance_hashfn(queue_num)];

        hlist_for_each_entry_rcu(inst, bucket, hlist) {
                if (inst->queue_num != queue_num)
                        continue;

                return inst;
        }

        return NULL;
}

/*
 * Allocate the instance for @queue_num, owned by netlink port @portid, and
 * publish it in @q's hash table.  Everything runs under q->instances_lock.
 *
 * Returns the new instance, or an ERR_PTR():
 *   -EEXIST  the queue number is already present in the table
 *   -ENOMEM  the allocation failed
 *   -EAGAIN  no reference on this module could be taken
 */
static struct nfqnl_instance *
instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
{
        struct nfqnl_instance *inst;
        int err;

        spin_lock(&q->instances_lock);

        if (instance_lookup(q, queue_num)) {
                err = -EEXIST;
                goto unlock;
        }

        /* allocated while holding a spinlock, hence GFP_ATOMIC */
        inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
        if (!inst) {
                err = -ENOMEM;
                goto unlock;
        }

        INIT_LIST_HEAD(&inst->queue_list);
        spin_lock_init(&inst->lock);
        inst->copy_mode = NFQNL_COPY_NONE;
        inst->copy_range = NFQNL_MAX_COPY_RANGE;
        inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
        inst->peer_portid = portid;
        inst->queue_num = queue_num;

        /* reference is dropped again in instance_destroy_rcu() */
        if (!try_module_get(THIS_MODULE)) {
                err = -EAGAIN;
                goto free_inst;
        }

        hlist_add_head_rcu(&inst->hlist,
                           &q->instance_table[instance_hashfn(queue_num)]);

        spin_unlock(&q->instances_lock);

        return inst;

free_inst:
        kfree(inst);
unlock:
        spin_unlock(&q->instances_lock);
        return ERR_PTR(err);
}

static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn,
                        unsigned long data);

/*
 * RCU callback queued by __instance_destroy().
 *
 * Flushes every entry still queued on the instance (a NULL cmpfn matches all
 * entries), frees the instance and drops the module reference taken in
 * instance_create().
 */
static void
instance_destroy_rcu(struct rcu_head *head)
{
        struct nfqnl_instance *inst;

        inst = container_of(head, struct nfqnl_instance, rcu);

        rcu_read_lock();
        nfqnl_flush(inst, NULL, 0);
        rcu_read_unlock();

        kfree(inst);
        module_put(THIS_MODULE);
}

/*
 * Unlink @inst from its hash bucket and schedule instance_destroy_rcu() for it
 * through call_rcu().  instance_destroy() calls this with instances_lock held.
 */
static void
__instance_destroy(struct nfqnl_instance *inst)
{
        struct rcu_head *deferred = &inst->rcu;

        hlist_del_rcu(&inst->hlist);
        call_rcu(deferred, instance_destroy_rcu);
}

static void
instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
{
        spin_lock(&q->instances_lock);
        __instance_destroy(inst);
        spin_unlock(&q->instances_lock);
}

/*
 * Append @entry to the tail of @queue's packet list and count it.
 * Takes no lock itself.
 * NOTE(review): callers presumably hold queue->lock - confirm, they are not
 * visible in this chunk.
 */
static inline void
__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
        struct list_head *pending = &queue->queue_list;

        list_add_tail(&entry->list, pending);
        queue->queue_total += 1;
}

/*
 * Unlink @entry from @queue's packet list and uncount it.  Takes no lock
 * itself; find_dequeue_entry() calls it with queue->lock held.
 */
static void
__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
        struct list_head *node = &entry->list;

        list_del(node);
        queue->queue_total -= 1;
}

/*
 * Remove the first queued entry whose id equals @id from @queue.
 *
 * The search and removal happen under queue->lock with bottom halves
 * disabled.  Returns the removed entry, or NULL if no entry has that id.
 */
static struct nf_queue_entry *
find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
{
        struct nf_queue_entry *found = NULL;
        struct nf_queue_entry *pos;

        spin_lock_bh(&queue->lock);

        list_for_each_entry(pos, &queue->queue_list, list) {
                if (pos->id != id)
                        continue;

                /* unlinking is safe here: the walk stops right after */
                __dequeue_entry(queue, pos);
                found = pos;
                break;
        }

        spin_unlock_bh(&queue->lock);

        return found;
}

/*
 * Run the hooks in @hooks on @skb, starting at position *@index.
 *
 * A hook returning NF_REPEAT is invoked again.  The first verdict that is
 * neither NF_ACCEPT nor NF_REPEAT ends the walk and is returned, with *@index
 * set to the hook that produced it.  If every remaining hook accepts,
 * NF_ACCEPT is returned and *@index is left one past the last hook.
 */
static unsigned int nf_iterate(struct sk_buff *skb,
                               struct nf_hook_state *state,
                               const struct nf_hook_entries *hooks,
                               unsigned int *index)
{
        unsigned int pos = *index;
        unsigned int verdict;

        for (; pos < hooks->num_hook_entries; pos++) {
                const struct nf_hook_entry *hook = &hooks->hooks[pos];

                for (;;) {
                        verdict = nf_hook_entry_hookfn(hook, skb, state);
                        if (verdict == NF_ACCEPT)
                                break;

                        *index = pos;
                        if (verdict != NF_REPEAT)
                                return verdict;
                        /* NF_REPEAT: call the same hook once more */
                }
        }

        *index = pos;
        return NF_ACCEPT;
}

/*
 * Return the RCU-dereferenced hook list of @net for protocol family @pf and
 * hook number @hooknum.
 *
 * Handles IPv4, IPv6 and, with CONFIG_NETFILTER_FAMILY_BRIDGE, bridge.  Any
 * other family triggers a one-time warning and yields NULL.
 */
static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum)
{
        struct nf_hook_entries *head = NULL;

        switch (pf) {
#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
        case NFPROTO_BRIDGE:
                head = rcu_dereference(net->nf.hooks_bridge[hooknum]);
                break;
#endif
        case NFPROTO_IPV4:
                head = rcu_dereference(net->nf.hooks_ipv4[hooknum]);
                break;
        case NFPROTO_IPV6:
                head = rcu_dereference(net->nf.hooks_ipv6[hooknum]);
                break;
        default:
                WARN_ON_ONCE(1);
                break;
        }

        return head;
}

/*
 * IPv4 re-route check for a packet coming back from the queue.
 *
 * Only acts on NF_INET_LOCAL_OUT entries: when the TOS, mark, destination or
 * source address now differs from the values saved in the entry's
 * ip_rt_info, the result of ip_route_me_harder() is returned.  In every other
 * case, and always without CONFIG_INET, returns 0.
 */
static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
{
#ifdef CONFIG_INET
        const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
        const struct iphdr *iph;

        if (entry->state.hook != NF_INET_LOCAL_OUT)
                return 0;

        iph = ip_hdr(skb);
        if (iph->tos != rt_info->tos ||
            skb->mark != rt_info->mark ||
            iph->daddr != rt_info->daddr ||
            iph->saddr != rt_info->saddr)
                return ip_route_me_harder(entry->state.net, entry->state.sk,
                                          skb, RTN_UNSPEC);
#endif
        return 0;
}

/*
 * Dispatch the re-route check by protocol family of @entry.
 *
 * AF_INET goes to nf_ip_reroute(); AF_INET6 goes to the reroute op of
 * nf_ipv6_ops when that pointer is set.  Other families, and IPv6 without
 * registered ops, return 0.
 */
static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry)
{
        const struct nf_ipv6_ops *v6ops;

        if (entry->state.pf == AF_INET)
                return nf_ip_reroute(skb, entry);

        if (entry->state.pf != AF_INET6)
                return 0;

        v6ops = rcu_dereference(nf_ipv6_ops);
        if (!v6ops)
                return 0;

        return v6ops->reroute(skb, entry);
}

/*
 * Resume hook traversal for the packet held by @entry using @verdict, then
 * release @entry.
 *
 * The hook list is looked up again from entry->state.  If it is missing, or
 * entry->hook_index lies beyond its end, the skb is dropped.  Otherwise:
 *  - NF_REPEAT re-runs the hook at hook_index and uses its verdict;
 *  - NF_ACCEPT first goes through nf_reroute() (failure turns it into
 *    NF_DROP) and then continues with the following hooks via nf_iterate().
 * The resulting verdict is acted on in the switch at the end.  @entry is
 * freed on every path.
 *
 * caller must hold rcu read-side lock
 */
static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
        const struct nf_hook_entry *hook_entry;
        const struct nf_hook_entries *hooks;
        struct sk_buff *skb = entry->skb;
        const struct net *net;
        unsigned int i;
        int err;
        u8 pf;

        net = entry->state.net;
        pf = entry->state.pf;

        hooks = nf_hook_entries_head(net, pf, entry->state.hook);

        i = entry->hook_index;
        /* no hook list, or the saved index no longer fits it: drop */
        if (!hooks || i >= hooks->num_hook_entries) {
                kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP);
                nf_queue_entry_free(entry);
                return;
        }

        hook_entry = &hooks->hooks[i];

        /* Continue traversal iff userspace said ok... */
        if (verdict == NF_REPEAT)
                verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state);

        if (verdict == NF_ACCEPT) {
                if (nf_reroute(skb, entry) < 0)
                        verdict = NF_DROP;
        }

        if (verdict == NF_ACCEPT) {
                /* also the re-entry point of the NF_QUEUE case below */
next_hook:
                ++i;
                verdict = nf_iterate(skb, &entry->state, hooks, &i);
        }

        switch (verdict & NF_VERDICT_MASK) {
        case NF_ACCEPT:
        case NF_STOP:
                /* hand the packet to the continuation with BHs disabled */
                local_bh_disable();
                entry->state.okfn(entry->state.net, entry->state.sk, skb);
                local_bh_enable();
                break;
        case NF_QUEUE:
                /* queue again; a return of 1 means carry on with the next hook */
                err = nf_queue(skb, &entry->state, i, verdict);
                if (err == 1)
                        goto next_hook;
                break;
        case NF_STOLEN:
                /* skb is not touched any further here */
                break;
        default:
                /* every other verdict: free the skb */
                kfree_skb(skb);
        }

        nf_queue_entry_free(entry);
}

/*
 * Reinject @entry with @verdict, giving the conntrack hook a say first.
 *
 * For NF_ACCEPT, NF_REPEAT and NF_STOP the update callback of nf_ct_hook (if
 * one is registered) is run on the skb:
 *  - NF_ACCEPT from it keeps the caller's verdict;
 *  - NF_STOLEN frees the entry and stops, nothing is reinjected;
 *  - anything else replaces the verdict with the masked conntrack verdict.
 * All other verdicts go straight to nf_reinject().
 */
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
        const struct nf_ct_hook *ct_hook;
        unsigned int ct_verdict;

        if (verdict != NF_ACCEPT &&
            verdict != NF_REPEAT &&
            verdict != NF_STOP) {
                nf_reinject(entry, verdict);
                return;
        }

        ct_verdict = verdict;

        rcu_read_lock();
        ct_hook = rcu_dereference(nf_ct_hook);
        if (ct_hook)
                ct_verdict = ct_hook->update(entry->state.net, entry->skb);
        rcu_read_unlock();

        ct_verdict &= NF_VERDICT_MASK;

        if (ct_verdict == NF_STOLEN) {
                nf_queue_entry_free(entry);
                return;
        }

        /* NF_ACCEPT: follow userspace verdict, could be REPEAT */
        if (ct_verdict != NF_ACCEPT)
                verdict = ct_verdict;

        nf_reinject(entry, verdict);
}

/*
 * Remove queued entries from @queue and reinject each one with NF_DROP.
 *
 * @cmpfn selects the entries: it is called with the entry and @data, and a
 * non-zero return means "flush this one".  A NULL @cmpfn flushes everything.
 * The whole walk runs under queue->lock with bottom halves disabled.
 */
static void
nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
{
        struct nf_queue_entry *pos, *tmp;

        spin_lock_bh(&queue->lock);

        list_for_each_entry_safe(pos, tmp, &queue->queue_list, list) {
                if (cmpfn && !cmpfn(pos, data))
                        continue;

                list_del(&pos->list);
                queue->queue_total--;
                nfqnl_reinject(pos, NF_DROP);
        }

        spin_unlock_bh(&queue->lock);
}

/*
 * Add an NFQA_SKB_INFO attribute describing @packet to @nlskb.
 *
 * Flags: NFQA_SKB_CSUMNOTREADY when the checksum is CHECKSUM_PARTIAL,
 * otherwise NFQA_SKB_CSUM_NOTVERIFIED when @csum_verify is set; plus
 * NFQA_SKB_GSO for GSO packets.  With no flag set the attribute is omitted.
 *
 * Returns 0 if nothing had to be added, else the nla_put_be32() result.
 */
static int
nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet,
                      bool csum_verify)
{
        __u32 info = 0;

        if (packet->ip_summed == CHECKSUM_PARTIAL)
                info |= NFQA_SKB_CSUMNOTREADY;
        else if (csum_verify)
                info |= NFQA_SKB_CSUM_NOTVERIFIED;

        if (skb_is_gso(packet))
                info |= NFQA_SKB_GSO;

        if (!info)
                return 0;

        return nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(info));
}

/*
 * Add NFQA_UID and NFQA_GID for the owner of @sk to @skb.
 *
 * The ids are the fsuid/fsgid of the credentials of the file behind the
 * socket, mapped into init_user_ns.  Nothing is added for non-full sockets or
 * when the socket has no file.  sk_callback_lock is read-held while
 * sk->sk_socket is inspected.
 *
 * Returns 0 on success (including "nothing to add"), -1 if an attribute did
 * not fit.
 */
static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk)
{
        const struct cred *cred;
        int ret = 0;

        if (!sk_fullsock(sk))
                return 0;

        read_lock_bh(&sk->sk_callback_lock);

        if (sk->sk_socket && sk->sk_socket->file) {
                cred = sk->sk_socket->file->f_cred;

                /* the gid is only attempted once the uid went in */
                if (nla_put_be32(skb, NFQA_UID,
                    htonl(from_kuid_munged(&init_user_ns, cred->fsuid))) ||
                    nla_put_be32(skb, NFQA_GID,
                    htonl(from_kgid_munged(&init_user_ns, cred->fsgid))))
                        ret = -1;
        }

        read_unlock_bh(&sk->sk_callback_lock);

        return ret;
}

/*
 * Add NFQA_CGROUP_CLASSID for @sk to @skb.
 *
 * Only does anything with CONFIG_CGROUP_NET_CLASSID, for a non-NULL full
 * socket whose classid is non-zero.  Returns -1 if the attribute did not
 * fit, 0 otherwise.
 */
static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
{
#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
        u32 classid;

        if (!sk || !sk_fullsock(sk))
                return 0;

        classid = sock_cgroup_classid(&sk->sk_cgrp_data);
        if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid)))
                return -1;
#endif
        return 0;
}

/*
 * Translate the secmark of @skb into a security context stored in @ctx.
 *
 * Returns whatever security_secid_to_secctx() returns when @skb has a
 * non-zero secmark.  Returns 0 when @skb is NULL, its socket is not a full
 * socket, the secmark is 0, or CONFIG_NETWORK_SECMARK is disabled.
 * skb->sk is dereferenced without a NULL check, so the caller must ensure it
 * is set.
 */
static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
{
        int seclen = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
        struct sock *sk;

        if (!skb)
                return 0;

        sk = skb->sk;
        if (!sk_fullsock(sk))
                return 0;

        read_lock_bh(&sk->sk_callback_lock);

        if (skb->secmark)
                seclen = security_secid_to_secctx(skb->secmark, ctx);

        read_unlock_bh(&sk->sk_callback_lock);
#endif
        return seclen;
}

/*
 * Netlink space needed for the bridge-only attributes of @entry; mirrors what
 * nfqnl_put_bridge() emits.
 *
 * Returns 0 unless the entry is PF_BRIDGE with the mac header set.  Otherwise
 * sums a nested attribute holding two __be16 attributes when a VLAN tag is
 * present, plus an attribute carrying the bytes between the mac and network
 * headers when the network header lies after the mac header.
 */
static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry)
{
        struct sk_buff *entskb = entry->skb;
        u32 nlalen = 0;

        if (entry->state.pf != PF_BRIDGE)
                return 0;

        if (!skb_mac_header_was_set(entskb))
                return 0;

        if (skb_vlan_tag_present(entskb)) {
                /* payload of the nest: two __be16 attributes */
                const int vlan_payload = nla_total_size(sizeof(__be16)) +
                                         nla_total_size(sizeof(__be16));

                nlalen += nla_total_size(vlan_payload);
        }

        if (entskb->mac_header < entskb->network_header)
                nlalen += nla_total_size((entskb->network_header -
                                          entskb->mac_header));

        return nlalen;
}

/*
 * Emit the bridge-only attributes of @entry into @skb.
 *
 * Does nothing unless the entry is PF_BRIDGE with the mac header set.  With a
 * VLAN tag present an NFQA_VLAN nest holding NFQA_VLAN_TCI and
 * NFQA_VLAN_PROTO is added; when the network header lies after the mac
 * header the bytes in between are added as NFQA_L2HDR.
 *
 * Returns 0 on success, -1 as soon as one attribute does not fit.
 */
static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb)
{
        struct sk_buff *entskb = entry->skb;

        if (entry->state.pf != PF_BRIDGE)
                return 0;

        if (!skb_mac_header_was_set(entskb))
                return 0;

        if (skb_vlan_tag_present(entskb)) {
                struct nlattr *nest = nla_nest_start(skb, NFQA_VLAN);

                if (!nest)
                        return -1;

                if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)))
                        return -1;

                if (nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto))
                        return -1;

                nla_nest_end(skb, nest);
        }

        if (entskb->network_header > entskb->mac_header) {
                int len = (int)(entskb->network_header - entskb->mac_header);

                if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb)))
                        return -1;
        }

        return 0;
}

/*
 * Resolve a pending checksum on @entskb: skbs flagged by skb_csum_is_sctp()
 * go through skb_crc32c_csum_help(), all others through skb_checksum_help().
 * Returns the helper's result.
 */
static int nf_queue_checksum_help(struct sk_buff *entskb)
{
        return skb_csum_is_sctp(entskb) ? skb_crc32c_csum_help(entskb)
                                        : skb_checksum_help(entskb);
}

static struct sk_buff *
nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
                           struct nf_queue_entry *entry,
                           __be32 **packet_id_ptr)
{
        size_t size;
        size_t data_len = 0, cap_len = 0;
        unsigned int hlen = 0;
        struct sk_buff *skb;
        struct nlattr *nla;
        struct nfqnl_msg_packet_hdr *pmsg;
        struct nlmsghdr *nlh;
        struct sk_buff *entskb = entry->skb;
        struct net_device *indev;
        struct net_device *outdev;
        struct nf_conn *ct = NULL;
        enum ip_conntrack_info ctinfo = 0;
        const struct nfnl_ct_hook *nfnl_ct;
        bool csum_verify;
        struct lsm_context ctx = { NULL, 0, 0 };
        int seclen = 0;
        ktime_t tstamp;

        size = nlmsg_total_size(sizeof(struct nfgenmsg))
                + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr))
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
                + nla_total_size(sizeof(u_int32_t))        /* ifindex */
#endif
                + nla_total_size(sizeof(u_int32_t))        /* mark */
                + nla_total_size(sizeof(u_int32_t))        /* priority */
                + nla_total_size(sizeof(struct nfqnl_msg_packet_hw))
                + nla_total_size(sizeof(u_int32_t))        /* skbinfo */
#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID)
                + nla_total_size(sizeof(u_int32_t))        /* classid */
#endif
                + nla_total_size(sizeof(u_int32_t));        /* cap_len */

        tstamp = skb_tstamp_cond(entskb, false);
        if (tstamp)
                size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp));

        size += nfqnl_get_bridge_size(entry);

        if (entry->state.hook <= NF_INET_FORWARD ||
           (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL))
                csum_verify = !skb_csum_unnecessary(entskb);
        else
                csum_verify = false;

        outdev = entry->state.out;

        switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) {
        case NFQNL_COPY_META:
        case NFQNL_COPY_NONE:
                break;

        case NFQNL_COPY_PACKET:
                if (!(queue->flags & NFQA_CFG_F_GSO) &&
                    entskb->ip_summed == CHECKSUM_PARTIAL &&
                    nf_queue_checksum_help(entskb))
                        return NULL;

                data_len = READ_ONCE(queue->copy_range);
                if (data_len > entskb->len)
                        data_len = entskb->len;

                hlen = skb_zerocopy_headlen(entskb);
                hlen = min_t(unsigned int, hlen, data_len);
                size += sizeof(struct nlattr) + hlen;
                cap_len = entskb->len;
                break;
        }

        nfnl_ct = rcu_dereference(nfnl_ct_hook);

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        if (queue->flags & NFQA_CFG_F_CONNTRACK) {
                if (nfnl_ct != NULL) {
                        ct = nf_ct_get(entskb, &ctinfo);
                        if (ct != NULL)
                                size += nfnl_ct->build_size(ct);
                }
        }
#endif

        if (queue->flags & NFQA_CFG_F_UID_GID) {
                size += (nla_total_size(sizeof(u_int32_t))        /* uid */
                        + nla_total_size(sizeof(u_int32_t)));        /* gid */
        }

        if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
                seclen = nfqnl_get_sk_secctx(entskb, &ctx);
                if (seclen < 0)
                        return NULL;
                if (seclen)
                        size += nla_total_size(seclen);
        }

        skb = alloc_skb(size, GFP_ATOMIC);
        if (!skb) {
                skb_tx_error(entskb);
                goto nlmsg_failure;
        }

        nlh = nfnl_msg_put(skb, 0, 0,
                           nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET),
                           0, entry->state.pf, NFNETLINK_V0,
                           htons(queue->queue_num));
        if (!nlh) {
                skb_tx_error(entskb);
                kfree_skb(skb);
                goto nlmsg_failure;
        }

        nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg));
        pmsg = nla_data(nla);
        pmsg->hw_protocol        = entskb->protocol;
        pmsg->hook                = entry->state.hook;
        *packet_id_ptr                = &pmsg->packet_id;

        indev = entry->state.in;
        if (indev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex)))
                        goto nla_put_failure;
#else
                if (entry->state.pf == PF_BRIDGE) {
                        /* Case 1: indev is physical input device, we need to
                         * look for bridge group (when called from
                         * netfilter_bridge) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
                                         htonl(indev->ifindex)) ||
                        /* this is the bridge group "brX" */
                        /* rcu_read_lock()ed by __nf_queue */
                            nla_put_be32(skb, NFQA_IFINDEX_INDEV,
                                         htonl(br_port_get_rcu(indev)->br->dev->ifindex)))
                                goto nla_put_failure;
                } else {
                        int physinif;

                        /* Case 2: indev is bridge group, we need to look for
                         * physical device (when called from ipv4) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_INDEV,
                                         htonl(indev->ifindex)))
                                goto nla_put_failure;

                        physinif = nf_bridge_get_physinif(entskb);
                        if (physinif &&
                            nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV,
                                         htonl(physinif)))
                                goto nla_put_failure;
                }
#endif
        }

        if (outdev) {
#if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
                if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex)))
                        goto nla_put_failure;
#else
                if (entry->state.pf == PF_BRIDGE) {
                        /* Case 1: outdev is physical output device, we need to
                         * look for bridge group (when called from
                         * netfilter_bridge) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
                                         htonl(outdev->ifindex)) ||
                        /* this is the bridge group "brX" */
                        /* rcu_read_lock()ed by __nf_queue */
                            nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
                                         htonl(br_port_get_rcu(outdev)->br->dev->ifindex)))
                                goto nla_put_failure;
                } else {
                        int physoutif;

                        /* Case 2: outdev is bridge group, we need to look for
                         * physical output device (when called from ipv4) */
                        if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV,
                                         htonl(outdev->ifindex)))
                                goto nla_put_failure;

                        physoutif = nf_bridge_get_physoutif(entskb);
                        if (physoutif &&
                            nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV,
                                         htonl(physoutif)))
                                goto nla_put_failure;
                }
#endif
        }

        if (entskb->mark &&
            nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark)))
                goto nla_put_failure;

        if (entskb->priority &&
            nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority)))
                goto nla_put_failure;

        if (indev && entskb->dev &&
            skb_mac_header_was_set(entskb) &&
            skb_mac_header_len(entskb) != 0) {
                struct nfqnl_msg_packet_hw phw;
                int len;

                memset(&phw, 0, sizeof(phw));
                len = dev_parse_header(entskb, phw.hw_addr);
                if (len) {
                        phw.hw_addrlen = htons(len);
                        if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw))
                                goto nla_put_failure;
                }
        }

        if (nfqnl_put_bridge(entry, skb) < 0)
                goto nla_put_failure;

        if (entry->state.hook <= NF_INET_FORWARD && tstamp) {
                struct nfqnl_msg_packet_timestamp ts;
                struct timespec64 kts = ktime_to_timespec64(tstamp);

                ts.sec = cpu_to_be64(kts.tv_sec);
                ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC);

                if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts))
                        goto nla_put_failure;
        }

        if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk &&
            nfqnl_put_sk_uidgid(skb, entskb->sk) < 0)
                goto nla_put_failure;

        if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
                goto nla_put_failure;

        if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context))
                goto nla_put_failure;

        if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
                goto nla_put_failure;

        if (cap_len > data_len &&
            nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len)))
                goto nla_put_failure;

        if (nfqnl_put_packet_info(skb, entskb, csum_verify))
                goto nla_put_failure;

        if (data_len) {
                struct nlattr *nla;

                if (skb_tailroom(skb) < sizeof(*nla) + hlen)
                        goto nla_put_failure;

                nla = skb_put(skb, sizeof(*nla));
                nla->nla_type = NFQA_PAYLOAD;
                nla->nla_len = nla_attr_size(data_len);

                if (skb_zerocopy(skb, entskb, data_len, hlen))
                        goto nla_put_failure;
        }

        nlh->nlmsg_len = skb->len;
        if (seclen >= 0)
                security_release_secctx(&ctx);
        return skb;

nla_put_failure:
        skb_tx_error(entskb);
        kfree_skb(skb);
        net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
        if (seclen >= 0)
                security_release_secctx(&ctx);
        return NULL;
}

/* Decide whether the packet in @entry must be dropped because of the state
 * of its attached conntrack entry.
 *
 * Returns true when the packet should be dropped, false otherwise.  Always
 * returns false when conntrack support is not built in.
 */
static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct nf_conn *ct = (void *)skb_nfct(entry->skb);
        unsigned long status;

        /* No conntrack attached to this skb: nothing to check. */
        if (!ct)
                return false;

        status = READ_ONCE(ct->status);

        /* Confirmed entries are never dropped here, dying or not. */
        if (status & IPS_CONFIRMED)
                return false;

        /* Not confirmed, but already marked dying. */
        if (status & IPS_DYING)
                return true;

        /* In some cases skb_clone() can occur after initial conntrack
         * pickup, but conntrack assumes exclusive skb->_nfct ownership for
         * unconfirmed entries.
         *
         * This happens for br_netfilter and with ip multicast routing.
         * This can't be solved with serialization here because one clone
         * could have been queued for local delivery.
         */
        if (likely(refcount_read(&ct->ct_general.use) == 1))
                return false;

        /* Can't decrement further? Exclusive ownership. */
        if (!refcount_dec_not_one(&ct->ct_general.use))
                return false;

        /* No nf_ct_put(): .use was already decremented above and it cannot
         * drop down to 0.
         */
        skb_set_nfct(entry->skb, 0);
        return true;
#else
        return false;
#endif
}

/* Build the netlink message for @entry and deliver it to the listener of
 * @queue, then link @entry into the queue.
 *
 * Returns 0 when the entry was enqueued, or when the queue has
 * NFQA_CFG_F_FAIL_OPEN set and the packet was reinjected with NF_ACCEPT
 * instead.  Returns -ENOMEM when the message could not be built, -ENOBUFS
 * when the packet is rejected by nf_ct_drop_unconfirmed() or the queue is
 * full, or the negative error from nfnetlink_unicast().
 */
static int
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
                        struct nf_queue_entry *entry)
{
        struct sk_buff *nskb;
        int err = -ENOBUFS;
        __be32 *packet_id_ptr;
        int failopen = 0;

        /* The message is built before taking queue->lock; packet_id_ptr
         * points into it so the id can be filled in later under the lock.
         */
        nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr);
        if (nskb == NULL) {
                err = -ENOMEM;
                goto err_out;
        }
        spin_lock_bh(&queue->lock);

        /* err is still -ENOBUFS on this path */
        if (nf_ct_drop_unconfirmed(entry))
                goto err_out_free_nskb;

        if (queue->queue_total >= queue->queue_maxlen) {
                if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
                        /* fail-open: accept the packet after unlocking */
                        failopen = 1;
                        err = 0;
                } else {
                        queue->queue_dropped++;
                        net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
                                             queue->queue_total);
                }
                goto err_out_free_nskb;
        }
        /* Allocate the next id and patch it into the already built message */
        entry->id = ++queue->id_sequence;
        *packet_id_ptr = htonl(entry->id);

        /* nfnetlink_unicast will either free the nskb or add it to a socket */
        err = nfnetlink_unicast(nskb, net, queue->peer_portid);
        if (err < 0) {
                if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
                        failopen = 1;
                        err = 0;
                } else {
                        queue->queue_user_dropped++;
                }
                /* nskb is no longer ours, so skip the kfree_skb() label */
                goto err_out_unlock;
        }

        __enqueue_entry(queue, entry);

        spin_unlock_bh(&queue->lock);
        return 0;

err_out_free_nskb:
        kfree_skb(nskb);
err_out_unlock:
        spin_unlock_bh(&queue->lock);
        /* Reinject only after queue->lock has been released */
        if (failopen)
                nfqnl_reinject(entry, NF_ACCEPT);
err_out:
        return err;
}

/* Duplicate queue entry @e (e->size bytes, atomic allocation) and take the
 * references for the copy via nf_queue_entry_get_refs().
 *
 * Returns the new entry, or NULL if the allocation or taking the references
 * failed.
 */
static struct nf_queue_entry *
nf_queue_entry_dup(struct nf_queue_entry *e)
{
        struct nf_queue_entry *copy;

        copy = kmemdup(e, e->size, GFP_ATOMIC);
        if (!copy)
                return NULL;

        if (!nf_queue_entry_get_refs(copy)) {
                /* references unavailable: discard the raw copy */
                kfree(copy);
                return NULL;
        }

        return copy;
}

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* When called from bridge netfilter, skb->data must point to the MAC header
 * before skb_gso_segment() is called.  Otherwise the original MAC header is
 * lost and the segmented skbs will be sent to the wrong destination.
 */
static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
{
        if (!nf_bridge_info_get(skb))
                return;

        /* expose the MAC header again */
        __skb_push(skb, skb->network_header - skb->mac_header);
}

/* Counterpart of nf_bridge_adjust_skb_data(): pull the MAC header length
 * off the front of @skb again.
 */
static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
{
        if (!nf_bridge_info_get(skb))
                return;

        __skb_pull(skb, skb->network_header - skb->mac_header);
}
#else
/* Without bridge netfilter there is nothing to adjust. */
#define nf_bridge_adjust_skb_data(s) do {} while (0)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif

/* Enqueue one segment @skb of a segmented GSO packet.
 *
 * The last segment (skb->next == NULL) reuses @entry itself; every other
 * segment is unlinked from the list and queued with a duplicate of @entry.
 *
 * Returns 0 on success, -ENOMEM if the entry could not be duplicated, or the
 * error returned by __nfqnl_enqueue_packet().
 */
static int
__nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
                           struct sk_buff *skb, struct nf_queue_entry *entry)
{
        struct nf_queue_entry *dup;
        int err;

        nf_bridge_adjust_segmented_data(skb);

        if (!skb->next) {
                /* last packet, no need to copy entry */
                struct sk_buff *orig_skb = entry->skb;

                entry->skb = skb;
                err = __nfqnl_enqueue_packet(net, queue, entry);
                if (err)
                        entry->skb = orig_skb; /* restore on failure */
                return err;
        }

        skb_mark_not_on_list(skb);

        dup = nf_queue_entry_dup(entry);
        if (!dup)
                return -ENOMEM;

        dup->skb = skb;
        err = __nfqnl_enqueue_packet(net, queue, dup);
        if (err)
                nf_queue_entry_free(dup);

        return err;
}

/* Queue the packet described by @entry to queue number @queuenum.
 *
 * Non-GSO packets, and GSO packets on queues with NFQA_CFG_F_GSO set (unless
 * they are SCTP GSO), are enqueued as they are.  Everything else is
 * segmented first and each segment is enqueued on its own.
 *
 * Returns 0 if the packet (or at least one of its segments) was queued,
 * -ESRCH if the queue does not exist, -EINVAL if the queue is in
 * NFQNL_COPY_NONE mode, or another negative errno on failure.
 */
static int
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
        struct net *net = entry->state.net;
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        struct sk_buff *orig_skb, *seg_list, *seg, *next_seg;
        struct nfqnl_instance *queue;
        unsigned int nr_queued = 0;
        int err = -ENOBUFS;

        /* rcu_read_lock()ed by nf_hook_thresh */
        queue = instance_lookup(q, queuenum);
        if (!queue)
                return -ESRCH;

        if (queue->copy_mode == NFQNL_COPY_NONE)
                return -EINVAL;

        orig_skb = entry->skb;

        if (entry->state.pf == NFPROTO_IPV4)
                orig_skb->protocol = htons(ETH_P_IP);
        else if (entry->state.pf == NFPROTO_IPV6)
                orig_skb->protocol = htons(ETH_P_IPV6);

        /* Fast paths: no segmentation needed. */
        if (!skb_is_gso(orig_skb))
                return __nfqnl_enqueue_packet(net, queue, entry);
        if ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(orig_skb))
                return __nfqnl_enqueue_packet(net, queue, entry);

        nf_bridge_adjust_skb_data(orig_skb);
        seg_list = skb_gso_segment(orig_skb, 0);
        /* Does not use PTR_ERR to limit the number of error codes that can be
         * returned by nf_queue.  For instance, callers rely on -ESRCH to
         * mean 'ignore this hook'.
         */
        if (IS_ERR_OR_NULL(seg_list))
                goto out_err;

        err = 0;
        skb_list_walk_safe(seg_list, seg, next_seg) {
                /* after the first failure the remaining segments are freed */
                if (!err)
                        err = __nfqnl_enqueue_packet_gso(net, queue,
                                                         seg, entry);
                if (err)
                        kfree_skb(seg);
                else
                        nr_queued++;
        }

        if (!nr_queued)
                goto out_err;

        if (err) /* some segments are already queued */
                nf_queue_entry_free(entry);
        kfree_skb(orig_skb);
        return 0;

 out_err:
        nf_bridge_adjust_segmented_data(orig_skb);
        return err;
}

/* Replace the contents of e->skb with the @data_len bytes at @data.
 *
 * @diff is the new length minus the current skb length: a negative value
 * trims the skb, a positive value grows it (reallocating when the tailroom
 * is too small).  The checksum state is reset to CHECKSUM_NONE afterwards.
 *
 * Returns 0 on success, -EINVAL if the new length is not acceptable, or
 * -ENOMEM if the skb could not be resized or made writable.
 */
static int
nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff)
{
        if (diff > 0) {
                if (data_len > 0xFFFF)
                        return -EINVAL;

                if (skb_tailroom(e->skb) < diff) {
                        struct sk_buff *bigger;

                        bigger = skb_copy_expand(e->skb, skb_headroom(e->skb),
                                                 diff, GFP_ATOMIC);
                        if (!bigger)
                                return -ENOMEM;

                        /* swap the entry over to the expanded copy */
                        kfree_skb(e->skb);
                        e->skb = bigger;
                }
                skb_put(e->skb, diff);
        } else if (diff < 0) {
                unsigned int floor = skb_transport_offset(e->skb);

                /* never shrink below the transport header offset */
                if (data_len < floor)
                        return -EINVAL;

                if (pskb_trim(e->skb, data_len))
                        return -ENOMEM;
        }

        if (skb_ensure_writable(e->skb, data_len))
                return -ENOMEM;

        skb_copy_to_linear_data(e->skb, data, data_len);
        e->skb->ip_summed = CHECKSUM_NONE;
        return 0;
}

/* Set the copy mode and copy range of @queue under queue->lock.
 *
 * NFQNL_COPY_NONE and NFQNL_COPY_META force the range to 0.  For
 * NFQNL_COPY_PACKET a @range of 0, or one above NFQNL_MAX_COPY_RANGE, is
 * replaced by NFQNL_MAX_COPY_RANGE.
 *
 * Returns 0 on success or -EINVAL for an unknown @mode.
 */
static int
nfqnl_set_mode(struct nfqnl_instance *queue,
               unsigned char mode, unsigned int range)
{
        int ret = 0;

        spin_lock_bh(&queue->lock);

        if (mode == NFQNL_COPY_NONE || mode == NFQNL_COPY_META) {
                queue->copy_mode = mode;
                queue->copy_range = 0;
        } else if (mode == NFQNL_COPY_PACKET) {
                queue->copy_mode = mode;
                if (range != 0 && range <= NFQNL_MAX_COPY_RANGE)
                        queue->copy_range = range;
                else
                        queue->copy_range = NFQNL_MAX_COPY_RANGE;
        } else {
                ret = -EINVAL;
        }

        spin_unlock_bh(&queue->lock);

        return ret;
}

static int
dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex)
{
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        int physinif, physoutif;

        physinif = nf_bridge_get_physinif(entry->skb);
        physoutif = nf_bridge_get_physoutif(entry->skb);

        if (physinif == ifindex || physoutif == ifindex)
                return 1;
#endif
        if (entry->state.in)
                if (entry->state.in->ifindex == ifindex)
                        return 1;
        if (entry->state.out)
                if (entry->state.out->ifindex == ifindex)
                        return 1;

        return 0;
}

/* Flush, from every queue instance of @net, all packets whose indev or
 * outdev matches @ifindex (as decided by dev_cmp()).
 */
static void
nfqnl_dev_drop(struct net *net, int ifindex)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        int bucket;

        rcu_read_lock();

        for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
                struct hlist_head *head = &q->instance_table[bucket];
                struct nfqnl_instance *inst;

                hlist_for_each_entry_rcu(inst, head, hlist)
                        nfqnl_flush(inst, dev_cmp, ifindex);
        }

        rcu_read_unlock();
}

/* Netdevice notifier callback: on NETDEV_DOWN, flush queued packets that
 * reference the device.  Always returns NOTIFY_DONE.
 */
static int
nfqnl_rcv_dev_event(struct notifier_block *this,
                    unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        /* Drop any packets associated with the downed device */
        nfqnl_dev_drop(dev_net(dev), dev->ifindex);

        return NOTIFY_DONE;
}

/* Notifier block that dispatches events to nfqnl_rcv_dev_event(). */
static struct notifier_block nfqnl_dev_notifier = {
        .notifier_call        = nfqnl_rcv_dev_event,
};

/* Flush every queued packet from all queue instances of @net. */
static void nfqnl_nf_hook_drop(struct net *net)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        int bucket;

        /* This function is also called on net namespace error unwind,
         * when pernet_ops->init() failed and the ->exit() functions of the
         * previous pernet_ops get called.
         *
         * This may result in a call to nfqnl_nf_hook_drop() before
         * struct nfnl_queue_net was allocated.
         */
        if (!q)
                return;

        for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
                struct hlist_head *head = &q->instance_table[bucket];
                struct nfqnl_instance *inst;

                /* NULL match callback: flush without filtering */
                hlist_for_each_entry_rcu(inst, head, hlist)
                        nfqnl_flush(inst, NULL, 0);
        }
}

/* Netlink notifier callback: when a NETLINK_NETFILTER socket is released
 * (NETLINK_URELEASE), destroy every queue instance bound to its portid.
 * Always returns NOTIFY_DONE.
 */
static int
nfqnl_rcv_nl_event(struct notifier_block *this,
                   unsigned long event, void *ptr)
{
        struct netlink_notify *n = ptr;
        struct nfnl_queue_net *q = nfnl_queue_pernet(n->net);
        int bucket;

        if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER)
                return NOTIFY_DONE;

        /* destroy all instances for this portid */
        spin_lock(&q->instances_lock);
        for (bucket = 0; bucket < INSTANCE_BUCKETS; bucket++) {
                struct hlist_head *head = &q->instance_table[bucket];
                struct nfqnl_instance *inst;
                struct hlist_node *next;

                hlist_for_each_entry_safe(inst, next, head, hlist) {
                        if (inst->peer_portid == n->portid)
                                __instance_destroy(inst);
                }
        }
        spin_unlock(&q->instances_lock);

        return NOTIFY_DONE;
}

/* Notifier block that dispatches events to nfqnl_rcv_nl_event(), which
 * reacts to NETLINK_URELEASE for NETLINK_NETFILTER sockets.
 */
static struct notifier_block nfqnl_rtnl_notifier = {
        .notifier_call        = nfqnl_rcv_nl_event,
};

/* Policy for the attributes nested inside NFQA_VLAN; used by
 * nfqa_parse_bridge().  Both fields are 16-bit values.
 */
static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = {
        [NFQA_VLAN_TCI]                = { .type = NLA_U16},
        [NFQA_VLAN_PROTO]        = { .type = NLA_U16},
};

/* Attribute policy, indexed by NFQA_* type, covering the attributes that
 * nfqnl_recv_verdict() reads from a verdict message.
 * NOTE(review): NFQA_L2HDR is read by nfqa_parse_bridge() but has no entry
 * here - confirm this is intended.
 */
static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = {
        [NFQA_VERDICT_HDR]        = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
        [NFQA_MARK]                = { .type = NLA_U32 },
        [NFQA_PAYLOAD]                = { .type = NLA_UNSPEC },
        [NFQA_CT]                = { .type = NLA_UNSPEC },
        [NFQA_EXP]                = { .type = NLA_UNSPEC },
        [NFQA_VLAN]                = { .type = NLA_NESTED },
        [NFQA_PRIORITY]                = { .type = NLA_U32 },
};

/* Attribute policy, indexed by NFQA_* type, covering the attributes that
 * nfqnl_recv_verdict_batch() reads: verdict header, mark and priority.
 */
static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = {
        [NFQA_VERDICT_HDR]        = { .len = sizeof(struct nfqnl_msg_verdict_hdr) },
        [NFQA_MARK]                = { .type = NLA_U32 },
        [NFQA_PRIORITY]                = { .type = NLA_U32 },
};

/* Look up queue @queue_num and check that it is bound to @nlportid.
 *
 * Returns the instance, ERR_PTR(-ENODEV) if no such queue exists, or
 * ERR_PTR(-EPERM) if the queue belongs to a different portid.
 */
static struct nfqnl_instance *
verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid)
{
        struct nfqnl_instance *inst = instance_lookup(q, queue_num);

        if (inst == NULL)
                return ERR_PTR(-ENODEV);

        if (inst->peer_portid == nlportid)
                return inst;

        return ERR_PTR(-EPERM);
}

/* Fetch and validate the verdict header from the parsed attributes.
 *
 * Returns a pointer to the header, or NULL if NFQA_VERDICT_HDR is absent or
 * the masked verdict is above NF_MAX_VERDICT or equals NF_STOLEN.
 */
static struct nfqnl_msg_verdict_hdr*
verdicthdr_get(const struct nlattr * const nfqa[])
{
        const struct nlattr *attr = nfqa[NFQA_VERDICT_HDR];
        struct nfqnl_msg_verdict_hdr *hdr;
        unsigned int code;

        if (attr == NULL)
                return NULL;

        hdr = nla_data(attr);
        code = ntohl(hdr->verdict) & NF_VERDICT_MASK;

        if (code == NF_STOLEN || code > NF_MAX_VERDICT)
                return NULL;

        return hdr;
}

/* Wraparound-aware comparison of packet ids: returns non-zero when @id is
 * strictly after @max, judged by the sign of their 32-bit difference.
 */
static int nfq_id_after(unsigned int id, unsigned int max)
{
        int delta = (int)(id - max);

        return delta > 0;
}

static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
                                    const struct nfnl_info *info,
                                    const struct nlattr * const nfqa[])
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
        u16 queue_num = ntohs(info->nfmsg->res_id);
        struct nf_queue_entry *entry, *tmp;
        struct nfqnl_msg_verdict_hdr *vhdr;
        struct nfqnl_instance *queue;
        unsigned int verdict, maxid;
        LIST_HEAD(batch_list);

        queue = verdict_instance_lookup(q, queue_num,
                                        NETLINK_CB(skb).portid);
        if (IS_ERR(queue))
                return PTR_ERR(queue);

        vhdr = verdicthdr_get(nfqa);
        if (!vhdr)
                return -EINVAL;

        verdict = ntohl(vhdr->verdict);
        maxid = ntohl(vhdr->id);

        spin_lock_bh(&queue->lock);

        list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) {
                if (nfq_id_after(entry->id, maxid))
                        break;
                __dequeue_entry(queue, entry);
                list_add_tail(&entry->list, &batch_list);
        }

        spin_unlock_bh(&queue->lock);

        if (list_empty(&batch_list))
                return -ENOENT;

        list_for_each_entry_safe(entry, tmp, &batch_list, list) {
                if (nfqa[NFQA_MARK])
                        entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));

                if (nfqa[NFQA_PRIORITY])
                        entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));

                nfqnl_reinject(entry, verdict);
        }
        return 0;
}

/* Apply the NFQA_CT attribute (and NFQA_EXP, if present) to the conntrack
 * entry attached to entry->skb, using the callbacks in @nfnl_ct.
 *
 * Returns the conntrack entry, or NULL if the skb has none, if parsing
 * NFQA_CT failed, or if conntrack support is not built in.  @ctinfo is
 * filled in by nf_ct_get().
 */
static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
                                      const struct nlmsghdr *nlh,
                                      const struct nlattr * const nfqa[],
                                      struct nf_queue_entry *entry,
                                      enum ip_conntrack_info *ctinfo)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct nf_conn *ct = nf_ct_get(entry->skb, ctinfo);

        if (!ct || nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0)
                return NULL;

        /* the return value of attach_expect() is not checked */
        if (nfqa[NFQA_EXP])
                nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct,
                                       NETLINK_CB(entry->skb).portid,
                                       nlmsg_report(nlh));

        return ct;
#else
        return NULL;
#endif
}

/* Apply the bridge-specific verdict attributes to entry->skb.
 *
 * NFQA_VLAN: both nested attributes (TCI and protocol) are required and are
 * installed as the skb's hardware-accelerated VLAN tag.
 * NFQA_L2HDR: must be exactly as long as the skb's current MAC header and
 * overwrites it.
 *
 * Returns 0 on success, the nested-parse error, or -EINVAL for a missing
 * VLAN field or an L2 header length mismatch.
 */
static int nfqa_parse_bridge(struct nf_queue_entry *entry,
                             const struct nlattr * const nfqa[])
{
        const struct nlattr *vlan = nfqa[NFQA_VLAN];
        const struct nlattr *l2hdr = nfqa[NFQA_L2HDR];

        if (vlan) {
                struct nlattr *tb[NFQA_VLAN_MAX + 1];
                int err;

                err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, vlan,
                                                  nfqa_vlan_policy, NULL);
                if (err < 0)
                        return err;

                if (!(tb[NFQA_VLAN_TCI] && tb[NFQA_VLAN_PROTO]))
                        return -EINVAL;

                __vlan_hwaccel_put_tag(entry->skb,
                                       nla_get_be16(tb[NFQA_VLAN_PROTO]),
                                       ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])));
        }

        if (l2hdr) {
                int mac_len = entry->skb->network_header -
                              entry->skb->mac_header;

                if (nla_len(l2hdr) != mac_len)
                        return -EINVAL;

                if (mac_len > 0)
                        memcpy(skb_mac_header(entry->skb),
                               nla_data(l2hdr), mac_len);
        }

        return 0;
}

/* Handle a verdict for a single queued packet.
 *
 * Looks up the queue named in the message (it must be bound to the sending
 * portid), validates the verdict header and dequeues the entry with the
 * given id.  The optional attributes are then applied in this order:
 * conntrack (NFQA_CT/NFQA_EXP), bridge VLAN/L2 header (PF_BRIDGE entries
 * only), replacement payload, mark and priority.  Finally the packet is
 * reinjected with the requested verdict.
 *
 * Returns 0 once the entry has been reinjected, the lookup error from
 * verdict_instance_lookup(), -EINVAL for a missing or invalid verdict
 * header, -ENOENT if no entry with that id is queued, or the error from
 * nfqa_parse_bridge() (in which case the packet is dropped).
 */
static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
                              const struct nlattr * const nfqa[])
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
        u_int16_t queue_num = ntohs(info->nfmsg->res_id);
        const struct nfnl_ct_hook *nfnl_ct;
        struct nfqnl_msg_verdict_hdr *vhdr;
        enum ip_conntrack_info ctinfo;
        struct nfqnl_instance *queue;
        struct nf_queue_entry *entry;
        struct nf_conn *ct = NULL;
        unsigned int verdict;
        int err;

        queue = verdict_instance_lookup(q, queue_num,
                                        NETLINK_CB(skb).portid);
        if (IS_ERR(queue))
                return PTR_ERR(queue);

        vhdr = verdicthdr_get(nfqa);
        if (!vhdr)
                return -EINVAL;

        verdict = ntohl(vhdr->verdict);

        /* From here on the entry is off the queue and owned by this
         * function: every exit path must reinject it.
         */
        entry = find_dequeue_entry(queue, ntohl(vhdr->id));
        if (entry == NULL)
                return -ENOENT;

        /* rcu lock already held from nfnl->call_rcu. */
        nfnl_ct = rcu_dereference(nfnl_ct_hook);

        if (nfqa[NFQA_CT]) {
                if (nfnl_ct != NULL)
                        ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry,
                                            &ctinfo);
        }

        if (entry->state.pf == PF_BRIDGE) {
                err = nfqa_parse_bridge(entry, nfqa);
                if (err < 0) {
                        /* Malformed bridge attributes: hand the dequeued
                         * entry back with NF_DROP instead of leaking it
                         * and its skb.
                         */
                        nfqnl_reinject(entry, NF_DROP);
                        return err;
                }
        }

        if (nfqa[NFQA_PAYLOAD]) {
                u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]);
                int diff = payload_len - entry->skb->len;

                /* A payload that cannot be applied turns the verdict
                 * into a drop.
                 */
                if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]),
                                 payload_len, entry, diff) < 0)
                        verdict = NF_DROP;

                /* Length changed: let conntrack adjust sequence numbers */
                if (ct && diff)
                        nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff);
        }

        if (nfqa[NFQA_MARK])
                entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK]));

        if (nfqa[NFQA_PRIORITY])
                entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY]));

        nfqnl_reinject(entry, verdict);
        return 0;
}

/* Netlink message handler that unconditionally rejects the request with
 * -ENOTSUPP; none of its arguments are examined.
 */
static int nfqnl_recv_unsupp(struct sk_buff *skb,
                             const struct nfnl_info *info,
                             const struct nlattr * const cda[])
{
        return -ENOTSUPP;
}

/* Attribute policy, indexed by NFQA_CFG_* type, for the configuration
 * attributes read by nfqnl_recv_config(): command, parameters, maximum
 * queue length, and the flags/mask pair.
 */
static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = {
        [NFQA_CFG_CMD]                = { .len = sizeof(struct nfqnl_msg_config_cmd) },
        [NFQA_CFG_PARAMS]        = { .len = sizeof(struct nfqnl_msg_config_params) },
        [NFQA_CFG_QUEUE_MAXLEN]        = { .type = NLA_U32 },
        [NFQA_CFG_MASK]                = { .type = NLA_U32 },
        [NFQA_CFG_FLAGS]        = { .type = NLA_U32 },
};

/* Queue handler callbacks: nfqnl_enqueue_packet() queues a packet and
 * nfqnl_nf_hook_drop() flushes all queued packets of a namespace.
 */
static const struct nf_queue_handler nfqh = {
        .outfn                = nfqnl_enqueue_packet,
        .nf_hook_drop        = nfqnl_nf_hook_drop,
};

/*
 * nfqnl_recv_config - handle an NFQNL_MSG_CONFIG request for one queue.
 *
 * The queue number is taken from info->nfmsg->res_id.  Depending on which
 * attributes are present in @nfqa the handler binds or unbinds the queue
 * (NFQA_CFG_CMD) and/or updates its copy mode, maximum length and flags.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL      NFQA_CFG_FLAGS given without NFQA_CFG_MASK
 *   -EOPNOTSUPP  unsupported flag, or a flag whose dependency is missing
 *   -EAGAIN      the conntrack hook appeared after request_module()
 *   -EPERM       the queue exists but belongs to a different netlink portid
 *   -EBUSY       BIND for a queue that already exists
 *   -ENODEV      no queue to unbind or to configure
 *   -ENOTSUPP    unknown command
 * or whatever error instance_create() reported.
 */
static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
                             const struct nlattr * const nfqa[])
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
        u_int16_t queue_num = ntohs(info->nfmsg->res_id);
        struct nfqnl_msg_config_cmd *cmd = NULL;
        struct nfqnl_instance *queue;
        __u32 flags = 0, mask = 0;
        int ret = 0;

        if (nfqa[NFQA_CFG_CMD]) {
                cmd = nla_data(nfqa[NFQA_CFG_CMD]);

                /* Obsolete commands without queue context */
                switch (cmd->command) {
                case NFQNL_CFG_CMD_PF_BIND: return 0;
                case NFQNL_CFG_CMD_PF_UNBIND: return 0;
                }
        }

        /* Check if we support these flags in first place, dependencies should
         * be there too not to break atomicity.
         */
        if (nfqa[NFQA_CFG_FLAGS]) {
                if (!nfqa[NFQA_CFG_MASK]) {
                        /* A mask is needed to specify which flags are being
                         * changed.
                         */
                        return -EINVAL;
                }

                flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
                mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));

                if (flags >= NFQA_CFG_F_MAX)
                        return -EOPNOTSUPP;

#if !IS_ENABLED(CONFIG_NETWORK_SECMARK)
                if (flags & mask & NFQA_CFG_F_SECCTX)
                        return -EOPNOTSUPP;
#endif
                if ((flags & mask & NFQA_CFG_F_CONNTRACK) &&
                    !rcu_access_pointer(nfnl_ct_hook)) {
#ifdef CONFIG_MODULES
                        /* The subsystem lock is released around
                         * request_module() and retaken afterwards.  If the
                         * hook is now present the request is not completed
                         * here; -EAGAIN is returned instead.
                         */
                        nfnl_unlock(NFNL_SUBSYS_QUEUE);
                        request_module("ip_conntrack_netlink");
                        nfnl_lock(NFNL_SUBSYS_QUEUE);
                        if (rcu_access_pointer(nfnl_ct_hook))
                                return -EAGAIN;
#endif
                        return -EOPNOTSUPP;
                }
        }

        rcu_read_lock();
        queue = instance_lookup(q, queue_num);
        /* Only the netlink port that owns the queue may touch it. */
        if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
                ret = -EPERM;
                goto err_out_unlock;
        }

        if (cmd != NULL) {
                switch (cmd->command) {
                case NFQNL_CFG_CMD_BIND:
                        if (queue) {
                                ret = -EBUSY;
                                goto err_out_unlock;
                        }
                        queue = instance_create(q, queue_num,
                                                NETLINK_CB(skb).portid);
                        if (IS_ERR(queue)) {
                                ret = PTR_ERR(queue);
                                goto err_out_unlock;
                        }
                        break;
                case NFQNL_CFG_CMD_UNBIND:
                        if (!queue) {
                                ret = -ENODEV;
                                goto err_out_unlock;
                        }
                        instance_destroy(q, queue);
                        /* Nothing left to configure; ret is still 0 here. */
                        goto err_out_unlock;
                case NFQNL_CFG_CMD_PF_BIND:
                case NFQNL_CFG_CMD_PF_UNBIND:
                        /* Both commands already returned 0 before the lookup. */
                        break;
                default:
                        ret = -ENOTSUPP;
                        goto err_out_unlock;
                }
        }

        /* Every remaining attribute modifies an existing queue. */
        if (!queue) {
                ret = -ENODEV;
                goto err_out_unlock;
        }

        if (nfqa[NFQA_CFG_PARAMS]) {
                struct nfqnl_msg_config_params *params =
                        nla_data(nfqa[NFQA_CFG_PARAMS]);

                nfqnl_set_mode(queue, params->copy_mode,
                                ntohl(params->copy_range));
        }

        if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
                __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);

                spin_lock_bh(&queue->lock);
                queue->queue_maxlen = ntohl(*queue_maxlen);
                spin_unlock_bh(&queue->lock);
        }

        if (nfqa[NFQA_CFG_FLAGS]) {
                /* Replace only the bits selected by mask. */
                spin_lock_bh(&queue->lock);
                queue->flags &= ~mask;
                queue->flags |= flags & mask;
                spin_unlock_bh(&queue->lock);
        }

err_out_unlock:
        rcu_read_unlock();
        return ret;
}

/*
 * Message handlers of the queue subsystem, indexed by NFQNL_MSG_* type.
 * Only NFQNL_MSG_CONFIG is registered as NFNL_CB_MUTEX; the other three
 * entries are NFNL_CB_RCU.  NFQNL_MSG_PACKET has no attribute policy and is
 * routed to nfqnl_recv_unsupp().
 */
static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
        [NFQNL_MSG_PACKET]        = {
                .call                = nfqnl_recv_unsupp,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFQA_MAX,
        },
        [NFQNL_MSG_VERDICT]        = {
                .call                = nfqnl_recv_verdict,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFQA_MAX,
                .policy                = nfqa_verdict_policy
        },
        [NFQNL_MSG_CONFIG]        = {
                .call                = nfqnl_recv_config,
                .type                = NFNL_CB_MUTEX,
                .attr_count        = NFQA_CFG_MAX,
                .policy                = nfqa_cfg_policy
        },
        [NFQNL_MSG_VERDICT_BATCH] = {
                .call                = nfqnl_recv_verdict_batch,
                .type                = NFNL_CB_RCU,
                .attr_count        = NFQA_MAX,
                .policy                = nfqa_verdict_batch_policy
        },
};

/*
 * nfnetlink subsystem descriptor for NFNL_SUBSYS_QUEUE.  It ties the
 * nfqnl_cb handler table to the subsystem id and is registered and
 * unregistered by the module init/exit functions.
 */
static const struct nfnetlink_subsystem nfqnl_subsys = {
        .name                = "nf_queue",
        .subsys_id        = NFNL_SUBSYS_QUEUE,
        .cb_count        = NFQNL_MSG_MAX,
        .cb                = nfqnl_cb,
};

#ifdef CONFIG_PROC_FS
/*
 * Private state of the /proc seq_file iterator.  get_first() and get_next()
 * keep the index of the instance_table bucket being walked in @bucket.
 */
struct iter_state {
        struct seq_net_private p;
        unsigned int bucket;
};

/*
 * Locate the first node of the first non-empty instance_table bucket of the
 * namespace @seq belongs to.  The bucket index is left in the iterator
 * state; it equals INSTANCE_BUCKETS when every bucket is empty.
 *
 * Returns the node, or NULL if there is no iterator state or no instance.
 */
static struct hlist_node *get_first(struct seq_file *seq)
{
        struct iter_state *state = seq->private;
        struct nfnl_queue_net *q;

        if (!state)
                return NULL;

        q = nfnl_queue_pernet(seq_file_net(seq));

        state->bucket = 0;
        while (state->bucket < INSTANCE_BUCKETS) {
                if (!hlist_empty(&q->instance_table[state->bucket]))
                        return q->instance_table[state->bucket].first;
                state->bucket++;
        }
        return NULL;
}

static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h)
{
        struct iter_state *st = seq->private;
        struct net *net = seq_file_net(seq);

        h = h->next;
        while (!h) {
                struct nfnl_queue_net *q;

                if (++st->bucket >= INSTANCE_BUCKETS)
                        return NULL;

                q = nfnl_queue_pernet(net);
                h = q->instance_table[st->bucket].first;
        }
        return h;
}

/*
 * Return the node at zero-based position @pos of the iteration order
 * defined by get_first()/get_next(), or NULL if fewer nodes exist.
 */
static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos)
{
        struct hlist_node *node = get_first(seq);

        while (node && pos) {
                node = get_next(seq, node);
                /* Only count a step that actually reached a node. */
                if (node)
                        pos--;
        }
        return pos ? NULL : node;
}

/*
 * seq_file ->start: take the instances_lock of the namespace @s belongs to
 * and return the entry at position *pos (NULL if there is none).  The lock
 * stays held on return; seq_stop() releases it.
 */
static void *seq_start(struct seq_file *s, loff_t *pos)
        __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
        spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
        return get_idx(s, *pos);
}

/*
 * seq_file ->next: bump the position counter and step to the node that
 * follows @v; returns NULL at the end of the table.
 */
static void *seq_next(struct seq_file *s, void *v, loff_t *pos)
{
        *pos += 1;

        return get_next(s, v);
}

/* seq_file ->stop: drop the instances_lock taken in seq_start(). */
static void seq_stop(struct seq_file *s, void *v)
        __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock)
{
        spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock);
}

/*
 * seq_file ->show: print one line for the queue instance at @v.
 *
 * Columns, in order: queue_num, peer_portid, queue_total, copy_mode,
 * copy_range, queue_dropped, queue_user_dropped, id_sequence and a
 * constant 1.
 *
 * NOTE(review): @v is the hlist_node pointer produced by the iterator and is
 * used directly as a struct nfqnl_instance pointer; this presumably relies
 * on the hlist node being the first member of the instance - confirm.
 */
static int seq_show(struct seq_file *s, void *v)
{
        const struct nfqnl_instance *inst = v;

        seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n",
                   inst->queue_num,
                   inst->peer_portid, inst->queue_total,
                   inst->copy_mode, inst->copy_range,
                   inst->queue_dropped, inst->queue_user_dropped,
                   inst->id_sequence, 1);
        return 0;
}

/* seq_file callbacks behind the per-namespace "nfnetlink_queue" proc entry. */
static const struct seq_operations nfqnl_seq_ops = {
        .start        = seq_start,
        .next        = seq_next,
        .stop        = seq_stop,
        .show        = seq_show,
};
#endif /* PROC_FS */

/*
 * Per-namespace constructor: empty every instance_table bucket, set up
 * instances_lock and, with CONFIG_PROC_FS, create the "nfnetlink_queue"
 * proc entry.
 *
 * Returns 0, or -ENOMEM if the proc entry cannot be created.
 */
static int __net_init nfnl_queue_net_init(struct net *net)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        unsigned int bucket = 0;

        while (bucket < INSTANCE_BUCKETS) {
                INIT_HLIST_HEAD(&q->instance_table[bucket]);
                bucket++;
        }

        spin_lock_init(&q->instances_lock);

#ifdef CONFIG_PROC_FS
        if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter,
                        &nfqnl_seq_ops, sizeof(struct iter_state)))
                return -ENOMEM;
#endif
        return 0;
}

/*
 * Per-namespace destructor: remove the proc entry (CONFIG_PROC_FS) and warn
 * once if any instance_table bucket still holds an instance.
 */
static void __net_exit nfnl_queue_net_exit(struct net *net)
{
        struct nfnl_queue_net *q = nfnl_queue_pernet(net);
        unsigned int bucket = 0;

#ifdef CONFIG_PROC_FS
        remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
#endif
        while (bucket < INSTANCE_BUCKETS) {
                WARN_ON_ONCE(!hlist_empty(&q->instance_table[bucket]));
                bucket++;
        }
}

/*
 * Per-namespace hooks of this module.  .id and .size describe the
 * struct nfnl_queue_net storage that nfnl_queue_pernet() is used to reach.
 */
static struct pernet_operations nfnl_queue_net_ops = {
        .init                = nfnl_queue_net_init,
        .exit                = nfnl_queue_net_exit,
        .id                = &nfnl_queue_net_id,
        .size                = sizeof(struct nfnl_queue_net),
};

/*
 * Module init.  Registers, in this order: the pernet operations, the netlink
 * notifier, the nfnetlink subsystem, the netdevice notifier and finally the
 * queue handler.  If a step fails, the steps already completed are undone
 * in reverse order.
 *
 * Returns the status of the last registration attempted: non-negative on
 * success, the negative error of the failing step otherwise.
 */
static int __init nfnetlink_queue_init(void)
{
        int status;

        status = register_pernet_subsys(&nfnl_queue_net_ops);
        if (status < 0) {
                pr_err("failed to register pernet ops\n");
                goto out;
        }

        /* The return value of this registration is not checked. */
        netlink_register_notifier(&nfqnl_rtnl_notifier);
        status = nfnetlink_subsys_register(&nfqnl_subsys);
        if (status < 0) {
                pr_err("failed to create netlink socket\n");
                goto cleanup_netlink_notifier;
        }

        status = register_netdevice_notifier(&nfqnl_dev_notifier);
        if (status < 0) {
                pr_err("failed to register netdevice notifier\n");
                goto cleanup_netlink_subsys;
        }

        nf_register_queue_handler(&nfqh);

        return status;

cleanup_netlink_subsys:
        nfnetlink_subsys_unregister(&nfqnl_subsys);
cleanup_netlink_notifier:
        netlink_unregister_notifier(&nfqnl_rtnl_notifier);
        unregister_pernet_subsys(&nfnl_queue_net_ops);
out:
        return status;
}

/*
 * Module exit: unregister everything nfnetlink_queue_init() registered, in
 * the reverse order, then wait for outstanding RCU callbacks.
 */
static void __exit nfnetlink_queue_fini(void)
{
        nf_unregister_queue_handler();
        unregister_netdevice_notifier(&nfqnl_dev_notifier);
        nfnetlink_subsys_unregister(&nfqnl_subsys);
        netlink_unregister_notifier(&nfqnl_rtnl_notifier);
        unregister_pernet_subsys(&nfnl_queue_net_ops);

        rcu_barrier(); /* Wait for completion of call_rcu()'s */
}

/* Module metadata, nfnetlink subsystem alias and init/exit entry points. */
MODULE_DESCRIPTION("netfilter packet queue handler");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE);

module_init(nfnetlink_queue_init);
module_exit(nfnetlink_queue_fini);



















































































































































































































































































  265 


  265 

  265 



















































































































































































































































































































































































































































    2 


    2 





























































































































































































































































































































































































































































































































































































































































































































































































    2 
    2 

















    1 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/workqueue.h>
#include <linux/rtnetlink.h>
#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/idr.h>
#include <linux/rculist.h>
#include <linux/nsproxy.h>
#include <linux/fs.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/cookie.h>
#include <linux/proc_fs.h>

#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/*
 *        Our network namespace constructor/destructor lists
 */

/* List of pernet_operations; setup_net() walks it to initialise a namespace. */
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;

/* List that setup_net() adds every successfully initialised namespace to. */
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);

/* Protects net_namespace_list. Nests inside rtnl_lock() */
DECLARE_RWSEM(net_rwsem);
EXPORT_SYMBOL_GPL(net_rwsem);

#ifdef CONFIG_KEYS
static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
#endif

struct net init_net;
EXPORT_SYMBOL(init_net);

static bool init_net_initialized;
/*
 * pernet_ops_rwsem: protects: pernet_list, net_generic_ids,
 * init_net_initialized and first_device pointer.
 * This is internal net namespace object. Please, don't use it
 * outside.
 */
DECLARE_RWSEM(pernet_ops_rwsem);

/*
 * Size of struct net_generic expressed in pointer-sized slots, rounded up.
 * net_assign_generic() rejects any id below this value.
 */
#define MIN_PERNET_OPS_ID        \
        ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))

#define INITIAL_NET_GEN_PTRS        13 /* +1 for len +2 for rcu_head */

/* Number of ptr[] slots net_alloc_generic() sizes a new array for. */
static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;

DEFINE_COOKIE(net_cookie);

static struct net_generic *net_alloc_generic(void)
{
        unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
        unsigned int generic_size;
        struct net_generic *ng;

        generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);

        ng = kzalloc(generic_size, GFP_KERNEL);
        if (ng)
                ng->s.len = gen_ptrs;

        return ng;
}

/*
 * net_assign_generic - store @data in slot @id of @net's generic array.
 *
 * If the current array is long enough the slot is written in place.
 * Otherwise a new array is obtained from net_alloc_generic(), the slots from
 * MIN_PERNET_OPS_ID upwards are copied into it, the new array is published
 * with rcu_assign_pointer() and the old one is released with kfree_rcu().
 *
 * The rcu_dereference_protected() annotation expects pernet_ops_rwsem to be
 * held.  An @id below MIN_PERNET_OPS_ID triggers BUG_ON().
 *
 * Returns 0 on success or -ENOMEM if the larger array cannot be allocated.
 */
static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
        struct net_generic *ng, *old_ng;

        BUG_ON(id < MIN_PERNET_OPS_ID);

        old_ng = rcu_dereference_protected(net->gen,
                                           lockdep_is_held(&pernet_ops_rwsem));
        /* Fast path: the slot already exists in the current array. */
        if (old_ng->s.len > id) {
                old_ng->ptr[id] = data;
                return 0;
        }

        ng = net_alloc_generic();
        if (!ng)
                return -ENOMEM;

        /*
         * Some synchronisation notes:
         *
         * The net_generic explores the net->gen array inside rcu
         * read section. Besides once set the net->gen->ptr[x]
         * pointer never changes (see rules in netns/generic.h).
         *
         * That said, we simply duplicate this array and schedule
         * the old copy for kfree after a grace period.
         */

        memcpy(&ng->ptr[MIN_PERNET_OPS_ID], &old_ng->ptr[MIN_PERNET_OPS_ID],
               (old_ng->s.len - MIN_PERNET_OPS_ID) * sizeof(void *));
        ng->ptr[id] = data;

        rcu_assign_pointer(net->gen, ng);
        kfree_rcu(old_ng, s.rcu);
        return 0;
}

/*
 * ops_init - run one pernet_operations initialiser for @net.
 *
 * When @ops has an id, ops->size bytes of zeroed per-namespace data are
 * allocated and stored in the matching generic slot before ops->init() is
 * called.  If ops->init() fails, the slot is reset to NULL and the data is
 * freed again.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error reported
 * by net_assign_generic() or ops->init().
 */
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
        struct net_generic *ng;
        int err = -ENOMEM;
        void *data = NULL;

        if (ops->id) {
                data = kzalloc(ops->size, GFP_KERNEL);
                if (!data)
                        goto out;

                err = net_assign_generic(net, *ops->id, data);
                if (err)
                        goto cleanup;
        }
        err = 0;
        if (ops->init)
                err = ops->init(net);
        if (!err)
                return 0;

        /* init failed: unpublish the slot before the data is freed below */
        if (ops->id) {
                ng = rcu_dereference_protected(net->gen,
                                               lockdep_is_held(&pernet_ops_rwsem));
                ng->ptr[*ops->id] = NULL;
        }

cleanup:
        kfree(data);

out:
        return err;
}

/* Call ops->pre_exit(), if provided, for every namespace on @net_exit_list. */
static void ops_pre_exit_list(const struct pernet_operations *ops,
                              struct list_head *net_exit_list)
{
        struct net *ns;

        if (!ops->pre_exit)
                return;

        list_for_each_entry(ns, net_exit_list, exit_list)
                ops->pre_exit(ns);
}

/*
 * Run the exit hooks of @ops for the namespaces on @net_exit_list:
 * ops->exit() once per namespace (with a cond_resched() after each call),
 * followed by a single ops->exit_batch() call for the whole list.
 * Either hook may be absent.
 */
static void ops_exit_list(const struct pernet_operations *ops,
                          struct list_head *net_exit_list)
{
        struct net *ns;

        if (ops->exit)
                list_for_each_entry(ns, net_exit_list, exit_list) {
                        ops->exit(ns);
                        cond_resched();
                }

        if (!ops->exit_batch)
                return;

        ops->exit_batch(net_exit_list);
}

/*
 * Free the per-namespace data of @ops for every namespace on
 * @net_exit_list.  Does nothing for operations without an id.
 */
static void ops_free_list(const struct pernet_operations *ops,
                          struct list_head *net_exit_list)
{
        struct net *ns;

        if (!ops->id)
                return;

        list_for_each_entry(ns, net_exit_list, exit_list)
                kfree(net_generic(ns, *ops->id));
}

/*
 * Allocate an id for @peer in @net's netns_ids.  A non-negative @reqid asks
 * for exactly that id; a negative @reqid passes the range 0..0 to
 * idr_alloc().  Should be called with nsid_lock held.
 */
static int alloc_netid(struct net *net, struct net *peer, int reqid)
{
        int lo = reqid >= 0 ? reqid : 0;
        int hi = reqid >= 0 ? reqid + 1 : 0;

        return idr_alloc(&net->netns_ids, peer, lo, hi, GFP_ATOMIC);
}

/* Callback for idr_for_each(). When net equals peer, the id is returned so
 * that idr_for_each() stops. Since returning 0 would not stop the walk,
 * id 0 is reported as the magic value NET_ID_ZERO (-1) instead.
 */
#define NET_ID_ZERO -1
static int net_eq_idr(int id, void *net, void *peer)
{
        if (!net_eq(net, peer))
                return 0;

        return id ? id : NET_ID_ZERO;
}

/* Look up the id that @net has assigned to @peer, or
 * NETNSA_NSID_NOT_ASSIGNED if there is none.
 * Must be called from RCU-critical section or with nsid_lock held.
 */
static int __peernet2id(const struct net *net, struct net *peer)
{
        int found = idr_for_each(&net->netns_ids, net_eq_idr, peer);

        if (found > 0)
                return found;

        /* net_eq_idr() encodes id 0 as NET_ID_ZERO. */
        if (found == NET_ID_ZERO)
                return 0;

        return NETNSA_NSID_NOT_ASSIGNED;
}

static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
                              struct nlmsghdr *nlh, gfp_t gfp);
/* This function returns the id of a peer netns. If no id is assigned, one will
 * be allocated and returned.
 *
 * NETNSA_NSID_NOT_ASSIGNED is returned instead when @net's reference count
 * is zero, when @peer can no longer be referenced (maybe_get_net() fails),
 * or when the id allocation fails. A newly allocated id is announced with
 * an RTM_NEWNSID notification using @gfp.
 */
int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
        int id;

        if (refcount_read(&net->ns.count) == 0)
                return NETNSA_NSID_NOT_ASSIGNED;

        spin_lock_bh(&net->nsid_lock);
        id = __peernet2id(net, peer);
        if (id >= 0) {
                spin_unlock_bh(&net->nsid_lock);
                return id;
        }

        /* When peer is obtained from RCU lists, we may race with
         * its cleanup. Check whether it's alive, and this guarantees
         * we never hash a peer back to net->netns_ids, after it has
         * just been idr_remove()'d from there in cleanup_net().
         */
        if (!maybe_get_net(peer)) {
                spin_unlock_bh(&net->nsid_lock);
                return NETNSA_NSID_NOT_ASSIGNED;
        }

        id = alloc_netid(net, peer, -1);
        spin_unlock_bh(&net->nsid_lock);

        /* Drop the reference taken by maybe_get_net() above. */
        put_net(peer);
        if (id < 0)
                return NETNSA_NSID_NOT_ASSIGNED;

        rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);

        return id;
}
EXPORT_SYMBOL_GPL(peernet2id_alloc);

/* Return the id @net has assigned to @peer, or NETNSA_NSID_NOT_ASSIGNED.
 * The lookup runs inside its own RCU read-side section.
 */
int peernet2id(const struct net *net, struct net *peer)
{
        int nsid;

        rcu_read_lock();
        nsid = __peernet2id(net, peer);
        rcu_read_unlock();

        return nsid;
}
EXPORT_SYMBOL(peernet2id);

/* Return true if the peer netns has an id assigned in the given netns,
 * false otherwise.
 */
bool peernet_has_id(const struct net *net, struct net *peer)
{
        int nsid = peernet2id(net, peer);

        return nsid >= 0;
}

/*
 * Find the namespace that @net knows under @id and try to take a reference
 * on it with maybe_get_net().
 *
 * Returns the referenced namespace, or NULL when @id is negative, unknown,
 * or maybe_get_net() returns NULL.
 */
struct net *get_net_ns_by_id(const struct net *net, int id)
{
        struct net *found = NULL;

        if (id >= 0) {
                rcu_read_lock();
                found = idr_find(&net->netns_ids, id);
                if (found)
                        found = maybe_get_net(found);
                rcu_read_unlock();
        }

        return found;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_id);

/* Load the default values of the net->core sysctls set before setup_net(). */
static __net_init void preinit_net_sysctl(struct net *net)
{
        net->core.sysctl_tstamp_allow_data = 1;
        net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;

        /* Per-socket sk_omem_alloc limit; regular TCP zerocopy usage
         * needs 128 KB.
         */
        net->core.sysctl_optmem_max = 128 * 1024;

        net->core.sysctl_somaxconn = SOMAXCONN;
}

/* init code that must occur even if setup_net() is not called.
 *
 * Sets both reference counts (passive and ns.count) to 1, initialises the
 * two ref trackers, seeds hash_mix with random bytes, records @user_ns and
 * prepares the nsid idr/lock, ra_mutex, the ptype lists and the core sysctl
 * defaults.
 */
static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
{
        refcount_set(&net->passive, 1);
        refcount_set(&net->ns.count, 1);
        ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt");
        ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net notrefcnt");

        get_random_bytes(&net->hash_mix, sizeof(u32));
        net->dev_base_seq = 1;
        /* The pointer is stored as is; no user_ns reference is taken here. */
        net->user_ns = user_ns;

        idr_init(&net->netns_ids);
        spin_lock_init(&net->nsid_lock);
        mutex_init(&net->ipv4.ra_mutex);

#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
        mutex_init(&net->rtnl_mutex);
        lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
#endif

        INIT_LIST_HEAD(&net->ptype_all);
        INIT_LIST_HEAD(&net->ptype_specific);
        preinit_net_sysctl(net);
}

/*
 * setup_net runs the initializers for the network namespace object.
 *
 * Every pernet_operations entry on pernet_list is initialised in list order
 * through ops_init().  On success the namespace is added to
 * net_namespace_list and 0 is returned.  If an initializer fails, the
 * entries that precede it are torn down in reverse order and the negative
 * error of the failing initializer is returned.
 */
static __net_init int setup_net(struct net *net)
{
        /* Must be called with pernet_ops_rwsem held */
        const struct pernet_operations *ops, *saved_ops;
        LIST_HEAD(net_exit_list);
        LIST_HEAD(dev_kill_list);
        int error = 0;

        preempt_disable();
        net->net_cookie = gen_cookie_next(&net_cookie);
        preempt_enable();

        list_for_each_entry(ops, &pernet_list, list) {
                error = ops_init(ops, net);
                if (error < 0)
                        goto out_undo;
        }
        down_write(&net_rwsem);
        list_add_tail_rcu(&net->list, &net_namespace_list);
        up_write(&net_rwsem);
out:
        return error;

out_undo:
        /* Walk through the list backwards calling the exit functions
         * for the pernet modules whose init functions did not fail.
         */
        list_add(&net->exit_list, &net_exit_list);
        /* Each pass below restarts from the entry that failed. */
        saved_ops = ops;
        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                ops_pre_exit_list(ops, &net_exit_list);

        synchronize_rcu();

        ops = saved_ops;
        rtnl_lock();
        list_for_each_entry_continue_reverse(ops, &pernet_list, list) {
                if (ops->exit_batch_rtnl)
                        ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
        }
        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();

        ops = saved_ops;
        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                ops_exit_list(ops, &net_exit_list);

        ops = saved_ops;
        list_for_each_entry_continue_reverse(ops, &pernet_list, list)
                ops_free_list(ops, &net_exit_list);

        rcu_barrier();
        goto out;
}

#ifdef CONFIG_NET_NS
/*
 * Charge one network namespace to the caller's effective uid in @ns.
 * Returns the result of inc_ucount(); copy_net_ns() treats NULL as failure.
 */
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
        return inc_ucount(ns, current_euid(), UCOUNT_NET_NAMESPACES);
}

/* Undo one inc_net_namespaces() charge on @ucounts. */
static void dec_net_namespaces(struct ucounts *ucounts)
{
        dec_ucount(ucounts, UCOUNT_NET_NAMESPACES);
}

/* Slab cache that struct net objects are allocated from and freed to. */
static struct kmem_cache *net_cachep __ro_after_init;
/* NOTE(review): presumably the workqueue that runs cleanup_net() - confirm. */
static struct workqueue_struct *netns_wq;

/*
 * net_alloc - allocate a zeroed struct net together with its generic array
 * and, with CONFIG_KEYS, its key domain tag (usage count 1).
 *
 * Returns the new namespace, or NULL if any allocation fails; everything
 * allocated up to the failure is released again.
 */
static struct net *net_alloc(void)
{
        struct net *net = NULL;
        struct net_generic *ng;

        ng = net_alloc_generic();
        if (!ng)
                goto out;

        net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
        if (!net)
                goto out_free;

#ifdef CONFIG_KEYS
        net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
        if (!net->key_domain)
                goto out_free_2;
        refcount_set(&net->key_domain->usage, 1);
#endif

        rcu_assign_pointer(net->gen, ng);
out:
        return net;

#ifdef CONFIG_KEYS
out_free_2:
        kmem_cache_free(net_cachep, net);
        /* Make sure the "out" path reports the failure. */
        net = NULL;
#endif
out_free:
        kfree(ng);
        goto out;
}

static LLIST_HEAD(defer_free_list);

/*
 * Return every struct net queued on defer_free_list to net_cachep.  The
 * list is detached atomically before it is walked.
 */
static void net_complete_free(void)
{
        struct llist_node *pending;
        struct net *ns, *tmp;

        /* Take over the namespaces queued for freeing in the last round. */
        pending = llist_del_all(&defer_free_list);

        llist_for_each_entry_safe(ns, tmp, pending, defer_free_list)
                kmem_cache_free(net_cachep, ns);
}

/*
 * Drop one passive reference on @net.  When the last one goes away the
 * generic array is freed, the notrefcnt tracker is torn down and the
 * namespace is queued on defer_free_list for the final free.
 */
void net_passive_dec(struct net *net)
{
        if (!refcount_dec_and_test(&net->passive))
                return;

        kfree(rcu_access_pointer(net->gen));

        /* No tracker is expected to be left at this point. */
        ref_tracker_dir_exit(&net->notrefcnt_tracker);

        /* The final free waits for an extra rcu_barrier(). */
        llist_add(&net->defer_free_list, &defer_free_list);
}

/*
 * void-pointer wrapper around net_passive_dec(); a NULL @p is ignored.
 */
void net_drop_ns(void *p)
{
        struct net *net = p;

        if (!net)
                return;

        net_passive_dec(net);
}

/*
 * copy_net_ns - return the network namespace for a new task/unshare.
 *
 * Without CLONE_NEWNET in @flags a new reference on @old_net is returned.
 * Otherwise a fresh namespace owned by @user_ns is allocated, pre-initialised
 * and run through setup_net() under the read side of pernet_ops_rwsem.
 *
 * Returns the namespace, or an ERR_PTR():
 *   -ENOSPC  the namespace count could not be charged (inc_net_namespaces())
 *   -ENOMEM  net_alloc() failed
 *   the error of down_read_killable() or setup_net() otherwise.
 */
struct net *copy_net_ns(unsigned long flags,
                        struct user_namespace *user_ns, struct net *old_net)
{
        struct ucounts *ucounts;
        struct net *net;
        int rv;

        if (!(flags & CLONE_NEWNET))
                return get_net(old_net);

        ucounts = inc_net_namespaces(user_ns);
        if (!ucounts)
                return ERR_PTR(-ENOSPC);

        net = net_alloc();
        if (!net) {
                rv = -ENOMEM;
                goto dec_ucounts;
        }

        preinit_net(net, user_ns);
        net->ucounts = ucounts;
        get_user_ns(user_ns);

        rv = down_read_killable(&pernet_ops_rwsem);
        if (rv < 0)
                goto put_userns;

        rv = setup_net(net);

        up_read(&pernet_ops_rwsem);

        if (rv < 0) {
                /* The two labels below are also jumped to from the earlier
                 * failure paths; each undoes the steps completed before it.
                 */
put_userns:
#ifdef CONFIG_KEYS
                key_remove_domain(net->key_domain);
#endif
                put_user_ns(user_ns);
                net_passive_dec(net);
dec_ucounts:
                dec_net_namespaces(ucounts);
                return ERR_PTR(rv);
        }
        return net;
}

/**
 * net_ns_get_ownership - get sysfs ownership data for @net
 * @net: network namespace in question (can be NULL)
 * @uid: kernel user ID for sysfs objects
 * @gid: kernel group ID for sysfs objects
 *
 * Returns the uid/gid pair of root in the user namespace associated with the
 * given network namespace.
 */
void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
{
        kuid_t root_uid;
        kgid_t root_gid;

        /* Without a namespace, fall back to the global root ids. */
        if (!net) {
                *uid = GLOBAL_ROOT_UID;
                *gid = GLOBAL_ROOT_GID;
                return;
        }

        root_uid = make_kuid(net->user_ns, 0);
        root_gid = make_kgid(net->user_ns, 0);

        /* An id that does not map is left as the caller provided it. */
        if (uid_valid(root_uid))
                *uid = root_uid;

        if (gid_valid(root_gid))
                *gid = root_gid;
}
EXPORT_SYMBOL_GPL(net_ns_get_ownership);

/* Remove every nsid that refers to @net from the namespaces on
 * net_namespace_list, stopping after @last, then destroy @net's own idr.
 *
 * This is only called from the cleanup_net() work, which is the only
 * context that may delete a net from net_namespace_list. While it runs the
 * list can therefore only grow, so neither for_each_net_rcu() nor
 * net_rwsem is used here.
 */
static void unhash_nsid(struct net *net, struct net *last)
{
        struct net *cur;

        for_each_net(cur) {
                int nsid;

                spin_lock_bh(&cur->nsid_lock);
                nsid = __peernet2id(cur, net);
                if (nsid >= 0)
                        idr_remove(&cur->netns_ids, nsid);
                spin_unlock_bh(&cur->nsid_lock);

                /* The notification is sent after nsid_lock is dropped. */
                if (nsid >= 0)
                        rtnl_net_notifyid(cur, RTM_DELNSID, nsid, 0, NULL,
                                          GFP_KERNEL);

                if (cur == last)
                        break;
        }

        spin_lock_bh(&net->nsid_lock);
        idr_destroy(&net->netns_ids);
        spin_unlock_bh(&net->nsid_lock);
}

/* Namespaces queued by __put_net() and consumed by cleanup_net(). */
static LLIST_HEAD(cleanup_list);

/* Set to the running task for the duration of cleanup_net(), else NULL. */
struct task_struct *cleanup_net_task;

/* Work handler that tears down every namespace currently queued on
 * cleanup_list: unlink them from net_namespace_list, drop nsids that refer
 * to them, run the pernet pre_exit/exit_batch_rtnl/exit/free methods in
 * reverse registration order, and finally drop the per-net references.
 */
static void cleanup_net(struct work_struct *work)
{
        const struct pernet_operations *ops;
        struct net *net, *tmp, *last;
        struct llist_node *net_kill_list;
        LIST_HEAD(net_exit_list);
        LIST_HEAD(dev_kill_list);

        cleanup_net_task = current;

        /* Atomically snapshot the list of namespaces to clean up. */
        net_kill_list = llist_del_all(&cleanup_list);

        down_read(&pernet_ops_rwsem);

        /* Don't let anyone else find us. */
        down_write(&net_rwsem);
        llist_for_each_entry(net, net_kill_list, cleanup_list)
                list_del_rcu(&net->list);
        /* Cache the last net. After the lock is dropped, no net newly
         * added to net_namespace_list can assign an nsid pointer
         * to a net from net_kill_list (see peernet2id_alloc()).
         * So such nets are skipped in unhash_nsid().
         * NOTE(review): the original wording said "unlock rtnl", but the
         * lock released just below is net_rwsem - confirm which is meant.
         *
         * Note that unhash_nsid() does not delete nsid links
         * between net_kill_list's nets, as they have already been
         * deleted from net_namespace_list. That would be
         * useless anyway, as their netns_ids are destroyed there.
         */
        last = list_last_entry(&net_namespace_list, struct net, list);
        up_write(&net_rwsem);

        llist_for_each_entry(net, net_kill_list, cleanup_list) {
                unhash_nsid(net, last);
                list_add_tail(&net->exit_list, &net_exit_list);
        }

        /* Run all of the network namespace pre_exit methods */
        list_for_each_entry_reverse(ops, &pernet_list, list)
                ops_pre_exit_list(ops, &net_exit_list);

        /*
         * Another CPU might be rcu-iterating the list, wait for it.
         * This needs to be before calling the exit() notifiers, so
         * the rcu_barrier() below isn't sufficient alone.
         * Also the pre_exit() and exit() methods need this barrier.
         */
        synchronize_rcu_expedited();

        /* Batched exit hooks run under rtnl; the devices they collect on
         * dev_kill_list are unregistered in one call.
         */
        rtnl_lock();
        list_for_each_entry_reverse(ops, &pernet_list, list) {
                if (ops->exit_batch_rtnl)
                        ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list);
        }
        unregister_netdevice_many(&dev_kill_list);
        rtnl_unlock();

        /* Run all of the network namespace exit methods */
        list_for_each_entry_reverse(ops, &pernet_list, list)
                ops_exit_list(ops, &net_exit_list);

        /* Free the net generic variables */
        list_for_each_entry_reverse(ops, &pernet_list, list)
                ops_free_list(ops, &net_exit_list);

        up_read(&pernet_ops_rwsem);

        /* Ensure there are no outstanding rcu callbacks using this
         * network namespace.
         */
        rcu_barrier();

        net_complete_free();

        /* Finally it is safe to release each network namespace structure. */
        list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
                list_del_init(&net->exit_list);
                dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
                key_remove_domain(net->key_domain);
#endif
                put_user_ns(net->user_ns);
                net_passive_dec(net);
        }
        cleanup_net_task = NULL;
}

/**
 * net_ns_barrier - wait until concurrent net_cleanup_work is done
 *
 * cleanup_net runs from work queue and will first remove namespaces
 * from the global list, then run net exit functions.
 *
 * Call this in module exit path to make sure that all netns
 * ->exit ops have been invoked before the function is removed.
 */
void net_ns_barrier(void)
{
        /* cleanup_net() holds pernet_ops_rwsem for reading while it runs the
         * exit methods, so taking it for writing waits until that is done.
         */
        down_write(&pernet_ops_rwsem);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL(net_ns_barrier);

/* Work item that runs cleanup_net(); queued on netns_wq by __put_net(). */
static DECLARE_WORK(net_cleanup_work, cleanup_net);

void __put_net(struct net *net)
{
        ref_tracker_dir_exit(&net->refcnt_tracker);
        /* Cleanup the network namespace in process context */
        if (llist_add(&net->cleanup_list, &cleanup_list))
                queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);

/**
 * get_net_ns - increment the refcount of the network namespace
 * @ns: common namespace (net)
 *
 * Returns the net's common namespace or ERR_PTR() if ref is zero.
 */
struct ns_common *get_net_ns(struct ns_common *ns)
{
        struct net *net = maybe_get_net(container_of(ns, struct net, ns));

        /* maybe_get_net() returned NULL: no reference was obtained. */
        if (!net)
                return ERR_PTR(-EINVAL);

        return &net->ns;
}
EXPORT_SYMBOL_GPL(get_net_ns);

/* Resolve file descriptor @fd to a network namespace and return it with a
 * reference held. Returns ERR_PTR(-EBADF) when @fd has no file and
 * ERR_PTR(-EINVAL) when the file is not a net namespace proc file.
 */
struct net *get_net_ns_by_fd(int fd)
{
        CLASS(fd, f)(fd);
        struct ns_common *ns;

        if (fd_empty(f))
                return ERR_PTR(-EBADF);

        if (!proc_ns_file(fd_file(f)))
                return ERR_PTR(-EINVAL);

        /* Only accept namespaces whose ops identify them as a netns. */
        ns = get_proc_ns(file_inode(fd_file(f)));
        if (ns->ops != &netns_operations)
                return ERR_PTR(-EINVAL);

        return get_net(container_of(ns, struct net, ns));
}
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
#endif

/* Look up the task with virtual pid @pid and return its network namespace
 * with a reference held. Returns ERR_PTR(-ESRCH) when no task is found or
 * the task has no nsproxy.
 */
struct net *get_net_ns_by_pid(pid_t pid)
{
        struct net *found = ERR_PTR(-ESRCH);
        struct nsproxy *nsproxy;
        struct task_struct *tsk;

        rcu_read_lock();
        tsk = find_task_by_vpid(pid);
        if (!tsk)
                goto unlock;

        /* tsk->nsproxy is read under the task lock. */
        task_lock(tsk);
        nsproxy = tsk->nsproxy;
        if (nsproxy)
                found = get_net(nsproxy->net_ns);
        task_unlock(tsk);

unlock:
        rcu_read_unlock();
        return found;
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);

/* Pernet init hook: set the namespace ops (only with CONFIG_NET_NS) and
 * allocate an inode number for @net's ns_common. Returns the result of
 * ns_alloc_inum().
 */
static __net_init int net_ns_net_init(struct net *net)
{
#ifdef CONFIG_NET_NS
        net->ns.ops = &netns_operations;
#endif
        return ns_alloc_inum(&net->ns);
}

/* Pernet exit hook: release the inode number taken by net_ns_net_init(). */
static __net_exit void net_ns_net_exit(struct net *net)
{
        ns_free_inum(&net->ns);
}

/* Pernet operations registered from net_ns_init() via
 * register_pernet_subsys().
 */
static struct pernet_operations __net_initdata net_ns_ops = {
        .init = net_ns_net_init,
        .exit = net_ns_net_exit,
};

/* Attribute policy used when parsing the NSID netlink requests below. */
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
        [NETNSA_NONE]                = { .type = NLA_UNSPEC },
        [NETNSA_NSID]                = { .type = NLA_S32 },
        [NETNSA_PID]                = { .type = NLA_U32 },
        [NETNSA_FD]                = { .type = NLA_U32 },
        [NETNSA_TARGET_NSID]        = { .type = NLA_S32 },
};

static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[NETNSA_MAX + 1];
        struct nlattr *nla;
        struct net *peer;
        int nsid, err;

        err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
                                     NETNSA_MAX, rtnl_net_policy, extack);
        if (err < 0)
                return err;
        if (!tb[NETNSA_NSID]) {
                NL_SET_ERR_MSG(extack, "nsid is missing");
                return -EINVAL;
        }
        nsid = nla_get_s32(tb[NETNSA_NSID]);

        if (tb[NETNSA_PID]) {
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
                nla = tb[NETNSA_PID];
        } else if (tb[NETNSA_FD]) {
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
                nla = tb[NETNSA_FD];
        } else {
                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
                return -EINVAL;
        }
        if (IS_ERR(peer)) {
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
                return PTR_ERR(peer);
        }

        spin_lock_bh(&net->nsid_lock);
        if (__peernet2id(net, peer) >= 0) {
                spin_unlock_bh(&net->nsid_lock);
                err = -EEXIST;
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack,
                               "Peer netns already has a nsid assigned");
                goto out;
        }

        err = alloc_netid(net, peer, nsid);
        spin_unlock_bh(&net->nsid_lock);
        if (err >= 0) {
                rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
                                  nlh, GFP_KERNEL);
                err = 0;
        } else if (err == -ENOSPC && nsid >= 0) {
                err = -EEXIST;
                NL_SET_BAD_ATTR(extack, tb[NETNSA_NSID]);
                NL_SET_ERR_MSG(extack, "The specified nsid is already used");
        }
out:
        put_net(peer);
        return err;
}

/* Size of a message built by rtnl_net_fill(): the rtgenmsg header plus
 * room for the two s32 attributes it may emit.
 */
static int rtnl_net_get_size(void)
{
        int size = NLMSG_ALIGN(sizeof(struct rtgenmsg));

        size += nla_total_size(sizeof(s32)); /* NETNSA_NSID */
        size += nla_total_size(sizeof(s32)); /* NETNSA_CURRENT_NSID */

        return size;
}

/* Arguments consumed by rtnl_net_fill() when building one NSID message. */
struct net_fill_args {
        u32 portid;     /* port id passed to nlmsg_put() */
        u32 seq;        /* sequence number passed to nlmsg_put() */
        int flags;      /* message flags passed to nlmsg_put() */
        int cmd;        /* message type passed to nlmsg_put() */
        int nsid;       /* value emitted as NETNSA_NSID */
        bool add_ref;   /* when true, NETNSA_CURRENT_NSID is emitted too */
        int ref_nsid;   /* value emitted as NETNSA_CURRENT_NSID */
};

/* Append one NSID message described by @args to @skb. Returns 0 on
 * success, or -EMSGSIZE when the header or an attribute does not fit; in
 * the attribute case the partially built message is cancelled.
 */
static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
{
        struct rtgenmsg *hdr;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*hdr),
                        args->flags);
        if (!nlh)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->rtgen_family = AF_UNSPEC;

        /* NETNSA_CURRENT_NSID is only attempted when add_ref is set. */
        if (nla_put_s32(skb, NETNSA_NSID, args->nsid) ||
            (args->add_ref &&
             nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))) {
                nlmsg_cancel(skb, nlh);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/* Parse a getid request into @tb. Without strict checking the deprecated
 * parser result is returned as is; with strict checking any attribute other
 * than PID, FD, NSID or TARGET_NSID is rejected with -EINVAL.
 */
static int rtnl_net_valid_getid_req(struct sk_buff *skb,
                                    const struct nlmsghdr *nlh,
                                    struct nlattr **tb,
                                    struct netlink_ext_ack *extack)
{
        int attr, err;

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
                                              tb, NETNSA_MAX, rtnl_net_policy,
                                              extack);

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
                                            NETNSA_MAX, rtnl_net_policy,
                                            extack);
        if (err)
                return err;

        for (attr = 0; attr <= NETNSA_MAX; attr++) {
                if (!tb[attr])
                        continue;

                if (attr == NETNSA_PID || attr == NETNSA_FD ||
                    attr == NETNSA_NSID || attr == NETNSA_TARGET_NSID)
                        continue;

                NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
                return -EINVAL;
        }

        return 0;
}

/* RTM_GETNSID handler: look up the peer namespace named by NETNSA_PID,
 * NETNSA_FD or NETNSA_NSID and unicast back an RTM_NEWNSID message with
 * its nsid as seen from the target namespace (the requester's own
 * namespace unless NETNSA_TARGET_NSID is given). Returns 0 or a negative
 * errno.
 */
static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[NETNSA_MAX + 1];
        struct net_fill_args fillargs = {
                .portid = NETLINK_CB(skb).portid,
                .seq = nlh->nlmsg_seq,
                .cmd = RTM_NEWNSID,
        };
        struct net *peer, *target = net;
        struct nlattr *nla;
        struct sk_buff *msg;
        int err;

        err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
        if (err < 0)
                return err;
        /* Peer selection order: pid, then fd, then nsid. */
        if (tb[NETNSA_PID]) {
                peer = get_net_ns_by_pid(nla_get_u32(tb[NETNSA_PID]));
                nla = tb[NETNSA_PID];
        } else if (tb[NETNSA_FD]) {
                peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
                nla = tb[NETNSA_FD];
        } else if (tb[NETNSA_NSID]) {
                peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
                /* A NULL result is turned into an error pointer so the
                 * common IS_ERR() check below handles it.
                 */
                if (!peer)
                        peer = ERR_PTR(-ENOENT);
                nla = tb[NETNSA_NSID];
        } else {
                NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
                return -EINVAL;
        }

        if (IS_ERR(peer)) {
                NL_SET_BAD_ATTR(extack, nla);
                NL_SET_ERR_MSG(extack, "Peer netns reference is invalid");
                return PTR_ERR(peer);
        }

        if (tb[NETNSA_TARGET_NSID]) {
                int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);

                target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
                if (IS_ERR(target)) {
                        NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
                        NL_SET_ERR_MSG(extack,
                                       "Target netns reference is invalid");
                        err = PTR_ERR(target);
                        /* add_ref is still false here, so the error pointer
                         * in target is not passed to put_net() at "out".
                         */
                        goto out;
                }
                /* add_ref also records that target must be put at "out". */
                fillargs.add_ref = true;
                fillargs.ref_nsid = peernet2id(net, peer);
        }

        msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
        if (!msg) {
                err = -ENOMEM;
                goto out;
        }

        fillargs.nsid = peernet2id(target, peer);
        err = rtnl_net_fill(msg, &fillargs);
        if (err < 0)
                goto err_out;

        /* msg is handed to rtnl_unicast() and is not freed here. */
        err = rtnl_unicast(msg, net, NETLINK_CB(skb).portid);
        goto out;

err_out:
        nlmsg_free(msg);
out:
        if (fillargs.add_ref)
                put_net(target);
        put_net(peer);
        return err;
}

/* State shared between rtnl_net_dumpid() and its per-entry callback. */
struct rtnl_net_dump_cb {
        struct net *tgt_net;    /* namespace whose netns_ids is walked */
        struct net *ref_net;    /* namespace used to compute ref_nsid */
        struct sk_buff *skb;    /* buffer the entries are written to */
        struct net_fill_args fillargs;  /* per-message fill arguments */
        int idx;                /* number of entries visited so far */
        int s_idx;              /* entries below this index are skipped */
};

/* idr_for_each() callback for rtnl_net_dumpid(); runs in an RCU read-side
 * critical section. Emits one entry for @id unless it lies before the
 * resume index, and returns a negative value to stop the walk when the
 * entry could not be written.
 */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
        struct rtnl_net_dump_cb *state = data;

        if (state->idx >= state->s_idx) {
                int err;

                state->fillargs.nsid = id;
                if (state->fillargs.add_ref)
                        state->fillargs.ref_nsid =
                                __peernet2id(state->ref_net, peer);

                err = rtnl_net_fill(state->skb, &state->fillargs);
                if (err < 0)
                        return err;
        }

        state->idx++;
        return 0;
}

/* Strictly validate a dump request. The only attribute accepted is
 * NETNSA_TARGET_NSID, which switches the dump to that namespace: the
 * previous target becomes @net_cb->ref_net and add_ref is set, so the
 * caller must put the new tgt_net. Returns 0 or a negative errno.
 */
static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
                                   struct rtnl_net_dump_cb *net_cb,
                                   struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[NETNSA_MAX + 1];
        int err, i;

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
                                            NETNSA_MAX, rtnl_net_policy,
                                            extack);
        if (err < 0)
                return err;

        for (i = 0; i <= NETNSA_MAX; i++) {
                struct net *target;

                if (!tb[i])
                        continue;

                if (i != NETNSA_TARGET_NSID) {
                        NL_SET_BAD_ATTR(extack, tb[i]);
                        NL_SET_ERR_MSG(extack,
                                       "Unsupported attribute in dump request");
                        return -EINVAL;
                }

                target = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
                if (IS_ERR(target)) {
                        NL_SET_BAD_ATTR(extack, tb[i]);
                        NL_SET_ERR_MSG(extack,
                                       "Invalid target network namespace id");
                        return PTR_ERR(target);
                }

                net_cb->fillargs.add_ref = true;
                net_cb->ref_net = net_cb->tgt_net;
                net_cb->tgt_net = target;
        }

        return 0;
}

static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtnl_net_dump_cb net_cb = {
                .tgt_net = sock_net(skb->sk),
                .skb = skb,
                .fillargs = {
                        .portid = NETLINK_CB(cb->skb).portid,
                        .seq = cb->nlh->nlmsg_seq,
                        .flags = NLM_F_MULTI,
                        .cmd = RTM_NEWNSID,
                },
                .idx = 0,
                .s_idx = cb->args[0],
        };
        int err = 0;

        if (cb->strict_check) {
                err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
                if (err < 0)
                        goto end;
        }

        rcu_read_lock();
        idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
        rcu_read_unlock();

        cb->args[0] = net_cb.idx;
end:
        if (net_cb.fillargs.add_ref)
                put_net(net_cb.tgt_net);
        return err;
}

/* Send an NSID notification of type @cmd for @id to RTNLGRP_NSID listeners
 * in @net. If the message cannot be allocated or filled, the error is
 * recorded on the group through rtnl_set_sk_err() instead.
 */
static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
                              struct nlmsghdr *nlh, gfp_t gfp)
{
        struct net_fill_args fillargs = {
                .portid = portid,
                .seq = nlh ? nlh->nlmsg_seq : 0,
                .cmd = cmd,
                .nsid = id,
        };
        struct sk_buff *msg;
        int err;

        msg = nlmsg_new(rtnl_net_get_size(), gfp);
        if (!msg) {
                err = -ENOMEM;
                goto report;
        }

        err = rtnl_net_fill(msg, &fillargs);
        if (err < 0) {
                nlmsg_free(msg);
                goto report;
        }

        rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
        return;

report:
        rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}

#ifdef CONFIG_NET_NS
/* Layout assertions for struct netns_ipv4: each listed sysctl field must
 * belong to the named cacheline group, and each group is checked against
 * the number passed to CACHELINE_ASSERT_GROUP_SIZE (presumably a size in
 * bytes - confirm against the macro definition). Called from net_ns_init().
 */
static void __init netns_ipv4_struct_check(void)
{
        /* TX readonly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_early_retrans);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_tso_win_divisor);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_tso_rtt_log);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_autocorking);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_min_snd_mss);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_notsent_lowat);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_limit_output_bytes);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_min_rtt_wlen);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_tcp_wmem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
                                      sysctl_ip_fwd_use_pmtu);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33);

        /* TXRX readonly hotpath cache lines */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx,
                                      sysctl_tcp_moderate_rcvbuf);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1);

        /* RX readonly hotpath cache line */
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_ip_early_demux);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_early_demux);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_l3mdev_accept);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_reordering);
        CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
                                      sysctl_tcp_rmem);
        CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22);
}
#endif

/* rtnetlink handlers for the NSID messages, registered from net_ns_init().
 * RTM_GETNSID has both a doit and a dumpit handler.
 */
static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
        {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
         .flags = RTNL_FLAG_DOIT_UNLOCKED},
        {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
         .dumpit = rtnl_net_dumpid,
         .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
};

/* Boot-time initialisation of the network namespace core: create the slab
 * cache and cleanup workqueue (CONFIG_NET_NS only), set up init_net, then
 * register the namespace pernet ops and the NSID rtnetlink handlers. Any
 * failure panics.
 */
void __init net_ns_init(void)
{
        struct net_generic *ng;

#ifdef CONFIG_NET_NS
        netns_ipv4_struct_check();
        net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
                                        SMP_CACHE_BYTES,
                                        SLAB_PANIC|SLAB_ACCOUNT, NULL);

        /* Create workqueue for cleanup */
        netns_wq = create_singlethread_workqueue("netns");
        if (!netns_wq)
                panic("Could not create netns workq");
#endif

        /* init_net gets its generic pointer array before setup_net(). */
        ng = net_alloc_generic();
        if (!ng)
                panic("Could not allocate generic netns");

        rcu_assign_pointer(init_net.gen, ng);

#ifdef CONFIG_KEYS
        init_net.key_domain = &init_net_key_domain;
#endif
        preinit_net(&init_net, &init_user_ns);

        /* setup_net() and the initialized flag are covered by the write
         * side of pernet_ops_rwsem.
         */
        down_write(&pernet_ops_rwsem);
        if (setup_net(&init_net))
                panic("Could not setup the initial network namespace");

        init_net_initialized = true;
        up_write(&pernet_ops_rwsem);

        if (register_pernet_subsys(&net_ns_ops))
                panic("Could not register network namespace subsystems");

        rtnl_register_many(net_ns_rtnl_msg_handlers);
}

/* Run the full teardown sequence of @ops for every namespace on
 * @net_exit_list: pre_exit, an RCU grace period, the optional batched exit
 * under rtnl, then exit and free.
 */
static void free_exit_list(struct pernet_operations *ops, struct list_head *net_exit_list)
{
        LIST_HEAD(dev_kill_list);

        ops_pre_exit_list(ops, net_exit_list);
        synchronize_rcu();

        if (ops->exit_batch_rtnl) {
                /* Devices collected by the hook are unregistered together. */
                rtnl_lock();
                ops->exit_batch_rtnl(net_exit_list, &dev_kill_list);
                unregister_netdevice_many(&dev_kill_list);
                rtnl_unlock();
        }

        ops_exit_list(ops, net_exit_list);
        ops_free_list(ops, net_exit_list);
}

#ifdef CONFIG_NET_NS
/* Add @ops to @list and, when it has an init method or an id, initialise it
 * for every existing namespace. If one init fails, @ops is unlinked again
 * and the namespaces already initialised are torn down. Returns 0 or the
 * error from ops_init().
 */
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        LIST_HEAD(net_exit_list);
        struct net *net;
        int error;

        list_add_tail(&ops->list, list);
        if (!ops->init && !ops->id)
                return 0;

        /* pernet_ops_rwsem is held for writing here, so setup_net() and
         * cleanup_net() cannot run in parallel.
         */
        for_each_net(net) {
                error = ops_init(ops, net);
                if (error) {
                        /* Undo every namespace initialised so far. */
                        list_del(&ops->list);
                        free_exit_list(ops, &net_exit_list);
                        return error;
                }
                list_add_tail(&net->exit_list, &net_exit_list);
        }

        return 0;
}

static void __unregister_pernet_operations(struct pernet_operations *ops)
{
        struct net *net;
        LIST_HEAD(net_exit_list);

        list_del(&ops->list);
        /* See comment in __register_pernet_operations() */
        for_each_net(net)
                list_add_tail(&net->exit_list, &net_exit_list);

        free_exit_list(ops, &net_exit_list);
}

#else

/* Variant without CONFIG_NET_NS: once init_net is initialised, @ops is
 * initialised directly for init_net; before that it is only queued on
 * @list.
 */
static int __register_pernet_operations(struct list_head *list,
                                        struct pernet_operations *ops)
{
        if (init_net_initialized)
                return ops_init(ops, &init_net);

        list_add_tail(&ops->list, list);
        return 0;
}

/* Variant without CONFIG_NET_NS: before init_net is initialised @ops is
 * simply unlinked; afterwards its teardown is run for init_net.
 */
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
        LIST_HEAD(net_exit_list);

        if (!init_net_initialized) {
                list_del(&ops->list);
                return;
        }

        list_add(&init_net.exit_list, &net_exit_list);
        free_exit_list(ops, &net_exit_list);
}

#endif /* CONFIG_NET_NS */

/* Allocator for the ids stored through pernet_operations->id. */
static DEFINE_IDA(net_generic_ids);

/* Allocate a generic id for @ops when it asks for one, then register it on
 * @list. If registration fails, the id is released again after an
 * rcu_barrier(). Returns 0 or a negative errno.
 */
static int register_pernet_operations(struct list_head *list,
                                      struct pernet_operations *ops)
{
        int error;

        /* An id and a size must be given together or not at all. */
        if (WARN_ON(!!ops->id ^ !!ops->size))
                return -EINVAL;

        if (ops->id) {
                int id = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
                                       GFP_KERNEL);

                if (id < 0)
                        return id;
                *ops->id = id;
                /* This does not require READ_ONCE as writers already hold
                 * pernet_ops_rwsem. But WRITE_ONCE is needed to protect
                 * net_alloc_generic.
                 */
                WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
        }

        error = __register_pernet_operations(list, ops);
        if (!error)
                return 0;

        rcu_barrier();
        if (ops->id)
                ida_free(&net_generic_ids, *ops->id);

        return error;
}

/* Unregister @ops and, once an rcu_barrier() has completed, release the
 * generic id it was given (if any).
 */
static void unregister_pernet_operations(struct pernet_operations *ops)
{
        __unregister_pernet_operations(ops);
        rcu_barrier();
        if (ops->id)
                ida_free(&net_generic_ids, *ops->id);
}

/**
 *      register_pernet_subsys - register a network namespace subsystem
 *        @ops:  pernet operations structure for the subsystem
 *
 *        Register a subsystem which has init and exit functions
 *        that are called when network namespaces are created and
 *        destroyed respectively.
 *
 *        When registered, the init function is called for every
 *        existing network namespace, allowing kernel modules to have
 *        a race-free view of the set of network namespaces.
 *
 *        When a new network namespace is created all of the init
 *        methods are called in the order in which they were registered.
 *
 *        When a network namespace is destroyed all of the exit methods
 *        are called in the reverse of the order with which they were
 *        registered.
 */
int register_pernet_subsys(struct pernet_operations *ops)
{
        int ret;

        /* Subsystems are inserted in front of first_device. */
        down_write(&pernet_ops_rwsem);
        ret = register_pernet_operations(first_device, ops);
        up_write(&pernet_ops_rwsem);

        return ret;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);

/**
 *      unregister_pernet_subsys - unregister a network namespace subsystem
 *        @ops: pernet operations structure to manipulate
 *
 *        Remove the pernet operations structure from the list to be
 *        used when network namespaces are created or destroyed.  In
 *        addition run the exit method for all existing network
 *        namespaces.
 */
void unregister_pernet_subsys(struct pernet_operations *ops)
{
        /* Unregistration runs with pernet_ops_rwsem held for writing. */
        down_write(&pernet_ops_rwsem);
        unregister_pernet_operations(ops);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);

/**
 *      register_pernet_device - register a network namespace device
 *        @ops:  pernet operations structure for the subsystem
 *
 *        Register a device which has init and exit functions
 *        that are called when network namespaces are created and
 *        destroyed respectively.
 *
 *        When registered, the init function is called for every
 *        existing network namespace, allowing kernel modules to have
 *        a race-free view of the set of network namespaces.
 *
 *        When a new network namespace is created all of the init
 *        methods are called in the order in which they were registered.
 *
 *        When a network namespace is destroyed all of the exit methods
 *        are called in the reverse of the order with which they were
 *        registered.
 */
int register_pernet_device(struct pernet_operations *ops)
{
        int error;

        down_write(&pernet_ops_rwsem);

        /* Devices are appended at the tail of pernet_list. */
        error = register_pernet_operations(&pernet_list, ops);
        if (error)
                goto unlock;

        /* The first device registered becomes the first_device marker. */
        if (first_device == &pernet_list)
                first_device = &ops->list;

unlock:
        up_write(&pernet_ops_rwsem);
        return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);

/**
 *      unregister_pernet_device - unregister a network namespace netdevice
 *        @ops: pernet operations structure to manipulate
 *
 *        Remove the pernet operations structure from the list to be
 *        used when network namespaces are created or destroyed.  In
 *        addition run the exit method for all existing network
 *        namespaces.
 */
void unregister_pernet_device(struct pernet_operations *ops)
{
        down_write(&pernet_ops_rwsem);
        /* If @ops is the current first_device marker, advance the marker to
         * the next list entry before @ops is unlinked.
         */
        if (&ops->list == first_device)
                first_device = first_device->next;
        unregister_pernet_operations(ops);
        up_write(&pernet_ops_rwsem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);

#ifdef CONFIG_NET_NS
/* proc_ns "get" operation: return @task's network namespace with a
 * reference held, or NULL when the task has no nsproxy.
 */
static struct ns_common *netns_get(struct task_struct *task)
{
        struct nsproxy *nsproxy;
        struct net *found = NULL;

        /* task->nsproxy is read under the task lock. */
        task_lock(task);
        nsproxy = task->nsproxy;
        if (nsproxy)
                found = get_net(nsproxy->net_ns);
        task_unlock(task);

        if (!found)
                return NULL;

        return &found->ns;
}

/* Convert an embedded ns_common back to its containing struct net. */
static inline struct net *to_net_ns(struct ns_common *ns)
{
        return container_of(ns, struct net, ns);
}

/* proc_ns "put" operation: drop a reference on the net containing @ns. */
static void netns_put(struct ns_common *ns)
{
        struct net *net = to_net_ns(ns);

        put_net(net);
}

/* proc_ns "install" operation: make @ns the network namespace of
 * @nsset->nsproxy. CAP_SYS_ADMIN is required both in the namespace's owning
 * user namespace and in the user namespace of @nsset->cred; otherwise
 * -EPERM is returned.
 */
static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct nsproxy *nsproxy = nsset->nsproxy;
        struct net *net = to_net_ns(ns);

        if (!ns_capable(net->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Swap references: release the old namespace, take the new one. */
        put_net(nsproxy->net_ns);
        nsproxy->net_ns = get_net(net);

        return 0;
}

static struct user_namespace *netns_owner(struct ns_common *ns)
{
        return to_net_ns(ns)->user_ns;
}

/*
 * proc_ns_operations table for the "net" namespace type (CLONE_NEWNET).
 * It wires up the netns_get/netns_put/netns_install/netns_owner callbacks
 * defined in this file.
 */
const struct proc_ns_operations netns_operations = {
        .name                = "net",
        .type                = CLONE_NEWNET,
        .get                = netns_get,
        .put                = netns_put,
        .install        = netns_install,
        .owner                = netns_owner,
};
#endif
































































































































































































































































    3 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_INETDEVICE_H
#define _LINUX_INETDEVICE_H

#ifdef __KERNEL__

#include <linux/bitmap.h>
#include <linux/if.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/timer.h>
#include <linux/sysctl.h>
#include <linux/rtnetlink.h>
#include <linux/refcount.h>

/*
 * A set of IPv4 devconf values.  Embedded in struct in_device as ->cnf.
 */
struct ipv4_devconf {
        void        *sysctl;
        /* values, indexed by IPV4_DEVCONF_<attr> - 1 (see IPV4_DEVCONF()) */
        int        data[IPV4_DEVCONF_MAX];
        /* one bit per data[] slot; set by ipv4_devconf_set()/_setall() */
        DECLARE_BITMAP(state, IPV4_DEVCONF_MAX);
};

#define MC_HASH_SZ_LOG 9

/*
 * IPv4 state attached to a net_device; it is reached through dev->ip_ptr
 * (see __in_dev_get_rcu() and __in_dev_get_rtnl()).
 *
 * The object is reference counted through ->refcnt: in_dev_hold() and
 * in_dev_get() take a reference, in_dev_put() drops one and calls
 * in_dev_finish_destroy() when the count reaches zero.
 */
struct in_device {
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        refcount_t                refcnt;
        int                        dead;
        struct in_ifaddr        __rcu *ifa_list;/* IP ifaddr chain                */

        struct ip_mc_list __rcu        *mc_list;        /* IP multicast filter chain    */
        struct ip_mc_list __rcu        * __rcu *mc_hash;

        int                        mc_count;        /* Number of installed mcasts        */
        spinlock_t                mc_tomb_lock;
        struct ip_mc_list        *mc_tomb;
        unsigned long                mr_v1_seen;
        unsigned long                mr_v2_seen;
        unsigned long                mr_maxdelay;
        unsigned long                mr_qi;                /* Query Interval */
        unsigned long                mr_qri;                /* Query Response Interval */
        unsigned char                mr_qrv;                /* Query Robustness Variable */
        unsigned char                mr_gq_running;
        u32                        mr_ifc_count;
        struct timer_list        mr_gq_timer;        /* general query timer */
        struct timer_list        mr_ifc_timer;        /* interface change timer */

        struct neigh_parms        *arp_parms;
        /* per-device devconf values; see ipv4_devconf_get()/_set() */
        struct ipv4_devconf        cnf;
        struct rcu_head                rcu_head;
};

/*
 * Raw accessors for a devconf value array.
 *
 * IPV4_DEVCONF() yields the data[] slot for IPV4_DEVCONF_<attr>; the slot
 * index is the attribute constant minus one.  The _ALL variants operate on
 * the (net)->ipv4.devconf_all instance, and the _RO variants wrap the read
 * in READ_ONCE().
 */
#define IPV4_DEVCONF(cnf, attr) ((cnf).data[IPV4_DEVCONF_ ## attr - 1])
#define IPV4_DEVCONF_RO(cnf, attr) READ_ONCE(IPV4_DEVCONF(cnf, attr))
#define IPV4_DEVCONF_ALL(net, attr) \
        IPV4_DEVCONF((*(net)->ipv4.devconf_all), attr)
#define IPV4_DEVCONF_ALL_RO(net, attr) READ_ONCE(IPV4_DEVCONF_ALL(net, attr))

/*
 * Read one per-device devconf value with READ_ONCE().
 * @index is an IPV4_DEVCONF_* constant; the backing array slot is index - 1.
 */
static inline int ipv4_devconf_get(const struct in_device *in_dev, int index)
{
        return READ_ONCE(in_dev->cnf.data[index - 1]);
}

/*
 * Store @val as the per-device devconf value for @index (an IPV4_DEVCONF_*
 * constant).  The matching bit in cnf.state is set first, then the value is
 * written with WRITE_ONCE().
 */
static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
                                    int val)
{
        const int slot = index - 1;

        set_bit(slot, in_dev->cnf.state);
        WRITE_ONCE(in_dev->cnf.data[slot], val);
}

/* Set every bit (IPV4_DEVCONF_MAX of them) in the device's cnf.state bitmap. */
static inline void ipv4_devconf_setall(struct in_device *in_dev)
{
        unsigned long *state_bits = in_dev->cnf.state;

        bitmap_fill(state_bits, IPV4_DEVCONF_MAX);
}

/*
 * Per-device get/set by attribute name, plus helpers that combine the
 * per-device value with the namespace-wide "all" value:
 *
 *   IN_DEV_ANDCONF     - non-zero only if both values are non-zero
 *   IN_DEV_NET_ORCONF  - non-zero if either value is non-zero; the
 *                        namespace is passed explicitly
 *   IN_DEV_ORCONF      - as above, with the namespace taken from
 *                        dev_net(in_dev->dev)
 *   IN_DEV_MAXCONF     - the larger of the two values
 */
#define IN_DEV_CONF_GET(in_dev, attr) \
        ipv4_devconf_get((in_dev), IPV4_DEVCONF_ ## attr)
#define IN_DEV_CONF_SET(in_dev, attr, val) \
        ipv4_devconf_set((in_dev), IPV4_DEVCONF_ ## attr, (val))

#define IN_DEV_ANDCONF(in_dev, attr) \
        (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr) && \
         IN_DEV_CONF_GET((in_dev), attr))

#define IN_DEV_NET_ORCONF(in_dev, net, attr) \
        (IPV4_DEVCONF_ALL_RO(net, attr) || \
         IN_DEV_CONF_GET((in_dev), attr))

#define IN_DEV_ORCONF(in_dev, attr) \
        IN_DEV_NET_ORCONF(in_dev, dev_net(in_dev->dev), attr)

#define IN_DEV_MAXCONF(in_dev, attr) \
        (max(IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), attr), \
             IN_DEV_CONF_GET((in_dev), attr)))

/*
 * Named accessors for individual devconf attributes.  Each one selects how
 * the per-device and namespace-wide values are combined (CONF_GET: device
 * value only; ANDCONF / ORCONF / MAXCONF: see the helper macros).
 */
#define IN_DEV_FORWARD(in_dev)                IN_DEV_CONF_GET((in_dev), FORWARDING)
#define IN_DEV_MFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), MC_FORWARDING)
#define IN_DEV_BFORWARD(in_dev)                IN_DEV_ANDCONF((in_dev), BC_FORWARDING)
#define IN_DEV_RPFILTER(in_dev)                IN_DEV_MAXCONF((in_dev), RP_FILTER)
#define IN_DEV_SRC_VMARK(in_dev)            IN_DEV_ORCONF((in_dev), SRC_VMARK)
#define IN_DEV_SOURCE_ROUTE(in_dev)        IN_DEV_ANDCONF((in_dev), \
                                                       ACCEPT_SOURCE_ROUTE)
#define IN_DEV_ACCEPT_LOCAL(in_dev)        IN_DEV_ORCONF((in_dev), ACCEPT_LOCAL)
#define IN_DEV_BOOTP_RELAY(in_dev)        IN_DEV_ANDCONF((in_dev), BOOTP_RELAY)

#define IN_DEV_LOG_MARTIANS(in_dev)        IN_DEV_ORCONF((in_dev), LOG_MARTIANS)
#define IN_DEV_PROXY_ARP(in_dev)        IN_DEV_ORCONF((in_dev), PROXY_ARP)
#define IN_DEV_PROXY_ARP_PVLAN(in_dev)        IN_DEV_ORCONF((in_dev), PROXY_ARP_PVLAN)
#define IN_DEV_SHARED_MEDIA(in_dev)        IN_DEV_ORCONF((in_dev), SHARED_MEDIA)
#define IN_DEV_TX_REDIRECTS(in_dev)        IN_DEV_ORCONF((in_dev), SEND_REDIRECTS)
#define IN_DEV_SEC_REDIRECTS(in_dev)        IN_DEV_ORCONF((in_dev), \
                                                      SECURE_REDIRECTS)
#define IN_DEV_IDTAG(in_dev)                IN_DEV_CONF_GET(in_dev, TAG)
#define IN_DEV_MEDIUM_ID(in_dev)        IN_DEV_CONF_GET(in_dev, MEDIUM_ID)
#define IN_DEV_PROMOTE_SECONDARIES(in_dev) \
                                        IN_DEV_ORCONF((in_dev), \
                                                      PROMOTE_SECONDARIES)
#define IN_DEV_ROUTE_LOCALNET(in_dev)        IN_DEV_ORCONF(in_dev, ROUTE_LOCALNET)
#define IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)        \
        IN_DEV_NET_ORCONF(in_dev, net, ROUTE_LOCALNET)

/*
 * ACCEPT_REDIRECTS is combined differently depending on forwarding: with
 * forwarding enabled on the device both the "all" and the device value must
 * be set; with forwarding disabled either one is enough.
 */
#define IN_DEV_RX_REDIRECTS(in_dev) \
        ((IN_DEV_FORWARD(in_dev) && \
          IN_DEV_ANDCONF((in_dev), ACCEPT_REDIRECTS)) \
         || (!IN_DEV_FORWARD(in_dev) && \
          IN_DEV_ORCONF((in_dev), ACCEPT_REDIRECTS)))

#define IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) \
        IN_DEV_ORCONF((in_dev), IGNORE_ROUTES_WITH_LINKDOWN)

/* ARP-related attributes */
#define IN_DEV_ARPFILTER(in_dev)        IN_DEV_ORCONF((in_dev), ARPFILTER)
#define IN_DEV_ARP_ACCEPT(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_ACCEPT)
#define IN_DEV_ARP_ANNOUNCE(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
#define IN_DEV_ARP_IGNORE(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
#define IN_DEV_ARP_NOTIFY(in_dev)        IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
#define IN_DEV_ARP_EVICT_NOCARRIER(in_dev) IN_DEV_ANDCONF((in_dev), \
                                                          ARP_EVICT_NOCARRIER)

/*
 * One IPv4 address entry belonging to an in_device (->ifa_dev).
 *
 * Entries hang off in_device->ifa_list and are linked through ->ifa_next;
 * walk them with the in_dev_for_each_ifa_*() iterators.
 */
struct in_ifaddr {
        struct hlist_node        addr_lst;
        struct in_ifaddr        __rcu *ifa_next;
        struct in_device        *ifa_dev;
        struct rcu_head                rcu_head;
        __be32                        ifa_local;
        /* address and mask compared against by inet_ifa_match() */
        __be32                        ifa_address;
        __be32                        ifa_mask;
        __u32                        ifa_rt_priority;
        __be32                        ifa_broadcast;
        unsigned char                ifa_scope;
        unsigned char                ifa_prefixlen;
        unsigned char                ifa_proto;
        __u32                        ifa_flags;
        char                        ifa_label[IFNAMSIZ];

        /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
        __u32                        ifa_valid_lft;
        __u32                        ifa_preferred_lft;
        unsigned long                ifa_cstamp; /* created timestamp */
        unsigned long                ifa_tstamp; /* updated timestamp */
};

/*
 * Bundle of an IPv4 address, the in_device it relates to and a netlink
 * extended-ack pointer.
 * NOTE(review): presumably the payload handed to the inetaddr validator
 * notifier chain declared in this header - confirm against the callers.
 */
struct in_validator_info {
        __be32                        ivi_addr;
        struct in_device        *ivi_dev;
        struct netlink_ext_ack        *extack;
};

int register_inetaddr_notifier(struct notifier_block *nb);
int unregister_inetaddr_notifier(struct notifier_block *nb);
int register_inetaddr_validator_notifier(struct notifier_block *nb);
int unregister_inetaddr_validator_notifier(struct notifier_block *nb);

void inet_netconf_notify_devconf(struct net *net, int event, int type,
                                 int ifindex, struct ipv4_devconf *devconf);

struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref);
/* Convenience wrapper: __ip_dev_find() with its devref argument set to true. */
static inline struct net_device *ip_dev_find(struct net *net, __be32 addr)
{
        struct net_device *found = __ip_dev_find(net, addr, true);

        return found;
}

int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *);
#ifdef CONFIG_INET
int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size);
#else
/* Stub used when CONFIG_INET is not set: ignores its arguments, returns 0. */
static inline int inet_gifconf(struct net_device *dev, char __user *buf,
                               int len, int size)
{
        return 0;
}
#endif
void devinet_init(void);
struct in_device *inetdev_by_index(struct net *, int);
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope);
__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev, __be32 dst,
                         __be32 local, int scope);
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
                                    __be32 mask);
struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr);
/*
 * Return true when @addr and ifa->ifa_address agree on every bit that is
 * set in ifa->ifa_mask.
 */
static inline bool inet_ifa_match(__be32 addr, const struct in_ifaddr *ifa)
{
        const __be32 differing = addr ^ ifa->ifa_address;

        return (differing & ifa->ifa_mask) == 0;
}

/*
 *        Check if a mask is acceptable.
 *
 *        Returns true (mask is bad) when @addr has bits set outside of
 *        @mask, or when the inverted mask, in host byte order, is not of
 *        the form 2^k - 1, i.e. the mask is not a contiguous run of
 *        leading one bits.  Returns false otherwise.
 */

static __inline__ bool bad_mask(__be32 mask, __be32 addr)
{
        const __be32 host_bits = ~mask;
        __u32 host_order;

        /* the address must not carry any bit outside the mask */
        if (addr & host_bits)
                return true;

        /* x & (x + 1) is zero exactly when x is 0...01...1 */
        host_order = ntohl(host_bits);
        return (host_order & (host_order + 1)) != 0;
}

/*
 * Iterators over the address chain of @in_dev: start at (in_dev)->ifa_list
 * and follow ->ifa_next until NULL.  The three variants differ only in the
 * dereference primitive used for each step: rtnl_dereference(),
 * rtnl_net_dereference() (which additionally takes @net) or
 * rcu_dereference().
 */
#define in_dev_for_each_ifa_rtnl(ifa, in_dev)                        \
        for (ifa = rtnl_dereference((in_dev)->ifa_list); ifa;        \
             ifa = rtnl_dereference(ifa->ifa_next))

#define in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev)                        \
        for (ifa = rtnl_net_dereference(net, (in_dev)->ifa_list); ifa;        \
             ifa = rtnl_net_dereference(net, ifa->ifa_next))

#define in_dev_for_each_ifa_rcu(ifa, in_dev)                        \
        for (ifa = rcu_dereference((in_dev)->ifa_list); ifa;        \
             ifa = rcu_dereference(ifa->ifa_next))

/* Fetch dev->ip_ptr with rcu_dereference(); no reference is taken. */
static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
{
        struct in_device *idev = rcu_dereference(dev->ip_ptr);

        return idev;
}

static inline struct in_device *in_dev_get(const struct net_device *dev)
{
        struct in_device *in_dev;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(dev);
        if (in_dev)
                refcount_inc(&in_dev->refcnt);
        rcu_read_unlock();
        return in_dev;
}

/* Fetch dev->ip_ptr with rtnl_dereference(); no reference is taken. */
static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
{
        struct in_device *idev = rtnl_dereference(dev->ip_ptr);

        return idev;
}

/*
 * Fetch dev->ip_ptr with rtnl_net_dereference(), using the namespace that
 * dev_net(dev) reports; no reference is taken.
 */
static inline struct in_device *__in_dev_get_rtnl_net(const struct net_device *dev)
{
        struct in_device *idev = rtnl_net_dereference(dev_net(dev), dev->ip_ptr);

        return idev;
}

/*
 * Report whether IGNORE_ROUTES_WITH_LINKDOWN is in effect for @dev.
 * Returns false when the device has no in_device.
 *
 * called with rcu_read_lock or rtnl held
 */
static inline bool ip_ignore_linkdown(const struct net_device *dev)
{
        struct in_device *idev = rcu_dereference_rtnl(dev->ip_ptr);

        if (!idev)
                return false;

        return IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(idev);
}

/*
 * Return the arp_parms of @dev's in_device (looked up through
 * __in_dev_get_rcu()), or NULL when the device has no in_device.
 */
static inline struct neigh_parms *__in_dev_arp_parms_get_rcu(const struct net_device *dev)
{
        struct in_device *idev = __in_dev_get_rcu(dev);

        if (!idev)
                return NULL;

        return idev->arp_parms;
}

void in_dev_finish_destroy(struct in_device *idev);

/*
 * Drop one reference on @idev; when that was the last one, hand the object
 * to in_dev_finish_destroy().
 */
static inline void in_dev_put(struct in_device *idev)
{
        if (!refcount_dec_and_test(&idev->refcnt))
                return;

        in_dev_finish_destroy(idev);
}

/*
 * Plain reference count adjustments on idev->refcnt.  Unlike in_dev_put(),
 * __in_dev_put() only decrements and never calls in_dev_finish_destroy().
 */
#define __in_dev_put(idev)  refcount_dec(&(idev)->refcnt)
#define in_dev_hold(idev)   refcount_inc(&(idev)->refcnt)

#endif /* __KERNEL__ */

/*
 * Build the network-byte-order netmask for a prefix length.
 *
 * @logmask: prefix length in bits, normally 0..32.
 *
 * Returns a mask with the top @logmask bits set, e.g. 24 -> 255.255.255.0,
 * and 0 for a prefix length of 0.
 *
 * Out-of-range values are clamped instead of being fed to the shift:
 * shifting a 32-bit value by a negative count or by 32 or more is undefined
 * behaviour in C.  A negative @logmask therefore yields 0 and anything
 * above 32 yields the all-ones mask.  Results for 0..32 are unchanged.
 */
static __inline__ __be32 inet_make_mask(int logmask)
{
        if (logmask <= 0)
                return 0;
        if (logmask >= 32)
                return htonl(~0U);
        return htonl(~((1U << (32 - logmask)) - 1));
}

/*
 * Convert a network-byte-order netmask into a prefix length.
 * A zero mask gives 0; otherwise the result is 32 minus the index of the
 * lowest set bit of the host-order mask (found with ffz() on its inverse).
 */
static __inline__ int inet_mask_len(__be32 mask)
{
        const __u32 host_mask = ntohl(mask);

        return host_mask ? 32 - ffz(~host_mask) : 0;
}


#endif /* _LINUX_INETDEVICE_H */
















































    4 


 1135 



























































































































































    1 






























    2 










    4 






    2 

    3 















    6 


   11 



    3 











































   23 




    5 


    5 

    7 










    2 























    7 





    3 
    2 



































    1 










    7 





    1 













    4 




















   18 



   13 
    5 
   14 
    4 
   14 
    4 
   15 
    3 
   16 
    2 
   15 
    3 
   14 
    4 












   28 



   25 
    4 
   24 
    4 
   24 
    5 
   25 
    3 
   23 
    3 
   26 
    3 
   25 
    4 
















    2 














    1 









    1 





















    2 

















   14 























   14 





   13 


   13 



    9 
    2 






    9 
    2 



    3 
    8 
    4 
    7 






















   29 




   16 










    2 





   12 

   14 














    1 

    4 











   13 
















    1 

    1 











    2 



   18 











    1 




    3 






    1 





















 1208 





    1 



    2 



    2 


    7 


    2 


    1 









    1 


    1 






    2 


    3 


    5 


    7 



    8 

    1 



    5 


   13 


    2 


   20 


    4 


    1 


 1106 
   30 






 1222 




  420 







   86 
 1131 


 1198 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/ioctl.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/uaccess.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/falloc.h>
#include <linux/sched/signal.h>
#include <linux/fiemap.h>
#include <linux/mount.h>
#include <linux/fscrypt.h>
#include <linux/fileattr.h>

#include "internal.h"

#include <asm/ioctls.h>

/* So that the fiemap access checks can't overflow on 32 bit machines. */
#define FIEMAP_MAX_EXTENTS        (UINT_MAX / sizeof(struct fiemap_extent))

/**
 * vfs_ioctl - call filesystem specific ioctl methods
 * @filp:        open file to invoke ioctl method on
 * @cmd:        ioctl command to execute
 * @arg:        command-specific argument for ioctl
 *
 * Forwards the request to the file's ->unlocked_ioctl method.  If the file
 * has no such method, or the method answers -ENOIOCTLCMD, the result is
 * -ENOTTY.
 *
 * Returns whatever ->unlocked_ioctl returned (with -ENOIOCTLCMD mapped to
 * -ENOTTY), or -ENOTTY when there is no handler.
 */
int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        int ret;

        if (!filp->f_op->unlocked_ioctl)
                return -ENOTTY;

        ret = filp->f_op->unlocked_ioctl(filp, cmd, arg);

        return ret == -ENOIOCTLCMD ? -ENOTTY : ret;
}
EXPORT_SYMBOL(vfs_ioctl);

/*
 * FIBMAP: read a block number from user space at @p, map it with bmap()
 * and write the result back to @p.
 *
 * Requires CAP_SYS_RAWIO (-EPERM otherwise).  A negative input block is
 * rejected with -EINVAL.  If the mapped block does not fit in an int the
 * call fails with -ERANGE and a rate-limited warning is logged.  On any
 * mapping error 0 is written back to user space; a failing write-back
 * turns the result into -EFAULT.
 */
static int ioctl_fibmap(struct file *filp, int __user *p)
{
        struct inode *inode = file_inode(filp);
        struct super_block *sb = inode->i_sb;
        sector_t blk;
        int user_blk;
        int ret;

        if (!capable(CAP_SYS_RAWIO))
                return -EPERM;

        ret = get_user(user_blk, p);
        if (ret)
                return ret;
        if (user_blk < 0)
                return -EINVAL;

        blk = user_blk;
        ret = bmap(inode, &blk);

        /* the answer has to fit into the int that goes back to user space */
        if (blk > INT_MAX) {
                ret = -ERANGE;
                pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n",
                                    current->comm, task_pid_nr(current),
                                    sb->s_id, filp);
        }

        user_blk = ret ? 0 : blk;
        if (put_user(user_blk, p))
                ret = -EFAULT;

        return ret;
}

/**
 * fiemap_fill_next_extent - Fiemap helper function
 * @fieinfo:        Fiemap context passed into ->fiemap
 * @logical:        Extent logical start offset, in bytes
 * @phys:        Extent physical start offset, in bytes
 * @len:        Extent length, in bytes
 * @flags:        FIEMAP_EXTENT flags that describe this extent
 *
 * Called from file system ->fiemap callback.  Builds one extent record
 * from the arguments and copies it to the next free slot of the user
 * array; on success the mapped-extent count in @fieinfo is incremented.
 * When @fieinfo->fi_extents_max is 0 nothing is copied and the extent is
 * only counted.
 *
 * Returns 0 on success, -errno on error, 1 if this was the last
 * extent that will fit in user array.
 */
int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical,
                            u64 phys, u64 len, u32 flags)
{
        struct fiemap_extent __user *slot;
        struct fiemap_extent ext;

        /* only count the extents */
        if (!fieinfo->fi_extents_max) {
                fieinfo->fi_extents_mapped++;
                return !!(flags & FIEMAP_EXTENT_LAST);
        }

        /* user array already full */
        if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max)
                return 1;

#define SET_UNKNOWN_FLAGS        (FIEMAP_EXTENT_DELALLOC)
#define SET_NO_UNMOUNTED_IO_FLAGS        (FIEMAP_EXTENT_DATA_ENCRYPTED)
#define SET_NOT_ALIGNED_FLAGS        (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE)

        /* some flags imply a more general one; add the implied flag */
        if (flags & SET_UNKNOWN_FLAGS)
                flags |= FIEMAP_EXTENT_UNKNOWN;
        if (flags & SET_NO_UNMOUNTED_IO_FLAGS)
                flags |= FIEMAP_EXTENT_ENCODED;
        if (flags & SET_NOT_ALIGNED_FLAGS)
                flags |= FIEMAP_EXTENT_NOT_ALIGNED;

        /* zero the whole record first so unset bytes are not copied out */
        memset(&ext, 0, sizeof(ext));
        ext.fe_logical = logical;
        ext.fe_physical = phys;
        ext.fe_length = len;
        ext.fe_flags = flags;

        slot = fieinfo->fi_extents_start + fieinfo->fi_extents_mapped;
        if (copy_to_user(slot, &ext, sizeof(ext)))
                return -EFAULT;

        if (++fieinfo->fi_extents_mapped == fieinfo->fi_extents_max)
                return 1;

        return !!(flags & FIEMAP_EXTENT_LAST);
}
EXPORT_SYMBOL(fiemap_fill_next_extent);

/**
 * fiemap_prep - check validity of requested flags for fiemap
 * @inode:        Inode to operate on
 * @fieinfo:        Fiemap context passed into ->fiemap
 * @start:        Start of the mapped range
 * @len:        Length of the mapped range, can be truncated by this function.
 * @supported_flags:        Set of fiemap flags that the file system understands
 *
 * This function must be called from each ->fiemap instance to validate the
 * fiemap request against the file system parameters.
 *
 * A zero *@len is rejected with -EINVAL and a @start at or beyond the
 * superblock's s_maxbytes with -EFBIG.  A range reaching past s_maxbytes
 * is clipped.  If the request carries flags outside @supported_flags
 * (FIEMAP_FLAG_SYNC is always accepted), the offending flags are stored in
 * @fieinfo->fi_flags and -EBADR is returned.  With FIEMAP_FLAG_SYNC set,
 * the inode's mapping is written back and waited on.
 *
 * Returns 0 on success, or a negative error on failure.
 */
int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo,
                u64 start, u64 *len, u32 supported_flags)
{
        const u64 limit = inode->i_sb->s_maxbytes;
        const u64 span = *len;
        u32 allowed;
        u32 rejected;

        if (!span)
                return -EINVAL;
        if (start >= limit)
                return -EFBIG;

        /*
         * Shrink request scope to what the fs can actually handle.
         */
        if (span > limit || (limit - span) < start)
                *len = limit - start;

        allowed = (supported_flags | FIEMAP_FLAG_SYNC) & FIEMAP_FLAGS_COMPAT;
        rejected = fieinfo->fi_flags & ~allowed;
        if (rejected) {
                /* tell the caller which flags were not understood */
                fieinfo->fi_flags = rejected;
                return -EBADR;
        }

        if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
                return 0;

        return filemap_write_and_wait(inode->i_mapping);
}
EXPORT_SYMBOL(fiemap_prep);

/*
 * FS_IOC_FIEMAP front end: copy the request header in from user space, let
 * the inode's ->fiemap method fill the user extent array, then copy the
 * header (updated flags and mapped-extent count) back out.
 *
 * Returns -EOPNOTSUPP when the inode has no ->fiemap, -EINVAL when
 * fm_extent_count exceeds FIEMAP_MAX_EXTENTS, -EFAULT on a failed user
 * copy, otherwise the result of ->fiemap.
 */
static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
{
        struct inode *inode = file_inode(filp);
        struct fiemap_extent_info info = { 0, };
        struct fiemap req;
        int ret;

        if (inode->i_op->fiemap == NULL)
                return -EOPNOTSUPP;

        if (copy_from_user(&req, ufiemap, sizeof(req)))
                return -EFAULT;

        if (req.fm_extent_count > FIEMAP_MAX_EXTENTS)
                return -EINVAL;

        info.fi_flags = req.fm_flags;
        info.fi_extents_max = req.fm_extent_count;
        info.fi_extents_start = ufiemap->fm_extents;

        ret = inode->i_op->fiemap(inode, &info, req.fm_start, req.fm_length);

        /* the header goes back out even when ->fiemap reported an error */
        req.fm_flags = info.fi_flags;
        req.fm_mapped_extents = info.fi_extents_mapped;
        if (copy_to_user(ufiemap, &req, sizeof(req)))
                return -EFAULT;

        return ret;
}

/*
 * Clone @olen bytes at @off of the file behind descriptor @srcfd into
 * @dst_file at @destoff via vfs_clone_file_range().
 *
 * Returns -EBADF for an invalid @srcfd, the negative error reported by
 * vfs_clone_file_range(), -EINVAL when a non-zero @olen was requested but a
 * different amount was cloned, and 0 otherwise.
 */
static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
                            u64 off, u64 olen, u64 destoff)
{
        CLASS(fd, src_file)(srcfd);
        loff_t done;

        if (fd_empty(src_file))
                return -EBADF;

        done = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff,
                                    olen, 0);
        if (done < 0)
                return done;

        /* a short clone of an explicitly sized request is an error */
        if (olen && done != olen)
                return -EINVAL;

        return 0;
}

/*
 * Copy a struct file_clone_range in from user space and pass its fields to
 * ioctl_file_clone().  Returns -EFAULT if the copy-in fails.
 */
static int ioctl_file_clone_range(struct file *file,
                                  struct file_clone_range __user *argp)
{
        struct file_clone_range req;

        if (copy_from_user(&req, argp, sizeof(req)))
                return -EFAULT;

        return ioctl_file_clone(file, req.src_fd, req.src_offset,
                                req.src_length, req.dest_offset);
}

/*
 * This provides compatibility with legacy XFS pre-allocation ioctls
 * which predate the fallocate syscall.
 *
 * Only the l_start, l_len and l_whence fields of the 'struct space_resv'
 * are used here, rest are ignored.
 *
 * l_start is turned into an absolute offset according to l_whence
 * (SEEK_SET, SEEK_CUR or SEEK_END; anything else is -EINVAL) and the
 * request is passed to vfs_fallocate() with FALLOC_FL_KEEP_SIZE added to
 * @mode.
 */
static int ioctl_preallocate(struct file *filp, int mode, void __user *argp)
{
        struct inode *inode = file_inode(filp);
        struct space_resv sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        if (sr.l_whence == SEEK_CUR)
                sr.l_start += filp->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(inode);
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start,
                        sr.l_len);
}

/* on ia32 l_start is on a 32-bit boundary */
#if defined CONFIG_COMPAT && defined(CONFIG_X86_64)
/*
 * just account for different alignment
 *
 * Same logic as ioctl_preallocate(), but the argument is read as a
 * 'struct space_resv_32'.
 */
static int compat_ioctl_preallocate(struct file *file, int mode,
                                    struct space_resv_32 __user *argp)
{
        struct inode *inode = file_inode(file);
        struct space_resv_32 sr;

        if (copy_from_user(&sr, argp, sizeof(sr)))
                return -EFAULT;

        if (sr.l_whence == SEEK_CUR)
                sr.l_start += file->f_pos;
        else if (sr.l_whence == SEEK_END)
                sr.l_start += i_size_read(inode);
        else if (sr.l_whence != SEEK_SET)
                return -EINVAL;

        return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len);
}
#endif

/*
 * Dispatch the ioctls handled here: FIBMAP goes to ioctl_fibmap(), the
 * legacy space reservation commands go to ioctl_preallocate() with the
 * matching fallocate mode.  Any other command yields -ENOIOCTLCMD.
 */
static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p)
{
        int mode;

        switch (cmd) {
        case FIBMAP:
                return ioctl_fibmap(filp, p);
        case FS_IOC_RESVSP:
        case FS_IOC_RESVSP64:
                mode = 0;
                break;
        case FS_IOC_UNRESVSP:
        case FS_IOC_UNRESVSP64:
                mode = FALLOC_FL_PUNCH_HOLE;
                break;
        case FS_IOC_ZERO_RANGE:
                mode = FALLOC_FL_ZERO_RANGE;
                break;
        default:
                return -ENOIOCTLCMD;
        }

        return ioctl_preallocate(filp, mode, p);
}

/*
 * FIONBIO: read an int from user space and set (non-zero) or clear (zero)
 * O_NONBLOCK in filp->f_flags under filp->f_lock.  On sparc, O_NDELAY is
 * changed along with it when it differs from O_NONBLOCK.
 *
 * Returns 0, or the get_user() error.
 */
static int ioctl_fionbio(struct file *filp, int __user *argp)
{
        unsigned int mask = O_NONBLOCK;
        int enable, err;

        err = get_user(enable, argp);
        if (err)
                return err;

#ifdef __sparc__
        /* SunOS compatibility item. */
        if (O_NONBLOCK != O_NDELAY)
                mask |= O_NDELAY;
#endif
        spin_lock(&filp->f_lock);
        if (!enable)
                filp->f_flags &= ~mask;
        else
                filp->f_flags |= mask;
        spin_unlock(&filp->f_lock);

        return err;
}

/*
 * FIOASYNC: read an int from user space and, if the requested FASYNC state
 * differs from the one in filp->f_flags, call the file's ->fasync method.
 *
 * Returns 0 when nothing had to change or ->fasync did not fail, -ENOTTY
 * when a change is needed but the file has no ->fasync, or the negative
 * error from get_user()/->fasync.
 */
static int ioctl_fioasync(unsigned int fd, struct file *filp,
                          int __user *argp)
{
        unsigned int wanted;
        int on, ret;

        ret = get_user(on, argp);
        if (ret)
                return ret;

        wanted = on ? FASYNC : 0;

        /* Did FASYNC state change ? */
        if (!((wanted ^ filp->f_flags) & FASYNC))
                return 0;

        if (!filp->f_op->fasync)
                return -ENOTTY;

        /* fasync() adjusts filp->f_flags */
        ret = filp->f_op->fasync(fd, filp, on);

        return ret < 0 ? ret : 0;
}

/*
 * FIFREEZE: freeze the superblock that @filp lives on, on behalf of user
 * space (FREEZE_HOLDER_USERSPACE).
 *
 * Requires CAP_SYS_ADMIN in the superblock's user namespace.  A filesystem
 * providing its own ->freeze_super is frozen through it; otherwise the
 * generic freeze_super() is used, provided ->freeze_fs exists.  Returns
 * -EOPNOTSUPP when the filesystem offers neither method.
 */
static int ioctl_fsfreeze(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Filesystem-specific freeze takes precedence. */
        if (sb->s_op->freeze_super)
                return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);

        /* If filesystem doesn't support freeze feature, return. */
        if (!sb->s_op->freeze_fs)
                return -EOPNOTSUPP;

        return freeze_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * FITHAW: thaw the superblock that @filp lives on, on behalf of user space
 * (FREEZE_HOLDER_USERSPACE).
 *
 * Requires CAP_SYS_ADMIN in the superblock's user namespace.  Uses the
 * filesystem's ->thaw_super when present, the generic thaw_super()
 * otherwise.
 */
static int ioctl_fsthaw(struct file *filp)
{
        struct super_block *sb = file_inode(filp)->i_sb;

        if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        /* Thaw */
        if (!sb->s_op->thaw_super)
                return thaw_super(sb, FREEZE_HOLDER_USERSPACE);

        return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * FIDEDUPERANGE: copy the variable-length request in from userspace, run
 * vfs_dedupe_file_range() on it and copy the updated request back out.
 *
 * The request size is derived from the user-supplied dest_count and must
 * not exceed PAGE_SIZE (-ENOMEM otherwise).  Returns 0 on success, -EFAULT
 * when user memory cannot be accessed, or the error from memdup_user() or
 * vfs_dedupe_file_range().
 */
static int ioctl_file_dedupe_range(struct file *file,
                                   struct file_dedupe_range __user *argp)
{
        struct file_dedupe_range *req;
        unsigned long nbytes;
        u16 count;
        int ret;

        if (get_user(count, &argp->dest_count))
                return -EFAULT;

        /* Header plus @count trailing info entries. */
        nbytes = offsetof(struct file_dedupe_range, info[count]);
        if (nbytes > PAGE_SIZE)
                return -ENOMEM;

        req = memdup_user(argp, nbytes);
        if (IS_ERR(req))
                return PTR_ERR(req);

        /*
         * The buffer was sized from @count; force the copied-in dest_count
         * to that same value rather than relying on the second read of
         * user memory.
         */
        req->dest_count = count;

        ret = vfs_dedupe_file_range(file, req);
        if (!ret && copy_to_user(argp, req, nbytes))
                ret = -EFAULT;

        kfree(req);
        return ret;
}

/**
 * fileattr_fill_xflags - initialize fileattr with xflags
 * @fa:                fileattr pointer
 * @xflags:        FS_XFLAG_* flags
 *
 * Set ->fsx_xflags, ->fsx_valid and ->flags (translated xflags).  All
 * other fields are zeroed.
 */
void fileattr_fill_xflags(struct fileattr *fa, u32 xflags)
{
        u32 flags = 0;

        /* Translate each FS_XFLAG_* bit that has an FS_*_FL counterpart. */
        if (xflags & FS_XFLAG_IMMUTABLE)
                flags |= FS_IMMUTABLE_FL;
        if (xflags & FS_XFLAG_APPEND)
                flags |= FS_APPEND_FL;
        if (xflags & FS_XFLAG_SYNC)
                flags |= FS_SYNC_FL;
        if (xflags & FS_XFLAG_NOATIME)
                flags |= FS_NOATIME_FL;
        if (xflags & FS_XFLAG_NODUMP)
                flags |= FS_NODUMP_FL;
        if (xflags & FS_XFLAG_DAX)
                flags |= FS_DAX_FL;
        if (xflags & FS_XFLAG_PROJINHERIT)
                flags |= FS_PROJINHERIT_FL;

        /* Everything not set explicitly below ends up zeroed. */
        memset(fa, 0, sizeof(*fa));
        fa->fsx_valid = true;
        fa->fsx_xflags = xflags;
        fa->flags = flags;
}
EXPORT_SYMBOL(fileattr_fill_xflags);

/**
 * fileattr_fill_flags - initialize fileattr with flags
 * @fa:                fileattr pointer
 * @flags:        FS_*_FL flags
 *
 * Set ->flags, ->flags_valid and ->fsx_xflags (translated flags).
 * All other fields are zeroed.
 */
void fileattr_fill_flags(struct fileattr *fa, u32 flags)
{
        u32 xflags = 0;

        /* Translate each FS_*_FL bit that has an FS_XFLAG_* counterpart. */
        if (flags & FS_SYNC_FL)
                xflags |= FS_XFLAG_SYNC;
        if (flags & FS_IMMUTABLE_FL)
                xflags |= FS_XFLAG_IMMUTABLE;
        if (flags & FS_APPEND_FL)
                xflags |= FS_XFLAG_APPEND;
        if (flags & FS_NODUMP_FL)
                xflags |= FS_XFLAG_NODUMP;
        if (flags & FS_NOATIME_FL)
                xflags |= FS_XFLAG_NOATIME;
        if (flags & FS_DAX_FL)
                xflags |= FS_XFLAG_DAX;
        if (flags & FS_PROJINHERIT_FL)
                xflags |= FS_XFLAG_PROJINHERIT;

        /* Everything not set explicitly below ends up zeroed. */
        memset(fa, 0, sizeof(*fa));
        fa->flags_valid = true;
        fa->flags = flags;
        fa->fsx_xflags = xflags;
}
EXPORT_SYMBOL(fileattr_fill_flags);

/**
 * vfs_fileattr_get - retrieve miscellaneous file attributes
 * @dentry:        the object to retrieve from
 * @fa:                fileattr pointer
 *
 * Call i_op->fileattr_get() callback, if exists.
 *
 * Return: 0 on success, or a negative error on failure.
 */
int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);

        /* Delegate to the inode's ->fileattr_get() when it has one. */
        if (inode->i_op->fileattr_get)
                return inode->i_op->fileattr_get(dentry, fa);

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(vfs_fileattr_get);

/**
 * copy_fsxattr_to_user - copy fsxattr to userspace.
 * @fa:                fileattr pointer
 * @ufa:        fsxattr user pointer
 *
 * Return: 0 on success, or -EFAULT on failure.
 */
int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa)
{
        struct fsxattr out;

        /* Start from all-zero so bytes not assigned below are copied out as 0. */
        memset(&out, 0, sizeof(out));

        out.fsx_xflags     = fa->fsx_xflags;
        out.fsx_extsize    = fa->fsx_extsize;
        out.fsx_nextents   = fa->fsx_nextents;
        out.fsx_projid     = fa->fsx_projid;
        out.fsx_cowextsize = fa->fsx_cowextsize;

        return copy_to_user(ufa, &out, sizeof(out)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(copy_fsxattr_to_user);

/*
 * Read a struct fsxattr from userspace and build the in-kernel fileattr
 * from it.
 *
 * @fa is fully re-initialised by fileattr_fill_xflags() before the
 * remaining fsx_* fields are copied over.  Returns 0 on success or -EFAULT
 * if the user buffer cannot be read.
 */
static int copy_fsxattr_from_user(struct fileattr *fa,
                                  struct fsxattr __user *ufa)
{
        struct fsxattr in;

        if (copy_from_user(&in, ufa, sizeof(in)))
                return -EFAULT;

        fileattr_fill_xflags(fa, in.fsx_xflags);

        fa->fsx_extsize    = in.fsx_extsize;
        fa->fsx_nextents   = in.fsx_nextents;
        fa->fsx_projid     = in.fsx_projid;
        fa->fsx_cowextsize = in.fsx_cowextsize;

        return 0;
}

/*
 * Generic function to check FS_IOC_FSSETXATTR/FS_IOC_SETFLAGS values and reject
 * any invalid configurations.
 *
 * Note: must be called with inode lock held.
 */
static int fileattr_set_prepare(struct inode *inode,
                              const struct fileattr *old_ma,
                              struct fileattr *fa)
{
        int err;

        /*
         * The IMMUTABLE and APPEND_ONLY flags can only be changed by
         * the relevant capability.
         */
        if ((fa->flags ^ old_ma->flags) & (FS_APPEND_FL | FS_IMMUTABLE_FL) &&
            !capable(CAP_LINUX_IMMUTABLE))
                return -EPERM;

        err = fscrypt_prepare_setflags(inode, old_ma->flags, fa->flags);
        if (err)
                return err;

        /*
         * Project Quota ID state is only allowed to change from within the init
         * namespace. Enforce that restriction only if we are trying to change
         * the quota ID state. Everything else is allowed in user namespaces.
         */
        if (current_user_ns() != &init_user_ns) {
                if (old_ma->fsx_projid != fa->fsx_projid)
                        return -EINVAL;
                if ((old_ma->fsx_xflags ^ fa->fsx_xflags) &
                                FS_XFLAG_PROJINHERIT)
                        return -EINVAL;
        } else {
                /*
                 * Caller is allowed to change the project ID. If it is being
                 * changed, make sure that the new value is valid.
                 */
                if (old_ma->fsx_projid != fa->fsx_projid &&
                    !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid)))
                        return -EINVAL;
        }

        /* Check extent size hints. */
        if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
                return -EINVAL;

        if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
                        !S_ISDIR(inode->i_mode))
                return -EINVAL;

        if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
            !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                return -EINVAL;

        /*
         * It is only valid to set the DAX flag on regular files and
         * directories on filesystems.
         */
        if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
            !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
                return -EINVAL;

        /* Extent size hints of zero turn off the flags. */
        if (fa->fsx_extsize == 0)
                fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
        if (fa->fsx_cowextsize == 0)
                fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;

        return 0;
}

/**
 * vfs_fileattr_set - change miscellaneous file attributes
 * @idmap:        idmap of the mount
 * @dentry:        the object to change
 * @fa:                fileattr pointer
 *
 * After verifying permissions, call i_op->fileattr_set() callback, if
 * exists.
 *
 * Verifying attributes involves retrieving current attributes with
 * i_op->fileattr_get(), this also allows initializing attributes that have
 * not been set by the caller to current values.  Inode lock is held
 * throughout to prevent racing with another instance.
 *
 * Return: 0 on success, or a negative error on failure.
 */
int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
                     struct fileattr *fa)
{
        struct inode *inode = d_inode(dentry);
        struct fileattr old_ma = {};
        int err;

        if (!inode->i_op->fileattr_set)
                return -ENOIOCTLCMD;

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        inode_lock(inode);

        err = vfs_fileattr_get(dentry, &old_ma);
        if (err)
                goto unlock;

        /* Fill in whichever half the caller did not supply from old_ma. */
        if (fa->flags_valid) {
                /* Caller gave FS_*_FL flags: inherit all fsx_* state. */
                fa->fsx_xflags |= old_ma.fsx_xflags & ~FS_XFLAG_COMMON;
                fa->fsx_extsize = old_ma.fsx_extsize;
                fa->fsx_nextents = old_ma.fsx_nextents;
                fa->fsx_projid = old_ma.fsx_projid;
                fa->fsx_cowextsize = old_ma.fsx_cowextsize;
        } else {
                /* Caller gave xflags: inherit the non-common FS_*_FL bits. */
                fa->flags |= old_ma.flags & ~FS_COMMON_FL;
        }

        err = fileattr_set_prepare(inode, &old_ma, fa);
        if (err)
                goto unlock;

        err = inode->i_op->fileattr_set(idmap, dentry, fa);

unlock:
        inode_unlock(inode);
        return err;
}
EXPORT_SYMBOL(vfs_fileattr_set);

/*
 * FS_IOC_GETFLAGS: fetch the file attributes via vfs_fileattr_get() and
 * store the FS_*_FL flags word at @argp.
 *
 * Returns 0 on success, the error from vfs_fileattr_get(), or the
 * put_user() error.
 */
static int ioctl_getflags(struct file *file, unsigned int __user *argp)
{
        /* flags_valid is only a hint about which representation is wanted */
        struct fileattr fa = { .flags_valid = true };
        int err;

        err = vfs_fileattr_get(file->f_path.dentry, &fa);
        if (err)
                return err;

        return put_user(fa.flags, argp);
}

/*
 * FS_IOC_SETFLAGS: read an FS_*_FL flags word from @argp and apply it with
 * vfs_fileattr_set().
 *
 * Write access to the mount is held for the duration of the update.
 * Returns 0 on success or the first error hit along the way.
 */
static int ioctl_setflags(struct file *file, unsigned int __user *argp)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct dentry *dentry = file->f_path.dentry;
        struct fileattr fa;
        unsigned int flags;
        int err;

        err = get_user(flags, argp);
        if (err)
                return err;

        err = mnt_want_write_file(file);
        if (err)
                return err;

        fileattr_fill_flags(&fa, flags);
        err = vfs_fileattr_set(idmap, dentry, &fa);

        mnt_drop_write_file(file);
        return err;
}

/*
 * FS_IOC_FSGETXATTR: fetch the file attributes via vfs_fileattr_get() and
 * copy them to @argp as a struct fsxattr.
 *
 * Returns 0 on success, the error from vfs_fileattr_get(), or -EFAULT.
 */
static int ioctl_fsgetxattr(struct file *file, void __user *argp)
{
        /* fsx_valid is only a hint about which representation is wanted */
        struct fileattr fa = { .fsx_valid = true };
        int err;

        err = vfs_fileattr_get(file->f_path.dentry, &fa);
        if (err)
                return err;

        return copy_fsxattr_to_user(&fa, argp);
}

/*
 * FS_IOC_FSSETXATTR: read a struct fsxattr from @argp and apply it with
 * vfs_fileattr_set().
 *
 * Write access to the mount is held for the duration of the update.
 * Returns 0 on success or the first error hit along the way.
 */
static int ioctl_fssetxattr(struct file *file, void __user *argp)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct dentry *dentry = file->f_path.dentry;
        struct fileattr fa;
        int err;

        err = copy_fsxattr_from_user(&fa, argp);
        if (err)
                return err;

        err = mnt_want_write_file(file);
        if (err)
                return err;

        err = vfs_fileattr_set(idmap, dentry, &fa);

        mnt_drop_write_file(file);
        return err;
}

/*
 * FS_IOC_GETFSUUID: copy the superblock's UUID and its length to @argp as
 * a struct fsuuid2.
 *
 * Returns -ENOTTY when the superblock has a zero-length UUID, -EFAULT when
 * the copy to userspace fails, 0 otherwise.
 */
static int ioctl_getfsuuid(struct file *file, void __user *argp)
{
        struct super_block *sb = file_inode(file)->i_sb;
        struct fsuuid2 u = { .len = sb->s_uuid_len, };

        if (!sb->s_uuid_len)
                return -ENOTTY;

        /* Only s_uuid_len bytes are filled; the initializer zeroed the rest. */
        memcpy(u.uuid, &sb->s_uuid, sb->s_uuid_len);

        if (copy_to_user(argp, &u, sizeof(u)))
                return -EFAULT;

        return 0;
}

/*
 * FS_IOC_GETFSSYSFSPATH: report "<fs type name>/<s_sysfs_name>" and its
 * length to @argp as a struct fs_sysfs_path.
 *
 * Returns -ENOTTY when the superblock's s_sysfs_name is empty, -EFAULT when
 * the copy to userspace fails, 0 otherwise.
 */
static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
{
        struct super_block *sb = file_inode(file)->i_sb;
        struct fs_sysfs_path u = {};

        if (sb->s_sysfs_name[0] == '\0')
                return -ENOTTY;

        u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name);

        if (copy_to_user(argp, &u, sizeof(u)))
                return -EFAULT;

        return 0;
}

/*
 * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
 * It's just a simple helper for sys_ioctl and compat_sys_ioctl.
 *
 * When you add any new common ioctls to the switches above and below,
 * please ensure they have compatible arguments in compat mode.
 *
 * The LSM mailing list should also be notified of any command additions or
 * changes, as specific LSMs may be affected.
 */
static int do_vfs_ioctl(struct file *filp, unsigned int fd,
                        unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct inode *inode = file_inode(filp);

        switch (cmd) {
        /* close-on-exec flag of the descriptor */
        case FIOCLEX:
                set_close_on_exec(fd, 1);
                return 0;

        case FIONCLEX:
                set_close_on_exec(fd, 0);
                return 0;

        case FIONBIO:
                return ioctl_fionbio(filp, argp);

        case FIOASYNC:
                return ioctl_fioasync(fd, filp, argp);

        case FIOQSIZE: {
                loff_t res;

                /* only directories, regular files and symlinks qualify */
                if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
                    !S_ISLNK(inode->i_mode))
                        return -ENOTTY;

                res = inode_get_bytes(inode);
                if (copy_to_user(argp, &res, sizeof(res)))
                        return -EFAULT;
                return 0;
        }

        case FIFREEZE:
                return ioctl_fsfreeze(filp);

        case FITHAW:
                return ioctl_fsthaw(filp);

        case FS_IOC_FIEMAP:
                return ioctl_fiemap(filp, argp);

        case FIGETBSZ:
                /* anon_bdev filesystems may not have a block size */
                if (inode->i_sb->s_blocksize)
                        return put_user(inode->i_sb->s_blocksize,
                                        (int __user *)argp);

                return -EINVAL;

        case FICLONE:
                return ioctl_file_clone(filp, arg, 0, 0, 0);

        case FICLONERANGE:
                return ioctl_file_clone_range(filp, argp);

        case FIDEDUPERANGE:
                return ioctl_file_dedupe_range(filp, argp);

        case FIONREAD:
                /* regular files: bytes between f_pos and i_size */
                if (S_ISREG(inode->i_mode))
                        return put_user(i_size_read(inode) - filp->f_pos,
                                        (int __user *)argp);

                /* everything else is left to the file's own handler */
                return vfs_ioctl(filp, cmd, arg);

        case FS_IOC_GETFLAGS:
                return ioctl_getflags(filp, argp);

        case FS_IOC_SETFLAGS:
                return ioctl_setflags(filp, argp);

        case FS_IOC_FSGETXATTR:
                return ioctl_fsgetxattr(filp, argp);

        case FS_IOC_FSSETXATTR:
                return ioctl_fssetxattr(filp, argp);

        case FS_IOC_GETFSUUID:
                return ioctl_getfsuuid(filp, argp);

        case FS_IOC_GETFSSYSFSPATH:
                return ioctl_get_fs_sysfs_path(filp, argp);

        default:
                /* regular files get one more chance in file_ioctl() */
                if (S_ISREG(inode->i_mode))
                        return file_ioctl(filp, cmd, argp);
                break;
        }

        /* not a command handled at this level */
        return -ENOIOCTLCMD;
}

/*
 * ioctl(2) entry point.
 *
 * Looks up @fd, asks the LSM for permission, then tries the common
 * commands in do_vfs_ioctl().  Commands it does not recognise
 * (-ENOIOCTLCMD) are handed to vfs_ioctl().  Returns -EBADF for an invalid
 * descriptor.
 */
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
        CLASS(fd, f)(fd);
        int error;

        if (fd_empty(f))
                return -EBADF;

        error = security_file_ioctl(fd_file(f), cmd, arg);
        if (error)
                return error;

        error = do_vfs_ioctl(fd_file(f), fd, cmd, arg);
        if (error != -ENOIOCTLCMD)
                return error;

        /* Not a common command: let the file's own handler deal with it. */
        error = vfs_ioctl(fd_file(f), cmd, arg);
        return error;
}

#ifdef CONFIG_COMPAT
/**
 * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
 * @file: The file to operate on.
 * @cmd: The ioctl command number.
 * @arg: The argument to the ioctl.
 *
 * This is not normally called as a function, but instead set in struct
 * file_operations as
 *
 *     .compat_ioctl = compat_ptr_ioctl,
 *
 * On most architectures, the compat_ptr_ioctl() just passes all arguments
 * to the corresponding ->ioctl handler. The exception is arch/s390, where
 * compat_ptr() clears the top bit of a 32-bit pointer value, so user space
 * pointers to the second 2GB alias the first 2GB, as is the case for
 * native 32-bit s390 user space.
 *
 * The compat_ptr_ioctl() function must therefore be used only with ioctl
 * functions that either ignore the argument or pass a pointer to a
 * compatible data type.
 *
 * If any ioctl command handled by fops->unlocked_ioctl passes a plain
 * integer instead of a pointer, or any of the passed data types
 * is incompatible between 32-bit and 64-bit architectures, a proper
 * handler is required instead of compat_ptr_ioctl.
 */
long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        /* Forward to ->unlocked_ioctl() with the argument run through compat_ptr(). */
        if (file->f_op->unlocked_ioctl)
                return file->f_op->unlocked_ioctl(file, cmd,
                                                  (unsigned long)compat_ptr(arg));

        return -ENOIOCTLCMD;
}
EXPORT_SYMBOL(compat_ptr_ioctl);

/*
 * Compat entry point for ioctl(2).
 *
 * After the fd lookup and the LSM check, a few commands are handled right
 * here.  Everything else goes to do_vfs_ioctl() with the argument converted
 * by compat_ptr(), and then to the file's ->compat_ioctl() if do_vfs_ioctl()
 * did not recognise the command.  A command nobody handles is reported as
 * -ENOTTY.
 */
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
                       compat_ulong_t, arg)
{
        CLASS(fd, f)(fd);
        int error;

        if (fd_empty(f))
                return -EBADF;

        error = security_file_ioctl_compat(fd_file(f), cmd, arg);
        if (error)
                return error;

        switch (cmd) {
        /* FICLONE takes an int argument, so don't use compat_ptr() */
        case FICLONE:
                error = ioctl_file_clone(fd_file(f), arg, 0, 0, 0);
                break;

#if defined(CONFIG_X86_64)
        /* these get messy on amd64 due to alignment differences */
        case FS_IOC_RESVSP_32:
        case FS_IOC_RESVSP64_32:
                error = compat_ioctl_preallocate(fd_file(f), 0, compat_ptr(arg));
                break;
        case FS_IOC_UNRESVSP_32:
        case FS_IOC_UNRESVSP64_32:
                error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_PUNCH_HOLE,
                                compat_ptr(arg));
                break;
        case FS_IOC_ZERO_RANGE_32:
                error = compat_ioctl_preallocate(fd_file(f), FALLOC_FL_ZERO_RANGE,
                                compat_ptr(arg));
                break;
#endif

        /*
         * These access 32-bit values anyway so no further handling is
         * necessary.
         */
        case FS_IOC32_GETFLAGS:
        case FS_IOC32_SETFLAGS:
                /* rewrite to the native command number, then share the default path */
                cmd = (cmd == FS_IOC32_GETFLAGS) ?
                        FS_IOC_GETFLAGS : FS_IOC_SETFLAGS;
                fallthrough;
        /*
         * everything else in do_vfs_ioctl() takes either a compatible
         * pointer argument or no argument -- call it with a modified
         * argument.
         */
        default:
                error = do_vfs_ioctl(fd_file(f), fd, cmd,
                                     (unsigned long)compat_ptr(arg));
                if (error != -ENOIOCTLCMD)
                        break;

                /*
                 * Not a common command.  ->compat_ioctl() receives the raw
                 * @arg, not the compat_ptr() converted one; without a
                 * ->compat_ioctl() error stays -ENOIOCTLCMD.
                 */
                if (fd_file(f)->f_op->compat_ioctl)
                        error = fd_file(f)->f_op->compat_ioctl(fd_file(f), cmd, arg);
                /* -ENOIOCTLCMD is internal; the caller gets -ENOTTY instead */
                if (error == -ENOIOCTLCMD)
                        error = -ENOTTY;
                break;
        }
        return error;
}
#endif









































































































































  163 








  163 


  163 



























  163 

  163 






  265 


  265 

  265 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/drivers/char/misc.c
 *
 * Generic misc open routine by Johan Myreen
 *
 * Based on code from Linus
 *
 * Teemu Rantanen's Microsoft Busmouse support and Derrick Cole's
 *   changes incorporated into 0.97pl4
 *   by Peter Cervasio (pete%q106fm.uucp@wupost.wustl.edu) (08SEP92)
 *   See busmouse.c for particulars.
 *
 * Made things a lot more modular - easy to compile in just one or two
 * of the misc drivers, as they are now completely independent. Linus.
 *
 * Support for loadable modules. 8-Sep-95 Philip Blundell <pjb27@cam.ac.uk>
 *
 * Fixed a failing symbol register to free the device registration
 *                Alan Cox <alan@lxorguk.ukuu.org.uk> 21-Jan-96
 *
 * Dynamic minors and /proc/mice by Alessandro Rubini. 26-Mar-96
 *
 * Renamed to misc and miscdevice to be more accurate. Alan Cox 26-Mar-96
 *
 * Handling of mouse minor numbers for kerneld:
 *  Idea by Jacques Gelinas <jack@solucorp.qc.ca>,
 *  adapted by Bjorn Ekwall <bj0rn@blox.se>
 *  corrected by Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Changes for kmod (from kerneld):
 *        Cyrus Durgin <cider@speakeasy.org>
 *
 * Added devfs support. Richard Gooch <rgooch@atnf.csiro.au>  10-Jan-1998
 */

#include <linux/module.h>

#include <linux/fs.h>
#include <linux/errno.h>
#include <linux/miscdevice.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/device.h>
#include <linux/tty.h>
#include <linux/kmod.h>
#include <linux/gfp.h>

/*
 * Head entry for the doubly linked miscdevice list
 */
static LIST_HEAD(misc_list);
static DEFINE_MUTEX(misc_mtx);

/*
 * Assigned numbers, used for dynamic minors
 */
#define DYNAMIC_MINORS 128 /* like dynamic majors */
static DEFINE_IDA(misc_minors_ida);

/*
 * Reserve a minor number in misc_minors_ida.
 *
 * For MISC_DYNAMIC_MINOR a free minor is picked: first from the range below
 * DYNAMIC_MINORS (ids are stored reversed, so id 0 maps to the highest
 * minor of that range), then from the range above MISC_DYNAMIC_MINOR.  The
 * chosen minor is returned.
 *
 * For a specific minor that falls into either of those ranges, exactly that
 * slot is reserved and the ida id is returned, which is not necessarily the
 * minor itself.  Minors in between are not tracked in the ida and yield 0.
 *
 * A negative value means the reservation failed.
 */
static int misc_minor_alloc(int minor)
{
        int id;

        if (minor == MISC_DYNAMIC_MINOR) {
                /* allocate free id */
                id = ida_alloc_max(&misc_minors_ida, DYNAMIC_MINORS - 1, GFP_KERNEL);
                if (id >= 0)
                        return DYNAMIC_MINORS - id - 1;

                return ida_alloc_range(&misc_minors_ida, MISC_DYNAMIC_MINOR + 1,
                                       MINORMASK, GFP_KERNEL);
        }

        /* specific minor: is it in the dynamic or the misc dynamic range? */
        if (minor < DYNAMIC_MINORS) {
                id = DYNAMIC_MINORS - minor - 1;
                return ida_alloc_range(&misc_minors_ida, id, id, GFP_KERNEL);
        }

        if (minor > MISC_DYNAMIC_MINOR)
                return ida_alloc_range(&misc_minors_ida, minor, minor, GFP_KERNEL);

        /* non-dynamic minor, no id to allocate */
        return 0;
}

/*
 * Release the misc_minors_ida slot that belongs to @minor.
 *
 * Minors below DYNAMIC_MINORS are stored under the reversed id, minors
 * above MISC_DYNAMIC_MINOR under the minor itself.  Minors in between have
 * no ida slot, so nothing is freed for them.
 */
static void misc_minor_free(int minor)
{
        if (minor > MISC_DYNAMIC_MINOR) {
                ida_free(&misc_minors_ida, minor);
                return;
        }

        if (minor < DYNAMIC_MINORS)
                ida_free(&misc_minors_ida, DYNAMIC_MINORS - minor - 1);
}

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take misc_mtx and return the misc_list entry at
 * position *pos, as found by seq_list_start().  The mutex is released in
 * misc_seq_stop().
 */
static void *misc_seq_start(struct seq_file *seq, loff_t *pos)
{
        mutex_lock(&misc_mtx);
        return seq_list_start(&misc_list, *pos);
}

/* seq_file ->next: step from @v to the following misc_list entry via seq_list_next(). */
static void *misc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &misc_list, pos);
}

/* seq_file ->stop: drop misc_mtx, which misc_seq_start() acquired. */
static void misc_seq_stop(struct seq_file *seq, void *v)
{
        mutex_unlock(&misc_mtx);
}

/*
 * seq_file ->show: print one registered miscdevice as "<minor> <name>".
 * A device without a name is printed with an empty name.
 */
static int misc_seq_show(struct seq_file *seq, void *v)
{
        const struct miscdevice *misc = list_entry(v, struct miscdevice, list);
        const char *name = misc->name ? misc->name : "";

        seq_printf(seq, "%3i %s\n", misc->minor, name);
        return 0;
}


/*
 * seq_file iterator over misc_list; misc_init() passes it to
 * proc_create_seq() for the "misc" proc entry.
 */
static const struct seq_operations misc_seq_ops = {
        .start = misc_seq_start,
        .next  = misc_seq_next,
        .stop  = misc_seq_stop,
        .show  = misc_seq_show,
};
#endif

/*
 * open() handler installed for the misc major (see misc_fops).
 *
 * Finds the registered miscdevice whose minor matches @inode, switches
 * @file over to that device's file_operations and calls their ->open().
 * If no device is registered for the minor, a module load is requested
 * and the lookup is repeated once.
 *
 * Returns -ENODEV when no matching device is found, otherwise 0 or whatever
 * the device's ->open() returns.
 */
static int misc_open(struct inode *inode, struct file *file)
{
        int minor = iminor(inode);
        struct miscdevice *c = NULL, *iter;
        int err = -ENODEV;
        const struct file_operations *new_fops = NULL;

        mutex_lock(&misc_mtx);

        /* first pass: look for an already registered device */
        list_for_each_entry(iter, &misc_list, list) {
                if (iter->minor != minor)
                        continue;
                c = iter;
                new_fops = fops_get(iter->fops);
                break;
        }

        if (!new_fops) {
                /*
                 * misc_mtx is dropped around request_module() and retaken
                 * before the list is searched a second time.
                 */
                mutex_unlock(&misc_mtx);
                request_module("char-major-%d-%d", MISC_MAJOR, minor);
                mutex_lock(&misc_mtx);

                list_for_each_entry(iter, &misc_list, list) {
                        if (iter->minor != minor)
                                continue;
                        c = iter;
                        new_fops = fops_get(iter->fops);
                        break;
                }
                /* still nothing usable: err is -ENODEV from its initialiser */
                if (!new_fops)
                        goto fail;
        }

        /*
         * Place the miscdevice in the file's
         * private_data so it can be used by the
         * file operations, including f_op->open below
         */
        file->private_data = c;

        err = 0;
        replace_fops(file, new_fops);
        /* the device's ->open() is called with misc_mtx still held */
        if (file->f_op->open)
                err = file->f_op->open(inode, file);
fail:
        mutex_unlock(&misc_mtx);
        return err;
}

/*
 * ->devnode callback of misc_class.
 *
 * Reports the miscdevice's mode through @mode when both @mode and the
 * device's mode are set, and returns a kstrdup()'d copy of the device's
 * nodename, or NULL when it has none.
 */
static char *misc_devnode(const struct device *dev, umode_t *mode)
{
        const struct miscdevice *misc = dev_get_drvdata(dev);

        if (mode && misc->mode)
                *mode = misc->mode;

        return misc->nodename ? kstrdup(misc->nodename, GFP_KERNEL) : NULL;
}

/* Device class "misc"; node name and mode are supplied by misc_devnode(). */
static const struct class misc_class = {
        .name                = "misc",
        .devnode        = misc_devnode,
};

/*
 * file_operations registered for MISC_MAJOR in misc_init().  misc_open()
 * replaces them with the fops of the miscdevice matching the opened minor.
 */
static const struct file_operations misc_fops = {
        .owner                = THIS_MODULE,
        .open                = misc_open,
        .llseek                = noop_llseek,
};

/**
 *        misc_register        -        register a miscellaneous device
 *        @misc: device structure
 *
 *        Register a miscellaneous device with the kernel. If the minor
 *        number is set to %MISC_DYNAMIC_MINOR a minor number is assigned
 *        and placed in the minor field of the structure. For other cases
 *        the minor number requested is used.
 *
 *        The structure passed is linked into the kernel and may not be
 *        destroyed until it has been unregistered. By default, an open()
 *        syscall to the device sets file->private_data to point to the
 *        structure. Drivers don't need open in fops for this.
 *
 *        A zero is returned on success and a negative errno code for
 *        failure.
 */

int misc_register(struct miscdevice *misc)
{
        const bool dynamic = (misc->minor == MISC_DYNAMIC_MINOR);
        struct miscdevice *cur;
        int id, err = 0;
        dev_t dev;

        INIT_LIST_HEAD(&misc->list);

        mutex_lock(&misc_mtx);

        if (!dynamic) {
                /* a fixed minor must not already belong to a registered device */
                list_for_each_entry(cur, &misc_list, list) {
                        if (cur->minor == misc->minor) {
                                err = -EBUSY;
                                goto out;
                        }
                }
        }

        /* any allocation failure is reported to the caller as -EBUSY */
        id = misc_minor_alloc(misc->minor);
        if (id < 0) {
                err = -EBUSY;
                goto out;
        }

        /* only a dynamic request takes its minor from the allocator */
        if (dynamic)
                misc->minor = id;

        dev = MKDEV(MISC_MAJOR, misc->minor);

        misc->this_device =
                device_create_with_groups(&misc_class, misc->parent, dev,
                                          misc, misc->groups, "%s", misc->name);
        if (IS_ERR(misc->this_device)) {
                misc_minor_free(misc->minor);
                /* hand the structure back in the state the caller passed it */
                if (dynamic)
                        misc->minor = MISC_DYNAMIC_MINOR;
                err = PTR_ERR(misc->this_device);
                goto out;
        }

        /*
         * Add it to the front, so that later devices can "override"
         * earlier defaults
         */
        list_add(&misc->list, &misc_list);
out:
        mutex_unlock(&misc_mtx);
        return err;
}
EXPORT_SYMBOL(misc_register);

/**
 *        misc_deregister - unregister a miscellaneous device
 *        @misc: device to unregister
 *
 *        Unregister a miscellaneous device that was previously
 *        successfully registered with misc_register().
 */

void misc_deregister(struct miscdevice *misc)
{
        /* An empty list node means the device is not on misc_list: warn and bail. */
        if (WARN_ON(list_empty(&misc->list)))
                return;

        mutex_lock(&misc_mtx);

        /* Unlink first, then destroy the device and give the minor back. */
        list_del(&misc->list);
        device_destroy(&misc_class, MKDEV(MISC_MAJOR, misc->minor));
        misc_minor_free(misc->minor);

        mutex_unlock(&misc_mtx);
}
EXPORT_SYMBOL(misc_deregister);

/*
 * Subsystem init: create the "misc" proc entry, register the misc device
 * class and claim all minors of MISC_MAJOR for misc_fops.
 *
 * A failure to create the proc entry is not fatal.  If class or chrdev
 * registration fails, whatever was set up before is torn down again and
 * the error code reported by class_register() or __register_chrdev() is
 * returned unchanged, instead of being replaced by a generic -EIO.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init misc_init(void)
{
        int err;
        struct proc_dir_entry *ret;

        ret = proc_create_seq("misc", 0, NULL, &misc_seq_ops);
        err = class_register(&misc_class);
        if (err)
                goto fail_remove;

        /*
         * MISC_MAJOR is a fixed major, so the result is 0 on success and a
         * negative errno on failure; pass that errno on as-is.
         */
        err = __register_chrdev(MISC_MAJOR, 0, MINORMASK + 1, "misc", &misc_fops);
        if (err < 0)
                goto fail_printk;
        return 0;

fail_printk:
        pr_err("unable to get major %d for misc devices\n", MISC_MAJOR);
        class_unregister(&misc_class);
fail_remove:
        /* only remove the proc entry if it was actually created */
        if (ret)
                remove_proc_entry("misc", NULL);
        return err;
}
subsys_initcall(misc_init);





















   34 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *  include/linux/signalfd.h
 *
 *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
 *
 */
#ifndef _LINUX_SIGNALFD_H
#define _LINUX_SIGNALFD_H

#include <uapi/linux/signalfd.h>
#include <linux/sched/signal.h>

#ifdef CONFIG_SIGNALFD

/*
 * Deliver the signal to listening signalfd.
 */
static inline void signalfd_notify(struct task_struct *tsk, int sig)
{
        /*
         * Wake the signalfd wait queue of @tsk's sighand only when
         * waitqueue_active() reports waiters.  @sig is not used here.
         */
        if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
                wake_up(&tsk->sighand->signalfd_wqh);
}

extern void signalfd_cleanup(struct sighand_struct *sighand);

#else /* CONFIG_SIGNALFD */

/* Stub for builds without CONFIG_SIGNALFD: does nothing. */
static inline void signalfd_notify(struct task_struct *tsk, int sig) { }

/* Stub for builds without CONFIG_SIGNALFD: does nothing. */
static inline void signalfd_cleanup(struct sighand_struct *sighand) { }

#endif /* CONFIG_SIGNALFD */

#endif /* _LINUX_SIGNALFD_H */





















































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 







    3 
























































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
// SPDX-License-Identifier: GPL-2.0-only

#include <net/netdev_lock.h>
#include <net/netdev_queues.h>
#include <net/sock.h>
#include <linux/ethtool_netlink.h>
#include <linux/phy_link_topology.h>
#include <linux/pm_runtime.h>
#include "netlink.h"
#include "module_fw.h"

static struct genl_family ethtool_genl_family;

static bool ethnl_ok __read_mostly;
/* Sequence number for broadcast messages; ethnl_bcastmsg_put() pre-increments
 * it for every message it starts.
 */
static u32 ethnl_bcast_seq;

/* Flag masks used by the request header policies below: BASIC is the common
 * set, STATS is BASIC plus ETHTOOL_FLAG_STATS.
 */
#define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS |        \
                             ETHTOOL_FLAG_OMIT_REPLY)
#define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS)

/* Request header policy: u32 device index, NUL-terminated device name with
 * .len = ALTIFNAMSIZ - 1, and a u32 flags word masked by ETHTOOL_FLAGS_BASIC.
 */
const struct nla_policy ethnl_header_policy[] = {
        [ETHTOOL_A_HEADER_DEV_INDEX]        = { .type = NLA_U32 },
        [ETHTOOL_A_HEADER_DEV_NAME]        = { .type = NLA_NUL_STRING,
                                            .len = ALTIFNAMSIZ - 1 },
        [ETHTOOL_A_HEADER_FLAGS]        = NLA_POLICY_MASK(NLA_U32,
                                                          ETHTOOL_FLAGS_BASIC),
};

/* Same attributes as ethnl_header_policy, but the flags word is masked by
 * ETHTOOL_FLAGS_STATS, i.e. ETHTOOL_FLAG_STATS is additionally accepted.
 */
const struct nla_policy ethnl_header_policy_stats[] = {
        [ETHTOOL_A_HEADER_DEV_INDEX]        = { .type = NLA_U32 },
        [ETHTOOL_A_HEADER_DEV_NAME]        = { .type = NLA_NUL_STRING,
                                            .len = ALTIFNAMSIZ - 1 },
        [ETHTOOL_A_HEADER_FLAGS]        = NLA_POLICY_MASK(NLA_U32,
                                                          ETHTOOL_FLAGS_STATS),
};

/* Request header policy that also accepts ETHTOOL_A_HEADER_PHY_INDEX, a u32
 * with a minimum value of 1. Flags are masked by ETHTOOL_FLAGS_BASIC.
 * ethnl_parse_header_dev_get() sizes its attribute table from this array.
 */
const struct nla_policy ethnl_header_policy_phy[] = {
        [ETHTOOL_A_HEADER_DEV_INDEX]        = { .type = NLA_U32 },
        [ETHTOOL_A_HEADER_DEV_NAME]        = { .type = NLA_NUL_STRING,
                                            .len = ALTIFNAMSIZ - 1 },
        [ETHTOOL_A_HEADER_FLAGS]        = NLA_POLICY_MASK(NLA_U32,
                                                          ETHTOOL_FLAGS_BASIC),
        [ETHTOOL_A_HEADER_PHY_INDEX]                = NLA_POLICY_MIN(NLA_U32, 1),
};

/* PHY-aware request header policy whose flags word is masked by
 * ETHTOOL_FLAGS_STATS; ETHTOOL_A_HEADER_PHY_INDEX is a u32 of at least 1.
 */
const struct nla_policy ethnl_header_policy_phy_stats[] = {
        [ETHTOOL_A_HEADER_DEV_INDEX]        = { .type = NLA_U32 },
        [ETHTOOL_A_HEADER_DEV_NAME]        = { .type = NLA_NUL_STRING,
                                            .len = ALTIFNAMSIZ - 1 },
        [ETHTOOL_A_HEADER_FLAGS]        = NLA_POLICY_MASK(NLA_U32,
                                                          ETHTOOL_FLAGS_STATS),
        [ETHTOOL_A_HEADER_PHY_INDEX]                = NLA_POLICY_MIN(NLA_U32, 1),
};

/* Store @dev, @portid and @type in the ethtool private data of the socket
 * that sent @skb.
 *
 * Return: 0 on success, or the error encoded in the pointer returned by
 * genl_sk_priv_get().
 */
int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
                        enum ethnl_sock_type type)
{
        struct ethnl_sock_priv *priv =
                genl_sk_priv_get(&ethtool_genl_family, NETLINK_CB(skb).sk);

        if (IS_ERR(priv))
                return PTR_ERR(priv);

        priv->type = type;
        priv->portid = portid;
        priv->dev = dev;
        return 0;
}

/* Tear down per-socket private data. Only the module firmware flash socket
 * type has a destroy step; every other type needs nothing done here.
 */
static void ethnl_sock_priv_destroy(void *priv)
{
        struct ethnl_sock_priv *sock_priv = priv;

        if (sock_priv->type == ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH)
                ethnl_module_fw_flash_sock_destroy(sock_priv);
}

/* Prepare @dev for an ethtool operation: take a runtime PM reference on the
 * parent device (if any), check that the device is present and not being
 * unregistered, then call the driver's ->begin() callback if it has one.
 *
 * Return: 0 on success (the PM reference stays held; ethnl_ops_complete()
 * drops it), -ENODEV for a NULL, absent or unregistering device, or the
 * non-zero value returned by ->begin(). On any failure with a non-NULL @dev
 * the PM reference taken here is dropped again.
 */
int ethnl_ops_begin(struct net_device *dev)
{
        int rc;

        if (!dev)
                return -ENODEV;

        if (dev->dev.parent)
                pm_runtime_get_sync(dev->dev.parent);

        netdev_ops_assert_locked(dev);

        rc = -ENODEV;
        if (netif_device_present(dev) &&
            dev->reg_state < NETREG_UNREGISTERING) {
                rc = 0;
                if (dev->ethtool_ops->begin)
                        rc = dev->ethtool_ops->begin(dev);
                if (!rc)
                        return 0;
        }

        /* failure: release the runtime PM reference acquired above */
        if (dev->dev.parent)
                pm_runtime_put(dev->dev.parent);

        return rc;
}

/* Finish an ethtool operation started with ethnl_ops_begin(): call the
 * driver's ->complete() callback if present, then drop the runtime PM
 * reference on the parent device, if there is one.
 */
void ethnl_ops_complete(struct net_device *dev)
{
        struct device *parent;

        if (dev->ethtool_ops->complete)
                dev->ethtool_ops->complete(dev);

        parent = dev->dev.parent;
        if (parent)
                pm_runtime_put(parent);
}

/**
 * ethnl_parse_header_dev_get() - parse request header
 * @req_info:    structure to put results into
 * @header:      nest attribute with request header
 * @net:         request netns
 * @extack:      netlink extack for error reporting
 * @require_dev: fail if no device identified in header
 *
 * Parse request header in nested attribute @header and put the results into
 * the structure pointed to by @req_info. @extack is used for error
 * reporting. If req_info->dev is not null on return, reference to it has
 * been taken. If an error is returned, no device reference is held and the
 * dev, flags and phy_index members of *req_info are not written, so they
 * keep whatever the caller initialized them to. A missing @header is only
 * an error when @require_dev is set; otherwise 0 is returned without
 * touching *req_info.
 *
 * Return: 0 on success or negative error code
 */
int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
                               const struct nlattr *header, struct net *net,
                               struct netlink_ext_ack *extack, bool require_dev)
{
        /* sized from the phy header policy, which includes
         * ETHTOOL_A_HEADER_PHY_INDEX
         */
        struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)];
        const struct nlattr *devname_attr;
        struct net_device *dev = NULL;
        u32 flags = 0;
        int ret;

        if (!header) {
                if (!require_dev)
                        return 0;
                NL_SET_ERR_MSG(extack, "request header missing");
                return -EINVAL;
        }
        /* No validation here, command policy should have a nested policy set
         * for the header, therefore validation should have already been done.
         */
        ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header,
                               NULL, extack);
        if (ret < 0)
                return ret;
        if (tb[ETHTOOL_A_HEADER_FLAGS])
                flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);

        /* Device lookup: ifindex takes precedence, name is the fallback */
        devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
        if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
                u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]);

                dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker,
                                          GFP_KERNEL);
                if (!dev) {
                        NL_SET_ERR_MSG_ATTR(extack,
                                            tb[ETHTOOL_A_HEADER_DEV_INDEX],
                                            "no device matches ifindex");
                        return -ENODEV;
                }
                /* if both ifindex and ifname are passed, they must match */
                if (devname_attr &&
                    strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) {
                        /* drop the reference taken by the ifindex lookup */
                        netdev_put(dev, &req_info->dev_tracker);
                        NL_SET_ERR_MSG_ATTR(extack, header,
                                            "ifindex and name do not match");
                        return -ENODEV;
                }
        } else if (devname_attr) {
                dev = netdev_get_by_name(net, nla_data(devname_attr),
                                         &req_info->dev_tracker, GFP_KERNEL);
                if (!dev) {
                        NL_SET_ERR_MSG_ATTR(extack, devname_attr,
                                            "no device matches name");
                        return -ENODEV;
                }
        } else if (require_dev) {
                NL_SET_ERR_MSG_ATTR(extack, header,
                                    "neither ifindex nor name specified");
                return -EINVAL;
        }

        /* A PHY index is only meaningful together with a device; dev is
         * NULL in the error branch, so there is no reference to drop.
         */
        if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
                if (dev) {
                        req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]);
                } else {
                        NL_SET_ERR_MSG_ATTR(extack, header,
                                            "phy_index set without a netdev");
                        return -EINVAL;
                }
        }

        /* Success: hand the (possibly NULL) device and the flags to caller */
        req_info->dev = dev;
        req_info->flags = flags;
        return 0;
}

struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info,
                                        struct nlattr **tb, unsigned int header,
                                        struct netlink_ext_ack *extack)
{
        struct phy_device *phydev;

        ASSERT_RTNL();

        if (!req_info->dev)
                return NULL;

        if (!req_info->phy_index)
                return req_info->dev->phydev;

        phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index);
        if (!phydev && tb) {
                NL_SET_ERR_MSG_ATTR(extack, tb[header],
                                    "no phy matching phyindex");
                return ERR_PTR(-ENODEV);
        }

        return phydev;
}

/**
 * ethnl_fill_reply_header() - Put common header into a reply message
 * @skb:      skb with the message
 * @dev:      network device to describe in header (may be NULL)
 * @attrtype: attribute type to use for the nest
 *
 * Open a nest of type @attrtype and put the ifindex and name of @dev into
 * it. Nothing is written when @dev is NULL. If either attribute does not
 * fit, the nest is cancelled.
 *
 * Return: 0 on success, error value (-EMSGSIZE only) on error
 */
int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
                            u16 attrtype)
{
        struct nlattr *hdr_nest;

        if (!dev)
                return 0;

        hdr_nest = nla_nest_start(skb, attrtype);
        if (!hdr_nest)
                return -EMSGSIZE;

        /* Any attribute added here must also be accounted for where the
         * reply header size is computed.
         */
        if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) ||
            nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) {
                nla_nest_cancel(skb, hdr_nest);
                return -EMSGSIZE;
        }

        nla_nest_end(skb, hdr_nest);
        return 0;
}

/**
 * ethnl_reply_init() - Create skb for a reply and fill device identification
 * @payload:      payload length (without netlink and genetlink header)
 * @dev:          device the reply is about (may be null)
 * @cmd:          ETHTOOL_MSG_* message type for reply
 * @hdr_attrtype: attribute type for common header
 * @info:         genetlink info of the received packet we respond to
 * @ehdrp:        place to store the pointer returned by genlmsg_put_reply()
 *
 * Allocate the reply skb, start the genetlink message and, when @dev is
 * given, add the common reply header. On failure the skb is freed and, if
 * @info is not NULL, an extack message is set.
 *
 * Return: pointer to allocated skb on success, NULL on error
 */
struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
                                 u16 hdr_attrtype, struct genl_info *info,
                                 void **ehdrp)
{
        struct sk_buff *reply = genlmsg_new(payload, GFP_KERNEL);

        if (!reply)
                goto fail;

        *ehdrp = genlmsg_put_reply(reply, info, &ethtool_genl_family, 0, cmd);
        if (!*ehdrp)
                goto fail_free;

        if (dev && ethnl_fill_reply_header(reply, dev, hdr_attrtype) < 0)
                goto fail_free;

        return reply;

fail_free:
        nlmsg_free(reply);
fail:
        if (info)
                GENL_SET_ERR_MSG(info, "failed to setup reply message");
        return NULL;
}

/* Start an ethtool genetlink message of type @cmd in a dump skb, addressed
 * with the port id and sequence number of the dump request in @cb.
 * Returns what genlmsg_put() returns.
 */
void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd)
{
        u32 portid = NETLINK_CB(cb->skb).portid;
        u32 seq = cb->nlh->nlmsg_seq;

        return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd);
}

/* Start an ethtool genetlink message of type @cmd with port id 0 and the
 * next broadcast sequence number (ethnl_bcast_seq is incremented first).
 * Returns what genlmsg_put() returns.
 */
void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd)
{
        u32 seq = ++ethnl_bcast_seq;

        return genlmsg_put(skb, 0, seq, &ethtool_genl_family, 0, cmd);
}

/* Start an ethtool genetlink message of type @cmd for the given @portid and
 * @seq. Returns what genlmsg_put() returns.
 */
void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd)
{
        void *ehdr;

        ehdr = genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd);
        return ehdr;
}

/* Multicast @skb to the ETHNL_MCGRP_MONITOR group in the network namespace
 * of @dev. Returns the result of genlmsg_multicast_netns().
 */
int ethnl_multicast(struct sk_buff *skb, struct net_device *dev)
{
        struct net *net = dev_net(dev);

        return genlmsg_multicast_netns(&ethtool_genl_family, net, skb, 0,
                                       ETHNL_MCGRP_MONITOR, GFP_KERNEL);
}

/* GET request helpers */

/**
 * struct ethnl_dump_ctx - context structure for generic dumpit() callback
 * @ops:        request ops of currently processed message type
 * @req_info:   parsed request header of processed request; allocated in
 *              ethnl_default_start() and freed in ethnl_default_done()
 * @reply_data: data needed to compose the reply; allocated in
 *              ethnl_default_start(), re-initialized for every device and
 *              freed in ethnl_default_done()
 * @pos_ifindex: saved iteration position - ifindex; set to 0 by
 *              ethnl_default_start() and handed to for_each_netdev_dump()
 *              as its cursor in ethnl_default_dumpit()
 *
 * These parameters are kept in struct netlink_callback as context preserved
 * between iterations. They are initialized by ethnl_default_start() and used
 * in ethnl_default_dumpit() and ethnl_default_done(). The structure is
 * overlaid on cb->ctx, so it must not grow beyond sizeof(cb->ctx);
 * ethnl_default_start() checks this at build time.
 */
struct ethnl_dump_ctx {
        const struct ethnl_request_ops        *ops;
        struct ethnl_req_info                *req_info;
        struct ethnl_reply_data                *reply_data;
        unsigned long                        pos_ifindex;
};

/* Request ops lookup table indexed by the ETHTOOL_MSG_* command of a user
 * request. ethnl_default_doit(), ethnl_default_start() and
 * ethnl_default_set_doit() index it with the genetlink command; a command
 * without an entry yields NULL, which those handlers report with WARN_ONCE()
 * and -EOPNOTSUPP. Where both exist, the GET and SET commands of a message
 * type point at the same ops structure.
 */
static const struct ethnl_request_ops *
ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
        [ETHTOOL_MSG_STRSET_GET]        = &ethnl_strset_request_ops,
        [ETHTOOL_MSG_LINKINFO_GET]        = &ethnl_linkinfo_request_ops,
        [ETHTOOL_MSG_LINKINFO_SET]        = &ethnl_linkinfo_request_ops,
        [ETHTOOL_MSG_LINKMODES_GET]        = &ethnl_linkmodes_request_ops,
        [ETHTOOL_MSG_LINKMODES_SET]        = &ethnl_linkmodes_request_ops,
        [ETHTOOL_MSG_LINKSTATE_GET]        = &ethnl_linkstate_request_ops,
        [ETHTOOL_MSG_DEBUG_GET]                = &ethnl_debug_request_ops,
        [ETHTOOL_MSG_DEBUG_SET]                = &ethnl_debug_request_ops,
        [ETHTOOL_MSG_WOL_GET]                = &ethnl_wol_request_ops,
        [ETHTOOL_MSG_WOL_SET]                = &ethnl_wol_request_ops,
        [ETHTOOL_MSG_FEATURES_GET]        = &ethnl_features_request_ops,
        [ETHTOOL_MSG_PRIVFLAGS_GET]        = &ethnl_privflags_request_ops,
        [ETHTOOL_MSG_PRIVFLAGS_SET]        = &ethnl_privflags_request_ops,
        [ETHTOOL_MSG_RINGS_GET]                = &ethnl_rings_request_ops,
        [ETHTOOL_MSG_RINGS_SET]                = &ethnl_rings_request_ops,
        [ETHTOOL_MSG_CHANNELS_GET]        = &ethnl_channels_request_ops,
        [ETHTOOL_MSG_CHANNELS_SET]        = &ethnl_channels_request_ops,
        [ETHTOOL_MSG_COALESCE_GET]        = &ethnl_coalesce_request_ops,
        [ETHTOOL_MSG_COALESCE_SET]        = &ethnl_coalesce_request_ops,
        [ETHTOOL_MSG_PAUSE_GET]                = &ethnl_pause_request_ops,
        [ETHTOOL_MSG_PAUSE_SET]                = &ethnl_pause_request_ops,
        [ETHTOOL_MSG_EEE_GET]                = &ethnl_eee_request_ops,
        [ETHTOOL_MSG_EEE_SET]                = &ethnl_eee_request_ops,
        [ETHTOOL_MSG_FEC_GET]                = &ethnl_fec_request_ops,
        [ETHTOOL_MSG_FEC_SET]                = &ethnl_fec_request_ops,
        [ETHTOOL_MSG_TSINFO_GET]        = &ethnl_tsinfo_request_ops,
        [ETHTOOL_MSG_MODULE_EEPROM_GET]        = &ethnl_module_eeprom_request_ops,
        [ETHTOOL_MSG_STATS_GET]                = &ethnl_stats_request_ops,
        [ETHTOOL_MSG_PHC_VCLOCKS_GET]        = &ethnl_phc_vclocks_request_ops,
        [ETHTOOL_MSG_MODULE_GET]        = &ethnl_module_request_ops,
        [ETHTOOL_MSG_MODULE_SET]        = &ethnl_module_request_ops,
        [ETHTOOL_MSG_PSE_GET]                = &ethnl_pse_request_ops,
        [ETHTOOL_MSG_PSE_SET]                = &ethnl_pse_request_ops,
        [ETHTOOL_MSG_RSS_GET]                = &ethnl_rss_request_ops,
        [ETHTOOL_MSG_PLCA_GET_CFG]        = &ethnl_plca_cfg_request_ops,
        [ETHTOOL_MSG_PLCA_SET_CFG]        = &ethnl_plca_cfg_request_ops,
        [ETHTOOL_MSG_PLCA_GET_STATUS]        = &ethnl_plca_status_request_ops,
        [ETHTOOL_MSG_MM_GET]                = &ethnl_mm_request_ops,
        [ETHTOOL_MSG_MM_SET]                = &ethnl_mm_request_ops,
        [ETHTOOL_MSG_TSCONFIG_GET]        = &ethnl_tsconfig_request_ops,
        [ETHTOOL_MSG_TSCONFIG_SET]        = &ethnl_tsconfig_request_ops,
};

/* View the scratch area cb->ctx of a dump callback as the ethtool dump
 * context.
 */
static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
{
        void *scratch = cb->ctx;

        return scratch;
}

/**
 * ethnl_default_parse() - Parse request message
 * @req_info:    pointer to structure to put data into
 * @info:        genl_info from the request
 * @request_ops: struct request_ops for request type
 * @require_dev: fail if no device identified in header
 *
 * Parse the common request header found at @request_ops->hdr_attr, then let
 * the request specific ->parse_request() callback (if defined) handle the
 * rest of the message. If ->parse_request() fails, a device reference taken
 * while parsing the header is still held in @req_info; releasing it is left
 * to the caller.
 *
 * Return: 0 on success or negative error code
 */
static int ethnl_default_parse(struct ethnl_req_info *req_info,
                               const struct genl_info *info,
                               const struct ethnl_request_ops *request_ops,
                               bool require_dev)
{
        struct nlattr **attrs = info->attrs;
        int err;

        err = ethnl_parse_header_dev_get(req_info,
                                         attrs[request_ops->hdr_attr],
                                         genl_info_net(info), info->extack,
                                         require_dev);
        if (err < 0)
                return err;

        if (!request_ops->parse_request)
                return 0;

        err = request_ops->parse_request(req_info, attrs, info->extack);
        return err < 0 ? err : 0;
}

/**
 * ethnl_init_reply_data() - Initialize reply data for GET request
 * @reply_data: pointer to embedded struct ethnl_reply_data
 * @ops:        instance of struct ethnl_request_ops describing the layout
 * @dev:        network device to initialize the reply for
 *
 * Fills the reply data part with zeros and sets the dev member. Must be called
 * before calling the ->fill_reply() callback (for each iteration when handling
 * dump requests).
 */
static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data,
                                  const struct ethnl_request_ops *ops,
                                  struct net_device *dev)
{
        /* Clear ops->reply_data_size bytes, not sizeof(*reply_data): the
         * size to wipe is taken from the request ops.
         */
        memset(reply_data, 0, ops->reply_data_size);
        reply_data->dev = dev;
}

/* default ->doit() handler for GET type requests
 *
 * Looks up the request ops for the command, parses the request, lets
 * ->prepare_data() collect the data under RTNL (and the device ops lock when
 * a device is targeted), sizes and builds the reply with ->reply_size() and
 * ->fill_reply(), and sends it with genlmsg_reply().
 *
 * Returns the genlmsg_reply() result on success or a negative error code.
 */
static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct ethnl_reply_data *reply_data = NULL;
        struct ethnl_req_info *req_info = NULL;
        const u8 cmd = info->genlhdr->cmd;
        const struct ethnl_request_ops *ops;
        int hdr_len, reply_len;
        struct sk_buff *rskb;
        void *reply_payload;
        int ret;

        ops = ethnl_default_requests[cmd];
        if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
                return -EOPNOTSUPP;
        if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr))
                return -EINVAL;

        /* req_info must start zeroed; reply_data is zeroed later by
         * ethnl_init_reply_data(), so plain kmalloc() is enough for it.
         */
        req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
        if (!req_info)
                return -ENOMEM;
        reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
        if (!reply_data) {
                kfree(req_info);
                return -ENOMEM;
        }

        /* a device is required unless the ops allow device-less requests */
        ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do);
        if (ret < 0)
                goto err_dev;
        ethnl_init_reply_data(reply_data, ops, req_info->dev);

        rtnl_lock();
        if (req_info->dev)
                netdev_lock_ops(req_info->dev);
        ret = ops->prepare_data(req_info, reply_data, info);
        if (req_info->dev)
                netdev_unlock_ops(req_info->dev);
        rtnl_unlock();
        /* a failed ->prepare_data() skips ->cleanup_data() */
        if (ret < 0)
                goto err_dev;
        ret = ops->reply_size(req_info, reply_data);
        if (ret < 0)
                goto err_cleanup;
        reply_len = ret;
        /* preset the error code for a failing ethnl_reply_init() */
        ret = -ENOMEM;
        rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(),
                                req_info->dev, ops->reply_cmd,
                                ops->hdr_attr, info, &reply_payload);
        if (!rskb)
                goto err_cleanup;
        /* remember the length so far to measure what ->fill_reply() adds */
        hdr_len = rskb->len;
        ret = ops->fill_reply(rskb, req_info, reply_data);
        if (ret < 0)
                goto err_msg;
        WARN_ONCE(rskb->len - hdr_len > reply_len,
                  "ethnl cmd %d: calculated reply length %d, but consumed %d\n",
                  cmd, reply_len, rskb->len - hdr_len);
        if (ops->cleanup_data)
                ops->cleanup_data(reply_data);

        /* success: finish the message, release everything, send the reply */
        genlmsg_end(rskb, reply_payload);
        netdev_put(req_info->dev, &req_info->dev_tracker);
        kfree(reply_data);
        kfree(req_info);
        return genlmsg_reply(rskb, info);

        /* error unwinding, from the latest acquired resource backwards */
err_msg:
        WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len);
        nlmsg_free(rskb);
err_cleanup:
        if (ops->cleanup_data)
                ops->cleanup_data(reply_data);
err_dev:
        netdev_put(req_info->dev, &req_info->dev_tracker);
        kfree(reply_data);
        kfree(req_info);
        return ret;
}

/* Append the reply for one device to a dump skb.
 *
 * Starts an NLM_F_MULTI message of type ctx->ops->reply_cmd, prepares the
 * data for @dev under RTNL and the device ops lock, then writes the common
 * header and the request specific payload. The message is finalized on
 * success and cancelled on any failure after it was started.
 *
 * Returns -EMSGSIZE if the message cannot be started, otherwise the result of
 * the first failing step or of ->fill_reply().
 */
static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
                                  const struct ethnl_dump_ctx *ctx,
                                  const struct genl_info *info)
{
        void *ehdr;
        int ret;

        ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
                           &ethtool_genl_family, NLM_F_MULTI,
                           ctx->ops->reply_cmd);
        if (!ehdr)
                return -EMSGSIZE;

        /* the shared reply_data buffer is reset for every device */
        ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev);
        rtnl_lock();
        netdev_lock_ops(dev);
        ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info);
        netdev_unlock_ops(dev);
        rtnl_unlock();
        /* a failed ->prepare_data() skips ->cleanup_data() */
        if (ret < 0)
                goto out_cancel;
        ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr);
        if (ret < 0)
                goto out;
        ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data);

out:
        if (ctx->ops->cleanup_data)
                ctx->ops->cleanup_data(ctx->reply_data);
out_cancel:
        /* do not leave the device pointer behind in the shared buffer */
        ctx->reply_data->dev = NULL;
        if (ret < 0)
                genlmsg_cancel(skb, ehdr);
        else
                genlmsg_end(skb, ehdr);
        return ret;
}

/* Default ->dumpit() handler for GET requests.
 *
 * Walks the devices of the socket's network namespace starting from
 * ctx->pos_ifindex and appends one reply per device to @skb. Devices for
 * which ethnl_default_dump_one() returns -EOPNOTSUPP are skipped. Any other
 * error stops the walk; skb->len is returned instead of the error when the
 * skb already holds data. Returns 0 when the walk completes.
 */
static int ethnl_default_dumpit(struct sk_buff *skb,
                                struct netlink_callback *cb)
{
        struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;
        int ret = 0;

        rcu_read_lock();
        for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
                /* Hold a reference so the RCU read lock can be dropped
                 * around ethnl_default_dump_one(), which takes rtnl_lock().
                 */
                dev_hold(dev);
                rcu_read_unlock();

                ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb));

                /* re-enter the RCU section before releasing the device */
                rcu_read_lock();
                dev_put(dev);

                if (ret < 0 && ret != -EOPNOTSUPP) {
                        /* NOTE(review): presumably returning the length lets
                         * the partially filled skb be delivered - confirm.
                         */
                        if (likely(skb->len))
                                ret = skb->len;
                        break;
                }
                /* success or -EOPNOTSUPP: carry on with the next device */
                ret = 0;
        }
        rcu_read_unlock();

        return ret;
}

/* generic ->start() handler for GET requests
 *
 * Allocates the request info and reply data buffers for a dump, parses the
 * request and stores everything in the dump context kept in cb->ctx. The
 * buffers are released by ethnl_default_done(), or right here on failure.
 *
 * Returns 0 on success or a negative error code.
 */
static int ethnl_default_start(struct netlink_callback *cb)
{
        const struct genl_dumpit_info *dump_info = genl_dumpit_info(cb);
        struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
        struct genlmsghdr *ghdr = nlmsg_data(cb->nlh);
        const struct ethnl_request_ops *ops;
        struct ethnl_reply_data *reply;
        struct ethnl_req_info *req;
        int err;

        /* the context lives inside cb->ctx, so it has to fit there */
        BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));

        ops = ethnl_default_requests[ghdr->cmd];
        if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
                return -EOPNOTSUPP;

        req = kzalloc(ops->req_info_size, GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        reply = kmalloc(ops->reply_data_size, GFP_KERNEL);
        if (!reply) {
                kfree(req);
                return -ENOMEM;
        }

        err = ethnl_default_parse(req, &dump_info->info, ops, false);
        if (req->dev) {
                /* A dump does not use the device named in the request, but
                 * the parser shared with doit requests takes a reference to
                 * it when one is found; give that reference back, whether
                 * or not parsing succeeded.
                 */
                netdev_put(req->dev, &req->dev_tracker);
                req->dev = NULL;
        }
        if (err < 0) {
                kfree(reply);
                kfree(req);
                return err;
        }

        ctx->ops = ops;
        ctx->req_info = req;
        ctx->reply_data = reply;
        ctx->pos_ifindex = 0;

        return 0;
}

/* default ->done() handler for GET requests
 *
 * Frees the two buffers that ethnl_default_start() stored in the dump
 * context. Always returns 0.
 */
static int ethnl_default_done(struct netlink_callback *cb)
{
        struct ethnl_dump_ctx *dump = ethnl_dump_context(cb);

        kfree(dump->reply_data);
        kfree(dump->req_info);
        return 0;
}

/* Generic ->doit() handler for SET requests.
 *
 * Resolves the target device from the request header, gives the optional
 * ->set_validate() callback a chance to reject or short-circuit the request,
 * then calls ->set() with rtnl_lock and the netdev ops lock held. A copy of
 * dev->cfg is installed as dev->cfg_pending for the duration of the call and
 * swapped in when ->set() does not fail; dev->cfg_pending always points back
 * at dev->cfg before the locks are released.
 *
 * A positive return from ->set() triggers a ops->set_ntf_cmd notification;
 * zero skips it. Returns 0 on success or a negative error code.
 */
static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info)
{
        const u8 cmd = info->genlhdr->cmd;
        const struct ethnl_request_ops *ops;
        struct ethnl_req_info req_info = {};
        struct net_device *dev;
        int err;

        ops = ethnl_default_requests[cmd];
        if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
                return -EOPNOTSUPP;
        if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr))
                return -EINVAL;

        err = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr],
                                         genl_info_net(info), info->extack,
                                         true);
        if (err < 0)
                return err;

        if (ops->set_validate) {
                err = ops->set_validate(&req_info, info);
                /* negative: error; 0: nothing to do, report success */
                if (err <= 0)
                        goto put_dev;
        }

        dev = req_info.dev;

        rtnl_lock();
        netdev_lock_ops(dev);

        dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg),
                                   GFP_KERNEL_ACCOUNT);
        if (!dev->cfg_pending) {
                err = -ENOMEM;
                goto retie_cfg;
        }

        err = ethnl_ops_begin(dev);
        if (err < 0)
                goto free_pending;

        err = ops->set(&req_info, info);
        if (err >= 0) {
                /* The pending copy becomes the live config; the old one
                 * ends up in cfg_pending and is freed below.
                 */
                swap(dev->cfg, dev->cfg_pending);
                if (err > 0) {
                        ethtool_notify(dev, ops->set_ntf_cmd, NULL);
                        err = 0;
                }
        }

        ethnl_ops_complete(dev);
free_pending:
        kfree(dev->cfg_pending);
retie_cfg:
        dev->cfg_pending = dev->cfg;
        netdev_unlock_ops(dev);
        rtnl_unlock();
put_dev:
        ethnl_parse_header_dev_put(&req_info);
        return err;
}

/* Request ops used by ethnl_default_notify() to compose a notification,
 * indexed by notification message type. Slots without an initializer below
 * stay NULL, which ethnl_default_notify() treats as an unexpected type.
 */
static const struct ethnl_request_ops *
ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
        [ETHTOOL_MSG_LINKINFO_NTF]        = &ethnl_linkinfo_request_ops,
        [ETHTOOL_MSG_LINKMODES_NTF]        = &ethnl_linkmodes_request_ops,
        [ETHTOOL_MSG_DEBUG_NTF]                = &ethnl_debug_request_ops,
        [ETHTOOL_MSG_WOL_NTF]                = &ethnl_wol_request_ops,
        [ETHTOOL_MSG_FEATURES_NTF]        = &ethnl_features_request_ops,
        [ETHTOOL_MSG_PRIVFLAGS_NTF]        = &ethnl_privflags_request_ops,
        [ETHTOOL_MSG_RINGS_NTF]                = &ethnl_rings_request_ops,
        [ETHTOOL_MSG_CHANNELS_NTF]        = &ethnl_channels_request_ops,
        [ETHTOOL_MSG_COALESCE_NTF]        = &ethnl_coalesce_request_ops,
        [ETHTOOL_MSG_PAUSE_NTF]                = &ethnl_pause_request_ops,
        [ETHTOOL_MSG_EEE_NTF]                = &ethnl_eee_request_ops,
        [ETHTOOL_MSG_FEC_NTF]                = &ethnl_fec_request_ops,
        [ETHTOOL_MSG_MODULE_NTF]        = &ethnl_module_request_ops,
        [ETHTOOL_MSG_PLCA_NTF]                = &ethnl_plca_cfg_request_ops,
        [ETHTOOL_MSG_MM_NTF]                = &ethnl_mm_request_ops,
};

/* Default notification handler.
 *
 * Composes the message for notification type @cmd with the matching request
 * ops from ethnl_default_notify_ops[] (prepare_data, reply_size, fill_reply)
 * and multicasts it for @dev. Compact bitsets are always requested. @data is
 * not used by this handler. Any failure is silent towards the caller: the
 * buffers are released and no message is sent.
 */
static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
                                 const void *data)
{
        const struct ethnl_request_ops *ops;
        struct ethnl_reply_data *reply;
        struct ethnl_req_info *req;
        struct genl_info info;
        struct sk_buff *msg;
        void *msg_hdr;
        int msg_len;
        int err;

        genl_info_init_ntf(&info, &ethtool_genl_family, cmd);

        if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX ||
                      !ethnl_default_notify_ops[cmd],
                      "unexpected notification type %u\n", cmd))
                return;
        ops = ethnl_default_notify_ops[cmd];

        req = kzalloc(ops->req_info_size, GFP_KERNEL);
        if (!req)
                return;
        reply = kmalloc(ops->reply_data_size, GFP_KERNEL);
        if (!reply)
                goto free_bufs;

        req->dev = dev;
        req->flags |= ETHTOOL_FLAG_COMPACT_BITSETS;

        netdev_ops_assert_locked(dev);

        ethnl_init_reply_data(reply, ops, dev);
        err = ops->prepare_data(req, reply, &info);
        if (err < 0)
                goto free_bufs;

        err = ops->reply_size(req, reply);
        if (err < 0)
                goto cleanup_reply;
        msg_len = err + ethnl_reply_header_size();

        msg = genlmsg_new(msg_len, GFP_KERNEL);
        if (!msg)
                goto cleanup_reply;

        msg_hdr = ethnl_bcastmsg_put(msg, cmd);
        if (!msg_hdr)
                goto free_msg;

        err = ethnl_fill_reply_header(msg, dev, ops->hdr_attr);
        if (err < 0)
                goto warn_size;
        err = ops->fill_reply(msg, req, reply);
        if (err < 0)
                goto warn_size;

        /* Message is complete; the reply data is no longer needed */
        if (ops->cleanup_data)
                ops->cleanup_data(reply);

        genlmsg_end(msg, msg_hdr);
        kfree(reply);
        kfree(req);
        ethnl_multicast(msg, dev);
        return;

warn_size:
        /* -EMSGSIZE here means ->reply_size() underestimated the payload */
        WARN_ONCE(err == -EMSGSIZE,
                  "calculated message payload length (%d) not sufficient\n",
                  msg_len);
free_msg:
        nlmsg_free(msg);
cleanup_reply:
        if (ops->cleanup_data)
                ops->cleanup_data(reply);
free_bufs:
        /* reply is NULL when its allocation failed; kfree(NULL) is a no-op */
        kfree(reply);
        kfree(req);
}

/* notifications */

/* Signature shared by all notification handlers: target device, notification
 * message type and an opaque handler-specific data pointer.
 */
typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd,
                                       const void *data);

/* Dispatch table consulted by ethtool_notify(), indexed by notification
 * message type. Every type listed here is served by ethnl_default_notify();
 * types without an entry are reported as not implemented.
 */
static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
        [ETHTOOL_MSG_LINKINFO_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_LINKMODES_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_DEBUG_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_WOL_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_FEATURES_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_PRIVFLAGS_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_RINGS_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_CHANNELS_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_COALESCE_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_PAUSE_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_EEE_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_FEC_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_MODULE_NTF]        = ethnl_default_notify,
        [ETHTOOL_MSG_PLCA_NTF]                = ethnl_default_notify,
        [ETHTOOL_MSG_MM_NTF]                = ethnl_default_notify,
};

void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data)
{
        if (unlikely(!ethnl_ok))
                return;
        ASSERT_RTNL();

        if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) &&
                   ethnl_notify_handlers[cmd]))
                ethnl_notify_handlers[cmd](dev, cmd, data);
        else
                WARN_ONCE(1, "notification %u not implemented (dev=%s)\n",
                          cmd, netdev_name(dev));
}
EXPORT_SYMBOL(ethtool_notify);

/* Emit a features notification for the device a netdev notifier event
 * refers to.
 */
static void ethnl_notify_features(struct netdev_notifier_info *info)
{
        ethtool_notify(netdev_notifier_info_to_dev(info),
                       ETHTOOL_MSG_FEATURES_NTF, NULL);
}

/* Netdevice notifier callback.
 *
 * NETDEV_FEAT_CHANGE: send a features notification for the device.
 * NETDEV_PRE_UP: veto bringing the device up (NOTIFY_BAD, with an extack
 * message) while a module firmware flash is in progress on it.
 * All other events, and the non-veto cases, return NOTIFY_DONE.
 */
static int ethnl_netdev_event(struct notifier_block *this, unsigned long event,
                              void *ptr)
{
        struct netdev_notifier_info *info = ptr;
        struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(info);
        struct net_device *dev = netdev_notifier_info_to_dev(info);

        if (event == NETDEV_FEAT_CHANGE) {
                ethnl_notify_features(info);
                return NOTIFY_DONE;
        }

        if (event == NETDEV_PRE_UP &&
            dev->ethtool->module_fw_flash_in_progress) {
                NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware");
                return NOTIFY_BAD;
        }

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to ethnl_netdev_event();
 * registered from ethnl_init().
 */
static struct notifier_block ethnl_netdev_notifier = {
        .notifier_call = ethnl_netdev_event,
};

/* genetlink setup */

/* Genetlink operations of the ethtool family (installed through
 * ethtool_genl_family.ops). Each entry binds one userspace command to its
 * ->doit() handler, its dump callbacks (->start/->dumpit/->done) where dumps
 * are supported, and its attribute policy; .maxattr is always the last index
 * of that policy array.
 *
 * Most GET commands use the ethnl_default_* handlers and most SET commands
 * use ethnl_default_set_doit(); entries naming other callbacks have
 * command-specific implementations defined outside this table.
 *
 * Entries flagged GENL_UNS_ADMIN_PERM are privilege-gated by genetlink.
 * NOTE(review): believed to require CAP_NET_ADMIN in the user namespace
 * owning the netns - confirm against the genetlink documentation.
 */
static const struct genl_ops ethtool_genl_ops[] = {
        {
                .cmd        = ETHTOOL_MSG_STRSET_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_strset_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_LINKINFO_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_linkinfo_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_LINKINFO_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_linkinfo_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_LINKMODES_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_linkmodes_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_LINKMODES_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_linkmodes_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_LINKSTATE_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_linkstate_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_DEBUG_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_debug_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_DEBUG_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_debug_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1,
        },
        {
                /* a GET command that is nevertheless privilege-gated */
                .cmd        = ETHTOOL_MSG_WOL_GET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_wol_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_WOL_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_wol_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_FEATURES_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_features_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1,
        },
        {
                /* SET with its own handler instead of ethnl_default_set_doit */
                .cmd        = ETHTOOL_MSG_FEATURES_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_set_features,
                .policy = ethnl_features_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PRIVFLAGS_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_privflags_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PRIVFLAGS_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_privflags_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_RINGS_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_rings_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_RINGS_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_rings_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_CHANNELS_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_channels_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_CHANNELS_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_channels_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_COALESCE_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_coalesce_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_COALESCE_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_coalesce_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PAUSE_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_pause_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PAUSE_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_pause_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_EEE_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_eee_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_EEE_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_eee_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1,
        },
        {
                /* default ->doit(), but command-specific dump callbacks */
                .cmd        = ETHTOOL_MSG_TSINFO_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_tsinfo_start,
                .dumpit        = ethnl_tsinfo_dumpit,
                .done        = ethnl_tsinfo_done,
                .policy = ethnl_tsinfo_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_CABLE_TEST_ACT,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_act_cable_test,
                .policy = ethnl_cable_test_act_policy,
                .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_act_cable_test_tdr,
                .policy = ethnl_cable_test_tdr_act_policy,
                .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1,
        },
        {
                /* command-specific handlers; no ->done() callback is set */
                .cmd        = ETHTOOL_MSG_TUNNEL_INFO_GET,
                .doit        = ethnl_tunnel_info_doit,
                .start        = ethnl_tunnel_info_start,
                .dumpit        = ethnl_tunnel_info_dumpit,
                .policy = ethnl_tunnel_info_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_FEC_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_fec_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_FEC_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_fec_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1,
        },
        {
                /* a GET command that is nevertheless privilege-gated */
                .cmd        = ETHTOOL_MSG_MODULE_EEPROM_GET,
                .flags  = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_module_eeprom_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_STATS_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_stats_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PHC_VCLOCKS_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_phc_vclocks_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_MODULE_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_module_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_MODULE_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_module_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PSE_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_pse_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PSE_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_pse_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1,
        },
        {
                /* command-specific dump callbacks; no ->done() callback is set */
                .cmd        = ETHTOOL_MSG_RSS_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_rss_dump_start,
                .dumpit        = ethnl_rss_dumpit,
                .policy = ethnl_rss_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PLCA_GET_CFG,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_plca_get_cfg_policy,
                .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PLCA_SET_CFG,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_plca_set_cfg_policy,
                .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_PLCA_GET_STATUS,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_plca_get_status_policy,
                .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_MM_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_mm_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_MM_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_mm_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_MODULE_FW_FLASH_ACT,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_act_module_fw_flash,
                .policy        = ethnl_module_fw_flash_act_policy,
                .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1,
        },
        {
                /* fully command-specific doit and dump callbacks */
                .cmd        = ETHTOOL_MSG_PHY_GET,
                .doit        = ethnl_phy_doit,
                .start        = ethnl_phy_start,
                .dumpit        = ethnl_phy_dumpit,
                .done        = ethnl_phy_done,
                .policy = ethnl_phy_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_TSCONFIG_GET,
                .doit        = ethnl_default_doit,
                .start        = ethnl_default_start,
                .dumpit        = ethnl_default_dumpit,
                .done        = ethnl_default_done,
                .policy = ethnl_tsconfig_get_policy,
                .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1,
        },
        {
                .cmd        = ETHTOOL_MSG_TSCONFIG_SET,
                .flags        = GENL_UNS_ADMIN_PERM,
                .doit        = ethnl_default_set_doit,
                .policy = ethnl_tsconfig_set_policy,
                .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1,
        },
};

/* Multicast groups of the ethtool genetlink family; the monitor group is the
 * only one defined here.
 */
static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
        [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME },
};

/* Genetlink family definition for ethtool, registered by ethnl_init().
 * It ties together the operations table, the multicast groups and the
 * per-socket private data (struct ethnl_sock_priv and its destructor).
 */
static struct genl_family ethtool_genl_family __ro_after_init = {
        .name                = ETHTOOL_GENL_NAME,
        .version        = ETHTOOL_GENL_VERSION,
        .netnsok        = true,
        .parallel_ops        = true,
        .ops                = ethtool_genl_ops,
        .n_ops                = ARRAY_SIZE(ethtool_genl_ops),
        /* NOTE(review): presumably commands above ETHTOOL_MSG_MODULE_GET get
         * genetlink's stricter request validation - confirm against the
         * struct genl_family documentation.
         */
        .resv_start_op        = ETHTOOL_MSG_MODULE_GET + 1,
        .mcgrps                = ethtool_nl_mcgrps,
        .n_mcgrps        = ARRAY_SIZE(ethtool_nl_mcgrps),
        .sock_priv_size                = sizeof(struct ethnl_sock_priv),
        .sock_priv_destroy        = ethnl_sock_priv_destroy,
};

/* module setup */

/* Set up the ethtool netlink interface at subsys initcall time.
 *
 * Registers the genetlink family, marks the interface usable (ethnl_ok) and
 * then registers the netdevice notifier. Each failure is reported through
 * WARN() and returned. Note that a notifier registration failure leaves the
 * family registered and ethnl_ok set.
 */
static int __init ethnl_init(void)
{
        int err = genl_register_family(&ethtool_genl_family);

        if (WARN(err < 0, "ethtool: genetlink family registration failed"))
                return err;
        ethnl_ok = true;

        err = register_netdevice_notifier(&ethnl_netdev_notifier);
        WARN(err < 0, "ethtool: net device notifier registration failed");

        return err;
}

subsys_initcall(ethnl_init);













































































































































































































































































   12 










































   12 























  161 














































































  163 




  398 
  398 
  395 

    4 
    4 





  161 

  268 









  164 



  163 
  163 



   44 
  145 
  163 


























































































  267 









  267 

  267 
  267 
  267 




  166 
  249 








































  162 

  270 



















   23 

  270 






  399 












  398 
    2 

  397 




  399 

  163 

  252 
  400 
  168 
  262 



  164 

  270 


  398 




  399 
  399 


















  398 





  398 





































  167 









  249 


















































  577 


















  578 
































  577 


































  576 
   31 


  576 

  469 





  397 




























































































































































    8 











    8 
















































































































































































































   23 






   23 




   23 












   23 
































   23 

   23 


















   23 





   23 
















































































































   23 



   23 























































   23 



























   23 





















































   23 

   23 











   23 

















   23 






















  570 
















    3 


  561 








    7 




























  570 









  570 














    2 









  316 







    3 






























  534 






  347 
  219 















  569 
  570 
  569 






  249 

  392 

  392 

  391 

  392 

  350 
  311 
  389 
    3 
  392 


















  244 




  347 




  499 


  497 
  347 


  564 





  563 
  564 
  564 







  563 

  564 



  564 



















  564 


  563 























































































































































































































    2 














































































  440 





























  440 




















  243 



    2 

  181 



















  313 




  313 

    8 

  311 




















  315 


























  316 

  316 

  242 








   11 









  315 








































































  311 


  311 
  311 
  311 


















  311 
  311 


  311 



  311 






  311 




















  314 



  314 







  312 

  314 
















    5 








  314 












  314 
    2 






































  314 



  314 
  314 














  314 



  314 


  314 

  314 


  314 






    3 









  311 





  311 



  311 









  311 

  311 













  310 
  311 






































































































































































































































































































































































































    2 





    2 









    2 




    2 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/dcache.c
 *
 * Complete reimplementation
 * (C) 1997 Thomas Schoebel-Theuer,
 * with heavy changes by Linus Torvalds
 */

/*
 * Notes on the allocation strategy:
 *
 * The dcache is a master of the icache - whenever a dcache entry
 * exists, the inode will always exist. "iput()" is done either when
 * the dcache entry is deleted or garbage collected.
 */

#include <linux/ratelimit.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/cache.h>
#include <linux/export.h>
#include <linux/security.h>
#include <linux/seqlock.h>
#include <linux/memblock.h>
#include <linux/bit_spinlock.h>
#include <linux/rculist_bl.h>
#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"

#include <asm/runtime-const.h>

/*
 * Usage:
 * dcache->d_inode->i_lock protects:
 *   - i_dentry, d_u.d_alias, d_inode of aliases
 * dcache_hash_bucket lock protects:
 *   - the dcache hash table
 * s_roots bl list spinlock protects:
 *   - the s_roots list (see __d_drop)
 * dentry->d_sb->s_dentry_lru_lock protects:
 *   - the dcache lru lists and counters
 * d_lock protects:
 *   - d_flags
 *   - d_name
 *   - d_lru
 *   - d_count
 *   - d_unhashed()
 *   - d_parent and d_children
 *   - childrens' d_sib and d_parent
 *   - d_u.d_alias, d_inode
 *
 * Ordering:
 * dentry->d_inode->i_lock
 *   dentry->d_lock
 *     dentry->d_sb->s_dentry_lru_lock
 *     dcache_hash_bucket lock
 *     s_roots lock
 *
 * If there is an ancestor relationship:
 * dentry->d_parent->...->d_parent->d_lock
 *   ...
 *     dentry->d_parent->d_lock
 *       dentry->d_lock
 *
 * If no ancestor relationship:
 * arbitrary, since it's serialized on rename_lock
 */
static int sysctl_vfs_cache_pressure __read_mostly = 100;

/*
 * Scale @val by the vfs_cache_pressure sysctl.  The result is
 * mult_frac(val, sysctl_vfs_cache_pressure, 100), i.e. the sysctl value
 * is applied as a percentage of @val.
 */
unsigned long vfs_pressure_ratio(unsigned long val)
{
        return mult_frac(val, sysctl_vfs_cache_pressure, 100);
}
EXPORT_SYMBOL_GPL(vfs_pressure_ratio);

__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);

EXPORT_SYMBOL(rename_lock);

static struct kmem_cache *dentry_cache __ro_after_init;

const struct qstr empty_name = QSTR_INIT("", 0);
EXPORT_SYMBOL(empty_name);
const struct qstr slash_name = QSTR_INIT("/", 1);
EXPORT_SYMBOL(slash_name);
const struct qstr dotdot_name = QSTR_INIT("..", 2);
EXPORT_SYMBOL(dotdot_name);

/*
 * This is the single most critical data structure when it comes
 * to the dcache: the hashtable for lookups. Somebody should try
 * to make this good - I've just made it work.
 *
 * This hash-function tries to avoid losing too many bits of hash
 * information, yet avoid using a prime hash-size or similar.
 *
 * Marking the variables "used" ensures that the compiler doesn't
 * optimize them away completely on architectures with runtime
 * constant infrastructure, this allows debuggers to see their
 * values. But updating these values has no effect on those arches.
 */

static unsigned int d_hash_shift __ro_after_init __used;

static struct hlist_bl_head *dentry_hashtable __ro_after_init __used;

static inline struct hlist_bl_head *d_hash(unsigned long hashlen)
{
        return runtime_const_ptr(dentry_hashtable) +
                runtime_const_shift_right_32(hashlen, d_hash_shift);
}

#define IN_LOOKUP_SHIFT 10
static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT];

/*
 * Pick the in-lookup bucket for (@parent, @hash): the parent's address,
 * divided by the cache line size, is folded into the name hash before
 * reducing it to IN_LOOKUP_SHIFT bits.
 */
static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent,
                                        unsigned int hash)
{
        unsigned int mixed = hash + (unsigned long) parent / L1_CACHE_BYTES;

        return &in_lookup_hashtable[hash_32(mixed, IN_LOOKUP_SHIFT)];
}

/*
 * Dentry cache statistics.  An instance of this structure backs the
 * "dentry-state" sysctl, which exports it as six consecutive longs, so
 * the number and order of the fields is visible to userspace.
 */
struct dentry_stat_t {
        long nr_dentry;                /* filled from the per-cpu nr_dentry counters */
        long nr_unused;                /* filled from the per-cpu nr_dentry_unused counters */
        long age_limit;                /* age in seconds */
        long want_pages;        /* pages requested by system */
        long nr_negative;        /* # of unused negative dentries */
        long dummy;                /* Reserved for future use */
};

static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
static int dentry_negative_policy;

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering. */
static struct dentry_stat_t dentry_stat = {
        .age_limit = 45,
};

/*
 * Here we resort to our own counters instead of using generic per-cpu counters
 * for consistency with what the vfs inode code does. We are expected to harvest
 * better code and performance by having our own specialized counters.
 *
 * Please note that the loop is done over all possible CPUs, not over all online
 * CPUs. The reason for this is that we don't want to play games with CPUs going
 * on and off. If one of them goes off, we will just keep their counters.
 *
 * glommer: See cffbc8a for details, and if you ever intend to change this,
 * please update all vfs counters to match.
 */
static long get_nr_dentry(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_unused(void)
{
        int i;
        long sum = 0;
        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_unused, i);
        return sum < 0 ? 0 : sum;
}

static long get_nr_dentry_negative(void)
{
        int i;
        long sum = 0;

        for_each_possible_cpu(i)
                sum += per_cpu(nr_dentry_negative, i);
        return sum < 0 ? 0 : sum;
}

/*
 * sysctl handler for "dentry-state": refresh the computed fields of
 * dentry_stat from the per-cpu counters, then let the generic
 * unsigned-long vector handler do the actual transfer.
 */
static int proc_nr_dentry(const struct ctl_table *table, int write, void *buffer,
                          size_t *lenp, loff_t *ppos)
{
        struct dentry_stat_t *snap = &dentry_stat;

        snap->nr_dentry = get_nr_dentry();
        snap->nr_unused = get_nr_dentry_unused();
        snap->nr_negative = get_nr_dentry_negative();

        return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}

/* dcache sysctl entries that init_fs_dcache_sysctls() registers under "fs". */
static const struct ctl_table fs_dcache_sysctls[] = {
        {
                /*
                 * Read-only view of dentry_stat as six longs; the
                 * handler refreshes the counters before each access.
                 */
                .procname        = "dentry-state",
                .data                = &dentry_stat,
                .maxlen                = 6*sizeof(long),
                .mode                = 0444,
                .proc_handler        = proc_nr_dentry,
        },
        {
                /*
                 * Writable integer bounded to [SYSCTL_ZERO, SYSCTL_ONE].
                 * How the policy value is consumed is decided by the
                 * readers of dentry_negative_policy.
                 */
                .procname        = "dentry-negative",
                .data                = &dentry_negative_policy,
                .maxlen                = sizeof(dentry_negative_policy),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/* dcache sysctl entries that init_fs_dcache_sysctls() registers under "vm". */
static const struct ctl_table vm_dcache_sysctls[] = {
        {
                /*
                 * Writable integer backing vfs_pressure_ratio().  Only a
                 * lower bound (SYSCTL_ZERO) is set; no upper bound is given.
                 */
                .procname        = "vfs_cache_pressure",
                .data                = &sysctl_vfs_cache_pressure,
                .maxlen                = sizeof(sysctl_vfs_cache_pressure),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
        },
};

/*
 * Register the dcache sysctl tables: vm_dcache_sysctls under "vm" and
 * fs_dcache_sysctls under "fs".  Always returns 0.
 */
static int __init init_fs_dcache_sysctls(void)
{
        register_sysctl_init("vm", vm_dcache_sysctls);
        register_sysctl_init("fs", fs_dcache_sysctls);
        return 0;
}
fs_initcall(init_fs_dcache_sysctls);
#endif

/*
 * Compare 2 name strings, return 0 if they match, otherwise non-zero.
 * The strings are both count bytes long, and count is non-zero.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>
/*
 * NOTE! 'cs' comes from a dentry, so it has an
 * aligned allocation for this particular component. We don't
 * strictly need the load_unaligned_zeropad() safety, but it
 * doesn't hurt either.
 *
 * In contrast, 'ct' and 'tcount' can be from a pathname, and do
 * need the careful unaligned handling.
 */
/*
 * Word-at-a-time comparison of the first @tcount bytes of @cs and @ct.
 * Returns 0 if they match and 1 otherwise.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        unsigned long a,b,mask;

        for (;;) {
                /* Load one word from each side before looking at the count. */
                a = read_word_at_a_time(cs);
                b = load_unaligned_zeropad(ct);
                /* Fewer bytes than a full word remain: finish with a masked compare. */
                if (tcount < sizeof(unsigned long))
                        break;
                if (unlikely(a != b))
                        return 1;
                cs += sizeof(unsigned long);
                ct += sizeof(unsigned long);
                tcount -= sizeof(unsigned long);
                /* The length was an exact multiple of the word size. */
                if (!tcount)
                        return 0;
        }
        /* Only the bits selected by bytemask_from_count(tcount) take part. */
        mask = bytemask_from_count(tcount);
        return unlikely(!!((a ^ b) & mask));
}

#else

/*
 * Bytewise comparison of the first @tcount bytes of @cs and @ct.
 * Returns 0 if they match and 1 at the first difference.  At least one
 * byte is always compared, so @tcount must be non-zero.
 */
static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount)
{
        for (;;) {
                if (*cs++ != *ct++)
                        return 1;
                if (--tcount == 0)
                        return 0;
        }
}

#endif

/*
 * Compare @dentry's name against the @tcount bytes at @ct; 0 means equal.
 */
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
        /*
         * An RCU walk may race with rename, so the name pointer is
         * fetched exactly once with READ_ONCE().
         *
         * The length may not have been loaded atomically with respect
         * to such a rename, but that is harmless: the RCU walk checks
         * the sequence count later and catches the race, and the buffer
         * cannot be overrun because the pointer was read atomically and
         * a dentry name is guaranteed to be NUL-terminated.
         *
         * So even with a wrong 'len' the comparison exits early: the
         * ct/tcount data contains no NUL and therefore cannot match.
         */
        return dentry_string_cmp(READ_ONCE(dentry->d_name.name), ct, tcount);
}

/*
 * long names are allocated separately from dentry and never modified.
 * Refcounted, freeing is RCU-delayed.  See take_dentry_name_snapshot()
 * for the reason why ->count and ->head can't be combined into a union.
 * dentry_string_cmp() relies upon ->name[] being word-aligned.
 */
struct external_name {
        atomic_t count;                /* reference count; the name is freed when it drops to zero */
        struct rcu_head head;        /* used by kfree_rcu() when a snapshot drops the last reference */
        /* the name bytes, aligned so they can be compared a word at a time */
        unsigned char name[] __aligned(sizeof(unsigned long));
};

/*
 * Recover the external_name that contains @dentry's name bytes.  This
 * treats d_name.name as pointing at external_name::name[0], so it is
 * only meaningful for a dentry whose name is stored externally.
 */
static inline struct external_name *external_name(struct dentry *dentry)
{
        return container_of(dentry->d_name.name, struct external_name, name[0]);
}

/*
 * Return the dentry that embeds @head (as d_u.d_rcu) to the dentry slab
 * cache.  Has the signature of an RCU callback.
 */
static void __d_free(struct rcu_head *head)
{
        kmem_cache_free(dentry_cache,
                        container_of(head, struct dentry, d_u.d_rcu));
}

/*
 * Like __d_free(), but first frees the separately allocated name that
 * the dentry points to.
 */
static void __d_free_external(struct rcu_head *head)
{
        struct dentry *victim = container_of(head, struct dentry, d_u.d_rcu);
        struct external_name *ext = external_name(victim);

        kfree(ext);
        kmem_cache_free(dentry_cache, victim);
}

/*
 * Non-zero when @dentry's name pointer does not refer to its own inline
 * d_shortname buffer, i.e. the name is stored elsewhere.
 */
static inline int dname_external(const struct dentry *dentry)
{
        const unsigned char *inline_buf = dentry->d_shortname.string;

        return dentry->d_name.name != inline_buf;
}

/*
 * Capture a stable copy of @dentry's name in @name.
 *
 * An inline name is copied into @name->inline_name.  An external name is
 * pinned by taking a reference on it and pointing @name->name.name at it.
 * The whole capture is validated against dentry->d_seq and retried if the
 * dentry changed underneath us.  Pair each call with
 * release_dentry_name_snapshot().
 */
void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry)
{
        unsigned seq;
        const unsigned char *s;

        rcu_read_lock();
retry:
        seq = read_seqcount_begin(&dentry->d_seq);
        s = READ_ONCE(dentry->d_name.name);
        name->name.hash_len = dentry->d_name.hash_len;
        /* Default to the snapshot's own inline buffer. */
        name->name.name = name->inline_name.string;
        if (likely(s == dentry->d_shortname.string)) {
                /* Inline name: copy the whole short-name storage. */
                name->inline_name = dentry->d_shortname;
        } else {
                struct external_name *p;
                p = container_of(s, struct external_name, name[0]);
                // get a valid reference
                /* A zero count means the name is already on its way out: retry. */
                if (unlikely(!atomic_inc_not_zero(&p->count)))
                        goto retry;
                name->name.name = s;
        }
        if (read_seqcount_retry(&dentry->d_seq, seq)) {
                /* The dentry changed meanwhile: drop what we took and start over. */
                release_dentry_name_snapshot(name);
                goto retry;
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(take_dentry_name_snapshot);

/*
 * Drop a snapshot taken by take_dentry_name_snapshot().  Nothing needs
 * releasing for an inline name; for an external one the reference is
 * dropped and the name is freed after a grace period if it was the last.
 */
void release_dentry_name_snapshot(struct name_snapshot *name)
{
        struct external_name *ext;

        if (likely(name->name.name == name->inline_name.string))
                return;

        ext = container_of(name->name.name, struct external_name, name[0]);
        if (unlikely(atomic_dec_and_test(&ext->count)))
                kfree_rcu(ext, head);
}
EXPORT_SYMBOL(release_dentry_name_snapshot);

/*
 * Attach @inode to @dentry and replace the entry-type bits in d_flags
 * with @type_flags.  d_inode is written first; the new flags are then
 * published with a release store.
 */
static inline void __d_set_inode_and_type(struct dentry *dentry,
                                          struct inode *inode,
                                          unsigned type_flags)
{
        unsigned updated;

        dentry->d_inode = inode;
        updated = (READ_ONCE(dentry->d_flags) & ~DCACHE_ENTRY_TYPE) | type_flags;
        smp_store_release(&dentry->d_flags, updated);
}

/*
 * Clear the entry-type bits in d_flags and detach the inode from @dentry.
 */
static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
        const unsigned list_bits = DCACHE_LRU_LIST | DCACHE_SHRINK_LIST;
        unsigned cleared = READ_ONCE(dentry->d_flags) & ~DCACHE_ENTRY_TYPE;

        WRITE_ONCE(dentry->d_flags, cleared);
        dentry->d_inode = NULL;

        /*
         * Only dentries sitting on the LRU are counted as negative; if
         * d_lru is unused or is on a shrink list, leave the counter alone.
         */
        if ((cleared & list_bits) != DCACHE_LRU_LIST)
                return;
        this_cpu_inc(nr_dentry_negative);
}

/*
 * Free @dentry.  If it holds the last reference to an external name,
 * the name and the dentry are freed together after a grace period.
 * Otherwise the dentry is freed at once when it is flagged DCACHE_NORCU,
 * and via RCU in all other cases.
 */
static void dentry_free(struct dentry *dentry)
{
        struct rcu_head *rcu = &dentry->d_u.d_rcu;

        WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias));

        if (unlikely(dname_external(dentry))) {
                struct external_name *ext = external_name(dentry);

                if (likely(atomic_dec_and_test(&ext->count))) {
                        call_rcu(rcu, __d_free_external);
                        return;
                }
        }

        /* if dentry was never visible to RCU, immediate free is OK */
        if (dentry->d_flags & DCACHE_NORCU) {
                __d_free(rcu);
                return;
        }
        call_rcu(rcu, __d_free);
}

/*
 * Release the dentry's inode, using the filesystem
 * d_iput() operation if defined.
 */
static void dentry_unlink_inode(struct dentry * dentry)
        __releases(dentry->d_lock)
        __releases(dentry->d_inode->i_lock)
{
        /* Keep our own pointer: d_inode is cleared below. */
        struct inode *inode = dentry->d_inode;

        /* Clear the inode and alias linkage inside a d_seq write section. */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_clear_type_and_inode(dentry);
        hlist_del_init(&dentry->d_u.d_alias);
        raw_write_seqcount_end(&dentry->d_seq);
        /* Both locks are dropped here: d_lock first, then the inode's i_lock. */
        spin_unlock(&dentry->d_lock);
        spin_unlock(&inode->i_lock);
        /* Notify only when the inode has no links left. */
        if (!inode->i_nlink)
                fsnotify_inoderemove(inode);
        /* Let the filesystem's d_iput() drop the inode if it provides one. */
        if (dentry->d_op && dentry->d_op->d_iput)
                dentry->d_op->d_iput(dentry, inode);
        else
                iput(inode);
}

/*
 * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
 * is in use - which includes both the "real" per-superblock
 * LRU list _and_ the DCACHE_SHRINK_LIST use.
 *
 * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
 * on the shrink list (ie not on the superblock LRU list).
 *
 * The per-cpu "nr_dentry_unused" counters are updated with
 * the DCACHE_LRU_LIST bit.
 *
 * The per-cpu "nr_dentry_negative" counters are only updated
 * when deleted from or added to the per-superblock LRU list, not
 * from/to the shrink list. That is to avoid an unneeded dec/inc
 * pair when moving from LRU to shrink list in select_collect().
 *
 * These helper functions make sure we always follow the
 * rules. d_lock must be held by the caller.
 */
/* Warn once if @dentry's LRU_LIST/SHRINK_LIST flag bits are not exactly @x. */
#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
static void d_lru_add(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, 0);
        dentry->d_flags |= DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_inc(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_add_obj(
                        &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

static void d_lru_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        WARN_ON_ONCE(!list_lru_del_obj(
                        &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}

/*
 * Remove @dentry from a shrink list.  Expects both list flags to be set
 * and clears both.  Only nr_dentry_unused is adjusted; the negative
 * counter is not touched here.
 */
static void d_shrink_del(struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        list_del_init(&dentry->d_lru);
        dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
        this_cpu_dec(nr_dentry_unused);
}

/*
 * Put @dentry on the shrink list @list.  Expects neither list flag to be
 * set and sets both.  Only nr_dentry_unused is adjusted; the negative
 * counter is not touched here.
 */
static void d_shrink_add(struct dentry *dentry, struct list_head *list)
{
        D_FLAG_VERIFY(dentry, 0);
        list_add(&dentry->d_lru, list);
        dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
        this_cpu_inc(nr_dentry_unused);
}

/*
 * These can only be called under the global LRU lock, ie during the
 * callback for freeing the LRU list. "isolate" removes it from the
 * LRU lists entirely, while shrink_move moves it to the indicated
 * private list.
 */
/*
 * Remove @dentry from the LRU list @lru entirely: clears DCACHE_LRU_LIST
 * and drops the unused counter and, for a negative dentry, the negative
 * counter.
 */
static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags &= ~DCACHE_LRU_LIST;
        this_cpu_dec(nr_dentry_unused);
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        list_lru_isolate(lru, &dentry->d_lru);
}

/*
 * Move @dentry from the LRU list @lru to the private list @list.
 * DCACHE_LRU_LIST stays set and DCACHE_SHRINK_LIST is added, so
 * nr_dentry_unused is unchanged.  A negative dentry stops being counted
 * as negative once it leaves the LRU.
 */
static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
                              struct list_head *list)
{
        D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
        dentry->d_flags |= DCACHE_SHRINK_LIST;
        if (d_is_negative(dentry))
                this_cpu_dec(nr_dentry_negative);
        list_lru_isolate_move(lru, &dentry->d_lru, list);
}

/*
 * Unlink @dentry from the hash chain it is on, under that chain's bit
 * lock.  ->d_hash.pprev is not reset here, so the dentry is not yet
 * marked "unhashed"; __d_drop() takes care of that.
 */
static void ___d_drop(struct dentry *dentry)
{
        struct hlist_bl_head *bucket;

        /*
         * Hashed dentries normally live on the dentry hashtable.  The
         * exception is those newly allocated by d_obtain_root, which are
         * always IS_ROOT and hang off the superblock's ->s_roots instead.
         */
        bucket = unlikely(IS_ROOT(dentry)) ? &dentry->d_sb->s_roots
                                           : d_hash(dentry->d_name.hash);

        hlist_bl_lock(bucket);
        __hlist_bl_del(&dentry->d_hash);
        hlist_bl_unlock(bucket);
}

/*
 * Unhash @dentry if it is currently hashed; a no-op otherwise.
 * The caller must hold dentry->d_lock.
 */
void __d_drop(struct dentry *dentry)
{
        if (d_unhashed(dentry))
                return;

        ___d_drop(dentry);
        /* ___d_drop() leaves ->pprev alone; clear it to mark "unhashed" */
        dentry->d_hash.pprev = NULL;
        write_seqcount_invalidate(&dentry->d_seq);
}
EXPORT_SYMBOL(__d_drop);

/**
 * d_drop - drop a dentry
 * @dentry: dentry to drop
 *
 * d_drop() unhashes the entry from the parent dentry hashes, so that it won't
 * be found through a VFS lookup any more. Note that this is different from
 * deleting the dentry - d_delete will try to mark the dentry negative if
 * possible, giving a successful _negative_ lookup, while d_drop will
 * just make the cache lookup fail.
 *
 * d_drop() is used mainly for stuff that wants to invalidate a dentry for some
 * reason (NFS timeouts or autofs deletes).
 *
 * d_drop() takes and releases dentry->d_lock itself, so the caller must not
 * already hold it.  __d_drop() is the variant for callers that do hold
 * dentry->d_lock.
 *
 * ___d_drop doesn't mark dentry as "unhashed"
 * (dentry->d_hash.pprev will be LIST_POISON2, not NULL).
 */
void d_drop(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(d_drop);

/*
 * Flag @dentry as DCACHE_DENTRY_KILLED and, if it is linked on a list of
 * siblings via ->d_sib, unlink it.  The ->d_sib.next pointer left behind in
 * the dead dentry is then advanced past any cursor entries; the comment in
 * the body explains why.
 */
static inline void dentry_unlist(struct dentry *dentry)
{
        struct dentry *next;
        /*
         * Inform d_walk() and shrink_dentry_list() that we are no longer
         * attached to the dentry tree
         */
        dentry->d_flags |= DCACHE_DENTRY_KILLED;
        /* Not on any sibling list: nothing to unlink or fix up. */
        if (unlikely(hlist_unhashed(&dentry->d_sib)))
                return;
        __hlist_del(&dentry->d_sib);
        /*
         * Cursors can move around the list of children.  While we'd been
         * a normal list member, it didn't matter - ->d_sib.next would've
         * been updated.  However, from now on it won't be and for the
         * things like d_walk() it might end up with a nasty surprise.
         * Normally d_walk() doesn't care about cursors moving around -
         * ->d_lock on parent prevents that and since a cursor has no children
         * of its own, we get through it without ever unlocking the parent.
         * There is one exception, though - if we ascend from a child that
         * gets killed as soon as we unlock it, the next sibling is found
         * using the value left in its ->d_sib.next.  And if _that_
         * pointed to a cursor, and cursor got moved (e.g. by lseek())
         * before d_walk() regains parent->d_lock, we'll end up skipping
         * everything the cursor had been moved past.
         *
         * Solution: make sure that the pointer left behind in ->d_sib.next
         * points to something that won't be moving around.  I.e. skip the
         * cursors.
         */
        while (dentry->d_sib.next) {
                next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
                /* First non-cursor sibling: ->d_sib.next is now stable. */
                if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
                        break;
                dentry->d_sib.next = next->d_sib.next;
        }
}

/*
 * Tear down @dentry: mark its lockref dead, notify the filesystem through
 * ->d_prune() if DCACHE_OP_PRUNE is set, take it off the LRU (unless it is
 * on a shrink list), unhash it, detach the inode if there is one, call
 * ->d_release() if provided, unlink it from its siblings and free it -
 * unless it is still on a shrink list, in which case freeing is skipped
 * here.
 *
 * Entered with dentry->d_lock held.  The lock is dropped before
 * ->d_release() is called (directly for a negative dentry; presumably by
 * dentry_unlink_inode() otherwise) and retaken for the final unlisting.
 *
 * Returns the parent, with parent->d_lock held, if dropping one count on
 * the parent brought its refcount to zero.  Otherwise returns NULL: either
 * there is no parent (IS_ROOT) or the parent still has other references
 * and its lock has been released.
 */
static struct dentry *__dentry_kill(struct dentry *dentry)
{
        struct dentry *parent = NULL;
        bool can_free = true;

        /*
         * The dentry is now unrecoverably dead to the world.
         */
        lockref_mark_dead(&dentry->d_lockref);

        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
        if (dentry->d_flags & DCACHE_OP_PRUNE)
                dentry->d_op->d_prune(dentry);

        /*
         * On the LRU proper: remove it.  If it is on somebody's shrink
         * list instead, leave the list linkage alone.
         */
        if (dentry->d_flags & DCACHE_LRU_LIST) {
                if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
                        d_lru_del(dentry);
        }
        /* if it was on the hash then remove it */
        __d_drop(dentry);
        if (dentry->d_inode)
                dentry_unlink_inode(dentry);
        else
                spin_unlock(&dentry->d_lock);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);

        cond_resched();
        /* now that it's negative, ->d_parent is stable */
        if (!IS_ROOT(dentry)) {
                parent = dentry->d_parent;
                spin_lock(&parent->d_lock);
        }
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        dentry_unlist(dentry);
        /*
         * Still on a shrink list: do not free it here.  shrink_dentry_list()
         * frees entries it finds marked DCACHE_DENTRY_KILLED.
         */
        if (dentry->d_flags & DCACHE_SHRINK_LIST)
                can_free = false;
        spin_unlock(&dentry->d_lock);
        if (likely(can_free))
                dentry_free(dentry);
        /* Drop one count on the parent; done if others remain. */
        if (parent && --parent->d_lockref.count) {
                spin_unlock(&parent->d_lock);
                return NULL;
        }
        return parent;
}

/*
 * Lock a dentry for feeding it to __dentry_kill().
 * Called under rcu_read_lock() and dentry->d_lock; the former
 * guarantees that nothing we access will be freed under us.
 * Note that dentry is *not* protected from concurrent dentry_kill(),
 * d_delete(), etc.
 *
 * Return false if dentry is busy.  Otherwise, return true and have
 * that dentry's inode locked.
 */

static bool lock_for_kill(struct dentry *dentry)
{
        struct inode *inode = dentry->d_inode;

        /* Somebody holds a reference (or it is already dead): busy. */
        if (unlikely(dentry->d_lockref.count))
                return false;

        /* Negative dentry, or ->i_lock obtained without blocking: done. */
        if (!inode || likely(spin_trylock(&inode->i_lock)))
                return true;

        /*
         * Trylock failed.  Drop ->d_lock, take ->i_lock first and then
         * ->d_lock again, and check that ->d_inode did not change while
         * ->d_lock was not held.  If it did, release the stale inode's lock
         * and repeat with the new inode; stop if the dentry went negative.
         */
        do {
                spin_unlock(&dentry->d_lock);
                spin_lock(&inode->i_lock);
                spin_lock(&dentry->d_lock);
                if (likely(inode == dentry->d_inode))
                        break;
                spin_unlock(&inode->i_lock);
                inode = dentry->d_inode;
        } while (inode);
        /* ->d_lock was dropped above, so the refcount must be rechecked. */
        if (likely(!dentry->d_lockref.count))
                return true;
        if (inode)
                spin_unlock(&inode->i_lock);
        return false;
}

/*
 * Decide if dentry is worth retaining.  Usually this is called with dentry
 * locked; if not locked, we are more limited and might not be able to tell
 * without a lock.  False in this case means "punt to locked path and recheck".
 *
 * In case we aren't locked, these predicates are not "stable". However, it is
 * sufficient that at some point after we dropped the reference the dentry was
 * hashed and the flags had the proper value. Other dentry users may have
 * re-gotten a reference to the dentry and change that, but our work is done -
 * we can leave the dentry around with a zero refcount.
 *
 * @locked tells whether the caller holds dentry->d_lock.  Returns true if
 * the dentry should be kept.  With @locked set, a true return may have had
 * side effects: the dentry was put on the LRU, or DCACHE_REFERENCED was set.
 * With @locked clear nothing is ever modified.
 */
static inline bool retain_dentry(struct dentry *dentry, bool locked)
{
        unsigned int d_flags;

        // Take one snapshot of the flags; every flag test below uses it.
        smp_rmb();
        d_flags = READ_ONCE(dentry->d_flags);

        // Unreachable? Nobody would be able to look it up, no point retaining
        if (unlikely(d_unhashed(dentry)))
                return false;

        // Same if it's disconnected
        if (unlikely(d_flags & DCACHE_DISCONNECTED))
                return false;

        // ->d_delete() might tell us not to bother, but that requires
        // ->d_lock; can't decide without it
        if (unlikely(d_flags & DCACHE_OP_DELETE)) {
                if (!locked || dentry->d_op->d_delete(dentry))
                        return false;
        }

        // Explicitly told not to bother
        if (unlikely(d_flags & DCACHE_DONTCACHE))
                return false;

        // At this point it looks like we ought to keep it.  We also might
        // need to do something - put it on LRU if it wasn't there already
        // and mark it referenced if it was on LRU, but not marked yet.
        // Unfortunately, both actions require ->d_lock, so in lockless
        // case we'd have to punt rather than doing those.
        if (unlikely(!(d_flags & DCACHE_LRU_LIST))) {
                if (!locked)
                        return false;
                d_lru_add(dentry);
        } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) {
                if (!locked)
                        return false;
                dentry->d_flags |= DCACHE_REFERENCED;
        }
        return true;
}

/*
 * Set DCACHE_DONTCACHE on every dentry currently aliasing @inode, then set
 * I_DONTCACHE on the inode itself.  inode->i_lock is held across the whole
 * operation; each alias is updated under its own ->d_lock.
 */
void d_mark_dontcache(struct inode *inode)
{
        struct dentry *alias;

        spin_lock(&inode->i_lock);

        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&alias->d_lock);
                alias->d_flags |= DCACHE_DONTCACHE;
                spin_unlock(&alias->d_lock);
        }
        inode->i_state |= I_DONTCACHE;

        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_mark_dontcache);

/*
 * Try to do a lockless dput(), and return whether that was successful.
 *
 * If unsuccessful, we return false, having already taken the dentry lock.
 * In that case refcount is guaranteed to be zero and we have already
 * decided that it's not worth keeping around.
 *
 * The caller needs to hold the RCU read lock, so that the dentry is
 * guaranteed to stay around even if the refcount goes down to zero!
 */
static inline bool fast_dput(struct dentry *dentry)
{
        int ret;

        /*
         * try to decrement the lockref optimistically.
         */
        ret = lockref_put_return(&dentry->d_lockref);

        /*
         * If the lockref_put_return() failed due to the lock being held
         * by somebody else, the fast path has failed. We will need to
         * get the lock, and then check the count again.
         */
        if (unlikely(ret < 0)) {
                spin_lock(&dentry->d_lock);
                /*
                 * A count that is already zero or negative means there is
                 * no reference for us to drop: warn once and report the put
                 * as handled without touching the dentry any further.
                 */
                if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
                        spin_unlock(&dentry->d_lock);
                        return true;
                }
                dentry->d_lockref.count--;
                goto locked;
        }

        /*
         * If we weren't the last ref, we're done.
         */
        if (ret)
                return true;

        /*
         * Can we decide that decrement of refcount is all we needed without
         * taking the lock?  There's a very common case when it's all we need -
         * dentry looks like it ought to be retained and there's nothing else
         * to do.
         */
        if (retain_dentry(dentry, false))
                return true;

        /*
         * Either not worth retaining or we can't tell without the lock.
         * Get the lock, then.  We've already decremented the refcount to 0,
         * but we'll need to re-check the situation after getting the lock.
         */
        spin_lock(&dentry->d_lock);

        /*
         * Did somebody else grab a reference to it in the meantime, and
         * we're no longer the last user after all? Alternatively, somebody
         * else could have killed it and marked it dead. Either way, we
         * don't need to do anything else.
         */
locked:
        if (dentry->d_lockref.count || retain_dentry(dentry, true)) {
                spin_unlock(&dentry->d_lock);
                return true;
        }
        /* Not retained: return with ->d_lock still held for the caller. */
        return false;
}


/* 
 * This is dput
 *
 * This is complicated by the fact that we do not want to put
 * dentries that are no longer on any hash chain on the unused
 * list: we'd much rather just get rid of them immediately.
 *
 * However, that implies that we have to traverse the dentry
 * tree upwards to the parents which might _also_ now be
 * scheduled for deletion (it may have been only waiting for
 * its last child to go away).
 *
 * This tail recursion is done by hand as we don't want to depend
 * on the compiler to always get this right (gcc generally doesn't).
 * Real recursion would eat up our stack space.
 */

/*
 * dput - release a dentry
 * @dentry: dentry to release 
 *
 * Release a dentry. This will drop the usage count and if appropriate
 * call the dentry unlink method as well as removing it from the queues and
 * releasing its resources. If the parent dentries were scheduled for release
 * they too may now get deleted.
 */
void dput(struct dentry *dentry)
{
        /* dput(NULL) is a no-op. */
        if (!dentry)
                return;
        might_sleep();
        rcu_read_lock();
        /* Common case: the reference is dropped and nothing needs killing. */
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        /*
         * fast_dput() returned false, so we hold ->d_lock on a dentry that
         * is not worth keeping.  Kill it, then walk up: __dentry_kill()
         * hands back the parent (locked) only when its refcount dropped to
         * zero, and that parent gets the same retain-or-kill decision.
         */
        while (lock_for_kill(dentry)) {
                rcu_read_unlock();
                dentry = __dentry_kill(dentry);
                if (!dentry)
                        return;
                if (retain_dentry(dentry, true)) {
                        spin_unlock(&dentry->d_lock);
                        return;
                }
                rcu_read_lock();
        }
        /* lock_for_kill() said "busy": leave the dentry alone. */
        rcu_read_unlock();
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dput);

/*
 * Make sure @dentry is on a shrink list.  If it already is on one, leave
 * it there; otherwise take it off the LRU (if it was on it) and add it to
 * @list.
 */
static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
        /* Already queued on some shrink list: nothing to do. */
        if (dentry->d_flags & DCACHE_SHRINK_LIST)
                return;

        if (dentry->d_flags & DCACHE_LRU_LIST)
                d_lru_del(dentry);
        d_shrink_add(dentry, list);
}

void dput_to_list(struct dentry *dentry, struct list_head *list)
{
        rcu_read_lock();
        if (likely(fast_dput(dentry))) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();
        to_shrink_list(dentry, list);
        spin_unlock(&dentry->d_lock);
}

/*
 * Return @dentry's parent with a reference taken on it.
 *
 * A lockless attempt is made first: sample ->d_parent under RCU, try to
 * bump its refcount, and accept the result only if dentry->d_seq did not
 * change in the meantime.  If that fails, fall back to taking the parent's
 * ->d_lock and rechecking ->d_parent under it.
 */
struct dentry *dget_parent(struct dentry *dentry)
{
        int gotref;
        struct dentry *ret;
        unsigned seq;

        /*
         * Do optimistic parent lookup without any
         * locking.
         */
        rcu_read_lock();
        seq = raw_seqcount_begin(&dentry->d_seq);
        ret = READ_ONCE(dentry->d_parent);
        gotref = lockref_get_not_zero(&ret->d_lockref);
        rcu_read_unlock();
        if (likely(gotref)) {
                if (!read_seqcount_retry(&dentry->d_seq, seq))
                        return ret;
                /* ->d_seq changed under us: drop the ref we just took. */
                dput(ret);
        }

repeat:
        /*
         * Don't need rcu_dereference because we re-check it was correct under
         * the lock.
         */
        rcu_read_lock();
        ret = dentry->d_parent;
        spin_lock(&ret->d_lock);
        /* Parent changed before we got its lock: unlock and try again. */
        if (unlikely(ret != dentry->d_parent)) {
                spin_unlock(&ret->d_lock);
                rcu_read_unlock();
                goto repeat;
        }
        rcu_read_unlock();
        /* A zero count on the parent here is treated as a fatal bug. */
        BUG_ON(!ret->d_lockref.count);
        ret->d_lockref.count++;
        spin_unlock(&ret->d_lock);
        return ret;
}
EXPORT_SYMBOL(dget_parent);

/*
 * Grab a reference to the first dentry on @inode's alias list, whatever
 * its state, or return NULL when the list is empty.  The callers visible
 * here invoke this with inode->i_lock held.
 */
static struct dentry * __d_find_any_alias(struct inode *inode)
{
        struct dentry *alias = NULL;

        if (!hlist_empty(&inode->i_dentry)) {
                alias = hlist_entry(inode->i_dentry.first,
                                    struct dentry, d_u.d_alias);
                lockref_get(&alias->d_lockref);
        }
        return alias;
}

/**
 * d_find_any_alias - find any alias for a given inode
 * @inode: inode to find an alias for
 *
 * Takes inode->i_lock and, if @inode has at least one alias, acquires a
 * reference to one of them and returns it.  Returns %NULL when there are
 * no aliases.
 */
struct dentry *d_find_any_alias(struct inode *inode)
{
        struct dentry *alias;

        spin_lock(&inode->i_lock);
        alias = __d_find_any_alias(inode);
        spin_unlock(&inode->i_lock);

        return alias;
}
EXPORT_SYMBOL(d_find_any_alias);

/*
 * Find an alias of @inode and take a reference to it.  For a directory any
 * alias will do; for everything else only a hashed alias qualifies.
 * Returns NULL if nothing suitable is found.  d_find_alias() calls this
 * with inode->i_lock held.
 */
static struct dentry *__d_find_alias(struct inode *inode)
{
        struct dentry *alias;

        if (S_ISDIR(inode->i_mode))
                return __d_find_any_alias(inode);

        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                bool hashed;

                /* The hashed check and the grab both happen under ->d_lock. */
                spin_lock(&alias->d_lock);
                hashed = !d_unhashed(alias);
                if (hashed)
                        dget_dlock(alias);
                spin_unlock(&alias->d_lock);

                if (hashed)
                        return alias;
        }
        return NULL;
}

/**
 * d_find_alias - grab a hashed alias of inode
 * @inode: inode in question
 *
 * If inode has a hashed alias, or is a directory and has any alias,
 * acquire the reference to alias and return it. Otherwise return NULL.
 * Notice that if inode is a directory there can be only one alias and
 * it can be unhashed only if it has no children, or if it is the root
 * of a filesystem, or if the directory was renamed and d_revalidate
 * was the first vfs operation to notice.
 *
 * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
 * any other hashed alias over that one.
 */
struct dentry *d_find_alias(struct inode *inode)
{
        struct dentry *alias;

        /* Unlocked peek: with no aliases at all, skip taking ->i_lock. */
        if (hlist_empty(&inode->i_dentry))
                return NULL;

        spin_lock(&inode->i_lock);
        alias = __d_find_alias(inode);
        spin_unlock(&inode->i_lock);

        return alias;
}
EXPORT_SYMBOL(d_find_alias);

/*
 *  Caller MUST be holding rcu_read_lock() and be guaranteed
 *  that inode won't get freed until rcu_read_unlock().
 */
struct dentry *d_find_alias_rcu(struct inode *inode)
{
        struct hlist_head *l = &inode->i_dentry;
        struct dentry *de = NULL;

        spin_lock(&inode->i_lock);
        // ->i_dentry and ->i_rcu are colocated, but the latter won't be
        // used without having I_FREEING set, which means no aliases left
        if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) {
                if (S_ISDIR(inode->i_mode)) {
                        de = hlist_entry(l->first, struct dentry, d_u.d_alias);
                } else {
                        hlist_for_each_entry(de, l, d_u.d_alias)
                                if (!d_unhashed(de))
                                        break;
                }
        }
        spin_unlock(&inode->i_lock);
        return de;
}

/*
 * Try to kill the dentries associated with @inode: every alias that has a
 * zero refcount is moved to a private shrink list under inode->i_lock, and
 * that list is disposed of once the lock has been dropped.
 *
 * WARNING: you must own a reference to inode.
 */
void d_prune_aliases(struct inode *inode)
{
        struct dentry *alias;
        LIST_HEAD(dispose);

        spin_lock(&inode->i_lock);
        hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
                spin_lock(&alias->d_lock);
                if (alias->d_lockref.count == 0)
                        to_shrink_list(alias, &dispose);
                spin_unlock(&alias->d_lock);
        }
        spin_unlock(&inode->i_lock);

        shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(d_prune_aliases);

/*
 * Kill @victim and then keep going up the tree for as long as
 * __dentry_kill() hands back a parent that can itself be locked for kill.
 *
 * Entered under rcu_read_lock() with @victim already locked for kill; the
 * RCU read lock is released before returning.  A parent that turns out to
 * be busy is simply unlocked.
 */
static inline void shrink_kill(struct dentry *victim)
{
        for (;;) {
                rcu_read_unlock();
                victim = __dentry_kill(victim);
                rcu_read_lock();
                if (!victim || !lock_for_kill(victim))
                        break;
        }
        rcu_read_unlock();

        /* lock_for_kill() failed on this one; we still hold its ->d_lock. */
        if (victim)
                spin_unlock(&victim->d_lock);
}

/*
 * Dispose of every dentry on the shrink @list, working from the tail.
 * Entries that can be locked for kill are killed (possibly followed by
 * their ancestors, see shrink_kill()).  Entries that cannot are just taken
 * off the list, and freed here if they had already been killed by somebody
 * else (DCACHE_DENTRY_KILLED).
 */
void shrink_dentry_list(struct list_head *list)
{
        while (!list_empty(list)) {
                struct dentry *dentry;
                bool already_killed;

                dentry = list_entry(list->prev, struct dentry, d_lru);
                spin_lock(&dentry->d_lock);
                rcu_read_lock();

                if (lock_for_kill(dentry)) {
                        d_shrink_del(dentry);
                        shrink_kill(dentry);
                        continue;
                }

                /* Busy or already dead: only drop it from the list. */
                rcu_read_unlock();
                d_shrink_del(dentry);
                already_killed = dentry->d_flags & DCACHE_DENTRY_KILLED;
                spin_unlock(&dentry->d_lock);
                if (already_killed)
                        dentry_free(dentry);
        }
}

/*
 * list_lru walk callback used by prune_dcache_sb().  @item is the ->d_lru
 * of a dentry on the LRU, @arg is the caller's private dispose list.
 *
 * Returns LRU_SKIP if ->d_lock could not be taken, LRU_ROTATE for a
 * recently referenced dentry (after clearing DCACHE_REFERENCED), and
 * LRU_REMOVED when the dentry was taken off the LRU - either dropped
 * outright because it is in use, or moved to the dispose list.
 */
static enum lru_status dentry_lru_isolate(struct list_head *item,
                struct list_lru_one *lru, void *arg)
{
        struct dentry *dentry = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;

        /*
         * The lru lock is already held here, which inverts the usual
         * lru lock/dentry->d_lock order.  Hence the trylock; on failure
         * just skip the entry.
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        /*
         * Referenced dentries are still in use. If they have active
         * counts, just remove them from the LRU. Otherwise give them
         * another pass through the LRU.
         */
        if (dentry->d_lockref.count) {
                d_lru_isolate(lru, dentry);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        if (!(dentry->d_flags & DCACHE_REFERENCED)) {
                /* Unused and not recently referenced: queue for disposal. */
                d_lru_shrink_move(lru, dentry, dispose);
                spin_unlock(&dentry->d_lock);
                return LRU_REMOVED;
        }

        dentry->d_flags &= ~DCACHE_REFERENCED;
        spin_unlock(&dentry->d_lock);

        /*
         * The list move itself will be made by the common LRU code. At
         * this point, we've dropped the dentry->d_lock but keep the
         * lru lock. This is safe to do, since every list movement is
         * protected by the lru lock even if both locks are held.
         *
         * This is guaranteed by the fact that all LRU management
         * functions are intermediated by the LRU API calls like
         * list_lru_add_obj and list_lru_del_obj. List movements in this
         * file only ever occur through these functions or through
         * callbacks like this one, which are called from the LRU API.
         *
         * The only exceptions to this are functions like
         * shrink_dentry_list, and code that first checks for the
         * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
         * operating only with stack provided lists after they are
         * properly isolated from the main list.  It is thus always a
         * local access.
         */
        return LRU_ROTATE;
}

/**
 * prune_dcache_sb - shrink the dcache
 * @sb: superblock
 * @sc: shrink control, passed to list_lru_shrink_walk()
 *
 * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
 * is done when we need more memory and called from the superblock shrinker
 * function.
 *
 * This function may fail to free any resources if all the dentries are in
 * use.
 */
long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
        LIST_HEAD(dispose);
        long freed;

        freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
                                     dentry_lru_isolate, &dispose);
        shrink_dentry_list(&dispose);
        return freed;
}

/*
 * list_lru walk callback used by shrink_dcache_sb().  Unlike
 * dentry_lru_isolate() it looks at neither the refcount nor
 * DCACHE_REFERENCED: every dentry whose ->d_lock can be taken is moved to
 * the dispose list passed in @arg.
 */
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
                struct list_lru_one *lru, void *arg)
{
        struct dentry *dentry = container_of(item, struct dentry, d_lru);
        struct list_head *dispose = arg;

        /*
         * The lru lock is already held, so taking dentry->d_lock here
         * inverts the usual order.  Use a trylock and skip the entry if
         * the lock is not available.
         */
        if (!spin_trylock(&dentry->d_lock))
                return LRU_SKIP;

        d_lru_shrink_move(lru, dentry, dispose);
        spin_unlock(&dentry->d_lock);

        return LRU_REMOVED;
}


/**
 * shrink_dcache_sb - shrink dcache for a superblock
 * @sb: superblock
 *
 * Shrink the dcache for the specified super block. This is used to free
 * the dcache before unmounting a file system.
 */
void shrink_dcache_sb(struct super_block *sb)
{
        for (;;) {
                LIST_HEAD(dispose);

                /* Walk the LRU in batches: 1024 is passed as the walk limit. */
                list_lru_walk(&sb->s_dentry_lru,
                              dentry_lru_isolate_shrink, &dispose, 1024);
                shrink_dentry_list(&dispose);

                /* Stop only once the LRU is reported empty. */
                if (!list_lru_count(&sb->s_dentry_lru))
                        break;
        }
}
EXPORT_SYMBOL(shrink_dcache_sb);

/**
 * enum d_walk_ret - action to take during tree walk
 * @D_WALK_CONTINUE:        continue walk
 * @D_WALK_QUIT:        quit walk
 * @D_WALK_NORETRY:        quit when retry is needed
 * @D_WALK_SKIP:        skip this dentry and its children
 */
enum d_walk_ret {
        D_WALK_CONTINUE,
        D_WALK_QUIT,
        D_WALK_NORETRY,
        D_WALK_SKIP,
};

/**
 * d_walk - walk the dentry tree
 * @parent:        start of walk
 * @data:        data passed to @enter()
 * @enter:        callback when first entering the dentry
 *
 * The @enter() callbacks are called with d_lock held.
 *
 * @enter() is called for @parent itself first and then for every
 * non-cursor descendant reached.  Its return value steers the walk, see
 * &enum d_walk_ret.  For @parent itself, D_WALK_SKIP ends the walk just
 * like D_WALK_QUIT does.
 */
static void d_walk(struct dentry *parent, void *data,
                   enum d_walk_ret (*enter)(void *, struct dentry *))
{
        struct dentry *this_parent, *dentry;
        unsigned seq = 0;
        enum d_walk_ret ret;
        bool retry = true;

again:
        /*
         * First pass starts with seq == 0; a retry comes back here with
         * seq == 1 (presumably making this take rename_lock instead of
         * just sampling it).
         */
        read_seqbegin_or_lock(&rename_lock, &seq);
        this_parent = parent;
        spin_lock(&this_parent->d_lock);

        ret = enter(data, this_parent);
        switch (ret) {
        case D_WALK_CONTINUE:
                break;
        case D_WALK_QUIT:
        case D_WALK_SKIP:
                goto out_unlock;
        case D_WALK_NORETRY:
                retry = false;
                break;
        }
repeat:
        dentry = d_first_child(this_parent);
resume:
        hlist_for_each_entry_from(dentry, d_sib) {
                /* Cursors are never visited. */
                if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
                        continue;

                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);

                ret = enter(data, dentry);
                switch (ret) {
                case D_WALK_CONTINUE:
                        break;
                case D_WALK_QUIT:
                        spin_unlock(&dentry->d_lock);
                        goto out_unlock;
                case D_WALK_NORETRY:
                        retry = false;
                        break;
                case D_WALK_SKIP:
                        spin_unlock(&dentry->d_lock);
                        continue;
                }

                /*
                 * Descend: the child becomes the new this_parent.  Its
                 * ->d_lock stays held; only the lock-debugging annotation
                 * (dep_map) is released and re-acquired for its new role.
                 */
                if (!hlist_empty(&dentry->d_children)) {
                        spin_unlock(&this_parent->d_lock);
                        spin_release(&dentry->d_lock.dep_map, _RET_IP_);
                        this_parent = dentry;
                        spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
                        goto repeat;
                }
                spin_unlock(&dentry->d_lock);
        }
        /*
         * All done at this level ... ascend and resume the search.
         */
        rcu_read_lock();
ascend:
        if (this_parent != parent) {
                dentry = this_parent;
                this_parent = dentry->d_parent;

                spin_unlock(&dentry->d_lock);
                spin_lock(&this_parent->d_lock);

                /* might go back up the wrong parent if we have had a rename. */
                if (need_seqretry(&rename_lock, seq))
                        goto rename_retry;
                /* go into the first sibling still alive */
                hlist_for_each_entry_continue(dentry, d_sib) {
                        if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
                                rcu_read_unlock();
                                goto resume;
                        }
                }
                /* No live sibling left at this level: go up once more. */
                goto ascend;
        }
        if (need_seqretry(&rename_lock, seq))
                goto rename_retry;
        rcu_read_unlock();

out_unlock:
        spin_unlock(&this_parent->d_lock);
        done_seqretry(&rename_lock, seq);
        return;

rename_retry:
        spin_unlock(&this_parent->d_lock);
        rcu_read_unlock();
        BUG_ON(seq & 1);
        /* A callback returned D_WALK_NORETRY: give up instead of retrying. */
        if (!retry)
                return;
        seq = 1;
        goto again;
}

/* State shared between path_has_submounts() and path_check_mount(). */
struct check_mount {
        /* mount paired with each visited dentry to form the path checked */
        struct vfsmount *mnt;
        /* set to 1 by path_check_mount() once a mountpoint has been found */
        unsigned int mounted;
};

/*
 * d_walk() callback for path_has_submounts(): stop the walk and record the
 * hit as soon as the path made of info->mnt and @dentry is found to be a
 * mountpoint; keep walking otherwise.
 */
static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry)
{
        struct check_mount *info = data;
        struct path path = { .mnt = info->mnt, .dentry = dentry };

        /* Cheap flag test first; most dentries are not mountpoints. */
        if (likely(!d_mountpoint(dentry)))
                return D_WALK_CONTINUE;

        if (!__path_is_mountpoint(&path))
                return D_WALK_CONTINUE;

        info->mounted = 1;
        return D_WALK_QUIT;
}

/**
 * path_has_submounts - check for mounts over a dentry in the
 *                      current namespace.
 * @parent: path to check.
 *
 * Return true if the parent or its subdirectories contain
 * a mount point in the current namespace.
 */
int path_has_submounts(const struct path *parent)
{
        struct check_mount data = { .mnt = parent->mnt, .mounted = 0 };

        read_seqlock_excl(&mount_lock);
        d_walk(parent->dentry, &data, path_check_mount);
        read_sequnlock_excl(&mount_lock);

        return data.mounted;
}
EXPORT_SYMBOL(path_has_submounts);

/*
 * Called by mount code to set a mountpoint and check if the mountpoint is
 * reachable (e.g. NFS can unhash a directory dentry and then the complete
 * subtree can become unreachable).
 *
 * Only one of d_invalidate() and d_set_mounted() must succeed.  For
 * this reason take rename_lock and d_lock on dentry and ancestors.
 */
int d_set_mounted(struct dentry *dentry)
{
        struct dentry *p;
        int ret = -ENOENT;
        write_seqlock(&rename_lock);
        for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) {
                /* Need exclusion wrt. d_invalidate() */
                spin_lock(&p->d_lock);
                if (unlikely(d_unhashed(p))) {
                        spin_unlock(&p->d_lock);
                        goto out;
                }
                spin_unlock(&p->d_lock);
        }
        spin_lock(&dentry->d_lock);
        if (!d_unlinked(dentry)) {
                ret = -EBUSY;
                if (!d_mountpoint(dentry)) {
                        dentry->d_flags |= DCACHE_MOUNTED;
                        ret = 0;
                }
        }
         spin_unlock(&dentry->d_lock);
out:
        write_sequnlock(&rename_lock);
        return ret;
}

/*
 * Search the dentry child list of the specified parent,
 * and move any unused dentries to the end of the unused
 * list for prune_dcache(). We descend to the next level
 * whenever the d_children list is non-empty and continue
 * searching.
 *
 * It returns zero iff there are no unused children,
 * otherwise  it returns the number of children moved to
 * the end of the unused list. This may not be the total
 * number of unused children, because select_parent can
 * drop the lock and return early due to latency
 * constraints.
 */

/* State handed to the select_collect() / select_collect2() d_walk() callbacks. */
struct select_data {
        /* root of the walk; both callbacks skip it */
        struct dentry *start;
        /*
         * The two members share storage: select_collect() counts in
         * ->found, select_collect2() stores a dentry in ->victim.
         */
        union {
                long found;
                struct dentry *victim;
        };
        /* private shrink list the callbacks move unused dentries onto */
        struct list_head dispose;
};

/*
 * d_walk() callback: move unused dentries below data->start onto
 * data->dispose and count in data->found everything that is on a shrink
 * list, has a zero refcount or has a negative refcount.
 */
static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
{
        struct select_data *data = _data;

        /* The starting point itself is never collected or counted. */
        if (data->start == dentry)
                return D_WALK_CONTINUE;

        if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                data->found++;
        } else if (dentry->d_lockref.count <= 0) {
                /* Only a zero count is queued; a negative one is just counted. */
                if (!dentry->d_lockref.count)
                        to_shrink_list(dentry, &data->dispose);
                data->found++;
        }

        /*
         * We can return to the caller if we have found some (this
         * ensures forward progress). We'll be coming back to find
         * the rest.
         */
        if (list_empty(&data->dispose))
                return D_WALK_CONTINUE;

        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/*
 * d_walk() callback used for the second pass of shrink_dcache_parent().
 *
 * Unreferenced dentries that are not on a shrink list are moved to
 * data->dispose.  The first unreferenced dentry that already sits on a
 * shrink list ends the walk: it is returned in data->victim with
 * rcu_read_lock() held, and the caller must release that.
 */
static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
{
        struct select_data *sd = _data;

        /* The starting point of the walk is never a candidate. */
        if (sd->start == dentry)
                return D_WALK_CONTINUE;

        if (!dentry->d_lockref.count) {
                if (dentry->d_flags & DCACHE_SHRINK_LIST) {
                        /* RCU read lock is handed over to the caller. */
                        rcu_read_lock();
                        sd->victim = dentry;
                        return D_WALK_QUIT;
                }
                to_shrink_list(dentry, &sd->dispose);
        }

        /*
         * Keep walking while nothing has been collected.  Once we hold
         * something the caller can make progress with, stop if a
         * reschedule is pending; we will be called again for the rest.
         */
        if (list_empty(&sd->dispose))
                return D_WALK_CONTINUE;

        return need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
}

/**
 * shrink_dcache_parent - prune dcache
 * @parent: parent of entries to prune
 *
 * Prune the dcache to remove unused children of the parent dentry.
 */
void shrink_dcache_parent(struct dentry *parent)
{
        for (;;) {
                struct select_data data = {.start = parent};

                INIT_LIST_HEAD(&data.dispose);
                /* First pass: collect unreferenced descendants on data.dispose. */
                d_walk(parent, &data, select_collect);

                if (!list_empty(&data.dispose)) {
                        /* Got some: dispose of them and rescan from the top. */
                        shrink_dentry_list(&data.dispose);
                        continue;
                }

                cond_resched();
                /* Nothing collected and nothing counted: the subtree is done. */
                if (!data.found)
                        break;
                /*
                 * found and victim share a union, so victim may only be
                 * reset now that found has been tested.
                 */
                data.victim = NULL;
                /*
                 * Second pass: select_collect2() stops at an unreferenced
                 * dentry that is already on a shrink list and returns it in
                 * data.victim after taking rcu_read_lock().
                 */
                d_walk(parent, &data, select_collect2);
                if (data.victim) {
                        spin_lock(&data.victim->d_lock);
                        if (!lock_for_kill(data.victim)) {
                                /*
                                 * Could not lock it for killing: drop d_lock and
                                 * the RCU read lock taken by select_collect2().
                                 */
                                spin_unlock(&data.victim->d_lock);
                                rcu_read_unlock();
                        } else {
                                /*
                                 * NOTE(review): nothing is unlocked on this path
                                 * here; presumably shrink_kill() releases d_lock
                                 * and the RCU read lock - confirm.
                                 */
                                shrink_kill(data.victim);
                        }
                }
                /* select_collect2() may also have collected dentries. */
                if (!list_empty(&data.dispose))
                        shrink_dentry_list(&data.dispose);
        }
}
EXPORT_SYMBOL(shrink_dcache_parent);

/*
 * d_walk() callback run by do_one_tree() after the tree has been shrunk.
 * @_data is the root of the tree being torn down.  Emits a WARN for every
 * childless dentry that is still present, except the root itself when its
 * refcount is exactly 1.  Always continues the walk.
 */
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
        unsigned long ino;

        /*
         * Nothing to report if:
         *  - it has busy descendants; complain about those instead
         *  - it is the root with refcount 1, which is fine
         */
        if (!hlist_empty(&dentry->d_children) ||
            (dentry == _data && dentry->d_lockref.count == 1))
                return D_WALK_CONTINUE;

        ino = dentry->d_inode ? dentry->d_inode->i_ino : 0UL;

        WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} "
                        " still in use (%d) [unmount of %s %s]\n",
             dentry, ino, dentry,
             dentry->d_lockref.count,
             dentry->d_sb->s_type->name,
             dentry->d_sb->s_id);

        return D_WALK_CONTINUE;
}

/*
 * Tear down the tree rooted at @dentry: shrink everything below it,
 * let umount_check() warn about whatever is still there, then
 * d_drop() and dput() the root itself.
 */
static void do_one_tree(struct dentry *dentry)
{
        shrink_dcache_parent(dentry);
        /* @dentry is passed as the callback data so the root can be recognised */
        d_walk(dentry, dentry, umount_check);
        d_drop(dentry);
        dput(dentry);
}

/*
 * destroy the dentries attached to a superblock on unmounting
 */
void shrink_dcache_for_umount(struct super_block *sb)
{
        struct dentry *root;

        rwsem_assert_held_write(&sb->s_umount);

        /* Detach the main root from the superblock before tearing it down. */
        root = sb->s_root;
        sb->s_root = NULL;
        do_one_tree(root);

        /* Then take down every tree still linked on sb->s_roots. */
        while (!hlist_bl_empty(&sb->s_roots)) {
                struct hlist_bl_node *first = hlist_bl_first(&sb->s_roots);
                struct dentry *anon = hlist_bl_entry(first, struct dentry, d_hash);

                do_one_tree(dget(anon));
        }
}

/*
 * d_walk() callback for d_invalidate(): stop at the first mountpoint
 * found, returning it through @_data (a struct dentry **) with a
 * reference taken by dget_dlock().
 */
static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
        struct dentry **victim = _data;

        if (!d_mountpoint(dentry))
                return D_WALK_CONTINUE;

        *victim = dget_dlock(dentry);
        return D_WALK_QUIT;
}

/**
 * d_invalidate - detach submounts, prune dcache, and drop
 * @dentry: dentry to invalidate (aka detach, prune and drop)
 */
void d_invalidate(struct dentry *dentry)
{
        bool had_submounts = false;
        bool already_unhashed;
        struct dentry *victim;

        /* Unhash the dentry, unless somebody already did. */
        spin_lock(&dentry->d_lock);
        already_unhashed = d_unhashed(dentry);
        if (!already_unhashed)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);

        if (already_unhashed)
                return;

        /* Negative dentries can be dropped without further checks */
        if (!dentry->d_inode)
                return;

        shrink_dcache_parent(dentry);

        /* Detach submounts one at a time until the walk finds none. */
        for (;;) {
                victim = NULL;
                d_walk(dentry, &victim, find_submount);
                if (!victim)
                        break;
                had_submounts = true;
                detach_mounts(victim);
                dput(victim);
        }

        /* Shrink once more if any submount was detached on the way. */
        if (had_submounts)
                shrink_dcache_parent(dentry);
}
EXPORT_SYMBOL(d_invalidate);

/**
 * __d_alloc        -        allocate a dcache entry
 * @sb: filesystem it will belong to
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
 
static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
        struct dentry *dentry;
        char *dname;
        int err;

        dentry = kmem_cache_alloc_lru(dentry_cache, &sb->s_dentry_lru,
                                      GFP_KERNEL);
        if (!dentry)
                return NULL;

        /*
         * We guarantee that the inline name is always NUL-terminated.
         * This way the memcpy() done by the name switching in rename
         * will still always have a NUL at the end, even if we might
         * be overwriting an internal NUL character
         */
        dentry->d_shortname.string[DNAME_INLINE_LEN-1] = 0;
        if (unlikely(!name)) {
                /* No name supplied: fall back to slash_name in the inline buffer. */
                name = &slash_name;
                dname = dentry->d_shortname.string;
        } else if (name->len > DNAME_INLINE_LEN-1) {
                /*
                 * Name plus its NUL does not fit inline: allocate a separate
                 * external_name.  name[1] in the offsetof() accounts for the
                 * terminating NUL on top of name->len.
                 */
                size_t size = offsetof(struct external_name, name[1]);
                struct external_name *p = kmalloc(size + name->len,
                                                  GFP_KERNEL_ACCOUNT |
                                                  __GFP_RECLAIMABLE);
                if (!p) {
                        kmem_cache_free(dentry_cache, dentry); 
                        return NULL;
                }
                atomic_set(&p->count, 1);
                dname = p->name;
        } else  {
                dname = dentry->d_shortname.string;
        }        

        /* Copy the name into its final storage and terminate it. */
        dentry->d_name.len = name->len;
        dentry->d_name.hash = name->hash;
        memcpy(dname, name->name, name->len);
        dname[name->len] = 0;

        /* Make sure we always see the terminating NUL character */
        smp_store_release(&dentry->d_name.name, dname); /* ^^^ */

        dentry->d_flags = 0;
        lockref_init(&dentry->d_lockref);
        seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock);
        dentry->d_inode = NULL;
        /* A fresh dentry is its own parent until a caller attaches it. */
        dentry->d_parent = dentry;
        dentry->d_sb = sb;
        /* d_op must be NULL before d_set_d_op(), which warns otherwise. */
        dentry->d_op = NULL;
        dentry->d_fsdata = NULL;
        INIT_HLIST_BL_NODE(&dentry->d_hash);
        INIT_LIST_HEAD(&dentry->d_lru);
        INIT_HLIST_HEAD(&dentry->d_children);
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_HLIST_NODE(&dentry->d_sib);
        d_set_d_op(dentry, dentry->d_sb->s_d_op);

        if (dentry->d_op && dentry->d_op->d_init) {
                err = dentry->d_op->d_init(dentry);
                if (err) {
                        /* ->d_init() failed: undo both allocations. */
                        if (dname_external(dentry))
                                kfree(external_name(dentry));
                        kmem_cache_free(dentry_cache, dentry);
                        return NULL;
                }
        }

        /* Counted only once the dentry is certain to be returned. */
        this_cpu_inc(nr_dentry);

        return dentry;
}

/**
 * d_alloc        -        allocate a dcache entry
 * @parent: parent of entry to allocate
 * @name: qstr of the name
 *
 * Allocates a dentry. It returns %NULL if there is insufficient memory
 * available. On a success the dentry is returned. The name passed in is
 * copied and the copy passed in may be reused after this call.
 */
struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
{
        struct dentry *child;

        child = __d_alloc(parent->d_sb, name);
        if (child) {
                /*
                 * Only the parent's lock is taken: the child was just
                 * allocated and is not subject to concurrency here.
                 */
                spin_lock(&parent->d_lock);
                child->d_parent = dget_dlock(parent);
                hlist_add_head(&child->d_sib, &parent->d_children);
                spin_unlock(&parent->d_lock);
        }

        return child;
}
EXPORT_SYMBOL(d_alloc);

/*
 * Allocate a dentry on @sb without supplying a name; __d_alloc()
 * substitutes slash_name when it is given a NULL name.  Returns NULL
 * if __d_alloc() fails.
 */
struct dentry *d_alloc_anon(struct super_block *sb)
{
        return __d_alloc(sb, NULL);
}
EXPORT_SYMBOL(d_alloc_anon);

struct dentry *d_alloc_cursor(struct dentry * parent)
{
        struct dentry *dentry = d_alloc_anon(parent->d_sb);
        if (dentry) {
                dentry->d_flags |= DCACHE_DENTRY_CURSOR;
                dentry->d_parent = dget(parent);
        }
        return dentry;
}

/**
 * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems)
 * @sb: the superblock
 * @name: qstr of the name
 *
 * For a filesystem that just pins its dentries in memory and never
 * performs lookups at all, return an unhashed IS_ROOT dentry.
 * This is used for pipes, sockets et al. - the stuff that should
 * never be anyone's children or parents.  Unlike all other
 * dentries, these will not have RCU delay between dropping the
 * last reference and freeing them.
 *
 * The only user is alloc_file_pseudo() and that's what should
 * be considered a public interface.  Don't use directly.
 */
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
        static const struct dentry_operations anon_ops = {
                .d_dname = simple_dname
        };
        struct dentry *dentry = __d_alloc(sb, name);
        if (likely(dentry)) {
                dentry->d_flags |= DCACHE_NORCU;
                if (!sb->s_d_op)
                        d_set_d_op(dentry, &anon_ops);
        }
        return dentry;
}

struct dentry *d_alloc_name(struct dentry *parent, const char *name)
{
        struct qstr q;

        q.name = name;
        q.hash_len = hashlen_string(parent, name);
        return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);

/*
 * Install @op as @dentry's operations and set one DCACHE_OP_* flag in
 * d_flags for each method @op provides.  Warns once if the dentry
 * already has operations or already carries any of the DCACHE_OP_*
 * flags checked below.  A NULL @op just stores NULL.
 */
void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
{
        unsigned int op_flags = 0;

        WARN_ON_ONCE(dentry->d_op);
        WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH        |
                                DCACHE_OP_COMPARE        |
                                DCACHE_OP_REVALIDATE        |
                                DCACHE_OP_WEAK_REVALIDATE        |
                                DCACHE_OP_DELETE        |
                                DCACHE_OP_REAL));

        dentry->d_op = op;
        if (!op)
                return;

        /* Gather one flag per implemented method, then apply them together. */
        if (op->d_hash)
                op_flags |= DCACHE_OP_HASH;
        if (op->d_compare)
                op_flags |= DCACHE_OP_COMPARE;
        if (op->d_revalidate)
                op_flags |= DCACHE_OP_REVALIDATE;
        if (op->d_weak_revalidate)
                op_flags |= DCACHE_OP_WEAK_REVALIDATE;
        if (op->d_delete)
                op_flags |= DCACHE_OP_DELETE;
        if (op->d_prune)
                op_flags |= DCACHE_OP_PRUNE;
        if (op->d_real)
                op_flags |= DCACHE_OP_REAL;

        dentry->d_flags |= op_flags;
}
EXPORT_SYMBOL(d_set_d_op);

static unsigned d_flags_for_inode(struct inode *inode)
{
        unsigned add_flags = DCACHE_REGULAR_TYPE;

        if (!inode)
                return DCACHE_MISS_TYPE;

        if (S_ISDIR(inode->i_mode)) {
                add_flags = DCACHE_DIRECTORY_TYPE;
                if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) {
                        if (unlikely(!inode->i_op->lookup))
                                add_flags = DCACHE_AUTODIR_TYPE;
                        else
                                inode->i_opflags |= IOP_LOOKUP;
                }
                goto type_determined;
        }

        if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
                if (unlikely(inode->i_op->get_link)) {
                        add_flags = DCACHE_SYMLINK_TYPE;
                        goto type_determined;
                }
                inode->i_opflags |= IOP_NOFOLLOW;
        }

        if (unlikely(!S_ISREG(inode->i_mode)))
                add_flags = DCACHE_SPECIAL_TYPE;

type_determined:
        if (unlikely(IS_AUTOMOUNT(inode)))
                add_flags |= DCACHE_NEED_AUTOMOUNT;
        return add_flags;
}

/*
 * Attach @inode to @dentry.  Takes dentry->d_lock itself; the callers
 * in this file (d_instantiate(), d_instantiate_new()) hold
 * inode->i_lock around the call.  @inode is dereferenced
 * unconditionally below, so it must not be NULL.
 */
static void __d_instantiate(struct dentry *dentry, struct inode *inode)
{
        unsigned add_flags = d_flags_for_inode(inode);
        WARN_ON(d_in_lookup(dentry));

        spin_lock(&dentry->d_lock);
        /*
         * The negative counter only tracks dentries on the LRU. Don't dec if
         * d_lru is on another list.
         */
        if ((dentry->d_flags &
             (DCACHE_LRU_LIST|DCACHE_SHRINK_LIST)) == DCACHE_LRU_LIST)
                this_cpu_dec(nr_dentry_negative);
        /* Link the dentry into the inode's alias list. */
        hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
        /* Inode and type change inside a d_seq write section. */
        raw_write_seqcount_begin(&dentry->d_seq);
        __d_set_inode_and_type(dentry, inode, add_flags);
        raw_write_seqcount_end(&dentry->d_seq);
        fsnotify_update_flags(dentry);
        spin_unlock(&dentry->d_lock);
}

/**
 * d_instantiate - fill in inode information for a dentry
 * @entry: dentry to complete
 * @inode: inode to attach to this dentry
 *
 * Fill in inode information in the entry.
 *
 * This turns negative dentries into productive full members
 * of society.
 *
 * NOTE! This assumes that the inode count has been incremented
 * (or otherwise set) by the caller to indicate that it is now
 * in use by the dcache.
 */
 
void d_instantiate(struct dentry *entry, struct inode * inode)
{
        /* The dentry must not already be on an inode's alias list. */
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));

        /* Nothing to attach for a NULL inode. */
        if (!inode)
                return;

        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate);

/*
 * This should be equivalent to d_instantiate() + unlock_new_inode(),
 * with lockdep-related part of unlock_new_inode() done before
 * anything else.  Use that instead of open-coding d_instantiate()/
 * unlock_new_inode() combinations.
 */
void d_instantiate_new(struct dentry *entry, struct inode *inode)
{
        /* The dentry must not already be on an alias list; @inode is mandatory. */
        BUG_ON(!hlist_unhashed(&entry->d_u.d_alias));
        BUG_ON(!inode);
        lockdep_annotate_inode_mutex_key(inode);
        security_d_instantiate(entry, inode);
        spin_lock(&inode->i_lock);
        __d_instantiate(entry, inode);
        /* The inode is expected to still be marked new at this point. */
        WARN_ON(!(inode->i_state & I_NEW));
        /* Clear both I_NEW and I_CREATING while still under i_lock. */
        inode->i_state &= ~I_NEW & ~I_CREATING;
        /*
         * Pairs with the barrier in prepare_to_wait_event() to make sure
         * ___wait_var_event() either sees the bit cleared or
         * waitqueue_active() check in wake_up_var() sees the waiter.
         */
        smp_mb();
        inode_wake_up_bit(inode, __I_NEW);
        spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL(d_instantiate_new);

/*
 * Allocate an anonymous dentry on @root_inode's superblock and
 * instantiate it with @root_inode.
 *
 * Returns the new dentry, or NULL if @root_inode is NULL or the
 * allocation fails.  On allocation failure the inode is released with
 * iput().
 */
struct dentry *d_make_root(struct inode *root_inode)
{
        struct dentry *root;

        if (!root_inode)
                return NULL;

        root = d_alloc_anon(root_inode->i_sb);
        if (!root) {
                iput(root_inode);
                return NULL;
        }

        d_instantiate(root, root_inode);
        return root;
}
EXPORT_SYMBOL(d_make_root);

/*
 * Common body of d_obtain_alias() (@disconnected == true) and
 * d_obtain_root() (@disconnected == false).
 *
 * Returns an existing alias of @inode if there is one, otherwise a
 * freshly allocated anonymous dentry attached to @inode.  A NULL
 * @inode gives ERR_PTR(-ESTALE), an IS_ERR @inode is passed through,
 * and allocation failure gives ERR_PTR(-ENOMEM).  Except on those two
 * early returns, the inode reference is dropped with iput() unless the
 * new dentry took it over.
 */
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
        struct super_block *sb;
        struct dentry *new, *res;

        if (!inode)
                return ERR_PTR(-ESTALE);
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        sb = inode->i_sb;

        res = d_find_any_alias(inode); /* existing alias? */
        if (res)
                goto out;

        new = d_alloc_anon(sb);
        if (!new) {
                res = ERR_PTR(-ENOMEM);
                goto out;
        }

        security_d_instantiate(new, inode);
        spin_lock(&inode->i_lock);
        res = __d_find_any_alias(inode); /* recheck under lock */
        if (likely(!res)) { /* still no alias, attach a disconnected dentry */
                unsigned add_flags = d_flags_for_inode(inode);

                if (disconnected)
                        add_flags |= DCACHE_DISCONNECTED;

                spin_lock(&new->d_lock);
                __d_set_inode_and_type(new, inode, add_flags);
                hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
                if (!disconnected) {
                        /* Root variant: link the dentry on sb->s_roots. */
                        hlist_bl_lock(&sb->s_roots);
                        hlist_bl_add_head(&new->d_hash, &sb->s_roots);
                        hlist_bl_unlock(&sb->s_roots);
                }
                spin_unlock(&new->d_lock);
                spin_unlock(&inode->i_lock);
                inode = NULL; /* consumed by new->d_inode */
                res = new;
        } else {
                /* Lost the race: an alias appeared, discard our dentry. */
                spin_unlock(&inode->i_lock);
                dput(new);
        }

 out:
        /* inode is NULL here when the new dentry took over the reference. */
        iput(inode);
        return res;
}

/**
 * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain a dentry for an inode resulting from NFS filehandle conversion or
 * similar open by handle operations.  The returned dentry may be anonymous,
 * or may have a full name (if the inode was already in the cache).
 *
 * When called on a directory inode, we must ensure that the inode only ever
 * has one dentry.  If a dentry is found, that is returned instead of
 * allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is released.
 * To make it easier to use in export operations a %NULL or IS_ERR inode may
 * be passed in and the error will be propagated to the return value,
 * with a %NULL @inode replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_alias(struct inode *inode)
{
        /*
         * disconnected == true: a newly created dentry gets
         * DCACHE_DISCONNECTED and is not added to sb->s_roots.
         */
        return __d_obtain_alias(inode, true);
}
EXPORT_SYMBOL(d_obtain_alias);

/**
 * d_obtain_root - find or allocate a dentry for a given inode
 * @inode: inode to allocate the dentry for
 *
 * Obtain an IS_ROOT dentry for the root of a filesystem.
 *
 * We must ensure that directory inodes only ever have one dentry.  If a
 * dentry is found, that is returned instead of allocating a new one.
 *
 * On successful return, the reference to the inode has been transferred
 * to the dentry.  In case of an error the reference on the inode is
 * released.  A %NULL or IS_ERR inode may be passed in and the error
 * will be propagated to the return value, with a %NULL @inode
 * replaced by ERR_PTR(-ESTALE).
 */
struct dentry *d_obtain_root(struct inode *inode)
{
        /*
         * disconnected == false: a newly created dentry is added to
         * sb->s_roots and does not get DCACHE_DISCONNECTED.
         */
        return __d_obtain_alias(inode, false);
}
EXPORT_SYMBOL(d_obtain_root);

/**
 * d_add_ci - lookup or allocate new dentry with case-exact name
 * @dentry: the negative dentry that was passed to the parent's lookup func
 * @inode:  the inode case-insensitive lookup has found
 * @name:   the case-exact name to be associated with the returned dentry
 *
 * This is to avoid filling the dcache with case-insensitive names to the
 * same inode, only the actual correct case is stored in the dcache for
 * case-insensitive filesystems.
 *
 * For a case-insensitive lookup match and if the case-exact dentry
 * already exists in the dcache, use it and return it.
 *
 * If no entry exists with the exact case name, allocate new dentry with
 * the exact case, and return the spliced entry.
 */
struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
                        struct qstr *name)
{
        struct dentry *found, *res;

        /*
         * If a dentry with the exact-case name is already cached (or the
         * lookup itself failed with an error pointer), hand that back and
         * give up our inode reference.
         */
        found = d_hash_and_lookup(dentry->d_parent, name);
        if (found)
                goto drop_inode;

        /* Otherwise create one, the same way @dentry itself was set up. */
        if (!d_in_lookup(dentry)) {
                found = d_alloc(dentry->d_parent, name);
                if (!found) {
                        found = ERR_PTR(-ENOMEM);
                        goto drop_inode;
                }
        } else {
                found = d_alloc_parallel(dentry->d_parent, name,
                                        dentry->d_wait);
                /* An error, or a dentry that is not in lookup, is returned as is. */
                if (IS_ERR(found) || !d_in_lookup(found))
                        goto drop_inode;
        }

        res = d_splice_alias(inode, found);
        if (!res)
                return found;

        /* d_splice_alias() returned something else: release ours. */
        d_lookup_done(found);
        dput(found);
        return res;

drop_inode:
        iput(inode);
        return found;
}
EXPORT_SYMBOL(d_add_ci);

/**
 * d_same_name - compare a dentry's name with a given name
 * @dentry: the dentry whose name is compared
 * @parent: parent dentry; its ->d_compare() is used if DCACHE_OP_COMPARE is set
 * @name:   the name to compare against
 *
 * Return: true if names are same, or false
 */
bool d_same_name(const struct dentry *dentry, const struct dentry *parent,
                 const struct qstr *name)
{
        /* Filesystem-specific comparison takes over when the parent asks for it. */
        if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
                return parent->d_op->d_compare(dentry,
                                               dentry->d_name.len,
                                               dentry->d_name.name,
                                               name) == 0;

        /* Default: lengths must match, then the bytes are compared. */
        return dentry->d_name.len == name->len &&
               dentry_cmp(dentry, name->name, name->len) == 0;
}
EXPORT_SYMBOL_GPL(d_same_name);

/*
 * This is __d_lookup_rcu() when the parent dentry has
 * DCACHE_OP_COMPARE, which makes things much nastier.
 */
static noinline struct dentry *__d_lookup_rcu_op_compare(
        const struct dentry *parent,
        const struct qstr *name,
        unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        struct hlist_bl_head *b = d_hash(hashlen);
        struct hlist_bl_node *node;
        struct dentry *dentry;

        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                int tlen;
                const char *tname;
                unsigned seq;

seqretry:
                /* Sample d_seq, then run the cheap rejects before the name compare. */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;
                /* Only the hash half of hash_len is checked; ->d_compare() judges the name. */
                if (dentry->d_name.hash != hashlen_hash(hashlen))
                        continue;
                tlen = dentry->d_name.len;
                tname = dentry->d_name.name;
                /* we want a consistent (name,len) pair */
                if (read_seqcount_retry(&dentry->d_seq, seq)) {
                        cpu_relax();
                        goto seqretry;
                }
                if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0)
                        continue;
                /* Match: report the d_seq value sampled for this dentry. */
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * __d_lookup_rcu - search for a dentry (racy, store-free)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * @seqp: returns d_seq value at the point where the dentry was found
 * Returns: dentry, or NULL
 *
 * __d_lookup_rcu is the dcache lookup function for rcu-walk name
 * resolution (store-free path walking) design described in
 * Documentation/filesystems/path-lookup.txt.
 *
 * This is not to be used outside core vfs.
 *
 * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock
 * held, and rcu_read_lock held. The returned dentry must not be stored into
 * without taking d_lock and checking d_seq sequence count against @seq
 * returned here.
 *
 * Alternatively, __d_lookup_rcu may be called again to look up the child of
 * the returned dentry, so long as its parent's seqlock is checked after the
 * child is looked up. Thus, an interlocking stepping of sequence lock checks
 * is formed, giving integrity down the path walk.
 *
 * NOTE! The caller *has* to check the resulting dentry against the sequence
 * number we've returned before using any of the resulting dentry state!
 */
struct dentry *__d_lookup_rcu(const struct dentry *parent,
                                const struct qstr *name,
                                unsigned *seqp)
{
        u64 hashlen = name->hash_len;
        const unsigned char *str = name->name;
        struct hlist_bl_head *b = d_hash(hashlen);
        struct hlist_bl_node *node;
        struct dentry *dentry;

        /*
         * Note: There is significant duplication with __d_lookup which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /* Parents with a ->d_compare() op take the separate slow path. */
        if (unlikely(parent->d_flags & DCACHE_OP_COMPARE))
                return __d_lookup_rcu_op_compare(parent, name, seqp);

        /*
         * The hash list is protected using RCU.
         *
         * Carefully use d_seq when comparing a candidate dentry, to avoid
         * races with d_move().
         *
         * It is possible that concurrent renames can mess up our list
         * walk here and result in missing our dentry, resulting in the
         * false-negative result. d_lookup() protects against concurrent
         * renames using rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {
                unsigned seq;

                /*
                 * The dentry sequence count protects us from concurrent
                 * renames, and thus protects parent and name fields.
                 *
                 * The caller must perform a seqcount check in order
                 * to do anything useful with the returned dentry.
                 *
                 * NOTE! We do a "raw" seqcount_begin here. That means that
                 * we don't wait for the sequence count to stabilize if it
                 * is in the middle of a sequence change. If we do the slow
                 * dentry compare, we will do seqretries until it is stable,
                 * and if we end up with a successful lookup, we actually
                 * want to exit RCU lookup anyway.
                 *
                 * Note that raw_seqcount_begin still *does* smp_rmb(), so
                 * we are still guaranteed NUL-termination of ->d_name.name.
                 */
                seq = raw_seqcount_begin(&dentry->d_seq);
                if (dentry->d_parent != parent)
                        continue;
                if (d_unhashed(dentry))
                        continue;
                /* hash and length are compared together as one 64-bit value */
                if (dentry->d_name.hash_len != hashlen)
                        continue;
                if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
                        continue;
                /* Match: report the d_seq value sampled for this dentry. */
                *seqp = seq;
                return dentry;
        }
        return NULL;
}

/**
 * d_lookup - search for a dentry
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * d_lookup searches the children of the parent dentry for the name in
 * question. If the dentry is found its reference count is incremented and the
 * dentry is returned. The caller must use dput to free the entry when it has
 * finished using it. %NULL is returned if the dentry does not exist.
 */
struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name)
{
        for (;;) {
                unsigned seq = read_seqbegin(&rename_lock);
                struct dentry *hit = __d_lookup(parent, name);

                /*
                 * A hit is final.  A miss is only trusted when no rename
                 * ran during the search; otherwise search again.
                 */
                if (hit || !read_seqretry(&rename_lock, seq))
                        return hit;
        }
}
EXPORT_SYMBOL(d_lookup);

/**
 * __d_lookup - search for a dentry (racy)
 * @parent: parent dentry
 * @name: qstr of name we wish to find
 * Returns: dentry, or NULL
 *
 * __d_lookup is like d_lookup, however it may (rarely) return a
 * false-negative result due to unrelated rename activity.
 *
 * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
 * however it must be used carefully, eg. with a following d_lookup in
 * the case of failure.
 *
 * __d_lookup callers must be commented.
 */
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *b = d_hash(hash);
        struct hlist_bl_node *node;
        struct dentry *found = NULL;
        struct dentry *dentry;

        /*
         * Note: There is significant duplication with __d_lookup_rcu which is
         * required to prevent single threaded performance regressions
         * especially on architectures where smp_rmb (in seqcounts) are costly.
         * Keep the two functions in sync.
         */

        /*
         * The hash list is protected using RCU.
         *
         * Take d_lock when comparing a candidate dentry, to avoid races
         * with d_move().
         *
         * It is possible that concurrent renames can mess up our list
         * walk here and result in missing our dentry, resulting in the
         * false-negative result. d_lookup() protects against concurrent
         * renames using rename_lock seqlock.
         *
         * See Documentation/filesystems/path-lookup.txt for more details.
         */
        rcu_read_lock();
        
        hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) {

                /* Cheap unlocked hash filter before taking d_lock. */
                if (dentry->d_name.hash != hash)
                        continue;

                spin_lock(&dentry->d_lock);
                if (dentry->d_parent != parent)
                        goto next;
                if (d_unhashed(dentry))
                        goto next;

                if (!d_same_name(dentry, parent, name))
                        goto next;

                /* Match: take a reference while d_lock is still held. */
                dentry->d_lockref.count++;
                found = dentry;
                spin_unlock(&dentry->d_lock);
                break;
next:
                spin_unlock(&dentry->d_lock);
         }
         rcu_read_unlock();

         return found;
}

/**
 * d_hash_and_lookup - hash the qstr then search for a dentry
 * @dir: Directory to search in
 * @name: qstr of name we wish to find
 *
 * On lookup failure NULL is returned; on bad name - ERR_PTR(-error)
 */
struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
{
        int rc;

        /*
         * The standard hash is always computed first: a filesystem's
         * d_op->d_hash() routine may choose to leave the hash value
         * unchanged.
         */
        name->hash = full_name_hash(dir, name->name, name->len);

        /* No fs-specific hash function: look up straight away. */
        if (!(dir->d_flags & DCACHE_OP_HASH))
                return d_lookup(dir, name);

        rc = dir->d_op->d_hash(dir, name);
        if (unlikely(rc < 0))
                return ERR_PTR(rc);

        return d_lookup(dir, name);
}
EXPORT_SYMBOL(d_hash_and_lookup);

/*
 * When a file is deleted, we have two options:
 * - turn this dentry into a negative dentry
 * - unhash this dentry and free it.
 *
 * Usually, we want to just turn this into
 * a negative dentry, but if anybody else is
 * currently using the dentry or the inode
 * we can't do that and we fall back on removing
 * it from the hash queues and waiting for
 * it to be deleted later when it has no users
 */
 
/**
 * d_delete - delete a dentry
 * @dentry: The dentry to delete
 *
 * Turn the dentry into a negative dentry if possible, otherwise
 * remove it from the hash queues so it can be deleted later.
 *
 * @dentry->d_inode is dereferenced unconditionally, so the dentry must
 * be positive on entry.  Takes inode->i_lock, then dentry->d_lock.
 */
 
void d_delete(struct dentry * dentry)
{
        struct inode *inode = dentry->d_inode;

        spin_lock(&inode->i_lock);
        spin_lock(&dentry->d_lock);
        /*
         * Are we the only user?
         */
        if (dentry->d_lockref.count == 1) {
                /* Policy switch: unhash as well instead of leaving it hashed. */
                if (dentry_negative_policy)
                        __d_drop(dentry);
                dentry->d_flags &= ~DCACHE_CANT_MOUNT;
                /*
                 * Neither lock is released in this branch; presumably
                 * dentry_unlink_inode() drops both - confirm against its
                 * definition.
                 */
                dentry_unlink_inode(dentry);
        } else {
                /* Other users remain: only unhash, then drop both locks. */
                __d_drop(dentry);
                spin_unlock(&dentry->d_lock);
                spin_unlock(&inode->i_lock);
        }
}
EXPORT_SYMBOL(d_delete);

/*
 * Insert @entry at the head of the hash chain selected by its name hash.
 * The chain lock is held across the RCU-safe insertion.
 */
static void __d_rehash(struct dentry *entry)
{
        struct hlist_bl_head *bucket;

        bucket = d_hash(entry->d_name.hash);

        hlist_bl_lock(bucket);
        hlist_bl_add_head_rcu(&entry->d_hash, bucket);
        hlist_bl_unlock(bucket);
}

/**
 * d_rehash - add an entry back to the hash
 * @entry: dentry to add to the hash
 *
 * Adds a dentry to the hash according to its name.  The insertion is
 * done by __d_rehash() with @entry->d_lock held for its duration.
 */
 
void d_rehash(struct dentry * entry)
{
        spin_lock(&entry->d_lock);
        __d_rehash(entry);
        spin_unlock(&entry->d_lock);
}
EXPORT_SYMBOL(d_rehash);

/*
 * Begin an update of @dir->i_dir_seq.
 *
 * Spins until the counter is even and we succeed in bumping it to the next
 * (odd) value with cmpxchg().  Returns the even value that was replaced;
 * the caller passes it on to end_dir_add().  preempt_disable_nested() is
 * called first and is not undone here.
 */
static inline unsigned start_dir_add(struct inode *dir)
{
        preempt_disable_nested();
        for (;;) {
                unsigned n = dir->i_dir_seq;
                /* odd means another update is in flight; retry */
                if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
                        return n;
                cpu_relax();
        }
}

/*
 * Finish an update begun with start_dir_add().
 *
 * @dir:    directory inode whose ->i_dir_seq is being updated
 * @n:      value returned by start_dir_add()
 * @d_wait: wait queue to wake
 *
 * Publishes @n + 2 (even again) with release semantics, re-enables
 * preemption, then wakes everyone on @d_wait if it has a sleeper.
 */
static inline void end_dir_add(struct inode *dir, unsigned int n,
                               wait_queue_head_t *d_wait)
{
        smp_store_release(&dir->i_dir_seq, n + 2);
        preempt_enable_nested();
        if (wq_has_sleeper(d_wait))
                wake_up_all(d_wait);
}

/*
 * Sleep until @dentry is no longer in-lookup.
 *
 * Entered and left with dentry->d_lock held; the lock is dropped around
 * each schedule().  The task queues itself on dentry->d_wait and sleeps in
 * TASK_UNINTERRUPTIBLE, re-checking d_in_lookup() under the lock after
 * every wakeup.  Returns at once if the dentry is not in-lookup.
 */
static void d_wait_lookup(struct dentry *dentry)
{
        if (d_in_lookup(dentry)) {
                DECLARE_WAITQUEUE(wait, current);
                add_wait_queue(dentry->d_wait, &wait);
                do {
                        /* set the state before unlocking so a wakeup is not lost */
                        set_current_state(TASK_UNINTERRUPTIBLE);
                        spin_unlock(&dentry->d_lock);
                        schedule();
                        spin_lock(&dentry->d_lock);
                } while (d_in_lookup(dentry));
        }
}

/*
 * d_alloc_parallel - find or create the dentry for a lookup of @name in @parent
 * @parent: directory dentry being searched
 * @name:   name to look up; name->hash is used as given
 * @wq:     wait queue recorded in the new dentry (->d_wait) if it ends up
 *          being inserted as in-lookup
 *
 * A new dentry is allocated up front.  Then, restarting from "retry"
 * whenever a race is detected:
 *  - if the dcache already holds a match, that dentry is returned with a
 *    reference taken and the new one is dropped;
 *  - else if a match sits in the in-lookup hash, wait for its lookup to
 *    finish and return it if it is still a hashed match;
 *  - otherwise the new dentry is flagged DCACHE_PAR_LOOKUP, added to the
 *    in-lookup hash and returned.
 *
 * Returns the dentry as described above, or ERR_PTR(-ENOMEM) if the
 * allocation fails.
 */
struct dentry *d_alloc_parallel(struct dentry *parent,
                                const struct qstr *name,
                                wait_queue_head_t *wq)
{
        unsigned int hash = name->hash;
        struct hlist_bl_head *b = in_lookup_hash(parent, hash);
        struct hlist_bl_node *node;
        struct dentry *new = d_alloc(parent, name);
        struct dentry *dentry;
        unsigned seq, r_seq, d_seq;

        if (unlikely(!new))
                return ERR_PTR(-ENOMEM);

retry:
        rcu_read_lock();
        /* sample the directory and rename sequence counters before looking */
        seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
        r_seq = read_seqbegin(&rename_lock);
        dentry = __d_lookup_rcu(parent, name, &d_seq);
        if (unlikely(dentry)) {
                /* found in the dcache: pin it, then validate against d_seq */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }
                if (read_seqcount_retry(&dentry->d_seq, d_seq)) {
                        rcu_read_unlock();
                        dput(dentry);
                        goto retry;
                }
                rcu_read_unlock();
                dput(new);
                return dentry;
        }
        /* a rename ran concurrently with the lookup: the miss is not reliable */
        if (unlikely(read_seqretry(&rename_lock, r_seq))) {
                rcu_read_unlock();
                goto retry;
        }

        /* odd ->i_dir_seq: a start_dir_add()/end_dir_add() section is open */
        if (unlikely(seq & 1)) {
                rcu_read_unlock();
                goto retry;
        }

        hlist_bl_lock(b);
        /* recheck under the chain lock that the directory did not change */
        if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
                hlist_bl_unlock(b);
                rcu_read_unlock();
                goto retry;
        }
        /*
         * No changes for the parent since the beginning of d_lookup().
         * Since all removals from the chain happen with hlist_bl_lock(),
         * any potential in-lookup matches are going to stay here until
         * we unlock the chain.  All fields are stable in everything
         * we encounter.
         */
        hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) {
                if (dentry->d_name.hash != hash)
                        continue;
                if (dentry->d_parent != parent)
                        continue;
                if (!d_same_name(dentry, parent, name))
                        continue;
                hlist_bl_unlock(b);
                /* now we can try to grab a reference */
                if (!lockref_get_not_dead(&dentry->d_lockref)) {
                        rcu_read_unlock();
                        goto retry;
                }

                rcu_read_unlock();
                /*
                 * somebody is likely to be still doing lookup for it;
                 * wait for them to finish
                 */
                spin_lock(&dentry->d_lock);
                d_wait_lookup(dentry);
                /*
                 * it's not in-lookup anymore; in principle we should repeat
                 * everything from dcache lookup, but it's likely to be what
                 * d_lookup() would've found anyway.  If it is, just return it;
                 * otherwise we really have to repeat the whole thing.
                 */
                if (unlikely(dentry->d_name.hash != hash))
                        goto mismatch;
                if (unlikely(dentry->d_parent != parent))
                        goto mismatch;
                if (unlikely(d_unhashed(dentry)))
                        goto mismatch;
                if (unlikely(!d_same_name(dentry, parent, name)))
                        goto mismatch;
                /* OK, it *is* a hashed match; return it */
                spin_unlock(&dentry->d_lock);
                dput(new);
                return dentry;
        }
        rcu_read_unlock();
        /* we can't take ->d_lock here; it's OK, though. */
        new->d_flags |= DCACHE_PAR_LOOKUP;
        new->d_wait = wq;
        hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
        hlist_bl_unlock(b);
        return new;
mismatch:
        /* the dentry we waited for is not usable: drop it and start over */
        spin_unlock(&dentry->d_lock);
        dput(dentry);
        goto retry;
}
EXPORT_SYMBOL(d_alloc_parallel);

/*
 * - Unhash the dentry from the in-lookup hash
 * - Retrieve and clear the waitqueue head in dentry
 * - Return the waitqueue head
 *
 * The caller must hold dentry->d_lock (checked by lockdep).
 * DCACHE_PAR_LOOKUP is cleared while the in-lookup chain lock is held.
 */
static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry)
{
        wait_queue_head_t *d_wait;
        struct hlist_bl_head *b;

        lockdep_assert_held(&dentry->d_lock);

        b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash);
        hlist_bl_lock(b);
        dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
        __hlist_bl_del(&dentry->d_u.d_in_lookup_hash);
        d_wait = dentry->d_wait;
        dentry->d_wait = NULL;
        hlist_bl_unlock(b);
        /*
         * Reinitialise fields for normal use; presumably ->d_u.d_alias
         * shares storage with ->d_u.d_in_lookup_hash - confirm against
         * struct dentry.
         */
        INIT_HLIST_NODE(&dentry->d_u.d_alias);
        INIT_LIST_HEAD(&dentry->d_lru);
        return d_wait;
}

/*
 * Take @dentry out of the in-lookup hash under its ->d_lock and wake up
 * all waiters on the wait queue that __d_lookup_unhash() returns.
 */
void __d_lookup_unhash_wake(struct dentry *dentry)
{
        spin_lock(&dentry->d_lock);
        wake_up_all(__d_lookup_unhash(dentry));
        spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(__d_lookup_unhash_wake);

/*
 * Attach @inode (NULL for a negative dentry) to @dentry and hash it.
 *
 * inode->i_lock held if inode is non-NULL; it is released here, after
 * dentry->d_lock has been dropped.
 */

static inline void __d_add(struct dentry *dentry, struct inode *inode)
{
        wait_queue_head_t *d_wait;
        struct inode *dir = NULL;
        unsigned n;
        spin_lock(&dentry->d_lock);
        /*
         * Still in-lookup: open a directory update and leave the in-lookup
         * hash.  n and d_wait are only set, and only read, when dir != NULL.
         */
        if (unlikely(d_in_lookup(dentry))) {
                dir = dentry->d_parent->d_inode;
                n = start_dir_add(dir);
                d_wait = __d_lookup_unhash(dentry);
        }
        if (inode) {
                unsigned add_flags = d_flags_for_inode(inode);
                hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
                /* inode and type change inside a ->d_seq write section */
                raw_write_seqcount_begin(&dentry->d_seq);
                __d_set_inode_and_type(dentry, inode, add_flags);
                raw_write_seqcount_end(&dentry->d_seq);
                fsnotify_update_flags(dentry);
        }
        __d_rehash(dentry);
        if (dir)
                end_dir_add(dir, n, d_wait);
        spin_unlock(&dentry->d_lock);
        if (inode)
                spin_unlock(&inode->i_lock);
}

/**
 * d_add - add dentry to hash queues
 * @entry: dentry to add
 * @inode: The inode to attach to this dentry (may be NULL)
 *
 * This adds the entry to the hash queues and initializes @inode.
 * The entry was actually filled in earlier during d_alloc().
 *
 * For a non-NULL @inode, security_d_instantiate() is called and
 * inode->i_lock is taken here; __d_add() releases it.
 */

void d_add(struct dentry *entry, struct inode *inode)
{
        if (inode) {
                security_d_instantiate(entry, inode);
                spin_lock(&inode->i_lock);
        }
        __d_add(entry, inode);
}
EXPORT_SYMBOL(d_add);

/*
 * Exchange the names of @dentry and @target.
 *
 * Each dentry's name is either external (dname_external()) or stored
 * inline in ->d_shortname, so all four combinations are handled.  The
 * hash_len words are swapped last, in every case.
 */
static void swap_names(struct dentry *dentry, struct dentry *target)
{
        if (unlikely(dname_external(target))) {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * Both external: swap the pointers
                         */
                        swap(target->d_name.name, dentry->d_name.name);
                } else {
                        /*
                         * dentry:internal, target:external.  Steal target's
                         * storage and make target internal.
                         */
                        dentry->d_name.name = target->d_name.name;
                        target->d_shortname = dentry->d_shortname;
                        target->d_name.name = target->d_shortname.string;
                }
        } else {
                if (unlikely(dname_external(dentry))) {
                        /*
                         * dentry:external, target:internal.  Give dentry's
                         * storage to target and make dentry internal
                         */
                        target->d_name.name = dentry->d_name.name;
                        dentry->d_shortname = target->d_shortname;
                        dentry->d_name.name = dentry->d_shortname.string;
                } else {
                        /*
                         * Both are internal: swap the inline buffers word
                         * by word; the ->d_name.name pointers stay put.
                         */
                        for (int i = 0; i < DNAME_INLINE_WORDS; i++)
                                swap(dentry->d_shortname.words[i],
                                     target->d_shortname.words[i]);
                }
        }
        swap(dentry->d_name.hash_len, target->d_name.hash_len);
}

/*
 * Make @dentry carry the same name as @target.
 *
 * An external target name is shared: its reference count is bumped and
 * the whole ->d_name is copied.  An inline target name is copied into
 * @dentry's own ->d_shortname.  If @dentry had an external name before,
 * its reference is dropped afterwards and the name is freed via
 * kfree_rcu() when that was the last reference.
 */
static void copy_name(struct dentry *dentry, struct dentry *target)
{
        struct external_name *old_name = NULL;
        /* remember the old external name before ->d_name is overwritten */
        if (unlikely(dname_external(dentry)))
                old_name = external_name(dentry);
        if (unlikely(dname_external(target))) {
                atomic_inc(&external_name(target)->count);
                dentry->d_name = target->d_name;
        } else {
                dentry->d_shortname = target->d_shortname;
                dentry->d_name.name = dentry->d_shortname.string;
                dentry->d_name.hash_len = target->d_name.hash_len;
        }
        if (old_name && likely(atomic_dec_and_test(&old_name->count)))
                kfree_rcu(old_name, head);
}

/*
 * __d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 * @exchange: exchange the two dentries
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. Caller must hold
 * rename_lock, the i_mutex of the source and target directories,
 * and the sb->s_vfs_rename_mutex if they differ. See lock_rename().
 *
 * With @exchange false, @dentry takes over @target's parent and name and
 * @target is left unhashed.  With @exchange true, the two dentries trade
 * parents and names and both are rehashed.  Calling this with
 * @dentry == @target warns and does nothing.
 */
static void __d_move(struct dentry *dentry, struct dentry *target,
                     bool exchange)
{
        struct dentry *old_parent, *p;
        wait_queue_head_t *d_wait;
        struct inode *dir = NULL;
        unsigned n;

        WARN_ON(!dentry->d_inode);
        if (WARN_ON(dentry == target))
                return;

        /* moving a dentry underneath itself would create a loop */
        BUG_ON(d_ancestor(target, dentry));
        old_parent = dentry->d_parent;
        /* p != NULL iff old_parent is an ancestor of target */
        p = d_ancestor(old_parent, target);
        /* lock the parents involved; the ancestor, if any, goes first */
        if (IS_ROOT(dentry)) {
                BUG_ON(p);
                spin_lock(&target->d_parent->d_lock);
        } else if (!p) {
                /* target is not a descendent of dentry->d_parent */
                spin_lock(&target->d_parent->d_lock);
                spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED);
        } else {
                BUG_ON(p == dentry);
                spin_lock(&old_parent->d_lock);
                /* p == target means both share old_parent: one lock suffices */
                if (p != target)
                        spin_lock_nested(&target->d_parent->d_lock,
                                        DENTRY_D_LOCK_NESTED);
        }
        spin_lock_nested(&dentry->d_lock, 2);
        spin_lock_nested(&target->d_lock, 3);

        /* n and d_wait are only set, and only read, when dir != NULL */
        if (unlikely(d_in_lookup(target))) {
                dir = target->d_parent->d_inode;
                n = start_dir_add(dir);
                d_wait = __d_lookup_unhash(target);
        }

        write_seqcount_begin(&dentry->d_seq);
        write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);

        /* unhash both */
        if (!d_unhashed(dentry))
                ___d_drop(dentry);
        if (!d_unhashed(target))
                ___d_drop(target);

        /* ... and switch them in the tree */
        dentry->d_parent = target->d_parent;
        if (!exchange) {
                copy_name(dentry, target);
                /* presumably marks target as unhashed for d_unhashed() - confirm */
                target->d_hash.pprev = NULL;
                /* the new parent gains a child reference, the old one loses it */
                dentry->d_parent->d_lockref.count++;
                if (dentry != old_parent) /* wasn't IS_ROOT */
                        WARN_ON(!--old_parent->d_lockref.count);
        } else {
                target->d_parent = old_parent;
                swap_names(dentry, target);
                if (!hlist_unhashed(&target->d_sib))
                        __hlist_del(&target->d_sib);
                hlist_add_head(&target->d_sib, &target->d_parent->d_children);
                __d_rehash(target);
                fsnotify_update_flags(target);
        }
        /* relink dentry under its new parent and hash it under the new name */
        if (!hlist_unhashed(&dentry->d_sib))
                __hlist_del(&dentry->d_sib);
        hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
        __d_rehash(dentry);
        fsnotify_update_flags(dentry);
        fscrypt_handle_d_move(dentry);

        write_seqcount_end(&target->d_seq);
        write_seqcount_end(&dentry->d_seq);

        if (dir)
                end_dir_add(dir, n, d_wait);

        /* dentry->d_parent is now target's original parent */
        if (dentry->d_parent != old_parent)
                spin_unlock(&dentry->d_parent->d_lock);
        if (dentry != old_parent)
                spin_unlock(&old_parent->d_lock);
        spin_unlock(&target->d_lock);
        spin_unlock(&dentry->d_lock);
}

/*
 * d_move - move a dentry
 * @dentry: entry to move
 * @target: new dentry
 *
 * Update the dcache to reflect the move of a file name. Negative
 * dcache entries should not be moved in this way. See the locking
 * requirements for __d_move; rename_lock itself is taken for writing
 * here, around the non-exchange __d_move() call.
 */
void d_move(struct dentry *dentry, struct dentry *target)
{
        write_seqlock(&rename_lock);
        __d_move(dentry, target, false);
        write_sequnlock(&rename_lock);
}
EXPORT_SYMBOL(d_move);

/*
 * d_exchange - exchange two dentries
 * @dentry1: first dentry
 * @dentry2: second dentry
 *
 * Runs __d_move() in exchange mode with rename_lock held for writing.
 * Both dentries are expected to be positive and not IS_ROOT; a violation
 * only triggers a warning, the exchange is still attempted.
 */
void d_exchange(struct dentry *dentry1, struct dentry *dentry2)
{
        write_seqlock(&rename_lock);

        WARN_ON(!dentry1->d_inode);
        WARN_ON(!dentry2->d_inode);
        WARN_ON(IS_ROOT(dentry1));
        WARN_ON(IS_ROOT(dentry2));

        __d_move(dentry1, dentry2, true);

        write_sequnlock(&rename_lock);
}

/**
 * d_ancestor - search for an ancestor
 * @p1: candidate ancestor dentry
 * @p2: dentry whose parent chain is walked
 *
 * Climbs from @p2 towards the root via ->d_parent.  Returns the dentry
 * on that path (possibly @p2 itself) whose parent is @p1, i.e. the child
 * of @p1 that leads to @p2; returns NULL if the root is reached without
 * meeting @p1.
 */
struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
{
        struct dentry *cur = p2;

        while (!IS_ROOT(cur)) {
                if (cur->d_parent == p1)
                        return cur;
                cur = cur->d_parent;
        }

        return NULL;
}

/*
 * This helper attempts to cope with remotely renamed directories
 *
 * It assumes that the caller is already holding
 * dentry->d_parent->d_inode->i_mutex, and rename_lock
 *
 * Note: If ever the locking in lock_rename() changes, then please
 * remember to update this too...
 *
 * Every extra lock is only tried, never waited for.  Returns 0 after
 * moving @alias into @dentry's place, or -ESTALE if any trylock failed
 * (nothing has been moved in that case).
 */
static int __d_unalias(struct dentry *dentry, struct dentry *alias)
{
        struct mutex *m1 = NULL;
        struct rw_semaphore *m2 = NULL;
        int ret = -ESTALE;

        /* If alias and dentry share a parent, then no extra locks required */
        if (alias->d_parent == dentry->d_parent)
                goto out_unalias;

        /* See lock_rename() */
        if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
                goto out_err;
        m1 = &dentry->d_sb->s_vfs_rename_mutex;
        if (!inode_trylock_shared(alias->d_parent->d_inode))
                goto out_err;
        /* the pointer is taken now; __d_move() changes alias->d_parent */
        m2 = &alias->d_parent->d_inode->i_rwsem;
out_unalias:
        /* optional filesystem veto/lock hook */
        if (alias->d_op && alias->d_op->d_unalias_trylock &&
            !alias->d_op->d_unalias_trylock(alias))
                goto out_err;
        __d_move(alias, dentry, false);
        if (alias->d_op && alias->d_op->d_unalias_unlock)
                alias->d_op->d_unalias_unlock(alias);
        ret = 0;
out_err:
        /* release whatever was acquired, in reverse order */
        if (m2)
                up_read(m2);
        if (m1)
                mutex_unlock(m1);
        return ret;
}

/**
 * d_splice_alias - splice a disconnected dentry into the tree if one exists
 * @inode:  the inode which may have a disconnected dentry
 * @dentry: a negative dentry which we want to point to the inode.
 *
 * If inode is a directory and has an IS_ROOT alias, then d_move that in
 * place of the given dentry and return it, else simply d_add the inode
 * to the dentry and return NULL.
 *
 * If a non-IS_ROOT directory is found, the filesystem is corrupt, and
 * we should error out: directories can't have multiple aliases.
 *
 * This is needed in the lookup routine of any filesystem that is exportable
 * (via knfsd) so that we can build dcache paths to directories effectively.
 *
 * If a dentry was found and moved, then it is returned.  Otherwise NULL
 * is returned.  This matches the expected return value of ->lookup.
 *
 * Cluster filesystems may call this function with a negative, hashed dentry.
 * In that case, we know that the inode will be a regular file, and also this
 * will only occur during atomic_open. So we need to check for the dentry
 * being already hashed only in the final case.
 *
 * NOTE(review): the BUG_ON(!d_unhashed(dentry)) below fires for any hashed
 * @dentry once @inode is known not to be an error pointer, which appears
 * to contradict the paragraph above - confirm which one is current.
 *
 * Error returns, as implemented below:
 *  - an IS_ERR() @inode is passed straight back via ERR_CAST();
 *  - ERR_PTR(-ELOOP) if the existing directory alias is an ancestor of
 *    @dentry (splicing would create a loop);
 *  - ERR_PTR(err) if a non-IS_ROOT alias exists and __d_unalias() fails.
 * Whenever a directory alias is found, iput() is called on @inode before
 * returning.
 */
struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
{
        if (IS_ERR(inode))
                return ERR_CAST(inode);

        BUG_ON(!d_unhashed(dentry));

        /* NULL inode: just hash @dentry as negative */
        if (!inode)
                goto out;

        security_d_instantiate(dentry, inode);
        spin_lock(&inode->i_lock);
        if (S_ISDIR(inode->i_mode)) {
                struct dentry *new = __d_find_any_alias(inode);
                if (unlikely(new)) {
                        /* The reference to new ensures it remains an alias */
                        spin_unlock(&inode->i_lock);
                        write_seqlock(&rename_lock);
                        if (unlikely(d_ancestor(new, dentry))) {
                                write_sequnlock(&rename_lock);
                                dput(new);
                                new = ERR_PTR(-ELOOP);
                                pr_warn_ratelimited(
                                        "VFS: Lookup of '%s' in %s %s"
                                        " would have caused loop\n",
                                        dentry->d_name.name,
                                        inode->i_sb->s_type->name,
                                        inode->i_sb->s_id);
                        } else if (!IS_ROOT(new)) {
                                /* pin the old parent across the move; dropped below */
                                struct dentry *old_parent = dget(new->d_parent);
                                int err = __d_unalias(dentry, new);
                                write_sequnlock(&rename_lock);
                                if (err) {
                                        dput(new);
                                        new = ERR_PTR(err);
                                }
                                dput(old_parent);
                        } else {
                                /* disconnected (IS_ROOT) alias: move it into place */
                                __d_move(new, dentry, false);
                                write_sequnlock(&rename_lock);
                        }
                        iput(inode);
                        return new;
                }
        }
out:
        /* inode->i_lock is still held here if inode != NULL; __d_add() drops it */
        __d_add(dentry, inode);
        return NULL;
}
EXPORT_SYMBOL(d_splice_alias);

/*
 * Test whether new_dentry is a subdirectory of old_dentry.
 *
 * Trivially implemented using the dcache structure
 */

/**
 * is_subdir - is new dentry a subdirectory of old_dentry
 * @new_dentry: new dentry
 * @old_dentry: old dentry
 *
 * Returns true if new_dentry is old_dentry itself or a descendant of
 * old_dentry (at any depth).
 * Returns false otherwise.
 * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
 */
  
bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
{
        bool subdir;
        unsigned seq;

        if (new_dentry == old_dentry)
                return true;

        /* Access d_parent under rcu as d_move() may change it. */
        rcu_read_lock();
        seq = read_seqbegin(&rename_lock);
        /* d_ancestor() returns a pointer; non-NULL converts to true */
        subdir = d_ancestor(old_dentry, new_dentry);
         /* Try lockless once... */
        if (read_seqretry(&rename_lock, seq)) {
                /* ...else acquire lock for progress even on deep chains. */
                read_seqlock_excl(&rename_lock);
                subdir = d_ancestor(old_dentry, new_dentry);
                read_sequnlock_excl(&rename_lock);
        }
        rcu_read_unlock();
        return subdir;
}
EXPORT_SYMBOL(is_subdir);

/*
 * d_walk() callback used by d_genocide().
 *
 * @data is the root of the walk and is never touched itself.  Unhashed
 * or negative dentries are skipped.  Every other dentry has its
 * reference count dropped by one, exactly once: DCACHE_GENOCIDE marks
 * the dentries that have already been processed.
 */
static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry)
{
        struct dentry *root = data;

        if (dentry == root)
                return D_WALK_CONTINUE;

        if (d_unhashed(dentry) || !dentry->d_inode)
                return D_WALK_SKIP;

        /* already accounted for on an earlier visit */
        if (dentry->d_flags & DCACHE_GENOCIDE)
                return D_WALK_CONTINUE;

        dentry->d_flags |= DCACHE_GENOCIDE;
        dentry->d_lockref.count--;
        return D_WALK_CONTINUE;
}

/*
 * Run d_genocide_kill() over the tree walked by d_walk() from @parent.
 * @parent is also passed as the callback data, so the callback leaves
 * @parent itself untouched.
 */
void d_genocide(struct dentry *parent)
{
        d_walk(parent, parent, d_genocide_kill);
}

/*
 * Rename the dentry of @file to "#<inode number of @inode>".
 *
 * The dentry must use inline name storage, must have an unhashed
 * ->d_u.d_alias and must satisfy d_unlinked(); anything else is a BUG.
 * The new name is formatted directly into ->d_shortname.string, and the
 * sprintf() result becomes ->d_name.len.  Both the parent's and the
 * dentry's ->d_lock are held during the update.
 */
void d_mark_tmpfile(struct file *file, struct inode *inode)
{
        struct dentry *dentry = file->f_path.dentry;

        BUG_ON(dname_external(dentry) ||
                !hlist_unhashed(&dentry->d_u.d_alias) ||
                !d_unlinked(dentry));
        spin_lock(&dentry->d_parent->d_lock);
        spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
        dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu",
                                (unsigned long long)inode->i_ino);
        spin_unlock(&dentry->d_lock);
        spin_unlock(&dentry->d_parent->d_lock);
}
EXPORT_SYMBOL(d_mark_tmpfile);

/*
 * Set up the dentry of @file as a tmpfile for @inode: drop one link from
 * @inode, give the dentry its "#<ino>" name via d_mark_tmpfile(), then
 * attach @inode with d_instantiate().
 */
void d_tmpfile(struct file *file, struct inode *inode)
{
        struct dentry *dentry = file->f_path.dentry;

        inode_dec_link_count(inode);
        d_mark_tmpfile(file, inode);
        d_instantiate(dentry, inode);
}
EXPORT_SYMBOL(d_tmpfile);

/*
 * Obtain inode number of the parent dentry.
 *
 * A lockless attempt is made first under RCU: the parent's inode number
 * is sampled and accepted if dentry->d_seq did not change meanwhile.  If
 * the parent had no inode at that instant, or the sequence check fails,
 * the value is re-read with dentry->d_lock held.
 */
ino_t d_parent_ino(struct dentry *dentry)
{
        struct dentry *parent;
        struct inode *iparent;
        unsigned seq;
        ino_t ret;

        scoped_guard(rcu) {
                seq = raw_seqcount_begin(&dentry->d_seq);
                parent = READ_ONCE(dentry->d_parent);
                iparent = d_inode_rcu(parent);
                if (likely(iparent)) {
                        ret = iparent->i_ino;
                        /* unchanged d_seq: the sample is consistent */
                        if (!read_seqcount_retry(&dentry->d_seq, seq))
                                return ret;
                }
        }

        /* slow path: ->d_lock is held while d_parent is dereferenced */
        spin_lock(&dentry->d_lock);
        ret = dentry->d_parent->d_inode->i_ino;
        spin_unlock(&dentry->d_lock);
        return ret;
}
EXPORT_SYMBOL(d_parent_ino);

/* Entry count requested with "dhash_entries=" on the command line. */
static __initdata unsigned long dhash_entries;

/*
 * __setup() handler for "dhash_entries=".  The value is parsed with base
 * auto-detection (base 0).  Returns 1 once a value has been stored, 0 if
 * no string was supplied.
 */
static int __init set_dhash_entries(char *str)
{
        if (str) {
                dhash_entries = simple_strtoul(str, &str, 0);
                return 1;
        }
        return 0;
}
__setup("dhash_entries=", set_dhash_entries);

/*
 * Early allocation of the dentry hash table (HASH_EARLY).  Does nothing
 * when hashdist is set; dcache_init() allocates the table in that case.
 */
static void __init dcache_init_early(void)
{
        /* If hashes are distributed across NUMA nodes, defer
         * hash allocation until vmalloc space is available.
         */
        if (hashdist)
                return;

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_EARLY | HASH_ZERO,
                                        &d_hash_shift,
                                        NULL,
                                        0,
                                        0);
        /*
         * Presumably alloc_large_system_hash() stores log2 of the table
         * size; this turns it into a right-shift for a 32-bit hash -
         * confirm.
         */
        d_hash_shift = 32 - d_hash_shift;

        runtime_const_init(shift, d_hash_shift);
        runtime_const_init(ptr, dentry_hashtable);
}

/*
 * Create the dentry slab cache and, when hashdist is set (i.e. when
 * dcache_init_early() skipped it), allocate the dentry hash table.
 */
static void __init dcache_init(void)
{
        /*
         * A constructor could be added for stable state like the lists,
         * but it is probably not worth it because of the cache nature
         * of the dcache.
         */
        dentry_cache = KMEM_CACHE_USERCOPY(dentry,
                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
                d_shortname.string);

        /* Hash may have been set up in dcache_init_early */
        if (!hashdist)
                return;

        dentry_hashtable =
                alloc_large_system_hash("Dentry cache",
                                        sizeof(struct hlist_bl_head),
                                        dhash_entries,
                                        13,
                                        HASH_ZERO,
                                        &d_hash_shift,
                                        NULL,
                                        0,
                                        0);
        /* same conversion as in dcache_init_early() */
        d_hash_shift = 32 - d_hash_shift;

        runtime_const_init(shift, d_hash_shift);
        runtime_const_init(ptr, dentry_hashtable);
}

/*
 * SLAB cache for __getname() consumers.  Created in vfs_caches_init()
 * with PATH_MAX-sized objects.
 */
struct kmem_cache *names_cachep __ro_after_init;
EXPORT_SYMBOL(names_cachep);

/*
 * Early VFS cache setup: initialise every bucket of the in-lookup hash
 * table, then run the early dcache and inode initialisers.
 */
void __init vfs_caches_init_early(void)
{
        int i;

        for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++)
                INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]);

        dcache_init_early();
        inode_init_early();
}

/*
 * Main VFS cache setup: create the PATH_MAX-sized "names_cache" slab
 * (the whole object is usercopy-whitelisted, offset 0, size PATH_MAX),
 * then call the dcache, inode, files, mount, block-device and
 * char-device initialisers in that order.
 */
void __init vfs_caches_init(void)
{
        names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0,
                        SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL);

        dcache_init();
        inode_init();
        files_init();
        files_maxfiles_init();
        mnt_init();
        bdev_cache_init();
        chrdev_init();
}




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _DELAYED_CALL_H
#define _DELAYED_CALL_H

/*
 * Poor man's closures: a callback bundled with the one argument it will
 * be invoked with.  A NULL ->fn means there is nothing to call.  (Sanely
 * polymorphic closures with real typechecking would have been nicer,
 * but...)
 */
struct delayed_call {
        void (*fn)(void *);
        void *arg;
};

/* Define an empty delayed_call called @name. */
#define DEFINE_DELAYED_CALL(name) \
        struct delayed_call name = { .fn = NULL, .arg = NULL }

/* Arm @call so that do_delayed_call() will invoke @fn(@arg). */
static inline void set_delayed_call(struct delayed_call *call,
                void (*fn)(void *), void *arg)
{
        *call = (struct delayed_call){ .fn = fn, .arg = arg };
}

/* Invoke the stored callback on its argument; no-op when none is set. */
static inline void do_delayed_call(struct delayed_call *call)
{
        if (!call->fn)
                return;

        call->fn(call->arg);
}

/* Disarm @call.  Only ->fn is reset; ->arg keeps its old value. */
static inline void clear_delayed_call(struct delayed_call *call)
{
        call->fn = NULL;
}
#endif































































































  152 

  152 



































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
// SPDX-License-Identifier: GPL-2.0
/*
 * SafeSetID Linux Security Module
 *
 * Author: Micah Morton <mortonm@chromium.org>
 *
 * Copyright (C) 2018 The Chromium OS Authors.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2, as
 * published by the Free Software Foundation.
 *
 */

#define pr_fmt(fmt) "SafeSetID: " fmt

#include <linux/lsm_hooks.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <uapi/linux/lsm.h>
#include "lsm.h"

/* Flag indicating whether initialization completed */
int safesetid_initialized __initdata;

/*
 * Active UID and GID rulesets.  They are RCU pointers: readers fetch them
 * with rcu_dereference() inside rcu_read_lock() (see
 * setid_policy_lookup()).  A NULL pointer makes the lookup return
 * SIDPOL_DEFAULT.
 */
struct setid_ruleset __rcu *safesetid_setuid_rules;
struct setid_ruleset __rcu *safesetid_setgid_rules;


/*
 * Compute a decision for a transition from @src to @dst under @policy.
 *
 * Only the rules hashed under @src's ID are examined:
 *  - SIDPOL_ALLOWED     as soon as a rule matches both @src and @dst;
 *  - SIDPOL_CONSTRAINED if rules exist for @src but none names @dst, or
 *                       if @policy->type is neither UID nor GID;
 *  - SIDPOL_DEFAULT     if no rule mentions @src at all.
 */
enum sid_policy_type _setid_policy_lookup(struct setid_ruleset *policy,
                kid_t src, kid_t dst)
{
        enum sid_policy_type verdict = SIDPOL_DEFAULT;
        struct setid_rule *rule;

        switch (policy->type) {
        case UID:
                hash_for_each_possible(policy->rules, rule, next, __kuid_val(src.uid)) {
                        if (uid_eq(rule->src_id.uid, src.uid)) {
                                if (uid_eq(rule->dst_id.uid, dst.uid))
                                        return SIDPOL_ALLOWED;
                                verdict = SIDPOL_CONSTRAINED;
                        }
                }
                return verdict;
        case GID:
                hash_for_each_possible(policy->rules, rule, next, __kgid_val(src.gid)) {
                        if (gid_eq(rule->src_id.gid, src.gid)) {
                                if (gid_eq(rule->dst_id.gid, dst.gid))
                                        return SIDPOL_ALLOWED;
                                verdict = SIDPOL_CONSTRAINED;
                        }
                }
                return verdict;
        default:
                /* Should not reach here, report the ID as constrained */
                return SIDPOL_CONSTRAINED;
        }
}

/*
 * Compute a decision for a transition from @src to @dst under the active
 * policy.
 *
 * @new_type selects the UID or GID ruleset, which is looked up under
 * rcu_read_lock().  Any other @new_type yields SIDPOL_CONSTRAINED; a
 * ruleset that is not installed yields SIDPOL_DEFAULT.
 */
static enum sid_policy_type setid_policy_lookup(kid_t src, kid_t dst, enum setid_type new_type)
{
        struct setid_ruleset *active;
        enum sid_policy_type verdict;

        rcu_read_lock();
        switch (new_type) {
        case UID:
                active = rcu_dereference(safesetid_setuid_rules);
                break;
        case GID:
                active = rcu_dereference(safesetid_setgid_rules);
                break;
        default: /* Should not reach here */
                rcu_read_unlock();
                return SIDPOL_CONSTRAINED;
        }

        if (active) {
                /* the ruleset is stamped with the requested type before the lookup */
                active->type = new_type;
                verdict = _setid_policy_lookup(active, src, dst);
        } else {
                verdict = SIDPOL_DEFAULT;
        }
        rcu_read_unlock();

        return verdict;
}

static int safesetid_security_capable(const struct cred *cred,
                                      struct user_namespace *ns,
                                      int cap,
                                      unsigned int opts)
{
        /* We're only interested in CAP_SETUID and CAP_SETGID. */
        if (cap != CAP_SETUID && cap != CAP_SETGID)
                return 0;

        /*
         * If CAP_SET{U/G}ID is currently used for a setid or setgroups syscall, we
         * want to let it go through here; the real security check happens later, in
         * the task_fix_set{u/g}id or task_fix_setgroups hooks.
         */
        if ((opts & CAP_OPT_INSETID) != 0)
                return 0;

        switch (cap) {
        case CAP_SETUID:
                /*
                * If no policy applies to this task, allow the use of CAP_SETUID for
                * other purposes.
                */
                if (setid_policy_lookup((kid_t){.uid = cred->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT)
                        return 0;
                /*
                 * Reject use of CAP_SETUID for functionality other than calling
                 * set*uid() (e.g. setting up userns uid mappings).
                 */
                pr_warn("Operation requires CAP_SETUID, which is not available to UID %u for operations besides approved set*uid transitions\n",
                        __kuid_val(cred->uid));
                return -EPERM;
        case CAP_SETGID:
                /*
                * If no policy applies to this task, allow the use of CAP_SETGID for
                * other purposes.
                */
                if (setid_policy_lookup((kid_t){.gid = cred->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                        return 0;
                /*
                 * Reject use of CAP_SETUID for functionality other than calling
                 * set*gid() (e.g. setting up userns gid mappings).
                 */
                pr_warn("Operation requires CAP_SETGID, which is not available to GID %u for operations besides approved set*gid transitions\n",
                        __kgid_val(cred->gid));
                return -EPERM;
        default:
                /* Error, the only capabilities were checking for is CAP_SETUID/GID */
                return 0;
        }
        return 0;
}

/*
 * Check whether a caller with old credentials @old is allowed to switch to
 * credentials that contain @new_id.
 *
 * @new_type says whether @new_id is a UID or a GID. An ID that already
 * appears as the real, effective or saved ID of @old is always permitted.
 * Otherwise the active policy is consulted for the transition from the old
 * real ID of the same kind (RUID for UID, RGID for GID) to @new_id, and the
 * transition is permitted unless the policy reports SIDPOL_CONSTRAINED.
 *
 * Returns true if permitted. A blocked transition is logged. An invalid
 * @new_type is never permitted.
 */
static bool id_permitted_for_cred(const struct cred *old, kid_t new_id, enum setid_type new_type)
{
        kid_t old_id;
        bool permitted;

        /* If our old creds already had this ID in it, it's fine. */
        if (new_type == UID) {
                if (uid_eq(new_id.uid, old->uid) || uid_eq(new_id.uid, old->euid) ||
                        uid_eq(new_id.uid, old->suid))
                        return true;
                old_id = (kid_t){.uid = old->uid};
        } else if (new_type == GID) {
                if (gid_eq(new_id.gid, old->gid) || gid_eq(new_id.gid, old->egid) ||
                        gid_eq(new_id.gid, old->sgid))
                        return true;
                /*
                 * The GID ruleset is keyed by source GID, so the lookup must
                 * start from the old RGID, matching the pre-check done by the
                 * setgid/setgroups hooks. Using the RUID here would consult
                 * the rules of an unrelated group.
                 */
                old_id = (kid_t){.gid = old->gid};
        } else { /* Error, new_type is an invalid type */
                return false;
        }

        /*
         * Transitions to new IDs require a check against the policy of the
         * old real ID of the same kind.
         */
        permitted =
            setid_policy_lookup(old_id, new_id, new_type) != SIDPOL_CONSTRAINED;

        if (!permitted) {
                /* IDs are unsigned, so print them with %u. */
                if (new_type == UID) {
                        pr_warn("UID transition ((%u,%u,%u) -> %u) blocked\n",
                                __kuid_val(old->uid), __kuid_val(old->euid),
                                __kuid_val(old->suid), __kuid_val(new_id.uid));
                } else {
                        pr_warn("GID transition ((%u,%u,%u) -> %u) blocked\n",
                                __kgid_val(old->gid), __kgid_val(old->egid),
                                __kgid_val(old->sgid), __kgid_val(new_id.gid));
                }
        }
        return permitted;
}

/*
 * Check whether there is either an exception for user under old cred struct to
 * set*uid to user under new cred struct, or the UID transition is allowed (by
 * Linux set*uid rules) even without CAP_SETUID.
 */
static int safesetid_task_fix_setuid(struct cred *new,
                                     const struct cred *old,
                                     int flags)
{

        /* Do nothing if there are no setuid restrictions for our old RUID. */
        if (setid_policy_lookup((kid_t){.uid = old->uid}, INVALID_ID, UID) == SIDPOL_DEFAULT)
                return 0;

        if (id_permitted_for_cred(old, (kid_t){.uid = new->uid}, UID) &&
            id_permitted_for_cred(old, (kid_t){.uid = new->euid}, UID) &&
            id_permitted_for_cred(old, (kid_t){.uid = new->suid}, UID) &&
            id_permitted_for_cred(old, (kid_t){.uid = new->fsuid}, UID))
                return 0;

        /*
         * Kill this process to avoid potential security vulnerabilities
         * that could arise from a missing allowlist entry preventing a
         * privileged process from dropping to a lesser-privileged one.
         */
        force_sig(SIGKILL);
        return -EACCES;
}

static int safesetid_task_fix_setgid(struct cred *new,
                                     const struct cred *old,
                                     int flags)
{

        /* Do nothing if there are no setgid restrictions for our old RGID. */
        if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        if (id_permitted_for_cred(old, (kid_t){.gid = new->gid}, GID) &&
            id_permitted_for_cred(old, (kid_t){.gid = new->egid}, GID) &&
            id_permitted_for_cred(old, (kid_t){.gid = new->sgid}, GID) &&
            id_permitted_for_cred(old, (kid_t){.gid = new->fsgid}, GID))
                return 0;

        /*
         * Kill this process to avoid potential security vulnerabilities
         * that could arise from a missing allowlist entry preventing a
         * privileged process from dropping to a lesser-privileged one.
         */
        force_sig(SIGKILL);
        return -EACCES;
}

static int safesetid_task_fix_setgroups(struct cred *new, const struct cred *old)
{
        int i;

        /* Do nothing if there are no setgid restrictions for our old RGID. */
        if (setid_policy_lookup((kid_t){.gid = old->gid}, INVALID_ID, GID) == SIDPOL_DEFAULT)
                return 0;

        get_group_info(new->group_info);
        for (i = 0; i < new->group_info->ngroups; i++) {
                if (!id_permitted_for_cred(old, (kid_t){.gid = new->group_info->gid[i]}, GID)) {
                        put_group_info(new->group_info);
                        /*
                         * Kill this process to avoid potential security vulnerabilities
                         * that could arise from a missing allowlist entry preventing a
                         * privileged process from dropping to a lesser-privileged one.
                         */
                        force_sig(SIGKILL);
                        return -EACCES;
                }
        }

        put_group_info(new->group_info);
        return 0;
}

/* LSM identity (name and numeric ID) passed to security_add_hooks(). */
static const struct lsm_id safesetid_lsmid = {
        .name = "safesetid",
        .id = LSM_ID_SAFESETID,
};

/*
 * Hook table registered by safesetid_security_init(): the three
 * task_fix_set* hooks that vet ID changes, plus the capable hook.
 */
static struct security_hook_list safesetid_security_hooks[] = {
        LSM_HOOK_INIT(task_fix_setuid, safesetid_task_fix_setuid),
        LSM_HOOK_INIT(task_fix_setgid, safesetid_task_fix_setgid),
        LSM_HOOK_INIT(task_fix_setgroups, safesetid_task_fix_setgroups),
        LSM_HOOK_INIT(capable, safesetid_security_capable)
};

/*
 * LSM init callback: register the SafeSetID hook table and record that
 * initialization ran. Always returns 0.
 */
static int __init safesetid_security_init(void)
{
        security_add_hooks(safesetid_security_hooks,
                           ARRAY_SIZE(safesetid_security_hooks),
                           &safesetid_lsmid);

        /* Report that SafeSetID successfully initialized */
        safesetid_initialized = 1;

        return 0;
}

/* LSM descriptor: names the module and points at its init callback. */
DEFINE_LSM(safesetid_security_init) = {
        .init = safesetid_security_init,
        .name = "safesetid",
};






























  220 













  257 
   57 










   57 
















  140 

















  257 

   48 


  220 







































































  141 







  256 


  254 



  256 







  220 






  220 





  141 

  141 

































  218 


  152 
  115 









  221 







  221 

   48 


  220 



  220 






  218 





  143 



   32 


  141 
  141 


  141 
  141 



  141 









































  221 

  220 


   48 
   48 
   48 




















  143 

  143 




   32 
   32 












  321 




















  255 









  321 





   24 




























  322 
















































  322 








































































































  478 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H

#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/mm_types.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/userfaultfd_k.h>
#include <linux/swapops.h>

/**
 * folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
 * @folio: The folio to test.
 *
 * We would like to get this info without a page flag, but the state
 * needs to survive until the folio is last deleted from the LRU, which
 * could be as far down as __page_cache_release.
 *
 * Return: An integer (not a boolean!) used to sort a folio onto the
 * right LRU list and to account folios correctly.
 * 1 if @folio is a regular filesystem backed page cache folio
 * or a lazily freed anonymous folio (e.g. via MADV_FREE).
 * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise
 * ram or swap backed folio.
 */
static inline int folio_is_file_lru(struct folio *folio)
{
        /* Swap-backed folios are the "0" case; everything else is "1". */
        if (folio_test_swapbacked(folio))
                return 0;

        return 1;
}

/* Page-based wrapper: answer folio_is_file_lru() for the folio holding @page. */
static inline int page_is_file_lru(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_is_file_lru(folio);
}

/*
 * Apply @nr_pages (may be negative) to the lruvec counter and the zone
 * counter for list @lru in zone @zid. The caller must hold
 * lruvec->lru_lock.
 */
static __always_inline void __update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        struct pglist_data *node;

        lockdep_assert_held(&lruvec->lru_lock);
        /* Warn when the delta does not survive a round trip through int. */
        WARN_ON_ONCE(nr_pages != (int)nr_pages);

        node = lruvec_pgdat(lruvec);

        __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
        __mod_zone_page_state(&node->node_zones[zid],
                                NR_ZONE_LRU_BASE + lru, nr_pages);
}

/*
 * Adjust the size accounting of list @lru in zone @zid by @nr_pages. When
 * CONFIG_MEMCG is set, the same delta is also forwarded to
 * mem_cgroup_update_lru_size().
 */
static __always_inline void update_lru_size(struct lruvec *lruvec,
                                enum lru_list lru, enum zone_type zid,
                                long nr_pages)
{
        __update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
        mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}

/**
 * __folio_clear_lru_flags - Clear page lru flags before releasing a page.
 * @folio: The folio that was on lru and now has a zero reference.
 */
static __always_inline void __folio_clear_lru_flags(struct folio *folio)
{
        VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio);

        __folio_clear_lru(folio);

        /*
         * Active and unevictable together shouldn't happen; in that case the
         * flags are left alone for bad_page(). Otherwise clear both.
         */
        if (!(folio_test_active(folio) && folio_test_unevictable(folio))) {
                __folio_clear_active(folio);
                __folio_clear_unevictable(folio);
        }
}

/**
 * folio_lru_list - Which LRU list should a folio be on?
 * @folio: The folio to test.
 *
 * Return: The LRU list a folio should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list folio_lru_list(struct folio *folio)
{
        enum lru_list lru;

        VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);

        if (folio_test_unevictable(folio))
                return LRU_UNEVICTABLE;

        lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON;
        if (folio_test_active(folio))
                lru += LRU_ACTIVE;

        return lru;
}

#ifdef CONFIG_LRU_GEN

/*
 * lru_gen_enabled() tests the LRU_GEN_CORE static key. The two variants
 * differ only in the key's declared default and the matching branch hint:
 * default-true with static_branch_likely() when CONFIG_LRU_GEN_ENABLED is
 * set, default-false with static_branch_unlikely() otherwise.
 */
#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
        DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);

        return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
}
#else
static inline bool lru_gen_enabled(void)
{
        DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);

        return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
#endif

/* Report the in_lru_fault flag of the current task. */
static inline bool lru_gen_in_fault(void)
{
        return current->in_lru_fault;
}

/* Map sequence number @seq to a generation index by reducing it modulo MAX_NR_GENS. */
static inline int lru_gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}

/* Map sequence number @seq to a history index by reducing it modulo NR_HIST_GENS. */
static inline int lru_hist_from_seq(unsigned long seq)
{
        return seq % NR_HIST_GENS;
}

/*
 * Translate a reference count into a tier index: the last tier when
 * @workingset is set, otherwise order_base_2(@refs).
 */
static inline int lru_tier_from_refs(int refs, bool workingset)
{
        VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH));

        /* see the comment on MAX_NR_TIERS */
        if (workingset)
                return MAX_NR_TIERS - 1;

        return order_base_2(refs);
}

/*
 * Return the access count recorded in @folio's flags: 0 when PG_referenced
 * is clear, otherwise the LRU_REFS field plus one.
 */
static inline int folio_lru_refs(struct folio *folio)
{
        unsigned long snapshot = READ_ONCE(folio->flags);

        /*
         * Return the total number of accesses including PG_referenced. Also see
         * the comment on LRU_REFS_FLAGS.
         */
        if (snapshot & BIT(PG_referenced))
                return ((snapshot & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1;

        return 0;
}

/*
 * Return the generation stored in @folio's flags. The field holds the
 * generation plus one, so an all-zero field yields -1.
 */
static inline int folio_lru_gen(struct folio *folio)
{
        unsigned long snapshot = READ_ONCE(folio->flags);
        unsigned long stored = (snapshot & LRU_GEN_MASK) >> LRU_GEN_PGOFF;

        return stored - 1;
}

/*
 * Tell whether generation @gen of @lruvec counts as active, i.e. whether it
 * is the generation of max_seq or of max_seq - 1.
 */
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
        unsigned long newest_seq = lruvec->lrugen.max_seq;
        int youngest, second_youngest;

        VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);

        /* see the comment on MIN_NR_GENS */
        youngest = lru_gen_from_seq(newest_seq);
        second_youngest = lru_gen_from_seq(newest_seq - 1);

        return gen == youngest || gen == second_youngest;
}

/*
 * Update the page accounting when @folio moves from generation @old_gen to
 * @new_gen. A generation of -1 means "not on a list": old_gen == -1 is an
 * addition and new_gen == -1 is a deletion. Both the per-generation
 * nr_pages counters and the LRU size counters (via __update_lru_size()) are
 * adjusted.
 */
static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio,
                                       int old_gen, int new_gen)
{
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        int delta = folio_nr_pages(folio);
        /* type is 0 or 1, so this selects between list 0 and LRU_INACTIVE_FILE */
        enum lru_list lru = type * LRU_INACTIVE_FILE;
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
        VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);

        /* per-generation counters: take from the old slot, give to the new one */
        if (old_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
                           lrugen->nr_pages[old_gen][type][zone] - delta);
        if (new_gen >= 0)
                WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
                           lrugen->nr_pages[new_gen][type][zone] + delta);

        /* addition */
        if (old_gen < 0) {
                if (lru_gen_is_active(lruvec, new_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, delta);
                return;
        }

        /* deletion */
        if (new_gen < 0) {
                if (lru_gen_is_active(lruvec, old_gen))
                        lru += LRU_ACTIVE;
                __update_lru_size(lruvec, lru, zone, -delta);
                return;
        }

        /* promotion */
        if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
                __update_lru_size(lruvec, lru, zone, -delta);
                __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
        }

        /* demotion requires isolation, e.g., lru_deactivate_fn() */
        VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
}

/*
 * Choose the sequence number @folio should be added at. A generation offset
 * is derived from the folio's flags and @reclaiming, then the result is
 * max_seq - offset + 1, never lower than min_seq for the folio's type.
 */
static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio,
                                              bool reclaiming)
{
        int gen;
        int type = folio_is_file_lru(folio);
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        /*
         * +-----------------------------------+-----------------------------------+
         * | Accessed through page tables and  | Accessed through file descriptors |
         * | promoted by folio_update_gen()    | and protected by folio_inc_gen()  |
         * +-----------------------------------+-----------------------------------+
         * | PG_active (set while isolated)    |                                   |
         * +-----------------+-----------------+-----------------+-----------------+
         * |  PG_workingset  |  PG_referenced  |  PG_workingset  |  LRU_REFS_FLAGS |
         * +-----------------------------------+-----------------------------------+
         * |<---------- MIN_NR_GENS ---------->|                                   |
         * |<---------------------------- MAX_NR_GENS ---------------------------->|
         */
        if (folio_test_active(folio))
                gen = MIN_NR_GENS - folio_test_workingset(folio);
        else if (reclaiming)
                gen = MAX_NR_GENS;
        else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) ||
                 (folio_test_reclaim(folio) &&
                  (folio_test_dirty(folio) || folio_test_writeback(folio))))
                gen = MIN_NR_GENS;
        else
                gen = MAX_NR_GENS - folio_test_workingset(folio);

        /* clamp from below so the result is never older than min_seq[type] */
        return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type]));
}

/*
 * Add @folio to a generation list of @lruvec.
 *
 * Returns false without touching the folio when it is unevictable or when
 * lrugen->enabled is clear, so the caller can fall back to its own list
 * handling. Otherwise the chosen generation is stored in the folio's flags
 * (clearing PG_active in the same update), the size accounting is updated,
 * the folio is linked at the tail (@reclaiming) or head of the list, and
 * true is returned.
 */
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long seq;
        unsigned long flags;
        int gen = folio_lru_gen(folio);
        int type = folio_is_file_lru(folio);
        int zone = folio_zonenum(folio);
        struct lru_gen_folio *lrugen = &lruvec->lrugen;

        /* the folio is expected to carry no generation yet */
        VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);

        if (folio_test_unevictable(folio) || !lrugen->enabled)
                return false;

        seq = lru_gen_folio_seq(lruvec, folio, reclaiming);
        gen = lru_gen_from_seq(seq);
        /* the flags field stores gen + 1 so that zero can mean "no generation" */
        flags = (gen + 1UL) << LRU_GEN_PGOFF;
        /* see the comment on MIN_NR_GENS about PG_active */
        set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags);

        lru_gen_update_size(lruvec, folio, -1, gen);
        /* for folio_rotate_reclaimable() */
        if (reclaiming)
                list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
        else
                list_add(&folio->lru, &lrugen->folios[gen][type][zone]);

        return true;
}

/*
 * Remove @folio from its generation list.
 *
 * Returns false when the folio carries no generation (folio_lru_gen() < 0).
 * Otherwise the generation field is cleared, PG_active is set when the
 * folio was in an active generation and @reclaiming is false, the size
 * accounting is updated, the folio is unlinked, and true is returned.
 */
static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        unsigned long flags;
        int gen = folio_lru_gen(folio);

        if (gen < 0)
                return false;

        VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);

        /* for folio_migrate_flags() */
        flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
        flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags);
        /* recompute gen from the flags value returned by set_mask_bits() */
        gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;

        lru_gen_update_size(lruvec, folio, gen, -1);
        list_del(&folio->lru);

        return true;
}

/* Copy the LRU_REFS_MASK bits of @old's flags into @new's flags. */
static inline void folio_migrate_refs(struct folio *new, struct folio *old)
{
        unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;

        set_mask_bits(&new->flags, LRU_REFS_MASK, refs);
}
#else /* !CONFIG_LRU_GEN */

/*
 * Stubs used when CONFIG_LRU_GEN is not set: the predicates report false,
 * the add/del helpers report that they did nothing, and
 * folio_migrate_refs() is a no-op.
 */
static inline bool lru_gen_enabled(void)
{
        return false;
}

static inline bool lru_gen_in_fault(void)
{
        return false;
}

/* Returning false makes lruvec_add_folio*() use the classic lists. */
static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

/* Returning false makes lruvec_del_folio() use the classic lists. */
static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
{
        return false;
}

static inline void folio_migrate_refs(struct folio *new, struct folio *old)
{

}
#endif /* CONFIG_LRU_GEN */

/*
 * Add @folio to the head of its LRU list in @lruvec. The generation lists
 * are tried first; the classic list is used only when they decline.
 */
static __always_inline
void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio)
{
        /* Computed up front: lru_gen_add_folio() may rewrite PG_active. */
        enum lru_list lru = folio_lru_list(folio);

        if (!lru_gen_add_folio(lruvec, folio, false)) {
                update_lru_size(lruvec, lru, folio_zonenum(folio),
                                folio_nr_pages(folio));
                /* Unevictable folios are counted but not linked. */
                if (lru != LRU_UNEVICTABLE)
                        list_add(&folio->lru, &lruvec->lists[lru]);
        }
}

/*
 * Add @folio to the tail of its LRU list in @lruvec. The generation lists
 * are tried first; the classic list is used only when they decline.
 */
static __always_inline
void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio)
{
        /* Computed up front: lru_gen_add_folio() may rewrite PG_active. */
        enum lru_list lru = folio_lru_list(folio);

        if (!lru_gen_add_folio(lruvec, folio, true)) {
                update_lru_size(lruvec, lru, folio_zonenum(folio),
                                folio_nr_pages(folio));
                /* This is not expected to be used on LRU_UNEVICTABLE */
                list_add_tail(&folio->lru, &lruvec->lists[lru]);
        }
}

/*
 * Remove @folio from its LRU list in @lruvec. The generation lists are
 * tried first; the classic list is used only when they decline.
 */
static __always_inline
void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio)
{
        /* Computed up front: lru_gen_del_folio() may set PG_active. */
        enum lru_list lru = folio_lru_list(folio);

        if (!lru_gen_del_folio(lruvec, folio, false)) {
                /* Unevictable folios are counted but not linked. */
                if (lru != LRU_UNEVICTABLE)
                        list_del(&folio->lru);
                update_lru_size(lruvec, lru, folio_zonenum(folio),
                                -folio_nr_pages(folio));
        }
}

#ifdef CONFIG_ANON_VMA_NAME
/* mmap_lock should be read-locked */
/* Take a reference on @anon_name; a NULL name is ignored. */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name)
{
        if (!anon_name)
                return;

        kref_get(&anon_name->kref);
}

/*
 * Drop a reference on @anon_name, with anon_vma_name_free() as the kref
 * release callback; a NULL name is ignored.
 */
static inline void anon_vma_name_put(struct anon_vma_name *anon_name)
{
        if (!anon_name)
                return;

        kref_put(&anon_name->kref, anon_vma_name_free);
}

/*
 * Return a name usable by another VMA: @anon_name itself with an extra
 * reference, or a fresh copy from anon_vma_name_alloc() when the reference
 * count has reached REFCOUNT_MAX.
 */
static inline
struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name)
{
        /* Prevent anon_name refcount saturation early on */
        if (kref_read(&anon_name->kref) >= REFCOUNT_MAX)
                return anon_vma_name_alloc(anon_name->name);

        anon_vma_name_get(anon_name);
        return anon_name;
}

/*
 * Give @new_vma the same anonymous name as @orig_vma. Nothing is written
 * when @orig_vma has no name.
 */
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma)
{
        struct anon_vma_name *src_name = anon_vma_name(orig_vma);

        if (!src_name)
                return;

        new_vma->anon_name = anon_vma_name_reuse(src_name);
}

/* Drop @vma's reference on its anonymous name, if it has one. */
static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
        /*
         * Not using anon_vma_name because it generates a warning if mmap_lock
         * is not held, which might be the case here.
         */
        anon_vma_name_put(vma->anon_name);
}

/*
 * Compare two anonymous VMA names. Identical pointers (including two NULLs)
 * are equal; a NULL is never equal to a non-NULL name; otherwise the name
 * strings are compared.
 */
static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        if (anon_name1 == anon_name2)
                return true;

        if (!anon_name1 || !anon_name2)
                return false;

        return strcmp(anon_name1->name, anon_name2->name) == 0;
}

#else /* CONFIG_ANON_VMA_NAME */
/*
 * Stubs used when CONFIG_ANON_VMA_NAME is not set: the reference helpers do
 * nothing, and any two names compare equal.
 */
static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {}
static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
                                     struct vm_area_struct *new_vma) {}
static inline void free_anon_vma_name(struct vm_area_struct *vma) {}

static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1,
                                    struct anon_vma_name *anon_name2)
{
        return true;
}

#endif  /* CONFIG_ANON_VMA_NAME */

/* Reset @mm's count of pending TLB flushes to zero. */
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_set(&mm->tlb_flush_pending, 0);
}

/* Note that one more TLB flush is pending for @mm. */
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
        atomic_inc(&mm->tlb_flush_pending);
        /*
         * The only time this value is relevant is when there are indeed pages
         * to flush. And we'll only flush pages after changing them, which
         * requires the PTL.
         *
         * So the ordering here is:
         *
         *        atomic_inc(&mm->tlb_flush_pending);
         *        spin_lock(&ptl);
         *        ...
         *        set_pte_at();
         *        spin_unlock(&ptl);
         *
         *                                spin_lock(&ptl)
         *                                mm_tlb_flush_pending();
         *                                ....
         *                                spin_unlock(&ptl);
         *
         *        flush_tlb_range();
         *        atomic_dec(&mm->tlb_flush_pending);
         *
         * Where the increment is constrained by the PTL unlock, it thus
         * ensures that the increment is visible if the PTE modification is
         * visible. After all, if there is no PTE modification, nobody cares
         * about TLB flushes either.
         *
         * This very much relies on users (mm_tlb_flush_pending() and
         * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
         * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
         * locks (PPC) the unlock of one doesn't order against the lock of
         * another PTL.
         *
         * The decrement is ordered by the flush_tlb_range(), such that
         * mm_tlb_flush_pending() will not return false unless all flushes have
         * completed.
         */
}

/* Note that one pending TLB flush for @mm has completed. */
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * See inc_tlb_flush_pending().
         *
         * This cannot be smp_mb__before_atomic() because smp_mb() simply does
         * not order against TLB invalidate completion, which is what we need.
         *
         * Therefore we must rely on tlb_flush_*() to guarantee order.
         */
        atomic_dec(&mm->tlb_flush_pending);
}

/* Return true when @mm has at least one TLB flush pending. */
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
        /*
         * Must be called after having acquired the PTL; orders against that
         * PTLs release and therefore ensures that if we observe the modified
         * PTE we must also observe the increment from inc_tlb_flush_pending().
         *
         * That is, it only guarantees to return true if there is a flush
         * pending for _this_ PTL.
         */
        return atomic_read(&mm->tlb_flush_pending) != 0;
}

/* Return true when @mm has more than one TLB flush pending. */
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
        /*
         * Similar to mm_tlb_flush_pending(), we must have acquired the PTL
         * for which there is a TLB flush pending in order to guarantee
         * we've seen both that PTE modification and the increment.
         *
         * (no requirement on actually still holding the PTL, that is irrelevant)
         */
        return atomic_read(&mm->tlb_flush_pending) >= 2;
}

#ifdef CONFIG_MMU
/*
 * Computes the pte marker to copy from the given source entry into dst_vma.
 * If no marker should be copied, returns 0.
 * The caller should insert a new pte created with make_pte_marker().
 */
static inline pte_marker copy_pte_marker(
                swp_entry_t entry, struct vm_area_struct *dst_vma)
{
        const pte_marker src = pte_marker_get(entry);
        pte_marker out;

        /* Always copy error entries. */
        out = src & (PTE_MARKER_POISONED | PTE_MARKER_GUARD);

        /* Only copy PTE markers if UFFD register matches. */
        if (!(src & PTE_MARKER_UFFD_WP))
                return out;
        if (!userfaultfd_wp(dst_vma))
                return out;

        return out | PTE_MARKER_UFFD_WP;
}
#endif

/*
 * If this pte is wr-protected by uffd-wp in any form, arm the special pte to
 * replace a none pte.  NOTE!  This should only be called when *pte is already
 * cleared so we will never accidentally replace something valuable.  Meanwhile
 * none pte also means we are not demoting the pte so tlb flushed is not needed.
 * E.g., when pte cleared the caller should have taken care of the tlb flush.
 *
 * Must be called with pgtable lock held so that no thread will see the none
 * pte, and if they see it, they'll fault and serialize at the pgtable lock.
 *
 * Returns true if an uffd-wp pte was installed, false otherwise.
 */
static inline bool
pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr,
                              pte_t *pte, pte_t pteval)
{
#ifdef CONFIG_PTE_MARKER_UFFD_WP
        bool arm_uffd_pte = false;

        /* The current status of the pte should be "cleared" before calling */
        WARN_ON_ONCE(!pte_none(ptep_get(pte)));

        /*
         * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole
         * thing, because when zapping either it means it's dropping the
         * page, or in TTU where the present pte will be quickly replaced
         * with a swap pte.  There's no way of leaking the bit.
         */
        if (vma_is_anonymous(vma) || !userfaultfd_wp(vma))
                return false;

        /* A uffd-wp wr-protected normal pte */
        if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval)))
                arm_uffd_pte = true;

        /*
         * A uffd-wp wr-protected swap pte.  Note: this should even cover an
         * existing pte marker with uffd-wp bit set.
         */
        if (unlikely(pte_swp_uffd_wp_any(pteval)))
                arm_uffd_pte = true;

        /* @pteval carried uffd-wp state: leave a marker in the cleared slot */
        if (unlikely(arm_uffd_pte)) {
                set_pte_at(vma->vm_mm, addr, pte,
                           make_pte_marker(PTE_MARKER_UFFD_WP));
                return true;
        }
#endif
        /* Nothing installed; this is the only path without CONFIG_PTE_MARKER_UFFD_WP. */
        return false;
}

/*
 * Returns false when the vma is flagged VM_SEQ_READ or VM_RAND_READ, or when
 * it is backed by a file opened with FMODE_NOREUSE; true otherwise.
 */
static inline bool vma_has_recency(struct vm_area_struct *vma)
{
        bool access_hinted = vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ);

        if (access_hinted)
                return false;

        return !(vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE));
}

#endif

































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/* SPDX-License-Identifier: GPL-2.0-only */

#ifndef __KVM_MM_H__
#define __KVM_MM_H__ 1

/*
 * Architectures can choose whether to use an rwlock or spinlock
 * for the mmu_lock.  These macros, for use in common code
 * only, avoid using #ifdefs in places that must deal with
 * multiple architectures.
 *
 * With KVM_HAVE_MMU_RWLOCK defined, KVM_MMU_LOCK()/KVM_MMU_UNLOCK() take and
 * release the lock for writing; otherwise they map to the plain spinlock
 * operations.
 */

#ifdef KVM_HAVE_MMU_RWLOCK
#define KVM_MMU_LOCK_INIT(kvm)                rwlock_init(&(kvm)->mmu_lock)
#define KVM_MMU_LOCK(kvm)                write_lock(&(kvm)->mmu_lock)
#define KVM_MMU_UNLOCK(kvm)                write_unlock(&(kvm)->mmu_lock)
#else
#define KVM_MMU_LOCK_INIT(kvm)                spin_lock_init(&(kvm)->mmu_lock)
#define KVM_MMU_LOCK(kvm)                spin_lock(&(kvm)->mmu_lock)
#define KVM_MMU_UNLOCK(kvm)                spin_unlock(&(kvm)->mmu_lock)
#endif /* KVM_HAVE_MMU_RWLOCK */


/*
 * Argument block for hva_to_pfn(): bundles the inputs describing one lookup
 * together with its optional outputs.
 */
struct kvm_follow_pfn {
        /* NOTE(review): presumably the memslot and gfn being resolved - confirm. */
        const struct kvm_memory_slot *slot;
        const gfn_t gfn;

        /* NOTE(review): presumably the host virtual address to look up - confirm. */
        unsigned long hva;

        /* FOLL_* flags modifying lookup behavior, e.g. FOLL_WRITE. */
        unsigned int flags;

        /*
         * Pin the page (effectively FOLL_PIN, which is an mm/ internal flag).
         * The page *must* be pinned if KVM will write to the page via a kernel
         * mapping, e.g. via kmap(), mremap(), etc.
         */
        bool pin;

        /*
         * If non-NULL, try to get a writable mapping even for a read fault.
         * Set to true if a writable mapping was obtained.
         */
        bool *map_writable;

        /*
         * Optional output.  Set to a valid "struct page" if the returned pfn
         * is for a refcounted or pinned struct page, NULL if the returned pfn
         * has no struct page or if the struct page is not being refcounted
         * (e.g. tail pages of non-compound higher order allocations from
         * IO/PFNMAP mappings).
         */
        struct page **refcounted_page;
};

/*
 * Takes a struct kvm_follow_pfn describing the lookup and returns a kvm_pfn_t.
 * NOTE(review): error encoding of the return value is not visible here.
 */
kvm_pfn_t hva_to_pfn(struct kvm_follow_pfn *kfp);

/*
 * gfn_to_pfn_cache_invalidate_start() is only implemented when
 * CONFIG_HAVE_KVM_PFNCACHE is set; otherwise it compiles to an empty inline
 * so callers need no #ifdef.
 */
#ifdef CONFIG_HAVE_KVM_PFNCACHE
void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
                                       unsigned long start,
                                       unsigned long end);
#else
static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
                                                     unsigned long start,
                                                     unsigned long end)
{
}
#endif /* CONFIG_HAVE_KVM_PFNCACHE */

/*
 * guest_memfd ("gmem") entry points.  Without CONFIG_KVM_PRIVATE_MEM the
 * init hook is a no-op, while bind/unbind warn once because they are not
 * expected to be reached; bind additionally fails with -EIO.
 *
 * NOTE(review): there is no stub for kvm_gmem_create() in the #else branch;
 * presumably its callers are themselves guarded by the config option - confirm.
 */
#ifdef CONFIG_KVM_PRIVATE_MEM
void kvm_gmem_init(struct module *module);
int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
                  unsigned int fd, loff_t offset);
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
#else
/* Nothing to set up when guest_memfd support is compiled out. */
static inline void kvm_gmem_init(struct module *module)
{

}

/* Binding cannot succeed without guest_memfd support: warn and fail. */
static inline int kvm_gmem_bind(struct kvm *kvm,
                                         struct kvm_memory_slot *slot,
                                         unsigned int fd, loff_t offset)
{
        WARN_ON_ONCE(1);
        return -EIO;
}

/* Likewise, there can be nothing to unbind: warn once. */
static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
{
        WARN_ON_ONCE(1);
}
#endif /* CONFIG_KVM_PRIVATE_MEM */

#endif /* __KVM_MM_H__ */











    4 






    4 






























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
// SPDX-License-Identifier: GPL-2.0
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/netpoll.h>
#include <linux/export.h>
#include <net/gro.h>
#include "vlan.h"

/*
 * vlan_do_receive - redirect a tagged skb to the matching VLAN device
 * @skbp: in/out pointer to the skb; may be replaced, or set to NULL
 *
 * Looks up the VLAN device for the skb's vlan_proto / VLAN id on skb->dev.
 * On success the skb is retargeted at that device, its priority is derived
 * from the tag, the tag is cleared and the device's per-cpu rx statistics
 * are updated.
 *
 * Returns true when the skb now belongs to a VLAN device.  Returns false
 * when no VLAN device matched (skb left untouched) or when the skb was lost
 * on the way, in which case *skbp is NULL.
 */
bool vlan_do_receive(struct sk_buff **skbp)
{
        struct sk_buff *skb = *skbp;
        __be16 vlan_proto = skb->vlan_proto;
        u16 vlan_id = skb_vlan_tag_get_id(skb);
        struct net_device *vlan_dev;
        struct vlan_pcpu_stats *rx_stats;

        vlan_dev = vlan_find_dev(skb->dev, vlan_proto, vlan_id);
        if (!vlan_dev)
                return false;

        /* The result replaces both skb and *skbp; NULL leaves nothing to do. */
        skb = *skbp = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                return false;

        /* Drop traffic for a VLAN device that is not up. */
        if (unlikely(!(vlan_dev->flags & IFF_UP))) {
                kfree_skb(skb);
                *skbp = NULL;
                return false;
        }

        skb->dev = vlan_dev;
        if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) {
                /* Our lower layer thinks this is not local, let's make sure.
                 * This allows the VLAN to have a different MAC than the
                 * underlying device, and still route correctly. */
                if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, vlan_dev->dev_addr))
                        skb->pkt_type = PACKET_HOST;
        }

        /*
         * With header reordering disabled (and the VLAN device not being a
         * macvlan or bridge port) the tag is put back into the packet data.
         */
        if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) &&
            !netif_is_macvlan_port(vlan_dev) &&
            !netif_is_bridge_port(vlan_dev)) {
                unsigned int offset = skb->data - skb_mac_header(skb);

                /*
                 * vlan_insert_tag expects skb->data pointing to mac header.
                 * So change skb->data before calling it and change back to
                 * original position later
                 */
                skb_push(skb, offset);
                skb = *skbp = vlan_insert_inner_tag(skb, skb->vlan_proto,
                                                    skb->vlan_tci, skb->mac_len);
                if (!skb)
                        return false;
                skb_pull(skb, offset + VLAN_HLEN);
                skb_reset_mac_len(skb);
        }

        /* Priority must be taken from vlan_tci before the tag is cleared. */
        skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
        __vlan_hwaccel_clear_tag(skb);

        rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);

        /* Account the packet on this CPU's counters of the VLAN device. */
        u64_stats_update_begin(&rx_stats->syncp);
        u64_stats_inc(&rx_stats->rx_packets);
        u64_stats_add(&rx_stats->rx_bytes, skb->len);
        if (skb->pkt_type == PACKET_MULTICAST)
                u64_stats_inc(&rx_stats->rx_multicast);
        u64_stats_update_end(&rx_stats->syncp);

        return true;
}

/*
 * Find the VLAN device for (vlan_proto, vlan_id) on @dev, climbing to master
 * upper devices as needed.  Returns NULL when none is found.
 *
 * Must be invoked with rcu_read_lock.
 */
struct net_device *__vlan_find_dev_deep_rcu(struct net_device *dev,
                                        __be16 vlan_proto, u16 vlan_id)
{
        for (;;) {
                struct vlan_info *vlan_info = rcu_dereference(dev->vlan_info);

                if (vlan_info)
                        return vlan_group_get_device(&vlan_info->grp,
                                                     vlan_proto, vlan_id);

                /*
                 * Lower devices of master uppers (bonding, team) do not have
                 * grp assigned to themselves. Grp is assigned to upper device
                 * instead, so retry the lookup one level up.
                 */
                dev = netdev_master_upper_dev_get_rcu(dev);
                if (!dev)
                        return NULL;
        }
}
EXPORT_SYMBOL(__vlan_find_dev_deep_rcu);

/*
 * Follow the real_dev chain starting at the VLAN device @dev until a device
 * that is not itself a VLAN device is reached, and return that device.
 */
struct net_device *vlan_dev_real_dev(const struct net_device *dev)
{
        struct net_device *lower;

        for (lower = vlan_dev_priv(dev)->real_dev;
             is_vlan_dev(lower);
             lower = vlan_dev_priv(lower)->real_dev)
                ;

        return lower;
}
EXPORT_SYMBOL(vlan_dev_real_dev);

u16 vlan_dev_vlan_id(const struct net_device *dev)
{
        return vlan_dev_priv(dev)->vlan_id;
}
EXPORT_SYMBOL(vlan_dev_vlan_id);

__be16 vlan_dev_vlan_proto(const struct net_device *dev)
{
        return vlan_dev_priv(dev)->vlan_proto;
}
EXPORT_SYMBOL(vlan_dev_vlan_proto);

/*
 * vlan info and vid list
 */

/* Free every per-protocol, per-part device array held by @grp. */
static void vlan_group_free(struct vlan_group *grp)
{
        int proto_idx;

        for (proto_idx = 0; proto_idx < VLAN_PROTO_NUM; proto_idx++) {
                int part;

                for (part = 0; part < VLAN_GROUP_ARRAY_SPLIT_PARTS; part++)
                        kfree(grp->vlan_devices_arrays[proto_idx][part]);
        }
}

/* Release the embedded group's arrays, then the vlan_info itself. */
static void vlan_info_free(struct vlan_info *vlan_info)
{
        struct vlan_group *grp = &vlan_info->grp;

        vlan_group_free(grp);
        kfree(vlan_info);
}

/* RCU callback: recover the vlan_info embedding @rcu and free it. */
static void vlan_info_rcu_free(struct rcu_head *rcu)
{
        struct vlan_info *vlan_info = container_of(rcu, struct vlan_info, rcu);

        vlan_info_free(vlan_info);
}

/*
 * Allocate a zeroed vlan_info bound to @dev with an empty vid list.
 * Returns NULL if the allocation fails.
 */
static struct vlan_info *vlan_info_alloc(struct net_device *dev)
{
        struct vlan_info *info = kzalloc(sizeof(*info), GFP_KERNEL);

        if (info) {
                info->real_dev = dev;
                INIT_LIST_HEAD(&info->vid_list);
        }

        return info;
}

/*
 * One entry of a vlan_info's vid_list: a (proto, vid) pair together with a
 * count of how many times it has been added.
 */
struct vlan_vid_info {
        struct list_head list;  /* link in vlan_info->vid_list */
        __be16 proto;           /* VLAN protocol, compared against htons(ETH_P_*) */
        u16 vid;                /* VLAN id */
        int refcount;           /* bumped by vlan_vid_add(), dropped by vlan_vid_del() */
};

/*
 * Tell whether @dev advertises the VLAN filter feature matching @proto
 * (CTAG filter for 802.1Q, STAG filter for 802.1AD).  Any other protocol
 * yields false.
 */
static bool vlan_hw_filter_capable(const struct net_device *dev, __be16 proto)
{
        if (proto == htons(ETH_P_8021Q))
                return (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER) != 0;

        if (proto == htons(ETH_P_8021AD))
                return (dev->features & NETIF_F_HW_VLAN_STAG_FILTER) != 0;

        return false;
}

/* Return the vid_list entry matching (proto, vid), or NULL if there is none. */
static struct vlan_vid_info *vlan_vid_info_get(struct vlan_info *vlan_info,
                                               __be16 proto, u16 vid)
{
        struct vlan_vid_info *cur;

        list_for_each_entry(cur, &vlan_info->vid_list, list) {
                if (cur->proto != proto)
                        continue;
                if (cur->vid == vid)
                        return cur;
        }

        return NULL;
}

/*
 * Allocate a zeroed vid entry for (proto, vid).  The refcount starts at zero;
 * returns NULL if the allocation fails.
 */
static struct vlan_vid_info *vlan_vid_info_alloc(__be16 proto, u16 vid)
{
        struct vlan_vid_info *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

        if (entry) {
                entry->proto = proto;
                entry->vid = vid;
        }

        return entry;
}

/*
 * Ask @dev's driver to add (proto, vid) to its rx filter.
 * Returns 0 when the device has no matching filter feature, -ENODEV when the
 * device is not present, otherwise the driver callback's result.
 */
static int vlan_add_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
{
        if (!vlan_hw_filter_capable(dev, proto))
                return 0;

        if (!netif_device_present(dev))
                return -ENODEV;

        return dev->netdev_ops->ndo_vlan_rx_add_vid(dev, proto, vid);
}

/*
 * Ask @dev's driver to remove (proto, vid) from its rx filter.
 * Returns 0 when the device has no matching filter feature, -ENODEV when the
 * device is not present, otherwise the driver callback's result.
 */
static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vid)
{
        if (!vlan_hw_filter_capable(dev, proto))
                return 0;

        if (!netif_device_present(dev))
                return -ENODEV;

        return dev->netdev_ops->ndo_vlan_rx_kill_vid(dev, proto, vid);
}

/*
 * Invoke @action once per entry of @dev's vid list, passing the device found
 * in the group for that entry, the vid and @arg.  Iteration stops at the
 * first non-zero return value, which is handed back to the caller; 0 is
 * returned otherwise, including when @dev has no vlan_info.
 *
 * The caller must hold RTNL.
 */
int vlan_for_each(struct net_device *dev,
                  int (*action)(struct net_device *dev, int vid, void *arg),
                  void *arg)
{
        struct vlan_info *vlan_info;
        struct vlan_vid_info *cur;

        ASSERT_RTNL();

        vlan_info = rtnl_dereference(dev->vlan_info);
        if (!vlan_info)
                return 0;

        list_for_each_entry(cur, &vlan_info->vid_list, list) {
                struct net_device *vdev;
                int rc;

                vdev = vlan_group_get_device(&vlan_info->grp, cur->proto,
                                             cur->vid);
                rc = action(vdev, cur->vid, arg);
                if (rc)
                        return rc;
        }

        return 0;
}
EXPORT_SYMBOL(vlan_for_each);

/*
 * Add every vid of protocol @proto from @vlan_info's list to the real
 * device's rx filter.  If one addition fails, the entries visited before it
 * are removed again and the error is returned.
 */
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
{
        struct net_device *lower = vlan_info->real_dev;
        struct vlan_vid_info *cur;
        int rc;

        list_for_each_entry(cur, &vlan_info->vid_list, list) {
                if (cur->proto != proto)
                        continue;

                rc = vlan_add_rx_filter_info(lower, proto, cur->vid);
                if (rc)
                        goto rollback;
        }

        return 0;

rollback:
        /* Walk back from the entry preceding the one that failed. */
        list_for_each_entry_continue_reverse(cur, &vlan_info->vid_list, list) {
                if (cur->proto != proto)
                        continue;

                vlan_kill_rx_filter_info(lower, proto, cur->vid);
        }

        return rc;
}
EXPORT_SYMBOL(vlan_filter_push_vids);

/*
 * Remove every vid of protocol @proto in @vlan_info's list from the real
 * device's rx filter.  Failures of individual removals are ignored.
 */
void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto)
{
        struct vlan_vid_info *cur;

        list_for_each_entry(cur, &vlan_info->vid_list, list) {
                if (cur->proto != proto)
                        continue;

                vlan_kill_rx_filter_info(vlan_info->real_dev,
                                         cur->proto, cur->vid);
        }
}
EXPORT_SYMBOL(vlan_filter_drop_vids);

/*
 * Create a new vid entry for (proto, vid), program it into the real device's
 * rx filter and link it into @vlan_info's list.  On success the new entry is
 * stored in *@pvid_info and 0 is returned; on failure nothing is left behind
 * and a negative errno is returned.
 */
static int __vlan_vid_add(struct vlan_info *vlan_info, __be16 proto, u16 vid,
                          struct vlan_vid_info **pvid_info)
{
        struct vlan_vid_info *entry;
        int rc;

        entry = vlan_vid_info_alloc(proto, vid);
        if (!entry)
                return -ENOMEM;

        rc = vlan_add_rx_filter_info(vlan_info->real_dev, proto, vid);
        if (rc) {
                kfree(entry);
                return rc;
        }

        list_add(&entry->list, &vlan_info->vid_list);
        vlan_info->nr_vids++;
        *pvid_info = entry;

        return 0;
}

/*
 * Take a reference on (proto, vid) for @dev, creating the vid entry - and the
 * device's vlan_info, if it has none yet - on first use.  A freshly created
 * vlan_info is only published on dev->vlan_info once the vid was added.
 *
 * The caller must hold RTNL.  Returns 0 or a negative errno.
 */
int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid)
{
        struct vlan_vid_info *vid_info;
        struct vlan_info *vlan_info;
        bool fresh_info = false;
        int err;

        ASSERT_RTNL();

        vlan_info = rtnl_dereference(dev->vlan_info);
        if (!vlan_info) {
                vlan_info = vlan_info_alloc(dev);
                if (!vlan_info)
                        return -ENOMEM;
                fresh_info = true;
        }

        vid_info = vlan_vid_info_get(vlan_info, proto, vid);
        if (!vid_info) {
                err = __vlan_vid_add(vlan_info, proto, vid, &vid_info);
                if (err) {
                        /* Never published, so a plain kfree() is enough. */
                        if (fresh_info)
                                kfree(vlan_info);
                        return err;
                }
        }
        vid_info->refcount++;

        if (fresh_info)
                rcu_assign_pointer(dev->vlan_info, vlan_info);

        return 0;
}
EXPORT_SYMBOL(vlan_vid_add);

/*
 * Remove @vid_info from the real device's rx filter, unlink it from
 * @vlan_info's list and free it.  A filter removal failure is only logged,
 * and not even that while the device is unregistering.
 */
static void __vlan_vid_del(struct vlan_info *vlan_info,
                           struct vlan_vid_info *vid_info)
{
        struct net_device *real_dev = vlan_info->real_dev;
        const __be16 proto = vid_info->proto;
        const u16 vid = vid_info->vid;

        if (vlan_kill_rx_filter_info(real_dev, proto, vid) &&
            real_dev->reg_state != NETREG_UNREGISTERING)
                netdev_warn(real_dev, "failed to kill vid %04x/%d\n", proto, vid);

        list_del(&vid_info->list);
        kfree(vid_info);
        vlan_info->nr_vids--;
}

/*
 * Drop one reference on (proto, vid) for @dev.  When the last reference goes
 * away the vid entry is deleted, and when that was the last vid the device's
 * vlan_info is unpublished and freed via RCU.
 *
 * Unknown vids and devices without vlan_info are silently ignored.
 * The caller must hold RTNL.
 */
void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid)
{
        struct vlan_vid_info *vid_info;
        struct vlan_info *vlan_info;

        ASSERT_RTNL();

        vlan_info = rtnl_dereference(dev->vlan_info);
        if (!vlan_info)
                return;

        vid_info = vlan_vid_info_get(vlan_info, proto, vid);
        if (!vid_info)
                return;

        if (--vid_info->refcount != 0)
                return;

        __vlan_vid_del(vlan_info, vid_info);
        if (vlan_info->nr_vids != 0)
                return;

        RCU_INIT_POINTER(dev->vlan_info, NULL);
        call_rcu(&vlan_info->rcu, vlan_info_rcu_free);
}
EXPORT_SYMBOL(vlan_vid_del);

/*
 * Add to @dev every vid from @by_dev's list whose protocol @by_dev can filter
 * in hardware.  If one addition fails, the vids added before it are deleted
 * again and the error is returned.
 *
 * The caller must hold RTNL.  Returns 0 when @by_dev has no vlan_info.
 */
int vlan_vids_add_by_dev(struct net_device *dev,
                         const struct net_device *by_dev)
{
        struct vlan_info *src_info;
        struct vlan_vid_info *cur;
        int rc;

        ASSERT_RTNL();

        src_info = rtnl_dereference(by_dev->vlan_info);
        if (!src_info)
                return 0;

        list_for_each_entry(cur, &src_info->vid_list, list) {
                if (vlan_hw_filter_capable(by_dev, cur->proto)) {
                        rc = vlan_vid_add(dev, cur->proto, cur->vid);
                        if (rc)
                                goto rollback;
                }
        }

        return 0;

rollback:
        /* Undo the entries preceding the one that failed, newest first. */
        list_for_each_entry_continue_reverse(cur, &src_info->vid_list, list) {
                if (vlan_hw_filter_capable(by_dev, cur->proto))
                        vlan_vid_del(dev, cur->proto, cur->vid);
        }

        return rc;
}
EXPORT_SYMBOL(vlan_vids_add_by_dev);

/*
 * Delete from @dev every vid from @by_dev's list whose protocol @by_dev can
 * filter in hardware.  Does nothing when @by_dev has no vlan_info.
 *
 * The caller must hold RTNL.
 */
void vlan_vids_del_by_dev(struct net_device *dev,
                          const struct net_device *by_dev)
{
        struct vlan_info *src_info;
        struct vlan_vid_info *cur;

        ASSERT_RTNL();

        src_info = rtnl_dereference(by_dev->vlan_info);
        if (!src_info)
                return;

        list_for_each_entry(cur, &src_info->vid_list, list) {
                if (vlan_hw_filter_capable(by_dev, cur->proto))
                        vlan_vid_del(dev, cur->proto, cur->vid);
        }
}
EXPORT_SYMBOL(vlan_vids_del_by_dev);

/*
 * Report whether @dev has a vlan_info whose group holds at least one VLAN
 * device.  The caller must hold RTNL.
 */
bool vlan_uses_dev(const struct net_device *dev)
{
        struct vlan_info *info;

        ASSERT_RTNL();

        info = rtnl_dereference(dev->vlan_info);
        if (!info)
                return false;

        return info->grp.nr_vlan_devs != 0;
}
EXPORT_SYMBOL(vlan_uses_dev);

/*
 * GRO receive callback for VLAN-tagged frames.
 * @head: list of skbs currently held for merging
 * @skb:  the newly received skb, with its GRO offset at the VLAN header
 *
 * Reads the VLAN header, unmarks held packets whose VLAN header differs as
 * same-flow candidates, then pulls the header and passes the skb on to the
 * GRO receive handler of the encapsulated protocol.
 *
 * Returns whatever the inner handler returned, or NULL when the header could
 * not be read or no handler exists for the encapsulated protocol; in those
 * two cases skb_gro_flush_final() is called with flush still set to 1.
 */
static struct sk_buff *vlan_gro_receive(struct list_head *head,
                                        struct sk_buff *skb)
{
        const struct packet_offload *ptype;
        unsigned int hlen, off_vlan;
        struct sk_buff *pp = NULL;
        struct vlan_hdr *vhdr;
        struct sk_buff *p;
        __be16 type;
        int flush = 1;

        /* hlen is the offset just past the VLAN header. */
        off_vlan = skb_gro_offset(skb);
        hlen = off_vlan + sizeof(*vhdr);
        vhdr = skb_gro_header(skb, hlen, off_vlan);
        if (unlikely(!vhdr))
                goto out;

        /* Record where the encapsulated header starts for this encap level. */
        NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;

        type = vhdr->h_vlan_encapsulated_proto;

        ptype = gro_find_receive_by_type(type);
        if (!ptype)
                goto out;

        /* An inner handler exists, so the early-exit flush no longer applies. */
        flush = 0;

        list_for_each_entry(p, head, list) {
                struct vlan_hdr *vhdr2;

                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                /* A differing VLAN header means a different flow. */
                vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
                if (compare_vlan_header(vhdr, vhdr2))
                        NAPI_GRO_CB(p)->same_flow = 0;
        }

        skb_gro_pull(skb, sizeof(*vhdr));
        skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));

        pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
                                            ipv6_gro_receive, inet_gro_receive,
                                            head, skb);

out:
        skb_gro_flush_final(skb, pp, flush);

        return pp;
}

/*
 * GRO complete callback for VLAN-tagged frames: hand over to the completion
 * handler of the protocol encapsulated in the VLAN header found at @nhoff,
 * with the offset advanced past that header.
 *
 * Returns the inner handler's result, or -ENOENT if no handler exists.
 */
static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
{
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
        struct packet_offload *ptype;

        ptype = gro_find_complete_by_type(vhdr->h_vlan_encapsulated_proto);
        if (!ptype)
                return -ENOENT;

        return INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
                                  ipv6_gro_complete, inet_gro_complete,
                                  skb, nhoff + sizeof(*vhdr));
}

/*
 * Offload descriptors registered by vlan_offload_init(): the 802.1Q and
 * 802.1AD ethertypes share the same GRO callbacks and the same priority.
 */
static struct packet_offload vlan_packet_offloads[] __read_mostly = {
        {
                .type = cpu_to_be16(ETH_P_8021Q),
                .priority = 10,
                .callbacks = {
                        .gro_receive = vlan_gro_receive,
                        .gro_complete = vlan_gro_complete,
                },
        },
        {
                .type = cpu_to_be16(ETH_P_8021AD),
                .priority = 10,
                .callbacks = {
                        .gro_receive = vlan_gro_receive,
                        .gro_complete = vlan_gro_complete,
                },
        },
};

/* Register every entry of vlan_packet_offloads[]; always returns 0. */
static int __init vlan_offload_init(void)
{
        struct packet_offload *po = vlan_packet_offloads;
        struct packet_offload *end = po + ARRAY_SIZE(vlan_packet_offloads);

        for (; po != end; po++)
                dev_add_offload(po);

        return 0;
}

fs_initcall(vlan_offload_init);






















































































































































































































   21 





  274 






















  262 


  262 





























































































  246 





  246 










  275 









  274 

    1 














  274 







  274 
    2 




































  275 








    3 

























  274 







  275 















































































































  244 


























  244 




















































































































   23 


















































   23 
   23 
   23 

















   23 



   23 

































































































  246 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
// SPDX-License-Identifier: GPL-2.0
/*
 *  inode.c - part of debugfs, a tiny little debug file system
 *
 *  Copyright (C) 2004,2019 Greg Kroah-Hartman <greg@kroah.com>
 *  Copyright (C) 2004 IBM Inc.
 *  Copyright (C) 2019 Linux Foundation <gregkh@linuxfoundation.org>
 *
 *  debugfs is for people to use instead of /proc or /sys.
 *  See ./Documentation/core-api/kernel-api.rst for more details.
 */

#define pr_fmt(fmt)        "debugfs: " fmt

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/kobject.h>
#include <linux/namei.h>
#include <linux/debugfs.h>
#include <linux/fsnotify.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/security.h>

#include "internal.h"

/* Octal 0700: read/write/execute for the owner only. */
#define DEBUGFS_DEFAULT_MODE        0700

/*
 * File-scope state.
 * NOTE(review): the users of these variables lie outside this excerpt;
 * presumably debugfs_mount/debugfs_mount_count track the internal mount and
 * its pin count, and debugfs_registered records filesystem registration -
 * confirm against the rest of the file.
 */
static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
/* Initialised from DEFAULT_DEBUGFS_ALLOW_BITS; read-only after init. */
static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS;

/*
 * Refuse changes to mode, uid or gid while the kernel is locked down, so the
 * file mode stays usable as part of the heuristic that decides whether to
 * lock down individual files.  All other attribute changes, and all changes
 * when not locked down, go to simple_setattr().
 */
static int debugfs_setattr(struct mnt_idmap *idmap,
                           struct dentry *dentry, struct iattr *ia)
{
        const unsigned int guarded = ATTR_MODE | ATTR_UID | ATTR_GID;

        if (ia->ia_valid & guarded) {
                int denied = security_locked_down(LOCKDOWN_DEBUGFS);

                if (denied)
                        return denied;
        }

        return simple_setattr(&nop_mnt_idmap, dentry, ia);
}

/* Inode operations for regular files: only setattr is overridden. */
static const struct inode_operations debugfs_file_inode_operations = {
        .setattr        = debugfs_setattr,
};
/* Inode operations for directories: simple_lookup plus the guarded setattr. */
static const struct inode_operations debugfs_dir_inode_operations = {
        .lookup                = simple_lookup,
        .setattr        = debugfs_setattr,
};
/* Inode operations for symlinks: simple_get_link plus the guarded setattr. */
static const struct inode_operations debugfs_symlink_inode_operations = {
        .get_link        = simple_get_link,
        .setattr        = debugfs_setattr,
};

static struct inode *debugfs_get_inode(struct super_block *sb)
{
        struct inode *inode = new_inode(sb);
        if (inode) {
                inode->i_ino = get_next_ino();
                simple_inode_init_ts(inode);
        }
        return inode;
}

/* Mount options of a debugfs superblock; stored in ->s_fs_info. */
struct debugfs_fs_info {
        /* Owner, group and permission bits applied to the root inode. */
        kuid_t uid;
        kgid_t gid;
        umode_t mode;
        /* Bitmask of BIT(Opt_*) for every option that was explicitly parsed. */
        unsigned int opts;
};

/*
 * Mount option identifiers returned by fs_parse(); each value is also
 * used as a bit position in debugfs_fs_info.opts.
 */
enum {
        Opt_uid,
        Opt_gid,
        Opt_mode,
        Opt_source,
};

/* Mount parameters recognised by debugfs_parse_param(). */
static const struct fs_parameter_spec debugfs_param_specs[] = {
        fsparam_gid        ("gid",                Opt_gid),
        fsparam_u32oct        ("mode",        Opt_mode),
        fsparam_uid        ("uid",                Opt_uid),
        fsparam_string        ("source",        Opt_source),
        {}
};

static int debugfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
        struct debugfs_fs_info *opts = fc->s_fs_info;
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, debugfs_param_specs, param, &result);
        if (opt < 0) {
                /*
                * We might like to report bad mount options here; but
                * traditionally debugfs has ignored all mount options
                */
                if (opt == -ENOPARAM)
                        return 0;

                return opt;
        }

        switch (opt) {
        case Opt_uid:
                opts->uid = result.uid;
                break;
        case Opt_gid:
                opts->gid = result.gid;
                break;
        case Opt_mode:
                opts->mode = result.uint_32 & S_IALLUGO;
                break;
        case Opt_source:
                if (fc->source)
                        return invalfc(fc, "Multiple sources specified");
                fc->source = param->string;
                param->string = NULL;
                break;
        /*
         * We might like to report bad mount options here;
         * but traditionally debugfs has ignored all mount options
         */
        }

        opts->opts |= BIT(opt);

        return 0;
}

/*
 * Copy mode/uid/gid from the superblock's debugfs_fs_info onto the root
 * inode.  For an initial mount (@remount == false) all three are applied;
 * on remount only the ones that were given as mount options are reset.
 */
static void _debugfs_apply_options(struct super_block *sb, bool remount)
{
        struct debugfs_fs_info *fsi = sb->s_fs_info;
        struct inode *root = d_inode(sb->s_root);
        const bool initial = !remount;

        if (initial || (fsi->opts & BIT(Opt_mode))) {
                /* Replace the permission bits, keep the file type bits. */
                root->i_mode &= ~S_IALLUGO;
                root->i_mode |= fsi->mode;
        }

        if (initial || (fsi->opts & BIT(Opt_uid)))
                root->i_uid = fsi->uid;

        if (initial || (fsi->opts & BIT(Opt_gid)))
                root->i_gid = fsi->gid;
}

/* Initial mount: apply mode, uid and gid to the root inode unconditionally. */
static void debugfs_apply_options(struct super_block *sb)
{
        _debugfs_apply_options(sb, false);
}

/* Remount: apply only the options that were explicitly specified. */
static void debugfs_apply_options_remount(struct super_block *sb)
{
        _debugfs_apply_options(sb, true);
}

/*
 * ->reconfigure: replace the options stored on the superblock with the ones
 * parsed into @fc, then re-apply those that were explicitly specified to
 * the root inode.  Always returns 0.
 */
static int debugfs_reconfigure(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        struct debugfs_fs_info *current_opts = sb->s_fs_info;
        struct debugfs_fs_info *parsed_opts = fc->s_fs_info;

        sync_filesystem(sb);

        /* Whole-structure assignment, ->opts bitmask included. */
        *current_opts = *parsed_opts;

        debugfs_apply_options_remount(sb);
        return 0;
}

static int debugfs_show_options(struct seq_file *m, struct dentry *root)
{
        struct debugfs_fs_info *fsi = root->d_sb->s_fs_info;

        if (!uid_eq(fsi->uid, GLOBAL_ROOT_UID))
                seq_printf(m, ",uid=%u",
                           from_kuid_munged(&init_user_ns, fsi->uid));
        if (!gid_eq(fsi->gid, GLOBAL_ROOT_GID))
                seq_printf(m, ",gid=%u",
                           from_kgid_munged(&init_user_ns, fsi->gid));
        if (fsi->mode != DEBUGFS_DEFAULT_MODE)
                seq_printf(m, ",mode=%o", fsi->mode);

        return 0;
}

/* Slab cache for struct debugfs_inode_info; created in debugfs_init(). */
static struct kmem_cache *debugfs_inode_cachep __ro_after_init;

/*
 * Constructor passed to kmem_cache_create(): performs the one-time
 * initialisation of the VFS inode embedded in a debugfs_inode_info.
 */
static void init_once(void *foo)
{
        struct debugfs_inode_info *dinfo = foo;

        inode_init_once(&dinfo->vfs_inode);
}

/*
 * ->alloc_inode: allocate a debugfs_inode_info from the debugfs inode cache
 * and return its embedded VFS inode, or NULL if the allocation fails.
 */
static struct inode *debugfs_alloc_inode(struct super_block *sb)
{
        struct debugfs_inode_info *dinfo =
                alloc_inode_sb(sb, debugfs_inode_cachep, GFP_KERNEL);

        if (dinfo)
                return &dinfo->vfs_inode;

        return NULL;
}

static void debugfs_free_inode(struct inode *inode)
{
        if (S_ISLNK(inode->i_mode))
                kfree(inode->i_link);
        kmem_cache_free(debugfs_inode_cachep, DEBUGFS_I(inode));
}

/* Superblock operations installed by debugfs_fill_super(). */
static const struct super_operations debugfs_super_operations = {
        .statfs                = simple_statfs,
        .show_options        = debugfs_show_options,
        .alloc_inode        = debugfs_alloc_inode,
        .free_inode        = debugfs_free_inode,
};

/*
 * ->d_release: free the debugfs_fsdata attached to @dentry, if there is one.
 * Warns if cancellations are still queued on it at this point.
 */
static void debugfs_release_dentry(struct dentry *dentry)
{
        struct debugfs_fsdata *fsd = dentry->d_fsdata;

        if (!fsd)
                return;

        WARN_ON(!list_empty(&fsd->cancellations));
        mutex_destroy(&fsd->cancellations_mtx);
        kfree(fsd);
}

static struct vfsmount *debugfs_automount(struct path *path)
{
        struct inode *inode = path->dentry->d_inode;

        return DEBUGFS_I(inode)->automount(path->dentry, inode->i_private);
}

/* Dentry operations installed on the superblock by debugfs_fill_super(). */
static const struct dentry_operations debugfs_dops = {
        .d_delete = always_delete_dentry,
        .d_release = debugfs_release_dentry,
        .d_automount = debugfs_automount,
};

/*
 * Populate a new debugfs superblock: build the root through
 * simple_fill_super() with no predefined files, install the debugfs
 * super/dentry operations and apply the mount options to the root inode.
 *
 * Returns 0 on success or the error from simple_fill_super().
 */
static int debugfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        static const struct tree_descr no_files[] = {{""}};
        int ret = simple_fill_super(sb, DEBUGFS_MAGIC, no_files);

        if (ret == 0) {
                sb->s_op = &debugfs_super_operations;
                sb->s_d_op = &debugfs_dops;

                debugfs_apply_options(sb);
        }

        return ret;
}

static int debugfs_get_tree(struct fs_context *fc)
{
        if (!(debugfs_allow & DEBUGFS_ALLOW_API))
                return -EPERM;

        return get_tree_single(fc, debugfs_fill_super);
}

/*
 * ->free: release the debugfs_fs_info that debugfs_init_fs_context()
 * attached to @fc (kfree() of a NULL pointer is a no-op).
 */
static void debugfs_free_fc(struct fs_context *fc)
{
        kfree(fc->s_fs_info);
}

/* fs_context callbacks installed by debugfs_init_fs_context(). */
static const struct fs_context_operations debugfs_context_ops = {
        .free                = debugfs_free_fc,
        .parse_param        = debugfs_parse_param,
        .get_tree        = debugfs_get_tree,
        .reconfigure        = debugfs_reconfigure,
};

/*
 * ->init_fs_context: allocate a zeroed debugfs_fs_info with the default
 * mode, attach it to @fc and install the debugfs context operations.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int debugfs_init_fs_context(struct fs_context *fc)
{
        struct debugfs_fs_info *fsi = kzalloc(sizeof(*fsi), GFP_KERNEL);

        if (!fsi)
                return -ENOMEM;

        fsi->mode = DEBUGFS_DEFAULT_MODE;

        fc->ops = &debugfs_context_ops;
        fc->s_fs_info = fsi;

        return 0;
}

/*
 * The "debugfs" filesystem type: registered in debugfs_init() and pinned
 * through simple_pin_fs() whenever an entry is created.
 */
static struct file_system_type debug_fs_type = {
        .owner =        THIS_MODULE,
        .name =                "debugfs",
        .init_fs_context = debugfs_init_fs_context,
        .parameters =        debugfs_param_specs,
        .kill_sb =        kill_litter_super,
};
MODULE_ALIAS_FS("debugfs");

/**
 * debugfs_lookup() - look up an existing debugfs file
 * @name: a pointer to a string containing the name of the file to look up.
 * @parent: a pointer to the parent dentry of the file.  If this is %NULL,
 *          the lookup is done in the root of the debugfs filesystem.
 *
 * Return: a pointer to the dentry on success.  %NULL is returned if debugfs
 * is not initialized, if @name is %NULL or an error pointer, if @parent is
 * an error pointer, if the file doesn't exist or if the lookup fails.  The
 * returned dentry must be passed to dput() when it is no longer needed.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
{
        struct dentry *found;

        if (!debugfs_initialized())
                return NULL;
        if (IS_ERR_OR_NULL(name) || IS_ERR(parent))
                return NULL;

        /* No parent given: search the root of the debugfs mount. */
        if (!parent)
                parent = debugfs_mount->mnt_root;

        found = lookup_positive_unlocked(name, parent, strlen(name));

        return IS_ERR(found) ? NULL : found;
}
EXPORT_SYMBOL_GPL(debugfs_lookup);

/*
 * Common first half of every debugfs_create_*() helper.
 *
 * Pins the debugfs filesystem, locks the parent directory inode and looks
 * up a dentry for @name in @parent (the debugfs root if @parent is NULL).
 *
 * On success the returned dentry is negative, the parent inode is still
 * locked and the filesystem is still pinned; the caller finishes with
 * end_creating() or failed_creating().  On failure an ERR_PTR() is returned
 * with the lock and the pin already dropped:
 *   -EPERM   the debugfs API is disabled,
 *   -ENOENT  debugfs is not initialized or the parent is a dead directory,
 *   -EEXIST  an entry with that name already exists,
 *   or @parent itself if it is an error pointer, or the error from
 *   simple_pin_fs()/lookup_one_len().
 */
static struct dentry *start_creating(const char *name, struct dentry *parent)
{
        struct dentry *dentry;
        struct inode *dir;
        int error;

        if (!(debugfs_allow & DEBUGFS_ALLOW_API))
                return ERR_PTR(-EPERM);

        if (!debugfs_initialized())
                return ERR_PTR(-ENOENT);

        pr_debug("creating file '%s'\n", name);

        if (IS_ERR(parent))
                return parent;

        error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
                              &debugfs_mount_count);
        if (error) {
                pr_err("Unable to pin filesystem for file '%s'\n", name);
                return ERR_PTR(error);
        }

        /*
         * Without an explicit parent the entry goes into the root, whose
         * dentry is reachable through the vfsmount that was just pinned.
         */
        if (!parent)
                parent = debugfs_mount->mnt_root;

        dir = d_inode(parent);
        inode_lock(dir);

        if (unlikely(IS_DEADDIR(dir))) {
                dentry = ERR_PTR(-ENOENT);
        } else {
                dentry = lookup_one_len(name, parent, strlen(name));
                if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
                        if (d_is_dir(dentry))
                                pr_err("Directory '%s' with parent '%s' already present!\n",
                                       name, parent->d_name.name);
                        else
                                pr_err("File '%s' in directory '%s' already present!\n",
                                       name, parent->d_name.name);
                        dput(dentry);
                        dentry = ERR_PTR(-EEXIST);
                }
        }

        /* Success: keep the parent locked and the filesystem pinned. */
        if (!IS_ERR(dentry))
                return dentry;

        inode_unlock(dir);
        simple_release_fs(&debugfs_mount, &debugfs_mount_count);
        return dentry;
}

/*
 * Undo a successful start_creating(): unlock the parent directory, drop the
 * dentry reference and release the filesystem pin.  Always returns
 * ERR_PTR(-ENOMEM) so callers can return the result directly.
 */
static struct dentry *failed_creating(struct dentry *dentry)
{
        struct inode *dir = d_inode(dentry->d_parent);

        inode_unlock(dir);
        dput(dentry);
        simple_release_fs(&debugfs_mount, &debugfs_mount_count);

        return ERR_PTR(-ENOMEM);
}

/*
 * Finish a successful creation: unlock the parent directory that
 * start_creating() locked and hand @dentry back.  The filesystem pin is
 * kept.
 */
static struct dentry *end_creating(struct dentry *dentry)
{
        struct inode *dir = d_inode(dentry->d_parent);

        inode_unlock(dir);
        return dentry;
}

/*
 * Common implementation behind the debugfs_create_file_*() helpers.
 *
 * @name:       name of the file to create.
 * @mode:       permission bits; S_IFREG is added if no file type is given,
 *              and any other file type triggers BUG_ON().
 * @parent:     parent directory dentry, or NULL for the debugfs root.
 * @data:       stored in inode->i_private.
 * @aux:        stored in the debugfs inode info ->aux field.
 * @proxy_fops: file_operations installed as inode->i_fop when @real_fops is
 *              non-NULL; otherwise debugfs_noop_file_operations is used.
 * @real_fops:  stored in the debugfs inode info ->raw field.
 *
 * Returns the new dentry, or an ERR_PTR() on failure.
 */
static struct dentry *__debugfs_create_file(const char *name, umode_t mode,
                                struct dentry *parent, void *data,
                                const void *aux,
                                const struct file_operations *proxy_fops,
                                const void *real_fops)
{
        struct debugfs_inode_info *dinfo;
        struct dentry *dentry;
        struct inode *inode;

        if ((mode & S_IFMT) == 0)
                mode |= S_IFREG;
        BUG_ON(!S_ISREG(mode));

        dentry = start_creating(name, parent);
        if (IS_ERR(dentry))
                return dentry;

        if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
                failed_creating(dentry);
                return ERR_PTR(-EPERM);
        }

        inode = debugfs_get_inode(dentry->d_sb);
        if (unlikely(!inode)) {
                pr_err("out of free dentries, can not create file '%s'\n",
                       name);
                return failed_creating(dentry);
        }

        inode->i_mode = mode;
        inode->i_private = data;
        inode->i_op = &debugfs_file_inode_operations;
        /* Without real fops there is nothing to proxy to. */
        inode->i_fop = real_fops ? proxy_fops : &debugfs_noop_file_operations;

        dinfo = DEBUGFS_I(inode);
        dinfo->raw = real_fops;
        dinfo->aux = aux;

        d_instantiate(dentry, inode);
        fsnotify_create(d_inode(dentry->d_parent), dentry);

        return end_creating(dentry);
}

/*
 * Create a regular debugfs file backed by a full struct file_operations.
 * The file is created through __debugfs_create_file() with
 * debugfs_full_proxy_file_operations as the proxy and @fops recorded as the
 * real operations.  Returns the new dentry or an ERR_PTR() on failure.
 */
struct dentry *debugfs_create_file_full(const char *name, umode_t mode,
                                        struct dentry *parent, void *data,
                                        const void *aux,
                                        const struct file_operations *fops)
{
        return __debugfs_create_file(name, mode, parent, data, aux,
                                &debugfs_full_proxy_file_operations,
                                fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_full);

/*
 * Create a regular debugfs file backed by a struct debugfs_short_fops.
 * The file is created through __debugfs_create_file() with
 * debugfs_full_short_proxy_file_operations as the proxy and @fops recorded
 * as the real operations.  Returns the new dentry or an ERR_PTR() on
 * failure.
 */
struct dentry *debugfs_create_file_short(const char *name, umode_t mode,
                                        struct dentry *parent, void *data,
                                        const void *aux,
                                        const struct debugfs_short_fops *fops)
{
        return __debugfs_create_file(name, mode, parent, data, aux,
                                &debugfs_full_short_proxy_file_operations,
                                fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_short);

/**
 * debugfs_create_file_unsafe - create a file in the debugfs filesystem
 * @name: a pointer to a string containing the name of the file to create.
 * @mode: the permission that the file should have.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this parameter is NULL, then the
 *          file will be created in the root of the debugfs filesystem.
 * @data: a pointer to something that the caller will want to get to later
 *        on.  The inode.i_private pointer will point to this value on
 *        the open() call.
 * @fops: a pointer to a struct file_operations that should be used for
 *        this file.
 *
 * debugfs_create_file_unsafe() is completely analogous to
 * debugfs_create_file(), the only difference being that the fops
 * handed it will not get protected against file removals by the
 * debugfs core.
 *
 * It is your responsibility to protect your struct file_operation
 * methods against file removals by means of debugfs_file_get()
 * and debugfs_file_put(). ->open() is still protected by
 * debugfs though.
 *
 * Any struct file_operations defined by means of
 * DEFINE_DEBUGFS_ATTRIBUTE() is protected against file removals and
 * thus, may be used here.
 *
 * Return: a pointer to the new dentry on success, ERR_PTR(-ERROR) on
 * failure.
 */
struct dentry *debugfs_create_file_unsafe(const char *name, umode_t mode,
                                   struct dentry *parent, void *data,
                                   const struct file_operations *fops)
{

        /* No aux pointer; the open-only proxy is installed as i_fop. */
        return __debugfs_create_file(name, mode, parent, data, NULL,
                                &debugfs_open_proxy_file_operations,
                                fops);
}
EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe);

/**
 * debugfs_create_file_size - create a file in the debugfs filesystem
 * @name: a pointer to a string containing the name of the file to create.
 * @mode: the permission that the file should have.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this parameter is NULL, then the
 *          file will be created in the root of the debugfs filesystem.
 * @data: a pointer to something that the caller will want to get to later
 *        on.  The inode.i_private pointer will point to this value on
 *        the open() call.
 * @fops: a pointer to a struct file_operations that should be used for
 *        this file.
 * @file_size: initial file size
 *
 * Creates the file through debugfs_create_file() and, if that succeeds,
 * sets the size of the new inode to @file_size.  Nothing is returned; a
 * failure to create the file is silently ignored.
 */
void debugfs_create_file_size(const char *name, umode_t mode,
                              struct dentry *parent, void *data,
                              const struct file_operations *fops,
                              loff_t file_size)
{
        struct dentry *dentry;

        dentry = debugfs_create_file(name, mode, parent, data, fops);
        if (IS_ERR(dentry))
                return;

        d_inode(dentry)->i_size = file_size;
}
EXPORT_SYMBOL_GPL(debugfs_create_file_size);

/**
 * debugfs_create_dir - create a directory in the debugfs filesystem
 * @name: a pointer to a string containing the name of the directory to
 *        create.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this parameter is NULL, then the
 *          directory will be created in the root of the debugfs filesystem.
 *
 * This function creates a directory in debugfs with the given name.  The
 * directory is created with mode 0755.
 *
 * Return: a pointer to a dentry on success.  This pointer must be passed to
 * the debugfs_remove() function when the directory is to be removed (no
 * automatic cleanup happens if your module is unloaded, you are responsible
 * here.)  If an error occurs, ERR_PTR(-ERROR) will be returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 *
 * NOTE: it's expected that most callers should _ignore_ the errors returned
 * by this function. Other debugfs functions handle the fact that the "dentry"
 * passed to them could be an error and they don't crash in that case.
 * Drivers should generally work fine even if debugfs fails to init anyway.
 */
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
        struct inode *inode, *dir;
        struct dentry *dentry;

        dentry = start_creating(name, parent);
        if (IS_ERR(dentry))
                return dentry;

        if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
                failed_creating(dentry);
                return ERR_PTR(-EPERM);
        }

        inode = debugfs_get_inode(dentry->d_sb);
        if (unlikely(!inode)) {
                pr_err("out of free dentries, can not create directory '%s'\n",
                       name);
                return failed_creating(dentry);
        }

        inode->i_mode = S_IFDIR | 0755;
        inode->i_op = &debugfs_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        /* directory inodes start off with i_nlink == 2 (for "." entry) */
        inc_nlink(inode);

        d_instantiate(dentry, inode);

        /* The new subdirectory also adds a link to its parent. */
        dir = d_inode(dentry->d_parent);
        inc_nlink(dir);
        fsnotify_mkdir(dir, dentry);

        return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);

/**
 * debugfs_create_automount - create automount point in the debugfs filesystem
 * @name: a pointer to a string containing the name of the file to create.
 * @parent: a pointer to the parent dentry for this file.  This should be a
 *          directory dentry if set.  If this parameter is NULL, then the
 *          file will be created in the root of the debugfs filesystem.
 * @f: function to be called when pathname resolution steps on that one.
 * @data: opaque argument to pass to f().
 *
 * @f should return what ->d_automount() would.
 *
 * Return: a pointer to the new dentry on success, ERR_PTR(-ERROR) on
 * failure.
 */
struct dentry *debugfs_create_automount(const char *name,
                                        struct dentry *parent,
                                        debugfs_automount_t f,
                                        void *data)
{
        struct inode *inode, *dir;
        struct dentry *dentry;

        dentry = start_creating(name, parent);
        if (IS_ERR(dentry))
                return dentry;

        if (!(debugfs_allow & DEBUGFS_ALLOW_API)) {
                failed_creating(dentry);
                return ERR_PTR(-EPERM);
        }

        inode = debugfs_get_inode(dentry->d_sb);
        if (unlikely(!inode)) {
                pr_err("out of free dentries, can not create automount '%s'\n",
                       name);
                return failed_creating(dentry);
        }

        make_empty_dir_inode(inode);
        inode->i_flags |= S_AUTOMOUNT;
        /* Both are picked up again by debugfs_automount(). */
        inode->i_private = data;
        DEBUGFS_I(inode)->automount = f;
        /* directory inodes start off with i_nlink == 2 (for "." entry) */
        inc_nlink(inode);

        d_instantiate(dentry, inode);

        dir = d_inode(dentry->d_parent);
        inc_nlink(dir);
        fsnotify_mkdir(dir, dentry);

        return end_creating(dentry);
}
EXPORT_SYMBOL(debugfs_create_automount);

/**
 * debugfs_create_symlink - create a symbolic link in the debugfs filesystem
 * @name: a pointer to a string containing the name of the symbolic link to
 *        create.
 * @parent: a pointer to the parent dentry for this symbolic link.  This
 *          should be a directory dentry if set.  If this parameter is NULL,
 *          then the symbolic link will be created in the root of the debugfs
 *          filesystem.
 * @target: a pointer to a string containing the path to the target of the
 *          symbolic link.
 *
 * This function creates a symbolic link with the given name in debugfs that
 * links to the given target path.  The target string is duplicated, so the
 * caller keeps ownership of @target.
 *
 * Return: a pointer to a dentry on success.  This pointer must be passed to
 * the debugfs_remove() function when the symbolic link is to be removed (no
 * automatic cleanup happens if your module is unloaded, you are responsible
 * here.)  If an error occurs, ERR_PTR(-ERROR) will be returned.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
                                      const char *target)
{
        struct dentry *dentry;
        struct inode *inode;
        char *target_copy;

        target_copy = kstrdup(target, GFP_KERNEL);
        if (!target_copy)
                return ERR_PTR(-ENOMEM);

        dentry = start_creating(name, parent);
        if (IS_ERR(dentry)) {
                kfree(target_copy);
                return dentry;
        }

        inode = debugfs_get_inode(dentry->d_sb);
        if (unlikely(!inode)) {
                pr_err("out of free dentries, can not create symlink '%s'\n",
                       name);
                kfree(target_copy);
                return failed_creating(dentry);
        }

        inode->i_mode = S_IFLNK | 0777;
        inode->i_op = &debugfs_symlink_inode_operations;
        /* Freed again in debugfs_free_inode(). */
        inode->i_link = target_copy;

        d_instantiate(dentry, inode);
        return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);

/*
 * Called from remove_one() for regular files that are being removed.
 *
 * If a debugfs_fsdata is attached to @dentry, drop one reference on its
 * active_users count.  If other references remain, run any registered
 * cancellation callbacks and wait on active_users_drained until the count
 * has dropped to zero.  Returns immediately when there is no fsdata.
 */
static void __debugfs_file_removed(struct dentry *dentry)
{
        struct debugfs_fsdata *fsd;

        /*
         * Paired with the closing smp_mb() implied by a successful
         * cmpxchg() in debugfs_file_get(): either
         * debugfs_file_get() must see a dead dentry or we must see a
         * debugfs_fsdata instance at ->d_fsdata here (or both).
         */
        smp_mb();
        fsd = READ_ONCE(dentry->d_fsdata);
        if (!fsd)
                return;

        /* if this was the last reference, we're done */
        if (refcount_dec_and_test(&fsd->active_users))
                return;

        /*
         * If there's still a reference, the code that obtained it can
         * be in different states:
         *  - The common case of not using cancellations, or already
         *    after debugfs_leave_cancellation(), where we just need
         *    to wait for debugfs_file_put() which signals the completion;
         *  - inside a cancellation section, i.e. between
         *    debugfs_enter_cancellation() and debugfs_leave_cancellation(),
         *    in which case we need to trigger the ->cancel() function,
         *    and then wait for debugfs_file_put() just like in the
         *    previous case;
         *  - before debugfs_enter_cancellation() (but obviously after
         *    debugfs_file_get()), in which case we may not see the
         *    cancellation in the list on the first round of the loop,
         *    but debugfs_enter_cancellation() signals the completion
         *    after adding it, so this code gets woken up to call the
         *    ->cancel() function.
         */
        while (refcount_read(&fsd->active_users)) {
                struct debugfs_cancellation *c;

                /*
                 * Lock the cancellations. Note that the cancellations
                 * structs are meant to be on the stack, so we need to
                 * ensure we either use them here or don't touch them,
                 * and debugfs_leave_cancellation() will wait for this
                 * to be finished processing before exiting one. It may
                 * of course win and remove the cancellation, but then
                 * chances are we never even got into this bit, we only
                 * do if the refcount isn't zero already.
                 */
                mutex_lock(&fsd->cancellations_mtx);
                /* Unlink each pending cancellation before invoking it. */
                while ((c = list_first_entry_or_null(&fsd->cancellations,
                                                     typeof(*c), list))) {
                        list_del_init(&c->list);
                        c->cancel(dentry, c->cancel_data);
                }
                mutex_unlock(&fsd->cancellations_mtx);

                wait_for_completion(&fsd->active_users_drained);
        }
}

/*
 * Per-dentry callback handed to simple_recursive_removal() by
 * debugfs_remove().  Regular files first go through
 * __debugfs_file_removed(); every removed entry then releases one pin on
 * the debugfs filesystem.
 */
static void remove_one(struct dentry *victim)
{
        if (d_is_reg(victim))
                __debugfs_file_removed(victim);
        simple_release_fs(&debugfs_mount, &debugfs_mount_count);
}

/**
 * debugfs_remove - recursively removes a directory
 * @dentry: a pointer to the dentry of the directory to be removed.  If this
 *          parameter is NULL or an error value, nothing will be done.
 *
 * This function recursively removes a directory tree in debugfs that
 * was previously created with a call to another debugfs function
 * (like debugfs_create_file() or variants thereof.)
 *
 * This function is required to be called in order for the file to be
 * removed, no automatic cleanup of files will happen when a module is
 * removed, you are responsible here.
 */
void debugfs_remove(struct dentry *dentry)
{
        if (!IS_ERR_OR_NULL(dentry)) {
                /* Keep the filesystem pinned for the duration of the removal. */
                simple_pin_fs(&debug_fs_type, &debugfs_mount,
                              &debugfs_mount_count);
                simple_recursive_removal(dentry, remove_one);
                simple_release_fs(&debugfs_mount, &debugfs_mount_count);
        }
}
EXPORT_SYMBOL_GPL(debugfs_remove);

/**
 * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
 * @name: a pointer to a string containing the name of the item to look up.
 * @parent: a pointer to the parent dentry of the item.
 *
 * This is the equivalent of doing something like
 * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
 * handled for the directory being looked up.  Nothing is done if the lookup
 * does not find the item.
 */
void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
{
        struct dentry *victim = debugfs_lookup(name, parent);

        if (victim) {
                debugfs_remove(victim);
                /* Drop the reference obtained by debugfs_lookup(). */
                dput(victim);
        }
}
EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);

/**
 * debugfs_change_name - rename a file/directory in the debugfs filesystem
 * @dentry: dentry of an object to be renamed.
 * @fmt: format for new name
 *
 * This function renames a file/directory in debugfs.  The target must not
 * exist for rename to succeed.  The object stays in its current parent
 * directory; only its name changes.
 *
 * This function will return 0 on success and -E... on failure.  A %NULL or
 * error-pointer @dentry, or a new name equal to the current one, is treated
 * as success without doing anything.
 *
 * If debugfs is not enabled in the kernel, the value -%ENODEV will be
 * returned.
 */
int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, ...)
{
        int error = 0;
        const char *new_name;
        struct name_snapshot old_name;
        struct dentry *parent, *target;
        struct inode *dir;
        va_list ap;

        if (IS_ERR_OR_NULL(dentry))
                return 0;

        /* Build the new name; released with kfree_const() at the end. */
        va_start(ap, fmt);
        new_name = kvasprintf_const(GFP_KERNEL, fmt, ap);
        va_end(ap);
        if (!new_name)
                return -ENOMEM;

        /* Pin the parent and lock its inode for the lookup and the move. */
        parent = dget_parent(dentry);
        dir = d_inode(parent);
        inode_lock(dir);

        /* The old name is needed for the comparison and for fsnotify_move(). */
        take_dentry_name_snapshot(&old_name, dentry);

        /* The parent must not have changed since dget_parent() above. */
        if (WARN_ON_ONCE(dentry->d_parent != parent)) {
                error = -EINVAL;
                goto out;
        }
        /* Renaming to the current name is a no-op. */
        if (strcmp(old_name.name.name, new_name) == 0)
                goto out;
        target = lookup_one_len(new_name, parent, strlen(new_name));
        if (IS_ERR(target)) {
                error = PTR_ERR(target);
                goto out;
        }
        /* An entry with the new name already exists. */
        if (d_really_is_positive(target)) {
                dput(target);
                error = -EINVAL;
                goto out;
        }
        simple_rename_timestamp(dir, dentry, dir, target);
        d_move(dentry, target);
        dput(target);
        fsnotify_move(dir, dir, &old_name.name, d_is_dir(dentry), NULL, dentry);
out:
        /* Common exit: undo the snapshot, lock, parent reference and name. */
        release_dentry_name_snapshot(&old_name);
        inode_unlock(dir);
        dput(parent);
        kfree_const(new_name);
        return error;
}
EXPORT_SYMBOL_GPL(debugfs_change_name);

/**
 * debugfs_initialized - Tells whether debugfs has been registered
 *
 * Return: %true once debugfs_init() has registered the filesystem type,
 * %false otherwise.
 */
bool debugfs_initialized(void)
{
        return debugfs_registered;
}
EXPORT_SYMBOL_GPL(debugfs_initialized);

/*
 * Handler for the "debugfs=" early parameter.
 *
 *   "on"        allow both the debugfs API and mounting,
 *   "no-mount"  allow the API only,
 *   "off"       allow neither.
 *
 * A missing or unrecognised value leaves the default untouched.
 * Always returns 0.
 */
static int __init debugfs_kernel(char *str)
{
        if (!str)
                return 0;

        if (strcmp(str, "on") == 0)
                debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT;
        else if (strcmp(str, "no-mount") == 0)
                debugfs_allow = DEBUGFS_ALLOW_API;
        else if (strcmp(str, "off") == 0)
                debugfs_allow = 0;

        return 0;
}
early_param("debugfs", debugfs_kernel);
/*
 * Core initcall: create the "debug" mount point under kernel_kobj, set up
 * the debugfs inode cache and register the filesystem type.  Each step is
 * undone again if a later one fails.
 *
 * Returns 0 on success, -EPERM if mounting is not allowed, -ENOMEM if the
 * inode cache cannot be created, or the error from
 * sysfs_create_mount_point()/register_filesystem().
 */
static int __init debugfs_init(void)
{
        int err;

        if ((debugfs_allow & DEBUGFS_ALLOW_MOUNT) == 0)
                return -EPERM;

        err = sysfs_create_mount_point(kernel_kobj, "debug");
        if (err)
                return err;

        debugfs_inode_cachep = kmem_cache_create("debugfs_inode_cache",
                                sizeof(struct debugfs_inode_info), 0,
                                SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
                                init_once);
        if (!debugfs_inode_cachep) {
                sysfs_remove_mount_point(kernel_kobj, "debug");
                return -ENOMEM;
        }

        err = register_filesystem(&debug_fs_type);
        if (err) {
                /* Really not going to happen */
                sysfs_remove_mount_point(kernel_kobj, "debug");
                kmem_cache_destroy(debugfs_inode_cachep);
                return err;
        }

        debugfs_registered = true;
        return 0;
}
core_initcall(debugfs_init);

















































































































































































































































































































































































































































































































































  147 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
#include <linux/sched/isolation.h>

#include "internal.h"

/* Prototypes for static helpers that are referenced ahead of their bodies. */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          enum rw_hint hint, struct writeback_control *wbc);

/* Convert a b_assoc_buffers list node into its containing buffer_head. */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

/*
 * Note an access to @bh: call trace_block_touch_buffer() for it and mark
 * the folio that holds the buffer as accessed.
 */
inline void touch_buffer(struct buffer_head *bh)
{
        trace_block_touch_buffer(bh);
        folio_mark_accessed(bh->b_folio);
}
EXPORT_SYMBOL(touch_buffer);

/*
 * Take the BH_Lock bit in bh->b_state through wait_on_bit_lock_io(),
 * waiting in TASK_UNINTERRUPTIBLE state.
 */
void __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

/*
 * Release the BH_Lock bit of @bh and wake anybody waiting on that bit.
 */
void unlock_buffer(struct buffer_head *bh)
{
        clear_bit_unlock(BH_Lock, &bh->b_state);
        /* Barrier between clearing the bit and waking waiters on it. */
        smp_mb__after_atomic();
        wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Report through @dirty and @writeback whether @folio has dirty buffers
 * and/or buffers under writeback.  The folio must be locked by the caller.
 *
 * Both outputs are false when the folio has no buffers.  Otherwise
 * @writeback is true if the folio is under writeback or any buffer is
 * locked (a locked buffer is assumed to be locked for IO), and @dirty is
 * true if any buffer is dirty.  If every buffer is unlocked and clean, the
 * folio_test_dirty information is stale.
 */
void buffer_check_dirty_writeback(struct folio *folio,
                                     bool *dirty, bool *writeback)
{
        struct buffer_head *first, *cur;

        *dirty = false;
        *writeback = false;

        BUG_ON(!folio_test_locked(folio));

        first = folio_buffers(folio);
        if (first == NULL)
                return;

        *writeback = folio_test_writeback(folio);

        /* Walk the folio's circular list of buffers exactly once. */
        cur = first;
        do {
                if (buffer_locked(cur))
                        *writeback = true;
                if (buffer_dirty(cur))
                        *dirty = true;
                cur = cur->b_this_page;
        } while (cur != first);
}

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 *
 * The wait is on the BH_Lock bit of bh->b_state, in TASK_UNINTERRUPTIBLE
 * state; the bit is only waited for, not taken.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

/*
 * Emit a rate-limited "Buffer I/O error" message for @bh that names its
 * block device and logical block number, with @msg appended verbatim.
 * Nothing is printed when BH_Quiet is set in bh->b_state.
 *
 * @msg is only read here and callers pass string literals, so it is taken
 * as a pointer to const.
 */
static void buffer_io_error(struct buffer_head *bh, const char *msg)
{
        if (!test_bit(BH_Quiet, &bh->b_state))
                printk_ratelimited(KERN_ERR
                        "Buffer I/O error on dev %pg, logical block %llu%s\n",
                        bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * Read-completion helper: set or clear the uptodate state of @bh according
 * to @uptodate, then unlock it.  The bh is not touched after the unlock.
 *
 * Note: unlock_buffer() does sort-of use the bh after unlocking, but only
 * its address, for hashing; it does not touch the bh itself, so a race
 * there is benign.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                /* Failed reads do occur, e.g. from failed read-ahead attempts. */
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler.  Marks the buffer up-to-date (or
 * not, according to @uptodate), unlocks it, and then drops one reference
 * on it with put_bh().
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        __end_buffer_read_notouch(bh, uptodate);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

/*
 * Synchronous write completion for @bh.  A failed write is logged,
 * recorded with mark_buffer_write_io_error(), and clears the buffer's
 * uptodate state; a successful write marks the buffer uptodate.  Either
 * way the buffer is then unlocked and one reference is dropped.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
        if (!uptodate) {
                buffer_io_error(bh, ", lost sync page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        } else {
                set_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Look up the buffer_head for @block in the page cache of @bdev's mapping.
 *
 * Returns the matching buffer with an extra reference taken via get_bh(),
 * or NULL when the folio lookup fails, the folio has no buffers, the
 * buffers are flagged BH_Migrate, or no mapped buffer carries that block
 * number.
 *
 * @atomic selects how the folio's buffers are protected while they are
 * walked: the mapping's i_private_lock spinlock when true, the folio lock
 * otherwise.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic)
{
        struct address_space *bd_mapping = bdev->bd_mapping;
        const int blkbits = bd_mapping->host->i_blkbits;
        struct buffer_head *ret = NULL;
        pgoff_t index;
        struct buffer_head *bh;
        struct buffer_head *head;
        struct folio *folio;
        int all_mapped = 1;
        static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

        /* Byte offset of the block, converted to a PAGE_SIZE-based index. */
        index = ((loff_t)block << blkbits) / PAGE_SIZE;
        folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
        if (IS_ERR(folio))
                goto out;

        /*
         * Folio lock protects the buffers. Callers that cannot block
         * will fallback to serializing vs try_to_free_buffers() via
         * the i_private_lock.
         */
        if (atomic)
                spin_lock(&bd_mapping->i_private_lock);
        else
                folio_lock(folio);

        head = folio_buffers(folio);
        if (!head)
                goto out_unlock;
        /*
         * Upon a noref migration, the folio lock serializes here;
         * otherwise bail.
         */
        if (test_bit_acquire(BH_Migrate, &head->b_state)) {
                WARN_ON(!atomic);
                goto out_unlock;
        }

        /*
         * Walk the folio's circular buffer list once, noting any unmapped
         * buffer and stopping at the first mapped buffer for @block.
         */
        bh = head;
        do {
                if (!buffer_mapped(bh))
                        all_mapped = 0;
                else if (bh->b_blocknr == block) {
                        ret = bh;
                        get_bh(bh);
                        goto out_unlock;
                }
                bh = bh->b_this_page;
        } while (bh != head);

        /* we might be here because some of the buffers on this page are
         * not mapped.  This is due to various races between
         * file io on the block device and getblk.  It gets dealt with
         * elsewhere, don't buffer_error if we had some unmapped buffers
         *
         * The loop above ended with bh == head, so the fields printed
         * below are those of the folio's first buffer.
         */
        ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
        if (all_mapped && __ratelimit(&last_warned)) {
                printk("__find_get_block_slow() failed. block=%llu, "
                       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
                       "device %pg blocksize: %d\n",
                       (unsigned long long)block,
                       (unsigned long long)bh->b_blocknr,
                       bh->b_state, bh->b_size, bdev,
                       1 << blkbits);
        }
out_unlock:
        /* Release whichever lock was taken above, then the folio reference. */
        if (atomic)
                spin_unlock(&bd_mapping->i_private_lock);
        else
                folio_unlock(folio);
        folio_put(folio);
out:
        return ret;
}

/*
 * Per-buffer completion for an async folio read.  Records the result on
 * @bh, clears its BH_Async_Read flag and unlocks it, then ends the read on
 * the folio once no buffer in the folio's ring still has BH_Async_Read set.
 * The folio is reported uptodate only if every buffer in the ring is.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct folio *folio;
        int folio_uptodate = 1;

        BUG_ON(!buffer_async_read(bh));

        folio = bh->b_folio;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                clear_buffer_uptodate(bh);
                buffer_io_error(bh, ", async page read");
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         *
         * The scan below therefore runs under the b_uptodate_lock of the
         * folio's first buffer, with interrupts disabled.
         */
        first = folio_buffers(folio);
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        /* Scan the whole ring, starting with (and including) bh itself. */
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        folio_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        /* Another buffer's read is still outstanding. */
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

        /* No buffer is still flagged async-read: finish the folio read. */
        folio_end_read(folio, folio_uptodate);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * Context for deferred post-read processing (decryption and/or verity
 * verification) of a single buffer.  Allocated in
 * end_buffer_async_read_io() and freed by the work function that finishes
 * the read.
 */
struct postprocess_bh_ctx {
        struct work_struct work;        /* work item queued for decrypt_bh/verify_bh */
        struct buffer_head *bh;         /* buffer whose read is being post-processed */
};

/*
 * Work function: run fsverity verification over the blocks covered by the
 * context's buffer, complete the async read with the verification result,
 * and free the context.
 */
static void verify_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *pctx =
                container_of(work, struct postprocess_bh_ctx, work);
        struct buffer_head *bh = pctx->bh;

        end_buffer_async_read(bh,
                              fsverity_verify_blocks(bh->b_folio, bh->b_size,
                                                     bh_offset(bh)));
        kfree(pctx);
}

/*
 * Decide whether the data in @bh must be verified with fsverity: verity
 * has to be active on the owning inode, and the folio index has to lie
 * below i_size rounded up to whole pages (the latter is needed by ext4).
 */
static bool need_fsverity(struct buffer_head *bh)
{
        struct folio *folio = bh->b_folio;
        struct inode *inode = folio->mapping->host;

        if (!fsverity_active(inode))
                return false;

        /* needed by ext4 */
        return folio->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

/*
 * Work function: decrypt the pagecache blocks covered by the context's
 * buffer.  When decryption succeeds and the buffer also needs fsverity,
 * the same context is re-queued for verification; otherwise the async read
 * is completed here (successfully only if decryption returned 0) and the
 * context is freed.
 */
static void decrypt_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *pctx =
                container_of(work, struct postprocess_bh_ctx, work);
        struct buffer_head *bh = pctx->bh;
        int err = fscrypt_decrypt_pagecache_blocks(bh->b_folio, bh->b_size,
                                                   bh_offset(bh));

        if (err != 0 || !need_fsverity(bh)) {
                end_buffer_async_read(bh, err == 0);
                kfree(pctx);
                return;
        }

        /*
         * Decryption and verity run on separate work queues: verity may
         * need to read metadata pages that themselves require decryption,
         * and we shouldn't recurse to the same workqueue.
         */
        INIT_WORK(&pctx->work, verify_bh);
        fsverity_enqueue_verify_work(&pctx->work);
}

/*
 * I/O completion handler for block_read_full_folio() - pages
 * which come unlocked at the end of I/O.
 *
 * A successful read that needs fscrypt decryption and/or fsverity
 * verification is handed to a work item, which completes the read later.
 * If the work context cannot be allocated the read is completed as failed.
 * Everything else is completed immediately with @uptodate as given.
 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
        struct inode *inode = bh->b_folio->mapping->host;
        bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
        bool verify = need_fsverity(bh);
        struct postprocess_bh_ctx *pctx;

        /* Nothing to post-process: failed read, or neither step needed. */
        if (!uptodate || !(decrypt || verify)) {
                end_buffer_async_read(bh, uptodate);
                return;
        }

        pctx = kmalloc(sizeof(*pctx), GFP_ATOMIC);
        if (!pctx) {
                end_buffer_async_read(bh, 0);
                return;
        }

        /* Decryption goes first; decrypt_bh() chains to verity if needed. */
        pctx->bh = bh;
        if (decrypt) {
                INIT_WORK(&pctx->work, decrypt_bh);
                fscrypt_enqueue_decrypt_work(&pctx->work);
        } else {
                INIT_WORK(&pctx->work, verify_bh);
                fsverity_enqueue_verify_work(&pctx->work);
        }
}

/*
 * Completion handler for block_write_full_folio() - folios which are unlocked
 * during I/O, and which have the writeback flag cleared upon I/O completion.
 *
 * Records the result on @bh, clears its BH_Async_Write flag and unlocks it,
 * then ends writeback on the folio once no other buffer in the folio's ring
 * still has BH_Async_Write set.
 */
static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct folio *folio;

        BUG_ON(!buffer_async_write(bh));

        folio = bh->b_folio;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        }

        /*
         * The scan below runs under the b_uptodate_lock of the folio's
         * first buffer, with interrupts disabled.
         */
        first = folio_buffers(folio);
        spin_lock_irqsave(&first->b_uptodate_lock, flags);

        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        /* Check every other buffer in the ring; bh itself was just cleared. */
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        /* Another buffer's write is still outstanding. */
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        folio_end_writeback(folio);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

/*
 * If a page's buffers are under async readin (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async read I/O on any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        /* Install the async-read completion handler, then flag the buffer. */
        bh->b_end_io = end_buffer_async_read_io;
        set_buffer_async_read(bh);
}

/*
 * Install @handler as the completion callback of @bh and set the buffer's
 * async-write flag.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
                                          bh_end_io_t *handler)
{
        bh->b_end_io = handler;
        set_buffer_async_write(bh);
}

/*
 * Exported form of mark_buffer_async_write_endio() that always installs
 * end_buffer_async_write as the completion handler for @bh.
 */
void mark_buffer_async_write(struct buffer_head *bh)
{
        mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->i_private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for i_private_list is via the i_private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space 
 * against which the buffers are listed.  So for a particular address_space,
 * mapping->i_private_lock does *not* protect mapping->i_private_list!  In fact,
 * mapping->i_private_list will always be protected by the backing blockdev's
 * ->i_private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->i_private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->i_private_list via these
 * utility functions are free to use i_private_lock and i_private_list for
 * whatever they want.  The only requirement is that list_empty(i_private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's i_private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
        /* Unlink the buffer and leave its list node self-linked (empty). */
        list_del_init(&bh->b_assoc_buffers);
        /* A buffer being dequeued is expected to have an associated mapping. */
        WARN_ON(!bh->b_assoc_map);
        bh->b_assoc_map = NULL;
}

/*
 * Report whether @inode has any buffers queued on its
 * i_data.i_private_list.  Returns 1 if the list is non-empty, 0 otherwise.
 */
int inode_has_buffers(struct inode *inode)
{
        struct list_head *assoc = &inode->i_data.i_private_list;

        if (list_empty(assoc))
                return 0;
        return 1;
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
 * as you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head *p;
        int err = 0;

        spin_lock(lock);
repeat:
        /* Walk the list from its tail towards its head. */
        list_for_each_prev(p, list) {
                bh = BH_ENTRY(p);
                if (buffer_locked(bh)) {
                        /* Take a reference before dropping the list lock. */
                        get_bh(bh);
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        /* A buffer that is not uptodate after the wait is reported as -EIO. */
                        if (!buffer_uptodate(bh))
                                err = -EIO;
                        brelse(bh);
                        spin_lock(lock);
                        /*
                         * The list was not protected while we waited, so
                         * the walk restarts from the tail.
                         */
                        goto repeat;
                }
        }
        spin_unlock(lock);
        /* 0, or -EIO if any waited-on buffer ended up not uptodate. */
        return err;
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->i_private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
        struct address_space *backing = mapping->i_private_data;

        /* No backing mapping recorded, so there is nothing to sync. */
        if (!backing)
                return 0;
        if (list_empty(&mapping->i_private_list))
                return 0;

        /* The list is guarded by the backing mapping's lock, not our own. */
        return fsync_buffers_list(&backing->i_private_lock,
                                  &mapping->i_private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/**
 * generic_buffers_fsync_noflush - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 */
int generic_buffers_fsync_noflush(struct file *file, loff_t start, loff_t end,
                                  bool datasync)
{
        struct inode *inode = file->f_mapping->host;
        int ret;
        int wb_err;

        /* Data first: a failure here ends the fsync immediately. */
        ret = file_write_and_wait_range(file, start, end);
        if (ret)
                return ret;

        ret = sync_mapping_buffers(inode->i_mapping);

        /*
         * Write the inode itself only if it is dirty, and for datasync
         * only if I_DIRTY_DATASYNC is set.
         */
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                int meta_err = sync_inode_metadata(inode, 1);

                /* The first error encountered is the one reported. */
                if (!ret)
                        ret = meta_err;
        }

        /* check and advance again to catch errors after syncing out buffers */
        wb_err = file_check_and_advance_wb_err(file);
        return ret ? ret : wb_err;
}
EXPORT_SYMBOL(generic_buffers_fsync_noflush);

/**
 * generic_buffers_fsync - generic buffer fsync implementation
 * for simple filesystems with no inode lock
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure. This also makes sure that
 * a device cache flush operation is called at the end.
 */
int generic_buffers_fsync(struct file *file, loff_t start, loff_t end,
                          bool datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err = generic_buffers_fsync_noflush(file, start, end, datasync);

        /* The device cache flush is only issued if the sync itself succeeded. */
        if (err)
                return err;

        return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_buffers_fsync);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.  This means that the block at
 * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
 * dirty, schedule it for IO.  So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
                        sector_t bblock, unsigned blocksize)
{
        struct buffer_head *next;

        /* Look for a cached buffer for the block right after @bblock. */
        next = __find_get_block_nonatomic(bdev, bblock + 1, blocksize);
        if (!next)
                return;

        if (buffer_dirty(next))
                write_dirty_buffer(next, 0);

        /* Drop the reference obtained from the lookup. */
        put_bh(next);
}

/*
 * Dirty @bh and queue it on @inode's list of associated buffers
 * (i_mapping->i_private_list).  The first call for an inode records the
 * mapping that backs @bh in i_private_data; every later buffer must be
 * backed by that same mapping, which is enforced with BUG_ON().
 */
void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
        struct address_space *file_mapping = inode->i_mapping;
        struct address_space *backing = bh->b_folio->mapping;

        mark_buffer_dirty(bh);

        if (file_mapping->i_private_data) {
                BUG_ON(file_mapping->i_private_data != backing);
        } else {
                file_mapping->i_private_data = backing;
        }

        /* Lockless check: the buffer is already queued, nothing more to do. */
        if (bh->b_assoc_map)
                return;

        /* The list is protected by the backing mapping's i_private_lock. */
        spin_lock(&backing->i_private_lock);
        list_move_tail(&bh->b_assoc_buffers, &file_mapping->i_private_list);
        bh->b_assoc_map = file_mapping;
        spin_unlock(&backing->i_private_lock);
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/**
 * block_dirty_folio - Mark a folio as dirty.
 * @mapping: The address space containing this folio.
 * @folio: The folio to mark dirty.
 *
 * Filesystems which use buffer_heads can use this function as their
 * ->dirty_folio implementation.  Some filesystems need to do a little
 * work before calling this function.  Filesystems which do not use
 * buffer_heads should call filemap_dirty_folio() instead.
 *
 * If the folio has buffers, all of them are set dirty, to preserve
 * dirty-state coherency between the folio and the buffers.
 * Buffers added to a dirty folio are created dirty.
 *
 * The buffers are dirtied before the folio is dirtied.  There's a small
 * race window in which writeback may see the folio cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the folio
 * dirty before the buffers, writeback could clear the folio dirty flag,
 * see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * folio on the dirty folio list.
 *
 * We use i_private_lock to lock against try_to_free_buffers() while
 * using the folio's buffer list.  This also prevents clean buffers
 * being added to the folio after it was set dirty.
 *
 * Context: May only be called from process context.  Does not sleep.
 * Caller must ensure that @folio cannot be truncated during this call,
 * typically by holding the folio lock or having a page in the folio
 * mapped and holding the page table lock.
 *
 * Return: True if the folio was dirtied; false if it was already dirtied.
 */
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        struct buffer_head *head;
        bool newly_dirty;

        spin_lock(&mapping->i_private_lock);
        head = folio_buffers(folio);
        if (head) {
                struct buffer_head *bh = head;

                do {
                        set_buffer_dirty(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        /*
         * Lock out page's memcg migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        newly_dirty = !folio_test_set_dirty(folio);
        spin_unlock(&mapping->i_private_lock);

        if (newly_dirty)
                __folio_mark_dirty(folio, mapping, 1);

        if (newly_dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        return newly_dirty;
}
EXPORT_SYMBOL(block_dirty_folio);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 * 
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct address_space *mapping;
        int err = 0, err2;
        struct blk_plug plug;
        LIST_HEAD(tmp);

        blk_start_plug(&plug);

        /*
         * Stage 1: drain @list.  Buffers that are dirty or locked move to
         * the private tmp list, and a write is queued for the dirty ones.
         * Clean, unlocked buffers are simply dropped from the list.
         */
        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                /* __remove_assoc_queue() clears b_assoc_map, so save it first. */
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        bh->b_assoc_map = mapping;
                        if (buffer_dirty(bh)) {
                                /* Hold a reference while the lock is dropped. */
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * write_dirty_buffer() actually writes the
                                 * current contents - it is a noop if I/O is
                                 * still in flight on potentially older
                                 * contents.
                                 */
                                write_dirty_buffer(bh, REQ_SYNC);

                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * that we will not run the very last mapping,
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 *
                                 * NOTE(review): nothing here visibly kicks off
                                 * I/O; the brelse() below only drops our
                                 * reference.  This comment looks stale -
                                 * confirm against history.
                                 */
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        /* The plug is finished with the spinlock dropped. */
        spin_unlock(lock);
        blk_finish_plug(&plug);
        spin_lock(lock);

        /*
         * Stage 2: wait on every buffer collected on tmp.  Entries were
         * added at the head and are taken from the tail, so they are
         * waited on in the order they were queued.
         */
        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                /* Dirty again already: put it back on its mapping's list. */
                if (buffer_dirty(bh)) {
                        list_add(&bh->b_assoc_buffers,
                                 &mapping->i_private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }
        
        spin_unlock(lock);
        /* Final pass: wait for locked buffers that are now on @list. */
        err2 = osync_buffers_list(lock, list);
        /* An error from stage 2 takes precedence over the osync result. */
        if (err)
                return err;
        else
                return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's i_private_lock.  Which
 * assumes that all the buffers are against the blockdev.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        struct address_space *mapping;
        struct address_space *backing;
        struct list_head *assoc;

        if (!inode_has_buffers(inode))
                return;

        mapping = &inode->i_data;
        assoc = &mapping->i_private_list;
        backing = mapping->i_private_data;

        /* Detach every queued buffer under the backing mapping's lock. */
        spin_lock(&backing->i_private_lock);
        while (!list_empty(assoc))
                __remove_assoc_queue(BH_ENTRY(assoc->next));
        spin_unlock(&backing->i_private_lock);
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
        struct address_space *mapping = &inode->i_data;
        struct list_head *assoc = &mapping->i_private_list;
        struct address_space *backing;
        int all_removed = 1;

        if (!inode_has_buffers(inode))
                return 1;

        backing = mapping->i_private_data;

        spin_lock(&backing->i_private_lock);
        while (!list_empty(assoc)) {
                struct buffer_head *bh = BH_ENTRY(assoc->next);

                /* Stop at the first dirty buffer; it and the rest stay queued. */
                if (buffer_dirty(bh)) {
                        all_removed = 0;
                        break;
                }
                __remove_assoc_queue(bh);
        }
        spin_unlock(&backing->i_private_lock);

        return all_removed;
}

/*
 * Allocate a chain of buffer_heads covering @folio, each describing @size
 * bytes.  The buffers are linked through bh->b_this_page in ascending
 * offset order, and the last buffer's b_this_page is left NULL.  Each
 * buffer is attached to @folio at its offset and has b_blocknr set to -1.
 *
 * Allocations use @gfp and are made with the folio's memcg installed as
 * the active memcg.
 *
 * Returns the head of the chain.  Returns NULL if an allocation failed (in
 * which case every buffer allocated so far has been freed) or if no buffer
 * of @size fits in the folio.
 */
struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size,
                                        gfp_t gfp)
{
        struct buffer_head *chain = NULL;
        struct buffer_head *bh;
        struct mem_cgroup *memcg, *old_memcg;
        long offset;

        /* The folio lock pins the memcg */
        memcg = folio_memcg(folio);
        old_memcg = set_active_memcg(memcg);

        /* Work backwards from the end of the folio, one buffer per step. */
        offset = folio_size(folio);
        while ((offset -= size) >= 0) {
                bh = alloc_buffer_head(gfp);
                if (!bh) {
                        /* Allocation failed: free everything built so far. */
                        while (chain) {
                                bh = chain;
                                chain = chain->b_this_page;
                                free_buffer_head(bh);
                        }
                        break;
                }

                /* Push the new buffer onto the front of the chain. */
                bh->b_this_page = chain;
                bh->b_blocknr = -1;
                bh->b_size = size;
                chain = bh;

                /* Link the buffer to its folio */
                folio_set_bh(bh, folio, offset);
        }

        set_active_memcg(old_memcg);
        return chain;
}
EXPORT_SYMBOL_GPL(folio_alloc_buffers);

/*
 * Page-based wrapper around folio_alloc_buffers(): allocates @size-byte
 * buffers for the folio containing @page using GFP_NOFS | __GFP_ACCOUNT.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size)
{
        struct folio *folio = page_folio(page);

        return folio_alloc_buffers(folio, size, GFP_NOFS | __GFP_ACCOUNT);
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

/*
 * Close the NULL-terminated b_this_page chain starting at @head into a
 * ring, then attach @head to @folio as its private data.
 */
static inline void link_dev_buffers(struct folio *folio,
                struct buffer_head *head)
{
        struct buffer_head *last = head;

        /* Advance to the final buffer of the chain. */
        while (last->b_this_page)
                last = last->b_this_page;

        last->b_this_page = head;
        folio_attach_private(folio, head);
}

/*
 * Number of whole @size-byte blocks on @bdev.  A device that reports a
 * size of zero bytes yields the all-ones sector_t value instead.
 */
static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
        loff_t bytes = bdev_nr_bytes(bdev);

        if (!bytes)
                return ~((sector_t)0);

        return bytes >> blksize_bits(size);
}

/*
 * Initialise the state of a blockdev folio's buffers.
 *
 * Walks every buffer attached to @folio and, for each one that is not yet
 * mapped, points it at @bdev and its block number (in units of @size).
 * Buffers that are already mapped are left untouched.
 *
 * Returns the device's block count as computed by blkdev_max_block().
 */
static sector_t folio_init_buffers(struct folio *folio,
                struct block_device *bdev, unsigned size)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *bh = head;
        bool uptodate = folio_test_uptodate(folio);
        /* Block number of the folio's first buffer. */
        sector_t block = div_u64(folio_pos(folio), size);
        sector_t end_block = blkdev_max_block(bdev, size);

        /* Go once around the circular b_this_page ring. */
        do {
                if (!buffer_mapped(bh)) {
                        bh->b_end_io = NULL;
                        bh->b_private = NULL;
                        bh->b_bdev = bdev;
                        bh->b_blocknr = block;
                        /* An uptodate folio makes its new buffers uptodate too. */
                        if (uptodate)
                                set_buffer_uptodate(bh);
                        /* Blocks at or past end_block are left unmapped. */
                        if (block < end_block)
                                set_buffer_mapped(bh);
                }
                block++;
                bh = bh->b_this_page;
        } while (bh != head);

        /*
         * Caller needs to validate requested block against end of device.
         */
        return end_block;
}

/*
 * Create the page-cache folio that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 *
 * Returns false if we have a failure which cannot be cured by retrying
 * without sleeping.  Returns true if we succeeded, or the caller should retry.
 */
static bool grow_dev_folio(struct block_device *bdev, sector_t block,
                pgoff_t index, unsigned size, gfp_t gfp)
{
        struct address_space *mapping = bdev->bd_mapping;
        struct folio *folio;
        struct buffer_head *bh;
        /*
         * The return value is "block < end_block".  While end_block is
         * still 0 the function returns false.
         */
        sector_t end_block = 0;

        /* Find or create the folio; FGP_LOCK asks for it to come back locked. */
        folio = __filemap_get_folio(mapping, index,
                        FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
        if (IS_ERR(folio))
                return false;

        bh = folio_buffers(folio);
        if (bh) {
                /* Buffers of the right size already exist: just initialise them. */
                if (bh->b_size == size) {
                        end_block = folio_init_buffers(folio, bdev, size);
                        goto unlock;
                }

                /*
                 * Retrying may succeed; for example the folio may finish
                 * writeback, or buffers may be cleaned.  This should not
                 * happen very often; maybe we have old buffers attached to
                 * this blockdev's page cache and we're trying to change
                 * the block size?
                 */
                if (!try_to_free_buffers(folio)) {
                        /* All-ones makes "block < end_block" true: caller retries. */
                        end_block = ~0ULL;
                        goto unlock;
                }
        }

        /* On allocation failure end_block is still 0, so we return false. */
        bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
        if (!bh)
                goto unlock;

        /*
         * Link the folio to the buffers and initialise them.  Take the
         * lock to be atomic wrt __find_get_block(), which does not
         * run under the folio lock.
         */
        spin_lock(&mapping->i_private_lock);
        link_dev_buffers(folio, bh);
        end_block = folio_init_buffers(folio, bdev, size);
        spin_unlock(&mapping->i_private_lock);
unlock:
        folio_unlock(folio);
        folio_put(folio);
        return block < end_block;
}

/*
 * Create buffers for the specified block device block's folio.  If
 * that folio was dirty, the buffers are set dirty also.  Returns false
 * if we've hit a permanent error.
 */
static bool grow_buffers(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        loff_t byte_pos;
        bool out_of_range;

        /*
         * Reject a block whose byte position overflows, or which lies
         * outside our maximum possible pagecache index.
         */
        out_of_range = check_mul_overflow(block, (sector_t)size, &byte_pos) ||
                       byte_pos > MAX_LFS_FILESIZE;
        if (out_of_range) {
                printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
                        __func__, (unsigned long long)block,
                        bdev);
                return false;
        }

        /* Create a folio with the proper size buffers */
        return grow_dev_folio(bdev, block, byte_pos / PAGE_SIZE, size, gfp);
}

/*
 * Slow path for obtaining the buffer_head for @block on @bdev.
 *
 * Validates @size, then alternates between looking the buffer up with
 * __find_get_block() and creating the backing buffers with grow_buffers()
 * until the lookup succeeds or grow_buffers() reports failure.
 *
 * Returns the buffer_head from __find_get_block(), or NULL if @size is
 * invalid or grow_buffers() returned false.
 */
static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
             unsigned size, gfp_t gfp)
{
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                /*
                 * Both values are unsigned, so print them with %u; a huge
                 * bogus size would otherwise be shown as a negative number.
                 */
                printk(KERN_ERR "getblk(): invalid block size %u requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %u\n",
                                        bdev_logical_block_size(bdev));

                dump_stack();
                return NULL;
        }

        for (;;) {
                struct buffer_head *bh;

                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;

                /* Not cached yet: try to create the buffers, then look again. */
                if (!grow_buffers(bdev, block, size, gfp))
                        return NULL;
        }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in the page cache.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page.  If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also.  When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate.  But their backing page remains not
 * uptodate - even if all of its buffers are uptodate.  A subsequent
 * block_read_full_folio() against that folio will discover all the uptodate
 * buffers, will set the folio uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
 * its backing page dirty, then tag the page as dirty in the page cache
 * and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_folio->mapping->i_private_lock,
 * i_pages lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        /* Warn (once) if a buffer that is not uptodate is being dirtied. */
        WARN_ON_ONCE(!buffer_uptodate(bh));

        trace_block_dirty_buffer(bh);

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        /* Only the caller that actually sets the dirty bit goes on. */
        if (!test_set_buffer_dirty(bh)) {
                struct folio *folio = bh->b_folio;
                /* Stays NULL unless this call also dirtied the folio. */
                struct address_space *mapping = NULL;

                if (!folio_test_set_dirty(folio)) {
                        mapping = folio->mapping;
                        if (mapping)
                                __folio_mark_dirty(folio, mapping, 0);
                }
                /* The inode is dirtied only when the folio was newly dirtied. */
                if (mapping)
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
}
EXPORT_SYMBOL(mark_buffer_dirty);

/*
 * Record a write I/O error against @bh.  Sets the buffer's write_io_error
 * flag and reports -EIO to the mapping of the buffer's folio (if it has
 * one) and to the buffer's associated mapping (if it has one), together
 * with that mapping's superblock s_wb_err.
 */
void mark_buffer_write_io_error(struct buffer_head *bh)
{
        set_buffer_write_io_error(bh);
        /* FIXME: do we need to set this in both places? */
        if (bh->b_folio && bh->b_folio->mapping)
                mapping_set_error(bh->b_folio->mapping, -EIO);
        if (bh->b_assoc_map) {
                mapping_set_error(bh->b_assoc_map, -EIO);
                /* Also record the error in the owning superblock's s_wb_err. */
                errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO);
        }
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

/**
 * __brelse - Release a buffer.
 * @bh: The buffer to release.
 *
 * This variant of brelse() can be called if @bh is guaranteed to not be NULL.
 */
void __brelse(struct buffer_head *bh)
{
        if (!atomic_read(&bh->b_count)) {
                /* The count is already zero: warn rather than drop it further. */
                WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
                return;
        }

        put_bh(bh);
}
EXPORT_SYMBOL(__brelse);

/**
 * __bforget - Discard any dirty data in a buffer.
 * @bh: The buffer to forget.
 *
 * This variant of bforget() can be called if @bh is guaranteed to not
 * be NULL.
 */
void __bforget(struct buffer_head *bh)
{
        clear_buffer_dirty(bh);
        /* Lockless check: only take the lock if the buffer is queued somewhere. */
        if (bh->b_assoc_map) {
                struct address_space *buffer_mapping = bh->b_folio->mapping;

                /* Dequeue under the lock of the mapping that backs the buffer. */
                spin_lock(&buffer_mapping->i_private_lock);
                list_del_init(&bh->b_assoc_buffers);
                bh->b_assoc_map = NULL;
                spin_unlock(&buffer_mapping->i_private_lock);
        }
        /* Finally drop the caller's reference. */
        __brelse(bh);
}
EXPORT_SYMBOL(__bforget);

/*
 * Make sure @bh holds valid data, submitting a read for it if necessary.
 *
 * Returns @bh if the buffer is uptodate, either already or after the read.
 * If the read leaves it not uptodate, @bh is released with brelse() and
 * NULL is returned.
 */
static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
        lock_buffer(bh);

        /* Already valid: no I/O needed. */
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return bh;
        }

        /*
         * Extra reference for the duration of the I/O; presumably dropped
         * by end_buffer_read_sync - confirm.
         */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ, bh);
        wait_on_buffer(bh);

        if (!buffer_uptodate(bh)) {
                brelse(bh);
                return NULL;
        }

        return bh;
}

/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU. */
#define BH_LRU_SIZE        16

/* One CPU's LRU of recently used buffer_heads. */
struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-CPU LRU instances; every slot starts out NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket accesses to this CPU's LRU.  SMP
 * builds disable local interrupts; other builds only disable preemption.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()        local_irq_disable()
#define bh_lru_unlock()        local_irq_enable()
#else
#define bh_lru_lock()        preempt_disable()
#define bh_lru_unlock()        preempt_enable()
#endif

/*
 * Sanity check: BUG if called with interrupts disabled.  The check is
 * compiled out when irqs_disabled is not defined as a macro.
 */
static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
        BUG_ON(irqs_disabled());
#endif
}

/*
 * Install a buffer_head into this cpu's LRU.  If not already in the LRU, it is
 * inserted at the front, and the buffer_head at the back if any is evicted.
 * Or, if already in the LRU it is moved to the front.
 */
static void bh_lru_install(struct buffer_head *bh)
{
        /* Pointer carried through the ripple below; it starts as @bh. */
        struct buffer_head *evictee = bh;
        struct bh_lru *b;
        int i;

        check_irqs_on();
        bh_lru_lock();

        /*
         * the refcount of buffer_head in bh_lru prevents dropping the
         * attached page(i.e., try_to_free_buffers) so it could cause
         * failing page migration.
         * Skip putting upcoming bh into bh_lru until migration is done.
         */
        if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) {
                bh_lru_unlock();
                return;
        }

        b = this_cpu_ptr(&bh_lrus);
        /*
         * Each swap stores the carried pointer in slot i and picks up the
         * slot's previous occupant.  This puts @bh in slot 0 and shifts the
         * existing entries one slot towards the back.
         */
        for (i = 0; i < BH_LRU_SIZE; i++) {
                swap(evictee, b->bhs[i]);
                /*
                 * We picked @bh back up, so it was already in this LRU and
                 * has now moved to the front.  No new reference is taken.
                 */
                if (evictee == bh) {
                        bh_lru_unlock();
                        return;
                }
        }

        /* @bh is new to this LRU: take a reference on the LRU's behalf. */
        get_bh(bh);
        bh_lru_unlock();
        /*
         * Release whatever was pushed out of the last slot.  This is NULL
         * if that slot was empty; presumably brelse() tolerates NULL -
         * confirm.
         */
        brelse(evictee);
}

/*
 * Search the current CPU's LRU for the buffer describing @block of @bdev with
 * buffer size @size.  On a hit the buffer is moved to slot 0 and returned
 * with an extra reference taken; on a miss, or on an isolated CPU, NULL is
 * returned.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *found = NULL;
        unsigned int slot;

        check_irqs_on();
        bh_lru_lock();
        if (cpu_is_isolated(smp_processor_id()))
                goto out;

        for (slot = 0; slot < BH_LRU_SIZE; slot++) {
                struct buffer_head *cand = __this_cpu_read(bh_lrus.bhs[slot]);

                if (!cand)
                        continue;
                if (cand->b_blocknr != block || cand->b_bdev != bdev ||
                    cand->b_size != size)
                        continue;

                if (slot) {
                        /* Slide the entries in front of the hit back by one. */
                        do {
                                __this_cpu_write(bh_lrus.bhs[slot],
                                        __this_cpu_read(bh_lrus.bhs[slot - 1]));
                        } while (--slot);
                        __this_cpu_write(bh_lrus.bhs[0], cand);
                }
                get_bh(cand);
                found = cand;
                break;
        }
out:
        bh_lru_unlock();
        return found;
}

/*
 * Find the buffer for @block of @bdev, trying this CPU's LRU before the
 * pagecache.  An LRU hit is marked accessed with touch_buffer(); a pagecache
 * hit is installed into the LRU (__find_get_block_slow() marks the page
 * accessed itself).  Returns NULL if the buffer is not present.  With
 * @atomic set, NULL may also be returned because the buffer is being
 * migrated; the page is not marked accessed in that case either.
 */
static struct buffer_head *
find_get_block_common(struct block_device *bdev, sector_t block,
                        unsigned size, bool atomic)
{
        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

        if (bh) {
                touch_buffer(bh);
                return bh;
        }

        /* Not in the LRU: fall back to the pagecache and cache any hit. */
        bh = __find_get_block_slow(bdev, block, atomic);
        if (bh)
                bh_lru_install(bh);

        return bh;
}

/*
 * Look up the buffer for @block of @bdev with buffer size @size by calling
 * find_get_block_common() with atomic == true.  Returns that helper's result
 * unchanged.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        return find_get_block_common(bdev, block, size, true);
}
EXPORT_SYMBOL(__find_get_block);

/*
 * Same as __find_get_block() but allows sleeping contexts: it calls
 * find_get_block_common() with atomic == false and returns that helper's
 * result unchanged.
 */
struct buffer_head *
__find_get_block_nonatomic(struct block_device *bdev, sector_t block,
                           unsigned size)
{
        return find_get_block_common(bdev, block, size, false);
}
EXPORT_SYMBOL(__find_get_block_nonatomic);

/**
 * bdev_getblk - Get a buffer_head in a block device's buffer cache.
 * @bdev: The block device.
 * @block: The block number.
 * @size: The size of buffer_heads for this @bdev.
 * @gfp: The memory allocation flags to use.
 *
 * An existing buffer is looked up first, using the sleeping variant of the
 * lookup when @gfp allows blocking and the atomic variant otherwise.  Only
 * if nothing is found is __getblk_slow() asked for the buffer.
 *
 * The returned buffer head has its reference count incremented, but is
 * not locked.  The caller should call brelse() when it has finished
 * with the buffer.  The buffer may not be uptodate.  If needed, the
 * caller can bring it uptodate either by reading it or overwriting it.
 *
 * Return: The buffer head, or NULL if memory could not be allocated.
 */
struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        struct buffer_head *bh;

        bh = gfpflags_allow_blocking(gfp) ?
                __find_get_block_nonatomic(bdev, block, size) :
                __find_get_block(bdev, block, size);

        /* Annotated even on a cache hit, since a miss would allocate. */
        might_alloc(gfp);
        if (!bh)
                bh = __getblk_slow(bdev, block, size, gfp);

        return bh;
}
EXPORT_SYMBOL(bdev_getblk);

/*
 * Do async read-ahead on a buffer.
 *
 * The buffer is obtained with GFP_NOWAIT | __GFP_MOVABLE.  If that yields
 * a buffer, a REQ_RAHEAD read is issued on it and the reference is dropped
 * again; if it does not, nothing is done.
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = bdev_getblk(bdev, block, size,
                        GFP_NOWAIT | __GFP_MOVABLE);

        if (likely(bh)) {
                bh_readahead(bh, REQ_RAHEAD);
                brelse(bh);
        }
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread_gfp() - Read a block.
 * @bdev: The block device to read from.
 * @block: Block number in units of block size.
 * @size: The block size of this device in bytes.
 * @gfp: Not page allocation flags; see below.
 *
 * You are not expected to call this function.  You should use one of
 * sb_bread(), sb_bread_unmovable() or __bread().
 *
 * Read a specified block, and return the buffer head that refers to it.
 * If @gfp is 0, the memory will be allocated using the block device's
 * default GFP flags.  If @gfp is __GFP_MOVABLE, the memory may be
 * allocated from a movable area.  Do not pass in a complete set of
 * GFP flags.
 *
 * The returned buffer head has its refcount increased.  The caller should
 * call brelse() when it has finished with the buffer.
 *
 * Context: May sleep waiting for I/O.
 * Return: NULL if the block was unreadable.
 */
struct buffer_head *__bread_gfp(struct block_device *bdev, sector_t block,
                unsigned size, gfp_t gfp)
{
        struct buffer_head *bh;

        /*
         * Merge in the mapping's own constraint (minus __GFP_FS), and make
         * the allocation __GFP_NOFAIL: prefer looping in the allocator
         * rather than here, at least that code knows what it's doing.
         */
        gfp |= mapping_gfp_constraint(bdev->bd_mapping, ~__GFP_FS);
        gfp |= __GFP_NOFAIL;

        bh = bdev_getblk(bdev, block, size, gfp);

        /* Only a buffer that is not yet uptodate needs the slow read path. */
        if (likely(bh) && !buffer_uptodate(bh))
                return __bread_slow(bh);

        return bh;
}
EXPORT_SYMBOL(__bread_gfp);

/*
 * Release every buffer_head referenced from @b and reset each slot to NULL.
 * Callers provide whatever exclusion the LRU needs.
 */
static void __invalidate_bh_lrus(struct bh_lru *b)
{
        struct buffer_head **slot = b->bhs;
        struct buffer_head **end = slot + BH_LRU_SIZE;

        for (; slot < end; slot++) {
                brelse(*slot);
                *slot = NULL;
        }
}
/*
 * Per-CPU callback used by invalidate_bh_lrus(): empty the LRU of the CPU
 * this runs on.  @arg is unused.
 *
 * invalidate_bh_lrus() is called rarely - but not only at unmount.
 * This doesn't race because it runs in each cpu either in irq
 * or with preempt disabled.
 */
static void invalidate_bh_lru(void *arg)
{
        struct bh_lru *b = &get_cpu_var(bh_lrus);

        __invalidate_bh_lrus(b);
        put_cpu_var(bh_lrus);
}

/*
 * Report whether the LRU belonging to @cpu currently holds at least one
 * buffer_head.  @dummy is unused.
 */
bool has_bh_in_lru(int cpu, void *dummy)
{
        struct bh_lru *lru = per_cpu_ptr(&bh_lrus, cpu);
        int slot = 0;

        while (slot < BH_LRU_SIZE) {
                if (lru->bhs[slot++])
                        return true;
        }

        return false;
}

/*
 * Empty the buffer_head LRUs: hand invalidate_bh_lru() to on_each_cpu_cond()
 * with has_bh_in_lru() as the per-CPU condition, so only CPUs whose LRU holds
 * something are expected to run it.
 * NOTE(review): the trailing 1 is presumably the "wait for completion"
 * argument - confirm against on_each_cpu_cond().
 */
void invalidate_bh_lrus(void)
{
        on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);

/*
 * It's called from workqueue context so we need a bh_lru_lock to close
 * the race with preemption/irq.
 */
void invalidate_bh_lrus_cpu(void)
{
        struct bh_lru *b;

        bh_lru_lock();
        b = this_cpu_ptr(&bh_lrus);
        __invalidate_bh_lrus(b);
        bh_lru_unlock();
}

/*
 * Attach @bh to @folio and set its data pointer for the bytes starting
 * @offset into the folio.  @offset must lie inside the folio (BUG otherwise).
 */
void folio_set_bh(struct buffer_head *bh, struct folio *folio,
                  unsigned long offset)
{
        bh->b_folio = folio;
        BUG_ON(offset >= folio_size(folio));

        if (!folio_test_highmem(folio)) {
                bh->b_data = folio_address(folio) + offset;
                return;
        }

        /*
         * Highmem folio: store the bare offset rather than an address.
         * This catches illegal uses and preserves the offset.
         */
        bh->b_data = (char *)(0 + offset);
}
EXPORT_SYMBOL(folio_set_bh);

/*
 * Called when truncating a buffer on a page completely.
 */

/* Bits that are cleared during an invalidate */
#define BUFFER_FLAGS_DISCARD \
        (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
         1 << BH_Delay | 1 << BH_Unwritten)

/*
 * Invalidate one buffer: with the buffer locked, clear its dirty bit, forget
 * its block device and drop every BUFFER_FLAGS_DISCARD bit from b_state.
 */
static void discard_buffer(struct buffer_head * bh)
{
        unsigned long b_state;

        lock_buffer(bh);
        clear_buffer_dirty(bh);
        bh->b_bdev = NULL;
        b_state = READ_ONCE(bh->b_state);
        /*
         * Clear the flag bits with a compare-and-exchange, retrying until it
         * succeeds so that concurrent updates to other b_state bits are not
         * lost.  The loop body is empty; try_cmpxchg() is expected to refresh
         * b_state with the current value whenever it fails.
         */
        do {
        } while (!try_cmpxchg(&bh->b_state, &b_state,
                              b_state & ~BUFFER_FLAGS_DISCARD));
        unlock_buffer(bh);
}

/**
 * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
 * @folio: The folio which is affected.
 * @offset: start of the range to invalidate
 * @length: length of the range to invalidate
 *
 * block_invalidate_folio() is called when all or part of the folio has been
 * invalidated by a truncate operation.  The folio must be locked and the
 * range must lie inside it.
 *
 * block_invalidate_folio() does not have to release all buffers, but it must
 * ensure that no dirty buffer is left outside @offset and that no I/O
 * is underway against any of the blocks which are outside the truncation
 * point.  Because the caller is about to free (and possibly reuse) those
 * blocks on-disk.
 */
void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
{
        struct buffer_head *head, *bh;
        size_t pos = 0;
        size_t stop = length + offset;

        BUG_ON(!folio_test_locked(folio));

        /* The range must fit in the folio and must not have wrapped. */
        BUG_ON(stop > folio_size(folio) || stop < length);

        head = folio_buffers(folio);
        if (!head)
                return;

        bh = head;
        do {
                struct buffer_head *next = bh->b_this_page;
                size_t bh_end = pos + bh->b_size;

                /* Stop at the first buffer that reaches past the range. */
                if (bh_end > stop)
                        goto out;

                /* Discard only buffers that lie entirely inside the range. */
                if (pos >= offset)
                        discard_buffer(bh);

                pos = bh_end;
                bh = next;
        } while (bh != head);

        /*
         * We release buffers only if the entire folio is being invalidated.
         * The get_block cached value has been unconditionally invalidated,
         * so real IO is not possible anymore.
         */
        if (length == folio_size(folio))
                filemap_release_folio(folio, 0);
out:
        folio_clear_mappedtodisk(folio);
}
EXPORT_SYMBOL(block_invalidate_folio);

/*
 * Allocate a ring of buffer_heads of @blocksize for @folio, OR @b_state into
 * each buffer's state, and attach the ring to the folio.  Returns the head
 * of the ring.
 *
 * We attach and possibly dirty the buffers atomically wrt
 * block_dirty_folio() via i_private_lock.  try_to_free_buffers
 * is already excluded via the folio lock.
 */
struct buffer_head *create_empty_buffers(struct folio *folio,
                unsigned long blocksize, unsigned long b_state)
{
        gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL;
        struct buffer_head *head, *tail, *bh;

        head = folio_alloc_buffers(folio, blocksize, gfp);

        /*
         * Walk the NULL-terminated chain, applying the initial state to
         * every buffer, then close it into a ring.
         */
        tail = head;
        tail->b_state |= b_state;
        while (tail->b_this_page) {
                tail = tail->b_this_page;
                tail->b_state |= b_state;
        }
        tail->b_this_page = head;

        spin_lock(&folio->mapping->i_private_lock);
        if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
                /* Propagate the folio's dirty/uptodate state to the buffers. */
                bh = head;
                do {
                        if (folio_test_dirty(folio))
                                set_buffer_dirty(bh);
                        if (folio_test_uptodate(folio))
                                set_buffer_uptodate(bh);
                } while ((bh = bh->b_this_page) != head);
        }
        folio_attach_private(folio, head);
        spin_unlock(&folio->mapping->i_private_lock);

        return head;
}
EXPORT_SYMBOL(create_empty_buffers);

/**
 * clean_bdev_aliases: clean a range of buffers in block device
 * @bdev: Block device to clean buffers in
 * @block: Start of a range of blocks to clean
 * @len: Number of blocks to clean
 *
 * We are taking a range of blocks for data and we don't want writeback of any
 * buffer-cache aliases starting from return from this function and until the
 * moment when something will explicitly mark the buffer dirty (hopefully that
 * will not happen until we will free that block ;-) We don't even need to mark
 * it not-uptodate - nobody can expect anything from a newly allocated buffer
 * anyway. We used to use unmap_buffer() for such invalidation, but that was
 * wrong. We definitely don't want to mark the alias unmapped, for example - it
 * would confuse anyone who might pick it with bread() afterwards...
 *
 * Also..  Note that bforget() doesn't lock the buffer.  So there can be
 * writeout I/O going on against recently-freed buffers.  We don't wait on that
 * I/O in bforget() - it's more efficient to wait on the I/O only if we really
 * need to.  That happens here.
 */
void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
{
        struct address_space *bd_mapping = bdev->bd_mapping;
        const int blkbits = bd_mapping->host->i_blkbits;
        pgoff_t index = ((loff_t)block << blkbits) / PAGE_SIZE;
        pgoff_t end = ((loff_t)(block + len - 1) << blkbits) / PAGE_SIZE;
        struct folio_batch fbatch;
        int i, count;

        folio_batch_init(&fbatch);
        while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
                count = folio_batch_count(&fbatch);
                for (i = 0; i < count; i++) {
                        struct folio *folio = fbatch.folios[i];
                        struct buffer_head *head, *bh;

                        if (!folio_buffers(folio))
                                continue;
                        /*
                         * We use folio lock instead of
                         * bd_mapping->i_private_lock to pin buffers here
                         * since we can afford to sleep and it scales better
                         * than a global spinlock.
                         */
                        folio_lock(folio);
                        /* Recheck when the folio is locked which pins bhs */
                        head = folio_buffers(folio);
                        if (head) {
                                bh = head;
                                do {
                                        if (buffer_mapped(bh) &&
                                            bh->b_blocknr >= block) {
                                                /* Past the range: done here. */
                                                if (bh->b_blocknr >= block + len)
                                                        break;
                                                clear_buffer_dirty(bh);
                                                wait_on_buffer(bh);
                                                clear_buffer_req(bh);
                                        }
                                        bh = bh->b_this_page;
                                } while (bh != head);
                        }
                        folio_unlock(folio);
                }
                folio_batch_release(&fbatch);
                cond_resched();
                /* End of range already reached? */
                if (index > end || !index)
                        break;
        }
}
EXPORT_SYMBOL(clean_bdev_aliases);

/*
 * Return the buffer ring of the locked @folio, creating one first if the
 * folio has none.  New buffers use @inode's block size (1 << i_blkbits) and
 * start out with @b_state set.
 */
static struct buffer_head *folio_create_buffers(struct folio *folio,
                                                struct inode *inode,
                                                unsigned int b_state)
{
        struct buffer_head *head;

        BUG_ON(!folio_test_locked(folio));

        head = folio_buffers(folio);
        if (head)
                return head;

        return create_empty_buffers(folio,
                        1 << READ_ONCE(inode->i_blkbits), b_state);
}

/*
 * NOTE! All mapped/uptodate combinations are valid:
 *
 *        Mapped        Uptodate        Meaning
 *
 *        No        No                "unknown" - must do get_block()
 *        No        Yes                "hole" - zero-filled
 *        Yes        No                "allocated" - allocated on disk, not read in
 *        Yes        Yes                "valid" - allocated and up-to-date in memory.
 *
 * "Dirty" is valid only with the last case (mapped+uptodate).
 */

/*
 * Write out the dirty buffers of @folio, mapping them with @get_block first
 * where needed.  Buffers are created (dirty and uptodate) if the folio has
 * none.  The folio is unlocked before returning.
 *
 * Returns 0 on success or the error returned by @get_block.  On such an
 * error the buffers that are already mapped and dirty are still submitted
 * (see the recover: path) and the error is recorded on the mapping.
 *
 * While block_write_full_folio is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_folio() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_folio() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 */
int __block_write_full_folio(struct inode *inode, struct folio *folio,
                        get_block_t *get_block, struct writeback_control *wbc)
{
        int err;
        sector_t block;
        sector_t last_block;
        struct buffer_head *bh, *head;
        size_t blocksize;
        int nr_underway = 0;
        blk_opf_t write_flags = wbc_to_write_flags(wbc);

        head = folio_create_buffers(folio, inode,
                                    (1 << BH_Dirty) | (1 << BH_Uptodate));

        /*
         * Be very careful.  We have no exclusion from block_dirty_folio
         * here, and the (potentially unmapped) buffers may become dirty at
         * any time.  If a buffer becomes dirty here after we've inspected it
         * then we just miss that fact, and the folio stays dirty.
         *
         * Buffers outside i_size may be dirtied by block_dirty_folio;
         * handle that here by just cleaning them.
         */

        bh = head;
        blocksize = bh->b_size;

        /* First block covered by this folio and last block inside i_size. */
        block = div_u64(folio_pos(folio), blocksize);
        last_block = div_u64(i_size_read(inode) - 1, blocksize);

        /*
         * Get all the dirty buffers mapped to disk addresses and
         * handle any aliases from the underlying blockdev's mapping.
         */
        do {
                if (block > last_block) {
                        /*
                         * mapped buffers outside i_size will occur, because
                         * this folio can be outside i_size when there is a
                         * truncate in progress.
                         */
                        /*
                         * The buffer was zeroed by block_write_full_folio()
                         */
                        clear_buffer_dirty(bh);
                        set_buffer_uptodate(bh);
                } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
                           buffer_dirty(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        err = get_block(inode, block, bh, 1);
                        if (err)
                                goto recover;
                        clear_buffer_delay(bh);
                        if (buffer_new(bh)) {
                                /* blockdev mappings never come here */
                                clear_buffer_new(bh);
                                clean_bdev_bh_alias(bh);
                        }
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);

        /*
         * The loop above leaves bh == head.  Second pass: lock each mapped
         * buffer and tag the dirty ones for asynchronous write.  Note that
         * "continue" in a do-while jumps to the loop condition, so it still
         * advances to the next buffer.
         */
        do {
                if (!buffer_mapped(bh))
                        continue;
                /*
                 * If it's a fully non-blocking write attempt and we cannot
                 * lock the buffer then redirty the folio.  Note that this can
                 * potentially cause a busy-wait loop from writeback threads
                 * and kswapd activity, but those code paths have their own
                 * higher-level throttling.
                 */
                if (wbc->sync_mode != WB_SYNC_NONE) {
                        lock_buffer(bh);
                } else if (!trylock_buffer(bh)) {
                        folio_redirty_for_writepage(wbc, folio);
                        continue;
                }
                if (test_clear_buffer_dirty(bh)) {
                        mark_buffer_async_write_endio(bh,
                                end_buffer_async_write);
                } else {
                        /* Not dirty after all: nothing to write for it. */
                        unlock_buffer(bh);
                }
        } while ((bh = bh->b_this_page) != head);

        /*
         * The folio and its buffers are protected by the writeback flag,
         * so we can drop the bh refcounts early.
         */
        BUG_ON(folio_test_writeback(folio));
        folio_start_writeback(folio);

        /*
         * Third pass: submit every buffer tagged above.  The next pointer is
         * read before submitting the current buffer.
         */
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
                                      inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        folio_unlock(folio);

        err = 0;
done:
        if (nr_underway == 0) {
                /*
                 * The folio was marked dirty, but the buffers were
                 * clean.  Someone wrote them back by hand with
                 * write_dirty_buffer/submit_bh.  A rare case.
                 */
                folio_end_writeback(folio);

                /*
                 * The folio and buffer_heads can be released at any time from
                 * here on.
                 */
        }
        return err;

recover:
        /*
         * ENOSPC, or some other error.  We may already have added some
         * blocks to the file, so we need to write these out to avoid
         * exposing stale data.
         * The folio is currently locked and not marked for writeback
         */
        bh = head;
        /* Recovery: lock and submit the mapped buffers */
        do {
                if (buffer_mapped(bh) && buffer_dirty(bh) &&
                    !buffer_delay(bh)) {
                        lock_buffer(bh);
                        mark_buffer_async_write_endio(bh,
                                end_buffer_async_write);
                } else {
                        /*
                         * The buffer may have been set dirty during
                         * attachment to a dirty folio.
                         */
                        clear_buffer_dirty(bh);
                }
        } while ((bh = bh->b_this_page) != head);
        BUG_ON(folio_test_writeback(folio));
        /* Record the get_block() failure on the mapping before writing. */
        mapping_set_error(folio->mapping, err);
        folio_start_writeback(folio);
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
                        submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
                                      inode->i_write_hint, wbc);
                        nr_underway++;
                }
                bh = next;
        } while (bh != head);
        folio_unlock(folio);
        /* err still holds the get_block() error; done: returns it. */
        goto done;
}
EXPORT_SYMBOL(__block_write_full_folio);

/*
 * For every new buffer of the locked @folio that overlaps [@from, @to):
 * if the folio is not uptodate, zero the overlapping bytes and mark the
 * buffer uptodate; then clear the new bit and mark the buffer dirty so it
 * will be written out (in order to prevent uninitialised block data from
 * leaking).
 */
void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to)
{
        struct buffer_head *head, *bh;
        size_t block_start = 0;
        size_t block_end;

        BUG_ON(!folio_test_locked(folio));
        head = folio_buffers(folio);
        if (!head)
                return;

        bh = head;
        do {
                block_end = block_start + bh->b_size;

                if (buffer_new(bh) && block_end > from && block_start < to) {
                        if (!folio_test_uptodate(folio)) {
                                /* Zero only the part inside [from, to). */
                                folio_zero_segment(folio,
                                                max(from, block_start),
                                                min(to, block_end));
                                set_buffer_uptodate(bh);
                        }

                        clear_buffer_new(bh);
                        mark_buffer_dirty(bh);
                }

                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);
}
EXPORT_SYMBOL(folio_zero_new_buffers);

/*
 * Translate @iomap into buffer state for file block @block of @inode.
 * Sets @bh's block device and, depending on the extent type, its new,
 * uptodate, mapped, delay and unwritten bits and its block number.
 *
 * Returns 0 on success, or -EIO when the extent ends before the block,
 * when a new/beyond-EOF mapped block is seen on a block device inode, or
 * when the extent type is not recognised.
 */
static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
                const struct iomap *iomap)
{
        const loff_t offset = (loff_t)block << inode->i_blkbits;

        bh->b_bdev = iomap->bdev;

        /*
         * Block points to offset in file we need to map, iomap contains
         * the offset at which the map starts. If the map ends before the
         * current block, then do not map the buffer and let the caller
         * handle it.
         */
        if (offset >= iomap->offset + iomap->length)
                return -EIO;

        switch (iomap->type) {
        case IOMAP_HOLE:
        case IOMAP_DELALLOC:
                /*
                 * If the buffer is not up to date or beyond the current EOF,
                 * we need to mark it as new to ensure sub-block zeroing is
                 * executed if necessary.
                 */
                if (!buffer_uptodate(bh) || offset >= i_size_read(inode))
                        set_buffer_new(bh);
                if (iomap->type == IOMAP_DELALLOC) {
                        set_buffer_uptodate(bh);
                        set_buffer_mapped(bh);
                        set_buffer_delay(bh);
                }
                return 0;
        case IOMAP_UNWRITTEN:
        case IOMAP_MAPPED:
                /*
                 * For unwritten regions, we always need to ensure that regions
                 * in the block we are not writing to are zeroed. Mark the
                 * buffer as new to ensure this.
                 */
                if (iomap->type == IOMAP_UNWRITTEN) {
                        set_buffer_new(bh);
                        set_buffer_unwritten(bh);
                }
                if ((iomap->flags & IOMAP_F_NEW) ||
                    offset >= i_size_read(inode)) {
                        /*
                         * This can happen if truncating the block device races
                         * with the check in the caller, as i_size updates on
                         * block devices aren't synchronized by i_rwsem.
                         */
                        if (S_ISBLK(inode->i_mode))
                                return -EIO;
                        set_buffer_new(bh);
                }
                bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
                                inode->i_blkbits;
                set_buffer_mapped(bh);
                return 0;
        default:
                WARN_ON_ONCE(1);
                return -EIO;
        }
}

/*
 * Prepare the buffers of the locked @folio for a write of @len bytes at file
 * position @pos.  Buffers are created if the folio has none.  Each unmapped
 * buffer that overlaps the write is mapped through @get_block if that is
 * non-NULL, otherwise through iomap_to_bh() using @iomap.  Blocks that are
 * only partly covered by the write and are not uptodate are read in.
 *
 * Returns 0 on success, the error from the mapping call, or -EIO if one of
 * the reads did not leave its buffer uptodate.  On any error
 * folio_zero_new_buffers() is run over the write range.
 */
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
                get_block_t *get_block, const struct iomap *iomap)
{
        size_t from = offset_in_folio(folio, pos);
        size_t to = from + len;
        struct inode *inode = folio->mapping->host;
        size_t block_start, block_end;
        sector_t block;
        int err = 0;
        size_t blocksize;
        /*
         * wait[] collects the buffers a read was issued for.  Reads are only
         * issued for blocks that overlap the write but stick out of it on
         * either side, i.e. the block holding @from and the block holding
         * @to, so two entries are enough.
         */
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

        BUG_ON(!folio_test_locked(folio));
        BUG_ON(to > folio_size(folio));
        BUG_ON(from > to);

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;
        block = div_u64(folio_pos(folio), blocksize);

        /*
         * Walk the buffer ring once.  "!block_start" lets the first
         * iteration run although bh == head; the walk ends when bh wraps
         * around to head again.
         */
        for (bh = head, block_start = 0; bh != head || !block_start;
            block++, block_start=block_end, bh = bh->b_this_page) {
                block_end = block_start + blocksize;
                /* Block lies entirely outside the range being written. */
                if (block_end <= from || block_start >= to) {
                        if (folio_test_uptodate(folio)) {
                                if (!buffer_uptodate(bh))
                                        set_buffer_uptodate(bh);
                        }
                        continue;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);
                if (!buffer_mapped(bh)) {
                        WARN_ON(bh->b_size != blocksize);
                        if (get_block)
                                err = get_block(inode, block, bh, 1);
                        else
                                err = iomap_to_bh(inode, block, bh, iomap);
                        if (err)
                                break;

                        if (buffer_new(bh)) {
                                clean_bdev_bh_alias(bh);
                                if (folio_test_uptodate(folio)) {
                                        clear_buffer_new(bh);
                                        set_buffer_uptodate(bh);
                                        mark_buffer_dirty(bh);
                                        continue;
                                }
                                /*
                                 * New block only partly covered by the write:
                                 * zero the parts outside [from, to) -
                                 * presumably [to, block_end) and
                                 * [block_start, from); confirm against
                                 * folio_zero_segments().
                                 */
                                if (block_end > to || block_start < from)
                                        folio_zero_segments(folio,
                                                to, block_end,
                                                block_start, from);
                                continue;
                        }
                }
                if (folio_test_uptodate(folio)) {
                        if (!buffer_uptodate(bh))
                                set_buffer_uptodate(bh);
                        continue; 
                }
                /*
                 * Partially overwritten block whose contents are not in
                 * memory yet: start a read and remember to wait for it.
                 */
                if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
                    !buffer_unwritten(bh) &&
                     (block_start < from || block_end > to)) {
                        bh_read_nowait(bh, 0);
                        *wait_bh++=bh;
                }
        }
        /*
         * If we issued read requests - let them complete.
         */
        while(wait_bh > wait) {
                wait_on_buffer(*--wait_bh);
                if (!buffer_uptodate(*wait_bh))
                        err = -EIO;
        }
        if (unlikely(err))
                folio_zero_new_buffers(folio, from, to);
        return err;
}

/*
 * Variant of __block_write_begin_int() that maps blocks through @get_block:
 * all arguments are forwarded and NULL is passed for the iomap.  Returns
 * whatever __block_write_begin_int() returns.
 */
int __block_write_begin(struct folio *folio, loff_t pos, unsigned len,
                get_block_t *get_block)
{
        return __block_write_begin_int(folio, pos, len, get_block, NULL);
}
EXPORT_SYMBOL(__block_write_begin);

/*
 * Commit a write to bytes [@from, @to) of @folio: every buffer overlapping
 * the range is marked uptodate and dirty, and the new bit is cleared on all
 * buffers.  Does nothing if the folio has no buffers.
 */
void block_commit_write(struct folio *folio, size_t from, size_t to)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *bh;
        size_t block_start = 0;
        size_t block_end;
        unsigned blocksize;
        bool all_uptodate = true;

        if (!head)
                return;
        blocksize = head->b_size;

        bh = head;
        do {
                block_end = block_start + blocksize;
                if (block_end > from && block_start < to) {
                        set_buffer_uptodate(bh);
                        mark_buffer_dirty(bh);
                } else if (!buffer_uptodate(bh)) {
                        /* An untouched buffer that still lacks valid data. */
                        all_uptodate = false;
                }
                if (buffer_new(bh))
                        clear_buffer_new(bh);

                block_start = block_end;
        } while ((bh = bh->b_this_page) != head);

        /*
         * If this is a partial write which happened to make all buffers
         * uptodate then we can optimize away a bogus read_folio() for
         * the next read(). Here we 'discover' whether the folio went
         * uptodate as a result of this (potentially partial) write.
         */
        if (all_uptodate)
                folio_mark_uptodate(folio);
}
EXPORT_SYMBOL(block_commit_write);

/*
 * block_write_begin takes care of the basic task of block allocation and
 * bringing partial write blocks uptodate first.
 *
 * The filesystem needs to handle block truncation upon failure.
 */
int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
                struct folio **foliop, get_block_t *get_block)
{
        pgoff_t index = pos >> PAGE_SHIFT;
        struct folio *folio;
        int status;

        /* Look up the folio for this page index with FGP_WRITEBEGIN. */
        folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        status = __block_write_begin_int(folio, pos, len, get_block, NULL);
        if (unlikely(status)) {
                /* Preparation failed: release the folio, hand back NULL. */
                folio_unlock(folio);
                folio_put(folio);
                *foliop = NULL;
                return status;
        }

        /* Success: the folio is handed to the caller still locked. */
        *foliop = folio;
        return status;
}
EXPORT_SYMBOL(block_write_begin);

/*
 * Finish a buffered write of @copied bytes (out of the @len requested) at
 * file position @pos inside @folio.  Returns the number of bytes actually
 * committed, which may be less than @copied (see below).
 */
int block_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio, void *fsdata)
{
        size_t offset = pos - folio_pos(folio);
        bool short_write = copied < len;

        if (unlikely(short_write)) {
                /*
                 * Fully written buffers become uptodate, so a later
                 * read_folio cannot clobber them.  A buffer that was only
                 * partly filled by a short write stays !uptodate, though,
                 * and a read_folio could then overwrite the partial data.
                 *
                 * Keep it simple: a short write into a folio that is not
                 * uptodate counts as zero bytes written, and the caller
                 * has to retry the whole thing.
                 */
                if (!folio_test_uptodate(folio))
                        copied = 0;

                folio_zero_new_buffers(folio, offset + copied, offset + len);
        }
        flush_dcache_folio(folio);

        /* The committed range may be short, or even empty. */
        block_commit_write(folio, offset, offset + copied);

        return copied;
}
EXPORT_SYMBOL(block_write_end);

/*
 * Complete a buffered write: commit the buffers, extend i_size if the write
 * went past it, release the folio and dirty the inode when the size changed.
 * Returns the number of bytes committed by block_write_end().
 */
int generic_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio, void *fsdata)
{
        struct inode *inode = mapping->host;
        loff_t old_size = inode->i_size;
        bool grew;

        copied = block_write_end(file, mapping, pos, len, copied, folio, fsdata);

        /*
         * i_size is read directly rather than through i_size_read(): it
         * cannot change underneath us while i_rwsem is held.
         *
         * The update must happen before the folio lock is dropped,
         * otherwise page writeout could come in and zero beyond i_size.
         */
        grew = pos + copied > inode->i_size;
        if (grew)
                i_size_write(inode, pos + copied);

        folio_unlock(folio);
        folio_put(folio);

        if (old_size < pos)
                pagecache_isize_extended(inode, old_size, pos);

        /*
         * The inode is deliberately dirtied only after the page lock is
         * gone: doing it earlier would lengthen the lock hold time for no
         * benefit, and would force a page-lock -> transaction-start lock
         * ordering on journaling filesystems.
         */
        if (grew)
                mark_inode_dirty(inode);
        return copied;
}
EXPORT_SYMBOL(generic_write_end);

/*
 * block_is_partially_uptodate checks whether buffers within a folio are
 * uptodate or not.
 *
 * Returns true if all buffers which correspond to the specified part
 * of the folio are uptodate.
 */
bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *bh;
        unsigned block_start, block_end, blocksize;
        unsigned to;

        /* No buffers attached: nothing can be vouched for. */
        if (!head)
                return false;

        blocksize = head->b_size;

        /* Clamp the requested range to the end of the folio. */
        to = min_t(unsigned, folio_size(folio) - from, count);
        to = from + to;

        /* A range touching both the first and the last block is refused. */
        if (from < blocksize && to > folio_size(folio) - blocksize)
                return false;

        block_start = 0;
        bh = head;
        do {
                block_end = block_start + blocksize;
                if (block_end > from && block_start < to) {
                        /* First stale buffer inside the range decides. */
                        if (!buffer_uptodate(bh))
                                return false;
                        /* That was the last buffer overlapping the range. */
                        if (block_end >= to)
                                return true;
                }
                block_start = block_end;
                bh = bh->b_this_page;
        } while (bh != head);

        return true;
}
EXPORT_SYMBOL(block_is_partially_uptodate);

/*
 * Generic "read_folio" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the folio asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * folio once IO has completed.
 */
int block_read_full_folio(struct folio *folio, get_block_t *get_block)
{
        struct inode *inode = folio->mapping->host;
        sector_t iblock, lblock;
        /* prev: last buffer prepared for read whose submission is deferred */
        struct buffer_head *bh, *head, *prev = NULL;
        size_t blocksize;
        int fully_mapped = 1;
        bool page_error = false;
        loff_t limit = i_size_read(inode);

        /* This is needed for ext4. */
        if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
                limit = inode->i_sb->s_maxbytes;

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;

        /*
         * iblock: file block index of the folio's first buffer.
         * lblock: number of blocks needed to cover 'limit' bytes, rounded
         * up; blocks at or beyond it are never passed to get_block().
         */
        iblock = div_u64(folio_pos(folio), blocksize);
        lblock = div_u64(limit + blocksize - 1, blocksize);
        bh = head;

        /*
         * Every 'continue' below jumps to the loop condition, so iblock
         * and bh still advance for buffers that are skipped.
         */
        do {
                if (buffer_uptodate(bh))
                        continue;

                if (!buffer_mapped(bh)) {
                        int err = 0;

                        /* At least one buffer was not mapped on entry. */
                        fully_mapped = 0;
                        if (iblock < lblock) {
                                WARN_ON(bh->b_size != blocksize);
                                /* create == 0: lookup only */
                                err = get_block(inode, iblock, bh, 0);
                                if (err)
                                        page_error = true;
                        }
                        if (!buffer_mapped(bh)) {
                                /*
                                 * Still unmapped (hole, beyond the limit,
                                 * or mapping error): hand back zeroes.  The
                                 * buffer is only marked uptodate when
                                 * get_block() did not fail.
                                 */
                                folio_zero_range(folio, bh_offset(bh),
                                                blocksize);
                                if (!err)
                                        set_buffer_uptodate(bh);
                                continue;
                        }
                        /*
                         * get_block() might have updated the buffer
                         * synchronously
                         */
                        if (buffer_uptodate(bh))
                                continue;
                }

                lock_buffer(bh);
                /* Recheck under the buffer lock before queueing a read. */
                if (buffer_uptodate(bh)) {
                        unlock_buffer(bh);
                        continue;
                }

                mark_buffer_async_read(bh);
                /*
                 * Submission lags one buffer behind preparation: the
                 * previously prepared buffer is submitted now, and the
                 * last one is submitted after the loop.
                 * NOTE(review): presumably this keeps I/O completion from
                 * finishing the folio while later buffers are still being
                 * set up - confirm against end_buffer_async_read().
                 */
                if (prev)
                        submit_bh(REQ_OP_READ, prev);
                prev = bh;
        } while (iblock++, (bh = bh->b_this_page) != head);

        if (fully_mapped)
                folio_set_mappedtodisk(folio);

        /*
         * All buffers are uptodate or get_block() returned an error
         * when trying to map them - we must finish the read because
         * end_buffer_async_read() will never be called on any buffer
         * in this folio.
         */
        if (prev)
                submit_bh(REQ_OP_READ, prev);
        else
                folio_end_read(folio, !page_error);

        /* Always 0: mapping errors are reported via folio_end_read() above. */
        return 0;
}
EXPORT_SYMBOL(block_read_full_folio);

/* utility function for filesystems that need to do work on expanding
 * truncates.  Uses filesystem pagecache writes to allow the filesystem to
 * deal with the hole.  
 */
int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
        struct address_space *mapping = inode->i_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        struct folio *folio;
        void *fsdata = NULL;
        int err;

        /* Refuse sizes the inode is not allowed to grow to. */
        err = inode_newsize_ok(inode, size);
        if (err)
                return err;

        /*
         * Issue a zero-length write at the new size through the
         * filesystem's own write_begin/write_end pair.
         */
        err = aops->write_begin(NULL, mapping, size, 0, &folio, &fsdata);
        if (err)
                return err;

        err = aops->write_end(NULL, mapping, size, 0, 0, folio, fsdata);
        /* Zero bytes were copied, so a positive count would be a bug. */
        BUG_ON(err > 0);

        return err;
}
EXPORT_SYMBOL(generic_cont_expand_simple);

/*
 * Zero-fill the file from *bytes up to @pos by issuing pagecache writes
 * through the mapping's write_begin/write_end operations.
 *
 * Whole pages below the page containing @pos are filled first, one page per
 * iteration; then, if *bytes lies in the same page as @pos, the gap up to
 * @pos inside that page is filled.
 *
 * Returns 0 on success, the negative error returned by write_begin or
 * write_end, or -EINTR if a fatal signal is pending between pages.
 */
static int cont_expand_zero(struct file *file, struct address_space *mapping,
                            loff_t pos, loff_t *bytes)
{
        struct inode *inode = mapping->host;
        const struct address_space_operations *aops = mapping->a_ops;
        unsigned int blocksize = i_blocksize(inode);
        struct folio *folio;
        void *fsdata = NULL;
        pgoff_t index, curidx;
        loff_t curpos;
        unsigned zerofrom, offset, len;
        int err = 0;

        /* Page index and in-page offset of the target position. */
        index = pos >> PAGE_SHIFT;
        offset = pos & ~PAGE_MASK;

        /*
         * The loop condition reloads curpos/curidx from *bytes on every
         * pass.  NOTE(review): nothing in this loop advances *bytes when
         * curpos is already block aligned, so progress presumably relies
         * on the write_begin/write_end callbacks updating *bytes - confirm
         * against the callers.
         */
        while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
                zerofrom = curpos & ~PAGE_MASK;
                /* Not block aligned: round *bytes up to the next block. */
                if (zerofrom & (blocksize-1)) {
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                /* Zero from curpos through the end of this page. */
                len = PAGE_SIZE - zerofrom;

                err = aops->write_begin(file, mapping, curpos, len,
                                            &folio, &fsdata);
                if (err)
                        goto out;
                folio_zero_range(folio, offset_in_folio(folio, curpos), len);
                err = aops->write_end(file, mapping, curpos, len, len,
                                                folio, fsdata);
                if (err < 0)
                        goto out;
                /* A short write_end is treated as a fatal inconsistency. */
                BUG_ON(err != len);
                err = 0;

                balance_dirty_pages_ratelimited(mapping);

                if (fatal_signal_pending(current)) {
                        err = -EINTR;
                        goto out;
                }
        }

        /* page covers the boundary, find the boundary offset */
        if (index == curidx) {
                zerofrom = curpos & ~PAGE_MASK;
                /* if we will expand the thing last block will be filled */
                if (offset <= zerofrom) {
                        goto out;
                }
                /* Not block aligned: round *bytes up to the next block. */
                if (zerofrom & (blocksize-1)) {
                        *bytes |= (blocksize-1);
                        (*bytes)++;
                }
                /* Zero only the gap between curpos and pos in this page. */
                len = offset - zerofrom;

                err = aops->write_begin(file, mapping, curpos, len,
                                            &folio, &fsdata);
                if (err)
                        goto out;
                folio_zero_range(folio, offset_in_folio(folio, curpos), len);
                err = aops->write_end(file, mapping, curpos, len, len,
                                                folio, fsdata);
                if (err < 0)
                        goto out;
                BUG_ON(err != len);
                err = 0;
        }
out:
        return err;
}

/*
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
int cont_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct folio **foliop, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
{
        struct inode *inode = mapping->host;
        unsigned int blocksize = i_blocksize(inode);
        unsigned int zerofrom;
        int err;

        /* Zero-fill everything between *bytes and the write position. */
        err = cont_expand_zero(file, mapping, pos, bytes);
        if (err)
                return err;

        /*
         * If *bytes sits in the middle of a block and this write reaches
         * past it, round *bytes up to the next block boundary.
         */
        zerofrom = *bytes & ~PAGE_MASK;
        if ((zerofrom & (blocksize - 1)) && pos + len > *bytes)
                *bytes = (*bytes | (blocksize - 1)) + 1;

        return block_write_begin(mapping, pos, len, foliop, get_block);
}
EXPORT_SYMBOL(cont_write_begin);

/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_rwsem here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_pagefault() - sb_end_pagefault() functions.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                         get_block_t get_block)
{
        struct folio *folio = page_folio(vmf->page);
        struct inode *inode = file_inode(vma->vm_file);
        unsigned long end;
        loff_t size;
        int ret;

        folio_lock(folio);
        size = i_size_read(inode);

        /* We overload EFAULT to mean page got truncated */
        if (folio->mapping != inode->i_mapping || folio_pos(folio) >= size) {
                folio_unlock(folio);
                return -EFAULT;
        }

        /*
         * The folio is wholly or partially inside EOF; prepare only the
         * part that lies below i_size.
         */
        end = folio_size(folio);
        if (folio_pos(folio) + end > size)
                end = size - folio_pos(folio);

        ret = __block_write_begin_int(folio, 0, end, get_block, NULL);
        if (unlikely(ret)) {
                folio_unlock(folio);
                return ret;
        }

        block_commit_write(folio, 0, end);

        /* On success the folio is returned dirty and still locked. */
        folio_mark_dirty(folio);
        folio_wait_stable(folio);
        return 0;
}
EXPORT_SYMBOL(block_page_mkwrite);

/*
 * Zero the part of the block containing byte @from that lies at or after
 * @from, i.e. from @from up to the end of that block, and mark the buffer
 * dirty.  Nothing is done when @from is block aligned or when the block
 * turns out to be unmapped (a hole).
 *
 * Returns 0 on success (including the nothing-to-do cases), or a negative
 * error from the folio lookup, get_block() or the buffer read.
 */
int block_truncate_page(struct address_space *mapping,
                        loff_t from, get_block_t *get_block)
{
        pgoff_t index = from >> PAGE_SHIFT;
        unsigned blocksize;
        sector_t iblock;
        size_t offset, length, pos;
        struct inode *inode = mapping->host;
        struct folio *folio;
        struct buffer_head *bh;
        int err = 0;

        blocksize = i_blocksize(inode);
        /* Offset of 'from' inside its block. */
        length = from & (blocksize - 1);

        /* Block boundary? Nothing to do */
        if (!length)
                return 0;

        /* Number of bytes from 'from' to the end of its block. */
        length = blocksize - length;
        /* File block number of the first block in page 'index'. */
        iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;

        folio = filemap_grab_folio(mapping, index);
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        bh = folio_buffers(folio);
        if (!bh)
                bh = create_empty_buffers(folio, blocksize, 0);

        /* Find the buffer that contains "offset" */
        /*
         * NOTE(review): iblock starts at the first block of page 'index'
         * while 'offset' is measured from the start of the folio; the two
         * agree only if the folio begins at page 'index' - confirm that
         * callers never get a larger folio here.
         */
        offset = offset_in_folio(folio, from);
        pos = blocksize;
        while (offset >= pos) {
                bh = bh->b_this_page;
                iblock++;
                pos += blocksize;
        }

        if (!buffer_mapped(bh)) {
                WARN_ON(bh->b_size != blocksize);
                /* create == 0: look the block up, do not allocate */
                err = get_block(inode, iblock, bh, 0);
                if (err)
                        goto unlock;
                /* unmapped? It's a hole - nothing to do */
                if (!buffer_mapped(bh))
                        goto unlock;
        }

        /* Ok, it's mapped. Make sure it's up-to-date */
        if (folio_test_uptodate(folio))
                set_buffer_uptodate(bh);

        /* Delayed and unwritten buffers are not read from disk. */
        if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
                err = bh_read(bh, 0);
                /* Uhhuh. Read error. Complain and punt. */
                if (err < 0)
                        goto unlock;
        }

        folio_zero_range(folio, offset, length);
        mark_buffer_dirty(bh);

unlock:
        /* Common exit: drop the lock and reference from filemap_grab_folio(). */
        folio_unlock(folio);
        folio_put(folio);

        return err;
}
EXPORT_SYMBOL(block_truncate_page);

/*
 * The generic ->writepage function for buffer-backed address_spaces
 */
int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
                void *get_block)
{
        struct inode * const inode = folio->mapping->host;
        loff_t i_size = i_size_read(inode);

        /* Is the folio fully inside i_size? */
        if (folio_pos(folio) + folio_size(folio) <= i_size)
                return __block_write_full_folio(inode, folio, get_block, wbc);

        /* Is the folio fully outside i_size? (truncate in progress) */
        if (folio_pos(folio) >= i_size) {
                folio_unlock(folio);
                return 0; /* don't care */
        }

        /*
         * The folio straddles i_size.  It must be zeroed out on each and every
         * writepage invocation because it may be mmapped.  "A file is mapped
         * in multiples of the page size.  For a file that is not a multiple of
         * the page size, the remaining memory is zeroed when mapped, and
         * writes to that region are not written out to the file."
         */
        folio_zero_segment(folio, offset_in_folio(folio, i_size),
                        folio_size(folio));
        return __block_write_full_folio(inode, folio, get_block, wbc);
}

/*
 * Map logical block @block of the mapping's host inode via @get_block and
 * return the block number that @get_block stored in the buffer head.
 *
 * A temporary on-stack buffer_head is used, with only b_size set to the
 * inode's block size; get_block() is called with create == 0.  The return
 * value of get_block() is discarded.  All other fields of the temporary
 * start out zeroed by the initializer, so 0 is returned if get_block()
 * leaves b_blocknr untouched.
 */
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
                            get_block_t *get_block)
{
        struct inode *inode = mapping->host;
        struct buffer_head tmp = {
                .b_size = i_blocksize(inode),
        };

        get_block(inode, block, &tmp, 0);
        return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);

/*
 * bio completion handler for buffer_head I/O.  The buffer_head is taken
 * from bio->bi_private.  A BIO_QUIET flag on the bio is propagated to the
 * buffer as BH_Quiet, then the buffer's own b_end_io callback is invoked
 * with "uptodate" set to true when bi_status is zero, and finally the bio
 * reference is dropped.
 */
static void end_bio_bh_io_sync(struct bio *bio)
{
        struct buffer_head *bh = bio->bi_private;

        if (unlikely(bio_flagged(bio, BIO_QUIET)))
                set_bit(BH_Quiet, &bh->b_state);

        bh->b_end_io(bh, !bio->bi_status);
        bio_put(bio);
}

/*
 * Build a single-segment bio for @bh and submit it.
 *
 * @opf:        request operation and flags; REQ_META / REQ_PRIO are added
 *              when the buffer carries the matching state bits
 * @bh:         buffer to transfer; must be locked, mapped, have b_end_io
 *              set, and be neither delayed nor unwritten (each is a BUG_ON)
 * @write_hint: stored in the bio's bi_write_hint
 * @wbc:        optional writeback control; when non-NULL the bio is
 *              initialised from it and the I/O is accounted to the cgroup
 *              owner
 *
 * Completion is delivered through end_bio_bh_io_sync().
 */
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          enum rw_hint write_hint,
                          struct writeback_control *wbc)
{
        const enum req_op op = opf & REQ_OP_MASK;
        struct bio *bio;

        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
        BUG_ON(buffer_delay(bh));
        BUG_ON(buffer_unwritten(bh));

        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
                clear_buffer_write_io_error(bh);

        if (buffer_meta(bh))
                opf |= REQ_META;
        if (buffer_prio(bh))
                opf |= REQ_PRIO;

        /* One bio_vec is enough: the bio carries exactly this buffer. */
        bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);

        fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);

        /* b_blocknr counts b_size-sized blocks; convert to 512-byte sectors. */
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_write_hint = write_hint;

        bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));

        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;

        /* Take care of bh's that straddle the end of the device */
        guard_bio_eod(bio);

        if (wbc) {
                wbc_init_bio(wbc, bio);
                wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
        }

        submit_bio(bio);
}

/*
 * Submit @bh for I/O with operation/flags @opf.  Equivalent to
 * submit_bh_wbc() with a write hint of WRITE_LIFE_NOT_SET and no
 * writeback_control.
 */
void submit_bh(blk_opf_t opf, struct buffer_head *bh)
{
        submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
}
EXPORT_SYMBOL(submit_bh);

/*
 * Start a write of @bh if it is dirty; does not wait for completion.
 *
 * The buffer is locked first.  If its dirty bit was not set it is simply
 * unlocked again.  Otherwise the dirty bit is cleared, an extra reference
 * is taken and a REQ_OP_WRITE (ORed with @op_flags) is submitted with
 * end_buffer_write_sync as the completion callback.
 */
void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
        lock_buffer(bh);
        if (test_clear_buffer_dirty(bh)) {
                bh->b_end_io = end_buffer_write_sync;
                get_bh(bh);
                submit_bh(REQ_OP_WRITE | op_flags, bh);
        } else {
                /* Already clean: nothing to write. */
                unlock_buffer(bh);
        }
}
EXPORT_SYMBOL(write_dirty_buffer);

/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
{
        WARN_ON(atomic_read(&bh->b_count) < 1);
        lock_buffer(bh);

        /* Clean buffer: nothing to write out. */
        if (!test_clear_buffer_dirty(bh)) {
                unlock_buffer(bh);
                return 0;
        }

        /*
         * The bh should be mapped, but it might not be if the
         * device was hot-removed. Not much we can do but fail the I/O.
         */
        if (!buffer_mapped(bh)) {
                unlock_buffer(bh);
                return -EIO;
        }

        /* Submit the write and wait for it to finish. */
        get_bh(bh);
        bh->b_end_io = end_buffer_write_sync;
        submit_bh(REQ_OP_WRITE | op_flags, bh);
        wait_on_buffer(bh);

        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

/*
 * Write out @bh if dirty and wait for completion; shorthand for
 * __sync_dirty_buffer() with REQ_SYNC as the operation flags.  Returns
 * whatever __sync_dirty_buffer() returns.
 */
int sync_dirty_buffer(struct buffer_head *bh)
{
        return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

/*
 * Returns non-zero if @bh is in use: its reference count (b_count) is
 * non-zero, or its BH_Dirty or BH_Lock state bit is set.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
        return atomic_read(&bh->b_count) |
                (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

/*
 * Detach all buffers from @folio, provided none of them is busy.
 *
 * On success the head of the (still circularly linked) buffer list is
 * stored in *@buffers_to_free, the folio's private data is detached, and
 * true is returned.  If any buffer is busy nothing is changed and false
 * is returned.
 */
static bool
drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
{
        struct buffer_head *head = folio_buffers(folio);
        struct buffer_head *bh = head;

        /* Pass 1: bail out before touching anything if a buffer is busy. */
        do {
                if (buffer_busy(bh))
                        return false;
                bh = bh->b_this_page;
        } while (bh != head);

        /* Pass 2 (bh is back at head): unhook from any association list. */
        do {
                struct buffer_head *next = bh->b_this_page;

                if (bh->b_assoc_map)
                        __remove_assoc_queue(bh);
                bh = next;
        } while (bh != head);

        *buffers_to_free = head;
        folio_detach_private(folio);
        return true;
}

/**
 * try_to_free_buffers - Release buffers attached to this folio.
 * @folio: The folio.
 *
 * If any buffers are in use (dirty, under writeback, elevated refcount),
 * no buffers will be freed.
 *
 * If the folio is dirty but all the buffers are clean then we need to
 * be sure to mark the folio clean as well.  This is because the folio
 * may be against a block device, and a later reattachment of buffers
 * to a dirty folio will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem folios: if all the buffers are
 * clean then we set the folio clean and proceed.  To do that, we require
 * total exclusion from block_dirty_folio().  That is obtained with
 * i_private_lock.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the folio or by holding its mapping's i_private_lock.
 *
 * Context: Process context.  @folio must be locked.  Will not sleep.
 * Return: true if all buffers attached to this folio were freed.
 */
bool try_to_free_buffers(struct folio *folio)
{
        struct address_space * const mapping = folio->mapping;
        struct buffer_head *buffers_to_free = NULL;
        struct buffer_head *bh, *next;
        bool ret;

        BUG_ON(!folio_test_locked(folio));
        if (folio_test_writeback(folio))
                return false;

        if (mapping == NULL) {
                /* can this still happen? */
                ret = drop_buffers(folio, &buffers_to_free);
        } else {
                spin_lock(&mapping->i_private_lock);
                ret = drop_buffers(folio, &buffers_to_free);

                /*
                 * A filesystem that writes its buffers by hand (eg ext3)
                 * can leave clean buffers against a dirty folio.  Clean
                 * the folio here; otherwise the VM will never notice that
                 * the filesystem did any IO at all.
                 *
                 * Likewise, truncate's discard_buffer will have marked
                 * all of the folio's buffers clean; that is discovered
                 * here and the folio is cleaned too.
                 *
                 * i_private_lock has to be held across the whole
                 * operation to synchronise against block_dirty_folio and
                 * keep the dirty bit from being lost.
                 */
                if (ret)
                        folio_cancel_dirty(folio);
                spin_unlock(&mapping->i_private_lock);
        }

        if (!buffers_to_free)
                return ret;

        /* Free the detached buffer heads outside of i_private_lock. */
        bh = buffers_to_free;
        do {
                next = bh->b_this_page;
                free_buffer_head(bh);
                bh = next;
        } while (bh != buffers_to_free);

        return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);

/*
 * Buffer-head allocation
 */
/* Slab cache that all buffer_heads are allocated from; set up in buffer_init(). */
static struct kmem_cache *bh_cachep __ro_after_init;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads __ro_after_init;

/*
 * Non-zero when the summed per-CPU count of live buffer_heads exceeded
 * max_buffer_heads at the last recalculation (see recalc_bh_state()).
 */
int buffer_heads_over_limit;

/* Per-CPU bookkeeping for buffer_head allocation. */
struct bh_accounting {
        int nr;                        /* Number of live bh's */
        int ratelimit;                /* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
        int i;
        int tot = 0;

        if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
                return;
        __this_cpu_write(bh_accounting.ratelimit, 0);
        for_each_online_cpu(i)
                tot += per_cpu(bh_accounting, i).nr;
        buffer_heads_over_limit = (tot > max_buffer_heads);
}

/*
 * Allocate a zeroed buffer_head from bh_cachep using @gfp_flags.
 *
 * On success the association list head and the uptodate lock are
 * initialised and this CPU's live-buffer count is bumped.  Returns the new
 * buffer_head, or NULL if the allocation failed.
 */
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
        struct buffer_head *bh = kmem_cache_zalloc(bh_cachep, gfp_flags);

        if (!bh)
                return bh;

        INIT_LIST_HEAD(&bh->b_assoc_buffers);
        spin_lock_init(&bh->b_uptodate_lock);

        /* Per-CPU accounting must not migrate between the two updates. */
        preempt_disable();
        __this_cpu_inc(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();

        return bh;
}
EXPORT_SYMBOL(alloc_buffer_head);

/*
 * Return @bh to bh_cachep and decrement this CPU's live-buffer count.
 * It is a BUG to free a buffer_head whose b_assoc_buffers list entry is
 * still linked.
 */
void free_buffer_head(struct buffer_head *bh)
{
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
        /* Keep the per-CPU counter update and recalculation on one CPU. */
        preempt_disable();
        __this_cpu_dec(bh_accounting.nr);
        recalc_bh_state();
        preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

/*
 * CPU-dead callback (registered in buffer_init()).
 *
 * Releases every buffer_head held in @cpu's bh_lrus slots and folds
 * @cpu's live-buffer count into the current CPU's counter.  Always
 * returns 0.
 */
static int buffer_exit_cpu_dead(unsigned int cpu)
{
        struct bh_lru *lru = &per_cpu(bh_lrus, cpu);
        int slot = 0;

        while (slot < BH_LRU_SIZE) {
                brelse(lru->bhs[slot]);
                lru->bhs[slot] = NULL;
                slot++;
        }

        /* Hand the dead CPU's count over so the total stays correct. */
        this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
        per_cpu(bh_accounting, cpu).nr = 0;

        return 0;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
        /* Fast path: already uptodate, no locking needed. */
        if (buffer_uptodate(bh))
                return 1;

        lock_buffer(bh);

        /* Still stale under the lock: return with the buffer locked. */
        if (!buffer_uptodate(bh))
                return 0;

        /* Became uptodate while we waited for the lock. */
        unlock_buffer(bh);
        return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * __bh_read - Submit read for a locked buffer
 * @bh: struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @wait: wait until reading finish
 *
 * Returns zero on success or don't wait, and -EIO on error.
 */
int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
        BUG_ON(!buffer_locked(bh));

        /* Take a reference for the I/O and queue the read. */
        get_bh(bh);
        bh->b_end_io = end_buffer_read_sync;
        submit_bh(REQ_OP_READ | op_flags, bh);

        /* Fire and forget: the caller did not ask to wait. */
        if (!wait)
                return 0;

        wait_on_buffer(bh);
        return buffer_uptodate(bh) ? 0 : -EIO;
}
EXPORT_SYMBOL(__bh_read);

/**
 * __bh_read_batch - Submit read for a batch of unlocked buffers
 * @nr: number of entries in the buffer batch
 * @bhs: a batch of struct buffer_head
 * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
 * @force_lock: force to get a lock on the buffer if set, otherwise drops any
 *              buffer that cannot lock.
 *
 * Buffers that are already up-to-date are skipped.  This function only
 * submits the reads; it does not wait for them to complete and returns
 * nothing.
 */
void __bh_read_batch(int nr, struct buffer_head *bhs[],
                     blk_opf_t op_flags, bool force_lock)
{
        int i;

        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i];

                if (buffer_uptodate(bh))
                        continue;

                if (force_lock)
                        lock_buffer(bh);
                else
                        if (!trylock_buffer(bh))
                                continue;

                /* Recheck under the lock: it may have been read meanwhile. */
                if (buffer_uptodate(bh)) {
                        unlock_buffer(bh);
                        continue;
                }

                bh->b_end_io = end_buffer_read_sync;
                get_bh(bh);
                submit_bh(REQ_OP_READ | op_flags, bh);
        }
}
EXPORT_SYMBOL(__bh_read_batch);

/*
 * Boot-time initialisation of the buffer_head layer: creates the
 * buffer_head slab cache, computes max_buffer_heads, and registers
 * buffer_exit_cpu_dead() as the CPUHP_FS_BUFF_DEAD hotplug callback.
 */
void __init buffer_init(void)
{
        unsigned long nrpages;
        int ret;

        bh_cachep = KMEM_CACHE(buffer_head,
                                SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
        /*
         * Limit the bh occupancy to 10% of ZONE_NORMAL
         */
        nrpages = (nr_free_buffer_pages() * 10) / 100;
        /* Convert that page budget into a number of buffer_heads. */
        max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
        ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
                                        NULL, buffer_exit_cpu_dead);
        /* A registration failure is only warned about, not treated as fatal. */
        WARN_ON(ret < 0);
}






























































  228 












  227 














































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal schedule timeout and sleeping functions
 */

#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>

#include "tick-internal.h"

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
        struct timer_list timer;        /* timer armed by schedule_timeout(); its callback is process_timeout() */
        struct task_struct *task;       /* task that process_timeout() wakes up when the timer fires */
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = from_timer(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Put the current task to sleep until @timeout jiffies have passed. What
 * actually happens depends on the task state the caller has set beforehand
 * (see also the description of set_current_state()):
 *
 * %TASK_RUNNING - the scheduler is invoked, but the task does not sleep at
 * all, because sched_submit_work() does nothing for tasks in
 * %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - the routine is guaranteed not to return before
 * at least @timeout jiffies have passed, unless the current task is
 * explicitly woken up (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or if the current task is explicitly
 * woken up.
 *
 * On return the current task state is guaranteed to be %TASK_RUNNING.
 *
 * Passing %MAX_SCHEDULE_TIMEOUT as @timeout schedules the CPU away without
 * any bound on the sleep time. In that case the return value is
 * %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns: 0 when the timer has expired, otherwise the remaining time in
 * jiffies. In all cases the return value is guaranteed to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long deadline;

        if (timeout == MAX_SCHEDULE_TIMEOUT) {
                /*
                 * Unbounded sleep: no timer is armed. This special value
                 * exists purely for the caller's convenience; it is a
                 * valid non-negative offset rather than a negative
                 * sentinel, so the caller can use the return value
                 * without further checks.
                 */
                schedule();
                goto out;
        }

        if (timeout < 0) {
                /*
                 * Paranoia: a negative timeout should never be passed in.
                 * Report it together with a stack dump so that the
                 * offending caller can be found. The return value is 0,
                 * since no kernel code is supposed to check for a
                 * negative return value of schedule_timeout().
                 */
                pr_err("%s: wrong timeout value %lx\n", __func__, timeout);
                dump_stack();
                __set_current_state(TASK_RUNNING);
                goto out;
        }

        deadline = timeout + jiffies;

        /* Arm an on-stack timer whose callback wakes up this task. */
        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        timer.timer.expires = deadline;
        add_timer(&timer.timer);
        schedule();
        timer_delete_sync(&timer.timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer.timer);

        /* Jiffies left until the deadline; clamped to 0 below if it has passed. */
        timeout = deadline - jiffies;

 out:
        return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);

/*
 * __set_current_state() can be used in schedule_timeout_*() functions, because
 * schedule_timeout() calls schedule() unconditionally.
 */

/**
 * schedule_timeout_interruptible - sleep until timeout (interruptible)
 * @timeout: timeout value in jiffies
 *
 * Sets the task state to TASK_INTERRUPTIBLE and then starts the timeout.
 *
 * See schedule_timeout() for details, including the return value.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_INTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

/**
 * schedule_timeout_killable - sleep until timeout (killable)
 * @timeout: timeout value in jiffies
 *
 * Sets the task state to TASK_KILLABLE and then starts the timeout.
 *
 * See schedule_timeout() for details, including the return value.
 */
signed long __sched schedule_timeout_killable(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_KILLABLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_killable);

/**
 * schedule_timeout_uninterruptible - sleep until timeout (uninterruptible)
 * @timeout: timeout value in jiffies
 *
 * Sets the task state to TASK_UNINTERRUPTIBLE and then starts the timeout.
 *
 * See schedule_timeout() for details, including the return value.
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_UNINTERRUPTIBLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/**
 * schedule_timeout_idle - sleep until timeout (idle)
 * @timeout: timeout value in jiffies
 *
 * Sets the task state to TASK_IDLE and then starts the timeout. This is
 * similar to schedule_timeout_uninterruptible(), except that the sleeping
 * task will not contribute to load average.
 *
 * See schedule_timeout() for details, including the return value.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        signed long remaining;

        __set_current_state(TASK_IDLE);
        remaining = schedule_timeout(timeout);

        return remaining;
}
EXPORT_SYMBOL(schedule_timeout_idle);

/**
 * schedule_hrtimeout_range_clock - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @delta:        slack in expires timeout (ktime_t)
 * @mode:        timer mode
 * @clock_id:        timer clock to be used
 *
 * The behavior is described in detail in the documentation of
 * schedule_hrtimeout_range(), as that is the commonly used function.
 */
int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
                                           const enum hrtimer_mode mode, clockid_t clock_id)
{
        struct hrtimer_sleeper t;

        /* A NULL @expires means "infinite": sleep without arming a timer. */
        if (!expires) {
                schedule();
                return -EINTR;
        }

        /*
         * Shortcut for a zero timeout value: no timer is needed, and it
         * does not matter whether the value is meant as an absolute or a
         * relative time.
         */
        if (*expires == 0) {
                __set_current_state(TASK_RUNNING);
                return 0;
        }

        hrtimer_setup_sleeper_on_stack(&t, clock_id, mode);
        hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
        hrtimer_sleeper_start_expires(&t, mode);

        /*
         * Only sleep while t.task is still set.
         * NOTE(review): presumably t.task is cleared when the sleeper's
         * timer expires - confirm against the hrtimer sleeper code.
         */
        if (likely(t.task))
                schedule();

        hrtimer_cancel(&t.timer);
        destroy_hrtimer_on_stack(&t.timer);

        __set_current_state(TASK_RUNNING);

        /* t.task still set: report -EINTR; t.task cleared: report 0. */
        return t.task ? -EINTR : 0;
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);

/**
 * schedule_hrtimeout_range - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @delta:        slack in expires timeout (ktime_t)
 * @mode:        timer mode
 *
 * Make the current task sleep until the given expiry time has
 * elapsed. The routine will return immediately unless
 * the current task state has been set (see set_current_state()).
 *
 * The @delta argument gives the kernel the freedom to schedule the
 * actual wakeup to a time that is both power and performance friendly
 * for regular (non RT/DL) tasks.
 * The kernel gives the normal best effort behavior for "@expires+@delta",
 * and may decide to fire the timer earlier, but no earlier than @expires.
 *
 * You can set the task state as follows -
 *
 * %TASK_UNINTERRUPTIBLE - the routine is guaranteed not to return before
 * the expiry time given by @expires has been reached, unless the current
 * task is explicitly woken up, (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be TASK_RUNNING when this
 * routine returns.
 *
 * Returns: 0 when the timer has expired. If the task was woken before the
 * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or
 * by an explicit wakeup, it returns -EINTR.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
                                     const enum hrtimer_mode mode)
{
        /* Same as the _clock variant, with the clock fixed to CLOCK_MONOTONIC. */
        const clockid_t clock = CLOCK_MONOTONIC;

        return schedule_hrtimeout_range_clock(expires, delta, mode, clock);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);

/**
 * schedule_hrtimeout - sleep until timeout
 * @expires:        timeout value (ktime_t)
 * @mode:        timer mode
 *
 * Calls schedule_hrtimeout_range() with its @delta argument set to 0, so
 * the slack has no impact. See schedule_hrtimeout_range() for details.
 */
int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode)
{
        const u64 no_slack = 0;

        return schedule_hrtimeout_range(expires, no_slack, mode);
}
EXPORT_SYMBOL_GPL(schedule_hrtimeout);

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs:        Requested sleep duration in milliseconds
 *
 * msleep() uses jiffy based timeouts for the sleep duration. Because of the
 * design of the timer wheel, the maximum additional percentage delay (slack) is
 * 12.5%. This is only valid for timers which will end up in level 1 or a higher
 * level of the timer wheel. For explanation of those 12.5% please check the
 * detailed description about the basics of the timer wheel.
 *
 * The slack of timers which will end up in level 0 depends on sleep duration
 * (msecs) and HZ configuration and can be calculated in the following way (with
 * the timer wheel design restriction that the slack is not less than 12.5%):
 *
 *   ``slack = MSECS_PER_TICK / msecs``
 *
 * When the allowed slack of the callsite is known, the calculation could be
 * turned around to find the minimal allowed sleep duration to meet the
 * constraints. For example:
 *
 * * ``HZ=1000`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 1 / (1/4) = 4``:
 *   all sleep durations greater or equal 4ms will meet the constraints.
 * * ``HZ=1000`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 1 / (1/8) = 8``:
 *   all sleep durations greater or equal 8ms will meet the constraints.
 * * ``HZ=250`` with ``slack=25%``: ``MSECS_PER_TICK / slack = 4 / (1/4) = 16``:
 *   all sleep durations greater or equal 16ms will meet the constraints.
 * * ``HZ=250`` with ``slack=12.5%``: ``MSECS_PER_TICK / slack = 4 / (1/8) = 32``:
 *   all sleep durations greater or equal 32ms will meet the constraints.
 *
 * See also the signal aware variant msleep_interruptible().
 */
void msleep(unsigned int msecs)
{
        unsigned long remaining;

        /*
         * Go back to sleep with whatever is left of the timeout until
         * schedule_timeout_uninterruptible() reports that no jiffies
         * remain.
         */
        for (remaining = msecs_to_jiffies(msecs); remaining != 0; )
                remaining = schedule_timeout_uninterruptible(remaining);
}
EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs:        Requested sleep duration in milliseconds
 *
 * See msleep() for some basic information.
 *
 * The difference between msleep() and msleep_interruptible() is that the sleep
 * could be interrupted by a signal delivery and then returns early.
 *
 * Returns: The remaining time of the sleep duration transformed to msecs (see
 * schedule_timeout() for details).
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long remaining = msecs_to_jiffies(msecs);

        for (;;) {
                /* Stop once the time is used up or a signal is pending. */
                if (!remaining || signal_pending(current))
                        break;
                remaining = schedule_timeout_interruptible(remaining);
        }

        /* Report what is left of the sleep duration, converted back to msecs. */
        return jiffies_to_msecs(remaining);
}
EXPORT_SYMBOL(msleep_interruptible);

/**
 * usleep_range_state - Sleep for an approximate time in a given state
 * @min:        Minimum time in usecs to sleep
 * @max:        Maximum time in usecs to sleep
 * @state:        State the current task will be in while sleeping
 *
 * usleep_range_state() sleeps at least for the minimum specified time but not
 * longer than the maximum specified amount of time. The range might reduce
 * power usage by allowing hrtimers to coalesce an already scheduled interrupt
 * with this hrtimer. In the worst case, an interrupt is scheduled for the upper
 * bound.
 *
 * The sleeping task is set to the specified state before starting the sleep.
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range() or its variants instead of udelay(). The sleep improves
 * responsiveness by avoiding the CPU-hogging busy-wait of udelay().
 */
void __sched usleep_range_state(unsigned long min, unsigned long max, unsigned int state)
{
        ktime_t exp = ktime_add_us(ktime_get(), min);
        u64 delta = 0;

        /* An inverted range triggers a one-time warning and sleeps without slack. */
        if (!WARN_ON_ONCE(max < min))
                delta = (u64)(max - min) * NSEC_PER_USEC;

        /*
         * Do not return before the requested sleep time has elapsed: set
         * the requested task state again and go back to sleep until
         * schedule_hrtimeout_range() returns 0.
         */
        do {
                __set_current_state(state);
        } while (schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS));
}
EXPORT_SYMBOL(usleep_range_state);






















































































































































































































































































































































































































































































































































































































































































































































































































  265 



























































































































































































































  316 
























































































































































































































































































































































































































































































































































































  265 































































    1 






















































































































    1 




















































































































































































  265 





























































































































































































































































































































































































































   26 




























































































   22 





















































  265 













































































   22 



















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the AF_INET socket handler.
 *
 * Version:        @(#)sock.h        1.0.4        05/13/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Florian La Roche <flla@stud.uni-sb.de>
 *
 * Fixes:
 *                Alan Cox        :        Volatiles in skbuff pointers. See
 *                                        skbuff comments. May be overdone,
 *                                        better to prove they can be removed
 *                                        than the reverse.
 *                Alan Cox        :        Added a zapped field for tcp to note
 *                                        a socket is reset and must stay shut up
 *                Alan Cox        :        New fields for options
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Eliminate low level recv/recvfrom
 *                David S. Miller        :        New socket lookup architecture.
 *              Steve Whitehouse:       Default routines for sock_ops
 *              Arnaldo C. Melo :        removed net_pinfo, tp_pinfo and made
 *                                      protinfo be just a void pointer, as the
 *                                      protocol specific parts were moved to
 *                                      respective headers and ipv4/v6, etc now
 *                                      use private slabcaches for its socks
 *              Pedro Hortas        :        New flags field for socket options
 */
#ifndef _SOCK_H
#define _SOCK_H

#include <linux/hardirq.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>        /* struct sk_buff */
#include <linux/mm.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/static_key.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
#include <linux/sockptr.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/atomic.h>
#include <linux/refcount.h>
#include <linux/llist.h>
#include <net/dst.h>
#include <net/checksum.h>
#include <net/tcp_states.h>
#include <linux/net_tstamp.h>
#include <net/l3mdev.h>
#include <uapi/linux/socket.h>

/*
 * This structure really needs to be cleaned up.
 * Most of it is for TCP, and not used by any of
 * the other protocols.
 */

/* This is the per-socket lock.  The spinlock provides a synchronization
 * between user contexts and software interrupt processing, whereas the
 * mini-semaphore synchronizes multiple users amongst themselves.
 */
typedef struct {
        spinlock_t                slock;        /* the spinlock described above */
        /* NOTE(review): @owned and @wq presumably form the "mini-semaphore"
         * (ownership marker plus waiters) - confirm against the lock_sock()
         * implementation, which is not part of this header section.
         */
        int                        owned;
        wait_queue_head_t        wq;
        /*
         * We express the mutex-alike socket_lock semantics
         * to the lock validator by explicitly managing
         * the slock as a lock variant (in addition to
         * the slock itself):
         */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map dep_map;        /* only present with CONFIG_DEBUG_LOCK_ALLOC */
#endif
} socket_lock_t;

/* Forward declarations for types that are referenced below. */
struct sock;
struct proto;
struct net;

/*
 * __bitwise-annotated combined types used by struct sock_common:
 * __portpair is the 32-bit union view of skc_dport/skc_num and
 * __addrpair is the 64-bit union view of skc_daddr/skc_rcv_saddr.
 */
typedef __u32 __bitwise __portpair;
typedef __u64 __bitwise __addrpair;

/**
 *        struct sock_common - minimal network layer representation of sockets
 *        @skc_daddr: Foreign IPv4 addr
 *        @skc_rcv_saddr: Bound local IPv4 addr
 *        @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr
 *        @skc_hash: hash value used with various protocol lookup tables
 *        @skc_u16hashes: two u16 hash values used by UDP lookup tables
 *        @skc_dport: placeholder for inet_dport/tw_dport
 *        @skc_num: placeholder for inet_num/tw_num
 *        @skc_portpair: __u32 union of @skc_dport & @skc_num
 *        @skc_family: network address family
 *        @skc_state: Connection state
 *        @skc_reuse: %SO_REUSEADDR setting
 *        @skc_reuseport: %SO_REUSEPORT setting
 *        @skc_ipv6only: socket is IPV6 only
 *        @skc_net_refcnt: socket is using net ref counting
 *        @skc_bound_dev_if: bound device index if != 0
 *        @skc_bind_node: bind hash linkage for various protocol lookup tables
 *        @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol
 *        @skc_prot: protocol handlers inside a network family
 *        @skc_net: reference to the network namespace of this socket
 *        @skc_v6_daddr: IPV6 destination address
 *        @skc_v6_rcv_saddr: IPV6 source address
 *        @skc_cookie: socket's cookie value
 *        @skc_node: main hash linkage for various protocol lookup tables
 *        @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
 *        @skc_tx_queue_mapping: tx queue number for this connection
 *        @skc_rx_queue_mapping: rx queue number for this connection
 *        @skc_flags: place holder for sk_flags
 *                %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 *                %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 *        @skc_listener: connection request listener socket (aka rsk_listener)
 *                [union with @skc_flags]
 *        @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row
 *                [union with @skc_flags]
 *        @skc_incoming_cpu: record/match cpu processing incoming packets
 *        @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled)
 *                [union with @skc_incoming_cpu]
 *        @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number
 *                [union with @skc_incoming_cpu]
 *        @skc_refcnt: reference count
 *
 *        This is the minimal network layer representation of sockets, the header
 *        for struct sock and struct inet_timewait_sock.
 */
struct sock_common {
        /* IPv4 address pair, also accessible as one __addrpair value */
        union {
                __addrpair        skc_addrpair;
                struct {
                        __be32        skc_daddr;
                        __be32        skc_rcv_saddr;
                };
        };
        /* one 32-bit hash, or the same storage viewed as two u16 hashes */
        union  {
                unsigned int        skc_hash;
                __u16                skc_u16hashes[2];
        };
        /* skc_dport && skc_num must be grouped as well */
        union {
                __portpair        skc_portpair;
                struct {
                        __be16        skc_dport;
                        __u16        skc_num;
                };
        };

        unsigned short                skc_family;
        volatile unsigned char        skc_state;
        /* the three one-bit flags below share a byte with skc_reuse */
        unsigned char                skc_reuse:4;
        unsigned char                skc_reuseport:1;
        unsigned char                skc_ipv6only:1;
        unsigned char                skc_net_refcnt:1;
        int                        skc_bound_dev_if;
        union {
                struct hlist_node        skc_bind_node;
                struct hlist_node        skc_portaddr_node;
        };
        struct proto                *skc_prot;
        possible_net_t                skc_net;

#if IS_ENABLED(CONFIG_IPV6)
        struct in6_addr                skc_v6_daddr;
        struct in6_addr                skc_v6_rcv_saddr;
#endif

        atomic64_t                skc_cookie;

        /* following fields are padding to force
         * offset(struct sock, sk_refcnt) == 128 on 64bit arches
         * assuming IPV6 is enabled. We use this padding differently
         * for different kind of 'sockets'
         */
        union {
                unsigned long        skc_flags;
                struct sock        *skc_listener; /* request_sock */
                struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
        };
        /*
         * fields between dontcopy_begin/dontcopy_end
         * are not copied in sock_copy()
         */
        /* private: */
        int                        skc_dontcopy_begin[0];
        /* public: */
        /* main hash linkage; the plain and nulls variants share storage */
        union {
                struct hlist_node        skc_node;
                struct hlist_nulls_node skc_nulls_node;
        };
        unsigned short                skc_tx_queue_mapping;
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        unsigned short                skc_rx_queue_mapping;
#endif
        union {
                int                skc_incoming_cpu;
                u32                skc_rcv_wnd;
                u32                skc_tw_rcv_nxt; /* struct tcp_timewait_sock  */
        };

        refcount_t                skc_refcnt;
        /* private: */
        int                     skc_dontcopy_end[0];
        /* this union sits after skc_dontcopy_end, i.e. outside the
         * dontcopy region delimited above
         */
        union {
                u32                skc_rxhash;
                u32                skc_window_clamp;
                u32                skc_tw_snd_nxt; /* struct tcp_timewait_sock */
        };
        /* public: */
};

struct bpf_local_storage;
struct sk_filter;

/**
  *        struct sock - network layer representation of sockets
  *        @__sk_common: shared layout with inet_timewait_sock
  *        @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  *        @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *        @sk_lock:        synchronizer
  *        @sk_kern_sock: True if sock is using kernel lock classes
  *        @sk_rcvbuf: size of receive buffer in bytes
  *        @sk_wq: sock wait queue and async head
  *        @sk_rx_dst: receive input route used by early demux
  *        @sk_rx_dst_ifindex: ifindex for @sk_rx_dst
  *        @sk_rx_dst_cookie: cookie for @sk_rx_dst
  *        @sk_dst_cache: destination cache
  *        @sk_dst_pending_confirm: need to confirm neighbour
  *        @sk_policy: flow policy
  *        @sk_receive_queue: incoming packets
  *        @sk_wmem_alloc: transmit queue bytes committed
  *        @sk_tsq_flags: TCP Small Queues flags
  *        @sk_write_queue: Packet sending queue
  *        @sk_omem_alloc: "o" is "option" or "other"
  *        @sk_wmem_queued: persistent queue size
  *        @sk_forward_alloc: space allocated forward
  *        @sk_reserved_mem: space reserved and non-reclaimable for the socket
  *        @sk_napi_id: id of the last napi context to receive data for sk
  *        @sk_ll_usec: usecs to busypoll when there is no data
  *        @sk_allocation: allocation mode
  *        @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
  *        @sk_pacing_status: Pacing status (requested, handled by sch_fq)
  *        @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
  *        @sk_sndbuf: size of send buffer in bytes
  *        @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
  *        @sk_no_check_rx: allow zero checksum in RX packets
  *        @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *        @sk_gso_disabled: if set, NETIF_F_GSO_MASK is forbidden.
  *        @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
  *        @sk_gso_max_size: Maximum GSO segment size to build
  *        @sk_gso_max_segs: Maximum number of GSO segments
  *        @sk_pacing_shift: scaling factor for TCP Small Queues
  *        @sk_lingertime: %SO_LINGER l_linger setting
  *        @sk_backlog: always used with the per-socket spinlock held
  *        @sk_callback_lock: used with the callbacks in the end of this struct
  *        @sk_error_queue: rarely used
  *        @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
  *                          IPV6_ADDRFORM for instance)
  *        @sk_err: last error
  *        @sk_err_soft: errors that don't cause failure but are the cause of a
  *                      persistent failure not just 'timed out'
  *        @sk_drops: raw/udp drops counter
  *        @sk_ack_backlog: current listen backlog
  *        @sk_max_ack_backlog: listen backlog set in listen()
  *        @sk_uid: user id of owner
  *        @sk_prefer_busy_poll: prefer busypolling over softirq processing
  *        @sk_busy_poll_budget: napi processing budget when busypolling
  *        @sk_priority: %SO_PRIORITY setting
  *        @sk_type: socket type (%SOCK_STREAM, etc)
  *        @sk_protocol: which protocol this socket belongs in this network family
  *        @sk_peer_lock: lock protecting @sk_peer_pid and @sk_peer_cred
  *        @sk_peer_pid: &struct pid for this socket's peer
  *        @sk_peer_cred: %SO_PEERCRED setting
  *        @sk_rcvlowat: %SO_RCVLOWAT setting
  *        @sk_rcvtimeo: %SO_RCVTIMEO setting
  *        @sk_sndtimeo: %SO_SNDTIMEO setting
  *        @sk_txhash: computed flow hash for use on transmit
  *        @sk_txrehash: enable TX hash rethink
  *        @sk_filter: socket filtering instructions
  *        @sk_timer: sock cleanup timer
  *        @sk_stamp: time stamp of last packet received
  *        @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only
  *        @sk_tsflags: SO_TIMESTAMPING flags
  *        @sk_bpf_cb_flags: used in bpf_setsockopt()
  *        @sk_use_task_frag: allow sk_page_frag() to use current->task_frag.
  *                           Sockets that can be used under memory reclaim should
  *                           set this to false.
  *        @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock
  *                      for timestamping
  *        @sk_tskey: counter to disambiguate concurrent tstamp requests
  *        @sk_zckey: counter to order MSG_ZEROCOPY notifications
  *        @sk_socket: Identd and reporting IO signals
  *        @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock.
  *        @sk_frag: cached page frag
  *        @sk_peek_off: current peek_offset value
  *        @sk_send_head: front of stuff to transmit
  *        @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head]
  *        @sk_security: used by security modules
  *        @sk_mark: generic packet mark
  *        @sk_cgrp_data: cgroup data for this cgroup
  *        @sk_memcg: this socket's memory cgroup association
  *        @sk_write_pending: a write to stream socket waits to start
  *        @sk_disconnects: number of disconnect operations performed on this sock
  *        @sk_state_change: callback to indicate change in the state of the sock
  *        @sk_data_ready: callback to indicate there is data to be processed
  *        @sk_write_space: callback to indicate there is buffer sending space available
  *        @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
  *        @sk_backlog_rcv: callback to process the backlog
  *        @sk_validate_xmit_skb: ptr to an optional validate function
  *        @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
  *        @sk_reuseport_cb: reuseport group container
  *        @sk_bpf_storage: ptr to cache and control for bpf_sk_storage
  *        @sk_rcu: used during RCU grace period
  *        @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
  *        @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
  *        @sk_txtime_report_errors: set report errors mode for SO_TXTIME
  *        @sk_txtime_unused: unused txtime flags
  *        @ns_tracker: tracker for netns reference
  *        @sk_user_frags: xarray of pages the user is holding a reference on.
  *        @sk_owner: reference to the real owner of the socket that calls
  *                   sock_lock_init_class_and_name().
  */
struct sock {
        /*
         * Now struct inet_timewait_sock also uses sock_common, so please
         * do not add anything before this first member (__sk_common) --acme
         */
        struct sock_common        __sk_common;
        /* shorthand names for members of the embedded __sk_common */
#define sk_node                        __sk_common.skc_node
#define sk_nulls_node                __sk_common.skc_nulls_node
#define sk_refcnt                __sk_common.skc_refcnt
#define sk_tx_queue_mapping        __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
#define sk_rx_queue_mapping        __sk_common.skc_rx_queue_mapping
#endif

#define sk_dontcopy_begin        __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end                __sk_common.skc_dontcopy_end
#define sk_hash                        __sk_common.skc_hash
#define sk_portpair                __sk_common.skc_portpair
#define sk_num                        __sk_common.skc_num
#define sk_dport                __sk_common.skc_dport
#define sk_addrpair                __sk_common.skc_addrpair
#define sk_daddr                __sk_common.skc_daddr
#define sk_rcv_saddr                __sk_common.skc_rcv_saddr
#define sk_family                __sk_common.skc_family
#define sk_state                __sk_common.skc_state
#define sk_reuse                __sk_common.skc_reuse
#define sk_reuseport                __sk_common.skc_reuseport
#define sk_ipv6only                __sk_common.skc_ipv6only
#define sk_net_refcnt                __sk_common.skc_net_refcnt
#define sk_bound_dev_if                __sk_common.skc_bound_dev_if
#define sk_bind_node                __sk_common.skc_bind_node
#define sk_prot                        __sk_common.skc_prot
#define sk_net                        __sk_common.skc_net
#define sk_v6_daddr                __sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr        __sk_common.skc_v6_rcv_saddr
#define sk_cookie                __sk_common.skc_cookie
#define sk_incoming_cpu                __sk_common.skc_incoming_cpu
#define sk_flags                __sk_common.skc_flags
#define sk_rxhash                __sk_common.skc_rxhash

        __cacheline_group_begin(sock_write_rx);

        atomic_t                sk_drops;
        __s32                        sk_peek_off;
        struct sk_buff_head        sk_error_queue;
        struct sk_buff_head        sk_receive_queue;
        /*
         * The backlog queue is special, it is always used with
         * the per-socket spinlock held and requires low latency
         * access. Therefore we special case its implementation.
         * Note : rmem_alloc is in this structure to fill a hole
         * on 64bit arches, not because it is logically part of
         * backlog.
         */
        struct {
                atomic_t        rmem_alloc;
                int                len;
                struct sk_buff        *head;
                struct sk_buff        *tail;
        } sk_backlog;
        /* sk_rmem_alloc is an alias for the rmem_alloc member of sk_backlog */
#define sk_rmem_alloc sk_backlog.rmem_alloc

        __cacheline_group_end(sock_write_rx);

        __cacheline_group_begin(sock_read_rx);
        /* early demux fields */
        struct dst_entry __rcu        *sk_rx_dst;
        int                        sk_rx_dst_ifindex;
        u32                        sk_rx_dst_cookie;

#ifdef CONFIG_NET_RX_BUSY_POLL
        unsigned int                sk_ll_usec;
        unsigned int                sk_napi_id;
        u16                        sk_busy_poll_budget;
        u8                        sk_prefer_busy_poll;
#endif
        u8                        sk_userlocks;
        int                        sk_rcvbuf;

        struct sk_filter __rcu        *sk_filter;
        /* same pointer, with and without the __rcu annotation */
        union {
                struct socket_wq __rcu        *sk_wq;
                /* private: */
                struct socket_wq        *sk_wq_raw;
                /* public: */
        };

        void                        (*sk_data_ready)(struct sock *sk);
        long                        sk_rcvtimeo;
        int                        sk_rcvlowat;
        __cacheline_group_end(sock_read_rx);

        __cacheline_group_begin(sock_read_rxtx);
        int                        sk_err;
        struct socket                *sk_socket;
        struct mem_cgroup        *sk_memcg;
#ifdef CONFIG_XFRM
        struct xfrm_policy __rcu *sk_policy[2];
#endif
        __cacheline_group_end(sock_read_rxtx);

        __cacheline_group_begin(sock_write_rxtx);
        socket_lock_t                sk_lock;
        u32                        sk_reserved_mem;
        int                        sk_forward_alloc;
        u32                        sk_tsflags;
        __cacheline_group_end(sock_write_rxtx);

        __cacheline_group_begin(sock_write_tx);
        int                        sk_write_pending;
        atomic_t                sk_omem_alloc;
        int                        sk_sndbuf;

        int                        sk_wmem_queued;
        refcount_t                sk_wmem_alloc;
        unsigned long                sk_tsq_flags;
        union {
                struct sk_buff        *sk_send_head;
                struct rb_root        tcp_rtx_queue;
        };
        struct sk_buff_head        sk_write_queue;
        u32                        sk_dst_pending_confirm;
        u32                        sk_pacing_status; /* see enum sk_pacing */
        struct page_frag        sk_frag;
        struct timer_list        sk_timer;

        unsigned long                sk_pacing_rate; /* bytes per second */
        atomic_t                sk_zckey;
        atomic_t                sk_tskey;
        __cacheline_group_end(sock_write_tx);

        __cacheline_group_begin(sock_read_tx);
        unsigned long                sk_max_pacing_rate;
        long                        sk_sndtimeo;
        u32                        sk_priority;
        u32                        sk_mark;
        struct dst_entry __rcu        *sk_dst_cache;
        netdev_features_t        sk_route_caps;
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sk_buff*                (*sk_validate_xmit_skb)(struct sock *sk,
                                                        struct net_device *dev,
                                                        struct sk_buff *skb);
#endif
        u16                        sk_gso_type;
        u16                        sk_gso_max_segs;
        unsigned int                sk_gso_max_size;
        gfp_t                        sk_allocation;
        u32                        sk_txhash;
        u8                        sk_pacing_shift;
        bool                        sk_use_task_frag;
        __cacheline_group_end(sock_read_tx);

        /*
         * Because of non atomicity rules, all
         * changes are protected by socket lock.
         */
        u8                        sk_gso_disabled : 1,
                                sk_kern_sock : 1,
                                sk_no_check_tx : 1,
                                sk_no_check_rx : 1;
        u8                        sk_shutdown;
        u16                        sk_type;
        u16                        sk_protocol;
        unsigned long                sk_lingertime;
        struct proto                *sk_prot_creator;
        rwlock_t                sk_callback_lock;
        int                        sk_err_soft;
        u32                        sk_ack_backlog;
        u32                        sk_max_ack_backlog;
        kuid_t                        sk_uid;
        spinlock_t                sk_peer_lock;
        int                        sk_bind_phc;
        struct pid                *sk_peer_pid;
        const struct cred        *sk_peer_cred;

        ktime_t                        sk_stamp;
        /* sk_stamp_seq only exists on 32-bit builds */
#if BITS_PER_LONG==32
        seqlock_t                sk_stamp_seq;
#endif
        int                        sk_disconnects;

        u8                        sk_txrehash;
        u8                        sk_clockid;
        u8                        sk_txtime_deadline_mode : 1,
                                sk_txtime_report_errors : 1,
                                sk_txtime_unused : 6;
        /* non-zero when any bit of FLAG is set in SK's sk_bpf_cb_flags */
#define SK_BPF_CB_FLAG_TEST(SK, FLAG) ((SK)->sk_bpf_cb_flags & (FLAG))
        u8                        sk_bpf_cb_flags;

        /* low bits carry the SK_USER_DATA_* flags defined after this struct */
        void                        *sk_user_data;
#ifdef CONFIG_SECURITY
        void                        *sk_security;
#endif
        struct sock_cgroup_data        sk_cgrp_data;
        void                        (*sk_state_change)(struct sock *sk);
        void                        (*sk_write_space)(struct sock *sk);
        void                        (*sk_error_report)(struct sock *sk);
        int                        (*sk_backlog_rcv)(struct sock *sk,
                                                  struct sk_buff *skb);
        void                    (*sk_destruct)(struct sock *sk);
        struct sock_reuseport __rcu        *sk_reuseport_cb;
#ifdef CONFIG_BPF_SYSCALL
        struct bpf_local_storage __rcu        *sk_bpf_storage;
#endif
        struct rcu_head                sk_rcu;
        netns_tracker                ns_tracker;
        struct xarray                sk_user_frags;

#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
        struct module                *sk_owner;
#endif
};

/*
 * Pairs a socket pointer with a local_lock_t.
 * NOTE(review): the users of this struct are not visible in this section;
 * presumably bh_lock serialises access to @sock - confirm against callers.
 */
struct sock_bh_locked {
        struct sock *sock;
        local_lock_t bh_lock;
};

/*
 * Values for sk_pacing_status in struct sock, which the kernel-doc there
 * describes as "requested, handled by sch_fq".
 * NOTE(review): presumably NEEDED == requested and FQ == handled by sch_fq;
 * confirm against the code that sets sk_pacing_status.
 */
enum sk_pacing {
        SK_PACING_NONE                = 0,
        SK_PACING_NEEDED        = 1,
        SK_PACING_FQ                = 2,
};

/* flag bits in sk_user_data
 *
 * - SK_USER_DATA_NOCOPY:      Pointer stored in sk_user_data might
 *   not be suitable for copying when cloning the socket. For instance,
 *   it can point to a reference counted object. sk_user_data bottom
 *   bit is set if pointer must not be copied.
 *
 * - SK_USER_DATA_BPF:         Mark whether sk_user_data field is
 *   managed/owned by a BPF reuseport array. This bit should be set
 *   when sk_user_data's sk is added to the bpf's reuseport_array.
 *
 * - SK_USER_DATA_PSOCK:       Mark whether pointer stored in
 *   sk_user_data points to psock type. This bit should be set
 *   when sk_user_data is assigned to a psock object.
 *
 * SK_USER_DATA_PTRMASK is the complement of the three flag bits: AND-ing
 * sk_user_data with it yields the pointer with the flags stripped.
 */
#define SK_USER_DATA_NOCOPY        1UL
#define SK_USER_DATA_BPF        2UL
#define SK_USER_DATA_PSOCK        4UL
#define SK_USER_DATA_PTRMASK        ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
                                  SK_USER_DATA_PSOCK)

/**
 * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
 * @sk: socket
 *
 * Return: true when the %SK_USER_DATA_NOCOPY bit is set in sk_user_data,
 * false otherwise.
 */
static inline bool sk_user_data_is_nocopy(const struct sock *sk)
{
        uintptr_t raw = (uintptr_t)sk->sk_user_data;

        return (raw & SK_USER_DATA_NOCOPY) != 0;
}

/* Lvalue view of sk->sk_user_data cast to an __rcu-annotated void pointer,
 * for use with the rcu_dereference and rcu_assign_pointer helpers below.
 */
#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data)))

/**
 * __locked_read_sk_user_data_with_flags - read sk_user_data under the
 * callback lock and return its pointer part only when every bit in
 * @flags is set; otherwise return NULL
 *
 * @sk: socket
 * @flags: flag bits (SK_USER_DATA_* values; must not overlap the pointer mask)
 *
 * The caller must be holding sk->sk_callback_lock.
 *
 * Return: sk_user_data with the flag bits masked off, or NULL.
 */
static inline void *
__locked_read_sk_user_data_with_flags(const struct sock *sk,
                                      uintptr_t flags)
{
        uintptr_t raw;

        raw = (uintptr_t)rcu_dereference_check(__sk_user_data(sk),
                                               lockdep_is_held(&sk->sk_callback_lock));

        /* flags may only use the low bits reserved for SK_USER_DATA_* */
        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/**
 * __rcu_dereference_sk_user_data_with_flags - rcu_dereference()
 * sk_user_data and return its pointer part only when every bit in
 * @flags is set; otherwise return NULL
 *
 * @sk: socket
 * @flags: flag bits (SK_USER_DATA_* values; must not overlap the pointer mask)
 *
 * Return: sk_user_data with the flag bits masked off, or NULL.
 */
static inline void *
__rcu_dereference_sk_user_data_with_flags(const struct sock *sk,
                                          uintptr_t flags)
{
        uintptr_t raw;

        raw = (uintptr_t)rcu_dereference(__sk_user_data(sk));

        /* flags may only use the low bits reserved for SK_USER_DATA_* */
        WARN_ON_ONCE(flags & SK_USER_DATA_PTRMASK);

        if ((raw & flags) != flags)
                return NULL;

        return (void *)(raw & SK_USER_DATA_PTRMASK);
}

/* Flag-less read: returns the pointer part of sk_user_data (flags == 0
 * always matches, so only the masking is applied).
 */
#define rcu_dereference_sk_user_data(sk)                                \
        __rcu_dereference_sk_user_data_with_flags(sk, 0)
/* Store (ptr | flags) into sk_user_data via rcu_assign_pointer().
 * Warns once if @ptr has any of the flag bits set, or if @flags has bits
 * outside the flag area. Each argument is evaluated once into a temporary.
 */
#define __rcu_assign_sk_user_data_with_flags(sk, ptr, flags)                \
({                                                                        \
        uintptr_t __tmp1 = (uintptr_t)(ptr),                                \
                  __tmp2 = (uintptr_t)(flags);                                \
        WARN_ON_ONCE(__tmp1 & ~SK_USER_DATA_PTRMASK);                        \
        WARN_ON_ONCE(__tmp2 & SK_USER_DATA_PTRMASK);                        \
        rcu_assign_pointer(__sk_user_data((sk)),                        \
                           __tmp1 | __tmp2);                                \
})
/* Flag-less store of @ptr into sk_user_data. */
#define rcu_assign_sk_user_data(sk, ptr)                                \
        __rcu_assign_sk_user_data_with_flags(sk, ptr, 0)

static inline
struct net *sock_net(const struct sock *sk)
{
        return read_pnet(&sk->sk_net);
}

static inline
void sock_net_set(struct sock *sk, struct net *net)
{
        write_pnet(&sk->sk_net, net);
}

/*
 * Values for sk_reuse. SK_CAN_REUSE and SK_NO_REUSE state whether or not
 * the socket allows its port to be reused by someone else. SK_FORCE_REUSE
 * on a socket means that the socket will reuse everybody else's port
 * without looking at the other's sk_reuse value.
 */

#define SK_NO_REUSE        0
#define SK_CAN_REUSE        1
#define SK_FORCE_REUSE        2

/* Defined out of line. NOTE(review): presumably sets sk->sk_peek_off to
 * @val - confirm at the definition.
 */
int sk_set_peek_off(struct sock *sk, int val);

/*
 * Return the socket's current peek offset when @flags contains MSG_PEEK,
 * and 0 for every other request.
 */
static inline int sk_peek_offset(const struct sock *sk, int flags)
{
        if (likely(!(flags & MSG_PEEK)))
                return 0;

        return READ_ONCE(sk->sk_peek_off);
}

/*
 * Move the peek offset back by @val, clamping the result at 0.
 * A negative sk_peek_off is left untouched.
 */
static inline void sk_peek_offset_bwd(struct sock *sk, int val)
{
        s32 cur = READ_ONCE(sk->sk_peek_off);

        if (likely(cur < 0))
                return;

        cur = max_t(s32, cur - val, 0);
        WRITE_ONCE(sk->sk_peek_off, cur);
}

/*
 * Advance the peek offset by @val; implemented as a backward move by the
 * negated amount, so the same "negative offset is left alone" rule applies.
 */
static inline void sk_peek_offset_fwd(struct sock *sk, int val)
{
        int delta = -val;

        sk_peek_offset_bwd(sk, delta);
}

/*
 * Hashed lists helper routines
 */
/* Map a hash node embedded as sk_node back to its containing struct sock. */
static inline struct sock *sk_entry(const struct hlist_node *node)
{
        struct sock *sk = hlist_entry(node, struct sock, sk_node);

        return sk;
}

/*
 * Return the sock that owns the first node of @head. No emptiness check
 * is done here; sk_head() is the checked variant.
 */
static inline struct sock *__sk_head(const struct hlist_head *head)
{
        struct hlist_node *first = head->first;

        return hlist_entry(first, struct sock, sk_node);
}

/* Return the first sock on @head, or NULL when the list is empty. */
static inline struct sock *sk_head(const struct hlist_head *head)
{
        if (hlist_empty(head))
                return NULL;

        return __sk_head(head);
}

/*
 * Return the sock that owns the first node of the nulls list @head.
 * No emptiness check is done here; sk_nulls_head() is the checked variant.
 */
static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head)
{
        struct hlist_nulls_node *first = head->first;

        return hlist_nulls_entry(first, struct sock, sk_nulls_node);
}

/* Return the first sock on the nulls list @head, or NULL when it is empty. */
static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head)
{
        if (hlist_nulls_empty(head))
                return NULL;

        return __sk_nulls_head(head);
}

/*
 * Return the sock linked after @sk through sk_node; the result comes from
 * hlist_entry_safe(), so a NULL next pointer is handled by that helper.
 */
static inline struct sock *sk_next(const struct sock *sk)
{
        struct hlist_node *next = sk->sk_node.next;

        return hlist_entry_safe(next, struct sock, sk_node);
}

/*
 * Return the sock linked after @sk through sk_nulls_node, or NULL when the
 * next pointer is a nulls marker.
 */
static inline struct sock *sk_nulls_next(const struct sock *sk)
{
        if (is_a_nulls(sk->sk_nulls_node.next))
                return NULL;

        return hlist_nulls_entry(sk->sk_nulls_node.next,
                                 struct sock, sk_nulls_node);
}

/* True when @sk's sk_node is reported as unhashed by hlist_unhashed(). */
static inline bool sk_unhashed(const struct sock *sk)
{
        const struct hlist_node *node = &sk->sk_node;

        return hlist_unhashed(node);
}

/* Logical inverse of sk_unhashed(). */
static inline bool sk_hashed(const struct sock *sk)
{
        bool unhashed = sk_unhashed(sk);

        return !unhashed;
}

/*
 * Reset @node's pprev back-pointer to NULL. Only pprev is written; the
 * next pointer is left as it was.
 */
static inline void sk_node_init(struct hlist_node *node)
{
        node->pprev = NULL;
}

/*
 * Unlink @sk's sk_node with __hlist_del(). Does not reinitialise the node
 * and does not touch the reference count; see __sk_del_node_init() and
 * sk_del_node_init() for the variants that do.
 */
static inline void __sk_del_node(struct sock *sk)
{
        __hlist_del(&sk->sk_node);
}

/* NB: equivalent to hlist_del_init_rcu
 *
 * Unlink @sk from its hash list and reinitialise sk_node, but only if the
 * socket is currently hashed. Returns true when a removal took place.
 * The reference count is not changed here.
 */
static inline bool __sk_del_node_init(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        __sk_del_node(sk);
        sk_node_init(&sk->sk_node);
        return true;
}

/* Grab a socket reference. This operation is valid only when sk is
 * ALREADY grabbed, e.g. it was found in a hash table or a list and the
 * lookup was made under a lock preventing hash table modifications.
 */

static __always_inline void sock_hold(struct sock *sk)
{
        refcount_inc(&sk->sk_refcnt);
}

/* Drop a socket reference in a context which guarantees that the socket
 * refcnt cannot hit zero, e.g. in the context of any socketcall.
 * Nothing is freed here: this is a bare refcount_dec().
 */
static __always_inline void __sock_put(struct sock *sk)
{
        refcount_dec(&sk->sk_refcnt);
}

/*
 * Unhash @sk via __sk_del_node_init() and, if it was hashed, drop one
 * reference with __sock_put(). Returns true when @sk was removed from
 * a list.
 */
static inline bool sk_del_node_init(struct sock *sk)
{
        if (!__sk_del_node_init(sk))
                return false;

        /* paranoid for a while -acme */
        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
        return true;
}
/* The _rcu name is a plain alias: it expands to sk_del_node_init(). */
#define sk_del_node_init_rcu(sk)        sk_del_node_init(sk)

/*
 * Remove @sk from its nulls hash list with hlist_nulls_del_init_rcu(), but
 * only if the socket is currently hashed. Returns true when a removal took
 * place. The reference count is not changed here.
 */
static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (!sk_hashed(sk))
                return false;

        hlist_nulls_del_init_rcu(&sk->sk_nulls_node);
        return true;
}

/*
 * Unhash @sk via __sk_nulls_del_node_init_rcu() and, if it was hashed,
 * drop one reference with __sock_put(). Returns true when @sk was removed
 * from a list.
 */
static inline bool sk_nulls_del_node_init_rcu(struct sock *sk)
{
        if (!__sk_nulls_del_node_init_rcu(sk))
                return false;

        /* paranoid for a while -acme */
        WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
        __sock_put(sk);
        return true;
}

/*
 * Link @sk's sk_node at the head of @list. No reference is taken; see
 * sk_add_node() for the variant that also calls sock_hold().
 */
static inline void __sk_add_node(struct sock *sk, struct hlist_head *list)
{
        hlist_add_head(&sk->sk_node, list);
}

/* Take a reference on @sk, then link it at the head of @list. */
static inline void sk_add_node(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        __sk_add_node(sk, list);
}

/*
 * Take a reference on @sk and link its sk_node into @list using the RCU
 * list helpers. The node goes to the head of the list, except for AF_INET6
 * sockets with sk_reuseport set on CONFIG_IPV6 builds, which are appended
 * at the tail.
 */
static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);

        if (!IS_ENABLED(CONFIG_IPV6) || !sk->sk_reuseport ||
            sk->sk_family != AF_INET6) {
                hlist_add_head_rcu(&sk->sk_node, list);
                return;
        }

        hlist_add_tail_rcu(&sk->sk_node, list);
}

/* Take a reference on sk, then link it at the tail of @list (RCU variant). */
static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list)
{
        sock_hold(sk);
        hlist_add_tail_rcu(&sk->sk_node, list);
}

/* Link sk->sk_nulls_node at the head of the nulls @list; no reference is taken. */
static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list);
}

/* Link sk->sk_nulls_node at the tail of the nulls @list; no reference is taken. */
static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list);
}

/* Take a reference on sk, then link it at the head of the nulls @list. */
static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
        sock_hold(sk);
        __sk_nulls_add_node_rcu(sk, list);
}

/* Unlink sk->sk_bind_node from its list using __hlist_del(). */
static inline void __sk_del_bind_node(struct sock *sk)
{
        __hlist_del(&sk->sk_bind_node);
}

/* Link sk->sk_bind_node at the head of @list; no reference is taken. */
static inline void sk_add_bind_node(struct sock *sk,
                                        struct hlist_head *list)
{
        hlist_add_head(&sk->sk_bind_node, list);
}

/*
 * Socket list iterators.  Each one wraps the matching hlist / hlist_nulls
 * iterator and names the struct sock member used as the link:
 * sk_node, sk_nulls_node or sk_bind_node.
 *
 * Note: sk_nulls_for_each_from() expands to an unbraced "if (...)" followed
 * by a loop, so take care when using it next to an "else".
 */
#define sk_for_each(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_node)
#define sk_for_each_rcu(__sk, list) \
        hlist_for_each_entry_rcu(__sk, list, sk_node)
#define sk_nulls_for_each(__sk, node, list) \
        hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node)
#define sk_nulls_for_each_rcu(__sk, node, list) \
        hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node)
#define sk_for_each_from(__sk) \
        hlist_for_each_entry_from(__sk, sk_node)
#define sk_nulls_for_each_from(__sk, node) \
        if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \
                hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node)
#define sk_for_each_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
        hlist_for_each_entry(__sk, list, sk_bind_node)
#define sk_for_each_bound_safe(__sk, tmp, list) \
        hlist_for_each_entry_safe(__sk, tmp, list, sk_bind_node)

/**
 * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 * @offset:        offset of hlist_node within the struct.
 *
 * The list is walked with rcu_dereference().  On every iteration @tpos is
 * set to the address of @pos minus @offset bytes.  Arguments are
 * parenthesized wherever they appear inside an expression, so compound
 * arguments (e.g. an @offset written as a sum) expand as intended.
 */
#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)                       \
        for ((pos) = rcu_dereference(hlist_first_rcu(head));                       \
             (pos) != NULL &&                                                       \
                ({ (tpos) = (typeof(*(tpos)) *)((void *)(pos) - (offset)); 1; });  \
             (pos) = rcu_dereference(hlist_next_rcu(pos)))

/*
 * Return the user namespace reached through
 * sk->sk_socket->file->f_cred->user_ns.
 * None of the pointers in that chain is checked for NULL.
 */
static inline struct user_namespace *sk_user_ns(const struct sock *sk)
{
        /* Careful: only use this in a context where these parameters
         * cannot change and must all be valid, such as recvmsg from
         * userspace.
         */
        return sk->sk_socket->file->f_cred->user_ns;
}

/*
 * Sock flags.
 *
 * Each enumerator is a bit number within sk->sk_flags; the bits are set,
 * cleared and tested via sock_set_flag(), sock_reset_flag() and sock_flag().
 */
enum sock_flags {
        SOCK_DEAD,
        SOCK_DONE,
        SOCK_URGINLINE,
        SOCK_KEEPOPEN,
        SOCK_LINGER,
        SOCK_DESTROY,
        SOCK_BROADCAST,
        SOCK_TIMESTAMP,
        SOCK_ZAPPED,
        SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */
        SOCK_DBG, /* %SO_DEBUG setting */
        SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
        SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
        SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
        SOCK_MEMALLOC, /* VM depends on this socket for swapping */
        SOCK_TIMESTAMPING_RX_SOFTWARE,  /* %SOF_TIMESTAMPING_RX_SOFTWARE */
        SOCK_FASYNC, /* fasync() active */
        SOCK_RXQ_OVFL,
        SOCK_ZEROCOPY, /* buffers from userspace */
        SOCK_WIFI_STATUS, /* push wifi status to userspace */
        SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS.
                     * Will use last 4 bytes of packet sent from
                     * user-space instead.
                     */
        SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */
        SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */
        SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */
        SOCK_TXTIME,
        SOCK_XDP, /* XDP is attached */
        SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
        SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
        SOCK_RCVPRIORITY, /* Receive SO_PRIORITY ancillary data with packet */
        SOCK_TIMESTAMPING_ANY, /* Copy of sk_tsflags & TSFLAGS_ANY */
};

/* Mask covering the SOCK_TIMESTAMP and SOCK_TIMESTAMPING_RX_SOFTWARE bits. */
#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
/*
 * The highest bit of sk_tsflags is reserved for the kernel-internal
 * SOCKCM_FLAG_TS_OPT_ID. There is a check in core/sock.c to ensure that
 * SOF_TIMESTAMPING* values do not reach this reserved area.
 */
#define SOCKCM_FLAG_TS_OPT_ID        BIT(31)

/* Overwrite all of nsk->sk_flags with the value of osk->sk_flags. */
static inline void sock_copy_flags(struct sock *nsk, const struct sock *osk)
{
        nsk->sk_flags = osk->sk_flags;
}

/* Set bit @flag in sk->sk_flags using __set_bit(). */
static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
{
        __set_bit(flag, &sk->sk_flags);
}

/* Clear bit @flag in sk->sk_flags using __clear_bit(). */
static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag)
{
        __clear_bit(flag, &sk->sk_flags);
}

/*
 * Set flag @bit on sk when @valbool is non-zero, clear it when @valbool
 * is zero.
 */
static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit,
                                     int valbool)
{
        if (!valbool) {
                sock_reset_flag(sk, bit);
                return;
        }

        sock_set_flag(sk, bit);
}

/* Return whether bit @flag is set in sk->sk_flags. */
static inline bool sock_flag(const struct sock *sk, enum sock_flags flag)
{
        return test_bit(flag, &sk->sk_flags);
}

#ifdef CONFIG_NET
DECLARE_STATIC_KEY_FALSE(memalloc_socks_key);
/* Non-zero when the memalloc_socks_key static branch is enabled. */
static inline int sk_memalloc_socks(void)
{
        return static_branch_unlikely(&memalloc_socks_key);
}

/* Defined elsewhere; only declared here. */
void __receive_sock(struct file *file);
#else

/* Without CONFIG_NET: always reports 0. */
static inline int sk_memalloc_socks(void)
{
        return 0;
}

/* Without CONFIG_NET: empty stub. */
static inline void __receive_sock(struct file *file)
{ }
#endif

/*
 * Return @gfp_mask with __GFP_MEMALLOC added if, and only if, that bit is
 * set in sk->sk_allocation.
 */
static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask)
{
        return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC);
}

/*
 * Decrement sk->sk_ack_backlog by one.  The store uses WRITE_ONCE();
 * sk_acceptq_is_full() reads the field with READ_ONCE().
 */
static inline void sk_acceptq_removed(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1);
}

/*
 * Increment sk->sk_ack_backlog by one.  The store uses WRITE_ONCE();
 * sk_acceptq_is_full() reads the field with READ_ONCE().
 */
static inline void sk_acceptq_added(struct sock *sk)
{
        WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1);
}

/*
 * Return true when sk_ack_backlog is strictly greater than
 * sk_max_ack_backlog.
 *
 * Note: If you think the test should be:
 *        return READ_ONCE(sk->sk_ack_backlog) >= READ_ONCE(sk->sk_max_ack_backlog);
 * then please take a look at commit 64a146513f8f ("[NET]: Revert incorrect accept queue backlog changes.")
 */
static inline bool sk_acceptq_is_full(const struct sock *sk)
{
        return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog);
}

/*
 * Compute the minimal free write space needed to queue new packets:
 * half of sk_wmem_queued (the value is shifted right by one).
 */
static inline int sk_stream_min_wspace(const struct sock *sk)
{
        return READ_ONCE(sk->sk_wmem_queued) >> 1;
}

/*
 * Return sk_sndbuf minus sk_wmem_queued.  The result is negative when
 * more is queued than sk_sndbuf.
 */
static inline int sk_stream_wspace(const struct sock *sk)
{
        return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued);
}

/* Add @val (may be negative) to sk->sk_wmem_queued; the store uses WRITE_ONCE(). */
static inline void sk_wmem_queued_add(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val);
}

/* Add @val (may be negative) to sk->sk_forward_alloc. */
static inline void sk_forward_alloc_add(struct sock *sk, int val)
{
        /* Paired with lockless reads of sk->sk_forward_alloc */
        WRITE_ONCE(sk->sk_forward_alloc, sk->sk_forward_alloc + val);
}

/* Defined elsewhere; only declared here. */
void sk_stream_write_space(struct sock *sk);

/*
 * OOB backlog add.
 *
 * Append @skb to the tail of the singly linked sk->sk_backlog queue.
 * sk->sk_backlog.len is NOT updated here; sk_add_backlog() does that.
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
        /* don't leave the skb dst un-refcounted, we are going to leave rcu lock */
        skb_dst_force(skb);

        /* Empty queue: skb becomes the head; otherwise chain it after the tail. */
        if (!sk->sk_backlog.tail)
                WRITE_ONCE(sk->sk_backlog.head, skb);
        else
                sk->sk_backlog.tail->next = skb;

        WRITE_ONCE(sk->sk_backlog.tail, skb);
        skb->next = NULL;
}

/*
 * Report whether the receive side of sk already holds more than @limit:
 * the backlog length plus sk_rmem_alloc is compared against @limit.
 *
 * The truesize of the skb about to be queued is deliberately not counted,
 * so that even a single big packet can come in.
 */
static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit)
{
        unsigned int used;

        used = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc);
        return used > limit;
}

/*
 * Queue @skb on the backlog of @sk and account its truesize.
 * The per-socket spinlock must be held here.
 *
 * Returns 0 on success, -ENOBUFS when sk_rcvqueues_full() reports the
 * queues exceed @limit, or -ENOMEM when a pfmemalloc skb is offered to a
 * socket without SOCK_MEMALLOC.
 */
static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb,
                                              unsigned int limit)
{
        int err = 0;

        if (sk_rcvqueues_full(sk, limit)) {
                err = -ENOBUFS;
        } else if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
                /*
                 * The skb came from the pfmemalloc reserves: only
                 * SOCK_MEMALLOC sockets may use it, as such a socket
                 * is helping to free memory.
                 */
                err = -ENOMEM;
        } else {
                __sk_add_backlog(sk, skb);
                sk->sk_backlog.len += skb->truesize;
        }

        return err;
}

/* Defined elsewhere; used by sk_backlog_rcv() for pfmemalloc skbs. */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);

/* Candidate targets for the indirect call made in sk_backlog_rcv(). */
INDIRECT_CALLABLE_DECLARE(int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb));

/*
 * Hand @skb to the socket's backlog receive handler and return its result.
 *
 * When sk_memalloc_socks() is non-zero and the skb is a pfmemalloc one,
 * __sk_backlog_rcv() is used.  Otherwise sk->sk_backlog_rcv is invoked
 * through INDIRECT_CALL_INET() with tcp_v6_do_rcv / tcp_v4_do_rcv as the
 * candidate targets.
 */
static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        if (sk_memalloc_socks() && skb_pfmemalloc(skb))
                return __sk_backlog_rcv(sk, skb);

        return INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                                  tcp_v6_do_rcv,
                                  tcp_v4_do_rcv,
                                  sk, skb);
}

/*
 * Record the id returned by raw_smp_processor_id() in sk->sk_incoming_cpu.
 * The field is only written when the stored value differs.
 */
static inline void sk_incoming_cpu_update(struct sock *sk)
{
        int curr = raw_smp_processor_id();

        if (likely(READ_ONCE(sk->sk_incoming_cpu) == curr))
                return;

        WRITE_ONCE(sk->sk_incoming_cpu, curr);
}


/*
 * With CONFIG_RPS: copy skb->hash into sk->sk_rxhash when the two differ.
 * Without CONFIG_RPS this is an empty function.
 */
static inline void sock_rps_save_rxhash(struct sock *sk,
                                        const struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        /* The following WRITE_ONCE() is paired with the READ_ONCE()
         * here, and another one in sock_rps_record_flow().
         */
        if (unlikely(READ_ONCE(sk->sk_rxhash) != skb->hash))
                WRITE_ONCE(sk->sk_rxhash, skb->hash);
#endif
}

/*
 * With CONFIG_RPS: reset sk->sk_rxhash to 0.
 * Without CONFIG_RPS this is an empty function.
 */
static inline void sock_rps_reset_rxhash(struct sock *sk)
{
#ifdef CONFIG_RPS
        /* Paired with READ_ONCE() in sock_rps_record_flow() */
        WRITE_ONCE(sk->sk_rxhash, 0);
#endif
}

/*
 * sk_wait_event - drop the socket lock and wait for a condition
 * @__sk:        socket; the macro starts with release_sock(), so the caller
 *               is expected to hold the socket lock
 * @__timeo:     pointer to the remaining timeout, updated from wait_woken()
 * @__condition: expression to wait for; it may be evaluated twice
 * @__wait:      wait entry handed to wait_woken()
 *
 * The socket lock is re-taken with lock_sock() before the macro finishes.
 * The macro evaluates to -EPIPE when sk_disconnects changed while the lock
 * was dropped, and otherwise to the value of @__condition re-evaluated
 * under the lock.
 *
 * @__sk and @__condition are parenthesized where they are used inside an
 * expression so that compound arguments expand as intended.
 */
#define sk_wait_event(__sk, __timeo, __condition, __wait)                \
        ({        int __rc, __dis = (__sk)->sk_disconnects;                \
                release_sock(__sk);                                        \
                __rc = (__condition);                                        \
                if (!__rc) {                                                \
                        *(__timeo) = wait_woken(__wait,                        \
                                                TASK_INTERRUPTIBLE,        \
                                                *(__timeo));                \
                }                                                        \
                sched_annotate_sleep();                                        \
                lock_sock(__sk);                                        \
                __rc = __dis == (__sk)->sk_disconnects ?                \
                                (__condition) : -EPIPE;                        \
                __rc;                                                        \
        })

/* Stream and memalloc helpers; defined elsewhere, only declared here. */
int sk_stream_wait_connect(struct sock *sk, long *timeo_p);
int sk_stream_wait_memory(struct sock *sk, long *timeo_p);
void sk_stream_wait_close(struct sock *sk, long timeo_p);
int sk_stream_error(struct sock *sk, int flags, int err);
void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);

/* Slow path of sk_flush_backlog(); defined elsewhere. */
void __sk_flush_backlog(struct sock *sk);

/*
 * Call __sk_flush_backlog() if the backlog queue of sk has a tail,
 * i.e. is not empty.
 *
 * Returns true when a flush was performed, false when there was nothing
 * queued.
 */
static inline bool sk_flush_backlog(struct sock *sk)
{
        if (likely(!READ_ONCE(sk->sk_backlog.tail)))
                return false;

        __sk_flush_backlog(sk);
        return true;
}

/* Defined elsewhere; only declared here. */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);

/* Forward declarations for types that are only used through pointers below. */
struct request_sock_ops;
struct timewait_sock_ops;
struct inet_hashinfo;
struct raw_hashinfo;
struct smc_hashinfo;
struct module;
struct sk_psock;

/*
 * Zero a socket object of @size bytes while leaving sk_node.next intact.
 *
 * Caches using SLAB_TYPESAFE_BY_RCU should leave the .next pointer of
 * nulls nodes unmodified, so the object is cleared in two pieces:
 * everything before sk_node.next, then everything from sk_node.pprev to
 * the end of the object.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
        const size_t next_off = offsetof(struct sock, sk_node.next);
        const size_t pprev_off = offsetof(struct sock, sk_node.pprev);

        if (next_off != 0)
                memset(sk, 0, next_off);

        memset(&sk->sk_node.pprev, 0, size - pprev_off);
}

/* Argument bundle passed to the ->accept() callback of struct proto. */
struct proto_accept_arg {
        int flags;
        int err;
        int is_empty;
        bool kern;
};

/* Networking protocol blocks we attach to sockets.
 * socket layer -> transport layer interface
 *
 * Mostly a table of callbacks plus per-protocol accounting pointers and
 * slab parameters.  Some members exist only under CONFIG_COMPAT,
 * CONFIG_BPF_SYSCALL or CONFIG_PROC_FS.  The layout is randomized
 * (__randomize_layout), so do not rely on member order.
 */
struct proto {
        void                        (*close)(struct sock *sk,
                                        long timeout);
        int                        (*pre_connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*connect)(struct sock *sk,
                                        struct sockaddr *uaddr,
                                        int addr_len);
        int                        (*disconnect)(struct sock *sk, int flags);

        struct sock *                (*accept)(struct sock *sk,
                                          struct proto_accept_arg *arg);

        int                        (*ioctl)(struct sock *sk, int cmd,
                                         int *karg);
        int                        (*init)(struct sock *sk);
        void                        (*destroy)(struct sock *sk);
        void                        (*shutdown)(struct sock *sk, int how);
        int                        (*setsockopt)(struct sock *sk, int level,
                                        int optname, sockptr_t optval,
                                        unsigned int optlen);
        int                        (*getsockopt)(struct sock *sk, int level,
                                        int optname, char __user *optval,
                                        int __user *option);
        void                        (*keepalive)(struct sock *sk, int valbool);
#ifdef CONFIG_COMPAT
        int                        (*compat_ioctl)(struct sock *sk,
                                        unsigned int cmd, unsigned long arg);
#endif
        int                        (*sendmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len);
        int                        (*recvmsg)(struct sock *sk, struct msghdr *msg,
                                           size_t len, int flags, int *addr_len);
        void                        (*splice_eof)(struct socket *sock);
        int                        (*bind)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);
        int                        (*bind_add)(struct sock *sk,
                                        struct sockaddr *addr, int addr_len);

        int                        (*backlog_rcv) (struct sock *sk,
                                                struct sk_buff *skb);
        bool                        (*bpf_bypass_getsockopt)(int level,
                                                         int optname);

        void                (*release_cb)(struct sock *sk);

        /* Keeping track of sk's, looking them up, and port selection methods. */
        int                        (*hash)(struct sock *sk);
        void                        (*unhash)(struct sock *sk);
        void                        (*rehash)(struct sock *sk);
        int                        (*get_port)(struct sock *sk, unsigned short snum);
        void                        (*put_port)(struct sock *sk);
#ifdef CONFIG_BPF_SYSCALL
        int                        (*psock_update_sk_prot)(struct sock *sk,
                                                        struct sk_psock *psock,
                                                        bool restore);
#endif

        /* Keeping track of sockets in use */
#ifdef CONFIG_PROC_FS
        unsigned int                inuse_idx;        /* index into prot_inuse.val[] */
#endif

        /* Optional; when NULL, __sk_stream_memory_free() treats memory as free. */
        bool                        (*stream_memory_free)(const struct sock *sk, int wake);
        bool                        (*sock_is_readable)(struct sock *sk);
        /* Memory pressure */
        void                        (*enter_memory_pressure)(struct sock *sk);
        void                        (*leave_memory_pressure)(struct sock *sk);
        /* NULL means no memory accounting, see sk_has_account(). */
        atomic_long_t                *memory_allocated;        /* Current allocated memory. */
        int  __percpu                *per_cpu_fw_alloc;
        struct percpu_counter        *sockets_allocated;        /* Current number of sockets. */

        /*
         * Pressure flag: try to collapse.
         * Technical note: it is used by multiple contexts non atomically.
         * Make sure to use READ_ONCE()/WRITE_ONCE() for all reads/writes.
         * All the __sk_mem_schedule() is of this nature: accounting
         * is strict, actions are advisory and have some latency.
         */
        unsigned long                *memory_pressure;
        long                        *sysctl_mem;        /* values are in pages */

        int                        *sysctl_wmem;
        int                        *sysctl_rmem;
        u32                        sysctl_wmem_offset;
        u32                        sysctl_rmem_offset;

        int                        max_header;
        bool                        no_autobind;

        struct kmem_cache        *slab;
        unsigned int                obj_size;
        unsigned int                ipv6_pinfo_offset;
        slab_flags_t                slab_flags;
        unsigned int                useroffset;        /* Usercopy region offset */
        unsigned int                usersize;        /* Usercopy region size */

        unsigned int __percpu        *orphan_count;

        struct request_sock_ops        *rsk_prot;
        struct timewait_sock_ops *twsk_prot;

        /* Protocol-specific hash table; only one union member is meaningful. */
        union {
                struct inet_hashinfo        *hashinfo;
                struct udp_table        *udp_table;
                struct raw_hashinfo        *raw_hash;
                struct smc_hashinfo        *smc_hash;
        } h;

        struct module                *owner;

        char                        name[32];

        struct list_head        node;
        int                        (*diag_destroy)(struct sock *sk, int err);
} __randomize_layout;

/* Protocol registration helpers; defined elsewhere, only declared here. */
int proto_register(struct proto *prot, int alloc_slab);
void proto_unregister(struct proto *prot);
int sock_load_diag_module(int family, int protocol);

/* Candidate target for the indirect call made in __sk_stream_memory_free(). */
INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake));

/*
 * Decide whether sk still has send memory available.
 *
 * Returns false as soon as sk_wmem_queued has reached sk_sndbuf.
 * Otherwise the protocol's ->stream_memory_free() callback decides
 * (called with @wake); a protocol without that callback always counts
 * as free.
 */
static inline bool __sk_stream_memory_free(const struct sock *sk, int wake)
{
        if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf))
                return false;

        if (!sk->sk_prot->stream_memory_free)
                return true;

        return INDIRECT_CALL_INET_1(sk->sk_prot->stream_memory_free,
                                    tcp_stream_memory_free, sk, wake);
}

/* Same as __sk_stream_memory_free() with @wake fixed to 0. */
static inline bool sk_stream_memory_free(const struct sock *sk)
{
        return __sk_stream_memory_free(sk, 0);
}

static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake)
{
        return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
               __sk_stream_memory_free(sk, wake);
}

/* Same as __sk_stream_is_writeable() with @wake fixed to 0. */
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
        return __sk_stream_is_writeable(sk, 0);
}

/*
 * With CONFIG_SOCK_CGROUP_DATA: return the result of cgroup_is_descendant()
 * for the cgroup taken from sk->sk_cgrp_data and @ancestor.
 * Without it: return -ENOTSUPP.
 */
static inline int sk_under_cgroup_hierarchy(struct sock *sk,
                                            struct cgroup *ancestor)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
        return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data),
                                    ancestor);
#else
        return -ENOTSUPP;
#endif
}

/* Batch size passed to percpu_counter_add_batch() for sockets_allocated. */
#define SK_ALLOC_PERCPU_COUNTER_BATCH 16

/* Subtract one from the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
        percpu_counter_add_batch(sk->sk_prot->sockets_allocated, -1,
                                 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/* Add one to the protocol's sockets_allocated per-cpu counter. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
        percpu_counter_add_batch(sk->sk_prot->sockets_allocated, 1,
                                 SK_ALLOC_PERCPU_COUNTER_BATCH);
}

/*
 * Return percpu_counter_read_positive() of the protocol's
 * sockets_allocated counter.
 */
static inline u64
sk_sockets_allocated_read_positive(struct sock *sk)
{
        return percpu_counter_read_positive(sk->sk_prot->sockets_allocated);
}

/*
 * Return percpu_counter_sum_positive() of @prot's sockets_allocated counter.
 * NOTE(review): the result is returned as int; confirm that narrowing is
 * acceptable for the counter's value range.
 */
static inline int
proto_sockets_allocated_sum_positive(struct proto *prot)
{
        return percpu_counter_sum_positive(prot->sockets_allocated);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR        64        /* should be enough for the first time */
/*
 * In-use counters, updated with this_cpu_add(): @all is a total, @val[]
 * holds one slot per protocol, indexed by proto->inuse_idx.
 */
struct prot_inuse {
        int all;
        int val[PROTO_INUSE_NR];
};

/* Add @val to this CPU's in-use counter of @prot in @net. */
static inline void sock_prot_inuse_add(const struct net *net,
                                       const struct proto *prot, int val)
{
        this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}

/* Add @val to this CPU's total in-use counter of @net. */
static inline void sock_inuse_add(const struct net *net, int val)
{
        this_cpu_add(net->core.prot_inuse->all, val);
}

/* Defined elsewhere; only declared here. */
int sock_prot_inuse_get(struct net *net, struct proto *proto);
int sock_inuse_get(struct net *net);
#else
/* Without CONFIG_PROC_FS the in-use accounting helpers are empty stubs. */
static inline void sock_prot_inuse_add(const struct net *net,
                                       const struct proto *prot, int val)
{
}

static inline void sock_inuse_add(const struct net *net, int val)
{
}
#endif


/*
 * Rehash sk by calling the protocol's ->unhash() followed by ->hash(),
 * and return the result of ->hash().
 *
 * With per-bucket locks a rehash operation is not atomic anyway, so this
 * two-step version is no worse.
 */
static inline int __sk_prot_rehash(struct sock *sk)
{
        sk->sk_prot->unhash(sk);
        return sk->sk_prot->hash(sk);
}

/* About 10 seconds, expressed in jiffies (10 * HZ) */
#define SOCK_DESTROY_TIME (10*HZ)

/* Sockets 0-1023 can't be bound to unless you are superuser */
#define PROT_SOCK        1024

/* Shutdown bits: SHUTDOWN_MASK == RCV_SHUTDOWN | SEND_SHUTDOWN */
#define SHUTDOWN_MASK        3
#define RCV_SHUTDOWN        1
#define SEND_SHUTDOWN        2

/* Bind lock bits: bound address / bound port */
#define SOCK_BINDADDR_LOCK        4
#define SOCK_BINDPORT_LOCK        8

/*
 * A struct socket and a struct inode held in one object, so that
 * SOCKET_I() and SOCK_INODE() can convert between the two with
 * container_of().
 */
struct socket_alloc {
        struct socket socket;
        struct inode vfs_inode;
};

/*
 * Return the struct socket that shares a struct socket_alloc with @inode.
 * @inode must be the vfs_inode member of a struct socket_alloc.
 */
static inline struct socket *SOCKET_I(struct inode *inode)
{
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
}

/*
 * Return the struct inode that shares a struct socket_alloc with @socket.
 * @socket must be the socket member of a struct socket_alloc.
 */
static inline struct inode *SOCK_INODE(struct socket *socket)
{
        return &container_of(socket, struct socket_alloc, socket)->vfs_inode;
}

/*
 * Functions for memory accounting (defined elsewhere, only declared here)
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind);
int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reduce_allocated(struct sock *sk, int amount);
void __sk_mem_reclaim(struct sock *sk, int amount);

/* Values for the 'kind' argument: send side or receive side. */
#define SK_MEM_SEND        0
#define SK_MEM_RECV        1

/*
 * Return entry @index of the protocol's sysctl_mem[] array.
 * sysctl_mem values are in pages.  @index is not range-checked.
 */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
        return READ_ONCE(sk->sk_prot->sysctl_mem[index]);
}

/* Convert @amt bytes to a number of pages, rounding up. */
static inline int sk_mem_pages(int amt)
{
        return (amt + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

/* True when the protocol of sk provides a memory_allocated counter. */
static inline bool sk_has_account(struct sock *sk)
{
        /* return true if protocol supports memory accounting */
        return !!sk->sk_prot->memory_allocated;
}

/*
 * Check whether @size bytes of send memory can be charged to sk.
 *
 * Always true for protocols without memory accounting, and when
 * sk_forward_alloc already covers @size.  Otherwise the missing amount is
 * requested through __sk_mem_schedule(..., SK_MEM_SEND) and its result
 * decides.
 */
static inline bool sk_wmem_schedule(struct sock *sk, int size)
{
        int needed;

        if (!sk_has_account(sk))
                return true;

        needed = size - sk->sk_forward_alloc;
        if (needed <= 0)
                return true;

        return __sk_mem_schedule(sk, needed, SK_MEM_SEND);
}

/*
 * Check whether @size bytes of receive memory can be charged to sk.
 *
 * Always true for protocols without memory accounting, and when
 * sk_forward_alloc already covers @size.  Otherwise the missing amount is
 * requested through __sk_mem_schedule(..., SK_MEM_RECV); if that fails,
 * the outcome is @pfmemalloc.
 */
static inline bool
__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc)
{
        int needed;

        if (!sk_has_account(sk))
                return true;

        needed = size - sk->sk_forward_alloc;
        if (needed <= 0)
                return true;

        if (__sk_mem_schedule(sk, needed, SK_MEM_RECV))
                return true;

        return pfmemalloc;
}

/*
 * __sk_rmem_schedule() for @size bytes, with the pfmemalloc argument
 * taken from skb_pfmemalloc(@skb).
 */
static inline bool
sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
{
        return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb));
}

/*
 * Return how much of sk_reserved_mem is not yet used, i.e. the reserve
 * minus sk_wmem_queued and sk_rmem_alloc, never less than 0.
 * Returns 0 when nothing is reserved.
 */
static inline int sk_unused_reserved_mem(const struct sock *sk)
{
        int left;

        if (likely(!sk->sk_reserved_mem))
                return 0;

        left = sk->sk_reserved_mem - sk->sk_wmem_queued -
               atomic_read(&sk->sk_rmem_alloc);
        if (left <= 0)
                return 0;

        return left;
}

/*
 * Hand forward-allocated memory back through __sk_mem_reclaim().
 *
 * Nothing is done for protocols without memory accounting.  The
 * reclaimable amount is sk_forward_alloc minus the unused part of the
 * reserve, and it is only reclaimed once it reaches PAGE_SIZE.
 */
static inline void sk_mem_reclaim(struct sock *sk)
{
        int spare;

        if (!sk_has_account(sk))
                return;

        spare = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);
        if (spare < (int)PAGE_SIZE)
                return;

        __sk_mem_reclaim(sk, spare);
}

/*
 * Clear sk_reserved_mem, then run sk_mem_reclaim(); with the reserve
 * zeroed, sk_unused_reserved_mem() no longer holds anything back.
 */
static inline void sk_mem_reclaim_final(struct sock *sk)
{
        sk->sk_reserved_mem = 0;
        sk_mem_reclaim(sk);
}

/*
 * Charge @size bytes against sk_forward_alloc.
 * No-op for protocols without memory accounting.
 */
static inline void sk_mem_charge(struct sock *sk, int size)
{
        if (sk_has_account(sk))
                sk_forward_alloc_add(sk, -size);
}

/*
 * Give @size bytes back to sk_forward_alloc, then let sk_mem_reclaim()
 * release what is reclaimable.
 * No-op for protocols without memory accounting.
 */
static inline void sk_mem_uncharge(struct sock *sk, int size)
{
        if (sk_has_account(sk)) {
                sk_forward_alloc_add(sk, size);
                sk_mem_reclaim(sk);
        }
}

/*
 * sk->sk_owner tracking.  Real implementations exist only when both
 * CONFIG_PROVE_LOCKING and CONFIG_MODULES are enabled; otherwise all three
 * helpers are empty.
 */
#if IS_ENABLED(CONFIG_PROVE_LOCKING) && IS_ENABLED(CONFIG_MODULES)
/* Take a module reference on @owner and record it in sk->sk_owner. */
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
        __module_get(owner);
        sk->sk_owner = owner;
}

/* Forget the recorded owner; no module reference is dropped here. */
static inline void sk_owner_clear(struct sock *sk)
{
        sk->sk_owner = NULL;
}

/* Drop the module reference on sk->sk_owner; the pointer is left in place. */
static inline void sk_owner_put(struct sock *sk)
{
        module_put(sk->sk_owner);
}
#else
static inline void sk_owner_set(struct sock *sk, struct module *owner)
{
}

static inline void sk_owner_clear(struct sock *sk)
{
}

static inline void sk_owner_put(struct sock *sk)
{
}
#endif
/*
 * Initialize sk->sk_lock and assign its lockdep classes.
 *
 * Macro so as to not evaluate some arguments when
 * lockdep is not enabled.
 *
 * Mark both the sk_lock and the sk_lock.slock as a
 * per-address-family lock class.
 *
 * @sk:    socket whose sk_lock is initialized
 * @sname: lockdep class name for sk_lock.slock
 * @skey:  lockdep class key for sk_lock.slock
 * @name:  lockdep map name for sk_lock
 * @key:   lockdep map key for sk_lock
 *
 * @sk is parenthesized at every member access so that a compound
 * expression can be passed safely.
 */
#define sock_lock_init_class_and_name(sk, sname, skey, name, key)        \
do {                                                                        \
        sk_owner_set((sk), THIS_MODULE);                                \
        (sk)->sk_lock.owned = 0;                                        \
        init_waitqueue_head(&(sk)->sk_lock.wq);                                \
        spin_lock_init(&(sk)->sk_lock.slock);                                \
        debug_check_no_locks_freed((void *)&(sk)->sk_lock,                \
                                   sizeof((sk)->sk_lock));                \
        lockdep_set_class_and_name(&(sk)->sk_lock.slock,                \
                                   (skey), (sname));                        \
        lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0);        \
} while (0)

/* True if lockdep reports either sk->sk_lock or sk->sk_lock.slock as held. */
static inline bool lockdep_sock_is_held(const struct sock *sk)
{
        return lockdep_is_held(&sk->sk_lock) ||
               lockdep_is_held(&sk->sk_lock.slock);
}

/* Defined elsewhere; only declared here. */
void lock_sock_nested(struct sock *sk, int subclass);

/* Lock the socket: lock_sock_nested() with subclass 0. */
static inline void lock_sock(struct sock *sk)
{
        lock_sock_nested(sk, 0);
}

/* Socket lock primitives; defined elsewhere, only declared here. */
void __lock_sock(struct sock *sk);
void __release_sock(struct sock *sk);
void release_sock(struct sock *sk);

/*
 * BH context may only use the following locking interface.
 * These take/release the sk_lock.slock spinlock directly.
 */
#define bh_lock_sock(__sk)        spin_lock(&((__sk)->sk_lock.slock))
#define bh_lock_sock_nested(__sk) \
                                spin_lock_nested(&((__sk)->sk_lock.slock), \
                                SINGLE_DEPTH_NESTING)
#define bh_unlock_sock(__sk)        spin_unlock(&((__sk)->sk_lock.slock))

/* Worker behind lock_sock_fast() and lock_sock_fast_nested(); defined elsewhere. */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Returns true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 *
 * The returned value must be passed to unlock_sock_fast().
 */
static inline bool lock_sock_fast(struct sock *sk)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);

        return __lock_sock_fast(sk);
}

/*
 * Fast socket lock variant for a caller already holding a [different]
 * socket lock.  Same as lock_sock_fast(), except that the lockdep
 * acquisition is annotated with SINGLE_DEPTH_NESTING.
 */
static inline bool lock_sock_fast_nested(struct sock *sk)
{
        mutex_acquire(&sk->sk_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

        return __lock_sock_fast(sk);
}

/**
 * unlock_sock_fast - complement of lock_sock_fast
 * @sk: socket
 * @slow: slow mode, i.e. the value returned by lock_sock_fast()
 *
 * Fast unlock of a socket for user context.
 * If slow mode is on, we call the regular release_sock(); otherwise the
 * lockdep map is released and sk_lock.slock is unlocked with BH re-enabled.
 */
static inline void unlock_sock_fast(struct sock *sk, bool slow)
        __releases(&sk->sk_lock.slock)
{
        if (slow) {
                release_sock(sk);
                /*
                 * NOTE(review): presumably annotation only, to balance the
                 * __releases() declaration above on this path -- confirm.
                 */
                __release(&sk->sk_lock.slock);
        } else {
                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
                spin_unlock_bh(&sk->sk_lock.slock);
        }
}

/* sockopt locking and capability helpers; defined elsewhere, only declared here. */
void sockopt_lock_sock(struct sock *sk);
void sockopt_release_sock(struct sock *sk);
bool sockopt_ns_capable(struct user_namespace *ns, int cap);
bool sockopt_capable(int cap);

/* Used by processes to "lock" a socket state, so that
 * interrupts and bottom half handlers won't change it
 * from under us. It essentially blocks any incoming
 * packets, so that we won't get any new data or any
 * packets that change the state of the socket.
 *
 * While locked, BH processing will add new packets to
 * the backlog queue.  This queue is processed by the
 * owner of the socket lock right before it is released.
 *
 * Since ~2.3.5 it is also an exclusive sleep lock serializing
 * accesses from user process context.
 */

/*
 * Debug assertion that the socket lock is held.  With CONFIG_LOCKDEP it
 * warns once if lockdep does not see the lock held (and debug_locks is
 * set); without CONFIG_LOCKDEP it does nothing.
 */
static inline void sock_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/*
 * Debug assertion that the socket lock is NOT held.  With CONFIG_LOCKDEP
 * it warns once if lockdep sees the lock held (and debug_locks is set);
 * without CONFIG_LOCKDEP it does nothing.
 */
static inline void sock_not_owned_by_me(const struct sock *sk)
{
#ifdef CONFIG_LOCKDEP
        WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks);
#endif
}

/*
 * Return sk->sk_lock.owned, after running the sock_owned_by_me()
 * lockdep check.
 */
static inline bool sock_owned_by_user(const struct sock *sk)
{
        sock_owned_by_me(sk);
        return sk->sk_lock.owned;
}

/* Return sk->sk_lock.owned without any lockdep check. */
static inline bool sock_owned_by_user_nocheck(const struct sock *sk)
{
        return sk->sk_lock.owned;
}

/*
 * Clear sk->sk_lock.owned and release the lockdep map of sk_lock.
 * A debug warning fires if the socket was not marked as owned.
 */
static inline void sock_release_ownership(struct sock *sk)
{
        DEBUG_NET_WARN_ON_ONCE(!sock_owned_by_user_nocheck(sk));
        sk->sk_lock.owned = 0;

        /* The sk_lock has mutex_unlock() semantics: */
        mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
}

/* no reclassification while locks are held */
static inline bool sock_allow_reclassification(const struct sock *csk)
{
        struct sock *sk = (struct sock *)csk;

        return !sock_owned_by_user_nocheck(sk) &&
                !spin_is_locked(&sk->sk_lock.slock);
}

/* Socket allocation, cloning and destruction; defined elsewhere. */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern);
void sk_free(struct sock *sk);
void sk_net_refcnt_upgrade(struct sock *sk);
void sk_destruct(struct sock *sk);
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority);
void sk_free_unlock_clone(struct sock *sk);

/* skb allocation helpers and skb destructors; defined elsewhere. */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority);
void __sock_wfree(struct sk_buff *skb);
void sock_wfree(struct sk_buff *skb);
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority);
void skb_orphan_partial(struct sk_buff *skb);
void sock_rfree(struct sk_buff *skb);
void sock_efree(struct sk_buff *skb);
#ifdef CONFIG_INET
/* skb destructors; defined elsewhere. */
void sock_edemux(struct sk_buff *skb);
void sock_pfree(struct sk_buff *skb);

/*
 * Orphan @skb, then attach it to @sk with sock_edemux() as destructor.
 * The attach only happens if a reference on @sk could be taken, i.e. its
 * refcount was not already zero; otherwise the skb stays orphaned.
 */
static inline void skb_set_owner_edemux(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        if (refcount_inc_not_zero(&sk->sk_refcnt)) {
                skb->sk = sk;
                skb->destructor = sock_edemux;
        }
}
#else
/* Without CONFIG_INET, sock_edemux is an alias of sock_efree. */
#define sock_edemux sock_efree
#endif

int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen);
int sock_setsockopt(struct socket *sock, int level, int op,
                    sockptr_t optval, unsigned int optlen);
int do_sock_setsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, int optlen);
int do_sock_getsockopt(struct socket *sock, bool compat, int level,
                       int optname, sockptr_t optval, sockptr_t optlen);

int sk_getsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, sockptr_t optlen);
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32);
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order);

/* Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a max page order of 0.
 */
static inline struct sk_buff *sock_alloc_send_skb(struct sock *sk,
                                                  unsigned long size,
                                                  int noblock, int *errcode)
{
        const unsigned long data_len = 0;
        const int max_page_order = 0;

        return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
                                    max_page_order);
}

void *sock_kmalloc(struct sock *sk, int size, gfp_t priority);
void *sock_kmemdup(struct sock *sk, const void *src,
                   int size, gfp_t priority);
void sock_kfree_s(struct sock *sk, void *mem, int size);
void sock_kzfree_s(struct sock *sk, void *mem, int size);
void sk_send_sigurg(struct sock *sk);

static inline void sock_replace_proto(struct sock *sk, struct proto *proto)
{
        if (sk->sk_socket)
                clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
        WRITE_ONCE(sk->sk_prot, proto);
}

/* Per-call control-message state.  sockcm_init() seeds mark, tsflags and
 * priority from the socket and leaves the other fields zeroed.
 */
struct sockcm_cookie {
        u64 transmit_time;
        u32 mark;               /* seeded from sk->sk_mark */
        u32 tsflags;            /* seeded from sk->sk_tsflags */
        u32 ts_opt_id;          /* tskey value used when SOCKCM_FLAG_TS_OPT_ID is set in tsflags */
        u32 priority;           /* seeded from sk->sk_priority */
};

static inline void sockcm_init(struct sockcm_cookie *sockc,
                               const struct sock *sk)
{
        *sockc = (struct sockcm_cookie) {
                .mark = READ_ONCE(sk->sk_mark),
                .tsflags = READ_ONCE(sk->sk_tsflags),
                .priority = READ_ONCE(sk->sk_priority),
        };
}

int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc);
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * does not implement a particular function.
 */
int sock_no_bind(struct socket *, struct sockaddr *, int);
int sock_no_connect(struct socket *, struct sockaddr *, int, int);
int sock_no_socketpair(struct socket *, struct socket *);
int sock_no_accept(struct socket *, struct socket *, struct proto_accept_arg *);
int sock_no_getname(struct socket *, struct sockaddr *, int);
int sock_no_ioctl(struct socket *, unsigned int, unsigned long);
int sock_no_listen(struct socket *, int);
int sock_no_shutdown(struct socket *, int);
int sock_no_sendmsg(struct socket *, struct msghdr *, size_t);
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len);
int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int);
int sock_no_mmap(struct file *file, struct socket *sock,
                 struct vm_area_struct *vma);

/*
 * Functions to fill in entries in struct proto_ops when a protocol
 * uses the inet style.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                                  char __user *optval, int __user *optlen);
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags);
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen);

void sk_common_release(struct sock *sk);

/*
 *        Default socket callbacks and setup code
 */

/* Initialise core socket variables using an explicit uid. */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid);

/* Initialise core socket variables.
 * Assumes struct socket *sock is embedded in a struct socket_alloc.
 */
void sock_init_data(struct socket *sock, struct sock *sk);

/*
 * Socket reference counting postulates.
 *
 * * Each user of socket SHOULD hold a reference count.
 * * Each access point to socket (a hash table bucket, reference from a list,
 *   running timer, skb in flight) MUST hold a reference count.
 * * When reference count hits 0, it means it will never increase back.
 * * When reference count hits 0, it means that no references from
 *   outside exist to this socket and current process on current CPU
 *   is last user and may/should destroy this socket.
 * * sk_free is called from any context: process, BH, IRQ. When
 *   it is called, socket has no references from outside -> sk_free
 *   may release descendant resources allocated by the socket, but
 *   to the time when it is called, socket is NOT referenced by any
 *   hash tables, lists etc.
 * * Packets, delivered from outside (from network or from another process)
 *   and enqueued on receive/error queues SHOULD NOT grab reference count,
 *   when they sit in queue. Otherwise, packets will leak to hole, when
 *   socket is looked up by one CPU and unhashing is done by another CPU.
 *   It is true for udp/raw, netlink (leak to receive and error queues), tcp
 *   (leak to backlog). Packet socket does all the processing inside
 *   BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets
 *   use separate SMP lock, so that they are prone too.
 */

/* Drop one reference on @sk; the socket is freed via sk_free() when this
 * was the last reference.
 */
static inline void sock_put(struct sock *sk)
{
        if (!refcount_dec_and_test(&sk->sk_refcnt))
                return;

        sk_free(sk);
}
/* Generic version of sock_put(), dealing with all sockets
 * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...)
 */
void sock_gen_put(struct sock *sk);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested,
                     unsigned int trim_cap, bool refcounted);
static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                                 const int nested)
{
        return __sk_receive_skb(sk, skb, nested, 1, true);
}

/* Record @tx_queue as the transmit queue mapping of @sk. */
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
        /* sk_tx_queue_mapping accepts only up to a 16-bit value; USHRT_MAX
         * itself is refused too, since it is the NO_QUEUE_MAPPING sentinel.
         */
        if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
                return;
        /* Paired with READ_ONCE() in sk_tx_queue_get() and
         * other WRITE_ONCE() because socket lock might be not held.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, tx_queue);
}

/* Sentinel stored in sk_tx_queue_mapping / sk_rx_queue_mapping when no
 * queue is recorded; the *_queue_get() helpers translate it to -1.
 */
#define NO_QUEUE_MAPPING        USHRT_MAX

/* Forget the recorded transmit queue of @sk. */
static inline void sk_tx_queue_clear(struct sock *sk)
{
        const unsigned short unmapped = NO_QUEUE_MAPPING;

        /* The socket lock might not be held: this store pairs with the
         * READ_ONCE() in sk_tx_queue_get() and with other WRITE_ONCE()s.
         */
        WRITE_ONCE(sk->sk_tx_queue_mapping, unmapped);
}

/* Return the recorded transmit queue of @sk, or -1 when @sk is NULL or
 * no queue is recorded.
 */
static inline int sk_tx_queue_get(const struct sock *sk)
{
        int val;

        if (!sk)
                return -1;

        /* Pairs with the WRITE_ONCE() in sk_tx_queue_set() and
         * sk_tx_queue_clear().
         */
        val = READ_ONCE(sk->sk_tx_queue_mapping);

        return val == NO_QUEUE_MAPPING ? -1 : val;
}

/* Copy the receive queue recorded in @skb into @sk.  With @force_set the
 * store is unconditional; otherwise it is skipped when the socket already
 * holds the same value.  No-op without CONFIG_SOCK_RX_QUEUE_MAPPING.
 */
static inline void __sk_rx_queue_set(struct sock *sk,
                                     const struct sk_buff *skb,
                                     bool force_set)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        u16 rx_queue;

        if (!skb_rx_queue_recorded(skb))
                return;

        rx_queue = skb_get_rx_queue(skb);

        if (!force_set &&
            likely(READ_ONCE(sk->sk_rx_queue_mapping) == rx_queue))
                return;

        WRITE_ONCE(sk->sk_rx_queue_mapping, rx_queue);
#endif
}

static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
        __sk_rx_queue_set(sk, skb, true);
}

static inline void sk_rx_queue_update(struct sock *sk, const struct sk_buff *skb)
{
        __sk_rx_queue_set(sk, skb, false);
}

/* Forget the recorded receive queue of @sk.  No-op without
 * CONFIG_SOCK_RX_QUEUE_MAPPING.
 */
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        const unsigned short unmapped = NO_QUEUE_MAPPING;

        WRITE_ONCE(sk->sk_rx_queue_mapping, unmapped);
#endif
}

/* Return the recorded receive queue of @sk, or -1 when @sk is NULL, no
 * queue is recorded, or CONFIG_SOCK_RX_QUEUE_MAPPING is disabled.
 */
static inline int sk_rx_queue_get(const struct sock *sk)
{
        int queue = -1;

#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
        if (sk) {
                int res = READ_ONCE(sk->sk_rx_queue_mapping);

                if (res != NO_QUEUE_MAPPING)
                        queue = res;
        }
#endif

        return queue;
}

/* Point @sk at its struct socket (@sock may be NULL, as sock_orphan() does). */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
        sk->sk_socket = sock;
}

/* Return the wait queue head embedded in the socket's socket_wq.
 *
 * The BUILD_BUG_ON() enforces that 'wait' sits at offset 0 of struct
 * socket_wq, so the returned address is numerically the sk_wq pointer.
 * The pointer is fetched with rcu_dereference_raw(), i.e. without any
 * lockdep checking.
 */
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
        BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
        return &rcu_dereference_raw(sk->sk_wq)->wait;
}
/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
 * Note that parent inode held reference count on this struct sock,
 * we do not release it in this function, because protocol
 * probably wants some additional cleanups or even continuing
 * to work with this socket (TCP).
 */
static inline void sock_orphan(struct sock *sk)
{
        /* All three updates happen under sk_callback_lock (write side, BH off). */
        write_lock_bh(&sk->sk_callback_lock);
        sock_set_flag(sk, SOCK_DEAD);
        sk_set_socket(sk, NULL);
        sk->sk_wq  = NULL;
        write_unlock_bh(&sk->sk_callback_lock);
}

/* Attach @sk to the struct socket @parent.
 *
 * Warns if @parent already has a sock.  Under sk_callback_lock (write
 * side, BH off) the sock's wait queue pointer is published to point at
 * the parent's wq, the two objects are linked to each other, sk_uid is
 * taken from the parent's inode, and the security hook is invoked.
 */
static inline void sock_graft(struct sock *sk, struct socket *parent)
{
        WARN_ON(parent->sk);
        write_lock_bh(&sk->sk_callback_lock);
        rcu_assign_pointer(sk->sk_wq, &parent->wq);
        parent->sk = sk;
        sk_set_socket(sk, parent);
        sk->sk_uid = SOCK_INODE(parent)->i_uid;
        security_sock_graft(sk, parent);
        write_unlock_bh(&sk->sk_callback_lock);
}

kuid_t sock_i_uid(struct sock *sk);
unsigned long __sock_i_ino(struct sock *sk);
unsigned long sock_i_ino(struct sock *sk);

/* Return the uid of @sk, or uid 0 mapped through @net's user namespace
 * when there is no socket.
 */
static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk)
{
        if (!sk)
                return make_kuid(net->user_ns, 0);

        return sk->sk_uid;
}

/* Return a random 32-bit hash that is never zero (0 is mapped to 1). */
static inline u32 net_tx_rndhash(void)
{
        u32 hash = get_random_u32();

        if (!hash)
                hash = 1;

        return hash;
}

static inline void sk_set_txhash(struct sock *sk)
{
        /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */
        WRITE_ONCE(sk->sk_txhash, net_tx_rndhash());
}

/* Pick a new transmit hash for @sk if it currently has one and
 * rehashing is enabled on the socket.
 *
 * Return: true if the hash was replaced, false otherwise.
 */
static inline bool sk_rethink_txhash(struct sock *sk)
{
        if (!sk->sk_txhash)
                return false;

        if (sk->sk_txrehash != SOCK_TXREHASH_ENABLED)
                return false;

        sk_set_txhash(sk);
        return true;
}

/* Fetch the cached dst of @sk without taking a reference.  The
 * dereference is checked against lockdep_sock_is_held(sk).
 */
static inline struct dst_entry *
__sk_dst_get(const struct sock *sk)
{
        struct dst_entry *dst;

        dst = rcu_dereference_check(sk->sk_dst_cache,
                                    lockdep_sock_is_held(sk));
        return dst;
}

/* Fetch the cached dst of @sk and take a reference on it.
 *
 * The lookup and the reference grab both happen inside an RCU read-side
 * section.
 *
 * Return: the referenced dst, or NULL when nothing is cached or
 * rcuref_get() failed on the cached entry.
 */
static inline struct dst_entry *
sk_dst_get(const struct sock *sk)
{
        struct dst_entry *dst;

        rcu_read_lock();
        dst = rcu_dereference(sk->sk_dst_cache);
        /* A failed rcuref_get() is reported to the caller as "no dst". */
        if (dst && !rcuref_get(&dst->__rcuref))
                dst = NULL;
        rcu_read_unlock();
        return dst;
}

/* Invoke the negative_advice() op of the socket's cached dst, when both
 * the dst and the op exist.
 */
static inline void __dst_negative_advice(struct sock *sk)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        if (!dst || !dst->ops->negative_advice)
                return;

        dst->ops->negative_advice(sk, dst);
}

/* Possibly pick a new transmit hash for @sk (see sk_rethink_txhash()),
 * then pass negative advice to the cached dst.
 */
static inline void dst_negative_advice(struct sock *sk)
{
        sk_rethink_txhash(sk);
        __dst_negative_advice(sk);
}

/* Replace the cached dst of @sk with @dst (may be NULL).
 *
 * The old pointer is read with rcu_dereference_protected() against
 * lockdep_sock_is_held(sk).  The tx queue mapping and the pending-confirm
 * flag are reset first; the previous dst is released after the new one
 * has been published.
 */
static inline void
__sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = rcu_dereference_protected(sk->sk_dst_cache,
                                            lockdep_sock_is_held(sk));
        rcu_assign_pointer(sk->sk_dst_cache, dst);
        dst_release(old_dst);
}

/* Replace the cached dst of @sk with @dst (may be NULL), swapping the
 * pointer with xchg() rather than relying on a lockdep-checked
 * dereference as __sk_dst_set() does.
 *
 * The tx queue mapping and the pending-confirm flag are reset first; the
 * previous dst is released after the swap.
 */
static inline void
sk_dst_set(struct sock *sk, struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        old_dst = unrcu_pointer(xchg(&sk->sk_dst_cache, RCU_INITIALIZER(dst)));
        dst_release(old_dst);
}

/* Drop the cached dst of @sk via __sk_dst_set(sk, NULL). */
static inline void
__sk_dst_reset(struct sock *sk)
{
        __sk_dst_set(sk, NULL);
}

/* Drop the cached dst of @sk via sk_dst_set(sk, NULL). */
static inline void
sk_dst_reset(struct sock *sk)
{
        sk_dst_set(sk, NULL);
}

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie);

/* Mark a dst confirmation as pending on @sk.  The flag is only written
 * when it is not already set.
 */
static inline void sk_dst_confirm(struct sock *sk)
{
        if (READ_ONCE(sk->sk_dst_pending_confirm))
                return;

        WRITE_ONCE(sk->sk_dst_pending_confirm, 1);
}

/* If @skb carries a pending dst confirmation, clear the pending flag on
 * its socket (when there is one and the flag is set) and confirm
 * neighbour @n.
 */
static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n)
{
        struct sock *sk;

        if (!skb_get_dst_pending_confirm(skb))
                return;

        sk = skb->sk;
        if (sk && READ_ONCE(sk->sk_dst_pending_confirm))
                WRITE_ONCE(sk->sk_dst_pending_confirm, 0);

        neigh_confirm(n);
}

bool sk_mc_loop(const struct sock *sk);

/* Return the result of net_gso_ok() for the socket's route capabilities
 * and its GSO type.
 */
static inline bool sk_can_gso(const struct sock *sk)
{
        return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type);
}

void sk_setup_caps(struct sock *sk, struct dst_entry *dst);

/* Disable GSO on @sk: set the sk_gso_disabled marker and strip every
 * NETIF_F_GSO_MASK bit from the socket's route capabilities.
 */
static inline void sk_gso_disable(struct sock *sk)
{
        sk->sk_gso_disabled = 1;
        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}

/* Copy @copy bytes from @from to @to on behalf of @skb.
 *
 * Three strategies, chosen in this order:
 *  - skb has CHECKSUM_NONE: copy while checksumming and fold the partial
 *    sum into skb->csum at @offset;
 *  - the route advertises NETIF_F_NOCACHE_COPY: use the nocache copy;
 *  - otherwise: plain copy.
 *
 * Return: 0 on success, -EFAULT if the full copy could not be done.
 */
static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb,
                                           struct iov_iter *from, char *to,
                                           int copy, int offset)
{
        if (skb->ip_summed == CHECKSUM_NONE) {
                __wsum csum = 0;

                if (!csum_and_copy_from_iter_full(to, copy, &csum, from))
                        return -EFAULT;

                skb->csum = csum_block_add(skb->csum, csum, offset);
                return 0;
        }

        if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) {
                if (!copy_from_iter_full_nocache(to, copy, from))
                        return -EFAULT;

                return 0;
        }

        if (!copy_from_iter_full(to, copy, from))
                return -EFAULT;

        return 0;
}

/* Append @copy bytes from @from to the linear area of @skb.  On failure
 * the skb is trimmed back to the length it had on entry.
 *
 * Return: 0 on success or the error from skb_do_copy_data_nocache().
 */
static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb,
                                       struct iov_iter *from, int copy)
{
        int offset = skb->len;          /* length before skb_put() grows it */
        char *to = skb_put(skb, copy);
        int err;

        err = skb_do_copy_data_nocache(sk, skb, from, to, copy, offset);
        if (err)
                __skb_trim(skb, offset);

        return err;
}

/* Copy @copy bytes from @from into @page at offset @off on behalf of
 * @skb.  On success the skb length, the socket's queued write memory and
 * its memory charge are all increased by @copy.
 *
 * Return: 0 on success or the error from skb_do_copy_data_nocache(), in
 * which case no accounting is touched.
 */
static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from,
                                           struct sk_buff *skb,
                                           struct page *page,
                                           int off, int copy)
{
        char *to = page_address(page) + off;
        int err;

        err = skb_do_copy_data_nocache(sk, skb, from, to, copy, skb->len);
        if (err)
                return err;

        skb_len_add(skb, copy);
        sk_wmem_queued_add(sk, copy);
        sk_mem_charge(sk, copy);

        return 0;
}

/**
 * sk_wmem_alloc_get - returns write allocations
 * @sk: socket
 *
 * Return: the current sk_wmem_alloc value with its initial offset of one
 * subtracted
 */
static inline int sk_wmem_alloc_get(const struct sock *sk)
{
        unsigned int wmem = refcount_read(&sk->sk_wmem_alloc);

        return wmem - 1;
}

/**
 * sk_rmem_alloc_get - returns read allocations
 * @sk: socket
 *
 * Return: the current value of sk_rmem_alloc
 */
static inline int sk_rmem_alloc_get(const struct sock *sk)
{
        int rmem = atomic_read(&sk->sk_rmem_alloc);

        return rmem;
}

/**
 * sk_has_allocations - check if allocations are outstanding
 * @sk: socket
 *
 * Return: true if socket has write or read allocations
 */
static inline bool sk_has_allocations(const struct sock *sk)
{
        return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk);
}

/**
 * skwq_has_sleeper - check if there are any waiting processes
 * @wq: struct socket_wq
 *
 * Return: true if socket_wq has waiting processes
 *
 * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths::
 *
 *   CPU1                CPU2
 *   sys_select          receive packet
 *   ...                 ...
 *   __add_wait_queue    update tp->rcv_nxt
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
 * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay
 * in its cache, and so does the tp->rcv_nxt update on CPU2 side.  CPU1
 * could then end up calling schedule and sleep forever if there is no more
 * data on the socket.
 *
 */
static inline bool skwq_has_sleeper(struct socket_wq *wq)
{
        return wq && wq_has_sleeper(&wq->wait);
}

/**
 * sock_poll_wait - wrapper for the poll_wait call.
 * @filp:           file
 * @sock:           socket to wait on
 * @p:              poll_table
 *
 * Registers the wait queue embedded in @sock (sock->wq.wait) with the
 * poll table.
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp, struct socket *sock,
                                  poll_table *p)
{
        /* Provides a barrier we need to be sure we are in sync
         * with the socket flags modification.
         *
         * This memory barrier is paired in the wq_has_sleeper.
         *
         * NOTE(review): no barrier is issued explicitly here; presumably
         * poll_wait() itself provides it -- confirm against its definition.
         */
        poll_wait(filp, &sock->wq.wait, p);
}

/* Copy the socket's transmit hash into @skb and flag it as an L4 hash.
 * Nothing is done when the socket's hash is zero.
 */
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk)
{
        /* Pairs with the WRITE_ONCE() in sk_set_txhash(). */
        u32 txhash = READ_ONCE(sk->sk_txhash);

        if (!txhash)
                return;

        skb->l4_hash = 1;
        skb->hash = txhash;
}

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk);

/*
 *        Queue a received datagram if it will fit. Stream and sequenced
 *        protocols can't normally use this as they need to fit buffers in
 *        and play with them.
 *
 *        Inlined as it's very short and called for pretty much every
 *        packet ever received.
 */
/* Make @sk the receive-side owner of @skb: the skb is orphaned first,
 * then attached to @sk with sock_rfree() as destructor, and its truesize
 * is added to sk_rmem_alloc and charged to the socket via sk_mem_charge().
 */
static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rfree;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        sk_mem_charge(sk, skb->truesize);
}

/* Try to make @sk the owner of @skb, with sock_efree() as destructor.
 *
 * Return: true if a reference on @sk was taken and the skb was attached;
 * false (skb untouched) when @sk is NULL or its refcount already hit zero.
 */
static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk)
{
        if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
                return false;

        skb_orphan(skb);
        skb->destructor = sock_efree;
        skb->sk = sk;

        return true;
}

/* Clone @skb and make @sk the receive-side owner of the clone.
 *
 * Return: the clone, or NULL if cloning failed or receive memory for the
 * clone could not be scheduled (the clone is freed in that case).
 */
static inline struct sk_buff *skb_clone_and_charge_r(struct sk_buff *skb, struct sock *sk)
{
        struct sk_buff *clone;

        clone = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
        if (!clone)
                return NULL;

        if (!sk_rmem_schedule(sk, clone, clone->truesize)) {
                __kfree_skb(clone);
                return NULL;
        }

        skb_set_owner_r(clone, sk);
        return clone;
}

/* Prepare @skb for GRO: an skb whose destructor is sock_wfree() is
 * flagged slow_gro; any other skb is orphaned.
 */
static inline void skb_prepare_for_gro(struct sk_buff *skb)
{
        if (skb->destructor == sock_wfree)
                skb->slow_gro = 1;
        else
                skb_orphan(skb);
}

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
                    unsigned long expires);

void sk_stop_timer(struct sock *sk, struct timer_list *timer);

void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer);

int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
                        struct sk_buff *skb, unsigned int flags,
                        void (*destructor)(struct sock *sk,
                                           struct sk_buff *skb));
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                              enum skb_drop_reason *reason);

/* sock_queue_rcv_skb_reason() for callers that do not want a drop reason. */
static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        enum skb_drop_reason *no_reason = NULL;

        return sock_queue_rcv_skb_reason(sk, skb, no_reason);
}

int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb);
struct sk_buff *sock_dequeue_err_skb(struct sock *sk);

/*
 *        Recover an error report and clear atomically
 *
 *        Returns 0 when no error is pending, otherwise the negated value
 *        that was stored in sk_err (which is reset to 0 by the exchange).
 */

static inline int sock_error(struct sock *sk)
{
        int err;

        /* Avoid an atomic operation for the common case.
         * This is racy since another cpu/thread can change sk_err under us.
         */
        if (likely(data_race(!sk->sk_err)))
                return 0;

        /* Fetch-and-clear in one step. */
        err = xchg(&sk->sk_err, 0);
        return -err;
}

void sk_error_report(struct sock *sk);

/* Return sk_sndbuf minus the current sk_wmem_alloc value, clamped at 0.
 * Always 0 once the send side has been shut down.
 */
static inline unsigned long sock_wspace(struct sock *sk)
{
        int amt;

        if (sk->sk_shutdown & SEND_SHUTDOWN)
                return 0;

        amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc);

        return amt < 0 ? 0 : amt;
}

/* Note:
 *  We use sk->sk_wq_raw, from contexts knowing this
 *  pointer is not NULL and cannot disappear/change.
 */
static inline void sk_set_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        set_bit(nr, &sk->sk_wq_raw->flags);
}

static inline void sk_clear_bit(int nr, struct sock *sk)
{
        if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) &&
            !sock_flag(sk, SOCK_FASYNC))
                return;

        clear_bit(nr, &sk->sk_wq_raw->flags);
}

/* Call sock_wake_async() for @sk if SOCK_FASYNC is set.  The wait queue
 * pointer is dereferenced inside an RCU read-side section taken here.
 */
static inline void sk_wake_async(const struct sock *sk, int how, int band)
{
        if (!sock_flag(sk, SOCK_FASYNC))
                return;

        rcu_read_lock();
        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
        rcu_read_unlock();
}

/* Variant of sk_wake_async() that does not take rcu_read_lock() itself. */
static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band)
{
        if (likely(!sock_flag(sk, SOCK_FASYNC)))
                return;

        sock_wake_async(rcu_dereference(sk->sk_wq), how, band);
}

/* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might
 * need sizeof(sk_buff) + MTU + padding, unless the net driver performs copybreak.
 * Note: for send buffers, TCP works better if we can build two skbs at
 * minimum.
 */
#define TCP_SKB_MIN_TRUESIZE        (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff)))

#define SOCK_MIN_SNDBUF                (TCP_SKB_MIN_TRUESIZE * 2)
#define SOCK_MIN_RCVBUF                 TCP_SKB_MIN_TRUESIZE

/* Recompute sk_sndbuf unless SOCK_SNDBUF_LOCK is set in sk_userlocks.
 *
 * The new value is the smaller of the current sk_sndbuf and half of
 * sk_wmem_queued, raised to at least sk_unused_reserved_mem(sk) and then
 * to at least SOCK_MIN_SNDBUF.
 */
static inline void sk_stream_moderate_sndbuf(struct sock *sk)
{
        u32 val;

        if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
                return;

        val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1);
        val = max_t(u32, val, sk_unused_reserved_mem(sk));

        WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}

/**
 * sk_page_frag - return an appropriate page_frag
 * @sk: socket
 *
 * Use the per task page_frag instead of the per socket one for
 * optimization when we know that we're in process context and own
 * everything that's associated with %current.
 *
 * Both direct reclaim and page faults can nest inside other
 * socket operations and end up recursing into sk_page_frag()
 * while it's already in use: explicitly avoid task page_frag
 * when users disable sk_use_task_frag.
 *
 * Return: a per task page_frag if context allows that,
 * otherwise a per socket one.
 */
static inline struct page_frag *sk_page_frag(struct sock *sk)
{
        /* sk_use_task_frag selects the current task's frag over the socket's. */
        return sk->sk_use_task_frag ? &current->task_frag : &sk->sk_frag;
}

bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);

/*
 *        Default write policy as shown to user space via poll/select/SIGIO
 *
 *        The socket counts as writeable while its sk_wmem_alloc value is
 *        below half of sk_sndbuf.
 */
static inline bool sock_writeable(const struct sock *sk)
{
        return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1);
}

/* Return GFP_ATOMIC when in_softirq() is true, GFP_KERNEL otherwise. */
static inline gfp_t gfp_any(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/* Return GFP_ATOMIC when in_softirq() is true, GFP_KERNEL otherwise. */
static inline gfp_t gfp_memcg_charge(void)
{
        if (in_softirq())
                return GFP_ATOMIC;

        return GFP_KERNEL;
}

/* Return the receive timeout to use: 0 for a non-blocking call,
 * otherwise the socket's sk_rcvtimeo.
 */
static inline long sock_rcvtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_rcvtimeo;
}

/* Return the send timeout to use: 0 for a non-blocking call, otherwise
 * the socket's sk_sndtimeo.
 */
static inline long sock_sndtimeo(const struct sock *sk, bool noblock)
{
        if (noblock)
                return 0;

        return sk->sk_sndtimeo;
}

/* Return the receive low-water mark for a read of @len bytes: @len
 * itself when @waitall is set, else the smaller of sk_rcvlowat and @len.
 * A result of 0 is replaced by 1.
 */
static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len)
{
        int target;

        if (waitall)
                target = len;
        else
                target = min_t(int, READ_ONCE(sk->sk_rcvlowat), len);

        return target ? target : 1;
}

/* Alas, socket operations with a timeout are not restartable.
 * Compare this to poll().
 *
 * Return: -ERESTARTSYS when @timeo is MAX_SCHEDULE_TIMEOUT, -EINTR for
 * any other timeout.
 */
static inline int sock_intr_errno(long timeo)
{
        if (timeo == MAX_SCHEDULE_TIMEOUT)
                return -ERESTARTSYS;

        return -EINTR;
}

/* Socket-layer private data kept in skb->cb[]. */
struct sock_skb_cb {
        u32 dropcount;          /* filled in by sock_skb_set_dropcount() */
};

/* Store sock_skb_cb at the end of skb->cb[] so protocol families
 * using skb->cb[] would keep using it directly and utilize its
 * alignment guarantee.
 */
/* Byte offset of struct sock_skb_cb inside skb->cb[]. */
#define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \
                            sizeof(struct sock_skb_cb)))

/* Pointer to the struct sock_skb_cb stored in @__skb. */
#define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \
                            SOCK_SKB_CB_OFFSET))

/* Build-time check that @size bytes fit in cb[] before the sock_skb_cb area. */
#define sock_skb_cb_check_size(size) \
        BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET)

static inline void
sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb)
{
        SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
                                                atomic_read(&sk->sk_drops) : 0;
}

static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb)
{
        int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs);

        atomic_add(segs, &sk->sk_drops);
}

/* Read sk_stamp.
 *
 * When BITS_PER_LONG is 32 the read is done inside a sk_stamp_seq
 * sequence-lock retry loop; otherwise a single READ_ONCE() is used.
 */
static inline ktime_t sock_read_timestamp(struct sock *sk)
{
#if BITS_PER_LONG==32
        unsigned int seq;
        ktime_t kt;

        /* Retry until the read was not overlapped by a writer. */
        do {
                seq = read_seqbegin(&sk->sk_stamp_seq);
                kt = sk->sk_stamp;
        } while (read_seqretry(&sk->sk_stamp_seq, seq));

        return kt;
#else
        return READ_ONCE(sk->sk_stamp);
#endif
}

/* Store @kt in sk_stamp.
 *
 * When BITS_PER_LONG is 32 the store is done under the sk_stamp_seq
 * sequence lock (pairs with sock_read_timestamp()); otherwise a single
 * WRITE_ONCE() is used.
 */
static inline void sock_write_timestamp(struct sock *sk, ktime_t kt)
{
#if BITS_PER_LONG==32
        write_seqlock(&sk->sk_stamp_seq);
        sk->sk_stamp = kt;
        write_sequnlock(&sk->sk_stamp_seq);
#else
        WRITE_ONCE(sk->sk_stamp, kt);
#endif
}

void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
                           struct sk_buff *skb);
void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
                             struct sk_buff *skb);

static inline void
sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
{
        struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
        u32 tsflags = READ_ONCE(sk->sk_tsflags);
        ktime_t kt = skb->tstamp;
        /*
         * generate control messages if
         * - receive time stamping in software requested
         * - software time stamp available and wanted
         * - hardware time stamps available and wanted
         */
        if (sock_flag(sk, SOCK_RCVTSTAMP) ||
            (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
            (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
            (hwtstamps->hwtstamp &&
             (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
                __sock_recv_timestamp(msg, sk, skb);
        else
                sock_write_timestamp(sk, kt);

        if (sock_flag(sk, SOCK_WIFI_STATUS) && skb_wifi_acked_valid(skb))
                __sock_recv_wifi_status(msg, sk, skb);
}

void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                       struct sk_buff *skb);

#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC)
/* Fast-path dispatcher for receive-side control messages.
 *
 * - If any flag in FLAGS_RECV_CMSGS is set on the socket, the full
 *   __sock_recv_cmsgs() handler runs.
 * - Else, if SOCK_TIMESTAMP is set, the skb timestamp is recorded in the
 *   socket.
 * - Else, if the socket's stamp still equals SK_DEFAULT_STAMP, it is
 *   reset to 0.
 */
static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
                                   struct sk_buff *skb)
{
/* Socket flags that require the full handler. */
#define FLAGS_RECV_CMSGS ((1UL << SOCK_RXQ_OVFL)                        | \
                           (1UL << SOCK_RCVTSTAMP)                        | \
                           (1UL << SOCK_RCVMARK)                        | \
                           (1UL << SOCK_RCVPRIORITY)                        | \
                           (1UL << SOCK_TIMESTAMPING_ANY))
/* NOTE(review): TSFLAGS_ANY is not referenced in this function body;
 * presumably it is consumed elsewhere -- confirm.
 */
#define TSFLAGS_ANY          (SOF_TIMESTAMPING_SOFTWARE                        | \
                           SOF_TIMESTAMPING_RAW_HARDWARE)

        if (READ_ONCE(sk->sk_flags) & FLAGS_RECV_CMSGS)
                __sock_recv_cmsgs(msg, sk, skb);
        else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
                sock_write_timestamp(sk, skb->tstamp);
        else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP))
                sock_write_timestamp(sk, 0);
}

void __sock_tx_timestamp(__u32 tsflags, __u8 *tx_flags);

/**
 * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped
 * @sk:                socket sending this packet
 * @sockc:        pointer to socket cmsg cookie to get timestamping info
 * @tx_flags:        completed with instructions for time stamping
 * @tskey:      filled in with next sk_tskey (not for TCP, which uses seqno)
 *
 * Note: callers should take care of initial ``*tx_flags`` value (usually 0)
 */
static inline void _sock_tx_timestamp(struct sock *sk,
                                      const struct sockcm_cookie *sockc,
                                      __u8 *tx_flags, __u32 *tskey)
{
        __u32 tsflags = sockc->tsflags;

        if (unlikely(tsflags)) {
                bool wants_id;

                __sock_tx_timestamp(tsflags, tx_flags);

                /* A key is only produced when OPT_ID is requested together
                 * with at least one TX record flag, and the caller supplied
                 * somewhere to store it.
                 */
                wants_id = (tsflags & SOF_TIMESTAMPING_OPT_ID) &&
                           (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK);

                if (tskey && wants_id) {
                        /* Caller-provided id wins over the socket counter. */
                        if (tsflags & SOCKCM_FLAG_TS_OPT_ID)
                                *tskey = sockc->ts_opt_id;
                        else
                                *tskey = atomic_inc_return(&sk->sk_tskey) - 1;
                }
        }

        if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS)))
                *tx_flags |= SKBTX_WIFI_STATUS;
}

/* _sock_tx_timestamp() for callers that do not need a timestamp key. */
static inline void sock_tx_timestamp(struct sock *sk,
                                     const struct sockcm_cookie *sockc,
                                     __u8 *tx_flags)
{
        __u32 *no_tskey = NULL;

        _sock_tx_timestamp(sk, sockc, tx_flags, no_tskey);
}

/* Run _sock_tx_timestamp() for the socket that owns @skb, writing the
 * resulting flags and key straight into the skb's shared info
 * (tx_flags and tskey).
 */
static inline void skb_setup_tx_timestamp(struct sk_buff *skb,
                                          const struct sockcm_cookie *sockc)
{
        _sock_tx_timestamp(skb->sk, sockc, &skb_shinfo(skb)->tx_flags,
                           &skb_shinfo(skb)->tskey);
}

static inline bool sk_is_inet(const struct sock *sk)
{
        int family = READ_ONCE(sk->sk_family);

        return family == AF_INET || family == AF_INET6;
}

static inline bool sk_is_tcp(const struct sock *sk)
{
        return sk_is_inet(sk) &&
               sk->sk_type == SOCK_STREAM &&
               sk->sk_protocol == IPPROTO_TCP;
}

static inline bool sk_is_udp(const struct sock *sk)
{
        return sk_is_inet(sk) &&
               sk->sk_type == SOCK_DGRAM &&
               sk->sk_protocol == IPPROTO_UDP;
}

static inline bool sk_is_stream_unix(const struct sock *sk)
{
        return sk->sk_family == AF_UNIX && sk->sk_type == SOCK_STREAM;
}

/* Return true when the socket family is AF_VSOCK. */
static inline bool sk_is_vsock(const struct sock *sk)
{
        return sk->sk_family == AF_VSOCK;
}

/**
 * sk_eat_skb - Release a skb if it is no longer needed
 * @sk: socket to eat this skb from
 * @skb: socket buffer to eat
 *
 * Unlinks @skb from the socket's receive queue and frees it.
 *
 * This routine must be called with interrupts disabled or with the socket
 * locked so that the sk_buff queue operation is ok.
 */
static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb)
{
        __skb_unlink(skb, &sk->sk_receive_queue);
        __kfree_skb(skb);
}

/* True when the skb destructor is sock_pfree.
 * Always false when CONFIG_INET is not set.
 */
static inline bool
skb_sk_is_prefetched(struct sk_buff *skb)
{
#ifdef CONFIG_INET
        if (skb->destructor == sock_pfree)
                return true;
#endif /* CONFIG_INET */
        return false;
}

/* Tell whether @sk is a full socket, i.e. its state is neither
 * TIME_WAIT nor NEW_SYN_RECV (timewait and request sockets).
 */
static inline bool sk_fullsock(const struct sock *sk)
{
        const int state_bit = 1 << sk->sk_state;
        const int mini_states = TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV;

        return (state_bit & ~mini_states) != 0;
}

static inline bool
sk_is_refcounted(struct sock *sk)
{
        /* Only full sockets have sk->sk_flags. */
        return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE);
}

/* Let the owning socket validate @skb before it is transmitted on @dev.
 *
 * With CONFIG_SOCK_VALIDATE_XMIT, a full socket that provides an
 * sk_validate_xmit_skb() hook decides what gets sent (it may return a
 * different skb or NULL). Otherwise the decrypted mark is checked, since
 * skb_orphan() may have cleared the socket: such an skb is dropped with
 * a rate-limited warning and NULL is returned.
 *
 * Without that config option the skb is returned unchanged.
 */
static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
                                                   struct net_device *dev)
{
#ifdef CONFIG_SOCK_VALIDATE_XMIT
        struct sock *owner = skb->sk;

        if (owner && sk_fullsock(owner) && owner->sk_validate_xmit_skb)
                return owner->sk_validate_xmit_skb(owner, dev, skb);

        if (unlikely(skb_is_decrypted(skb))) {
                pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
                kfree_skb(skb);
                return NULL;
        }
#endif

        return skb;
}

/* Tell whether @sk is in LISTEN or NEW_SYN_RECV state.
 * SYNACK messages can be attached to either kind (depending on SYNCOOKIE).
 */
static inline bool sk_listener(const struct sock *sk)
{
        const int listener_states = TCPF_LISTEN | TCPF_NEW_SYN_RECV;

        return ((1 << sk->sk_state) & listener_states) != 0;
}

/* Tell whether @sk is in LISTEN, NEW_SYN_RECV or TIME_WAIT state.
 * TCP SYNACK messages can be attached to LISTEN or NEW_SYN_RECV sockets
 * (depending on SYNCOOKIE); TCP RST and ACK can be attached to TIME_WAIT.
 */
static inline bool sk_listener_or_tw(const struct sock *sk)
{
        const int state_bit = 1 << READ_ONCE(sk->sk_state);

        return (state_bit &
                (TCPF_LISTEN | TCPF_NEW_SYN_RECV | TCPF_TIME_WAIT)) != 0;
}

void sock_enable_timestamp(struct sock *sk, enum sock_flags flag);
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level,
                       int type);

bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap);
bool sk_capable(const struct sock *sk, int cap);
bool sk_net_capable(const struct sock *sk, int cap);

void sk_get_meminfo(const struct sock *sk, u32 *meminfo);

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS                256
#define _SK_MEM_OVERHEAD        SKB_TRUESIZE(256)
#define SK_WMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX                (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

extern __u32 sysctl_wmem_default;
extern __u32 sysctl_rmem_default;

#define SKB_FRAG_PAGE_ORDER        get_order(32768)
DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/* Read the wmem sysctl value that applies to @sk for protocol @proto.
 *
 * A non-zero proto->sysctl_wmem_offset means the value lives inside the
 * socket's struct net at that byte offset; otherwise it is read through
 * proto->sysctl_wmem.
 */
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
{
        int *wmem;

        if (proto->sysctl_wmem_offset)
                wmem = (void *)sock_net(sk) + proto->sysctl_wmem_offset;
        else
                wmem = proto->sysctl_wmem;

        return READ_ONCE(*wmem);
}

/* Read the rmem sysctl value that applies to @sk for protocol @proto.
 *
 * A non-zero proto->sysctl_rmem_offset means the value lives inside the
 * socket's struct net at that byte offset; otherwise it is read through
 * proto->sysctl_rmem.
 */
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
{
        int *rmem;

        if (proto->sysctl_rmem_offset)
                rmem = (void *)sock_net(sk) + proto->sysctl_rmem_offset;
        else
                rmem = proto->sysctl_rmem;

        return READ_ONCE(*rmem);
}

/* Update sk->sk_pacing_shift to @val.
 *
 * The default TCP Small Queue budget is ~1 ms of data (1sec >> 10).
 * Some wifi drivers need to tweak it to get more chunks and can call
 * this helper from their ndo_start_xmit().
 *
 * Nothing happens for a NULL socket, a non-full socket, or when the
 * stored value already equals @val.
 */
static inline void sk_pacing_shift_update(struct sock *sk, int val)
{
        if (!sk || !sk_fullsock(sk))
                return;

        if (READ_ONCE(sk->sk_pacing_shift) != val)
                WRITE_ONCE(sk->sk_pacing_shift, val);
}

/* Check whether device index @dif is acceptable for socket @sk.
 *
 * Returns true when the socket is not bound to a device, when it is
 * bound to @dif itself, or when it is bound to the L3 master device
 * that @dif is enslaved to. Returns false otherwise.
 */
static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif)
{
        const int bound = READ_ONCE(sk->sk_bound_dev_if);
        int master;

        if (!bound || bound == dif)
                return true;

        master = l3mdev_master_ifindex_by_index(sock_net(sk), dif);

        return master && master == bound;
}

void sock_def_readable(struct sock *sk);

int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk);
void sock_set_timestamp(struct sock *sk, int optname, bool valbool);
int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping);

void sock_enable_timestamps(struct sock *sk);
#if defined(CONFIG_CGROUP_BPF)
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op);
#else
/* No-op stub used when CONFIG_CGROUP_BPF is not enabled. */
static inline void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
}
#endif
void sock_no_linger(struct sock *sk);
void sock_set_keepalive(struct sock *sk);
void sock_set_priority(struct sock *sk, u32 priority);
void sock_set_rcvbuf(struct sock *sk, int val);
void sock_set_mark(struct sock *sk, u32 val);
void sock_set_reuseaddr(struct sock *sk);
void sock_set_reuseport(struct sock *sk);
void sock_set_sndtimeo(struct sock *sk, s64 secs);

int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len);

int sock_get_timeout(long timeo, void *optval, bool old_timeval);
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
                           sockptr_t optval, int optlen, bool old_timeval);

int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size);
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg);
/* Ask the protocol whether @sk is readable.
 * Returns false when the protocol has no sock_is_readable() hook.
 */
static inline bool sk_is_readable(struct sock *sk)
{
        if (!sk->sk_prot->sock_is_readable)
                return false;

        return sk->sk_prot->sock_is_readable(sk);
}
#endif        /* _SOCK_H */











































































































































































































































































































































































    3 











    3 






    3 









    3 
    3 































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                802.1Q VLAN
 *                Ethernet-type device handling.
 *
 * Authors:        Ben Greear <greearb@candelatech.com>
 *              Please send support related email to: netdev@vger.kernel.org
 *              VLAN Home Page: http://www.candelatech.com/~greear/vlan.html
 *
 * Fixes:
 *              Fix for packet capture - Nick Eggleston <nick@dccinc.com>;
 *                Add HW acceleration hooks - David S. Miller <davem@redhat.com>;
 *                Correct all the locking - David S. Miller <davem@redhat.com>;
 *                Use hash table for VLAN groups - David S. Miller <davem@redhat.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
#include <net/p8022.h>
#include <net/arp.h>
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <linux/uaccess.h>

#include <linux/if_vlan.h>
#include "vlan.h"
#include "vlanproc.h"

#define DRV_VERSION "1.8"

/* Global VLAN variables */

unsigned int vlan_net_id __read_mostly;

const char vlan_fullname[] = "802.1Q VLAN Support";
const char vlan_version[] = DRV_VERSION;

/* End of global variables definitions. */

/* Make sure the device-pointer array part that covers @vlan_id exists
 * in @vg for protocol @vlan_proto. Must be called with RTNL held.
 *
 * Returns 0 if the part already existed or was allocated, -EINVAL for
 * an unknown protocol and -ENOBUFS when the allocation fails.
 */
static int vlan_group_prealloc_vid(struct vlan_group *vg,
                                   __be16 vlan_proto, u16 vlan_id)
{
        const unsigned int part = vlan_id / VLAN_GROUP_ARRAY_PART_LEN;
        struct net_device **slots;
        int proto_idx;

        ASSERT_RTNL();

        proto_idx = vlan_proto_idx(vlan_proto);
        if (proto_idx < 0)
                return -EINVAL;

        /* Nothing to do when this part has been allocated before. */
        if (vg->vlan_devices_arrays[proto_idx][part])
                return 0;

        slots = kzalloc(sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN,
                        GFP_KERNEL_ACCOUNT);
        if (!slots)
                return -ENOBUFS;

        /* paired with smp_rmb() in __vlan_group_get_device() */
        smp_wmb();

        vg->vlan_devices_arrays[proto_idx][part] = slots;
        return 0;
}

/* Transfer the operstate of @rootdev to VLAN device @dev, except when
 * the VLAN has VLAN_FLAG_BRIDGE_BINDING set.
 */
static void vlan_stacked_transfer_operstate(const struct net_device *rootdev,
                                            struct net_device *dev,
                                            struct vlan_dev_priv *vlan)
{
        if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING)
                return;

        netif_stacked_transfer_operstate(rootdev, dev);
}

/*
 * Detach VLAN device @dev from its real device and queue it for
 * unregistration.
 *
 * @dev:  the VLAN device to tear down
 * @head: list passed on to unregister_netdevice_queue(); the ioctl
 *        DEL_VLAN_CMD path in this file passes NULL
 *
 * Must be called with RTNL held. The real device is expected to still
 * carry a vlan_info (BUG otherwise).
 */
void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
{
        struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
        struct net_device *real_dev = vlan->real_dev;
        struct vlan_info *vlan_info;
        struct vlan_group *grp;
        u16 vlan_id = vlan->vlan_id;

        ASSERT_RTNL();

        vlan_info = rtnl_dereference(real_dev->vlan_info);
        BUG_ON(!vlan_info);

        grp = &vlan_info->grp;

        grp->nr_vlan_devs--;

        /* Send MVRP/GVRP leave requests if the VLAN had them enabled. */
        if (vlan->flags & VLAN_FLAG_MVRP)
                vlan_mvrp_request_leave(dev);
        if (vlan->flags & VLAN_FLAG_GVRP)
                vlan_gvrp_request_leave(dev);

        /* Clear this VLAN's slot in the group. */
        vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, NULL);

        netdev_upper_dev_unlink(real_dev, dev);
        /* Because unregister_netdevice_queue() makes sure at least one rcu
         * grace period is respected before device freeing,
         * we don't need to call synchronize_net() here.
         */
        unregister_netdevice_queue(dev, head);

        /* Last VLAN device on this real device: drop the applicants. */
        if (grp->nr_vlan_devs == 0) {
                vlan_mvrp_uninit_applicant(real_dev);
                vlan_gvrp_uninit_applicant(real_dev);
        }

        /* Undo the vlan_vid_add() done in register_vlan_dev(). */
        vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
}

/*
 * Check that a VLAN with @protocol / @vlan_id may be created on @real_dev.
 *
 * Returns 0 when allowed, -EOPNOTSUPP when the device is VLAN-challenged
 * or is not an Ethernet device, and -EEXIST when such a VLAN device is
 * already present. An explanatory message is stored in @extack.
 */
int vlan_check_real_dev(struct net_device *real_dev,
                        __be16 protocol, u16 vlan_id,
                        struct netlink_ext_ack *extack)
{
        const bool vlan_capable =
                !(real_dev->features & NETIF_F_VLAN_CHALLENGED) &&
                real_dev->type == ARPHRD_ETHER;

        if (!vlan_capable) {
                pr_info("VLANs not supported on %s\n", real_dev->name);
                NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
                return -EOPNOTSUPP;
        }

        if (vlan_find_dev(real_dev, protocol, vlan_id)) {
                NL_SET_ERR_MSG_MOD(extack, "VLAN device already exists");
                return -EEXIST;
        }

        return 0;
}

/*
 * Register VLAN device @dev on top of the real device stored in its
 * private data (vlan->real_dev).
 *
 * @dev:    VLAN net_device whose vlan_dev_priv is already filled in
 * @extack: netlink extended ack, passed to netdev_upper_dev_link()
 *
 * Returns 0 on success or a negative error code. On failure every step
 * already performed is undone in reverse order through the out_* labels.
 */
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
{
        struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
        struct net_device *real_dev = vlan->real_dev;
        u16 vlan_id = vlan->vlan_id;
        struct vlan_info *vlan_info;
        struct vlan_group *grp;
        int err;

        err = vlan_vid_add(real_dev, vlan->vlan_proto, vlan_id);
        if (err)
                return err;

        vlan_info = rtnl_dereference(real_dev->vlan_info);
        /* vlan_info should be there now. vlan_vid_add took care of it */
        BUG_ON(!vlan_info);

        grp = &vlan_info->grp;
        /* First VLAN device on this real device: set up the applicants. */
        if (grp->nr_vlan_devs == 0) {
                err = vlan_gvrp_init_applicant(real_dev);
                if (err < 0)
                        goto out_vid_del;
                err = vlan_mvrp_init_applicant(real_dev);
                if (err < 0)
                        goto out_uninit_gvrp;
        }

        /* Allocate the group array part before the device is registered. */
        err = vlan_group_prealloc_vid(grp, vlan->vlan_proto, vlan_id);
        if (err < 0)
                goto out_uninit_mvrp;

        err = register_netdevice(dev);
        if (err < 0)
                goto out_uninit_mvrp;

        err = netdev_upper_dev_link(real_dev, dev, extack);
        if (err)
                goto out_unregister_netdev;

        vlan_stacked_transfer_operstate(real_dev, dev, vlan);
        linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */

        /* So, got the sucker initialized, now lets place
         * it into our local structure.
         */
        vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev);
        grp->nr_vlan_devs++;

        return 0;

out_unregister_netdev:
        unregister_netdevice(dev);
out_uninit_mvrp:
        /* Applicants are only torn down if no VLAN device is using them. */
        if (grp->nr_vlan_devs == 0)
                vlan_mvrp_uninit_applicant(real_dev);
out_uninit_gvrp:
        if (grp->nr_vlan_devs == 0)
                vlan_gvrp_uninit_applicant(real_dev);
out_vid_del:
        vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
        return err;
}

/*  Create an 802.1Q VLAN device with id @vlan_id on top of @real_dev
 *  (i.e. an Ethernet card) and register it.
 *
 *  The interface name is built according to the per-netns name_type
 *  setting. Returns 0 if the device was created or a negative error
 *  code otherwise.
 */
static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
{
        struct net *net = dev_net(real_dev);
        struct vlan_net *vn = net_generic(net, vlan_net_id);
        struct vlan_dev_priv *priv;
        struct net_device *vlandev;
        char ifname[IFNAMSIZ];
        int ret;

        if (vlan_id >= VLAN_VID_MASK)
                return -ERANGE;

        ret = vlan_check_real_dev(real_dev, htons(ETH_P_8021Q), vlan_id,
                                  NULL);
        if (ret < 0)
                return ret;

        /* Pick the interface name according to the configured scheme. */
        switch (vn->name_type) {
        case VLAN_NAME_TYPE_RAW_PLUS_VID:
                /* e.g. eth1.0005 */
                snprintf(ifname, IFNAMSIZ, "%s.%.4i", real_dev->name, vlan_id);
                break;
        case VLAN_NAME_TYPE_PLUS_VID_NO_PAD:
                /* e.g. vlan5 */
                snprintf(ifname, IFNAMSIZ, "vlan%i", vlan_id);
                break;
        case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD:
                /* e.g. eth0.5 */
                snprintf(ifname, IFNAMSIZ, "%s.%i", real_dev->name, vlan_id);
                break;
        case VLAN_NAME_TYPE_PLUS_VID:
        default:
                /* e.g. vlan0005; also used for any unknown name type */
                snprintf(ifname, IFNAMSIZ, "vlan%.4i", vlan_id);
                break;
        }

        vlandev = alloc_netdev(sizeof(struct vlan_dev_priv), ifname,
                               NET_NAME_UNKNOWN, vlan_setup);
        if (!vlandev)
                return -ENOBUFS;

        dev_net_set(vlandev, net);
        /* The VLAN header needs 4 extra bytes; the MTU is copied as is in
         * the hope that the underlying device can handle it.
         */
        vlandev->mtu = real_dev->mtu;

        priv = vlan_dev_priv(vlandev);
        priv->vlan_proto = htons(ETH_P_8021Q);
        priv->vlan_id = vlan_id;
        priv->real_dev = real_dev;
        priv->dent = NULL;
        priv->flags = VLAN_FLAG_REORDER_HDR;

        vlandev->rtnl_link_ops = &vlan_link_ops;

        ret = register_vlan_dev(vlandev, NULL);
        if (ret < 0) {
                free_netdev(vlandev);
                return ret;
        }

        return 0;
}

static void vlan_sync_address(struct net_device *dev,
                              struct net_device *vlandev)
{
        struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);

        /* May be called without an actual change */
        if (ether_addr_equal(vlan->real_dev_addr, dev->dev_addr))
                return;

        /* vlan continues to inherit address of lower device */
        if (vlan_dev_inherit_address(vlandev, dev))
                goto out;

        /* vlan address was different from the old address and is equal to
         * the new address */
        if (!ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
            ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
                dev_uc_del(dev, vlandev->dev_addr);

        /* vlan address was equal to the old address and is different from
         * the new address */
        if (ether_addr_equal(vlandev->dev_addr, vlan->real_dev_addr) &&
            !ether_addr_equal(vlandev->dev_addr, dev->dev_addr))
                dev_uc_add(dev, vlandev->dev_addr);

out:
        ether_addr_copy(vlan->real_dev_addr, dev->dev_addr);
}

/* Refresh the settings of @vlandev that are derived from lower device
 * @dev, then let the core recompute the VLAN device's features.
 */
static void vlan_transfer_features(struct net_device *dev,
                                   struct net_device *vlandev)
{
        struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
        struct net_device *lower = vlan->real_dev;

        netif_inherit_tso_max(vlandev, dev);

        /* Room for the VLAN header is only added when the lower device
         * is not hw-offload capable for this VLAN protocol.
         */
        vlandev->hard_header_len = dev->hard_header_len +
                (vlan_hw_offload_capable(dev->features, vlan->vlan_proto) ?
                 0 : VLAN_HLEN);

#if IS_ENABLED(CONFIG_FCOE)
        vlandev->fcoe_ddp_xid = dev->fcoe_ddp_xid;
#endif

        /* Mirror the real device's IFF_XMIT_DST_RELEASE bit. */
        vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
        vlandev->priv_flags |= (lower->priv_flags & IFF_XMIT_DST_RELEASE);
        vlandev->hw_enc_features = vlan_tnl_features(lower);

        netdev_update_features(vlandev);
}

/* Keep the proc entry of VLAN device @dev in sync with netdev events:
 * re-created on rename, added on register, removed on unregister.
 *
 * Returns the result of vlan_proc_add_dev() where one is made, else 0.
 */
static int __vlan_device_event(struct net_device *dev, unsigned long event)
{
        switch (event) {
        case NETDEV_CHANGENAME:
                vlan_proc_rem_dev(dev);
                return vlan_proc_add_dev(dev);
        case NETDEV_REGISTER:
                return vlan_proc_add_dev(dev);
        case NETDEV_UNREGISTER:
                vlan_proc_rem_dev(dev);
                break;
        }

        return 0;
}

/*
 * Netdevice notifier callback of the VLAN module.
 *
 * If @dev is itself a VLAN device, its proc entry is kept in sync first.
 * Then, for devices with NETIF_F_HW_VLAN_CTAG_FILTER, VLAN 0 is added to
 * or removed from the filter on NETDEV_UP / NETDEV_DOWN. Finally, when
 * @dev carries a vlan_info, the event is applied to every VLAN device
 * stacked on it.
 *
 * @unused: the notifier block (not used)
 * @event:  NETDEV_* event code
 * @ptr:    notifier info; the device and extack are extracted from it
 *
 * Returns NOTIFY_DONE, NOTIFY_BAD for a refused type change, or
 * notifier_from_errno() of an error.
 */
static int vlan_device_event(struct notifier_block *unused, unsigned long event,
                             void *ptr)
{
        struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct vlan_group *grp;
        struct vlan_info *vlan_info;
        int i, flgs;
        struct net_device *vlandev;
        struct vlan_dev_priv *vlan;
        bool last = false;
        LIST_HEAD(list);
        int err;

        if (is_vlan_dev(dev)) {
                /* Reuse the function-scope err instead of shadowing it. */
                err = __vlan_device_event(dev, event);

                if (err)
                        return notifier_from_errno(err);
        }

        if ((event == NETDEV_UP) &&
            (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) {
                pr_info("adding VLAN 0 to HW filter on device %s\n",
                        dev->name);
                vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
        }
        if (event == NETDEV_DOWN &&
            (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
                vlan_vid_del(dev, htons(ETH_P_8021Q), 0);

        vlan_info = rtnl_dereference(dev->vlan_info);
        if (!vlan_info)
                goto out;
        grp = &vlan_info->grp;

        /* It is OK that we do not hold the group lock right now,
         * as we run under the RTNL lock.
         */

        switch (event) {
        case NETDEV_CHANGE:
                /* Propagate real device state to vlan devices */
                vlan_group_for_each_dev(grp, i, vlandev)
                        vlan_stacked_transfer_operstate(dev, vlandev,
                                                        vlan_dev_priv(vlandev));
                break;

        case NETDEV_CHANGEADDR:
                /* Adjust unicast filters on underlying device */
                vlan_group_for_each_dev(grp, i, vlandev) {
                        flgs = vlandev->flags;
                        if (!(flgs & IFF_UP))
                                continue;

                        vlan_sync_address(dev, vlandev);
                }
                break;

        case NETDEV_CHANGEMTU:
                /* Shrink any vlan device MTU that exceeds the new MTU */
                vlan_group_for_each_dev(grp, i, vlandev) {
                        if (vlandev->mtu <= dev->mtu)
                                continue;

                        dev_set_mtu(vlandev, dev->mtu);
                }
                break;

        case NETDEV_FEAT_CHANGE:
                /* Propagate real device features to vlan devices */
                vlan_group_for_each_dev(grp, i, vlandev)
                        vlan_transfer_features(dev, vlandev);
                break;

        case NETDEV_DOWN: {
                struct net_device *tmp;
                LIST_HEAD(close_list);

                /* Put all VLANs for this dev in the down state too.  */
                vlan_group_for_each_dev(grp, i, vlandev) {
                        flgs = vlandev->flags;
                        if (!(flgs & IFF_UP))
                                continue;

                        vlan = vlan_dev_priv(vlandev);
                        if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
                                list_add(&vlandev->close_list, &close_list);
                }

                dev_close_many(&close_list, false);

                list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
                        vlan_stacked_transfer_operstate(dev, vlandev,
                                                        vlan_dev_priv(vlandev));
                        list_del_init(&vlandev->close_list);
                }
                list_del(&close_list);
                break;
        }
        case NETDEV_UP:
                /* Put all VLANs for this dev in the up state too.  */
                vlan_group_for_each_dev(grp, i, vlandev) {
                        flgs = dev_get_flags(vlandev);
                        if (flgs & IFF_UP)
                                continue;

                        vlan = vlan_dev_priv(vlandev);
                        if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
                                dev_change_flags(vlandev, flgs | IFF_UP,
                                                 extack);
                        vlan_stacked_transfer_operstate(dev, vlandev, vlan);
                }
                break;

        case NETDEV_UNREGISTER:
                /* twiddle thumbs on netns device moves */
                if (dev->reg_state != NETREG_UNREGISTERING)
                        break;

                vlan_group_for_each_dev(grp, i, vlandev) {
                        /* removal of last vid destroys vlan_info, abort
                         * afterwards */
                        if (vlan_info->nr_vids == 1)
                                last = true;

                        unregister_vlan_dev(vlandev, &list);
                        if (last)
                                break;
                }
                unregister_netdevice_many(&list);
                break;

        case NETDEV_PRE_TYPE_CHANGE:
                /* Forbid the underlying device to change its type. */
                if (vlan_uses_dev(dev))
                        return NOTIFY_BAD;
                break;

        case NETDEV_NOTIFY_PEERS:
        case NETDEV_BONDING_FAILOVER:
        case NETDEV_RESEND_IGMP:
                /* Propagate to vlan devices */
                vlan_group_for_each_dev(grp, i, vlandev)
                        call_netdevice_notifiers(event, vlandev);
                break;

        case NETDEV_CVLAN_FILTER_PUSH_INFO:
                err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021Q));
                if (err)
                        return notifier_from_errno(err);
                break;

        case NETDEV_CVLAN_FILTER_DROP_INFO:
                vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021Q));
                break;

        case NETDEV_SVLAN_FILTER_PUSH_INFO:
                err = vlan_filter_push_vids(vlan_info, htons(ETH_P_8021AD));
                if (err)
                        return notifier_from_errno(err);
                break;

        case NETDEV_SVLAN_FILTER_DROP_INFO:
                vlan_filter_drop_vids(vlan_info, htons(ETH_P_8021AD));
                break;
        }

out:
        return NOTIFY_DONE;
}

/* Netdevice notifier that routes events to vlan_device_event();
 * registered in vlan_proto_init() and removed in vlan_cleanup_module().
 */
static struct notifier_block vlan_notifier_block __read_mostly = {
        .notifier_call = vlan_device_event,
};

/*
 *        VLAN IOCTL handler.
 *        o execute requested action or pass command to the device driver
 *   arg is really a struct vlan_ioctl_args __user *.
 *
 *   The whole request runs under rtnl_lock(). Commands that operate on a
 *   device first resolve args.device1 in @net; every such command except
 *   ADD_VLAN_CMD requires that device to be a VLAN device. All modifying
 *   commands require CAP_NET_ADMIN in the netns' user namespace.
 *
 *   Returns 0 on success or a negative errno: -EFAULT on a failed user
 *   copy, -ENODEV if the device is not found, -EINVAL for a non-VLAN
 *   device or a bad name type, -EPERM without the capability, and
 *   -EOPNOTSUPP for an unknown command.
 */
static int vlan_ioctl_handler(struct net *net, void __user *arg)
{
        int err;
        struct vlan_ioctl_args args;
        struct net_device *dev = NULL;

        if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args)))
                return -EFAULT;

        /* Null terminate this sucker, just in case. */
        args.device1[sizeof(args.device1) - 1] = 0;
        args.u.device2[sizeof(args.u.device2) - 1] = 0;

        rtnl_lock();

        /* First pass: look up the device for commands that need one. */
        switch (args.cmd) {
        case SET_VLAN_INGRESS_PRIORITY_CMD:
        case SET_VLAN_EGRESS_PRIORITY_CMD:
        case SET_VLAN_FLAG_CMD:
        case ADD_VLAN_CMD:
        case DEL_VLAN_CMD:
        case GET_VLAN_REALDEV_NAME_CMD:
        case GET_VLAN_VID_CMD:
                err = -ENODEV;
                dev = __dev_get_by_name(net, args.device1);
                if (!dev)
                        goto out;

                /* Only ADD_VLAN_CMD may name a non-VLAN (real) device. */
                err = -EINVAL;
                if (args.cmd != ADD_VLAN_CMD && !is_vlan_dev(dev))
                        goto out;
        }

        /* Second pass: execute the command. */
        switch (args.cmd) {
        case SET_VLAN_INGRESS_PRIORITY_CMD:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        break;
                vlan_dev_set_ingress_priority(dev,
                                              args.u.skb_priority,
                                              args.vlan_qos);
                err = 0;
                break;

        case SET_VLAN_EGRESS_PRIORITY_CMD:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        break;
                err = vlan_dev_set_egress_priority(dev,
                                                   args.u.skb_priority,
                                                   args.vlan_qos);
                break;

        case SET_VLAN_FLAG_CMD:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        break;
                /* A non-zero vlan_qos sets the flag(s), zero clears them;
                 * args.u.flag is also passed as the third argument.
                 */
                err = vlan_dev_change_flags(dev,
                                            args.vlan_qos ? args.u.flag : 0,
                                            args.u.flag);
                break;

        case SET_VLAN_NAME_TYPE_CMD:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        break;
                if (args.u.name_type < VLAN_NAME_TYPE_HIGHEST) {
                        struct vlan_net *vn;

                        /* The naming scheme is stored per network namespace. */
                        vn = net_generic(net, vlan_net_id);
                        vn->name_type = args.u.name_type;
                        err = 0;
                } else {
                        err = -EINVAL;
                }
                break;

        case ADD_VLAN_CMD:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        break;
                err = register_vlan_device(dev, args.u.VID);
                break;

        case DEL_VLAN_CMD:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        break;
                unregister_vlan_dev(dev, NULL);
                err = 0;
                break;

        case GET_VLAN_REALDEV_NAME_CMD:
                err = 0;
                vlan_dev_get_realdev_name(dev, args.u.device2,
                                          sizeof(args.u.device2));
                if (copy_to_user(arg, &args,
                                 sizeof(struct vlan_ioctl_args)))
                        err = -EFAULT;
                break;

        case GET_VLAN_VID_CMD:
                err = 0;
                args.u.VID = vlan_dev_vlan_id(dev);
                if (copy_to_user(arg, &args,
                                 sizeof(struct vlan_ioctl_args)))
                      err = -EFAULT;
                break;

        default:
                err = -EOPNOTSUPP;
                break;
        }
out:
        rtnl_unlock();
        return err;
}

/*
 * Per-namespace setup: store VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD as the
 * name type in this namespace's struct vlan_net, then call
 * vlan_proc_init() and return its result unchanged.
 */
static int __net_init vlan_init_net(struct net *net)
{
        struct vlan_net *vlan_ns = net_generic(net, vlan_net_id);

        vlan_ns->name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD;

        return vlan_proc_init(net);
}

/* Per-namespace teardown: calls vlan_proc_cleanup() for @net. */
static void __net_exit vlan_exit_net(struct net *net)
{
        vlan_proc_cleanup(net);
}

/*
 * Per-network-namespace operations, registered by vlan_proto_init() and
 * unregistered by vlan_cleanup_module(). .id and .size describe the
 * struct vlan_net that vlan_init_net() looks up with
 * net_generic(net, vlan_net_id).
 */
static struct pernet_operations vlan_net_ops = {
        .init = vlan_init_net,
        .exit = vlan_exit_net,
        .id   = &vlan_net_id,
        .size = sizeof(struct vlan_net),
};

/*
 * Module entry point.
 *
 * Logs the module name and version, then performs these steps in order:
 * register_pernet_subsys(), register_netdevice_notifier(),
 * vlan_gvrp_init(), vlan_mvrp_init(), vlan_netlink_init(), and finally
 * installs vlan_ioctl_handler via vlan_ioctl_set().
 *
 * If a step returns a negative value, every step that already succeeded
 * is undone in reverse order and that value is returned. Returns 0 on
 * success.
 */
static int __init vlan_proto_init(void)
{
        int ret;

        pr_info("%s v%s\n", vlan_fullname, vlan_version);

        ret = register_pernet_subsys(&vlan_net_ops);
        if (ret < 0)
                return ret;

        ret = register_netdevice_notifier(&vlan_notifier_block);
        if (ret < 0)
                goto undo_pernet;

        ret = vlan_gvrp_init();
        if (ret < 0)
                goto undo_notifier;

        ret = vlan_mvrp_init();
        if (ret < 0)
                goto undo_gvrp;

        ret = vlan_netlink_init();
        if (ret < 0)
                goto undo_mvrp;

        vlan_ioctl_set(vlan_ioctl_handler);
        return 0;

        /* Unwind in the reverse order of the setup steps above. */
undo_mvrp:
        vlan_mvrp_uninit();
undo_gvrp:
        vlan_gvrp_uninit();
undo_notifier:
        unregister_netdevice_notifier(&vlan_notifier_block);
undo_pernet:
        unregister_pernet_subsys(&vlan_net_ops);
        return ret;
}

/*
 * Module exit point.
 *
 * Clears the ioctl handler that vlan_proto_init() installed, then tears
 * down the netlink interface, the netdevice notifier and the pernet
 * registration. After waiting for outstanding RCU callbacks it
 * uninitializes MVRP and GVRP.
 */
static void __exit vlan_cleanup_module(void)
{
        /* Drop the ioctl hook first (vlan_proto_init() installs it last). */
        vlan_ioctl_set(NULL);

        vlan_netlink_fini();

        unregister_netdevice_notifier(&vlan_notifier_block);

        unregister_pernet_subsys(&vlan_net_ops);
        rcu_barrier(); /* Wait for completion of call_rcu()'s */

        vlan_mvrp_uninit();
        vlan_gvrp_uninit();
}

/* Module entry/exit hooks followed by the module metadata. */
module_init(vlan_proto_init);
module_exit(vlan_cleanup_module);

MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);





































































































































































































































































































































































































































































































































  179 
  179 



















































































































































































































































































































































































































































  178 












  179 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the policy database.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *          Support for enhanced MLS infrastructure.
 *          Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and
 *          Karl MacMillan <kmacmillan@tresys.com>
 *          Added conditional policy language extensions
 *          Copyright (C) 2003-2004 Tresys Technology, LLC
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *          Added support for the policy capability bitmap
 *          Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
 *
 * Update: Mellanox Technologies
 *         Added Infiniband support
 *         Copyright (C) 2016 Mellanox Technologies
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/audit.h>
#include "security.h"

#include "policydb.h"
#include "conditional.h"
#include "mls.h"
#include "services.h"

#ifdef CONFIG_SECURITY_SELINUX_DEBUG
/*
 * Human-readable names for the symbol tables (the array holds SYM_NUM
 * entries). Only built when CONFIG_SECURITY_SELINUX_DEBUG is defined.
 */
/* clang-format off */
static const char *const symtab_name[SYM_NUM] = {
        "common prefixes",
        "classes",
        "roles",
        "types",
        "users",
        "bools",
        "levels",
        "categories",
};
/* clang-format on */
#endif

/*
 * One row of the policydb_compat[] table: the sym_num and ocon_num
 * values that go with a given policy version.
 */
struct policydb_compat_info {
        /* POLICYDB_VERSION_* value; the key policydb_lookup_compat() matches on */
        unsigned int version;
        /* presumably the number of symbol tables in that version (<= SYM_NUM) */
        unsigned int sym_num;
        /* presumably the number of object context kinds in that version (<= OCON_NUM) */
        unsigned int ocon_num;
};

/*
 * Compatibility table searched by policydb_lookup_compat(): one entry
 * per listed policy version. sym_num and ocon_num are written relative
 * to SYM_NUM and OCON_NUM, so these entries need to be updated if
 * SYM_NUM or OCON_NUM changes.
 */
static const struct policydb_compat_info policydb_compat[] = {
        {
                .version = POLICYDB_VERSION_BASE,
                .sym_num = SYM_NUM - 3,
                .ocon_num = OCON_NUM - 3,
        },
        {
                .version = POLICYDB_VERSION_BOOL,
                .sym_num = SYM_NUM - 2,
                .ocon_num = OCON_NUM - 3,
        },
        {
                .version = POLICYDB_VERSION_IPV6,
                .sym_num = SYM_NUM - 2,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_NLCLASS,
                .sym_num = SYM_NUM - 2,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_MLS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_AVTAB,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_RANGETRANS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_POLCAP,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_PERMISSIVE,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_BOUNDARY,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_FILENAME_TRANS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_ROLETRANS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_NEW_OBJECT_DEFAULTS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_DEFAULT_TYPE,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_CONSTRAINT_NAMES,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_XPERMS_IOCTL,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM - 2,
        },
        {
                .version = POLICYDB_VERSION_INFINIBAND,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM,
        },
        {
                .version = POLICYDB_VERSION_GLBLUB,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM,
        },
        {
                .version = POLICYDB_VERSION_COMP_FTRANS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM,
        },
        {
                .version = POLICYDB_VERSION_COND_XPERMS,
                .sym_num = SYM_NUM,
                .ocon_num = OCON_NUM,
        },
};

static const struct policydb_compat_info *
policydb_lookup_compat(unsigned int version)
{
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(policydb_compat); i++) {
                if (policydb_compat[i].version == version)
                        return &policydb_compat[i];
        }

        return NULL;
}

/*
 * The following *_destroy functions are used to
 * free any memory allocated for each kind of
 * symbol data in the policy database.
 */

/*
 * hashtab_map() callback used by common_destroy() and cls_destroy() on
 * their permission tables: frees @key and @datum. @p is unused.
 * Always returns 0.
 */
static int perm_destroy(void *key, void *datum, void *p)
{
        kfree(key);
        kfree(datum);
        return 0;
}

/*
 * Destructor for a common symbol (first entry of destroy_f[]).
 *
 * Frees @key; when @datum is non-NULL, frees every entry of its
 * permission table via perm_destroy() and destroys the table itself
 * before freeing the datum. @p is unused. Always returns 0.
 */
static int common_destroy(void *key, void *datum, void *p)
{
        struct common_datum *common = datum;

        kfree(key);
        if (common) {
                hashtab_map(&common->permissions.table, perm_destroy, NULL);
                hashtab_destroy(&common->permissions.table);
        }
        kfree(common);

        return 0;
}

/*
 * Free one constraint expression node: its names bitmap, the optional
 * type_names record (both of its bitmaps and the record itself), and
 * finally the node. A NULL @expr is ignored. The node's ->next link is
 * not followed; callers walk the list themselves.
 */
static void constraint_expr_destroy(struct constraint_expr *expr)
{
        if (!expr)
                return;

        ebitmap_destroy(&expr->names);

        if (expr->type_names) {
                ebitmap_destroy(&expr->type_names->types);
                ebitmap_destroy(&expr->type_names->negset);
                kfree(expr->type_names);
        }

        kfree(expr);
}

/*
 * Free a whole list of constraint nodes, including the expression chain
 * hanging off each node. A NULL @node (empty list) is a no-op.
 */
static void cls_constraint_list_destroy(struct constraint_node *node)
{
        while (node) {
                /* Read both links before the node is freed. */
                struct constraint_node *next_node = node->next;
                struct constraint_expr *expr = node->expr;

                while (expr) {
                        struct constraint_expr *next_expr = expr->next;

                        constraint_expr_destroy(expr);
                        expr = next_expr;
                }

                kfree(node);
                node = next_node;
        }
}

/*
 * Destructor for a class symbol (second entry of destroy_f[]).
 *
 * Frees @key; when @datum is non-NULL, frees every entry of the class's
 * permission table via perm_destroy(), destroys the table, frees both
 * the ->constraints and ->validatetrans lists and the ->comkey string,
 * and finally frees the datum. @p is unused. Always returns 0.
 */
static int cls_destroy(void *key, void *datum, void *p)
{
        struct class_datum *cladatum = datum;

        kfree(key);
        if (cladatum) {
                hashtab_map(&cladatum->permissions.table, perm_destroy, NULL);
                hashtab_destroy(&cladatum->permissions.table);
                cls_constraint_list_destroy(cladatum->constraints);
                cls_constraint_list_destroy(cladatum->validatetrans);
                kfree(cladatum->comkey);
        }
        kfree(datum);
        return 0;
}

/*
 * Destructor for a role symbol (third entry of destroy_f[]).
 *
 * Frees @key; when @datum is non-NULL, destroys its ->dominates and
 * ->types bitmaps before freeing the datum. @p is unused.
 * Always returns 0.
 */
static int role_destroy(void *key, void *datum, void *p)
{
        struct role_datum *roledatum = datum;

        kfree(key);
        if (roledatum) {
                ebitmap_destroy(&roledatum->dominates);
                ebitmap_destroy(&roledatum->types);
        }
        kfree(roledatum);

        return 0;
}

/*
 * Destructor for a type symbol (fourth entry of destroy_f[]): frees
 * @key and @datum. @p is unused. Always returns 0.
 */
static int type_destroy(void *key, void *datum, void *p)
{
        kfree(key);
        kfree(datum);
        return 0;
}

/*
 * Destructor for a user symbol (fifth entry of destroy_f[]).
 *
 * Frees @key; when @datum is non-NULL, destroys the user's roles bitmap
 * and the category bitmaps of both range levels and of the default
 * level before freeing the datum. @p is unused. Always returns 0.
 */
static int user_destroy(void *key, void *datum, void *p)
{
        struct user_datum *user = datum;

        kfree(key);
        if (user) {
                ebitmap_destroy(&user->roles);
                ebitmap_destroy(&user->range.level[0].cat);
                ebitmap_destroy(&user->range.level[1].cat);
                ebitmap_destroy(&user->dfltlevel.cat);
        }
        kfree(user);

        return 0;
}

/*
 * Destroy callback for sensitivity (level) symbol table entries.
 *
 * Frees the key, releases the category ebitmap of the embedded level when
 * a datum is present, then frees the datum.  @p is unused.
 * Always returns 0.
 */
static int sens_destroy(void *key, void *datum, void *p)
{
        struct level_datum *lev = datum;

        kfree(key);
        if (lev)
                ebitmap_destroy(&lev->level.cat);
        kfree(lev);
        return 0;
}

/*
 * Destroy callback for category symbol table entries (registered in
 * destroy_f[]).  Frees the key and the datum allocation; nothing else is
 * released here.  @p is unused.  Always returns 0.
 */
static int cat_destroy(void *key, void *datum, void *p)
{
        kfree(key);
        kfree(datum);
        return 0;
}

/* clang-format off */
/*
 * Per-symbol-table destroy callbacks.  policydb_destroy() hands
 * destroy_f[i] to hashtab_map() for p->symtab[i], so entry i must be the
 * callback matching symbol table i (the order mirrors index_f[];
 * presumably it follows the SYM_* constants - confirm against their
 * definitions).
 */
static int (*const destroy_f[SYM_NUM])(void *key, void *datum, void *datap) = {
        common_destroy,
        cls_destroy,
        role_destroy,
        type_destroy,
        user_destroy,
        cond_destroy_bool,
        sens_destroy,
        cat_destroy,
};
/* clang-format on */

/*
 * Destroy callback for the filename transition hash table.
 *
 * Frees the name held by the key and the key itself, then walks the
 * chain of datums starting at @datum, releasing each node's stypes
 * ebitmap and the node.  The first datum is dereferenced unconditionally,
 * so @datum must not be NULL.  Offers a reschedule point before
 * returning.  @p is unused.  Always returns 0.
 */
static int filenametr_destroy(void *key, void *datum, void *p)
{
        struct filename_trans_key *ftkey = key;
        struct filename_trans_datum *cur = datum;

        kfree(ftkey->name);
        kfree(ftkey);

        do {
                struct filename_trans_datum *following = cur->next;

                ebitmap_destroy(&cur->stypes);
                kfree(cur);
                cur = following;
        } while (unlikely(cur));

        cond_resched();
        return 0;
}

/*
 * Destroy callback for the range transition hash table.
 *
 * Frees the key, releases the category ebitmaps of both levels of the
 * MLS range stored as datum, frees the datum and offers a reschedule
 * point.  The datum is dereferenced unconditionally.  @p is unused.
 * Always returns 0.
 */
static int range_tr_destroy(void *key, void *datum, void *p)
{
        struct mls_range *range = datum;

        kfree(key);

        ebitmap_destroy(&range->level[0].cat);
        ebitmap_destroy(&range->level[1].cat);
        kfree(range);

        cond_resched();
        return 0;
}

/*
 * Destroy callback for the role transition hash table; policydb_destroy()
 * passes it to hashtab_map() for p->role_tr.  Frees the key and the datum
 * allocation.  @p is unused.  Always returns 0.
 */
static int role_tr_destroy(void *key, void *datum, void *p)
{
        kfree(key);
        kfree(datum);
        return 0;
}

/*
 * Free one object context node.
 *
 * @c: node to free; NULL is accepted and ignored.
 * @i: OCON_* kind of the list the node came from.
 *
 * Both embedded contexts are destroyed.  For the kinds listed in the
 * switch below the node also owns u.name, which is freed as well.
 */
static void ocontext_destroy(struct ocontext *c, unsigned int i)
{
        if (!c)
                return;

        context_destroy(&c->context[0]);
        context_destroy(&c->context[1]);

        switch (i) {
        case OCON_ISID:
        case OCON_FS:
        case OCON_NETIF:
        case OCON_FSUSE:
                kfree(c->u.name);
                break;
        default:
                break;
        }

        kfree(c);
}

/*
 * Initialize the role table.
 *
 * Allocates a role datum, gives it the next primary role value and
 * inserts it into p->p_roles under the name OBJECT_R.  The value handed
 * out must be OBJECT_R_VAL, otherwise -EINVAL is returned.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL as
 * described above, or the error reported by symtab_insert().  On failure
 * the key and datum allocated here are freed again.
 */
static int roles_init(struct policydb *p)
{
        struct role_datum *object_role;
        char *name = NULL;
        int rc;

        object_role = kzalloc(sizeof(*object_role), GFP_KERNEL);
        if (!object_role)
                return -ENOMEM;

        object_role->value = ++p->p_roles.nprim;
        if (object_role->value != OBJECT_R_VAL) {
                rc = -EINVAL;
                goto err_free;
        }

        name = kstrdup(OBJECT_R, GFP_KERNEL);
        if (!name) {
                rc = -ENOMEM;
                goto err_free;
        }

        rc = symtab_insert(&p->p_roles, name, object_role);
        if (!rc)
                return 0;

err_free:
        kfree(name);
        kfree(object_role);
        return rc;
}

/*
 * Hash function for filename transition keys: hashes the name string,
 * salted with the XOR of the key's target type and target class.
 */
static u32 filenametr_hash(const void *k)
{
        const struct filename_trans_key *fkey = k;
        const char *name = fkey->name;
        unsigned long seed = fkey->ttype ^ fkey->tclass;

        return full_name_hash((void *)seed, name, strlen(name));
}

/*
 * Compare two filename transition keys by target type, then target
 * class, then name.  Returns 0 when all three match and a non-zero
 * value (from the first differing field) otherwise.
 */
static int filenametr_cmp(const void *k1, const void *k2)
{
        const struct filename_trans_key *a = k1;
        const struct filename_trans_key *b = k2;
        int diff = a->ttype - b->ttype;

        if (!diff)
                diff = a->tclass - b->tclass;
        if (!diff)
                diff = strcmp(a->name, b->name);

        return diff;
}

/*
 * Hash/compare callbacks for filename transition keys; passed to
 * hashtab_search() by policydb_filenametr_search().
 */
static const struct hashtab_key_params filenametr_key_params = {
        .hash = filenametr_hash,
        .cmp = filenametr_cmp,
};

/*
 * Look up @key in the filename transition table of @p.
 * Returns whatever hashtab_search() yields for the key.
 */
struct filename_trans_datum *
policydb_filenametr_search(struct policydb *p, struct filename_trans_key *key)
{
        struct filename_trans_datum *found;

        found = hashtab_search(&p->filename_trans, key, filenametr_key_params);
        return found;
}

/*
 * Hash function for range transition keys: the source type plus the
 * target type shifted left by 3 plus the target class shifted left by 5.
 */
static u32 rangetr_hash(const void *k)
{
        const struct range_trans *rt = k;
        u32 hash = rt->source_type;

        hash += rt->target_type << 3;
        hash += rt->target_class << 5;
        return hash;
}

/*
 * Compare two range transition keys by source type, then target type,
 * then target class.  Returns 0 when all three match and a non-zero
 * value (the difference of the first differing field) otherwise.
 */
static int rangetr_cmp(const void *k1, const void *k2)
{
        const struct range_trans *a = k1;
        const struct range_trans *b = k2;
        int diff = a->source_type - b->source_type;

        if (!diff)
                diff = a->target_type - b->target_type;
        if (!diff)
                diff = a->target_class - b->target_class;

        return diff;
}

/*
 * Hash/compare callbacks for range transition keys; passed to
 * hashtab_search() by policydb_rangetr_search().
 */
static const struct hashtab_key_params rangetr_key_params = {
        .hash = rangetr_hash,
        .cmp = rangetr_cmp,
};

/*
 * Look up @key in the range transition table of @p.
 * Returns whatever hashtab_search() yields for the key.
 */
struct mls_range *policydb_rangetr_search(struct policydb *p,
                                          struct range_trans *key)
{
        struct mls_range *found;

        found = hashtab_search(&p->range_tr, key, rangetr_key_params);
        return found;
}

/*
 * Hash function for role transition keys.  Feeds jhash_3words() with the
 * role, the type, and a word holding the target class in both its upper
 * and lower halves; the initial value is 0.
 */
static u32 role_trans_hash(const void *k)
{
        const struct role_trans_key *rtk = k;
        u32 cls = rtk->tclass;

        return jhash_3words(rtk->role, rtk->type, cls << 16 | cls, 0);
}

/*
 * Compare two role transition keys by role, then type, then target
 * class.  Returns 0 when all three match and a non-zero value (the
 * difference of the first differing field) otherwise.
 */
static int role_trans_cmp(const void *k1, const void *k2)
{
        const struct role_trans_key *a = k1;
        const struct role_trans_key *b = k2;
        int diff = a->role - b->role;

        if (!diff)
                diff = a->type - b->type;
        if (!diff)
                diff = a->tclass - b->tclass;

        return diff;
}

/*
 * Hash/compare callbacks for role transition keys; passed to
 * hashtab_search() by policydb_roletr_search().
 */
static const struct hashtab_key_params roletr_key_params = {
        .hash = role_trans_hash,
        .cmp = role_trans_cmp,
};

/*
 * Look up @key in the role transition table of @p.
 * Returns whatever hashtab_search() yields for the key.
 */
struct role_trans_datum *policydb_roletr_search(struct policydb *p,
                                                struct role_trans_key *key)
{
        struct role_trans_datum *found;

        found = hashtab_search(&p->role_tr, key, roletr_key_params);
        return found;
}

/*
 * Initialize a policy database structure.
 *
 * The whole structure is zeroed first.  After that te_avtab is set up
 * through avtab_init(), cond_policydb_init() is run on the database, and
 * the three ebitmaps embedded directly in the structure are initialized.
 */
static void policydb_init(struct policydb *p)
{
        /* Start from an all-zero structure; everything below builds on it. */
        memset(p, 0, sizeof(*p));

        avtab_init(&p->te_avtab);
        cond_policydb_init(p);

        ebitmap_init(&p->filename_trans_ttypes);
        ebitmap_init(&p->policycaps);
        ebitmap_init(&p->permissive_map);
}

/*
 * The following *_index functions are used to
 * define the val_to_name and val_to_struct arrays
 * in a policy database structure.  The val_to_name
 * arrays are used when converting security context
 * structures into string representations.  The
 * val_to_struct arrays are used when the attributes
 * of a class, role, or user are needed.
 */

/*
 * hashtab_map() callback: record the name of one common in
 * sym_val_to_name[SYM_COMMONS] at slot value - 1.
 *
 * @datap is the policy database.  Returns -EINVAL when the value is 0 or
 * larger than the number of primary commons, 0 otherwise.
 */
static int common_index(void *key, void *datum, void *datap)
{
        struct common_datum *common = datum;
        struct policydb *pol = datap;

        if (common->value == 0 || common->value > pol->p_commons.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_COMMONS][common->value - 1] = key;
        return 0;
}

/*
 * hashtab_map() callback: record one class in
 * sym_val_to_name[SYM_CLASSES] and class_val_to_struct at slot value - 1.
 *
 * @datap is the policy database.  Returns -EINVAL when the value is 0 or
 * larger than the number of primary classes, 0 otherwise.
 */
static int class_index(void *key, void *datum, void *datap)
{
        struct class_datum *cls = datum;
        struct policydb *pol = datap;

        if (cls->value == 0 || cls->value > pol->p_classes.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_CLASSES][cls->value - 1] = key;
        pol->class_val_to_struct[cls->value - 1] = cls;
        return 0;
}

/*
 * hashtab_map() callback: record one role in sym_val_to_name[SYM_ROLES]
 * and role_val_to_struct at slot value - 1.
 *
 * @datap is the policy database.  Returns -EINVAL when the value is 0 or
 * larger than the number of primary roles, or when the role's bounds
 * value exceeds that number; 0 otherwise.
 */
static int role_index(void *key, void *datum, void *datap)
{
        struct role_datum *r = datum;
        struct policydb *pol = datap;

        if (r->value == 0 || r->value > pol->p_roles.nprim)
                return -EINVAL;
        if (r->bounds > pol->p_roles.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_ROLES][r->value - 1] = key;
        pol->role_val_to_struct[r->value - 1] = r;
        return 0;
}

/*
 * hashtab_map() callback: record one type in sym_val_to_name[SYM_TYPES]
 * and type_val_to_struct at slot value - 1.
 *
 * Entries whose primary flag is clear are skipped without any checks.
 * @datap is the policy database.  For primary entries, returns -EINVAL
 * when the value is 0 or larger than the number of primary types, or
 * when the bounds value exceeds that number; 0 otherwise.
 */
static int type_index(void *key, void *datum, void *datap)
{
        struct type_datum *typ = datum;
        struct policydb *pol = datap;

        if (!typ->primary)
                return 0;

        if (typ->value == 0 || typ->value > pol->p_types.nprim)
                return -EINVAL;
        if (typ->bounds > pol->p_types.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_TYPES][typ->value - 1] = key;
        pol->type_val_to_struct[typ->value - 1] = typ;
        return 0;
}

/*
 * hashtab_map() callback: record one user in sym_val_to_name[SYM_USERS]
 * and user_val_to_struct at slot value - 1.
 *
 * @datap is the policy database.  Returns -EINVAL when the value is 0 or
 * larger than the number of primary users, or when the user's bounds
 * value exceeds that number; 0 otherwise.
 */
static int user_index(void *key, void *datum, void *datap)
{
        struct user_datum *usr = datum;
        struct policydb *pol = datap;

        if (usr->value == 0 || usr->value > pol->p_users.nprim)
                return -EINVAL;
        if (usr->bounds > pol->p_users.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_USERS][usr->value - 1] = key;
        pol->user_val_to_struct[usr->value - 1] = usr;
        return 0;
}

/*
 * hashtab_map() callback: record the name of one sensitivity in
 * sym_val_to_name[SYM_LEVELS] at slot sens - 1.
 *
 * Alias entries are skipped without any checks.  @datap is the policy
 * database.  For non-alias entries, returns -EINVAL when the sensitivity
 * is 0 or larger than the number of primary levels; 0 otherwise.
 */
static int sens_index(void *key, void *datum, void *datap)
{
        struct level_datum *lev = datum;
        struct policydb *pol = datap;

        if (lev->isalias)
                return 0;

        if (lev->level.sens == 0 || lev->level.sens > pol->p_levels.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_LEVELS][lev->level.sens - 1] = key;
        return 0;
}

/*
 * hashtab_map() callback: record the name of one category in
 * sym_val_to_name[SYM_CATS] at slot value - 1.
 *
 * Alias entries are skipped without any checks.  @datap is the policy
 * database.  For non-alias entries, returns -EINVAL when the value is 0
 * or larger than the number of primary categories; 0 otherwise.
 */
static int cat_index(void *key, void *datum, void *datap)
{
        struct cat_datum *cat = datum;
        struct policydb *pol = datap;

        if (cat->isalias)
                return 0;

        if (cat->value == 0 || cat->value > pol->p_cats.nprim)
                return -EINVAL;

        pol->sym_val_to_name[SYM_CATS][cat->value - 1] = key;
        return 0;
}

/* clang-format off */
/*
 * Per-symbol-table index callbacks.  policydb_index() hands index_f[i]
 * to hashtab_map() for p->symtab[i] with the policy database as the
 * callback argument, so entry i must be the callback matching symbol
 * table i (same order as destroy_f[]).
 */
static int (*const index_f[SYM_NUM])(void *key, void *datum, void *datap) = {
        common_index,
        class_index,
        role_index,
        type_index,
        user_index,
        cond_index_bool,
        sens_index,
        cat_index,
};
/* clang-format on */

#ifdef CONFIG_SECURITY_SELINUX_DEBUG
/*
 * Emit a debug line with usage statistics for hash table @h.
 *
 * @hash_name:    label printed for the table.
 * @hash_details: optional qualifier; when non-NULL it is appended to the
 *                label after an '@' separator.
 */
static void hash_eval(struct hashtab *h, const char *hash_name,
                      const char *hash_details)
{
        const char *separator = hash_details ? "@" : "";
        const char *details = hash_details ? hash_details : "";
        struct hashtab_info info;

        hashtab_stat(h, &info);
        pr_debug(
                "SELinux: %s%s%s:  %d entries and %d/%d buckets used, longest chain length %d, sum of chain length^2 %llu\n",
                hash_name, separator, details, h->nel, info.slots_used,
                h->size, info.max_chain_len, info.chain2_len_sum);
}

/*
 * Emit hash statistics for each of the SYM_NUM symbol tables in the
 * array @s, labelled with the matching symtab_name[] entry.
 */
static void symtab_hash_eval(struct symtab *s)
{
        int idx = 0;

        while (idx < SYM_NUM) {
                hash_eval(&s[idx].table, symtab_name[idx], NULL);
                idx++;
        }
}

#else
/* Statistics are compiled out without CONFIG_SECURITY_SELINUX_DEBUG. */
static inline void hash_eval(struct hashtab *h, const char *hash_name,
                             const char *hash_details)
{
}
static inline void symtab_hash_eval(struct symtab *s)
{
}
#endif /* CONFIG_SECURITY_SELINUX_DEBUG */

/*
 * Define the other val_to_name and val_to_struct arrays
 * in a policy database structure.
 *
 * Caller must clean up on failure.
 */
static int policydb_index(struct policydb *p)
{
        int i, rc;

        if (p->mls_enabled)
                pr_debug(
                        "SELinux:  %d users, %d roles, %d types, %d bools, %d sens, %d cats\n",
                        p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
                        p->p_bools.nprim, p->p_levels.nprim, p->p_cats.nprim);
        else
                pr_debug("SELinux:  %d users, %d roles, %d types, %d bools\n",
                         p->p_users.nprim, p->p_roles.nprim, p->p_types.nprim,
                         p->p_bools.nprim);

        pr_debug("SELinux:  %d classes, %d rules\n", p->p_classes.nprim,
                 p->te_avtab.nel);

        avtab_hash_eval(&p->te_avtab, "rules");
        symtab_hash_eval(p->symtab);

        p->class_val_to_struct = kcalloc(p->p_classes.nprim,
                                         sizeof(*p->class_val_to_struct),
                                         GFP_KERNEL);
        if (!p->class_val_to_struct)
                return -ENOMEM;

        p->role_val_to_struct = kcalloc(
                p->p_roles.nprim, sizeof(*p->role_val_to_struct), GFP_KERNEL);
        if (!p->role_val_to_struct)
                return -ENOMEM;

        p->user_val_to_struct = kcalloc(
                p->p_users.nprim, sizeof(*p->user_val_to_struct), GFP_KERNEL);
        if (!p->user_val_to_struct)
                return -ENOMEM;

        p->type_val_to_struct = kvcalloc(
                p->p_types.nprim, sizeof(*p->type_val_to_struct), GFP_KERNEL);
        if (!p->type_val_to_struct)
                return -ENOMEM;

        rc = cond_init_bool_indexes(p);
        if (rc)
                goto out;

        for (i = 0; i < SYM_NUM; i++) {
                p->sym_val_to_name[i] = kvcalloc(p->symtab[i].nprim,
                                                 sizeof(char *), GFP_KERNEL);
                if (!p->sym_val_to_name[i])
                        return -ENOMEM;

                rc = hashtab_map(&p->symtab[i].table, index_f[i], p);
                if (rc)
                        goto out;
        }
        rc = 0;
out:
        return rc;
}

/*
 * Free any memory allocated by a policy database structure.
 *
 * Releases, in order: the symbol tables and their entries, the
 * val_to_name / val_to_struct arrays, te_avtab, every object context
 * list, the genfs list with its contexts, the conditional policy state,
 * the role transition table, the role allow list, the filename and range
 * transition tables, the type attribute map and the remaining ebitmaps.
 * Reschedule points are offered inside the longer loops.  The structure
 * itself is not freed.
 */
void policydb_destroy(struct policydb *p)
{
        struct ocontext *oc, *oc_next;
        struct genfs *fs, *fs_next;
        struct role_allow *allow, *allow_next;
        u32 i;

        for (i = 0; i < SYM_NUM; i++) {
                struct hashtab *tab = &p->symtab[i].table;

                cond_resched();
                hashtab_map(tab, destroy_f[i], NULL);
                hashtab_destroy(tab);
        }

        for (i = 0; i < SYM_NUM; i++)
                kvfree(p->sym_val_to_name[i]);

        kfree(p->class_val_to_struct);
        kfree(p->role_val_to_struct);
        kfree(p->user_val_to_struct);
        kvfree(p->type_val_to_struct);

        avtab_destroy(&p->te_avtab);

        for (i = 0; i < OCON_NUM; i++) {
                cond_resched();
                for (oc = p->ocontexts[i]; oc; oc = oc_next) {
                        oc_next = oc->next;
                        ocontext_destroy(oc, i);
                }
                p->ocontexts[i] = NULL;
        }

        for (fs = p->genfs; fs; fs = fs_next) {
                cond_resched();
                kfree(fs->fstype);
                /* genfs contexts are freed as OCON_FSUSE, i.e. with their name */
                for (oc = fs->head; oc; oc = oc_next) {
                        oc_next = oc->next;
                        ocontext_destroy(oc, OCON_FSUSE);
                }
                fs_next = fs->next;
                kfree(fs);
        }
        p->genfs = NULL;

        cond_policydb_destroy(p);

        hashtab_map(&p->role_tr, role_tr_destroy, NULL);
        hashtab_destroy(&p->role_tr);

        for (allow = p->role_allow; allow; allow = allow_next) {
                cond_resched();
                allow_next = allow->next;
                kfree(allow);
        }

        hashtab_map(&p->filename_trans, filenametr_destroy, NULL);
        hashtab_destroy(&p->filename_trans);

        hashtab_map(&p->range_tr, range_tr_destroy, NULL);
        hashtab_destroy(&p->range_tr);

        if (p->type_attr_map_array) {
                for (i = 0; i < p->p_types.nprim; i++)
                        ebitmap_destroy(&p->type_attr_map_array[i]);
                kvfree(p->type_attr_map_array);
        }

        ebitmap_destroy(&p->filename_trans_ttypes);
        ebitmap_destroy(&p->policycaps);
        ebitmap_destroy(&p->permissive_map);
}

/*
 * Load the initial SIDs specified in a policy database
 * structure into a SID table.
 */
int policydb_load_isids(struct policydb *p, struct sidtab *s)
{
        struct ocontext *head, *c;
        bool isid_init;
        int rc;

        rc = sidtab_init(s);
        if (rc) {
                pr_err("SELinux:  out of memory on SID table init\n");
                return rc;
        }

        isid_init = ebitmap_get_bit(&p->policycaps,
                                    POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT);

        head = p->ocontexts[OCON_ISID];
        for (c = head; c; c = c->next) {
                u32 sid = c->sid[0];
                const char *name = security_get_initial_sid_context(sid);

                if (sid == SECSID_NULL) {
                        pr_err("SELinux:  SID 0 was assigned a context.\n");
                        sidtab_destroy(s);
                        return -EINVAL;
                }

                /* Ignore initial SIDs unused by this kernel. */
                if (!name)
                        continue;

                /*
                 * Also ignore SECINITSID_INIT if the policy doesn't declare
                 * support for it
                 */
                if (sid == SECINITSID_INIT && !isid_init)
                        continue;

                rc = sidtab_set_initial(s, sid, &c->context[0]);
                if (rc) {
                        pr_err("SELinux:  unable to load initial SID %s.\n",
                               name);
                        sidtab_destroy(s);
                        return rc;
                }

                /*
                 * If the policy doesn't support the "userspace_initial_context"
                 * capability, set SECINITSID_INIT to the same context as
                 * SECINITSID_KERNEL. This ensures the same behavior as before
                 * the reintroduction of SECINITSID_INIT, where all tasks
                 * started before policy load would initially get the context
                 * corresponding to SECINITSID_KERNEL.
                 */
                if (sid == SECINITSID_KERNEL && !isid_init) {
                        rc = sidtab_set_initial(s, SECINITSID_INIT,
                                                &c->context[0]);
                        if (rc) {
                                pr_err("SELinux:  unable to load initial SID %s.\n",
                                       name);
                                sidtab_destroy(s);
                                return rc;
                        }
                }
        }
        return 0;
}

/*
 * Return 1 if @class lies in 1..p->p_classes.nprim, 0 otherwise.
 */
int policydb_class_isvalid(struct policydb *p, unsigned int class)
{
        return class != 0 && class <= p->p_classes.nprim;
}

/*
 * Return 1 if @role lies in 1..p->p_roles.nprim, 0 otherwise.
 */
int policydb_role_isvalid(struct policydb *p, unsigned int role)
{
        return role != 0 && role <= p->p_roles.nprim;
}

/*
 * Return 1 if @type lies in 1..p->p_types.nprim, 0 otherwise.
 */
int policydb_type_isvalid(struct policydb *p, unsigned int type)
{
        return type != 0 && type <= p->p_types.nprim;
}

/*
 * Return 1 if the fields in the security context
 * structure `c' are valid.  Return 0 otherwise.
 *
 * Role, user and type must each lie within 1..nprim of their symbol
 * table.  Unless the role is OBJECT_R_VAL, the role must additionally be
 * authorized for the type and the user for the role.  Finally the MLS
 * part is checked by mls_context_isvalid().
 */
int policydb_context_isvalid(struct policydb *p, struct context *c)
{
        struct role_datum *r;
        struct user_datum *u;

        if (!c->role || c->role > p->p_roles.nprim ||
            !c->user || c->user > p->p_users.nprim ||
            !c->type || c->type > p->p_types.nprim)
                return 0;

        if (c->role != OBJECT_R_VAL) {
                /* Role must be authorized for the type. */
                r = p->role_val_to_struct[c->role - 1];
                if (!r)
                        return 0;
                if (!ebitmap_get_bit(&r->types, c->type - 1))
                        return 0;

                /* User must be authorized for the role. */
                u = p->user_val_to_struct[c->user - 1];
                if (!u)
                        return 0;
                if (!ebitmap_get_bit(&u->roles, c->role - 1))
                        return 0;
        }

        return mls_context_isvalid(p, c) ? 1 : 0;
}

/*
 * Read a MLS range structure from a policydb binary
 * representation file.
 */
static int mls_read_range_helper(struct mls_range *r, struct policy_file *fp)
{
        __le32 buf[2];
        u32 items;
        int rc;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                goto out;

        rc = -EINVAL;
        items = le32_to_cpu(buf[0]);
        if (items > ARRAY_SIZE(buf)) {
                pr_err("SELinux: mls:  range overflow\n");
                goto out;
        }

        rc = next_entry(buf, fp, sizeof(u32) * items);
        if (rc) {
                pr_err("SELinux: mls:  truncated range\n");
                goto out;
        }

        r->level[0].sens = le32_to_cpu(buf[0]);
        if (items > 1)
                r->level[1].sens = le32_to_cpu(buf[1]);
        else
                r->level[1].sens = r->level[0].sens;

        rc = ebitmap_read(&r->level[0].cat, fp);
        if (rc) {
                pr_err("SELinux: mls:  error reading low categories\n");
                goto out;
        }
        if (items > 1) {
                rc = ebitmap_read(&r->level[1].cat, fp);
                if (rc) {
                        pr_err("SELinux: mls:  error reading high categories\n");
                        goto bad_high;
                }
        } else {
                rc = ebitmap_cpy(&r->level[1].cat, &r->level[0].cat);
                if (rc) {
                        pr_err("SELinux: mls:  out of memory\n");
                        goto bad_high;
                }
        }

        return 0;
bad_high:
        ebitmap_destroy(&r->level[0].cat);
out:
        return rc;
}

/*
 * Read and validate a security context structure
 * from a policydb binary representation file.
 *
 * Reads the user, role and type values (three 32-bit words) and, for
 * policy versions from POLICYDB_VERSION_MLS on, the MLS range.  The
 * result is then checked with policydb_context_isvalid(); an invalid
 * context is destroyed and -EINVAL is returned.
 *
 * Returns 0 on success or a negative errno.
 */
static int context_read_and_validate(struct context *c, struct policydb *p,
                                     struct policy_file *fp)
{
        __le32 ids[3];
        int rc;

        rc = next_entry(ids, fp, sizeof(ids));
        if (rc) {
                pr_err("SELinux: context truncated\n");
                return rc;
        }

        c->user = le32_to_cpu(ids[0]);
        c->role = le32_to_cpu(ids[1]);
        c->type = le32_to_cpu(ids[2]);

        if (p->policyvers >= POLICYDB_VERSION_MLS) {
                rc = mls_read_range_helper(&c->range, fp);
                if (rc) {
                        pr_err("SELinux: error reading MLS range of context\n");
                        return rc;
                }
        }

        if (policydb_context_isvalid(p, c))
                return 0;

        pr_err("SELinux:  invalid security context\n");
        context_destroy(c);
        return -EINVAL;
}

/*
 * The following *_read functions are used to
 * read the symbol data from a policy database
 * binary representation file.
 */

/*
 * Read a string of @len bytes from @fp into a freshly allocated,
 * NUL-terminated buffer and store it in *@strp.
 *
 * @flags is the allocation mask; __GFP_NOWARN is added to it.
 *
 * Returns 0 on success, -EINVAL when @len is 0 or (u32)-1 (for which
 * len + 1 would wrap), -ENOMEM when the allocation fails, or the error
 * from next_entry().  *@strp is only written on success.
 */
int str_read(char **strp, gfp_t flags, struct policy_file *fp, u32 len)
{
        char *buf;
        int rc;

        if (len == 0 || len == (u32)-1)
                return -EINVAL;

        buf = kmalloc(len + 1, flags | __GFP_NOWARN);
        if (!buf)
                return -ENOMEM;

        rc = next_entry(buf, fp, len);
        if (!rc) {
                buf[len] = '\0';
                *strp = buf;
                return 0;
        }

        kfree(buf);
        return rc;
}

/*
 * Read one permission entry from @fp and insert it into symbol table @s.
 *
 * The record consists of two 32-bit words (name length, permission
 * value) followed by the name.  @p is not used here.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * allocated here are released through perm_destroy().
 */
static int perm_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        struct perm_datum *perm;
        char *name = NULL;
        __le32 hdr[2];
        u32 name_len;
        int rc;

        perm = kzalloc(sizeof(*perm), GFP_KERNEL);
        if (!perm)
                return -ENOMEM;

        rc = next_entry(hdr, fp, sizeof(hdr));
        if (rc)
                goto err;

        name_len = le32_to_cpu(hdr[0]);
        perm->value = le32_to_cpu(hdr[1]);

        rc = str_read(&name, GFP_KERNEL, fp, name_len);
        if (rc)
                goto err;

        rc = symtab_insert(s, name, perm);
        if (!rc)
                return 0;
err:
        perm_destroy(name, perm, NULL);
        return rc;
}

/*
 * Read one common entry from @fp and insert it into symbol table @s.
 *
 * The record consists of four 32-bit words (name length, value, number
 * of primary permissions, number of permission entries), the name and
 * then the permission entries, which are read with perm_read() into the
 * common's own permission table.
 *
 * Returns 0 on success or a negative errno; on failure everything built
 * here is released through common_destroy().
 */
static int common_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        struct common_datum *common;
        char *name = NULL;
        __le32 hdr[4];
        u32 name_len, perm_count, n;
        int rc;

        common = kzalloc(sizeof(*common), GFP_KERNEL);
        if (!common)
                return -ENOMEM;

        rc = next_entry(hdr, fp, sizeof(hdr));
        if (rc)
                goto err;

        name_len = le32_to_cpu(hdr[0]);
        common->value = le32_to_cpu(hdr[1]);
        perm_count = le32_to_cpu(hdr[3]);

        rc = symtab_init(&common->permissions, perm_count);
        if (rc)
                goto err;
        /* set only after symtab_init() has run on the table */
        common->permissions.nprim = le32_to_cpu(hdr[2]);

        rc = str_read(&name, GFP_KERNEL, fp, name_len);
        if (rc)
                goto err;

        for (n = 0; n < perm_count; n++) {
                rc = perm_read(p, &common->permissions, fp);
                if (rc)
                        goto err;
        }

        hash_eval(&common->permissions.table, "common_permissions", name);

        rc = symtab_insert(s, name, common);
        if (!rc)
                return 0;
err:
        common_destroy(name, common, NULL);
        return rc;
}

/*
 * Initialize both ebitmaps (types and negset) of type set @t.  The flags
 * member is not touched here.
 */
static void type_set_init(struct type_set *t)
{
        ebitmap_init(&t->types);
        ebitmap_init(&t->negset);
}

/*
 * Read a type set from @fp: the types ebitmap, the negset ebitmap and a
 * 32-bit flags word, in that order.
 *
 * Returns 0 on success and -EINVAL on any read failure (the underlying
 * error code is not passed on).  Nothing is released here on failure.
 */
static int type_set_read(struct type_set *t, struct policy_file *fp)
{
        __le32 raw_flags;

        if (ebitmap_read(&t->types, fp) || ebitmap_read(&t->negset, fp))
                return -EINVAL;

        if (next_entry(&raw_flags, fp, sizeof(u32)) < 0)
                return -EINVAL;

        t->flags = le32_to_cpu(raw_flags);
        return 0;
}

/*
 * Read @ncons constraint nodes from @fp and chain them onto *@nodep.
 *
 * @p:            policy database; only its policyvers is consulted here.
 * @nodep:        receives the head of the list (left untouched when
 *                @ncons is 0).
 * @ncons:        number of constraint nodes to read.
 * @allowxtarget: when zero, a CEXPR_NAMES expression whose attr has
 *                CEXPR_XTARGET set is rejected with -EINVAL.
 * @fp:           policy file being read.
 *
 * Each node is stored as two 32-bit words (permissions, expression
 * count) followed by that many expressions of three 32-bit words each
 * (expr_type, attr, op); CEXPR_NAMES expressions carry extra data.
 *
 * Returns 0 on success or a negative errno.  Nothing is freed here on
 * failure: every node and expression is linked into the list before its
 * contents are read, so the partially built list stays reachable through
 * *@nodep and is presumably released by the caller's destroy path
 * (class_read() runs cls_destroy() on error) - confirm for new callers.
 */
static int read_cons_helper(struct policydb *p, struct constraint_node **nodep,
                            u32 ncons, int allowxtarget, struct policy_file *fp)
{
        struct constraint_node *c, *lc;
        struct constraint_expr *e, *le;
        __le32 buf[3];
        u32 i, j, nexpr;
        int rc, depth;

        lc = NULL; /* last constraint node appended so far */
        for (i = 0; i < ncons; i++) {
                c = kzalloc(sizeof(*c), GFP_KERNEL);
                if (!c)
                        return -ENOMEM;

                /* Link first, so the node is reachable if a read below fails. */
                if (lc)
                        lc->next = c;
                else
                        *nodep = c;

                rc = next_entry(buf, fp, (sizeof(u32) * 2));
                if (rc)
                        return rc;
                c->permissions = le32_to_cpu(buf[0]);
                nexpr = le32_to_cpu(buf[1]);
                le = NULL; /* last expression appended to this node */
                /*
                 * depth is the index of the top operand slot, -1 meaning
                 * "no operand yet": ATTR/NAMES add one operand, AND/OR
                 * consume two and leave one, NOT needs one and leaves one.
                 */
                depth = -1;
                for (j = 0; j < nexpr; j++) {
                        e = kzalloc(sizeof(*e), GFP_KERNEL);
                        if (!e)
                                return -ENOMEM;

                        /* Same link-before-read rule as for the nodes. */
                        if (le)
                                le->next = e;
                        else
                                c->expr = e;

                        rc = next_entry(buf, fp, (sizeof(u32) * 3));
                        if (rc)
                                return rc;
                        e->expr_type = le32_to_cpu(buf[0]);
                        e->attr = le32_to_cpu(buf[1]);
                        e->op = le32_to_cpu(buf[2]);

                        switch (e->expr_type) {
                        case CEXPR_NOT:
                                /* needs at least one operand */
                                if (depth < 0)
                                        return -EINVAL;
                                break;
                        case CEXPR_AND:
                        case CEXPR_OR:
                                /* needs two operands, leaves one */
                                if (depth < 1)
                                        return -EINVAL;
                                depth--;
                                break;
                        case CEXPR_ATTR:
                                /* adds an operand; at most CEXPR_MAXDEPTH of them */
                                if (depth == (CEXPR_MAXDEPTH - 1))
                                        return -EINVAL;
                                depth++;
                                break;
                        case CEXPR_NAMES:
                                if (!allowxtarget && (e->attr & CEXPR_XTARGET))
                                        return -EINVAL;
                                if (depth == (CEXPR_MAXDEPTH - 1))
                                        return -EINVAL;
                                depth++;
                                rc = ebitmap_read(&e->names, fp);
                                if (rc)
                                        return rc;
                                /* Newer policies append a type set to the names. */
                                if (p->policyvers >=
                                    POLICYDB_VERSION_CONSTRAINT_NAMES) {
                                        e->type_names =
                                                kzalloc(sizeof(*e->type_names),
                                                        GFP_KERNEL);
                                        if (!e->type_names)
                                                return -ENOMEM;
                                        type_set_init(e->type_names);
                                        rc = type_set_read(e->type_names, fp);
                                        if (rc)
                                                return rc;
                                }
                                break;
                        default:
                                return -EINVAL;
                        }
                        le = e;
                }
                /*
                 * Exactly one operand must remain; this also rejects a
                 * node with no expressions (depth is still -1).
                 */
                if (depth != 0)
                        return -EINVAL;
                lc = c;
        }

        return 0;
}

/*
 * Read one class entry from @fp and insert it into symbol table @s.
 *
 * The record starts with six 32-bit words: name length, common name
 * length, class value, number of primary permissions, number of
 * permission entries and number of constraints.  They are followed by
 * the class name, the optional common name (which must already exist in
 * p->p_commons), the permission entries and the constraints.  Depending
 * on the policy version, validatetrans rules and the default_user /
 * default_role / default_range / default_type settings follow.
 *
 * Returns 0 on success or a negative errno; on failure everything built
 * here is released through cls_destroy().
 */
static int class_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        struct class_datum *cls;
        char *name = NULL;
        __le32 words[6];
        u32 name_len, common_len, perm_count, cons_count, n;
        int rc;

        cls = kzalloc(sizeof(*cls), GFP_KERNEL);
        if (!cls)
                return -ENOMEM;

        rc = next_entry(words, fp, sizeof(words));
        if (rc)
                goto err;

        name_len = le32_to_cpu(words[0]);
        common_len = le32_to_cpu(words[1]);
        cls->value = le32_to_cpu(words[2]);
        perm_count = le32_to_cpu(words[4]);

        rc = symtab_init(&cls->permissions, perm_count);
        if (rc)
                goto err;
        /* set only after symtab_init() has run on the table */
        cls->permissions.nprim = le32_to_cpu(words[3]);

        cons_count = le32_to_cpu(words[5]);

        rc = str_read(&name, GFP_KERNEL, fp, name_len);
        if (rc)
                goto err;

        if (common_len) {
                rc = str_read(&cls->comkey, GFP_KERNEL, fp, common_len);
                if (rc)
                        goto err;

                cls->comdatum = symtab_search(&p->p_commons, cls->comkey);
                if (!cls->comdatum) {
                        pr_err("SELinux:  unknown common %s\n",
                               cls->comkey);
                        rc = -EINVAL;
                        goto err;
                }
        }

        for (n = 0; n < perm_count; n++) {
                rc = perm_read(p, &cls->permissions, fp);
                if (rc)
                        goto err;
        }

        hash_eval(&cls->permissions.table, "class_permissions", name);

        rc = read_cons_helper(p, &cls->constraints, cons_count, 0, fp);
        if (rc)
                goto err;

        if (p->policyvers >= POLICYDB_VERSION_VALIDATETRANS) {
                /* grab the validatetrans rules */
                rc = next_entry(words, fp, sizeof(u32));
                if (rc)
                        goto err;

                cons_count = le32_to_cpu(words[0]);
                rc = read_cons_helper(p, &cls->validatetrans, cons_count, 1,
                                      fp);
                if (rc)
                        goto err;
        }

        if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) {
                rc = next_entry(words, fp, sizeof(u32) * 3);
                if (rc)
                        goto err;

                cls->default_user = le32_to_cpu(words[0]);
                cls->default_role = le32_to_cpu(words[1]);
                cls->default_range = le32_to_cpu(words[2]);
        }

        if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) {
                rc = next_entry(words, fp, sizeof(u32) * 1);
                if (rc)
                        goto err;

                cls->default_type = le32_to_cpu(words[0]);
        }

        rc = symtab_insert(s, name, cls);
        if (!rc)
                return 0;
err:
        cls_destroy(name, cls, NULL);
        return rc;
}

/*
 * Read one role entry from @fp and add it to the symbol table @s.
 *
 * The record starts with two little-endian words (key length, role value),
 * plus a third "bounds" word when the policy version is at least
 * POLICYDB_VERSION_BOUNDARY.  The key string and two ebitmaps (dominates,
 * types) follow.
 *
 * An entry whose key equals OBJECT_R is never inserted: its value is
 * compared with OBJECT_R_VAL and the entry is then released through the
 * cleanup path, yielding 0 on a match and -EINVAL on a mismatch.
 *
 * Returns 0 on success or a negative errno.  On every path that does not
 * insert the entry, the key and datum are handed to role_destroy().
 */
static int role_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        const int has_bounds = p->policyvers >= POLICYDB_VERSION_BOUNDARY;
        struct role_datum *role;
        char *key = NULL;
        __le32 raw[3];
        u32 keylen;
        int rc;

        role = kzalloc(sizeof(*role), GFP_KERNEL);
        if (!role)
                return -ENOMEM;

        /* length + value, and the bounds word only on newer policies */
        rc = next_entry(raw, fp, sizeof(raw[0]) * (has_bounds ? 3 : 2));
        if (rc)
                goto bad;

        keylen = le32_to_cpu(raw[0]);
        role->value = le32_to_cpu(raw[1]);
        if (has_bounds)
                role->bounds = le32_to_cpu(raw[2]);

        rc = str_read(&key, GFP_KERNEL, fp, keylen);
        if (!rc)
                rc = ebitmap_read(&role->dominates, fp);
        if (!rc)
                rc = ebitmap_read(&role->types, fp);
        if (rc)
                goto bad;

        if (strcmp(key, OBJECT_R) == 0) {
                /* validated and then dropped, never inserted */
                rc = 0;
                if (role->value != OBJECT_R_VAL) {
                        pr_err("SELinux: Role %s has wrong value %d\n",
                               OBJECT_R, role->value);
                        rc = -EINVAL;
                }
                goto bad;
        }

        rc = symtab_insert(s, key, role);
        if (!rc)
                return 0;
bad:
        role_destroy(key, role, NULL);
        return rc;
}

/*
 * Read one type entry from @fp and add it to the symbol table @s.
 *
 * The record holds three little-endian words (key length, value, and
 * either a property bitmask or a plain "primary" word), plus a fourth
 * "bounds" word when the policy version is at least
 * POLICYDB_VERSION_BOUNDARY.  The key string follows.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * are handed to type_destroy().
 */
static int type_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        const int has_bounds = p->policyvers >= POLICYDB_VERSION_BOUNDARY;
        struct type_datum *typdatum;
        char *key = NULL;
        __le32 raw[4];
        u32 keylen;
        int rc;

        typdatum = kzalloc(sizeof(*typdatum), GFP_KERNEL);
        if (!typdatum)
                return -ENOMEM;

        rc = next_entry(raw, fp, sizeof(raw[0]) * (has_bounds ? 4 : 3));
        if (rc)
                goto bad;

        keylen = le32_to_cpu(raw[0]);
        typdatum->value = le32_to_cpu(raw[1]);
        if (!has_bounds) {
                /* older format: third word is the primary flag itself */
                typdatum->primary = le32_to_cpu(raw[2]);
        } else {
                /* newer format: third word is a property bitmask */
                const u32 props = le32_to_cpu(raw[2]);

                if (props & TYPEDATUM_PROPERTY_PRIMARY)
                        typdatum->primary = 1;
                if (props & TYPEDATUM_PROPERTY_ATTRIBUTE)
                        typdatum->attribute = 1;

                typdatum->bounds = le32_to_cpu(raw[3]);
        }

        rc = str_read(&key, GFP_KERNEL, fp, keylen);
        if (!rc)
                rc = symtab_insert(s, key, typdatum);
        if (!rc)
                return 0;
bad:
        type_destroy(key, typdatum, NULL);
        return rc;
}

/*
 * Fill in the MLS level @lp from the binary policy stream @fp.
 *
 * @lp is zeroed first, then one little-endian word is read as the
 * sensitivity, followed by the category ebitmap.  Returns 0 on success,
 * or the error from next_entry()/ebitmap_read() after logging it.
 */
static int mls_read_level(struct mls_level *lp, struct policy_file *fp)
{
        __le32 sens_word[1];
        int rc;

        memset(lp, 0, sizeof(*lp));

        rc = next_entry(sens_word, fp, sizeof sens_word);
        if (rc) {
                pr_err("SELinux: mls: truncated level\n");
                return rc;
        }
        lp->sens = le32_to_cpu(sens_word[0]);

        rc = ebitmap_read(&lp->cat, fp);
        if (rc)
                pr_err("SELinux: mls:  error reading level categories\n");

        return rc;
}

/*
 * Read one user entry from @fp and add it to the symbol table @s.
 *
 * The record starts with two little-endian words (key length, value),
 * plus a "bounds" word when the policy version is at least
 * POLICYDB_VERSION_BOUNDARY.  The key string and the roles ebitmap follow.
 * Policies at or above POLICYDB_VERSION_MLS also carry an MLS range and a
 * default level.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * are handed to user_destroy().
 */
static int user_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        const int has_bounds = p->policyvers >= POLICYDB_VERSION_BOUNDARY;
        struct user_datum *usrdatum;
        char *key = NULL;
        __le32 raw[3];
        u32 keylen;
        int rc;

        usrdatum = kzalloc(sizeof(*usrdatum), GFP_KERNEL);
        if (!usrdatum)
                return -ENOMEM;

        rc = next_entry(raw, fp, sizeof(raw[0]) * (has_bounds ? 3 : 2));
        if (rc)
                goto bad;

        keylen = le32_to_cpu(raw[0]);
        usrdatum->value = le32_to_cpu(raw[1]);
        if (has_bounds)
                usrdatum->bounds = le32_to_cpu(raw[2]);

        rc = str_read(&key, GFP_KERNEL, fp, keylen);
        if (!rc)
                rc = ebitmap_read(&usrdatum->roles, fp);
        if (rc)
                goto bad;

        if (p->policyvers >= POLICYDB_VERSION_MLS) {
                rc = mls_read_range_helper(&usrdatum->range, fp);
                if (!rc)
                        rc = mls_read_level(&usrdatum->dfltlevel, fp);
                if (rc)
                        goto bad;
        }

        rc = symtab_insert(s, key, usrdatum);
        if (!rc)
                return 0;
bad:
        user_destroy(key, usrdatum, NULL);
        return rc;
}

/*
 * Read one sensitivity entry from @fp and add it to the symbol table @s.
 *
 * The record holds two little-endian words (key length, alias flag),
 * the key string, and an MLS level.  @p is not used here; it is part of
 * the common reader signature.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * are handed to sens_destroy().
 */
static int sens_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        struct level_datum *levdatum;
        char *key = NULL;
        __le32 raw[2];
        u32 keylen;
        int rc;

        levdatum = kzalloc(sizeof(*levdatum), GFP_KERNEL);
        if (!levdatum)
                return -ENOMEM;

        rc = next_entry(raw, fp, sizeof raw);
        if (rc)
                goto bad;

        keylen = le32_to_cpu(raw[0]);
        levdatum->isalias = le32_to_cpu(raw[1]);

        rc = str_read(&key, GFP_KERNEL, fp, keylen);
        if (!rc)
                rc = mls_read_level(&levdatum->level, fp);
        if (!rc)
                rc = symtab_insert(s, key, levdatum);
        if (!rc)
                return 0;
bad:
        sens_destroy(key, levdatum, NULL);
        return rc;
}

/*
 * Read one category entry from @fp and add it to the symbol table @s.
 *
 * The record holds three little-endian words (key length, value, alias
 * flag) followed by the key string.  @p is not used here; it is part of
 * the common reader signature.
 *
 * Returns 0 on success or a negative errno; on failure the key and datum
 * are handed to cat_destroy().
 */
static int cat_read(struct policydb *p, struct symtab *s, struct policy_file *fp)
{
        struct cat_datum *catdatum;
        char *key = NULL;
        __le32 raw[3];
        u32 keylen;
        int rc;

        catdatum = kzalloc(sizeof(*catdatum), GFP_KERNEL);
        if (!catdatum)
                return -ENOMEM;

        rc = next_entry(raw, fp, sizeof raw);
        if (rc)
                goto bad;

        keylen = le32_to_cpu(raw[0]);
        catdatum->value = le32_to_cpu(raw[1]);
        catdatum->isalias = le32_to_cpu(raw[2]);

        rc = str_read(&key, GFP_KERNEL, fp, keylen);
        if (!rc)
                rc = symtab_insert(s, key, catdatum);
        if (!rc)
                return 0;
bad:
        cat_destroy(key, catdatum, NULL);
        return rc;
}

/*
 * Per-symbol-table reader functions, SYM_NUM entries in total.  Each one
 * reads a single entry from the policy file and inserts it into the
 * symbol table it is given.
 *
 * NOTE(review): the position of each reader presumably has to match the
 * numbering of the SYM_* symbol-table indices (commons, classes, roles,
 * types, users, bools, levels, cats) - confirm against the definition of
 * those constants before reordering.
 */
/* clang-format off */
static int (*const read_f[SYM_NUM])(struct policydb *p, struct symtab *s,
                                    struct policy_file *fp) = {
        common_read,
        class_read,
        role_read,
        type_read,
        user_read,
        cond_read_bool,
        sens_read,
        cat_read,
};
/* clang-format on */

/*
 * hashtab_map() callback: check the bounds chain of one user.
 *
 * @key is the user name, @datum its struct user_datum and @datap the
 * policydb.  The chain of ->bounds links is followed upwards; at every
 * step each role set in the original user must also be set in the
 * bounding user.  The walk is aborted once POLICYDB_BOUNDS_MAXDEPTH steps
 * are reached, which also catches looped chains.
 *
 * Returns 0 if the chain is consistent, -EINVAL otherwise.
 */
static int user_bounds_sanity_check(void *key, void *datum, void *datap)
{
        struct policydb *p = datap;
        struct user_datum *user = datum;
        struct user_datum *parent = user;
        int depth;

        for (depth = 1; parent->bounds; depth++) {
                struct ebitmap_node *n;
                u32 role_bit;

                if (depth == POLICYDB_BOUNDS_MAXDEPTH) {
                        pr_err("SELinux: user %s: "
                               "too deep or looped boundary\n",
                               (char *)key);
                        return -EINVAL;
                }

                /* bounds values are 1-based indices into the value table */
                parent = p->user_val_to_struct[parent->bounds - 1];
                ebitmap_for_each_positive_bit(&user->roles, n, role_bit)
                {
                        if (!ebitmap_get_bit(&parent->roles, role_bit)) {
                                pr_err("SELinux: boundary violated policy: "
                                       "user=%s role=%s bounds=%s\n",
                                       sym_name(p, SYM_USERS, user->value - 1),
                                       sym_name(p, SYM_ROLES, role_bit),
                                       sym_name(p, SYM_USERS,
                                                parent->value - 1));

                                return -EINVAL;
                        }
                }
        }

        return 0;
}

/*
 * hashtab_map() callback: check the bounds chain of one role.
 *
 * @key is the role name, @datum its struct role_datum and @datap the
 * policydb.  The chain of ->bounds links is followed upwards; at every
 * step each type set in the original role must also be set in the
 * bounding role.  The walk is aborted once POLICYDB_BOUNDS_MAXDEPTH steps
 * are reached, which also catches looped chains.
 *
 * Returns 0 if the chain is consistent, -EINVAL otherwise.
 */
static int role_bounds_sanity_check(void *key, void *datum, void *datap)
{
        struct policydb *p = datap;
        struct role_datum *role = datum;
        struct role_datum *parent = role;
        int depth;

        for (depth = 1; parent->bounds; depth++) {
                struct ebitmap_node *n;
                u32 type_bit;

                if (depth == POLICYDB_BOUNDS_MAXDEPTH) {
                        pr_err("SELinux: role %s: "
                               "too deep or looped bounds\n",
                               (char *)key);
                        return -EINVAL;
                }

                /* bounds values are 1-based indices into the value table */
                parent = p->role_val_to_struct[parent->bounds - 1];
                ebitmap_for_each_positive_bit(&role->types, n, type_bit)
                {
                        if (!ebitmap_get_bit(&parent->types, type_bit)) {
                                pr_err("SELinux: boundary violated policy: "
                                       "role=%s type=%s bounds=%s\n",
                                       sym_name(p, SYM_ROLES, role->value - 1),
                                       sym_name(p, SYM_TYPES, type_bit),
                                       sym_name(p, SYM_ROLES,
                                                parent->value - 1));

                                return -EINVAL;
                        }
                }
        }

        return 0;
}

/*
 * hashtab_map() callback: check the bounds chain of one type.
 *
 * @key is the type name, @datum its struct type_datum and @datap the
 * policydb.  The chain of ->bounds links is followed upwards and must
 * never reach a type flagged as an attribute.  The walk is aborted once
 * POLICYDB_BOUNDS_MAXDEPTH steps are reached, which also catches looped
 * chains.
 *
 * Returns 0 if the chain is consistent, -EINVAL otherwise.
 */
static int type_bounds_sanity_check(void *key, void *datum, void *datap)
{
        struct policydb *p = datap;
        struct type_datum *parent = datum;
        int depth;

        for (depth = 1; parent->bounds; depth++) {
                if (depth == POLICYDB_BOUNDS_MAXDEPTH) {
                        pr_err("SELinux: type %s: "
                               "too deep or looped boundary\n",
                               (char *)key);
                        return -EINVAL;
                }

                /* bounds values are 1-based indices into the value table */
                parent = p->type_val_to_struct[parent->bounds - 1];
                BUG_ON(!parent);

                if (parent->attribute) {
                        pr_err("SELinux: type %s: "
                               "bounded by attribute %s\n",
                               (char *)key,
                               sym_name(p, SYM_TYPES, parent->value - 1));
                        return -EINVAL;
                }
        }

        return 0;
}

/*
 * Run the bounds sanity checks over every user, role and type of @p.
 *
 * Policies older than POLICYDB_VERSION_BOUNDARY are accepted without any
 * check.  The three tables are checked in the order users, roles, types,
 * stopping at the first failure.
 *
 * Returns 0 on success or the first non-zero value returned by
 * hashtab_map().
 */
static int policydb_bounds_sanity_check(struct policydb *p)
{
        int rc;

        if (p->policyvers < POLICYDB_VERSION_BOUNDARY)
                return 0;

        rc = hashtab_map(&p->p_users.table, user_bounds_sanity_check, p);
        if (!rc)
                rc = hashtab_map(&p->p_roles.table, role_bounds_sanity_check,
                                 p);
        if (!rc)
                rc = hashtab_map(&p->p_types.table, type_bounds_sanity_check,
                                 p);

        return rc;
}

/*
 * Look up the security class called @name in @p.
 *
 * Returns the class value, or 0 when no class of that name exists.
 */
u16 string_to_security_class(struct policydb *p, const char *name)
{
        struct class_datum *cls = symtab_search(&p->p_classes, name);

        if (cls)
                return cls->value;

        return 0;
}

/*
 * Translate the permission @name of class @tclass into its access-vector
 * bit.
 *
 * The permission is looked up in the class's common permission set first
 * (if the class has one) and then in the class's own permissions.
 *
 * Returns the single-bit mask for the permission, or 0 when @tclass is
 * out of range or the permission is unknown.
 */
u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name)
{
        struct class_datum *cls;
        struct perm_datum *perm = NULL;

        if (tclass == 0 || tclass > p->p_classes.nprim)
                return 0;

        /* class values are 1-based */
        cls = p->class_val_to_struct[tclass - 1];

        if (cls->comdatum)
                perm = symtab_search(&cls->comdatum->permissions, name);
        if (!perm)
                perm = symtab_search(&cls->permissions, name);
        if (!perm)
                return 0;

        /* permission values are 1-based bit positions */
        return 1U << (perm->value - 1);
}

/*
 * Read the range transition table from @fp into p->range_tr.
 *
 * Nothing is read for policies older than POLICYDB_VERSION_MLS.  Otherwise
 * the table starts with an element count; each element holds the source
 * and target type, the target class (only stored in the file from
 * POLICYDB_VERSION_RANGETRANS on, p->process_class is used before that)
 * and the resulting MLS range.
 *
 * Returns 0 on success or a negative errno.  Entries that were already
 * inserted stay in p->range_tr on failure; only the entry under
 * construction is released here.
 */
static int range_read(struct policydb *p, struct policy_file *fp)
{
        struct range_trans *rt = NULL;
        struct mls_range *r = NULL;
        int rc;
        __le32 buf[2];
        u32 i, nel;

        if (p->policyvers < POLICYDB_VERSION_MLS)
                return 0;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;

        nel = le32_to_cpu(buf[0]);

        rc = hashtab_init(&p->range_tr, nel);
        if (rc)
                return rc;

        for (i = 0; i < nel; i++) {
                rc = -ENOMEM;
                rt = kzalloc(sizeof(*rt), GFP_KERNEL);
                if (!rt)
                        goto out;

                rc = next_entry(buf, fp, (sizeof(u32) * 2));
                if (rc)
                        goto out;

                rt->source_type = le32_to_cpu(buf[0]);
                rt->target_type = le32_to_cpu(buf[1]);
                if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto out;
                        rt->target_class = le32_to_cpu(buf[0]);
                } else
                        rt->target_class = p->process_class;

                rc = -EINVAL;
                if (!policydb_type_isvalid(p, rt->source_type) ||
                    !policydb_type_isvalid(p, rt->target_type) ||
                    !policydb_class_isvalid(p, rt->target_class))
                        goto out;

                rc = -ENOMEM;
                r = kzalloc(sizeof(*r), GFP_KERNEL);
                if (!r)
                        goto out;

                rc = mls_read_range_helper(r, fp);
                if (rc)
                        goto out;

                rc = -EINVAL;
                if (!mls_range_isvalid(p, r)) {
                        pr_warn("SELinux:  rangetrans:  invalid range\n");
                        goto out;
                }

                rc = hashtab_insert(&p->range_tr, rt, r, rangetr_key_params);
                if (rc)
                        goto out;

                /* the hash table owns both objects from here on */
                rt = NULL;
                r = NULL;
        }
        hash_eval(&p->range_tr, "rangetr", NULL);
        rc = 0;
out:
        kfree(rt);
        if (r) {
                /*
                 * A range that was read successfully but then rejected
                 * (invalid, or the insert failed) still holds its category
                 * bitmaps.  Release them before freeing the container;
                 * the struct was zero-allocated, so destroying a bitmap
                 * that was never filled in is harmless.
                 */
                ebitmap_destroy(&r->level[0].cat);
                ebitmap_destroy(&r->level[1].cat);
                kfree(r);
        }
        return rc;
}

/*
 * Read one filename transition rule in the layout that
 * filename_trans_read() selects for policies older than
 * POLICYDB_VERSION_COMP_FTRANS, and merge it into p->filename_trans.
 *
 * On-disk record: name length, name string, then four little-endian
 * words: source type, target type, target class, output type.
 *
 * Rules sharing (ttype, tclass, name) share one hash table entry whose
 * value is a chain of datums, one per output type; each datum records its
 * source types in the stypes bitmap.  A rule whose source type is already
 * present in any datum of the chain is silently dropped.
 *
 * Returns 0 on success (including a dropped rule) or a negative errno.
 */
static int filename_trans_read_helper_compat(struct policydb *p, struct policy_file *fp)
{
        struct filename_trans_key key, *ft = NULL;
        struct filename_trans_datum *last, *datum = NULL;
        char *name = NULL;
        u32 len, stype, otype;
        __le32 buf[4];
        int rc;

        /* length of the path component string */
        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        len = le32_to_cpu(buf[0]);

        /* path component string */
        rc = str_read(&name, GFP_KERNEL, fp, len);
        if (rc)
                return rc;

        rc = next_entry(buf, fp, sizeof(u32) * 4);
        if (rc)
                goto out;

        stype = le32_to_cpu(buf[0]);
        key.ttype = le32_to_cpu(buf[1]);
        key.tclass = le32_to_cpu(buf[2]);
        key.name = name;

        otype = le32_to_cpu(buf[3]);

        /* walk the existing chain for this key, if there is one */
        last = NULL;
        datum = policydb_filenametr_search(p, &key);
        while (datum) {
                if (unlikely(ebitmap_get_bit(&datum->stypes, stype - 1))) {
                        /* conflicting/duplicate rules are ignored */
                        /* datum belongs to the table: keep "out" from freeing it */
                        datum = NULL;
                        rc = 0;
                        goto out;
                }
                if (likely(datum->otype == otype))
                        break;
                last = datum;
                datum = datum->next;
        }
        if (!datum) {
                /* no datum with this output type yet: create one */
                rc = -ENOMEM;
                datum = kmalloc(sizeof(*datum), GFP_KERNEL);
                if (!datum)
                        goto out;

                ebitmap_init(&datum->stypes);
                datum->otype = otype;
                datum->next = NULL;

                if (unlikely(last)) {
                        /* key already in the table: append to its chain */
                        last->next = datum;
                } else {
                        /* first rule for this key: insert a heap copy of it */
                        rc = -ENOMEM;
                        ft = kmemdup(&key, sizeof(key), GFP_KERNEL);
                        if (!ft)
                                goto out;

                        rc = hashtab_insert(&p->filename_trans, ft, datum,
                                            filenametr_key_params);
                        if (rc)
                                goto out;
                        /* the inserted key copy still points at name */
                        name = NULL;

                        rc = ebitmap_set_bit(&p->filename_trans_ttypes,
                                             key.ttype, 1);
                        /*
                         * Return directly: ft and datum are in the table
                         * now, so the frees under "out" must not run.
                         */
                        if (rc)
                                return rc;
                }
        }
        /* NULL if the name was handed over with a newly inserted key */
        kfree(name);
        return ebitmap_set_bit(&datum->stypes, stype - 1, 1);

out:
        kfree(ft);
        kfree(name);
        kfree(datum);
        return rc;
}

/*
 * Read one filename transition entry in the layout that
 * filename_trans_read() selects for policies at or above
 * POLICYDB_VERSION_COMP_FTRANS, and insert it into p->filename_trans.
 *
 * On-disk record: name length, name string, then three little-endian
 * words (target type, target class, number of datums), followed by that
 * many datums, each a source-type ebitmap plus an output type word.
 *
 * The datums are linked into a list in file order and inserted as the
 * value for the (ttype, tclass, name) key.  An entry with zero datums is
 * rejected with -ENOENT; a key that already exists is reported and
 * rejected with the error from hashtab_insert().
 *
 * Returns 0 on success or a negative errno.
 */
static int filename_trans_read_helper(struct policydb *p, struct policy_file *fp)
{
        struct filename_trans_key *ft = NULL;
        struct filename_trans_datum **dst, *datum, *first = NULL;
        char *name = NULL;
        u32 len, ttype, tclass, ndatum, i;
        __le32 buf[3];
        int rc;

        /* length of the path component string */
        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        len = le32_to_cpu(buf[0]);

        /* path component string */
        rc = str_read(&name, GFP_KERNEL, fp, len);
        if (rc)
                return rc;

        rc = next_entry(buf, fp, sizeof(u32) * 3);
        if (rc)
                goto out;

        ttype = le32_to_cpu(buf[0]);
        tclass = le32_to_cpu(buf[1]);

        ndatum = le32_to_cpu(buf[2]);
        if (ndatum == 0) {
                pr_err("SELinux:  Filename transition key with no datum\n");
                rc = -ENOENT;
                goto out;
        }

        /* dst always points at the link that receives the next datum */
        dst = &first;
        for (i = 0; i < ndatum; i++) {
                rc = -ENOMEM;
                datum = kmalloc(sizeof(*datum), GFP_KERNEL);
                if (!datum)
                        goto out;

                /* link it in first so "out" can reach it for cleanup */
                datum->next = NULL;
                *dst = datum;

                /* ebitmap_read() will at least init the bitmap */
                rc = ebitmap_read(&datum->stypes, fp);
                if (rc)
                        goto out;

                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;

                datum->otype = le32_to_cpu(buf[0]);

                dst = &datum->next;
        }

        rc = -ENOMEM;
        ft = kmalloc(sizeof(*ft), GFP_KERNEL);
        if (!ft)
                goto out;

        ft->ttype = ttype;
        ft->tclass = tclass;
        ft->name = name;

        rc = hashtab_insert(&p->filename_trans, ft, first,
                            filenametr_key_params);
        if (rc == -EEXIST)
                pr_err("SELinux:  Duplicate filename transition key\n");
        if (rc)
                goto out;

        /* key, name and datum list are in the table now: do not free them */
        return ebitmap_set_bit(&p->filename_trans_ttypes, ttype, 1);

out:
        kfree(ft);
        kfree(name);
        /* release every datum linked so far, including a partial last one */
        while (first) {
                datum = first;
                first = first->next;

                ebitmap_destroy(&datum->stypes);
                kfree(datum);
        }
        return rc;
}

static int filename_trans_read(struct policydb *p, struct policy_file *fp)
{
        u32 nel, i;
        __le32 buf[1];
        int rc;

        if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS)
                return 0;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        nel = le32_to_cpu(buf[0]);

        if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) {
                p->compat_filename_trans_count = nel;

                rc = hashtab_init(&p->filename_trans, (1 << 11));
                if (rc)
                        return rc;

                for (i = 0; i < nel; i++) {
                        rc = filename_trans_read_helper_compat(p, fp);
                        if (rc)
                                return rc;
                }
        } else {
                rc = hashtab_init(&p->filename_trans, nel);
                if (rc)
                        return rc;

                for (i = 0; i < nel; i++) {
                        rc = filename_trans_read_helper(p, fp);
                        if (rc)
                                return rc;
                }
        }
        hash_eval(&p->filename_trans, "filenametr", NULL);
        return 0;
}

/*
 * Read the genfs contexts from @fp into the p->genfs list.
 *
 * The file holds a count of filesystem types; each one has a name and a
 * count of entries, and each entry has a path name, a security class word
 * and a context.
 *
 * Filesystem types are kept sorted by name (strcmp order) and duplicates
 * are rejected.  Entries within one filesystem type are kept ordered by
 * decreasing name length.  An entry is a duplicate when its name equals an
 * existing one and either class is zero or both classes are equal.
 *
 * Returns 0 on success or a negative errno.  On failure only the objects
 * not yet linked into p->genfs are released here.
 */
static int genfs_read(struct policydb *p, struct policy_file *fp)
{
        int rc;
        u32 i, j, nel, nel2, len, len2;
        __le32 buf[1];
        struct ocontext *l, *c;
        struct ocontext *newc = NULL;
        struct genfs *genfs_p, *genfs;
        struct genfs *newgenfs = NULL;

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                return rc;
        nel = le32_to_cpu(buf[0]);

        for (i = 0; i < nel; i++) {
                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;
                len = le32_to_cpu(buf[0]);

                rc = -ENOMEM;
                newgenfs = kzalloc(sizeof(*newgenfs), GFP_KERNEL);
                if (!newgenfs)
                        goto out;

                rc = str_read(&newgenfs->fstype, GFP_KERNEL, fp, len);
                if (rc)
                        goto out;

                /* find the sorted insertion point, rejecting duplicates */
                for (genfs_p = NULL, genfs = p->genfs; genfs;
                     genfs_p = genfs, genfs = genfs->next) {
                        /* compare once, use the result for both tests */
                        int cmp = strcmp(newgenfs->fstype, genfs->fstype);

                        rc = -EINVAL;
                        if (cmp == 0) {
                                pr_err("SELinux:  dup genfs fstype %s\n",
                                       newgenfs->fstype);
                                goto out;
                        }
                        if (cmp < 0)
                                break;
                }
                newgenfs->next = genfs;
                if (genfs_p)
                        genfs_p->next = newgenfs;
                else
                        p->genfs = newgenfs;
                /* linked into p->genfs: no longer freed under "out" */
                genfs = newgenfs;
                newgenfs = NULL;

                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;

                nel2 = le32_to_cpu(buf[0]);
                for (j = 0; j < nel2; j++) {
                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto out;
                        len = le32_to_cpu(buf[0]);

                        rc = -ENOMEM;
                        newc = kzalloc(sizeof(*newc), GFP_KERNEL);
                        if (!newc)
                                goto out;

                        rc = str_read(&newc->u.name, GFP_KERNEL, fp, len);
                        if (rc)
                                goto out;

                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto out;

                        newc->v.sclass = le32_to_cpu(buf[0]);
                        rc = context_read_and_validate(&newc->context[0], p,
                                                       fp);
                        if (rc)
                                goto out;

                        /* the new name does not change during the scan */
                        len = strlen(newc->u.name);
                        for (l = NULL, c = genfs->head; c; l = c, c = c->next) {
                                rc = -EINVAL;
                                if (!strcmp(newc->u.name, c->u.name) &&
                                    (!c->v.sclass || !newc->v.sclass ||
                                     newc->v.sclass == c->v.sclass)) {
                                        pr_err("SELinux:  dup genfs entry (%s,%s)\n",
                                               genfs->fstype, c->u.name);
                                        goto out;
                                }
                                len2 = strlen(c->u.name);
                                if (len > len2)
                                        break;
                        }

                        newc->next = c;
                        if (l)
                                l->next = newc;
                        else
                                genfs->head = newc;
                        /* linked into the list: no longer freed under "out" */
                        newc = NULL;
                }
        }
        rc = 0;
out:
        if (newgenfs) {
                kfree(newgenfs->fstype);
                kfree(newgenfs);
        }
        ocontext_destroy(newc, OCON_FSUSE);

        return rc;
}

/*
 * Read the object context tables from @fp into p->ocontexts[].
 *
 * info->ocon_num tables are read in index order.  Each table starts with
 * an element count, and each element is appended to p->ocontexts[i] in
 * file order.  The per-element layout depends on the table index and is
 * handled by the switch below.
 *
 * Returns 0 on success or a negative errno.  Nothing is freed here on
 * failure: a new element is linked into its list before it is filled in,
 * so partially read elements remain reachable from p->ocontexts[].
 * NOTE(review): presumably the caller tears the whole policydb down on
 * error - confirm.
 */
static int ocontext_read(struct policydb *p,
                         const struct policydb_compat_info *info, struct policy_file *fp)
{
        int rc;
        unsigned int i;
        u32 j, nel, len;
        __be64 prefixbuf[1];
        __le32 buf[3];
        struct ocontext *l, *c;
        u32 nodebuf[8];

        for (i = 0; i < info->ocon_num; i++) {
                rc = next_entry(buf, fp, sizeof(u32));
                if (rc)
                        goto out;
                nel = le32_to_cpu(buf[0]);

                /* l tracks the tail of the list for table i */
                l = NULL;
                for (j = 0; j < nel; j++) {
                        rc = -ENOMEM;
                        c = kzalloc(sizeof(*c), GFP_KERNEL);
                        if (!c)
                                goto out;
                        /* link first, fill in afterwards */
                        if (l)
                                l->next = c;
                        else
                                p->ocontexts[i] = c;
                        l = c;

                        switch (i) {
                        case OCON_ISID:
                                /* one word (SID) and one context */
                                rc = next_entry(buf, fp, sizeof(u32));
                                if (rc)
                                        goto out;

                                c->sid[0] = le32_to_cpu(buf[0]);
                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_FS:
                        case OCON_NETIF:
                                /* length-prefixed name and two contexts */
                                rc = next_entry(buf, fp, sizeof(u32));
                                if (rc)
                                        goto out;
                                len = le32_to_cpu(buf[0]);

                                rc = str_read(&c->u.name, GFP_KERNEL, fp, len);
                                if (rc)
                                        goto out;

                                if (i == OCON_FS)
                                        pr_warn("SELinux:  void and deprecated fs ocon %s\n",
                                                c->u.name);

                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                rc = context_read_and_validate(&c->context[1],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_PORT:
                                /* protocol, low port, high port, one context */
                                rc = next_entry(buf, fp, sizeof(u32) * 3);
                                if (rc)
                                        goto out;
                                c->u.port.protocol = le32_to_cpu(buf[0]);
                                c->u.port.low_port = le32_to_cpu(buf[1]);
                                c->u.port.high_port = le32_to_cpu(buf[2]);
                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_NODE:
                                /* address and mask are stored without byte swapping */
                                rc = next_entry(nodebuf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;
                                c->u.node.addr = nodebuf[0]; /* network order */
                                c->u.node.mask = nodebuf[1]; /* network order */
                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_FSUSE:
                                /* behavior word, name length, name, one context */
                                rc = next_entry(buf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;

                                rc = -EINVAL;
                                c->v.behavior = le32_to_cpu(buf[0]);
                                /* Determined at runtime, not in policy DB. */
                                if (c->v.behavior == SECURITY_FS_USE_MNTPOINT)
                                        goto out;
                                if (c->v.behavior > SECURITY_FS_USE_MAX)
                                        goto out;

                                len = le32_to_cpu(buf[1]);
                                rc = str_read(&c->u.name, GFP_KERNEL, fp, len);
                                if (rc)
                                        goto out;

                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        case OCON_NODE6: {
                                int k;

                                /* four address words then four mask words, copied as-is */
                                rc = next_entry(nodebuf, fp, sizeof(u32) * 8);
                                if (rc)
                                        goto out;
                                for (k = 0; k < 4; k++)
                                        c->u.node6.addr[k] = nodebuf[k];
                                for (k = 0; k < 4; k++)
                                        c->u.node6.mask[k] = nodebuf[k + 4];
                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        }
                        case OCON_IBPKEY: {
                                u32 pkey_lo, pkey_hi;

                                /* 64-bit big-endian subnet prefix comes first */
                                rc = next_entry(prefixbuf, fp, sizeof(u64));
                                if (rc)
                                        goto out;

                                /* we need to have subnet_prefix in CPU order */
                                c->u.ibpkey.subnet_prefix =
                                        be64_to_cpu(prefixbuf[0]);

                                rc = next_entry(buf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;

                                pkey_lo = le32_to_cpu(buf[0]);
                                pkey_hi = le32_to_cpu(buf[1]);

                                /* stored as 32-bit words, but must fit in 16 bits */
                                if (pkey_lo > U16_MAX || pkey_hi > U16_MAX) {
                                        rc = -EINVAL;
                                        goto out;
                                }

                                c->u.ibpkey.low_pkey = pkey_lo;
                                c->u.ibpkey.high_pkey = pkey_hi;

                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        }
                        case OCON_IBENDPORT: {
                                u32 port;

                                /* name length and port word, then the device name */
                                rc = next_entry(buf, fp, sizeof(u32) * 2);
                                if (rc)
                                        goto out;
                                len = le32_to_cpu(buf[0]);

                                rc = str_read(&c->u.ibendport.dev_name,
                                              GFP_KERNEL, fp, len);
                                if (rc)
                                        goto out;

                                /* port must be in the range 1..U8_MAX */
                                port = le32_to_cpu(buf[1]);
                                if (port > U8_MAX || port == 0) {
                                        rc = -EINVAL;
                                        goto out;
                                }

                                c->u.ibendport.port = port;

                                rc = context_read_and_validate(&c->context[0],
                                                               p, fp);
                                if (rc)
                                        goto out;
                                break;
                        } /* end case */
                        } /* end switch */
                }
        }
        rc = 0;
out:
        return rc;
}

/*
 * Read the configuration data from a policy database binary
 * representation file into a policy database structure.
 */
int policydb_read(struct policydb *p, struct policy_file *fp)
{
        struct role_allow *ra, *lra;
        struct role_trans_key *rtk = NULL;
        struct role_trans_datum *rtd = NULL;
        int rc;
        __le32 buf[4];
        u32 i, j, len, nprim, nel, perm;

        char *policydb_str;
        const struct policydb_compat_info *info;

        policydb_init(p);

        /* Read the magic number and string length. */
        rc = next_entry(buf, fp, sizeof(u32) * 2);
        if (rc)
                goto bad;

        rc = -EINVAL;
        if (le32_to_cpu(buf[0]) != POLICYDB_MAGIC) {
                pr_err("SELinux:  policydb magic number 0x%x does "
                       "not match expected magic number 0x%x\n",
                       le32_to_cpu(buf[0]), POLICYDB_MAGIC);
                goto bad;
        }

        rc = -EINVAL;
        len = le32_to_cpu(buf[1]);
        if (len != strlen(POLICYDB_STRING)) {
                pr_err("SELinux:  policydb string length %d does not "
                       "match expected length %zu\n",
                       len, strlen(POLICYDB_STRING));
                goto bad;
        }

        rc = str_read(&policydb_str, GFP_KERNEL, fp, len);
        if (rc) {
                if (rc == -ENOMEM) {
                        pr_err("SELinux:  unable to allocate memory for policydb string of length %d\n",
                               len);
                } else {
                        pr_err("SELinux:  truncated policydb string identifier\n");
                }
                goto bad;
        }

        rc = -EINVAL;
        if (strcmp(policydb_str, POLICYDB_STRING)) {
                pr_err("SELinux:  policydb string %s does not match "
                       "my string %s\n",
                       policydb_str, POLICYDB_STRING);
                kfree(policydb_str);
                goto bad;
        }
        /* Done with policydb_str. */
        kfree(policydb_str);
        policydb_str = NULL;

        /* Read the version and table sizes. */
        rc = next_entry(buf, fp, sizeof(u32) * 4);
        if (rc)
                goto bad;

        rc = -EINVAL;
        p->policyvers = le32_to_cpu(buf[0]);
        if (p->policyvers < POLICYDB_VERSION_MIN ||
            p->policyvers > POLICYDB_VERSION_MAX) {
                pr_err("SELinux:  policydb version %d does not match "
                       "my version range %d-%d\n",
                       le32_to_cpu(buf[0]), POLICYDB_VERSION_MIN,
                       POLICYDB_VERSION_MAX);
                goto bad;
        }

        if ((le32_to_cpu(buf[1]) & POLICYDB_CONFIG_MLS)) {
                p->mls_enabled = 1;

                rc = -EINVAL;
                if (p->policyvers < POLICYDB_VERSION_MLS) {
                        pr_err("SELinux: security policydb version %d "
                               "(MLS) not backwards compatible\n",
                               p->policyvers);
                        goto bad;
                }
        }
        p->reject_unknown = !!(le32_to_cpu(buf[1]) & REJECT_UNKNOWN);
        p->allow_unknown = !!(le32_to_cpu(buf[1]) & ALLOW_UNKNOWN);

        if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
                rc = ebitmap_read(&p->policycaps, fp);
                if (rc)
                        goto bad;
        }

        if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
                rc = ebitmap_read(&p->permissive_map, fp);
                if (rc)
                        goto bad;
        }

        rc = -EINVAL;
        info = policydb_lookup_compat(p->policyvers);
        if (!info) {
                pr_err("SELinux:  unable to find policy compat info "
                       "for version %d\n",
                       p->policyvers);
                goto bad;
        }

        rc = -EINVAL;
        if (le32_to_cpu(buf[2]) != info->sym_num ||
            le32_to_cpu(buf[3]) != info->ocon_num) {
                pr_err("SELinux:  policydb table sizes (%d,%d) do "
                       "not match mine (%d,%d)\n",
                       le32_to_cpu(buf[2]), le32_to_cpu(buf[3]), info->sym_num,
                       info->ocon_num);
                goto bad;
        }

        for (i = 0; i < info->sym_num; i++) {
                rc = next_entry(buf, fp, sizeof(u32) * 2);
                if (rc)
                        goto bad;
                nprim = le32_to_cpu(buf[0]);
                nel = le32_to_cpu(buf[1]);

                rc = symtab_init(&p->symtab[i], nel);
                if (rc)
                        goto out;

                if (i == SYM_ROLES) {
                        rc = roles_init(p);
                        if (rc)
                                goto out;
                }

                for (j = 0; j < nel; j++) {
                        rc = read_f[i](p, &p->symtab[i], fp);
                        if (rc)
                                goto bad;
                }

                p->symtab[i].nprim = nprim;
        }

        rc = -EINVAL;
        p->process_class = string_to_security_class(p, "process");
        if (!p->process_class) {
                pr_err("SELinux: process class is required, not defined in policy\n");
                goto bad;
        }

        rc = avtab_read(&p->te_avtab, fp, p);
        if (rc)
                goto bad;

        if (p->policyvers >= POLICYDB_VERSION_BOOL) {
                rc = cond_read_list(p, fp);
                if (rc)
                        goto bad;
        }

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                goto bad;
        nel = le32_to_cpu(buf[0]);

        rc = hashtab_init(&p->role_tr, nel);
        if (rc)
                goto bad;
        for (i = 0; i < nel; i++) {
                rc = -ENOMEM;
                rtk = kmalloc(sizeof(*rtk), GFP_KERNEL);
                if (!rtk)
                        goto bad;

                rc = -ENOMEM;
                rtd = kmalloc(sizeof(*rtd), GFP_KERNEL);
                if (!rtd)
                        goto bad;

                rc = next_entry(buf, fp, sizeof(u32) * 3);
                if (rc)
                        goto bad;

                rtk->role = le32_to_cpu(buf[0]);
                rtk->type = le32_to_cpu(buf[1]);
                rtd->new_role = le32_to_cpu(buf[2]);
                if (p->policyvers >= POLICYDB_VERSION_ROLETRANS) {
                        rc = next_entry(buf, fp, sizeof(u32));
                        if (rc)
                                goto bad;
                        rtk->tclass = le32_to_cpu(buf[0]);
                } else
                        rtk->tclass = p->process_class;

                rc = -EINVAL;
                if (!policydb_role_isvalid(p, rtk->role) ||
                    !policydb_type_isvalid(p, rtk->type) ||
                    !policydb_class_isvalid(p, rtk->tclass) ||
                    !policydb_role_isvalid(p, rtd->new_role))
                        goto bad;

                rc = hashtab_insert(&p->role_tr, rtk, rtd, roletr_key_params);
                if (rc)
                        goto bad;

                rtk = NULL;
                rtd = NULL;
        }

        hash_eval(&p->role_tr, "roletr", NULL);

        rc = next_entry(buf, fp, sizeof(u32));
        if (rc)
                goto bad;
        nel = le32_to_cpu(buf[0]);
        lra = NULL;
        for (i = 0; i < nel; i++) {
                rc = -ENOMEM;
                ra = kzalloc(sizeof(*ra), GFP_KERNEL);
                if (!ra)
                        goto bad;
                if (lra)
                        lra->next = ra;
                else
                        p->role_allow = ra;
                rc = next_entry(buf, fp, sizeof(u32) * 2);
                if (rc)
                        goto bad;

                rc = -EINVAL;
                ra->role = le32_to_cpu(buf[0]);
                ra->new_role = le32_to_cpu(buf[1]);
                if (!policydb_role_isvalid(p, ra->role) ||
                    !policydb_role_isvalid(p, ra->new_role))
                        goto bad;
                lra = ra;
        }

        rc = filename_trans_read(p, fp);
        if (rc)
                goto bad;

        rc = policydb_index(p);
        if (rc)
                goto bad;

        rc = -EINVAL;
        perm = string_to_av_perm(p, p->process_class, "transition");
        if (!perm) {
                pr_err("SELinux: process transition permission is required, not defined in policy\n");
                goto bad;
        }
        p->process_trans_perms = perm;
        perm = string_to_av_perm(p, p->process_class, "dyntransition");
        if (!perm) {
                pr_err("SELinux: process dyntransition permission is required, not defined in policy\n");
                goto bad;
        }
        p->process_trans_perms |= perm;

        rc = ocontext_read(p, info, fp);
        if (rc)
                goto bad;

        rc = genfs_read(p, fp);
        if (rc)
                goto bad;

        rc = range_read(p, fp);
        if (rc)
                goto bad;

        rc = -ENOMEM;
        p->type_attr_map_array = kvcalloc(
                p->p_types.nprim, sizeof(*p->type_attr_map_array), GFP_KERNEL);
        if (!p->type_attr_map_array)
                goto bad;

        /* just in case ebitmap_init() becomes more than just a memset(0): */
        for (i = 0; i < p->p_types.nprim; i++)
                ebitmap_init(&p->type_attr_map_array[i]);

        for (i = 0; i < p->p_types.nprim; i++) {
                struct ebitmap *e = &p->type_attr_map_array[i];

                if (p->policyvers >= POLICYDB_VERSION_AVTAB) {
                        rc = ebitmap_read(e, fp);
                        if (rc)
                                goto bad;
                }
                /* add the type itself as the degenerate case */
                rc = ebitmap_set_bit(e, i, 1);
                if (rc)
                        goto bad;
        }

        rc = policydb_bounds_sanity_check(p);
        if (rc)
                goto bad;

        rc = 0;
out:
        return rc;
bad:
        kfree(rtk);
        kfree(rtd);
        policydb_destroy(p);
        goto out;
}

/*
 * Emit one MLS level to a policydb binary representation file:
 * the sensitivity as a single little-endian u32, then the
 * category ebitmap.  Returns 0 or the first write error.
 */
static int mls_write_level(struct mls_level *l, struct policy_file *fp)
{
        __le32 sens[1];
        int err;

        sens[0] = cpu_to_le32(l->sens);
        err = put_entry(sens, sizeof(u32), 1, fp);
        if (err)
                return err;

        err = ebitmap_write(&l->cat, fp);
        return err ? err : 0;
}

/*
 * Write a MLS range structure to a policydb binary
 * representation file.
 */
static int mls_write_range_helper(struct mls_range *r, struct policy_file *fp)
{
        __le32 buf[3];
        size_t items;
        int rc, eq;

        eq = mls_level_eq(&r->level[1], &r->level[0]);

        if (eq)
                items = 2;
        else
                items = 3;
        buf[0] = cpu_to_le32(items - 1);
        buf[1] = cpu_to_le32(r->level[0].sens);
        if (!eq)
                buf[2] = cpu_to_le32(r->level[1].sens);

        BUG_ON(items > ARRAY_SIZE(buf));

        rc = put_entry(buf, sizeof(u32), items, fp);
        if (rc)
                return rc;

        rc = ebitmap_write(&r->level[0].cat, fp);
        if (rc)
                return rc;
        if (!eq) {
                rc = ebitmap_write(&r->level[1].cat, fp);
                if (rc)
                        return rc;
        }

        return 0;
}

/*
 * Write one sensitivity symbol: a header of (name length, isalias),
 * the name bytes without a terminator, then the associated MLS level.
 * @ptr is a struct policy_data supplying the output file.
 */
static int sens_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct level_datum *lev = datum;
        struct policy_file *out = pd->fp;
        char *name = vkey;
        size_t name_len = strlen(name);
        __le32 hdr[2];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(lev->isalias);

        err = put_entry(hdr, sizeof(u32), 2, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        return mls_write_level(&lev->level, out);
}

/*
 * Write one category symbol: a header of (name length, value, isalias)
 * followed by the name bytes without a terminator.
 * @ptr is a struct policy_data supplying the output file.
 */
static int cat_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct cat_datum *cat = datum;
        struct policy_file *out = pd->fp;
        char *name = vkey;
        size_t name_len = strlen(name);
        __le32 hdr[3];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(cat->value);
        hdr[2] = cpu_to_le32(cat->isalias);

        err = put_entry(hdr, sizeof(u32), 3, out);
        if (err)
                return err;

        return put_entry(name, 1, name_len, out);
}

/*
 * Write a single role transition: (role, type, new_role), followed by
 * the target class when the policy version is at least
 * POLICYDB_VERSION_ROLETRANS.  @ptr is a struct policy_data.
 */
static int role_trans_write_one(void *key, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct role_trans_key *tkey = key;
        struct role_trans_datum *tdatum = datum;
        __le32 words[3];
        int err;

        words[0] = cpu_to_le32(tkey->role);
        words[1] = cpu_to_le32(tkey->type);
        words[2] = cpu_to_le32(tdatum->new_role);
        err = put_entry(words, sizeof(u32), 3, pd->fp);
        if (err)
                return err;

        /* Older formats have no class field; nothing more to emit. */
        if (pd->p->policyvers < POLICYDB_VERSION_ROLETRANS)
                return 0;

        words[0] = cpu_to_le32(tkey->tclass);
        return put_entry(words, sizeof(u32), 1, pd->fp);
}

static int role_trans_write(struct policydb *p, struct policy_file *fp)
{
        struct policy_data pd = { .p = p, .fp = fp };
        __le32 buf[1];
        int rc;

        buf[0] = cpu_to_le32(p->role_tr.nel);
        rc = put_entry(buf, sizeof(u32), 1, fp);
        if (rc)
                return rc;

        return hashtab_map(&p->role_tr, role_trans_write_one, &pd);
}

/*
 * Write the role allow list headed by @r: the number of nodes,
 * then one (role, new_role) pair per node in list order.
 */
static int role_allow_write(struct role_allow *r, struct policy_file *fp)
{
        struct role_allow *node;
        __le32 words[2];
        size_t count = 0;
        int err;

        /* First pass: the count is written ahead of the entries. */
        node = r;
        while (node) {
                count++;
                node = node->next;
        }

        words[0] = cpu_to_le32(count);
        err = put_entry(words, sizeof(u32), 1, fp);
        if (err)
                return err;

        /* Second pass: the entries themselves. */
        node = r;
        while (node) {
                words[0] = cpu_to_le32(node->role);
                words[1] = cpu_to_le32(node->new_role);
                err = put_entry(words, sizeof(u32), 2, fp);
                if (err)
                        return err;
                node = node->next;
        }

        return 0;
}

/*
 * Emit a security context to a policydb binary representation
 * file: (user, role, type) as three little-endian u32 values,
 * followed by the MLS range.  @p is not consulted here.
 */
static int context_write(struct policydb *p, struct context *c, struct policy_file *fp)
{
        __le32 ids[3];
        int err;

        ids[0] = cpu_to_le32(c->user);
        ids[1] = cpu_to_le32(c->role);
        ids[2] = cpu_to_le32(c->type);

        err = put_entry(ids, sizeof(u32), 3, fp);
        if (err)
                return err;

        return mls_write_range_helper(&c->range, fp);
}

/*
 * The following *_write functions are used to
 * write the symbol data to a policy database
 * binary representation file.
 */

/*
 * Write one permission symbol: (name length, value) followed by the
 * name bytes without a terminator.  Unlike the other symbol writers,
 * @fp here is the output file itself rather than a struct policy_data.
 */
static int perm_write(void *vkey, void *datum, void *fp)
{
        struct perm_datum *perm = datum;
        char *name = vkey;
        size_t name_len = strlen(name);
        __le32 hdr[2];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(perm->value);

        err = put_entry(hdr, sizeof(u32), 2, fp);
        if (err)
                return err;

        return put_entry(name, 1, name_len, fp);
}

/*
 * Write one common symbol: a header of (name length, value,
 * permissions.nprim, permissions element count), the name bytes,
 * then every permission in the common's table via perm_write().
 * @ptr is a struct policy_data supplying the output file.
 */
static int common_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct common_datum *common = datum;
        struct policy_file *out = pd->fp;
        char *name = vkey;
        size_t name_len = strlen(name);
        __le32 hdr[4];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(common->value);
        hdr[2] = cpu_to_le32(common->permissions.nprim);
        hdr[3] = cpu_to_le32(common->permissions.table.nel);

        err = put_entry(hdr, sizeof(u32), 4, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        /* perm_write() takes the file directly as its data pointer. */
        return hashtab_map(&common->permissions.table, perm_write, out);
}

/*
 * Write a type set: the types bitmap, the negset bitmap, then the
 * flags word.  Any failure is reported as -EINVAL regardless of the
 * underlying error code.
 */
static int type_set_write(struct type_set *t, struct policy_file *fp)
{
        __le32 flags[1];

        /* Short-circuit keeps the write order: types first, then negset. */
        if (ebitmap_write(&t->types, fp) || ebitmap_write(&t->negset, fp))
                return -EINVAL;

        flags[0] = cpu_to_le32(t->flags);
        if (put_entry(flags, sizeof(u32), 1, fp))
                return -EINVAL;

        return 0;
}

/*
 * Write every constraint in the list headed by @node.
 *
 * Each constraint is emitted as (permissions, expression count) followed
 * by its expressions.  Each expression is (expr_type, attr, op); a
 * CEXPR_NAMES expression is additionally followed by its names bitmap
 * and, from POLICYDB_VERSION_CONSTRAINT_NAMES on, its type set.
 */
static int write_cons_helper(struct policydb *p, struct constraint_node *node,
                             struct policy_file *fp)
{
        struct constraint_node *cn;
        struct constraint_expr *ex;
        __le32 words[3];
        u32 nexpr;
        int err;

        for (cn = node; cn; cn = cn->next) {
                /* The expression count precedes the expressions. */
                nexpr = 0;
                ex = cn->expr;
                while (ex) {
                        nexpr++;
                        ex = ex->next;
                }

                words[0] = cpu_to_le32(cn->permissions);
                words[1] = cpu_to_le32(nexpr);
                err = put_entry(words, sizeof(u32), 2, fp);
                if (err)
                        return err;

                for (ex = cn->expr; ex; ex = ex->next) {
                        words[0] = cpu_to_le32(ex->expr_type);
                        words[1] = cpu_to_le32(ex->attr);
                        words[2] = cpu_to_le32(ex->op);
                        err = put_entry(words, sizeof(u32), 3, fp);
                        if (err)
                                return err;

                        /* Only name expressions carry extra payload. */
                        if (ex->expr_type != CEXPR_NAMES)
                                continue;

                        err = ebitmap_write(&ex->names, fp);
                        if (err)
                                return err;

                        if (p->policyvers < POLICYDB_VERSION_CONSTRAINT_NAMES)
                                continue;

                        err = type_set_write(ex->type_names, fp);
                        if (err)
                                return err;
                }
        }

        return 0;
}

/*
 * Write one class symbol.
 *
 * Layout: a six-word header (name length, common name length, value,
 * permissions.nprim, permission element count, constraint count), the
 * class name, the common name if any, the permissions, the constraints,
 * the validatetrans count and rules, and then the version-dependent
 * default_user/default_role/default_range and default_type fields.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int class_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct policydb *p = pd->p;
        struct policy_file *out = pd->fp;
        struct class_datum *cls = datum;
        struct constraint_node *cn;
        char *name = vkey;
        size_t name_len, common_len;
        __le32 words[6];
        u32 count;
        int err;

        name_len = strlen(name);
        common_len = cls->comkey ? strlen(cls->comkey) : 0;

        count = 0;
        cn = cls->constraints;
        while (cn) {
                count++;
                cn = cn->next;
        }

        words[0] = cpu_to_le32(name_len);
        words[1] = cpu_to_le32(common_len);
        words[2] = cpu_to_le32(cls->value);
        words[3] = cpu_to_le32(cls->permissions.nprim);
        words[4] = cpu_to_le32(cls->permissions.table.nel);
        words[5] = cpu_to_le32(count);
        err = put_entry(words, sizeof(u32), 6, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        if (cls->comkey) {
                err = put_entry(cls->comkey, 1, common_len, out);
                if (err)
                        return err;
        }

        err = hashtab_map(&cls->permissions.table, perm_write, out);
        if (err)
                return err;

        err = write_cons_helper(p, cls->constraints, out);
        if (err)
                return err;

        /* validatetrans rules: their own count, then the rules */
        count = 0;
        cn = cls->validatetrans;
        while (cn) {
                count++;
                cn = cn->next;
        }

        words[0] = cpu_to_le32(count);
        err = put_entry(words, sizeof(u32), 1, out);
        if (err)
                return err;

        err = write_cons_helper(p, cls->validatetrans, out);
        if (err)
                return err;

        if (p->policyvers >= POLICYDB_VERSION_NEW_OBJECT_DEFAULTS) {
                words[0] = cpu_to_le32(cls->default_user);
                words[1] = cpu_to_le32(cls->default_role);
                words[2] = cpu_to_le32(cls->default_range);

                err = put_entry(words, sizeof(uint32_t), 3, out);
                if (err)
                        return err;
        }

        if (p->policyvers >= POLICYDB_VERSION_DEFAULT_TYPE) {
                words[0] = cpu_to_le32(cls->default_type);
                err = put_entry(words, sizeof(uint32_t), 1, out);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Write one role symbol: a header of (name length, value) plus the
 * bounds word from POLICYDB_VERSION_BOUNDARY on, the name bytes, then
 * the dominates and types bitmaps.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int role_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct policy_file *out = pd->fp;
        struct role_datum *rd = datum;
        char *name = vkey;
        size_t name_len = strlen(name);
        size_t nwords = 2;
        __le32 hdr[3];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(rd->value);
        if (pd->p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
                hdr[2] = cpu_to_le32(rd->bounds);
                nwords = 3;
        }

        BUG_ON(nwords > ARRAY_SIZE(hdr));

        err = put_entry(hdr, sizeof(u32), nwords, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        err = ebitmap_write(&rd->dominates, out);
        if (err)
                return err;

        return ebitmap_write(&rd->types, out);
}

/*
 * Write one type symbol.
 *
 * From POLICYDB_VERSION_BOUNDARY on the header is (name length, value,
 * properties, bounds), where properties packs the primary and attribute
 * flags; older versions write (name length, value, primary).  The name
 * bytes follow the header.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int type_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct policy_file *out = pd->fp;
        struct type_datum *td = datum;
        char *name = vkey;
        size_t name_len = strlen(name);
        size_t nwords;
        __le32 hdr[4];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(td->value);

        if (pd->p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
                u32 props = 0;

                if (td->primary)
                        props |= TYPEDATUM_PROPERTY_PRIMARY;
                if (td->attribute)
                        props |= TYPEDATUM_PROPERTY_ATTRIBUTE;

                hdr[2] = cpu_to_le32(props);
                hdr[3] = cpu_to_le32(td->bounds);
                nwords = 4;
        } else {
                hdr[2] = cpu_to_le32(td->primary);
                nwords = 3;
        }

        BUG_ON(nwords > ARRAY_SIZE(hdr));

        err = put_entry(hdr, sizeof(u32), nwords, out);
        if (err)
                return err;

        return put_entry(name, 1, name_len, out);
}

/*
 * Write one user symbol: a header of (name length, value) plus the
 * bounds word from POLICYDB_VERSION_BOUNDARY on, the name bytes, the
 * roles bitmap, the MLS range and the default MLS level.
 * @ptr is a struct policy_data supplying the policydb and output file.
 */
static int user_write(void *vkey, void *datum, void *ptr)
{
        struct policy_data *pd = ptr;
        struct policy_file *out = pd->fp;
        struct user_datum *ud = datum;
        char *name = vkey;
        size_t name_len = strlen(name);
        size_t nwords = 2;
        __le32 hdr[3];
        int err;

        hdr[0] = cpu_to_le32(name_len);
        hdr[1] = cpu_to_le32(ud->value);
        if (pd->p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
                hdr[2] = cpu_to_le32(ud->bounds);
                nwords = 3;
        }

        BUG_ON(nwords > ARRAY_SIZE(hdr));

        err = put_entry(hdr, sizeof(u32), nwords, out);
        if (err)
                return err;

        err = put_entry(name, 1, name_len, out);
        if (err)
                return err;

        err = ebitmap_write(&ud->roles, out);
        if (err)
                return err;

        err = mls_write_range_helper(&ud->range, out);
        if (err)
                return err;

        return mls_write_level(&ud->dfltlevel, out);
}

/* clang-format off */
/*
 * Per-symbol-table write callbacks, one for each of the SYM_NUM tables.
 * The entries are, in order: commons, classes, roles, types, users,
 * booleans, sensitivities and categories.
 * NOTE(review): presumably indexed by the same symbol table index that
 * policydb_read() uses for read_f[] - confirm against the caller.
 */
static int (*const write_f[SYM_NUM])(void *key, void *datum, void *datap) = {
        common_write,
        class_write,
        role_write,
        type_write,
        user_write,
        cond_write_bool,
        sens_write,
        cat_write,
};
/* clang-format on */

/*
 * Write the object context lists.
 *
 * For each of the info->ocon_num lists the number of entries is written
 * first, then every entry in list order.  The per-entry layout depends
 * on the list index (OCON_*); each entry ends with one security context,
 * except OCON_FS/OCON_NETIF entries which carry two.
 */
static int ocontext_write(struct policydb *p,
                          const struct policydb_compat_info *info,
                          struct policy_file *fp)
{
        struct ocontext *oc;
        unsigned int kind, w;
        size_t count, slen;
        __be64 prefix[1];
        __le32 words[3];
        u32 raw[8];
        int err;

        for (kind = 0; kind < info->ocon_num; kind++) {
                count = 0;
                oc = p->ocontexts[kind];
                while (oc) {
                        count++;
                        oc = oc->next;
                }

                words[0] = cpu_to_le32(count);
                err = put_entry(words, sizeof(u32), 1, fp);
                if (err)
                        return err;

                for (oc = p->ocontexts[kind]; oc; oc = oc->next) {
                        switch (kind) {
                        case OCON_ISID:
                                words[0] = cpu_to_le32(oc->sid[0]);
                                err = put_entry(words, sizeof(u32), 1, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_FS:
                        case OCON_NETIF:
                                slen = strlen(oc->u.name);
                                words[0] = cpu_to_le32(slen);
                                err = put_entry(words, sizeof(u32), 1, fp);
                                if (err)
                                        return err;
                                err = put_entry(oc->u.name, 1, slen, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[1], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_PORT:
                                words[0] = cpu_to_le32(oc->u.port.protocol);
                                words[1] = cpu_to_le32(oc->u.port.low_port);
                                words[2] = cpu_to_le32(oc->u.port.high_port);
                                err = put_entry(words, sizeof(u32), 3, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_NODE:
                                /* stored in network order; copied verbatim */
                                raw[0] = oc->u.node.addr;
                                raw[1] = oc->u.node.mask;
                                err = put_entry(raw, sizeof(u32), 2, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_FSUSE:
                                slen = strlen(oc->u.name);
                                words[0] = cpu_to_le32(oc->v.behavior);
                                words[1] = cpu_to_le32(slen);
                                err = put_entry(words, sizeof(u32), 2, fp);
                                if (err)
                                        return err;
                                err = put_entry(oc->u.name, 1, slen, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_NODE6:
                                /*
                                 * Four address words then four mask words,
                                 * both stored in network order and copied
                                 * verbatim.
                                 */
                                for (w = 0; w < 4; w++) {
                                        raw[w] = oc->u.node6.addr[w];
                                        raw[w + 4] = oc->u.node6.mask[w];
                                }
                                err = put_entry(raw, sizeof(u32), 8, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_IBPKEY:
                                /* subnet_prefix is in CPU order */
                                prefix[0] =
                                        cpu_to_be64(oc->u.ibpkey.subnet_prefix);
                                err = put_entry(prefix, sizeof(u64), 1, fp);
                                if (err)
                                        return err;

                                words[0] = cpu_to_le32(oc->u.ibpkey.low_pkey);
                                words[1] = cpu_to_le32(oc->u.ibpkey.high_pkey);
                                err = put_entry(words, sizeof(u32), 2, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        case OCON_IBENDPORT:
                                slen = strlen(oc->u.ibendport.dev_name);
                                words[0] = cpu_to_le32(slen);
                                words[1] = cpu_to_le32(oc->u.ibendport.port);
                                err = put_entry(words, sizeof(u32), 2, fp);
                                if (err)
                                        return err;
                                err = put_entry(oc->u.ibendport.dev_name, 1,
                                                slen, fp);
                                if (err)
                                        return err;
                                err = context_write(p, &oc->context[0], fp);
                                if (err)
                                        return err;
                                break;

                        default:
                                /* other list indices write no entry data */
                                break;
                        }
                }
        }

        return 0;
}

/*
 * Serialize the genfs list of a policy database.
 *
 * Emitted layout: a little-endian 32-bit count of genfs entries, then for
 * every entry its fstype (length followed by the bytes, without the
 * terminating NUL), the number of ocontexts attached to it and, for each
 * ocontext, its name (length + bytes), its security class and its context.
 *
 * Returns 0 on success or the first non-zero value returned by
 * put_entry() or context_write().
 */
static int genfs_write(struct policydb *p, struct policy_file *fp)
{
        struct genfs *fs;
        struct ocontext *oc;
        size_t count;
        __le32 word[1];
        int err;

        /* Header: how many filesystem types follow. */
        count = 0;
        fs = p->genfs;
        while (fs) {
                count++;
                fs = fs->next;
        }
        word[0] = cpu_to_le32(count);
        err = put_entry(word, sizeof(u32), 1, fp);
        if (err)
                return err;

        fs = p->genfs;
        while (fs) {
                /* Length-prefixed filesystem type name. */
                count = strlen(fs->fstype);
                word[0] = cpu_to_le32(count);
                err = put_entry(word, sizeof(u32), 1, fp);
                if (err)
                        return err;
                err = put_entry(fs->fstype, 1, count, fp);
                if (err)
                        return err;

                /* Number of contexts hanging off this filesystem type. */
                count = 0;
                oc = fs->head;
                while (oc) {
                        count++;
                        oc = oc->next;
                }
                word[0] = cpu_to_le32(count);
                err = put_entry(word, sizeof(u32), 1, fp);
                if (err)
                        return err;

                oc = fs->head;
                while (oc) {
                        /* Length-prefixed name. */
                        count = strlen(oc->u.name);
                        word[0] = cpu_to_le32(count);
                        err = put_entry(word, sizeof(u32), 1, fp);
                        if (err)
                                return err;
                        err = put_entry(oc->u.name, 1, count, fp);
                        if (err)
                                return err;

                        /* Security class, then the context itself. */
                        word[0] = cpu_to_le32(oc->v.sclass);
                        err = put_entry(word, sizeof(u32), 1, fp);
                        if (err)
                                return err;
                        err = context_write(p, &oc->context[0], fp);
                        if (err)
                                return err;

                        oc = oc->next;
                }

                fs = fs->next;
        }

        return 0;
}

/*
 * hashtab_map() callback used by range_write(): emit one range
 * transition.
 *
 * @key is the struct range_trans, @data the struct mls_range it maps to
 * and @ptr a struct policy_data carrying the policydb and output file.
 * The target class is only written when the policy version is at least
 * POLICYDB_VERSION_RANGETRANS.
 *
 * Returns 0 on success or the first non-zero value from a writer.
 */
static int range_write_helper(void *key, void *data, void *ptr)
{
        struct policy_data *pd = ptr;
        struct range_trans *trans = key;
        struct mls_range *range = data;
        __le32 words[2];
        int err;

        words[0] = cpu_to_le32(trans->source_type);
        words[1] = cpu_to_le32(trans->target_type);
        err = put_entry(words, sizeof(u32), 2, pd->fp);
        if (err)
                return err;

        if (pd->p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
                words[0] = cpu_to_le32(trans->target_class);
                err = put_entry(words, sizeof(u32), 1, pd->fp);
                if (err)
                        return err;
        }

        return mls_write_range_helper(range, pd->fp);
}

/*
 * Write the range transition table: the number of elements in
 * p->range_tr, followed by every entry as emitted by
 * range_write_helper().
 *
 * Returns 0 on success or the first non-zero value from put_entry() or
 * hashtab_map().
 */
static int range_write(struct policydb *p, struct policy_file *fp)
{
        struct policy_data pd;
        __le32 count;
        int err;

        pd.fp = fp;
        pd.p = p;

        count = cpu_to_le32(p->range_tr.nel);
        err = put_entry(&count, sizeof(u32), 1, fp);
        if (err)
                return err;

        /* actually write all of the entries */
        return hashtab_map(&p->range_tr, range_write_helper, &pd);
}

/*
 * hashtab_map() callback writing one filename transition key in the
 * pre-POLICYDB_VERSION_COMP_FTRANS layout.
 *
 * For every datum chained off @data and every bit set in that datum's
 * stypes bitmap, one full record is written: the name (length + bytes),
 * then four words: bit index plus one, ttype, tclass and otype.
 *
 * @key is the struct filename_trans_key, @ptr the struct policy_file.
 * Returns 0 on success or the first non-zero value from put_entry().
 */
static int filename_write_helper_compat(void *key, void *data, void *ptr)
{
        struct filename_trans_key *ft = key;
        struct filename_trans_datum *cur = data;
        struct policy_file *fp = ptr;
        struct ebitmap_node *node;
        __le32 name_len;
        __le32 rec[4];
        u32 stype_bit;
        u32 len = strlen(ft->name);
        int err;

        /* The length word is identical for every record of this key. */
        name_len = cpu_to_le32(len);

        for (;;) {
                ebitmap_for_each_positive_bit(&cur->stypes, node, stype_bit)
                {
                        err = put_entry(&name_len, sizeof(u32), 1, fp);
                        if (err)
                                return err;

                        err = put_entry(ft->name, sizeof(char), len, fp);
                        if (err)
                                return err;

                        rec[0] = cpu_to_le32(stype_bit + 1);
                        rec[1] = cpu_to_le32(ft->ttype);
                        rec[2] = cpu_to_le32(ft->tclass);
                        rec[3] = cpu_to_le32(cur->otype);

                        err = put_entry(rec, sizeof(u32), 4, fp);
                        if (err)
                                return err;
                }

                cur = cur->next;
                if (likely(!cur))
                        break;
        }

        return 0;
}

static int filename_write_helper(void *key, void *data, void *ptr)
{
        struct filename_trans_key *ft = key;
        struct filename_trans_datum *datum;
        struct policy_file *fp = ptr;
        __le32 buf[3];
        int rc;
        u32 ndatum, len = strlen(ft->name);

        buf[0] = cpu_to_le32(len);
        rc = put_entry(buf, sizeof(u32), 1, fp);
        if (rc)
                return rc;

        rc = put_entry(ft->name, sizeof(char), len, fp);
        if (rc)
                return rc;

        ndatum = 0;
        datum = data;
        do {
                ndatum++;
                datum = datum->next;
        } while (unlikely(datum));

        buf[0] = cpu_to_le32(ft->ttype);
        buf[1] = cpu_to_le32(ft->tclass);
        buf[2] = cpu_to_le32(ndatum);
        rc = put_entry(buf, sizeof(u32), 3, fp);
        if (rc)
                return rc;

        datum = data;
        do {
                rc = ebitmap_write(&datum->stypes, fp);
                if (rc)
                        return rc;

                buf[0] = cpu_to_le32(datum->otype);
                rc = put_entry(buf, sizeof(u32), 1, fp);
                if (rc)
                        return rc;

                datum = datum->next;
        } while (unlikely(datum));

        return 0;
}

static int filename_trans_write(struct policydb *p, struct policy_file *fp)
{
        __le32 buf[1];
        int rc;

        if (p->policyvers < POLICYDB_VERSION_FILENAME_TRANS)
                return 0;

        if (p->policyvers < POLICYDB_VERSION_COMP_FTRANS) {
                buf[0] = cpu_to_le32(p->compat_filename_trans_count);
                rc = put_entry(buf, sizeof(u32), 1, fp);
                if (rc)
                        return rc;

                rc = hashtab_map(&p->filename_trans,
                                 filename_write_helper_compat, fp);
        } else {
                buf[0] = cpu_to_le32(p->filename_trans.nel);
                rc = put_entry(buf, sizeof(u32), 1, fp);
                if (rc)
                        return rc;

                rc = hashtab_map(&p->filename_trans, filename_write_helper, fp);
        }
        return rc;
}

/*
 * Write the configuration data in a policy database
 * structure to a policy database binary representation
 * file.
 *
 * Sections are emitted in this order: magic number and identifier
 * string; version, config flags and table sizes; the policy capability
 * and permissive bitmaps (when the version has them); the symbol
 * tables; the access vector table; the conditional list; role
 * transitions; role allows; filename transitions; ocontexts; genfs;
 * range transitions; and finally one ebitmap per primary type taken
 * from type_attr_map_array.
 *
 * Returns 0 on success, -EINVAL when the policy version is older than
 * POLICYDB_VERSION_AVTAB or has no compatibility info, otherwise the
 * first non-zero value returned by one of the section writers.
 */
int policydb_write(struct policydb *p, struct policy_file *fp)
{
        const struct policydb_compat_info *info;
        unsigned int nsyms;
        size_t id_len;
        __le32 hdr[4];
        u32 flags, idx;
        int err;

        /*
         * refuse to write policy older than compressed avtab
         * to simplify the writer.  There are other tests dropped
         * since we assume this throughout the writer code.  Be
         * careful if you ever try to remove this restriction
         */
        if (p->policyvers < POLICYDB_VERSION_AVTAB) {
                pr_err("SELinux: refusing to write policy version %d."
                       "  Because it is less than version %d\n",
                       p->policyvers, POLICYDB_VERSION_AVTAB);
                return -EINVAL;
        }

        /* Collect the config word from the policydb flags. */
        flags = p->mls_enabled ? POLICYDB_CONFIG_MLS : 0;
        if (p->reject_unknown)
                flags |= REJECT_UNKNOWN;
        if (p->allow_unknown)
                flags |= ALLOW_UNKNOWN;

        /* Write the magic number and string identifiers. */
        id_len = strlen(POLICYDB_STRING);
        hdr[0] = cpu_to_le32(POLICYDB_MAGIC);
        hdr[1] = cpu_to_le32(id_len);
        err = put_entry(hdr, sizeof(u32), 2, fp);
        if (err)
                return err;
        err = put_entry(POLICYDB_STRING, 1, id_len, fp);
        if (err)
                return err;

        /* Write the version, config, and table sizes. */
        info = policydb_lookup_compat(p->policyvers);
        if (!info) {
                pr_err("SELinux: compatibility lookup failed for policy "
                       "version %d\n",
                       p->policyvers);
                return -EINVAL;
        }

        hdr[0] = cpu_to_le32(p->policyvers);
        hdr[1] = cpu_to_le32(flags);
        hdr[2] = cpu_to_le32(info->sym_num);
        hdr[3] = cpu_to_le32(info->ocon_num);
        err = put_entry(hdr, sizeof(u32), 4, fp);
        if (err)
                return err;

        /* Version-dependent bitmaps. */
        if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
                err = ebitmap_write(&p->policycaps, fp);
                if (err)
                        return err;
        }
        if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
                err = ebitmap_write(&p->permissive_map, fp);
                if (err)
                        return err;
        }

        /* Symbol tables: nprim and nel, then every element via write_f[]. */
        nsyms = info->sym_num;
        idx = 0;
        while (idx < nsyms) {
                struct policy_data pd;

                pd.fp = fp;
                pd.p = p;

                hdr[0] = cpu_to_le32(p->symtab[idx].nprim);
                hdr[1] = cpu_to_le32(p->symtab[idx].table.nel);
                err = put_entry(hdr, sizeof(u32), 2, fp);
                if (err)
                        return err;

                err = hashtab_map(&p->symtab[idx].table, write_f[idx], &pd);
                if (err)
                        return err;

                idx++;
        }

        /* Remaining sections; stop at the first writer that fails. */
        err = avtab_write(p, &p->te_avtab, fp);
        if (!err)
                err = cond_write_list(p, fp);
        if (!err)
                err = role_trans_write(p, fp);
        if (!err)
                err = role_allow_write(p->role_allow, fp);
        if (!err)
                err = filename_trans_write(p, fp);
        if (!err)
                err = ocontext_write(p, info, fp);
        if (!err)
                err = genfs_write(p, fp);
        if (!err)
                err = range_write(p, fp);
        if (err)
                return err;

        /* One attribute bitmap per primary type. */
        for (idx = 0; idx < p->p_types.nprim; idx++) {
                err = ebitmap_write(&p->type_attr_map_array[idx], fp);
                if (err)
                        return err;
        }

        return 0;
}














































































































































































































































































   22 














   22 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2015 Imagination Technologies Ltd
 * Author: Qais Yousef <qais.yousef@imgtec.com>
 *
 * This file contains driver APIs to the IPI subsystem.
 */

#define pr_fmt(fmt) "genirq/ipi: " fmt

#include <linux/irqdomain.h>
#include <linux/irq.h>

/**
 * irq_reserve_ipi() - Setup an IPI to destination cpumask
 * @domain:        IPI domain
 * @dest:        cpumask of CPUs which can receive the IPI
 *
 * Allocate a virq that can be used to send IPI to any CPU in dest mask.
 * The mask must be a non-empty subset of cpu_possible_mask. For domains
 * that are not single-IPI domains it must additionally be consecutive
 * (no holes), because one Linux irq is allocated per CPU in the mask.
 *
 * Return: Linux IRQ number on success or error code on failure
 */
int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest)
{
        unsigned int nr_irqs, offset, idx;
        int virq;

        if (!domain || !irq_domain_is_ipi(domain)) {
                pr_warn("Reservation on a non IPI domain\n");
                return -EINVAL;
        }

        if (!cpumask_subset(dest, cpu_possible_mask)) {
                pr_warn("Reservation is not in possible_cpu_mask\n");
                return -EINVAL;
        }

        nr_irqs = cpumask_weight(dest);
        if (!nr_irqs) {
                pr_warn("Reservation for empty destination mask\n");
                return -EINVAL;
        }

        if (!irq_domain_is_ipi_single(domain)) {
                unsigned int hole;

                /*
                 * The IPI requires a separate HW irq on each CPU. We require
                 * that the destination mask is consecutive. If an
                 * implementation needs to support holes, it can reserve
                 * several IPI ranges.
                 */
                offset = cpumask_first(dest);

                /*
                 * Look for the first clear bit after the first set one; a
                 * further set bit beyond it means the mask has a hole,
                 * which is not supported for now.
                 */
                hole = cpumask_next_zero(offset, dest);
                if (hole < nr_cpu_ids &&
                    cpumask_next(hole, dest) < nr_cpu_ids) {
                        pr_warn("Destination mask has holes\n");
                        return -EINVAL;
                }
        } else {
                /*
                 * If the underlying implementation uses a single HW irq on
                 * all cpus then we only need a single Linux irq number for
                 * it. We have no restrictions vs. the destination mask. The
                 * underlying implementation can deal with holes nicely.
                 */
                nr_irqs = 1;
                offset = 0;
        }

        virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE, NULL);
        if (virq <= 0) {
                pr_warn("Can't reserve IPI, failed to alloc descs\n");
                return -ENOMEM;
        }

        virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
                                       (void *) dest, true, NULL);
        if (virq <= 0) {
                pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
                goto free_descs;
        }

        /* Record the mask and base CPU on every irq of the range. */
        for (idx = 0; idx < nr_irqs; idx++) {
                struct irq_data *data = irq_get_irq_data(virq + idx);

                cpumask_copy(data->common->affinity, dest);
                data->common->ipi_offset = offset;
                irq_set_status_flags(virq + idx, IRQ_NO_BALANCING);
        }
        return virq;

free_descs:
        /*
         * NOTE(review): virq holds the non-positive return value of
         * __irq_domain_alloc_irqs() here, not the descriptor base obtained
         * from irq_domain_alloc_descs(). Whether the descriptors still
         * need freeing at this point depends on that callee; confirm
         * before changing this path.
         */
        irq_free_descs(virq, nr_irqs);
        return -EBUSY;
}

/**
 * irq_destroy_ipi() - unreserve an IPI that was previously allocated
 * @irq:        Linux IRQ number to be destroyed
 * @dest:        cpumask of CPUs which should have the IPI removed
 *
 * The IPIs allocated with irq_reserve_ipi() are returned to the system
 * destroying all virqs associated with them. For per-CPU IPI domains
 * the range freed starts at the irq belonging to the first CPU in @dest
 * and spans one irq per CPU in @dest; otherwise the single irq is freed.
 *
 * Return: %0 on success or error code on failure.
 */
int irq_destroy_ipi(unsigned int irq, const struct cpumask *dest)
{
        struct irq_data *data = irq_get_irq_data(irq);
        const struct cpumask *reserved;
        struct irq_domain *domain;
        unsigned int first = irq;
        unsigned int count = 1;

        if (!irq || !data)
                return -EINVAL;

        domain = data->domain;
        if (WARN_ON(domain == NULL))
                return -EINVAL;

        if (!irq_domain_is_ipi(domain)) {
                pr_warn("Trying to destroy a non IPI domain!\n");
                return -EINVAL;
        }

        /*
         * Must be destroying a subset of CPUs to which this IPI
         * was set up to target
         */
        reserved = irq_data_get_affinity_mask(data);
        if (!reserved || WARN_ON(!cpumask_subset(dest, reserved)))
                return -EINVAL;

        if (irq_domain_is_ipi_per_cpu(domain)) {
                first = irq + cpumask_first(dest) - data->common->ipi_offset;
                count = cpumask_weight(dest);
        }

        irq_domain_free_irqs(first, count);
        return 0;
}

/**
 * ipi_get_hwirq - Get the hwirq associated with an IPI to a CPU
 * @irq:        Linux IRQ number
 * @cpu:        the target CPU
 *
 * When dealing with coprocessors IPI, we need to inform the coprocessor of
 * the hwirq it needs to use to receive and send IPIs.
 *
 * Return: hwirq value on success or INVALID_HWIRQ on failure.
 */
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
        struct irq_data *data = irq_get_irq_data(irq);
        const struct cpumask *mask;

        if (!data || cpu >= nr_cpu_ids)
                return INVALID_HWIRQ;

        /* @cpu must be one of the CPUs recorded in the irq's affinity mask. */
        mask = irq_data_get_affinity_mask(data);
        if (!mask || !cpumask_test_cpu(cpu, mask))
                return INVALID_HWIRQ;

        /*
         * Get the real hardware irq number if the underlying implementation
         * uses a separate irq per cpu. If the underlying implementation uses
         * a single hardware irq for all cpus then the IPI send mechanism
         * needs to take care of the cpu destinations.
         */
        if (irq_domain_is_ipi_per_cpu(data->domain)) {
                data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
                if (!data)
                        return INVALID_HWIRQ;
        }

        return irqd_to_hwirq(data);
}
EXPORT_SYMBOL_GPL(ipi_get_hwirq);

static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
                           const struct cpumask *dest, unsigned int cpu)
{
        const struct cpumask *ipimask;

        if (!chip || !data)
                return -EINVAL;

        if (!chip->ipi_send_single && !chip->ipi_send_mask)
                return -EINVAL;

        if (cpu >= nr_cpu_ids)
                return -EINVAL;

        ipimask = irq_data_get_affinity_mask(data);
        if (!ipimask)
                return -EINVAL;

        if (dest) {
                if (!cpumask_subset(dest, ipimask))
                        return -EINVAL;
        } else {
                if (!cpumask_test_cpu(cpu, ipimask))
                        return -EINVAL;
        }
        return 0;
}

/**
 * __ipi_send_single - send an IPI to a target Linux SMP CPU
 * @desc:        pointer to irq_desc of the IRQ
 * @cpu:        destination CPU, must in the destination mask passed to
 *                irq_reserve_ipi()
 *
 * This function is for architecture or core code to speed up IPI sending. Not
 * usable from driver code.
 *
 * When the chip has no ipi_send_single callback, its ipi_send_mask
 * callback is used with a mask containing only @cpu.
 *
 * Return: %0 on success or negative error number on failure.
 */
int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
{
        struct irq_data *data = irq_desc_get_irq_data(desc);
        struct irq_chip *chip = irq_data_get_irq_chip(data);

#ifdef DEBUG
        /*
         * Minimise the overhead by omitting the checks for Linux SMP IPIs.
         * Since the callers should be arch or core code which is generally
         * trusted, only check for errors when debugging.
         */
        if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
                return -EINVAL;
#endif
        if (chip->ipi_send_single) {
                /* FIXME: Store this information in irqdata flags */
                if (irq_domain_is_ipi_per_cpu(data->domain) &&
                    cpu != data->common->ipi_offset) {
                        /* use the correct data for that cpu */
                        unsigned int target = data->irq + cpu -
                                              data->common->ipi_offset;

                        data = irq_get_irq_data(target);
                }
                chip->ipi_send_single(data, cpu);
        } else {
                chip->ipi_send_mask(data, cpumask_of(cpu));
        }

        return 0;
}

/**
 * __ipi_send_mask - send an IPI to target Linux SMP CPU(s)
 * @desc:        pointer to irq_desc of the IRQ
 * @dest:        dest CPU(s), must be a subset of the mask passed to
 *                irq_reserve_ipi()
 *
 * This function is for architecture or core code to speed up IPI sending. Not
 * usable from driver code.
 *
 * If the chip provides ipi_send_mask it is called once with @dest.
 * Otherwise ipi_send_single is called for every CPU in @dest; for per-CPU
 * IPI domains each call gets the irq_data of the irq belonging to that
 * CPU.
 *
 * Return: %0 on success or negative error number on failure.
 */
int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
{
        struct irq_data *data = irq_desc_get_irq_data(desc);
        struct irq_chip *chip = irq_data_get_irq_chip(data);
        unsigned int cpu;

#ifdef DEBUG
        /*
         * Minimise the overhead by omitting the checks for Linux SMP IPIs.
         * Since the callers should be arch or core code which is generally
         * trusted, only check for errors when debugging.
         */
        if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
                return -EINVAL;
#endif
        if (chip->ipi_send_mask) {
                chip->ipi_send_mask(data, dest);
                return 0;
        }

        if (irq_domain_is_ipi_per_cpu(data->domain)) {
                /*
                 * Read the base irq and CPU offset once from the IPI's own
                 * irq_data. The per-CPU lookups below go into a separate
                 * pointer, so the loop never dereferences the result of
                 * irq_get_irq_data() to compute the next irq number.
                 */
                const unsigned int base = data->irq;
                const unsigned int offset = data->common->ipi_offset;

                for_each_cpu(cpu, dest) {
                        struct irq_data *cpu_data =
                                irq_get_irq_data(base + cpu - offset);

                        chip->ipi_send_single(cpu_data, cpu);
                }
        } else {
                for_each_cpu(cpu, dest)
                        chip->ipi_send_single(data, cpu);
        }
        return 0;
}

/**
 * ipi_send_single - Send an IPI to a single CPU
 * @virq:        Linux IRQ number from irq_reserve_ipi()
 * @cpu:        destination CPU, must in the destination mask passed to
 *                irq_reserve_ipi()
 *
 * The request is validated with ipi_send_verify() before being handed to
 * __ipi_send_single(); a failed validation triggers a one-time warning.
 *
 * Return: %0 on success or negative error number on failure.
 */
int ipi_send_single(unsigned int virq, unsigned int cpu)
{
        struct irq_desc *desc = irq_to_desc(virq);
        struct irq_data *data = NULL;
        struct irq_chip *chip = NULL;

        /* Resolve data and chip only as far as the lookups succeed. */
        if (desc)
                data = irq_desc_get_irq_data(desc);
        if (data)
                chip = irq_data_get_irq_chip(data);

        if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
                return -EINVAL;

        return __ipi_send_single(desc, cpu);
}
EXPORT_SYMBOL_GPL(ipi_send_single);

/**
 * ipi_send_mask - Send an IPI to target CPU(s)
 * @virq:        Linux IRQ number from irq_reserve_ipi()
 * @dest:        dest CPU(s), must be a subset of the mask passed to
 *                irq_reserve_ipi()
 *
 * The request is validated with ipi_send_verify() before being handed to
 * __ipi_send_mask(); a failed validation triggers a one-time warning.
 *
 * Return: %0 on success or negative error number on failure.
 */
int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
{
        struct irq_desc *desc = irq_to_desc(virq);
        struct irq_data *data = NULL;
        struct irq_chip *chip = NULL;

        /* Resolve data and chip only as far as the lookups succeed. */
        if (desc)
                data = irq_desc_get_irq_data(desc);
        if (data)
                chip = irq_data_get_irq_chip(data);

        if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
                return -EINVAL;

        return __ipi_send_mask(desc, dest);
}
EXPORT_SYMBOL_GPL(ipi_send_mask);





































   17 























































    5 





    5 


    5 
    5 









































































    4 


















    4 









    1 





    3 





    1 
    2 
    3 



    3 















    1 










   11 







   11 






   11 



    3 

    9 







































    6 

















    1 






    1 





    4 




    2 








    8 

    2 





    1 








    8 




























    5 



    6 






   11 









   11 










   10 
    1 








    3 
    5 

    3 
    5 


    8 




   17 















































    8 

    8 








    5 
















    2 



    5 
    3 






    2 


    2 


















   24 
    1 



    7 

   17 








   26 




   26 
    1 
















   34 




   34 








    2 























































































    6 




















    1 































    1 


    1 












    6 


    6 













   15 
   10 
    7 
    4 
    2 
    4 


















   20 







    2 















   10 

   11 




   18 











   11 
   11 


















   14 








    2 





   11 
    8 
    3 
    1 
    3 



    7 



    4 

    4 
















   15 













   26 











    4 







    1 



   12 





    8 

    3 













   40 

   15 

   26 




  248 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *        Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQCHIP

static struct workqueue_struct *irqfd_cleanup_wq;

/*
 * Weak default: every irqfd request is allowed. Being a weak symbol, a
 * non-weak definition elsewhere replaces this one at link time.
 */
bool __attribute__((weak)) kvm_arch_irqfd_allowed(struct kvm *kvm,
                                                  struct kvm_irqfd *args)
{
        /* Neither argument is inspected by the default implementation. */
        return true;
}

/*
 * Work handler for irqfd->inject.
 *
 * Without a resampler the GSI is raised and immediately lowered again
 * using the userspace IRQ source ID. With a resampler the GSI is only
 * raised, using the resample IRQ source ID.
 */
static void irqfd_inject(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd;
        struct kvm *kvm;

        irqfd = container_of(work, struct kvm_kernel_irqfd, inject);
        kvm = irqfd->kvm;

        if (irqfd->resampler) {
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            irqfd->gsi, 1, false);
                return;
        }

        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1, false);
        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0, false);
}

static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
{
        struct kvm_kernel_irqfd *irqfd;

        list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
                                 srcu_read_lock_held(&resampler->kvm->irq_srcu))
                eventfd_signal(irqfd->resamplefd);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI.  We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 *
 * The notification itself runs inside a kvm->irq_srcu read-side
 * section.
 */
static void irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
        struct kvm_kernel_irqfd_resampler *resampler =
                container_of(kian, struct kvm_kernel_irqfd_resampler,
                             notifier);
        struct kvm *kvm = resampler->kvm;
        int srcu_idx;

        /* Single de-assert for all irqfds sharing this GSI. */
        kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                    resampler->notifier.gsi, 0, false);

        srcu_idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_resampler_notify(resampler);
        srcu_read_unlock(&kvm->irq_srcu, srcu_idx);
}

/*
 * Detach @irqfd from its resampler.
 *
 * If that leaves the resampler without any irqfd, the resampler itself is
 * unlinked from the VM, its ack notifier is unregistered, the line is
 * de-asserted and the resampler is freed.  Otherwise only an SRCU
 * synchronization is performed.  Everything runs under
 * kvm->irqfds.resampler_lock.
 */
static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
        struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
        struct kvm *kvm = resampler->kvm;

        mutex_lock(&kvm->irqfds.resampler_lock);

        list_del_rcu(&irqfd->resampler_link);

        if (list_empty(&resampler->list)) {
                /* Last user is gone: tear the resampler down as well. */
                list_del_rcu(&resampler->link);
                kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
                /*
                 * synchronize_srcu_expedited(&kvm->irq_srcu) already called
                 * in kvm_unregister_irq_ack_notifier().
                 */
                kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
                            resampler->notifier.gsi, 0, false);
                kfree(resampler);
        } else {
                /* Resampler stays in use: synchronize against SRCU readers. */
                synchronize_srcu_expedited(&kvm->irq_srcu);
        }

        mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 *
 * Work handler for irqfd->shutdown, queued by irqfd_deactivate().  It
 * unhooks the irqfd from its eventfd's wait queue, flushes pending inject
 * work, detaches any resampler, and finally drops the eventfd references
 * and frees the irqfd.
 */
static void
irqfd_shutdown(struct work_struct *work)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(work, struct kvm_kernel_irqfd, shutdown);
        struct kvm *kvm = irqfd->kvm;
        /* Receives the eventfd count on removal; not used afterwards. */
        u64 cnt;

        /* Make sure irqfd has been initialized in assign path. */
        synchronize_srcu_expedited(&kvm->irq_srcu);

        /*
         * Synchronize with the wait-queue and unhook ourselves to prevent
         * further events.
         */
        eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

        /*
         * We know no new events will be scheduled at this point, so block
         * until all previously outstanding events have completed
         */
        flush_work(&irqfd->inject);

        /* Resampling irqfds also leave their resampler and drop resamplefd. */
        if (irqfd->resampler) {
                irqfd_resampler_shutdown(irqfd);
                eventfd_ctx_put(irqfd->resamplefd);
        }

        /*
         * It is now safe to release the object's resources
         */
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
        irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
        eventfd_ctx_put(irqfd->eventfd);
        kfree(irqfd);
}


/*
 * An irqfd counts as active while it is linked on a list through
 * irqfd->list.
 *
 * assumes kvm->irqfds.lock is held
 */
static bool irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
        return !list_empty(&irqfd->list);
}

/*
 * Unlink an active irqfd from its list, which marks it inactive, and hand
 * the rest of the teardown to the cleanup workqueue.
 *
 * assumes kvm->irqfds.lock is held
 */
static void irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
        /* Deactivating an irqfd that is not active is a bug. */
        BUG_ON(!irqfd_is_active(irqfd));

        list_del_init(&irqfd->list);
        queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/*
 * Default (weak) implementation: never injects and always reports
 * -EWOULDBLOCK.  irqfd_wakeup() reacts to that value by scheduling the
 * irqfd's inject work instead.
 */
int __attribute__((weak))
kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq,
                          struct kvm *kvm, int irq_source_id, int level,
                          bool line_status)
{
        return -EWOULDBLOCK;
}

/*
 * Wait-queue callback installed on the eventfd by kvm_irqfd_assign().
 *
 * EPOLLIN:  consume the eventfd count and inject the interrupt, either
 *           directly through kvm_arch_set_irq_inatomic() or, when that
 *           reports -EWOULDBLOCK, by scheduling the inject work.
 * EPOLLHUP: the eventfd is going away; deactivate the irqfd unless that
 *           has already been done.
 *
 * Returns 1 when an EPOLLIN event was handled, 0 otherwise.
 *
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct kvm_kernel_irqfd *irqfd =
                container_of(wait, struct kvm_kernel_irqfd, wait);
        __poll_t flags = key_to_poll(key);
        struct kvm_kernel_irq_routing_entry irq;
        struct kvm *kvm = irqfd->kvm;
        unsigned seq;
        int idx;
        int ret = 0;

        if (flags & EPOLLIN) {
                /* The count read from the eventfd is not used further. */
                u64 cnt;
                eventfd_ctx_do_read(irqfd->eventfd, &cnt);

                idx = srcu_read_lock(&kvm->irq_srcu);
                /*
                 * Copy the cached routing entry; retry if a writer of
                 * irq_entry_sc (e.g. irqfd_update()) ran concurrently.
                 */
                do {
                        seq = read_seqcount_begin(&irqfd->irq_entry_sc);
                        irq = irqfd->irq_entry;
                } while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
                /* An event has been signaled, inject an interrupt */
                if (kvm_arch_set_irq_inatomic(&irq, kvm,
                                              KVM_USERSPACE_IRQ_SOURCE_ID, 1,
                                              false) == -EWOULDBLOCK)
                        schedule_work(&irqfd->inject);
                srcu_read_unlock(&kvm->irq_srcu, idx);
                ret = 1;
        }

        if (flags & EPOLLHUP) {
                /* The eventfd is closing, detach from KVM */
                unsigned long iflags;

                spin_lock_irqsave(&kvm->irqfds.lock, iflags);

                /*
                 * We must check if someone deactivated the irqfd before
                 * we could acquire the irqfds.lock since the item is
                 * deactivated from the KVM side before it is unhooked from
                 * the wait-queue.  If it is already deactivated, we can
                 * simply return knowing the other side will cleanup for us.
                 * We cannot race against the irqfd going away since the
                 * other side is required to acquire wqh->lock, which we hold
                 */
                if (irqfd_is_active(irqfd))
                        irqfd_deactivate(irqfd);

                spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
        }

        return ret;
}

/*
 * poll_table queue callback: recover the irqfd that embeds @pt and add its
 * wait entry to @wqh with add_wait_queue_priority().
 */
static void irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
                                    poll_table *pt)
{
        struct kvm_kernel_irqfd *irqfd;

        irqfd = container_of(pt, struct kvm_kernel_irqfd, pt);
        add_wait_queue_priority(wqh, &irqfd->wait);
}

/*
 * Refresh the routing entry cached in @irqfd for its GSI.
 *
 * The cache is filled only when the GSI maps to exactly one routing entry;
 * in every other case the cached entry's type is reset to 0.  The store is
 * wrapped in the irq_entry_sc write section so that seqcount readers retry.
 *
 * Must be called under irqfds.lock
 */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
        struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
        int n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

        write_seqcount_begin(&irqfd->irq_entry_sc);

        if (n_entries != 1)
                irqfd->irq_entry.type = 0;
        else
                irqfd->irq_entry = entries[0];

        write_seqcount_end(&irqfd->irq_entry_sc);
}

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
/*
 * Default (weak) no-op.  kvm_irqfd_assign() installs this symbol as the
 * irq bypass consumer's ->stop callback.
 */
void __attribute__((weak))
kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons)
{
}

/*
 * Default (weak) no-op.  kvm_irqfd_assign() installs this symbol as the
 * irq bypass consumer's ->start callback.
 */
void __attribute__((weak))
kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons)
{
}

/*
 * Default (weak) implementation: does nothing and reports success (0).
 * kvm_irq_routing_update() calls this hook when the route of an irqfd with
 * a producer has changed.
 */
int __attribute__((weak))
kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
                              uint32_t guest_irq, bool set)
{
        return 0;
}

/*
 * Default (weak) implementation: always reports the route as changed,
 * without comparing @old against @new.
 */
bool __attribute__((weak))
kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
                             struct kvm_kernel_irq_routing_entry *new)
{
        return true;
}
#endif

/*
 * Create an irqfd for args->gsi backed by the eventfd args->fd and hook it
 * into that eventfd's wait queue.
 *
 * With KVM_IRQFD_FLAG_RESAMPLE the irqfd is additionally attached to the
 * resampler for its GSI (allocated here if none exists yet) and
 * args->resamplefd is recorded as the eventfd signalled by the resampler.
 *
 * Returns 0 on success, -EAGAIN if kvm_arch_intc_initialized() is false,
 * -EINVAL if kvm_arch_irqfd_allowed() refuses the request, -ENOMEM on
 * allocation failure, -EBADF if args->fd is empty, -EBUSY if the eventfd is
 * already used by another irqfd of this VM, or the error returned while
 * resolving either eventfd.
 */
static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;
        struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
        int ret;
        __poll_t events;
        int idx;

        if (!kvm_arch_intc_initialized(kvm))
                return -EAGAIN;

        if (!kvm_arch_irqfd_allowed(kvm, args))
                return -EINVAL;

        irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
        if (!irqfd)
                return -ENOMEM;

        irqfd->kvm = kvm;
        irqfd->gsi = args->gsi;
        INIT_LIST_HEAD(&irqfd->list);
        INIT_WORK(&irqfd->inject, irqfd_inject);
        INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
        seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

        /*
         * NOTE(review): no explicit release of 'f' appears in this function;
         * CLASS(fd, ...) presumably drops the reference at scope exit --
         * confirm.
         */
        CLASS(fd, f)(args->fd);
        if (fd_empty(f)) {
                ret = -EBADF;
                goto out;
        }

        eventfd = eventfd_ctx_fileget(fd_file(f));
        if (IS_ERR(eventfd)) {
                ret = PTR_ERR(eventfd);
                goto out;
        }

        irqfd->eventfd = eventfd;

        if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
                struct kvm_kernel_irqfd_resampler *resampler;

                resamplefd = eventfd_ctx_fdget(args->resamplefd);
                if (IS_ERR(resamplefd)) {
                        ret = PTR_ERR(resamplefd);
                        goto fail;
                }

                irqfd->resamplefd = resamplefd;
                INIT_LIST_HEAD(&irqfd->resampler_link);

                mutex_lock(&kvm->irqfds.resampler_lock);

                /* Reuse the resampler already serving this GSI, if any. */
                list_for_each_entry(resampler,
                                    &kvm->irqfds.resampler_list, link) {
                        if (resampler->notifier.gsi == irqfd->gsi) {
                                irqfd->resampler = resampler;
                                break;
                        }
                }

                /* None found: allocate one and register its ack notifier. */
                if (!irqfd->resampler) {
                        resampler = kzalloc(sizeof(*resampler),
                                            GFP_KERNEL_ACCOUNT);
                        if (!resampler) {
                                ret = -ENOMEM;
                                mutex_unlock(&kvm->irqfds.resampler_lock);
                                goto fail;
                        }

                        resampler->kvm = kvm;
                        INIT_LIST_HEAD(&resampler->list);
                        resampler->notifier.gsi = irqfd->gsi;
                        resampler->notifier.irq_acked = irqfd_resampler_ack;
                        INIT_LIST_HEAD(&resampler->link);

                        list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
                        kvm_register_irq_ack_notifier(kvm,
                                                      &resampler->notifier);
                        irqfd->resampler = resampler;
                }

                list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
                synchronize_srcu_expedited(&kvm->irq_srcu);

                mutex_unlock(&kvm->irqfds.resampler_lock);
        }

        /*
         * Install our own custom wake-up handling so we are notified via
         * a callback whenever someone signals the underlying eventfd
         */
        init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
        init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

        spin_lock_irq(&kvm->irqfds.lock);

        /* Reject an eventfd that another irqfd of this VM already uses. */
        ret = 0;
        list_for_each_entry(tmp, &kvm->irqfds.items, list) {
                if (irqfd->eventfd != tmp->eventfd)
                        continue;
                /* This fd is used for another irq already. */
                ret = -EBUSY;
                spin_unlock_irq(&kvm->irqfds.lock);
                goto fail;
        }

        /*
         * The irq_srcu read side taken here stays held until just before
         * the successful return below.
         */
        idx = srcu_read_lock(&kvm->irq_srcu);
        irqfd_update(kvm, irqfd);

        list_add_tail(&irqfd->list, &kvm->irqfds.items);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Check if there was an event already pending on the eventfd
         * before we registered, and trigger it as if we didn't miss it.
         */
        events = vfs_poll(fd_file(f), &irqfd->pt);

        if (events & EPOLLIN)
                schedule_work(&irqfd->inject);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
        if (kvm_arch_has_irq_bypass()) {
                irqfd->consumer.token = (void *)irqfd->eventfd;
                irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
                irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
                irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
                irqfd->consumer.start = kvm_arch_irq_bypass_start;
                /*
                 * A registration failure is only logged; the assign itself
                 * still succeeds (0 is returned below).
                 */
                ret = irq_bypass_register_consumer(&irqfd->consumer);
                if (ret)
                        pr_info("irq bypass consumer (token %p) registration fails: %d\n",
                                irqfd->consumer.token, ret);
        }
#endif

        srcu_read_unlock(&kvm->irq_srcu, idx);
        return 0;

        /* Error unwinding: detach the resampler and drop eventfd references. */
fail:
        if (irqfd->resampler)
                irqfd_resampler_shutdown(irqfd);

        if (resamplefd && !IS_ERR(resamplefd))
                eventfd_ctx_put(resamplefd);

        if (eventfd && !IS_ERR(eventfd))
                eventfd_ctx_put(eventfd);

out:
        kfree(irqfd);
        return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        struct kvm_irq_ack_notifier *kian;
        int gsi, idx;

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
                                          link, srcu_read_lock_held(&kvm->irq_srcu))
                        if (kian->gsi == gsi) {
                                srcu_read_unlock(&kvm->irq_srcu, idx);
                                return true;
                        }

        srcu_read_unlock(&kvm->irq_srcu, idx);

        return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
        struct kvm_irq_ack_notifier *kian;

        hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
                                  link, srcu_read_lock_held(&kvm->irq_srcu))
                if (kian->gsi == gsi)
                        kian->irq_acked(kian);
}

/*
 * Trace the ack of (@irqchip, @pin), translate the pin to a GSI and, if one
 * exists (i.e. the mapping is not -1), run the ack notifiers for that GSI.
 * The translation and notification happen inside a kvm->irq_srcu read-side
 * section.
 */
void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        int srcu_idx;
        int gsi;

        trace_kvm_ack_irq(irqchip, pin);

        srcu_idx = srcu_read_lock(&kvm->irq_srcu);

        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1)
                kvm_notify_acked_gsi(kvm, gsi);

        srcu_read_unlock(&kvm->irq_srcu, srcu_idx);
}

/*
 * Add @kian to the VM's ack notifier list.  The insertion is done under
 * kvm->irq_lock; the arch hook for list updates is called afterwards,
 * outside the lock.
 */
void kvm_register_irq_ack_notifier(struct kvm *kvm,
                                   struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
        mutex_unlock(&kvm->irq_lock);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

/*
 * Remove @kian from the VM's ack notifier list.  The removal is done under
 * kvm->irq_lock and is followed by synchronize_srcu_expedited() on
 * kvm->irq_srcu before the arch hook for list updates is called.
 * irqfd_resampler_shutdown() relies on that synchronization before it
 * frees the structure embedding @kian.
 */
void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
                                    struct kvm_irq_ack_notifier *kian)
{
        mutex_lock(&kvm->irq_lock);
        hlist_del_init_rcu(&kian->link);
        mutex_unlock(&kvm->irq_lock);
        synchronize_srcu_expedited(&kvm->irq_srcu);
        kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

/*
 * shutdown any irqfd's that match fd+gsi
 *
 * Returns 0, including when no irqfd matched, or the error from resolving
 * args->fd to an eventfd context.
 */
static int kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
        struct kvm_kernel_irqfd *cur, *next;
        struct eventfd_ctx *eventfd = eventfd_ctx_fdget(args->fd);

        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(cur, next, &kvm->irqfds.items, list) {
                if (cur->eventfd != eventfd || cur->gsi != args->gsi)
                        continue;

                /*
                 * Clear irq_entry.type in case another thread calls
                 * kvm_irq_routing_update before the workqueue flush
                 * below (irqfds.lock is what synchronizes us with
                 * kvm_irq_routing_update).
                 */
                write_seqcount_begin(&cur->irq_entry_sc);
                cur->irq_entry.type = 0;
                write_seqcount_end(&cur->irq_entry_sc);

                irqfd_deactivate(cur);
        }

        spin_unlock_irq(&kvm->irqfds.lock);
        eventfd_ctx_put(eventfd);

        /*
         * Wait for all outstanding shutdown jobs, so that no further
         * interrupt arrives on this gsi once deassign has returned.
         */
        flush_workqueue(irqfd_cleanup_wq);

        return 0;
}

/*
 * Entry point for an irqfd request: reject unknown flag bits with -EINVAL,
 * then dispatch to deassign when KVM_IRQFD_FLAG_DEASSIGN is set and to
 * assign otherwise.
 */
int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
        if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
                return -EINVAL;

        return (args->flags & KVM_IRQFD_FLAG_DEASSIGN) ?
                kvm_irqfd_deassign(kvm, args) : kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released. Shutdown all
 * irqfds that still remain open
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
        struct kvm_kernel_irqfd *irqfd, *tmp;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
                irqfd_deactivate(irqfd);

        spin_unlock_irq(&kvm->irqfds.lock);

        /*
         * Block until we know all outstanding shutdown jobs have completed
         * since we do not take a kvm* reference.
         */
        flush_workqueue(irqfd_cleanup_wq);

}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu_expedited(&kvm->irq_srcu) afterwards.
 *
 * Every irqfd on the VM's list gets its cached routing entry refreshed
 * under irqfds.lock.  With irq bypass enabled, an irqfd that has a
 * producer and whose route is reported as changed also has the arch
 * routing-update hook invoked.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
        struct kvm_kernel_irqfd *irqfd;

        spin_lock_irq(&kvm->irqfds.lock);

        list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
                /* Under irqfds.lock, so can read irq_entry safely */
                struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

                irqfd_update(kvm, irqfd);

#if IS_ENABLED(CONFIG_HAVE_KVM_IRQ_BYPASS)
                /* Compare the pre-update snapshot with the refreshed entry. */
                if (irqfd->producer &&
                    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
                        int ret = kvm_arch_update_irqfd_routing(
                                        irqfd->kvm, irqfd->producer->irq,
                                        irqfd->gsi, 1);
                        /* A failure is only warned about, not propagated. */
                        WARN_ON(ret);
                }
#endif
        }

        spin_unlock_irq(&kvm->irqfds.lock);
}

/*
 * Notify the resampler registered for the GSI that (@irqchip, @pin) maps
 * to.  Only the first resampler with a matching GSI is notified.
 *
 * Returns true if a resampler was found and notified, false if the pin maps
 * to no GSI (-1) or no resampler carries that GSI.  The lookup runs inside
 * a kvm->irq_srcu read-side section.
 */
bool kvm_notify_irqfd_resampler(struct kvm *kvm,
                                unsigned int irqchip,
                                unsigned int pin)
{
        struct kvm_kernel_irqfd_resampler *resampler;
        bool notified = false;
        int gsi, idx;

        idx = srcu_read_lock(&kvm->irq_srcu);
        gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
        if (gsi != -1) {
                list_for_each_entry_srcu(resampler,
                                         &kvm->irqfds.resampler_list, link,
                                         srcu_read_lock_held(&kvm->irq_srcu)) {
                        if (resampler->notifier.gsi != gsi)
                                continue;

                        irqfd_resampler_notify(resampler);
                        notified = true;
                        break;
                }
        }
        srcu_read_unlock(&kvm->irq_srcu, idx);

        return notified;
}

/*
 * Create the host-wide workqueue on which deferred irqfd shutdown requests
 * from all vm* instances are issued.  A dedicated queue makes it easy to
 * flush those work items when a VM exits.
 *
 * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
 */
int kvm_irqfd_init(void)
{
        irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);

        return irqfd_cleanup_wq ? 0 : -ENOMEM;
}

/* Destroy the irqfd cleanup workqueue created by kvm_irqfd_init(). */
void kvm_irqfd_exit(void)
{
        destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * Userspace can register a PIO/MMIO address with an eventfd to be
 * notified through that eventfd whenever the address is written.
 * --------------------------------------------------------------------
 */

/*
 * One registered ioeventfd.
 *
 * @list:      link on kvm->ioeventfds
 * @addr:      address a write must hit exactly
 * @length:    access length a write must have; 0 matches any length
 * @eventfd:   eventfd context signalled on a matching write
 * @datamatch: value a write must carry when @wildcard is false
 * @dev:       I/O device registered on the bus; to_ioeventfd() maps it back
 * @bus_idx:   index of the bus the device is registered on
 * @wildcard:  true when the written data is not compared
 */
struct _ioeventfd {
        struct list_head     list;
        u64                  addr;
        int                  length;
        struct eventfd_ctx  *eventfd;
        u64                  datamatch;
        struct kvm_io_device dev;
        u8                   bus_idx;
        bool                 wildcard;
};

/* Map an embedded kvm_io_device back to the _ioeventfd that contains it. */
static inline struct _ioeventfd *to_ioeventfd(struct kvm_io_device *dev)
{
        return container_of(dev, struct _ioeventfd, dev);
}

/*
 * Drop the eventfd reference held by @p, unlink @p from the list it is on
 * and free it.
 */
static void ioeventfd_release(struct _ioeventfd *p)
{
        eventfd_ctx_put(p->eventfd);
        list_del(&p->list);
        kfree(p);
}

/*
 * Decide whether a write of @len bytes at @addr carrying *@val hits @p.
 *
 * The address must match exactly.  A registered length of 0 then matches
 * unconditionally; otherwise the length must match too, and unless @p is a
 * wildcard the written value must equal p->datamatch.  Lengths other than
 * 1, 2, 4 or 8 never match a datamatch entry.
 */
static bool ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len,
                               const void *val)
{
        u64 data;

        /* address must be precise for a hit */
        if (p->addr != addr)
                return false;

        /* length = 0 means only look at the address, so always a hit */
        if (p->length == 0)
                return true;

        /* address-range must be precise for a hit */
        if (p->length != len)
                return false;

        /* all else equal, wildcard is always a hit */
        if (p->wildcard)
                return true;

        /* otherwise, we have to actually compare the data */
        BUG_ON(!IS_ALIGNED((unsigned long)val, len));

        switch (len) {
        case 1:
                data = *(u8 *)val;
                break;
        case 2:
                data = *(u16 *)val;
                break;
        case 4:
                data = *(u32 *)val;
                break;
        case 8:
                data = *(u64 *)val;
                break;
        default:
                return false;
        }

        return data == p->datamatch;
}

/*
 * I/O device write callback: signal the eventfd when the write matches this
 * ioeventfd's address/length/data criteria.
 *
 * Returns 0 when the eventfd was signalled, -EOPNOTSUPP when the write does
 * not match.
 */
static int ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
                           gpa_t addr, int len, const void *val)
{
        struct _ioeventfd *p = to_ioeventfd(this);

        if (ioeventfd_in_range(p, addr, len, val)) {
                eventfd_signal(p->eventfd);
                return 0;
        }

        return -EOPNOTSUPP;
}

/*
 * I/O device destructor callback.  Called as KVM is completely shutting
 * down, so there is no need to worry about locking: just release the
 * ioeventfd as quickly as possible.
 */
static void ioeventfd_destructor(struct kvm_io_device *this)
{
        ioeventfd_release(to_ioeventfd(this));
}

/* I/O device callbacks of an ioeventfd: a write handler and a destructor. */
static const struct kvm_io_device_ops ioeventfd_ops = {
        .write = ioeventfd_write,
        .destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
        struct _ioeventfd *_p;

        list_for_each_entry(_p, &kvm->ioeventfds, list)
                if (_p->bus_idx == p->bus_idx &&
                    _p->addr == p->addr &&
                    (!_p->length || !p->length ||
                     (_p->length == p->length &&
                      (_p->wildcard || p->wildcard ||
                       _p->datamatch == p->datamatch))))
                        return true;

        return false;
}

/*
 * Select the bus for an ioeventfd from its flags: PIO takes precedence over
 * virtio-ccw notify, and MMIO is the default when neither flag is set.
 */
static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
        enum kvm_bus bus = KVM_MMIO_BUS;

        if (flags & KVM_IOEVENTFD_FLAG_PIO)
                bus = KVM_PIO_BUS;
        else if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
                bus = KVM_VIRTIO_CCW_NOTIFY_BUS;

        return bus;
}

/*
 * Register one ioeventfd described by @args on bus @bus_idx.
 *
 * Takes a reference on the eventfd args->fd, builds the _ioeventfd and,
 * under kvm->slots_lock, checks for a colliding registration, registers
 * the I/O device and links the entry on kvm->ioeventfds.
 *
 * Returns 0 on success, the error from resolving args->fd, -ENOMEM on
 * allocation failure, -EEXIST on a collision, or the negative error from
 * kvm_io_bus_register_dev().  On failure the eventfd reference is dropped
 * and the entry freed.
 */
static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
                                enum kvm_bus bus_idx,
                                struct kvm_ioeventfd *args)
{

        struct eventfd_ctx *eventfd;
        struct _ioeventfd *p;
        int ret;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
        if (!p) {
                ret = -ENOMEM;
                goto fail;
        }

        INIT_LIST_HEAD(&p->list);
        p->addr    = args->addr;
        p->bus_idx = bus_idx;
        p->length  = args->len;
        p->eventfd = eventfd;

        /* The datamatch feature is optional, otherwise this is a wildcard */
        if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
                p->datamatch = args->datamatch;
        else
                p->wildcard = true;

        mutex_lock(&kvm->slots_lock);

        /* Verify that there isn't a match already */
        if (ioeventfd_check_collision(kvm, p)) {
                ret = -EEXIST;
                goto unlock_fail;
        }

        kvm_iodevice_init(&p->dev, &ioeventfd_ops);

        ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
                                      &p->dev);
        if (ret < 0)
                goto unlock_fail;

        /* Registered: account it on the bus and track it on the VM list. */
        kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
        list_add_tail(&p->list, &kvm->ioeventfds);

        mutex_unlock(&kvm->slots_lock);

        return 0;

        /* Error unwinding: unlock, free the entry, drop the eventfd ref. */
unlock_fail:
        mutex_unlock(&kvm->slots_lock);
        kfree(p);

fail:
        eventfd_ctx_put(eventfd);

        return ret;
}

/*
 * Unregister the ioeventfd on bus @bus_idx that matches @args exactly:
 * same eventfd, address, length and wildcard setting, and the same
 * datamatch value unless it is a wildcard.  Only the first match is
 * removed.
 *
 * Returns 0 if an entry was unregistered, -ENOENT if none matched, or the
 * error from resolving args->fd.
 */
static int kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
                                      struct kvm_ioeventfd *args)
{
        struct eventfd_ctx *eventfd;
        struct _ioeventfd *p;
        bool wildcard;
        int ret = -ENOENT;

        eventfd = eventfd_ctx_fdget(args->fd);
        if (IS_ERR(eventfd))
                return PTR_ERR(eventfd);

        wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry(p, &kvm->ioeventfds, list) {
                struct kvm_io_bus *bus;

                if (p->bus_idx != bus_idx || p->eventfd != eventfd)
                        continue;
                if (p->addr != args->addr || p->length != args->len)
                        continue;
                if (p->wildcard != wildcard)
                        continue;
                if (!p->wildcard && p->datamatch != args->datamatch)
                        continue;

                /*
                 * NOTE(review): p is not touched after this call;
                 * presumably unregistering may run the device destructor,
                 * which releases p -- confirm.
                 */
                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
                bus = kvm_get_bus(kvm, bus_idx);
                if (bus)
                        bus->ioeventfd_count--;
                ret = 0;
                break;
        }

        mutex_unlock(&kvm->slots_lock);

        eventfd_ctx_put(eventfd);

        return ret;
}

/*
 * Remove an ioeventfd from the bus selected by args->flags.  A zero-length
 * MMIO entry is also removed from the fast MMIO bus; the result of that
 * second removal is ignored.
 *
 * Returns the result of the removal from the primary bus.
 */
static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
        int ret;

        ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

        if (bus_idx == KVM_MMIO_BUS && !args->len)
                kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

        return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        enum kvm_bus              bus_idx;
        int ret;

        bus_idx = ioeventfd_bus_from_flags(args->flags);
        /* must be natural-word sized, or 0 to ignore length */
        switch (args->len) {
        case 0:
        case 1:
        case 2:
        case 4:
        case 8:
                break;
        default:
                return -EINVAL;
        }

        /* check for range overflow */
        if (args->addr + args->len < args->addr)
                return -EINVAL;

        /* check for extra flags that we don't understand */
        if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
                return -EINVAL;

        /* ioeventfd with no length can't be combined with DATAMATCH */
        if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
                return -EINVAL;

        ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
        if (ret)
                goto fail;

        /* When length is ignored, MMIO is also put on a separate bus, for
         * faster lookups.
         */
        if (!args->len && bus_idx == KVM_MMIO_BUS) {
                ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
                if (ret < 0)
                        goto fast_fail;
        }

        return 0;

fast_fail:
        kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
        return ret;
}

/*
 * Entry point for an ioeventfd request: deassign when
 * KVM_IOEVENTFD_FLAG_DEASSIGN is set in args->flags, assign otherwise.
 */
int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
        return (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN) ?
                kvm_deassign_ioeventfd(kvm, args) :
                kvm_assign_ioeventfd(kvm, args);
}

/*
 * Initialize the per-VM eventfd state: the irqfd lock, item list, resampler
 * list and resampler mutex (only with CONFIG_HAVE_KVM_IRQCHIP) and the
 * ioeventfd list.
 */
void kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        spin_lock_init(&kvm->irqfds.lock);
        INIT_LIST_HEAD(&kvm->irqfds.items);
        INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
        mutex_init(&kvm->irqfds.resampler_lock);
#endif
        INIT_LIST_HEAD(&kvm->ioeventfds);
}





































































































































































































































































































































    4 




  419 




  418 















































  266 
    6 





































  267 
  266 





























































































































































































































































































































  255 





  253 


  255 

  255 



  255 
















































































































































































 1253 


































    1 









    1 


    1 






    1 


    1 
    1 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   33 




   32 



   32 












  537 






  537 


    2 



  538 







































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel timekeeping code and accessor functions. Based on code from
 *  timer.c, moved in commit 8524070b7982.
 */
#include <linux/timekeeper_internal.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/clock.h>
#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/tick.h>
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
#include <linux/compiler.h>
#include <linux/audit.h>
#include <linux/random.h>

#include "tick-internal.h"
#include "ntp_internal.h"
#include "timekeeping_internal.h"

/*
 * Action flags for timekeeping_update_from_shadow():
 *
 * TK_CLEAR_NTP:	Zero the accumulated NTP error and call ntp_clear()
 * TK_CLOCK_WAS_SET:	Report the update to the pvclock notifier chain as a
 *			clock set and increment clock_was_set_seq
 */
#define TK_CLEAR_NTP                (1 << 0)
#define TK_CLOCK_WAS_SET        (1 << 1)

/* Both actions combined */
#define TK_UPDATE_ALL                (TK_CLEAR_NTP | TK_CLOCK_WAS_SET)

/*
 * Reason for which the timekeeper is advanced.
 * NOTE(review): the consumer of this enum is not part of this section of
 * the file; presumably it is the advance/accumulation path further down.
 */
enum timekeeping_adv_mode {
        /* Update timekeeper when a tick has passed */
        TK_ADV_TICK,

        /* Update timekeeper on a direct frequency change */
        TK_ADV_FREQ
};

/*
 * The most important data for readout fits into a single 64 byte
 * cache line.
 *
 * @seq:		Sequence counter which is write-held while @timekeeper
 *			is being updated
 * @timekeeper:		The timekeeper which the readout paths use
 * @shadow_timekeeper:	Working copy. It is restored from @timekeeper by
 *			timekeeping_restore_shadow() and copied into
 *			@timekeeper by timekeeping_update_from_shadow()
 * @lock:		Lock held by the update side, see
 *			timekeeper_lock_irqsave()
 */
struct tk_data {
        seqcount_raw_spinlock_t        seq;
        struct timekeeper        timekeeper;
        struct timekeeper        shadow_timekeeper;
        raw_spinlock_t                lock;
} ____cacheline_aligned;

/* The one and only instance of the core timekeeping data in this file */
static struct tk_data tk_core;

/*
 * Flag indicating that timekeeping is suspended. While it is non-zero,
 * dummy_clock_read() hands out the cycle value saved at suspend time.
 */
int __read_mostly timekeeping_suspended;

/**
 * struct tk_fast - NMI safe timekeeper
 * @seq:        Sequence counter for protecting updates. The lowest bit
 *                is the index for the tk_read_base array
 * @base:        tk_read_base array. Access is indexed by the lowest bit of
 *                @seq.
 *
 * The two @base entries are written one after the other by
 * update_fast_timekeeper() so that a reader always finds one entry which
 * is not being modified. See update_fast_timekeeper() below.
 */
struct tk_fast {
        seqcount_latch_t        seq;
        struct tk_read_base        base[2];
};

/* Cycle value captured at suspend time for the halted fast timekeeper. */
static u64 cycles_at_suspend;

/*
 * read() callback of the dummy clocksource: while timekeeping is suspended
 * it returns the cycle value frozen at suspend time, otherwise it returns
 * local_clock(). The clocksource argument is not used.
 */
static u64 dummy_clock_read(struct clocksource *cs)
{
        return timekeeping_suspended ? cycles_at_suspend : local_clock();
}

/* Placeholder clocksource whose only populated member is the read callback */
static struct clocksource dummy_clock = {
        .read = dummy_clock_read,
};

/*
 * Boot time initialization which allows local_clock() to be utilized
 * during early boot when clocksources are not available. local_clock()
 * returns nanoseconds already so no conversion is required, hence mult=1
 * and shift=0. When the first proper clocksource is installed then
 * the fast timekeepers are updated with the correct values.
 */
#define FAST_TK_INIT                                                \
        {                                                        \
                .clock                = &dummy_clock,                        \
                .mask                = CLOCKSOURCE_MASK(64),                \
                .mult                = 1,                                \
                .shift                = 0,                                \
        }

/*
 * Fast timekeeper backing ktime_get_mono_fast_ns() and
 * ktime_get_real_fast_ns(). Both latch entries start out with the boot
 * time dummy clock setup.
 */
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/* Fast timekeeper backing ktime_get_raw_fast_ns(). Same initial setup. */
static struct tk_fast tk_fast_raw  ____cacheline_aligned = {
        .seq     = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
        .base[0] = FAST_TK_INIT,
        .base[1] = FAST_TK_INIT,
};

/*
 * Acquire tk_core.lock with interrupts disabled.
 *
 * Returns the saved interrupt state, which has to be handed back to
 * timekeeper_unlock_irqrestore().
 */
unsigned long timekeeper_lock_irqsave(void)
{
        unsigned long irqflags;

        raw_spin_lock_irqsave(&tk_core.lock, irqflags);

        return irqflags;
}

/*
 * Release tk_core.lock and restore the interrupt state in @flags, which is
 * the value returned by the matching timekeeper_lock_irqsave() call.
 */
void timekeeper_unlock_irqrestore(unsigned long flags)
{
        raw_spin_unlock_irqrestore(&tk_core.lock, flags);
}

/*
 * Multigrain timestamps require tracking the latest fine-grained timestamp
 * that has been issued, and never returning a coarse-grained timestamp that is
 * earlier than that value.
 *
 * mg_floor represents the latest fine-grained time that has been handed out as
 * a file timestamp on the system. This is tracked as a monotonic ktime_t, and
 * converted to a realtime clock value on an as-needed basis.
 *
 * Maintaining mg_floor ensures the multigrain interfaces never issue a
 * timestamp earlier than one that has been previously issued.
 *
 * The one exception to this rule is a backward jump of the realtime clock.
 * After such a jump, a timestamp can appear to be earlier than one which was
 * issued before the jump.
 */
static __cacheline_aligned_in_smp atomic64_t mg_floor;

static inline void tk_normalize_xtime(struct timekeeper *tk)
{
        while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
                tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
                tk->xtime_sec++;
        }
        while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
                tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
                tk->raw_sec++;
        }
}

static inline struct timespec64 tk_xtime(const struct timekeeper *tk)
{
        struct timespec64 ts;

        ts.tv_sec = tk->xtime_sec;
        ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        return ts;
}

static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec = ts->tv_sec;
        tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
}

static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
        tk->xtime_sec += ts->tv_sec;
        tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
        tk_normalize_xtime(tk);
}

static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
{
        struct timespec64 tmp;

        /*
         * Verify consistency of: offset_real = -wall_to_monotonic
         * before modifying anything
         */
        set_normalized_timespec64(&tmp, -tk->wall_to_monotonic.tv_sec,
                                        -tk->wall_to_monotonic.tv_nsec);
        WARN_ON_ONCE(tk->offs_real != timespec64_to_ktime(tmp));
        tk->wall_to_monotonic = wtm;
        set_normalized_timespec64(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
        /* Paired with READ_ONCE() in ktime_mono_to_any() */
        WRITE_ONCE(tk->offs_real, timespec64_to_ktime(tmp));
        WRITE_ONCE(tk->offs_tai, ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)));
}

/*
 * Account @delta of sleep time: advance offs_boot by @delta and refresh the
 * timespec64 copy of it in monotonic_to_boot.
 */
static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
        ktime_t boot = ktime_add(tk->offs_boot, delta);

        /* Paired with READ_ONCE() in ktime_mono_to_any() */
        WRITE_ONCE(tk->offs_boot, boot);

        /*
         * Keep a timespec representation around so the VDSO update does
         * not need a 64bit division every time.
         */
        tk->monotonic_to_boot = ktime_to_timespec64(boot);
}

/*
 * tk_clock_read - atomic clocksource read() helper
 *
 * The read paths have to use this helper. The seqcount makes sure that no
 * bad value is returned while the structures are updated, but it does not
 * protect against crashes: the clocksource of the tkr may change between
 * fetching the read function and fetching the clocksource pointer which is
 * handed to it. Calling a read function with the wrong clocksource can
 * crash, so the pointer is loaded exactly once and used for both.
 *
 * This is not required when holding the tk_core.lock or when reading the
 * fast-timekeeper tkrs (which are protected by their own locking and
 * update logic).
 */
static inline u64 tk_clock_read(const struct tk_read_base *tkr)
{
        struct clocksource *cs = READ_ONCE(tkr->clock);

        return cs->read(cs);
}

/**
 * tk_setup_internals - Set up internals to use clocksource clock.
 *
 * @tk:                The target timekeeper to setup.
 * @clock:                Pointer to clocksource.
 *
 * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
 * pair and interval request.
 *
 * Installs @clock in both readout bases (tkr_mono and tkr_raw), takes a
 * fresh cycle_last reading from it, rescales the xtime_nsec accumulators
 * when the shift differs from the previous clocksource and resets the NTP
 * error state.
 *
 * Unless you're the timekeeping code, you should not be using this!
 */
static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
{
        u64 interval;
        u64 tmp, ntpinterval;
        struct clocksource *old_clock;

        ++tk->cs_was_changed_seq;
        old_clock = tk->tkr_mono.clock;
        tk->tkr_mono.clock = clock;
        tk->tkr_mono.mask = clock->mask;
        /* Reads the new clock, which was installed in tkr_mono just above */
        tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);

        /* The raw base shares clock, mask and the cycle_last reading */
        tk->tkr_raw.clock = clock;
        tk->tkr_raw.mask = clock->mask;
        tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;

        /* Do the ns -> cycle conversion first, using original mult */
        tmp = NTP_INTERVAL_LENGTH;
        tmp <<= clock->shift;
        ntpinterval = tmp;
        /* Adding mult/2 before the division rounds to the nearest cycle */
        tmp += clock->mult/2;
        do_div(tmp, clock->mult);
        /* The interval must be at least one cycle */
        if (tmp == 0)
                tmp = 1;

        interval = (u64) tmp;
        tk->cycle_interval = interval;

        /* Go back from cycles -> shifted ns */
        tk->xtime_interval = interval * clock->mult;
        /* Difference between the requested and the representable interval */
        tk->xtime_remainder = ntpinterval - tk->xtime_interval;
        tk->raw_interval = interval * clock->mult;

        /* If changing clocks, convert xtime_nsec shift units */
        if (old_clock) {
                int shift_change = clock->shift - old_clock->shift;
                if (shift_change < 0) {
                        tk->tkr_mono.xtime_nsec >>= -shift_change;
                        tk->tkr_raw.xtime_nsec >>= -shift_change;
                } else {
                        tk->tkr_mono.xtime_nsec <<= shift_change;
                        tk->tkr_raw.xtime_nsec <<= shift_change;
                }
        }

        tk->tkr_mono.shift = clock->shift;
        tk->tkr_raw.shift = clock->shift;

        tk->ntp_error = 0;
        tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
        tk->ntp_tick = ntpinterval << tk->ntp_error_shift;

        /*
         * The timekeeper keeps its own mult values for the currently
         * active clocksource. These values will be adjusted via NTP
         * to counteract clock drifting.
         */
        tk->tkr_mono.mult = clock->mult;
        tk->tkr_raw.mult = clock->mult;
        tk->ntp_err_mult = 0;
        tk->skip_second_overflow = 0;
}

/* Timekeeper helper functions. */

/*
 * Out of line slow path of timekeeping_cycles_to_ns(), used when @delta
 * exceeds the clocksource's max_cycles. Combines @delta, tkr->mult,
 * tkr->xtime_nsec and tkr->shift in mul_u64_u32_add_u64_shr().
 */
static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta)
{
        return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift);
}

/*
 * Convert a clocksource reading into nanoseconds relative to the readout
 * base @tkr: the masked cycle delta since cycle_last is scaled with
 * mult/shift and the xtime_nsec accumulator is folded in.
 */
static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles)
{
        const u64 mask = tkr->mask;
        /* Cycles elapsed since the last update_wall_time() */
        const u64 delta = (cycles - tkr->cycle_last) & mask;

        /* Common case: the plain multiplication with tkr->mult is safe */
        if (likely(delta <= tkr->clock->max_cycles))
                return ((delta * tkr->mult) + tkr->xtime_nsec) >> tkr->shift;

        /*
         * A delta above max_cycles is either negative motion or a delta
         * which overflows the multiplication with tkr->mult.
         *
         * Handle clocksource inconsistency between CPUs to prevent time
         * from going backwards by checking for the MSB of the mask being
         * set in the delta.
         */
        if (delta & ~(mask >> 1))
                return tkr->xtime_nsec >> tkr->shift;

        return delta_to_ns_safe(tkr, delta);
}

/*
 * Read the clocksource of @tkr and convert the reading into nanoseconds
 * relative to that readout base.
 */
static __always_inline u64 timekeeping_get_ns(const struct tk_read_base *tkr)
{
        u64 cycles = tk_clock_read(tkr);

        return timekeeping_cycles_to_ns(tkr, cycles);
}

/**
 * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
 * @tkr: Timekeeping readout base from which we take the update
 * @tkf: Pointer to NMI safe timekeeper
 *
 * We want to use this from any context including NMI and tracing /
 * instrumenting the timekeeping code itself.
 *
 * Employ the latch technique; see @write_seqcount_latch.
 *
 * So if a NMI hits the update of base[0] then it will use base[1]
 * which is still consistent. In the worst case this can result in a
 * slightly wrong timestamp (a few nanoseconds). See
 * @ktime_get_mono_fast_ns.
 */
static void update_fast_timekeeper(const struct tk_read_base *tkr,
                                   struct tk_fast *tkf)
{
        struct tk_read_base *base = tkf->base;

        /* Force readers off to base[1] */
        write_seqcount_latch_begin(&tkf->seq);

        /* Update base[0] */
        memcpy(base, tkr, sizeof(*base));

        /* Force readers back to base[0] */
        write_seqcount_latch(&tkf->seq);

        /* Update base[1] from the freshly written base[0] */
        memcpy(base + 1, base, sizeof(*base));

        write_seqcount_latch_end(&tkf->seq);
}

/*
 * Common readout for the fast timekeepers: pick the latch entry selected by
 * the lowest bit of the sequence count, add the nanoseconds since the last
 * update to its base and retry if the latch changed in between.
 */
static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
{
        struct tk_read_base *rb;
        unsigned int latch;
        u64 ns;

        do {
                latch = read_seqcount_latch(&tkf->seq);
                rb = &tkf->base[latch & 0x01];
                ns = ktime_to_ns(rb->base);
                ns += timekeeping_get_ns(rb);
        } while (read_seqcount_latch_retry(&tkf->seq, latch));

        return ns;
}

/**
 * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
 *
 * This timestamp is not guaranteed to be monotonic across an update.
 * The timestamp is calculated by:
 *
 *        now = base_mono + clock_delta * slope
 *
 * So if the update lowers the slope, readers who are forced to the
 * not yet updated second array are still using the old steeper slope.
 *
 * tmono
 * ^
 * |    o  n
 * |   o n
 * |  u
 * | o
 * |o
 * |12345678---> reader order
 *
 * o = old slope
 * u = update
 * n = new slope
 *
 * So reader 6 will observe time going backwards versus reader 5.
 *
 * While other CPUs are likely to be able to observe that, the only way
 * for a CPU local observation is when an NMI hits in the middle of
 * the update. Timestamps taken from that NMI context might be ahead
 * of the following timestamps. Callers need to be aware of that and
 * deal with it.
 *
 * Return: Nanoseconds read from the tk_fast_mono fast timekeeper.
 */
u64 notrace ktime_get_mono_fast_ns(void)
{
        return __ktime_get_fast_ns(&tk_fast_mono);
}
EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);

/**
 * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw
 *
 * Contrary to ktime_get_mono_fast_ns() this is always correct because the
 * conversion factor is not affected by NTP/PTP correction.
 *
 * Return: Nanoseconds read from the tk_fast_raw fast timekeeper.
 */
u64 notrace ktime_get_raw_fast_ns(void)
{
        return __ktime_get_fast_ns(&tk_fast_raw);
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);

/**
 * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
 *
 * To keep it NMI safe since we're accessing from tracing, we're not using a
 * separate timekeeper with updates to monotonic clock and boot offset
 * protected with seqcounts. This has the following minor side effects:
 *
 * (1) It's possible that a timestamp is taken after the boot offset is
 * updated but before the timekeeper is updated. If this happens, the new
 * boot offset is added to the old timekeeping making the clock appear to
 * update slightly earlier:
 *    CPU 0                                        CPU 1
 *    timekeeping_inject_sleeptime64()
 *    __timekeeping_inject_sleeptime(tk, delta);
 *                                                 timestamp();
 *    timekeeping_update_staged(tkd, TK_CLEAR_NTP...);
 *
 * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
 * partially updated.  Since the tk->offs_boot update is a rare event, this
 * should be a rare occurrence which postprocessing should be able to handle.
 *
 * The caveats vs. timestamp ordering as documented for ktime_get_mono_fast_ns()
 * apply as well.
 */
u64 notrace ktime_get_boot_fast_ns(void)
{
        /* The boot offset is deliberately read without synchronization */
        return ktime_get_mono_fast_ns() +
               ktime_to_ns(data_race(tk_core.timekeeper.offs_boot));
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);

/**
 * ktime_get_tai_fast_ns - NMI safe and fast access to tai clock.
 *
 * The same limitations as described for ktime_get_boot_fast_ns() apply. The
 * mono time and the TAI offset are not read atomically which may yield wrong
 * readouts. However, an update of the TAI offset is a rare event e.g., caused
 * by settime or adjtimex with an offset. The user of this function has to deal
 * with the possibility of wrong timestamps in post processing.
 */
u64 notrace ktime_get_tai_fast_ns(void)
{
        /* The TAI offset is deliberately read without synchronization */
        return ktime_get_mono_fast_ns() +
               ktime_to_ns(data_race(tk_core.timekeeper.offs_tai));
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);

/**
 * ktime_get_real_fast_ns - NMI safe and fast access to clock realtime.
 *
 * Reads the realtime base (base_real) of the monotonic fast timekeeper and
 * adds the nanoseconds since its last update.
 *
 * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
 */
u64 ktime_get_real_fast_ns(void)
{
        struct tk_fast *fast = &tk_fast_mono;
        struct tk_read_base *rb;
        unsigned int latch;
        u64 real_base, ns;

        do {
                latch = raw_read_seqcount_latch(&fast->seq);
                rb = &fast->base[latch & 0x01];
                real_base = ktime_to_ns(rb->base_real);
                ns = timekeeping_get_ns(rb);
        } while (raw_read_seqcount_latch_retry(&fast->seq, latch));

        return real_base + ns;
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);

/**
 * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
 * @tk: Timekeeper to snapshot.
 *
 * It generally is unsafe to access the clocksource after timekeeping has been
 * suspended, so take a snapshot of the readout base of @tk and use it as the
 * fast timekeeper's readout base while suspended.  It will return the same
 * number of cycles every time until timekeeping is resumed at which time the
 * proper readout base for the fast timekeeper will be restored automatically.
 */
static void halt_fast_timekeeper(const struct timekeeper *tk)
{
        static struct tk_read_base tkr_dummy;
        const struct tk_read_base *mono = &tk->tkr_mono;
        const struct tk_read_base *raw = &tk->tkr_raw;

        /*
         * Monotonic: snapshot the readout base, remember the current cycle
         * count for dummy_clock_read() and switch over to the dummy clock.
         */
        memcpy(&tkr_dummy, mono, sizeof(tkr_dummy));
        cycles_at_suspend = tk_clock_read(mono);
        tkr_dummy.clock = &dummy_clock;
        tkr_dummy.base_real = mono->base + tk->offs_real;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);

        /* Raw: same snapshot, the saved cycle count is shared */
        memcpy(&tkr_dummy, raw, sizeof(tkr_dummy));
        tkr_dummy.clock = &dummy_clock;
        update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}

/* Notifier chain of the pvclock timedata update listeners */
static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);

/*
 * Call the pvclock notifier chain. @was_set is handed to the listeners as
 * the notifier action value and @tk as the notifier data pointer.
 */
static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
{
        raw_notifier_call_chain(&pvclock_gtod_chain, was_set, tk);
}

/**
 * pvclock_gtod_register_notifier - register a pvclock timedata update listener
 * @nb: Pointer to the notifier block to register
 *
 * The chain is invoked once right after the registration, with the core
 * timekeeper and was_set == true, while tk_core.lock is still held.
 *
 * Return: The result of raw_notifier_chain_register().
 */
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
        int ret;

        guard(raw_spinlock_irqsave)(&tk_core.lock);

        ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
        update_pvclock_gtod(&tk_core.timekeeper, true);

        return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);

/**
 * pvclock_gtod_unregister_notifier - unregister a pvclock
 * timedata update listener
 * @nb: Pointer to the notifier block to unregister
 *
 * Return: The result of raw_notifier_chain_unregister().
 */
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
        /* The lock is dropped automatically when the function returns */
        guard(raw_spinlock_irqsave)(&tk_core.lock);
        return raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);

/*
 * tk_update_leap_state - helper to update the next_leap_ktime
 */
static inline void tk_update_leap_state(struct timekeeper *tk)
{
        tk->next_leap_ktime = ntp_get_next_leap();
        if (tk->next_leap_ktime != KTIME_MAX)
                /* Convert to monotonic time */
                tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
}

/*
 * Leap state update for both shadow and the real timekeeper
 * Separate to spare a full memcpy() of the timekeeper.
 */
static void tk_update_leap_state_all(struct tk_data *tkd)
{
        write_seqcount_begin(&tkd->seq);
        tk_update_leap_state(&tkd->shadow_timekeeper);
        tkd->timekeeper.next_leap_ktime = tkd->shadow_timekeeper.next_leap_ktime;
        write_seqcount_end(&tkd->seq);
}

/*
 * Update the ktime_t based scalar nsec members of the timekeeper
 */
static inline void tk_update_ktime_data(struct timekeeper *tk)
{
        u64 seconds;
        u32 nsec;

        /*
         * The xtime based monotonic readout is:
         *        nsec = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec + now();
         * The ktime based monotonic readout is:
         *        nsec = base_mono + now();
         * ==> base_mono = (xtime_sec + wtm_sec) * 1e9 + wtm_nsec
         */
        seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
        nsec = (u32) tk->wall_to_monotonic.tv_nsec;
        tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);

        /*
         * The sum of the nanoseconds portions of xtime and
         * wall_to_monotonic can be greater/equal one second. Take
         * this into account before updating tk->ktime_sec.
         */
        nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
        if (nsec >= NSEC_PER_SEC)
                seconds++;
        tk->ktime_sec = seconds;

        /* Update the monotonic raw base */
        tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}

/*
 * Restore the shadow timekeeper from the real timekeeper.
 *
 * Overwrites tkd->shadow_timekeeper with a full copy of tkd->timekeeper.
 * The caller must hold tkd->lock.
 */
static void timekeeping_restore_shadow(struct tk_data *tkd)
{
        lockdep_assert_held(&tkd->lock);
        memcpy(&tkd->shadow_timekeeper, &tkd->timekeeper, sizeof(tkd->timekeeper));
}

/*
 * Finalize the shadow timekeeper of @tkd and publish it as the real one.
 *
 * @tkd:        timekeeper data whose shadow copy has been modified; its lock
 *                must be held by the caller
 * @action:        TK_* flags; TK_CLEAR_NTP and TK_CLOCK_WAS_SET are evaluated here
 */
static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int action)
{
        /*
         * Use the shadow copy which belongs to @tkd, i.e. to the same object
         * whose lock, sequence counter and real timekeeper are used below,
         * rather than hard-wiring tk_core.
         */
        struct timekeeper *tk = &tkd->shadow_timekeeper;

        lockdep_assert_held(&tkd->lock);

        /*
         * Block out readers before running the updates below because that
         * updates VDSO and other time related infrastructure. Not blocking
         * the readers might let a reader see time going backwards when
         * reading from the VDSO after the VDSO update and then reading in
         * the kernel from the timekeeper before that got updated.
         */
        write_seqcount_begin(&tkd->seq);

        if (action & TK_CLEAR_NTP) {
                tk->ntp_error = 0;
                ntp_clear();
        }

        tk_update_leap_state(tk);
        tk_update_ktime_data(tk);

        update_vsyscall(tk);
        update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);

        /* base_real depends on the monotonic base computed above */
        tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real;
        update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
        update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);

        if (action & TK_CLOCK_WAS_SET)
                tk->clock_was_set_seq++;

        /*
         * Update the real timekeeper.
         *
         * We could avoid this memcpy() by switching pointers, but that has
         * the downside that the reader side no longer benefits from the
         * cacheline optimized data layout of the timekeeper and requires
         * another indirection.
         */
        memcpy(&tkd->timekeeper, tk, sizeof(*tk));
        write_seqcount_end(&tkd->seq);
}

/**
 * timekeeping_forward_now - update clock to the current time
 * @tk:                Pointer to the timekeeper to update
 *
 * Forward the current clock to update its state since the last call to
 * update_wall_time(). This is useful before significant clock changes,
 * as it avoids having to deal with this time offset explicitly.
 */
static void timekeeping_forward_now(struct timekeeper *tk)
{
        u64 cycle_now, delta;

        cycle_now = tk_clock_read(&tk->tkr_mono);
        delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
                                  tk->tkr_mono.clock->max_raw_delta);
        tk->tkr_mono.cycle_last = cycle_now;
        tk->tkr_raw.cycle_last  = cycle_now;

        while (delta > 0) {
                u64 max = tk->tkr_mono.clock->max_cycles;
                u64 incr = delta < max ? delta : max;

                tk->tkr_mono.xtime_nsec += incr * tk->tkr_mono.mult;
                tk->tkr_raw.xtime_nsec += incr * tk->tkr_raw.mult;
                tk_normalize_xtime(tk);
                delta -= incr;
        }
}

/**
 * ktime_get_real_ts64 - Returns the time of day in a timespec64.
 * @ts:                pointer to the timespec to be set
 *
 * Returns the time of day in a timespec64 (WARN if suspended).
 */
void ktime_get_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        u64 delta_ns;

        WARN_ON(timekeeping_suspended);

        /* Sample seconds and nanoseconds from one consistent timekeeper state */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = tk->xtime_sec;
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        /* Start from zero nanoseconds and let the helper add delta_ns */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, delta_ns);
}
EXPORT_SYMBOL(ktime_get_real_ts64);

/*
 * ktime_get - read the monotonic clock as ktime_t
 *
 * Returns tkr_mono.base plus the nanoseconds delivered by
 * timekeeping_get_ns(), both sampled in one sequence counter read section.
 */
ktime_t ktime_get(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        ktime_t mono_base;
        u64 delta_ns;

        WARN_ON(timekeeping_suspended);

        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                mono_base = tk->tkr_mono.base;
                delta_ns = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return ktime_add_ns(mono_base, delta_ns);
}
EXPORT_SYMBOL_GPL(ktime_get);

u32 ktime_get_resolution_ns(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        u32 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        return nsecs;
}
EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);

/*
 * Lookup table indexed by enum tk_offsets. Each slot points at the matching
 * offs_* member of the real timekeeper (tk_core.timekeeper).
 */
static ktime_t *offsets[TK_OFFS_MAX] = {
        [TK_OFFS_REAL]        = &tk_core.timekeeper.offs_real,
        [TK_OFFS_BOOT]        = &tk_core.timekeeper.offs_boot,
        [TK_OFFS_TAI]        = &tk_core.timekeeper.offs_tai,
};

ktime_t ktime_get_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base, *offset = offsets[offs];
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                nsecs = timekeeping_get_ns(&tk->tkr_mono);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);

}
EXPORT_SYMBOL_GPL(ktime_get_with_offset);

ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base, *offset = offsets[offs];
        u64 nsecs;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = ktime_add(tk->tkr_mono.base, *offset);
                nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset);

/**
 * ktime_mono_to_any() - convert monotonic time to any other time
 * @tmono:        time to convert.
 * @offs:        which offset to use
 */
ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
{
        ktime_t *offs_ptr = offsets[offs];
        unsigned int seqnr;
        ktime_t converted;

        /*
         * With CONFIG_64BIT the offset is fetched with one READ_ONCE(),
         * paired with the WRITE_ONCE()s in tk_set_wall_to_mono() and
         * tk_update_sleep_time().
         */
        if (IS_ENABLED(CONFIG_64BIT))
                return ktime_add(tmono, READ_ONCE(*offs_ptr));

        /* Otherwise the offset is read under the sequence counter */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                converted = ktime_add(tmono, *offs_ptr);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return converted;
}
EXPORT_SYMBOL_GPL(ktime_mono_to_any);

/**
 * ktime_get_raw - Returns the raw monotonic time in ktime_t format
 */
ktime_t ktime_get_raw(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base;
        u64 nsecs;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                base = tk->tkr_raw.base;
                nsecs = timekeeping_get_ns(&tk->tkr_raw);

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return ktime_add_ns(base, nsecs);
}
EXPORT_SYMBOL_GPL(ktime_get_raw);

/**
 * ktime_get_ts64 - get the monotonic clock in timespec64 format
 * @ts:                pointer to timespec variable
 *
 * The function calculates the monotonic clock from the realtime
 * clock and the wall_to_monotonic offset and stores the result
 * in normalized timespec64 format in the variable pointed to by @ts.
 */
void ktime_get_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 tomono;
        unsigned int seq;
        u64 nsec;

        WARN_ON(timekeeping_suspended);

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->xtime_sec;
                nsec = timekeeping_get_ns(&tk->tkr_mono);
                tomono = tk->wall_to_monotonic;

        } while (read_seqcount_retry(&tk_core.seq, seq));

        ts->tv_sec += tomono.tv_sec;
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);

/**
 * ktime_get_seconds - Get the seconds portion of CLOCK_MONOTONIC
 *
 * Returns the seconds portion of CLOCK_MONOTONIC with a single non
 * serialized read. tk->ktime_sec is of type 'unsigned long' so this
 * works on both 32 and 64 bit systems. On 32 bit systems the readout
 * covers ~136 years of uptime which should be enough to prevent
 * premature wrap arounds.
 */
time64_t ktime_get_seconds(void)
{
        WARN_ON(timekeeping_suspended);

        /* Plain load of ktime_sec, not wrapped in a sequence counter section */
        return tk_core.timekeeper.ktime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_seconds);

/**
 * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME
 *
 * Returns the wall clock seconds since 1970.
 *
 * For 64bit systems the fast access to tk->xtime_sec is preserved. On
 * 32bit systems the access must be protected with the sequence
 * counter to provide "atomic" access to the 64bit tk->xtime_sec
 * value.
 */
time64_t ktime_get_real_seconds(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        time64_t secs;

        if (!IS_ENABLED(CONFIG_64BIT)) {
                /* Without CONFIG_64BIT, read xtime_sec under the sequence counter */
                do {
                        seqnr = read_seqcount_begin(&tk_core.seq);
                        secs = tk->xtime_sec;

                } while (read_seqcount_retry(&tk_core.seq, seqnr));

                return secs;
        }

        /* CONFIG_64BIT: direct access */
        return tk->xtime_sec;
}
EXPORT_SYMBOL_GPL(ktime_get_real_seconds);

/**
 * __ktime_get_real_seconds - The same as ktime_get_real_seconds()
 * but without the sequence counter protection. This internal function
 * is only called when the timekeeping lock is already held.
 */
noinstr time64_t __ktime_get_real_seconds(void)
{
        /* Unprotected read of the real timekeeper's wall clock seconds */
        return tk_core.timekeeper.xtime_sec;
}

/**
 * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
 * @systime_snapshot:        pointer to struct receiving the system time snapshot
 */
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        ktime_t real_base, boot_base, raw_base;
        u64 mono_ns, raw_ns;
        unsigned int seqnr;
        u64 cycles;

        WARN_ON_ONCE(timekeeping_suspended);

        /*
         * Read the counter once and convert that single value with both
         * read bases inside one sequence counter read section.
         */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                cycles = tk_clock_read(&tk->tkr_mono);
                systime_snapshot->cs_id = tk->tkr_mono.clock->id;
                systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
                systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
                real_base = ktime_add(tk->tkr_mono.base, tk->offs_real);
                boot_base = ktime_add(tk->tkr_mono.base, tk->offs_boot);
                raw_base = tk->tkr_raw.base;
                mono_ns = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                raw_ns = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        /* Realtime and boottime share the monotonic nanoseconds delta */
        systime_snapshot->cycles = cycles;
        systime_snapshot->real = ktime_add_ns(real_base, mono_ns);
        systime_snapshot->boot = ktime_add_ns(boot_base, mono_ns);
        systime_snapshot->raw = ktime_add_ns(raw_base, raw_ns);
}
EXPORT_SYMBOL_GPL(ktime_get_snapshot);

/* Scale base by mult/div checking for overflow */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
        u64 quot, rem;
        int free_bits;

        quot = div64_u64_rem(*base, div, &rem);

        /* Bits of a u64 which are not occupied by @mult */
        free_bits = (int)sizeof(u64) * 8 - fls64(mult);

        /* Neither quot * mult nor rem * mult may exceed 64 bits */
        if (free_bits < fls64(quot) || free_bits < fls64(rem))
                return -EOVERFLOW;

        /* *base * mult / div == quot * mult + rem * mult / div */
        *base = quot * mult + div64_u64(rem * mult, div);
        return 0;
}

/**
 * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
 * @history:                        Snapshot representing start of history
 * @partial_history_cycles:        Cycle offset into history (fractional part)
 * @total_history_cycles:        Total history length in cycles
 * @discontinuity:                True indicates clock was set on history period
 * @ts:                                Cross timestamp that should be adjusted using
 *        partial/total ratio
 *
 * Helper function used by get_device_system_crosststamp() to correct the
 * crosstimestamp corresponding to the start of the current interval to the
 * system counter value (timestamp point) provided by the driver. The
 * total_history_* quantities are the total history starting at the provided
 * reference point and ending at the start of the current interval. The cycle
 * count between the driver timestamp point and the start of the current
 * interval is partial_history_cycles.
 */
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
                                         u64 partial_history_cycles,
                                         u64 total_history_cycles,
                                         bool discontinuity,
                                         struct system_device_crosststamp *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 raw_adj, real_adj;
        bool from_start;
        int err;

        /* Without history or without a partial offset nothing is adjusted */
        if (!total_history_cycles || !partial_history_cycles)
                return 0;

        /*
         * Interpolate over the shorter distance: from the start of the
         * history when the partial offset exceeds half of the total,
         * otherwise backwards from its end.
         */
        from_start = partial_history_cycles > total_history_cycles / 2;
        if (from_start)
                partial_history_cycles = total_history_cycles - partial_history_cycles;

        /*
         * Monotonic raw correction: the raw delta over the whole history
         * scaled by partial_history_cycles / total_history_cycles
         */
        raw_adj = (u64)ktime_to_ns(ktime_sub(ts->sys_monoraw, history->raw));
        err = scale64_check_overflow(partial_history_cycles,
                                     total_history_cycles, &raw_adj);
        if (err)
                return err;

        if (!discontinuity) {
                /* Same scaling applied to the realtime delta */
                real_adj = (u64)ktime_to_ns(ktime_sub(ts->sys_realtime,
                                                      history->real));
                err = scale64_check_overflow(partial_history_cycles,
                                             total_history_cycles, &real_adj);
                if (err)
                        return err;
        } else {
                /*
                 * Discontinuity in the history: derive the realtime
                 * correction from the raw one via mult(real) / mult(raw)
                 */
                real_adj = mul_u64_u32_div(raw_adj, tk->tkr_mono.mult,
                                           tk->tkr_raw.mult);
        }

        /* Apply the corrections to the monotonic raw and realtime values */
        if (from_start) {
                ts->sys_monoraw = ktime_add_ns(history->raw, raw_adj);
                ts->sys_realtime = ktime_add_ns(history->real, real_adj);
        } else {
                ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, raw_adj);
                ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, real_adj);
        }

        return 0;
}

/*
 * timestamp_in_interval - true if ts is chronologically in [start, end]
 *
 * True if ts occurs chronologically at or after start, and before or at end.
 */
static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
{
        /* Ordinary interval: start does not lie after end */
        if (start <= end)
                return ts >= start && ts <= end;

        /* start > end: ts matches at or after start, or at or before end */
        return ts >= start || ts <= end;
}

/*
 * Scale *val by numerator / denominator in place.
 *
 * Returns false, leaving *val untouched, when either factor is zero.
 */
static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
{
        u64 quot, rem;

        if (numerator == 0 || denominator == 0)
                return false;

        /* Quotient and remainder are scaled separately and then summed */
        quot = div64_u64_rem(*val, denominator, &rem);
        *val = quot * numerator + div_u64(rem * numerator, denominator);
        return true;
}

/*
 * Convert scv->cycles from the base clock of the current clocksource to
 * clocksource cycles. Returns false if scv->cs_id matches neither the
 * clocksource nor its base clock, or if convert_clock() fails.
 */
static bool convert_base_to_cs(struct system_counterval_t *scv)
{
        struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
        struct clocksource_base *base;
        u32 num, den;

        /* Already in units of the timekeeper clocksource: nothing to do */
        if (scv->cs_id == cs->id)
                return true;

        /*
         * READ_ONCE() keeps the compiler from re-evaluating @base as the
         * clocksource might change concurrently.
         */
        base = READ_ONCE(cs->base);
        if (!base || scv->cs_id != base->id)
                return false;

        if (scv->use_nsecs) {
                num = cs->freq_khz;
                den = USEC_PER_SEC;
        } else {
                num = base->numerator;
                den = base->denominator;
        }

        if (!convert_clock(&scv->cycles, num, den))
                return false;

        scv->cycles += base->offset;
        return true;
}

/*
 * Convert clocksource cycles to cycles of the base clock @base_id: subtract
 * the base offset, then scale by denominator / numerator.
 */
static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
{
        struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
        struct clocksource_base *base;

        /*
         * READ_ONCE() keeps the compiler from re-evaluating @base as the
         * clocksource might change concurrently.
         */
        base = READ_ONCE(cs->base);
        if (!base || base_id != base->id)
                return false;

        *cycles -= base->offset;
        return convert_clock(cycles, base->denominator, base->numerator);
}

/*
 * Convert a nanosecond delta into clocksource cycles using shift, xtime_nsec
 * and mult of the monotonic read base. Returns false when the size check on
 * the shifted value fails.
 */
static bool convert_ns_to_cs(u64 *delta)
{
        struct tk_read_base *mono = &tk_core.timekeeper.tkr_mono;
        u64 shifted_ns;

        if (BITS_TO_BYTES(fls64(*delta) + mono->shift) >= sizeof(*delta))
                return false;

        shifted_ns = *delta << mono->shift;
        *delta = div_u64(shifted_ns - mono->xtime_nsec, mono->mult);
        return true;
}

/**
 * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
 * @treal:        CLOCK_REALTIME timestamp to convert
 * @base_id:        base clocksource id
 * @cycles:        pointer to store the converted base clock timestamp
 *
 * Converts a supplied, future realtime clock value to the corresponding base clock value.
 *
 * Return:  true if the conversion is successful, false otherwise.
 */
bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
{
        struct tk_read_base *mono = &tk_core.timekeeper.tkr_mono;
        unsigned int seqnr;
        u64 ahead;

        do {
                seqnr = read_seqcount_begin(&tk_core.seq);

                /* Timestamps before base_real are rejected */
                if ((u64)treal < mono->base_real)
                        return false;

                /* Nanoseconds past base_real, converted to clocksource cycles */
                ahead = (u64)treal - mono->base_real;
                if (!convert_ns_to_cs(&ahead))
                        return false;

                *cycles = mono->cycle_last + ahead;
                if (!convert_cs_to_base(cycles, base_id))
                        return false;
        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        return true;
}
EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);

/**
 * get_device_system_crosststamp - Synchronously capture system/device timestamp
 * @get_time_fn:        Callback to get simultaneous device time and
 *        system counter from the device driver
 * @ctx:                Context passed to get_time_fn()
 * @history_begin:        Historical reference point used to interpolate system
 *        time when counter provided by the driver is before the current interval
 * @xtstamp:                Receives simultaneously captured system and device time
 *
 * Reads a timestamp from a device and correlates it to system time
 */
int get_device_system_crosststamp(int (*get_time_fn)
                                  (ktime_t *device_time,
                                   struct system_counterval_t *sys_counterval,
                                   void *ctx),
                                  void *ctx,
                                  struct system_time_snapshot *history_begin,
                                  struct system_device_crosststamp *xtstamp)
{
        struct system_counterval_t system_counterval;
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 cycles, now, interval_start;
        unsigned int clock_was_set_seq = 0;
        ktime_t base_real, base_raw;
        u64 nsec_real, nsec_raw;
        /* Only assigned and only read on the interpolation path (do_interp) */
        u8 cs_was_changed_seq;
        unsigned int seq;
        bool do_interp;
        int ret;

        /*
         * The driver callback and all timekeeper reads happen inside the
         * sequence counter read section, so the callback is invoked again
         * whenever the section has to be retried.
         */
        do {
                seq = read_seqcount_begin(&tk_core.seq);
                /*
                 * Try to synchronously capture device time and a system
                 * counter value calling back into the device driver
                 */
                ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
                /* A non-zero callback result is passed straight to the caller */
                if (ret)
                        return ret;

                /*
                 * Verify that the clocksource ID associated with the captured
                 * system counter value is the same as for the currently
                 * installed timekeeper clocksource
                 */
                if (system_counterval.cs_id == CSID_GENERIC ||
                    !convert_base_to_cs(&system_counterval))
                        return -ENODEV;
                /* convert_base_to_cs() may have rewritten .cycles in place */
                cycles = system_counterval.cycles;

                /*
                 * Check whether the system counter value provided by the
                 * device driver is on the current timekeeping interval.
                 */
                now = tk_clock_read(&tk->tkr_mono);
                interval_start = tk->tkr_mono.cycle_last;
                if (!timestamp_in_interval(interval_start, now, cycles)) {
                        /*
                         * Outside [cycle_last, now]: compute the time at the
                         * interval start instead and interpolate afterwards.
                         */
                        clock_was_set_seq = tk->clock_was_set_seq;
                        cs_was_changed_seq = tk->cs_was_changed_seq;
                        cycles = interval_start;
                        do_interp = true;
                } else {
                        do_interp = false;
                }

                base_real = ktime_add(tk->tkr_mono.base,
                                      tk_core.timekeeper.offs_real);
                base_raw = tk->tkr_raw.base;

                nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, cycles);
                nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, cycles);
        } while (read_seqcount_retry(&tk_core.seq, seq));

        xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
        xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);

        /*
         * Interpolate if necessary, adjusting back from the start of the
         * current interval
         */
        if (do_interp) {
                u64 partial_history_cycles, total_history_cycles;
                bool discontinuity;

                /*
                 * Check that the counter value is not before the provided
                 * history reference and that the history doesn't cross a
                 * clocksource change
                 */
                if (!history_begin ||
                    !timestamp_in_interval(history_begin->cycles,
                                           cycles, system_counterval.cycles) ||
                    history_begin->cs_was_changed_seq != cs_was_changed_seq)
                        return -EINVAL;
                /* Here @cycles is the start of the current interval */
                partial_history_cycles = cycles - system_counterval.cycles;
                total_history_cycles = cycles - history_begin->cycles;
                /* A changed clock_was_set_seq marks a discontinuity in the history */
                discontinuity =
                        history_begin->clock_was_set_seq != clock_was_set_seq;

                ret = adjust_historical_crosststamp(history_begin,
                                                    partial_history_cycles,
                                                    total_history_cycles,
                                                    discontinuity, xtstamp);
                if (ret)
                        return ret;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);

/**
 * timekeeping_clocksource_has_base - Check whether the current clocksource
 *                                      is based on a given base clock
 * @id:                base clocksource ID
 *
 * Note:        The return value is a snapshot which can become invalid right
 *                after the function returns.
 *
 * Return:        true if the timekeeper clocksource has a base clock with @id,
 *                false otherwise
 */
bool timekeeping_clocksource_has_base(enum clocksource_ids id)
{
        struct clocksource_base *base;

        /*
         * The result is only a snapshot, so the sequence count is not used.
         * READ_ONCE() merely keeps the compiler from re-evaluating @base as
         * the clocksource might change concurrently.
         */
        base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);

        return base && base->id == id;
}
EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);

/**
 * do_settimeofday64 - Sets the time of day.
 * @ts:     pointer to the timespec64 variable containing the new time
 *
 * Sets the time of day to the new time and update NTP and notify hrtimers
 */
int do_settimeofday64(const struct timespec64 *ts)
{
        struct timespec64 ts_delta, xt;

        if (!timespec64_valid_settod(ts))
                return -EINVAL;

        scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *tks = &tk_core.shadow_timekeeper;

                timekeeping_forward_now(tks);

                xt = tk_xtime(tks);
                ts_delta = timespec64_sub(*ts, xt);

                if (timespec64_compare(&tks->wall_to_monotonic, &ts_delta) > 0) {
                        timekeeping_restore_shadow(&tk_core);
                        return -EINVAL;
                }

                tk_set_wall_to_mono(tks, timespec64_sub(tks->wall_to_monotonic, ts_delta));
                tk_set_xtime(tks, ts);
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);

        audit_tk_injoffset(ts_delta);
        add_device_randomness(ts, sizeof(*ts));
        return 0;
}
EXPORT_SYMBOL(do_settimeofday64);

/**
 * timekeeping_inject_offset - Adds or subtracts from the current time.
 * @ts:                Pointer to the timespec variable containing the offset
 *
 * Adds or subtracts an offset value from the current time.
 */
static int timekeeping_inject_offset(const struct timespec64 *ts)
{
        /* The nanoseconds part must lie within [0, NSEC_PER_SEC) */
        if (ts->tv_nsec < 0 || ts->tv_nsec >= NSEC_PER_SEC)
                return -EINVAL;

        scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *shadow = &tk_core.shadow_timekeeper;
                struct timespec64 proposed;
                bool invalid;

                timekeeping_forward_now(shadow);

                /* Validate the resulting wall time before applying anything */
                proposed = timespec64_add(tk_xtime(shadow), *ts);
                invalid = timespec64_compare(&shadow->wall_to_monotonic, ts) > 0 ||
                          !timespec64_valid_settod(&proposed);
                if (invalid) {
                        /* Undo the forwarding done on the shadow copy */
                        timekeeping_restore_shadow(&tk_core);
                        return -EINVAL;
                }

                tk_xtime_add(shadow, ts);
                tk_set_wall_to_mono(shadow,
                                    timespec64_sub(shadow->wall_to_monotonic, *ts));
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL);
        return 0;
}

/*
 * Indicates if there is an offset between the system clock and the hardware
 * clock/persistent clock/rtc.
 */
/* Set to 1 by timekeeping_warp_clock() when sys_tz.tz_minuteswest is non-zero */
int persistent_clock_is_local;

/*
 * Adjust the time obtained from the CMOS to be UTC time instead of
 * local time.
 *
 * This is ugly, but preferable to the alternatives.  Otherwise we
 * would either need to write a program to do it in /etc/rc (and risk
 * confusion if the program gets run more than once; it would also be
 * hard to make the program warp the clock precisely n hours)  or
 * compile in the timezone information into the kernel.  Bad, bad....
 *
 *                                                - TYT, 1992-01-01
 *
 * The best thing to do is to keep the CMOS clock in universal time (UTC)
 * as real UNIX machines always do it. This avoids all headaches about
 * daylight saving times and warping kernel clocks.
 */
void timekeeping_warp_clock(void)
{
        struct timespec64 adjust;

        /* No timezone offset configured: nothing to warp */
        if (sys_tz.tz_minuteswest == 0)
                return;

        persistent_clock_is_local = 1;

        /* Minutes west converted to seconds */
        adjust.tv_sec = sys_tz.tz_minuteswest * 60;
        adjust.tv_nsec = 0;

        /* The result of the injection is not evaluated here */
        timekeeping_inject_offset(&adjust);
}

/*
 * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic
 */
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
        /* @tai_offset is a number of seconds, passed to ktime_set() as such */
        ktime_t tai_delta = ktime_set(tai_offset, 0);

        tk->tai_offset = tai_offset;
        tk->offs_tai = ktime_add(tk->offs_real, tai_delta);
}

/*
 * change_clocksource - Swaps clocksources if a new one is available
 *
 * Accumulates current time interval and initializes new clocksource
 */
static int change_clocksource(void *data)
{
        struct clocksource *incoming = data;
        struct clocksource *outgoing = NULL;

        /*
         * Pin the module providing the new clocksource. This succeeds for
         * built-in code (owner == NULL) as well. Without the reference the
         * switch is abandoned.
         */
        if (!try_module_get(incoming->owner))
                return 0;

        /* A clocksource which fails to enable is not installed */
        if (incoming->enable && incoming->enable(incoming) != 0) {
                module_put(incoming->owner);
                return 0;
        }

        scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *shadow = &tk_core.shadow_timekeeper;

                /* Account the elapsed interval with the old clocksource first */
                timekeeping_forward_now(shadow);
                outgoing = shadow->tkr_mono.clock;
                tk_setup_internals(shadow, incoming);
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        /* Disable the replaced clocksource and drop its module reference */
        if (outgoing) {
                if (outgoing->disable)
                        outgoing->disable(outgoing);
                module_put(outgoing->owner);
        }

        return 0;
}

/**
 * timekeeping_notify - Install a new clock source
 * @clock:                pointer to the clock source
 *
 * This function is called from clocksource.c after a new, better clock
 * source has been registered. The caller holds the clocksource_mutex.
 */
int timekeeping_notify(struct clocksource *clock)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        if (tk->tkr_mono.clock != clock) {
                stop_machine(change_clocksource, clock, NULL);
                tick_clock_notify();

                /* change_clocksource() may have bailed out without switching */
                if (tk->tkr_mono.clock != clock)
                        return -1;
        }

        return 0;
}

/**
 * ktime_get_raw_ts64 - Returns the raw monotonic time in a timespec
 * @ts:                pointer to the timespec64 to be set
 *
 * Returns the raw monotonic time (completely un-modified by ntp)
 */
void ktime_get_raw_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seqnr;
        u64 delta_ns;

        /* Raw seconds and raw delta must stem from the same update */
        do {
                seqnr = read_seqcount_begin(&tk_core.seq);
                ts->tv_sec = tk->raw_sec;
                delta_ns = timekeeping_get_ns(&tk->tkr_raw);

        } while (read_seqcount_retry(&tk_core.seq, seqnr));

        /* Start from zero nanoseconds and let the helper add delta_ns */
        ts->tv_nsec = 0;
        timespec64_add_ns(ts, delta_ns);
}
EXPORT_SYMBOL(ktime_get_raw_ts64);


/**
 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
 *
 * Return: non-zero when the current clocksource has
 * CLOCK_SOURCE_VALID_FOR_HRES set in its flags, 0 otherwise.
 */
int timekeeping_valid_for_hres(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int start;
        int hres_ok;

        /* Sample the flag until read_seqcount_retry() asks for no retry */
        for (;;) {
                start = read_seqcount_begin(&tk_core.seq);

                hres_ok = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;

                if (!read_seqcount_retry(&tk_core.seq, start))
                        return hres_ok;
        }
}

/**
 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
 *
 * Return: the max_idle_ns value of the current clocksource.
 */
u64 timekeeping_max_deferment(void)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int start;
        u64 max_idle;

        /* Sample the value until read_seqcount_retry() asks for no retry */
        for (;;) {
                start = read_seqcount_begin(&tk_core.seq);

                max_idle = tk->tkr_mono.clock->max_idle_ns;

                if (!read_seqcount_retry(&tk_core.seq, start))
                        return max_idle;
        }
}

/**
 * read_persistent_clock64 -  Return time from the persistent clock.
 * @ts: Pointer to the storage for the readout value
 *
 * Weak dummy function for arches that do not yet support it.
 * An architecture override reads the time from the battery backed
 * persistent clock; this default reports tv_sec=0 and tv_nsec=0, which
 * means "unsupported".
 *
 *  XXX - Do be sure to remove it once all arches implement it.
 */
void __weak read_persistent_clock64(struct timespec64 *ts)
{
        /* All-zero timespec: no persistent clock available */
        *ts = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
}

/**
 * read_persistent_wall_and_boot_offset - Read persistent clock, and also offset
 *                                        from the boot.
 * @wall_time:          current time as returned by persistent clock
 * @boot_offset:  offset that is defined as wall_time - boot_time
 *
 * Weak dummy function for arches that do not yet support it.
 *
 * The default implementation derives the offset from the current value of
 * local_clock(). Architectures that support sched_clock() but have no
 * dedicated boot time clock thereby still provide the best estimate of
 * the boot time.
 */
void __weak __init
read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
                                     struct timespec64 *boot_offset)
{
        struct timespec64 since_boot;

        /* Wall time comes straight from the persistent clock */
        read_persistent_clock64(wall_time);

        /* Boot offset is the local_clock() reading, as a timespec64 */
        since_boot = ns_to_timespec64(local_clock());
        *boot_offset = since_boot;
}

/*
 * tkd_basic_setup - Initialize the locking primitives of a tk_data instance
 * @tkd:        the timekeeper data to set up
 *
 * Initializes @tkd->lock first and then the sequence counter @tkd->seq,
 * which is initialized with a reference to that lock.
 */
static __init void tkd_basic_setup(struct tk_data *tkd)
{
        raw_spin_lock_init(&tkd->lock);
        seqcount_raw_spinlock_init(&tkd->seq, &tkd->lock);
}

/*
 * Flag reflecting whether sleep time still has to be injected after a
 * suspend.
 *
 * The flag starts off false and is only set to true when a suspend reaches
 * timekeeping_suspend(). timekeeping_resume() sets it back to false when
 * it was able to determine the sleep time itself and has injected it. If
 * timekeeping_resume() could not do that, the flag stays true and is used
 * by the RTC resume code to decide whether sleep time must be injected;
 * timekeeping_inject_sleeptime64() then sets the flag to false.
 *
 * If a suspend fails before reaching timekeeping_suspend() then the flag
 * stays false and prevents erroneous sleep time injection.
 */
static bool suspend_timing_needed;

/*
 * Flag for if there is a persistent clock on this platform. Set when
 * read_persistent_clock64() / read_persistent_wall_and_boot_offset()
 * returned a usable non-zero value.
 */
static bool persistent_clock_exists;

/*
 * timekeeping_init - Initializes the clocksource and common timekeeping values
 *
 * Sets up the tk_core locking primitives, reads the persistent wall time
 * and boot offset, and then, with tk_core.lock held, initializes NTP,
 * installs the default clocksource into the shadow timekeeper, sets the
 * wall time and wall_to_monotonic offset and publishes the result via
 * timekeeping_update_from_shadow().
 */
void __init timekeeping_init(void)
{
        struct timespec64 wall_time, boot_offset, wall_to_mono;
        struct timekeeper *tks = &tk_core.shadow_timekeeper;
        struct clocksource *clock;

        tkd_basic_setup(&tk_core);

        read_persistent_wall_and_boot_offset(&wall_time, &boot_offset);
        if (timespec64_valid_settod(&wall_time) &&
            timespec64_to_ns(&wall_time) > 0) {
                /* A valid, non-zero readout means a persistent clock exists */
                persistent_clock_exists = true;
        } else if (timespec64_to_ns(&wall_time) != 0) {
                /*
                 * Non-zero but not usable: warn and fall back to zero.
                 * The message is newline terminated so that it is emitted
                 * as a complete log line.
                 */
                pr_warn("Persistent clock returned invalid value\n");
                wall_time = (struct timespec64){0};
        }

        /* A boot offset later than the wall time cannot be used; drop it */
        if (timespec64_compare(&wall_time, &boot_offset) < 0)
                boot_offset = (struct timespec64){0};

        /*
         * We want set wall_to_mono, so the following is true:
         * wall time + wall_to_mono = boot time
         */
        wall_to_mono = timespec64_sub(boot_offset, wall_time);

        /* Scope-based guard: the lock is held for the rest of the function */
        guard(raw_spinlock_irqsave)(&tk_core.lock);

        ntp_init();

        clock = clocksource_default_clock();
        /*
         * NOTE(review): the enable() return value is ignored here, unlike
         * in change_clocksource() which aborts the switch on failure.
         */
        if (clock->enable)
                clock->enable(clock);
        tk_setup_internals(tks, clock);

        tk_set_xtime(tks, &wall_time);
        tks->raw_sec = 0;

        tk_set_wall_to_mono(tks, wall_to_mono);

        /* Publish the initialized shadow timekeeper */
        timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
}

/*
 * Persistent clock readout taken when suspend began. Written by
 * timekeeping_suspend() and compared against the persistent clock
 * readout in timekeeping_resume().
 */
static struct timespec64 timekeeping_suspend_time;

/**
 * __timekeeping_inject_sleeptime - Internal function to add sleep interval
 * @tk:                Pointer to the timekeeper to be updated
 * @delta:        Pointer to the delta value in timespec64 format
 *
 * Takes a timespec offset measuring a suspend interval and properly
 * adds the sleep offset to the timekeeping variables.
 */
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
                                           const struct timespec64 *delta)
{
        if (!timespec64_valid_strict(delta)) {
                printk_deferred(KERN_WARNING
                                "__timekeeping_inject_sleeptime: Invalid "
                                "sleep delta value!\n");
                return;
        }
        tk_xtime_add(tk, delta);
        tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
        tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
        tk_debug_account_sleep_time(delta);
}

#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE)
/*
 * We have three kinds of time sources to use for sleep time
 * injection, the preference order is:
 * 1) non-stop clocksource
 * 2) persistent clock (ie: RTC accessible when irqs are off)
 * 3) RTC
 *
 * 1) and 2) are used by timekeeping, 3) by the RTC subsystem.
 * If the system has neither 1) nor 2), 3) will be used as the last resort.
 *
 * If timekeeping has injected sleeptime via either 1) or 2),
 * 3) is not needed, so in this case we don't need to call
 * rtc_resume(), and this is what timekeeping_rtc_skipresume()
 * means.
 *
 * Returns true when no sleep time injection is pending, i.e. when
 * suspend_timing_needed is false.
 */
bool timekeeping_rtc_skipresume(void)
{
        return !suspend_timing_needed;
}

/*
 * Whether 1) can be used is only known in timekeeping_resume(), which
 * is invoked after rtc_suspend(), so rtc_suspend() cannot be skipped
 * just because the system has 1).
 *
 * But if the system has 2), 2) will definitely be used, so in this
 * case we don't need to call rtc_suspend(), and this is what
 * timekeeping_rtc_skipsuspend() means.
 *
 * Returns the current value of persistent_clock_exists.
 */
bool timekeeping_rtc_skipsuspend(void)
{
        return persistent_clock_exists;
}

/**
 * timekeeping_inject_sleeptime64 - Adds suspend interval to timekeeping values
 * @delta: pointer to a timespec64 delta value
 *
 * This hook is for architectures that cannot support read_persistent_clock64
 * because their RTC/persistent clock is only accessible when irqs are enabled,
 * and that also don't have an effective nonstop clocksource.
 *
 * This function should only be called by rtc_resume(), and allows
 * a suspend offset to be injected into the timekeeping values.
 */
void timekeeping_inject_sleeptime64(const struct timespec64 *delta)
{
        /* The timekeeper update happens entirely under tk_core.lock */
        scoped_guard(raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *tks = &tk_core.shadow_timekeeper;

                /* Sleep time is being injected now; nothing is pending anymore */
                suspend_timing_needed = false;
                timekeeping_forward_now(tks);
                __timekeeping_inject_sleeptime(tks, delta);
                timekeeping_update_from_shadow(&tk_core, TK_UPDATE_ALL);
        }

        /* Signal hrtimers about time change */
        clock_was_set(CLOCK_SET_WALL | CLOCK_SET_BOOT);
}
#endif

/**
 * timekeeping_resume - Resumes the generic timekeeping subsystem.
 *
 * Reads the persistent clock, resumes clockevents and clocksources and
 * then, under tk_core.lock, injects the time spent in suspend into the
 * shadow timekeeper (if one of the sources handled here reports any),
 * re-bases the cycle_last values on the current clocksource readout,
 * clears timekeeping_suspended and publishes the shadow timekeeper.
 * Afterwards the softlockup watchdog is touched and the tick and timerfd
 * resume hooks are invoked.
 */
void timekeeping_resume(void)
{
        struct timekeeper *tks = &tk_core.shadow_timekeeper;
        struct clocksource *clock = tks->tkr_mono.clock;
        struct timespec64 ts_new, ts_delta;
        bool inject_sleeptime = false;
        u64 cycle_now, nsec;
        unsigned long flags;

        /* Persistent clock readout, compared to timekeeping_suspend_time below */
        read_persistent_clock64(&ts_new);

        clockevents_resume();
        clocksource_resume();

        raw_spin_lock_irqsave(&tk_core.lock, flags);

        /*
         * After system resumes, we need to calculate the suspended time and
         * compensate it for the OS time. There are 3 sources that could be
         * used: Nonstop clocksource during suspend, persistent clock and rtc
         * device.
         *
         * One specific platform may have 1 or 2 or all of them, and the
         * preference will be:
         *        suspend-nonstop clocksource -> persistent clock -> rtc
         * The less preferred source will only be tried if there is no better
         * usable source. The rtc part is handled separately in rtc core code.
         */
        cycle_now = tk_clock_read(&tks->tkr_mono);
        nsec = clocksource_stop_suspend_timing(clock, cycle_now);
        if (nsec > 0) {
                /* Suspend timing reported a non-zero interval: use it */
                ts_delta = ns_to_timespec64(nsec);
                inject_sleeptime = true;
        } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) {
                /* Otherwise use the persistent clock, if it moved forward */
                ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time);
                inject_sleeptime = true;
        }

        if (inject_sleeptime) {
                /* Injected here, so the RTC resume code must not inject again */
                suspend_timing_needed = false;
                __timekeeping_inject_sleeptime(tks, &ts_delta);
        }

        /* Re-base the last cycle value */
        tks->tkr_mono.cycle_last = cycle_now;
        tks->tkr_raw.cycle_last  = cycle_now;

        tks->ntp_error = 0;
        timekeeping_suspended = 0;
        timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
        raw_spin_unlock_irqrestore(&tk_core.lock, flags);

        touch_softlockup_watchdog();

        /* Resume the clockevent device(s) and hrtimers */
        tick_resume();
        /* Notify timerfd as resume is equivalent to clock_was_set() */
        timerfd_resume();
}

/**
 * timekeeping_suspend - Suspends the generic timekeeping subsystem.
 *
 * Records the persistent clock readout in timekeeping_suspend_time, marks
 * sleep time injection as pending and then, under tk_core.lock, forwards
 * the shadow timekeeper to now, sets timekeeping_suspended, starts
 * clocksource suspend timing, compensates the recorded suspend time for
 * drift when a persistent clock exists, publishes the shadow timekeeper
 * and halts the fast timekeeper. Afterwards the tick, clocksource and
 * clockevents suspend hooks are invoked.
 *
 * Return: always 0.
 */
int timekeeping_suspend(void)
{
        struct timekeeper *tks = &tk_core.shadow_timekeeper;
        struct timespec64 delta, delta_delta;
        /* Difference between system time and persistent clock at the last suspend */
        static struct timespec64 old_delta;
        struct clocksource *curr_clock;
        unsigned long flags;
        u64 cycle_now;

        read_persistent_clock64(&timekeeping_suspend_time);

        /*
         * On some systems the persistent_clock can not be detected at
         * timekeeping_init by its return value, so if we see a valid
         * value returned, update the persistent_clock_exists flag.
         */
        if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
                persistent_clock_exists = true;

        /* Cleared again by whichever resume path injects the sleep time */
        suspend_timing_needed = true;

        raw_spin_lock_irqsave(&tk_core.lock, flags);
        timekeeping_forward_now(tks);
        timekeeping_suspended = 1;

        /*
         * Since we've called forward_now, cycle_last stores the value
         * just read from the current clocksource. Save this to potentially
         * use in suspend timing.
         */
        curr_clock = tks->tkr_mono.clock;
        cycle_now = tks->tkr_mono.cycle_last;
        clocksource_start_suspend_timing(curr_clock, cycle_now);

        if (persistent_clock_exists) {
                /*
                 * To avoid drift caused by repeated suspend/resumes,
                 * which each can add ~1 second drift error,
                 * try to compensate so the difference in system time
                 * and persistent_clock time stays close to constant.
                 */
                delta = timespec64_sub(tk_xtime(tks), timekeeping_suspend_time);
                delta_delta = timespec64_sub(delta, old_delta);
                if (abs(delta_delta.tv_sec) >= 2) {
                        /*
                         * if delta_delta is too large, assume time correction
                         * has occurred and set old_delta to the current delta.
                         */
                        old_delta = delta;
                } else {
                        /*
                         * Otherwise adjust timekeeping_suspend_time by
                         * delta_delta to compensate.
                         */
                        timekeeping_suspend_time =
                                timespec64_add(timekeeping_suspend_time, delta_delta);
                }
        }

        timekeeping_update_from_shadow(&tk_core, 0);
        halt_fast_timekeeper(tks);
        raw_spin_unlock_irqrestore(&tk_core.lock, flags);

        tick_suspend();
        clocksource_suspend();
        clockevents_suspend();

        return 0;
}

/* syscore suspend/resume callbacks for timekeeping */
static struct syscore_ops timekeeping_syscore_ops = {
        .suspend        = timekeeping_suspend,
        .resume         = timekeeping_resume,
};

/*
 * Register the timekeeping suspend/resume callbacks with the syscore
 * infrastructure. Runs as a device_initcall and always returns 0.
 */
static int __init timekeeping_init_ops(void)
{
        register_syscore_ops(&timekeeping_syscore_ops);
        return 0;
}
device_initcall(timekeeping_init_ops);

/*
 * Apply a multiplier adjustment to the timekeeper
 *
 * @tk:                timekeeper to adjust
 * @offset:        non-accumulated cycles
 * @mult_adj:        signed change to apply to tk->tkr_mono.mult
 *
 * Does nothing when @mult_adj is 0, and warns once and bails out when a
 * positive @mult_adj would overflow the multiplier.
 */
static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
                                                         s64 offset,
                                                         s32 mult_adj)
{
        s64 interval = tk->cycle_interval;

        /* Scale interval and offset by mult_adj; +1/-1 need no multiply */
        switch (mult_adj) {
        case 0:
                return;
        case 1:
                break;
        case -1:
                interval = -interval;
                offset = -offset;
                break;
        default:
                interval *= mult_adj;
                offset *= mult_adj;
                break;
        }

        /*
         * The following can be confusing, so to keep things simple let's
         * assume mult_adj == 1 for now.
         *
         * When mult_adj != 1, remember that the interval and offset values
         * have been scaled accordingly above, so the math is the same.
         *
         * The basic idea is that we're increasing the multiplier by one,
         * which causes xtime_interval to be incremented by one
         * cycle_interval. This is because:
         *        xtime_interval = cycle_interval * mult
         * So if mult is being incremented by one:
         *        xtime_interval = cycle_interval * (mult + 1)
         * It's the same as:
         *        xtime_interval = (cycle_interval * mult) + cycle_interval
         * Which can be shortened to:
         *        xtime_interval += cycle_interval
         *
         * offset stores the non-accumulated cycles. Thus the current
         * time (in shifted nanoseconds) is:
         *        now = (offset * adj) + xtime_nsec
         * Even though we're adjusting the clock frequency, we have to
         * keep time consistent. In other words, we can't jump back in
         * time, and we also want to avoid jumping forward in time.
         *
         * So given the same offset value, we need the time to be the same
         * both before and after the freq adjustment.
         *        now = (offset * adj_1) + xtime_nsec_1
         *        now = (offset * adj_2) + xtime_nsec_2
         * So:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_2) + xtime_nsec_2
         * And we know:
         *        adj_2 = adj_1 + 1
         * So:
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * (adj_1+1)) + xtime_nsec_2
         *        (offset * adj_1) + xtime_nsec_1 =
         *                (offset * adj_1) + offset + xtime_nsec_2
         * Canceling the sides:
         *        xtime_nsec_1 = offset + xtime_nsec_2
         * Which gives us:
         *        xtime_nsec_2 = xtime_nsec_1 - offset
         * Which simplifies to:
         *        xtime_nsec -= offset
         */
        if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
                /* NTP adjustment caused clocksource mult overflow */
                WARN_ON_ONCE(1);
                return;
        }

        tk->tkr_mono.mult += mult_adj;
        tk->xtime_interval += interval;
        tk->tkr_mono.xtime_nsec -= offset;
}

/*
 * Adjust the timekeeper's multiplier to the correct frequency
 * and also to reduce the accumulated error value.
 *
 * @tk:                timekeeper to adjust
 * @offset:        non-accumulated cycles, handed on to
 *                timekeeping_apply_adjustment()
 */
static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
{
        u64 ntp_tl = ntp_tick_length();
        u32 mult;

        /*
         * Determine the multiplier from the current NTP tick length.
         * Avoid expensive division when the tick length doesn't change.
         */
        if (likely(tk->ntp_tick == ntp_tl)) {
                /* Same tick length: strip the previously applied error term */
                mult = tk->tkr_mono.mult - tk->ntp_err_mult;
        } else {
                tk->ntp_tick = ntp_tl;
                mult = div64_u64((tk->ntp_tick >> tk->ntp_error_shift) -
                                 tk->xtime_remainder, tk->cycle_interval);
        }

        /*
         * If the clock is behind the NTP time, increase the multiplier by 1
         * to catch up with it. If it's ahead and there was a remainder in the
         * tick division, the clock will slow down. Otherwise it will stay
         * ahead until the tick length changes to a non-divisible value.
         */
        tk->ntp_err_mult = tk->ntp_error > 0 ? 1 : 0;
        mult += tk->ntp_err_mult;

        /* Apply only the difference to the multiplier currently in use */
        timekeeping_apply_adjustment(tk, offset, mult - tk->tkr_mono.mult);

        /* Warn once when the multiplier drifted beyond the clocksource's maxadj */
        if (unlikely(tk->tkr_mono.clock->maxadj &&
                (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
                        > tk->tkr_mono.clock->maxadj))) {
                printk_once(KERN_WARNING
                        "Adjusting %s more than 11%% (%ld vs %ld)\n",
                        tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
                        (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
        }

        /*
         * It may be possible that when we entered this function, xtime_nsec
         * was very small.  Further, if we're slightly speeding the clocksource
         * in the code above, it's possible the required corrective factor to
         * xtime_nsec could cause it to underflow.
         *
         * Now, since we have already accumulated the second and the NTP
         * subsystem has been notified via second_overflow(), we need to skip
         * the next update.
         */
        if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
                /* Borrow one second back from xtime_sec to undo the underflow */
                tk->tkr_mono.xtime_nsec += (u64)NSEC_PER_SEC <<
                                                        tk->tkr_mono.shift;
                tk->xtime_sec--;
                tk->skip_second_overflow = 1;
        }
}

/*
 * accumulate_nsecs_to_secs - Accumulates nsecs into secs
 *
 * Helper function that accumulates the nsecs greater than a second
 * from the xtime_nsec field to the xtime_secs field.
 * It also calls into the NTP code to handle leapsecond processing.
 */
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
        u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
        unsigned int clock_set = 0;

        while (tk->tkr_mono.xtime_nsec >= nsecps) {
                int leap;

                tk->tkr_mono.xtime_nsec -= nsecps;
                tk->xtime_sec++;

                /*
                 * Skip NTP update if this second was accumulated before,
                 * i.e. xtime_nsec underflowed in timekeeping_adjust()
                 */
                if (unlikely(tk->skip_second_overflow)) {
                        tk->skip_second_overflow = 0;
                        continue;
                }

                /* Figure out if its a leap sec and apply if needed */
                leap = second_overflow(tk->xtime_sec);
                if (unlikely(leap)) {
                        struct timespec64 ts;

                        tk->xtime_sec += leap;

                        ts.tv_sec = leap;
                        ts.tv_nsec = 0;
                        tk_set_wall_to_mono(tk,
                                timespec64_sub(tk->wall_to_monotonic, ts));

                        __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);

                        clock_set = TK_CLOCK_WAS_SET;
                }
        }
        return clock_set;
}

/*
 * logarithmic_accumulation - shifted accumulation of cycles
 *
 * @tk:                timekeeper to accumulate into
 * @offset:        cycles not yet accumulated
 * @shift:        number of bits to shift cycle_interval by for this step
 * @clock_set:        flags returned by accumulate_nsecs_to_secs() are OR'ed
 *                into this
 *
 * This function accumulates a shifted interval of cycles into
 * a shifted interval of nanoseconds. Allows for O(log) accumulation
 * loop.
 *
 * Returns the unconsumed cycles.
 */
static u64 logarithmic_accumulation(struct timekeeper *tk, u64 offset,
                                    u32 shift, unsigned int *clock_set)
{
        u64 interval = tk->cycle_interval << shift;
        u64 snsec_per_sec;

        /* If the offset is smaller than a shifted interval, do nothing */
        if (offset < interval)
                return offset;

        /* Accumulate one shifted interval */
        offset -= interval;
        tk->tkr_mono.cycle_last += interval;
        tk->tkr_raw.cycle_last  += interval;

        /* Accumulate monotonic time and carry full seconds over */
        tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
        *clock_set |= accumulate_nsecs_to_secs(tk);

        /* Accumulate raw time */
        tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
        snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
        while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
                tk->tkr_raw.xtime_nsec -= snsec_per_sec;
                tk->raw_sec++;
        }

        /* Accumulate error between NTP and clock interval */
        tk->ntp_error += tk->ntp_tick << shift;
        tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
                                                (tk->ntp_error_shift + shift);

        return offset;
}

/*
 * timekeeping_advance - Updates the timekeeper to the current time and
 * current NTP tick length
 *
 * @mode:        with TK_ADV_TICK the function returns early when less than
 *                one cycle_interval has elapsed; other modes always proceed
 *
 * Works on the shadow timekeeper with tk_core.lock held and publishes the
 * result via timekeeping_update_from_shadow().
 *
 * Returns true when the accumulation reported clock_set flags, false
 * otherwise (including when timekeeping is suspended or there was nothing
 * to do).
 */
static bool timekeeping_advance(enum timekeeping_adv_mode mode)
{
        struct timekeeper *tk = &tk_core.shadow_timekeeper;
        struct timekeeper *real_tk = &tk_core.timekeeper;
        unsigned int clock_set = 0;
        int shift = 0, maxshift;
        u64 offset;

        /* Scope-based guard: the lock is held for the rest of the function */
        guard(raw_spinlock_irqsave)(&tk_core.lock);

        /* Make sure we're fully resumed: */
        if (unlikely(timekeeping_suspended))
                return false;

        /* Cycles elapsed since cycle_last */
        offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
                                   tk->tkr_mono.cycle_last, tk->tkr_mono.mask,
                                   tk->tkr_mono.clock->max_raw_delta);

        /* Check if there's really nothing to do */
        if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK)
                return false;

        /*
         * With NO_HZ we may have to accumulate many cycle_intervals
         * (think "ticks") worth of time at once. To do this efficiently,
         * we calculate the largest doubling multiple of cycle_intervals
         * that is smaller than the offset.  We then accumulate that
         * chunk in one go, and then try to consume the next smaller
         * doubled multiple.
         */
        shift = ilog2(offset) - ilog2(tk->cycle_interval);
        shift = max(0, shift);
        /* Bound shift to one less than what overflows tick_length */
        maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
        shift = min(shift, maxshift);
        while (offset >= tk->cycle_interval) {
                offset = logarithmic_accumulation(tk, offset, shift, &clock_set);
                /* Step down once the remainder no longer fills this chunk size */
                if (offset < tk->cycle_interval<<shift)
                        shift--;
        }

        /* Adjust the multiplier to correct NTP error */
        timekeeping_adjust(tk, offset);

        /*
         * Finally, make sure that after the rounding
         * xtime_nsec isn't larger than NSEC_PER_SEC
         */
        clock_set |= accumulate_nsecs_to_secs(tk);

        timekeeping_update_from_shadow(&tk_core, clock_set);

        return !!clock_set;
}

/**
 * update_wall_time - Uses the current clocksource to increment the wall time
 *
 * Advances the timekeeper in TK_ADV_TICK mode and, when that reports that
 * the clock was set, calls clock_was_set_delayed().
 */
void update_wall_time(void)
{
        /* Nothing more to do unless the advance reported a clock change */
        if (!timekeeping_advance(TK_ADV_TICK))
                return;

        clock_was_set_delayed();
}

/**
 * getboottime64 - Return the real time of system boot.
 * @ts:                pointer to the timespec64 to be set
 *
 * Stores the wall-time of boot in @ts.
 *
 * The value is computed as offs_real - offs_boot, i.e. it is based on the
 * wall_to_monotonic offset and the total suspend time. Calls to
 * settimeofday will affect the value returned (which basically means that
 * however wrong your real time clock is at boot time, you get the right
 * time here).
 */
void getboottime64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;

        *ts = ktime_to_timespec64(ktime_sub(tk->offs_real, tk->offs_boot));
}
EXPORT_SYMBOL_GPL(getboottime64);

void ktime_get_coarse_real_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                *ts = tk_xtime(tk);
        } while (read_seqcount_retry(&tk_core.seq, seq));
}
EXPORT_SYMBOL(ktime_get_coarse_real_ts64);

/**
 * ktime_get_coarse_real_ts64_mg - return latter of coarse grained time or floor
 * @ts:                timespec64 to be filled
 *
 * Fetch the global mg_floor value, convert it to realtime and compare it
 * to the current coarse-grained time. Fill @ts with whichever is
 * latest. Note that this is a filesystem-specific interface and should be
 * avoided outside of that context.
 */
void ktime_get_coarse_real_ts64_mg(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        u64 floor = atomic64_read(&mg_floor);
        ktime_t f_real, offset, coarse;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);
                *ts = tk_xtime(tk);
                offset = tk_core.timekeeper.offs_real;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        coarse = timespec64_to_ktime(*ts);
        f_real = ktime_add(floor, offset);
        if (ktime_after(f_real, coarse))
                *ts = ktime_to_timespec64(f_real);
}

/**
 * ktime_get_real_ts64_mg - attempt to update floor value and return result
 * @ts:                pointer to the timespec to be set
 *
 * Get a monotonic fine-grained time value and attempt to swap it into
 * mg_floor. If that succeeds then accept the new floor value. If it fails
 * then another task raced in during the interim time and updated the
 * floor.  Since any update to the floor must be later than the previous
 * floor, either outcome is acceptable.
 *
 * Typically this will be called after calling ktime_get_coarse_real_ts64_mg(),
 * and determining that the resulting coarse-grained timestamp did not effect
 * a change in ctime. Any more recent floor value would effect a change to
 * ctime, so there is no need to retry the atomic64_try_cmpxchg() on failure.
 *
 * @ts will be filled with the latest floor value, regardless of the outcome of
 * the cmpxchg. Note that this is a filesystem specific interface and should be
 * avoided outside of that context.
 */
void ktime_get_real_ts64_mg(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        /* Floor value the cmpxchg below expects to still find in mg_floor */
        ktime_t old = atomic64_read(&mg_floor);
        ktime_t offset, mono;
        unsigned int seq;
        u64 nsecs;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                ts->tv_sec = tk->xtime_sec;
                mono = tk->tkr_mono.base;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);
                offset = tk_core.timekeeper.offs_real;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        /* Fine-grained monotonic time: base plus the nanoseconds just read */
        mono = ktime_add_ns(mono, nsecs);

        /*
         * Attempt to update the floor with the new time value. As any
         * update must be later than the existing floor, and would effect
         * a change to ctime from the perspective of the current task,
         * accept the resulting floor value regardless of the outcome of
         * the swap.
         */
        if (atomic64_try_cmpxchg(&mg_floor, &old, mono)) {
                /* Swap succeeded: report xtime_sec plus the nanoseconds read */
                ts->tv_nsec = 0;
                timespec64_add_ns(ts, nsecs);
                timekeeping_inc_mg_floor_swaps();
        } else {
                /*
                 * Another task changed mg_floor since "old" was fetched.
                 * "old" has been updated with the latest value of "mg_floor".
                 * That value is newer than the previous floor value, which
                 * is enough to effect a change to ctime. Accept it.
                 */
                *ts = ktime_to_timespec64(ktime_add(old, offset));
        }
}

void ktime_get_coarse_ts64(struct timespec64 *ts)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        struct timespec64 now, mono;
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                now = tk_xtime(tk);
                mono = tk->wall_to_monotonic;
        } while (read_seqcount_retry(&tk_core.seq, seq));

        set_normalized_timespec64(ts, now.tv_sec + mono.tv_sec,
                                now.tv_nsec + mono.tv_nsec);
}
EXPORT_SYMBOL(ktime_get_coarse_ts64);

/*
 * do_timer - Advance jiffies_64 by @ticks and update the global load
 * calculation via calc_global_load().
 *
 * Must hold jiffies_lock
 */
void do_timer(unsigned long ticks)
{
        jiffies_64 += ticks;
        calc_global_load();
}

/**
 * ktime_get_update_offsets_now - hrtimer helper
 * @cwsseq:        pointer to check and store the clock was set sequence number
 * @offs_real:        pointer to storage for monotonic -> realtime offset
 * @offs_boot:        pointer to storage for monotonic -> boottime offset
 * @offs_tai:        pointer to storage for monotonic -> clock tai offset
 *
 * Returns current monotonic time and updates the offsets if the
 * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
 * different.
 *
 * In addition, once the current monotonic time has reached
 * timekeeper.next_leap_ktime, @offs_real is written with the realtime
 * offset reduced by one second, independent of the sequence number check.
 *
 * Called from hrtimer_interrupt() or retrigger_next_event()
 */
ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
                                     ktime_t *offs_boot, ktime_t *offs_tai)
{
        struct timekeeper *tk = &tk_core.timekeeper;
        unsigned int seq;
        ktime_t base;
        u64 nsecs;

        do {
                seq = read_seqcount_begin(&tk_core.seq);

                /* Current monotonic time: base plus clocksource nanoseconds */
                base = tk->tkr_mono.base;
                nsecs = timekeeping_get_ns(&tk->tkr_mono);
                base = ktime_add_ns(base, nsecs);

                /* Refresh the caller's copies only when the clock was set */
                if (*cwsseq != tk->clock_was_set_seq) {
                        *cwsseq = tk->clock_was_set_seq;
                        *offs_real = tk->offs_real;
                        *offs_boot = tk->offs_boot;
                        *offs_tai = tk->offs_tai;
                }

                /* Handle leapsecond insertion adjustments */
                if (unlikely(base >= tk->next_leap_ktime))
                        *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));

        } while (read_seqcount_retry(&tk_core.seq, seq));

        return base;
}

/*
 * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex
 *
 * Returns 0 when @txc passes every check, -EPERM when a needed CAP_SYS_TIME
 * capability is missing and -EINVAL when a field is out of range.
 */
static int timekeeping_validate_timex(const struct __kernel_timex *txc)
{
        if (!(txc->modes & ADJ_ADJTIME)) {
                /* Setting any mode bit at all requires CAP_SYS_TIME. */
                if (txc->modes && !capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * A tick value more than 10% away from nominal means
                 * something is badly wrong, so refuse it.
                 */
                if (txc->modes & ADJ_TICK) {
                        if (txc->tick <  900000/USER_HZ)
                                return -EINVAL;
                        if (txc->tick > 1100000/USER_HZ)
                                return -EINVAL;
                }
        } else {
                /* singleshot must not be used with any other mode bits */
                if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
                        return -EINVAL;

                /* Without ADJ_OFFSET_READONLY the caller needs CAP_SYS_TIME. */
                if (!(txc->modes & ADJ_OFFSET_READONLY)) {
                        if (!capable(CAP_SYS_TIME))
                                return -EPERM;
                }
        }

        if (txc->modes & ADJ_SETOFFSET) {
                long subsec_limit;

                /* Injecting a time offset requires CAP_SYS_TIME. */
                if (!capable(CAP_SYS_TIME))
                        return -EPERM;

                /*
                 * The offset may be positive or negative, so tv_sec is not
                 * checked; the value is the sum of both fields. The
                 * sub-second field (tv_usec, holding nanoseconds when
                 * ADJ_NANO is set) must however be non-negative and
                 * smaller than one second.
                 */
                if (txc->time.tv_usec < 0)
                        return -EINVAL;

                subsec_limit = (txc->modes & ADJ_NANO) ? NSEC_PER_SEC
                                                       : USEC_PER_SEC;
                if (txc->time.tv_usec >= subsec_limit)
                        return -EINVAL;
        }

        /*
         * Reject frequencies whose later multiplication by PPM_SCALE could
         * overflow; this can only happen on 64-bit systems.
         */
        if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) {
                if (txc->freq < LLONG_MIN / PPM_SCALE ||
                    txc->freq > LLONG_MAX / PPM_SCALE)
                        return -EINVAL;
        }

        return 0;
}

/**
 * random_get_entropy_fallback - Returns the raw clock source value,
 * used by random.c for platforms with no valid random_get_entropy().
 *
 * Yields 0 while timekeeping is suspended or when no clocksource pointer
 * is set on the monotonic read base.
 */
unsigned long random_get_entropy_fallback(void)
{
        struct tk_read_base *mono = &tk_core.timekeeper.tkr_mono;
        struct clocksource *cs = READ_ONCE(mono->clock);
        unsigned long cycles = 0;

        if (!unlikely(timekeeping_suspended || !cs))
                cycles = cs->read(cs);

        return cycles;
}
EXPORT_SYMBOL_GPL(random_get_entropy_fallback);

/**
 * do_adjtimex() - Accessor function to NTP __do_adjtimex function
 * @txc:        Pointer to kernel_timex structure containing NTP parameters
 *
 * Return: the error from timekeeping_validate_timex() or
 * timekeeping_inject_offset() if either fails, otherwise the value
 * returned by __do_adjtimex().
 */
int do_adjtimex(struct __kernel_timex *txc)
{
        struct audit_ntp_data ad;
        bool offset_set = false;
        bool clock_set = false;
        struct timespec64 ts;
        int ret;

        /* Validate the data before disabling interrupts */
        ret = timekeeping_validate_timex(txc);
        if (ret)
                return ret;
        /* Hand the validated request to add_device_randomness() */
        add_device_randomness(txc, sizeof(*txc));

        if (txc->modes & ADJ_SETOFFSET) {
                struct timespec64 delta;

                delta.tv_sec  = txc->time.tv_sec;
                delta.tv_nsec = txc->time.tv_usec;
                /* Without ADJ_NANO the field is in microseconds: scale to ns */
                if (!(txc->modes & ADJ_NANO))
                        delta.tv_nsec *= 1000;
                ret = timekeeping_inject_offset(&delta);
                if (ret)
                        return ret;

                /* Only a non-zero whole-second part is reported further down */
                offset_set = delta.tv_sec != 0;
                audit_tk_injoffset(delta);
        }

        audit_ntp_init(&ad);

        /* Current realtime, passed to __do_adjtimex() below */
        ktime_get_real_ts64(&ts);
        add_device_randomness(&ts, sizeof(ts));

        /* tk_core.lock is held (irqsave variant) for this scope only */
        scoped_guard (raw_spinlock_irqsave, &tk_core.lock) {
                struct timekeeper *tks = &tk_core.shadow_timekeeper;
                s32 orig_tai, tai;

                orig_tai = tai = tks->tai_offset;
                ret = __do_adjtimex(txc, &ts, &tai, &ad);

                /*
                 * If __do_adjtimex() changed the TAI offset, apply it to the
                 * shadow timekeeper and publish it; otherwise only refresh
                 * the leap state.
                 */
                if (tai != orig_tai) {
                        __timekeeping_set_tai_offset(tks, tai);
                        timekeeping_update_from_shadow(&tk_core, TK_CLOCK_WAS_SET);
                        clock_set = true;
                } else {
                        tk_update_leap_state_all(&tk_core);
                }
        }

        audit_ntp_log(&ad);

        /* Update the multiplier immediately if frequency was set directly */
        if (txc->modes & (ADJ_FREQUENCY | ADJ_TICK))
                clock_set |= timekeeping_advance(TK_ADV_FREQ);

        /* Issued after the scoped lock above has been released */
        if (clock_set)
                clock_was_set(CLOCK_SET_WALL);

        ntp_notify_cmos_timer(offset_set);

        return ret;
}

#ifdef CONFIG_NTP_PPS
/**
 * hardpps() - Accessor function to NTP __hardpps function
 * @phase_ts:        Pointer to timespec64 structure representing phase timestamp
 * @raw_ts:        Pointer to timespec64 structure representing raw timestamp
 */
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
        guard(raw_spinlock_irqsave)(&tk_core.lock);
        __hardpps(phase_ts, raw_ts);
}
EXPORT_SYMBOL(hardpps);
#endif /* CONFIG_NTP_PPS */























































































































    3 





    3 


    3 




    1 









   12 














































































































   29 


    3 


   23 




    3 














































































   83 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2015, 2016 ARM Ltd.
 */
#ifndef __KVM_ARM_VGIC_NEW_H__
#define __KVM_ARM_VGIC_NEW_H__

#include <linux/irqchip/arm-gic-common.h>
#include <asm/kvm_mmu.h>

#define PRODUCT_ID_KVM                0x4b        /* ASCII code K */
#define IMPLEMENTER_ARM                0x43b

#define VGIC_ADDR_UNDEF                (-1)
#define IS_VGIC_ADDR_UNDEF(_x)  ((_x) == VGIC_ADDR_UNDEF)

#define INTERRUPT_ID_BITS_SPIS        10
#define INTERRUPT_ID_BITS_ITS        16
#define VGIC_LPI_MAX_INTID        ((1 << INTERRUPT_ID_BITS_ITS) - 1)
#define VGIC_PRI_BITS                5

#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)

#define VGIC_AFFINITY_0_SHIFT 0
#define VGIC_AFFINITY_0_MASK (0xffUL << VGIC_AFFINITY_0_SHIFT)
#define VGIC_AFFINITY_1_SHIFT 8
#define VGIC_AFFINITY_1_MASK (0xffUL << VGIC_AFFINITY_1_SHIFT)
#define VGIC_AFFINITY_2_SHIFT 16
#define VGIC_AFFINITY_2_MASK (0xffUL << VGIC_AFFINITY_2_SHIFT)
#define VGIC_AFFINITY_3_SHIFT 24
#define VGIC_AFFINITY_3_MASK (0xffUL << VGIC_AFFINITY_3_SHIFT)

/*
 * Extract affinity field @level from @reg and move it to bit position
 * MPIDR_LEVEL_SHIFT(level). @level must be a literal 0..3 because it is
 * token-pasted into the VGIC_AFFINITY_<level>_MASK/_SHIFT names.
 */
#define VGIC_AFFINITY_LEVEL(reg, level) \
        ((((reg) & VGIC_AFFINITY_## level ##_MASK) \
        >> VGIC_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))

/*
 * Userspace encodes the affinity differently from the MPIDR register.
 * This macro converts the vgic userspace format to the MPIDR register
 * format by combining all four affinity levels. @val is expanded once per
 * level, so it must not have side effects.
 */
#define VGIC_TO_MPIDR(val) (VGIC_AFFINITY_LEVEL(val, 0) | \
                            VGIC_AFFINITY_LEVEL(val, 1) | \
                            VGIC_AFFINITY_LEVEL(val, 2) | \
                            VGIC_AFFINITY_LEVEL(val, 3))

/*
 * As per Documentation/virt/kvm/devices/arm-vgic-v3.rst,
 * below macros are defined for CPUREG encoding.
 */
#define KVM_REG_ARM_VGIC_SYSREG_OP0_MASK   0x000000000000c000
#define KVM_REG_ARM_VGIC_SYSREG_OP0_SHIFT  14
#define KVM_REG_ARM_VGIC_SYSREG_OP1_MASK   0x0000000000003800
#define KVM_REG_ARM_VGIC_SYSREG_OP1_SHIFT  11
#define KVM_REG_ARM_VGIC_SYSREG_CRN_MASK   0x0000000000000780
#define KVM_REG_ARM_VGIC_SYSREG_CRN_SHIFT  7
#define KVM_REG_ARM_VGIC_SYSREG_CRM_MASK   0x0000000000000078
#define KVM_REG_ARM_VGIC_SYSREG_CRM_SHIFT  3
#define KVM_REG_ARM_VGIC_SYSREG_OP2_MASK   0x0000000000000007
#define KVM_REG_ARM_VGIC_SYSREG_OP2_SHIFT  0

#define KVM_DEV_ARM_VGIC_SYSREG_MASK (KVM_REG_ARM_VGIC_SYSREG_OP0_MASK | \
                                      KVM_REG_ARM_VGIC_SYSREG_OP1_MASK | \
                                      KVM_REG_ARM_VGIC_SYSREG_CRN_MASK | \
                                      KVM_REG_ARM_VGIC_SYSREG_CRM_MASK | \
                                      KVM_REG_ARM_VGIC_SYSREG_OP2_MASK)

/*
 * As per Documentation/virt/kvm/devices/arm-vgic-its.rst,
 * below macros are defined for ITS table entry encoding.
 */
#define KVM_ITS_CTE_VALID_SHIFT                63
#define KVM_ITS_CTE_VALID_MASK                BIT_ULL(63)
#define KVM_ITS_CTE_RDBASE_SHIFT        16
#define KVM_ITS_CTE_ICID_MASK                GENMASK_ULL(15, 0)
#define KVM_ITS_ITE_NEXT_SHIFT                48
#define KVM_ITS_ITE_PINTID_SHIFT        16
#define KVM_ITS_ITE_PINTID_MASK                GENMASK_ULL(47, 16)
#define KVM_ITS_ITE_ICID_MASK                GENMASK_ULL(15, 0)
#define KVM_ITS_DTE_VALID_SHIFT                63
#define KVM_ITS_DTE_VALID_MASK                BIT_ULL(63)
#define KVM_ITS_DTE_NEXT_SHIFT                49
#define KVM_ITS_DTE_NEXT_MASK                GENMASK_ULL(62, 49)
#define KVM_ITS_DTE_ITTADDR_SHIFT        5
#define KVM_ITS_DTE_ITTADDR_MASK        GENMASK_ULL(48, 5)
#define KVM_ITS_DTE_SIZE_MASK                GENMASK_ULL(4, 0)
#define KVM_ITS_L1E_VALID_MASK                BIT_ULL(63)
/* we only support 64 kB translation table page size */
#define KVM_ITS_L1E_ADDR_MASK                GENMASK_ULL(51, 16)

#define KVM_VGIC_V3_RDIST_INDEX_MASK        GENMASK_ULL(11, 0)
#define KVM_VGIC_V3_RDIST_FLAGS_MASK        GENMASK_ULL(15, 12)
#define KVM_VGIC_V3_RDIST_FLAGS_SHIFT        12
#define KVM_VGIC_V3_RDIST_BASE_MASK        GENMASK_ULL(51, 16)
#define KVM_VGIC_V3_RDIST_COUNT_MASK        GENMASK_ULL(63, 52)
#define KVM_VGIC_V3_RDIST_COUNT_SHIFT        52

#ifdef CONFIG_DEBUG_SPINLOCK
#define DEBUG_SPINLOCK_BUG_ON(p) BUG_ON(p)
#else
#define DEBUG_SPINLOCK_BUG_ON(p)
#endif

static inline u32 vgic_get_implementation_rev(struct kvm_vcpu *vcpu)
{
        return vcpu->kvm->arch.vgic.implementation_rev;
}

/*
 * Report whether @irq is pending: the pending latch alone decides for
 * edge-configured interrupts, otherwise the latch or the line level.
 *
 * Requires the irq_lock to be held by the caller.
 */
static inline bool irq_is_pending(struct vgic_irq *irq)
{
        bool pending = irq->pending_latch;

        if (irq->config != VGIC_CONFIG_EDGE)
                pending = pending || irq->line_level;

        return pending;
}

static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
{
        return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
}

/*
 * Count the entries @irq accounts for: for an SGI with a non-zero source
 * mask, one per set source bit plus one if active; otherwise 1 when the
 * interrupt is pending or active and 0 when it is neither.
 */
static inline int vgic_irq_get_lr_count(struct vgic_irq *irq)
{
        if (!vgic_irq_is_sgi(irq->intid) || !irq->source)
                return irq_is_pending(irq) || irq->active;

        /* Account for the active state as an interrupt */
        return hweight8(irq->source) + irq->active;
}

/* True when vgic_irq_get_lr_count() reports more than one entry for @irq. */
static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq)
{
        int lr_count = vgic_irq_get_lr_count(irq);

        return lr_count > 1;
}

/*
 * Wrapper around kvm_write_guest_lock() that keeps the distributor's
 * table_write_in_progress flag set for the duration of the write. The flag
 * is cleared again whatever the write returns, and that return value is
 * passed back to the caller.
 */
static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa,
                                        const void *data, unsigned long len)
{
        struct vgic_dist *vgic = &kvm->arch.vgic;
        int rc;

        vgic->table_write_in_progress = true;
        rc = kvm_write_guest_lock(kvm, gpa, data, len);
        vgic->table_write_in_progress = false;

        return rc;
}

/*
 * This struct provides an intermediate representation of the fields contained
 * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
 * state to userspace can generate either GICv2 or GICv3 CPU interface
 * registers regardless of the hardware backed GIC used.
 *
 * It is the type exchanged through the vgic_{get,set}_vmcr() helpers and
 * their v2/v3 variants declared in this header.
 */
struct vgic_vmcr {
        u32        grpen0;
        u32        grpen1;

        u32        ackctl;
        u32        fiqen;
        u32        cbpr;
        u32        eoim;

        u32        abpr;
        u32        bpr;
        u32        pmr;  /* Priority mask field, kept in the GICC_PMR and
                       * ICC_PMR_EL1 priority field format */
};

/*
 * A vCPU pointer paired with a guest physical address. This is the
 * reg_attr argument type of vgic_v2_parse_attr() and vgic_v3_parse_attr().
 */
struct vgic_reg_attr {
        struct kvm_vcpu *vcpu;
        gpa_t addr;
};

/*
 * Per-device ITS bookkeeping: a list node, the head of the device's ITTE
 * list and the device's identifying values.
 * NOTE(review): the owner of dev_list is not visible in this header;
 * presumably a per-ITS device list - confirm in vgic-its.c.
 */
struct its_device {
        struct list_head dev_list;

        /* the head for the list of ITTEs */
        struct list_head itt_head;
        u32 num_eventid_bits;
        gpa_t itt_addr;
        u32 device_id;
};

/* Sentinel target_addr value (all ones) marking a collection as not mapped. */
#define COLLECTION_NOT_MAPPED ((u32)~0)

/* A collection: list node, collection ID and its current target_addr. */
struct its_collection {
        struct list_head coll_list;

        u32 collection_id;
        u32 target_addr;
};

/*
 * True when @coll is non-NULL and its target_addr differs from
 * COLLECTION_NOT_MAPPED. @coll is evaluated twice, so it must not have
 * side effects.
 */
#define its_is_collection_mapped(coll) ((coll) && \
                                ((coll)->target_addr != COLLECTION_NOT_MAPPED))

/*
 * One interrupt translation entry: associates an event ID with a vgic_irq
 * and a collection.
 * NOTE(review): ite_list is presumably linked onto its_device::itt_head -
 * confirm in vgic-its.c.
 */
struct its_ite {
        struct list_head ite_list;

        struct vgic_irq *irq;
        struct its_collection *collection;
        u32 event_id;
};

int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
                       struct vgic_reg_attr *reg_attr);
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
                       struct vgic_reg_attr *reg_attr);
const struct vgic_register_region *
vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
                     gpa_t addr, int len);
struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid);
struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid);
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
bool vgic_get_phys_line_level(struct vgic_irq *irq);
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
                           unsigned long flags) __releases(&irq->irq_lock);
void vgic_kick_vcpus(struct kvm *kvm);
void vgic_irq_handle_resampling(struct vgic_irq *irq,
                                bool lr_deactivated, bool lr_pending);

int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
                       phys_addr_t addr, phys_addr_t alignment,
                       phys_addr_t size);

void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                         int offset, u32 *val);
int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                          int offset, u32 *val);
void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v2_enable(struct kvm_vcpu *vcpu);
int vgic_v2_probe(const struct gic_kvm_info *info);
int vgic_v2_map_resources(struct kvm *kvm);
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
                             enum vgic_type);

void vgic_v2_init_lrs(void);
void vgic_v2_load(struct kvm_vcpu *vcpu);
void vgic_v2_put(struct kvm_vcpu *vcpu);

void vgic_v2_save_state(struct kvm_vcpu *vcpu);
void vgic_v2_restore_state(struct kvm_vcpu *vcpu);

/*
 * Try to take a reference on @irq.
 *
 * A NULL @irq yields false. Interrupts with an intid below VGIC_MIN_LPI
 * succeed without touching the refcount; for all others the result is that
 * of kref_get_unless_zero() on the interrupt's refcount.
 */
static inline bool vgic_try_get_irq_kref(struct vgic_irq *irq)
{
        if (!irq)
                return false;

        return irq->intid < VGIC_MIN_LPI ||
               kref_get_unless_zero(&irq->refcount);
}

/*
 * Take a reference on @irq through vgic_try_get_irq_kref(), and raise a
 * one-time warning (WARN_ON_ONCE) if that attempt reports failure.
 */
static inline void vgic_get_irq_kref(struct vgic_irq *irq)
{
        WARN_ON_ONCE(!vgic_try_get_irq_kref(irq));
}

void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v3_enable(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
int vgic_v3_save_pending_tables(struct kvm *kvm);
int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count);
int vgic_register_redist_iodev(struct kvm_vcpu *vcpu);
void vgic_unregister_redist_iodev(struct kvm_vcpu *vcpu);
bool vgic_v3_check_base(struct kvm *kvm);

void vgic_v3_load(struct kvm_vcpu *vcpu);
void vgic_v3_put(struct kvm_vcpu *vcpu);

bool vgic_has_its(struct kvm *kvm);
int kvm_vgic_register_its_device(void);
void vgic_enable_lpis(struct kvm_vcpu *vcpu);
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu);
int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
int vgic_v3_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v3_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                         int offset, u32 *val);
int vgic_v3_redist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                         int offset, u32 *val);
int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu,
                                struct kvm_device_attr *attr, bool is_write);
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
int vgic_v3_line_level_info_uaccess(struct kvm_vcpu *vcpu, bool is_write,
                                    u32 intid, u32 *val);
int kvm_register_vgic_device(unsigned long type);
void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
int vgic_lazy_init(struct kvm *kvm);
int vgic_init(struct kvm *kvm);

void vgic_debug_init(struct kvm *kvm);
void vgic_debug_destroy(struct kvm *kvm);

/*
 * Map the vCPU's num_pri_bits to the highest APnR register index:
 * 7 bits -> 3, 6 bits -> 1, anything else -> 0.
 */
static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;

        /*
         * num_pri_bits are initialized with HW supported values.
         * We can rely safely on num_pri_bits even if VM has not
         * restored ICC_CTLR_EL1 before restoring APnR registers.
         */
        if (vgic_cpu->num_pri_bits == 7)
                return 3;
        if (vgic_cpu->num_pri_bits == 6)
                return 1;

        return 0;
}

/*
 * A region with a zero count is never reported as full; otherwise it is
 * full once free_index has reached count.
 */
static inline bool
vgic_v3_redist_region_full(struct vgic_redist_region *region)
{
        return region->count && region->free_index >= region->count;
}

struct vgic_redist_region *vgic_v3_rdist_free_slot(struct list_head *rdregs);

/*
 * Size of @rdreg in bytes: KVM_VGIC_V3_REDIST_SIZE per redistributor, using
 * the region's own count when it is non-zero and the number of online vCPUs
 * of @kvm otherwise.
 */
static inline size_t
vgic_v3_rd_region_size(struct kvm *kvm, struct vgic_redist_region *rdreg)
{
        size_t bytes;

        if (rdreg->count)
                bytes = rdreg->count * KVM_VGIC_V3_REDIST_SIZE;
        else
                bytes = atomic_read(&kvm->online_vcpus) * KVM_VGIC_V3_REDIST_SIZE;

        return bytes;
}

struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm,
                                                           u32 index);
void vgic_v3_free_redist_region(struct kvm *kvm, struct vgic_redist_region *rdreg);

bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size);

/*
 * True when the range starting at @base and spanning @size bytes
 * intersects the KVM_VGIC_V3_DIST_SIZE bytes starting at the VM's
 * vgic_dist_base.
 */
static inline bool vgic_dist_overlap(struct kvm *kvm, gpa_t base, size_t size)
{
        struct vgic_dist *dist = &kvm->arch.vgic;

        /* The range ends at or before the distributor frame begins. */
        if (base + size <= dist->vgic_dist_base)
                return false;

        return base < dist->vgic_dist_base + KVM_VGIC_V3_DIST_SIZE;
}

bool vgic_lpis_enabled(struct kvm_vcpu *vcpu);
int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its,
                         u32 devid, u32 eventid, struct vgic_irq **irq);
struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi);
int vgic_its_inject_cached_translation(struct kvm *kvm, struct kvm_msi *msi);
void vgic_its_invalidate_all_caches(struct kvm *kvm);

/* GICv4.1 MMIO interface */
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
int vgic_its_invall(struct kvm_vcpu *vcpu);

bool vgic_supports_direct_msis(struct kvm *kvm);
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val);
int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq);

void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu);

/* True when kvm_has_feat() reports ID_AA64PFR0_EL1.GIC at level IMP for @kvm. */
static inline bool kvm_has_gicv3(struct kvm *kvm)
{
        bool gic_imp = kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);

        return gic_imp;
}

void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
void vgic_v3_handle_nested_maint_irq(struct kvm_vcpu *vcpu);
void vgic_v3_nested_update_mi(struct kvm_vcpu *vcpu);

int vgic_its_debug_init(struct kvm_device *dev);
void vgic_its_debug_destroy(struct kvm_device *dev);

#endif









































































































































































































































































































































































































































  307 
  307 


  307 





  307 







  307 










  302 













































   43 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vma.h
 *
 * Core VMA manipulation API implemented in vma.c.
 */
#ifndef __MM_VMA_H
#define __MM_VMA_H

/*
 * VMA lock generalization
 *
 * Groups the objects involved in preparing a VMA modification.
 * NOTE(review): field roles below are inferred from their names only;
 * confirm against the users in vma.c.
 */
struct vma_prepare {
        struct vm_area_struct *vma;             /* VMA being operated on */
        struct vm_area_struct *adj_next;        /* presumably an adjusted neighbour */
        struct file *file;
        struct address_space *mapping;
        struct anon_vma *anon_vma;
        struct vm_area_struct *insert;          /* presumably a VMA to insert */
        struct vm_area_struct *remove;          /* presumably a VMA to remove */
        struct vm_area_struct *remove2;         /* presumably a second VMA to remove */
};

/*
 * Fixed-capacity batch of up to 8 VMA pointers.
 * NOTE(review): count is presumably the number of used slots in vmas[] -
 * confirm against the batch helpers in vma.c.
 */
struct unlink_vma_file_batch {
        int count;
        struct vm_area_struct *vmas[8];
};

/*
 * vma munmap operation
 *
 * Carries the state of one munmap request: the iterator and neighbouring
 * VMAs, the address range, and per-operation counters.
 */
struct vma_munmap_struct {
        struct vma_iterator *vmi;
        struct vm_area_struct *vma;     /* The first vma to munmap */
        struct vm_area_struct *prev;    /* vma before the munmap area */
        struct vm_area_struct *next;    /* vma after the munmap area */
        struct list_head *uf;           /* Userfaultfd list_head */
        unsigned long start;            /* Aligned start addr (inclusive) */
        unsigned long end;              /* Aligned end addr (exclusive) */
        unsigned long unmap_start;      /* Unmap PTE start */
        unsigned long unmap_end;        /* Unmap PTE end */
        int vma_count;                  /* Number of vmas that will be removed */
        bool unlock;                    /* Unlock after the munmap */
        bool clear_ptes;                /* If there are outstanding PTE to be cleared */
        /* 2 byte hole */
        unsigned long nr_pages;         /* Number of pages being removed */
        unsigned long locked_vm;        /* Number of locked pages */
        unsigned long nr_accounted;     /* Number of VM_ACCOUNT pages */
        /*
         * NOTE(review): presumably page counts per mapping class for the
         * range being removed - confirm in vma.c.
         */
        unsigned long exec_vm;
        unsigned long stack_vm;
        unsigned long data_vm;
};

/* State of a merge operation, stored in vma_merge_struct::state. */
enum vma_merge_state {
        VMA_MERGE_START,        /* Initial state set by VMG_STATE()/VMG_VMA_STATE() */
        VMA_MERGE_ERROR_NOMEM,  /* The state vmg_nomem() tests for */
        VMA_MERGE_NOMERGE,
        VMA_MERGE_SUCCESS,
};

/*
 * Describes a VMA merge operation and is threaded throughout it.
 *
 * Any of the fields may be mutated by the merge operation, so no guarantees are
 * made to the contents of this structure after a merge operation has completed.
 */
struct vma_merge_struct {
        struct mm_struct *mm;
        struct vma_iterator *vmi;
        /*
         * Adjacent VMAs, any of which may be NULL if not present:
         *
         * |------|--------|------|
         * | prev | middle | next |
         * |------|--------|------|
         *
         * middle may not yet exist in the case of a proposed new VMA being
         * merged, or it may be an existing VMA.
         *
         * next may be assigned by the caller.
         */
        struct vm_area_struct *prev;
        struct vm_area_struct *middle;
        struct vm_area_struct *next;
        /* This is the VMA we ultimately target to become the merged VMA. */
        struct vm_area_struct *target;
        /*
         * Initially, the start, end, pgoff fields are provided by the caller
         * and describe the proposed new VMA range, whether modifying an
         * existing VMA (which will be 'middle'), or adding a new one.
         *
         * During the merge process these fields are updated to describe the new
         * range _including those VMAs which will be merged_.
         */
        unsigned long start;
        unsigned long end;
        pgoff_t pgoff;

        /*
         * Attributes of the proposed range. VMG_VMA_STATE() copies all of
         * them from an existing VMA; VMG_STATE() sets only flags.
         */
        unsigned long flags;
        struct file *file;
        struct anon_vma *anon_vma;
        struct mempolicy *policy;
        struct vm_userfaultfd_ctx uffd_ctx;
        struct anon_vma_name *anon_name;
        /* Progress/outcome of the merge; starts as VMA_MERGE_START. */
        enum vma_merge_state state;

        /* Flags which callers can use to modify merge behaviour: */

        /*
         * If we can expand, simply do so. We know there is nothing to merge to
         * the right. Does not reset state upon failure to merge. The VMA
         * iterator is assumed to be positioned at the previous VMA, rather than
         * at the gap.
         */
        bool just_expand :1;

        /*
         * If a merge is possible, but an OOM error occurs, give up and don't
         * execute the merge, returning NULL.
         */
        bool give_up_on_oom :1;

        /* Internal flags set during merge process: */

        /*
         * Internal flag indicating the merge increases vmg->middle->vm_start
         * (and thereby, vmg->prev->vm_end).
         */
        bool __adjust_middle_start :1;
        /*
         * Internal flag indicating the merge decreases vmg->next->vm_start
         * (and thereby, vmg->middle->vm_end).
         */
        bool __adjust_next_start :1;
        /*
         * Internal flag used during the merge operation to indicate we will
         * remove vmg->middle.
         */
        bool __remove_middle :1;
        /*
         * Internal flag used during the merge operation to indicate we will
         * remove vmg->next.
         */
        bool __remove_next :1;

};

static inline bool vmg_nomem(struct vma_merge_struct *vmg)
{
        return vmg->state == VMA_MERGE_ERROR_NOMEM;
}

/*
 * Page offset corresponding to @addr within @vma: the VMA's vm_pgoff plus
 * the distance of @addr from vm_start converted with PHYS_PFN().
 *
 * Assumes addr >= vma->vm_start.
 */
static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
                                       unsigned long addr)
{
        const unsigned long delta = addr - vma->vm_start;

        return vma->vm_pgoff + PHYS_PFN(delta);
}

/*
 * Declare a struct vma_merge_struct called @name describing a proposed
 * range that is not taken from an existing VMA. Only mm, vmi, start, end,
 * flags and pgoff are set from the arguments and state starts as
 * VMA_MERGE_START; every field not named here is zero-initialised by the
 * designated initializer.
 */
#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_)        \
        struct vma_merge_struct name = {                                \
                .mm = mm_,                                                \
                .vmi = vmi_,                                                \
                .start = start_,                                        \
                .end = end_,                                                \
                .flags = flags_,                                        \
                .pgoff = pgoff_,                                        \
                .state = VMA_MERGE_START,                                \
        }

/*
 * Declare a struct vma_merge_struct called @name describing a modification
 * of the existing VMA @vma_ (stored as 'middle') over the range
 * @start_/@end_, with @prev_ as its predecessor and no 'next' assigned.
 *
 * mm, flags, file, anon_vma, policy, uffd_ctx and anon_name are copied from
 * @vma_; pgoff is derived from @vma_ and @start_ via vma_pgoff_offset();
 * state starts as VMA_MERGE_START.
 *
 * @vma_ is parenthesised before every member access so that any
 * pointer-valued expression (e.g. '&v' or 'c ? a : b') can be passed.
 * @vma_ and @start_ are expanded more than once, so arguments must be free
 * of side effects.
 */
#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_)            \
        struct vma_merge_struct name = {                                \
                .mm = (vma_)->vm_mm,                                    \
                .vmi = vmi_,                                            \
                .prev = prev_,                                          \
                .middle = vma_,                                         \
                .next = NULL,                                           \
                .start = start_,                                        \
                .end = end_,                                            \
                .flags = (vma_)->vm_flags,                              \
                .pgoff = vma_pgoff_offset(vma_, start_),                \
                .file = (vma_)->vm_file,                                \
                .anon_vma = (vma_)->anon_vma,                           \
                .policy = vma_policy(vma_),                             \
                .uffd_ctx = (vma_)->vm_userfaultfd_ctx,                 \
                .anon_name = anon_vma_name(vma_),                       \
                .state = VMA_MERGE_START,                               \
        }

/*
 * validate_mm() is a real function only when CONFIG_DEBUG_VM_MAPLE_TREE is
 * enabled; otherwise it expands to an empty statement so callers need no
 * #ifdef of their own.
 */
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
void validate_mm(struct mm_struct *mm);
#else
#define validate_mm(mm) do { } while (0)
#endif

__must_check int vma_expand(struct vma_merge_struct *vmg);
__must_check int vma_shrink(struct vma_iterator *vmi,
                struct vm_area_struct *vma,
                unsigned long start, unsigned long end, pgoff_t pgoff);

/*
 * Store @vma in the tree behind @vmi over [vm_start, vm_end - 1], letting the
 * store allocate with @gfp.
 *
 * Returns 0 after marking the VMA attached, or -ENOMEM if the store left the
 * maple state in an error condition (the VMA is then not marked attached).
 */
static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
                        struct vm_area_struct *vma, gfp_t gfp)
{
        struct ma_state *mas = &vmi->mas;
        const unsigned long start = vma->vm_start;

        /*
         * An iterator that has already walked somewhere which does not
         * contain the VMA's start address must be invalidated before reuse.
         */
        if (mas->status != ma_start) {
                if (mas->index > start || mas->last < start)
                        vma_iter_invalidate(vmi);
        }

        __mas_set_range(mas, start, vma->vm_end - 1);
        mas_store_gfp(mas, vma, gfp);
        if (unlikely(mas_is_err(mas)))
                return -ENOMEM;

        vma_mark_attached(vma);
        return 0;
}

int
do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                    struct mm_struct *mm, unsigned long start,
                    unsigned long end, struct list_head *uf, bool unlock);

int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                  unsigned long start, size_t len, struct list_head *uf,
                  bool unlock);

void remove_vma(struct vm_area_struct *vma);

void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct vm_area_struct *next);

/* We are about to modify the VMA's flags. */
__must_check struct vm_area_struct
*vma_modify_flags(struct vma_iterator *vmi,
                struct vm_area_struct *prev, struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                unsigned long new_flags);

/* We are about to modify the VMA's flags and/or anon_name. */
__must_check struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start,
                       unsigned long end,
                       unsigned long new_flags,
                       struct anon_vma_name *new_name);

/* We are about to modify the VMA's memory policy. */
__must_check struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
                   struct vm_area_struct *prev,
                   struct vm_area_struct *vma,
                   unsigned long start, unsigned long end,
                   struct mempolicy *new_pol);

/* We are about to modify the VMA's flags and/or uffd context. */
__must_check struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       unsigned long new_flags,
                       struct vm_userfaultfd_ctx new_ctx,
                       bool give_up_on_oom);

__must_check struct vm_area_struct
*vma_merge_new_range(struct vma_merge_struct *vmg);

__must_check struct vm_area_struct
*vma_merge_extend(struct vma_iterator *vmi,
                  struct vm_area_struct *vma,
                  unsigned long delta);

void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);

void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
                               struct vm_area_struct *vma);

void unlink_file_vma(struct vm_area_struct *vma);

void vma_link_file(struct vm_area_struct *vma);

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);

struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks);

struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);

bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);

int mm_take_all_locks(struct mm_struct *mm);
void mm_drop_all_locks(struct mm_struct *mm);

unsigned long mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                struct list_head *uf);

int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma,
                 unsigned long addr, unsigned long request, unsigned long flags);

unsigned long unmapped_area(struct vm_unmapped_area_info *info);
unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info);

/*
 * Decide whether individual PTEs must be checked by hand before being made
 * writable, i.e. whether write permission cannot simply be granted to every
 * PTE of the mapping at once.
 */
static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
{
        /*
         * Private mappings: COW has to be handled properly, so a manual
         * check is needed whenever the mapping is writable at all.
         */
        if (!(vma->vm_flags & VM_SHARED))
                return (vma->vm_flags & VM_WRITE) != 0;

        /* Shared mappings: defer to the write-notify decision. */
        return vma_wants_writenotify(vma, vma->vm_page_prot);
}

#ifdef CONFIG_MMU
/*
 * Combine @oldprot with the page protection derived from @vm_flags, using
 * pgprot_modify() to do the merge.
 */
static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
        pgprot_t newprot = vm_get_page_prot(vm_flags);

        return pgprot_modify(oldprot, newprot);
}
#endif

/*
 * Step the iterator backwards with mas_prev(), passing @min as its limit,
 * and return whatever entry it yields.
 */
static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
                                                    unsigned long min)
{
        struct vm_area_struct *prev = mas_prev(&vmi->mas, min);

        return prev;
}

/*
 * These three helpers classify VMAs for virtual memory accounting.
 */

/*
 * Executable code area: VM_EXEC is set while neither VM_WRITE nor any
 * VM_STACK bit is.
 */
static inline bool is_exec_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = VM_EXEC | VM_WRITE | VM_STACK;

        return (flags & relevant) == VM_EXEC;
}

/*
 * Stack area, shadow stacks included: either every VM_STACK bit is set or
 * VM_SHADOW_STACK is.
 *
 * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: do_mmap()
 * forbids all other combinations.
 */
static inline bool is_stack_mapping(vm_flags_t flags)
{
        if ((flags & VM_STACK) == VM_STACK)
                return true;

        return (flags & VM_SHADOW_STACK) != 0;
}

/*
 * Data area: VM_WRITE is set while neither VM_SHARED nor any VM_STACK bit
 * is (private, writable, not stack).
 */
static inline bool is_data_mapping(vm_flags_t flags)
{
        const vm_flags_t relevant = VM_WRITE | VM_SHARED | VM_STACK;

        return (flags & relevant) == VM_WRITE;
}


/*
 * Point the iterator at the range starting at @index. @last is exclusive:
 * the maple state is set to the inclusive range [index, last - 1].
 */
static inline void vma_iter_config(struct vma_iterator *vmi,
                unsigned long index, unsigned long last)
{
        struct ma_state *mas = &vmi->mas;

        __mas_set_range(mas, index, last - 1);
}

/* Reset the iterator's underlying maple state via mas_reset(). */
static inline void vma_iter_reset(struct vma_iterator *vmi)
{
        struct ma_state *mas = &vmi->mas;

        mas_reset(mas);
}

/*
 * Move the iterator to the previous range with mas_prev_range(), passing
 * @min as its limit, and return the entry found there.
 */
static inline
struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
{
        struct vm_area_struct *entry = mas_prev_range(&vmi->mas, min);

        return entry;
}

/*
 * Move the iterator to the next range with mas_next_range(), passing @max
 * as its limit, and return the entry found there.
 */
static inline
struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
{
        struct vm_area_struct *entry = mas_next_range(&vmi->mas, max);

        return entry;
}

/*
 * Search for an empty area of @size with mas_empty_area(). @max is
 * exclusive here and is converted to the inclusive bound max - 1 before
 * being handed to the maple tree. Returns mas_empty_area()'s result.
 */
static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
                                       unsigned long max, unsigned long size)
{
        const unsigned long last = max - 1;

        return mas_empty_area(&vmi->mas, min, last, size);
}

/*
 * Search for an empty area of @size with mas_empty_area_rev(). @max is
 * exclusive here and is converted to the inclusive bound max - 1 before
 * being handed to the maple tree. Returns mas_empty_area_rev()'s result.
 */
static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
                                        unsigned long max, unsigned long size)
{
        const unsigned long last = max - 1;

        return mas_empty_area_rev(&vmi->mas, min, last, size);
}

/*
 * VMA Iterator functions shared between nommu and mmap
 */
/*
 * Preallocate, with GFP_KERNEL, whatever the maple tree needs to later store
 * @vma at the iterator's position. Returns mas_preallocate()'s result.
 */
static inline int vma_iter_prealloc(struct vma_iterator *vmi,
                struct vm_area_struct *vma)
{
        struct ma_state *mas = &vmi->mas;

        return mas_preallocate(mas, vma, GFP_KERNEL);
}

/*
 * Store NULL over the iterator's current range using previously
 * preallocated memory (mas_store_prealloc()).
 */
static inline void vma_iter_clear(struct vma_iterator *vmi)
{
        struct ma_state *mas = &vmi->mas;

        mas_store_prealloc(mas, NULL);
}

/* Walk the tree with mas_walk() and return the entry it finds. */
static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
{
        struct vm_area_struct *entry = mas_walk(&vmi->mas);

        return entry;
}

/*
 * Store an already-attached VMA over [vm_start, vm_end - 1] using memory that
 * was preallocated beforehand (mas_store_prealloc()).
 *
 * The VMA must already be marked attached; vma_iter_store_new() is the
 * variant that marks it first.
 */
static inline void vma_iter_store_overwrite(struct vma_iterator *vmi,
                                            struct vm_area_struct *vma)
{
        vma_assert_attached(vma);

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        /*
         * Debug builds only: warn when an already-positioned iterator does
         * not cover the VMA's start address, reporting both ranges.
         * NOTE(review): the two format strings differ in the spaces after
         * their newlines - confirm whether that is intentional.
         */
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.index > vma->vm_start)) {
                pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
                        vmi->mas.index, vma->vm_start, vma->vm_start,
                        vma->vm_end, vmi->mas.index, vmi->mas.last);
        }
        if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
                        vmi->mas.last <  vma->vm_start)) {
                pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
                       vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
                       vmi->mas.index, vmi->mas.last);
        }
#endif

        /*
         * Same condition as the warnings above: an iterator positioned on a
         * range that excludes vm_start is invalidated before being re-aimed.
         */
        if (vmi->mas.status != ma_start &&
            ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
                vma_iter_invalidate(vmi);

        /* vm_end is exclusive; the maple tree range is inclusive. */
        __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
        mas_store_prealloc(&vmi->mas, vma);
}

/*
 * Store a new VMA: mark it attached, then store it with
 * vma_iter_store_overwrite(). The order matters because
 * vma_iter_store_overwrite() asserts that the VMA is already attached.
 */
static inline void vma_iter_store_new(struct vma_iterator *vmi,
                                      struct vm_area_struct *vma)
{
        vma_mark_attached(vma);
        vma_iter_store_overwrite(vmi, vma);
}

static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
{
        return vmi->mas.index;
}

static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
{
        return vmi->mas.last + 1;
}

/*
 * Tell the maple tree to expect @count entries via mas_expected_entries()
 * and return its result.
 */
static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
                                      unsigned long count)
{
        struct ma_state *mas = &vmi->mas;

        return mas_expected_entries(mas, count);
}

/*
 * Move the iterator to the previous range with no lower bound other than
 * index 0, and return the entry found there.
 */
static inline
struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
{
        struct vm_area_struct *entry = mas_prev_range(&vmi->mas, 0);

        return entry;
}

/*
 * Fetch the next VMA, then rewind the iterator so that it sits at the end of
 * the previous VMA or, when there is no previous VMA, at index 0.
 *
 * If @pprev is non-NULL the previous VMA (possibly NULL) is stored there.
 * Returns the next VMA.
 */
static inline
struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
                struct vm_area_struct **pprev)
{
        struct vm_area_struct *next;
        struct vm_area_struct *prev;

        /* Step forward first, then step back again; the order is essential. */
        next = vma_next(vmi);
        prev = vma_prev(vmi);

        /*
         * With no previous VMA, the step forward skipped any gap and the
         * step back rewound to the start of the range. Advancing a range
         * unconditionally would land on the next VMA again, so only advance
         * when there really is a previous VMA to move past.
         */
        if (prev)
                vma_iter_next_range(vmi);

        if (pprev)
                *pprev = prev;

        return next;
}

#ifdef CONFIG_64BIT

/* Report whether VM_SEALED is set in the VMA's flags. */
static inline bool vma_is_sealed(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_SEALED) != 0;
}

/*
 * Check whether a VMA may be modified.
 * Returns true when modification is allowed, i.e. the VMA is not sealed;
 * the sealed case is hinted as the unlikely one.
 */
static inline bool can_modify_vma(struct vm_area_struct *vma)
{
        return !unlikely(vma_is_sealed(vma));
}

bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);

#else

/*
 * !CONFIG_64BIT stub: the sealing check is compiled out on this
 * configuration, so every VMA is reported as modifiable.
 */
static inline bool can_modify_vma(struct vm_area_struct *vma)
{
        return true;
}

/*
 * !CONFIG_64BIT stub: always allows the operation, regardless of @behavior.
 */
static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
        return true;
}

#endif

#if defined(CONFIG_STACK_GROWSUP)
int expand_upwards(struct vm_area_struct *vma, unsigned long address);
#endif

int expand_downwards(struct vm_area_struct *vma, unsigned long address);

int __vm_munmap(unsigned long start, size_t len, bool unlock);

#endif        /* __MM_VMA_H */







































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Common values and helper functions for the ChaCha and XChaCha stream ciphers.
 *
 * XChaCha extends ChaCha's nonce to 192 bits, while provably retaining ChaCha's
 * security.  Here they share the same key size, tfm context, and setkey
 * function; only their IV size and encrypt/decrypt function differ.
 *
 * The ChaCha paper specifies 20, 12, and 8-round variants.  In general, it is
 * recommended to use the 20-round variant ChaCha20.  However, the other
 * variants can be needed in some performance-sensitive scenarios.  The generic
 * ChaCha code currently allows only the 20 and 12-round variants.
 */

#ifndef _CRYPTO_CHACHA_H
#define _CRYPTO_CHACHA_H

#include <linux/unaligned.h>
#include <linux/types.h>

/* 32-bit stream position, then 96-bit nonce (RFC7539 convention) */
#define CHACHA_IV_SIZE                16

/* Key size in bytes: chacha_init() consumes it as eight 32-bit words */
#define CHACHA_KEY_SIZE                32
/* Size in bytes of one ChaCha block, which is also the size of the state */
#define CHACHA_BLOCK_SIZE        64
#define CHACHAPOLY_IV_SIZE        12

/* Number of 32-bit words in the state (state[0..15] in chacha_init()) */
#define CHACHA_STATE_WORDS        (CHACHA_BLOCK_SIZE / sizeof(u32))

/* 192-bit nonce, then 64-bit stream position */
#define XCHACHA_IV_SIZE                32

void chacha_block_generic(u32 *state, u8 *stream, int nrounds);
/* Convenience wrapper: run chacha_block_generic() with 20 rounds. */
static inline void chacha20_block(u32 *state, u8 *stream)
{
        const int nrounds = 20;

        chacha_block_generic(state, stream, nrounds);
}

void hchacha_block_arch(const u32 *state, u32 *out, int nrounds);
void hchacha_block_generic(const u32 *state, u32 *out, int nrounds);

/*
 * Run the HChaCha block function, using the architecture-specific
 * implementation when CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA is enabled and the
 * generic one otherwise.
 */
static inline void hchacha_block(const u32 *state, u32 *out, int nrounds)
{
        if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) {
                hchacha_block_generic(state, out, nrounds);
                return;
        }

        hchacha_block_arch(state, out, nrounds);
}

/*
 * The four constant words loaded into state[0..3] by chacha_init_consts().
 * Read as little-endian bytes they spell the ASCII string
 * "expand 32-byte k": "expa", "nd 3", "2-by", "te k".
 */
enum chacha_constants { /* expand 32-byte k */
        CHACHA_CONSTANT_EXPA = 0x61707865U,
        CHACHA_CONSTANT_ND_3 = 0x3320646eU,
        CHACHA_CONSTANT_2_BY = 0x79622d32U,
        CHACHA_CONSTANT_TE_K = 0x6b206574U
};

/*
 * Load the four ChaCha constant words ("expand 32-byte k") into
 * state[0..3]. No other state word is touched.
 */
static inline void chacha_init_consts(u32 *state)
{
        const u32 sigma[4] = {
                CHACHA_CONSTANT_EXPA,
                CHACHA_CONSTANT_ND_3,
                CHACHA_CONSTANT_2_BY,
                CHACHA_CONSTANT_TE_K,
        };
        unsigned int i;

        for (i = 0; i < 4; i++)
                state[i] = sigma[i];
}

/*
 * Fill the 16-word ChaCha state:
 *   state[0..3]   - the constants (chacha_init_consts())
 *   state[4..11]  - the eight key words, copied as-is from @key
 *   state[12..15] - the 16-byte @iv, read as four little-endian 32-bit
 *                   words; @iv does not need to be aligned
 */
static inline void chacha_init(u32 *state, const u32 *key, const u8 *iv)
{
        unsigned int i;

        chacha_init_consts(state);

        for (i = 0; i < 8; i++)
                state[4 + i] = key[i];

        for (i = 0; i < 4; i++)
                state[12 + i] = get_unaligned_le32(iv + 4 * i);
}

void chacha_crypt_arch(u32 *state, u8 *dst, const u8 *src,
                       unsigned int bytes, int nrounds);
void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src,
                          unsigned int bytes, int nrounds);

/*
 * Process @bytes bytes from @src into @dst with @nrounds rounds, using the
 * architecture-specific implementation when
 * CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA is enabled and the generic one
 * otherwise.
 */
static inline void chacha_crypt(u32 *state, u8 *dst, const u8 *src,
                                unsigned int bytes, int nrounds)
{
        if (!IS_ENABLED(CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA)) {
                chacha_crypt_generic(state, dst, src, bytes, nrounds);
                return;
        }

        chacha_crypt_arch(state, dst, src, bytes, nrounds);
}

/* Convenience wrapper: chacha_crypt() fixed at 20 rounds. */
static inline void chacha20_crypt(u32 *state, u8 *dst, const u8 *src,
                                  unsigned int bytes)
{
        const int nrounds = 20;

        chacha_crypt(state, dst, src, bytes, nrounds);
}

#endif /* _CRYPTO_CHACHA_H */

























































































































































  164 
  164 





  163 

  162 




  163 

































  164 




  164 




  164 




  163 





  162 
  163 






































































































































































































































































































































































































































































































































































































































































  164 




















  164 


  164 










  164 

  164 












  164 















  162 




  164 


  164 




  164 




  162 







  164 




  163 







  163 




  164 


  164 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2025 Google LLC
 * Author: Marc Zyngier <maz@kernel.org>
 */

#include <linux/kvm_host.h>
#include <asm/sysreg.h>

/*
 * One entry of a "register bits -> required feature" table.
 *
 * @bits is the mask of register bits the entry covers. The condition that
 * governs those bits is described by exactly one member of the anonymous
 * union, selected by @flags:
 *  - no CALL_FUNC: the ID register field description (regidx/shift/width/
 *    sign/lo_lim), as filled in by __NEEDS_FEAT_3();
 *  - CALL_FUNC without FIXED_VALUE: the @match callback (__NEEDS_FEAT_1());
 *  - CALL_FUNC with FIXED_VALUE: the @fval callback (__NEEDS_FEAT_2() via
 *    NEEDS_FEAT_FIXED()).
 */
struct reg_bits_to_feat_map {
        u64                bits;

#define        NEVER_FGU        BIT(0)        /* Can trap, but never UNDEF */
#define        CALL_FUNC        BIT(1)        /* Condition is evaluated by a callback */
#define        FIXED_VALUE        BIT(2)        /* RAZ/WI or RAO/WI in KVM */
        unsigned long        flags;

        union {
                /* ID register field; lo_lim comes from the id_fld_lim constant */
                struct {
                        u8        regidx;
                        u8        shift;
                        u8        width;
                        bool        sign;
                        s8        lo_lim;
                };
                bool        (*match)(struct kvm *);
                bool        (*fval)(struct kvm *, u64 *);
        };
};

/*
 * Table entry keyed on an ID register field: (id, fld, lim) is an
 * (ID register, field, limit value) triple, normally supplied through one
 * of the FEAT_* macros. The field's shift, width and signedness are taken
 * from the id_fld_SHIFT/_WIDTH/_SIGNED constants.
 */
#define __NEEDS_FEAT_3(m, f, id, fld, lim)                \
        {                                                \
                .bits        = (m),                                \
                .flags = (f),                                \
                .regidx        = IDREG_IDX(SYS_ ## id),        \
                .shift        = id ##_## fld ## _SHIFT,        \
                .width        = id ##_## fld ## _WIDTH,        \
                .sign        = id ##_## fld ## _SIGNED,        \
                .lo_lim        = id ##_## fld ##_## lim        \
        }

/*
 * Table entry evaluated through the two-argument @fval callback. The extra
 * "dummy" argument is ignored; it only exists so that NEEDS_FEAT_FIXED()
 * can select this variant by argument count.
 */
#define __NEEDS_FEAT_2(m, f, fun, dummy)                \
        {                                                \
                .bits        = (m),                                \
                .flags = (f) | CALL_FUNC,                \
                .fval = (fun),                                \
        }

/* Table entry evaluated through the single-argument @match callback. */
#define __NEEDS_FEAT_1(m, f, fun)                        \
        {                                                \
                .bits        = (m),                                \
                .flags = (f) | CALL_FUNC,                \
                .match = (fun),                                \
        }

/*
 * Dispatch on the number of trailing arguments: one selects
 * __NEEDS_FEAT_1() (match callback), two select __NEEDS_FEAT_2() (fval
 * callback plus dummy) and three select __NEEDS_FEAT_3() (ID register
 * field triple).
 */
#define NEEDS_FEAT_FLAG(m, f, ...)                        \
        CONCATENATE(__NEEDS_FEAT_, COUNT_ARGS(__VA_ARGS__))(m, f, __VA_ARGS__)

/*
 * Entry flagged FIXED_VALUE. A trailing 0 is appended so that a single
 * callback argument is counted as two and lands in __NEEDS_FEAT_2().
 */
#define NEEDS_FEAT_FIXED(m, ...)                        \
        NEEDS_FEAT_FLAG(m, FIXED_VALUE, __VA_ARGS__, 0)

/* Entry with no extra flags. */
#define NEEDS_FEAT(m, ...)        NEEDS_FEAT_FLAG(m, 0, __VA_ARGS__)

/*
 * Each FEAT_* macro expands to an "ID register, field, value" triple. The
 * triple is spliced into the argument list of NEEDS_FEAT() (and from there
 * __NEEDS_FEAT_3()) or of kvm_has_feat().
 */
#define FEAT_SPE                ID_AA64DFR0_EL1, PMSVer, IMP
#define FEAT_SPE_FnE                ID_AA64DFR0_EL1, PMSVer, V1P2
#define FEAT_BRBE                ID_AA64DFR0_EL1, BRBE, IMP
#define FEAT_TRC_SR                ID_AA64DFR0_EL1, TraceVer, IMP
#define FEAT_PMUv3                ID_AA64DFR0_EL1, PMUVer, IMP
#define FEAT_PMUv3p9                ID_AA64DFR0_EL1, PMUVer, V3P9
#define FEAT_TRBE                ID_AA64DFR0_EL1, TraceBuffer, IMP
#define FEAT_TRBEv1p1                ID_AA64DFR0_EL1, TraceBuffer, TRBE_V1P1
#define FEAT_DoubleLock                ID_AA64DFR0_EL1, DoubleLock, IMP
#define FEAT_TRF                ID_AA64DFR0_EL1, TraceFilt, IMP
#define FEAT_AA32EL0                ID_AA64PFR0_EL1, EL0, AARCH32
#define FEAT_AA32EL1                ID_AA64PFR0_EL1, EL1, AARCH32
#define FEAT_AA64EL1                ID_AA64PFR0_EL1, EL1, IMP
#define FEAT_AA64EL3                ID_AA64PFR0_EL1, EL3, IMP
#define FEAT_AIE                ID_AA64MMFR3_EL1, AIE, IMP
#define FEAT_S2POE                ID_AA64MMFR3_EL1, S2POE, IMP
#define FEAT_S1POE                ID_AA64MMFR3_EL1, S1POE, IMP
#define FEAT_S1PIE                ID_AA64MMFR3_EL1, S1PIE, IMP
#define FEAT_THE                ID_AA64PFR1_EL1, THE, IMP
#define FEAT_SME                ID_AA64PFR1_EL1, SME, IMP
#define FEAT_GCS                ID_AA64PFR1_EL1, GCS, IMP
#define FEAT_LS64                ID_AA64ISAR1_EL1, LS64, LS64
#define FEAT_LS64_V                ID_AA64ISAR1_EL1, LS64, LS64_V
#define FEAT_LS64_ACCDATA        ID_AA64ISAR1_EL1, LS64, LS64_ACCDATA
#define FEAT_RAS                ID_AA64PFR0_EL1, RAS, IMP
#define FEAT_RASv2                ID_AA64PFR0_EL1, RAS, V2
#define FEAT_GICv3                ID_AA64PFR0_EL1, GIC, IMP
#define FEAT_LOR                ID_AA64MMFR1_EL1, LO, IMP
#define FEAT_SPEv1p4                ID_AA64DFR0_EL1, PMSVer, V1P4
#define FEAT_SPEv1p5                ID_AA64DFR0_EL1, PMSVer, V1P5
#define FEAT_ATS1A                ID_AA64ISAR2_EL1, ATS1A, IMP
#define FEAT_SPECRES2                ID_AA64ISAR1_EL1, SPECRES, COSP_RCTX
#define FEAT_SPECRES                ID_AA64ISAR1_EL1, SPECRES, IMP
#define FEAT_TLBIRANGE                ID_AA64ISAR0_EL1, TLB, RANGE
#define FEAT_TLBIOS                ID_AA64ISAR0_EL1, TLB, OS
#define FEAT_PAN2                ID_AA64MMFR1_EL1, PAN, PAN2
#define FEAT_DPB2                ID_AA64ISAR1_EL1, DPB, DPB2
#define FEAT_AMUv1                ID_AA64PFR0_EL1, AMU, IMP
#define FEAT_AMUv1p1                ID_AA64PFR0_EL1, AMU, V1P1
#define FEAT_CMOW                ID_AA64MMFR1_EL1, CMOW, IMP
#define FEAT_D128                ID_AA64MMFR3_EL1, D128, IMP
#define FEAT_DoubleFault2        ID_AA64PFR1_EL1, DF2, IMP
#define FEAT_FPMR                ID_AA64PFR2_EL1, FPMR, IMP
#define FEAT_MOPS                ID_AA64ISAR2_EL1, MOPS, IMP
#define FEAT_NMI                ID_AA64PFR1_EL1, NMI, IMP
#define FEAT_SCTLR2                ID_AA64MMFR3_EL1, SCTLRX, IMP
#define FEAT_SYSREG128                ID_AA64ISAR2_EL1, SYSREG_128, IMP
#define FEAT_TCR2                ID_AA64MMFR3_EL1, TCRX, IMP
#define FEAT_XS                        ID_AA64ISAR1_EL1, XS, IMP
#define FEAT_EVT                ID_AA64MMFR2_EL1, EVT, IMP
#define FEAT_EVT_TTLBxS                ID_AA64MMFR2_EL1, EVT, TTLBxS
#define FEAT_MTE2                ID_AA64PFR1_EL1, MTE, MTE2
#define FEAT_RME                ID_AA64PFR0_EL1, RME, IMP
#define FEAT_MPAM                ID_AA64PFR0_EL1, MPAM, 1
#define FEAT_S2FWB                ID_AA64MMFR2_EL1, FWB, IMP
#define FEAT_TME                ID_AA64ISAR0_EL1, TME, IMP
#define FEAT_TWED                ID_AA64MMFR1_EL1, TWED, IMP
#define FEAT_E2H0                ID_AA64MMFR4_EL1, E2H0, IMP
#define FEAT_SRMASK                ID_AA64MMFR4_EL1, SRMASK, IMP
#define FEAT_PoPS                ID_AA64MMFR4_EL1, PoPS, IMP
#define FEAT_PFAR                ID_AA64PFR1_EL1, PFAR, IMP
/*
 * FEAT_Debugv8p9 is reported by the DebugVer field of ID_AA64DFR0_EL1, not
 * by PMUVer (PMUVer/V3P9 is FEAT_PMUv3p9, defined separately).
 */
#define FEAT_Debugv8p9                ID_AA64DFR0_EL1, DebugVer, V8P9
#define FEAT_PMUv3_SS                ID_AA64DFR0_EL1, PMSS, IMP
#define FEAT_SEBEP                ID_AA64DFR0_EL1, SEBEP, IMP
#define FEAT_EBEP                ID_AA64DFR1_EL1, EBEP, IMP
#define FEAT_ITE                ID_AA64DFR1_EL1, ITE, IMP
#define FEAT_PMUv3_ICNTR        ID_AA64DFR1_EL1, PMICNTR, IMP
#define FEAT_SPMU                ID_AA64DFR1_EL1, SPMU, IMP
#define FEAT_SPE_nVM                ID_AA64DFR2_EL1, SPE_nVM, IMP
#define FEAT_STEP2                ID_AA64DFR2_EL1, STEP, IMP

static bool not_feat_aa64el3(struct kvm *kvm)
{
        return !kvm_has_feat(kvm, FEAT_AA64EL3);
}

static bool feat_nv2(struct kvm *kvm)
{
        return ((kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY) &&
                 kvm_has_feat_enum(kvm, ID_AA64MMFR2_EL1, NV, NI)) ||
                kvm_has_feat(kvm, ID_AA64MMFR2_EL1, NV, NV2));
}

static bool feat_nv2_e2h0_ni(struct kvm *kvm)
{
        return feat_nv2(kvm) && !kvm_has_feat(kvm, FEAT_E2H0);
}

static bool feat_rasv1p1(struct kvm *kvm)
{
        return (kvm_has_feat(kvm, ID_AA64PFR0_EL1, RAS, V1P1) ||
                (kvm_has_feat_enum(kvm, ID_AA64PFR0_EL1, RAS, IMP) &&
                 kvm_has_feat(kvm, ID_AA64PFR1_EL1, RAS_frac, RASv1p1)));
}

static bool feat_csv2_2_csv2_1p2(struct kvm *kvm)
{
        return (kvm_has_feat(kvm,  ID_AA64PFR0_EL1, CSV2, CSV2_2) ||
                (kvm_has_feat(kvm, ID_AA64PFR1_EL1, CSV2_frac, CSV2_1p2) &&
                 kvm_has_feat_enum(kvm,  ID_AA64PFR0_EL1, CSV2, IMP)));
}

/* Callback-shaped wrapper around kvm_has_pauth(kvm, PAuth). */
static bool feat_pauth(struct kvm *kvm)
{
        bool has_pauth = kvm_has_pauth(kvm, PAuth);

        return has_pauth;
}

/* Callback-shaped wrapper around kvm_has_pauth(kvm, PAuth_LR). */
static bool feat_pauth_lr(struct kvm *kvm)
{
        bool has_pauth_lr = kvm_has_pauth(kvm, PAuth_LR);

        return has_pauth_lr;
}

static bool feat_aderr(struct kvm *kvm)
{
        return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ADERR, FEAT_ADERR) &&
                kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SDERR, FEAT_ADERR));
}

static bool feat_anerr(struct kvm *kvm)
{
        return (kvm_has_feat(kvm, ID_AA64MMFR3_EL1, ANERR, FEAT_ANERR) &&
                kvm_has_feat(kvm, ID_AA64MMFR3_EL1, SNERR, FEAT_ANERR));
}

static bool feat_sme_smps(struct kvm *kvm)
{
        /*
         * Revists this if KVM ever supports SME -- this really should
         * look at the guest's view of SMIDR_EL1. Funnily enough, this
         * is not captured in the JSON file, but only as a note in the
         * ARM ARM.
         */
        return (kvm_has_feat(kvm, FEAT_SME) &&
                (read_sysreg_s(SYS_SMIDR_EL1) & SMIDR_EL1_SMPS));
}

static bool feat_spe_fds(struct kvm *kvm)
{
        /*
         * Revists this if KVM ever supports SPE -- this really should
         * look at the guest's view of PMSIDR_EL1.
         */
        return (kvm_has_feat(kvm, FEAT_SPEv1p4) &&
                (read_sysreg_s(SYS_PMSIDR_EL1) & PMSIDR_EL1_FDS));
}

static bool feat_trbe_mpam(struct kvm *kvm)
{
        /*
         * Revists this if KVM ever supports both MPAM and TRBE --
         * this really should look at the guest's view of TRBIDR_EL1.
         */
        return (kvm_has_feat(kvm, FEAT_TRBE) &&
                kvm_has_feat(kvm, FEAT_MPAM) &&
                (read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_EL1_MPAM));
}

static bool feat_ebep_pmuv3_ss(struct kvm *kvm)
{
        return kvm_has_feat(kvm, FEAT_EBEP) || kvm_has_feat(kvm, FEAT_PMUv3_SS);
}

/*
 * Work out HCR_EL2.RW for the guest. When @bits is non-NULL, the RW bit in
 * *@bits is cleared if the guest has FEAT_AA32EL1 and set otherwise; no
 * other bit is modified. Always returns true.
 */
static bool compute_hcr_rw(struct kvm *kvm, u64 *bits)
{
        if (!bits)
                return true;

        /* This is purely academic: AArch32 and NV are mutually exclusive */
        if (kvm_has_feat(kvm, FEAT_AA32EL1))
                *bits &= ~HCR_EL2_RW;
        else
                *bits |= HCR_EL2_RW;

        return true;
}

/*
 * Work out HCR_EL2.E2H for the guest. When @bits is non-NULL, the E2H bit
 * in *@bits is cleared if the guest has FEAT_E2H0 and set otherwise; no
 * other bit is modified. Always returns true.
 */
static bool compute_hcr_e2h(struct kvm *kvm, u64 *bits)
{
        if (!bits)
                return true;

        if (kvm_has_feat(kvm, FEAT_E2H0))
                *bits &= ~HCR_EL2_E2H;
        else
                *bits |= HCR_EL2_E2H;

        return true;
}

/*
 * HFGRTR_EL2: groups of bits and the FEAT_* description (or feat_*()
 * predicate) each group depends on.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~hfgrtr_masks.res0. compute_fgu() walks this table for HFGRTR_GROUP
 * with NEVER_FGU as its exclude mask, so the NEVER_FGU group at the end
 * is skipped there.
 */
static const struct reg_bits_to_feat_map hfgrtr_feat_map[] = {
        NEEDS_FEAT(HFGRTR_EL2_nAMAIR2_EL1        |
                   HFGRTR_EL2_nMAIR2_EL1,
                   FEAT_AIE),
        NEEDS_FEAT(HFGRTR_EL2_nS2POR_EL1, FEAT_S2POE),
        NEEDS_FEAT(HFGRTR_EL2_nPOR_EL1                |
                   HFGRTR_EL2_nPOR_EL0,
                   FEAT_S1POE),
        NEEDS_FEAT(HFGRTR_EL2_nPIR_EL1                |
                   HFGRTR_EL2_nPIRE0_EL1,
                   FEAT_S1PIE),
        NEEDS_FEAT(HFGRTR_EL2_nRCWMASK_EL1, FEAT_THE),
        NEEDS_FEAT(HFGRTR_EL2_nTPIDR2_EL0        |
                   HFGRTR_EL2_nSMPRI_EL1,
                   FEAT_SME),
        NEEDS_FEAT(HFGRTR_EL2_nGCS_EL1                |
                   HFGRTR_EL2_nGCS_EL0,
                   FEAT_GCS),
        NEEDS_FEAT(HFGRTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA),
        NEEDS_FEAT(HFGRTR_EL2_ERXADDR_EL1        |
                   HFGRTR_EL2_ERXMISCn_EL1        |
                   HFGRTR_EL2_ERXSTATUS_EL1        |
                   HFGRTR_EL2_ERXCTLR_EL1        |
                   HFGRTR_EL2_ERXFR_EL1                |
                   HFGRTR_EL2_ERRSELR_EL1        |
                   HFGRTR_EL2_ERRIDR_EL1,
                   FEAT_RAS),
        NEEDS_FEAT(HFGRTR_EL2_ERXPFGCDN_EL1        |
                   HFGRTR_EL2_ERXPFGCTL_EL1        |
                   HFGRTR_EL2_ERXPFGF_EL1,
                   feat_rasv1p1),
        NEEDS_FEAT(HFGRTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3),
        NEEDS_FEAT(HFGRTR_EL2_SCXTNUM_EL0        |
                   HFGRTR_EL2_SCXTNUM_EL1,
                   feat_csv2_2_csv2_1p2),
        NEEDS_FEAT(HFGRTR_EL2_LORSA_EL1                |
                   HFGRTR_EL2_LORN_EL1                |
                   HFGRTR_EL2_LORID_EL1                |
                   HFGRTR_EL2_LOREA_EL1                |
                   HFGRTR_EL2_LORC_EL1,
                   FEAT_LOR),
        NEEDS_FEAT(HFGRTR_EL2_APIBKey                |
                   HFGRTR_EL2_APIAKey                |
                   HFGRTR_EL2_APGAKey                |
                   HFGRTR_EL2_APDBKey                |
                   HFGRTR_EL2_APDAKey,
                   feat_pauth),
        NEEDS_FEAT_FLAG(HFGRTR_EL2_VBAR_EL1        |
                        HFGRTR_EL2_TTBR1_EL1        |
                        HFGRTR_EL2_TTBR0_EL1        |
                        HFGRTR_EL2_TPIDR_EL0        |
                        HFGRTR_EL2_TPIDRRO_EL0        |
                        HFGRTR_EL2_TPIDR_EL1        |
                        HFGRTR_EL2_TCR_EL1        |
                        HFGRTR_EL2_SCTLR_EL1        |
                        HFGRTR_EL2_REVIDR_EL1        |
                        HFGRTR_EL2_PAR_EL1        |
                        HFGRTR_EL2_MPIDR_EL1        |
                        HFGRTR_EL2_MIDR_EL1        |
                        HFGRTR_EL2_MAIR_EL1        |
                        HFGRTR_EL2_ISR_EL1        |
                        HFGRTR_EL2_FAR_EL1        |
                        HFGRTR_EL2_ESR_EL1        |
                        HFGRTR_EL2_DCZID_EL0        |
                        HFGRTR_EL2_CTR_EL0        |
                        HFGRTR_EL2_CSSELR_EL1        |
                        HFGRTR_EL2_CPACR_EL1        |
                        HFGRTR_EL2_CONTEXTIDR_EL1|
                        HFGRTR_EL2_CLIDR_EL1        |
                        HFGRTR_EL2_CCSIDR_EL1        |
                        HFGRTR_EL2_AMAIR_EL1        |
                        HFGRTR_EL2_AIDR_EL1        |
                        HFGRTR_EL2_AFSR1_EL1        |
                        HFGRTR_EL2_AFSR0_EL1,
                        NEVER_FGU, FEAT_AA64EL1),
};

/*
 * HFGWTR_EL2: groups of bits and the FEAT_* description (or feat_*()
 * predicate) each group depends on.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~hfgwtr_masks.res0. compute_fgu() walks this table for HFGRTR_GROUP
 * (alongside hfgrtr_feat_map) with NEVER_FGU as its exclude mask, so the
 * NEVER_FGU group at the end is skipped there.
 */
static const struct reg_bits_to_feat_map hfgwtr_feat_map[] = {
        NEEDS_FEAT(HFGWTR_EL2_nAMAIR2_EL1        |
                   HFGWTR_EL2_nMAIR2_EL1,
                   FEAT_AIE),
        NEEDS_FEAT(HFGWTR_EL2_nS2POR_EL1, FEAT_S2POE),
        NEEDS_FEAT(HFGWTR_EL2_nPOR_EL1                |
                   HFGWTR_EL2_nPOR_EL0,
                   FEAT_S1POE),
        NEEDS_FEAT(HFGWTR_EL2_nPIR_EL1                |
                   HFGWTR_EL2_nPIRE0_EL1,
                   FEAT_S1PIE),
        NEEDS_FEAT(HFGWTR_EL2_nRCWMASK_EL1, FEAT_THE),
        NEEDS_FEAT(HFGWTR_EL2_nTPIDR2_EL0        |
                   HFGWTR_EL2_nSMPRI_EL1,
                   FEAT_SME),
        NEEDS_FEAT(HFGWTR_EL2_nGCS_EL1                |
                   HFGWTR_EL2_nGCS_EL0,
                   FEAT_GCS),
        NEEDS_FEAT(HFGWTR_EL2_nACCDATA_EL1, FEAT_LS64_ACCDATA),
        NEEDS_FEAT(HFGWTR_EL2_ERXADDR_EL1        |
                   HFGWTR_EL2_ERXMISCn_EL1        |
                   HFGWTR_EL2_ERXSTATUS_EL1        |
                   HFGWTR_EL2_ERXCTLR_EL1        |
                   HFGWTR_EL2_ERRSELR_EL1,
                   FEAT_RAS),
        NEEDS_FEAT(HFGWTR_EL2_ERXPFGCDN_EL1        |
                   HFGWTR_EL2_ERXPFGCTL_EL1,
                   feat_rasv1p1),
        NEEDS_FEAT(HFGWTR_EL2_ICC_IGRPENn_EL1, FEAT_GICv3),
        NEEDS_FEAT(HFGWTR_EL2_SCXTNUM_EL0        |
                   HFGWTR_EL2_SCXTNUM_EL1,
                   feat_csv2_2_csv2_1p2),
        NEEDS_FEAT(HFGWTR_EL2_LORSA_EL1                |
                   HFGWTR_EL2_LORN_EL1                |
                   HFGWTR_EL2_LOREA_EL1                |
                   HFGWTR_EL2_LORC_EL1,
                   FEAT_LOR),
        NEEDS_FEAT(HFGWTR_EL2_APIBKey                |
                   HFGWTR_EL2_APIAKey                |
                   HFGWTR_EL2_APGAKey                |
                   HFGWTR_EL2_APDBKey                |
                   HFGWTR_EL2_APDAKey,
                   feat_pauth),
        NEEDS_FEAT_FLAG(HFGWTR_EL2_VBAR_EL1        |
                        HFGWTR_EL2_TTBR1_EL1        |
                        HFGWTR_EL2_TTBR0_EL1        |
                        HFGWTR_EL2_TPIDR_EL0        |
                        HFGWTR_EL2_TPIDRRO_EL0        |
                        HFGWTR_EL2_TPIDR_EL1        |
                        HFGWTR_EL2_TCR_EL1        |
                        HFGWTR_EL2_SCTLR_EL1        |
                        HFGWTR_EL2_PAR_EL1        |
                        HFGWTR_EL2_MAIR_EL1        |
                        HFGWTR_EL2_FAR_EL1        |
                        HFGWTR_EL2_ESR_EL1        |
                        HFGWTR_EL2_CSSELR_EL1        |
                        HFGWTR_EL2_CPACR_EL1        |
                        HFGWTR_EL2_CONTEXTIDR_EL1|
                        HFGWTR_EL2_AMAIR_EL1        |
                        HFGWTR_EL2_AFSR1_EL1        |
                        HFGWTR_EL2_AFSR0_EL1,
                        NEVER_FGU, FEAT_AA64EL1),
};

/*
 * HDFGRTR_EL2: groups of bits and the FEAT_* description each group
 * depends on.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~hdfgrtr_masks.res0. compute_fgu() walks this table for HDFGRTR_GROUP
 * with NEVER_FGU as its exclude mask, so the two NEVER_FGU groups at the
 * end are skipped there.
 */
static const struct reg_bits_to_feat_map hdfgrtr_feat_map[] = {
        NEEDS_FEAT(HDFGRTR_EL2_PMBIDR_EL1        |
                   HDFGRTR_EL2_PMSLATFR_EL1        |
                   HDFGRTR_EL2_PMSIRR_EL1        |
                   HDFGRTR_EL2_PMSIDR_EL1        |
                   HDFGRTR_EL2_PMSICR_EL1        |
                   HDFGRTR_EL2_PMSFCR_EL1        |
                   HDFGRTR_EL2_PMSEVFR_EL1        |
                   HDFGRTR_EL2_PMSCR_EL1        |
                   HDFGRTR_EL2_PMBSR_EL1        |
                   HDFGRTR_EL2_PMBPTR_EL1        |
                   HDFGRTR_EL2_PMBLIMITR_EL1,
                   FEAT_SPE),
        NEEDS_FEAT(HDFGRTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE),
        NEEDS_FEAT(HDFGRTR_EL2_nBRBDATA                |
                   HDFGRTR_EL2_nBRBCTL                |
                   HDFGRTR_EL2_nBRBIDR,
                   FEAT_BRBE),
        NEEDS_FEAT(HDFGRTR_EL2_TRCVICTLR        |
                   HDFGRTR_EL2_TRCSTATR                |
                   HDFGRTR_EL2_TRCSSCSRn        |
                   HDFGRTR_EL2_TRCSEQSTR        |
                   HDFGRTR_EL2_TRCPRGCTLR        |
                   HDFGRTR_EL2_TRCOSLSR                |
                   HDFGRTR_EL2_TRCIMSPECn        |
                   HDFGRTR_EL2_TRCID                |
                   HDFGRTR_EL2_TRCCNTVRn        |
                   HDFGRTR_EL2_TRCCLAIM                |
                   HDFGRTR_EL2_TRCAUXCTLR        |
                   HDFGRTR_EL2_TRCAUTHSTATUS        |
                   HDFGRTR_EL2_TRC,
                   FEAT_TRC_SR),
        NEEDS_FEAT(HDFGRTR_EL2_PMCEIDn_EL0        |
                   HDFGRTR_EL2_PMUSERENR_EL0        |
                   HDFGRTR_EL2_PMMIR_EL1        |
                   HDFGRTR_EL2_PMSELR_EL0        |
                   HDFGRTR_EL2_PMOVS                |
                   HDFGRTR_EL2_PMINTEN                |
                   HDFGRTR_EL2_PMCNTEN                |
                   HDFGRTR_EL2_PMCCNTR_EL0        |
                   HDFGRTR_EL2_PMCCFILTR_EL0        |
                   HDFGRTR_EL2_PMEVTYPERn_EL0        |
                   HDFGRTR_EL2_PMEVCNTRn_EL0,
                   FEAT_PMUv3),
        NEEDS_FEAT(HDFGRTR_EL2_TRBTRG_EL1        |
                   HDFGRTR_EL2_TRBSR_EL1        |
                   HDFGRTR_EL2_TRBPTR_EL1        |
                   HDFGRTR_EL2_TRBMAR_EL1        |
                   HDFGRTR_EL2_TRBLIMITR_EL1        |
                   HDFGRTR_EL2_TRBIDR_EL1        |
                   HDFGRTR_EL2_TRBBASER_EL1,
                   FEAT_TRBE),
        NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSDLR_EL1, NEVER_FGU,
                        FEAT_DoubleLock),
        NEEDS_FEAT_FLAG(HDFGRTR_EL2_OSECCR_EL1        |
                        HDFGRTR_EL2_OSLSR_EL1        |
                        HDFGRTR_EL2_DBGPRCR_EL1        |
                        HDFGRTR_EL2_DBGAUTHSTATUS_EL1|
                        HDFGRTR_EL2_DBGCLAIM        |
                        HDFGRTR_EL2_MDSCR_EL1        |
                        HDFGRTR_EL2_DBGWVRn_EL1        |
                        HDFGRTR_EL2_DBGWCRn_EL1        |
                        HDFGRTR_EL2_DBGBVRn_EL1        |
                        HDFGRTR_EL2_DBGBCRn_EL1,
                        NEVER_FGU, FEAT_AA64EL1)
};

/*
 * HDFGWTR_EL2: groups of bits and the FEAT_* description each group
 * depends on.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~hdfgwtr_masks.res0. compute_fgu() walks this table for HDFGRTR_GROUP
 * (alongside hdfgrtr_feat_map) with NEVER_FGU as its exclude mask, so the
 * NEVER_FGU groups are skipped there.
 */
static const struct reg_bits_to_feat_map hdfgwtr_feat_map[] = {
        NEEDS_FEAT(HDFGWTR_EL2_PMSLATFR_EL1        |
                   HDFGWTR_EL2_PMSIRR_EL1        |
                   HDFGWTR_EL2_PMSICR_EL1        |
                   HDFGWTR_EL2_PMSFCR_EL1        |
                   HDFGWTR_EL2_PMSEVFR_EL1        |
                   HDFGWTR_EL2_PMSCR_EL1        |
                   HDFGWTR_EL2_PMBSR_EL1        |
                   HDFGWTR_EL2_PMBPTR_EL1        |
                   HDFGWTR_EL2_PMBLIMITR_EL1,
                   FEAT_SPE),
        NEEDS_FEAT(HDFGWTR_EL2_nPMSNEVFR_EL1, FEAT_SPE_FnE),
        NEEDS_FEAT(HDFGWTR_EL2_nBRBDATA                |
                   HDFGWTR_EL2_nBRBCTL,
                   FEAT_BRBE),
        NEEDS_FEAT(HDFGWTR_EL2_TRCVICTLR        |
                   HDFGWTR_EL2_TRCSSCSRn        |
                   HDFGWTR_EL2_TRCSEQSTR        |
                   HDFGWTR_EL2_TRCPRGCTLR        |
                   HDFGWTR_EL2_TRCOSLAR                |
                   HDFGWTR_EL2_TRCIMSPECn        |
                   HDFGWTR_EL2_TRCCNTVRn        |
                   HDFGWTR_EL2_TRCCLAIM                |
                   HDFGWTR_EL2_TRCAUXCTLR        |
                   HDFGWTR_EL2_TRC,
                   FEAT_TRC_SR),
        NEEDS_FEAT(HDFGWTR_EL2_PMUSERENR_EL0        |
                   HDFGWTR_EL2_PMCR_EL0                |
                   HDFGWTR_EL2_PMSWINC_EL0        |
                   HDFGWTR_EL2_PMSELR_EL0        |
                   HDFGWTR_EL2_PMOVS                |
                   HDFGWTR_EL2_PMINTEN                |
                   HDFGWTR_EL2_PMCNTEN                |
                   HDFGWTR_EL2_PMCCNTR_EL0        |
                   HDFGWTR_EL2_PMCCFILTR_EL0        |
                   HDFGWTR_EL2_PMEVTYPERn_EL0        |
                   HDFGWTR_EL2_PMEVCNTRn_EL0,
                   FEAT_PMUv3),
        NEEDS_FEAT(HDFGWTR_EL2_TRBTRG_EL1        |
                   HDFGWTR_EL2_TRBSR_EL1        |
                   HDFGWTR_EL2_TRBPTR_EL1        |
                   HDFGWTR_EL2_TRBMAR_EL1        |
                   HDFGWTR_EL2_TRBLIMITR_EL1        |
                   HDFGWTR_EL2_TRBBASER_EL1,
                   FEAT_TRBE),
        NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSDLR_EL1,
                        NEVER_FGU, FEAT_DoubleLock),
        NEEDS_FEAT_FLAG(HDFGWTR_EL2_OSECCR_EL1        |
                        HDFGWTR_EL2_OSLAR_EL1        |
                        HDFGWTR_EL2_DBGPRCR_EL1        |
                        HDFGWTR_EL2_DBGCLAIM        |
                        HDFGWTR_EL2_MDSCR_EL1        |
                        HDFGWTR_EL2_DBGWVRn_EL1        |
                        HDFGWTR_EL2_DBGWCRn_EL1        |
                        HDFGWTR_EL2_DBGBVRn_EL1        |
                        HDFGWTR_EL2_DBGBCRn_EL1,
                        NEVER_FGU, FEAT_AA64EL1),
        NEEDS_FEAT(HDFGWTR_EL2_TRFCR_EL1, FEAT_TRF),
};


/*
 * HFGITR_EL2: groups of bits and the FEAT_* description each group
 * depends on.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~hfgitr_masks.res0. compute_fgu() walks this table for HFGITR_GROUP
 * with NEVER_FGU as its exclude mask, so the NEVER_FGU group at the end
 * is skipped there.
 */
static const struct reg_bits_to_feat_map hfgitr_feat_map[] = {
        NEEDS_FEAT(HFGITR_EL2_PSBCSYNC, FEAT_SPEv1p5),
        NEEDS_FEAT(HFGITR_EL2_ATS1E1A, FEAT_ATS1A),
        NEEDS_FEAT(HFGITR_EL2_COSPRCTX, FEAT_SPECRES2),
        NEEDS_FEAT(HFGITR_EL2_nGCSEPP                |
                   HFGITR_EL2_nGCSSTR_EL1        |
                   HFGITR_EL2_nGCSPUSHM_EL1,
                   FEAT_GCS),
        NEEDS_FEAT(HFGITR_EL2_nBRBIALL                |
                   HFGITR_EL2_nBRBINJ,
                   FEAT_BRBE),
        NEEDS_FEAT(HFGITR_EL2_CPPRCTX                |
                   HFGITR_EL2_DVPRCTX                |
                   HFGITR_EL2_CFPRCTX,
                   FEAT_SPECRES),
        NEEDS_FEAT(HFGITR_EL2_TLBIRVAALE1        |
                   HFGITR_EL2_TLBIRVALE1        |
                   HFGITR_EL2_TLBIRVAAE1        |
                   HFGITR_EL2_TLBIRVAE1                |
                   HFGITR_EL2_TLBIRVAALE1IS        |
                   HFGITR_EL2_TLBIRVALE1IS        |
                   HFGITR_EL2_TLBIRVAAE1IS        |
                   HFGITR_EL2_TLBIRVAE1IS        |
                   HFGITR_EL2_TLBIRVAALE1OS        |
                   HFGITR_EL2_TLBIRVALE1OS        |
                   HFGITR_EL2_TLBIRVAAE1OS        |
                   HFGITR_EL2_TLBIRVAE1OS,
                   FEAT_TLBIRANGE),
        NEEDS_FEAT(HFGITR_EL2_TLBIVAALE1OS        |
                   HFGITR_EL2_TLBIVALE1OS        |
                   HFGITR_EL2_TLBIVAAE1OS        |
                   HFGITR_EL2_TLBIASIDE1OS        |
                   HFGITR_EL2_TLBIVAE1OS        |
                   HFGITR_EL2_TLBIVMALLE1OS,
                   FEAT_TLBIOS),
        NEEDS_FEAT(HFGITR_EL2_ATS1E1WP                |
                   HFGITR_EL2_ATS1E1RP,
                   FEAT_PAN2),
        NEEDS_FEAT(HFGITR_EL2_DCCVADP, FEAT_DPB2),
        NEEDS_FEAT_FLAG(HFGITR_EL2_DCCVAC        |
                        HFGITR_EL2_SVC_EL1        |
                        HFGITR_EL2_SVC_EL0        |
                        HFGITR_EL2_ERET                |
                        HFGITR_EL2_TLBIVAALE1        |
                        HFGITR_EL2_TLBIVALE1        |
                        HFGITR_EL2_TLBIVAAE1        |
                        HFGITR_EL2_TLBIASIDE1        |
                        HFGITR_EL2_TLBIVAE1        |
                        HFGITR_EL2_TLBIVMALLE1        |
                        HFGITR_EL2_TLBIVAALE1IS        |
                        HFGITR_EL2_TLBIVALE1IS        |
                        HFGITR_EL2_TLBIVAAE1IS        |
                        HFGITR_EL2_TLBIASIDE1IS        |
                        HFGITR_EL2_TLBIVAE1IS        |
                        HFGITR_EL2_TLBIVMALLE1IS|
                        HFGITR_EL2_ATS1E0W        |
                        HFGITR_EL2_ATS1E0R        |
                        HFGITR_EL2_ATS1E1W        |
                        HFGITR_EL2_ATS1E1R        |
                        HFGITR_EL2_DCZVA        |
                        HFGITR_EL2_DCCIVAC        |
                        HFGITR_EL2_DCCVAP        |
                        HFGITR_EL2_DCCVAU        |
                        HFGITR_EL2_DCCISW        |
                        HFGITR_EL2_DCCSW        |
                        HFGITR_EL2_DCISW        |
                        HFGITR_EL2_DCIVAC        |
                        HFGITR_EL2_ICIVAU        |
                        HFGITR_EL2_ICIALLU        |
                        HFGITR_EL2_ICIALLUIS,
                        NEVER_FGU, FEAT_AA64EL1),
};

/*
 * HAFGRTR_EL2: every bit listed here depends on FEAT_AMUv1.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~hafgrtr_masks.res0, and compute_fgu() walks this table for
 * HAFGRTR_GROUP.
 */
static const struct reg_bits_to_feat_map hafgrtr_feat_map[] = {
        NEEDS_FEAT(HAFGRTR_EL2_AMEVTYPER115_EL0        |
                   HAFGRTR_EL2_AMEVTYPER114_EL0        |
                   HAFGRTR_EL2_AMEVTYPER113_EL0        |
                   HAFGRTR_EL2_AMEVTYPER112_EL0        |
                   HAFGRTR_EL2_AMEVTYPER111_EL0        |
                   HAFGRTR_EL2_AMEVTYPER110_EL0        |
                   HAFGRTR_EL2_AMEVTYPER19_EL0        |
                   HAFGRTR_EL2_AMEVTYPER18_EL0        |
                   HAFGRTR_EL2_AMEVTYPER17_EL0        |
                   HAFGRTR_EL2_AMEVTYPER16_EL0        |
                   HAFGRTR_EL2_AMEVTYPER15_EL0        |
                   HAFGRTR_EL2_AMEVTYPER14_EL0        |
                   HAFGRTR_EL2_AMEVTYPER13_EL0        |
                   HAFGRTR_EL2_AMEVTYPER12_EL0        |
                   HAFGRTR_EL2_AMEVTYPER11_EL0        |
                   HAFGRTR_EL2_AMEVTYPER10_EL0        |
                   HAFGRTR_EL2_AMEVCNTR115_EL0        |
                   HAFGRTR_EL2_AMEVCNTR114_EL0        |
                   HAFGRTR_EL2_AMEVCNTR113_EL0        |
                   HAFGRTR_EL2_AMEVCNTR112_EL0        |
                   HAFGRTR_EL2_AMEVCNTR111_EL0        |
                   HAFGRTR_EL2_AMEVCNTR110_EL0        |
                   HAFGRTR_EL2_AMEVCNTR19_EL0        |
                   HAFGRTR_EL2_AMEVCNTR18_EL0        |
                   HAFGRTR_EL2_AMEVCNTR17_EL0        |
                   HAFGRTR_EL2_AMEVCNTR16_EL0        |
                   HAFGRTR_EL2_AMEVCNTR15_EL0        |
                   HAFGRTR_EL2_AMEVCNTR14_EL0        |
                   HAFGRTR_EL2_AMEVCNTR13_EL0        |
                   HAFGRTR_EL2_AMEVCNTR12_EL0        |
                   HAFGRTR_EL2_AMEVCNTR11_EL0        |
                   HAFGRTR_EL2_AMEVCNTR10_EL0        |
                   HAFGRTR_EL2_AMCNTEN1                |
                   HAFGRTR_EL2_AMCNTEN0                |
                   HAFGRTR_EL2_AMEVCNTR03_EL0        |
                   HAFGRTR_EL2_AMEVCNTR02_EL0        |
                   HAFGRTR_EL2_AMEVCNTR01_EL0        |
                   HAFGRTR_EL2_AMEVCNTR00_EL0,
                   FEAT_AMUv1),
};

/*
 * HFGITR2_EL2: bits and the FEAT_* description each one depends on.
 * compute_fgu() walks this table for HFGITR2_GROUP.
 *
 * NOTE(review): check_feature_map() does not pass this table to
 * check_feat_map(), so its coverage is not validated.
 */
static const struct reg_bits_to_feat_map hfgitr2_feat_map[] = {
        NEEDS_FEAT(HFGITR2_EL2_nDCCIVAPS, FEAT_PoPS),
        NEEDS_FEAT(HFGITR2_EL2_TSBCSYNC, FEAT_TRBEv1p1)
};

/*
 * HFGRTR2_EL2: groups of bits and the FEAT_* description each group
 * depends on. compute_fgu() walks this table for HFGRTR2_GROUP.
 *
 * NOTE(review): check_feature_map() does not pass this table to
 * check_feat_map(), so its coverage is not validated.
 */
static const struct reg_bits_to_feat_map hfgrtr2_feat_map[] = {
        NEEDS_FEAT(HFGRTR2_EL2_nPFAR_EL1, FEAT_PFAR),
        NEEDS_FEAT(HFGRTR2_EL2_nERXGSR_EL1, FEAT_RASv2),
        NEEDS_FEAT(HFGRTR2_EL2_nACTLRALIAS_EL1        |
                   HFGRTR2_EL2_nACTLRMASK_EL1        |
                   HFGRTR2_EL2_nCPACRALIAS_EL1        |
                   HFGRTR2_EL2_nCPACRMASK_EL1        |
                   HFGRTR2_EL2_nSCTLR2MASK_EL1        |
                   HFGRTR2_EL2_nSCTLRALIAS2_EL1        |
                   HFGRTR2_EL2_nSCTLRALIAS_EL1        |
                   HFGRTR2_EL2_nSCTLRMASK_EL1        |
                   HFGRTR2_EL2_nTCR2ALIAS_EL1        |
                   HFGRTR2_EL2_nTCR2MASK_EL1        |
                   HFGRTR2_EL2_nTCRALIAS_EL1        |
                   HFGRTR2_EL2_nTCRMASK_EL1,
                   FEAT_SRMASK),
        NEEDS_FEAT(HFGRTR2_EL2_nRCWSMASK_EL1, FEAT_THE),
};

/*
 * HFGWTR2_EL2: groups of bits and the FEAT_* description each group
 * depends on. compute_fgu() walks this table for HFGRTR2_GROUP
 * (alongside hfgrtr2_feat_map).
 *
 * NOTE(review): check_feature_map() does not pass this table to
 * check_feat_map(), so its coverage is not validated.
 */
static const struct reg_bits_to_feat_map hfgwtr2_feat_map[] = {
        NEEDS_FEAT(HFGWTR2_EL2_nPFAR_EL1, FEAT_PFAR),
        NEEDS_FEAT(HFGWTR2_EL2_nACTLRALIAS_EL1        |
                   HFGWTR2_EL2_nACTLRMASK_EL1        |
                   HFGWTR2_EL2_nCPACRALIAS_EL1        |
                   HFGWTR2_EL2_nCPACRMASK_EL1        |
                   HFGWTR2_EL2_nSCTLR2MASK_EL1        |
                   HFGWTR2_EL2_nSCTLRALIAS2_EL1        |
                   HFGWTR2_EL2_nSCTLRALIAS_EL1        |
                   HFGWTR2_EL2_nSCTLRMASK_EL1        |
                   HFGWTR2_EL2_nTCR2ALIAS_EL1        |
                   HFGWTR2_EL2_nTCR2MASK_EL1        |
                   HFGWTR2_EL2_nTCRALIAS_EL1        |
                   HFGWTR2_EL2_nTCRMASK_EL1,
                   FEAT_SRMASK),
        NEEDS_FEAT(HFGWTR2_EL2_nRCWSMASK_EL1, FEAT_THE),
};

/*
 * HDFGRTR2_EL2: groups of bits and the FEAT_* description (or feat_*()
 * predicate) each group depends on.
 *
 * NOTE(review): check_feature_map() does not pass this table to
 * check_feat_map(), so its coverage is not validated.
 */
static const struct reg_bits_to_feat_map hdfgrtr2_feat_map[] = {
        NEEDS_FEAT(HDFGRTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss),
        NEEDS_FEAT(HDFGRTR2_EL2_nTRCITECR_EL1, FEAT_ITE),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMICFILTR_EL0        |
                   HDFGRTR2_EL2_nPMICNTR_EL0,
                   FEAT_PMUv3_ICNTR),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMUACR_EL1, FEAT_PMUv3p9),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMSSCR_EL1        |
                   HDFGRTR2_EL2_nPMSSDATA,
                   FEAT_PMUv3_SS),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMIAR_EL1, FEAT_SEBEP),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMSDSFR_EL1, feat_spe_fds),
        NEEDS_FEAT(HDFGRTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM),
        NEEDS_FEAT(HDFGRTR2_EL2_nSPMACCESSR_EL1        |
                   HDFGRTR2_EL2_nSPMCNTEN        |
                   HDFGRTR2_EL2_nSPMCR_EL0        |
                   HDFGRTR2_EL2_nSPMDEVAFF_EL1        |
                   HDFGRTR2_EL2_nSPMEVCNTRn_EL0        |
                   HDFGRTR2_EL2_nSPMEVTYPERn_EL0|
                   HDFGRTR2_EL2_nSPMID                |
                   HDFGRTR2_EL2_nSPMINTEN        |
                   HDFGRTR2_EL2_nSPMOVS                |
                   HDFGRTR2_EL2_nSPMSCR_EL1        |
                   HDFGRTR2_EL2_nSPMSELR_EL0,
                   FEAT_SPMU),
        NEEDS_FEAT(HDFGRTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2),
        NEEDS_FEAT(HDFGRTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam),
};

/*
 * HDFGWTR2_EL2: groups of bits and the FEAT_* description (or feat_*()
 * predicate) each group depends on.
 *
 * NOTE(review): check_feature_map() does not pass this table to
 * check_feat_map(), so its coverage is not validated.
 */
static const struct reg_bits_to_feat_map hdfgwtr2_feat_map[] = {
        NEEDS_FEAT(HDFGWTR2_EL2_nMDSELR_EL1, FEAT_Debugv8p9),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMECR_EL1, feat_ebep_pmuv3_ss),
        NEEDS_FEAT(HDFGWTR2_EL2_nTRCITECR_EL1, FEAT_ITE),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMICFILTR_EL0        |
                   HDFGWTR2_EL2_nPMICNTR_EL0,
                   FEAT_PMUv3_ICNTR),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMUACR_EL1        |
                   HDFGWTR2_EL2_nPMZR_EL0,
                   FEAT_PMUv3p9),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMSSCR_EL1, FEAT_PMUv3_SS),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMIAR_EL1, FEAT_SEBEP),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMSDSFR_EL1, feat_spe_fds),
        NEEDS_FEAT(HDFGWTR2_EL2_nPMBMAR_EL1, FEAT_SPE_nVM),
        NEEDS_FEAT(HDFGWTR2_EL2_nSPMACCESSR_EL1        |
                   HDFGWTR2_EL2_nSPMCNTEN        |
                   HDFGWTR2_EL2_nSPMCR_EL0        |
                   HDFGWTR2_EL2_nSPMEVCNTRn_EL0        |
                   HDFGWTR2_EL2_nSPMEVTYPERn_EL0|
                   HDFGWTR2_EL2_nSPMINTEN        |
                   HDFGWTR2_EL2_nSPMOVS                |
                   HDFGWTR2_EL2_nSPMSCR_EL1        |
                   HDFGWTR2_EL2_nSPMSELR_EL0,
                   FEAT_SPMU),
        NEEDS_FEAT(HDFGWTR2_EL2_nMDSTEPOP_EL1, FEAT_STEP2),
        NEEDS_FEAT(HDFGWTR2_EL2_nTRBMPAM_EL1, feat_trbe_mpam),
};

/*
 * HCRX_EL2: groups of bits and the FEAT_* description (or feat_*()
 * predicate) each group depends on.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~__HCRX_EL2_RES0.
 */
static const struct reg_bits_to_feat_map hcrx_feat_map[] = {
        NEEDS_FEAT(HCRX_EL2_PACMEn, feat_pauth_lr),
        NEEDS_FEAT(HCRX_EL2_EnFPM, FEAT_FPMR),
        NEEDS_FEAT(HCRX_EL2_GCSEn, FEAT_GCS),
        NEEDS_FEAT(HCRX_EL2_EnIDCP128, FEAT_SYSREG128),
        NEEDS_FEAT(HCRX_EL2_EnSDERR, feat_aderr),
        NEEDS_FEAT(HCRX_EL2_TMEA, FEAT_DoubleFault2),
        NEEDS_FEAT(HCRX_EL2_EnSNERR, feat_anerr),
        NEEDS_FEAT(HCRX_EL2_D128En, FEAT_D128),
        NEEDS_FEAT(HCRX_EL2_PTTWI, FEAT_THE),
        NEEDS_FEAT(HCRX_EL2_SCTLR2En, FEAT_SCTLR2),
        NEEDS_FEAT(HCRX_EL2_TCR2En, FEAT_TCR2),
        NEEDS_FEAT(HCRX_EL2_MSCEn                |
                   HCRX_EL2_MCE2,
                   FEAT_MOPS),
        NEEDS_FEAT(HCRX_EL2_CMOW, FEAT_CMOW),
        NEEDS_FEAT(HCRX_EL2_VFNMI                |
                   HCRX_EL2_VINMI                |
                   HCRX_EL2_TALLINT,
                   FEAT_NMI),
        NEEDS_FEAT(HCRX_EL2_SMPME, feat_sme_smps),
        NEEDS_FEAT(HCRX_EL2_FGTnXS                |
                   HCRX_EL2_FnXS,
                   FEAT_XS),
        NEEDS_FEAT(HCRX_EL2_EnASR, FEAT_LS64_V),
        NEEDS_FEAT(HCRX_EL2_EnALS, FEAT_LS64),
        NEEDS_FEAT(HCRX_EL2_EnAS0, FEAT_LS64_ACCDATA),
};

/*
 * HCR_EL2: groups of bits and the FEAT_* description (or feat_*()
 * predicate) each group depends on. The RW and E2H bits use
 * NEEDS_FEAT_FIXED() with compute_hcr_rw() / compute_hcr_e2h(), which
 * set or clear the bit in the value they are handed.
 *
 * check_feature_map() compares the union of the bits listed here with
 * ~HCR_EL2_RES0.
 */
static const struct reg_bits_to_feat_map hcr_feat_map[] = {
        NEEDS_FEAT(HCR_EL2_TID0, FEAT_AA32EL0),
        NEEDS_FEAT_FIXED(HCR_EL2_RW, compute_hcr_rw),
        NEEDS_FEAT(HCR_EL2_HCD, not_feat_aa64el3),
        NEEDS_FEAT(HCR_EL2_AMO                |
                   HCR_EL2_BSU                |
                   HCR_EL2_CD                |
                   HCR_EL2_DC                |
                   HCR_EL2_FB                |
                   HCR_EL2_FMO                |
                   HCR_EL2_ID                |
                   HCR_EL2_IMO                |
                   HCR_EL2_MIOCNCE        |
                   HCR_EL2_PTW                |
                   HCR_EL2_SWIO                |
                   HCR_EL2_TACR                |
                   HCR_EL2_TDZ                |
                   HCR_EL2_TGE                |
                   HCR_EL2_TID1                |
                   HCR_EL2_TID2                |
                   HCR_EL2_TID3                |
                   HCR_EL2_TIDCP        |
                   HCR_EL2_TPCP                |
                   HCR_EL2_TPU                |
                   HCR_EL2_TRVM                |
                   HCR_EL2_TSC                |
                   HCR_EL2_TSW                |
                   HCR_EL2_TTLB                |
                   HCR_EL2_TVM                |
                   HCR_EL2_TWE                |
                   HCR_EL2_TWI                |
                   HCR_EL2_VF                |
                   HCR_EL2_VI                |
                   HCR_EL2_VM                |
                   HCR_EL2_VSE,
                   FEAT_AA64EL1),
        NEEDS_FEAT(HCR_EL2_AMVOFFEN, FEAT_AMUv1p1),
        NEEDS_FEAT(HCR_EL2_EnSCXT, feat_csv2_2_csv2_1p2),
        NEEDS_FEAT(HCR_EL2_TICAB        |
                   HCR_EL2_TID4                |
                   HCR_EL2_TOCU,
                   FEAT_EVT),
        NEEDS_FEAT(HCR_EL2_TTLBIS        |
                   HCR_EL2_TTLBOS,
                   FEAT_EVT_TTLBxS),
        NEEDS_FEAT(HCR_EL2_TLOR, FEAT_LOR),
        NEEDS_FEAT(HCR_EL2_ATA                |
                   HCR_EL2_DCT                |
                   HCR_EL2_TID5,
                   FEAT_MTE2),
        NEEDS_FEAT(HCR_EL2_AT                | /* Ignore the original FEAT_NV */
                   HCR_EL2_NV2                |
                   HCR_EL2_NV,
                   feat_nv2),
        NEEDS_FEAT(HCR_EL2_NV1, feat_nv2_e2h0_ni), /* Missing from JSON */
        NEEDS_FEAT(HCR_EL2_API                |
                   HCR_EL2_APK,
                   feat_pauth),
        NEEDS_FEAT(HCR_EL2_TEA                |
                   HCR_EL2_TERR,
                   FEAT_RAS),
        NEEDS_FEAT(HCR_EL2_FIEN, feat_rasv1p1),
        NEEDS_FEAT(HCR_EL2_GPF, FEAT_RME),
        NEEDS_FEAT(HCR_EL2_FWB, FEAT_S2FWB),
        NEEDS_FEAT(HCR_EL2_TME, FEAT_TME),
        NEEDS_FEAT(HCR_EL2_TWEDEL        |
                   HCR_EL2_TWEDEn,
                   FEAT_TWED),
        NEEDS_FEAT_FIXED(HCR_EL2_E2H, compute_hcr_e2h),
};

/*
 * Verify that the bits described by @map are exactly the complement of
 * @res0. On a mismatch, log the offending bits, using @str to name the
 * register in the message.
 */
static void __init check_feat_map(const struct reg_bits_to_feat_map *map,
                                  int map_size, u64 res0, const char *str)
{
        const u64 expected = ~res0;
        u64 described = 0;
        int idx;

        /* Accumulate every bit that appears in at least one entry */
        for (idx = 0; idx < map_size; idx++)
                described |= map[idx].bits;

        if (described == expected)
                return;

        /* The XOR leaves only the bits on which the two views disagree */
        kvm_err("Undefined %s behaviour, bits %016llx\n",
                str, described ^ expected);
}

/*
 * Init-time sanity pass: hand each feature map below to check_feat_map()
 * together with the RES0 mask and name it must agree with.
 */
void __init check_feature_map(void)
{
/* Tables whose RES0 mask and name come from the matching *_masks object */
#define CHECK_FGT_MAP(r)                                                \
        check_feat_map(r##_feat_map, ARRAY_SIZE(r##_feat_map),          \
                       r##_masks.res0, r##_masks.str)
/* Tables whose RES0 mask and name are given explicitly */
#define CHECK_REG_MAP(r, res0, name)                                    \
        check_feat_map(r##_feat_map, ARRAY_SIZE(r##_feat_map),          \
                       (res0), (name))

        CHECK_FGT_MAP(hfgrtr);
        CHECK_FGT_MAP(hfgwtr);
        CHECK_FGT_MAP(hfgitr);
        CHECK_FGT_MAP(hdfgrtr);
        CHECK_FGT_MAP(hdfgwtr);
        CHECK_FGT_MAP(hafgrtr);

        CHECK_REG_MAP(hcrx, __HCRX_EL2_RES0, "HCRX_EL2");
        CHECK_REG_MAP(hcr, HCR_EL2_RES0, "HCR_EL2");

#undef CHECK_REG_MAP
#undef CHECK_FGT_MAP
}

/*
 * Extract the field described by @map (regidx/shift/width) from the VM's
 * ID registers and report whether it is at least @map->lo_lim. The
 * comparison is signed when @map->sign is set and unsigned otherwise.
 */
static bool idreg_feat_match(struct kvm *kvm, const struct reg_bits_to_feat_map *map)
{
        const u64 field_mask = GENMASK(map->width - 1, 0);
        u64 field = (kvm->arch.id_regs[map->regidx] >> map->shift) & field_mask;
        s64 sval, slimit;

        if (!map->sign)
                return field >= map->lo_lim;

        /* Signed field: extend both operands from the field's top bit */
        sval = sign_extend64(field, map->width - 1);
        slimit = sign_extend64(map->lo_lim, map->width - 1);

        return sval >= slimit;
}

/*
 * Walk @map and return the union of the bits of the selected entries.
 *
 * An entry is considered only if all of the @require flags are set on it
 * and none of the @exclude flags are. Its bits are added to the result
 * when its feature check fails, or unconditionally when it is flagged
 * FIXED_VALUE. CALL_FUNC entries are evaluated through their callback
 * (fval, which receives @fixed_bits, for FIXED_VALUE entries; match
 * otherwise); all other entries go through idreg_feat_match().
 */
static u64 __compute_fixed_bits(struct kvm *kvm,
                                const struct reg_bits_to_feat_map *map,
                                int map_size,
                                u64 *fixed_bits,
                                unsigned long require,
                                unsigned long exclude)
{
        u64 val = 0;

        for (int i = 0; i < map_size; i++) {
                const struct reg_bits_to_feat_map *entry = &map[i];
                const bool fixed = !!(entry->flags & FIXED_VALUE);
                bool match;

                /* Filter out entries the caller is not interested in */
                if ((entry->flags & require) != require ||
                    (entry->flags & exclude))
                        continue;

                if (!(entry->flags & CALL_FUNC))
                        match = idreg_feat_match(kvm, entry);
                else if (fixed)
                        match = entry->fval(kvm, fixed_bits);
                else
                        match = entry->match(kvm);

                if (fixed || !match)
                        val |= entry->bits;
        }

        return val;
}

/*
 * Return the bits of @map whose feature check fails, honouring @require
 * and @exclude. FIXED_VALUE entries are always excluded, and no
 * fixed-value storage is passed down.
 */
static u64 compute_res0_bits(struct kvm *kvm,
                             const struct reg_bits_to_feat_map *map,
                             int map_size,
                             unsigned long require,
                             unsigned long exclude)
{
        const unsigned long skip = exclude | FIXED_VALUE;

        return __compute_fixed_bits(kvm, map, map_size, NULL, require, skip);
}

/*
 * Return the bits of the FIXED_VALUE entries of @map, honouring @require
 * and @exclude. @fixed_bits is handed to __compute_fixed_bits(), which
 * passes it on to the fval callbacks.
 */
static u64 compute_fixed_bits(struct kvm *kvm,
                              const struct reg_bits_to_feat_map *map,
                              int map_size,
                              u64 *fixed_bits,
                              unsigned long require,
                              unsigned long exclude)
{
        const unsigned long want = require | FIXED_VALUE;

        return __compute_fixed_bits(kvm, map, map_size, fixed_bits,
                                    want, exclude);
}

void compute_fgu(struct kvm *kvm, enum fgt_group_id fgt)
{
        u64 val = 0;

        switch (fgt) {
        case HFGRTR_GROUP:
                val |= compute_res0_bits(kvm, hfgrtr_feat_map,
                                         ARRAY_SIZE(hfgrtr_feat_map),
                                         0, NEVER_FGU);
                val |= compute_res0_bits(kvm, hfgwtr_feat_map,
                                         ARRAY_SIZE(hfgwtr_feat_map),
                                         0, NEVER_FGU);
                break;
        case HFGITR_GROUP:
                val |= compute_res0_bits(kvm, hfgitr_feat_map,
                                         ARRAY_SIZE(hfgitr_feat_map),
                                         0, NEVER_FGU);
                break;
        case HDFGRTR_GROUP:
                val |= compute_res0_bits(kvm, hdfgrtr_feat_map,
                                         ARRAY_SIZE(hdfgrtr_feat_map),
                                         0, NEVER_FGU);
                val |= compute_res0_bits(kvm, hdfgwtr_feat_map,
                                         ARRAY_SIZE(hdfgwtr_feat_map),
                                         0, NEVER_FGU);
                break;
        case HAFGRTR_GROUP:
                val |= compute_res0_bits(kvm, hafgrtr_feat_map,
                                         ARRAY_SIZE(hafgrtr_feat_map),
                                         0, NEVER_FGU);
                break;
        case HFGRTR2_GROUP:
                val |= compute_res0_bits(kvm, hfgrtr2_feat_map,
                                         ARRAY_SIZE(hfgrtr2_feat_map),
                                         0, NEVER_FGU);
                val |= compute_res0_bits(kvm, hfgwtr2_feat_map,
                                         ARRAY_SIZE(hfgwtr2_feat_map),
                                         0, NEVER_FGU);
                break;
        case HFGITR2_GROUP:
                val |= compute_res0_bits(kvm, hfgitr2_feat_map,
                                         ARRAY_SIZE(hfgitr2_feat_map),
                                         0, NEVER_FGU);
                break;
        case HDFGRTR2_GROUP:
                val |= compute_res0_bits(kvm, hdfgrtr2_feat_map,
                                         ARRAY_SIZE(hdfgrtr2_feat_map),
                                         0, NEVER_FGU);
                val |= compute_res0_bits(kvm, hdfgwtr2_feat_map,
                                         ARRAY_SIZE(hdfgwtr2_feat_map),
                                         0, NEVER_FGU);
                break;
        default:
                BUG();
        }

        kvm->arch.fgu[fgt] = val;
}

/*
 * Report the RES0 and RES1 masks that apply to @reg for this VM.
 *
 * For each supported register, *@res0 is the OR of compute_res0_bits() over
 * the register's feature map and the register's static RES0 mask, and
 * *@res1 is the register's static RES1 mask.
 *
 * HCR_EL2 additionally folds in its FIXED_VALUE entries: each bit returned
 * by compute_fixed_bits() goes to *@res1 if it is set in the value handed
 * to the fval() callbacks through &fixed, and to *@res0 otherwise.
 *
 * Any other @reg triggers a one-time warning and yields two empty masks.
 */
void get_reg_fixed_bits(struct kvm *kvm, enum vcpu_sysreg reg, u64 *res0, u64 *res1)
{
        u64 r0, r1;

        switch (reg) {
        case HFGRTR_EL2:
                r0 = compute_res0_bits(kvm, hfgrtr_feat_map,
                                       ARRAY_SIZE(hfgrtr_feat_map), 0, 0);
                r0 |= hfgrtr_masks.res0;
                r1 = HFGRTR_EL2_RES1;
                break;
        case HFGWTR_EL2:
                r0 = compute_res0_bits(kvm, hfgwtr_feat_map,
                                       ARRAY_SIZE(hfgwtr_feat_map), 0, 0);
                r0 |= hfgwtr_masks.res0;
                r1 = HFGWTR_EL2_RES1;
                break;
        case HFGITR_EL2:
                r0 = compute_res0_bits(kvm, hfgitr_feat_map,
                                       ARRAY_SIZE(hfgitr_feat_map), 0, 0);
                r0 |= hfgitr_masks.res0;
                r1 = HFGITR_EL2_RES1;
                break;
        case HDFGRTR_EL2:
                r0 = compute_res0_bits(kvm, hdfgrtr_feat_map,
                                       ARRAY_SIZE(hdfgrtr_feat_map), 0, 0);
                r0 |= hdfgrtr_masks.res0;
                r1 = HDFGRTR_EL2_RES1;
                break;
        case HDFGWTR_EL2:
                r0 = compute_res0_bits(kvm, hdfgwtr_feat_map,
                                       ARRAY_SIZE(hdfgwtr_feat_map), 0, 0);
                r0 |= hdfgwtr_masks.res0;
                r1 = HDFGWTR_EL2_RES1;
                break;
        case HAFGRTR_EL2:
                r0 = compute_res0_bits(kvm, hafgrtr_feat_map,
                                       ARRAY_SIZE(hafgrtr_feat_map), 0, 0);
                r0 |= hafgrtr_masks.res0;
                r1 = HAFGRTR_EL2_RES1;
                break;
        case HFGRTR2_EL2:
                r0 = compute_res0_bits(kvm, hfgrtr2_feat_map,
                                       ARRAY_SIZE(hfgrtr2_feat_map), 0, 0);
                r0 |= hfgrtr2_masks.res0;
                r1 = HFGRTR2_EL2_RES1;
                break;
        case HFGWTR2_EL2:
                r0 = compute_res0_bits(kvm, hfgwtr2_feat_map,
                                       ARRAY_SIZE(hfgwtr2_feat_map), 0, 0);
                r0 |= hfgwtr2_masks.res0;
                r1 = HFGWTR2_EL2_RES1;
                break;
        case HFGITR2_EL2:
                r0 = compute_res0_bits(kvm, hfgitr2_feat_map,
                                       ARRAY_SIZE(hfgitr2_feat_map), 0, 0);
                r0 |= hfgitr2_masks.res0;
                r1 = HFGITR2_EL2_RES1;
                break;
        case HDFGRTR2_EL2:
                r0 = compute_res0_bits(kvm, hdfgrtr2_feat_map,
                                       ARRAY_SIZE(hdfgrtr2_feat_map), 0, 0);
                r0 |= hdfgrtr2_masks.res0;
                r1 = HDFGRTR2_EL2_RES1;
                break;
        case HDFGWTR2_EL2:
                r0 = compute_res0_bits(kvm, hdfgwtr2_feat_map,
                                       ARRAY_SIZE(hdfgwtr2_feat_map), 0, 0);
                r0 |= hdfgwtr2_masks.res0;
                r1 = HDFGWTR2_EL2_RES1;
                break;
        case HCRX_EL2:
                r0 = compute_res0_bits(kvm, hcrx_feat_map,
                                       ARRAY_SIZE(hcrx_feat_map), 0, 0);
                r0 |= __HCRX_EL2_RES0;
                r1 = __HCRX_EL2_RES1;
                break;
        case HCR_EL2: {
                u64 fixed = 0;
                u64 mask;

                /* Fixed-value pass first, then the plain RES0 pass */
                mask = compute_fixed_bits(kvm, hcr_feat_map,
                                          ARRAY_SIZE(hcr_feat_map), &fixed,
                                          0, 0);
                r0 = compute_res0_bits(kvm, hcr_feat_map,
                                       ARRAY_SIZE(hcr_feat_map), 0, 0);
                r0 |= HCR_EL2_RES0 | (mask & ~fixed);
                r1 = HCR_EL2_RES1 | (mask & fixed);
                break;
        }
        default:
                WARN_ON_ONCE(1);
                r0 = r1 = 0;
                break;
        }

        *res0 = r0;
        *res1 = r1;
}





















    1 




    2 














   44 


    1 




   38 




    5 

















    5 
















    8 


    2 


    1 


    5 









    3 



    1 








    1 








    1 






   34 









   34 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015, 2016 ARM Ltd.
 */

#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <trace/events/kvm.h>
#include <kvm/arm_vgic.h>
#include "vgic.h"

/*
 * vgic_irqfd_set_irq: irqfd entry point for irqchip routing entries
 *
 * The routed pin is turned into an interrupt number by adding
 * VGIC_NR_PRIVATE_IRQS, and the result is injected at @level.
 *
 * Returns -EINVAL when that number is not a valid SPI for @kvm, otherwise
 * the result of kvm_vgic_inject_irq(). @irq_source_id and @line_status
 * are not used.
 */
static int vgic_irqfd_set_irq(struct kvm_kernel_irq_routing_entry *e,
                        struct kvm *kvm, int irq_source_id,
                        int level, bool line_status)
{
        unsigned int intid = VGIC_NR_PRIVATE_IRQS + e->irqchip.pin;

        if (vgic_valid_spi(kvm, intid))
                return kvm_vgic_inject_irq(kvm, NULL, intid, level, NULL);

        return -EINVAL;
}

/**
 * kvm_set_routing_entry - fill a kernel routing entry from a user one
 *
 * @kvm: the VM this entry is applied to
 * @e: kernel routing entry to populate
 * @ue: user API routing entry to read from
 *
 * Irqchip entries get vgic_irqfd_set_irq() as their handler and are
 * rejected when the chip or pin number is out of range. MSI entries get
 * kvm_set_msi() and copy the address, data, flags and device ID across.
 * Note that @e is written before the irqchip range check runs, so it may
 * be partially populated when the function fails.
 *
 * Return: 0 on success, -EINVAL for an out-of-range irqchip entry or an
 * unsupported routing type.
 */
int kvm_set_routing_entry(struct kvm *kvm,
                          struct kvm_kernel_irq_routing_entry *e,
                          const struct kvm_irq_routing_entry *ue)
{
        switch (ue->type) {
        case KVM_IRQ_ROUTING_IRQCHIP:
                e->set = vgic_irqfd_set_irq;
                e->irqchip.irqchip = ue->u.irqchip.irqchip;
                e->irqchip.pin = ue->u.irqchip.pin;
                if (e->irqchip.irqchip >= KVM_NR_IRQCHIPS ||
                    e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS)
                        return -EINVAL;
                return 0;

        case KVM_IRQ_ROUTING_MSI:
                e->set = kvm_set_msi;
                e->msi.address_lo = ue->u.msi.address_lo;
                e->msi.address_hi = ue->u.msi.address_hi;
                e->msi.data = ue->u.msi.data;
                e->msi.flags = ue->flags;
                e->msi.devid = ue->u.msi.devid;
                return 0;

        default:
                return -EINVAL;
        }
}

/*
 * Copy the MSI description held in routing entry @e into @msi.
 *
 * Only the five fields below are written; anything else in @msi is left
 * exactly as the caller provided it.
 */
static void kvm_populate_msi(struct kvm_kernel_irq_routing_entry *e,
                             struct kvm_msi *msi)
{
        /* Payload and target device */
        msi->data = e->msi.data;
        msi->devid = e->msi.devid;
        msi->flags = e->msi.flags;

        /* Doorbell address, split in two halves */
        msi->address_hi = e->msi.address_hi;
        msi->address_lo = e->msi.address_lo;
}

/*
 * kvm_set_msi: inject the MSI described by an MSI routing entry
 *
 * Entry point for both irqfd MSI injection and userspace MSI injection.
 *
 * Returns -ENODEV if the VM has no ITS, -1 if @level is zero (nothing is
 * injected in that case), and otherwise the result of
 * vgic_its_inject_msi(). @irq_source_id and @line_status are not used.
 */
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
                struct kvm *kvm, int irq_source_id,
                int level, bool line_status)
{
        struct kvm_msi msg;

        if (!vgic_has_its(kvm))
                return -ENODEV;

        if (level) {
                kvm_populate_msi(e, &msg);
                return vgic_its_inject_msi(kvm, &msg);
        }

        return -1;
}

/*
 * kvm_arch_set_irq_inatomic: fast-path for irqfd injection
 *
 * Returns -EWOULDBLOCK whenever the fast path does not apply: @level is
 * zero, the routing type is neither MSI nor irqchip, the VM has no ITS
 * (MSI entries), or the vgic is not initialized yet (irqchip entries).
 * Otherwise the result of the injection helper is returned.
 */
int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
                              struct kvm *kvm, int irq_source_id, int level,
                              bool line_status)
{
        if (!level)
                return -EWOULDBLOCK;

        if (e->type == KVM_IRQ_ROUTING_MSI) {
                struct kvm_msi msg;

                if (vgic_has_its(kvm)) {
                        kvm_populate_msi(e, &msg);
                        return vgic_its_inject_cached_translation(kvm, &msg);
                }
        } else if (e->type == KVM_IRQ_ROUTING_IRQCHIP) {
                /*
                 * SPIs can always be injected from atomic context,
                 * provided the vgic has been initialized.
                 */
                if (unlikely(!vgic_initialized(kvm)))
                        return -EWOULDBLOCK;

                /* The line is always raised here: level is forced to 1 */
                return vgic_irqfd_set_irq(e, kvm, irq_source_id, 1, line_status);
        }

        return -EWOULDBLOCK;
}

/*
 * Install a default routing table with one irqchip entry per SPI:
 * GSI n is routed to pin n of irqchip 0, for n in [0, nr_spis).
 *
 * The temporary table is freed before returning, whatever the outcome.
 *
 * Returns -ENOMEM if the table cannot be allocated, otherwise the result
 * of kvm_set_irq_routing().
 */
int kvm_vgic_setup_default_irq_routing(struct kvm *kvm)
{
        u32 nr_spis = kvm->arch.vgic.nr_spis;
        struct kvm_irq_routing_entry *table;
        int spi, ret;

        table = kcalloc(nr_spis, sizeof(*table), GFP_KERNEL_ACCOUNT);
        if (!table)
                return -ENOMEM;

        for (spi = 0; spi < nr_spis; spi++) {
                struct kvm_irq_routing_entry *ent = &table[spi];

                ent->gsi = spi;
                ent->type = KVM_IRQ_ROUTING_IRQCHIP;
                ent->u.irqchip.irqchip = 0;
                ent->u.irqchip.pin = spi;
        }

        ret = kvm_set_irq_routing(kvm, table, nr_spis, 0);
        kfree(table);

        return ret;
}






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   23 


























   22 

    1 







   22 









































































    1 












    1 




   21 










   20 


   20 
















   20 












































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 - Linaro and Columbia University
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/kvm.h>
#include <linux/kvm_host.h>

#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>

#include "hyp/include/hyp/adjust_pc.h"

#include "trace.h"

/*
 * Bit flags describing what to do with a trapped access.
 *
 * BEHAVE_HANDLE_LOCALLY is the all-zeroes value (no forwarding flag set).
 * The read and write forwarding flags are independent bits, and
 * BEHAVE_FORWARD_RW is simply both of them ORed together.
 *
 * NOTE(review): "forward" presumably means handing the trap over to the
 * guest hypervisor instead of emulating it here - confirm against the
 * code consuming these flags.
 */
enum trap_behaviour {
        BEHAVE_HANDLE_LOCALLY        = 0,

        BEHAVE_FORWARD_READ        = BIT(0),
        BEHAVE_FORWARD_WRITE        = BIT(1),
        BEHAVE_FORWARD_RW        = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE,

        /* Traps that take effect in Host EL0, this is rare! */
        BEHAVE_FORWARD_IN_HOST_EL0        = BIT(2),
};

/*
 * Description of a single coarse trap control: which register holds it,
 * which bits to look at, and what to do when it applies.
 *
 * NOTE(review): the control is presumably considered active when
 * (register & mask) == value - confirm against the code that evaluates
 * these entries.
 */
struct trap_bits {
        /* Register holding the control bit(s) */
        const enum vcpu_sysreg                index;
        /* What to do with the access (see enum trap_behaviour) */
        const enum trap_behaviour        behaviour;
        /* Value to compare against, used together with @mask */
        const u64                        value;
        /* Bits of the register that are relevant to this control */
        const u64                        mask;
};

/*
 * Coarse Grained Trap definitions
 *
 * The enumerators are laid out in three consecutive ranges, delimited by
 * marker values that alias the first enumerator of the range they open:
 *
 *  - after __RESERVED__: single trap controls, also used to index
 *    coarse_trap_bits[]
 *  - from __MULTIPLE_CONTROL_BITS__: combinations of several controls
 *  - from __COMPLEX_CONDITIONS__: conditions evaluated by a callback
 *
 * The relative order of the enumerators and markers is therefore
 * significant, and __NR_CGT_GROUP_IDS__ must stay last.
 */
enum cgt_group_id {
        /* Indicates no coarse trap control */
        __RESERVED__,

        /*
         * The first batch of IDs denote coarse trapping that are used
         * on their own instead of being part of a combination of
         * trap controls.
         */
        CGT_HCR_TID1,
        CGT_HCR_TID2,
        CGT_HCR_TID3,
        CGT_HCR_IMO,
        CGT_HCR_FMO,
        CGT_HCR_TIDCP,
        CGT_HCR_TACR,
        CGT_HCR_TSW,
        CGT_HCR_TPC,
        CGT_HCR_TPU,
        CGT_HCR_TTLB,
        CGT_HCR_TVM,
        CGT_HCR_TDZ,
        CGT_HCR_TRVM,
        CGT_HCR_TLOR,
        CGT_HCR_TERR,
        CGT_HCR_APK,
        CGT_HCR_NV,
        CGT_HCR_NV_nNV2,
        CGT_HCR_NV1_nNV2,
        CGT_HCR_AT,
        CGT_HCR_nFIEN,
        CGT_HCR_TID4,
        CGT_HCR_TICAB,
        CGT_HCR_TOCU,
        CGT_HCR_ENSCXT,
        CGT_HCR_TTLBIS,
        CGT_HCR_TTLBOS,

        CGT_MDCR_TPMCR,
        CGT_MDCR_TPM,
        CGT_MDCR_TDE,
        CGT_MDCR_TDA,
        CGT_MDCR_TDOSA,
        CGT_MDCR_TDRA,
        CGT_MDCR_E2PB,
        CGT_MDCR_TPMS,
        CGT_MDCR_TTRF,
        CGT_MDCR_E2TB,
        CGT_MDCR_TDCC,

        CGT_CPTR_TAM,
        CGT_CPTR_TCPAC,

        CGT_HCRX_EnFPM,
        CGT_HCRX_TCR2En,

        CGT_CNTHCTL_EL1TVT,
        CGT_CNTHCTL_EL1TVCT,

        CGT_ICH_HCR_TC,
        CGT_ICH_HCR_TALL0,
        CGT_ICH_HCR_TALL1,
        CGT_ICH_HCR_TDIR,

        /*
         * Anything after this point is a combination of coarse trap
         * controls, which must all be evaluated to decide what to do.
         */
        __MULTIPLE_CONTROL_BITS__,
        CGT_HCR_IMO_FMO_ICH_HCR_TC = __MULTIPLE_CONTROL_BITS__,
        CGT_HCR_TID2_TID4,
        CGT_HCR_TTLB_TTLBIS,
        CGT_HCR_TTLB_TTLBOS,
        CGT_HCR_TVM_TRVM,
        CGT_HCR_TVM_TRVM_HCRX_TCR2En,
        CGT_HCR_TPU_TICAB,
        CGT_HCR_TPU_TOCU,
        CGT_HCR_NV1_nNV2_ENSCXT,
        CGT_MDCR_TPM_TPMCR,
        CGT_MDCR_TPM_HPMN,
        CGT_MDCR_TDE_TDA,
        CGT_MDCR_TDE_TDOSA,
        CGT_MDCR_TDE_TDRA,
        CGT_MDCR_TDCC_TDE_TDA,

        CGT_ICH_HCR_TC_TDIR,

        /*
         * Anything after this point requires a callback evaluating a
         * complex trap condition. Ugly stuff.
         */
        __COMPLEX_CONDITIONS__,
        CGT_CNTHCTL_EL1PCTEN = __COMPLEX_CONDITIONS__,
        CGT_CNTHCTL_EL1PTEN,
        CGT_CNTHCTL_EL1NVPCT,
        CGT_CNTHCTL_EL1NVVCT,

        CGT_CPTR_TTA,
        CGT_MDCR_HPMN,

        /* Must be last */
        __NR_CGT_GROUP_IDS__
};

/*
 * Description of the coarse grained trap controls that are expressed
 * as bits of a single register, indexed by their enum cgt_group_id.
 * Each entry gives the register holding the control (.index), the
 * bits of interest in that register (.mask), the value associated
 * with those bits (.value) and the behaviour flags (.behaviour).
 *
 * NOTE(review): presumably a control is considered active when
 * (reg & .mask) == .value, which would make the entries with a zero
 * .value (APK, nFIEN, ENSCXT, E2PB, E2TB, EnFPM, TCR2En) active when
 * the corresponding bit is clear -- confirm against the code that
 * consumes this table.
 */
static const struct trap_bits coarse_trap_bits[] = {
        /* Controls located in HCR_EL2 */
        [CGT_HCR_TID1] = {
                .index                = HCR_EL2,
                .value                 = HCR_TID1,
                .mask                = HCR_TID1,
                .behaviour        = BEHAVE_FORWARD_READ,
        },
        [CGT_HCR_TID2] = {
                .index                = HCR_EL2,
                .value                 = HCR_TID2,
                .mask                = HCR_TID2,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TID3] = {
                .index                = HCR_EL2,
                .value                 = HCR_TID3,
                .mask                = HCR_TID3,
                .behaviour        = BEHAVE_FORWARD_READ,
        },
        [CGT_HCR_IMO] = {
                .index                = HCR_EL2,
                .value                 = HCR_IMO,
                .mask                = HCR_IMO,
                .behaviour        = BEHAVE_FORWARD_WRITE,
        },
        [CGT_HCR_FMO] = {
                .index                = HCR_EL2,
                .value                 = HCR_FMO,
                .mask                = HCR_FMO,
                .behaviour        = BEHAVE_FORWARD_WRITE,
        },
        [CGT_HCR_TIDCP] = {
                .index                = HCR_EL2,
                .value                = HCR_TIDCP,
                .mask                = HCR_TIDCP,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TACR] = {
                .index                = HCR_EL2,
                .value                = HCR_TACR,
                .mask                = HCR_TACR,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TSW] = {
                .index                = HCR_EL2,
                .value                = HCR_TSW,
                .mask                = HCR_TSW,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TPC] = { /* Also called TCPC when FEAT_DPB is implemented */
                .index                = HCR_EL2,
                .value                = HCR_TPC,
                .mask                = HCR_TPC,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TPU] = {
                .index                = HCR_EL2,
                .value                = HCR_TPU,
                .mask                = HCR_TPU,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TTLB] = {
                .index                = HCR_EL2,
                .value                = HCR_TTLB,
                .mask                = HCR_TTLB,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TVM] = {
                .index                = HCR_EL2,
                .value                = HCR_TVM,
                .mask                = HCR_TVM,
                .behaviour        = BEHAVE_FORWARD_WRITE,
        },
        [CGT_HCR_TDZ] = {
                .index                = HCR_EL2,
                .value                = HCR_TDZ,
                .mask                = HCR_TDZ,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TRVM] = {
                .index                = HCR_EL2,
                .value                = HCR_TRVM,
                .mask                = HCR_TRVM,
                .behaviour        = BEHAVE_FORWARD_READ,
        },
        [CGT_HCR_TLOR] = {
                .index                = HCR_EL2,
                .value                = HCR_TLOR,
                .mask                = HCR_TLOR,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TERR] = {
                .index                = HCR_EL2,
                .value                = HCR_TERR,
                .mask                = HCR_TERR,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_APK] = {
                .index                = HCR_EL2,
                .value                = 0,
                .mask                = HCR_APK,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_NV] = {
                .index                = HCR_EL2,
                .value                = HCR_NV,
                .mask                = HCR_NV,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /* The two entries below look at several HCR_EL2 bits at once */
        [CGT_HCR_NV_nNV2] = {
                .index                = HCR_EL2,
                .value                = HCR_NV,
                .mask                = HCR_NV | HCR_NV2,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_NV1_nNV2] = {
                .index                = HCR_EL2,
                .value                = HCR_NV | HCR_NV1,
                .mask                = HCR_NV | HCR_NV1 | HCR_NV2,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_AT] = {
                .index                = HCR_EL2,
                .value                = HCR_AT,
                .mask                = HCR_AT,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_nFIEN] = {
                .index                = HCR_EL2,
                .value                = 0,
                .mask                = HCR_FIEN,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TID4] = {
                .index                = HCR_EL2,
                .value                 = HCR_TID4,
                .mask                = HCR_TID4,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TICAB] = {
                .index                = HCR_EL2,
                .value                 = HCR_TICAB,
                .mask                = HCR_TICAB,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TOCU] = {
                .index                = HCR_EL2,
                .value                 = HCR_TOCU,
                .mask                = HCR_TOCU,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_ENSCXT] = {
                .index                = HCR_EL2,
                .value                 = 0,
                .mask                = HCR_ENSCXT,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TTLBIS] = {
                .index                = HCR_EL2,
                .value                = HCR_TTLBIS,
                .mask                = HCR_TTLBIS,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCR_TTLBOS] = {
                .index                = HCR_EL2,
                .value                = HCR_TTLBOS,
                .mask                = HCR_TTLBOS,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /*
         * Controls located in MDCR_EL2. TPMCR and TPM additionally
         * carry the BEHAVE_FORWARD_IN_HOST_EL0 flag.
         */
        [CGT_MDCR_TPMCR] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TPMCR,
                .mask                = MDCR_EL2_TPMCR,
                .behaviour        = BEHAVE_FORWARD_RW |
                                  BEHAVE_FORWARD_IN_HOST_EL0,
        },
        [CGT_MDCR_TPM] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TPM,
                .mask                = MDCR_EL2_TPM,
                .behaviour        = BEHAVE_FORWARD_RW |
                                  BEHAVE_FORWARD_IN_HOST_EL0,
        },
        [CGT_MDCR_TDE] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TDE,
                .mask                = MDCR_EL2_TDE,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_MDCR_TDA] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TDA,
                .mask                = MDCR_EL2_TDA,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_MDCR_TDOSA] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TDOSA,
                .mask                = MDCR_EL2_TDOSA,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_MDCR_TDRA] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TDRA,
                .mask                = MDCR_EL2_TDRA,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /* Only the bit at MDCR_EL2_E2PB_SHIFT is examined here */
        [CGT_MDCR_E2PB] = {
                .index                = MDCR_EL2,
                .value                = 0,
                .mask                = BIT(MDCR_EL2_E2PB_SHIFT),
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_MDCR_TPMS] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TPMS,
                .mask                = MDCR_EL2_TPMS,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_MDCR_TTRF] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TTRF,
                .mask                = MDCR_EL2_TTRF,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /* Only the bit at MDCR_EL2_E2TB_SHIFT is examined here */
        [CGT_MDCR_E2TB] = {
                .index                = MDCR_EL2,
                .value                = 0,
                .mask                = BIT(MDCR_EL2_E2TB_SHIFT),
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_MDCR_TDCC] = {
                .index                = MDCR_EL2,
                .value                = MDCR_EL2_TDCC,
                .mask                = MDCR_EL2_TDCC,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /* Controls located in CPTR_EL2 */
        [CGT_CPTR_TAM] = {
                .index                = CPTR_EL2,
                .value                = CPTR_EL2_TAM,
                .mask                = CPTR_EL2_TAM,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_CPTR_TCPAC] = {
                .index                = CPTR_EL2,
                .value                = CPTR_EL2_TCPAC,
                .mask                = CPTR_EL2_TCPAC,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /* Controls located in HCRX_EL2 */
        [CGT_HCRX_EnFPM] = {
                .index                = HCRX_EL2,
                .value                 = 0,
                .mask                = HCRX_EL2_EnFPM,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_HCRX_TCR2En] = {
                .index                = HCRX_EL2,
                .value                 = 0,
                .mask                = HCRX_EL2_TCR2En,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        /* Controls located in CNTHCTL_EL2 */
        [CGT_CNTHCTL_EL1TVT] = {
                .index                = CNTHCTL_EL2,
                .value                = CNTHCTL_EL1TVT,
                .mask                = CNTHCTL_EL1TVT,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_CNTHCTL_EL1TVCT] = {
                .index                = CNTHCTL_EL2,
                .value                = CNTHCTL_EL1TVCT,
                .mask                = CNTHCTL_EL1TVCT,
                .behaviour        = BEHAVE_FORWARD_READ,
        },
        /* Controls located in ICH_HCR_EL2 */
        [CGT_ICH_HCR_TC] = {
                .index                = ICH_HCR_EL2,
                .value                = ICH_HCR_EL2_TC,
                .mask                = ICH_HCR_EL2_TC,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_ICH_HCR_TALL0] = {
                .index                = ICH_HCR_EL2,
                .value                = ICH_HCR_EL2_TALL0,
                .mask                = ICH_HCR_EL2_TALL0,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_ICH_HCR_TALL1] = {
                .index                = ICH_HCR_EL2,
                .value                = ICH_HCR_EL2_TALL1,
                .mask                = ICH_HCR_EL2_TALL1,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
        [CGT_ICH_HCR_TDIR] = {
                .index                = ICH_HCR_EL2,
                .value                = ICH_HCR_EL2_TDIR,
                .mask                = ICH_HCR_EL2_TDIR,
                .behaviour        = BEHAVE_FORWARD_RW,
        },
};

/*
 * MCB() builds one entry of coarse_control_combo[]: the slot is the
 * combination id rebased on __MULTIPLE_CONTROL_BITS__, and its value
 * is an anonymous constant array holding the ids passed as variadic
 * arguments, terminated by __RESERVED__.
 */
#define MCB(id, ...)                                                \
        [id - __MULTIPLE_CONTROL_BITS__]        =                \
                (const enum cgt_group_id[]){                        \
                __VA_ARGS__, __RESERVED__                        \
                }

/*
 * For each combined trap control, the list of controls it is made of.
 * Entries use designated initialisers, so their position in the table
 * is given by the id and not by the order in which they are written.
 *
 * Note that a combination is not limited to single-register controls:
 * CGT_MDCR_TPM_HPMN refers to CGT_MDCR_HPMN, which is one of the ids
 * defined after __COMPLEX_CONDITIONS__.
 */
static const enum cgt_group_id *coarse_control_combo[] = {
        MCB(CGT_HCR_TID2_TID4,                CGT_HCR_TID2, CGT_HCR_TID4),
        MCB(CGT_HCR_TTLB_TTLBIS,        CGT_HCR_TTLB, CGT_HCR_TTLBIS),
        MCB(CGT_HCR_TTLB_TTLBOS,        CGT_HCR_TTLB, CGT_HCR_TTLBOS),
        MCB(CGT_HCR_TVM_TRVM,                CGT_HCR_TVM, CGT_HCR_TRVM),
        MCB(CGT_HCR_TVM_TRVM_HCRX_TCR2En,
                                        CGT_HCR_TVM, CGT_HCR_TRVM, CGT_HCRX_TCR2En),
        MCB(CGT_HCR_TPU_TICAB,                CGT_HCR_TPU, CGT_HCR_TICAB),
        MCB(CGT_HCR_TPU_TOCU,                CGT_HCR_TPU, CGT_HCR_TOCU),
        MCB(CGT_HCR_NV1_nNV2_ENSCXT,        CGT_HCR_NV1_nNV2, CGT_HCR_ENSCXT),
        MCB(CGT_MDCR_TPM_TPMCR,                CGT_MDCR_TPM, CGT_MDCR_TPMCR),
        MCB(CGT_MDCR_TPM_HPMN,                CGT_MDCR_TPM, CGT_MDCR_HPMN),
        MCB(CGT_MDCR_TDE_TDA,                CGT_MDCR_TDE, CGT_MDCR_TDA),
        MCB(CGT_MDCR_TDE_TDOSA,                CGT_MDCR_TDE, CGT_MDCR_TDOSA),
        MCB(CGT_MDCR_TDE_TDRA,                CGT_MDCR_TDE, CGT_MDCR_TDRA),
        MCB(CGT_MDCR_TDCC_TDE_TDA,        CGT_MDCR_TDCC, CGT_MDCR_TDE, CGT_MDCR_TDA),

        MCB(CGT_HCR_IMO_FMO_ICH_HCR_TC,        CGT_HCR_IMO, CGT_HCR_FMO, CGT_ICH_HCR_TC),
        MCB(CGT_ICH_HCR_TC_TDIR,        CGT_ICH_HCR_TC, CGT_ICH_HCR_TDIR),
};

/*
 * Callback type for the trap controls that need code to be evaluated:
 * it is handed the vcpu and returns the resulting trap behaviour.
 */
typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *);

/*
 * Warning, maximum confusion ahead.
 *
 * When E2H=0, CNTHCTL_EL2[1:0] are defined as EL1PCEN:EL1PCTEN
 * When E2H=1, CNTHCTL_EL2[11:10] are defined as EL1PTEN:EL1PCTEN
 *
 * Note the single letter difference? Yet, the bits have the same
 * function despite a different layout and a different name.
 *
 * We don't try to reconcile this mess. We just use the E2H=0 bits
 * to generate something that is in the E2H=1 format, and live with
 * it. You're welcome.
 */
static u64 get_sanitized_cnthctl(struct kvm_vcpu *vcpu)
{
        /* The two enable bits, as positioned in the E2H=0 layout */
        const u64 e2h0_bits = CNTHCTL_EL1PCEN | CNTHCTL_EL1PCTEN;
        u64 cnthctl = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);

        /* E2H=1: the bits are already at [11:10], only filter them */
        if (vcpu_el2_e2h_is_set(vcpu))
                return cnthctl & (e2h0_bits << 10);

        /* E2H=0: pick bits [1:0] and move them up to [11:10] */
        return (cnthctl & e2h0_bits) << 10;
}

static enum trap_behaviour check_cnthctl_el1pcten(struct kvm_vcpu *vcpu)
{
        if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCTEN << 10))
                return BEHAVE_HANDLE_LOCALLY;

        return BEHAVE_FORWARD_RW;
}

static enum trap_behaviour check_cnthctl_el1pten(struct kvm_vcpu *vcpu)
{
        if (get_sanitized_cnthctl(vcpu) & (CNTHCTL_EL1PCEN << 10))
                return BEHAVE_HANDLE_LOCALLY;

        return BEHAVE_FORWARD_RW;
}

/*
 * Check the vcpu's HCR_EL2 value for the exact combination
 * E2H=1, TGE=0, NV2=1, NV1=0, NV=1.
 */
static bool is_nested_nv2_guest(struct kvm_vcpu *vcpu)
{
        /* Bits taking part in the comparison... */
        const u64 relevant = HCR_E2H | HCR_TGE | HCR_NV2 | HCR_NV1 | HCR_NV;
        /* ... and the state they must be in (TGE and NV1 clear) */
        const u64 expected = HCR_E2H | HCR_NV2 | HCR_NV;

        return (__vcpu_sys_reg(vcpu, HCR_EL2) & relevant) == expected;
}

/*
 * Complex condition for CGT_CNTHCTL_EL1NVPCT: forward reads and writes
 * only if is_nested_nv2_guest() holds and CNTHCTL_EL2.EL1NVPCT is set;
 * in every other case the access is handled locally.
 */
static enum trap_behaviour check_cnthctl_el1nvpct(struct kvm_vcpu *vcpu)
{
        if (is_nested_nv2_guest(vcpu) &&
            (__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVPCT))
                return BEHAVE_FORWARD_RW;

        return BEHAVE_HANDLE_LOCALLY;
}

/*
 * Complex condition for CGT_CNTHCTL_EL1NVVCT: forward reads and writes
 * only if is_nested_nv2_guest() holds and CNTHCTL_EL2.EL1NVVCT is set;
 * in every other case the access is handled locally.
 */
static enum trap_behaviour check_cnthctl_el1nvvct(struct kvm_vcpu *vcpu)
{
        if (is_nested_nv2_guest(vcpu) &&
            (__vcpu_sys_reg(vcpu, CNTHCTL_EL2) & CNTHCTL_EL1NVVCT))
                return BEHAVE_FORWARD_RW;

        return BEHAVE_HANDLE_LOCALLY;
}

/*
 * Complex condition for CGT_CPTR_TTA: forward reads and writes when the
 * TTA bit is set in the vcpu's CPTR_EL2, looked at in CPACR_EL1 format.
 */
static enum trap_behaviour check_cptr_tta(struct kvm_vcpu *vcpu)
{
        u64 cptr = __vcpu_sys_reg(vcpu, CPTR_EL2);

        /* Without E2H, convert the value so that CPACR_EL1_TTA applies */
        if (!vcpu_el2_e2h_is_set(vcpu))
                cptr = translate_cptr_el2_to_cpacr_el1(cptr);

        if (!(cptr & CPACR_EL1_TTA))
                return BEHAVE_HANDLE_LOCALLY;

        return BEHAVE_FORWARD_RW;
}

/*
 * Complex condition for CGT_MDCR_HPMN: work out which PMU counter the
 * trapped access targets and forward the access (including from host
 * EL0) when kvm_pmu_counter_is_hyp() reports true for that counter.
 */
static enum trap_behaviour check_mdcr_hpmn(struct kvm_vcpu *vcpu)
{
        unsigned int counter;
        u32 reg;

        reg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));

        switch (reg) {
        case SYS_PMEVTYPERn_EL0(0) ... SYS_PMEVTYPERn_EL0(30):
        case SYS_PMEVCNTRn_EL0(0) ... SYS_PMEVCNTRn_EL0(30):
                /* Counter number: CRm[1:0] give bits [4:3], Op2 the rest */
                counter = ((sys_reg_CRm(reg) & 0x3) << 3) | sys_reg_Op2(reg);
                break;
        case SYS_PMXEVTYPER_EL0:
        case SYS_PMXEVCNTR_EL0:
                /* Indirect access: the counter is selected by PMSELR_EL0.SEL */
                counter = SYS_FIELD_GET(PMSELR_EL0, SEL,
                                        __vcpu_sys_reg(vcpu, PMSELR_EL0));
                break;
        default:
                /* This helper was attached to a register it doesn't know */
                KVM_BUG_ON(1, vcpu->kvm);
                return BEHAVE_HANDLE_LOCALLY;
        }

        if (!kvm_pmu_counter_is_hyp(vcpu, counter))
                return BEHAVE_HANDLE_LOCALLY;

        return BEHAVE_FORWARD_RW | BEHAVE_FORWARD_IN_HOST_EL0;
}

/*
 * CCC() builds one entry of ccc[]: the slot is the control id rebased
 * on __COMPLEX_CONDITIONS__, and its value is the callback evaluating
 * that control.
 */
#define CCC(id, fn)                                \
        [id - __COMPLEX_CONDITIONS__] = fn

/* Callback associated with each of the complex trap conditions */
static const complex_condition_check ccc[] = {
        CCC(CGT_CNTHCTL_EL1PCTEN, check_cnthctl_el1pcten),
        CCC(CGT_CNTHCTL_EL1PTEN, check_cnthctl_el1pten),
        CCC(CGT_CNTHCTL_EL1NVPCT, check_cnthctl_el1nvpct),
        CCC(CGT_CNTHCTL_EL1NVVCT, check_cnthctl_el1nvvct),
        CCC(CGT_CPTR_TTA, check_cptr_tta),
        CCC(CGT_MDCR_HPMN, check_mdcr_hpmn),
};

/*
 * Bit assignment for the trap controls. We use a 64bit word with the
 * following layout for each trapped sysreg:
 *
 * [9:0]        enum cgt_group_id (10 bits)
 * [13:10]        enum fgt_group_id (4 bits)
 * [19:14]        bit number in the FGT register (6 bits)
 * [20]                trap polarity (1 bit)
 * [25:21]        FG filter (5 bits)
 * [35:26]        Main SysReg table index (10 bits)
 * [62:36]        Unused (27 bits)
 * [63]                RES0 - Must be zero, as lost on insertion in the xarray
 */
/* Widths, in bits, of the named-size fields of union trap_config */
#define TC_CGT_BITS        10
#define TC_FGT_BITS        4
#define TC_FGF_BITS        5
#define TC_SRI_BITS        10

/*
 * Trap configuration of a sysreg, accessible either as a single 64bit
 * value or through its individual fields. The field widths add up to
 * 64 bits (10 + 4 + 6 + 1 + 5 + 10 + 27 + 1).
 */
union trap_config {
        u64        val;
        struct {
                unsigned long        cgt:TC_CGT_BITS; /* Coarse Grained Trap id */
                unsigned long        fgt:TC_FGT_BITS; /* Fine Grained Trap id */
                unsigned long        bit:6;                 /* Bit number */
                unsigned long        pol:1;                 /* Polarity */
                unsigned long        fgf:TC_FGF_BITS; /* Fine Grained Filter */
                unsigned long        sri:TC_SRI_BITS; /* SysReg Index */
                unsigned long        unused:27;         /* Unused, should be zero */
                unsigned long        mbz:1;                 /* Must Be Zero */
        };
};

/*
 * Association between a sysreg encoding, or a range of encodings, and
 * a trap configuration.
 */
struct encoding_to_trap_config {
        const u32                        encoding;        /* First encoding of the range */
        const u32                        end;                /* Last encoding; same as 'encoding' for a single sysreg */
        const union trap_config                tc;                /* Trap configuration for the range */
        const unsigned int                line;                /* __LINE__ at which the entry was written */
};

/*
 * WARNING: using ranges is a treacherous endeavour, as sysregs that
 * are part of an architectural range are not necessarily contiguous
 * in the [Op0,Op1,CRn,CRm,Op2] space. Tread carefully.
 *
 * SR_RANGE_TRAP() initialises a struct encoding_to_trap_config for the
 * encodings sr_start to sr_end, setting only the coarse grained trap
 * id of the trap configuration and recording the source line of the
 * entry.
 */
#define SR_RANGE_TRAP(sr_start, sr_end, trap_id)                        \
        {                                                                \
                .encoding        = sr_start,                                \
                .end                = sr_end,                                \
                .tc                = {                                        \
                        .cgt                = trap_id,                        \
                },                                                        \
                .line = __LINE__,                                        \
        }

/* Single sysreg variant: a range that starts and ends on the same encoding */
#define SR_TRAP(sr, trap_id)                SR_RANGE_TRAP(sr, sr, trap_id)

/*
 * Map encoding to trap bits for exceptions reported with EC=0x18.
 * These must only be evaluated when running a nested hypervisor, and
 * only when the current context is not a hypervisor context. When the
 * trapped access matches one of the trap controls, the exception is
 * re-injected in the nested hypervisor.
 */
static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = {
        SR_TRAP(SYS_REVIDR_EL1,                CGT_HCR_TID1),
        SR_TRAP(SYS_AIDR_EL1,                CGT_HCR_TID1),
        SR_TRAP(SYS_SMIDR_EL1,                CGT_HCR_TID1),
        SR_TRAP(SYS_CTR_EL0,                CGT_HCR_TID2),
        SR_TRAP(SYS_CCSIDR_EL1,                CGT_HCR_TID2_TID4),
        SR_TRAP(SYS_CCSIDR2_EL1,        CGT_HCR_TID2_TID4),
        SR_TRAP(SYS_CLIDR_EL1,                CGT_HCR_TID2_TID4),
        SR_TRAP(SYS_CSSELR_EL1,                CGT_HCR_TID2_TID4),
        SR_RANGE_TRAP(SYS_ID_PFR0_EL1,
                      sys_reg(3, 0, 0, 7, 7), CGT_HCR_TID3),
        SR_TRAP(SYS_ICC_SGI0R_EL1,        CGT_HCR_IMO_FMO_ICH_HCR_TC),
        SR_TRAP(SYS_ICC_ASGI1R_EL1,        CGT_HCR_IMO_FMO_ICH_HCR_TC),
        SR_TRAP(SYS_ICC_SGI1R_EL1,        CGT_HCR_IMO_FMO_ICH_HCR_TC),
        SR_RANGE_TRAP(sys_reg(3, 0, 11, 0, 0),
                      sys_reg(3, 0, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 1, 11, 0, 0),
                      sys_reg(3, 1, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 2, 11, 0, 0),
                      sys_reg(3, 2, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 3, 11, 0, 0),
                      sys_reg(3, 3, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 4, 11, 0, 0),
                      sys_reg(3, 4, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 5, 11, 0, 0),
                      sys_reg(3, 5, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 6, 11, 0, 0),
                      sys_reg(3, 6, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 7, 11, 0, 0),
                      sys_reg(3, 7, 11, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 0, 15, 0, 0),
                      sys_reg(3, 0, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 1, 15, 0, 0),
                      sys_reg(3, 1, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 2, 15, 0, 0),
                      sys_reg(3, 2, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 3, 15, 0, 0),
                      sys_reg(3, 3, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 4, 15, 0, 0),
                      sys_reg(3, 4, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 5, 15, 0, 0),
                      sys_reg(3, 5, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 6, 15, 0, 0),
                      sys_reg(3, 6, 15, 15, 7), CGT_HCR_TIDCP),
        SR_RANGE_TRAP(sys_reg(3, 7, 15, 0, 0),
                      sys_reg(3, 7, 15, 15, 7), CGT_HCR_TIDCP),
        SR_TRAP(SYS_ACTLR_EL1,                CGT_HCR_TACR),
        SR_TRAP(SYS_DC_ISW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CISW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_IGSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_IGDSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CGSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CGDSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CIGSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CIGDSW,                CGT_HCR_TSW),
        SR_TRAP(SYS_DC_CIVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CVAP,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CVADP,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_IVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CIGVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CIGDVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_IGVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_IGDVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CGVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CGDVAC,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CGVAP,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CGDVAP,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CGVADP,                CGT_HCR_TPC),
        SR_TRAP(SYS_DC_CGDVADP,                CGT_HCR_TPC),
        SR_TRAP(SYS_IC_IVAU,                CGT_HCR_TPU_TOCU),
        SR_TRAP(SYS_IC_IALLU,                CGT_HCR_TPU_TOCU),
        SR_TRAP(SYS_IC_IALLUIS,                CGT_HCR_TPU_TICAB),
        SR_TRAP(SYS_DC_CVAU,                CGT_HCR_TPU_TOCU),
        SR_TRAP(OP_TLBI_RVAE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVAAE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVALE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVAALE1,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VMALLE1,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VAE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_ASIDE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VAAE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VALE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VAALE1,                CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVAE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVAAE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVALE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVAALE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VMALLE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VAE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_ASIDE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VAAE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VALE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_VAALE1NXS,        CGT_HCR_TTLB),
        SR_TRAP(OP_TLBI_RVAE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVAAE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVALE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVAALE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VMALLE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VAE1IS,                CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_ASIDE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VAAE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VALE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VAALE1IS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVAE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVAAE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVALE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_RVAALE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VMALLE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VAE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_ASIDE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VAAE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VALE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VAALE1ISNXS,        CGT_HCR_TTLB_TTLBIS),
        SR_TRAP(OP_TLBI_VMALLE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VAE1OS,                CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_ASIDE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VAAE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VALE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VAALE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVAE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVAAE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVALE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVAALE1OS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VMALLE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VAE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_ASIDE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VAAE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VALE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_VAALE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVAE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVAAE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVALE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(OP_TLBI_RVAALE1OSNXS,        CGT_HCR_TTLB_TTLBOS),
        SR_TRAP(SYS_SCTLR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_TTBR0_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_TTBR1_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_TCR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_ESR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_FAR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_AFSR0_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_AFSR1_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_MAIR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_AMAIR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_CONTEXTIDR_EL1,        CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_PIR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_PIRE0_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_POR_EL0,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_POR_EL1,                CGT_HCR_TVM_TRVM),
        SR_TRAP(SYS_TCR2_EL1,                CGT_HCR_TVM_TRVM_HCRX_TCR2En),
        SR_TRAP(SYS_DC_ZVA,                CGT_HCR_TDZ),
        SR_TRAP(SYS_DC_GVA,                CGT_HCR_TDZ),
        SR_TRAP(SYS_DC_GZVA,                CGT_HCR_TDZ),
        SR_TRAP(SYS_LORSA_EL1,                CGT_HCR_TLOR),
        SR_TRAP(SYS_LOREA_EL1,                 CGT_HCR_TLOR),
        SR_TRAP(SYS_LORN_EL1,                 CGT_HCR_TLOR),
        SR_TRAP(SYS_LORC_EL1,                 CGT_HCR_TLOR),
        SR_TRAP(SYS_LORID_EL1,                CGT_HCR_TLOR),
        SR_TRAP(SYS_ERRIDR_EL1,                CGT_HCR_TERR),
        SR_TRAP(SYS_ERRSELR_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXADDR_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXCTLR_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXFR_EL1,                CGT_HCR_TERR),
        SR_TRAP(SYS_ERXMISC0_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXMISC1_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXMISC2_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXMISC3_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_ERXSTATUS_EL1,        CGT_HCR_TERR),
        SR_TRAP(SYS_APIAKEYLO_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APIAKEYHI_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APIBKEYLO_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APIBKEYHI_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APDAKEYLO_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APDAKEYHI_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APDBKEYLO_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APDBKEYHI_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APGAKEYLO_EL1,        CGT_HCR_APK),
        SR_TRAP(SYS_APGAKEYHI_EL1,        CGT_HCR_APK),
        /* All _EL2 registers */
        SR_TRAP(SYS_BRBCR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_VPIDR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_VMPIDR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_SCTLR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_ACTLR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_SCTLR2_EL2,                CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_HCR_EL2,
                      SYS_HCRX_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_SMPRIMAP_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_SMCR_EL2,                CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_TTBR0_EL2,
                      SYS_TCR2_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_VTTBR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_VTCR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_VNCR_EL2,                CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_HDFGRTR_EL2,
                      SYS_HAFGRTR_EL2,        CGT_HCR_NV),
        /* Skip the SP_EL1 encoding... */
        SR_TRAP(SYS_SPSR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_ELR_EL2,                CGT_HCR_NV),
        /* Skip SPSR_irq, SPSR_abt, SPSR_und, SPSR_fiq */
        SR_TRAP(SYS_AFSR0_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_AFSR1_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_ESR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_VSESR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_TFSR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_FAR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_HPFAR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_PMSCR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_MAIR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_AMAIR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_MPAMHCR_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_MPAMVPMV_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_MPAM2_EL2,                CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_MPAMVPM0_EL2,
                      SYS_MPAMVPM7_EL2,        CGT_HCR_NV),
        /*
         * Note that the spec. describes a group of MEC registers
         * whose access should not trap, therefore skip the following:
         * MECID_A0_EL2, MECID_A1_EL2, MECID_P0_EL2,
         * MECID_P1_EL2, MECIDR_EL2, VMECID_A_EL2,
         * VMECID_P_EL2.
         */
        SR_RANGE_TRAP(SYS_VBAR_EL2,
                      SYS_RMR_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_VDISR_EL2,                CGT_HCR_NV),
        /* ICH_AP0R<m>_EL2 */
        SR_RANGE_TRAP(SYS_ICH_AP0R0_EL2,
                      SYS_ICH_AP0R3_EL2, CGT_HCR_NV),
        /* ICH_AP1R<m>_EL2 */
        SR_RANGE_TRAP(SYS_ICH_AP1R0_EL2,
                      SYS_ICH_AP1R3_EL2, CGT_HCR_NV),
        SR_TRAP(SYS_ICC_SRE_EL2,        CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_ICH_HCR_EL2,
                      SYS_ICH_EISR_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_ICH_ELRSR_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_ICH_VMCR_EL2,        CGT_HCR_NV),
        /* ICH_LR<m>_EL2 */
        SR_RANGE_TRAP(SYS_ICH_LR0_EL2,
                      SYS_ICH_LR15_EL2, CGT_HCR_NV),
        SR_TRAP(SYS_CONTEXTIDR_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_TPIDR_EL2,                CGT_HCR_NV),
        SR_TRAP(SYS_SCXTNUM_EL2,        CGT_HCR_NV),
        /* AMEVCNTVOFF0<n>_EL2, AMEVCNTVOFF1<n>_EL2  */
        SR_RANGE_TRAP(SYS_AMEVCNTVOFF0n_EL2(0),
                      SYS_AMEVCNTVOFF1n_EL2(15), CGT_HCR_NV),
        /* CNT*_EL2 */
        SR_TRAP(SYS_CNTVOFF_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_CNTPOFF_EL2,        CGT_HCR_NV),
        SR_TRAP(SYS_CNTHCTL_EL2,        CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_CNTHP_TVAL_EL2,
                      SYS_CNTHP_CVAL_EL2, CGT_HCR_NV),
        SR_RANGE_TRAP(SYS_CNTHV_TVAL_EL2,
                      SYS_CNTHV_CVAL_EL2, CGT_HCR_NV),
        /* All _EL02, _EL12 registers up to CNTKCTL_EL12*/
        SR_RANGE_TRAP(sys_reg(3, 5, 0, 0, 0),
                      sys_reg(3, 5, 10, 15, 7), CGT_HCR_NV),
        SR_RANGE_TRAP(sys_reg(3, 5, 12, 0, 0),
                      sys_reg(3, 5, 14, 1, 0), CGT_HCR_NV),
        SR_TRAP(SYS_CNTP_CTL_EL02,        CGT_CNTHCTL_EL1NVPCT),
        SR_TRAP(SYS_CNTP_CVAL_EL02,        CGT_CNTHCTL_EL1NVPCT),
        SR_TRAP(SYS_CNTV_CTL_EL02,        CGT_CNTHCTL_EL1NVVCT),
        SR_TRAP(SYS_CNTV_CVAL_EL02,        CGT_CNTHCTL_EL1NVVCT),
        SR_TRAP(OP_AT_S1E2R,                CGT_HCR_NV),
        SR_TRAP(OP_AT_S1E2W,                CGT_HCR_NV),
        SR_TRAP(OP_AT_S12E1R,                CGT_HCR_NV),
        SR_TRAP(OP_AT_S12E1W,                CGT_HCR_NV),
        SR_TRAP(OP_AT_S12E0R,                CGT_HCR_NV),
        SR_TRAP(OP_AT_S12E0W,                CGT_HCR_NV),
        SR_TRAP(OP_AT_S1E2A,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2E1,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2E1,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2LE1,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2LE1,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVAE2,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVALE2,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE2,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VAE2,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE1,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VALE2,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VMALLS12E1,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2E1NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2E1NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2LE1NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2LE1NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVAE2NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVALE2NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE2NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VAE2NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE1NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VALE2NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VMALLS12E1NXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2E1IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2E1IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2LE1IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2LE1IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVAE2IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVALE2IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE2IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VAE2IS,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE1IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VALE2IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VMALLS12E1IS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2E1ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2E1ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2LE1ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2LE1ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVAE2ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVALE2ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE2ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VAE2ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE1ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VALE2ISNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VMALLS12E1ISNXS,CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE2OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VAE2OS,                CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE1OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VALE2OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VMALLS12E1OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2E1OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2E1OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2LE1OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2LE1OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVAE2OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVALE2OS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE2OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VAE2OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_ALLE1OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VALE2OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_VMALLS12E1OSNXS,CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2E1OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2E1OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_IPAS2LE1OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RIPAS2LE1OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVAE2OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_TLBI_RVALE2OSNXS,        CGT_HCR_NV),
        SR_TRAP(OP_CPP_RCTX,                 CGT_HCR_NV),
        SR_TRAP(OP_DVP_RCTX,                 CGT_HCR_NV),
        SR_TRAP(OP_CFP_RCTX,                 CGT_HCR_NV),
        SR_TRAP(SYS_SP_EL1,                CGT_HCR_NV_nNV2),
        SR_TRAP(SYS_VBAR_EL1,                CGT_HCR_NV1_nNV2),
        SR_TRAP(SYS_ELR_EL1,                CGT_HCR_NV1_nNV2),
        SR_TRAP(SYS_SPSR_EL1,                CGT_HCR_NV1_nNV2),
        SR_TRAP(SYS_SCXTNUM_EL1,        CGT_HCR_NV1_nNV2_ENSCXT),
        SR_TRAP(SYS_SCXTNUM_EL0,        CGT_HCR_ENSCXT),
        SR_TRAP(OP_AT_S1E1R,                 CGT_HCR_AT),
        SR_TRAP(OP_AT_S1E1W,                 CGT_HCR_AT),
        SR_TRAP(OP_AT_S1E0R,                 CGT_HCR_AT),
        SR_TRAP(OP_AT_S1E0W,                 CGT_HCR_AT),
        SR_TRAP(OP_AT_S1E1RP,                 CGT_HCR_AT),
        SR_TRAP(OP_AT_S1E1WP,                 CGT_HCR_AT),
        SR_TRAP(OP_AT_S1E1A,                CGT_HCR_AT),
        SR_TRAP(SYS_ERXPFGF_EL1,        CGT_HCR_nFIEN),
        SR_TRAP(SYS_ERXPFGCTL_EL1,        CGT_HCR_nFIEN),
        SR_TRAP(SYS_ERXPFGCDN_EL1,        CGT_HCR_nFIEN),
        SR_TRAP(SYS_PMCR_EL0,                CGT_MDCR_TPM_TPMCR),
        SR_TRAP(SYS_PMCNTENSET_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMCNTENCLR_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMOVSSET_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMOVSCLR_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMCEID0_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMCEID1_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMXEVTYPER_EL0,        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMSWINC_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMSELR_EL0,                CGT_MDCR_TPM),
        SR_TRAP(SYS_PMXEVCNTR_EL0,        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMCCNTR_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMUSERENR_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMINTENSET_EL1,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMINTENCLR_EL1,        CGT_MDCR_TPM),
        SR_TRAP(SYS_PMMIR_EL1,                CGT_MDCR_TPM),
        SR_TRAP(SYS_PMEVCNTRn_EL0(0),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(1),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(2),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(3),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(4),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(5),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(6),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(7),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(8),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(9),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(10),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(11),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(12),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(13),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(14),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(15),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(16),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(17),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(18),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(19),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(20),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(21),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(22),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(23),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(24),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(25),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(26),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(27),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(28),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(29),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVCNTRn_EL0(30),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(0),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(1),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(2),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(3),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(4),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(5),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(6),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(7),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(8),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(9),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(10),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(11),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(12),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(13),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(14),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(15),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(16),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(17),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(18),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(19),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(20),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(21),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(22),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(23),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(24),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(25),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(26),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(27),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(28),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(29),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMEVTYPERn_EL0(30),        CGT_MDCR_TPM_HPMN),
        SR_TRAP(SYS_PMCCFILTR_EL0,        CGT_MDCR_TPM),
        SR_TRAP(SYS_MDCCSR_EL0,                CGT_MDCR_TDCC_TDE_TDA),
        SR_TRAP(SYS_MDCCINT_EL1,        CGT_MDCR_TDCC_TDE_TDA),
        SR_TRAP(SYS_OSDTRRX_EL1,        CGT_MDCR_TDCC_TDE_TDA),
        SR_TRAP(SYS_OSDTRTX_EL1,        CGT_MDCR_TDCC_TDE_TDA),
        SR_TRAP(SYS_DBGDTR_EL0,                CGT_MDCR_TDCC_TDE_TDA),
        /*
         * Also covers DBGDTRRX_EL0, which has the same encoding as
         * SYS_DBGDTRTX_EL0...
         */
        SR_TRAP(SYS_DBGDTRTX_EL0,        CGT_MDCR_TDCC_TDE_TDA),
        SR_TRAP(SYS_MDSCR_EL1,                CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_OSECCR_EL1,                CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(0),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(1),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(2),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(3),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(4),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(5),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(6),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(7),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(8),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(9),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(10),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(11),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(12),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(13),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(14),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBVRn_EL1(15),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(0),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(1),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(2),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(3),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(4),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(5),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(6),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(7),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(8),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(9),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(10),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(11),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(12),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(13),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(14),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGBCRn_EL1(15),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(0),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(1),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(2),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(3),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(4),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(5),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(6),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(7),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(8),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(9),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(10),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(11),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(12),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(13),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(14),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWVRn_EL1(15),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(0),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(1),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(2),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(3),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(4),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(5),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(6),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(7),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(8),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(9),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(10),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(11),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(12),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(13),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGWCRn_EL1(14),        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGCLAIMSET_EL1,        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGCLAIMCLR_EL1,        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_DBGAUTHSTATUS_EL1,        CGT_MDCR_TDE_TDA),
        SR_TRAP(SYS_OSLAR_EL1,                CGT_MDCR_TDE_TDOSA),
        SR_TRAP(SYS_OSLSR_EL1,                CGT_MDCR_TDE_TDOSA),
        SR_TRAP(SYS_OSDLR_EL1,                CGT_MDCR_TDE_TDOSA),
        SR_TRAP(SYS_DBGPRCR_EL1,        CGT_MDCR_TDE_TDOSA),
        SR_TRAP(SYS_MDRAR_EL1,                CGT_MDCR_TDE_TDRA),
        SR_TRAP(SYS_PMBLIMITR_EL1,        CGT_MDCR_E2PB),
        SR_TRAP(SYS_PMBPTR_EL1,                CGT_MDCR_E2PB),
        SR_TRAP(SYS_PMBSR_EL1,                CGT_MDCR_E2PB),
        SR_TRAP(SYS_PMSCR_EL1,                CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSEVFR_EL1,        CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSFCR_EL1,                CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSICR_EL1,                CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSIDR_EL1,                CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSIRR_EL1,                CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSLATFR_EL1,        CGT_MDCR_TPMS),
        SR_TRAP(SYS_PMSNEVFR_EL1,        CGT_MDCR_TPMS),
        SR_TRAP(SYS_TRFCR_EL1,                CGT_MDCR_TTRF),
        SR_TRAP(SYS_TRBBASER_EL1,        CGT_MDCR_E2TB),
        SR_TRAP(SYS_TRBLIMITR_EL1,        CGT_MDCR_E2TB),
        SR_TRAP(SYS_TRBMAR_EL1,         CGT_MDCR_E2TB),
        SR_TRAP(SYS_TRBPTR_EL1,         CGT_MDCR_E2TB),
        SR_TRAP(SYS_TRBSR_EL1,                 CGT_MDCR_E2TB),
        SR_TRAP(SYS_TRBTRG_EL1,                CGT_MDCR_E2TB),
        SR_TRAP(SYS_CPACR_EL1,                CGT_CPTR_TCPAC),
        SR_TRAP(SYS_AMUSERENR_EL0,        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCFGR_EL0,                CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCGCR_EL0,                CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCNTENCLR0_EL0,        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCNTENCLR1_EL0,        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCNTENSET0_EL0,        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCNTENSET1_EL0,        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMCR_EL0,                CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR0_EL0(0),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR0_EL0(1),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR0_EL0(2),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR0_EL0(3),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(0),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(1),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(2),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(3),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(4),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(5),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(6),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(7),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(8),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(9),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(10),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(11),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(12),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(13),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(14),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVCNTR1_EL0(15),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER0_EL0(0),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER0_EL0(1),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER0_EL0(2),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER0_EL0(3),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(0),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(1),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(2),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(3),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(4),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(5),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(6),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(7),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(8),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(9),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(10),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(11),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(12),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(13),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(14),        CGT_CPTR_TAM),
        SR_TRAP(SYS_AMEVTYPER1_EL0(15),        CGT_CPTR_TAM),
        /* op0=2, op1=1, and CRn<0b1000 */
        SR_RANGE_TRAP(sys_reg(2, 1, 0, 0, 0),
                      sys_reg(2, 1, 7, 15, 7), CGT_CPTR_TTA),
        SR_TRAP(SYS_CNTP_TVAL_EL0,        CGT_CNTHCTL_EL1PTEN),
        SR_TRAP(SYS_CNTP_CVAL_EL0,        CGT_CNTHCTL_EL1PTEN),
        SR_TRAP(SYS_CNTP_CTL_EL0,        CGT_CNTHCTL_EL1PTEN),
        SR_TRAP(SYS_CNTPCT_EL0,                CGT_CNTHCTL_EL1PCTEN),
        SR_TRAP(SYS_CNTPCTSS_EL0,        CGT_CNTHCTL_EL1PCTEN),
        SR_TRAP(SYS_CNTV_TVAL_EL0,        CGT_CNTHCTL_EL1TVT),
        SR_TRAP(SYS_CNTV_CVAL_EL0,        CGT_CNTHCTL_EL1TVT),
        SR_TRAP(SYS_CNTV_CTL_EL0,        CGT_CNTHCTL_EL1TVT),
        SR_TRAP(SYS_CNTVCT_EL0,                CGT_CNTHCTL_EL1TVCT),
        SR_TRAP(SYS_CNTVCTSS_EL0,        CGT_CNTHCTL_EL1TVCT),
        SR_TRAP(SYS_FPMR,                CGT_HCRX_EnFPM),
        /*
         * IMPDEF choice:
         * We treat ICC_SRE_EL2.{SRE,Enable} and ICV_SRE_EL1.SRE as
         * RAO/WI. We therefore never consider ICC_SRE_EL2.Enable for
         * ICC_SRE_EL1 access, and always handle it locally.
         */
        SR_TRAP(SYS_ICC_AP0R0_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_AP0R1_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_AP0R2_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_AP0R3_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_AP1R0_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_AP1R1_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_AP1R2_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_AP1R3_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_BPR0_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_BPR1_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_CTLR_EL1,        CGT_ICH_HCR_TC),
        SR_TRAP(SYS_ICC_DIR_EL1,        CGT_ICH_HCR_TC_TDIR),
        SR_TRAP(SYS_ICC_EOIR0_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_EOIR1_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_HPPIR0_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_HPPIR1_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_IAR0_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_IAR1_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_IGRPEN0_EL1,        CGT_ICH_HCR_TALL0),
        SR_TRAP(SYS_ICC_IGRPEN1_EL1,        CGT_ICH_HCR_TALL1),
        SR_TRAP(SYS_ICC_PMR_EL1,        CGT_ICH_HCR_TC),
        SR_TRAP(SYS_ICC_RPR_EL1,        CGT_ICH_HCR_TC),
};

/*
 * File-local xarray.
 * NOTE(review): presumably filled from the trap tables in this file and
 * looked up by system register encoding -- confirm against its users.
 */
static DEFINE_XARRAY(sr_forward_xa);

/*
 * Identifiers for the extra filter that can be attached to a fine-grained
 * trap entry through the .fgf field set up by __FGT().
 */
enum fg_filter_id {
        /* No additional filter (what FGT() and SR_FGT() select) */
        __NO_FGF__ = 0,
        /* Entry must also be checked against HCRX_EL2.FGTnXS */
        HCRX_FGTnXS,

        /* Number of filter IDs: keep this as the final enumerator */
        __NR_FG_FILTER_IDS__
};

/*
 * Initializer for a fine-grained trap descriptor.
 *
 *   g: register group prefix, pasted into g_GROUP and g_EL2_<b>_SHIFT
 *   b: name of the trap bit within the g_EL2 register
 *   p: value stored in .pol (the table below passes 0 for the n-prefixed
 *      bits and 1 for all the others)
 *   f: filter stored in .fgf, one of enum fg_filter_id
 */
#define __FGT(g, b, p, f)                               \
        {                                               \
                .fgt = g ## _GROUP,                     \
                .bit = g ## _EL2_ ## b ## _SHIFT,       \
                .pol = p,                               \
                .fgf = f,                               \
        }

/* Same descriptor, with no additional filter attached */
#define FGT(g, b, p)                                    \
        __FGT(g, b, p, __NO_FGF__)

/*
 * Fine-grained trap table entries.
 *
 * SR_FGF_RANGE() records the start encoding 'sr', the end encoding 'e',
 * the __FGT(g, b, p, f) descriptor and the source line of the entry.
 * Ranges deserve the same level of caution as spelled out in the warning
 * next to SR_RANGE_TRAP().
 */
#define SR_FGF_RANGE(sr, e, g, b, p, f)                         \
        {                                                       \
                .encoding = sr,                                 \
                .end      = e,                                  \
                .tc       = __FGT(g, b, p, f),                  \
                .line     = __LINE__,                           \
        }

/* Single encoding (start == end), explicit filter */
#define SR_FGF(sr, g, b, p, f)                                  \
        SR_FGF_RANGE(sr, sr, g, b, p, f)

/* Single encoding, no filter */
#define SR_FGT(sr, g, b, p)                                     \
        SR_FGF(sr, g, b, p, __NO_FGF__)

/* Range of encodings, no filter */
#define SR_FGT_RANGE(sr, end, g, b, p)                          \
        SR_FGF_RANGE(sr, end, g, b, p, __NO_FGF__)

static const struct encoding_to_trap_config encoding_to_fgt[] __initconst = {
        /* HFGRTR_EL2, HFGWTR_EL2 */
        SR_FGT(SYS_AMAIR2_EL1,                HFGRTR, nAMAIR2_EL1, 0),
        SR_FGT(SYS_MAIR2_EL1,                HFGRTR, nMAIR2_EL1, 0),
        SR_FGT(SYS_S2POR_EL1,                HFGRTR, nS2POR_EL1, 0),
        SR_FGT(SYS_POR_EL1,                HFGRTR, nPOR_EL1, 0),
        SR_FGT(SYS_POR_EL0,                HFGRTR, nPOR_EL0, 0),
        SR_FGT(SYS_PIR_EL1,                HFGRTR, nPIR_EL1, 0),
        SR_FGT(SYS_PIRE0_EL1,                HFGRTR, nPIRE0_EL1, 0),
        SR_FGT(SYS_RCWMASK_EL1,                HFGRTR, nRCWMASK_EL1, 0),
        SR_FGT(SYS_TPIDR2_EL0,                HFGRTR, nTPIDR2_EL0, 0),
        SR_FGT(SYS_SMPRI_EL1,                HFGRTR, nSMPRI_EL1, 0),
        SR_FGT(SYS_GCSCR_EL1,                HFGRTR, nGCS_EL1, 0),
        SR_FGT(SYS_GCSPR_EL1,                HFGRTR, nGCS_EL1, 0),
        SR_FGT(SYS_GCSCRE0_EL1,                HFGRTR, nGCS_EL0, 0),
        SR_FGT(SYS_GCSPR_EL0,                HFGRTR, nGCS_EL0, 0),
        SR_FGT(SYS_ACCDATA_EL1,                HFGRTR, nACCDATA_EL1, 0),
        SR_FGT(SYS_ERXADDR_EL1,                HFGRTR, ERXADDR_EL1, 1),
        SR_FGT(SYS_ERXPFGCDN_EL1,        HFGRTR, ERXPFGCDN_EL1, 1),
        SR_FGT(SYS_ERXPFGCTL_EL1,        HFGRTR, ERXPFGCTL_EL1, 1),
        SR_FGT(SYS_ERXPFGF_EL1,                HFGRTR, ERXPFGF_EL1, 1),
        SR_FGT(SYS_ERXMISC0_EL1,        HFGRTR, ERXMISCn_EL1, 1),
        SR_FGT(SYS_ERXMISC1_EL1,        HFGRTR, ERXMISCn_EL1, 1),
        SR_FGT(SYS_ERXMISC2_EL1,        HFGRTR, ERXMISCn_EL1, 1),
        SR_FGT(SYS_ERXMISC3_EL1,        HFGRTR, ERXMISCn_EL1, 1),
        SR_FGT(SYS_ERXSTATUS_EL1,        HFGRTR, ERXSTATUS_EL1, 1),
        SR_FGT(SYS_ERXCTLR_EL1,                HFGRTR, ERXCTLR_EL1, 1),
        SR_FGT(SYS_ERXFR_EL1,                HFGRTR, ERXFR_EL1, 1),
        SR_FGT(SYS_ERRSELR_EL1,                HFGRTR, ERRSELR_EL1, 1),
        SR_FGT(SYS_ERRIDR_EL1,                HFGRTR, ERRIDR_EL1, 1),
        SR_FGT(SYS_ICC_IGRPEN0_EL1,        HFGRTR, ICC_IGRPENn_EL1, 1),
        SR_FGT(SYS_ICC_IGRPEN1_EL1,        HFGRTR, ICC_IGRPENn_EL1, 1),
        SR_FGT(SYS_VBAR_EL1,                HFGRTR, VBAR_EL1, 1),
        SR_FGT(SYS_TTBR1_EL1,                HFGRTR, TTBR1_EL1, 1),
        SR_FGT(SYS_TTBR0_EL1,                HFGRTR, TTBR0_EL1, 1),
        SR_FGT(SYS_TPIDR_EL0,                HFGRTR, TPIDR_EL0, 1),
        SR_FGT(SYS_TPIDRRO_EL0,                HFGRTR, TPIDRRO_EL0, 1),
        SR_FGT(SYS_TPIDR_EL1,                HFGRTR, TPIDR_EL1, 1),
        SR_FGT(SYS_TCR_EL1,                HFGRTR, TCR_EL1, 1),
        SR_FGT(SYS_TCR2_EL1,                HFGRTR, TCR_EL1, 1),
        SR_FGT(SYS_SCXTNUM_EL0,                HFGRTR, SCXTNUM_EL0, 1),
        SR_FGT(SYS_SCXTNUM_EL1,         HFGRTR, SCXTNUM_EL1, 1),
        SR_FGT(SYS_SCTLR_EL1,                 HFGRTR, SCTLR_EL1, 1),
        SR_FGT(SYS_REVIDR_EL1,                 HFGRTR, REVIDR_EL1, 1),
        SR_FGT(SYS_PAR_EL1,                 HFGRTR, PAR_EL1, 1),
        SR_FGT(SYS_MPIDR_EL1,                 HFGRTR, MPIDR_EL1, 1),
        SR_FGT(SYS_MIDR_EL1,                 HFGRTR, MIDR_EL1, 1),
        SR_FGT(SYS_MAIR_EL1,                 HFGRTR, MAIR_EL1, 1),
        SR_FGT(SYS_LORSA_EL1,                 HFGRTR, LORSA_EL1, 1),
        SR_FGT(SYS_LORN_EL1,                 HFGRTR, LORN_EL1, 1),
        SR_FGT(SYS_LORID_EL1,                 HFGRTR, LORID_EL1, 1),
        SR_FGT(SYS_LOREA_EL1,                 HFGRTR, LOREA_EL1, 1),
        SR_FGT(SYS_LORC_EL1,                 HFGRTR, LORC_EL1, 1),
        SR_FGT(SYS_ISR_EL1,                 HFGRTR, ISR_EL1, 1),
        SR_FGT(SYS_FAR_EL1,                 HFGRTR, FAR_EL1, 1),
        SR_FGT(SYS_ESR_EL1,                 HFGRTR, ESR_EL1, 1),
        SR_FGT(SYS_DCZID_EL0,                 HFGRTR, DCZID_EL0, 1),
        SR_FGT(SYS_CTR_EL0,                 HFGRTR, CTR_EL0, 1),
        SR_FGT(SYS_CSSELR_EL1,                 HFGRTR, CSSELR_EL1, 1),
        SR_FGT(SYS_CPACR_EL1,                 HFGRTR, CPACR_EL1, 1),
        SR_FGT(SYS_CONTEXTIDR_EL1,         HFGRTR, CONTEXTIDR_EL1, 1),
        SR_FGT(SYS_CLIDR_EL1,                 HFGRTR, CLIDR_EL1, 1),
        SR_FGT(SYS_CCSIDR_EL1,                 HFGRTR, CCSIDR_EL1, 1),
        SR_FGT(SYS_APIBKEYLO_EL1,         HFGRTR, APIBKey, 1),
        SR_FGT(SYS_APIBKEYHI_EL1,         HFGRTR, APIBKey, 1),
        SR_FGT(SYS_APIAKEYLO_EL1,         HFGRTR, APIAKey, 1),
        SR_FGT(SYS_APIAKEYHI_EL1,         HFGRTR, APIAKey, 1),
        SR_FGT(SYS_APGAKEYLO_EL1,         HFGRTR, APGAKey, 1),
        SR_FGT(SYS_APGAKEYHI_EL1,         HFGRTR, APGAKey, 1),
        SR_FGT(SYS_APDBKEYLO_EL1,         HFGRTR, APDBKey, 1),
        SR_FGT(SYS_APDBKEYHI_EL1,         HFGRTR, APDBKey, 1),
        SR_FGT(SYS_APDAKEYLO_EL1,         HFGRTR, APDAKey, 1),
        SR_FGT(SYS_APDAKEYHI_EL1,         HFGRTR, APDAKey, 1),
        SR_FGT(SYS_AMAIR_EL1,                 HFGRTR, AMAIR_EL1, 1),
        SR_FGT(SYS_AIDR_EL1,                 HFGRTR, AIDR_EL1, 1),
        SR_FGT(SYS_AFSR1_EL1,                 HFGRTR, AFSR1_EL1, 1),
        SR_FGT(SYS_AFSR0_EL1,                 HFGRTR, AFSR0_EL1, 1),

        /* HFGRTR2_EL2, HFGWTR2_EL2 */
        SR_FGT(SYS_ACTLRALIAS_EL1,        HFGRTR2, nACTLRALIAS_EL1, 0),
        SR_FGT(SYS_ACTLRMASK_EL1,        HFGRTR2, nACTLRMASK_EL1, 0),
        SR_FGT(SYS_CPACRALIAS_EL1,        HFGRTR2, nCPACRALIAS_EL1, 0),
        SR_FGT(SYS_CPACRMASK_EL1,        HFGRTR2, nCPACRMASK_EL1, 0),
        SR_FGT(SYS_PFAR_EL1,                HFGRTR2, nPFAR_EL1, 0),
        SR_FGT(SYS_RCWSMASK_EL1,        HFGRTR2, nRCWSMASK_EL1, 0),
        SR_FGT(SYS_SCTLR2ALIAS_EL1,        HFGRTR2, nSCTLRALIAS2_EL1, 0),
        SR_FGT(SYS_SCTLR2MASK_EL1,        HFGRTR2, nSCTLR2MASK_EL1, 0),
        SR_FGT(SYS_SCTLRALIAS_EL1,        HFGRTR2, nSCTLRALIAS_EL1, 0),
        SR_FGT(SYS_SCTLRMASK_EL1,        HFGRTR2, nSCTLRMASK_EL1, 0),
        SR_FGT(SYS_TCR2ALIAS_EL1,        HFGRTR2, nTCR2ALIAS_EL1, 0),
        SR_FGT(SYS_TCR2MASK_EL1,        HFGRTR2, nTCR2MASK_EL1, 0),
        SR_FGT(SYS_TCRALIAS_EL1,        HFGRTR2, nTCRALIAS_EL1, 0),
        SR_FGT(SYS_TCRMASK_EL1,                HFGRTR2, nTCRMASK_EL1, 0),
        SR_FGT(SYS_ERXGSR_EL1,                HFGRTR2, nERXGSR_EL1, 0),

        /* HFGITR_EL2 */
        SR_FGT(OP_AT_S1E1A,                 HFGITR, ATS1E1A, 1),
        SR_FGT(OP_COSP_RCTX,                 HFGITR, COSPRCTX, 1),
        SR_FGT(OP_GCSPUSHX,                 HFGITR, nGCSEPP, 0),
        SR_FGT(OP_GCSPOPX,                 HFGITR, nGCSEPP, 0),
        SR_FGT(OP_GCSPUSHM,                 HFGITR, nGCSPUSHM_EL1, 0),
        SR_FGT(OP_BRB_IALL,                 HFGITR, nBRBIALL, 0),
        SR_FGT(OP_BRB_INJ,                 HFGITR, nBRBINJ, 0),
        SR_FGT(SYS_DC_CVAC,                 HFGITR, DCCVAC, 1),
        SR_FGT(SYS_DC_CGVAC,                 HFGITR, DCCVAC, 1),
        SR_FGT(SYS_DC_CGDVAC,                 HFGITR, DCCVAC, 1),
        SR_FGT(OP_CPP_RCTX,                 HFGITR, CPPRCTX, 1),
        SR_FGT(OP_DVP_RCTX,                 HFGITR, DVPRCTX, 1),
        SR_FGT(OP_CFP_RCTX,                 HFGITR, CFPRCTX, 1),
        SR_FGT(OP_TLBI_VAALE1,                 HFGITR, TLBIVAALE1, 1),
        SR_FGT(OP_TLBI_VALE1,                 HFGITR, TLBIVALE1, 1),
        SR_FGT(OP_TLBI_VAAE1,                 HFGITR, TLBIVAAE1, 1),
        SR_FGT(OP_TLBI_ASIDE1,                 HFGITR, TLBIASIDE1, 1),
        SR_FGT(OP_TLBI_VAE1,                 HFGITR, TLBIVAE1, 1),
        SR_FGT(OP_TLBI_VMALLE1,         HFGITR, TLBIVMALLE1, 1),
        SR_FGT(OP_TLBI_RVAALE1,         HFGITR, TLBIRVAALE1, 1),
        SR_FGT(OP_TLBI_RVALE1,                 HFGITR, TLBIRVALE1, 1),
        SR_FGT(OP_TLBI_RVAAE1,                 HFGITR, TLBIRVAAE1, 1),
        SR_FGT(OP_TLBI_RVAE1,                 HFGITR, TLBIRVAE1, 1),
        SR_FGT(OP_TLBI_RVAALE1IS,         HFGITR, TLBIRVAALE1IS, 1),
        SR_FGT(OP_TLBI_RVALE1IS,         HFGITR, TLBIRVALE1IS, 1),
        SR_FGT(OP_TLBI_RVAAE1IS,         HFGITR, TLBIRVAAE1IS, 1),
        SR_FGT(OP_TLBI_RVAE1IS,         HFGITR, TLBIRVAE1IS, 1),
        SR_FGT(OP_TLBI_VAALE1IS,         HFGITR, TLBIVAALE1IS, 1),
        SR_FGT(OP_TLBI_VALE1IS,         HFGITR, TLBIVALE1IS, 1),
        SR_FGT(OP_TLBI_VAAE1IS,         HFGITR, TLBIVAAE1IS, 1),
        SR_FGT(OP_TLBI_ASIDE1IS,         HFGITR, TLBIASIDE1IS, 1),
        SR_FGT(OP_TLBI_VAE1IS,                 HFGITR, TLBIVAE1IS, 1),
        SR_FGT(OP_TLBI_VMALLE1IS,         HFGITR, TLBIVMALLE1IS, 1),
        SR_FGT(OP_TLBI_RVAALE1OS,         HFGITR, TLBIRVAALE1OS, 1),
        SR_FGT(OP_TLBI_RVALE1OS,         HFGITR, TLBIRVALE1OS, 1),
        SR_FGT(OP_TLBI_RVAAE1OS,         HFGITR, TLBIRVAAE1OS, 1),
        SR_FGT(OP_TLBI_RVAE1OS,         HFGITR, TLBIRVAE1OS, 1),
        SR_FGT(OP_TLBI_VAALE1OS,         HFGITR, TLBIVAALE1OS, 1),
        SR_FGT(OP_TLBI_VALE1OS,         HFGITR, TLBIVALE1OS, 1),
        SR_FGT(OP_TLBI_VAAE1OS,         HFGITR, TLBIVAAE1OS, 1),
        SR_FGT(OP_TLBI_ASIDE1OS,         HFGITR, TLBIASIDE1OS, 1),
        SR_FGT(OP_TLBI_VAE1OS,                 HFGITR, TLBIVAE1OS, 1),
        SR_FGT(OP_TLBI_VMALLE1OS,         HFGITR, TLBIVMALLE1OS, 1),
        /* nXS variants must be checked against HCRX_EL2.FGTnXS */
        SR_FGF(OP_TLBI_VAALE1NXS,         HFGITR, TLBIVAALE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VALE1NXS,         HFGITR, TLBIVALE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAAE1NXS,         HFGITR, TLBIVAAE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_ASIDE1NXS,         HFGITR, TLBIASIDE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAE1NXS,         HFGITR, TLBIVAE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VMALLE1NXS,         HFGITR, TLBIVMALLE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAALE1NXS,         HFGITR, TLBIRVAALE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVALE1NXS,         HFGITR, TLBIRVALE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAAE1NXS,         HFGITR, TLBIRVAAE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAE1NXS,         HFGITR, TLBIRVAE1, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAALE1ISNXS,         HFGITR, TLBIRVAALE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVALE1ISNXS,         HFGITR, TLBIRVALE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAAE1ISNXS,         HFGITR, TLBIRVAAE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAE1ISNXS,         HFGITR, TLBIRVAE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAALE1ISNXS,         HFGITR, TLBIVAALE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VALE1ISNXS,         HFGITR, TLBIVALE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAAE1ISNXS,         HFGITR, TLBIVAAE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_ASIDE1ISNXS,         HFGITR, TLBIASIDE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAE1ISNXS,         HFGITR, TLBIVAE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VMALLE1ISNXS,         HFGITR, TLBIVMALLE1IS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAALE1OSNXS,         HFGITR, TLBIRVAALE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVALE1OSNXS,         HFGITR, TLBIRVALE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAAE1OSNXS,         HFGITR, TLBIRVAAE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_RVAE1OSNXS,         HFGITR, TLBIRVAE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAALE1OSNXS,         HFGITR, TLBIVAALE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VALE1OSNXS,         HFGITR, TLBIVALE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAAE1OSNXS,         HFGITR, TLBIVAAE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_ASIDE1OSNXS,         HFGITR, TLBIASIDE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VAE1OSNXS,         HFGITR, TLBIVAE1OS, 1, HCRX_FGTnXS),
        SR_FGF(OP_TLBI_VMALLE1OSNXS,         HFGITR, TLBIVMALLE1OS, 1, HCRX_FGTnXS),
        SR_FGT(OP_AT_S1E1WP,                 HFGITR, ATS1E1WP, 1),
        SR_FGT(OP_AT_S1E1RP,                 HFGITR, ATS1E1RP, 1),
        SR_FGT(OP_AT_S1E0W,                 HFGITR, ATS1E0W, 1),
        SR_FGT(OP_AT_S1E0R,                 HFGITR, ATS1E0R, 1),
        SR_FGT(OP_AT_S1E1W,                 HFGITR, ATS1E1W, 1),
        SR_FGT(OP_AT_S1E1R,                 HFGITR, ATS1E1R, 1),
        SR_FGT(SYS_DC_ZVA,                 HFGITR, DCZVA, 1),
        SR_FGT(SYS_DC_GVA,                 HFGITR, DCZVA, 1),
        SR_FGT(SYS_DC_GZVA,                 HFGITR, DCZVA, 1),
        SR_FGT(SYS_DC_CIVAC,                 HFGITR, DCCIVAC, 1),
        SR_FGT(SYS_DC_CIGVAC,                 HFGITR, DCCIVAC, 1),
        SR_FGT(SYS_DC_CIGDVAC,                 HFGITR, DCCIVAC, 1),
        SR_FGT(SYS_DC_CVADP,                 HFGITR, DCCVADP, 1),
        SR_FGT(SYS_DC_CGVADP,                 HFGITR, DCCVADP, 1),
        SR_FGT(SYS_DC_CGDVADP,                 HFGITR, DCCVADP, 1),
        SR_FGT(SYS_DC_CVAP,                 HFGITR, DCCVAP, 1),
        SR_FGT(SYS_DC_CGVAP,                 HFGITR, DCCVAP, 1),
        SR_FGT(SYS_DC_CGDVAP,                 HFGITR, DCCVAP, 1),
        SR_FGT(SYS_DC_CVAU,                 HFGITR, DCCVAU, 1),
        SR_FGT(SYS_DC_CISW,                 HFGITR, DCCISW, 1),
        SR_FGT(SYS_DC_CIGSW,                 HFGITR, DCCISW, 1),
        SR_FGT(SYS_DC_CIGDSW,                 HFGITR, DCCISW, 1),
        SR_FGT(SYS_DC_CSW,                 HFGITR, DCCSW, 1),
        SR_FGT(SYS_DC_CGSW,                 HFGITR, DCCSW, 1),
        SR_FGT(SYS_DC_CGDSW,                 HFGITR, DCCSW, 1),
        SR_FGT(SYS_DC_ISW,                 HFGITR, DCISW, 1),
        SR_FGT(SYS_DC_IGSW,                 HFGITR, DCISW, 1),
        SR_FGT(SYS_DC_IGDSW,                 HFGITR, DCISW, 1),
        SR_FGT(SYS_DC_IVAC,                 HFGITR, DCIVAC, 1),
        SR_FGT(SYS_DC_IGVAC,                 HFGITR, DCIVAC, 1),
        SR_FGT(SYS_DC_IGDVAC,                 HFGITR, DCIVAC, 1),
        SR_FGT(SYS_IC_IVAU,                 HFGITR, ICIVAU, 1),
        SR_FGT(SYS_IC_IALLU,                 HFGITR, ICIALLU, 1),
        SR_FGT(SYS_IC_IALLUIS,                 HFGITR, ICIALLUIS, 1),

        /* HFGITR2_EL2 */
        SR_FGT(SYS_DC_CIGDVAPS,                HFGITR2, nDCCIVAPS, 0),
        SR_FGT(SYS_DC_CIVAPS,                HFGITR2, nDCCIVAPS, 0),

        /* HDFGRTR_EL2 */
        SR_FGT(SYS_PMBIDR_EL1,                 HDFGRTR, PMBIDR_EL1, 1),
        SR_FGT(SYS_PMSNEVFR_EL1,         HDFGRTR, nPMSNEVFR_EL1, 0),
        SR_FGT(SYS_BRBINF_EL1(0),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(1),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(2),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(3),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(4),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(5),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(6),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(7),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(8),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(9),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(10),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(11),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(12),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(13),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(14),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(15),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(16),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(17),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(18),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(19),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(20),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(21),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(22),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(23),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(24),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(25),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(26),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(27),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(28),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(29),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(30),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINF_EL1(31),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBINFINJ_EL1,         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(0),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(1),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(2),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(3),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(4),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(5),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(6),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(7),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(8),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(9),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(10),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(11),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(12),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(13),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(14),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(15),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(16),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(17),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(18),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(19),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(20),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(21),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(22),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(23),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(24),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(25),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(26),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(27),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(28),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(29),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(30),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRC_EL1(31),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBSRCINJ_EL1,         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(0),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(1),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(2),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(3),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(4),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(5),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(6),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(7),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(8),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(9),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(10),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(11),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(12),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(13),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(14),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(15),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(16),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(17),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(18),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(19),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(20),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(21),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(22),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(23),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(24),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(25),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(26),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(27),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(28),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(29),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(30),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGT_EL1(31),         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTGTINJ_EL1,         HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBTS_EL1,                 HDFGRTR, nBRBDATA, 0),
        SR_FGT(SYS_BRBCR_EL1,                 HDFGRTR, nBRBCTL, 0),
        SR_FGT(SYS_BRBFCR_EL1,                 HDFGRTR, nBRBCTL, 0),
        SR_FGT(SYS_BRBIDR0_EL1,         HDFGRTR, nBRBIDR, 0),
        SR_FGT(SYS_PMCEID0_EL0,         HDFGRTR, PMCEIDn_EL0, 1),
        SR_FGT(SYS_PMCEID1_EL0,         HDFGRTR, PMCEIDn_EL0, 1),
        SR_FGT(SYS_PMUSERENR_EL0,         HDFGRTR, PMUSERENR_EL0, 1),
        SR_FGT(SYS_TRBTRG_EL1,                 HDFGRTR, TRBTRG_EL1, 1),
        SR_FGT(SYS_TRBSR_EL1,                 HDFGRTR, TRBSR_EL1, 1),
        SR_FGT(SYS_TRBPTR_EL1,                 HDFGRTR, TRBPTR_EL1, 1),
        SR_FGT(SYS_TRBMAR_EL1,                 HDFGRTR, TRBMAR_EL1, 1),
        SR_FGT(SYS_TRBLIMITR_EL1,         HDFGRTR, TRBLIMITR_EL1, 1),
        SR_FGT(SYS_TRBIDR_EL1,                 HDFGRTR, TRBIDR_EL1, 1),
        SR_FGT(SYS_TRBBASER_EL1,         HDFGRTR, TRBBASER_EL1, 1),
        SR_FGT(SYS_TRCVICTLR,                 HDFGRTR, TRCVICTLR, 1),
        SR_FGT(SYS_TRCSTATR,                 HDFGRTR, TRCSTATR, 1),
        SR_FGT(SYS_TRCSSCSR(0),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(1),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(2),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(3),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(4),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(5),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(6),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSSCSR(7),         HDFGRTR, TRCSSCSRn, 1),
        SR_FGT(SYS_TRCSEQSTR,                 HDFGRTR, TRCSEQSTR, 1),
        SR_FGT(SYS_TRCPRGCTLR,                 HDFGRTR, TRCPRGCTLR, 1),
        SR_FGT(SYS_TRCOSLSR,                 HDFGRTR, TRCOSLSR, 1),
        SR_FGT(SYS_TRCIMSPEC(0),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(1),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(2),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(3),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(4),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(5),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(6),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCIMSPEC(7),         HDFGRTR, TRCIMSPECn, 1),
        SR_FGT(SYS_TRCDEVARCH,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCDEVID,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR0,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR1,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR2,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR3,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR4,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR5,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR6,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR7,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR8,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR9,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR10,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR11,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR12,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCIDR13,                 HDFGRTR, TRCID, 1),
        SR_FGT(SYS_TRCCNTVR(0),         HDFGRTR, TRCCNTVRn, 1),
        SR_FGT(SYS_TRCCNTVR(1),         HDFGRTR, TRCCNTVRn, 1),
        SR_FGT(SYS_TRCCNTVR(2),         HDFGRTR, TRCCNTVRn, 1),
        SR_FGT(SYS_TRCCNTVR(3),         HDFGRTR, TRCCNTVRn, 1),
        SR_FGT(SYS_TRCCLAIMCLR,         HDFGRTR, TRCCLAIM, 1),
        SR_FGT(SYS_TRCCLAIMSET,         HDFGRTR, TRCCLAIM, 1),
        SR_FGT(SYS_TRCAUXCTLR,                 HDFGRTR, TRCAUXCTLR, 1),
        SR_FGT(SYS_TRCAUTHSTATUS,         HDFGRTR, TRCAUTHSTATUS, 1),
        SR_FGT(SYS_TRCACATR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(4),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(5),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(6),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(7),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(8),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(9),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(10),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(11),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(12),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(13),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(14),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACATR(15),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(0),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(1),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(2),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(3),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(4),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(5),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(6),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(7),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(8),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(9),                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(10),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(11),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(12),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(13),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(14),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCACVR(15),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCBBCTLR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCCCTLR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCCTLR0,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCCTLR1,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(4),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(5),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(6),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCIDCVR(7),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTCTLR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTCTLR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTCTLR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTCTLR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTRLDVR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTRLDVR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTRLDVR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCNTRLDVR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCCONFIGR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCEVENTCTL0R,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCEVENTCTL1R,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCEXTINSELR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCEXTINSELR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCEXTINSELR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCEXTINSELR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCQCTLR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(4),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(5),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(6),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(7),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(8),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(9),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(10),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(11),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(12),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(13),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(14),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(15),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(16),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(17),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(18),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(19),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(20),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(21),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(22),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(23),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(24),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(25),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(26),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(27),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(28),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(29),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(30),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSCTLR(31),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCRSR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSEQEVR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSEQEVR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSEQEVR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSEQRSTEVR,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(4),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(5),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(6),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSCCR(7),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(4),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(5),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(6),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSSPCICR(7),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSTALLCTLR,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCSYNCPR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCTRACEIDR,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCTSCTLR,                 HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVIIECTLR,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVIPCSSCTLR,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVISSCTLR,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCCTLR0,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCCTLR1,         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(0),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(1),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(2),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(3),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(4),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(5),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(6),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_TRCVMIDCVR(7),         HDFGRTR, TRC, 1),
        SR_FGT(SYS_PMSLATFR_EL1,         HDFGRTR, PMSLATFR_EL1, 1),
        SR_FGT(SYS_PMSIRR_EL1,                 HDFGRTR, PMSIRR_EL1, 1),
        SR_FGT(SYS_PMSIDR_EL1,                 HDFGRTR, PMSIDR_EL1, 1),
        SR_FGT(SYS_PMSICR_EL1,                 HDFGRTR, PMSICR_EL1, 1),
        SR_FGT(SYS_PMSFCR_EL1,                 HDFGRTR, PMSFCR_EL1, 1),
        SR_FGT(SYS_PMSEVFR_EL1,         HDFGRTR, PMSEVFR_EL1, 1),
        SR_FGT(SYS_PMSCR_EL1,                 HDFGRTR, PMSCR_EL1, 1),
        SR_FGT(SYS_PMBSR_EL1,                 HDFGRTR, PMBSR_EL1, 1),
        SR_FGT(SYS_PMBPTR_EL1,                 HDFGRTR, PMBPTR_EL1, 1),
        SR_FGT(SYS_PMBLIMITR_EL1,         HDFGRTR, PMBLIMITR_EL1, 1),
        SR_FGT(SYS_PMMIR_EL1,                 HDFGRTR, PMMIR_EL1, 1),
        SR_FGT(SYS_PMSELR_EL0,                 HDFGRTR, PMSELR_EL0, 1),
        SR_FGT(SYS_PMOVSCLR_EL0,         HDFGRTR, PMOVS, 1),
        SR_FGT(SYS_PMOVSSET_EL0,         HDFGRTR, PMOVS, 1),
        SR_FGT(SYS_PMINTENCLR_EL1,         HDFGRTR, PMINTEN, 1),
        SR_FGT(SYS_PMINTENSET_EL1,         HDFGRTR, PMINTEN, 1),
        SR_FGT(SYS_PMCNTENCLR_EL0,         HDFGRTR, PMCNTEN, 1),
        SR_FGT(SYS_PMCNTENSET_EL0,         HDFGRTR, PMCNTEN, 1),
        SR_FGT(SYS_PMCCNTR_EL0,         HDFGRTR, PMCCNTR_EL0, 1),
        SR_FGT(SYS_PMCCFILTR_EL0,         HDFGRTR, PMCCFILTR_EL0, 1),
        SR_FGT_RANGE(SYS_PMEVTYPERn_EL0(0),
                     SYS_PMEVTYPERn_EL0(30),
                     HDFGRTR, PMEVTYPERn_EL0, 1),
        SR_FGT_RANGE(SYS_PMEVCNTRn_EL0(0),
                     SYS_PMEVCNTRn_EL0(30),
                     HDFGRTR, PMEVCNTRn_EL0, 1),
        SR_FGT(SYS_OSDLR_EL1,                 HDFGRTR, OSDLR_EL1, 1),
        SR_FGT(SYS_OSECCR_EL1,                 HDFGRTR, OSECCR_EL1, 1),
        SR_FGT(SYS_OSLSR_EL1,                 HDFGRTR, OSLSR_EL1, 1),
        SR_FGT(SYS_DBGPRCR_EL1,         HDFGRTR, DBGPRCR_EL1, 1),
        SR_FGT(SYS_DBGAUTHSTATUS_EL1,         HDFGRTR, DBGAUTHSTATUS_EL1, 1),
        SR_FGT(SYS_DBGCLAIMSET_EL1,         HDFGRTR, DBGCLAIM, 1),
        SR_FGT(SYS_DBGCLAIMCLR_EL1,         HDFGRTR, DBGCLAIM, 1),
        SR_FGT(SYS_MDSCR_EL1,                 HDFGRTR, MDSCR_EL1, 1),
        /*
         * The trap bits capture *64* debug registers per bit, but the
         * ARM ARM only describes the encoding for the first 16, and
         * we don't really support more than that anyway.
         */
        SR_FGT(SYS_DBGWVRn_EL1(0),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(1),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(2),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(3),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(4),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(5),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(6),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(7),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(8),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(9),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(10),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(11),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(12),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(13),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(14),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWVRn_EL1(15),         HDFGRTR, DBGWVRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(0),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(1),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(2),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(3),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(4),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(5),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(6),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(7),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(8),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(9),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(10),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(11),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(12),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(13),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(14),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGWCRn_EL1(15),         HDFGRTR, DBGWCRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(0),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(1),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(2),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(3),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(4),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(5),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(6),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(7),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(8),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(9),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(10),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(11),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(12),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(13),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(14),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBVRn_EL1(15),         HDFGRTR, DBGBVRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(0),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(1),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(2),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(3),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(4),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(5),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(6),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(7),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(8),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(9),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(10),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(11),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(12),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(13),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(14),         HDFGRTR, DBGBCRn_EL1, 1),
        SR_FGT(SYS_DBGBCRn_EL1(15),         HDFGRTR, DBGBCRn_EL1, 1),

        /* HDFGRTR2_EL2 */
        SR_FGT(SYS_MDSELR_EL1,                HDFGRTR2, nMDSELR_EL1, 0),
        SR_FGT(SYS_MDSTEPOP_EL1,        HDFGRTR2, nMDSTEPOP_EL1, 0),
        SR_FGT(SYS_PMCCNTSVR_EL1,        HDFGRTR2, nPMSSDATA, 0),
        SR_FGT_RANGE(SYS_PMEVCNTSVRn_EL1(0),
                     SYS_PMEVCNTSVRn_EL1(30),
                     HDFGRTR2, nPMSSDATA, 0),
        SR_FGT(SYS_PMICNTSVR_EL1,        HDFGRTR2, nPMSSDATA, 0),
        SR_FGT(SYS_PMECR_EL1,                HDFGRTR2, nPMECR_EL1, 0),
        SR_FGT(SYS_PMIAR_EL1,                HDFGRTR2, nPMIAR_EL1, 0),
        SR_FGT(SYS_PMICFILTR_EL0,        HDFGRTR2, nPMICFILTR_EL0, 0),
        SR_FGT(SYS_PMICNTR_EL0,                HDFGRTR2, nPMICNTR_EL0, 0),
        SR_FGT(SYS_PMSSCR_EL1,                HDFGRTR2, nPMSSCR_EL1, 0),
        SR_FGT(SYS_PMUACR_EL1,                HDFGRTR2, nPMUACR_EL1, 0),
        SR_FGT(SYS_SPMACCESSR_EL1,        HDFGRTR2, nSPMACCESSR_EL1, 0),
        SR_FGT(SYS_SPMCFGR_EL1,                HDFGRTR2, nSPMID, 0),
        SR_FGT(SYS_SPMDEVARCH_EL1,        HDFGRTR2, nSPMID, 0),
        SR_FGT(SYS_SPMCGCRn_EL1(0),        HDFGRTR2, nSPMID, 0),
        SR_FGT(SYS_SPMCGCRn_EL1(1),        HDFGRTR2, nSPMID, 0),
        SR_FGT(SYS_SPMIIDR_EL1,                HDFGRTR2, nSPMID, 0),
        SR_FGT(SYS_SPMCNTENCLR_EL0,        HDFGRTR2, nSPMCNTEN, 0),
        SR_FGT(SYS_SPMCNTENSET_EL0,        HDFGRTR2, nSPMCNTEN, 0),
        SR_FGT(SYS_SPMCR_EL0,                HDFGRTR2, nSPMCR_EL0, 0),
        SR_FGT(SYS_SPMDEVAFF_EL1,        HDFGRTR2, nSPMDEVAFF_EL1, 0),
        /*
         * We have up to 64 of these registers in ranges of 16, banked via
         * SPMSELR_EL0.BANK. We're only concerned with the accessors here,
         * not the architectural registers.
         */
        SR_FGT_RANGE(SYS_SPMEVCNTRn_EL0(0),
                     SYS_SPMEVCNTRn_EL0(15),
                     HDFGRTR2, nSPMEVCNTRn_EL0, 0),
        SR_FGT_RANGE(SYS_SPMEVFILT2Rn_EL0(0),
                     SYS_SPMEVFILT2Rn_EL0(15),
                     HDFGRTR2, nSPMEVTYPERn_EL0, 0),
        SR_FGT_RANGE(SYS_SPMEVFILTRn_EL0(0),
                     SYS_SPMEVFILTRn_EL0(15),
                     HDFGRTR2, nSPMEVTYPERn_EL0, 0),
        SR_FGT_RANGE(SYS_SPMEVTYPERn_EL0(0),
                     SYS_SPMEVTYPERn_EL0(15),
                     HDFGRTR2, nSPMEVTYPERn_EL0, 0),
        SR_FGT(SYS_SPMINTENCLR_EL1,        HDFGRTR2, nSPMINTEN, 0),
        SR_FGT(SYS_SPMINTENSET_EL1,        HDFGRTR2, nSPMINTEN, 0),
        SR_FGT(SYS_SPMOVSCLR_EL0,        HDFGRTR2, nSPMOVS, 0),
        SR_FGT(SYS_SPMOVSSET_EL0,        HDFGRTR2, nSPMOVS, 0),
        SR_FGT(SYS_SPMSCR_EL1,                HDFGRTR2, nSPMSCR_EL1, 0),
        SR_FGT(SYS_SPMSELR_EL0,                HDFGRTR2, nSPMSELR_EL0, 0),
        SR_FGT(SYS_TRCITECR_EL1,        HDFGRTR2, nTRCITECR_EL1, 0),
        SR_FGT(SYS_PMBMAR_EL1,                HDFGRTR2, nPMBMAR_EL1, 0),
        SR_FGT(SYS_PMSDSFR_EL1,                HDFGRTR2, nPMSDSFR_EL1, 0),
        SR_FGT(SYS_TRBMPAM_EL1,                HDFGRTR2, nTRBMPAM_EL1, 0),

        /*
         * HDFGWTR_EL2
         *
         * Although HDFGRTR_EL2 and HDFGWTR_EL2 registers largely
         * overlap in their bit assignment, there are a number of bits
         * that are RES0 on one side, and an actual trap bit on the
         * other.  The policy chosen here is to describe all the
         * read-side mappings, and only the write-side mappings that
         * differ from the read side, and the trap handler will pick
         * the correct shadow register based on the access type.
         *
         * Same model applies to the FEAT_FGT2 registers.
         */
        SR_FGT(SYS_TRFCR_EL1,                HDFGWTR, TRFCR_EL1, 1),
        SR_FGT(SYS_TRCOSLAR,                HDFGWTR, TRCOSLAR, 1),
        SR_FGT(SYS_PMCR_EL0,                HDFGWTR, PMCR_EL0, 1),
        SR_FGT(SYS_PMSWINC_EL0,                HDFGWTR, PMSWINC_EL0, 1),
        SR_FGT(SYS_OSLAR_EL1,                HDFGWTR, OSLAR_EL1, 1),

        /* HDFGWTR2_EL2 */
        SR_FGT(SYS_PMZR_EL0,                HDFGWTR2, nPMZR_EL0, 0),
        SR_FGT(SYS_SPMZR_EL0,                HDFGWTR2, nSPMEVCNTRn_EL0, 0),

        /*
         * HAFGRTR_EL2
         */
        SR_FGT(SYS_AMEVTYPER1_EL0(15),        HAFGRTR, AMEVTYPER115_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(14),        HAFGRTR, AMEVTYPER114_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(13),        HAFGRTR, AMEVTYPER113_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(12),        HAFGRTR, AMEVTYPER112_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(11),        HAFGRTR, AMEVTYPER111_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(10),        HAFGRTR, AMEVTYPER110_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(9),        HAFGRTR, AMEVTYPER19_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(8),        HAFGRTR, AMEVTYPER18_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(7),        HAFGRTR, AMEVTYPER17_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(6),        HAFGRTR, AMEVTYPER16_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(5),        HAFGRTR, AMEVTYPER15_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(4),        HAFGRTR, AMEVTYPER14_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(3),        HAFGRTR, AMEVTYPER13_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(2),        HAFGRTR, AMEVTYPER12_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(1),        HAFGRTR, AMEVTYPER11_EL0, 1),
        SR_FGT(SYS_AMEVTYPER1_EL0(0),        HAFGRTR, AMEVTYPER10_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(15),        HAFGRTR, AMEVCNTR115_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(14),        HAFGRTR, AMEVCNTR114_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(13),        HAFGRTR, AMEVCNTR113_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(12),        HAFGRTR, AMEVCNTR112_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(11),        HAFGRTR, AMEVCNTR111_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(10),        HAFGRTR, AMEVCNTR110_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(9),        HAFGRTR, AMEVCNTR19_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(8),        HAFGRTR, AMEVCNTR18_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(7),        HAFGRTR, AMEVCNTR17_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(6),        HAFGRTR, AMEVCNTR16_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(5),        HAFGRTR, AMEVCNTR15_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(4),        HAFGRTR, AMEVCNTR14_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(3),        HAFGRTR, AMEVCNTR13_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(2),        HAFGRTR, AMEVCNTR12_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(1),        HAFGRTR, AMEVCNTR11_EL0, 1),
        SR_FGT(SYS_AMEVCNTR1_EL0(0),        HAFGRTR, AMEVCNTR10_EL0, 1),
        SR_FGT(SYS_AMCNTENCLR1_EL0,        HAFGRTR, AMCNTEN1, 1),
        SR_FGT(SYS_AMCNTENSET1_EL0,        HAFGRTR, AMCNTEN1, 1),
        SR_FGT(SYS_AMCNTENCLR0_EL0,        HAFGRTR, AMCNTEN0, 1),
        SR_FGT(SYS_AMCNTENSET0_EL0,        HAFGRTR, AMCNTEN0, 1),
        SR_FGT(SYS_AMEVCNTR0_EL0(3),        HAFGRTR, AMEVCNTR03_EL0, 1),
        SR_FGT(SYS_AMEVCNTR0_EL0(2),        HAFGRTR, AMEVCNTR02_EL0, 1),
        SR_FGT(SYS_AMEVCNTR0_EL0(1),        HAFGRTR, AMEVCNTR01_EL0, 1),
        SR_FGT(SYS_AMEVCNTR0_EL0(0),        HAFGRTR, AMEVCNTR00_EL0, 1),
};

/*
 * Additional FGTs that do not fire with ESR_EL2.EC==0x18. This table
 * isn't used for exception routing, but only as a promise that the
 * trap is handled somewhere else.
 *
 * Each entry is built from a trap register group, a bit name and a
 * polarity value (the FGT() macro itself is defined elsewhere).
 * populate_nv_trap_config() hands every entry to aggregate_fgt(), so
 * these bits are accounted for in the per-register masks even though
 * no system register encoding is attached to them.
 */
static const union trap_config non_0x18_fgt[] __initconst = {
        FGT(HFGITR, PSBCSYNC, 1),
        FGT(HFGITR, nGCSSTR_EL1, 0),
        FGT(HFGITR, SVC_EL1, 1),
        FGT(HFGITR, SVC_EL0, 1),
        FGT(HFGITR, ERET, 1),
        FGT(HFGITR2, TSBCSYNC, 1),
};

/*
 * Fetch the trap configuration stored in sr_forward_xa for the system
 * register encoding @sysreg. The xarray entry is converted back to an
 * integer with xa_to_value() and returned as the raw .val of the
 * trap_config union.
 */
static union trap_config get_trap_config(u32 sysreg)
{
        union trap_config tc;

        tc.val = xa_to_value(xa_load(&sr_forward_xa, sysreg));

        return tc;
}

static __init void print_nv_trap_error(const struct encoding_to_trap_config *tc,
                                       const char *type, int err)
{
        kvm_err("%s line %d encoding range "
                "(%d, %d, %d, %d, %d) - (%d, %d, %d, %d, %d) (err=%d)\n",
                type, tc->line,
                sys_reg_Op0(tc->encoding), sys_reg_Op1(tc->encoding),
                sys_reg_CRn(tc->encoding), sys_reg_CRm(tc->encoding),
                sys_reg_Op2(tc->encoding),
                sys_reg_Op0(tc->end), sys_reg_Op1(tc->end),
                sys_reg_CRn(tc->end), sys_reg_CRm(tc->end),
                sys_reg_Op2(tc->end),
                err);
}

/*
 * Return the encoding that follows @encoding when the
 * (Op0, Op1, CRn, CRm, Op2) tuple is treated as a counter with Op2 as
 * the least significant field: the lowest field that is still below
 * its mask value is incremented and every field below it restarts at
 * zero. When all of Op1, CRn, CRm and Op2 are at their mask value,
 * Op0 is incremented without any further check.
 */
static u32 encoding_next(u32 encoding)
{
        const u8 op0 = sys_reg_Op0(encoding);
        const u8 op1 = sys_reg_Op1(encoding);
        const u8 crn = sys_reg_CRn(encoding);
        const u8 crm = sys_reg_CRm(encoding);
        const u8 op2 = sys_reg_Op2(encoding);
        u32 next;

        if (op2 < Op2_mask)
                next = sys_reg(op0, op1, crn, crm, op2 + 1);
        else if (crm < CRm_mask)
                next = sys_reg(op0, op1, crn, crm + 1, 0);
        else if (crn < CRn_mask)
                next = sys_reg(op0, op1, crn + 1, 0, 0);
        else if (op1 < Op1_mask)
                next = sys_reg(op0, op1 + 1, 0, 0, 0);
        else
                next = sys_reg(op0 + 1, 0, 0, 0, 0);

        return next;
}

/*
 * Define a struct fgt_masks object named @__n. Its .str member is the
 * text of the @__m argument exactly as written at the use site, and
 * its .res0 member starts out as the value of @__m. Members that are
 * not named in the initialiser get C's default zero-initialisation.
 */
#define FGT_MASKS(__n, __m)                                                \
        struct fgt_masks __n = { .str = #__m, .res0 = __m, }

/*
 * One descriptor per fine-grained trap register. aggregate_fgt() fills
 * in their .mask/.nmask members and check_fgt_masks() later recomputes
 * .res0 from the result.
 */
FGT_MASKS(hfgrtr_masks, HFGRTR_EL2_RES0);
FGT_MASKS(hfgwtr_masks, HFGWTR_EL2_RES0);
FGT_MASKS(hfgitr_masks, HFGITR_EL2_RES0);
FGT_MASKS(hdfgrtr_masks, HDFGRTR_EL2_RES0);
FGT_MASKS(hdfgwtr_masks, HDFGWTR_EL2_RES0);
FGT_MASKS(hafgrtr_masks, HAFGRTR_EL2_RES0);
FGT_MASKS(hfgrtr2_masks, HFGRTR2_EL2_RES0);
FGT_MASKS(hfgwtr2_masks, HFGWTR2_EL2_RES0);
FGT_MASKS(hfgitr2_masks, HFGITR2_EL2_RES0);
FGT_MASKS(hdfgrtr2_masks, HDFGRTR2_EL2_RES0);
FGT_MASKS(hdfgwtr2_masks, HDFGWTR2_EL2_RES0);

/*
 * Account for one fine-grained trap bit in the mask descriptors of the
 * register group it belongs to.
 *
 * tc.fgt selects a read-side descriptor and, for the groups that have
 * one, a write-side descriptor. On each side where the bit is not
 * RES0, it is added to .mask when tc.pol is set and to .nmask
 * otherwise.
 *
 * Returns false when tc.fgt is not one of the groups handled here, or
 * when the bit is RES0 on every side that exists for the group.
 * Returns true otherwise.
 */
static __init bool aggregate_fgt(union trap_config tc)
{
        struct fgt_masks *rmasks, *wmasks;

        switch (tc.fgt) {
        case HFGRTR_GROUP:
                rmasks = &hfgrtr_masks;
                wmasks = &hfgwtr_masks;
                break;
        case HDFGRTR_GROUP:
                rmasks = &hdfgrtr_masks;
                wmasks = &hdfgwtr_masks;
                break;
        case HAFGRTR_GROUP:
                rmasks = &hafgrtr_masks;
                wmasks = NULL;
                break;
        case HFGITR_GROUP:
                rmasks = &hfgitr_masks;
                wmasks = NULL;
                break;
        case HFGRTR2_GROUP:
                rmasks = &hfgrtr2_masks;
                wmasks = &hfgwtr2_masks;
                break;
        case HDFGRTR2_GROUP:
                rmasks = &hdfgrtr2_masks;
                wmasks = &hdfgwtr2_masks;
                break;
        case HFGITR2_GROUP:
                rmasks = &hfgitr2_masks;
                wmasks = NULL;
                break;
        default:
                /*
                 * No descriptor exists for this group. Bail out rather
                 * than dereferencing uninitialised pointers below; the
                 * callers report a false return as an error.
                 */
                return false;
        }

        /*
         * A bit can be reserved in either the R or W register, but
         * not both.
         */
        if ((BIT(tc.bit) & rmasks->res0) &&
            (!wmasks || (BIT(tc.bit) & wmasks->res0)))
                return false;

        /* Record the bit on the read side unless it is RES0 there. */
        if (tc.pol)
                rmasks->mask |= BIT(tc.bit) & ~rmasks->res0;
        else
                rmasks->nmask |= BIT(tc.bit) & ~rmasks->res0;

        /* Same for the write side, when the group has one. */
        if (wmasks) {
                if (tc.pol)
                        wmasks->mask |= BIT(tc.bit) & ~wmasks->res0;
                else
                        wmasks->nmask |= BIT(tc.bit) & ~wmasks->res0;
        }

        return true;
}

static __init int check_fgt_masks(struct fgt_masks *masks)
{
        unsigned long duplicate = masks->mask & masks->nmask;
        u64 res0 = masks->res0;
        int ret = 0;

        if (duplicate) {
                int i;

                for_each_set_bit(i, &duplicate, 64) {
                        kvm_err("%s[%d] bit has both polarities\n",
                                masks->str, i);
                }

                ret = -EINVAL;
        }

        masks->res0 = ~(masks->mask | masks->nmask);
        if (masks->res0 != res0)
                kvm_info("Implicit %s = %016llx, expecting %016llx\n",
                         masks->str, masks->res0, res0);

        return ret;
}

static __init int check_all_fgt_masks(int ret)
{
        static struct fgt_masks * const masks[] __initconst = {
                &hfgrtr_masks,
                &hfgwtr_masks,
                &hfgitr_masks,
                &hdfgrtr_masks,
                &hdfgwtr_masks,
                &hafgrtr_masks,
                &hfgrtr2_masks,
                &hfgwtr2_masks,
                &hfgitr2_masks,
                &hdfgrtr2_masks,
                &hdfgwtr2_masks,
        };
        int err = 0;

        for (int i = 0; i < ARRAY_SIZE(masks); i++)
                err |= check_fgt_masks(masks[i]);

        return ret ?: err;
}

/*
 * Iterate @__x, a u32 declared by the macro itself, over the encodings
 * from @__s up to and including @__e, advancing with encoding_next().
 * @__e is evaluated again on every iteration.
 */
#define for_each_encoding_in(__x, __s, __e)        \
        for (u32 __x = __s; __x <= __e; __x = encoding_next(__x))

/*
 * Populate sr_forward_xa, the xarray mapping system register encodings
 * to their trap configuration, from the static trap tables, and sanity
 * check those tables while doing so.
 *
 * - Every encoding covered by encoding_to_cgt[] gets that entry's
 *   configuration; an encoding that is already present is reported as
 *   a duplicate.
 * - If the CPUs have ARM64_HAS_FGT, the fine-grained information from
 *   encoding_to_fgt[] is merged into whatever is already stored for
 *   each encoding, and each trap bit (plus those in non_0x18_fgt[]) is
 *   accumulated into the per-register masks, which are then checked.
 * - Finally, combinations of coarse controls are checked not to
 *   reference other combinations.
 *
 * Problems are logged as they are found and processing carries on so
 * that all of them get reported. Returns 0 on success, or a negative
 * error code, in which case sr_forward_xa is destroyed.
 */
int __init populate_nv_trap_config(void)
{
        int ret = 0;

        BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *));
        BUILD_BUG_ON(__NR_CGT_GROUP_IDS__ > BIT(TC_CGT_BITS));
        BUILD_BUG_ON(__NR_FGT_GROUP_IDS__ > BIT(TC_FGT_BITS));
        BUILD_BUG_ON(__NR_FG_FILTER_IDS__ > BIT(TC_FGF_BITS));
        BUILD_BUG_ON(__HCRX_EL2_MASK & __HCRX_EL2_nMASK);

        for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) {
                const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i];
                void *prev;

                if (cgt->tc.val & BIT(63)) {
                        kvm_err("CGT[%d] has MBZ bit set\n", i);
                        ret = -EINVAL;
                }

                for_each_encoding_in(enc, cgt->encoding, cgt->end) {
                        prev = xa_store(&sr_forward_xa, enc,
                                        xa_mk_value(cgt->tc.val), GFP_KERNEL);
                        /* A previous, non-error entry means a duplicate. */
                        if (prev && !xa_is_err(prev)) {
                                ret = -EINVAL;
                                print_nv_trap_error(cgt, "Duplicate CGT", ret);
                        }

                        if (xa_is_err(prev)) {
                                ret = xa_err(prev);
                                print_nv_trap_error(cgt, "Failed CGT insertion", ret);
                        }
                }
        }

        /* The values compared here are the HCRX_EL2 RES0 masks. */
        if (__HCRX_EL2_RES0 != HCRX_EL2_RES0)
                kvm_info("Sanitised HCRX_EL2_RES0 = %016llx, expecting %016llx\n",
                         __HCRX_EL2_RES0, HCRX_EL2_RES0);

        kvm_info("nv: %ld coarse grained trap handlers\n",
                 ARRAY_SIZE(encoding_to_cgt));

        if (!cpus_have_final_cap(ARM64_HAS_FGT))
                goto check_mcb;

        for (int i = 0; i < ARRAY_SIZE(encoding_to_fgt); i++) {
                const struct encoding_to_trap_config *fgt = &encoding_to_fgt[i];
                union trap_config tc;
                void *prev;

                if (fgt->tc.fgt >= __NR_FGT_GROUP_IDS__) {
                        ret = -EINVAL;
                        print_nv_trap_error(fgt, "Invalid FGT", ret);
                        /*
                         * aggregate_fgt() has no mask descriptor for an
                         * unknown group, so don't process this entry
                         * any further.
                         */
                        continue;
                }

                for_each_encoding_in(enc, fgt->encoding, fgt->end) {
                        /* Start from what the CGT pass stored, if anything. */
                        tc = get_trap_config(enc);

                        if (tc.fgt) {
                                ret = -EINVAL;
                                print_nv_trap_error(fgt, "Duplicate FGT", ret);
                        }

                        tc.val |= fgt->tc.val;
                        prev = xa_store(&sr_forward_xa, enc,
                                        xa_mk_value(tc.val), GFP_KERNEL);

                        if (xa_is_err(prev)) {
                                ret = xa_err(prev);
                                print_nv_trap_error(fgt, "Failed FGT insertion", ret);
                        }

                        if (!aggregate_fgt(tc)) {
                                ret = -EINVAL;
                                print_nv_trap_error(fgt, "FGT bit is reserved", ret);
                        }
                }
        }

        for (int i = 0; i < ARRAY_SIZE(non_0x18_fgt); i++) {
                if (!aggregate_fgt(non_0x18_fgt[i])) {
                        ret = -EINVAL;
                        kvm_err("non_0x18_fgt[%d] is reserved\n", i);
                }
        }

        ret = check_all_fgt_masks(ret);

        kvm_info("nv: %ld fine grained trap handlers\n",
                 ARRAY_SIZE(encoding_to_fgt));

check_mcb:
        for (int id = __MULTIPLE_CONTROL_BITS__; id < __COMPLEX_CONDITIONS__; id++) {
                const enum cgt_group_id *cgids;

                cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];

                /* A combination must not include another combination. */
                for (int i = 0; cgids[i] != __RESERVED__; i++) {
                        if (cgids[i] >= __MULTIPLE_CONTROL_BITS__ &&
                            cgids[i] < __COMPLEX_CONDITIONS__) {
                                kvm_err("Recursive MCB %d/%d\n", id, cgids[i]);
                                ret = -EINVAL;
                        }
                }
        }

        if (ret)
                xa_destroy(&sr_forward_xa);

        return ret;
}

/*
 * Record the index of a system register descriptor in the trap
 * configuration stored for its encoding in sr_forward_xa.
 *
 * @sr:  descriptor supplying the name and the Op0/Op1/CRn/CRm/Op2 fields
 * @idx: index to record for that descriptor
 *
 * The index is stored as (idx + 1) so that a zero .sri field means
 * that nothing has been recorded for the encoding.
 *
 * Returns 0 on success, -EINVAL if @idx does not fit in the .sri field
 * or if an index was already recorded for this encoding, or the error
 * reported by the xarray store.
 */
int __init populate_sysreg_config(const struct sys_reg_desc *sr,
                                  unsigned int idx)
{
        union trap_config tc;
        u32 encoding;
        void *ret;

        /*
         * 0 is a valid value for the index, but not for the storage.
         * We'll store (idx+1), so check against an offset'd limit.
         */
        if (idx >= (BIT(TC_SRI_BITS) - 1)) {
                kvm_err("sysreg %s (%d) out of range\n", sr->name, idx);
                return -EINVAL;
        }

        encoding = sys_reg(sr->Op0, sr->Op1, sr->CRn, sr->CRm, sr->Op2);
        tc = get_trap_config(encoding);

        if (tc.sri) {
                /*
                 * Report the index being registered, then the index of
                 * the entry already present (stored offset by one).
                 */
                kvm_err("sysreg %s (%d) duplicate entry (%d)\n",
                        sr->name, idx, (int)tc.sri - 1);
                return -EINVAL;
        }

        tc.sri = idx + 1;
        ret = xa_store(&sr_forward_xa, encoding,
                       xa_mk_value(tc.val), GFP_KERNEL);

        return xa_err(ret);
}

/*
 * Evaluate one coarse trap control for @vcpu.
 *
 * The vcpu's copy of the register designated by @tb->index is read;
 * when its bits under @tb->mask equal @tb->value, @tb->behaviour is
 * added on top of BEHAVE_HANDLE_LOCALLY. Otherwise plain
 * BEHAVE_HANDLE_LOCALLY is returned.
 */
static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu,
                                         const struct trap_bits *tb)
{
        u64 reg = __vcpu_sys_reg(vcpu, tb->index);

        if ((reg & tb->mask) != tb->value)
                return BEHAVE_HANDLE_LOCALLY;

        return BEHAVE_HANDLE_LOCALLY | tb->behaviour;
}

static enum trap_behaviour __compute_trap_behaviour(struct kvm_vcpu *vcpu,
                                                    const enum cgt_group_id id,
                                                    enum trap_behaviour b)
{
        switch (id) {
                const enum cgt_group_id *cgids;

        case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1:
                if (likely(id != __RESERVED__))
                        b |= get_behaviour(vcpu, &coarse_trap_bits[id]);
                break;
        case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1:
                /* Yes, this is recursive. Don't do anything stupid. */
                cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__];
                for (int i = 0; cgids[i] != __RESERVED__; i++)
                        b |= __compute_trap_behaviour(vcpu, cgids[i], b);
                break;
        default:
                if (ARRAY_SIZE(ccc))
                        b |= ccc[id -  __COMPLEX_CONDITIONS__](vcpu);
                break;
        }

        return b;
}

/*
 * Compute the trap behaviour for the coarse-grained trap group recorded
 * in @tc, starting from "handle locally".
 */
static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu,
                                                  const union trap_config tc)
{
        return __compute_trap_behaviour(vcpu, tc.cgt, BEHAVE_HANDLE_LOCALLY);
}

/*
 * Return the RES0 mask recorded for @sr in the VM's sysreg mask table.
 * Registers located before __VNCR_START__ have no entry and yield 0.
 */
static u64 kvm_get_sysreg_res0(struct kvm *kvm, enum vcpu_sysreg sr)
{
        /* Only the VNCR-backed registers are described for now. */
        if (sr >= __VNCR_START__)
                return kvm->arch.sysreg_masks->mask[sr - __VNCR_START__].res0;

        return 0;
}

/*
 * Tell whether the fine-grained trap bit described by @tc, evaluated in
 * the vcpu's copy of FGT register @sr, requests a trap.
 *
 * Always false when the vcpu is in hyp context.
 */
static bool check_fgt_bit(struct kvm_vcpu *vcpu, enum vcpu_sysreg sr,
                          const union trap_config tc)
{
        u64 fgt;

        /*
         * KVM has no knowledge of any FGT applying to the host, and
         * hopefully never will.
         */
        if (is_hyp_ctxt(vcpu))
                return false;

        fgt = __vcpu_sys_reg(vcpu, sr);

        /* Positive polarity: the bit being set means "trap". */
        if (tc.pol)
                return (fgt & BIT(tc.bit));

        /*
         * Negative polarity: a clear bit means "trap", but only if the
         * bit really exists. A bit earmarked as RES0 denotes a feature
         * that isn't implemented, in which case there is nothing to trap.
         */
        return !(fgt & BIT(tc.bit)) &&
               !(kvm_get_sysreg_res0(vcpu->kvm, sr) & BIT(tc.bit));
}

/*
 * Decide what to do with a trapped system register access, based on the
 * trap configuration looked up from the sysreg encoding found in the ESR.
 *
 * Returns true when the trap has been fully dealt with here: either an
 * UNDEF was injected, or the exception was forwarded with
 * kvm_inject_nested_sync().
 *
 * Returns false when the access is left to the caller, in which case
 * *sr_index is set to the sysreg descriptor index recorded for this
 * encoding.
 */
bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index)
{
        enum vcpu_sysreg fgtreg;
        union trap_config tc;
        enum trap_behaviour b;
        bool is_read;
        u32 sysreg;
        u64 esr;

        esr = kvm_vcpu_get_esr(vcpu);
        sysreg = esr_sys64_to_sysreg(esr);
        is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;

        tc = get_trap_config(sysreg);

        /*
         * A value of 0 for the whole entry means that we know nothing
         * for this sysreg, and that it cannot be re-injected into the
         * nested hypervisor. In this situation, let's cut it short.
         */
        if (!tc.val)
                goto local;

        /*
         * If a sysreg can be trapped using a FGT, first check whether we
         * trap for the purpose of forbidding the feature. In that case,
         * inject an UNDEF.
         */
        if (tc.fgt != __NO_FGT_GROUP__ &&
            (vcpu->kvm->arch.fgu[tc.fgt] & BIT(tc.bit))) {
                kvm_inject_undefined(vcpu);
                return true;
        }

        /*
         * If we're not nesting, immediately return to the caller, with the
         * sysreg index, should we have it.
         */
        if (!vcpu_has_nv(vcpu))
                goto local;

        /*
         * There are a few traps that take effect InHost, but are constrained
         * to EL0. Don't bother with computing the trap behaviour if the vCPU
         * isn't in EL0.
         */
        if (is_hyp_ctxt(vcpu) && !vcpu_is_host_el0(vcpu))
                goto local;

        /*
         * Pick the FGT register to inspect for this group. Groups with
         * separate read and write registers select on the access direction.
         * fgtreg is left unset for __NO_FGT_GROUP__, and is only consumed
         * below when tc.fgt still names a group.
         */
        switch ((enum fgt_group_id)tc.fgt) {
        case __NO_FGT_GROUP__:
                break;

        case HFGRTR_GROUP:
                fgtreg = is_read ? HFGRTR_EL2 : HFGWTR_EL2;
                break;

        case HDFGRTR_GROUP:
                fgtreg = is_read ? HDFGRTR_EL2 : HDFGWTR_EL2;
                break;

        case HAFGRTR_GROUP:
                fgtreg = HAFGRTR_EL2;
                break;

        case HFGITR_GROUP:
                fgtreg = HFGITR_EL2;
                switch (tc.fgf) {
                        u64 tmp;

                case __NO_FGF__:
                        break;

                case HCRX_FGTnXS:
                        /*
                         * With HCRX_EL2.FGTnXS set, drop the FGT group so
                         * that the fine-grained check below is skipped.
                         */
                        tmp = __vcpu_sys_reg(vcpu, HCRX_EL2);
                        if (tmp & HCRX_EL2_FGTnXS)
                                tc.fgt = __NO_FGT_GROUP__;
                }
                break;

        case HFGRTR2_GROUP:
                fgtreg = is_read ? HFGRTR2_EL2 : HFGWTR2_EL2;
                break;

        case HDFGRTR2_GROUP:
                fgtreg = is_read ? HDFGRTR2_EL2 : HDFGWTR2_EL2;
                break;

        case HFGITR2_GROUP:
                fgtreg = HFGITR2_EL2;
                break;

        default:
                /* Something is really wrong, bail out */
                WARN_ONCE(1, "Bad FGT group (encoding %08x, config %016llx)\n",
                          sysreg, tc.val);
                goto local;
        }

        /* A fine-grained trap request wins: forward straight away. */
        if (tc.fgt != __NO_FGT_GROUP__ && check_fgt_bit(vcpu, fgtreg, tc))
                goto inject;

        /* Otherwise, fall back to the coarse-grained trap controls. */
        b = compute_trap_behaviour(vcpu, tc);

        /* In host EL0, only forward when the behaviour explicitly allows it. */
        if (!(b & BEHAVE_FORWARD_IN_HOST_EL0) && vcpu_is_host_el0(vcpu))
                goto local;

        if (((b & BEHAVE_FORWARD_READ) && is_read) ||
            ((b & BEHAVE_FORWARD_WRITE) && !is_read))
                goto inject;

local:
        /*
         * The access is not forwarded. Without a recorded descriptor
         * index there is nothing to hand back to the caller, so the
         * access gets an UNDEF.
         */
        if (!tc.sri) {
                struct sys_reg_params params;

                params = esr_sys64_to_params(esr);

                /*
                 * Check for the IMPDEF range, as per DDI0487 J.a,
                 * D18.3.2 Reserved encodings for IMPLEMENTATION
                 * DEFINED registers.
                 */
                if (!(params.Op0 == 3 && (params.CRn & 0b1011) == 0b1011))
                        print_sys_reg_msg(&params,
                                          "Unsupported guest access at: %lx\n",
                                          *vcpu_pc(vcpu));
                kvm_inject_undefined(vcpu);
                return true;
        }

        /* The stored index is biased by one; hand back the real index. */
        *sr_index = tc.sri - 1;
        return false;

inject:
        trace_kvm_forward_sysreg_trap(vcpu, sysreg, is_read);

        kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
        return true;
}

/*
 * Forward the current exception to the guest hypervisor when the vcpu has
 * nested virtualisation, is not in hyp context, and @control_bit is set in
 * its copy of register @reg.
 *
 * Returns true if the exception was forwarded, false otherwise.
 */
static bool __forward_traps(struct kvm_vcpu *vcpu, unsigned int reg, u64 control_bit)
{
        u64 ctrl;

        if (!vcpu_has_nv(vcpu))
                return false;

        ctrl = __vcpu_sys_reg(vcpu, reg);

        /* Nothing to forward from hyp context, or with the control clear. */
        if (is_hyp_ctxt(vcpu) || !(ctrl & control_bit))
                return false;

        kvm_inject_nested_sync(vcpu, kvm_vcpu_get_esr(vcpu));
        return true;
}

/*
 * Forward the current exception if @control_bit is set in the vcpu's copy
 * of HCR_EL2; the remaining conditions are those of __forward_traps().
 * Returns true if the exception was forwarded.
 */
static bool forward_hcr_traps(struct kvm_vcpu *vcpu, u64 control_bit)
{
        return __forward_traps(vcpu, HCR_EL2, control_bit);
}

/*
 * Forward the current exception when HCR_TSC is set in the vcpu's copy of
 * HCR_EL2. Returns true if the exception was forwarded.
 * NOTE(review): presumably called when handling a trapped SMC - confirm
 * against the caller.
 */
bool forward_smc_trap(struct kvm_vcpu *vcpu)
{
        return forward_hcr_traps(vcpu, HCR_TSC);
}

/*
 * Forward the current exception if @control_bit is set in the vcpu's copy
 * of MDCR_EL2; the remaining conditions are those of __forward_traps().
 * Returns true if the exception was forwarded.
 */
static bool forward_mdcr_traps(struct kvm_vcpu *vcpu, u64 control_bit)
{
        return __forward_traps(vcpu, MDCR_EL2, control_bit);
}

/*
 * Forward the current exception when MDCR_EL2_TDE is set in the vcpu's
 * copy of MDCR_EL2. Returns true if the exception was forwarded.
 * NOTE(review): presumably called when handling a debug exception -
 * confirm against the caller.
 */
bool forward_debug_exception(struct kvm_vcpu *vcpu)
{
        return forward_mdcr_traps(vcpu, MDCR_EL2_TDE);
}

/*
 * Validate the SPSR value an exception return is about to install.
 *
 * A legal @spsr is returned unchanged. An illegal one is replaced by a
 * value built from the current PSTATE (EL, SP, masks and flags preserved)
 * with PSR_IL_BIT set.
 */
static u64 kvm_check_illegal_exception_return(struct kvm_vcpu *vcpu, u64 spsr)
{
        const u64 target = spsr & PSR_MODE_MASK;

        /*
         * An exception return from EL2 is illegal when it targets:
         * - EL3
         * - an illegal M value
         * - a 32bit EL
         * - EL1 while HCR_EL2.TGE is set
         * Anything else goes through untouched.
         */
        if (target != PSR_MODE_EL3t && target != PSR_MODE_EL3h &&
            target != 0b00001 && !(target & BIT(1)) &&
            !(spsr & PSR_MODE32_BIT) &&
            !(vcpu_el2_tge_is_set(vcpu) && (target == PSR_MODE_EL1t ||
                                            target == PSR_MODE_EL1h)))
                return spsr;

        /*
         * Illegal return: start from the existing PSTATE, keep only EL,
         * SP, masks and flags, and set IL so that the HW generates an
         * Illegal State Exception immediately after ERET.
         */
        spsr = *vcpu_cpsr(vcpu);
        spsr &= (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT |
                 PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT |
                 PSR_MODE_MASK | PSR_MODE32_BIT);

        return spsr | PSR_IL_BIT;
}

/*
 * Emulate an exception return for a nested guest: install SPSR_EL2
 * (sanitised by kvm_check_illegal_exception_return()) as the new PSTATE
 * and the return address as the new PC.
 *
 * For an ERETAx, the return address comes from kvm_auth_eretax(); for a
 * plain ERET it is read from ELR_EL2. A failed authentication may instead
 * result in an FPAC exception being injected, in which case no return
 * takes place.
 */
void kvm_emulate_nested_eret(struct kvm_vcpu *vcpu)
{
        u64 spsr, elr, esr;

        spsr = vcpu_read_sys_reg(vcpu, SPSR_EL2);
        spsr = kvm_check_illegal_exception_return(vcpu, spsr);

        /* Check for an ERETAx */
        esr = kvm_vcpu_get_esr(vcpu);
        if (esr_iss_is_eretax(esr) && !kvm_auth_eretax(vcpu, &elr)) {
                /*
                 * Oh no, ERETAx failed to authenticate.
                 *
                 * If we have FPACCOMBINE and we don't have a pending
                 * Illegal Execution State exception (which has priority
                 * over FPAC), deliver an exception right away.
                 *
                 * Otherwise, let the mangled ELR value trickle down the
                 * ERET handling, and the guest will have a little surprise.
                 */
                if (kvm_has_pauth(vcpu->kvm, FPACCOMBINE) && !(spsr & PSR_IL_BIT)) {
                        /* Keep only the ERETA ISS bits and report an FPAC. */
                        esr &= ESR_ELx_ERET_ISS_ERETA;
                        esr |= FIELD_PREP(ESR_ELx_EC_MASK, ESR_ELx_EC_FPAC);
                        kvm_inject_nested_sync(vcpu, esr);
                        return;
                }
        }

        /*
         * The PC/PSTATE update is bracketed by a vcpu put/load pair with
         * preemption disabled; IN_NESTED_ERET is set for the duration.
         */
        preempt_disable();
        vcpu_set_flag(vcpu, IN_NESTED_ERET);
        kvm_arch_vcpu_put(vcpu);

        /* For ERETAx, elr was already provided by kvm_auth_eretax(). */
        if (!esr_iss_is_eretax(esr))
                elr = __vcpu_sys_reg(vcpu, ELR_EL2);

        trace_kvm_nested_eret(vcpu, elr, spsr);

        *vcpu_pc(vcpu) = elr;
        *vcpu_cpsr(vcpu) = spsr;

        kvm_arch_vcpu_load(vcpu, smp_processor_id());
        vcpu_clear_flag(vcpu, IN_NESTED_ERET);
        preempt_enable();

        if (kvm_vcpu_has_pmu(vcpu))
                kvm_pmu_nested_transition(vcpu);
}

/*
 * Pend an exception of the given @type for the vcpu. A synchronous
 * exception additionally gets @esr_el2 written to ESR_EL2; any type other
 * than sync or irq only triggers a one-time warning.
 */
static void kvm_inject_el2_exception(struct kvm_vcpu *vcpu, u64 esr_el2,
                                     enum exception_type type)
{
        trace_kvm_inject_nested_exception(vcpu, esr_el2, type);

        if (type == except_type_sync) {
                kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
                vcpu_write_sys_reg(vcpu, esr_el2, ESR_EL2);
        } else if (type == except_type_irq) {
                kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_IRQ);
        } else {
                WARN_ONCE(1, "Unsupported EL2 exception injection %d\n", type);
        }
}

/*
 * Emulate taking an exception to EL2.
 * See ARM ARM J8.1.2 AArch64.TakeException()
 *
 * @esr_el2 is the syndrome handed to kvm_inject_el2_exception() and @type
 * selects the kind of exception to pend.
 *
 * Returns -EINVAL if the vcpu does not have nested virtualisation, and 1
 * otherwise.
 */
static int kvm_inject_nested(struct kvm_vcpu *vcpu, u64 esr_el2,
                             enum exception_type type)
{
        u64 pstate, mode;
        bool direct_inject;

        if (!vcpu_has_nv(vcpu)) {
                kvm_err("Unexpected call to %s for the non-nesting configuration\n",
                                __func__);
                return -EINVAL;
        }

        /*
         * As for ERET, we can avoid doing too much on the injection path by
         * checking that we either took the exception from a VHE host
         * userspace or from vEL2. In these cases, there is no change in
         * translation regime (or anything else), so let's do as little as
         * possible.
         */
        pstate = *vcpu_cpsr(vcpu);
        mode = pstate & (PSR_MODE_MASK | PSR_MODE32_BIT);

        /* EL0 with both E2H and TGE set, or already at EL2 (either SP). */
        direct_inject  = (mode == PSR_MODE_EL0t &&
                          vcpu_el2_e2h_is_set(vcpu) &&
                          vcpu_el2_tge_is_set(vcpu));
        direct_inject |= (mode == PSR_MODE_EL2h || mode == PSR_MODE_EL2t);

        if (direct_inject) {
                kvm_inject_el2_exception(vcpu, esr_el2, type);
                return 1;
        }

        preempt_disable();

        /*
         * We may have an exception or PC update in the EL0/EL1 context.
         * Commit it before entering EL2.
         */
        __kvm_adjust_pc(vcpu);

        kvm_arch_vcpu_put(vcpu);

        kvm_inject_el2_exception(vcpu, esr_el2, type);

        /*
         * A hard requirement is that a switch between EL1 and EL2
         * contexts has to happen between a put/load, so that we can
         * pick the correct timer and interrupt configuration, among
         * other things.
         *
         * Make sure the exception actually took place before we load
         * the new context.
         */
        __kvm_adjust_pc(vcpu);

        kvm_arch_vcpu_load(vcpu, smp_processor_id());
        preempt_enable();

        if (kvm_vcpu_has_pmu(vcpu))
                kvm_pmu_nested_transition(vcpu);

        return 1;
}

/*
 * Inject a synchronous exception with syndrome @esr_el2 through
 * kvm_inject_nested(), and return its result.
 */
int kvm_inject_nested_sync(struct kvm_vcpu *vcpu, u64 esr_el2)
{
        return kvm_inject_nested(vcpu, esr_el2, except_type_sync);
}

int kvm_inject_nested_irq(struct kvm_vcpu *vcpu)
{
        /*
         * Do not inject an irq if the:
         *  - Current exception level is EL2, and
         *  - virtual HCR_EL2.TGE == 0
         *  - virtual HCR_EL2.IMO == 0
         *
         * See Table D1-17 "Physical interrupt target and masking when EL3 is
         * not implemented and EL2 is implemented" in ARM DDI 0487C.a.
         */

        if (vcpu_is_el2(vcpu) && !vcpu_el2_tge_is_set(vcpu) &&
            !(__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_IMO))
                return 1;

        /* esr_el2 value doesn't matter for exits due to irqs. */
        return kvm_inject_nested(vcpu, 0, except_type_irq);
}













































































































































































































































































































































































































































  319 





  319 












































    5 




    5 



















































































  316 











  265 




















   57 







    3 
  315 





























  315 








  316 



  314 































  314 











    1 






























































































































































































































































































































    5 






































  177 

   23 



  203 



    5 



  198 




  289 



  289 


  203 







  202 
   13 


  203 




















  289 




  290 
























  289 
  289 
  268 
   26 



  290 

  289 







  289 













  289 








   58 







   24 


















   58 
   58 
   25 




























































































































































































  269 
  269 



  268 




























































































  265 




  265 
  265 
  265 











































  265 









  265 














  265 












































































































































































































































































































































































































































































  265 





  265 


  265 












  265 












    3 

    3 



























































































































































    3 










    3 























    3 

























    3 











    3 






















































































































































































































































































  319 
  318 







































    5 
    5 





































    3 

    3 







    1 







    1 




    1 















    1 


    1 










































    1 


    1 








    2 
    3 












    3 













































































































































































    2 




    2 























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  294 


























































































































































































































































































































































































































































































































































    4 



    4 
    4 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 




    4 

    4 




    4 









    5 









    5 


    5 







    4 

    4 



    4 

    4 
    4 























































































































































































































































































































































































































   21 











    1 


   21 


    2 







   13 





    2 




   14 














































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Routines having to do with the 'struct sk_buff' memory handlers.
 *
 *        Authors:        Alan Cox <alan@lxorguk.ukuu.org.uk>
 *                        Florian La Roche <rzsfl@rz.uni-sb.de>
 *
 *        Fixes:
 *                Alan Cox        :        Fixed the worst of the load
 *                                        balancer bugs.
 *                Dave Platt        :        Interrupt stacking fix.
 *        Richard Kooijman        :        Timestamp fixes.
 *                Alan Cox        :        Changed buffer format.
 *                Alan Cox        :        destructor hook for AF_UNIX etc.
 *                Linus Torvalds        :        Better skb_clone.
 *                Alan Cox        :        Added skb_copy.
 *                Alan Cox        :        Added all the changed routines Linus
 *                                        only put in the headers
 *                Ray VanTassle        :        Fixed --skb->lock in free
 *                Alan Cox        :        skb_copy copy arp field
 *                Andi Kleen        :        slabified it.
 *                Robert Olsson        :        Removed skb_head_pool
 *
 *        NOTE:
 *                The __skb_ routines should be called with interrupts
 *        disabled, or you better be *real* sure that the operation is atomic
 *        with respect to whatever list is being frobbed (e.g. via lock_sock()
 *        or via disabling bottom half handlers, etc).
 */

/*
 *        The functions in this file will not compile correctly with gcc 2.4.x
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/slab.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/netdevice.h>
#ifdef CONFIG_NET_CLS_ACT
#include <net/pkt_sched.h>
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/bitfield.h>
#include <linux/if_vlan.h>
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>

#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
#include <net/dropreason.h>

#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <linux/textsearch.h>

#include "dev.h"
#include "netmem_priv.h"
#include "sock_destructor.h"

#ifdef CONFIG_SKB_EXTENSIONS
/* kmem_cache handle that only exists when CONFIG_SKB_EXTENSIONS is enabled;
 * presumably it backs skb extension allocations (its users are outside this
 * excerpt). __ro_after_init: it is expected to be assigned during init only.
 */
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif

/* GRO_MAX_HEAD plus the NET_SKB_PAD and NET_IP_ALIGN padding. */
#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
/* Small-head size: the larger of MAX_TCP_HEADER and GRO_MAX_HEAD_PAD,
 * passed through SKB_HEAD_ALIGN().
 */
#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
                                               GRO_MAX_HEAD_PAD))

/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
 * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
 * size, and we can differentiate heads from skb_small_head_cache
 * vs system slabs by looking at their size (skb_end_offset()).
 *
 * If SKB_SMALL_HEAD_SIZE happens to be a power of two, it is bumped by
 * L1_CACHE_BYTES; otherwise it is used unchanged.
 */
#define SKB_SMALL_HEAD_CACHE_SIZE                                        \
        (is_power_of_2(SKB_SMALL_HEAD_SIZE) ?                        \
                (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :        \
                SKB_SMALL_HEAD_SIZE)

/* SKB_SMALL_HEAD_CACHE_SIZE with SKB_WITH_OVERHEAD() applied; this is the
 * size that the note above expects to be unique.
 */
#define SKB_SMALL_HEAD_HEADROOM                                                \
        SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)

/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
 * iov_iter_bvec(). These static asserts ensure the cast is valid as long as
 * the netmem is a page.
 *
 * Each pair below checks that one bio_vec member has the same offset and the
 * same size as its skb_frag_t counterpart.
 */
/* bv_page <-> netmem */
static_assert(offsetof(struct bio_vec, bv_page) ==
              offsetof(skb_frag_t, netmem));
static_assert(sizeof_field(struct bio_vec, bv_page) ==
              sizeof_field(skb_frag_t, netmem));

/* bv_len <-> len */
static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
static_assert(sizeof_field(struct bio_vec, bv_len) ==
              sizeof_field(skb_frag_t, len));

/* bv_offset <-> offset */
static_assert(offsetof(struct bio_vec, bv_offset) ==
              offsetof(skb_frag_t, offset));
static_assert(sizeof_field(struct bio_vec, bv_offset) ==
              sizeof_field(skb_frag_t, offset));

#undef FN
/* FN() emits one designated initializer: slot SKB_DROP_REASON_<reason> holds
 * the stringified reason name.
 */
#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
/* Name table for the core drop reasons, indexed by reason value.
 * SKB_CONSUMED is named explicitly; DEFINE_DROP_REASON() is defined outside
 * this excerpt and presumably applies FN to every core reason.
 */
static const char * const drop_reasons[] = {
        [SKB_CONSUMED] = "CONSUMED",
        DEFINE_DROP_REASON(FN, FN)
};

/* Reason list for the core subsystem: the drop_reasons[] name table together
 * with its number of entries.
 */
static const struct drop_reason_list drop_reasons_core = {
        .reasons = drop_reasons,
        .n_reasons = ARRAY_SIZE(drop_reasons),
};

/* Per-subsystem reason lists, one __rcu pointer per subsystem slot
 * (SKB_DROP_REASON_SUBSYS_NUM slots in total). Only the core slot is
 * populated statically; the other slots are set and cleared at runtime by
 * drop_reasons_register_subsys() and drop_reasons_unregister_subsys().
 */
const struct drop_reason_list __rcu *
drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
        [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
};
EXPORT_SYMBOL(drop_reasons_by_subsys);

/**
 * drop_reasons_register_subsys - install the drop reason list of a subsystem
 * @subsys: slot to fill; the core slot and out-of-range values are rejected
 * @list: drop reasons belonging to @subsys, must point to a statically
 *        initialized list
 *
 * An invalid @subsys triggers a WARN and leaves the table untouched.
 */
void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
                                  const struct drop_reason_list *list)
{
        bool bad_subsys = subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
                          subsys >= ARRAY_SIZE(drop_reasons_by_subsys);

        if (WARN(bad_subsys, "invalid subsystem %d\n", subsys))
                return;

        /* @list is statically allocated, so the plain INIT variant is OK */
        RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
}
EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);

/**
 * drop_reasons_unregister_subsys - remove the drop reason list of a subsystem
 * @subsys: slot to clear; the core slot and out-of-range values are rejected
 *
 * An invalid @subsys triggers a WARN and leaves the table untouched.
 *
 * Note: This will synchronize_rcu() to ensure no users when it returns.
 */
void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
{
        bool bad_subsys = subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
                          subsys >= ARRAY_SIZE(drop_reasons_by_subsys);

        if (WARN(bad_subsys, "invalid subsystem %d\n", subsys))
                return;

        /* Clear the slot, then wait until no reader can still see the list. */
        RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);

/**
 *        skb_panic - private function for out-of-line support
 *        @skb:        buffer
 *        @sz:        size
 *        @addr:        address
 *        @msg:        skb_over_panic or skb_under_panic
 *
 *        Shared slow path behind skb_over_panic() and skb_under_panic():
 *        logs the buffer geometry at emergency level and then BUG()s.
 *        Kept out of line to prevent kernel bloat.
 *        __builtin_return_address is not used because it is not always reliable.
 */
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
                      const char msg[])
{
        const char *ifname = skb->dev ? skb->dev->name : "<NULL>";
        unsigned long tail = (unsigned long)skb->tail;
        unsigned long end = (unsigned long)skb->end;

        pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
                 msg, addr, skb->len, sz, skb->head, skb->data, tail, end,
                 ifname);
        BUG();
}

/* Wrapper around skb_panic(); __func__ supplies "skb_over_panic" as the
 * message prefix, so the call must stay inside this function.
 */
static void skb_over_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/* Wrapper around skb_panic(); __func__ supplies "skb_under_panic" as the
 * message prefix, so the call must stay inside this function.
 */
static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
{
        skb_panic(skb, sz, addr, __func__);
}

/* Capacity of the per-CPU skb head cache (size of skb_cache[] below). */
#define NAPI_SKB_CACHE_SIZE        64
/* Number of heads requested from the slab in one bulk refill. */
#define NAPI_SKB_CACHE_BULK        16
/* Half of the cache capacity. */
#define NAPI_SKB_CACHE_HALF        (NAPI_SKB_CACHE_SIZE / 2)

/* Per-CPU allocation state used by the NAPI/BH allocation helpers.
 * @bh_lock: taken with local_lock_nested_bh() around accesses to the fields
 *           below
 * @page: page fragment cache backing skb heads and raw fragments
 * @skb_count: number of valid entries in @skb_cache
 * @skb_cache: stack of cached sk_buff heads, popped from the top
 */
struct napi_alloc_cache {
        local_lock_t bh_lock;
        struct page_frag_cache page;
        unsigned int skb_count;
        void *skb_cache[NAPI_SKB_CACHE_SIZE];
};

/* Fragment cache used when in hard IRQ context or with IRQs disabled. */
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
        .bh_lock = INIT_LOCAL_LOCK(bh_lock),
};

/* Allocate a fragment from this CPU's NAPI page-frag cache.
 * @fragsz is rounded up with SKB_DATA_ALIGN() and @align_mask is handed
 * through to __page_frag_alloc_align(). The cache is accessed under
 * napi_alloc_cache.bh_lock. Returns whatever the fragment allocator returns.
 */
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct napi_alloc_cache *cache = this_cpu_ptr(&napi_alloc_cache);
        unsigned int aligned_sz = SKB_DATA_ALIGN(fragsz);
        void *frag;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        frag = __page_frag_alloc_align(&cache->page, aligned_sz,
                                       GFP_ATOMIC | __GFP_NOWARN, align_mask);
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        return frag;
}
EXPORT_SYMBOL(__napi_alloc_frag_align);

/* Allocate a fragment for a netdev caller in any context.
 * Outside hard IRQ context and with IRQs enabled, bottom halves are disabled
 * and the NAPI cache is used; otherwise the fragment comes from the separate
 * per-CPU netdev_alloc_cache.
 */
void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
        struct page_frag_cache *nc;
        void *data;

        if (!in_hardirq() && !irqs_disabled()) {
                local_bh_disable();
                data = __napi_alloc_frag_align(fragsz, align_mask);
                local_bh_enable();
                return data;
        }

        nc = this_cpu_ptr(&netdev_alloc_cache);
        return __page_frag_alloc_align(nc, SKB_DATA_ALIGN(fragsz),
                                       GFP_ATOMIC | __GFP_NOWARN, align_mask);
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);

/* Pop one sk_buff head from this CPU's NAPI cache.
 * An empty cache is first refilled with up to NAPI_SKB_CACHE_BULK heads from
 * the slab; if that yields nothing, NULL is returned. The returned head is
 * KASAN-unpoisoned but not zeroed.
 */
static struct sk_buff *napi_skb_cache_get(void)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        struct sk_buff *skb;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);

        if (unlikely(!nc->skb_count)) {
                nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                                      GFP_ATOMIC | __GFP_NOWARN,
                                                      NAPI_SKB_CACHE_BULK,
                                                      nc->skb_cache);
                if (unlikely(!nc->skb_count))
                        goto empty;
        }

        nc->skb_count--;
        skb = nc->skb_cache[nc->skb_count];
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
        return skb;

empty:
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
        return NULL;
}

/**
 * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
 * @skbs: pointer to an at least @n-sized array to fill with skb pointers
 * @n: number of entries to provide
 *
 * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
 * the pointers into the provided array @skbs. If there are fewer entries
 * available, tries to replenish the cache and bulk-allocates the diff from
 * the MM layer if needed.
 * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
 * ready for {,__}build_skb_around() and don't have any data buffers attached.
 * Must be called *only* from the BH context.
 *
 * Return: number of successfully allocated skbs (@n if no actual allocation
 *           needed or kmem_cache_alloc_bulk() didn't fail).
 */
u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        u32 bulk, total = n;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);

        if (nc->skb_count >= n)
                goto get;

        /* Not enough cached skbs. Try refilling the cache first */
        bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
        nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                               GFP_ATOMIC | __GFP_NOWARN, bulk,
                                               &nc->skb_cache[nc->skb_count]);
        if (likely(nc->skb_count >= n))
                goto get;

        /* Still not enough. Bulk-allocate the missing part directly, zeroed.
         * These heads land in skbs[skb_count..]; afterwards @n holds how many
         * entries must still be taken from the cache.
         */
        n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
                                   GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
                                   n - nc->skb_count, &skbs[nc->skb_count]);
        if (likely(nc->skb_count >= n))
                goto get;

        /* kmem_cache didn't allocate the number we need, limit the output */
        total -= n - nc->skb_count;
        n = nc->skb_count;

get:
        /* Move the top @n cached heads into skbs[0..n) and zero them */
        for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
                u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);

                skbs[i] = nc->skb_cache[base + i];

                kasan_mempool_unpoison_object(skbs[i], cache_size);
                memset(skbs[i], 0, offsetof(struct sk_buff, tail));
        }

        nc->skb_count -= n;
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        return total;
}
EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);

/* Attach @data to an already cleared @skb and initialise the fields that the
 * caller's memset() does not cover.
 * @skb: head whose fields before 'tail' were zeroed by the caller
 * @data: start of the data buffer
 * @size: total buffer size, including room for struct skb_shared_info
 */
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
                                         unsigned int size)
{
        struct skb_shared_info *shinfo;

        /* From here on @size is the usable area in front of the shared info */
        size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        /* Assumes caller memset cleared SKB */
        skb->truesize = SKB_TRUESIZE(size);
        refcount_set(&skb->users, 1);
        skb->head = data;
        skb->data = data;
        skb_reset_tail_pointer(skb);
        skb_set_end_offset(skb, size);
        /* All-ones marks the MAC and transport header offsets as unset */
        skb->mac_header = (typeof(skb->mac_header))~0U;
        skb->transport_header = (typeof(skb->transport_header))~0U;
        skb->alloc_cpu = raw_smp_processor_id();
        /* make sure we initialize shinfo sequentially */
        shinfo = skb_shinfo(skb);
        /* Only the members in front of dataref are cleared here */
        memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
        atomic_set(&shinfo->dataref, 1);

        skb_set_kcov_handle(skb, kcov_common_handle());
}

/* Determine the real size of the slab buffer @data, store it in *@size and
 * return the pointer to use from now on. @skb is not used here.
 */
static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
                                     unsigned int *size)
{
        /* Must find the allocation size (and grow it to match). */
        unsigned int usable = ksize(data);
        void *resized;

        *size = usable;

        /* Asking krealloc() for exactly "ksize(data)" returns "data"
         * immediately, since that is the existing upper bound, so
         * GFP_ATOMIC is effectively ignored. The "new" pointer still has
         * to be handed back to the caller so that the __alloc_size
         * hinting is tracked correctly.
         */
        resized = krealloc(data, usable, GFP_ATOMIC);
        WARN_ON_ONCE(resized != data);

        return resized;
}

/* Variant of build_skb() for buffers that come from the slab allocator.
 * Use it sparingly: slab buffers cannot be combined efficiently by GRO!
 * Returns the new skb, or NULL if no head could be allocated.
 */
struct sk_buff *slab_build_skb(void *data)
{
        unsigned int size;
        struct sk_buff *skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
                                               GFP_ATOMIC | __GFP_NOWARN);

        if (likely(skb)) {
                memset(skb, 0, offsetof(struct sk_buff, tail));
                data = __slab_build_skb(skb, data, &size);
                __finalize_skb_around(skb, data, size);
        }

        return skb;
}
EXPORT_SYMBOL(slab_build_skb);

/* Wrap @data in @skb. The caller must hand in an SKB that is memset cleared. */
static void __build_skb_around(struct sk_buff *skb, void *data,
                               unsigned int frag_size)
{
        unsigned int size = frag_size;
        void *head = data;

        /* A zero frag_size is deprecated: slab-buffer users are expected
         * to call slab_build_skb(). Warn once, then size the buffer the
         * way slab_build_skb() would.
         */
        if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
                head = __slab_build_skb(skb, data, &size);

        __finalize_skb_around(skb, head, size);
}

/**
 * __build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data (must not be 0)
 *
 * Allocate a new &sk_buff around a caller-provided buffer that holds the
 * head and skb_shared_info. @data must have been allocated from the page
 * allocator or vmalloc(). (Passing a @frag_size of 0 to indicate a kmalloc()
 * allocation is deprecated; callers should use slab_build_skb() instead.)
 * The return is the new skb buffer.
 * On a failure the return is %NULL, and @data is not freed.
 * Notes :
 *  Before IO, driver allocates only data buffer where NIC put incoming frame
 *  Driver should add room at head (NET_SKB_PAD) and
 *  MUST add room at tail (SKB_DATA_ALIGN(skb_shared_info))
 *  After IO, driver calls build_skb(), to allocate sk_buff and populate it
 *  before giving packet to stack.
 *  RX rings only contain data buffers, not full skbs.
 */
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
                                               GFP_ATOMIC | __GFP_NOWARN);

        if (likely(skb)) {
                /* clear everything in front of 'tail', then attach @data */
                memset(skb, 0, offsetof(struct sk_buff, tail));
                __build_skb_around(skb, data, frag_size);
        }

        return skb;
}

/* build_skb() wraps __build_skb() and additionally takes care of
 * skb->head_frag and skb->pfmemalloc when @frag_size is non-zero.
 */
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __build_skb(data, frag_size);

        if (unlikely(!skb) || !frag_size)
                return skb;

        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);

        return skb;
}
EXPORT_SYMBOL(build_skb);

/**
 * build_skb_around - build a network buffer around provided skb
 * @skb: sk_buff provided by caller, must be memset cleared
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Return: @skb with @data attached, or %NULL when @skb is %NULL.
 */
struct sk_buff *build_skb_around(struct sk_buff *skb,
                                 void *data, unsigned int frag_size)
{
        if (unlikely(!skb))
                return NULL;

        __build_skb_around(skb, data, frag_size);
        if (!frag_size)
                return skb;

        /* non-zero frag_size: mark the head as a page fragment */
        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);

        return skb;
}
EXPORT_SYMBOL(build_skb_around);

/**
 * __napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Like __build_skb(), except that the skbuff_head is taken from the NAPI
 * percpu cache instead of being allocated in place.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = napi_skb_cache_get();

        if (likely(skb)) {
                /* cached heads are not zeroed when handed out */
                memset(skb, 0, offsetof(struct sk_buff, tail));
                __build_skb_around(skb, data, frag_size);
        }

        return skb;
}

/**
 * napi_build_skb - build a network buffer
 * @data: data buffer provided by caller
 * @frag_size: size of data
 *
 * Wraps __napi_build_skb() and, for a non-zero @frag_size, also sets
 * skb->head_frag and propagates skb->pfmemalloc from the backing page.
 *
 * Returns a new &sk_buff on success, %NULL on allocation failure.
 */
struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
        struct sk_buff *skb = __napi_build_skb(data, frag_size);

        if (unlikely(!skb) || !frag_size)
                return skb;

        skb->head_frag = 1;
        skb_propagate_pfmemalloc(virt_to_head_page(data), skb);

        return skb;
}
EXPORT_SYMBOL(napi_build_skb);

/*
 * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
 * the caller if emergency pfmemalloc reserves are being used. If it is and
 * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
 * may be used. Otherwise, the packet data may be discarded until enough
 * memory is free
 *
 * @size: in: requested size; out: size of the object actually allocated
 * @flags: allocation flags
 * @node: NUMA node to allocate on
 * @pfmemalloc: optional; set to true when the reserves had to be tried
 *
 * Returns the allocated object or NULL.
 */
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
                             bool *pfmemalloc)
{
        bool ret_pfmemalloc = false;
        size_t obj_size;
        void *obj;

        obj_size = SKB_HEAD_ALIGN(*size);
        /* Small requests with no special kmalloc bits use the dedicated
         * small-head cache; *size then reports that cache's object size.
         */
        if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
            !(flags & KMALLOC_NOT_NORMAL_BITS)) {
                obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
                                flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                node);
                *size = SKB_SMALL_HEAD_CACHE_SIZE;
                if (obj || !(gfp_pfmemalloc_allowed(flags)))
                        goto out;
                /* Try again but now we are using pfmemalloc reserves */
                ret_pfmemalloc = true;
                obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
                goto out;
        }

        obj_size = kmalloc_size_roundup(obj_size);
        /* The following cast might truncate high-order bits of obj_size, this
         * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
         */
        *size = (unsigned int)obj_size;

        /*
         * Try a regular allocation, when that fails and we're not entitled
         * to the reserves, fail.
         */
        obj = kmalloc_node_track_caller(obj_size,
                                        flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
                                        node);
        if (obj || !(gfp_pfmemalloc_allowed(flags)))
                goto out;

        /* Try again but now we are using pfmemalloc reserves */
        ret_pfmemalloc = true;
        obj = kmalloc_node_track_caller(obj_size, flags, node);

out:
        /* The flag is reported even when the retry itself failed */
        if (pfmemalloc)
                *pfmemalloc = ret_pfmemalloc;

        return obj;
}

/*         Allocate a new skbuff. We do this ourselves so we can fill in a few
 *        'private' fields and also do memory statistics to find all the
 *        [BEEP] leaks.
 *
 */

/**
 *        __alloc_skb        -        allocate a network buffer
 *        @size: size to allocate
 *        @gfp_mask: allocation mask
 *        @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
 *                instead of head cache and allocate a cloned (child) skb.
 *                If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
 *                allocations in case the data is required for writeback.
 *                If SKB_ALLOC_NAPI is set without SKB_ALLOC_FCLONE, the
 *                head is taken from the NAPI percpu cache when @node
 *                permits it.
 *        @node: numa node to allocate memory on
 *
 *        Allocate a new &sk_buff. The returned buffer has no headroom and a
 *        tail room of at least size bytes. The object has a reference count
 *        of one. The return is the buffer. On a failure the return is %NULL.
 *
 *        Buffers may only be allocated from interrupts using a @gfp_mask of
 *        %GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
                            int flags, int node)
{
        struct kmem_cache *cache;
        struct sk_buff *skb;
        bool pfmemalloc;
        u8 *data;

        cache = (flags & SKB_ALLOC_FCLONE)
                ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;

        if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
                gfp_mask |= __GFP_MEMALLOC;

        /* Get the HEAD */
        if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
            likely(node == NUMA_NO_NODE || node == numa_mem_id()))
                skb = napi_skb_cache_get();
        else
                skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
        if (unlikely(!skb))
                return NULL;
        prefetchw(skb);

        /* We do our best to align skb_shared_info on a separate cache
         * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
         * aligned memory blocks, unless SLUB/SLAB debug is enabled.
         * Both skb->head and skb_shared_info are cache line aligned.
         */
        data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
        if (unlikely(!data))
                goto nodata;
        /* kmalloc_size_roundup() might give us more room than requested.
         * Put skb_shared_info exactly at the end of allocated zone,
         * to allow max possible filling before reallocation.
         */
        prefetchw(data + SKB_WITH_OVERHEAD(size));

        /*
         * Only clear those fields we need to clear, not those that we will
         * actually initialise below. Hence, don't put any more fields after
         * the tail pointer in struct sk_buff!
         */
        memset(skb, 0, offsetof(struct sk_buff, tail));
        __build_skb_around(skb, data, size);
        skb->pfmemalloc = pfmemalloc;

        if (flags & SKB_ALLOC_FCLONE) {
                struct sk_buff_fclones *fclones;

                /* The head returned above is the skb1 member of the pair */
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                skb->fclone = SKB_FCLONE_ORIG;
                refcount_set(&fclones->fclone_ref, 1);
        }

        return skb;

nodata:
        /* Data allocation failed: hand the head back to its slab cache */
        kmem_cache_free(cache, skb);
        return NULL;
}
EXPORT_SYMBOL(__alloc_skb);

/**
 *        __netdev_alloc_skb - allocate an skbuff for rx on a specific device
 *        @dev: network device to receive on
 *        @len: length to allocate
 *        @gfp_mask: get_free_pages mask, passed to alloc_skb
 *
 *        Allocate a new &sk_buff and assign it a usage count of one. The
 *        buffer has NET_SKB_PAD headroom built in. Users should allocate
 *        the headroom they think they need without accounting for the
 *        built in space. The built in space is used for optimisations.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
                                   gfp_t gfp_mask)
{
        struct page_frag_cache *nc;
        struct sk_buff *skb;
        bool pfmemalloc;
        void *data;

        len += NET_SKB_PAD;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         * The same path is taken when the mask allows direct reclaim or
         * asks for DMA memory.
         */
        if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        len = SKB_HEAD_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        /* Choose the per-CPU fragment cache that matches the context */
        if (in_hardirq() || irqs_disabled()) {
                nc = this_cpu_ptr(&netdev_alloc_cache);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
        } else {
                local_bh_disable();
                local_lock_nested_bh(&napi_alloc_cache.bh_lock);

                nc = this_cpu_ptr(&napi_alloc_cache.page);
                data = page_frag_alloc(nc, len, gfp_mask);
                pfmemalloc = page_frag_cache_is_pfmemalloc(nc);

                local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
                local_bh_enable();
        }

        if (unlikely(!data))
                return NULL;

        skb = __build_skb(data, len);
        if (unlikely(!skb)) {
                /* No head available: give the fragment back */
                skb_free_frag(data);
                return NULL;
        }

        if (pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD);
        skb->dev = dev;

skb_fail:
        return skb;
}
EXPORT_SYMBOL(__netdev_alloc_skb);

/**
 *        napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
 *        @napi: napi instance this buffer was allocated for
 *        @len: length to allocate
 *
 *        Allocate a new sk_buff for use in NAPI receive.  This buffer will
 *        attempt to allocate the head from a special reserved region used
 *        only for NAPI Rx allocation.  By doing this we can save several
 *        CPU cycles by avoiding having to disable and re-enable IRQs.
 *
 *        The buffer has NET_SKB_PAD + NET_IP_ALIGN bytes of headroom
 *        reserved and its dev set to @napi->dev.
 *
 *        %NULL is returned if there is no free memory.
 */
struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
        gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
        struct napi_alloc_cache *nc;
        struct sk_buff *skb;
        bool pfmemalloc;
        void *data;

        DEBUG_NET_WARN_ON_ONCE(!in_softirq());
        len += NET_SKB_PAD + NET_IP_ALIGN;

        /* If requested length is either too small or too big,
         * we use kmalloc() for skb->head allocation.
         */
        if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
            len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
            (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
                skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
                                  NUMA_NO_NODE);
                if (!skb)
                        goto skb_fail;
                goto skb_success;
        }

        len = SKB_HEAD_ALIGN(len);

        if (sk_memalloc_socks())
                gfp_mask |= __GFP_MEMALLOC;

        /* The per-CPU fragment cache is only touched under bh_lock */
        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        nc = this_cpu_ptr(&napi_alloc_cache);

        data = page_frag_alloc(&nc->page, len, gfp_mask);
        pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);

        if (unlikely(!data))
                return NULL;

        skb = __napi_build_skb(data, len);
        if (unlikely(!skb)) {
                /* No head available: give the fragment back */
                skb_free_frag(data);
                return NULL;
        }

        if (pfmemalloc)
                skb->pfmemalloc = 1;
        skb->head_frag = 1;

skb_success:
        skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
        skb->dev = napi->dev;

skb_fail:
        return skb;
}
EXPORT_SYMBOL(napi_alloc_skb);

/* Install @netmem as fragment @i of @skb (offset @off, @size bytes) and
 * account for it: len and data_len grow by @size, truesize by @truesize.
 */
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
                            int off, int size, unsigned int truesize)
{
        DEBUG_NET_WARN_ON_ONCE(size > truesize);

        skb_fill_netmem_desc(skb, i, netmem, off, size);

        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_add_rx_frag_netmem);

/* Extend the existing fragment @i of @skb by @size bytes and account for
 * it: len and data_len grow by @size, truesize by @truesize.
 */
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
                          unsigned int truesize)
{
        DEBUG_NET_WARN_ON_ONCE(size > truesize);

        skb_frag_size_add(&skb_shinfo(skb)->frags[i], size);

        skb->truesize += truesize;
        skb->data_len += size;
        skb->len += size;
}
EXPORT_SYMBOL(skb_coalesce_rx_frag);

/* Free the skb list that *@listp points to, then clear *@listp. */
static void skb_drop_list(struct sk_buff **listp)
{
        struct sk_buff *head = *listp;

        kfree_skb_list(head);
        *listp = NULL;
}

/* Free @skb's frag_list and reset the pointer to NULL. */
static inline void skb_drop_fraglist(struct sk_buff *skb)
{
        skb_drop_list(&skb_shinfo(skb)->frag_list);
}

static void skb_clone_fraglist(struct sk_buff *skb)
{
        struct sk_buff *list;

        skb_walk_frags(skb, list)
                skb_get(list);
}

/* True when @netmem's pp_magic, with its two lowest bits masked off,
 * equals PP_SIGNATURE.
 */
static bool is_pp_netmem(netmem_ref netmem)
{
        unsigned long magic = netmem_get_pp_magic(netmem);

        return (magic & ~0x3UL) == PP_SIGNATURE;
}

/* Replace *@pskb with a copy whose head and fragments are allocated from
 * the page pool @pool, leaving @headroom bytes in front of the data.
 *
 * Returns 0 on success; the original skb has then been consumed and *@pskb
 * points to the copy. On failure *@pskb is left untouched and a negative
 * errno is returned: -EOPNOTSUPP for skbs with a frag list or when
 * CONFIG_PAGE_POOL is disabled, -ENOMEM when the skb is too large or an
 * allocation fails, or the error reported by skb_copy_bits().
 */
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
                    unsigned int headroom)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
        u32 size, truesize, len, max_head_size, off;
        struct sk_buff *skb = *pskb, *nskb;
        int err, i, head_off;
        void *data;

        /* XDP does not support fraglist so we need to linearize
         * the skb.
         */
        if (skb_has_frag_list(skb))
                return -EOPNOTSUPP;

        /* The copy consists of one head page plus up to MAX_SKB_FRAGS pages */
        max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
        if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
                return -ENOMEM;

        size = min_t(u32, skb->len, max_head_size);
        truesize = SKB_HEAD_ALIGN(size) + headroom;
        data = page_pool_dev_alloc_va(pool, &truesize);
        if (!data)
                return -ENOMEM;

        nskb = napi_build_skb(data, truesize);
        if (!nskb) {
                page_pool_free_va(pool, data, true);
                return -ENOMEM;
        }

        skb_reserve(nskb, headroom);
        skb_copy_header(nskb, skb);
        skb_mark_for_recycle(nskb);

        /* Copy the first @size bytes into the new linear area */
        err = skb_copy_bits(skb, 0, nskb->data, size);
        if (err) {
                consume_skb(nskb);
                return err;
        }
        skb_put(nskb, size);

        /* Shift the header offsets by the difference in headroom */
        head_off = skb_headroom(nskb) - skb_headroom(skb);
        skb_headers_offset_update(nskb, head_off);

        /* Copy the remainder into page-pool fragments of up to PAGE_SIZE */
        off = size;
        len = skb->len - off;
        for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
                struct page *page;
                u32 page_off;

                size = min_t(u32, len, PAGE_SIZE);
                truesize = size;

                page = page_pool_dev_alloc(pool, &page_off, &truesize);
                if (!page) {
                        consume_skb(nskb);
                        return -ENOMEM;
                }

                skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
                err = skb_copy_bits(skb, off, page_address(page) + page_off,
                                    size);
                if (err) {
                        consume_skb(nskb);
                        return err;
                }

                len -= size;
                off += size;
        }

        consume_skb(skb);
        *pskb = nskb;

        return 0;
#else
        return -EOPNOTSUPP;
#endif
}
EXPORT_SYMBOL(skb_pp_cow_data);

/* XDP front end of skb_pp_cow_data(): requires a program with
 * xdp_has_frags set (otherwise -EINVAL) and uses XDP_PACKET_HEADROOM as
 * the headroom.
 */
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
                         const struct bpf_prog *prog)
{
        if (prog->aux->xdp_has_frags)
                return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);

        return -EINVAL;
}
EXPORT_SYMBOL(skb_cow_data_for_xdp);

#if IS_ENABLED(CONFIG_PAGE_POOL)
/* Return @netmem to its page pool if it belongs to one.
 * Returns true when the memory was handed to the pool, false when its
 * compound head does not carry the page-pool signature.
 */
bool napi_pp_put_page(netmem_ref netmem)
{
        netmem_ref head = netmem_compound_head(netmem);

        /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
         * so that already-set bits survive, such as bit 0 for the head
         * page of a compound page and bit 1 for a pfmemalloc page. The
         * freeing side therefore masks those bits in the check below;
         * page_is_pfmemalloc() is checked in __page_pool_put_page() to
         * avoid recycling the pfmemalloc page.
         */
        if (likely(is_pp_netmem(head))) {
                page_pool_put_full_netmem(netmem_get_pp(head), head, false);
                return true;
        }

        return false;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif

/* Try to recycle the page behind @data into its page pool. Only attempted
 * with CONFIG_PAGE_POOL enabled and skb->pp_recycle set; returns true when
 * napi_pp_put_page() took the page.
 */
static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
        if (IS_ENABLED(CONFIG_PAGE_POOL) && skb->pp_recycle)
                return napi_pp_put_page(page_to_netmem(virt_to_page(data)));

        return false;
}

/**
 * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
 * @skb:        page pool aware skb
 *
 * Take one more fragment reference (pp_ref_count) on each fragment of @skb.
 * This is meant only for page pool aware skbs, i.e. when skb->pp_recycle
 * is true, and not for fragments in a non-pp-recycling skb. Because page
 * pool aware skbs may also carry normal page fragments, those get an
 * ordinary page reference instead.
 *
 * Return: 0 on success, -EINVAL when skb->pp_recycle is not set.
 */
static int skb_pp_frag_ref(struct sk_buff *skb)
{
        struct skb_shared_info *shinfo;
        int i;

        if (!skb->pp_recycle)
                return -EINVAL;

        shinfo = skb_shinfo(skb);

        for (i = 0; i < shinfo->nr_frags; i++) {
                netmem_ref head = netmem_compound_head(shinfo->frags[i].netmem);

                if (unlikely(!is_pp_netmem(head))) {
                        page_ref_inc(netmem_to_page(head));
                        continue;
                }

                page_pool_ref_netmem(head);
        }

        return 0;
}

/* Free a non-fragment skb head. A head whose end offset equals
 * SKB_SMALL_HEAD_HEADROOM goes back to the small-head cache; any other
 * head is released with kfree().
 */
static void skb_kfree_head(void *head, unsigned int end_offset)
{
        if (end_offset != SKB_SMALL_HEAD_HEADROOM) {
                kfree(head);
                return;
        }

        kmem_cache_free(net_hotdata.skb_small_head_cache, head);
}

/* Release the head buffer of @skb: page-fragment heads are recycled into
 * the page pool when possible and freed as a fragment otherwise; all other
 * heads are freed through skb_kfree_head().
 */
static void skb_free_head(struct sk_buff *skb)
{
        unsigned char *head = skb->head;

        if (!skb->head_frag) {
                skb_kfree_head(head, skb_end_offset(skb));
                return;
        }

        if (!skb_pp_recycle(skb, head))
                skb_free_frag(head);
}

/* Drop @skb's reference on its data area. When skb_data_unref() allows it,
 * also release the page fragments, the frag list (using @reason) and the
 * head buffer. The pp_recycle bit is cleared in every case.
 */
static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);
        int i;

        if (!skb_data_unref(skb, shinfo))
                goto exit;

        if (skb_zcopy(skb)) {
                /* Read the flag before skb_zcopy_clear() runs */
                bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;

                skb_zcopy_clear(skb, true);
                if (skip_unref)
                        goto free_head;
        }

        for (i = 0; i < shinfo->nr_frags; i++)
                __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);

free_head:
        if (shinfo->frag_list)
                kfree_skb_list_reason(shinfo->frag_list, reason);

        skb_free_head(skb);
exit:
        /* When we clone an SKB we copy the recycling bit. The pp_recycle
         * bit is only set on the head though, so in order to avoid races
         * while trying to recycle fragments on __skb_frag_unref() we need
         * to make one SKB responsible for triggering the recycle path.
         * So disable the recycling bit if an SKB is cloned and we have
         * additional references to the fragmented part of the SKB.
         * Eventually the last SKB will have the recycling bit set and its
         * dataref set to 0, which will trigger the recycling
         */
        skb->pp_recycle = 0;
}

/*
 *        Free an skbuff by memory without cleaning the state.
 *
 *        Plain skbs go straight back to the skbuff cache. For a fast-clone
 *        pair the shared sk_buff_fclones object is freed only once
 *        fclone_ref drops to zero.
 */
static void kfree_skbmem(struct sk_buff *skb)
{
        struct sk_buff_fclones *fclones;

        switch (skb->fclone) {
        case SKB_FCLONE_UNAVAILABLE:
                kmem_cache_free(net_hotdata.skbuff_cache, skb);
                return;

        case SKB_FCLONE_ORIG:
                /* The original is the skb1 member of the pair */
                fclones = container_of(skb, struct sk_buff_fclones, skb1);

                /* We usually free the clone (TX completion) before original skb
                 * This test would have no chance to be true for the clone,
                 * while here, branch prediction will be good.
                 */
                if (refcount_read(&fclones->fclone_ref) == 1)
                        goto fastpath;
                break;

        default: /* SKB_FCLONE_CLONE */
                /* The clone is the skb2 member of the pair */
                fclones = container_of(skb, struct sk_buff_fclones, skb2);
                break;
        }
        if (!refcount_dec_and_test(&fclones->fclone_ref))
                return;
fastpath:
        kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}

/* Release the non-data state attached to @skb: drop the dst, invoke the
 * destructor callback when one is set, put the conntrack reference (only
 * when CONFIG_NF_CONNTRACK is enabled) and put the skb extensions.
 */
void skb_release_head_state(struct sk_buff *skb)
{
        skb_dst_drop(skb);
        if (skb->destructor) {
                /* debug builds warn if a destructor runs in hard IRQ context */
                DEBUG_NET_WARN_ON_ONCE(in_hardirq());
                skb->destructor(skb);
        }
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        nf_conntrack_put(skb_nfct(skb));
#endif
        skb_ext_put(skb);
}

/* Release everything hanging off @skb while keeping the sk_buff structure
 * itself allocated: the head state always, and the data area (with @reason
 * passed along) whenever a head buffer is present.
 */
static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
        skb_release_head_state(skb);

        if (unlikely(!skb->head))
                return;

        skb_release_data(skb, reason);
}

/**
 *        __kfree_skb - private function
 *        @skb: buffer
 *
 *        Free an sk_buff. Release anything attached to the buffer, using
 *        SKB_DROP_REASON_NOT_SPECIFIED as the drop reason, then return the
 *        sk_buff structure itself to its cache. No reference counting is
 *        done here. This is an internal helper function. Users should
 *        always call kfree_skb
 */

void __kfree_skb(struct sk_buff *skb)
{
        skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
        kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);

/* Shared front end of the reason-carrying free paths.
 *
 * Drops one reference through skb_unref(). When skb_unref() returns false
 * nothing else happens and false is returned. Otherwise the consume_skb
 * tracepoint (for SKB_CONSUMED) or the kfree_skb tracepoint (for any other
 * @reason) is emitted and true is returned so the caller frees @skb.
 */
static __always_inline
bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
                          enum skb_drop_reason reason)
{
        if (unlikely(!skb_unref(skb)))
                return false;

        /* debug builds flag a missing reason or an out-of-range subsystem */
        DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
                               u32_get_bits(reason,
                                            SKB_DROP_REASON_SUBSYS_MASK) >=
                                SKB_DROP_REASON_SUBSYS_NUM);

        if (reason != SKB_CONSUMED)
                trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
        else
                trace_consume_skb(skb, __builtin_return_address(0));

        return true;
}

/**
 *        sk_skb_reason_drop - release an sk_buff, recording why it was dropped
 *        @sk: the socket to receive @skb, or NULL if not applicable
 *        @skb: buffer to free
 *        @reason: reason why this skb is dropped
 *
 *        Gives up one reference on @skb. If that was the reference that
 *        allows the buffer to be freed, the socket and drop reason are
 *        reported through the 'kfree_skb' tracepoint and the buffer is
 *        freed.
 */
void __fix_address
sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
        if (!__sk_skb_reason_drop(sk, skb, reason))
                return;

        __kfree_skb(skb);
}
EXPORT_SYMBOL(sk_skb_reason_drop);

/* Number of sk_buff pointers collected before a bulk slab free is issued */
#define KFREE_SKB_BULK_SIZE        16

/* Batch of sk_buff structures waiting for kmem_cache_free_bulk() */
struct skb_free_array {
        /* number of valid entries at the start of skb_array */
        unsigned int skb_count;
        void *skb_array[KFREE_SKB_BULK_SIZE];
};

/* Release the contents of @skb and queue its sk_buff structure in @sa for
 * a later bulk free. The batch is flushed to skbuff_cache as soon as it
 * holds KFREE_SKB_BULK_SIZE entries.
 */
static void kfree_skb_add_bulk(struct sk_buff *skb,
                               struct skb_free_array *sa,
                               enum skb_drop_reason reason)
{
        /* skbs whose fclone state is not SKB_FCLONE_UNAVAILABLE are not
         * batched; they take the regular free path
         */
        if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
                __kfree_skb(skb);
                return;
        }

        skb_release_all(skb, reason);

        sa->skb_array[sa->skb_count] = skb;
        sa->skb_count++;
        if (likely(sa->skb_count != KFREE_SKB_BULK_SIZE))
                return;

        /* batch is full: hand it to the slab and start a new one */
        kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
                             sa->skb_array);
        sa->skb_count = 0;
}

/* Walk the ->next chain starting at @segs, dropping one reference on each
 * skb with @reason. Buffers that become freeable are released in batches
 * through kfree_skb_add_bulk().
 */
void __fix_address
kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
        struct sk_buff *skb, *next;
        struct skb_free_array sa;

        sa.skb_count = 0;

        for (skb = segs; skb; skb = next) {
                /* fetch the link before the skb is poisoned and released */
                next = skb->next;

                if (!__sk_skb_reason_drop(NULL, skb, reason))
                        continue;

                skb_poison_list(skb);
                kfree_skb_add_bulk(skb, &sa, reason);
        }

        /* flush the partially filled last batch */
        if (sa.skb_count)
                kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);

/* Dump skb information and contents.
 *
 * Must only be called from net_ratelimit()-ed paths.
 *
 * Dumps whole packets if full_pkt, only headers otherwise.
 *
 * @level: printk level prefix placed in front of every emitted line
 * @skb: buffer to describe
 * @full_pkt: when true, also dump headroom, tailroom, all payload bytes
 *            and every skb on the frag list; when false, payload output
 *            is capped at MAX_HEADER + 128 bytes
 */
void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
{
        struct skb_shared_info *sh = skb_shinfo(skb);
        struct net_device *dev = skb->dev;
        struct sock *sk = skb->sk;
        struct sk_buff *list_skb;
        bool has_mac, has_trans;
        int headroom, tailroom;
        int i, len, seg_len;

        /* len is the number of payload bytes still allowed to be dumped */
        if (full_pkt)
                len = skb->len;
        else
                len = min_t(int, skb->len, MAX_HEADER + 128);

        headroom = skb_headroom(skb);
        tailroom = skb_tailroom(skb);

        has_mac = skb_mac_header_was_set(skb);
        has_trans = skb_transport_header_was_set(skb);

        printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
               "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
               "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
               "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
               "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
               "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
               "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
               level, skb->len, headroom, skb_headlen(skb), tailroom,
               has_mac ? skb->mac_header : -1,
               has_mac ? skb_mac_header_len(skb) : -1,
               skb->mac_len,
               skb->network_header,
               has_trans ? skb_network_header_len(skb) : -1,
               has_trans ? skb->transport_header : -1,
               sh->tx_flags, sh->nr_frags,
               sh->gso_size, sh->gso_type, sh->gso_segs,
               skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
               skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
               skb->hash, skb->sw_hash, skb->l4_hash,
               ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
               skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
               skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
               skb->inner_network_header, skb->inner_transport_header);

        if (dev)
                printk("%sdev name=%s feat=%pNF\n",
                       level, dev->name, &dev->features);
        if (sk)
                printk("%ssk family=%hu type=%u proto=%u\n",
                       level, sk->sk_family, sk->sk_type, sk->sk_protocol);

        if (full_pkt && headroom)
                print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->head, headroom, false);

        seg_len = min_t(int, skb_headlen(skb), len);
        if (seg_len)
                print_hex_dump(level, "skb linear:   ", DUMP_PREFIX_OFFSET,
                               16, 1, skb->data, seg_len, false);
        len -= seg_len;

        if (full_pkt && tailroom)
                print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
                               16, 1, skb_tail_pointer(skb), tailroom, false);

        for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                if (skb_frag_is_net_iov(frag)) {
                        printk("%sskb frag %d: not readable\n", level, i);
                        /* Clamp to the remaining budget: in the truncated
                         * (!full_pkt) case the frag may be larger than
                         * what is left, and a negative len would later be
                         * passed to print_hex_dump() as a huge size.
                         */
                        len -= min_t(int, skb_frag_size(frag), len);
                        if (!len)
                                break;
                        continue;
                }

                skb_frag_foreach_page(frag, skb_frag_off(frag),
                                      skb_frag_size(frag), p, p_off, p_len,
                                      copied) {
                        seg_len = min_t(int, p_len, len);
                        vaddr = kmap_atomic(p);
                        print_hex_dump(level, "skb frag:     ",
                                       DUMP_PREFIX_OFFSET,
                                       16, 1, vaddr + p_off, seg_len, false);
                        kunmap_atomic(vaddr);
                        len -= seg_len;
                        if (!len)
                                break;
                }
        }

        if (full_pkt && skb_has_frag_list(skb)) {
                /* carry the caller's level like every other line above */
                printk("%sskb fraglist:\n", level);
                skb_walk_frags(skb, list_skb)
                        skb_dump(level, list_skb, true);
        }
}
EXPORT_SYMBOL(skb_dump);

/**
 *        skb_tx_error - report an sk_buff xmit error
 *        @skb: buffer that triggered an error, may be NULL
 *
 *        Lets a device callback that is tracking this skb know about the
 *        transmit error: managed frag references are downgraded and the
 *        zerocopy state is cleared. A NULL @skb is ignored.
 *        skb must be freed afterwards.
 */
void skb_tx_error(struct sk_buff *skb)
{
        if (!skb)
                return;

        skb_zcopy_downgrade_managed(skb);
        skb_zcopy_clear(skb, true);
}
EXPORT_SYMBOL(skb_tx_error);

#ifdef CONFIG_TRACEPOINTS
/**
 *        consume_skb - free an skbuff
 *        @skb: buffer to free
 *
 *        Gives up one reference on the buffer; when skb_unref() reports
 *        that the buffer may be freed, the consume_skb tracepoint fires
 *        and the buffer is released. Works like kfree_skb, except that
 *        kfree_skb treats the frame as dropped after a failure and
 *        records it as such.
 */
void consume_skb(struct sk_buff *skb)
{
        if (skb_unref(skb)) {
                trace_consume_skb(skb, __builtin_return_address(0));
                __kfree_skb(skb);
        }
}
EXPORT_SYMBOL(consume_skb);
#endif

/**
 *        __consume_stateless_skb - free an skbuff, assuming it is stateless
 *        @skb: buffer to free
 *
 *        Like consume_skb(), but this variant assumes that this is the last
 *        skb reference and all the head states have been already dropped.
 *        Accordingly no reference is dropped and skb_release_head_state()
 *        is not called here: only the data (with SKB_CONSUMED) and the
 *        sk_buff structure are released.
 */
void __consume_stateless_skb(struct sk_buff *skb)
{
        trace_consume_skb(skb, __builtin_return_address(0));
        skb_release_data(skb, SKB_CONSUMED);
        kfree_skbmem(skb);
}

/* Stash a released sk_buff structure in this CPU's napi_alloc_cache.
 *
 * The skb is first poisoned via kasan_mempool_poison_object(); if that
 * returns false the skb is not cached. When the cache reaches
 * NAPI_SKB_CACHE_SIZE entries, the upper half (indices
 * NAPI_SKB_CACHE_HALF and above) is unpoisoned and returned to
 * skbuff_cache with one bulk free.
 */
static void napi_skb_cache_put(struct sk_buff *skb)
{
        struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
        u32 i;

        if (!kasan_mempool_poison_object(skb))
                return;

        local_lock_nested_bh(&napi_alloc_cache.bh_lock);
        nc->skb_cache[nc->skb_count++] = skb;

        if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
                /* entries were poisoned when cached; undo that before
                 * giving them back to the slab
                 */
                for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
                        kasan_mempool_unpoison_object(nc->skb_cache[i],
                                                kmem_cache_size(net_hotdata.skbuff_cache));

                kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
                                     nc->skb_cache + NAPI_SKB_CACHE_HALF);
                nc->skb_count = NAPI_SKB_CACHE_HALF;
        }
        local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}

/* Release everything attached to @skb with @reason, then place the
 * sk_buff structure in the per-CPU NAPI skb cache. No reference count
 * is dropped here.
 */
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
        skb_release_all(skb, reason);
        napi_skb_cache_put(skb);
}

/* Recycle the sk_buff structure of @skb into the per-CPU NAPI skb cache
 * without releasing its data (skb_release_data() is not called here).
 *
 * When skb->slow_gro is set, the conntrack, dst, extension and owner
 * state are dropped first and the flag is cleared.
 */
void napi_skb_free_stolen_head(struct sk_buff *skb)
{
        if (unlikely(skb->slow_gro)) {
                nf_reset_ct(skb);
                skb_dst_drop(skb);
                skb_ext_put(skb);
                skb_orphan(skb);
                skb->slow_gro = 0;
        }
        napi_skb_cache_put(skb);
}

/* Consume @skb from a NAPI poll routine.
 *
 * @skb: buffer to consume
 * @budget: NAPI budget; zero means the caller is not in NAPI context
 *
 * With a zero @budget the skb is handed to dev_consume_skb_any().
 * Otherwise one reference is dropped and, if the skb may be freed, its
 * contents are released and the sk_buff structure is put in the per-CPU
 * NAPI cache. Skbs whose fclone state is not SKB_FCLONE_UNAVAILABLE are
 * freed through __kfree_skb() instead of being cached.
 */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
        /* Zero budget indicate non-NAPI context called us, like netpoll */
        if (unlikely(!budget)) {
                dev_consume_skb_any(skb);
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(!in_softirq());

        if (!skb_unref(skb))
                return;

        /* from here on the skb is going to be freed */
        trace_consume_skb(skb, __builtin_return_address(0));

        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                skb_release_all(skb, SKB_CONSUMED);
                napi_skb_cache_put(skb);
                return;
        }

        /* fclone members cannot go into the NAPI cache */
        __kfree_skb(skb);
}
EXPORT_SYMBOL(napi_consume_skb);

/* Make sure a field is contained by headers group: the build fails unless
 * the field sits at the same offset whether it is named directly or
 * through the headers member of struct sk_buff.
 */
#define CHECK_SKB_FIELD(field) \
        BUILD_BUG_ON(offsetof(struct sk_buff, field) !=                \
                     offsetof(struct sk_buff, headers.field));        \

/* Copy the per-packet metadata of @old into @new.
 *
 * tstamp, dev, the control buffer, dst, extensions, netfilter state and
 * queue_mapping are copied individually; everything in the headers group
 * is copied with one memcpy(). old->sk is deliberately not copied. The
 * CHECK_SKB_FIELD() lines are build-time checks that each listed field
 * is covered by the headers group.
 */
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
        new->tstamp                = old->tstamp;
        /* We do not copy old->sk */
        new->dev                = old->dev;
        memcpy(new->cb, old->cb, sizeof(old->cb));
        skb_dst_copy(new, old);
        __skb_ext_copy(new, old);
        __nf_copy(new, old, false);

        /* Note : this field could be in the headers group.
         * It is not yet because we do not want to have a 16 bit hole
         */
        new->queue_mapping = old->queue_mapping;

        memcpy(&new->headers, &old->headers, sizeof(new->headers));
        CHECK_SKB_FIELD(protocol);
        CHECK_SKB_FIELD(csum);
        CHECK_SKB_FIELD(hash);
        CHECK_SKB_FIELD(priority);
        CHECK_SKB_FIELD(skb_iif);
        CHECK_SKB_FIELD(vlan_proto);
        CHECK_SKB_FIELD(vlan_tci);
        CHECK_SKB_FIELD(transport_header);
        CHECK_SKB_FIELD(network_header);
        CHECK_SKB_FIELD(mac_header);
        CHECK_SKB_FIELD(inner_protocol);
        CHECK_SKB_FIELD(inner_transport_header);
        CHECK_SKB_FIELD(inner_network_header);
        CHECK_SKB_FIELD(inner_mac_header);
        CHECK_SKB_FIELD(mark);
#ifdef CONFIG_NETWORK_SECMARK
        CHECK_SKB_FIELD(secmark);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
        CHECK_SKB_FIELD(napi_id);
#endif
        CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
        CHECK_SKB_FIELD(sender_cpu);
#endif
#ifdef CONFIG_NET_SCHED
        CHECK_SKB_FIELD(tc_index);
#endif

}

/*
 * Turn @n into a clone of @skb: @n receives the header metadata and the
 * same head/data/tail/end pointers, so both share one data area. The
 * shared dataref is incremented, both skbs are marked cloned, and @n is
 * returned with a user count of 1, no socket and no destructor.
 *
 * You should not add any new code to this function.  Add it to
 * __copy_skb_header above instead.
 */
static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
{
/* C(x): copy field x from skb to n */
#define C(x) n->x = skb->x

        n->next = n->prev = NULL;
        n->sk = NULL;
        __copy_skb_header(n, skb);

        C(len);
        C(data_len);
        C(mac_len);
        n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
        n->cloned = 1;
        n->nohdr = 0;
        n->peeked = 0;
        C(pfmemalloc);
        C(pp_recycle);
        n->destructor = NULL;
        C(tail);
        C(end);
        C(head);
        C(head_frag);
        C(data);
        C(truesize);
        refcount_set(&n->users, 1);

        /* one more holder of the shared data area */
        atomic_inc(&(skb_shinfo(skb)->dataref));
        skb->cloned = 1;

        return n;
#undef C
}

/**
 * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
 * @first: first sk_buff of the msg
 *
 * Allocates a zero-sized sk_buff with GFP_ATOMIC, attaches @first as its
 * frag_list and takes len, truesize and header metadata from @first. All
 * of the length is accounted as data_len. The new skb has no destructor.
 *
 * Return: the new sk_buff, or NULL if the allocation failed.
 */
struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
{
        struct sk_buff *wrapper = alloc_skb(0, GFP_ATOMIC);

        if (!wrapper)
                return NULL;

        /* every byte lives in the frag list, none in the linear area */
        wrapper->len = first->len;
        wrapper->data_len = first->len;
        wrapper->truesize = first->truesize;

        skb_shinfo(wrapper)->frag_list = first;

        __copy_skb_header(wrapper, first);
        wrapper->destructor = NULL;

        return wrapper;
}
EXPORT_SYMBOL_GPL(alloc_skb_for_msg);

/**
 *        skb_morph        -        morph one skb into another
 *        @dst: the skb to receive the contents
 *        @src: the skb to supply the contents
 *
 *        This is identical to skb_clone except that the target skb is
 *        supplied by the user. Whatever @dst currently holds is released
 *        first (with SKB_CONSUMED), then @dst is made a clone of @src.
 *
 *        The target skb is returned upon exit.
 */
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
        skb_release_all(dst, SKB_CONSUMED);
        return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);

/* Charge the pages needed for @size bytes to a user's locked_vm counter
 * and record the charge in @mmp.
 *
 * Nothing is charged and 0 is returned when the caller has CAP_IPC_LOCK,
 * @size is zero, or RLIMIT_MEMLOCK is RLIM_INFINITY. The user charged is
 * mmp->user if already set, else current_user(); in the latter case a
 * reference on the user is taken and stored in @mmp.
 *
 * Return: 0 on success, -ENOBUFS if the charge would exceed the limit.
 */
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
        unsigned long limit_pg, charge_pg, want_pg, cur_pg, rlim;
        struct user_struct *user;

        if (capable(CAP_IPC_LOCK) || !size)
                return 0;

        rlim = rlimit(RLIMIT_MEMLOCK);
        if (rlim == RLIM_INFINITY)
                return 0;

        charge_pg = (size >> PAGE_SHIFT) + 2;        /* worst case */
        limit_pg = rlim >> PAGE_SHIFT;

        user = mmp->user;
        if (!user)
                user = current_user();

        /* retry until locked_vm is updated without a concurrent change */
        cur_pg = atomic_long_read(&user->locked_vm);
        do {
                want_pg = cur_pg + charge_pg;
                if (want_pg > limit_pg)
                        return -ENOBUFS;
        } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pg, want_pg));

        if (mmp->user) {
                mmp->num_pg += charge_pg;
                return 0;
        }

        /* first charge recorded in this mmpin: pin the user as well */
        mmp->user = get_uid(user);
        mmp->num_pg = charge_pg;
        return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

/* Undo the charge recorded in @mmp: subtract mmp->num_pg from the user's
 * locked_vm counter and drop the user reference. Does nothing when no
 * user is recorded.
 */
void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
        if (!mmp->user)
                return;

        atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
        free_uid(mmp->user);
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

/* Allocate a fresh zerocopy notification for @size bytes sent on @sk.
 *
 * The ubuf_info_msgzc lives in the control buffer (skb->cb) of an skb
 * obtained from sock_omalloc(). The pinned pages for @size are accounted
 * first; on failure the skb is freed and NULL is returned. On success the
 * notification is given the next sk_zckey value as id, a length of 1, a
 * refcount of 1, and a reference on @sk is taken.
 *
 * Return: pointer to the embedded ubuf_info, or NULL on failure.
 */
static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
{
        struct ubuf_info_msgzc *uarg;
        struct sk_buff *skb;

        WARN_ON_ONCE(!in_task());

        skb = sock_omalloc(sk, 0, GFP_KERNEL);
        if (!skb)
                return NULL;

        /* the notification state must fit in the skb control buffer */
        BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb));
        uarg = (void *)skb->cb;
        /* mm_account_pinned_pages() tests mmp.user, so clear it first */
        uarg->mmp.user = NULL;

        if (mm_account_pinned_pages(&uarg->mmp, size)) {
                kfree_skb(skb);
                return NULL;
        }

        uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
        /* id is the value sk_zckey held before the increment */
        uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
        uarg->len = 1;
        uarg->bytelen = size;
        uarg->zerocopy = 1;
        uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
        refcount_set(&uarg->ubuf.refcnt, 1);
        sock_hold(sk);

        return &uarg->ubuf;
}

/* Map a ubuf_info_msgzc back to the sk_buff whose control buffer (cb)
 * holds it.
 */
static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
        void *cb_area = uarg;

        return container_of(cb_area, struct sk_buff, cb);
}

/* Get a zerocopy notification covering @size more bytes on @sk.
 *
 * @sk: sending socket
 * @size: number of bytes being added
 * @uarg: existing notification to try to extend, or NULL
 *
 * When @uarg is a MSG_ZEROCOPY notification whose id range ends exactly
 * at the socket's current sk_zckey, it is extended in place (len and
 * bytelen grow, sk_zckey advances) and returned. When @uarg is NULL, or
 * its range is not contiguous with sk_zckey, a new notification is
 * allocated. When @uarg is full (len or byte limit reached), a new one is
 * allocated only for SOCK_STREAM sockets.
 *
 * Return: the notification to use, or NULL on failure.
 */
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
                                       struct ubuf_info *uarg)
{
        if (uarg) {
                struct ubuf_info_msgzc *uarg_zc;
                const u32 byte_limit = 1 << 19;                /* limit to a few TSO */
                u32 bytelen, next;

                /* there might be non MSG_ZEROCOPY users */
                if (uarg->ops != &msg_zerocopy_ubuf_ops)
                        return NULL;

                /* realloc only when socket is locked (TCP, UDP cork),
                 * so uarg->len and sk_zckey access is serialized
                 */
                if (!sock_owned_by_user(sk)) {
                        WARN_ON_ONCE(1);
                        return NULL;
                }

                uarg_zc = uarg_to_msgzc(uarg);
                bytelen = uarg_zc->bytelen + size;
                if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
                        /* TCP can create new skb to attach new uarg */
                        if (sk->sk_type == SOCK_STREAM)
                                goto new_alloc;
                        return NULL;
                }

                /* extend only if this uarg owns the most recent ids */
                next = (u32)atomic_read(&sk->sk_zckey);
                if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
                        if (mm_account_pinned_pages(&uarg_zc->mmp, size))
                                return NULL;
                        uarg_zc->len++;
                        uarg_zc->bytelen = bytelen;
                        atomic_set(&sk->sk_zckey, ++next);

                        /* no extra ref when appending to datagram (MSG_MORE) */
                        if (sk->sk_type == SOCK_STREAM)
                                net_zcopy_get(uarg);

                        return uarg;
                }
        }

new_alloc:
        return msg_zerocopy_alloc(sk, size);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);

/* Try to fold the id range starting at @lo and spanning @len ids into the
 * zerocopy notification already stored in @skb.
 *
 * The stored range is [ee_info, ee_data]. The merge is refused when the
 * combined length would not fit in 32 bits or when @lo does not directly
 * follow the stored upper bound. On success ee_data grows by @len.
 *
 * Return: true if the range was merged, false otherwise.
 */
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        u32 range_lo = serr->ee.ee_info;
        u32 range_hi = serr->ee.ee_data;
        u64 merged_len = range_hi - range_lo + 1ULL + len;

        if (merged_len >= (1ULL << 32) || lo != range_hi + 1)
                return false;

        serr->ee.ee_data += len;
        return true;
}

/* Final completion of a zerocopy notification.
 *
 * Unaccounts the pinned pages, then, unless the notification is empty or
 * the socket is dead, fills in a SO_EE_ORIGIN_ZEROCOPY extended error
 * describing the id range [lo, hi] and either merges it into the
 * notification at the tail of the socket error queue or queues the skb
 * that carries @uarg. Finally the skb reference (if the skb was not
 * queued) and the socket reference are dropped.
 */
static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
        struct sk_buff *tail, *skb = skb_from_uarg(uarg);
        struct sock_exterr_skb *serr;
        struct sock *sk = skb->sk;
        struct sk_buff_head *q;
        unsigned long flags;
        bool is_zerocopy;
        u32 lo, hi;
        u16 len;

        mm_unaccount_pinned_pages(&uarg->mmp);

        /* if !len, there was only 1 call, and it was aborted
         * so do not queue a completion notification
         */
        if (!uarg->len || sock_flag(sk, SOCK_DEAD))
                goto release;

        /* read everything needed from uarg before the memset() below:
         * serr is also derived from skb
         */
        len = uarg->len;
        lo = uarg->id;
        hi = uarg->id + len - 1;
        is_zerocopy = uarg->zerocopy;

        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = 0;
        serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
        serr->ee.ee_data = hi;
        serr->ee.ee_info = lo;
        if (!is_zerocopy)
                serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;

        q = &sk->sk_error_queue;
        spin_lock_irqsave(&q->lock, flags);
        tail = skb_peek_tail(q);
        /* queue this skb unless its range could be merged into the tail */
        if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
            !skb_zerocopy_notify_extend(tail, lo, len)) {
                __skb_queue_tail(q, skb);
                /* the queue owns the skb now; do not consume it below */
                skb = NULL;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        sk_error_report(sk);

release:
        consume_skb(skb);
        sock_put(sk);
}

/* ->complete callback of msg_zerocopy_ubuf_ops.
 *
 * Records a failed completion by clearing the zerocopy flag when
 * @success is false, then drops one reference on @uarg and runs the
 * final callback once the last reference is gone. @skb is not used.
 */
static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
                                  bool success)
{
        struct ubuf_info_msgzc *zc = uarg_to_msgzc(uarg);

        zc->zerocopy &= success;

        if (!refcount_dec_and_test(&uarg->refcnt))
                return;

        __msg_zerocopy_callback(zc);
}

/* Back out one zerocopy request: decrement the socket's sk_zckey and the
 * notification's len. When @have_uref is true the caller's reference on
 * @uarg is also released through msg_zerocopy_complete().
 */
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
        struct ubuf_info_msgzc *zc = uarg_to_msgzc(uarg);
        struct sock *sk = skb_from_uarg(zc)->sk;

        atomic_dec(&sk->sk_zckey);
        zc->len--;

        if (!have_uref)
                return;

        msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);

/* ubuf_info operations for MSG_ZEROCOPY notifications. Only ->complete is
 * provided; the address of this table is also what identifies a ubuf_info
 * as a MSG_ZEROCOPY one (see the ops comparison in msg_zerocopy_realloc()).
 */
const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
        .complete = msg_zerocopy_complete,
};
EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);

/* Append up to @len bytes from @msg to @skb as zerocopy frags and attach
 * @uarg to the skb.
 *
 * @sk: socket the data is sent on
 * @skb: stream skb to extend
 * @msg: message whose msg_iter supplies the data
 * @len: number of bytes to append
 * @uarg: zerocopy notification to associate with @skb
 *
 * Return: number of bytes added to @skb on success; a negative error
 * from ->link_skb(), -EEXIST if @skb already points to a different uarg,
 * or the error from __zerocopy_sg_from_iter() after @skb and the iterator
 * have been restored to their previous state.
 */
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
                             struct msghdr *msg, int len,
                             struct ubuf_info *uarg)
{
        int err, orig_len = skb->len;

        if (uarg->ops->link_skb) {
                err = uarg->ops->link_skb(skb, uarg);
                if (err)
                        return err;
        } else {
                struct ubuf_info *orig_uarg = skb_zcopy(skb);

                /* An skb can only point to one uarg. This edge case happens
                 * when TCP appends to an skb, but zerocopy_realloc triggered
                 * a new alloc.
                 */
                if (orig_uarg && uarg != orig_uarg)
                        return -EEXIST;
        }

        err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
        /* -EMSGSIZE with some bytes already appended is not rolled back */
        if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
                struct sock *save_sk = skb->sk;

                /* Streams do not free skb on error. Reset to prev state. */
                iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
                /* skb->sk is pointed at @sk only for the duration of the trim */
                skb->sk = sk;
                ___pskb_trim(skb, orig_len);
                skb->sk = save_sk;
                return err;
        }

        skb_zcopy_set(skb, uarg, NULL);
        return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);

void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
{
        int i;

        skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                skb_frag_ref(skb, i);
}
EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);

/* Propagate the zerocopy state of @orig to @nskb.
 *
 * Nothing is done when @orig is not zerocopy. If @nskb already carries a
 * different uarg, its user frags are first copied into kernel memory so
 * that it can take over @orig's uarg.
 *
 * Return: 0 on success, -ENOMEM when @nskb is already zerocopy but the
 * caller passed a zero @gfp_mask, -EIO when copying the user frags fails.
 */
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
                              gfp_t gfp_mask)
{
        if (!skb_zcopy(orig))
                return 0;

        if (skb_zcopy(nskb)) {
                /* !gfp_mask callers are verified to !skb_zcopy(nskb) */
                if (!gfp_mask) {
                        WARN_ON_ONCE(1);
                        return -ENOMEM;
                }

                /* already sharing the same notification */
                if (skb_uarg(nskb) == skb_uarg(orig))
                        return 0;

                if (skb_copy_ubufs(nskb, GFP_ATOMIC))
                        return -EIO;
        }

        skb_zcopy_set(nskb, skb_uarg(orig), NULL);
        return 0;
}

/**
 *        skb_copy_ubufs        -        copy userspace skb frags buffers to kernel
 *        @skb: the skb to modify
 *        @gfp_mask: allocation priority
 *
 *        This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
 *        It will copy all frags into kernel and drop the reference
 *        to userspace pages.
 *
 *        If this function is called from an interrupt gfp_mask() must be
 *        %GFP_ATOMIC.
 *
 *        Returns 0 on success or a negative error code on failure:
 *        -EINVAL if the skb is shared or cannot be uncloned, -EFAULT if
 *        its frags are not readable, -ENOMEM if kernel pages to copy to
 *        cannot be allocated.
 */
int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
        int num_frags = skb_shinfo(skb)->nr_frags;
        struct page *page, *head = NULL;
        int i, order, psize, new_frags;
        u32 d_off;

        if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
                return -EINVAL;

        if (!skb_frags_readable(skb))
                return -EFAULT;

        /* nothing to copy: only the zerocopy state needs clearing */
        if (!num_frags)
                goto release;

        /* We might have to allocate high order pages, so compute what minimum
         * page order is needed.
         */
        order = 0;
        while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
                order++;
        psize = (PAGE_SIZE << order);

        /* number of psize-sized pages needed, rounded up */
        new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
        for (i = 0; i < new_frags; i++) {
                page = alloc_pages(gfp_mask | __GFP_COMP, order);
                if (!page) {
                        /* unwind: release every page chained so far */
                        while (head) {
                                struct page *next = (struct page *)page_private(head);
                                put_page(head);
                                head = next;
                        }
                        return -ENOMEM;
                }
                /* chain the new pages through their page_private field */
                set_page_private(page, (unsigned long)head);
                head = page;
        }

        /* page/d_off track the current write position in the new chain */
        page = head;
        d_off = 0;
        for (i = 0; i < num_frags; i++) {
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];
                u32 p_off, p_len, copied;
                struct page *p;
                u8 *vaddr;

                skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
                                      p, p_off, p_len, copied) {
                        u32 copy, done = 0;
                        vaddr = kmap_atomic(p);

                        while (done < p_len) {
                                /* destination page full: move to the next one */
                                if (d_off == psize) {
                                        d_off = 0;
                                        page = (struct page *)page_private(page);
                                }
                                copy = min_t(u32, psize - d_off, p_len - done);
                                memcpy(page_address(page) + d_off,
                                       vaddr + p_off + done, copy);
                                done += copy;
                                d_off += copy;
                        }
                        kunmap_atomic(vaddr);
                }
        }

        /* skb frags release userspace buffers */
        for (i = 0; i < num_frags; i++)
                skb_frag_unref(skb, i);

        /* skb frags point to kernel buffers */
        for (i = 0; i < new_frags - 1; i++) {
                __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
                head = (struct page *)page_private(head);
        }
        /* the last page holds only d_off bytes */
        __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
                               d_off);
        skb_shinfo(skb)->nr_frags = new_frags;

release:
        skb_zcopy_clear(skb, false);
        return 0;
}
EXPORT_SYMBOL_GPL(skb_copy_ubufs);

/**
 *        skb_clone        -        duplicate an sk_buff
 *        @skb: buffer to clone
 *        @gfp_mask: allocation priority
 *
 *        Duplicate an &sk_buff. The new one is not owned by a socket. Both
 *        copies share the same packet data but not structure. The new
 *        buffer has a reference count of 1. If the allocation fails the
 *        function returns %NULL otherwise the new buffer is returned.
 *        %NULL is also returned when skb_orphan_frags() fails.
 *
 *        If this function is called from an interrupt gfp_mask() must be
 *        %GFP_ATOMIC.
 */

struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
{
        /* only dereferenced below when skb->fclone == SKB_FCLONE_ORIG */
        struct sk_buff_fclones *fclones = container_of(skb,
                                                       struct sk_buff_fclones,
                                                       skb1);
        struct sk_buff *n;

        if (skb_orphan_frags(skb, gfp_mask))
                return NULL;

        if (skb->fclone == SKB_FCLONE_ORIG &&
            refcount_read(&fclones->fclone_ref) == 1) {
                /* use the companion skb2 slot instead of allocating */
                n = &fclones->skb2;
                refcount_set(&fclones->fclone_ref, 2);
                n->fclone = SKB_FCLONE_CLONE;
        } else {
                if (skb_pfmemalloc(skb))
                        gfp_mask |= __GFP_MEMALLOC;

                n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
                if (!n)
                        return NULL;

                n->fclone = SKB_FCLONE_UNAVAILABLE;
        }

        return __skb_clone(n, skb);
}
EXPORT_SYMBOL(skb_clone);

/* Shift the header offsets stored in @skb by @off bytes.
 *
 * csum_start is adjusted only for CHECKSUM_PARTIAL, and mac_header only
 * if it has been set; the transport, network and all inner header
 * offsets are adjusted unconditionally.
 */
void skb_headers_offset_update(struct sk_buff *skb, int off)
{
        /* Only adjust this if it actually is csum_start rather than csum */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                skb->csum_start += off;
        /* {transport,network,mac}_header and tail are relative to skb->head */
        skb->transport_header += off;
        skb->network_header   += off;
        if (skb_mac_header_was_set(skb))
                skb->mac_header += off;
        skb->inner_transport_header += off;
        skb->inner_network_header += off;
        skb->inner_mac_header += off;
}
EXPORT_SYMBOL(skb_headers_offset_update);

void skb_copy_header(struct sk_buff *new, const struct sk_buff *old)
{
        __copy_skb_header(new, old);

        skb_shinfo(new)->gso_size = skb_shinfo(old)->gso_size;
        skb_shinfo(new)->gso_segs = skb_shinfo(old)->gso_segs;
        skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
}
EXPORT_SYMBOL(skb_copy_header);

/* Allocation flag for a copy of @skb: SKB_ALLOC_RX when the source is a
 * pfmemalloc skb, otherwise no flag at all.
 */
static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
{
        return skb_pfmemalloc(skb) ? SKB_ALLOC_RX : 0;
}

/**
 *        skb_copy        -        create private copy of an sk_buff
 *        @skb: buffer to copy
 *        @gfp_mask: allocation priority
 *
 *        Duplicate both the &sk_buff and all of its data, for callers that
 *        intend to modify the data and therefore need a private copy.
 *        Returns the new buffer on success or %NULL on failure; %NULL is
 *        also returned when the frags of @skb are not readable or when
 *        @skb carries the %SKB_GSO_FRAGLIST gso type (with a one-time
 *        warning). The returned buffer has a reference count of 1.
 *
 *        As a by-product, a non-linear &sk_buff is converted to a linear
 *        one, so the result is completely private and every byte of it may
 *        be modified. This makes the function a poor choice when only the
 *        header is going to be modified; use pskb_copy() in that case.
 */

struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
        struct sk_buff *copy;
        int headroom;

        if (!skb_frags_readable(skb))
                return NULL;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        headroom = skb_headroom(skb);

        /* Room for the whole original head area plus the paged data. */
        copy = __alloc_skb(skb_end_offset(skb) + skb->data_len, gfp_mask,
                           skb_alloc_rx_flag(skb), NUMA_NO_NODE);
        if (!copy)
                return NULL;

        /* Reproduce the original headroom, then claim skb->len bytes. */
        skb_reserve(copy, headroom);
        skb_put(copy, skb->len);

        /* The negative offset makes the copy start at the source head, so
         * the headroom contents are duplicated together with the data.
         */
        BUG_ON(skb_copy_bits(skb, -headroom, copy->head, headroom + skb->len));

        skb_copy_header(copy, skb);
        return copy;
}
EXPORT_SYMBOL(skb_copy);

/**
 *        __pskb_copy_fclone        -  create copy of an sk_buff with private head.
 *        @skb: buffer to copy
 *        @headroom: headroom of new skb
 *        @gfp_mask: allocation priority
 *        @fclone: if true allocate the copy of the skb from the fclone
 *        cache instead of the head cache; it is recommended to set this
 *        to true for the cases where the copy will likely be cloned
 *
 *        Duplicate the &sk_buff and the part of its data that lives in the
 *        linear header; page frags and the frag list stay shared with the
 *        original (extra references are taken on them). Intended for
 *        callers that only modify the header and need a private copy of
 *        it. Returns the new buffer on success or %NULL on failure.
 *        The returned buffer has a reference count of 1.
 */

struct sk_buff *__pskb_copy_fclone(struct sk_buff *skb, int headroom,
                                   gfp_t gfp_mask, bool fclone)
{
        unsigned int linear_len = skb_headlen(skb);
        int alloc_flags = skb_alloc_rx_flag(skb);
        struct sk_buff *copy;
        int i;

        if (fclone)
                alloc_flags |= SKB_ALLOC_FCLONE;

        copy = __alloc_skb(linear_len + headroom, gfp_mask, alloc_flags,
                           NUMA_NO_NODE);
        if (!copy)
                return NULL;

        /* Lay out the linear area: requested headroom, then the header. */
        skb_reserve(copy, headroom);
        skb_put(copy, linear_len);
        skb_copy_from_linear_data(skb, copy->data, copy->len);

        /* Account for the non-linear data that is shared, not copied. */
        copy->truesize += skb->data_len;
        copy->data_len = skb->data_len;
        copy->len = skb->len;

        if (skb_shinfo(skb)->nr_frags) {
                if (skb_orphan_frags(skb, gfp_mask) ||
                    skb_zerocopy_clone(copy, skb, gfp_mask)) {
                        kfree_skb(copy);
                        return NULL;
                }

                /* Share each page frag, taking one reference per frag. */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                        skb_shinfo(copy)->frags[i] = skb_shinfo(skb)->frags[i];
                        skb_frag_ref(skb, i);
                }
                skb_shinfo(copy)->nr_frags = i;
        }

        /* Share the frag list as well. */
        if (skb_has_frag_list(skb)) {
                skb_shinfo(copy)->frag_list = skb_shinfo(skb)->frag_list;
                skb_clone_fraglist(copy);
        }

        skb_copy_header(copy, skb);
        return copy;
}
EXPORT_SYMBOL(__pskb_copy_fclone);

/**
 *        pskb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @nhead: room to add at head
 *        @ntail: room to add at tail
 *        @gfp_mask: allocation priority
 *
 *        Expands (or creates identical copy, if @nhead and @ntail are zero)
 *        header of @skb. &sk_buff itself is not changed. &sk_buff MUST have
 *        reference count of 1. Returns zero on success, or -%ENOMEM if the
 *        new head could not be allocated or the frags of a cloned skb could
 *        not be orphaned. In the failure case, &sk_buff is not changed.
 *
 *        Triggers BUG() if @nhead is negative or if @skb is shared.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
                     gfp_t gfp_mask)
{
        unsigned int osize = skb_end_offset(skb);
        unsigned int size = osize + nhead + ntail;
        long off;
        u8 *data;
        int i;

        BUG_ON(nhead < 0);

        BUG_ON(skb_shared(skb));

        skb_zcopy_downgrade_managed(skb);

        /* Propagate the pfmemalloc status of the skb to the allocation. */
        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        /* NOTE(review): size is passed by pointer, so kmalloc_reserve()
         * presumably updates it to the size actually obtained - confirm.
         * SKB_WITH_OVERHEAD() then turns it into the new end offset.
         */
        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                goto nodata;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy only real data... and, alas, header. This should be
         * optimized for the cases when header is void.
         */
        memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head);

        /* Copy the shared info up to and including the frags in use. */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags]));

        /*
         * if shinfo is shared we must drop the old head gracefully, but if it
         * is not we can just drop the old head and let the existing refcount
         * be since all we did is relocate the values
         */
        if (skb_cloned(skb)) {
                if (skb_orphan_frags(skb, gfp_mask))
                        goto nofrags;
                /* The copied shinfo needs references of its own on the
                 * zerocopy state, the page frags and the frag list before
                 * the old data is released.
                 */
                if (skb_zcopy(skb))
                        refcount_inc(&skb_uarg(skb)->refcnt);
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);

                skb_release_data(skb, SKB_CONSUMED);
        } else {
                skb_free_head(skb);
        }
        /* Distance from the old head to where its contents now start. */
        off = (data + nhead) - skb->head;

        skb->head     = data;
        skb->head_frag = 0;
        skb->data    += off;

        skb_set_end_offset(skb, size);
        /* With offset-based tail/header fields only @nhead has to be added. */
#ifdef NET_SKBUFF_DATA_USES_OFFSET
        off           = nhead;
#endif
        skb->tail              += off;
        skb_headers_offset_update(skb, nhead);
        /* The skb now owns a private, unshared head. */
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        skb_metadata_clear(skb);

        /* It is not generally safe to change skb->truesize.
         * For the moment, we really care of rx path, or
         * when skb is orphaned (not attached to a socket).
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb->truesize += size - osize;

        return 0;

        /* Error unwinding: free the new head if it was already allocated. */
nofrags:
        skb_kfree_head(data, size);
nodata:
        return -ENOMEM;
}
EXPORT_SYMBOL(pskb_expand_head);

/* Make a private copy of @skb with a writable head and at least @headroom
 * bytes of headroom. Returns the copy, or NULL if an allocation failed;
 * @skb itself is left untouched.
 */

struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
{
        int shortfall = headroom - skb_headroom(skb);
        struct sk_buff *copy;

        /* Already enough headroom: a plain header copy is sufficient. */
        if (shortfall <= 0)
                return pskb_copy(skb, GFP_ATOMIC);

        /* Otherwise clone, then grow the head of the clone. */
        copy = skb_clone(skb, GFP_ATOMIC);
        if (!copy)
                return NULL;

        if (pskb_expand_head(copy, SKB_DATA_ALIGN(shortfall), 0, GFP_ATOMIC)) {
                kfree_skb(copy);
                return NULL;
        }

        return copy;
}
EXPORT_SYMBOL(skb_realloc_headroom);

/* Note: We plan to rework this in linux-6.4 */
/* Give @skb a private head through pskb_expand_head(skb, 0, 0, @pri) while
 * restoring the truesize and, where possible, the end offset it had before.
 * Returns 0 on success or the error reported by pskb_expand_head().
 */
int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
{
        unsigned int old_end = skb_end_offset(skb);
        unsigned int old_truesize = skb->truesize;
        struct skb_shared_info *shinfo;
        unsigned int new_end;
        int err;

        err = pskb_expand_head(skb, 0, 0, pri);
        if (err)
                return err;

        skb->truesize = old_truesize;

        new_end = skb_end_offset(skb);
        if (likely(new_end == old_end))
                return 0;

        /* We can not change skb->end if the original or new value
         * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
         */
        if (old_end == SKB_SMALL_HEAD_HEADROOM ||
            new_end == SKB_SMALL_HEAD_HEADROOM) {
                /* We think this path should not be taken.
                 * Add a temporary trace to warn us just in case.
                 */
                pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
                            old_end, new_end);
                WARN_ON_ONCE(1);
                return 0;
        }

        /* skb->end is about to be moved back, so the shared info has to be
         * relocated to the position the old end offset designates.
         */
        shinfo = skb_shinfo(skb);
        memmove(skb->head + old_end, shinfo,
                offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));

        skb_set_end_offset(skb, old_end);

        return 0;
}

/**
 *        skb_expand_head - reallocate header of &sk_buff
 *        @skb: buffer to reallocate
 *        @headroom: needed headroom
 *
 *        Unlike skb_realloc_headroom(), this one does not allocate a new skb
 *        if possible; it copies skb->sk to the new skb as needed and frees
 *        the original skb in case of failure.
 *
 *        It expects an increase in the headroom and generates a warning
 *        otherwise, returning @skb unchanged. Returns the (possibly new)
 *        skb on success or %NULL on failure.
 */

struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
{
        int needed = headroom - skb_headroom(skb);
        int old_end = skb_end_offset(skb);
        struct sock *sk = skb->sk;

        if (WARN_ONCE(needed <= 0,
                      "%s is expecting an increase in the headroom", __func__))
                return skb;

        needed = SKB_DATA_ALIGN(needed);

        /* pskb_expand_head() might crash, if skb is shared. */
        if (skb_shared(skb) || !is_skb_wmem(skb)) {
                struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);

                if (unlikely(!clone)) {
                        kfree_skb(skb);
                        return NULL;
                }

                /* Hand socket ownership to the clone, drop the original. */
                if (sk)
                        skb_set_owner_w(clone, sk);
                consume_skb(skb);
                skb = clone;
        }

        if (pskb_expand_head(skb, needed, 0, GFP_ATOMIC)) {
                kfree_skb(skb);
                return NULL;
        }

        /* Charge the growth of the head to the owning socket. */
        if (sk && is_skb_wmem(skb)) {
                int grown = skb_end_offset(skb) - old_end;

                refcount_add(grown, &sk->sk_wmem_alloc);
                skb->truesize += grown;
        }

        return skb;
}
EXPORT_SYMBOL(skb_expand_head);

/**
 *        skb_copy_expand        -        copy and expand sk_buff
 *        @skb: buffer to copy
 *        @newheadroom: new free bytes at head
 *        @newtailroom: new free bytes at tail
 *        @gfp_mask: allocation priority
 *
 *        Duplicate both the &sk_buff and its data, allocating additional
 *        head and tail space in the process.
 *
 *        Intended for callers that want to modify the data and need a
 *        private copy of it together with more room for new fields.
 *        Returns the new buffer on success or %NULL on failure; %NULL is
 *        also returned when the frags of @skb are not readable or when
 *        @skb carries the %SKB_GSO_FRAGLIST gso type (with a one-time
 *        warning). The returned buffer has a reference count of 1.
 *
 *        You must pass %GFP_ATOMIC as the allocation priority if this function
 *        is called from an interrupt.
 */
struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
                                int newheadroom, int newtailroom,
                                gfp_t gfp_mask)
{
        int head_copy_len, head_copy_off;
        struct sk_buff *copy;
        int oldheadroom;

        if (!skb_frags_readable(skb))
                return NULL;

        if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
                return NULL;

        oldheadroom = skb_headroom(skb);

        /* Allocate the copy buffer. */
        copy = __alloc_skb(newheadroom + skb->len + newtailroom,
                           gfp_mask, skb_alloc_rx_flag(skb),
                           NUMA_NO_NODE);
        if (!copy)
                return NULL;

        skb_reserve(copy, newheadroom);

        /* Set the tail pointer and length. */
        skb_put(copy, skb->len);

        /* Copy as much of the old headroom as fits into the new one; when
         * the new headroom is larger, the copy lands past the extra space.
         */
        if (newheadroom <= oldheadroom) {
                head_copy_len = newheadroom;
                head_copy_off = 0;
        } else {
                head_copy_len = oldheadroom;
                head_copy_off = newheadroom - oldheadroom;
        }

        /* Copy the linear header and data. */
        BUG_ON(skb_copy_bits(skb, -head_copy_len, copy->head + head_copy_off,
                             skb->len + head_copy_len));

        skb_copy_header(copy, skb);

        /* Header offsets came from @skb; shift them by the headroom change. */
        skb_headers_offset_update(copy, newheadroom - oldheadroom);

        return copy;
}
EXPORT_SYMBOL(skb_copy_expand);

/**
 *        __skb_pad                -        zero pad the tail of an skb
 *        @skb: buffer to pad
 *        @pad: space to pad
 *        @free_on_error: free buffer on error
 *
 *        Make sure the buffer is followed by @pad zero-filled bytes. Used
 *        by network drivers which may DMA or transfer data beyond the
 *        buffer end onto the wire.
 *
 *        Returns 0 on success. May return an error in out of memory cases;
 *        the skb is freed on error if @free_on_error is true.
 */

int __skb_pad(struct sk_buff *skb, int pad, bool free_on_error)
{
        int ntail;
        int err = 0;

        /* Fast path: private skb with enough tailroom already.
         * (If the skbuff is non linear, tailroom is always zero.)
         */
        if (!skb_cloned(skb) && skb_tailroom(skb) >= pad) {
                memset(skb->data + skb->len, 0, pad);
                return 0;
        }

        /* Extra tail space required on top of what the head offers now. */
        ntail = skb->data_len + pad - (skb->end - skb->tail);
        if (likely(skb_cloned(skb) || ntail > 0))
                err = pskb_expand_head(skb, 0, ntail, GFP_ATOMIC);

        /* FIXME: The use of this function with non-linear skb's really needs
         * to be audited.
         */
        if (likely(!err))
                err = skb_linearize(skb);

        if (unlikely(err)) {
                if (free_on_error)
                        kfree_skb(skb);
                return err;
        }

        memset(skb->data + skb->len, 0, pad);
        return 0;
}
EXPORT_SYMBOL(__skb_pad);

/**
 *        pskb_put - add data to the tail of a potentially fragmented buffer
 *        @skb: start of the buffer to use
 *        @tail: tail fragment of the buffer to use
 *        @len: amount of data to add
 *
 *        Extend the used data area of a potentially fragmented buffer.
 *        @tail must be the last fragment of @skb -- or @skb itself. If this
 *        would exceed the total buffer size the kernel will panic. Returns
 *        a pointer to the first byte of the extra data.
 */

void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len)
{
        /* When the tail is a separate fragment, the new bytes count as
         * non-linear data of the head skb; skb_put() only updates @tail.
         */
        if (tail != skb) {
                skb->len += len;
                skb->data_len += len;
        }

        return skb_put(tail, len);
}
EXPORT_SYMBOL_GPL(pskb_put);

/**
 *        skb_put - add data to a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        Extend the used data area of the buffer by @len bytes at the tail.
 *        If this would exceed the total buffer size the kernel will panic.
 *        Returns a pointer to the first byte of the extra data.
 */
void *skb_put(struct sk_buff *skb, unsigned int len)
{
        /* The old tail is where the newly added bytes begin. */
        void *old_tail = skb_tail_pointer(skb);

        SKB_LINEAR_ASSERT(skb);

        skb->len += len;
        skb->tail += len;
        if (unlikely(skb->tail > skb->end))
                skb_over_panic(skb, len, __builtin_return_address(0));

        return old_tail;
}
EXPORT_SYMBOL(skb_put);

/**
 *        skb_push - add data to the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to add
 *
 *        Extend the used data area of the buffer by @len bytes at the
 *        buffer start. If this would exceed the total buffer headroom the
 *        kernel will panic. Returns a pointer to the first byte of the
 *        extra data.
 */
void *skb_push(struct sk_buff *skb, unsigned int len)
{
        skb->len += len;
        skb->data -= len;

        /* Moving data in front of head means the headroom was exceeded. */
        if (unlikely(skb->head > skb->data))
                skb_under_panic(skb, len, __builtin_return_address(0));

        return skb->data;
}
EXPORT_SYMBOL(skb_push);

/**
 *        skb_pull - remove data from the start of a buffer
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        Remove @len bytes from the start of a buffer, returning the memory
 *        to the headroom. Returns a pointer to the next data in the buffer.
 *        Once the data has been pulled, future pushes will overwrite the
 *        old data.
 *
 *        This is the out-of-line wrapper around skb_pull_inline().
 */
void *skb_pull(struct sk_buff *skb, unsigned int len)
{
        void *next_data = skb_pull_inline(skb, len);

        return next_data;
}
EXPORT_SYMBOL(skb_pull);

/**
 *        skb_pull_data - remove data from the start of a buffer returning its
 *        original position.
 *        @skb: buffer to use
 *        @len: amount of data to remove
 *
 *        Remove @len bytes from the start of a buffer, returning the memory
 *        to the headroom. Returns a pointer to the original start of the
 *        data, or %NULL (leaving the buffer untouched) when the buffer
 *        holds fewer than @len bytes. Once the data has been pulled, future
 *        pushes will overwrite the old data.
 */
void *skb_pull_data(struct sk_buff *skb, size_t len)
{
        void *start;

        /* Refuse to pull more than the buffer contains. */
        if (len > skb->len)
                return NULL;

        start = skb->data;
        skb_pull(skb, len);

        return start;
}
EXPORT_SYMBOL(skb_pull_data);

/**
 *        skb_trim - remove end from a buffer
 *        @skb: buffer to alter
 *        @len: new length
 *
 *        Shorten a buffer to @len bytes by removing data from the tail. A
 *        buffer that is already no longer than @len is left unmodified.
 *        The skb must be linear.
 */
void skb_trim(struct sk_buff *skb, unsigned int len)
{
        /* Nothing to cut when the buffer is already short enough. */
        if (skb->len <= len)
                return;

        __skb_trim(skb, len);
}
EXPORT_SYMBOL(skb_trim);

/* Trims skb to length len. It can change skb pointers.
 *
 * Walks the linear head, then the page frags, then the frag list to find
 * where @len ends, and drops everything beyond that point.
 *
 * Returns 0 on success, the error from pskb_expand_head() or pskb_trim()
 * when one of them fails, or -ENOMEM when a shared frag-list member could
 * not be cloned.
 */

int ___pskb_trim(struct sk_buff *skb, unsigned int len)
{
        struct sk_buff **fragp;
        struct sk_buff *frag;
        int offset = skb_headlen(skb);
        int nfrags = skb_shinfo(skb)->nr_frags;
        int i;
        int err;

        /* A cloned skb gets a private head before anything is modified. */
        if (skb_cloned(skb) &&
            unlikely((err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))))
                return err;

        /* The new length ends inside the linear head: jump straight into
         * the drop path with i == 0 so that every page frag is released.
         */
        i = 0;
        if (offset >= len)
                goto drop_pages;

        for (; i < nfrags; i++) {
                int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* This frag lies entirely before the cut point: keep it. */
                if (end < len) {
                        offset = end;
                        continue;
                }

                /* The cut point falls in this frag: shrink it; i then
                 * becomes the number of frags that are kept.
                 */
                skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);

drop_pages:
                skb_shinfo(skb)->nr_frags = i;

                /* Release every frag past the cut, then the frag list. */
                for (; i < nfrags; i++)
                        skb_frag_unref(skb, i);

                if (skb_has_frag_list(skb))
                        skb_drop_fraglist(skb);
                goto done;
        }

        /* All page frags are kept; the cut point lies in the frag list. */
        for (fragp = &skb_shinfo(skb)->frag_list; (frag = *fragp);
             fragp = &frag->next) {
                int end = offset + frag->len;

                /* A shared list member is replaced in the list by a clone
                 * before it is used any further.
                 */
                if (skb_shared(frag)) {
                        struct sk_buff *nfrag;

                        nfrag = skb_clone(frag, GFP_ATOMIC);
                        if (unlikely(!nfrag))
                                return -ENOMEM;

                        nfrag->next = frag->next;
                        consume_skb(frag);
                        frag = nfrag;
                        *fragp = frag;
                }

                if (end < len) {
                        offset = end;
                        continue;
                }

                /* Trim the member holding the cut point (unless the cut is
                 * exactly at its end) and drop the rest of the list.
                 */
                if (end > len &&
                    unlikely((err = pskb_trim(frag, len - offset))))
                        return err;

                if (frag->next)
                        skb_drop_list(&frag->next);
                break;
        }

done:
        /* Update the length accounting; when the new length fits in the
         * linear head there is no non-linear data left and the tail
         * pointer is set to @len.
         */
        if (len > skb_headlen(skb)) {
                skb->data_len -= skb->len - len;
                skb->len       = len;
        } else {
                skb->len       = len;
                skb->data_len  = 0;
                skb_set_tail_pointer(skb, len);
        }

        /* skb_condense() is called only when no socket is attached or the
         * destructor is sock_edemux.
         */
        if (!skb->sk || skb->destructor == sock_edemux)
                skb_condense(skb);
        return 0;
}
EXPORT_SYMBOL(___pskb_trim);

/* Note : use pskb_trim_rcsum() instead of calling this directly
 *
 * Trim @skb to @len bytes while keeping its checksum state consistent.
 * Returns the result of __pskb_trim(), or -EINVAL when, for
 * CHECKSUM_PARTIAL, the checksum field would not fit in the linear part
 * of the trimmed skb.
 */
int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
{
        switch (skb->ip_summed) {
        case CHECKSUM_COMPLETE: {
                /* Subtract the checksum of the bytes being cut off. */
                int delta = skb->len - len;

                skb->csum = csum_block_sub(skb->csum,
                                           skb_checksum(skb, len, delta, 0),
                                           len);
                break;
        }
        case CHECKSUM_PARTIAL: {
                int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
                int hdlen = len;

                /* Linear bytes remaining once the skb has been trimmed. */
                if (len > skb_headlen(skb))
                        hdlen = skb_headlen(skb);

                if (offset + sizeof(__sum16) > hdlen)
                        return -EINVAL;
                break;
        }
        default:
                break;
        }

        return __pskb_trim(skb, len);
}
EXPORT_SYMBOL(pskb_trim_rcsum_slow);

/**
 *        __pskb_pull_tail - advance tail of skb header
 *        @skb: buffer to reallocate
 *        @delta: number of bytes to advance tail
 *
 *        The function makes sense only on a fragmented &sk_buff,
 *        it expands header moving its tail forward and copying necessary
 *        data from fragmented part.
 *
 *        &sk_buff MUST have reference count of 1.
 *
 *        Returns %NULL (and &sk_buff does not change) if pull failed
 *        or value of new tail of skb in the case of success.
 *
 *        All the pointers pointing into skb header may change and must be
 *        reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
void *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
        /* If skb has not enough free space at tail, get new one
         * plus 128 bytes for future expansions. If we have enough
         * room at tail, reallocate without expansion only if skb is cloned.
         */
        int i, k, eat = (skb->tail + delta) - skb->end;

        if (!skb_frags_readable(skb))
                return NULL;

        if (eat > 0 || skb_cloned(skb)) {
                if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
                                     GFP_ATOMIC))
                        return NULL;
        }

        /* Copy the @delta bytes that follow the linear part to the tail. */
        BUG_ON(skb_copy_bits(skb, skb_headlen(skb),
                             skb_tail_pointer(skb), delta));

        /* Optimization: no fragments, no reasons to preestimate
         * size of pulled pages. Superb.
         */
        if (!skb_has_frag_list(skb))
                goto pull_pages;

        /* Estimate size of pulled pages. */
        eat = delta;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size >= eat)
                        goto pull_pages;
                eat -= size;
        }

        /* If we need update frag list, we are in troubles.
         * Certainly, it is possible to add an offset to skb data,
         * but taking into account that pulling is expected to
         * be very rare operation, it is worth to fight against
         * further bloating skb head and crucify ourselves here instead.
         * Pure masochism, indeed. 8)8)
         */
        if (eat) {
                /* eat is now the number of bytes to take from the frag
                 * list. insp ends up as the first list member that stays
                 * in place; clone, if set, is inserted in front of it.
                 */
                struct sk_buff *list = skb_shinfo(skb)->frag_list;
                struct sk_buff *clone = NULL;
                struct sk_buff *insp = NULL;

                do {
                        if (list->len <= eat) {
                                /* Eaten as whole. */
                                eat -= list->len;
                                list = list->next;
                                insp = list;
                        } else {
                                /* Eaten partially. */
                                if (skb_is_gso(skb) && !list->head_frag &&
                                    skb_headlen(list))
                                        skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;

                                if (skb_shared(list)) {
                                        /* Sucks! We need to fork list. :-( */
                                        clone = skb_clone(list, GFP_ATOMIC);
                                        if (!clone)
                                                return NULL;
                                        insp = list->next;
                                        list = clone;
                                } else {
                                        /* This may be pulled without
                                         * problems. */
                                        insp = list;
                                }
                                if (!pskb_pull(list, eat)) {
                                        kfree_skb(clone);
                                        return NULL;
                                }
                                break;
                        }
                } while (eat);

                /* Free pulled out fragments. */
                while ((list = skb_shinfo(skb)->frag_list) != insp) {
                        skb_shinfo(skb)->frag_list = list->next;
                        consume_skb(list);
                }
                /* And insert new clone at head. */
                if (clone) {
                        clone->next = list;
                        skb_shinfo(skb)->frag_list = clone;
                }
        }
        /* Success! Now we may commit changes to skb data. */

pull_pages:
        /* Drop the page frags that were consumed completely and compact
         * the remaining ones towards the start of the array; k counts the
         * frags that are kept.
         */
        eat = delta;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (size <= eat) {
                        skb_frag_unref(skb, i);
                        eat -= size;
                } else {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[k];

                        *frag = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                /* Partially consumed frag: skip the eaten
                                 * bytes. If it is the very first frag,
                                 * nothing was dropped before it, so
                                 * nr_frags is left as it is.
                                 */
                                skb_frag_off_add(frag, eat);
                                skb_frag_size_sub(frag, eat);
                                if (!i)
                                        goto end;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

end:
        /* The pulled bytes are now part of the linear area. */
        skb->tail     += delta;
        skb->data_len -= delta;

        if (!skb->data_len)
                skb_zcopy_clear(skb, false);

        return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);

/**
 *        skb_copy_bits - copy bits from skb to kernel buffer
 *        @skb: source skb
 *        @offset: offset in source
 *        @to: destination buffer
 *        @len: number of bytes to copy
 *
 *        Copy the specified number of bytes from the source skb to the
 *        destination buffer. The linear head, the page frags and the frag
 *        list are visited in that order.
 *
 *        Returns 0 on success, or -%EFAULT if the requested range extends
 *        past skb->len, if data outside the linear head is needed while
 *        the frags are not readable, or if not all @len bytes could be
 *        copied.
 *
 *        CAUTION ! :
 *                If its prototype is ever changed,
 *                check arch/{*}/net/{*}.S files,
 *                since it is called from BPF assembly code.
 */
int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
{
        /* start/end delimit the region currently examined, expressed in
         * the same coordinates as @offset (0 is skb->data).
         */
        int start = skb_headlen(skb);
        struct sk_buff *frag_iter;
        int i, copy;

        /* Reject ranges that run past the end of the skb. */
        if (offset > (int)skb->len - len)
                goto fault;

        /* Copy header. */
        if ((copy = start - offset) > 0) {
                if (copy > len)
                        copy = len;
                skb_copy_from_linear_data_offset(skb, offset, to, copy);
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                to     += copy;
        }

        /* Anything left has to come from frags, which must be readable. */
        if (!skb_frags_readable(skb))
                goto fault;

        /* Copy from the page frags. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *f = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(f);
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* Map each page of the frag in turn and copy out
                         * the part of it that belongs to the request.
                         */
                        skb_frag_foreach_page(f,
                                              skb_frag_off(f) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(to + copied, vaddr + p_off, p_len);
                                kunmap_atomic(vaddr);
                        }

                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Copy from the frag list, recursing into each member skb. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        if (skb_copy_bits(frag_iter, offset - start, to, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        if (!len)
                return 0;

fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_copy_bits);

/*
 * Callback from splice_to_pipe(), used when some pages at the end of the
 * spd have to be released because filling the pipe ran into an error.
 * Drops the reference held on page @i of @spd.
 */
static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
{
        struct page *page = spd->pages[i];

        put_page(page);
}

/*
 * Copy up to *@len bytes found at *@offset in @page into the page frag of
 * @sk. On success *@len is clamped to what was actually copied, *@offset
 * is rewritten to the position of the copy inside the returned page, and
 * the frag offset is advanced. Returns NULL if the page frag could not be
 * refilled.
 */
static struct page *linear_to_page(struct page *page, unsigned int *len,
                                   unsigned int *offset,
                                   struct sock *sk)
{
        struct page_frag *pfrag = sk_page_frag(sk);
        const void *src;
        void *dst;

        if (!sk_page_frag_refill(sk, pfrag))
                return NULL;

        /* Never copy more than the space left in the page frag. */
        *len = min_t(unsigned int, *len, pfrag->size - pfrag->offset);

        dst = page_address(pfrag->page) + pfrag->offset;
        src = page_address(page) + *offset;
        memcpy(dst, src, *len);

        *offset = pfrag->offset;
        pfrag->offset += *len;

        return pfrag->page;
}

static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
                             struct page *page,
                             unsigned int offset)
{
        return        spd->nr_pages &&
                spd->pages[spd->nr_pages - 1] == page &&
                (spd->partial[spd->nr_pages - 1].offset +
                 spd->partial[spd->nr_pages - 1].len == offset);
}

/*
 * Record page/offset/length in spd, provided it can hold more pages.
 *
 * Returns true when nothing was recorded (spd already holds MAX_SKB_FRAGS
 * entries, or the linear copy failed) and false once the data has been
 * merged into the last entry or stored in a new one.
 */
static bool spd_fill_page(struct splice_pipe_desc *spd,
                          struct pipe_inode_info *pipe, struct page *page,
                          unsigned int *len, unsigned int offset,
                          bool linear,
                          struct sock *sk)
{
        if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
                return true;

        if (linear) {
                /* linear data is copied out; page/offset/len now describe the copy */
                page = linear_to_page(page, len, &offset, sk);
                if (!page)
                        return true;
        }

        if (!spd_can_coalesce(spd, page, offset)) {
                struct partial_page *slot = &spd->partial[spd->nr_pages];

                /* new entry: take a page reference on its behalf */
                get_page(page);
                spd->pages[spd->nr_pages] = page;
                slot->len = *len;
                slot->offset = offset;
                spd->nr_pages++;
        } else {
                /* contiguous with the previous entry: just grow it */
                spd->partial[spd->nr_pages - 1].len += *len;
        }

        return false;
}

/*
 * Map one segment (@plen bytes at @poff inside @page) into @spd.
 *
 * *off is the number of bytes still to be skipped before mapping starts and
 * *len the number of bytes still wanted; both are updated. Returns true
 * when the caller should stop (nothing left to map, or spd_fill_page()
 * refused the data), false when it may go on to the next segment.
 */
static bool __splice_segment(struct page *page, unsigned int poff,
                             unsigned int plen, unsigned int *off,
                             unsigned int *len,
                             struct splice_pipe_desc *spd, bool linear,
                             struct sock *sk,
                             struct pipe_inode_info *pipe)
{
        if (*len == 0)
                return true;

        /* the whole segment lies before the start offset: consume and skip */
        if (plen <= *off) {
                *off -= plen;
                return false;
        }

        /* drop the leading bytes that were already processed */
        poff += *off;
        plen -= *off;
        *off = 0;

        for (;;) {
                unsigned int chunk = min(*len, plen);

                /* spd_fill_page() may shrink chunk to what it actually took */
                if (spd_fill_page(spd, pipe, page, &chunk, poff,
                                  linear, sk))
                        return true;

                poff += chunk;
                plen -= chunk;
                *len -= chunk;

                if (!*len || !plen)
                        break;
        }

        return false;
}

/*
 * Map the skb's data into spd: the linear area first, then the page
 * fragments, then every skb on the frag list. It reports true if the
 * pipe is full or if we already spliced the requested length.
 */
static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
                              unsigned int *offset, unsigned int *len,
                              struct splice_pipe_desc *spd, struct sock *sk)
{
        struct sk_buff *child;
        int i;

        /*
         * Linear part.
         * If skb->head_frag is set, this 'linear' part is backed by a
         * fragment, and if the head is not shared with any clones then
         * we can avoid a copy since we own the head portion of this page.
         */
        if (__splice_segment(virt_to_page(skb->data),
                             (unsigned long) skb->data & (PAGE_SIZE - 1),
                             skb_headlen(skb), offset, len, spd,
                             skb_head_is_locked(skb), sk, pipe))
                return true;

        /* Page fragments are only mapped when they are readable. */
        if (!skb_frags_readable(skb))
                return false;

        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                if (WARN_ON_ONCE(!skb_frag_page(frag)))
                        return false;

                if (__splice_segment(skb_frag_page(frag), skb_frag_off(frag),
                                     skb_frag_size(frag), offset, len, spd,
                                     false, sk, pipe))
                        return true;
        }

        skb_walk_frags(skb, child) {
                if (*offset < child->len) {
                        /* __skb_splice_bits() only fails if the output has
                         * no room left, so no point in going over the rest
                         * of the frag_list for the error case.
                         */
                        if (__skb_splice_bits(child, pipe, offset, len,
                                              spd, sk))
                                return true;
                } else {
                        /* this skb lies entirely before the start offset */
                        *offset -= child->len;
                }
        }

        return false;
}

/*
 * Map data from the skb to a pipe. Handles the linear part, the
 * fragments, and the frag list. Returns 0 when nothing was mapped,
 * otherwise whatever splice_to_pipe() returns.
 */
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
                    struct pipe_inode_info *pipe, unsigned int tlen,
                    unsigned int flags)
{
        struct page *pages[MAX_SKB_FRAGS];
        struct partial_page partial[MAX_SKB_FRAGS];
        struct splice_pipe_desc spd = {
                .pages = pages,
                .partial = partial,
                .nr_pages_max = MAX_SKB_FRAGS,
                .ops = &nosteal_pipe_buf_ops,
                .spd_release = sock_spd_release,
        };

        /* collect up to MAX_SKB_FRAGS page references describing the data */
        __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);

        if (!spd.nr_pages)
                return 0;

        return splice_to_pipe(pipe, &spd);
}
EXPORT_SYMBOL_GPL(skb_splice_bits);

/*
 * sendmsg helper for skb_send_sock_locked(): uses the socket's
 * ->sendmsg_locked op when it has one and sock_no_sendmsg_locked()
 * otherwise. Returns -EINVAL if @sk has no struct socket attached.
 */
static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
{
        struct socket *sock = sk->sk_socket;
        size_t remaining = msg_data_left(msg);

        if (!sock)
                return -EINVAL;

        if (sock->ops->sendmsg_locked)
                return sock->ops->sendmsg_locked(sk, msg, remaining);

        return sock_no_sendmsg_locked(sk, msg, remaining);
}

/*
 * sendmsg helper for skb_send_sock(): forwards to sock_sendmsg(), or
 * returns -EINVAL if @sk has no struct socket attached.
 */
static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
{
        struct socket *sock = sk->sk_socket;

        return sock ? sock_sendmsg(sock, msg) : -EINVAL;
}

/* Common signature of sendmsg_locked() and sendmsg_unlocked(). */
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
/*
 * Push @len bytes of @skb, starting @offset bytes in, to @sk through
 * @sendmsg. The linear head goes out as a kvec, page frags go out as
 * bvecs flagged MSG_SPLICE_PAGES, and skbs hanging off the frag_list are
 * handled by jumping back to do_frag_list. Every send uses MSG_DONTWAIT.
 *
 * Returns the number of bytes sent. If @sendmsg returns <= 0 before
 * anything at all was sent, that return value is passed back instead.
 */
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
                           int len, sendmsg_func sendmsg)
{
        unsigned int orig_len = len;
        struct sk_buff *head = skb;
        unsigned short fragidx;
        int slen, ret;

do_frag_list:

        /* Deal with head data */
        while (offset < skb_headlen(skb) && len) {
                struct kvec kv;
                struct msghdr msg;

                /* send what is left of the head, capped at the bytes wanted */
                slen = min_t(int, len, skb_headlen(skb) - offset);
                kv.iov_base = skb->data + offset;
                kv.iov_len = slen;
                memset(&msg, 0, sizeof(msg));
                msg.msg_flags = MSG_DONTWAIT;

                iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
                ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
                                      sendmsg_unlocked, sk, &msg);
                /* no progress: stop and report what went out so far */
                if (ret <= 0)
                        goto error;

                /* a short send simply loops again from the new offset */
                offset += ret;
                len -= ret;
        }

        /* All the data was skb head? */
        if (!len)
                goto out;

        /* Make offset relative to start of frags */
        offset -= skb_headlen(skb);

        /* Find where we are in frag list */
        for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                if (offset < skb_frag_size(frag))
                        break;

                offset -= skb_frag_size(frag);
        }

        /* Send from the frag found above onwards until @len is exhausted */
        for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
                skb_frag_t *frag  = &skb_shinfo(skb)->frags[fragidx];

                /* bytes of this frag to send, starting at offset within it */
                slen = min_t(size_t, len, skb_frag_size(frag) - offset);

                while (slen) {
                        struct bio_vec bvec;
                        struct msghdr msg = {
                                .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
                        };

                        bvec_set_page(&bvec, skb_frag_page(frag), slen,
                                      skb_frag_off(frag) + offset);
                        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
                                      slen);

                        ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
                                              sendmsg_unlocked, sk, &msg);
                        if (ret <= 0)
                                goto error;

                        len -= ret;
                        offset += ret;
                        slen -= ret;
                }

                /* every following frag is sent from its first byte */
                offset = 0;
        }

        if (len) {
                /* Process any frag lists */

                /* From the head skb, descend into its frag_list; from a
                 * frag_list member, move on to the next skb in that list.
                 */
                if (skb == head) {
                        if (skb_has_frag_list(skb)) {
                                skb = skb_shinfo(skb)->frag_list;
                                goto do_frag_list;
                        }
                } else if (skb->next) {
                        skb = skb->next;
                        goto do_frag_list;
                }
        }

out:
        /* number of bytes actually handed to the socket */
        return orig_len - len;

error:
        /* partial progress wins over the error; otherwise report ret */
        return orig_len == len ? ret : orig_len - len;
}

/* Send skb data on a socket. Socket must be locked. */
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
                         int len)
{
        return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);

/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
        return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
}

/**
 *        skb_store_bits - store bits from kernel buffer to skb
 *        @skb: destination buffer
 *        @offset: offset in destination
 *        @from: source buffer
 *        @len: number of bytes to copy
 *
 *        Copy the specified number of bytes from the source buffer to the
 *        destination skb.  This function handles all the messy bits of
 *        traversing fragment lists and such.
 */

int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
{
        /* [start, end) is the byte range, relative to the beginning of the
         * skb data, covered by the piece currently being examined; the
         * linear head covers [0, start).
         */
        int start = skb_headlen(skb);
        struct sk_buff *frag_iter;
        int i, copy;

        /* the requested range must lie within skb->len */
        if (offset > (int)skb->len - len)
                goto fault;

        /* Store into the linear head, if the range starts inside it. */
        if ((copy = start - offset) > 0) {
                if (copy > len)
                        copy = len;
                skb_copy_to_linear_data_offset(skb, offset, from, copy);
                if ((len -= copy) == 0)
                        return 0;
                offset += copy;
                from += copy;
        }

        /* page frags are only written when they are readable */
        if (!skb_frags_readable(skb))
                goto fault;

        /* Store into the page frags. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                /* copy > 0 means offset falls before the end of this frag */
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* each piece yielded by the iterator is mapped only
                         * for the duration of its memcpy
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                memcpy(vaddr + p_off, from + copied, p_len);
                                kunmap_atomic(vaddr);
                        }

                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from += copy;
                }
                start = end;
        }

        /* Store into the skbs on the frag list, recursing into each one. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        /* offset - start is the offset within frag_iter */
                        if (skb_store_bits(frag_iter, offset - start,
                                           from, copy))
                                goto fault;
                        if ((len -= copy) == 0)
                                return 0;
                        offset += copy;
                        from += copy;
                }
                start = end;
        }
        if (!len)
                return 0;

        /* bytes were left over, or one of the checks above failed */
fault:
        return -EFAULT;
}
EXPORT_SYMBOL(skb_store_bits);

/*
 * Checksum skb data.
 *
 * Checksums @len bytes of @skb starting at @offset. The part that lies in
 * the linear head is fed to @ops->update seeded with @csum; every later
 * piece (page frag pieces, frag_list skbs) is summed starting from 0 and
 * merged into the running value with @ops->combine at byte position pos.
 *
 * Returns the resulting checksum. Returns 0, with a one-time warning, if
 * data beyond the head is needed but the frags are not readable. BUG()s
 * if @len bytes could not be found in the skb.
 */
__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
                      __wsum csum, const struct skb_checksum_ops *ops)
{
        /* [start, end) is the byte range of the piece being examined,
         * relative to the beginning of the skb data
         */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* number of bytes checksummed so far, passed to ops->combine */
        int pos = 0;

        /* Checksum header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
                                       skb->data + offset, copy, csum);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                pos        = copy;
        }

        if (WARN_ON_ONCE(!skb_frags_readable(skb)))
                return 0;

        /* Checksum the page frags. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;
                skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(frag);
                /* copy > 0 means offset falls before the end of this frag */
                if ((copy = end - offset) > 0) {
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* each piece is mapped only while it is summed, then
                         * merged at its byte position
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = INDIRECT_CALL_1(ops->update,
                                                        csum_partial_ext,
                                                        vaddr + p_off, p_len, 0);
                                kunmap_atomic(vaddr);
                                csum = INDIRECT_CALL_1(ops->combine,
                                                       csum_block_add_ext, csum,
                                                       csum2, pos, p_len);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                }
                start = end;
        }

        /* Checksum the skbs on the frag list, recursing into each one. */
        skb_walk_frags(skb, frag_iter) {
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        __wsum csum2;
                        if (copy > len)
                                copy = len;
                        /* offset - start is the offset within frag_iter */
                        csum2 = __skb_checksum(frag_iter, offset - start,
                                               copy, 0, ops);
                        csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
                                               csum, csum2, pos, copy);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* the caller asked for more bytes than the skb holds */
        BUG_ON(len);

        return csum;
}
EXPORT_SYMBOL(__skb_checksum);

/*
 * Checksum @len bytes of @skb starting at @offset, seeded with @csum,
 * by running __skb_checksum() with csum_partial_ext() as the update step
 * and csum_block_add_ext() as the combine step.
 */
__wsum skb_checksum(const struct sk_buff *skb, int offset,
                    int len, __wsum csum)
{
        const struct skb_checksum_ops csum_ops = {
                .combine = csum_block_add_ext,
                .update  = csum_partial_ext,
        };

        return __skb_checksum(skb, offset, len, csum, &csum_ops);
}
EXPORT_SYMBOL(skb_checksum);

/*
 * Copy and checksum in one pass: copies @len bytes of @skb, starting at
 * @offset, into @to and returns the checksum of the copied bytes.
 *
 * Returns 0 if data beyond the linear head is needed but the frags are not
 * readable. BUG()s if @len bytes could not be found in the skb.
 */

__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
                                    u8 *to, int len)
{
        /* [start, end) is the byte range of the piece being examined,
         * relative to the beginning of the skb data
         */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        /* number of bytes handled so far, used to position partial sums */
        int pos = 0;
        __wsum csum = 0;

        /* Copy header. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                csum = csum_partial_copy_nocheck(skb->data + offset, to,
                                                 copy);
                if ((len -= copy) == 0)
                        return csum;
                offset += copy;
                to     += copy;
                pos        = copy;
        }

        if (!skb_frags_readable(skb))
                return 0;

        /* Copy and checksum the page frags. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                /* copy > 0 means offset falls before the end of this frag */
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        u32 p_off, p_len, copied;
                        struct page *p;
                        __wsum csum2;
                        u8 *vaddr;

                        if (copy > len)
                                copy = len;

                        /* each piece is mapped only while it is copied, and
                         * its partial sum is merged at its byte position
                         */
                        skb_frag_foreach_page(frag,
                                              skb_frag_off(frag) + offset - start,
                                              copy, p, p_off, p_len, copied) {
                                vaddr = kmap_atomic(p);
                                csum2 = csum_partial_copy_nocheck(vaddr + p_off,
                                                                  to + copied,
                                                                  p_len);
                                kunmap_atomic(vaddr);
                                csum = csum_block_add(csum, csum2, pos);
                                pos += p_len;
                        }

                        if (!(len -= copy))
                                return csum;
                        offset += copy;
                        to     += copy;
                }
                start = end;
        }

        /* Copy and checksum the skbs on the frag list, recursing into each. */
        skb_walk_frags(skb, frag_iter) {
                __wsum csum2;
                int end;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (copy > len)
                                copy = len;
                        /* offset - start is the offset within frag_iter */
                        csum2 = skb_copy_and_csum_bits(frag_iter,
                                                       offset - start,
                                                       to, copy);
                        csum = csum_block_add(csum, csum2, pos);
                        if ((len -= copy) == 0)
                                return csum;
                        offset += copy;
                        to     += copy;
                        pos    += copy;
                }
                start = end;
        }
        /* the caller asked for more bytes than the skb holds */
        BUG_ON(len);
        return csum;
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);

/*
 * Fold the checksum of the first @len bytes of @skb, seeded with
 * skb->csum. Returns the folded sum (zero means valid) and, unless the
 * skb is shared, records the verdict in skb->csum_valid.
 */
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
        __sum16 folded = csum_fold(skb_checksum(skb, 0, len, skb->csum));

        /* See comments in __skb_checksum_complete(). */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        if (!skb_shared(skb))
                skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete_head);

/* This function assumes skb->csum already holds pseudo header's checksum,
 * which has been changed from the hardware checksum, for example, by
 * __skb_checksum_validate_complete(). And, the original skb->csum must
 * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
 *
 * It returns non-zero if the recomputed checksum is still invalid, otherwise
 * zero. The new checksum is stored back into skb->csum unless the skb is
 * shared.
 */
__sum16 __skb_checksum_complete(struct sk_buff *skb)
{
        __wsum full = skb_checksum(skb, 0, skb->len, 0);
        __sum16 folded = csum_fold(csum_add(skb->csum, full));

        /* This check is inverted, because we already knew the hardware
         * checksum is invalid before calling this function. So, if the
         * re-computed checksum is valid instead, then we have a mismatch
         * between the original skb->csum and skb_checksum(). This means either
         * the original hardware checksum is incorrect or we screw up skb->csum
         * when moving skb->data around.
         */
        if (likely(!folded) &&
            unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
            !skb->csum_complete_sw)
                netdev_rx_csum_fault(skb->dev, skb);

        /* a shared skb is left untouched */
        if (skb_shared(skb))
                return folded;

        /* Save full packet checksum */
        skb->csum = full;
        skb->ip_summed = CHECKSUM_COMPLETE;
        skb->csum_complete_sw = 1;
        skb->csum_valid = !folded;

        return folded;
}
EXPORT_SYMBOL(__skb_checksum_complete);

/*
 * Placeholder ->update hook of default_crc32c_ops: emits a rate-limited
 * warning and yields 0 without looking at the buffer.
 */
static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
{
        net_warn_ratelimited("%s: attempt to compute crc32c without libcrc32c.ko\n",
                             __func__);

        return 0;
}

/*
 * Placeholder ->combine hook of default_crc32c_ops: emits a rate-limited
 * warning and yields 0 without using either checksum.
 */
static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
                                       int offset, int len)
{
        net_warn_ratelimited("%s: attempt to compute crc32c without libcrc32c.ko\n",
                             __func__);

        return 0;
}

/* Fallback crc32c ops: both hooks only warn and return 0. */
static const struct skb_checksum_ops default_crc32c_ops = {
        .combine = warn_crc32c_csum_combine,
        .update  = warn_crc32c_csum_update,
};

/* Exported ops pointer; initially refers to the warning-only fallback. */
const struct skb_checksum_ops *crc32c_csum_stub __read_mostly = &default_crc32c_ops;
EXPORT_SYMBOL(crc32c_csum_stub);

 /**
 *        skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
 *        @from: source buffer
 *
 *        Calculates the amount of linear headroom needed in the 'to' skb passed
 *        into skb_zerocopy().
 */
unsigned int
skb_zerocopy_headlen(const struct sk_buff *from)
{
        unsigned int linear;

        /* a frag list always requires room for the whole packet */
        if (skb_has_frag_list(from))
                return from->len;

        /*
         * No linear room is needed when the head is page-backed, holds at
         * least L1_CACHE_BYTES and a frag slot is still free.
         */
        if (from->head_frag &&
            skb_headlen(from) >= L1_CACHE_BYTES &&
            skb_shinfo(from)->nr_frags < MAX_SKB_FRAGS)
                return 0;

        /* otherwise room for the head, or for everything if the head is empty */
        linear = skb_headlen(from);

        return linear ? linear : from->len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_headlen);

/**
 *        skb_zerocopy - Zero copy skb to skb
 *        @to: destination buffer
 *        @from: source buffer
 *        @len: number of bytes to copy from source buffer
 *        @hlen: size of linear headroom in destination buffer
 *
 *        Copies up to `len` bytes from `from` to `to` by creating references
 *        to the frags in the source buffer.
 *
 *        The `hlen` as calculated by skb_zerocopy_headlen() specifies the
 *        headroom in the `to` buffer.
 *
 *        Return value:
 *        0: everything is OK
 *        -ENOMEM: couldn't orphan frags of @from due to lack of memory
 *        -EFAULT: skb_copy_bits() found some problem with skb geometry
 */
int
skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
        /* j is the next free frag slot in @to */
        int i, j = 0;
        int plen = 0; /* length of skb->head fragment */
        int ret;
        struct page *page;
        unsigned int offset;

        /* with hlen == 0 the head is attached below as a page frag, which
         * is only done for a page-backed head (from->head_frag)
         */
        BUG_ON(!from->head_frag && !hlen);

        /* don't bother with small payloads */
        if (len <= skb_tailroom(to))
                return skb_copy_bits(from, 0, skb_put(to, len), len);

        if (hlen) {
                /* copy the first hlen bytes into the linear area of @to */
                ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen);
                if (unlikely(ret))
                        return ret;
                len -= hlen;
        } else {
                /* reference the source head page as frag 0 of @to */
                plen = min_t(int, skb_headlen(from), len);
                if (plen) {
                        page = virt_to_head_page(from->head);
                        offset = from->data - (unsigned char *)page_address(page);
                        __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
                                               offset, plen);
                        get_page(page);
                        j = 1;
                        len -= plen;
                }
        }

        /* account for every byte that @to will carry in frags */
        skb_len_add(to, len + plen);

        if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
                skb_tx_error(from);
                return -ENOMEM;
        }
        skb_zerocopy_clone(to, from, GFP_ATOMIC);

        /* share the source frags until len bytes are covered */
        for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
                int size;

                if (!len)
                        break;
                skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
                /* the last frag used may be trimmed to the bytes still wanted */
                size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
                                        len);
                skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
                len -= size;
                /* @to now holds its own reference on the frag */
                skb_frag_ref(to, j);
                j++;
        }
        skb_shinfo(to)->nr_frags = j;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_zerocopy);

/*
 * Copy the whole of @skb into @to. Bytes before the checksum start are
 * copied verbatim, the remainder is copied while being checksummed; for
 * CHECKSUM_PARTIAL packets the folded checksum is then written into the
 * copy at csum_offset past the checksum start.
 */
void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
{
        const bool partial = skb->ip_summed == CHECKSUM_PARTIAL;
        __wsum csum = 0;
        long csstart;

        /* without a pending checksum, only data past the head is summed */
        if (partial)
                csstart = skb_checksum_start_offset(skb);
        else
                csstart = skb_headlen(skb);

        BUG_ON(csstart > skb_headlen(skb));

        skb_copy_from_linear_data(skb, to, csstart);

        if (csstart != skb->len)
                csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
                                              skb->len - csstart);

        if (partial) {
                long csstuff = csstart + skb->csum_offset;

                *((__sum16 *)(to + csstuff)) = csum_fold(csum);
        }
}
EXPORT_SYMBOL(skb_copy_and_csum_dev);

/**
 *        skb_dequeue - remove from the head of the queue
 *        @list: list to dequeue from
 *
 *        Remove the head of the list. The list lock is taken so the function
 *        may be used safely with other locking list functions. The head item is
 *        returned or %NULL if the list is empty.
 */

struct sk_buff *skb_dequeue(struct sk_buff_head *list)
{
        struct sk_buff *first;
        unsigned long irqflags;

        /* __skb_dequeue() does the work; this wrapper supplies the list lock */
        spin_lock_irqsave(&list->lock, irqflags);
        first = __skb_dequeue(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return first;
}
EXPORT_SYMBOL(skb_dequeue);

/**
 *        skb_dequeue_tail - remove from the tail of the queue
 *        @list: list to dequeue from
 *
 *        Remove the tail of the list. The list lock is taken so the function
 *        may be used safely with other locking list functions. The tail item is
 *        returned or %NULL if the list is empty.
 */
struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
{
        struct sk_buff *last;
        unsigned long irqflags;

        /* __skb_dequeue_tail() does the work; this wrapper supplies the list lock */
        spin_lock_irqsave(&list->lock, irqflags);
        last = __skb_dequeue_tail(list);
        spin_unlock_irqrestore(&list->lock, irqflags);

        return last;
}
EXPORT_SYMBOL(skb_dequeue_tail);

/**
 *        skb_queue_purge_reason - empty a list
 *        @list: list to empty
 *        @reason: drop reason
 *
 *        Delete all buffers on an &sk_buff list. Each buffer is removed from
 *        the list and one reference dropped. This function takes the list
 *        lock and is atomic with respect to other list locking functions.
 */
void skb_queue_purge_reason(struct sk_buff_head *list,
                            enum skb_drop_reason reason)
{
        struct sk_buff_head tmp;
        unsigned long flags;

        if (skb_queue_empty_lockless(list))
                return;

        __skb_queue_head_init(&tmp);

        spin_lock_irqsave(&list->lock, flags);
        skb_queue_splice_init(list, &tmp);
        spin_unlock_irqrestore(&list->lock, flags);

        __skb_queue_purge_reason(&tmp, reason);
}
EXPORT_SYMBOL(skb_queue_purge_reason);

/**
 *        skb_rbtree_purge - empty a skb rbtree
 *        @root: root of the rbtree to empty
 *        Return value: the sum of truesizes of all purged skbs.
 *
 *        Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
 *        the list and one reference dropped. This function does not take
 *        any lock. Synchronization should be handled by the caller (e.g., TCP
 *        out-of-order queue is protected by the socket lock).
 */
unsigned int skb_rbtree_purge(struct rb_root *root)
{
        struct rb_node *node, *next;
        unsigned int freed = 0;

        for (node = rb_first(root); node; node = next) {
                struct sk_buff *skb = rb_entry(node, struct sk_buff, rbnode);

                /* fetch the successor before this node leaves the tree */
                next = rb_next(node);
                rb_erase(&skb->rbnode, root);
                freed += skb->truesize;
                kfree_skb(skb);
        }

        return freed;
}

void skb_errqueue_purge(struct sk_buff_head *list)
{
        struct sk_buff *skb, *next;
        struct sk_buff_head kill;
        unsigned long flags;

        __skb_queue_head_init(&kill);

        spin_lock_irqsave(&list->lock, flags);
        skb_queue_walk_safe(list, skb, next) {
                if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
                    SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
                        continue;
                __skb_unlink(skb, list);
                __skb_queue_tail(&kill, skb);
        }
        spin_unlock_irqrestore(&list->lock, flags);
        __skb_queue_purge(&kill);
}
EXPORT_SYMBOL(skb_errqueue_purge);

/**
 *        skb_queue_head - queue a buffer at the list head
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the start of the list. This function takes the
 *        list lock and can be used safely with other locking &sk_buff
 *        functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        /* __skb_queue_head() does the insert; this wrapper supplies the lock */
        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_head(list, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_head);

/**
 *        skb_queue_tail - queue a buffer at the list tail
 *        @list: list to use
 *        @newsk: buffer to queue
 *
 *        Queue a buffer at the tail of the list. This function takes the
 *        list lock and can be used safely with other locking &sk_buff
 *        functions.
 *
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk)
{
        unsigned long irqflags;

        /* __skb_queue_tail() does the insert; this wrapper supplies the lock */
        spin_lock_irqsave(&list->lock, irqflags);
        __skb_queue_tail(list, newsk);
        spin_unlock_irqrestore(&list->lock, irqflags);
}
EXPORT_SYMBOL(skb_queue_tail);

/**
 *        skb_unlink        -        remove a buffer from a list
 *        @skb: buffer to remove
 *        @list: list to use
 *
 *        Take @skb off @list. The list lock is held (via spin_lock_irqsave())
 *        around the removal, so this function is atomic with respect to
 *        the other list-locked calls.
 *
 *        You must know what list the SKB is on.
 */
void skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
{
        unsigned long irq_state;

        spin_lock_irqsave(&list->lock, irq_state);
        __skb_unlink(skb, list);
        spin_unlock_irqrestore(&list->lock, irq_state);
}
EXPORT_SYMBOL(skb_unlink);

/**
 *        skb_append        -        append a buffer
 *        @old: buffer to insert after
 *        @newsk: buffer to insert
 *        @list: list to use
 *
 *        Insert @newsk directly behind @old in @list. The list lock is held
 *        (via spin_lock_irqsave()) around the insertion, so this function is
 *        atomic with respect to the other list-locked calls.
 *        A buffer cannot be placed on two lists at the same time.
 */
void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
{
        unsigned long irq_state;

        spin_lock_irqsave(&list->lock, irq_state);
        __skb_queue_after(list, old, newsk);
        spin_unlock_irqrestore(&list->lock, irq_state);
}
EXPORT_SYMBOL(skb_append);

/*
 * Split @skb at @len when the split point lies inside the linear area
 * (@pos is the linear length, @len < @pos): the linear bytes past @len are
 * copied into @skb1 and every page frag is handed over to @skb1 unchanged.
 */
static inline void skb_split_inside_header(struct sk_buff *skb,
                                           struct sk_buff* skb1,
                                           const u32 len, const int pos)
{
        /* Number of linear bytes that have to move over to skb1. */
        const u32 hdr_tail = pos - len;
        int idx = 0;

        skb_copy_from_linear_data_offset(skb, len, skb_put(skb1, hdr_tail),
                                         hdr_tail);

        /* The paged part is transferred as is, frag by frag. */
        while (idx < skb_shinfo(skb)->nr_frags) {
                skb_shinfo(skb1)->frags[idx] = skb_shinfo(skb)->frags[idx];
                idx++;
        }
        skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
        skb_shinfo(skb)->nr_frags = 0;

        skb1->unreadable = skb->unreadable;

        /* skb1 inherits all paged bytes on top of what skb_put() added. */
        skb1->data_len = skb->data_len;
        skb1->len += skb1->data_len;

        /* skb keeps only the first len bytes, all of them linear. */
        skb->data_len = 0;
        skb->len = len;
        skb_set_tail_pointer(skb, len);
}

/*
 * Split @skb at @len when the split point lies at or beyond the end of the
 * linear area (@pos is the linear length on entry). No linear data is copied:
 * page frags are distributed between @skb and @skb1, and a frag straddling
 * the split point ends up in both with adjusted offset/size.
 */
static inline void skb_split_no_header(struct sk_buff *skb,
                                       struct sk_buff* skb1,
                                       const u32 len, int pos)
{
        int i, k = 0;
        const int nfrags = skb_shinfo(skb)->nr_frags;

        /* skb's frag count is rebuilt below as frags are kept. */
        skb_shinfo(skb)->nr_frags = 0;
        /* Everything past len goes to skb1, all of it as paged data. */
        skb1->len                  = skb1->data_len = skb->len - len;
        skb->len                  = len;
        skb->data_len                  = len - pos;

        /* From here on pos tracks the offset at which frag i starts. */
        for (i = 0; i < nfrags; i++) {
                int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                if (pos + size > len) {
                        /* Frag reaches past the split point: skb1 gets it. */
                        skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < len) {
                                /* Split frag.
                                 * There are two options in this case:
                                 * 1. Move the whole frag to the second
                                 *    part, if that is possible. E.g.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split it accurately. This is what
                                 *    we do here.
                                 *
                                 * The frag is now used by both skbs, hence
                                 * the extra reference. k is still 0 whenever
                                 * this branch runs (every earlier frag ended
                                 * at or before len), so frags[0] of skb1 is
                                 * the entry stored just above.
                                 */
                                skb_frag_ref(skb, i);
                                skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
                                skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
                                skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
                                skb_shinfo(skb)->nr_frags++;
                        }
                        k++;
                } else
                        /* Frag ends at or before the split point: skb keeps it. */
                        skb_shinfo(skb)->nr_frags++;
                pos += size;
        }
        skb_shinfo(skb1)->nr_frags = k;

        skb1->unreadable = skb->unreadable;
}

/**
 * skb_split - Split fragmented skb to two parts at length len.
 * @skb: the buffer to split
 * @skb1: the buffer to receive the second part
 * @len: new length for skb
 *
 * The first @len bytes stay in @skb, the remainder is moved to @skb1.
 * Depending on whether @len falls inside the linear area of @skb or not,
 * the work is done by skb_split_inside_header() or skb_split_no_header().
 */
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
        const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
        int headlen = skb_headlen(skb);

        skb_zcopy_downgrade_managed(skb);

        /* Carry the zerocopy related flags and state over to the new skb. */
        skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
        skb_zerocopy_clone(skb1, skb, 0);

        if (len >= headlen) {
                /* Second chunk has no header, nothing to copy. */
                skb_split_no_header(skb, skb1, len, headlen);
                return;
        }

        /* Split line is inside header. */
        skb_split_inside_header(skb, skb1, len, headlen);
}
EXPORT_SYMBOL(skb_split);

/* An skb must not be cloned when data is shifted from or to it, so unclone
 * it first (atomic allocation). Returns the result of
 * skb_unclone_keeptruesize(); skb_shift() treats non-zero as failure.
 *
 * Caller cannot keep skb_shinfo related pointers past calling here!
 */
static int skb_prepare_for_shift(struct sk_buff *skb)
{
        int err;

        err = skb_unclone_keeptruesize(skb, GFP_ATOMIC);

        return err;
}

/**
 * skb_shift - Shifts paged data partially from skb to another
 * @tgt: buffer into which tail data gets added
 * @skb: buffer from which the paged data comes from
 * @shiftlen: shift up to this many bytes
 *
 * Attempts to shift up to shiftlen worth of bytes, which may be less than
 * the length of the skb, from skb to tgt. Returns the number of bytes
 * shifted, which is either @shiftlen or 0 if nothing could be shifted.
 * It's up to caller to free skb if everything was shifted.
 *
 * If @tgt runs out of frags, the whole operation is aborted.
 *
 * Skb cannot include anything else but paged data while tgt is allowed
 * to have non-paged data as well.
 *
 * TODO: full sized shift could be optimized but that would need
 * specialized skb free'er to handle frags without up-to-date nr_frags.
 */
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
        int from, to, merge, todo;
        skb_frag_t *fragfrom, *fragto;

        BUG_ON(shiftlen > skb->len);

        /* Only purely paged source skbs are handled. */
        if (skb_headlen(skb))
                return 0;
        /* Zerocopy skbs are never shifted from or to. */
        if (skb_zcopy(tgt) || skb_zcopy(skb))
                return 0;

        DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
        DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));

        /* todo: bytes still to move; from: next source frag index;
         * to: next free frag slot in tgt.
         */
        todo = shiftlen;
        from = 0;
        to = skb_shinfo(tgt)->nr_frags;
        fragfrom = &skb_shinfo(skb)->frags[from];

        /* Actual merge is delayed until the point when we know we can
         * commit all, so that we don't have to undo partial changes
         */
        if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
                              skb_frag_off(fragfrom))) {
                merge = -1;
        } else {
                /* First source frag continues tgt's last frag. */
                merge = to - 1;

                todo -= skb_frag_size(fragfrom);
                if (todo < 0) {
                        /* The whole shift fits inside the first source frag:
                         * only the boundary between the two frags moves.
                         */
                        if (skb_prepare_for_shift(skb) ||
                            skb_prepare_for_shift(tgt))
                                return 0;

                        /* All previous frag pointers might be stale! */
                        fragfrom = &skb_shinfo(skb)->frags[from];
                        fragto = &skb_shinfo(tgt)->frags[merge];

                        skb_frag_size_add(fragto, shiftlen);
                        skb_frag_size_sub(fragfrom, shiftlen);
                        skb_frag_off_add(fragfrom, shiftlen);

                        goto onlymerged;
                }

                /* First frag is consumed entirely by the (delayed) merge. */
                from++;
        }

        /* Skip full, not-fitting skb to avoid expensive operations */
        if ((shiftlen == skb->len) &&
            (skb_shinfo(skb)->nr_frags - from) > (MAX_SKB_FRAGS - to))
                return 0;

        if (skb_prepare_for_shift(skb) || skb_prepare_for_shift(tgt))
                return 0;

        while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) {
                /* tgt is full; tgt's nr_frags has not been updated yet, so
                 * the frags written beyond it so far are simply ignored.
                 */
                if (to == MAX_SKB_FRAGS)
                        return 0;

                fragfrom = &skb_shinfo(skb)->frags[from];
                fragto = &skb_shinfo(tgt)->frags[to];

                if (todo >= skb_frag_size(fragfrom)) {
                        /* Whole frag moves over. */
                        *fragto = *fragfrom;
                        todo -= skb_frag_size(fragfrom);
                        from++;
                        to++;

                } else {
                        /* Partial frag: both skbs end up using the page, so
                         * take an extra reference and split the range.
                         */
                        __skb_frag_ref(fragfrom);
                        skb_frag_page_copy(fragto, fragfrom);
                        skb_frag_off_copy(fragto, fragfrom);
                        skb_frag_size_set(fragto, todo);

                        skb_frag_off_add(fragfrom, todo);
                        skb_frag_size_sub(fragfrom, todo);
                        todo = 0;

                        to++;
                        break;
                }
        }

        /* Ready to "commit" this state change to tgt */
        skb_shinfo(tgt)->nr_frags = to;

        if (merge >= 0) {
                /* Perform the delayed merge: grow tgt's frag by the size of
                 * the first source frag and drop the source's reference.
                 */
                fragfrom = &skb_shinfo(skb)->frags[0];
                fragto = &skb_shinfo(tgt)->frags[merge];

                skb_frag_size_add(fragto, skb_frag_size(fragfrom));
                __skb_frag_unref(fragfrom, skb->pp_recycle);
        }

        /* Reposition in the original skb */
        to = 0;
        while (from < skb_shinfo(skb)->nr_frags)
                skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++];
        skb_shinfo(skb)->nr_frags = to;

        BUG_ON(todo > 0 && !skb_shinfo(skb)->nr_frags);

onlymerged:
        /* Most likely the tgt won't ever need its checksum anymore, skb on
         * the other hand might need it if it needs to be resent
         */
        tgt->ip_summed = CHECKSUM_PARTIAL;
        skb->ip_summed = CHECKSUM_PARTIAL;

        skb_len_add(skb, -shiftlen);
        skb_len_add(tgt, shiftlen);

        return shiftlen;
}

/**
 * skb_prepare_seq_read - Prepare a sequential read of skb data
 * @skb: the buffer to read
 * @from: lower offset of data to be read
 * @to: upper offset of data to be read
 * @st: state variable
 *
 * Resets @st so that reading starts at @skb with the window [@from, @to)
 * and no fragment mapped. Must be called before invoking skb_seq_read()
 * for the first time.
 */
void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
                          unsigned int to, struct skb_seq_state *st)
{
        /* Window of the skb data that will be handed out. */
        st->lower_offset = from;
        st->upper_offset = to;

        /* Reading starts at @skb itself. */
        st->cur_skb = skb;
        st->root_skb = skb;

        /* Nothing stepped over yet and no fragment mapped. */
        st->stepped_offset = 0;
        st->frag_idx = 0;
        st->frag_off = 0;
        st->frag_data = NULL;
}
EXPORT_SYMBOL(skb_prepare_seq_read);

/**
 * skb_seq_read - Sequentially read skb data
 * @consumed: number of bytes consumed by the caller so far
 * @data: destination pointer for data to be returned
 * @st: state variable
 *
 * Reads a block of skb data at @consumed relative to the
 * lower offset specified to skb_prepare_seq_read(). Assigns
 * the head of the data block to @data and returns the length
 * of the block or 0 if the end of the skb data or the upper
 * offset has been reached.
 *
 * The caller is not required to consume all of the data
 * returned, i.e. @consumed is typically set to the number
 * of bytes already consumed and the next call to
 * skb_seq_read() will return the remaining part of the block.
 *
 * Note 1: The size of each block of data returned can be arbitrary,
 *       this limitation is the cost for zerocopy sequential
 *       reads of potentially non linear data.
 *
 * Note 2: Fragment lists within fragments are not implemented
 *       at the moment, state->root_skb could be replaced with
 *       a stack for this purpose.
 */
unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
                          struct skb_seq_state *st)
{
        unsigned int block_limit, abs_offset = consumed + st->lower_offset;
        skb_frag_t *frag;

        /* Requested position is past the window: drop any mapping and stop. */
        if (unlikely(abs_offset >= st->upper_offset)) {
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }
                return 0;
        }

next_skb:
        /* Offset just past the linear area of the current skb. */
        block_limit = skb_headlen(st->cur_skb) + st->stepped_offset;

        /* Serve from the linear area, unless a frag is already mapped
         * (frag_data is only ever set in the frag loop below).
         */
        if (abs_offset < block_limit && !st->frag_data) {
                *data = st->cur_skb->data + (abs_offset - st->stepped_offset);
                return block_limit - abs_offset;
        }

        /* Stop here if the frags of this skb are not readable. */
        if (!skb_frags_readable(st->cur_skb))
                return 0;

        /* Step over the linear area once, on first entry into the frags. */
        if (st->frag_idx == 0 && !st->frag_data)
                st->stepped_offset += skb_headlen(st->cur_skb);

        while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
                unsigned int pg_idx, pg_off, pg_sz;

                frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];

                pg_idx = 0;
                pg_off = skb_frag_off(frag);
                pg_sz = skb_frag_size(frag);

                if (skb_frag_must_loop(skb_frag_page(frag))) {
                        /* Frag has to be walked page by page: pick the page
                         * holding the current position (frag_off bytes into
                         * the frag) and clamp the block to that page.
                         */
                        pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
                        pg_off = offset_in_page(pg_off + st->frag_off);
                        pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
                                                    PAGE_SIZE - pg_off);
                }

                block_limit = pg_sz + st->stepped_offset;
                if (abs_offset < block_limit) {
                        /* Position lies in this block: map it if necessary
                         * and hand out the rest of the block. The mapping
                         * stays in st->frag_data across calls.
                         */
                        if (!st->frag_data)
                                st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);

                        *data = (u8 *)st->frag_data + pg_off +
                                (abs_offset - st->stepped_offset);

                        return block_limit - abs_offset;
                }

                /* Block fully consumed: release its mapping ... */
                if (st->frag_data) {
                        kunmap_atomic(st->frag_data);
                        st->frag_data = NULL;
                }

                /* ... and advance, moving to the next frag once the
                 * current one has been stepped over completely.
                 */
                st->stepped_offset += pg_sz;
                st->frag_off += pg_sz;
                if (st->frag_off == skb_frag_size(frag)) {
                        st->frag_off = 0;
                        st->frag_idx++;
                }
        }

        if (st->frag_data) {
                kunmap_atomic(st->frag_data);
                st->frag_data = NULL;
        }

        /* Out of frags: descend into the frag_list of the root skb, or
         * continue with the next skb of the list we are already in.
         */
        if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
                st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
                st->frag_idx = 0;
                goto next_skb;
        } else if (st->cur_skb->next) {
                st->cur_skb = st->cur_skb->next;
                st->frag_idx = 0;
                goto next_skb;
        }

        return 0;
}
EXPORT_SYMBOL(skb_seq_read);

/**
 * skb_abort_seq_read - Abort a sequential read of skb data
 * @st: state variable
 *
 * Must be called if skb_seq_read() was not called until it
 * returned 0. Unmaps the fragment mapping still recorded in @st, if any.
 * The pointer in @st is not cleared here.
 */
void skb_abort_seq_read(struct skb_seq_state *st)
{
        /* Nothing mapped, nothing to release. */
        if (!st->frag_data)
                return;

        kunmap_atomic(st->frag_data);
}
EXPORT_SYMBOL(skb_abort_seq_read);

/**
 * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
 * @st: source skb_seq_state
 * @offset: offset in source
 * @to: destination buffer
 * @len: number of bytes to copy
 *
 * Copy @len bytes from @offset bytes into the source @st to the destination
 * buffer @to, pulling one block at a time from skb_seq_read(). `offset`
 * should increase (or be unchanged) with each subsequent call to this
 * function. If offset needs to decrease from the previous use `st` should be
 * reset first.
 *
 * Return: 0 on success or -EINVAL if the copy ended early
 */
int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
{
        const u8 *chunk;
        u32 avail;

        while ((avail = skb_seq_read(offset, &chunk, st)) != 0) {
                /* This block covers the rest of the request: finish. */
                if (avail >= len) {
                        memcpy(to, chunk, len);
                        return 0;
                }

                /* Take the whole block and ask for the next one. */
                memcpy(to, chunk, avail);
                to += avail;
                offset += avail;
                len -= avail;
        }

        /* Source ran dry before @len bytes were copied. */
        return -EINVAL;
}
EXPORT_SYMBOL(skb_copy_seq_read);

/* View the control buffer of a textsearch state as the skb seq-read state. */
#define TS_SKB_CB(state)        ((struct skb_seq_state *) &((state)->cb))

/* textsearch get_next_block callback: return the next block of skb data. */
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
                                          struct ts_config *conf,
                                          struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        return skb_seq_read(offset, text, seq);
}

/* textsearch finish callback: abort the sequential read kept in @state. */
static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
{
        struct skb_seq_state *seq = TS_SKB_CB(state);

        skb_abort_seq_read(seq);
}

/**
 * skb_find_text - Find a text pattern in skb data
 * @skb: the buffer to look in
 * @from: search offset
 * @to: search limit
 * @config: textsearch configuration
 *
 * Finds a pattern in the skb data according to the specified
 * textsearch configuration. Use textsearch_next() to retrieve
 * subsequent occurrences of the pattern. Returns the offset
 * to the first occurrence or UINT_MAX if no match was found
 * or the match does not fit between @from and @to.
 */
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
                           unsigned int to, struct ts_config *config)
{
        unsigned int patlen = config->ops->get_pattern_len(config);
        struct ts_state state;
        unsigned int match;

        BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));

        /* Let the search pull its input from this skb. */
        config->finish = skb_ts_finish;
        config->get_next_block = skb_ts_get_next_block;

        skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));

        match = textsearch_find(config, &state);

        /* A match whose pattern would end past the limit does not count. */
        if (match + patlen > to - from)
                return UINT_MAX;

        return match;
}
EXPORT_SYMBOL(skb_find_text);

/*
 * Append @size bytes at @offset of @page to the paged data of @skb.
 *
 * If the range can be coalesced with the last frag, that frag is simply
 * grown. Otherwise a new frag is filled in (taking a page reference), as
 * long as @skb has fewer than @max_frags frags.
 *
 * Returns 0 on success or -EMSGSIZE if a new frag would be needed but the
 * @max_frags limit is already reached.
 */
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
                         int offset, size_t size, size_t max_frags)
{
        int nr = skb_shinfo(skb)->nr_frags;

        /* Contiguous with the last frag: just extend it. */
        if (skb_can_coalesce(skb, nr, page, offset)) {
                skb_frag_size_add(&skb_shinfo(skb)->frags[nr - 1], size);
                return 0;
        }

        if (nr >= max_frags)
                return -EMSGSIZE;

        skb_zcopy_downgrade_managed(skb);
        get_page(page);
        skb_fill_page_desc_noacc(skb, nr, page, offset, size);

        return 0;
}
EXPORT_SYMBOL_GPL(skb_append_pagefrags);

/**
 *        skb_pull_rcsum - pull skb and update receive checksum
 *        @skb: buffer to update
 *        @len: length of data pulled
 *
 *        This function performs an skb_pull on the packet and updates
 *        the CHECKSUM_COMPLETE checksum.  It should be used on
 *        receive path processing instead of skb_pull unless you know
 *        that the checksum difference is zero (e.g., a valid IP header)
 *        or you are setting ip_summed to CHECKSUM_NONE.
 *
 *        Returns the data pointer of @skb after the pull. It is a bug
 *        to pull more than skb->len bytes.
 */
void *skb_pull_rcsum(struct sk_buff *skb, unsigned int len)
{
        unsigned char *pulled;

        BUG_ON(len > skb->len);

        /* Remember where the pulled bytes start before skb->data moves. */
        pulled = skb->data;
        __skb_pull(skb, len);
        skb_postpull_rcsum(skb, pulled, len);

        return skb->data;
}
EXPORT_SYMBOL_GPL(skb_pull_rcsum);

/*
 * Describe the linear data of @frag_skb as a page frag: the head page of
 * the skb's head buffer, the offset of skb->data from that page's address,
 * and the linear length as size. The descriptor is returned by value.
 */
static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
{
        struct page *head_page = virt_to_head_page(frag_skb->head);
        unsigned char *page_base = (unsigned char *)page_address(head_page);
        skb_frag_t desc;

        skb_frag_fill_page_desc(&desc, head_page, frag_skb->data - page_base,
                                skb_headlen(frag_skb));

        return desc;
}

/*
 * skb_segment_list - turn the frag_list of @skb into a chain of separate skbs
 * @skb: head buffer whose frag_list members are unchained
 * @features: device features, used to decide whether buffers need linearizing
 * @offset: each buffer is pushed by (@offset - its network offset), which,
 *          given the usual definition of skb_network_offset(), leaves its
 *          data pointer @offset bytes in front of the network header
 *
 * The frag_list of @skb is detached and its members are linked behind @skb
 * via ->next; skb->prev is set to the last member. Each member receives a
 * copy of @skb's header state and of the @offset (+ tunnel header) bytes in
 * front of the payload, and @skb's len/data_len/truesize are reduced by what
 * the members account for.
 *
 * Returns @skb with an additional reference taken, or ERR_PTR(-ENOMEM) on
 * any failure, in which case the buffers already chained behind @skb have
 * been freed.
 */
struct sk_buff *skb_segment_list(struct sk_buff *skb,
                                 netdev_features_t features,
                                 unsigned int offset)
{
        struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
        unsigned int tnl_hlen = skb_tnl_header_len(skb);
        unsigned int delta_truesize = 0;
        unsigned int delta_len = 0;
        struct sk_buff *tail = NULL;
        struct sk_buff *nskb, *tmp;
        int len_diff, err;

        skb_push(skb, -skb_network_offset(skb) + offset);

        /* Ensure the head is writeable before touching the shared info */
        err = skb_unclone(skb, GFP_ATOMIC);
        if (err)
                goto err_linearize;

        /* The members are owned through list_skb / skb->next from now on. */
        skb_shinfo(skb)->frag_list = NULL;

        while (list_skb) {
                nskb = list_skb;
                list_skb = list_skb->next;

                err = 0;
                /* Summed up here, subtracted from the head after the loop. */
                delta_truesize += nskb->truesize;
                if (skb_shared(nskb)) {
                        /* Somebody else holds a reference: continue with a
                         * clone (and drop our reference on the original),
                         * then unclone the clone.
                         */
                        tmp = skb_clone(nskb, GFP_ATOMIC);
                        if (tmp) {
                                consume_skb(nskb);
                                nskb = tmp;
                                err = skb_unclone(nskb, GFP_ATOMIC);
                        } else {
                                err = -ENOMEM;
                        }
                }

                /* Link nskb before checking err, so that the error path
                 * below frees it together with the rest of the list.
                 */
                if (!tail)
                        skb->next = nskb;
                else
                        tail->next = nskb;

                if (unlikely(err)) {
                        /* Re-attach the not yet processed members as well. */
                        nskb->next = list_skb;
                        goto err_linearize;
                }

                tail = nskb;

                delta_len += nskb->len;

                skb_push(nskb, -skb_network_offset(nskb) + offset);

                skb_release_head_state(nskb);
                /* Sampled before __copy_skb_header() overwrites nskb's
                 * header fields with those of skb.
                 */
                len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
                __copy_skb_header(nskb, skb);

                /* Header offsets were copied from skb; adjust them for the
                 * headroom difference and the network header length delta.
                 */
                skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
                nskb->transport_header += len_diff;
                /* Copy the tunnel header plus @offset bytes of headers from
                 * the head skb in front of nskb's payload.
                 */
                skb_copy_from_linear_data_offset(skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 offset + tnl_hlen);

                if (skb_needs_linearize(nskb, features) &&
                    __skb_linearize(nskb))
                        goto err_linearize;
        }

        /* The head no longer accounts for the former frag_list members. */
        skb->truesize = skb->truesize - delta_truesize;
        skb->data_len = skb->data_len - delta_len;
        skb->len = skb->len - delta_len;

        skb_gso_reset(skb);

        skb->prev = tail;

        if (skb_needs_linearize(skb, features) &&
            __skb_linearize(skb))
                goto err_linearize;

        /* Extra reference on the head for the returned chain. */
        skb_get(skb);

        return skb;

err_linearize:
        /* Free everything chained behind skb; skb itself is left alone. */
        kfree_skb_list(skb->next);
        skb->next = NULL;
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL_GPL(skb_segment_list);

/**
 *        skb_segment - Perform protocol segmentation on skb.
 *        @head_skb: buffer to segment
 *        @features: features for the output path (see dev->features)
 *
 *        This function performs segmentation on the given skb.  It returns
 *        a pointer to the first in a list of new skbs for the segments.
 *        In case of error it returns ERR_PTR(err).
 */
struct sk_buff *skb_segment(struct sk_buff *head_skb,
                            netdev_features_t features)
{
        struct sk_buff *segs = NULL;
        struct sk_buff *tail = NULL;
        struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
        unsigned int mss = skb_shinfo(head_skb)->gso_size;
        unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
        unsigned int offset = doffset;
        unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
        unsigned int partial_segs = 0;
        unsigned int headroom;
        unsigned int len = head_skb->len;
        struct sk_buff *frag_skb;
        skb_frag_t *frag;
        __be16 proto;
        bool csum, sg;
        int err = -ENOMEM;
        int i = 0;
        int nfrags, pos;

        if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
            mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
                struct sk_buff *check_skb;

                for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
                        if (skb_headlen(check_skb) && !check_skb->head_frag) {
                                /* gso_size is untrusted, and we have a frag_list with
                                 * a linear non head_frag item.
                                 *
                                 * If head_skb's headlen does not fit requested gso_size,
                                 * it means that the frag_list members do NOT terminate
                                 * on exact gso_size boundaries. Hence we cannot perform
                                 * skb_frag_t page sharing. Therefore we must fallback to
                                 * copying the frag_list skbs; we do so by disabling SG.
                                 */
                                features &= ~NETIF_F_SG;
                                break;
                        }
                }
        }

        __skb_push(head_skb, doffset);
        proto = skb_network_protocol(head_skb, NULL);
        if (unlikely(!proto))
                return ERR_PTR(-EINVAL);

        sg = !!(features & NETIF_F_SG);
        csum = !!can_checksum_protocol(features, proto);

        if (sg && csum && (mss != GSO_BY_FRAGS))  {
                if (!(features & NETIF_F_GSO_PARTIAL)) {
                        struct sk_buff *iter;
                        unsigned int frag_len;

                        if (!list_skb ||
                            !net_gso_ok(features, skb_shinfo(head_skb)->gso_type))
                                goto normal;

                        /* If we get here then all the required
                         * GSO features except frag_list are supported.
                         * Try to split the SKB to multiple GSO SKBs
                         * with no frag_list.
                         * Currently we can do that only when the buffers don't
                         * have a linear part and all the buffers except
                         * the last are of the same length.
                         */
                        frag_len = list_skb->len;
                        skb_walk_frags(head_skb, iter) {
                                if (frag_len != iter->len && iter->next)
                                        goto normal;
                                if (skb_headlen(iter) && !iter->head_frag)
                                        goto normal;

                                len -= iter->len;
                        }

                        if (len != frag_len)
                                goto normal;
                }

                /* GSO partial only requires that we trim off any excess that
                 * doesn't fit into an MSS sized block, so take care of that
                 * now.
                 * Cap len to not accidentally hit GSO_BY_FRAGS.
                 */
                partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
                if (partial_segs > 1)
                        mss *= partial_segs;
                else
                        partial_segs = 0;
        }

normal:
        headroom = skb_headroom(head_skb);
        pos = skb_headlen(head_skb);

        if (skb_orphan_frags(head_skb, GFP_ATOMIC))
                return ERR_PTR(-ENOMEM);

        nfrags = skb_shinfo(head_skb)->nr_frags;
        frag = skb_shinfo(head_skb)->frags;
        frag_skb = head_skb;

        do {
                struct sk_buff *nskb;
                skb_frag_t *nskb_frag;
                int hsize;
                int size;

                if (unlikely(mss == GSO_BY_FRAGS)) {
                        len = list_skb->len;
                } else {
                        len = head_skb->len - offset;
                        if (len > mss)
                                len = mss;
                }

                hsize = skb_headlen(head_skb) - offset;

                if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
                    (skb_headlen(list_skb) == len || sg)) {
                        BUG_ON(skb_headlen(list_skb) > len);

                        nskb = skb_clone(list_skb, GFP_ATOMIC);
                        if (unlikely(!nskb))
                                goto err;

                        i = 0;
                        nfrags = skb_shinfo(list_skb)->nr_frags;
                        frag = skb_shinfo(list_skb)->frags;
                        frag_skb = list_skb;
                        pos += skb_headlen(list_skb);

                        while (pos < offset + len) {
                                BUG_ON(i >= nfrags);

                                size = skb_frag_size(frag);
                                if (pos + size > offset + len)
                                        break;

                                i++;
                                pos += size;
                                frag++;
                        }

                        list_skb = list_skb->next;

                        if (unlikely(pskb_trim(nskb, len))) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        hsize = skb_end_offset(nskb);
                        if (skb_cow_head(nskb, doffset + headroom)) {
                                kfree_skb(nskb);
                                goto err;
                        }

                        nskb->truesize += skb_end_offset(nskb) - hsize;
                        skb_release_head_state(nskb);
                        __skb_push(nskb, doffset);
                } else {
                        if (hsize < 0)
                                hsize = 0;
                        if (hsize > len || !sg)
                                hsize = len;

                        nskb = __alloc_skb(hsize + doffset + headroom,
                                           GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
                                           NUMA_NO_NODE);

                        if (unlikely(!nskb))
                                goto err;

                        skb_reserve(nskb, headroom);
                        __skb_put(nskb, doffset);
                }

                if (segs)
                        tail->next = nskb;
                else
                        segs = nskb;
                tail = nskb;

                __copy_skb_header(nskb, head_skb);

                skb_headers_offset_update(nskb, skb_headroom(nskb) - headroom);
                skb_reset_mac_len(nskb);

                skb_copy_from_linear_data_offset(head_skb, -tnl_hlen,
                                                 nskb->data - tnl_hlen,
                                                 doffset + tnl_hlen);

                if (nskb->len == len + doffset)
                        goto perform_csum_check;

                if (!sg) {
                        if (!csum) {
                                if (!nskb->remcsum_offload)
                                        nskb->ip_summed = CHECKSUM_NONE;
                                SKB_GSO_CB(nskb)->csum =
                                        skb_copy_and_csum_bits(head_skb, offset,
                                                               skb_put(nskb,
                                                                       len),
                                                               len);
                                SKB_GSO_CB(nskb)->csum_start =
                                        skb_headroom(nskb) + doffset;
                        } else {
                                if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
                                        goto err;
                        }
                        continue;
                }

                nskb_frag = skb_shinfo(nskb)->frags;

                skb_copy_from_linear_data_offset(head_skb, offset,
                                                 skb_put(nskb, hsize), hsize);

                skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
                                           SKBFL_SHARED_FRAG;

                if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
                        goto err;

                while (pos < offset + len) {
                        if (i >= nfrags) {
                                if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
                                    skb_zerocopy_clone(nskb, list_skb,
                                                       GFP_ATOMIC))
                                        goto err;

                                i = 0;
                                nfrags = skb_shinfo(list_skb)->nr_frags;
                                frag = skb_shinfo(list_skb)->frags;
                                frag_skb = list_skb;
                                if (!skb_headlen(list_skb)) {
                                        BUG_ON(!nfrags);
                                } else {
                                        BUG_ON(!list_skb->head_frag);

                                        /* to make room for head_frag. */
                                        i--;
                                        frag--;
                                }

                                list_skb = list_skb->next;
                        }

                        if (unlikely(skb_shinfo(nskb)->nr_frags >=
                                     MAX_SKB_FRAGS)) {
                                net_warn_ratelimited(
                                        "skb_segment: too many frags: %u %u\n",
                                        pos, mss);
                                err = -EINVAL;
                                goto err;
                        }

                        *nskb_frag = (i < 0) ? skb_head_frag_to_page_desc(frag_skb) : *frag;
                        __skb_frag_ref(nskb_frag);
                        size = skb_frag_size(nskb_frag);

                        if (pos < offset) {
                                skb_frag_off_add(nskb_frag, offset - pos);
                                skb_frag_size_sub(nskb_frag, offset - pos);
                        }

                        skb_shinfo(nskb)->nr_frags++;

                        if (pos + size <= offset + len) {
                                i++;
                                frag++;
                                pos += size;
                        } else {
                                skb_frag_size_sub(nskb_frag, pos + size - (offset + len));
                                goto skip_fraglist;
                        }

                        nskb_frag++;
                }

skip_fraglist:
                nskb->data_len = len - hsize;
                nskb->len += nskb->data_len;
                nskb->truesize += nskb->data_len;

perform_csum_check:
                if (!csum) {
                        if (skb_has_shared_frag(nskb) &&
                            __skb_linearize(nskb))
                                goto err;

                        if (!nskb->remcsum_offload)
                                nskb->ip_summed = CHECKSUM_NONE;
                        SKB_GSO_CB(nskb)->csum =
                                skb_checksum(nskb, doffset,
                                             nskb->len - doffset, 0);
                        SKB_GSO_CB(nskb)->csum_start =
                                skb_headroom(nskb) + doffset;
                }
        } while ((offset += len) < head_skb->len);

        /* Some callers want to get the end of the list.
         * Put it in segs->prev to avoid walking the list.
         * (see validate_xmit_skb_list() for example)
         */
        segs->prev = tail;

        if (partial_segs) {
                struct sk_buff *iter;
                int type = skb_shinfo(head_skb)->gso_type;
                unsigned short gso_size = skb_shinfo(head_skb)->gso_size;

                /* Update type to add partial and then remove dodgy if set */
                type |= (features & NETIF_F_GSO_PARTIAL) / NETIF_F_GSO_PARTIAL * SKB_GSO_PARTIAL;
                type &= ~SKB_GSO_DODGY;

                /* Update GSO info and prepare to start updating headers on
                 * our way back down the stack of protocols.
                 */
                for (iter = segs; iter; iter = iter->next) {
                        skb_shinfo(iter)->gso_size = gso_size;
                        skb_shinfo(iter)->gso_segs = partial_segs;
                        skb_shinfo(iter)->gso_type = type;
                        SKB_GSO_CB(iter)->data_offset = skb_headroom(iter) + doffset;
                }

                if (tail->len - doffset <= gso_size)
                        skb_shinfo(tail)->gso_size = 0;
                else if (tail != segs)
                        skb_shinfo(tail)->gso_segs = DIV_ROUND_UP(tail->len - doffset, gso_size);
        }

        /* Following permits correct backpressure, for protocols
         * using skb_set_owner_w().
         * Idea is to tranfert ownership from head_skb to last segment.
         */
        if (head_skb->destructor == sock_wfree) {
                swap(tail->truesize, head_skb->truesize);
                swap(tail->destructor, head_skb->destructor);
                swap(tail->sk, head_skb->sk);
        }
        return segs;

err:
        kfree_skb_list(segs);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(skb_segment);

#ifdef CONFIG_SKB_EXTENSIONS
/* Extension sizes are accounted in chunks of SKB_EXT_ALIGN_VALUE bytes. */
#define SKB_EXT_ALIGN_VALUE        8
/* sizeof(x) rounded up to SKB_EXT_ALIGN_VALUE, expressed as a chunk count. */
#define SKB_EXT_CHUNKSIZEOF(x)        (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)

/* Length of each extension payload in SKB_EXT_ALIGN_VALUE chunks, indexed by
 * extension id. An entry is only present when the matching config option is
 * enabled; each value has to fit in a u8.
 */
static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
        [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
#endif
#ifdef CONFIG_XFRM
        [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
#endif
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
#endif
#if IS_ENABLED(CONFIG_MPTCP)
        [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
#endif
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
        [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
};

/* Total size, in SKB_EXT_ALIGN_VALUE chunks, of a struct skb_ext header
 * followed by every extension listed in skb_ext_type_len[].
 *
 * The result is also used inside a BUILD_BUG_ON(), so the loop is expected
 * to be folded to a constant by the compiler.
 */
static __always_inline unsigned int skb_ext_total_length(void)
{
        /* Start with the header itself, then add each extension payload. */
        unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
        int i;

        for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
                l += skb_ext_type_len[i];

        return l;
}

/* Create the slab cache backing skb extensions, sized so that a single
 * object can hold the header plus every enabled extension.
 */
static void skb_extensions_init(void)
{
        unsigned int cache_bytes;

        /* NOTE(review): presumably the set of active extensions is tracked
         * in an 8-bit mask and the total chunk count in a u8 - confirm
         * against struct skb_ext.
         */
        BUILD_BUG_ON(SKB_EXT_NUM >= 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
        BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif

        /* Convert the chunk count back into bytes for the allocator. */
        cache_bytes = SKB_EXT_ALIGN_VALUE * skb_ext_total_length();

        skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
                                             cache_bytes,
                                             0,
                                             SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                             NULL);
}
#else
/* CONFIG_SKB_EXTENSIONS is disabled: there is no extension cache to create. */
static void skb_extensions_init(void) {}
#endif

/* The SKB kmem_cache slab is critical for network performance.  Never
 * merge/alias the slab with similar sized objects.  This avoids fragmentation
 * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
 */
#ifndef CONFIG_SLUB_TINY
#define FLAG_SKB_NO_MERGE        SLAB_NO_MERGE
#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
#define FLAG_SKB_NO_MERGE        0
#endif

/* Boot-time creation of the slab caches stored in net_hotdata (sk_buff heads,
 * struct sk_buff_fclones objects and small skb heads), followed by the skb
 * extension cache. All caches use SLAB_PANIC.
 */
void __init skb_init(void)
{
        /* The only part of struct sk_buff exposed to usercopy is cb[]. */
        unsigned int cb_offset = offsetof(struct sk_buff, cb);
        unsigned int cb_size = sizeof_field(struct sk_buff, cb);

        net_hotdata.skbuff_cache =
                kmem_cache_create_usercopy("skbuff_head_cache",
                                           sizeof(struct sk_buff),
                                           0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                           FLAG_SKB_NO_MERGE,
                                           cb_offset,
                                           cb_size,
                                           NULL);

        net_hotdata.skbuff_fclone_cache =
                kmem_cache_create("skbuff_fclone_cache",
                                  sizeof(struct sk_buff_fclones),
                                  0,
                                  SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                  NULL);

        /* For small heads, usercopy is restricted to the first
         * SKB_SMALL_HEAD_HEADROOM bytes: struct skb_shared_info sits at the
         * end of skb->head and must never be copied to/from user space.
         */
        net_hotdata.skb_small_head_cache =
                kmem_cache_create_usercopy("skbuff_small_head",
                                           SKB_SMALL_HEAD_CACHE_SIZE,
                                           0,
                                           SLAB_HWCACHE_ALIGN | SLAB_PANIC,
                                           0,
                                           SKB_SMALL_HEAD_HEADROOM,
                                           NULL);

        skb_extensions_init();
}

/* Map @len bytes of @skb, starting at byte @offset, into the scatterlist @sg
 * without marking the end of the list.
 *
 * The linear head, then the page frags, then every skb on the frag list are
 * visited in that order; frag-list skbs are handled by recursing with
 * @recursion_level + 1.
 *
 * Returns the number of scatterlist entries filled, or -EMSGSIZE when the
 * recursion depth reaches 24 or when more data remains but the previously
 * filled entry is already flagged as the last one of @sg. BUG()s if the walk
 * ends with bytes still unmapped.
 */
static int
__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
               unsigned int recursion_level)
{
        /* [start, end) is the byte range of the piece currently examined;
         * it begins with the linear head, i.e. [0, headlen).
         */
        int start = skb_headlen(skb);
        int i, copy = start - offset;
        struct sk_buff *frag_iter;
        int elt = 0;

        /* Bound the frag_list nesting depth. */
        if (unlikely(recursion_level >= 24))
                return -EMSGSIZE;

        /* Part of the requested range lives in the linear head. */
        if (copy > 0) {
                if (copy > len)
                        copy = len;
                sg_set_buf(sg, skb->data + offset, copy);
                elt++;
                if ((len -= copy) == 0)
                        return elt;
                offset += copy;
        }

        /* Page fragments: one scatterlist entry per overlapping frag. */
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                int end;

                WARN_ON(start > offset + len);

                end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
                if ((copy = end - offset) > 0) {
                        skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
                        /* The caller's table is exhausted. */
                        if (unlikely(elt && sg_is_last(&sg[elt - 1])))
                                return -EMSGSIZE;

                        if (copy > len)
                                copy = len;
                        /* offset - start is how far into this frag we begin. */
                        sg_set_page(&sg[elt], skb_frag_page(frag), copy,
                                    skb_frag_off(frag) + offset - start);
                        elt++;
                        if (!(len -= copy))
                                return elt;
                        offset += copy;
                }
                start = end;
        }

        /* Chained skbs: recurse, continuing to fill @sg at index elt. */
        skb_walk_frags(skb, frag_iter) {
                int end, ret;

                WARN_ON(start > offset + len);

                end = start + frag_iter->len;
                if ((copy = end - offset) > 0) {
                        if (unlikely(elt && sg_is_last(&sg[elt - 1])))
                                return -EMSGSIZE;

                        if (copy > len)
                                copy = len;
                        ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
                                              copy, recursion_level + 1);
                        if (unlikely(ret < 0))
                                return ret;
                        elt += ret;
                        if ((len -= copy) == 0)
                                return elt;
                        offset += copy;
                }
                start = end;
        }
        /* Every requested byte must have been mapped by now. */
        BUG_ON(len);
        return elt;
}

/**
 *        skb_to_sgvec - Fill a scatter-gather list from a socket buffer
 *        @skb: Socket buffer containing the buffers to be mapped
 *        @sg: The scatter-gather list to map into
 *        @offset: The offset into the buffer's contents to start mapping
 *        @len: Length of buffer space to be mapped
 *
 *        Fill the specified scatter-gather list with mappings/pointers into a
 *        region of the buffer space attached to a socket buffer. Returns either
 *        the number of scatterlist items used, or -EMSGSIZE if the contents
 *        could not fit.
 */
int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
{
        int used = __skb_to_sgvec(skb, sg, offset, len, 0);

        /* Terminate the list only when at least one entry was filled in;
         * errors and an empty mapping are passed through untouched.
         */
        if (used > 0)
                sg_mark_end(&sg[used - 1]);

        return used;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec);

/* Compared with skb_to_sgvec, skb_to_sgvec_nomark only maps the skb to the
 * given sglist without marking the sg that contains the last skb data as the
 * end. So the caller can manipulate the sg list at will when appending new
 * data after the first call, without calling sg_unmark_end to extend the
 * sg list.
 *
 * Scenario to use skb_to_sgvec_nomark:
 * 1. sg_init_table
 * 2. skb_to_sgvec_nomark(payload1)
 * 3. skb_to_sgvec_nomark(payload2)
 *
 * This is equivalent to:
 * 1. sg_init_table
 * 2. skb_to_sgvec(payload1)
 * 3. sg_unmark_end
 * 4. skb_to_sgvec(payload2)
 *
 * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
 * is preferable.
 */
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
                        int offset, int len)
{
        int nsg;

        /* Top-level call: the frag_list recursion depth starts at zero. */
        nsg = __skb_to_sgvec(skb, sg, offset, len, 0);

        return nsg;
}
EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);



/**
 *        skb_cow_data - Check that a socket buffer's data buffers are writable
 *        @skb: The socket buffer to check.
 *        @tailbits: Amount of trailing space to be added
 *        @trailer: Returned pointer to the skb where the @tailbits space begins
 *
 *        Make sure that the data buffers attached to a socket buffer are
 *        writable. If they are not, private copies are made of the data buffers
 *        and the socket buffer is set to use these instead.
 *
 *        If @tailbits is given, make sure that there is space to write @tailbits
 *        bytes of data beyond current end of socket buffer.  @trailer will be
 *        set to point to the skb in which this space begins.
 *
 *        The number of scatterlist elements required to completely map the
 *        COW'd and extended socket buffer will be returned.
 */
int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
{
        int copyflag;
        int elt;
        struct sk_buff *skb1, **skb_p;

        /* If skb is cloned or its head is paged, reallocate
         * head pulling out all the pages (pages are considered not writable
         * at the moment even if they are anonymous).
         */
        if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
            !__pskb_pull_tail(skb, __skb_pagelen(skb)))
                return -ENOMEM;

        /* Easy case: no frag list. Most packets go this way. */
        if (!skb_has_frag_list(skb)) {
                /* Not enough tailroom for the trailer. This should not
                 * happen when the stack is tuned to generate good frames.
                 * On a miss, reallocate and reserve 128 bytes more than
                 * strictly needed.
                 */

                if (skb_tailroom(skb) < tailbits &&
                    pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC))
                        return -ENOMEM;

                /* The head skb itself carries the trailer space. */
                *trailer = skb;
                return 1;
        }

        /* Hard case: walk the frag list and replace every fragment that
         * cannot be written in place.
         */

        /* The head skb counts as the first element. */
        elt = 1;
        skb_p = &skb_shinfo(skb)->frag_list;
        copyflag = 0;

        while ((skb1 = *skb_p) != NULL) {
                int ntail = 0;

                /* A shared fragment (e.g. partially pulled by someone on
                 * input) forces a copy of it and of everything after it:
                 * copyflag stays set for the rest of the walk.
                 */

                if (skb_shared(skb1))
                        copyflag = 1;

                /* If the skb is the last, worry about trailer. */

                if (skb1->next == NULL && tailbits) {
                        if (skb_shinfo(skb1)->nr_frags ||
                            skb_has_frag_list(skb1) ||
                            skb_tailroom(skb1) < tailbits)
                                ntail = tailbits + 128;
                }

                if (copyflag ||
                    skb_cloned(skb1) ||
                    ntail ||
                    skb_shinfo(skb1)->nr_frags ||
                    skb_has_frag_list(skb1)) {
                        struct sk_buff *skb2;

                        /* This fragment cannot be used as is: make a private
                         * copy, with ntail bytes of extra tailroom when it
                         * has to hold the trailer.
                         */
                        if (ntail == 0)
                                skb2 = skb_copy(skb1, GFP_ATOMIC);
                        else
                                skb2 = skb_copy_expand(skb1,
                                                       skb_headroom(skb1),
                                                       ntail,
                                                       GFP_ATOMIC);
                        if (unlikely(skb2 == NULL))
                                return -ENOMEM;

                        /* Keep the copy charged to the same socket. */
                        if (skb1->sk)
                                skb_set_owner_w(skb2, skb1->sk);

                        /* Splice the copy into the list in place of the
                         * original, then drop the original.
                         */

                        skb2->next = skb1->next;
                        *skb_p = skb2;
                        kfree_skb(skb1);
                        skb1 = skb2;
                }
                elt++;
                /* After the loop this points at the last fragment. */
                *trailer = skb1;
                skb_p = &skb1->next;
        }

        return elt;
}
EXPORT_SYMBOL_GPL(skb_cow_data);

/* Give the skb's truesize back to the owning socket's sk_rmem_alloc counter. */
static void sock_rmem_free(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc);
}

/* Tag @skb as belonging to a socket error queue. */
static void skb_set_err_queue(struct sk_buff *skb)
{
        /* Build-time guarantee that the marker value is non-zero. */
        BUILD_BUG_ON(PACKET_OUTGOING == 0);

        /* skbs received on local sockets never carry PACKET_OUTGOING as
         * their pkt_type, so the value can safely be (mis)used as the
         * error-queue marker.
         */
        skb->pkt_type = PACKET_OUTGOING;
}

/*
 * Note: We dont mem charge error packets (no sk_forward_alloc changes)
 */
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
        unsigned int limit = (unsigned int)READ_ONCE(sk->sk_rcvbuf);
        unsigned int wanted = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;

        /* Refuse the skb when it would reach or exceed the receive budget. */
        if (wanted >= limit)
                return -ENOMEM;

        /* Detach from the previous owner and charge the skb to @sk. */
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = sock_rmem_free;
        atomic_add(skb->truesize, &sk->sk_rmem_alloc);
        skb_set_err_queue(skb);

        /* before exiting rcu section, make sure dst is refcounted */
        skb_dst_force(skb);

        skb_queue_tail(&sk->sk_error_queue, skb);

        /* Nobody to notify once the socket is dead. */
        if (sock_flag(sk, SOCK_DEAD))
                return 0;

        sk_error_report(sk);
        return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);

static bool is_icmp_err_skb(const struct sk_buff *skb)
{
        return skb && (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP ||
                       SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ICMP6);
}

struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
{
        struct sk_buff_head *q = &sk->sk_error_queue;
        struct sk_buff *skb, *skb_next = NULL;
        bool icmp_next = false;
        unsigned long flags;

        if (skb_queue_empty_lockless(q))
                return NULL;

        spin_lock_irqsave(&q->lock, flags);
        skb = __skb_dequeue(q);
        if (skb && (skb_next = skb_peek(q))) {
                icmp_next = is_icmp_err_skb(skb_next);
                if (icmp_next)
                        sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
        }
        spin_unlock_irqrestore(&q->lock, flags);

        if (is_icmp_err_skb(skb) && !icmp_next)
                sk->sk_err = 0;

        if (skb_next)
                sk_error_report(sk);

        return skb;
}
EXPORT_SYMBOL(sock_dequeue_err_skb);

/**
 * skb_clone_sk - create clone of skb, and take reference to socket
 * @skb: the skb to clone
 *
 * This function creates a clone of a buffer that holds a reference on
 * sk_refcnt.  Buffers created via this function are meant to be
 * returned using sock_queue_err_skb, or free via kfree_skb.
 *
 * When passing buffers allocated with this function to sock_queue_err_skb
 * it is necessary to wrap the call with sock_hold/sock_put in order to
 * prevent the socket from being released prior to being enqueued on
 * the sk_error_queue.
 */
struct sk_buff *skb_clone_sk(struct sk_buff *skb)
{
        struct sk_buff *clone;
        struct sock *sk = skb->sk;

        /* No socket, or one whose refcount already dropped to zero. */
        if (!sk)
                return NULL;
        if (!refcount_inc_not_zero(&sk->sk_refcnt))
                return NULL;

        clone = skb_clone(skb, GFP_ATOMIC);
        if (clone) {
                /* The clone now owns the reference taken above. */
                clone->sk = sk;
                clone->destructor = sock_efree;
                return clone;
        }

        /* Cloning failed: give the socket reference back. */
        sock_put(sk);
        return NULL;
}
EXPORT_SYMBOL(skb_clone_sk);

static void __skb_complete_tx_timestamp(struct sk_buff *skb,
                                        struct sock *sk,
                                        int tstype,
                                        bool opt_stats)
{
        struct sock_exterr_skb *serr;
        int err;

        BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));

        serr = SKB_EXT_ERR(skb);
        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
        serr->ee.ee_info = tstype;
        serr->opt_stats = opt_stats;
        serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
        if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
                serr->ee.ee_data = skb_shinfo(skb)->tskey;
                if (sk_is_tcp(sk))
                        serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
        }

        err = sock_queue_err_skb(sk, skb);

        if (err)
                kfree_skb(skb);
}

/* Decide whether a tx timestamp may be delivered to @sk. Always allowed for
 * timestamp-only reports or when the sysctl_tstamp_allow_data knob is set;
 * otherwise the socket's file must hold CAP_NET_RAW in the initial user
 * namespace.
 */
static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
        bool allowed = false;

        if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
                return true;

        /* sk_socket and its file are only inspected under sk_callback_lock. */
        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket && sk->sk_socket->file)
                allowed = file_ns_capable(sk->sk_socket->file, &init_user_ns,
                                          CAP_NET_RAW);
        read_unlock_bh(&sk->sk_callback_lock);

        return allowed;
}

/* Deliver the hardware timestamps in @hwtstamps for @skb to its socket's
 * error queue as an SCM_TSTAMP_SND report. @skb is consumed in all cases: it
 * is either queued or freed.
 */
void skb_complete_tx_timestamp(struct sk_buff *skb,
                               struct skb_shared_hwtstamps *hwtstamps)
{
        struct sock *sk = skb->sk;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (skb_may_tx_timestamp(sk, false) &&
            likely(refcount_inc_not_zero(&sk->sk_refcnt))) {
                *skb_hwtstamps(skb) = *hwtstamps;
                __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
                sock_put(sk);
                return;
        }

        /* Not permitted, or the socket is already going away. */
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);

/* Tell whether @skb asked for a socket-level timestamp report of type
 * @tstype, by testing the request flag that matches that type. Unknown
 * types are never reported.
 */
static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
                                                 struct skb_shared_hwtstamps *hwtstamps,
                                                 int tstype)
{
        if (tstype == SCM_TSTAMP_SCHED)
                return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;

        if (tstype == SCM_TSTAMP_SND) {
                /* Hardware and software send stamps use different flags. */
                if (hwtstamps)
                        return skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF;
                return skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP;
        }

        if (tstype == SCM_TSTAMP_ACK)
                return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;

        if (tstype == SCM_TSTAMP_COMPLETION)
                return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;

        return false;
}

/* Translate @tstype into the matching BPF sock_ops timestamping callback and
 * invoke it for @sk/@skb. Types without a BPF callback are ignored.
 */
static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
                                                  struct skb_shared_hwtstamps *hwtstamps,
                                                  struct sock *sk,
                                                  int tstype)
{
        int op;

        if (tstype == SCM_TSTAMP_SCHED) {
                op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
        } else if (tstype == SCM_TSTAMP_SND) {
                if (!hwtstamps) {
                        op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
                } else {
                        /* Store the hardware stamps in the skb before the
                         * callback runs.
                         */
                        op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
                        *skb_hwtstamps(skb) = *hwtstamps;
                }
        } else if (tstype == SCM_TSTAMP_ACK) {
                op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
        } else {
                return;
        }

        bpf_skops_tx_timestamping(sk, skb, op);
}

/* Report a transmit timestamp of type @tstype for @orig_skb.
 *
 * @orig_skb:  the packet being timestamped; it is not consumed.
 * @ack_skb:   passed through to tcp_get_timestamping_opt_stats() when TCP
 *             opt-stats are requested; may be NULL.
 * @hwtstamps: hardware timestamps, or NULL to use a software timestamp taken
 *             here.
 * @sk:        socket to report to; nothing is done when NULL.
 * @tstype:    one of the SCM_TSTAMP_* values.
 *
 * The BPF callback (if SKBTX_BPF is set) runs first. The socket-level report
 * is then built on a fresh skb - empty or opt-stats for timestamp-only mode,
 * a clone of @orig_skb otherwise - and queued on @sk's error queue. Any
 * allocation failure silently drops the report.
 */
void __skb_tstamp_tx(struct sk_buff *orig_skb,
                     const struct sk_buff *ack_skb,
                     struct skb_shared_hwtstamps *hwtstamps,
                     struct sock *sk, int tstype)
{
        struct sk_buff *skb;
        bool tsonly, opt_stats = false;
        u32 tsflags;

        if (!sk)
                return;

        if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
                skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
                                                      sk, tstype);

        if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
                return;

        /* Skip the software stamp while SKBTX_IN_PROGRESS is set, unless the
         * socket opted in to both via SOF_TIMESTAMPING_OPT_TX_SWHW.
         */
        tsflags = READ_ONCE(sk->sk_tsflags);
        if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
            skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
                return;

        tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
        if (!skb_may_tx_timestamp(sk, tsonly))
                return;

        if (tsonly) {
#ifdef CONFIG_INET
                if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
                    sk_is_tcp(sk)) {
                        skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
                                                             ack_skb);
                        opt_stats = true;
                } else
#endif
                        skb = alloc_skb(0, GFP_ATOMIC);
        } else {
                skb = skb_clone(orig_skb, GFP_ATOMIC);
                /* Bail out on clone failure before touching the clone, so
                 * the helpers below are never handed a NULL skb.
                 */
                if (!skb)
                        return;

                if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
                        kfree_skb(skb);
                        return;
                }
        }
        if (!skb)
                return;

        /* A timestamp-only skb carries no packet data, so copy over the
         * timestamp request flags and the key from the original.
         */
        if (tsonly) {
                skb_shinfo(skb)->tx_flags |= skb_shinfo(orig_skb)->tx_flags &
                                             SKBTX_ANY_TSTAMP;
                skb_shinfo(skb)->tskey = skb_shinfo(orig_skb)->tskey;
        }

        if (hwtstamps)
                *skb_hwtstamps(skb) = *hwtstamps;
        else
                __net_timestamp(skb);

        __skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
EXPORT_SYMBOL_GPL(__skb_tstamp_tx);

/* Report an SCM_TSTAMP_SND timestamp for @orig_skb to the socket that owns
 * it. @hwtstamps may be NULL, in which case no hardware timestamp is passed
 * on to __skb_tstamp_tx().
 */
void skb_tstamp_tx(struct sk_buff *orig_skb,
                   struct skb_shared_hwtstamps *hwtstamps)
{
        /* Plain call: a void function must not return an expression. */
        __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
                        SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);

#ifdef CONFIG_WIRELESS
/* Record the wifi ACK status @acked in @skb and queue it on the owning
 * socket's error queue as a SO_EE_ORIGIN_TXSTATUS report. @skb is consumed:
 * it is freed when it cannot be queued.
 */
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
        struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
        struct sock *sk = skb->sk;
        int err;

        skb->wifi_acked_valid = 1;
        skb->wifi_acked = acked;

        memset(serr, 0, sizeof(*serr));
        serr->ee.ee_errno = ENOMSG;
        serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;

        /* Take a reference to prevent skb_orphan() from freeing the socket,
         * but only if the socket refcount is not zero.
         */
        if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) {
                kfree_skb(skb);
                return;
        }

        err = sock_queue_err_skb(sk, skb);
        sock_put(sk);
        if (err)
                kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
#endif /* CONFIG_WIRELESS */

/**
 * skb_partial_csum_set - set up and verify partial csum values for packet
 * @skb: the skb to set
 * @start: the number of bytes after skb->data to start checksumming.
 * @off: the offset from start to place the checksum.
 *
 * For untrusted partially-checksummed packets, we need to make sure the values
 * for skb->csum_start and skb->csum_offset are valid so we don't oops.
 *
 * This function checks and sets those values and skb->ip_summed: if this
 * returns false you should drop the packet.
 */
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
        u32 csum_start = skb_headroom(skb) + (u32)start;
        u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
        /* The start offset must stay below U16_MAX and the checksum field
         * must lie entirely inside the linear head.
         */
        bool in_range = csum_start < U16_MAX && csum_end <= skb_headlen(skb);

        if (unlikely(!in_range)) {
                net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
                                     start, off, skb_headroom(skb), skb_headlen(skb));
                return false;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum_start = csum_start;
        skb->csum_offset = off;
        /* The transport header is placed where checksumming starts. */
        skb->transport_header = csum_start;

        return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);

/* Make sure at least @len bytes of @skb are in the linear head.
 *
 * When a pull is needed, pull up to @max bytes (capped at skb->len) so that
 * later calls are less likely to pull again. Returns 0 on success, -ENOMEM
 * if the pull fails and -EPROTO if the head is still shorter than @len
 * afterwards.
 */
static int skb_maybe_pull_tail(struct sk_buff *skb, unsigned int len,
                               unsigned int max)
{
        unsigned int pull_to;

        if (len <= skb_headlen(skb))
                return 0;

        pull_to = max > skb->len ? skb->len : max;

        if (!__pskb_pull_tail(skb, pull_to - skb_headlen(skb)))
                return -ENOMEM;

        return skb_headlen(skb) < len ? -EPROTO : 0;
}

#define MAX_TCP_HDR_LEN (15 * 4)

/* Prepare partial checksumming for the TCP or UDP header located @off bytes
 * into @skb.
 *
 * Returns a pointer to the transport header's checksum field, or an ERR_PTR:
 * the error from skb_maybe_pull_tail(), or -EPROTO when the partial checksum
 * values are rejected or @proto is neither TCP nor UDP.
 */
static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
                                      typeof(IPPROTO_IP) proto,
                                      unsigned int off)
{
        unsigned int need, pull_max, csum_off;
        int err;

        switch (proto) {
        case IPPROTO_TCP:
                need = off + sizeof(struct tcphdr);
                pull_max = off + MAX_TCP_HDR_LEN;
                csum_off = offsetof(struct tcphdr, check);
                break;
        case IPPROTO_UDP:
                need = off + sizeof(struct udphdr);
                pull_max = off + sizeof(struct udphdr);
                csum_off = offsetof(struct udphdr, check);
                break;
        default:
                return ERR_PTR(-EPROTO);
        }

        err = skb_maybe_pull_tail(skb, need, pull_max);
        if (err)
                return ERR_PTR(err);

        if (!skb_partial_csum_set(skb, off, csum_off))
                return ERR_PTR(-EPROTO);

        if (proto == IPPROTO_TCP)
                return &tcp_hdr(skb)->check;

        return &udp_hdr(skb)->check;
}

/* This value should be large enough to cover a tagged ethernet header plus
 * maximally sized IP and TCP or UDP headers.
 */
#define MAX_IP_HDR_LEN 128

/* Set up partial checksumming for the TCP/UDP payload of an IPv4 packet.
 *
 * Returns 0 on success or a negative errno; IP fragments are rejected with
 * -EPROTO. When @recalculate is set, the checksum field is seeded with the
 * complemented pseudo-header sum.
 */
static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
{
        unsigned int off;
        __sum16 *csum;
        int err;

        err = skb_maybe_pull_tail(skb, sizeof(struct iphdr), MAX_IP_HDR_LEN);
        if (err < 0)
                return err;

        if (ip_is_fragment(ip_hdr(skb)))
                return -EPROTO;

        off = ip_hdrlen(skb);

        /* ip_hdr(skb) is re-evaluated at every use rather than cached.
         * NOTE(review): presumably because pulling the tail can move the
         * header - confirm against __pskb_pull_tail().
         */
        csum = skb_checksum_setup_ip(skb, ip_hdr(skb)->protocol, off);
        if (IS_ERR(csum))
                return PTR_ERR(csum);

        if (!recalculate)
                return 0;

        *csum = ~csum_tcpudp_magic(ip_hdr(skb)->saddr,
                                   ip_hdr(skb)->daddr,
                                   skb->len - off,
                                   ip_hdr(skb)->protocol, 0);
        return 0;
}

/* Upper bound handed to skb_maybe_pull_tail() for IPv6 packets.  It has to be
 * big enough to hold a tagged ethernet header, an IPv6 header with all of its
 * options, and the largest possible TCP or UDP header.
 */
#define MAX_IPV6_HDR_LEN 256

/* View the bytes located @off past the network header of @skb as a @type. */
#define OPT_HDR(type, skb, off) \
        (type *)(skb_network_header(skb) + (off))

/* Set up the partial checksum offset for an IPv6 packet.
 *
 * Walks the chain of extension headers (destination/hop-by-hop/routing
 * options, AH, fragment) until a next-header value that is none of those is
 * found.  Packets flagged as fragments, or whose chain runs past the payload
 * length before such a value is seen, are refused with -EPROTO.
 *
 * Returns 0 on success, -errno otherwise.
 */
static int skb_checksum_setup_ipv6(struct sk_buff *skb, bool recalculate)
{
        unsigned int l4_off = sizeof(struct ipv6hdr);
        unsigned int pkt_len;
        bool is_fragment = false;
        bool found_l4 = false;
        __sum16 *check;
        u8 proto;
        int ret;

        ret = skb_maybe_pull_tail(skb, l4_off, MAX_IPV6_HDR_LEN);
        if (ret < 0)
                return ret;

        proto = ipv6_hdr(skb)->nexthdr;
        pkt_len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);

        while (l4_off <= pkt_len && !found_l4) {
                switch (proto) {
                case IPPROTO_DSTOPTS:
                case IPPROTO_HOPOPTS:
                case IPPROTO_ROUTING: {
                        struct ipv6_opt_hdr *opt;

                        ret = skb_maybe_pull_tail(skb,
                                                  l4_off + sizeof(*opt),
                                                  MAX_IPV6_HDR_LEN);
                        if (ret < 0)
                                return ret;

                        opt = OPT_HDR(struct ipv6_opt_hdr, skb, l4_off);
                        proto = opt->nexthdr;
                        l4_off += ipv6_optlen(opt);
                        break;
                }
                case IPPROTO_AH: {
                        struct ip_auth_hdr *ah;

                        ret = skb_maybe_pull_tail(skb,
                                                  l4_off + sizeof(*ah),
                                                  MAX_IPV6_HDR_LEN);
                        if (ret < 0)
                                return ret;

                        ah = OPT_HDR(struct ip_auth_hdr, skb, l4_off);
                        proto = ah->nexthdr;
                        l4_off += ipv6_authlen(ah);
                        break;
                }
                case IPPROTO_FRAGMENT: {
                        struct frag_hdr *fh;

                        ret = skb_maybe_pull_tail(skb,
                                                  l4_off + sizeof(*fh),
                                                  MAX_IPV6_HDR_LEN);
                        if (ret < 0)
                                return ret;

                        fh = OPT_HDR(struct frag_hdr, skb, l4_off);

                        /* A non-zero offset or the MF bit marks a fragment. */
                        if (fh->frag_off & htons(IP6_OFFSET | IP6_MF))
                                is_fragment = true;

                        proto = fh->nexthdr;
                        l4_off += sizeof(*fh);
                        break;
                }
                default:
                        /* Not an extension header handled above: stop. */
                        found_l4 = true;
                        break;
                }
        }

        if (!found_l4 || is_fragment)
                return -EPROTO;

        check = skb_checksum_setup_ip(skb, proto, l4_off);
        if (IS_ERR(check))
                return PTR_ERR(check);

        if (recalculate)
                *check = ~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
                                          &ipv6_hdr(skb)->daddr,
                                          skb->len - l4_off, proto, 0);

        return 0;
}

/**
 * skb_checksum_setup - set up partial checksum offset
 * @skb: the skb to set up
 * @recalculate: if true the pseudo-header checksum will be recalculated
 *
 * Dispatches on @skb->protocol to the IPv4 or IPv6 helper.  Any other
 * protocol yields -EPROTO.
 *
 * Returns 0 on success, -errno otherwise.
 */
int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
{
        if (skb->protocol == htons(ETH_P_IP))
                return skb_checksum_setup_ipv4(skb, recalculate);

        if (skb->protocol == htons(ETH_P_IPV6))
                return skb_checksum_setup_ipv6(skb, recalculate);

        return -EPROTO;
}
EXPORT_SYMBOL(skb_checksum_setup);

/**
 * skb_checksum_maybe_trim - maybe trims the given skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 *
 * Compares the skb length with the transport offset plus @transport_len.
 * An exact match returns @skb itself.  A longer skb is cloned and the clone
 * is trimmed to that length with pskb_trim_rcsum().  NULL is returned when
 * the skb is too short, or when cloning or trimming fails.
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
                                               unsigned int transport_len)
{
        unsigned int wanted = skb_transport_offset(skb) + transport_len;
        struct sk_buff *trimmed;

        if (skb->len < wanted)
                return NULL;

        if (skb->len == wanted)
                return skb;

        /* Trailing data present: work on a clone so @skb stays untouched. */
        trimmed = skb_clone(skb, GFP_ATOMIC);
        if (!trimmed)
                return NULL;

        if (pskb_trim_rcsum(trimmed, wanted)) {
                kfree_skb(trimmed);
                return NULL;
        }

        return trimmed;
}

/**
 * skb_checksum_trimmed - validate checksum of an skb
 * @skb: the skb to check
 * @transport_len: the data length beyond the network header
 * @skb_chkf: checksum function to use
 *
 * Runs @skb_chkf on the skb (or on a trimmed clone of it when the skb holds
 * data beyond @transport_len) with the data pointer temporarily advanced to
 * the transport header.  A non-zero result from @skb_chkf counts as failure.
 *
 * Returns the checked and maybe trimmed skb, or NULL on error.  A clone
 * created here is freed again before NULL is returned.
 *
 * Caller needs to set the skb transport header and free any returned skb if it
 * differs from the provided skb.
 */
struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
                                     unsigned int transport_len,
                                     __sum16(*skb_chkf)(struct sk_buff *skb))
{
        unsigned int l4_off = skb_transport_offset(skb);
        struct sk_buff *candidate;

        candidate = skb_checksum_maybe_trim(skb, transport_len);
        if (!candidate)
                return NULL;

        if (pskb_may_pull(candidate, l4_off)) {
                __sum16 bad;

                /* Check with data at the transport header, then restore. */
                skb_pull_rcsum(candidate, l4_off);
                bad = skb_chkf(candidate);
                skb_push_rcsum(candidate, l4_off);

                if (!bad)
                        return candidate;
        }

        /* Failure: only a clone made above is ours to free. */
        if (candidate != skb)
                kfree_skb(candidate);

        return NULL;
}
EXPORT_SYMBOL(skb_checksum_trimmed);

/* Emit a rate-limited warning, naming the skb's device, that packets received
 * with LRO enabled cannot be forwarded.
 */
void __skb_warn_lro_forwarding(const struct sk_buff *skb)
{
        const char *ifname = skb->dev->name;

        net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
                             ifname);
}
EXPORT_SYMBOL(__skb_warn_lro_forwarding);

/* Free @skb.  When @head_stolen is true only the head state is released and
 * the sk_buff structure itself is returned to the skbuff cache; otherwise the
 * skb is freed through __kfree_skb().
 */
void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
        if (!head_stolen) {
                __kfree_skb(skb);
                return;
        }

        skb_release_head_state(skb);
        kmem_cache_free(net_hotdata.skbuff_cache, skb);
}
EXPORT_SYMBOL(kfree_skb_partial);

/**
 * skb_try_coalesce - try to merge skb to prior one
 * @to: prior buffer
 * @from: buffer to add
 * @fragstolen: pointer to boolean
 * @delta_truesize: how much more was allocated than was requested
 *
 * *@fragstolen is always written: it is cleared on entry and set to true
 * only when the linear head of @from is attached to @to as a page fragment.
 * *@delta_truesize is written only when the merge succeeds (0 for the plain
 * copy into @to's tailroom, otherwise the truesize added to @to).
 *
 * Returns true if @from was merged into @to, false if the two buffers
 * cannot be coalesced.
 */
bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
                      bool *fragstolen, int *delta_truesize)
{
        struct skb_shared_info *to_shinfo, *from_shinfo;
        int i, delta, len = from->len;

        *fragstolen = false;

        if (skb_cloned(to))
                return false;

        /* In general, avoid mixing page_pool and non-page_pool allocated
         * pages within the same SKB. In theory we could take full
         * references if @from is cloned and !@to->pp_recycle but its
         * tricky (due to potential race with the clone disappearing) and
         * rare, so not worth dealing with.
         */
        if (to->pp_recycle != from->pp_recycle)
                return false;

        if (skb_frags_readable(from) != skb_frags_readable(to))
                return false;

        /* Cheapest case: everything in @from fits into @to's tailroom, so
         * just copy the bytes and leave @from's buffers alone.
         */
        if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
                if (len)
                        BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
                *delta_truesize = 0;
                return true;
        }

        to_shinfo = skb_shinfo(to);
        from_shinfo = skb_shinfo(from);
        if (to_shinfo->frag_list || from_shinfo->frag_list)
                return false;
        if (skb_zcopy(to) || skb_zcopy(from))
                return false;

        if (skb_headlen(from) != 0) {
                struct page *page;
                unsigned int offset;

                /* The head of @from takes one frag slot of its own in @to,
                 * hence ">=" here versus ">" in the headless case below.
                 */
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags >= MAX_SKB_FRAGS)
                        return false;

                if (skb_head_is_locked(from))
                        return false;

                delta = from->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));

                /* Describe @from's linear data as a page fragment of @to. */
                page = virt_to_head_page(from->head);
                offset = from->data - (unsigned char *)page_address(page);

                skb_fill_page_desc(to, to_shinfo->nr_frags,
                                   page, offset, skb_headlen(from));
                *fragstolen = true;
        } else {
                if (to_shinfo->nr_frags +
                    from_shinfo->nr_frags > MAX_SKB_FRAGS)
                        return false;

                delta = from->truesize - SKB_TRUESIZE(skb_end_offset(from));
        }

        WARN_ON_ONCE(delta < len);

        /* Append @from's frag descriptors after those already in @to. */
        memcpy(to_shinfo->frags + to_shinfo->nr_frags,
               from_shinfo->frags,
               from_shinfo->nr_frags * sizeof(skb_frag_t));
        to_shinfo->nr_frags += from_shinfo->nr_frags;

        /* An uncloned @from hands its frags over; a cloned one keeps its
         * frag count.
         */
        if (!skb_cloned(from))
                from_shinfo->nr_frags = 0;

        /* if the skb is not cloned this does nothing
         * since we set nr_frags to 0.
         */
        if (skb_pp_frag_ref(from)) {
                for (i = 0; i < from_shinfo->nr_frags; i++)
                        __skb_frag_ref(&from_shinfo->frags[i]);
        }

        /* All of @from's bytes were added to @to as paged data. */
        to->truesize += delta;
        to->len += len;
        to->data_len += len;

        *delta_truesize = delta;
        return true;
}
EXPORT_SYMBOL(skb_try_coalesce);

/**
 * skb_scrub_packet - scrub an skb
 *
 * @skb: buffer to clean
 * @xnet: packet is crossing netns
 *
 * Intended for use after a packet has been encapsulated into, or
 * decapsulated from, a tunnel: some per-packet state must not survive such
 * an operation and is reset here.
 * With @xnet == true the skb is additionally prepared for injection into
 * another namespace, in which case everything in the skb that could impact
 * namespace isolation is cleared as well.
 */
void skb_scrub_packet(struct sk_buff *skb, bool xnet)
{
        skb->pkt_type = PACKET_HOST;
        skb->skb_iif = 0;
        skb->ignore_df = 0;
        skb_dst_drop(skb);
        skb_ext_reset(skb);
        nf_reset_ct(skb);
        nf_reset_trace(skb);

#ifdef CONFIG_NET_SWITCHDEV
        skb->offload_fwd_mark = 0;
        skb->offload_l3_fwd_mark = 0;
#endif
        ipvs_reset(skb);

        /* Mark and timestamp are only wiped when crossing namespaces. */
        if (xnet) {
                skb->mark = 0;
                skb_clear_tstamp(skb);
        }
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);

/* Shift the start of the MAC header, and any skb metadata in front of it,
 * VLAN_HLEN bytes towards skb->data and advance skb->mac_header to match.
 * The last ETH_TLEN bytes before skb->data are not part of the move.
 *
 * Returns @skb, or NULL after freeing it when skb_cow() fails.
 */
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
        unsigned char *mac;
        int hdr_len, md_len;

        if (skb_cow(skb, skb_headroom(skb)) < 0) {
                kfree_skb(skb);
                return NULL;
        }

        mac = skb_mac_header(skb);
        hdr_len = skb->data - mac;
        if (likely(hdr_len > VLAN_HLEN + ETH_TLEN))
                memmove(mac + VLAN_HLEN, mac,
                        hdr_len - VLAN_HLEN - ETH_TLEN);

        md_len = skb_metadata_len(skb);
        if (md_len) {
                void *md = skb_metadata_end(skb) - md_len;

                memmove(md + VLAN_HLEN, md, md_len);
        }

        skb->mac_header += VLAN_HLEN;
        return skb;
}

/* Move a VLAN tag found at skb->data into the skb's hwaccel tag fields and
 * strip it from the packet data.
 *
 * An skb that already carries a hwaccel tag is returned unchanged.
 * Returns the (possibly replaced) skb, or NULL on failure; the error path
 * passes the skb to kfree_skb().
 */
struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
{
        struct vlan_hdr *vhdr;

        /* vlan_tci is already set-up so leave this for another time */
        if (unlikely(skb_vlan_tag_present(skb)))
                return skb;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                goto err_free;

        /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
        if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
                goto err_free;

        vhdr = (struct vlan_hdr *)skb->data;
        __vlan_hwaccel_put_tag(skb, skb->protocol, ntohs(vhdr->h_vlan_TCI));

        skb_pull_rcsum(skb, VLAN_HLEN);
        vlan_set_encap_proto(skb, vhdr);

        skb = skb_reorder_vlan_header(skb);
        if (unlikely(!skb))
                goto err_free;

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);

        return skb;

err_free:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(skb_vlan_untag);

/* Make sure the first @write_len bytes of @skb can be written to.
 *
 * Returns -ENOMEM if that many bytes cannot be pulled, -EFAULT if the skb's
 * frags are not readable, 0 if the skb is not cloned or the clone is already
 * writable for @write_len bytes, and otherwise whatever pskb_expand_head()
 * returns.
 */
int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
{
        if (!pskb_may_pull(skb, write_len))
                return -ENOMEM;

        if (!skb_frags_readable(skb))
                return -EFAULT;

        /* A clone that is not writable that far needs a fresh head. */
        if (skb_cloned(skb) && !skb_clone_writable(skb, write_len))
                return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);

        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable);

/* Make sure @skb is not cloned and offers the headroom and tailroom that
 * @dev asks for (dev->needed_headroom / dev->needed_tailroom), reallocating
 * the head through pskb_expand_head() when it does not.
 *
 * Returns 0 when nothing had to be done, otherwise the result of
 * pskb_expand_head().
 */
int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
{
        int head_short = dev->needed_headroom;
        int tail_short = dev->needed_tailroom;

        /* For tail taggers, we need to pad short frames ourselves, to ensure
         * that the tail tag does not fail at its role of being at the end of
         * the packet, once the conduit interface pads the frame. Account for
         * that pad length here, and pad later.
         */
        if (unlikely(tail_short && skb->len < ETH_ZLEN))
                tail_short += ETH_ZLEN - skb->len;

        /* Turn the requirements into shortfalls, clamped at zero.
         * skb_headroom() returns unsigned int, hence max_t(int, ...).
         */
        head_short = max_t(int, head_short - skb_headroom(skb), 0);
        tail_short = max_t(int, tail_short - skb_tailroom(skb), 0);

        if (unlikely(head_short || tail_short || skb_cloned(skb)))
                return pskb_expand_head(skb, head_short, tail_short,
                                        GFP_ATOMIC);

        /* No reallocation needed, yay! */
        return 0;
}
EXPORT_SYMBOL(skb_ensure_writable_head_tail);

/* Strip the in-payload VLAN header from @skb and fix up the checksum.
 * The skb must not carry a hwaccel tag (skb_vlan_tag_present() false) and
 * must have skb->data at the mac header; the removed tag's TCI is handed
 * to vlan_remove_tag() through @vlan_tci.
 *
 * Returns 0 on success, -EINVAL if skb->data is not at the mac header, or
 * the error from skb_ensure_writable().
 */
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
        int data_off = skb->data - skb_mac_header(skb);
        int ret;

        if (WARN_ONCE(data_off,
                      "__skb_vlan_pop got skb with skb->data not at mac header (offset %d)\n",
                      data_off))
                return -EINVAL;

        ret = skb_ensure_writable(skb, VLAN_ETH_HLEN);
        if (unlikely(ret))
                return ret;

        /* The tag sits right behind the two MAC addresses. */
        skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

        vlan_remove_tag(skb, vlan_tci);

        skb->mac_header += VLAN_HLEN;

        if (skb_network_offset(skb) < ETH_HLEN)
                skb_set_network_header(skb, ETH_HLEN);

        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(__skb_vlan_pop);

/* Remove one VLAN tag from @skb: the hwaccel tag if one is present,
 * otherwise the tag in the payload.  If the packet is still VLAN-typed
 * afterwards, that next tag is moved from the payload into the hwaccel slot.
 * Expects skb->data at mac header.
 *
 * Returns 0 on success (including when there is no tag to pop), -errno
 * otherwise.
 */
int skb_vlan_pop(struct sk_buff *skb)
{
        __be16 proto;
        u16 tci;
        int ret;

        if (likely(skb_vlan_tag_present(skb))) {
                __vlan_hwaccel_clear_tag(skb);
        } else {
                if (unlikely(!eth_type_vlan(skb->protocol)))
                        return 0;

                ret = __skb_vlan_pop(skb, &tci);
                if (ret)
                        return ret;
        }

        /* move next vlan tag to hw accel tag */
        if (unlikely(eth_type_vlan(skb->protocol))) {
                /* Remember the protocol before the pop below runs. */
                proto = skb->protocol;

                ret = __skb_vlan_pop(skb, &tci);
                if (unlikely(ret))
                        return ret;

                __vlan_hwaccel_put_tag(skb, proto, tci);
        }

        return 0;
}
EXPORT_SYMBOL(skb_vlan_pop);

/* Add a VLAN tag to @skb.  The new tag always goes into the hwaccel slot;
 * a tag already held there is first written into the payload.
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL if skb->data is not at the mac header, or
 * the error from __vlan_insert_tag().
 */
int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
{
        int data_off;
        int ret;

        if (!skb_vlan_tag_present(skb))
                goto set_hwaccel;

        data_off = skb->data - skb_mac_header(skb);
        if (WARN_ONCE(data_off,
                      "skb_vlan_push got skb with skb->data not at mac header (offset %d)\n",
                      data_off))
                return -EINVAL;

        /* Spill the current hwaccel tag into the packet data. */
        ret = __vlan_insert_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb));
        if (ret)
                return ret;

        skb->protocol = skb->vlan_proto;
        skb->network_header -= VLAN_HLEN;

        skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);

set_hwaccel:
        __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
        return 0;
}
EXPORT_SYMBOL(skb_vlan_push);

/**
 * skb_eth_pop() - Drop the Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 *
 * Pull ETH_HLEN bytes off the front of @skb and reset the mac header and
 * mac length accordingly.
 *
 * Expects that skb->data points to the mac header and that no VLAN tags are
 * present.
 *
 * Returns 0 on success, -EPROTO if the header cannot be pulled, the skb is
 * VLAN tagged, or the network offset is smaller than ETH_HLEN.
 */
int skb_eth_pop(struct sk_buff *skb)
{
        if (!pskb_may_pull(skb, ETH_HLEN))
                return -EPROTO;

        if (skb_vlan_tagged(skb))
                return -EPROTO;

        if (skb_network_offset(skb) < ETH_HLEN)
                return -EPROTO;

        skb_pull_rcsum(skb, ETH_HLEN);
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        return 0;
}
EXPORT_SYMBOL(skb_eth_pop);

/**
 * skb_eth_push() - Add a new Ethernet header at the head of a packet
 *
 * @skb: Socket buffer to modify
 * @dst: Destination MAC address of the new header
 * @src: Source MAC address of the new header
 *
 * Push a new Ethernet header in front of @skb, filled with @dst, @src and
 * skb->protocol as the ethertype.
 *
 * Expects that skb->data points to the mac header, which must be empty.
 *
 * Returns 0 on success, -EPROTO if the network offset is non-zero or a
 * hwaccel VLAN tag is present, or the error from skb_cow_head().
 */
int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
                 const unsigned char *src)
{
        struct ethhdr *hdr;
        int ret;

        if (skb_network_offset(skb))
                return -EPROTO;

        if (skb_vlan_tag_present(skb))
                return -EPROTO;

        ret = skb_cow_head(skb, sizeof(*hdr));
        if (ret < 0)
                return ret;

        skb_push(skb, sizeof(*hdr));
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        hdr = eth_hdr(skb);
        ether_addr_copy(hdr->h_dest, dst);
        ether_addr_copy(hdr->h_source, src);
        hdr->h_proto = skb->protocol;

        /* Account for the freshly written header in the checksum. */
        skb_postpush_rcsum(skb, hdr, sizeof(*hdr));

        return 0;
}
EXPORT_SYMBOL(skb_eth_push);

/* Store @ethertype in @hdr->h_proto.  For CHECKSUM_COMPLETE skbs, skb->csum
 * is first adjusted by folding in the complement of the old value and the
 * new value.
 */
static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
                             __be16 ethertype)
{
        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __be16 swap[] = { ~hdr->h_proto, ethertype };

                skb->csum = csum_partial((char *)swap, sizeof(swap),
                                         skb->csum);
        }

        hdr->h_proto = ethertype;
}

/**
 * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
 *                   the packet
 *
 * @skb: buffer
 * @mpls_lse: MPLS label stack entry to push
 * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
 *            ethernet
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL if @mpls_proto is not an MPLS ethertype or
 * the skb is flagged as encapsulated, or the error from skb_cow_head().
 */
int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
                  int mac_len, bool ethernet)
{
        struct mpls_shim_hdr *shim;
        int ret;

        /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
        if (unlikely(!eth_p_mpls(mpls_proto)) || skb->encapsulation)
                return -EINVAL;

        ret = skb_cow_head(skb, MPLS_HLEN);
        if (unlikely(ret))
                return ret;

        /* Record the inner protocol and header offset unless already set. */
        if (!skb->inner_protocol) {
                skb_set_inner_network_header(skb, skb_network_offset(skb));
                skb_set_inner_protocol(skb, skb->protocol);
        }

        /* Open an MPLS_HLEN gap by sliding the MAC header to the new front. */
        skb_push(skb, MPLS_HLEN);
        memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
                mac_len);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);
        skb_reset_mac_len(skb);

        shim = mpls_hdr(skb);
        shim->label_stack_entry = mpls_lse;
        skb_postpush_rcsum(skb, shim, MPLS_HLEN);

        if (ethernet && mac_len >= ETH_HLEN)
                skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);

        skb->protocol = mpls_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_push);

/**
 * skb_mpls_pop() - pop the outermost MPLS header
 *
 * @skb: buffer
 * @next_proto: ethertype of header after popped MPLS header
 * @mac_len: length of the MAC header
 * @ethernet: flag to indicate if the packet is ethernet
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success (also when skb->protocol is not an MPLS ethertype,
 * in which case nothing is changed), or the error from
 * skb_ensure_writable().
 */
int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
                 bool ethernet)
{
        int ret;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return 0;

        ret = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
        if (unlikely(ret))
                return ret;

        /* Slide the MAC header over the MPLS header, then drop the slack. */
        skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
        memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
                mac_len);

        __skb_pull(skb, MPLS_HLEN);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, mac_len);

        if (ethernet && mac_len >= ETH_HLEN) {
                struct ethhdr *eth;

                /* use mpls_hdr() to get ethertype to account for VLANs. */
                eth = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
                skb_mod_eth_type(skb, eth, next_proto);
        }

        skb->protocol = next_proto;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_pop);

/**
 * skb_mpls_update_lse() - modify outermost MPLS header and update csum
 *
 * @skb: buffer
 * @mpls_lse: new MPLS label stack entry to update to
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL if skb->protocol is not an MPLS ethertype,
 * or the error from skb_ensure_writable().
 */
int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
{
        struct mpls_shim_hdr *shim;
        int ret;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        ret = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
        if (unlikely(ret))
                return ret;

        /* Look the header up only after the skb has been made writable. */
        shim = mpls_hdr(skb);

        if (skb->ip_summed == CHECKSUM_COMPLETE) {
                __be32 swap[] = { ~shim->label_stack_entry, mpls_lse };

                skb->csum = csum_partial((char *)swap, sizeof(swap),
                                         skb->csum);
        }

        shim->label_stack_entry = mpls_lse;

        return 0;
}
EXPORT_SYMBOL_GPL(skb_mpls_update_lse);

/**
 * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
 *
 * @skb: buffer
 *
 * Expects skb->data at mac header.
 *
 * Returns 0 on success, -EINVAL if skb->protocol is not an MPLS ethertype or
 * the decremented TTL is zero, -ENOMEM if the MPLS header cannot be pulled,
 * or the error from skb_mpls_update_lse().
 */
int skb_mpls_dec_ttl(struct sk_buff *skb)
{
        u32 entry;
        u8 ttl;

        if (unlikely(!eth_p_mpls(skb->protocol)))
                return -EINVAL;

        if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
                return -ENOMEM;

        entry = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
        ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;

        ttl--;
        if (ttl == 0)
                return -EINVAL;

        /* Splice the new TTL back into the label stack entry. */
        entry = (entry & ~MPLS_LS_TTL_MASK) | (ttl << MPLS_LS_TTL_SHIFT);

        return skb_mpls_update_lse(skb, cpu_to_be32(entry));
}
EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);

/**
 * alloc_skb_with_frags - allocate skb with page frags
 *
 * @header_len: size of linear part
 * @data_len: needed length in frags
 * @order: max page order desired.
 * @errcode: pointer to error code if any
 * @gfp_mask: allocation mask
 *
 * This can be used to allocate a paged skb, given a maximal order for frags.
 *
 * *@errcode is written unconditionally: -EMSGSIZE when @data_len exceeds
 * MAX_SKB_FRAGS pages of the requested order, -ENOBUFS otherwise.  It is not
 * reset on success, so it is only meaningful when NULL is returned.
 *
 * Returns the new skb, or NULL on failure.
 */
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
                                     unsigned long data_len,
                                     int order,
                                     int *errcode,
                                     gfp_t gfp_mask)
{
        unsigned long chunk;
        struct sk_buff *skb;
        struct page *page;
        int nr_frags = 0;

        *errcode = -EMSGSIZE;
        if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
                return NULL;

        *errcode = -ENOBUFS;
        skb = alloc_skb(header_len, gfp_mask);
        if (!skb)
                return NULL;

        while (data_len) {
                /* Give up once MAX_SKB_FRAGS - 1 frags are filled and data
                 * is still outstanding.
                 */
                if (nr_frags == MAX_SKB_FRAGS - 1)
                        goto failure;
                /* Do not use a larger order than the remaining data needs. */
                while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
                        order--;

                if (order) {
                        /* High-order attempt: no direct reclaim, no warning
                         * on failure; fall back to the next lower order.
                         */
                        page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
                                           __GFP_COMP |
                                           __GFP_NOWARN,
                                           order);
                        if (!page) {
                                order--;
                                continue;
                        }
                } else {
                        /* Single page with the caller's mask; failure here
                         * is final.
                         */
                        page = alloc_page(gfp_mask);
                        if (!page)
                                goto failure;
                }
                chunk = min_t(unsigned long, data_len,
                              PAGE_SIZE << order);
                skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
                nr_frags++;
                /* truesize is charged for the whole allocation, not just
                 * the bytes used.
                 */
                skb->truesize += (PAGE_SIZE << order);
                data_len -= chunk;
        }
        return skb;

failure:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(alloc_skb_with_frags);

/* carve out the first off bytes from skb when off < headlen
 *
 * A new head buffer of the same end offset is allocated, the linear bytes
 * from @off onwards and the shared info (including all frag descriptors)
 * are copied into it, and the skb is switched over to the new head.
 *
 * Returns 0 on success, -ENOMEM if the new head cannot be allocated or
 * skb_orphan_frags() fails.
 */
static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
                                    const int headlen, gfp_t gfp_mask)
{
        int i;
        unsigned int size = skb_end_offset(skb);
        int new_hlen = headlen - off;
        u8 *data;

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy real data, and all frags */
        skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
        /* NOTE(review): skb->len is reduced here, before the
         * skb_orphan_frags() failure return below, so on that path the skb
         * is left with the shorter length but its old head - confirm that
         * callers discard the skb on error.
         */
        skb->len -= off;

        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb),
               offsetof(struct skb_shared_info,
                        frags[skb_shinfo(skb)->nr_frags]));
        if (skb_cloned(skb)) {
                /* drop the old head gracefully */
                if (skb_orphan_frags(skb, gfp_mask)) {
                        skb_kfree_head(data, size);
                        return -ENOMEM;
                }
                /* The copied shared info needs its own references on the
                 * frags and frag list before the old data is released.
                 */
                for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
                        skb_frag_ref(skb, i);
                if (skb_has_frag_list(skb))
                        skb_clone_fraglist(skb);
                skb_release_data(skb, SKB_CONSUMED);
        } else {
                /* we can reuse existing recount- all we did was
                 * relocate values
                 */
                skb_free_head(skb);
        }

        /* Switch the skb over to the new head; data now starts at the very
         * beginning of the buffer.
         */
        skb->head = data;
        skb->data = data;
        skb->head_frag = 0;
        skb_set_end_offset(skb, size);
        skb_set_tail_pointer(skb, skb_headlen(skb));
        skb_headers_offset_update(skb, 0);
        skb->cloned = 0;
        skb->hdr_len = 0;
        skb->nohdr = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);

        return 0;
}

static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);

/* carve out the first eat bytes from skb's frag_list. May recurse into
 * pskb_carve()
 *
 * Only @shinfo's frag_list is modified; the @skb argument itself is not
 * referenced in the body.
 *
 * Returns 0 on success, -EFAULT if the list holds fewer than @eat bytes,
 * -ENOMEM if cloning or carving a list member fails.
 */
static int pskb_carve_frag_list(struct sk_buff *skb,
                                struct skb_shared_info *shinfo, int eat,
                                gfp_t gfp_mask)
{
        struct sk_buff *list = shinfo->frag_list;
        struct sk_buff *clone = NULL;
        struct sk_buff *insp = NULL;

        /* First pass: find where the eaten region ends.  On exit, insp is
         * the first original list member that stays on the list, and clone
         * (if set) is a carved copy to be placed in front of it.
         */
        do {
                if (!list) {
                        pr_err("Not enough bytes to eat. Want %d\n", eat);
                        return -EFAULT;
                }
                if (list->len <= eat) {
                        /* Eaten as whole. */
                        eat -= list->len;
                        list = list->next;
                        insp = list;
                } else {
                        /* Eaten partially. */
                        if (skb_shared(list)) {
                                /* Carve a private clone instead of the
                                 * shared skb; the original is dropped below.
                                 */
                                clone = skb_clone(list, gfp_mask);
                                if (!clone)
                                        return -ENOMEM;
                                insp = list->next;
                                list = clone;
                        } else {
                                /* This may be pulled without problems. */
                                insp = list;
                        }
                        if (pskb_carve(list, eat, gfp_mask) < 0) {
                                kfree_skb(clone);
                                return -ENOMEM;
                        }
                        break;
                }
        } while (eat);

        /* Free pulled out fragments. */
        while ((list = shinfo->frag_list) != insp) {
                shinfo->frag_list = list->next;
                consume_skb(list);
        }
        /* And insert new clone at head. */
        if (clone) {
                clone->next = list;
                shinfo->frag_list = clone;
        }
        return 0;
}

/* Carve off the first @off bytes from skb. The split line (@off) is in the
 * non-linear part of skb.
 *
 * @pos is the byte offset at which the page frags begin; pskb_carve() passes
 * skb_headlen(skb) for it.
 *
 * A new head of the same end offset is allocated, the frags lying (partly)
 * beyond @off are copied into its shared info with a reference taken on
 * each, and the old data is released. Afterwards the skb has no linear
 * payload: skb->data_len equals skb->len.
 *
 * Returns 0 on success or -ENOMEM on any failure.
 */
static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
                                       int pos, gfp_t gfp_mask)
{
        int i, k = 0;
        unsigned int size = skb_end_offset(skb);
        u8 *data;
        const int nfrags = skb_shinfo(skb)->nr_frags;
        struct skb_shared_info *shinfo;

        if (skb_pfmemalloc(skb))
                gfp_mask |= __GFP_MEMALLOC;

        data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
        if (!data)
                return -ENOMEM;
        size = SKB_WITH_OVERHEAD(size);

        /* Copy the shared info fields that precede the frags[] array;
         * the frags themselves are filled in selectively below.
         */
        memcpy((struct skb_shared_info *)(data + size),
               skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
        if (skb_orphan_frags(skb, gfp_mask)) {
                skb_kfree_head(data, size);
                return -ENOMEM;
        }
        shinfo = (struct skb_shared_info *)(data + size);
        for (i = 0; i < nfrags; i++) {
                int fsize = skb_frag_size(&skb_shinfo(skb)->frags[i]);

                /* Keep only frags that reach past the split line. */
                if (pos + fsize > off) {
                        shinfo->frags[k] = skb_shinfo(skb)->frags[i];

                        if (pos < off) {
                                /* Split frag.
                                 * We have two variants in this case:
                                 * 1. Move all the frag to the second
                                 *    part, if it is possible. F.e.
                                 *    this approach is mandatory for TUX,
                                 *    where splitting is expensive.
                                 * 2. Split it accurately. This is what
                                 *    we do.
                                 *
                                 * Only the first kept frag can straddle
                                 * the split line, so k is still 0 and
                                 * frags[0] is the entry written just above.
                                 */
                                skb_frag_off_add(&shinfo->frags[0], off - pos);
                                skb_frag_size_sub(&shinfo->frags[0], off - pos);
                        }
                        skb_frag_ref(skb, i);
                        k++;
                }
                pos += fsize;
        }
        shinfo->nr_frags = k;
        if (skb_has_frag_list(skb))
                skb_clone_fraglist(skb);

        /* split line is in frag list */
        /* NOTE(review): a -EFAULT from pskb_carve_frag_list() is reported
         * to the caller as -ENOMEM here.
         */
        if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
                /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
                if (skb_has_frag_list(skb))
                        kfree_skb_list(skb_shinfo(skb)->frag_list);
                skb_kfree_head(data, size);
                return -ENOMEM;
        }
        skb_release_data(skb, SKB_CONSUMED);

        /* Install the new head; data points at its start. */
        skb->head = data;
        skb->head_frag = 0;
        skb->data = data;
        skb_set_end_offset(skb, size);
        skb_reset_tail_pointer(skb);
        skb_headers_offset_update(skb, 0);
        skb->cloned   = 0;
        skb->hdr_len  = 0;
        skb->nohdr    = 0;
        skb->len -= off;
        /* Everything that remains lives in frags / frag_list. */
        skb->data_len = skb->len;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
        return 0;
}

/* Strip @len bytes from the front of @skb.
 *
 * Dispatches on where the cut falls: strictly inside the linear area
 * (skb_headlen() bytes) goes to pskb_carve_inside_header(), anything at or
 * beyond its end goes to pskb_carve_inside_nonlinear(). The helper's return
 * value is passed through unchanged.
 */
static int pskb_carve(struct sk_buff *skb, const u32 len, gfp_t gfp)
{
        int linear = skb_headlen(skb);

        if (len >= linear)
                return pskb_carve_inside_nonlinear(skb, len, linear, gfp);

        return pskb_carve_inside_header(skb, len, linear, gfp);
}

/* Build a new skb holding @to_copy bytes of @skb starting at offset @off.
 *
 * @skb is cloned with @gfp, the clone has its first @off bytes carved away
 * and is then trimmed to @to_copy bytes. Returns the clone, or NULL when
 * cloning, carving or trimming fails (the clone is freed in that case).
 */
struct sk_buff *pskb_extract(struct sk_buff *skb, int off,
                             int to_copy, gfp_t gfp)
{
        struct sk_buff *piece;

        piece = skb_clone(skb, gfp);
        if (!piece)
                return NULL;

        if (pskb_carve(piece, off, gfp) < 0)
                goto drop;
        if (pskb_trim(piece, to_copy))
                goto drop;

        return piece;

drop:
        kfree_skb(piece);
        return NULL;
}
EXPORT_SYMBOL(pskb_extract);

/**
 * skb_condense - try to get rid of fragments/frag_list if possible
 * @skb: buffer
 *
 * Meant to save memory before @skb is added to a busy queue.
 * When the skb carries non-linear bytes (skb->data_len) and they fit
 * between skb->tail and skb->end, the skb is not cloned and its frags are
 * readable, all of them are pulled into skb->head so that the frags can be
 * freed right away, and skb->truesize is recomputed from the head size.
 * If any of those conditions fails the skb is left untouched.
 * Notes:
 *        We do not reallocate skb->head thus can not fail.
 *        Caller must re-evaluate skb->truesize if needed.
 */
void skb_condense(struct sk_buff *skb)
{
        if (!skb->data_len)
                goto fix_truesize;

        /* Not enough tail room to absorb the paged data. */
        if (skb->data_len > skb->end - skb->tail)
                return;
        if (skb_cloned(skb))
                return;
        if (!skb_frags_readable(skb))
                return;

        /* Nice, the page frag(s) can be freed right now */
        __pskb_pull_tail(skb, skb->data_len);

fix_truesize:
        /* skb->truesize may be over estimated at this point: the skb had
         * a fragment, and fragments do not carry their truesize. Pulling
         * the content into skb->head freed the fragment, but
         * __pskb_pull_tail() had no way to adjust skb->truesize without
         * knowing the frag truesize.
         */
        skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
EXPORT_SYMBOL(skb_condense);

#ifdef CONFIG_SKB_EXTENSIONS
/* Return the address of extension @id inside @ext: the start of @ext plus
 * the stored per-id offset scaled by SKB_EXT_ALIGN_VALUE.
 */
static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
{
        void *slot = ext;

        slot += ext->offset[id] * SKB_EXT_ALIGN_VALUE;
        return slot;
}

/**
 * __skb_ext_alloc - allocate a new skb extensions storage
 *
 * @flags: See kmalloc().
 *
 * The storage comes from skbuff_ext_cache with its offset table zeroed and
 * its reference count set to 1.
 *
 * Returns the newly allocated pointer, or NULL if the allocation failed.
 * The pointer can later be attached to a skb via __skb_ext_set().
 * Note: caller must handle the skb_ext as an opaque data.
 */
struct skb_ext *__skb_ext_alloc(gfp_t flags)
{
        struct skb_ext *ext;

        ext = kmem_cache_alloc(skbuff_ext_cache, flags);
        if (!ext)
                return NULL;

        memset(ext->offset, 0, sizeof(ext->offset));
        refcount_set(&ext->refcnt, 1);

        return ext;
}

/* Return extension storage that may be written to.
 *
 * When the reference count of @old reads 1, @old itself is returned.
 * Otherwise a copy is allocated with GFP_ATOMIC, the used part of @old
 * (old->chunks * SKB_EXT_ALIGN_VALUE bytes) is copied into it, extra
 * references are taken for the objects named by the extensions flagged in
 * @old_active (xfrm states, MCTP flow key), and one reference on @old is
 * put. Returns NULL if the allocation fails; @old is untouched then.
 */
static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
                                         unsigned int old_active)
{
        struct skb_ext *copy;

        if (refcount_read(&old->refcnt) == 1)
                return old;

        copy = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
        if (!copy)
                return NULL;

        memcpy(copy, old, old->chunks * SKB_EXT_ALIGN_VALUE);
        refcount_set(&copy->refcnt, 1);

#ifdef CONFIG_XFRM
        if (old_active & (1 << SKB_EXT_SEC_PATH)) {
                struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
                unsigned int idx = 0;

                /* The copy points at the same states; hold each of them. */
                while (idx < sp->len) {
                        xfrm_state_hold(sp->xvec[idx]);
                        idx++;
                }
        }
#endif
#ifdef CONFIG_MCTP_FLOWS
        if (old_active & (1 << SKB_EXT_MCTP)) {
                struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);

                if (flow->key)
                        refcount_inc(&flow->key->refs);
        }
#endif
        __skb_ext_put(old);
        return copy;
}

/**
 * __skb_ext_set - attach the specified extension storage to this skb
 * @skb: buffer
 * @id: extension id
 * @ext: extension storage previously allocated via __skb_ext_alloc()
 *
 * Existing extensions, if any, are cleared: the skb's current storage is
 * put and @id becomes the only active extension, placed right after the
 * skb_ext header inside @ext.
 *
 * Returns the pointer to the extension.
 */
void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
                    struct skb_ext *ext)
{
        unsigned int start = SKB_EXT_CHUNKSIZEOF(*ext);

        skb_ext_put(skb);

        ext->offset[id] = start;
        ext->chunks = start + skb_ext_type_len[id];

        skb->extensions = ext;
        skb->active_extensions = 1 << id;

        return skb_ext_get_ptr(ext, id);
}

/**
 * skb_ext_add - allocate space for given extension, COW if needed
 * @skb: buffer
 * @id: extension to allocate space for
 *
 * Allocates enough space for the given extension.
 * If the extension is already present, a pointer to that extension
 * is returned.
 *
 * If the skb was cloned, COW applies and the returned memory can be
 * modified without changing the extension space of cloned buffers.
 *
 * On success @id is marked active on @skb and skb->slow_gro is set.
 *
 * Returns pointer to the extension or NULL on allocation failure.
 */
void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext;
        unsigned int start;

        if (!skb->active_extensions) {
                /* Nothing active yet: begin with fresh storage and place
                 * the extension right after the skb_ext header.
                 */
                ext = __skb_ext_alloc(GFP_ATOMIC);
                if (!ext)
                        return NULL;

                start = SKB_EXT_CHUNKSIZEOF(*ext);
        } else {
                ext = skb_ext_maybe_cow(skb->extensions,
                                        skb->active_extensions);
                if (!ext)
                        return NULL;

                /* Slot already laid out; only the active bit is needed. */
                if (__skb_ext_exist(ext, id))
                        goto set_active;

                /* Append behind whatever is currently in use. */
                start = ext->chunks;
        }

        ext->offset[id] = start;
        ext->chunks = start + skb_ext_type_len[id];
set_active:
        skb->slow_gro = 1;
        skb->extensions = ext;
        skb->active_extensions |= 1 << id;
        return skb_ext_get_ptr(ext, id);
}
EXPORT_SYMBOL(skb_ext_add);

#ifdef CONFIG_XFRM
/* Put every xfrm state recorded in the first sp->len slots of sp->xvec. */
static void skb_ext_put_sp(struct sec_path *sp)
{
        unsigned int idx = 0;

        while (idx < sp->len) {
                xfrm_state_put(sp->xvec[idx]);
                idx++;
        }
}
#endif

#ifdef CONFIG_MCTP_FLOWS
/* Drop the reference on the key attached to @flow, if there is one. */
static void skb_ext_put_mctp(struct mctp_flow *flow)
{
        if (!flow->key)
                return;

        mctp_key_unref(flow->key);
}
#endif

/* Deactivate extension @id on @skb.
 *
 * When no extension remains active the storage is detached from the skb
 * and put. Otherwise, with CONFIG_XFRM, removing SKB_EXT_SEC_PATH from
 * storage whose reference count reads 1 also puts the recorded xfrm states
 * and resets the sec_path length to 0.
 */
void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
{
        struct skb_ext *ext = skb->extensions;

        skb->active_extensions &= ~(1 << id);

        if (skb->active_extensions == 0) {
                skb->extensions = NULL;
                __skb_ext_put(ext);
                return;
        }

#ifdef CONFIG_XFRM
        if (id == SKB_EXT_SEC_PATH &&
            refcount_read(&ext->refcnt) == 1) {
                struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);

                skb_ext_put_sp(sp);
                sp->len = 0;
        }
#endif
}
EXPORT_SYMBOL(__skb_ext_del);

/* Put one reference on @ext and free it when that was the last one.
 *
 * Before freeing, the references held through the SEC_PATH and MCTP
 * extensions (when present and configured in) are released, then the
 * storage goes back to skbuff_ext_cache.
 */
void __skb_ext_put(struct skb_ext *ext)
{
        /* If this is the last clone, nothing can increment the count once
         * it has been observed as 1, so the atomic op is skipped entirely
         * in that case.
         */
        if (refcount_read(&ext->refcnt) != 1 &&
            !refcount_dec_and_test(&ext->refcnt))
                return;

#ifdef CONFIG_XFRM
        if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
                skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
#endif
#ifdef CONFIG_MCTP_FLOWS
        if (__skb_ext_exist(ext, SKB_EXT_MCTP))
                skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
#endif

        kmem_cache_free(skbuff_ext_cache, ext);
}
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */

/* Free @skb, going through __napi_kfree_skb() with bottom halves disabled
 * when skb->fclone is SKB_FCLONE_UNAVAILABLE. Any other fclone state is
 * not handled here and is freed with plain __kfree_skb().
 */
static void kfree_skb_napi_cache(struct sk_buff *skb)
{
        if (skb->fclone == SKB_FCLONE_UNAVAILABLE) {
                local_bh_disable();
                __napi_kfree_skb(skb, SKB_CONSUMED);
                local_bh_enable();
                return;
        }

        /* fclone-capable skb: don't handle this case */
        __kfree_skb(skb);
}

/**
 * skb_attempt_defer_free - queue skb for remote freeing
 * @skb: buffer
 *
 * Put @skb in a per-cpu list, using the cpu which
 * allocated the skb/pages to reduce false sharing
 * and memory zone spinlock contention.
 *
 * The skb is freed immediately via kfree_skb_napi_cache() instead when
 * the allocating cpu is the current one, is out of range or offline, or
 * when that cpu's deferred count has reached sysctl_skb_defer_max.
 */
void skb_attempt_defer_free(struct sk_buff *skb)
{
        int cpu = skb->alloc_cpu;
        struct softnet_data *sd;
        unsigned int defer_max;
        bool kick;

        if (cpu == raw_smp_processor_id() ||
            WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
            !cpu_online(cpu)) {
                /* Also the target of the "goto nodefer" further down. */
nodefer:        kfree_skb_napi_cache(skb);
                return;
        }

        DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
        DEBUG_NET_WARN_ON_ONCE(skb->destructor);

        sd = &per_cpu(softnet_data, cpu);
        defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
        /* Lockless check; defer_count is rewritten under defer_lock below. */
        if (READ_ONCE(sd->defer_count) >= defer_max)
                goto nodefer;

        spin_lock_bh(&sd->defer_lock);
        /* Send an IPI every time queue reaches half capacity. */
        kick = sd->defer_count == (defer_max >> 1);
        /* Paired with the READ_ONCE() few lines above */
        WRITE_ONCE(sd->defer_count, sd->defer_count + 1);

        /* Push the skb at the head of the remote cpu's defer list. */
        skb->next = sd->defer_list;
        /* Paired with READ_ONCE() in skb_defer_free_flush() */
        WRITE_ONCE(sd->defer_list, skb);
        spin_unlock_bh(&sd->defer_lock);

        /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
         * if we are unlucky enough (this seems very unlikely).
         */
        if (unlikely(kick))
                kick_defer_list_purge(sd, cpu);
}

/* Fold the checksum of @len bytes at @offset within @page into skb->csum.
 *
 * The page is mapped only for the duration of the csum_partial() call; the
 * partial sum is added at block position skb->len.
 */
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
                                 size_t offset, size_t len)
{
        const char *vaddr = kmap_local_page(page);
        __wsum partial = csum_partial(vaddr + offset, len, 0);

        kunmap_local(vaddr);

        skb->csum = csum_block_add(skb->csum, partial, skb->len);
}

/**
 * skb_splice_from_iter - Splice (or copy) pages to skbuff
 * @skb: The buffer to add pages to
 * @iter: Iterator representing the pages to be added
 * @maxsize: Maximum amount of data (in bytes) to be added
 * @gfp: Allocation flags
 *
 * This is a common helper function for supporting MSG_SPLICE_PAGES.  It
 * extracts pages from an iterator and adds them to the socket buffer if
 * possible, copying them to fragments if not possible (such as if they're slab
 * pages).
 *
 * When skb->ip_summed is CHECKSUM_NONE, skb->csum is updated for every
 * part that is appended. The skb length is advanced by the amount spliced
 * before returning, on both the success and the error paths.
 *
 * Returns the amount of data spliced/copied or -EMSGSIZE if there's
 * insufficient space in the buffer to transfer anything.
 */
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
                             ssize_t maxsize, gfp_t gfp)
{
        size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
        struct page *pages[8], **ppages = pages;
        ssize_t spliced = 0, ret = 0;
        unsigned int i;

        while (iter->count > 0) {
                ssize_t space, nr, len;
                size_t off;

                ret = -EMSGSIZE;
                space = frag_limit - skb_shinfo(skb)->nr_frags;
                if (space < 0)
                        break;

                /* We might be able to coalesce without increasing nr_frags */
                nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));

                /* len is the byte count covered by the extracted pages;
                 * off is the starting offset within the first of them.
                 */
                len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
                if (len <= 0) {
                        ret = len ?: -EIO;
                        break;
                }

                i = 0;
                do {
                        struct page *page = pages[i++];
                        /* At most up to the end of the current page. */
                        size_t part = min_t(size_t, PAGE_SIZE - off, len);

                        ret = -EIO;
                        if (WARN_ON_ONCE(!sendpage_ok(page)))
                                goto out;

                        ret = skb_append_pagefrags(skb, page, off, part,
                                                   frag_limit);
                        if (ret < 0) {
                                /* Hand back the bytes not yet appended. */
                                iov_iter_revert(iter, len);
                                goto out;
                        }

                        if (skb->ip_summed == CHECKSUM_NONE)
                                skb_splice_csum_page(skb, page, off, part);

                        /* Later pages of this batch start at offset 0. */
                        off = 0;
                        spliced += part;
                        maxsize -= part;
                        len -= part;
                } while (len > 0);

                if (maxsize <= 0)
                        break;
        }

out:
        skb_len_add(skb, spliced);
        /* Partial progress wins over a later error code. */
        return spliced ?: ret;
}
EXPORT_SYMBOL(skb_splice_from_iter);

/* Iterator step for kernel memory: copy @len bytes from @iter_from to
 * @to + @progress while checksumming, and fold the result into the running
 * checksum that @priv2 points to, at block position @progress.
 * Always returns 0.
 */
static __always_inline
size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
                             size_t len, void *to, void *priv2)
{
        __wsum *sum = priv2;
        __wsum chunk;

        chunk = csum_partial_copy_nocheck(iter_from, to + progress, len);
        *sum = csum_block_add(*sum, chunk, progress);

        return 0;
}

/* Iterator step for user memory: copy @len bytes from @iter_from to
 * @to + @progress while checksumming, and fold the result into the running
 * checksum that @priv2 points to, at block position @progress.
 *
 * Returns 0 when csum_and_copy_from_user() yields a non-zero value and
 * @len when it yields zero.
 * NOTE(review): presumably the return value is the byte count left
 * uncopied, as expected by iterate_and_advance2() - confirm.
 */
static __always_inline
size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
                                size_t len, void *to, void *priv2)
{
        __wsum *sum = priv2;
        __wsum chunk = csum_and_copy_from_user(iter_from, to + progress, len);

        *sum = csum_block_add(*sum, chunk, progress);

        if (chunk)
                return 0;
        return len;
}

/* Copy exactly @bytes bytes from iterator @i to @addr, accumulating their
 * checksum into *@csum.
 *
 * Returns true when all @bytes were copied. Otherwise the iterator is
 * reverted by the amount that was consumed and false is returned. An
 * iterator that is not a data source triggers a one-time warning and
 * returns false without copying anything.
 */
bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
                                  __wsum *csum, struct iov_iter *i)
{
        size_t done;

        if (WARN_ON_ONCE(!i->data_source))
                return false;

        done = iterate_and_advance2(i, bytes, addr, csum,
                                    copy_from_user_iter_csum,
                                    memcpy_from_iter_csum);
        if (unlikely(done != bytes)) {
                /* Short copy: undo the partial advance. */
                iov_iter_revert(i, done);
                return false;
        }

        return true;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 

    3 

    3 


















    3 











































































































































































































































































































































































































































    3 
    3 









    3 
    3 































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        NET3        IP device support routines.
 *
 *        Derived from the IP parts of dev.c 1.0.19
 *                 Authors:        Ross Biro
 *                                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *        Additional Authors:
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *        Changes:
 *                Alexey Kuznetsov:        pa_* fields are replaced with ifaddr
 *                                        lists.
 *                Cyrus Durgin:                updated for kmod
 *                Matthias Andree:        in devinet_ioctl, compare label and
 *                                        address (4.4BSD alias style support),
 *                                        fall back to comparing just the label
 *                                        if no match found.
 */


#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_addr.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include "igmp_internal.h"
#include <linux/slab.h>
#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/kmod.h>
#include <linux/netconf.h>

#include <net/arp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/addrconf.h>

/* Address flags that are cleared from an IPv4 address before it is inserted
 * (see the "&= ~IPV6ONLY_FLAGS" in __inet_insert_ifa()).
 */
#define IPV6ONLY_FLAGS        \
                (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \
                 IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \
                 IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY)

/* Statically initialised devconf instance.  Slots are indexed by
 * IPV4_DEVCONF_<name> - 1; every slot not listed here starts out as zero.
 * NOTE(review): the consumer of this table is outside this part of the
 * file -- presumably it seeds the "all" configuration; confirm there.
 */
static struct ipv4_devconf ipv4_devconf = {
        .data = {
                [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
                [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
                [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
                [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
                [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
                [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,
                [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
        },
};

/* Second statically initialised devconf instance.  It carries the same
 * non-zero entries as ipv4_devconf and additionally enables
 * ACCEPT_SOURCE_ROUTE.
 * NOTE(review): presumably the template behind net->ipv4.devconf_dflt,
 * which inetdev_init() copies into each new in_device -- the hookup is
 * outside this part of the file; confirm there.
 */
static struct ipv4_devconf ipv4_devconf_dflt = {
        .data = {
                [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
                [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
                [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
                [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
                [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
                [IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
                [IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] =  1000 /*ms*/,
                [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
        },
};

/* Access devconf attribute @attr in the default devconf table that @net
 * points to.  @net is parenthesised so that any pointer-valued expression
 * (not just a plain identifier) can be passed safely.
 */
#define IPV4_DEVCONF_DFLT(net, attr) \
        IPV4_DEVCONF((*(net)->ipv4.devconf_dflt), attr)

/* Netlink attribute policy for IPv4 address messages.  It is handed to
 * nlmsg_parse_deprecated() by inet_rtm_deladdr() and inet_validate_rtm().
 * IFA_LABEL is limited to IFNAMSIZ - 1 bytes.
 */
static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
        [IFA_LOCAL]             = { .type = NLA_U32 },
        [IFA_ADDRESS]           = { .type = NLA_U32 },
        [IFA_BROADCAST]         = { .type = NLA_U32 },
        [IFA_LABEL]             = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
        [IFA_CACHEINFO]                = { .len = sizeof(struct ifa_cacheinfo) },
        [IFA_FLAGS]                = { .type = NLA_U32 },
        [IFA_RT_PRIORITY]        = { .type = NLA_U32 },
        [IFA_TARGET_NETNSID]        = { .type = NLA_S32 },
        [IFA_PROTO]                = { .type = NLA_U8 },
};

/* The address hash (net->ipv4.inet_addr_lst) is indexed with
 * IN4_ADDR_HSIZE_SHIFT bits, i.e. it has IN4_ADDR_HSIZE buckets.
 */
#define IN4_ADDR_HSIZE_SHIFT        8
#define IN4_ADDR_HSIZE                (1U << IN4_ADDR_HSIZE_SHIFT)

/* Compute the address-hash bucket index for @addr within @net. */
static u32 inet_addr_hash(const struct net *net, __be32 addr)
{
        u32 mixed;

        /* Mix in the namespace so equal addresses spread across netns. */
        mixed = __ipv4_addr_hash(addr, net_hash_mix(net));

        return hash_32(mixed, IN4_ADDR_HSIZE_SHIFT);
}

/* Add @ifa to @net's address hash, keyed by its local address.
 * RTNL must be held (asserted).
 */
static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
{
        struct hlist_head *bucket;

        bucket = &net->ipv4.inet_addr_lst[inet_addr_hash(net, ifa->ifa_local)];

        ASSERT_RTNL();
        hlist_add_head_rcu(&ifa->addr_lst, bucket);
}

/* Unlink @ifa from the address hash and re-initialise its hash node.
 * RTNL must be held (asserted).
 */
static void inet_hash_remove(struct in_ifaddr *ifa)
{
        ASSERT_RTNL();
        hlist_del_init_rcu(&ifa->addr_lst);
}

/**
 * __ip_dev_find - find the first device with a given source address.
 * @net: the net namespace
 * @addr: the source address
 * @devref: if true, take a reference on the found device
 *
 * The address hash is consulted first.  When it holds no entry for @addr,
 * the local FIB table is tried so that communication over loopback
 * subnets works.
 *
 * If a caller uses devref=false, it should be protected by RCU, or RTNL
 *
 * Return: the matching device, or NULL when none was found.
 */
struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
{
        struct net_device *dev = NULL;
        struct in_ifaddr *ifa;

        rcu_read_lock();

        ifa = inet_lookup_ifaddr_rcu(net, addr);
        if (ifa) {
                dev = ifa->ifa_dev->dev;
        } else {
                struct fib_result res = { 0 };
                struct flowi4 fl4 = { .daddr = addr };
                struct fib_table *local;

                /* No configured address: only accept an RTN_LOCAL route. */
                local = fib_get_table(net, RT_TABLE_LOCAL);
                if (local &&
                    !fib_table_lookup(local, &fl4, &res, FIB_LOOKUP_NOREF) &&
                    res.type == RTN_LOCAL)
                        dev = FIB_RES_DEV(res);
        }

        if (dev && devref)
                dev_hold(dev);

        rcu_read_unlock();

        return dev;
}
EXPORT_SYMBOL(__ip_dev_find);

/* Look up the in_ifaddr whose local address equals @addr in @net's address
 * hash.  Returns NULL when there is none.  Called under RCU lock.
 */
struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr)
{
        struct hlist_head *bucket;
        struct in_ifaddr *ifa;

        bucket = &net->ipv4.inet_addr_lst[inet_addr_hash(net, addr)];

        hlist_for_each_entry_rcu(ifa, bucket, addr_lst) {
                if (ifa->ifa_local != addr)
                        continue;
                return ifa;
        }

        return NULL;
}

/* Forward declaration.  Callers in this file pass an RTM_* event, the
 * address concerned, and the originating netlink header and portid
 * (NULL / 0 when there is no request).
 */
static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);

/* inetaddr_chain is called with NETDEV_UP / NETDEV_DOWN and the in_ifaddr
 * when an address is added or removed.  inetaddr_validator_chain is called
 * with an in_validator_info before a new address is committed.
 */
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
static void inet_del_ifa(struct in_device *in_dev,
                         struct in_ifaddr __rcu **ifap,
                         int destroy);
/* Without CONFIG_SYSCTL the per-device sysctl hooks are no-op stubs. */
#ifdef CONFIG_SYSCTL
static int devinet_sysctl_register(struct in_device *idev);
static void devinet_sysctl_unregister(struct in_device *idev);
#else
static int devinet_sysctl_register(struct in_device *idev)
{
        return 0;
}
static void devinet_sysctl_unregister(struct in_device *idev)
{
}
#endif

/* Allocate a zeroed in_ifaddr bound to @in_dev.
 *
 * A reference on @in_dev is taken on behalf of the new address; it is
 * dropped again in inet_rcu_free_ifa().  Returns NULL if the allocation
 * fails, in which case no reference is taken.
 */

static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev)
{
        struct in_ifaddr *ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT);

        if (ifa) {
                in_dev_hold(in_dev);
                ifa->ifa_dev = in_dev;

                INIT_HLIST_NODE(&ifa->addr_lst);
        }

        return ifa;
}

/* RCU callback: drop the address's reference on its in_device, then free
 * the address itself.
 */
static void inet_rcu_free_ifa(struct rcu_head *head)
{
        struct in_ifaddr *ifa;

        ifa = container_of(head, struct in_ifaddr, rcu_head);

        in_dev_put(ifa->ifa_dev);
        kfree(ifa);
}

/* Release @ifa: the actual teardown happens in inet_rcu_free_ifa(), which
 * is queued here as an RCU callback.
 */
static void inet_free_ifa(struct in_ifaddr *ifa)
{
        /* Our reference to ifa->ifa_dev must be freed ASAP
         * to release the reference to the netdev the same way.
         * in_dev_put() -> in_dev_finish_destroy() -> netdev_put()
         */
        call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa);
}

/* RCU callback: free the in_device together with its mc_hash. */
static void in_dev_free_rcu(struct rcu_head *head)
{
        struct in_device *idev;

        idev = container_of(head, struct in_device, rcu_head);

        kfree(rcu_dereference_protected(idev->mc_hash, 1));
        kfree(idev);
}

/* Final teardown of @idev.
 *
 * Warns if the address or multicast lists are still populated, drops the
 * tracked reference on the net_device, and queues the in_device for freeing
 * after an RCU grace period.  An in_device that was never marked dead is
 * reported with pr_err() and is NOT freed.
 */
void in_dev_finish_destroy(struct in_device *idev)
{
        struct net_device *dev = idev->dev;

        WARN_ON(idev->ifa_list);
        WARN_ON(idev->mc_list);
#ifdef NET_REFCNT_DEBUG
        pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
        netdev_put(dev, &idev->dev_tracker);

        if (idev->dead) {
                call_rcu(&idev->rcu_head, in_dev_free_rcu);
                return;
        }

        pr_err("Freeing alive in_device %p\n", idev);
}
EXPORT_SYMBOL(in_dev_finish_destroy);

/* Create the in_device for @dev and publish it through dev->ip_ptr.
 *
 * The new in_device starts from the namespace's default devconf, gets its
 * own ARP neigh_parms and a tracked reference on @dev.  For every device
 * except blackhole_netdev the sysctl entries are registered and multicast
 * state is initialised (and brought up if the device is IFF_UP).
 *
 * RTNL must be held (asserted).
 *
 * Returns the new in_device, or an ERR_PTR() on failure -- never NULL.
 */
static struct in_device *inetdev_init(struct net_device *dev)
{
        struct in_device *in_dev;
        int err;

        ASSERT_RTNL();

        in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);
        if (!in_dev)
                return ERR_PTR(-ENOMEM);

        memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,
               sizeof(in_dev->cnf));
        in_dev->cnf.sysctl = NULL;
        in_dev->dev = dev;

        in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
        if (!in_dev->arp_parms) {
                /* Nothing else is attached yet, a plain kfree() suffices. */
                kfree(in_dev);
                return ERR_PTR(-ENOMEM);
        }

        if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
                netif_disable_lro(dev);

        /* Reference in_dev->dev */
        netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
        /* Account for reference dev->ip_ptr (below) */
        refcount_set(&in_dev->refcnt, 1);

        if (dev != blackhole_netdev) {
                err = devinet_sysctl_register(in_dev);
                if (err) {
                        /* Mark dead so the final put really frees it. */
                        in_dev->dead = 1;
                        neigh_parms_release(&arp_tbl, in_dev->arp_parms);
                        in_dev_put(in_dev);
                        return ERR_PTR(err);
                }

                ip_mc_init_dev(in_dev);
                if (dev->flags & IFF_UP)
                        ip_mc_up(in_dev);
        }

        /* we can receive as soon as ip_ptr is set -- do this last */
        rcu_assign_pointer(dev->ip_ptr, in_dev);

        return in_dev;
}

/* Tear down @in_dev: mark it dead, destroy its multicast state, delete and
 * free every address, detach it from the device, and drop the reference
 * that dev->ip_ptr held.  RTNL must be held (asserted).
 */
static void inetdev_destroy(struct in_device *in_dev)
{
        struct net_device *dev = in_dev->dev;
        struct in_ifaddr *ifa;

        ASSERT_RTNL();

        in_dev->dead = 1;

        ip_mc_destroy_dev(in_dev);

        /* Always remove the list head until the list is empty. */
        for (;;) {
                ifa = rtnl_dereference(in_dev->ifa_list);
                if (!ifa)
                        break;

                inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
                inet_free_ifa(ifa);
        }

        RCU_INIT_POINTER(dev->ip_ptr, NULL);

        devinet_sysctl_unregister(in_dev);
        neigh_parms_release(&arp_tbl, in_dev->arp_parms);
        arp_ifdown(dev);

        in_dev_put(in_dev);
}

/* Late initcall: attach an in_device to blackhole_netdev.
 *
 * inetdev_init() reports failure with ERR_PTR(), never with NULL, so the
 * result has to be decoded as an error pointer; a NULL test would let
 * every failure pass as success.
 *
 * Returns 0 on success or the negative errno from inetdev_init().
 */
static int __init inet_blackhole_dev_init(void)
{
        struct in_device *in_dev;

        rtnl_lock();
        in_dev = inetdev_init(blackhole_netdev);
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(in_dev);
}
late_initcall(inet_blackhole_dev_init);

/* Return 1 if some address on @in_dev has a subnet that contains @a and,
 * when @b is non-zero, also contains @b; return 0 otherwise.
 */
int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
{
        const struct in_ifaddr *ifa;
        int onlink = 0;

        rcu_read_lock();
        in_dev_for_each_ifa_rcu(ifa, in_dev) {
                if (!inet_ifa_match(a, ifa))
                        continue;
                if (b && !inet_ifa_match(b, ifa))
                        continue;

                onlink = 1;
                break;
        }
        rcu_read_unlock();

        return onlink;
}

/* Remove the address that *@ifap points to from @in_dev.
 *
 * @in_dev:  device the address belongs to
 * @ifap:    list slot (ifa_list head or a predecessor's ifa_next) holding
 *           the address to delete
 * @destroy: when non-zero the deleted address is released with
 *           inet_free_ifa(); otherwise the caller keeps ownership
 * @nlh, @portid: netlink request context forwarded to rtmsg_ifa()
 *
 * If the address is a primary one, its secondaries (same mask, same subnet)
 * are either deleted as well or, when secondary promotion is enabled on the
 * device, the first of them is promoted to primary.  For a device already
 * marked dead the secondary handling is skipped entirely.
 *
 * RTNL must be held (asserted).
 */
static void __inet_del_ifa(struct in_device *in_dev,
                           struct in_ifaddr __rcu **ifap,
                           int destroy, struct nlmsghdr *nlh, u32 portid)
{
        struct in_ifaddr *promote = NULL;
        struct in_ifaddr *ifa, *ifa1;
        struct in_ifaddr __rcu **last_prim;
        struct in_ifaddr *prev_prom = NULL;
        int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);

        ASSERT_RTNL();

        /* ifa1 is the address being deleted. */
        ifa1 = rtnl_dereference(*ifap);
        last_prim = ifap;
        if (in_dev->dead)
                goto no_promotions;

        /* 1. Deleting a primary ifaddr forces deletion of all its
         * secondaries unless alias promotion is set.
         */

        if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
                struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;

                while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
                        /* Remember the slot following the last primary whose
                         * scope value is not below ifa1's; a promoted address
                         * is re-linked there further down.
                         */
                        if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
                            ifa1->ifa_scope <= ifa->ifa_scope)
                                last_prim = &ifa->ifa_next;

                        /* Skip everything that is not a secondary of ifa1. */
                        if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
                            ifa1->ifa_mask != ifa->ifa_mask ||
                            !inet_ifa_match(ifa1->ifa_address, ifa)) {
                                ifap1 = &ifa->ifa_next;
                                prev_prom = ifa;
                                continue;
                        }

                        if (!do_promote) {
                                /* Unlink, announce and free this secondary. */
                                inet_hash_remove(ifa);
                                *ifap1 = ifa->ifa_next;

                                rtmsg_ifa(RTM_DELADDR, ifa, nlh, portid);
                                blocking_notifier_call_chain(&inetaddr_chain,
                                                NETDEV_DOWN, ifa);
                                inet_free_ifa(ifa);
                        } else {
                                /* First matching secondary becomes primary. */
                                promote = ifa;
                                break;
                        }
                }
        }

        /* On promotion all secondaries from subnet are changing
         * the primary IP, we must remove all their routes silently
         * and later to add them back with new prefsrc. Do this
         * while all addresses are on the device list.
         */
        for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
                if (ifa1->ifa_mask == ifa->ifa_mask &&
                    inet_ifa_match(ifa1->ifa_address, ifa))
                        fib_del_ifaddr(ifa, ifa1);
        }

no_promotions:
        /* 2. Unlink it */

        *ifap = ifa1->ifa_next;
        inet_hash_remove(ifa1);

        /* 3. Announce address deletion */

        /* Send message first, then call notifier.
           At first sight, FIB update triggered by notifier
           will refer to already deleted ifaddr, that could confuse
           netlink listeners. It is not true: look, gated sees
           that route deleted and if it still thinks that ifaddr
           is valid, it will try to restore deleted routes... Grr.
           So that, this order is correct.
         */
        rtmsg_ifa(RTM_DELADDR, ifa1, nlh, portid);
        blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);

        if (promote) {
                struct in_ifaddr *next_sec;

                next_sec = rtnl_dereference(promote->ifa_next);
                /* prev_prom is only set when entries were skipped before the
                 * promoted address; in that case move the promoted address
                 * to the slot recorded in last_prim.
                 */
                if (prev_prom) {
                        struct in_ifaddr *last_sec;

                        rcu_assign_pointer(prev_prom->ifa_next, next_sec);

                        last_sec = rtnl_dereference(*last_prim);
                        rcu_assign_pointer(promote->ifa_next, last_sec);
                        rcu_assign_pointer(*last_prim, promote);
                }

                promote->ifa_flags &= ~IFA_F_SECONDARY;
                rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
                blocking_notifier_call_chain(&inetaddr_chain,
                                NETDEV_UP, promote);
                /* Re-add the routes of the remaining secondaries of the
                 * subnet (they were removed by fib_del_ifaddr() above).
                 */
                for (ifa = next_sec; ifa;
                     ifa = rtnl_dereference(ifa->ifa_next)) {
                        if (ifa1->ifa_mask != ifa->ifa_mask ||
                            !inet_ifa_match(ifa1->ifa_address, ifa))
                                        continue;
                        fib_add_ifaddr(ifa);
                }

        }
        if (destroy)
                inet_free_ifa(ifa1);
}

/* Delete the address at *@ifap without a netlink request context:
 * __inet_del_ifa() is called with a NULL nlmsghdr and portid 0.
 */
static void inet_del_ifa(struct in_device *in_dev,
                         struct in_ifaddr __rcu **ifap,
                         int destroy)
{
        __inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}

/* Link @ifa into the address list of its device and announce it.
 *
 * The address becomes a secondary if the device already has an address in
 * the same subnet (same mask, matching prefix); otherwise it is inserted
 * as a primary after the last primary whose scope value is not below its
 * own.  @nlh and @portid are forwarded to rtmsg_ifa(); @extack receives
 * error details.
 *
 * On every failure @ifa is released with inet_free_ifa().
 *
 * Returns 0 on success, -EEXIST if the same local address already exists in
 * that subnet, -EINVAL on a scope mismatch within the subnet, or the error
 * reported by the validator chain.  RTNL must be held (asserted).
 */
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
                             u32 portid, struct netlink_ext_ack *extack)
{
        struct in_ifaddr __rcu **primary_slot, **slot;
        struct in_device *in_dev = ifa->ifa_dev;
        struct net *net = dev_net(in_dev->dev);
        struct in_validator_info ivi;
        struct in_ifaddr *cur;
        int err;

        ASSERT_RTNL();

        /* Assume primary until a same-subnet address is found below. */
        ifa->ifa_flags &= ~IFA_F_SECONDARY;
        primary_slot = &in_dev->ifa_list;

        /* Don't set IPv6 only flags to IPv4 addresses */
        ifa->ifa_flags &= ~IPV6ONLY_FLAGS;

        /* When the loop ends, slot is the tail slot of the list. */
        for (slot = &in_dev->ifa_list;
             (cur = rtnl_dereference(*slot)) != NULL;
             slot = &cur->ifa_next) {
                if (!(cur->ifa_flags & IFA_F_SECONDARY) &&
                    ifa->ifa_scope <= cur->ifa_scope)
                        primary_slot = &cur->ifa_next;

                if (cur->ifa_mask != ifa->ifa_mask ||
                    !inet_ifa_match(cur->ifa_address, ifa))
                        continue;

                /* Same subnet as an existing address. */
                if (cur->ifa_local == ifa->ifa_local) {
                        err = -EEXIST;
                        goto free_ifa;
                }
                if (cur->ifa_scope != ifa->ifa_scope) {
                        NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value");
                        err = -EINVAL;
                        goto free_ifa;
                }
                ifa->ifa_flags |= IFA_F_SECONDARY;
        }

        /* Allow any devices that wish to register ifaddr validators to weigh
         * in now, before changes are committed.  The rtnl lock is serializing
         * access here, so the state should not change between a validator call
         * and a final notify on commit.  This isn't invoked on promotion under
         * the assumption that validators are checking the address itself, and
         * not the flags.
         */
        ivi.ivi_addr = ifa->ifa_address;
        ivi.ivi_dev = ifa->ifa_dev;
        ivi.extack = extack;
        err = notifier_to_errno(
                blocking_notifier_call_chain(&inetaddr_validator_chain,
                                             NETDEV_UP, &ivi));
        if (err)
                goto free_ifa;

        /* Primaries go behind the last suitable primary, secondaries at
         * the tail.
         */
        if (!(ifa->ifa_flags & IFA_F_SECONDARY))
                slot = primary_slot;

        rcu_assign_pointer(ifa->ifa_next, *slot);
        rcu_assign_pointer(*slot, ifa);

        inet_hash_insert(net, ifa);

        /* Kick the lifetime checker so it sees the new address right away. */
        cancel_delayed_work(&net->ipv4.addr_chk_work);
        queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0);

        /* Send message first, then call notifier.
         * Notifier will trigger FIB update, so that
         * listeners of netlink will know about new ifaddr
         */
        rtmsg_ifa(RTM_NEWADDR, ifa, nlh, portid);
        blocking_notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);

        return 0;

free_ifa:
        inet_free_ifa(ifa);
        return err;
}

/* Insert @ifa without netlink context.  An address whose local part is zero
 * is not inserted: it is released and 0 is returned.
 */
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
        if (ifa->ifa_local)
                return __inet_insert_ifa(ifa, NULL, 0, NULL);

        inet_free_ifa(ifa);
        return 0;
}

/* Insert @ifa on @dev after marking the device's devconf and ARP parameters
 * via their *_setall() helpers.  Loopback addresses are forced to host
 * scope.  Returns the result of inet_insert_ifa().
 */
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
        struct in_device *in_dev;

        in_dev = __in_dev_get_rtnl_net(dev);

        ipv4_devconf_setall(in_dev);
        neigh_parms_data_state_setall(in_dev->arp_parms);

        if (ipv4_is_loopback(ifa->ifa_local))
                ifa->ifa_scope = RT_SCOPE_HOST;

        return inet_insert_ifa(ifa);
}

/* Return the in_device of the interface with index @ifindex in @net, or
 * NULL if the interface does not exist or has no in_device.
 *
 * Caller must hold RCU or RTNL :
 * We dont take a reference on found in_device
 */
struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
        struct in_device *in_dev;
        struct net_device *dev;

        rcu_read_lock();
        dev = dev_get_by_index_rcu(net, ifindex);
        in_dev = dev ? rcu_dereference_rtnl(dev->ip_ptr) : NULL;
        rcu_read_unlock();

        return in_dev;
}
EXPORT_SYMBOL(inetdev_by_index);

/* Find the first address on @in_dev whose mask equals @mask and whose subnet
 * contains @prefix.  Returns NULL if there is none.
 *
 * Called only from RTNL semaphored context. No locks.
 */

struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
                                    __be32 mask)
{
        struct in_ifaddr *ifa;

        ASSERT_RTNL();

        in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                if (ifa->ifa_mask != mask)
                        continue;
                if (inet_ifa_match(prefix, ifa))
                        return ifa;
        }

        return NULL;
}

/* Join (@join true) or leave (@join false) the multicast group given by
 * @ifa's address on @ifa's interface, using the namespace's mc_autojoin_sk
 * socket.
 *
 * Returns the result of the join/leave call, or -EOPNOTSUPP when the kernel
 * is built without CONFIG_IP_MULTICAST.
 */
static int ip_mc_autojoin_config(struct net *net, bool join,
                                 const struct in_ifaddr *ifa)
{
#if defined(CONFIG_IP_MULTICAST)
        struct sock *sk = net->ipv4.mc_autojoin_sk;
        struct ip_mreqn mreq = {
                .imr_ifindex = ifa->ifa_dev->dev->ifindex,
                .imr_multiaddr.s_addr = ifa->ifa_address,
        };
        int ret;

        ASSERT_RTNL_NET(net);

        lock_sock(sk);
        ret = join ? ip_mc_join_group(sk, &mreq) :
                     ip_mc_leave_group(sk, &mreq);
        release_sock(sk);

        return ret;
#else
        return -EOPNOTSUPP;
#endif
}

/* Handle a netlink request to delete an IPv4 address.
 *
 * The first address on the requested interface that satisfies every
 * supplied selector (IFA_LOCAL, IFA_LABEL, IFA_ADDRESS + prefix length) is
 * deleted.  Selectors that are absent match everything.
 *
 * Returns 0 on success, a negative parse error, -ENODEV if the interface
 * has no in_device, or -EADDRNOTAVAIL if no address matched.
 */
static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFA_MAX+1];
        struct in_ifaddr __rcu **ifap;
        struct in_device *in_dev;
        struct ifaddrmsg *ifm;
        struct in_ifaddr *ifa;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                     ifa_ipv4_policy, extack);
        if (err < 0)
                return err;

        ifm = nlmsg_data(nlh);

        rtnl_net_lock(net);

        in_dev = inetdev_by_index(net, ifm->ifa_index);
        if (!in_dev) {
                NL_SET_ERR_MSG(extack, "ipv4: Device not found");
                err = -ENODEV;
                goto unlock;
        }

        for (ifap = &in_dev->ifa_list;
             (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
             ifap = &ifa->ifa_next) {
                if (tb[IFA_LOCAL]) {
                        if (nla_get_in_addr(tb[IFA_LOCAL]) != ifa->ifa_local)
                                continue;
                }

                if (tb[IFA_LABEL]) {
                        if (nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
                                continue;
                }

                if (tb[IFA_ADDRESS]) {
                        if (ifm->ifa_prefixlen != ifa->ifa_prefixlen)
                                continue;
                        if (!inet_ifa_match(nla_get_in_addr(tb[IFA_ADDRESS]),
                                            ifa))
                                continue;
                }

                /* Leave the group of a multicast address; the result is
                 * deliberately not checked.
                 */
                if (ipv4_is_multicast(ifa->ifa_address))
                        ip_mc_autojoin_config(net, false, ifa);

                __inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
                goto unlock;
        }

        NL_SET_ERR_MSG(extack, "ipv4: Address not found");
        err = -EADDRNOTAVAIL;
unlock:
        rtnl_net_unlock(net);
        return err;
}

/* Delayed-work handler for net->ipv4.addr_chk_work.
 *
 * Walks every bucket of the namespace's address hash.  Each bucket is first
 * scanned under rcu_read_lock() only, to find out whether anything has to
 * change and to compute the next wake-up time.  Only if a change is needed
 * is the bucket rescanned under rtnl_net_lock(): addresses past their valid
 * lifetime are deleted, addresses past their preferred lifetime are flagged
 * IFA_F_DEPRECATED and announced.  Addresses with IFA_F_PERMANENT are
 * skipped.  Finally the work re-queues itself.
 */
static void check_lifetime(struct work_struct *work)
{
        unsigned long now, next, next_sec, next_sched;
        struct in_ifaddr *ifa;
        struct hlist_node *n;
        struct net *net;
        int i;

        net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work);
        now = jiffies;
        /* Latest time at which the next check has to run. */
        next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);

        for (i = 0; i < IN4_ADDR_HSIZE; i++) {
                struct hlist_head *head = &net->ipv4.inet_addr_lst[i];
                bool change_needed = false;

                /* Pass 1: read-only scan, fields sampled with READ_ONCE(). */
                rcu_read_lock();
                hlist_for_each_entry_rcu(ifa, head, addr_lst) {
                        unsigned long age, tstamp;
                        u32 preferred_lft;
                        u32 valid_lft;
                        u32 flags;

                        flags = READ_ONCE(ifa->ifa_flags);
                        if (flags & IFA_F_PERMANENT)
                                continue;

                        preferred_lft = READ_ONCE(ifa->ifa_preferred_lft);
                        valid_lft = READ_ONCE(ifa->ifa_valid_lft);
                        tstamp = READ_ONCE(ifa->ifa_tstamp);
                        /* We try to batch several events at once. */
                        age = (now - tstamp +
                               ADDRCONF_TIMER_FUZZ_MINUS) / HZ;

                        if (valid_lft != INFINITY_LIFE_TIME &&
                            age >= valid_lft) {
                                /* Expired: must be deleted in pass 2. */
                                change_needed = true;
                        } else if (preferred_lft ==
                                   INFINITY_LIFE_TIME) {
                                continue;
                        } else if (age >= preferred_lft) {
                                /* Past preferred: wake up no later than the
                                 * end of the valid lifetime.
                                 */
                                if (time_before(tstamp + valid_lft * HZ, next))
                                        next = tstamp + valid_lft * HZ;

                                if (!(flags & IFA_F_DEPRECATED))
                                        change_needed = true;
                        } else if (time_before(tstamp + preferred_lft * HZ,
                                               next)) {
                                /* Still preferred: wake up when that ends. */
                                next = tstamp + preferred_lft * HZ;
                        }
                }
                rcu_read_unlock();
                if (!change_needed)
                        continue;

                /* Pass 2: apply the changes under the per-netns RTNL lock.
                 * The _safe iterator is used because entries get deleted.
                 */
                rtnl_net_lock(net);
                hlist_for_each_entry_safe(ifa, n, head, addr_lst) {
                        unsigned long age;

                        if (ifa->ifa_flags & IFA_F_PERMANENT)
                                continue;

                        /* We try to batch several events at once. */
                        age = (now - ifa->ifa_tstamp +
                               ADDRCONF_TIMER_FUZZ_MINUS) / HZ;

                        if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
                            age >= ifa->ifa_valid_lft) {
                                struct in_ifaddr __rcu **ifap;
                                struct in_ifaddr *tmp;

                                /* inet_del_ifa() needs the list slot that
                                 * holds ifa, so walk the device list to
                                 * find it.
                                 */
                                ifap = &ifa->ifa_dev->ifa_list;
                                tmp = rtnl_net_dereference(net, *ifap);
                                while (tmp) {
                                        if (tmp == ifa) {
                                                inet_del_ifa(ifa->ifa_dev,
                                                             ifap, 1);
                                                break;
                                        }
                                        ifap = &tmp->ifa_next;
                                        tmp = rtnl_net_dereference(net, *ifap);
                                }
                        } else if (ifa->ifa_preferred_lft !=
                                   INFINITY_LIFE_TIME &&
                                   age >= ifa->ifa_preferred_lft &&
                                   !(ifa->ifa_flags & IFA_F_DEPRECATED)) {
                                ifa->ifa_flags |= IFA_F_DEPRECATED;
                                rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
                        }
                }
                rtnl_net_unlock(net);
        }

        next_sec = round_jiffies_up(next);
        next_sched = next;

        /* If rounded timeout is accurate enough, accept it. */
        if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
                next_sched = next_sec;

        now = jiffies;
        /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
        if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
                next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;

        queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work,
                           next_sched - now);
}

/* Apply the lifetimes @valid_lft / @prefered_lft to @ifa.
 *
 * IFA_F_PERMANENT and IFA_F_DEPRECATED are recomputed: a non-finite valid
 * lifetime makes the address permanent (ifa_valid_lft is left untouched), a
 * finite preferred lifetime of zero marks it deprecated.  A non-finite
 * preferred lifetime leaves ifa_preferred_lft untouched.  The timestamp is
 * refreshed, and the creation stamp is set if it was still zero.
 */
static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
                             __u32 prefered_lft)
{
        u32 new_flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
        unsigned long valid_tmo = addrconf_timeout_fixup(valid_lft, HZ);
        unsigned long pref_tmo = addrconf_timeout_fixup(prefered_lft, HZ);

        if (!addrconf_finite_timeout(valid_tmo))
                new_flags |= IFA_F_PERMANENT;
        else
                WRITE_ONCE(ifa->ifa_valid_lft, valid_tmo);

        if (addrconf_finite_timeout(pref_tmo)) {
                if (!pref_tmo)
                        new_flags |= IFA_F_DEPRECATED;
                WRITE_ONCE(ifa->ifa_preferred_lft, pref_tmo);
        }

        WRITE_ONCE(ifa->ifa_flags, new_flags);
        WRITE_ONCE(ifa->ifa_tstamp, jiffies);
        if (!ifa->ifa_cstamp)
                WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp);
}

/* Parse and sanity-check a netlink address request.
 *
 * Fills @tb from @nlh using ifa_ipv4_policy, rejects prefix lengths above 32
 * and requests without IFA_LOCAL.  If IFA_CACHEINFO is present, its
 * lifetimes are validated and stored in *@valid_lft / *@prefered_lft;
 * otherwise both are left unchanged.
 *
 * Returns 0 on success, a negative parse error, or -EINVAL.
 */
static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb,
                             struct netlink_ext_ack *extack,
                             __u32 *valid_lft, __u32 *prefered_lft)
{
        const struct ifa_cacheinfo *ci;
        struct ifaddrmsg *ifm;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                     ifa_ipv4_policy, extack);
        if (err < 0)
                return err;

        ifm = nlmsg_data(nlh);
        if (ifm->ifa_prefixlen > 32) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length");
                return -EINVAL;
        }

        if (!tb[IFA_LOCAL]) {
                NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied");
                return -EINVAL;
        }

        if (!tb[IFA_CACHEINFO])
                return 0;

        /* A zero valid lifetime, or preferred > valid, is rejected. */
        ci = nla_data(tb[IFA_CACHEINFO]);
        if (ci->ifa_valid == 0 || ci->ifa_valid < ci->ifa_prefered) {
                NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid");
                return -EINVAL;
        }

        *valid_lft = ci->ifa_valid;
        *prefered_lft = ci->ifa_prefered;

        return 0;
}

/* Build a new in_ifaddr from an already parsed netlink request.
 *
 * @tb must contain IFA_LOCAL.  If IFA_ADDRESS is missing, tb[IFA_ADDRESS]
 * is set to tb[IFA_LOCAL] -- note that this modifies the caller's table.
 * Without IFA_LABEL the device name is used as label.
 *
 * Returns the new address, ERR_PTR(-ENODEV) if the interface does not
 * exist, or ERR_PTR(-ENOBUFS) if it has no in_device or the allocation
 * fails.
 */
static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh,
                                         struct nlattr **tb,
                                         struct netlink_ext_ack *extack)
{
        struct ifaddrmsg *ifm = nlmsg_data(nlh);
        struct in_device *in_dev;
        struct net_device *dev;
        struct in_ifaddr *ifa;

        dev = __dev_get_by_index(net, ifm->ifa_index);
        if (!dev) {
                NL_SET_ERR_MSG(extack, "ipv4: Device not found");
                return ERR_PTR(-ENODEV);
        }

        in_dev = __in_dev_get_rtnl_net(dev);
        if (!in_dev)
                return ERR_PTR(-ENOBUFS);

        /* On allocation failure in_dev is left as it is; it stays assigned
         * to its device.
         */
        ifa = inet_alloc_ifa(in_dev);
        if (!ifa)
                return ERR_PTR(-ENOBUFS);

        ipv4_devconf_setall(in_dev);
        neigh_parms_data_state_setall(in_dev->arp_parms);

        if (!tb[IFA_ADDRESS])
                tb[IFA_ADDRESS] = tb[IFA_LOCAL];

        ifa->ifa_prefixlen = ifm->ifa_prefixlen;
        ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
        /* IFA_FLAGS, when present, overrides the flags in the header. */
        ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
        ifa->ifa_scope = ifm->ifa_scope;
        ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
        ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);

        if (tb[IFA_BROADCAST])
                ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);

        if (!tb[IFA_LABEL])
                memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
        else
                nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);

        if (tb[IFA_RT_PRIORITY])
                ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);

        if (tb[IFA_PROTO])
                ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]);

        return ifa;
}

/*
 * Look on ifa's own in_device for an already-configured address with the
 * same mask and local address whose ifa_address lies in ifa's subnet.
 * Returns that entry, or NULL when there is none.
 */
static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa)
{
        struct in_device *in_dev = ifa->ifa_dev;
        struct in_ifaddr *cur;

        in_dev_for_each_ifa_rtnl_net(net, cur, in_dev) {
                if (cur->ifa_mask != ifa->ifa_mask)
                        continue;
                if (!inet_ifa_match(cur->ifa_address, ifa))
                        continue;
                if (cur->ifa_local == ifa->ifa_local)
                        return cur;
        }

        return NULL;
}

/*
 * Netlink handler that adds a new IPv4 address or, when the request carries
 * NLM_F_REPLACE, updates the metric, protocol and lifetimes of a matching
 * existing one.
 *
 * inet_validate_rtm() runs before rtnl_net_lock(net) is taken; a request
 * whose IFA_LOCAL is zero returns 0 without doing anything.
 *
 * Returns a negative errno on failure: whatever inet_validate_rtm(),
 * inet_rtm_to_ifa() or ip_mc_autojoin_config() reported, or -EEXIST when a
 * matching address exists and the request has NLM_F_EXCL set or lacks
 * NLM_F_REPLACE. For a new address the result of __inet_insert_ifa() is
 * returned; for a replaced address the (non-negative) value left in ret by
 * inet_validate_rtm() is returned.
 */
static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
                            struct netlink_ext_ack *extack)
{
        __u32 prefered_lft = INFINITY_LIFE_TIME;
        __u32 valid_lft = INFINITY_LIFE_TIME;
        struct net *net = sock_net(skb->sk);
        struct in_ifaddr *ifa_existing;
        struct nlattr *tb[IFA_MAX + 1];
        struct in_ifaddr *ifa;
        int ret;

        ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft);
        if (ret < 0)
                return ret;

        /* A zero local address is accepted but ignored. */
        if (!nla_get_in_addr(tb[IFA_LOCAL]))
                return 0;

        rtnl_net_lock(net);

        ifa = inet_rtm_to_ifa(net, nlh, tb, extack);
        if (IS_ERR(ifa)) {
                ret = PTR_ERR(ifa);
                goto unlock;
        }

        ifa_existing = find_matching_ifa(net, ifa);
        if (!ifa_existing) {
                /* It would be best to check for !NLM_F_CREATE here but
                 * userspace already relies on not having to provide this.
                 */
                set_ifa_lifetime(ifa, valid_lft, prefered_lft);
                if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
                        ret = ip_mc_autojoin_config(net, true, ifa);
                        if (ret < 0) {
                                NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed");
                                /* Not inserted yet, so it is released here. */
                                inet_free_ifa(ifa);
                                goto unlock;
                        }
                }

                /* NOTE(review): ifa is not freed here on failure; presumably
                 * __inet_insert_ifa() owns it from this point - confirm.
                 */
                ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack);
        } else {
                /* Keep the two requested values, then drop the temporary
                 * ifa: only the existing entry is modified from here on.
                 */
                u32 new_metric = ifa->ifa_rt_priority;
                u8 new_proto = ifa->ifa_proto;

                inet_free_ifa(ifa);

                if (nlh->nlmsg_flags & NLM_F_EXCL ||
                    !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
                        NL_SET_ERR_MSG(extack, "ipv4: Address already assigned");
                        ret = -EEXIST;
                        goto unlock;
                }
                ifa = ifa_existing;

                if (ifa->ifa_rt_priority != new_metric) {
                        fib_modify_prefix_metric(ifa, new_metric);
                        ifa->ifa_rt_priority = new_metric;
                }

                ifa->ifa_proto = new_proto;

                set_ifa_lifetime(ifa, valid_lft, prefered_lft);
                /* Re-queue the per-netns address check work with no delay. */
                cancel_delayed_work(&net->ipv4.addr_chk_work);
                queue_delayed_work(system_power_efficient_wq,
                                   &net->ipv4.addr_chk_work, 0);
                rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
        }

unlock:
        rtnl_net_unlock(net);

        return ret;
}

/*
 *        Derive the default (classful) prefix length from an IPv4 address.
 *
 *        Returns 0 for zeronet and limited-broadcast addresses, 8/16/24 for
 *        class A/B/C, 32 for class E, and -1 for anything else.
 */

static int inet_abc_len(__be32 addr)
{
        __u32 haddr;

        if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
                return 0;

        haddr = ntohl(addr);

        if (IN_CLASSA(haddr))
                return 8;
        if (IN_CLASSB(haddr))
                return 16;
        if (IN_CLASSC(haddr))
                return 24;
        if (IN_CLASSE(haddr))
                return 32;

        return -1;        /* Something else, probably a multicast. */
}


/*
 * Handle the IPv4 interface-address ioctls.
 *
 * @net: namespace the device name is resolved in
 * @cmd: one of SIOCGIF{ADDR,BRDADDR,DSTADDR,NETMASK}, SIOCSIFFLAGS or
 *       SIOCSIF{ADDR,BRDADDR,DSTADDR,NETMASK}; anything else gets -EINVAL
 * @ifr: request; ifr_name selects the device (optionally "name:alias"),
 *       ifr_addr is read as a struct sockaddr_in for the set commands and
 *       overwritten with the result for the get commands
 *
 * The set commands and SIOCSIFFLAGS require CAP_NET_ADMIN in net->user_ns
 * (-EPERM otherwise); the four address set commands also require
 * sin_family == AF_INET (-EINVAL otherwise).
 *
 * Returns 0 on success or a negative errno, notably -ENODEV when the device
 * does not exist and -EADDRNOTAVAIL when no address matches and the command
 * needs one.
 */
int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
{
        struct sockaddr_in sin_orig;
        struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
        struct in_ifaddr __rcu **ifap = NULL;
        struct in_device *in_dev;
        struct in_ifaddr *ifa = NULL;
        struct net_device *dev;
        char *colon;
        int ret = -EFAULT;
        int tryaddrmatch = 0;

        /* Force termination so the string helpers below stay in bounds. */
        ifr->ifr_name[IFNAMSIZ - 1] = 0;

        /* save original address for comparison */
        memcpy(&sin_orig, sin, sizeof(*sin));

        /* Temporarily cut off any ":alias" part so the bare device name can
         * be looked up; the colon is restored once the device is found.
         */
        colon = strchr(ifr->ifr_name, ':');
        if (colon)
                *colon = 0;

        dev_load(net, ifr->ifr_name);

        switch (cmd) {
        case SIOCGIFADDR:        /* Get interface address */
        case SIOCGIFBRDADDR:        /* Get the broadcast address */
        case SIOCGIFDSTADDR:        /* Get the destination address */
        case SIOCGIFNETMASK:        /* Get the netmask for the interface */
                /* Note that these ioctls will not sleep,
                   so that we do not impose a lock.
                   One day we will be forced to put shlock here (I mean SMP)
                 */
                /* NOTE(review): rtnl_net_lock() is taken below for every
                 * command, so the remark above looks historical.
                 */
                tryaddrmatch = (sin_orig.sin_family == AF_INET);
                memset(sin, 0, sizeof(*sin));
                sin->sin_family = AF_INET;
                break;

        case SIOCSIFFLAGS:
                ret = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto out;
                break;
        case SIOCSIFADDR:        /* Set interface address (and family) */
        case SIOCSIFBRDADDR:        /* Set the broadcast address */
        case SIOCSIFDSTADDR:        /* Set the destination address */
        case SIOCSIFNETMASK:         /* Set the netmask for the interface */
                ret = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto out;
                ret = -EINVAL;
                if (sin->sin_family != AF_INET)
                        goto out;
                break;
        default:
                ret = -EINVAL;
                goto out;
        }

        rtnl_net_lock(net);

        ret = -ENODEV;
        dev = __dev_get_by_name(net, ifr->ifr_name);
        if (!dev)
                goto done;

        /* Restore the full label for the ifa_label comparisons below. */
        if (colon)
                *colon = ':';

        in_dev = __in_dev_get_rtnl_net(dev);
        if (in_dev) {
                if (tryaddrmatch) {
                        /* Matthias Andree */
                        /* compare label and address (4.4BSD style) */
                        /* note: we only do this for a limited set of ioctls
                           and only if the original address family was AF_INET.
                           This is checked above. */

                        for (ifap = &in_dev->ifa_list;
                             (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
                             ifap = &ifa->ifa_next) {
                                if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
                                    sin_orig.sin_addr.s_addr ==
                                                        ifa->ifa_local) {
                                        break; /* found */
                                }
                        }
                }
                /* we didn't get a match, maybe the application is
                   4.3BSD-style and passed in junk so we fall back to
                   comparing just the label */
                if (!ifa) {
                        for (ifap = &in_dev->ifa_list;
                             (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
                             ifap = &ifa->ifa_next)
                                if (!strcmp(ifr->ifr_name, ifa->ifa_label))
                                        break;
                }
        }

        /* Only SIOCSIFADDR (which can create the address) and SIOCSIFFLAGS
         * (which can act on the device itself) may continue without a match.
         */
        ret = -EADDRNOTAVAIL;
        if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
                goto done;

        switch (cmd) {
        case SIOCGIFADDR:        /* Get interface address */
                ret = 0;
                sin->sin_addr.s_addr = ifa->ifa_local;
                break;

        case SIOCGIFBRDADDR:        /* Get the broadcast address */
                ret = 0;
                sin->sin_addr.s_addr = ifa->ifa_broadcast;
                break;

        case SIOCGIFDSTADDR:        /* Get the destination address */
                ret = 0;
                sin->sin_addr.s_addr = ifa->ifa_address;
                break;

        case SIOCGIFNETMASK:        /* Get the netmask for the interface */
                ret = 0;
                sin->sin_addr.s_addr = ifa->ifa_mask;
                break;

        case SIOCSIFFLAGS:
                /* With an alias label the request targets that one address:
                 * clearing IFF_UP deletes it, any other flag value is a
                 * successful no-op. Without a colon it targets the device.
                 */
                if (colon) {
                        ret = -EADDRNOTAVAIL;
                        if (!ifa)
                                break;
                        ret = 0;
                        if (!(ifr->ifr_flags & IFF_UP))
                                inet_del_ifa(in_dev, ifap, 1);
                        break;
                }

                /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */
                ASSERT_RTNL();
                ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
                break;

        case SIOCSIFADDR:        /* Set interface address (and family) */
                ret = -EINVAL;
                if (inet_abc_len(sin->sin_addr.s_addr) < 0)
                        break;

                if (!ifa) {
                        /* No address under this label yet: create one. */
                        ret = -ENOBUFS;
                        if (!in_dev)
                                break;
                        ifa = inet_alloc_ifa(in_dev);
                        if (!ifa)
                                break;

                        if (colon)
                                memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ);
                        else
                                memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
                } else {
                        /* Same address already set: nothing to do. Otherwise
                         * unlink the entry so it can be modified and set
                         * again below (the final 0 presumably means "do not
                         * free", since ifa is reused - confirm).
                         */
                        ret = 0;
                        if (ifa->ifa_local == sin->sin_addr.s_addr)
                                break;
                        inet_del_ifa(in_dev, ifap, 0);
                        ifa->ifa_broadcast = 0;
                        ifa->ifa_scope = 0;
                }

                ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;

                /* Non point-to-point devices get the classful default prefix
                 * and, on broadcast devices with a prefix shorter than /31,
                 * a derived broadcast address; point-to-point gets /32.
                 */
                if (!(dev->flags & IFF_POINTOPOINT)) {
                        ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
                        ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
                        if ((dev->flags & IFF_BROADCAST) &&
                            ifa->ifa_prefixlen < 31)
                                ifa->ifa_broadcast = ifa->ifa_address |
                                                     ~ifa->ifa_mask;
                } else {
                        ifa->ifa_prefixlen = 32;
                        ifa->ifa_mask = inet_make_mask(32);
                }
                set_ifa_lifetime(ifa, INFINITY_LIFE_TIME, INFINITY_LIFE_TIME);
                ret = inet_set_ifa(dev, ifa);
                break;

        case SIOCSIFBRDADDR:        /* Set the broadcast address */
                ret = 0;
                if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
                        /* Unlink, modify, reinsert. */
                        inet_del_ifa(in_dev, ifap, 0);
                        ifa->ifa_broadcast = sin->sin_addr.s_addr;
                        inet_insert_ifa(ifa);
                }
                break;

        case SIOCSIFDSTADDR:        /* Set the destination address */
                ret = 0;
                if (ifa->ifa_address == sin->sin_addr.s_addr)
                        break;
                ret = -EINVAL;
                if (inet_abc_len(sin->sin_addr.s_addr) < 0)
                        break;
                ret = 0;
                inet_del_ifa(in_dev, ifap, 0);
                ifa->ifa_address = sin->sin_addr.s_addr;
                inet_insert_ifa(ifa);
                break;

        case SIOCSIFNETMASK:         /* Set the netmask for the interface */

                /*
                 *        The mask we set must be legal.
                 */
                ret = -EINVAL;
                if (bad_mask(sin->sin_addr.s_addr, 0))
                        break;
                ret = 0;
                if (ifa->ifa_mask != sin->sin_addr.s_addr) {
                        __be32 old_mask = ifa->ifa_mask;
                        inet_del_ifa(in_dev, ifap, 0);
                        ifa->ifa_mask = sin->sin_addr.s_addr;
                        ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);

                        /* See if current broadcast address matches
                         * with current netmask, then recalculate
                         * the broadcast address. Otherwise it's a
                         * funny address, so don't touch it since
                         * the user seems to know what (s)he's doing...
                         */
                        if ((dev->flags & IFF_BROADCAST) &&
                            (ifa->ifa_prefixlen < 31) &&
                            (ifa->ifa_broadcast ==
                             (ifa->ifa_local|~old_mask))) {
                                ifa->ifa_broadcast = (ifa->ifa_local |
                                                      ~sin->sin_addr.s_addr);
                        }
                        inet_insert_ifa(ifa);
                }
                break;
        }
done:
        rtnl_net_unlock(net);
out:
        return ret;
}

/*
 * Report the IPv4 addresses of @dev as an array of ifreq-like records.
 *
 * @buf:  user buffer, or NULL to only compute the space required
 * @len:  bytes available in @buf
 * @size: bytes written per record (must not exceed sizeof(struct ifreq))
 *
 * Each record carries the address label in ifr_name and an AF_INET
 * sockaddr_in holding ifa_local. Returns the number of bytes written (or
 * that would be needed when @buf is NULL), 0 when the device has no
 * in_device or @size is too large, or -EFAULT if the copy to user space
 * fails.
 */
int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
{
        struct in_device *in_dev = __in_dev_get_rtnl_net(dev);
        const struct in_ifaddr *ifa;
        struct sockaddr_in *sin;
        struct ifreq ifr;
        int copied = 0;

        if (WARN_ON(size > sizeof(struct ifreq)))
                return 0;

        if (!in_dev)
                return 0;

        in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) {
                /* Sizing pass: just count. */
                if (!buf) {
                        copied += size;
                        continue;
                }

                /* Stop once the next record would not fit. */
                if (len < size)
                        break;

                memset(&ifr, 0, sizeof(struct ifreq));
                strcpy(ifr.ifr_name, ifa->ifa_label);

                sin = (struct sockaddr_in *)&ifr.ifr_addr;
                sin->sin_family = AF_INET;
                sin->sin_addr.s_addr = ifa->ifa_local;

                if (copy_to_user(buf + copied, &ifr, size))
                        return -EFAULT;

                len -= size;
                copied += size;
        }

        return copied;
}

/*
 * Return the local address of the first non-secondary entry on @in_dev whose
 * scope is not RT_SCOPE_LINK and does not exceed @scope, or 0 if there is
 * no such entry.
 */
static __be32 in_dev_select_addr(const struct in_device *in_dev,
                                 int scope)
{
        const struct in_ifaddr *ifa;

        in_dev_for_each_ifa_rcu(ifa, in_dev) {
                if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
                        continue;
                if (ifa->ifa_scope == RT_SCOPE_LINK)
                        continue;
                if (ifa->ifa_scope > scope)
                        continue;

                return ifa->ifa_local;
        }

        return 0;
}

/*
 * Pick a local address to use with @dev.
 *
 * @dev:   device whose addresses are tried first
 * @dst:   destination; an address whose subnet contains it is preferred,
 *         0 accepts the first eligible address
 * @scope: maximum allowed scope value for the returned address
 *
 * Search order: non-secondary addresses of @dev, then the L3 master device
 * (if any), then every device in the namespace sharing the same L3 master
 * index. Returns 0 when nothing qualifies. Runs under rcu_read_lock().
 */
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
        unsigned char localnet_scope = RT_SCOPE_HOST;
        const struct in_ifaddr *ifa;
        struct in_device *in_dev;
        struct net *net;
        __be32 addr = 0;
        int master_idx;

        rcu_read_lock();
        net = dev_net_rcu(dev);
        in_dev = __in_dev_get_rcu(dev);
        if (in_dev) {
                if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
                        localnet_scope = RT_SCOPE_LINK;

                in_dev_for_each_ifa_rcu(ifa, in_dev) {
                        if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
                                continue;
                        if (min(ifa->ifa_scope, localnet_scope) > scope)
                                continue;
                        /* An address on dst's subnet wins outright. */
                        if (!dst || inet_ifa_match(dst, ifa)) {
                                addr = ifa->ifa_local;
                                break;
                        }
                        /* Otherwise remember the first eligible one. */
                        if (!addr)
                                addr = ifa->ifa_local;
                }

                if (addr)
                        goto out_unlock;
        }

        master_idx = l3mdev_master_ifindex_rcu(dev);

        /* For VRFs, the VRF device takes the place of the loopback device,
         * with addresses on it being preferred.  Note in such cases the
         * loopback device will be among the devices that fail the master_idx
         * equality check in the loop below.
         */
        if (master_idx) {
                dev = dev_get_by_index_rcu(net, master_idx);
                in_dev = dev ? __in_dev_get_rcu(dev) : NULL;
                if (in_dev) {
                        addr = in_dev_select_addr(in_dev, scope);
                        if (addr)
                                goto out_unlock;
                }
        }

        /* Non-loopback addresses on loopback should be preferred
         * in this case. It is important that lo is the first interface
         * in the dev_base list.
         */
        for_each_netdev_rcu(net, dev) {
                if (l3mdev_master_ifindex_rcu(dev) != master_idx)
                        continue;

                in_dev = __in_dev_get_rcu(dev);
                if (!in_dev)
                        continue;

                addr = in_dev_select_addr(in_dev, scope);
                if (addr)
                        break;
        }

out_unlock:
        rcu_read_unlock();
        return addr;
}
EXPORT_SYMBOL(inet_select_addr);

/*
 * Per-device worker for inet_confirm_addr().
 *
 * Walks the addresses of @in_dev tracking two things:
 *   addr - a candidate local address: the first entry whose ifa_local equals
 *          @local (or any entry when @local is 0) and whose effective scope
 *          is <= @scope;
 *   same - whether some entry's subnet contains @local (when non-zero) and
 *          @dst (when non-zero).
 *
 * The effective scope of an entry is min(ifa_scope, localnet_scope), where
 * localnet_scope is RT_SCOPE_LINK if IN_DEV_ROUTE_LOCALNET(in_dev) is set
 * and RT_SCOPE_HOST otherwise.
 *
 * Returns the candidate address if a subnet match was found, else 0.
 */
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
                              __be32 local, int scope)
{
        unsigned char localnet_scope = RT_SCOPE_HOST;
        const struct in_ifaddr *ifa;
        __be32 addr = 0;
        int same = 0;

        if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
                localnet_scope = RT_SCOPE_LINK;

        in_dev_for_each_ifa_rcu(ifa, in_dev) {
                unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);

                /* Pick the first acceptable candidate; if a subnet match was
                 * already seen on an earlier entry, the search is complete.
                 */
                if (!addr &&
                    (local == ifa->ifa_local || !local) &&
                    min_scope <= scope) {
                        addr = ifa->ifa_local;
                        if (same)
                                break;
                }
                if (!same) {
                        same = (!local || inet_ifa_match(local, ifa)) &&
                                (!dst || inet_ifa_match(dst, ifa));
                        if (same && addr) {
                                /* Only the "autoselect local, specific dst"
                                 * case needs the extra subnet checks below.
                                 */
                                if (local || !dst)
                                        break;
                                /* Is the selected addr into dst subnet? */
                                if (inet_ifa_match(addr, ifa))
                                        break;
                                /* No, then can we use new local src? */
                                if (min_scope <= scope) {
                                        addr = ifa->ifa_local;
                                        break;
                                }
                                /* search for large dst subnet for addr */
                                same = 0;
                        }
                }
        }

        return same ? addr : 0;
}

/*
 * Confirm that a local IP address exists, with wildcard support:
 * - net: netns to search, must not be NULL
 * - in_dev: restrict the search to this interface, NULL = any interface
 * - dst: require the same subnet as dst, 0 = any dst
 * - local: address to confirm, 0 = autoselect the local address
 * - scope: maximum allowed scope value for the local address
 *
 * Returns the confirmed address, or 0 when none matches. When in_dev is
 * NULL, every device of the namespace is tried under rcu_read_lock() and
 * the first non-zero result wins.
 */
__be32 inet_confirm_addr(struct net *net, struct in_device *in_dev,
                         __be32 dst, __be32 local, int scope)
{
        struct net_device *dev;
        __be32 addr = 0;

        if (in_dev)
                return confirm_addr_indev(in_dev, dst, local, scope);

        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                in_dev = __in_dev_get_rcu(dev);
                if (!in_dev)
                        continue;

                addr = confirm_addr_indev(in_dev, dst, local, scope);
                if (addr)
                        break;
        }
        rcu_read_unlock();

        return addr;
}
EXPORT_SYMBOL(inet_confirm_addr);

/*
 *        Device notifier
 */

/*
 * Add @nb to inetaddr_chain. Returns whatever
 * blocking_notifier_chain_register() returns.
 */
int register_inetaddr_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&inetaddr_chain, nb);
}
EXPORT_SYMBOL(register_inetaddr_notifier);

/*
 * Remove @nb from inetaddr_chain. Returns whatever
 * blocking_notifier_chain_unregister() returns.
 */
int unregister_inetaddr_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
}
EXPORT_SYMBOL(unregister_inetaddr_notifier);

/*
 * Add @nb to inetaddr_validator_chain. Returns whatever
 * blocking_notifier_chain_register() returns.
 */
int register_inetaddr_validator_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_register(&inetaddr_validator_chain, nb);
}
EXPORT_SYMBOL(register_inetaddr_validator_notifier);

/*
 * Remove @nb from inetaddr_validator_chain. Returns whatever
 * blocking_notifier_chain_unregister() returns.
 */
int unregister_inetaddr_validator_notifier(struct notifier_block *nb)
{
        return blocking_notifier_chain_unregister(&inetaddr_validator_chain,
            nb);
}
EXPORT_SYMBOL(unregister_inetaddr_validator_notifier);

/* Relabel every address of a device after the device has been renamed.
 *
 * The first address gets the bare new device name. Each later address gets
 * the new name followed by the ":suffix" of its old label, or by ":N" (N
 * being its 1-based position in the list) when the old label had no colon.
 * If name plus suffix would not fit in IFNAMSIZ, the suffix is written so
 * that it ends at the last byte of the label, overwriting the tail of the
 * name. An RTM_NEWADDR message is sent for every address.
 */
static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
{
        struct in_ifaddr *ifa;
        int named = 0;

        in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                char old[IFNAMSIZ], *suffix;

                memcpy(old, ifa->ifa_label, IFNAMSIZ);
                memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);

                if (named++ != 0) {
                        suffix = strchr(old, ':');
                        if (!suffix) {
                                /* No alias part: synthesize one in place. */
                                sprintf(old, ":%d", named);
                                suffix = old;
                        }

                        if (strlen(suffix) + strlen(dev->name) < IFNAMSIZ)
                                strcat(ifa->ifa_label, suffix);
                        else
                                strcpy(ifa->ifa_label +
                                       (IFNAMSIZ - strlen(suffix) - 1), suffix);
                }

                rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
        }
}

/*
 * Send one ARP request per address of @in_dev out of @dev, with the
 * address's ifa_local passed as both IP arguments of arp_send() and
 * dev->dev_addr as the source hardware address.
 */
static void inetdev_send_gratuitous_arp(struct net_device *dev,
                                        struct in_device *in_dev)
{
        const struct in_ifaddr *ifa;

        in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                __be32 ip = ifa->ifa_local;

                arp_send(ARPOP_REQUEST, ETH_P_ARP, ip, dev, ip, NULL,
                         dev->dev_addr, NULL);
        }
}

/* Called only under RTNL semaphore */

/*
 * Netdevice notifier callback keeping a device's IPv4 state (in_device) in
 * step with device events.
 *
 * Without an in_device only two events matter: NETDEV_REGISTER creates one,
 * and NETDEV_CHANGEMTU creates one if the new MTU is valid for IPv4.
 * With an in_device the switch below dispatches on the event.
 *
 * Returns NOTIFY_DONE, except when inetdev_init() fails during
 * NETDEV_REGISTER, in which case the error is returned through
 * notifier_from_errno().
 */
static int inetdev_event(struct notifier_block *this, unsigned long event,
                         void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct in_device *in_dev = __in_dev_get_rtnl(dev);

        ASSERT_RTNL();

        if (!in_dev) {
                if (event == NETDEV_REGISTER) {
                        in_dev = inetdev_init(dev);
                        if (IS_ERR(in_dev))
                                return notifier_from_errno(PTR_ERR(in_dev));
                        if (dev->flags & IFF_LOOPBACK) {
                                IN_DEV_CONF_SET(in_dev, NOXFRM, 1);
                                IN_DEV_CONF_SET(in_dev, NOPOLICY, 1);
                        }
                } else if (event == NETDEV_CHANGEMTU) {
                        /* Re-enabling IP */
                        /* The result of inetdev_init() is not checked here. */
                        if (inetdev_valid_mtu(dev->mtu))
                                in_dev = inetdev_init(dev);
                }
                goto out;
        }

        switch (event) {
        case NETDEV_REGISTER:
                /* Registering a device that already has an in_device is
                 * unexpected; log it and clear the pointer.
                 */
                pr_debug("%s: bug\n", __func__);
                RCU_INIT_POINTER(dev->ip_ptr, NULL);
                break;
        case NETDEV_UP:
                if (!inetdev_valid_mtu(dev->mtu))
                        break;
                /* A loopback device coming up gets INADDR_LOOPBACK/8 with
                 * host scope and infinite lifetimes; allocation failure is
                 * silently tolerated.
                 */
                if (dev->flags & IFF_LOOPBACK) {
                        struct in_ifaddr *ifa = inet_alloc_ifa(in_dev);

                        if (ifa) {
                                ifa->ifa_local =
                                  ifa->ifa_address = htonl(INADDR_LOOPBACK);
                                ifa->ifa_prefixlen = 8;
                                ifa->ifa_mask = inet_make_mask(8);
                                ifa->ifa_scope = RT_SCOPE_HOST;
                                memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
                                set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
                                                 INFINITY_LIFE_TIME);
                                ipv4_devconf_setall(in_dev);
                                neigh_parms_data_state_setall(in_dev->arp_parms);
                                inet_insert_ifa(ifa);
                        }
                }
                ip_mc_up(in_dev);
                fallthrough;
        case NETDEV_CHANGEADDR:
                /* UP and CHANGEADDR only announce themselves via ARP when
                 * IN_DEV_ARP_NOTIFY is set; NOTIFY_PEERS always does.
                 */
                if (!IN_DEV_ARP_NOTIFY(in_dev))
                        break;
                fallthrough;
        case NETDEV_NOTIFY_PEERS:
                /* Send gratuitous ARP to notify of link change */
                inetdev_send_gratuitous_arp(dev, in_dev);
                break;
        case NETDEV_DOWN:
                ip_mc_down(in_dev);
                break;
        case NETDEV_PRE_TYPE_CHANGE:
                ip_mc_unmap(in_dev);
                break;
        case NETDEV_POST_TYPE_CHANGE:
                ip_mc_remap(in_dev);
                break;
        case NETDEV_CHANGEMTU:
                if (inetdev_valid_mtu(dev->mtu))
                        break;
                /* disable IP when MTU is not enough */
                fallthrough;
        case NETDEV_UNREGISTER:
                inetdev_destroy(in_dev);
                break;
        case NETDEV_CHANGENAME:
                /* Do not notify about label change, this event is
                 * not interesting to applications using netlink.
                 */
                /* NOTE(review): inetdev_changename() calls
                 * rtmsg_ifa(RTM_NEWADDR, ...) for every address, so the
                 * comment above appears stale - confirm.
                 */
                inetdev_changename(dev, in_dev);

                /* Re-register the sysctl entries after the rename. */
                devinet_sysctl_unregister(in_dev);
                devinet_sysctl_register(in_dev);
                break;
        }
out:
        return NOTIFY_DONE;
}

/* Notifier block whose callback is inetdev_event(); it is registered
 * outside this section of the file.
 */
static struct notifier_block ip_netdev_notifier = {
        .notifier_call = inetdev_event,
};

/*
 * Size of an address message carrying the ifaddrmsg header plus the
 * attributes summed below at their maximum sizes.
 */
static size_t inet_nlmsg_size(void)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ifaddrmsg));

        len += nla_total_size(4);                /* IFA_ADDRESS */
        len += nla_total_size(4);                /* IFA_LOCAL */
        len += nla_total_size(4);                /* IFA_BROADCAST */
        len += nla_total_size(IFNAMSIZ);        /* IFA_LABEL */
        len += nla_total_size(4);                /* IFA_FLAGS */
        len += nla_total_size(1);                /* IFA_PROTO */
        len += nla_total_size(4);                /* IFA_RT_PRIORITY */
        len += nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */

        return len;
}

/*
 * Convert a jiffies timestamp into hundredths of a second elapsed since
 * INITIAL_JIFFIES, truncated to 32 bits.
 */
static inline u32 cstamp_delta(unsigned long cstamp)
{
        unsigned long elapsed = cstamp - INITIAL_JIFFIES;

        return elapsed * 100UL / HZ;
}

/*
 * Append an IFA_CACHEINFO attribute to @skb.
 *
 * @cstamp and @tstamp are jiffies values and are converted with
 * cstamp_delta(); @preferred and @valid are stored unchanged.
 * Returns the result of nla_put().
 */
static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
                         unsigned long tstamp, u32 preferred, u32 valid)
{
        struct ifa_cacheinfo ci = {
                .ifa_prefered        = preferred,
                .ifa_valid        = valid,
                .cstamp                = cstamp_delta(cstamp),
                .tstamp                = cstamp_delta(tstamp),
        };

        return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
}

/*
 * Append one address message describing @ifa to @skb.
 *
 * @args supplies the netlink header fields (portid, seq, event, flags) and
 * an optional target netns id, emitted as IFA_TARGET_NETNSID when >= 0.
 *
 * Returns 0 on success, or -EMSGSIZE when the header or any attribute does
 * not fit; in the attribute case the partially built message is cancelled.
 */
static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
                            struct inet_fill_args *args)
{
        struct ifaddrmsg *ifm;
        struct nlmsghdr  *nlh;
        unsigned long tstamp;
        u32 preferred, valid;
        u32 flags;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
                        args->flags);
        if (!nlh)
                return -EMSGSIZE;

        ifm = nlmsg_data(nlh);
        ifm->ifa_family = AF_INET;
        ifm->ifa_prefixlen = ifa->ifa_prefixlen;

        flags = READ_ONCE(ifa->ifa_flags);
        /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits.
         * The 32bit value is given in IFA_FLAGS attribute.
         */
        ifm->ifa_flags = (__u8)flags;

        ifm->ifa_scope = ifa->ifa_scope;
        ifm->ifa_index = ifa->ifa_dev->dev->ifindex;

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto nla_put_failure;

        /* Report remaining lifetimes: the stored values minus the whole
         * seconds elapsed since tstamp, clamped at zero. Permanent addresses
         * report infinity for both. Note that valid is only adjusted when
         * preferred is finite.
         */
        tstamp = READ_ONCE(ifa->ifa_tstamp);
        if (!(flags & IFA_F_PERMANENT)) {
                preferred = READ_ONCE(ifa->ifa_preferred_lft);
                valid = READ_ONCE(ifa->ifa_valid_lft);
                if (preferred != INFINITY_LIFE_TIME) {
                        long tval = (jiffies - tstamp) / HZ;

                        if (preferred > tval)
                                preferred -= tval;
                        else
                                preferred = 0;
                        if (valid != INFINITY_LIFE_TIME) {
                                if (valid > tval)
                                        valid -= tval;
                                else
                                        valid = 0;
                        }
                }
        } else {
                preferred = INFINITY_LIFE_TIME;
                valid = INFINITY_LIFE_TIME;
        }
        /* Zero-valued address, label, proto and rt_priority fields are
         * omitted; IFA_FLAGS and IFA_CACHEINFO are always emitted. The chain
         * stops at the first nla_put*() that fails.
         */
        if ((ifa->ifa_address &&
             nla_put_in_addr(skb, IFA_ADDRESS, ifa->ifa_address)) ||
            (ifa->ifa_local &&
             nla_put_in_addr(skb, IFA_LOCAL, ifa->ifa_local)) ||
            (ifa->ifa_broadcast &&
             nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
            (ifa->ifa_label[0] &&
             nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
            (ifa->ifa_proto &&
             nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) ||
            nla_put_u32(skb, IFA_FLAGS, flags) ||
            (ifa->ifa_rt_priority &&
             nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
            put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp,
                          preferred, valid))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Strictly validate an address dump request and extract its filters.
 *
 * The header must be a full struct ifaddrmsg with prefixlen, flags and
 * scope all zero. A non-zero ifa_index becomes fillargs->ifindex and marks
 * the dump as filtered. The only attribute accepted is IFA_TARGET_NETNSID,
 * which is stored in fillargs->netnsid and resolved into *tgt_net.
 *
 * Returns 0 on success, -EINVAL for a malformed header or an unsupported
 * attribute, the parser's error, or the error from
 * rtnl_get_net_ns_capable() (fillargs->netnsid is reset to -1 in that case).
 */
static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
                                      struct inet_fill_args *fillargs,
                                      struct net **tgt_net, struct sock *sk,
                                      struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[IFA_MAX+1];
        struct ifaddrmsg *ifm;
        int err, i;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
                return -EINVAL;
        }

        fillargs->ifindex = ifm->ifa_index;
        if (fillargs->ifindex) {
                cb->answer_flags |= NLM_F_DUMP_FILTERED;
                fillargs->flags |= NLM_F_DUMP_FILTERED;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
                                            ifa_ipv4_policy, extack);
        if (err < 0)
                return err;

        for (i = 0; i <= IFA_MAX; ++i) {
                struct net *net;

                if (!tb[i])
                        continue;

                /* Anything other than the target netns id is rejected. */
                if (i != IFA_TARGET_NETNSID) {
                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
                        return -EINVAL;
                }

                fillargs->netnsid = nla_get_s32(tb[i]);

                net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
                if (IS_ERR(net)) {
                        fillargs->netnsid = -1;
                        NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
                        return PTR_ERR(net);
                }
                *tgt_net = net;
        }

        return 0;
}

/* Emit one message per multicast list entry of @in_dev into @skb.
 *
 * The first *@s_ip_idx entries of in_dev->mc_list are skipped.  When
 * inet_fill_ifmcaddr() fails, the position of the entry that could not be
 * written is stored in *@s_ip_idx and the error is returned.  After a full
 * walk *@s_ip_idx is reset to 0 and 0 is returned.
 */
static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb,
                                struct netlink_callback *cb, int *s_ip_idx,
                                struct inet_fill_args *fillargs)
{
        struct ip_mc_list *mc;
        int pos = 0;

        for (mc = rcu_dereference(in_dev->mc_list); mc;
             mc = rcu_dereference(mc->next_rcu), pos++) {
                int rc;

                /* Entry lies before the resume position. */
                if (pos < *s_ip_idx)
                        continue;

                rc = inet_fill_ifmcaddr(skb, in_dev->dev, mc, fillargs);
                if (rc < 0) {
                        /* Record where the walk stopped. */
                        *s_ip_idx = pos;
                        return rc;
                }

                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
        }

        /* Whole list walked: clear the resume position. */
        *s_ip_idx = 0;
        return 0;
}

/* Emit one message per address of @in_dev into @skb.
 *
 * The first *@s_ip_idx addresses are skipped.  When inet_fill_ifaddr()
 * fails, the position of the address that could not be written is stored
 * in *@s_ip_idx and the error is returned.  After a full walk *@s_ip_idx
 * is reset to 0 and 0 is returned.
 */
static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb,
                              struct netlink_callback *cb, int *s_ip_idx,
                              struct inet_fill_args *fillargs)
{
        struct in_ifaddr *addr;
        int pos = 0;

        in_dev_for_each_ifa_rcu(addr, in_dev) {
                if (pos >= *s_ip_idx) {
                        int rc = inet_fill_ifaddr(skb, addr, fillargs);

                        if (rc < 0) {
                                /* Record where the walk stopped. */
                                *s_ip_idx = pos;
                                return rc;
                        }

                        nl_dump_check_consistent(cb, nlmsg_hdr(skb));
                }
                pos++;
        }

        /* Whole list walked: clear the resume position. */
        *s_ip_idx = 0;
        return 0;
}

/* Dispatch a per-device dump according to fillargs->event:
 * RTM_NEWADDR dumps addresses, RTM_GETMULTICAST dumps multicast
 * addresses, anything else yields -EINVAL.
 */
static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
                            struct netlink_callback *cb, int *s_ip_idx,
                            struct inet_fill_args *fillargs)
{
        if (fillargs->event == RTM_NEWADDR)
                return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs);

        if (fillargs->event == RTM_GETMULTICAST)
                return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx,
                                            fillargs);

        return -EINVAL;
}

/* Build a change-detection sequence number for @net by summing
 * dev_addr_genid and dev_base_seq.
 *
 * The result must never be 0 (see nl_dump_check_consistent()), so a sum
 * of 0 is replaced by a value far away from 0.
 */
static u32 inet_base_seq(const struct net *net)
{
        u32 seq;

        seq = atomic_read(&net->ipv4.dev_addr_genid);
        seq += READ_ONCE(net->dev_base_seq);

        return seq ? seq : 0x80000000;
}

/* Common dump handler behind inet_dump_ifaddr() and inet_dump_ifmcaddr().
 *
 * @skb:   message buffer being filled
 * @cb:    netlink dump callback state; cb->ctx holds the resume position
 * @event: message type handed to the fill helpers through fillargs.event
 *
 * With strict checking the request is validated first and may restrict the
 * dump to a single device and/or select another network namespace.
 * Otherwise every device of the namespace is walked.
 *
 * Returns 0 or a negative error from validation or from the fill helpers.
 */
static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
                          int event)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct inet_fill_args fillargs = {
                .portid = NETLINK_CB(cb->skb).portid,
                .seq = nlh->nlmsg_seq,
                .event = event,
                .flags = NLM_F_MULTI,
                .netnsid = -1,
        };
        struct net *net = sock_net(skb->sk);
        struct net *tgt_net = net;
        /* Resume state overlaid on cb->ctx: device cursor and the index of
         * the next address to emit within that device.
         */
        struct {
                unsigned long ifindex;
                int ip_idx;
        } *ctx = (void *)cb->ctx;
        struct in_device *in_dev;
        struct net_device *dev;
        int err = 0;

        rcu_read_lock();
        if (cb->strict_check) {
                err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
                                                 skb->sk, cb);
                if (err < 0)
                        goto done;

                /* Request is filtered on one interface: dump only that
                 * device and skip the full walk below.
                 */
                if (fillargs.ifindex) {
                        dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
                        if (!dev) {
                                err = -ENODEV;
                                goto done;
                        }
                        in_dev = __in_dev_get_rcu(dev);
                        /* err is still 0 here: a device without an
                         * in_device produces an empty, successful dump.
                         */
                        if (!in_dev)
                                goto done;
                        err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx,
                                               &fillargs);
                        goto done;
                }
        }

        cb->seq = inet_base_seq(tgt_net);

        for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
                in_dev = __in_dev_get_rcu(dev);
                if (!in_dev)
                        continue;
                err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx,
                                       &fillargs);
                if (err < 0)
                        goto done;
        }
done:
        /* netnsid is only >= 0 when the validator resolved a target
         * namespace; presumably that lookup took a reference, which is
         * dropped here -- TODO confirm against rtnl_get_net_ns_capable().
         */
        if (fillargs.netnsid >= 0)
                put_net(tgt_net);
        rcu_read_unlock();
        return err;
}

/* Dump callback for addresses: runs inet_dump_addr() with RTM_NEWADDR. */
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet_dump_addr(skb, cb, RTM_NEWADDR);
}

/* Dump callback for multicast addresses: runs inet_dump_addr() with
 * RTM_GETMULTICAST.
 */
static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet_dump_addr(skb, cb, RTM_GETMULTICAST);
}

/* Build an address message for @ifa with type @event and hand it to
 * rtnl_notify() for group RTNLGRP_IPV4_IFADDR.
 *
 * @nlh may be NULL, in which case the sequence number is 0.  If the
 * message cannot be allocated or filled, the error is reported through
 * rtnl_set_sk_err() instead.
 */
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
                      u32 portid)
{
        struct net *net = dev_net(ifa->ifa_dev->dev);
        struct inet_fill_args fillargs = {
                .portid = portid,
                .seq = nlh ? nlh->nlmsg_seq : 0,
                .event = event,
                .flags = 0,
                .netnsid = -1,
        };
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet_nlmsg_size(), GFP_KERNEL);
        if (!skb) {
                rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, -ENOBUFS);
                return;
        }

        err = inet_fill_ifaddr(skb, ifa, &fillargs);
        if (err >= 0) {
                rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh,
                            GFP_KERNEL);
                return;
        }

        /* -EMSGSIZE implies BUG in inet_nlmsg_size() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}

/* Size needed for the IFLA_INET_CONF attribute of @dev, or 0 when the
 * device has no in_device attached.  @ext_filter_mask is not used.
 */
static size_t inet_get_link_af_size(const struct net_device *dev,
                                    u32 ext_filter_mask)
{
        if (rcu_dereference_rtnl(dev->ip_ptr))
                return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */

        return 0;
}

/* Append an IFLA_INET_CONF attribute to @skb holding a copy of every
 * devconf value of @dev as an array of u32.
 *
 * Returns 0 on success, -ENODATA when the device has no in_device and
 * -EMSGSIZE when the attribute cannot be reserved.  @ext_filter_mask is
 * not used.
 */
static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
                             u32 ext_filter_mask)
{
        struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
        struct nlattr *attr;
        u32 *out;
        int idx;

        if (!in_dev)
                return -ENODATA;

        attr = nla_reserve(skb, IFLA_INET_CONF, IPV4_DEVCONF_MAX * 4);
        if (!attr)
                return -EMSGSIZE;

        out = nla_data(attr);
        for (idx = 0; idx < IPV4_DEVCONF_MAX; idx++)
                out[idx] = READ_ONCE(in_dev->cnf.data[idx]);

        return 0;
}

/* Attribute policy used by inet_validate_link_af() when parsing the nested
 * attribute block: IFLA_INET_CONF is itself a nested attribute.
 */
static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
        [IFLA_INET_CONF]        = { .type = NLA_NESTED },
};

/* Check a nested attribute block before it is applied to @dev.
 *
 * Returns -EAFNOSUPPORT when @dev is given but has no in_device, the
 * parser's error when parsing against inet_af_policy fails, and -EINVAL
 * when an entry inside IFLA_INET_CONF is shorter than 4 bytes or has a
 * type outside 1..IPV4_DEVCONF_MAX.  Returns 0 otherwise.
 */
static int inet_validate_link_af(const struct net_device *dev,
                                 const struct nlattr *nla,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *tb[IFLA_INET_MAX+1];
        struct nlattr *entry;
        int remaining;
        int rc;

        if (dev && !__in_dev_get_rtnl(dev))
                return -EAFNOSUPPORT;

        rc = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
                                         inet_af_policy, extack);
        if (rc < 0)
                return rc;

        /* Nothing more to check without a configuration block. */
        if (!tb[IFLA_INET_CONF])
                return 0;

        nla_for_each_nested(entry, tb[IFLA_INET_CONF], remaining) {
                int cfgid = nla_type(entry);

                if (nla_len(entry) < 4 ||
                    cfgid <= 0 || cfgid > IPV4_DEVCONF_MAX)
                        return -EINVAL;
        }

        return 0;
}

/* Apply every entry nested inside IFLA_INET_CONF to the devconf of @dev,
 * using the entry type as the setting id and its u32 payload as the value.
 *
 * Returns -EAFNOSUPPORT when @dev has no in_device, -EINVAL when the
 * nested block cannot be parsed, 0 otherwise.  @extack is not used.
 */
static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla,
                            struct netlink_ext_ack *extack)
{
        struct in_device *in_dev = __in_dev_get_rtnl(dev);
        struct nlattr *tb[IFLA_INET_MAX+1];
        struct nlattr *entry;
        int remaining;

        if (!in_dev)
                return -EAFNOSUPPORT;

        if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
                return -EINVAL;

        if (!tb[IFLA_INET_CONF])
                return 0;

        nla_for_each_nested(entry, tb[IFLA_INET_CONF], remaining)
                ipv4_devconf_set(in_dev, nla_type(entry), nla_get_u32(entry));

        return 0;
}

/* Message size needed for a netconf message carrying attribute @type.
 *
 * The header and NETCONFA_IFINDEX are always counted.  NETCONFA_ALL adds
 * room for every per-setting attribute; any other @type adds room only for
 * the matching one.  Every attribute has a 4-byte payload.
 */
static int inet_netconf_msgsize_devconf(int type)
{
        const bool all = type == NETCONFA_ALL;
        int nattrs = 1;        /* NETCONFA_IFINDEX */

        if (all || type == NETCONFA_FORWARDING)
                nattrs++;
        if (all || type == NETCONFA_RP_FILTER)
                nattrs++;
        if (all || type == NETCONFA_MC_FORWARDING)
                nattrs++;
        if (all || type == NETCONFA_BC_FORWARDING)
                nattrs++;
        if (all || type == NETCONFA_PROXY_NEIGH)
                nattrs++;
        if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
                nattrs++;

        return NLMSG_ALIGN(sizeof(struct netconfmsg))
               + nattrs * nla_total_size(4);
}

/* Write one netconf message into @skb.
 *
 * The message always carries NETCONFA_IFINDEX.  When @devconf is non-NULL
 * it also carries the setting selected by @type, or every setting when
 * @type is NETCONFA_ALL.  With a NULL @devconf only the ifindex is sent.
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or an attribute
 * does not fit; a partially built message is cancelled first.
 */
static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
                                     const struct ipv4_devconf *devconf,
                                     u32 portid, u32 seq, int event,
                                     unsigned int flags, int type)
{
        const bool all = type == NETCONFA_ALL;
        struct netconfmsg *ncm;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ncm), flags);
        if (!nlh)
                return -EMSGSIZE;

        ncm = nlmsg_data(nlh);
        ncm->ncm_family = AF_INET;

        if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
                goto cancel;

        if (devconf) {
                if ((all || type == NETCONFA_FORWARDING) &&
                    nla_put_s32(skb, NETCONFA_FORWARDING,
                                IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0)
                        goto cancel;

                if ((all || type == NETCONFA_RP_FILTER) &&
                    nla_put_s32(skb, NETCONFA_RP_FILTER,
                                IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0)
                        goto cancel;

                if ((all || type == NETCONFA_MC_FORWARDING) &&
                    nla_put_s32(skb, NETCONFA_MC_FORWARDING,
                                IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0)
                        goto cancel;

                if ((all || type == NETCONFA_BC_FORWARDING) &&
                    nla_put_s32(skb, NETCONFA_BC_FORWARDING,
                                IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0)
                        goto cancel;

                if ((all || type == NETCONFA_PROXY_NEIGH) &&
                    nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
                                IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0)
                        goto cancel;

                if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
                    nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                IPV4_DEVCONF_RO(*devconf,
                                                IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
                        goto cancel;
        }

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Build a netconf message of type @event for @ifindex and hand it to
 * rtnl_notify() for group RTNLGRP_IPV4_NETCONF.
 *
 * @type selects which setting of @devconf is included (NETCONFA_ALL for
 * every one).  If the message cannot be allocated or filled, the error is
 * reported through rtnl_set_sk_err() instead.
 */
void inet_netconf_notify_devconf(struct net *net, int event, int type,
                                 int ifindex, struct ipv4_devconf *devconf)
{
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet_netconf_msgsize_devconf(type), GFP_KERNEL);
        if (!skb) {
                rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, -ENOBUFS);
                return;
        }

        err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
                                        event, 0, type);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL,
                            GFP_KERNEL);
                return;
        }

        /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
}

/* Attribute policy used by inet_netconf_valid_get_req() when parsing
 * netconf get requests.  Each listed attribute sets .len to sizeof(int)
 * and leaves .type unset; how .len is enforced for such entries is up to
 * the netlink policy code -- TODO confirm.
 */
static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
        [NETCONFA_IFINDEX]        = { .len = sizeof(int) },
        [NETCONFA_FORWARDING]        = { .len = sizeof(int) },
        [NETCONFA_RP_FILTER]        = { .len = sizeof(int) },
        [NETCONFA_PROXY_NEIGH]        = { .len = sizeof(int) },
        [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN]        = { .len = sizeof(int) },
};

/* Validate a netconf get request and parse its attributes into @tb.
 *
 * A message too short for the netconf header is rejected with -EINVAL.
 * When netlink_strict_get_check() is false for @skb, the result of the
 * non-strict parse is returned as is.  Otherwise the strict parse is used
 * and any attribute other than NETCONFA_IFINDEX is rejected with -EINVAL.
 */
static int inet_netconf_valid_get_req(struct sk_buff *skb,
                                      const struct nlmsghdr *nlh,
                                      struct nlattr **tb,
                                      struct netlink_ext_ack *extack)
{
        int attr;
        int rc;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
                                              tb, NETCONFA_MAX,
                                              devconf_ipv4_policy, extack);

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
                                           tb, NETCONFA_MAX,
                                           devconf_ipv4_policy, extack);
        if (rc)
                return rc;

        /* Only the interface index may be present in strict mode. */
        for (attr = 0; attr <= NETCONFA_MAX; attr++) {
                if (tb[attr] && attr != NETCONFA_IFINDEX) {
                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request");
                        return -EINVAL;
                }
        }

        return 0;
}

/* Handle a netconf get request for a single configuration set.
 *
 * The request must carry NETCONFA_IFINDEX.  Its value selects the "all"
 * set, the "default" set, or the set of the device with that index.  The
 * reply is an RTM_NEWNETCONF message with every setting (NETCONFA_ALL),
 * unicast to the requesting port.
 *
 * Returns the validation error, -EINVAL without NETCONFA_IFINDEX, -ENODEV
 * when the device or its in_device cannot be found, -ENOBUFS when the
 * reply cannot be allocated, a fill error, or the result of rtnl_unicast().
 */
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
                                    struct nlmsghdr *nlh,
                                    struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[NETCONFA_MAX + 1];
        const struct ipv4_devconf *devconf;
        struct in_device *in_dev = NULL;
        struct net_device *dev = NULL;
        struct sk_buff *skb;
        int ifindex;
        int err;

        err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
        if (err)
                return err;

        if (!tb[NETCONFA_IFINDEX])
                return -EINVAL;

        ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
        switch (ifindex) {
        case NETCONFA_IFINDEX_ALL:
                devconf = net->ipv4.devconf_all;
                break;
        case NETCONFA_IFINDEX_DEFAULT:
                devconf = net->ipv4.devconf_dflt;
                break;
        default:
                /* Preset the error so a failed device or in_device lookup
                 * falls through to errout with -ENODEV.
                 */
                err = -ENODEV;
                dev = dev_get_by_index(net, ifindex);
                if (dev)
                        in_dev = in_dev_get(dev);
                if (!in_dev)
                        goto errout;
                devconf = &in_dev->cnf;
                break;
        }

        err = -ENOBUFS;
        skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL);
        if (!skb)
                goto errout;

        err = inet_netconf_fill_devconf(skb, ifindex, devconf,
                                        NETLINK_CB(in_skb).portid,
                                        nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
                                        NETCONFA_ALL);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
        /* Release whatever the default case above acquired.  dev is still
         * NULL for the "all"/"default" cases; dev_put() is called
         * unconditionally, so it is presumably NULL-tolerant -- TODO confirm.
         */
        if (in_dev)
                in_dev_put(in_dev);
        dev_put(dev);
        return err;
}

/* Dump callback emitting one RTM_NEWNETCONF message (all settings) per
 * device that has an in_device, followed by one for the "all" set and one
 * for the "default" set.
 *
 * With strict checking the request must consist of exactly a netconf
 * header with no trailing attributes, otherwise -EINVAL is returned.
 * Returns 0 or the negative error of the fill that failed.
 */
static int inet_netconf_dump_devconf(struct sk_buff *skb,
                                     struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        /* Resume state overlaid on cb->ctx: device cursor, and a counter
         * that is 0 before the "all" entry, 1 before the "default" entry
         * and 2 once both have been written.
         */
        struct {
                unsigned long ifindex;
                unsigned int all_default;
        } *ctx = (void *)cb->ctx;
        const struct in_device *in_dev;
        struct net_device *dev;
        int err = 0;

        if (cb->strict_check) {
                struct netlink_ext_ack *extack = cb->extack;
                struct netconfmsg *ncm;

                if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
                        NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
                        return -EINVAL;
                }

                if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
                        NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
                        return -EINVAL;
                }
        }

        rcu_read_lock();
        for_each_netdev_dump(net, dev, ctx->ifindex) {
                in_dev = __in_dev_get_rcu(dev);
                if (!in_dev)
                        continue;
                err = inet_netconf_fill_devconf(skb, dev->ifindex,
                                                &in_dev->cnf,
                                                NETLINK_CB(cb->skb).portid,
                                                nlh->nlmsg_seq,
                                                RTM_NEWNETCONF, NLM_F_MULTI,
                                                NETCONFA_ALL);
                if (err < 0)
                        goto done;
        }
        /* The counter is advanced only after a successful fill, so a
         * failed entry is attempted again on the next invocation.
         */
        if (ctx->all_default == 0) {
                err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
                                                net->ipv4.devconf_all,
                                                NETLINK_CB(cb->skb).portid,
                                                nlh->nlmsg_seq,
                                                RTM_NEWNETCONF, NLM_F_MULTI,
                                                NETCONFA_ALL);
                if (err < 0)
                        goto done;
                ctx->all_default++;
        }
        if (ctx->all_default == 1) {
                err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
                                                net->ipv4.devconf_dflt,
                                                NETLINK_CB(cb->skb).portid,
                                                nlh->nlmsg_seq,
                                                RTM_NEWNETCONF, NLM_F_MULTI,
                                                NETCONFA_ALL);
                if (err < 0)
                        goto done;
                ctx->all_default++;
        }
done:
        rcu_read_unlock();
        return err;
}

#ifdef CONFIG_SYSCTL

/* Copy default setting @i to every device of @net whose state bit @i is
 * clear.  Devices without an in_device, or with the bit set, are left
 * untouched.
 */
static void devinet_copy_dflt_conf(struct net *net, int i)
{
        struct net_device *dev;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);

                if (!in_dev || test_bit(i, in_dev->cnf.state))
                        continue;

                in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
        }
        rcu_read_unlock();
}

/* Propagate the "all" FORWARDING value of @net to the default set and to
 * every device, sending a netconf notification for each of them.  The
 * "all" ACCEPT_REDIRECTS value is set to the inverse.  LRO is disabled on
 * every device when forwarding is on.
 *
 * Called with RTNL locked.
 */
static void inet_forward_change(struct net *net)
{
        int fwd = IPV4_DEVCONF_ALL(net, FORWARDING);
        struct net_device *dev;

        IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !fwd;
        IPV4_DEVCONF_DFLT(net, FORWARDING) = fwd;

        inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                    NETCONFA_FORWARDING,
                                    NETCONFA_IFINDEX_ALL,
                                    net->ipv4.devconf_all);
        inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                    NETCONFA_FORWARDING,
                                    NETCONFA_IFINDEX_DEFAULT,
                                    net->ipv4.devconf_dflt);

        for_each_netdev(net, dev) {
                struct in_device *in_dev;

                if (fwd)
                        dev_disable_lro(dev);

                in_dev = __in_dev_get_rtnl_net(dev);
                if (!in_dev)
                        continue;

                IN_DEV_CONF_SET(in_dev, FORWARDING, fwd);
                inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                            NETCONFA_FORWARDING,
                                            dev->ifindex, &in_dev->cnf);
        }
}

/* Map a configuration set to the interface index used in netconf
 * messages: the "default" or "all" pseudo-index when @cnf is one of those
 * sets of @net, otherwise the ifindex of the device whose in_device
 * embeds @cnf.
 */
static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
{
        if (cnf == net->ipv4.devconf_dflt)
                return NETCONFA_IFINDEX_DEFAULT;

        if (cnf == net->ipv4.devconf_all)
                return NETCONFA_IFINDEX_ALL;

        return container_of(cnf, struct in_device, cnf)->dev->ifindex;
}

/* sysctl handler for the plain integer devconf entries.
 *
 * The value is read or written through proc_dointvec().  On a write the
 * entry's state bit is set, a change to the default set is copied to the
 * devices via devinet_copy_dflt_conf(), the route cache is flushed for
 * the settings that need it, and a netconf notification is sent when
 * rp_filter, proxy_arp or ignore_routes_with_linkdown changed.
 *
 * Returns the result of proc_dointvec().
 */
static int devinet_conf_proc(const struct ctl_table *ctl, int write,
                             void *buffer, size_t *lenp, loff_t *ppos)
{
        int *valp = ctl->data;
        int before = *valp;
        int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        int after = *valp;
        struct ipv4_devconf *cnf;
        struct net *net;
        int idx, type;

        if (!write)
                return ret;

        cnf = ctl->extra1;
        net = ctl->extra2;
        /* Slot of this entry inside the configuration array. */
        idx = valp - cnf->data;

        set_bit(idx, cnf->state);

        if (cnf == net->ipv4.devconf_dflt)
                devinet_copy_dflt_conf(net, idx);

        /* These two only need a flush when switched from non-zero to 0. */
        if ((idx == IPV4_DEVCONF_ACCEPT_LOCAL - 1 ||
             idx == IPV4_DEVCONF_ROUTE_LOCALNET - 1) &&
            after == 0 && before != 0)
                rt_cache_flush(net);

        /* Everything below reacts to an actual change only. */
        if (after == before)
                return ret;

        if (idx == IPV4_DEVCONF_BC_FORWARDING - 1)
                rt_cache_flush(net);

        if (idx == IPV4_DEVCONF_RP_FILTER - 1)
                type = NETCONFA_RP_FILTER;
        else if (idx == IPV4_DEVCONF_PROXY_ARP - 1)
                type = NETCONFA_PROXY_NEIGH;
        else if (idx == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1)
                type = NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN;
        else
                return ret;

        inet_netconf_notify_devconf(net, RTM_NEWNETCONF, type,
                                    devinet_conf_ifindex(net, cnf), cnf);

        return ret;
}

/* sysctl handler for the forwarding entries.
 *
 * Writes require CAP_NET_ADMIN in the user namespace of the net stored in
 * ctl->extra2, otherwise -EPERM is returned.  The value is read or written
 * through proc_dointvec().  When a write changed the value:
 *  - for the "default" entry only a netconf notification is sent;
 *  - for any other entry the per-net RTNL lock is taken, the change is
 *    applied (to all devices for the "all" entry, to the one device
 *    otherwise) and the route cache is flushed.
 *
 * Returns the result of proc_dointvec(), or that of restart_syscall() when
 * the lock could not be taken.
 */
static int devinet_sysctl_forward(const struct ctl_table *ctl, int write,
                                  void *buffer, size_t *lenp, loff_t *ppos)
{
        int *valp = ctl->data;
        int val = *valp;
        loff_t pos = *ppos;
        struct net *net = ctl->extra2;
        int ret;

        if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        ret = proc_dointvec(ctl, write, buffer, lenp, ppos);

        if (write && *valp != val) {
                if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
                        /* Do not block on the lock: undo the write and have
                         * the syscall restarted instead.
                         */
                        if (!rtnl_net_trylock(net)) {
                                /* Restore the original values before restarting */
                                *valp = val;
                                *ppos = pos;
                                return restart_syscall();
                        }
                        if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
                                inet_forward_change(net);
                        } else {
                                /* Per-device entry: extra1 is the devconf
                                 * embedded in that device's in_device.
                                 */
                                struct ipv4_devconf *cnf = ctl->extra1;
                                struct in_device *idev =
                                        container_of(cnf, struct in_device, cnf);
                                if (*valp)
                                        dev_disable_lro(idev->dev);
                                inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                            NETCONFA_FORWARDING,
                                                            idev->dev->ifindex,
                                                            cnf);
                        }
                        rtnl_net_unlock(net);
                        rt_cache_flush(net);
                } else
                        inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                    NETCONFA_FORWARDING,
                                                    NETCONFA_IFINDEX_DEFAULT,
                                                    net->ipv4.devconf_dflt);
        }

        return ret;
}

/* sysctl handler that reads or writes an integer through proc_dointvec()
 * and flushes the route cache of the net in ctl->extra2 when a write
 * changed the value.  Returns the result of proc_dointvec().
 */
static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *valp = ctl->data;
        int before = *valp;
        int ret;

        ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
        if (!write || *valp == before)
                return ret;

        rt_cache_flush(ctl->extra2);
        return ret;
}

/* Initializer for one ctl_table entry backed by ipv4_devconf.
 *
 * @attr: IPV4_DEVCONF_ suffix; the entry's data points at slot
 *        IPV4_DEVCONF_<attr> - 1 of ipv4_devconf.data
 * @name: procname of the entry
 * @mval: file mode
 * @proc: proc handler
 */
#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
        { \
                .procname        = name, \
                .data                = ipv4_devconf.data + \
                                  IPV4_DEVCONF_ ## attr - 1, \
                .maxlen                = sizeof(int), \
                .mode                = mval, \
                .proc_handler        = proc, \
                .extra1                = &ipv4_devconf, \
        }

/* Mode 0644 entry handled by devinet_conf_proc(). */
#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
        DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)

/* Mode 0444 entry handled by devinet_conf_proc(). */
#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
        DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)

/* Mode 0644 entry with a caller-supplied handler. */
#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
        DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)

/* Mode 0644 entry handled by ipv4_doint_and_flush(). */
#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
        DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)

/* Template of the per-configuration-set sysctl table.
 *
 * Every entry initially points into the global ipv4_devconf.
 * __devinet_sysctl_register() duplicates this template and rebases the
 * data/extra1/extra2 fields of the copy onto the set being registered.
 */
static struct devinet_sysctl_table {
        struct ctl_table_header *sysctl_header;
        struct ctl_table devinet_vars[IPV4_DEVCONF_MAX];
} devinet_sysctl = {
        .devinet_vars = {
                /* forwarding has its own handler */
                DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
                                             devinet_sysctl_forward),
                DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
                DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),

                DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
                DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
                DEVINET_SYSCTL_RW_ENTRY(SHARED_MEDIA, "shared_media"),
                DEVINET_SYSCTL_RW_ENTRY(RP_FILTER, "rp_filter"),
                DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
                DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
                                        "accept_source_route"),
                DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
                DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
                DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
                DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
                DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
                DEVINET_SYSCTL_RW_ENTRY(LOG_MARTIANS, "log_martians"),
                DEVINET_SYSCTL_RW_ENTRY(TAG, "tag"),
                DEVINET_SYSCTL_RW_ENTRY(ARPFILTER, "arp_filter"),
                DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
                DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
                DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
                DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
                DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER,
                                        "arp_evict_nocarrier"),
                DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
                DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
                                        "force_igmp_version"),
                DEVINET_SYSCTL_RW_ENTRY(IGMPV2_UNSOLICITED_REPORT_INTERVAL,
                                        "igmpv2_unsolicited_report_interval"),
                DEVINET_SYSCTL_RW_ENTRY(IGMPV3_UNSOLICITED_REPORT_INTERVAL,
                                        "igmpv3_unsolicited_report_interval"),
                DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
                                        "ignore_routes_with_linkdown"),
                DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
                                        "drop_gratuitous_arp"),

                /* entries whose change flushes the route cache */
                DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
                DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
                DEVINET_SYSCTL_FLUSHING_ENTRY(PROMOTE_SECONDARIES,
                                              "promote_secondaries"),
                DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
                                              "route_localnet"),
                DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
                                              "drop_unicast_in_l2_multicast"),
        },
};

static int __devinet_sysctl_register(struct net *net, char *dev_name,
                                     int ifindex, struct ipv4_devconf *p)
{
        int i;
        struct devinet_sysctl_table *t;
        char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];

        t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT);
        if (!t)
                goto out;

        for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) {
                t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
                t->devinet_vars[i].extra1 = p;
                t->devinet_vars[i].extra2 = net;
        }

        snprintf(path, sizeof(path), "net/ipv4/conf/%s", dev_name);

        t->sysctl_header = register_net_sysctl(net, path, t->devinet_vars);
        if (!t->sysctl_header)
                goto free;

        p->sysctl = t;

        inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
                                    ifindex, p);
        return 0;

free:
        kfree(t);
out:
        return -ENOMEM;
}

/* Tear down the sysctl table attached to @cnf, if there is one, and send
 * a RTM_DELNETCONF notification for @ifindex.  The notification is sent
 * whether or not a table was attached.
 */
static void __devinet_sysctl_unregister(struct net *net,
                                        struct ipv4_devconf *cnf, int ifindex)
{
        struct devinet_sysctl_table *table = cnf->sysctl;

        if (table) {
                /* Detach from the set before unregistering and freeing. */
                cnf->sysctl = NULL;
                unregister_net_sysctl_table(table->sysctl_header);
                kfree(table);
        }

        inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL);
}

static int devinet_sysctl_register(struct in_device *idev)
{
        int err;

        if (!sysctl_dev_name_is_allowed(idev->dev->name))
                return -EINVAL;

        err = neigh_sysctl_register(idev->dev, idev->arp_parms, NULL);
        if (err)
                return err;
        err = __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
                                        idev->dev->ifindex, &idev->cnf);
        if (err)
                neigh_sysctl_unregister(idev->arp_parms);
        return err;
}

/*
 * Remove the sysctls of one in_device: the devinet table first, then the
 * neighbour table.
 */
static void devinet_sysctl_unregister(struct in_device *idev)
{
        __devinet_sysctl_unregister(dev_net(idev->dev), &idev->cnf,
                                    idev->dev->ifindex);
        neigh_sysctl_unregister(idev->arp_parms);
}

/*
 * Template for the "ip_forward" sysctl.  The data/extra1/extra2 values here
 * refer to the global ipv4_devconf and init_net; devinet_init_net() works on
 * a per-namespace copy and repoints those three fields.
 */
static struct ctl_table ctl_forward_entry[] = {
        {
                .procname     = "ip_forward",
                .mode         = 0644,
                .maxlen       = sizeof(int),
                .data         = &ipv4_devconf.data[IPV4_DEVCONF_FORWARDING - 1],
                .proc_handler = devinet_sysctl_forward,
                .extra1       = &ipv4_devconf,
                .extra2       = &init_net,
        },
};
#endif

/*
 * Per-namespace initialisation of the IPv4 device configuration.
 *
 * Allocates the address hash table and private copies of ipv4_devconf ("all")
 * and ipv4_devconf_dflt ("default"), optionally seeds them from another
 * namespace, and (with CONFIG_SYSCTL) registers the "all", "default" and
 * "ip_forward" sysctls.  The devconf pointers are stored in @net only once
 * every step has succeeded.
 *
 * Returns 0 on success or a negative errno; on failure everything allocated
 * or registered so far is released in reverse order.
 */
static __net_init int devinet_init_net(struct net *net)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table_header *forw_hdr;
        struct ctl_table *tbl;
#endif
        struct ipv4_devconf *all, *dflt;
        int err;
        int i;

        /* Every allocation failure below reports -ENOMEM. */
        err = -ENOMEM;
        net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE,
                                                sizeof(struct hlist_head),
                                                GFP_KERNEL);
        if (!net->ipv4.inet_addr_lst)
                goto err_alloc_hash;

        all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
        if (!all)
                goto err_alloc_all;

        dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
        if (!dflt)
                goto err_alloc_dflt;

#ifdef CONFIG_SYSCTL
        tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL);
        if (!tbl)
                goto err_alloc_ctl;

        /* Repoint the copied "ip_forward" entry at this namespace's data. */
        tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
        tbl[0].extra1 = all;
        tbl[0].extra2 = net;
#endif

        /*
         * For namespaces other than init_net, net_inherit_devconf() selects
         * where the initial values come from.  Values not listed in the
         * switch leave the compiled-in copies made above untouched.
         */
        if (!net_eq(net, &init_net)) {
                switch (net_inherit_devconf()) {
                case 3:
                        /* copy from the current netns */
                        memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all,
                               sizeof(ipv4_devconf));
                        memcpy(dflt,
                               current->nsproxy->net_ns->ipv4.devconf_dflt,
                               sizeof(ipv4_devconf_dflt));
                        break;
                case 0:
                case 1:
                        /* copy from init_net */
                        memcpy(all, init_net.ipv4.devconf_all,
                               sizeof(ipv4_devconf));
                        memcpy(dflt, init_net.ipv4.devconf_dflt,
                               sizeof(ipv4_devconf_dflt));
                        break;
                case 2:
                        /* use compiled values */
                        break;
                }
        }

#ifdef CONFIG_SYSCTL
        err = __devinet_sysctl_register(net, "all", NETCONFA_IFINDEX_ALL, all);
        if (err < 0)
                goto err_reg_all;

        err = __devinet_sysctl_register(net, "default",
                                        NETCONFA_IFINDEX_DEFAULT, dflt);
        if (err < 0)
                goto err_reg_dflt;

        /* err was overwritten by the calls above; reset it for this step. */
        err = -ENOMEM;
        forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl,
                                          ARRAY_SIZE(ctl_forward_entry));
        if (!forw_hdr)
                goto err_reg_ctl;
        net->ipv4.forw_hdr = forw_hdr;
#endif

        for (i = 0; i < IN4_ADDR_HSIZE; i++)
                INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]);

        INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime);

        /* Nothing can fail any more: store the copies in the namespace. */
        net->ipv4.devconf_all = all;
        net->ipv4.devconf_dflt = dflt;
        return 0;

        /* Error unwinding: each label undoes the step before the failed one. */
#ifdef CONFIG_SYSCTL
err_reg_ctl:
        __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT);
err_reg_dflt:
        __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
        kfree(tbl);
err_alloc_ctl:
#endif
        kfree(dflt);
err_alloc_dflt:
        kfree(all);
err_alloc_all:
        kfree(net->ipv4.inet_addr_lst);
err_alloc_hash:
        return err;
}

/*
 * Per-namespace teardown: stop the address lifetime work, unregister the
 * sysctls set up by devinet_init_net() (when CONFIG_SYSCTL is enabled) and
 * free the per-namespace devconf copies and the address hash table.
 */
static __net_exit void devinet_exit_net(struct net *net)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table_header *forw_hdr;
        const struct ctl_table *forw_tbl;
#endif

        cancel_delayed_work_sync(&net->ipv4.addr_chk_work);

#ifdef CONFIG_SYSCTL
        forw_hdr = net->ipv4.forw_hdr;
        /* Read the table pointer before unregistering the header holding it. */
        forw_tbl = forw_hdr->ctl_table_arg;
        unregister_net_sysctl_table(forw_hdr);

        __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt,
                                    NETCONFA_IFINDEX_DEFAULT);
        __devinet_sysctl_unregister(net, net->ipv4.devconf_all,
                                    NETCONFA_IFINDEX_ALL);

        kfree(forw_tbl);
#endif
        kfree(net->ipv4.devconf_dflt);
        kfree(net->ipv4.devconf_all);
        kfree(net->ipv4.inet_addr_lst);
}

/* Per-network-namespace init/exit hooks, registered by devinet_init(). */
static __net_initdata struct pernet_operations devinet_ops = {
        .init   = devinet_init_net,
        .exit   = devinet_exit_net,
};

/* AF_INET link-attribute callbacks, registered by devinet_init(). */
static struct rtnl_af_ops inet_af_ops __read_mostly = {
        .family             = AF_INET,
        .fill_link_af       = inet_fill_link_af,
        .get_link_af_size   = inet_get_link_af_size,
        .validate_link_af   = inet_validate_link_af,
        .set_link_af        = inet_set_link_af,
};

/*
 * PF_INET rtnetlink message handlers, registered in one go by devinet_init().
 */
static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = {
        {
                .protocol = PF_INET,
                .msgtype = RTM_NEWADDR,
                .doit = inet_rtm_newaddr,
                .flags = RTNL_FLAG_DOIT_PERNET,
        },
        {
                .protocol = PF_INET,
                .msgtype = RTM_DELADDR,
                .doit = inet_rtm_deladdr,
                .flags = RTNL_FLAG_DOIT_PERNET,
        },
        {
                .protocol = PF_INET,
                .msgtype = RTM_GETADDR,
                .dumpit = inet_dump_ifaddr,
                .flags = RTNL_FLAG_DUMP_UNLOCKED |
                         RTNL_FLAG_DUMP_SPLIT_NLM_DONE,
        },
        {
                .protocol = PF_INET,
                .msgtype = RTM_GETNETCONF,
                .doit = inet_netconf_get_devconf,
                .dumpit = inet_netconf_dump_devconf,
                .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED,
        },
        {
                .owner = THIS_MODULE,
                .protocol = PF_INET,
                .msgtype = RTM_GETMULTICAST,
                .dumpit = inet_dump_ifmcaddr,
                .flags = RTNL_FLAG_DUMP_UNLOCKED,
        },
};

/*
 * Boot-time setup: register the per-namespace operations, the netdevice
 * notifier, the AF_INET rtnetlink af_ops (panicking if that fails) and the
 * PF_INET rtnetlink message handlers.
 */
void __init devinet_init(void)
{
        int af_err;

        register_pernet_subsys(&devinet_ops);
        register_netdevice_notifier(&ip_netdev_notifier);

        af_err = rtnl_af_register(&inet_af_ops);
        if (af_err)
                panic("Unable to register inet_af_ops\n");

        rtnl_register_many(devinet_rtnl_msg_handlers);
}



























   23 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Global definitions for the Ethernet IEEE 802.3 interface.
 *
 * Version:        @(#)if_ether.h        1.0.1a        02/08/94
 *
 * Author:        Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Donald Becker, <becker@super.org>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Steve Whitehouse, <gw7rrm@eeshack3.swan.ac.uk>
 */
#ifndef _LINUX_IF_ETHER_H
#define _LINUX_IF_ETHER_H

#include <linux/skbuff.h>
#include <uapi/linux/if_ether.h>

/*
 * Length of a MAC address printed as XX:XX:XX:XX:XX:XX: two hex digits per
 * octet plus one separator between consecutive octets.  The terminating NUL
 * is not counted.
 */
#define MAC_ADDR_STR_LEN (3 * ETH_ALEN - 1)

/* Return the Ethernet header located at the skb's MAC header. */
static inline struct ethhdr *eth_hdr(const struct sk_buff *skb)
{
        struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);

        return eth;
}

/*
 * Return skb->data viewed as an Ethernet header.
 *
 * In the TX path, prefer this over skb_reset_mac_header() followed by
 * eth_hdr().
 */
static inline struct ethhdr *skb_eth_hdr(const struct sk_buff *skb)
{
        struct ethhdr *eth = (struct ethhdr *)skb->data;

        return eth;
}

/* Return the Ethernet header located at the skb's inner MAC header. */
static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb)
{
        struct ethhdr *eth = (struct ethhdr *)skb_inner_mac_header(skb);

        return eth;
}

/* Prototypes only; the definitions are not part of this header. */
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr);

extern ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len);

#endif        /* _LINUX_IF_ETHER_H */




























































































































  202 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Landlock - Credential hooks
 *
 * Copyright © 2019-2020 Mickaël Salaün <mic@digikod.net>
 * Copyright © 2019-2020 ANSSI
 * Copyright © 2021-2025 Microsoft Corporation
 */

#ifndef _SECURITY_LANDLOCK_CRED_H
#define _SECURITY_LANDLOCK_CRED_H

#include <linux/container_of.h>
#include <linux/cred.h>
#include <linux/init.h>
#include <linux/rcupdate.h>

#include "access.h"
#include "limits.h"
#include "ruleset.h"
#include "setup.h"

/**
 * struct landlock_cred_security - Credential security blob
 *
 * This structure is packed to minimize the size of struct
 * landlock_file_security.  However, it is always aligned in the LSM cred blob,
 * see lsm_set_blob_size().
 */
struct landlock_cred_security {
        /**
         * @domain: Immutable ruleset enforced on a task.  May be NULL: the
         * helpers in this header test it to decide whether a task is
         * sandboxed.
         */
        struct landlock_ruleset *domain;

#ifdef CONFIG_AUDIT
        /**
         * @domain_exec: Bitmask identifying the domain layers that were enforced by
         * the current task's executed file (i.e. no new execve(2) since
         * landlock_restrict_self(2)).
         */
        u16 domain_exec;
        /**
         * @log_subdomains_off: Set if the domain descendants' log_status should be
         * set to %LANDLOCK_LOG_DISABLED.  This is not a landlock_hierarchy
         * configuration because it applies to future descendant domains and it does
         * not require a current domain.
         */
        u8 log_subdomains_off : 1;
#endif /* CONFIG_AUDIT */
} __packed;

#ifdef CONFIG_AUDIT

/*
 * Makes sure all layer executions can be stored: the domain_exec bitmask must
 * have at least LANDLOCK_MAX_NUM_LAYERS bits.
 */
static_assert(BITS_PER_TYPE(typeof_member(struct landlock_cred_security,
                                          domain_exec)) >=
              LANDLOCK_MAX_NUM_LAYERS);

#endif /* CONFIG_AUDIT */

/*
 * Return Landlock's part of @cred's security blob, found at offset
 * landlock_blob_sizes.lbs_cred from cred->security.
 */
static inline struct landlock_cred_security *
landlock_cred(const struct cred *cred)
{
        void *const blob = cred->security;

        return blob + landlock_blob_sizes.lbs_cred;
}

static inline struct landlock_ruleset *landlock_get_current_domain(void)
{
        return landlock_cred(current_cred())->domain;
}

/*
 * The call needs to come from an RCU read-side critical section.
 */
static inline const struct landlock_ruleset *
landlock_get_task_domain(const struct task_struct *const task)
{
        return landlock_cred(__task_cred(task))->domain;
}

static inline bool landlocked(const struct task_struct *const task)
{
        bool has_dom;

        if (task == current)
                return !!landlock_get_current_domain();

        rcu_read_lock();
        has_dom = !!landlock_get_task_domain(task);
        rcu_read_unlock();
        return has_dom;
}

/**
 * landlock_get_applicable_subject - Return the subject's Landlock credential
 *                                   if its enforced domain applies to (i.e.
 *                                   handles) at least one of the access rights
 *                                   specified in @masks
 *
 * @cred: credential (may be NULL, in which case NULL is returned)
 * @masks: access masks
 * @handle_layer: returned youngest layer handling a subset of @masks.  May be
 *                NULL.  Not set if the function returns NULL.
 *
 * Layers are scanned from the highest index down to 0 and the first one whose
 * access masks intersect @masks wins.
 *
 * Returns: landlock_cred(@cred) if any access right specified in @masks is
 * handled, or NULL otherwise.
 */
static inline const struct landlock_cred_security *
landlock_get_applicable_subject(const struct cred *const cred,
                                const struct access_masks masks,
                                size_t *const handle_layer)
{
        const union access_masks_all wanted = {
                .masks = masks,
        };
        const struct landlock_cred_security *subject;
        const struct landlock_ruleset *domain;
        ssize_t level;

        if (!cred)
                return NULL;

        subject = landlock_cred(cred);
        domain = subject->domain;
        if (!domain)
                return NULL;

        for (level = domain->num_layers - 1; level >= 0; level--) {
                const union access_masks_all handled = {
                        .masks = domain->access_masks[level],
                };

                if (!(handled.all & wanted.all))
                        continue;

                if (handle_layer)
                        *handle_layer = level;

                return subject;
        }

        return NULL;
}

/* Prototype only; the definition is not part of this header. */
__init void landlock_add_cred_hooks(void);

#endif /* _SECURITY_LANDLOCK_CRED_H */
















































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/*
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#ifndef _LINUX_HUGETLB_CGROUP_H
#define _LINUX_HUGETLB_CGROUP_H

#include <linux/mmdebug.h>

struct hugetlb_cgroup;
struct resv_map;
struct file_region;

#ifdef CONFIG_CGROUP_HUGETLB
/*
 * Event kinds counted per hstate in struct hugetlb_cgroup (events[] and
 * events_local[]).
 */
enum hugetlb_memory_event {
        HUGETLB_MAX,
        /* Number of event kinds; sizes the second dimension of the arrays. */
        HUGETLB_NR_MEMORY_EVENTS,
};

/* Usage counters referenced from hugetlb_cgroup::nodeinfo[]. */
struct hugetlb_cgroup_per_node {
        /* hugetlb usage in pages over all hstates. */
        unsigned long usage[HUGE_MAX_HSTATE];
};

/*
 * hugetlb cgroup state.  Embeds the generic css and keeps one counter, event
 * set and cgroup file handle per hstate slot (HUGE_MAX_HSTATE of each).
 */
struct hugetlb_cgroup {
        struct cgroup_subsys_state css;

        /*
         * the counter to account for hugepages from hugetlb.
         */
        struct page_counter hugepage[HUGE_MAX_HSTATE];

        /*
         * the counter to account for hugepage reservations from hugetlb.
         */
        struct page_counter rsvd_hugepage[HUGE_MAX_HSTATE];

        /* Event counters, one row per hstate slot and one column per event. */
        atomic_long_t events[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];
        atomic_long_t events_local[HUGE_MAX_HSTATE][HUGETLB_NR_MEMORY_EVENTS];

        /* Handle for "hugetlb.events" */
        struct cgroup_file events_file[HUGE_MAX_HSTATE];

        /* Handle for "hugetlb.events.local" */
        struct cgroup_file events_local_file[HUGE_MAX_HSTATE];

        /*
         * Flexible array of per-node usage pointers; presumably one entry per
         * NUMA node, sized by the allocator -- TODO confirm.
         */
        struct hugetlb_cgroup_per_node *nodeinfo[];
};

/*
 * Return the hugetlb cgroup recorded in @folio: the reservation cgroup when
 * @rsvd is true, the regular one otherwise.  Triggers VM_BUG_ON_FOLIO() if
 * @folio is not a hugetlb folio.
 */
static inline struct hugetlb_cgroup *
__hugetlb_cgroup_from_folio(struct folio *folio, bool rsvd)
{
        VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        return rsvd ? folio->_hugetlb_cgroup_rsvd : folio->_hugetlb_cgroup;
}

static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio)
{
        return __hugetlb_cgroup_from_folio(folio, false);
}

static inline struct hugetlb_cgroup *
hugetlb_cgroup_from_folio_rsvd(struct folio *folio)
{
        return __hugetlb_cgroup_from_folio(folio, true);
}

/*
 * Record @h_cg in @folio: in the reservation slot when @rsvd is true, in the
 * regular slot otherwise.  Triggers VM_BUG_ON_FOLIO() if @folio is not a
 * hugetlb folio.
 */
static inline void __set_hugetlb_cgroup(struct folio *folio,
                                       struct hugetlb_cgroup *h_cg, bool rsvd)
{
        VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        if (rsvd) {
                folio->_hugetlb_cgroup_rsvd = h_cg;
                return;
        }

        folio->_hugetlb_cgroup = h_cg;
}

static inline void set_hugetlb_cgroup(struct folio *folio,
                                     struct hugetlb_cgroup *h_cg)
{
        __set_hugetlb_cgroup(folio, h_cg, false);
}

static inline void set_hugetlb_cgroup_rsvd(struct folio *folio,
                                          struct hugetlb_cgroup *h_cg)
{
        __set_hugetlb_cgroup(folio, h_cg, true);
}

static inline bool hugetlb_cgroup_disabled(void)
{
        return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
}

/* Drop one css reference on @h_cg. */
static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
{
        struct cgroup_subsys_state *css = &h_cg->css;

        css_put(css);
}

/* Take a reference on @resv_map's css, if it has one. */
static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
                                                struct resv_map *resv_map)
{
        if (!resv_map->css)
                return;

        css_get(resv_map->css);
}

/* Drop a reference on @resv_map's css, if it has one. */
static inline void resv_map_put_hugetlb_cgroup_uncharge_info(
                                                struct resv_map *resv_map)
{
        if (!resv_map->css)
                return;

        css_put(resv_map->css);
}

/*
 * Out-of-line hugetlb cgroup API, declared only when CONFIG_CGROUP_HUGETLB is
 * set.  The #else branch of this header provides inline stubs with the same
 * names for the disabled case.
 */
extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                        struct hugetlb_cgroup **ptr);
extern int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                             struct hugetlb_cgroup **ptr);
extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                         struct hugetlb_cgroup *h_cg,
                                         struct folio *folio);
extern void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
                                              struct hugetlb_cgroup *h_cg,
                                              struct folio *folio);
extern void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
                                         struct folio *folio);
extern void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
                                              struct folio *folio);

extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                           struct hugetlb_cgroup *h_cg);
extern void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                                struct hugetlb_cgroup *h_cg);
extern void hugetlb_cgroup_uncharge_counter(struct resv_map *resv,
                                            unsigned long start,
                                            unsigned long end);

extern void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                                struct file_region *rg,
                                                unsigned long nr_pages,
                                                bool region_del);

extern void hugetlb_cgroup_file_init(void) __init;
extern void hugetlb_cgroup_migrate(struct folio *old_folio,
                                   struct folio *new_folio);

#else
/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                                       struct file_region *rg,
                                                       unsigned long nr_pages,
                                                       bool region_del)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: there is never a cgroup, always NULL. */
static inline struct hugetlb_cgroup *hugetlb_cgroup_from_folio(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_CGROUP_HUGETLB stub: there is never a cgroup, always NULL. */
static inline struct hugetlb_cgroup *
hugetlb_cgroup_from_folio_rsvd(struct folio *folio)
{
        return NULL;
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void set_hugetlb_cgroup(struct folio *folio,
                                     struct hugetlb_cgroup *h_cg)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void set_hugetlb_cgroup_rsvd(struct folio *folio,
                                          struct hugetlb_cgroup *h_cg)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: the controller is compiled out, always true. */
static inline bool hugetlb_cgroup_disabled(void)
{
        return true;
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_put_rsvd_cgroup(struct hugetlb_cgroup *h_cg)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void resv_map_dup_hugetlb_cgroup_uncharge_info(
                                                struct resv_map *resv_map)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void resv_map_put_hugetlb_cgroup_uncharge_info(
                                                struct resv_map *resv_map)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: always returns 0 and leaves *@ptr untouched. */
static inline int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                               struct hugetlb_cgroup **ptr)
{
        return 0;
}

/* !CONFIG_CGROUP_HUGETLB stub: always returns 0 and leaves *@ptr untouched. */
static inline int hugetlb_cgroup_charge_cgroup_rsvd(int idx,
                                                    unsigned long nr_pages,
                                                    struct hugetlb_cgroup **ptr)
{
        return 0;
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                                struct hugetlb_cgroup *h_cg,
                                                struct folio *folio)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void
hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
                                  struct hugetlb_cgroup *h_cg,
                                  struct folio *folio)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
                                                struct folio *folio)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_uncharge_folio_rsvd(int idx,
                                                     unsigned long nr_pages,
                                                     struct folio *folio)
{
}
/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_uncharge_cgroup(int idx,
                                                  unsigned long nr_pages,
                                                  struct hugetlb_cgroup *h_cg)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void
hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                    struct hugetlb_cgroup *h_cg)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_uncharge_counter(struct resv_map *resv,
                                                   unsigned long start,
                                                   unsigned long end)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_file_init(void)
{
}

/* !CONFIG_CGROUP_HUGETLB stub: does nothing. */
static inline void hugetlb_cgroup_migrate(struct folio *old_folio,
                                          struct folio *new_folio)
{
}

#endif  /* CONFIG_CGROUP_HUGETLB */
#endif  /* _LINUX_HUGETLB_CGROUP_H */







































































































































































































































































  198 


















  197 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Huawei Ltd.
 * Author: Jiang Liu <liuj97@gmail.com>
 *
 * Copyright (C) 2014-2016 Zi Shen Lim <zlim.lnx@gmail.com>
 */
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/printk.h>
#include <linux/sizes.h>
#include <linux/types.h>

#include <asm/debug-monitors.h>
#include <asm/errno.h>
#include <asm/insn.h>
#include <asm/kprobes.h>

/*
 * Single-bit flags OR-ed into instruction words by the generators below.
 *
 * AARCH64_INSN_SF_BIT: bit 31, set when the 64-bit variant is requested.
 * AARCH64_INSN_N_BIT:  bit 22; not referenced in this part of the file
 *                      (NOTE(review): presumably the N field of logical
 *                      immediates - confirm against its users).
 * AARCH64_INSN_LSL_12: bit 22, set by the add/sub immediate generator when
 *                      the immediate has been shifted right by 12.
 */
#define AARCH64_INSN_SF_BIT        BIT(31)
#define AARCH64_INSN_N_BIT        BIT(22)
#define AARCH64_INSN_LSL_12        BIT(22)

/*
 * Look up where an immediate of the given type lives inside a 32-bit
 * instruction word.
 *
 * On success 0 is returned, *maskp receives the unshifted field mask (one bit
 * per field bit, starting at bit 0) and *shiftp receives the position of the
 * field's least significant bit. Types that are not listed here - including
 * the split ADR immediate, which the callers in this file handle on their
 * own - yield -EINVAL and leave both outputs untouched.
 */
static int __kprobes aarch64_get_imm_shift_mask(enum aarch64_insn_imm_type type,
                                                u32 *maskp, int *shiftp)
{
        unsigned int width;        /* number of bits in the field */
        int lsb;                /* position of the field's lowest bit */

        switch (type) {
        case AARCH64_INSN_IMM_26:
                width = 26;
                lsb = 0;
                break;
        case AARCH64_INSN_IMM_19:
                width = 19;
                lsb = 5;
                break;
        case AARCH64_INSN_IMM_16:
                width = 16;
                lsb = 5;
                break;
        case AARCH64_INSN_IMM_14:
                width = 14;
                lsb = 5;
                break;
        case AARCH64_INSN_IMM_12:
                width = 12;
                lsb = 10;
                break;
        case AARCH64_INSN_IMM_9:
                width = 9;
                lsb = 12;
                break;
        case AARCH64_INSN_IMM_7:
                width = 7;
                lsb = 15;
                break;
        case AARCH64_INSN_IMM_6:
        case AARCH64_INSN_IMM_S:
                width = 6;
                lsb = 10;
                break;
        case AARCH64_INSN_IMM_R:
                width = 6;
                lsb = 16;
                break;
        case AARCH64_INSN_IMM_N:
                width = 1;
                lsb = 22;
                break;
        default:
                return -EINVAL;
        }

        *maskp = BIT(width) - 1;
        *shiftp = lsb;

        return 0;
}

/*
 * Layout of the split ADR immediate as used by the decode/encode helpers
 * below: the immediate spans ADR_IMM_SIZE (2M) values; its low
 * ADR_IMM_HILOSPLIT (2) bits are stored at bit ADR_IMM_LOSHIFT (29) of the
 * instruction and the remaining high bits at bit ADR_IMM_HISHIFT (5).
 * The *_MASK values are unshifted masks for the two parts.
 */
#define ADR_IMM_HILOSPLIT        2
#define ADR_IMM_SIZE                SZ_2M
#define ADR_IMM_LOMASK                ((1 << ADR_IMM_HILOSPLIT) - 1)
#define ADR_IMM_HIMASK                ((ADR_IMM_SIZE >> ADR_IMM_HILOSPLIT) - 1)
#define ADR_IMM_LOSHIFT                29
#define ADR_IMM_HISHIFT                5

/*
 * Extract the immediate field of the given type from an instruction word.
 *
 * The raw field bits are returned zero-extended; no sign extension is
 * performed here. For AARCH64_INSN_IMM_ADR the two halves of the split
 * immediate are stitched back together. An unknown immediate type is logged
 * and decodes as 0.
 */
u64 aarch64_insn_decode_immediate(enum aarch64_insn_imm_type type, u32 insn)
{
        u32 mask;
        int shift;

        if (type == AARCH64_INSN_IMM_ADR) {
                u32 lo = (insn >> ADR_IMM_LOSHIFT) & ADR_IMM_LOMASK;
                u32 hi = (insn >> ADR_IMM_HISHIFT) & ADR_IMM_HIMASK;

                /* high part sits above the ADR_IMM_HILOSPLIT low bits */
                return ((hi << ADR_IMM_HILOSPLIT) | lo) & (ADR_IMM_SIZE - 1);
        }

        if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
                pr_err("%s: unknown immediate encoding %d\n", __func__,
                       type);
                return 0;
        }

        return (insn >> shift) & mask;
}

/*
 * Replace the immediate field of the given type in @insn with @imm.
 *
 * Bits of @imm that do not fit the field are silently dropped. For
 * AARCH64_INSN_IMM_ADR the value is split into its low and high parts and
 * each part is placed at its own position.
 *
 * AARCH64_BREAK_FAULT passed in as @insn is returned unchanged, which lets
 * callers chain several encode steps and check for failure once at the end.
 * An unknown immediate type is logged and also yields AARCH64_BREAK_FAULT.
 */
u32 __kprobes aarch64_insn_encode_immediate(enum aarch64_insn_imm_type type,
                                  u32 insn, u64 imm)
{
        u32 field;        /* in-place mask of the bits being replaced */
        u32 bits;        /* new contents of those bits, already in place */

        if (insn == AARCH64_BREAK_FAULT)
                return AARCH64_BREAK_FAULT;

        if (type == AARCH64_INSN_IMM_ADR) {
                u32 lo = (imm & ADR_IMM_LOMASK) << ADR_IMM_LOSHIFT;
                u32 hi = ((imm >> ADR_IMM_HILOSPLIT) & ADR_IMM_HIMASK)
                         << ADR_IMM_HISHIFT;

                field = (ADR_IMM_LOMASK << ADR_IMM_LOSHIFT) |
                        (ADR_IMM_HIMASK << ADR_IMM_HISHIFT);
                bits = (lo | hi) & field;
        } else {
                u32 mask;
                int shift;

                if (aarch64_get_imm_shift_mask(type, &mask, &shift) < 0) {
                        pr_err("%s: unknown immediate encoding %d\n", __func__,
                               type);
                        return AARCH64_BREAK_FAULT;
                }

                field = mask << shift;
                bits = (imm & mask) << shift;
        }

        /* Clear the old immediate, then drop in the new one. */
        return (insn & ~field) | bits;
}

/*
 * Extract the 5-bit register number stored in the given register field of
 * an instruction word.
 *
 * The field positions mirror aarch64_insn_encode_register(): Rt/Rd at bit 0,
 * Rn at bit 5, Rt2/Ra at bit 10 and Rm/Rs at bit 16. Rs is accepted here as
 * well so that every field the encoder can write can also be read back.
 *
 * An unknown register type is logged and decodes as 0.
 */
u32 aarch64_insn_decode_register(enum aarch64_insn_register_type type,
                                        u32 insn)
{
        int shift;

        switch (type) {
        case AARCH64_INSN_REGTYPE_RT:
        case AARCH64_INSN_REGTYPE_RD:
                shift = 0;
                break;
        case AARCH64_INSN_REGTYPE_RN:
                shift = 5;
                break;
        case AARCH64_INSN_REGTYPE_RT2:
        case AARCH64_INSN_REGTYPE_RA:
                shift = 10;
                break;
        case AARCH64_INSN_REGTYPE_RM:
        case AARCH64_INSN_REGTYPE_RS:
                shift = 16;
                break;
        default:
                pr_err("%s: unknown register type encoding %d\n", __func__,
                       type);
                return 0;
        }

        return (insn >> shift) & GENMASK(4, 0);
}

/*
 * Write register number @reg into the 5-bit register field selected by
 * @type and return the updated instruction word.
 *
 * AARCH64_BREAK_FAULT passed in as @insn is handed back untouched so that a
 * failure in an earlier encode step survives a chain of calls. A register
 * outside [AARCH64_INSN_REG_0, AARCH64_INSN_REG_SP] or an unknown field type
 * is logged and yields AARCH64_BREAK_FAULT.
 */
static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type,
                                        u32 insn,
                                        enum aarch64_insn_register reg)
{
        int lsb;        /* bit position of the selected register field */

        if (insn == AARCH64_BREAK_FAULT)
                return AARCH64_BREAK_FAULT;

        if (!(reg >= AARCH64_INSN_REG_0 && reg <= AARCH64_INSN_REG_SP)) {
                pr_err("%s: unknown register encoding %d\n", __func__, reg);
                return AARCH64_BREAK_FAULT;
        }

        switch (type) {
        case AARCH64_INSN_REGTYPE_RT:
        case AARCH64_INSN_REGTYPE_RD:
                lsb = 0;
                break;
        case AARCH64_INSN_REGTYPE_RN:
                lsb = 5;
                break;
        case AARCH64_INSN_REGTYPE_RT2:
        case AARCH64_INSN_REGTYPE_RA:
                lsb = 10;
                break;
        case AARCH64_INSN_REGTYPE_RM:
        case AARCH64_INSN_REGTYPE_RS:
                lsb = 16;
                break;
        default:
                pr_err("%s: unknown register type encoding %d\n", __func__,
                       type);
                return AARCH64_BREAK_FAULT;
        }

        /* Wipe the old register number and insert the new one. */
        return (insn & ~(GENMASK(4, 0) << lsb)) | (reg << lsb);
}

/*
 * Maps an AARCH64_INSN_SIZE_* access size to a 2-bit value.
 * aarch64_insn_encode_ldst_size() places that value in bits [31:30] of a
 * load/store instruction, and aarch64_insn_gen_load_store_imm() additionally
 * uses it as the shift by which its immediate offset is scaled.
 */
static const u32 aarch64_insn_ldst_size[] = {
        [AARCH64_INSN_SIZE_8] = 0,
        [AARCH64_INSN_SIZE_16] = 1,
        [AARCH64_INSN_SIZE_32] = 2,
        [AARCH64_INSN_SIZE_64] = 3,
};

/*
 * Store the access size @type in bits [31:30] of a load/store instruction
 * word and return the result.
 *
 * A size outside [AARCH64_INSN_SIZE_8, AARCH64_INSN_SIZE_64] is logged and
 * yields AARCH64_BREAK_FAULT.
 */
static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type,
                                         u32 insn)
{
        if (!(type >= AARCH64_INSN_SIZE_8 && type <= AARCH64_INSN_SIZE_64)) {
                pr_err("%s: unknown size encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* Replace the two size bits with the table value for this size. */
        return (insn & ~GENMASK(31, 30)) | (aarch64_insn_ldst_size[type] << 30);
}

/*
 * Compute the byte displacement from @pc to @addr for a PC-relative
 * instruction whose reach is [-@range, @range).
 *
 * Both addresses must be 4-byte aligned. On any failure (misalignment or a
 * displacement outside the reach) an error is logged and @range itself is
 * returned; since a valid displacement is always strictly below @range,
 * callers detect failure with "result >= range".
 */
static inline long label_imm_common(unsigned long pc, unsigned long addr,
                                     long range)
{
        long delta;

        /* Either address having one of its two low bits set is an error. */
        if ((pc | addr) & 0x3) {
                pr_err("%s: A64 instructions must be word aligned\n", __func__);
                return range;
        }

        delta = (long)addr - (long)pc;
        if (delta >= -range && delta < range)
                return delta;

        pr_err("%s: offset out of range\n", __func__);
        return range;
}

/*
 * Generate an immediate branch from @pc to @addr, with or without link
 * depending on @type.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT when the target is
 * misaligned or out of reach, or when @type is not a link/no-link branch.
 */
u32 __kprobes aarch64_insn_gen_branch_imm(unsigned long pc, unsigned long addr,
                                          enum aarch64_insn_branch_type type)
{
        long disp;
        u32 opcode;

        /*
         * B/BL reach [-128M, 128M). The original author notes that the ARM64
         * virtual address layout keeps all kernel and module text within
         * +/-128M of each other.
         */
        disp = label_imm_common(pc, addr, SZ_128M);
        if (disp >= SZ_128M)
                return AARCH64_BREAK_FAULT;

        if (type == AARCH64_INSN_BRANCH_LINK) {
                opcode = aarch64_insn_get_bl_value();
        } else if (type == AARCH64_INSN_BRANCH_NOLINK) {
                opcode = aarch64_insn_get_b_value();
        } else {
                pr_err("%s: unknown branch encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* The immediate counts 4-byte instructions, not bytes. */
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, opcode,
                                             disp >> 2);
}

/*
 * Generate a compare-and-branch (on zero / on non-zero) instruction that
 * tests @reg and branches from @pc to @addr.
 *
 * The target must be word aligned and within [-1M, 1M) of @pc. Returns the
 * encoded instruction, or AARCH64_BREAK_FAULT on a bad target, branch type,
 * variant or register.
 */
u32 aarch64_insn_gen_comp_branch_imm(unsigned long pc, unsigned long addr,
                                     enum aarch64_insn_register reg,
                                     enum aarch64_insn_variant variant,
                                     enum aarch64_insn_branch_type type)
{
        long disp;
        u32 opcode;

        disp = label_imm_common(pc, addr, SZ_1M);
        if (disp >= SZ_1M)
                return AARCH64_BREAK_FAULT;

        if (type == AARCH64_INSN_BRANCH_COMP_ZERO) {
                opcode = aarch64_insn_get_cbz_value();
        } else if (type == AARCH64_INSN_BRANCH_COMP_NONZERO) {
                opcode = aarch64_insn_get_cbnz_value();
        } else {
                pr_err("%s: unknown branch encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* The 32-bit variant needs no extra bits. */
        if (variant == AARCH64_INSN_VARIANT_64BIT) {
                opcode |= AARCH64_INSN_SF_BIT;
        } else if (variant != AARCH64_INSN_VARIANT_32BIT) {
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        opcode = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, opcode,
                                              reg);

        /* The immediate counts 4-byte instructions, not bytes. */
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, opcode,
                                             disp >> 2);
}

/*
 * Generate a conditional branch from @pc to @addr that is taken when
 * condition @cond holds.
 *
 * The target must be word aligned and within [-1M, 1M) of @pc. Returns the
 * encoded instruction, or AARCH64_BREAK_FAULT when the target is misaligned
 * or out of reach, or when @cond is outside
 * [AARCH64_INSN_COND_EQ, AARCH64_INSN_COND_AL].
 */
u32 aarch64_insn_gen_cond_branch_imm(unsigned long pc, unsigned long addr,
                                     enum aarch64_insn_condition cond)
{
        u32 insn;
        long offset;

        offset = label_imm_common(pc, addr, SZ_1M);
        /*
         * label_imm_common() reports failure by returning the range itself.
         * Without this check the sentinel would be truncated into the 19-bit
         * immediate and silently encode a branch to a bogus target.
         */
        if (offset >= SZ_1M)
                return AARCH64_BREAK_FAULT;

        insn = aarch64_insn_get_bcond_value();

        if (cond < AARCH64_INSN_COND_EQ || cond > AARCH64_INSN_COND_AL) {
                pr_err("%s: unknown condition encoding %d\n", __func__, cond);
                return AARCH64_BREAK_FAULT;
        }
        insn |= cond;

        /* The immediate counts 4-byte instructions, not bytes. */
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
                                             offset >> 2);
}

/*
 * Generate a branch to the address held in @reg: plain, with link, or
 * return, depending on @type.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT for an unknown
 * branch type or an invalid register.
 */
u32 aarch64_insn_gen_branch_reg(enum aarch64_insn_register reg,
                                enum aarch64_insn_branch_type type)
{
        u32 opcode;

        if (type == AARCH64_INSN_BRANCH_NOLINK) {
                opcode = aarch64_insn_get_br_value();
        } else if (type == AARCH64_INSN_BRANCH_LINK) {
                opcode = aarch64_insn_get_blr_value();
        } else if (type == AARCH64_INSN_BRANCH_RETURN) {
                opcode = aarch64_insn_get_ret_value();
        } else {
                pr_err("%s: unknown branch encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* The target register goes into the Rn field. */
        return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, opcode,
                                            reg);
}

/*
 * Generate a load/store of @reg at address @base + @offset, where the offset
 * is held in a register. @type selects load, sign-extending load or store;
 * @size selects the access size.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT if the type, size
 * or any register is invalid.
 */
u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
                                    enum aarch64_insn_register base,
                                    enum aarch64_insn_register offset,
                                    enum aarch64_insn_size_type size,
                                    enum aarch64_insn_ldst_type type)
{
        u32 word;

        if (type == AARCH64_INSN_LDST_LOAD_REG_OFFSET) {
                word = aarch64_insn_get_ldr_reg_value();
        } else if (type == AARCH64_INSN_LDST_SIGNED_LOAD_REG_OFFSET) {
                word = aarch64_insn_get_signed_ldr_reg_value();
        } else if (type == AARCH64_INSN_LDST_STORE_REG_OFFSET) {
                word = aarch64_insn_get_str_reg_value();
        } else {
                pr_err("%s: unknown load/store encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /*
         * Fill in size, then Rt (data), Rn (base) and Rm (offset). A failure
         * in any register step propagates as AARCH64_BREAK_FAULT.
         */
        word = aarch64_insn_encode_ldst_size(size, word);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word, reg);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, word,
                                            base);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, word,
                                            offset);

        return word;
}

/*
 * Generate a load/store of @reg at address @base + @imm with an unsigned,
 * size-scaled 12-bit immediate offset. @type selects load, sign-extending
 * load or store; @size selects the access size.
 *
 * @imm is a byte offset. It must be a multiple of the access size (as given
 * by the shift in aarch64_insn_ldst_size[]) and, once scaled down by that
 * shift, fit in 12 bits.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT if the size,
 * immediate, type or any register is invalid.
 */
u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
                                    enum aarch64_insn_register base,
                                    unsigned int imm,
                                    enum aarch64_insn_size_type size,
                                    enum aarch64_insn_ldst_type type)
{
        u32 insn;
        u32 shift;

        /* Validate before indexing aarch64_insn_ldst_size[] below. */
        if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) {
                /* Report the offending size, not the load/store type. */
                pr_err("%s: unknown size encoding %d\n", __func__, size);
                return AARCH64_BREAK_FAULT;
        }

        shift = aarch64_insn_ldst_size[size];
        /* Only bits [shift, shift + 12) of the byte offset may be set. */
        if (imm & ~(BIT(12 + shift) - BIT(shift))) {
                pr_err("%s: invalid imm: %d\n", __func__, imm);
                return AARCH64_BREAK_FAULT;
        }

        imm >>= shift;

        switch (type) {
        case AARCH64_INSN_LDST_LOAD_IMM_OFFSET:
                insn = aarch64_insn_get_ldr_imm_value();
                break;
        case AARCH64_INSN_LDST_SIGNED_LOAD_IMM_OFFSET:
                insn = aarch64_insn_get_signed_load_imm_value();
                break;
        case AARCH64_INSN_LDST_STORE_IMM_OFFSET:
                insn = aarch64_insn_get_str_imm_value();
                break;
        default:
                pr_err("%s: unknown load/store encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_ldst_size(size, insn);

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg);

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
                                            base);

        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
}

/*
 * Generate a PC-relative literal load of the data at @addr into @reg, for an
 * instruction located at @pc. Bit 30 is set when @is64bit is true.
 *
 * The target must be word aligned and within [-1M, 1M) of @pc. Returns the
 * encoded instruction, or AARCH64_BREAK_FAULT on a bad target or register.
 */
u32 aarch64_insn_gen_load_literal(unsigned long pc, unsigned long addr,
                                  enum aarch64_insn_register reg,
                                  bool is64bit)
{
        long disp;
        u32 word;

        disp = label_imm_common(pc, addr, SZ_1M);
        if (disp >= SZ_1M)
                return AARCH64_BREAK_FAULT;

        word = aarch64_insn_get_ldr_lit_value() | (is64bit ? BIT(30) : 0);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word, reg);

        /* The immediate counts 4-byte words, not bytes. */
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, word,
                                             disp >> 2);
}

/*
 * Generate a pre- or post-indexed load/store pair of @reg1 and @reg2 at
 * @base with byte offset @offset.
 *
 * For the 32-bit variant @offset must be a multiple of 4 in [-256, 252];
 * for the 64-bit variant a multiple of 8 in [-512, 504]. The offset is
 * stored scaled down by the register size in a 7-bit field.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT on an invalid
 * type, variant, offset or register.
 */
u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
                                     enum aarch64_insn_register reg2,
                                     enum aarch64_insn_register base,
                                     int offset,
                                     enum aarch64_insn_variant variant,
                                     enum aarch64_insn_ldst_type type)
{
        u32 word;
        int scale;        /* log2 of the per-register access size */

        if (type == AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX) {
                word = aarch64_insn_get_ldp_pre_value();
        } else if (type == AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX) {
                word = aarch64_insn_get_stp_pre_value();
        } else if (type == AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX) {
                word = aarch64_insn_get_ldp_post_value();
        } else if (type == AARCH64_INSN_LDST_STORE_PAIR_POST_INDEX) {
                word = aarch64_insn_get_stp_post_value();
        } else {
                pr_err("%s: unknown load/store encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        switch (variant) {
        case AARCH64_INSN_VARIANT_32BIT:
                if (offset < -256 || offset > 252 || (offset & 0x3)) {
                        pr_err("%s: offset must be multiples of 4 in the range of [-256, 252] %d\n",
                               __func__, offset);
                        return AARCH64_BREAK_FAULT;
                }
                scale = 2;
                break;
        case AARCH64_INSN_VARIANT_64BIT:
                if (offset < -512 || offset > 504 || (offset & 0x7)) {
                        pr_err("%s: offset must be multiples of 8 in the range of [-512, 504] %d\n",
                               __func__, offset);
                        return AARCH64_BREAK_FAULT;
                }
                scale = 3;
                word |= AARCH64_INSN_SF_BIT;
                break;
        default:
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word,
                                            reg1);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, word,
                                            reg2);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, word,
                                            base);

        /* Negative offsets rely on the encoder masking to 7 bits. */
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_7, word,
                                             offset >> scale);
}

/*
 * Generate a load-acquire or store-release of @reg at the address in @base,
 * with access size @size.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT on an invalid
 * type, size or register.
 */
u32 aarch64_insn_gen_load_acq_store_rel(enum aarch64_insn_register reg,
                                        enum aarch64_insn_register base,
                                        enum aarch64_insn_size_type size,
                                        enum aarch64_insn_ldst_type type)
{
        u32 word;

        if (type == AARCH64_INSN_LDST_LOAD_ACQ) {
                word = aarch64_insn_get_load_acq_value();
        } else if (type == AARCH64_INSN_LDST_STORE_REL) {
                word = aarch64_insn_get_store_rel_value();
        } else {
                pr_err("%s: unknown load-acquire/store-release encoding %d\n",
                       __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        word = aarch64_insn_encode_ldst_size(size, word);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word,
                                            reg);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, word,
                                            base);

        return word;
}

/*
 * Generate a load-exclusive or store-exclusive of @reg at the address in
 * @base, optionally with acquire (loads) or release (stores) semantics as
 * selected by @type. @state is written to the Rs field; the Rt2 field is
 * always set to AARCH64_INSN_REG_ZR.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT on an invalid
 * type, size or register.
 */
u32 aarch64_insn_gen_load_store_ex(enum aarch64_insn_register reg,
                                   enum aarch64_insn_register base,
                                   enum aarch64_insn_register state,
                                   enum aarch64_insn_size_type size,
                                   enum aarch64_insn_ldst_type type)
{
        u32 word;

        /* The acquire/release forms differ from the plain ones by bit 15. */
        switch (type) {
        case AARCH64_INSN_LDST_LOAD_EX:
                word = aarch64_insn_get_load_ex_value();
                break;
        case AARCH64_INSN_LDST_LOAD_ACQ_EX:
                word = aarch64_insn_get_load_ex_value() | BIT(15);
                break;
        case AARCH64_INSN_LDST_STORE_EX:
                word = aarch64_insn_get_store_ex_value();
                break;
        case AARCH64_INSN_LDST_STORE_REL_EX:
                word = aarch64_insn_get_store_ex_value() | BIT(15);
                break;
        default:
                pr_err("%s: unknown load/store exclusive encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        word = aarch64_insn_encode_ldst_size(size, word);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word,
                                            reg);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, word,
                                            base);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT2, word,
                                            AARCH64_INSN_REG_ZR);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, word,
                                            state);

        return word;
}

#ifdef CONFIG_ARM64_LSE_ATOMICS
/*
 * Store the memory ordering @type in bits [23:22] of an atomic load-op
 * instruction word: none = 0, release = 1, acquire = 2, acquire+release = 3.
 *
 * An unknown ordering is logged and yields AARCH64_BREAK_FAULT.
 */
static u32 aarch64_insn_encode_ldst_order(enum aarch64_insn_mem_order_type type,
                                          u32 insn)
{
        u32 field;

        switch (type) {
        case AARCH64_INSN_MEM_ORDER_NONE:
                field = 0;
                break;
        case AARCH64_INSN_MEM_ORDER_REL:
                field = 1;
                break;
        case AARCH64_INSN_MEM_ORDER_ACQ:
                field = 2;
                break;
        case AARCH64_INSN_MEM_ORDER_ACQREL:
                field = 3;
                break;
        default:
                pr_err("%s: unknown mem order %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        return (insn & ~GENMASK(23, 22)) | (field << 22);
}

/*
 * Generate an atomic load-op instruction (add, clear, exclusive-or, set or
 * swap, chosen by @op) on the memory at @address. @value goes into the Rs
 * field and @result into the Rt field. Only 32-bit and 64-bit sizes are
 * implemented; @order selects the memory ordering.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT on an
 * unimplemented op or size, an unknown ordering or an invalid register.
 */
u32 aarch64_insn_gen_atomic_ld_op(enum aarch64_insn_register result,
                                  enum aarch64_insn_register address,
                                  enum aarch64_insn_register value,
                                  enum aarch64_insn_size_type size,
                                  enum aarch64_insn_mem_atomic_op op,
                                  enum aarch64_insn_mem_order_type order)
{
        u32 word;

        if (op == AARCH64_INSN_MEM_ATOMIC_ADD) {
                word = aarch64_insn_get_ldadd_value();
        } else if (op == AARCH64_INSN_MEM_ATOMIC_CLR) {
                word = aarch64_insn_get_ldclr_value();
        } else if (op == AARCH64_INSN_MEM_ATOMIC_EOR) {
                word = aarch64_insn_get_ldeor_value();
        } else if (op == AARCH64_INSN_MEM_ATOMIC_SET) {
                word = aarch64_insn_get_ldset_value();
        } else if (op == AARCH64_INSN_MEM_ATOMIC_SWP) {
                word = aarch64_insn_get_swp_value();
        } else {
                pr_err("%s: unimplemented mem atomic op %d\n", __func__, op);
                return AARCH64_BREAK_FAULT;
        }

        /* Narrower accesses are not implemented by this generator. */
        if (size != AARCH64_INSN_SIZE_32 && size != AARCH64_INSN_SIZE_64) {
                pr_err("%s: unimplemented size encoding %d\n", __func__, size);
                return AARCH64_BREAK_FAULT;
        }

        word = aarch64_insn_encode_ldst_size(size, word);
        word = aarch64_insn_encode_ldst_order(order, word);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word,
                                            result);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, word,
                                            address);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, word,
                                            value);

        return word;
}

/*
 * Store the memory ordering @type in a compare-and-swap instruction word:
 * acquire is bit 22 and release is bit 15; both bits are cleared first.
 *
 * An unknown ordering is logged and yields AARCH64_BREAK_FAULT.
 */
static u32 aarch64_insn_encode_cas_order(enum aarch64_insn_mem_order_type type,
                                         u32 insn)
{
        const u32 acquire = BIT(22);
        const u32 release = BIT(15);
        u32 bits;

        switch (type) {
        case AARCH64_INSN_MEM_ORDER_NONE:
                bits = 0;
                break;
        case AARCH64_INSN_MEM_ORDER_ACQ:
                bits = acquire;
                break;
        case AARCH64_INSN_MEM_ORDER_REL:
                bits = release;
                break;
        case AARCH64_INSN_MEM_ORDER_ACQREL:
                bits = acquire | release;
                break;
        default:
                pr_err("%s: unknown mem order %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        return (insn & ~(acquire | release)) | bits;
}

/*
 * Generate a compare-and-swap instruction on the memory at @address.
 * @value goes into the Rs field and @result into the Rt field. Only 32-bit
 * and 64-bit sizes are implemented; @order selects the memory ordering.
 *
 * Returns the encoded instruction, or AARCH64_BREAK_FAULT on an
 * unimplemented size, an unknown ordering or an invalid register.
 */
u32 aarch64_insn_gen_cas(enum aarch64_insn_register result,
                         enum aarch64_insn_register address,
                         enum aarch64_insn_register value,
                         enum aarch64_insn_size_type size,
                         enum aarch64_insn_mem_order_type order)
{
        u32 word;

        /* Narrower accesses are not implemented by this generator. */
        if (size != AARCH64_INSN_SIZE_32 && size != AARCH64_INSN_SIZE_64) {
                pr_err("%s: unimplemented size encoding %d\n", __func__, size);
                return AARCH64_BREAK_FAULT;
        }

        word = aarch64_insn_get_cas_value();
        word = aarch64_insn_encode_ldst_size(size, word);
        word = aarch64_insn_encode_cas_order(order, word);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, word,
                                            result);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, word,
                                            address);
        word = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, word,
                                            value);

        return word;
}
#endif

/*
 * Generate an ADD/SUB/ADDS/SUBS (immediate) instruction.
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Rn field
 * @imm:     immediate; must fit in 12 bits, or be a 12-bit value shifted
 *           left by 12 (in which case the LSL #12 form is used)
 * @variant: 32-bit or 64-bit operation
 * @type:    which of the four add/sub flavours to emit
 *
 * Any unknown @type or @variant, or an immediate that cannot be encoded,
 * is logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
                                 enum aarch64_insn_register src,
                                 int imm, enum aarch64_insn_variant variant,
                                 enum aarch64_insn_adsb_type type)
{
        u32 insn;

        switch (type) {
        case AARCH64_INSN_ADSB_ADD:
                insn = aarch64_insn_get_add_imm_value();
                break;
        case AARCH64_INSN_ADSB_SUB:
                insn = aarch64_insn_get_sub_imm_value();
                break;
        case AARCH64_INSN_ADSB_ADD_SETFLAGS:
                insn = aarch64_insn_get_adds_imm_value();
                break;
        case AARCH64_INSN_ADSB_SUB_SETFLAGS:
                insn = aarch64_insn_get_subs_imm_value();
                break;
        default:
                pr_err("%s: unknown add/sub encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (variant == AARCH64_INSN_VARIANT_64BIT) {
                insn |= AARCH64_INSN_SF_BIT;
        } else if (variant != AARCH64_INSN_VARIANT_32BIT) {
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        /*
         * Reject anything wider than 24 bits (12bit + 12bit shift), and
         * anything that has bits set in both the low and the high 12 bits.
         */
        if ((imm & ~(BIT(24) - 1)) ||
            ((imm & ~(SZ_4K - 1)) && (imm & (SZ_4K - 1)))) {
                pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
                return AARCH64_BREAK_FAULT;
        }

        /* Only the upper 12 bits are populated: use the shifted form */
        if (imm & ~(SZ_4K - 1)) {
                imm >>= 12;
                insn |= AARCH64_INSN_LSL_12;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);

        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm);
}

/*
 * Generate a BFM/UBFM/SBFM instruction.
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Rn field
 * @immr:    value for the immr field
 * @imms:    value for the imms field
 * @variant: 32-bit or 64-bit operation; limits @immr/@imms to 5 or 6 bits
 * @type:    which bitfield-move flavour to emit
 *
 * Invalid arguments are logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_bitfield(enum aarch64_insn_register dst,
                              enum aarch64_insn_register src,
                              int immr, int imms,
                              enum aarch64_insn_variant variant,
                              enum aarch64_insn_bitfield_type type)
{
        u32 insn;
        u32 field_mask;

        switch (type) {
        case AARCH64_INSN_BITFIELD_MOVE:
                insn = aarch64_insn_get_bfm_value();
                break;
        case AARCH64_INSN_BITFIELD_MOVE_UNSIGNED:
                insn = aarch64_insn_get_ubfm_value();
                break;
        case AARCH64_INSN_BITFIELD_MOVE_SIGNED:
                insn = aarch64_insn_get_sbfm_value();
                break;
        default:
                pr_err("%s: unknown bitfield encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (variant == AARCH64_INSN_VARIANT_32BIT) {
                field_mask = GENMASK(4, 0);
        } else if (variant == AARCH64_INSN_VARIANT_64BIT) {
                /* The 64-bit form sets both SF and N */
                insn |= AARCH64_INSN_SF_BIT | AARCH64_INSN_N_BIT;
                field_mask = GENMASK(5, 0);
        } else {
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        /* Neither immediate may have bits outside the variant's field width */
        if (immr & ~field_mask) {
                pr_err("%s: invalid immr encoding %d\n", __func__, immr);
                return AARCH64_BREAK_FAULT;
        }
        if (imms & ~field_mask) {
                pr_err("%s: invalid imms encoding %d\n", __func__, imms);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
        insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
        insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);

        return insn;
}

u32 aarch64_insn_gen_movewide(enum aarch64_insn_register dst,
                              int imm, int shift,
                              enum aarch64_insn_variant variant,
                              enum aarch64_insn_movewide_type type)
{
        u32 insn;

        switch (type) {
        case AARCH64_INSN_MOVEWIDE_ZERO:
                insn = aarch64_insn_get_movz_value();
                break;
        case AARCH64_INSN_MOVEWIDE_KEEP:
                insn = aarch64_insn_get_movk_value();
                break;
        case AARCH64_INSN_MOVEWIDE_INVERSE:
                insn = aarch64_insn_get_movn_value();
                break;
        default:
                pr_err("%s: unknown movewide encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (imm & ~(SZ_64K - 1)) {
                pr_err("%s: invalid immediate encoding %d\n", __func__, imm);
                return AARCH64_BREAK_FAULT;
        }

        switch (variant) {
        case AARCH64_INSN_VARIANT_32BIT:
                if (shift != 0 && shift != 16) {
                        pr_err("%s: invalid shift encoding %d\n", __func__,
                               shift);
                        return AARCH64_BREAK_FAULT;
                }
                break;
        case AARCH64_INSN_VARIANT_64BIT:
                insn |= AARCH64_INSN_SF_BIT;
                if (shift != 0 && shift != 16 && shift != 32 && shift != 48) {
                        pr_err("%s: invalid shift encoding %d\n", __func__,
                               shift);
                        return AARCH64_BREAK_FAULT;
                }
                break;
        default:
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        insn |= (shift >> 4) << 21;

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);

        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_16, insn, imm);
}

/*
 * Generate an ADD/SUB/ADDS/SUBS (shifted register) instruction.
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Rn field
 * @reg:     register placed in the Rm field
 * @shift:   shift amount; must be below 32 (32-bit) or below 64 (64-bit)
 * @variant: 32-bit or 64-bit operation
 * @type:    which of the four add/sub flavours to emit
 *
 * Invalid arguments are logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_add_sub_shifted_reg(enum aarch64_insn_register dst,
                                         enum aarch64_insn_register src,
                                         enum aarch64_insn_register reg,
                                         int shift,
                                         enum aarch64_insn_variant variant,
                                         enum aarch64_insn_adsb_type type)
{
        u32 insn;
        int shift_mask;

        switch (type) {
        case AARCH64_INSN_ADSB_ADD:
                insn = aarch64_insn_get_add_value();
                break;
        case AARCH64_INSN_ADSB_SUB:
                insn = aarch64_insn_get_sub_value();
                break;
        case AARCH64_INSN_ADSB_ADD_SETFLAGS:
                insn = aarch64_insn_get_adds_value();
                break;
        case AARCH64_INSN_ADSB_SUB_SETFLAGS:
                insn = aarch64_insn_get_subs_value();
                break;
        default:
                pr_err("%s: unknown add/sub encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* Pick the widest shift the selected register width allows */
        switch (variant) {
        case AARCH64_INSN_VARIANT_32BIT:
                shift_mask = SZ_32 - 1;
                break;
        case AARCH64_INSN_VARIANT_64BIT:
                insn |= AARCH64_INSN_SF_BIT;
                shift_mask = SZ_64 - 1;
                break;
        default:
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        if (shift & ~shift_mask) {
                pr_err("%s: invalid shift encoding %d\n", __func__,
                       shift);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);

        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
}

/*
 * Generate a one-source data-processing instruction (REV16/REV32/REV64).
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Rn field
 * @variant: 32-bit or 64-bit operation; REVERSE_64 requires 64-bit
 * @type:    which byte-reverse flavour to emit
 *
 * Invalid arguments are logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
                           enum aarch64_insn_register src,
                           enum aarch64_insn_variant variant,
                           enum aarch64_insn_data1_type type)
{
        u32 insn;

        switch (type) {
        case AARCH64_INSN_DATA1_REVERSE_16:
                insn = aarch64_insn_get_rev16_value();
                break;
        case AARCH64_INSN_DATA1_REVERSE_32:
                insn = aarch64_insn_get_rev32_value();
                break;
        case AARCH64_INSN_DATA1_REVERSE_64:
                /* This flavour is only accepted with the 64-bit variant */
                if (variant != AARCH64_INSN_VARIANT_64BIT) {
                        pr_err("%s: invalid variant for reverse64 %d\n",
                               __func__, variant);
                        return AARCH64_BREAK_FAULT;
                }
                insn = aarch64_insn_get_rev64_value();
                break;
        default:
                pr_err("%s: unknown data1 encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (variant == AARCH64_INSN_VARIANT_64BIT) {
                insn |= AARCH64_INSN_SF_BIT;
        } else if (variant != AARCH64_INSN_VARIANT_32BIT) {
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);

        return insn;
}

/*
 * Generate a two-source data-processing instruction
 * (UDIV/SDIV/LSLV/LSRV/ASRV/RORV).
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Rn field
 * @reg:     register placed in the Rm field
 * @variant: 32-bit or 64-bit operation
 * @type:    which operation to emit
 *
 * Invalid arguments are logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_data2(enum aarch64_insn_register dst,
                           enum aarch64_insn_register src,
                           enum aarch64_insn_register reg,
                           enum aarch64_insn_variant variant,
                           enum aarch64_insn_data2_type type)
{
        u32 insn;

        switch (type) {
        case AARCH64_INSN_DATA2_UDIV:
                insn = aarch64_insn_get_udiv_value();
                break;
        case AARCH64_INSN_DATA2_SDIV:
                insn = aarch64_insn_get_sdiv_value();
                break;
        case AARCH64_INSN_DATA2_LSLV:
                insn = aarch64_insn_get_lslv_value();
                break;
        case AARCH64_INSN_DATA2_LSRV:
                insn = aarch64_insn_get_lsrv_value();
                break;
        case AARCH64_INSN_DATA2_ASRV:
                insn = aarch64_insn_get_asrv_value();
                break;
        case AARCH64_INSN_DATA2_RORV:
                insn = aarch64_insn_get_rorv_value();
                break;
        default:
                pr_err("%s: unknown data2 encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (variant == AARCH64_INSN_VARIANT_64BIT) {
                insn |= AARCH64_INSN_SF_BIT;
        } else if (variant != AARCH64_INSN_VARIANT_32BIT) {
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);

        return insn;
}

/*
 * Generate a three-source data-processing instruction (MADD/MSUB).
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Ra field
 * @reg1:    register placed in the Rn field
 * @reg2:    register placed in the Rm field
 * @variant: 32-bit or 64-bit operation
 * @type:    which operation to emit
 *
 * Invalid arguments are logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_data3(enum aarch64_insn_register dst,
                           enum aarch64_insn_register src,
                           enum aarch64_insn_register reg1,
                           enum aarch64_insn_register reg2,
                           enum aarch64_insn_variant variant,
                           enum aarch64_insn_data3_type type)
{
        u32 insn;

        switch (type) {
        case AARCH64_INSN_DATA3_MADD:
                insn = aarch64_insn_get_madd_value();
                break;
        case AARCH64_INSN_DATA3_MSUB:
                insn = aarch64_insn_get_msub_value();
                break;
        default:
                pr_err("%s: unknown data3 encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (variant == AARCH64_INSN_VARIANT_64BIT) {
                insn |= AARCH64_INSN_SF_BIT;
        } else if (variant != AARCH64_INSN_VARIANT_32BIT) {
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RA, insn, src);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
                                            reg1);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn,
                                            reg2);

        return insn;
}

/*
 * Generate a logical (shifted register) instruction:
 * AND/BIC/ORR/ORN/EOR/EON/ANDS/BICS.
 *
 * @dst:     register placed in the Rd field
 * @src:     register placed in the Rn field
 * @reg:     register placed in the Rm field
 * @shift:   shift amount; must be below 32 (32-bit) or below 64 (64-bit)
 * @variant: 32-bit or 64-bit operation
 * @type:    which logical operation to emit
 *
 * Invalid arguments are logged and AARCH64_BREAK_FAULT is returned.
 */
u32 aarch64_insn_gen_logical_shifted_reg(enum aarch64_insn_register dst,
                                         enum aarch64_insn_register src,
                                         enum aarch64_insn_register reg,
                                         int shift,
                                         enum aarch64_insn_variant variant,
                                         enum aarch64_insn_logic_type type)
{
        u32 insn;
        int shift_mask;

        switch (type) {
        case AARCH64_INSN_LOGIC_AND:
                insn = aarch64_insn_get_and_value();
                break;
        case AARCH64_INSN_LOGIC_BIC:
                insn = aarch64_insn_get_bic_value();
                break;
        case AARCH64_INSN_LOGIC_ORR:
                insn = aarch64_insn_get_orr_value();
                break;
        case AARCH64_INSN_LOGIC_ORN:
                insn = aarch64_insn_get_orn_value();
                break;
        case AARCH64_INSN_LOGIC_EOR:
                insn = aarch64_insn_get_eor_value();
                break;
        case AARCH64_INSN_LOGIC_EON:
                insn = aarch64_insn_get_eon_value();
                break;
        case AARCH64_INSN_LOGIC_AND_SETFLAGS:
                insn = aarch64_insn_get_ands_value();
                break;
        case AARCH64_INSN_LOGIC_BIC_SETFLAGS:
                insn = aarch64_insn_get_bics_value();
                break;
        default:
                pr_err("%s: unknown logical encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* Pick the widest shift the selected register width allows */
        switch (variant) {
        case AARCH64_INSN_VARIANT_32BIT:
                shift_mask = SZ_32 - 1;
                break;
        case AARCH64_INSN_VARIANT_64BIT:
                insn |= AARCH64_INSN_SF_BIT;
                shift_mask = SZ_64 - 1;
                break;
        default:
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        if (shift & ~shift_mask) {
                pr_err("%s: invalid shift encoding %d\n", __func__,
                       shift);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, dst);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, src);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, reg);

        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_6, insn, shift);
}

/*
 * MOV (register) is architecturally an alias of ORR (shifted register) where
 * MOV <*d>, <*m> is equivalent to ORR <*d>, <*ZR>, <*m>
 *
 * Generate that ORR with a zero shift: @dst is the destination, the zero
 * register is the first operand and @src is the second. @variant selects
 * the 32-bit or 64-bit form. The return value is whatever
 * aarch64_insn_gen_logical_shifted_reg() produces, including
 * AARCH64_BREAK_FAULT for an unknown @variant.
 */
u32 aarch64_insn_gen_move_reg(enum aarch64_insn_register dst,
                              enum aarch64_insn_register src,
                              enum aarch64_insn_variant variant)
{
        return aarch64_insn_gen_logical_shifted_reg(dst, AARCH64_INSN_REG_ZR,
                                                    src, 0, variant,
                                                    AARCH64_INSN_LOGIC_ORR);
}

u32 aarch64_insn_gen_adr(unsigned long pc, unsigned long addr,
                         enum aarch64_insn_register reg,
                         enum aarch64_insn_adr_type type)
{
        u32 insn;
        s32 offset;

        switch (type) {
        case AARCH64_INSN_ADR_TYPE_ADR:
                insn = aarch64_insn_get_adr_value();
                offset = addr - pc;
                break;
        case AARCH64_INSN_ADR_TYPE_ADRP:
                insn = aarch64_insn_get_adrp_value();
                offset = (addr - ALIGN_DOWN(pc, SZ_4K)) >> 12;
                break;
        default:
                pr_err("%s: unknown adr encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        if (offset < -SZ_1M || offset >= SZ_1M)
                return AARCH64_BREAK_FAULT;

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, reg);

        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn, offset);
}

/*
 * Decode the imm field of a branch, and return the byte offset as a
 * signed value (so it can be used when computing a new branch
 * target).
 *
 * B/BL carry a 26-bit field, CBZ/CBNZ/B.cond a 19-bit field and TBZ/TBNZ
 * a 14-bit field. Any other instruction is a BUG().
 */
s32 aarch64_get_branch_offset(u32 insn)
{
        s32 imm;
        unsigned int width;

        if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) {
                imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_26, insn);
                width = 26;
        } else if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
                   aarch64_insn_is_bcond(insn)) {
                imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_19, insn);
                width = 19;
        } else if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) {
                imm = aarch64_insn_decode_immediate(AARCH64_INSN_IMM_14, insn);
                width = 14;
        } else {
                /* Unhandled instruction */
                BUG();
        }

        /*
         * Move the field's top bit up to bit 31, then shift back down
         * leaving two zero bits at the bottom: this sign-extends the
         * field and scales it by 4 in one go.
         */
        return (imm << (32 - width)) >> (30 - width);
}

/*
 * Encode the displacement of a branch in the imm field and return the
 * updated instruction.
 *
 * @offset is a byte displacement; it is divided by 4 before being written
 * into the 26-, 19- or 14-bit field that matches the branch type. Any
 * instruction that is not one of the handled branches is a BUG().
 */
u32 aarch64_set_branch_offset(u32 insn, s32 offset)
{
        s32 words = offset >> 2;

        if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn))
                return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_26, insn,
                                                     words);

        if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn) ||
            aarch64_insn_is_bcond(insn))
                return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_19, insn,
                                                     words);

        if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn))
                return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_14, insn,
                                                     words);

        /* Unhandled instruction */
        BUG();
}

/*
 * Return the offset encoded in an ADRP instruction, scaled to bytes
 * (the decoded immediate shifted left by 12).
 *
 * BUG()s if @insn is not an ADRP.
 *
 * NOTE(review): the shifted value is narrowed to s32 on return; confirm
 * callers never need offsets that do not fit in 32 bits.
 */
s32 aarch64_insn_adrp_get_offset(u32 insn)
{
        BUG_ON(!aarch64_insn_is_adrp(insn));
        return aarch64_insn_decode_immediate(AARCH64_INSN_IMM_ADR, insn) << 12;
}

/*
 * Return @insn with its ADRP immediate replaced by @offset.
 *
 * @offset is a byte offset; it is shifted right by 12 before being
 * encoded, so its low 12 bits are discarded. BUG()s if @insn is not an
 * ADRP.
 */
u32 aarch64_insn_adrp_set_offset(u32 insn, s32 offset)
{
        BUG_ON(!aarch64_insn_is_adrp(insn));
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_ADR, insn,
                                                offset >> 12);
}

/*
 * Extract the Op/CR data from a msr/mrs instruction.
 *
 * The field occupies bits [20:5] of @insn and is returned right-aligned
 * as a 16-bit value.
 */
u32 aarch64_insn_extract_system_reg(u32 insn)
{
        const u32 field_shift = 5;
        const u32 field_mask = 0xFFFF;

        return (insn >> field_shift) & field_mask;
}

/*
 * Report whether @insn is at or above 0xe800, the threshold this helper
 * uses to classify an instruction as wide.
 */
bool aarch32_insn_is_wide(u32 insn)
{
        const u32 wide_threshold = 0xe800;

        if (insn < wide_threshold)
                return false;

        return true;
}

/*
 * Extract a 4-bit register number from an instruction.
 *
 * @insn:   instruction word
 * @offset: bit position of the least significant bit of the field
 *
 * Returns bits [@offset + 3:@offset] of @insn. The instruction is shifted
 * down before masking so that no signed constant is shifted into the sign
 * bit when @offset is 28.
 */
u32 aarch32_insn_extract_reg_num(u32 insn, int offset)
{
        return (insn >> offset) & 0xf;
}

#define OPC2_MASK        0x7
#define OPC2_OFFSET        5
/*
 * Return the 3-bit opc2 field, which sits OPC2_OFFSET bits up from the
 * bottom of @insn.
 */
u32 aarch32_insn_mcr_extract_opc2(u32 insn)
{
        u32 shifted = insn >> OPC2_OFFSET;

        return shifted & OPC2_MASK;
}

#define CRM_MASK        0xf
/* Return the CRm field, i.e. the low four bits of @insn. */
u32 aarch32_insn_mcr_extract_crm(u32 insn)
{
        return insn & CRM_MASK;
}

/*
 * Return true if the set bits of @val form one contiguous run.
 *
 * Doesn't handle full ones or full zeroes.
 */
static bool range_of_ones(u64 val)
{
        /* Drop the trailing zeroes so the run, if any, starts at bit 0 */
        u64 run = val >> __ffs64(val);

        /*
         * One of Sean Eron Anderson's bithack tricks: adding one to a
         * right-aligned run of ones carries out of the run, leaving no
         * bit in common with the original value.
         */
        return !(run & (run + 1));
}

/*
 * Encode @imm as the N/immr/imms fields of a logical-immediate instruction.
 *
 * @imm:     bitmask immediate to encode
 * @variant: 32-bit or 64-bit operation; the 64-bit variant also sets SF
 * @insn:    instruction to add the immediate fields to
 *
 * Returns the updated instruction, or AARCH64_BREAK_FAULT when @variant is
 * unknown (also logged), when @imm is zero, all ones or wider than the
 * variant, or when @imm is not a rotated contiguous run of ones repeated
 * with a power-of-two stride.
 */
static u32 aarch64_encode_immediate(u64 imm,
                                    enum aarch64_insn_variant variant,
                                    u32 insn)
{
        unsigned int immr, imms, n, ones, ror, esz, tmp;
        u64 mask;

        switch (variant) {
        case AARCH64_INSN_VARIANT_32BIT:
                esz = 32;
                break;
        case AARCH64_INSN_VARIANT_64BIT:
                insn |= AARCH64_INSN_SF_BIT;
                esz = 64;
                break;
        default:
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        mask = GENMASK(esz - 1, 0);

        /* Can't encode full zeroes, full ones, or value wider than the mask */
        if (!imm || imm == mask || imm & ~mask)
                return AARCH64_BREAK_FAULT;

        /*
         * Inverse of Replicate(). Try to spot a repeating pattern
         * with a pow2 stride.
         */
        for (tmp = esz / 2; tmp >= 2; tmp /= 2) {
                u64 emask = BIT(tmp) - 1;

                /* Stop at the first width where the two halves differ */
                if ((imm & emask) != ((imm >> tmp) & emask))
                        break;

                esz = tmp;
                mask = emask;
        }

        /* N is only set if we're encoding a 64bit value */
        n = esz == 64;

        /* Trim imm to the element size */
        imm &= mask;

        /* That's how many ones we need to encode */
        ones = hweight64(imm);

        /*
         * imms is set to (ones - 1), prefixed with a string of ones
         * and a zero if they fit. Cap it to 6 bits.
         */
        imms  = ones - 1;
        imms |= 0xf << ffs(esz);
        imms &= BIT(6) - 1;

        /* Compute the rotation */
        if (range_of_ones(imm)) {
                /*
                 * Pattern: 0..01..10..0
                 *
                 * Compute how many rotate we need to align it right
                 */
                ror = __ffs64(imm);
        } else {
                /*
                 * Pattern: 0..01..10..01..1
                 *
                 * Fill the unused top bits with ones, and check if
                 * the result is a valid immediate (all ones with a
                 * contiguous ranges of zeroes).
                 */
                imm |= ~mask;
                if (!range_of_ones(~imm))
                        return AARCH64_BREAK_FAULT;

                /*
                 * Compute the rotation to get a continuous set of
                 * ones, with the first bit set at position 0
                 */
                ror = fls64(~imm);
        }

        /*
         * immr is the number of bits we need to rotate back to the
         * original set of ones. Note that this is relative to the
         * element size...
         */
        immr = (esz - ror) % esz;

        insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, n);
        insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_R, insn, immr);
        return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, imms);
}

/*
 * Generate a logical (immediate) instruction: AND/ORR/EOR/ANDS.
 *
 * @type:    which logical operation to emit; other logic types are rejected
 * @variant: 32-bit or 64-bit operation
 * @Rn:      register placed in the Rn field
 * @Rd:      register placed in the Rd field
 * @imm:     bitmask immediate, encoded by aarch64_encode_immediate()
 *
 * An unsupported @type is logged and AARCH64_BREAK_FAULT is returned.
 * Otherwise the result is whatever aarch64_encode_immediate() returns,
 * which is AARCH64_BREAK_FAULT when @imm or @variant cannot be encoded.
 */
u32 aarch64_insn_gen_logical_immediate(enum aarch64_insn_logic_type type,
                                       enum aarch64_insn_variant variant,
                                       enum aarch64_insn_register Rn,
                                       enum aarch64_insn_register Rd,
                                       u64 imm)
{
        u32 insn;

        switch (type) {
        case AARCH64_INSN_LOGIC_AND:
                insn = aarch64_insn_get_and_imm_value();
                break;
        case AARCH64_INSN_LOGIC_ORR:
                insn = aarch64_insn_get_orr_imm_value();
                break;
        case AARCH64_INSN_LOGIC_EOR:
                insn = aarch64_insn_get_eor_imm_value();
                break;
        case AARCH64_INSN_LOGIC_AND_SETFLAGS:
                insn = aarch64_insn_get_ands_imm_value();
                break;
        default:
                pr_err("%s: unknown logical encoding %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
        /* Variant validation and the SF bit are handled by the encoder */
        return aarch64_encode_immediate(imm, variant, insn);
}

/*
 * Generate an EXTR instruction.
 *
 * @variant: 32-bit or 64-bit operation
 * @Rm:      register placed in the Rm field
 * @Rn:      register placed in the Rn field
 * @Rd:      register placed in the Rd field
 * @lsb:     value for the imms field; at most 31 (32-bit) or 63 (64-bit)
 *
 * Returns AARCH64_BREAK_FAULT if @lsb is out of range or @variant is
 * unknown (only the latter is logged).
 */
u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant,
                          enum aarch64_insn_register Rm,
                          enum aarch64_insn_register Rn,
                          enum aarch64_insn_register Rd,
                          u8 lsb)
{
        u32 insn = aarch64_insn_get_extr_value();
        u8 max_lsb;

        switch (variant) {
        case AARCH64_INSN_VARIANT_32BIT:
                max_lsb = 31;
                break;
        case AARCH64_INSN_VARIANT_64BIT:
                max_lsb = 63;
                break;
        default:
                pr_err("%s: unknown variant encoding %d\n", __func__, variant);
                return AARCH64_BREAK_FAULT;
        }

        if (lsb > max_lsb)
                return AARCH64_BREAK_FAULT;

        /* The 64-bit form sets SF and N together */
        if (variant == AARCH64_INSN_VARIANT_64BIT) {
                insn |= AARCH64_INSN_SF_BIT;
                insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_N, insn, 1);
        }

        insn = aarch64_insn_encode_immediate(AARCH64_INSN_IMM_S, insn, lsb);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RD, insn, Rd);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, Rn);
        insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm);

        return insn;
}

/*
 * Generate a DMB instruction for the requested barrier @type.
 *
 * The barrier option is a 4-bit value written into bits [11:8] of the
 * base DMB encoding. An unknown @type is logged and AARCH64_BREAK_FAULT
 * is returned.
 */
u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type)
{
        u32 option;

        switch (type) {
        case AARCH64_INSN_MB_SY:
                option = 0xf;
                break;
        case AARCH64_INSN_MB_ST:
                option = 0xe;
                break;
        case AARCH64_INSN_MB_LD:
                option = 0xd;
                break;
        case AARCH64_INSN_MB_ISH:
                option = 0xb;
                break;
        case AARCH64_INSN_MB_ISHST:
                option = 0xa;
                break;
        case AARCH64_INSN_MB_ISHLD:
                option = 0x9;
                break;
        case AARCH64_INSN_MB_NSH:
                option = 0x7;
                break;
        case AARCH64_INSN_MB_NSHST:
                option = 0x6;
                break;
        case AARCH64_INSN_MB_NSHLD:
                option = 0x5;
                break;
        default:
                pr_err("%s: unknown dmb type %d\n", __func__, type);
                return AARCH64_BREAK_FAULT;
        }

        /* Clear the option field of the base opcode, then drop ours in */
        return (aarch64_insn_get_dmb_value() & ~GENMASK(11, 8)) |
               (option << 8);
}

/*
 * Generate an MRS instruction that reads @sysreg into @result.
 *
 * Bits [19:0] of the base encoding are cleared, @sysreg is inserted
 * starting at bit 5 and @result is placed in the Rt field.
 */
u32 aarch64_insn_gen_mrs(enum aarch64_insn_register result,
                         enum aarch64_insn_system_register sysreg)
{
        u32 opcode = aarch64_insn_get_mrs_value() & ~GENMASK(19, 0);

        opcode |= sysreg << 5;

        return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT,
                                            opcode, result);
}


















    9 
    9 
    9 
    9 
    9 







    9 


































    8 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/copypage.c
 *
 * Copyright (C) 2002 Deep Blue Solutions Ltd, All Rights Reserved.
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/bitops.h>
#include <linux/mm.h>

#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/mte.h>

/*
 * Copy the contents of page @from into page @to and, when MTE is
 * supported, carry the MTE tags across as well.
 *
 * For a hugetlb source folio the tags of every subpage are copied in one
 * go, but only on the call whose @from is the folio's first page and only
 * if the source folio is marked MTE-tagged. For any other page the tags
 * are copied when @from is marked MTE-tagged.
 */
void copy_highpage(struct page *to, struct page *from)
{
        void *kto = page_address(to);
        void *kfrom = page_address(from);
        struct folio *src = page_folio(from);
        struct folio *dst = page_folio(to);
        unsigned int i, nr_pages;

        /* The data copy happens unconditionally, before any tag handling */
        copy_page(kto, kfrom);

        if (kasan_hw_tags_enabled())
                page_kasan_tag_reset(to);

        /* Nothing more to do without MTE */
        if (!system_supports_mte())
                return;

        if (folio_test_hugetlb(src)) {
                /*
                 * Untagged source folio, or a call for a subpage other
                 * than the first one: no tags are copied on this call.
                 */
                if (!folio_test_hugetlb_mte_tagged(src) ||
                    from != folio_page(src, 0))
                        return;

                /* Warn once if the tagging attempt on the destination fails */
                WARN_ON_ONCE(!folio_try_hugetlb_mte_tagging(dst));

                /*
                 * Populate tags for all subpages.
                 *
                 * Don't assume the first page is head page since
                 * huge page copy may start from any subpage.
                 */
                nr_pages = folio_nr_pages(src);
                for (i = 0; i < nr_pages; i++) {
                        kfrom = page_address(folio_page(src, i));
                        kto = page_address(folio_page(dst, i));
                        mte_copy_page_tags(kto, kfrom);
                }
                /* Mark the destination tagged only after all tags are in place */
                folio_set_hugetlb_mte_tagged(dst);
        } else if (page_mte_tagged(from)) {
                /* It's a new page, shouldn't have been tagged yet */
                WARN_ON_ONCE(!try_page_mte_tagging(to));

                mte_copy_page_tags(kto, kfrom);
                set_page_mte_tagged(to);
        }
}
EXPORT_SYMBOL(copy_highpage);

/*
 * Copy page @from into page @to with copy_highpage(), then call
 * flush_dcache_page() on the destination.
 *
 * @vaddr and @vma are not used by this implementation.
 */
void copy_user_highpage(struct page *to, struct page *from,
                        unsigned long vaddr, struct vm_area_struct *vma)
{
        copy_highpage(to, from);
        flush_dcache_page(to);
}
EXPORT_SYMBOL_GPL(copy_user_highpage);



































































  188 


  188 
  189 
  189 















































































  671 


  672 






  670 





















































  672 







  672 



















  672 









































































  535 





   49 
  498 
  534 
    2 

  530 
    7 




  533 
    2 
















  536 

















  535 








  536 



























































  170 










  171 




  169 



  168 
    1 


  132 
   41 
  167 

    6 

  171 

  171 

  170 

















  126 























  144 



   24 





  126 













 1396 
 1407 
  124 

























   50 
   50 










   25 
   25 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/file_table.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
 */

#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/eventpoll.h>
#include <linux/rcupdate.h>
#include <linux/mount.h>
#include <linux/capability.h>
#include <linux/cdev.h>
#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/task_work.h>
#include <linux/swap.h>
#include <linux/kmemleak.h>

#include <linux/atomic.h>

#include "internal.h"

/*
 * sysctl tunables. max_files starts at NR_FILE and is recomputed by
 * files_maxfiles_init(); nr_files is refreshed by proc_nr_files() each
 * time "file-nr" is accessed.
 */
static struct files_stat_struct files_stat = {
        .max_files = NR_FILE
};

/*
 * SLAB caches for file structures, both created in files_init():
 * filp_cachep holds plain struct file objects, bfilp_cachep holds
 * struct backing_file containers.
 */
static struct kmem_cache *filp_cachep __ro_after_init;
static struct kmem_cache *bfilp_cachep __ro_after_init;

/*
 * Count of accounted files: incremented in alloc_empty_file() and
 * decremented in file_free() for files without FMODE_NOACCOUNT.
 */
static struct percpu_counter nr_files __cacheline_aligned_in_smp;

/*
 * Container for backing file with optional user path.
 *
 * Allocated from bfilp_cachep by alloc_empty_backing_file(). Callers are
 * handed the embedded 'file'; backing_file() maps it back to the container.
 */
struct backing_file {
        struct file file;
        union {
                /* returned by backing_file_user_path(); put in file_free() */
                struct path user_path;
                /* slab free-pointer slot, see freeptr_offset in files_init() */
                freeptr_t bf_freeptr;
        };
};

/* Map an embedded struct file back to its enclosing struct backing_file. */
static inline struct backing_file *backing_file(struct file *f)
{
        struct backing_file *bf = container_of(f, struct backing_file, file);

        return bf;
}

struct path *backing_file_user_path(struct file *f)
{
        return &backing_file(f)->user_path;
}
EXPORT_SYMBOL_GPL(backing_file_user_path);

/*
 * Release everything init_file() and the allocators attached to @f and
 * hand the memory back to the cache it came from.
 */
static inline void file_free(struct file *f)
{
        security_file_free(f);

        /* Only files that were counted on allocation are uncounted here. */
        if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
                percpu_counter_dec(&nr_files);

        put_cred(f->f_cred);

        /* Common case: a plain struct file from filp_cachep. */
        if (likely(!(f->f_mode & FMODE_BACKING))) {
                kmem_cache_free(filp_cachep, f);
                return;
        }

        /* Backing file: drop the user path, then free the whole container. */
        path_put(backing_file_user_path(f));
        kmem_cache_free(bfilp_cachep, backing_file(f));
}

/*
 * Return the number of accounted open files in the system, as given by
 * the (approximate) per-cpu counter read.
 */
static long get_nr_files(void)
{
        long nr = percpu_counter_read_positive(&nr_files);

        return nr;
}

/*
 * Return the current system-wide limit on open files (files_stat.max_files).
 */
unsigned long get_max_files(void)
{
        unsigned long limit = files_stat.max_files;

        return limit;
}
EXPORT_SYMBOL_GPL(get_max_files);

#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
 * "file-nr" sysctl handler: snapshot the current file count into
 * files_stat, then let the generic unsigned long vector handler do the I/O.
 */
static int proc_nr_files(const struct ctl_table *table, int write, void *buffer,
                         size_t *lenp, loff_t *ppos)
{
        int ret;

        files_stat.nr_files = get_nr_files();
        ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        return ret;
}

/* sysctl entries registered under "fs" by init_fs_stat_sysctls(). */
static const struct ctl_table fs_stat_sysctls[] = {
        /* Read-only (0444) view of the whole files_stat structure. */
        {
                .procname        = "file-nr",
                .data                = &files_stat,
                .maxlen                = sizeof(files_stat),
                .mode                = 0444,
                .proc_handler        = proc_nr_files,
        },
        /* Writable files_stat.max_files, bounded by extra1/extra2. */
        {
                .procname        = "file-max",
                .data                = &files_stat.max_files,
                .maxlen                = sizeof(files_stat.max_files),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
                .extra1                = SYSCTL_LONG_ZERO,
                .extra2                = SYSCTL_LONG_MAX,
        },
        /* Writable sysctl_nr_open, bounded by sysctl_nr_open_min/_max. */
        {
                .procname        = "nr_open",
                .data                = &sysctl_nr_open,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = proc_douintvec_minmax,
                .extra1                = &sysctl_nr_open_min,
                .extra2                = &sysctl_nr_open_max,
        },
};

/*
 * Register the "fs" sysctl table and, when CONFIG_BINFMT_MISC is enabled,
 * the "fs/binfmt_misc" sysctl mount point. Always returns 0.
 */
static int __init init_fs_stat_sysctls(void)
{
        struct ctl_table_header *hdr;

        register_sysctl_init("fs", fs_stat_sysctls);

        if (!IS_ENABLED(CONFIG_BINFMT_MISC))
                return 0;

        hdr = register_sysctl_mount_point("fs/binfmt_misc");
        /* The header is never released here; tell kmemleak so. */
        kmemleak_not_leak(hdr);

        return 0;
}
fs_initcall(init_fs_stat_sysctls);
#endif

/*
 * Initialize a freshly allocated struct file.
 *
 * @f:     the file to initialize (plain or embedded in a backing_file)
 * @flags: O_... open flags; stored in f_flags and used to derive f_mode
 * @cred:  credentials to take a reference on and store in f_cred
 *
 * Returns 0 on success. If security_file_alloc() fails, the credential
 * reference is dropped again and its error code is returned; freeing @f
 * is left to the caller.
 */
static int init_file(struct file *f, int flags, const struct cred *cred)
{
        int error;

        f->f_cred = get_cred(cred);
        error = security_file_alloc(f);
        if (unlikely(error)) {
                put_cred(f->f_cred);
                return error;
        }

        spin_lock_init(&f->f_lock);
        /*
         * Note that f_pos_lock is only used for files raising
         * FMODE_ATOMIC_POS and directories. Other files such as pipes
         * don't need it and since f_pos_lock is in a union may reuse
         * the space for other purposes. They are expected to initialize
         * the respective member when opening the file.
         */
        mutex_init(&f->f_pos_lock);
        memset(&f->f_path, 0, sizeof(f->f_path));
        memset(&f->f_ra, 0, sizeof(f->f_ra));

        f->f_flags        = flags;
        f->f_mode        = OPEN_FMODE(flags);

        /* Everything tied to a concrete path/inode is filled in later. */
        f->f_op                = NULL;
        f->f_mapping        = NULL;
        f->private_data = NULL;
        f->f_inode        = NULL;
        f->f_owner        = NULL;
#ifdef CONFIG_EPOLL
        f->f_ep                = NULL;
#endif

        f->f_iocb_flags = 0;
        f->f_pos        = 0;
        f->f_wb_err        = 0;
        f->f_sb_err        = 0;

        /*
         * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
         * fget-rcu pattern users need to be able to handle spurious
         * refcount bumps we should reinitialize the reused file first.
         */
        file_ref_init(&f->f_ref, 1);
        /*
         * Disable permission and pre-content events for all files by default.
         * They may be enabled later by file_set_fsnotify_mode_from_watchers().
         */
        file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM);
        return 0;
}

/* Find an unused file structure and return a pointer to it.
 * Returns an error pointer if some error happend e.g. we over file
 * structures limit, run out of memory or operation is not permitted.
 *
 * Be very careful using this.  You are responsible for
 * getting write access to any mount that you might assign
 * to this filp, if it is opened for write.  If this is not
 * done, you will imbalance int the mount's writer count
 * and a warning at __fput() time.
 */
struct file *alloc_empty_file(int flags, const struct cred *cred)
{
        static long old_max;
        struct file *f;
        int error;

        /*
         * Privileged users can go above max_files
         */
        if (unlikely(get_nr_files() >= files_stat.max_files) &&
            !capable(CAP_SYS_ADMIN)) {
                /*
                 * percpu_counters are inaccurate.  Do an expensive check before
                 * we go and fail.
                 */
                if (percpu_counter_sum_positive(&nr_files) >= files_stat.max_files)
                        goto over;
        }

        f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        percpu_counter_inc(&nr_files);

        return f;

over:
        /* Ran out of filps - report that */
        if (get_nr_files() > old_max) {
                pr_info("VFS: file-max limit %lu reached\n", get_max_files());
                old_max = get_nr_files();
        }
        return ERR_PTR(-ENFILE);
}

/*
 * Variant of alloc_empty_file() that doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
{
        struct file *f;
        int error;

        f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
        if (unlikely(!f))
                return ERR_PTR(-ENOMEM);

        error = init_file(f, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(filp_cachep, f);
                return ERR_PTR(error);
        }

        f->f_mode |= FMODE_NOACCOUNT;

        return f;
}

/*
 * Variant of alloc_empty_file() that allocates a backing_file container
 * and doesn't check and modify nr_files.
 *
 * This is only for kernel internal use, and the allocate file must not be
 * installed into file tables or such.
 */
struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
{
        struct backing_file *ff;
        int error;

        ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
        if (unlikely(!ff))
                return ERR_PTR(-ENOMEM);

        error = init_file(&ff->file, flags, cred);
        if (unlikely(error)) {
                kmem_cache_free(bfilp_cachep, ff);
                return ERR_PTR(error);
        }

        ff->file.f_mode |= FMODE_BACKING | FMODE_NOACCOUNT;
        return &ff->file;
}

/**
 * file_init_path - initialize a 'struct file' based on path
 *
 * @file: the file to set up
 * @path: the (dentry, vfsmount) pair for the new file
 * @fop: the 'struct file_operations' for the new file
 */
static void file_init_path(struct file *file, const struct path *path,
                           const struct file_operations *fop)
{
        file->f_path = *path;
        file->f_inode = path->dentry->d_inode;
        file->f_mapping = path->dentry->d_inode->i_mapping;
        file->f_wb_err = filemap_sample_wb_err(file->f_mapping);
        file->f_sb_err = file_sample_sb_err(file);
        if (fop->llseek)
                file->f_mode |= FMODE_LSEEK;
        if ((file->f_mode & FMODE_READ) &&
             likely(fop->read || fop->read_iter))
                file->f_mode |= FMODE_CAN_READ;
        if ((file->f_mode & FMODE_WRITE) &&
             likely(fop->write || fop->write_iter))
                file->f_mode |= FMODE_CAN_WRITE;
        file->f_iocb_flags = iocb_flags(file);
        file->f_mode |= FMODE_OPENED;
        file->f_op = fop;
        if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
                i_readcount_inc(path->dentry->d_inode);
}

/**
 * alloc_file - allocate a 'struct file' and initialize it from a path
 *
 * @path: the (dentry, vfsmount) pair for the new file
 * @flags: O_... flags with which the new file will be opened
 * @fop: the 'struct file_operations' for the new file
 *
 * Uses the current task's credentials. Returns the new file, or the
 * error pointer produced by alloc_empty_file().
 */
static struct file *alloc_file(const struct path *path, int flags,
                const struct file_operations *fop)
{
        struct file *file = alloc_empty_file(flags, current_cred());

        if (IS_ERR(file))
                return file;

        file_init_path(file, path, fop);
        return file;
}

/*
 * Fill @path with a new pseudo dentry named @name on @mnt's superblock,
 * instantiated with @inode, plus a reference on @mnt.
 * Returns 0 on success or -ENOMEM if the dentry cannot be allocated
 * (in which case path->dentry is left NULL).
 */
static inline int alloc_path_pseudo(const char *name, struct inode *inode,
                                    struct vfsmount *mnt, struct path *path)
{
        struct dentry *dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name));

        path->dentry = dentry;
        if (!dentry)
                return -ENOMEM;

        path->mnt = mntget(mnt);
        d_instantiate(dentry, inode);
        return 0;
}

/*
 * Allocate an accounted file for @inode backed by a pseudo dentry on @mnt.
 * Returns the file or an error pointer.
 */
struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
                               const char *name, int flags,
                               const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_file(&path, flags, fops);
        if (!IS_ERR(file)) {
                /*
                 * Disable all fsnotify events for pseudo files by default.
                 * They may be enabled by caller with file_set_fsnotify_mode().
                 */
                file_set_fsnotify_mode(file, FMODE_NONOTIFY);
                return file;
        }

        /* Failure: take an extra inode reference, then drop the path. */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL(alloc_file_pseudo);

/*
 * Same as alloc_file_pseudo(), but the file comes from
 * alloc_empty_file_noaccount() and so is not counted in nr_files.
 */
struct file *alloc_file_pseudo_noaccount(struct inode *inode,
                                         struct vfsmount *mnt, const char *name,
                                         int flags,
                                         const struct file_operations *fops)
{
        struct path path;
        struct file *file;
        int ret = alloc_path_pseudo(name, inode, mnt, &path);

        if (ret)
                return ERR_PTR(ret);

        file = alloc_empty_file_noaccount(flags, current_cred());
        if (!IS_ERR(file)) {
                file_init_path(file, &path, fops);
                /*
                 * Disable all fsnotify events for pseudo files by default.
                 * They may be enabled by caller with file_set_fsnotify_mode().
                 */
                file_set_fsnotify_mode(file, FMODE_NONOTIFY);
                return file;
        }

        /* Failure: take an extra inode reference, then drop the path. */
        ihold(inode);
        path_put(&path);
        return file;
}
EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);

/*
 * Allocate a new file on the same path as @base, with its own @flags and
 * @fops. On success the new file holds its own path reference and shares
 * @base's f_mapping. Returns the file or an error pointer.
 */
struct file *alloc_file_clone(struct file *base, int flags,
                                const struct file_operations *fops)
{
        struct file *clone = alloc_file(&base->f_path, flags, fops);

        if (IS_ERR(clone))
                return clone;

        path_get(&clone->f_path);
        clone->f_mapping = base->f_mapping;
        return clone;
}

/*
 * The real guts of fput() - releasing the last reference to file.
 *
 * A file that never reached FMODE_OPENED skips the whole teardown and is
 * only freed. Otherwise the release steps below run in order and the file
 * is freed last. dentry, mnt, inode and mode are sampled on entry and
 * used for the later puts.
 */
static void __fput(struct file *file)
{
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
        struct inode *inode = file->f_inode;
        fmode_t mode = file->f_mode;

        if (unlikely(!(file->f_mode & FMODE_OPENED)))
                goto out;

        might_sleep();

        fsnotify_close(file);
        /*
         * The function eventpoll_release() should be the first called
         * in the file cleanup chain.
         */
        eventpoll_release(file);
        locks_remove_file(file);

        security_file_release(file);
        /* FASYNC still set: call ->fasync() with fd -1 and a zero last argument */
        if (unlikely(file->f_flags & FASYNC)) {
                if (file->f_op->fasync)
                        file->f_op->fasync(-1, file, 0);
        }
        if (file->f_op->release)
                file->f_op->release(inode, file);
        /* Character device with a cdev, not opened FMODE_PATH: drop the cdev */
        if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL &&
                     !(mode & FMODE_PATH))) {
                cdev_put(inode->i_cdev);
        }
        fops_put(file->f_op);
        file_f_owner_release(file);
        put_file_access(file);
        dput(dentry);
        if (unlikely(mode & FMODE_NEED_UNMOUNT))
                dissolve_on_fput(mnt);
        mntput(mnt);
out:
        file_free(file);
}

/* Files queued by __fput_deferred() when task work cannot be used. */
static LLIST_HEAD(delayed_fput_list);

/*
 * Work function: detach everything currently on delayed_fput_list and
 * run __fput() on each file. The work_struct argument is not used.
 */
static void delayed_fput(struct work_struct *unused)
{
        struct file *file, *next;
        struct llist_node *batch;

        batch = llist_del_all(&delayed_fput_list);
        llist_for_each_entry_safe(file, next, batch, f_llist)
                __fput(file);
}

/* Task-work callback: recover the file from its f_task_work and release it. */
static void ____fput(struct callback_head *work)
{
        struct file *file = container_of(work, struct file, f_task_work);

        __fput(file);
}

/* Work item that runs delayed_fput(); scheduled from __fput_deferred(). */
static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);

/*
 * If kernel thread really needs to have the final fput() it has done
 * to complete, call this.  The only user right now is the boot - we
 * *do* need to make sure our writes to binaries on initramfs have
 * not left us with opened struct file waiting for __fput() - execve()
 * won't work without that.  Please, don't add more callers without
 * very good reasons; in particular, never call that with locks
 * held and never call that from a thread that might need to do
 * some work on any kind of umount.
 */
void flush_delayed_fput(void)
{
        /* Drain whatever is on delayed_fput_list right now, in this context. */
        delayed_fput(NULL);
        /* Then flush the delayed work item itself. */
        flush_delayed_work(&delayed_fput_work);
}
EXPORT_SYMBOL_GPL(flush_delayed_fput);

/*
 * Arrange for __fput() to run on @file outside the caller's context:
 * as task work on the current task when possible, otherwise via the
 * delayed_fput work item. Files that are neither opened nor backing
 * files have nothing to tear down and are freed right away.
 */
static void __fput_deferred(struct file *file)
{
        struct task_struct *task = current;

        if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
                file_free(file);
                return;
        }

        if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
                init_task_work(&file->f_task_work, ____fput);
                if (task_work_add(task, &file->f_task_work, TWA_RESUME) == 0)
                        return;
                /*
                 * After this task has run exit_task_work(),
                 * task_work_add() will fail.  Fall through to delayed
                 * fput to avoid leaking *file.
                 */
        }

        /* Schedule the work only when llist_add() returns true. */
        if (llist_add(&file->f_llist, &delayed_fput_list))
                schedule_delayed_work(&delayed_fput_work, 1);
}

/*
 * Drop one reference to @file; when file_ref_put() reports the last one
 * gone, hand the file to __fput_deferred() for release.
 */
void fput(struct file *file)
{
        if (likely(!file_ref_put(&file->f_ref)))
                return;

        __fput_deferred(file);
}
EXPORT_SYMBOL(fput);

/*
 * Synchronous analog of fput(): if this drops the last reference,
 * __fput() runs right here in the caller's context.
 *
 * Meant for kernel threads that might be needed in some umount() (and
 * thus can't use flush_delayed_fput() without risking deadlocks), that
 * need to wait for completion of __fput(), and that know that for this
 * specific struct file it won't involve anything that would need them.
 * Use only if you really need it - at the very least, don't blindly
 * convert fput() by kernel thread to that.
 */
void __fput_sync(struct file *file)
{
        if (!file_ref_put(&file->f_ref))
                return;

        __fput(file);
}
EXPORT_SYMBOL(__fput_sync);

/*
 * Like __fput_sync(), but optimized for the case where the caller is
 * expected to hold the last reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close_sync(struct file *file)
{
        if (unlikely(!file_ref_put_close(&file->f_ref)))
                return;

        __fput(file);
}

/*
 * Like fput(), but optimized for the case where the caller is expected
 * to hold the last reference.
 *
 * See file_ref_put_close() for details.
 */
void fput_close(struct file *file)
{
        if (!file_ref_put_close(&file->f_ref))
                return;

        __fput_deferred(file);
}

/*
 * Boot-time setup: create the "filp" and "bfilp" slab caches and
 * initialize the nr_files per-cpu counter to zero.
 */
void __init files_init(void)
{
        /* Each cache stores its free pointer at its type's own offset. */
        struct kmem_cache_args filp_args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct file, f_freeptr),
        };
        struct kmem_cache_args bfilp_args = {
                .use_freeptr_offset = true,
                .freeptr_offset = offsetof(struct backing_file, bf_freeptr),
        };

        filp_cachep = kmem_cache_create("filp", sizeof(struct file),
                                &filp_args,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);

        bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
                                &bfilp_args,
                                SLAB_HWCACHE_ALIGN | SLAB_PANIC |
                                SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);

        percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}

/*
 * One file with associated inode and dcache is very roughly 1K. Per default
 * do not use more than 10% of our memory for files.
 *
 * The budget is taken from total RAM minus a reserve of 1.5 times the pages
 * currently in use (capped at nr_pages - 1), and never drops below NR_FILE.
 */
void __init files_maxfiles_init(void)
{
        unsigned long nr_pages = totalram_pages();
        unsigned long used_pages = nr_pages - nr_free_pages();
        unsigned long memreserve = used_pages * 3 / 2;
        unsigned long limit;

        if (memreserve > nr_pages - 1)
                memreserve = nr_pages - 1;

        /* Pages -> KiB, then take a tenth. */
        limit = ((nr_pages - memreserve) * (PAGE_SIZE / 1024)) / 10;

        files_stat.max_files = max_t(unsigned long, limit, NR_FILE);
}


































































































































































































    2 












    1 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_WAIT_H
#define _LINUX_WAIT_H
/*
 * Linux wait queue related types and methods
 */
#include <linux/list.h>
#include <linux/stddef.h>
#include <linux/spinlock.h>

#include <asm/current.h>

typedef struct wait_queue_entry wait_queue_entry_t;

/*
 * Callback type stored in wait_queue_entry::func. default_wake_function()
 * is the one installed by init_waitqueue_entry() and
 * __WAITQUEUE_INITIALIZER(); its definition is not in this header.
 */
typedef int (*wait_queue_func_t)(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);
int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key);

/*
 * wait_queue_entry::flags
 *
 * In this header, WQ_FLAG_EXCLUSIVE is set by the *_exclusive() add helpers
 * and WQ_FLAG_PRIORITY is what __add_wait_queue() tests when choosing the
 * insertion point.
 */
#define WQ_FLAG_EXCLUSIVE        0x01
#define WQ_FLAG_WOKEN                0x02
#define WQ_FLAG_CUSTOM                0x04
#define WQ_FLAG_DONE                0x08
#define WQ_FLAG_PRIORITY        0x10

/*
 * A single wait-queue entry structure:
 *
 * @flags:   WQ_FLAG_* bits
 * @private: opaque pointer; init_waitqueue_entry() stores the task here,
 *           init_waitqueue_func_entry() stores NULL
 * @func:    wake-up callback
 * @entry:   linkage into wait_queue_head::head
 */
struct wait_queue_entry {
        unsigned int                flags;
        void                        *private;
        wait_queue_func_t        func;
        struct list_head        entry;
};

/*
 * Head of a wait queue: a spinlock plus the list that wait_queue_entry
 * items are linked onto through their 'entry' member.
 */
struct wait_queue_head {
        spinlock_t                lock;
        struct list_head        head;
};
typedef struct wait_queue_head wait_queue_head_t;

/* Only used by pointer in this header, so a forward declaration suffices. */
struct task_struct;

/*
 * Macros for declaration and initialisation of the datatypes
 */

/*
 * Static initializer for a wait_queue_entry: @tsk goes into ->private,
 * the callback is default_wake_function and the list linkage is NULL.
 * ->flags is not named and is therefore zero-initialized.
 */
#define __WAITQUEUE_INITIALIZER(name, tsk) {                                        \
        .private        = tsk,                                                        \
        .func                = default_wake_function,                                \
        .entry                = { NULL, NULL } }

/* Define a wait_queue_entry called @name, initialized for @tsk. */
#define DECLARE_WAITQUEUE(name, tsk)                                                \
        struct wait_queue_entry name = __WAITQUEUE_INITIALIZER(name, tsk)

/* Static initializer for a wait_queue_head: unlocked lock, empty list. */
#define __WAIT_QUEUE_HEAD_INITIALIZER(name) {                                        \
        .lock                = __SPIN_LOCK_UNLOCKED(name.lock),                        \
        .head                = LIST_HEAD_INIT(name.head) }

/* Define a statically initialized wait_queue_head called @name. */
#define DECLARE_WAIT_QUEUE_HEAD(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INITIALIZER(name)

/* Out-of-line initializer; takes a name string and a lock class key. */
extern void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *);

/*
 * Runtime initializer for a wait_queue_head. Each expansion declares its
 * own static lock_class_key and passes the stringified argument as the name.
 */
#define init_waitqueue_head(wq_head)                                                \
        do {                                                                        \
                static struct lock_class_key __key;                                \
                                                                                \
                __init_waitqueue_head((wq_head), #wq_head, &__key);                \
        } while (0)

/*
 * On-stack wait queue heads. With CONFIG_LOCKDEP the head is set up via
 * init_waitqueue_head() (so it gets a lock class key); without it the
 * plain static initializer is used.
 */
#ifdef CONFIG_LOCKDEP
# define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \
        ({ init_waitqueue_head(&name); name; })
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) \
        struct wait_queue_head name = __WAIT_QUEUE_HEAD_INIT_ONSTACK(name)
#else
# define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
#endif

/*
 * Prepare @wq_entry to wake task @p through default_wake_function, with
 * no flags set. The list linkage (->entry) is not touched.
 */
static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, struct task_struct *p)
{
        wq_entry->func = default_wake_function;
        wq_entry->private = p;
        wq_entry->flags = 0;
}

/*
 * Prepare @wq_entry with a caller-supplied wake callback @func, no private
 * pointer and no flags. The list linkage (->entry) is not touched.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
        wq_entry->func = func;
        wq_entry->private = NULL;
        wq_entry->flags = 0;
}

/**
 * waitqueue_active -- locklessly test for waiters on the queue
 * @wq_head: the waitqueue to test for waiters
 *
 * returns true if the wait list is not empty
 *
 * NOTE: this function is lockless and requires care, incorrect usage _will_
 * lead to sporadic and non-obvious failure.
 *
 * Use either while holding wait_queue_head::lock or when used for wakeups
 * with an extra smp_mb() like::
 *
 *      CPU0 - waker                    CPU1 - waiter
 *
 *                                      for (;;) {
 *      @cond = true;                     prepare_to_wait(&wq_head, &wait, state);
 *      smp_mb();                         // smp_mb() from set_current_state()
 *      if (waitqueue_active(wq_head))         if (@cond)
 *        wake_up(wq_head);                      break;
 *                                        schedule();
 *                                      }
 *                                      finish_wait(&wq_head, &wait);
 *
 * Because without the explicit smp_mb() it's possible for the
 * waitqueue_active() load to get hoisted over the @cond store such that we'll
 * observe an empty wait list while the waiter might not observe @cond.
 *
 * Also note that this 'optimization' trades a spin_lock() for an smp_mb(),
 * which (when the lock is uncontended) are of roughly equal cost.
 */
static inline int waitqueue_active(struct wait_queue_head *wq_head)
{
        /* Lockless peek at the list head; no barrier is issued here. */
        int has_waiters = !list_empty(&wq_head->head);

        return has_waiters;
}

/**
 * wq_has_single_sleeper - check if there is only one sleeper
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has exactly one sleeper on the list.
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_single_sleeper(struct wait_queue_head *wq_head)
{
        struct list_head *waiters = &wq_head->head;

        return list_is_singular(waiters);
}

/**
 * wq_has_sleeper - check if there are any waiting processes
 * @wq_head: wait queue head
 *
 * Returns true if wq_head has waiting processes
 *
 * Unlike a bare waitqueue_active() call, this issues a full smp_mb()
 * before looking at the list.
 *
 * Please refer to the comment for waitqueue_active.
 */
static inline bool wq_has_sleeper(struct wait_queue_head *wq_head)
{
        /*
         * We need to be sure we are in sync with the
         * add_wait_queue modifications to the wait queue.
         *
         * This memory barrier should be paired with one on the
         * waiting side.
         */
        smp_mb();
        return waitqueue_active(wq_head);
}

/*
 * Out-of-line add/remove entry points; only the prototypes are visible
 * in this header.
 */
extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);

static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *insert_after = &wq_head->head;
        struct wait_queue_entry *cur;

        /*
         * Walk the leading run of WQ_FLAG_PRIORITY entries; the new entry is
         * linked directly behind the last of them.  If the queue is empty or
         * starts with a non-priority entry, it goes right after the head.
         */
        list_for_each_entry(cur, &wq_head->head, entry) {
                if (cur->flags & WQ_FLAG_PRIORITY) {
                        insert_after = &cur->entry;
                        continue;
                }
                break;
        }

        list_add(&wq_entry->entry, insert_after);
}

/*
 * Used for wake-one threads: tag the entry WQ_FLAG_EXCLUSIVE, then queue it
 * through __add_wait_queue().
 */
static inline void
__add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags = wq_entry->flags | WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq_head, wq_entry);
}

static inline void __add_wait_queue_entry_tail(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *queue = &wq_head->head;

        /* Append unconditionally; no WQ_FLAG_PRIORITY scan is done here. */
        list_add_tail(&wq_entry->entry, queue);
}

/* Tail-append variant for wake-one waiters: sets WQ_FLAG_EXCLUSIVE first. */
static inline void
__add_wait_queue_entry_tail_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        wq_entry->flags = wq_entry->flags | WQ_FLAG_EXCLUSIVE;
        __add_wait_queue_entry_tail(wq_head, wq_entry);
}

/* Unlink @wq_entry from its list; @wq_head is accepted but not consulted. */
static inline void
__remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry)
{
        struct list_head *node = &wq_entry->entry;

        list_del(node);
}

/*
 * Out-of-line wake-up primitives that back the wake_up*() macros below; the
 * definitions live outside this section.
 *
 * As used by those macros: @mode is a task-state mask (TASK_NORMAL or
 * TASK_INTERRUPTIBLE), @nr is a wake count argument (the *_all wrappers pass
 * 0, the plain wrappers pass 1), and @key is an opaque pointer that is either
 * NULL or a poll mask encoded with poll_to_key().
 */
int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_on_current_cpu(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_sync_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr);
void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode);
void __wake_up_pollfree(struct wait_queue_head *wq_head);

/*
 * Convenience wrappers around the __wake_up*() entry points.  The wrapper
 * name encodes the arguments that get filled in:
 *
 *  - wake_up*() pass TASK_NORMAL as the mode, wake_up_interruptible*() pass
 *    TASK_INTERRUPTIBLE;
 *  - the *_all variants pass nr == 0, the *_nr variants pass the caller's
 *    @nr, and the other __wake_up()/__wake_up_locked() wrappers pass nr == 1;
 *  - the *_locked variants route to __wake_up_locked() and the *_sync
 *    variants to __wake_up_sync().
 *
 * Every wrapper here that calls __wake_up() supplies a NULL key.
 */
#define wake_up(x)                        __wake_up(x, TASK_NORMAL, 1, NULL)
#define wake_up_nr(x, nr)                __wake_up(x, TASK_NORMAL, nr, NULL)
#define wake_up_all(x)                        __wake_up(x, TASK_NORMAL, 0, NULL)
#define wake_up_locked(x)                __wake_up_locked((x), TASK_NORMAL, 1)
#define wake_up_all_locked(x)                __wake_up_locked((x), TASK_NORMAL, 0)
#define wake_up_sync(x)                        __wake_up_sync(x, TASK_NORMAL)

#define wake_up_interruptible(x)        __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL)
#define wake_up_interruptible_nr(x, nr)        __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL)
#define wake_up_interruptible_all(x)        __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
#define wake_up_interruptible_sync(x)        __wake_up_sync((x), TASK_INTERRUPTIBLE)

/*
 * Wakeup macros to be used to report events to the targets.
 *
 * poll_to_key() and key_to_poll() convert between a __poll_t event mask and
 * the void * key carried by the wake-up primitives, going through uintptr_t.
 * Each wake_up*_poll() wrapper below passes poll_to_key(@m) as the key; the
 * wrappers that call __wake_up() use nr == 1.
 */
#define poll_to_key(m) ((void *)(__force uintptr_t)(__poll_t)(m))
#define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
#define wake_up_poll(x, m)                                                        \
        __wake_up(x, TASK_NORMAL, 1, poll_to_key(m))
#define wake_up_poll_on_current_cpu(x, m)                                        \
        __wake_up_on_current_cpu(x, TASK_NORMAL, poll_to_key(m))
#define wake_up_locked_poll(x, m)                                                \
        __wake_up_locked_key((x), TASK_NORMAL, poll_to_key(m))
#define wake_up_interruptible_poll(x, m)                                        \
        __wake_up(x, TASK_INTERRUPTIBLE, 1, poll_to_key(m))
#define wake_up_interruptible_sync_poll(x, m)                                        \
        __wake_up_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))
#define wake_up_interruptible_sync_poll_locked(x, m)                                \
        __wake_up_locked_sync_key((x), TASK_INTERRUPTIBLE, poll_to_key(m))

/**
 * wake_up_pollfree - signal that a polled waitqueue is going away
 * @wq_head: the wait queue head
 *
 * In the very rare cases where a ->poll() implementation uses a waitqueue whose
 * lifetime is tied to a task rather than to the 'struct file' being polled,
 * this function must be called before the waitqueue is freed so that
 * non-blocking polls (e.g. epoll) are notified that the queue is going away.
 *
 * The caller must also RCU-delay the freeing of the wait_queue_head, e.g. via
 * an explicit synchronize_rcu() or call_rcu(), or via SLAB_TYPESAFE_BY_RCU.
 */
static inline void wake_up_pollfree(struct wait_queue_head *wq_head)
{
        /*
         * The queue lock is deliberately not always taken here, for
         * performance.  That means this check can race with someone removing
         * the last entry, and we may carry on while they still hold the queue
         * lock.  rcu_read_lock() is required to be held in such cases, so
         * proceeding with an RCU-delayed free remains safe.
         */
        if (!waitqueue_active(wq_head))
                return;

        __wake_up_pollfree(wq_head);
}

/*
 * Wrap @condition for the timeout-style waits.  A variable named __ret
 * (holding the remaining timeout) must be in scope where this expands.
 *
 * @condition is evaluated exactly once.  If it holds while __ret is 0, __ret
 * is bumped to 1 so that a satisfied condition is never reported as 0.  The
 * expression is true when waiting should stop: @condition holds or __ret
 * is 0.
 */
#define ___wait_cond_timeout(condition)                                                \
({                                                                                \
        bool __cond = (condition);                                                \
        if (!__ret && __cond)                                                        \
                __ret = 1;                                                        \
        !__ret || __cond;                                                        \
})

/*
 * Decide whether a wait in @state may be cut short by a signal.
 *
 * When @state is not a compile-time constant the answer is conservatively
 * "yes".  For a constant @state the answer is whether it includes
 * TASK_INTERRUPTIBLE or TASK_WAKEKILL.
 *
 * @state is parenthesized because callers may pass an expression such as
 * "A | B"; '&' binds tighter than '|', so without the parentheses only the
 * last operand would be masked and e.g. an uninterruptible-but-freezable
 * state would be misreported as interruptible.
 */
#define ___wait_is_interruptible(state)                                                \
        (!__builtin_constant_p(state) ||                                        \
         ((state) & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))

/*
 * Initialise a wait-queue entry with the given @flags.  ___wait_event()
 * calls this with either WQ_FLAG_EXCLUSIVE or 0 before entering its loop.
 * Defined outside this section.
 */
extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);

/*
 * The below macro ___wait_event() has an explicit shadow of the __ret
 * variable when used from the wait_event_*() macros.
 *
 * This is so that both can use the ___wait_cond_timeout() construct
 * to wrap the condition.
 *
 * The type inconsistency of the wait_event_*() __ret variable is also
 * on purpose; we use long where we can return timeout values and int
 * otherwise.
 *
 * Arguments:
 *  @wq_head:   the wait queue head itself (the macro takes its address)
 *  @condition: expression re-evaluated after every prepare_to_wait_event()
 *              and again after @cmd; it is expanded more than once, so it
 *              should be free of side effects
 *  @state:     task state handed to prepare_to_wait_event()
 *  @exclusive: non-zero to initialise the entry with WQ_FLAG_EXCLUSIVE
 *  @ret:       initial value of the inner __ret, i.e. the result when the
 *              loop ends because @condition became true
 *  @cmd:       statement(s) run each iteration to actually sleep; it is
 *              expanded inside the loop, so it may assign the inner __ret
 *              and may use 'break' to leave the loop
 *
 * If prepare_to_wait_event() returns non-zero and @state passes
 * ___wait_is_interruptible(), that value becomes the result and the macro
 * jumps straight to __out, skipping finish_wait().
 * NOTE(review): presumably prepare_to_wait_event() has already dequeued the
 * entry in that case -- confirm against its implementation.
 *
 * Every other exit path goes through finish_wait().  The value of the
 * statement expression is the inner __ret.
 */

#define ___wait_event(wq_head, condition, state, exclusive, ret, cmd)                \
({                                                                                \
        __label__ __out;                                                        \
        struct wait_queue_entry __wq_entry;                                        \
        long __ret = ret;        /* explicit shadow */                                \
                                                                                \
        init_wait_entry(&__wq_entry, exclusive ? WQ_FLAG_EXCLUSIVE : 0);        \
        for (;;) {                                                                \
                long __int = prepare_to_wait_event(&wq_head, &__wq_entry, state);\
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
                                                                                \
                if (___wait_is_interruptible(state) && __int) {                        \
                        __ret = __int;                                                \
                        goto __out;                                                \
                }                                                                \
                                                                                \
                cmd;                                                                \
                                                                                \
                if (condition)                                                        \
                        break;                                                        \
        }                                                                        \
        finish_wait(&wq_head, &__wq_entry);                                        \
__out:        __ret;                                                                        \
})

/*
 * Slow path of wait_event(): a non-exclusive, TASK_UNINTERRUPTIBLE
 * ___wait_event() loop that sleeps with schedule().  The result is discarded.
 */
#define __wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            schedule())

/**
 * wait_event - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * If @condition is already true on entry, the macro returns without
 * touching the waitqueue. @condition is expanded several times, so it
 * should not have side effects.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event(wq_head, condition)                                                \
do {                                                                                \
        might_sleep();                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event(wq_head, condition);                                        \
} while (0)

/*
 * Slow path of io_wait_event(): identical to __wait_event() except that the
 * sleep is done with io_schedule() instead of schedule().
 */
#define __io_wait_event(wq_head, condition)                                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            io_schedule())

/*
 * io_wait_event() -- like wait_event() but with io_schedule()
 *
 * Returns immediately, without queueing, when @condition is already true.
 */
#define io_wait_event(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        if (condition)                                                                \
                break;                                                                \
        __io_wait_event(wq_head, condition);                                        \
} while (0)

/*
 * Slow path of wait_event_freezable(): non-exclusive ___wait_event() loop in
 * state TASK_INTERRUPTIBLE|TASK_FREEZABLE, sleeping with schedule().
 */
#define __wait_event_freezable(wq_head, condition)                                \
        ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE),        \
                        0, 0, schedule())

/**
 * wait_event_freezable - sleep (or freeze) until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE -- so as not to contribute
 * to system load) until the @condition evaluates to true. The
 * @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns 0 if @condition evaluated to true, or the non-zero value reported
 * by prepare_to_wait_event() if the sleep was interrupted (the same path
 * that wait_event_interruptible() documents as -ERESTARTSYS).
 */
#define wait_event_freezable(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_timeout().  The "__ret" named in the sleep command
 * is the variable declared inside ___wait_event(): it starts out as @timeout
 * and is replaced by schedule_timeout()'s return value on every iteration.
 * @condition is wrapped in ___wait_cond_timeout() so the loop also ends once
 * that value reaches 0.
 */
#define __wait_event_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_UNINTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_timeout(wq_head, condition, timeout)                                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_freezable_timeout(): same shape as
 * __wait_event_timeout() but in state TASK_INTERRUPTIBLE|TASK_FREEZABLE.
 */
#define __wait_event_freezable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout,                \
                      __ret = schedule_timeout(__ret))

/*
 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid
 * increasing load and is freezable.
 *
 * Because the state is interruptible, the result may also be the non-zero
 * value reported by prepare_to_wait_event() when the sleep is interrupted.
 */
#define wait_event_freezable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_freezable_timeout(wq_head, condition, timeout); \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_exclusive_cmd(): exclusive, TASK_UNINTERRUPTIBLE
 * ___wait_event() loop.  @cmd1 and @cmd2 are pasted as statements around
 * schedule(), so they run before and after every sleep.
 */
#define __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 1, 0,        \
                            cmd1; schedule(); cmd2)
/*
 * Just like wait_event_cmd(), except it sets exclusive flag
 *
 * Unlike wait_event(), no might_sleep() check is made here.
 */
#define wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2)                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_exclusive_cmd(wq_head, condition, cmd1, cmd2);                \
} while (0)

/*
 * Slow path of wait_event_cmd(): non-exclusive, TASK_UNINTERRUPTIBLE
 * ___wait_event() loop.  @cmd1 and @cmd2 are pasted as statements around
 * schedule(), so they run before and after every sleep.
 */
#define __wait_event_cmd(wq_head, condition, cmd1, cmd2)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            cmd1; schedule(); cmd2)

/**
 * wait_event_cmd - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @cmd1: the command will be executed before sleep
 * @cmd2: the command will be executed after sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * Unlike wait_event(), no might_sleep() check is made here.
 * NOTE(review): presumably because callers may enter holding a lock that
 * @cmd1 releases -- confirm against the users.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 */
#define wait_event_cmd(wq_head, condition, cmd1, cmd2)                                \
do {                                                                                \
        if (condition)                                                                \
                break;                                                                \
        __wait_event_cmd(wq_head, condition, cmd1, cmd2);                        \
} while (0)

/*
 * Slow path of wait_event_interruptible(): non-exclusive, TASK_INTERRUPTIBLE
 * ___wait_event() loop sleeping with schedule().  Evaluates to 0 when
 * @condition became true, or to the non-zero prepare_to_wait_event() result
 * when interrupted.
 */
#define __wait_event_interruptible(wq_head, condition)                                \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      schedule())

/**
 * wait_event_interruptible - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible(wq_head, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_timeout(): same shape as
 * __wait_event_timeout() but in state TASK_INTERRUPTIBLE, so the loop can
 * also end early with the non-zero prepare_to_wait_event() result.
 */
#define __wait_event_interruptible_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_INTERRUPTIBLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_interruptible_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a signal.
 */
#define wait_event_interruptible_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_interruptible_timeout(wq_head,                \
                                                condition, timeout);                \
        __ret;                                                                        \
})

/*
 * Common slow path for the *_hrtimeout waits.
 *
 * An on-stack hrtimer sleeper is set up on CLOCK_MONOTONIC in relative mode.
 * It is only armed when @timeout is not KTIME_MAX, using
 * current->timer_slack_ns as the expiry range.  @timeout is expanded twice,
 * so it should be free of side effects.
 *
 * The sleep command checks __t.task before every schedule(): when it is
 * NULL the wait is abandoned with -ETIME.
 * NOTE(review): presumably the sleeper's timer callback clears __t.task on
 * expiry -- confirm against the hrtimer sleeper implementation.
 *
 * The "__ret = -ETIME" assignment happens inside ___wait_event()'s expansion
 * and therefore writes that macro's own (shadowing) __ret, which then
 * becomes the value assigned to the outer __ret.
 *
 * The timer is always cancelled and destroyed before the result is yielded.
 */
#define __wait_event_hrtimeout(wq_head, condition, timeout, state)                \
({                                                                                \
        int __ret = 0;                                                                \
        struct hrtimer_sleeper __t;                                                \
                                                                                \
        hrtimer_setup_sleeper_on_stack(&__t, CLOCK_MONOTONIC,                        \
                                       HRTIMER_MODE_REL);                        \
        if ((timeout) != KTIME_MAX) {                                                \
                hrtimer_set_expires_range_ns(&__t.timer, timeout,                \
                                        current->timer_slack_ns);                \
                hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL);                \
        }                                                                        \
                                                                                \
        __ret = ___wait_event(wq_head, condition, state, 0, 0,                        \
                if (!__t.task) {                                                \
                        __ret = -ETIME;                                                \
                        break;                                                        \
                }                                                                \
                schedule());                                                        \
                                                                                \
        hrtimer_cancel(&__t.timer);                                                \
        destroy_hrtimer_on_stack(&__t.timer);                                        \
        __ret;                                                                        \
})

/**
 * wait_event_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true or the @timeout elapses.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, or -ETIME if the timeout
 * elapsed.
 */
#define wait_event_hrtimeout(wq_head, condition, timeout)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq_head, condition, timeout,        \
                                               TASK_UNINTERRUPTIBLE);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_hrtimeout - sleep until a condition gets true or a timeout elapses
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, as a ktime_t
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true, a signal is received, or the @timeout
 * elapses.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function returns 0 if @condition became true, -ERESTARTSYS if it was
 * interrupted by a signal, or -ETIME if the timeout elapsed.
 */
#define wait_event_interruptible_hrtimeout(wq, condition, timeout)                \
({                                                                                \
        long __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_hrtimeout(wq, condition, timeout,                \
                                               TASK_INTERRUPTIBLE);                \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_interruptible_exclusive(): exclusive,
 * TASK_INTERRUPTIBLE ___wait_event() loop sleeping with schedule().
 */
#define __wait_event_interruptible_exclusive(wq, condition)                        \
        ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0,                        \
                      schedule())

/*
 * Like wait_event_interruptible(), but the waiter is queued with
 * WQ_FLAG_EXCLUSIVE.  Evaluates to 0 if @condition became true, or to the
 * non-zero prepare_to_wait_event() result if the sleep was interrupted.
 */
#define wait_event_interruptible_exclusive(wq, condition)                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_killable_exclusive(): exclusive, TASK_KILLABLE
 * ___wait_event() loop sleeping with schedule().
 */
#define __wait_event_killable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, TASK_KILLABLE, 1, 0,                        \
                      schedule())

/*
 * Exclusive wait in state TASK_KILLABLE.  Evaluates to 0 if @condition
 * became true, otherwise to whatever non-zero value ___wait_event() yields
 * when the sleep is cut short.
 */
#define wait_event_killable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable_exclusive(wq, condition);                \
        __ret;                                                                        \
})


/*
 * Slow path of wait_event_freezable_exclusive(): exclusive ___wait_event()
 * loop in state TASK_INTERRUPTIBLE|TASK_FREEZABLE, sleeping with schedule().
 */
#define __wait_event_freezable_exclusive(wq, condition)                                \
        ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\
                        schedule())

/*
 * Exclusive counterpart of wait_event_freezable().  Evaluates to 0 if
 * @condition became true, or to the non-zero prepare_to_wait_event() result
 * if the sleep was interrupted.
 */
#define wait_event_freezable_exclusive(wq, condition)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_freezable_exclusive(wq, condition);        \
        __ret;                                                                        \
})

/**
 * wait_event_idle - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This macro is a statement and yields no value; the ___wait_event()
 * result is discarded.
 */
#define wait_event_idle(wq_head, condition)                                        \
do {                                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                ___wait_event(wq_head, condition, TASK_IDLE, 0, 0, schedule());        \
} while (0)

/**
 * wait_event_idle_exclusive - wait for a condition without contributing to system load
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
 * set thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This macro is a statement and yields no value; the ___wait_event()
 * result is discarded.
 */
#define wait_event_idle_exclusive(wq_head, condition)                                \
do {                                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                ___wait_event(wq_head, condition, TASK_IDLE, 1, 0, schedule());        \
} while (0)

/*
 * Slow path of wait_event_idle_timeout(): same shape as
 * __wait_event_timeout() but in state TASK_IDLE.  The "__ret" in the sleep
 * command is ___wait_event()'s own variable, seeded from @timeout.
 */
#define __wait_event_idle_timeout(wq_head, condition, timeout)                        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 0, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_timeout(wq_head, condition, timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_timeout(wq_head, condition, timeout);        \
        __ret;                                                                        \
})

/*
 * Slow path of wait_event_idle_exclusive_timeout(): like
 * __wait_event_idle_timeout() but the waiter is queued with
 * WQ_FLAG_EXCLUSIVE (exclusive == 1).
 */
#define __wait_event_idle_exclusive_timeout(wq_head, condition, timeout)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_IDLE, 1, timeout,                                        \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_idle_exclusive_timeout - sleep without load until a condition becomes true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_IDLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
 * set thus if other processes wait on the same list, when this
 * process is woken further processes are not considered.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * or the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed.
 */
#define wait_event_idle_exclusive_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_idle_exclusive_timeout(wq_head, condition, timeout);\
        __ret;                                                                        \
})

/*
 * Wait primitives passed as @fn to __wait_event_interruptible_locked() by
 * the wait_event_interruptible*_locked*() macros.  Each receives the queue
 * head and the caller's wait entry; a non-zero return value ends the wait
 * loop and becomes the result of the macro.
 */
extern int do_wait_intr(wait_queue_head_t *, wait_queue_entry_t *);
extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);

/*
 * Common body of the wait_event_interruptible*_locked*() macros.
 *
 * A wait entry is declared with DEFINE_WAIT() and, when @exclusive is set,
 * tagged with WQ_FLAG_EXCLUSIVE.  @fn is then called with the queue head and
 * that entry until it reports a non-zero status or @condition becomes true.
 * Afterwards the entry is unlinked with __remove_wait_queue(), the task
 * state is put back to TASK_RUNNING and the last status returned by @fn is
 * the value of the whole expression.
 */
#define __wait_event_interruptible_locked(wq, condition, exclusive, fn)                \
({                                                                                \
        int __ret;                                                                \
        DEFINE_WAIT(__wait);                                                        \
        if (exclusive)                                                                \
                __wait.flags |= WQ_FLAG_EXCLUSIVE;                                \
        for (;;) {                                                                \
                __ret = fn(&(wq), &__wait);                                        \
                /* Stop on an error from @fn, otherwise once @condition holds. */ \
                if (__ret || (condition))                                        \
                        break;                                                        \
        }                                                                        \
        __remove_wait_queue(&(wq), &__wait);                                        \
        __set_current_state(TASK_RUNNING);                                        \
        __ret;                                                                        \
})


/**
 * wait_event_interruptible_locked - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked(wq, condition)                                \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr) : 0)

/**
 * wait_event_interruptible_locked_irq - sleep until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_locked_irq(wq, condition)                        \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 0, do_wait_intr_irq) : 0)

/**
 * wait_event_interruptible_exclusive_locked - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock()/spin_unlock()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
 * set, so if other processes wait on the same list, further processes
 * are not considered once this process has been woken.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked(wq, condition)                \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr) : 0)

/**
 * wait_event_interruptible_exclusive_locked_irq - sleep exclusively until a condition gets true
 * @wq: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received.
 * The @condition is checked each time the waitqueue @wq is woken up.
 *
 * It must be called with wq.lock being held.  This spinlock is
 * unlocked while sleeping but @condition testing is done while lock
 * is held and when this macro exits the lock is held.
 *
 * The lock is locked/unlocked using spin_lock_irq()/spin_unlock_irq()
 * functions which must match the way they are locked/unlocked outside
 * of this macro.
 *
 * The process is put on the wait queue with a WQ_FLAG_EXCLUSIVE flag
 * set, so if other processes wait on the same list, further processes
 * are not considered once this process has been woken.
 *
 * wake_up_locked() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * signal and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_exclusive_locked_irq(wq, condition)                \
        (!(condition)                                                                \
         ? __wait_event_interruptible_locked(wq, condition, 1, do_wait_intr_irq) : 0)


/*
 * Waiting path of wait_event_killable(): forwards to ___wait_event() with
 * TASK_KILLABLE as the sleep state and a plain schedule() as the sleep step.
 */
#define __wait_event_killable(wq, condition)                                        \
        ___wait_event(wq, condition, TASK_KILLABLE, 0, 0, schedule())

/**
 * wait_event_killable - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a
 * kill signal and 0 if @condition evaluated to true.
 */
#define wait_event_killable(wq_head, condition)                                        \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_killable(wq_head, condition);                \
        __ret;                                                                        \
})

/*
 * Waiting path of wait_event_state(): forwards to ___wait_event() with the
 * caller-chosen @state and a plain schedule() as the sleep step.
 */
#define __wait_event_state(wq, condition, state)                                \
        ___wait_event(wq, condition, state, 0, 0, schedule())

/**
 * wait_event_state - sleep until a condition gets true
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @state: state to sleep in
 *
 * The process is put to sleep (@state) until the @condition evaluates to true
 * or a signal is received (when allowed by @state).  The @condition is checked
 * each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * The function will return -ERESTARTSYS if it was interrupted by a signal
 * (when allowed by @state) and 0 if @condition evaluated to true.
 */
#define wait_event_state(wq_head, condition, state)                                \
({                                                                                \
        int __ret = 0;                                                                \
        might_sleep();                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_state(wq_head, condition, state);                \
        __ret;                                                                        \
})

/*
 * Waiting path of wait_event_killable_timeout(): forwards to ___wait_event()
 * with TASK_KILLABLE and @timeout as the starting value.  The sleep step
 * stores the result of schedule_timeout() back into __ret.
 */
#define __wait_event_killable_timeout(wq_head, condition, timeout)                \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      TASK_KILLABLE, 0, timeout,                                \
                      __ret = schedule_timeout(__ret))

/**
 * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_KILLABLE) until the
 * @condition evaluates to true or a kill signal is received.
 * The @condition is checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * Returns:
 * 0 if the @condition evaluated to %false after the @timeout elapsed,
 * 1 if the @condition evaluated to %true after the @timeout elapsed,
 * the remaining jiffies (at least 1) if the @condition evaluated
 * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
 * interrupted by a kill signal.
 *
 * Only kill signals interrupt this process.
 */
#define wait_event_killable_timeout(wq_head, condition, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        might_sleep();                                                                \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_killable_timeout(wq_head,                        \
                                                condition, timeout);                \
        __ret;                                                                        \
})


/*
 * Waiting path of wait_event_lock_irq() and wait_event_lock_irq_cmd().
 *
 * Forwards to ___wait_event() with TASK_UNINTERRUPTIBLE and discards its
 * value via the (void) cast.  The sleep step, in order: releases @lock with
 * spin_unlock_irq(), runs @cmd (which may be empty), calls schedule(), and
 * takes @lock again with spin_lock_irq().
 */
#define __wait_event_lock_irq(wq_head, condition, lock, cmd)                        \
        (void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,        \
                            spin_unlock_irq(&lock);                                \
                            cmd;                                                \
                            schedule();                                                \
                            spin_lock_irq(&lock))

/**
 * wait_event_lock_irq_cmd - sleep until a condition gets true. The
 *                             condition is checked under the lock. This
 *                             is expected to be called with the lock
 *                             taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd
 *          and schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 */
#define wait_event_lock_irq_cmd(wq_head, condition, lock, cmd)                        \
do {                                                                                \
        /* Nothing to do when @condition already holds. */                        \
        if (!(condition))                                                        \
                __wait_event_lock_irq(wq_head, condition, lock, cmd);                \
} while (0)

/**
 * wait_event_lock_irq - sleep until a condition gets true. The
 *                         condition is checked under the lock. This
 *                         is expected to be called with the lock
 *                         taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_UNINTERRUPTIBLE) until the
 * @condition evaluates to true. The @condition is checked each time
 * the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 */
#define wait_event_lock_irq(wq_head, condition, lock)                                \
do {                                                                                \
        /* Nothing to do when @condition already holds. */                        \
        if (!(condition))                                                        \
                __wait_event_lock_irq(wq_head, condition, lock, );                \
} while (0)


/*
 * Waiting path of wait_event_interruptible_lock_irq() and
 * wait_event_interruptible_lock_irq_cmd().
 *
 * Forwards to ___wait_event() with TASK_INTERRUPTIBLE; unlike
 * __wait_event_lock_irq() the resulting value is kept for the caller.  The
 * sleep step, in order: releases @lock with spin_unlock_irq(), runs @cmd
 * (which may be empty), calls schedule(), and takes @lock again with
 * spin_lock_irq().
 */
#define __wait_event_interruptible_lock_irq(wq_head, condition, lock, cmd)        \
        ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0,                \
                      spin_unlock_irq(&lock);                                        \
                      cmd;                                                        \
                      schedule();                                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_cmd - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected to
 *                be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before cmd and
 *          schedule() and reacquired afterwards.
 * @cmd: a command which is invoked outside the critical section before
 *         sleep
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or a signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before invoking the cmd and going to sleep and is reacquired
 * afterwards.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq_cmd(wq_head, condition, lock, cmd)        \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock, cmd);                \
        __ret;                                                                        \
})

/**
 * wait_event_interruptible_lock_irq - sleep until a condition gets true.
 *                The condition is checked under the lock. This is expected
 *                to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The macro will return -ERESTARTSYS if it was interrupted by a signal
 * and 0 if @condition evaluated to true.
 */
#define wait_event_interruptible_lock_irq(wq_head, condition, lock)                \
({                                                                                \
        int __ret = 0;                                                                \
        if (!(condition))                                                        \
                __ret = __wait_event_interruptible_lock_irq(wq_head,                \
                                                condition, lock,);                \
        __ret;                                                                        \
})

/*
 * Waiting path shared by wait_event_lock_irq_timeout() and
 * wait_event_interruptible_lock_irq_timeout().
 *
 * Forwards to ___wait_event() with the caller-chosen @state and @timeout as
 * the starting value.  Each sleep step releases @lock with
 * spin_unlock_irq(), stores the result of schedule_timeout() in __ret and
 * takes @lock again with spin_lock_irq().
 *
 * The expansion carries no trailing semicolon, matching the other
 * __wait_event_*() helpers, so it can be used wherever an expression is
 * expected; the caller supplies the statement terminator.
 */
#define __wait_event_lock_irq_timeout(wq_head, condition, lock, timeout, state)        \
        ___wait_event(wq_head, ___wait_cond_timeout(condition),                        \
                      state, 0, timeout,                                        \
                      spin_unlock_irq(&lock);                                        \
                      __ret = schedule_timeout(__ret);                                \
                      spin_lock_irq(&lock))

/**
 * wait_event_interruptible_lock_irq_timeout - sleep until a condition gets
 *                true or a timeout elapses. The condition is checked under
 *                the lock. This is expected to be called with the lock taken.
 * @wq_head: the waitqueue to wait on
 * @condition: a C expression for the event to wait for
 * @lock: a locked spinlock_t, which will be released before schedule()
 *          and reacquired afterwards.
 * @timeout: timeout, in jiffies
 *
 * The process is put to sleep (TASK_INTERRUPTIBLE) until the
 * @condition evaluates to true or signal is received. The @condition is
 * checked each time the waitqueue @wq_head is woken up.
 *
 * wake_up() has to be called after changing any variable that could
 * change the result of the wait condition.
 *
 * This is supposed to be called while holding the lock. The lock is
 * dropped before going to sleep and is reacquired afterwards.
 *
 * The macro returns 0 if the @timeout elapsed, -ERESTARTSYS if it
 * was interrupted by a signal, and the remaining jiffies if the
 * @condition evaluated to true before the @timeout elapsed.
 */
#define wait_event_interruptible_lock_irq_timeout(wq_head, condition, lock,        \
                                                  timeout)                        \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_INTERRUPTIBLE);                        \
        __ret;                                                                        \
})

#define wait_event_lock_irq_timeout(wq_head, condition, lock, timeout)                \
({                                                                                \
        long __ret = timeout;                                                        \
        if (!___wait_cond_timeout(condition))                                        \
                __ret = __wait_event_lock_irq_timeout(                                \
                                        wq_head, condition, lock, timeout,        \
                                        TASK_UNINTERRUPTIBLE);                        \
        __ret;                                                                        \
})

/*
 * Waitqueues which are removed from the waitqueue_head at wakeup time
 */
void prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state);
void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry);
long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout);
int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);
int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key);

/*
 * Define a struct wait_queue_entry variable called @name whose .private is
 * the current task, whose .func callback is @function and whose .entry list
 * node is set up with LIST_HEAD_INIT().  Members not named here are left to
 * the default initialisation of the designated initializer.
 */
#define DEFINE_WAIT_FUNC(name, function)                                        \
        struct wait_queue_entry name = {                                        \
                .private        = current,                                        \
                .func                = function,                                        \
                .entry                = LIST_HEAD_INIT((name).entry),                        \
        }

/* DEFINE_WAIT_FUNC() with autoremove_wake_function as the callback. */
#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)

/*
 * Run-time initialisation of an existing wait entry (@wait is a pointer):
 * sets .private to the current task and .func to @function, initialises the
 * .entry list node with INIT_LIST_HEAD() and clears .flags.
 * Note that @wait is evaluated several times.
 */
#define init_wait_func(wait, function)                                                \
        do {                                                                        \
                (wait)->private = current;                                        \
                (wait)->func = function;                                        \
                INIT_LIST_HEAD(&(wait)->entry);                                        \
                (wait)->flags = 0;                                                \
        } while (0)

/* init_wait_func() with autoremove_wake_function as the callback. */
#define init_wait(wait)        init_wait_func(wait, autoremove_wake_function)

/*
 * Callback type accepted by task_call_func(): it receives a task pointer and
 * an opaque @arg and returns an int.
 */
typedef int (*task_call_f)(struct task_struct *p, void *arg);
extern int task_call_func(struct task_struct *p, task_call_f func, void *arg);

#endif /* _LINUX_WAIT_H */






















































































































































































































































































  370 













  370 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM cgroup

#if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CGROUP_H

#include <linux/cgroup.h>
#include <linux/tracepoint.h>

/*
 * Event class for tracepoints that report a struct cgroup_root.
 *
 * Each record stores root->hierarchy_id, root->subsys_mask (kept in a u16
 * field) and a copy of root->name, and is printed as
 * "root=%d ss_mask=%#x name=%s".
 */
DECLARE_EVENT_CLASS(cgroup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        u16,                ss_mask                        )
                __string(        name,                root->name                )
        ),

        TP_fast_assign(
                __entry->root = root->hierarchy_id;
                __entry->ss_mask = root->subsys_mask;
                __assign_str(name);
        ),

        TP_printk("root=%d ss_mask=%#x name=%s",
                  __entry->root, __entry->ss_mask, __get_str(name))
);

/* cgroup_setup_root tracepoint; records the cgroup_root class fields. */
DEFINE_EVENT(cgroup_root, cgroup_setup_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/* cgroup_destroy_root tracepoint; records the cgroup_root class fields. */
DEFINE_EVENT(cgroup_root, cgroup_destroy_root,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/* cgroup_remount tracepoint; records the cgroup_root class fields. */
DEFINE_EVENT(cgroup_root, cgroup_remount,

        TP_PROTO(struct cgroup_root *root),

        TP_ARGS(root)
);

/*
 * Event class for tracepoints that report a single cgroup.
 *
 * Each record stores the hierarchy_id of the cgroup's root, the value of
 * cgroup_id(cgrp), cgrp->level and a copy of the caller-supplied @path
 * string, and is printed as "root=%d id=%llu level=%d path=%s".
 */
DECLARE_EVENT_CLASS(cgroup,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path);
        ),

        TP_printk("root=%d id=%llu level=%d path=%s",
                  __entry->root, __entry->id, __entry->level, __get_str(path))
);

/* cgroup_mkdir tracepoint; records the cgroup class fields. */
DEFINE_EVENT(cgroup, cgroup_mkdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* cgroup_rmdir tracepoint; records the cgroup class fields. */
DEFINE_EVENT(cgroup, cgroup_rmdir,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* cgroup_release tracepoint; records the cgroup class fields. */
DEFINE_EVENT(cgroup, cgroup_release,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* cgroup_rename tracepoint; records the cgroup class fields. */
DEFINE_EVENT(cgroup, cgroup_rename,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* cgroup_freeze tracepoint; records the cgroup class fields. */
DEFINE_EVENT(cgroup, cgroup_freeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/* cgroup_unfreeze tracepoint; records the cgroup class fields. */
DEFINE_EVENT(cgroup, cgroup_unfreeze,

        TP_PROTO(struct cgroup *cgrp, const char *path),

        TP_ARGS(cgrp, path)
);

/*
 * Event class for tracepoints that report a task being moved to a
 * destination cgroup.
 *
 * Each record stores the destination root's hierarchy_id, the value of
 * cgroup_id(dst_cgrp), dst_cgrp->level, a copy of the caller-supplied @path,
 * task->pid and a copy of task->comm.  The @threadgroup argument is part of
 * the prototype but is not stored in the record.
 */
DECLARE_EVENT_CLASS(cgroup_migrate,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup),

        TP_STRUCT__entry(
                __field(        int,                dst_root                )
                __field(        int,                dst_level                )
                __field(        u64,                dst_id                        )
                __field(        int,                pid                        )
                __string(        dst_path,        path                        )
                __string(        comm,                task->comm                )
        ),

        TP_fast_assign(
                __entry->dst_root = dst_cgrp->root->hierarchy_id;
                __entry->dst_id = cgroup_id(dst_cgrp);
                __entry->dst_level = dst_cgrp->level;
                __assign_str(dst_path);
                __entry->pid = task->pid;
                __assign_str(comm);
        ),

        TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s",
                  __entry->dst_root, __entry->dst_id, __entry->dst_level,
                  __get_str(dst_path), __entry->pid, __get_str(comm))
);

/* cgroup_attach_task tracepoint; records the cgroup_migrate class fields. */
DEFINE_EVENT(cgroup_migrate, cgroup_attach_task,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/* cgroup_transfer_tasks tracepoint; records the cgroup_migrate class fields. */
DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks,

        TP_PROTO(struct cgroup *dst_cgrp, const char *path,
                 struct task_struct *task, bool threadgroup),

        TP_ARGS(dst_cgrp, path, task, threadgroup)
);

/*
 * Event class for tracepoints that report a cgroup together with an integer
 * value.
 *
 * Each record stores the hierarchy_id of the cgroup's root, the value of
 * cgroup_id(cgrp), cgrp->level, a copy of the caller-supplied @path and
 * @val, and is printed as "root=%d id=%llu level=%d path=%s val=%d".
 */
DECLARE_EVENT_CLASS(cgroup_event,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __string(        path,                path                        )
                __field(        int,                val                        )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __assign_str(path);
                __entry->val = val;
        ),

        TP_printk("root=%d id=%llu level=%d path=%s val=%d",
                  __entry->root, __entry->id, __entry->level, __get_str(path),
                  __entry->val)
);

/* cgroup_notify_populated tracepoint; records the cgroup_event class fields. */
DEFINE_EVENT(cgroup_event, cgroup_notify_populated,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

/* cgroup_notify_frozen tracepoint; records the cgroup_event class fields. */
DEFINE_EVENT(cgroup_event, cgroup_notify_frozen,

        TP_PROTO(struct cgroup *cgrp, const char *path, int val),

        TP_ARGS(cgrp, path, val)
);

/*
 * Event class for the cgroup_rstat_* lock tracepoints.
 *
 * Each record stores the hierarchy_id of the cgroup's root, the value of
 * cgroup_id(cgrp), cgrp->level, the @cpu argument and the @contended flag,
 * and is printed as "root=%d id=%llu level=%d cpu=%d lock contended:%d".
 * No path string is recorded by this class.
 */
DECLARE_EVENT_CLASS(cgroup_rstat,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended),

        TP_STRUCT__entry(
                __field(        int,                root                        )
                __field(        int,                level                        )
                __field(        u64,                id                        )
                __field(        int,                cpu                        )
                __field(        bool,                contended                )
        ),

        TP_fast_assign(
                __entry->root = cgrp->root->hierarchy_id;
                __entry->id = cgroup_id(cgrp);
                __entry->level = cgrp->level;
                __entry->cpu = cpu;
                __entry->contended = contended;
        ),

        TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d",
                  __entry->root, __entry->id, __entry->level,
                  __entry->cpu, __entry->contended)
);

/* Related to global: cgroup_rstat_lock */
/* cgroup_rstat_lock_contended tracepoint; records the cgroup_rstat class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* cgroup_rstat_locked tracepoint; records the cgroup_rstat class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* cgroup_rstat_unlock tracepoint; records the cgroup_rstat class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* Related to per CPU: cgroup_rstat_cpu_lock */
/* cgroup_rstat_cpu_lock_contended tracepoint; records the cgroup_rstat class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* Fastpath-named variant of cgroup_rstat_cpu_lock_contended; same class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* cgroup_rstat_cpu_locked tracepoint; records the cgroup_rstat class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* Fastpath-named variant of cgroup_rstat_cpu_locked; same class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* cgroup_rstat_cpu_unlock tracepoint; records the cgroup_rstat class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

/* Fastpath-named variant of cgroup_rstat_cpu_unlock; same class fields. */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,

        TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),

        TP_ARGS(cgrp, cpu, contended)
);

#endif /* _TRACE_CGROUP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>

















































































































































































































































































































































































































































































































































































































































































































































































































   56 


   57 
   56 

   56 
   56 






   57 
















    1 




    1 





















































































































































































































































































    1 












































































































































































































































































































































































    1 

   21 









   24 





   24 

























































    5 







    5 



    5 



























































    6 
    6 



    6 





    6 



    1 








    5 



























































   29 















   29 








   28 











   29 





   28 













   22 




    6 

    1 

    5 



















   22 




    1 





   21 

   21 



































   24 











   24 




   24 
   24 







   24 
   24 
   24 















   24 



   23 


























































   23 







   24 

   24 





    2 



    2 
    2 


    4 








   28 









   29 




   29 






















































































































































































































































































    3 
















    3 




    3 



    3 












    3 























   26 






   26 
























































































































































































































































































































































































































































































    1 
    1 








    1 


    1 





























    1 








    1 



    1 
    1 













































































    2 



    2 






















































   32 













    2 
    1 
    3 


   30 






    1 






    2 























    1 

















   26 







    1 


    1 

    1 







    1 












    1 


















    1 












    2 






    1 





















    1 



    1 







    1 















    1 





    1 


















    1 











    1 







    1 





    1 



    1 




    2 









    1 



    7 



    1 





   28 





   33 




































































































































































































































































































    3 







    3 




































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  TUN - Universal TUN/TAP device driver.
 *  Copyright (C) 1999-2002 Maxim Krasnyansky <maxk@qualcomm.com>
 *
 *  $Id: tun.c,v 1.15 2002/03/01 02:44:24 maxk Exp $
 */

/*
 *  Changes:
 *
 *  Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
 *    Add TUNSETLINK ioctl to set the link encapsulation
 *
 *  Mark Smith <markzzzsmith@yahoo.com.au>
 *    Use eth_random_addr() for tap MAC address.
 *
 *  Harald Roelle <harald.roelle@ifi.lmu.de>  2004/04/20
 *    Fixes in packet dropping, queue length setting and queue wakeup.
 *    Increased default tx queue length.
 *    Added ethtool API.
 *    Minor cleanups
 *
 *  Daniel Podlejski <underley@underley.eu.org>
 *    Modifications for 2.3.99-pre5 kernel.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#define DRV_NAME        "tun"
#define DRV_VERSION        "1.6"
#define DRV_DESCRIPTION        "Universal TUN/TAP device driver"
#define DRV_COPYRIGHT        "(C) 1999-2004 Max Krasnyansky <maxk@qualcomm.com>"

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/major.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fcntl.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/miscdevice.h>
#include <linux/ethtool.h>
#include <linux/rtnetlink.h>
#include <linux/compat.h>
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_tun.h>
#include <linux/if_vlan.h>
#include <linux/crc32.h>
#include <linux/math.h>
#include <linux/nsproxy.h>
#include <linux/virtio_net.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <net/ip_tunnels.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/mutex.h>
#include <linux/ieee802154.h>
#include <uapi/linux/if_ltalk.h>
#include <uapi/linux/if_fddi.h>
#include <uapi/linux/if_hippi.h>
#include <uapi/linux/if_fc.h>
#include <net/ax25.h>
#include <net/rose.h>
#include <net/6lowpan.h>
#include <net/rps.h>

#include <linux/uaccess.h>
#include <linux/proc_fs.h>

#include "tun_vnet.h"

static void tun_default_link_ksettings(struct net_device *dev,
                                       struct ethtool_link_ksettings *cmd);

#define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)

/* TUN device flags */

/* IFF_ATTACH_QUEUE is never stored in device flags,
 * overload it to mean fasync when stored there.
 */
#define TUN_FASYNC        IFF_ATTACH_QUEUE

#define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
                      IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS)

#define GOODCOPY_LEN 128

/* Number of slots in the exact-match address table of struct tap_filter. */
#define FLT_EXACT_COUNT 8
/* Address filter state: a table of up to FLT_EXACT_COUNT ETH_ALEN-byte
 * addresses plus a 64-bit (2 x u32) hash mask.
 */
struct tap_filter {
        unsigned int    count;    /* Number of addrs. Zero means disabled */
        u32             mask[2];  /* Mask of the hashed addrs */
        unsigned char        addr[FLT_EXACT_COUNT][ETH_ALEN];
};

/* MAX_TAP_QUEUES 256 is chosen to allow rx/tx queues to be equal
 * to max number of VCPUs in guest. */
#define MAX_TAP_QUEUES 256
#define MAX_TAP_FLOWS  4096

#define TUN_FLOW_EXPIRE (3 * HZ)

/* A tun_file connects an open character device to a tuntap netdevice. It
 * also contains all socket related structures (except sock_fprog and tap_filter)
 * to serve as one transmit queue for tuntap device. The sock_fprog and
 * tap_filter were kept in tun_struct since they were used for filtering for the
 * netdevice not for a specific queue (at least I didn't see the requirement for
 * this).
 *
 * RCU usage:
 * The tun_file and tun_struct are loosely coupled, the pointer from one to the
 * other can only be read while rcu_read_lock or rtnl_lock is held.
 */
struct tun_file {
        /* sk.sk_write_queue is the list tun_napi_receive() drains into GRO */
        struct sock sk;
        struct socket socket;
        /* RCU-protected pointer to the device; see "RCU usage" above */
        struct tun_struct __rcu *tun;
        struct fasync_struct *fasync;
        /* only used for fasync */
        unsigned int flags;
        /* NOTE(review): which member of this union is live at a given time is
         * not visible here - confirm against the attach/detach code.
         */
        union {
                u16 queue_index;
                unsigned int ifindex;
        };
        /* NAPI context; tun_napi_receive() recovers the tun_file from it via
         * container_of()
         */
        struct napi_struct napi;
        /* Set by tun_napi_init(); gates tun_napi_enable/disable/del */
        bool napi_enabled;
        /* True only when both napi_en and napi_frags were requested */
        bool napi_frags_enabled;
        struct mutex napi_mutex;        /* Protects access to the above napi */
        struct list_head next;
        struct tun_struct *detached;
        struct ptr_ring tx_ring;
        struct xdp_rxq_info xdp_rxq;
};

/* A page pointer paired with an integer count.
 * NOTE(review): the users of this struct are outside this part of the file;
 * presumably count tracks references taken on page - confirm.
 */
struct tun_page {
        struct page *page;
        int count;
};

/* One entry of the per-device flow table (tun_struct::flows). Entries are
 * allocated in tun_flow_create() and released through kfree_rcu() in
 * tun_flow_delete().
 */
struct tun_flow_entry {
        struct hlist_node hash_link;    /* linkage in a tun->flows[] bucket */
        struct rcu_head rcu;            /* used by kfree_rcu() on delete */
        struct tun_struct *tun;         /* device that owns this entry */

        u32 rxhash;                     /* key compared by tun_flow_find() */
        u32 rps_rxhash;                 /* initialised to 0 on creation */
        int queue_index;                /* queue this flow is associated with */
        /* jiffies timestamp; set at creation and compared against the ageing
         * delay by tun_flow_cleanup()
         */
        unsigned long updated ____cacheline_aligned_in_smp;
};

#define TUN_NUM_FLOW_ENTRIES 1024
#define TUN_MASK_FLOW_ENTRIES (TUN_NUM_FLOW_ENTRIES - 1)

/* A BPF program pointer bundled with an rcu_head; tun_struct holds __rcu
 * pointers to this type for steering_prog and filter_prog.
 */
struct tun_prog {
        struct rcu_head rcu;
        struct bpf_prog *prog;
};

/* Since the socket was moved to tun_file, the socket filter, sndbuf and vnet
 * header size are restored when a file is attached to a persistent device,
 * in order to preserve the behavior of persistent devices.
 */
struct tun_struct {
        /* Per-queue files, RCU-protected; at most MAX_TAP_QUEUES entries */
        struct tun_file __rcu        *tfiles[MAX_TAP_QUEUES];
        unsigned int            numqueues;
        unsigned int                 flags;
        kuid_t                        owner;
        kgid_t                        group;

        struct net_device        *dev;
        netdev_features_t        set_features;
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
                          NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4)

        int                        align;
        int                        vnet_hdr_sz;
        int                        sndbuf;
        struct tap_filter        txflt;
        struct sock_fprog        fprog;
        /* protected by rtnl lock */
        bool                        filter_attached;
        u32                        msg_enable;
        /* Held while the flow table below is modified (see tun_flow_flush(),
         * tun_flow_delete_by_queue() and tun_flow_cleanup())
         */
        spinlock_t lock;
        /* Flow table: TUN_NUM_FLOW_ENTRIES buckets of tun_flow_entry */
        struct hlist_head flows[TUN_NUM_FLOW_ENTRIES];
        /* Timer whose callback is tun_flow_cleanup() */
        struct timer_list flow_gc_timer;
        /* Expiry delay, in jiffies, applied by tun_flow_cleanup() */
        unsigned long ageing_time;
        unsigned int numdisabled;
        struct list_head disabled;
        void *security;
        /* Number of live flow entries; adjusted in tun_flow_create() and
         * tun_flow_delete()
         */
        u32 flow_count;
        u32 rx_batched;
        atomic_long_t rx_frame_errors;
        struct bpf_prog __rcu *xdp_prog;
        struct tun_prog __rcu *steering_prog;
        struct tun_prog __rcu *filter_prog;
        struct ethtool_link_ksettings link_ksettings;
        /* init args */
        struct file *file;
        struct ifreq *ifr;
};

/* Two big-endian 16-bit fields; going by the member names these are the
 * VLAN protocol and TCI words of a VLAN tag - TODO confirm against users.
 */
struct veth {
        __be16 h_vlan_proto;
        __be16 h_vlan_TCI;
};

static void tun_flow_init(struct tun_struct *tun);
static void tun_flow_uninit(struct tun_struct *tun);

static int tun_napi_receive(struct napi_struct *napi, int budget)
{
        struct tun_file *tfile = container_of(napi, struct tun_file, napi);
        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
        struct sk_buff_head process_queue;
        struct sk_buff *skb;
        int received = 0;

        __skb_queue_head_init(&process_queue);

        spin_lock(&queue->lock);
        skb_queue_splice_tail_init(queue, &process_queue);
        spin_unlock(&queue->lock);

        while (received < budget && (skb = __skb_dequeue(&process_queue))) {
                napi_gro_receive(napi, skb);
                ++received;
        }

        if (!skb_queue_empty(&process_queue)) {
                spin_lock(&queue->lock);
                skb_queue_splice(&process_queue, queue);
                spin_unlock(&queue->lock);
        }

        return received;
}

/* NAPI poll callback: process up to @budget skbs and complete the NAPI
 * instance when fewer than @budget were processed.
 */
static int tun_napi_poll(struct napi_struct *napi, int budget)
{
        unsigned int received = tun_napi_receive(napi, budget);

        /* Full budget consumed: leave NAPI uncompleted. */
        if (received >= budget)
                return received;

        napi_complete_done(napi, received);
        return received;
}

/* Record the NAPI settings on @tfile and, when NAPI is requested, register
 * and enable the file's NAPI instance on @tun's net device.
 *
 * napi_frags_enabled ends up true only if both @napi_en and @napi_frags are.
 */
static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile,
                          bool napi_en, bool napi_frags)
{
        tfile->napi_enabled = napi_en;

        if (!napi_en) {
                tfile->napi_frags_enabled = false;
                return;
        }

        tfile->napi_frags_enabled = napi_frags;
        netif_napi_add_tx(tun->dev, &tfile->napi, tun_napi_poll);
        napi_enable(&tfile->napi);
}

/* Enable the file's NAPI instance; no-op if NAPI is not in use for @tfile. */
static void tun_napi_enable(struct tun_file *tfile)
{
        if (!tfile->napi_enabled)
                return;

        napi_enable(&tfile->napi);
}

/* Disable the file's NAPI instance; no-op if NAPI is not in use for @tfile. */
static void tun_napi_disable(struct tun_file *tfile)
{
        if (!tfile->napi_enabled)
                return;

        napi_disable(&tfile->napi);
}

/* Unregister the file's NAPI instance; no-op if NAPI is not in use for
 * @tfile.
 */
static void tun_napi_del(struct tun_file *tfile)
{
        if (!tfile->napi_enabled)
                return;

        netif_napi_del(&tfile->napi);
}

/* Report whether NAPI frags mode was enabled for @tfile by tun_napi_init(). */
static bool tun_napi_frags_enabled(const struct tun_file *tfile)
{
        bool frags_on = tfile->napi_frags_enabled;

        return frags_on;
}

/* Reduce @rxhash to a value in [0, TUN_NUM_FLOW_ENTRIES) by masking with
 * TUN_MASK_FLOW_ENTRIES.
 */
static inline u32 tun_hashfn(u32 rxhash)
{
        u32 bucket = rxhash & TUN_MASK_FLOW_ENTRIES;

        return bucket;
}

/* Walk the bucket @head and return the first entry whose rxhash equals
 * @rxhash, or NULL when the bucket holds no such entry.
 */
static struct tun_flow_entry *tun_flow_find(struct hlist_head *head, u32 rxhash)
{
        struct tun_flow_entry *match = NULL;
        struct tun_flow_entry *pos;

        hlist_for_each_entry_rcu(pos, head, hash_link) {
                if (pos->rxhash != rxhash)
                        continue;
                match = pos;
                break;
        }

        return match;
}

/* Allocate a flow entry for @rxhash mapped to @queue_index, add it to the
 * front of bucket @head and bump tun->flow_count.
 *
 * The allocation uses GFP_ATOMIC. Returns the new entry, or NULL if the
 * allocation failed (in which case nothing is modified).
 */
static struct tun_flow_entry *tun_flow_create(struct tun_struct *tun,
                                              struct hlist_head *head,
                                              u32 rxhash, u16 queue_index)
{
        struct tun_flow_entry *flow;

        flow = kmalloc(sizeof(*flow), GFP_ATOMIC);
        if (!flow)
                return NULL;

        netif_info(tun, tx_queued, tun->dev,
                   "create flow: hash %u index %u\n",
                   rxhash, queue_index);

        /* Fully initialise the entry before publishing it on the list. */
        flow->updated = jiffies;
        flow->rxhash = rxhash;
        flow->rps_rxhash = 0;
        flow->queue_index = queue_index;
        flow->tun = tun;
        hlist_add_head_rcu(&flow->hash_link, head);
        ++tun->flow_count;

        return flow;
}

/* Unlink flow entry @e from its RCU hash chain, queue it for freeing via
 * kfree_rcu() and decrement tun->flow_count.
 */
static void tun_flow_delete(struct tun_struct *tun, struct tun_flow_entry *e)
{
        netif_info(tun, tx_queued, tun->dev,
                   "delete flow: hash %u index %u\n",
                   e->rxhash, e->queue_index);

        hlist_del_rcu(&e->hash_link);
        kfree_rcu(e, rcu);

        tun->flow_count--;
}

static void tun_flow_flush(struct tun_struct *tun)
{
        int i;

        spin_lock_bh(&tun->lock);
        for (i = 0; i < TUN_NUM_FLOW_ENTRIES; i++) {
                struct tun_flow_entry *e;
                struct hlist_node *n;

                hlist_for_each_entry_safe(e, n, &tun->flows[i], hash_link)
                        tun_flow_delete(tun, e);
        }
        spin_unlock_bh(&tun->lock);
}

/* Delete every flow entry of @tun that steers to @queue_index, holding
 * tun->lock with BHs disabled.
 */
static void tun_flow_delete_by_queue(struct tun_struct *tun, u16 queue_index)
{
        struct tun_flow_entry *entry;
        struct hlist_node *next;
        int bucket;

        spin_lock_bh(&tun->lock);

        for (bucket = 0; bucket < TUN_NUM_FLOW_ENTRIES; bucket++) {
                hlist_for_each_entry_safe(entry, next, &tun->flows[bucket],
                                          hash_link) {
                        if (entry->queue_index != queue_index)
                                continue;
                        tun_flow_delete(tun, entry);
                }
        }

        spin_unlock_bh(&tun->lock);
}

/* Flow GC timer callback.
 *
 * Deletes every flow entry whose last update is at least tun->ageing_time
 * old. If any entries survive, the timer is re-armed for the earliest
 * remaining expiry (rounded up by round_jiffies_up()).
 */
static void tun_flow_cleanup(struct timer_list *t)
{
        struct tun_struct *tun = from_timer(tun, t, flow_gc_timer);
        unsigned long delay = tun->ageing_time;
        unsigned long next_timer = jiffies + delay;
        unsigned long survivors = 0;
        struct tun_flow_entry *entry;
        struct hlist_node *next;
        int bucket;

        spin_lock(&tun->lock);

        for (bucket = 0; bucket < TUN_NUM_FLOW_ENTRIES; bucket++) {
                hlist_for_each_entry_safe(entry, next, &tun->flows[bucket],
                                          hash_link) {
                        unsigned long expiry = entry->updated + delay;

                        if (!time_before_eq(expiry, jiffies)) {
                                /* Still alive: track the soonest expiry. */
                                survivors++;
                                if (time_before(expiry, next_timer))
                                        next_timer = expiry;
                        } else {
                                tun_flow_delete(tun, entry);
                        }
                }
        }

        if (survivors)
                mod_timer(&tun->flow_gc_timer, round_jiffies_up(next_timer));

        spin_unlock(&tun->lock);
}

/* Record that a packet with flow hash @rxhash was seen on @tfile's queue.
 *
 * If an entry for @rxhash already exists it is refreshed without taking
 * tun->lock (under rcu_read_lock() only): its queue index is pointed at
 * @tfile's queue, its timestamp is bumped and its saved RPS hash is passed
 * to sock_rps_record_flow_hash(). Otherwise a new entry is created under
 * tun->lock, provided the table holds fewer than MAX_TAP_FLOWS entries, and
 * the GC timer is armed if it is not already pending.
 */
static void tun_flow_update(struct tun_struct *tun, u32 rxhash,
                            struct tun_file *tfile)
{
        struct hlist_head *head;
        struct tun_flow_entry *e;
        unsigned long delay = tun->ageing_time;
        u16 queue_index = tfile->queue_index;

        head = &tun->flows[tun_hashfn(rxhash)];

        rcu_read_lock();

        e = tun_flow_find(head, rxhash);
        if (likely(e)) {
                /* TODO: keep queueing to old queue until it's empty? */
                /* Only write when the value changes, so an unchanged entry
                 * is not dirtied on every packet.
                 */
                if (READ_ONCE(e->queue_index) != queue_index)
                        WRITE_ONCE(e->queue_index, queue_index);
                if (e->updated != jiffies)
                        e->updated = jiffies;
                sock_rps_record_flow_hash(e->rps_rxhash);
        } else {
                spin_lock_bh(&tun->lock);
                /* Look up again under the lock: another context may have
                 * inserted the same hash since the lockless lookup above.
                 */
                if (!tun_flow_find(head, rxhash) &&
                    tun->flow_count < MAX_TAP_FLOWS)
                        tun_flow_create(tun, head, rxhash, queue_index);

                if (!timer_pending(&tun->flow_gc_timer))
                        mod_timer(&tun->flow_gc_timer,
                                  round_jiffies_up(jiffies + delay));
                spin_unlock_bh(&tun->lock);
        }

        rcu_read_unlock();
}

/* Remember, in flow entry @e, the hash seen on the stack receive path.
 * The field is only written when the value actually changes.
 */
static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
{
        if (likely(e->rps_rxhash == hash))
                return;

        e->rps_rxhash = hash;
}

/* Pick a TX queue for @skb using the automatic flow table.
 *
 * A flow is identified by its rxhash only. The rxq number is deliberately
 * not checked because some cards (e.g. 82599) choose the rxq based on the
 * txq the last packet of the flow came from; as the userspace application
 * moves between processors we may see a different rxq number here.
 *
 * When the flow is known, its hash is saved for RPS and its recorded queue
 * is returned; otherwise the hash is scaled onto the current queue count.
 */
static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
        struct tun_flow_entry *flow;
        u32 hash, numqueues;

        numqueues = READ_ONCE(tun->numqueues);
        hash = __skb_get_hash_symmetric(skb);

        flow = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);
        if (!flow)
                return reciprocal_scale(hash, numqueues);

        tun_flow_save_rps_rxhash(flow, hash);
        return flow->queue_index;
}

/* Pick a TX queue for @skb by running the attached steering program.
 *
 * Returns 0 when there are no queues. Otherwise returns the program's
 * result (0 if no program is attached) reduced modulo the queue count.
 */
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
        u32 numqueues = READ_ONCE(tun->numqueues);
        struct tun_prog *prog;
        u16 ret;

        if (numqueues == 0)
                return 0;

        prog = rcu_dereference(tun->steering_prog);
        ret = prog ? bpf_prog_run_clear_cb(prog->prog, skb) : 0;

        return ret % numqueues;
}

/* ndo_select_queue handler: use the eBPF steering program when one is
 * attached, the automatic flow table otherwise. Both run under
 * rcu_read_lock().
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
                            struct net_device *sb_dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        u16 queue;

        rcu_read_lock();
        queue = rcu_dereference(tun->steering_prog) ?
                tun_ebpf_select_queue(tun, skb) :
                tun_automq_select_queue(tun, skb);
        rcu_read_unlock();

        return queue;
}

static inline bool tun_not_capable(struct tun_struct *tun)
{
        const struct cred *cred = current_cred();
        struct net *net = dev_net(tun->dev);

        return ((uid_valid(tun->owner) && !uid_eq(cred->euid, tun->owner)) ||
                (gid_valid(tun->group) && !in_egroup_p(tun->group))) &&
                !ns_capable(net->user_ns, CAP_NET_ADMIN);
}

/* Tell the core that the device has tun->numqueues TX and RX queues. */
static void tun_set_real_num_queues(struct tun_struct *tun)
{
        struct net_device *dev = tun->dev;

        netif_set_real_num_tx_queues(dev, tun->numqueues);
        netif_set_real_num_rx_queues(dev, tun->numqueues);
}

/* Park @tfile on @tun's list of disabled queues: remember the device in
 * tfile->detached and account for it in tun->numdisabled.
 */
static void tun_disable_queue(struct tun_struct *tun, struct tun_file *tfile)
{
        tfile->detached = tun;
        list_add_tail(&tfile->next, &tun->disabled);
        tun->numdisabled++;
}

/* Take @tfile off its device's disabled list and return that device.
 * Clears tfile->detached and decrements the device's numdisabled count.
 */
static struct tun_struct *tun_enable_queue(struct tun_file *tfile)
{
        struct tun_struct *owner = tfile->detached;

        tfile->detached = NULL;
        list_del_init(&tfile->next);
        owner->numdisabled--;

        return owner;
}

/* Release one tx_ring entry. @ptr may be NULL (ignored), a tagged XDP
 * frame pointer (returned via xdp_return_frame()) or an skb pointer
 * (handed to __skb_array_destroy_skb()).
 */
void tun_ptr_free(void *ptr)
{
        if (!ptr)
                return;

        if (!tun_is_xdp_frame(ptr)) {
                __skb_array_destroy_skb(ptr);
                return;
        }

        xdp_return_frame(tun_ptr_to_xdp(ptr));
}
EXPORT_SYMBOL_GPL(tun_ptr_free);

/* Drop everything queued on @tfile: drain and free the tx_ring entries,
 * then purge the socket's write and error queues.
 */
static void tun_queue_purge(struct tun_file *tfile)
{
        void *entry;

        for (;;) {
                entry = ptr_ring_consume(&tfile->tx_ring);
                if (!entry)
                        break;
                tun_ptr_free(entry);
        }

        skb_queue_purge(&tfile->sk.sk_write_queue);
        skb_queue_purge(&tfile->sk.sk_error_queue);
}

/* Detach @tfile from its tun device.
 *
 * @clean: true for a final teardown of the queue (NAPI context deleted,
 *         references dropped, XDP rxq info unregistered, tx_ring cleaned
 *         up); false to only disable the queue and park @tfile on
 *         tun->disabled so it can be re-enabled later.
 *
 * Uses rtnl_dereference(), so the caller is expected to hold RTNL
 * (tun_detach() takes it around this call).
 */
static void __tun_detach(struct tun_file *tfile, bool clean)
{
        struct tun_file *ntfile;
        struct tun_struct *tun;

        tun = rtnl_dereference(tfile->tun);

        if (tun && clean) {
                /* NAPI is only disabled here for queues that are still
                 * enabled; it is deleted in either case.
                 */
                if (!tfile->detached)
                        tun_napi_disable(tfile);
                tun_napi_del(tfile);
        }

        if (tun && !tfile->detached) {
                u16 index = tfile->queue_index;
                BUG_ON(index >= tun->numqueues);

                /* Keep tfiles[] dense: move the last queue into the slot
                 * being vacated and fix up its indices.
                 */
                rcu_assign_pointer(tun->tfiles[index],
                                   tun->tfiles[tun->numqueues - 1]);
                ntfile = rtnl_dereference(tun->tfiles[index]);
                ntfile->queue_index = index;
                ntfile->xdp_rxq.queue_index = index;
                rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
                                   NULL);

                --tun->numqueues;
                if (clean) {
                        RCU_INIT_POINTER(tfile->tun, NULL);
                        sock_put(&tfile->sk);
                } else {
                        tun_disable_queue(tun, tfile);
                        tun_napi_disable(tfile);
                }

                synchronize_net();
                /* NOTE(review): numqueues was already decremented above, so
                 * this targets the index equal to the old queue count;
                 * confirm that this is the intended queue index.
                 */
                tun_flow_delete_by_queue(tun, tun->numqueues + 1);
                /* Drop read queue */
                tun_queue_purge(tfile);
                tun_set_real_num_queues(tun);
        } else if (tfile->detached && clean) {
                /* Final teardown of an already-disabled queue: take it off
                 * the disabled list and drop a socket reference.
                 */
                tun = tun_enable_queue(tfile);
                sock_put(&tfile->sk);
        }

        if (clean) {
                /* Last queue gone: carrier off, and unregister the device
                 * unless it is persistent or not (or no longer) registered.
                 */
                if (tun && tun->numqueues == 0 && tun->numdisabled == 0) {
                        netif_carrier_off(tun->dev);

                        if (!(tun->flags & IFF_PERSIST) &&
                            tun->dev->reg_state == NETREG_REGISTERED)
                                unregister_netdevice(tun->dev);
                }
                if (tun)
                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
                ptr_ring_cleanup(&tfile->tx_ring, tun_ptr_free);
        }
}

/* Detach @tfile from its device under RTNL and, if it was attached to a
 * device, signal a netdev state change for it. With @clean set, one more
 * socket reference is dropped after RTNL has been released.
 */
static void tun_detach(struct tun_file *tfile, bool clean)
{
        struct net_device *dev = NULL;
        struct tun_struct *tun;

        rtnl_lock();

        /* Sample the device before detaching clears tfile->tun. */
        tun = rtnl_dereference(tfile->tun);
        if (tun)
                dev = tun->dev;

        __tun_detach(tfile, clean);
        if (dev)
                netdev_state_change(dev);

        rtnl_unlock();

        if (!clean)
                return;

        sock_put(&tfile->sk);
}

/* Detach every queue (enabled and disabled) from @dev.
 *
 * Runs in two passes separated by synchronize_net(): first each tfile is
 * marked shut down, its reader is woken and its tfile->tun pointer is
 * cleared; then the per-queue resources are released. Uses
 * rtnl_dereference(), so the caller is expected to hold RTNL.
 */
static void tun_detach_all(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct tun_file *tfile, *tmp;
        /* Snapshot the queue count: numqueues is counted down to zero in
         * the first pass, while the second pass still walks tfiles[0..n).
         */
        int i, n = tun->numqueues;

        for (i = 0; i < n; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                BUG_ON(!tfile);
                tun_napi_disable(tfile);
                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
                RCU_INIT_POINTER(tfile->tun, NULL);
                --tun->numqueues;
        }
        list_for_each_entry(tfile, &tun->disabled, next) {
                tfile->socket.sk->sk_shutdown = RCV_SHUTDOWN;
                tfile->socket.sk->sk_data_ready(tfile->socket.sk);
                RCU_INIT_POINTER(tfile->tun, NULL);
        }
        BUG_ON(tun->numqueues != 0);

        synchronize_net();
        for (i = 0; i < n; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                tun_napi_del(tfile);
                /* Drop read queue */
                tun_queue_purge(tfile);
                xdp_rxq_info_unreg(&tfile->xdp_rxq);
                sock_put(&tfile->sk);
        }
        /* The _safe iterator is required: tun_enable_queue() unlinks the
         * entry from tun->disabled.
         */
        list_for_each_entry_safe(tfile, tmp, &tun->disabled, next) {
                tun_napi_del(tfile);
                tun_enable_queue(tfile);
                tun_queue_purge(tfile);
                xdp_rxq_info_unreg(&tfile->xdp_rxq);
                sock_put(&tfile->sk);
        }
        BUG_ON(tun->numdisabled != 0);

        if (tun->flags & IFF_PERSIST)
                module_put(THIS_MODULE);
}

/* Attach the queue behind @file to @tun as the device's next queue.
 *
 * @skip_filter: do not re-attach the device's saved socket filter.
 * @napi:        enable NAPI for a newly attached queue.
 * @napi_frags:  enable NAPI frags mode (only effective together with @napi).
 * @publish_tun: also publish tfile->tun; callers that pass false are
 *               presumably expected to publish it themselves later.
 *
 * A tfile that was previously disabled (tfile->detached set) is re-enabled
 * and keeps its ring, XDP rxq registration and NAPI context; a new tfile
 * gets them set up here and an extra socket reference is taken.
 *
 * Uses rtnl_dereference(), so the caller is expected to hold RTNL.
 *
 * Returns 0 on success or a negative errno: the security hook's error,
 * -EINVAL if the file is already attached, -EBUSY if a single-queue device
 * already has its queue, -E2BIG if MAX_TAP_QUEUES would be exceeded, the
 * filter attach error, -ENOMEM if the ring cannot be resized, or the XDP
 * rxq registration error.
 */
static int tun_attach(struct tun_struct *tun, struct file *file,
                      bool skip_filter, bool napi, bool napi_frags,
                      bool publish_tun)
{
        struct tun_file *tfile = file->private_data;
        struct net_device *dev = tun->dev;
        int err;

        err = security_tun_dev_attach(tfile->socket.sk, tun->security);
        if (err < 0)
                goto out;

        err = -EINVAL;
        if (rtnl_dereference(tfile->tun) && !tfile->detached)
                goto out;

        err = -EBUSY;
        if (!(tun->flags & IFF_MULTI_QUEUE) && tun->numqueues == 1)
                goto out;

        err = -E2BIG;
        if (!tfile->detached &&
            tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
                goto out;

        err = 0;

        /* Re-attach the filter to persist device */
        if (!skip_filter && (tun->filter_attached == true)) {
                lock_sock(tfile->socket.sk);
                err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
                release_sock(tfile->socket.sk);
                /* Bail out only when attaching the filter failed. On
                 * success we must carry on and actually attach the queue;
                 * stopping here would report success for a queue that was
                 * never added to the device.
                 */
                if (err)
                        goto out;
        }

        if (!tfile->detached &&
            ptr_ring_resize(&tfile->tx_ring, dev->tx_queue_len,
                            GFP_KERNEL, tun_ptr_free)) {
                err = -ENOMEM;
                goto out;
        }

        tfile->queue_index = tun->numqueues;
        tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;

        if (tfile->detached) {
                /* Re-attach detached tfile, updating XDP queue_index */
                WARN_ON(!xdp_rxq_info_is_reg(&tfile->xdp_rxq));

                if (tfile->xdp_rxq.queue_index    != tfile->queue_index)
                        tfile->xdp_rxq.queue_index = tfile->queue_index;
        } else {
                /* Setup XDP RX-queue info, for new tfile getting attached */
                err = xdp_rxq_info_reg(&tfile->xdp_rxq,
                                       tun->dev, tfile->queue_index, 0);
                if (err < 0)
                        goto out;
                err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
                                                 MEM_TYPE_PAGE_SHARED, NULL);
                if (err < 0) {
                        xdp_rxq_info_unreg(&tfile->xdp_rxq);
                        goto out;
                }
                err = 0;
        }

        if (tfile->detached) {
                tun_enable_queue(tfile);
                tun_napi_enable(tfile);
        } else {
                sock_hold(&tfile->sk);
                tun_napi_init(tun, tfile, napi, napi_frags);
        }

        if (rtnl_dereference(tun->xdp_prog))
                sock_set_flag(&tfile->sk, SOCK_XDP);

        /* device is allowed to go away first, so no need to hold extra
         * refcnt.
         */

        /* Publish tfile->tun and tun->tfiles only after we've fully
         * initialized tfile; otherwise we risk using half-initialized
         * object.
         */
        if (publish_tun)
                rcu_assign_pointer(tfile->tun, tun);
        rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
        tun->numqueues++;
        tun_set_real_num_queues(tun);
out:
        return err;
}

static struct tun_struct *tun_get(struct tun_file *tfile)
{
        struct tun_struct *tun;

        rcu_read_lock();
        tun = rcu_dereference(tfile->tun);
        if (tun)
                dev_hold(tun->dev);
        rcu_read_unlock();

        return tun;
}

/* Drop the net_device reference taken by tun_get(). */
static void tun_put(struct tun_struct *tun)
{
        struct net_device *dev = tun->dev;

        dev_put(dev);
}

/* TAP filtering */
/* Set the hash-filter bit for Ethernet address @addr in @mask.
 *
 * The bit index is the top 6 bits of the address CRC (0..63), so @mask
 * must hold at least two 32-bit words.
 */
static void addr_hash_set(u32 *mask, const u8 *addr)
{
        int n = ether_crc(ETH_ALEN, addr) >> 26;

        /* Unsigned constant: shifting a signed 1 into bit 31 is undefined
         * behaviour in ISO C.
         */
        mask[n >> 5] |= (1U << (n & 31));
}

/* Test the hash-filter bit for Ethernet address @addr in @mask.
 *
 * Returns non-zero (the bit's value within its 32-bit word) when the bit is
 * set, 0 otherwise. The bit index is the top 6 bits of the address CRC.
 */
static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
{
        int n = ether_crc(ETH_ALEN, addr) >> 26;

        /* Unsigned constant: shifting a signed 1 into bit 31 is undefined
         * behaviour in ISO C.
         */
        return mask[n >> 5] & (1U << (n & 31));
}

/* Replace the contents of @filter with the filter description at @arg.
 *
 * @arg points to a struct tun_filter header in user memory, immediately
 * followed by uf.count Ethernet addresses. The first FLT_EXACT_COUNT
 * addresses become exact-match entries; the rest must be multicast and are
 * folded into the hash mask.
 *
 * Returns the number of exact filters installed, 0 if the filter was
 * disabled (count of zero) or left disabled (a non-multicast address among
 * the hashed ones), -EFAULT if the header cannot be copied, or the error
 * from memdup_user().
 */
static int update_filter(struct tap_filter *filter, void __user *arg)
{
        struct { u8 u[ETH_ALEN]; } *addr;
        struct tun_filter uf;
        int err, alen, n, nexact;

        if (copy_from_user(&uf, arg, sizeof(uf)))
                return -EFAULT;

        if (!uf.count) {
                /* Disabled */
                filter->count = 0;
                return 0;
        }

        /* The address array follows the header in user memory. */
        alen = ETH_ALEN * uf.count;
        addr = memdup_user(arg + sizeof(uf), alen);
        if (IS_ERR(addr))
                return PTR_ERR(addr);

        /* The filter is updated without holding any locks. Which is
         * perfectly safe. We disable it first and in the worst
         * case we'll accept a few undesired packets. */
        filter->count = 0;
        wmb();

        /* Use first set of addresses as an exact filter */
        for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
                memcpy(filter->addr[n], addr[n].u, ETH_ALEN);

        nexact = n;

        /* Remaining multicast addresses are hashed,
         * unicast will leave the filter disabled. */
        memset(filter->mask, 0, sizeof(filter->mask));
        for (; n < uf.count; n++) {
                if (!is_multicast_ether_addr(addr[n].u)) {
                        err = 0; /* no filter */
                        goto free_addr;
                }
                addr_hash_set(filter->mask, addr[n].u);
        }

        /* For ALLMULTI just set the mask to all ones.
         * This overrides the mask populated above. */
        if ((uf.flags & TUN_FLT_ALLMULTI))
                memset(filter->mask, ~0, sizeof(filter->mask));

        /* Now enable the filter */
        wmb();
        filter->count = nexact;

        /* Return the number of exact filters */
        err = nexact;
free_addr:
        kfree(addr);
        return err;
}

/* Match the destination MAC of @skb against @filter.
 * Returns: 0 - drop, !=0 - accept
 */
static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
        /* eth_hdr(skb) cannot be used here because skb_mac_hdr() is
         * incorrect at this point, so read the header from skb->data.
         */
        const unsigned char *dest = ((struct ethhdr *) skb->data)->h_dest;
        int idx = 0;

        /* Exact match */
        while (idx < filter->count) {
                if (ether_addr_equal(dest, filter->addr[idx]))
                        return 1;
                idx++;
        }

        /* Inexact match (multicast only) */
        if (!is_multicast_ether_addr(dest))
                return 0;

        return addr_hash_test(filter->mask, dest);
}

/*
 * Decide whether @skb passes @filter. A filter with no exact entries
 * (count == 0) is treated as disabled and accepts everything.
 * Returns: 0 - drop, !=0 - accept
 */
static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
{
        return filter->count ? run_filter(filter, skb) : 1;
}

/* Network device part of the driver */

/* Forward declaration of the ethtool ops table. No initializer is given
 * here, so the full definition is presumably provided later in the file.
 */
static const struct ethtool_ops tun_ethtool_ops;

/* ndo_init handler shared by the TUN and TAP ops tables.
 *
 * Initializes the per-device lock, the security blob and the flow table,
 * sets the device feature masks, copies the TUN_FEATURES bits from the
 * creating ifreq into tun->flags and attaches the creating file as the
 * first queue.
 *
 * Returns 0 on success or a negative errno from the security allocation or
 * from tun_attach(); on attach failure the flow table and the security
 * blob are released again.
 */
static int tun_net_init(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct ifreq *ifr = tun->ifr;
        int err;

        spin_lock_init(&tun->lock);

        err = security_tun_dev_alloc_security(&tun->security);
        if (err < 0)
                return err;

        tun_flow_init(tun);

        dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
        dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
                           TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
                           NETIF_F_HW_VLAN_STAG_TX;
        dev->features = dev->hw_features;
        /* VLAN devices on top get the same features minus VLAN tag TX. */
        dev->vlan_features = dev->features &
                             ~(NETIF_F_HW_VLAN_CTAG_TX |
                               NETIF_F_HW_VLAN_STAG_TX);
        dev->lltx = true;

        tun->flags = (tun->flags & ~TUN_FEATURES) |
                      (ifr->ifr_flags & TUN_FEATURES);

        INIT_LIST_HEAD(&tun->disabled);
        /* publish_tun is false here: tfile->tun is not published by this
         * attach.
         */
        err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI,
                         ifr->ifr_flags & IFF_NAPI_FRAGS, false);
        if (err < 0) {
                tun_flow_uninit(tun);
                security_tun_dev_free_security(tun->security);
                return err;
        }
        return 0;
}

/* ndo_uninit handler: detach every queue (file) from the net device. */
static void tun_net_uninit(struct net_device *dev)
{
        tun_detach_all(dev);
}

/* ndo_open handler: start all TX queues of the device. Always returns 0. */
static int tun_net_open(struct net_device *dev)
{
        netif_tx_start_all_queues(dev);

        return 0;
}

/* ndo_stop handler: stop all TX queues of the device. Always returns 0. */
static int tun_net_close(struct net_device *dev)
{
        netif_tx_stop_all_queues(dev);
        return 0;
}

/* Transmit-path hook for automatic queue selection.
 *
 * With a single queue, select_queue is not called for the skb, so when RPS
 * is needed the flow hash is extracted here and saved into the matching
 * flow entry, if one exists. Compiles to nothing without CONFIG_RPS.
 */
static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{
#ifdef CONFIG_RPS
        struct tun_flow_entry *flow;
        __u32 hash;

        if (tun->numqueues != 1 || !static_branch_unlikely(&rps_needed))
                return;

        hash = __skb_get_hash_symmetric(skb);
        flow = tun_flow_find(&tun->flows[tun_hashfn(hash)], hash);
        if (!flow)
                return;

        tun_flow_save_rps_rxhash(flow, hash);
#endif
}

/* Run the device's eBPF filter program on @skb.
 *
 * Returns the program's result, or @len unchanged when no filter program
 * is attached. The caller treats the result as the length to keep, with 0
 * meaning drop.
 */
static unsigned int run_ebpf_filter(struct tun_struct *tun,
                                    struct sk_buff *skb,
                                    int len)
{
        struct tun_prog *prog = rcu_dereference(tun->filter_prog);

        if (!prog)
                return len;

        len = bpf_prog_run_clear_cb(prog->prog, skb);
        return len;
}

/* ndo_start_xmit handler: queue @skb on the tx_ring of the tun_file that
 * backs the skb's TX queue and wake up the reader.
 *
 * The packet is dropped, with a specific drop reason, if no file is
 * attached to the queue, if the TAP address filter, the socket filter or
 * the eBPF filter rejects it, if trimming or orphaning its frags fails, or
 * if the ring is full.
 *
 * Returns NETDEV_TX_OK when the skb was queued, NET_XMIT_DROP otherwise;
 * the skb is consumed in both cases.
 */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);
        enum skb_drop_reason drop_reason;
        int txq = skb->queue_mapping;
        struct netdev_queue *queue;
        struct tun_file *tfile;
        int len = skb->len;

        rcu_read_lock();
        tfile = rcu_dereference(tun->tfiles[txq]);

        /* Drop packet if interface is not attached */
        if (!tfile) {
                drop_reason = SKB_DROP_REASON_DEV_READY;
                goto drop;
        }

        /* The automatic flow table is only maintained when no steering
         * program is attached.
         */
        if (!rcu_dereference(tun->steering_prog))
                tun_automq_xmit(tun, skb);

        netif_info(tun, tx_queued, tun->dev, "%s %d\n", __func__, skb->len);

        /* Drop if the filter does not like it.
         * This is a noop if the filter is disabled.
         * Filter can be enabled only for the TAP devices. */
        if (!check_filter(&tun->txflt, skb)) {
                drop_reason = SKB_DROP_REASON_TAP_TXFILTER;
                goto drop;
        }

        if (tfile->socket.sk->sk_filter &&
            sk_filter(tfile->socket.sk, skb)) {
                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto drop;
        }

        /* The eBPF filter returns the length to keep; 0 means drop. */
        len = run_ebpf_filter(tun, skb, len);
        if (len == 0) {
                drop_reason = SKB_DROP_REASON_TAP_FILTER;
                goto drop;
        }

        if (pskb_trim(skb, len)) {
                drop_reason = SKB_DROP_REASON_NOMEM;
                goto drop;
        }

        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
                drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
                goto drop;
        }

        skb_tx_timestamp(skb);

        /* Orphan the skb - required as we might hang on to it
         * for indefinite time.
         */
        skb_orphan(skb);

        nf_reset_ct(skb);

        if (ptr_ring_produce(&tfile->tx_ring, skb)) {
                drop_reason = SKB_DROP_REASON_FULL_RING;
                goto drop;
        }

        /* dev->lltx requires to do our own update of trans_start */
        queue = netdev_get_tx_queue(dev, txq);
        txq_trans_cond_update(queue);

        /* Notify and wake up reader process */
        if (tfile->flags & TUN_FASYNC)
                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
        tfile->socket.sk->sk_data_ready(tfile->socket.sk);

        rcu_read_unlock();
        return NETDEV_TX_OK;

drop:
        /* Common drop path: account the drop, flag the TX error on the skb
         * and free it with the recorded reason.
         */
        dev_core_stats_tx_dropped_inc(dev);
        skb_tx_error(skb);
        kfree_skb_reason(skb, drop_reason);
        rcu_read_unlock();
        return NET_XMIT_DROP;
}

/* ndo_set_rx_mode handler for TAP devices; intentionally does nothing. */
static void tun_net_mclist(struct net_device *dev)
{
        /*
         * This callback is supposed to deal with mc filter in
         * _rx_ path and has nothing to do with the _tx_ path.
         * In rx path we always accept everything userspace gives us.
         */
}

/* ndo_fix_features handler: among the TUN_USER_FEATURES bits keep only
 * those present in tun->set_features; all other requested feature bits
 * pass through unchanged.
 */
static netdev_features_t tun_net_fix_features(struct net_device *dev,
        netdev_features_t features)
{
        struct tun_struct *tun = netdev_priv(dev);
        netdev_features_t user_allowed = features & tun->set_features;
        netdev_features_t non_user = features & ~TUN_USER_FEATURES;

        return user_allowed | non_user;
}

/* ndo_set_rx_headroom handler: store the requested headroom in tun->align,
 * raised to NET_SKB_PAD when the request is smaller than that.
 */
static void tun_set_headroom(struct net_device *dev, int new_hr)
{
        struct tun_struct *tun = netdev_priv(dev);

        if (new_hr < NET_SKB_PAD)
                tun->align = NET_SKB_PAD;
        else
                tun->align = new_hr;
}

/* ndo_get_stats64 handler: fill @stats from the per-CPU tstats and add the
 * device's own rx_frame_errors counter on top.
 */
static void
tun_net_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
{
        struct tun_struct *tun = netdev_priv(dev);
        unsigned long frame_errors;

        dev_get_tstats64(dev, stats);

        frame_errors = (unsigned long)atomic_long_read(&tun->rx_frame_errors);
        stats->rx_frame_errors += frame_errors;
}

static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                       struct netlink_ext_ack *extack)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct tun_file *tfile;
        struct bpf_prog *old_prog;
        int i;

        old_prog = rtnl_dereference(tun->xdp_prog);
        rcu_assign_pointer(tun->xdp_prog, prog);
        if (old_prog)
                bpf_prog_put(old_prog);

        for (i = 0; i < tun->numqueues; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                if (prog)
                        sock_set_flag(&tfile->sk, SOCK_XDP);
                else
                        sock_reset_flag(&tfile->sk, SOCK_XDP);
        }
        list_for_each_entry(tfile, &tun->disabled, next) {
                if (prog)
                        sock_set_flag(&tfile->sk, SOCK_XDP);
                else
                        sock_reset_flag(&tfile->sk, SOCK_XDP);
        }

        return 0;
}

/* ndo_bpf handler: only XDP_SETUP_PROG is supported; every other command
 * is rejected with -EINVAL.
 */
static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
        if (xdp->command != XDP_SETUP_PROG)
                return -EINVAL;

        return tun_xdp_set(dev, xdp->prog, xdp->extack);
}

/* ndo_change_carrier handler. Turning the carrier off always succeeds;
 * turning it on is refused with -EPERM while the device has no queues.
 */
static int tun_net_change_carrier(struct net_device *dev, bool new_carrier)
{
        struct tun_struct *tun;

        if (!new_carrier) {
                netif_carrier_off(dev);
                return 0;
        }

        tun = netdev_priv(dev);
        if (!tun->numqueues)
                return -EPERM;

        netif_carrier_on(dev);
        return 0;
}

/* net_device callbacks for IFF_TUN devices (installed by
 * tun_net_initialize()). Unlike the TAP table, it has no XDP, MAC address
 * or rx-mode handlers, and it provides ndo_get_stats64.
 */
static const struct net_device_ops tun_netdev_ops = {
        .ndo_init                = tun_net_init,
        .ndo_uninit                = tun_net_uninit,
        .ndo_open                = tun_net_open,
        .ndo_stop                = tun_net_close,
        .ndo_start_xmit                = tun_net_xmit,
        .ndo_fix_features        = tun_net_fix_features,
        .ndo_select_queue        = tun_select_queue,
        .ndo_set_rx_headroom        = tun_set_headroom,
        .ndo_get_stats64        = tun_net_get_stats64,
        .ndo_change_carrier        = tun_net_change_carrier,
};

static void __tun_xdp_flush_tfile(struct tun_file *tfile)
{
        /* Notify and wake up reader process */
        if (tfile->flags & TUN_FASYNC)
                kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
        tfile->socket.sk->sk_data_ready(tfile->socket.sk);
}

/* ndo_xdp_xmit handler: enqueue up to @n XDP frames on one queue's
 * tx_ring.
 *
 * The queue is chosen from the current CPU id modulo the number of queues.
 * Frames are produced in order until the ring refuses one; that frame is
 * counted as a TX drop and the rest are left untouched.
 *
 * Returns the number of frames queued, -EINVAL for unknown @flags, or
 * -ENXIO when the device has no queues.
 */
static int tun_xdp_xmit(struct net_device *dev, int n,
                        struct xdp_frame **frames, u32 flags)
{
        struct tun_struct *tun = netdev_priv(dev);
        struct tun_file *tfile;
        u32 numqueues;
        int nxmit = 0;
        int i;

        if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
                return -EINVAL;

        rcu_read_lock();

resample:
        numqueues = READ_ONCE(tun->numqueues);
        if (!numqueues) {
                rcu_read_unlock();
                return -ENXIO; /* Caller will free/return all frames */
        }

        tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
                                            numqueues]);
        /* An empty slot means the queue count was stale; sample again. */
        if (unlikely(!tfile))
                goto resample;

        /* Take the producer lock once for the whole batch and use the
         * unlocked __ptr_ring_produce() inside the loop.
         */
        spin_lock(&tfile->tx_ring.producer_lock);
        for (i = 0; i < n; i++) {
                struct xdp_frame *xdp = frames[i];
                /* Encode the XDP flag into lowest bit for consumer to differ
                 * XDP buffer from sk_buff.
                 */
                void *frame = tun_xdp_to_ptr(xdp);

                if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
                        dev_core_stats_tx_dropped_inc(dev);
                        break;
                }
                nxmit++;
        }
        spin_unlock(&tfile->tx_ring.producer_lock);

        if (flags & XDP_XMIT_FLUSH)
                __tun_xdp_flush_tfile(tfile);

        rcu_read_unlock();
        return nxmit;
}

/* Transmit a single XDP buffer back out through @dev.
 *
 * Returns -EOVERFLOW if the buffer cannot be converted to a frame,
 * otherwise the result of tun_xdp_xmit() for that one frame. When the
 * result is 0 (frame not queued) the frame is returned here.
 */
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
{
        struct xdp_frame *frame;
        int sent;

        frame = xdp_convert_buff_to_frame(xdp);
        if (unlikely(!frame))
                return -EOVERFLOW;

        sent = tun_xdp_xmit(dev, 1, &frame, XDP_XMIT_FLUSH);
        if (sent == 0)
                xdp_return_frame_rx_napi(frame);

        return sent;
}

/* net_device callbacks for IFF_TAP devices (installed by
 * tun_net_initialize()). On top of the handlers shared with the TUN table
 * it provides Ethernet address handling, rx-mode, features_check and the
 * XDP entry points (ndo_bpf, ndo_xdp_xmit); it has no ndo_get_stats64.
 */
static const struct net_device_ops tap_netdev_ops = {
        .ndo_init                = tun_net_init,
        .ndo_uninit                = tun_net_uninit,
        .ndo_open                = tun_net_open,
        .ndo_stop                = tun_net_close,
        .ndo_start_xmit                = tun_net_xmit,
        .ndo_fix_features        = tun_net_fix_features,
        .ndo_set_rx_mode        = tun_net_mclist,
        .ndo_set_mac_address        = eth_mac_addr,
        .ndo_validate_addr        = eth_validate_addr,
        .ndo_select_queue        = tun_select_queue,
        .ndo_features_check        = passthru_features_check,
        .ndo_set_rx_headroom        = tun_set_headroom,
        .ndo_bpf                = tun_xdp,
        .ndo_xdp_xmit                = tun_xdp_xmit,
        .ndo_change_carrier        = tun_net_change_carrier,
};

/* Set up the flow table of @tun: empty every hash bucket, set the ageing
 * time to TUN_FLOW_EXPIRE and arm the GC timer for one ageing period from
 * now.
 */
static void tun_flow_init(struct tun_struct *tun)
{
        int bucket = 0;

        while (bucket < TUN_NUM_FLOW_ENTRIES) {
                INIT_HLIST_HEAD(&tun->flows[bucket]);
                bucket++;
        }

        tun->ageing_time = TUN_FLOW_EXPIRE;

        timer_setup(&tun->flow_gc_timer, tun_flow_cleanup, 0);
        mod_timer(&tun->flow_gc_timer,
                  round_jiffies_up(jiffies + tun->ageing_time));
}

/* Tear down the flow table of @tun: stop the GC timer synchronously first,
 * then delete all remaining flow entries.
 */
static void tun_flow_uninit(struct tun_struct *tun)
{
        timer_delete_sync(&tun->flow_gc_timer);
        tun_flow_flush(tun);
}

/* MTU bounds applied in tun_net_initialize(): min_mtu is MIN_MTU and
 * max_mtu is MAX_MTU minus the device's hard_header_len.
 */
#define MIN_MTU 68
#define MAX_MTU 65535

/* Initialize the net device according to the device type in tun->flags.
 *
 * IFF_TUN devices get the TUN ops table and are set up as headerless
 * point-to-point devices with an MTU of 1500. IFF_TAP devices get the TAP
 * ops table, generic Ethernet setup, a random MAC address and the XDP
 * feature flags. For any other type the switch leaves the device
 * untouched. The MTU limits are set in every case.
 */
static void tun_net_initialize(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        switch (tun->flags & TUN_TYPE_MASK) {
        case IFF_TUN:
                dev->netdev_ops = &tun_netdev_ops;
                dev->header_ops = &ip_tunnel_header_ops;

                /* Point-to-Point TUN Device */
                dev->hard_header_len = 0;
                dev->addr_len = 0;
                dev->mtu = 1500;

                /* Zero header length */
                dev->type = ARPHRD_NONE;
                dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
                break;

        case IFF_TAP:
                dev->netdev_ops = &tap_netdev_ops;
                /* Ethernet TAP Device */
                ether_setup(dev);
                dev->priv_flags &= ~IFF_TX_SKB_SHARING;
                dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

                eth_hw_addr_random(dev);

                /* Currently tun does not support XDP, only tap does. */
                dev->xdp_features = NETDEV_XDP_ACT_BASIC |
                                    NETDEV_XDP_ACT_REDIRECT |
                                    NETDEV_XDP_ACT_NDO_XMIT;

                break;
        }

        /* max_mtu depends on hard_header_len, which was set per type in
         * the switch above.
         */
        dev->min_mtu = MIN_MTU;
        dev->max_mtu = MAX_MTU - dev->hard_header_len;
}

static bool tun_sock_writeable(struct tun_struct *tun, struct tun_file *tfile)
{
        struct sock *sk = tfile->socket.sk;

        return (tun->dev->flags & IFF_UP) && sock_writeable(sk);
}

/* Character device part */

/* Poll handler for the tun character device.
 *
 * Returns EPOLLERR when the file is not attached to a device or the device
 * is not in the NETREG_REGISTERED state. Otherwise reports
 * EPOLLIN | EPOLLRDNORM when the tx_ring holds packets to read and
 * EPOLLOUT | EPOLLWRNORM when the queue is writeable.
 */
static __poll_t tun_chr_poll(struct file *file, poll_table *wait)
{
        struct tun_file *tfile = file->private_data;
        /* Takes a device reference; released by tun_put() below. */
        struct tun_struct *tun = tun_get(tfile);
        struct sock *sk;
        __poll_t mask = 0;

        if (!tun)
                return EPOLLERR;

        sk = tfile->socket.sk;

        poll_wait(file, sk_sleep(sk), wait);

        if (!ptr_ring_empty(&tfile->tx_ring))
                mask |= EPOLLIN | EPOLLRDNORM;

        /* Make sure SOCKWQ_ASYNC_NOSPACE is set if not writable to
         * guarantee EPOLLOUT to be raised by either here or
         * tun_sock_write_space(). Then process could get notification
         * after it writes to a down device and meets -EIO.
         */
        /* Writeability is tested again after the bit has been set, to
         * catch the queue becoming writeable in between.
         */
        if (tun_sock_writeable(tun, tfile) ||
            (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
             tun_sock_writeable(tun, tfile)))
                mask |= EPOLLOUT | EPOLLWRNORM;

        /* An unregistered device overrides everything computed above. */
        if (tun->dev->reg_state != NETREG_REGISTERED)
                mask = EPOLLERR;

        tun_put(tun);
        return mask;
}

/* Prepare the NAPI frags skb so that its layout follows the caller's iovec:
 * the linear area is grown by the length of the first segment and every
 * following segment gets a fragment of exactly its own length.
 *
 * Returns the skb, or an ERR_PTR():
 *   -EMSGSIZE  too many segments, or @len above the size limit
 *   -ENOMEM    no frags skb or no fragment memory available
 *   -EINVAL    a trailing segment is empty or larger than PAGE_SIZE
 *   or whatever __skb_grow() reported.
 */
static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
                                            size_t len,
                                            const struct iov_iter *it)
{
        struct sk_buff *skb;
        size_t head_len;
        int seg;
        int err;

        if (len > (ETH_MAX_MTU - NET_SKB_PAD - NET_IP_ALIGN) ||
            it->nr_segs > MAX_SKB_FRAGS + 1)
                return ERR_PTR(-EMSGSIZE);

        local_bh_disable();
        skb = napi_get_frags(&tfile->napi);
        local_bh_enable();
        if (!skb)
                return ERR_PTR(-ENOMEM);

        head_len = iov_iter_single_seg_count(it);
        err = __skb_grow(skb, head_len);
        if (err)
                goto free;

        skb->len = len;
        skb->data_len = len - head_len;
        skb->truesize += skb->data_len;

        /* Segment 0 is the linear part; segment N maps to frag N - 1. */
        for (seg = 1; seg < it->nr_segs; seg++) {
                size_t fragsz = (iter_iov(it) + seg)->iov_len;
                struct page *page;
                void *frag;

                if (!fragsz || fragsz > PAGE_SIZE) {
                        err = -EINVAL;
                        goto free;
                }

                frag = netdev_alloc_frag(fragsz);
                if (!frag) {
                        err = -ENOMEM;
                        goto free;
                }

                page = virt_to_head_page(frag);
                skb_fill_page_desc(skb, seg - 1, page,
                                   frag - page_address(page), fragsz);
        }

        return skb;

free:
        /* Give up the frags skb together with the fragments attached so far. */
        napi_free_frags(&tfile->napi);
        return ERR_PTR(err);
}

/* Allocate an skb through the queue's socket.
 *
 * @prepad:  bytes to reserve at the front of the skb.
 * @len:     data length following the reserved area.
 * @linear:  hint for how much of @len to place in the linear area
 *           (usually the headers).
 * @noblock: passed through to sock_alloc_send_pskb().
 *
 * Returns the skb, or ERR_PTR() with the sock_alloc_send_pskb() error.
 */
static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
                                     size_t prepad, size_t len,
                                     size_t linear, int noblock)
{
        const size_t max_paged =
                MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER);
        struct sk_buff *skb;
        size_t paged;
        int err;

        /* Everything fits below one page: keep the skb fully linear. */
        if (prepad + len < PAGE_SIZE)
                linear = len;

        /* Cap the paged part; the excess is moved into the linear area. */
        if (len - linear > max_paged)
                linear = len - max_paged;

        paged = len - linear;
        skb = sock_alloc_send_pskb(tfile->socket.sk, prepad + linear, paged,
                                   noblock, &err, PAGE_ALLOC_COSTLY_ORDER);
        if (!skb)
                return ERR_PTR(err);

        skb_reserve(skb, prepad);
        skb_put(skb, linear);
        skb->data_len = paged;
        skb->len += paged;

        return skb;
}

static void tun_rx_batched(struct tun_struct *tun, struct tun_file *tfile,
                           struct sk_buff *skb, int more)
{
        struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
        struct sk_buff_head process_queue;
        u32 rx_batched = tun->rx_batched;
        bool rcv = false;

        if (!rx_batched || (!more && skb_queue_empty(queue))) {
                local_bh_disable();
                skb_record_rx_queue(skb, tfile->queue_index);
                netif_receive_skb(skb);
                local_bh_enable();
                return;
        }

        spin_lock(&queue->lock);
        if (!more || skb_queue_len(queue) == rx_batched) {
                __skb_queue_head_init(&process_queue);
                skb_queue_splice_tail_init(queue, &process_queue);
                rcv = true;
        } else {
                __skb_queue_tail(queue, skb);
        }
        spin_unlock(&queue->lock);

        if (rcv) {
                struct sk_buff *nskb;

                local_bh_disable();
                while ((nskb = __skb_dequeue(&process_queue))) {
                        skb_record_rx_queue(nskb, tfile->queue_index);
                        netif_receive_skb(nskb);
                }
                skb_record_rx_queue(skb, tfile->queue_index);
                netif_receive_skb(skb);
                local_bh_enable();
        }
}

/* Decide whether tun_get_user() may take the tun_build_skb() path.
 *
 * All of the following must hold: the device is a TAP device, the socket's
 * sk_sndbuf equals INT_MAX, the write is non-blocking, zerocopy is not in
 * use, and the aligned data (including TUN_RX_PAD and XDP_PACKET_HEADROOM)
 * plus the aligned skb_shared_info fit within PAGE_SIZE.
 */
static bool tun_can_build_skb(struct tun_struct *tun, struct tun_file *tfile,
                              int len, int noblock, bool zerocopy)
{
        bool is_tap = (tun->flags & TUN_TYPE_MASK) == IFF_TAP;

        if (!is_tap)
                return false;

        if (tfile->socket.sk->sk_sndbuf != INT_MAX)
                return false;

        if (zerocopy || !noblock)
                return false;

        return SKB_DATA_ALIGN(len + TUN_RX_PAD + XDP_PACKET_HEADROOM) +
               SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) <= PAGE_SIZE;
}

/* Wrap an already filled buffer from @alloc_frag in an skb.
 *
 * @buf/@buflen: start and total size of the buffer handed to build_skb().
 * @pad:         headroom to reserve before the packet data.
 * @len:         packet data length.
 * @metasize:    metadata length to record on the skb, 0 for none.
 *
 * On success a reference is taken on the fragment's page and the fragment
 * offset is advanced by @buflen. Returns the skb or ERR_PTR(-ENOMEM).
 */
static struct sk_buff *__tun_build_skb(struct tun_file *tfile,
                                       struct page_frag *alloc_frag, char *buf,
                                       int buflen, int len, int pad,
                                       int metasize)
{
        struct sk_buff *skb;

        skb = build_skb(buf, buflen);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        skb_reserve(skb, pad);
        skb_put(skb, len);
        if (metasize != 0)
                skb_metadata_set(skb, metasize);
        skb_set_owner_w(skb, tfile->socket.sk);

        /* The skb now uses this part of the page: pin it and move on. */
        get_page(alloc_frag->page);
        alloc_frag->offset += buflen;

        return skb;
}

/* Carry out the verdict @act returned by an XDP program for @xdp.
 *
 * Returns @act, or the error from xdp_do_redirect() (any non-zero value) or
 * tun_xdp_tx() (negative value). rx_dropped is incremented on those errors
 * and for XDP_DROP, XDP_ABORTED and unknown verdicts; rx stats are added for
 * a successful redirect or TX.
 */
static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
                       struct xdp_buff *xdp, u32 act)
{
        struct net_device *dev = tun->dev;
        int rc;

        switch (act) {
        case XDP_PASS:
                break;
        case XDP_REDIRECT:
                rc = xdp_do_redirect(dev, xdp, xdp_prog);
                if (rc) {
                        dev_core_stats_rx_dropped_inc(dev);
                        return rc;
                }
                dev_sw_netstats_rx_add(dev, xdp->data_end - xdp->data);
                break;
        case XDP_TX:
                rc = tun_xdp_tx(dev, xdp);
                if (rc < 0) {
                        dev_core_stats_rx_dropped_inc(dev);
                        return rc;
                }
                dev_sw_netstats_rx_add(dev, xdp->data_end - xdp->data);
                break;
        default:
                /* Unknown verdict: warn, then treat it like XDP_ABORTED. */
                bpf_warn_invalid_xdp_action(dev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception(dev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
                dev_core_stats_rx_dropped_inc(dev);
                break;
        }

        return act;
}

/* Copy a packet from @from into the current task's page fragment and, if an
 * XDP program is attached and the packet carries no GSO type, run that
 * program on the raw buffer before an skb is built around it.
 *
 * @hdr:     virtio-net header of the packet; only gso_type is inspected here.
 * @len:     number of payload bytes to copy from @from.
 * @skb_xdp: output. Set to 1 when XDP was not run here (GSO packet or no
 *           program seen at the first check), set to 0 otherwise.
 *
 * Returns:
 *   - an skb built by __tun_build_skb() when the packet should continue,
 *   - NULL when the packet does not continue (XDP verdict other than
 *     XDP_PASS, or tun_xdp_act() failed),
 *   - ERR_PTR(-ENOMEM) if the page fragment could not be refilled,
 *   - ERR_PTR(-EFAULT) if fewer than @len bytes could be copied.
 */
static struct sk_buff *tun_build_skb(struct tun_struct *tun,
                                     struct tun_file *tfile,
                                     struct iov_iter *from,
                                     struct virtio_net_hdr *hdr,
                                     int len, int *skb_xdp)
{
        struct page_frag *alloc_frag = &current->task_frag;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct bpf_prog *xdp_prog;
        int buflen = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        char *buf;
        size_t copied;
        int pad = TUN_RX_PAD;
        int metasize = 0;
        int err = 0;

        /* Peek at the XDP program only to size the headroom; the pointer is
         * re-read under RCU further down before it is actually run.
         */
        rcu_read_lock();
        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog)
                pad += XDP_PACKET_HEADROOM;
        buflen += SKB_DATA_ALIGN(len + pad);
        rcu_read_unlock();

        alloc_frag->offset = ALIGN((u64)alloc_frag->offset, SMP_CACHE_BYTES);
        if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, GFP_KERNEL)))
                return ERR_PTR(-ENOMEM);

        /* The payload lands @pad bytes into the buffer that starts at @buf. */
        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
        copied = copy_page_from_iter(alloc_frag->page,
                                     alloc_frag->offset + pad,
                                     len, from);
        if (copied != len)
                return ERR_PTR(-EFAULT);

        /* There's a small window that XDP may be set after the check
         * of xdp_prog above, this should be rare and for simplicity
         * we do XDP on skb in case the headroom is not enough.
         */
        if (hdr->gso_type || !xdp_prog) {
                *skb_xdp = 1;
                return __tun_build_skb(tfile, alloc_frag, buf, buflen, len,
                                       pad, metasize);
        }

        *skb_xdp = 0;

        local_bh_disable();
        rcu_read_lock();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog) {
                struct xdp_buff xdp;
                u32 act;

                xdp_init_buff(&xdp, buflen, &tfile->xdp_rxq);
                xdp_prepare_buff(&xdp, buf, pad, len, true);

                act = bpf_prog_run_xdp(xdp_prog, &xdp);
                /* For REDIRECT/TX take a page reference and advance the
                 * fragment offset before acting on the verdict; the
                 * reference is dropped again below if the action fails.
                 */
                if (act == XDP_REDIRECT || act == XDP_TX) {
                        get_page(alloc_frag->page);
                        alloc_frag->offset += buflen;
                }
                err = tun_xdp_act(tun, xdp_prog, &xdp, act);
                if (err < 0) {
                        if (act == XDP_REDIRECT || act == XDP_TX)
                                put_page(alloc_frag->page);
                        goto out;
                }

                if (err == XDP_REDIRECT)
                        xdp_do_flush();
                if (err != XDP_PASS)
                        goto out;

                /* XDP_PASS: pick up the data bounds from the xdp_buff after
                 * the program ran.
                 */
                pad = xdp.data - xdp.data_hard_start;
                len = xdp.data_end - xdp.data;

                /* It is known that the xdp_buff was prepared with metadata
                 * support, so the metasize will never be negative.
                 */
                metasize = xdp.data - xdp.data_meta;
        }
        bpf_net_ctx_clear(bpf_net_ctx);
        rcu_read_unlock();
        local_bh_enable();

        return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad,
                               metasize);

out:
        /* No skb is built on this path; the caller sees NULL. */
        bpf_net_ctx_clear(bpf_net_ctx);
        rcu_read_unlock();
        local_bh_enable();
        return NULL;
}

/* Get packet from user space buffer and inject it into the receive path.
 *
 * @tun:         device the packet is injected into.
 * @tfile:       queue (file) the write arrived on.
 * @msg_control: non-NULL enables the zerocopy attempt; it is later handed to
 *               skb_zcopy_init() or completed through its ubuf_info ops.
 * @from:        source iterator; optionally starts with a struct tun_pi
 *               (unless IFF_NO_PI) and a virtio-net header (if IFF_VNET_HDR).
 * @noblock:     non-zero for non-blocking allocation.
 * @more:        hint that further packets follow, used for batching.
 *
 * Returns the total number of bytes in @from at entry on success (including
 * the cases where an XDP program took the packet), or a negative errno.
 *
 * Error labels at the bottom, in fall-through order:
 *   drop:         count an rx drop (unless err is -EAGAIN),
 *   free_skb:     free the skb if it is a valid pointer,
 *   unlock_frags: in NAPI-frags mode clear napi.skb and drop napi_mutex.
 */
static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                            void *msg_control, struct iov_iter *from,
                            int noblock, bool more)
{
        struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
        struct sk_buff *skb;
        size_t total_len = iov_iter_count(from);
        size_t len = total_len, align = tun->align, linear;
        struct virtio_net_hdr gso = { 0 };
        int good_linear;
        int copylen;
        int hdr_len = 0;
        bool zerocopy = false;
        int err;
        u32 rxhash = 0;
        int skb_xdp = 1;
        bool frags = tun_napi_frags_enabled(tfile);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;

        /* Strip the packet-info header unless the device runs without it. */
        if (!(tun->flags & IFF_NO_PI)) {
                if (len < sizeof(pi))
                        return -EINVAL;
                len -= sizeof(pi);

                if (!copy_from_iter_full(&pi, sizeof(pi), from))
                        return -EFAULT;
        }

        /* Strip the virtio-net header; a negative result is an errno. */
        if (tun->flags & IFF_VNET_HDR) {
                int vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);

                hdr_len = tun_vnet_hdr_get(vnet_hdr_sz, tun->flags, from, &gso);
                if (hdr_len < 0)
                        return hdr_len;

                len -= vnet_hdr_sz;
        }

        /* TAP frames must hold at least an Ethernet header. */
        if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP) {
                align += NET_IP_ALIGN;
                if (unlikely(len < ETH_HLEN || (hdr_len && hdr_len < ETH_HLEN)))
                        return -EINVAL;
        }

        good_linear = SKB_MAX_HEAD(align);

        if (msg_control) {
                struct iov_iter i = *from;

                /* There are 256 bytes to be copied in skb, so there is
                 * enough room for skb expand head in case it is used.
                 * The rest of the buffer is mapped from userspace.
                 */
                copylen = min(hdr_len ? hdr_len : GOODCOPY_LEN, good_linear);
                linear = copylen;
                iov_iter_advance(&i, copylen);
                if (iov_iter_npages(&i, INT_MAX) <= MAX_SKB_FRAGS)
                        zerocopy = true;
        }

        if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
                /* For the packet that is not easy to be processed
                 * (e.g gso or jumbo packet), we will do it at after
                 * skb was created with generic XDP routine.
                 */
                skb = tun_build_skb(tun, tfile, from, &gso, len, &skb_xdp);
                err = PTR_ERR_OR_ZERO(skb);
                if (err)
                        goto drop;
                /* NULL without error: nothing left to deliver here. */
                if (!skb)
                        return total_len;
        } else {
                if (!zerocopy) {
                        copylen = len;
                        linear = min(hdr_len, good_linear);
                }

                if (frags) {
                        /* napi_mutex stays held until the frags skb has been
                         * handed to NAPI or an error path releases it.
                         */
                        mutex_lock(&tfile->napi_mutex);
                        skb = tun_napi_alloc_frags(tfile, copylen, from);
                        /* tun_napi_alloc_frags() enforces a layout for the skb.
                         * If zerocopy is enabled, then this layout will be
                         * overwritten by zerocopy_sg_from_iter().
                         */
                        zerocopy = false;
                } else {
                        if (!linear)
                                linear = min_t(size_t, good_linear, copylen);

                        skb = tun_alloc_skb(tfile, align, copylen, linear,
                                            noblock);
                }

                err = PTR_ERR_OR_ZERO(skb);
                if (err)
                        goto drop;

                if (zerocopy)
                        err = zerocopy_sg_from_iter(skb, from);
                else
                        err = skb_copy_datagram_from_iter(skb, 0, from, len);

                /* Any copy failure is reported to the caller as -EFAULT. */
                if (err) {
                        err = -EFAULT;
                        drop_reason = SKB_DROP_REASON_SKB_UCOPY_FAULT;
                        goto drop;
                }
        }

        /* A bad virtio-net header counts as a frame error, not as a drop. */
        if (tun_vnet_hdr_to_skb(tun->flags, skb, &gso)) {
                atomic_long_inc(&tun->rx_frame_errors);
                err = -EINVAL;
                goto free_skb;
        }

        switch (tun->flags & TUN_TYPE_MASK) {
        case IFF_TUN:
                if (tun->flags & IFF_NO_PI) {
                        /* Without a tun_pi header, take the protocol from
                         * the high nibble of the first payload byte.
                         */
                        u8 ip_version = skb->len ? (skb->data[0] >> 4) : 0;

                        switch (ip_version) {
                        case 4:
                                pi.proto = htons(ETH_P_IP);
                                break;
                        case 6:
                                pi.proto = htons(ETH_P_IPV6);
                                break;
                        default:
                                err = -EINVAL;
                                goto drop;
                        }
                }

                skb_reset_mac_header(skb);
                skb->protocol = pi.proto;
                skb->dev = tun->dev;
                break;
        case IFF_TAP:
                if (frags && !pskb_may_pull(skb, ETH_HLEN)) {
                        err = -ENOMEM;
                        drop_reason = SKB_DROP_REASON_HDR_TRUNC;
                        goto drop;
                }
                skb->protocol = eth_type_trans(skb, tun->dev);
                break;
        }

        /* copy skb_ubuf_info for callback when skb has no error */
        if (zerocopy) {
                skb_zcopy_init(skb, msg_control);
        } else if (msg_control) {
                /* Zerocopy was offered but not used: complete it right away. */
                struct ubuf_info *uarg = msg_control;
                uarg->ops->complete(NULL, uarg, false);
        }

        skb_reset_network_header(skb);
        skb_probe_transport_header(skb);
        skb_record_rx_queue(skb, tfile->queue_index);

        /* Generic (skb-based) XDP, skipped only when tun_build_skb() cleared
         * skb_xdp.
         */
        if (skb_xdp) {
                struct bpf_prog *xdp_prog;
                int ret;

                local_bh_disable();
                rcu_read_lock();
                xdp_prog = rcu_dereference(tun->xdp_prog);
                if (xdp_prog) {
                        ret = do_xdp_generic(xdp_prog, &skb);
                        if (ret != XDP_PASS) {
                                /* The skb is not freed on this path; err is 0
                                 * here, so the caller still sees total_len.
                                 */
                                rcu_read_unlock();
                                local_bh_enable();
                                goto unlock_frags;
                        }
                }
                rcu_read_unlock();
                local_bh_enable();
        }

        /* Compute the costly rx hash only if needed for flow updates.
         * We may get a very small possibility of OOO during switching, not
         * worth to optimize.
         */
        if (!rcu_access_pointer(tun->steering_prog) && tun->numqueues > 1 &&
            !tfile->detached)
                rxhash = __skb_get_hash_symmetric(skb);

        rcu_read_lock();
        if (unlikely(!(tun->dev->flags & IFF_UP))) {
                err = -EIO;
                rcu_read_unlock();
                drop_reason = SKB_DROP_REASON_DEV_READY;
                goto drop;
        }

        if (frags) {
                u32 headlen;

                /* Exercise flow dissector code path. */
                skb_push(skb, ETH_HLEN);
                headlen = eth_get_headlen(tun->dev, skb->data,
                                          skb_headlen(skb));

                if (unlikely(headlen > skb_headlen(skb))) {
                        WARN_ON_ONCE(1);
                        err = -ENOMEM;
                        dev_core_stats_rx_dropped_inc(tun->dev);
                        /* napi_busy is also reached by goto from the -EBUSY
                         * branch below; unlike the headlen failure above,
                         * that path does not increment rx_dropped.
                         */
napi_busy:
                        napi_free_frags(&tfile->napi);
                        rcu_read_unlock();
                        mutex_unlock(&tfile->napi_mutex);
                        return err;
                }

                if (likely(napi_schedule_prep(&tfile->napi))) {
                        local_bh_disable();
                        napi_gro_frags(&tfile->napi);
                        napi_complete(&tfile->napi);
                        local_bh_enable();
                } else {
                        err = -EBUSY;
                        goto napi_busy;
                }
                mutex_unlock(&tfile->napi_mutex);
        } else if (tfile->napi_enabled) {
                struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
                int queue_len;

                spin_lock_bh(&queue->lock);

                if (unlikely(tfile->detached)) {
                        spin_unlock_bh(&queue->lock);
                        rcu_read_unlock();
                        err = -EBUSY;
                        goto free_skb;
                }

                __skb_queue_tail(queue, skb);
                queue_len = skb_queue_len(queue);
                /* Only the lock is dropped here; bottom halves stay disabled
                 * until local_bh_enable() below, after napi_schedule().
                 */
                spin_unlock(&queue->lock);

                if (!more || queue_len > NAPI_POLL_WEIGHT)
                        napi_schedule(&tfile->napi);

                local_bh_enable();
        } else if (!IS_ENABLED(CONFIG_4KSTACKS)) {
                tun_rx_batched(tun, tfile, skb, more);
        } else {
                netif_rx(skb);
        }
        rcu_read_unlock();

        preempt_disable();
        dev_sw_netstats_rx_add(tun->dev, len);
        preempt_enable();

        if (rxhash)
                tun_flow_update(tun, rxhash, tfile);

        return total_len;

drop:
        if (err != -EAGAIN)
                dev_core_stats_rx_dropped_inc(tun->dev);

free_skb:
        /* skb may be an ERR_PTR or NULL when allocation itself failed. */
        if (!IS_ERR_OR_NULL(skb))
                kfree_skb_reason(skb, drop_reason);

unlock_frags:
        if (frags) {
                tfile->napi.skb = NULL;
                mutex_unlock(&tfile->napi_mutex);
        }

        return err ?: total_len;
}

/* write_iter handler for the character device: pass the user buffer to
 * tun_get_user() on the device attached to this file.
 *
 * Returns -EBADFD when no device is attached, otherwise the result of
 * tun_get_user(). The write is non-blocking if the file has O_NONBLOCK set
 * or the request carries IOCB_NOWAIT.
 */
static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = tun_get(tfile);
        ssize_t written;
        int noblock;

        if (!tun)
                return -EBADFD;

        noblock = ((file->f_flags & O_NONBLOCK) ||
                   (iocb->ki_flags & IOCB_NOWAIT)) ? 1 : 0;

        written = tun_get_user(tun, tfile, NULL, from, noblock, false);
        tun_put(tun);

        return written;
}

/* Copy one XDP frame to the reader's buffer.
 *
 * With IFF_VNET_HDR set, a zeroed virtio-net header of the configured size
 * is written first; a failure there is returned as is. Otherwise returns the
 * number of frame bytes copied plus the header size, and accounts the same
 * value in the tx statistics as one packet.
 */
static ssize_t tun_put_user_xdp(struct tun_struct *tun,
                                struct tun_file *tfile,
                                struct xdp_frame *xdp_frame,
                                struct iov_iter *iter)
{
        size_t frame_len = xdp_frame->len;
        int hdr_sz = 0;
        ssize_t copied;

        if (tun->flags & IFF_VNET_HDR) {
                struct virtio_net_hdr gso = { 0 };
                ssize_t rc;

                hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
                rc = tun_vnet_hdr_put(hdr_sz, iter, &gso);
                if (rc)
                        return rc;
        }

        copied = copy_to_iter(xdp_frame->data, frame_len, iter) + hdr_sz;

        preempt_disable();
        dev_sw_netstats_tx_add(tun->dev, 1, copied);
        preempt_enable();

        return copied;
}

/* Put packet to the user space buffer.
 *
 * Writes, in order: the tun_pi header (unless IFF_NO_PI), the virtio-net
 * header (if IFF_VNET_HDR), and the packet data. When the skb carries a VLAN
 * tag, the tag is inserted into the data at the h_vlan_proto offset.
 *
 * Returns the full length of headers plus packet (skb->len + VLAN tag +
 * vnet header + tun_pi), even if @iter was too small and the data got cut
 * short, or a negative errno if a header could not be written (-EINVAL when
 * @iter cannot hold the tun_pi header, -EFAULT when copying it fails, or the
 * error from the vnet header helpers).
 */
static ssize_t tun_put_user(struct tun_struct *tun,
                            struct tun_file *tfile,
                            struct sk_buff *skb,
                            struct iov_iter *iter)
{
        struct tun_pi pi = { 0, skb->protocol };
        ssize_t total;
        int vlan_offset = 0;
        int vlan_hlen = 0;
        int vnet_hdr_sz = 0;
        int ret;

        if (skb_vlan_tag_present(skb))
                vlan_hlen = VLAN_HLEN;

        if (tun->flags & IFF_VNET_HDR)
                vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);

        total = skb->len + vlan_hlen + vnet_hdr_sz;

        if (!(tun->flags & IFF_NO_PI)) {
                if (iov_iter_count(iter) < sizeof(pi))
                        return -EINVAL;

                total += sizeof(pi);
                if (iov_iter_count(iter) < total) {
                        /* Buffer is too small for the whole packet: tell the
                         * reader it was stripped.
                         */
                        pi.flags |= TUN_PKT_STRIP;
                }

                if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
                        return -EFAULT;
        }

        if (vnet_hdr_sz) {
                struct virtio_net_hdr gso;

                ret = tun_vnet_hdr_from_skb(tun->flags, tun->dev, skb, &gso);
                if (ret)
                        return ret;

                ret = tun_vnet_hdr_put(vnet_hdr_sz, iter, &gso);
                if (ret)
                        return ret;
        }

        if (vlan_hlen) {
                /* NOTE(review): this 'ret' shadows the function-scope one. */
                int ret;
                struct veth veth;

                veth.h_vlan_proto = skb->vlan_proto;
                veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));

                vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);

                /* Copy the bytes in front of the tag, then the tag itself;
                 * stop early on error or once the buffer is full.
                 */
                ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
                if (ret || !iov_iter_count(iter))
                        goto done;

                ret = copy_to_iter(&veth, sizeof(veth), iter);
                if (ret != sizeof(veth) || !iov_iter_count(iter))
                        goto done;
        }

        /* The result of this final copy is not checked. */
        skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);

done:
        /* caller is in process context */
        preempt_disable();
        dev_sw_netstats_tx_add(tun->dev, 1, skb->len + vlan_hlen);
        preempt_enable();

        return total;
}

/* Take one entry from the queue's tx_ring, sleeping for it if allowed.
 *
 * @noblock: when non-zero, do not sleep if the ring is empty.
 * @err:     output; always written. 0 when an entry is returned, otherwise
 *           -EAGAIN (empty ring and @noblock), -ERESTARTSYS (signal pending)
 *           or -EFAULT (socket has RCV_SHUTDOWN set).
 *
 * Returns the consumed pointer, or NULL with *@err set as above.
 */
static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
{
        DECLARE_WAITQUEUE(wait, current);
        void *ptr = NULL;
        int error = 0;

        /* Fast path: try once before touching the wait queue. */
        ptr = ptr_ring_consume(&tfile->tx_ring);
        if (ptr)
                goto out;
        if (noblock) {
                error = -EAGAIN;
                goto out;
        }

        add_wait_queue(&tfile->socket.wq.wait, &wait);

        while (1) {
                /* The task state is set before the ring is re-checked. */
                set_current_state(TASK_INTERRUPTIBLE);
                ptr = ptr_ring_consume(&tfile->tx_ring);
                if (ptr)
                        break;
                if (signal_pending(current)) {
                        error = -ERESTARTSYS;
                        break;
                }
                if (tfile->socket.sk->sk_shutdown & RCV_SHUTDOWN) {
                        error = -EFAULT;
                        break;
                }

                schedule();
        }

        __set_current_state(TASK_RUNNING);
        remove_wait_queue(&tfile->socket.wq.wait, &wait);

out:
        *err = error;
        return ptr;
}

static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
                           struct iov_iter *to,
                           int noblock, void *ptr)
{
        ssize_t ret;
        int err;

        if (!iov_iter_count(to)) {
                tun_ptr_free(ptr);
                return 0;
        }

        if (!ptr) {
                /* Read frames from ring */
                ptr = tun_ring_recv(tfile, noblock, &err);
                if (!ptr)
                        return err;
        }

        if (tun_is_xdp_frame(ptr)) {
                struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);

                ret = tun_put_user_xdp(tun, tfile, xdpf, to);
                xdp_return_frame(xdpf);
        } else {
                struct sk_buff *skb = ptr;

                ret = tun_put_user(tun, tfile, skb, to);
                if (unlikely(ret < 0))
                        kfree_skb(skb);
                else
                        consume_skb(skb);
        }

        return ret;
}

static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun = tun_get(tfile);
        ssize_t len = iov_iter_count(to), ret;
        int noblock = 0;

        if (!tun)
                return -EBADFD;

        if ((file->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT))
                noblock = 1;

        ret = tun_do_read(tun, tfile, to, noblock, NULL);
        ret = min_t(ssize_t, ret, len);
        if (ret > 0)
                iocb->ki_pos = ret;
        tun_put(tun);
        return ret;
}

/* RCU callback (queued by __tun_set_ebpf()): destroy the wrapped BPF program
 * and free the tun_prog wrapper that embeds @rcu.
 */
static void tun_prog_free(struct rcu_head *rcu)
{
        struct tun_prog *tprog;

        tprog = container_of(rcu, struct tun_prog, rcu);
        bpf_prog_destroy(tprog->prog);
        kfree(tprog);
}

/* Replace the program stored in the RCU-protected slot *@prog_p.
 *
 * @prog: new BPF program, or NULL to just clear the slot.
 *
 * The swap is done under tun->lock; the previous wrapper, if any, is freed
 * through call_rcu(). Returns 0, or -ENOMEM if the new wrapper could not be
 * allocated (the slot is then left untouched).
 */
static int __tun_set_ebpf(struct tun_struct *tun,
                          struct tun_prog __rcu **prog_p,
                          struct bpf_prog *prog)
{
        struct tun_prog *replacement = NULL;
        struct tun_prog *previous;

        if (prog) {
                replacement = kmalloc(sizeof(*replacement), GFP_KERNEL);
                if (!replacement)
                        return -ENOMEM;
                replacement->prog = prog;
        }

        spin_lock_bh(&tun->lock);
        previous = rcu_dereference_protected(*prog_p,
                                             lockdep_is_held(&tun->lock));
        rcu_assign_pointer(*prog_p, replacement);
        spin_unlock_bh(&tun->lock);

        if (previous)
                call_rcu(&previous->rcu, tun_prog_free);

        return 0;
}

/* priv_destructor installed by tun_setup(): release what the tun private
 * data still holds. The list of disabled queues must be empty by now.
 */
static void tun_free_netdev(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        BUG_ON(!list_empty(&tun->disabled));

        tun_flow_uninit(tun);
        security_tun_dev_free_security(tun->security);

        /* Clear both program slots. */
        __tun_set_ebpf(tun, &tun->steering_prog, NULL);
        __tun_set_ebpf(tun, &tun->filter_prog, NULL);
}

/* rtnl_link_ops setup callback: give the private data and the net_device
 * their type-independent defaults.
 */
static void tun_setup(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        /* No owner or group restriction until one is configured. */
        tun->group = INVALID_GID;
        tun->owner = INVALID_UID;
        tun_default_link_ksettings(dev, &tun->link_ksettings);

        /* We prefer our own queue length */
        dev->tx_queue_len = TUN_READQ_SIZE;
        dev->priv_destructor = tun_free_netdev;
        dev->needs_free_netdev = true;
        dev->ethtool_ops = &tun_ethtool_ops;
}

/* Minimal rtnetlink ops so that tun/tap devices can be deleted through
 * netlink. Creating them that way is rejected: validation always fails
 * with -EOPNOTSUPP and an explanatory extack message.
 */
static int tun_validate(struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "tun/tap creation via rtnetlink is not supported.");

        return -EOPNOTSUPP;
}

/* Upper bound on the netlink attribute space tun_fill_info() may use: one
 * entry per attribute it can emit.
 */
static size_t tun_get_size(const struct net_device *dev)
{
        size_t size = 0;

        /* OWNER and GROUP are emitted as u32 attributes. */
        BUILD_BUG_ON(sizeof(u32) != sizeof(uid_t));
        BUILD_BUG_ON(sizeof(u32) != sizeof(gid_t));

        size += nla_total_size(sizeof(uid_t));  /* OWNER */
        size += nla_total_size(sizeof(gid_t));  /* GROUP */
        size += nla_total_size(sizeof(u8));     /* TYPE */
        size += nla_total_size(sizeof(u8));     /* PI */
        size += nla_total_size(sizeof(u8));     /* VNET_HDR */
        size += nla_total_size(sizeof(u8));     /* PERSIST */
        size += nla_total_size(sizeof(u8));     /* MULTI_QUEUE */
        size += nla_total_size(sizeof(u32));    /* NUM_QUEUES */
        size += nla_total_size(sizeof(u32));    /* NUM_DISABLED_QUEUES */

        return size;
}

static int tun_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        if (nla_put_u8(skb, IFLA_TUN_TYPE, tun->flags & TUN_TYPE_MASK))
                goto nla_put_failure;
        if (uid_valid(tun->owner) &&
            nla_put_u32(skb, IFLA_TUN_OWNER,
                        from_kuid_munged(current_user_ns(), tun->owner)))
                goto nla_put_failure;
        if (gid_valid(tun->group) &&
            nla_put_u32(skb, IFLA_TUN_GROUP,
                        from_kgid_munged(current_user_ns(), tun->group)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_PI, !(tun->flags & IFF_NO_PI)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_VNET_HDR, !!(tun->flags & IFF_VNET_HDR)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_PERSIST, !!(tun->flags & IFF_PERSIST)))
                goto nla_put_failure;
        if (nla_put_u8(skb, IFLA_TUN_MULTI_QUEUE,
                       !!(tun->flags & IFF_MULTI_QUEUE)))
                goto nla_put_failure;
        if (tun->flags & IFF_MULTI_QUEUE) {
                if (nla_put_u32(skb, IFLA_TUN_NUM_QUEUES, tun->numqueues))
                        goto nla_put_failure;
                if (nla_put_u32(skb, IFLA_TUN_NUM_DISABLED_QUEUES,
                                tun->numdisabled))
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/* rtnetlink link operations for tun/tap devices: the link kind name, the
 * size of the per-device private area, and the setup, validation and
 * attribute dump callbacks.
 */
static struct rtnl_link_ops tun_link_ops __read_mostly = {
        .kind                = DRV_NAME,
        .priv_size        = sizeof(struct tun_struct),
        .setup                = tun_setup,
        .validate        = tun_validate,
        .get_size       = tun_get_size,
        .fill_info      = tun_fill_info,
};

/* Socket write-space callback.
 *
 * When the socket is writeable again and the SOCKWQ_ASYNC_NOSPACE bit was
 * set on it, wake up sleepers polling for output and send SIGIO/POLL_OUT
 * to the fasync list of the owning tun_file.
 */
static void tun_sock_write_space(struct sock *sk)
{
        wait_queue_head_t *wq;
        struct tun_file *tfile;

        /* Bail out unless there is room and the no-space bit was pending. */
        if (!sock_writeable(sk) ||
            !test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags))
                return;

        wq = sk_sleep(sk);
        if (wq && waitqueue_active(wq)) {
                wake_up_interruptible_sync_poll(wq, EPOLLOUT | EPOLLWRNORM |
                                                    EPOLLWRBAND);
        }

        tfile = container_of(sk, struct tun_file, sk);
        kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
}

/* Release the page references accumulated in @tpage, if it holds a page. */
static void tun_put_page(struct tun_page *tpage)
{
        struct page *held = tpage->page;

        if (!held)
                return;

        __page_frag_cache_drain(held, tpage->count);
}

/* Inject one externally supplied xdp_buff into the tun receive path.
 *
 * The buffer begins (at xdp->data_hard_start) with a struct tun_xdp_hdr
 * that carries the buffer length and the virtio-net header. If an XDP
 * program is attached it is run directly on the buffer for frames without
 * a GSO type, or through do_xdp_generic() on the built skb for GSO frames.
 * Frames that pass are turned into an skb and either appended to the
 * socket write queue (NAPI enabled) or handed to netif_receive_skb().
 *
 * @flush is set when the XDP program redirected the frame.
 * @tpage batches references to pages of frames that received any other
 * verdict; they are released through tun_put_page().
 *
 * Returns 1 if an skb was queued for NAPI, 0 if the frame was otherwise
 * consumed, or a negative errno.
 */
static int tun_xdp_one(struct tun_struct *tun,
                       struct tun_file *tfile,
                       struct xdp_buff *xdp, int *flush,
                       struct tun_page *tpage)
{
        unsigned int datasize = xdp->data_end - xdp->data;
        struct tun_xdp_hdr *hdr = xdp->data_hard_start;
        struct virtio_net_hdr *gso = &hdr->gso;
        struct bpf_prog *xdp_prog;
        struct sk_buff *skb = NULL;
        struct sk_buff_head *queue;
        u32 rxhash = 0, act;
        int buflen = hdr->buflen;
        int metasize = 0;
        int ret = 0;
        bool skb_xdp = false;
        struct page *page;

        /* Anything shorter than an Ethernet header cannot be processed. */
        if (unlikely(datasize < ETH_HLEN))
                return -EINVAL;

        xdp_prog = rcu_dereference(tun->xdp_prog);
        if (xdp_prog) {
                /* GSO frames are not run through the program here; build
                 * the skb first and run the program via do_xdp_generic().
                 */
                if (gso->gso_type) {
                        skb_xdp = true;
                        goto build;
                }

                xdp_init_buff(xdp, buflen, &tfile->xdp_rxq);

                act = bpf_prog_run_xdp(xdp_prog, xdp);
                ret = tun_xdp_act(tun, xdp_prog, xdp, act);
                if (ret < 0) {
                        /* Drop the page reference of the failed frame. */
                        put_page(virt_to_head_page(xdp->data));
                        return ret;
                }

                switch (ret) {
                case XDP_REDIRECT:
                        *flush = true;
                        fallthrough;
                case XDP_TX:
                        return 0;
                case XDP_PASS:
                        break;
                default:
                        /* Any other verdict: batch the page reference in
                         * @tpage, flushing the previous batch when the page
                         * changes.
                         */
                        page = virt_to_head_page(xdp->data);
                        if (tpage->page == page) {
                                ++tpage->count;
                        } else {
                                tun_put_page(tpage);
                                tpage->page = page;
                                tpage->count = 1;
                        }
                        return 0;
                }
        }

build:
        skb = build_skb(xdp->data_hard_start, buflen);
        if (!skb) {
                ret = -ENOMEM;
                goto out;
        }

        /* Position skb->data at the packet start and set its length. */
        skb_reserve(skb, xdp->data - xdp->data_hard_start);
        skb_put(skb, xdp->data_end - xdp->data);

        /* The externally provided xdp_buff may have no metadata support, which
         * is marked by xdp->data_meta being xdp->data + 1. This will lead to a
         * metasize of -1 and is the reason why the condition checks for > 0.
         */
        metasize = xdp->data - xdp->data_meta;
        if (metasize > 0)
                skb_metadata_set(skb, metasize);

        /* Apply the virtio-net header to the skb; failure counts as a
         * frame error.
         */
        if (tun_vnet_hdr_to_skb(tun->flags, skb, gso)) {
                atomic_long_inc(&tun->rx_frame_errors);
                kfree_skb(skb);
                ret = -EINVAL;
                goto out;
        }

        skb->protocol = eth_type_trans(skb, tun->dev);
        skb_reset_network_header(skb);
        skb_probe_transport_header(skb);
        skb_record_rx_queue(skb, tfile->queue_index);

        if (skb_xdp) {
                /* Any verdict other than XDP_PASS ends processing here and
                 * is reported to the caller as 0.
                 */
                ret = do_xdp_generic(xdp_prog, &skb);
                if (ret != XDP_PASS) {
                        ret = 0;
                        goto out;
                }
        }

        /* Only compute a flow hash when no steering program is attached,
         * more than one queue exists and this queue is attached.
         */
        if (!rcu_dereference(tun->steering_prog) && tun->numqueues > 1 &&
            !tfile->detached)
                rxhash = __skb_get_hash_symmetric(skb);

        if (tfile->napi_enabled) {
                queue = &tfile->sk.sk_write_queue;
                spin_lock(&queue->lock);

                /* Re-check under the queue lock: do not queue to a file
                 * that has been detached.
                 */
                if (unlikely(tfile->detached)) {
                        spin_unlock(&queue->lock);
                        kfree_skb(skb);
                        return -EBUSY;
                }

                __skb_queue_tail(queue, skb);
                spin_unlock(&queue->lock);
                ret = 1;
        } else {
                netif_receive_skb(skb);
                ret = 0;
        }

        /* No need to disable preemption here since this function is
         * always called with bh disabled
         */
        dev_sw_netstats_rx_add(tun->dev, datasize);

        if (rxhash)
                tun_flow_update(tun, rxhash, tfile);

out:
        return ret;
}

/* sendmsg handler of the tun socket.
 *
 * When msg_control is a struct tun_msg_ctl of type TUN_MSG_PTR, ctl->ptr is
 * treated as an array of ctl->num xdp_buffs which are injected one by one
 * through tun_xdp_one(); per-buffer errors are not propagated and
 * @total_len is returned. Otherwise the message is passed to
 * tun_get_user().
 *
 * Returns -EBADFD when no device is attached to the file.
 */
static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
{
        int ret, i;
        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
        struct tun_struct *tun = tun_get(tfile);
        struct tun_msg_ctl *ctl = m->msg_control;
        struct xdp_buff *xdp;

        if (!tun)
                return -EBADFD;

        if (m->msg_controllen == sizeof(struct tun_msg_ctl) &&
            ctl && ctl->type == TUN_MSG_PTR) {
                struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
                struct tun_page tpage;
                int n = ctl->num;
                int flush = 0, queued = 0;

                memset(&tpage, 0, sizeof(tpage));

                /* The whole batch runs with bottom halves disabled, inside
                 * an RCU read-side section and with a bpf net context set.
                 */
                local_bh_disable();
                rcu_read_lock();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

                for (i = 0; i < n; i++) {
                        xdp = &((struct xdp_buff *)ctl->ptr)[i];
                        ret = tun_xdp_one(tun, tfile, xdp, &flush, &tpage);
                        /* Positive return: an skb was queued for NAPI. */
                        if (ret > 0)
                                queued += ret;
                }

                if (flush)
                        xdp_do_flush();

                if (tfile->napi_enabled && queued > 0)
                        napi_schedule(&tfile->napi);

                bpf_net_ctx_clear(bpf_net_ctx);
                rcu_read_unlock();
                local_bh_enable();

                /* Release page references still batched in tpage. */
                tun_put_page(&tpage);

                ret = total_len;
                goto out;
        }

        ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
                           m->msg_flags & MSG_DONTWAIT,
                           m->msg_flags & MSG_MORE);
out:
        tun_put(tun);
        return ret;
}

/* recvmsg handler of the tun socket.
 *
 * m->msg_control may carry a pointer that is passed on to tun_do_read().
 * On the early error paths (no device attached, unsupported flags) that
 * pointer is released with tun_ptr_free(). MSG_ERRQUEUE requests are
 * served from the socket error queue.
 *
 * When the packet is longer than @total_len, MSG_TRUNC is set in
 * m->msg_flags and the return value is the full length only if the caller
 * passed MSG_TRUNC, otherwise @total_len.
 */
static int tun_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len,
                       int flags)
{
        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
        struct tun_struct *tun = tun_get(tfile);
        void *ptr = m->msg_control;
        int ret;

        if (!tun) {
                tun_ptr_free(ptr);
                return -EBADFD;
        }

        if (flags & ~(MSG_DONTWAIT|MSG_TRUNC|MSG_ERRQUEUE)) {
                tun_put(tun);
                tun_ptr_free(ptr);
                return -EINVAL;
        }

        if (flags & MSG_ERRQUEUE) {
                ret = sock_recv_errqueue(sock->sk, m, total_len,
                                         SOL_PACKET, TUN_TX_TIMESTAMP);
        } else {
                ret = tun_do_read(tun, tfile, &m->msg_iter,
                                  flags & MSG_DONTWAIT, ptr);
                if (ret > (ssize_t)total_len) {
                        m->msg_flags |= MSG_TRUNC;
                        if (!(flags & MSG_TRUNC))
                                ret = total_len;
                }
        }

        tun_put(tun);
        return ret;
}

/* Return the length of the packet referenced by ring entry @ptr, which is
 * either a tagged xdp_frame or an skb; 0 for a NULL entry.
 */
static int tun_ptr_peek_len(void *ptr)
{
        struct xdp_frame *frame;

        if (unlikely(!ptr))
                return 0;

        if (!tun_is_xdp_frame(ptr))
                return __skb_array_len_with_tag(ptr);

        frame = tun_ptr_to_xdp(ptr);
        return frame->len;
}

/* peek_len handler: length of the entry at the head of the tx ring, or 0
 * when no device is attached to the file.
 */
static int tun_peek_len(struct socket *sock)
{
        struct tun_file *tfile = container_of(sock, struct tun_file, socket);
        struct tun_struct *tun = tun_get(tfile);
        int len;

        if (!tun)
                return 0;

        len = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
        tun_put(tun);

        return len;
}

/* Ops structure to mimic raw sockets with tun. Only the peek_len, sendmsg
 * and recvmsg operations are provided.
 */
static const struct proto_ops tun_socket_ops = {
        .peek_len = tun_peek_len,
        .sendmsg = tun_sendmsg,
        .recvmsg = tun_recvmsg,
};

/* Protocol descriptor for tun sockets. obj_size is sizeof(struct tun_file),
 * so each object allocated for this proto is large enough to hold a whole
 * tun_file.
 */
static struct proto tun_proto = {
        .name                = "tun",
        .owner                = THIS_MODULE,
        .obj_size        = sizeof(struct tun_file),
};

/* Return the subset of tun->flags that is reported to user space: the
 * feature bits, the persist bit and the device type bits.
 */
static int tun_flags(struct tun_struct *tun)
{
        const int reported = TUN_FEATURES | IFF_PERSIST | IFF_TUN | IFF_TAP;

        return tun->flags & reported;
}

/* sysfs show handler for "tun_flags": prints the reported flags in hex. */
static ssize_t tun_flags_show(struct device *dev, struct device_attribute *attr,
                              char *buf)
{
        struct net_device *netdev = to_net_dev(dev);
        struct tun_struct *tun = netdev_priv(netdev);

        return sysfs_emit(buf, "0x%x\n", tun_flags(tun));
}

/* sysfs show handler for "owner": prints the owning uid as seen from the
 * caller's user namespace, or -1 when no valid owner is set.
 */
static ssize_t owner_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct tun_struct *tun = netdev_priv(to_net_dev(dev));

        if (!uid_valid(tun->owner))
                return sysfs_emit(buf, "-1\n");

        return sysfs_emit(buf, "%u\n",
                          from_kuid_munged(current_user_ns(), tun->owner));
}

/* sysfs show handler for "group": prints the owning gid as seen from the
 * caller's user namespace, or -1 when no valid group is set.
 */
static ssize_t group_show(struct device *dev, struct device_attribute *attr,
                          char *buf)
{
        struct tun_struct *tun = netdev_priv(to_net_dev(dev));

        if (!gid_valid(tun->group))
                return sysfs_emit(buf, "-1\n");

        return sysfs_emit(buf, "%u\n",
                          from_kgid_munged(current_user_ns(), tun->group));
}

/* Read-only device attributes backed by tun_flags_show(), owner_show() and
 * group_show().
 */
static DEVICE_ATTR_RO(tun_flags);
static DEVICE_ATTR_RO(owner);
static DEVICE_ATTR_RO(group);

/* NULL-terminated list of the attributes above. */
static struct attribute *tun_dev_attrs[] = {
        &dev_attr_tun_flags.attr,
        &dev_attr_owner.attr,
        &dev_attr_group.attr,
        NULL
};

/* Attribute group installed in dev->sysfs_groups[0] of new tun devices. */
static const struct attribute_group tun_attr_group = {
        .attrs = tun_dev_attrs
};

/* Bind @file to the tun/tap device named in @ifr, creating it if needed.
 *
 * If a device with the requested name already exists in @net, the file is
 * attached to it as an additional queue after type, multi-queue, permission
 * and security checks. Otherwise a new net_device is allocated, initialised
 * and registered. On success the final device name is written back to
 * ifr->ifr_name.
 *
 * Returns 0 on success or a negative errno.
 */
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
        struct tun_struct *tun;
        struct tun_file *tfile = file->private_data;
        struct net_device *dev;
        int err;

        if (tfile->detached)
                return -EINVAL;

        /* IFF_NAPI_FRAGS needs CAP_NET_ADMIN and is only accepted together
         * with IFF_NAPI on a TAP device.
         */
        if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) {
                if (!capable(CAP_NET_ADMIN))
                        return -EPERM;

                if (!(ifr->ifr_flags & IFF_NAPI) ||
                    (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP)
                        return -EINVAL;
        }

        dev = __dev_get_by_name(net, ifr->ifr_name);
        if (dev) {
                /* Existing device: the caller asked for exclusive creation. */
                if (ifr->ifr_flags & IFF_TUN_EXCL)
                        return -EBUSY;
                /* The requested type must match the device's netdev ops. */
                if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
                        tun = netdev_priv(dev);
                else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
                        tun = netdev_priv(dev);
                else
                        return -EINVAL;

                /* Multi-queue mode of request and device must agree. */
                if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
                    !!(tun->flags & IFF_MULTI_QUEUE))
                        return -EINVAL;

                if (tun_not_capable(tun))
                        return -EPERM;
                err = security_tun_dev_open(tun->security);
                if (err < 0)
                        return err;

                err = tun_attach(tun, file, ifr->ifr_flags & IFF_NOFILTER,
                                 ifr->ifr_flags & IFF_NAPI,
                                 ifr->ifr_flags & IFF_NAPI_FRAGS, true);
                if (err < 0)
                        return err;

                if (tun->flags & IFF_MULTI_QUEUE &&
                    (tun->numqueues + tun->numdisabled > 1)) {
                        /* One or more queue has already been attached, no need
                         * to initialize the device again.
                         */
                        netdev_state_change(dev);
                        return 0;
                }

                /* Replace the feature bits with the requested ones. */
                tun->flags = (tun->flags & ~TUN_FEATURES) |
                              (ifr->ifr_flags & TUN_FEATURES);

                netdev_state_change(dev);
        } else {
                char *name;
                unsigned long flags = 0;
                int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                             MAX_TAP_QUEUES : 1;

                /* Creating a device requires CAP_NET_ADMIN in the user
                 * namespace that owns @net.
                 */
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                err = security_tun_dev_create();
                if (err < 0)
                        return err;

                /* Set dev type */
                if (ifr->ifr_flags & IFF_TUN) {
                        /* TUN device */
                        flags |= IFF_TUN;
                        name = "tun%d";
                } else if (ifr->ifr_flags & IFF_TAP) {
                        /* TAP device */
                        flags |= IFF_TAP;
                        name = "tap%d";
                } else
                        return -EINVAL;

                /* A caller-supplied name overrides the default template. */
                if (*ifr->ifr_name)
                        name = ifr->ifr_name;

                dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                                       NET_NAME_UNKNOWN, tun_setup, queues,
                                       queues);

                if (!dev)
                        return -ENOMEM;

                dev_net_set(dev, net);
                dev->rtnl_link_ops = &tun_link_ops;
                dev->ifindex = tfile->ifindex;
                dev->sysfs_groups[0] = &tun_attr_group;

                /* Initialise the private area of the new device. */
                tun = netdev_priv(dev);
                tun->dev = dev;
                tun->flags = flags;
                tun->txflt.count = 0;
                tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);

                tun->align = NET_SKB_PAD;
                tun->filter_attached = false;
                tun->sndbuf = tfile->socket.sk->sk_sndbuf;
                tun->rx_batched = 0;
                RCU_INIT_POINTER(tun->steering_prog, NULL);

                tun->ifr = ifr;
                tun->file = file;

                tun_net_initialize(dev);

                err = register_netdevice(tun->dev);
                if (err < 0) {
                        free_netdev(dev);
                        return err;
                }
                /* free_netdev() won't check refcnt, to avoid race
                 * with dev_put() we need publish tun after registration.
                 */
                rcu_assign_pointer(tfile->tun, tun);
        }

        /* Initial carrier state follows the IFF_NO_CARRIER request flag. */
        if (ifr->ifr_flags & IFF_NO_CARRIER)
                netif_carrier_off(tun->dev);
        else
                netif_carrier_on(tun->dev);

        /* Make sure persistent devices do not get stuck in
         * xoff state.
         */
        if (netif_running(tun->dev))
                netif_tx_wake_all_queues(tun->dev);

        /* Report the final device name back to the caller. */
        strcpy(ifr->ifr_name, tun->dev->name);
        return 0;
}

static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
{
        strcpy(ifr->ifr_name, tun->dev->name);

        ifr->ifr_flags = tun_flags(tun);

}

/* A reduced, ethtool-like offload interface that is driven through the tun
 * fd, so no privileges are required.
 *
 * Translates the TUN_F_* bits in @arg into netdev features. Every offload
 * depends on TUN_F_CSUM; TUN_F_TSO_ECN is only honoured together with
 * TUN_F_TSO4 or TUN_F_TSO6; TUN_F_UFO is accepted but ignored. Any bit left
 * unconsumed makes the call fail with -EINVAL, leaving the device untouched.
 */
static int set_offload(struct tun_struct *tun, unsigned long arg)
{
        netdev_features_t wanted = 0;

        if (arg & TUN_F_CSUM) {
                wanted |= NETIF_F_HW_CSUM;
                arg &= ~TUN_F_CSUM;

                if (arg & (TUN_F_TSO4 | TUN_F_TSO6)) {
                        if (arg & TUN_F_TSO_ECN)
                                wanted |= NETIF_F_TSO_ECN;
                        if (arg & TUN_F_TSO4)
                                wanted |= NETIF_F_TSO;
                        if (arg & TUN_F_TSO6)
                                wanted |= NETIF_F_TSO6;
                        arg &= ~(TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN);
                }

                arg &= ~TUN_F_UFO;

                /* TODO: for now USO4 and USO6 should work simultaneously */
                if ((arg & (TUN_F_USO4 | TUN_F_USO6)) ==
                    (TUN_F_USO4 | TUN_F_USO6)) {
                        wanted |= NETIF_F_GSO_UDP_L4;
                        arg &= ~(TUN_F_USO4 | TUN_F_USO6);
                }
        }

        /* Unknown leftovers are rejected, which lets user space probe for
         * features added in the future by simply trying to set them.
         */
        if (arg)
                return -EINVAL;

        tun->set_features = wanted;
        tun->dev->wanted_features &= ~TUN_USER_FEATURES;
        tun->dev->wanted_features |= wanted;
        netdev_update_features(tun->dev);

        return 0;
}

/* Detach the socket filter from the first @n queues of @tun and mark the
 * device as having no filter attached.
 */
static void tun_detach_filter(struct tun_struct *tun, int n)
{
        int idx;

        for (idx = 0; idx < n; idx++) {
                struct tun_file *tfile = rtnl_dereference(tun->tfiles[idx]);
                struct sock *sk = tfile->socket.sk;

                lock_sock(sk);
                sk_detach_filter(sk);
                release_sock(sk);
        }

        tun->filter_attached = false;
}

/* Attach tun->fprog as socket filter to every queue of @tun.
 *
 * If attaching fails for one queue, the filters already attached to the
 * preceding queues are removed again and the error is returned.
 */
static int tun_attach_filter(struct tun_struct *tun)
{
        int idx;

        for (idx = 0; idx < tun->numqueues; idx++) {
                struct tun_file *tfile = rtnl_dereference(tun->tfiles[idx]);
                struct sock *sk = tfile->socket.sk;
                int err;

                lock_sock(sk);
                err = sk_attach_filter(&tun->fprog, sk);
                release_sock(sk);

                if (err) {
                        /* Roll back queues [0, idx). */
                        tun_detach_filter(tun, idx);
                        return err;
                }
        }

        tun->filter_attached = true;
        return 0;
}

static void tun_set_sndbuf(struct tun_struct *tun)
{
        struct tun_file *tfile;
        int i;

        for (i = 0; i < tun->numqueues; i++) {
                tfile = rtnl_dereference(tun->tfiles[i]);
                tfile->socket.sk->sk_sndbuf = tun->sndbuf;
        }
}

static int tun_set_queue(struct file *file, struct ifreq *ifr)
{
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun;
        int ret = 0;

        rtnl_lock();

        if (ifr->ifr_flags & IFF_ATTACH_QUEUE) {
                tun = tfile->detached;
                if (!tun) {
                        ret = -EINVAL;
                        goto unlock;
                }
                ret = security_tun_dev_attach_queue(tun->security);
                if (ret < 0)
                        goto unlock;
                ret = tun_attach(tun, file, false, tun->flags & IFF_NAPI,
                                 tun->flags & IFF_NAPI_FRAGS, true);
        } else if (ifr->ifr_flags & IFF_DETACH_QUEUE) {
                tun = rtnl_dereference(tfile->tun);
                if (!tun || !(tun->flags & IFF_MULTI_QUEUE) || tfile->detached)
                        ret = -EINVAL;
                else
                        __tun_detach(tfile, false);
        } else
                ret = -EINVAL;

        if (ret >= 0)
                netdev_state_change(tun->dev);

unlock:
        rtnl_unlock();
        return ret;
}

/* Install or clear an eBPF program in the slot @prog_p of @tun.
 *
 * @data points to a user-space int holding a program fd; the value -1
 * clears the slot. Any other fd must refer to a program of type
 * BPF_PROG_TYPE_SOCKET_FILTER.
 *
 * Returns -EFAULT if the fd cannot be read, the lookup error if the fd is
 * not usable, otherwise the result of __tun_set_ebpf().
 */
static int tun_set_ebpf(struct tun_struct *tun, struct tun_prog __rcu **prog_p,
                        void __user *data)
{
        struct bpf_prog *prog = NULL;
        int fd;

        if (copy_from_user(&fd, data, sizeof(fd)))
                return -EFAULT;

        if (fd != -1) {
                prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);
        }

        return __tun_set_ebpf(tun, prog_p, prog);
}

/* Map a link type (ARPHRD_*) to the hardware address length that
 * tun->dev->addr_len should carry for it. Link types without a hardware
 * address, as well as unknown ones, yield 0.
 */
static unsigned char tun_get_addr_len(unsigned short type)
{
        unsigned char len;

        switch (type) {
        case ARPHRD_ETHER:
                len = ETH_ALEN;
                break;
        case ARPHRD_IPGRE:
        case ARPHRD_TUNNEL:
        case ARPHRD_SIT:
                len = 4;
                break;
        case ARPHRD_IP6GRE:
        case ARPHRD_TUNNEL6:
                len = sizeof(struct in6_addr);
                break;
        case ARPHRD_IEEE802154:
        case ARPHRD_IEEE802154_MONITOR:
                len = IEEE802154_EXTENDED_ADDR_LEN;
                break;
        case ARPHRD_6LOWPAN:
                len = EUI64_ADDR_LEN;
                break;
        case ARPHRD_FDDI:
                len = FDDI_K_ALEN;
                break;
        case ARPHRD_HIPPI:
                len = HIPPI_ALEN;
                break;
        case ARPHRD_IEEE802:
                len = FC_ALEN;
                break;
        case ARPHRD_ROSE:
                len = ROSE_ADDR_LEN;
                break;
        case ARPHRD_NETROM:
                len = AX25_ADDR_LEN;
                break;
        case ARPHRD_LOCALTLK:
                len = LTALK_ALEN;
                break;
        default:
                /* Includes ARPHRD_PHONET_PIPE, ARPHRD_PPP and ARPHRD_NONE. */
                len = 0;
                break;
        }

        return len;
}

/* Common ioctl implementation for the tun character device.
 *
 * @file:      the open tun file
 * @cmd:       ioctl command
 * @arg:       command argument, either a plain value or a user pointer
 * @ifreq_len: number of bytes of struct ifreq exchanged with user space;
 *             callers pass sizeof(struct ifreq) or, for compat ioctls,
 *             sizeof(struct compat_ifreq)
 *
 * TUNGETFEATURES, TUNSETQUEUE and SIOCGSKNS are handled up front. All other
 * commands run under the rtnl lock; apart from TUNSETIFF and TUNSETIFINDEX
 * they require a device to be attached to the file and fail with -EBADFD
 * otherwise.
 *
 * Returns 0 or a command specific value on success, a negative errno on
 * failure.
 */
static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg, int ifreq_len)
{
        struct tun_file *tfile = file->private_data;
        struct net *net = sock_net(&tfile->sk);
        struct tun_struct *tun;
        void __user* argp = (void __user*)arg;
        unsigned int carrier;
        struct ifreq ifr;
        kuid_t owner;
        kgid_t group;
        int ifindex;
        int sndbuf;
        int ret;
        bool do_notify = false;

        /* Commands that take a struct ifreq get it copied in; for all
         * others the local copy is zeroed.
         */
        if (cmd == TUNSETIFF || cmd == TUNSETQUEUE ||
            (_IOC_TYPE(cmd) == SOCK_IOC_TYPE && cmd != SIOCGSKNS)) {
                if (copy_from_user(&ifr, argp, ifreq_len))
                        return -EFAULT;
        } else {
                memset(&ifr, 0, sizeof(ifr));
        }
        if (cmd == TUNGETFEATURES) {
                /* Currently this just means: "what IFF flags are valid?".
                 * This is needed because we never checked for invalid flags on
                 * TUNSETIFF.
                 */
                return put_user(IFF_TUN | IFF_TAP | IFF_NO_CARRIER |
                                TUN_FEATURES, (unsigned int __user*)argp);
        } else if (cmd == TUNSETQUEUE) {
                return tun_set_queue(file, &ifr);
        } else if (cmd == SIOCGSKNS) {
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                return open_related_ns(&net->ns, get_net_ns);
        }

        rtnl_lock();

        /* May be NULL; the reference is dropped again at the unlock label. */
        tun = tun_get(tfile);
        if (cmd == TUNSETIFF) {
                /* A file that already has a device cannot be bound again. */
                ret = -EEXIST;
                if (tun)
                        goto unlock;

                /* Force NUL termination of the user-supplied name. */
                ifr.ifr_name[IFNAMSIZ-1] = '\0';

                ret = tun_set_iff(net, file, &ifr);

                if (ret)
                        goto unlock;

                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                goto unlock;
        }
        if (cmd == TUNSETIFINDEX) {
                /* Only allowed before a device is attached to the file. */
                ret = -EPERM;
                if (tun)
                        goto unlock;

                ret = -EFAULT;
                if (copy_from_user(&ifindex, argp, sizeof(ifindex)))
                        goto unlock;
                ret = -EINVAL;
                if (ifindex < 0)
                        goto unlock;
                ret = 0;
                tfile->ifindex = ifindex;
                goto unlock;
        }

        /* Everything below operates on an attached device. */
        ret = -EBADFD;
        if (!tun)
                goto unlock;

        netif_info(tun, drv, tun->dev, "tun_chr_ioctl cmd %u\n", cmd);

        /* From here on use the namespace the device lives in. */
        net = dev_net(tun->dev);
        ret = 0;
        switch (cmd) {
        case TUNGETIFF:
                tun_get_iff(tun, &ifr);

                if (tfile->detached)
                        ifr.ifr_flags |= IFF_DETACH_QUEUE;
                if (!tfile->socket.sk->sk_filter)
                        ifr.ifr_flags |= IFF_NOFILTER;

                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                break;

        case TUNSETNOCSUM:
                /* Disable/Enable checksum */

                /* [unimplemented] */
                netif_info(tun, drv, tun->dev, "ignored: set checksum %s\n",
                           arg ? "disabled" : "enabled");
                break;

        case TUNSETPERSIST:
                /* Disable/Enable persist mode. Keep an extra reference to the
                 * module to prevent the module being unprobed.
                 */
                if (arg && !(tun->flags & IFF_PERSIST)) {
                        tun->flags |= IFF_PERSIST;
                        __module_get(THIS_MODULE);
                        do_notify = true;
                }
                if (!arg && (tun->flags & IFF_PERSIST)) {
                        tun->flags &= ~IFF_PERSIST;
                        module_put(THIS_MODULE);
                        do_notify = true;
                }

                netif_info(tun, drv, tun->dev, "persist %s\n",
                           arg ? "enabled" : "disabled");
                break;

        case TUNSETOWNER:
                /* Set owner of the device */
                owner = make_kuid(current_user_ns(), arg);
                if (!uid_valid(owner)) {
                        ret = -EINVAL;
                        break;
                }
                tun->owner = owner;
                do_notify = true;
                netif_info(tun, drv, tun->dev, "owner set to %u\n",
                           from_kuid(&init_user_ns, tun->owner));
                break;

        case TUNSETGROUP:
                /* Set group of the device */
                group = make_kgid(current_user_ns(), arg);
                if (!gid_valid(group)) {
                        ret = -EINVAL;
                        break;
                }
                tun->group = group;
                do_notify = true;
                netif_info(tun, drv, tun->dev, "group set to %u\n",
                           from_kgid(&init_user_ns, tun->group));
                break;

        case TUNSETLINK:
                /* Only allow setting the type when the interface is down */
                if (tun->dev->flags & IFF_UP) {
                        netif_info(tun, drv, tun->dev,
                                   "Linktype set failed because interface is up\n");
                        ret = -EBUSY;
                } else {
                        /* Notifier subscribers may veto the type change. */
                        ret = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
                                                       tun->dev);
                        ret = notifier_to_errno(ret);
                        if (ret) {
                                netif_info(tun, drv, tun->dev,
                                           "Refused to change device type\n");
                                break;
                        }
                        tun->dev->type = (int) arg;
                        tun->dev->addr_len = tun_get_addr_len(tun->dev->type);
                        netif_info(tun, drv, tun->dev, "linktype set to %d\n",
                                   tun->dev->type);
                        call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
                                                 tun->dev);
                }
                break;

        case TUNSETDEBUG:
                tun->msg_enable = (u32)arg;
                break;

        case TUNSETOFFLOAD:
                ret = set_offload(tun, arg);
                break;

        case TUNSETTXFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = update_filter(&tun->txflt, (void __user *)arg);
                break;

        case SIOCGIFHWADDR:
                /* Get hw address */
                dev_get_mac_address(&ifr.ifr_hwaddr, net, tun->dev->name);
                if (copy_to_user(argp, &ifr, ifreq_len))
                        ret = -EFAULT;
                break;

        case SIOCSIFHWADDR:
                /* Set hw address */
                ret = dev_set_mac_address_user(tun->dev, &ifr.ifr_hwaddr, NULL);
                break;

        case TUNGETSNDBUF:
                sndbuf = tfile->socket.sk->sk_sndbuf;
                if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
                        ret = -EFAULT;
                break;

        case TUNSETSNDBUF:
                if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
                        ret = -EFAULT;
                        break;
                }
                if (sndbuf <= 0) {
                        ret = -EINVAL;
                        break;
                }

                /* Remember the value and apply it to every queue socket. */
                tun->sndbuf = sndbuf;
                tun_set_sndbuf(tun);
                break;

        case TUNATTACHFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = -EFAULT;
                if (copy_from_user(&tun->fprog, argp, sizeof(tun->fprog)))
                        break;

                ret = tun_attach_filter(tun);
                break;

        case TUNDETACHFILTER:
                /* Can be set only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = 0;
                tun_detach_filter(tun, tun->numqueues);
                break;

        case TUNGETFILTER:
                /* Can be read only for TAPs */
                ret = -EINVAL;
                if ((tun->flags & TUN_TYPE_MASK) != IFF_TAP)
                        break;
                ret = -EFAULT;
                if (copy_to_user(argp, &tun->fprog, sizeof(tun->fprog)))
                        break;
                ret = 0;
                break;

        case TUNSETSTEERINGEBPF:
                ret = tun_set_ebpf(tun, &tun->steering_prog, argp);
                break;

        case TUNSETFILTEREBPF:
                ret = tun_set_ebpf(tun, &tun->filter_prog, argp);
                break;

        case TUNSETCARRIER:
                ret = -EFAULT;
                if (copy_from_user(&carrier, argp, sizeof(carrier)))
                        goto unlock;

                ret = tun_net_change_carrier(tun->dev, (bool)carrier);
                break;

        case TUNGETDEVNETNS:
                ret = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto unlock;
                ret = open_related_ns(&net->ns, get_net_ns);
                break;

        default:
                /* Remaining commands are handed to tun_vnet_ioctl(). */
                ret = tun_vnet_ioctl(&tun->vnet_hdr_sz, &tun->flags, cmd, argp);
                break;
        }

        /* Persist/owner/group changes are announced as a state change. */
        if (do_notify)
                netdev_state_change(tun->dev);

unlock:
        rtnl_unlock();
        if (tun)
                tun_put(tun);
        return ret;
}

/* Native ioctl entry point: user space exchanges a full struct ifreq. */
static long tun_chr_ioctl(struct file *file,
                          unsigned int cmd, unsigned long arg)
{
        const int ifreq_len = sizeof(struct ifreq);

        return __tun_chr_ioctl(file, cmd, arg, ifreq_len);
}

#ifdef CONFIG_COMPAT
/* Compat ioctl entry point.
 *
 * For commands whose argument is a user pointer the compat pointer is
 * converted with compat_ptr(); for every other command the argument is
 * truncated to compat_ulong_t.
 */
static long tun_chr_compat_ioctl(struct file *file,
                         unsigned int cmd, unsigned long arg)
{
        bool arg_is_pointer;

        switch (cmd) {
        case TUNSETIFF:
        case TUNGETIFF:
        case TUNSETTXFILTER:
        case TUNGETSNDBUF:
        case TUNSETSNDBUF:
        case SIOCGIFHWADDR:
        case SIOCSIFHWADDR:
                arg_is_pointer = true;
                break;
        default:
                arg_is_pointer = false;
                break;
        }

        if (arg_is_pointer)
                arg = (unsigned long)compat_ptr(arg);
        else
                arg = (compat_ulong_t)arg;

        /*
         * struct compat_ifreq is shorter than struct ifreq, so nothing past
         * its end may be accessed. The fields this driver uses are laid out
         * compatibly, which is why the contents need no conversion.
         */
        return __tun_chr_ioctl(file, cmd, arg, sizeof(struct compat_ifreq));
}
#endif /* CONFIG_COMPAT */

/* fasync handler. When enabling, the file owner structure is allocated
 * first; fasync_helper() then updates tfile->fasync. On success the
 * TUN_FASYNC flag is set (and the owner set to the calling task) or
 * cleared, depending on @on.
 *
 * Returns 0 on success or the negative error from the failing helper.
 */
static int tun_chr_fasync(int fd, struct file *file, int on)
{
        struct tun_file *tfile = file->private_data;
        int err;

        if (on) {
                err = file_f_owner_allocate(file);
                if (err)
                        return err;
        }

        err = fasync_helper(fd, file, on, &tfile->fasync);
        if (err < 0)
                return err;

        if (!on) {
                tfile->flags &= ~TUN_FASYNC;
                return 0;
        }

        __f_setown(file, task_pid(current), PIDTYPE_TGID, 0);
        tfile->flags |= TUN_FASYNC;
        return 0;
}

/* open() handler: allocates the per-file tun_file through sk_alloc(),
 * initializes its tx ring, socket and bookkeeping fields, and publishes
 * it in file->private_data. No tun device is attached yet (tfile->tun is
 * NULL).
 *
 * Returns 0 on success, -ENOMEM if either allocation fails.
 */
static int tun_chr_open(struct inode *inode, struct file * file)
{
        struct tun_file *tfile;
        struct net *net = current->nsproxy->net_ns;

        tfile = (struct tun_file *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL,
                                            &tun_proto, 0);
        if (!tfile)
                return -ENOMEM;

        if (ptr_ring_init(&tfile->tx_ring, 0, GFP_KERNEL) != 0) {
                /* Undo the sk_alloc() above. */
                sk_free(&tfile->sk);
                return -ENOMEM;
        }

        mutex_init(&tfile->napi_mutex);
        RCU_INIT_POINTER(tfile->tun, NULL);
        tfile->ifindex = 0;
        tfile->flags = 0;

        init_waitqueue_head(&tfile->socket.wq.wait);

        tfile->socket.ops = &tun_socket_ops;
        tfile->socket.file = file;

        sock_init_data_uid(&tfile->socket, &tfile->sk, current_fsuid());

        /* Applied after sock_init_data_uid(), as in the original ordering. */
        tfile->sk.sk_write_space = tun_sock_write_space;
        tfile->sk.sk_sndbuf = INT_MAX;

        file->private_data = tfile;
        INIT_LIST_HEAD(&tfile->next);

        sock_set_flag(&tfile->sk, SOCK_ZEROCOPY);

        /* tun groks IOCB_NOWAIT just fine, mark it as such */
        file->f_mode |= FMODE_NOWAIT;
        return 0;
}

/* release() handler: passes the file's tun_file to tun_detach() with the
 * second argument set to true. Always returns 0.
 */
static int tun_chr_close(struct inode *inode, struct file *file)
{
        struct tun_file *closing = file->private_data;

        tun_detach(closing, true);
        return 0;
}

#ifdef CONFIG_PROC_FS
/* fdinfo hook: prints an "iff:" line with the interface name filled in by
 * tun_get_iff(). If no tun device can be obtained for this file, the
 * zeroed ifreq yields an empty name.
 */
static void tun_chr_show_fdinfo(struct seq_file *m, struct file *file)
{
        struct tun_file *tfile = file->private_data;
        struct tun_struct *tun;
        struct ifreq ifr;

        memset(&ifr, 0, sizeof(ifr));

        /* The lookup and tun_get_iff() both run under the RTNL lock. */
        rtnl_lock();
        tun = tun_get(tfile);
        if (!tun) {
                rtnl_unlock();
        } else {
                tun_get_iff(tun, &ifr);
                rtnl_unlock();
                /* Drop the reference taken by tun_get(). */
                tun_put(tun);
        }

        seq_printf(m, "iff:\t%s\n", ifr.ifr_name);
}
#endif

/* File operations of the tun character device. The compat ioctl and the
 * fdinfo hook are only wired up when the matching config option is set.
 */
static const struct file_operations tun_fops = {
        .owner          = THIS_MODULE,
        .read_iter      = tun_chr_read_iter,
        .write_iter     = tun_chr_write_iter,
        .poll           = tun_chr_poll,
        .unlocked_ioctl = tun_chr_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = tun_chr_compat_ioctl,
#endif
        .open           = tun_chr_open,
        .release        = tun_chr_close,
        .fasync         = tun_chr_fasync,
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = tun_chr_show_fdinfo,
#endif
};

/* Misc device descriptor registered by tun_init(); served by tun_fops. */
static struct miscdevice tun_miscdev = {
        .minor    = TUN_MINOR,
        .name     = "tun",
        .nodename = "net/tun",
        .fops     = &tun_fops,
};

/* ethtool interface */

/* Fill @cmd with the driver's default link settings: empty supported and
 * advertising masks, SPEED_10000, full duplex, PORT_TP, PHY address 0 and
 * autonegotiation disabled. @dev is not consulted.
 */
static void tun_default_link_ksettings(struct net_device *dev,
                                       struct ethtool_link_ksettings *cmd)
{
        ethtool_link_ksettings_zero_link_mode(cmd, supported);
        ethtool_link_ksettings_zero_link_mode(cmd, advertising);

        cmd->base.autoneg     = AUTONEG_DISABLE;
        cmd->base.phy_address = 0;
        cmd->base.port        = PORT_TP;
        cmd->base.duplex      = DUPLEX_FULL;
        cmd->base.speed       = SPEED_10000;
}

static int tun_get_link_ksettings(struct net_device *dev,
                                  struct ethtool_link_ksettings *cmd)
{
        struct tun_struct *tun = netdev_priv(dev);

        memcpy(cmd, &tun->link_ksettings, sizeof(*cmd));
        return 0;
}

/* ethtool set_link_ksettings: store @cmd verbatim in the tun device; no
 * validation is performed here. Always returns 0.
 */
static int tun_set_link_ksettings(struct net_device *dev,
                                  const struct ethtool_link_ksettings *cmd)
{
        struct tun_struct *priv = netdev_priv(dev);

        memcpy(&priv->link_ksettings, cmd, sizeof(*cmd));

        return 0;
}

/* ethtool get_drvinfo: report driver name and version, and use bus_info
 * to say whether the device is a "tun" or a "tap". bus_info is left
 * untouched for any other type value.
 */
static void tun_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
        struct tun_struct *tun = netdev_priv(dev);

        strscpy(info->driver, DRV_NAME, sizeof(info->driver));
        strscpy(info->version, DRV_VERSION, sizeof(info->version));

        if ((tun->flags & TUN_TYPE_MASK) == IFF_TUN)
                strscpy(info->bus_info, "tun", sizeof(info->bus_info));
        else if ((tun->flags & TUN_TYPE_MASK) == IFF_TAP)
                strscpy(info->bus_info, "tap", sizeof(info->bus_info));
}

static u32 tun_get_msglevel(struct net_device *dev)
{
        struct tun_struct *tun = netdev_priv(dev);

        return tun->msg_enable;
}

/* ethtool set_msglevel: store @value as the device's msg_enable. */
static void tun_set_msglevel(struct net_device *dev, u32 value)
{
        struct tun_struct *priv = netdev_priv(dev);

        priv->msg_enable = value;
}

static int tun_get_coalesce(struct net_device *dev,
                            struct ethtool_coalesce *ec,
                            struct kernel_ethtool_coalesce *kernel_coal,
                            struct netlink_ext_ack *extack)
{
        struct tun_struct *tun = netdev_priv(dev);

        ec->rx_max_coalesced_frames = tun->rx_batched;

        return 0;
}

/* ethtool set_coalesce: store rx_max_coalesced_frames in tun->rx_batched,
 * capped at NAPI_POLL_WEIGHT. @kernel_coal and @extack are unused.
 * Always returns 0.
 */
static int tun_set_coalesce(struct net_device *dev,
                            struct ethtool_coalesce *ec,
                            struct kernel_ethtool_coalesce *kernel_coal,
                            struct netlink_ext_ack *extack)
{
        struct tun_struct *priv = netdev_priv(dev);

        priv->rx_batched = ec->rx_max_coalesced_frames > NAPI_POLL_WEIGHT ?
                           NAPI_POLL_WEIGHT : ec->rx_max_coalesced_frames;
        return 0;
}

static void tun_get_channels(struct net_device *dev,
                             struct ethtool_channels *channels)
{
        struct tun_struct *tun = netdev_priv(dev);

        channels->combined_count = tun->numqueues;
        channels->max_combined = tun->flags & IFF_MULTI_QUEUE ? MAX_TAP_QUEUES : 1;
}

/* ethtool operations for tun/tap devices. Of the coalescing parameters,
 * only ETHTOOL_COALESCE_RX_MAX_FRAMES is declared as supported.
 */
static const struct ethtool_ops tun_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_RX_MAX_FRAMES,
        .get_drvinfo               = tun_get_drvinfo,
        .get_msglevel              = tun_get_msglevel,
        .set_msglevel              = tun_set_msglevel,
        .get_link                  = ethtool_op_get_link,
        .get_channels              = tun_get_channels,
        .get_ts_info               = ethtool_op_get_ts_info,
        .get_coalesce              = tun_get_coalesce,
        .set_coalesce              = tun_set_coalesce,
        .get_link_ksettings        = tun_get_link_ksettings,
        .set_link_ksettings        = tun_set_link_ksettings,
};

/* Resize the tx ring of every queue of @tun, both the active ones in
 * tun->tfiles[] and those on the tun->disabled list, to the device's
 * current tx_queue_len.
 *
 * Returns -ENOMEM if the temporary pointer array cannot be allocated,
 * otherwise the result of ptr_ring_resize_multiple_bh().
 */
static int tun_queue_resize(struct tun_struct *tun)
{
        struct net_device *dev = tun->dev;
        int total = tun->numqueues + tun->numdisabled;
        struct ptr_ring **rings;
        struct tun_file *tfile;
        int slot = 0;
        int err;

        rings = kmalloc_array(total, sizeof(*rings), GFP_KERNEL);
        if (!rings)
                return -ENOMEM;

        /* Active queues first ... */
        while (slot < tun->numqueues) {
                tfile = rtnl_dereference(tun->tfiles[slot]);
                rings[slot] = &tfile->tx_ring;
                slot++;
        }

        /* ... then the disabled ones, continuing at the same index. */
        list_for_each_entry(tfile, &tun->disabled, next)
                rings[slot++] = &tfile->tx_ring;

        err = ptr_ring_resize_multiple_bh(rings, total,
                                          dev->tx_queue_len, GFP_KERNEL,
                                          tun_ptr_free);

        kfree(rings);
        return err;
}

static int tun_device_event(struct notifier_block *unused,
                            unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct tun_struct *tun = netdev_priv(dev);
        int i;

        if (dev->rtnl_link_ops != &tun_link_ops)
                return NOTIFY_DONE;

        switch (event) {
        case NETDEV_CHANGE_TX_QUEUE_LEN:
                if (tun_queue_resize(tun))
                        return NOTIFY_BAD;
                break;
        case NETDEV_UP:
                for (i = 0; i < tun->numqueues; i++) {
                        struct tun_file *tfile;

                        tfile = rtnl_dereference(tun->tfiles[i]);
                        tfile->socket.sk->sk_write_space(tfile->socket.sk);
                }
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block registered in tun_init() and removed in tun_cleanup();
 * routes netdevice events to tun_device_event().
 */
static struct notifier_block tun_notifier_block __read_mostly = {
        .notifier_call        = tun_device_event,
};

/* Module init: registers the rtnl link ops, the misc device and the
 * netdevice notifier, in that order. If a step fails, the steps that
 * already succeeded are undone in reverse order and the error is returned.
 */
static int __init tun_init(void)
{
        int err;

        pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);

        err = rtnl_link_register(&tun_link_ops);
        if (err) {
                pr_err("Can't register link_ops\n");
                return err;
        }

        err = misc_register(&tun_miscdev);
        if (err) {
                pr_err("Can't register misc device %d\n", TUN_MINOR);
                rtnl_link_unregister(&tun_link_ops);
                return err;
        }

        err = register_netdevice_notifier(&tun_notifier_block);
        if (err) {
                pr_err("Can't register netdevice notifier\n");
                misc_deregister(&tun_miscdev);
                rtnl_link_unregister(&tun_link_ops);
                return err;
        }

        return 0;
}

/* Module exit: tears down everything tun_init() registered: the misc
 * device, the rtnl link ops and the netdevice notifier.
 */
static void __exit tun_cleanup(void)
{
        misc_deregister(&tun_miscdev);
        rtnl_link_unregister(&tun_link_ops);
        unregister_netdevice_notifier(&tun_notifier_block);
}

/* Return the socket object embedded in a tun file.
 *
 * Fails with ERR_PTR(-EINVAL) when @file is not a tun character device
 * file (its f_op is not tun_fops) and with ERR_PTR(-EBADFD) when the file
 * has no private data. The returned object works like a packet socket and
 * can be used for sock_sendmsg/sock_recvmsg. The caller is responsible for
 * holding a reference to the file for as long as the socket is in use.
 */
struct socket *tun_get_socket(struct file *file)
{
        struct tun_file *tfile;

        if (file->f_op != &tun_fops)
                return ERR_PTR(-EINVAL);

        tfile = file->private_data;
        return tfile ? &tfile->socket : ERR_PTR(-EBADFD);
}
EXPORT_SYMBOL_GPL(tun_get_socket);

/* Return the tx ptr_ring embedded in a tun file.
 *
 * Fails with ERR_PTR(-EINVAL) when @file is not a tun character device
 * file and with ERR_PTR(-EBADFD) when the file has no private data.
 */
struct ptr_ring *tun_get_tx_ring(struct file *file)
{
        struct tun_file *tfile;

        if (file->f_op != &tun_fops)
                return ERR_PTR(-EINVAL);

        tfile = file->private_data;
        return tfile ? &tfile->tx_ring : ERR_PTR(-EBADFD);
}
EXPORT_SYMBOL_GPL(tun_get_tx_ring);

/* Module entry/exit points and metadata. The two aliases name the misc
 * minor (TUN_MINOR) and the "net/tun" device node.
 */
module_init(tun_init);
module_exit(tun_cleanup);
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR(DRV_COPYRIGHT);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(TUN_MINOR);
MODULE_ALIAS("devname:net/tun");












































































































  776 















































  776 


































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_PAGE_EXT_H
#define __LINUX_PAGE_EXT_H

#include <linux/types.h>
#include <linux/mmzone.h>
#include <linux/stacktrace.h>

struct pglist_data;

#ifdef CONFIG_PAGE_EXTENSION
/**
 * struct page_ext_operations - per page_ext client operations
 * @offset: Offset of the client's data within page_ext. The offset is
 *          returned to the client by page_ext_init.
 * @size: The size of the client data within page_ext.
 * @need: Function that returns true if the client requires page_ext.
 * @init: (optional) Called to initialize the client once page_exts are
 *        allocated.
 * @need_shared_flags: True when the client is using the shared
 *                     page_ext->flags field.
 *
 * Each Page Extension client must define page_ext_operations in
 * page_ext_ops array.
 */
struct page_ext_operations {
        size_t offset;
        size_t size;
        bool (*need)(void);
        void (*init)(void);
        bool need_shared_flags;
};

/*
 * Flags kept in the shared page_ext->flags field. Users of page_ext_flags
 * must set need_shared_flags to true in their page_ext_operations.
 *
 * PAGE_EXT_YOUNG and PAGE_EXT_IDLE are only defined when
 * CONFIG_PAGE_IDLE_FLAG is enabled on a non-64-bit build.
 */
enum page_ext_flags {
        PAGE_EXT_OWNER,
        PAGE_EXT_OWNER_ALLOCATED,
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
        PAGE_EXT_YOUNG,
        PAGE_EXT_IDLE,
#endif
};

/*
 * Page Extension can be considered an extended mem_map.
 * A page_ext is associated with every page descriptor and lets us attach
 * more information to the page.
 * All page_ext are allocated at boot or on a memory hotplug event, so the
 * page_ext for a pfn always exists.
 */
struct page_ext {
        unsigned long flags;
};

/* Value returned by early_page_ext_enabled(). */
extern bool early_page_ext;
/* Byte stride between consecutive page_ext entries (see page_ext_next()). */
extern unsigned long page_ext_size;
extern void pgdat_page_ext_init(struct pglist_data *pgdat);

/* Report the current value of the early_page_ext switch. */
static inline bool early_page_ext_enabled(void)
{
        const bool enabled = early_page_ext;

        return enabled;
}

#ifdef CONFIG_SPARSEMEM
/* CONFIG_SPARSEMEM build: the flatmem initializer is an empty stub. */
static inline void page_ext_init_flatmem(void)
{
        /* intentionally empty */
}
/* CONFIG_SPARSEMEM build: page_ext_init() is a real, out-of-line function. */
extern void page_ext_init(void);
/* CONFIG_SPARSEMEM build: the late flatmem initializer is an empty stub. */
static inline void page_ext_init_flatmem_late(void)
{
        /* intentionally empty */
}

/*
 * CONFIG_SPARSEMEM build: page_ext is allocated per memory section, so
 * the next entry can be reached by plain pointer stepping only while
 * @next_pfn stays inside the current section. When @next_pfn is the first
 * pfn of a section (a multiple of PAGES_PER_SECTION) the pointer has to be
 * fetched again and this returns false.
 */
static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn)
{
        return (next_pfn % PAGES_PER_SECTION) != 0;
}
#else
/* !CONFIG_SPARSEMEM build: the flatmem initializers are real functions. */
extern void page_ext_init_flatmem(void);
extern void page_ext_init_flatmem_late(void);
/* !CONFIG_SPARSEMEM build: page_ext_init() is an empty stub. */
static inline void page_ext_init(void)
{
        /* intentionally empty */
}

/*
 * !CONFIG_SPARSEMEM build: stepping to the next page_ext by pointer
 * arithmetic is always allowed, whatever @next_pfn is.
 */
static inline bool page_ext_iter_next_fast_possible(unsigned long next_pfn)
{
        (void)next_pfn;

        return true;
}
#endif

/*
 * page_ext accessors implemented out of line.
 * NOTE(review): page_ext_get()/page_ext_put() look like an acquire/release
 * pair; confirm the exact locking contract against their definitions.
 */
extern struct page_ext *page_ext_get(const struct page *page);
extern void page_ext_put(struct page_ext *page_ext);
/* Used by the iterator helpers below to map a pfn to its page_ext. */
extern struct page_ext *page_ext_lookup(unsigned long pfn);

/*
 * Return a pointer to a client's private data inside @page_ext, i.e. the
 * address of @page_ext advanced by @ops->offset bytes.
 */
static inline void *page_ext_data(struct page_ext *page_ext,
                                  struct page_ext_operations *ops)
{
        char *base = (char *)page_ext;

        return base + ops->offset;
}

/*
 * Step from @curr to the entry that follows it: entries are page_ext_size
 * bytes apart, so the result is @curr advanced by that many bytes.
 */
static inline struct page_ext *page_ext_next(struct page_ext *curr)
{
        return (struct page_ext *)((char *)curr + page_ext_size);
}

/*
 * Iterator state used by page_ext_iter_begin()/page_ext_iter_next():
 * @index is the number of steps taken from @start_pfn, and @page_ext is
 * the entry for pfn (@start_pfn + @index), or NULL.
 */
struct page_ext_iter {
        unsigned long index;
        unsigned long start_pfn;
        struct page_ext *page_ext;
};

/**
 * page_ext_iter_begin() - Prepare for iterating through page extensions.
 * @iter: page extension iterator.
 * @pfn: PFN of the page we're interested in.
 *
 * Resets the iterator to index 0, remembers @pfn as the starting point
 * and looks up the page_ext for @pfn.
 *
 * Must be called with RCU read lock taken.
 *
 * Return: NULL if no page_ext exists for this page.
 */
static inline struct page_ext *page_ext_iter_begin(struct page_ext_iter *iter,
                                                unsigned long pfn)
{
        struct page_ext *first = page_ext_lookup(pfn);

        iter->start_pfn = pfn;
        iter->index = 0;
        iter->page_ext = first;

        return first;
}

/**
 * page_ext_iter_next() - Get next page extension
 * @iter: page extension iterator.
 *
 * Advances the iterator by one pfn. The next entry is reached by pointer
 * stepping when page_ext_iter_next_fast_possible() allows it, and by a
 * fresh page_ext_lookup() otherwise. Warns once and returns NULL if the
 * iterator has no current page_ext.
 *
 * Must be called with RCU read lock taken.
 *
 * Return: NULL if no next page_ext exists.
 */
static inline struct page_ext *page_ext_iter_next(struct page_ext_iter *iter)
{
        struct page_ext *next;
        unsigned long next_pfn;

        if (WARN_ON_ONCE(!iter->page_ext))
                return NULL;

        iter->index += 1;
        next_pfn = iter->start_pfn + iter->index;

        next = page_ext_iter_next_fast_possible(next_pfn) ?
               page_ext_next(iter->page_ext) : page_ext_lookup(next_pfn);

        iter->page_ext = next;
        return next;
}

/**
 * page_ext_iter_get() - Get current page extension
 * @iter: page extension iterator.
 *
 * Does not advance the iterator.
 *
 * Return: NULL if no page_ext exists for this iterator.
 */
static inline struct page_ext *page_ext_iter_get(const struct page_ext_iter *iter)
{
        struct page_ext *current_ext = iter->page_ext;

        return current_ext;
}

/**
 * for_each_page_ext(): iterate through page_ext objects.
 * @__page: the page we're interested in
 * @__pgcount: how many pages to iterate through
 * @__page_ext: struct page_ext pointer where the current page_ext
 *              object is returned
 * @__iter: struct page_ext_iter object (defined in the stack)
 *
 * The loop ends when @__pgcount entries have been visited or as soon as
 * a page_ext lookup returns NULL, whichever comes first. @__pgcount is
 * evaluated on every iteration, so it should be free of side effects.
 *
 * Every macro argument is parenthesized in the expansion so that
 * expression arguments (for example a conditional expression passed as
 * @__pgcount) keep their intended meaning.
 *
 * IMPORTANT: must be called with RCU read lock taken.
 */
#define for_each_page_ext(__page, __pgcount, __page_ext, __iter) \
        for ((__page_ext) = page_ext_iter_begin(&(__iter), page_to_pfn(__page));\
                (__page_ext) && (__iter).index < (__pgcount);    \
                (__page_ext) = page_ext_iter_next(&(__iter)))

#else /* !CONFIG_PAGE_EXTENSION */
struct page_ext;

/* !CONFIG_PAGE_EXTENSION: early page_ext can never be enabled. */
static inline bool early_page_ext_enabled(void)
{
        const bool enabled = false;

        return enabled;
}

/* !CONFIG_PAGE_EXTENSION: nothing to set up for @pgdat. */
static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
{
        /* intentionally empty */
}

/* !CONFIG_PAGE_EXTENSION: page_ext_init() is an empty stub. */
static inline void page_ext_init(void)
{
        /* intentionally empty */
}

/* !CONFIG_PAGE_EXTENSION: page_ext_init_flatmem_late() is an empty stub. */
static inline void page_ext_init_flatmem_late(void)
{
        /* intentionally empty */
}

/* !CONFIG_PAGE_EXTENSION: page_ext_init_flatmem() is an empty stub. */
static inline void page_ext_init_flatmem(void)
{
        /* intentionally empty */
}

/* !CONFIG_PAGE_EXTENSION: no page has a page_ext, so always return NULL. */
static inline struct page_ext *page_ext_get(const struct page *page)
{
        struct page_ext *none = NULL;

        return none;
}

/* !CONFIG_PAGE_EXTENSION: nothing to release. */
static inline void page_ext_put(struct page_ext *page_ext)
{
        /* intentionally empty */
}
#endif /* CONFIG_PAGE_EXTENSION */
#endif /* __LINUX_PAGE_EXT_H */






























   18 














    2 

















































    3 

















    3 





    2 






    3 





























   10 

























   11 










   11 



    4 

    1 

    1 



    1 

    1 




    1 




    1 














    4 




































































































































































    3 










    1 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
// SPDX-License-Identifier: GPL-2.0
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
#include <linux/mnt_namespace.h>

#include "mount.h"
#include "internal.h"

/* The nsfs mount; passed to path_from_stashed() and compared in ns_match(). */
static struct vfsmount *nsfs_mnt;

/* Forward declaration; ns_ioctl() is defined further down in this file. */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
                        unsigned long arg);
/*
 * File operations installed on nsfs inodes by nsfs_init_inode(). Only
 * ioctl is provided; compat callers go through compat_ptr_ioctl.
 */
static const struct file_operations ns_file_operations = {
        .unlocked_ioctl = ns_ioctl,
        .compat_ioctl   = compat_ptr_ioctl,
};

/*
 * d_dname hook: render the dentry name as "<ns type name>:[<inode number>]"
 * into @buffer via dynamic_dname().
 */
static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        struct ns_common *ns = inode->i_private;

        return dynamic_dname(buffer, buflen, "%s:[%lu]",
                             ns->ops->name, inode->i_ino);
}

/* Dentry operations for nsfs, installed by nsfs_init_fs_context(). */
const struct dentry_operations ns_dentry_operations = {
        .d_prune = stashed_dentry_prune,
        .d_dname = ns_dname,
};

/*
 * evict_inode hook: clear the inode, then drop the namespace reference
 * that was stored in i_private. The pointer is read before clear_inode().
 */
static void nsfs_evict(struct inode *inode)
{
        struct ns_common *held = inode->i_private;

        clear_inode(inode);
        held->ops->put(held);
}

/*
 * Obtain a namespace through @ns_get_cb(@private_data) and resolve it to
 * an nsfs path stored in @path.
 *
 * Returns -ENOENT if the callback yields no namespace, otherwise the
 * result of path_from_stashed().
 */
int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
                     void *private_data)
{
        struct ns_common *ns = ns_get_cb(private_data);

        if (ns)
                return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);

        return -ENOENT;
}

/*
 * Argument bundle handed from ns_get_path() to ns_get_path_task(): the
 * namespace operations to use and the task to query.
 */
struct ns_get_path_task_args {
        const struct proc_ns_operations *ns_ops;
        struct task_struct *task;
};

static struct ns_common *ns_get_path_task(void *private_data)
{
        struct ns_get_path_task_args *args = private_data;

        return args->ns_ops->get(args->task);
}

int ns_get_path(struct path *path, struct task_struct *task,
                  const struct proc_ns_operations *ns_ops)
{
        struct ns_get_path_task_args args = {
                .ns_ops        = ns_ops,
                .task        = task,
        };

        return ns_get_path_cb(path, ns_get_path_task, &args);
}

/**
 * open_namespace - open a namespace
 * @ns: the namespace to open
 *
 * This will consume a reference to @ns indendent of success or failure.
 *
 * Return: A file descriptor on success or a negative error code on failure.
 */
int open_namespace(struct ns_common *ns)
{
        struct path path __free(path_put) = {};
        struct file *f;
        int err;

        /* call first to consume reference */
        err = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
        if (err < 0)
                return err;

        CLASS(get_unused_fd, fd)(O_CLOEXEC);
        if (fd < 0)
                return fd;

        f = dentry_open(&path, O_RDONLY, current_cred());
        if (IS_ERR(f))
                return PTR_ERR(f);

        fd_install(fd, f);
        return take_fd(fd);
}

/*
 * Open a namespace related to @ns. @get_ns maps @ns to the related
 * namespace; an ERR_PTR result is returned as its error code, anything
 * else is handed to open_namespace().
 */
int open_related_ns(struct ns_common *ns,
                   struct ns_common *(*get_ns)(struct ns_common *ns))
{
        struct ns_common *related = get_ns(ns);

        if (!IS_ERR(related))
                return open_namespace(related);

        return PTR_ERR(related);
}
EXPORT_SYMBOL_GPL(open_related_ns);

/*
 * Fill @kinfo from @mnt_ns and copy it to @uinfo.
 *
 * Returns 0 on success or -EFAULT if the copy to user space fails.
 */
static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns,
                                struct mnt_ns_info __user *uinfo, size_t usize,
                                struct mnt_ns_info *kinfo)
{
        /*
         * If userspace and the kernel have the same struct size it can just
         * be copied. If userspace provides an older struct, only the bits that
         * userspace knows about will be copied. If userspace provides a new
         * struct, only the bits that the kernel knows about will be copied and
         * the size value will be set to the size the kernel knows about.
         */
        kinfo->size             = min(usize, sizeof(*kinfo));
        kinfo->mnt_ns_id        = mnt_ns->seq;

        /* Report the mount count without the namespace's root mount. */
        kinfo->nr_mounts        = READ_ONCE(mnt_ns->nr_mounts);
        if (kinfo->nr_mounts)
                kinfo->nr_mounts--;

        return copy_to_user(uinfo, kinfo, kinfo->size) ? -EFAULT : 0;
}

/*
 * Decide whether @cmd is an ioctl that nsfs handles.
 *
 * Fixed-size ioctls are matched on the complete command value, so a match
 * already proves that type, number, direction and size are all right.
 *
 * Extensible ioctls are matched on their number only, because the size
 * encoded in @cmd may legitimately differ between userspace and kernel.
 * For those the ioctl type must be checked explicitly against the type of
 * the corresponding nsfs command; otherwise a foreign ioctl that merely
 * shares the number would be accepted.
 *
 * Return: true if @cmd should be processed by ns_ioctl(), false otherwise.
 */
static bool nsfs_ioctl_valid(unsigned int cmd)
{
        switch (cmd) {
        case NS_GET_USERNS:
        case NS_GET_PARENT:
        case NS_GET_NSTYPE:
        case NS_GET_OWNER_UID:
        case NS_GET_MNTNS_ID:
        case NS_GET_PID_FROM_PIDNS:
        case NS_GET_TGID_FROM_PIDNS:
        case NS_GET_PID_IN_PIDNS:
        case NS_GET_TGID_IN_PIDNS:
                /* Exact match on the whole command word. */
                return true;
        }

        /* Extensible ioctls require some extra handling. */
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(NS_MNT_GET_INFO):
                return _IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_INFO);
        case _IOC_NR(NS_MNT_GET_NEXT):
                return _IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_NEXT);
        case _IOC_NR(NS_MNT_GET_PREV):
                return _IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_PREV);
        }

        return false;
}

/*
 * ioctl handler for nsfs files.
 *
 * Commands rejected by nsfs_ioctl_valid() yield -ENOIOCTLCMD. The
 * fixed-size commands are dispatched on the full command value:
 *
 *  - NS_GET_USERNS / NS_GET_PARENT: open the owning / parent namespace and
 *    return a new fd (-EINVAL if the namespace type has no get_parent).
 *  - NS_GET_NSTYPE: return the namespace type.
 *  - NS_GET_OWNER_UID: user namespaces only; store the owner uid, mapped
 *    into the caller's user namespace, at the user pointer in @arg.
 *  - NS_GET_MNTNS_ID: mount namespaces only; store the namespace's seq
 *    value at the user pointer in @arg.
 *  - NS_GET_{PID,TGID}_{FROM,IN}_PIDNS: pid namespaces only; translate the
 *    pid in @arg between the caller's pid namespace and this one. Returns
 *    -ESRCH if the task cannot be found or has no id in the target
 *    namespace.
 *
 * The extensible commands are dispatched on the command number, with the
 * user-supplied struct size taken from the command word:
 *
 *  - NS_MNT_GET_INFO: copy mount namespace info to the user buffer.
 *  - NS_MNT_GET_NEXT / NS_MNT_GET_PREV: open the next / previous mount
 *    namespace, optionally copy its info to the user buffer, and return a
 *    new O_CLOEXEC fd.
 *
 * Return: command specific non-negative value on success, negative errno
 * on failure.
 */
static long ns_ioctl(struct file *filp, unsigned int ioctl,
                        unsigned long arg)
{
        struct user_namespace *user_ns;
        struct pid_namespace *pid_ns;
        struct task_struct *tsk;
        struct ns_common *ns;
        struct mnt_namespace *mnt_ns;
        bool previous = false;
        uid_t __user *argp;
        uid_t uid;
        int ret;

        if (!nsfs_ioctl_valid(ioctl))
                return -ENOIOCTLCMD;

        ns = get_proc_ns(file_inode(filp));
        switch (ioctl) {
        case NS_GET_USERNS:
                return open_related_ns(ns, ns_get_owner);
        case NS_GET_PARENT:
                if (!ns->ops->get_parent)
                        return -EINVAL;
                return open_related_ns(ns, ns->ops->get_parent);
        case NS_GET_NSTYPE:
                return ns->ops->type;
        case NS_GET_OWNER_UID:
                if (ns->ops->type != CLONE_NEWUSER)
                        return -EINVAL;
                user_ns = container_of(ns, struct user_namespace, ns);
                argp = (uid_t __user *) arg;
                uid = from_kuid_munged(current_user_ns(), user_ns->owner);
                return put_user(uid, argp);
        case NS_GET_MNTNS_ID: {
                __u64 __user *idp;
                __u64 id;

                if (ns->ops->type != CLONE_NEWNS)
                        return -EINVAL;

                mnt_ns = container_of(ns, struct mnt_namespace, ns);
                idp = (__u64 __user *)arg;
                id = mnt_ns->seq;
                return put_user(id, idp);
        }
        case NS_GET_PID_FROM_PIDNS:
                fallthrough;
        case NS_GET_TGID_FROM_PIDNS:
                fallthrough;
        case NS_GET_PID_IN_PIDNS:
                fallthrough;
        case NS_GET_TGID_IN_PIDNS: {
                if (ns->ops->type != CLONE_NEWPID)
                        return -EINVAL;

                pid_ns = container_of(ns, struct pid_namespace, ns);

                /* The task lookup and id translation run under RCU. */
                guard(rcu)();

                /*
                 * *_IN_PIDNS: @arg is a pid as seen by the caller.
                 * *_FROM_PIDNS: @arg is a pid as seen inside @pid_ns.
                 */
                if (ioctl == NS_GET_PID_IN_PIDNS ||
                    ioctl == NS_GET_TGID_IN_PIDNS)
                        tsk = find_task_by_vpid(arg);
                else
                        tsk = find_task_by_pid_ns(arg, pid_ns);
                /*
                 * Return right away: leaving this switch with a break
                 * would fall into the extensible ioctl dispatch below,
                 * which would replace -ESRCH with its own result.
                 */
                if (!tsk)
                        return -ESRCH;

                switch (ioctl) {
                case NS_GET_PID_FROM_PIDNS:
                        ret = task_pid_vnr(tsk);
                        break;
                case NS_GET_TGID_FROM_PIDNS:
                        ret = task_tgid_vnr(tsk);
                        break;
                case NS_GET_PID_IN_PIDNS:
                        ret = task_pid_nr_ns(tsk, pid_ns);
                        break;
                case NS_GET_TGID_IN_PIDNS:
                        ret = task_tgid_nr_ns(tsk, pid_ns);
                        break;
                default:
                        ret = 0;
                        break;
                }

                /* An id of 0 means the task is not visible over there. */
                if (!ret)
                        ret = -ESRCH;
                return ret;
        }
        }

        /* extensible ioctls */
        switch (_IOC_NR(ioctl)) {
        case _IOC_NR(NS_MNT_GET_INFO): {
                struct mnt_ns_info kinfo = {};
                struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
                size_t usize = _IOC_SIZE(ioctl);

                if (ns->ops->type != CLONE_NEWNS)
                        return -EINVAL;

                if (!uinfo)
                        return -EINVAL;

                if (usize < MNT_NS_INFO_SIZE_VER0)
                        return -EINVAL;

                return copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
        }
        case _IOC_NR(NS_MNT_GET_PREV):
                previous = true;
                fallthrough;
        case _IOC_NR(NS_MNT_GET_NEXT): {
                struct mnt_ns_info kinfo = {};
                struct mnt_ns_info __user *uinfo = (struct mnt_ns_info __user *)arg;
                struct path path __free(path_put) = {};
                struct file *f __free(fput) = NULL;
                size_t usize = _IOC_SIZE(ioctl);

                if (ns->ops->type != CLONE_NEWNS)
                        return -EINVAL;

                if (usize < MNT_NS_INFO_SIZE_VER0)
                        return -EINVAL;

                mnt_ns = get_sequential_mnt_ns(to_mnt_ns(ns), previous);
                if (IS_ERR(mnt_ns))
                        return PTR_ERR(mnt_ns);

                ns = to_ns_common(mnt_ns);
                /* Transfer ownership of @mnt_ns reference to @path. */
                ret = path_from_stashed(&ns->stashed, nsfs_mnt, ns, &path);
                if (ret)
                        return ret;

                CLASS(get_unused_fd, fd)(O_CLOEXEC);
                if (fd < 0)
                        return fd;

                f = dentry_open(&path, O_RDONLY, current_cred());
                if (IS_ERR(f))
                        return PTR_ERR(f);

                if (uinfo) {
                        /*
                         * If @uinfo is passed return all information about the
                         * mount namespace as well.
                         */
                        ret = copy_ns_info_to_user(to_mnt_ns(ns), uinfo, usize, &kinfo);
                        if (ret)
                                return ret;
                }

                /* Transfer reference of @f to caller's fdtable. */
                fd_install(fd, no_free_ptr(f));
                /* File descriptor is live so hand it off to the caller. */
                return take_fd(fd);
        }
        default:
                ret = -ENOTTY;
        }

        return ret;
}

/*
 * Format the name of @task's namespace selected by @ns_ops into @buf as
 * "<name>:[<inum>]", preferring ns_ops->real_ns_name over ns_ops->name
 * when it is set.
 *
 * Returns the snprintf() result, or -ENOENT if the task has no such
 * namespace. The namespace reference taken by ->get() is dropped again.
 */
int ns_get_name(char *buf, size_t size, struct task_struct *task,
                        const struct proc_ns_operations *ns_ops)
{
        struct ns_common *ns = ns_ops->get(task);
        const char *label;
        int len;

        if (!ns)
                return -ENOENT;

        label = ns_ops->real_ns_name ? ns_ops->real_ns_name : ns_ops->name;
        len = snprintf(buf, size, "%s:[%u]", label, ns->inum);
        ns_ops->put(ns);

        return len;
}

bool proc_ns_file(const struct file *file)
{
        return file->f_op == &ns_file_operations;
}

/**
 * ns_match() - Check whether a namespace corresponds to an nsfs dev/ino pair.
 * @ns: namespace to test
 * @dev: device number, compared against the s_dev of the nsfs superblock
 * @ino: inode number, compared against @ns->inum
 *
 * Return: true only if both the inode number and the device number match.
 */
bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
{
        if (ns->inum != ino)
                return false;

        return nsfs_mnt->mnt_sb->s_dev == dev;
}


/*
 * ->show_path() for nsfs: print "<ops name>:[<inode number>]" for the
 * namespace stored in the inode's i_private.
 */
static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
{
        struct inode *ino = d_inode(dentry);
        const struct ns_common *common = ino->i_private;

        seq_printf(seq, "%s:[%lu]", common->ops->name, ino->i_ino);
        return 0;
}

/* Superblock operations; installed as ctx->ops by nsfs_init_fs_context(). */
static const struct super_operations nsfs_ops = {
        .statfs = simple_statfs,
        .evict_inode = nsfs_evict,
        .show_path = nsfs_show_path,
};

/*
 * ->init_inode() callback of nsfs_stashed_ops. @data is the struct ns_common
 * the inode represents: it is kept in i_private and its inum becomes the
 * inode number. Always returns 0.
 */
static int nsfs_init_inode(struct inode *inode, void *data)
{
        struct ns_common *common = data;

        inode->i_ino = common->inum;
        inode->i_fop = &ns_file_operations;
        inode->i_mode |= S_IRUGO;
        inode->i_private = common;

        return 0;
}

static void nsfs_put_data(void *data)
{
        struct ns_common *ns = data;
        ns->ops->put(ns);
}

/* Stashed-dentry callbacks; stored in fc->s_fs_info by nsfs_init_fs_context(). */
static const struct stashed_operations nsfs_stashed_ops = {
        .init_inode = nsfs_init_inode,
        .put_data = nsfs_put_data,
};

static int nsfs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
        if (!ctx)
                return -ENOMEM;
        ctx->ops = &nsfs_ops;
        ctx->dops = &ns_dentry_operations;
        fc->s_fs_info = (void *)&nsfs_stashed_ops;
        return 0;
}

/* The "nsfs" filesystem type; mounted internally by nsfs_init() via kern_mount(). */
static struct file_system_type nsfs = {
        .name = "nsfs",
        .init_fs_context = nsfs_init_fs_context,
        .kill_sb = kill_anon_super,
};

/*
 * Mount nsfs internally and remember the mount in nsfs_mnt. A failed mount
 * panics; on success SB_NOUSER is cleared on the superblock.
 */
void __init nsfs_init(void)
{
        struct vfsmount *mnt = kern_mount(&nsfs);

        nsfs_mnt = mnt;
        if (IS_ERR(mnt))
                panic("can't set nsfs up\n");

        mnt->mnt_sb->s_flags &= ~SB_NOUSER;
}









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 
































































































































































































































































































   21 





   21 


   21 




































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  linux/drivers/net/netconsole.c
 *
 *  Copyright (C) 2001  Ingo Molnar <mingo@redhat.com>
 *
 *  This file contains the implementation of an IRQ-safe, crash-safe
 *  kernel console implementation that outputs kernel messages to the
 *  network.
 *
 * Modification history:
 *
 * 2001-09-17    started by Ingo Molnar.
 * 2003-08-11    2.6 port by Matt Mackall
 *               simplified options
 *               generic card hooks
 *               works non-modular
 * 2003-09-07    rewritten with netpoll api
 */

/****************************************************************
 *
 ****************************************************************/

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/console.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/netpoll.h>
#include <linux/inet.h>
#include <linux/configfs.h>
#include <linux/etherdevice.h>
#include <linux/u64_stats_sync.h>
#include <linux/utsname.h>
#include <linux/rtnetlink.h>

MODULE_AUTHOR("Matt Mackall <mpm@selenic.com>");
MODULE_DESCRIPTION("Console driver for network interfaces");
MODULE_LICENSE("GPL");

#define MAX_PARAM_LENGTH                256
#define MAX_EXTRADATA_ENTRY_LEN                256
#define MAX_EXTRADATA_VALUE_LEN        200
/* The number 3 comes from userdata entry format characters (' ', '=', '\n') */
#define MAX_EXTRADATA_NAME_LEN                (MAX_EXTRADATA_ENTRY_LEN - \
                                        MAX_EXTRADATA_VALUE_LEN - 3)
#define MAX_EXTRADATA_ITEMS                16
#define MAX_PRINT_CHUNK                        1000

/* Raw "netconsole=" option string; filled from the module parameter or, for
 * built-in kernels, by option_setup().
 */
static char config[MAX_PARAM_LENGTH];
module_param_string(netconsole, config, MAX_PARAM_LENGTH, 0);
MODULE_PARM_DESC(netconsole, " netconsole=[src-port]@[src-ip]/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]");

/* Runtime-writable (mode 0600) switch; see the parameter description below */
static bool oops_only;
module_param(oops_only, bool, 0600);
MODULE_PARM_DESC(oops_only, "Only log oops messages");

/* NOTE(review): presumably the name prefix given to targets created from the
 * command line -- its users are outside this part of the file; confirm.
 */
#define NETCONSOLE_PARAM_TARGET_PREFIX "cmdline"

#ifndef        MODULE
/* __setup() handler for "netconsole=": stash the option string in config[]. */
static int __init option_setup(char *opt)
{
        /* config[] is MAX_PARAM_LENGTH bytes long */
        strscpy(config, opt, sizeof(config));

        return 1;
}
__setup("netconsole=", option_setup);
#endif        /* MODULE */

/* Linked list of all configured targets */
static LIST_HEAD(target_list);
/* target_cleanup_list is used to track targets that need to be cleaned up
 * outside of target_list_lock. It should be emptied in the same function
 * that populates it.
 */
static LIST_HEAD(target_cleanup_list);

/* This needs to be a spinlock because write_msg() cannot sleep */
static DEFINE_SPINLOCK(target_list_lock);
/* This needs to be a mutex because netpoll_cleanup might sleep */
static DEFINE_MUTEX(target_cleanup_list_lock);

/*
 * Console driver for extended netconsoles.  Registered on first use to
 * avoid unnecessarily enabling ext message formatting.
 */
static struct console netconsole_ext;

/* Per-target counters. transmit_errors_show() reports the sum of both
 * counters, reading them inside a u64_stats_fetch_begin()/retry() section
 * on @syncp.
 */
struct netconsole_target_stats  {
        u64_stats_t xmit_drop_count;
        u64_stats_t enomem_count;
        struct u64_stats_sync syncp;
};

/* Features enabled in sysdata. Contrary to userdata, this data is populated by
 * the kernel. The fields are designed as bitwise flags, allowing multiple
 * features to be set in sysdata_fields (netconsole_target.sysdata_fields is
 * tested against these bits).
 */
enum sysdata_feature {
        /* Populate the CPU that sends the message */
        SYSDATA_CPU_NR = BIT(0),
        /* Populate the task name (as in current->comm) in sysdata */
        SYSDATA_TASKNAME = BIT(1),
        /* Kernel release/version as part of sysdata */
        SYSDATA_RELEASE = BIT(2),
};

/**
 * struct netconsole_target - Represents a configured netconsole target.
 * @list:        Links this target into the target_list (or, while being
 *                cleaned up, into target_cleanup_list).
 * @group:        Links us into the configfs subsystem hierarchy.
 * @userdata_group:        Links to the userdata configfs hierarchy
 * @extradata_complete:        Cached, formatted string of the appended extradata;
 *                sized for MAX_EXTRADATA_ITEMS entries of
 *                MAX_EXTRADATA_ENTRY_LEN bytes each.
 * @userdata_length:        String length of userdata in extradata_complete.
 * @sysdata_fields:        Sysdata features enabled (enum sysdata_feature bits).
 * @stats:        Packet send stats for the target. Used for debugging.
 * @enabled:        On / off knob to enable / disable target.
 *                Visible from userspace (read-write).
 *                We maintain a strict 1:1 correspondence between this and
 *                whether the corresponding netpoll is active or inactive.
 *                Also, other parameters of a target may be modified at
 *                runtime only when it is disabled (enabled == 0).
 * @extended:        Denotes whether console is extended or not.
 * @release:        Denotes whether kernel release version should be prepended
 *                to the message. Depends on extended console.
 * @np:                The netpoll structure for this target.
 *                Contains the other userspace visible parameters:
 *                dev_name        (read-write)
 *                local_port        (read-write)
 *                remote_port        (read-write)
 *                local_ip        (read-write)
 *                remote_ip        (read-write)
 *                local_mac        (read-only)
 *                remote_mac        (read-write)
 * @buf:        The buffer used to send the full msg to the network stack
 *                (MAX_PRINT_CHUNK bytes, protected by target_list_lock).
 */
struct netconsole_target {
        struct list_head        list;
#ifdef        CONFIG_NETCONSOLE_DYNAMIC
        struct config_group        group;
        struct config_group        userdata_group;
        char extradata_complete[MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS];
        size_t                        userdata_length;
        /* bit-wise with sysdata_feature bits */
        u32                        sysdata_fields;
#endif
        struct netconsole_target_stats stats;
        bool                        enabled;
        bool                        extended;
        bool                        release;
        struct netpoll                np;
        /* protected by target_list_lock */
        char                        buf[MAX_PRINT_CHUNK];
};

#ifdef        CONFIG_NETCONSOLE_DYNAMIC

static struct configfs_subsystem netconsole_subsys;
static DEFINE_MUTEX(dynamic_netconsole_mutex);

/*
 * Initialise the netconsole configfs subsystem (root group and mutex) and
 * register it. Returns the configfs_register_subsystem() result.
 */
static int __init dynamic_netconsole_init(void)
{
        int err;

        config_group_init(&netconsole_subsys.su_group);
        mutex_init(&netconsole_subsys.su_mutex);

        err = configfs_register_subsystem(&netconsole_subsys);
        return err;
}

/* Unregister the netconsole configfs subsystem. */
static void __exit dynamic_netconsole_exit(void)
{
        configfs_unregister_subsystem(&netconsole_subsys);
}

/*
 * Take a configfs reference on @nt's group.
 *
 * Targets created by parsing the boot/module option string are not part of
 * the configfs hierarchy (their item name is NULL) and never go away, so
 * nothing is done for them.
 */
static void netconsole_target_get(struct netconsole_target *nt)
{
        if (!config_item_name(&nt->group.cg_item))
                return;

        config_group_get(&nt->group);
}

/*
 * Drop a configfs reference on @nt's group. Does nothing for targets whose
 * config item has no name.
 */
static void netconsole_target_put(struct netconsole_target *nt)
{
        if (!config_item_name(&nt->group.cg_item))
                return;

        config_group_put(&nt->group);
}

#else        /* !CONFIG_NETCONSOLE_DYNAMIC */

/* Without CONFIG_NETCONSOLE_DYNAMIC there is nothing to set up; report success. */
static int __init dynamic_netconsole_init(void)
{
        return 0;
}

/* Without CONFIG_NETCONSOLE_DYNAMIC there is nothing to tear down. */
static void __exit dynamic_netconsole_exit(void)
{
}

/*
 * No danger of targets going away from under us when dynamic
 * reconfigurability is off, so taking a reference is a no-op.
 */
static void netconsole_target_get(struct netconsole_target *nt)
{
}

/* No-op when CONFIG_NETCONSOLE_DYNAMIC is off; there is no reference to drop. */
static void netconsole_target_put(struct netconsole_target *nt)
{
}

/* No-op when CONFIG_NETCONSOLE_DYNAMIC is off. */
static void populate_configfs_item(struct netconsole_target *nt,
                                   int cmdline_count)
{
}
#endif        /* CONFIG_NETCONSOLE_DYNAMIC */

/* Allocate and initialize with defaults.
 * Note that these targets get their config_item fields zeroed-out.
 */
static struct netconsole_target *alloc_and_init(void)
{
        struct netconsole_target *nt;

        nt = kzalloc(sizeof(*nt), GFP_KERNEL);
        if (!nt)
                return nt;

        if (IS_ENABLED(CONFIG_NETCONSOLE_EXTENDED_LOG))
                nt->extended = true;
        if (IS_ENABLED(CONFIG_NETCONSOLE_PREPEND_RELEASE))
                nt->release = true;

        nt->np.name = "netconsole";
        strscpy(nt->np.dev_name, "eth0", IFNAMSIZ);
        nt->np.local_port = 6665;
        nt->np.remote_port = 6666;
        eth_broadcast_addr(nt->np.remote_mac);

        return nt;
}

/* Clean up every target in the cleanup_list and move the cleaned targets back
 * to the main target_list.
 *
 * Must be called with RTNL held (asserted below). Takes
 * target_cleanup_list_lock for the whole walk and target_list_lock around
 * each move, so the cleanup list is expected to be empty on return.
 */
static void netconsole_process_cleanups_core(void)
{
        struct netconsole_target *nt, *tmp;
        unsigned long flags;

        /* The cleanup needs RTNL locked */
        ASSERT_RTNL();

        mutex_lock(&target_cleanup_list_lock);
        /* _safe variant: each entry is moved off the list inside the loop */
        list_for_each_entry_safe(nt, tmp, &target_cleanup_list, list) {
                /* all entries in the cleanup_list need to be disabled */
                WARN_ON_ONCE(nt->enabled);
                /* Done outside target_list_lock (a spinlock) */
                do_netpoll_cleanup(&nt->np);
                /* move the cleaned target to target_list. Need to hold both
                 * locks
                 */
                spin_lock_irqsave(&target_list_lock, flags);
                list_move(&nt->list, &target_list);
                spin_unlock_irqrestore(&target_list_lock, flags);
        }
        WARN_ON_ONCE(!list_empty(&target_cleanup_list));
        mutex_unlock(&target_cleanup_list_lock);
}

#ifdef        CONFIG_NETCONSOLE_DYNAMIC

/*
 * Our subsystem hierarchy is:
 *
 * /sys/kernel/config/netconsole/
 *                                |
 *                                <target>/
 *                                |        enabled
 *                                |        release
 *                                |        dev_name
 *                                |        local_port
 *                                |        remote_port
 *                                |        local_ip
 *                                |        remote_ip
 *                                |        local_mac
 *                                |        remote_mac
 *                                |        transmit_errors
 *                                |        userdata/
 *                                |                <key>/
 *                                |                        value
 *                                |                ...
 *                                |
 *                                <target>/...
 */

/*
 * Map a configfs item to the netconsole_target embedding its config_group.
 * Returns NULL when to_config_group() yields no group for @item.
 */
static struct netconsole_target *to_target(struct config_item *item)
{
        struct config_group *grp = to_config_group(item);

        return grp ? container_of(grp, struct netconsole_target, group) : NULL;
}

/* Do the list cleanup with the rtnl lock held. The rtnl lock is necessary
 * because the netdev might be cleaned up by calling __netpoll_cleanup().
 */
static void netconsole_process_cleanups(void)
{
        /* rtnl lock is taken here, because it has precedence over
         * target_cleanup_list_lock mutex and target_cleanup_list
         */
        rtnl_lock();
        netconsole_process_cleanups_core();
        rtnl_unlock();
}

/* Strip one trailing newline from @s in place, if there is one.
 *
 * @s:      string to trim; at most @maxlen bytes of it are examined
 * @maxlen: upper bound on the length scanned by strnlen()
 *
 * Nothing is returned. An empty string (or @maxlen == 0) is left untouched.
 */
static void trim_newline(char *s, size_t maxlen)
{
        size_t len = strnlen(s, maxlen);

        /* No last character to look at: s[len - 1] would be s[-1] */
        if (len == 0)
                return;

        if (s[len - 1] == '\n')
                s[len - 1] = '\0';
}

/*
 * Attribute operations for netconsole_target.
 */

static ssize_t enabled_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->enabled);
}

static ssize_t extended_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->extended);
}

static ssize_t release_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->release);
}

static ssize_t dev_name_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%s\n", to_target(item)->np.dev_name);
}

static ssize_t local_port_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->np.local_port);
}

static ssize_t remote_port_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%d\n", to_target(item)->np.remote_port);
}

/* configfs show: print the local IP, formatted as IPv6 or IPv4 per np.ipv6. */
static ssize_t local_ip_show(struct config_item *item, char *buf)
{
        struct netconsole_target *nt = to_target(item);

        if (!nt->np.ipv6)
                return sysfs_emit(buf, "%pI4\n", &nt->np.local_ip);

        return sysfs_emit(buf, "%pI6c\n", &nt->np.local_ip.in6);
}

/* configfs show: print the remote IP, formatted as IPv6 or IPv4 per np.ipv6. */
static ssize_t remote_ip_show(struct config_item *item, char *buf)
{
        struct netconsole_target *nt = to_target(item);

        if (!nt->np.ipv6)
                return sysfs_emit(buf, "%pI4\n", &nt->np.remote_ip);

        return sysfs_emit(buf, "%pI6c\n", &nt->np.remote_ip.in6);
}

/*
 * configfs show: print the MAC address of the target's net device, or the
 * broadcast address when no device is attached (np.dev == NULL).
 */
static ssize_t local_mac_show(struct config_item *item, char *buf)
{
        static const u8 bcast[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
        struct net_device *ndev = to_target(item)->np.dev;

        if (!ndev)
                return sysfs_emit(buf, "%pM\n", bcast);

        return sysfs_emit(buf, "%pM\n", ndev->dev_addr);
}

static ssize_t remote_mac_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%pM\n", to_target(item)->np.remote_mac);
}

static ssize_t transmit_errors_show(struct config_item *item, char *buf)
{
        struct netconsole_target *nt = to_target(item);
        u64 xmit_drop_count, enomem_count;
        unsigned int start;

        do {
                start = u64_stats_fetch_begin(&nt->stats.syncp);
                xmit_drop_count = u64_stats_read(&nt->stats.xmit_drop_count);
                enomem_count = u64_stats_read(&nt->stats.enomem_count);
        } while (u64_stats_fetch_retry(&nt->stats.syncp, start));

        return sysfs_emit(buf, "%llu\n", xmit_drop_count + enomem_count);
}

/* configfs helper to display if cpu_nr sysdata feature is enabled */
static ssize_t sysdata_cpu_nr_enabled_show(struct config_item *item, char *buf)
{
        struct netconsole_target *nt = to_target(item->ci_parent);
        bool cpu_nr_enabled;

        mutex_lock(&dynamic_netconsole_mutex);
        cpu_nr_enabled = !!(nt->sysdata_fields & SYSDATA_CPU_NR);
        mutex_unlock(&dynamic_netconsole_mutex);

        return sysfs_emit(buf, "%d\n", cpu_nr_enabled);
}

/* configfs helper to display if taskname sysdata feature is enabled */
static ssize_t sysdata_taskname_enabled_show(struct config_item *item,
                                             char *buf)
{
        struct netconsole_target *nt = to_target(item->ci_parent);
        bool taskname_enabled;

        mutex_lock(&dynamic_netconsole_mutex);
        taskname_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME);
        mutex_unlock(&dynamic_netconsole_mutex);

        return sysfs_emit(buf, "%d\n", taskname_enabled);
}

static ssize_t sysdata_release_enabled_show(struct config_item *item,
                                            char *buf)
{
        struct netconsole_target *nt = to_target(item->ci_parent);
        bool release_enabled;

        mutex_lock(&dynamic_netconsole_mutex);
        release_enabled = !!(nt->sysdata_fields & SYSDATA_TASKNAME);
        mutex_unlock(&dynamic_netconsole_mutex);

        return sysfs_emit(buf, "%d\n", release_enabled);
}

/*
 * This one is special -- targets created through the configfs interface
 * are not enabled (and the corresponding netpoll activated) by default.
 * The user is expected to set the desired parameters first (which
 * would enable them to dynamically add new netpoll targets for new
 * network interfaces as and when they come up).
 *
 * @buf is parsed with kstrtobool(). Returns strnlen(@buf, @count) on
 * success. Fails with the kstrtobool() or netpoll_setup() error, or with
 * -EINVAL when the target is already in the requested state or when
 * release is set without extended. Runs under dynamic_netconsole_mutex.
 */
static ssize_t enabled_store(struct config_item *item,
                const char *buf, size_t count)
{
        struct netconsole_target *nt = to_target(item);
        unsigned long flags;
        bool enabled;
        ssize_t ret;

        mutex_lock(&dynamic_netconsole_mutex);
        ret = kstrtobool(buf, &enabled);
        if (ret)
                goto out_unlock;

        /* Every failure below that does not set ret reports -EINVAL */
        ret = -EINVAL;
        if (enabled == nt->enabled) {
                pr_info("network logging has already %s\n",
                        nt->enabled ? "started" : "stopped");
                goto out_unlock;
        }

        if (enabled) {        /* true */
                if (nt->release && !nt->extended) {
                        pr_err("Not enabling netconsole. Release feature requires extended log message\n");
                        goto out_unlock;
                }

                /* The extended console is registered on first use only */
                if (nt->extended && !console_is_registered(&netconsole_ext))
                        register_console(&netconsole_ext);

                /*
                 * Skip netpoll_parse_options() -- all the attributes are
                 * already configured via configfs. Just print them out.
                 */
                netpoll_print_options(&nt->np);

                ret = netpoll_setup(&nt->np);
                if (ret)
                        goto out_unlock;

                nt->enabled = true;
                pr_info("network logging started\n");
        } else {        /* false */
                /* We need to disable the netconsole before cleaning it up
                 * otherwise we might end up in write_msg() with
                 * nt->np.dev == NULL and nt->enabled == true
                 */
                mutex_lock(&target_cleanup_list_lock);
                spin_lock_irqsave(&target_list_lock, flags);
                nt->enabled = false;
                /* Remove the target from the list, while holding
                 * target_list_lock
                 */
                list_move(&nt->list, &target_cleanup_list);
                spin_unlock_irqrestore(&target_list_lock, flags);
                mutex_unlock(&target_cleanup_list_lock);
        }

        ret = strnlen(buf, count);
        /* Deferred cleanup */
        netconsole_process_cleanups();
out_unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store for "release": parse @buf as a boolean and record it on the
 * target. Refused with -EINVAL while the target is enabled. Returns
 * strnlen(@buf, @count) on success or the kstrtobool() error.
 */
static ssize_t release_store(struct config_item *item, const char *buf,
                             size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;
        bool val;

        mutex_lock(&dynamic_netconsole_mutex);
        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        ret = kstrtobool(buf, &val);
        if (!ret) {
                nt->release = val;
                ret = strnlen(buf, count);
        }
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store for "extended": parse @buf as a boolean and record it on
 * the target. Refused with -EINVAL while the target is enabled. Returns
 * strnlen(@buf, @count) on success or the kstrtobool() error.
 */
static ssize_t extended_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret = -EINVAL;
        bool val;

        mutex_lock(&dynamic_netconsole_mutex);
        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                goto unlock;
        }

        ret = kstrtobool(buf, &val);
        if (!ret) {
                nt->extended = val;
                ret = strnlen(buf, count);
        }
unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/*
 * configfs store for "dev_name": copy @buf (at most IFNAMSIZ bytes, trailing
 * newline removed) into the target's netpoll device name. Refused with
 * -EINVAL while the target is enabled; otherwise returns
 * strnlen(@buf, @count).
 */
static ssize_t dev_name_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *nt = to_target(item);
        ssize_t ret;

        mutex_lock(&dynamic_netconsole_mutex);
        if (nt->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&nt->group.cg_item));
                ret = -EINVAL;
        } else {
                strscpy(nt->np.dev_name, buf, IFNAMSIZ);
                trim_newline(nt->np.dev_name, IFNAMSIZ);
                ret = strnlen(buf, count);
        }
        mutex_unlock(&dynamic_netconsole_mutex);

        return ret;
}

/* configfs store handler for the target's "local_port" attribute.
 *
 * Parses @buf as a base-10 u16 directly into np.local_port. Refused with
 * -EINVAL while the target is enabled. Returns strnlen(buf, count) on
 * success or the negative errno reported by kstrtou16().
 */
static ssize_t local_port_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *target = to_target(item);
        ssize_t status;

        mutex_lock(&dynamic_netconsole_mutex);

        if (target->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&target->group.cg_item));
                status = -EINVAL;
        } else {
                status = kstrtou16(buf, 10, &target->np.local_port);
                if (status >= 0)
                        status = strnlen(buf, count);
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return status;
}

/* configfs store handler for the target's "remote_port" attribute.
 *
 * Parses @buf as a base-10 u16 directly into np.remote_port. Refused with
 * -EINVAL while the target is enabled. Returns strnlen(buf, count) on
 * success or the negative errno reported by kstrtou16().
 */
static ssize_t remote_port_store(struct config_item *item,
                const char *buf, size_t count)
{
        struct netconsole_target *target = to_target(item);
        ssize_t status;

        mutex_lock(&dynamic_netconsole_mutex);

        if (target->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&target->group.cg_item));
                status = -EINVAL;
        } else {
                status = kstrtou16(buf, 10, &target->np.remote_port);
                if (status >= 0)
                        status = strnlen(buf, count);
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return status;
}

/* configfs store handler for the target's "local_ip" attribute.
 *
 * Input containing ':' is parsed as IPv6 with in6_pton(); the character
 * following the address must be NUL or a newline, and on success the target
 * is flagged as IPv6. Any other input is parsed with in_aton(), but only if
 * the target is not already flagged as IPv6.
 *
 * Refused with -EINVAL while the target is enabled or when parsing fails.
 * Returns strnlen(buf, count) on success.
 */
static ssize_t local_ip_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *target = to_target(item);
        ssize_t status = -EINVAL;

        mutex_lock(&dynamic_netconsole_mutex);
        if (target->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&target->group.cg_item));
                goto out_unlock;
        }

        if (strnchr(buf, count, ':')) {
                const char *tail;

                if (in6_pton(buf, count, target->np.local_ip.in6.s6_addr, -1,
                             &tail) <= 0)
                        goto out_unlock;

                /* Only NUL or a newline may follow the parsed address */
                if (*tail != '\0' && *tail != '\n') {
                        pr_err("invalid IPv6 address at: <%c>\n", *tail);
                        goto out_unlock;
                }
                target->np.ipv6 = true;
        } else {
                /* An IPv4 string is not accepted on an IPv6 target */
                if (target->np.ipv6)
                        goto out_unlock;
                target->np.local_ip.ip = in_aton(buf);
        }

        status = strnlen(buf, count);
out_unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return status;
}

/* configfs store handler for the target's "remote_ip" attribute.
 *
 * Input containing ':' is parsed as IPv6 with in6_pton(); the character
 * following the address must be NUL or a newline, and on success the target
 * is flagged as IPv6. Any other input is parsed with in_aton(), but only if
 * the target is not already flagged as IPv6.
 *
 * Refused with -EINVAL while the target is enabled or when parsing fails.
 * Returns strnlen(buf, count) on success.
 */
static ssize_t remote_ip_store(struct config_item *item, const char *buf,
               size_t count)
{
        struct netconsole_target *target = to_target(item);
        ssize_t status = -EINVAL;

        mutex_lock(&dynamic_netconsole_mutex);
        if (target->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&target->group.cg_item));
                goto out_unlock;
        }

        if (strnchr(buf, count, ':')) {
                const char *tail;

                if (in6_pton(buf, count, target->np.remote_ip.in6.s6_addr, -1,
                             &tail) <= 0)
                        goto out_unlock;

                /* Only NUL or a newline may follow the parsed address */
                if (*tail != '\0' && *tail != '\n') {
                        pr_err("invalid IPv6 address at: <%c>\n", *tail);
                        goto out_unlock;
                }
                target->np.ipv6 = true;
        } else {
                /* An IPv4 string is not accepted on an IPv6 target */
                if (target->np.ipv6)
                        goto out_unlock;
                target->np.remote_ip.ip = in_aton(buf);
        }

        status = strnlen(buf, count);
out_unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return status;
}

/* Count the number of entries currently present in extradata.
 * This matters because extradata_complete only has room for
 * MAX_EXTRADATA_ITEMS entries. Before enabling any new {user,sys}data
 * feature, the number of entries needs to be checked for available space.
 *
 * Returns the number of userdata children plus one for each enabled
 * sysdata field.
 */
static size_t count_extradata_entries(struct netconsole_target *nt)
{
        /* Userdata entries */
        size_t total = list_count_nodes(&nt->userdata_group.cg_children);

        /* Plus one per enabled sysdata field */
        total += !!(nt->sysdata_fields & SYSDATA_CPU_NR);
        total += !!(nt->sysdata_fields & SYSDATA_TASKNAME);
        total += !!(nt->sysdata_fields & SYSDATA_RELEASE);

        return total;
}

/* configfs store handler for the target's "remote_mac" attribute.
 *
 * Parses @buf with mac_pton() into a scratch buffer and, if the character
 * at MAC_ADDR_STR_LEN is NUL or a newline, commits it to np.remote_mac.
 * Refused with -EINVAL while the target is enabled or when the input is
 * malformed. Returns strnlen(buf, count) on success.
 */
static ssize_t remote_mac_store(struct config_item *item, const char *buf,
                size_t count)
{
        struct netconsole_target *target = to_target(item);
        ssize_t status = -EINVAL;
        u8 parsed[ETH_ALEN];

        mutex_lock(&dynamic_netconsole_mutex);
        if (target->enabled) {
                pr_err("target (%s) is enabled, disable to update parameters\n",
                       config_item_name(&target->group.cg_item));
                goto out_unlock;
        }

        /* Reject unparsable input and trailing garbage after the address */
        if (!mac_pton(buf, parsed) ||
            (buf[MAC_ADDR_STR_LEN] && buf[MAC_ADDR_STR_LEN] != '\n'))
                goto out_unlock;

        memcpy(target->np.remote_mac, parsed, ETH_ALEN);
        status = strnlen(buf, count);

out_unlock:
        mutex_unlock(&dynamic_netconsole_mutex);
        return status;
}

/* One user-defined entry created under a target's userdata group.
 * The configfs item name acts as the key and @value as the value when
 * update_userdata() formats the entry.
 */
struct userdatum {
        /* Embedded configfs item; to_userdatum() maps it back to this struct */
        struct config_item item;
        /* Value string, written by userdatum_value_store() */
        char value[MAX_EXTRADATA_VALUE_LEN];
};

/* Map a configfs item back to the struct userdatum that embeds it. */
static struct userdatum *to_userdatum(struct config_item *item)
{
        struct userdatum *udm = container_of(item, struct userdatum, item);

        return udm;
}

/* Typed view of the configfs group that holds the userdatum items.
 * It contains nothing but the group itself; see to_userdata().
 */
struct userdata {
        struct config_group group;
};

/* Map the configfs item of a userdata group to its struct userdata. */
static struct userdata *to_userdata(struct config_item *item)
{
        struct config_group *grp = to_config_group(item);

        return container_of(grp, struct userdata, group);
}

static struct netconsole_target *userdata_to_target(struct userdata *ud)
{
        struct config_group *netconsole_group;

        netconsole_group = to_config_group(ud->group.cg_item.ci_parent);
        return to_target(&netconsole_group->cg_item);
}

static ssize_t userdatum_value_show(struct config_item *item, char *buf)
{
        return sysfs_emit(buf, "%s\n", &(to_userdatum(item)->value[0]));
}

static void update_userdata(struct netconsole_target *nt)
{
        int complete_idx = 0, child_count = 0;
        struct list_head *entry;

        /* Clear the current string in case the last userdatum was deleted */
        nt->userdata_length = 0;
        nt->extradata_complete[0] = 0;

        list_for_each(entry, &nt->userdata_group.cg_children) {
                struct userdatum *udm_item;
                struct config_item *item;

                if (WARN_ON_ONCE(child_count >= MAX_EXTRADATA_ITEMS))
                        break;
                child_count++;

                item = container_of(entry, struct config_item, ci_entry);
                udm_item = to_userdatum(item);

                /* Skip userdata with no value set */
                if (strnlen(udm_item->value, MAX_EXTRADATA_VALUE_LEN) == 0)
                        continue;

                /* This doesn't overflow extradata_complete since it will write
                 * one entry length (1/MAX_EXTRADATA_ITEMS long), entry count is
                 * checked to not exceed MAX items with child_count above
                 */
                complete_idx += scnprintf(&nt->extradata_complete[complete_idx],
                                          MAX_EXTRADATA_ENTRY_LEN, " %s=%s\n",
                                          item->ci_name, udm_item->value);
        }
        nt->userdata_length = strnlen(nt->extradata_complete,
                                      sizeof(nt->extradata_complete));
}

/* configfs store handler for a userdatum "value" attribute.
 *
 * Writes longer than MAX_EXTRADATA_VALUE_LEN are rejected with -EMSGSIZE.
 * Otherwise @buf is copied into the userdatum, the trailing newline is
 * trimmed and the owning target's userdata string is rebuilt. Returns
 * @count on success or the negative value returned by strscpy().
 */
static ssize_t userdatum_value_store(struct config_item *item, const char *buf,
                                     size_t count)
{
        struct userdatum *udm = to_userdatum(item);
        ssize_t status;

        if (count > MAX_EXTRADATA_VALUE_LEN)
                return -EMSGSIZE;

        mutex_lock(&dynamic_netconsole_mutex);

        /* NOTE(review): when strscpy() fails the value buffer may already
         * hold a truncated copy while the target's extradata is not
         * refreshed - confirm this is intended.
         */
        status = strscpy(udm->value, buf, sizeof(udm->value));
        if (status >= 0) {
                struct userdata *parent = to_userdata(item->ci_parent);

                trim_newline(udm->value, sizeof(udm->value));
                update_userdata(userdata_to_target(parent));
                status = count;
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return status;
}

/* disable_sysdata_feature - Disable sysdata feature and clean sysdata
 * @nt: target that is disabling the feature
 * @feature: feature being disabled
 *
 * Clears @feature from the target's sysdata bitmap and terminates
 * extradata_complete right after the userdata, discarding any sysdata text
 * left there.
 */
static void disable_sysdata_feature(struct netconsole_target *nt,
                                    enum sysdata_feature feature)
{
        nt->sysdata_fields = nt->sysdata_fields & ~feature;
        nt->extradata_complete[nt->userdata_length] = '\0';
}

/* configfs store handler toggling the "release" sysdata field.
 *
 * Parses @buf as a boolean. Enabling fails with -ENOSPC when the target
 * already holds MAX_EXTRADATA_ITEMS extradata entries. A write that does
 * not change the current state succeeds without side effects. Returns
 * strnlen(buf, count) on success.
 */
static ssize_t sysdata_release_enabled_store(struct config_item *item,
                                             const char *buf, size_t count)
{
        struct netconsole_target *nt = to_target(item->ci_parent);
        bool want, have;
        ssize_t ret;

        ret = kstrtobool(buf, &want);
        if (ret)
                return ret;

        mutex_lock(&dynamic_netconsole_mutex);

        ret = strnlen(buf, count);
        have = (nt->sysdata_fields & SYSDATA_RELEASE) != 0;
        if (want != have) {
                if (!want)
                        disable_sysdata_feature(nt, SYSDATA_RELEASE);
                else if (count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS)
                        ret = -ENOSPC;
                else
                        nt->sysdata_fields |= SYSDATA_RELEASE;
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/* configfs store handler toggling the "taskname" sysdata field.
 *
 * Parses @buf as a boolean. Enabling fails with -ENOSPC when the target
 * already holds MAX_EXTRADATA_ITEMS extradata entries. A write that does
 * not change the current state succeeds without side effects. Returns
 * strnlen(buf, count) on success.
 */
static ssize_t sysdata_taskname_enabled_store(struct config_item *item,
                                              const char *buf, size_t count)
{
        struct netconsole_target *nt = to_target(item->ci_parent);
        bool want, have;
        ssize_t ret;

        ret = kstrtobool(buf, &want);
        if (ret)
                return ret;

        mutex_lock(&dynamic_netconsole_mutex);

        ret = strnlen(buf, count);
        have = (nt->sysdata_fields & SYSDATA_TASKNAME) != 0;
        if (want != have) {
                if (!want)
                        disable_sysdata_feature(nt, SYSDATA_TASKNAME);
                else if (count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS)
                        ret = -ENOSPC;
                else
                        nt->sysdata_fields |= SYSDATA_TASKNAME;
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/* configfs helper to sysdata cpu_nr feature.
 *
 * Parses @buf as a boolean. Enabling fails with -ENOSPC when the target
 * already holds MAX_EXTRADATA_ITEMS extradata entries. A write that does
 * not change the current state succeeds without side effects. Returns
 * strnlen(buf, count) on success.
 */
static ssize_t sysdata_cpu_nr_enabled_store(struct config_item *item,
                                            const char *buf, size_t count)
{
        struct netconsole_target *nt = to_target(item->ci_parent);
        bool want, have;
        ssize_t ret;

        ret = kstrtobool(buf, &want);
        if (ret)
                return ret;

        mutex_lock(&dynamic_netconsole_mutex);

        ret = strnlen(buf, count);
        have = (nt->sysdata_fields & SYSDATA_CPU_NR) != 0;
        if (want != have) {
                if (!want) {
                        /* This is special because extradata_complete might
                         * have remaining data from previous sysdata, and it
                         * needs to be cleaned.
                         */
                        disable_sysdata_feature(nt, SYSDATA_CPU_NR);
                } else if (count_extradata_entries(nt) >= MAX_EXTRADATA_ITEMS) {
                        /* user wants the new feature, but there is no space
                         * in the buffer.
                         */
                        ret = -ENOSPC;
                } else {
                        nt->sysdata_fields |= SYSDATA_CPU_NR;
                }
        }

        mutex_unlock(&dynamic_netconsole_mutex);
        return ret;
}

/* Declare the configfs attributes used by the userdata tables in this file:
 * userdatum_attr_value for each userdatum item, and
 * sysdata_attr_{cpu_nr,taskname,release}_enabled for the userdata group.
 */
CONFIGFS_ATTR(userdatum_, value);
CONFIGFS_ATTR(sysdata_, cpu_nr_enabled);
CONFIGFS_ATTR(sysdata_, taskname_enabled);
CONFIGFS_ATTR(sysdata_, release_enabled);

/* NULL-terminated attribute list of a single userdatum item. */
static struct configfs_attribute *userdatum_attrs[] = {
        &userdatum_attr_value,
        NULL,
};

/* configfs release callback: free the userdatum that embeds @item. */
static void userdatum_release(struct config_item *item)
{
        struct userdatum *udm = to_userdatum(item);

        kfree(udm);
}

/* Item operations for userdatum items: only a release callback. */
static struct configfs_item_operations userdatum_ops = {
        .release = userdatum_release,
};

/* configfs item type assigned to every userdatum by userdatum_make_item(). */
static const struct config_item_type userdatum_type = {
        .ct_item_ops        = &userdatum_ops,
        .ct_attrs        = userdatum_attrs,
        .ct_owner        = THIS_MODULE,
};

/* configfs make_item callback for the userdata group.
 *
 * Allocates a zeroed userdatum named @name. Fails with -ENAMETOOLONG when
 * the name exceeds MAX_EXTRADATA_NAME_LEN, -ENOSPC when the owning target
 * already has MAX_EXTRADATA_ITEMS extradata entries, and -ENOMEM when the
 * allocation fails. Errors are returned as ERR_PTR() values.
 */
static struct config_item *userdatum_make_item(struct config_group *group,
                                               const char *name)
{
        struct netconsole_target *target;
        struct userdatum *entry;

        if (strlen(name) > MAX_EXTRADATA_NAME_LEN)
                return ERR_PTR(-ENAMETOOLONG);

        target = userdata_to_target(to_userdata(&group->cg_item));
        if (count_extradata_entries(target) >= MAX_EXTRADATA_ITEMS)
                return ERR_PTR(-ENOSPC);

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return ERR_PTR(-ENOMEM);

        config_item_init_type_name(&entry->item, name, &userdatum_type);
        return &entry->item;
}

/* configfs drop_item callback for the userdata group.
 *
 * Rebuilds the owning target's userdata string and then drops the reference
 * on @item, both under dynamic_netconsole_mutex.
 */
static void userdatum_drop(struct config_group *group, struct config_item *item)
{
        struct netconsole_target *target;

        target = userdata_to_target(to_userdata(&group->cg_item));

        mutex_lock(&dynamic_netconsole_mutex);
        update_userdata(target);
        config_item_put(item);
        mutex_unlock(&dynamic_netconsole_mutex);
}

/* NULL-terminated attribute list of the userdata group itself: the
 * sysdata feature toggles.
 */
static struct configfs_attribute *userdata_attrs[] = {
        &sysdata_attr_cpu_nr_enabled,
        &sysdata_attr_taskname_enabled,
        &sysdata_attr_release_enabled,
        NULL,
};

/* Group operations of the userdata group: create and drop userdatum items. */
static struct configfs_group_operations userdata_ops = {
        .make_item                = userdatum_make_item,
        .drop_item                = userdatum_drop,
};

/* configfs type of the per-target "userdata" group (see
 * init_target_config_group()).
 *
 * NOTE(review): ct_item_ops reuses userdatum_ops, whose release callback
 * kfree()s a struct userdatum, while this group is a member of
 * struct netconsole_target - confirm the release callback is never invoked
 * for the group itself.
 */
static const struct config_item_type userdata_type = {
        .ct_item_ops        = &userdatum_ops,
        .ct_group_ops        = &userdata_ops,
        .ct_attrs        = userdata_attrs,
        .ct_owner        = THIS_MODULE,
};

/* Declare the per-target configfs attributes (attr_<name>) that are
 * collected in netconsole_target_attrs[]. local_mac and transmit_errors
 * are declared with the read-only variant of the macro.
 */
CONFIGFS_ATTR(, enabled);
CONFIGFS_ATTR(, extended);
CONFIGFS_ATTR(, dev_name);
CONFIGFS_ATTR(, local_port);
CONFIGFS_ATTR(, remote_port);
CONFIGFS_ATTR(, local_ip);
CONFIGFS_ATTR(, remote_ip);
CONFIGFS_ATTR_RO(, local_mac);
CONFIGFS_ATTR(, remote_mac);
CONFIGFS_ATTR(, release);
CONFIGFS_ATTR_RO(, transmit_errors);

/* NULL-terminated attribute list of a netconsole target group. */
static struct configfs_attribute *netconsole_target_attrs[] = {
        &attr_enabled,
        &attr_extended,
        &attr_release,
        &attr_dev_name,
        &attr_local_port,
        &attr_remote_port,
        &attr_local_ip,
        &attr_remote_ip,
        &attr_local_mac,
        &attr_remote_mac,
        &attr_transmit_errors,
        NULL,
};

/*
 * Item operations and type for netconsole_target.
 */

/* configfs release callback: free the netconsole target owning @item. */
static void netconsole_target_release(struct config_item *item)
{
        struct netconsole_target *target = to_target(item);

        kfree(target);
}

/* Item operations for target groups: only a release callback. */
static struct configfs_item_operations netconsole_target_item_ops = {
        .release                = netconsole_target_release,
};

/* configfs type assigned to each target group by
 * init_target_config_group().
 */
static const struct config_item_type netconsole_target_type = {
        .ct_attrs                = netconsole_target_attrs,
        .ct_item_ops                = &netconsole_target_item_ops,
        .ct_owner                = THIS_MODULE,
};

static void init_target_config_group(struct netconsole_target *nt,
                                     const char *name)
{
        config_group_init_type_name(&nt->group, name, &netconsole_target_type);
        config_group_init_type_name(&nt->userdata_group, "userdata",
                                    &userdata_type);
        configfs_add_default_group(&nt->userdata_group, &nt->group);
}

/* Search target_list, under target_list_lock, for a target whose configfs
 * item name equals @name. Returns the first match or NULL.
 */
static struct netconsole_target *find_cmdline_target(const char *name)
{
        struct netconsole_target *cur, *found = NULL;
        unsigned long flags;

        spin_lock_irqsave(&target_list_lock, flags);
        list_for_each_entry(cur, &target_list, list) {
                if (strcmp(cur->group.cg_item.ci_name, name) != 0)
                        continue;

                found = cur;
                break;
        }
        spin_unlock_irqrestore(&target_list_lock, flags);

        return found;
}

/*
 * Group operations and type for netconsole_subsys.
 */

static struct config_group *make_netconsole_target(struct config_group *group,
                                                   const char *name)
{
        struct netconsole_target *nt;
        unsigned long flags;

        /* Checking if a target by this name was created at boot time.  If so,
         * attach a configfs entry to that target.  This enables dynamic
         * control.
         */
        if (!strncmp(name, NETCONSOLE_PARAM_TARGET_PREFIX,
                     strlen(NETCONSOLE_PARAM_TARGET_PREFIX))) {
                nt = find_cmdline_target(name);
                if (nt) {
                        init_target_config_group(nt, name);
                        return &nt->group;
                }
        }

        nt = alloc_and_init();
        if (!nt)
                return ERR_PTR(-ENOMEM);

        /* Initialize the config_group member */
        init_target_config_group(nt, name);

        /* Adding, but it is disabled */
        spin_lock_irqsave(&target_list_lock, flags);
        list_add(&nt->list, &target_list);
        spin_unlock_irqrestore(&target_list_lock, flags);

        return &nt->group;
}

/* configfs drop_item callback of the netconsole subsystem.
 *
 * Unlinks the target from its list under target_list_lock, cleans up
 * netpoll if the target is still enabled, and drops the reference on the
 * target's config item.
 */
static void drop_netconsole_target(struct config_group *group,
                                   struct config_item *item)
{
        struct netconsole_target *target = to_target(item);
        unsigned long flags;

        spin_lock_irqsave(&target_list_lock, flags);
        list_del(&target->list);
        spin_unlock_irqrestore(&target_list_lock, flags);

        /*
         * The target may have never been enabled, or was manually disabled
         * before being removed so netpoll may have already been cleaned up.
         */
        if (target->enabled)
                netpoll_cleanup(&target->np);

        config_item_put(&target->group.cg_item);
}

/* Group operations of the subsystem root: create and drop targets. */
static struct configfs_group_operations netconsole_subsys_group_ops = {
        .make_group        = make_netconsole_target,
        .drop_item        = drop_netconsole_target,
};

/* configfs type of the subsystem root group; it has group operations only,
 * no attributes of its own.
 */
static const struct config_item_type netconsole_subsys_type = {
        .ct_group_ops        = &netconsole_subsys_group_ops,
        .ct_owner        = THIS_MODULE,
};

/* The netconsole configfs subsystem; its root group is named "netconsole"
 * and uses netconsole_subsys_type.
 */
static struct configfs_subsystem netconsole_subsys = {
        .su_group        = {
                .cg_item        = {
                        .ci_namebuf        = "netconsole",
                        .ci_type        = &netconsole_subsys_type,
                },
        },
};

/* Initialise the configfs groups of a target using the name
 * "<NETCONSOLE_PARAM_TARGET_PREFIX><cmdline_count>".
 */
static void populate_configfs_item(struct netconsole_target *nt,
                                   int cmdline_count)
{
        char name[16];

        snprintf(name, sizeof(name), "%s%d", NETCONSOLE_PARAM_TARGET_PREFIX,
                 cmdline_count);
        init_target_config_group(nt, name);
}

/* Write " cpu=<n>\n" into extradata_complete at @offset, bounded by one
 * entry length. Returns the number of characters written.
 */
static int sysdata_append_cpu_nr(struct netconsole_target *nt, int offset)
{
        char *dst = &nt->extradata_complete[offset];

        /* Append cpu=%d at extradata_complete after userdata str */
        return scnprintf(dst, MAX_EXTRADATA_ENTRY_LEN, " cpu=%u\n",
                         raw_smp_processor_id());
}

/* Write " taskname=<comm>\n" for the current task into extradata_complete
 * at @offset, bounded by one entry length. Returns the number of characters
 * written.
 */
static int sysdata_append_taskname(struct netconsole_target *nt, int offset)
{
        char *dst = &nt->extradata_complete[offset];

        return scnprintf(dst, MAX_EXTRADATA_ENTRY_LEN, " taskname=%s\n",
                         current->comm);
}

/* Write " release=<utsname release>\n" into extradata_complete at @offset,
 * bounded by one entry length. Returns the number of characters written.
 */
static int sysdata_append_release(struct netconsole_target *nt, int offset)
{
        char *dst = &nt->extradata_complete[offset];

        return scnprintf(dst, MAX_EXTRADATA_ENTRY_LEN, " release=%s\n",
                         init_utsname()->release);
}

/*
 * prepare_extradata - append sysdata at extradata_complete in runtime
 * @nt: target to send message to
 *
 * The userdata part of nt->extradata_complete is already in place (written
 * by update_userdata()); every enabled sysdata field is appended right
 * after it.
 *
 * Return: total extradata length (userdata plus the sysdata appended here).
 */
static int prepare_extradata(struct netconsole_target *nt)
{
        int extradata_len;

        /* userdata was appended when configfs write helper was called
         * by update_userdata().
         */
        extradata_len = nt->userdata_length;

        /* Nothing to append when no sysdata feature is enabled. The whole
         * bitmap is tested instead of a hand-maintained mask so that a
         * target with only SYSDATA_RELEASE enabled is not skipped.
         */
        if (!nt->sysdata_fields)
                goto out;

        if (nt->sysdata_fields & SYSDATA_CPU_NR)
                extradata_len += sysdata_append_cpu_nr(nt, extradata_len);
        if (nt->sysdata_fields & SYSDATA_TASKNAME)
                extradata_len += sysdata_append_taskname(nt, extradata_len);
        if (nt->sysdata_fields & SYSDATA_RELEASE)
                extradata_len += sysdata_append_release(nt, extradata_len);

        WARN_ON_ONCE(extradata_len >
                     MAX_EXTRADATA_ENTRY_LEN * MAX_EXTRADATA_ITEMS);

out:
        return extradata_len;
}
#else /* CONFIG_NETCONSOLE_DYNAMIC not set */
/* Variant used when CONFIG_NETCONSOLE_DYNAMIC is not set: there is no
 * extradata, so the reported length is always 0.
 */
static int prepare_extradata(struct netconsole_target *nt)
{
        return 0;
}
#endif        /* CONFIG_NETCONSOLE_DYNAMIC */

/* Handle network interface device notifications.
 *
 * Only NETDEV_CHANGENAME, NETDEV_UNREGISTER, NETDEV_RELEASE and NETDEV_JOIN
 * are acted upon. For every target bound to the notifying device, a rename
 * refreshes the stored device name; the other three events disable the
 * target and move it to target_cleanup_list, which is processed before
 * returning.
 *
 * Always returns NOTIFY_DONE.
 */
static int netconsole_netdev_event(struct notifier_block *this,
                                   unsigned long event, void *ptr)
{
        unsigned long flags;
        struct netconsole_target *nt, *tmp;
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        bool stopped = false;

        if (!(event == NETDEV_CHANGENAME || event == NETDEV_UNREGISTER ||
              event == NETDEV_RELEASE || event == NETDEV_JOIN))
                goto done;

        /* The cleanup-list mutex is taken first, then the target list
         * spinlock; targets may be moved between both lists below.
         */
        mutex_lock(&target_cleanup_list_lock);
        spin_lock_irqsave(&target_list_lock, flags);
        /* _safe iteration: the current entry may be moved off target_list */
        list_for_each_entry_safe(nt, tmp, &target_list, list) {
                netconsole_target_get(nt);
                if (nt->np.dev == dev) {
                        switch (event) {
                        case NETDEV_CHANGENAME:
                                strscpy(nt->np.dev_name, dev->name, IFNAMSIZ);
                                break;
                        case NETDEV_RELEASE:
                        case NETDEV_JOIN:
                        case NETDEV_UNREGISTER:
                                /* Stop logging and queue the target for
                                 * cleanup.
                                 */
                                nt->enabled = false;
                                list_move(&nt->list, &target_cleanup_list);
                                stopped = true;
                        }
                }
                netconsole_target_put(nt);
        }
        spin_unlock_irqrestore(&target_list_lock, flags);
        mutex_unlock(&target_cleanup_list_lock);

        if (stopped) {
                /* Fallback text; replaced below for the known events */
                const char *msg = "had an event";

                switch (event) {
                case NETDEV_UNREGISTER:
                        msg = "unregistered";
                        break;
                case NETDEV_RELEASE:
                        msg = "released slaves";
                        break;
                case NETDEV_JOIN:
                        msg = "is joining a master device";
                        break;
                }
                pr_info("network logging stopped on interface %s as it %s\n",
                        dev->name, msg);
        }

        /* Process target_cleanup_list entries. By the end, target_cleanup_list
         * should be empty
         */
        netconsole_process_cleanups_core();

done:
        return NOTIFY_DONE;
}

/* Notifier block that routes events to netconsole_netdev_event(). */
static struct notifier_block netconsole_netdev_notifier = {
        .notifier_call  = netconsole_netdev_event,
};

/**
 * send_udp - Wrapper for netpoll_send_udp that counts errors
 * @nt: target to send message to
 * @msg: message to send
 * @len: length of message
 *
 * Calls netpoll_send_udp and classifies the return value. If the result is
 * NET_XMIT_DROP or -ENOMEM, the matching counter in nt->stats is
 * incremented. When CONFIG_NETCONSOLE_DYNAMIC is disabled, this only calls
 * netpoll_send_udp and no statistics are updated.
 */
static void send_udp(struct netconsole_target *nt, const char *msg, int len)
{
        int result = netpoll_send_udp(&nt->np, msg, len);

        if (!IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC))
                return;

        switch (result) {
        case NET_XMIT_DROP:
                u64_stats_update_begin(&nt->stats.syncp);
                u64_stats_inc(&nt->stats.xmit_drop_count);
                u64_stats_update_end(&nt->stats.syncp);
                break;
        case -ENOMEM:
                u64_stats_update_begin(&nt->stats.syncp);
                u64_stats_inc(&nt->stats.enomem_count);
                u64_stats_update_end(&nt->stats.syncp);
                break;
        default:
                break;
        }
}

/* Send @msg to @nt as a single packet.
 *
 * The packet is assembled in nt->buf: an optional "<release>," prefix (when
 * @release_len is non-zero), the message itself and, with
 * CONFIG_NETCONSOLE_DYNAMIC, the target's extradata string.
 */
static void send_msg_no_fragmentation(struct netconsole_target *nt,
                                      const char *msg,
                                      int msg_len,
                                      int release_len)
{
        const char *extradata = NULL;

#ifdef CONFIG_NETCONSOLE_DYNAMIC
        extradata = nt->extradata_complete;
#endif

        if (!release_len) {
                memcpy(nt->buf, msg, msg_len);
        } else {
                scnprintf(nt->buf, MAX_PRINT_CHUNK, "%s,%s",
                          init_utsname()->release, msg);
                msg_len += release_len;
        }

        /* Extradata goes right after the message, bounded by what is left
         * of the buffer.
         */
        if (extradata)
                msg_len += scnprintf(&nt->buf[msg_len],
                                     MAX_PRINT_CHUNK - msg_len,
                                     "%s", extradata);

        send_udp(nt, nt->buf, msg_len);
}

/* Write "<utsname release>," at the start of @buf, bounded by
 * MAX_PRINT_CHUNK.
 */
static void append_release(char *buf)
{
        scnprintf(buf, MAX_PRINT_CHUNK, "%s,", init_utsname()->release);
}

/* Send a message body, followed by the target's extradata, as a sequence
 * of packets.
 *
 * @nt: target to send to; nt->buf already holds @header_len header bytes
 * @msgbody: message body (the part after the header)
 * @header_len: number of header bytes already present in nt->buf
 * @msgbody_len: length of @msgbody
 * @extradata_len: length of the extradata to send after the body
 *
 * Each packet carries the header, an ",ncfrag=<offset>/<total>;" field and
 * as much of the remaining body/extradata as fits in MAX_PRINT_CHUNK.
 */
static void send_fragmented_body(struct netconsole_target *nt,
                                 const char *msgbody, int header_len,
                                 int msgbody_len, int extradata_len)
{
        int sent_extradata, preceding_bytes;
        const char *extradata = NULL;
        int body_len, offset = 0;

#ifdef CONFIG_NETCONSOLE_DYNAMIC
        extradata = nt->extradata_complete;
#endif

        /* body_len represents the number of bytes that will be sent. This is
         * bigger than MAX_PRINT_CHUNK, thus, it will be split in multiple
         * packets
         */
        body_len = msgbody_len + extradata_len;

        /* In each iteration of the while loop below, we send a packet
         * containing the header and a portion of the body. The body is
         * composed of two parts: msgbody and extradata. We keep track of how
         * many bytes have been sent so far using the offset variable, which
         * ranges from 0 to the total length of the body.
         */
        while (offset < body_len) {
                int this_header = header_len;
                bool msgbody_written = false;
                int this_offset = 0;
                int this_chunk = 0;

                /* Per-packet fragment field, written right after the
                 * persistent header.
                 */
                this_header += scnprintf(nt->buf + this_header,
                                         MAX_PRINT_CHUNK - this_header,
                                         ",ncfrag=%d/%d;", offset,
                                         body_len);

                /* Not all msgbody data has been written yet */
                if (offset < msgbody_len) {
                        this_chunk = min(msgbody_len - offset,
                                         MAX_PRINT_CHUNK - this_header);
                        if (WARN_ON_ONCE(this_chunk <= 0))
                                return;
                        memcpy(nt->buf + this_header, msgbody + offset,
                               this_chunk);
                        this_offset += this_chunk;
                }

                /* msgbody was finally written, either in the previous
                 * messages and/or in the current buf. Time to write
                 * the extradata.
                 */
                msgbody_written |= offset + this_offset >= msgbody_len;

                /* Msg body is fully written and there is pending extradata to
                 * write, append extradata in this chunk
                 */
                if (msgbody_written && offset + this_offset < body_len) {
                        /* Track how much extradata was already sent. First
                         * time here, sent_extradata is zero
                         */
                        sent_extradata = (offset + this_offset) - msgbody_len;
                        /* offset of bytes used in current buf */
                        preceding_bytes = this_chunk + this_header;

                        if (WARN_ON_ONCE(sent_extradata < 0))
                                return;

                        this_chunk = min(extradata_len - sent_extradata,
                                         MAX_PRINT_CHUNK - preceding_bytes);
                        if (WARN_ON_ONCE(this_chunk < 0))
                                /* this_chunk could be zero if all the previous
                                 * message used all the buffer. This is not a
                                 * problem, extradata will be sent in the next
                                 * iteration
                                 */
                                return;

                        memcpy(nt->buf + this_header + this_offset,
                               extradata + sent_extradata,
                               this_chunk);
                        this_offset += this_chunk;
                }

                send_udp(nt, nt->buf, this_header + this_offset);
                offset += this_offset;
        }
}

/* Send @msg to @nt in multiple packets.
 *
 * The message is split at its first ';' into header and body. The header,
 * optionally preceded by "<release>,", is placed in nt->buf once and the
 * body plus extradata are then sent by send_fragmented_body(). Nothing is
 * sent (after a one-time warning) if @msg has no ';'.
 */
static void send_msg_fragmented(struct netconsole_target *nt,
                                const char *msg,
                                int msg_len,
                                int release_len,
                                int extradata_len)
{
        int header_len, msgbody_len;
        const char *sep;

        /* need to insert extra header fields, detect header and msgbody */
        sep = memchr(msg, ';', msg_len);
        if (WARN_ON_ONCE(!sep))
                return;

        header_len = sep - msg;
        /* The separator itself belongs to neither header nor body */
        msgbody_len = msg_len - header_len - 1;

        /*
         * Transfer multiple chunks with the following extra header.
         * "ncfrag=<byte-offset>/<total-bytes>"
         */
        if (release_len)
                append_release(nt->buf);

        /* Copy the header into the buffer */
        memcpy(nt->buf + release_len, msg, header_len);

        /* for now on, the header will be persisted, and the msgbody
         * will be replaced
         */
        send_fragmented_body(nt, sep + 1, header_len + release_len,
                             msgbody_len, extradata_len);
}

/**
 * send_ext_msg_udp - send extended log message to target
 * @nt: target to send message to
 * @msg: extended log message to send
 * @msg_len: length of message
 *
 * Transfer extended log @msg to @nt.  If @msg, together with the optional
 * release prefix and the extradata, is longer than MAX_PRINT_CHUNK, it'll
 * be split and transmitted in multiple chunks with ncfrag header field
 * added to identify them.
 */
static void send_ext_msg_udp(struct netconsole_target *nt, const char *msg,
                             int msg_len)
{
        int extradata_len = prepare_extradata(nt);
        int release_len = 0;

        /* The prefix is the release string plus the ',' separator */
        if (nt->release)
                release_len = strlen(init_utsname()->release) + 1;

        if (msg_len + release_len + extradata_len > MAX_PRINT_CHUNK) {
                send_msg_fragmented(nt, msg, msg_len, release_len,
                                    extradata_len);
                return;
        }

        send_msg_no_fragmentation(nt, msg, msg_len, release_len);
}

static void write_ext_msg(struct console *con, const char *msg,
                          unsigned int len)
{
        struct netconsole_target *nt;
        unsigned long flags;

        if ((oops_only && !oops_in_progress) || list_empty(&target_list))
                return;

        spin_lock_irqsave(&target_list_lock, flags);
        list_for_each_entry(nt, &target_list, list)
                if (nt->extended && nt->enabled && netif_running(nt->np.dev))
                        send_ext_msg_udp(nt, msg, len);
        spin_unlock_irqrestore(&target_list_lock, flags);
}

/*
 * Send @msg (@len bytes) to every non-extended, enabled target whose
 * network device is running, cut into pieces of at most MAX_PRINT_CHUNK
 * bytes. Nothing is sent while oops_only is set and no oops is in progress.
 */
static void write_msg(struct console *con, const char *msg, unsigned int len)
{
        struct netconsole_target *target;
        unsigned long irq_flags;
        const char *cursor;
        int remaining, chunk;

        if (oops_only && !oops_in_progress)
                return;
        /* Avoid taking lock and disabling interrupts unnecessarily */
        if (list_empty(&target_list))
                return;

        spin_lock_irqsave(&target_list_lock, irq_flags);
        list_for_each_entry(target, &target_list, list) {
                if (target->extended || !target->enabled ||
                    !netif_running(target->np.dev))
                        continue;

                /*
                 * The chunking loop sits inside the per-target loop so
                 * that as much logging as possible reaches at least one
                 * target if we die in here, instead of unnecessarily
                 * keeping all targets in lock-step.
                 */
                cursor = msg;
                remaining = len;
                while (remaining) {
                        chunk = min(remaining, MAX_PRINT_CHUNK);
                        send_udp(target, cursor, chunk);
                        cursor += chunk;
                        remaining -= chunk;
                }
        }
        spin_unlock_irqrestore(&target_list_lock, irq_flags);
}

/* Allocate new target (from boot/module param) and setup netpoll for it */
static struct netconsole_target *alloc_param_target(char *target_config,
                                                    int cmdline_count)
{
        struct netconsole_target *nt;
        int err;

        nt = alloc_and_init();
        if (!nt) {
                err = -ENOMEM;
                goto fail;
        }

        if (*target_config == '+') {
                nt->extended = true;
                target_config++;
        }

        if (*target_config == 'r') {
                if (!nt->extended) {
                        pr_err("Netconsole configuration error. Release feature requires extended log message");
                        err = -EINVAL;
                        goto fail;
                }
                nt->release = true;
                target_config++;
        }

        /* Parse parameters and setup netpoll */
        err = netpoll_parse_options(&nt->np, target_config);
        if (err)
                goto fail;

        err = netpoll_setup(&nt->np);
        if (err) {
                pr_err("Not enabling netconsole for %s%d. Netpoll setup failed\n",
                       NETCONSOLE_PARAM_TARGET_PREFIX, cmdline_count);
                if (!IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC))
                        /* only fail if dynamic reconfiguration is set,
                         * otherwise, keep the target in the list, but disabled.
                         */
                        goto fail;
        } else {
                nt->enabled = true;
        }
        populate_configfs_item(nt, cmdline_count);

        return nt;

fail:
        kfree(nt);
        return ERR_PTR(err);
}

/*
 * Cleanup netpoll for given target (from boot/module param) and free it.
 *
 * netpoll_cleanup() runs on the embedded nt->np before the containing
 * structure is released, so @nt must not be used after this returns.
 */
static void free_param_target(struct netconsole_target *nt)
{
        netpoll_cleanup(&nt->np);
        kfree(nt);
}

/*
 * Console for extended-format targets: flagged CON_EXTENDED and served by
 * write_ext_msg(). init_netconsole() registers it only when at least one
 * extended target was created from the boot/module parameter.
 */
static struct console netconsole_ext = {
        .name        = "netcon_ext",
        .flags        = CON_ENABLED | CON_EXTENDED,
        .write        = write_ext_msg,
};

/*
 * Console for non-extended targets, served by write_msg().
 * init_netconsole() always registers it.
 */
static struct console netconsole = {
        .name        = "netcon",
        .flags        = CON_ENABLED,
        .write        = write_msg,
};

/*
 * Create one target per ';'-separated entry of the config string, register
 * the netdevice notifier, run dynamic_netconsole_init() and finally
 * register the console(s).
 *
 * Returns 0 on success or an error code; on failure every target that was
 * already put on target_list is removed and freed again.
 */
static int __init init_netconsole(void)
{
        struct netconsole_target *nt, *next;
        unsigned int nr_targets = 0;
        bool have_extended = false;
        unsigned long irq_flags;
        char *remaining = config;
        char *target_config;
        int err;

        if (strnlen(remaining, MAX_PARAM_LENGTH)) {
                for (target_config = strsep(&remaining, ";"); target_config;
                     target_config = strsep(&remaining, ";")) {
                        nt = alloc_param_target(target_config, nr_targets);
                        if (IS_ERR(nt)) {
                                /*
                                 * Without dynamic reconfiguration a broken
                                 * entry is fatal; with it, the entry is
                                 * simply skipped.
                                 */
                                if (!IS_ENABLED(CONFIG_NETCONSOLE_DYNAMIC)) {
                                        err = PTR_ERR(nt);
                                        goto fail;
                                }
                                continue;
                        }

                        /* Dump existing printks when we register */
                        if (nt->extended) {
                                netconsole_ext.flags |= CON_PRINTBUFFER;
                                have_extended = true;
                        } else {
                                netconsole.flags |= CON_PRINTBUFFER;
                        }

                        spin_lock_irqsave(&target_list_lock, irq_flags);
                        list_add(&nt->list, &target_list);
                        spin_unlock_irqrestore(&target_list_lock, irq_flags);
                        nr_targets++;
                }
        }

        err = register_netdevice_notifier(&netconsole_netdev_notifier);
        if (err)
                goto fail;

        err = dynamic_netconsole_init();
        if (err)
                goto undonotifier;

        /* The extended console is only needed if an extended target exists */
        if (have_extended)
                register_console(&netconsole_ext);
        register_console(&netconsole);
        pr_info("network logging started\n");

        return 0;

undonotifier:
        unregister_netdevice_notifier(&netconsole_netdev_notifier);

fail:
        pr_err("cleaning up\n");

        /*
         * Only targets created from the boot/module option exist at this
         * point; unlink and destroy all of them. The list lock is skipped:
         * that is safe here, and netpoll_cleanup() will sleep.
         */
        list_for_each_entry_safe(nt, next, &target_list, list) {
                list_del(&nt->list);
                free_param_target(nt);
        }

        return err;
}

/*
 * Module exit: unregister the console(s), run dynamic_netconsole_exit(),
 * drop the netdevice notifier and destroy the remaining targets.
 */
static void __exit cleanup_netconsole(void)
{
        struct netconsole_target *nt, *next;

        /* The extended console may never have been registered */
        if (console_is_registered(&netconsole_ext)) {
                unregister_console(&netconsole_ext);
        }
        unregister_console(&netconsole);
        dynamic_netconsole_exit();
        unregister_netdevice_notifier(&netconsole_netdev_notifier);

        /*
         * Targets created via configfs pin references on our module and
         * would first be rmdir(2)'ed from userspace, so by the time we get
         * here they are already destroyed. Only the targets created from
         * the boot/module option are left: unlink and destroy them. The
         * list lock is skipped: that is safe here, and netpoll_cleanup()
         * will sleep.
         */
        list_for_each_entry_safe(nt, next, &target_list, list) {
                list_del(&nt->list);
                free_param_target(nt);
        }
}

/*
 * Use late_initcall() so that, when built-in, netconsole is initialized
 * after the network device drivers.
 *
 * When built as a module, late_initcall() and module_init() are identical.
 */
late_initcall(init_netconsole);
module_exit(cleanup_netconsole);


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
// SPDX-License-Identifier: GPL-2.0
/*
 * device.h - generic, centralized driver model
 *
 * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2008-2009 Novell Inc.
 *
 * See Documentation/driver-api/driver-model/ for more information.
 */

#ifndef _DEVICE_H_
#define _DEVICE_H_

#include <linux/dev_printk.h>
#include <linux/energy_model.h>
#include <linux/ioport.h>
#include <linux/kobject.h>
#include <linux/klist.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/pm.h>
#include <linux/atomic.h>
#include <linux/uidgid.h>
#include <linux/gfp.h>
#include <linux/device/bus.h>
#include <linux/device/class.h>
#include <linux/device/devres.h>
#include <linux/device/driver.h>
#include <linux/cleanup.h>
#include <asm/device.h>

struct device;
struct device_private;
struct device_driver;
struct driver_private;
struct module;
struct class;
struct subsys_private;
struct device_node;
struct fwnode_handle;
struct iommu_group;
struct dev_pin_info;
struct dev_iommu;
struct msi_device_data;

/**
 * struct subsys_interface - interfaces to device functions
 * @name:       name of the device function
 * @subsys:     subsystem of the devices to attach to
 * @node:       the list of functions registered at the subsystem
 * @add_dev:    device hookup to device function handler
 * @remove_dev: counterpart of @add_dev; returns nothing
 *              (presumably called when a device is detached from the
 *              function handler - confirm against the bus core)
 *
 * Simple interfaces attached to a subsystem. Multiple interfaces can
 * attach to a subsystem and its devices. Unlike drivers, they do not
 * exclusively claim or control devices. Interfaces usually represent
 * a specific functionality of a subsystem/class of devices.
 */
struct subsys_interface {
        const char *name;
        const struct bus_type *subsys;
        struct list_head node;
        int (*add_dev)(struct device *dev, struct subsys_interface *sif);
        void (*remove_dev)(struct device *dev, struct subsys_interface *sif);
};

int subsys_interface_register(struct subsys_interface *sif);
void subsys_interface_unregister(struct subsys_interface *sif);

int subsys_system_register(const struct bus_type *subsys,
                           const struct attribute_group **groups);
int subsys_virtual_register(const struct bus_type *subsys,
                            const struct attribute_group **groups);

/*
 * The type of device that "struct device" is embedded in. A class
 * or bus can contain devices of different types,
 * like "partitions" and "disks", or "mouse" and "event".
 * This identifies the device type and carries type-specific
 * information, equivalent to the kobj_type of a kobject.
 * If "name" is specified, the uevent will contain it in
 * the DEVTYPE variable.
 */
struct device_type {
        const char *name;
        const struct attribute_group **groups;
        int (*uevent)(const struct device *dev, struct kobj_uevent_env *env);
        char *(*devnode)(const struct device *dev, umode_t *mode,
                         kuid_t *uid, kgid_t *gid);
        void (*release)(struct device *dev);

        const struct dev_pm_ops *pm;
};

/**
 * struct device_attribute - Interface for exporting device attributes.
 * @attr: sysfs attribute definition.
 * @show: Show handler. Receives the device, this attribute and the output
 *        buffer @buf; returns a ssize_t.
 * @store: Store handler. Receives the device, this attribute and @count
 *         bytes of input in @buf; returns a ssize_t.
 */
struct device_attribute {
        struct attribute        attr;
        ssize_t (*show)(struct device *dev, struct device_attribute *attr,
                        char *buf);
        ssize_t (*store)(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
};

/**
 * struct dev_ext_attribute - Exported device attribute with extra context.
 * @attr: Exported device attribute.
 * @var: Pointer to context. The DEVICE_ULONG_ATTR(), DEVICE_INT_ATTR() and
 *       DEVICE_BOOL_ATTR() helpers store the address of their backing
 *       variable here; DEVICE_STRING_ATTR_RO() stores the string itself.
 */
struct dev_ext_attribute {
        struct device_attribute attr;
        void *var;
};

ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr,
                          char *buf);
ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr,
                           const char *buf, size_t count);
ssize_t device_show_int(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_int(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_bool(struct device *dev, struct device_attribute *attr,
                        char *buf);
ssize_t device_store_bool(struct device *dev, struct device_attribute *attr,
                         const char *buf, size_t count);
ssize_t device_show_string(struct device *dev, struct device_attribute *attr,
                           char *buf);

/**
 * DEVICE_ATTR - Define a device attribute.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler. Optional, but mandatory if attribute is readable.
 * @_store: Store handler. Optional, but mandatory if attribute is writable.
 *
 * Convenience macro for defining a struct device_attribute. The defined
 * variable is named ``dev_attr_<_name>``.
 *
 * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to:
 *
 * .. code-block:: c
 *
 *        struct device_attribute dev_attr_foo = {
 *                .attr        = { .name = "foo", .mode = 0644 },
 *                .show        = foo_show,
 *                .store        = foo_store,
 *        };
 */
#define DEVICE_ATTR(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store)

/**
 * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_show: Show handler. Optional, but mandatory if attribute is readable.
 * @_store: Store handler. Optional, but mandatory if attribute is writable.
 *
 * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. The
 * initializer is built with __ATTR_PREALLOC() instead of __ATTR().
 */
#define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name = \
                __ATTR_PREALLOC(_name, _mode, _show, _store)

/**
 * DEVICE_ATTR_RW - Define a read-write device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show,
 * and @_store is <_name>_store. Both handlers therefore have to be
 * declared before the macro is used.
 */
#define DEVICE_ATTR_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW(_name)

/**
 * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR_RW(), but @_mode is 0600 (passed explicitly to
 * __ATTR_RW_MODE()).
 */
#define DEVICE_ATTR_ADMIN_RW(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600)

/**
 * DEVICE_ATTR_RO - Define a readable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show,
 * which has to be declared before the macro is used.
 */
#define DEVICE_ATTR_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO(_name)

/**
 * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR_RO(), but @_mode is 0400 (passed explicitly to
 * __ATTR_RO_MODE()).
 */
#define DEVICE_ATTR_ADMIN_RO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400)

/**
 * DEVICE_ATTR_WO - Define a write-only device attribute.
 * @_name: Attribute name.
 *
 * Like DEVICE_ATTR(), but @_mode is 0200 (writable by the owner only, so
 * effectively admin-only) and @_store is <_name>_store.
 */
#define DEVICE_ATTR_WO(_name) \
        struct device_attribute dev_attr_##_name = __ATTR_WO(_name)

/**
 * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of unsigned long.
 *
 * Like DEVICE_ATTR(), but @_show and @_store are automatically provided
 * such that reads and writes to the attribute from userspace affect @_var.
 *
 * Note that this defines a struct dev_ext_attribute (not a plain
 * struct device_attribute) named ``dev_attr_<_name>``, using
 * device_show_ulong() and device_store_ulong() as handlers and the
 * address of @_var as context.
 */
#define DEVICE_ULONG_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) }

/**
 * DEVICE_INT_ATTR - Define a device attribute backed by an int.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of int.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is an int; the handlers are
 * device_show_int() and device_store_int().
 */
#define DEVICE_INT_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) }

/**
 * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of bool.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is a bool; the handlers are
 * device_show_bool() and device_store_bool().
 */
#define DEVICE_BOOL_ATTR(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) }

/**
 * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string.
 * @_name: Attribute name.
 * @_mode: File mode.
 * @_var: Identifier of string.
 *
 * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the
 * string allocation is unknown, the attribute must be read-only: the write
 * permission bits (0222) are cleared from @_mode and no store handler is
 * installed. Unlike the other helpers, @_var itself (not its address) is
 * stored as the context pointer.
 */
#define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \
        struct dev_ext_attribute dev_attr_##_name = \
                { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) }

/*
 * DEVICE_ATTR_IGNORE_LOCKDEP - Like DEVICE_ATTR(), but the initializer is
 * built with __ATTR_IGNORE_LOCKDEP() instead of __ATTR().
 */
#define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
        struct device_attribute dev_attr_##_name =                \
                __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)

int device_create_file(struct device *device,
                       const struct device_attribute *entry);
void device_remove_file(struct device *dev,
                        const struct device_attribute *attr);
bool device_remove_file_self(struct device *dev,
                             const struct device_attribute *attr);
int __must_check device_create_bin_file(struct device *dev,
                                        const struct bin_attribute *attr);
void device_remove_bin_file(struct device *dev,
                            const struct bin_attribute *attr);

/* allows to add/remove a custom action to devres stack */
int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data);

/**
 * devm_remove_action() - removes previously added custom action
 * @dev: Device that owns the action
 * @action: Function implementing the action
 * @data: Pointer to data passed to @action implementation
 *
 * Removes instance of @action previously added by devm_add_action().
 * Both action and data should match one of the existing entries.
 *
 * Thin wrapper around devm_remove_action_nowarn() that triggers WARN_ON()
 * when that call returns non-zero.
 */
static inline
void devm_remove_action(struct device *dev, void (*action)(void *), void *data)
{
        int ret = devm_remove_action_nowarn(dev, action, data);

        WARN_ON(ret);
}

void devm_release_action(struct device *dev, void (*action)(void *), void *data);

/*
 * devm_add_action() forwards to __devm_add_action(), passing the
 * stringified @action expression (#action) as the @name argument.
 */
int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name);
#define devm_add_action(dev, action, data) \
        __devm_add_action(dev, action, data, #action)

/*
 * Register @action via __devm_add_action(); if that fails, call
 * @action(@data) immediately. Returns the result of __devm_add_action().
 */
static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *),
                                             void *data, const char *name)
{
        int err = __devm_add_action(dev, action, data, name);

        if (!err)
                return 0;

        /* Registration failed: run the action right away */
        action(data);
        return err;
}
/*
 * devm_add_action_or_reset() forwards to __devm_add_action_or_reset(),
 * passing the stringified @action expression (#action) as the name.
 */
#define devm_add_action_or_reset(dev, action, data) \
        __devm_add_action_or_reset(dev, action, data, #action)

/**
 * devm_alloc_percpu - Resource-managed alloc_percpu
 * @dev: Device to allocate per-cpu memory for
 * @type: Type to allocate per-cpu memory for
 *
 * Managed alloc_percpu. Per-cpu memory allocated with this function is
 * automatically freed on driver detach.
 *
 * Expands to a call of __devm_alloc_percpu() with the size and alignment
 * of @type, with the result cast to a ``__percpu`` pointer to @type.
 *
 * RETURNS:
 * Pointer to allocated memory on success, NULL on failure.
 */
#define devm_alloc_percpu(dev, type)      \
        ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \
                                                      __alignof__(type)))

void __percpu *__devm_alloc_percpu(struct device *dev, size_t size,
                                   size_t align);
void devm_free_percpu(struct device *dev, void __percpu *pdata);

/* Per-device DMA parameters that IOMMU code can consult (see below). */
struct device_dma_parameters {
        /*
         * A low-level driver may set these to teach IOMMU code about
         * sg limitations.
         */
        unsigned int max_segment_size;
        unsigned int min_align_mask;
        unsigned long segment_boundary_mask;
};

/**
 * enum device_link_state - Device link states.
 * @DL_STATE_NONE: The presence of the drivers is not being tracked.
 * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present.
 * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not.
 * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present).
 * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present.
 * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding.
 *
 * Every enumerator carries its numeric value explicitly; the only
 * negative one is DL_STATE_NONE.
 */
enum device_link_state {
        DL_STATE_NONE = -1,
        DL_STATE_DORMANT = 0,
        DL_STATE_AVAILABLE = 1,
        DL_STATE_CONSUMER_PROBE = 2,
        DL_STATE_ACTIVE = 3,
        DL_STATE_SUPPLIER_UNBIND = 4,
};

/*
 * Device link flags.
 *
 * STATELESS: The core will not remove this link automatically.
 * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind.
 * PM_RUNTIME: If set, the runtime PM framework will use this link.
 * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation.
 * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind.
 * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds.
 * MANAGED: The core tracks presence of supplier/consumer drivers (internal).
 * SYNC_STATE_ONLY: Link only affects sync_state() behavior.
 * INFERRED: Inferred from data (eg: firmware) and not from driver actions.
 * CYCLE: NOTE(review): not described here; presumably marks a link that is
 *        part of a dependency cycle - confirm against the driver core.
 */
#define DL_FLAG_STATELESS                BIT(0)
#define DL_FLAG_AUTOREMOVE_CONSUMER        BIT(1)
#define DL_FLAG_PM_RUNTIME                BIT(2)
#define DL_FLAG_RPM_ACTIVE                BIT(3)
#define DL_FLAG_AUTOREMOVE_SUPPLIER        BIT(4)
#define DL_FLAG_AUTOPROBE_CONSUMER        BIT(5)
#define DL_FLAG_MANAGED                        BIT(6)
#define DL_FLAG_SYNC_STATE_ONLY                BIT(7)
#define DL_FLAG_INFERRED                BIT(8)
#define DL_FLAG_CYCLE                        BIT(9)

/**
 * enum dl_dev_state - Device driver presence tracking information.
 * @DL_DEV_NO_DRIVER: There is no driver attached to the device.
 * @DL_DEV_PROBING: A driver is probing.
 * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device.
 * @DL_DEV_UNBINDING: The driver is unbinding from the device.
 *
 * Values are consecutive, starting at 0 for DL_DEV_NO_DRIVER.
 */
enum dl_dev_state {
        DL_DEV_NO_DRIVER = 0,
        DL_DEV_PROBING = 1,
        DL_DEV_DRIVER_BOUND = 2,
        DL_DEV_UNBINDING = 3,
};

/**
 * enum device_removable - Whether the device is removable. The criteria for a
 * device to be classified as removable is determined by its subsystem or bus.
 * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this
 *                                    device (default).
 * @DEVICE_REMOVABLE_UNKNOWN:  Device location is Unknown.
 * @DEVICE_FIXED: Device is not removable by the user.
 * @DEVICE_REMOVABLE: Device is removable by the user.
 *
 * Values are consecutive, starting at 0.
 */
enum device_removable {
        /* must be 0 */
        DEVICE_REMOVABLE_NOT_SUPPORTED = 0,
        DEVICE_REMOVABLE_UNKNOWN = 1,
        DEVICE_FIXED = 2,
        DEVICE_REMOVABLE = 3,
};

/**
 * struct dev_links_info - Device data related to device links.
 * @suppliers: List of links to supplier devices.
 * @consumers: List of links to consumer devices.
 * @defer_sync: Hook to global list of devices that have deferred sync_state.
 * @status: Driver status information (an enum dl_dev_state value).
 */
struct dev_links_info {
        struct list_head suppliers;
        struct list_head consumers;
        struct list_head defer_sync;
        enum dl_dev_state status;
};

/**
 * struct dev_msi_info - Device data related to MSI
 * @domain:        The MSI interrupt domain associated to the device
 * @data:        Pointer to MSI device data
 *
 * Both members exist only when CONFIG_GENERIC_MSI_IRQ is defined;
 * otherwise the structure has no members.
 */
struct dev_msi_info {
#ifdef CONFIG_GENERIC_MSI_IRQ
        struct irq_domain        *domain;
        struct msi_device_data        *data;
#endif
};

/**
 * enum device_physical_location_panel - Describes which panel surface of the
 * system's housing the device connection point resides on.
 * @DEVICE_PANEL_TOP: Device connection point is on the top panel.
 * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel.
 * @DEVICE_PANEL_LEFT: Device connection point is on the left panel.
 * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel.
 * @DEVICE_PANEL_FRONT: Device connection point is on the front panel.
 * @DEVICE_PANEL_BACK: Device connection point is on the back panel.
 * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown.
 *
 * Values are consecutive, starting at 0 for DEVICE_PANEL_TOP.
 */
enum device_physical_location_panel {
        DEVICE_PANEL_TOP = 0,
        DEVICE_PANEL_BOTTOM = 1,
        DEVICE_PANEL_LEFT = 2,
        DEVICE_PANEL_RIGHT = 3,
        DEVICE_PANEL_FRONT = 4,
        DEVICE_PANEL_BACK = 5,
        DEVICE_PANEL_UNKNOWN = 6,
};

/**
 * enum device_physical_location_vertical_position - Describes vertical
 * position of the device connection point on the panel surface.
 * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel.
 * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel.
 * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel.
 *
 * Values are consecutive, starting at 0 for DEVICE_VERT_POS_UPPER.
 */
enum device_physical_location_vertical_position {
        DEVICE_VERT_POS_UPPER = 0,
        DEVICE_VERT_POS_CENTER = 1,
        DEVICE_VERT_POS_LOWER = 2,
};

/**
 * enum device_physical_location_horizontal_position - Describes horizontal
 * position of the device connection point on the panel surface.
 * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel.
 * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel.
 * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel.
 *
 * Values are consecutive, starting at 0 for DEVICE_HORI_POS_LEFT.
 */
enum device_physical_location_horizontal_position {
        DEVICE_HORI_POS_LEFT = 0,
        DEVICE_HORI_POS_CENTER = 1,
        DEVICE_HORI_POS_RIGHT = 2,
};

/**
 * struct device_physical_location - Device data related to the physical
 * location of the device connection point.
 * @panel: Panel surface of the system's housing that the device connection
 *         point resides on.
 * @vertical_position: Vertical position of the device connection point within
 *                     the panel.
 * @horizontal_position: Horizontal position of the device connection point
 *                       within the panel.
 * @dock: Set if the device connection point resides in a docking station or
 *        port replicator.
 * @lid: Set if the device connection point resides on the lid of a laptop
 *       system.
 */
struct device_physical_location {
        enum device_physical_location_panel panel;
        enum device_physical_location_vertical_position vertical_position;
        enum device_physical_location_horizontal_position horizontal_position;
        bool dock;
        bool lid;
};

/**
 * struct device - The basic device structure
 * @parent:        The device's "parent" device, the device to which it is attached.
 *                 In most cases, a parent device is some sort of bus or host
 *                 controller. If parent is NULL, the device is a top-level device,
 *                 which is not usually what you want.
 * @p:                Holds the private data of the driver core portions of the device.
 *                 See the comment of the struct device_private for detail.
 * @kobj:        A top-level, abstract class from which other classes are derived.
 * @init_name:        Initial name of the device.
 * @type:        The type of device.
 *                 This identifies the device type and carries type-specific
 *                 information.
 * @mutex:        Mutex to synchronize calls to its driver.
 * @bus:        Type of bus device is on.
 * @driver:        Which driver has allocated this device.
 * @platform_data: Platform data specific to the device.
 *                 Example: For devices on custom boards, as typical of embedded
 *                 and SOC based hardware, Linux often uses platform_data to point
 *                 to board-specific structures describing devices and how they
 *                 are wired.  That can include what ports are available, chip
 *                 variants, which GPIO pins act in what additional roles, and so
 *                 on.  This shrinks the "Board Support Packages" (BSPs) and
 *                 minimizes board-specific #ifdefs in drivers.
 * @driver_data: Private pointer for driver specific info.
 * @links:        Links to suppliers and consumers of this device.
 * @power:        For device power management.
 *                See Documentation/driver-api/pm/devices.rst for details.
 * @pm_domain:        Provide callbacks that are executed during system suspend,
 *                 hibernation, system resume and during runtime PM transitions
 *                 along with subsystem-level and driver-level callbacks.
 * @em_pd:        device's energy model performance domain
 * @pins:        For device pin management.
 *                See Documentation/driver-api/pin-control.rst for details.
 * @msi:        MSI related data
 * @numa_node:        NUMA node this device is close to.
 * @dma_ops:    DMA mapping operations for this device.
 * @dma_mask:        Dma mask (if dma'ble device).
 * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all
 *                 hardware supports 64-bit addresses for consistent allocations
 *                 such as descriptors.
 * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller
 *                DMA limit than the device itself supports.
 * @dma_range_map: map for DMA memory ranges relative to that of RAM
 * @dma_parms:        A low level driver may set these to teach IOMMU code about
 *                 segment limitations.
 * @dma_pools:        Dma pools (if dma'ble device).
 * @dma_mem:        Internal for coherent mem override.
 * @cma_area:        Contiguous memory area for dma allocations
 * @dma_io_tlb_mem: Software IO TLB allocator.  Not for driver use.
 * @dma_io_tlb_pools:        List of transient swiotlb memory pools.
 * @dma_io_tlb_lock:        Protects changes to the list of active pools.
 * @dma_uses_io_tlb: %true if device has used the software IO TLB.
 * @archdata:        For arch-specific additions.
 * @of_node:        Associated device tree node.
 * @fwnode:        Associated device node supplied by platform firmware.
 * @devt:        For creating the sysfs "dev".
 * @id:                device instance
 * @devres_lock: Spinlock to protect the resource of the device.
 * @devres_head: The resources list of the device.
 * @class:        The class of the device.
 * @groups:        Optional attribute groups.
 * @release:        Callback to free the device after all references have
 *                 gone away. This should be set by the allocator of the
 *                 device (i.e. the bus driver that discovered the device).
 * @iommu_group: IOMMU group the device belongs to.
 * @iommu:        Per device generic IOMMU runtime data
 * @physical_location: Describes physical location of the device connection
 *                point in the system housing.
 * @removable:  Whether the device can be removed from the system. This
 *              should be set by the subsystem / bus driver that discovered
 *              the device.
 *
 * @offline_disabled: If set, the device is permanently online.
 * @offline:        Set after successful invocation of bus type's .offline().
 * @of_node_reused: Set if the device-tree node is shared with an ancestor
 *              device.
 * @state_synced: The hardware state of this device has been synced to match
 *                  the software state of this device by calling the driver/bus
 *                  sync_state() callback.
 * @can_match:        The device has matched with a driver at least once or it is in
 *                a bus (like AMBA) which can't check for matching drivers until
 *                other devices probe successfully.
 * @dma_coherent: this particular device is dma coherent, even if the
 *                architecture supports non-coherent devices.
 * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the
 *                streaming DMA operations (->map_* / ->unmap_* / ->sync_*),
 *                and optionally (if the coherent mask is large enough) also
 *                for dma allocations.  This flag is managed by the dma ops
 *                instance from ->dma_supported.
 * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers.
 * @dma_iommu: Device is using default IOMMU implementation for DMA and
 *                doesn't rely on dma_ops structure.
 *
 * At the lowest level, every device in a Linux system is represented by an
 * instance of struct device. The device structure contains the information
 * that the device model core needs to model the system. Most subsystems,
 * however, track additional information about the devices they host. As a
 * result, it is rare for devices to be represented by bare device structures;
 * instead, that structure, like kobject structures, is usually embedded within
 * a higher-level representation of the device.
 */
struct device {
        struct kobject kobj;
        struct device                *parent;

        struct device_private        *p;

        const char                *init_name; /* initial name of the device */
        const struct device_type *type;

        const struct bus_type        *bus;        /* type of bus device is on */
        struct device_driver *driver;        /* which driver has allocated this
                                           device */
        void                *platform_data;        /* Platform specific data, device
                                           core doesn't touch it */
        void                *driver_data;        /* Driver data, set and get with
                                           dev_set_drvdata/dev_get_drvdata */
        struct mutex                mutex;        /* mutex to synchronize calls to
                                         * its driver.
                                         */

        struct dev_links_info        links;
        struct dev_pm_info        power;
        struct dev_pm_domain        *pm_domain;

#ifdef CONFIG_ENERGY_MODEL
        struct em_perf_domain        *em_pd;
#endif

#ifdef CONFIG_PINCTRL
        struct dev_pin_info        *pins;
#endif
        struct dev_msi_info        msi;
#ifdef CONFIG_ARCH_HAS_DMA_OPS
        const struct dma_map_ops *dma_ops;
#endif
        u64                *dma_mask;        /* dma mask (if dma'able device) */
        u64                coherent_dma_mask;/* Like dma_mask, but for
                                             alloc_coherent mappings as
                                             not all hardware supports
                                             64 bit addresses for consistent
                                             allocations such as descriptors. */
        u64                bus_dma_limit;        /* upstream dma constraint */
        const struct bus_dma_region *dma_range_map;

        struct device_dma_parameters *dma_parms;

        struct list_head        dma_pools;        /* dma pools (if dma'ble) */

#ifdef CONFIG_DMA_DECLARE_COHERENT
        struct dma_coherent_mem        *dma_mem; /* internal for coherent mem
                                             override */
#endif
#ifdef CONFIG_DMA_CMA
        struct cma *cma_area;                /* contiguous memory area for dma
                                           allocations */
#endif
#ifdef CONFIG_SWIOTLB
        struct io_tlb_mem *dma_io_tlb_mem;
#endif
#ifdef CONFIG_SWIOTLB_DYNAMIC
        struct list_head dma_io_tlb_pools;
        spinlock_t dma_io_tlb_lock;
        bool dma_uses_io_tlb;
#endif
        /* arch specific additions */
        struct dev_archdata        archdata;

        struct device_node        *of_node; /* associated device tree node */
        struct fwnode_handle        *fwnode; /* firmware device node */

#ifdef CONFIG_NUMA
        int                numa_node;        /* NUMA node this device is close to */
#endif
        dev_t                        devt;        /* dev_t, creates the sysfs "dev" */
        u32                        id;        /* device instance */

        spinlock_t                devres_lock;
        struct list_head        devres_head;

        const struct class        *class;
        const struct attribute_group **groups;        /* optional groups */

        void        (*release)(struct device *dev);
        struct iommu_group        *iommu_group;
        struct dev_iommu        *iommu;

        struct device_physical_location *physical_location;

        enum device_removable        removable;

        /* Single-bit state flags; the DMA-related ones exist only under their config options. */
        bool                        offline_disabled:1;
        bool                        offline:1;
        bool                        of_node_reused:1;
        bool                        state_synced:1;
        bool                        can_match:1;
#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \
    defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL)
        bool                        dma_coherent:1;
#endif
#ifdef CONFIG_DMA_OPS_BYPASS
        bool                        dma_ops_bypass : 1;
#endif
#ifdef CONFIG_DMA_NEED_SYNC
        bool                        dma_skip_sync:1;
#endif
#ifdef CONFIG_IOMMU_DMA
        bool                        dma_iommu:1;
#endif
};

/**
 * struct device_link - Device link representation.
 * @supplier: The device on the supplier end of the link.
 * @s_node: Hook to the supplier device's list of links to consumers.
 * @consumer: The device on the consumer end of the link.
 * @c_node: Hook to the consumer device's list of links to suppliers.
 * @link_dev: Device used to expose link details in sysfs.
 * @status: The state of the link (with respect to the presence of drivers).
 * @flags: Link flags.
 * @rpm_active: Whether or not the consumer device is runtime-PM-active.
 * @kref: Counts repeated additions of the same link.
 * @rm_work: Work structure used for removing the link.
 * @supplier_preactivated: Supplier has been made active before consumer probe.
 *
 * Note that @link_dev is a full struct device embedded by value, not a
 * pointer.
 */
struct device_link {
        struct device *supplier;
        struct list_head s_node;
        struct device *consumer;
        struct list_head c_node;
        struct device link_dev;
        enum device_link_state status;
        u32 flags;
        refcount_t rpm_active;
        struct kref kref;
        struct work_struct rm_work;
        bool supplier_preactivated; /* Owned by consumer probe. */
};

/*
 * kobj_to_dev - map a kobject that is the kobj member of a struct device back
 * to that containing struct device, using container_of_const().
 */
#define kobj_to_dev(__kobj)        container_of_const(__kobj, struct device, kobj)

/**
 * device_iommu_mapped - Tell whether the device DMA is translated by an IOMMU
 * @dev: Device to perform the check on
 *
 * Return: %true when @dev has an IOMMU group assigned, %false otherwise.
 */
static inline bool device_iommu_mapped(struct device *dev)
{
        if (dev->iommu_group)
                return true;

        return false;
}

/* Get the wakeup routines, which depend on struct device */
#include <linux/pm_wakeup.h>

/**
 * dev_name - Return a device's name.
 * @dev: Device with name to get.
 *
 * Return: The device's initial name while one is set, otherwise the name of
 * its kobject.
 */
static inline const char *dev_name(const struct device *dev)
{
        /* The init name takes precedence until the kobject becomes available */
        return dev->init_name ? dev->init_name : kobject_name(&dev->kobj);
}

/**
 * dev_bus_name - Return a device's bus/class name, if at all possible
 * @dev: struct device to get the bus/class name of
 *
 * The bus name is preferred; the class name is used only when the device has
 * no bus.
 *
 * Return: The name of the bus or class @dev is attached to, or an empty
 * string when it is attached to neither.
 */
static inline const char *dev_bus_name(const struct device *dev)
{
        if (dev->bus)
                return dev->bus->name;

        if (dev->class)
                return dev->class->name;

        return "";
}

__printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...);

/*
 * dev_to_node - return the NUMA node recorded for @dev, or NUMA_NO_NODE when
 * the kernel is built without CONFIG_NUMA.
 */
static inline int dev_to_node(struct device *dev)
{
#ifdef CONFIG_NUMA
        return dev->numa_node;
#else
        return NUMA_NO_NODE;
#endif
}

/*
 * set_dev_node - record @node as the NUMA node of @dev; does nothing when the
 * kernel is built without CONFIG_NUMA.
 */
static inline void set_dev_node(struct device *dev, int node)
{
#ifdef CONFIG_NUMA
        dev->numa_node = node;
#endif
}

/*
 * dev_get_msi_domain - return the MSI irq domain stored in @dev, or NULL when
 * CONFIG_GENERIC_MSI_IRQ is not enabled.
 */
static inline struct irq_domain *dev_get_msi_domain(const struct device *dev)
{
        struct irq_domain *domain = NULL;

#ifdef CONFIG_GENERIC_MSI_IRQ
        domain = dev->msi.domain;
#endif
        return domain;
}

/*
 * dev_set_msi_domain - store @d as the MSI irq domain of @dev; does nothing
 * when CONFIG_GENERIC_MSI_IRQ is not enabled.
 */
static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d)
{
#ifdef CONFIG_GENERIC_MSI_IRQ
        struct dev_msi_info *msi = &dev->msi;

        msi->domain = d;
#endif
}

/* dev_get_drvdata - return the driver_data pointer stored in @dev. */
static inline void *dev_get_drvdata(const struct device *dev)
{
        void *drvdata = dev->driver_data;

        return drvdata;
}

/* dev_set_drvdata - store @data as the driver_data pointer of @dev. */
static inline void dev_set_drvdata(struct device *dev, void *data)
{
        void **slot = &dev->driver_data;

        *slot = data;
}

/*
 * dev_to_psd - return the PM subsystem data of @dev (power.subsys_data), or
 * NULL when @dev itself is NULL.
 */
static inline struct pm_subsys_data *dev_to_psd(struct device *dev)
{
        if (!dev)
                return NULL;

        return dev->power.subsys_data;
}

static inline unsigned int dev_get_uevent_suppress(const struct device *dev)
{
        return dev->kobj.uevent_suppress;
}

static inline void dev_set_uevent_suppress(struct device *dev, int val)
{
        dev->kobj.uevent_suppress = val;
}

static inline int device_is_registered(struct device *dev)
{
        return dev->kobj.state_in_sysfs;
}

/*
 * device_enable_async_suspend - set power.async_suspend for @dev.
 *
 * The request is ignored while power.is_prepared is set.
 */
static inline void device_enable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = true;
}

/*
 * device_disable_async_suspend - clear power.async_suspend for @dev.
 *
 * The request is ignored while power.is_prepared is set.
 */
static inline void device_disable_async_suspend(struct device *dev)
{
        if (dev->power.is_prepared)
                return;

        dev->power.async_suspend = false;
}

/* device_async_suspend_enabled - report whether power.async_suspend is set for @dev. */
static inline bool device_async_suspend_enabled(struct device *dev)
{
        return dev->power.async_suspend ? true : false;
}

static inline bool device_pm_not_required(struct device *dev)
{
        return dev->power.no_pm;
}

static inline void device_set_pm_not_required(struct device *dev)
{
        dev->power.no_pm = true;
}

/*
 * dev_pm_syscore_device - store @val in power.syscore of @dev; does nothing
 * when CONFIG_PM_SLEEP is not enabled.
 */
static inline void dev_pm_syscore_device(struct device *dev, bool val)
{
#ifdef CONFIG_PM_SLEEP
        struct dev_pm_info *pm = &dev->power;

        pm->syscore = val;
#endif
}

/* dev_pm_set_driver_flags - overwrite power.driver_flags of @dev with @flags. */
static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags)
{
        struct dev_pm_info *pm = &dev->power;

        pm->driver_flags = flags;
}

/*
 * dev_pm_test_driver_flags - report whether any bit of @flags is set in
 * power.driver_flags of @dev.
 */
static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags)
{
        return (dev->power.driver_flags & flags) != 0;
}

/*
 * dev_pm_smart_suspend - return power.smart_suspend of @dev, or false when
 * CONFIG_PM_SLEEP is not enabled.
 */
static inline bool dev_pm_smart_suspend(struct device *dev)
{
        bool smart = false;

#ifdef CONFIG_PM_SLEEP
        smart = dev->power.smart_suspend;
#endif
        return smart;
}

/* device_lock - acquire @dev's mutex with mutex_lock(). */
static inline void device_lock(struct device *dev)
{
        struct mutex *lock = &dev->mutex;

        mutex_lock(lock);
}

/*
 * device_lock_interruptible - acquire @dev's mutex with
 * mutex_lock_interruptible() and pass its return value through.
 */
static inline int device_lock_interruptible(struct device *dev)
{
        int ret = mutex_lock_interruptible(&dev->mutex);

        return ret;
}

/*
 * device_trylock - try to acquire @dev's mutex with mutex_trylock() and pass
 * its return value through.
 */
static inline int device_trylock(struct device *dev)
{
        int acquired = mutex_trylock(&dev->mutex);

        return acquired;
}

/* device_unlock - release @dev's mutex with mutex_unlock(). */
static inline void device_unlock(struct device *dev)
{
        struct mutex *lock = &dev->mutex;

        mutex_unlock(lock);
}

/*
 * Guard class "device" over a struct device pointer: the acquire expression
 * is device_lock() and the release expression is device_unlock().
 * NOTE(review): presumably consumed as guard(device)(dev) via
 * <linux/cleanup.h> - confirm against that header.
 */
DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T))

/* device_lock_assert - lockdep assertion that @dev's mutex is held. */
static inline void device_lock_assert(struct device *dev)
{
        struct mutex *lock = &dev->mutex;

        lockdep_assert_held(lock);
}

/*
 * dev_has_sync_state - report whether a sync_state() callback is available
 * for @dev, either from its driver or from its bus.
 *
 * A NULL @dev yields false.
 */
static inline bool dev_has_sync_state(struct device *dev)
{
        if (!dev)
                return false;

        /* The driver callback is checked before the bus callback. */
        return (dev->driver && dev->driver->sync_state) ||
               (dev->bus && dev->bus->sync_state);
}

static inline void dev_set_removable(struct device *dev,
                                     enum device_removable removable)
{
        dev->removable = removable;
}

static inline bool dev_is_removable(struct device *dev)
{
        return dev->removable == DEVICE_REMOVABLE;
}

static inline bool dev_removable_is_valid(struct device *dev)
{
        return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED;
}

/*
 * High level routines for use by the bus drivers
 */
int __must_check device_register(struct device *dev);
void device_unregister(struct device *dev);
void device_initialize(struct device *dev);
int __must_check device_add(struct device *dev);
void device_del(struct device *dev);

/*
 * Cleanup action "device_del" for a struct device pointer: calls device_del()
 * on the pointer when it is non-NULL.
 * NOTE(review): presumably used through __free(device_del) from
 * <linux/cleanup.h> - confirm against that header.
 */
DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T))

int device_for_each_child(struct device *parent, void *data,
                          device_iter_t fn);
int device_for_each_child_reverse(struct device *parent, void *data,
                                  device_iter_t fn);
int device_for_each_child_reverse_from(struct device *parent,
                                       struct device *from, void *data,
                                       device_iter_t fn);
struct device *device_find_child(struct device *parent, const void *data,
                                 device_match_t match);
/**
 * device_find_child_by_name - device iterator for locating a child device.
 * @parent: parent struct device
 * @name: name of the child device
 *
 * Wrapper around device_find_child() that uses device_match_name as the
 * match callback and @name as the match data, so it returns a reference to
 * a device that has the name @name.
 *
 * NOTE: you will need to drop the reference with put_device() after use.
 */
static inline struct device *device_find_child_by_name(struct device *parent,
                                                       const char *name)
{
        const void *match_data = name;

        return device_find_child(parent, match_data, device_match_name);
}

/**
 * device_find_any_child - device iterator for locating a child device, if any.
 * @parent: parent struct device
 *
 * Wrapper around device_find_child() that uses device_match_any as the match
 * callback with no match data, so it returns a reference to a child device,
 * if any.
 *
 * NOTE: you will need to drop the reference with put_device() after use.
 */
static inline struct device *device_find_any_child(struct device *parent)
{
        struct device *child = device_find_child(parent, NULL, device_match_any);

        return child;
}

int device_rename(struct device *dev, const char *new_name);
int device_move(struct device *dev, struct device *new_parent,
                enum dpm_order dpm_order);
int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid);

/*
 * device_supports_offline - true when @dev sits on a bus that provides both
 * the offline and the online callback.
 */
static inline bool device_supports_offline(struct device *dev)
{
        if (!dev->bus)
                return false;

        return dev->bus->offline && dev->bus->online;
}

/*
 * __device_lock_set_class - call lock_set_class() on the lockdep map of
 * @dev's mutex with the given @name and @key (subclass 0, _THIS_IP_).
 *
 * @dev is evaluated once into the local __d2.  The local is deliberately not
 * called __d: device_lock_set_class() passes its own local named __d as @dev,
 * and reusing that name here would make the initializer refer to itself.
 */
#define __device_lock_set_class(dev, name, key)                        \
do {                                                                   \
        struct device *__d2 __maybe_unused = dev;                      \
        lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \
} while (0)

/**
 * device_lock_set_class - Specify a temporary lock class while a device
 *                           is attached to a driver
 * @dev: device to modify
 * @key: lock class key data
 *
 * This must be called with the device_lock() already held, for example
 * from driver ->probe(). Take care to only override the default
 * lockdep_no_validate class.
 *
 * The stringified @key expression is used as the lock class name.  With
 * CONFIG_LOCKDEP, a one-time warning is emitted first if the device mutex
 * does not currently match the __lockdep_no_validate__ class; without it,
 * the macro forwards straight to __device_lock_set_class().
 */
#ifdef CONFIG_LOCKDEP
#define device_lock_set_class(dev, key)                                    \
do {                                                                       \
        struct device *__d = dev;                                          \
        dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex,               \
                                                &__lockdep_no_validate__), \
                 "overriding existing custom lock class\n");               \
        __device_lock_set_class(__d, #key, key);                           \
} while (0)
#else
#define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key)
#endif

/**
 * device_lock_reset_class - Return a device to the default lockdep novalidate state
 * @dev: device to modify
 *
 * This must be called with the device_lock() already held, for example
 * from driver ->remove().
 *
 * Implemented by calling lock_set_novalidate_class() on the lockdep map of
 * the device mutex, using "&dev->mutex" as the lock name.
 */
#define device_lock_reset_class(dev) \
do { \
        struct device *__d __maybe_unused = dev;                       \
        lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex",  \
                                  _THIS_IP_);                          \
} while (0)

void lock_device_hotplug(void);
void unlock_device_hotplug(void);
int lock_device_hotplug_sysfs(void);
int device_offline(struct device *dev);
int device_online(struct device *dev);

void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode);
void device_set_node(struct device *dev, struct fwnode_handle *fwnode);
int device_add_of_node(struct device *dev, struct device_node *of_node);
void device_remove_of_node(struct device *dev);
void device_set_of_node_from_dev(struct device *dev, const struct device *dev2);

/*
 * dev_of_node - return the device tree node of @dev.
 *
 * Yields NULL when CONFIG_OF is not enabled or when @dev is NULL.
 */
static inline struct device_node *dev_of_node(struct device *dev)
{
        if (IS_ENABLED(CONFIG_OF) && dev)
                return dev->of_node;

        return NULL;
}

/*
 * dev_num_vf - return the result of the bus's num_vf() callback for @dev, or
 * 0 when @dev has no bus or the bus provides no such callback.
 */
static inline int dev_num_vf(struct device *dev)
{
        if (!dev->bus || !dev->bus->num_vf)
                return 0;

        return dev->bus->num_vf(dev);
}

/*
 * Root device objects for grouping under /sys/devices
 */
struct device *__root_device_register(const char *name, struct module *owner);

/*
 * root_device_register - call __root_device_register() with THIS_MODULE as
 * the owner.
 *
 * This is a macro to avoid include problems with THIS_MODULE.
 */
#define root_device_register(name) \
        __root_device_register(name, THIS_MODULE)

void root_device_unregister(struct device *root);

/* dev_get_platdata - return the platform_data pointer stored in @dev. */
static inline void *dev_get_platdata(const struct device *dev)
{
        void *pdata = dev->platform_data;

        return pdata;
}

/*
 * Manual binding of a device to driver. See drivers/base/bus.c
 * for information on use.
 */
int __must_check device_driver_attach(const struct device_driver *drv,
                                      struct device *dev);
int __must_check device_bind_driver(struct device *dev);
void device_release_driver(struct device *dev);
int  __must_check device_attach(struct device *dev);
int __must_check driver_attach(const struct device_driver *drv);
void device_initial_probe(struct device *dev);
int __must_check device_reprobe(struct device *dev);

bool device_is_bound(struct device *dev);

/*
 * Easy functions for dynamically creating devices on the fly
 */
__printf(5, 6) struct device *
device_create(const struct class *cls, struct device *parent, dev_t devt,
              void *drvdata, const char *fmt, ...);
__printf(6, 7) struct device *
device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt,
                          void *drvdata, const struct attribute_group **groups,
                          const char *fmt, ...);
void device_destroy(const struct class *cls, dev_t devt);

int __must_check device_add_groups(struct device *dev,
                                   const struct attribute_group **groups);
void device_remove_groups(struct device *dev,
                          const struct attribute_group **groups);

/*
 * device_add_group - add the single attribute group @grp to @dev.
 *
 * Wraps @grp in a NULL-terminated two-entry array and hands it to
 * device_add_groups(), whose return value is passed through.
 */
static inline int __must_check device_add_group(struct device *dev,
                                        const struct attribute_group *grp)
{
        const struct attribute_group *groups[2];

        groups[0] = grp;
        groups[1] = NULL;

        return device_add_groups(dev, groups);
}

/*
 * device_remove_group - remove the single attribute group @grp from @dev.
 *
 * Wraps @grp in a NULL-terminated two-entry array and hands it to
 * device_remove_groups().
 */
static inline void device_remove_group(struct device *dev,
                                       const struct attribute_group *grp)
{
        const struct attribute_group *groups[2];

        groups[0] = grp;
        groups[1] = NULL;

        device_remove_groups(dev, groups);
}

int __must_check devm_device_add_group(struct device *dev,
                                       const struct attribute_group *grp);

/*
 * get_device - atomically increment the reference count for the device.
 *
 */
struct device *get_device(struct device *dev);
void put_device(struct device *dev);

/*
 * Cleanup action "put_device" for a struct device pointer: calls put_device()
 * on the pointer when it is non-NULL.
 * NOTE(review): presumably used through __free(put_device) from
 * <linux/cleanup.h> - confirm against that header.
 */
DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T))

bool kill_device(struct device *dev);

#ifdef CONFIG_DEVTMPFS
int devtmpfs_mount(void);
#else
/* Stub used when CONFIG_DEVTMPFS is not enabled: always returns 0. */
static inline int devtmpfs_mount(void)
{
        return 0;
}
#endif

/* drivers/base/power/shutdown.c */
void device_shutdown(void);

/* debugging and troubleshooting/diagnostic helpers. */
const char *dev_driver_string(const struct device *dev);

/* Device links interface. */
struct device_link *device_link_add(struct device *consumer,
                                    struct device *supplier, u32 flags);
void device_link_del(struct device_link *link);
void device_link_remove(void *consumer, struct device *supplier);
void device_links_supplier_sync_state_pause(void);
void device_links_supplier_sync_state_resume(void);
void device_link_wait_removal(void);

/*
 * Create a module alias so the module can be autoloaded.
 *
 * MODULE_ALIAS_CHARDEV() builds the alias string "char-major-<major>-<minor>";
 * MODULE_ALIAS_CHARDEV_MAJOR() builds "char-major-<major>-*".
 */
#define MODULE_ALIAS_CHARDEV(major,minor) \
        MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_CHARDEV_MAJOR(major) \
        MODULE_ALIAS("char-major-" __stringify(major) "-*")

#endif /* _DEVICE_H_ */






















































































































































































































































































































































































































   25 

   26 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010                SUSE Linux Products GmbH
 * Copyright (C) 2010                Tejun Heo <tj@kernel.org>
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */
#include "internal.h"

/*
 * pcpu_chunk_page - look up the struct page behind page @page_idx of @cpu's
 * unit in @chunk, by translating the chunk address with vmalloc_to_page().
 *
 * Warns when called on an immutable chunk.
 */
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
                                    unsigned int cpu, int page_idx)
{
        void *kaddr;

        /* must not be used on pre-mapped chunk */
        WARN_ON(chunk->immutable);

        kaddr = (void *)pcpu_chunk_addr(chunk, cpu, page_idx);

        return vmalloc_to_page(kaddr);
}

/**
 * pcpu_get_pages - get temp pages array
 *
 * Hands out the single, lazily allocated array of struct page pointers that
 * can be indexed with pcpu_page_idx().  The array is allocated on first use
 * and kept in a function-local static, so accesses must be serialized by
 * pcpu_alloc_mutex (asserted via lockdep).
 *
 * RETURNS:
 * Pointer to temp pages array on success.  If the first-use allocation
 * fails, the result of pcpu_mem_zalloc() is returned as is.
 */
static struct page **pcpu_get_pages(void)
{
        static struct page **pages;

        lockdep_assert_held(&pcpu_alloc_mutex);

        if (pages)
                return pages;

        /* one slot per page of every unit */
        pages = pcpu_mem_zalloc(pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]),
                                GFP_KERNEL);
        return pages;
}

/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Free pages [@page_start,@page_end) in @pages for all units.
 * The pages were allocated for @chunk.  NULL entries in @pages are skipped.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
                            struct page **pages, int page_start, int page_end)
{
        unsigned int cpu;
        int idx;

        for_each_possible_cpu(cpu) {
                for (idx = page_start; idx < page_end; idx++) {
                        struct page *page = pages[pcpu_page_idx(cpu, idx)];

                        if (!page)
                                continue;

                        __free_page(page);
                }
        }
}

/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 * @gfp: allocation flags passed to the underlying allocator
 *
 * Allocate pages [@page_start,@page_end) into @pages for all units.
 * The allocation is for @chunk.  Percpu core doesn't care about the
 * content of @pages and will pass it verbatim to pcpu_map_pages().
 *
 * __GFP_HIGHMEM is always added to @gfp, and each page is requested from
 * the node of the cpu it is allocated for.
 *
 * RETURNS:
 * 0 on success, -ENOMEM if any allocation failed; in that case every page
 * allocated by this call has been freed again.
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
                            struct page **pages, int page_start, int page_end,
                            gfp_t gfp)
{
        unsigned int cpu, tcpu;
        int i;

        gfp |= __GFP_HIGHMEM;

        for_each_possible_cpu(cpu) {
                for (i = page_start; i < page_end; i++) {
                        struct page *page;

                        page = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
                        pages[pcpu_page_idx(cpu, i)] = page;
                        if (!page)
                                goto err;
                }
        }
        return 0;

err:
        /* first release what the failing cpu already got ... */
        for (i--; i >= page_start; i--)
                __free_page(pages[pcpu_page_idx(cpu, i)]);

        /* ... then the full range of every cpu handled before it */
        for_each_possible_cpu(tcpu) {
                if (tcpu == cpu)
                        break;
                for (i = page_start; i < page_end; i++)
                        __free_page(pages[pcpu_page_idx(tcpu, i)]);
        }
        return -ENOMEM;
}

/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  Flush cache.  As each flushing trial can be very
 * expensive, issue flush on the whole region at once rather than
 * doing it for each cpu.  This could be an overkill but is more
 * scalable.
 *
 * The single flush runs from the address of @page_start in the unit of
 * pcpu_low_unit_cpu up to the address of @page_end in the unit of
 * pcpu_high_unit_cpu.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
                                 int page_start, int page_end)
{
        /* one call for the whole span instead of one per cpu */
        flush_cache_vunmap(
                pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
                pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

/*
 * Unmap @nr_pages pages starting at virtual address @addr through
 * vunmap_range_noflush().  The end address is @addr plus
 * @nr_pages << PAGE_SHIFT bytes.  No flush is issued here; see
 * pcpu_pre_unmap_flush() and pcpu_post_unmap_tlb_flush().
 */
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
        vunmap_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT));
}

/**
 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array which can be used to pass information to free
 * @page_start: page index of the first page to unmap
 * @page_end: page index of the last page to unmap + 1
 *
 * For each cpu, look up the page backing every index in
 * [@page_start,@page_end) of @chunk, record it in @pages (so that
 * pcpu_free_pages() can release it once all unmaps are finished) and
 * then unmap that cpu's range.  A missing page triggers a warning.
 * The caller should call proper pre/post flush functions.
 */
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
                             struct page **pages, int page_start, int page_end)
{
        unsigned int cpu;
        int idx;

        for_each_possible_cpu(cpu) {
                /* remember what is mapped before tearing the mapping down */
                for (idx = page_start; idx < page_end; idx++) {
                        struct page *pg = pcpu_chunk_page(chunk, cpu, idx);

                        WARN_ON(!pg);
                        pages[pcpu_page_idx(cpu, idx)] = pg;
                }

                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
                                   page_end - page_start);
        }
}

/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region: from @page_start in the unit of
 * pcpu_low_unit_cpu to @page_end in the unit of pcpu_high_unit_cpu.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                                      int page_start, int page_end)
{
        /* single kernel-range TLB flush covering every unit */
        flush_tlb_kernel_range(
                pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
                pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

/*
 * Map @nr_pages pages from @pages at virtual address @addr with
 * PAGE_KERNEL protection through vmap_pages_range_noflush().  Returns
 * whatever that call returns; pcpu_map_pages() treats a negative value
 * as failure.
 */
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
                            int nr_pages)
{
        return vmap_pages_range_noflush(addr, addr + (nr_pages << PAGE_SHIFT),
                                        PAGE_KERNEL, pages, PAGE_SHIFT);
}

/**
 * pcpu_map_pages - map pages into a pcpu_chunk
 * @chunk: chunk of interest
 * @pages: pages array containing pages to be mapped
 * @page_start: page index of the first page to map
 * @page_end: page index of the last page to map + 1
 *
 * For each cpu, map pages [@page_start,@page_end) into @chunk and tag
 * every mapped page with @chunk, which provides the reverse lookup
 * (addr -> chunk).  The caller is responsible for calling
 * pcpu_post_map_flush() after all mappings are complete.
 *
 * RETURNS:
 * 0 on success.  If mapping fails for some cpu, the range is unmapped
 * again for every cpu up to and including the failing one, the TLB is
 * flushed and the negative error code is returned.
 */
static int pcpu_map_pages(struct pcpu_chunk *chunk,
                          struct page **pages, int page_start, int page_end)
{
        unsigned int cpu, undo_cpu;
        int idx, ret;

        for_each_possible_cpu(cpu) {
                ret = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
                                       &pages[pcpu_page_idx(cpu, page_start)],
                                       page_end - page_start);
                if (ret < 0)
                        goto err_unmap;

                for (idx = page_start; idx < page_end; idx++)
                        pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, idx)],
                                            chunk);
        }
        return 0;

err_unmap:
        /* the failing cpu is unmapped as well before the walk stops */
        for_each_possible_cpu(undo_cpu) {
                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, undo_cpu, page_start),
                                   page_end - page_start);
                if (undo_cpu == cpu)
                        break;
        }
        pcpu_post_unmap_tlb_flush(chunk, page_start, page_end);
        return ret;
}

/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
                                int page_start, int page_end)
{
        /* one call from pcpu_low_unit_cpu's start to pcpu_high_unit_cpu's end */
        flush_cache_vmap(
                pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
                pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}

/**
 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
 * @chunk: chunk of interest
 * @page_start: the start page
 * @page_end: the end page
 * @gfp: allocation flags passed to the underlying memory allocator
 *
 * For each cpu, allocate pages for [@page_start,@page_end), map them
 * into @chunk and flush the cache for the new mapping.
 *
 * CONTEXT:
 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 *
 * RETURNS:
 * 0 on success, -ENOMEM if the temp pages array, the page allocation or
 * the mapping failed.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp)
{
        struct page **pages = pcpu_get_pages();

        if (!pages)
                return -ENOMEM;

        if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
                return -ENOMEM;

        if (!pcpu_map_pages(chunk, pages, page_start, page_end)) {
                pcpu_post_map_flush(chunk, page_start, page_end);
                return 0;
        }

        /* mapping failed: give the freshly allocated pages back */
        pcpu_free_pages(chunk, pages, page_start, page_end);
        return -ENOMEM;
}

/**
 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
 * @chunk: chunk to depopulate
 * @page_start: the start page
 * @page_end: the end page
 *
 * For each cpu, unmap pages [@page_start,@page_end) from @chunk and
 * free them.  The cache is flushed before the unmap.
 *
 * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the
 * region back to vmalloc() which will lazily flush the tlb.
 *
 * CONTEXT:
 * pcpu_alloc_mutex.
 */
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end)
{
        /*
         * Getting here implies at least one earlier population attempt
         * succeeded, so the temp pages array has to exist already.
         */
        struct page **pages = pcpu_get_pages();

        BUG_ON(!pages);

        /* flush, unmap (which fills @pages), then free */
        pcpu_pre_unmap_flush(chunk, page_start, page_end);
        pcpu_unmap_pages(chunk, pages, page_start, page_end);
        pcpu_free_pages(chunk, pages, page_start, page_end);
}

/*
 * Allocate a chunk and the vm areas backing it.  On success the vm area
 * array is stored in chunk->data and chunk->base_addr is derived from the
 * first area's address minus pcpu_group_offsets[0].  Returns the new chunk,
 * or NULL if either allocation fails.
 */
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk = pcpu_alloc_chunk(gfp);
        struct vm_struct **vms;

        if (!chunk)
                return NULL;

        vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
                                pcpu_nr_groups, pcpu_atom_size);
        if (!vms)
                goto err_free_chunk;

        chunk->data = vms;
        chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(chunk->base_addr);

        return chunk;

err_free_chunk:
        pcpu_free_chunk(chunk);
        return NULL;
}

/*
 * Tear down @chunk: account and trace the destruction, release its vm
 * areas if it has any, then free the chunk itself.  A NULL @chunk is a
 * no-op.
 */
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
{
        struct vm_struct **vms;

        if (!chunk)
                return;

        pcpu_stats_chunk_dealloc();
        trace_percpu_destroy_chunk(chunk->base_addr);

        vms = chunk->data;
        if (vms)
                pcpu_free_vm_areas(vms, pcpu_nr_groups);

        pcpu_free_chunk(chunk);
}

/* Translate @addr to its struct page by delegating to vmalloc_to_page(). */
static struct page *pcpu_addr_to_page(void *addr)
{
        return vmalloc_to_page(addr);
}

/*
 * Validate @ai for this chunk implementation.  Nothing is checked here,
 * so the function always returns 0 and @ai is unused.
 */
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
{
        /* no extra restriction */
        return 0;
}

/**
 * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
 * @chunk: chunk of interest
 *
 * This is the entry point for percpu reclaim.  If a chunk qualifies, it is then
 * isolated and managed in separate lists at the back of pcpu_slot: sidelined
 * and to_depopulate respectively.  The to_depopulate list holds chunks slated
 * for depopulation.  They no longer contribute to pcpu_nr_empty_pop_pages once
 * they are on this list.  Once depopulated, they are moved onto the sidelined
 * list which enables them to be pulled back in for allocation if no other chunk
 * can suffice the allocation.
 */
static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
{
        /* do not reclaim either the first chunk or reserved chunk */
        if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
                return false;

        /*
         * If it is isolated, it may be on the sidelined list so move it back to
         * the to_depopulate list.  If we hit at least 1/4 pages empty pages AND
         * there is no system-wide shortage of empty pages aside from this
         * chunk, move it to the to_depopulate list.
         */
        return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
                (pcpu_nr_empty_pop_pages >
                 (PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages) &&
                 chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
}











































































































   20 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Statically sized hash table implementation
 * (C) 2012  Sasha Levin <levinsasha928@gmail.com>
 */

#ifndef _LINUX_HASHTABLE_H
#define _LINUX_HASHTABLE_H

#include <linux/list.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/rculist.h>

/*
 * Define hashtable @name as an array of 1 << @bits hlist heads, each one
 * statically initialized with HLIST_HEAD_INIT.
 */
#define DEFINE_HASHTABLE(name, bits)                                                \
        struct hlist_head name[1 << (bits)] =                                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/* Same as DEFINE_HASHTABLE(), with the array annotated __read_mostly. */
#define DEFINE_READ_MOSTLY_HASHTABLE(name, bits)                                \
        struct hlist_head name[1 << (bits)] __read_mostly =                        \
                        { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT }

/*
 * Declare hashtable @name with 1 << @bits buckets but no initializer;
 * hash_init() can be used to initialize the buckets at run time.
 */
#define DECLARE_HASHTABLE(name, bits)                                           \
        struct hlist_head name[1 << (bits)]

/*
 * HASH_SIZE() is the number of buckets and HASH_BITS() its base-2 log.
 * Both go through ARRAY_SIZE(), so @name must be the array itself and not
 * a pointer to it.
 */
#define HASH_SIZE(name) (ARRAY_SIZE(name))
#define HASH_BITS(name) ilog2(HASH_SIZE(name))

/*
 * Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels.
 * The choice is made on sizeof(@val): values of at most 4 bytes go to
 * hash_32(), everything else to hash_long().
 */
#define hash_min(val, bits)                                                        \
        (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits))

/* Initialize the first @sz buckets of @ht as empty hlist heads. */
static inline void __hash_init(struct hlist_head *ht, unsigned int sz)
{
        unsigned int bkt = 0;

        while (bkt < sz) {
                INIT_HLIST_HEAD(&ht[bkt]);
                bkt++;
        }
}

/**
 * hash_init - initialize a hash table
 * @hashtable: hashtable to be initialized
 *
 * Calculates the size of the hashtable from the given parameter and
 * initializes every bucket through __hash_init().
 *
 * This has to be a macro since HASH_SIZE() will not work on pointers: it
 * derives the number of buckets from the array type via ARRAY_SIZE().
 */
#define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable))

/**
 * hash_add - add an object to a hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * The bucket is selected with hash_min(@key, HASH_BITS(@hashtable)) and
 * @node is added at the head of that bucket.
 */
#define hash_add(hashtable, node, key)                                                \
        hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_add_rcu - add an object to a rcu enabled hashtable
 * @hashtable: hashtable to add to
 * @node: the &struct hlist_node of the object to be added
 * @key: the key of the object to be added
 *
 * Same bucket selection as hash_add(), but the insertion is done with
 * hlist_add_head_rcu().
 */
#define hash_add_rcu(hashtable, node, key)                                        \
        hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))])

/**
 * hash_hashed - check whether an object is in any hashtable
 * @node: the &struct hlist_node of the object to be checked
 *
 * Returns the negation of hlist_unhashed(@node).
 */
static inline bool hash_hashed(struct hlist_node *node)
{
        return !hlist_unhashed(node);
}

/*
 * Return true if all of the first @sz buckets of @ht are empty, false as
 * soon as one non-empty bucket is found.
 */
static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz)
{
        unsigned int bkt = 0;

        while (bkt < sz) {
                if (!hlist_empty(&ht[bkt]))
                        return false;
                bkt++;
        }

        return true;
}

/**
 * hash_empty - check whether a hashtable is empty
 * @hashtable: hashtable to check
 *
 * This has to be a macro since HASH_SIZE() will not work on pointers: it
 * derives the number of buckets from the array type via ARRAY_SIZE().
 */
#define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable))

/**
 * hash_del - remove an object from a hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Implemented with hlist_del_init(); presumably the node is left
 * re-initialized so that hash_hashed() is false afterwards - confirm
 * against list.h.
 */
static inline void hash_del(struct hlist_node *node)
{
        hlist_del_init(node);
}

/**
 * hash_del_rcu - remove an object from a rcu enabled hashtable
 * @node: &struct hlist_node of the object to remove
 *
 * Implemented with hlist_del_init_rcu().
 */
static inline void hash_del_rcu(struct hlist_node *node)
{
        hlist_del_init_rcu(node);
}

/**
 * hash_for_each - iterate over a hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * The outer loop only advances to the next bucket while @obj is NULL.
 * NOTE(review): this presumably relies on hlist_for_each_entry() leaving
 * @obj NULL once a bucket is exhausted, so that a break in the loop body
 * also ends the outer loop - confirm against list.h.
 */
#define hash_for_each(name, bkt, obj, member)                                \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry(obj, &name[bkt], member)

/**
 * hash_for_each_rcu - iterate over a rcu enabled hashtable
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same bucket walk as hash_for_each(), with each bucket traversed by
 * hlist_for_each_entry_rcu().
 */
#define hash_for_each_rcu(name, bkt, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_rcu(obj, &name[bkt], member)

/**
 * hash_for_each_safe - iterate over a hashtable safe against removal of
 * hash entry
 * @name: hashtable to iterate
 * @bkt: integer to use as bucket loop cursor
 * @tmp: a &struct hlist_node used for temporary storage
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 *
 * Same bucket walk as hash_for_each(), with each bucket traversed by
 * hlist_for_each_entry_safe().
 */
#define hash_for_each_safe(name, bkt, tmp, obj, member)                        \
        for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\
                        (bkt)++)\
                hlist_for_each_entry_safe(obj, tmp, &name[bkt], member)

/**
 * hash_for_each_possible - iterate over all possible objects hashing to the
 * same bucket
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket selected by hash_min(@key, HASH_BITS(@name)) is walked.
 * Every entry of that bucket is visited, so the loop body still has to
 * compare keys itself.
 */
#define hash_for_each_possible(name, obj, member, key)                        \
        hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_rcu - iterate over all possible objects hashing to the
 * same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 * @cond: optional extra argument(s) handed on unchanged to
 *        hlist_for_each_entry_rcu()
 */
#define hash_for_each_possible_rcu(name, obj, member, key, cond...)        \
        hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\
                member, ## cond)

/**
 * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing
 * to the same bucket in an rcu enabled hashtable
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * This is the same as hash_for_each_possible_rcu() except that it does
 * not do any RCU debugging or tracing.
 */
#define hash_for_each_possible_rcu_notrace(name, obj, member, key) \
        hlist_for_each_entry_rcu_notrace(obj, \
                &name[hash_min(key, HASH_BITS(name))], member)

/**
 * hash_for_each_possible_safe - iterate over all possible objects hashing to the
 * same bucket safe against removals
 * @name: hashtable to iterate
 * @obj: the type * to use as a loop cursor for each entry
 * @tmp: a &struct hlist_node used for temporary storage
 * @member: the name of the hlist_node within the struct
 * @key: the key of the objects to iterate over
 *
 * Only the bucket selected by hash_min(@key, HASH_BITS(@name)) is walked,
 * using hlist_for_each_entry_safe().
 */
#define hash_for_each_possible_safe(name, obj, tmp, member, key)        \
        hlist_for_each_entry_safe(obj, tmp,\
                &name[hash_min(key, HASH_BITS(name))], member)


#endif




















































































































































































  133 































   27 





















   27 



















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HIGHMEM_INTERNAL_H
#define _LINUX_HIGHMEM_INTERNAL_H

/*
 * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft.
 *
 * With CONFIG_KMAP_LOCAL the kmap_local primitives below are declared here
 * and defined elsewhere; without it only empty stubs for kmap_local_fork()
 * and kmap_assert_nomap() are provided.
 */
#ifdef CONFIG_KMAP_LOCAL
void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot);
void *__kmap_local_page_prot(struct page *page, pgprot_t prot);
void kunmap_local_indexed(const void *vaddr);
void kmap_local_fork(struct task_struct *tsk);
void __kmap_local_sched_out(void);
void __kmap_local_sched_in(void);
/* Warn via DEBUG_LOCKS_WARN_ON() if current's kmap_ctrl.idx is non-zero. */
static inline void kmap_assert_nomap(void)
{
        DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx);
}
#else
/* !CONFIG_KMAP_LOCAL: nothing to do */
static inline void kmap_local_fork(struct task_struct *tsk) { }
static inline void kmap_assert_nomap(void) { }
#endif

#ifdef CONFIG_HIGHMEM
#include <asm/highmem.h>

/* No-op default unless the architecture defines ARCH_HAS_KMAP_FLUSH_TLB. */
#ifndef ARCH_HAS_KMAP_FLUSH_TLB
static inline void kmap_flush_tlb(unsigned long addr) { }
#endif

/* Default protection for the kmap helpers unless kmap_prot is already defined. */
#ifndef kmap_prot
#define kmap_prot PAGE_KERNEL
#endif

/* Out-of-line highmem primitives used by the inline wrappers below. */
void *kmap_high(struct page *page);
void kunmap_high(struct page *page);
void __kmap_flush_unused(void);
struct page *__kmap_to_page(void *addr);

/*
 * Map @page and return its kernel virtual address.  May sleep.  Pages for
 * which PageHighMem() is false use page_address(); highmem pages go through
 * kmap_high().  kmap_flush_tlb() is called on the resulting address in both
 * cases.
 */
static inline void *kmap(struct page *page)
{
        void *vaddr;

        might_sleep();
        if (PageHighMem(page))
                vaddr = kmap_high(page);
        else
                vaddr = page_address(page);
        kmap_flush_tlb((unsigned long)vaddr);
        return vaddr;
}

/*
 * Undo kmap() for @page.  May sleep.  Only highmem pages need work; they
 * are passed to kunmap_high().
 */
static inline void kunmap(struct page *page)
{
        might_sleep();
        if (PageHighMem(page))
                kunmap_high(page);
}

/* Translate the address @addr to its struct page via __kmap_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        return __kmap_to_page(addr);
}

/* Inline wrapper around the out-of-line __kmap_flush_unused(). */
static inline void kmap_flush_unused(void)
{
        __kmap_flush_unused();
}

/* Locally map @page with the default protection kmap_prot. */
static inline void *kmap_local_page(struct page *page)
{
        return __kmap_local_page_prot(page, kmap_prot);
}

/*
 * Locally map the page of @folio that contains byte @offset and return the
 * address of that byte: the page index is @offset / PAGE_SIZE and the
 * remainder is added to the mapped address.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
        struct page *page = folio_page(folio, offset / PAGE_SIZE);
        void *base = __kmap_local_page_prot(page, kmap_prot);

        return base + offset % PAGE_SIZE;
}

/* Locally map @page with the caller-supplied protection @prot. */
static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
        return __kmap_local_page_prot(page, prot);
}

/* Locally map the page frame @pfn with the default protection kmap_prot. */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/* Backend of kunmap_local(): hands @vaddr to kunmap_local_indexed(). */
static inline void __kunmap_local(const void *vaddr)
{
        kunmap_local_indexed(vaddr);
}

/*
 * Map @page with protection @prot for atomic use.  Migration (PREEMPT_RT)
 * or preemption (otherwise) is disabled first, then pagefaults, and only
 * then is the local mapping established.
 */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();

        pagefault_disable();
        return __kmap_local_page_prot(page, prot);
}

/* kmap_atomic_prot() with the default protection kmap_prot. */
static inline void *kmap_atomic(struct page *page)
{
        return kmap_atomic_prot(page, kmap_prot);
}

/*
 * Page-frame variant of kmap_atomic(): disables migration (PREEMPT_RT) or
 * preemption (otherwise), disables pagefaults and maps @pfn with kmap_prot.
 */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();

        pagefault_disable();
        return __kmap_local_pfn_prot(pfn, kmap_prot);
}

/*
 * Backend of kunmap_atomic(): drop the local mapping of @addr, then undo
 * the mapping side's steps in reverse order - pagefaults first, then
 * migration (PREEMPT_RT) or preemption (otherwise).
 */
static inline void __kunmap_atomic(const void *addr)
{
        kunmap_local_indexed(addr);
        pagefault_enable();

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

unsigned long __nr_free_highpages(void);
unsigned long __totalhigh_pages(void);

/* Inline wrapper around the out-of-line __nr_free_highpages(). */
static inline unsigned long nr_free_highpages(void)
{
        return __nr_free_highpages();
}

/* Inline wrapper around the out-of-line __totalhigh_pages(). */
static inline unsigned long totalhigh_pages(void)
{
        return __totalhigh_pages();
}

static inline bool is_kmap_addr(const void *x)
{
        unsigned long addr = (unsigned long)x;

        return (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) ||
                (addr >= __fix_to_virt(FIX_KMAP_END) &&
                 addr < __fix_to_virt(FIX_KMAP_BEGIN));
}
#else /* CONFIG_HIGHMEM */

/* !CONFIG_HIGHMEM: resolve @addr directly with virt_to_page(). */
static inline struct page *kmap_to_page(void *addr)
{
        return virt_to_page(addr);
}

/*
 * !CONFIG_HIGHMEM: no mapping is set up; just return page_address(@page).
 * might_sleep() is kept, as in the highmem variant.
 */
static inline void *kmap(struct page *page)
{
        might_sleep();
        return page_address(page);
}

/* !CONFIG_HIGHMEM: empty stubs. */
static inline void kunmap_high(struct page *page) { }
static inline void kmap_flush_unused(void) { }

/*
 * !CONFIG_HIGHMEM: nothing to unmap.  Architectures that define
 * ARCH_HAS_FLUSH_ON_KUNMAP still get kunmap_flush_on_unmap() called on the
 * page's address.
 */
static inline void kunmap(struct page *page)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(page_address(page));
#endif
}

/* !CONFIG_HIGHMEM: the local mapping is simply page_address(@page). */
static inline void *kmap_local_page(struct page *page)
{
        return page_address(page);
}

/*
 * !CONFIG_HIGHMEM: return the address of byte @offset within @folio,
 * computed from the address of the folio's first page.
 */
static inline void *kmap_local_folio(struct folio *folio, size_t offset)
{
        void *base = page_address(&folio->page);

        return base + offset;
}

/* !CONFIG_HIGHMEM: @prot is ignored; same result as kmap_local_page(). */
static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot)
{
        return kmap_local_page(page);
}

/* !CONFIG_HIGHMEM: convert @pfn to its page and use kmap_local_page(). */
static inline void *kmap_local_pfn(unsigned long pfn)
{
        return kmap_local_page(pfn_to_page(pfn));
}

/*
 * !CONFIG_HIGHMEM backend of kunmap_local(): a no-op unless the
 * architecture defines ARCH_HAS_FLUSH_ON_KUNMAP, in which case
 * kunmap_flush_on_unmap() is called on @addr rounded down to a page
 * boundary.
 */
static inline void __kunmap_local(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
}

/*
 * !CONFIG_HIGHMEM: no mapping is needed, but the side effects of the
 * highmem variant are kept - migration (PREEMPT_RT) or preemption
 * (otherwise) and pagefaults are disabled before page_address(@page) is
 * returned.
 */
static inline void *kmap_atomic(struct page *page)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        else
                migrate_disable();

        pagefault_disable();
        return page_address(page);
}

/* !CONFIG_HIGHMEM: @prot is ignored; same as kmap_atomic(@page). */
static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
{
        return kmap_atomic(page);
}

/* !CONFIG_HIGHMEM: convert @pfn to its page and use kmap_atomic(). */
static inline void *kmap_atomic_pfn(unsigned long pfn)
{
        return kmap_atomic(pfn_to_page(pfn));
}

/*
 * !CONFIG_HIGHMEM backend of kunmap_atomic(): optionally flush (only with
 * ARCH_HAS_FLUSH_ON_KUNMAP, on the page-aligned address), then re-enable
 * pagefaults followed by migration (PREEMPT_RT) or preemption (otherwise).
 */
static inline void __kunmap_atomic(const void *addr)
{
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP
        kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif
        pagefault_enable();

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
        else
                migrate_enable();
}

/* !CONFIG_HIGHMEM: both highmem page counts are reported as 0. */
static inline unsigned long nr_free_highpages(void) { return 0; }
static inline unsigned long totalhigh_pages(void) { return 0; }

/* !CONFIG_HIGHMEM: always false, whatever @x is. */
static inline bool is_kmap_addr(const void *x)
{
        return false;
}

#endif /* CONFIG_HIGHMEM */

/**
 * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated!
 * @__addr:       Virtual address to be unmapped
 *
 * Unmaps an address previously mapped by kmap_atomic() and re-enables
 * pagefaults. Depending on the PREEMPT_RT configuration, it also re-enables
 * either migration or preemption. Users should not count on these side
 * effects.
 *
 * Mappings should be unmapped in the reverse order that they were mapped.
 * See kmap_local_page() for details on nesting.
 *
 * @__addr can be any address within the mapped page, so there is no need
 * to subtract any offset that has been added. In contrast to kunmap(),
 * this function takes the address returned from kmap_atomic(), not the
 * page passed to it. The compiler will warn you if you pass the page.
 */
#define kunmap_atomic(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_atomic(__addr);                                \
} while (0)

/**
 * kunmap_local - Unmap a page mapped via kmap_local_page().
 * @__addr: An address within the page mapped
 *
 * @__addr can be any address within the mapped page.  Commonly it is the
 * address returned from kmap_local_page(), but it can also include offsets.
 *
 * Unmapping should be done in the reverse order of the mapping.  See
 * kmap_local_page() for details.
 *
 * Passing a struct page pointer instead of an address is rejected at build
 * time by the BUILD_BUG_ON() below.
 */
#define kunmap_local(__addr)                                        \
do {                                                                \
        BUILD_BUG_ON(__same_type((__addr), struct page *));        \
        __kunmap_local(__addr);                                        \
} while (0)

#endif
























































  126 


  126 













  126 





  126 



















































































































  966 











  962 
  964 




  968 











  964 




  115 

  963 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
// SPDX-License-Identifier: GPL-2.0
#include <linux/irq_work.h>
#include <linux/spinlock.h>
#include <linux/task_work.h>
#include <linux/resume_user_mode.h>

/*
 * Sentinel for task->task_works: task_work_add() refuses to queue with
 * -ESRCH once the list head points here.
 */
static struct callback_head work_exited; /* all we need is ->next == NULL */

#ifdef CONFIG_IRQ_WORK
/*
 * irq_work callback: set TIF_NOTIFY_RESUME on whichever task is current
 * when the irq_work runs.  @entry is unused.
 */
static void task_work_set_notify_irq(struct irq_work *entry)
{
        test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
}
/* Per-CPU irq_work that task_work_add() queues for TWA_NMI_CURRENT. */
static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) =
        IRQ_WORK_INIT_HARD(task_work_set_notify_irq);
#endif

/**
 * task_work_add - ask the @task to execute @work->func()
 * @task: the task which should run the callback
 * @work: the callback to run
 * @notify: how to notify the targeted task
 *
 * Queue @work for task_work_run() below and notify the @task if @notify
 * is @TWA_RESUME, @TWA_SIGNAL, @TWA_SIGNAL_NO_IPI or @TWA_NMI_CURRENT.
 *
 * @TWA_SIGNAL works like signals, in that it will interrupt the targeted
 * task and run the task_work, regardless of whether the task is currently
 * running in the kernel or userspace.
 * @TWA_SIGNAL_NO_IPI works like @TWA_SIGNAL, except it doesn't send a
 * reschedule IPI to force the targeted task to reschedule and run task_work.
 * This can be advantageous if there's no strict requirement that the
 * task_work be run as soon as possible, just whenever the task enters the
 * kernel anyway.
 * @TWA_RESUME work is run only when the task exits the kernel and returns to
 * user mode, or before entering guest mode.
 * @TWA_NMI_CURRENT works like @TWA_RESUME, except it can only be used for the
 * current @task and if the current context is NMI.
 *
 * Fails if the @task is exiting/exited and thus it can't process this @work.
 * Otherwise @work->func() will be called when the @task goes through one of
 * the aforementioned transitions, or exits.
 *
 * If the targeted task is exiting, then an error is returned and the work item
 * is not queued. It's up to the caller to arrange for an alternative mechanism
 * in that case.
 *
 * Note: there is no ordering guarantee on works queued here. The task_work
 * list is LIFO.
 *
 * RETURNS:
 * 0 on success, -ESRCH if the @task is exiting/exited, or -EINVAL if
 * @notify is @TWA_NMI_CURRENT and either @task is not current or
 * CONFIG_IRQ_WORK is not enabled.
 */
int task_work_add(struct task_struct *task, struct callback_head *work,
                  enum task_work_notify_mode notify)
{
        struct callback_head *head;

        if (notify == TWA_NMI_CURRENT) {
                if (WARN_ON_ONCE(task != current))
                        return -EINVAL;
                if (!IS_ENABLED(CONFIG_IRQ_WORK))
                        return -EINVAL;
        } else {
                /* skipped for TWA_NMI_CURRENT */
                kasan_record_aux_stack(work);
        }

        /*
         * Lockless push onto the head of task->task_works.  When
         * try_cmpxchg() fails, the loop re-checks the sentinel and
         * re-links @work before retrying.
         */
        head = READ_ONCE(task->task_works);
        do {
                if (unlikely(head == &work_exited))
                        return -ESRCH;
                work->next = head;
        } while (!try_cmpxchg(&task->task_works, &head, work));

        /* @work is queued; now notify the task as requested */
        switch (notify) {
        case TWA_NONE:
                break;
        case TWA_RESUME:
                set_notify_resume(task);
                break;
        case TWA_SIGNAL:
                set_notify_signal(task);
                break;
        case TWA_SIGNAL_NO_IPI:
                __set_notify_signal(task);
                break;
#ifdef CONFIG_IRQ_WORK
        case TWA_NMI_CURRENT:
                irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume));
                break;
#endif
        default:
                /* unknown mode: warn once, @work stays queued */
                WARN_ON_ONCE(1);
                break;
        }

        return 0;
}

/**
 * task_work_cancel_match - cancel a pending work added by task_work_add()
 * @task: the task which should execute the work
 * @match: match function to call
 * @data: data to be passed in to match function
 *
 * Walks @task's pending list from its head with @task->pi_lock held and
 * unlinks the first work for which @match(work, @data) returns true.  At
 * most one work is removed per call.
 *
 * RETURNS:
 * The found work or NULL if not found.
 */
struct callback_head *
task_work_cancel_match(struct task_struct *task,
                       bool (*match)(struct callback_head *, void *data),
                       void *data)
{
        /* Address of the link that currently points at @work. */
        struct callback_head **pprev = &task->task_works;
        struct callback_head *work;
        unsigned long flags;

        /* Common case: no work is pending, skip taking the lock. */
        if (likely(!task_work_pending(task)))
                return NULL;
        /*
         * If cmpxchg() fails we continue without updating pprev.
         * Either we raced with task_work_add() which added the
         * new entry before this work, we will find it again. Or
         * we raced with task_work_run(), *pprev == NULL/exited.
         */
        raw_spin_lock_irqsave(&task->pi_lock, flags);
        work = READ_ONCE(*pprev);
        while (work) {
                if (!match(work, data)) {
                        /* No match: step to the next link in the list. */
                        pprev = &work->next;
                        work = READ_ONCE(*pprev);
                } else if (try_cmpxchg(pprev, &work, work->next))
                        break;
        }
        raw_spin_unlock_irqrestore(&task->pi_lock, flags);

        /* NULL here means the walk ran off the end without unlinking anything. */
        return work;
}

/* Match helper for task_work_cancel_func(): true if @cb's ->func equals @data. */
static bool task_work_func_match(struct callback_head *cb, void *data)
{
        return cb->func == data;
}

/**
 * task_work_cancel_func - cancel a pending work that runs a given function
 * @task: the task whose pending works are searched
 * @func: the callback function identifying the work to remove
 *
 * Searches the works queued with task_work_add() for the most recently
 * queued one whose ->func is @func and takes it off the queue.
 *
 * RETURNS:
 * The removed work, or NULL if no pending work uses @func.
 */
struct callback_head *
task_work_cancel_func(struct task_struct *task, task_work_func_t func)
{
        struct callback_head *cancelled;

        cancelled = task_work_cancel_match(task, task_work_func_match, func);

        return cancelled;
}

/* Match helper for task_work_cancel(): true if @cb is the very work passed as @data. */
static bool task_work_match(struct callback_head *cb, void *data)
{
        return cb == data;
}

/**
 * task_work_cancel - cancel one specific pending work
 * @task: the task whose pending works are searched
 * @cb: the work to take off the queue, if it is still there
 *
 * Looks for @cb itself among the works queued with task_work_add() and
 * removes it when found.
 *
 * RETURNS:
 * True if @cb was pending and has been cancelled, false otherwise.
 */
bool task_work_cancel(struct task_struct *task, struct callback_head *cb)
{
        return task_work_cancel_match(task, task_work_match, cb) == cb;
}

/**
 * task_work_run - execute the works added by task_work_add()
 *
 * Flush the pending works. Should be used by the core kernel code.
 * Called before the task returns to the user-mode or stops, or when
 * it exits. In the latter case task_work_add() can no longer add the
 * new work after task_work_run() returns.
 *
 * Operates on current's list only.  Each pass detaches the whole pending
 * list and runs it; passes repeat until the list is found empty.  At that
 * point a task with PF_EXITING set gets the work_exited marker installed
 * as its list head, which makes task_work_add() fail with -ESRCH.
 */
void task_work_run(void)
{
        struct task_struct *task = current;
        struct callback_head *work, *head, *next;

        for (;;) {
                /*
                 * work->func() can do task_work_add(), do not set
                 * work_exited unless the list is empty.
                 */
                work = READ_ONCE(task->task_works);
                do {
                        /* Non-empty list: swap in NULL to take every entry. */
                        head = NULL;
                        if (!work) {
                                if (task->flags & PF_EXITING)
                                        head = &work_exited;
                                else
                                        break;
                        }
                } while (!try_cmpxchg(&task->task_works, &work, head));

                /* Nothing was detached, so there is nothing left to run. */
                if (!work)
                        break;
                /*
                 * Synchronize with task_work_cancel_match(). It can not remove
                 * the first entry == work, cmpxchg(task_works) must fail.
                 * But it can remove another entry from the ->next list.
                 */
                raw_spin_lock_irq(&task->pi_lock);
                raw_spin_unlock_irq(&task->pi_lock);

                do {
                        /* Fetch ->next first; @work is not touched after func(). */
                        next = work->next;
                        work->func(work);
                        work = next;
                        cond_resched();
                } while (work);
        }
}
























































































































































    3 



    3 




















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
// SPDX-License-Identifier: GPL-2.0

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/inetdevice.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>

#include <net/netfilter/nf_nat_masquerade.h>

/* One deferred conntrack cleanup request, allocated by nf_nat_masq_schedule()
 * and freed by iterate_cleanup_work() once the iteration has run.
 */
struct masq_dev_work {
        /* work item, handled by iterate_cleanup_work() */
        struct work_struct work;
        /* namespace to iterate; a reference is held until the work is done */
        struct net *net;
        /* tracker for the @net reference, released with put_net_track() */
        netns_tracker ns_tracker;
        /* address being removed; left zeroed for device-down requests */
        union nf_inet_addr addr;
        /* interface index compared against nat->masq_index in device_cmp() */
        int ifindex;
        /* match callback handed to nf_ct_iterate_cleanup_net() */
        int (*iter)(struct nf_conn *i, void *data);
};

/* Bound on outstanding cleanup work items: nf_nat_masq_schedule() stops
 * queueing once masq_worker_count exceeds it.  The check and the increment
 * are separate steps, so the count can overshoot.
 */
#define MAX_MASQ_WORKER_COUNT        16

/* Held across notifier (un)registration; masq_refcnt is only changed under it */
static DEFINE_MUTEX(masq_mutex);
/* Number of users that currently have the notifiers registered */
static unsigned int masq_refcnt __read_mostly;
/* Work items queued by nf_nat_masq_schedule() that have not finished yet */
static atomic_t masq_worker_count __read_mostly;

unsigned int
nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
                       const struct nf_nat_range2 *range,
                       const struct net_device *out)
{
        struct nf_conn *ct;
        struct nf_conn_nat *nat;
        enum ip_conntrack_info ctinfo;
        struct nf_nat_range2 newrange;
        const struct rtable *rt;
        __be32 newsrc, nh;

        WARN_ON(hooknum != NF_INET_POST_ROUTING);

        ct = nf_ct_get(skb, &ctinfo);

        WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
                         ctinfo == IP_CT_RELATED_REPLY)));

        /* Source address is 0.0.0.0 - locally generated packet that is
         * probably not supposed to be masqueraded.
         */
        if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
                return NF_ACCEPT;

        rt = skb_rtable(skb);
        nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
        newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
        if (!newsrc) {
                pr_info("%s ate my IP address\n", out->name);
                return NF_DROP;
        }

        nat = nf_ct_nat_ext_add(ct);
        if (nat)
                nat->masq_index = out->ifindex;

        /* Transfer from original range. */
        memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
        memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
        newrange.flags       = range->flags | NF_NAT_RANGE_MAP_IPS;
        newrange.min_addr.ip = newsrc;
        newrange.max_addr.ip = newsrc;
        newrange.min_proto   = range->min_proto;
        newrange.max_proto   = range->max_proto;

        /* Hand modified range to generic setup. */
        return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);

static void iterate_cleanup_work(struct work_struct *work)
{
        struct nf_ct_iter_data iter_data = {};
        struct masq_dev_work *w;

        w = container_of(work, struct masq_dev_work, work);

        iter_data.net = w->net;
        iter_data.data = (void *)w;
        nf_ct_iterate_cleanup_net(w->iter, &iter_data);

        put_net_track(w->net, &w->ns_tracker);
        kfree(w);
        atomic_dec(&masq_worker_count);
        module_put(THIS_MODULE);
}

/* Queue a background walk of the conntrack table that removes the entries
 * using the device/address that is going away.
 *
 * This is best effort: when too many work items are already outstanding,
 * or when the namespace, module reference or memory cannot be obtained,
 * nothing is queued and the conntrack entries are left to time out.
 */
static void nf_nat_masq_schedule(struct net *net, union nf_inet_addr *addr,
                                 int ifindex,
                                 int (*iter)(struct nf_conn *i, void *data),
                                 gfp_t gfp_flags)
{
        struct masq_dev_work *w;

        if (atomic_read(&masq_worker_count) > MAX_MASQ_WORKER_COUNT)
                return;

        net = maybe_get_net(net);
        if (!net)
                return;

        if (!try_module_get(THIS_MODULE))
                goto err_module;

        w = kzalloc(sizeof(*w), gfp_flags);
        if (!w)
                goto err_alloc;

        /* We can overshoot MAX_MASQ_WORKER_COUNT, no big deal */
        atomic_inc(&masq_worker_count);

        INIT_WORK(&w->work, iterate_cleanup_work);
        w->ifindex = ifindex;
        w->net = net;
        netns_tracker_alloc(net, &w->ns_tracker, gfp_flags);
        w->iter = iter;
        if (addr)
                w->addr = *addr;

        /* From here on iterate_cleanup_work() owns @w and the references. */
        schedule_work(&w->work);
        return;

 err_alloc:
        module_put(THIS_MODULE);
 err_module:
        put_net(net);
}

/* Iterator callback: non-zero if conntrack @i was masqueraded via the
 * interface recorded in the masq_dev_work passed as @arg.
 */
static int device_cmp(struct nf_conn *i, void *arg)
{
        const struct masq_dev_work *w = arg;
        const struct nf_conn_nat *nat = nfct_nat(i);

        /* Entries without a NAT extension never match. */
        return nat ? nat->masq_index == w->ifindex : 0;
}

/* Netdevice notifier callback; only NETDEV_DOWN is acted upon. */
static int masq_device_event(struct notifier_block *this,
                             unsigned long event,
                             void *ptr)
{
        const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);

        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        /* The device went down: schedule a walk of the whole table that
         * forgets every conntrack associated with it.
         */
        nf_nat_masq_schedule(net, NULL, dev->ifindex, device_cmp, GFP_KERNEL);

        return NOTIFY_DONE;
}

/* Iterator callback: non-zero if @ct was masqueraded via the recorded
 * interface and its reply-direction destination is the recorded address.
 */
static int inet_cmp(struct nf_conn *ct, void *ptr)
{
        struct masq_dev_work *w = ptr;

        /* Wrong device: no need to look at the address. */
        if (device_cmp(ct, ptr) == 0)
                return 0;

        return nf_inet_addr_cmp(&w->addr,
                                &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3);
}

static int masq_inet_event(struct notifier_block *this,
                           unsigned long event,
                           void *ptr)
{
        const struct in_ifaddr *ifa = ptr;
        const struct in_device *idev;
        const struct net_device *dev;
        union nf_inet_addr addr;

        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        /* The masq_dev_notifier will catch the case of the device going
         * down.  So if the inetdev is dead and being destroyed we have
         * no work to do.  Otherwise this is an individual address removal
         * and we have to perform the flush.
         */
        idev = ifa->ifa_dev;
        if (idev->dead)
                return NOTIFY_DONE;

        memset(&addr, 0, sizeof(addr));

        addr.ip = ifa->ifa_address;

        dev = idev->dev;
        nf_nat_masq_schedule(dev_net(idev->dev), &addr, dev->ifindex,
                             inet_cmp, GFP_KERNEL);

        return NOTIFY_DONE;
}

/* Netdevice notifier, dispatches to masq_device_event() */
static struct notifier_block masq_dev_notifier = {
        .notifier_call        = masq_device_event,
};

/* IPv4 address notifier, dispatches to masq_inet_event() */
static struct notifier_block masq_inet_notifier = {
        .notifier_call        = masq_inet_event,
};

#if IS_ENABLED(CONFIG_IPV6)
/* Select an IPv6 source address on @dev for reaching @daddr and store it
 * in @saddr.
 *
 * With CONFIG_IPV6_MODULE the lookup goes through the nf_ipv6_ops
 * indirection and fails with -EHOSTUNREACH when no ops are available;
 * otherwise ipv6_dev_get_saddr() is called directly.  The result of the
 * underlying lookup is returned unchanged; the caller in this file treats
 * a negative value as failure.
 */
static int
nat_ipv6_dev_get_saddr(struct net *net, const struct net_device *dev,
                       const struct in6_addr *daddr, unsigned int srcprefs,
                       struct in6_addr *saddr)
{
#ifdef CONFIG_IPV6_MODULE
        const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();

        if (!v6_ops)
                return -EHOSTUNREACH;

        return v6_ops->dev_get_saddr(net, dev, daddr, srcprefs, saddr);
#else
        return ipv6_dev_get_saddr(net, dev, daddr, srcprefs, saddr);
#endif
}

unsigned int
nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
                       const struct net_device *out)
{
        enum ip_conntrack_info ctinfo;
        struct nf_conn_nat *nat;
        struct in6_addr src;
        struct nf_conn *ct;
        struct nf_nat_range2 newrange;

        ct = nf_ct_get(skb, &ctinfo);
        WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
                         ctinfo == IP_CT_RELATED_REPLY)));

        if (nat_ipv6_dev_get_saddr(nf_ct_net(ct), out,
                                   &ipv6_hdr(skb)->daddr, 0, &src) < 0)
                return NF_DROP;

        nat = nf_ct_nat_ext_add(ct);
        if (nat)
                nat->masq_index = out->ifindex;

        newrange.flags                = range->flags | NF_NAT_RANGE_MAP_IPS;
        newrange.min_addr.in6        = src;
        newrange.max_addr.in6        = src;
        newrange.min_proto        = range->min_proto;
        newrange.max_proto        = range->max_proto;

        return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);

/* atomic notifier; can't call nf_ct_iterate_cleanup_net (it can sleep).
 *
 * Defer it to the system workqueue.
 *
 * As we can have 'a lot' of inet_events (depending on amount of ipv6
 * addresses being deleted), we also need to limit work item queue.
 */
static int masq_inet6_event(struct notifier_block *this,
                            unsigned long event, void *ptr)
{
        struct inet6_ifaddr *ifa = ptr;
        const struct net_device *dev;
        union nf_inet_addr addr;

        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        dev = ifa->idev->dev;

        memset(&addr, 0, sizeof(addr));

        addr.in6 = ifa->addr;

        nf_nat_masq_schedule(dev_net(dev), &addr, dev->ifindex, inet_cmp,
                             GFP_ATOMIC);
        return NOTIFY_DONE;
}

/* IPv6 address notifier, dispatches to masq_inet6_event() */
static struct notifier_block masq_inet6_notifier = {
        .notifier_call        = masq_inet6_event,
};

/* Register masq_inet6_notifier; returns the result of
 * register_inet6addr_notifier().
 */
static int nf_nat_masquerade_ipv6_register_notifier(void)
{
        return register_inet6addr_notifier(&masq_inet6_notifier);
}
#else
static inline int nf_nat_masquerade_ipv6_register_notifier(void) { return 0; }
#endif

/* Take a reference on the masquerade notifiers, registering them for the
 * first user.
 *
 * Returns 0 on success, -EOVERFLOW if the reference count is saturated,
 * or the error of the notifier registration that failed.  A failed first
 * registration is rolled back completely, including the reference.
 */
int nf_nat_masquerade_inet_register_notifiers(void)
{
        int err = 0;

        mutex_lock(&masq_mutex);

        if (WARN_ON_ONCE(masq_refcnt == UINT_MAX)) {
                err = -EOVERFLOW;
                goto unlock;
        }

        /* Only the first user actually registers the notifiers. */
        masq_refcnt++;
        if (masq_refcnt > 1)
                goto unlock;

        /* Device down reports */
        err = register_netdevice_notifier(&masq_dev_notifier);
        if (err)
                goto drop_ref;

        /* IP address change reports */
        err = register_inetaddr_notifier(&masq_inet_notifier);
        if (err)
                goto unreg_netdev;

        err = nf_nat_masquerade_ipv6_register_notifier();
        if (!err)
                goto unlock;

        /* IPv6 registration failed: undo the earlier steps in reverse. */
        unregister_inetaddr_notifier(&masq_inet_notifier);
unreg_netdev:
        unregister_netdevice_notifier(&masq_dev_notifier);
drop_ref:
        masq_refcnt--;
unlock:
        mutex_unlock(&masq_mutex);
        return err;
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_register_notifiers);

/* Drop a reference on the masquerade notifiers and unregister them once
 * the last user is gone.
 */
void nf_nat_masquerade_inet_unregister_notifiers(void)
{
        mutex_lock(&masq_mutex);

        /* Other clients may still depend on the notifiers. */
        masq_refcnt--;
        if (masq_refcnt == 0) {
                unregister_netdevice_notifier(&masq_dev_notifier);
                unregister_inetaddr_notifier(&masq_inet_notifier);
#if IS_ENABLED(CONFIG_IPV6)
                unregister_inet6addr_notifier(&masq_inet6_notifier);
#endif
        }

        mutex_unlock(&masq_mutex);
}
EXPORT_SYMBOL_GPL(nf_nat_masquerade_inet_unregister_notifiers);




















































































































































































































































































































































































































































































































































































































































































































    3 



    3 





















































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NetLabel Unlabeled Support
 *
 * This file defines functions for dealing with unlabeled packets for the
 * NetLabel system.  The NetLabel system manages static and dynamic label
 * mappings for network protocols such as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 - 2008
 */

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/socket.h>
#include <linux/string.h>
#include <linux/skbuff.h>
#include <linux/audit.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/genetlink.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/net_namespace.h>
#include <net/netlabel.h>
#include <asm/bug.h>
#include <linux/atomic.h>

#include "netlabel_user.h"
#include "netlabel_addrlist.h"
#include "netlabel_domainhash.h"
#include "netlabel_unlabeled.h"
#include "netlabel_mgmt.h"

/* NOTE: at present we always use init's network namespace since we don't
 *       presently support different namespaces even though the majority of
 *       the functions in this file are "namespace safe" */

/* The unlabeled connection hash table which we use to map network interfaces
 * and addresses of unlabeled packets to a user specified secid value for the
 * LSM.  The hash table is used to lookup the network interface entry
 * (struct netlbl_unlhsh_iface) and then the interface entry is used to
 * lookup an IP address match from an ordered list.  If a network interface
 * match can not be found in the hash table then the default entry
 * (netlbl_unlhsh_def) is used.  The IP address entry list
 * (struct netlbl_unlhsh_addr) is ordered such that the entries with a
 * larger netmask come first.
 */
/* The hash table itself: @tbl is an array of bucket list heads indexed by
 * the value netlbl_unlhsh_hash() returns, @size is the number of buckets.
 * NOTE(review): the hash masks with (size - 1), so @size is presumably
 * always a power of two -- confirm against the table setup code.
 */
struct netlbl_unlhsh_tbl {
        struct list_head *tbl;
        u32 size;
};
/* Map a struct netlbl_af4list pointer back to the IPv4 address entry that
 * embeds it as ->list. */
#define netlbl_unlhsh_addr4_entry(iter) \
        container_of(iter, struct netlbl_unlhsh_addr4, list)
/* IPv4 address entry kept on an interface's addr4_list */
struct netlbl_unlhsh_addr4 {
        /* secid value given to the LSM for matching packets */
        u32 secid;

        /* address list node linking the entry into addr4_list */
        struct netlbl_af4list list;
        /* RCU head, presumably for deferred freeing -- not used in the
         * code visible here */
        struct rcu_head rcu;
};
/* Map a struct netlbl_af6list pointer back to the IPv6 address entry that
 * embeds it as ->list. */
#define netlbl_unlhsh_addr6_entry(iter) \
        container_of(iter, struct netlbl_unlhsh_addr6, list)
/* IPv6 address entry kept on an interface's addr6_list */
struct netlbl_unlhsh_addr6 {
        /* secid value given to the LSM for matching packets */
        u32 secid;

        /* address list node linking the entry into addr6_list */
        struct netlbl_af6list list;
        /* RCU head, presumably for deferred freeing -- not used in the
         * code visible here */
        struct rcu_head rcu;
};
/* Per-interface entry of the unlabeled connection hash table */
struct netlbl_unlhsh_iface {
        /* interface index the entry is looked up by */
        int ifindex;
        /* IPv4 address entries (struct netlbl_unlhsh_addr4) */
        struct list_head addr4_list;
        /* IPv6 address entries (struct netlbl_unlhsh_addr6) */
        struct list_head addr6_list;

        /* NOTE(review): not used in the code visible here; presumably marks
         * whether the entry is live -- confirm against the add/remove paths */
        u32 valid;
        /* linkage into the hash bucket list */
        struct list_head list;
        /* RCU head passed to netlbl_unlhsh_free_iface() */
        struct rcu_head rcu;
};

/* Argument struct for netlbl_unlhsh_walk() */
/* NOTE(review): the walk function is not visible here; the field roles
 * below are inferred from their types and names only -- confirm. */
struct netlbl_unlhsh_walk_arg {
        /* netlink callback state of the dump in progress (presumably) */
        struct netlink_callback *nl_cb;
        /* message buffer being filled (presumably) */
        struct sk_buff *skb;
        /* netlink sequence number for the generated messages (presumably) */
        u32 seq;
};

/* Unlabeled connection hash table */
/* updates should be so rare that having one spinlock for the entire
 * hash table should be okay */
static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
#define netlbl_unlhsh_rcu_deref(p) \
        rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
static struct netlbl_unlhsh_tbl __rcu *netlbl_unlhsh;
static struct netlbl_unlhsh_iface __rcu *netlbl_unlhsh_def;

/* Accept unlabeled packets flag */
static u8 netlabel_unlabel_acceptflg;

/* NetLabel Generic NETLINK unlabeled family */
static struct genl_family netlbl_unlabel_gnl_family;

/* NetLabel Netlink attribute policy */
static const struct nla_policy netlbl_unlabel_genl_policy[NLBL_UNLABEL_A_MAX + 1] = {
        [NLBL_UNLABEL_A_ACPTFLG] = { .type = NLA_U8 },
        [NLBL_UNLABEL_A_IPV6ADDR] = { .type = NLA_BINARY,
                                      .len = sizeof(struct in6_addr) },
        [NLBL_UNLABEL_A_IPV6MASK] = { .type = NLA_BINARY,
                                      .len = sizeof(struct in6_addr) },
        [NLBL_UNLABEL_A_IPV4ADDR] = { .type = NLA_BINARY,
                                      .len = sizeof(struct in_addr) },
        [NLBL_UNLABEL_A_IPV4MASK] = { .type = NLA_BINARY,
                                      .len = sizeof(struct in_addr) },
        [NLBL_UNLABEL_A_IFACE] = { .type = NLA_NUL_STRING,
                                   .len = IFNAMSIZ - 1 },
        [NLBL_UNLABEL_A_SECCTX] = { .type = NLA_BINARY }
};

/*
 * Unlabeled Connection Hash Table Functions
 */

/**
 * netlbl_unlhsh_free_iface - Frees an interface entry from the hash table
 * @entry: the entry's RCU field
 *
 * Description:
 * This function is designed to be used as a callback to the call_rcu()
 * function so that memory allocated to a hash table interface entry can be
 * released safely.  Any entries still present on the interface entry's IPv4
 * and IPv6 address lists are removed and freed here as well, before the
 * interface entry itself is freed.
 *
 */
static void netlbl_unlhsh_free_iface(struct rcu_head *entry)
{
        struct netlbl_unlhsh_iface *iface;
        struct netlbl_af4list *iter4;
        struct netlbl_af4list *tmp4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *iter6;
        struct netlbl_af6list *tmp6;
#endif /* IPv6 */

        /* @entry is the rcu member embedded in the interface entry */
        iface = container_of(entry, struct netlbl_unlhsh_iface, rcu);

        /* no need for locks here since we are the only one with access to this
         * structure */

        /* the _safe iterators allow freeing the current element */
        netlbl_af4list_foreach_safe(iter4, tmp4, &iface->addr4_list) {
                netlbl_af4list_remove_entry(iter4);
                kfree(netlbl_unlhsh_addr4_entry(iter4));
        }
#if IS_ENABLED(CONFIG_IPV6)
        netlbl_af6list_foreach_safe(iter6, tmp6, &iface->addr6_list) {
                netlbl_af6list_remove_entry(iter6);
                kfree(netlbl_unlhsh_addr6_entry(iter6));
        }
#endif /* IPv6 */
        kfree(iface);
}

/**
 * netlbl_unlhsh_hash - Hashing function for the hash table
 * @ifindex: the network interface/device to hash
 *
 * Description:
 * This is the hashing function for the unlabeled hash table, it returns the
 * bucket number for the given device/interface.  The caller is responsible for
 * ensuring that the hash table is protected with either a RCU read lock or
 * the hash table lock.
 *
 */
static u32 netlbl_unlhsh_hash(int ifindex)
{
        return ifindex & (netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->size - 1);
}

/**
 * netlbl_unlhsh_search_iface - Search for a matching interface entry
 * @ifindex: the network interface
 *
 * Description:
 * Searches the unlabeled connection hash table and returns a pointer to the
 * interface entry which matches @ifindex, otherwise NULL is returned.  The
 * caller is responsible for ensuring that the hash table is protected with
 * either a RCU read lock or the hash table lock.
 *
 */
static struct netlbl_unlhsh_iface *netlbl_unlhsh_search_iface(int ifindex)
{
        u32 bkt;
        struct list_head *bkt_list;
        struct netlbl_unlhsh_iface *iter;

        bkt = netlbl_unlhsh_hash(ifindex);
        bkt_list = &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bkt];
        list_for_each_entry_rcu(iter, bkt_list, list,
                                lockdep_is_held(&netlbl_unlhsh_lock))
                if (iter->valid && iter->ifindex == ifindex)
                        return iter;

        return NULL;
}

/**
 * netlbl_unlhsh_add_addr4 - Add a new IPv4 address entry to the hash table
 * @iface: the associated interface entry
 * @addr: IPv4 address in network byte order
 * @mask: IPv4 address mask in network byte order
 * @secid: LSM secid value for entry
 *
 * Description:
 * Add a new address entry into the unlabeled connection hash table using the
 * interface entry specified by @iface.  On success zero is returned, otherwise
 * a negative value is returned.
 *
 */
static int netlbl_unlhsh_add_addr4(struct netlbl_unlhsh_iface *iface,
                                   const struct in_addr *addr,
                                   const struct in_addr *mask,
                                   u32 secid)
{
        struct netlbl_unlhsh_addr4 *new_ent;
        int rc;

        new_ent = kzalloc(sizeof(*new_ent), GFP_ATOMIC);
        if (!new_ent)
                return -ENOMEM;

        /* Store the network portion only, so host bits never take part in
         * later comparisons. */
        new_ent->secid = secid;
        new_ent->list.valid = 1;
        new_ent->list.mask = mask->s_addr;
        new_ent->list.addr = addr->s_addr & mask->s_addr;

        /* The address list is modified under the hash table lock. */
        spin_lock(&netlbl_unlhsh_lock);
        rc = netlbl_af4list_add(&new_ent->list, &iface->addr4_list);
        spin_unlock(&netlbl_unlhsh_lock);
        if (rc == 0)
                return 0;

        /* The entry was never linked, so it can be freed directly. */
        kfree(new_ent);
        return rc;
}

#if IS_ENABLED(CONFIG_IPV6)
/**
 * netlbl_unlhsh_add_addr6 - Add a new IPv6 address entry to the hash table
 * @iface: the associated interface entry
 * @addr: IPv6 address in network byte order
 * @mask: IPv6 address mask in network byte order
 * @secid: LSM secid value for entry
 *
 * Description:
 * Add a new address entry into the unlabeled connection hash table using the
 * interface entry specified by @iface.  On success zero is returned, otherwise
 * a negative value is returned.
 *
 */
static int netlbl_unlhsh_add_addr6(struct netlbl_unlhsh_iface *iface,
                                   const struct in6_addr *addr,
                                   const struct in6_addr *mask,
                                   u32 secid)
{
        int ret_val;
        struct netlbl_unlhsh_addr6 *entry;

        entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
        if (entry == NULL)
                return -ENOMEM;

        /* Keep only the network portion of the address: every 32-bit word is
         * masked before the entry is stored. */
        entry->list.addr = *addr;
        entry->list.addr.s6_addr32[0] &= mask->s6_addr32[0];
        entry->list.addr.s6_addr32[1] &= mask->s6_addr32[1];
        entry->list.addr.s6_addr32[2] &= mask->s6_addr32[2];
        entry->list.addr.s6_addr32[3] &= mask->s6_addr32[3];
        entry->list.mask = *mask;
        entry->list.valid = 1;
        entry->secid = secid;

        /* The address list is modified under the hash table lock. */
        spin_lock(&netlbl_unlhsh_lock);
        ret_val = netlbl_af6list_add(&entry->list, &iface->addr6_list);
        spin_unlock(&netlbl_unlhsh_lock);

        /* On failure the entry was never linked, so free it and propagate
         * the error; reporting success here would make the caller count and
         * audit an entry that does not exist. */
        if (ret_val != 0)
                kfree(entry);
        return ret_val;
}
#endif /* IPv6 */

/**
 * netlbl_unlhsh_add_iface - Adds a new interface entry to the hash table
 * @ifindex: network interface
 *
 * Description:
 * Add a new, empty, interface entry into the unlabeled connection hash table.
 * On success a pointer to the new interface entry is returned, on failure NULL
 * is returned.
 *
 */
static struct netlbl_unlhsh_iface *netlbl_unlhsh_add_iface(int ifindex)
{
        struct netlbl_unlhsh_iface *new_iface;
        u32 bucket;

        new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
        if (!new_iface)
                return NULL;

        new_iface->ifindex = ifindex;
        INIT_LIST_HEAD(&new_iface->addr4_list);
        INIT_LIST_HEAD(&new_iface->addr6_list);
        new_iface->valid = 1;

        spin_lock(&netlbl_unlhsh_lock);
        if (ifindex <= 0) {
                /* No real device: this becomes the default entry, unless one
                 * is already installed. */
                INIT_LIST_HEAD(&new_iface->list);
                if (netlbl_unlhsh_rcu_deref(netlbl_unlhsh_def) != NULL)
                        goto already_present;
                rcu_assign_pointer(netlbl_unlhsh_def, new_iface);
        } else {
                /* Real device: link into its hash bucket, refusing to add a
                 * second entry for the same interface. */
                bucket = netlbl_unlhsh_hash(ifindex);
                if (netlbl_unlhsh_search_iface(ifindex) != NULL)
                        goto already_present;
                list_add_tail_rcu(&new_iface->list,
                             &netlbl_unlhsh_rcu_deref(netlbl_unlhsh)->tbl[bucket]);
        }
        spin_unlock(&netlbl_unlhsh_lock);

        return new_iface;

already_present:
        /* The new entry was never published, so a plain kfree() is enough. */
        spin_unlock(&netlbl_unlhsh_lock);
        kfree(new_iface);
        return NULL;
}

/**
 * netlbl_unlhsh_add - Adds a new entry to the unlabeled connection hash table
 * @net: network namespace
 * @dev_name: interface name
 * @addr: IP address in network byte order
 * @mask: address mask in network byte order
 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
 * @secid: LSM secid value for the entry
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Adds a new entry to the unlabeled connection hash table.  Returns zero on
 * success, negative values on failure.
 *
 */
int netlbl_unlhsh_add(struct net *net,
                      const char *dev_name,
                      const void *addr,
                      const void *mask,
                      u32 addr_len,
                      u32 secid,
                      struct netlbl_audit *audit_info)
{
        struct audit_buffer *abuf = NULL;
        struct netlbl_unlhsh_iface *iface;
        struct net_device *ndev;
        struct lsm_context secctx;
        int ifidx;
        int rc;

        /* Only IPv4 and IPv6 sized addresses are understood. */
        if (!(addr_len == sizeof(struct in_addr) ||
              addr_len == sizeof(struct in6_addr)))
                return -EINVAL;

        rcu_read_lock();

        /* Locate the interface entry: the default one when no device name is
         * given, otherwise the one hashed under the device's ifindex. */
        if (dev_name == NULL) {
                ifidx = 0;
                iface = rcu_dereference(netlbl_unlhsh_def);
        } else {
                ndev = dev_get_by_name_rcu(net, dev_name);
                if (ndev == NULL) {
                        rc = -ENODEV;
                        goto out_unlock;
                }
                ifidx = ndev->ifindex;
                iface = netlbl_unlhsh_search_iface(ifidx);
        }

        /* Create the interface entry on first use. */
        if (iface == NULL) {
                iface = netlbl_unlhsh_add_iface(ifidx);
                if (iface == NULL) {
                        rc = -ENOMEM;
                        goto out_unlock;
                }
        }

        abuf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCADD, audit_info);

        /* Dispatch on the address family implied by the address length. */
        switch (addr_len) {
        case sizeof(struct in_addr): {
                const struct in_addr *a4 = addr;
                const struct in_addr *m4 = mask;

                rc = netlbl_unlhsh_add_addr4(iface, a4, m4, secid);
                if (abuf != NULL)
                        netlbl_af4list_audit_addr(abuf, 1, dev_name,
                                                  a4->s_addr, m4->s_addr);
                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case sizeof(struct in6_addr): {
                const struct in6_addr *a6 = addr;
                const struct in6_addr *m6 = mask;

                rc = netlbl_unlhsh_add_addr6(iface, a6, m6, secid);
                if (abuf != NULL)
                        netlbl_af6list_audit_addr(abuf, 1, dev_name, a6, m6);
                break;
        }
#endif /* IPv6 */
        default:
                rc = -EINVAL;
        }

        /* Count the new entry only when it was really added. */
        if (rc == 0)
                atomic_inc(&netlabel_mgmt_protocount);

out_unlock:
        rcu_read_unlock();

        /* Complete the audit record (if one was started) with the security
         * context and the outcome of the operation. */
        if (abuf != NULL) {
                if (security_secid_to_secctx(secid, &secctx) >= 0) {
                        audit_log_format(abuf, " sec_obj=%s", secctx.context);
                        security_release_secctx(&secctx);
                }
                audit_log_format(abuf, " res=%u", rc == 0 ? 1 : 0);
                audit_log_end(abuf);
        }
        return rc;
}

/**
 * netlbl_unlhsh_remove_addr4 - Remove an IPv4 address entry
 * @net: network namespace
 * @iface: interface entry
 * @addr: IP address
 * @mask: IP address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Remove an IP address entry from the unlabeled connection hash table.
 * Returns zero on success, negative values on failure.
 *
 */
static int netlbl_unlhsh_remove_addr4(struct net *net,
                                      struct netlbl_unlhsh_iface *iface,
                                      const struct in_addr *addr,
                                      const struct in_addr *mask,
                                      struct netlbl_audit *audit_info)
{
        struct netlbl_af4list *list_entry;
        struct netlbl_unlhsh_addr4 *entry;
        struct audit_buffer *audit_buf;
        struct net_device *dev;
        struct lsm_context ctx;

        spin_lock(&netlbl_unlhsh_lock);
        list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
                                           &iface->addr4_list);
        spin_unlock(&netlbl_unlhsh_lock);
        if (list_entry != NULL)
                entry = netlbl_unlhsh_addr4_entry(list_entry);
        else
                entry = NULL;

        audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
                                              audit_info);
        if (audit_buf != NULL) {
                dev = dev_get_by_index(net, iface->ifindex);
                netlbl_af4list_audit_addr(audit_buf, 1,
                                          (dev != NULL ? dev->name : NULL),
                                          addr->s_addr, mask->s_addr);
                dev_put(dev);
                if (entry != NULL &&
                    security_secid_to_secctx(entry->secid, &ctx) >= 0) {
                        audit_log_format(audit_buf, " sec_obj=%s", ctx.context);
                        security_release_secctx(&ctx);
                }
                audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
                audit_log_end(audit_buf);
        }

        if (entry == NULL)
                return -ENOENT;

        kfree_rcu(entry, rcu);
        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)
/**
 * netlbl_unlhsh_remove_addr6 - Remove an IPv6 address entry
 * @net: network namespace
 * @iface: interface entry
 * @addr: IP address
 * @mask: IP address mask
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Remove an IP address entry from the unlabeled connection hash table.
 * Returns zero on success, negative values on failure.
 *
 */
static int netlbl_unlhsh_remove_addr6(struct net *net,
                                      struct netlbl_unlhsh_iface *iface,
                                      const struct in6_addr *addr,
                                      const struct in6_addr *mask,
                                      struct netlbl_audit *audit_info)
{
        struct netlbl_af6list *list_entry;
        struct netlbl_unlhsh_addr6 *entry;
        struct audit_buffer *audit_buf;
        struct net_device *dev;
        struct lsm_context ctx;

        spin_lock(&netlbl_unlhsh_lock);
        list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
        spin_unlock(&netlbl_unlhsh_lock);
        if (list_entry != NULL)
                entry = netlbl_unlhsh_addr6_entry(list_entry);
        else
                entry = NULL;

        audit_buf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_STCDEL,
                                              audit_info);
        if (audit_buf != NULL) {
                dev = dev_get_by_index(net, iface->ifindex);
                netlbl_af6list_audit_addr(audit_buf, 1,
                                          (dev != NULL ? dev->name : NULL),
                                          addr, mask);
                dev_put(dev);
                if (entry != NULL &&
                    security_secid_to_secctx(entry->secid, &ctx) >= 0) {
                        audit_log_format(audit_buf, " sec_obj=%s", ctx.context);
                        security_release_secctx(&ctx);
                }
                audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
                audit_log_end(audit_buf);
        }

        if (entry == NULL)
                return -ENOENT;

        kfree_rcu(entry, rcu);
        return 0;
}
#endif /* IPv6 */

/**
 * netlbl_unlhsh_condremove_iface - Remove an interface entry
 * @iface: the interface entry
 *
 * Description:
 * Remove an interface entry from the unlabeled connection hash table if it is
 * empty.  An interface entry is considered to be empty if there are no
 * address entries assigned to it.
 *
 */
static void netlbl_unlhsh_condremove_iface(struct netlbl_unlhsh_iface *iface)
{
        struct netlbl_af4list *iter4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *iter6;
#endif /* IPv6 */

        spin_lock(&netlbl_unlhsh_lock);
        netlbl_af4list_foreach_rcu(iter4, &iface->addr4_list)
                goto unlhsh_condremove_failure;
#if IS_ENABLED(CONFIG_IPV6)
        netlbl_af6list_foreach_rcu(iter6, &iface->addr6_list)
                goto unlhsh_condremove_failure;
#endif /* IPv6 */
        iface->valid = 0;
        if (iface->ifindex > 0)
                list_del_rcu(&iface->list);
        else
                RCU_INIT_POINTER(netlbl_unlhsh_def, NULL);
        spin_unlock(&netlbl_unlhsh_lock);

        call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);
        return;

unlhsh_condremove_failure:
        spin_unlock(&netlbl_unlhsh_lock);
}

/**
 * netlbl_unlhsh_remove - Remove an entry from the unlabeled hash table
 * @net: network namespace
 * @dev_name: interface name
 * @addr: IP address in network byte order
 * @mask: address mask in network byte order
 * @addr_len: length of address/mask (4 for IPv4, 16 for IPv6)
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Removes an existing entry from the unlabeled connection hash table.
 * Returns zero on success, negative values on failure.
 *
 */
int netlbl_unlhsh_remove(struct net *net,
                         const char *dev_name,
                         const void *addr,
                         const void *mask,
                         u32 addr_len,
                         struct netlbl_audit *audit_info)
{
        struct netlbl_unlhsh_iface *iface;
        struct net_device *ndev;
        int rc;

        /* Only IPv4 and IPv6 sized addresses are understood. */
        if (!(addr_len == sizeof(struct in_addr) ||
              addr_len == sizeof(struct in6_addr)))
                return -EINVAL;

        rcu_read_lock();

        /* Locate the interface entry: the default one when no device name is
         * given, otherwise the one hashed under the device's ifindex. */
        if (dev_name == NULL) {
                iface = rcu_dereference(netlbl_unlhsh_def);
        } else {
                ndev = dev_get_by_name_rcu(net, dev_name);
                if (ndev == NULL) {
                        rc = -ENODEV;
                        goto out_unlock;
                }
                iface = netlbl_unlhsh_search_iface(ndev->ifindex);
        }
        if (iface == NULL) {
                rc = -ENOENT;
                goto out_unlock;
        }

        /* Dispatch on the address family implied by the address length. */
        switch (addr_len) {
        case sizeof(struct in_addr):
                rc = netlbl_unlhsh_remove_addr4(net, iface, addr, mask,
                                                audit_info);
                break;
#if IS_ENABLED(CONFIG_IPV6)
        case sizeof(struct in6_addr):
                rc = netlbl_unlhsh_remove_addr6(net, iface, addr, mask,
                                                audit_info);
                break;
#endif /* IPv6 */
        default:
                rc = -EINVAL;
        }

        /* After a successful removal, drop the interface entry too if that
         * was its last address, and adjust the entry count. */
        if (rc == 0) {
                netlbl_unlhsh_condremove_iface(iface);
                atomic_dec(&netlabel_mgmt_protocount);
        }

out_unlock:
        rcu_read_unlock();
        return rc;
}

/*
 * General Helper Functions
 */

/**
 * netlbl_unlhsh_netdev_handler - Network device notification handler
 * @this: notifier block
 * @event: the event
 * @ptr: the netdevice notifier info (cast to void)
 *
 * Description:
 * Handle network device events, although at present all we care about is a
 * network device going away.  In the case of a device going away we clear any
 * related entries from the unlabeled connection hash table.
 *
 */
static int netlbl_unlhsh_netdev_handler(struct notifier_block *this,
                                        unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct netlbl_unlhsh_iface *iface;

        /* Only devices in the initial network namespace are tracked. */
        if (!net_eq(dev_net(dev), &init_net))
                return NOTIFY_DONE;

        /* XXX - should this be a check for NETDEV_DOWN or _UNREGISTER? */
        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        /* Unlink the device's interface entry, if it has a live one. */
        spin_lock(&netlbl_unlhsh_lock);
        iface = netlbl_unlhsh_search_iface(dev->ifindex);
        if (iface != NULL && iface->valid) {
                iface->valid = 0;
                list_del_rcu(&iface->list);
        } else {
                iface = NULL;
        }
        spin_unlock(&netlbl_unlhsh_lock);

        /* Free it, together with its address entries, after a grace period. */
        if (iface != NULL)
                call_rcu(&iface->rcu, netlbl_unlhsh_free_iface);

        return NOTIFY_DONE;
}

/**
 * netlbl_unlabel_acceptflg_set - Set the unlabeled accept flag
 * @value: desired value
 * @audit_info: NetLabel audit information
 *
 * Description:
 * Set the value of the unlabeled accept flag to @value.
 *
 */
static void netlbl_unlabel_acceptflg_set(u8 value,
                                         struct netlbl_audit *audit_info)
{
        const u8 previous = netlabel_unlabel_acceptflg;
        struct audit_buffer *abuf;

        /* Update the flag first; the audit record then reports both the new
         * and the previous value. */
        netlabel_unlabel_acceptflg = value;

        abuf = netlbl_audit_start_common(AUDIT_MAC_UNLBL_ALLOW, audit_info);
        if (abuf == NULL)
                return;

        audit_log_format(abuf, " unlbl_accept=%u old=%u", value, previous);
        audit_log_end(abuf);
}

/**
 * netlbl_unlabel_addrinfo_get - Get the IPv4/6 address information
 * @info: the Generic NETLINK info block
 * @addr: the IP address
 * @mask: the IP address mask
 * @len: the address length
 *
 * Description:
 * Examine the Generic NETLINK message and extract the IP address information.
 * Returns zero on success, negative values on failure.
 *
 */
static int netlbl_unlabel_addrinfo_get(struct genl_info *info,
                                       void **addr,
                                       void **mask,
                                       u32 *len)
{
        u32 addr_len;

        /* An address family is only usable when both its address and its
         * mask attribute are present.  The attribute policy only bounds the
         * payload length, so the exact size is enforced here: the address
         * must be exactly one address structure long and the mask must have
         * the same length.  Either mismatch is an error, otherwise a short
         * mask would be read past its end by the caller. */
        if (info->attrs[NLBL_UNLABEL_A_IPV4ADDR] &&
            info->attrs[NLBL_UNLABEL_A_IPV4MASK]) {
                addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
                if (addr_len != sizeof(struct in_addr) ||
                    addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV4MASK]))
                        return -EINVAL;
                *len = addr_len;
                *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4ADDR]);
                *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV4MASK]);
                return 0;
        } else if (info->attrs[NLBL_UNLABEL_A_IPV6ADDR] &&
                   info->attrs[NLBL_UNLABEL_A_IPV6MASK]) {
                /* The mask attribute is checked for presence as well so this
                 * helper never dereferences a missing attribute, regardless
                 * of what the caller validated beforehand. */
                addr_len = nla_len(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
                if (addr_len != sizeof(struct in6_addr) ||
                    addr_len != nla_len(info->attrs[NLBL_UNLABEL_A_IPV6MASK]))
                        return -EINVAL;
                *len = addr_len;
                *addr = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6ADDR]);
                *mask = nla_data(info->attrs[NLBL_UNLABEL_A_IPV6MASK]);
                return 0;
        }

        /* No complete address/mask pair was supplied. */
        return -EINVAL;
}

/*
 * NetLabel Command Handlers
 */

/**
 * netlbl_unlabel_accept - Handle an ACCEPT message
 * @skb: the NETLINK buffer
 * @info: the Generic NETLINK info block
 *
 * Description:
 * Process a user generated ACCEPT message and set the accept flag accordingly.
 * Returns zero on success, negative values on failure.
 *
 */
static int netlbl_unlabel_accept(struct sk_buff *skb, struct genl_info *info)
{
        struct netlbl_audit audit_info;
        u8 flag;

        /* The accept flag attribute is mandatory. */
        if (!info->attrs[NLBL_UNLABEL_A_ACPTFLG])
                return -EINVAL;

        /* Only the two boolean values are acceptable. */
        flag = nla_get_u8(info->attrs[NLBL_UNLABEL_A_ACPTFLG]);
        if (flag != 1 && flag != 0)
                return -EINVAL;

        netlbl_netlink_auditinfo(&audit_info);
        netlbl_unlabel_acceptflg_set(flag, &audit_info);
        return 0;
}

/**
 * netlbl_unlabel_list - Handle a LIST message
 * @skb: the NETLINK buffer
 * @info: the Generic NETLINK info block
 *
 * Description:
 * Process a user generated LIST message and respond with the current status.
 * Returns zero on success, negative values on failure.
 *
 */
static int netlbl_unlabel_list(struct sk_buff *skb, struct genl_info *info)
{
        int ret_val = -ENOMEM;
        struct sk_buff *ans_skb;
        void *data;

        /* Allocate the reply buffer; running out of memory is reported as
         * -ENOMEM rather than as an invalid request. */
        ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (ans_skb == NULL)
                return ret_val;

        /* Start a LIST reply addressed to the requester. */
        data = genlmsg_put_reply(ans_skb, info, &netlbl_unlabel_gnl_family,
                                 0, NLBL_UNLABEL_C_LIST);
        if (data == NULL)
                goto list_failure;

        /* The only payload is the current accept flag. */
        ret_val = nla_put_u8(ans_skb,
                             NLBL_UNLABEL_A_ACPTFLG,
                             netlabel_unlabel_acceptflg);
        if (ret_val != 0)
                goto list_failure;

        genlmsg_end(ans_skb, data);
        return genlmsg_reply(ans_skb, info);

list_failure:
        /* The reply was never sent, so the buffer is still ours to free. */
        kfree_skb(ans_skb);
        return ret_val;
}

/**
 * netlbl_unlabel_staticadd - Handle a STATICADD message
 * @skb: the NETLINK buffer
 * @info: the Generic NETLINK info block
 *
 * Description:
 * Process a user generated STATICADD message and add a new unlabeled
 * connection entry to the hash table.  Returns zero on success, negative
 * values on failure.
 *
 */
static int netlbl_unlabel_staticadd(struct sk_buff *skb,
                                    struct genl_info *info)
{
        struct netlbl_audit audit_info;
        int v4_incomplete;
        int v6_incomplete;
        char *dev_name;
        void *addr;
        void *mask;
        u32 addr_len;
        u32 secid;
        int ret_val;

        /* A security context and an interface name are both required. */
        if (!info->attrs[NLBL_UNLABEL_A_SECCTX] ||
            !info->attrs[NLBL_UNLABEL_A_IFACE])
                return -EINVAL;

        /* Don't allow users to add both IPv4 and IPv6 addresses for a
         * single entry.  However, allow users to create two entries, one each
         * for IPv4 and IPv6, with the same LSM security context which should
         * achieve the same result.  Exactly one of the two address/mask
         * pairs must therefore be complete. */
        v4_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV4MASK];
        v6_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV6MASK];
        if (v4_incomplete == v6_incomplete)
                return -EINVAL;

        netlbl_netlink_auditinfo(&audit_info);

        ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
        if (ret_val != 0)
                return ret_val;

        dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);

        /* Translate the supplied security context into a secid. */
        ret_val = security_secctx_to_secid(
                                  nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
                                  nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
                                  &secid);
        if (ret_val != 0)
                return ret_val;

        return netlbl_unlhsh_add(&init_net, dev_name, addr, mask, addr_len,
                                 secid, &audit_info);
}

/**
 * netlbl_unlabel_staticadddef - Handle a STATICADDDEF message
 * @skb: the NETLINK buffer
 * @info: the Generic NETLINK info block
 *
 * Description:
 * Process a user generated STATICADDDEF message and add a new default
 * unlabeled connection entry.  Returns zero on success, negative values on
 * failure.
 *
 */
static int netlbl_unlabel_staticadddef(struct sk_buff *skb,
                                       struct genl_info *info)
{
        struct netlbl_audit audit_info;
        int v4_incomplete;
        int v6_incomplete;
        void *addr;
        void *mask;
        u32 addr_len;
        u32 secid;
        int ret_val;

        /* A security context is required. */
        if (!info->attrs[NLBL_UNLABEL_A_SECCTX])
                return -EINVAL;

        /* Don't allow users to add both IPv4 and IPv6 addresses for a
         * single entry.  However, allow users to create two entries, one each
         * for IPv4 and IPv6, with the same LSM security context which should
         * achieve the same result.  Exactly one of the two address/mask
         * pairs must therefore be complete. */
        v4_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV4MASK];
        v6_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV6MASK];
        if (v4_incomplete == v6_incomplete)
                return -EINVAL;

        netlbl_netlink_auditinfo(&audit_info);

        ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
        if (ret_val != 0)
                return ret_val;

        /* Translate the supplied security context into a secid. */
        ret_val = security_secctx_to_secid(
                                  nla_data(info->attrs[NLBL_UNLABEL_A_SECCTX]),
                                  nla_len(info->attrs[NLBL_UNLABEL_A_SECCTX]),
                                  &secid);
        if (ret_val != 0)
                return ret_val;

        /* A NULL device name selects the default interface entry. */
        return netlbl_unlhsh_add(&init_net, NULL, addr, mask, addr_len,
                                 secid, &audit_info);
}

/**
 * netlbl_unlabel_staticremove - Handle a STATICREMOVE message
 * @skb: the NETLINK buffer
 * @info: the Generic NETLINK info block
 *
 * Description:
 * Process a user generated STATICREMOVE message and remove the specified
 * unlabeled connection entry.  Returns zero on success, negative values on
 * failure.
 *
 */
static int netlbl_unlabel_staticremove(struct sk_buff *skb,
                                       struct genl_info *info)
{
        struct netlbl_audit audit_info;
        int v4_incomplete;
        int v6_incomplete;
        char *dev_name;
        void *addr;
        void *mask;
        u32 addr_len;
        int ret_val;

        /* An interface name is required. */
        if (!info->attrs[NLBL_UNLABEL_A_IFACE])
                return -EINVAL;

        /* See the note in netlbl_unlabel_staticadd() about not allowing both
         * IPv4 and IPv6 in the same entry: exactly one of the two
         * address/mask pairs must be complete. */
        v4_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV4MASK];
        v6_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV6MASK];
        if (v4_incomplete == v6_incomplete)
                return -EINVAL;

        netlbl_netlink_auditinfo(&audit_info);

        ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
        if (ret_val != 0)
                return ret_val;

        dev_name = nla_data(info->attrs[NLBL_UNLABEL_A_IFACE]);

        return netlbl_unlhsh_remove(&init_net, dev_name, addr, mask, addr_len,
                                    &audit_info);
}

/**
 * netlbl_unlabel_staticremovedef - Handle a STATICREMOVEDEF message
 * @skb: the NETLINK buffer
 * @info: the Generic NETLINK info block
 *
 * Description:
 * Process a user generated STATICREMOVEDEF message and remove the default
 * unlabeled connection entry.  Returns zero on success, negative values on
 * failure.
 *
 */
static int netlbl_unlabel_staticremovedef(struct sk_buff *skb,
                                          struct genl_info *info)
{
        struct netlbl_audit audit_info;
        int v4_incomplete;
        int v6_incomplete;
        void *addr;
        void *mask;
        u32 addr_len;
        int ret_val;

        /* See the note in netlbl_unlabel_staticadd() about not allowing both
         * IPv4 and IPv6 in the same entry: exactly one of the two
         * address/mask pairs must be complete. */
        v4_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV4ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV4MASK];
        v6_incomplete = !info->attrs[NLBL_UNLABEL_A_IPV6ADDR] ||
                        !info->attrs[NLBL_UNLABEL_A_IPV6MASK];
        if (v4_incomplete == v6_incomplete)
                return -EINVAL;

        netlbl_netlink_auditinfo(&audit_info);

        ret_val = netlbl_unlabel_addrinfo_get(info, &addr, &mask, &addr_len);
        if (ret_val != 0)
                return ret_val;

        /* A NULL device name selects the default interface entry. */
        return netlbl_unlhsh_remove(&init_net, NULL, addr, mask, addr_len,
                                    &audit_info);
}


/**
 * netlbl_unlabel_staticlist_gen - Generate messages for STATICLIST[DEF]
 * @cmd: command/message
 * @iface: the interface entry
 * @addr4: the IPv4 address entry
 * @addr6: the IPv6 address entry
 * @arg: the netlbl_unlhsh_walk_arg structure
 *
 * Description:
 * Append one STATICLIST or STATICLISTDEF response message to the dump buffer
 * carried in @arg.  Exactly one of @addr4 and @addr6 is expected to be set;
 * the caller passes NULL for the other.  When @addr4 is NULL the IPv6 entry
 * is used.  On any failure the partially built message is cancelled.
 * Returns zero on success, negative values on failure.
 *
 */
static int netlbl_unlabel_staticlist_gen(u32 cmd,
                                       const struct netlbl_unlhsh_iface *iface,
                                       const struct netlbl_unlhsh_addr4 *addr4,
                                       const struct netlbl_unlhsh_addr6 *addr6,
                                       void *arg)
{
        struct netlbl_unlhsh_walk_arg *walk = arg;
        struct lsm_context ctx;
        struct net_device *dev;
        void *hdr;
        u32 secid;
        int ret_val;

        hdr = genlmsg_put(walk->skb, NETLINK_CB(walk->nl_cb->skb).portid,
                          walk->seq, &netlbl_unlabel_gnl_family,
                          NLM_F_MULTI, cmd);
        if (hdr == NULL) {
                ret_val = -ENOMEM;
                goto cancel_msg;
        }

        /* Only entries with a positive ifindex get an interface name
         * attribute. */
        if (iface->ifindex > 0) {
                dev = dev_get_by_index(&init_net, iface->ifindex);
                if (dev == NULL) {
                        ret_val = -ENODEV;
                        goto cancel_msg;
                }
                ret_val = nla_put_string(walk->skb,
                                         NLBL_UNLABEL_A_IFACE, dev->name);
                /* drop the device reference before looking at the result */
                dev_put(dev);
                if (ret_val != 0)
                        goto cancel_msg;
        }

        if (addr4 != NULL) {
                ret_val = nla_put_in_addr(walk->skb,
                                          NLBL_UNLABEL_A_IPV4ADDR,
                                          addr4->list.addr);
                if (ret_val != 0)
                        goto cancel_msg;

                ret_val = nla_put_in_addr(walk->skb,
                                          NLBL_UNLABEL_A_IPV4MASK,
                                          addr4->list.mask);
                if (ret_val != 0)
                        goto cancel_msg;

                secid = addr4->secid;
        } else {
                ret_val = nla_put_in6_addr(walk->skb,
                                           NLBL_UNLABEL_A_IPV6ADDR,
                                           &addr6->list.addr);
                if (ret_val != 0)
                        goto cancel_msg;

                ret_val = nla_put_in6_addr(walk->skb,
                                           NLBL_UNLABEL_A_IPV6MASK,
                                           &addr6->list.mask);
                if (ret_val != 0)
                        goto cancel_msg;

                secid = addr6->secid;
        }

        /* Translate the secid into its security context and attach it; the
         * context is released as soon as it has been copied (or failed to
         * copy) into the message. */
        ret_val = security_secid_to_secctx(secid, &ctx);
        if (ret_val < 0)
                goto cancel_msg;
        ret_val = nla_put(walk->skb,
                          NLBL_UNLABEL_A_SECCTX,
                          ctx.len,
                          ctx.context);
        security_release_secctx(&ctx);
        if (ret_val != 0)
                goto cancel_msg;

        walk->seq++;
        genlmsg_end(walk->skb, hdr);
        return 0;

cancel_msg:
        genlmsg_cancel(walk->skb, hdr);
        return ret_val;
}

/**
 * netlbl_unlabel_staticlist - Handle a STATICLIST message
 * @skb: the NETLINK buffer
 * @cb: the NETLINK callback
 *
 * Description:
 * Process a user generated STATICLIST message and dump the unlabeled
 * connection hash table in a form suitable for use in a kernel generated
 * STATICLIST message.  The dump position is kept in @cb->args so that the
 * walk can resume where it stopped when @skb fills up:
 *   args[0] - hash bucket index
 *   args[1] - valid interface entries already passed in that bucket
 *   args[2] - IPv4 entries already passed for the current interface
 *   args[3] - IPv6 entries already passed for the current interface
 * Returns the length of @skb.
 *
 */
static int netlbl_unlabel_staticlist(struct sk_buff *skb,
                                     struct netlink_callback *cb)
{
        struct netlbl_unlhsh_walk_arg cb_arg;
        u32 skip_bkt = cb->args[0];
        u32 skip_chain = cb->args[1];
        u32 skip_addr4 = cb->args[2];
        u32 iter_bkt, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0;
        struct netlbl_unlhsh_iface *iface;
        struct list_head *iter_list;
        struct netlbl_af4list *addr4;
#if IS_ENABLED(CONFIG_IPV6)
        u32 skip_addr6 = cb->args[3];
        struct netlbl_af6list *addr6;
#endif

        cb_arg.nl_cb = cb;
        cb_arg.skb = skb;
        cb_arg.seq = cb->nlh->nlmsg_seq;

        rcu_read_lock();
        for (iter_bkt = skip_bkt;
             iter_bkt < rcu_dereference(netlbl_unlhsh)->size;
             iter_bkt++) {
                iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt];
                list_for_each_entry_rcu(iface, iter_list, list) {
                        /* invalid entries are not counted in iter_chain */
                        if (!iface->valid ||
                            iter_chain++ < skip_chain)
                                continue;
                        netlbl_af4list_foreach_rcu(addr4,
                                                   &iface->addr4_list) {
                                if (iter_addr4++ < skip_addr4)
                                        continue;
                                if (netlbl_unlabel_staticlist_gen(
                                              NLBL_UNLABEL_C_STATICLIST,
                                              iface,
                                              netlbl_unlhsh_addr4_entry(addr4),
                                              NULL,
                                              &cb_arg) < 0) {
                                        /* undo the counts so this entry and
                                         * this interface are retried on the
                                         * next pass */
                                        iter_addr4--;
                                        iter_chain--;
                                        goto unlabel_staticlist_return;
                                }
                        }
#if IS_ENABLED(CONFIG_IPV6)
                        netlbl_af6list_foreach_rcu(addr6,
                                                   &iface->addr6_list) {
                                if (iter_addr6++ < skip_addr6)
                                        continue;
                                if (netlbl_unlabel_staticlist_gen(
                                              NLBL_UNLABEL_C_STATICLIST,
                                              iface,
                                              NULL,
                                              netlbl_unlhsh_addr6_entry(addr6),
                                              &cb_arg) < 0) {
                                        iter_addr6--;
                                        iter_chain--;
                                        goto unlabel_staticlist_return;
                                }
                        }
                        iter_addr6 = 0;
                        skip_addr6 = 0;
#endif /* IPv6 */
                        /* The IPv4 position is cleared only once the whole
                         * interface (IPv4 and IPv6) has been dumped.  If it
                         * were cleared before the IPv6 walk, an interruption
                         * during the IPv6 entries would save an IPv4 position
                         * of zero and the next pass would emit this
                         * interface's IPv4 entries a second time. */
                        iter_addr4 = 0;
                        skip_addr4 = 0;
                }
                iter_chain = 0;
                skip_chain = 0;
        }

unlabel_staticlist_return:
        rcu_read_unlock();
        cb->args[0] = iter_bkt;
        cb->args[1] = iter_chain;
        cb->args[2] = iter_addr4;
        cb->args[3] = iter_addr6;
        return skb->len;
}

/**
 * netlbl_unlabel_staticlistdef - Handle a STATICLISTDEF message
 * @skb: the NETLINK buffer
 * @cb: the NETLINK callback
 *
 * Description:
 * Process a user generated STATICLISTDEF message and dump the default
 * unlabeled connection entry in a form suitable for use in a kernel generated
 * STATICLISTDEF message.  Returns the length of @skb.
 *
 */
static int netlbl_unlabel_staticlistdef(struct sk_buff *skb,
                                        struct netlink_callback *cb)
{
        struct netlbl_unlhsh_walk_arg cb_arg;
        struct netlbl_unlhsh_iface *iface;
        u32 iter_addr4 = 0, iter_addr6 = 0;
        struct netlbl_af4list *addr4;
#if IS_ENABLED(CONFIG_IPV6)
        struct netlbl_af6list *addr6;
#endif

        cb_arg.nl_cb = cb;
        cb_arg.skb = skb;
        cb_arg.seq = cb->nlh->nlmsg_seq;

        rcu_read_lock();
        iface = rcu_dereference(netlbl_unlhsh_def);
        if (iface == NULL || !iface->valid)
                goto unlabel_staticlistdef_return;

        netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) {
                if (iter_addr4++ < cb->args[0])
                        continue;
                if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
                                              iface,
                                              netlbl_unlhsh_addr4_entry(addr4),
                                              NULL,
                                              &cb_arg) < 0) {
                        iter_addr4--;
                        goto unlabel_staticlistdef_return;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) {
                if (iter_addr6++ < cb->args[1])
                        continue;
                if (netlbl_unlabel_staticlist_gen(NLBL_UNLABEL_C_STATICLISTDEF,
                                              iface,
                                              NULL,
                                              netlbl_unlhsh_addr6_entry(addr6),
                                              &cb_arg) < 0) {
                        iter_addr6--;
                        goto unlabel_staticlistdef_return;
                }
        }
#endif /* IPv6 */

unlabel_staticlistdef_return:
        rcu_read_unlock();
        cb->args[0] = iter_addr4;
        cb->args[1] = iter_addr6;
        return skb->len;
}

/*
 * NetLabel Generic NETLINK Command Definitions
 */

/* Command table for the unlabeled NetLabel family.  Handlers and flags that
 * are not listed for an entry are left zero-initialised (no handler, no
 * flags).  Commands that change configuration carry GENL_ADMIN_PERM; the
 * list/dump commands do not. */
static const struct genl_small_ops netlbl_unlabel_genl_ops[] = {
        {
                /* add a static entry */
                .cmd = NLBL_UNLABEL_C_STATICADD,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_ADMIN_PERM,
                .doit = netlbl_unlabel_staticadd,
        },
        {
                /* remove a static entry */
                .cmd = NLBL_UNLABEL_C_STATICREMOVE,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_ADMIN_PERM,
                .doit = netlbl_unlabel_staticremove,
        },
        {
                /* dump the static entries */
                .cmd = NLBL_UNLABEL_C_STATICLIST,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .dumpit = netlbl_unlabel_staticlist,
        },
        {
                /* add a default static entry */
                .cmd = NLBL_UNLABEL_C_STATICADDDEF,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_ADMIN_PERM,
                .doit = netlbl_unlabel_staticadddef,
        },
        {
                /* remove a default static entry */
                .cmd = NLBL_UNLABEL_C_STATICREMOVEDEF,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_ADMIN_PERM,
                .doit = netlbl_unlabel_staticremovedef,
        },
        {
                /* dump the default static entries */
                .cmd = NLBL_UNLABEL_C_STATICLISTDEF,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .dumpit = netlbl_unlabel_staticlistdef,
        },
        {
                /* set the accept flag */
                .cmd = NLBL_UNLABEL_C_ACCEPT,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .flags = GENL_ADMIN_PERM,
                .doit = netlbl_unlabel_accept,
        },
        {
                /* report the accept flag */
                .cmd = NLBL_UNLABEL_C_LIST,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = netlbl_unlabel_list,
        },
};

/* Generic NETLINK family descriptor for the unlabeled NetLabel component;
 * registered by netlbl_unlabel_genl_init().  There is no family specific
 * header, so .hdrsize is left at its zero default. */
static struct genl_family netlbl_unlabel_gnl_family __ro_after_init = {
        .name = NETLBL_NLTYPE_UNLABELED_NAME,
        .version = NETLBL_PROTO_VERSION,
        .module = THIS_MODULE,
        .maxattr = NLBL_UNLABEL_A_MAX,
        .policy = netlbl_unlabel_genl_policy,
        .small_ops = netlbl_unlabel_genl_ops,
        .n_small_ops = ARRAY_SIZE(netlbl_unlabel_genl_ops),
        .resv_start_op = NLBL_UNLABEL_C_STATICLISTDEF + 1,
};

/*
 * NetLabel Generic NETLINK Protocol Functions
 */

/**
 * netlbl_unlabel_genl_init - Register the Unlabeled NetLabel component
 *
 * Description:
 * Hand the unlabeled packet NetLabel family descriptor to the Generic NETLINK
 * core.  Returns zero on success, negative values on failure.
 *
 */
int __init netlbl_unlabel_genl_init(void)
{
        int ret_val;

        ret_val = genl_register_family(&netlbl_unlabel_gnl_family);
        return ret_val;
}

/*
 * NetLabel KAPI Hooks
 */

/* Network device notifier registered by netlbl_unlabel_init(); notifications
 * are delivered to netlbl_unlhsh_netdev_handler(). */
static struct notifier_block netlbl_unlhsh_netdev_notifier = {
        .notifier_call = netlbl_unlhsh_netdev_handler,
};

/**
 * netlbl_unlabel_init - Initialize the unlabeled connection hash table
 * @size: the number of bits to use for the hash buckets
 *
 * Description:
 * Initializes the unlabeled connection hash table with 2^@size buckets and
 * registers a network device notification handler.  @size must be in the
 * range 1..31.  This function should only be called by the NetLabel
 * subsystem itself during initialization.  Returns zero on success,
 * non-zero values on error.
 *
 */
int __init netlbl_unlabel_init(u32 size)
{
        u32 iter;
        struct netlbl_unlhsh_tbl *hsh_tbl;

        /* The bucket count is computed as a 32-bit shift below; a shift
         * count of 32 or more is undefined behaviour, so reject it here. */
        if (size == 0 || size >= 32)
                return -EINVAL;

        hsh_tbl = kmalloc(sizeof(*hsh_tbl), GFP_KERNEL);
        if (hsh_tbl == NULL)
                return -ENOMEM;
        /* shift an unsigned constant so that size == 31 is well defined */
        hsh_tbl->size = 1U << size;
        hsh_tbl->tbl = kcalloc(hsh_tbl->size,
                               sizeof(struct list_head),
                               GFP_KERNEL);
        if (hsh_tbl->tbl == NULL) {
                kfree(hsh_tbl);
                return -ENOMEM;
        }
        for (iter = 0; iter < hsh_tbl->size; iter++)
                INIT_LIST_HEAD(&hsh_tbl->tbl[iter]);

        /* publish the fully initialised table */
        spin_lock(&netlbl_unlhsh_lock);
        rcu_assign_pointer(netlbl_unlhsh, hsh_tbl);
        spin_unlock(&netlbl_unlhsh_lock);

        /* NOTE(review): the return value of the notifier registration is not
         * checked; a failure here goes unreported. */
        register_netdevice_notifier(&netlbl_unlhsh_netdev_notifier);

        return 0;
}

/**
 * netlbl_unlabel_getattr - Get the security attributes for an unlabeled packet
 * @skb: the packet
 * @family: protocol family
 * @secattr: the security attributes
 *
 * Description:
 * Look up the static label, if any, configured for the source address of an
 * unlabeled packet and return it in @secattr.  The entry for the packet's
 * incoming interface is consulted first, then the default entry.  When no
 * label matches, the packet is still accepted (with only the type set in
 * @secattr) if the accept flag is enabled.  Returns zero on success and
 * negative values on failure.
 *
 */
int netlbl_unlabel_getattr(const struct sk_buff *skb,
                           u16 family,
                           struct netlbl_lsm_secattr *secattr)
{
        struct netlbl_unlhsh_iface *iface;

        rcu_read_lock();
        iface = netlbl_unlhsh_search_iface(skb->skb_iif);
        if (!iface)
                iface = rcu_dereference(netlbl_unlhsh_def);
        if (!iface || !iface->valid)
                goto no_static_label;

#if IS_ENABLED(CONFIG_IPV6)
        /* When resolving a fallback label, check the sk_buff version as
         * it is possible (e.g. SCTP) to have family = PF_INET6 while
         * receiving ip_hdr(skb)->version = 4.
         */
        if (family == PF_INET6 && ip_hdr(skb)->version == 4)
                family = PF_INET;
#endif /* IPv6 */

        if (family == PF_INET) {
                struct netlbl_af4list *match4;
                struct iphdr *hdr4 = ip_hdr(skb);

                match4 = netlbl_af4list_search(hdr4->saddr,
                                               &iface->addr4_list);
                if (!match4)
                        goto no_static_label;
                secattr->attr.secid = netlbl_unlhsh_addr4_entry(match4)->secid;
#if IS_ENABLED(CONFIG_IPV6)
        } else if (family == PF_INET6) {
                struct netlbl_af6list *match6;
                struct ipv6hdr *hdr6 = ipv6_hdr(skb);

                match6 = netlbl_af6list_search(&hdr6->saddr,
                                               &iface->addr6_list);
                if (!match6)
                        goto no_static_label;
                secattr->attr.secid = netlbl_unlhsh_addr6_entry(match6)->secid;
#endif /* IPv6 */
        } else {
                goto no_static_label;
        }
        rcu_read_unlock();

        secattr->flags |= NETLBL_SECATTR_SECID;
        secattr->type = NETLBL_NLTYPE_UNLABELED;
        return 0;

no_static_label:
        rcu_read_unlock();
        /* without a static label the packet is only accepted when the
         * accept flag is set */
        if (netlabel_unlabel_acceptflg == 0)
                return -ENOMSG;
        secattr->type = NETLBL_NLTYPE_UNLABELED;
        return 0;
}

/**
 * netlbl_unlabel_defconf - Set the default config to allow unlabeled packets
 *
 * Description:
 * Set the default NetLabel configuration to allow incoming unlabeled packets
 * and to send unlabeled network traffic by default.  Returns zero on success,
 * negative values on failure.
 *
 */
int __init netlbl_unlabel_defconf(void)
{
        int ret_val;
        struct netlbl_dom_map *entry;
        struct netlbl_audit audit_info;

        /* Only the kernel is allowed to call this function and the only time
         * it is called is at bootup before the audit subsystem is reporting
         * messages so don't worry too much about these values. */
        security_current_getlsmprop_subj(&audit_info.prop);
        audit_info.loginuid = GLOBAL_ROOT_UID;
        audit_info.sessionid = 0;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (entry == NULL)
                return -ENOMEM;
        entry->family = AF_UNSPEC;
        entry->def.type = NETLBL_NLTYPE_UNLABELED;
        ret_val = netlbl_domhsh_add_default(entry, &audit_info);
        if (ret_val != 0) {
                /* The entry was not accepted by the domain hash, so it is
                 * still owned here; free it rather than leak it. */
                kfree(entry);
                return ret_val;
        }

        netlbl_unlabel_acceptflg_set(1, &audit_info);

        return 0;
}

























































































































































































































































































  264 












  265 























  265 





  265 


  265 



  265 


  265 



















































  265 










  265 












  265 
































































  265 














  265 


  265 


























  265 








  264 


































  265 









  265 







  265 





















  264 








  265 























































  265 



















  265 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
// SPDX-License-Identifier: GPL-2.0
/*
 * kernel userspace event delivery
 *
 * Copyright (C) 2004 Red Hat, Inc.  All rights reserved.
 * Copyright (C) 2004 Novell, Inc.  All rights reserved.
 * Copyright (C) 2004 IBM, Inc. All rights reserved.
 *
 * Authors:
 *        Robert Love                <rml@novell.com>
 *        Kay Sievers                <kay.sievers@vrfy.org>
 *        Arjan van de Ven        <arjanv@redhat.com>
 *        Greg Kroah-Hartman        <greg@kroah.com>
 */

#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/kobject.h>
#include <linux/export.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/uidgid.h>
#include <linux/uuid.h>
#include <linux/ctype.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>


/* Global 64-bit atomic uevent sequence counter; non-static, so visible to
 * other translation units.  NOTE(review): its users are outside this part of
 * the file — presumably bumped once per emitted uevent; confirm. */
atomic64_t uevent_seqnum;
#ifdef CONFIG_UEVENT_HELPER
/* Path of the usermode uevent helper, initialised from the build-time
 * default; used as argv[0] by init_uevent_argv(). */
char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
#endif

/* One uevent netlink socket, linked on uevent_sock_list. */
struct uevent_sock {
        struct list_head list;  /* link in uevent_sock_list */
        struct sock *sk;        /* the socket uevents are broadcast on */
};

#ifdef CONFIG_NET
/* All uevent sockets; walked by uevent_net_broadcast_untagged(). */
static LIST_HEAD(uevent_sock_list);
/* This lock protects uevent_sock_list */
static DEFINE_MUTEX(uevent_sock_mutex);
#endif

/*
 * Action names indexed by enum kobject_action; kobject_action_type() matches
 * user supplied action words against this table.
 * The strings here must match the enum in include/linux/kobject.h.
 */
static const char *kobject_actions[] = {
        [KOBJ_ADD] =                "add",
        [KOBJ_REMOVE] =                "remove",
        [KOBJ_CHANGE] =                "change",
        [KOBJ_MOVE] =                "move",
        [KOBJ_ONLINE] =                "online",
        [KOBJ_OFFLINE] =        "offline",
        [KOBJ_BIND] =                "bind",
        [KOBJ_UNBIND] =                "unbind",
};

/*
 * kobject_action_type - parse the action word at the start of a uevent string
 * @buf: input text: an action name, optionally followed by ' ' and arguments
 * @count: number of bytes in @buf; one trailing '\n' or '\0' is ignored
 * @type: set to the matching action on success
 * @args: if not NULL, set on success to the text after the first ' ', or to
 *        NULL when the input has no ' '
 *
 * Returns 0 when the first word exactly equals one of kobject_actions[],
 * -EINVAL when the input is empty or the word is not a known action.
 */
static int kobject_action_type(const char *buf, size_t count,
                               enum kobject_action *type,
                               const char **args)
{
        enum kobject_action action;
        size_t count_first;
        const char *args_start;

        if (count && (buf[count-1] == '\n' || buf[count-1] == '\0'))
                count--;

        if (!count)
                return -EINVAL;

        /* the action word ends at the first space, or at the end of input */
        args_start = strnchr(buf, count, ' ');
        if (args_start) {
                count_first = args_start - buf;
                args_start = args_start + 1;
        } else {
                count_first = count;
        }

        for (action = 0; action < ARRAY_SIZE(kobject_actions); action++) {
                /*
                 * Compare the lengths before looking at the bytes.  Input
                 * with an embedded NUL can make count_first larger than the
                 * action name; indexing the name at count_first would then
                 * read past the end of the string.
                 */
                if (strlen(kobject_actions[action]) != count_first)
                        continue;
                if (memcmp(kobject_actions[action], buf, count_first) != 0)
                        continue;
                if (args)
                        *args = args_start;
                *type = action;
                return 0;
        }

        return -EINVAL;
}

/*
 * action_arg_word_end - find the end of an alphanumeric word
 * @buf: first character of the word
 * @buf_end: last valid character of the input (inclusive)
 * @delim: character that terminates the word
 *
 * Scans from @buf up to and including @buf_end.  Returns a pointer to the
 * first @delim found, or to @buf_end + 1 when the input ends first.  Returns
 * NULL when a character that is not alphanumeric is met before the word ends,
 * or when the word is empty.
 */
static const char *action_arg_word_end(const char *buf, const char *buf_end,
                                       char delim)
{
        const char *cur;

        for (cur = buf; cur <= buf_end; cur++) {
                if (*cur == delim)
                        break;
                if (!isalnum(*cur))
                        return NULL;
        }

        return cur == buf ? NULL : cur;
}

/*
 * kobject_action_args - parse the arguments of a synthetic uevent
 * @buf: argument text: "<UUID>[ KEY=VALUE]..."
 * @count: number of bytes in @buf; one trailing '\n' or '\0' is ignored
 * @ret_env: on success, receives a newly allocated environment that the
 *           caller must kfree()
 *
 * The UUID becomes SYNTH_UUID=<uuid>; every KEY=VALUE pair, each preceded by
 * a single ' ', becomes SYNTH_ARG_<KEY>=<VALUE>.  Keys and values must be
 * non-empty and alphanumeric.
 *
 * Returns 0 on success, -ENOMEM when the environment cannot be allocated and
 * -EINVAL for any other failure (including a variable that could not be
 * added to the environment).  @ret_env is untouched on failure.
 */
static int kobject_action_args(const char *buf, size_t count,
                               struct kobj_uevent_env **ret_env)
{
        const char *cursor, *last, *key, *value;
        struct kobj_uevent_env *env;
        int key_len, value_len;

        if (count) {
                char tail = buf[count - 1];

                if (tail == '\n' || tail == '\0')
                        count--;
        }

        if (count == 0)
                return -EINVAL;

        env = kzalloc(sizeof(*env), GFP_KERNEL);
        if (env == NULL)
                return -ENOMEM;

        /* first arg is UUID */
        if (count < UUID_STRING_LEN)
                goto invalid;
        if (!uuid_is_valid(buf))
                goto invalid;
        if (add_uevent_var(env, "SYNTH_UUID=%.*s", UUID_STRING_LEN, buf))
                goto invalid;

        /*
         * the rest are custom environment variables in KEY=VALUE
         * format with ' ' delimiter between each KEY=VALUE pair
         */
        last = buf + count - 1;
        cursor = buf + UUID_STRING_LEN;

        while (cursor <= last) {
                if (*cursor != ' ')
                        goto invalid;

                /* skip the ' ', key must follow */
                key = cursor + 1;
                if (key > last)
                        goto invalid;

                cursor = action_arg_word_end(key, last, '=');
                if (cursor == NULL || cursor > last || *cursor != '=')
                        goto invalid;
                key_len = cursor - key;

                /* skip the '=', value must follow */
                value = cursor + 1;
                if (value > last)
                        goto invalid;

                cursor = action_arg_word_end(value, last, ' ');
                if (cursor == NULL)
                        goto invalid;
                value_len = (int) (cursor - value);

                if (add_uevent_var(env, "SYNTH_ARG_%.*s=%.*s",
                                   key_len, key, value_len, value))
                        goto invalid;
        }

        *ret_env = env;
        return 0;

invalid:
        kfree(env);
        return -EINVAL;
}

/**
 * kobject_synth_uevent - send synthetic uevent with arguments
 *
 * @kobj: struct kobject for which synthetic uevent is to be generated
 * @buf: buffer containing action type and action args, newline is ignored
 * @count: length of buffer
 *
 * When @buf holds only an action, the event is sent with SYNTH_UUID=0.
 * Otherwise the arguments are parsed into an environment that is sent along
 * with the event.  Any failure is reported with a warning naming the device.
 *
 * Returns 0 if kobject_synthetic_uevent() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count)
{
        char *no_uuid_envp[] = { "SYNTH_UUID=0", NULL };
        struct kobj_uevent_env *env;
        enum kobject_action action;
        const char *action_args;
        const char *reason = NULL;
        const char *devpath;
        int r;

        r = kobject_action_type(buf, count, &action, &action_args);
        if (r) {
                reason = "unknown uevent action string";
        } else if (!action_args) {
                /* bare action without arguments */
                r = kobject_uevent_env(kobj, action, no_uuid_envp);
        } else {
                r = kobject_action_args(action_args,
                                        count - (action_args - buf), &env);
                if (r == -EINVAL) {
                        reason = "incorrect uevent action arguments";
                } else if (!r) {
                        r = kobject_uevent_env(kobj, action, env->envp);
                        kfree(env);
                }
        }

        if (!r)
                return 0;

        devpath = kobject_get_path(kobj, GFP_KERNEL);
        pr_warn("synth uevent: %s: %s\n",
               devpath ?: "unknown device",
               reason ?: "failed to send uevent");
        kfree(devpath);
        return r;
}

#ifdef CONFIG_UEVENT_HELPER
/*
 * kobj_usermode_filter - compare a kobject's namespace with the initial one
 *
 * Returns non-zero when @kobj has namespace operations and its namespace
 * differs from the initial namespace of that type; returns 0 otherwise.
 */
static int kobj_usermode_filter(struct kobject *kobj)
{
        const struct kobj_ns_type_operations *ops = kobj_ns_ops(kobj);
        const void *kobj_ns, *initial;

        if (!ops)
                return 0;

        kobj_ns = kobj->ktype->namespace(kobj);
        initial = ops->initial_ns();

        return kobj_ns != initial;
}

/*
 * init_uevent_argv - build the argument vector for the uevent helper
 *
 * Copies @subsystem into the unused tail of @env->buf and sets @env->argv to
 * { uevent_helper, <copied subsystem>, NULL }.  The copied string and its
 * terminator are accounted for in @env->buflen.
 *
 * Returns 0 on success, -ENOMEM when @subsystem does not fit.
 */
static int init_uevent_argv(struct kobj_uevent_env *env, const char *subsystem)
{
        char *dst = &env->buf[env->buflen];
        int buffer_size = sizeof(env->buf) - env->buflen;
        int len;

        len = strscpy(dst, subsystem, buffer_size);
        if (len < 0) {
                pr_warn("%s: insufficient buffer space (%u left) for %s\n",
                        __func__, buffer_size, subsystem);
                return -ENOMEM;
        }

        env->argv[0] = uevent_helper;
        env->argv[1] = dst;
        env->argv[2] = NULL;

        /* account for the string plus its terminating NUL */
        env->buflen += len + 1;
        return 0;
}

/* Free the data pointer attached to a usermode helper's subprocess_info. */
static void cleanup_uevent_env(struct subprocess_info *info)
{
        void *data = info->data;

        kfree(data);
}
#endif

#ifdef CONFIG_NET
/*
 * alloc_uevent_skb - build the netlink message for one uevent
 *
 * The payload is the NUL terminated header "<action>@<devpath>" followed by
 * the raw environment buffer of @env.  The control block is filled in with
 * root credentials, destination group 1 and port id 0.
 *
 * Returns the new skb, or NULL when allocation fails.
 */
static struct sk_buff *alloc_uevent_skb(struct kobj_uevent_env *env,
                                        const char *action_string,
                                        const char *devpath)
{
        struct netlink_skb_parms *cb;
        struct sk_buff *msg;
        size_t hdr_len;
        char *hdr;

        /* header is action + '@' + devpath + terminating NUL */
        hdr_len = strlen(action_string) + strlen(devpath) + 2;

        /* allocate message with maximum possible size */
        msg = alloc_skb(hdr_len + env->buflen, GFP_KERNEL);
        if (msg == NULL)
                return NULL;

        /* add header */
        hdr = skb_put(msg, hdr_len);
        sprintf(hdr, "%s@%s", action_string, devpath);

        /* append the environment */
        skb_put_data(msg, env->buf, env->buflen);

        cb = &NETLINK_CB(msg);
        cb->portid = 0;
        cb->dst_group = 1;
        cb->creds.uid = GLOBAL_ROOT_UID;
        cb->creds.gid = GLOBAL_ROOT_GID;

        return msg;
}

/*
 * uevent_net_broadcast_untagged - broadcast a uevent on every uevent socket
 *
 * Walks uevent_sock_list under uevent_sock_mutex and broadcasts the event to
 * group 1 of each socket that has listeners.  The message is built lazily,
 * the first time a socket with listeners is found, and shared between
 * sockets through extra references.
 *
 * Returns the result for the last socket that had listeners (with -ENOBUFS
 * and -ESRCH mapped to 0), or 0 when no socket had listeners.
 */
static int uevent_net_broadcast_untagged(struct kobj_uevent_env *env,
                                         const char *action_string,
                                         const char *devpath)
{
        struct uevent_sock *entry;
        struct sk_buff *msg = NULL;
        int retval = 0;

        /* send netlink message */
        mutex_lock(&uevent_sock_mutex);
        list_for_each_entry(entry, &uevent_sock_list, list) {
                struct sock *sk = entry->sk;

                if (!netlink_has_listeners(sk, 1))
                        continue;

                if (msg == NULL) {
                        msg = alloc_uevent_skb(env, action_string, devpath);
                        if (msg == NULL) {
                                retval = -ENOMEM;
                                continue;
                        }
                }

                /* each broadcast gets its own reference to the message */
                retval = netlink_broadcast(sk, skb_get(msg), 0, 1,
                                           GFP_KERNEL);
                /* ENOBUFS should be handled in userspace */
                if (retval == -ENOBUFS || retval == -ESRCH)
                        retval = 0;
        }
        mutex_unlock(&uevent_sock_mutex);

        /* drop the reference taken at allocation time */
        consume_skb(msg);

        return retval;
}

static int uevent_net_broadcast_tagged(struct sock *usk,
                                       struct kobj_uevent_env *env,
                                       const char *action_string,
                                       const char *devpath)
{
        struct user_namespace *owning_user_ns = sock_net(usk)->user_ns;
        struct sk_buff *skb = NULL;
        int ret = 0;

        skb = alloc_uevent_skb(env, action_string, devpath);
        if (!skb)
                return -ENOMEM;

        /* fix credentials */
        if (owning_user_ns != &init_user_ns) {
                struct netlink_skb_parms *parms = &NETLINK_CB(skb);
                kuid_t root_uid;
                kgid_t root_gid;

                /* fix uid */
                root_uid = make_kuid(owning_user_ns, 0);
                if (uid_valid(root_uid))
                        parms->creds.uid = root_uid;

                /* fix gid */
                root_gid = make_kgid(owning_user_ns, 0);
                if (gid_valid(root_gid))
                        parms->creds.gid = root_gid;
        }

        ret = netlink_broadcast(usk, skb, 0, 1, GFP_KERNEL);
        /* ENOBUFS should be handled in userspace */
        if (ret == -ENOBUFS || ret == -ESRCH)
                ret = 0;

        return ret;
}
#endif

/*
 * Pick the netlink broadcast flavour for @kobj and send the uevent.
 *
 * Without CONFIG_NET nothing is sent and 0 is returned. Otherwise the
 * kobject's namespace operations are looked up (falling back to the
 * parent of its kset's kobject). If they describe a network namespace
 * tag and the ktype can report the namespace, the event is sent only on
 * that namespace's uevent socket; in every other case it goes out through
 * the untagged broadcast path.
 */
static int kobject_uevent_net_broadcast(struct kobject *kobj,
                                        struct kobj_uevent_env *env,
                                        const char *action_string,
                                        const char *devpath)
{
        int ret = 0;

#ifdef CONFIG_NET
        const struct kobj_ns_type_operations *ns_ops;
        const struct net *target = NULL;

        ns_ops = kobj_ns_ops(kobj);
        if (!ns_ops && kobj->kset) {
                struct kobject *set_kobj = &kobj->kset->kobj;

                if (set_kobj->parent)
                        ns_ops = kobj_ns_ops(set_kobj->parent);
        }

        /*
         * Only network namespace tags matter here: they decide which
         * network namespace the uevent is broadcast into.
         */
        if (ns_ops && ns_ops->netlink_ns && kobj->ktype->namespace &&
            ns_ops->type == KOBJ_NS_TYPE_NET)
                target = kobj->ktype->namespace(kobj);

        if (target)
                ret = uevent_net_broadcast_tagged(target->uevent_sock->sk, env,
                                                  action_string, devpath);
        else
                ret = uevent_net_broadcast_untagged(env, action_string,
                                                    devpath);
#endif

        return ret;
}

/*
 * Remove every "MODALIAS=..." variable from @env.
 *
 * @env->envp[] holds pointers into @env->buf[], where the variables are
 * stored back to back, each with its terminating NUL, in @env->buflen
 * bytes. Removing entry i therefore means:
 *
 *   - sliding everything stored after it (from envp[i + 1] up to the end
 *     of the used part of the buffer) down over it, and
 *   - rewriting the following envp[] slots, each pointing @len bytes
 *     lower than before.
 *
 * If the entry is the last one, there is nothing to slide and only the
 * counters are adjusted. The index is not advanced after a removal, so
 * the entry that moved into slot i is examined next.
 */
static void zap_modalias_env(struct kobj_uevent_env *env)
{
        static const char modalias_prefix[] = "MODALIAS=";
        size_t len;
        int i, j;

        i = 0;
        while (i < env->envp_idx) {
                char *victim = env->envp[i];

                if (strncmp(victim, modalias_prefix,
                            sizeof(modalias_prefix) - 1) != 0) {
                        i++;
                        continue;
                }

                /* bytes occupied by this variable, including its NUL */
                len = strlen(victim) + 1;

                if (i != env->envp_idx - 1) {
                        char *next = env->envp[i + 1];

                        /* size of the tail: used bytes from @next onwards */
                        memmove(victim, next,
                                env->buflen - (next - env->envp[0]));

                        for (j = i; j < env->envp_idx - 1; j++)
                                env->envp[j] = env->envp[j + 1] - len;
                }

                env->envp_idx--;
                env->buflen -= len;
        }
}

/**
 * kobject_uevent_env - send an uevent with environmental data
 *
 * @kobj: struct kobject that the action is happening to
 * @action: action that is happening
 * @envp_ext: NULL-terminated array of extra "KEY=value" strings to add to
 *            the environment, or NULL if there are none
 *
 * Builds the event environment (ACTION, DEVPATH, SUBSYSTEM, the entries of
 * @envp_ext, whatever the kset's uevent() callback adds, and SEQNUM),
 * broadcasts it over netlink and, when CONFIG_UEVENT_HELPER is enabled and
 * a helper is configured, also starts the usermode helper.
 *
 * Returns 0 if kobject_uevent_env() is completed with success or the
 * corresponding error when it fails. In particular:
 *  - -EINVAL if neither @kobj nor any of its ancestors belongs to a kset;
 *  - 0, without sending anything, if uevent_suppress is set, the kset's
 *    filter rejects @kobj, or no subsystem name is available;
 *  - -ENOMEM if the environment buffer cannot be allocated;
 *  - -ENOENT if the object path cannot be obtained.
 */
int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
                       char *envp_ext[])
{
        struct kobj_uevent_env *env;
        /*
         * NOTE(review): @action indexes kobject_actions[] without a range
         * check in this function; presumably callers only pass valid enum
         * values - confirm.
         */
        const char *action_string = kobject_actions[action];
        const char *devpath = NULL;
        const char *subsystem;
        struct kobject *top_kobj;
        struct kset *kset;
        const struct kset_uevent_ops *uevent_ops;
        int i = 0;
        int retval = 0;

        /*
         * Mark "remove" event done regardless of result, for some subsystems
         * do not want to re-trigger "remove" event via automatic cleanup.
         */
        if (action == KOBJ_REMOVE)
                kobj->state_remove_uevent_sent = 1;

        pr_debug("kobject: '%s' (%p): %s\n",
                 kobject_name(kobj), kobj, __func__);

        /* search the kset we belong to: walk up until an ancestor has one */
        top_kobj = kobj;
        while (!top_kobj->kset && top_kobj->parent)
                top_kobj = top_kobj->parent;

        if (!top_kobj->kset) {
                pr_debug("kobject: '%s' (%p): %s: attempted to send uevent "
                         "without kset!\n", kobject_name(kobj), kobj,
                         __func__);
                return -EINVAL;
        }

        kset = top_kobj->kset;
        uevent_ops = kset->uevent_ops;

        /* skip the event, if uevent_suppress is set */
        if (kobj->uevent_suppress) {
                pr_debug("kobject: '%s' (%p): %s: uevent_suppress "
                                 "caused the event to drop!\n",
                                 kobject_name(kobj), kobj, __func__);
                return 0;
        }
        /* skip the event, if the filter returns zero. */
        if (uevent_ops && uevent_ops->filter)
                if (!uevent_ops->filter(kobj)) {
                        pr_debug("kobject: '%s' (%p): %s: filter function "
                                 "caused the event to drop!\n",
                                 kobject_name(kobj), kobj, __func__);
                        return 0;
                }

        /*
         * originating subsystem: the kset's name() callback if it has one,
         * otherwise the name of the kset's own kobject
         */
        if (uevent_ops && uevent_ops->name)
                subsystem = uevent_ops->name(kobj);
        else
                subsystem = kobject_name(&kset->kobj);
        if (!subsystem) {
                pr_debug("kobject: '%s' (%p): %s: unset subsystem caused the "
                         "event to drop!\n", kobject_name(kobj), kobj,
                         __func__);
                return 0;
        }

        /* environment buffer; nothing to clean up before this point */
        env = kzalloc(sizeof(struct kobj_uevent_env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;

        /* complete object path */
        devpath = kobject_get_path(kobj, GFP_KERNEL);
        if (!devpath) {
                retval = -ENOENT;
                goto exit;
        }

        /* default keys */
        retval = add_uevent_var(env, "ACTION=%s", action_string);
        if (retval)
                goto exit;
        retval = add_uevent_var(env, "DEVPATH=%s", devpath);
        if (retval)
                goto exit;
        retval = add_uevent_var(env, "SUBSYSTEM=%s", subsystem);
        if (retval)
                goto exit;

        /* keys passed in from the caller */
        if (envp_ext) {
                for (i = 0; envp_ext[i]; i++) {
                        retval = add_uevent_var(env, "%s", envp_ext[i]);
                        if (retval)
                                goto exit;
                }
        }

        /* let the kset specific function add its stuff */
        if (uevent_ops && uevent_ops->uevent) {
                retval = uevent_ops->uevent(kobj, env);
                if (retval) {
                        pr_debug("kobject: '%s' (%p): %s: uevent() returned "
                                 "%d\n", kobject_name(kobj), kobj,
                                 __func__, retval);
                        goto exit;
                }
        }

        switch (action) {
        case KOBJ_ADD:
                /*
                 * Mark "add" event so we can make sure we deliver "remove"
                 * event to userspace during automatic cleanup. If
                 * the object did send an "add" event, "remove" will be
                 * automatically generated by the core, if not already done
                 * by the caller.
                 */
                kobj->state_add_uevent_sent = 1;
                break;

        case KOBJ_UNBIND:
                /* drop any MODALIAS= variables collected so far */
                zap_modalias_env(env);
                break;

        default:
                break;
        }

        /* we will send an event, so request a new sequence number */
        retval = add_uevent_var(env, "SEQNUM=%llu",
                                atomic64_inc_return(&uevent_seqnum));
        if (retval)
                goto exit;

        retval = kobject_uevent_net_broadcast(kobj, env, action_string,
                                              devpath);

#ifdef CONFIG_UEVENT_HELPER
        /*
         * call uevent_helper, usually only enabled during early boot.
         * When this branch is taken, retval is overwritten below, so the
         * netlink broadcast result is no longer what gets returned.
         */
        if (uevent_helper[0] && !kobj_usermode_filter(kobj)) {
                struct subprocess_info *info;

                retval = add_uevent_var(env, "HOME=/");
                if (retval)
                        goto exit;
                retval = add_uevent_var(env,
                                        "PATH=/sbin:/bin:/usr/sbin:/usr/bin");
                if (retval)
                        goto exit;
                retval = init_uevent_argv(env, subsystem);
                if (retval)
                        goto exit;

                /* stays -ENOMEM if the helper setup below returns NULL */
                retval = -ENOMEM;
                info = call_usermodehelper_setup(env->argv[0], env->argv,
                                                 env->envp, GFP_KERNEL,
                                                 NULL, cleanup_uevent_env, env);
                if (info) {
                        retval = call_usermodehelper_exec(info, UMH_NO_WAIT);
                        env = NULL;        /* freed by cleanup_uevent_env */
                }
        }
#endif

exit:
        /* env is NULL here if it was handed over to the helper above */
        kfree(devpath);
        kfree(env);
        return retval;
}
EXPORT_SYMBOL_GPL(kobject_uevent_env);

/**
 * kobject_uevent - notify userspace by sending an uevent
 *
 * @kobj: struct kobject that the action is happening to
 * @action: action that is happening
 *
 * Convenience wrapper around kobject_uevent_env() that passes no extra
 * environment variables.
 *
 * Returns 0 if kobject_uevent() is completed with success or the
 * corresponding error when it fails.
 */
int kobject_uevent(struct kobject *kobj, enum kobject_action action)
{
        /* NULL: no caller-supplied environment entries */
        return kobject_uevent_env(kobj, action, NULL);
}
EXPORT_SYMBOL_GPL(kobject_uevent);

/**
 * add_uevent_var - add key value string to the environment buffer
 * @env: environment buffer structure
 * @format: printf format for the key=value pair
 *
 * The formatted string is appended, including its terminating NUL, right
 * after the data already stored in @env->buf, and a pointer to it is
 * recorded in the next free @env->envp slot.
 *
 * Returns 0 if environment variable was added successfully or -ENOMEM
 * if no space was available (no free envp slot, or the formatted string
 * does not fit in the remaining buffer). A warning is emitted in both
 * failure cases.
 */
int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...)
{
        char *dst;
        size_t room;
        va_list args;
        int len;

        if (env->envp_idx >= ARRAY_SIZE(env->envp)) {
                WARN(1, KERN_ERR "add_uevent_var: too many keys\n");
                return -ENOMEM;
        }

        /* write position and number of bytes still free behind it */
        dst = &env->buf[env->buflen];
        room = sizeof(env->buf) - env->buflen;

        va_start(args, format);
        len = vsnprintf(dst, room, format, args);
        va_end(args);

        /* len excludes the NUL, so len == room already means truncation */
        if (len >= room) {
                WARN(1, KERN_ERR "add_uevent_var: buffer size too small\n");
                return -ENOMEM;
        }

        env->envp[env->envp_idx++] = dst;
        env->buflen += len + 1;
        return 0;
}
EXPORT_SYMBOL_GPL(add_uevent_var);

#if defined(CONFIG_NET)
/*
 * Re-broadcast a uevent message received from userspace on @usk.
 *
 * A fresh "SEQNUM=<n>" string (with its NUL) is appended to a copy of
 * @skb, the netlink message header is pulled off the copy, and the copy
 * is marked as coming from the kernel (portid 0) before being broadcast
 * to group 1. @skb itself is not modified.
 *
 * Returns -ENOMEM if the sequence string cannot be formatted or the copy
 * cannot be allocated, -EINVAL (with an extack message) if the resulting
 * message would exceed UEVENT_BUFFER_SIZE, otherwise the result of
 * netlink_broadcast() with -ENOBUFS and -ESRCH mapped to 0.
 */
static int uevent_net_broadcast(struct sock *usk, struct sk_buff *skb,
                                struct netlink_ext_ack *extack)
{
        /*
         * sizeof("SEQNUM=") already counts the terminating NUL; the extra
         * 21 bytes are enough for the largest u64 in decimal.
         */
        char seqnum[sizeof("SEQNUM=") + 21];
        struct sk_buff *copy;
        int len;
        int err;

        /* bump and prepare sequence number */
        len = snprintf(seqnum, sizeof(seqnum), "SEQNUM=%llu",
                       atomic64_inc_return(&uevent_seqnum));
        if (len < 0 || (size_t)len >= sizeof(seqnum))
                return -ENOMEM;

        /* the terminating NUL is sent as part of the message */
        len++;

        /* verify message does not overflow */
        if ((skb->len + len) > UEVENT_BUFFER_SIZE) {
                NL_SET_ERR_MSG(extack, "uevent message too big");
                return -EINVAL;
        }

        /* copy skb and extend to accommodate sequence number */
        copy = skb_copy_expand(skb, 0, len, GFP_KERNEL);
        if (!copy)
                return -ENOMEM;

        /* append sequence number, then strip the netlink message header */
        skb_put_data(copy, seqnum, len);
        skb_pull(copy, NLMSG_HDRLEN);

        /* set portid 0 to inform userspace message comes from kernel */
        NETLINK_CB(copy).portid = 0;
        NETLINK_CB(copy).dst_group = 1;

        err = netlink_broadcast(usk, copy, 0, 1, GFP_KERNEL);

        /* ENOBUFS should be handled in userspace */
        return (err == -ENOBUFS || err == -ESRCH) ? 0 : err;
}

/*
 * Handle one netlink message sent to the uevent socket from userspace.
 *
 * Returns -EINVAL if the message carries no data, -EPERM (with an extack
 * message) if the sender lacks CAP_SYS_ADMIN in the user namespace owning
 * the target network namespace, otherwise the result of
 * uevent_net_broadcast().
 */
static int uevent_net_rcv_skb(struct sk_buff *skb, struct nlmsghdr *nlh,
                              struct netlink_ext_ack *extack)
{
        struct net *target;

        if (!nlmsg_data(nlh))
                return -EINVAL;

        /*
         * Verify that we are allowed to send messages to the target
         * network namespace. The caller must have CAP_SYS_ADMIN in the
         * owning user namespace of the target network namespace.
         */
        target = sock_net(NETLINK_CB(skb).sk);
        if (netlink_ns_capable(skb, target->user_ns, CAP_SYS_ADMIN))
                return uevent_net_broadcast(target->uevent_sock->sk, skb,
                                            extack);

        NL_SET_ERR_MSG(extack, "missing CAP_SYS_ADMIN capability");
        return -EPERM;
}

/*
 * Input callback of the uevent netlink socket (installed through
 * netlink_kernel_cfg.input in uevent_net_init()): hands @skb to
 * netlink_rcv_skb() with uevent_net_rcv_skb() as the per-message handler.
 */
static void uevent_net_rcv(struct sk_buff *skb)
{
        netlink_rcv_skb(skb, &uevent_net_rcv_skb);
}

/*
 * Per-network-namespace init: create the NETLINK_KOBJECT_UEVENT kernel
 * socket for @net and store it in net->uevent_sock.
 *
 * The socket is added to uevent_sock_list only when the network namespace
 * is owned by init_user_ns.
 *
 * Returns 0 on success, -ENOMEM if the bookkeeping structure cannot be
 * allocated, or -ENODEV if the netlink socket cannot be created.
 */
static int uevent_net_init(struct net *net)
{
        struct netlink_kernel_cfg cfg = {
                .groups        = 1,
                .input = uevent_net_rcv,
                .flags        = NL_CFG_F_NONROOT_RECV
        };
        struct uevent_sock *entry;

        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->sk = netlink_kernel_create(net, NETLINK_KOBJECT_UEVENT, &cfg);
        if (!entry->sk) {
                pr_err("kobject_uevent: unable to create netlink socket!\n");
                kfree(entry);
                return -ENODEV;
        }

        net->uevent_sock = entry;

        /* Restrict uevents to initial user namespace. */
        if (sock_net(entry->sk)->user_ns != &init_user_ns)
                return 0;

        mutex_lock(&uevent_sock_mutex);
        list_add_tail(&entry->list, &uevent_sock_list);
        mutex_unlock(&uevent_sock_mutex);

        return 0;
}

/*
 * Per-network-namespace exit: undo uevent_net_init().
 *
 * The entry is unlinked from uevent_sock_list under the same condition
 * that put it there (namespace owned by init_user_ns). The socket is then
 * released and the entry freed.
 */
static void uevent_net_exit(struct net *net)
{
        struct uevent_sock *entry = net->uevent_sock;

        if (sock_net(entry->sk)->user_ns == &init_user_ns) {
                mutex_lock(&uevent_sock_mutex);
                list_del(&entry->list);
                mutex_unlock(&uevent_sock_mutex);
        }

        netlink_kernel_release(entry->sk);
        kfree(entry);
}

/*
 * Per-network-namespace hooks for the uevent socket, registered by
 * kobject_uevent_init().
 */
static struct pernet_operations uevent_net_ops = {
        .init        = uevent_net_init,
        .exit        = uevent_net_exit,
};

/*
 * Register the per-network-namespace uevent socket operations.
 * Returns whatever register_pernet_subsys() returns.
 */
static int __init kobject_uevent_init(void)
{
        return register_pernet_subsys(&uevent_net_ops);
}


postcore_initcall(kobject_uevent_init);
#endif














    3 





    3 



    3 





































    3 


























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
// SPDX-License-Identifier: GPL-2.0-or-later
/* Copyright Amazon.com Inc. or its affiliates. */

#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/netdev_lock.h>
#include <net/netns/generic.h>

/**
 * netdev_debug_event - netdevice notifier that asserts the expected locking
 * @nb: notifier block this callback was invoked through (not used here)
 * @event: notification code, interpreted as an enum netdev_cmd
 * @ptr: notifier info; the affected net_device is obtained from it
 *
 * For NETDEV_REGISTER, NETDEV_UP and NETDEV_CHANGE the callback first
 * calls netdev_ops_assert_locked() on the device and then, like every
 * other listed event except NETDEV_CHANGENAME, ASSERT_RTNL().
 * NETDEV_CHANGENAME uses ASSERT_RTNL_NET() on the device's network
 * namespace instead.
 *
 * Always returns NOTIFY_DONE.
 */
int netdev_debug_event(struct notifier_block *nb, unsigned long event,
                       void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        enum netdev_cmd cmd = event;

        /* Keep enum and don't add default to trigger -Werror=switch */
        switch (cmd) {
        case NETDEV_REGISTER:
        case NETDEV_UP:
        case NETDEV_CHANGE:
                /* these three additionally assert the device ops lock */
                netdev_ops_assert_locked(dev);
                fallthrough;
        case NETDEV_DOWN:
        case NETDEV_REBOOT:
        case NETDEV_UNREGISTER:
        case NETDEV_CHANGEMTU:
        case NETDEV_CHANGEADDR:
        case NETDEV_PRE_CHANGEADDR:
        case NETDEV_GOING_DOWN:
        case NETDEV_FEAT_CHANGE:
        case NETDEV_BONDING_FAILOVER:
        case NETDEV_PRE_UP:
        case NETDEV_PRE_TYPE_CHANGE:
        case NETDEV_POST_TYPE_CHANGE:
        case NETDEV_POST_INIT:
        case NETDEV_PRE_UNINIT:
        case NETDEV_RELEASE:
        case NETDEV_NOTIFY_PEERS:
        case NETDEV_JOIN:
        case NETDEV_CHANGEUPPER:
        case NETDEV_RESEND_IGMP:
        case NETDEV_PRECHANGEMTU:
        case NETDEV_CHANGEINFODATA:
        case NETDEV_BONDING_INFO:
        case NETDEV_PRECHANGEUPPER:
        case NETDEV_CHANGELOWERSTATE:
        case NETDEV_UDP_TUNNEL_PUSH_INFO:
        case NETDEV_UDP_TUNNEL_DROP_INFO:
        case NETDEV_CHANGE_TX_QUEUE_LEN:
        case NETDEV_CVLAN_FILTER_PUSH_INFO:
        case NETDEV_CVLAN_FILTER_DROP_INFO:
        case NETDEV_SVLAN_FILTER_PUSH_INFO:
        case NETDEV_SVLAN_FILTER_DROP_INFO:
        case NETDEV_OFFLOAD_XSTATS_ENABLE:
        case NETDEV_OFFLOAD_XSTATS_DISABLE:
        case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
        case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
        case NETDEV_XDP_FEAT_CHANGE:
                ASSERT_RTNL();
                break;

        case NETDEV_CHANGENAME:
                /* the only event checked against the per-netns variant */
                ASSERT_RTNL_NET(net);
                break;
        }

        return NOTIFY_DONE;
}
EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL");

static int rtnl_net_debug_net_id;

/*
 * Per-netns init: point this namespace's notifier block (its net_generic()
 * slot for rtnl_net_debug_net_id) at netdev_debug_event() and register it
 * as a per-netns netdevice notifier.
 *
 * Returns the result of register_netdevice_notifier_net().
 */
static int __net_init rtnl_net_debug_net_init(struct net *net)
{
        struct notifier_block *notifier =
                net_generic(net, rtnl_net_debug_net_id);

        notifier->notifier_call = netdev_debug_event;

        return register_netdevice_notifier_net(net, notifier);
}

/*
 * Per-netns exit: unregister the notifier block that
 * rtnl_net_debug_net_init() registered for @net.
 */
static void __net_exit rtnl_net_debug_net_exit(struct net *net)
{
        struct notifier_block *notifier =
                net_generic(net, rtnl_net_debug_net_id);

        unregister_netdevice_notifier_net(net, notifier);
}

/*
 * Per-netns operations for the debug notifier. With .id and .size set,
 * each namespace gets a struct notifier_block sized slot, which the
 * init/exit hooks fetch via net_generic(net, rtnl_net_debug_net_id).
 */
static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
        .init = rtnl_net_debug_net_init,
        .exit = rtnl_net_debug_net_exit,
        .id = &rtnl_net_debug_net_id,
        .size = sizeof(struct notifier_block),
};

/*
 * Notifier block registered with register_netdevice_notifier() in
 * rtnl_net_debug_init(), in addition to the per-netns blocks.
 */
static struct notifier_block rtnl_net_debug_block = {
        .notifier_call = netdev_debug_event,
};

/*
 * Register the per-netns operations first, then the netdevice notifier.
 * If the notifier registration fails, the per-netns registration is
 * rolled back.
 *
 * Returns 0 on success or the error from whichever registration failed.
 */
static int __init rtnl_net_debug_init(void)
{
        int err;

        err = register_pernet_subsys(&rtnl_net_debug_net_ops);
        if (err)
                return err;

        err = register_netdevice_notifier(&rtnl_net_debug_block);
        if (!err)
                return 0;

        /* undo the first step so nothing stays half-registered */
        unregister_pernet_subsys(&rtnl_net_debug_net_ops);
        return err;
}

subsys_initcall(rtnl_net_debug_init);
























  442 























































  310 
  242 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RCULIST_BL_H
#define _LINUX_RCULIST_BL_H

/*
 * RCU-protected bl list version. See include/linux/list_bl.h.
 */
#include <linux/list_bl.h>
#include <linux/rcupdate.h>

/*
 * Publish @n as the first node of @h using rcu_assign_pointer().
 *
 * The stored value is @n with the LIST_BL_LOCKMASK bits set. The debug
 * checks require that @n itself has none of those bits set and that the
 * current head value has all of them set.
 */
static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h,
                                        struct hlist_bl_node *n)
{
        unsigned long new_first = (unsigned long)n | LIST_BL_LOCKMASK;

        LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK);
        LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) !=
                                                        LIST_BL_LOCKMASK);

        rcu_assign_pointer(h->first, (struct hlist_bl_node *)new_first);
}

/*
 * Return the first node of @h with the LIST_BL_LOCKMASK bits cleared.
 * The head is read with rcu_dereference_check(), passing
 * hlist_bl_is_locked(h) as the check condition.
 */
static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h)
{
        unsigned long raw;

        raw = (unsigned long)rcu_dereference_check(h->first,
                                                   hlist_bl_is_locked(h));

        return (struct hlist_bl_node *)(raw & ~LIST_BL_LOCKMASK);
}

/**
 * hlist_bl_del_rcu - deletes entry from hash list without re-initialization
 * @n: the element to delete from the hash list.
 *
 * Note: hlist_bl_unhashed() on entry does not return true after this,
 * the entry is in an undefined state. It is useful for RCU based
 * lockfree traversal.
 *
 * In particular, it means that we can not poison the forward
 * pointers that may still be used for walking the hash list.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu().
 */
static inline void hlist_bl_del_rcu(struct hlist_bl_node *n)
{
        __hlist_bl_del(n);
        /* only the back pointer is poisoned; ->next is left intact */
        n->pprev = LIST_POISON2;
}

/**
 * hlist_bl_add_head_rcu - add an element at the head of an hlist_bl
 * @n: the element to add to the hash list.
 * @h: the list to add to.
 *
 * Description:
 * Inserts @n in front of the current first element of @h while
 * permitting racing traversals. @n is fully linked (its ->next and
 * ->pprev set, and the old first element's ->pprev redirected) before it
 * is published as the new head.
 *
 * The caller must take whatever precautions are necessary
 * (such as holding appropriate locks) to avoid racing
 * with another list-mutation primitive, such as hlist_bl_add_head_rcu()
 * or hlist_bl_del_rcu(), running on this same list.
 * However, it is perfectly legal to run concurrently with
 * the _rcu list-traversal primitives, such as
 * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency
 * problems on Alpha CPUs.  Regardless of the type of CPU, the
 * list-traversal primitive must be guarded by rcu_read_lock().
 */
static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n,
                                        struct hlist_bl_head *h)
{
        /* plain hlist_bl_first() is enough here: we're under lock */
        struct hlist_bl_node *old_head = hlist_bl_first(h);

        n->next = old_head;
        if (old_head)
                old_head->pprev = &n->next;
        n->pprev = &h->first;

        /* publish with the _rcu setter: lock free readers may be walking */
        hlist_bl_set_first_rcu(h, n);
}
/**
 * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type
 * @tpos:        the type * to use as a loop cursor.
 * @pos:        the &struct hlist_bl_node to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_bl_node within the struct.
 *
 * The first node is fetched with hlist_bl_first_rcu() and each following
 * one with rcu_dereference_raw(pos->next). On every iteration @tpos is
 * set to the entry containing @pos. The loop ends when @pos becomes NULL.
 * @pos and @tpos are expanded more than once, so pass plain variables.
 */
#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member)                \
        for (pos = hlist_bl_first_rcu(head);                                \
                pos &&                                                        \
                ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \
                pos = rcu_dereference_raw(pos->next))

#endif























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2011, Intel Corporation.
 *
 * Description: Data Center Bridging netlink interface
 * Author: Lucy Liu <lucy.liu@intel.com>
 */

#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/slab.h>
#include <net/netlink.h>
#include <net/rtnetlink.h>
#include <linux/dcbnl.h>
#include <net/dcbevent.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/sock.h>

/* Data Center Bridging (DCB) is a collection of Ethernet enhancements
 * intended to allow network traffic with differing requirements
 * (highly reliable, no drops vs. best effort vs. low latency) to operate
 * and co-exist on Ethernet.  Current DCB features are:
 *
 * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a
 *   framework for assigning bandwidth guarantees to traffic classes.
 *
 * Priority-based Flow Control (PFC) - provides a flow control mechanism which
 *   can work independently for each 802.1p priority.
 *
 * Congestion Notification - provides a mechanism for end-to-end congestion
 *   control for protocols which do not have built-in congestion management.
 *
 * More information about the emerging standards for these Ethernet features
 * can be found at: http://www.ieee802.org/1/pages/dcbridges.html
 *
 * This file implements an rtnetlink interface to allow configuration of DCB
 * features for capable devices.
 */

/**************** DCB attribute policies *************************************/

/* DCB netlink attributes policy.
 *
 * Indexed by the top-level DCB_ATTR_* attribute type; these are the
 * attributes the handlers in this file look up in their @tb argument.
 *
 * NOTE(review): DCB_ATTR_NUMTCS is parsed as a nested attribute by
 * dcbnl_getnumtcs() and dcbnl_setnumtcs() but has no entry here - confirm
 * whether the omission is intentional.
 */
static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
        [DCB_ATTR_IFNAME]      = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
        [DCB_ATTR_STATE]       = {.type = NLA_U8},
        [DCB_ATTR_PFC_CFG]     = {.type = NLA_NESTED},
        [DCB_ATTR_PG_CFG]      = {.type = NLA_NESTED},
        [DCB_ATTR_SET_ALL]     = {.type = NLA_U8},
        [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
        [DCB_ATTR_CAP]         = {.type = NLA_NESTED},
        [DCB_ATTR_PFC_STATE]   = {.type = NLA_U8},
        [DCB_ATTR_BCN]         = {.type = NLA_NESTED},
        [DCB_ATTR_APP]         = {.type = NLA_NESTED},
        [DCB_ATTR_IEEE]               = {.type = NLA_NESTED},
        [DCB_ATTR_DCBX]        = {.type = NLA_U8},
        [DCB_ATTR_FEATCFG]     = {.type = NLA_NESTED},
};

/* DCB priority flow control to User Priority nested attributes.
 *
 * Policy for the attributes nested inside DCB_ATTR_PFC_CFG: one u8 per
 * user priority 0-7, plus a flag attribute that dcbnl_getpfccfg() treats
 * as "report every priority".
 */
static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = {
        [DCB_PFC_UP_ATTR_0]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_1]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_2]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_3]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_4]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_5]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_6]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_7]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG},
};

/* DCB priority grouping nested attributes.
 *
 * Policy for the attributes nested inside DCB_ATTR_PG_CFG. The TC_*
 * entries are themselves nested (parsed with dcbnl_tc_param_nest in
 * __dcbnl_pg_getcfg()); the BW_ID_* entries are plain u8 values. The two
 * *_ALL entries select every traffic class / every bandwidth group.
 */
static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = {
        [DCB_PG_ATTR_TC_0]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_1]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_2]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_3]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_4]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_5]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_6]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_7]      = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_ALL]    = {.type = NLA_NESTED},
        [DCB_PG_ATTR_BW_ID_0]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_1]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_2]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_3]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_4]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_5]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_6]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_7]   = {.type = NLA_U8},
        [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG},
};

/* DCB traffic class nested attributes.
 *
 * Policy for the parameters nested inside each DCB_PG_ATTR_TC_* attribute:
 * four u8 parameters plus a flag that __dcbnl_pg_getcfg() treats as
 * "report all four".
 */
static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = {
        [DCB_TC_ATTR_PARAM_PGID]            = {.type = NLA_U8},
        [DCB_TC_ATTR_PARAM_UP_MAPPING]      = {.type = NLA_U8},
        [DCB_TC_ATTR_PARAM_STRICT_PRIO]     = {.type = NLA_U8},
        [DCB_TC_ATTR_PARAM_BW_PCT]          = {.type = NLA_U8},
        [DCB_TC_ATTR_PARAM_ALL]             = {.type = NLA_FLAG},
};

/* DCB capabilities nested attributes.
 *
 * Policy for the attributes nested inside DCB_ATTR_CAP (see
 * dcbnl_getcap()): one u8 per capability, plus a flag that selects all of
 * them.
 */
static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = {
        [DCB_CAP_ATTR_ALL]     = {.type = NLA_FLAG},
        [DCB_CAP_ATTR_PG]      = {.type = NLA_U8},
        [DCB_CAP_ATTR_PFC]     = {.type = NLA_U8},
        [DCB_CAP_ATTR_UP2TC]   = {.type = NLA_U8},
        [DCB_CAP_ATTR_PG_TCS]  = {.type = NLA_U8},
        [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8},
        [DCB_CAP_ATTR_GSP]     = {.type = NLA_U8},
        [DCB_CAP_ATTR_BCN]     = {.type = NLA_U8},
        [DCB_CAP_ATTR_DCBX]    = {.type = NLA_U8},
};

/* DCB number of traffic classes nested attributes.
 *
 * Policy for the attributes nested inside DCB_ATTR_NUMTCS (see
 * dcbnl_getnumtcs() and dcbnl_setnumtcs()).
 */
static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = {
        [DCB_NUMTCS_ATTR_ALL]     = {.type = NLA_FLAG},
        [DCB_NUMTCS_ATTR_PG]      = {.type = NLA_U8},
        [DCB_NUMTCS_ATTR_PFC]     = {.type = NLA_U8},
};

/* DCB BCN nested attributes.
 *
 * Per-priority RP_0..RP_7 entries are u8, the remaining parameters are
 * u32, and the two *_ALL entries are flags.
 * NOTE(review): the parser that consumes this policy is outside this
 * chunk; presumably it is applied to DCB_ATTR_BCN - confirm.
 */
static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = {
        [DCB_BCN_ATTR_RP_0]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_1]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_2]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_3]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_4]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_5]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_6]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_7]         = {.type = NLA_U8},
        [DCB_BCN_ATTR_RP_ALL]       = {.type = NLA_FLAG},
        [DCB_BCN_ATTR_BCNA_0]       = {.type = NLA_U32},
        [DCB_BCN_ATTR_BCNA_1]       = {.type = NLA_U32},
        [DCB_BCN_ATTR_ALPHA]        = {.type = NLA_U32},
        [DCB_BCN_ATTR_BETA]         = {.type = NLA_U32},
        [DCB_BCN_ATTR_GD]           = {.type = NLA_U32},
        [DCB_BCN_ATTR_GI]           = {.type = NLA_U32},
        [DCB_BCN_ATTR_TMAX]         = {.type = NLA_U32},
        [DCB_BCN_ATTR_TD]           = {.type = NLA_U32},
        [DCB_BCN_ATTR_RMIN]         = {.type = NLA_U32},
        [DCB_BCN_ATTR_W]            = {.type = NLA_U32},
        [DCB_BCN_ATTR_RD]           = {.type = NLA_U32},
        [DCB_BCN_ATTR_RU]           = {.type = NLA_U32},
        [DCB_BCN_ATTR_WRTT]         = {.type = NLA_U32},
        [DCB_BCN_ATTR_RI]           = {.type = NLA_U32},
        [DCB_BCN_ATTR_C]            = {.type = NLA_U32},
        [DCB_BCN_ATTR_ALL]          = {.type = NLA_FLAG},
};

/* DCB APP nested attributes.
 *
 * Policy for the attributes nested inside DCB_ATTR_APP (see dcbnl_getapp()
 * and dcbnl_setapp()): id type (u8), id (u16) and priority (u8).
 */
static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = {
        [DCB_APP_ATTR_IDTYPE]       = {.type = NLA_U8},
        [DCB_APP_ATTR_ID]           = {.type = NLA_U16},
        [DCB_APP_ATTR_PRIORITY]     = {.type = NLA_U8},
};

/* IEEE 802.1Qaz nested attributes.
 *
 * Entries that give only .len carry a struct payload of the named type;
 * the two table entries are nested attributes.
 * NOTE(review): presumably .len acts as a minimum payload size for these
 * untyped entries - confirm against the nla_policy documentation. The
 * parser that consumes this policy is outside this chunk.
 */
static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
        [DCB_ATTR_IEEE_ETS]            = {.len = sizeof(struct ieee_ets)},
        [DCB_ATTR_IEEE_PFC]            = {.len = sizeof(struct ieee_pfc)},
        [DCB_ATTR_IEEE_APP_TABLE]   = {.type = NLA_NESTED},
        [DCB_ATTR_IEEE_MAXRATE]   = {.len = sizeof(struct ieee_maxrate)},
        [DCB_ATTR_IEEE_QCN]         = {.len = sizeof(struct ieee_qcn)},
        [DCB_ATTR_IEEE_QCN_STATS]   = {.len = sizeof(struct ieee_qcn_stats)},
        [DCB_ATTR_DCB_BUFFER]       = {.len = sizeof(struct dcbnl_buffer)},
        [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED},
};

/* DCB feature configuration nested attributes.
 *
 * One u8 per feature (PG, PFC, APP) plus a flag entry for "all".
 * NOTE(review): the parser that consumes this policy is outside this
 * chunk; presumably it is applied to DCB_ATTR_FEATCFG - confirm.
 */
static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = {
        [DCB_FEATCFG_ATTR_ALL]      = {.type = NLA_FLAG},
        [DCB_FEATCFG_ATTR_PG]       = {.type = NLA_U8},
        [DCB_FEATCFG_ATTR_PFC]      = {.type = NLA_U8},
        [DCB_FEATCFG_ATTR_APP]      = {.type = NLA_U8},
};

/* File-scope list heads and a spinlock.
 * NOTE(review): their users are outside this chunk; dcb_lock presumably
 * serialises access to both lists - confirm against the list accessors.
 */
static LIST_HEAD(dcb_app_list);
static LIST_HEAD(dcb_rewr_list);
static DEFINE_SPINLOCK(dcb_lock);

/* Map an APP selector value to the netlink attribute type used to carry it.
 *
 * Returns DCB_ATTR_DCB_APP for the PCP selector, DCB_ATTR_IEEE_APP for the
 * IEEE 802.1Qaz selectors, and DCB_ATTR_IEEE_APP_UNSPEC for anything else.
 */
static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector)
{
        if (selector == DCB_APP_SEL_PCP)
                return DCB_ATTR_DCB_APP;

        if (selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE ||
            selector == IEEE_8021QAZ_APP_SEL_STREAM ||
            selector == IEEE_8021QAZ_APP_SEL_DGRAM ||
            selector == IEEE_8021QAZ_APP_SEL_ANY ||
            selector == IEEE_8021QAZ_APP_SEL_DSCP)
                return DCB_ATTR_IEEE_APP;

        return DCB_ATTR_IEEE_APP_UNSPEC;
}

/* Return true if @type is one of the two attribute types that can carry an
 * APP entry (DCB_ATTR_IEEE_APP or DCB_ATTR_DCB_APP), false otherwise.
 */
static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type)
{
        return type == DCB_ATTR_IEEE_APP || type == DCB_ATTR_DCB_APP;
}

/* Return true if @selector belongs in an attribute of type @type, i.e. if
 * dcbnl_app_attr_type_get() maps the selector to exactly that type.
 */
static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector)
{
        enum ieee_attrs_app expected = dcbnl_app_attr_type_get(selector);

        return expected == type;
}

/* Allocate a netlink skb and start a DCB message in it.
 *
 * @type:  netlink message type for the header
 * @cmd:   DCB command stored in the struct dcbmsg payload
 * @port:  destination port id for the header
 * @seq:   sequence number for the header
 * @flags: netlink header flags
 * @nlhp:  if non-NULL, receives a pointer to the new message header
 *
 * Returns the new skb, or NULL if allocation fails or the header cannot be
 * added. Callers already have to cope with a NULL return for the
 * allocation failure case.
 */
static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq,
                                    u32 flags, struct nlmsghdr **nlhp)
{
        struct sk_buff *skb;
        struct dcbmsg *dcb;
        struct nlmsghdr *nlh;

        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return NULL;

        nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags);
        /* Not expected on a freshly allocated default-size skb; warn and
         * fail the request instead of taking the whole machine down.
         */
        if (WARN_ON_ONCE(!nlh)) {
                nlmsg_free(skb);
                return NULL;
        }

        dcb = nlmsg_data(nlh);
        dcb->dcb_family = AF_UNSPEC;
        dcb->cmd = cmd;
        dcb->dcb_pad = 0;

        if (nlhp)
                *nlhp = nlh;

        return skb;
}

/* Report the driver's DCB state as a DCB_ATTR_STATE u8 in @skb.
 *
 * @nlh, @seq and @tb are not used here. Returns -EOPNOTSUPP if the driver
 * has no getstate op, otherwise the result of nla_put_u8().
 */
static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh,
                          u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        u8 state;

        if (!netdev->dcbnl_ops->getstate)
                return -EOPNOTSUPP;

        state = netdev->dcbnl_ops->getstate(netdev);

        return nla_put_u8(skb, DCB_ATTR_STATE, state);
}

/* Report per-priority PFC settings inside a DCB_ATTR_PFC_CFG nest in @skb.
 *
 * Only the priorities named in the request are reported, unless the
 * request carries DCB_PFC_UP_ATTR_ALL, in which case all eight are.
 *
 * Returns 0 on success, -EINVAL if the request has no DCB_ATTR_PFC_CFG,
 * -EOPNOTSUPP if the driver has no getpfccfg op, -EMSGSIZE if the nest
 * cannot be started, or the error from parsing / nla_put_u8().
 */
static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
                           u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *up_tb[DCB_PFC_UP_ATTR_MAX + 1];
        struct nlattr *pfc_nest;
        bool want_all;
        int attr;
        int err;
        u8 setting;

        if (!tb[DCB_ATTR_PFC_CFG])
                return -EINVAL;

        if (!netdev->dcbnl_ops->getpfccfg)
                return -EOPNOTSUPP;

        err = nla_parse_nested_deprecated(up_tb, DCB_PFC_UP_ATTR_MAX,
                                          tb[DCB_ATTR_PFC_CFG],
                                          dcbnl_pfc_up_nest, NULL);
        if (err)
                return err;

        pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG);
        if (!pfc_nest)
                return -EMSGSIZE;

        want_all = up_tb[DCB_PFC_UP_ATTR_ALL] != NULL;

        for (attr = DCB_PFC_UP_ATTR_0; attr <= DCB_PFC_UP_ATTR_7; attr++) {
                if (want_all || up_tb[attr]) {
                        /* The driver is addressed by priority number, the
                         * reply by attribute type.
                         */
                        netdev->dcbnl_ops->getpfccfg(netdev,
                                                     attr - DCB_PFC_UP_ATTR_0,
                                                     &setting);
                        err = nla_put_u8(skb, attr, setting);
                        if (err)
                                goto cancel;
                }
        }

        nla_nest_end(skb, pfc_nest);
        return 0;

cancel:
        nla_nest_cancel(skb, pfc_nest);
        return err;
}

/* Report the driver's permanent hardware address as DCB_ATTR_PERM_HWADDR.
 *
 * The whole MAX_ADDR_LEN buffer is sent; it is zeroed first so bytes the
 * driver does not write go out as zeros. Returns -EOPNOTSUPP if the
 * driver has no getpermhwaddr op, otherwise the result of nla_put().
 */
static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh,
                                u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        u8 perm_addr[MAX_ADDR_LEN] = { 0 };

        if (!netdev->dcbnl_ops->getpermhwaddr)
                return -EOPNOTSUPP;

        netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr);

        return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr);
}

/* Report driver capabilities inside a DCB_ATTR_CAP nest in @skb.
 *
 * Only the capabilities named in the request are reported, unless the
 * request carries DCB_CAP_ATTR_ALL. A capability for which the driver's
 * getcap op returns non-zero is silently left out of the reply.
 *
 * Returns 0 on success, -EINVAL if the request has no DCB_ATTR_CAP,
 * -EOPNOTSUPP if the driver has no getcap op, -EMSGSIZE if the nest cannot
 * be started, or the error from parsing / nla_put_u8().
 */
static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh,
                        u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *cap_tb[DCB_CAP_ATTR_MAX + 1];
        struct nlattr *cap_nest;
        bool want_all;
        int attr;
        int err;
        u8 cap;

        if (!tb[DCB_ATTR_CAP])
                return -EINVAL;

        if (!netdev->dcbnl_ops->getcap)
                return -EOPNOTSUPP;

        err = nla_parse_nested_deprecated(cap_tb, DCB_CAP_ATTR_MAX,
                                          tb[DCB_ATTR_CAP], dcbnl_cap_nest,
                                          NULL);
        if (err)
                return err;

        cap_nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP);
        if (!cap_nest)
                return -EMSGSIZE;

        want_all = cap_tb[DCB_CAP_ATTR_ALL] != NULL;

        for (attr = DCB_CAP_ATTR_ALL + 1; attr <= DCB_CAP_ATTR_MAX; attr++) {
                if (!want_all && !cap_tb[attr])
                        continue;

                /* Non-zero from the driver: skip this capability. */
                if (netdev->dcbnl_ops->getcap(netdev, attr, &cap))
                        continue;

                err = nla_put_u8(skb, attr, cap);
                if (err) {
                        nla_nest_cancel(skb, cap_nest);
                        return err;
                }
        }

        nla_nest_end(skb, cap_nest);
        return 0;
}

/* Report the number of traffic classes inside a DCB_ATTR_NUMTCS nest.
 *
 * Only the entries named in the request are reported, unless the request
 * carries DCB_NUMTCS_ATTR_ALL.
 *
 * Returns 0 on success, -EINVAL if the request has no DCB_ATTR_NUMTCS or
 * the driver's getnumtcs op fails, -EOPNOTSUPP if the driver has no
 * getnumtcs op, -EMSGSIZE if the nest cannot be started, or the error from
 * parsing / nla_put_u8(). On every failure after the nest was opened the
 * partial nest is cancelled, so @skb is left as it was on entry.
 */
static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
                           u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest;
        u8 value;
        int ret;
        int i;
        int getall = 0;

        if (!tb[DCB_ATTR_NUMTCS])
                return -EINVAL;

        if (!netdev->dcbnl_ops->getnumtcs)
                return -EOPNOTSUPP;

        ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX,
                                          tb[DCB_ATTR_NUMTCS],
                                          dcbnl_numtcs_nest, NULL);
        if (ret)
                return ret;

        nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS);
        if (!nest)
                return -EMSGSIZE;

        if (data[DCB_NUMTCS_ATTR_ALL])
                getall = 1;

        for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) {
                if (!getall && !data[i])
                        continue;

                /* Any driver failure is reported to the caller as -EINVAL. */
                if (netdev->dcbnl_ops->getnumtcs(netdev, i, &value)) {
                        ret = -EINVAL;
                        goto err_cancel;
                }

                ret = nla_put_u8(skb, i, value);
                if (ret)
                        goto err_cancel;
        }
        nla_nest_end(skb, nest);

        return 0;

err_cancel:
        /* Do not leave a half-built nest behind in the reply. */
        nla_nest_cancel(skb, nest);
        return ret;
}

/* Apply the traffic-class counts nested in DCB_ATTR_NUMTCS via the driver.
 *
 * Entries are handed to the driver's setnumtcs op in attribute order,
 * stopping at the first one the driver rejects. The reply carries a
 * DCB_ATTR_NUMTCS u8 that is 0 if every entry was accepted and 1 otherwise.
 *
 * Returns -EINVAL if the request has no DCB_ATTR_NUMTCS, -EOPNOTSUPP if
 * the driver has no setnumtcs op, the parse error if the nest is
 * malformed, otherwise the result of nla_put_u8().
 */
static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
                           u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *numtcs_tb[DCB_NUMTCS_ATTR_MAX + 1];
        int attr;
        int status;

        if (!tb[DCB_ATTR_NUMTCS])
                return -EINVAL;

        if (!netdev->dcbnl_ops->setnumtcs)
                return -EOPNOTSUPP;

        status = nla_parse_nested_deprecated(numtcs_tb, DCB_NUMTCS_ATTR_MAX,
                                             tb[DCB_ATTR_NUMTCS],
                                             dcbnl_numtcs_nest, NULL);
        if (status)
                return status;

        /* status is 0 here; the loop ends as soon as the driver fails. */
        for (attr = DCB_NUMTCS_ATTR_ALL + 1;
             attr <= DCB_NUMTCS_ATTR_MAX && !status; attr++) {
                if (!numtcs_tb[attr])
                        continue;

                status = netdev->dcbnl_ops->setnumtcs(netdev, attr,
                                        nla_get_u8(numtcs_tb[attr]));
        }

        return nla_put_u8(skb, DCB_ATTR_NUMTCS, status != 0);
}

/* Report the driver's PFC state as a DCB_ATTR_PFC_STATE u8 in @skb.
 *
 * Returns -EOPNOTSUPP if the driver has no getpfcstate op, otherwise the
 * result of nla_put_u8().
 */
static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        u8 state;

        if (!netdev->dcbnl_ops->getpfcstate)
                return -EOPNOTSUPP;

        state = netdev->dcbnl_ops->getpfcstate(netdev);

        return nla_put_u8(skb, DCB_ATTR_PFC_STATE, state);
}

/* Pass the requested PFC state to the driver and acknowledge it.
 *
 * The driver op returns nothing, so the reply always carries a
 * DCB_ATTR_PFC_STATE u8 of 0.
 *
 * Returns -EINVAL if the request has no DCB_ATTR_PFC_STATE, -EOPNOTSUPP if
 * the driver has no setpfcstate op, otherwise the result of nla_put_u8().
 */
static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct nlattr *state_attr = tb[DCB_ATTR_PFC_STATE];

        if (!state_attr)
                return -EINVAL;

        if (!netdev->dcbnl_ops->setpfcstate)
                return -EOPNOTSUPP;

        netdev->dcbnl_ops->setpfcstate(netdev, nla_get_u8(state_attr));

        return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0);
}

/* Look up the priority for an application and report it in @skb.
 *
 * The request must carry DCB_ATTR_APP with both an id type (ethertype or
 * port number) and an id. The priority comes from the driver's getapp op
 * if it has one, otherwise from dcb_getapp(). The reply is a DCB_ATTR_APP
 * nest echoing the id type and id, followed by the priority.
 *
 * Returns 0 on success, -EINVAL for a missing or invalid request field, a
 * negative value from the driver's getapp op, -EMSGSIZE if the nest cannot
 * be started, or the error from parsing / nla_put_*().
 */
static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh,
                        u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
        struct nlattr *app_nest;
        u8 idtype, up;
        u16 id;
        int ret;

        if (!tb[DCB_ATTR_APP])
                return -EINVAL;

        ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX,
                                          tb[DCB_ATTR_APP], dcbnl_app_nest,
                                          NULL);
        if (ret)
                return ret;

        /* Both the id type and the id are mandatory for a lookup. */
        if (!app_tb[DCB_APP_ATTR_IDTYPE] || !app_tb[DCB_APP_ATTR_ID])
                return -EINVAL;

        /* Only lookups by ethertype or by port number are accepted. */
        idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
        if (!(idtype == DCB_APP_IDTYPE_ETHTYPE ||
              idtype == DCB_APP_IDTYPE_PORTNUM))
                return -EINVAL;

        id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);

        if (!netdev->dcbnl_ops->getapp) {
                struct dcb_app app = {
                        .selector = idtype,
                        .protocol = id,
                };

                up = dcb_getapp(netdev, &app);
        } else {
                ret = netdev->dcbnl_ops->getapp(netdev, idtype, id);
                if (ret < 0)
                        return ret;
                up = ret;
        }

        app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP);
        if (!app_nest)
                return -EMSGSIZE;

        ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype);
        if (!ret)
                ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id);
        if (!ret)
                ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up);
        if (ret) {
                nla_nest_cancel(skb, app_nest);
                return ret;
        }

        nla_nest_end(skb, app_nest);
        return 0;
}

/* Set the priority for an application and acknowledge it in @skb.
 *
 * The request must carry DCB_ATTR_APP with an id type (ethertype or port
 * number), an id and a priority. The entry is set through the driver's
 * setapp op if it has one, otherwise through dcb_setapp(). The resulting
 * status is put into the reply as a DCB_ATTR_APP u8 and a CEE notification
 * is sent for DCB_CMD_SAPP.
 *
 * Returns -EINVAL for a missing or invalid request field, a negative
 * value from the driver's setapp op, the parse error if the nest is
 * malformed, otherwise the result of nla_put_u8().
 *
 * NOTE(review): a negative return from dcb_setapp() is not returned to the
 * caller but stored in the u8 status attribute - confirm this is intended.
 */
static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh,
                        u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1];
        u8 idtype, up;
        u16 id;
        int status;
        int err;

        if (!tb[DCB_ATTR_APP])
                return -EINVAL;

        err = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX,
                                          tb[DCB_ATTR_APP], dcbnl_app_nest,
                                          NULL);
        if (err)
                return err;

        /* Id type, id and priority are all mandatory. */
        if (!app_tb[DCB_APP_ATTR_IDTYPE] ||
            !app_tb[DCB_APP_ATTR_ID] ||
            !app_tb[DCB_APP_ATTR_PRIORITY])
                return -EINVAL;

        /* Only entries keyed by ethertype or by port number are accepted. */
        idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]);
        if (!(idtype == DCB_APP_IDTYPE_ETHTYPE ||
              idtype == DCB_APP_IDTYPE_PORTNUM))
                return -EINVAL;

        id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]);
        up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]);

        if (!netdev->dcbnl_ops->setapp) {
                struct dcb_app app = {
                        .selector = idtype,
                        .protocol = id,
                        .priority = up,
                };

                status = dcb_setapp(netdev, &app);
        } else {
                status = netdev->dcbnl_ops->setapp(netdev, idtype, id, up);
                if (status < 0)
                        return status;
        }

        err = nla_put_u8(skb, DCB_ATTR_APP, status);
        dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0);

        return err;
}

/* Report priority-group configuration inside a DCB_ATTR_PG_CFG nest.
 *
 * @dir selects the direction: non-zero queries the driver's Rx ops, zero
 * its Tx ops.
 *
 * The reply has two parts:
 *  - one nested attribute per requested traffic class (or all eight if the
 *    request carries DCB_PG_ATTR_TC_ALL), holding whichever of PGID,
 *    UP_MAPPING, STRICT_PRIO and BW_PCT the request asked for (or all four
 *    if it carries DCB_TC_ATTR_PARAM_ALL);
 *  - one u8 per requested bandwidth group (or all eight if the request
 *    carries DCB_PG_ATTR_BW_ID_ALL).
 *
 * Values the driver does not fill in are reported as
 * DCB_ATTR_VALUE_UNDEFINED.
 *
 * Returns 0 on success, -EINVAL if the request has no DCB_ATTR_PG_CFG,
 * -EOPNOTSUPP if the driver lacks any of the four PG getter ops,
 * -EMSGSIZE if a nest cannot be started, or the error from parsing /
 * nla_put_u8(). On failure the partially built nests are cancelled.
 */
static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                             struct nlattr **tb, struct sk_buff *skb, int dir)
{
        struct nlattr *pg_nest, *param_nest, *data;
        struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
        struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
        u8 prio, pgid, tc_pct, up_map;
        int ret;
        int getall = 0;
        int i;

        if (!tb[DCB_ATTR_PG_CFG])
                return -EINVAL;

        if (!netdev->dcbnl_ops->getpgtccfgtx ||
            !netdev->dcbnl_ops->getpgtccfgrx ||
            !netdev->dcbnl_ops->getpgbwgcfgtx ||
            !netdev->dcbnl_ops->getpgbwgcfgrx)
                return -EOPNOTSUPP;

        ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX,
                                          tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest,
                                          NULL);
        if (ret)
                return ret;

        pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG);
        if (!pg_nest)
                return -EMSGSIZE;

        if (pg_tb[DCB_PG_ATTR_TC_ALL])
                getall = 1;

        for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
                if (!getall && !pg_tb[i])
                        continue;

                /* With TC_ALL, its parameter list applies to every class. */
                if (pg_tb[DCB_PG_ATTR_TC_ALL])
                        data = pg_tb[DCB_PG_ATTR_TC_ALL];
                else
                        data = pg_tb[i];
                ret = nla_parse_nested_deprecated(param_tb,
                                                  DCB_TC_ATTR_PARAM_MAX, data,
                                                  dcbnl_tc_param_nest, NULL);
                /* Hand the parse error itself back to the caller. */
                if (ret)
                        goto err_pg;

                param_nest = nla_nest_start_noflag(skb, i);
                if (!param_nest) {
                        ret = -EMSGSIZE;
                        goto err_pg;
                }

                pgid = DCB_ATTR_VALUE_UNDEFINED;
                prio = DCB_ATTR_VALUE_UNDEFINED;
                tc_pct = DCB_ATTR_VALUE_UNDEFINED;
                up_map = DCB_ATTR_VALUE_UNDEFINED;

                if (dir) {
                        /* Rx */
                        netdev->dcbnl_ops->getpgtccfgrx(netdev,
                                                i - DCB_PG_ATTR_TC_0, &prio,
                                                &pgid, &tc_pct, &up_map);
                } else {
                        /* Tx */
                        netdev->dcbnl_ops->getpgtccfgtx(netdev,
                                                i - DCB_PG_ATTR_TC_0, &prio,
                                                &pgid, &tc_pct, &up_map);
                }

                if (param_tb[DCB_TC_ATTR_PARAM_PGID] ||
                    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
                        ret = nla_put_u8(skb,
                                         DCB_TC_ATTR_PARAM_PGID, pgid);
                        if (ret)
                                goto err_param;
                }
                if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] ||
                    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
                        ret = nla_put_u8(skb,
                                         DCB_TC_ATTR_PARAM_UP_MAPPING, up_map);
                        if (ret)
                                goto err_param;
                }
                if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] ||
                    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
                        ret = nla_put_u8(skb,
                                         DCB_TC_ATTR_PARAM_STRICT_PRIO, prio);
                        if (ret)
                                goto err_param;
                }
                if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT] ||
                    param_tb[DCB_TC_ATTR_PARAM_ALL]) {
                        ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT,
                                         tc_pct);
                        if (ret)
                                goto err_param;
                }
                nla_nest_end(skb, param_nest);
        }

        if (pg_tb[DCB_PG_ATTR_BW_ID_ALL])
                getall = 1;
        else
                getall = 0;

        for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) {
                if (!getall && !pg_tb[i])
                        continue;

                tc_pct = DCB_ATTR_VALUE_UNDEFINED;

                if (dir) {
                        /* Rx */
                        netdev->dcbnl_ops->getpgbwgcfgrx(netdev,
                                        i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
                } else {
                        /* Tx */
                        netdev->dcbnl_ops->getpgbwgcfgtx(netdev,
                                        i - DCB_PG_ATTR_BW_ID_0, &tc_pct);
                }
                ret = nla_put_u8(skb, i, tc_pct);
                if (ret)
                        goto err_pg;
        }

        nla_nest_end(skb, pg_nest);

        return 0;

err_param:
        nla_nest_cancel(skb, param_nest);
err_pg:
        nla_nest_cancel(skb, pg_nest);

        return ret;
}

/* Priority-group "get" handler for the Tx direction (dir == 0). */
static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const int dir = 0;      /* Tx */

        return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, dir);
}

/* Priority-group "get" handler for the Rx direction (dir == 1). */
static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const int dir = 1;      /* Rx */

        return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, dir);
}

/*
 * Hand the requested DCB_ATTR_STATE value to the driver's setstate hook and
 * write the hook's return value into @skb as DCB_ATTR_STATE.
 *
 * Returns -EINVAL when the attribute is missing, -EOPNOTSUPP when the driver
 * has no setstate hook, otherwise the result of nla_put_u8().
 */
static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh,
                          u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *state_attr = tb[DCB_ATTR_STATE];
        u8 result;

        if (!state_attr)
                return -EINVAL;

        if (!ops->setstate)
                return -EOPNOTSUPP;

        result = ops->setstate(netdev, nla_get_u8(state_attr));

        return nla_put_u8(skb, DCB_ATTR_STATE, result);
}

/*
 * Apply per-priority PFC settings carried in the nested DCB_ATTR_PFC_CFG
 * attribute.
 *
 * Each DCB_PFC_UP_ATTR_0..DCB_PFC_UP_ATTR_7 attribute that is present is
 * read as a u8 and passed to the driver's setpfccfg hook together with its
 * zero-based priority. A DCB_ATTR_PFC_CFG attribute with value 0 is then
 * written to @skb.
 *
 * Returns -EINVAL when DCB_ATTR_PFC_CFG is missing, -EOPNOTSUPP when the
 * driver has no setpfccfg hook, the parse error if the nest is malformed,
 * otherwise the result of nla_put_u8().
 */
static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
                           u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1];
        int i;
        int ret;
        u8 value;

        if (!tb[DCB_ATTR_PFC_CFG])
                return -EINVAL;

        if (!netdev->dcbnl_ops->setpfccfg)
                return -EOPNOTSUPP;

        ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX,
                                          tb[DCB_ATTR_PFC_CFG],
                                          dcbnl_pfc_up_nest, NULL);
        if (ret)
                return ret;

        for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
                if (data[i] == NULL)
                        continue;
                value = nla_get_u8(data[i]);
                /*
                 * Derive the priority from the table index, not from the
                 * raw nla_type field: the raw field may still carry the
                 * NLA_F_* flag bits, which would yield an out-of-range
                 * priority for the driver.
                 */
                netdev->dcbnl_ops->setpfccfg(netdev,
                        i - DCB_PFC_UP_ATTR_0, value);
        }

        return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0);
}

/*
 * Invoke the driver's setall hook, store its return value in @skb as
 * DCB_ATTR_SET_ALL and then send a CEE notification for DCB_CMD_SET_ALL.
 *
 * Returns -EINVAL when DCB_ATTR_SET_ALL is missing, -EOPNOTSUPP when the
 * driver has no setall hook, otherwise the result of nla_put_u8(). The
 * notification is sent regardless of that result.
 */
static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh,
                        u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        u8 status;
        int ret;

        if (!tb[DCB_ATTR_SET_ALL])
                return -EINVAL;

        if (!ops->setall)
                return -EOPNOTSUPP;

        status = ops->setall(netdev);
        ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, status);

        dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0);

        return ret;
}

/*
 * Common priority-group "set" worker for both directions.
 *
 * @dir selects the driver hooks: 0 uses the Tx hooks, non-zero the Rx hooks.
 * For every DCB_PG_ATTR_TC_* attribute present, the nested parameters are
 * parsed and forwarded to setpgtccfg{tx,rx}; parameters that are absent are
 * passed as DCB_ATTR_VALUE_UNDEFINED. For every DCB_PG_ATTR_BW_ID_* attribute
 * present, its u8 value is forwarded to setpgbwgcfg{tx,rx}.
 *
 * Returns -EINVAL when DCB_ATTR_PG_CFG is missing, -EOPNOTSUPP unless all
 * four driver hooks exist, a parse error for malformed nests, otherwise the
 * result of writing DCB_ATTR_PG_CFG = 0 to @skb.
 */
static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb,
                             int dir)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1];
        struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1];
        int attr;
        int err;

        if (!tb[DCB_ATTR_PG_CFG])
                return -EINVAL;

        if (!(ops->setpgtccfgtx && ops->setpgtccfgrx &&
              ops->setpgbwgcfgtx && ops->setpgbwgcfgrx))
                return -EOPNOTSUPP;

        err = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX,
                                          tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest,
                                          NULL);
        if (err)
                return err;

        /* Per traffic class parameters. */
        for (attr = DCB_PG_ATTR_TC_0; attr <= DCB_PG_ATTR_TC_7; attr++) {
                int tc = attr - DCB_PG_ATTR_TC_0;
                u8 prio = DCB_ATTR_VALUE_UNDEFINED;
                u8 pgid = DCB_ATTR_VALUE_UNDEFINED;
                u8 bw_pct = DCB_ATTR_VALUE_UNDEFINED;
                u8 up_map = DCB_ATTR_VALUE_UNDEFINED;

                if (!pg_tb[attr])
                        continue;

                err = nla_parse_nested_deprecated(param_tb,
                                                  DCB_TC_ATTR_PARAM_MAX,
                                                  pg_tb[attr],
                                                  dcbnl_tc_param_nest, NULL);
                if (err)
                        return err;

                if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO])
                        prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]);
                if (param_tb[DCB_TC_ATTR_PARAM_PGID])
                        pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]);
                if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT])
                        bw_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]);
                if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING])
                        up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]);

                /* dir: Tx = 0, Rx = 1 */
                if (dir)
                        ops->setpgtccfgrx(netdev, tc,
                                          prio, pgid, bw_pct, up_map);
                else
                        ops->setpgtccfgtx(netdev, tc,
                                          prio, pgid, bw_pct, up_map);
        }

        /* Per bandwidth group percentages. */
        for (attr = DCB_PG_ATTR_BW_ID_0; attr <= DCB_PG_ATTR_BW_ID_7; attr++) {
                int bwg = attr - DCB_PG_ATTR_BW_ID_0;
                u8 bw_pct;

                if (!pg_tb[attr])
                        continue;

                bw_pct = nla_get_u8(pg_tb[attr]);

                /* dir: Tx = 0, Rx = 1 */
                if (dir)
                        ops->setpgbwgcfgrx(netdev, bwg, bw_pct);
                else
                        ops->setpgbwgcfgtx(netdev, bwg, bw_pct);
        }

        return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0);
}

/* Priority-group "set" handler for the Tx direction (dir == 0). */
static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const int dir = 0;      /* Tx */

        return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, dir);
}

/* Priority-group "set" handler for the Rx direction (dir == 1). */
static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                             u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const int dir = 1;      /* Rx */

        return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, dir);
}

/*
 * Fill a DCB_ATTR_BCN nest in @skb with the BCN values requested in
 * tb[DCB_ATTR_BCN].
 *
 * DCB_BCN_ATTR_RP_0..RP_7 are fetched through getbcnrp and emitted as u8;
 * DCB_BCN_ATTR_BCNA_0..DCB_BCN_ATTR_RI are fetched through getbcncfg and
 * emitted as u32. An attribute is reported when it appears in the request
 * or when DCB_BCN_ATTR_ALL is present.
 *
 * Returns -EINVAL when DCB_ATTR_BCN is missing, -EOPNOTSUPP unless both
 * driver hooks exist, a parse error for a malformed nest, -EMSGSIZE when the
 * nest cannot be opened, or the nla_put error (after cancelling the nest).
 */
static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                            u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1];
        struct nlattr *nest;
        bool want_all;
        u32 cfg_val;
        u8 rp_val;
        int attr;
        int err;

        if (!tb[DCB_ATTR_BCN])
                return -EINVAL;

        if (!(ops->getbcnrp && ops->getbcncfg))
                return -EOPNOTSUPP;

        err = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX,
                                          tb[DCB_ATTR_BCN], dcbnl_bcn_nest,
                                          NULL);
        if (err)
                return err;

        nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN);
        if (!nest)
                return -EMSGSIZE;

        want_all = bcn_tb[DCB_BCN_ATTR_ALL] != NULL;

        /* Per-priority values, one byte each. */
        for (attr = DCB_BCN_ATTR_RP_0; attr <= DCB_BCN_ATTR_RP_7; attr++) {
                if (want_all || bcn_tb[attr]) {
                        ops->getbcnrp(netdev, attr - DCB_BCN_ATTR_RP_0,
                                      &rp_val);
                        err = nla_put_u8(skb, attr, rp_val);
                        if (err)
                                goto cancel;
                }
        }

        /* Remaining BCN values, 32 bits each, keyed by attribute id. */
        for (attr = DCB_BCN_ATTR_BCNA_0; attr <= DCB_BCN_ATTR_RI; attr++) {
                if (want_all || bcn_tb[attr]) {
                        ops->getbcncfg(netdev, attr, &cfg_val);
                        err = nla_put_u32(skb, attr, cfg_val);
                        if (err)
                                goto cancel;
                }
        }

        nla_nest_end(skb, nest);
        return 0;

cancel:
        nla_nest_cancel(skb, nest);
        return err;
}

/*
 * Apply BCN settings carried in the nested DCB_ATTR_BCN attribute.
 *
 * Present DCB_BCN_ATTR_RP_0..RP_7 attributes are read as u8 and passed to
 * the driver's setbcnrp hook with their zero-based priority; present
 * DCB_BCN_ATTR_BCNA_0..DCB_BCN_ATTR_RI attributes are read as u32 and passed
 * to setbcncfg keyed by attribute id. A DCB_ATTR_BCN attribute with value 0
 * is then written to @skb.
 *
 * Returns -EINVAL when DCB_ATTR_BCN is missing, -EOPNOTSUPP unless both
 * driver hooks exist, the parse error if the nest is malformed, otherwise
 * the result of nla_put_u8().
 */
static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                            u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *data[DCB_BCN_ATTR_MAX + 1];
        int i;
        int ret;
        u8 value_byte;
        u32 value_int;

        if (!tb[DCB_ATTR_BCN])
                return -EINVAL;

        if (!netdev->dcbnl_ops->setbcncfg ||
            !netdev->dcbnl_ops->setbcnrp)
                return -EOPNOTSUPP;

        ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX,
                                          tb[DCB_ATTR_BCN], dcbnl_bcn_nest,
                                          NULL);
        if (ret)
                return ret;

        for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) {
                if (data[i] == NULL)
                        continue;
                value_byte = nla_get_u8(data[i]);
                /*
                 * Use the table index rather than the raw nla_type field,
                 * which may still contain NLA_F_* flag bits and would then
                 * produce an out-of-range priority for the driver.
                 */
                netdev->dcbnl_ops->setbcnrp(netdev,
                        i - DCB_BCN_ATTR_RP_0, value_byte);
        }

        for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) {
                if (data[i] == NULL)
                        continue;
                value_int = nla_get_u32(data[i]);
                netdev->dcbnl_ops->setbcncfg(netdev,
                                             i, value_int);
        }

        return nla_put_u8(skb, DCB_ATTR_BCN, 0);
}

/*
 * Append the peer application table reported by the driver to @skb.
 *
 * The entries are wrapped in a nest of type @app_nested_type. When
 * @app_info_type is non-zero the peer app info structure is emitted first
 * under that type; each table entry is emitted under @app_entry_type.
 *
 * A failure of either driver hook is not treated as an error: nothing is
 * added and 0 is returned. Returns -ENOMEM when the temporary table cannot
 * be allocated and -EMSGSIZE when @skb has no room.
 */
static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
                                int app_nested_type, int app_info_type,
                                int app_entry_type)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct dcb_peer_app_info info;
        struct dcb_app *table = NULL;
        struct nlattr *nest;
        u16 app_count;
        u16 idx;
        int err;

        /* Ask the driver how many peer entries exist. */
        if (ops->peer_getappinfo(netdev, &info, &app_count))
                return 0;

        if (app_count) {
                table = kmalloc_array(app_count, sizeof(struct dcb_app),
                                      GFP_KERNEL);
                if (!table)
                        return -ENOMEM;

                if (ops->peer_getapptable(netdev, table)) {
                        /* Driver failure: leave the message untouched. */
                        err = 0;
                        goto out;
                }
        }

        /* From here on the only possible failure is lack of skb space. */
        err = -EMSGSIZE;

        nest = nla_nest_start_noflag(skb, app_nested_type);
        if (!nest)
                goto out;

        if (app_info_type &&
            nla_put(skb, app_info_type, sizeof(info), &info))
                goto out;

        for (idx = 0; idx < app_count; idx++)
                if (nla_put(skb, app_entry_type, sizeof(struct dcb_app),
                            &table[idx]))
                        goto out;

        nla_nest_end(skb, nest);
        err = 0;

out:
        kfree(table);
        return err;
}

static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        enum ieee_attrs_app type;
        struct nlattr *apptrust;
        int nselectors, err, i;
        u8 *selectors;

        selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL);
        if (!selectors)
                return -ENOMEM;

        err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors);
        if (err) {
                err = 0;
                goto out;
        }

        apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE);
        if (!apptrust) {
                err = -EMSGSIZE;
                goto out;
        }

        for (i = 0; i < nselectors; i++) {
                type = dcbnl_app_attr_type_get(selectors[i]);
                err = nla_put_u8(skb, type, selectors[i]);
                if (err) {
                        nla_nest_cancel(skb, apptrust);
                        goto out;
                }
        }
        nla_nest_end(skb, apptrust);

out:
        kfree(selectors);
        return err;
}

/* Walk a nested APP or rewrite table and apply @setdel to every entry.
 *
 * Entries whose attribute type fails dcbnl_app_attr_type_validate() are
 * skipped. For the others the payload must be at least a struct dcb_app
 * (-ERANGE otherwise) and its selector must be valid for the attribute type
 * (-EINVAL otherwise). Processing stops at the first error, which is
 * returned; entries already handled are not rolled back.
 */
static int dcbnl_app_table_setdel(struct nlattr *attr,
                                  struct net_device *netdev,
                                  int (*setdel)(struct net_device *dev,
                                                struct dcb_app *app))
{
        struct nlattr *entry;
        int remaining;

        nla_for_each_nested(entry, attr, remaining) {
                enum ieee_attrs_app type = nla_type(entry);
                struct dcb_app *app;
                int err;

                if (!dcbnl_app_attr_type_validate(type))
                        continue;

                if (nla_len(entry) < sizeof(struct dcb_app))
                        return -ERANGE;

                app = nla_data(entry);
                if (!dcbnl_app_selector_validate(type, app->selector))
                        return -EINVAL;

                err = setdel(netdev, app);
                if (err)
                        return err;
        }

        return 0;
}

/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands.
 *
 * Appends DCB_ATTR_IFNAME to @skb, then a DCB_ATTR_IEEE nest containing
 * whatever the driver can report (ETS, max rate, QCN, QCN statistics, PFC,
 * buffer), the local APP and rewrite tables for this device, the app trust
 * table and any peer information. DCB_ATTR_DCBX is appended after the nest
 * has been closed when a non-negative DCBX value was obtained.
 *
 * Optional driver getters that are absent, or that return an error, are
 * skipped without failing the fill.
 *
 * Returns 0 on success, -EMSGSIZE when an attribute cannot be added (also
 * used for any failure of dcbnl_build_peer_app()), or the error returned by
 * dcbnl_getapptrust(). Most error returns leave the open nests in place.
 * NOTE(review): this presumably relies on callers discarding @skb on
 * failure, as dcbnl_notify() does - confirm for other callers.
 */
static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *ieee, *app, *rewr;
        struct dcb_app_type *itr;
        int dcbx;
        int err;

        if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
                return -EMSGSIZE;

        ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE);
        if (!ieee)
                return -EMSGSIZE;

        /*
         * Local configuration: each block zeroes its structure before the
         * driver fills it, so bytes the driver leaves alone are sent as 0.
         */
        if (ops->ieee_getets) {
                struct ieee_ets ets;
                memset(&ets, 0, sizeof(ets));
                err = ops->ieee_getets(netdev, &ets);
                if (!err &&
                    nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets))
                        return -EMSGSIZE;
        }

        if (ops->ieee_getmaxrate) {
                struct ieee_maxrate maxrate;
                memset(&maxrate, 0, sizeof(maxrate));
                err = ops->ieee_getmaxrate(netdev, &maxrate);
                if (!err) {
                        err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE,
                                      sizeof(maxrate), &maxrate);
                        if (err)
                                return -EMSGSIZE;
                }
        }

        if (ops->ieee_getqcn) {
                struct ieee_qcn qcn;

                memset(&qcn, 0, sizeof(qcn));
                err = ops->ieee_getqcn(netdev, &qcn);
                if (!err) {
                        err = nla_put(skb, DCB_ATTR_IEEE_QCN,
                                      sizeof(qcn), &qcn);
                        if (err)
                                return -EMSGSIZE;
                }
        }

        if (ops->ieee_getqcnstats) {
                struct ieee_qcn_stats qcn_stats;

                memset(&qcn_stats, 0, sizeof(qcn_stats));
                err = ops->ieee_getqcnstats(netdev, &qcn_stats);
                if (!err) {
                        err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS,
                                      sizeof(qcn_stats), &qcn_stats);
                        if (err)
                                return -EMSGSIZE;
                }
        }

        if (ops->ieee_getpfc) {
                struct ieee_pfc pfc;
                memset(&pfc, 0, sizeof(pfc));
                err = ops->ieee_getpfc(netdev, &pfc);
                if (!err &&
                    nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc))
                        return -EMSGSIZE;
        }

        if (ops->dcbnl_getbuffer) {
                struct dcbnl_buffer buffer;

                memset(&buffer, 0, sizeof(buffer));
                err = ops->dcbnl_getbuffer(netdev, &buffer);
                if (!err &&
                    nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer))
                        return -EMSGSIZE;
        }

        /* Local APP table: entries of dcb_app_list that belong to netdev. */
        app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE);
        if (!app)
                return -EMSGSIZE;

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(itr, &dcb_app_list, list) {
                if (itr->ifindex == netdev->ifindex) {
                        enum ieee_attrs_app type =
                                dcbnl_app_attr_type_get(itr->app.selector);
                        err = nla_put(skb, type, sizeof(itr->app), &itr->app);
                        if (err) {
                                spin_unlock_bh(&dcb_lock);
                                return -EMSGSIZE;
                        }
                }
        }

        /*
         * The DCBX value is sampled while dcb_lock is still held; it is only
         * written to the message at the very end, outside the IEEE nest.
         */
        if (netdev->dcbnl_ops->getdcbx)
                dcbx = netdev->dcbnl_ops->getdcbx(netdev);
        else
                dcbx = -EOPNOTSUPP;

        spin_unlock_bh(&dcb_lock);
        nla_nest_end(skb, app);

        /*
         * Rewrite table: same walk over dcb_rewr_list. Unlike the nests
         * above this one is opened with nla_nest_start() and is cancelled
         * when an entry does not fit.
         */
        rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE);
        if (!rewr)
                return -EMSGSIZE;

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(itr, &dcb_rewr_list, list) {
                if (itr->ifindex == netdev->ifindex) {
                        enum ieee_attrs_app type =
                                dcbnl_app_attr_type_get(itr->app.selector);
                        err = nla_put(skb, type, sizeof(itr->app), &itr->app);
                        if (err) {
                                spin_unlock_bh(&dcb_lock);
                                nla_nest_cancel(skb, rewr);
                                return -EMSGSIZE;
                        }
                }
        }

        spin_unlock_bh(&dcb_lock);
        nla_nest_end(skb, rewr);

        /* App trust table; its error code is passed through unchanged. */
        if (ops->dcbnl_getapptrust) {
                err = dcbnl_getapptrust(netdev, skb);
                if (err)
                        return err;
        }

        /* get peer info if available */
        if (ops->ieee_peer_getets) {
                struct ieee_ets ets;
                memset(&ets, 0, sizeof(ets));
                err = ops->ieee_peer_getets(netdev, &ets);
                if (!err &&
                    nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets))
                        return -EMSGSIZE;
        }

        if (ops->ieee_peer_getpfc) {
                struct ieee_pfc pfc;
                memset(&pfc, 0, sizeof(pfc));
                err = ops->ieee_peer_getpfc(netdev, &pfc);
                if (!err &&
                    nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc))
                        return -EMSGSIZE;
        }

        /* Any failure here, including -ENOMEM, is reported as -EMSGSIZE. */
        if (ops->peer_getappinfo && ops->peer_getapptable) {
                err = dcbnl_build_peer_app(netdev, skb,
                                           DCB_ATTR_IEEE_PEER_APP,
                                           DCB_ATTR_IEEE_APP_UNSPEC,
                                           DCB_ATTR_IEEE_APP);
                if (err)
                        return -EMSGSIZE;
        }

        nla_nest_end(skb, ieee);
        /* dcbx is negative when the driver has no getdcbx hook. */
        if (dcbx >= 0) {
                err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
                if (err)
                        return -EMSGSIZE;
        }

        return 0;
}

/*
 * Append the CEE priority-group configuration for one direction to @skb.
 *
 * A non-zero @dir produces a DCB_ATTR_CEE_TX_PG nest filled from the Tx
 * driver hooks; zero produces DCB_ATTR_CEE_RX_PG filled from the Rx hooks.
 * The nest holds one sub-nest per DCB_PG_ATTR_TC_* with the four TC
 * parameters, followed by one u8 per DCB_PG_ATTR_BW_ID_*. Values the driver
 * does not overwrite are reported as DCB_ATTR_VALUE_UNDEFINED.
 *
 * Returns 0 on success or -EMSGSIZE when @skb has no room; open nests are
 * not cancelled on failure.
 */
static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev,
                             int dir)
{
        const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
        int nest_type = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG;
        struct nlattr *pg_nest;
        struct nlattr *tc_nest;
        u8 pgid, up_map, prio, bw_pct;
        int attr;

        pg_nest = nla_nest_start_noflag(skb, nest_type);
        if (!pg_nest)
                return -EMSGSIZE;

        for (attr = DCB_PG_ATTR_TC_0; attr <= DCB_PG_ATTR_TC_7; attr++) {
                int tc = attr - DCB_PG_ATTR_TC_0;

                tc_nest = nla_nest_start_noflag(skb, attr);
                if (!tc_nest)
                        return -EMSGSIZE;

                pgid = DCB_ATTR_VALUE_UNDEFINED;
                prio = DCB_ATTR_VALUE_UNDEFINED;
                bw_pct = DCB_ATTR_VALUE_UNDEFINED;
                up_map = DCB_ATTR_VALUE_UNDEFINED;

                if (dir)
                        ops->getpgtccfgtx(dev, tc,
                                          &prio, &pgid, &bw_pct, &up_map);
                else
                        ops->getpgtccfgrx(dev, tc,
                                          &prio, &pgid, &bw_pct, &up_map);

                if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid))
                        return -EMSGSIZE;
                if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map))
                        return -EMSGSIZE;
                if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio))
                        return -EMSGSIZE;
                if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, bw_pct))
                        return -EMSGSIZE;

                nla_nest_end(skb, tc_nest);
        }

        for (attr = DCB_PG_ATTR_BW_ID_0; attr <= DCB_PG_ATTR_BW_ID_7; attr++) {
                int bwg = attr - DCB_PG_ATTR_BW_ID_0;

                bw_pct = DCB_ATTR_VALUE_UNDEFINED;

                if (dir)
                        ops->getpgbwgcfgtx(dev, bwg, &bw_pct);
                else
                        ops->getpgbwgcfgrx(dev, bwg, &bw_pct);

                if (nla_put_u8(skb, attr, bw_pct))
                        return -EMSGSIZE;
        }

        nla_nest_end(skb, pg_nest);
        return 0;
}

/*
 * Fill @skb with the CEE view of @netdev's DCB state.
 *
 * Appends DCB_ATTR_IFNAME, then a DCB_ATTR_CEE nest holding (when the driver
 * provides the matching hooks) the local Tx/Rx priority groups, local PFC
 * settings, the local APP table, feature flags and peer PG/PFC/APP data.
 * DCB_ATTR_DCBX is appended after the nest has been closed when a
 * non-negative DCBX value was obtained.
 *
 * Returns 0 on success. Every failure path ends at nla_put_failure, which
 * overwrites err, so the only error ever returned is -EMSGSIZE. Open nests
 * are not cancelled on failure.
 */
static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
{
        struct nlattr *cee, *app;
        struct dcb_app_type *itr;
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        int dcbx, i, err = -EMSGSIZE;
        u8 value;

        if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
                goto nla_put_failure;
        cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE);
        if (!cee)
                goto nla_put_failure;

        /* local pg */
        /* dir argument: 1 emits the Tx group, 0 the Rx group. */
        if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) {
                err = dcbnl_cee_pg_fill(skb, netdev, 1);
                if (err)
                        goto nla_put_failure;
        }

        if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) {
                err = dcbnl_cee_pg_fill(skb, netdev, 0);
                if (err)
                        goto nla_put_failure;
        }

        /* local pfc */
        if (ops->getpfccfg) {
                struct nlattr *pfc_nest = nla_nest_start_noflag(skb,
                                                                DCB_ATTR_CEE_PFC);

                if (!pfc_nest)
                        goto nla_put_failure;

                /*
                 * NOTE(review): value is not initialised here, so this
                 * assumes getpfccfg always writes it - confirm.
                 */
                for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) {
                        ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value);
                        if (nla_put_u8(skb, i, value))
                                goto nla_put_failure;
                }
                nla_nest_end(skb, pfc_nest);
        }

        /* local app */
        /*
         * dcb_lock is taken before the nest is opened, so every failure in
         * this section must leave through dcb_unlock.
         */
        spin_lock_bh(&dcb_lock);
        app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE);
        if (!app)
                goto dcb_unlock;

        /* One DCB_ATTR_APP nest per dcb_app_list entry of this device. */
        list_for_each_entry(itr, &dcb_app_list, list) {
                if (itr->ifindex == netdev->ifindex) {
                        struct nlattr *app_nest = nla_nest_start_noflag(skb,
                                                                        DCB_ATTR_APP);
                        if (!app_nest)
                                goto dcb_unlock;

                        err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE,
                                         itr->app.selector);
                        if (err)
                                goto dcb_unlock;

                        err = nla_put_u16(skb, DCB_APP_ATTR_ID,
                                          itr->app.protocol);
                        if (err)
                                goto dcb_unlock;

                        err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY,
                                         itr->app.priority);
                        if (err)
                                goto dcb_unlock;

                        nla_nest_end(skb, app_nest);
                }
        }
        nla_nest_end(skb, app);

        /*
         * The DCBX value is sampled while dcb_lock is still held; it is
         * written to the message only after the CEE nest has been closed.
         */
        if (netdev->dcbnl_ops->getdcbx)
                dcbx = netdev->dcbnl_ops->getdcbx(netdev);
        else
                dcbx = -EOPNOTSUPP;

        spin_unlock_bh(&dcb_lock);

        /* features flags */
        if (ops->getfeatcfg) {
                struct nlattr *feat = nla_nest_start_noflag(skb,
                                                            DCB_ATTR_CEE_FEAT);
                if (!feat)
                        goto nla_put_failure;

                /* A feature is emitted only when getfeatcfg returns 0. */
                for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX;
                     i++)
                        if (!ops->getfeatcfg(netdev, i, &value) &&
                            nla_put_u8(skb, i, value))
                                goto nla_put_failure;

                nla_nest_end(skb, feat);
        }

        /* peer info if available */
        /* A driver error only skips the attribute; it is not a failure. */
        if (ops->cee_peer_getpg) {
                struct cee_pg pg;
                memset(&pg, 0, sizeof(pg));
                err = ops->cee_peer_getpg(netdev, &pg);
                if (!err &&
                    nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg))
                        goto nla_put_failure;
        }

        if (ops->cee_peer_getpfc) {
                struct cee_pfc pfc;
                memset(&pfc, 0, sizeof(pfc));
                err = ops->cee_peer_getpfc(netdev, &pfc);
                if (!err &&
                    nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc))
                        goto nla_put_failure;
        }

        /* Any failure here, including -ENOMEM, ends up as -EMSGSIZE. */
        if (ops->peer_getappinfo && ops->peer_getapptable) {
                err = dcbnl_build_peer_app(netdev, skb,
                                           DCB_ATTR_CEE_PEER_APP_TABLE,
                                           DCB_ATTR_CEE_PEER_APP_INFO,
                                           DCB_ATTR_CEE_PEER_APP);
                if (err)
                        goto nla_put_failure;
        }
        nla_nest_end(skb, cee);

        /* DCBX state */
        /* dcbx is negative when the driver has no getdcbx hook. */
        if (dcbx >= 0) {
                err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
                if (err)
                        goto nla_put_failure;
        }
        return 0;

dcb_unlock:
        spin_unlock_bh(&dcb_lock);
nla_put_failure:
        /* Normalise whatever err held to -EMSGSIZE. */
        err = -EMSGSIZE;
        return err;
}

/*
 * Build a DCB message describing @dev and send it to RTNLGRP_DCB listeners.
 *
 * @dcbx_ver selects the payload: DCB_CAP_DCBX_VER_IEEE uses
 * dcbnl_ieee_fill(), anything else uses dcbnl_cee_fill(). If filling fails
 * the message is freed and the error is reported to the group instead.
 *
 * Returns -EOPNOTSUPP when @dev has no dcbnl_ops, -ENOMEM when the message
 * cannot be allocated, otherwise the result of the fill function.
 */
static int dcbnl_notify(struct net_device *dev, int event, int cmd,
                        u32 seq, u32 portid, int dcbx_ver)
{
        const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
        struct net *net = dev_net(dev);
        struct nlmsghdr *nlh;
        struct sk_buff *msg;
        int rc;

        if (!ops)
                return -EOPNOTSUPP;

        msg = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh);
        if (!msg)
                return -ENOMEM;

        rc = (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) ?
                dcbnl_ieee_fill(msg, dev) : dcbnl_cee_fill(msg, dev);

        if (rc < 0) {
                /* Report error to broadcast listeners */
                nlmsg_free(msg);
                rtnl_set_sk_err(net, RTNLGRP_DCB, rc);
                return rc;
        }

        /* End nlmsg and notify broadcast listeners */
        nlmsg_end(msg, nlh);
        rtnl_notify(msg, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL);

        return rc;
}

/* Send a DCB notification for @dev carrying the IEEE payload. */
int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd,
                      u32 seq, u32 portid)
{
        const int dcbx_ver = DCB_CAP_DCBX_VER_IEEE;

        return dcbnl_notify(dev, event, cmd, seq, portid, dcbx_ver);
}
EXPORT_SYMBOL(dcbnl_ieee_notify);

/* Send a DCB notification for @dev carrying the CEE payload. */
int dcbnl_cee_notify(struct net_device *dev, int event, int cmd,
                     u32 seq, u32 portid)
{
        const int dcbx_ver = DCB_CAP_DCBX_VER_CEE;

        return dcbnl_notify(dev, event, cmd, seq, portid, dcbx_ver);
}
EXPORT_SYMBOL(dcbnl_cee_notify);

/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands.
 * If any requested operation can not be completed
 * the entire msg is aborted and error value is returned.
 * No attempt is made to reconcile the case where only part of the
 * cmd can be completed.
 */
static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
                          u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
        int prio;
        int err;

        if (!ops)
                return -EOPNOTSUPP;

        if (!tb[DCB_ATTR_IEEE])
                return -EINVAL;

        err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX,
                                          tb[DCB_ATTR_IEEE],
                                          dcbnl_ieee_policy, NULL);
        if (err)
                return err;

        if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) {
                struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]);
                err = ops->ieee_setets(netdev, ets);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) {
                struct ieee_maxrate *maxrate =
                        nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]);
                err = ops->ieee_setmaxrate(netdev, maxrate);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) {
                struct ieee_qcn *qcn =
                        nla_data(ieee[DCB_ATTR_IEEE_QCN]);

                err = ops->ieee_setqcn(netdev, qcn);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) {
                struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]);
                err = ops->ieee_setpfc(netdev, pfc);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) {
                struct dcbnl_buffer *buffer =
                        nla_data(ieee[DCB_ATTR_DCB_BUFFER]);

                for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) {
                        if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) {
                                err = -EINVAL;
                                goto err;
                        }
                }

                err = ops->dcbnl_setbuffer(netdev, buffer);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_DCB_REWR_TABLE]) {
                err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE],
                                             netdev,
                                             ops->dcbnl_setrewr ?: dcb_setrewr);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
                err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE],
                                             netdev, ops->ieee_setapp ?:
                                             dcb_ieee_setapp);
                if (err)
                        goto err;
        }

        if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) {
                u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0};
                struct nlattr *attr;
                int nselectors = 0;
                int rem;

                if (!ops->dcbnl_setapptrust) {
                        err = -EOPNOTSUPP;
                        goto err;
                }

                nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE],
                                    rem) {
                        enum ieee_attrs_app type = nla_type(attr);
                        u8 selector;
                        int i;

                        if (!dcbnl_app_attr_type_validate(type) ||
                            nla_len(attr) != 1 ||
                            nselectors >= sizeof(selectors)) {
                                err = -EINVAL;
                                goto err;
                        }

                        selector = nla_get_u8(attr);

                        if (!dcbnl_app_selector_validate(type, selector)) {
                                err = -EINVAL;
                                goto err;
                        }

                        /* Duplicate selector ? */
                        for (i = 0; i < nselectors; i++) {
                                if (selectors[i] == selector) {
                                        err = -EINVAL;
                                        goto err;
                                }
                        }

                        selectors[nselectors++] = selector;
                }

                err = ops->dcbnl_setapptrust(netdev, selectors, nselectors);
                if (err)
                        goto err;
        }

err:
        err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
        dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0);
        return err;
}

/*
 * Handle DCB_CMD_IEEE_GET.
 *
 * Rejects devices that have no DCB operations with -EOPNOTSUPP; otherwise
 * the reply in @skb is produced by dcbnl_ieee_fill() and its result is
 * returned unchanged. @nlh, @seq and @tb are not used here.
 */
static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh,
                          u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        if (!netdev->dcbnl_ops)
                return -EOPNOTSUPP;

        return dcbnl_ieee_fill(skb, netdev);
}

/*
 * Handle DCB_CMD_IEEE_DEL.
 *
 * Parses the nested DCB_ATTR_IEEE attribute and, for each of the APP table
 * and the rewrite table that is present, hands the table to
 * dcbnl_app_table_setdel() together with the driver's delete callback (or
 * the generic dcb_ieee_delapp()/dcb_delrewr() when the driver has none).
 * Processing stops at the first table that reports an error.
 *
 * Once parsing has succeeded, the status is always written to the reply as
 * a u8 DCB_ATTR_IEEE attribute and dcbnl_ieee_notify() is always called,
 * whether or not the deletion worked. The return value in that case is the
 * result of nla_put_u8().
 *
 * Returns -EOPNOTSUPP when the device has no DCB ops, -EINVAL when
 * DCB_ATTR_IEEE is missing, or the parse error.
 */
static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh,
                          u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
        struct nlattr *table;
        int err;

        if (!ops)
                return -EOPNOTSUPP;
        if (!tb[DCB_ATTR_IEEE])
                return -EINVAL;

        err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX,
                                          tb[DCB_ATTR_IEEE],
                                          dcbnl_ieee_policy, NULL);
        if (err)
                return err;

        /* err is 0 here, so the APP table (if any) is always attempted. */
        table = ieee[DCB_ATTR_IEEE_APP_TABLE];
        if (table)
                err = dcbnl_app_table_setdel(table, netdev,
                                             ops->ieee_delapp ?:
                                             dcb_ieee_delapp);

        /* The rewrite table is only touched if nothing failed so far. */
        table = ieee[DCB_ATTR_DCB_REWR_TABLE];
        if (!err && table)
                err = dcbnl_app_table_setdel(table, netdev,
                                             ops->dcbnl_delrewr ?:
                                             dcb_delrewr);

        err = nla_put_u8(skb, DCB_ATTR_IEEE, err);
        dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0);
        return err;
}


/* DCBX configuration */
/*
 * Handle DCB_CMD_GDCBX: ask the driver for its DCBX value and put it in
 * the reply as a u8 DCB_ATTR_DCBX attribute.
 *
 * Returns -EOPNOTSUPP if the driver has no getdcbx callback, otherwise the
 * result of nla_put_u8().
 */
static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh,
                         u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        u8 dcbx;

        if (!ops->getdcbx)
                return -EOPNOTSUPP;

        dcbx = ops->getdcbx(netdev);

        return nla_put_u8(skb, DCB_ATTR_DCBX, dcbx);
}

/*
 * Handle DCB_CMD_SDCBX: pass the u8 carried in DCB_ATTR_DCBX to the
 * driver's setdcbx callback and echo the callback's result back in the
 * reply as a u8 DCB_ATTR_DCBX attribute.
 *
 * Returns -EOPNOTSUPP if the driver has no setdcbx callback, -EINVAL if
 * the attribute is missing, otherwise the result of nla_put_u8().
 */
static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh,
                         u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        u8 status;

        if (!ops->setdcbx)
                return -EOPNOTSUPP;
        if (!tb[DCB_ATTR_DCBX])
                return -EINVAL;

        status = ops->setdcbx(netdev, nla_get_u8(tb[DCB_ATTR_DCBX]));

        return nla_put_u8(skb, DCB_ATTR_DCBX, status);
}

/*
 * Handle DCB_CMD_GFEATCFG.
 *
 * The request carries a nested DCB_ATTR_FEATCFG attribute naming the
 * features to query; DCB_FEATCFG_ATTR_ALL selects every feature. For each
 * selected feature the driver's getfeatcfg callback is invoked and the u8
 * it produces is added to a DCB_ATTR_FEATCFG nest in the reply.
 *
 * If the callback or the attribute put fails, the partially built nest is
 * cancelled and that error is returned.
 *
 * Returns 0 on success, -EOPNOTSUPP without a getfeatcfg callback, -EINVAL
 * without DCB_ATTR_FEATCFG, -EMSGSIZE if the nest cannot be started, or the
 * parse/callback/put error.
 */
static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                            u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
        struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1];
        struct nlattr *nest;
        bool want_all;
        int attr, ret;
        u8 value;

        if (!ops->getfeatcfg)
                return -EOPNOTSUPP;
        if (!tb[DCB_ATTR_FEATCFG])
                return -EINVAL;

        ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX,
                                          tb[DCB_ATTR_FEATCFG],
                                          dcbnl_featcfg_nest, NULL);
        if (ret)
                return ret;

        nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG);
        if (!nest)
                return -EMSGSIZE;

        want_all = data[DCB_FEATCFG_ATTR_ALL] != NULL;

        for (attr = DCB_FEATCFG_ATTR_ALL + 1; attr <= DCB_FEATCFG_ATTR_MAX;
             attr++) {
                if (!want_all && !data[attr])
                        continue;

                ret = ops->getfeatcfg(netdev, attr, &value);
                if (!ret)
                        ret = nla_put_u8(skb, attr, value);
                if (ret) {
                        /* Drop what was added to the nest so far. */
                        nla_nest_cancel(skb, nest);
                        return ret;
                }
        }

        nla_nest_end(skb, nest);

        /* Parse succeeded and every iteration left ret at 0. */
        return 0;
}

/*
 * Handle DCB_CMD_SFEATCFG.
 *
 * The request carries a nested DCB_ATTR_FEATCFG attribute; every feature
 * attribute present in it (DCB_FEATCFG_ATTR_ALL itself is skipped) is read
 * as a u8 and passed to the driver's setfeatcfg callback. Processing stops
 * at the first callback that returns non-zero.
 *
 * The status (parse result or last callback result) is reported to the
 * requester as a u8 DCB_ATTR_FEATCFG attribute in the reply; the function's
 * own return value is then the result of nla_put_u8().
 *
 * Returns -EOPNOTSUPP without a setfeatcfg callback and -EINVAL without
 * DCB_ATTR_FEATCFG.
 */
static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
                            u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1];
        int ret, i;
        u8 value;

        /*
         * Use -EOPNOTSUPP like the other handlers in this file: ENOTSUPP is
         * a kernel-internal code that userspace cannot interpret.
         */
        if (!netdev->dcbnl_ops->setfeatcfg)
                return -EOPNOTSUPP;

        if (!tb[DCB_ATTR_FEATCFG])
                return -EINVAL;

        ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX,
                                          tb[DCB_ATTR_FEATCFG],
                                          dcbnl_featcfg_nest, NULL);

        if (ret)
                goto err;

        for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) {
                if (data[i] == NULL)
                        continue;

                value = nla_get_u8(data[i]);

                ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value);

                if (ret)
                        goto err;
        }
err:
        /* Reached on success too: ret is the status sent to the requester. */
        ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret);

        return ret;
}

/*
 * Handle CEE DCBX GET commands.
 *
 * Rejects devices that have no DCB operations with -EOPNOTSUPP; otherwise
 * the reply in @skb is produced by dcbnl_cee_fill() and its result is
 * returned unchanged. @nlh, @seq and @tb are not used here.
 */
static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh,
                         u32 seq, struct nlattr **tb, struct sk_buff *skb)
{
        if (!netdev->dcbnl_ops)
                return -EOPNOTSUPP;

        return dcbnl_cee_fill(skb, netdev);
}

/* One entry of the DCB command dispatch table (see reply_funcs[]). */
struct reply_func {
        /* netlink message type used for the reply */
        int type;

        /*
         * Handler that fills in the reply: receives the target device, the
         * request header, the request sequence number, the parsed request
         * attributes and the reply buffer.
         */
        int (*cb)(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq,
                  struct nlattr **tb, struct sk_buff *skb);
};

/*
 * Dispatch table indexed by DCB command. Each entry names the netlink
 * message type of the reply and the handler that builds it. Commands that
 * are not listed keep a NULL ->cb, which dcb_doit() rejects with
 * -EOPNOTSUPP.
 */
static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = {
        [DCB_CMD_GSTATE]       = { .type = RTM_GETDCB, .cb = dcbnl_getstate },
        [DCB_CMD_SSTATE]       = { .type = RTM_SETDCB, .cb = dcbnl_setstate },
        [DCB_CMD_PFC_GCFG]     = { .type = RTM_GETDCB, .cb = dcbnl_getpfccfg },
        [DCB_CMD_PFC_SCFG]     = { .type = RTM_SETDCB, .cb = dcbnl_setpfccfg },
        [DCB_CMD_GPERM_HWADDR] = { .type = RTM_GETDCB, .cb = dcbnl_getperm_hwaddr },
        [DCB_CMD_GCAP]         = { .type = RTM_GETDCB, .cb = dcbnl_getcap },
        [DCB_CMD_GNUMTCS]      = { .type = RTM_GETDCB, .cb = dcbnl_getnumtcs },
        [DCB_CMD_SNUMTCS]      = { .type = RTM_SETDCB, .cb = dcbnl_setnumtcs },
        [DCB_CMD_PFC_GSTATE]   = { .type = RTM_GETDCB, .cb = dcbnl_getpfcstate },
        [DCB_CMD_PFC_SSTATE]   = { .type = RTM_SETDCB, .cb = dcbnl_setpfcstate },
        [DCB_CMD_GAPP]         = { .type = RTM_GETDCB, .cb = dcbnl_getapp },
        [DCB_CMD_SAPP]         = { .type = RTM_SETDCB, .cb = dcbnl_setapp },
        [DCB_CMD_PGTX_GCFG]    = { .type = RTM_GETDCB, .cb = dcbnl_pgtx_getcfg },
        [DCB_CMD_PGTX_SCFG]    = { .type = RTM_SETDCB, .cb = dcbnl_pgtx_setcfg },
        [DCB_CMD_PGRX_GCFG]    = { .type = RTM_GETDCB, .cb = dcbnl_pgrx_getcfg },
        [DCB_CMD_PGRX_SCFG]    = { .type = RTM_SETDCB, .cb = dcbnl_pgrx_setcfg },
        [DCB_CMD_SET_ALL]      = { .type = RTM_SETDCB, .cb = dcbnl_setall },
        [DCB_CMD_BCN_GCFG]     = { .type = RTM_GETDCB, .cb = dcbnl_bcn_getcfg },
        [DCB_CMD_BCN_SCFG]     = { .type = RTM_SETDCB, .cb = dcbnl_bcn_setcfg },
        [DCB_CMD_IEEE_GET]     = { .type = RTM_GETDCB, .cb = dcbnl_ieee_get },
        [DCB_CMD_IEEE_SET]     = { .type = RTM_SETDCB, .cb = dcbnl_ieee_set },
        [DCB_CMD_IEEE_DEL]     = { .type = RTM_SETDCB, .cb = dcbnl_ieee_del },
        [DCB_CMD_GDCBX]        = { .type = RTM_GETDCB, .cb = dcbnl_getdcbx },
        [DCB_CMD_SDCBX]        = { .type = RTM_SETDCB, .cb = dcbnl_setdcbx },
        [DCB_CMD_GFEATCFG]     = { .type = RTM_GETDCB, .cb = dcbnl_getfeatcfg },
        [DCB_CMD_SFEATCFG]     = { .type = RTM_SETDCB, .cb = dcbnl_setfeatcfg },
        [DCB_CMD_CEE_GET]      = { .type = RTM_GETDCB, .cb = dcbnl_cee_get },
};

/*
 * rtnetlink ->doit handler shared by RTM_GETDCB and RTM_SETDCB.
 *
 * Validates the request, looks up the handler for dcb->cmd in
 * reply_funcs[], resolves the target device by DCB_ATTR_IFNAME, lets the
 * handler build the reply and unicasts it back to the requester.
 *
 * CAP_NET_ADMIN is checked twice on purpose: once against the netlink
 * message type and once against the type recorded for the command, so a
 * "set" command cannot be issued inside an RTM_GETDCB message.
 *
 * Returns the result of rtnl_unicast() on success or a negative errno:
 * -EPERM, -EINVAL (bad command or missing ifname), -EOPNOTSUPP (no handler
 * or device without DCB ops), -ENODEV, -ENOMEM, or the parse/handler error.
 */
static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                    struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct dcbmsg *dcb = nlmsg_data(nlh);
        u32 portid = NETLINK_CB(skb).portid;
        struct nlattr *tb[DCB_ATTR_MAX + 1];
        struct nlmsghdr *reply_nlh = NULL;
        const struct reply_func *fn;
        struct sk_buff *reply_skb;
        struct net_device *netdev;
        int ret;

        if (nlh->nlmsg_type == RTM_SETDCB &&
            !netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
                                     dcbnl_rtnl_policy, extack);
        if (ret < 0)
                return ret;

        if (dcb->cmd > DCB_CMD_MAX)
                return -EINVAL;

        /* A NULL callback means no handler is defined for the command. */
        fn = &reply_funcs[dcb->cmd];
        if (!fn->cb)
                return -EOPNOTSUPP;
        if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (!tb[DCB_ATTR_IFNAME])
                return -EINVAL;

        netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME]));
        if (!netdev)
                return -ENODEV;
        if (!netdev->dcbnl_ops)
                return -EOPNOTSUPP;

        reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq,
                                 nlh->nlmsg_flags, &reply_nlh);
        if (!reply_skb)
                return -ENOMEM;

        ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb);
        if (ret < 0) {
                /* The handler failed: the reply is never sent, so free it. */
                nlmsg_free(reply_skb);
                return ret;
        }

        nlmsg_end(reply_skb, reply_nlh);

        return rtnl_unicast(reply_skb, net, portid);
}

/*
 * Find the first entry of dcb_rewr_list that belongs to @ifindex and has
 * the same selector and priority as @app. When @proto is not -1 the
 * entry's protocol must equal @proto as well.
 *
 * Returns the matching entry or NULL. The list is walked without taking
 * any lock here; the callers in this file hold dcb_lock.
 */
static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app,
                                            int ifindex, int proto)
{
        struct dcb_app_type *entry;

        list_for_each_entry(entry, &dcb_rewr_list, list) {
                if (entry->app.selector != app->selector)
                        continue;
                if (entry->app.priority != app->priority)
                        continue;
                if (entry->ifindex != ifindex)
                        continue;
                if (proto != -1 && entry->app.protocol != proto)
                        continue;

                return entry;
        }

        return NULL;
}

/*
 * Find the first entry of dcb_app_list that belongs to @ifindex and has
 * the same selector and protocol as @app. When @prio is not -1 the
 * entry's priority must equal @prio as well.
 *
 * Returns the matching entry or NULL. The list is walked without taking
 * any lock here; the callers in this file hold dcb_lock.
 */
static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
                                           int ifindex, int prio)
{
        struct dcb_app_type *entry;

        list_for_each_entry(entry, &dcb_app_list, list) {
                if (entry->app.selector != app->selector)
                        continue;
                if (entry->app.protocol != app->protocol)
                        continue;
                if (entry->ifindex != ifindex)
                        continue;
                if (prio != -1 && entry->app.priority != prio)
                        continue;

                return entry;
        }

        return NULL;
}

static int dcb_app_add(struct list_head *list, const struct dcb_app *app,
                       int ifindex)
{
        struct dcb_app_type *entry;

        entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
        if (!entry)
                return -ENOMEM;

        memcpy(&entry->app, app, sizeof(*app));
        entry->ifindex = ifindex;
        list_add(&entry->list, list);

        return 0;
}

/**
 * dcb_getapp - retrieve the DCBX application user priority
 * @dev: network interface
 * @app: application to get user priority of
 *
 * Looks up the APP entry matching @app's selector and protocol on @dev,
 * whatever its priority, under dcb_lock.
 *
 * On success returns a non-zero 802.1p user priority bitmap
 * otherwise returns 0 as the invalid user priority bitmap to
 * indicate an error.
 */
u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
{
        struct dcb_app_type *match;
        u8 prio;

        spin_lock_bh(&dcb_lock);
        match = dcb_app_lookup(app, dev->ifindex, -1);
        prio = match ? match->app.priority : 0;
        spin_unlock_bh(&dcb_lock);

        return prio;
}
EXPORT_SYMBOL(dcb_getapp);

/**
 * dcb_setapp - add CEE dcb application data to app list
 * @dev: network interface
 * @new: application data to add
 *
 * Priority 0 is an invalid priority in CEE spec. This routine
 * removes applications from the app list if the priority is
 * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap
 */
int dcb_setapp(struct net_device *dev, struct dcb_app *new)
{
        struct dcb_app_type *itr;
        struct dcb_app_type event;
        int err = 0;

        event.ifindex = dev->ifindex;
        memcpy(&event.app, new, sizeof(event.app));
        if (dev->dcbnl_ops->getdcbx)
                event.dcbx = dev->dcbnl_ops->getdcbx(dev);

        spin_lock_bh(&dcb_lock);
        /* Search for existing match and replace */
        itr = dcb_app_lookup(new, dev->ifindex, -1);
        if (itr) {
                if (new->priority)
                        itr->app.priority = new->priority;
                else {
                        list_del(&itr->list);
                        kfree(itr);
                }
                goto out;
        }
        /* App type does not exist add new application type */
        if (new->priority)
                err = dcb_app_add(&dcb_app_list, new, dev->ifindex);
out:
        spin_unlock_bh(&dcb_lock);
        if (!err)
                call_dcbevent_notifiers(DCB_APP_EVENT, &event);
        return err;
}
EXPORT_SYMBOL(dcb_setapp);

/**
 * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority
 * @dev: network interface
 * @app: where to store the retrieve application data
 *
 * Looks up the first APP entry matching @app's selector and protocol on
 * @dev under dcb_lock and converts its priority into a single-bit mask.
 *
 * Helper routine which on success returns a non-zero 802.1Qaz user
 * priority bitmap otherwise returns 0 to indicate the dcb_app was
 * not found in APP list.
 */
u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
{
        struct dcb_app_type *match;
        u8 mask = 0;

        spin_lock_bh(&dcb_lock);
        match = dcb_app_lookup(app, dev->ifindex, -1);
        if (match)
                mask = 1 << match->app.priority;
        spin_unlock_bh(&dcb_lock);

        return mask;
}
EXPORT_SYMBOL(dcb_ieee_getapp_mask);

/*
 * Get protocol value from rewrite entry.
 *
 * Looks up, under dcb_lock, the first rewrite entry on @dev with the same
 * selector and priority as @app. Returns that entry's protocol, or 0 when
 * no entry matches.
 */
u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app)
{
        struct dcb_app_type *match;
        u16 proto;

        spin_lock_bh(&dcb_lock);
        match = dcb_rewr_lookup(app, dev->ifindex, -1);
        proto = match ? match->app.protocol : 0;
        spin_unlock_bh(&dcb_lock);

        return proto;
}
EXPORT_SYMBOL(dcb_getrewr);

 /* Add rewrite entry to the rewrite list. */
int dcb_setrewr(struct net_device *dev, struct dcb_app *new)
{
        int err;

        spin_lock_bh(&dcb_lock);
        /*
         * An entry with the same selector, priority and protocol on this
         * device is a duplicate: refuse it with -EEXIST. Otherwise the
         * result is whatever dcb_app_add() reports.
         */
        if (dcb_rewr_lookup(new, dev->ifindex, new->protocol))
                err = -EEXIST;
        else
                err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex);
        spin_unlock_bh(&dcb_lock);

        return err;
}
EXPORT_SYMBOL(dcb_setrewr);

/*
 * Delete rewrite entry from the rewrite list.
 *
 * Removes and frees, under dcb_lock, the first rewrite entry on @dev whose
 * selector, priority and protocol equal those of @del.
 *
 * Returns 0 if an entry was removed, -ENOENT if none matched.
 */
int dcb_delrewr(struct net_device *dev, struct dcb_app *del)
{
        struct dcb_app_type *victim;
        int err = 0;

        spin_lock_bh(&dcb_lock);

        victim = dcb_rewr_lookup(del, dev->ifindex, del->protocol);
        if (!victim) {
                err = -ENOENT;
        } else {
                list_del(&victim->list);
                kfree(victim);
        }

        spin_unlock_bh(&dcb_lock);

        return err;
}
EXPORT_SYMBOL(dcb_delrewr);

/**
 * dcb_ieee_setapp - add IEEE dcb application data to app list
 * @dev: network interface
 * @new: application data to add
 *
 * This adds Application data to the list. Multiple application
 * entries may exists for the same selector and protocol as long
 * as the priorities are different. Priority is expected to be a
 * 3-bit unsigned integer
 */
int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
{
        struct dcb_app_type event;
        int err = 0;

        event.ifindex = dev->ifindex;
        memcpy(&event.app, new, sizeof(event.app));
        if (dev->dcbnl_ops->getdcbx)
                event.dcbx = dev->dcbnl_ops->getdcbx(dev);

        spin_lock_bh(&dcb_lock);
        /* Search for existing match and abort if found */
        if (dcb_app_lookup(new, dev->ifindex, new->priority)) {
                err = -EEXIST;
                goto out;
        }

        err = dcb_app_add(&dcb_app_list, new, dev->ifindex);
out:
        spin_unlock_bh(&dcb_lock);
        if (!err)
                call_dcbevent_notifiers(DCB_APP_EVENT, &event);
        return err;
}
EXPORT_SYMBOL(dcb_ieee_setapp);

/**
 * dcb_ieee_delapp - delete IEEE dcb application data from list
 * @dev: network interface
 * @del: application data to delete
 *
 * This removes a matching APP data from the APP list
 */
int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
{
        struct dcb_app_type *itr;
        struct dcb_app_type event;
        int err = -ENOENT;

        event.ifindex = dev->ifindex;
        memcpy(&event.app, del, sizeof(event.app));
        if (dev->dcbnl_ops->getdcbx)
                event.dcbx = dev->dcbnl_ops->getdcbx(dev);

        spin_lock_bh(&dcb_lock);
        /* Search for existing match and remove it. */
        if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) {
                list_del(&itr->list);
                kfree(itr);
                err = 0;
        }

        spin_unlock_bh(&dcb_lock);
        if (!err)
                call_dcbevent_notifiers(DCB_APP_EVENT, &event);
        return err;
}
EXPORT_SYMBOL(dcb_ieee_delapp);

/* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from
 * priorities to the PCP and DEI values assigned to that priority.
 *
 * @p_map->map is cleared first. Then, for every rewrite entry of @dev with
 * selector DCB_APP_SEL_PCP, protocol below 16 and priority below
 * IEEE_8021QAZ_MAX_TCS, bit <protocol> is set in map[<priority>].
 */
void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev,
                                   struct dcb_rewr_prio_pcp_map *p_map)
{
        int ifindex = dev->ifindex;
        struct dcb_app_type *entry;

        memset(p_map->map, 0, sizeof(p_map->map));

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(entry, &dcb_rewr_list, list) {
                if (entry->ifindex != ifindex ||
                    entry->app.selector != DCB_APP_SEL_PCP)
                        continue;

                /* Skip values that would not fit the map or the mask. */
                if (entry->app.protocol >= 16 ||
                    entry->app.priority >= IEEE_8021QAZ_MAX_TCS)
                        continue;

                p_map->map[entry->app.priority] |= 1 << entry->app.protocol;
        }
        spin_unlock_bh(&dcb_lock);
}
EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map);

/* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from
 * priorities to the DSCP values assigned to that priority.
 *
 * @p_map->map is cleared first. Then, for every rewrite entry of @dev with
 * selector IEEE_8021QAZ_APP_SEL_DSCP, protocol below 64 and priority below
 * IEEE_8021QAZ_MAX_TCS, bit <protocol> is set in map[<priority>].
 */
void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev,
                                    struct dcb_ieee_app_prio_map *p_map)
{
        int ifindex = dev->ifindex;
        struct dcb_app_type *entry;

        memset(p_map->map, 0, sizeof(p_map->map));

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(entry, &dcb_rewr_list, list) {
                if (entry->ifindex != ifindex ||
                    entry->app.selector != IEEE_8021QAZ_APP_SEL_DSCP)
                        continue;

                /* Skip values that would not fit the map or the mask. */
                if (entry->app.protocol >= 64 ||
                    entry->app.priority >= IEEE_8021QAZ_MAX_TCS)
                        continue;

                p_map->map[entry->app.priority] |= 1ULL << entry->app.protocol;
        }
        spin_unlock_bh(&dcb_lock);
}
EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map);

/*
 * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
 * priorities to the DSCP values assigned to that priority. Initialize p_map
 * such that each map element holds a bit mask of DSCP values configured for
 * that priority by APP entries.
 *
 * Only APP entries of @dev with selector IEEE_8021QAZ_APP_SEL_DSCP,
 * protocol below 64 and priority below IEEE_8021QAZ_MAX_TCS contribute.
 */
void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
                                        struct dcb_ieee_app_prio_map *p_map)
{
        int ifindex = dev->ifindex;
        struct dcb_app_type *entry;

        memset(p_map->map, 0, sizeof(p_map->map));

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(entry, &dcb_app_list, list) {
                if (entry->ifindex != ifindex ||
                    entry->app.selector != IEEE_8021QAZ_APP_SEL_DSCP)
                        continue;

                /* Skip values that would not fit the map or the mask. */
                if (entry->app.protocol >= 64 ||
                    entry->app.priority >= IEEE_8021QAZ_MAX_TCS)
                        continue;

                p_map->map[entry->app.priority] |= 1ULL << entry->app.protocol;
        }
        spin_unlock_bh(&dcb_lock);
}
EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);

/*
 * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
 * DSCP values to the priorities assigned to that DSCP value. Initialize p_map
 * such that each map element holds a bit mask of priorities configured for a
 * given DSCP value by APP entries.
 *
 * Only APP entries of @dev with selector IEEE_8021QAZ_APP_SEL_DSCP,
 * protocol below 64 and priority below IEEE_8021QAZ_MAX_TCS contribute.
 */
void
dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
                                   struct dcb_ieee_app_dscp_map *p_map)
{
        int ifindex = dev->ifindex;
        struct dcb_app_type *entry;

        memset(p_map->map, 0, sizeof(p_map->map));

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(entry, &dcb_app_list, list) {
                if (entry->ifindex != ifindex ||
                    entry->app.selector != IEEE_8021QAZ_APP_SEL_DSCP)
                        continue;

                /* Skip values that would not fit the map or the mask. */
                if (entry->app.protocol >= 64 ||
                    entry->app.priority >= IEEE_8021QAZ_MAX_TCS)
                        continue;

                p_map->map[entry->app.protocol] |= 1 << entry->app.priority;
        }
        spin_unlock_bh(&dcb_lock);
}
EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);

/*
 * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
 * type, with valid PID values >= 1536. A special meaning is then assigned to
 * protocol value of 0: "default priority. For use when priority is not
 * otherwise specified".
 *
 * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries
 * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default
 * priorities set by these entries.
 *
 * Entries whose priority is not below IEEE_8021QAZ_MAX_TCS are ignored.
 * Returns 0 when no such entry exists.
 */
u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev)
{
        int ifindex = dev->ifindex;
        struct dcb_app_type *entry;
        u8 prio_mask = 0;

        spin_lock_bh(&dcb_lock);
        list_for_each_entry(entry, &dcb_app_list, list) {
                if (entry->ifindex != ifindex)
                        continue;
                if (entry->app.selector != IEEE_8021QAZ_APP_SEL_ETHERTYPE ||
                    entry->app.protocol != 0)
                        continue;
                if (entry->app.priority >= IEEE_8021QAZ_MAX_TCS)
                        continue;

                prio_mask |= 1 << entry->app.priority;
        }
        spin_unlock_bh(&dcb_lock);

        return prio_mask;
}
EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask);

/*
 * Drop every entry recorded for @dev from both the APP list and the
 * rewrite list.
 *
 * Both lists hold struct dcb_app_type entries keyed by ifindex and are
 * protected by dcb_lock. Rewrite entries must be purged along with APP
 * entries: otherwise they are leaked when the device goes away and could
 * be matched by a later device that is assigned the same ifindex.
 */
static void dcbnl_flush_dev(struct net_device *dev)
{
        struct dcb_app_type *itr, *tmp;

        spin_lock_bh(&dcb_lock);

        list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) {
                if (itr->ifindex == dev->ifindex) {
                        list_del(&itr->list);
                        kfree(itr);
                }
        }

        list_for_each_entry_safe(itr, tmp, &dcb_rewr_list, list) {
                if (itr->ifindex == dev->ifindex) {
                        list_del(&itr->list);
                        kfree(itr);
                }
        }

        spin_unlock_bh(&dcb_lock);
}

/*
 * Netdevice notifier callback.
 *
 * Only NETDEV_UNREGISTER for a device that has DCB ops is acted upon: the
 * entries stored for that device are flushed and NOTIFY_OK is returned.
 * Every other case returns NOTIFY_DONE.
 */
static int dcbnl_netdevice_event(struct notifier_block *nb,
                                 unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        if (!dev->dcbnl_ops)
                return NOTIFY_DONE;

        dcbnl_flush_dev(dev);

        return NOTIFY_OK;
}

/* Netdevice notifier block; registered from dcbnl_init(). */
static struct notifier_block dcbnl_nb __read_mostly = {
        .notifier_call  = dcbnl_netdevice_event,
};

/*
 * rtnetlink handlers registered from dcbnl_init(): both DCB message types
 * are served by dcb_doit().
 */
static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = {
        {.msgtype = RTM_GETDCB, .doit = dcb_doit},
        {.msgtype = RTM_SETDCB, .doit = dcb_doit},
};

/*
 * Init hook, run as a device_initcall: register the netdevice notifier
 * and, if that worked, the RTM_GETDCB/RTM_SETDCB rtnetlink handlers.
 *
 * Returns 0 on success or the error from register_netdevice_notifier().
 */
static int __init dcbnl_init(void)
{
        int err = register_netdevice_notifier(&dcbnl_nb);

        if (!err)
                rtnl_register_many(dcbnl_rtnl_msg_handlers);

        return err;
}
device_initcall(dcbnl_init);
























































































































































































































































































































































































   21 






























































   22 









   22 
































   22 



   22 
   22 


















































   22 


































































































   22 




































   22 



















































































































































































































































   22 


















































   22 









   22 













   22 




































   22 















































   22 





   22 






































































































   22 




   22 



























   21 


















   21 

























   21 












































































































































































































































   22 
















































   22 
















   22 
































   22 






   22 








   22 

   22 















































   22 












   22 










   22 





























   22 





   22 



















































   22 















































   21 














   21 






   21 































   22 












   22 



























   22 





   22 




   21 


   21 





   21 




   22 















   22 

















































































































































   22 



   22 
































   22 





































   22 












































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
// SPDX-License-Identifier: GPL-2.0

#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
#include "internal.h"

/**
 * DOC: printk_ringbuffer overview
 *
 * Data Structure
 * --------------
 * The printk_ringbuffer is made up of 2 internal ringbuffers:
 *
 *   desc_ring
 *     A ring of descriptors and their meta data (such as sequence number,
 *     timestamp, loglevel, etc.) as well as internal state information about
 *     the record and logical positions specifying where in the other
 *     ringbuffer the text strings are located.
 *
 *   text_data_ring
 *     A ring of data blocks. A data block consists of an unsigned long
 *     integer (ID) that maps to a desc_ring index followed by the text
 *     string of the record.
 *
 * The internal state information of a descriptor is the key element to allow
 * readers and writers to locklessly synchronize access to the data.
 *
 * Implementation
 * --------------
 *
 * Descriptor Ring
 * ~~~~~~~~~~~~~~~
 * The descriptor ring is an array of descriptors. A descriptor contains
 * essential meta data to track the data of a printk record using
 * blk_lpos structs pointing to associated text data blocks (see
 * "Data Ring" below). Each descriptor is assigned an ID that maps
 * directly to index values of the descriptor array and has a state. The ID
 * and the state are bitwise combined into a single descriptor field named
 * @state_var, allowing ID and state to be synchronously and atomically
 * updated.
 *
 * Descriptors have four states:
 *
 *   reserved
 *     A writer is modifying the record.
 *
 *   committed
 *     The record and all its data are written. A writer can reopen the
 *     descriptor (transitioning it back to reserved), but in the committed
 *     state the data is consistent.
 *
 *   finalized
 *     The record and all its data are complete and available for reading. A
 *     writer cannot reopen the descriptor.
 *
 *   reusable
 *     The record exists, but its text and/or meta data may no longer be
 *     available.
 *
 * Querying the @state_var of a record requires providing the ID of the
 * descriptor to query. This can yield a possible fifth (pseudo) state:
 *
 *   miss
 *     The descriptor being queried has an unexpected ID.
 *
 * The descriptor ring has a @tail_id that contains the ID of the oldest
 * descriptor and @head_id that contains the ID of the newest descriptor.
 *
 * When a new descriptor should be created (and the ring is full), the tail
 * descriptor is invalidated by first transitioning to the reusable state and
 * then invalidating all tail data blocks up to and including the data blocks
 * associated with the tail descriptor (for the text ring). Then
 * @tail_id is advanced, followed by advancing @head_id. And finally the
 * @state_var of the new descriptor is initialized to the new ID and reserved
 * state.
 *
 * The @tail_id can only be advanced if the new @tail_id would be in the
 * finalized or reusable queried state. This makes it possible that a valid
 * sequence number of the tail is always available.
 *
 * Descriptor Finalization
 * ~~~~~~~~~~~~~~~~~~~~~~~
 * When a writer calls the commit function prb_commit(), record data is
 * fully stored and is consistent within the ringbuffer. However, a writer can
 * reopen that record, claiming exclusive access (as with prb_reserve()), and
 * modify that record. When finished, the writer must again commit the record.
 *
 * In order for a record to be made available to readers (and also become
 * recyclable for writers), it must be finalized. A finalized record cannot be
 * reopened and can never become "unfinalized". Record finalization can occur
 * in three different scenarios:
 *
 *   1) A writer can simultaneously commit and finalize its record by calling
 *      prb_final_commit() instead of prb_commit().
 *
 *   2) When a new record is reserved and the previous record has been
 *      committed via prb_commit(), that previous record is automatically
 *      finalized.
 *
 *   3) When a record is committed via prb_commit() and a newer record
 *      already exists, the record being committed is automatically finalized.
 *
 * Data Ring
 * ~~~~~~~~~
 * The text data ring is a byte array composed of data blocks. Data blocks are
 * referenced by blk_lpos structs that point to the logical position of the
 * beginning of a data block and the beginning of the next adjacent data
 * block. Logical positions are mapped directly to index values of the byte
 * array ringbuffer.
 *
 * Each data block consists of an ID followed by the writer data. The ID is
 * the identifier of a descriptor that is associated with the data block. A
 * given data block is considered valid if all of the following conditions
 * are met:
 *
 *   1) The descriptor associated with the data block is in the committed
 *      or finalized queried state.
 *
 *   2) The blk_lpos struct within the descriptor associated with the data
 *      block references back to the same data block.
 *
 *   3) The data block is within the head/tail logical position range.
 *
 * If the writer data of a data block would extend beyond the end of the
 * byte array, only the ID of the data block is stored at the logical
 * position and the full data block (ID and writer data) is stored at the
 * beginning of the byte array. The referencing blk_lpos will point to the
 * ID before the wrap and the next data block will be at the logical
 * position adjacent to the full data block after the wrap.
 *
 * Data rings have a @tail_lpos that points to the beginning of the oldest
 * data block and a @head_lpos that points to the logical position of the
 * next (not yet existing) data block.
 *
 * When a new data block should be created (and the ring is full), tail data
 * blocks will first be invalidated by putting their associated descriptors
 * into the reusable state and then pushing the @tail_lpos forward beyond
 * them. Then the @head_lpos is pushed forward and is associated with a new
 * descriptor. If a data block is not valid, the @tail_lpos cannot be
 * advanced beyond it.
 *
 * Info Array
 * ~~~~~~~~~~
 * The general meta data of printk records are stored in printk_info structs,
 * stored in an array with the same number of elements as the descriptor ring.
 * Each info corresponds to the descriptor of the same index in the
 * descriptor ring. Info validity is confirmed by evaluating the corresponding
 * descriptor before and after loading the info.
 *
 * Usage
 * -----
 * Here are some simple examples demonstrating writers and readers. For the
 * examples a global ringbuffer (test_rb) is available (which is not the
 * actual ringbuffer used by printk)::
 *
 *        DEFINE_PRINTKRB(test_rb, 15, 5);
 *
 * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of
 * 1 MiB (2 ^ (15 + 5)) for text data.
 *
 * Sample writer code::
 *
 *        const char *textstr = "message text";
 *        struct prb_reserved_entry e;
 *        struct printk_record r;
 *
 *        // specify how much to allocate
 *        prb_rec_init_wr(&r, strlen(textstr) + 1);
 *
 *        if (prb_reserve(&e, &test_rb, &r)) {
 *                snprintf(r.text_buf, r.text_buf_size, "%s", textstr);
 *
 *                r.info->text_len = strlen(textstr);
 *                r.info->ts_nsec = local_clock();
 *                r.info->caller_id = printk_caller_id();
 *
 *                // commit and finalize the record
 *                prb_final_commit(&e);
 *        }
 *
 * Note that additional writer functions are available to extend a record
 * after it has been committed but not yet finalized. This can be done as
 * long as no new records have been reserved and the caller is the same.
 *
 * Sample writer code (record extending)::
 *
 *                // alternate rest of previous example
 *
 *                r.info->text_len = strlen(textstr);
 *                r.info->ts_nsec = local_clock();
 *                r.info->caller_id = printk_caller_id();
 *
 *                // commit the record (but do not finalize yet)
 *                prb_commit(&e);
 *        }
 *
 *        ...
 *
 *        // specify additional 5 bytes text space to extend
 *        prb_rec_init_wr(&r, 5);
 *
 *        // try to extend, but only if it does not exceed 32 bytes
 *        if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) {
 *                snprintf(&r.text_buf[r.info->text_len],
 *                         r.text_buf_size - r.info->text_len, "hello");
 *
 *                r.info->text_len += 5;
 *
 *                // commit and finalize the record
 *                prb_final_commit(&e);
 *        }
 *
 * Sample reader code::
 *
 *        struct printk_info info;
 *        struct printk_record r;
 *        char text_buf[32];
 *        u64 seq;
 *
 *        prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf));
 *
 *        prb_for_each_record(0, &test_rb, &seq, &r) {
 *                if (info.seq != seq)
 *                        pr_warn("lost %llu records\n", info.seq - seq);
 *
 *                if (info.text_len > r.text_buf_size) {
 *                        pr_warn("record %llu text truncated\n", info.seq);
 *                        text_buf[r.text_buf_size - 1] = 0;
 *                }
 *
 *                pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec,
 *                        &text_buf[0]);
 *        }
 *
 * Note that additional less convenient reader functions are available to
 * allow complex record access.
 *
 * ABA Issues
 * ~~~~~~~~~~
 * To help avoid ABA issues, descriptors are referenced by IDs (array index
 * values combined with tagged bits counting array wraps) and data blocks are
 * referenced by logical positions (array index values combined with tagged
 * bits counting array wraps). However, on 32-bit systems the number of
 * tagged bits is relatively small such that an ABA incident is (at least
 * theoretically) possible. For example, if 4 million maximally sized (1KiB)
 * printk messages were to occur in NMI context on a 32-bit system, the
 * interrupted context would not be able to recognize that the 32-bit integer
 * completely wrapped and thus represents a different data block than the one
 * the interrupted context expects.
 *
 * To help combat this possibility, additional state checking is performed
 * (such as using cmpxchg() even though set() would suffice). These extra
 * checks are commented as such and will hopefully catch any ABA issue that
 * a 32-bit system might experience.
 *
 * Memory Barriers
 * ~~~~~~~~~~~~~~~
 * Multiple memory barriers are used. To simplify proving correctness and
 * generating litmus tests, lines of code related to memory barriers
 * (loads, stores, and the associated memory barriers) are labeled::
 *
 *        LMM(function:letter)
 *
 * Comments reference the labels using only the "function:letter" part.
 *
 * The memory barrier pairs and their ordering are:
 *
 *   desc_reserve:D / desc_reserve:B
 *     push descriptor tail (id), then push descriptor head (id)
 *
 *   desc_reserve:D / data_push_tail:B
 *     push data tail (lpos), then set new descriptor reserved (state)
 *
 *   desc_reserve:D / desc_push_tail:C
 *     push descriptor tail (id), then set new descriptor reserved (state)
 *
 *   desc_reserve:D / prb_first_seq:C
 *     push descriptor tail (id), then set new descriptor reserved (state)
 *
 *   desc_reserve:F / desc_read:D
 *     set new descriptor id and reserved (state), then allow writer changes
 *
 *   data_alloc:A (or data_realloc:A) / desc_read:D
 *     set old descriptor reusable (state), then modify new data block area
 *
 *   data_alloc:A (or data_realloc:A) / data_push_tail:B
 *     push data tail (lpos), then modify new data block area
 *
 *   _prb_commit:B / desc_read:B
 *     store writer changes, then set new descriptor committed (state)
 *
 *   desc_reopen_last:A / _prb_commit:B
 *     set descriptor reserved (state), then read descriptor data
 *
 *   _prb_commit:B / desc_reserve:D
 *     set new descriptor committed (state), then check descriptor head (id)
 *
 *   data_push_tail:D / data_push_tail:A
 *     set descriptor reusable (state), then push data tail (lpos)
 *
 *   desc_push_tail:B / desc_reserve:D
 *     set descriptor reusable (state), then push descriptor tail (id)
 *
 *   desc_update_last_finalized:A / desc_last_finalized_seq:A
 *     store finalized record, then set new highest finalized sequence number
 */

/*
 * Size of the data array and the mask that reduces a logical position to
 * an index into it. The size is derived from @size_bits of the data ring.
 * NOTE(review): the "size - 1" mask form presumes _DATA_SIZE() (defined
 * outside this file) always yields a power of 2 -- confirm in the header.
 */
#define DATA_SIZE(data_ring)                _DATA_SIZE((data_ring)->size_bits)
#define DATA_SIZE_MASK(data_ring)        (DATA_SIZE(data_ring) - 1)

/*
 * Number of descriptors and the mask that reduces an ID or sequence number
 * to an index into the descriptor array. The count is derived from
 * @count_bits of the descriptor ring.
 * NOTE(review): same power-of-2 presumption as above for _DESCS_COUNT().
 */
#define DESCS_COUNT(desc_ring)                _DESCS_COUNT((desc_ring)->count_bits)
#define DESCS_COUNT_MASK(desc_ring)        (DESCS_COUNT(desc_ring) - 1)

/* Determine the data array index from a logical position. */
#define DATA_INDEX(data_ring, lpos)        ((lpos) & DATA_SIZE_MASK(data_ring))

/* Determine the desc array index from an ID or sequence number. */
#define DESC_INDEX(desc_ring, n)        ((n) & DESCS_COUNT_MASK(desc_ring))

/*
 * Determine how many times the data array has wrapped: the bits of the
 * logical position above the @size_bits index bits.
 */
#define DATA_WRAPS(data_ring, lpos)        ((lpos) >> (data_ring)->size_bits)

/*
 * Determine if a logical position refers to a data-less block. A logical
 * position is treated as data-less if its lowest bit is set. A blk_lpos is
 * only data-less if both its @begin and its @next are data-less.
 */
#define LPOS_DATALESS(lpos)                ((lpos) & 1UL)
#define BLK_DATALESS(blk)                (LPOS_DATALESS((blk)->begin) && \
                                         LPOS_DATALESS((blk)->next))

/*
 * Get the logical position at index 0 of the current wrap, i.e. @lpos with
 * all of its index bits cleared.
 */
#define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \
((lpos) & ~DATA_SIZE_MASK(data_ring))

/*
 * Get the ID for the same index of the previous wrap as the given ID: one
 * full descriptor count is subtracted and the result is passed through
 * DESC_ID() (defined outside this file).
 */
#define DESC_ID_PREV_WRAP(desc_ring, id) \
DESC_ID((id) - DESCS_COUNT(desc_ring))

/*
 * A data block: mapped directly to the beginning of the data block area
 * specified as a logical position within the data ring (to_block() casts
 * the address of that array position to this struct).
 *
 * @id:   the ID of the associated descriptor
 * @data: the writer data (flexible array member, so it adds nothing to
 *        sizeof(struct prb_data_block))
 *
 * Note that the size of a data block is only known by its associated
 * descriptor. to_blk_size() uses sizeof() of this struct and of @id to
 * compute the padded size of a block.
 */
struct prb_data_block {
        unsigned long        id;
        char                data[];
};

/*
 * Look up the descriptor array entry selected by @n, where @n may be a
 * descriptor ID or a sequence number. Only the index bits of @n
 * (see DESC_INDEX()) take part in the lookup.
 */
static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n)
{
        u64 idx = DESC_INDEX(desc_ring, n);

        return desc_ring->descs + idx;
}

/*
 * Look up the printk_info array entry selected by @n, where @n may be a
 * descriptor ID or a sequence number. Only the index bits of @n
 * (see DESC_INDEX()) take part in the lookup.
 */
static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n)
{
        u64 idx = DESC_INDEX(desc_ring, n);

        return desc_ring->infos + idx;
}

/*
 * Map the logical position @begin_lpos to the data block located at the
 * corresponding index of the data array.
 */
static struct prb_data_block *to_block(struct prb_data_ring *data_ring,
                                       unsigned long begin_lpos)
{
        unsigned long idx = DATA_INDEX(data_ring, begin_lpos);

        return (void *)(data_ring->data + idx);
}

/*
 * Convert a writer data size into a data block size: account for the data
 * block meta data and round up so that the adjacent data block starts on
 * an ID-sized boundary.
 */
static unsigned int to_blk_size(unsigned int size)
{
        unsigned int blk_size = size + sizeof(struct prb_data_block);

        return ALIGN(blk_size, sizeof(((struct prb_data_block *)NULL)->id));
}

/*
 * Sanity checker for reserve size. The ringbuffer code assumes that a data
 * block does not exceed the maximum possible size that could fit within the
 * ringbuffer. This function provides that basic size check so that the
 * assumption is safe.
 *
 * @data_ring: the data ring the block would be stored in
 * @size:      the writer data size in bytes (without block meta data)
 *
 * Returns true if a data block for @size bytes of writer data is allowed,
 * otherwise false. A @size of 0 (data-less block) is always allowed.
 */
static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size)
{
        unsigned long max_size = DATA_SIZE(data_ring) / 2;

        /* Data-less blocks take no space. */
        if (size == 0)
                return true;

        /*
         * Reject oversized requests before padding them. to_blk_size()
         * computes in unsigned int, so a @size close to UINT_MAX would
         * wrap around to a small value and wrongly pass the check below.
         */
        if (size > max_size)
                return false;

        /*
         * A data block that does not fit before the end of the data array
         * wraps: only its ID is stored at the end and the full data block
         * is stored again at the beginning of the array. The space up to
         * the end of the array is lost, so a wrapping block can consume
         * almost twice its own size. If data blocks were allowed to be
         * larger than half the data array, a wrapping data block could
         * require more space than the full ringbuffer.
         */
        return to_blk_size(size) <= max_size;
}

/*
 * Query the state encoded in @state_val on behalf of a caller that expects
 * the descriptor to carry ID @id. If @state_val carries a different ID,
 * the pseudo state desc_miss is reported instead.
 */
static enum desc_state get_desc_state(unsigned long id,
                                      unsigned long state_val)
{
        if (DESC_ID(state_val) == id)
                return DESC_STATE(state_val);

        return desc_miss;
}

/*
 * Get a copy of a specified descriptor and return its queried state. If the
 * descriptor is in an inconsistent state (miss or reserved), the caller can
 * only expect the descriptor's @state_var field to be valid.
 *
 * The sequence number and caller_id can be optionally retrieved. Like all
 * non-state_var data, they are only valid if the descriptor is in a
 * consistent state.
 *
 * @desc_ring:     the descriptor ring (and info array) to read from
 * @id:            the ID the caller expects the descriptor to have
 * @desc_out:      optional (may be NULL); receives a copy of @text_blk_lpos
 *                 and the last loaded @state_var value
 * @seq_out:       optional (may be NULL); receives the sequence number
 * @caller_id_out: optional (may be NULL); receives the caller ID
 *
 * Returns the state queried with @id. For miss/reserved this is based on
 * the first load of @state_var (nothing but @state_var is copied).
 * Otherwise it is based on a second load performed after the copy, so the
 * caller learns whether the copied content can be trusted.
 */
static enum desc_state desc_read(struct prb_desc_ring *desc_ring,
                                 unsigned long id, struct prb_desc *desc_out,
                                 u64 *seq_out, u32 *caller_id_out)
{
        struct printk_info *info = to_info(desc_ring, id);
        struct prb_desc *desc = to_desc(desc_ring, id);
        atomic_long_t *state_var = &desc->state_var;
        enum desc_state d_state;
        unsigned long state_val;

        /* Check the descriptor state. */
        state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */
        d_state = get_desc_state(id, state_val);
        if (d_state == desc_miss || d_state == desc_reserved) {
                /*
                 * The descriptor is in an inconsistent state. Set at least
                 * @state_var so that the caller can see the details of
                 * the inconsistent state.
                 */
                goto out;
        }

        /*
         * Guarantee the state is loaded before copying the descriptor
         * content. This avoids copying obsolete descriptor content that might
         * not apply to the descriptor state. This pairs with _prb_commit:B.
         *
         * Memory barrier involvement:
         *
         * If desc_read:A reads from _prb_commit:B, then desc_read:C reads
         * from _prb_commit:A.
         *
         * Relies on:
         *
         * WMB from _prb_commit:A to _prb_commit:B
         *    matching
         * RMB from desc_read:A to desc_read:C
         */
        smp_rmb(); /* LMM(desc_read:B) */

        /*
         * Copy the descriptor data. The data is not valid until the
         * state has been re-checked. A memcpy() for all of @desc
         * cannot be used because of the atomic_long_t @state_var field.
         * Each output is optional and only written if the caller provided
         * a destination for it.
         */
        if (desc_out) {
                memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos,
                       sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */
        }
        if (seq_out)
                *seq_out = info->seq; /* also part of desc_read:C */
        if (caller_id_out)
                *caller_id_out = info->caller_id; /* also part of desc_read:C */

        /*
         * 1. Guarantee the descriptor content is loaded before re-checking
         *    the state. This avoids reading an obsolete descriptor state
         *    that may not apply to the copied content. This pairs with
         *    desc_reserve:F.
         *
         *    Memory barrier involvement:
         *
         *    If desc_read:C reads from desc_reserve:G, then desc_read:E
         *    reads from desc_reserve:F.
         *
         *    Relies on:
         *
         *    WMB from desc_reserve:F to desc_reserve:G
         *       matching
         *    RMB from desc_read:C to desc_read:E
         *
         * 2. Guarantee the record data is loaded before re-checking the
         *    state. This avoids reading an obsolete descriptor state that may
         *    not apply to the copied data. This pairs with data_alloc:A and
         *    data_realloc:A.
         *
         *    Memory barrier involvement:
         *
         *    If copy_data:A reads from data_alloc:B, then desc_read:E
         *    reads from desc_make_reusable:A.
         *
         *    Relies on:
         *
         *    MB from desc_make_reusable:A to data_alloc:B
         *       matching
         *    RMB from desc_read:C to desc_read:E
         *
         *    Note: desc_make_reusable:A and data_alloc:B can be different
         *          CPUs. However, the data_alloc:B CPU (which performs the
         *          full memory barrier) must have previously seen
         *          desc_make_reusable:A.
         */
        smp_rmb(); /* LMM(desc_read:D) */

        /*
         * The data has been copied. Return the current descriptor state,
         * which may have changed since the load above.
         */
        state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */
        d_state = get_desc_state(id, state_val);
out:
        /*
         * Hand back the most recently loaded @state_var value. On the early
         * exit above this is the only field of @desc_out that is written.
         */
        if (desc_out)
                atomic_long_set(&desc_out->state_var, state_val);
        return d_state;
}

/*
 * Take a specified descriptor out of the finalized state by attempting
 * the transition from finalized to reusable. Either this context or some
 * other context will have been successful.
 */
static void desc_make_reusable(struct prb_desc_ring *desc_ring,
                               unsigned long id)
{
        unsigned long val_finalized = DESC_SV(id, desc_finalized);
        unsigned long val_reusable = DESC_SV(id, desc_reusable);
        struct prb_desc *desc = to_desc(desc_ring, id);
        atomic_long_t *state_var = &desc->state_var;

        atomic_long_cmpxchg_relaxed(state_var, val_finalized,
                                    val_reusable); /* LMM(desc_make_reusable:A) */
}

/*
 * Given the text data ring, put the associated descriptor of each
 * data block from @lpos_begin until @lpos_end into the reusable state.
 *
 * If there is any problem making the associated descriptor reusable, either
 * the descriptor has not yet been finalized or another writer context has
 * already pushed the tail lpos past the problematic data block. Regardless,
 * on error the caller can re-load the tail lpos to determine the situation.
 */
static bool data_make_reusable(struct printk_ringbuffer *rb,
                               unsigned long lpos_begin,
                               unsigned long lpos_end,
                               unsigned long *lpos_out)
{

        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct prb_data_block *blk;
        enum desc_state d_state;
        struct prb_desc desc;
        struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos;
        unsigned long id;

        /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */
        while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) {
                blk = to_block(data_ring, lpos_begin);

                /*
                 * Load the block ID from the data block. This is a data race
                 * against a writer that may have newly reserved this data
                 * area. If the loaded value matches a valid descriptor ID,
                 * the blk_lpos of that descriptor will be checked to make
                 * sure it points back to this data block. If the check fails,
                 * the data area has been recycled by another writer.
                 */
                id = blk->id; /* LMM(data_make_reusable:A) */

                d_state = desc_read(desc_ring, id, &desc,
                                    NULL, NULL); /* LMM(data_make_reusable:B) */

                switch (d_state) {
                case desc_miss:
                case desc_reserved:
                case desc_committed:
                        return false;
                case desc_finalized:
                        /*
                         * This data block is invalid if the descriptor
                         * does not point back to it.
                         */
                        if (blk_lpos->begin != lpos_begin)
                                return false;
                        desc_make_reusable(desc_ring, id);
                        break;
                case desc_reusable:
                        /*
                         * This data block is invalid if the descriptor
                         * does not point back to it.
                         */
                        if (blk_lpos->begin != lpos_begin)
                                return false;
                        break;
                }

                /* Advance @lpos_begin to the next data block. */
                lpos_begin = blk_lpos->next;
        }

        *lpos_out = lpos_begin;
        return true;
}

/*
 * Advance the data ring tail to at least @lpos. This function puts
 * descriptors into the reusable state if the tail is pushed beyond
 * their associated data block.
 *
 * @rb:   The ringbuffer whose text data ring tail is to be advanced.
 * @lpos: The logical position that the tail must reach.
 *
 * Return: true if @lpos is data-less, if the tail is observed at or beyond
 * @lpos, or if this CPU successfully updated the tail. false if
 * data_make_reusable() failed and the tail lpos did not change between
 * the two loads performed here.
 */
static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        unsigned long tail_lpos_new;
        unsigned long tail_lpos;
        unsigned long next_lpos;

        /* If @lpos is from a data-less block, there is nothing to do. */
        if (LPOS_DATALESS(lpos))
                return true;

        /*
         * Any descriptor states that have transitioned to reusable due to the
         * data tail being pushed to this loaded value will be visible to this
         * CPU. This pairs with data_push_tail:D.
         *
         * Memory barrier involvement:
         *
         * If data_push_tail:A reads from data_push_tail:D, then this CPU can
         * see desc_make_reusable:A.
         *
         * Relies on:
         *
         * MB from desc_make_reusable:A to data_push_tail:D
         *    matches
         * READFROM from data_push_tail:D to data_push_tail:A
         *    thus
         * READFROM from desc_make_reusable:A to this CPU
         */
        tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */

        /*
         * Loop until the tail lpos is at or beyond @lpos. This condition
         * may already be satisfied, resulting in no full memory barrier
         * from data_push_tail:D being performed. However, since this CPU
         * sees the new tail lpos, any descriptor states that transitioned to
         * the reusable state must already be visible.
         *
         * The test is done in unsigned arithmetic: when @lpos equals
         * @tail_lpos the "- 1" wraps to the maximum value, so the
         * comparison fails and the loop is not entered.
         */
        while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) {
                /*
                 * Make all descriptors reusable that are associated with
                 * data blocks before @lpos.
                 */
                if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) {
                        /*
                         * 1. Guarantee the block ID loaded in
                         *    data_make_reusable() is performed before
                         *    reloading the tail lpos. The failed
                         *    data_make_reusable() may be due to a newly
                         *    recycled data area causing the tail lpos to
                         *    have been previously pushed. This pairs with
                         *    data_alloc:A and data_realloc:A.
                         *
                         *    Memory barrier involvement:
                         *
                         *    If data_make_reusable:A reads from data_alloc:B,
                         *    then data_push_tail:C reads from
                         *    data_push_tail:D.
                         *
                         *    Relies on:
                         *
                         *    MB from data_push_tail:D to data_alloc:B
                         *       matching
                         *    RMB from data_make_reusable:A to
                         *    data_push_tail:C
                         *
                         *    Note: data_push_tail:D and data_alloc:B can be
                         *          different CPUs. However, the data_alloc:B
                         *          CPU (which performs the full memory
                         *          barrier) must have previously seen
                         *          data_push_tail:D.
                         *
                         * 2. Guarantee the descriptor state loaded in
                         *    data_make_reusable() is performed before
                         *    reloading the tail lpos. The failed
                         *    data_make_reusable() may be due to a newly
                         *    recycled descriptor causing the tail lpos to
                         *    have been previously pushed. This pairs with
                         *    desc_reserve:D.
                         *
                         *    Memory barrier involvement:
                         *
                         *    If data_make_reusable:B reads from
                         *    desc_reserve:F, then data_push_tail:C reads
                         *    from data_push_tail:D.
                         *
                         *    Relies on:
                         *
                         *    MB from data_push_tail:D to desc_reserve:F
                         *       matching
                         *    RMB from data_make_reusable:B to
                         *    data_push_tail:C
                         *
                         *    Note: data_push_tail:D and desc_reserve:F can
                         *          be different CPUs. However, the
                         *          desc_reserve:F CPU (which performs the
                         *          full memory barrier) must have previously
                         *          seen data_push_tail:D.
                         */
                        smp_rmb(); /* LMM(data_push_tail:B) */

                        tail_lpos_new = atomic_long_read(&data_ring->tail_lpos
                                                        ); /* LMM(data_push_tail:C) */
                        /*
                         * An unchanged tail means the failure was not caused
                         * by a concurrent tail push, so give up.
                         */
                        if (tail_lpos_new == tail_lpos)
                                return false;

                        /* Another CPU pushed the tail. Try again. */
                        tail_lpos = tail_lpos_new;
                        continue;
                }

                /*
                 * Guarantee any descriptor states that have transitioned to
                 * reusable are stored before pushing the tail lpos. A full
                 * memory barrier is needed since other CPUs may have made
                 * the descriptor states reusable. This pairs with
                 * data_push_tail:A.
                 *
                 * If the cmpxchg fails, atomic_long_try_cmpxchg() stores the
                 * currently observed tail in @tail_lpos and the loop
                 * condition is re-evaluated against that value.
                 */
                if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos,
                                            next_lpos)) { /* LMM(data_push_tail:D) */
                        break;
                }
        }

        return true;
}

/*
 * Advance the desc ring tail. This function advances the tail by one
 * descriptor, thus invalidating the oldest descriptor. Before advancing
 * the tail, the tail descriptor is made reusable and all data blocks up to
 * and including the descriptor's data block are invalidated (i.e. the data
 * ring tail is pushed past the data block of the descriptor being made
 * reusable).
 *
 * @rb:      The ringbuffer whose descriptor ring tail is to be advanced.
 * @tail_id: The descriptor ID that the caller believes is the current tail.
 *
 * Return: false if the tail descriptor is reserved or committed (or is
 * being reserved by another writer), if data_push_tail() fails, or if the
 * following descriptor is not in an allowed tail state and the tail ID is
 * still @tail_id. Otherwise true.
 */
static bool desc_push_tail(struct printk_ringbuffer *rb,
                           unsigned long tail_id)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        enum desc_state d_state;
        struct prb_desc desc;

        /* @desc is a local copy; the ring's descriptor is not referenced below. */
        d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL);

        switch (d_state) {
        case desc_miss:
                /*
                 * If the ID is exactly 1 wrap behind the expected, it is
                 * in the process of being reserved by another writer and
                 * must be considered reserved.
                 *
                 * NOTE(review): this relies on desc_read() having copied
                 * the state value into the local @desc even on a miss;
                 * confirm against desc_read().
                 */
                if (DESC_ID(atomic_long_read(&desc.state_var)) ==
                    DESC_ID_PREV_WRAP(desc_ring, tail_id)) {
                        return false;
                }

                /*
                 * The ID has changed. Another writer must have pushed the
                 * tail and recycled the descriptor already. Success is
                 * returned because the caller is only interested in the
                 * specified tail being pushed, which it was.
                 */
                return true;
        case desc_reserved:
        case desc_committed:
                /* The record is still owned by a writer; it cannot be invalidated. */
                return false;
        case desc_finalized:
                desc_make_reusable(desc_ring, tail_id);
                break;
        case desc_reusable:
                break;
        }

        /*
         * Data blocks must be invalidated before their associated
         * descriptor can be made available for recycling. Invalidating
         * them later is not possible because there is no way to trust
         * data blocks once their associated descriptor is gone.
         *
         * The lpos used here comes from the local copy of the tail
         * descriptor that was read above.
         */

        if (!data_push_tail(rb, desc.text_blk_lpos.next))
                return false;

        /*
         * Check the next descriptor after @tail_id before pushing the tail
         * to it because the tail must always be in a finalized or reusable
         * state. The implementation of prb_first_seq() relies on this.
         *
         * A successful read implies that the next descriptor is less than or
         * equal to @head_id so there is no risk of pushing the tail past the
         * head.
         */
        d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc,
                            NULL, NULL); /* LMM(desc_push_tail:A) */

        if (d_state == desc_finalized || d_state == desc_reusable) {
                /*
                 * Guarantee any descriptor states that have transitioned to
                 * reusable are stored before pushing the tail ID. This allows
                 * verifying the recycled descriptor state. A full memory
                 * barrier is needed since other CPUs may have made the
                 * descriptor states reusable. This pairs with desc_reserve:D.
                 *
                 * The result of the cmpxchg is not checked: if it fails, the
                 * tail ID was no longer @tail_id, and true is returned in
                 * either case.
                 */
                atomic_long_cmpxchg(&desc_ring->tail_id, tail_id,
                                    DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */
        } else {
                /*
                 * Guarantee the last state load from desc_read() is before
                 * reloading @tail_id in order to see a new tail ID in the
                 * case that the descriptor has been recycled. This pairs
                 * with desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_push_tail:A reads from desc_reserve:F, then
                 * desc_push_tail:D reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:F
                 *    matching
                 * RMB from desc_push_tail:A to desc_push_tail:D
                 *
                 * Note: desc_push_tail:B and desc_reserve:F can be different
                 *       CPUs. However, the desc_reserve:F CPU (which performs
                 *       the full memory barrier) must have previously seen
                 *       desc_push_tail:B.
                 */
                smp_rmb(); /* LMM(desc_push_tail:C) */

                /*
                 * Re-check the tail ID. The descriptor following @tail_id is
                 * not in an allowed tail state. But if the tail has since
                 * been moved by another CPU, then it does not matter.
                 */
                if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */
                        return false;
        }

        return true;
}

/*
 * Reserve a new descriptor, invalidating the oldest if necessary.
 *
 * @rb:     The ringbuffer to reserve a descriptor in.
 * @id_out: On success, receives the ID of the newly reserved descriptor.
 *
 * Return: true if a descriptor was reserved and @id_out was set. false if
 * the tail could not be pushed to make space or if the state of the
 * descriptor to recycle could not be verified or updated; @id_out is not
 * written in that case.
 */
static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long prev_state_val;
        unsigned long id_prev_wrap;
        struct prb_desc *desc;
        unsigned long head_id;
        unsigned long id;

        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */

        /*
         * Each iteration derives the candidate ID from the most recently
         * observed head ID. When the cmpxchg at the bottom of the loop
         * fails, atomic_long_try_cmpxchg() refreshes @head_id with the
         * current value before the next iteration.
         */
        do {
                id = DESC_ID(head_id + 1);
                id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id);

                /*
                 * Guarantee the head ID is read before reading the tail ID.
                 * Since the tail ID is updated before the head ID, this
                 * guarantees that @id_prev_wrap is never ahead of the tail
                 * ID. This pairs with desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If desc_reserve:A reads from desc_reserve:D, then
                 * desc_reserve:C reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:D
                 *    matching
                 * RMB from desc_reserve:A to desc_reserve:C
                 *
                 * Note: desc_push_tail:B and desc_reserve:D can be different
                 *       CPUs. However, the desc_reserve:D CPU (which performs
                 *       the full memory barrier) must have previously seen
                 *       desc_push_tail:B.
                 */
                smp_rmb(); /* LMM(desc_reserve:B) */

                if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id
                                                    )) { /* LMM(desc_reserve:C) */
                        /*
                         * Make space for the new descriptor by
                         * advancing the tail.
                         */
                        if (!desc_push_tail(rb, id_prev_wrap))
                                return false;
                }

                /*
                 * 1. Guarantee the tail ID is read before validating the
                 *    recycled descriptor state. A read memory barrier is
                 *    sufficient for this. This pairs with desc_push_tail:B.
                 *
                 *    Memory barrier involvement:
                 *
                 *    If desc_reserve:C reads from desc_push_tail:B, then
                 *    desc_reserve:E reads from desc_make_reusable:A.
                 *
                 *    Relies on:
                 *
                 *    MB from desc_make_reusable:A to desc_push_tail:B
                 *       matching
                 *    RMB from desc_reserve:C to desc_reserve:E
                 *
                 *    Note: desc_make_reusable:A and desc_push_tail:B can be
                 *          different CPUs. However, the desc_push_tail:B CPU
                 *          (which performs the full memory barrier) must have
                 *          previously seen desc_make_reusable:A.
                 *
                 * 2. Guarantee the tail ID is stored before storing the head
                 *    ID. This pairs with desc_reserve:B.
                 *
                 * 3. Guarantee any data ring tail changes are stored before
                 *    recycling the descriptor. Data ring tail changes can
                 *    happen via desc_push_tail()->data_push_tail(). A full
                 *    memory barrier is needed since another CPU may have
                 *    pushed the data ring tails. This pairs with
                 *    data_push_tail:B.
                 *
                 * 4. Guarantee a new tail ID is stored before recycling the
                 *    descriptor. A full memory barrier is needed since
                 *    another CPU may have pushed the tail ID. This pairs
                 *    with desc_push_tail:C and this also pairs with
                 *    prb_first_seq:C.
                 *
                 * 5. Guarantee the head ID is stored before trying to
                 *    finalize the previous descriptor. This pairs with
                 *    _prb_commit:B.
                 */
        } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id,
                                          id)); /* LMM(desc_reserve:D) */

        desc = to_desc(desc_ring, id);

        /*
         * If the descriptor has been recycled, verify the old state val.
         * See "ABA Issues" about why this verification is performed.
         *
         * NOTE(review): a state value of zero skips the verification;
         * presumably zero marks a descriptor that has never been used.
         * Confirm against the descriptor ring initialization.
         */
        prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */
        if (prev_state_val &&
            get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) {
                WARN_ON_ONCE(1);
                return false;
        }

        /*
         * Assign the descriptor a new ID and set its state to reserved.
         * See "ABA Issues" about why cmpxchg() instead of set() is used.
         *
         * Guarantee the new descriptor ID and state is stored before making
         * any other changes. A write memory barrier is sufficient for this.
         * This pairs with desc_read:D.
         */
        if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val,
                        DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */
                WARN_ON_ONCE(1);
                return false;
        }

        /* Now data in @desc can be modified: LMM(desc_reserve:G) */

        *id_out = id;
        return true;
}

/*
 * Compute the logical position immediately following a data block of
 * @size bytes that begins at @lpos.
 *
 * If the block would cross into the next wrap of the data ring, its data
 * is placed at the start of that wrap, so the returned position is the
 * start of the wrap containing the unadjusted end position plus @size.
 */
static unsigned long get_next_lpos(struct prb_data_ring *data_ring,
                                   unsigned long lpos, unsigned int size)
{
        unsigned long end_lpos = lpos + size;

        if (DATA_WRAPS(data_ring, lpos) != DATA_WRAPS(data_ring, end_lpos)) {
                /* The block wraps: it occupies the beginning of the next wrap. */
                return (DATA_THIS_WRAP_START_LPOS(data_ring, end_lpos) + size);
        }

        /* Start and end share the same wrap, the block is contiguous. */
        return end_lpos;
}

/*
 * Allocate a new data block, invalidating the oldest data block(s)
 * if necessary. This function also associates the data block with
 * a specified descriptor.
 *
 * @rb:       The ringbuffer to allocate text data in.
 * @size:     The requested data size. Zero denotes an empty line.
 * @blk_lpos: Receives the begin/next logical positions of the new block,
 *            or one of the special data-less values.
 * @id:       The descriptor ID to store in the data block.
 *
 * Return: a pointer to the data area of the new block. NULL if @size is
 * zero (@blk_lpos is set to EMPTY_LINE_LPOS) or if the data ring tail
 * could not be pushed (@blk_lpos is set to FAILED_LPOS).
 */
static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size,
                        struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_data_block *blk;
        unsigned long begin_lpos;
        unsigned long next_lpos;

        if (size == 0) {
                /*
                 * Data blocks are not created for empty lines. Instead, the
                 * reader will recognize these special lpos values and handle
                 * it appropriately.
                 */
                blk_lpos->begin = EMPTY_LINE_LPOS;
                blk_lpos->next = EMPTY_LINE_LPOS;
                return NULL;
        }

        /*
         * NOTE(review): to_blk_size() is defined elsewhere; presumably it
         * converts the data size into the full block size (ID header plus
         * alignment). Confirm against its definition.
         */
        size = to_blk_size(size);

        begin_lpos = atomic_long_read(&data_ring->head_lpos);

        /*
         * Retry until the head is moved from @begin_lpos to @next_lpos by
         * this CPU. A failed cmpxchg refreshes @begin_lpos with the current
         * head, so @next_lpos is recomputed on every iteration.
         */
        do {
                next_lpos = get_next_lpos(data_ring, begin_lpos, size);

                /*
                 * The tail is pushed to at least one ring size behind the
                 * new head position before the head is moved there.
                 */
                if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) {
                        /* Failed to allocate, specify a data-less block. */
                        blk_lpos->begin = FAILED_LPOS;
                        blk_lpos->next = FAILED_LPOS;
                        return NULL;
                }

                /*
                 * 1. Guarantee any descriptor states that have transitioned
                 *    to reusable are stored before modifying the newly
                 *    allocated data area. A full memory barrier is needed
                 *    since other CPUs may have made the descriptor states
                 *    reusable. See data_push_tail:A about why the reusable
                 *    states are visible. This pairs with desc_read:D.
                 *
                 * 2. Guarantee any updated tail lpos is stored before
                 *    modifying the newly allocated data area. Another CPU may
                 *    be in data_make_reusable() and is reading a block ID
                 *    from this area. data_make_reusable() can handle reading
                 *    a garbage block ID value, but then it must be able to
                 *    load a new tail lpos. A full memory barrier is needed
                 *    since other CPUs may have updated the tail lpos. This
                 *    pairs with data_push_tail:B.
                 */
        } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos,
                                          next_lpos)); /* LMM(data_alloc:A) */

        /* The ID is always stored in the block located at @begin_lpos. */
        blk = to_block(data_ring, begin_lpos);
        blk->id = id; /* LMM(data_alloc:B) */

        if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) {
                /* Wrapping data blocks store their data at the beginning. */
                blk = to_block(data_ring, 0);

                /*
                 * Store the ID on the wrapped block for consistency.
                 * The printk_ringbuffer does not actually use it.
                 */
                blk->id = id;
        }

        blk_lpos->begin = begin_lpos;
        blk_lpos->next = next_lpos;

        /* For a wrapping block this points into the block at the ring start. */
        return &blk->data[0];
}

/*
 * Try to resize an existing data block associated with the descriptor
 * specified by @id. If the resized data block should become wrapped, it
 * copies the old data to the new data block. If @size yields a data block
 * with the same or less size, the data block is left as is.
 *
 * Fail if this is not the last allocated data block or if there is not
 * enough space or it is not possible to make enough space.
 *
 * @rb:       The ringbuffer containing the data block.
 * @size:     The requested new data size.
 * @blk_lpos: The logical positions of the existing block. On a successful
 *            resize, @blk_lpos->next is updated; @blk_lpos->begin is never
 *            modified here.
 * @id:       The descriptor ID associated with the data block.
 *
 * Return a pointer to the beginning of the entire data buffer or NULL on
 * failure.
 */
static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size,
                          struct prb_data_blk_lpos *blk_lpos, unsigned long id)
{
        struct prb_data_ring *data_ring = &rb->text_data_ring;
        struct prb_data_block *blk;
        unsigned long head_lpos;
        unsigned long next_lpos;
        bool wrapped;

        /* Reallocation only works if @blk_lpos is the newest data block. */
        head_lpos = atomic_long_read(&data_ring->head_lpos);
        if (head_lpos != blk_lpos->next)
                return NULL;

        /* Keep track if @blk_lpos was a wrapping data block. */
        wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next));

        /*
         * NOTE(review): to_blk_size() is defined elsewhere; presumably it
         * converts the data size into the full block size. Confirm against
         * its definition.
         */
        size = to_blk_size(size);

        next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size);

        /*
         * If the data block does not increase, there is nothing to do.
         * The unsigned difference is below the ring size only when
         * @next_lpos is at or behind the current head. @blk_lpos is left
         * untouched on this path.
         */
        if (head_lpos - next_lpos < DATA_SIZE(data_ring)) {
                if (wrapped)
                        blk = to_block(data_ring, 0);
                else
                        blk = to_block(data_ring, blk_lpos->begin);
                return &blk->data[0];
        }

        if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring)))
                return NULL;

        /*
         * The memory barrier involvement is the same as data_alloc:A.
         * Unlike data_alloc(), there is no retry: if the head has moved,
         * the reallocation fails.
         */
        if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos,
                                     next_lpos)) { /* LMM(data_realloc:A) */
                return NULL;
        }

        blk = to_block(data_ring, blk_lpos->begin);

        if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) {
                struct prb_data_block *old_blk = blk;

                /* Wrapping data blocks store their data at the beginning. */
                blk = to_block(data_ring, 0);

                /*
                 * Store the ID on the wrapped block for consistency.
                 * The printk_ringbuffer does not actually use it.
                 */
                blk->id = id;

                if (!wrapped) {
                        /*
                         * Since the allocated space is now in the newly
                         * created wrapping data block, copy the content
                         * from the old data block.
                         *
                         * The length is the old block size (still held in
                         * @blk_lpos, which is updated only below) minus the
                         * space taken by the block ID.
                         */
                        memcpy(&blk->data[0], &old_blk->data[0],
                               (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id));
                }
        }

        blk_lpos->next = next_lpos;

        return &blk->data[0];
}

/*
 * Report how many bytes of the data ring are consumed by the data block
 * described by @blk_lpos. Data-less blocks consume nothing. For a block
 * that wraps, the unused space at the end of the previous wrap is
 * included in the result.
 */
static unsigned int space_used(struct prb_data_ring *data_ring,
                               struct prb_data_blk_lpos *blk_lpos)
{
        if (BLK_DATALESS(blk_lpos)) {
                /* No data block exists, so no space is occupied. */
                return 0;
        }

        if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)) {
                /*
                 * Wrapping data block: the trailing (wasted) bytes from
                 * @begin up to the end of the ring are counted as well.
                 */
                return (DATA_INDEX(data_ring, blk_lpos->next) +
                        DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin));
        }

        /* Contiguous data block: plain distance between the two indexes. */
        return (DATA_INDEX(data_ring, blk_lpos->next) -
                DATA_INDEX(data_ring, blk_lpos->begin));
}

/*
 * Translate @blk_lpos into a pointer to the writer data of the data block
 * and store the size of that data in @data_size. NULL is returned when
 * @blk_lpos describes something that can never be a legal data block, or
 * when the block is data-less for any reason other than an empty line.
 *
 * Readers use this function, so the lpos values are validated strictly in
 * order to catch possible writer bugs. Internal inconsistencies trigger a
 * WARN_ON_ONCE().
 */
static const char *get_data(struct prb_data_ring *data_ring,
                            struct prb_data_blk_lpos *blk_lpos,
                            unsigned int *data_size)
{
        struct prb_data_block *db;

        if (BLK_DATALESS(blk_lpos)) {
                /* Data lost, invalid, or otherwise unavailable. */
                if (blk_lpos->begin != EMPTY_LINE_LPOS ||
                    blk_lpos->next != EMPTY_LINE_LPOS) {
                        return NULL;
                }

                /*
                 * An empty line is a valid record without a data block.
                 * Hand back an empty string so that the caller sees
                 * success.
                 */
                *data_size = 0;
                return "";
        }

        if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) &&
            blk_lpos->begin < blk_lpos->next) {
                /* Regular data block: both ends in one wrap, @begin first. */
                db = to_block(data_ring, blk_lpos->begin);
                *data_size = blk_lpos->next - blk_lpos->begin;
        } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) ==
                   DATA_WRAPS(data_ring, blk_lpos->next)) {
                /* Wrapping data block: @next is exactly one wrap ahead. */
                db = to_block(data_ring, 0);
                *data_size = DATA_INDEX(data_ring, blk_lpos->next);
        } else {
                /* Neither form applies: illegal block description. */
                WARN_ON_ONCE(1);
                return NULL;
        }

        /* Both ends of a valid data block are aligned to the ID size. */
        if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))))
                return NULL;
        if (WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id))))
                return NULL;

        /* A valid data block is never smaller than its ID. */
        if (WARN_ON_ONCE(*data_size < sizeof(db->id)))
                return NULL;

        /* Report only the writer data, without the block ID. */
        *data_size -= sizeof(db->id);

        return &db->data[0];
}

/*
 * Attempt to transition the newest descriptor from committed back to reserved
 * so that the record can be modified by a writer again. This is only possible
 * if the descriptor is not yet finalized and the provided @caller_id matches.
 *
 * @desc_ring: The descriptor ring containing the newest descriptor.
 * @caller_id: The caller ID that the newest record must carry.
 * @id_out:    On success, receives the ID of the reopened descriptor.
 *
 * Return: a pointer to the reopened descriptor in the ring, or NULL if the
 * newest descriptor is not in the committed state, has a different caller
 * ID, or its state changed before it could be reopened. @id_out is not
 * written on failure.
 */
static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring,
                                         u32 caller_id, unsigned long *id_out)
{
        unsigned long prev_state_val;
        enum desc_state d_state;
        struct prb_desc desc;
        struct prb_desc *d;
        unsigned long id;
        u32 cid;

        id = atomic_long_read(&desc_ring->head_id);

        /*
         * To reduce unnecessarily reopening, first check if the descriptor
         * state and caller ID are correct.
         */
        d_state = desc_read(desc_ring, id, &desc, NULL, &cid);
        if (d_state != desc_committed || cid != caller_id)
                return NULL;

        /*
         * @desc above is only a local copy. The state transition below is
         * performed on the descriptor returned by to_desc().
         */
        d = to_desc(desc_ring, id);

        /* The cmpxchg must find exactly this ID in the committed state. */
        prev_state_val = DESC_SV(id, desc_committed);

        /*
         * Guarantee the reserved state is stored before reading any
         * record data. A full memory barrier is needed because @state_var
         * modification is followed by reading. This pairs with _prb_commit:B.
         *
         * Memory barrier involvement:
         *
         * If desc_reopen_last:A reads from _prb_commit:B, then
         * prb_reserve_in_last:A reads from _prb_commit:A.
         *
         * Relies on:
         *
         * WMB from _prb_commit:A to _prb_commit:B
         *    matching
         * MB from desc_reopen_last:A to prb_reserve_in_last:A
         */
        if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
                        DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */
                return NULL;
        }

        *id_out = id;
        return d;
}

/**
 * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer
 *                         used by the newest record.
 *
 * @e:         The entry structure to setup.
 * @rb:        The ringbuffer to re-reserve and extend data in.
 * @r:         The record structure to allocate buffers for.
 * @caller_id: The caller ID of the caller (reserving writer).
 * @max_size:  Fail if the extended size would be greater than this.
 *
 * This is the public function available to writers to re-reserve and extend
 * data.
 *
 * The writer specifies the text size to extend (not the new total size) by
 * setting the @text_buf_size field of @r. To ensure proper initialization
 * of @r, prb_rec_init_wr() should be used.
 *
 * This function will fail if @caller_id does not match the caller ID of the
 * newest record. In that case the caller must reserve new data using
 * prb_reserve().
 *
 * This function will also fail if the new total size (existing text length
 * plus the requested extension) exceeds @max_size, is rejected by the text
 * data ring size check, or cannot be represented in an unsigned int.
 *
 * Context: Any context. Disables local interrupts on success.
 * Return: true if text data could be extended, otherwise false.
 *
 * On success:
 *
 *   - @r->text_buf points to the beginning of the entire text buffer.
 *
 *   - @r->text_buf_size is set to the new total size of the buffer.
 *
 *   - @r->info points to the existing meta data of the record. Its
 *     contents are kept so that @r->info->text_len could be used to
 *     append the text.
 *
 *   - prb_record_text_space() can be used on @e to query the new
 *     actually used space.
 *
 * On failure, @r is zeroed and local interrupts are restored.
 *
 * Important: All @r->info fields will already be set with the current values
 *            for the record. I.e. @r->info->text_len will be less than
 *            @text_buf_size. Writers can use @r->info->text_len to know
 *            where concatenation begins and writers should update
 *            @r->info->text_len after concatenating.
 */
bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                         struct printk_record *r, u32 caller_id, unsigned int max_size)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct printk_info *info;
        unsigned int data_size;
        struct prb_desc *d;
        unsigned long id;

        local_irq_save(e->irqflags);

        /* Transition the newest descriptor back to the reserved state. */
        d = desc_reopen_last(desc_ring, caller_id, &id);
        if (!d) {
                /* Nothing was reserved, so there is nothing to commit. */
                local_irq_restore(e->irqflags);
                goto fail_reopen;
        }

        /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */

        info = to_info(desc_ring, id);

        /*
         * Set the @e fields here so that prb_commit() can be used if
         * anything fails from now on.
         */
        e->rb = rb;
        e->id = id;

        /*
         * desc_reopen_last() checked the caller_id, but there was no
         * exclusive access at that point. The descriptor may have
         * changed since then.
         */
        if (caller_id != info->caller_id)
                goto fail;

        if (BLK_DATALESS(&d->text_blk_lpos)) {
                /* A data-less record must not claim to have any text. */
                if (WARN_ON_ONCE(info->text_len != 0)) {
                        pr_warn_once("wrong text_len value (%hu, expecting 0)\n",
                                     info->text_len);
                        info->text_len = 0;
                }

                if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                        goto fail;

                if (r->text_buf_size > max_size)
                        goto fail;

                r->text_buf = data_alloc(rb, r->text_buf_size,
                                         &d->text_blk_lpos, id);
        } else {
                if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size))
                        goto fail;

                /*
                 * Increase the buffer size to include the original size. If
                 * the meta data (@text_len) is not sane, use the full data
                 * block size.
                 */
                if (WARN_ON_ONCE(info->text_len > data_size)) {
                        pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n",
                                     info->text_len, data_size);
                        info->text_len = data_size;
                }
                r->text_buf_size += info->text_len;

                /*
                 * The addition above is unsigned and may wrap around. A
                 * wrapped total is smaller than the existing text length
                 * and could otherwise slip past the size checks below,
                 * yielding a buffer that is too small for the record.
                 */
                if (r->text_buf_size < info->text_len)
                        goto fail;

                if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                        goto fail;

                if (r->text_buf_size > max_size)
                        goto fail;

                r->text_buf = data_realloc(rb, r->text_buf_size,
                                           &d->text_blk_lpos, id);
        }
        /* A NULL buffer is only acceptable if no space was requested. */
        if (r->text_buf_size && !r->text_buf)
                goto fail;

        r->info = info;

        e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);

        return true;
fail:
        prb_commit(e);
        /* prb_commit() re-enabled interrupts. */
fail_reopen:
        /* Make it clear to the caller that the re-reserve failed. */
        memset(r, 0, sizeof(*r));
        return false;
}

/*
 * @last_finalized_seq value guarantees that all records up to and including
 * this sequence number are finalized and can be read. The only exception are
 * too old records which have already been overwritten.
 *
 * It is also guaranteed that @last_finalized_seq only increases.
 *
 * Be aware that finalized records following non-finalized records are not
 * reported because they are not yet available to the reader. For example,
 * a new record stored via printk() will not be available to a printer if
 * it follows a record that has not been finalized yet. However, once that
 * non-finalized record becomes finalized, @last_finalized_seq will be
 * appropriately updated and the full set of finalized records will be
 * available to the printer. And since each printk() caller will either
 * directly print or trigger deferred printing of all available unprinted
 * records, all printk() messages will get printed.
 */
static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long ulseq;

        /*
         * Guarantee the sequence number is loaded before loading the
         * associated record in order to guarantee that the record can be
         * seen by this CPU. This pairs with desc_update_last_finalized:A.
         */
        ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq
                                        ); /* LMM(desc_last_finalized_seq:A) */

        return __ulseq_to_u64seq(rb, ulseq);
}

/*
 * Forward declaration: _prb_read_valid() is defined further below but is
 * already needed by desc_update_last_finalized().
 */
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
                            struct printk_record *r, unsigned int *line_count);

/*
 * Check if there are records directly following @last_finalized_seq that are
 * finalized. If so, update @last_finalized_seq to the latest of these
 * records. It is not allowed to skip over records that are not yet finalized.
 *
 * If another context changes @last_finalized_seq concurrently, the search is
 * restarted from the value that context stored.
 */
static void desc_update_last_finalized(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        u64 old_seq = desc_last_finalized_seq(rb);
        unsigned long oldval;
        unsigned long newval;
        u64 finalized_seq;
        u64 try_seq;

try_again:
        /* Start (or restart) the search right behind the known value. */
        finalized_seq = old_seq;
        try_seq = finalized_seq + 1;

        /*
         * Try to find later finalized records. Passing NULL for the record
         * and line count means only the availability is checked.
         */
        while (_prb_read_valid(rb, &try_seq, NULL, NULL)) {
                finalized_seq = try_seq;
                try_seq++;
        }

        /* No update needed if no later finalized record was found. */
        if (finalized_seq == old_seq)
                return;

        oldval = __u64seq_to_ulseq(old_seq);
        newval = __u64seq_to_ulseq(finalized_seq);

        /*
         * Set the sequence number of a later finalized record that has been
         * seen.
         *
         * Guarantee the record data is visible to other CPUs before storing
         * its sequence number. This pairs with desc_last_finalized_seq:A.
         *
         * Memory barrier involvement:
         *
         * If desc_last_finalized_seq:A reads from
         * desc_update_last_finalized:A, then desc_read:A reads from
         * _prb_commit:B.
         *
         * Relies on:
         *
         * RELEASE from _prb_commit:B to desc_update_last_finalized:A
         *    matching
         * ACQUIRE from desc_last_finalized_seq:A to desc_read:A
         *
         * Note: _prb_commit:B and desc_update_last_finalized:A can be
         *       different CPUs. However, the desc_update_last_finalized:A
         *       CPU (which performs the release) must have previously seen
         *       _prb_commit:B.
         */
        if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq,
                                &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */
                /*
                 * The value changed underneath. On failure @oldval holds the
                 * value currently stored, so continue searching from there.
                 */
                old_seq = __ulseq_to_u64seq(rb, oldval);
                goto try_again;
        }
}

/*
 * Try to move the descriptor @id from the committed state to the finalized
 * state. If the transition does not happen, the descriptor is either already
 * final or it will finalize itself when the writer commits. On a successful
 * transition, @last_finalized_seq is brought up to date.
 */
static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct prb_desc *d = to_desc(desc_ring, id);
        unsigned long expected_sv = DESC_SV(id, desc_committed);

        /* Nothing more to do unless this context performed the transition. */
        if (!atomic_long_try_cmpxchg_relaxed(&d->state_var, &expected_sv,
                        DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */
                return;
        }

        desc_update_last_finalized(rb);
}

/**
 * prb_reserve() - Reserve space in the ringbuffer.
 *
 * @e:  The entry structure to setup.
 * @rb: The ringbuffer to reserve data in.
 * @r:  The record structure to allocate buffers for.
 *
 * This is the public function available to writers to reserve data.
 *
 * The writer specifies the text size to reserve by setting the
 * @text_buf_size field of @r. To ensure proper initialization of @r,
 * prb_rec_init_wr() should be used.
 *
 * Context: Any context. Disables local interrupts on success.
 * Return: true if at least text data could be allocated, otherwise false.
 *
 * On success, the fields @info and @text_buf of @r will be set by this
 * function and should be filled in by the writer before committing. Also
 * on success, prb_record_text_space() can be used on @e to query the actual
 * space used for the text data block.
 *
 * On failure, @r is zeroed and local interrupts are left as they were on
 * entry.
 *
 * Important: @info->text_len needs to be set correctly by the writer in
 *            order for data to be readable and/or extended. Its value
 *            is initialized to 0.
 */
bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb,
                 struct printk_record *r)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        struct printk_info *info;
        struct prb_desc *d;
        unsigned long id;
        u64 seq;

        /*
         * Check the requested size before anything is reserved. Interrupts
         * have not been disabled yet, so the fail path needs no restore.
         */
        if (!data_check_size(&rb->text_data_ring, r->text_buf_size))
                goto fail;

        /*
         * Descriptors in the reserved state act as blockers to all further
         * reservations once the desc_ring has fully wrapped. Disable
         * interrupts during the reserve/commit window in order to minimize
         * the likelihood of this happening.
         */
        local_irq_save(e->irqflags);

        if (!desc_reserve(rb, &id)) {
                /* Descriptor reservation failures are tracked. */
                atomic_long_inc(&rb->fail);
                local_irq_restore(e->irqflags);
                goto fail;
        }

        d = to_desc(desc_ring, id);
        info = to_info(desc_ring, id);

        /*
         * All @info fields (except @seq) are cleared and must be filled in
         * by the writer. Save @seq before clearing because it is used to
         * determine the new sequence number.
         */
        seq = info->seq;
        memset(info, 0, sizeof(*info));

        /*
         * Set the @e fields here so that prb_commit() can be used if
         * text data allocation fails.
         */
        e->rb = rb;
        e->id = id;

        /*
         * Initialize the sequence number if it has "never been set".
         * Otherwise just increment it by a full wrap.
         *
         * @seq is considered "never been set" if it has a value of 0,
         * _except_ for @infos[0], which was specially setup by the ringbuffer
         * initializer and therefore is always considered as set.
         *
         * See the "Bootstrap" comment block in printk_ringbuffer.h for
         * details about how the initializer bootstraps the descriptors.
         */
        if (seq == 0 && DESC_INDEX(desc_ring, id) != 0)
                info->seq = DESC_INDEX(desc_ring, id);
        else
                info->seq = seq + DESCS_COUNT(desc_ring);

        /*
         * New data is about to be reserved. Once that happens, previous
         * descriptors are no longer able to be extended. Finalize the
         * previous descriptor now so that it can be made available to
         * readers. (For seq==0 there is no previous descriptor.)
         */
        if (info->seq > 0)
                desc_make_final(rb, DESC_ID(id - 1));

        r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id);
        /* If text data allocation fails, a data-less record is committed. */
        if (r->text_buf_size && !r->text_buf) {
                prb_commit(e);
                /* prb_commit() re-enabled interrupts. */
                goto fail;
        }

        r->info = info;

        /* Record full text space used by record. */
        e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos);

        return true;
fail:
        /* Make it clear to the caller that the reserve failed. */
        memset(r, 0, sizeof(*r));
        return false;
}

/*
 * Commit the data (possibly finalizing it) and restore interrupts.
 *
 * @e:         The entry that was set up by the reserve functions.
 * @state_val: The descriptor state to move to. The callers in this file
 *             pass desc_committed or desc_finalized.
 *
 * The descriptor is expected to be in the reserved state with the ID stored
 * in @e. If it is not, a warning is issued once and the state is left
 * unchanged. Interrupts are restored in either case.
 */
static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val)
{
        struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
        struct prb_desc *d = to_desc(desc_ring, e->id);
        unsigned long prev_state_val = DESC_SV(e->id, desc_reserved);

        /* Now the writer has finished all writing: LMM(_prb_commit:A) */

        /*
         * Set the descriptor as committed. See "ABA Issues" about why
         * cmpxchg() instead of set() is used.
         *
         * 1. Guarantee all record data is stored before the descriptor state
         *    is stored as committed. A write memory barrier is sufficient
         *    for this. This pairs with desc_read:B and desc_reopen_last:A.
         *
         * 2. Guarantee the descriptor state is stored as committed before
         *    re-checking the head ID in order to possibly finalize this
         *    descriptor. This pairs with desc_reserve:D.
         *
         *    Memory barrier involvement:
         *
         *    If prb_commit:A reads from desc_reserve:D, then
         *    desc_make_final:A reads from _prb_commit:B.
         *
         *    Relies on:
         *
         *    MB _prb_commit:B to prb_commit:A
         *       matching
         *    MB desc_reserve:D to desc_make_final:A
         */
        if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val,
                        DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */
                /* The descriptor was not in the expected reserved state. */
                WARN_ON_ONCE(1);
        }

        /* Restore interrupts, the reserve/commit window is finished. */
        local_irq_restore(e->irqflags);
}

/**
 * prb_commit() - Commit (previously reserved) data to the ringbuffer.
 *
 * @e: The entry containing the reserved data information.
 *
 * This is the public function available to writers to commit data.
 *
 * Note that the data is not yet available to readers until it is finalized.
 * Finalizing happens automatically when space for the next record is
 * reserved.
 *
 * See prb_final_commit() for a version of this function that finalizes
 * immediately.
 *
 * Context: Any context. Enables local interrupts.
 */
void prb_commit(struct prb_reserved_entry *e)
{
        struct prb_desc_ring *desc_ring = &e->rb->desc_ring;
        unsigned long head_id;

        /* Store the committed state first; the head check below relies on it. */
        _prb_commit(e, desc_committed);

        /*
         * If this descriptor is no longer the head (i.e. a new record has
         * been allocated), extending the data for this record is no longer
         * allowed and therefore it must be finalized.
         */
        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */
        if (head_id != e->id)
                desc_make_final(e->rb, e->id);
}

/**
 * prb_final_commit() - Commit and finalize (previously reserved) data to
 *                      the ringbuffer.
 *
 * @e: The entry containing the reserved data information.
 *
 * This is the public function available to writers to commit+finalize data.
 *
 * Because the record is finalized right away, it becomes available to
 * readers immediately.
 *
 * Only use this function when the data is not going to be extended later
 * with prb_reserve_in_last().
 *
 * Context: Any context. Enables local interrupts.
 */
void prb_final_commit(struct prb_reserved_entry *e)
{
        struct printk_ringbuffer *rb = e->rb;

        /* Go straight to the finalized state, skipping "committed". */
        _prb_commit(e, desc_finalized);

        /* Let @last_finalized_seq advance over the newly finalized record. */
        desc_update_last_finalized(rb);
}

/*
 * Return the number of lines contained in the first @text_size bytes of
 * @text. Any text is considered to have at least 1 line (even when
 * @text_size is 0) and every '\n' found adds one more line.
 */
static unsigned int count_lines(const char *text, unsigned int text_size)
{
        unsigned int remaining = text_size;
        const char *cursor = text;
        unsigned int lines;

        for (lines = 1; remaining; lines++) {
                const char *newline = memchr(cursor, '\n', remaining);

                if (!newline)
                        break;

                /* Continue right behind the newline that was just found. */
                cursor = newline + 1;
                remaining = text_size - (cursor - text);
        }

        return lines;
}

/*
 * Copy an expected @len bytes of the data block described by @blk_lpos
 * into @buf (at most @buf_size bytes). If @line_count is provided, the
 * number of lines in the data is stored there.
 *
 * This function is used by readers and strictly validates the data size
 * in order to possibly detect bugs in the writer code. A WARN_ON_ONCE() is
 * triggered if an internal error is detected.
 *
 * Returns true if nothing was requested or the data could be processed,
 * false if the data block is unavailable or smaller than @len.
 */
static bool copy_data(struct prb_data_ring *data_ring,
                      struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf,
                      unsigned int buf_size, unsigned int *line_count)
{
        const bool want_text = buf && buf_size;
        unsigned int blk_size;
        const char *src;

        /* The caller may want neither the text nor the line count. */
        if (!want_text && !line_count)
                return true;

        src = get_data(data_ring, blk_lpos, &blk_size);
        if (!src)
                return false;

        /*
         * The block may be larger than expected (trailing alignment
         * padding) but never smaller.
         *
         * An invalid @len is possible here because the caller loads the
         * value during an allowed data race.
         */
        if (blk_size < (unsigned int)len)
                return false;

        if (line_count)
                *line_count = count_lines(src, len);

        /* Copy no more than what fits into the caller's buffer. */
        if (want_text)
                memcpy(&buf[0], src, min_t(unsigned int, buf_size, len)); /* LMM(copy_data:A) */

        return true;
}

/*
 * Extended variant of desc_read(): besides copying the descriptor @id to
 * @desc_out, it verifies that the record is finalized and carries the
 * sequence number @seq. Returns 0 on success.
 *
 * Error return values:
 * -EINVAL: A finalized record with sequence number @seq does not exist.
 * -ENOENT: A finalized record with sequence number @seq exists, but its data
 *          is not available. This is a valid record, so readers should
 *          continue with the next record.
 */
static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring,
                                   unsigned long id, u64 seq,
                                   struct prb_desc *desc_out)
{
        struct prb_data_blk_lpos *lpos = &desc_out->text_blk_lpos;
        enum desc_state state;
        u64 read_seq;

        state = desc_read(desc_ring, id, desc_out, &read_seq, NULL);

        /*
         * An unexpected @id (desc_miss) means the record does not exist.
         * A reserved or committed descriptor means the record does not
         * yet exist for the reader.
         */
        if (state == desc_miss)
                return -EINVAL;
        if (state == desc_reserved)
                return -EINVAL;
        if (state == desc_committed)
                return -EINVAL;

        /* The descriptor holds a different record than the one asked for. */
        if (read_seq != seq)
                return -EINVAL;

        /*
         * The data of a reusable descriptor may be gone already; report
         * the record as existing but with lost data.
         */
        if (state == desc_reusable)
                return -ENOENT;

        /* The record itself may be one that never had any data. */
        if (lpos->begin == FAILED_LPOS && lpos->next == FAILED_LPOS)
                return -ENOENT;

        return 0;
}

/*
 * Copy the ringbuffer data from the record with @seq to the provided
 * @r buffer. On success, 0 is returned.
 *
 * @r may be NULL, in which case only the availability of the record is
 * checked. If @line_count is provided, it is passed on to copy_data().
 *
 * See desc_read_finalized_seq() for error return values.
 */
static int prb_read(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r, unsigned int *line_count)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        /* The descriptor and its meta data are looked up by @seq. */
        struct printk_info *info = to_info(desc_ring, seq);
        struct prb_desc *rdesc = to_desc(desc_ring, seq);
        atomic_long_t *state_var = &rdesc->state_var;
        struct prb_desc desc;
        unsigned long id;
        int err;

        /* Extract the ID, used to specify the descriptor to read. */
        id = DESC_ID(atomic_long_read(state_var));

        /* Get a local copy of the correct descriptor (if available). */
        err = desc_read_finalized_seq(desc_ring, id, seq, &desc);

        /*
         * If @r is NULL, the caller is only interested in the availability
         * of the record.
         */
        if (err || !r)
                return err;

        /* If requested, copy meta data. */
        if (r->info)
                memcpy(r->info, info, sizeof(*(r->info)));

        /* Copy text data. If it fails, this is a data-less record. */
        if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len,
                       r->text_buf, r->text_buf_size, line_count)) {
                return -ENOENT;
        }

        /*
         * Ensure the record is still finalized and has the same @seq. This
         * re-check is what validates the copies made above.
         */
        return desc_read_finalized_seq(desc_ring, id, seq, &desc);
}

/*
 * Get the sequence number of the tail descriptor.
 *
 * The tail ID is re-read until the descriptor it refers to is found in the
 * finalized or reusable state; the sequence number read from that
 * descriptor is returned.
 */
u64 prb_first_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        enum desc_state d_state;
        struct prb_desc desc;
        unsigned long id;
        u64 seq;

        for (;;) {
                id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */

                d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */

                /*
                 * This loop will not be infinite because the tail is
                 * _always_ in the finalized or reusable state.
                 */
                if (d_state == desc_finalized || d_state == desc_reusable)
                        break;

                /*
                 * Guarantee the last state load from desc_read() is before
                 * reloading @tail_id in order to see a new tail in the case
                 * that the descriptor has been recycled. This pairs with
                 * desc_reserve:D.
                 *
                 * Memory barrier involvement:
                 *
                 * If prb_first_seq:B reads from desc_reserve:F, then
                 * prb_first_seq:A reads from desc_push_tail:B.
                 *
                 * Relies on:
                 *
                 * MB from desc_push_tail:B to desc_reserve:F
                 *    matching
                 * RMB prb_first_seq:B to prb_first_seq:A
                 */
                smp_rmb(); /* LMM(prb_first_seq:C) */
        }

        return seq;
}

/**
 * prb_next_reserve_seq() - Get the sequence number after the most recently
 *                  reserved record.
 *
 * @rb:  The ringbuffer to get the sequence number from.
 *
 * This is the public function available to readers to see what sequence
 * number will be assigned to the next reserved record.
 *
 * Note that depending on the situation, this value can be equal to or
 * higher than the sequence number returned by prb_next_seq().
 *
 * Context: Any context.
 * Return: The sequence number that will be assigned to the next record
 *         reserved.
 */
u64 prb_next_reserve_seq(struct printk_ringbuffer *rb)
{
        struct prb_desc_ring *desc_ring = &rb->desc_ring;
        unsigned long last_finalized_id;
        atomic_long_t *state_var;
        u64 last_finalized_seq;
        unsigned long head_id;
        struct prb_desc desc;
        unsigned long diff;
        struct prb_desc *d;
        int err;

        /*
         * It may not be possible to read a sequence number for @head_id.
         * So the ID of @last_finalized_seq is used to calculate what the
         * sequence number of @head_id will be.
         */

try_again:
        last_finalized_seq = desc_last_finalized_seq(rb);

        /*
         * @head_id is loaded after @last_finalized_seq to ensure that
         * it points to the record with @last_finalized_seq or newer.
         *
         * Memory barrier involvement:
         *
         * If desc_last_finalized_seq:A reads from
         * desc_update_last_finalized:A, then
         * prb_next_reserve_seq:A reads from desc_reserve:D.
         *
         * Relies on:
         *
         * RELEASE from desc_reserve:D to desc_update_last_finalized:A
         *    matching
         * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A
         *
         * Note: desc_reserve:D and desc_update_last_finalized:A can be
         *       different CPUs. However, the desc_update_last_finalized:A CPU
         *       (which performs the release) must have previously seen
         *       desc_read:C, which implies desc_reserve:D can be seen.
         */
        head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */

        d = to_desc(desc_ring, last_finalized_seq);
        state_var = &d->state_var;

        /* Extract the ID, used to specify the descriptor to read. */
        last_finalized_id = DESC_ID(atomic_long_read(state_var));

        /* Ensure @last_finalized_id is correct. */
        err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc);

        /*
         * Only -EINVAL needs special handling. With 0 or -ENOENT the
         * descriptor was confirmed to carry @last_finalized_seq, so
         * @last_finalized_id can be used as is.
         */
        if (err == -EINVAL) {
                if (last_finalized_seq == 0) {
                        /*
                         * No record has been finalized or even reserved yet.
                         *
                         * The @head_id is initialized such that the first
                         * increment will yield the first record (seq=0).
                         * Handle it separately to avoid a negative @diff
                         * below.
                         */
                        if (head_id == DESC0_ID(desc_ring->count_bits))
                                return 0;

                        /*
                         * One or more descriptors are already reserved. Use
                         * the descriptor ID of the first one (@seq=0) for
                         * the @diff below.
                         */
                        last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1;
                } else {
                        /* Record must have been overwritten. Try again. */
                        goto try_again;
                }
        }

        /* Diff of known descriptor IDs to compute related sequence numbers. */
        diff = head_id - last_finalized_id;

        /*
         * @head_id points to the most recently reserved record, but this
         * function returns the sequence number that will be assigned to the
         * next (not yet reserved) record. Thus +1 is needed.
         */
        return (last_finalized_seq + diff + 1);
}

/*
 * Non-blocking read of a record.
 *
 * On success @seq is updated to the record that was read and (if provided)
 * @r and @line_count will contain the read/calculated data.
 *
 * On failure @seq is updated to a record that is not yet available to the
 * reader, but it will be the next record available to the reader.
 *
 * Note: When the current CPU is in panic, this function will skip over any
 *       non-existent/non-finalized records in order to allow the panic CPU
 *       to print any and all records that have been finalized.
 */
static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq,
                            struct printk_record *r, unsigned int *line_count)
{
        for (;;) {
                u64 oldest_seq;
                int rc;

                rc = prb_read(rb, *seq, r, line_count);
                if (!rc)
                        return true;

                oldest_seq = prb_first_seq(rb);

                /*
                 * Behind the tail. Catch up and try again. This can happen
                 * for -ENOENT and -EINVAL cases.
                 */
                if (*seq < oldest_seq) {
                        *seq = oldest_seq;
                        continue;
                }

                /* Record exists, but the data was lost. Skip. */
                if (rc == -ENOENT) {
                        (*seq)++;
                        continue;
                }

                /*
                 * Non-existent/non-finalized record. Must stop.
                 *
                 * For panic situations it cannot be expected that
                 * non-finalized records will become finalized. But there
                 * may be other finalized records beyond that need to be
                 * printed for a panic situation. If this is the panic CPU,
                 * skip this non-existent/non-finalized record unless
                 * non-panic CPUs are still running and their debugging is
                 * explicitly enabled.
                 *
                 * Note that new messages printed on panic CPU are finalized
                 * when we are here. The only exception might be the last
                 * message without trailing newline. But it would have the
                 * sequence number returned by "prb_next_reserve_seq() - 1".
                 */
                if (!this_cpu_in_panic())
                        return false;

                if (debug_non_panic_cpus && !legacy_allow_panic_sync)
                        return false;

                if ((*seq + 1) >= prb_next_reserve_seq(rb))
                        return false;

                (*seq)++;
        }
}

/**
 * prb_read_valid() - Non-blocking read of a requested record or (if gone)
 *                    the next available record.
 *
 * @rb:  The ringbuffer to read from.
 * @seq: The sequence number of the record to read.
 * @r:   A record data buffer to store the read record to.
 *
 * This is the public function available to readers to read a record.
 *
 * The reader provides the @info and @text_buf buffers of @r to be
 * filled in. Any of the buffer pointers can be set to NULL if the reader
 * is not interested in that data. To ensure proper initialization of @r,
 * prb_rec_init_rd() should be used.
 *
 * Context: Any context.
 * Return: true if a record was read, otherwise false.
 *
 * On success, the reader must check r->info->seq to see which record was
 * actually read. This allows the reader to detect dropped records.
 *
 * Failure means @seq refers to a record not yet available to the reader.
 */
bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq,
                    struct printk_record *r)
{
        /*
         * @seq is passed by value, so the sequence number updated by
         * _prb_read_valid() is not reported back to the caller here.
         */
        return _prb_read_valid(rb, &seq, r, NULL);
}

/**
 * prb_read_valid_info() - Non-blocking read of meta data for a requested
 *                         record or (if gone) the next available record.
 *
 * @rb:         The ringbuffer to read from.
 * @seq:        The sequence number of the record to read.
 * @info:       A buffer to store the read record meta data to.
 * @line_count: A buffer to store the number of lines in the record text.
 *
 * This is the public function available to readers to read only the
 * meta data of a record.
 *
 * The reader provides the @info, @line_count buffers to be filled in.
 * Either of the buffer pointers can be set to NULL if the reader is not
 * interested in that data.
 *
 * Context: Any context.
 * Return: true if a record's meta data was read, otherwise false.
 *
 * On success, the reader must check info->seq to see which record meta data
 * was actually read. This allows the reader to detect dropped records.
 *
 * Failure means @seq refers to a record not yet available to the reader.
 */
bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq,
                         struct printk_info *info, unsigned int *line_count)
{
        struct printk_record r;

        /* Set up a record with the @info buffer only: no text buffer (NULL, 0). */
        prb_rec_init_rd(&r, info, NULL, 0);

        return _prb_read_valid(rb, &seq, &r, line_count);
}

/**
 * prb_first_valid_seq() - Report the sequence number of the oldest record
 *                         that is still available.
 *
 * @rb: The ringbuffer to query.
 *
 * Public reader interface for finding the first/oldest valid sequence
 * number, i.e. the point from which a reader can start iterating the
 * ringbuffer.
 *
 * Context: Any context.
 * Return: The sequence number of the first/oldest record, or 0 if the
 *         ringbuffer is empty.
 */
u64 prb_first_valid_seq(struct printk_ringbuffer *rb)
{
        u64 oldest = 0;

        /*
         * Ask for sequence 0 with no record buffer and no line count.
         * On success, return the value the helper left in @oldest.
         */
        return _prb_read_valid(rb, &oldest, NULL, NULL) ? oldest : 0;
}

/**
 * prb_next_seq() - Report the sequence number that follows the last
 *                  available record.
 *
 * @rb:  The ringbuffer to query.
 *
 * Public reader interface for finding out which sequence number will be
 * the next newest one to become available to readers.
 *
 * A reader that wants to skip everything currently available can jump to
 * the returned value. All records before the returned value are guaranteed
 * to have been finalized and to be (or to have been) available to the
 * reader.
 *
 * Context: Any context.
 * Return: The sequence number of the next newest (not yet available) record
 *         for readers.
 */
u64 prb_next_seq(struct printk_ringbuffer *rb)
{
        u64 next = desc_last_finalized_seq(rb);

        /*
         * Normally the search starts one past the last finalized record.
         *
         * A value of 0 is the exception: because of hack#2 of the
         * bootstrapping phase it is not known whether a record at
         * index 0 exists, so the search has to start at 0 itself.
         */
        if (next)
                next++;

        /*
         * The last finalized sequence number may be out of date, so walk
         * forward until a read fails to find the current value.
         */
        for (;;) {
                if (!_prb_read_valid(rb, &next, NULL, NULL))
                        break;
                next++;
        }

        return next;
}

/**
 * prb_init() - Set up a ringbuffer on top of caller-provided buffers.
 *
 * @rb:       The ringbuffer to initialize.
 * @text_buf: The data buffer for text data.
 * @textbits: The size of @text_buf as a power-of-2 value.
 * @descs:    The descriptor buffer for ringbuffer records.
 * @descbits: The count of @descs items as a power-of-2 value.
 * @infos:    The printk_info buffer for ringbuffer records.
 *
 * Public writer interface for setting up a ringbuffer at runtime from
 * externally provided buffers.
 *
 * The resulting state must match the initialization of DEFINE_PRINTKRB().
 *
 * Context: Any context.
 */
void prb_init(struct printk_ringbuffer *rb,
              char *text_buf, unsigned int textbits,
              struct prb_desc *descs, unsigned int descbits,
              struct printk_info *infos)
{
        struct prb_desc *last_desc;
        struct printk_info *last_info;

        /* Clear both per-record arrays before any slot is filled in. */
        memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(*descs));
        memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(*infos));

        /* The final slot of each array gets special values below. */
        last_desc = &descs[_DESCS_COUNT(descbits) - 1];
        last_info = &infos[_DESCS_COUNT(descbits) - 1];

        /* Descriptor ring: head and tail both start at DESC0_ID(). */
        rb->desc_ring.count_bits = descbits;
        rb->desc_ring.descs = descs;
        rb->desc_ring.infos = infos;
        atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits));
        atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits));
        atomic_long_set(&rb->desc_ring.last_finalized_seq, 0);

        /* Text data ring: head and tail both start at BLK0_LPOS(). */
        rb->text_data_ring.size_bits = textbits;
        rb->text_data_ring.data = text_buf;
        atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits));
        atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits));

        atomic_long_set(&rb->fail, 0);

        /* Last descriptor: DESC0_SV() state, text block marked as failed. */
        atomic_long_set(&last_desc->state_var, DESC0_SV(descbits));
        last_desc->text_blk_lpos.begin = FAILED_LPOS;
        last_desc->text_blk_lpos.next = FAILED_LPOS;

        /*
         * First info gets the negated descriptor count, last info gets 0.
         * The last assignment must stay second: with a single slot both
         * pointers refer to the same element.
         */
        infos->seq = -(u64)_DESCS_COUNT(descbits);
        last_info->seq = 0;
}

/**
 * prb_record_text_space() - Report how much ringbuffer space the text data
 *                           of a reserved entry really occupies.
 *
 * @e: The successfully reserved entry to query.
 *
 * Public writer interface for learning the full, actual amount of
 * ringbuffer space used to store the text data of the given entry.
 *
 * The result is only meaningful if @e has been successfully reserved with
 * prb_reserve().
 *
 * Context: Any context.
 * Return: The size in bytes used by the text data of the associated record.
 */
unsigned int prb_record_text_space(struct prb_reserved_entry *e)
{
        unsigned int used = e->text_space;

        return used;
}




































































































































































































































































































































































































































































































































































































   16 


























  131 





  166 










  169 


















































































































































































































































   34 




   57 




    4 




    4 







































    4 











    4 









































   25 


























   27 
































































































































   13 
  209 























































   27 











































































































































































































































































   70 
























































    8 

















    8 















   70 

















    8 

















    8 
























































































































   41 

























    6 




   33 




    6 
   33 



































  205 




   78 




  149 




































































   42 












   74 
  205 














































   78 
   78 




  149 

































































































































































































































































































































































































































































































   67 

   68 







   61 





















































































































































































    1 
  152 























































































































































































































































































































































































































































   43 

   28 

















   43 

   24 





























































   36 








    9 































    3 










    7 



  150 
































  276 
























  108 
































































































  146 









































































































































































































































































































   33 






































   26 
















































































































































































    4 
   20 
















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>
#include <linux/mount.h>
#include <linux/cred.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/maple_tree.h>
#include <linux/rw_hint.h>
#include <linux/file_ref.h>
#include <linux/unicode.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

/*
 * Forward declarations of structure types that are named, but not
 * defined, at this point in the header.
 */
struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct io_comp_batch;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
struct workqueue_struct;
struct iov_iter;
struct fscrypt_inode_info;
struct fscrypt_operations;
struct fsverity_info;
struct fsverity_operations;
struct fsnotify_mark_connector;
struct fsnotify_sb_info;
struct fs_context;
struct fs_parameter_spec;
struct fileattr;
struct iomap_ops;

/* Initialisation routines, all annotated __init. */
extern void __init inode_init(void);
extern void __init inode_init_early(void);
extern void __init files_init(void);
extern void __init files_maxfiles_init(void);

/* NOTE(review): presumably the open-file limits - confirm at the definitions. */
extern unsigned long get_max_files(void);
extern unsigned int sysctl_nr_open;

/* Short alias for __kernel_rwf_t. */
typedef __kernel_rwf_t rwf_t;

struct buffer_head;
/*
 * Function type taking an inode, a block index (sector_t), a buffer_head
 * and a create flag, and returning int.
 * NOTE(review): presumably maps file block @iblock into @bh_result - confirm.
 */
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
/*
 * Function type taking a kiocb, a file offset, a byte count and an opaque
 * private pointer, and returning int.
 * NOTE(review): presumably the direct-I/O completion callback - confirm.
 */
typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
                        ssize_t bytes, void *private);

/*
 * MAY_* flag bits. Each value occupies its own bit (0x01 .. 0x80), so the
 * flags can be OR-ed together into one mask.
 * NOTE(review): presumably the access mask passed to permission checks -
 * confirm against the callers.
 */
#define MAY_EXEC                0x00000001
#define MAY_WRITE                0x00000002
#define MAY_READ                0x00000004
#define MAY_APPEND                0x00000008
#define MAY_ACCESS                0x00000010
#define MAY_OPEN                0x00000020
#define MAY_CHDIR                0x00000040
/* called from RCU mode, don't block */
#define MAY_NOT_BLOCK                0x00000080

/*
 * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
 * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open()
 *
 * Every flag below is a single bit cast to fmode_t with __force.
 */

/* file is open for reading */
#define FMODE_READ                ((__force fmode_t)(1 << 0))
/* file is open for writing */
#define FMODE_WRITE                ((__force fmode_t)(1 << 1))
/* file is seekable */
#define FMODE_LSEEK                ((__force fmode_t)(1 << 2))
/* file can be accessed using pread */
#define FMODE_PREAD                ((__force fmode_t)(1 << 3))
/* file can be accessed using pwrite */
#define FMODE_PWRITE                ((__force fmode_t)(1 << 4))
/* File is opened for execution with sys_execve / sys_uselib */
#define FMODE_EXEC                ((__force fmode_t)(1 << 5))
/* File writes are restricted (block device specific) */
#define FMODE_WRITE_RESTRICTED        ((__force fmode_t)(1 << 6))
/* File supports atomic writes */
#define FMODE_CAN_ATOMIC_WRITE        ((__force fmode_t)(1 << 7))

/* FMODE_* bit 8 */

/* 32bit hashes as llseek() offset (for directories) */
#define FMODE_32BITHASH         ((__force fmode_t)(1 << 9))
/* 64bit hashes as llseek() offset (for directories) */
#define FMODE_64BITHASH         ((__force fmode_t)(1 << 10))

/*
 * Don't update ctime and mtime.
 *
 * Currently a special hack for the XFS open_by_handle ioctl, but we'll
 * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon.
 */
#define FMODE_NOCMTIME                ((__force fmode_t)(1 << 11))

/* Expect random access pattern */
#define FMODE_RANDOM                ((__force fmode_t)(1 << 12))

/* FMODE_* bit 13 */

/* File is opened with O_PATH; almost nothing can be done with it */
#define FMODE_PATH                ((__force fmode_t)(1 << 14))

/* File needs atomic accesses to f_pos */
#define FMODE_ATOMIC_POS        ((__force fmode_t)(1 << 15))
/* Write access to underlying fs */
#define FMODE_WRITER                ((__force fmode_t)(1 << 16))
/* Has read method(s) */
#define FMODE_CAN_READ          ((__force fmode_t)(1 << 17))
/* Has write method(s) */
#define FMODE_CAN_WRITE         ((__force fmode_t)(1 << 18))

/*
 * NOTE(review): the next two flags carry no comment in the original;
 * presumably "file has been opened" and "file was created by this open" -
 * confirm against the open path.
 */
#define FMODE_OPENED                ((__force fmode_t)(1 << 19))
#define FMODE_CREATED                ((__force fmode_t)(1 << 20))

/* File is stream-like */
#define FMODE_STREAM                ((__force fmode_t)(1 << 21))

/* File supports DIRECT IO */
#define        FMODE_CAN_ODIRECT        ((__force fmode_t)(1 << 22))

/* NOTE(review): undocumented in the original; meaning to be confirmed. */
#define        FMODE_NOREUSE                ((__force fmode_t)(1 << 23))

/* File is embedded in backing_file object */
#define FMODE_BACKING                ((__force fmode_t)(1 << 24))

/*
 * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be
 * generated (see below)
 */
#define FMODE_NONOTIFY                ((__force fmode_t)(1 << 25))

/*
 * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be
 * generated (see below)
 */
#define FMODE_NONOTIFY_PERM        ((__force fmode_t)(1 << 26))

/* File is capable of returning -EAGAIN if I/O will block */
#define FMODE_NOWAIT                ((__force fmode_t)(1 << 27))

/* File represents mount that needs unmounting */
#define FMODE_NEED_UNMOUNT        ((__force fmode_t)(1 << 28))

/* File does not contribute to nr_files count */
#define FMODE_NOACCOUNT                ((__force fmode_t)(1 << 29))

/*
 * The two FMODE_NONOTIFY* define which fsnotify events should not be generated
 * for a file. These are the possible values of (f->f_mode &
 * FMODE_FSNOTIFY_MASK) and their meaning:
 *
 * FMODE_NONOTIFY - suppress all (incl. non-permission) events.
 * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events.
 * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events.
 *
 * The predicate macros below take an f_mode value and evaluate to non-zero
 * when the corresponding class of events applies.  The argument is fully
 * parenthesized so that compound expressions (e.g. "a | b") are masked as a
 * whole.  Without CONFIG_FANOTIFY_ACCESS_PERMISSIONS the PERM and HSM
 * predicates are constant 0.
 */
#define FMODE_FSNOTIFY_MASK \
        (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)

/* True when all events are suppressed (only FMODE_NONOTIFY is set). */
#define FMODE_FSNOTIFY_NONE(mode) \
        (((mode) & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY)
#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
/* True when neither bit is set, or when both bits are set. */
#define FMODE_FSNOTIFY_PERM(mode) \
        (((mode) & FMODE_FSNOTIFY_MASK) == 0 || \
         ((mode) & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM))
/* True only when neither FMODE_NONOTIFY* bit is set. */
#define FMODE_FSNOTIFY_HSM(mode) \
        (((mode) & FMODE_FSNOTIFY_MASK) == 0)
#else
#define FMODE_FSNOTIFY_PERM(mode)        0
#define FMODE_FSNOTIFY_HSM(mode)        0
#endif

/*
 * Attribute flags.  These should be or-ed together to figure out what
 * has been changed!
 *
 * They are the bits stored in iattr.ia_valid (see struct iattr below).
 * Bit 10 has no flag defined in this list.
 */
#define ATTR_MODE        (1 << 0)
#define ATTR_UID        (1 << 1)
#define ATTR_GID        (1 << 2)
#define ATTR_SIZE        (1 << 3)
#define ATTR_ATIME        (1 << 4)
#define ATTR_MTIME        (1 << 5)
#define ATTR_CTIME        (1 << 6)
#define ATTR_ATIME_SET        (1 << 7)
#define ATTR_MTIME_SET        (1 << 8)
#define ATTR_FORCE        (1 << 9) /* Not a change, but a change it */
#define ATTR_KILL_SUID        (1 << 11)
#define ATTR_KILL_SGID        (1 << 12)
#define ATTR_FILE        (1 << 13)
#define ATTR_KILL_PRIV        (1 << 14)
#define ATTR_OPEN        (1 << 15) /* Truncating from open(O_TRUNC) */
#define ATTR_TIMES_SET        (1 << 16)
#define ATTR_TOUCH        (1 << 17)
#define ATTR_DELEG        (1 << 18) /* Delegated attrs. Don't break write delegations */

/*
 * Whiteout is represented by a char device.  The following constants define the
 * mode and device number to use.  Both are currently zero.
 */
#define WHITEOUT_MODE 0
#define WHITEOUT_DEV 0

/*
 * This is the Inode Attributes structure, used for notify_change().  It
 * uses the above definitions as flags, to know which values have changed.
 * Also, in this manner, a Filesystem can look at only the values it cares
 * about.  Basically, these are the attributes that the VFS layer can
 * request to change from the FS layer.
 *
 * Derek Atkins <warlord@MIT.EDU> 94-10-20
 */
struct iattr {
        /* Bitmask of ATTR_* flags saying which of the members below are meaningful. */
        unsigned int        ia_valid;
        umode_t                ia_mode;
        /*
         * The two anonymous unions wrap structures with the same member.
         *
         * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which
         * are a dedicated type requiring the filesystem to use the dedicated
         * helpers. Other filesystems can continue to use ia_{g,u}id until they
         * have been ported.
         *
         * They always contain the same value. In other words, FS_ALLOW_IDMAP
         * filesystems are passed the same value on idmapped mounts as they
         * would be on regular mounts.
         */
        union {
                kuid_t                ia_uid;
                vfsuid_t        ia_vfsuid;
        };
        union {
                kgid_t                ia_gid;
                vfsgid_t        ia_vfsgid;
        };
        loff_t                ia_size;
        struct timespec64 ia_atime;
        struct timespec64 ia_mtime;
        struct timespec64 ia_ctime;

        /*
         * Not an attribute, but an auxiliary info for filesystems wanting to
         * implement an ftruncate() like method.  NOTE: filesystem should
         * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL).
         */
        struct file        *ia_file;
};

/*
 * Includes for diskquotas.
 */
#include <linux/quota.h>

/*
 * Maximum number of layers of fs stack.  Needs to be limited to
 * prevent kernel stack overflow.
 */
#define FILESYSTEM_MAX_STACK_DEPTH 2

/**
 * enum positive_aop_returns - aop return codes with specific semantics
 *
 * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
 *                          completed, that the page is still locked, and
 *                          should be considered active.  The VM uses this hint
 *                          to return the page to the active list -- it won't
 *                          be a candidate for writeback again in the near
 *                          future.  Other callers must be careful to unlock
 *                          the page if they get this return.  Returned by
 *                          writepage();
 *
 * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
 *                      unlocked it and the page might have been truncated.
 *                      The caller should back up to acquiring a new page and
 *                      trying again.  The aop will be taking reasonable
 *                      precautions not to livelock.  If the caller held a page
 *                      reference, it should drop it before retrying.  Returned
 *                      by read_folio().
 *
 * address_space_operation functions return these large constants to indicate
 * special semantics to the caller.  These are much larger than the bytes in a
 * page to allow for functions that return the number of bytes operated on in a
 * given page.
 */

enum positive_aop_returns {
        AOP_WRITEPAGE_ACTIVATE        = 0x80000,
        AOP_TRUNCATED_PAGE        = 0x80001,
};

/*
 * Forward declarations: the definitions below only use pointers to these
 * types, so an incomplete declaration is enough at this point.
 */
struct page;
struct address_space;
struct writeback_control;
struct readahead_control;

/*
 * kiocb.ki_flags bits.
 *
 * Match RWF_* bits to IOCB bits.  Each expansion is wrapped in parentheses
 * so the cast cannot combine with neighbouring tokens at the point of use.
 */
#define IOCB_HIPRI                ((__force int) RWF_HIPRI)
#define IOCB_DSYNC                ((__force int) RWF_DSYNC)
#define IOCB_SYNC                 ((__force int) RWF_SYNC)
#define IOCB_NOWAIT               ((__force int) RWF_NOWAIT)
#define IOCB_APPEND               ((__force int) RWF_APPEND)
#define IOCB_ATOMIC               ((__force int) RWF_ATOMIC)
#define IOCB_DONTCACHE            ((__force int) RWF_DONTCACHE)

/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD                (1 << 16)
#define IOCB_DIRECT                (1 << 17)
#define IOCB_WRITE                (1 << 18)
/* iocb->ki_waitq is valid */
#define IOCB_WAITQ                (1 << 19)
#define IOCB_NOIO                (1 << 20)
/* can use bio alloc cache */
#define IOCB_ALLOC_CACHE        (1 << 21)
/*
 * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
 * iocb completion can be passed back to the owner for execution from a safe
 * context rather than needing to be punted through a workqueue. If this
 * flag is set, the bio completion handling may set iocb->dio_complete to a
 * handler function and iocb->private to context information for that handler.
 * The issuer should call the handler with that context information from task
 * context to complete the processing of the iocb. Note that while this
 * provides a task context for the dio_complete() callback, it should only be
 * used on the completion side for non-IO generating completions. It's fine to
 * call blocking functions from this callback, but they should not wait for
 * unrelated IO (like cache flushing, new IO generation, etc).
 */
#define IOCB_DIO_CALLER_COMP        (1 << 22)
/* kiocb is a read or write operation submitted by fs/aio.c. */
#define IOCB_AIO_RW                (1 << 23)
#define IOCB_HAS_METADATA        (1 << 24)

/*
 * for use in trace events
 *
 * Comma-separated { flag, "name" } initializers, one for every IOCB_* flag
 * defined above, in definition order.  Keep this list in sync when adding
 * a new IOCB_* flag.
 */
#define TRACE_IOCB_STRINGS \
        { IOCB_HIPRI,                "HIPRI" }, \
        { IOCB_DSYNC,                "DSYNC" }, \
        { IOCB_SYNC,                "SYNC" }, \
        { IOCB_NOWAIT,                "NOWAIT" }, \
        { IOCB_APPEND,                "APPEND" }, \
        { IOCB_ATOMIC,                "ATOMIC" }, \
        { IOCB_DONTCACHE,        "DONTCACHE" }, \
        { IOCB_EVENTFD,                "EVENTFD"}, \
        { IOCB_DIRECT,                "DIRECT" }, \
        { IOCB_WRITE,                "WRITE" }, \
        { IOCB_WAITQ,                "WAITQ" }, \
        { IOCB_NOIO,                "NOIO" }, \
        { IOCB_ALLOC_CACHE,        "ALLOC_CACHE" }, \
        { IOCB_DIO_CALLER_COMP,        "CALLER_COMP" }, \
        { IOCB_AIO_RW,                "AIO_RW" }, \
        { IOCB_HAS_METADATA,        "HAS_METADATA" }

/*
 * struct kiocb - per-request I/O control block.
 *
 * ki_flags holds IOCB_* bits.  A NULL ki_complete marks the request as
 * synchronous (see is_sync_kiocb()).  The trailing anonymous union is
 * interpreted according to the flags, as described on each member.
 */
struct kiocb {
        struct file                *ki_filp;
        loff_t                        ki_pos;
        void (*ki_complete)(struct kiocb *iocb, long ret);
        void                        *private;
        int                        ki_flags;
        u16                        ki_ioprio; /* See linux/ioprio.h */
        union {
                /*
                 * Only used for async buffered reads, where it denotes the
                 * page waitqueue associated with completing the read. Valid
                 * IFF IOCB_WAITQ is set.
                 */
                struct wait_page_queue        *ki_waitq;
                /*
                 * Can be used for O_DIRECT IO, where the completion handling
                 * is punted back to the issuer of the IO. May only be set
                 * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer
                 * must then check for presence of this handler when ki_complete
                 * is invoked. The data passed in to this handler must be
                 * assigned to ->private when dio_complete is assigned.
                 */
                ssize_t (*dio_complete)(void *data);
        };
};

/* A kiocb with no ki_complete callback installed is a synchronous one. */
static inline bool is_sync_kiocb(struct kiocb *kiocb)
{
        return !kiocb->ki_complete;
}

/*
 * struct address_space_operations - method table attached to an
 * address_space through its a_ops member.  All members are function
 * pointers.
 * NOTE(review): which methods are optional is not visible here; check the
 * callers before leaving one NULL.
 */
struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*read_folio)(struct file *, struct folio *);

        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);

        /* Mark a folio dirty.  Return true if this dirtied it */
        bool (*dirty_folio)(struct address_space *, struct folio *);

        void (*readahead)(struct readahead_control *);

        int (*write_begin)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len,
                                struct folio **foliop, void **fsdata);
        int (*write_end)(struct file *, struct address_space *mapping,
                                loff_t pos, unsigned len, unsigned copied,
                                struct folio *folio, void *fsdata);

        /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
        sector_t (*bmap)(struct address_space *, sector_t);
        void (*invalidate_folio) (struct folio *, size_t offset, size_t len);
        bool (*release_folio)(struct folio *, gfp_t);
        void (*free_folio)(struct folio *folio);
        ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
        /*
         * migrate the contents of a folio to the specified target. If
         * migrate_mode is MIGRATE_ASYNC, it must not block.
         */
        int (*migrate_folio)(struct address_space *, struct folio *dst,
                        struct folio *src, enum migrate_mode);
        int (*launder_folio)(struct folio *);
        bool (*is_partially_uptodate) (struct folio *, size_t from,
                        size_t count);
        void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb);
        int (*error_remove_folio)(struct address_space *, struct folio *);

        /* swapfile support */
        int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
                                sector_t *span);
        void (*swap_deactivate)(struct file *file);
        int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};

/*
 * An address_space_operations instance defined elsewhere.
 * NOTE(review): presumably a table with no methods set; confirm at its
 * definition.
 */
extern const struct address_space_operations empty_aops;

/**
 * struct address_space - Contents of a cacheable, mappable object.
 * @host: Owner, either the inode or the block_device.
 * @i_pages: Cached pages.
 * @invalidate_lock: Guards coherency between page cache contents and
 *   file offset->disk block mappings in the filesystem during invalidates.
 *   It is also used to block modification of page cache contents through
 *   memory mappings.
 * @gfp_mask: Memory allocation flags to use for allocating pages.
 * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings.
 * @nr_thps: Number of THPs in the pagecache (non-shmem only).
 * @i_mmap: Tree of private and shared mappings.
 * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable.
 * @nrpages: Number of page entries, protected by the i_pages lock.
 * @writeback_index: Writeback starts here.
 * @a_ops: Methods.
 * @flags: Error bits and flags (AS_*).
 * @wb_err: The most recent error which has occurred.
 * @i_private_lock: For use by the owner of the address_space.
 * @i_private_list: For use by the owner of the address_space.
 * @i_private_data: For use by the owner of the address_space.
 */
struct address_space {
        struct inode                *host;
        struct xarray                i_pages;
        struct rw_semaphore        invalidate_lock;
        gfp_t                        gfp_mask;
        atomic_t                i_mmap_writable;
#ifdef CONFIG_READ_ONLY_THP_FOR_FS
        /* number of THPs, only for non-shmem files */
        atomic_t                nr_thps;
#endif
        struct rb_root_cached        i_mmap;
        unsigned long                nrpages;
        pgoff_t                        writeback_index;
        const struct address_space_operations *a_ops;
        unsigned long                flags;
        errseq_t                wb_err;
        spinlock_t                i_private_lock;
        struct list_head        i_private_list;
        struct rw_semaphore        i_mmap_rwsem;
        void *                        i_private_data;
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
        /*
         * The aligned(sizeof(long)) attribute above: on most architectures
         * that alignment is already the case; but it must be enforced here
         * for CRIS, to let the least significant bit of struct page's
         * "mapping" pointer be used for PAGE_MAPPING_ANON.
         */

/*
 * XArray tags, for tagging dirty and writeback pages in the pagecache.
 * Each is an alias for one of the XA_MARK_* marks and can be passed as the
 * tag argument of mapping_tagged().
 */
#define PAGECACHE_TAG_DIRTY        XA_MARK_0
#define PAGECACHE_TAG_WRITEBACK        XA_MARK_1
#define PAGECACHE_TAG_TOWRITE        XA_MARK_2

/*
 * Report whether any entry in @mapping's page cache xarray carries @tag.
 */
static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag)
{
        struct xarray *pages = &mapping->i_pages;

        return xa_marked(pages, tag);
}

/* Take @mapping's i_mmap_rwsem for writing. */
static inline void i_mmap_lock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_write(sem);
}

/*
 * Try to take @mapping's i_mmap_rwsem for writing without sleeping;
 * passes back the result of down_write_trylock().
 */
static inline int i_mmap_trylock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_write_trylock(sem);
}

/* Drop the write hold on @mapping's i_mmap_rwsem. */
static inline void i_mmap_unlock_write(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_write(sem);
}

/*
 * Try to take @mapping's i_mmap_rwsem for reading without sleeping;
 * passes back the result of down_read_trylock().
 */
static inline int i_mmap_trylock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        return down_read_trylock(sem);
}

/* Take @mapping's i_mmap_rwsem for reading. */
static inline void i_mmap_lock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        down_read(sem);
}

/* Drop the read hold on @mapping's i_mmap_rwsem. */
static inline void i_mmap_unlock_read(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        up_read(sem);
}

/* Lockdep assertion that @mapping's i_mmap_rwsem is held. */
static inline void i_mmap_assert_locked(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        lockdep_assert_held(sem);
}

/* Lockdep assertion that @mapping's i_mmap_rwsem is held for writing. */
static inline void i_mmap_assert_write_locked(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->i_mmap_rwsem;

        lockdep_assert_held_write(sem);
}

/*
 * Might pages of this file be mapped into userspace?
 * Non-zero when the i_mmap tree of @mapping is not empty.
 */
static inline int mapping_mapped(struct address_space *mapping)
{
        struct rb_root *root = &mapping->i_mmap.rb_root;

        return !RB_EMPTY_ROOT(root);
}

/*
 * Might pages of this file have been modified in userspace?
 * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap
 * marks vma as VM_SHARED if it is shared, and the file was opened for
 * writing i.e. vma may be mprotected writable even if now readonly.
 *
 * If i_mmap_writable is negative, no new writable mappings are allowed. You
 * can only deny writable mappings if none exist right now.
 */
static inline int mapping_writably_mapped(struct address_space *mapping)
{
        int writable = atomic_read(&mapping->i_mmap_writable);

        return writable > 0;
}

/*
 * Account one more writable mapping on @mapping.
 * Returns 0 when the counter was incremented, -EPERM when it is negative
 * and therefore left untouched.
 */
static inline int mapping_map_writable(struct address_space *mapping)
{
        if (atomic_inc_unless_negative(&mapping->i_mmap_writable))
                return 0;

        return -EPERM;
}

static inline void mapping_unmap_writable(struct address_space *mapping)
{
        atomic_dec(&mapping->i_mmap_writable);
}

/*
 * Decrement @mapping's i_mmap_writable counter unless it is positive.
 * Returns 0 when the counter was decremented, -EBUSY when it is positive
 * and therefore left untouched.
 */
static inline int mapping_deny_writable(struct address_space *mapping)
{
        if (atomic_dec_unless_positive(&mapping->i_mmap_writable))
                return 0;

        return -EBUSY;
}

static inline void mapping_allow_writable(struct address_space *mapping)
{
        atomic_inc(&mapping->i_mmap_writable);
}

/*
 * Use sequence counter to get consistent i_size on 32-bit processors.
 *
 * On 32-bit SMP builds __NEED_I_SIZE_ORDERED is defined and
 * i_size_ordered_init() initialises the inode's i_size_seqcount; in every
 * other configuration it expands to an empty statement.  The macro argument
 * is parenthesized so any pointer expression may be passed.
 */
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
#include <linux/seqlock.h>
#define __NEED_I_SIZE_ORDERED
#define i_size_ordered_init(inode) seqcount_init(&(inode)->i_size_seqcount)
#else
#define i_size_ordered_init(inode) do { } while (0)
#endif

struct posix_acl;
/*
 * Sentinel pointer value, not a real ACL.  Like ACL_DONT_CACHE below it has
 * bit 0 set, so is_uncached_acl() reports true for both.
 */
#define ACL_NOT_CACHED ((void *)(-1))
/*
 * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to
 * cache the ACL.  This also means that ->get_inode_acl() can be called in RCU
 * mode with the LOOKUP_RCU flag.
 */
#define ACL_DONT_CACHE ((void *)(-3))

/*
 * Build a sentinel "ACL" pointer from @task: the task's address advanced
 * by one byte.  The result is only a tag and must not be dereferenced.
 */
static inline struct posix_acl *
uncached_acl_sentinel(struct task_struct *task)
{
        char *tagged = (char *)task + 1;

        return (struct posix_acl *)tagged;
}

/* True when bit 0 of the pointer value @acl is set. */
static inline bool
is_uncached_acl(struct posix_acl *acl)
{
        long bits = (long)acl;

        return (bits & 1) != 0;
}

/*
 * IOP_* flag bits.  Each is a distinct single bit; inode_set_cached_link()
 * sets IOP_CACHED_LINK in inode->i_opflags.
 */
#define IOP_FASTPERM                (1 << 0)
#define IOP_LOOKUP                  (1 << 1)
#define IOP_NOFOLLOW                (1 << 2)
#define IOP_XATTR                   (1 << 3)
#define IOP_DEFAULT_READLINK        (1 << 4)
#define IOP_MGTIME                  (1 << 5)
#define IOP_CACHED_LINK             (1 << 6)

/*
 * Keep mostly read-only and often accessed (especially for
 * the RCU path lookup and 'stat' data) fields at the beginning
 * of the 'struct inode'
 */
struct inode {
        umode_t                        i_mode;
        unsigned short                i_opflags;        /* IOP_* bits */
        kuid_t                        i_uid;
        kgid_t                        i_gid;
        unsigned int                i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations        *i_op;
        struct super_block        *i_sb;
        struct address_space        *i_mapping;

#ifdef CONFIG_SECURITY
        void                        *i_security;
#endif

        /* Stat data, not accessed from path walking */
        unsigned long                i_ino;
        /*
         * Filesystems may only read i_nlink directly.  They shall use the
         * following functions for modification:
         *
         *    (set|clear|inc|drop)_nlink
         *    inode_(inc|dec)_link_count
         */
        union {
                const unsigned int i_nlink;
                unsigned int __i_nlink;
        };
        dev_t                        i_rdev;                /* see iminor()/imajor() */
        loff_t                        i_size;                /* access via i_size_read()/i_size_write() */
        time64_t                i_atime_sec;
        time64_t                i_mtime_sec;
        time64_t                i_ctime_sec;
        u32                        i_atime_nsec;
        u32                        i_mtime_nsec;
        u32                        i_ctime_nsec;
        u32                        i_generation;
        spinlock_t                i_lock;        /* i_blocks, i_bytes, maybe i_size */
        unsigned short          i_bytes;
        u8                        i_blkbits;        /* log2 of the block size, see i_blocksize() */
        enum rw_hint                i_write_hint;
        blkcnt_t                i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
        /* Only on 32-bit SMP: lets i_size_read() detect a concurrent i_size_write() */
        seqcount_t                i_size_seqcount;
#endif

        /* Misc */
        u32                        i_state;
        /* 32-bit hole */
        struct rw_semaphore        i_rwsem;        /* taken by inode_lock() and friends */

        unsigned long                dirtied_when;        /* jiffies of first dirtying */
        unsigned long                dirtied_time_when;

        struct hlist_node        i_hash;
        struct list_head        i_io_list;        /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback        *i_wb;                /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                        i_wb_frn_winner;
        u16                        i_wb_frn_avg_time;
        u16                        i_wb_frn_history;
#endif
        struct list_head        i_lru;                /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;        /* backing dev writeback list */
        union {
                struct hlist_head        i_dentry;
                struct rcu_head                i_rcu;
        };
        atomic64_t                i_version;
        atomic64_t                i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        union {
                const struct file_operations        *i_fop;        /* former ->i_op->default_file_ops */
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space        i_data;
        union {
                struct list_head        i_devices;
                int                        i_linklen;        /* set by inode_set_cached_link() */
        };
        union {
                struct pipe_inode_info        *i_pipe;
                struct cdev                *i_cdev;
                char                        *i_link;
                unsigned                i_dir_seq;
        };


#ifdef CONFIG_FSNOTIFY
        __u32                        i_fsnotify_mask; /* all events this inode cares about */
        /* 32-bit hole reserved for expanding i_fsnotify_mask */
        struct fsnotify_mark_connector __rcu        *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_inode_info        *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
        struct fsverity_info        *i_verity_info;
#endif

        void                        *i_private; /* fs or device private pointer */
} __randomize_layout;

/*
 * Record @link and its length @linklen on @inode and flag the inode with
 * IOP_CACHED_LINK.  Warns (via VFS_WARN_ON_INODE) if @linklen does not
 * match strlen(@link) or if the flag is already set.
 */
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
        /* Sanity checks first; both only warn. */
        VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
        VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode);

        /* Store pointer and length, then set the flag last. */
        inode->i_link = link;
        inode->i_linklen = linklen;
        inode->i_opflags |= IOP_CACHED_LINK;
}

/*
 * Get bit address from inode->i_state to use with wait_var_event()
 * infrastructure.  The result is the address of i_state offset by @bit
 * bytes; it is used as a wait key (see inode_wake_up_bit()).
 */
#define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit))

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably prepares @wqe for waiting on @bit of
 * @inode->i_state and returns the matching wait queue head; confirm at the
 * definition.
 */
struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe,
                                            struct inode *inode, u32 bit);

/*
 * Wake waiters keyed on inode_state_wait_address(@inode, @bit).
 */
static inline void inode_wake_up_bit(struct inode *inode, u32 bit)
{
        void *key = inode_state_wait_address(inode, bit);

        /* Caller is responsible for correct memory barriers. */
        wake_up_var(key);
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably returns @t adjusted to the timestamp granularity
 * applicable to @inode; confirm at the definition.
 */
struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode);

/*
 * Block size of @node in bytes, computed as 2^i_blkbits.
 *
 * The shift is done on an unsigned constant so the computation matches the
 * unsigned return type and stays well defined for every shift count below
 * the width of unsigned int (a signed "1 << 31" would overflow int).
 */
static inline unsigned int i_blocksize(const struct inode *node)
{
        return 1U << node->i_blkbits;
}

/* Result of hlist_unhashed() on @inode's i_hash node. */
static inline int inode_unhashed(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        return hlist_unhashed(node);
}

/*
 * __mark_inode_dirty expects inodes to be hashed.  Since we don't
 * want special inodes in the fileset inode space, we make them
 * appear hashed, but do not put them on any lists.  hlist_del()
 * will work fine and require no locking.
 */
static inline void inode_fake_hash(struct inode *inode)
{
        struct hlist_node *node = &inode->i_hash;

        hlist_add_fake(node);
}

/*
 * inode->i_mutex nesting subclasses for the lock validator:
 *
 * 0: the object of the current VFS operation
 * 1: parent
 * 2: child/target
 * 3: xattr
 * 4: second non-directory
 * 5: second parent (when locking independent directories in rename)
 *
 * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two
 * non-directories at once.
 *
 * The locking order between these classes is
 * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory
 *
 * The numbers above are the enumerator values in declaration order.  The
 * nested lock helpers in this header (inode_lock_nested(),
 * inode_lock_shared_nested()) take a subclass argument and operate on
 * inode->i_rwsem.
 */
enum inode_i_mutex_lock_class
{
        I_MUTEX_NORMAL,
        I_MUTEX_PARENT,
        I_MUTEX_CHILD,
        I_MUTEX_XATTR,
        I_MUTEX_NONDIR2,
        I_MUTEX_PARENT2,
};

/* Take @inode's i_rwsem for writing. */
static inline void inode_lock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write(sem);
}

/* Drop the write hold on @inode's i_rwsem. */
static inline void inode_unlock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_write(sem);
}

/* Take @inode's i_rwsem for reading. */
static inline void inode_lock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read(sem);
}

/* Drop the read hold on @inode's i_rwsem. */
static inline void inode_unlock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        up_read(sem);
}

/*
 * Try to take @inode's i_rwsem for writing without sleeping; passes back
 * the result of down_write_trylock().
 */
static inline int inode_trylock(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_write_trylock(sem);
}

/*
 * Try to take @inode's i_rwsem for reading without sleeping; passes back
 * the result of down_read_trylock().
 */
static inline int inode_trylock_shared(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return down_read_trylock(sem);
}

/* Result of rwsem_is_locked() on @inode's i_rwsem. */
static inline int inode_is_locked(struct inode *inode)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        return rwsem_is_locked(sem);
}

/* Take @inode's i_rwsem for writing, annotated with lock @subclass. */
static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_write_nested(sem, subclass);
}

/* Take @inode's i_rwsem for reading, annotated with lock @subclass. */
static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass)
{
        struct rw_semaphore *sem = &inode->i_rwsem;

        down_read_nested(sem, subclass);
}

/* Take @mapping's invalidate_lock for writing. */
static inline void filemap_invalidate_lock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_write(sem);
}

/* Drop the write hold on @mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_write(sem);
}

/* Take @mapping's invalidate_lock for reading. */
static inline void filemap_invalidate_lock_shared(struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        down_read(sem);
}

/*
 * Try to take @mapping's invalidate_lock for reading without sleeping;
 * passes back the result of down_read_trylock().
 */
static inline int filemap_invalidate_trylock_shared(
                                        struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        return down_read_trylock(sem);
}

/* Drop the read hold on @mapping's invalidate_lock. */
static inline void filemap_invalidate_unlock_shared(
                                        struct address_space *mapping)
{
        struct rw_semaphore *sem = &mapping->invalidate_lock;

        up_read(sem);
}

/*
 * Two-object lock/unlock helpers; declared here, defined elsewhere.
 * NOTE(review): ordering rules and NULL handling are not visible from this
 * header; check the definitions before relying on them.
 */
void lock_two_nondirectories(struct inode *, struct inode*);
void unlock_two_nondirectories(struct inode *, struct inode*);

void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2);
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2);


/*
 * NOTE: on a 32-bit arch with a preemptible kernel and a UP compile,
 * i_size_read/write must be atomic with respect to the local cpu (unlike
 * with preempt disabled), but they don't need to be atomic with respect
 * to other cpus like in true SMP (so they need to either locally disable
 * irq around the read or, for example on x86, they can still be
 * implemented as a cmpxchg8b without the need of the lock prefix). For
 * SMP compiles and 64-bit archs it makes no difference if preempt is
 * enabled or not.
 *
 * Returns the current inode->i_size using one of three strategies chosen
 * at compile time.
 */
static inline loff_t i_size_read(const struct inode *inode)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        loff_t i_size;
        unsigned int seq;

        /* Retry until the read was not overlapped by an i_size_write(). */
        do {
                seq = read_seqcount_begin(&inode->i_size_seqcount);
                i_size = inode->i_size;
        } while (read_seqcount_retry(&inode->i_size_seqcount, seq));
        return i_size;
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        loff_t i_size;

        /* Keep the 64-bit load from being preempted half way through. */
        preempt_disable();
        i_size = inode->i_size;
        preempt_enable();
        return i_size;
#else
        /* Pairs with smp_store_release() in i_size_write() */
        return smp_load_acquire(&inode->i_size);
#endif
}

/*
 * NOTE: unlike i_size_read(), i_size_write() does need locking around it
 * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount
 * can be lost, resulting in subsequent i_size_read() calls spinning forever.
 *
 * Stores @i_size into inode->i_size using the strategy that matches the
 * one i_size_read() selects for the same configuration.
 */
static inline void i_size_write(struct inode *inode, loff_t i_size)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
        /* Preemption stays off for the whole seqcount write section. */
        preempt_disable();
        write_seqcount_begin(&inode->i_size_seqcount);
        inode->i_size = i_size;
        write_seqcount_end(&inode->i_size_seqcount);
        preempt_enable();
#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
        preempt_disable();
        inode->i_size = i_size;
        preempt_enable();
#else
        /*
         * Pairs with smp_load_acquire() in i_size_read() to ensure
         * changes related to inode size (such as page contents) are
         * visible before we see the changed inode size.
         */
        smp_store_release(&inode->i_size, i_size);
#endif
}

static inline unsigned iminor(const struct inode *inode)
{
        return MINOR(inode->i_rdev);
}

static inline unsigned imajor(const struct inode *inode)
{
        return MAJOR(inode->i_rdev);
}

/*
 * struct fown_struct - owner information used for SIGIO delivery on a file
 * (see the per-member comments).  struct file refers to one through its
 * f_owner pointer.
 */
struct fown_struct {
        struct file *file;        /* backpointer for security modules */
        rwlock_t lock;          /* protects pid, uid, euid fields */
        struct pid *pid;        /* pid or -pgrp where SIGIO should be sent */
        enum pid_type pid_type;        /* Kind of process group SIGIO should be sent to */
        kuid_t uid, euid;        /* uid/euid of process setting the owner */
        int signum;                /* posix.1b rt signal to be delivered on IO */
};

/**
 * struct file_ra_state - Track a file's readahead state.
 * @start: Where the most recent readahead started.
 * @size: Number of pages read in the most recent readahead.
 * @async_size: Number of pages that were/are not needed immediately
 *      and so were/are genuinely "ahead".  Start next readahead when
 *      the first of these pages is accessed.
 * @ra_pages: Maximum size of a readahead request, copied from the bdi.
 * @mmap_miss: How many mmap accesses missed in the page cache.
 * @prev_pos: The last byte in the most recent read request.
 *
 * When this structure is passed to ->readahead(), the "most recent"
 * readahead means the current readahead.
 */
struct file_ra_state {
        pgoff_t start;
        unsigned int size;
        unsigned int async_size;
        unsigned int ra_pages;
        unsigned int mmap_miss;
        loff_t prev_pos;
};

/*
 * Report whether @index lies inside the readahead window, i.e. in the
 * half-open range [ra->start, ra->start + ra->size).
 */
static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
{
        if (index < ra->start)
                return 0;

        return index < ra->start + ra->size;
}

/**
 * struct file - Represents a file
 * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
 * @f_mode: FMODE_* flags often used in hotpaths
 * @f_op: file operations
 * @f_mapping: Contents of a cacheable, mappable object.
 * @private_data: filesystem or driver specific data
 * @f_inode: cached inode
 * @f_flags: file flags
 * @f_iocb_flags: iocb flags
 * @f_cred: stashed credentials of creator/opener
 * @f_owner: file owner
 * @f_path: path of the file
 * @f_pos_lock: lock protecting file position
 * @f_pipe: specific to pipes
 * @f_pos: file position
 * @f_security: LSM security context of this file
 * @f_wb_err: writeback error
 * @f_sb_err: per sb writeback errors
 * @f_ep: link of all epoll hooks for this file
 * @f_task_work: task work entry point
 * @f_llist: work queue entrypoint
 * @f_ra: file's readahead state
 * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
 * @f_ref: reference count
 *
 * The "cacheline N boundary" markers below annotate the intended layout;
 * note that the structure is also tagged __randomize_layout.
 */
struct file {
        spinlock_t                        f_lock;
        fmode_t                                f_mode;
        const struct file_operations        *f_op;
        struct address_space                *f_mapping;
        void                                *private_data;
        struct inode                        *f_inode;
        unsigned int                        f_flags;
        unsigned int                        f_iocb_flags;
        const struct cred                *f_cred;
        struct fown_struct                *f_owner;
        /* --- cacheline 1 boundary (64 bytes) --- */
        struct path                        f_path;
        /* f_pos_lock and f_pipe overlap: which one applies depends on the file kind */
        union {
                /* regular files (with FMODE_ATOMIC_POS) and directories */
                struct mutex                f_pos_lock;
                /* pipes */
                u64                        f_pipe;
        };
        loff_t                                f_pos;
#ifdef CONFIG_SECURITY
        void                                *f_security;
#endif
        /* --- cacheline 2 boundary (128 bytes) --- */
        errseq_t                        f_wb_err;
        errseq_t                        f_sb_err;
#ifdef CONFIG_EPOLL
        struct hlist_head                *f_ep;
#endif
        /* The members below share storage, so only one is usable at a time */
        union {
                struct callback_head        f_task_work;
                struct llist_node        f_llist;
                struct file_ra_state        f_ra;
                freeptr_t                f_freeptr;
        };
        file_ref_t                        f_ref;
        /* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
  __attribute__((aligned(4)));        /* lest something weird decides that 2 is OK */

/*
 * Variable-length file handle: a small header followed by the identifier
 * bytes in the flexible array f_handle[], whose element count is given by
 * handle_bytes (see the __counted_by() annotation).
 */
struct file_handle {
        __u32 handle_bytes;
        int handle_type;
        /* file identifier */
        unsigned char f_handle[] __counted_by(handle_bytes);
};

static inline struct file *get_file(struct file *f)
{
        file_ref_inc(&f->f_ref);
        return f;
}

/*
 * Implemented out of line.
 * NOTE(review): presumably both return a referenced file looked up through
 * the pointer slot @f (or NULL) - confirm against the definitions.
 */
struct file *get_file_rcu(struct file __rcu **f);
struct file *get_file_active(struct file **f);

/* Value of @f's reference count as reported by file_ref_read(). */
#define file_count(f)        file_ref_read(&(f)->f_ref)

/* 2^31 - 1: the largest value a signed 32-bit quantity can hold. */
#define        MAX_NON_LFS        ((1UL<<31) - 1)

/*
 * Page cache limit. The filesystems should put that into their s_maxbytes
 * limits, otherwise bad things can happen in VM.
 *
 * On 32-bit this is ULONG_MAX pages expressed in bytes; on 64-bit it is
 * simply LLONG_MAX.
 */
#if BITS_PER_LONG==32
#define MAX_LFS_FILESIZE        ((loff_t)ULONG_MAX << PAGE_SHIFT)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE         ((loff_t)LLONG_MAX)
#endif

/* legacy typedef, should eventually be removed */
typedef void *fl_owner_t;

/* Forward declarations only; the full definitions are not in this part of the file. */
struct file_lock;
struct file_lease;

/*
 * The following constant reflects the upper bound of the file/locking space.
 * Both macros are only provided here when OFFSET_MAX is not already defined.
 */
#ifndef OFFSET_MAX
#define OFFSET_MAX        type_max(loff_t)
#define OFFT_OFFSET_MAX        type_max(off_t)
#endif

int file_f_owner_allocate(struct file *file);

/* Fetch @file's f_owner pointer using a single READ_ONCE() load. */
static inline struct fown_struct *file_f_owner(const struct file *file)
{
        struct fown_struct *owner = READ_ONCE(file->f_owner);

        return owner;
}

/*
 * Implemented out of line.
 * NOTE(review): presumably signals the owner described by @fown about IO on
 * descriptor @fd with poll band @band - confirm against the definition.
 */
extern void send_sigio(struct fown_struct *fown, int fd, int band);

/* Return the inode cached in @f (its f_inode member). */
static inline struct inode *file_inode(const struct file *f)
{
        struct inode *inode = f->f_inode;

        return inode;
}

/*
 * file_dentry() dates back to when overlayfs used files with a "fake" path:
 * f_path pointed into overlayfs while f_inode belonged to the underlying
 * filesystem, and this helper was needed to find the underlying dentry that
 * matched f_inode.
 * Such files should no longer exist, so instead of translating anything we
 * return f_path.dentry and warn (once) if its inode is not f_inode, making
 * sure file_dentry() is not papering over filesystem bugs.
 */
static inline struct dentry *file_dentry(const struct file *file)
{
        WARN_ON_ONCE(d_inode(file->f_path.dentry) != file_inode(file));

        return file->f_path.dentry;
}

/*
 * One node of a singly linked list of fasync entries; each node records a
 * file and a descriptor number and carries its own lock and RCU head.
 */
struct fasync_struct {
        rwlock_t                fa_lock;
        int                        magic;
        int                        fa_fd;
        struct fasync_struct        *fa_next; /* singly linked list */
        struct file                *fa_file;
        struct rcu_head                fa_rcu;
};

/* NOTE(review): presumably the value expected in fasync_struct.magic - confirm. */
#define FASYNC_MAGIC 0x4601

/*
 * SMP safe fasync helpers (all implemented out of line).  They operate on a
 * list head passed as struct fasync_struct **.
 */
extern int fasync_helper(int, struct file *, int, struct fasync_struct **);
extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *);
extern int fasync_remove_entry(struct file *, struct fasync_struct **);
extern struct fasync_struct *fasync_alloc(void);
extern void fasync_free(struct fasync_struct *);

/* can be called from interrupts */
extern void kill_fasync(struct fasync_struct **, int, int);

/*
 * File owner management, implemented out of line.
 * NOTE(review): presumably these set, clear and query the owner recorded in
 * the file's fown_struct - confirm against the definitions.
 */
extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
extern int f_setown(struct file *filp, int who, int force);
extern void f_delown(struct file *filp);
extern pid_t f_getown(struct file *filp);
extern int send_sigurg(struct file *file);

/*
 * sb->s_flags.  Note that these mirror the equivalent MS_* flags where
 * represented in both.
 *
 * The bit numbers are not contiguous: bits 5, 8, 9, 12-14 and 18-20 are not
 * assigned by any flag in this list.
 */
#define SB_RDONLY       BIT(0)        /* Mount read-only */
#define SB_NOSUID       BIT(1)        /* Ignore suid and sgid bits */
#define SB_NODEV        BIT(2)        /* Disallow access to device special files */
#define SB_NOEXEC       BIT(3)        /* Disallow program execution */
#define SB_SYNCHRONOUS  BIT(4)        /* Writes are synced at once */
#define SB_MANDLOCK     BIT(6)        /* Allow mandatory locks on an FS */
#define SB_DIRSYNC      BIT(7)        /* Directory modifications are synchronous */
#define SB_NOATIME      BIT(10)        /* Do not update access times. */
#define SB_NODIRATIME   BIT(11)        /* Do not update directory access times */
#define SB_SILENT       BIT(15)
#define SB_POSIXACL     BIT(16)        /* Supports POSIX ACLs */
#define SB_INLINECRYPT  BIT(17)        /* Use blk-crypto for encrypted files */
#define SB_KERNMOUNT    BIT(22)        /* this is a kern_mount call */
#define SB_I_VERSION    BIT(23)        /* Update inode I_version field */
#define SB_LAZYTIME     BIT(25)        /* Update the on-disk [acm]times lazily */

/* These sb flags are internal to the kernel (they share the same bit space) */
#define SB_DEAD         BIT(21)
#define SB_DYING        BIT(24)
#define SB_SUBMOUNT     BIT(26)
#define SB_FORCE        BIT(27)
#define SB_NOSEC        BIT(28)
#define SB_BORN         BIT(29)
#define SB_ACTIVE       BIT(30)
#define SB_NOUSER       BIT(31)

/* These flags relate to encoding and casefolding */
#define SB_ENC_STRICT_MODE_FL                (1 << 0)
#define SB_ENC_NO_COMPAT_FALLBACK_FL        (1 << 1)

/*
 * Evaluates to non-zero when SB_ENC_STRICT_MODE_FL is set in
 * @sb->s_encoding_flags.
 *
 * @sb is parenthesized so that any pointer-valued expression (pointer
 * arithmetic, a conditional expression, ...) can be passed, not just a
 * plain identifier.
 */
#define sb_has_strict_encoding(sb) \
        ((sb)->s_encoding_flags & SB_ENC_STRICT_MODE_FL)

/*
 * With CONFIG_UNICODE: non-zero when SB_ENC_NO_COMPAT_FALLBACK_FL is set in
 * @sb->s_encoding_flags.  Without it the macro is the constant 1 and @sb is
 * not evaluated.
 *
 * @sb is parenthesized so that any pointer-valued expression can be passed.
 */
#if IS_ENABLED(CONFIG_UNICODE)
#define sb_no_casefold_compat_fallback(sb) \
        ((sb)->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL)
#else
#define sb_no_casefold_compat_fallback(sb) (1)
#endif

/*
 * Umount options
 */

#define MNT_FORCE        0x00000001        /* Attempt to forcibly umount */
#define MNT_DETACH        0x00000002        /* Just detach from the tree */
#define MNT_EXPIRE        0x00000004        /* Mark for expiry */
#define UMOUNT_NOFOLLOW        0x00000008        /* Don't follow symlink on umount */
#define UMOUNT_UNUSED        0x80000000        /* Flag guaranteed to be unused */

/* sb->s_iflags: internal SB_I_* flags, one bit each */
#define SB_I_CGROUPWB        0x00000001        /* cgroup-aware writeback enabled */
#define SB_I_NOEXEC        0x00000002        /* Ignore executables on this fs */
#define SB_I_NODEV        0x00000004        /* Ignore devices on this fs */
#define SB_I_STABLE_WRITES 0x00000008        /* don't modify blks until WB is done */

/* sb->s_iflags to limit user namespace mounts */
#define SB_I_USERNS_VISIBLE                0x00000010 /* fstype already mounted */
#define SB_I_IMA_UNVERIFIABLE_SIGNATURE        0x00000020
#define SB_I_UNTRUSTED_MOUNTER                0x00000040
#define SB_I_EVM_HMAC_UNSUPPORTED        0x00000080

#define SB_I_SKIP_SYNC        0x00000100        /* Skip superblock at global sync */
#define SB_I_PERSB_BDI        0x00000200        /* has a per-sb bdi */
#define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */
#define SB_I_RETIRED        0x00000800        /* superblock shouldn't be reused */
#define SB_I_NOUMASK        0x00001000        /* VFS does not apply umask */
#define SB_I_NOIDMAP        0x00002000        /* No idmapped mounts on this superblock */
#define SB_I_ALLOW_HSM        0x00004000        /* Allow HSM events on this superblock */

/*
 * Possible states of the 'frozen' field.  The values are consecutive,
 * starting at 0; the number in each comment is the enumerator's value.
 */
enum {
        /* 0: FS is unfrozen */
        SB_UNFROZEN = 0,
        /* 1: Writes, dir ops, ioctls frozen */
        SB_FREEZE_WRITE,
        /* 2: Page faults stopped as well */
        SB_FREEZE_PAGEFAULT,
        /* 3: For internal FS use (e.g. to stop internal threads if needed) */
        SB_FREEZE_FS,
        /* 4: ->freeze_fs finished successfully */
        SB_FREEZE_COMPLETE,
};

/* Number of freeze levels below SB_FREEZE_COMPLETE (WRITE, PAGEFAULT, FS) */
#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)

/*
 * Freeze bookkeeping of a superblock: the current freeze state, counters of
 * outstanding freeze requests, and one percpu rw-semaphore per freeze level.
 * The semaphore for freeze level L (SB_FREEZE_WRITE..SB_FREEZE_FS) lives at
 * rw_sem[L - 1].
 */
struct sb_writers {
        unsigned short                        frozen;                /* Is sb frozen? */
        int                                freeze_kcount;        /* How many kernel freeze requests? */
        int                                freeze_ucount;        /* How many userspace freeze requests? */
        struct percpu_rw_semaphore        rw_sem[SB_FREEZE_LEVELS];
};

/*
 * In-memory superblock.  The layout is randomized (__randomize_layout), so
 * nothing may rely on member order beyond what the comments below state.
 */
struct super_block {
        struct list_head        s_list;                /* Keep this first */
        dev_t                        s_dev;                /* search index; _not_ kdev_t */
        unsigned char                s_blocksize_bits;
        unsigned long                s_blocksize;
        loff_t                        s_maxbytes;        /* Max file size */
        struct file_system_type        *s_type;
        const struct super_operations        *s_op;
        const struct dquot_operations        *dq_op;
        const struct quotactl_ops        *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long                s_flags;        /* SB_* flags */
        unsigned long                s_iflags;        /* internal SB_I_* flags */
        unsigned long                s_magic;
        struct dentry                *s_root;
        struct rw_semaphore        s_umount;
        int                        s_count;
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler * const *s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations        *s_cop;
        struct fscrypt_keyring        *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#if IS_ENABLED(CONFIG_UNICODE)
        struct unicode_map *s_encoding;
        __u16 s_encoding_flags;                /* SB_ENC_*_FL flags */
#endif
        struct hlist_bl_head        s_roots;        /* alternate root dentries for NFS */
        struct list_head        s_mounts;        /* list of mounts; _not_ for fs use */
        struct block_device        *s_bdev;        /* can go away once we use an accessor for @s_bdev_file */
        struct file                *s_bdev_file;
        struct backing_dev_info *s_bdi;
        struct mtd_info                *s_mtd;
        struct hlist_node        s_instances;
        unsigned int                s_quota_types;        /* Bitmask of supported quota types */
        struct quota_info        s_dquot;        /* Diskquota specific options */

        struct sb_writers        s_writers;

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_info together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                        *s_fs_info;        /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                        s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        u32                        s_fsnotify_mask;
        struct fsnotify_sb_info        *s_fsnotify_info;
#endif

        /*
         * q: why are s_id and s_sysfs_name not the same? both are human
         * readable strings that identify the filesystem
         * a: s_id is allowed to change at runtime; it's used in log messages,
         * and we want to be able to change it when a filesystem starts out
         * on a single device (s_id is the dev name) but then a device is hot
         * added and we have to switch to identifying it by UUID
         * but s_sysfs_name is a handle for programmatic access, and can't
         * change at runtime
         */
        char                        s_id[32];        /* Informational name */
        uuid_t                        s_uuid;                /* UUID */
        u8                        s_uuid_len;        /* Default 16, possibly smaller for weird filesystems */

        /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */
        char                        s_sysfs_name[UUID_STRING_LEN + 1];

        unsigned int                s_max_links;

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *s_d_op; /* default d_op for dentries */

        struct shrinker *s_shrink;        /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /* Read-only state of the superblock is being changed */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru                s_dentry_lru;
        struct list_lru                s_inode_lru;
        struct rcu_head                rcu;
        struct work_struct        destroy_work;

        struct mutex                s_sync_lock;        /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t                s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;        /* all inodes */

        spinlock_t                s_inode_wblist_lock;
        struct list_head        s_inodes_wb;        /* writeback inodes */
} __randomize_layout;

static inline struct user_namespace *i_user_ns(const struct inode *inode)
{
        return inode->i_sb->s_user_ns;
}

/*
 * Helper functions (i_uid_read(), i_gid_read(), i_uid_write() and
 * i_gid_write()) so that in most cases filesystems do not have to handle
 * kuid_t and kgid_t directly and can work with the raw numeric values that
 * are stored in the filesystem.
 */

/* Raw uid of @inode, translated from i_uid through the inode's user namespace. */
static inline uid_t i_uid_read(const struct inode *inode)
{
        uid_t uid = from_kuid(i_user_ns(inode), inode->i_uid);

        return uid;
}

/* Raw gid of @inode, translated from i_gid through the inode's user namespace. */
static inline gid_t i_gid_read(const struct inode *inode)
{
        gid_t gid = from_kgid(i_user_ns(inode), inode->i_gid);

        return gid;
}

/* Store raw @uid into @inode->i_uid, mapped through the inode's user namespace. */
static inline void i_uid_write(struct inode *inode, uid_t uid)
{
        struct user_namespace *ns = i_user_ns(inode);

        inode->i_uid = make_kuid(ns, uid);
}

/* Store raw @gid into @inode->i_gid, mapped through the inode's user namespace. */
static inline void i_gid_write(struct inode *inode, gid_t gid)
{
        struct user_namespace *ns = i_user_ns(inode);

        inode->i_gid = make_kgid(ns, gid);
}

/**
 * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to map
 *
 * Return: the inode's i_uid mapped down according to @idmap.
 * If the inode's i_uid has no mapping INVALID_VFSUID is returned.
 */
static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        vfsuid_t vfsuid = make_vfsuid(idmap, i_user_ns(inode), inode->i_uid);

        return vfsuid;
}

/**
 * i_uid_needs_update - check whether inode's i_uid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_uid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_uid field needs to be updated, false if not.
 */
static inline bool i_uid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_UID) &&
                !vfsuid_eq(attr->ia_vfsuid,
                           i_uid_into_vfsuid(idmap, inode)));
}

/**
 * i_uid_update - update @inode's i_uid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Safely update @inode's i_uid field translating the vfsuid of any idmapped
 * mount into the filesystem kuid.  Does nothing unless ATTR_UID is set in
 * @attr->ia_valid.
 */
static inline void i_uid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_UID))
                return;

        inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid);
}

/**
 * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to map
 *
 * Return: the inode's i_gid mapped down according to @idmap.
 * If the inode's i_gid has no mapping INVALID_VFSGID is returned.
 */
static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap,
                                         const struct inode *inode)
{
        vfsgid_t vfsgid = make_vfsgid(idmap, i_user_ns(inode), inode->i_gid);

        return vfsgid;
}

/**
 * i_gid_needs_update - check whether inode's i_gid needs to be updated
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Check whether the $inode's i_gid field needs to be updated taking idmapped
 * mounts into account if the filesystem supports it.
 *
 * Return: true if @inode's i_gid field needs to be updated, false if not.
 */
static inline bool i_gid_needs_update(struct mnt_idmap *idmap,
                                      const struct iattr *attr,
                                      const struct inode *inode)
{
        return ((attr->ia_valid & ATTR_GID) &&
                !vfsgid_eq(attr->ia_vfsgid,
                           i_gid_into_vfsgid(idmap, inode)));
}

/**
 * i_gid_update - update @inode's i_gid field
 * @idmap: idmap of the mount the inode was found from
 * @attr: the new attributes of @inode
 * @inode: the inode to update
 *
 * Safely update @inode's i_gid field translating the vfsgid of any idmapped
 * mount into the filesystem kgid.  Does nothing unless ATTR_GID is set in
 * @attr->ia_valid.
 */
static inline void i_gid_update(struct mnt_idmap *idmap,
                                const struct iattr *attr,
                                struct inode *inode)
{
        if (!(attr->ia_valid & ATTR_GID))
                return;

        inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid);
}

/**
 * inode_fsuid_set - initialize inode's i_uid field with callers fsuid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Initialize the i_uid field of @inode. If the inode was found/created via
 * an idmapped mount map the caller's fsuid according to @idmap.
 */
static inline void inode_fsuid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_uid = mapped_fsuid(idmap, fs_userns);
}

/**
 * inode_fsgid_set - initialize inode's i_gid field with callers fsgid
 * @inode: inode to initialize
 * @idmap: idmap of the mount the inode was found from
 *
 * Initialize the i_gid field of @inode. If the inode was found/created via
 * an idmapped mount map the caller's fsgid according to @idmap.
 */
static inline void inode_fsgid_set(struct inode *inode,
                                   struct mnt_idmap *idmap)
{
        struct user_namespace *fs_userns = i_user_ns(inode);

        inode->i_gid = mapped_fsgid(idmap, fs_userns);
}

/**
 * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped
 * @sb: the superblock we want a mapping in
 * @idmap: idmap of the relevant mount
 *
 * Check whether the caller's fsuid and fsgid have a valid mapping in the
 * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map
 * the caller's fsuid and fsgid according to the @idmap first.
 *
 * Return: true if fsuid and fsgid is mapped, false if not.
 */
static inline bool fsuidgid_has_mapping(struct super_block *sb,
                                        struct mnt_idmap *idmap)
{
        struct user_namespace *ns = sb->s_user_ns;
        kuid_t fsuid = mapped_fsuid(idmap, ns);
        kgid_t fsgid;

        if (!uid_valid(fsuid))
                return false;

        /* Only look at the gid once the uid is known to be valid. */
        fsgid = mapped_fsgid(idmap, ns);
        if (!gid_valid(fsgid))
                return false;

        if (!kuid_has_mapping(ns, fsuid))
                return false;

        return kgid_has_mapping(ns, fsgid);
}

/*
 * Timestamp helpers implemented out of line; each takes the inode to operate
 * on and returns a struct timespec64 by value.
 */
struct timespec64 current_time(struct inode *inode);
struct timespec64 inode_set_ctime_current(struct inode *inode);
struct timespec64 inode_set_ctime_deleg(struct inode *inode,
                                        struct timespec64 update);

/* Seconds part of @inode's access time. */
static inline time64_t inode_get_atime_sec(const struct inode *inode)
{
        time64_t sec = inode->i_atime_sec;

        return sec;
}

/* Nanoseconds part of @inode's access time. */
static inline long inode_get_atime_nsec(const struct inode *inode)
{
        long nsec = inode->i_atime_nsec;

        return nsec;
}

static inline struct timespec64 inode_get_atime(const struct inode *inode)
{
        struct timespec64 ts = { .tv_sec  = inode_get_atime_sec(inode),
                                 .tv_nsec = inode_get_atime_nsec(inode) };

        return ts;
}

static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->i_atime_sec = ts.tv_sec;
        inode->i_atime_nsec = ts.tv_nsec;
        return ts;
}

/* Set @inode's access time to { @sec, @nsec }; returns the value stored. */
static inline struct timespec64 inode_set_atime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_atime_to_ts(inode, (struct timespec64){
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/* Seconds part of @inode's modification time. */
static inline time64_t inode_get_mtime_sec(const struct inode *inode)
{
        time64_t sec = inode->i_mtime_sec;

        return sec;
}

/* Nanoseconds part of @inode's modification time. */
static inline long inode_get_mtime_nsec(const struct inode *inode)
{
        long nsec = inode->i_mtime_nsec;

        return nsec;
}

static inline struct timespec64 inode_get_mtime(const struct inode *inode)
{
        struct timespec64 ts = { .tv_sec  = inode_get_mtime_sec(inode),
                                 .tv_nsec = inode_get_mtime_nsec(inode) };
        return ts;
}

static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode,
                                                      struct timespec64 ts)
{
        inode->i_mtime_sec = ts.tv_sec;
        inode->i_mtime_nsec = ts.tv_nsec;
        return ts;
}

/* Set @inode's modification time to { @sec, @nsec }; returns the value stored. */
static inline struct timespec64 inode_set_mtime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_mtime_to_ts(inode, (struct timespec64){
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/*
 * Multigrain timestamps
 *
 * Conditionally use fine-grained ctime and mtime timestamps when there
 * are users actively observing them via getattr. The primary use-case
 * for this is NFS clients that use the ctime to distinguish between
 * different states of the file, and that are often fooled by multiple
 * operations that occur in the same coarse-grained timer tick.
 *
 * I_CTIME_QUERIED is bit 31 of i_ctime_nsec; inode_get_ctime_nsec() masks
 * it out before returning the nanoseconds value.
 */
#define I_CTIME_QUERIED                ((u32)BIT(31))

/* Seconds part of @inode's change time. */
static inline time64_t inode_get_ctime_sec(const struct inode *inode)
{
        time64_t sec = inode->i_ctime_sec;

        return sec;
}

/*
 * Nanoseconds part of @inode's change time, with the I_CTIME_QUERIED flag
 * bit stripped from the stored i_ctime_nsec value.
 */
static inline long inode_get_ctime_nsec(const struct inode *inode)
{
        long nsec = inode->i_ctime_nsec & ~I_CTIME_QUERIED;

        return nsec;
}

static inline struct timespec64 inode_get_ctime(const struct inode *inode)
{
        struct timespec64 ts = { .tv_sec  = inode_get_ctime_sec(inode),
                                 .tv_nsec = inode_get_ctime_nsec(inode) };

        return ts;
}

struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts);

/**
 * inode_set_ctime - set the ctime in the inode
 * @inode: inode in which to set the ctime
 * @sec: tv_sec value to set
 * @nsec: tv_nsec value to set
 *
 * Set the ctime in @inode to { @sec, @nsec }
 *
 * Return: whatever inode_set_ctime_to_ts() returns for that timestamp.
 */
static inline struct timespec64 inode_set_ctime(struct inode *inode,
                                                time64_t sec, long nsec)
{
        return inode_set_ctime_to_ts(inode, (struct timespec64){
                .tv_sec  = sec,
                .tv_nsec = nsec,
        });
}

/*
 * Implemented out of line.
 * NOTE(review): presumably initializes @inode's timestamps and returns the
 * value used - confirm against the definition.
 */
struct timespec64 simple_inode_init_ts(struct inode *inode);

/*
 * Snapshotting support.
 */

/*
 * __sb_end_write(), __sb_start_write() and __sb_start_write_trylock() are
 * internal functions, please use sb_start_{write,pagefault,intwrite}
 * instead.
 */

/* Release the read side of the freeze semaphore for freeze level @level. */
static inline void __sb_end_write(struct super_block *sb, int level)
{
        percpu_up_read(&sb->s_writers.rw_sem[level - 1]);
}

/* Take the read side of the freeze semaphore for freeze level @level. */
static inline void __sb_start_write(struct super_block *sb, int level)
{
        percpu_down_read(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Try to take the read side of the freeze semaphore for freeze level @level;
 * returns the result of percpu_down_read_trylock().
 */
static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
{
        return percpu_down_read_trylock(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * Forward to percpu_rwsem_acquire()/percpu_rwsem_release() for the freeze
 * semaphore of level @lev.  @lev is a 1-based freeze level, hence the
 * (lev)-1 array index.
 */
#define __sb_writers_acquired(sb, lev)        \
        percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
#define __sb_writers_release(sb, lev)        \
        percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_)

/**
 * __sb_write_started - check if sb freeze level is held
 * @sb: the super we write to
 * @level: the freeze level
 *
 * * > 0 - sb freeze level is held
 * *   0 - sb freeze level is not held
 * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN
 */
static inline int __sb_write_started(const struct super_block *sb, int level)
{
        return lockdep_is_held_type(&sb->s_writers.rw_sem[level - 1], 1);
}

/**
 * sb_write_started - check if SB_FREEZE_WRITE is held
 * @sb: the super we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 */
static inline bool sb_write_started(const struct super_block *sb)
{
        int held = __sb_write_started(sb, SB_FREEZE_WRITE);

        /* Any non-zero state, including "unknown" (< 0), counts as started. */
        return held != 0;
}

/**
 * sb_write_not_started - check if SB_FREEZE_WRITE is not held
 * @sb: the super we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 */
static inline bool sb_write_not_started(const struct super_block *sb)
{
        int held = __sb_write_started(sb, SB_FREEZE_WRITE);

        /* "Not held" (0) and "unknown" (< 0) both count as not started. */
        return held <= 0;
}

/**
 * file_write_started - check if SB_FREEZE_WRITE is held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_started(file_inode(file)->i_sb);
}

/**
 * file_write_not_started - check if SB_FREEZE_WRITE is not held
 * @file: the file we write to
 *
 * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN.
 * May be false positive with !S_ISREG, because file_start_write() has
 * no effect on !S_ISREG.
 */
static inline bool file_write_not_started(const struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_write_not_started(file_inode(file)->i_sb);
}

/**
 * sb_end_write - drop write access to a superblock
 * @sb: the super we wrote to
 *
 * Decrement number of writers to the filesystem. Wake up possible waiters
 * wanting to freeze the filesystem.
 */
static inline void sb_end_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_end_write(sb, level);
}

/**
 * sb_end_pagefault - drop write access to a superblock from a page fault
 * @sb: the super we wrote to
 *
 * Decrement number of processes handling write page fault to the filesystem.
 * Wake up possible waiters wanting to freeze the filesystem.
 */
static inline void sb_end_pagefault(struct super_block *sb)
{
        const int level = SB_FREEZE_PAGEFAULT;

        __sb_end_write(sb, level);
}

/**
 * sb_end_intwrite - drop write access to a superblock for internal fs purposes
 * @sb: the super we wrote to
 *
 * Decrement fs-internal number of writers to the filesystem.  Wake up possible
 * waiters wanting to freeze the filesystem.
 */
static inline void sb_end_intwrite(struct super_block *sb)
{
        const int level = SB_FREEZE_FS;

        __sb_end_write(sb, level);
}

/**
 * sb_start_write - get write access to a superblock
 * @sb: the super we write to
 *
 * A process that wants to write data or metadata to a file system (i.e.
 * dirty a page or an inode) should bracket the operation with
 * sb_start_write() and sb_end_write() to get exclusion against file system
 * freezing. This function increments the number of writers, which prevents
 * freezing. If the file system is already frozen, the function waits until
 * the file system is thawed.
 *
 * Since freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. Generally,
 * freeze protection should be the outermost lock. In particular, we have:
 *
 * sb_start_write
 *   -> i_mutex                        (write path, truncate, directory ops, ...)
 *   -> s_umount                (freeze_super, thaw_super)
 */
static inline void sb_start_write(struct super_block *sb)
{
        const int level = SB_FREEZE_WRITE;

        __sb_start_write(sb, level);
}

/**
 * sb_start_write_trylock - try to get write access to a superblock
 * @sb: the super we write to
 *
 * Forwards to __sb_start_write_trylock() at the SB_FREEZE_WRITE level and
 * returns its result unchanged.
 *
 * NOTE(review): presumably the non-waiting counterpart of sb_start_write(),
 * returning true when write access was obtained (to be dropped with
 * sb_end_write()) and false otherwise - confirm against
 * __sb_start_write_trylock().
 */
static inline bool sb_start_write_trylock(struct super_block *sb)
{
        return __sb_start_write_trylock(sb, SB_FREEZE_WRITE);
}

/**
 * sb_start_pagefault - get write access to a superblock from a page fault
 * @sb: the super we write to
 *
 * When a process starts handling write page fault, it should embed the
 * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
 * exclusion against file system freezing. This is needed since the page fault
 * is going to dirty a page. This function increments number of running page
 * faults preventing freezing. If the file system is already frozen, the
 * function waits until the file system is thawed.
 *
 * Since page fault freeze protection behaves as a lock, users have to preserve
 * ordering of freeze protection and other filesystem locks. It is advised to
 * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault
 * handling code implies lock dependency:
 *
 * mmap_lock
 *   -> sb_start_pagefault
 *
 * Implemented as a call to __sb_start_write() at the SB_FREEZE_PAGEFAULT
 * level.
 */
static inline void sb_start_pagefault(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_PAGEFAULT);
}

/**
 * sb_start_intwrite - get write access to a superblock for internal fs purposes
 * @sb: the super we write to
 *
 * This is the third level of protection against filesystem freezing. It is
 * free for use by a filesystem. The only requirement is that it must rank
 * below sb_start_pagefault.
 *
 * For example filesystem can call sb_start_intwrite() when starting a
 * transaction which somewhat eases handling of freezing for internal sources
 * of filesystem changes (internal fs threads, discarding preallocation on file
 * close, etc.).
 *
 * Implemented as a call to __sb_start_write() at the SB_FREEZE_FS level; the
 * matching release at the same level is sb_end_intwrite().
 */
static inline void sb_start_intwrite(struct super_block *sb)
{
        __sb_start_write(sb, SB_FREEZE_FS);
}

/**
 * sb_start_intwrite_trylock - try to get fs-internal write access to a superblock
 * @sb: the super we write to
 *
 * Forwards to __sb_start_write_trylock() at the SB_FREEZE_FS level and
 * returns its result unchanged.
 *
 * NOTE(review): presumably the non-waiting counterpart of
 * sb_start_intwrite(), returning true when access was obtained (to be dropped
 * with sb_end_intwrite()) and false otherwise - confirm against
 * __sb_start_write_trylock().
 */
static inline bool sb_start_intwrite_trylock(struct super_block *sb)
{
        return __sb_start_write_trylock(sb, SB_FREEZE_FS);
}

/*
 * Declaration only; the body is not part of this header section.
 * NOTE(review): going by the name, this is an ownership-or-capability check
 * on @inode as seen through @idmap - confirm the exact meaning of a true
 * return against the definition.
 */
bool inode_owner_or_capable(struct mnt_idmap *idmap,
                            const struct inode *inode);

/*
 * VFS helper functions.
 *
 * Declarations only; the parameters are unnamed here.  Note the argument
 * order: vfs_link() takes a dentry first, whereas every other helper in this
 * group leads with the struct mnt_idmap pointer.  vfs_link() and vfs_unlink()
 * additionally take a trailing struct inode ** argument.
 */
int vfs_create(struct mnt_idmap *, struct inode *,
               struct dentry *, umode_t, bool);
struct dentry *vfs_mkdir(struct mnt_idmap *, struct inode *,
                         struct dentry *, umode_t);
int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *,
              umode_t, dev_t);
int vfs_symlink(struct mnt_idmap *, struct inode *,
                struct dentry *, const char *);
int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *,
             struct dentry *, struct inode **);
int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *);
int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *,
               struct inode **);

/**
 * struct renamedata - contains all information required for renaming
 * @old_mnt_idmap:     idmap of the old mount the inode was found from
 * @old_dir:           parent of source
 * @old_dentry:                source
 * @new_mnt_idmap:     idmap of the new mount the inode was found from
 * @new_dir:           parent of destination
 * @new_dentry:                destination
 * @delegated_inode:   returns an inode needing a delegation break
 * @flags:             rename flags
 *
 * This is the single argument of vfs_rename(), declared right below.  The
 * structure carries the __randomize_layout annotation, so the member order
 * written here should not be relied upon.
 */
struct renamedata {
        struct mnt_idmap *old_mnt_idmap;
        struct inode *old_dir;
        struct dentry *old_dentry;
        struct mnt_idmap *new_mnt_idmap;
        struct inode *new_dir;
        struct dentry *new_dentry;
        struct inode **delegated_inode;
        unsigned int flags;
} __randomize_layout;

int vfs_rename(struct renamedata *);

/*
 * Create a whiteout for @dentry in @dir by calling vfs_mknod() with mode
 * S_IFCHR | WHITEOUT_MODE and device number WHITEOUT_DEV, i.e. a character
 * device node.  The return value is whatever vfs_mknod() returns.
 */
static inline int vfs_whiteout(struct mnt_idmap *idmap,
                               struct inode *dir, struct dentry *dentry)
{
        return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE,
                         WHITEOUT_DEV);
}

/*
 * File-opening helpers; declarations only.  Both return a struct file
 * pointer and take the credentials to use as their final @cred argument.
 */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred);
struct file *kernel_file_open(const struct path *path, int flags,
                              const struct cred *cred);

/*
 * vfs_mkobj() takes a callback @f with the signature
 * int (*)(struct dentry *, umode_t, void *) followed by a void * argument.
 * NOTE(review): presumably that trailing pointer is handed to @f as its
 * last argument - confirm against the definition.
 */
int vfs_mkobj(struct dentry *, umode_t,
                int (*f)(struct dentry *, umode_t, void *),
                void *);

/* Attribute-changing and ioctl helpers; declarations only. */
int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

/*
 * compat_ptr_ioctl is a real function when CONFIG_COMPAT is enabled and the
 * constant NULL otherwise.  Its signature matches the ->compat_ioctl member
 * of struct file_operations, so presumably the NULL fallback exists to let an
 * initialiser name it without an #ifdef of its own.
 */
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
                                        unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions.
 *
 * Declarations only; the definitions are not part of this header section.
 */
void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode,
                      const struct inode *dir, umode_t mode);
extern bool may_open_dev(const struct path *path);
umode_t mode_strip_sgid(struct mnt_idmap *idmap,
                        const struct inode *dir, umode_t mode);
bool in_group_or_capable(struct mnt_idmap *idmap,
                         const struct inode *inode, vfsgid_t vfsgid);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 * Return 'true' to keep going and 'false' if there are no more entries.
 */
struct dir_context;
typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
                         unsigned);

/*
 * Directory iteration context: @actor is the filldir_t callback and @pos is
 * a loff_t position.
 * NOTE(review): presumably @pos is the directory offset the iteration has
 * reached - confirm against the readdir implementation.
 */
struct dir_context {
        filldir_t actor;
        loff_t pos;
};

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:        Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:        Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:        Can be mapped for reading
 * NOMMU_MAP_WRITE:        Can be mapped for writing
 * NOMMU_MAP_EXEC:        Can be mapped for execution
 *
 * The READ/WRITE/EXEC capabilities are aliases of the VM_MAY* flags, while
 * COPY and DIRECT are literal bit values defined here.
 */
#define NOMMU_MAP_COPY                0x00000001
#define NOMMU_MAP_DIRECT        0x00000008
#define NOMMU_MAP_READ                VM_MAYREAD
#define NOMMU_MAP_WRITE                VM_MAYWRITE
#define NOMMU_MAP_EXEC                VM_MAYEXEC

/* The subset of capabilities that is expressed in terms of VM_MAY* flags. */
#define NOMMU_VMFLAGS \
        (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP                (1 << 0)
#define REMAP_FILE_CAN_SHORTEN                (1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 *
 * At present the advisory set consists of REMAP_FILE_CAN_SHORTEN only.
 */
#define REMAP_FILE_ADVISORY                (REMAP_FILE_CAN_SHORTEN)

/*
 * These flags control the behavior of vfs_copy_file_range().
 * They are not available to the user via syscall.
 *
 * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops
 */
#define COPY_FILE_SPLICE                (1 << 0)

/* Forward declarations: only pointers to these types are used below. */
struct iov_iter;
struct io_uring_cmd;
struct offset_ctx;

/*
 * Type of the fop_flags member of struct file_operations.  It is declared
 * __bitwise, and the FOP_* values are produced with __force casts.
 */
typedef unsigned int __bitwise fop_flags_t;

/*
 * Table of per-file operations, reached through file->f_op (see call_mmap()
 * for an example of a call through it).  Apart from @owner and @fop_flags
 * every member is a function pointer.
 *
 * @fop_flags holds fop_flags_t bits (the FOP_* values defined after this
 * structure).  ->mmap_capabilities exists only when CONFIG_MMU is not set.
 * The structure carries the __randomize_layout annotation, so the member
 * order written here should not be relied upon.
 */
struct file_operations {
        struct module *owner;
        fop_flags_t fop_flags;
        loff_t (*llseek) (struct file *, loff_t, int);
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *,
                        unsigned int flags);
        int (*iterate_shared) (struct file *, struct dir_context *);
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        int (*mmap) (struct file *, struct vm_area_struct *);
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        void (*splice_eof)(struct file *file);
        int (*setlease)(struct file *, int, struct file_lease **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
        int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
        int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *,
                                unsigned int poll_flags);
} __randomize_layout;

/*
 * Values of type fop_flags_t, the type of file_operations::fop_flags.  Each
 * is a distinct single bit, cast with __force from a plain integer.
 */
/* Supports async buffered reads */
#define FOP_BUFFER_RASYNC        ((__force fop_flags_t)(1 << 0))
/* Supports async buffered writes */
#define FOP_BUFFER_WASYNC        ((__force fop_flags_t)(1 << 1))
/* Supports synchronous page faults for mappings */
#define FOP_MMAP_SYNC                ((__force fop_flags_t)(1 << 2))
/* Supports non-exclusive O_DIRECT writes from multiple threads */
#define FOP_DIO_PARALLEL_WRITE        ((__force fop_flags_t)(1 << 3))
/* Contains huge pages */
#define FOP_HUGE_PAGES                ((__force fop_flags_t)(1 << 4))
/* Treat loff_t as unsigned (e.g., /dev/mem) */
#define FOP_UNSIGNED_OFFSET        ((__force fop_flags_t)(1 << 5))
/* Supports asynchronous lock callbacks */
#define FOP_ASYNC_LOCK                ((__force fop_flags_t)(1 << 6))
/* File system supports uncached read/write buffered IO */
#define FOP_DONTCACHE                ((__force fop_flags_t)(1 << 7))

/* Wrap a directory iterator that needs exclusive inode access */
int wrap_directory_iterator(struct file *, struct dir_context *,
                            int (*) (struct file *, struct dir_context *));
/*
 * WRAP_DIR_ITER(x) defines a static function named shared_<x> that takes
 * (struct file *, struct dir_context *) and forwards both arguments, plus
 * the iterator @x itself, to wrap_directory_iterator().  The generated
 * function has the same signature as the ->iterate_shared member of
 * struct file_operations.
 */
#define WRAP_DIR_ITER(x) \
        static int shared_##x(struct file *file , struct dir_context *ctx) \
        { return wrap_directory_iterator(file, ctx, x); }

/*
 * Table of per-inode operations; every member is a function pointer.
 *
 * Several methods (permission, create, symlink, mkdir, mknod, rename,
 * setattr, getattr, tmpfile, get_acl, set_acl, fileattr_set) take a
 * struct mnt_idmap pointer as their first argument, while others such as
 * link, unlink and rmdir do not.  The structure is declared
 * ____cacheline_aligned.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct mnt_idmap *, struct inode *, int);
        struct posix_acl * (*get_inode_acl)(struct inode *, int, bool);

        int (*readlink) (struct dentry *, char __user *,int);

        int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,
                       umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,
                        const char *);
        struct dentry *(*mkdir) (struct mnt_idmap *, struct inode *,
                                 struct dentry *, umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,
                      umode_t,dev_t);
        int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *);
        int (*getattr) (struct mnt_idmap *, const struct path *,
                        struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        int (*update_time)(struct inode *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct mnt_idmap *, struct inode *,
                        struct file *, umode_t);
        struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *,
                                     int);
        int (*set_acl)(struct mnt_idmap *, struct dentry *,
                       struct posix_acl *, int);
        int (*fileattr_set)(struct mnt_idmap *idmap,
                            struct dentry *dentry, struct fileattr *fa);
        int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa);
        struct offset_ctx *(*get_offset_ctx)(struct inode *inode);
} ____cacheline_aligned;

/*
 * Invoke the ->mmap method from @file's operations table (file->f_op) on
 * @vma and return its result.  The pointer is called unconditionally: there
 * is no NULL check on ->mmap here, so callers must only use this on files
 * that provide the method.
 */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
        return file->f_op->mmap(file, vma);
}

/*
 * Read/write, copy and remap (clone/dedupe) entry points; declarations only.
 *
 * Note the difference between the two *_remap_file_range_prep() variants:
 * the __-prefixed one takes an additional const struct iomap_ops pointer
 * (@dax_read_ops).  Both take the length by pointer, unlike
 * vfs_clone_file_range() and vfs_dedupe_file_range_one() which take it by
 * value.
 */
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *,
                                   loff_t, size_t, unsigned int);
int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write);
int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    loff_t *len, unsigned int remap_flags,
                                    const struct iomap_ops *dax_read_ops);
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t *count, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
                                 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                        struct file *dst_file, loff_t dst_pos,
                                        loff_t len, unsigned int remap_flags);

/**
 * enum freeze_holder - holder of the freeze
 * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem
 * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem
 * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed
 *
 * Indicate who the owner of the freeze or thaw request is and whether
 * the freeze needs to be exclusive or can nest.
 * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the
 * same holder aren't allowed. It is however allowed to hold a single
 * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at
 * the same time. This is relied upon by some filesystems during online
 * repair or similar.
 *
 * Each enumerator is a distinct single bit.  This is the type of the @who
 * argument of the ->freeze_super and ->thaw_super methods in
 * struct super_operations.
 */
enum freeze_holder {
        FREEZE_HOLDER_KERNEL        = (1U << 0),
        FREEZE_HOLDER_USERSPACE        = (1U << 1),
        FREEZE_MAY_NEST                = (1U << 2),
};

/*
 * Table of per-superblock operations; every member is a function pointer.
 *
 * ->freeze_super and ->thaw_super take an enum freeze_holder in addition to
 * the superblock, whereas ->freeze_fs and ->unfreeze_fs take the superblock
 * only.  The quota methods (quota_read, quota_write, get_dquots) exist only
 * when CONFIG_QUOTA is enabled.
 */
struct super_operations {
           struct inode *(*alloc_inode)(struct super_block *sb);
        void (*destroy_inode)(struct inode *);
        void (*free_inode)(struct inode *);

           void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        int (*freeze_super) (struct super_block *, enum freeze_holder who);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *, enum freeze_holder who);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot __rcu **(*get_dquots)(struct inode *);
#endif
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
        void (*shutdown)(struct super_block *sb);
};

/*
 * Inode flags - they have no relation to superblock flags now
 *
 * These are the bits tested in inode->i_flags by the IS_*() macros further
 * down.  S_DAX is defined as 0 when CONFIG_FS_DAX is disabled, so a test
 * such as ((inode)->i_flags & S_DAX) is then a compile-time constant zero.
 */
#define S_SYNC                (1 << 0)  /* Writes are synced at once */
#define S_NOATIME        (1 << 1)  /* Do not update access times */
#define S_APPEND        (1 << 2)  /* Append-only file */
#define S_IMMUTABLE        (1 << 3)  /* Immutable file */
#define S_DEAD                (1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA        (1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC        (1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME        (1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE        (1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE        (1 << 9)  /* Inode is fs-internal */
#define S_IMA                (1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT        (1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC                (1 << 12) /* no suid or xattr security attributes */
#ifdef CONFIG_FS_DAX
#define S_DAX                (1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX                0          /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED        (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD        (1 << 15) /* Casefolded file */
#define S_VERITY        (1 << 16) /* Verity file (using fs/verity/) */
#define S_KERNEL_FILE        (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */

/*
 * Note that nosuid etc flags are inode-specific: setting some file-system
 * flags just means all the inodes inherit those flags by default. It might be
 * possible to override it selectively if you really wanted to with some
 * ioctl() that is not currently implemented.
 *
 * Exception: SB_RDONLY is always applied to the entire file system.
 *
 * Unfortunately, it is possible to change a filesystem's flags with it mounted
 * with files in use.  This means that all of the inodes will not have their
 * i_flags updated.  Hence, i_flags no longer inherit the superblock mount
 * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org
 *
 * __IS_FLG() tests the superblock flags of @inode's filesystem.  It expands
 * to the masked bits themselves (non-zero if any bit of @flg is set), not to
 * a strict 0/1 value.
 */
#define __IS_FLG(inode, flg)        ((inode)->i_sb->s_flags & (flg))

/* Report whether SB_RDONLY is set in the s_flags of superblock @sb. */
static inline bool sb_rdonly(const struct super_block *sb)
{
        return (sb->s_flags & SB_RDONLY) != 0;
}
/*
 * Per-inode predicates.  Those built on sb_rdonly()/__IS_FLG() look at the
 * flags of the inode's superblock; the rest test bits in inode->i_flags.
 * Apart from IS_RDONLY() and the || forms, they expand to the masked bits
 * rather than to a strict 0/1 value.
 */
#define IS_RDONLY(inode)        sb_rdonly((inode)->i_sb)
/* Superblock-wide SB_SYNCHRONOUS or per-inode S_SYNC. */
#define IS_SYNC(inode)                (__IS_FLG(inode, SB_SYNCHRONOUS) || \
                                        ((inode)->i_flags & S_SYNC))
/* As IS_SYNC(), but the DIRSYNC bits at either level qualify as well. */
#define IS_DIRSYNC(inode)        (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \
                                        ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode)        __IS_FLG(inode, SB_MANDLOCK)
/* Superblock flags only (SB_RDONLY or SB_NOATIME); S_NOATIME is not tested. */
#define IS_NOATIME(inode)        __IS_FLG(inode, SB_RDONLY|SB_NOATIME)
#define IS_I_VERSION(inode)        __IS_FLG(inode, SB_I_VERSION)

#define IS_NOQUOTA(inode)        ((inode)->i_flags & S_NOQUOTA)
#define IS_APPEND(inode)        ((inode)->i_flags & S_APPEND)
#define IS_IMMUTABLE(inode)        ((inode)->i_flags & S_IMMUTABLE)

/* Constant 0 when POSIX ACL support is not configured. */
#ifdef CONFIG_FS_POSIX_ACL
#define IS_POSIXACL(inode)        __IS_FLG(inode, SB_POSIXACL)
#else
#define IS_POSIXACL(inode)        0
#endif

#define IS_DEADDIR(inode)        ((inode)->i_flags & S_DEAD)
#define IS_NOCMTIME(inode)        ((inode)->i_flags & S_NOCMTIME)

/*
 * Without CONFIG_SWAP the argument is still evaluated (cast to void) and the
 * result is the constant 0U.
 */
#ifdef CONFIG_SWAP
#define IS_SWAPFILE(inode)        ((inode)->i_flags & S_SWAPFILE)
#else
#define IS_SWAPFILE(inode)        ((void)(inode), 0U)
#endif

#define IS_PRIVATE(inode)        ((inode)->i_flags & S_PRIVATE)
#define IS_IMA(inode)                ((inode)->i_flags & S_IMA)
#define IS_AUTOMOUNT(inode)        ((inode)->i_flags & S_AUTOMOUNT)
#define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
#define IS_DAX(inode)                ((inode)->i_flags & S_DAX)
#define IS_ENCRYPTED(inode)        ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode)        ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode)        ((inode)->i_flags & S_VERITY)

/*
 * True when @inode is a whiteout: a character device (S_ISCHR() on its
 * i_mode) whose i_rdev equals WHITEOUT_DEV.
 *
 * Every use of the argument is parenthesized so that an expression such as
 * "p + 1" or "cond ? a : b" can be passed safely.  @inode may be evaluated
 * more than once, so it must not have side effects.
 */
#define IS_WHITEOUT(inode)        (S_ISCHR((inode)->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)

/*
 * Report whether @inode has an owner or group that is not valid under
 * @idmap: true if the vfsuid obtained from i_uid_into_vfsuid() fails
 * vfsuid_valid(), or the vfsgid obtained from i_gid_into_vfsgid() fails
 * vfsgid_valid().  The group is only looked at when the owner is valid.
 */
static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap,
                                   struct inode *inode)
{
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return true;

        return !vfsgid_valid(i_gid_into_vfsgid(idmap, inode));
}

/*
 * Initialise @kiocb for an operation on @filp.
 *
 * The whole structure is overwritten by a compound literal: ki_filp is set
 * to @filp, ki_flags is taken from filp->f_iocb_flags and ki_ioprio from
 * get_current_ioprio().  Every member not named in the initialiser ends up
 * zero, so any previous contents of @kiocb are discarded.
 */
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = filp->f_iocb_flags,
                .ki_ioprio = get_current_ioprio(),
        };
}

/*
 * Initialise @kiocb from @kiocb_src, but bound to @filp.
 *
 * Only ki_flags, ki_ioprio and ki_pos are copied from @kiocb_src; ki_filp is
 * set to @filp rather than to the source's file.  The assignment uses a
 * compound literal, so every other member of @kiocb is zeroed, not copied.
 */
static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
                               struct file *filp)
{
        *kiocb = (struct kiocb) {
                .ki_filp = filp,
                .ki_flags = kiocb_src->ki_flags,
                .ki_ioprio = kiocb_src->ki_ioprio,
                .ki_pos = kiocb_src->ki_pos,
        };
}

/*
 * Inode state bits.  Protected by inode->i_lock
 *
 * Four bits determine the dirty state of the inode: I_DIRTY_SYNC,
 * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME.
 *
 * Four bits define the lifetime of an inode.  Initially, inodes are I_NEW,
 * until that flag is cleared.  I_WILL_FREE, I_FREEING and I_CLEAR are set at
 * various stages of removing an inode.
 *
 * Two bits are used for locking and completion notification, I_NEW and I_SYNC.
 *
 * I_DIRTY_SYNC                Inode is dirty, but doesn't have to be written on
 *                        fdatasync() (unless I_DIRTY_DATASYNC is also set).
 *                        Timestamp updates are the usual cause.
 * I_DIRTY_DATASYNC        Data-related inode changes pending.  We keep track of
 *                        these changes separately from I_DIRTY_SYNC so that we
 *                        don't have to write inode on fdatasync() when only
 *                        e.g. the timestamps have changed.
 * I_DIRTY_PAGES        Inode has dirty pages.  Inode itself may be clean.
 * I_DIRTY_TIME                The inode itself has dirty timestamps, and the
 *                        lazytime mount option is enabled.  We keep track of this
 *                        separately from I_DIRTY_SYNC in order to implement
 *                        lazytime.  This gets cleared if I_DIRTY_INODE
 *                        (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But
 *                        I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already
 *                        in place because writeback might already be in progress
 *                        and we don't want to lose the time update
 * I_NEW                Serves as both a mutex and completion notification.
 *                        New inodes set I_NEW.  If two processes both create
 *                        the same inode, one of them will release its inode and
 *                        wait for I_NEW to be released before returning.
 *                        Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can
 *                        also cause waiting on I_NEW, without I_NEW actually
 *                        being set.  find_inode() uses this to prevent returning
 *                        nearly-dead inodes.
 * I_WILL_FREE                Must be set when calling write_inode_now() if i_count
 *                        is zero.  I_FREEING must be set when I_WILL_FREE is
 *                        cleared.
 * I_FREEING                Set when inode is about to be freed but still has dirty
 *                        pages or buffers attached or the inode itself is still
 *                        dirty.
 * I_CLEAR                Added by clear_inode().  In this state the inode is
 *                        clean and can be destroyed.  Inode keeps I_FREEING.
 *
 *                        Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are
 *                        prohibited for many purposes.  iget() must wait for
 *                        the inode to be completely released, then create it
 *                        anew.  Other functions will just ignore such inodes,
 *                        if appropriate.  I_NEW is used for waiting.
 *
 * I_SYNC                Writeback of inode is running. The bit is set during
 *                        data writeback, and cleared with a wakeup on the bit
 *                        address once it is done. The bit is also used to pin
 *                        the inode in memory for flusher thread.
 *
 * I_REFERENCED                Marks the inode as recently referenced on the LRU list.
 *
 * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
 *                        synchronize competing switching instances and to tell
 *                        wb stat updates to grab the i_pages lock.  See
 *                        inode_switch_wbs_work_fn() for details.
 *
 * I_OVL_INUSE                Used by overlayfs to get exclusive ownership on upper
 *                        and work dirs among overlayfs mounts.
 *
 * I_CREATING                New object's inode in the middle of setting up.
 *
 * I_DONTCACHE                Evict inode as soon as it is not used anymore.
 *
 * I_SYNC_QUEUED        Inode is queued in b_io or b_more_io writeback lists.
 *                        Used to detect that mark_inode_dirty() should not move
 *                         inode between dirty lists.
 *
 * I_PINNING_NETFS_WB        Inode is pinning an fscache object for writeback.
 *
 * I_LRU_ISOLATING        Inode is pinned being isolated from LRU without holding
 *                        i_count.
 *
 * Q: What is the difference between I_WILL_FREE and I_FREEING?
 *
 * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait
 * upon. There's one free address left.
 */
/*
 * The first three state bits are defined as a bit number (__I_*) plus the
 * mask derived from it; all remaining bits are defined as masks only.
 */
#define __I_NEW                        0
#define I_NEW                        (1 << __I_NEW)
#define __I_SYNC                1
#define I_SYNC                        (1 << __I_SYNC)
#define __I_LRU_ISOLATING        2
#define I_LRU_ISOLATING                (1 << __I_LRU_ISOLATING)

/* NOTE(review): I_LINKABLE has no entry in the state-bit description above. */
#define I_DIRTY_SYNC                (1 << 3)
#define I_DIRTY_DATASYNC        (1 << 4)
#define I_DIRTY_PAGES                (1 << 5)
#define I_WILL_FREE                (1 << 6)
#define I_FREEING                (1 << 7)
#define I_CLEAR                        (1 << 8)
#define I_REFERENCED                (1 << 9)
#define I_LINKABLE                (1 << 10)
#define I_DIRTY_TIME                (1 << 11)
#define I_WB_SWITCH                (1 << 12)
#define I_OVL_INUSE                (1 << 13)
#define I_CREATING                (1 << 14)
#define I_DONTCACHE                (1 << 15)
#define I_SYNC_QUEUED                (1 << 16)
#define I_PINNING_NETFS_WB        (1 << 17)

/*
 * Combined dirty masks:
 *   I_DIRTY_INODE - the inode itself is dirty (SYNC and/or DATASYNC)
 *   I_DIRTY       - I_DIRTY_INODE plus dirty pages
 *   I_DIRTY_ALL   - I_DIRTY plus dirty timestamps (I_DIRTY_TIME)
 */
#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
#define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES)
#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)

extern void __mark_inode_dirty(struct inode *, int);

/*
 * Mark @inode dirty with the full I_DIRTY mask, i.e.
 * I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES, via __mark_inode_dirty().
 */
static inline void mark_inode_dirty(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY);
}

/*
 * Mark @inode dirty with I_DIRTY_SYNC only, via __mark_inode_dirty().
 * Unlike mark_inode_dirty(), this does not pass I_DIRTY_DATASYNC or
 * I_DIRTY_PAGES.
 */
static inline void mark_inode_dirty_sync(struct inode *inode)
{
        __mark_inode_dirty(inode, I_DIRTY_SYNC);
}

/*
 * Returns true if the given inode itself only has dirty timestamps (its pages
 * may still be dirty) and isn't currently being allocated or freed.  In terms
 * of i_state: I_DIRTY_TIME is set while I_NEW, I_FREEING and I_WILL_FREE are
 * all clear.
 *
 * Filesystems should call this when, while writing an inode with lazytime
 * enabled, they want to opportunistically write the timestamps of other
 * inodes located very nearby on-disk, e.g. in the same inode block.  A true
 * return means the given inode is in need of such an opportunistic update.
 *
 * Requires i_lock, or at least later re-checking under i_lock.
 */
static inline bool inode_is_dirtytime_only(struct inode *inode)
{
        const int examined = I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE;

        return (inode->i_state & examined) == I_DIRTY_TIME;
}

/*
 * Inode link-count helpers; declarations only.  inc_nlink() and drop_nlink()
 * are the ones used by inode_inc_link_count() and inode_dec_link_count().
 */
extern void inc_nlink(struct inode *inode);
extern void drop_nlink(struct inode *inode);
extern void clear_nlink(struct inode *inode);
extern void set_nlink(struct inode *inode, unsigned int nlink);

/* Call inc_nlink() on @inode and then mark the inode dirty. */
static inline void inode_inc_link_count(struct inode *inode)
{
        inc_nlink(inode);
        mark_inode_dirty(inode);
}

/* Call drop_nlink() on @inode and then mark the inode dirty. */
static inline void inode_dec_link_count(struct inode *inode)
{
        drop_nlink(inode);
        mark_inode_dirty(inode);
}

/*
 * Bit flags naming the individual inode time fields plus the version.  Each
 * enumerator is a distinct single bit, so several can be OR-ed together.
 * NOTE(review): presumably these make up the int flags argument of
 * inode_update_time() and ->update_time - confirm at the call sites.
 */
enum file_time_flags {
        S_ATIME         = 1 << 0,
        S_MTIME         = 1 << 1,
        S_CTIME         = 1 << 2,
        S_VERSION       = 1 << 3,
};

/*
 * Timestamp update helpers; declarations only.  touch_atime() is what
 * file_accessed() calls for files opened without O_NOATIME.
 */
extern bool atime_needs_update(const struct path *, struct inode *);
extern void touch_atime(const struct path *);
int inode_update_time(struct inode *inode, int flags);

/*
 * Record an access to @file: call touch_atime() on the file's path, unless
 * the file was opened with O_NOATIME, in which case nothing is done.
 */
static inline void file_accessed(struct file *file)
{
        if (file->f_flags & O_NOATIME)
                return;

        touch_atime(&file->f_path);
}

/*
 * Declarations only.  file_modified() and kiocb_modified() differ in what
 * they are handed: a struct file versus a struct kiocb.
 */
extern int file_modified(struct file *file);
int kiocb_modified(struct kiocb *iocb);

int sync_inode_metadata(struct inode *inode, int wait);

/*
 * Description of a filesystem type.
 *
 * @name is the type's name and @fs_flags an int bitmask; the FS_* values
 * defined inline right after it are the individual bits.  @next is a pointer
 * to another file_system_type and @fs_supers a hlist head.
 * NOTE(review): presumably @next chains the registered types and @fs_supers
 * lists the superblocks of this type - confirm against the users.
 *
 * The trailing struct lock_class_key members are per-type keys; the
 * s_writers_key array has one entry per freeze level (SB_FREEZE_LEVELS).
 */
struct file_system_type {
        const char *name;
        int fs_flags;
#define FS_REQUIRES_DEV                1 
#define FS_BINARY_MOUNTDATA        2
#define FS_HAS_SUBTYPE                4
#define FS_USERNS_MOUNT                8        /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM        16        /* Disable fanotify permission events */
#define FS_ALLOW_IDMAP         32      /* FS has been updated to handle vfs idmappings. */
#define FS_MGTIME                64        /* FS uses multigrain timestamps */
#define FS_LBS                        128        /* FS supports LBS */
#define FS_RENAME_DOES_D_MOVE        32768        /* FS will handle d_move() during rename() internally. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
        struct hlist_head fs_supers;

        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key invalidate_lock_key;
        struct lock_class_key i_mutex_dir_key;
};

/*
 * Expands to MODULE_ALIAS() of "fs-" followed by NAME.  NAME must be a
 * string literal because the prefix is attached by literal concatenation.
 */
#define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME)

/**
 * is_mgtime - is this inode using multigrain timestamps
 * @inode: inode to test for multigrain timestamps
 *
 * Return: true if IOP_MGTIME is set in @inode->i_opflags, false otherwise.
 */
static inline bool is_mgtime(const struct inode *inode)
{
        return (inode->i_opflags & IOP_MGTIME) != 0;
}

extern struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int));
extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path);
void retire_super(struct super_block *sb);
void generic_shutdown_super(struct super_block *sb);
void kill_block_super(struct super_block *sb);
void kill_anon_super(struct super_block *sb);
void kill_litter_super(struct super_block *sb);
void deactivate_super(struct super_block *sb);
void deactivate_locked_super(struct super_block *sb);
int set_anon_super(struct super_block *s, void *data);
int set_anon_super_fc(struct super_block *s, struct fs_context *fc);
int get_anon_bdev(dev_t *);
void free_anon_bdev(dev_t);
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *));
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags, void *data);
struct super_block *sget_dev(struct fs_context *fc, dev_t dev);

/* Alas, no aliases. Too much hassle with bringing module.h everywhere */
/*
 * fops_get(fops): evaluates @fops once.  Yields @fops when it is non-NULL
 * and try_module_get() on its ->owner succeeds; yields NULL otherwise.
 */
#define fops_get(fops) ({                                                \
        const struct file_operations *_fops = (fops);                        \
        (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL));        \
})

/*
 * fops_put(fops): evaluates @fops once and, when it is non-NULL, calls
 * module_put() on its ->owner.  A NULL @fops is a no-op.
 */
#define fops_put(fops) ({                                                \
        const struct file_operations *_fops = (fops);                        \
        if (_fops)                                                        \
                module_put((_fops)->owner);                                \
})

/*
 * This one is to be used *ONLY* from ->open() instances.
 * fops must be non-NULL, pinned down *and* module dependencies
 * should be sufficient to pin the caller down as well.
 *
 * @f is evaluated once.  The macro calls fops_put() on the file's current
 * ->f_op, stores @fops there, and hits BUG_ON() if the stored value is NULL.
 */
#define replace_fops(f, fops) \
        do {        \
                struct file *__file = (f); \
                fops_put(__file->f_op); \
                BUG_ON(!(__file->f_op = (fops))); \
        } while(0)

extern int register_filesystem(struct file_system_type *);
extern int unregister_filesystem(struct file_system_type *);
extern int vfs_statfs(const struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
int freeze_super(struct super_block *super, enum freeze_holder who);
int thaw_super(struct super_block *super, enum freeze_holder who);
extern __printf(2, 3)
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...);
extern int super_setup_bdi(struct super_block *sb);

/*
 * Copy @len bytes of @uuid into @sb->s_uuid and record the length in
 * @sb->s_uuid_len.  A @len larger than sizeof(sb->s_uuid) triggers a
 * warning and is clamped to that size before anything is stored.
 */
static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len)
{
        if (WARN_ON(len > sizeof(sb->s_uuid)))
                len = sizeof(sb->s_uuid);
        sb->s_uuid_len = len;
        memcpy(&sb->s_uuid, uuid, len);
}

/*
 * set sb sysfs name based on sb->s_bdev
 *
 * Formats sb->s_bdev with "%pg" into sb->s_sysfs_name, bounded by the
 * size of that buffer.
 */
static inline void super_set_sysfs_name_bdev(struct super_block *sb)
{
        snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev);
}

/*
 * set sb sysfs name based on sb->s_uuid
 *
 * Warns when sb->s_uuid_len differs from sizeof(sb->s_uuid), then formats
 * sb->s_uuid.b with "%pU" into sb->s_sysfs_name regardless.
 */
static inline void super_set_sysfs_name_uuid(struct super_block *sb)
{
        WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid));
        snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b);
}

/*
 * set sb sysfs name based on sb->s_id
 *
 * Copies sb->s_id into sb->s_sysfs_name with strscpy(), bounded by the
 * size of the destination buffer.
 */
static inline void super_set_sysfs_name_id(struct super_block *sb)
{
        strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name));
}

/*
 * try to use something standard before you use this
 *
 * Builds sb->s_sysfs_name from a printf-style @fmt and its arguments; the
 * output is bounded by the size of the s_sysfs_name buffer.
 */
__printf(2, 3)
static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, ap);
        va_end(ap);
}

extern int current_umask(void);

extern void ihold(struct inode * inode);
extern void iput(struct inode *);
int inode_update_timestamps(struct inode *inode, int flags);
int generic_update_time(struct inode *, int);

/* /sys/fs */
extern struct kobject *fs_kobj;

/* INT_MAX with the bits outside PAGE_MASK cleared. */
#define MAX_RW_COUNT (INT_MAX & PAGE_MASK)

/* fs/open.c */
struct audit_names;
/*
 * A pathname string together with the user pointer it came from, a
 * reference count and an audit hook.  The structure ends in the flexible
 * array iname[].
 * NOTE(review): 'name' presumably points at iname[] for short names --
 * not visible here, confirm in fs/namei.c.
 */
struct filename {
        const char                *name;        /* pointer to actual string */
        const __user char        *uptr;        /* original userland pointer */
        atomic_t                refcnt;
        struct audit_names        *aname;
        const char                iname[];
};
/* iname[] must start at an offset that is a multiple of sizeof(long). */
static_assert(offsetof(struct filename, iname) % sizeof(long) == 0);

/* Return the idmap of the mount that @file's path (f_path.mnt) belongs to. */
static inline struct mnt_idmap *file_mnt_idmap(const struct file *file)
{
        const struct vfsmount *mnt = file->f_path.mnt;

        return mnt_idmap(mnt);
}

/**
 * is_idmapped_mnt - check whether a mount is mapped
 * @mnt: the mount to check
 *
 * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped.
 *
 * Return: true if mount is mapped, false if not.
 */
static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
{
        return mnt_idmap(mnt) != &nop_mnt_idmap;
}

int vfs_truncate(const struct path *, loff_t);
int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
                unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
int do_sys_open(int dfd, const char __user *filename, int flags,
                umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(const struct path *,
                                   const char *, int, umode_t);
/*
 * Convenience wrapper around file_open_root(): opens @name relative to the
 * root of @mnt, i.e. the path made of @mnt and @mnt->mnt_root.
 */
static inline struct file *file_open_root_mnt(struct vfsmount *mnt,
                                   const char *name, int flags, umode_t mode)
{
        const struct path root = {
                .mnt    = mnt,
                .dentry = mnt->mnt_root,
        };

        return file_open_root(&root, name, flags, mode);
}
struct file *dentry_open(const struct path *path, int flags,
                         const struct cred *creds);
struct file *dentry_open_nonotify(const struct path *path, int flags,
                                  const struct cred *cred);
struct file *dentry_create(const struct path *path, int flags, umode_t mode,
                           const struct cred *cred);
struct path *backing_file_user_path(struct file *f);

/*
 * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file
 * stored in ->vm_file is a backing file whose f_inode is on the underlying
 * filesystem.  When the mapped file path and inode number are displayed to
 * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the
 * path and inode number to display to the user, which is the path of the fd
 * that user has requested to map and the inode number that would be returned
 * by fstat() on that same fd.
 */
/*
 * Get the path to display in /proc/<pid>/maps: the backing file's user
 * path when FMODE_BACKING is set in f_mode, otherwise @f's own f_path.
 */
static inline const struct path *file_user_path(struct file *f)
{
        return unlikely(f->f_mode & FMODE_BACKING) ?
                backing_file_user_path(f) : &f->f_path;
}
/* Get the inode whose inode number to display in /proc/<pid>/maps */
static inline const struct inode *file_user_inode(struct file *f)
{
        if (unlikely(f->f_mode & FMODE_BACKING))
                return d_inode(backing_file_user_path(f)->dentry);
        return file_inode(f);
}

/*
 * Open the same path again via dentry_open(), reusing @file's f_flags
 * and f_cred.
 */
static inline struct file *file_clone_open(struct file *file)
{
        const struct path *where = &file->f_path;

        return dentry_open(where, file->f_flags, file->f_cred);
}
extern int filp_close(struct file *, fl_owner_t id);

extern struct filename *getname_flags(const char __user *, int);
extern struct filename *getname_uflags(const char __user *, int);
/* Shorthand for getname_flags() with a flags argument of 0. */
static inline struct filename *getname(const char __user *name)
{
        return getname_flags(name, 0);
}
extern struct filename *getname_kernel(const char *);
extern struct filename *__getname_maybe_null(const char __user *);
/*
 * Without AT_EMPTY_PATH in @flags this is plain getname().  With it, a NULL
 * @name yields NULL and anything else is handed to __getname_maybe_null().
 */
static inline struct filename *getname_maybe_null(const char __user *name, int flags)
{
        if (flags & AT_EMPTY_PATH)
                return name ? __getname_maybe_null(name) : NULL;

        return getname(name);
}
extern void putname(struct filename *name);
/*
 * Scope-based cleanup for struct filename pointers: putname() is called
 * unless the pointer is NULL or an error pointer (IS_ERR_OR_NULL).
 */
DEFINE_FREE(putname, struct filename *, if (!IS_ERR_OR_NULL(_T)) putname(_T))

/* Atomically increment @name->refcnt and return @name. */
static inline struct filename *refname(struct filename *name)
{
        atomic_inc(&name->refcnt);
        return name;
}

extern int finish_open(struct file *file, struct dentry *dentry,
                        int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);

/*
 * Helper for the simple case when original dentry is used: a non-zero
 * @error is passed straight back, otherwise finish_open() is called with
 * @file's own dentry and no open callback.
 */
static inline int finish_open_simple(struct file *file, int error)
{
        if (!error)
                return finish_open(file, file->f_path.dentry, NULL);

        return error;
}

/* fs/dcache.c */
extern void __init vfs_caches_init_early(void);
extern void __init vfs_caches_init(void);

extern struct kmem_cache *names_cachep;

/* Allocate an object from names_cachep with GFP_KERNEL. */
#define __getname()                kmem_cache_alloc(names_cachep, GFP_KERNEL)
/* Return @name to names_cachep; the cast drops any const qualifier. */
#define __putname(name)                kmem_cache_free(names_cachep, (void *)(name))

extern struct super_block *blockdev_superblock;
/*
 * True only when CONFIG_BLOCK is enabled and @sb is blockdev_superblock.
 * The IS_ENABLED() test comes first, so without CONFIG_BLOCK the result is
 * constant false.
 */
static inline bool sb_is_blkdev_sb(struct super_block *sb)
{
        return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock;
}

void emergency_thaw_all(void);
extern int sync_filesystem(struct super_block *);
extern const struct file_operations def_blk_fops;
extern const struct file_operations def_chr_fops;

/* fs/char_dev.c */
/*
 * NOTE(review): presumably the bound on char major numbers; whether it is
 * inclusive is not visible here -- confirm in fs/char_dev.c.
 */
#define CHRDEV_MAJOR_MAX 512
/* Marks the bottom of the first segment of free char majors */
#define CHRDEV_MAJOR_DYN_END 234
/*
 * Marks the top and bottom of the second segment of free char majors.
 * Note that START (511) is numerically above END (384).
 */
#define CHRDEV_MAJOR_DYN_EXT_START 511
#define CHRDEV_MAJOR_DYN_EXT_END 384

extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
extern int register_chrdev_region(dev_t, unsigned, const char *);
extern int __register_chrdev(unsigned int major, unsigned int baseminor,
                             unsigned int count, const char *name,
                             const struct file_operations *fops);
extern void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                                unsigned int count, const char *name);
extern void unregister_chrdev_region(dev_t, unsigned);
extern void chrdev_show(struct seq_file *,off_t);

/*
 * Wrapper around __register_chrdev() that always uses base minor 0 and a
 * count of 256; the result of that call is returned unchanged.
 */
static inline int register_chrdev(unsigned int major, const char *name,
                                  const struct file_operations *fops)
{
        return __register_chrdev(major, 0, 256, name, fops);
}

/*
 * Wrapper around __unregister_chrdev() using base minor 0 and a count of
 * 256, the same range register_chrdev() passes to __register_chrdev().
 */
static inline void unregister_chrdev(unsigned int major, const char *name)
{
        __unregister_chrdev(major, 0, 256, name);
}

extern void init_special_inode(struct inode *, umode_t, dev_t);

/* Invalid inode operations -- fs/bad_inode.c */
extern void make_bad_inode(struct inode *);
extern bool is_bad_inode(struct inode *);

extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart,
                                                loff_t lend);
extern int __must_check file_check_and_advance_wb_err(struct file *file);
extern int __must_check file_write_and_wait_range(struct file *file,
                                                loff_t start, loff_t end);
int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
                loff_t end);

/* file_write_and_wait_range() over the whole range 0..LLONG_MAX. */
static inline int file_write_and_wait(struct file *file)
{
        return file_write_and_wait_range(file, 0, LLONG_MAX);
}

extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
                           int datasync);
extern int vfs_fsync(struct file *file, int datasync);

extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
                                unsigned int flags);

/*
 * True when IOCB_DSYNC is set in @iocb->ki_flags, or when IS_SYNC() holds
 * for the inode hosting the file's mapping.
 */
static inline bool iocb_is_dsync(const struct kiocb *iocb)
{
        if (iocb->ki_flags & IOCB_DSYNC)
                return true;

        return IS_SYNC(iocb->ki_filp->f_mapping->host);
}

/*
 * Sync the bytes written if this was a synchronous write.  ki_pos is
 * expected to have been advanced past the write already, so the range
 * handled is [ki_pos - count, ki_pos - 1].
 *
 * For a dsync iocb the range is passed to vfs_fsync_range(), with datasync
 * set unless IOCB_SYNC is present; a failure there is returned.  Otherwise,
 * with IOCB_DONTCACHE, writeback of the range is kicked off and its result
 * ignored.  In every non-error case @count is returned.
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
        struct file *filp = iocb->ki_filp;

        if (iocb_is_dsync(iocb)) {
                const int datasync = !(iocb->ki_flags & IOCB_SYNC);
                int err = vfs_fsync_range(filp, iocb->ki_pos - count,
                                          iocb->ki_pos - 1, datasync);

                if (err)
                        return err;
                return count;
        }

        if (iocb->ki_flags & IOCB_DONTCACHE)
                filemap_fdatawrite_range_kick(filp->f_mapping,
                                              iocb->ki_pos - count,
                                              iocb->ki_pos - 1);

        return count;
}

extern void emergency_sync(void);
extern void emergency_remount(void);

/*
 * bmap() is provided out of line when CONFIG_BLOCK is set.  Without it,
 * the stub below ignores its arguments and returns -EINVAL.
 */
#ifdef CONFIG_BLOCK
extern int bmap(struct inode *inode, sector_t *block);
#else
static inline int bmap(struct inode *inode,  sector_t *block)
{
        return -EINVAL;
}
#endif

int notify_change(struct mnt_idmap *, struct dentry *,
                  struct iattr *, struct inode **);
int inode_permission(struct mnt_idmap *, struct inode *, int);
int generic_permission(struct mnt_idmap *, struct inode *, int);
/*
 * inode_permission() on @file's inode, using the idmap of the mount the
 * file was opened through.
 */
static inline int file_permission(struct file *file, int mask)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);

        return inode_permission(idmap, file_inode(file), mask);
}
/*
 * inode_permission() on the inode behind @path->dentry, using the idmap
 * of @path->mnt.
 */
static inline int path_permission(const struct path *path, int mask)
{
        struct mnt_idmap *idmap = mnt_idmap(path->mnt);
        struct inode *inode = d_inode(path->dentry);

        return inode_permission(idmap, inode, mask);
}
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode);

/* True if any execute bit (S_IXUGO) is set in i_mode or @inode is a directory. */
static inline bool execute_ok(struct inode *inode)
{
        if (inode->i_mode & S_IXUGO)
                return true;

        return S_ISDIR(inode->i_mode);
}

/* True when the S_IFMT (file type) bits of @inode->i_mode and @mode differ. */
static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
{
        return (inode->i_mode & S_IFMT) != (mode & S_IFMT);
}

/**
 * file_start_write - get write access to a superblock for regular file io
 * @file: the file we want to write to
 *
 * This is a variant of sb_start_write() which is a noop on non-regualr file.
 * Should be matched with a call to file_end_write().
 */
static inline void file_start_write(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return;
        sb_start_write(file_inode(file)->i_sb);
}

static inline bool file_start_write_trylock(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return true;
        return sb_start_write_trylock(file_inode(file)->i_sb);
}

/**
 * file_end_write - drop write access to a superblock of a regular file
 * @file: the file we wrote to
 *
 * Should be matched with a call to file_start_write().
 */
static inline void file_end_write(struct file *file)
{
        if (!S_ISREG(file_inode(file)->i_mode))
                return;
        sb_end_write(file_inode(file)->i_sb);
}

/**
 * kiocb_start_write - get write access to a superblock for async file io
 * @iocb: the io context we want to submit the write with
 *
 * This is a variant of sb_start_write() for async io submission.
 * Should be matched with a call to kiocb_end_write().
 */
static inline void kiocb_start_write(struct kiocb *iocb)
{
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;

        sb_start_write(sb);
        /*
         * Tell lockdep the lock has been released already, so it does not
         * complain about a held lock when we return to userspace.
         */
        __sb_writers_release(sb, SB_FREEZE_WRITE);
}

/**
 * kiocb_end_write - drop write access to a superblock after async file io
 * @iocb: the io context we submitted the write with
 *
 * Should be matched with a call to kiocb_start_write().
 */
static inline void kiocb_end_write(struct kiocb *iocb)
{
        struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;

        /*
         * Let lockdep know that freeze protection was inherited from the
         * submission thread before dropping it.
         */
        __sb_writers_acquired(sb, SB_FREEZE_WRITE);
        sb_end_write(sb);
}

/*
 * This is used for regular files where some users -- especially the
 * currently executed binary in a process, previously handled via
 * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap
 * read-write shared) accesses.
 *
 * get_write_access() gets write permission for a file.
 * put_write_access() releases this write permission.
 * deny_write_access() denies write access to a file.
 * allow_write_access() re-enables write access to a file.
 *
 * The i_writecount field of an inode can have the following values:
 * 0: no write access, no denied write access
 * < 0: (-i_writecount) users that denied write access to the file.
 * > 0: (i_writecount) users that have write access to the file.
 *
 * Normally we operate on that counter with atomic_{inc,dec} and it's safe
 * except for the cases where we don't hold i_writecount yet. Then we need to
 * use {get,deny}_write_access() - these functions check the sign and refuse
 * to do the change if sign is wrong.
 */
/*
 * Increment i_writecount unless it is negative.  Returns 0 on success and
 * -ETXTBSY when the counter was negative and left untouched.
 */
static inline int get_write_access(struct inode *inode)
{
        if (!atomic_inc_unless_negative(&inode->i_writecount))
                return -ETXTBSY;

        return 0;
}
static inline int deny_write_access(struct file *file)
{
        struct inode *inode = file_inode(file);
        return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY;
}
/* Release write access: unconditionally decrement @inode->i_writecount. */
static inline void put_write_access(struct inode * inode)
{
        atomic_dec(&inode->i_writecount);
}
/*
 * Re-enable write access: increment the i_writecount of @file's inode.
 * A NULL @file is accepted and ignored.
 */
static inline void allow_write_access(struct file *file)
{
        if (!file)
                return;

        atomic_inc(&file_inode(file)->i_writecount);
}

/*
 * Do not prevent write to executable file when watched by pre-content events.
 *
 * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at
 * the time of file open and remains constant for entire lifetime of the file,
 * so if pre-content watches are added post execution or removed before the end
 * of the execution, it will not cause i_writecount reference leak.
 */
static inline int exe_file_deny_write_access(struct file *exe_file)
{
        /* Files in FMODE_FSNOTIFY_HSM mode are left writable: report success. */
        return unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)) ?
                0 : deny_write_access(exe_file);
}
/*
 * Counterpart of exe_file_deny_write_access(): a NULL @exe_file or one in
 * FMODE_FSNOTIFY_HSM mode is skipped, anything else goes to
 * allow_write_access().
 */
static inline void exe_file_allow_write_access(struct file *exe_file)
{
        if (unlikely(!exe_file))
                return;
        if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode)))
                return;

        allow_write_access(exe_file);
}

/*
 * Replace the FMODE_FSNOTIFY_MASK bits of @file->f_mode: the old bits are
 * cleared and @mode is OR-ed in.
 */
static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode)
{
        file->f_mode = (file->f_mode & ~FMODE_FSNOTIFY_MASK) | mode;
}

/* True while @inode->i_writecount is positive. */
static inline bool inode_is_open_for_write(const struct inode *inode)
{
        int writers = atomic_read(&inode->i_writecount);

        return writers > 0;
}

/*
 * i_readcount helpers.  The counter is only maintained when CONFIG_IMA or
 * CONFIG_FILE_LOCKING is enabled; otherwise both helpers are empty.
 */
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
/* Decrement i_readcount; a result below zero is a BUG. */
static inline void i_readcount_dec(struct inode *inode)
{
        BUG_ON(atomic_dec_return(&inode->i_readcount) < 0);
}

/* Increment i_readcount. */
static inline void i_readcount_inc(struct inode *inode)
{
        atomic_inc(&inode->i_readcount);
}
#else
static inline void i_readcount_dec(struct inode *inode)
{
}

static inline void i_readcount_inc(struct inode *inode)
{
}
#endif
extern int do_pipe_flags(int *, int);

extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *);
ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos);
extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *);
extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *);
extern struct file * open_exec(const char *);
 
/* fs/dcache.c -- generic fs support functions */
extern bool is_subdir(struct dentry *, struct dentry *);
extern bool path_is_under(const struct path *, const struct path *);

extern char *file_path(struct file *, char *, int);

/**
 * is_dot_dotdot - returns true only if @name is "." or ".."
 * @name: file name to check
 * @len: length of file name, in bytes
 *
 * @name[0] is read only when @len is non-zero, and @name[1] only when
 * @len is exactly 2.
 */
static inline bool is_dot_dotdot(const char *name, size_t len)
{
        if (!len)
                return false;

        if (unlikely(name[0] == '.'))
                return len == 1 || (len == 2 && name[1] == '.');

        return false;
}

#include <linux/err.h>

/* needed for stackable file system support */
extern loff_t default_llseek(struct file *file, loff_t offset, int whence);

extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence);

extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t);
/* inode_init_always_gfp() with the allocation mask fixed to GFP_NOFS. */
static inline int inode_init_always(struct super_block *sb, struct inode *inode)
{
        return inode_init_always_gfp(sb, inode, GFP_NOFS);
}

extern void inode_init_once(struct inode *);
extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
extern int generic_delete_inode(struct inode *inode);
/*
 * Returns 1 when @inode has no links left (i_nlink == 0) or is unhashed,
 * and 0 otherwise.
 */
static inline int generic_drop_inode(struct inode *inode)
{
        if (!inode->i_nlink)
                return 1;

        return inode_unhashed(inode) != 0;
}
extern void d_mark_dontcache(struct inode *inode);

extern struct inode *ilookup5_nowait(struct super_block *sb,
                unsigned long hashval, int (*test)(struct inode *, void *),
                void *data);
extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
                int (*test)(struct inode *, void *), void *data);
extern struct inode *ilookup(struct super_block *sb, unsigned long ino);

extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
                int (*test)(struct inode *, void *),
                int (*set)(struct inode *, void *),
                void *data);
struct inode *iget5_locked(struct super_block *, unsigned long,
                           int (*test)(struct inode *, void *),
                           int (*set)(struct inode *, void *), void *);
struct inode *iget5_locked_rcu(struct super_block *, unsigned long,
                               int (*test)(struct inode *, void *),
                               int (*set)(struct inode *, void *), void *);
extern struct inode * iget_locked(struct super_block *, unsigned long);
extern struct inode *find_inode_nowait(struct super_block *,
                                       unsigned long,
                                       int (*match)(struct inode *,
                                                    unsigned long, void *),
                                       void *data);
extern struct inode *find_inode_rcu(struct super_block *, unsigned long,
                                    int (*)(struct inode *, void *), void *);
extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long);
extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
extern int insert_inode_locked(struct inode *);
/*
 * lockdep_annotate_inode_mutex_key() is provided out of line when
 * CONFIG_DEBUG_LOCK_ALLOC is set; otherwise it is an empty stub.
 *
 * The stub has no trailing ';' after its body: a stray semicolon at file
 * scope is an empty declaration, which ISO C does not allow.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
#else
static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }
#endif
extern void unlock_new_inode(struct inode *);
extern void discard_new_inode(struct inode *);
extern unsigned int get_next_ino(void);
extern void evict_inodes(struct super_block *sb);
void dump_mapping(const struct address_space *);

/*
 * Userspace may rely on the inode number being non-zero. For example, glibc
 * simply ignores files with zero i_ino in unlink() and other places.
 *
 * As an additional complication, if userspace was compiled with
 * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the
 * lower 32 bits, so we need to check that those aren't zero explicitly. With
 * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but
 * better safe than sorry.
 */
static inline bool is_zero_ino(ino_t ino)
{
        return (u32)ino == 0;
}

/*
 * inode->i_lock must be held
 *
 * Takes a reference by atomically incrementing @inode->i_count.
 */
static inline void __iget(struct inode *inode)
{
        atomic_inc(&inode->i_count);
}

extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
struct inode *alloc_inode(struct super_block *sb);
/* Thin wrapper: returns whatever alloc_inode(@sb) returns. */
static inline struct inode *new_inode_pseudo(struct super_block *sb)
{
        return alloc_inode(sb);
}
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
                             const struct inode *inode);

/*
 * This must be used for allocating filesystems specific inodes to set
 * up the inode reclaim context correctly.
 *
 * Allocates from @_cache with kmem_cache_alloc_lru(), passing the address
 * of @_sb's s_inode_lru.  @_sb is parenthesized so that any pointer
 * expression (e.g. "sbs + 1") expands correctly.
 */
#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &(_sb)->s_inode_lru, _gfp)

extern void __insert_inode_hash(struct inode *, unsigned long hashval);
/* __insert_inode_hash() with the inode number (i_ino) as the hash value. */
static inline void insert_inode_hash(struct inode *inode)
{
        __insert_inode_hash(inode, inode->i_ino);
}

extern void __remove_inode_hash(struct inode *);
/*
 * Call __remove_inode_hash() on @inode, unless it is already unhashed or
 * its i_hash node is a fake one (hlist_fake()).
 */
static inline void remove_inode_hash(struct inode *inode)
{
        if (inode_unhashed(inode) || hlist_fake(&inode->i_hash))
                return;

        __remove_inode_hash(inode);
}

extern void inode_sb_list_add(struct inode *inode);
extern void inode_add_lru(struct inode *inode);

extern int sb_set_blocksize(struct super_block *, int);
extern int sb_min_blocksize(struct super_block *, int);

extern int generic_file_mmap(struct file *, struct vm_area_struct *);
extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
int generic_write_checks_count(struct kiocb *iocb, loff_t *count);
extern int generic_write_check_limits(struct file *file, loff_t pos,
                loff_t *count);
extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
                ssize_t already_read);
extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
ssize_t generic_perform_write(struct kiocb *, struct iov_iter *);
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written);

ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
                rwf_t flags);
ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
                           struct iov_iter *iter);
ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
                            struct iov_iter *iter);

/* fs/splice.c */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags);
ssize_t copy_splice_read(struct file *in, loff_t *ppos,
                         struct pipe_inode_info *pipe,
                         size_t len, unsigned int flags);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
                struct file *, loff_t *, size_t, unsigned int);


extern void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
extern loff_t noop_llseek(struct file *file, loff_t offset, int whence);
extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize);
extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence);
extern loff_t generic_file_llseek_size(struct file *file, loff_t offset,
                int whence, loff_t maxsize, loff_t eof);
loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence,
                             u64 *cookie);
extern loff_t fixed_size_llseek(struct file *file, loff_t offset,
                int whence, loff_t size);
extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t);
extern loff_t no_seek_end_llseek(struct file *, loff_t, int);
int rw_verify_area(int, struct file *, const loff_t *, size_t);
extern int generic_file_open(struct inode * inode, struct file * filp);
extern int nonseekable_open(struct inode * inode, struct file * filp);
extern int stream_open(struct inode * inode, struct file * filp);

#ifdef CONFIG_BLOCK
/*
 * Function type taking a bio, an inode and a file offset, and returning
 * nothing.
 */
typedef void (dio_submit_t)(struct bio *bio, struct inode *inode,
                            loff_t file_offset);

/* Flag bits for the 'flags' argument of __blockdev_direct_IO(). */
enum {
        /* need locking between buffered and direct access */
        DIO_LOCKING        = 1 << 0,

        /* filesystem does not support filling holes */
        DIO_SKIP_HOLES     = 1 << 1,
};

ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
                             struct block_device *bdev, struct iov_iter *iter,
                             get_block_t get_block,
                             dio_iodone_t end_io,
                             int flags);

/*
 * Wrapper around __blockdev_direct_IO() that targets the block device of
 * @inode's superblock, passes no end_io callback, and sets both
 * DIO_LOCKING and DIO_SKIP_HOLES.
 */
static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
                                         struct inode *inode,
                                         struct iov_iter *iter,
                                         get_block_t get_block)
{
        struct block_device *bdev = inode->i_sb->s_bdev;

        return __blockdev_direct_IO(iocb, inode, bdev, iter, get_block, NULL,
                                    DIO_LOCKING | DIO_SKIP_HOLES);
}
#endif

bool inode_dio_finished(const struct inode *inode);
void inode_dio_wait(struct inode *inode);
void inode_dio_wait_interruptible(struct inode *inode);

/**
 * inode_dio_begin - signal start of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Atomically increments @inode->i_dio_count.  Each call should be balanced
 * by a call to inode_dio_end(), which decrements the counter and wakes up
 * waiters once it drops to zero.
 */
static inline void inode_dio_begin(struct inode *inode)
{
        atomic_inc(&inode->i_dio_count);
}

/**
 * inode_dio_end - signal finish of a direct I/O request
 * @inode: inode the direct I/O happens on
 *
 * Called once a direct I/O request has been processed.  Drops one count
 * from @inode->i_dio_count; if that was the last one, callers waiting on
 * the counter (for direct I/O to be quiesced) are woken up.
 */
static inline void inode_dio_end(struct inode *inode)
{
        /* Other requests are still outstanding: nobody to wake yet. */
        if (!atomic_dec_and_test(&inode->i_dio_count))
                return;

        wake_up_var(&inode->i_dio_count);
}

extern void inode_set_flags(struct inode *inode, unsigned int flags,
                            unsigned int mask);

extern const struct file_operations generic_ro_fops;

/*
 * True when mode @m describes a character device, block device, FIFO or
 * socket.  @m is expanded four times, so it must be free of side effects.
 */
#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))

/* Symlink reading helpers and attribute (stat) filling helpers. */
extern int readlink_copy(char __user *, int, const char *, int);
extern int page_readlink(struct dentry *, char __user *, int);
extern const char *page_get_link_raw(struct dentry *, struct inode *,
                                     struct delayed_call *);
extern const char *page_get_link(struct dentry *, struct inode *,
                                 struct delayed_call *);
extern void page_put_link(void *);
extern int page_symlink(struct inode *inode, const char *symname, int len);
extern const struct inode_operations page_symlink_inode_operations;
extern void kfree_link(void *);
void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode);
void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *);
void generic_fill_statx_attr(struct inode *inode, struct kstat *stat);
void generic_fill_statx_atomic_writes(struct kstat *stat,
                                      unsigned int unit_min,
                                      unsigned int unit_max);
extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int);
extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int);
/*
 * Byte accounting on an inode.
 * NOTE(review): the "__" variants presumably expect the caller to hold the
 * relevant lock - confirm against the implementation.
 */
void __inode_add_bytes(struct inode *inode, loff_t bytes);
void inode_add_bytes(struct inode *inode, loff_t bytes);
void __inode_sub_bytes(struct inode *inode, loff_t bytes);
void inode_sub_bytes(struct inode *inode, loff_t bytes);
static inline loff_t __inode_get_bytes(struct inode *inode)
{
        return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes;
}
loff_t inode_get_bytes(struct inode *inode);
void inode_set_bytes(struct inode *inode, loff_t bytes);
const char *simple_get_link(struct dentry *, struct inode *,
                            struct delayed_call *);
extern const struct inode_operations simple_symlink_inode_operations;

extern int iterate_dir(struct file *, struct dir_context *);

/*
 * stat helpers filling a struct kstat: by directory fd + user-space path +
 * AT_* flags, or by an already open fd.  vfs_stat() and vfs_lstat() below
 * are thin wrappers around vfs_fstatat().
 */
int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
                int flags);
int vfs_fstat(int fd, struct kstat *stat);

/*
 * stat a user-space path via vfs_fstatat(), using AT_FDCWD as the
 * directory fd and no AT_* flags.
 */
static inline int vfs_stat(const char __user *filename, struct kstat *stat)
{
        const int at_flags = 0;

        return vfs_fstatat(AT_FDCWD, filename, stat, at_flags);
}
/*
 * Like vfs_stat(), but passes AT_SYMLINK_NOFOLLOW to vfs_fstatat().
 */
static inline int vfs_lstat(const char __user *name, struct kstat *stat)
{
        const int at_flags = AT_SYMLINK_NOFOLLOW;

        return vfs_fstatat(AT_FDCWD, name, stat, at_flags);
}

/* Symlink access through the VFS. */
extern const char *vfs_get_link(struct dentry *, struct delayed_call *);
extern int vfs_readlink(struct dentry *, char __user *, int);

/*
 * Filesystem-type and superblock helpers (declarations only).
 * iterate_supers() and iterate_supers_type() take a callback that receives
 * each superblock plus the opaque pointer passed as the last argument.
 */
extern struct file_system_type *get_filesystem(struct file_system_type *fs);
extern void put_filesystem(struct file_system_type *fs);
extern struct file_system_type *get_fs_type(const char *name);
extern void drop_super(struct super_block *sb);
extern void drop_super_exclusive(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
extern void iterate_supers_type(struct file_system_type *,
                                void (*)(struct super_block *, void *), void *);

/*
 * Generic dcache_* directory operations and simple_* / noop_* inode, file
 * and address-space helpers.  Declarations only.
 * NOTE(review): presumably intended as ready-made callbacks for simple
 * in-memory filesystems - confirm against the implementation.
 */
extern int dcache_dir_open(struct inode *, struct file *);
extern int dcache_dir_close(struct inode *, struct file *);
extern loff_t dcache_dir_lseek(struct file *, loff_t, int);
extern int dcache_readdir(struct file *, struct dir_context *);
extern int simple_setattr(struct mnt_idmap *, struct dentry *,
                          struct iattr *);
extern int simple_getattr(struct mnt_idmap *, const struct path *,
                          struct kstat *, u32, unsigned int);
extern int simple_statfs(struct dentry *, struct kstatfs *);
extern int simple_open(struct inode *inode, struct file *file);
extern int simple_link(struct dentry *, struct inode *, struct dentry *);
extern int simple_unlink(struct inode *, struct dentry *);
extern int simple_rmdir(struct inode *, struct dentry *);
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                                  struct inode *new_dir, struct dentry *new_dentry);
extern int simple_rename(struct mnt_idmap *, struct inode *,
                         struct dentry *, struct inode *, struct dentry *,
                         unsigned int);
extern void simple_recursive_removal(struct dentry *,
                              void (*callback)(struct dentry *));
extern int noop_fsync(struct file *, loff_t, loff_t, int);
extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
extern int simple_empty(struct dentry *);
extern int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct folio **foliop, void **fsdata);
extern const struct address_space_operations ram_aops;
extern int always_delete_dentry(const struct dentry *);
extern struct inode *alloc_anon_inode(struct super_block *);
extern int simple_nosetlease(struct file *, int, struct file_lease **, void **);
extern const struct dentry_operations simple_dentry_operations;

extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags);
extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
extern const struct file_operations simple_dir_operations;
extern const struct inode_operations simple_dir_inode_operations;
extern void make_empty_dir_inode(struct inode *inode);
extern bool is_empty_dir_inode(struct inode *inode);
/*
 * Entry of the table handed to simple_fill_super(): a name, the
 * file_operations to use for it and its mode.
 */
struct tree_descr { const char *name; const struct file_operations *ops; int mode; };
struct dentry *d_alloc_name(struct dentry *, const char *);
extern int simple_fill_super(struct super_block *, unsigned long,
                             const struct tree_descr *);
extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count);
extern void simple_release_fs(struct vfsmount **mount, int *count);

/*
 * Position-tracking copies between a kernel buffer and user space:
 * simple_read_from_buffer() copies to the __user pointer @to,
 * simple_write_to_buffer() copies from the __user pointer @from; both take
 * the file position through @ppos.
 */
extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
                        loff_t *ppos, const void *from, size_t available);
extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count);

/*
 * Context object passed to the simple_offset_*() helpers declared below:
 * a maple tree plus the next offset value.
 * NOTE(review): presumably one instance per directory, mapping directory
 * offsets to dentries - confirm against the implementation.
 */
struct offset_ctx {
        struct maple_tree        mt;
        unsigned long                next_offset;
};

/* Lifecycle and rename handling for a struct offset_ctx. */
void simple_offset_init(struct offset_ctx *octx);
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry);
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry);
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry);
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry);
void simple_offset_destroy(struct offset_ctx *octx);

extern const struct file_operations simple_offset_dir_operations;

/* fsync helpers taking a file, two loff_t values and an int. */
extern int __generic_file_fsync(struct file *, loff_t, loff_t, int);
extern int generic_file_fsync(struct file *, loff_t, loff_t, int);

extern int generic_check_addressable(unsigned, u64);

/* Case-insensitive name support that is declared regardless of CONFIG_UNICODE. */
extern void generic_set_sb_d_ops(struct super_block *sb);
extern int generic_ci_match(const struct inode *parent,
                            const struct qstr *name,
                            const struct qstr *folded_name,
                            const u8 *de_name, u32 de_name_len);

#if IS_ENABLED(CONFIG_UNICODE)
/*
 * Case-insensitive dentry hash/compare helpers; only declared when
 * CONFIG_UNICODE is enabled.
 */
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str);
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                         const char *str, const struct qstr *name);

/**
 * generic_ci_validate_strict_name - Check if a given name is suitable
 * for a directory
 *
 * This functions checks if the proposed filename is valid for the
 * parent directory. That means that only valid UTF-8 filenames will be
 * accepted for casefold directories from filesystems created with the
 * strict encoding flag.  That also means that any name will be
 * accepted for directories that doesn't have casefold enabled, or
 * aren't being strict with the encoding.
 *
 * @dir: inode of the directory where the new file will be created
 * @name: name of the new file
 *
 * Return:
 * * True: if the filename is suitable for this directory. It can be
 *   true if a given name is not suitable for a strict encoding
 *   directory, but the directory being used isn't strict
 * * False if the filename isn't suitable for this directory. This only
 *   happens when a directory is casefolded and the filesystem is strict
 *   about its encoding.
 */
static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
{
        if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb))
                return true;

        /*
         * A casefold dir must have a encoding set, unless the filesystem
         * is corrupted
         */
        if (WARN_ON_ONCE(!dir->i_sb->s_encoding))
                return true;

        return !utf8_validate(dir->i_sb->s_encoding, name);
}
#else
/*
 * Stub used when CONFIG_UNICODE is not enabled: every name is accepted,
 * @dir and @name are not examined.
 */
static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name)
{
        return true;
}
#endif

/*
 * Tell whether superblock @sb has an encoding attached (s_encoding is
 * non-NULL).  Always false when CONFIG_UNICODE is not enabled.
 */
static inline bool sb_has_encoding(const struct super_block *sb)
{
        bool has_encoding = false;

#if IS_ENABLED(CONFIG_UNICODE)
        has_encoding = sb->s_encoding != NULL;
#endif
        return has_encoding;
}

/*
 * Attribute-change helpers (declarations only).
 * NOTE(review): the names suggest permission check, preparation, size
 * check and copy steps of a setattr operation - confirm against the
 * implementation.
 */
int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
                unsigned int ia_valid);
int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *);
extern int inode_newsize_ok(const struct inode *, loff_t offset);
void setattr_copy(struct mnt_idmap *, struct inode *inode,
                  const struct iattr *attr);

extern int file_update_time(struct file *file);

/*
 * True when @vma is backed by a file whose mapping's host inode has the
 * DAX flag set (IS_DAX()).  VMAs without a file are never DAX.
 */
static inline bool vma_is_dax(const struct vm_area_struct *vma)
{
        if (!vma->vm_file)
                return false;

        return IS_DAX(vma->vm_file->f_mapping->host);
}

/*
 * True when @vma is a DAX mapping of a regular filesystem file rather than
 * of a device-dax character device.  Always false when CONFIG_FS_DAX is
 * disabled or the VMA has no backing file.
 */
static inline bool vma_is_fsdax(struct vm_area_struct *vma)
{
        if (!IS_ENABLED(CONFIG_FS_DAX))
                return false;
        if (!vma->vm_file || !vma_is_dax(vma))
                return false;

        /* A character-device inode means device-dax, not filesystem DAX. */
        return !S_ISCHR(file_inode(vma->vm_file)->i_mode);
}

/*
 * Translate the open flags stored in @file->f_flags into the matching
 * IOCB_* bits: O_APPEND -> IOCB_APPEND, O_DIRECT -> IOCB_DIRECT,
 * O_DSYNC -> IOCB_DSYNC and __O_SYNC -> IOCB_SYNC.  Other bits are ignored.
 */
static inline int iocb_flags(struct file *file)
{
        return ((file->f_flags & O_APPEND) ? IOCB_APPEND : 0) |
               ((file->f_flags & O_DIRECT) ? IOCB_DIRECT : 0) |
               ((file->f_flags & O_DSYNC) ? IOCB_DSYNC : 0) |
               ((file->f_flags & __O_SYNC) ? IOCB_SYNC : 0);
}

/*
 * Validate the per-call RWF_* @flags and fold them into @ki->ki_flags.
 *
 * @ki:      kiocb to update; @ki->ki_filp is consulted for capabilities
 * @flags:   RWF_* flags supplied for this I/O
 * @rw_type: direction of the I/O; RWF_ATOMIC is only accepted for WRITE
 *
 * Returns 0 on success (including when @flags is zero, in which case @ki
 * is left untouched), -EOPNOTSUPP for unknown flags or flags the file
 * cannot honour, -EINVAL when RWF_APPEND and RWF_NOAPPEND are combined,
 * and -EPERM when RWF_NOAPPEND is requested on an IS_APPEND() inode whose
 * kiocb already carries IOCB_APPEND.
 */
static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags,
                                     int rw_type)
{
        struct file *filp = ki->ki_filp;
        int kiocb_flags;

        /* make sure there's no overlap between RWF and private IOCB flags */
        BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD);

        if (!flags)
                return 0;
        if (unlikely(flags & ~RWF_SUPPORTED))
                return -EOPNOTSUPP;
        if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND)))
                return -EINVAL;

        /* The file must have been opened in a way that supports NOWAIT. */
        if ((flags & RWF_NOWAIT) && !(filp->f_mode & FMODE_NOWAIT))
                return -EOPNOTSUPP;

        /* Atomic I/O: writes only, and only on files that allow it. */
        if ((flags & RWF_ATOMIC) &&
            (rw_type != WRITE || !(filp->f_mode & FMODE_CAN_ATOMIC_WRITE)))
                return -EOPNOTSUPP;

        /* DONTCACHE needs file system support and is not valid on DAX. */
        if ((flags & RWF_DONTCACHE) &&
            (!(filp->f_op->fop_flags & FOP_DONTCACHE) ||
             IS_DAX(filp->f_mapping->host)))
                return -EOPNOTSUPP;

        kiocb_flags = (__force int) (flags & RWF_SUPPORTED);
        if (flags & RWF_SYNC)
                kiocb_flags |= IOCB_DSYNC;

        /* Dropping an existing append mode is refused on append-only inodes. */
        if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) {
                if (IS_APPEND(file_inode(filp)))
                        return -EPERM;
                ki->ki_flags &= ~IOCB_APPEND;
        }

        ki->ki_flags |= kiocb_flags;
        return 0;
}

/* Transaction based IO helpers */

/*
 * An argresp is stored in an allocated page and holds the
 * size of the argument or response, along with its content.
 */
struct simple_transaction_argresp {
        ssize_t size;
        char data[];
};

/* Bytes left for @data when the whole argresp occupies one page. */
#define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp))

/*
 * Transaction file helpers (declarations only): simple_transaction_get()
 * takes the user-space request buffer, simple_transaction_read() copies to
 * a user-space buffer, simple_transaction_set() takes a size @n.
 */
char *simple_transaction_get(struct file *file, const char __user *buf,
                                size_t size);
ssize_t simple_transaction_read(struct file *file, char __user *buf,
                                size_t size, loff_t *pos);
int simple_transaction_release(struct inode *inode, struct file *file);

void simple_transaction_set(struct file *file, size_t n);

/*
 * simple attribute files
 *
 * These attributes behave similarly to those in sysfs:
 *
 * Writing to an attribute immediately sets a value, an open file can be
 * written to multiple times.
 *
 * Reading from an attribute creates a buffer from the value that might get
 * read with multiple read calls. When the attribute has been read
 * completely, no further read calls are possible until the file is opened
 * again.
 *
 * All attributes contain a text representation of a numeric value
 * that are accessed with the get() and set() functions.
 *
 * DEFINE_SIMPLE_ATTRIBUTE_XSIGNED() defines a static open helper named
 * <__fops>_open plus a static const struct file_operations named __fops:
 *   __fops      - identifier of the file_operations object to define
 *   __get/__set - accessors forwarded to simple_attr_open()
 *   __fmt       - format string, forwarded to simple_attr_open() and
 *                 checked at compile time against a 0ull argument
 *   __is_signed - selects simple_attr_write_signed() (true) or
 *                 simple_attr_write() (false) as the .write handler
 * DEFINE_SIMPLE_ATTRIBUTE() and DEFINE_SIMPLE_ATTRIBUTE_SIGNED() are the
 * unsigned and signed shorthands.
 */
#define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed)        \
static int __fops ## _open(struct inode *inode, struct file *file)        \
{                                                                        \
        __simple_attr_check_format(__fmt, 0ull);                        \
        return simple_attr_open(inode, file, __get, __set, __fmt);        \
}                                                                        \
static const struct file_operations __fops = {                                \
        .owner         = THIS_MODULE,                                                \
        .open         = __fops ## _open,                                        \
        .release = simple_attr_release,                                        \
        .read         = simple_attr_read,                                        \
        .write         = (__is_signed) ? simple_attr_write_signed : simple_attr_write,        \
        .llseek         = generic_file_llseek,                                        \
}

#define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt)                \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false)

#define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt)        \
        DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true)

/*
 * Compile-time format checker: the function body is empty, but the
 * __printf(1, 2) annotation makes the compiler verify @fmt against the
 * variadic arguments.  DEFINE_SIMPLE_ATTRIBUTE_XSIGNED() calls it with the
 * attribute's format string and a 0ull argument.
 */
static inline __printf(1, 2)
void __simple_attr_check_format(const char *fmt, ...)
{
        /* don't do anything, just let the compiler check the arguments; */
}

/*
 * Backing implementation of the simple attribute files; these are the
 * functions referenced by DEFINE_SIMPLE_ATTRIBUTE_XSIGNED().
 */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt);
int simple_attr_release(struct inode *inode, struct file *file);
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos);
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos);
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                                 size_t len, loff_t *ppos);

struct ctl_table;
int __init list_bdev_fs_names(char *buf, size_t size);

/* FMODE_EXEC force-cast to a plain int. */
#define __FMODE_EXEC                ((__force int) FMODE_EXEC)

/*
 * Look up the O_ACCMODE bits of @x in a four-entry table whose values are
 * 04, 02, 06 and 06 for indices 0..3.
 * NOTE(review): these look like MAY_READ / MAY_WRITE permission masks -
 * confirm against the MAY_* definitions.
 */
#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
/*
 * Convert open flags to an fmode_t: the access-mode value is incremented
 * by one and masked with O_ACCMODE.  @flag is parenthesized so that
 * compound arguments such as (a | b) are incremented as a whole instead of
 * having "+ 1" bind to their last operand.
 */
#define OPEN_FMODE(flag) ((__force fmode_t)(((flag) + 1) & O_ACCMODE))

/* True when @mode has the set-user-ID or the set-group-ID bit set. */
static inline bool is_sxid(umode_t mode)
{
        const umode_t sxid_bits = S_ISUID | S_ISGID;

        return (mode & sxid_bits) != 0;
}

/*
 * Sticky-bit check for @inode inside directory @dir.  Returns 0 right away
 * when @dir does not have S_ISVTX set; otherwise the decision is delegated
 * to __check_sticky().
 */
static inline int check_sticky(struct mnt_idmap *idmap,
                               struct inode *dir, struct inode *inode)
{
        if (dir->i_mode & S_ISVTX)
                return __check_sticky(idmap, dir, inode);

        return 0;
}

/*
 * Set S_NOSEC in @inode->i_flags, but only when the inode is neither
 * setuid nor setgid and its superblock has SB_NOSEC set.
 */
static inline void inode_has_no_xattr(struct inode *inode)
{
        if (is_sxid(inode->i_mode))
                return;
        if (!(inode->i_sb->s_flags & SB_NOSEC))
                return;

        inode->i_flags |= S_NOSEC;
}

static inline bool is_root_inode(struct inode *inode)
{
        return inode == inode->i_sb->s_root->d_inode;
}

/*
 * Hand one directory entry to the context's actor callback.
 *
 * @ctx:     directory iteration context; its current ->pos is passed along
 * @name:    entry name
 * @namelen: length of @name
 * @ino:     inode number of the entry
 * @type:    entry type (dir_emit_dot() in this header passes DT_DIR)
 *
 * Returns whatever the actor returns.  dir_emit_dots() stops emitting when
 * this is false.
 */
static inline bool dir_emit(struct dir_context *ctx,
                            const char *name, int namelen,
                            u64 ino, unsigned type)
{
        return ctx->actor(ctx, name, namelen, ctx->pos, ino, type);
}
static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, ".", 1, ctx->pos,
                          file->f_path.dentry->d_inode->i_ino, DT_DIR);
}
static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx)
{
        return ctx->actor(ctx, "..", 2, ctx->pos,
                          d_parent_ino(file->f_path.dentry), DT_DIR);
}
/*
 * Emit "." and ".." as needed, driven by @ctx->pos: position 0 emits "."
 * and advances to 1, position 1 emits ".." and advances to 2.  Any other
 * position emits nothing.  Returns false as soon as an emit is refused
 * (leaving @ctx->pos at the refused entry), true otherwise.
 */
static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
{
        bool ok = true;

        if (ctx->pos == 0) {
                ok = dir_emit_dot(file, ctx);
                if (ok)
                        ctx->pos = 1;
        }
        if (ok && ctx->pos == 1) {
                ok = dir_emit_dotdot(file, ctx);
                if (ok)
                        ctx->pos = 2;
        }
        return ok;
}
/*
 * Release and immediately re-acquire the inode lock, then report whether
 * the directory is still usable (i.e. not IS_DEADDIR()).
 * NOTE(review): presumably meant to let other lock waiters make progress
 * during long directory walks - confirm with the callers.
 */
static inline bool dir_relax(struct inode *inode)
{
        inode_unlock(inode);
        inode_lock(inode);
        /* The directory may have been removed while the lock was dropped. */
        return !IS_DEADDIR(inode);
}

/*
 * Shared-lock counterpart of dir_relax(): drop and re-take the inode lock
 * in shared mode, then report whether the directory is still usable (not
 * IS_DEADDIR()).
 */
static inline bool dir_relax_shared(struct inode *inode)
{
        inode_unlock_shared(inode);
        inode_lock_shared(inode);
        /* The directory may have been removed while the lock was dropped. */
        return !IS_DEADDIR(inode);
}

extern bool path_noexec(const struct path *path);
extern void inode_nohighmem(struct inode *inode);

/* mm/fadvise.c */
/* Both take the file, an offset/length pair and the advice value. */
extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len,
                       int advice);
extern int generic_fadvise(struct file *file, loff_t offset, loff_t len,
                           int advice);

/*
 * Decide whether (@dfd, @path) names "the object @dfd itself", i.e. the
 * path is empty.
 *
 * Returns false for a negative @dfd.  Otherwise a NULL @path counts as
 * empty, and a non-NULL @path is empty when its first byte, read from user
 * space, is NUL.  A failing user-space read yields false.
 */
static inline bool vfs_empty_path(int dfd, const char __user *path)
{
        char first;

        if (dfd >= 0) {
                /* We now allow NULL to be used for empty path. */
                if (path == NULL)
                        return true;

                if (!unlikely(get_user(first, path)))
                        return first == '\0';
        }

        return false;
}

/*
 * NOTE(review): presumably validates an atomic write described by @iocb and
 * @iter; the return convention is not visible here - confirm against the
 * implementation.
 */
int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter);

#endif /* _LINUX_FS_H */

























































































































































































































































































  628 

  445 






  588 























  445 
  445 









  591 
  587 






































  146 


























































































































































































































































































































































































































































































































































































































































































































































































  441 





























































































































































































  274 












  274 










  274 








































































  442 

  441 





   35 







  417 






































































  153 














    3 
  274 






  274 



























  274 




















  417 
































































  152 












  154 

















   69 



  372 
















    8 
    8 



























  202 

















  253 









  254 









  254 






    9 
  247 













   36 



   36 


   36 



   36 












   34 







   34 






















   34 




































































































































   20 
  166 



  444 

















  159 

   33 

   17 

















  202 



  202 





























































































































  151 
  152 








































































  139 
























































































































































































































































































































































































































































































































































































































































  499 
  499 














  147 
  146 
















































   35 


   35 








   35 






   35 














   35 































































   34 

































































    8 

    8 









  445 













   35 





  445 

  445 
  445 




  445 


  445 


    8 







    8 






    8 











    8 




    8 






















































































































































































































































































































































































































































































   15 



   78 









   78 
   78 




   15 



  672 















 1169 















 1165 








  962 


  417 








 1204 



   44 





















    2 






    3 






 1163 


































  321 




  250 
   73 
   36 







   81 



  161 


















  317 


  317 











  321 



  159 


  208 

  165 








































































































































  202 




  202 

















  202 




























































































































































































































  375 
















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  265 








  265 




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  441 

  441 






































































































































































































































































































































































































































































































































































  254 










































































































































    8 




























































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Security-Enhanced Linux (SELinux) security module
 *
 *  This file contains the SELinux hook function implementations.
 *
 *  Authors:  Stephen Smalley, <stephen.smalley.work@gmail.com>
 *              Chris Vance, <cvance@nai.com>
 *              Wayne Salamon, <wsalamon@nai.com>
 *              James Morris <jmorris@redhat.com>
 *
 *  Copyright (C) 2001,2002 Networks Associates Technology, Inc.
 *  Copyright (C) 2003-2008 Red Hat, Inc., James Morris <jmorris@redhat.com>
 *                                           Eric Paris <eparis@redhat.com>
 *  Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *                            <dgoeddel@trustedcs.com>
 *  Copyright (C) 2006, 2007, 2009 Hewlett-Packard Development Company, L.P.
 *        Paul Moore <paul@paul-moore.com>
 *  Copyright (C) 2007 Hitachi Software Engineering Co., Ltd.
 *                       Yuichi Nakamura <ynakam@hitachisoft.jp>
 *  Copyright (C) 2016 Mellanox Technologies
 */

#include <linux/init.h>
#include <linux/kd.h>
#include <linux/kernel.h>
#include <linux/kernel_read_file.h>
#include <linux/errno.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/lsm_hooks.h>
#include <linux/xattr.h>
#include <linux/capability.h>
#include <linux/unistd.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/dcache.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/tty.h>
#include <net/icmp.h>
#include <net/ip.h>                /* for local_port_range[] */
#include <net/tcp.h>                /* struct or_callable used in sock_rcv_skb */
#include <net/inet_connection_sock.h>
#include <net/net_namespace.h>
#include <net/netlabel.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>        /* for network interface checks */
#include <net/netlink.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/dccp.h>
#include <linux/sctp.h>
#include <net/sctp/structs.h>
#include <linux/quota.h>
#include <linux/un.h>                /* for Unix socket types */
#include <net/af_unix.h>        /* for Unix socket types */
#include <linux/parser.h>
#include <linux/nfs_mount.h>
#include <net/ipv6.h>
#include <linux/hugetlb.h>
#include <linux/personality.h>
#include <linux/audit.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/posix-timers.h>
#include <linux/syslog.h>
#include <linux/user_namespace.h>
#include <linux/export.h>
#include <linux/msg.h>
#include <linux/shm.h>
#include <uapi/linux/shm.h>
#include <linux/bpf.h>
#include <linux/kernfs.h>
#include <linux/stringhash.h>        /* for hashlen_string() */
#include <uapi/linux/mount.h>
#include <linux/fsnotify.h>
#include <linux/fanotify.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/lsm.h>

#include "avc.h"
#include "objsec.h"
#include "netif.h"
#include "netnode.h"
#include "netport.h"
#include "ibpkey.h"
#include "xfrm.h"
#include "netlabel.h"
#include "audit.h"
#include "avc_ss.h"

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * number of xattrs SELinux supplies at inode initialisation - confirm at
 * the use site.
 */
#define SELINUX_INODE_INIT_XATTRS 1

/* Global (non-static) SELinux state object. */
struct selinux_state selinux_state;

/*
 * SECMARK reference count; selinux_secmark_enabled() treats a non-zero
 * value as "SECMARK enabled".
 */
static atomic_t selinux_secmark_refcount = ATOMIC_INIT(0);

#ifdef CONFIG_SECURITY_SELINUX_DEVELOP
/* Boot-time enforcing mode; zero unless "enforcing=" sets it. */
static int selinux_enforcing_boot __initdata;

/*
 * Handler for the "enforcing=" kernel command line parameter.
 *
 * When @str parses as an unsigned integer (kstrtoul() returns 0), any
 * non-zero value selects 1 and zero selects 0.  An unparsable value leaves
 * selinux_enforcing_boot untouched.  Always returns 1.
 */
static int __init enforcing_setup(char *str)
{
        unsigned long value;

        if (kstrtoul(str, 0, &value) == 0)
                selinux_enforcing_boot = !!value;

        return 1;
}
__setup("enforcing=", enforcing_setup);
#else
/* Without CONFIG_SECURITY_SELINUX_DEVELOP the boot-time mode is fixed at 1. */
#define selinux_enforcing_boot 1
#endif

/* Boot-time SELinux enable flag; defaults to 1. */
int selinux_enabled_boot __initdata = 1;
#ifdef CONFIG_SECURITY_SELINUX_BOOTPARAM
/*
 * Handler for the "selinux=" kernel command line parameter.
 *
 * When @str parses as an unsigned integer (kstrtoul() returns 0), any
 * non-zero value selects 1 and zero selects 0.  An unparsable value leaves
 * selinux_enabled_boot at its previous value.  Always returns 1.
 */
static int __init selinux_enabled_setup(char *str)
{
        unsigned long value;

        if (kstrtoul(str, 0, &value) == 0)
                selinux_enabled_boot = !!value;

        return 1;
}
__setup("selinux=", selinux_enabled_setup);
#endif

/*
 * Handler for the "checkreqprot=" kernel command line parameter.
 *
 * The value is parsed only so that an error can be logged when a non-zero
 * value is requested; nothing is stored.  Always returns 1.
 */
static int __init checkreqprot_setup(char *str)
{
        unsigned long value;

        if (kstrtoul(str, 0, &value) == 0 && value)
                pr_err("SELinux: checkreqprot set to 1 via kernel parameter.  This is no longer supported.\n");

        return 1;
}
__setup("checkreqprot=", checkreqprot_setup);

/**
 * selinux_secmark_enabled - Check to see if SECMARK is currently enabled
 *
 * Description:
 * SECMARK counts as enabled when the always_check_network policy capability
 * is set, or when the SECMARK reference counter is non-zero (i.e. at least
 * one SECMARK target is configured).  Returns true (1) if SECMARK is
 * enabled, false (0) if SECMARK is disabled.
 *
 */
static int selinux_secmark_enabled(void)
{
        if (selinux_policycap_alwaysnetwork())
                return 1;

        return atomic_read(&selinux_secmark_refcount) != 0;
}

/**
 * selinux_peerlbl_enabled - Check to see if peer labeling is currently enabled
 *
 * Description:
 * Peer labeling counts as enabled when the always_check_network policy
 * capability is set, or when either NetLabel or labeled IPSEC (xfrm) is
 * enabled.  Returns true (1) if any of these hold, false (0) otherwise.
 *
 */
static int selinux_peerlbl_enabled(void)
{
        if (selinux_policycap_alwaysnetwork())
                return 1;
        if (netlbl_enabled())
                return 1;

        return selinux_xfrm_enabled() ? 1 : 0;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET flush the netif, netnode and netport
 * caches and then call synchronize_net().  Other events are ignored.
 * Always returns 0.
 */
static int selinux_netcache_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_netif_flush();
        sel_netnode_flush();
        sel_netport_flush();
        synchronize_net();

        return 0;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET flush the InfiniBand pkey cache and
 * raise the LSM_POLICY_CHANGE blocking notifier.  Other events are ignored.
 * Always returns 0.
 */
static int selinux_lsm_notifier_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        sel_ib_pkey_flush();
        call_blocking_lsm_notifier(LSM_POLICY_CHANGE, NULL);

        return 0;
}

/*
 * Initialise the security state of the init task: both the current SID and
 * the "old" SID of current's real credentials become SECINITSID_KERNEL.
 */
static void cred_init_security(void)
{
        struct task_security_struct *init_tsec =
                selinux_cred(unrcu_pointer(current->real_cred));

        init_tsec->sid = SECINITSID_KERNEL;
        init_tsec->osid = SECINITSID_KERNEL;
}

/*
 * get the security ID of a set of credentials
 */
static inline u32 cred_sid(const struct cred *cred)
{
        const struct task_security_struct *tsec;

        tsec = selinux_cred(cred);
        return tsec->sid;
}

/*
 * Set up @ad as LSM_AUDIT_DATA_NET audit data backed by @net, recording the
 * interface index, socket and address family in @net.
 */
static void __ad_net_init(struct common_audit_data *ad,
                          struct lsm_network_audit *net,
                          int ifindex, struct sock *sk, u16 family)
{
        /* Fill in the network details first ... */
        net->family = family;
        net->sk = sk;
        net->netif = ifindex;

        /* ... then hook them up to the common audit data. */
        ad->u.net = net;
        ad->type = LSM_AUDIT_DATA_NET;
}

static void ad_net_init_from_sk(struct common_audit_data *ad,
                                struct lsm_network_audit *net,
                                struct sock *sk)
{
        __ad_net_init(ad, net, 0, sk, 0);
}

/*
 * Set up network audit data that identifies an interface: @ifindex and
 * @family are recorded and no socket is attached (sk is NULL).
 */
static void ad_net_init_from_iif(struct common_audit_data *ad,
                                 struct lsm_network_audit *net,
                                 int ifindex, u16 family)
{
        struct sock *no_sock = NULL;

        __ad_net_init(ad, net, ifindex, no_sock, family);
}

/*
 * Return the objective security ID of @task, i.e. the sid found in the
 * credentials returned by __task_cred().  The credentials are only
 * dereferenced inside an RCU read-side critical section.
 */
static inline u32 task_sid_obj(const struct task_struct *task)
{
        u32 objective_sid;

        rcu_read_lock();
        objective_sid = cred_sid(__task_cred(task));
        rcu_read_unlock();

        return objective_sid;
}

static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry);

/*
 * Try reloading an inode security label that has been marked as invalid.
 *
 * @inode:     inode whose label is checked
 * @dentry:    a dentry of @inode, or NULL if none is at hand
 * @may_sleep: whether sleeping, and therefore reloading the label, is allowed
 *
 * Returns -ECHILD when the label is invalid and @may_sleep is false,
 * 0 in every other case (including a reload attempt that failed).
 */
static int __inode_security_revalidate(struct inode *inode,
                                       struct dentry *dentry,
                                       bool may_sleep)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        might_sleep_if(may_sleep);

        /*
         * Nothing to do before SELinux is initialized or when the label is
         * already valid.  The check of isec->initialized is racy but
         * inode_doinit_with_dentry() will recheck with isec->lock held.
         */
        if (!selinux_initialized() ||
            !data_race(isec->initialized != LABEL_INITIALIZED))
                return 0;

        if (!may_sleep)
                return -ECHILD;

        /*
         * Try reloading the inode security label.  This will fail if
         * @dentry is NULL and no dentry for this inode can be found; in
         * that case, continue using the old label.
         */
        inode_doinit_with_dentry(inode, dentry);

        return 0;
}

/*
 * Get the security blob of @inode as-is, without attempting to revalidate
 * its label.
 */
static struct inode_security_struct *inode_security_novalidate(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        return isec;
}

/*
 * Get the security blob of @inode, revalidating the label first.
 *
 * When @rcu is true the revalidation is not allowed to sleep, so an invalid
 * label yields ERR_PTR(-ECHILD) instead of being reloaded.  No dentry is
 * supplied for the reload.
 */
static struct inode_security_struct *inode_security_rcu(struct inode *inode, bool rcu)
{
        const bool may_sleep = !rcu;
        int err = __inode_security_revalidate(inode, NULL, may_sleep);

        return err ? ERR_PTR(err) : selinux_inode(inode);
}

/*
 * Get the security label of an inode, reloading it first if it is marked
 * invalid.  The revalidation is allowed to sleep and gets no dentry; its
 * result is not checked here.
 */
static struct inode_security_struct *inode_security(struct inode *inode)
{
        struct dentry *no_dentry = NULL;

        __inode_security_revalidate(inode, no_dentry, true);

        return selinux_inode(inode);
}

/*
 * Get the security blob of a dentry's backing inode as-is, without
 * attempting to revalidate its label.
 */
static struct inode_security_struct *backing_inode_security_novalidate(struct dentry *dentry)
{
        return selinux_inode(d_backing_inode(dentry));
}

/*
 * Get the security label of a dentry's backing inode, reloading it first if
 * it is marked invalid.  @dentry itself is handed to the revalidation, which
 * is allowed to sleep.
 */
static struct inode_security_struct *backing_inode_security(struct dentry *dentry)
{
        struct inode *backing = d_backing_inode(dentry);

        __inode_security_revalidate(backing, dentry, true);

        return selinux_inode(backing);
}

/*
 * Detach the security blob of @inode from its superblock's list of inode
 * security structures, if it is on that list.  Does nothing when the inode
 * has no security blob.
 */
static void inode_free_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct superblock_security_struct *sbsec;

        if (!isec)
                return;

        sbsec = selinux_superblock(inode->i_sb);

        /*
         * Not every inode security structure sits on a list, so peek at the
         * list head without the lock first and skip the locking entirely
         * when there is nothing to unlink.
         *
         * list_del_init() may safely be called more than once.  A concurrent
         * list_add() should not be possible here, but list_empty_careful()
         * is used as a safeguard against future changes in the code.
         */
        if (list_empty_careful(&isec->list))
                return;

        spin_lock(&sbsec->isec_lock);
        list_del_init(&isec->list);
        spin_unlock(&sbsec->isec_lock);
}

/*
 * SIDs for the SELinux mount options.  selinux_set_mnt_opts() only acts on
 * a field when it is non-zero, so zero means "option not given".
 */
struct selinux_mnt_opts {
        u32 fscontext_sid;      /* becomes the superblock sid (sbsec->sid) */
        u32 context_sid;        /* becomes the mount point sid (sbsec->mntpoint_sid) */
        u32 rootcontext_sid;    /* becomes the sid of the root inode */
        u32 defcontext_sid;     /* becomes the default sid (sbsec->def_sid) */
};

/*
 * Release the mount options object behind @mnt_opts with kfree().
 */
static void selinux_free_mnt_opts(void *mnt_opts)
{
        struct selinux_mnt_opts *opts = mnt_opts;

        kfree(opts);
}

/* Identifiers of the SELinux mount options; Opt_error means "no match". */
enum {
        Opt_error = -1,
        Opt_context = 0,
        Opt_defcontext = 1,
        Opt_fscontext = 2,
        Opt_rootcontext = 3,
        Opt_seclabel = 4,
};

/* Build one table entry from the bare option name. */
#define SEL_MNT_TOKEN(opt_name, takes_arg) {            \
        .name = #opt_name,                              \
        .len = sizeof(#opt_name) - 1,                   \
        .opt = Opt_##opt_name,                          \
        .has_arg = takes_arg,                           \
}
/* Table of recognised option names, searched in order by match_opt_prefix(). */
static const struct {
        const char *name;       /* option name without any "=value" part */
        int len;                /* strlen(name) */
        int opt;                /* matching Opt_* identifier */
        bool has_arg;           /* true when the option is written "name=value" */
} tokens[] = {
        SEL_MNT_TOKEN(context, true),
        SEL_MNT_TOKEN(fscontext, true),
        SEL_MNT_TOKEN(defcontext, true),
        SEL_MNT_TOKEN(rootcontext, true),
        SEL_MNT_TOKEN(seclabel, false),
};
#undef SEL_MNT_TOKEN

/*
 * Identify the SELinux mount option at the start of @s.
 *
 * @s:   option text (need not be NUL terminated at @l)
 * @l:   number of characters of @s that belong to the option
 * @arg: for options that take a value, set to the first character after
 *       the '='; left untouched otherwise
 *
 * Options with a value must be followed by '='; options without a value
 * must span exactly @l characters.  Returns the Opt_* identifier of the
 * matching option, or Opt_error when nothing matches.
 */
static int match_opt_prefix(char *s, int l, char **arg)
{
        unsigned int idx;

        for (idx = 0; idx < ARRAY_SIZE(tokens); idx++) {
                size_t name_len = tokens[idx].len;

                /* @s has to start with the option name. */
                if (name_len > l || memcmp(s, tokens[idx].name, name_len) != 0)
                        continue;

                if (!tokens[idx].has_arg) {
                        /* Plain flag: nothing may follow the name. */
                        if (name_len != l)
                                continue;
                        return tokens[idx].opt;
                }

                /* Valued option: the name must be followed by '='. */
                if (name_len == l || s[name_len] != '=')
                        continue;

                *arg = s + name_len + 1;
                return tokens[idx].opt;
        }

        return Opt_error;
}

#define SEL_MOUNT_FAIL_MSG "SELinux:  duplicate or incompatible mount options\n"

/*
 * Check whether the task owning @cred may relabel a superblock to @sid.
 *
 * Two filesystem-class permissions are required, checked in this order:
 * relabelfrom on the superblock's current sid, then relabelto on @sid.
 * Returns 0 when both are granted, otherwise the first avc_has_perm() error.
 */
static int may_context_mount_sb_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct task_security_struct *tsec = selinux_cred(cred);
        int rc;

        rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                          FILESYSTEM__RELABELFROM, NULL);
        if (!rc)
                rc = avc_has_perm(tsec->sid, sid, SECCLASS_FILESYSTEM,
                                  FILESYSTEM__RELABELTO, NULL);

        return rc;
}

/*
 * Check whether the task owning @cred may label inodes of a superblock
 * with @sid.
 *
 * The task needs filesystem relabelfrom on the superblock's sid, and @sid
 * itself needs filesystem associate on the superblock's sid.  Returns 0
 * when both are granted, otherwise the first avc_has_perm() error.
 */
static int may_context_mount_inode_relabel(u32 sid,
                        struct superblock_security_struct *sbsec,
                        const struct cred *cred)
{
        const struct task_security_struct *tsec = selinux_cred(cred);
        int rc;

        rc = avc_has_perm(tsec->sid, sbsec->sid, SECCLASS_FILESYSTEM,
                          FILESYSTEM__RELABELFROM, NULL);
        if (!rc)
                rc = avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM,
                                  FILESYSTEM__ASSOCIATE, NULL);

        return rc;
}

/*
 * Special handling. Genfs but also in-core setxattr handler.
 *
 * Returns 1 when the filesystem type of @sb is one of sysfs, pstore,
 * debugfs, tracefs or rootfs, or - only if the cgroupseclabel policy
 * capability is enabled - cgroup or cgroup2.  Returns 0 otherwise.
 */
static int selinux_is_genfs_special_handling(struct super_block *sb)
{
        static const char * const special_fstypes[] = {
                "sysfs", "pstore", "debugfs", "tracefs", "rootfs",
        };
        const char *fstype = sb->s_type->name;
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(special_fstypes); i++) {
                if (!strcmp(fstype, special_fstypes[i]))
                        return 1;
        }

        if (!selinux_policycap_cgroupseclabel())
                return 0;

        return !strcmp(fstype, "cgroup") || !strcmp(fstype, "cgroup2");
}

static int selinux_is_sblabel_mnt(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);

        /*
         * IMPORTANT: Double-check logic in this function when adding a new
         * SECURITY_FS_USE_* definition!
         */
        BUILD_BUG_ON(SECURITY_FS_USE_MAX != 7);

        switch (sbsec->behavior) {
        case SECURITY_FS_USE_XATTR:
        case SECURITY_FS_USE_TRANS:
        case SECURITY_FS_USE_TASK:
        case SECURITY_FS_USE_NATIVE:
                return 1;

        case SECURITY_FS_USE_GENFS:
                return selinux_is_genfs_special_handling(sb);

        /* Never allow relabeling on context mounts */
        case SECURITY_FS_USE_MNTPOINT:
        case SECURITY_FS_USE_NONE:
        default:
                return 0;
        }
}

/*
 * Verify that the filesystem behind @sb can really serve the SELinux xattr,
 * switching the superblock to genfs labeling when it cannot.
 *
 * Returns 0 when xattrs are usable or the genfs fallback succeeded,
 * -EOPNOTSUPP when the fallback is not possible, or the getxattr error for
 * any other failure.
 */
static int sb_check_xattr_support(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        u32 genfs_sid;
        int rc;

        /*
         * Make sure that the xattr handler exists and that no
         * error other than -ENODATA is returned by getxattr on
         * the root directory.  -ENODATA is ok, as this may be
         * the first boot of the SELinux kernel before we have
         * assigned xattr values to the filesystem.
         */
        if (!(root_inode->i_opflags & IOP_XATTR)) {
                pr_warn("SELinux: (dev %s, type %s) has no xattr support\n",
                        sb->s_id, sb->s_type->name);
                goto fallback;
        }

        rc = __vfs_getxattr(root, root_inode, XATTR_NAME_SELINUX, NULL, 0);
        if (rc >= 0 || rc == -ENODATA)
                return 0;

        if (rc != -EOPNOTSUPP) {
                pr_warn("SELinux: (dev %s, type %s) getxattr errno %d\n",
                        sb->s_id, sb->s_type->name, -rc);
                return rc;
        }

        pr_warn("SELinux: (dev %s, type %s) has no security xattr handler\n",
                sb->s_id, sb->s_type->name);

fallback:
        /* No xattr support - try to fallback to genfs if possible. */
        rc = security_genfs_sid(sb->s_type->name, "/",
                                SECCLASS_DIR, &genfs_sid);
        if (rc)
                return -EOPNOTSUPP;

        pr_warn("SELinux: (dev %s, type %s) falling back to genfs\n",
                sb->s_id, sb->s_type->name);
        sbsec->behavior = SECURITY_FS_USE_GENFS;
        sbsec->sid = genfs_sid;

        return 0;
}

/*
 * Complete the SELinux setup of a superblock: verify xattr support when the
 * behavior is SECURITY_FS_USE_XATTR, mark the superblock initialized, set or
 * clear SBLABEL_MNT, label the root inode and then label every inode queued
 * on sbsec->isec_head.
 *
 * Returns the error from sb_check_xattr_support() if that check fails (in
 * which case nothing else is done), otherwise the result of initializing
 * the root inode.  Results of labeling the queued inodes are not reported.
 */
static int sb_finish_set_opts(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct inode *root_inode = d_backing_inode(root);
        int rc = 0;

        if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                rc = sb_check_xattr_support(sb);
                if (rc)
                        return rc;
        }

        sbsec->flags |= SE_SBINITIALIZED;

        /*
         * Explicitly set or clear SBLABEL_MNT.  It's not sufficient to simply
         * leave the flag untouched because sb_clone_mnt_opts might be handing
         * us a superblock that needs the flag to be cleared.
         */
        if (selinux_is_sblabel_mnt(sb))
                sbsec->flags |= SBLABEL_MNT;
        else
                sbsec->flags &= ~SBLABEL_MNT;

        /* Initialize the root inode. */
        rc = inode_doinit_with_dentry(root_inode, root);

        /* Initialize any other inodes associated with the superblock, e.g.
           inodes created prior to initial policy load or inodes created
           during get_sb by a pseudo filesystem that directly
           populates itself. */
        spin_lock(&sbsec->isec_lock);
        while (!list_empty(&sbsec->isec_head)) {
                struct inode_security_struct *isec =
                                list_first_entry(&sbsec->isec_head,
                                           struct inode_security_struct, list);
                struct inode *inode = isec->inode;
                /* Unlink the entry while the lock is still held. */
                list_del_init(&isec->list);
                /*
                 * The lock is dropped around igrab()/labeling/iput() and
                 * re-taken before the list is examined again.
                 * NOTE(review): presumably because labeling may sleep -
                 * confirm against inode_doinit_with_dentry().
                 */
                spin_unlock(&sbsec->isec_lock);
                inode = igrab(inode);
                /* Skip inodes that igrab() could not take a reference on. */
                if (inode) {
                        if (!IS_PRIVATE(inode))
                                inode_doinit_with_dentry(inode, NULL);
                        iput(inode);
                }
                spin_lock(&sbsec->isec_lock);
        }
        spin_unlock(&sbsec->isec_lock);
        return rc;
}

/*
 * Check a single mount option against the state already recorded in @sbsec.
 *
 * @flag is the *_MNT bit of the option, @old_sid the sid currently in effect
 * for it and @new_sid the sid now being requested.
 *
 * Returns 1 when the option is unacceptable, 0 otherwise:
 *  - on an already initialized superblock the option must have been set by
 *    the earlier mount and must carry the same sid;
 *  - on a superblock that is not yet initialized the option must not have
 *    been seen before, aka someone passed context=a,context=b.
 */
static int bad_option(struct superblock_security_struct *sbsec, char flag,
                      u32 old_sid, u32 new_sid)
{
        char mnt_flags = sbsec->flags & SE_MNTMASK;

        if (sbsec->flags & SE_SBINITIALIZED) {
                /* check if the old mount command had the same options */
                if (!(sbsec->flags & flag))
                        return 1;
                return old_sid != new_sid;
        }

        /* check if we were passed the same options twice */
        return (mnt_flags & flag) ? 1 : 0;
}

/*
 * Allow filesystems with binary mount data to explicitly set mount point
 * labeling information.
 *
 * @sb:             superblock being set up
 * @mnt_opts:       struct selinux_mnt_opts with the requested SIDs, or NULL
 *                  when no SELinux mount options were given
 * @kern_flags:     flags requested by the caller; only
 *                  SECURITY_LSM_NATIVE_LABELS is examined here
 * @set_kern_flags: receives the flags that were honoured; must be non-NULL
 *                  whenever @kern_flags is non-zero
 *
 * The whole operation runs under sbsec->lock.
 *
 * Returns 0 on success; -EINVAL for @kern_flags without @set_kern_flags,
 * for options supplied before SELinux is initialized, for options that
 * conflict with an earlier mount of the same superblock and for a
 * defcontext on an unsuitable labeling behavior; -EACCES for context
 * options on a non-whitelisted user namespace mount; otherwise the error
 * returned by a permission check or helper.
 */
static int selinux_set_mnt_opts(struct super_block *sb,
                                void *mnt_opts,
                                unsigned long kern_flags,
                                unsigned long *set_kern_flags)
{
        const struct cred *cred = current_cred();
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct dentry *root = sb->s_root;
        struct selinux_mnt_opts *opts = mnt_opts;
        struct inode_security_struct *root_isec;
        u32 fscontext_sid = 0, context_sid = 0, rootcontext_sid = 0;
        u32 defcontext_sid = 0;
        int rc = 0;

        /*
         * Specifying internal flags without providing a place to
         * place the results is not allowed
         */
        if (kern_flags && !set_kern_flags)
                return -EINVAL;

        mutex_lock(&sbsec->lock);

        if (!selinux_initialized()) {
                if (!opts) {
                        /* Defer initialization until selinux_complete_init,
                           after the initial policy is loaded and the security
                           server is ready to handle calls. */
                        if (kern_flags & SECURITY_LSM_NATIVE_LABELS) {
                                sbsec->flags |= SE_SBNATIVE;
                                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                        }
                        goto out;
                }
                rc = -EINVAL;
                pr_warn("SELinux: Unable to set superblock options "
                        "before the security server is initialized\n");
                goto out;
        }

        /*
         * Binary mount data FS will come through this function twice.  Once
         * from an explicit call and once from the generic calls from the vfs.
         * Since the generic VFS calls will not contain any security mount data
         * we need to skip the double mount verification.
         *
         * This does open a hole in which we will not notice if the first
         * mount using this sb set explicit options and a second mount using
         * this sb does not set any security options.  (The first options
         * will be used for both mounts)
         */
        if ((sbsec->flags & SE_SBINITIALIZED) && (sb->s_type->fs_flags & FS_BINARY_MOUNTDATA)
            && !opts)
                goto out;

        root_isec = backing_inode_security_novalidate(root);

        /*
         * parse the mount options, check if they are valid sids.
         * also check if someone is trying to mount the same sb more
         * than once with different security options.
         */
        if (opts) {
                if (opts->fscontext_sid) {
                        fscontext_sid = opts->fscontext_sid;
                        if (bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                                        fscontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= FSCONTEXT_MNT;
                }
                if (opts->context_sid) {
                        context_sid = opts->context_sid;
                        if (bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                                        context_sid))
                                goto out_double_mount;
                        sbsec->flags |= CONTEXT_MNT;
                }
                if (opts->rootcontext_sid) {
                        rootcontext_sid = opts->rootcontext_sid;
                        if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                                        rootcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= ROOTCONTEXT_MNT;
                }
                if (opts->defcontext_sid) {
                        defcontext_sid = opts->defcontext_sid;
                        if (bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                                        defcontext_sid))
                                goto out_double_mount;
                        sbsec->flags |= DEFCONTEXT_MNT;
                }
        }

        /*
         * An already initialized superblock only needed the consistency
         * checks above; nothing is reconfigured for it.
         */
        if (sbsec->flags & SE_SBINITIALIZED) {
                /* previously mounted with options, but not on this attempt? */
                if ((sbsec->flags & SE_MNTMASK) && !opts)
                        goto out_double_mount;
                rc = 0;
                goto out;
        }

        /* Flag the filesystem types that get genfs-related treatment. */
        if (strcmp(sb->s_type->name, "proc") == 0)
                sbsec->flags |= SE_SBPROC | SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "debugfs") ||
            !strcmp(sb->s_type->name, "tracefs") ||
            !strcmp(sb->s_type->name, "binder") ||
            !strcmp(sb->s_type->name, "bpf") ||
            !strcmp(sb->s_type->name, "pstore") ||
            !strcmp(sb->s_type->name, "securityfs"))
                sbsec->flags |= SE_SBGENFS;

        if (!strcmp(sb->s_type->name, "sysfs") ||
            !strcmp(sb->s_type->name, "cgroup") ||
            !strcmp(sb->s_type->name, "cgroup2"))
                sbsec->flags |= SE_SBGENFS | SE_SBGENFS_XATTR;

        if (!sbsec->behavior) {
                /*
                 * Determine the labeling behavior to use for this
                 * filesystem type.
                 */
                rc = security_fs_use(sb);
                if (rc) {
                        pr_warn("%s: security_fs_use(%s) returned %d\n",
                                        __func__, sb->s_type->name, rc);
                        goto out;
                }
        }

        /*
         * If this is a user namespace mount and the filesystem type is not
         * explicitly whitelisted, then no contexts are allowed on the command
         * line and security labels must be ignored.
         */
        if (sb->s_user_ns != &init_user_ns &&
            strcmp(sb->s_type->name, "tmpfs") &&
            strcmp(sb->s_type->name, "ramfs") &&
            strcmp(sb->s_type->name, "devpts") &&
            strcmp(sb->s_type->name, "overlay")) {
                if (context_sid || fscontext_sid || rootcontext_sid ||
                    defcontext_sid) {
                        rc = -EACCES;
                        goto out;
                }
                if (sbsec->behavior == SECURITY_FS_USE_XATTR) {
                        /*
                         * Replace xattr labeling by a mount point label
                         * computed from the current task's sid.
                         */
                        sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
                        rc = security_transition_sid(current_sid(),
                                                     current_sid(),
                                                     SECCLASS_FILE, NULL,
                                                     &sbsec->mntpoint_sid);
                        if (rc)
                                goto out;
                }
                goto out_set_opts;
        }

        /* sets the context of the superblock for the fs being mounted. */
        if (fscontext_sid) {
                rc = may_context_mount_sb_relabel(fscontext_sid, sbsec, cred);
                if (rc)
                        goto out;

                sbsec->sid = fscontext_sid;
        }

        /*
         * Switch to using mount point labeling behavior.
         * sets the label used on all file below the mountpoint, and will set
         * the superblock context if not already set.
         */
        if (sbsec->flags & SE_SBNATIVE) {
                /*
                 * This means we are initializing a superblock that has been
                 * mounted before the SELinux was initialized and the
                 * filesystem requested native labeling. We had already
                 * returned SECURITY_LSM_NATIVE_LABELS in *set_kern_flags
                 * in the original mount attempt, so now we just need to set
                 * the SECURITY_FS_USE_NATIVE behavior.
                 */
                sbsec->behavior = SECURITY_FS_USE_NATIVE;
        } else if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !context_sid) {
                sbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        if (context_sid) {
                if (!fscontext_sid) {
                        /* No fscontext given: context= also labels the sb. */
                        rc = may_context_mount_sb_relabel(context_sid, sbsec,
                                                          cred);
                        if (rc)
                                goto out;
                        sbsec->sid = context_sid;
                } else {
                        rc = may_context_mount_inode_relabel(context_sid, sbsec,
                                                             cred);
                        if (rc)
                                goto out;
                }
                /* Without an explicit rootcontext the root inode uses context=. */
                if (!rootcontext_sid)
                        rootcontext_sid = context_sid;

                sbsec->mntpoint_sid = context_sid;
                sbsec->behavior = SECURITY_FS_USE_MNTPOINT;
        }

        if (rootcontext_sid) {
                rc = may_context_mount_inode_relabel(rootcontext_sid, sbsec,
                                                     cred);
                if (rc)
                        goto out;

                root_isec->sid = rootcontext_sid;
                root_isec->initialized = LABEL_INITIALIZED;
        }

        if (defcontext_sid) {
                if (sbsec->behavior != SECURITY_FS_USE_XATTR &&
                        sbsec->behavior != SECURITY_FS_USE_NATIVE) {
                        rc = -EINVAL;
                        pr_warn("SELinux: defcontext option is "
                               "invalid for this filesystem type\n");
                        goto out;
                }

                /* The permission check is skipped when the default is unchanged. */
                if (defcontext_sid != sbsec->def_sid) {
                        rc = may_context_mount_inode_relabel(defcontext_sid,
                                                             sbsec, cred);
                        if (rc)
                                goto out;
                }

                sbsec->def_sid = defcontext_sid;
        }

out_set_opts:
        rc = sb_finish_set_opts(sb);
out:
        mutex_unlock(&sbsec->lock);
        return rc;
out_double_mount:
        rc = -EINVAL;
        pr_warn("SELinux: mount invalid.  Same superblock, different "
               "security settings for (dev %s, type %s)\n", sb->s_id,
               sb->s_type->name);
        goto out;
}

static int selinux_cmp_sb_context(const struct super_block *oldsb,
                                    const struct super_block *newsb)
{
        struct superblock_security_struct *old = selinux_superblock(oldsb);
        struct superblock_security_struct *new = selinux_superblock(newsb);
        char oldflags = old->flags & SE_MNTMASK;
        char newflags = new->flags & SE_MNTMASK;

        if (oldflags != newflags)
                goto mismatch;
        if ((oldflags & FSCONTEXT_MNT) && old->sid != new->sid)
                goto mismatch;
        if ((oldflags & CONTEXT_MNT) && old->mntpoint_sid != new->mntpoint_sid)
                goto mismatch;
        if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid)
                goto mismatch;
        if (oldflags & ROOTCONTEXT_MNT) {
                struct inode_security_struct *oldroot = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *newroot = backing_inode_security(newsb->s_root);
                if (oldroot->sid != newroot->sid)
                        goto mismatch;
        }
        return 0;
mismatch:
        pr_warn("SELinux: mount invalid.  Same superblock, "
                            "different security settings for (dev %s, "
                            "type %s)\n", newsb->s_id, newsb->s_type->name);
        return -EBUSY;
}

/*
 * Copy the SELinux mount settings of @oldsb onto @newsb.
 *
 * @oldsb:          superblock whose security settings are the source.
 * @newsb:          superblock receiving the settings.
 * @kern_flags:     flags from the caller; only SECURITY_LSM_NATIVE_LABELS
 *                  is examined here.
 * @set_kern_flags: out parameter that gets SECURITY_LSM_NATIVE_LABELS
 *                  or'ed in when native labeling is accepted.  Must be
 *                  non-NULL whenever @kern_flags is non-zero.
 *
 * Returns -EINVAL when @kern_flags is set without @set_kern_flags, the
 * result of selinux_cmp_sb_context() when @newsb is already initialized,
 * an error from security_fs_use(), or 0 otherwise.
 *
 * newsbsec->lock is held while @newsb's security state is modified.
 */
static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb,
                                        struct super_block *newsb,
                                        unsigned long kern_flags,
                                        unsigned long *set_kern_flags)
{
        int rc = 0;
        const struct superblock_security_struct *oldsbsec =
                                                selinux_superblock(oldsb);
        struct superblock_security_struct *newsbsec = selinux_superblock(newsb);

        /* Which context options were given when the old sb was mounted. */
        int set_fscontext =        (oldsbsec->flags & FSCONTEXT_MNT);
        int set_context =        (oldsbsec->flags & CONTEXT_MNT);
        int set_rootcontext =        (oldsbsec->flags & ROOTCONTEXT_MNT);

        /*
         * Specifying internal flags without providing a place to
         * place the results is not allowed.
         */
        if (kern_flags && !set_kern_flags)
                return -EINVAL;

        mutex_lock(&newsbsec->lock);

        /*
         * If the parent was able to be mounted it clearly had no special LSM
         * mount options.  Thus we can safely deal with this superblock later;
         * only remember (via SE_SBNATIVE) that native labels were requested.
         */
        if (!selinux_initialized()) {
                if (kern_flags & SECURITY_LSM_NATIVE_LABELS) {
                        newsbsec->flags |= SE_SBNATIVE;
                        *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                }
                goto out;
        }

        /* how can we clone if the old one wasn't set up?? */
        BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED));

        /*
         * If the fs is reusing a sb, make sure that the contexts match.
         * The lock is dropped here because this path returns directly
         * instead of going through the "out" label.
         */
        if (newsbsec->flags & SE_SBINITIALIZED) {
                mutex_unlock(&newsbsec->lock);
                if ((kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context)
                        *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
                return selinux_cmp_sb_context(oldsb, newsb);
        }

        newsbsec->flags = oldsbsec->flags;

        newsbsec->sid = oldsbsec->sid;
        newsbsec->def_sid = oldsbsec->def_sid;
        newsbsec->behavior = oldsbsec->behavior;

        /*
         * The old sb used native labels but the caller did not ask for them
         * on the new one (and no context= option applies): have
         * security_fs_use() determine the labeling behavior for newsb.
         */
        if (newsbsec->behavior == SECURITY_FS_USE_NATIVE &&
                !(kern_flags & SECURITY_LSM_NATIVE_LABELS) && !set_context) {
                rc = security_fs_use(newsb);
                if (rc)
                        goto out;
        }

        /* Native labels requested and not overridden by context=. */
        if (kern_flags & SECURITY_LSM_NATIVE_LABELS && !set_context) {
                newsbsec->behavior = SECURITY_FS_USE_NATIVE;
                *set_kern_flags |= SECURITY_LSM_NATIVE_LABELS;
        }

        /*
         * context= supplies the mountpoint SID; it is also used for the
         * superblock SID and the root inode SID unless fscontext= or
         * rootcontext= were given for those.
         */
        if (set_context) {
                u32 sid = oldsbsec->mntpoint_sid;

                if (!set_fscontext)
                        newsbsec->sid = sid;
                if (!set_rootcontext) {
                        struct inode_security_struct *newisec = backing_inode_security(newsb->s_root);
                        newisec->sid = sid;
                }
                newsbsec->mntpoint_sid = sid;
        }
        /* rootcontext=: the new root inode takes the old root inode's SID. */
        if (set_rootcontext) {
                const struct inode_security_struct *oldisec = backing_inode_security(oldsb->s_root);
                struct inode_security_struct *newisec = backing_inode_security(newsb->s_root);

                newisec->sid = oldisec->sid;
        }

        /* NOTE(review): the return value of sb_finish_set_opts() is discarded. */
        sb_finish_set_opts(newsb);
out:
        mutex_unlock(&newsbsec->lock);
        return rc;
}

/*
 * NOTE: the caller is responsible for freeing the memory even if on error.
 */
static int selinux_add_opt(int token, const char *s, void **mnt_opts)
{
        struct selinux_mnt_opts *opts = *mnt_opts;
        u32 *dst_sid;
        int rc;

        if (token == Opt_seclabel)
                /* eaten and completely ignored */
                return 0;
        if (!s)
                return -EINVAL;

        if (!selinux_initialized()) {
                pr_warn("SELinux: Unable to set superblock options before the security server is initialized\n");
                return -EINVAL;
        }

        if (!opts) {
                opts = kzalloc(sizeof(*opts), GFP_KERNEL);
                if (!opts)
                        return -ENOMEM;
                *mnt_opts = opts;
        }

        switch (token) {
        case Opt_context:
                if (opts->context_sid || opts->defcontext_sid)
                        goto err;
                dst_sid = &opts->context_sid;
                break;
        case Opt_fscontext:
                if (opts->fscontext_sid)
                        goto err;
                dst_sid = &opts->fscontext_sid;
                break;
        case Opt_rootcontext:
                if (opts->rootcontext_sid)
                        goto err;
                dst_sid = &opts->rootcontext_sid;
                break;
        case Opt_defcontext:
                if (opts->context_sid || opts->defcontext_sid)
                        goto err;
                dst_sid = &opts->defcontext_sid;
                break;
        default:
                WARN_ON(1);
                return -EINVAL;
        }
        rc = security_context_str_to_sid(s, dst_sid, GFP_KERNEL);
        if (rc)
                pr_warn("SELinux: security_context_str_to_sid (%s) failed with errno=%d\n",
                        s, rc);
        return rc;

err:
        pr_warn(SEL_MOUNT_FAIL_MSG);
        return -EINVAL;
}

/*
 * Write "=<context>" for @sid to the seq_file @m.
 *
 * The context string is wrapped in double quotes when it contains a comma,
 * and the characters '"', newline and backslash are escaped.  Nothing is
 * written when the SID cannot be converted.
 *
 * Returns the result of security_sid_to_context().
 */
static int show_sid(struct seq_file *m, u32 sid)
{
        char *ctx = NULL;
        u32 ctx_len;
        bool quote;
        int rc;

        rc = security_sid_to_context(sid, &ctx, &ctx_len);
        if (rc)
                goto done;

        quote = strchr(ctx, ',');

        seq_putc(m, '=');
        if (quote)
                seq_putc(m, '\"');
        seq_escape(m, ctx, "\"\n\\");
        if (quote)
                seq_putc(m, '\"');
done:
        kfree(ctx);
        return rc;
}

static int selinux_sb_show_options(struct seq_file *m, struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        int rc;

        if (!(sbsec->flags & SE_SBINITIALIZED))
                return 0;

        if (!selinux_initialized())
                return 0;

        if (sbsec->flags & FSCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, FSCONTEXT_STR);
                rc = show_sid(m, sbsec->sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & CONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, CONTEXT_STR);
                rc = show_sid(m, sbsec->mntpoint_sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & DEFCONTEXT_MNT) {
                seq_putc(m, ',');
                seq_puts(m, DEFCONTEXT_STR);
                rc = show_sid(m, sbsec->def_sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & ROOTCONTEXT_MNT) {
                struct dentry *root = sb->s_root;
                struct inode_security_struct *isec = backing_inode_security(root);
                seq_putc(m, ',');
                seq_puts(m, ROOTCONTEXT_STR);
                rc = show_sid(m, isec->sid);
                if (rc)
                        return rc;
        }
        if (sbsec->flags & SBLABEL_MNT) {
                seq_putc(m, ',');
                seq_puts(m, SECLABEL_STR);
        }
        return 0;
}

/*
 * Translate the file-type bits of @mode into the matching SELinux
 * security class.  Regular files and any unrecognized type map to
 * SECCLASS_FILE.
 */
static inline u16 inode_mode_to_security_class(umode_t mode)
{
        u16 sclass;

        switch (mode & S_IFMT) {
        case S_IFSOCK:
                sclass = SECCLASS_SOCK_FILE;
                break;
        case S_IFLNK:
                sclass = SECCLASS_LNK_FILE;
                break;
        case S_IFBLK:
                sclass = SECCLASS_BLK_FILE;
                break;
        case S_IFDIR:
                sclass = SECCLASS_DIR;
                break;
        case S_IFCHR:
                sclass = SECCLASS_CHR_FILE;
                break;
        case S_IFIFO:
                sclass = SECCLASS_FIFO_FILE;
                break;
        case S_IFREG:
        default:
                sclass = SECCLASS_FILE;
                break;
        }

        return sclass;
}

/*
 * Return 1 when @protocol is one that selects a TCP-class stream socket
 * (IPPROTO_IP, IPPROTO_TCP or IPPROTO_MPTCP), 0 otherwise.
 */
static inline int default_protocol_stream(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_TCP:
        case IPPROTO_MPTCP:
                return 1;
        default:
                return 0;
        }
}

/*
 * Return 1 when @protocol is one that selects a UDP-class datagram socket
 * (IPPROTO_IP or IPPROTO_UDP), 0 otherwise.
 */
static inline int default_protocol_dgram(int protocol)
{
        switch (protocol) {
        case IPPROTO_IP:
        case IPPROTO_UDP:
                return 1;
        default:
                return 0;
        }
}

/*
 * Map a socket's (family, type, protocol) triple to an SELinux security
 * class.
 *
 * The first switch handles the families that always have a dedicated
 * class.  The second switch is consulted only when
 * selinux_policycap_extsockclass() reports true; the same setting also
 * gates the SCTP and ICMP classes for PF_INET/PF_INET6.  Anything not
 * matched by either switch is reported as SECCLASS_SOCKET.
 */
static inline u16 socket_type_to_security_class(int family, int type, int protocol)
{
        bool extsockclass = selinux_policycap_extsockclass();

        switch (family) {
        case PF_UNIX:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        return SECCLASS_UNIX_STREAM_SOCKET;
                case SOCK_DGRAM:
                case SOCK_RAW:
                        return SECCLASS_UNIX_DGRAM_SOCKET;
                }
                /* Other PF_UNIX types fall through to SECCLASS_SOCKET. */
                break;
        case PF_INET:
        case PF_INET6:
                switch (type) {
                case SOCK_STREAM:
                case SOCK_SEQPACKET:
                        if (default_protocol_stream(protocol))
                                return SECCLASS_TCP_SOCKET;
                        else if (extsockclass && protocol == IPPROTO_SCTP)
                                return SECCLASS_SCTP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DGRAM:
                        if (default_protocol_dgram(protocol))
                                return SECCLASS_UDP_SOCKET;
                        else if (extsockclass && (protocol == IPPROTO_ICMP ||
                                                  protocol == IPPROTO_ICMPV6))
                                return SECCLASS_ICMP_SOCKET;
                        else
                                return SECCLASS_RAWIP_SOCKET;
                case SOCK_DCCP:
                        return SECCLASS_DCCP_SOCKET;
                default:
                        return SECCLASS_RAWIP_SOCKET;
                }
                /* Not reached: every case of the inner switch returns. */
                break;
        case PF_NETLINK:
                /* Netlink sockets are classified by protocol, not type. */
                switch (protocol) {
                case NETLINK_ROUTE:
                        return SECCLASS_NETLINK_ROUTE_SOCKET;
                case NETLINK_SOCK_DIAG:
                        return SECCLASS_NETLINK_TCPDIAG_SOCKET;
                case NETLINK_NFLOG:
                        return SECCLASS_NETLINK_NFLOG_SOCKET;
                case NETLINK_XFRM:
                        return SECCLASS_NETLINK_XFRM_SOCKET;
                case NETLINK_SELINUX:
                        return SECCLASS_NETLINK_SELINUX_SOCKET;
                case NETLINK_ISCSI:
                        return SECCLASS_NETLINK_ISCSI_SOCKET;
                case NETLINK_AUDIT:
                        return SECCLASS_NETLINK_AUDIT_SOCKET;
                case NETLINK_FIB_LOOKUP:
                        return SECCLASS_NETLINK_FIB_LOOKUP_SOCKET;
                case NETLINK_CONNECTOR:
                        return SECCLASS_NETLINK_CONNECTOR_SOCKET;
                case NETLINK_NETFILTER:
                        return SECCLASS_NETLINK_NETFILTER_SOCKET;
                case NETLINK_DNRTMSG:
                        return SECCLASS_NETLINK_DNRT_SOCKET;
                case NETLINK_KOBJECT_UEVENT:
                        return SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET;
                case NETLINK_GENERIC:
                        return SECCLASS_NETLINK_GENERIC_SOCKET;
                case NETLINK_SCSITRANSPORT:
                        return SECCLASS_NETLINK_SCSITRANSPORT_SOCKET;
                case NETLINK_RDMA:
                        return SECCLASS_NETLINK_RDMA_SOCKET;
                case NETLINK_CRYPTO:
                        return SECCLASS_NETLINK_CRYPTO_SOCKET;
                default:
                        return SECCLASS_NETLINK_SOCKET;
                }
        case PF_PACKET:
                return SECCLASS_PACKET_SOCKET;
        case PF_KEY:
                return SECCLASS_KEY_SOCKET;
        case PF_APPLETALK:
                return SECCLASS_APPLETALK_SOCKET;
        }

        /* Per-family classes that are only used with extsockclass. */
        if (extsockclass) {
                switch (family) {
                case PF_AX25:
                        return SECCLASS_AX25_SOCKET;
                case PF_IPX:
                        return SECCLASS_IPX_SOCKET;
                case PF_NETROM:
                        return SECCLASS_NETROM_SOCKET;
                case PF_ATMPVC:
                        return SECCLASS_ATMPVC_SOCKET;
                case PF_X25:
                        return SECCLASS_X25_SOCKET;
                case PF_ROSE:
                        return SECCLASS_ROSE_SOCKET;
                case PF_DECnet:
                        return SECCLASS_DECNET_SOCKET;
                case PF_ATMSVC:
                        return SECCLASS_ATMSVC_SOCKET;
                case PF_RDS:
                        return SECCLASS_RDS_SOCKET;
                case PF_IRDA:
                        return SECCLASS_IRDA_SOCKET;
                case PF_PPPOX:
                        return SECCLASS_PPPOX_SOCKET;
                case PF_LLC:
                        return SECCLASS_LLC_SOCKET;
                case PF_CAN:
                        return SECCLASS_CAN_SOCKET;
                case PF_TIPC:
                        return SECCLASS_TIPC_SOCKET;
                case PF_BLUETOOTH:
                        return SECCLASS_BLUETOOTH_SOCKET;
                case PF_IUCV:
                        return SECCLASS_IUCV_SOCKET;
                case PF_RXRPC:
                        return SECCLASS_RXRPC_SOCKET;
                case PF_ISDN:
                        return SECCLASS_ISDN_SOCKET;
                case PF_PHONET:
                        return SECCLASS_PHONET_SOCKET;
                case PF_IEEE802154:
                        return SECCLASS_IEEE802154_SOCKET;
                case PF_CAIF:
                        return SECCLASS_CAIF_SOCKET;
                case PF_ALG:
                        return SECCLASS_ALG_SOCKET;
                case PF_NFC:
                        return SECCLASS_NFC_SOCKET;
                case PF_VSOCK:
                        return SECCLASS_VSOCK_SOCKET;
                case PF_KCM:
                        return SECCLASS_KCM_SOCKET;
                case PF_QIPCRTR:
                        return SECCLASS_QIPCRTR_SOCKET;
                case PF_SMC:
                        return SECCLASS_SMC_SOCKET;
                case PF_XDP:
                        return SECCLASS_XDP_SOCKET;
                case PF_MCTP:
                        return SECCLASS_MCTP_SOCKET;
/* Build-time reminder: fails the build once PF_MAX grows past 46. */
#if PF_MAX > 46
#error New address family defined, please update this function.
#endif
                }
        }

        /* Generic fallback for everything not classified above. */
        return SECCLASS_SOCKET;
}

/*
 * Look up the genfs SID for @dentry.
 *
 * @dentry: dentry whose path within its filesystem is looked up.
 * @tclass: security class passed on to security_genfs_sid().
 * @flags:  superblock flags; SE_SBPROC enables stripping of a leading
 *          PID component from the path.
 * @sid:    receives the resulting SID.
 *
 * A path the policy does not know (-ENOENT) is not an error: @sid is set
 * to SECINITSID_UNLABELED and 0 is returned.  Returns -ENOMEM when the
 * scratch page cannot be allocated, or the error from dentry_path_raw()
 * or security_genfs_sid().
 */
static int selinux_genfs_get_sid(struct dentry *dentry,
                                 u16 tclass,
                                 u16 flags,
                                 u32 *sid)
{
        struct super_block *sb = dentry->d_sb;
        char *page, *path;
        int rc;

        page = (char *)__get_free_page(GFP_KERNEL);
        if (!page)
                return -ENOMEM;

        path = dentry_path_raw(dentry, page, PAGE_SIZE);
        if (IS_ERR(path)) {
                rc = PTR_ERR(path);
                goto out_free;
        }

        if (flags & SE_SBPROC) {
                /*
                 * Each process gets a /proc/PID/ entry.  Drop the PID part
                 * so the remainder is a valid path for selinux labeling,
                 * e.g. /proc/1/net/rpc/nfs -> /net/rpc/nfs
                 */
                for (; path[1] >= '0' && path[1] <= '9'; path++)
                        path[1] = '/';
        }

        rc = security_genfs_sid(sb->s_type->name, path, tclass, sid);
        if (rc == -ENOENT) {
                /* No match in policy, mark as unlabeled. */
                *sid = SECINITSID_UNLABELED;
                rc = 0;
        }

out_free:
        free_page((unsigned long)page);
        return rc;
}

/*
 * Derive an inode's SID from its XATTR_NAME_SELINUX extended attribute.
 *
 * @inode:   inode being labeled.
 * @dentry:  dentry for @inode, required by __vfs_getxattr().
 * @def_sid: SID to use when the inode has no such xattr; it is also handed
 *           to security_context_to_sid_default().
 * @sid:     receives the resulting SID.
 *
 * The xattr is first read into an INITCONTEXTLEN byte buffer; when that is
 * too small (-ERANGE) the required size is queried and the read is retried
 * with a buffer of that size.
 *
 * Returns 0 on success, which includes a missing xattr (-ENODATA, @sid is
 * set to @def_sid) and a context that could not be converted to a SID
 * (the failure is only logged).  Returns -ENOMEM on allocation failure or
 * the error from __vfs_getxattr() for any other read failure.
 */
static int inode_doinit_use_xattr(struct inode *inode, struct dentry *dentry,
                                  u32 def_sid, u32 *sid)
{
#define INITCONTEXTLEN 255
        char *context;
        unsigned int len;
        int rc;

        len = INITCONTEXTLEN;
        context = kmalloc(len + 1, GFP_NOFS);
        if (!context)
                return -ENOMEM;

        context[len] = '\0';
        rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, context, len);
        if (rc == -ERANGE) {
                kfree(context);

                /* Need a larger buffer.  Query for the right size. */
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX, NULL, 0);
                if (rc < 0)
                        return rc;

                len = rc;
                context = kmalloc(len + 1, GFP_NOFS);
                if (!context)
                        return -ENOMEM;

                context[len] = '\0';
                rc = __vfs_getxattr(dentry, inode, XATTR_NAME_SELINUX,
                                    context, len);
        }
        if (rc < 0) {
                kfree(context);
                if (rc != -ENODATA) {
                        /* i_ino is unsigned, so print it with %lu. */
                        pr_warn("SELinux: %s:  getxattr returned %d for dev=%s ino=%lu\n",
                                __func__, -rc, inode->i_sb->s_id, inode->i_ino);
                        return rc;
                }
                *sid = def_sid;
                return 0;
        }

        /*
         * The buffer comes from kmalloc() and only its last byte was
         * cleared.  Terminate the string right after the bytes actually
         * read so the %s log messages below cannot print uninitialized
         * memory when the xattr value is shorter than the buffer.
         */
        if ((unsigned int)rc < len)
                context[rc] = '\0';

        rc = security_context_to_sid_default(context, rc, sid,
                                             def_sid, GFP_NOFS);
        if (rc) {
                char *dev = inode->i_sb->s_id;
                unsigned long ino = inode->i_ino;

                if (rc == -EINVAL) {
                        pr_notice_ratelimited("SELinux: inode=%lu on dev=%s was found to have an invalid context=%s.  This indicates you may need to relabel the inode or the filesystem in question.\n",
                                              ino, dev, context);
                } else {
                        pr_warn("SELinux: %s:  context_to_sid(%s) returned %d for dev=%s ino=%lu\n",
                                __func__, context, -rc, dev, ino);
                }
        }
        /* A conversion failure is deliberately not reported to the caller. */
        kfree(context);
        return 0;
}

/*
 * The inode's security attributes must be initialized before first use.
 *
 * @inode:      inode to label.
 * @opt_dentry: dentry for @inode if the caller has one, otherwise NULL; a
 *              dentry is then looked up when the labeling behavior needs it.
 *
 * The label is marked LABEL_PENDING under isec->lock, computed with the
 * lock dropped according to the superblock's labeling behavior, and then
 * committed under the lock again if the state is still LABEL_PENDING.
 *
 * Returns 0 when the inode is (already) labeled, when labeling was deferred
 * because the superblock is not initialized yet, or when no dentry could be
 * found (the label is then left LABEL_INVALID).  Otherwise returns the
 * error from the helper that failed.
 */
static int inode_doinit_with_dentry(struct inode *inode, struct dentry *opt_dentry)
{
        struct superblock_security_struct *sbsec = NULL;
        struct inode_security_struct *isec = selinux_inode(inode);
        u32 task_sid, sid = 0;
        u16 sclass;
        struct dentry *dentry;
        int rc = 0;

        /* Unlocked check first; it is repeated below under isec->lock. */
        if (isec->initialized == LABEL_INITIALIZED)
                return 0;

        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_INITIALIZED)
                goto out_unlock;

        /* Refine the class from the file type while it is still SECCLASS_FILE. */
        if (isec->sclass == SECCLASS_FILE)
                isec->sclass = inode_mode_to_security_class(inode->i_mode);

        sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SE_SBINITIALIZED)) {
                /* Defer initialization until selinux_complete_init,
                   after the initial policy is loaded and the security
                   server is ready to handle calls. */
                spin_lock(&sbsec->isec_lock);
                if (list_empty(&isec->list))
                        list_add(&isec->list, &sbsec->isec_head);
                spin_unlock(&sbsec->isec_lock);
                goto out_unlock;
        }

        /*
         * Snapshot the fields needed below and mark the label as pending
         * before dropping the lock for the label computation.
         */
        sclass = isec->sclass;
        task_sid = isec->task_sid;
        sid = isec->sid;
        isec->initialized = LABEL_PENDING;
        spin_unlock(&isec->lock);

        switch (sbsec->behavior) {
        /*
         * In case of SECURITY_FS_USE_NATIVE we need to re-fetch the labels
         * via xattr when called from delayed_superblock_init().
         */
        case SECURITY_FS_USE_NATIVE:
        case SECURITY_FS_USE_XATTR:
                /* No xattr support on this inode: use the default SID. */
                if (!(inode->i_opflags & IOP_XATTR)) {
                        sid = sbsec->def_sid;
                        break;
                }
                /* Need a dentry, since the xattr API requires one.
                   Life would be simpler if we could just pass the inode. */
                if (opt_dentry) {
                        /* Called from d_instantiate or d_splice_alias. */
                        dentry = dget(opt_dentry);
                } else {
                        /*
                         * Called from selinux_complete_init, try to find a dentry.
                         * Some filesystems really want a connected one, so try
                         * that first.  We could split SECURITY_FS_USE_XATTR in
                         * two, depending upon that...
                         */
                        dentry = d_find_alias(inode);
                        if (!dentry)
                                dentry = d_find_any_alias(inode);
                }
                if (!dentry) {
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as these
                         * will get fixed up the next time we go through
                         * inode_doinit with a dentry, before these inodes could
                         * be used again by userspace.
                         */
                        goto out_invalid;
                }

                rc = inode_doinit_use_xattr(inode, dentry, sbsec->def_sid,
                                            &sid);
                dput(dentry);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_TASK:
                sid = task_sid;
                break;
        case SECURITY_FS_USE_TRANS:
                /* Default to the fs SID. */
                sid = sbsec->sid;

                /* Try to obtain a transition SID. */
                rc = security_transition_sid(task_sid, sid,
                                             sclass, NULL, &sid);
                if (rc)
                        goto out;
                break;
        case SECURITY_FS_USE_MNTPOINT:
                sid = sbsec->mntpoint_sid;
                break;
        default:
                /* Default to the fs superblock SID. */
                sid = sbsec->sid;

                /*
                 * genfs labeling; symlinks are included only when
                 * selinux_policycap_genfs_seclabel_symlinks() says so.
                 */
                if ((sbsec->flags & SE_SBGENFS) &&
                     (!S_ISLNK(inode->i_mode) ||
                      selinux_policycap_genfs_seclabel_symlinks())) {
                        /* We must have a dentry to determine the label on
                         * procfs inodes */
                        if (opt_dentry) {
                                /* Called from d_instantiate or
                                 * d_splice_alias. */
                                dentry = dget(opt_dentry);
                        } else {
                                /* Called from selinux_complete_init, try to
                                 * find a dentry.  Some filesystems really want
                                 * a connected one, so try that first.
                                 */
                                dentry = d_find_alias(inode);
                                if (!dentry)
                                        dentry = d_find_any_alias(inode);
                        }
                        /*
                         * This can be hit on boot when a file is accessed
                         * before the policy is loaded.  When we load policy we
                         * may find inodes that have no dentry on the
                         * sbsec->isec_head list.  No reason to complain as
                         * these will get fixed up the next time we go through
                         * inode_doinit() with a dentry, before these inodes
                         * could be used again by userspace.
                         */
                        if (!dentry)
                                goto out_invalid;
                        rc = selinux_genfs_get_sid(dentry, sclass,
                                                   sbsec->flags, &sid);
                        if (rc) {
                                dput(dentry);
                                goto out;
                        }

                        /*
                         * With SE_SBGENFS_XATTR the xattr is read as well,
                         * using the genfs SID as the default.
                         */
                        if ((sbsec->flags & SE_SBGENFS_XATTR) &&
                            (inode->i_opflags & IOP_XATTR)) {
                                rc = inode_doinit_use_xattr(inode, dentry,
                                                            sid, &sid);
                                if (rc) {
                                        dput(dentry);
                                        goto out;
                                }
                        }
                        dput(dentry);
                }
                break;
        }

out:
        /*
         * Commit the result only if the state is still LABEL_PENDING;
         * presumably it may have been changed while the lock was dropped.
         */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                if (rc) {
                        isec->initialized = LABEL_INVALID;
                        goto out_unlock;
                }
                isec->initialized = LABEL_INITIALIZED;
                isec->sid = sid;
        }

out_unlock:
        spin_unlock(&isec->lock);
        return rc;

out_invalid:
        /* No dentry was available: leave the label invalid, report success. */
        spin_lock(&isec->lock);
        if (isec->initialized == LABEL_PENDING) {
                isec->initialized = LABEL_INVALID;
                isec->sid = sid;
        }
        spin_unlock(&isec->lock);
        return 0;
}

/*
 * Convert a Linux signal number to the process-class permission that
 * guards sending it.  SIGCHLD, SIGKILL and SIGSTOP have dedicated
 * permissions; every other signal maps to PROCESS__SIGNAL.
 */
static inline u32 signal_to_av(int sig)
{
        switch (sig) {
        case SIGCHLD:
                /* Commonly granted from child to parent. */
                return PROCESS__SIGCHLD;
        case SIGKILL:
                /* Cannot be caught or ignored */
                return PROCESS__SIGKILL;
        case SIGSTOP:
                /* Cannot be caught or ignored */
                return PROCESS__SIGSTOP;
        default:
                /* All other signals. */
                return PROCESS__SIGNAL;
        }
}

#if CAP_LAST_CAP > 63
#error Fix SELinux to handle capabilities > 63.
#endif

/*
 * Check whether a task is allowed to use a capability.
 *
 * @cred:   credentials being checked; their SID is used as both source
 *          and target of the access check.
 * @cap:    capability number.
 * @opts:   CAP_OPT_NOAUDIT suppresses the audit call.
 * @initns: selects the capability classes when true and the *_USERNS
 *          classes when false.
 *
 * Returns the result of avc_audit() when auditing is enabled and it
 * reports an error, otherwise the result of avc_has_perm_noaudit().
 * A capability index other than 0 or 1 is logged and hits BUG().
 */
static int cred_has_capability(const struct cred *cred,
                               int cap, unsigned int opts, bool initns)
{
        struct common_audit_data ad;
        struct av_decision avd;
        u32 sid = cred_sid(cred);
        u32 av = CAP_TO_MASK(cap);
        u16 sclass;
        int rc, audit_rc;

        ad.type = LSM_AUDIT_DATA_CAP;
        ad.u.cap = cap;

        switch (CAP_TO_INDEX(cap)) {
        case 0:
                if (initns)
                        sclass = SECCLASS_CAPABILITY;
                else
                        sclass = SECCLASS_CAP_USERNS;
                break;
        case 1:
                if (initns)
                        sclass = SECCLASS_CAPABILITY2;
                else
                        sclass = SECCLASS_CAP2_USERNS;
                break;
        default:
                pr_err("SELinux:  out of range capability %d\n", cap);
                BUG();
                return -EINVAL;
        }

        rc = avc_has_perm_noaudit(sid, sid, sclass, av, 0, &avd);
        if (opts & CAP_OPT_NOAUDIT)
                return rc;

        audit_rc = avc_audit(sid, sid, sclass, av, &avd, rc, &ad);
        return audit_rc ? audit_rc : rc;
}

/*
 * Check whether a task has a particular permission to an inode.
 *
 * @cred:  credentials of the task.
 * @inode: inode being accessed; private inodes are always allowed.
 * @perms: requested permissions.
 * @adp:   optional audit data supplied by the caller (e.g. the dentry).
 *
 * Returns 0 for private inodes, otherwise the result of avc_has_perm().
 */
static int inode_has_perm(const struct cred *cred,
                          struct inode *inode,
                          u32 perms,
                          struct common_audit_data *adp)
{
        struct inode_security_struct *isec;
        u32 ssid;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        ssid = cred_sid(cred);
        isec = selinux_inode(inode);

        return avc_has_perm(ssid, isec->sid, isec->sclass, perms, adp);
}

/*
 * Dentry flavour of inode_has_perm(): checks @av against the backing inode
 * of @dentry and hands the dentry to the audit code so that it can more
 * easily generate the pathname if needed.  The inode's label is
 * revalidated before the check.
 */
static inline int dentry_has_perm(const struct cred *cred,
                                  struct dentry *dentry,
                                  u32 av)
{
        struct common_audit_data ad;
        struct inode *inode;

        inode = d_backing_inode(dentry);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        __inode_security_revalidate(inode, dentry, true);

        return inode_has_perm(cred, inode, av, &ad);
}

/*
 * Path flavour of inode_has_perm(): checks @av against the backing inode
 * of @path's dentry and hands a copy of the path to the audit code so that
 * it can more easily generate the pathname if needed.  The inode's label
 * is revalidated before the check.
 */
static inline int path_has_perm(const struct cred *cred,
                                const struct path *path,
                                u32 av)
{
        struct common_audit_data ad;
        struct inode *inode;

        inode = d_backing_inode(path->dentry);

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = *path;

        __inode_security_revalidate(inode, path->dentry, true);

        return inode_has_perm(cred, inode, av, &ad);
}

/*
 * Same as path_has_perm, but takes the inode from the file struct and
 * passes the file itself as audit data.
 */
static inline int file_path_has_perm(const struct cred *cred,
                                     struct file *file,
                                     u32 av)
{
        struct common_audit_data audit;

        audit.type = LSM_AUDIT_DATA_FILE;
        audit.u.file = file;

        return inode_has_perm(cred, file_inode(file), av, &audit);
}

#ifdef CONFIG_BPF_SYSCALL
static int bpf_fd_pass(const struct file *file, u32 sid);
#endif

/*
 * Check whether a task can use an open file descriptor to access an inode
 * in a given way.
 *
 * Access to the descriptor itself is checked first (FD__USE); it is
 * implicitly granted if the descriptor has the same SID as the process.
 * With CONFIG_BPF_SYSCALL the file is additionally passed through
 * bpf_fd_pass().  Finally inode_has_perm() checks @av against the file's
 * inode.  If @av is zero, access to the file is not checked, e.g. for
 * cases where only the descriptor is affected like seek.
 *
 * Returns 0 when access is granted, otherwise the first error encountered.
 */
static int file_has_perm(const struct cred *cred,
                         struct file *file,
                         u32 av)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode *inode = file_inode(file);
        struct common_audit_data ad;
        u32 sid = cred_sid(cred);
        int rc;

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        if (sid != fsec->sid) {
                rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, cred_sid(cred));
        if (rc)
                return rc;
#endif

        /* av is zero if only checking access to the descriptor. */
        if (!av)
                return 0;

        return inode_has_perm(cred, inode, av, &ad);
}

/*
 * Determine the label for an inode that might be unioned.
 *
 * @tsec:      security state of the creating task.
 * @dir:       directory the new inode is created in.
 * @name:      name of the new entry, passed to the transition lookup.
 * @tclass:    security class of the new inode.
 * @_new_isid: receives the chosen SID.
 *
 * The SID is, in order of preference: the mountpoint SID of an initialized
 * superblock that uses SECURITY_FS_USE_MNTPOINT, the task's create_sid
 * when the superblock has SBLABEL_MNT set and a create_sid exists, or the
 * result of security_transition_sid() between the task and @dir.
 *
 * Returns 0, or the error from security_transition_sid().
 */
static int
selinux_determine_inode_label(const struct task_security_struct *tsec,
                                 struct inode *dir,
                                 const struct qstr *name, u16 tclass,
                                 u32 *_new_isid)
{
        const struct superblock_security_struct *sbsec =
                                                selinux_superblock(dir->i_sb);
        const struct inode_security_struct *dsec;

        if ((sbsec->flags & SE_SBINITIALIZED) &&
            sbsec->behavior == SECURITY_FS_USE_MNTPOINT) {
                *_new_isid = sbsec->mntpoint_sid;
                return 0;
        }

        if ((sbsec->flags & SBLABEL_MNT) && tsec->create_sid) {
                *_new_isid = tsec->create_sid;
                return 0;
        }

        /* The directory's label is only looked up on this path. */
        dsec = inode_security(dir);
        return security_transition_sid(tsec->sid, dsec->sid, tclass,
                                       name, _new_isid);
}

/*
 * Check whether the current task can create a file of class tclass named
 * by dentry inside directory dir.
 *
 * Checks, in order: add_name/search on the directory, create on the label
 * the new object would receive, and associate between that label and the
 * filesystem.  Returns 0 if all pass, otherwise the first non-zero result.
 */
static int may_create(struct inode *dir,
                      struct dentry *dentry,
                      u16 tclass)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *dir_isec;
        struct superblock_security_struct *sbsec;
        struct common_audit_data ad;
        u32 task_sid, new_sid;
        int rc;

        dir_isec = inode_security(dir);
        sbsec = selinux_superblock(dir->i_sb);
        task_sid = tsec->sid;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        rc = avc_has_perm(task_sid, dir_isec->sid, SECCLASS_DIR,
                          DIR__ADD_NAME | DIR__SEARCH, &ad);
        if (!rc)
                rc = selinux_determine_inode_label(tsec, dir, &dentry->d_name,
                                                   tclass, &new_sid);
        if (!rc)
                rc = avc_has_perm(task_sid, new_sid, tclass, FILE__CREATE,
                                  &ad);
        if (!rc)
                rc = avc_has_perm(new_sid, sbsec->sid, SECCLASS_FILESYSTEM,
                                  FILESYSTEM__ASSOCIATE, &ad);

        return rc;
}

#define MAY_LINK        0
#define MAY_UNLINK        1
#define MAY_RMDIR        2

/*
 * Check whether the current task can link, unlink, or rmdir a
 * file/directory.
 *
 * kind is one of MAY_LINK, MAY_UNLINK or MAY_RMDIR.  The directory is
 * checked first (search plus add_name for a link, remove_name otherwise),
 * then the object itself for the operation-specific permission.  An
 * unrecognized kind is logged and, once the directory check has passed,
 * yields 0.
 */
static int may_link(struct inode *dir,
                    struct dentry *dentry,
                    int kind)
{
        struct inode_security_struct *dir_sec, *obj_sec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dir_av, obj_av;
        int rc;

        dir_sec = inode_security(dir);
        obj_sec = backing_inode_security(dentry);

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        dir_av = DIR__SEARCH |
                 (kind == MAY_LINK ? DIR__ADD_NAME : DIR__REMOVE_NAME);
        rc = avc_has_perm(sid, dir_sec->sid, SECCLASS_DIR, dir_av, &ad);
        if (rc)
                return rc;

        if (kind == MAY_LINK) {
                obj_av = FILE__LINK;
        } else if (kind == MAY_UNLINK) {
                obj_av = FILE__UNLINK;
        } else if (kind == MAY_RMDIR) {
                obj_av = DIR__RMDIR;
        } else {
                pr_warn("SELinux: %s:  unrecognized kind %d\n",
                        __func__, kind);
                return 0;
        }

        return avc_has_perm(sid, obj_sec->sid, obj_sec->sclass, obj_av, &ad);
}

/*
 * Check whether the current task can rename old_dentry (in old_dir) to
 * new_dentry (in new_dir).
 *
 * Source side: remove_name/search on the old directory, rename on the
 * object, plus reparent when a directory moves to a different parent.
 * Target side: add_name/search on the new directory (and remove_name when
 * the target already exists), plus rmdir/unlink on an existing target.
 * Returns 0 if all checks pass, otherwise the first non-zero result.
 */
static inline int may_rename(struct inode *old_dir,
                             struct dentry *old_dentry,
                             struct inode *new_dir,
                             struct dentry *new_dentry)
{
        struct inode_security_struct *src_dir_sec, *dst_dir_sec;
        struct inode_security_struct *src_sec, *dst_sec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        u32 dir_av;
        int src_is_dir, dst_is_dir;
        int rc;

        src_dir_sec = inode_security(old_dir);
        src_sec = backing_inode_security(old_dentry);
        src_is_dir = d_is_dir(old_dentry);
        dst_dir_sec = inode_security(new_dir);

        ad.type = LSM_AUDIT_DATA_DENTRY;

        /* Source side: audit records refer to the old dentry. */
        ad.u.dentry = old_dentry;
        rc = avc_has_perm(sid, src_dir_sec->sid, SECCLASS_DIR,
                          DIR__REMOVE_NAME | DIR__SEARCH, &ad);
        if (rc)
                return rc;

        rc = avc_has_perm(sid, src_sec->sid, src_sec->sclass,
                          FILE__RENAME, &ad);
        if (rc)
                return rc;

        if (src_is_dir && new_dir != old_dir) {
                rc = avc_has_perm(sid, src_sec->sid, src_sec->sclass,
                                  DIR__REPARENT, &ad);
                if (rc)
                        return rc;
        }

        /* Target side: audit records refer to the new dentry. */
        ad.u.dentry = new_dentry;
        dir_av = DIR__ADD_NAME | DIR__SEARCH;
        if (d_is_positive(new_dentry))
                dir_av |= DIR__REMOVE_NAME;
        rc = avc_has_perm(sid, dst_dir_sec->sid, SECCLASS_DIR, dir_av, &ad);
        if (rc)
                return rc;

        /* Nothing gets replaced when the target does not exist. */
        if (!d_is_positive(new_dentry))
                return 0;

        dst_sec = backing_inode_security(new_dentry);
        dst_is_dir = d_is_dir(new_dentry);
        return avc_has_perm(sid, dst_sec->sid, dst_sec->sclass,
                            (dst_is_dir ? DIR__RMDIR : FILE__UNLINK), &ad);
}

/*
 * Check whether the given credentials can perform a filesystem operation:
 * perms is checked in class SECCLASS_FILESYSTEM against the superblock's
 * SID, with ad as audit data.  Returns the avc_has_perm() result.
 */
static int superblock_has_perm(const struct cred *cred,
                               const struct super_block *sb,
                               u32 perms,
                               struct common_audit_data *ad)
{
        u32 sid = cred_sid(cred);
        struct superblock_security_struct *sbsec = selinux_superblock(sb);

        return avc_has_perm(sid, sbsec->sid, SECCLASS_FILESYSTEM, perms, ad);
}

/*
 * Convert a Linux mode and permission mask to an access vector.
 *
 * Directories map exec/write/read to search/write/read.  Everything else
 * maps exec/read to execute/read, and MAY_APPEND to append; MAY_WRITE maps
 * to write only when MAY_APPEND is not set.
 */
static inline u32 file_mask_to_av(int mode, int mask)
{
        u32 av = 0;

        if (S_ISDIR(mode)) {
                if (mask & MAY_EXEC)
                        av |= DIR__SEARCH;
                if (mask & MAY_WRITE)
                        av |= DIR__WRITE;
                if (mask & MAY_READ)
                        av |= DIR__READ;
                return av;
        }

        if (mask & MAY_EXEC)
                av |= FILE__EXECUTE;
        if (mask & MAY_READ)
                av |= FILE__READ;

        /* Append takes precedence over plain write. */
        if (mask & MAY_APPEND)
                av |= FILE__APPEND;
        else if (mask & MAY_WRITE)
                av |= FILE__WRITE;

        return av;
}

/*
 * Convert a Linux file to an access vector based on how it was opened:
 * read for FMODE_READ, and append or write for FMODE_WRITE depending on
 * whether O_APPEND is set.
 */
static inline u32 file_to_av(const struct file *file)
{
        u32 av = 0;

        if (file->f_mode & FMODE_READ)
                av |= FILE__READ;
        if (file->f_mode & FMODE_WRITE)
                av |= (file->f_flags & O_APPEND) ? FILE__APPEND : FILE__WRITE;

        /*
         * Neither readable nor writable: special file opened with flags 3
         * for ioctl-only use.
         */
        return av ? av : FILE__IOCTL;
}

/*
 * Convert a file to an access vector and include the correct open
 * permission: FILE__OPEN is added when the openperm policy capability is
 * enabled and the file does not live on sockfs.
 */
static inline u32 open_file_to_av(struct file *file)
{
        u32 av = file_to_av(file);
        struct inode *inode = file_inode(file);

        if (!selinux_policycap_openperm())
                return av;
        if (inode->i_sb->s_magic == SOCKFS_MAGIC)
                return av;

        return av | FILE__OPEN;
}

/* Hook functions begin here. */

static int selinux_binder_set_context_mgr(const struct cred *mgr)
{
        return avc_has_perm(current_sid(), cred_sid(mgr), SECCLASS_BINDER,
                            BINDER__SET_CONTEXT_MGR, NULL);
}

/*
 * Check a binder transaction from 'from' to 'to'.
 *
 * If the current task's SID differs from the sender's, the current task
 * must first be allowed to impersonate the sender.  The sender must then
 * be allowed to call the receiver.  Returns 0 if permitted, otherwise the
 * first non-zero avc_has_perm() result.
 */
static int selinux_binder_transaction(const struct cred *from,
                                      const struct cred *to)
{
        u32 caller_sid = current_sid();
        u32 src_sid = cred_sid(from);
        u32 dst_sid = cred_sid(to);
        int rc = 0;

        if (caller_sid != src_sid)
                rc = avc_has_perm(caller_sid, src_sid, SECCLASS_BINDER,
                                  BINDER__IMPERSONATE, NULL);
        if (rc)
                return rc;

        return avc_has_perm(src_sid, dst_sid,
                            SECCLASS_BINDER, BINDER__CALL, NULL);
}

static int selinux_binder_transfer_binder(const struct cred *from,
                                          const struct cred *to)
{
        return avc_has_perm(cred_sid(from), cred_sid(to),
                            SECCLASS_BINDER, BINDER__TRANSFER,
                            NULL);
}

/*
 * Check whether an open file may be passed over binder to 'to'.
 *
 * The receiver must be allowed to use the descriptor (unless it carries
 * the receiver's own SID), must pass bpf_fd_pass() when BPF is configured,
 * and must hold the permissions implied by how the file was opened on the
 * backing inode.  Private inodes skip the inode check.
 */
static int selinux_binder_transfer_file(const struct cred *from,
                                        const struct cred *to,
                                        const struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        struct dentry *dentry;
        u32 to_sid = cred_sid(to);
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = file->f_path;

        if (to_sid != fsec->sid) {
                rc = avc_has_perm(to_sid, fsec->sid, SECCLASS_FD, FD__USE,
                                  &ad);
                if (rc)
                        return rc;
        }

#ifdef CONFIG_BPF_SYSCALL
        rc = bpf_fd_pass(file, to_sid);
        if (rc)
                return rc;
#endif

        dentry = file->f_path.dentry;
        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;

        isec = backing_inode_security(dentry);
        return avc_has_perm(to_sid, isec->sid, isec->sclass,
                            file_to_av(file), &ad);
}

/*
 * Check whether the current task may access 'child' via ptrace.
 *
 * With PTRACE_MODE_READ set in mode, FILE__READ is checked in class
 * SECCLASS_FILE; otherwise PROCESS__PTRACE is checked in class
 * SECCLASS_PROCESS.  Both checks are between the current SID and the
 * child's SID.
 */
static int selinux_ptrace_access_check(struct task_struct *child,
                                       unsigned int mode)
{
        u32 tracer_sid = current_sid();
        u32 tracee_sid = task_sid_obj(child);

        if (!(mode & PTRACE_MODE_READ))
                return avc_has_perm(tracer_sid, tracee_sid, SECCLASS_PROCESS,
                                    PROCESS__PTRACE, NULL);

        return avc_has_perm(tracer_sid, tracee_sid, SECCLASS_FILE, FILE__READ,
                            NULL);
}

static int selinux_ptrace_traceme(struct task_struct *parent)
{
        return avc_has_perm(task_sid_obj(parent), task_sid_obj(current),
                            SECCLASS_PROCESS, PROCESS__PTRACE, NULL);
}

/*
 * Check whether the current task may read the capability sets of 'target'
 * (PROCESS__GETCAP).  The capability set arguments are not used here.
 */
static int selinux_capget(const struct task_struct *target, kernel_cap_t *effective,
                          kernel_cap_t *inheritable, kernel_cap_t *permitted)
{
        u32 target_sid = task_sid_obj(target);

        return avc_has_perm(current_sid(), target_sid, SECCLASS_PROCESS,
                            PROCESS__GETCAP, NULL);
}

static int selinux_capset(struct cred *new, const struct cred *old,
                          const kernel_cap_t *effective,
                          const kernel_cap_t *inheritable,
                          const kernel_cap_t *permitted)
{
        return avc_has_perm(cred_sid(old), cred_sid(new), SECCLASS_PROCESS,
                            PROCESS__SETCAP, NULL);
}

/*
 * (This comment used to live with the selinux_task_setuid hook,
 * which was removed).
 *
 * Since setuid only affects the current process, and since the SELinux
 * controls are not based on the Linux identity attributes, SELinux does not
 * need to control this operation.  However, SELinux does control the use of
 * the CAP_SETUID and CAP_SETGID capabilities using the capable hook.
 */

/*
 * Check whether cred holds capability cap.  The last argument passed to
 * cred_has_capability() says whether ns is the initial user namespace.
 */
static int selinux_capable(const struct cred *cred, struct user_namespace *ns,
                           int cap, unsigned int opts)
{
        const int in_init_ns = (ns == &init_user_ns);

        return cred_has_capability(cred, cap, opts, in_init_ns);
}

/*
 * Check a quotactl command against the superblock.
 *
 * Commands that modify quota state require FILESYSTEM__QUOTAMOD, commands
 * that only query it require FILESYSTEM__QUOTAGET.  A NULL superblock and
 * unrecognized commands are not checked here and yield 0.
 */
static int selinux_quotactl(int cmds, int type, int id, const struct super_block *sb)
{
        const struct cred *cred = current_cred();

        if (!sb)
                return 0;

        switch (cmds) {
        case Q_SYNC:
        case Q_QUOTAON:
        case Q_QUOTAOFF:
        case Q_SETINFO:
        case Q_SETQUOTA:
        case Q_XQUOTAOFF:
        case Q_XQUOTAON:
        case Q_XSETQLIM:
                return superblock_has_perm(cred, sb, FILESYSTEM__QUOTAMOD,
                                           NULL);
        case Q_GETFMT:
        case Q_GETINFO:
        case Q_GETQUOTA:
        case Q_XGETQUOTA:
        case Q_XGETQSTAT:
        case Q_XGETQSTATV:
        case Q_XGETNEXTQUOTA:
                return superblock_has_perm(cred, sb, FILESYSTEM__QUOTAGET,
                                           NULL);
        default:
                /* let the kernel handle invalid cmds */
                return 0;
        }
}

/* Check FILE__QUOTAON on dentry for the current credentials. */
static int selinux_quota_on(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__QUOTAON);
}

static int selinux_syslog(int type)
{
        switch (type) {
        case SYSLOG_ACTION_READ_ALL:        /* Read last kernel messages */
        case SYSLOG_ACTION_SIZE_BUFFER:        /* Return size of the log buffer */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_READ, NULL);
        case SYSLOG_ACTION_CONSOLE_OFF:        /* Disable logging to console */
        case SYSLOG_ACTION_CONSOLE_ON:        /* Enable logging to console */
        /* Set level of messages printed to console */
        case SYSLOG_ACTION_CONSOLE_LEVEL:
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__SYSLOG_CONSOLE,
                                    NULL);
        }
        /* All other syslog types */
        return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                            SECCLASS_SYSTEM, SYSTEM__SYSLOG_MOD, NULL);
}

/*
 * Check permission for allocating a new virtual mapping. Returns
 * 0 if permission is granted, negative error code if not.
 *
 * The CAP_SYS_ADMIN check is made with CAP_OPT_NOAUDIT: the selinux
 * permission check is not audited, as this is applied to all processes
 * that allocate mappings.  The mm and pages arguments are not used here.
 */
static int selinux_vm_enough_memory(struct mm_struct *mm, long pages)
{
        const struct cred *cred = current_cred();

        return cred_has_capability(cred, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT, true);
}

/* binprm security operations */

/*
 * Return the SID of the task currently ptracing 'current', or 0 if there
 * is none.  The tracer is looked up and read under rcu_read_lock().
 */
static u32 ptrace_parent_sid(void)
{
        struct task_struct *tracer;
        u32 sid;

        rcu_read_lock();
        tracer = ptrace_parent(current);
        sid = tracer ? task_sid_obj(tracer) : 0;
        rcu_read_unlock();

        return sid;
}

/*
 * Decide whether a SID transition on exec is acceptable when the exec is
 * subject to no_new_privs (NNP) and/or happens from a nosuid mount.
 *
 * Returns 0 when neither restriction applies, when the SID does not
 * change, or when the transition is allowed by policy or is bounded.
 * Otherwise returns -EPERM for NNP and -EACCES for nosuid only.
 */
static int check_nnp_nosuid(const struct linux_binprm *bprm,
                            const struct task_security_struct *old_tsec,
                            const struct task_security_struct *new_tsec)
{
        int nnp = (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS);
        int nosuid = !mnt_may_suid(bprm->file->f_path.mnt);
        u32 av = 0;

        /* Neither NNP nor nosuid: nothing to restrict. */
        if (!(nnp || nosuid))
                return 0;

        /* No change in credentials. */
        if (old_tsec->sid == new_tsec->sid)
                return 0;

        /*
         * If the policy enables the nnp_nosuid_transition policy capability,
         * then we permit transitions under NNP or nosuid if the
         * policy allows the corresponding permission between
         * the old and new contexts.
         */
        if (selinux_policycap_nnp_nosuid_transition()) {
                if (nnp)
                        av |= PROCESS2__NNP_TRANSITION;
                if (nosuid)
                        av |= PROCESS2__NOSUID_TRANSITION;
                if (!avc_has_perm(old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS2, av, NULL))
                        return 0;
        }

        /*
         * We also permit NNP or nosuid transitions to bounded SIDs,
         * i.e. SIDs that are guaranteed to only be allowed a subset
         * of the permissions of the current SID.
         */
        if (!security_bounded_transition(old_tsec->sid, new_tsec->sid))
                return 0;

        /*
         * On failure, preserve the errno values for NNP vs nosuid.
         * NNP:  Operation not permitted for caller.
         * nosuid:  Permission denied to file.
         */
        return nnp ? -EPERM : -EACCES;
}

/*
 * Compute the SELinux state of the credentials being prepared for an exec
 * and check that the resulting transition (or lack of one) is permitted.
 *
 * @bprm: the binary being executed; bprm->cred receives the new SIDs and
 *        bprm->per_clear / bprm->secureexec may be updated.
 *
 * Returns 0 on success, or a non-zero error when a required check fails
 * (the share and ptrace checks are reported as -EPERM).
 */
static int selinux_bprm_creds_for_exec(struct linux_binprm *bprm)
{
        const struct task_security_struct *old_tsec;
        struct task_security_struct *new_tsec;
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        struct inode *inode = file_inode(bprm->file);
        int rc;

        /* SELinux context only depends on initial program or script and not
         * the script interpreter */

        old_tsec = selinux_cred(current_cred());
        new_tsec = selinux_cred(bprm->cred);
        isec = inode_security(inode);

        /* Default to the current task SID. */
        new_tsec->sid = old_tsec->sid;
        new_tsec->osid = old_tsec->sid;

        /* Reset fs, key, and sock SIDs on execve. */
        new_tsec->create_sid = 0;
        new_tsec->keycreate_sid = 0;
        new_tsec->sockcreate_sid = 0;

        /*
         * Before policy is loaded, label any task outside kernel space
         * as SECINITSID_INIT, so that any userspace tasks surviving from
         * early boot end up with a label different from SECINITSID_KERNEL
         * (if the policy chooses to set SECINITSID_INIT != SECINITSID_KERNEL).
         */
        if (!selinux_initialized()) {
                new_tsec->sid = SECINITSID_INIT;
                /* also clear the exec_sid just in case */
                new_tsec->exec_sid = 0;
                return 0;
        }

        if (old_tsec->exec_sid) {
                /* An exec SID was set on the old credentials: use it. */
                new_tsec->sid = old_tsec->exec_sid;
                /* Reset exec SID on execve. */
                new_tsec->exec_sid = 0;

                /* Fail on NNP or nosuid if not an allowed transition. */
                rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
                if (rc)
                        return rc;
        } else {
                /* Check for a default transition on this program. */
                rc = security_transition_sid(old_tsec->sid,
                                             isec->sid, SECCLASS_PROCESS, NULL,
                                             &new_tsec->sid);
                if (rc)
                        return rc;

                /*
                 * Fallback to old SID on NNP or nosuid if not an allowed
                 * transition.
                 */
                rc = check_nnp_nosuid(bprm, old_tsec, new_tsec);
                if (rc)
                        new_tsec->sid = old_tsec->sid;
        }

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = bprm->file;

        if (new_tsec->sid == old_tsec->sid) {
                /* No SID change: the file must be executable without one. */
                rc = avc_has_perm(old_tsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__EXECUTE_NO_TRANS, &ad);
                if (rc)
                        return rc;
        } else {
                /* Check permissions for the transition. */
                rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__TRANSITION, &ad);
                if (rc)
                        return rc;

                /* The file must be an entrypoint for the new SID. */
                rc = avc_has_perm(new_tsec->sid, isec->sid,
                                  SECCLASS_FILE, FILE__ENTRYPOINT, &ad);
                if (rc)
                        return rc;

                /* Check for shared state */
                if (bprm->unsafe & LSM_UNSAFE_SHARE) {
                        rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
                                          SECCLASS_PROCESS, PROCESS__SHARE,
                                          NULL);
                        if (rc)
                                return -EPERM;
                }

                /* Make sure that anyone attempting to ptrace over a task that
                 * changes its SID has the appropriate permit */
                if (bprm->unsafe & LSM_UNSAFE_PTRACE) {
                        u32 ptsid = ptrace_parent_sid();
                        /* ptsid is 0 when no tracer was found. */
                        if (ptsid != 0) {
                                rc = avc_has_perm(ptsid, new_tsec->sid,
                                                  SECCLASS_PROCESS,
                                                  PROCESS__PTRACE, NULL);
                                if (rc)
                                        return -EPERM;
                        }
                }

                /* Clear any possibly unsafe personality bits on exec: */
                bprm->per_clear |= PER_CLEAR_ON_SETID;

                /* Enable secure mode for SIDs transitions unless
                   the noatsecure permission is granted between
                   the two SIDs, i.e. ahp returns 0. */
                rc = avc_has_perm(old_tsec->sid, new_tsec->sid,
                                  SECCLASS_PROCESS, PROCESS__NOATSECURE,
                                  NULL);
                /* A denial (non-zero rc) sets secureexec; it is not an error. */
                bprm->secureexec |= !!rc;
        }

        return 0;
}

/*
 * iterate_fd() callback: p is the credentials to check against.  Returns
 * fd + 1 when the credentials lack the access implied by how the file was
 * opened, and 0 when the access is permitted.
 */
static int match_file(const void *p, struct file *file, unsigned fd)
{
        if (file_has_perm(p, file, file_to_av(file)))
                return fd + 1;

        return 0;
}

/*
 * Derived from fs/exec.c:flush_old_files.
 *
 * Revalidate the controlling tty and every inherited open file against
 * cred.  The controlling tty is dropped if read/write access to it is
 * denied, and each descriptor that fails file_has_perm() is replaced with
 * a freshly opened selinux_null file.
 */
static inline void flush_unauthorized_files(const struct cred *cred,
                                            struct files_struct *files)
{
        struct file *file, *devnull = NULL;
        struct tty_struct *tty;
        int drop_tty = 0;
        unsigned n;

        tty = get_current_tty();
        if (tty) {
                spin_lock(&tty->files_lock);
                if (!list_empty(&tty->tty_files)) {
                        struct tty_file_private *file_priv;

                        /* Revalidate access to controlling tty.
                           Use file_path_has_perm on the tty path directly
                           rather than using file_has_perm, as this particular
                           open file may belong to another process and we are
                           only interested in the inode-based check here. */
                        file_priv = list_first_entry(&tty->tty_files,
                                                struct tty_file_private, list);
                        file = file_priv->file;
                        if (file_path_has_perm(cred, file, FILE__READ | FILE__WRITE))
                                drop_tty = 1;
                }
                spin_unlock(&tty->files_lock);
                tty_kref_put(tty);
        }
        /* Reset controlling tty. */
        if (drop_tty)
                no_tty();

        /* Revalidate access to inherited open files. */
        /* n is match_file()'s fd + 1 for a denied descriptor, or 0. */
        n = iterate_fd(files, 0, match_file, cred);
        if (!n) /* none found? */
                return;

        devnull = dentry_open(&selinux_null, O_RDWR, cred);
        /*
         * If the open fails, the replacement below runs with a NULL file.
         * NOTE(review): presumably that just closes the descriptor - confirm
         * against replace_fd().
         */
        if (IS_ERR(devnull))
                devnull = NULL;
        /* replace all the matching ones with this */
        do {
                replace_fd(n - 1, devnull, 0);
        } while ((n = iterate_fd(files, n, match_file, cred)) != 0);
        /* Drop the reference obtained from dentry_open(). */
        if (devnull)
                fput(devnull);
}

/*
 * Prepare a process for imminent new credential changes due to exec.
 *
 * Nothing is done unless the exec changes the task SID.  On a SID change
 * unauthorized open files are flushed, the parent death signal is cleared,
 * and soft resource limits are reset if the new SID may not inherit them.
 */
static void selinux_bprm_committing_creds(const struct linux_binprm *bprm)
{
        struct task_security_struct *new_tsec = selinux_cred(bprm->cred);
        int i;

        if (new_tsec->sid == new_tsec->osid)
                return;

        /* Close files for which the new task SID is not authorized. */
        flush_unauthorized_files(bprm->cred, current->files);

        /* Always clear parent death signal on SID transitions. */
        current->pdeath_signal = 0;

        /* Check whether the new SID can inherit resource limits from the old
         * SID.  If not, reset all soft limits to the lower of the current
         * task's hard limit and the init task's soft limit.
         *
         * Note that the setting of hard limits (even to lower them) can be
         * controlled by the setrlimit check.  The inclusion of the init task's
         * soft limit into the computation is to avoid resetting soft limits
         * higher than the default soft limit for cases where the default is
         * lower than the hard limit, e.g. RLIMIT_CORE or RLIMIT_STACK.
         */
        if (!avc_has_perm(new_tsec->osid, new_tsec->sid, SECCLASS_PROCESS,
                          PROCESS__RLIMITINH, NULL))
                return;

        /* protect against do_prlimit() */
        task_lock(current);
        for (i = 0; i < RLIM_NLIMITS; i++) {
                struct rlimit *rlim = current->signal->rlim + i;
                struct rlimit *initrlim = init_task.signal->rlim + i;

                rlim->rlim_cur = min(rlim->rlim_max, initrlim->rlim_cur);
        }
        task_unlock(current);

        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                update_rlimit_cpu(current, rlimit(RLIMIT_CPU));
}

/*
 * Clean up the process immediately after the installation of new credentials
 * due to exec.
 *
 * Does nothing when the task SID did not change.  Otherwise, signal state
 * is reset if the new SID may not inherit it, and the parent is woken up.
 * @bprm is not used by this function.
 */
static void selinux_bprm_committed_creds(const struct linux_binprm *bprm)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 osid, sid;
        int rc;

        osid = tsec->osid;
        sid = tsec->sid;

        /* No SID transition happened: nothing to clean up. */
        if (sid == osid)
                return;

        /* Check whether the new SID can inherit signal state from the old SID.
         * If not, clear itimers to avoid subsequent signal generation and
         * flush and unblock signals.
         *
         * This must occur _after_ the task SID has been updated so that any
         * kill done after the flush will be checked against the new SID.
         */
        rc = avc_has_perm(osid, sid, SECCLASS_PROCESS, PROCESS__SIGINH, NULL);
        if (rc) {
                clear_itimer();

                spin_lock_irq(&unrcu_pointer(current->sighand)->siglock);
                /* Signal state is left untouched if a fatal signal is pending. */
                if (!fatal_signal_pending(current)) {
                        flush_sigqueue(&current->pending);
                        flush_sigqueue(&current->signal->shared_pending);
                        flush_signal_handlers(current, 1);
                        sigemptyset(&current->blocked);
                        recalc_sigpending();
                }
                spin_unlock_irq(&unrcu_pointer(current->sighand)->siglock);
        }

        /* Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID. */
        read_lock(&tasklist_lock);
        __wake_up_parent(current, unrcu_pointer(current->real_parent));
        read_unlock(&tasklist_lock);
}

/* superblock security operations */

/*
 * Initialize the SELinux security blob of a new superblock: its lock,
 * inode list and list lock, and the default SIDs.  Always returns 0.
 */
static int selinux_sb_alloc_security(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);

        /* Default labels. */
        sbsec->sid = SECINITSID_UNLABELED;
        sbsec->mntpoint_sid = SECINITSID_UNLABELED;
        sbsec->def_sid = SECINITSID_FILE;

        /* Synchronization primitives and the inode security list. */
        mutex_init(&sbsec->lock);
        spin_lock_init(&sbsec->isec_lock);
        INIT_LIST_HEAD(&sbsec->isec_head);

        return 0;
}

/*
 * Return the length of the first option in s: the number of characters
 * before the first comma that is not inside double quotes, or the whole
 * string length if there is no such comma.
 */
static inline int opt_len(const char *s)
{
        bool in_quotes = false;
        int pos = 0;

        while (s[pos] != '\0') {
                if (s[pos] == '"')
                        in_quotes = !in_quotes;
                else if (s[pos] == ',' && !in_quotes)
                        break;
                pos++;
        }

        return pos;
}

/*
 * Split the SELinux mount options out of a comma-separated option string.
 *
 * @options:  option string, edited in place.  Options recognized by
 *            match_opt_prefix() are consumed; the remaining ones are
 *            compacted to the front of the buffer, still comma-separated.
 * @mnt_opts: receives the parsed options through selinux_add_opt().
 *
 * Returns 0 on success.  On failure returns a non-zero error (-ENOMEM if
 * duplicating an argument fails) after freeing and clearing *mnt_opts.
 */
static int selinux_sb_eat_lsm_opts(char *options, void **mnt_opts)
{
        char *from = options;        /* start of the option being examined */
        char *to = options;        /* where the next kept option is written */
        bool first = true;        /* no option has been kept yet */
        int rc;

        while (1) {
                /* Length of this option, up to the next unquoted comma. */
                int len = opt_len(from);
                int token;
                char *arg = NULL;

                token = match_opt_prefix(from, len, &arg);

                if (token != Opt_error) {
                        char *p, *q;

                        /* strip quotes */
                        if (arg) {
                                /* Compacts the argument in place, dropping '"'. */
                                for (p = q = arg; p < from + len; p++) {
                                        char c = *p;
                                        if (c != '"')
                                                *q++ = c;
                                }
                                /* Work on a NUL-terminated copy of the argument. */
                                arg = kmemdup_nul(arg, q - arg, GFP_KERNEL);
                                if (!arg) {
                                        rc = -ENOMEM;
                                        goto free_opt;
                                }
                        }
                        rc = selinux_add_opt(token, arg, mnt_opts);
                        /* The copy is freed here whether or not the call failed. */
                        kfree(arg);
                        arg = NULL;
                        if (unlikely(rc)) {
                                goto free_opt;
                        }
                } else {
                        /* Not matched by match_opt_prefix(): keep it in the string. */
                        if (!first) {        // copy with preceding comma
                                from--;
                                len++;
                        }
                        if (to != from)
                                memmove(to, from, len);
                        to += len;
                        first = false;
                }
                /* Stop at the end of the string, else skip the separator. */
                if (!from[len])
                        break;
                from += len + 1;
        }
        /* Terminate the compacted string of kept options. */
        *to = '\0';
        return 0;

free_opt:
        if (*mnt_opts) {
                selinux_free_mnt_opts(*mnt_opts);
                *mnt_opts = NULL;
        }
        return rc;
}

/*
 * Compare a set of mount options against an existing superblock.
 *
 * Returns 0 when they are compatible and 1 when they are not.  Each
 * context SID given in mnt_opts is compared, via bad_option(), with the
 * corresponding value already set on the superblock.
 */
static int selinux_sb_mnt_opts_compat(struct super_block *sb, void *mnt_opts)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct selinux_mnt_opts *opts = mnt_opts;

        /*
         * Superblock not initialized (i.e. no options) - reject if any
         * options specified, otherwise accept.
         */
        if (!(sbsec->flags & SE_SBINITIALIZED))
                return opts ? 1 : 0;

        /*
         * Superblock initialized and no options specified - reject if
         * superblock has any options set, otherwise accept.
         */
        if (!opts)
                return (sbsec->flags & SE_MNTMASK) ? 1 : 0;

        if (opts->fscontext_sid &&
            bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                       opts->fscontext_sid))
                return 1;

        if (opts->context_sid &&
            bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                       opts->context_sid))
                return 1;

        if (opts->rootcontext_sid) {
                struct inode_security_struct *root_isec;

                /* The root inode's label is only looked up when needed. */
                root_isec = backing_inode_security(sb->s_root);
                if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                               opts->rootcontext_sid))
                        return 1;
        }

        if (opts->defcontext_sid &&
            bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                       opts->defcontext_sid))
                return 1;

        return 0;
}

/*
 * Validate the security options supplied with a remount of @sb.
 *
 * Nothing is checked when the superblock has not been initialized or
 * when no options were supplied.  Otherwise each SID present in
 * @mnt_opts is handed to bad_option() together with the matching value
 * held for the superblock; the first one flagged causes a warning to be
 * logged and -EINVAL to be returned.  Returns 0 otherwise.
 */
static int selinux_sb_remount(struct super_block *sb, void *mnt_opts)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        struct selinux_mnt_opts *opts = mnt_opts;

        if (!opts || !(sbsec->flags & SE_SBINITIALIZED))
                return 0;

        if (opts->fscontext_sid &&
            bad_option(sbsec, FSCONTEXT_MNT, sbsec->sid,
                       opts->fscontext_sid))
                goto reject;

        if (opts->context_sid &&
            bad_option(sbsec, CONTEXT_MNT, sbsec->mntpoint_sid,
                       opts->context_sid))
                goto reject;

        if (opts->rootcontext_sid) {
                /* The root inode label is only looked up when needed. */
                struct inode_security_struct *root_isec =
                        backing_inode_security(sb->s_root);

                if (bad_option(sbsec, ROOTCONTEXT_MNT, root_isec->sid,
                               opts->rootcontext_sid))
                        goto reject;
        }

        if (opts->defcontext_sid &&
            bad_option(sbsec, DEFCONTEXT_MNT, sbsec->def_sid,
                       opts->defcontext_sid))
                goto reject;

        return 0;

reject:
        pr_warn("SELinux: unable to change security options "
               "during remount (dev %s, type=%s)\n", sb->s_id,
               sb->s_type->name);
        return -EINVAL;
}

/*
 * Check FILESYSTEM__MOUNT on @sb for the current credentials, with the
 * superblock's root dentry recorded as the audit object.
 */
static int selinux_sb_kern_mount(const struct super_block *sb)
{
        struct common_audit_data ad;

        ad.u.dentry = sb->s_root;
        ad.type = LSM_AUDIT_DATA_DENTRY;

        return superblock_has_perm(current_cred(), sb, FILESYSTEM__MOUNT,
                                   &ad);
}

static int selinux_sb_statfs(struct dentry *dentry)
{
        const struct cred *cred = current_cred();
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry->d_sb->s_root;
        return superblock_has_perm(cred, dentry->d_sb, FILESYSTEM__GETATTR, &ad);
}

/*
 * Mount permission check.
 *
 * A remount (MS_REMOUNT set in @flags) is checked as
 * FILESYSTEM__REMOUNT against the superblock behind @path; anything
 * else is checked as FILE__MOUNTON against @path itself.  @dev_name,
 * @type and @data are not consulted.
 */
static int selinux_mount(const char *dev_name,
                         const struct path *path,
                         const char *type,
                         unsigned long flags,
                         void *data)
{
        const struct cred *cred = current_cred();

        if (!(flags & MS_REMOUNT))
                return path_has_perm(cred, path, FILE__MOUNTON);

        return superblock_has_perm(cred, path->dentry->d_sb,
                                   FILESYSTEM__REMOUNT, NULL);
}

/*
 * Moving a mount is checked as FILE__MOUNTON on the destination
 * @to_path; @from_path is not examined.
 */
static int selinux_move_mount(const struct path *from_path,
                              const struct path *to_path)
{
        return path_has_perm(current_cred(), to_path, FILE__MOUNTON);
}

/*
 * Check FILESYSTEM__UNMOUNT on the superblock of @mnt for the current
 * credentials.  @flags is not consulted and no audit data is attached.
 */
static int selinux_umount(struct vfsmount *mnt, int flags)
{
        struct super_block *sb = mnt->mnt_sb;

        return superblock_has_perm(current_cred(), sb, FILESYSTEM__UNMOUNT,
                                   NULL);
}

/*
 * Seed the security options of a submount's @fc from the @reference
 * superblock.
 *
 * The fscontext, context and defcontext SIDs are copied for whichever
 * of the corresponding flags the reference superblock has set.
 * Returns 0 on success or -ENOMEM if the options cannot be allocated.
 */
static int selinux_fs_context_submount(struct fs_context *fc,
                                   struct super_block *reference)
{
        const struct superblock_security_struct *sbsec = selinux_superblock(reference);
        struct selinux_mnt_opts *inherit;

        /*
         * With none of the three flags set there is nothing to pass
         * on; fc->security must then stay NULL, as
         * selinux_set_mnt_opts() expects.
         */
        if ((sbsec->flags & (FSCONTEXT_MNT|CONTEXT_MNT|DEFCONTEXT_MNT)) == 0)
                return 0;

        inherit = kzalloc(sizeof(*inherit), GFP_KERNEL);
        if (inherit == NULL)
                return -ENOMEM;

        if (sbsec->flags & DEFCONTEXT_MNT)
                inherit->defcontext_sid = sbsec->def_sid;
        if (sbsec->flags & CONTEXT_MNT)
                inherit->context_sid = sbsec->mntpoint_sid;
        if (sbsec->flags & FSCONTEXT_MNT)
                inherit->fscontext_sid = sbsec->sid;

        fc->security = inherit;
        return 0;
}

/*
 * Duplicate the SELinux mount options of @src_fc into @fc.
 *
 * When the source has no options @fc is left untouched and 0 is
 * returned.  Otherwise fc->security receives a copy made with
 * kmemdup(); -ENOMEM is returned if that copy could not be made.
 */
static int selinux_fs_context_dup(struct fs_context *fc,
                                  struct fs_context *src_fc)
{
        const struct selinux_mnt_opts *src = src_fc->security;
        struct selinux_mnt_opts *copy;

        if (src == NULL)
                return 0;

        copy = kmemdup(src, sizeof(*src), GFP_KERNEL);
        fc->security = copy;
        if (copy == NULL)
                return -ENOMEM;

        return 0;
}

/*
 * Mount parameters handled by SELinux: four string-valued context
 * options and the seclabel flag.  The table ends with an empty
 * sentinel entry and is the one passed to fs_parse() by
 * selinux_fs_context_parse_param().
 */
static const struct fs_parameter_spec selinux_fs_parameters[] = {
        fsparam_string(CONTEXT_STR,        Opt_context),
        fsparam_string(DEFCONTEXT_STR,        Opt_defcontext),
        fsparam_string(FSCONTEXT_STR,        Opt_fscontext),
        fsparam_string(ROOTCONTEXT_STR,        Opt_rootcontext),
        fsparam_flag  (SECLABEL_STR,        Opt_seclabel),
        {}
};

static int selinux_fs_context_parse_param(struct fs_context *fc,
                                          struct fs_parameter *param)
{
        struct fs_parse_result result;
        int opt;

        opt = fs_parse(fc, selinux_fs_parameters, param, &result);
        if (opt < 0)
                return opt;

        return selinux_add_opt(opt, param->string, &fc->security);
}

/* inode security operations */

/*
 * Put the SELinux state of a freshly allocated @inode into its default
 * state: unlabeled SID, file class, label marked invalid, and the
 * current task's SID recorded as task_sid.  Always returns 0.
 */
static int selinux_inode_alloc_security(struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);

        isec->inode = inode;
        spin_lock_init(&isec->lock);
        INIT_LIST_HEAD(&isec->list);

        isec->sclass = SECCLASS_FILE;
        isec->sid = SECINITSID_UNLABELED;
        isec->task_sid = current_sid();
        isec->initialized = LABEL_INVALID;

        return 0;
}

/*
 * Hook wrapper: hands @inode to inode_free_security(); no other work
 * is done here.
 */
static void selinux_inode_free_security(struct inode *inode)
{
        inode_free_security(inode);
}

static int selinux_dentry_init_security(struct dentry *dentry, int mode,
                                        const struct qstr *name,
                                        const char **xattr_name,
                                        struct lsm_context *cp)
{
        u32 newsid;
        int rc;

        rc = selinux_determine_inode_label(selinux_cred(current_cred()),
                                           d_inode(dentry->d_parent), name,
                                           inode_mode_to_security_class(mode),
                                           &newsid);
        if (rc)
                return rc;

        if (xattr_name)
                *xattr_name = XATTR_NAME_SELINUX;

        cp->id = LSM_ID_SELINUX;
        return security_sid_to_context(newsid, &cp->context, &cp->len);
}

static int selinux_dentry_create_files_as(struct dentry *dentry, int mode,
                                          struct qstr *name,
                                          const struct cred *old,
                                          struct cred *new)
{
        u32 newsid;
        int rc;
        struct task_security_struct *tsec;

        rc = selinux_determine_inode_label(selinux_cred(old),
                                           d_inode(dentry->d_parent), name,
                                           inode_mode_to_security_class(mode),
                                           &newsid);
        if (rc)
                return rc;

        tsec = selinux_cred(new);
        tsec->create_sid = newsid;
        return 0;
}

/*
 * Label a newly created @inode in directory @dir and, when an xattr
 * slot is available, supply the security xattr to be written for it.
 *
 * The SID is determined from the current task's credentials, @dir and
 * @qstr.  The in-core label is only set when the superblock of @dir is
 * already initialized.
 *
 * Returns 0 on success, -EOPNOTSUPP when SELinux is not initialized or
 * the superblock lacks SBLABEL_MNT, or another negative errno.
 */
static int selinux_inode_init_security(struct inode *inode, struct inode *dir,
                                       const struct qstr *qstr,
                                       struct xattr *xattrs, int *xattr_count)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct superblock_security_struct *sbsec = selinux_superblock(dir->i_sb);
        struct xattr *slot = lsm_get_xattr_slot(xattrs, xattr_count);
        u16 sclass = inode_mode_to_security_class(inode->i_mode);
        u32 sid = tsec->create_sid;
        u32 ctx_len;
        char *ctx;
        int err;

        err = selinux_determine_inode_label(tsec, dir, qstr, sclass, &sid);
        if (err)
                return err;

        /* Possibly defer initialization to selinux_complete_init. */
        if (sbsec->flags & SE_SBINITIALIZED) {
                struct inode_security_struct *isec = selinux_inode(inode);

                isec->sclass = sclass;
                isec->sid = sid;
                isec->initialized = LABEL_INITIALIZED;
        }

        if (!(selinux_initialized() && (sbsec->flags & SBLABEL_MNT)))
                return -EOPNOTSUPP;

        /* No slot means the caller does not want the xattr value. */
        if (!slot)
                return 0;

        err = security_sid_to_context_force(sid, &ctx, &ctx_len);
        if (err)
                return err;

        slot->name = XATTR_SELINUX_SUFFIX;
        slot->value = ctx;
        slot->value_len = ctx_len;

        return 0;
}

/*
 * Label an anonymous @inode and check that the current task may create
 * it.
 *
 * With a @context_inode the class and SID are copied from it; it must
 * already carry an initialized label, otherwise -EACCES is returned.
 * Without one the inode gets class SECCLASS_ANON_INODE and a SID
 * computed by security_transition_sid() from the task SID and @name.
 *
 * Returns 0 immediately when SELinux is not initialized; otherwise the
 * result of the FILE__CREATE check or a negative errno.
 */
static int selinux_inode_init_security_anon(struct inode *inode,
                                            const struct qstr *name,
                                            const struct inode *context_inode)
{
        struct inode_security_struct *isec;
        struct common_audit_data ad;
        u32 sid = current_sid();
        int rc;

        if (unlikely(!selinux_initialized()))
                return 0;

        /*
         * We only get here once per ephemeral inode.  The inode has
         * been initialized via inode_alloc_security but is otherwise
         * untouched.
         */
        isec = selinux_inode(inode);

        if (!context_inode) {
                isec->sclass = SECCLASS_ANON_INODE;
                rc = security_transition_sid(sid, sid, isec->sclass, name,
                                             &isec->sid);
                if (rc)
                        return rc;
        } else {
                struct inode_security_struct *src =
                        selinux_inode(context_inode);

                if (src->initialized != LABEL_INITIALIZED) {
                        pr_err("SELinux:  context_inode is not initialized\n");
                        return -EACCES;
                }

                isec->sclass = src->sclass;
                isec->sid = src->sid;
        }

        isec->initialized = LABEL_INITIALIZED;

        /*
         * The label is in place; now check whether the task is allowed
         * to create this type of anonymous inode.
         */
        ad.type = LSM_AUDIT_DATA_ANONINODE;
        ad.u.anonclass = name ? (const char *)name->name : "?";

        return avc_has_perm(sid, isec->sid, isec->sclass, FILE__CREATE, &ad);
}

/*
 * Hook wrapper: returns may_create() for @dentry in @dir with class
 * SECCLASS_FILE.  @mode is not consulted.
 */
static int selinux_inode_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
        return may_create(dir, dentry, SECCLASS_FILE);
}

/*
 * Hook wrapper: returns may_link() with MAY_LINK for @old_dentry in
 * @dir.  @new_dentry is not consulted.
 */
static int selinux_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry)
{
        return may_link(dir, old_dentry, MAY_LINK);
}

/* Hook wrapper: returns may_link() with MAY_UNLINK for @dentry in @dir. */
static int selinux_inode_unlink(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_UNLINK);
}

/*
 * Hook wrapper: returns may_create() for @dentry in @dir with class
 * SECCLASS_LNK_FILE.  The link target @name is not consulted.
 */
static int selinux_inode_symlink(struct inode *dir, struct dentry *dentry, const char *name)
{
        return may_create(dir, dentry, SECCLASS_LNK_FILE);
}

/*
 * Hook wrapper: returns may_create() for @dentry in @dir with class
 * SECCLASS_DIR.  @mask is not consulted.
 */
static int selinux_inode_mkdir(struct inode *dir, struct dentry *dentry, umode_t mask)
{
        return may_create(dir, dentry, SECCLASS_DIR);
}

/* Hook wrapper: returns may_link() with MAY_RMDIR for @dentry in @dir. */
static int selinux_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
        return may_link(dir, dentry, MAY_RMDIR);
}

/*
 * Hook wrapper: returns may_create() for @dentry in @dir, with the
 * security class derived from @mode by inode_mode_to_security_class().
 * @dev is not consulted.
 */
static int selinux_inode_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
{
        return may_create(dir, dentry, inode_mode_to_security_class(mode));
}

/*
 * Hook wrapper: passes all four arguments straight through to
 * may_rename() and returns its result.
 */
static int selinux_inode_rename(struct inode *old_inode, struct dentry *old_dentry,
                                struct inode *new_inode, struct dentry *new_dentry)
{
        return may_rename(old_inode, old_dentry, new_inode, new_dentry);
}

/* Reading a symlink is checked as FILE__READ on @dentry. */
static int selinux_inode_readlink(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__READ);
}

/*
 * Check FILE__READ on the symlink @inode for the current task, with
 * @dentry recorded as the audit object.
 *
 * The inode label is fetched with inode_security_rcu(), passing @rcu
 * through; an error pointer from that lookup is returned as its errno.
 */
static int selinux_inode_follow_link(struct dentry *dentry, struct inode *inode,
                                     bool rcu)
{
        u32 sid = current_sid();
        struct inode_security_struct *isec = inode_security_rcu(inode, rcu);
        struct common_audit_data ad;

        if (IS_ERR(isec))
                return PTR_ERR(isec);

        ad.u.dentry = dentry;
        ad.type = LSM_AUDIT_DATA_DENTRY;

        return avc_has_perm(sid, isec->sid, isec->sclass, FILE__READ, &ad);
}

/*
 * Out-of-line audit path for selinux_inode_permission(): fills in
 * inode-typed audit data for @inode and forwards the decision details
 * to slow_avc_audit(), returning its result.
 */
static noinline int audit_inode_permission(struct inode *inode,
                                           u32 perms, u32 audited, u32 denied,
                                           int result)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        struct common_audit_data ad;

        ad.u.inode = inode;
        ad.type = LSM_AUDIT_DATA_INODE;

        return slow_avc_audit(current_sid(), isec->sid, isec->sclass, perms,
                              audited, denied, result, &ad);
}

/*
 * Check the access requested in @mask on @inode for the current task.
 *
 * Only the MAY_READ, MAY_WRITE, MAY_EXEC and MAY_APPEND bits are
 * translated into permissions; with none of them set the call is an
 * existence test and succeeds.  Private inodes are always allowed.
 * MAY_NOT_BLOCK is passed to the label lookup and MAY_ACCESS selects
 * the FILE__AUDIT_ACCESS audit variant.
 *
 * Returns 0 when access is granted, or a negative errno.  An error
 * from the audit step takes precedence over the access decision.
 */
static int selinux_inode_permission(struct inode *inode, int mask)
{
        const bool no_block = mask & MAY_NOT_BLOCK;
        const bool from_access = mask & MAY_ACCESS;
        const int wanted = mask & (MAY_READ|MAY_WRITE|MAY_EXEC|MAY_APPEND);
        struct inode_security_struct *isec;
        struct av_decision avd;
        u32 sid = current_sid();
        u32 perms, audited, denied;
        int rc, audit_rc;

        /* No permission to check.  Existence test. */
        if (!wanted)
                return 0;

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        perms = file_mask_to_av(inode->i_mode, wanted);

        isec = inode_security_rcu(inode, no_block);
        if (IS_ERR(isec))
                return PTR_ERR(isec);

        rc = avc_has_perm_noaudit(sid, isec->sid, isec->sclass, perms, 0,
                                  &avd);
        audited = avc_audit_required(perms, &avd, rc,
                                     from_access ? FILE__AUDIT_ACCESS : 0,
                                     &denied);
        if (likely(!audited))
                return rc;

        audit_rc = audit_inode_permission(inode, perms, audited, denied, rc);

        return audit_rc ? audit_rc : rc;
}

/*
 * Permission check for changing the attributes of @dentry as described
 * by @iattr.
 *
 * Changes to mode, ownership or explicitly set timestamps require
 * FILE__SETATTR.  Everything else requires FILE__WRITE, plus
 * FILE__OPEN for a size change made without ATTR_FILE on a non-sockfs
 * inode when the openperm policy capability is enabled.
 */
static int selinux_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                                 struct iattr *iattr)
{
        const struct cred *cred = current_cred();
        struct inode *inode = d_backing_inode(dentry);
        unsigned int valid = iattr->ia_valid;
        bool needs_open;

        /* ATTR_FORCE is just used for ATTR_KILL_S[UG]ID. */
        if (valid & ATTR_FORCE) {
                valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE |
                           ATTR_FORCE);
                if (valid == 0)
                        return 0;
        }

        if (valid & (ATTR_MODE | ATTR_UID | ATTR_GID |
                     ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_TIMES_SET))
                return dentry_has_perm(cred, dentry, FILE__SETATTR);

        needs_open = selinux_policycap_openperm() &&
                     inode->i_sb->s_magic != SOCKFS_MAGIC &&
                     (valid & ATTR_SIZE) &&
                     !(valid & ATTR_FILE);

        if (needs_open)
                return dentry_has_perm(cred, dentry,
                                       FILE__WRITE | FILE__OPEN);

        return dentry_has_perm(cred, dentry, FILE__WRITE);
}

/*
 * Hook wrapper: returns path_has_perm() for FILE__GETATTR on @path
 * with the current credentials.
 */
static int selinux_inode_getattr(const struct path *path)
{
        return path_has_perm(current_cred(), path, FILE__GETATTR);
}

/*
 * Report whether the current task holds CAP_MAC_ADMIN.
 *
 * Both cap_capable() (against init_user_ns) and cred_has_capability()
 * must return 0; the second is only consulted when the first passes.
 * @audit selects CAP_OPT_NONE, otherwise CAP_OPT_NOAUDIT is used.
 */
static bool has_cap_mac_admin(bool audit)
{
        const struct cred *cred = current_cred();
        unsigned int opts = CAP_OPT_NOAUDIT;

        if (audit)
                opts = CAP_OPT_NONE;

        return !cap_capable(cred, &init_user_ns, CAP_MAC_ADMIN, opts) &&
               !cred_has_capability(cred, CAP_MAC_ADMIN, opts, true);
}

/**
 * selinux_inode_xattr_skipcap - Skip the xattr capability checks?
 * @name: name of the xattr
 *
 * Returns 1 when @name is the SELinux xattr: SELinux then "owns" access
 * control for it and the LSM layer should not apply the traditional
 * capability based checks.  Returns 0 for every other name, leaving
 * further access controls, including capability based ones, to the LSM
 * layer.
 */
static int selinux_inode_xattr_skipcap(const char *name)
{
        /* only the SELinux xattr is exempt from the capability check */
        return strcmp(name, XATTR_NAME_SELINUX) == 0;
}

static int selinux_inode_setxattr(struct mnt_idmap *idmap,
                                  struct dentry *dentry, const char *name,
                                  const void *value, size_t size, int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        struct superblock_security_struct *sbsec;
        struct common_audit_data ad;
        u32 newsid, sid = current_sid();
        int rc = 0;

        /* if not a selinux xattr, only check the ordinary setattr perm */
        if (strcmp(name, XATTR_NAME_SELINUX))
                return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);

        if (!selinux_initialized())
                return (inode_owner_or_capable(idmap, inode) ? 0 : -EPERM);

        sbsec = selinux_superblock(inode->i_sb);
        if (!(sbsec->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (!inode_owner_or_capable(idmap, inode))
                return -EPERM;

        ad.type = LSM_AUDIT_DATA_DENTRY;
        ad.u.dentry = dentry;

        isec = backing_inode_security(dentry);
        rc = avc_has_perm(sid, isec->sid, isec->sclass,
                          FILE__RELABELFROM, &ad);
        if (rc)
                return rc;

        rc = security_context_to_sid(value, size, &newsid,
                                     GFP_KERNEL);
        if (rc == -EINVAL) {
                if (!has_cap_mac_admin(true)) {
                        struct audit_buffer *ab;
                        size_t audit_size;

                        /* We strip a nul only if it is at the end, otherwise the
                         * context contains a nul and we should audit that */
                        if (value) {
                                const char *str = value;

                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                        } else {
                                audit_size = 0;
                        }
                        ab = audit_log_start(audit_context(),
                                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
                        if (!ab)
                                return rc;
                        audit_log_format(ab, "op=setxattr invalid_context=");
                        audit_log_n_untrustedstring(ab, value, audit_size);
                        audit_log_end(ab);

                        return rc;
                }
                rc = security_context_to_sid_force(value,
                                                   size, &newsid);
        }
        if (rc)
                return rc;

        rc = avc_has_perm(sid, newsid, isec->sclass,
                          FILE__RELABELTO, &ad);
        if (rc)
                return rc;

        rc = security_validate_transition(isec->sid, newsid,
                                          sid, isec->sclass);
        if (rc)
                return rc;

        return avc_has_perm(newsid,
                            sbsec->sid,
                            SECCLASS_FILESYSTEM,
                            FILESYSTEM__ASSOCIATE,
                            &ad);
}

/*
 * Hook wrapper: returns dentry_has_perm() for FILE__SETATTR on @dentry
 * with the current credentials.  @idmap, @acl_name and @kacl are not
 * consulted.
 */
static int selinux_inode_set_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name,
                                 struct posix_acl *kacl)
{
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Hook wrapper: returns dentry_has_perm() for FILE__GETATTR on @dentry
 * with the current credentials.  @idmap and @acl_name are not
 * consulted.
 */
static int selinux_inode_get_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Hook wrapper: returns dentry_has_perm() for FILE__SETATTR on @dentry
 * with the current credentials.  @idmap and @acl_name are not
 * consulted.
 */
static int selinux_inode_remove_acl(struct mnt_idmap *idmap,
                                    struct dentry *dentry, const char *acl_name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);
}

/*
 * Update the in-core label of the inode behind @dentry after the
 * SELinux xattr has been set to @value (@size bytes).
 *
 * Other attribute names are ignored, and nothing is done before SELinux
 * is initialized.  If the value cannot be mapped to a SID the failure
 * is logged and the in-core label is left as it was.
 */
static void selinux_inode_post_setxattr(struct dentry *dentry, const char *name,
                                        const void *value, size_t size,
                                        int flags)
{
        struct inode *inode = d_backing_inode(dentry);
        struct inode_security_struct *isec;
        u32 newsid;
        int rc;

        if (strcmp(name, XATTR_NAME_SELINUX)) {
                /* Not an attribute we recognize, so nothing to do. */
                return;
        }

        if (!selinux_initialized()) {
                /* If we haven't even been initialized, then we can't validate
                 * against a policy, so leave the label as invalid. It may
                 * resolve to a valid label on the next revalidation try if
                 * we've since initialized.
                 */
                return;
        }

        rc = security_context_to_sid_force(value, size,
                                           &newsid);
        if (rc) {
                /*
                 * The two literals are concatenated, so the separating
                 * space has to live inside one of them.
                 */
                pr_err("SELinux:  unable to map context to SID "
                       "for (%s, %lu), rc=%d\n",
                       inode->i_sb->s_id, inode->i_ino, -rc);
                return;
        }

        isec = backing_inode_security(dentry);
        /* Class, SID and state are updated together under the lock. */
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = newsid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Reading any xattr of @dentry is checked as FILE__GETATTR; @name is
 * not consulted.
 */
static int selinux_inode_getxattr(struct dentry *dentry, const char *name)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/* Listing the xattrs of @dentry is checked as FILE__GETATTR. */
static int selinux_inode_listxattr(struct dentry *dentry)
{
        return dentry_has_perm(current_cred(), dentry, FILE__GETATTR);
}

/*
 * Permission check for removing the xattr @name from @dentry.
 *
 * Attributes other than the SELinux one only need FILE__SETATTR.  The
 * SELinux attribute may be removed while SELinux is not initialized
 * (returns 0); afterwards removal is always refused with -EACCES.
 */
static int selinux_inode_removexattr(struct mnt_idmap *idmap,
                                     struct dentry *dentry, const char *name)
{
        /* if not a selinux xattr, only check the ordinary setattr perm */
        if (strcmp(name, XATTR_NAME_SELINUX) != 0)
                return dentry_has_perm(current_cred(), dentry, FILE__SETATTR);

        /*
         * No one is allowed to remove a SELinux security label.
         * You can change the label, but all data must be labeled.
         */
        return selinux_initialized() ? -EACCES : 0;
}

/*
 * Permission check for placing an fsnotify mark of type @obj_type, for
 * the events in @mask, on @path.
 *
 * The base permission depends on the object type; superblock marks are
 * additionally checked for FILESYSTEM__WATCH.  Permission events add
 * FILE__WATCH_WITH_PERM and read-like events add FILE__WATCH_READS.
 *
 * Returns 0 when allowed, -EINVAL for an unknown @obj_type, or another
 * negative errno from the permission checks.
 */
static int selinux_path_notify(const struct path *path, u64 mask,
                                                unsigned int obj_type)
{
        struct common_audit_data ad;
        u32 av;
        int rc;

        ad.type = LSM_AUDIT_DATA_PATH;
        ad.u.path = *path;

        /* Pick the base permission from the type of mark being set. */
        switch (obj_type) {
        case FSNOTIFY_OBJ_TYPE_INODE:
                av = FILE__WATCH;
                break;
        case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
                av = FILE__WATCH_MOUNT;
                break;
        case FSNOTIFY_OBJ_TYPE_MNTNS:
                av = FILE__WATCH_MOUNTNS;
                break;
        case FSNOTIFY_OBJ_TYPE_SB:
                /* sb watches get an extra check on the filesystem itself */
                rc = superblock_has_perm(current_cred(), path->dentry->d_sb,
                                         FILESYSTEM__WATCH, &ad);
                if (rc)
                        return rc;
                av = FILE__WATCH_SB;
                break;
        default:
                return -EINVAL;
        }

        /* blocking watches require the file:watch_with_perm permission */
        if (mask & (ALL_FSNOTIFY_PERM_EVENTS))
                av |= FILE__WATCH_WITH_PERM;

        /* watches on read-like events need the file:watch_reads permission */
        if (mask & (FS_ACCESS | FS_ACCESS_PERM | FS_PRE_ACCESS |
                    FS_CLOSE_NOWRITE))
                av |= FILE__WATCH_READS;

        return path_has_perm(current_cred(), path, av);
}

/*
 * Produce the security context string of @inode.
 *
 * Only the SELinux suffix is handled, and only once SELinux is
 * initialized; otherwise -EOPNOTSUPP is returned so that vfs_getxattr
 * falls back to the on-disk xattr.  On success the context length is
 * returned; when @alloc is set the context is handed over through
 * @buffer, otherwise it is freed here.
 *
 * Permission check is handled by selinux_inode_getxattr hook.
 */
static int selinux_inode_getsecurity(struct mnt_idmap *idmap,
                                     struct inode *inode, const char *name,
                                     void **buffer, bool alloc)
{
        struct inode_security_struct *isec;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        if (!selinux_initialized() ||
            strcmp(name, XATTR_SELINUX_SUFFIX))
                return -EOPNOTSUPP;

        isec = inode_security(inode);

        /*
         * A caller with CAP_MAC_ADMIN gets the raw context value even
         * if the current policy does not define it; everyone else gets
         * the in-core value under the current policy.  The capability
         * is tested without auditing because getxattr is commonly
         * called by unprivileged processes, and lacking it only means
         * falling back to the in-core value, not a denial.
         */
        if (has_cap_mac_admin(false))
                rc = security_sid_to_context_force(isec->sid, &ctx,
                                                   &ctx_len);
        else
                rc = security_sid_to_context(isec->sid, &ctx, &ctx_len);
        if (rc)
                return rc;

        if (alloc)
                *buffer = ctx;
        else
                kfree(ctx);

        return ctx_len;
}

/*
 * Set the in-core label of @inode from the context string @value.
 *
 * Returns -EOPNOTSUPP for names other than the SELinux suffix or when
 * the superblock lacks SBLABEL_MNT, -EACCES for a missing or empty
 * value, the conversion error if the context cannot be mapped to a
 * SID, and 0 once the label has been updated.
 */
static int selinux_inode_setsecurity(struct inode *inode, const char *name,
                                     const void *value, size_t size, int flags)
{
        struct inode_security_struct *isec = inode_security_novalidate(inode);
        u32 sid;
        int err;

        if (strcmp(name, XATTR_SELINUX_SUFFIX) != 0 ||
            !(selinux_superblock(inode->i_sb)->flags & SBLABEL_MNT))
                return -EOPNOTSUPP;

        if (value == NULL || size == 0)
                return -EACCES;

        err = security_context_to_sid(value, size, &sid, GFP_KERNEL);
        if (err)
                return err;

        /* Class, SID and state are updated together under the lock. */
        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);

        return 0;
}

/*
 * Report the name of the SELinux xattr for @inode.
 *
 * Returns 0 while SELinux is not initialized.  Otherwise returns the
 * size of the name including its terminating NUL, and copies the name
 * into @buffer when one is supplied and @buffer_size is large enough.
 */
static int selinux_inode_listsecurity(struct inode *inode, char *buffer, size_t buffer_size)
{
        const int need = sizeof(XATTR_NAME_SELINUX);

        if (!selinux_initialized())
                return 0;

        if (buffer != NULL && need <= buffer_size)
                memcpy(buffer, XATTR_NAME_SELINUX, need);

        return need;
}

static void selinux_inode_getlsmprop(struct inode *inode, struct lsm_prop *prop)
{
        struct inode_security_struct *isec = inode_security_novalidate(inode);

        prop->selinux.secid = isec->sid;
}

static int selinux_inode_copy_up(struct dentry *src, struct cred **new)
{
        struct lsm_prop prop;
        struct task_security_struct *tsec;
        struct cred *new_creds = *new;

        if (new_creds == NULL) {
                new_creds = prepare_creds();
                if (!new_creds)
                        return -ENOMEM;
        }

        tsec = selinux_cred(new_creds);
        /* Get label from overlay inode and set it in create_sid */
        selinux_inode_getlsmprop(d_inode(src), &prop);
        tsec->create_sid = prop.selinux.secid;
        *new = new_creds;
        return 0;
}

/*
 * Decide whether the xattr @name is to be copied up.
 *
 * Returns -ECANCELED (discard) for the SELinux xattr once SELinux is
 * initialized, and -EOPNOTSUPP in every other case.
 */
static int selinux_inode_copy_up_xattr(struct dentry *dentry, const char *name)
{
        /*
         * Attributes other than the SELinux one are not claimed or
         * supported here; the same answer is given for all names while
         * SELinux is not yet initialized.
         */
        if (!selinux_initialized())
                return -EOPNOTSUPP;
        if (strcmp(name, XATTR_NAME_SELINUX) != 0)
                return -EOPNOTSUPP;

        /*
         * The copy_up hook already sets the initial context on the new
         * inode; blindly copying the lower xattr up would overwrite it,
         * so the SELinux xattr is filtered out.
         */
        return -ECANCELED; /* Discard */
}

/* kernfs node operations */

/*
 * Give the new kernfs node @kn a SELinux xattr derived from its parent
 * directory @kn_dir.
 *
 * If the parent has no SELinux xattr (-ENODATA) nothing is done and 0
 * is returned.  Otherwise the new label is the task's create_sid when
 * that is set, or else the result of security_transition_sid() for the
 * task SID, the parent's SID and the node's name.  The label is stored
 * on @kn with XATTR_CREATE.
 *
 * Returns 0 on success or a negative errno.
 */
static int selinux_kernfs_init_security(struct kernfs_node *kn_dir,
                                        struct kernfs_node *kn)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u32 dir_sid, sid, len;
        char *ctx;
        int rc;

        /* The first call only asks for the length of the parent's xattr. */
        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, NULL, 0);
        if (rc == -ENODATA)
                return 0;
        if (rc < 0)
                return rc;

        len = (u32)rc;
        ctx = kmalloc(len, GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        rc = kernfs_xattr_get(kn_dir, XATTR_NAME_SELINUX, ctx, len);
        if (rc >= 0)
                rc = security_context_to_sid(ctx, len, &dir_sid, GFP_KERNEL);
        kfree(ctx);
        if (rc)
                return rc;

        if (!tsec->create_sid) {
                u16 secclass = inode_mode_to_security_class(kn->mode);
                const char *kn_name;
                struct qstr q;

                /* kn is fresh and can't be renamed, so its name does not go away */
                kn_name = rcu_dereference_check(kn->name, true);
                q.name = kn_name;
                q.hash_len = hashlen_string(kn_dir, kn_name);

                rc = security_transition_sid(tsec->sid, dir_sid, secclass,
                                             &q, &sid);
                if (rc)
                        return rc;
        } else {
                sid = tsec->create_sid;
        }

        rc = security_sid_to_context_force(sid, &ctx, &len);
        if (rc)
                return rc;

        rc = kernfs_xattr_set(kn, XATTR_NAME_SELINUX, ctx, len,
                              XATTR_CREATE);
        kfree(ctx);

        return rc;
}


/* file security operations */

/*
 * Re-run the permission check for @file with the access in @mask,
 * using the current credentials.  A write through a file opened with
 * O_APPEND is checked as an append.
 */
static int selinux_revalidate_file_permission(struct file *file, int mask)
{
        struct inode *inode = file_inode(file);
        int effective = mask;

        /* file_mask_to_av won't add FILE__WRITE if MAY_APPEND is set */
        if ((mask & MAY_WRITE) && (file->f_flags & O_APPEND))
                effective |= MAY_APPEND;

        return file_has_perm(current_cred(), file,
                             file_mask_to_av(inode->i_mode, effective));
}

/*
 * Check the operations in @mask against an already-open file.
 *
 * An empty @mask is an existence test and is always allowed.  When the
 * current SID, the inode SID and the policy sequence number all still match
 * what is stored in the file security blob, nothing has changed since the
 * file_open check and 0 is returned without a new lookup.  Otherwise the
 * access is revalidated.
 */
static int selinux_file_permission(struct file *file, int mask)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        struct inode *inode = file_inode(file);
        u32 sid = current_sid();

        /* No permission to check.  Existence test. */
        if (mask == 0)
                return 0;

        isec = inode_security(inode);

        /* Any mismatch with the state saved at open time forces a recheck. */
        if (sid != fsec->sid || fsec->isid != isec->sid ||
            fsec->pseqno != avc_policy_seqno())
                return selinux_revalidate_file_permission(file, mask);

        return 0;
}

/*
 * Initialise the security blob of a new file: the current SID becomes both
 * the file SID and the initial file-owner SID.  Always returns 0.
 */
static int selinux_file_alloc_security(struct file *file)
{
        struct file_security_struct *blob;
        u32 creator = current_sid();

        blob = selinux_file(file);
        blob->fown_sid = creator;
        blob->sid = creator;

        return 0;
}

/*
 * Check whether the subject described by @cred may perform ioctl @cmd on
 * @file.
 *
 * FD__USE is checked first when the subject SID differs from the SID stored
 * in the file.  The extended-permission check against the inode then uses
 * the high byte of @cmd as the driver and the low byte as the command within
 * that driver.  Private inodes skip the inode check and yield 0.
 */
static int ioctl_has_perm(const struct cred *cred, struct file *file,
                u32 requested, u16 cmd)
{
        struct lsm_ioctlop_audit ioctl;
        struct common_audit_data ad;
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec;
        struct inode *inode = file_inode(file);
        u32 ssid = cred_sid(cred);
        u8 driver = cmd >> 8;
        u8 xperm = cmd & 0xff;
        int rc;

        /* Audit record describing the ioctl, shared by both checks. */
        ad.type = LSM_AUDIT_DATA_IOCTL_OP;
        ad.u.op = &ioctl;
        ioctl.cmd = cmd;
        ioctl.path = file->f_path;

        if (ssid != fsec->sid) {
                rc = avc_has_perm(ssid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

        if (unlikely(IS_PRIVATE(inode)))
                return 0;

        isec = inode_security(inode);
        return avc_has_extended_perms(ssid, isec->sid, isec->sclass, requested,
                                      driver, AVC_EXT_IOCTL, xperm, &ad);
}

/*
 * Map an ioctl command to the permission check it requires.
 *
 * A few well-known commands are checked as getattr, setattr, plain
 * descriptor use or a capability.  Everything else, including
 * FIOCLEX/FIONCLEX unless the ioctl_skip_cloexec policy capability is
 * enabled, goes through the per-command ioctl check.  @arg is not used.
 */
static int selinux_file_ioctl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case FIONREAD:
        case FIBMAP:
        case FIGETBSZ:
        case FS_IOC_GETFLAGS:
        case FS_IOC_GETVERSION:
                return file_has_perm(cred, file, FILE__GETATTR);

        case FS_IOC_SETFLAGS:
        case FS_IOC_SETVERSION:
                return file_has_perm(cred, file, FILE__SETATTR);

        /* sys_ioctl() checks */
        case FIONBIO:
        case FIOASYNC:
                return file_has_perm(cred, file, 0);

        case KDSKBENT:
        case KDSKBSENT:
                return cred_has_capability(cred, CAP_SYS_TTY_CONFIG,
                                           CAP_OPT_NONE, true);

        case FIOCLEX:
        case FIONCLEX:
                if (selinux_policycap_ioctl_skip_cloexec())
                        return 0;
                break;

        default:
                break;
        }

        /*
         * Generic case: assume that the command will go to the file's
         * ioctl() function.
         */
        return ioctl_has_perm(cred, file, FILE__IOCTL, (u16) cmd);
}

/*
 * Compat variant of selinux_file_ioctl().
 *
 * On a 64-bit kernel running 32-bit userspace the FS_IOC32_* command numbers
 * are translated to their FS_IOC_* counterparts first, so that 32-bit flags
 * are not compared to 64-bit flags.  Every other command is passed through
 * unchanged.
 */
static int selinux_file_ioctl_compat(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        unsigned int native = cmd;

        if (cmd == FS_IOC32_GETFLAGS)
                native = FS_IOC_GETFLAGS;
        else if (cmd == FS_IOC32_SETFLAGS)
                native = FS_IOC_SETFLAGS;
        else if (cmd == FS_IOC32_GETVERSION)
                native = FS_IOC_GETVERSION;
        else if (cmd == FS_IOC32_SETVERSION)
                native = FS_IOC_SETVERSION;

        return selinux_file_ioctl(file, native, arg);
}

/*
 * When non-zero, enables the extra EXECMEM check in file_map_prot_check()
 * and the EXECHEAP/EXECSTACK/EXECMOD checks in selinux_file_mprotect().
 */
static int default_noexec __ro_after_init;

/*
 * Check the permissions needed to map @file (NULL for an anonymous mapping)
 * with protection @prot; @shared is non-zero for a shared mapping.
 *
 * Returns 0 when every applicable check passes, otherwise the result of the
 * first failing check.
 */
static int file_map_prot_check(struct file *file, unsigned long prot, int shared)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);
        u32 av;
        int rc;

        /*
         * Making executable an anonymous mapping, a mapping of a private
         * inode, or a private file mapping that will also be writable
         * requires EXECMEM on top of the file checks.
         */
        if (default_noexec && (prot & PROT_EXEC) &&
            (!file || IS_PRIVATE(file_inode(file)) ||
             (!shared && (prot & PROT_WRITE)))) {
                rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
                                  PROCESS__EXECMEM, NULL);
                if (rc)
                        return rc;
        }

        /* No file, no file permissions to check. */
        if (!file)
                return 0;

        /* read access is always possible with a mapping */
        av = FILE__READ;

        /* write access only matters if the mapping is shared */
        if (shared && (prot & PROT_WRITE))
                av |= FILE__WRITE;

        if (prot & PROT_EXEC)
                av |= FILE__EXECUTE;

        return file_has_perm(cred, file, av);
}

/*
 * Addresses at or above CONFIG_LSM_MMAP_MIN_ADDR are always allowed.  Below
 * it, the current SID needs MEMPROTECT__MMAP_ZERO on itself.
 */
static int selinux_mmap_addr(unsigned long addr)
{
        u32 sid;

        if (addr >= CONFIG_LSM_MMAP_MIN_ADDR)
                return 0;

        sid = current_sid();
        return avc_has_perm(sid, sid, SECCLASS_MEMPROTECT,
                            MEMPROTECT__MMAP_ZERO, NULL);
}

/*
 * Check a new mapping of @file (NULL for an anonymous mapping).
 *
 * For a file-backed mapping FILE__MAP is checked on the inode first.  The
 * protection checks in file_map_prot_check() follow, with the mapping
 * treated as shared only when its MAP_TYPE is MAP_SHARED.  @reqprot is
 * unused.
 */
static int selinux_mmap_file(struct file *file,
                             unsigned long reqprot __always_unused,
                             unsigned long prot, unsigned long flags)
{
        int shared = (flags & MAP_TYPE) == MAP_SHARED;

        if (file) {
                struct common_audit_data ad;
                int rc;

                ad.type = LSM_AUDIT_DATA_FILE;
                ad.u.file = file;
                rc = inode_has_perm(current_cred(), file_inode(file),
                                    FILE__MAP, &ad);
                if (rc)
                        return rc;
        }

        return file_map_prot_check(file, prot, shared);
}

/*
 * Check a protection change on @vma.
 *
 * When default_noexec is set and a mapping that is not currently VM_EXEC is
 * being made executable, at most one extra check applies, chosen by what the
 * vma covers: EXECHEAP for a range inside [start_brk, brk], EXECSTACK for a
 * file-less stack vma, or EXECMOD for a file mapping that has an anon_vma.
 * The regular checks in file_map_prot_check() follow in every case.
 * @reqprot is unused.
 */
static int selinux_file_mprotect(struct vm_area_struct *vma,
                                 unsigned long reqprot __always_unused,
                                 unsigned long prot)
{
        const struct cred *cred = current_cred();
        u32 sid = cred_sid(cred);

        if (default_noexec &&
            (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) {
                /* Stays 0 when none of the three cases below matches. */
                int rc = 0;
                /*
                 * We don't use the vma_is_initial_heap() helper as it has
                 * a history of problems and is currently broken on systems
                 * where there is no heap, e.g. brk == start_brk.  Before
                 * replacing the conditional below with vma_is_initial_heap(),
                 * or something similar, please ensure that the logic is the
                 * same as what we have below or you have tested every possible
                 * corner case you can think to test.
                 */
                if (vma->vm_start >= vma->vm_mm->start_brk &&
                    vma->vm_end <= vma->vm_mm->brk) {
                        rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECHEAP, NULL);
                } else if (!vma->vm_file && (vma_is_initial_stack(vma) ||
                            vma_is_stack_for_current(vma))) {
                        rc = avc_has_perm(sid, sid, SECCLASS_PROCESS,
                                          PROCESS__EXECSTACK, NULL);
                } else if (vma->vm_file && vma->anon_vma) {
                        /*
                         * We are making executable a file mapping that has
                         * had some COW done. Since pages might have been
                         * written, check ability to execute the possibly
                         * modified content.  This typically should only
                         * occur for text relocations.
                         */
                        rc = file_has_perm(cred, vma->vm_file, FILE__EXECMOD);
                }
                if (rc)
                        return rc;
        }

        /* The third argument is non-zero exactly when VM_SHARED is set. */
        return file_map_prot_check(vma->vm_file, prot, vma->vm_flags&VM_SHARED);
}

/*
 * Check FILE__LOCK on @file for the current credentials, whatever the lock
 * operation; @cmd is not used.
 */
static int selinux_file_lock(struct file *file, unsigned int cmd)
{
        return file_has_perm(current_cred(), file, FILE__LOCK);
}

/*
 * Check an fcntl() command on @file.
 *
 * Clearing O_APPEND needs FILE__WRITE.  The other status and owner commands
 * only need use of the descriptor.  The record-locking commands need
 * FILE__LOCK.  Commands not listed here are allowed.
 */
static int selinux_file_fcntl(struct file *file, unsigned int cmd,
                              unsigned long arg)
{
        const struct cred *cred = current_cred();

        switch (cmd) {
        case F_SETFL:
                /* Dropping O_APPEND turns appends into plain writes. */
                if ((file->f_flags & O_APPEND) && !(arg & O_APPEND))
                        return file_has_perm(cred, file, FILE__WRITE);
                fallthrough;
        case F_SETOWN:
        case F_SETSIG:
        case F_GETFL:
        case F_GETOWN:
        case F_GETSIG:
        case F_GETOWNER_UIDS:
                /* Just check FD__USE permission */
                return file_has_perm(cred, file, 0);
        case F_GETLK:
        case F_SETLK:
        case F_SETLKW:
        case F_OFD_GETLK:
        case F_OFD_SETLK:
        case F_OFD_SETLKW:
#if BITS_PER_LONG == 32
        case F_GETLK64:
        case F_SETLK64:
        case F_SETLKW64:
#endif
                return file_has_perm(cred, file, FILE__LOCK);
        default:
                break;
        }

        return 0;
}

/* Record the current SID as the file-owner SID in @file's security blob. */
static void selinux_file_set_fowner(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);

        fsec->fown_sid = current_sid();
}

/*
 * Check whether the file-owner SID stored in the file behind @fown may send
 * signal @signum to task @tsk.  A @signum of 0 is checked as SIGIO.
 */
static int selinux_file_send_sigiotask(struct task_struct *tsk,
                                       struct fown_struct *fown, int signum)
{
        /* struct fown_struct is never outside the context of a struct file */
        struct file *file = fown->file;
        struct file_security_struct *fsec = selinux_file(file);
        u32 sid = task_sid_obj(tsk);
        /* a zero signum means SIGIO, as per send_sigio_to_task */
        u32 perm = signal_to_av(signum ? signum : SIGIO);

        return avc_has_perm(fsec->fown_sid, sid,
                            SECCLASS_PROCESS, perm, NULL);
}

/*
 * Check that the current credentials hold the permissions computed by
 * file_to_av() for @file.
 */
static int selinux_file_receive(struct file *file)
{
        return file_has_perm(current_cred(), file, file_to_av(file));
}

/*
 * Snapshot the inode label and policy sequence number into the file security
 * blob, then check the open-time permissions against file->f_cred.
 */
static int selinux_file_open(struct file *file)
{
        struct file_security_struct *fsec = selinux_file(file);
        struct inode_security_struct *isec = inode_security(file_inode(file));

        /*
         * selinux_file_permission() compares against these two values to
         * decide whether revalidation is necessary.  The task label is
         * already saved in the file security struct as its SID.
         */
        fsec->isid = isec->sid;
        fsec->pseqno = avc_policy_seqno();

        /*
         * The inode label or policy seqno may have changed between the
         * selinux_inode_permission check and the snapshot above, so recheck
         * that access is still permitted.  Otherwise access might never be
         * revalidated against the new inode label or new policy.
         * This check is not redundant - do not remove.
         */
        return file_path_has_perm(file->f_cred, file, open_file_to_av(file));
}

/* task security operations */

/*
 * Check that the current SID has PROCESS__FORK on itself.  Neither @task nor
 * @clone_flags is consulted.  Returns the avc_has_perm() result.
 */
static int selinux_task_alloc(struct task_struct *task,
                              unsigned long clone_flags)
{
        u32 sid = current_sid();

        return avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__FORK, NULL);
}

/*
 * Prepare a new set of credentials for modification: the SELinux state of
 * @new starts out as a copy of the state of @old.  @gfp is not used and the
 * function always returns 0.
 */
static int selinux_cred_prepare(struct cred *new, const struct cred *old,
                                gfp_t gfp)
{
        struct task_security_struct *dst = selinux_cred(new);
        const struct task_security_struct *src = selinux_cred(old);

        *dst = *src;

        return 0;
}

/*
 * Transfer the SELinux data to a blank set of creds by copying the whole
 * task security struct from @old to @new.
 */
static void selinux_cred_transfer(struct cred *new, const struct cred *old)
{
        struct task_security_struct *dst = selinux_cred(new);
        const struct task_security_struct *src = selinux_cred(old);

        *dst = *src;
}

/* Store the SID of credentials @c, as returned by cred_sid(), in @secid. */
static void selinux_cred_getsecid(const struct cred *c, u32 *secid)
{
        *secid = cred_sid(c);
}

/* Store the SID of credentials @c in the SELinux member of @prop. */
static void selinux_cred_getlsmprop(const struct cred *c, struct lsm_prop *prop)
{
        prop->selinux.secid = cred_sid(c);
}

/*
 * Set the security data for a kernel service.
 *
 * The current SID must hold KERNEL_SERVICE__USE_AS_OVERRIDE on @secid.  On
 * success @new takes @secid as its SID and all the creation contexts (file,
 * key, socket) are set to unlabelled.  On failure @new is left untouched and
 * the avc_has_perm() result is returned.
 */
static int selinux_kernel_act_as(struct cred *new, u32 secid)
{
        struct task_security_struct *tsec = selinux_cred(new);
        u32 sid = current_sid();
        int ret;

        ret = avc_has_perm(sid, secid, SECCLASS_KERNEL_SERVICE,
                           KERNEL_SERVICE__USE_AS_OVERRIDE, NULL);
        if (ret)
                return ret;

        tsec->sid = secid;
        tsec->create_sid = 0;
        tsec->keycreate_sid = 0;
        tsec->sockcreate_sid = 0;

        return 0;
}

/*
 * Set the file creation context in a security record to the same as the
 * objective context of the specified inode.
 *
 * The current SID must hold KERNEL_SERVICE__CREATE_FILES_AS on the inode's
 * SID.  On failure @new is left untouched and the avc_has_perm() result is
 * returned.
 */
static int selinux_kernel_create_files_as(struct cred *new, struct inode *inode)
{
        struct inode_security_struct *isec = inode_security(inode);
        struct task_security_struct *tsec = selinux_cred(new);
        u32 sid = current_sid();
        int ret;

        ret = avc_has_perm(sid, isec->sid, SECCLASS_KERNEL_SERVICE,
                           KERNEL_SERVICE__CREATE_FILES_AS, NULL);
        if (ret)
                return ret;

        tsec->create_sid = isec->sid;

        return 0;
}

/*
 * Check that the current SID has SYSTEM__MODULE_REQUEST on the kernel
 * initial SID.  @kmod_name is only attached to the audit data.  Returns the
 * avc_has_perm() result.
 */
static int selinux_kernel_module_request(char *kmod_name)
{
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_KMOD;
        ad.u.kmod_name = kmod_name;

        return avc_has_perm(current_sid(), SECINITSID_KERNEL, SECCLASS_SYSTEM,
                            SYSTEM__MODULE_REQUEST, &ad);
}

/*
 * Check the SECCLASS_SYSTEM permission @requested for a kernel load.
 *
 * Without a file, the current SID is checked against itself and no audit
 * data is attached.  With a file, FD__USE is checked first when the current
 * SID differs from the SID stored in the file, and @requested is then
 * checked against the SID of the file's inode.
 */
static int selinux_kernel_load_from_file(struct file *file, u32 requested)
{
        struct common_audit_data ad;
        struct file_security_struct *fsec;
        struct inode_security_struct *isec;
        u32 sid = current_sid();
        int rc;

        if (!file)
                return avc_has_perm(sid, sid, SECCLASS_SYSTEM, requested, NULL);

        fsec = selinux_file(file);

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = file;

        if (sid != fsec->sid) {
                rc = avc_has_perm(sid, fsec->sid, SECCLASS_FD, FD__USE, &ad);
                if (rc)
                        return rc;
        }

        isec = inode_security(file_inode(file));
        return avc_has_perm(sid, isec->sid, SECCLASS_SYSTEM, requested, &ad);
}

/*
 * Map a kernel_read_file_id to the matching SECCLASS_SYSTEM load permission
 * and check it against @file.  Ids without a mapping are allowed.
 * @contents is not used.
 */
static int selinux_kernel_read_file(struct file *file,
                                    enum kernel_read_file_id id,
                                    bool contents)
{
        /* Break the build when a new id appears, so the switch gets updated. */
        BUILD_BUG_ON_MSG(READING_MAX_ID > 7,
                         "New kernel_read_file_id introduced; update SELinux!");

        switch (id) {
        case READING_FIRMWARE:
                return selinux_kernel_load_from_file(file,
                                                SYSTEM__FIRMWARE_LOAD);
        case READING_MODULE:
                return selinux_kernel_load_from_file(file,
                                                SYSTEM__MODULE_LOAD);
        case READING_KEXEC_IMAGE:
                return selinux_kernel_load_from_file(file,
                                                SYSTEM__KEXEC_IMAGE_LOAD);
        case READING_KEXEC_INITRAMFS:
                return selinux_kernel_load_from_file(file,
                                                SYSTEM__KEXEC_INITRAMFS_LOAD);
        case READING_POLICY:
                return selinux_kernel_load_from_file(file,
                                                SYSTEM__POLICY_LOAD);
        case READING_X509_CERTIFICATE:
                return selinux_kernel_load_from_file(file,
                                                SYSTEM__X509_CERTIFICATE_LOAD);
        default:
                break;
        }

        return 0;
}

/*
 * Map a kernel_load_data_id to the matching SECCLASS_SYSTEM load permission
 * and check it with no backing file.  Ids without a mapping are allowed.
 * @contents is not used.
 */
static int selinux_kernel_load_data(enum kernel_load_data_id id, bool contents)
{
        /* Break the build when a new id appears, so the switch gets updated. */
        BUILD_BUG_ON_MSG(LOADING_MAX_ID > 7,
                         "New kernel_load_data_id introduced; update SELinux!");

        switch (id) {
        case LOADING_FIRMWARE:
                return selinux_kernel_load_from_file(NULL,
                                                SYSTEM__FIRMWARE_LOAD);
        case LOADING_MODULE:
                return selinux_kernel_load_from_file(NULL,
                                                SYSTEM__MODULE_LOAD);
        case LOADING_KEXEC_IMAGE:
                return selinux_kernel_load_from_file(NULL,
                                                SYSTEM__KEXEC_IMAGE_LOAD);
        case LOADING_KEXEC_INITRAMFS:
                return selinux_kernel_load_from_file(NULL,
                                                SYSTEM__KEXEC_INITRAMFS_LOAD);
        case LOADING_POLICY:
                return selinux_kernel_load_from_file(NULL,
                                                SYSTEM__POLICY_LOAD);
        case LOADING_X509_CERTIFICATE:
                return selinux_kernel_load_from_file(NULL,
                                                SYSTEM__X509_CERTIFICATE_LOAD);
        default:
                break;
        }

        return 0;
}

/*
 * Check that the current SID has PROCESS__SETPGID on the object SID of task
 * @p.  @pgid is not used.  Returns the avc_has_perm() result.
 */
static int selinux_task_setpgid(struct task_struct *p, pid_t pgid)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETPGID, NULL);
}

/*
 * Check that the current SID has PROCESS__GETPGID on the object SID of task
 * @p.  Returns the avc_has_perm() result.
 */
static int selinux_task_getpgid(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETPGID, NULL);
}

/*
 * Check that the current SID has PROCESS__GETSESSION on the object SID of
 * task @p.  Returns the avc_has_perm() result.
 */
static int selinux_task_getsid(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETSESSION, NULL);
}

/* Store the current SID in the SELinux member of @prop. */
static void selinux_current_getlsmprop_subj(struct lsm_prop *prop)
{
        prop->selinux.secid = current_sid();
}

/* Store the object SID of task @p in the SELinux member of @prop. */
static void selinux_task_getlsmprop_obj(struct task_struct *p,
                                        struct lsm_prop *prop)
{
        prop->selinux.secid = task_sid_obj(p);
}

/*
 * Check that the current SID has PROCESS__SETSCHED on the object SID of task
 * @p.  @nice is not used.  Returns the avc_has_perm() result.
 */
static int selinux_task_setnice(struct task_struct *p, int nice)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * Check that the current SID has PROCESS__SETSCHED on the object SID of task
 * @p.  @ioprio is not used.  Returns the avc_has_perm() result.
 */
static int selinux_task_setioprio(struct task_struct *p, int ioprio)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * Check that the current SID has PROCESS__GETSCHED on the object SID of task
 * @p.  Returns the avc_has_perm() result.
 */
static int selinux_task_getioprio(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

/*
 * Check a prlimit operation by @cred on @tcred.
 *
 * Empty @flags is allowed outright.  Otherwise LSM_PRLIMIT_READ requires
 * PROCESS__GETRLIMIT and LSM_PRLIMIT_WRITE requires PROCESS__SETRLIMIT, both
 * from the SID of @cred to the SID of @tcred.
 */
static int selinux_task_prlimit(const struct cred *cred, const struct cred *tcred,
                                unsigned int flags)
{
        u32 av = 0;

        if (flags == 0)
                return 0;

        if (flags & LSM_PRLIMIT_READ)
                av |= PROCESS__GETRLIMIT;
        if (flags & LSM_PRLIMIT_WRITE)
                av |= PROCESS__SETRLIMIT;

        return avc_has_perm(cred_sid(cred), cred_sid(tcred),
                            SECCLASS_PROCESS, av, NULL);
}

/*
 * Check a change of resource limit @resource of task @p.  Only a change of
 * the hard limit is checked; anything else is allowed.
 */
static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
                struct rlimit *new_rlim)
{
        struct rlimit *cur = &p->signal->rlim[resource];

        /* Hard limit untouched: nothing to control. */
        if (cur->rlim_max == new_rlim->rlim_max)
                return 0;

        /*
         * Control the ability to change the hard limit (whether lowering or
         * raising it), so that the hard limit can later be used as a safe
         * reset point for the soft limit upon context transitions.  See
         * selinux_bprm_committing_creds.
         */
        return avc_has_perm(current_sid(), task_sid_obj(p),
                            SECCLASS_PROCESS, PROCESS__SETRLIMIT, NULL);
}

/*
 * Check that the current SID has PROCESS__SETSCHED on the object SID of task
 * @p.  Returns the avc_has_perm() result.
 */
static int selinux_task_setscheduler(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * Check that the current SID has PROCESS__GETSCHED on the object SID of task
 * @p.  Returns the avc_has_perm() result.
 */
static int selinux_task_getscheduler(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__GETSCHED, NULL);
}

/*
 * Check that the current SID has PROCESS__SETSCHED on the object SID of task
 * @p; this is the same permission the scheduling setters in this file check.
 * Returns the avc_has_perm() result.
 */
static int selinux_task_movememory(struct task_struct *p)
{
        return avc_has_perm(current_sid(), task_sid_obj(p), SECCLASS_PROCESS,
                            PROCESS__SETSCHED, NULL);
}

/*
 * Check whether signal @sig may be sent to task @p.
 *
 * A @sig of 0 is the null signal (existence test) and maps to
 * PROCESS__SIGNULL.  The sender is @cred when given, otherwise the current
 * task.  @info is not used.
 */
static int selinux_task_kill(struct task_struct *p, struct kernel_siginfo *info,
                                int sig, const struct cred *cred)
{
        u32 perm = sig ? signal_to_av(sig) : PROCESS__SIGNULL;
        u32 secid = cred ? cred_sid(cred) : current_sid();

        return avc_has_perm(secid, task_sid_obj(p), SECCLASS_PROCESS, perm, NULL);
}

/*
 * Label @inode after task @p: under isec->lock, set the security class from
 * the inode mode and the SID to the task's object SID, then mark the label
 * as LABEL_INITIALIZED.
 */
static void selinux_task_to_inode(struct task_struct *p,
                                  struct inode *inode)
{
        struct inode_security_struct *isec = selinux_inode(inode);
        u32 sid = task_sid_obj(p);

        spin_lock(&isec->lock);
        isec->sclass = inode_mode_to_security_class(inode->i_mode);
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;
        spin_unlock(&isec->lock);
}

/*
 * Check that the current SID has USER_NAMESPACE__CREATE on itself.  @cred is
 * not consulted.  Returns the avc_has_perm() result.
 */
static int selinux_userns_create(const struct cred *cred)
{
        u32 sid = current_sid();

        return avc_has_perm(sid, sid, SECCLASS_USER_NAMESPACE,
                        USER_NAMESPACE__CREATE, NULL);
}

/*
 * Record the IPv4 addresses of @skb in @ad->u.net and, for TCP, UDP, DCCP
 * and (when enabled) SCTP, the transport ports as well.  The IP protocol
 * number is stored in @proto when it is non-NULL.
 *
 * Returns error only if unable to parse addresses: -EINVAL when the IP
 * header cannot be read or its length field is too small.  Failing to read
 * the ports is not an error.
 */
static int selinux_parse_skb_ipv4(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct iphdr _iph, *ih;
        int offset, ihlen;

        offset = skb_network_offset(skb);
        ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
        if (!ih)
                return -EINVAL;

        ihlen = ih->ihl * 4;
        if (ihlen < sizeof(_iph))
                return -EINVAL;

        ad->u.net->v4info.saddr = ih->saddr;
        ad->u.net->v4info.daddr = ih->daddr;

        if (proto)
                *proto = ih->protocol;

        /*
         * The addresses are recorded, so everything below is best effort.
         * Ports are skipped when the IP_OFFSET bits of frag_off are set or
         * when the transport header cannot be read.
         */
        switch (ih->protocol) {
        case IPPROTO_TCP: {
                struct tcphdr _tcph, *th;

                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;

                offset += ihlen;
                th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
                if (th) {
                        ad->u.net->sport = th->source;
                        ad->u.net->dport = th->dest;
                }
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr _udph, *uh;

                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;

                offset += ihlen;
                uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
                if (uh) {
                        ad->u.net->sport = uh->source;
                        ad->u.net->dport = uh->dest;
                }
                break;
        }

        case IPPROTO_DCCP: {
                struct dccp_hdr _dccph, *dh;

                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;

                offset += ihlen;
                dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph);
                if (dh) {
                        ad->u.net->sport = dh->dccph_sport;
                        ad->u.net->dport = dh->dccph_dport;
                }
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr _sctph, *sh;

                if (ntohs(ih->frag_off) & IP_OFFSET)
                        break;

                offset += ihlen;
                sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph);
                if (sh) {
                        ad->u.net->sport = sh->source;
                        ad->u.net->dport = sh->dest;
                }
                break;
        }
#endif
        default:
                break;
        }

        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)

/*
 * Record the IPv6 addresses of @skb in @ad->u.net and, for TCP, UDP, DCCP
 * and (when enabled) SCTP, the transport ports as well.  The protocol found
 * by ipv6_skip_exthdr() is stored in @proto when it is non-NULL.
 *
 * Returns error only if unable to parse addresses: -EINVAL when the IPv6
 * header cannot be read.  A failure of ipv6_skip_exthdr() or of reading the
 * ports is not an error, but then @proto and/or the ports are left unset.
 */
static int selinux_parse_skb_ipv6(struct sk_buff *skb,
                        struct common_audit_data *ad, u8 *proto)
{
        struct ipv6hdr _ipv6h, *ip6;
        __be16 frag_off;
        u8 nexthdr;
        int offset;

        offset = skb_network_offset(skb);
        ip6 = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
        if (!ip6)
                return -EINVAL;

        ad->u.net->v6info.saddr = ip6->saddr;
        ad->u.net->v6info.daddr = ip6->daddr;

        /* The addresses are recorded, so everything below is best effort. */
        nexthdr = ip6->nexthdr;
        offset += sizeof(_ipv6h);
        offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
        if (offset < 0)
                return 0;

        if (proto)
                *proto = nexthdr;

        switch (nexthdr) {
        case IPPROTO_TCP: {
                struct tcphdr _tcph, *th;

                th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
                if (th) {
                        ad->u.net->sport = th->source;
                        ad->u.net->dport = th->dest;
                }
                break;
        }

        case IPPROTO_UDP: {
                struct udphdr _udph, *uh;

                uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
                if (uh) {
                        ad->u.net->sport = uh->source;
                        ad->u.net->dport = uh->dest;
                }
                break;
        }

        case IPPROTO_DCCP: {
                struct dccp_hdr _dccph, *dh;

                dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph);
                if (dh) {
                        ad->u.net->sport = dh->dccph_sport;
                        ad->u.net->dport = dh->dccph_dport;
                }
                break;
        }

#if IS_ENABLED(CONFIG_IP_SCTP)
        case IPPROTO_SCTP: {
                struct sctphdr _sctph, *sh;

                sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph);
                if (sh) {
                        ad->u.net->sport = sh->source;
                        ad->u.net->dport = sh->dest;
                }
                break;
        }
#endif
        /* includes fragments */
        default:
                break;
        }

        return 0;
}

#endif /* IPV6 */

/*
 * Parse @skb into the network audit data of @ad according to
 * @ad->u.net->family.
 *
 * When @_addrp is non-NULL it receives a pointer to the source address
 * (@src non-zero) or destination address (@src zero) stored in @ad, or NULL
 * for a family that is not parsed here.  Returns 0 on success; a parse
 * failure is logged and the parser's error is returned with @_addrp
 * untouched.
 */
static int selinux_parse_skb(struct sk_buff *skb, struct common_audit_data *ad,
                             char **_addrp, int src, u8 *proto)
{
        char *addrp = NULL;
        int ret = 0;

        switch (ad->u.net->family) {
        case PF_INET:
                ret = selinux_parse_skb_ipv4(skb, ad, proto);
                if (!ret)
                        addrp = (char *)(src ? &ad->u.net->v4info.saddr :
                                               &ad->u.net->v4info.daddr);
                break;

#if IS_ENABLED(CONFIG_IPV6)
        case PF_INET6:
                ret = selinux_parse_skb_ipv6(skb, ad, proto);
                if (!ret)
                        addrp = (char *)(src ? &ad->u.net->v6info.saddr :
                                               &ad->u.net->v6info.daddr);
                break;
#endif        /* IPV6 */
        default:
                break;
        }

        if (ret) {
                pr_warn(
                       "SELinux: failure in selinux_parse_skb(),"
                       " unable to parse packet\n");
                return ret;
        }

        if (_addrp)
                *_addrp = addrp;
        return 0;
}

/**
 * selinux_skb_peerlbl_sid - Determine the peer label of a packet
 * @skb: the packet
 * @family: protocol family
 * @sid: the packet's peer label SID
 *
 * Description:
 * Check the various different forms of network peer labeling and determine
 * the peer label/SID for the packet.  The xfrm and NetLabel SIDs are looked
 * up first and then combined by the security server function
 * security_net_peersid_resolve().  The function returns zero if the value in
 * @sid is valid (although it may be SECSID_NULL) or -EACCES if either lookup
 * fails or the peer labels cannot be resolved.
 *
 */
static int selinux_skb_peerlbl_sid(struct sk_buff *skb, u16 family, u32 *sid)
{
        u32 xfrm_sid;
        u32 nlbl_sid;
        u32 nlbl_type;

        if (unlikely(selinux_xfrm_skb_sid(skb, &xfrm_sid)))
                return -EACCES;

        if (unlikely(selinux_netlbl_skbuff_getsid(skb, family, &nlbl_type,
                                                  &nlbl_sid)))
                return -EACCES;

        if (unlikely(security_net_peersid_resolve(nlbl_sid, nlbl_type,
                                                  xfrm_sid, sid))) {
                pr_warn(
                       "SELinux: failure in selinux_skb_peerlbl_sid(),"
                       " unable to determine packet's peer label\n");
                return -EACCES;
        }

        return 0;
}

/**
 * selinux_conn_sid - Determine the child socket label for a connection
 * @sk_sid: the parent socket's SID
 * @skb_sid: the packet's SID
 * @conn_sid: the resulting connection SID
 *
 * If @skb_sid is not valid (SECSID_NULL) then @conn_sid is simply a copy of
 * @sk_sid.  Otherwise the user:role:type information from @sk_sid is
 * combined with the MLS information from @skb_sid in order to create
 * @conn_sid.  Returns zero on success, negative values on failure.
 *
 */
static int selinux_conn_sid(u32 sk_sid, u32 skb_sid, u32 *conn_sid)
{
        if (skb_sid == SECSID_NULL) {
                *conn_sid = sk_sid;
                return 0;
        }

        return security_sid_mls_copy(sk_sid, skb_sid, conn_sid);
}

/* socket security operations */

/*
 * Pick the SID for a new socket of class @secclass.  An explicit
 * sockcreate_sid in @tsec wins.  Otherwise the SID is computed by a
 * transition with the task SID as both source and target.
 */
static int socket_sockcreate_sid(const struct task_security_struct *tsec,
                                 u16 secclass, u32 *socksid)
{
        if (tsec->sockcreate_sid <= SECSID_NULL)
                return security_transition_sid(tsec->sid, tsec->sid,
                                               secclass, NULL, socksid);

        *socksid = tsec->sockcreate_sid;
        return 0;
}

/*
 * Report whether permission checks should be bypassed for a socket
 * labeled with @sid.
 */
static bool sock_skip_has_perm(u32 sid)
{
        bool legacy_init_skip;

        /* Sockets carrying the kernel SID are never checked. */
        if (sid == SECINITSID_KERNEL)
                return true;

        /*
         * Before POLICYDB_CAP_USERSPACE_INITIAL_CONTEXT, sockets that
         * inherited the kernel context from early boot used to be skipped
         * here, so that behavior is kept unless the capability is set.
         *
         * A policy that sets the capability signals it is ready for this
         * quirk to be fixed.  Sockets created by a kernel thread or by a
         * usermode helper executed without a transition are still skipped
         * by the check above, regardless of the policycap setting.
         */
        legacy_init_skip = !selinux_policycap_userspace_initial_context() &&
                           sid == SECINITSID_INIT;

        return legacy_init_skip;
}


/*
 * Check whether the current task holds @perms on the socket @sk.
 *
 * Sockets whose SID is exempt according to sock_skip_has_perm() are
 * allowed without consulting the AVC.  Otherwise the check is made between
 * current_sid() and the socket's SID and class, with audit data
 * initialised from @sk.  Returns zero when the access is granted or
 * skipped, otherwise the avc_has_perm() error.
 */
static int sock_has_perm(struct sock *sk, u32 perms)
{
        /*
         * Fetch the SELinux socket state through the selinux_sock()
         * accessor, as the other socket hooks in this file do.  The raw
         * sk->sk_security pointer is not guaranteed to point at the
         * SELinux structure when the sock security blob is shared
         * between LSMs.
         */
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct common_audit_data ad;
        struct lsm_network_audit net;

        if (sock_skip_has_perm(sksec->sid))
                return 0;

        ad_net_init_from_sk(&ad, &net, sk);

        return avc_has_perm(current_sid(), sksec->sid, sksec->sclass, perms,
                            &ad);
}

/*
 * Decide whether the current task may create a socket of the given
 * @family/@type/@protocol.
 *
 * Kernel-internal sockets (@kern non-zero) are always allowed.  For the
 * rest, the SID the socket would receive is computed and SOCKET__CREATE is
 * checked between the task SID and that SID.  Returns zero when allowed,
 * otherwise the error from the SID computation or the AVC.
 */
static int selinux_socket_create(int family, int type,
                                 int protocol, int kern)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        u16 sclass;
        u32 sock_sid;
        int err;

        if (kern)
                return 0;

        sclass = socket_type_to_security_class(family, type, protocol);
        err = socket_sockcreate_sid(tsec, sclass, &sock_sid);
        if (!err)
                err = avc_has_perm(tsec->sid, sock_sid, sclass,
                                   SOCKET__CREATE, NULL);

        return err;
}

/*
 * Label a freshly created socket.
 *
 * Kernel sockets (@kern non-zero) receive SECINITSID_KERNEL; all others
 * receive the SID computed by socket_sockcreate_sid().  The class and SID
 * are written to the socket inode's security state and, when a struct sock
 * is attached, to its SELinux state as well, after which NetLabel
 * post-create processing runs.  Returns zero on success or the error from
 * the SID computation or from NetLabel.
 */
static int selinux_socket_post_create(struct socket *sock, int family,
                                      int type, int protocol, int kern)
{
        const struct task_security_struct *tsec = selinux_cred(current_cred());
        struct inode_security_struct *isec = inode_security_novalidate(SOCK_INODE(sock));
        u16 sclass = socket_type_to_security_class(family, type, protocol);
        struct sk_security_struct *sksec;
        u32 sid = SECINITSID_KERNEL;

        if (!kern) {
                int rc = socket_sockcreate_sid(tsec, sclass, &sid);

                if (rc)
                        return rc;
        }

        isec->sclass = sclass;
        isec->sid = sid;
        isec->initialized = LABEL_INITIALIZED;

        /* Nothing more to label when no struct sock is attached. */
        if (!sock->sk)
                return 0;

        sksec = selinux_sock(sock->sk);
        sksec->sclass = sclass;
        sksec->sid = sid;

        /* Allows detection of the first association on this socket */
        if (sclass == SECCLASS_SCTP_SOCKET)
                sksec->sctp_assoc_state = SCTP_ASSOC_UNSET;

        return selinux_netlbl_socket_post_create(sock->sk, family);
}

/*
 * Cross-link the two ends of a socket pair: each end records the other
 * end's SID as its peer SID.  Always returns zero.
 */
static int selinux_socket_socketpair(struct socket *socka,
                                     struct socket *sockb)
{
        struct sk_security_struct *first = selinux_sock(socka->sk);
        struct sk_security_struct *second = selinux_sock(sockb->sk);

        first->peer_sid = second->sid;
        second->peer_sid = first->sid;

        return 0;
}

/* Range of port numbers used to automatically bind.
   Need to determine whether we should perform a name_bind
   permission check between the socket and the port number. */

/*
 * Permission checks for binding @sock to @address.
 *
 * SOCKET__BIND is always checked via sock_has_perm().  For PF_INET and
 * PF_INET6 sockets the address is then validated (length and family) and
 * two further checks are made against the socket's own SID:
 *   - SOCKET__NAME_BIND against the port's SID, only when the port is
 *     non-zero and either requires the bind service or lies outside the
 *     local port range;
 *   - the class-specific NODE_BIND permission against the SID of the
 *     address being bound.
 *
 * Returns zero when allowed, -EINVAL for a too-short address, the result
 * of the err_af path for an unsupported address family, or the error from
 * a SID lookup or AVC check.
 */
static int selinux_socket_bind(struct socket *sock, struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family;
        int err;

        err = sock_has_perm(sk, SOCKET__BIND);
        if (err)
                goto out;

        /* If PF_INET or PF_INET6, check name_bind permission for the port. */
        family = sk->sk_family;
        if (family == PF_INET || family == PF_INET6) {
                char *addrp;
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                u16 family_sa;
                unsigned short snum;
                u32 sid, node_perm;

                /*
                 * sctp_bindx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple binding addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                /* The family field must be present before it is read. */
                if (addrlen < offsetofend(struct sockaddr, sa_family))
                        return -EINVAL;
                family_sa = address->sa_family;
                switch (family_sa) {
                case AF_UNSPEC:
                case AF_INET:
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        addr4 = (struct sockaddr_in *)address;
                        if (family_sa == AF_UNSPEC) {
                                if (family == PF_INET6) {
                                        /* Length check from inet6_bind_sk() */
                                        if (addrlen < SIN6_LEN_RFC2133)
                                                return -EINVAL;
                                        /* Family check from __inet6_bind() */
                                        goto err_af;
                                }
                                /* see __inet_bind(), we only want to allow
                                 * AF_UNSPEC if the address is INADDR_ANY
                                 */
                                if (addr4->sin_addr.s_addr != htonl(INADDR_ANY))
                                        goto err_af;
                                /* From here on treat it as a plain IPv4 bind. */
                                family_sa = AF_INET;
                        }
                        snum = ntohs(addr4->sin_port);
                        addrp = (char *)&addr4->sin_addr.s_addr;
                        break;
                case AF_INET6:
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        addr6 = (struct sockaddr_in6 *)address;
                        snum = ntohs(addr6->sin6_port);
                        addrp = (char *)&addr6->sin6_addr.s6_addr;
                        break;
                default:
                        goto err_af;
                }

                /* Audit data shared by the port and node checks below. */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->sport = htons(snum);
                ad.u.net->family = family_sa;

                /* A zero port skips the name_bind check entirely. */
                if (snum) {
                        int low, high;

                        inet_get_local_port_range(sock_net(sk), &low, &high);

                        /*
                         * Only ports that require the bind service or fall
                         * outside the local port range are checked.
                         */
                        if (inet_port_requires_bind_service(sock_net(sk), snum) ||
                            snum < low || snum > high) {
                                err = sel_netport_sid(sk->sk_protocol,
                                                      snum, &sid);
                                if (err)
                                        goto out;
                                err = avc_has_perm(sksec->sid, sid,
                                                   sksec->sclass,
                                                   SOCKET__NAME_BIND, &ad);
                                if (err)
                                        goto out;
                        }
                }

                /* Select the node_bind permission matching the socket class. */
                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        node_perm = TCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_UDP_SOCKET:
                        node_perm = UDP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_DCCP_SOCKET:
                        node_perm = DCCP_SOCKET__NODE_BIND;
                        break;

                case SECCLASS_SCTP_SOCKET:
                        node_perm = SCTP_SOCKET__NODE_BIND;
                        break;

                default:
                        node_perm = RAWIP_SOCKET__NODE_BIND;
                        break;
                }

                err = sel_netnode_sid(addrp, family_sa, &sid);
                if (err)
                        goto out;

                /* Record the bound address in the audit data. */
                if (family_sa == AF_INET)
                        ad.u.net->v4info.saddr = addr4->sin_addr.s_addr;
                else
                        ad.u.net->v6info.saddr = addr6->sin6_addr;

                err = avc_has_perm(sksec->sid, sid,
                                   sksec->sclass, node_perm, &ad);
                if (err)
                        goto out;
        }
out:
        return err;
err_af:
        /* Note that SCTP services expect -EINVAL, others -EAFNOSUPPORT. */
        if (sk->sk_protocol == IPPROTO_SCTP)
                return -EINVAL;
        return -EAFNOSUPPORT;
}

/* This supports connect(2) and SCTP connect services such as sctp_connectx(3)
 * and sctp_sendmsg(3) as described in Documentation/security/SCTP.rst
 *
 * SOCKET__CONNECT is checked first via sock_has_perm().  An AF_UNSPEC
 * address is then accepted without further checks.  For TCP, DCCP and SCTP
 * socket classes the destination port's SID is looked up and the
 * class-specific NAME_CONNECT permission is checked against it.
 *
 * Returns zero when allowed, -EINVAL for a too-short address,
 * -EINVAL/-EAFNOSUPPORT for an unsupported address family, or the error
 * from the port SID lookup or an AVC check.
 */
static int selinux_socket_connect_helper(struct socket *sock,
                                         struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        struct sk_security_struct *sksec = selinux_sock(sk);
        int err;

        err = sock_has_perm(sk, SOCKET__CONNECT);
        if (err)
                return err;
        /* The family field must be present before it is read. */
        if (addrlen < offsetofend(struct sockaddr, sa_family))
                return -EINVAL;

        /* connect(AF_UNSPEC) has special handling, as it is a documented
         * way to disconnect the socket
         */
        if (address->sa_family == AF_UNSPEC)
                return 0;

        /*
         * If a TCP, DCCP or SCTP socket, check name_connect permission
         * for the port.
         */
        if (sksec->sclass == SECCLASS_TCP_SOCKET ||
            sksec->sclass == SECCLASS_DCCP_SOCKET ||
            sksec->sclass == SECCLASS_SCTP_SOCKET) {
                struct common_audit_data ad;
                struct lsm_network_audit net = {0,};
                struct sockaddr_in *addr4 = NULL;
                struct sockaddr_in6 *addr6 = NULL;
                unsigned short snum;
                u32 sid, perm;

                /* sctp_connectx(3) calls via selinux_sctp_bind_connect()
                 * that validates multiple connect addresses. Because of this
                 * need to check address->sa_family as it is possible to have
                 * sk->sk_family = PF_INET6 with addr->sa_family = AF_INET.
                 */
                switch (address->sa_family) {
                case AF_INET:
                        addr4 = (struct sockaddr_in *)address;
                        /* Length is validated before the port is read. */
                        if (addrlen < sizeof(struct sockaddr_in))
                                return -EINVAL;
                        snum = ntohs(addr4->sin_port);
                        break;
                case AF_INET6:
                        addr6 = (struct sockaddr_in6 *)address;
                        /* Length is validated before the port is read. */
                        if (addrlen < SIN6_LEN_RFC2133)
                                return -EINVAL;
                        snum = ntohs(addr6->sin6_port);
                        break;
                default:
                        /* Note that SCTP services expect -EINVAL, whereas
                         * others expect -EAFNOSUPPORT.
                         */
                        if (sksec->sclass == SECCLASS_SCTP_SOCKET)
                                return -EINVAL;
                        else
                                return -EAFNOSUPPORT;
                }

                err = sel_netport_sid(sk->sk_protocol, snum, &sid);
                if (err)
                        return err;

                /*
                 * Only the three classes admitted by the enclosing if can
                 * reach this switch, so perm is always assigned.
                 */
                switch (sksec->sclass) {
                case SECCLASS_TCP_SOCKET:
                        perm = TCP_SOCKET__NAME_CONNECT;
                        break;
                case SECCLASS_DCCP_SOCKET:
                        perm = DCCP_SOCKET__NAME_CONNECT;
                        break;
                case SECCLASS_SCTP_SOCKET:
                        perm = SCTP_SOCKET__NAME_CONNECT;
                        break;
                }

                /* Audit data records the destination port and family. */
                ad.type = LSM_AUDIT_DATA_NET;
                ad.u.net = &net;
                ad.u.net->dport = htons(snum);
                ad.u.net->family = address->sa_family;
                err = avc_has_perm(sksec->sid, sid, sksec->sclass, perm, &ad);
                if (err)
                        return err;
        }

        return 0;
}

/* Supports connect(2): run the permission checks in
 * selinux_socket_connect_helper() and, when they pass, hand the socket and
 * address to selinux_netlbl_socket_connect().  Returns the first error
 * encountered, or zero.
 */
static int selinux_socket_connect(struct socket *sock,
                                  struct sockaddr *address, int addrlen)
{
        struct sock *sk = sock->sk;
        int rc = selinux_socket_connect_helper(sock, address, addrlen);

        if (!rc)
                rc = selinux_netlbl_socket_connect(sk, address);

        return rc;
}

/* Require SOCKET__LISTEN on the socket; @backlog is not examined. */
static int selinux_socket_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__LISTEN);
}

/*
 * Check SOCKET__ACCEPT on the listening socket and, when granted, label
 * the inode of @newsock with the class and SID of the listening socket's
 * inode.  Returns zero on success or the permission check error.
 */
static int selinux_socket_accept(struct socket *sock, struct socket *newsock)
{
        struct inode_security_struct *listen_isec;
        struct inode_security_struct *child_isec;
        u16 listen_class;
        u32 listen_sid;
        int rc;

        rc = sock_has_perm(sock->sk, SOCKET__ACCEPT);
        if (rc)
                return rc;

        /* Snapshot class and SID together under the inode security lock. */
        listen_isec = inode_security_novalidate(SOCK_INODE(sock));
        spin_lock(&listen_isec->lock);
        listen_class = listen_isec->sclass;
        listen_sid = listen_isec->sid;
        spin_unlock(&listen_isec->lock);

        child_isec = inode_security_novalidate(SOCK_INODE(newsock));
        child_isec->sclass = listen_class;
        child_isec->sid = listen_sid;
        child_isec->initialized = LABEL_INITIALIZED;

        return 0;
}

/* Require SOCKET__WRITE on the socket; @msg and @size are not examined. */
static int selinux_socket_sendmsg(struct socket *sock, struct msghdr *msg,
                                  int size)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__WRITE);
}

/*
 * Require SOCKET__READ on the socket; @msg, @size and @flags are not
 * examined.
 */
static int selinux_socket_recvmsg(struct socket *sock, struct msghdr *msg,
                                  int size, int flags)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__READ);
}

/* Reading the local address requires SOCKET__GETATTR on the socket. */
static int selinux_socket_getsockname(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/* Reading the peer address requires SOCKET__GETATTR on the socket. */
static int selinux_socket_getpeername(struct socket *sock)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETATTR);
}

/*
 * Require SOCKET__SETOPT on the socket and, when granted, let
 * selinux_netlbl_socket_setsockopt() examine the option being set.
 * Returns the first error encountered, or zero.
 */
static int selinux_socket_setsockopt(struct socket *sock, int level, int optname)
{
        int rc = sock_has_perm(sock->sk, SOCKET__SETOPT);

        if (!rc)
                rc = selinux_netlbl_socket_setsockopt(sock, level, optname);

        return rc;
}

/*
 * Require SOCKET__GETOPT on the socket; @level and @optname are not
 * examined.
 */
static int selinux_socket_getsockopt(struct socket *sock, int level,
                                     int optname)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__GETOPT);
}

/* Require SOCKET__SHUTDOWN on the socket; @how is not examined. */
static int selinux_socket_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;

        return sock_has_perm(sk, SOCKET__SHUTDOWN);
}

/*
 * Check and label a UNIX stream connection from @sock to @other, where
 * @newsk is the server-side child socket.
 *
 * UNIX_STREAM_SOCKET__CONNECTTO is checked between the connecting socket's
 * SID and the target socket's SID.  On success the child records the
 * connecting socket as its peer and takes a SID built by
 * security_sid_mls_copy() from the target and connecting SIDs; the
 * connecting socket then records the child's SID as its peer.  Returns
 * zero on success or the first error encountered.
 */
static int selinux_socket_unix_stream_connect(struct sock *sock,
                                              struct sock *other,
                                              struct sock *newsk)
{
        struct sk_security_struct *client = selinux_sock(sock);
        struct sk_security_struct *server = selinux_sock(other);
        struct sk_security_struct *child = selinux_sock(newsk);
        struct common_audit_data ad;
        struct lsm_network_audit net;
        int rc;

        ad_net_init_from_sk(&ad, &net, other);

        rc = avc_has_perm(client->sid, server->sid, server->sclass,
                          UNIX_STREAM_SOCKET__CONNECTTO, &ad);
        if (rc)
                return rc;

        /* server child socket */
        child->peer_sid = client->sid;
        rc = security_sid_mls_copy(server->sid, client->sid, &child->sid);

        /* connecting socket: only updated once the child SID is known */
        if (!rc)
                client->peer_sid = child->sid;

        return rc;
}

/*
 * Check SOCKET__SENDTO between the sending socket's SID and the receiving
 * socket's SID and class, with audit data initialised from the receiver.
 * Returns the avc_has_perm() result.
 */
static int selinux_socket_unix_may_send(struct socket *sock,
                                        struct socket *other)
{
        struct sk_security_struct *sender = selinux_sock(sock->sk);
        struct sk_security_struct *receiver = selinux_sock(other->sk);
        struct common_audit_data ad;
        struct lsm_network_audit net;

        ad_net_init_from_sk(&ad, &net, other->sk);

        return avc_has_perm(sender->sid, receiver->sid, receiver->sclass,
                            SOCKET__SENDTO, &ad);
}

/*
 * Check that traffic labeled @peer_sid may enter through interface
 * @ifindex in namespace @ns (NETIF__INGRESS) and may be received from the
 * node at @addrp (NODE__RECVFROM).  The steps run in order and stop at the
 * first failure.  Returns zero when both checks pass, otherwise the first
 * lookup or AVC error.
 */
static int selinux_inet_sys_rcv_skb(struct net *ns, int ifindex,
                                    char *addrp, u16 family, u32 peer_sid,
                                    struct common_audit_data *ad)
{
        u32 netif_sid;
        u32 netnode_sid;
        int rc;

        rc = sel_netif_sid(ns, ifindex, &netif_sid);
        if (!rc)
                rc = avc_has_perm(peer_sid, netif_sid,
                                  SECCLASS_NETIF, NETIF__INGRESS, ad);
        if (!rc)
                rc = sel_netnode_sid(addrp, family, &netnode_sid);
        if (!rc)
                rc = avc_has_perm(peer_sid, netnode_sid,
                                  SECCLASS_NODE, NODE__RECVFROM, ad);

        return rc;
}

/*
 * Compatibility receive path, used by selinux_socket_sock_rcv_skb() when
 * the netpeer policy capability is not set.
 *
 * After parsing the packet for audit data, PACKET__RECV is checked against
 * the packet's secmark (only when secmark is enabled), followed by the
 * NetLabel and then the xfrm receive checks.  Returns zero when all checks
 * pass, otherwise the first error.
 */
static int selinux_sock_rcv_skb_compat(struct sock *sk, struct sk_buff *skb,
                                       u16 family)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u32 sock_sid = sksec->sid;
        struct common_audit_data ad;
        struct lsm_network_audit net;
        char *addrp;
        int rc;

        ad_net_init_from_iif(&ad, &net, skb->skb_iif, family);
        rc = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (rc)
                return rc;

        if (selinux_secmark_enabled()) {
                rc = avc_has_perm(sock_sid, skb->secmark, SECCLASS_PACKET,
                                  PACKET__RECV, &ad);
                if (rc)
                        return rc;
        }

        rc = selinux_netlbl_sock_rcv_skb(sksec, skb, family, &ad);
        if (rc)
                return rc;

        return selinux_xfrm_sock_rcv_skb(sksec->sid, skb, &ad);
}

/*
 * Decide whether socket @sk may receive packet @skb.
 *
 * Only PF_INET and PF_INET6 sockets are checked.  Without the netpeer
 * policy capability the work is delegated to
 * selinux_sock_rcv_skb_compat().  Otherwise, when peer labeling is active,
 * the packet's peer SID must pass the interface/node checks and PEER__RECV
 * against the socket; when secmark is active, PACKET__RECV is checked
 * against the packet's secmark.  Returns zero when the packet is allowed,
 * otherwise the first error.
 */
static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family = sk->sk_family;
        u32 sock_sid = sksec->sid;
        struct common_audit_data ad;
        struct lsm_network_audit net;
        int rc, use_secmark, use_peerlbl;
        char *addrp;

        if (family != PF_INET && family != PF_INET6)
                return 0;

        /* Handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        /* Any compatibility mode is dealt with by the compat function so
         * that this function stays as fast and as clean as possible.
         */
        if (!selinux_policycap_netpeer())
                return selinux_sock_rcv_skb_compat(sk, skb, family);

        use_secmark = selinux_secmark_enabled();
        use_peerlbl = selinux_peerlbl_enabled();
        if (!use_secmark && !use_peerlbl)
                return 0;

        ad_net_init_from_iif(&ad, &net, skb->skb_iif, family);
        rc = selinux_parse_skb(skb, &ad, &addrp, 1, NULL);
        if (rc)
                return rc;

        if (use_peerlbl) {
                u32 peer_sid;

                rc = selinux_skb_peerlbl_sid(skb, family, &peer_sid);
                if (rc)
                        return rc;

                rc = selinux_inet_sys_rcv_skb(sock_net(sk), skb->skb_iif,
                                              addrp, family, peer_sid, &ad);
                if (!rc)
                        rc = avc_has_perm(sock_sid, peer_sid, SECCLASS_PEER,
                                          PEER__RECV, &ad);
                /* A denial from either check is reported via NetLabel. */
                if (rc) {
                        selinux_netlbl_err(skb, family, rc, 0);
                        return rc;
                }
        }

        if (use_secmark)
                return avc_has_perm(sock_sid, skb->secmark, SECCLASS_PACKET,
                                    PACKET__RECV, &ad);

        return 0;
}

/*
 * Copy the peer's security context string for a stream socket to @optval
 * and its length to @optlen.
 *
 * Only UNIX stream, TCP and SCTP socket classes have a usable peer SID;
 * any other class, or a peer SID of SECSID_NULL, yields -ENOPROTOOPT.  A
 * context longer than @len yields -ERANGE, but the required length is
 * still written to @optlen.  A failed copy yields -EFAULT.  Returns zero
 * on success.
 */
static int selinux_socket_getpeersec_stream(struct socket *sock,
                                            sockptr_t optval, sockptr_t optlen,
                                            unsigned int len)
{
        struct sk_security_struct *sksec = selinux_sock(sock->sk);
        u32 peer_sid = SECSID_NULL;
        char *ctx = NULL;
        u32 ctx_len;
        int rc;

        if (sksec->sclass == SECCLASS_UNIX_STREAM_SOCKET ||
            sksec->sclass == SECCLASS_TCP_SOCKET ||
            sksec->sclass == SECCLASS_SCTP_SOCKET)
                peer_sid = sksec->peer_sid;
        if (peer_sid == SECSID_NULL)
                return -ENOPROTOOPT;

        rc = security_sid_to_context(peer_sid, &ctx, &ctx_len);
        if (rc)
                return rc;

        /* The context itself is copied only when the buffer is big enough. */
        if (ctx_len > len)
                rc = -ERANGE;
        else if (copy_to_sockptr(optval, ctx, ctx_len))
                rc = -EFAULT;

        /* The length is always reported back, even on -ERANGE. */
        if (copy_to_sockptr(optlen, &ctx_len, sizeof(ctx_len)))
                rc = -EFAULT;

        kfree(ctx);
        return rc;
}

/*
 * Report the peer SID for a datagram in @secid.
 *
 * The address family is taken from the packet's protocol when @skb is an
 * IPv4 or IPv6 packet, otherwise from @sock; with neither available the
 * call fails with -EINVAL and @secid is set to SECSID_NULL.  For a PF_UNIX
 * socket the SID stored on the socket inode is used; otherwise, when a
 * packet is present, its peer label is resolved.  Returns -ENOPROTOOPT
 * when the result is SECSID_NULL, zero otherwise.
 */
static int selinux_socket_getpeersec_dgram(struct socket *sock,
                                           struct sk_buff *skb, u32 *secid)
{
        u32 label = SECSID_NULL;
        u16 family;

        if (skb && skb->protocol == htons(ETH_P_IP)) {
                family = PF_INET;
        } else if (skb && skb->protocol == htons(ETH_P_IPV6)) {
                family = PF_INET6;
        } else if (sock) {
                family = sock->sk->sk_family;
        } else {
                *secid = SECSID_NULL;
                return -EINVAL;
        }

        if (sock && family == PF_UNIX) {
                struct inode_security_struct *isec =
                        inode_security_novalidate(SOCK_INODE(sock));

                label = isec->sid;
        } else if (skb) {
                /* On failure the label is reported as whatever was stored. */
                selinux_skb_peerlbl_sid(skb, family, &label);
        }

        *secid = label;

        return (label == SECSID_NULL) ? -ENOPROTOOPT : 0;
}

/*
 * Give a new struct sock its initial SELinux state: the generic socket
 * class, SECINITSID_UNLABELED for both its own SID and its peer SID, and
 * reset NetLabel state.  @family and @priority are not used.  Always
 * returns zero.
 */
static int selinux_sk_alloc_security(struct sock *sk, int family, gfp_t priority)
{
        struct sk_security_struct *state = selinux_sock(sk);

        state->sclass = SECCLASS_SOCKET;
        state->sid = SECINITSID_UNLABELED;
        state->peer_sid = SECINITSID_UNLABELED;
        selinux_netlbl_sk_security_reset(state);

        return 0;
}

/*
 * Hand the socket's SELinux state to selinux_netlbl_sk_security_free()
 * when the socket is being freed.
 */
static void selinux_sk_free_security(struct sock *sk)
{
        selinux_netlbl_sk_security_free(selinux_sock(sk));
}

/*
 * Copy the class, SID and peer SID from @sk to @newsk, then reset the
 * NetLabel state of the copy.
 */
static void selinux_sk_clone_security(const struct sock *sk, struct sock *newsk)
{
        struct sk_security_struct *src = selinux_sock(sk);
        struct sk_security_struct *dst = selinux_sock(newsk);

        dst->sclass = src->sclass;
        dst->sid = src->sid;
        dst->peer_sid = src->peer_sid;

        selinux_netlbl_sk_security_reset(dst);
}

/*
 * Report the SID of @sk in @secid.  A NULL @sk yields
 * SECINITSID_ANY_SOCKET.
 */
static void selinux_sk_getsecid(const struct sock *sk, u32 *secid)
{
        const struct sk_security_struct *sksec;

        if (!sk) {
                *secid = SECINITSID_ANY_SOCKET;
                return;
        }

        sksec = selinux_sock(sk);
        *secid = sksec->sid;
}

/*
 * Synchronise labels when @sk is attached to the socket @parent.
 *
 * For PF_INET, PF_INET6 and PF_UNIX sockets the parent inode takes the
 * sock's SID.  In every case the sock takes the inode's class.
 */
static void selinux_sock_graft(struct sock *sk, struct socket *parent)
{
        struct inode_security_struct *isec =
                inode_security_novalidate(SOCK_INODE(parent));
        struct sk_security_struct *sksec = selinux_sock(sk);

        switch (sk->sk_family) {
        case PF_INET:
        case PF_INET6:
        case PF_UNIX:
                isec->sid = sksec->sid;
                break;
        default:
                break;
        }

        sksec->sclass = isec->sclass;
}

/*
 * Determine the peer SID for a new SCTP association and, if this is the
 * first association on the socket, make it the socket's peer label.
 *
 * With peer labeling enabled the SID is resolved from @skb, falling back
 * to SECINITSID_UNLABELED when the packet carries no label; with it
 * disabled the SID is always SECINITSID_UNLABELED.  On later associations
 * whose peer SID differs from the socket's, SCTP_SOCKET__ASSOCIATION is
 * checked between the two SIDs.  Returns zero on success or the first
 * error.
 */
static int selinux_sctp_process_new_assoc(struct sctp_association *asoc,
                                          struct sk_buff *skb)
{
        struct sock *sk = asoc->base.sk;
        u16 family = sk->sk_family;
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct common_audit_data ad;
        struct lsm_network_audit net;

        /* handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        if (!selinux_peerlbl_enabled()) {
                asoc->peer_secid = SECINITSID_UNLABELED;
        } else {
                int rc;

                asoc->peer_secid = SECSID_NULL;

                /* This will return peer_sid = SECSID_NULL if there are
                 * no peer labels, see security_net_peersid_resolve().
                 */
                rc = selinux_skb_peerlbl_sid(skb, family, &asoc->peer_secid);
                if (rc)
                        return rc;

                if (asoc->peer_secid == SECSID_NULL)
                        asoc->peer_secid = SECINITSID_UNLABELED;
        }

        if (sksec->sctp_assoc_state == SCTP_ASSOC_UNSET) {
                sksec->sctp_assoc_state = SCTP_ASSOC_SET;

                /* First association on the socket. As the peer SID
                 * was allowed by peer recv (and the netif/node checks),
                 * it is approved by policy and used as the primary
                 * peer SID for getpeercon(3).
                 */
                sksec->peer_sid = asoc->peer_secid;
                return 0;
        }

        if (sksec->peer_sid == asoc->peer_secid)
                return 0;

        /* Other association peer SIDs are checked to enforce
         * consistency among the peer SIDs.
         */
        ad_net_init_from_sk(&ad, &net, asoc->base.sk);
        return avc_has_perm(sksec->peer_sid, asoc->peer_secid,
                            sksec->sclass, SCTP_SOCKET__ASSOCIATION,
                            &ad);
}

/* Called whenever SCTP receives an INIT or COOKIE ECHO chunk. This
 * happens on an incoming connect(2), sctp_connectx(3) or
 * sctp_sendmsg(3) (with no association already present).
 *
 * Does nothing (returns zero) unless the extsockclass policy capability
 * is set.  Otherwise returns zero on success or the first error from the
 * steps below.
 */
static int selinux_sctp_assoc_request(struct sctp_association *asoc,
                                      struct sk_buff *skb)
{
        struct sk_security_struct *sksec = selinux_sock(asoc->base.sk);
        u32 conn_sid;
        int rc;

        if (!selinux_policycap_extsockclass())
                return 0;

        rc = selinux_sctp_process_new_assoc(asoc, skb);

        /* Compute the MLS component for the connection and store
         * the information in asoc. This will be used by SCTP TCP type
         * sockets and peeled off connections as they cause a new
         * socket to be generated. selinux_sctp_sk_clone() will then
         * plug this into the new socket.
         */
        if (!rc)
                rc = selinux_conn_sid(sksec->sid, asoc->peer_secid,
                                      &conn_sid);
        if (rc)
                return rc;

        asoc->secid = conn_sid;

        /* Set any NetLabel labels including CIPSO/CALIPSO options. */
        return selinux_netlbl_sctp_assoc_request(asoc, skb);
}

/* Called when SCTP receives a COOKIE ACK chunk as the final
 * response to an association request (initiated by us).
 *
 * Does nothing (returns zero) unless the extsockclass policy capability
 * is set.  Otherwise returns the result of
 * selinux_sctp_process_new_assoc().
 */
static int selinux_sctp_assoc_established(struct sctp_association *asoc,
                                          struct sk_buff *skb)
{
        struct sk_security_struct *parent = selinux_sock(asoc->base.sk);

        if (selinux_policycap_extsockclass()) {
                /* Inherit secid from the parent socket - this will be
                 * picked up by selinux_sctp_sk_clone() if the association
                 * gets peeled off into a new socket.
                 */
                asoc->secid = parent->sid;

                return selinux_sctp_process_new_assoc(asoc, skb);
        }

        return 0;
}

/* Check if sctp IPv4/IPv6 addresses are valid for binding or connecting
 * based on their @optname.
 *
 * @address points to a packed sequence of socket addresses occupying
 * @addrlen bytes.  Each entry's size is derived from its sa_family
 * (sockaddr_in for AF_UNSPEC/AF_INET, sockaddr_in6 for AF_INET6) and the
 * entry is passed to the bind or connect checks selected by @optname.
 *
 * Does nothing (returns zero) unless the extsockclass policy capability
 * is set.  Returns -EINVAL for a truncated entry, an unknown address
 * family or an unhandled @optname; otherwise the first error from the
 * per-address checks, or zero when every address passes.
 */
static int selinux_sctp_bind_connect(struct sock *sk, int optname,
                                     struct sockaddr *address,
                                     int addrlen)
{
        int len, err = 0, walk_size = 0;
        void *addr_buf;
        struct sockaddr *addr;
        struct socket *sock;

        if (!selinux_policycap_extsockclass())
                return 0;

        /* Process one or more addresses that may be IPv4 or IPv6 */
        sock = sk->sk_socket;
        addr_buf = address;

        while (walk_size < addrlen) {
                /* The family field must fit in what is left of the buffer. */
                if (walk_size + sizeof(sa_family_t) > addrlen)
                        return -EINVAL;

                addr = addr_buf;
                switch (addr->sa_family) {
                case AF_UNSPEC:
                case AF_INET:
                        len = sizeof(struct sockaddr_in);
                        break;
                case AF_INET6:
                        len = sizeof(struct sockaddr_in6);
                        break;
                default:
                        return -EINVAL;
                }

                /* The whole entry must fit in what is left of the buffer. */
                if (walk_size + len > addrlen)
                        return -EINVAL;

                /* Stays -EINVAL when optname matches no case below. */
                err = -EINVAL;
                switch (optname) {
                /* Bind checks */
                case SCTP_PRIMARY_ADDR:
                case SCTP_SET_PEER_PRIMARY_ADDR:
                case SCTP_SOCKOPT_BINDX_ADD:
                        err = selinux_socket_bind(sock, addr, len);
                        break;
                /* Connect checks */
                case SCTP_SOCKOPT_CONNECTX:
                case SCTP_PARAM_SET_PRIMARY:
                case SCTP_PARAM_ADD_IP:
                case SCTP_SENDMSG_CONNECT:
                        err = selinux_socket_connect_helper(sock, addr, len);
                        if (err)
                                return err;

                        /* As selinux_sctp_bind_connect() is called by the
                         * SCTP protocol layer, the socket is already locked,
                         * therefore selinux_netlbl_socket_connect_locked()
                         * is called here. The situations handled are:
                         * sctp_connectx(3), sctp_sendmsg(3), sendmsg(2),
                         * whenever a new IP address is added or when a new
                         * primary address is selected.
                         * Note that an SCTP connect(2) call happens before
                         * the SCTP protocol layer and is handled via
                         * selinux_socket_connect().
                         */
                        err = selinux_netlbl_socket_connect_locked(sk, addr);
                        break;
                }

                if (err)
                        return err;

                /* Advance to the next packed address. */
                addr_buf += len;
                walk_size += len;
        }

        return 0;
}

/* Called whenever a new socket is created by accept(2) or sctp_peeloff(3).
 *
 * The new socket takes its SID and peer SID from the association and its
 * class from the original socket, followed by the NetLabel clone step.
 */
static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk,
                                  struct sock *newsk)
{
        struct sk_security_struct *parent = selinux_sock(sk);
        struct sk_security_struct *child = selinux_sock(newsk);

        /* If policy does not support SECCLASS_SCTP_SOCKET then fall
         * back to the non-sctp clone version.
         */
        if (!selinux_policycap_extsockclass()) {
                selinux_sk_clone_security(sk, newsk);
                return;
        }

        child->sid = asoc->secid;
        child->peer_sid = asoc->peer_secid;
        child->sclass = parent->sclass;
        selinux_netlbl_sctp_sk_clone(sk, newsk);
}

/* Give the subflow socket @ssk the same class and SID as its parent @sk,
 * then rebuild the subflow's NetLabel state.  Returns the result of
 * selinux_netlbl_socket_post_create().
 */
static int selinux_mptcp_add_subflow(struct sock *sk, struct sock *ssk)
{
        struct sk_security_struct *parent = selinux_sock(sk);
        struct sk_security_struct *subflow = selinux_sock(ssk);

        subflow->sid = parent->sid;
        subflow->sclass = parent->sclass;

        /* throw away the label the subflow currently carries so that a new
         * one can be created from the updated context
         */
        selinux_netlbl_sk_security_free(subflow);

        return selinux_netlbl_socket_post_create(ssk, ssk->sk_family);
}

/* Compute the labels for an incoming connection request: the peer SID is
 * taken from @skb and the connection SID is derived from the listening
 * socket's SID together with that peer SID.  Both are stored in @req.
 * Returns 0 on success or the first error encountered.
 */
static int selinux_inet_conn_request(const struct sock *sk, struct sk_buff *skb,
                                     struct request_sock *req)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 family = req->rsk_ops->family;
        u32 peersid;
        u32 connsid;
        int rc;

        rc = selinux_skb_peerlbl_sid(skb, family, &peersid);
        if (!rc)
                rc = selinux_conn_sid(sksec->sid, peersid, &connsid);
        if (rc)
                return rc;

        req->peer_secid = peersid;
        req->secid = connsid;

        return selinux_netlbl_inet_conn_request(req, family);
}

/* Copy the labels recorded in the request_sock onto the child socket. */
static void selinux_inet_csk_clone(struct sock *newsk,
                                   const struct request_sock *req)
{
        struct sk_security_struct *child = selinux_sock(newsk);

        /* NOTE: the isec->sid of the new socket should ideally be brought
         * in sync here as well, but the isec does not exist yet; that is
         * deferred to sock_graft, by which point it has been created.
         */
        child->peer_sid = req->peer_secid;
        child->sid = req->secid;

        /* No locking is needed: this thread is the only one that can reach
         * the new socket's security blob at this point.
         */
        selinux_netlbl_inet_csk_clone(newsk, req->rsk_ops->family);
}

static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
{
        u16 family = sk->sk_family;
        struct sk_security_struct *sksec = selinux_sock(sk);

        /* handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
                family = PF_INET;

        selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
}

/* May the current task relabel packets to @sid (packet:relabelto)? */
static int selinux_secmark_relabel_packet(u32 sid)
{
        const u32 subj = current_sid();

        return avc_has_perm(subj, sid, SECCLASS_PACKET,
                            PACKET__RELABELTO, NULL);
}

static void selinux_secmark_refcount_inc(void)
{
        atomic_inc(&selinux_secmark_refcount);
}

static void selinux_secmark_refcount_dec(void)
{
        atomic_dec(&selinux_secmark_refcount);
}

static void selinux_req_classify_flow(const struct request_sock *req,
                                      struct flowi_common *flic)
{
        flic->flowic_secid = req->secid;
}

/* Initialise a TUN device's security blob with the current task's SID.
 * Always returns 0.
 */
static int selinux_tun_dev_alloc_security(void *security)
{
        struct tun_security_struct *tunsec;

        tunsec = selinux_tun_dev(security);
        tunsec->sid = current_sid();

        return 0;
}

/* May the current task create a TUN socket (tun_socket:create on itself)? */
static int selinux_tun_dev_create(void)
{
        const u32 self = current_sid();

        /* The "sockcreate" SID is deliberately not consulted: the socket
         * created here is not a socket in the traditional sense.  It is a
         * private sock that only the kernel can reach and it stands for a
         * broad range of traffic across many connections, unlike ordinary
         * sockets - see the TUN driver for why this socket is special.
         */
        return avc_has_perm(self, self, SECCLASS_TUN_SOCKET,
                            TUN_SOCKET__CREATE, NULL);
}

/* May the current task attach a queue to the TUN device described by
 * @security (tun_socket:attach_queue)?
 */
static int selinux_tun_dev_attach_queue(void *security)
{
        struct tun_security_struct *tunsec;

        tunsec = selinux_tun_dev(security);

        return avc_has_perm(current_sid(), tunsec->sid,
                            SECCLASS_TUN_SOCKET, TUN_SOCKET__ATTACH_QUEUE,
                            NULL);
}

/* Label @sk as a TUN socket carrying the TUN device's SID.  Always
 * returns 0.
 */
static int selinux_tun_dev_attach(struct sock *sk, void *security)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct tun_security_struct *tunsec = selinux_tun_dev(security);

        /* No NetLabel based labeling is done here, and it is doubtful we
         * would ever want it: labels could be applied without the TUN
         * user's cooperation, but the labeled traffic coming back from the
         * far end would almost certainly confuse a TUN user unaware that
         * network labeling protocols were in play.
         */

        sksec->sclass = SECCLASS_TUN_SOCKET;
        sksec->sid = tunsec->sid;

        return 0;
}

/* Relabel a TUN device to the current task's SID.  The task needs
 * tun_socket:relabelfrom on the device's present SID and
 * tun_socket:relabelto on its own SID; the device SID is only updated when
 * both checks succeed.  Returns 0 or the failing check's error.
 */
static int selinux_tun_dev_open(void *security)
{
        struct tun_security_struct *tunsec = selinux_tun_dev(security);
        u32 self = current_sid();
        int rc;

        rc = avc_has_perm(self, tunsec->sid, SECCLASS_TUN_SOCKET,
                          TUN_SOCKET__RELABELFROM, NULL);
        if (rc)
                return rc;

        rc = avc_has_perm(self, self, SECCLASS_TUN_SOCKET,
                          TUN_SOCKET__RELABELTO, NULL);
        if (rc == 0)
                tunsec->sid = self;

        return rc;
}

#ifdef CONFIG_NETFILTER

/* Netfilter hook for forwarded packets.  Returns NF_ACCEPT when the packet
 * may pass and NF_DROP when a lookup or permission check fails.
 */
static unsigned int selinux_ip_forward(void *priv, struct sk_buff *skb,
                                       const struct nf_hook_state *state)
{
        struct common_audit_data ad;
        struct lsm_network_audit net;
        int secmark_active, peerlbl_active;
        int ifindex;
        u16 family;
        u32 peer_sid;
        char *addrp;

        if (!selinux_policycap_netpeer())
                return NF_ACCEPT;

        /* nothing to enforce unless secmark or peer labeling is in use */
        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!(secmark_active || peerlbl_active))
                return NF_ACCEPT;

        family = state->pf;
        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                return NF_DROP;

        ifindex = state->in->ifindex;
        ad_net_init_from_iif(&ad, &net, ifindex, family);
        if (selinux_parse_skb(skb, &ad, &addrp, 1, NULL))
                return NF_DROP;

        if (peerlbl_active) {
                int err = selinux_inet_sys_rcv_skb(state->net, ifindex,
                                                   addrp, family, peer_sid,
                                                   &ad);

                if (err) {
                        selinux_netlbl_err(skb, family, err, 1);
                        return NF_DROP;
                }
        }

        if (secmark_active &&
            avc_has_perm(peer_sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__FORWARD_IN, &ad))
                return NF_DROP;

        /* The label is applied on the FORWARD path rather than in
         * POST_ROUTING so that it is in place before IPsec runs, which
         * lets us benefit from AH protection.
         */
        if (netlbl_enabled() &&
            selinux_netlbl_skbuff_setsid(skb, family, peer_sid))
                return NF_DROP;

        return NF_ACCEPT;
}

static unsigned int selinux_ip_output(void *priv, struct sk_buff *skb,
                                      const struct nf_hook_state *state)
{
        struct sock *sk;
        u32 sid;

        if (!netlbl_enabled())
                return NF_ACCEPT;

        /* we do this in the LOCAL_OUT path and not the POST_ROUTING path
         * because we want to make sure we apply the necessary labeling
         * before IPsec is applied so we can leverage AH protection */
        sk = sk_to_full_sk(skb->sk);
        if (sk) {
                struct sk_security_struct *sksec;

                if (sk_listener(sk))
                        /* if the socket is the listening state then this
                         * packet is a SYN-ACK packet which means it needs to
                         * be labeled based on the connection/request_sock and
                         * not the parent socket.  unfortunately, we can't
                         * lookup the request_sock yet as it isn't queued on
                         * the parent socket until after the SYN-ACK is sent.
                         * the "solution" is to simply pass the packet as-is
                         * as any IP option based labeling should be copied
                         * from the initial connection request (in the IP
                         * layer).  it is far from ideal, but until we get a
                         * security label in the packet itself this is the
                         * best we can do. */
                        return NF_ACCEPT;

                /* standard practice, label using the parent socket */
                sksec = selinux_sock(sk);
                sid = sksec->sid;
        } else
                sid = SECINITSID_KERNEL;
        if (selinux_netlbl_skbuff_setsid(skb, state->pf, sid) != 0)
                return NF_DROP;

        return NF_ACCEPT;
}


/* Compatibility-mode egress check, used by selinux_ip_postroute() when the
 * netpeer policy capability is off.  Packets without a socket pass
 * unchecked; otherwise the socket's SID must be allowed packet:send (when
 * secmark is enabled) and must pass the XFRM postroute check.
 */
static unsigned int selinux_ip_postroute_compat(struct sk_buff *skb,
                                        const struct nf_hook_state *state)
{
        struct common_audit_data ad;
        struct lsm_network_audit net;
        struct sk_security_struct *sksec;
        struct sock *sk = skb_to_full_sk(skb);
        u8 proto = 0;

        if (!sk)
                return NF_ACCEPT;
        sksec = selinux_sock(sk);

        ad_net_init_from_iif(&ad, &net, state->out->ifindex, state->pf);
        if (selinux_parse_skb(skb, &ad, NULL, 0, &proto))
                return NF_DROP;

        if (selinux_secmark_enabled() &&
            avc_has_perm(sksec->sid, skb->secmark,
                         SECCLASS_PACKET, PACKET__SEND, &ad))
                return NF_DROP_ERR(-ECONNREFUSED);

        if (selinux_xfrm_postroute_last(sksec->sid, skb, &ad, proto))
                return NF_DROP_ERR(-ECONNREFUSED);

        return NF_ACCEPT;
}

/* Netfilter egress hook.
 *
 * Works out which SID is responsible for @skb (peer_sid) and which packet
 * permission applies (secmark_perm), then runs the enabled checks:
 *   - secmark: peer_sid vs. skb->secmark for secmark_perm;
 *   - peer labeling: peer_sid vs. the outgoing interface (netif:egress)
 *     and vs. the node SID looked up from addrp (node:sendto).
 *
 * Returns NF_ACCEPT when the packet may leave, NF_DROP when a label or
 * SID lookup fails, and NF_DROP_ERR(-ECONNREFUSED) when a permission check
 * denies the packet.
 */
static unsigned int selinux_ip_postroute(void *priv,
                                         struct sk_buff *skb,
                                         const struct nf_hook_state *state)
{
        u16 family;
        u32 secmark_perm;
        u32 peer_sid;
        int ifindex;
        struct sock *sk;
        struct common_audit_data ad;
        struct lsm_network_audit net;
        char *addrp;
        int secmark_active, peerlbl_active;

        /* If any sort of compatibility mode is enabled then handoff processing
         * to the selinux_ip_postroute_compat() function to deal with the
         * special handling.  We do this in an attempt to keep this function
         * as fast and as clean as possible. */
        if (!selinux_policycap_netpeer())
                return selinux_ip_postroute_compat(skb, state);

        /* Nothing to check when neither mechanism is in use. */
        secmark_active = selinux_secmark_enabled();
        peerlbl_active = selinux_peerlbl_enabled();
        if (!secmark_active && !peerlbl_active)
                return NF_ACCEPT;

        sk = skb_to_full_sk(skb);

#ifdef CONFIG_XFRM
        /* If skb->dst->xfrm is non-NULL then the packet is undergoing an IPsec
         * packet transformation so allow the packet to pass without any checks
         * since we'll have another chance to perform access control checks
         * when the packet is on its final way out.
         * NOTE: there appear to be some IPv6 multicast cases where skb->dst
         *       is NULL, in this case go ahead and apply access control.
         * NOTE: if this is a local socket (skb->sk != NULL) that is in the
         *       TCP listening state we cannot wait until the XFRM processing
         *       is done as we will miss out on the SA label if we do;
         *       unfortunately, this means more work, but it is only once per
         *       connection. */
        if (skb_dst(skb) != NULL && skb_dst(skb)->xfrm != NULL &&
            !(sk && sk_listener(sk)))
                return NF_ACCEPT;
#endif

        family = state->pf;
        if (sk == NULL) {
                /* Without an associated socket the packet is either coming
                 * from the kernel or it is being forwarded; check the packet
                 * to determine which and if the packet is being forwarded
                 * query the packet directly to determine the security label. */
                if (skb->skb_iif) {
                        secmark_perm = PACKET__FORWARD_OUT;
                        if (selinux_skb_peerlbl_sid(skb, family, &peer_sid))
                                return NF_DROP;
                } else {
                        secmark_perm = PACKET__SEND;
                        peer_sid = SECINITSID_KERNEL;
                }
        } else if (sk_listener(sk)) {
                /* Locally generated packet but the associated socket is in the
                 * listening state which means this is a SYN-ACK packet.  In
                 * this particular case the correct security label is assigned
                 * to the connection/request_sock but unfortunately we can't
                 * query the request_sock as it isn't queued on the parent
                 * socket until after the SYN-ACK packet is sent; the only
                 * viable choice is to regenerate the label like we do in
                 * selinux_inet_conn_request().  See also selinux_ip_output()
                 * for similar problems. */
                u32 skb_sid;
                struct sk_security_struct *sksec;

                sksec = selinux_sock(sk);
                if (selinux_skb_peerlbl_sid(skb, family, &skb_sid))
                        return NF_DROP;
                /* At this point, if the returned skb peerlbl is SECSID_NULL
                 * and the packet has been through at least one XFRM
                 * transformation then we must be dealing with the "final"
                 * form of labeled IPsec packet; since we've already applied
                 * all of our access controls on this packet we can safely
                 * pass the packet. */
                if (skb_sid == SECSID_NULL) {
                        switch (family) {
                        case PF_INET:
                                if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        case PF_INET6:
                                if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
                                        return NF_ACCEPT;
                                break;
                        default:
                                /* neither IPv4 nor IPv6: refuse the packet */
                                return NF_DROP_ERR(-ECONNREFUSED);
                        }
                }
                /* Derive the connection SID from the listener's SID and the
                 * packet's peer label, as selinux_inet_conn_request() does. */
                if (selinux_conn_sid(sksec->sid, skb_sid, &peer_sid))
                        return NF_DROP;
                secmark_perm = PACKET__SEND;
        } else {
                /* Locally generated packet, fetch the security label from the
                 * associated socket. */
                struct sk_security_struct *sksec = selinux_sock(sk);
                peer_sid = sksec->sid;
                secmark_perm = PACKET__SEND;
        }

        /* Set up the audit data and locate the address (addrp) that the
         * node lookup below uses. */
        ifindex = state->out->ifindex;
        ad_net_init_from_iif(&ad, &net, ifindex, family);
        if (selinux_parse_skb(skb, &ad, &addrp, 0, NULL))
                return NF_DROP;

        if (secmark_active)
                if (avc_has_perm(peer_sid, skb->secmark,
                                 SECCLASS_PACKET, secmark_perm, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

        if (peerlbl_active) {
                u32 if_sid;
                u32 node_sid;

                /* outgoing interface check */
                if (sel_netif_sid(state->net, ifindex, &if_sid))
                        return NF_DROP;
                if (avc_has_perm(peer_sid, if_sid,
                                 SECCLASS_NETIF, NETIF__EGRESS, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);

                /* node check, keyed on the address found by selinux_parse_skb() */
                if (sel_netnode_sid(addrp, family, &node_sid))
                        return NF_DROP;
                if (avc_has_perm(peer_sid, node_sid,
                                 SECCLASS_NODE, NODE__SENDTO, &ad))
                        return NF_DROP_ERR(-ECONNREFUSED);
        }

        return NF_ACCEPT;
}
#endif        /* CONFIG_NETFILTER */

/* Check a netlink permission on @sk, refined by the message type.
 *
 * @sk:         socket the message is being sent on
 * @perms:      base permission(s) to check on the socket's class
 * @nlmsg_type: netlink message type; its high byte is passed as the
 *              extended-permission "driver" and its low byte as the
 *              extended permission itself
 *
 * Returns 0 when sock_skip_has_perm() says the socket's SID needs no
 * check, otherwise the result of avc_has_extended_perms().
 */
static int nlmsg_sock_has_extended_perms(struct sock *sk, u32 perms, u16 nlmsg_type)
{
        /* Reach the SELinux socket blob through selinux_sock(), exactly as
         * the other hooks in this file do; reading sk->sk_security directly
         * bypasses the accessor and any blob offset it applies.
         */
        struct sk_security_struct *sksec = selinux_sock(sk);
        struct common_audit_data ad;
        u8 driver;
        u8 xperm;

        if (sock_skip_has_perm(sksec->sid))
                return 0;

        ad.type = LSM_AUDIT_DATA_NLMSGTYPE;
        ad.u.nlmsg_type = nlmsg_type;

        /* split the 16-bit message type into its two bytes */
        driver = nlmsg_type >> 8;
        xperm = nlmsg_type & 0xff;

        return avc_has_extended_perms(current_sid(), sksec->sid, sksec->sclass,
                                      perms, driver, AVC_EXT_NLMSG, xperm, &ad);
}

/* Check every netlink message packed into @skb against the permission
 * mapped to its message type for the socket's class.  Returns 0 when all
 * processed messages are allowed (or processing stops on a junk length),
 * otherwise the first error.
 */
static int selinux_netlink_send(struct sock *sk, struct sk_buff *skb)
{
        struct sk_security_struct *sksec = selinux_sock(sk);
        u16 sclass = sksec->sclass;
        unsigned char *cursor = skb->data;
        unsigned int remaining = skb->len;
        unsigned int step;
        struct nlmsghdr *nlh;
        u32 perm;
        int rc = 0;

        while (remaining >= nlmsg_total_size(0)) {
                nlh = (struct nlmsghdr *)cursor;

                /* NOTE: some netlink users do not set nlmsg_len reliably,
                 *       so skb's with bogus length fields cannot simply be
                 *       rejected; like netlink_rcv_skb() we just stop
                 *       processing once a message has a length field that
                 *       is clearly junk
                 */
                if (nlh->nlmsg_len < NLMSG_HDRLEN ||
                    nlh->nlmsg_len > remaining)
                        return 0;

                rc = selinux_nlmsg_lookup(sclass, nlh->nlmsg_type, &perm);
                switch (rc) {
                case 0:
                        if (selinux_policycap_netlink_xperm())
                                rc = nlmsg_sock_has_extended_perms(
                                        sk, perm, nlh->nlmsg_type);
                        else
                                rc = sock_has_perm(sk, perm);
                        if (rc)
                                return rc;
                        break;
                case -EINVAL:
                        /* no msg/perm mapping exists for this type */
                        pr_warn_ratelimited("SELinux: unrecognized netlink"
                                " message: protocol=%hu nlmsg_type=%hu sclass=%s"
                                " pid=%d comm=%s\n",
                                sk->sk_protocol, nlh->nlmsg_type,
                                secclass_map[sclass - 1].name,
                                task_pid_nr(current), current->comm);
                        if (enforcing_enabled() &&
                            !security_get_allow_unknown())
                                return rc;
                        rc = 0;
                        break;
                case -ENOENT:
                        /* no socket/class mapping exists, ignore */
                        rc = 0;
                        break;
                default:
                        return rc;
                }

                /* advance past this message, netlink padding included */
                step = NLMSG_ALIGN(nlh->nlmsg_len);
                if (step >= remaining)
                        return 0;
                cursor += step;
                remaining -= step;
        }

        return rc;
}

/* Fill in an IPC security blob: the given class plus the current task's SID. */
static void ipc_init_security(struct ipc_security_struct *isec, u16 sclass)
{
        const u32 owner = current_sid();

        isec->sclass = sclass;
        isec->sid = owner;
}

/* Check @perms for the current task against the IPC object's own SID and
 * class, auditing with the object's key.
 */
static int ipc_has_perm(struct kern_ipc_perm *ipc_perms,
                        u32 perms)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(ipc_perms);
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = ipc_perms->key;

        return avc_has_perm(subj, isec->sid, isec->sclass, perms, &ad);
}

/* New messages start out with the "unlabeled" SID.  Always returns 0. */
static int selinux_msg_msg_alloc_security(struct msg_msg *msg)
{
        struct msg_security_struct *msec = selinux_msg_msg(msg);

        msec->sid = SECINITSID_UNLABELED;
        return 0;
}

/* message queue security operations */
/* Label a new message queue and check msgq:create for the current task. */
static int selinux_msg_queue_alloc_security(struct kern_ipc_perm *msq)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(msq);
        struct common_audit_data ad;

        ipc_init_security(isec, SECCLASS_MSGQ);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        return avc_has_perm(subj, isec->sid, SECCLASS_MSGQ, MSGQ__CREATE,
                            &ad);
}

/* Check msgq:associate for the current task on an existing queue. */
static int selinux_msg_queue_associate(struct kern_ipc_perm *msq, int msqflg)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(msq);
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        return avc_has_perm(subj, isec->sid, SECCLASS_MSGQ, MSGQ__ASSOCIATE,
                            &ad);
}

/* Map a msgctl() command onto the permission it requires and check it.
 * Commands not listed here need no SELinux permission and return 0.
 */
static int selinux_msg_queue_msgctl(struct kern_ipc_perm *msq, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case MSG_INFO:
                /* System-wide information only; no particular object. */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case IPC_STAT:
        case MSG_STAT:
        case MSG_STAT_ANY:
                return ipc_has_perm(msq, MSGQ__GETATTR | MSGQ__ASSOCIATE);
        case IPC_SET:
                return ipc_has_perm(msq, MSGQ__SETATTR);
        case IPC_RMID:
                return ipc_has_perm(msq, MSGQ__DESTROY);
        default:
                return 0;
        }
}

/* Checks performed when a message is sent to a queue: the sender must be
 * able to write to the queue and send the message, and the message must be
 * allowed to be enqueued.  Returns 0 or the first failure.
 */
static int selinux_msg_queue_msgsnd(struct kern_ipc_perm *msq, struct msg_msg *msg, int msqflg)
{
        u32 sender = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(msq);
        struct msg_security_struct *msec = selinux_msg_msg(msg);
        struct common_audit_data ad;
        int rc;

        /* A message that is still unlabeled is on its first pass: derive
         * its SID from the sending process and the queue that will hold it.
         */
        if (msec->sid == SECINITSID_UNLABELED) {
                rc = security_transition_sid(sender, isec->sid,
                                             SECCLASS_MSG, NULL, &msec->sid);
                if (rc)
                        return rc;
        }

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        /* May this process write to the queue? */
        rc = avc_has_perm(sender, isec->sid, SECCLASS_MSGQ,
                          MSGQ__WRITE, &ad);
        if (rc)
                return rc;

        /* May this process send the message? */
        rc = avc_has_perm(sender, msec->sid, SECCLASS_MSG,
                          MSG__SEND, &ad);
        if (rc)
                return rc;

        /* May the message be placed on the queue? */
        return avc_has_perm(msec->sid, isec->sid, SECCLASS_MSGQ,
                            MSGQ__ENQUEUE, &ad);
}

/* Checks performed when @target receives @msg from @msq: the target task's
 * SID needs msgq:read on the queue and msg:receive on the message.
 */
static int selinux_msg_queue_msgrcv(struct kern_ipc_perm *msq, struct msg_msg *msg,
                                    struct task_struct *target,
                                    long type, int mode)
{
        u32 receiver = task_sid_obj(target);
        struct ipc_security_struct *isec = selinux_ipc(msq);
        struct msg_security_struct *msec = selinux_msg_msg(msg);
        struct common_audit_data ad;
        int rc;

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = msq->key;

        rc = avc_has_perm(receiver, isec->sid,
                          SECCLASS_MSGQ, MSGQ__READ, &ad);
        if (rc)
                return rc;

        return avc_has_perm(receiver, msec->sid,
                            SECCLASS_MSG, MSG__RECEIVE, &ad);
}

/* Shared Memory security operations */
/* Label a new shared memory segment and check shm:create for the current
 * task.
 */
static int selinux_shm_alloc_security(struct kern_ipc_perm *shp)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(shp);
        struct common_audit_data ad;

        ipc_init_security(isec, SECCLASS_SHM);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;

        return avc_has_perm(subj, isec->sid, SECCLASS_SHM, SHM__CREATE,
                            &ad);
}

/* Check shm:associate for the current task on an existing segment. */
static int selinux_shm_associate(struct kern_ipc_perm *shp, int shmflg)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(shp);
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = shp->key;

        return avc_has_perm(subj, isec->sid, SECCLASS_SHM, SHM__ASSOCIATE,
                            &ad);
}

/* Map a shmctl() command onto the permission it requires and check it.
 * Commands not listed here need no SELinux permission and return 0.
 * Note, at this point, shp is locked down.
 */
static int selinux_shm_shmctl(struct kern_ipc_perm *shp, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SHM_INFO:
                /* System-wide information only; no particular object. */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case IPC_STAT:
        case SHM_STAT:
        case SHM_STAT_ANY:
                return ipc_has_perm(shp, SHM__GETATTR | SHM__ASSOCIATE);
        case IPC_SET:
                return ipc_has_perm(shp, SHM__SETATTR);
        case SHM_LOCK:
        case SHM_UNLOCK:
                return ipc_has_perm(shp, SHM__LOCK);
        case IPC_RMID:
                return ipc_has_perm(shp, SHM__DESTROY);
        default:
                return 0;
        }
}

/* Attaching a segment always needs shm:read; shm:write is needed as well
 * unless the attach is read-only (SHM_RDONLY).
 */
static int selinux_shm_shmat(struct kern_ipc_perm *shp,
                             char __user *shmaddr, int shmflg)
{
        u32 perms = SHM__READ;

        if (!(shmflg & SHM_RDONLY))
                perms |= SHM__WRITE;

        return ipc_has_perm(shp, perms);
}

/* Semaphore security operations */
/* Label a new semaphore set and check sem:create for the current task. */
static int selinux_sem_alloc_security(struct kern_ipc_perm *sma)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(sma);
        struct common_audit_data ad;

        ipc_init_security(isec, SECCLASS_SEM);

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;

        return avc_has_perm(subj, isec->sid, SECCLASS_SEM, SEM__CREATE,
                            &ad);
}

/* Check sem:associate for the current task on an existing semaphore set. */
static int selinux_sem_associate(struct kern_ipc_perm *sma, int semflg)
{
        u32 subj = current_sid();
        struct ipc_security_struct *isec = selinux_ipc(sma);
        struct common_audit_data ad;

        ad.type = LSM_AUDIT_DATA_IPC;
        ad.u.ipc_id = sma->key;

        return avc_has_perm(subj, isec->sid, SECCLASS_SEM, SEM__ASSOCIATE,
                            &ad);
}

/* Map a semctl() command onto the permission it requires and check it.
 * Commands not listed here need no SELinux permission and return 0.
 * Note, at this point, sma is locked down.
 */
static int selinux_sem_semctl(struct kern_ipc_perm *sma, int cmd)
{
        switch (cmd) {
        case IPC_INFO:
        case SEM_INFO:
                /* System-wide information only; no particular object. */
                return avc_has_perm(current_sid(), SECINITSID_KERNEL,
                                    SECCLASS_SYSTEM, SYSTEM__IPC_INFO, NULL);
        case GETPID:
        case GETNCNT:
        case GETZCNT:
                return ipc_has_perm(sma, SEM__GETATTR);
        case GETVAL:
        case GETALL:
                return ipc_has_perm(sma, SEM__READ);
        case SETVAL:
        case SETALL:
                return ipc_has_perm(sma, SEM__WRITE);
        case IPC_RMID:
                return ipc_has_perm(sma, SEM__DESTROY);
        case IPC_SET:
                return ipc_has_perm(sma, SEM__SETATTR);
        case IPC_STAT:
        case SEM_STAT:
        case SEM_STAT_ANY:
                return ipc_has_perm(sma, SEM__GETATTR | SEM__ASSOCIATE);
        default:
                return 0;
        }
}

/* A semop() needs sem:read, plus sem:write when @alter is non-zero. */
static int selinux_sem_semop(struct kern_ipc_perm *sma,
                             struct sembuf *sops, unsigned nsops, int alter)
{
        const u32 perms = alter ? (SEM__READ | SEM__WRITE) : SEM__READ;

        return ipc_has_perm(sma, perms);
}

/* Translate the read/write bits of @flag into ipc:unix_read and
 * ipc:unix_write and check them; with neither bit set nothing is checked
 * and 0 is returned.
 */
static int selinux_ipc_permission(struct kern_ipc_perm *ipcp, short flag)
{
        u32 av = 0;

        if (flag & S_IRUGO)
                av |= IPC__UNIX_READ;
        if (flag & S_IWUGO)
                av |= IPC__UNIX_WRITE;

        if (!av)
                return 0;

        return ipc_has_perm(ipcp, av);
}

static void selinux_ipc_getlsmprop(struct kern_ipc_perm *ipcp,
                                   struct lsm_prop *prop)
{
        struct ipc_security_struct *isec = selinux_ipc(ipcp);
        prop->selinux.secid = isec->sid;
}

/* Initialise the inode's security state from its dentry; a NULL inode
 * (nothing to label) is skipped.
 */
static void selinux_d_instantiate(struct dentry *dentry, struct inode *inode)
{
        if (!inode)
                return;

        inode_doinit_with_dentry(inode, dentry);
}

/* Fetch one of task @p's SELinux attributes as a context string.
 *
 * @attr selects which SID of the task's credentials is reported.  Reading
 * another task's attributes requires process:getattr on that task.
 *
 * Returns the context length with *@value set on success, 0 with *@value
 * set to NULL when the selected SID is SECSID_NULL, -EOPNOTSUPP for an
 * unknown @attr, or another negative error.
 */
static int selinux_lsm_getattr(unsigned int attr, struct task_struct *p,
                               char **value)
{
        const struct task_security_struct *tsec;
        int error = 0;
        u32 sid = SECSID_NULL;
        u32 len;

        /* the task's credentials are only read under the RCU read lock */
        rcu_read_lock();
        tsec = selinux_cred(__task_cred(p));
        if (p != current)
                error = avc_has_perm(current_sid(), tsec->sid,
                                     SECCLASS_PROCESS, PROCESS__GETATTR, NULL);
        if (!error) {
                switch (attr) {
                case LSM_ATTR_CURRENT:
                        sid = tsec->sid;
                        break;
                case LSM_ATTR_PREV:
                        sid = tsec->osid;
                        break;
                case LSM_ATTR_EXEC:
                        sid = tsec->exec_sid;
                        break;
                case LSM_ATTR_FSCREATE:
                        sid = tsec->create_sid;
                        break;
                case LSM_ATTR_KEYCREATE:
                        sid = tsec->keycreate_sid;
                        break;
                case LSM_ATTR_SOCKCREATE:
                        sid = tsec->sockcreate_sid;
                        break;
                default:
                        error = -EOPNOTSUPP;
                        break;
                }
        }
        rcu_read_unlock();

        if (error)
                return error;

        /* an unset attribute is reported as "no value", not as an error */
        if (sid == SECSID_NULL) {
                *value = NULL;
                return 0;
        }

        error = security_sid_to_context(sid, value, &len);
        if (!error)
                return len;

        return error;
}

/*
 * Set one of the current task's SELinux process attributes.
 *
 * @attr:  which SID to update: LSM_ATTR_EXEC, LSM_ATTR_FSCREATE,
 *         LSM_ATTR_KEYCREATE, LSM_ATTR_SOCKCREATE or LSM_ATTR_CURRENT.
 *         Any other value fails with -EOPNOTSUPP.
 * @value: context string.  A trailing newline is overwritten with a NUL
 *         in place, so the buffer must be writable.
 * @size:  number of bytes in @value.
 *
 * If @size is zero, or @value starts with a NUL or a newline, no context
 * lookup is done and sid stays 0.  For LSM_ATTR_CURRENT that is rejected
 * with -EINVAL; for the other attributes 0 is stored in the selected field.
 *
 * Returns @size (reduced by one if a trailing newline was stripped) on
 * success, or a negative errno.
 */
static int selinux_lsm_setattr(u64 attr, void *value, size_t size)
{
        struct task_security_struct *tsec;
        struct cred *new;
        u32 mysid = current_sid(), sid = 0, ptsid;
        int error;
        char *str = value;

        /*
         * Basic control over ability to set these attributes at all.
         */
        switch (attr) {
        case LSM_ATTR_EXEC:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETEXEC, NULL);
                break;
        case LSM_ATTR_FSCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETFSCREATE, NULL);
                break;
        case LSM_ATTR_KEYCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETKEYCREATE, NULL);
                break;
        case LSM_ATTR_SOCKCREATE:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETSOCKCREATE, NULL);
                break;
        case LSM_ATTR_CURRENT:
                error = avc_has_perm(mysid, mysid, SECCLASS_PROCESS,
                                     PROCESS__SETCURRENT, NULL);
                break;
        default:
                error = -EOPNOTSUPP;
                break;
        }
        if (error)
                return error;

        /* Obtain a SID for the context, if one was specified. */
        if (size && str[0] && str[0] != '\n') {
                /* Strip one trailing newline, modifying the caller's buffer. */
                if (str[size-1] == '\n') {
                        str[size-1] = 0;
                        size--;
                }
                error = security_context_to_sid(value, size,
                                                &sid, GFP_KERNEL);
                /*
                 * Only fscreate gets a second chance on -EINVAL: if
                 * has_cap_mac_admin(true) succeeds the context is looked up
                 * again with security_context_to_sid_force(); otherwise the
                 * rejected context is audited and the error is returned.
                 */
                if (error == -EINVAL && attr == LSM_ATTR_FSCREATE) {
                        if (!has_cap_mac_admin(true)) {
                                struct audit_buffer *ab;
                                size_t audit_size;

                                /* We strip a nul only if it is at the end,
                                 * otherwise the context contains a nul and
                                 * we should audit that */
                                if (str[size - 1] == '\0')
                                        audit_size = size - 1;
                                else
                                        audit_size = size;
                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                /* No audit buffer: still fail, just unlogged. */
                                if (!ab)
                                        return error;
                                audit_log_format(ab, "op=fscreate invalid_context=");
                                audit_log_n_untrustedstring(ab, value,
                                                            audit_size);
                                audit_log_end(ab);

                                return error;
                        }
                        error = security_context_to_sid_force(value, size,
                                                        &sid);
                }
                if (error)
                        return error;
        }

        /*
         * All updates are made on a fresh credential set from
         * prepare_creds(); it is committed only after every check below has
         * passed and discarded through abort_creds() otherwise.
         */
        new = prepare_creds();
        if (!new)
                return -ENOMEM;

        /* Permission checking based on the specified context is
           performed during the actual operation (execve,
           open/mkdir/...), when we know the full context of the
           operation.  See selinux_bprm_creds_for_exec for the execve
           checks and may_create for the file creation checks. The
           operation will then fail if the context is not permitted. */
        tsec = selinux_cred(new);
        if (attr == LSM_ATTR_EXEC) {
                tsec->exec_sid = sid;
        } else if (attr == LSM_ATTR_FSCREATE) {
                tsec->create_sid = sid;
        } else if (attr == LSM_ATTR_KEYCREATE) {
                /* A non-zero keycreate SID must pass KEY__CREATE up front. */
                if (sid) {
                        error = avc_has_perm(mysid, sid,
                                             SECCLASS_KEY, KEY__CREATE, NULL);
                        if (error)
                                goto abort_change;
                }
                tsec->keycreate_sid = sid;
        } else if (attr == LSM_ATTR_SOCKCREATE) {
                tsec->sockcreate_sid = sid;
        } else if (attr == LSM_ATTR_CURRENT) {
                /* Changing the current SID requires an explicit target SID. */
                error = -EINVAL;
                if (sid == 0)
                        goto abort_change;

                /*
                 * A task that is not single-threaded must additionally pass
                 * security_bounded_transition() from its present SID.
                 */
                if (!current_is_single_threaded()) {
                        error = security_bounded_transition(tsec->sid, sid);
                        if (error)
                                goto abort_change;
                }

                /* Check permissions for the transition. */
                error = avc_has_perm(tsec->sid, sid, SECCLASS_PROCESS,
                                     PROCESS__DYNTRANSITION, NULL);
                if (error)
                        goto abort_change;

                /* Check for ptracing, and update the task SID if ok.
                   Otherwise, leave SID unchanged and fail. */
                ptsid = ptrace_parent_sid();
                if (ptsid != 0) {
                        error = avc_has_perm(ptsid, sid, SECCLASS_PROCESS,
                                             PROCESS__PTRACE, NULL);
                        if (error)
                                goto abort_change;
                }

                tsec->sid = sid;
        } else {
                /*
                 * The switch at the top already returned for every other
                 * @attr value, so this branch is only a safeguard.
                 */
                error = -EINVAL;
                goto abort_change;
        }

        commit_creds(new);
        /* Success reports the number of bytes consumed. */
        return size;

abort_change:
        abort_creds(new);
        return error;
}

/**
 * selinux_getselfattr - Get a SELinux attribute of the current task
 * @attr: the requested attribute
 * @ctx: user space buffer that receives the result
 * @size: buffer size (input), buffer size used (output)
 * @flags: unused
 *
 * Look up the requested attribute for the current task and hand it to
 * lsm_fill_user_ctx() so that it is stored in the user space @ctx.
 *
 * Returns 1, the number of attributes reported, on success and an error
 * code otherwise.
 */
static int selinux_getselfattr(unsigned int attr, struct lsm_ctx __user *ctx,
                               u32 *size, u32 flags)
{
        char *context = NULL;
        int len;
        int err;

        len = selinux_lsm_getattr(attr, current, &context);
        if (len < 0)
                return len;

        err = lsm_fill_user_ctx(ctx, size, context, len, LSM_ID_SELINUX, 0);
        kfree(context);
        if (err)
                return err;

        return 1;
}

/*
 * Set a SELinux attribute of the current task from the context carried in
 * @ctx.  @size and @flags are not used here.
 *
 * Returns 0 on success or the error reported by selinux_lsm_setattr().
 */
static int selinux_setselfattr(unsigned int attr, struct lsm_ctx *ctx,
                               u32 size, u32 flags)
{
        int rc = selinux_lsm_setattr(attr, ctx->ctx, ctx->ctx_len);

        /* A positive result is a byte count; report it as plain success. */
        return rc > 0 ? 0 : rc;
}

/*
 * Name-based front end to selinux_lsm_getattr().
 *
 * @name is translated with lsm_name_to_attr().  A name that maps to 0, or
 * an attribute that selinux_lsm_getattr() rejects with -EOPNOTSUPP, is
 * reported as -EINVAL; every other result is passed through unchanged.
 */
static int selinux_getprocattr(struct task_struct *p,
                               const char *name, char **value)
{
        unsigned int attr = lsm_name_to_attr(name);
        int rc;

        if (!attr)
                return -EINVAL;

        rc = selinux_lsm_getattr(attr, p, value);
        if (rc == -EOPNOTSUPP)
                return -EINVAL;

        return rc;
}

/*
 * Name-based front end to selinux_lsm_setattr().  A @name that
 * lsm_name_to_attr() maps to 0 is rejected with -EINVAL.
 */
static int selinux_setprocattr(const char *name, void *value, size_t size)
{
        int attr = lsm_name_to_attr(name);

        if (!attr)
                return -EINVAL;

        return selinux_lsm_setattr(attr, value, size);
}

/* Return 1 if @name equals XATTR_SELINUX_SUFFIX, 0 otherwise. */
static int selinux_ismaclabel(const char *name)
{
        return !strcmp(name, XATTR_SELINUX_SUFFIX);
}

/*
 * Convert @secid to a context string.
 *
 * With a non-NULL @cp the context and its length are stored in @cp, which
 * is tagged with LSM_ID_SELINUX.  With a NULL @cp only the length is
 * computed.  Returns the context length, or a negative error code.
 */
static int selinux_secid_to_secctx(u32 secid, struct lsm_context *cp)
{
        u32 ctx_len;
        int rc;

        if (!cp) {
                /* Length-only query: no context buffer is requested. */
                rc = security_sid_to_context(secid, NULL, &ctx_len);
                if (rc < 0)
                        return rc;
                return ctx_len;
        }

        cp->id = LSM_ID_SELINUX;
        rc = security_sid_to_context(secid, &cp->context, &cp->len);
        if (rc < 0)
                return rc;

        return cp->len;
}

static int selinux_lsmprop_to_secctx(struct lsm_prop *prop,
                                     struct lsm_context *cp)
{
        return selinux_secid_to_secctx(prop->selinux.secid, cp);
}

/*
 * Translate the context string @secdata of @seclen bytes into a SID stored
 * in *@secid, returning the result of security_context_to_sid().
 */
static int selinux_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
{
        int rc;

        rc = security_context_to_sid(secdata, seclen, secid, GFP_KERNEL);
        return rc;
}

/*
 * Free the context string in @cp and mark @cp as undefined.  Contexts whose
 * id is not LSM_ID_SELINUX are left untouched.
 */
static void selinux_release_secctx(struct lsm_context *cp)
{
        if (cp->id != LSM_ID_SELINUX)
                return;

        kfree(cp->context);
        cp->context = NULL;
        cp->id = LSM_ID_UNDEF;
}

/*
 * Flag the SELinux label state of @inode as LABEL_INVALID.  The update is
 * made under the inode security blob's spinlock.
 */
static void selinux_inode_invalidate_secctx(struct inode *inode)
{
        struct inode_security_struct *blob;

        blob = selinux_inode(inode);

        spin_lock(&blob->lock);
        blob->initialized = LABEL_INVALID;
        spin_unlock(&blob->lock);
}

/*
 * Apply the context @ctx to @inode through selinux_inode_setsecurity().
 *
 *        called with inode->i_mutex locked
 */
static int selinux_inode_notifysecctx(struct inode *inode, void *ctx, u32 ctxlen)
{
        int rc;

        rc = selinux_inode_setsecurity(inode, XATTR_SELINUX_SUFFIX,
                                       ctx, ctxlen, 0);

        /* Do not return error when suppressing label (SBLABEL_MNT not set). */
        if (rc == -EOPNOTSUPP)
                return 0;

        return rc;
}

/*
 * Write @ctx as the XATTR_NAME_SELINUX extended attribute of @dentry via
 * __vfs_setxattr_locked().
 *
 *        called with inode->i_mutex locked
 */
static int selinux_inode_setsecctx(struct dentry *dentry, void *ctx, u32 ctxlen)
{
        int rc;

        rc = __vfs_setxattr_locked(&nop_mnt_idmap, dentry, XATTR_NAME_SELINUX,
                                   ctx, ctxlen, 0, NULL);
        return rc;
}

/*
 * Fetch the SELinux context of @inode into @cp.
 *
 * On success @cp receives the context, its length and LSM_ID_SELINUX, and 0
 * is returned.  A negative result from selinux_inode_getsecurity() is
 * returned as is.
 */
static int selinux_inode_getsecctx(struct inode *inode, struct lsm_context *cp)
{
        int rc = selinux_inode_getsecurity(&nop_mnt_idmap, inode,
                                           XATTR_SELINUX_SUFFIX,
                                           (void **)&cp->context, true);

        if (rc < 0)
                return rc;

        cp->id = LSM_ID_SELINUX;
        cp->len = rc;
        return 0;
}
#ifdef CONFIG_KEYS

/*
 * Label a key: use the keycreate SID of @cred when it is set, otherwise the
 * SID of @cred itself.  Always returns 0.
 */
static int selinux_key_alloc(struct key *k, const struct cred *cred,
                             unsigned long flags)
{
        const struct task_security_struct *tsec = selinux_cred(cred);
        struct key_security_struct *ksec = selinux_key(k);

        ksec->sid = tsec->keycreate_sid ? tsec->keycreate_sid : tsec->sid;

        return 0;
}

/*
 * Check whether @cred may perform the access described by @need_perm on
 * the key referenced by @key_ref.
 *
 * The request is translated into a KEY__* permission and checked between
 * the SID of @cred and the SID stored with the key.  Four request types are
 * granted without a check (see the switch); an unknown request triggers a
 * warning and is denied with -EPERM.
 */
static int selinux_key_permission(key_ref_t key_ref,
                                  const struct cred *cred,
                                  enum key_need_perm need_perm)
{
        u32 requested;

        switch (need_perm) {
        case KEY_NEED_UNLINK:
        case KEY_SYSADMIN_OVERRIDE:
        case KEY_AUTHTOKEN_OVERRIDE:
        case KEY_DEFER_PERM_CHECK:
                /* No SELinux permission is checked for these requests. */
                return 0;
        case KEY_NEED_VIEW:
                requested = KEY__VIEW;
                break;
        case KEY_NEED_READ:
                requested = KEY__READ;
                break;
        case KEY_NEED_WRITE:
                requested = KEY__WRITE;
                break;
        case KEY_NEED_SEARCH:
                requested = KEY__SEARCH;
                break;
        case KEY_NEED_LINK:
                requested = KEY__LINK;
                break;
        case KEY_NEED_SETATTR:
                requested = KEY__SETATTR;
                break;
        default:
                WARN_ON(1);
                return -EPERM;
        }

        return avc_has_perm(cred_sid(cred),
                            selinux_key(key_ref_to_ptr(key_ref))->sid,
                            SECCLASS_KEY, requested, NULL);
}

static int selinux_key_getsecurity(struct key *key, char **_buffer)
{
        struct key_security_struct *ksec = selinux_key(key);
        char *context = NULL;
        unsigned len;
        int rc;

        rc = security_sid_to_context(ksec->sid,
                                     &context, &len);
        if (!rc)
                rc = len;
        *_buffer = context;
        return rc;
}

#ifdef CONFIG_KEY_NOTIFICATIONS
static int selinux_watch_key(struct key *key)
{
        struct key_security_struct *ksec = selinux_key(key);
        u32 sid = current_sid();

        return avc_has_perm(sid, ksec->sid, SECCLASS_KEY, KEY__VIEW, NULL);
}
#endif
#endif

#ifdef CONFIG_SECURITY_INFINIBAND
/*
 * Check INFINIBAND_PKEY__ACCESS between the SID stored in @ib_sec and the
 * SID that sel_ib_pkey_sid() returns for (@subnet_prefix, @pkey_val).
 *
 * The SELinux state inside @ib_sec is located with selinux_ib(), the same
 * accessor selinux_ib_alloc_security() uses when it stores the SID.
 * Treating @ib_sec itself as the SELinux structure only gives the same
 * result while SELinux's lbs_ib offset is zero.
 *
 * Returns 0 if access is granted, otherwise the error from the SID lookup
 * or from avc_has_perm().
 */
static int selinux_ib_pkey_access(void *ib_sec, u64 subnet_prefix, u16 pkey_val)
{
        struct common_audit_data ad;
        int err;
        u32 sid = 0;
        struct ib_security_struct *sec = selinux_ib(ib_sec);
        struct lsm_ibpkey_audit ibpkey;

        err = sel_ib_pkey_sid(subnet_prefix, pkey_val, &sid);
        if (err)
                return err;

        /* Attach the pkey being accessed to the audit data. */
        ad.type = LSM_AUDIT_DATA_IBPKEY;
        ibpkey.subnet_prefix = subnet_prefix;
        ibpkey.pkey = pkey_val;
        ad.u.ibpkey = &ibpkey;
        return avc_has_perm(sec->sid, sid,
                            SECCLASS_INFINIBAND_PKEY,
                            INFINIBAND_PKEY__ACCESS, &ad);
}

/*
 * Check INFINIBAND_ENDPORT__MANAGE_SUBNET between the SID stored in @ib_sec
 * and the SID that security_ib_endport_sid() returns for
 * (@dev_name, @port_num).
 *
 * The SELinux state inside @ib_sec is located with selinux_ib(), the same
 * accessor selinux_ib_alloc_security() uses when it stores the SID.
 * Treating @ib_sec itself as the SELinux structure only gives the same
 * result while SELinux's lbs_ib offset is zero.
 *
 * Returns 0 if access is granted, otherwise the error from the SID lookup
 * or from avc_has_perm().
 */
static int selinux_ib_endport_manage_subnet(void *ib_sec, const char *dev_name,
                                            u8 port_num)
{
        struct common_audit_data ad;
        int err;
        u32 sid = 0;
        struct ib_security_struct *sec = selinux_ib(ib_sec);
        struct lsm_ibendport_audit ibendport;

        err = security_ib_endport_sid(dev_name, port_num,
                                      &sid);

        if (err)
                return err;

        /* Attach the end port being managed to the audit data. */
        ad.type = LSM_AUDIT_DATA_IBENDPORT;
        ibendport.dev_name = dev_name;
        ibendport.port = port_num;
        ad.u.ibendport = &ibendport;
        return avc_has_perm(sec->sid, sid,
                            SECCLASS_INFINIBAND_ENDPORT,
                            INFINIBAND_ENDPORT__MANAGE_SUBNET, &ad);
}

static int selinux_ib_alloc_security(void *ib_sec)
{
        struct ib_security_struct *sec = selinux_ib(ib_sec);

        sec->sid = current_sid();
        return 0;
}
#endif

#ifdef CONFIG_BPF_SYSCALL
/*
 * Check the bpf() command @cmd.
 *
 * BPF_MAP_CREATE and BPF_PROG_LOAD are checked against the current SID
 * (BPF__MAP_CREATE and BPF__PROG_LOAD respectively).  Every other command
 * returns 0 without a check.  @attr, @size and @kernel are not used.
 */
static int selinux_bpf(int cmd, union bpf_attr *attr,
                       unsigned int size, bool kernel)
{
        u32 sid = current_sid();

        switch (cmd) {
        case BPF_MAP_CREATE:
                return avc_has_perm(sid, sid, SECCLASS_BPF,
                                    BPF__MAP_CREATE, NULL);
        case BPF_PROG_LOAD:
                return avc_has_perm(sid, sid, SECCLASS_BPF,
                                    BPF__PROG_LOAD, NULL);
        default:
                return 0;
        }
}

/*
 * Map the FMODE_READ / FMODE_WRITE bits of @fmode to the matching
 * BPF__MAP_READ / BPF__MAP_WRITE permission bits.
 */
static u32 bpf_map_fmode_to_av(fmode_t fmode)
{
        u32 read_av = (fmode & FMODE_READ) ? BPF__MAP_READ : 0;
        u32 write_av = (fmode & FMODE_WRITE) ? BPF__MAP_WRITE : 0;

        return read_av | write_av;
}

/*
 * Check a file that is being passed to another task (for example over a
 * unix socket or through binder) to see whether it is a bpf object, and if
 * so apply the corresponding bpf check for @sid.
 *
 * bpf maps and programs, unlike other files and sockets, use a shared
 * anonymous inode inside the kernel as their inode, so checking that inode
 * cannot tell whether the receiver may access the bpf object.  The file
 * operations are used to identify the object type instead:
 *  - bpf map:     permissions derived from the file mode,
 *  - bpf program: BPF__PROG_RUN,
 *  - other files: no check, 0 is returned.
 */
static int bpf_fd_pass(const struct file *file, u32 sid)
{
        const struct bpf_security_struct *bpfsec;

        if (file->f_op == &bpf_map_fops) {
                const struct bpf_map *map = file->private_data;

                bpfsec = map->security;
                return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
                                    bpf_map_fmode_to_av(file->f_mode), NULL);
        }

        if (file->f_op == &bpf_prog_fops) {
                const struct bpf_prog *prog = file->private_data;

                bpfsec = prog->aux->security;
                return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
                                    BPF__PROG_RUN, NULL);
        }

        return 0;
}

/*
 * Check the current task's access to @map, with the permissions derived
 * from @fmode by bpf_map_fmode_to_av().
 */
static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
{
        const struct bpf_security_struct *map_sec = map->security;

        return avc_has_perm(current_sid(), map_sec->sid, SECCLASS_BPF,
                            bpf_map_fmode_to_av(fmode), NULL);
}

static int selinux_bpf_prog(struct bpf_prog *prog)
{
        u32 sid = current_sid();
        struct bpf_security_struct *bpfsec;

        bpfsec = prog->aux->security;
        return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
                            BPF__PROG_RUN, NULL);
}

/*
 * Allocate the SELinux state for a new bpf map and label it with the
 * current task's SID.  @attr, @token and @kernel are not used.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_bpf_map_create(struct bpf_map *map, union bpf_attr *attr,
                                  struct bpf_token *token, bool kernel)
{
        struct bpf_security_struct *map_sec;

        map_sec = kzalloc(sizeof(*map_sec), GFP_KERNEL);
        if (map_sec == NULL)
                return -ENOMEM;

        map_sec->sid = current_sid();

        map->security = map_sec;
        return 0;
}

/* Detach the SELinux state from @map and free it. */
static void selinux_bpf_map_free(struct bpf_map *map)
{
        void *map_sec = map->security;

        /* Clear the field before the memory is released. */
        map->security = NULL;
        kfree(map_sec);
}

/*
 * Allocate the SELinux state for a bpf program being loaded and label it
 * with the current task's SID.  @attr, @token and @kernel are not used.
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int selinux_bpf_prog_load(struct bpf_prog *prog, union bpf_attr *attr,
                                 struct bpf_token *token, bool kernel)
{
        struct bpf_security_struct *prog_sec;

        prog_sec = kzalloc(sizeof(*prog_sec), GFP_KERNEL);
        if (prog_sec == NULL)
                return -ENOMEM;

        prog_sec->sid = current_sid();

        prog->aux->security = prog_sec;
        return 0;
}

/* Detach the SELinux state from @prog and free it. */
static void selinux_bpf_prog_free(struct bpf_prog *prog)
{
        void *prog_sec = prog->aux->security;

        /* Clear the field before the memory is released. */
        prog->aux->security = NULL;
        kfree(prog_sec);
}

static int selinux_bpf_token_create(struct bpf_token *token, union bpf_attr *attr,
                                    const struct path *path)
{
        struct bpf_security_struct *bpfsec;

        bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
        if (!bpfsec)
                return -ENOMEM;

        bpfsec->sid = current_sid();
        token->security = bpfsec;

        return 0;
}

/* Detach the SELinux state from @token and free it. */
static void selinux_bpf_token_free(struct bpf_token *token)
{
        void *token_sec = token->security;

        /* Clear the field before the memory is released. */
        token->security = NULL;
        kfree(token_sec);
}
#endif

/*
 * Per-object security blob sizes (and the inode-init xattr slot count)
 * declared by SELinux, listed alphabetically by field.
 */
struct lsm_blob_sizes selinux_blob_sizes __ro_after_init = {
        .lbs_cred = sizeof(struct task_security_struct),
        .lbs_file = sizeof(struct file_security_struct),
        .lbs_ib = sizeof(struct ib_security_struct),
        .lbs_inode = sizeof(struct inode_security_struct),
        .lbs_ipc = sizeof(struct ipc_security_struct),
        .lbs_key = sizeof(struct key_security_struct),
        .lbs_msg_msg = sizeof(struct msg_security_struct),
#ifdef CONFIG_PERF_EVENTS
        .lbs_perf_event = sizeof(struct perf_event_security_struct),
#endif
        .lbs_sock = sizeof(struct sk_security_struct),
        .lbs_superblock = sizeof(struct superblock_security_struct),
        .lbs_tun_dev = sizeof(struct tun_security_struct),
        .lbs_xattr_count = SELINUX_INODE_INIT_XATTRS,
};

#ifdef CONFIG_PERF_EVENTS
/*
 * Check whether the current task may open a perf event of kind @type.
 *
 * @type is one of the PERF_SECURITY_* values handled below and is mapped to
 * the matching PERF_EVENT__* permission, checked against the current SID.
 * Any other @type is rejected with -EINVAL.
 */
static int selinux_perf_event_open(int type)
{
        u32 sid = current_sid();
        u32 requested;

        switch (type) {
        case PERF_SECURITY_OPEN:
                requested = PERF_EVENT__OPEN;
                break;
        case PERF_SECURITY_CPU:
                requested = PERF_EVENT__CPU;
                break;
        case PERF_SECURITY_KERNEL:
                requested = PERF_EVENT__KERNEL;
                break;
        case PERF_SECURITY_TRACEPOINT:
                requested = PERF_EVENT__TRACEPOINT;
                break;
        default:
                return -EINVAL;
        }

        return avc_has_perm(sid, sid, SECCLASS_PERF_EVENT,
                            requested, NULL);
}

/*
 * Store the current task's SID in the SELinux part of @event's security
 * blob.  Always returns 0.
 */
static int selinux_perf_event_alloc(struct perf_event *event)
{
        selinux_perf_event(event->security)->sid = current_sid();

        return 0;
}

/*
 * Check PERF_EVENT__READ between the current SID and the SID stored for
 * @event.
 *
 * The SELinux state is located with selinux_perf_event(), the same accessor
 * selinux_perf_event_alloc() uses when it stores the SID.  Treating
 * event->security itself as the SELinux structure only gives the same
 * result while SELinux's lbs_perf_event offset is zero.
 *
 * Returns the result of avc_has_perm().
 */
static int selinux_perf_event_read(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;
        u32 sid = current_sid();

        perfsec = selinux_perf_event(event->security);

        return avc_has_perm(sid, perfsec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__READ, NULL);
}

/*
 * Check PERF_EVENT__WRITE between the current SID and the SID stored for
 * @event.
 *
 * The SELinux state is located with selinux_perf_event(), the same accessor
 * selinux_perf_event_alloc() uses when it stores the SID.  Treating
 * event->security itself as the SELinux structure only gives the same
 * result while SELinux's lbs_perf_event offset is zero.
 *
 * Returns the result of avc_has_perm().
 */
static int selinux_perf_event_write(struct perf_event *event)
{
        struct perf_event_security_struct *perfsec;
        u32 sid = current_sid();

        perfsec = selinux_perf_event(event->security);

        return avc_has_perm(sid, perfsec->sid,
                            SECCLASS_PERF_EVENT, PERF_EVENT__WRITE, NULL);
}
#endif

#ifdef CONFIG_IO_URING
/**
 * selinux_uring_override_creds - check the requested cred override
 * @new: the target creds
 *
 * Check to see if the current task is allowed to override it's credentials
 * to service an io_uring operation.
 */
static int selinux_uring_override_creds(const struct cred *new)
{
        return avc_has_perm(current_sid(), cred_sid(new),
                            SECCLASS_IO_URING, IO_URING__OVERRIDE_CREDS, NULL);
}

/**
 * selinux_uring_sqpoll - check if an io_uring polling thread can be created
 *
 * Check to see if the current task is allowed to create a new io_uring
 * kernel polling thread (IO_URING__SQPOLL against its own SID).
 */
static int selinux_uring_sqpoll(void)
{
        const u32 self_sid = current_sid();

        return avc_has_perm(self_sid, self_sid, SECCLASS_IO_URING,
                            IO_URING__SQPOLL, NULL);
}

/**
 * selinux_uring_cmd - check if IORING_OP_URING_CMD is allowed
 * @ioucmd: the io_uring command structure
 *
 * Check to see if the current domain is allowed to execute an
 * IORING_OP_URING_CMD against the device/file specified in @ioucmd.  The
 * check is made against the SID of the file's inode, and the file is
 * attached to the audit data.
 */
static int selinux_uring_cmd(struct io_uring_cmd *ioucmd)
{
        struct common_audit_data ad;
        struct file *target = ioucmd->file;
        struct inode_security_struct *isec;

        isec = selinux_inode(file_inode(target));

        ad.type = LSM_AUDIT_DATA_FILE;
        ad.u.file = target;

        return avc_has_perm(current_sid(), isec->sid,
                            SECCLASS_IO_URING, IO_URING__CMD, &ad);
}

/**
 * selinux_uring_allowed - check if io_uring_setup() can be called
 *
 * Check to see if the current task is allowed to call io_uring_setup()
 * (IO_URING__ALLOWED against its own SID).
 */
static int selinux_uring_allowed(void)
{
        const u32 self_sid = current_sid();

        return avc_has_perm(self_sid, self_sid,
                            SECCLASS_IO_URING, IO_URING__ALLOWED, NULL);
}
#endif /* CONFIG_IO_URING */

/* Identity of this LSM: its numeric id and its name string. */
static const struct lsm_id selinux_lsmid = {
        .id = LSM_ID_SELINUX,
        .name = "selinux",
};

/*
 * IMPORTANT NOTE: When adding new hooks, please be careful to keep this order:
 * 1. any hooks that don't belong to (2.) or (3.) below,
 * 2. hooks that both access structures allocated by other hooks, and allocate
 *    structures that can be later accessed by other hooks (mostly "cloning"
 *    hooks),
 * 3. hooks that only allocate structures that can be later accessed by other
 *    hooks ("allocating" hooks).
 *
 * Please follow block comment delimiters in the list to keep this order.
 */
static struct security_hook_list selinux_hooks[] __ro_after_init = {
        LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
        LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
        LSM_HOOK_INIT(binder_transfer_binder, selinux_binder_transfer_binder),
        LSM_HOOK_INIT(binder_transfer_file, selinux_binder_transfer_file),

        LSM_HOOK_INIT(ptrace_access_check, selinux_ptrace_access_check),
        LSM_HOOK_INIT(ptrace_traceme, selinux_ptrace_traceme),
        LSM_HOOK_INIT(capget, selinux_capget),
        LSM_HOOK_INIT(capset, selinux_capset),
        LSM_HOOK_INIT(capable, selinux_capable),
        LSM_HOOK_INIT(quotactl, selinux_quotactl),
        LSM_HOOK_INIT(quota_on, selinux_quota_on),
        LSM_HOOK_INIT(syslog, selinux_syslog),
        LSM_HOOK_INIT(vm_enough_memory, selinux_vm_enough_memory),

        LSM_HOOK_INIT(netlink_send, selinux_netlink_send),

        LSM_HOOK_INIT(bprm_creds_for_exec, selinux_bprm_creds_for_exec),
        LSM_HOOK_INIT(bprm_committing_creds, selinux_bprm_committing_creds),
        LSM_HOOK_INIT(bprm_committed_creds, selinux_bprm_committed_creds),

        LSM_HOOK_INIT(sb_free_mnt_opts, selinux_free_mnt_opts),
        LSM_HOOK_INIT(sb_mnt_opts_compat, selinux_sb_mnt_opts_compat),
        LSM_HOOK_INIT(sb_remount, selinux_sb_remount),
        LSM_HOOK_INIT(sb_kern_mount, selinux_sb_kern_mount),
        LSM_HOOK_INIT(sb_show_options, selinux_sb_show_options),
        LSM_HOOK_INIT(sb_statfs, selinux_sb_statfs),
        LSM_HOOK_INIT(sb_mount, selinux_mount),
        LSM_HOOK_INIT(sb_umount, selinux_umount),
        LSM_HOOK_INIT(sb_set_mnt_opts, selinux_set_mnt_opts),
        LSM_HOOK_INIT(sb_clone_mnt_opts, selinux_sb_clone_mnt_opts),

        LSM_HOOK_INIT(move_mount, selinux_move_mount),

        LSM_HOOK_INIT(dentry_init_security, selinux_dentry_init_security),
        LSM_HOOK_INIT(dentry_create_files_as, selinux_dentry_create_files_as),

        LSM_HOOK_INIT(inode_free_security, selinux_inode_free_security),
        LSM_HOOK_INIT(inode_init_security, selinux_inode_init_security),
        LSM_HOOK_INIT(inode_init_security_anon, selinux_inode_init_security_anon),
        LSM_HOOK_INIT(inode_create, selinux_inode_create),
        LSM_HOOK_INIT(inode_link, selinux_inode_link),
        LSM_HOOK_INIT(inode_unlink, selinux_inode_unlink),
        LSM_HOOK_INIT(inode_symlink, selinux_inode_symlink),
        LSM_HOOK_INIT(inode_mkdir, selinux_inode_mkdir),
        LSM_HOOK_INIT(inode_rmdir, selinux_inode_rmdir),
        LSM_HOOK_INIT(inode_mknod, selinux_inode_mknod),
        LSM_HOOK_INIT(inode_rename, selinux_inode_rename),
        LSM_HOOK_INIT(inode_readlink, selinux_inode_readlink),
        LSM_HOOK_INIT(inode_follow_link, selinux_inode_follow_link),
        LSM_HOOK_INIT(inode_permission, selinux_inode_permission),
        LSM_HOOK_INIT(inode_setattr, selinux_inode_setattr),
        LSM_HOOK_INIT(inode_getattr, selinux_inode_getattr),
        LSM_HOOK_INIT(inode_xattr_skipcap, selinux_inode_xattr_skipcap),
        LSM_HOOK_INIT(inode_setxattr, selinux_inode_setxattr),
        LSM_HOOK_INIT(inode_post_setxattr, selinux_inode_post_setxattr),
        LSM_HOOK_INIT(inode_getxattr, selinux_inode_getxattr),
        LSM_HOOK_INIT(inode_listxattr, selinux_inode_listxattr),
        LSM_HOOK_INIT(inode_removexattr, selinux_inode_removexattr),
        LSM_HOOK_INIT(inode_set_acl, selinux_inode_set_acl),
        LSM_HOOK_INIT(inode_get_acl, selinux_inode_get_acl),
        LSM_HOOK_INIT(inode_remove_acl, selinux_inode_remove_acl),
        LSM_HOOK_INIT(inode_getsecurity, selinux_inode_getsecurity),
        LSM_HOOK_INIT(inode_setsecurity, selinux_inode_setsecurity),
        LSM_HOOK_INIT(inode_listsecurity, selinux_inode_listsecurity),
        LSM_HOOK_INIT(inode_getlsmprop, selinux_inode_getlsmprop),
        LSM_HOOK_INIT(inode_copy_up, selinux_inode_copy_up),
        LSM_HOOK_INIT(inode_copy_up_xattr, selinux_inode_copy_up_xattr),
        LSM_HOOK_INIT(path_notify, selinux_path_notify),

        LSM_HOOK_INIT(kernfs_init_security, selinux_kernfs_init_security),

        LSM_HOOK_INIT(file_permission, selinux_file_permission),
        LSM_HOOK_INIT(file_alloc_security, selinux_file_alloc_security),
        LSM_HOOK_INIT(file_ioctl, selinux_file_ioctl),
        LSM_HOOK_INIT(file_ioctl_compat, selinux_file_ioctl_compat),
        LSM_HOOK_INIT(mmap_file, selinux_mmap_file),
        LSM_HOOK_INIT(mmap_addr, selinux_mmap_addr),
        LSM_HOOK_INIT(file_mprotect, selinux_file_mprotect),
        LSM_HOOK_INIT(file_lock, selinux_file_lock),
        LSM_HOOK_INIT(file_fcntl, selinux_file_fcntl),
        LSM_HOOK_INIT(file_set_fowner, selinux_file_set_fowner),
        LSM_HOOK_INIT(file_send_sigiotask, selinux_file_send_sigiotask),
        LSM_HOOK_INIT(file_receive, selinux_file_receive),

        LSM_HOOK_INIT(file_open, selinux_file_open),

        LSM_HOOK_INIT(task_alloc, selinux_task_alloc),
        LSM_HOOK_INIT(cred_prepare, selinux_cred_prepare),
        LSM_HOOK_INIT(cred_transfer, selinux_cred_transfer),
        LSM_HOOK_INIT(cred_getsecid, selinux_cred_getsecid),
        LSM_HOOK_INIT(cred_getlsmprop, selinux_cred_getlsmprop),
        LSM_HOOK_INIT(kernel_act_as, selinux_kernel_act_as),
        LSM_HOOK_INIT(kernel_create_files_as, selinux_kernel_create_files_as),
        LSM_HOOK_INIT(kernel_module_request, selinux_kernel_module_request),
        LSM_HOOK_INIT(kernel_load_data, selinux_kernel_load_data),
        LSM_HOOK_INIT(kernel_read_file, selinux_kernel_read_file),
        LSM_HOOK_INIT(task_setpgid, selinux_task_setpgid),
        LSM_HOOK_INIT(task_getpgid, selinux_task_getpgid),
        LSM_HOOK_INIT(task_getsid, selinux_task_getsid),
        LSM_HOOK_INIT(current_getlsmprop_subj, selinux_current_getlsmprop_subj),
        LSM_HOOK_INIT(task_getlsmprop_obj, selinux_task_getlsmprop_obj),
        LSM_HOOK_INIT(task_setnice, selinux_task_setnice),
        LSM_HOOK_INIT(task_setioprio, selinux_task_setioprio),
        LSM_HOOK_INIT(task_getioprio, selinux_task_getioprio),
        LSM_HOOK_INIT(task_prlimit, selinux_task_prlimit),
        LSM_HOOK_INIT(task_setrlimit, selinux_task_setrlimit),
        LSM_HOOK_INIT(task_setscheduler, selinux_task_setscheduler),
        LSM_HOOK_INIT(task_getscheduler, selinux_task_getscheduler),
        LSM_HOOK_INIT(task_movememory, selinux_task_movememory),
        LSM_HOOK_INIT(task_kill, selinux_task_kill),
        LSM_HOOK_INIT(task_to_inode, selinux_task_to_inode),
        LSM_HOOK_INIT(userns_create, selinux_userns_create),

        LSM_HOOK_INIT(ipc_permission, selinux_ipc_permission),
        LSM_HOOK_INIT(ipc_getlsmprop, selinux_ipc_getlsmprop),

        LSM_HOOK_INIT(msg_queue_associate, selinux_msg_queue_associate),
        LSM_HOOK_INIT(msg_queue_msgctl, selinux_msg_queue_msgctl),
        LSM_HOOK_INIT(msg_queue_msgsnd, selinux_msg_queue_msgsnd),
        LSM_HOOK_INIT(msg_queue_msgrcv, selinux_msg_queue_msgrcv),

        LSM_HOOK_INIT(shm_associate, selinux_shm_associate),
        LSM_HOOK_INIT(shm_shmctl, selinux_shm_shmctl),
        LSM_HOOK_INIT(shm_shmat, selinux_shm_shmat),

        LSM_HOOK_INIT(sem_associate, selinux_sem_associate),
        LSM_HOOK_INIT(sem_semctl, selinux_sem_semctl),
        LSM_HOOK_INIT(sem_semop, selinux_sem_semop),

        LSM_HOOK_INIT(d_instantiate, selinux_d_instantiate),

        LSM_HOOK_INIT(getselfattr, selinux_getselfattr),
        LSM_HOOK_INIT(setselfattr, selinux_setselfattr),
        LSM_HOOK_INIT(getprocattr, selinux_getprocattr),
        LSM_HOOK_INIT(setprocattr, selinux_setprocattr),

        LSM_HOOK_INIT(ismaclabel, selinux_ismaclabel),
        LSM_HOOK_INIT(secctx_to_secid, selinux_secctx_to_secid),
        LSM_HOOK_INIT(release_secctx, selinux_release_secctx),
        LSM_HOOK_INIT(inode_invalidate_secctx, selinux_inode_invalidate_secctx),
        LSM_HOOK_INIT(inode_notifysecctx, selinux_inode_notifysecctx),
        LSM_HOOK_INIT(inode_setsecctx, selinux_inode_setsecctx),

        LSM_HOOK_INIT(unix_stream_connect, selinux_socket_unix_stream_connect),
        LSM_HOOK_INIT(unix_may_send, selinux_socket_unix_may_send),

        LSM_HOOK_INIT(socket_create, selinux_socket_create),
        LSM_HOOK_INIT(socket_post_create, selinux_socket_post_create),
        LSM_HOOK_INIT(socket_socketpair, selinux_socket_socketpair),
        LSM_HOOK_INIT(socket_bind, selinux_socket_bind),
        LSM_HOOK_INIT(socket_connect, selinux_socket_connect),
        LSM_HOOK_INIT(socket_listen, selinux_socket_listen),
        LSM_HOOK_INIT(socket_accept, selinux_socket_accept),
        LSM_HOOK_INIT(socket_sendmsg, selinux_socket_sendmsg),
        LSM_HOOK_INIT(socket_recvmsg, selinux_socket_recvmsg),
        LSM_HOOK_INIT(socket_getsockname, selinux_socket_getsockname),
        LSM_HOOK_INIT(socket_getpeername, selinux_socket_getpeername),
        LSM_HOOK_INIT(socket_getsockopt, selinux_socket_getsockopt),
        LSM_HOOK_INIT(socket_setsockopt, selinux_socket_setsockopt),
        LSM_HOOK_INIT(socket_shutdown, selinux_socket_shutdown),
        LSM_HOOK_INIT(socket_sock_rcv_skb, selinux_socket_sock_rcv_skb),
        LSM_HOOK_INIT(socket_getpeersec_stream,
                        selinux_socket_getpeersec_stream),
        LSM_HOOK_INIT(socket_getpeersec_dgram, selinux_socket_getpeersec_dgram),
        LSM_HOOK_INIT(sk_free_security, selinux_sk_free_security),
        LSM_HOOK_INIT(sk_clone_security, selinux_sk_clone_security),
        LSM_HOOK_INIT(sk_getsecid, selinux_sk_getsecid),
        LSM_HOOK_INIT(sock_graft, selinux_sock_graft),
        LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request),
        LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone),
        LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect),
        LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established),
        LSM_HOOK_INIT(mptcp_add_subflow, selinux_mptcp_add_subflow),
        LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request),
        LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone),
        LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established),
        LSM_HOOK_INIT(secmark_relabel_packet, selinux_secmark_relabel_packet),
        LSM_HOOK_INIT(secmark_refcount_inc, selinux_secmark_refcount_inc),
        LSM_HOOK_INIT(secmark_refcount_dec, selinux_secmark_refcount_dec),
        LSM_HOOK_INIT(req_classify_flow, selinux_req_classify_flow),
        LSM_HOOK_INIT(tun_dev_create, selinux_tun_dev_create),
        LSM_HOOK_INIT(tun_dev_attach_queue, selinux_tun_dev_attach_queue),
        LSM_HOOK_INIT(tun_dev_attach, selinux_tun_dev_attach),
        LSM_HOOK_INIT(tun_dev_open, selinux_tun_dev_open),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_pkey_access, selinux_ib_pkey_access),
        LSM_HOOK_INIT(ib_endport_manage_subnet,
                      selinux_ib_endport_manage_subnet),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_free_security, selinux_xfrm_policy_free),
        LSM_HOOK_INIT(xfrm_policy_delete_security, selinux_xfrm_policy_delete),
        LSM_HOOK_INIT(xfrm_state_free_security, selinux_xfrm_state_free),
        LSM_HOOK_INIT(xfrm_state_delete_security, selinux_xfrm_state_delete),
        LSM_HOOK_INIT(xfrm_policy_lookup, selinux_xfrm_policy_lookup),
        LSM_HOOK_INIT(xfrm_state_pol_flow_match,
                        selinux_xfrm_state_pol_flow_match),
        LSM_HOOK_INIT(xfrm_decode_session, selinux_xfrm_decode_session),
#endif

#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_permission, selinux_key_permission),
        LSM_HOOK_INIT(key_getsecurity, selinux_key_getsecurity),
#ifdef CONFIG_KEY_NOTIFICATIONS
        LSM_HOOK_INIT(watch_key, selinux_watch_key),
#endif
#endif

#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_known, selinux_audit_rule_known),
        LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
        LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
#endif

#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf, selinux_bpf),
        LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
        LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
        LSM_HOOK_INIT(bpf_map_free, selinux_bpf_map_free),
        LSM_HOOK_INIT(bpf_prog_free, selinux_bpf_prog_free),
        LSM_HOOK_INIT(bpf_token_free, selinux_bpf_token_free),
#endif

#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_open, selinux_perf_event_open),
        LSM_HOOK_INIT(perf_event_read, selinux_perf_event_read),
        LSM_HOOK_INIT(perf_event_write, selinux_perf_event_write),
#endif

#ifdef CONFIG_IO_URING
        LSM_HOOK_INIT(uring_override_creds, selinux_uring_override_creds),
        LSM_HOOK_INIT(uring_sqpoll, selinux_uring_sqpoll),
        LSM_HOOK_INIT(uring_cmd, selinux_uring_cmd),
        LSM_HOOK_INIT(uring_allowed, selinux_uring_allowed),
#endif

        /*
         * PUT "CLONING" (ACCESSING + ALLOCATING) HOOKS HERE
         */
        LSM_HOOK_INIT(fs_context_submount, selinux_fs_context_submount),
        LSM_HOOK_INIT(fs_context_dup, selinux_fs_context_dup),
        LSM_HOOK_INIT(fs_context_parse_param, selinux_fs_context_parse_param),
        LSM_HOOK_INIT(sb_eat_lsm_opts, selinux_sb_eat_lsm_opts),
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_clone_security, selinux_xfrm_policy_clone),
#endif

        /*
         * PUT "ALLOCATING" HOOKS HERE
         */
        LSM_HOOK_INIT(msg_msg_alloc_security, selinux_msg_msg_alloc_security),
        LSM_HOOK_INIT(msg_queue_alloc_security,
                      selinux_msg_queue_alloc_security),
        LSM_HOOK_INIT(shm_alloc_security, selinux_shm_alloc_security),
        LSM_HOOK_INIT(sb_alloc_security, selinux_sb_alloc_security),
        LSM_HOOK_INIT(inode_alloc_security, selinux_inode_alloc_security),
        LSM_HOOK_INIT(sem_alloc_security, selinux_sem_alloc_security),
        LSM_HOOK_INIT(secid_to_secctx, selinux_secid_to_secctx),
        LSM_HOOK_INIT(lsmprop_to_secctx, selinux_lsmprop_to_secctx),
        LSM_HOOK_INIT(inode_getsecctx, selinux_inode_getsecctx),
        LSM_HOOK_INIT(sk_alloc_security, selinux_sk_alloc_security),
        LSM_HOOK_INIT(tun_dev_alloc_security, selinux_tun_dev_alloc_security),
#ifdef CONFIG_SECURITY_INFINIBAND
        LSM_HOOK_INIT(ib_alloc_security, selinux_ib_alloc_security),
#endif
#ifdef CONFIG_SECURITY_NETWORK_XFRM
        LSM_HOOK_INIT(xfrm_policy_alloc_security, selinux_xfrm_policy_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc, selinux_xfrm_state_alloc),
        LSM_HOOK_INIT(xfrm_state_alloc_acquire,
                      selinux_xfrm_state_alloc_acquire),
#endif
#ifdef CONFIG_KEYS
        LSM_HOOK_INIT(key_alloc, selinux_key_alloc),
#endif
#ifdef CONFIG_AUDIT
        LSM_HOOK_INIT(audit_rule_init, selinux_audit_rule_init),
#endif
#ifdef CONFIG_BPF_SYSCALL
        LSM_HOOK_INIT(bpf_map_create, selinux_bpf_map_create),
        LSM_HOOK_INIT(bpf_prog_load, selinux_bpf_prog_load),
        LSM_HOOK_INIT(bpf_token_create, selinux_bpf_token_create),
#endif
#ifdef CONFIG_PERF_EVENTS
        LSM_HOOK_INIT(perf_event_alloc, selinux_perf_event_alloc),
#endif
};

/*
 * selinux_init - boot-time initialisation entry point for SELinux.
 *
 * Installed as the .init callback of the DEFINE_LSM(selinux) descriptor.
 * It resets the global selinux_state, applies the boot-time enforcing
 * setting, labels the initial task, initialises the AVC and the policy
 * helper caches, registers the selinux_hooks[] table and two AVC reset
 * callbacks, and finally validates the filesystem parameter description.
 *
 * Failure to register either AVC callback is fatal (panic()).
 *
 * Return: always 0.
 */
static __init int selinux_init(void)
{
        pr_info("SELinux:  Initializing.\n");

        /* Start from an all-zero state before any field is initialised. */
        memset(&selinux_state, 0, sizeof(selinux_state));
        enforcing_set(selinux_enforcing_boot);
        selinux_avc_init();
        mutex_init(&selinux_state.status_lock);
        mutex_init(&selinux_state.policy_mutex);

        /* Set the security state for the initial task. */
        cred_init_security();

        /*
         * default_noexec is true only when the default data mapping flags
         * do not include VM_EXEC.
         */
        default_noexec = !(VM_DATA_DEFAULT_FLAGS & VM_EXEC);
        if (!default_noexec)
                pr_notice("SELinux:  virtual memory is executable by default\n");

        avc_init();

        /*
         * NOTE(review): the three *_cache_init() helpers presumably create
         * the allocation caches for the corresponding policy data
         * structures -- confirm against their definitions.
         */
        avtab_cache_init();

        ebitmap_cache_init();

        hashtab_cache_init();

        /* Register every entry of selinux_hooks[] under selinux_lsmid. */
        security_add_hooks(selinux_hooks, ARRAY_SIZE(selinux_hooks),
                           &selinux_lsmid);

        /* A non-zero return from avc_add_callback() is treated as fatal. */
        if (avc_add_callback(selinux_netcache_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC netcache callback\n");

        if (avc_add_callback(selinux_lsm_notifier_avc_callback, AVC_CALLBACK_RESET))
                panic("SELinux: Unable to register AVC LSM notifier callback\n");

        if (selinux_enforcing_boot)
                pr_debug("SELinux:  Starting in enforcing mode\n");
        else
                pr_debug("SELinux:  Starting in permissive mode\n");

        fs_validate_description("selinux", selinux_fs_parameters);

        return 0;
}

/*
 * iterate_supers() callback used by selinux_complete_init(): calls
 * selinux_set_mnt_opts() on @sb with no mount options (NULL), zero flags
 * and no result-flags pointer.  @unused is the iterator's opaque argument
 * and is ignored, as is the return value of selinux_set_mnt_opts().
 */
static void delayed_superblock_init(struct super_block *sb, void *unused)
{
        selinux_set_mnt_opts(sb, NULL, 0, NULL);
}

/*
 * selinux_complete_init - finish initialisation once a policy is available.
 *
 * Walks every existing superblock with iterate_supers() and runs
 * delayed_superblock_init() on each, so that superblocks set up before the
 * policy load get their SELinux mount state established.
 */
void selinux_complete_init(void)
{
        pr_debug("SELinux:  Completing initialization.\n");

        /* Set up any superblocks initialized prior to the policy load. */
        pr_debug("SELinux:  Setting up existing superblocks.\n");
        iterate_supers(delayed_superblock_init, NULL);
}

/*
 * LSM descriptor for SELinux.
 *
 * SELinux requires early initialization in order to label
 * all processes and objects when they are created.
 *
 * .enabled points at the boot-time enable flag (selinux_enabled_boot),
 * .blobs at the security blob size table, and .init at selinux_init().
 * NOTE(review): LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE presumably mark
 * SELinux as a legacy "major" module that cannot be stacked with another
 * exclusive LSM -- confirm against the LSM framework definitions.
 */
DEFINE_LSM(selinux) = {
        .name = "selinux",
        .flags = LSM_FLAG_LEGACY_MAJOR | LSM_FLAG_EXCLUSIVE,
        .enabled = &selinux_enabled_boot,
        .blobs = &selinux_blob_sizes,
        .init = selinux_init,
};

#if defined(CONFIG_NETFILTER)
/*
 * Netfilter hook table registered per network namespace by
 * selinux_nf_register().
 *
 * The same three handlers are attached for IPv4 and, when CONFIG_IPV6 is
 * enabled, for IPv6:
 *   - selinux_ip_postroute at NF_INET_POST_ROUTING, with the
 *     *_PRI_SELINUX_LAST priority;
 *   - selinux_ip_forward at NF_INET_FORWARD and selinux_ip_output at
 *     NF_INET_LOCAL_OUT, both with the *_PRI_SELINUX_FIRST priority.
 */
static const struct nf_hook_ops selinux_nf_ops[] = {
        {
                .hook =                selinux_ip_postroute,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_POST_ROUTING,
                .priority =        NF_IP_PRI_SELINUX_LAST,
        },
        {
                .hook =                selinux_ip_forward,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_FORWARD,
                .priority =        NF_IP_PRI_SELINUX_FIRST,
        },
        {
                .hook =                selinux_ip_output,
                .pf =                NFPROTO_IPV4,
                .hooknum =        NF_INET_LOCAL_OUT,
                .priority =        NF_IP_PRI_SELINUX_FIRST,
        },
#if IS_ENABLED(CONFIG_IPV6)
        /* IPv6 counterparts of the three entries above. */
        {
                .hook =                selinux_ip_postroute,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_POST_ROUTING,
                .priority =        NF_IP6_PRI_SELINUX_LAST,
        },
        {
                .hook =                selinux_ip_forward,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_FORWARD,
                .priority =        NF_IP6_PRI_SELINUX_FIRST,
        },
        {
                .hook =                selinux_ip_output,
                .pf =                NFPROTO_IPV6,
                .hooknum =        NF_INET_LOCAL_OUT,
                .priority =        NF_IP6_PRI_SELINUX_FIRST,
        },
#endif        /* IPV6 */
};

/*
 * Per-namespace init callback (selinux_net_ops.init): registers every entry
 * of selinux_nf_ops[] on @net.
 *
 * Return: whatever nf_register_net_hooks() returns.
 */
static int __net_init selinux_nf_register(struct net *net)
{
        const unsigned int nr_hooks = ARRAY_SIZE(selinux_nf_ops);

        return nf_register_net_hooks(net, selinux_nf_ops, nr_hooks);
}

/*
 * Per-namespace exit callback (selinux_net_ops.exit): removes every entry
 * of selinux_nf_ops[] from @net.
 */
static void __net_exit selinux_nf_unregister(struct net *net)
{
        const unsigned int nr_hooks = ARRAY_SIZE(selinux_nf_ops);

        nf_unregister_net_hooks(net, selinux_nf_ops, nr_hooks);
}

/*
 * Per-network-namespace operations handed to register_pernet_subsys() by
 * selinux_nf_ip_init(): .init installs the SELinux netfilter hooks on a
 * namespace and .exit removes them again.
 */
static struct pernet_operations selinux_net_ops = {
        .init = selinux_nf_register,
        .exit = selinux_nf_unregister,
};

/*
 * Initcall that hooks SELinux into netfilter.
 *
 * Does nothing when SELinux is disabled at boot (selinux_enabled_boot is
 * zero).  Otherwise registers selinux_net_ops with register_pernet_subsys();
 * a registration error is fatal and ends in panic() rather than being
 * reported to the caller.
 *
 * Return: always 0.
 */
static int __init selinux_nf_ip_init(void)
{
        int rc;

        if (selinux_enabled_boot) {
                pr_debug("SELinux:  Registering netfilter hooks\n");

                rc = register_pernet_subsys(&selinux_net_ops);
                if (rc != 0)
                        panic("SELinux: register_pernet_subsys: error %d\n", rc);
        }

        return 0;
}
__initcall(selinux_nf_ip_init);
#endif /* CONFIG_NETFILTER */





































































































































































































































  165 
  165 


















  170 









  170 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BACKING_DEV_DEFS_H
#define __LINUX_BACKING_DEV_DEFS_H

#include <linux/list.h>
#include <linux/radix-tree.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
#include <linux/kref.h>
#include <linux/refcount.h>

struct page;
struct device;
struct dentry;

/*
 * Bits in bdi_writeback.state
 *
 * The enumerators are consecutive bit numbers (0, 1, 2, ...), not masks;
 * the state word they index is documented as requiring atomic bitops.
 */
enum wb_state {
        WB_registered,                /* bdi_register() was done */
        WB_writeback_running,        /* Writeback is in progress */
        WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
        WB_start_all,                /* nr_pages == 0 (all) work pending */
};

/*
 * Per-wb statistics kinds.  NR_WB_STAT_ITEMS is not a statistic: it is the
 * number of kinds and sizes the bdi_writeback.stat[] counter array, so the
 * other enumerators are presumably the indices into that array.
 */
enum wb_stat_item {
        WB_RECLAIMABLE,
        WB_WRITEBACK,
        WB_DIRTIED,
        WB_WRITTEN,
        NR_WB_STAT_ITEMS
};

/*
 * Evaluates to 8 * (1 + ilog2(nr_cpu_ids)), i.e. it grows with the base-2
 * logarithm of nr_cpu_ids.
 * NOTE(review): presumably the batch size used when updating the per-wb
 * percpu stat counters -- confirm against the users of this macro.
 */
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

/*
 * Why some writeback work was initiated.
 *
 * WB_REASON_MAX is not a reason itself; being the last enumerator, its value
 * equals the number of reasons defined above it.
 */
enum wb_reason {
        WB_REASON_BACKGROUND,
        WB_REASON_VMSCAN,
        WB_REASON_SYNC,
        WB_REASON_PERIODIC,
        WB_REASON_LAPTOP_TIMER,
        WB_REASON_FS_FREE_SPACE,
        /*
         * There is no bdi forker thread any more; this work is now done
         * by the emergency worker.  The reason is visible to userland via
         * TPs (presumably tracepoints), however, and exactly the same
         * information must keep being exposed, so the now-mismatched name
         * is kept.
         */
        WB_REASON_FORKER_THREAD,
        WB_REASON_FOREIGN_FLUSH,

        WB_REASON_MAX,
};

/*
 * Completion tracker for a group of writeback work items.
 *
 * @cnt:   counter; __WB_COMPLETION_INIT() starts it at 1.
 * @waitq: wait queue associated with the completion; WB_COMPLETION_INIT()
 *         points it at the bdi's wb_waitq.
 */
struct wb_completion {
        atomic_t                cnt;
        wait_queue_head_t        *waitq;
};

/*
 * Compound-literal initialiser for a struct wb_completion: the count starts
 * at 1 and the completion waits on the wait queue pointed to by @_waitq.
 */
#define __WB_COMPLETION_INIT(_waitq)        \
        (struct wb_completion){ .cnt = ATOMIC_INIT(1), .waitq = (_waitq) }

/*
 * If one wants to wait for one or more wb_writeback_works, each work's
 * ->done should be set to a wb_completion defined using the following
 * macro.  Once all work items are issued with wb_queue_work(), the caller
 * can wait for the completion of all using wb_wait_for_completion().  Work
 * items which are waited upon aren't freed automatically on completion.
 *
 * @bdi must be a pointer expression whose target has a wb_waitq member
 * (struct backing_dev_info); the completion waits on that queue.
 */
#define WB_COMPLETION_INIT(bdi)                __WB_COMPLETION_INIT(&(bdi)->wb_waitq)

/*
 * Declare a struct wb_completion variable named @cmpl, initialised with
 * WB_COMPLETION_INIT(@bdi).
 */
#define DEFINE_WB_COMPLETION(cmpl, bdi)        \
        struct wb_completion cmpl = WB_COMPLETION_INIT(bdi)

/*
 * Each wb (bdi_writeback) can perform writeback operations, is measured
 * and throttled, independently.  Without cgroup writeback, each bdi
 * (bdi_writeback) is served by its embedded bdi->wb.
 *
 * On the default hierarchy, blkcg implicitly enables memcg.  This allows
 * using memcg's page ownership for attributing writeback IOs, and every
 * memcg - blkcg combination can be served by its own wb by assigning a
 * dedicated wb to each memcg, which enables isolation across different
 * cgroups and propagation of IO back pressure down from the IO layer up to
 * the tasks which are generating the dirty pages to be written back.
 *
 * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
 * refcounted with the number of inodes attached to it, and pins the memcg
 * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
 * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
 * is tested for blkcg after lookup and removed from index on mismatch so
 * that a new wb for the combination can be created.
 *
 * Each bdi_writeback that is not embedded into the backing_dev_info must hold
 * a reference to the parent backing_dev_info.  See cgwb_create() for details.
 */
struct bdi_writeback {
        struct backing_dev_info *bdi;        /* our parent bdi */

        unsigned long state;                /* Always use atomic bitops on this */
        unsigned long last_old_flush;        /* last old data flush */

        struct list_head b_dirty;        /* dirty inodes */
        struct list_head b_io;                /* parked for writeback */
        struct list_head b_more_io;        /* parked for more writeback */
        struct list_head b_dirty_time;        /* time stamps are dirty */
        spinlock_t list_lock;                /* protects the b_* lists */

        atomic_t writeback_inodes;        /* number of inodes under writeback */
        /* one percpu counter per enum wb_stat_item kind */
        struct percpu_counter stat[NR_WB_STAT_ITEMS];

        unsigned long bw_time_stamp;        /* last time write bw is updated */
        /*
         * NOTE(review): presumably pages dirtied at bw_time_stamp, mirroring
         * written_stamp below -- confirm against the bandwidth update code.
         */
        unsigned long dirtied_stamp;
        unsigned long written_stamp;        /* pages written at bw_time_stamp */
        unsigned long write_bandwidth;        /* the estimated write bandwidth */
        unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */

        /*
         * The base dirty throttle rate, re-calculated on every 200ms.
         * All the bdi tasks' dirty rate will be curbed under it.
         * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
         * in small steps and is much more smooth/stable than the latter.
         */
        unsigned long dirty_ratelimit;
        unsigned long balanced_dirty_ratelimit;

        /*
         * NOTE(review): presumably the per-wb share of writeout completions
         * (flex proportions) -- confirm against the writeback accounting.
         */
        struct fprop_local_percpu completions;
        /*
         * NOTE(review): presumably non-zero while this wb is over its dirty
         * limit -- confirm against the dirty throttling code.
         */
        int dirty_exceeded;
        /*
         * NOTE(review): presumably the reason recorded for pending
         * WB_start_all work -- confirm against the code that sets it.
         */
        enum wb_reason start_all_reason;

        spinlock_t work_lock;                /* protects work_list & dwork scheduling */
        struct list_head work_list;        /* protected by work_lock */
        struct delayed_work dwork;        /* work item used for writeback */
        struct delayed_work bw_dwork;        /* work item used for bandwidth estimate */

        struct list_head bdi_node;        /* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;        /* used only for !root wb's */
        /*
         * NOTE(review): presumably the memcg-domain counterpart of
         * ->completions -- confirm.
         */
        struct fprop_local_percpu memcg_completions;
        struct cgroup_subsys_state *memcg_css; /* the associated memcg */
        struct cgroup_subsys_state *blkcg_css; /* and blkcg */
        struct list_head memcg_node;        /* anchored at memcg->cgwb_list */
        struct list_head blkcg_node;        /* anchored at blkcg->cgwb_list */
        struct list_head b_attached;        /* attached inodes, protected by list_lock */
        struct list_head offline_node;        /* anchored at offline_cgwbs */

        /*
         * release_work and rcu share storage, so only one of them can be
         * in use at any time.
         */
        union {
                struct work_struct release_work;
                struct rcu_head rcu;
        };
#endif
};

/*
 * Per-backing-device information.  Embeds the root bdi_writeback (->wb) and
 * anchors the list of all writeback contexts belonging to the device
 * (->wb_list).
 */
struct backing_dev_info {
        u64 id;                        /* key of ->rb_node */
        struct rb_node rb_node; /* keyed by ->id */
        /* NOTE(review): presumably links this bdi on a global bdi list -- confirm */
        struct list_head bdi_list;
        unsigned long ra_pages;        /* max readahead in PAGE_SIZE units */
        unsigned long io_pages;        /* max allowed IO size */

        struct kref refcnt;        /* Reference counter for the structure */
        unsigned int capabilities; /* Device capabilities */
        /*
         * NOTE(review): min_ratio/max_ratio/max_prop_frac presumably bound
         * this bdi's share of the dirty limit -- units are not visible
         * here; confirm against the code that uses them.
         */
        unsigned int min_ratio;
        unsigned int max_ratio, max_prop_frac;

        /*
         * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
         * any dirty wbs, which is depended upon by bdi_has_dirty().
         */
        atomic_long_t tot_write_bandwidth;
        /*
         * Jiffies when last process was dirty throttled on this bdi. Used by
         * blk-wbt.
         */
        unsigned long last_bdp_sleep;

        struct bdi_writeback wb;  /* the root writeback info for this bdi */
        struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
        struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#endif
        /* wait queue that WB_COMPLETION_INIT() attaches completions to */
        wait_queue_head_t wb_waitq;

        /* NOTE(review): presumably the device registered for this bdi -- confirm */
        struct device *dev;
        char dev_name[64];        /* fixed-size 64-byte name buffer */
        /* NOTE(review): presumably the device that owns this bdi -- confirm */
        struct device *owner;

        struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
        /* only present when debugfs support is built in */
        struct dentry *debug_dir;
#endif
};

/*
 * NOTE(review): presumably carries lock state between a lock/unlock pair:
 * @locked records whether a lock was actually taken and @flags holds the
 * saved IRQ flags -- neither use is visible in this header; confirm against
 * the callers.
 */
struct wb_lock_cookie {
        bool locked;
        unsigned long flags;
};

#ifdef CONFIG_CGROUP_WRITEBACK

/**
 * wb_tryget - try to increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * The wb embedded in its own bdi (@wb == &@wb->bdi->wb) is not reference
 * counted through ->refcnt, so the attempt trivially succeeds for it.  Every
 * other wb goes through percpu_ref_tryget().
 *
 * Return: %true for the embedded wb, otherwise the result of
 * percpu_ref_tryget() on @wb->refcnt.
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
        /* Root (bdi-embedded) wb: nothing to take. */
        if (wb == &wb->bdi->wb)
                return true;

        return percpu_ref_tryget(&wb->refcnt);
}

/**
 * wb_get - increment a wb's refcount
 * @wb: bdi_writeback to get
 *
 * A no-op for the wb embedded in its own bdi (@wb == &@wb->bdi->wb); every
 * other wb has its ->refcnt incremented with percpu_ref_get().
 */
static inline void wb_get(struct bdi_writeback *wb)
{
        /* Root (bdi-embedded) wb: nothing to take. */
        if (wb == &wb->bdi->wb)
                return;

        percpu_ref_get(&wb->refcnt);
}

/**
 * wb_put_many - decrement a wb's refcount by a given amount
 * @wb: bdi_writeback to put
 * @nr: number of references to put
 *
 * Warns once and returns without touching the refcount when @wb->bdi is
 * NULL.  A no-op for the wb embedded in its own bdi
 * (@wb == &@wb->bdi->wb); every other wb has @nr references dropped from
 * ->refcnt with percpu_ref_put_many().
 */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
        if (WARN_ON_ONCE(!wb->bdi)) {
                /*
                 * A driver bug might cause a file to be removed before bdi was
                 * initialized.
                 */
                return;
        }

        if (wb != &wb->bdi->wb)
                percpu_ref_put_many(&wb->refcnt, nr);
}

/**
 * wb_put - decrement a wb's refcount
 * @wb: bdi_writeback to put
 *
 * Drops a single reference; equivalent to wb_put_many(@wb, 1).
 */
static inline void wb_put(struct bdi_writeback *wb)
{
        wb_put_many(wb, 1);
}

/**
 * wb_dying - is a wb dying?
 * @wb: bdi_writeback of interest
 *
 * Returns whether @wb is unlinked and being drained.
 *
 * Implemented as percpu_ref_is_dying() on @wb->refcnt.  Unlike wb_tryget(),
 * wb_get() and wb_put_many(), no special case is made here for the wb
 * embedded in its own bdi: ->refcnt is queried unconditionally.
 */
static inline bool wb_dying(struct bdi_writeback *wb)
{
        return percpu_ref_is_dying(&wb->refcnt);
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * !CONFIG_CGROUP_WRITEBACK stub: struct bdi_writeback has no ->refcnt in
 * this configuration, so taking a reference always succeeds.
 */
static inline bool wb_tryget(struct bdi_writeback *wb)
{
        return true;
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: there is no per-wb refcount in this
 * configuration, so this is a no-op.
 */
static inline void wb_get(struct bdi_writeback *wb)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: there is no per-wb refcount in this
 * configuration, so this is a no-op.
 */
static inline void wb_put(struct bdi_writeback *wb)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: there is no per-wb refcount in this
 * configuration, so this is a no-op and @nr is ignored.
 */
static inline void wb_put_many(struct bdi_writeback *wb, unsigned long nr)
{
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: there is no per-wb refcount in this
 * configuration, so a wb is never reported as dying.
 */
static inline bool wb_dying(struct bdi_writeback *wb)
{
        return false;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

#endif        /* __LINUX_BACKING_DEV_DEFS_H */









































































































































































































































































































































































































































































































  271 
  272 










  271 
  272 














  272 









  270 













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
/* CPU control.
 * (C) 2001, 2002, 2003, 2004 Rusty Russell
 *
 * This code is licenced under the GPL.
 */
#include <linux/sched/mm.h>
#include <linux/proc_fs.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/sched/signal.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/task.h>
#include <linux/sched/smt.h>
#include <linux/unistd.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/bug.h>
#include <linux/kthread.h>
#include <linux/stop_machine.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
#include <linux/nmi.h>
#include <linux/smpboot.h>
#include <linux/relay.h>
#include <linux/slab.h>
#include <linux/scs.h>
#include <linux/percpu-rwsem.h>
#include <linux/cpuset.h>
#include <linux/random.h>
#include <linux/cc_platform.h>

#include <trace/events/power.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cpuhp.h>

#include "smpboot.h"

/**
 * struct cpuhp_cpu_state - Per cpu hotplug state storage
 * @state:        The current cpu state
 * @target:        The target state
 * @fail:        State at which cpuhp_invoke_callback() returns -EAGAIN once
 *                instead of running the callback; CPUHP_INVALID when no
 *                failure is armed
 * @thread:        Pointer to the hotplug thread
 * @should_run:        Thread should execute
 * @rollback:        Perform a rollback
 * @single:        Single callback invocation
 * @bringup:        Direction selector: true for bringup, false for teardown
 * @node:        Remote CPU node; for multi-instance, do a
 *                single entry callback for install/remove
 * @last:        For multi-instance rollback, remember how far we got
 * @cb_state:        The state for a single callback (install/uninstall)
 * @result:        Result of the operation
 * @ap_sync_state:        State for AP synchronization
 *                (holds an enum cpuhp_sync_state value)
 * @done_up:        Signal completion to the issuer of the task for cpu-up
 * @done_down:        Signal completion to the issuer of the task for cpu-down
 *
 * Only @state, @target and @fail exist on !CONFIG_SMP builds.
 */
struct cpuhp_cpu_state {
        enum cpuhp_state        state;
        enum cpuhp_state        target;
        enum cpuhp_state        fail;
#ifdef CONFIG_SMP
        /* Everything below is only compiled in for SMP kernels. */
        struct task_struct        *thread;
        bool                        should_run;
        bool                        rollback;
        bool                        single;
        bool                        bringup;
        struct hlist_node        *node;
        struct hlist_node        *last;
        enum cpuhp_state        cb_state;
        int                        result;
        atomic_t                ap_sync_state;
        struct completion        done_up;
        struct completion        done_down;
#endif
};

/*
 * Per-CPU hotplug bookkeeping. @fail starts out as CPUHP_INVALID so that
 * cpuhp_invoke_callback() does not inject a failure unless one is armed.
 */
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
        .fail = CPUHP_INVALID,
};

#ifdef CONFIG_SMP
/*
 * Mask consulted by cpu_bootable() to tell whether a CPU has been booted
 * before. NOTE(review): the writer is not in this part of the file; a
 * later comment names notify_cpu_starting() - confirm.
 */
cpumask_t cpus_booted_once_mask;
#endif

#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
/*
 * Two separate lockdep maps, one per hotplug direction. They are selected
 * by the @bringup argument of cpuhp_lock_acquire()/cpuhp_lock_release().
 */
static struct lockdep_map cpuhp_state_up_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
static struct lockdep_map cpuhp_state_down_map =
        STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);


/* Annotate lockdep with an acquisition of the map matching the direction. */
static inline void cpuhp_lock_acquire(bool bringup)
{
        struct lockdep_map *map = &cpuhp_state_down_map;

        if (bringup)
                map = &cpuhp_state_up_map;

        lock_map_acquire(map);
}

/* Annotate lockdep with a release of the map matching the direction. */
static inline void cpuhp_lock_release(bool bringup)
{
        struct lockdep_map *map = &cpuhp_state_down_map;

        if (bringup)
                map = &cpuhp_state_up_map;

        lock_map_release(map);
}
#else

/* No-op variants used unless both CONFIG_LOCKDEP and CONFIG_SMP are set. */
static inline void cpuhp_lock_acquire(bool bringup) { }
static inline void cpuhp_lock_release(bool bringup) { }

#endif

/**
 * struct cpuhp_step - Hotplug state machine step
 * @name:        Name of the step
 * @startup:        Startup function of the step
 * @teardown:        Teardown function of the step
 * @cant_stop:        Bringup/teardown can't be stopped at this step
 * @multi_instance:        State has multiple instances which get added afterwards
 */
struct cpuhp_step {
        const char                *name;
        /*
         * Callback invoked on bringup. @multi_instance decides which union
         * member cpuhp_invoke_callback() uses: .single when false, .multi
         * (called once per instance node) when true.
         */
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } startup;
        /* Callback invoked on teardown; same single/multi selection. */
        union {
                int                (*single)(unsigned int cpu);
                int                (*multi)(unsigned int cpu,
                                         struct hlist_node *node);
        } teardown;
        /* private: */
        /* Instance nodes walked by cpuhp_invoke_callback() for multi-instance states. */
        struct hlist_head        list;
        /* public: */
        bool                        cant_stop;
        bool                        multi_instance;
};

/* NOTE(review): presumably serializes changes to cpuhp_hp_states[] - confirm against users. */
static DEFINE_MUTEX(cpuhp_state_mutex);
/* Forward declaration of the step table, indexed by enum cpuhp_state in cpuhp_get_step(). */
static struct cpuhp_step cpuhp_hp_states[];

/* Look up the step descriptor for @state. No bounds check is performed. */
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
        return &cpuhp_hp_states[state];
}

/*
 * Report whether @step has no callback for the requested direction.
 * Only the .single union member is inspected.
 */
static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
{
        if (bringup)
                return !step->startup.single;

        return !step->teardown.single;
}

/**
 * cpuhp_invoke_callback - Invoke the callbacks for a given state
 * @cpu:        The cpu for which the callback should be invoked
 * @state:        The state to do callbacks for
 * @bringup:        True if the bringup callback should be invoked
 * @node:        For multi-instance, do a single entry callback for install/remove
 * @lastp:        For multi-instance rollback, remember how far we got
 *
 * Called from cpu hotplug and from the state register machinery.
 *
 * Return: %0 on success or a negative errno code
 */
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
                                 bool bringup, struct hlist_node *node,
                                 struct hlist_node **lastp)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct cpuhp_step *step = cpuhp_get_step(state);
        int (*cbm)(unsigned int cpu, struct hlist_node *node);
        int (*cb)(unsigned int cpu);
        int ret, rb_ret, cnt;

        /* One-shot failure injection: disarm and fail without a callback. */
        if (st->fail == state) {
                st->fail = CPUHP_INVALID;
                return -EAGAIN;
        }

        /* Callers are not expected to invoke a step without a callback. */
        if (cpuhp_step_empty(bringup, step)) {
                WARN_ON_ONCE(1);
                return 0;
        }

        if (!step->multi_instance) {
                WARN_ON_ONCE(lastp && *lastp);
                cb = bringup ? step->startup.single : step->teardown.single;

                trace_cpuhp_enter(cpu, st->target, state, cb);
                ret = cb(cpu);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }
        cbm = bringup ? step->startup.multi : step->teardown.multi;

        /* Single invocation for instance add/remove */
        if (node) {
                WARN_ON_ONCE(lastp && *lastp);
                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                return ret;
        }

        /* State transition. Invoke on all instances */
        cnt = 0;
        hlist_for_each(node, &step->list) {
                /* Stop at the instance recorded by the caller, if any. */
                if (lastp && node == *lastp)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, ret);
                if (ret) {
                        /* Without @lastp the partial work is undone right here. */
                        if (!lastp)
                                goto err;

                        /* Otherwise hand the failing instance back to the caller. */
                        *lastp = node;
                        return ret;
                }
                cnt++;
        }
        if (lastp)
                *lastp = NULL;
        return 0;
err:
        /* Rollback the instances if one failed */
        cbm = !bringup ? step->startup.multi : step->teardown.multi;
        if (!cbm)
                return ret;

        /* Undo exactly the @cnt instances which succeeded before the failure. */
        hlist_for_each(node, &step->list) {
                if (!cnt--)
                        break;

                trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
                /*
                 * Keep the rollback result apart from @ret so that the
                 * original error is what gets reported to the caller.
                 */
                rb_ret = cbm(cpu, node);
                trace_cpuhp_exit(cpu, st->state, state, rb_ret);
                /*
                 * Rollback must not fail.
                 */
                WARN_ON_ONCE(rb_ret);
        }
        return ret;
}

#ifdef CONFIG_SMP
static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
        /*
         * The extra check for CPUHP_TEARDOWN_CPU is only for documentation
         * purposes as that state is handled explicitly in cpu_down.
         */
        return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
}

/* Block until the completion matching the direction (@done_up / @done_down) fires. */
static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                wait_for_completion(&st->done_up);
        else
                wait_for_completion(&st->done_down);
}

/* Signal the completion matching the direction (@done_up / @done_down). */
static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
{
        if (bringup)
                complete(&st->done_up);
        else
                complete(&st->done_down);
}

/*
 * The former STARTING/DYING states. They run with IRQs disabled and must
 * not fail.
 */
static bool cpuhp_is_atomic_state(enum cpuhp_state state)
{
        return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
}

/* Synchronization state management */
enum cpuhp_sync_state {
        /* Reported by the AP via cpuhp_ap_report_dead(); value 0. */
        SYNC_STATE_DEAD,
        /* Set by cpuhp_can_boot_ap() right before a boot attempt. */
        SYNC_STATE_KICKED,
        /* Set by cpuhp_bp_sync_dead() unless the AP already reported dead. */
        SYNC_STATE_SHOULD_DIE,
        /* Set by the AP in cpuhp_ap_sync_alive(). */
        SYNC_STATE_ALIVE,
        /* Set by cpuhp_bp_sync_alive() to release the waiting AP. */
        SYNC_STATE_SHOULD_ONLINE,
        /* NOTE(review): the writer of this value is not in this part of the file. */
        SYNC_STATE_ONLINE,
};

#ifdef CONFIG_HOTPLUG_CORE_SYNC
/**
 * cpuhp_ap_update_sync_state - Update synchronization state during bringup/teardown
 * @state:        The synchronization state to set
 *
 * No synchronization point. Just update of the synchronization state, but implies
 * a full barrier so that the AP changes are visible before the control CPU proceeds.
 */
static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state)
{
        /* Operates on the calling CPU's own sync state. */
        atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);

        /* xchg rather than a plain set; the previous value is discarded. */
        (void)atomic_xchg(st, state);
}

/*
 * Weak default for the poll step used by cpuhp_wait_for_sync_state()
 * during its initial busy-wait window.
 */
void __weak arch_cpuhp_sync_state_poll(void) { cpu_relax(); }

/*
 * Wait until @cpu's sync state equals @state, then atomically replace it
 * with @next_state.
 *
 * Return: true once the transition was performed, false if the state was
 * not observed within the 10 second timeout (state left unchanged).
 */
static bool cpuhp_wait_for_sync_state(unsigned int cpu, enum cpuhp_sync_state state,
                                      enum cpuhp_sync_state next_state)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        ktime_t now, end, start = ktime_get();
        int sync;

        /* Absolute deadline: 10 seconds after entry. */
        end = start + 10ULL * NSEC_PER_SEC;

        sync = atomic_read(st);
        while (1) {
                if (sync == state) {
                        /*
                         * A failed cmpxchg refreshes @sync with the current
                         * value, so simply re-evaluate from the top.
                         */
                        if (!atomic_try_cmpxchg(st, &sync, next_state))
                                continue;
                        return true;
                }

                now = ktime_get();
                if (now > end) {
                        /* Timeout. Leave the state unchanged */
                        return false;
                } else if (now - start < NSEC_PER_MSEC) {
                        /* Poll for one millisecond */
                        arch_cpuhp_sync_state_poll();
                } else {
                        /* After the first millisecond, sleep between checks. */
                        usleep_range(USEC_PER_MSEC, 2 * USEC_PER_MSEC);
                }
                sync = atomic_read(st);
        }
        /* Not reached: the loop above only exits via return. */
        return true;
}
#else  /* CONFIG_HOTPLUG_CORE_SYNC */
/* No-op when CONFIG_HOTPLUG_CORE_SYNC is not set. */
static inline void cpuhp_ap_update_sync_state(enum cpuhp_sync_state state) { }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC */

#ifdef CONFIG_HOTPLUG_CORE_SYNC_DEAD
/**
 * cpuhp_ap_report_dead - Update synchronization state to DEAD
 *
 * No synchronization point. Just update of the synchronization state.
 */
void cpuhp_ap_report_dead(void)
{
        /* Acts on the calling CPU's own state; cpuhp_bp_sync_dead() waits for this value. */
        cpuhp_ap_update_sync_state(SYNC_STATE_DEAD);
}

/*
 * Weak no-op default. Called by cpuhp_bp_sync_dead() once @cpu has been
 * observed in SYNC_STATE_DEAD.
 */
void __weak arch_cpuhp_cleanup_dead_cpu(unsigned int cpu) { }

/*
 * Late CPU shutdown synchronization point. Cannot use cpuhp_state::done_down
 * because the AP cannot issue complete() at this stage.
 */
static void cpuhp_bp_sync_dead(unsigned int cpu)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        int sync = atomic_read(st);

        /*
         * Move the state to SYNC_STATE_SHOULD_DIE. A failed cmpxchg
         * refreshes @sync, so the DEAD check is repeated on every retry.
         */
        do {
                /* CPU can have reported dead already. Don't overwrite that! */
                if (sync == SYNC_STATE_DEAD)
                        break;
        } while (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_SHOULD_DIE));

        /* DEAD -> DEAD: wait for the state without changing it. */
        if (cpuhp_wait_for_sync_state(cpu, SYNC_STATE_DEAD, SYNC_STATE_DEAD)) {
                /* CPU reached dead state. Invoke the cleanup function */
                arch_cpuhp_cleanup_dead_cpu(cpu);
                return;
        }

        /* No further action possible. Emit message and give up. */
        pr_err("CPU%u failed to report dead state\n", cpu);
}
#else /* CONFIG_HOTPLUG_CORE_SYNC_DEAD */
/* No-op when CONFIG_HOTPLUG_CORE_SYNC_DEAD is not set. */
static inline void cpuhp_bp_sync_dead(unsigned int cpu) { }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC_DEAD */

#ifdef CONFIG_HOTPLUG_CORE_SYNC_FULL
/**
 * cpuhp_ap_sync_alive - Synchronize AP with the control CPU once it is alive
 *
 * Updates the AP synchronization state to SYNC_STATE_ALIVE and waits
 * for the BP to release it.
 */
void cpuhp_ap_sync_alive(void)
{
        atomic_t *st = this_cpu_ptr(&cpuhp_state.ap_sync_state);

        /* Announce that this CPU is alive. */
        cpuhp_ap_update_sync_state(SYNC_STATE_ALIVE);

        /*
         * Wait for the control CPU to release it. This busy-waits without
         * a timeout; cpuhp_bp_sync_alive() sets SYNC_STATE_SHOULD_ONLINE.
         */
        while (atomic_read(st) != SYNC_STATE_SHOULD_ONLINE)
                cpu_relax();
}

/*
 * Check whether @cpu is in a sync state from which a boot attempt is
 * allowed and, if so, atomically move it to SYNC_STATE_KICKED.
 *
 * Return: true if the state was set to SYNC_STATE_KICKED, false if the CPU
 * is in any other state than DEAD, KICKED or ALIVE.
 */
static bool cpuhp_can_boot_ap(unsigned int cpu)
{
        atomic_t *st = per_cpu_ptr(&cpuhp_state.ap_sync_state, cpu);
        int sync = atomic_read(st);

again:
        switch (sync) {
        case SYNC_STATE_DEAD:
                /* CPU is properly dead */
                break;
        case SYNC_STATE_KICKED:
                /* CPU did not come up in previous attempt */
                break;
        case SYNC_STATE_ALIVE:
                /* CPU is stuck in cpuhp_ap_sync_alive(). */
                break;
        default:
                /* CPU failed to report online or dead and is in limbo state. */
                return false;
        }

        /*
         * Prepare for booting. A failed cmpxchg refreshes @sync, so the
         * new value is re-validated by the switch above.
         */
        if (!atomic_try_cmpxchg(st, &sync, SYNC_STATE_KICKED))
                goto again;

        return true;
}

/*
 * Weak no-op default. Called by cpuhp_bp_sync_alive() whether or not the
 * AP reported alive.
 */
void __weak arch_cpuhp_cleanup_kick_cpu(unsigned int cpu) { }

/*
 * Early CPU bringup synchronization point. Cannot use cpuhp_state::done_up
 * because the AP cannot issue complete() so early in the bringup.
 */
/*
 * Wait for @cpu to report SYNC_STATE_ALIVE and release it by switching the
 * state to SYNC_STATE_SHOULD_ONLINE.
 *
 * This function is only built when CONFIG_HOTPLUG_CORE_SYNC_FULL is set,
 * so no runtime IS_ENABLED() check is needed.
 *
 * Return: %0 on success, -EIO if the AP did not report alive in time.
 */
static int cpuhp_bp_sync_alive(unsigned int cpu)
{
        int ret = 0;

        if (!cpuhp_wait_for_sync_state(cpu, SYNC_STATE_ALIVE, SYNC_STATE_SHOULD_ONLINE)) {
                pr_err("CPU%u failed to report alive state\n", cpu);
                ret = -EIO;
        }

        /*
         * Let the architecture cleanup the kick alive mechanics. This is
         * done on the failure path as well.
         */
        arch_cpuhp_cleanup_kick_cpu(cpu);
        return ret;
}
#else /* CONFIG_HOTPLUG_CORE_SYNC_FULL */
/* Without CONFIG_HOTPLUG_CORE_SYNC_FULL: no alive sync, booting is always allowed. */
static inline int cpuhp_bp_sync_alive(unsigned int cpu) { return 0; }
static inline bool cpuhp_can_boot_ap(unsigned int cpu) { return true; }
#endif /* !CONFIG_HOTPLUG_CORE_SYNC_FULL */

/*
 * Serializes the updates to cpu_online_mask, cpu_present_mask.
 * Taken and released via cpu_maps_update_begin()/cpu_maps_update_done().
 */
static DEFINE_MUTEX(cpu_add_remove_lock);
/* NOTE(review): the writers of this flag are not in this part of the file. */
bool cpuhp_tasks_frozen;
EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen);

/*
 * The following two APIs (cpu_maps_update_begin/done) must be used when
 * attempting to serialize the updates to cpu_online_mask & cpu_present_mask.
 */
void cpu_maps_update_begin(void)
{
        /* Sleeping mutex; pair with cpu_maps_update_done(). */
        mutex_lock(&cpu_add_remove_lock);
}

/* Release cpu_add_remove_lock taken by cpu_maps_update_begin(). */
void cpu_maps_update_done(void)
{
        mutex_unlock(&cpu_add_remove_lock);
}

/*
 * If set, cpu_up and cpu_down will return -EBUSY and do nothing.
 * Should always be manipulated under cpu_add_remove_lock
 */
/* Nesting counter: bumped by cpu_hotplug_disable(), dropped by __cpu_hotplug_enable(). */
static int cpu_hotplug_disabled;

#ifdef CONFIG_HOTPLUG_CPU

/* Per-cpu rwsem behind cpus_read_lock()/cpus_write_lock() and friends. */
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);

/* Set by cpu_hotplug_disable_offlining(); marked __ro_after_init. */
static bool cpu_hotplug_offline_disabled __ro_after_init;

/* Take cpu_hotplug_lock for reading; pair with cpus_read_unlock(). */
void cpus_read_lock(void)
{
        percpu_down_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_lock);

/*
 * Try to take cpu_hotplug_lock for reading. Returns the result of
 * percpu_down_read_trylock() unchanged.
 */
int cpus_read_trylock(void)
{
        return percpu_down_read_trylock(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_trylock);

/* Drop the read side of cpu_hotplug_lock taken by cpus_read_lock(). */
void cpus_read_unlock(void)
{
        percpu_up_read(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(cpus_read_unlock);

/* Take cpu_hotplug_lock for writing; pair with cpus_write_unlock(). Not exported. */
void cpus_write_lock(void)
{
        percpu_down_write(&cpu_hotplug_lock);
}

/* Drop the write side of cpu_hotplug_lock taken by cpus_write_lock(). */
void cpus_write_unlock(void)
{
        percpu_up_write(&cpu_hotplug_lock);
}

/* Assert that cpu_hotplug_lock is held, once the system is fully running. */
void lockdep_assert_cpus_held(void)
{
        /*
         * Before userspace runs there are no hotplug operations, and some
         * init code paths deliberately skip the hotplug lock. That is
         * valid, so the assertion is only armed from SYSTEM_RUNNING on.
         */
        if (system_state >= SYSTEM_RUNNING)
                percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
EXPORT_SYMBOL_GPL(lockdep_assert_cpus_held);

#ifdef CONFIG_LOCKDEP
/* Return the result of percpu_rwsem_is_held() for cpu_hotplug_lock. */
int lockdep_is_cpus_held(void)
{
        return percpu_rwsem_is_held(&cpu_hotplug_lock);
}
#endif

/*
 * Lockdep-only annotation: marks cpu_hotplug_lock's dep_map as acquired.
 * The rwsem itself is not taken here.
 */
static void lockdep_acquire_cpus_lock(void)
{
        rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_);
}

/*
 * Lockdep-only annotation: marks cpu_hotplug_lock's dep_map as released.
 * Counterpart of lockdep_acquire_cpus_lock().
 */
static void lockdep_release_cpus_lock(void)
{
        rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}

/* Declare CPU offlining not supported */
void cpu_hotplug_disable_offlining(void)
{
        /* The flag is set under cpu_add_remove_lock. */
        cpu_maps_update_begin();
        cpu_hotplug_offline_disabled = true;
        cpu_maps_update_done();
}

/*
 * Wait for currently running CPU hotplug operations to complete (if any) and
 * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
 * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
 * hotplug path before performing hotplug operations. So acquiring that lock
 * guarantees mutual exclusion from any currently running hotplug operations.
 */
void cpu_hotplug_disable(void)
{
        cpu_maps_update_begin();
        /* Counter, not a flag: each call needs a matching cpu_hotplug_enable(). */
        cpu_hotplug_disabled++;
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);

/*
 * Drop one level of hotplug disabling. Warns once and leaves the counter
 * alone when it is already zero. This function takes no lock itself;
 * cpu_hotplug_enable() calls it under cpu_add_remove_lock.
 */
static void __cpu_hotplug_enable(void)
{
        if (!WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
                cpu_hotplug_disabled--;
}

/* Undo one cpu_hotplug_disable() call, under cpu_add_remove_lock. */
void cpu_hotplug_enable(void)
{
        cpu_maps_update_begin();
        __cpu_hotplug_enable();
        cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);

#else

/* Without CONFIG_HOTPLUG_CPU there is no cpu_hotplug_lock to annotate. */
static void lockdep_acquire_cpus_lock(void)
{
}

/* No-op counterpart for !CONFIG_HOTPLUG_CPU. */
static void lockdep_release_cpus_lock(void)
{
}

#endif        /* CONFIG_HOTPLUG_CPU */

/*
 * Architectures that need SMT-specific errata handling during SMT hotplug
 * should override this. The weak default does nothing.
 */
void __weak arch_smt_update(void) { }

#ifdef CONFIG_HOTPLUG_SMT

/* Current SMT control state; starts out as CPU_SMT_ENABLED. */
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
/* Recorded by cpu_smt_set_num_threads() from its @max_threads argument. */
static unsigned int cpu_smt_max_threads __ro_after_init;
/* UINT_MAX until lowered by cpu_smt_disable() or cpu_smt_set_num_threads(). */
unsigned int cpu_smt_num_threads __read_mostly = UINT_MAX;

/*
 * Disable SMT at init time and limit the thread count to one.
 * @force selects CPU_SMT_FORCE_DISABLED over CPU_SMT_DISABLED.
 * Does nothing when cpu_smt_possible() is false.
 */
void __init cpu_smt_disable(bool force)
{
        if (!cpu_smt_possible())
                return;

        if (!force) {
                pr_info("SMT: disabled\n");
                cpu_smt_control = CPU_SMT_DISABLED;
        } else {
                pr_info("SMT: Force disabled\n");
                cpu_smt_control = CPU_SMT_FORCE_DISABLED;
        }

        cpu_smt_num_threads = 1;
}

/*
 * The decision whether SMT is supported can only be done after the full
 * CPU identification. Called from architecture code.
 */
void __init cpu_smt_set_num_threads(unsigned int num_threads,
                                    unsigned int max_threads)
{
        /* A zero request or one above the maximum is a caller bug. */
        WARN_ON(!num_threads || (num_threads > max_threads));

        /* A single thread per core means there is no SMT to control. */
        if (max_threads == 1)
                cpu_smt_control = CPU_SMT_NOT_SUPPORTED;

        cpu_smt_max_threads = max_threads;

        /*
         * With SMT enabled, honour the architecture's requested thread
         * count, but never raise the current value. In every other case
         * (disabled on the command line, or not supported) use 1 for
         * consistency.
         */
        if (cpu_smt_control == CPU_SMT_ENABLED) {
                if (num_threads < cpu_smt_num_threads)
                        cpu_smt_num_threads = num_threads;
        } else {
                cpu_smt_num_threads = 1;
        }
}

/*
 * Handler for the "nosmt" early parameter. A value of exactly "force"
 * requests the forced variant. Always returns 0.
 */
static int __init smt_cmdline_disable(char *str)
{
        bool force = false;

        if (str)
                force = !strcmp(str, "force");

        cpu_smt_disable(force);
        return 0;
}
early_param("nosmt", smt_cmdline_disable);

/*
 * For architectures supporting partial SMT states, check if the thread is
 * allowed. Otherwise this has already been checked through
 * cpu_smt_max_threads when setting the SMT level.
 */
static inline bool cpu_smt_thread_allowed(unsigned int cpu)
{
#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
        /* Defer to the architecture's topology code. */
        return topology_smt_thread_allowed(cpu);
#else
        /* No per-thread restriction in this configuration. */
        return true;
#endif
}

static inline bool cpu_bootable(unsigned int cpu)
{
        if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
                return true;

        /* All CPUs are bootable if controls are not configured */
        if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
                return true;

        /* All CPUs are bootable if CPU is not SMT capable */
        if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
                return true;

        if (topology_is_primary_thread(cpu))
                return true;

        /*
         * On x86 it's required to boot all logical CPUs at least once so
         * that the init code can get a chance to set CR4.MCE on each
         * CPU. Otherwise, a broadcasted MCE observing CR4.MCE=0b on any
         * core will shutdown the machine.
         */
        return !cpumask_test_cpu(cpu, &cpus_booted_once_mask);
}

/* Returns true if SMT is supported and not forcefully (irreversibly) disabled */
bool cpu_smt_possible(void)
{
        return cpu_smt_control != CPU_SMT_FORCE_DISABLED &&
                cpu_smt_control != CPU_SMT_NOT_SUPPORTED;
}
EXPORT_SYMBOL_GPL(cpu_smt_possible);

#else
/* Without CONFIG_HOTPLUG_SMT every CPU is bootable. */
static inline bool cpu_bootable(unsigned int cpu) { return true; }
#endif

static inline enum cpuhp_state
cpuhp_set_state(int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        bool bringup = st->state < target;

        st->rollback = false;
        st->last = NULL;

        st->target = target;
        st->single = false;
        st->bringup = bringup;
        if (cpu_dying(cpu) != !bringup)
                set_cpu_dying(cpu, !bringup);

        return prev_state;
}

/*
 * Turn a failed transition around: retarget @st at @prev_state and, unless
 * a rollback is already in progress, flip the direction.
 */
static inline void
cpuhp_reset_state(int cpu, struct cpuhp_cpu_state *st,
                  enum cpuhp_state prev_state)
{
        /* The direction to use for the rollback, i.e. the inverse of the current one. */
        bool bringup = !st->bringup;

        st->target = prev_state;

        /*
         * Already rolling back. No need to invert the bringup value or to
         * change the current state.
         */
        if (st->rollback)
                return;

        st->rollback = true;

        /*
         * If we have st->last we need to undo partial multi_instance of this
         * state first. Otherwise start undo at the previous state.
         */
        if (!st->last) {
                if (st->bringup)
                        st->state--;
                else
                        st->state++;
        }

        st->bringup = bringup;
        /* Keep the CPU's dying flag in sync with the new direction. */
        if (cpu_dying(cpu) != !bringup)
                set_cpu_dying(cpu, !bringup);
}

/* Regular hotplug invocation of the AP hotplug thread */
static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
{
        /* Nothing to do for a state transition which is already complete. */
        if (!st->single && st->state == st->target)
                return;

        st->result = 0;
        /*
         * Make sure the above stores are visible before should_run becomes
         * true. Paired with the mb() above in cpuhp_thread_fun()
         */
        smp_mb();
        st->should_run = true;
        wake_up_process(st->thread);
        /* Block until the hotplug thread signals the matching completion. */
        wait_for_ap_thread(st, st->bringup);
}

static int cpuhp_kick_ap(int cpu, struct cpuhp_cpu_state *st,
                         enum cpuhp_state target)
{
        enum cpuhp_state prev_state;
        int ret;

        prev_state = cpuhp_set_state(cpu, st, target);
        __cpuhp_kick_ap(st);
        if ((ret = st->result)) {
                cpuhp_reset_state(cpu, st, prev_state);
                __cpuhp_kick_ap(st);
        }

        return ret;
}

/*
 * Wait for @cpu to finish its early bringup, then unpark its hotplug
 * thread.
 *
 * Return: %0 on success, -ECANCELED if the CPU is not online or is not
 * bootable.
 */
static int bringup_wait_for_ap_online(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

        /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
        wait_for_ap_thread(st, true);
        if (WARN_ON_ONCE(!cpu_online(cpu)))
                return -ECANCELED;

        /* Unpark the hotplug thread of the target cpu */
        kthread_unpark(st->thread);

        /*
         * SMT soft disabling on X86 requires bringing the CPU out of the
         * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
         * CPU marked itself as booted_once in notify_cpu_starting(), so
         * cpu_bootable() now returns false if this is not the primary
         * sibling.
         */
        return cpu_bootable(cpu) ? 0 : -ECANCELED;
}

#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
/*
 * Kick @cpu via the architecture hook, provided its sync state allows a
 * boot attempt.
 *
 * Return: -EAGAIN if the CPU cannot be booted right now, otherwise the
 * result of arch_cpuhp_kick_ap_alive().
 */
static int cpuhp_kick_ap_alive(unsigned int cpu)
{
        if (cpuhp_can_boot_ap(cpu))
                return arch_cpuhp_kick_ap_alive(cpu, idle_thread_get(cpu));

        return -EAGAIN;
}

/*
 * Synchronize with the already kicked AP, wait for it to come online and,
 * if the target lies beyond CPUHP_AP_ONLINE_IDLE, delegate the remaining
 * states to the AP hotplug thread.
 *
 * Returns 0 on success or the error of the first failing step.
 */
static int cpuhp_bringup_ap(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int ret;

        /*
         * Architectures may walk the irq descriptors to set up the vector
         * space of the CPU coming online, so irq alloc/free is blocked for
         * the duration of the bringup.
         */
        irq_lock_sparse();

        ret = cpuhp_bp_sync_alive(cpu);
        if (!ret)
                ret = bringup_wait_for_ap_online(cpu);

        irq_unlock_sparse();

        if (ret)
                return ret;

        if (st->target <= CPUHP_AP_ONLINE_IDLE)
                return 0;

        return cpuhp_kick_ap(cpu, st, st->target);
}
#else
/*
 * Start @cpu through __cpu_up(), synchronize with it, wait for it to come
 * online and, if the target lies beyond CPUHP_AP_ONLINE_IDLE, delegate the
 * remaining states to the AP hotplug thread.
 *
 * Returns -EAGAIN when the AP may not be booted, otherwise 0 or the error of
 * the first failing step.
 */
static int bringup_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle = idle_thread_get(cpu);
        int ret;

        if (!cpuhp_can_boot_ap(cpu))
                return -EAGAIN;

        /*
         * Architectures may walk the irq descriptors to set up the vector
         * space of the CPU coming online.
         *
         * The sparse irq lock blocks irq alloc/free across the bringup. It
         * stays held until the upcoming CPU has completed its startup in
         * cpuhp_online_idle(), which spares the architecture code any
         * intermediate synchronization points.
         */
        irq_lock_sparse();

        ret = __cpu_up(cpu, idle);
        if (!ret)
                ret = cpuhp_bp_sync_alive(cpu);
        if (!ret)
                ret = bringup_wait_for_ap_online(cpu);

        irq_unlock_sparse();

        if (ret)
                return ret;

        if (st->target <= CPUHP_AP_ONLINE_IDLE)
                return 0;

        return cpuhp_kick_ap(cpu, st, st->target);
}
#endif

/*
 * Drop the lazy-TLB mm reference still held by the idle task of @cpu.
 * Always returns 0.
 */
static int finish_cpu(unsigned int cpu)
{
        struct task_struct *idle_task = idle_thread_get(cpu);
        struct mm_struct *lazy_mm = idle_task->active_mm;

        /*
         * sched_force_init_mm() made sure &init_mm is in use. The CPU has
         * stopped, so that reference can go now.
         */
        WARN_ON(lazy_mm != &init_mm);
        idle_task->active_mm = NULL;
        mmdrop_lazy_tlb(lazy_mm);

        return 0;
}

/*
 * Hotplug state machine related functions
 */

/*
 * Pick the next state which has to be run, skipping empty steps.
 *
 * On bringup st->state is advanced first and the new value is handed out;
 * on teardown the current value is handed out and st->state is lowered
 * afterwards. Either way st->state is updated ahead of time, as if
 * *state_to_run had already run.
 *
 * Returns true when *state_to_run holds a state that must be run, false
 * when @target has been reached.
 */
static bool cpuhp_next_state(bool bringup,
                             enum cpuhp_state *state_to_run,
                             struct cpuhp_cpu_state *st,
                             enum cpuhp_state target)
{
        for (;;) {
                if (bringup) {
                        if (st->state >= target)
                                return false;

                        st->state++;
                        *state_to_run = st->state;
                } else {
                        if (st->state <= target)
                                return false;

                        *state_to_run = st->state;
                        st->state--;
                }

                /* Only a non-empty step is worth returning to the caller */
                if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
                        return true;
        }
}

/*
 * Invoke the callbacks of all non-empty states between st->state and
 * @target, in the direction selected by @bringup.
 *
 * @bringup: true to walk upwards (startup), false to walk downwards (teardown)
 * @cpu:     CPU the callbacks are invoked for
 * @st:      hotplug state of @cpu; st->state is advanced by cpuhp_next_state()
 * @target:  state at which the walk stops
 * @nofail:  when true, a failing callback is only warned about and the walk
 *           continues; when false, the walk stops at the first failure
 *
 * Returns 0 on success. On failure returns the callback's error code, or -1
 * in @nofail mode if at least one callback failed.
 */
static int __cpuhp_invoke_callback_range(bool bringup,
                                         unsigned int cpu,
                                         struct cpuhp_cpu_state *st,
                                         enum cpuhp_state target,
                                         bool nofail)
{
        enum cpuhp_state state;
        int ret = 0;

        while (cpuhp_next_state(bringup, &state, st, target)) {
                int err;

                err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
                if (!err)
                        continue;

                if (nofail) {
                        /*
                         * Report the state whose callback actually failed.
                         * On teardown st->state has already been lowered
                         * past it by cpuhp_next_state(), so st->state would
                         * name the wrong (possibly empty) step.
                         */
                        pr_warn("CPU %u %s state %s (%d) failed (%d)\n",
                                cpu, bringup ? "UP" : "DOWN",
                                cpuhp_get_step(state)->name,
                                state, err);
                        ret = -1;
                } else {
                        ret = err;
                        break;
                }
        }

        return ret;
}

/*
 * Run the callbacks between st->state and @target and stop at the first
 * failure. Returns 0 or the error of the failing callback.
 */
static inline int cpuhp_invoke_callback_range(bool bringup,
                                              unsigned int cpu,
                                              struct cpuhp_cpu_state *st,
                                              enum cpuhp_state target)
{
        const bool nofail = false;

        return __cpuhp_invoke_callback_range(bringup, cpu, st, target, nofail);
}

/*
 * Run the callbacks between st->state and @target without stopping on
 * failures; the result of the walk is discarded.
 */
static inline void cpuhp_invoke_callback_range_nofail(bool bringup,
                                                      unsigned int cpu,
                                                      struct cpuhp_cpu_state *st,
                                                      enum cpuhp_state target)
{
        const bool nofail = true;

        __cpuhp_invoke_callback_range(bringup, cpu, st, target, nofail);
}

/*
 * Tell whether a failed bringup of the CPU described by @st may be rolled
 * back.
 */
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
{
        /*
         * Without CONFIG_HOTPLUG_CPU a CPU cannot be taken down, because
         * takedown_cpu() and the architecture and subsystem specific
         * mechanisms do not exist. A CPU which would have to be unplugged
         * completely therefore has to stay in its current state; rollback
         * is only possible up to CPUHP_BRINGUP_CPU.
         */
        if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
                return st->state <= CPUHP_BRINGUP_CPU;

        return true;
}

static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                              enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        ret = cpuhp_invoke_callback_range(true, cpu, st, target);
        if (ret) {
                pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
                         ret, cpu, cpuhp_get_step(st->state)->name,
                         st->state);

                cpuhp_reset_state(cpu, st, prev_state);
                if (can_rollback_cpu(st))
                        WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
                                                            prev_state));
        }
        return ret;
}

/*
 * The cpu hotplug threads manage the bringup and teardown of the cpus
 */
static int cpuhp_should_run(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        return st->should_run;
}

/*
 * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
 * callbacks when a state gets [un]installed at runtime.
 *
 * Each invocation of this function by the smpboot thread does a single AP
 * state callback.
 *
 * It has 3 modes of operation:
 *  - single: runs st->cb_state
 *  - up:     runs ++st->state, while st->state < st->target
 *  - down:   runs st->state--, while st->state > st->target
 *
 * When complete or on error, should_run is cleared and the completion is fired.
 *
 * @cpu is handed on to cpuhp_invoke_callback(); the hotplug state itself is
 * looked up with this_cpu_ptr().
 */
static void cpuhp_thread_fun(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        bool bringup = st->bringup;
        enum cpuhp_state state;

        /* Being invoked without should_run set is unexpected: warn and bail */
        if (WARN_ON_ONCE(!st->should_run))
                return;

        /*
         * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
         * that if we see ->should_run we also see the rest of the state.
         */
        smp_mb();

        /*
         * The BP holds the hotplug lock, but we're now running on the AP,
         * ensure that anybody asserting the lock is held, will actually find
         * it so.
         */
        lockdep_acquire_cpus_lock();
        cpuhp_lock_acquire(bringup);

        if (st->single) {
                /* Single mode: exactly one callback, then we are done */
                state = st->cb_state;
                st->should_run = false;
        } else {
                /* Range mode: keep running for as long as there is a next state */
                st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
                if (!st->should_run)
                        goto end;
        }

        WARN_ON_ONCE(!cpuhp_is_ap_state(state));

        if (cpuhp_is_atomic_state(state)) {
                /* Atomic states have their callback invoked with interrupts off */
                local_irq_disable();
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
                local_irq_enable();

                /*
                 * STARTING/DYING must not fail!
                 */
                WARN_ON_ONCE(st->result);
        } else {
                st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
        }

        if (st->result) {
                /*
                 * If we fail on a rollback, we're up a creek without no
                 * paddle, no way forward, no way back. We lose, thanks for
                 * playing.
                 */
                WARN_ON_ONCE(st->rollback);
                /* Stop on error; the completion below reports back to the waiter */
                st->should_run = false;
        }

end:
        cpuhp_lock_release(bringup);
        lockdep_release_cpus_lock();

        /* Done (or failed): signal whoever waits in wait_for_ap_thread() */
        if (!st->should_run)
                complete_ap_thread(st, bringup);
}

/*
 * Invoke a single callback on a remote cpu.
 *
 * @cpu:     CPU to run the callback on
 * @state:   hotplug state whose callback is invoked
 * @bringup: true for the startup callback, false for the teardown callback
 * @node:    instance node handed to the callback (stored in st->node)
 *
 * Returns 0 when @cpu is not online or the callback succeeded, otherwise the
 * callback's error code.
 */
static int
cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
                         struct hlist_node *node)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int ret;

        /* Nothing to invoke on a CPU which is not online */
        if (!cpu_online(cpu))
                return 0;

        /*
         * Acquire and immediately release both directions.
         * NOTE(review): presumably lockdep-only annotations - confirm
         * against the cpuhp_lock_acquire() definition.
         */
        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        /*
         * If we are up and running, use the hotplug thread. For early calls
         * we invoke the thread function directly.
         */
        if (!st->thread)
                return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);

        st->rollback = false;
        st->last = NULL;

        /* Describe the single callback which the hotplug thread should run */
        st->node = node;
        st->bringup = bringup;
        st->cb_state = state;
        st->single = true;

        __cpuhp_kick_ap(st);

        /*
         * If we failed and did a partial, do a rollback.
         */
        if ((ret = st->result) && st->last) {
                /* Run the same state again in the opposite direction */
                st->rollback = true;
                st->bringup = !bringup;

                __cpuhp_kick_ap(st);
        }

        /*
         * Clean up the leftovers so the next hotplug operation won't use stale
         * data.
         */
        st->node = st->last = NULL;
        return ret;
}

static int cpuhp_kick_ap_work(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        enum cpuhp_state prev_state = st->state;
        int ret;

        cpuhp_lock_acquire(false);
        cpuhp_lock_release(false);

        cpuhp_lock_acquire(true);
        cpuhp_lock_release(true);

        trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
        ret = cpuhp_kick_ap(cpu, st, st->target);
        trace_cpuhp_exit(cpu, st->state, prev_state, ret);

        return ret;
}

/*
 * Description of the per-CPU hotplug threads; registered through
 * smpboot_register_percpu_thread() in cpuhp_threads_init().
 */
static struct smp_hotplug_thread cpuhp_threads = {
        /* Per-CPU slot holding the thread's task pointer */
        .store                        = &cpuhp_state.thread,
        .thread_should_run        = cpuhp_should_run,
        .thread_fn                = cpuhp_thread_fun,
        /* Thread name template; %u is presumably the CPU number - TODO confirm */
        .thread_comm                = "cpuhp/%u",
        .selfparking                = true,
};

/*
 * Initialize the done_up and done_down completions in the hotplug state of
 * every possible CPU.
 */
static __init void cpuhp_init_state(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);

                init_completion(&st->done_up);
                init_completion(&st->done_down);
        }
}

/*
 * Set up the per-CPU hotplug state, register the hotplug threads and unpark
 * the thread of the CPU this runs on. A failing registration is fatal.
 */
void __init cpuhp_threads_init(void)
{
        int err;

        cpuhp_init_state();

        err = smpboot_register_percpu_thread(&cpuhp_threads);
        BUG_ON(err);

        kthread_unpark(this_cpu_read(cpuhp_state.thread));
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Fallback used when nothing has defined arch_clear_mm_cpumask_cpu before
 * this point: clear @cpu in the cpumask of @mm.
 */
#ifndef arch_clear_mm_cpumask_cpu
#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm))
#endif

/**
 * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
 * @cpu: a CPU id
 *
 * Walk all processes, look up a task with a valid mm for each of them and
 * clear the bit of @cpu in that mm's cpumask. This sounds trivial, but there
 * are non-obvious corner cases which this function handles in a safe manner.
 *
 * The locking scheme is somewhat relaxed, therefore the function may only be
 * called for a CPU which is already offline.
 */
void clear_tasks_mm_cpumask(int cpu)
{
        struct task_struct *proc;

        /*
         * By the time this runs the CPU has been taken down and marked
         * offline, so no new task will get this CPU set in its mm mask.
         * -- Peter Zijlstra
         * That makes rcu_read_lock() sufficient here; the full-fledged
         * tasklist_lock is not needed.
         */
        WARN_ON(cpu_online(cpu));

        rcu_read_lock();
        for_each_process(proc) {
                /*
                 * The main thread may have exited while other threads of
                 * the process still own a valid mm. Look for one of those.
                 */
                struct task_struct *mm_task = find_lock_task_mm(proc);

                if (mm_task) {
                        arch_clear_mm_cpumask_cpu(cpu, mm_task->mm);
                        task_unlock(mm_task);
                }
        }
        rcu_read_unlock();
}

/*
 * Take this CPU down.
 *
 * Invoked through stop_machine_cpuslocked() from takedown_cpu(). @_param is
 * unused. Returns the negative error of __cpu_disable() or 0 on success.
 */
static int take_cpu_down(void *_param)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
        /* Never walk the callbacks further down than CPUHP_AP_OFFLINE here */
        enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
        int err, cpu = smp_processor_id();

        /* Ensure this CPU doesn't handle any more interrupts. */
        err = __cpu_disable();
        if (err < 0)
                return err;

        /*
         * Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
         * down, that the current state is CPUHP_TEARDOWN_CPU - 1.
         */
        WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));

        /*
         * Invoke the former CPU_DYING callbacks. DYING must not fail!
         */
        cpuhp_invoke_callback_range_nofail(false, cpu, st, target);

        /* Park the stopper thread */
        stop_machine_park(cpu);
        return 0;
}

/*
 * Take @cpu down: park its hotplug thread, run take_cpu_down() on it via
 * stop_machine, wait until it reports CPUHP_AP_IDLE_DEAD and then let the
 * architecture kill it.
 *
 * Returns 0 on success or the error returned by stop_machine_cpuslocked(),
 * in which case the hotplug thread has been unparked again.
 */
static int takedown_cpu(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int err;

        /* Park the smpboot threads */
        kthread_park(st->thread);

        /*
         * Prevent irq alloc/free while the dying cpu reorganizes the
         * interrupt affinities.
         */
        irq_lock_sparse();

        /*
         * So now all preempt/rcu users must observe !cpu_active().
         */
        err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu));
        if (err) {
                /* CPU refused to die */
                irq_unlock_sparse();
                /* Unpark the hotplug thread so we can rollback there */
                kthread_unpark(st->thread);
                return err;
        }
        /* take_cpu_down() succeeded, so the CPU must no longer be online */
        BUG_ON(cpu_online(cpu));

        /*
         * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed
         * all runnable tasks from the CPU, there's only the idle task left now
         * that the migration thread is done doing the stop_machine thing.
         *
         * Wait for the stop thread to go away.
         */
        wait_for_ap_thread(st, false);
        BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);

        /* Interrupts are moved away from the dying cpu, reenable alloc/free */
        irq_unlock_sparse();

        hotplug_cpu__broadcast_tick_pull(cpu);
        /* This actually kills the CPU. */
        __cpu_die(cpu);

        cpuhp_bp_sync_dead(cpu);

        lockdep_cleanup_dead_cpu(cpu, idle_thread_get(cpu));

        /*
         * Callbacks must be re-integrated right away to the RCU state machine.
         * Otherwise an RCU callback could block a further teardown function
         * waiting for its completion.
         */
        rcutree_migrate_callbacks(cpu);

        return 0;
}

/*
 * Cross-call handler used by cpuhp_report_idle_dead(): signal teardown
 * completion for the hotplug state passed in @arg.
 */
static void cpuhp_complete_idle_dead(void *arg)
{
        struct cpuhp_cpu_state *dead_st = arg;

        complete_ap_thread(dead_st, false);
}

/*
 * Report that the CPU this runs on has reached CPUHP_AP_IDLE_DEAD.
 *
 * The state must be CPUHP_AP_OFFLINE on entry. The completion which
 * takedown_cpu() waits for is fired from another, still online CPU.
 */
void cpuhp_report_idle_dead(void)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        BUG_ON(st->state != CPUHP_AP_OFFLINE);
        tick_assert_timekeeping_handover();
        rcutree_report_cpu_dead();
        st->state = CPUHP_AP_IDLE_DEAD;
        /*
         * We cannot call complete after rcutree_report_cpu_dead() so we delegate it
         * to an online cpu.
         */
        smp_call_function_single(cpumask_first(cpu_online_mask),
                                 cpuhp_complete_idle_dead, st, 0);
}

static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
                                enum cpuhp_state target)
{
        enum cpuhp_state prev_state = st->state;
        int ret = 0;

        ret = cpuhp_invoke_callback_range(false, cpu, st, target);
        if (ret) {
                pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
                         ret, cpu, cpuhp_get_step(st->state)->name,
                         st->state);

                cpuhp_reset_state(cpu, st, prev_state);

                if (st->state < prev_state)
                        WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
                                                            prev_state));
        }

        return ret;
}

/*
 * Bring @cpu down to hotplug state @target.
 *
 * Requires cpu_add_remove_lock to be held.
 *
 * @cpu:          CPU to take down
 * @tasks_frozen: stored in cpuhp_tasks_frozen for the duration of the operation
 * @target:       hotplug state to reach
 *
 * Returns 0 on success, -EBUSY when only one CPU is online, -EINVAL when
 * @cpu is not present, or the error of the failing hotplug step.
 */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
                           enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        int prev_state, ret = 0;

        /* The last online CPU cannot be taken down */
        if (num_online_cpus() == 1)
                return -EBUSY;

        if (!cpu_present(cpu))
                return -EINVAL;

        cpus_write_lock();

        cpuhp_tasks_frozen = tasks_frozen;

        prev_state = cpuhp_set_state(cpu, st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread.
         */
        if (st->state > CPUHP_TEARDOWN_CPU) {
                /* The AP thread only goes down as far as CPUHP_TEARDOWN_CPU */
                st->target = max((int)target, CPUHP_TEARDOWN_CPU);
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;

                /*
                 * We might have stopped still in the range of the AP hotplug
                 * thread. Nothing to do anymore.
                 */
                if (st->state > CPUHP_TEARDOWN_CPU)
                        goto out;

                /* Restore the real target for the remaining steps below */
                st->target = target;
        }
        /*
         * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
         * to do the further cleanups.
         */
        ret = cpuhp_down_callbacks(cpu, st, target);
        if (ret && st->state < prev_state) {
                if (st->state == CPUHP_TEARDOWN_CPU) {
                        /* Let the AP thread bring the CPU back up to prev_state */
                        cpuhp_reset_state(cpu, st, prev_state);
                        __cpuhp_kick_ap(st);
                } else {
                        WARN(1, "DEAD callback error for CPU%d", cpu);
                }
        }

out:
        cpus_write_unlock();
        arch_smt_update();
        return ret;
}

/* Arguments handed to __cpu_down_maps_locked() through work_on_cpu() */
struct cpu_down_work {
        /* CPU which is to be taken down */
        unsigned int                cpu;
        /* Hotplug state passed on to _cpu_down() */
        enum cpuhp_state        target;
};

static long __cpu_down_maps_locked(void *arg)
{
        struct cpu_down_work *work = arg;

        return _cpu_down(work->cpu, 0, work->target);
}

/*
 * Take @cpu down to @target by running _cpu_down() on another online
 * housekeeping CPU.
 *
 * Returns -EOPNOTSUPP when offlining is not supported, -EBUSY when hotplug
 * is disabled or no other suitable CPU exists, otherwise the result of the
 * offline operation.
 */
static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
{
        struct cpu_down_work work = { .cpu = cpu, .target = target, };
        unsigned int runner;

        /*
         * A platform without hotplug support is reported explicitly so
         * that it can be told apart from a transient offlining failure.
         */
        if (cpu_hotplug_offline_disabled)
                return -EOPNOTSUPP;
        if (cpu_hotplug_disabled)
                return -EBUSY;

        /*
         * The control task must not run on the CPU which is about to go
         * offline, as that could deadlock against cfs_b->period_timer.
         * At least one housekeeping CPU also has to stay online to avoid
         * an empty sched_domain span.
         */
        for_each_cpu_and(runner, cpu_online_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) {
                if (runner == work.cpu)
                        continue;

                return work_on_cpu(runner, __cpu_down_maps_locked, &work);
        }

        return -EBUSY;
}

/*
 * Take @cpu down to @target inside a cpu_maps_update_begin()/done() section.
 * Returns the result of cpu_down_maps_locked().
 */
static int cpu_down(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        cpu_maps_update_begin();
        ret = cpu_down_maps_locked(cpu, target);
        cpu_maps_update_done();

        return ret;
}

/**
 * cpu_device_down - Bring down a cpu device
 * @dev: Pointer to the cpu device to offline
 *
 * Only the device core cpu subsystem is supposed to call this function.
 *
 * Other subsystems should use remove_cpu() instead.
 *
 * Return: %0 on success or a negative errno code
 */
int cpu_device_down(struct device *dev)
{
        unsigned int cpu = dev->id;

        return cpu_down(cpu, CPUHP_OFFLINE);
}

/*
 * Offline the device of @cpu with the device hotplug lock held.
 * Returns the result of device_offline().
 */
int remove_cpu(unsigned int cpu)
{
        int err;

        lock_device_hotplug();
        err = device_offline(get_cpu_device(cpu));
        unlock_device_hotplug();

        return err;
}
EXPORT_SYMBOL_GPL(remove_cpu);

/*
 * Offline every online CPU except @primary_cpu and then disable CPU hotplug.
 *
 * @primary_cpu: CPU which is kept online. If it is not online, the first
 *               online CPU is used instead.
 *
 * Stops at the first CPU which fails to go offline; it is a BUG if more than
 * one CPU is still online afterwards.
 */
void smp_shutdown_nonboot_cpus(unsigned int primary_cpu)
{
        unsigned int cpu;
        int error;

        cpu_maps_update_begin();

        /*
         * Make certain the cpu I'm about to reboot on is online.
         *
         * This is in line with what migrate_to_reboot_cpu() already does.
         */
        if (!cpu_online(primary_cpu))
                primary_cpu = cpumask_first(cpu_online_mask);

        for_each_online_cpu(cpu) {
                if (cpu == primary_cpu)
                        continue;

                error = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (error) {
                        /*
                         * Terminate the message with a newline so that it
                         * is emitted as a complete log line, and print the
                         * unsigned CPU number with %u.
                         */
                        pr_err("Failed to offline CPU%u - error=%d\n",
                                cpu, error);
                        break;
                }
        }

        /*
         * Ensure all but the reboot CPU are offline.
         */
        BUG_ON(num_online_cpus() > 1);

        /*
         * Make sure the CPUs won't be enabled by someone else after this
         * point. Kexec will reboot to a new kernel shortly resetting
         * everything along the way.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
}

#else
#define takedown_cpu                NULL
#endif /*CONFIG_HOTPLUG_CPU*/

/**
 * notify_cpu_starting(cpu) - Invoke the callbacks on the starting CPU
 * @cpu: cpu that just started
 *
 * It must be called by the arch code on the new cpu, before the new cpu
 * enables interrupts and before the "boot" cpu returns from __cpu_up().
 */
void notify_cpu_starting(unsigned int cpu)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        /* The callbacks run here never go beyond CPUHP_AP_ONLINE */
        enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);

        rcutree_report_cpu_starting(cpu);        /* Enables RCU usage on this CPU. */
        /* Record that this CPU has been booted at least once */
        cpumask_set_cpu(cpu, &cpus_booted_once_mask);

        /*
         * STARTING must not fail!
         */
        cpuhp_invoke_callback_range_nofail(true, cpu, st, target);
}

/*
 * Called from the idle task. Wake up the controlling task which brings the
 * hotplug thread of the upcoming CPU up and then delegates the rest of the
 * online bringup to the hotplug thread.
 *
 * @state: only CPUHP_AP_ONLINE_IDLE is acted upon; any other value makes
 *         this a no-op.
 */
void cpuhp_online_idle(enum cpuhp_state state)
{
        struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);

        /* Happens for the boot cpu */
        if (state != CPUHP_AP_ONLINE_IDLE)
                return;

        cpuhp_ap_update_sync_state(SYNC_STATE_ONLINE);

        /*
         * Unpark the stopper thread before we start the idle loop (and start
         * scheduling); this ensures the stopper task is always available.
         */
        stop_machine_unpark(smp_processor_id());

        st->state = CPUHP_AP_ONLINE_IDLE;
        /* Releases the waiter in bringup_wait_for_ap_online() */
        complete_ap_thread(st, true);
}

/*
 * Bring @cpu up to hotplug state @target.
 *
 * Requires cpu_add_remove_lock to be held.
 *
 * @cpu:          CPU to bring up
 * @tasks_frozen: stored in cpuhp_tasks_frozen for the duration of the operation
 * @target:       hotplug state to reach
 *
 * Returns 0 on success or when the CPU is already at or beyond @target,
 * -EINVAL when @cpu is not present, or the error of the failing step.
 */
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
        struct task_struct *idle;
        int ret = 0;

        cpus_write_lock();

        if (!cpu_present(cpu)) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * The caller of cpu_up() might have raced with another
         * caller. Nothing to do.
         */
        if (st->state >= target)
                goto out;

        if (st->state == CPUHP_OFFLINE) {
                /* Let it fail before we try to bring the cpu up */
                idle = idle_thread_get(cpu);
                if (IS_ERR(idle)) {
                        ret = PTR_ERR(idle);
                        goto out;
                }

                /*
                 * Reset stale stack state from the last time this CPU was online.
                 */
                scs_task_reset(idle);
                kasan_unpoison_task_stack(idle);
        }

        cpuhp_tasks_frozen = tasks_frozen;

        cpuhp_set_state(cpu, st, target);
        /*
         * If the current CPU state is in the range of the AP hotplug thread,
         * then we need to kick the thread once more.
         */
        if (st->state > CPUHP_BRINGUP_CPU) {
                ret = cpuhp_kick_ap_work(cpu);
                /*
                 * The AP side has done the error rollback already. Just
                 * return the error code..
                 */
                if (ret)
                        goto out;
        }

        /*
         * Try to reach the target state. We max out on the BP at
         * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is
         * responsible for bringing it up to the target state.
         */
        target = min((int)target, CPUHP_BRINGUP_CPU);
        ret = cpuhp_up_callbacks(cpu, st, target);
out:
        cpus_write_unlock();
        arch_smt_update();
        return ret;
}

/*
 * Bring @cpu up to @target.
 *
 * Returns -EINVAL for a CPU which is not possible, the error of
 * try_online_node(), -EBUSY when hotplug is disabled, -EPERM when the CPU is
 * not bootable, otherwise the result of _cpu_up().
 */
static int cpu_up(unsigned int cpu, enum cpuhp_state target)
{
        int ret;

        if (!cpu_possible(cpu)) {
                pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
                       cpu);
                return -EINVAL;
        }

        ret = try_online_node(cpu_to_node(cpu));
        if (ret)
                return ret;

        cpu_maps_update_begin();

        if (cpu_hotplug_disabled)
                ret = -EBUSY;
        else if (!cpu_bootable(cpu))
                ret = -EPERM;
        else
                ret = _cpu_up(cpu, 0, target);

        cpu_maps_update_done();
        return ret;
}

/**
 * cpu_device_up - Bring up a cpu device
 * @dev: Pointer to the cpu device to online
 *
 * Only the device core cpu subsystem is supposed to call this function.
 *
 * Other subsystems should use add_cpu() instead.
 *
 * Return: %0 on success or a negative errno code
 */
int cpu_device_up(struct device *dev)
{
        unsigned int cpu = dev->id;

        return cpu_up(cpu, CPUHP_ONLINE);
}

/*
 * Online the device of @cpu with the device hotplug lock held.
 * Returns the result of device_online().
 */
int add_cpu(unsigned int cpu)
{
        int err;

        lock_device_hotplug();
        err = device_online(get_cpu_device(cpu));
        unlock_device_hotplug();

        return err;
}
EXPORT_SYMBOL_GPL(add_cpu);

/**
 * bringup_hibernate_cpu - Bring up the CPU that we hibernated on
 * @sleep_cpu: The cpu we hibernated on and should be brought up.
 *
 * Some architectures, arm64 for example, can hibernate on any CPU. On wake
 * up that CPU might be offline, for instance as a side effect of maxcpus=.
 *
 * Return: %0 on success or a negative errno code
 */
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
        int ret;

        /* Nothing to do if the CPU is already online */
        if (cpu_online(sleep_cpu))
                return 0;

        pr_info("Hibernated on a CPU that is offline! Bringing CPU up.\n");
        ret = cpu_up(sleep_cpu, CPUHP_ONLINE);
        if (ret)
                pr_err("Failed to bring hibernate-CPU up!\n");

        return ret;
}

/*
 * Bring the CPUs in @mask up to @target, visiting at most @ncpus of them.
 * A CPU which fails to come up is rolled back to CPUHP_OFFLINE where
 * rollback is possible.
 */
static void __init cpuhp_bringup_mask(const struct cpumask *mask, unsigned int ncpus,
                                      enum cpuhp_state target)
{
        unsigned int cpu;

        for_each_cpu(cpu, mask) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int err = cpu_up(cpu, target);

                /*
                 * On failure cpu_up() might have rolled back only as far as
                 * CPUHP_BP_KICK_AP for the final online. Finish the cleanup;
                 * this is a NOOP when the rollback was already complete.
                 */
                if (err && can_rollback_cpu(st))
                        WARN_ON(cpuhp_invoke_callback_range(false, cpu, st, CPUHP_OFFLINE));

                ncpus--;
                if (ncpus == 0)
                        break;
        }
}

#ifdef CONFIG_HOTPLUG_PARALLEL
/*
 * Whether parallel bringup should be attempted. Defaults to true; can be
 * overridden by the "cpuhp.parallel" early parameter and is cleared in
 * cpuhp_bringup_cpus_parallel() when the architecture init declines.
 */
static bool __cpuhp_parallel_bringup __ro_after_init = true;

/*
 * Parse the "cpuhp.parallel" early parameter as a boolean into
 * __cpuhp_parallel_bringup. Returns the result of kstrtobool().
 */
static int __init parallel_bringup_parse_param(char *arg)
{
        return kstrtobool(arg, &__cpuhp_parallel_bringup);
}
early_param("cpuhp.parallel", parallel_bringup_parse_param);

#ifdef CONFIG_HOTPLUG_SMT
/* SMT matters for the bringup order when more than one thread is possible */
static inline bool cpuhp_smt_aware(void)
{
        return cpu_smt_max_threads > 1;
}

/* Mask of the primary threads, which are brought up first */
static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
{
        return cpu_primary_thread_mask;
}
#else
/* Without CONFIG_HOTPLUG_SMT the bringup is never SMT aware */
static inline bool cpuhp_smt_aware(void)
{
        return false;
}
/* Without CONFIG_HOTPLUG_SMT no CPU is treated as a primary thread */
static inline const struct cpumask *cpuhp_get_primary_thread_mask(void)
{
        return cpu_none_mask;
}
#endif

/*
 * Weak default which allows parallel bringup. The return value is stored in
 * __cpuhp_parallel_bringup by cpuhp_bringup_cpus_parallel(); false disables
 * the parallel path.
 */
bool __weak arch_cpuhp_init_parallel_bringup(void)
{
        return true;
}

/*
 * On architectures which have enabled parallel bringup this invokes all BP
 * prepare states for each of the to be onlined APs first. The last state
 * sends the startup IPI to the APs. The APs proceed through the low level
 * bringup code in parallel and then wait for the control CPU to release
 * them one by one for the final onlining procedure.
 *
 * This avoids waiting for each AP to respond to the startup IPI in
 * CPUHP_BRINGUP_CPU.
 *
 * @ncpus: upper bound for the number of CPUs to bring up
 *
 * Returns true when the parallel bringup was performed, false when it is
 * disabled and the caller has to fall back to the serialized bringup.
 */
static bool __init cpuhp_bringup_cpus_parallel(unsigned int ncpus)
{
        const struct cpumask *mask = cpu_present_mask;

        /* Give the architecture the chance to veto parallel bringup */
        if (__cpuhp_parallel_bringup)
                __cpuhp_parallel_bringup = arch_cpuhp_init_parallel_bringup();
        if (!__cpuhp_parallel_bringup)
                return false;

        if (cpuhp_smt_aware()) {
                const struct cpumask *pmask = cpuhp_get_primary_thread_mask();
                static struct cpumask tmp_mask __initdata;

                /*
                 * X86 requires to prevent that SMT siblings stopped while
                 * the primary thread does a microcode update for various
                 * reasons. Bring the primary threads up first.
                 */
                cpumask_and(&tmp_mask, mask, pmask);
                /* First pass: up to CPUHP_BP_KICK_AP; second pass: fully online */
                cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_BP_KICK_AP);
                cpuhp_bringup_mask(&tmp_mask, ncpus, CPUHP_ONLINE);
                /* Account for the online CPUs */
                ncpus -= num_online_cpus();
                if (!ncpus)
                        return true;
                /* Create the mask for secondary CPUs */
                cpumask_andnot(&tmp_mask, mask, pmask);
                mask = &tmp_mask;
        }

        /* Bring the not-yet started CPUs up */
        cpuhp_bringup_mask(mask, ncpus, CPUHP_BP_KICK_AP);
        cpuhp_bringup_mask(mask, ncpus, CPUHP_ONLINE);
        return true;
}
#else
/* Stub for builds without CONFIG_HOTPLUG_PARALLEL: reports "not handled" so the caller bringup falls back to the serialized path. */
static inline bool cpuhp_bringup_cpus_parallel(unsigned int ncpus) { return false; }
#endif /* CONFIG_HOTPLUG_PARALLEL */

/*
 * Bring up the non-boot CPUs during init.
 *
 * @max_cpus: passed on as the CPU count to the bringup helpers; zero means
 *            nothing is brought up at all.
 *
 * The parallel bringup is tried first. Only if it reports that it did not
 * handle the request, all present CPUs are onlined one after the other.
 */
void __init bringup_nonboot_cpus(unsigned int max_cpus)
{
        if (max_cpus == 0)
                return;

        /* Fully serialized per CPU bringup if the parallel path declined */
        if (!cpuhp_bringup_cpus_parallel(max_cpus))
                cpuhp_bringup_mask(cpu_present_mask, max_cpus, CPUHP_ONLINE);
}

#ifdef CONFIG_PM_SLEEP_SMP
/*
 * CPUs taken offline by freeze_secondary_cpus(); thaw_secondary_cpus()
 * brings them back up and clears the mask. Allocated by alloc_frozen_cpus().
 */
static cpumask_var_t frozen_cpus;

/*
 * Take all online CPUs except one offline and record them in frozen_cpus.
 *
 * @primary: CPU which stays online, or -1 to let this function choose.
 *
 * Returns 0 on success, -EBUSY when a wakeup became pending, or the error
 * reported by _cpu_down(). CPU hotplug is disabled on return in all cases;
 * thaw_secondary_cpus() is the counterpart which enables it again.
 */
int freeze_secondary_cpus(int primary)
{
        int cpu, error = 0;

        cpu_maps_update_begin();
        if (primary != -1) {
                /* The requested CPU is only honoured while it is online */
                if (!cpu_online(primary))
                        primary = cpumask_first(cpu_online_mask);
        } else {
                /*
                 * No preference given: take the first online CPU unless it
                 * is not a timer housekeeping CPU, in which case any timer
                 * housekeeping CPU is used.
                 */
                primary = cpumask_first(cpu_online_mask);
                if (!housekeeping_cpu(primary, HK_TYPE_TIMER))
                        primary = housekeeping_any_cpu(HK_TYPE_TIMER);
        }

        /*
         * All non-boot CPUs are taken down in one go, so userspace cannot
         * race with this by using CPU hotplug at the same time.
         */
        cpumask_clear(frozen_cpus);

        pr_info("Disabling non-boot CPUs ...\n");
        for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) {
                if (!cpu_online(cpu) || cpu == primary)
                        continue;

                if (pm_wakeup_pending()) {
                        pr_info("Wakeup pending. Abort CPU freeze\n");
                        error = -EBUSY;
                        break;
                }

                trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
                error = _cpu_down(cpu, 1, CPUHP_OFFLINE);
                trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
                if (error) {
                        pr_err("Error taking CPU%d down: %d\n", cpu, error);
                        break;
                }
                /* Remember the CPU so that the thaw path brings it back */
                cpumask_set_cpu(cpu, frozen_cpus);
        }

        if (error)
                pr_err("Non-boot CPUs are not disabled\n");
        else
                BUG_ON(num_online_cpus() > 1);

        /*
         * Nobody else may enable the CPUs from here on. This is required
         * on failure as well, because every freeze_secondary_cpus() user is
         * expected to call thaw_secondary_cpus() on its failure path.
         */
        cpu_hotplug_disabled++;

        cpu_maps_update_done();
        return error;
}

/*
 * Weak no-op hook. thaw_secondary_cpus() calls it right before it starts
 * bringing the frozen CPUs back up.
 */
void __weak arch_thaw_secondary_cpus_begin(void)
{
}

/*
 * Weak no-op hook. thaw_secondary_cpus() calls it after the attempt to
 * bring all frozen CPUs back up has finished.
 */
void __weak arch_thaw_secondary_cpus_end(void)
{
}

/*
 * Counterpart of freeze_secondary_cpus(): re-enable CPU hotplug and try to
 * bring every CPU recorded in frozen_cpus back online. A CPU which fails to
 * come up is only reported; the remaining CPUs are still attempted.
 */
void thaw_secondary_cpus(void)
{
        int cpu, error;

        cpu_maps_update_begin();
        /* From here on everybody may use CPU hotplug again */
        __cpu_hotplug_enable();
        if (cpumask_empty(frozen_cpus))
                goto out;

        pr_info("Enabling non-boot CPUs ...\n");

        arch_thaw_secondary_cpus_begin();

        for_each_cpu(cpu, frozen_cpus) {
                trace_suspend_resume(TPS("CPU_ON"), cpu, true);
                error = _cpu_up(cpu, 1, CPUHP_ONLINE);
                trace_suspend_resume(TPS("CPU_ON"), cpu, false);
                if (error)
                        pr_warn("Error taking CPU%d up: %d\n", cpu, error);
                else
                        pr_info("CPU%d is up\n", cpu);
        }

        arch_thaw_secondary_cpus_end();

        cpumask_clear(frozen_cpus);
out:
        cpu_maps_update_done();
}

/*
 * Allocate the zeroed frozen_cpus mask. Registered as a core initcall.
 * Returns 0 on success and -ENOMEM when the allocation fails.
 */
static int __init alloc_frozen_cpus(void)
{
        return alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO) ? 0 : -ENOMEM;
}
core_initcall(alloc_frozen_cpus);

/*
 * While callbacks for CPU hotplug notifications are running, the state of
 * the system with respect to tasks being frozen or not, as reported by the
 * notification, must stay the same *for the whole duration* of the
 * callbacks. So the freezer must not race with regular CPU hotplug.
 *
 * This is achieved by making regular CPU hotplug and the Suspend/Hibernate
 * call paths mutually exclusive: hotplug is disabled on the PREPARE
 * notifications and enabled again on the POST notifications.
 *
 * Returns NOTIFY_OK for the handled actions and NOTIFY_DONE for all others.
 */
static int
cpu_hotplug_pm_callback(struct notifier_block *nb,
                        unsigned long action, void *ptr)
{
        if (action == PM_SUSPEND_PREPARE || action == PM_HIBERNATION_PREPARE) {
                cpu_hotplug_disable();
                return NOTIFY_OK;
        }

        if (action == PM_POST_SUSPEND || action == PM_POST_HIBERNATION) {
                cpu_hotplug_enable();
                return NOTIFY_OK;
        }

        return NOTIFY_DONE;
}


/*
 * Register cpu_hotplug_pm_callback() as PM notifier with priority 0.
 * Runs as a core initcall and always returns 0.
 */
static int __init cpu_hotplug_pm_sync_init(void)
{
        /*
         * cpu_hotplug_pm_callback has higher priority than x86
         * bsp_pm_callback which depends on cpu_hotplug_pm_callback
         * to disable cpu hotplug to avoid cpu hotplug race.
         */
        pm_notifier(cpu_hotplug_pm_callback, 0);
        return 0;
}
core_initcall(cpu_hotplug_pm_sync_init);

#endif /* CONFIG_PM_SLEEP_SMP */

/*
 * NOTE(review): presumably holds the id of the CPU the kernel booted on; it
 * is neither written nor read in this part of the file - confirm at its users.
 */
int __boot_cpu_id;

#endif /* CONFIG_SMP */

/*
 * Boot processor state steps.
 *
 * Table of hotplug steps indexed by enum cpuhp_state. Each populated entry
 * carries a name plus optional startup/teardown callbacks; entries without
 * a name are treated as free slots by cpuhp_reserve_state(). Entries marked
 * .cant_stop are rejected as targets by the sysfs "target" attribute.
 */
static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_OFFLINE] = {
                .name                        = "offline",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
#ifdef CONFIG_SMP
        [CPUHP_CREATE_THREADS]= {
                .name                        = "threads:prepare",
                .startup.single                = smpboot_create_threads,
                .teardown.single        = NULL,
                .cant_stop                = true,
        },
        [CPUHP_PERF_PREPARE] = {
                .name                        = "perf:prepare",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_RANDOM_PREPARE] = {
                .name                        = "random:prepare",
                .startup.single                = random_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_WORKQUEUE_PREP] = {
                .name                        = "workqueue:prepare",
                .startup.single                = workqueue_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_HRTIMERS_PREPARE] = {
                .name                        = "hrtimers:prepare",
                .startup.single                = hrtimers_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_SMPCFD_PREPARE] = {
                .name                        = "smpcfd:prepare",
                .startup.single                = smpcfd_prepare_cpu,
                .teardown.single        = smpcfd_dead_cpu,
        },
        [CPUHP_RELAY_PREPARE] = {
                .name                        = "relay:prepare",
                .startup.single                = relay_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_RCUTREE_PREP] = {
                .name                        = "RCU/tree:prepare",
                .startup.single                = rcutree_prepare_cpu,
                .teardown.single        = rcutree_dead_cpu,
        },
        /*
         * On the tear-down path, timers_dead_cpu() must be invoked
         * before blk_mq_queue_reinit_notify() from notify_dead(),
         * otherwise a RCU stall occurs.
         */
        [CPUHP_TIMERS_PREPARE] = {
                .name                        = "timers:prepare",
                .startup.single                = timers_prepare_cpu,
                .teardown.single        = timers_dead_cpu,
        },

#ifdef CONFIG_HOTPLUG_SPLIT_STARTUP
        /*
         * Kicks the AP alive. AP will wait in cpuhp_ap_sync_alive() until
         * the next step will release it.
         */
        [CPUHP_BP_KICK_AP] = {
                .name                        = "cpu:kick_ap",
                .startup.single                = cpuhp_kick_ap_alive,
        },

        /*
         * Waits for the AP to reach cpuhp_ap_sync_alive() and then
         * releases it for the complete bringup.
         */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = cpuhp_bringup_ap,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
#else
        /*
         * All-in-one CPU bringup state which includes the kick alive.
         */
        [CPUHP_BRINGUP_CPU] = {
                .name                        = "cpu:bringup",
                .startup.single                = bringup_cpu,
                .teardown.single        = finish_cpu,
                .cant_stop                = true,
        },
#endif
        /* Final state before CPU kills itself */
        [CPUHP_AP_IDLE_DEAD] = {
                .name                        = "idle:dead",
        },
        /*
         * Last state before CPU enters the idle loop to die. Transient state
         * for synchronization.
         */
        [CPUHP_AP_OFFLINE] = {
                .name                        = "ap:offline",
                .cant_stop                = true,
        },
        /* First state is scheduler control. Interrupts are disabled */
        [CPUHP_AP_SCHED_STARTING] = {
                .name                        = "sched:starting",
                .startup.single                = sched_cpu_starting,
                .teardown.single        = sched_cpu_dying,
        },
        [CPUHP_AP_RCUTREE_DYING] = {
                .name                        = "RCU/tree:dying",
                .startup.single                = NULL,
                .teardown.single        = rcutree_dying_cpu,
        },
        [CPUHP_AP_SMPCFD_DYING] = {
                .name                        = "smpcfd:dying",
                .startup.single                = NULL,
                .teardown.single        = smpcfd_dying_cpu,
        },
        [CPUHP_AP_HRTIMERS_DYING] = {
                .name                        = "hrtimers:dying",
                .startup.single                = hrtimers_cpu_starting,
                .teardown.single        = hrtimers_cpu_dying,
        },
        [CPUHP_AP_TICK_DYING] = {
                .name                        = "tick:dying",
                .startup.single                = NULL,
                .teardown.single        = tick_cpu_dying,
        },
        /*
         * Entry state on starting. Interrupts enabled from here on. Transient
         * state for synchronization.
         */
        [CPUHP_AP_ONLINE] = {
                .name                        = "ap:online",
        },
        /*
         * Handled on control processor until the plugged processor manages
         * this itself.
         */
        [CPUHP_TEARDOWN_CPU] = {
                .name                        = "cpu:teardown",
                .startup.single                = NULL,
                .teardown.single        = takedown_cpu,
                .cant_stop                = true,
        },

        [CPUHP_AP_SCHED_WAIT_EMPTY] = {
                .name                        = "sched:waitempty",
                .startup.single                = NULL,
                .teardown.single        = sched_cpu_wait_empty,
        },

        /* Handle smpboot threads park/unpark */
        [CPUHP_AP_SMPBOOT_THREADS] = {
                .name                        = "smpboot/threads:online",
                .startup.single                = smpboot_unpark_threads,
                .teardown.single        = smpboot_park_threads,
        },
        [CPUHP_AP_IRQ_AFFINITY_ONLINE] = {
                .name                        = "irq/affinity:online",
                .startup.single                = irq_affinity_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_PERF_ONLINE] = {
                .name                        = "perf:online",
                .startup.single                = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_AP_WATCHDOG_ONLINE] = {
                .name                        = "lockup_detector:online",
                .startup.single                = lockup_detector_online_cpu,
                .teardown.single        = lockup_detector_offline_cpu,
        },
        [CPUHP_AP_WORKQUEUE_ONLINE] = {
                .name                        = "workqueue:online",
                .startup.single                = workqueue_online_cpu,
                .teardown.single        = workqueue_offline_cpu,
        },
        [CPUHP_AP_RANDOM_ONLINE] = {
                .name                        = "random:online",
                .startup.single                = random_online_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_AP_RCUTREE_ONLINE] = {
                .name                        = "RCU/tree:online",
                .startup.single                = rcutree_online_cpu,
                .teardown.single        = rcutree_offline_cpu,
        },
#endif
        /*
         * The dynamically registered state space is here
         */

#ifdef CONFIG_SMP
        /* Last state is scheduler control setting the cpu active */
        [CPUHP_AP_ACTIVE] = {
                .name                        = "sched:active",
                .startup.single                = sched_cpu_activate,
                .teardown.single        = sched_cpu_deactivate,
        },
#endif

        /* CPU is fully up and running. */
        [CPUHP_ONLINE] = {
                .name                        = "online",
                .startup.single                = NULL,
                .teardown.single        = NULL,
        },
};

/*
 * Sanity check for callbacks: only states strictly between CPUHP_OFFLINE
 * and CPUHP_ONLINE are accepted. Returns 0 if @state is in that range and
 * -EINVAL otherwise.
 */
static int cpuhp_cb_check(enum cpuhp_state state)
{
        bool in_range = state > CPUHP_OFFLINE && state < CPUHP_ONLINE;

        return in_range ? 0 : -EINVAL;
}

/*
 * Returns a free for dynamic slot assignment of the Online state. The states
 * are protected by the cpuhp_slot_states mutex and an empty slot is identified
 * by having no name assigned.
 */
static int cpuhp_reserve_state(enum cpuhp_state state)
{
        enum cpuhp_state i, end;
        struct cpuhp_step *step;

        switch (state) {
        case CPUHP_AP_ONLINE_DYN:
                step = cpuhp_hp_states + CPUHP_AP_ONLINE_DYN;
                end = CPUHP_AP_ONLINE_DYN_END;
                break;
        case CPUHP_BP_PREPARE_DYN:
                step = cpuhp_hp_states + CPUHP_BP_PREPARE_DYN;
                end = CPUHP_BP_PREPARE_DYN_END;
                break;
        default:
                return -EINVAL;
        }

        for (i = state; i <= end; i++, step++) {
                if (!step->name)
                        return i;
        }
        WARN(1, "No more dynamic states available for CPU hotplug\n");
        return -ENOSPC;
}

/*
 * (Un)Install the callbacks of a state for further cpu hotplug operations.
 *
 * @state:          State to modify, or one of the two dynamic range
 *                  selectors when installing
 * @name:           Name of the step; NULL removes the state
 * @startup:        Startup callback, may be NULL
 * @teardown:       Teardown callback, may be NULL
 * @multi_instance: Whether the state is a multi instance state
 *
 * Returns a negative error code on failure (-EBUSY if the slot already has
 * a name, or the error of cpuhp_reserve_state()). On success it returns the
 * allocated state number when a dynamic state was reserved and 0 otherwise.
 */
static int cpuhp_store_callbacks(enum cpuhp_state state, const char *name,
                                 int (*startup)(unsigned int cpu),
                                 int (*teardown)(unsigned int cpu),
                                 bool multi_instance)
{
        bool dyn_request = state == CPUHP_AP_ONLINE_DYN ||
                           state == CPUHP_BP_PREPARE_DYN;
        struct cpuhp_step *step;
        int ret = 0;

        /*
         * A NULL name means removal of the state.
         *
         * CPUHP_AP_ONLINE_DYN and CPUHP_BP_PREPARE_DYN are themselves the
         * first states handed out from their dynamic ranges. Allocating on
         * removal would therefore pick a new (already empty) state and
         * clear that one, while the callbacks of the state which should be
         * cleared stay behind and wreck the next hotplug operation. So a
         * slot is only reserved when a name is installed.
         */
        if (name && dyn_request) {
                ret = cpuhp_reserve_state(state);
                if (ret < 0)
                        return ret;
                state = ret;
        }

        step = cpuhp_get_step(state);
        /* Installing over a slot which is in use is refused */
        if (name && step->name)
                return -EBUSY;

        step->startup.single = startup;
        step->teardown.single = teardown;
        step->name = name;
        step->multi_instance = multi_instance;
        INIT_HLIST_HEAD(&step->list);
        return ret;
}

static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
{
        return cpuhp_get_step(state)->teardown.single;
}

/*
 * Call the startup/teardown function for a step either on the AP or
 * on the current CPU.
 *
 * @cpu:     CPU for which the callback is issued
 * @state:   State whose callback is invoked
 * @bringup: true selects the startup callback, false the teardown callback
 * @node:    Instance node handed through to the invoke helper; callers pass
 *           NULL for states without instances
 *
 * Returns 0 when the step has nothing to do, otherwise the result of the
 * invoked callback. A non-zero result on teardown triggers BUG_ON().
 */
static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
                            struct hlist_node *node)
{
        struct cpuhp_step *sp = cpuhp_get_step(state);
        int ret;

        /*
         * If there's nothing to do, we are done.
         * Relies on the union for multi_instance.
         */
        if (cpuhp_step_empty(bringup, sp))
                return 0;
        /*
         * The non AP bound callbacks can fail on bringup. On teardown
         * e.g. module removal we crash for now.
         */
#ifdef CONFIG_SMP
        /* AP states go through the AP callback helper, all others run directly */
        if (cpuhp_is_ap_state(state))
                ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
        else
                ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
        ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
        BUG_ON(ret && !bringup);
        return ret;
}

/*
 * Undo a partially executed state installation after a recoverable failure
 * (used by the setup and add-instance paths).
 *
 * @failedcpu: CPU on which the startup callback failed
 * @state:     State being installed
 * @node:      Instance node, or NULL
 *
 * Only present CPUs which were visited before @failedcpu are handled.
 *
 * Note: The teardown callbacks for rollback are not allowed to fail!
 */
static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state,
                                   struct hlist_node *node)
{
        int cpu;

        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int reached = st->state;

                /* Everything from the failed CPU onwards was never started */
                if (cpu >= failedcpu)
                        break;

                /* The startup call was only invoked if the CPU got that far */
                if (reached < state)
                        continue;

                cpuhp_issue_call(cpu, state, false, node);
        }
}

/*
 * Add an instance to a multi instance state. The caller must hold the cpus
 * lock (asserted via lockdep).
 *
 * @state:  Multi instance state the node is added to
 * @node:   Instance node
 * @invoke: If true, the startup callback is invoked for the instance on
 *          every present CPU whose hotplug state is >= @state
 *
 * Returns 0 on success, -EINVAL if @state is not a multi instance state, or
 * the error of a failed startup callback. In the latter case the already
 * executed startups are rolled back (if a teardown callback exists) and the
 * node is not added.
 */
int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state,
                                          struct hlist_node *node,
                                          bool invoke)
{
        struct cpuhp_step *step;
        int cpu;
        int ret;

        lockdep_assert_cpus_held();

        step = cpuhp_get_step(state);
        if (!step->multi_instance)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        if (invoke && step->startup.multi) {
                /*
                 * Invoke the startup callback on each present cpu which
                 * has reached the state already.
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int reached = st->state;

                        if (reached < state)
                                continue;

                        ret = cpuhp_issue_call(cpu, state, true, node);
                        if (!ret)
                                continue;

                        if (step->teardown.multi)
                                cpuhp_rollback_install(cpu, state, node);
                        goto unlock;
                }
        }

        ret = 0;
        hlist_add_head(node, &step->list);
unlock:
        mutex_unlock(&cpuhp_state_mutex);
        return ret;
}

/*
 * Variant of __cpuhp_state_add_instance_cpuslocked() for callers which do
 * not hold the cpus lock: takes the cpus read lock around the call and
 * returns the result of the locked variant unchanged.
 */
int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
                               bool invoke)
{
        int ret;

        cpus_read_lock();
        ret = __cpuhp_state_add_instance_cpuslocked(state, node, invoke);
        cpus_read_unlock();
        return ret;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);

/**
 * __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
 * @state:                The state to setup
 * @name:                Name of the step
 * @invoke:                If true, the startup function is invoked for cpus where
 *                        cpu state >= @state
 * @startup:                startup callback function
 * @teardown:                teardown callback function
 * @multi_instance:        State is set up for multiple instances which get
 *                        added afterwards.
 *
 * The caller needs to hold cpus read locked while calling this function.
 * Return:
 *   On success:
 *      Positive state number if @state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN;
 *      0 for all other states
 *   On failure: proper (negative) error code
 */
int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state,
                                   const char *name, bool invoke,
                                   int (*startup)(unsigned int cpu),
                                   int (*teardown)(unsigned int cpu),
                                   bool multi_instance)
{
        int cpu, ret = 0;
        bool dynstate;

        lockdep_assert_cpus_held();

        /* A name is mandatory: a NULL name would mean removal in the store helper */
        if (cpuhp_cb_check(state) || !name)
                return -EINVAL;

        mutex_lock(&cpuhp_state_mutex);

        ret = cpuhp_store_callbacks(state, name, startup, teardown,
                                    multi_instance);

        /*
         * For a dynamic request a positive return value is the allocated
         * state. Continue with that state and treat the store as success.
         */
        dynstate = state == CPUHP_AP_ONLINE_DYN || state == CPUHP_BP_PREPARE_DYN;
        if (ret > 0 && dynstate) {
                state = ret;
                ret = 0;
        }

        /* Store failed, or there is nothing to invoke */
        if (ret || !invoke || !startup)
                goto out;

        /*
         * Try to call the startup callback for each present cpu
         * depending on the hotplug state of the cpu.
         */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int cpustate = st->state;

                if (cpustate < state)
                        continue;

                ret = cpuhp_issue_call(cpu, state, true, NULL);
                if (ret) {
                        /* Undo the earlier startups, then uninstall the state again */
                        if (teardown)
                                cpuhp_rollback_install(cpu, state, NULL);
                        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
                        goto out;
                }
        }
out:
        mutex_unlock(&cpuhp_state_mutex);
        /*
         * If the requested state is CPUHP_AP_ONLINE_DYN or CPUHP_BP_PREPARE_DYN,
         * return the dynamically allocated state in case of success.
         */
        if (!ret && dynstate)
                return state;
        return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state_cpuslocked);

/*
 * Variant of __cpuhp_setup_state_cpuslocked() for callers which do not hold
 * the cpus lock: takes the cpus read lock around the call. Arguments and
 * return value are passed through unchanged.
 */
int __cpuhp_setup_state(enum cpuhp_state state,
                        const char *name, bool invoke,
                        int (*startup)(unsigned int cpu),
                        int (*teardown)(unsigned int cpu),
                        bool multi_instance)
{
        int ret;

        cpus_read_lock();
        ret = __cpuhp_setup_state_cpuslocked(state, name, invoke, startup,
                                             teardown, multi_instance);
        cpus_read_unlock();
        return ret;
}
EXPORT_SYMBOL(__cpuhp_setup_state);

/*
 * Remove an instance from a multi instance state.
 *
 * @state:  Multi instance state the node belongs to
 * @node:   Instance node to remove
 * @invoke: If true, the teardown callback is invoked for the instance on
 *          every present CPU whose hotplug state is >= @state
 *
 * Takes the cpus read lock and cpuhp_state_mutex itself. Returns 0 on
 * success and -EINVAL if @state is not a multi instance state. An invalid
 * @state triggers BUG_ON().
 */
int __cpuhp_state_remove_instance(enum cpuhp_state state,
                                  struct hlist_node *node, bool invoke)
{
        struct cpuhp_step *step = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        if (!step->multi_instance)
                return -EINVAL;

        cpus_read_lock();
        mutex_lock(&cpuhp_state_mutex);

        if (invoke && cpuhp_get_teardown_cb(state)) {
                /*
                 * Run the teardown callback on each present cpu which has
                 * reached the state. The callback is not allowed to fail
                 * currently!
                 */
                for_each_present_cpu(cpu) {
                        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                        int reached = st->state;

                        if (reached < state)
                                continue;
                        cpuhp_issue_call(cpu, state, false, node);
                }
        }

        hlist_del(node);
        mutex_unlock(&cpuhp_state_mutex);
        cpus_read_unlock();

        return 0;
}
EXPORT_SYMBOL_GPL(__cpuhp_state_remove_instance);

/**
 * __cpuhp_remove_state_cpuslocked - Remove the callbacks for an hotplug machine state
 * @state:        The state to remove
 * @invoke:        If true, the teardown function is invoked for cpus where
 *                cpu state >= @state
 *
 * The caller needs to hold cpus read locked while calling this function.
 * The teardown callback is currently not allowed to fail. Think
 * about module removal!
 */
void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke)
{
        struct cpuhp_step *sp = cpuhp_get_step(state);
        int cpu;

        BUG_ON(cpuhp_cb_check(state));

        lockdep_assert_cpus_held();

        mutex_lock(&cpuhp_state_mutex);
        if (sp->multi_instance) {
                WARN(!hlist_empty(&sp->list),
                     "Error: Removing state %d which has instances left.\n",
                     state);
                goto remove;
        }

        if (!invoke || !cpuhp_get_teardown_cb(state))
                goto remove;

        /*
         * Call the teardown callback for each present cpu depending
         * on the hotplug state of the cpu. This function is not
         * allowed to fail currently!
         */
        for_each_present_cpu(cpu) {
                struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
                int cpustate = st->state;

                if (cpustate >= state)
                        cpuhp_issue_call(cpu, state, false, NULL);
        }
remove:
        cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
        mutex_unlock(&cpuhp_state_mutex);
}
EXPORT_SYMBOL(__cpuhp_remove_state_cpuslocked);

/*
 * Variant of __cpuhp_remove_state_cpuslocked() for callers which do not
 * hold the cpus lock: takes the cpus read lock around the call.
 */
void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
{
        cpus_read_lock();
        __cpuhp_remove_state_cpuslocked(state, invoke);
        cpus_read_unlock();
}
EXPORT_SYMBOL(__cpuhp_remove_state);

#ifdef CONFIG_HOTPLUG_SMT
/*
 * Mark the device of @cpu as offline and emit a KOBJ_OFFLINE uevent.
 * Used by cpuhp_smt_disable(), which cannot go through device_offline().
 * The result of get_cpu_device() is used without a NULL check.
 */
static void cpuhp_offline_cpu_device(unsigned int cpu)
{
        struct device *dev = get_cpu_device(cpu);

        dev->offline = true;
        /* Tell user space about the state change */
        kobject_uevent(&dev->kobj, KOBJ_OFFLINE);
}

/*
 * Mark the device of @cpu as online and emit a KOBJ_ONLINE uevent.
 * Used by cpuhp_smt_enable() after a CPU was brought up.
 * The result of get_cpu_device() is used without a NULL check.
 */
static void cpuhp_online_cpu_device(unsigned int cpu)
{
        struct device *dev = get_cpu_device(cpu);

        dev->offline = false;
        /* Tell user space about the state change */
        kobject_uevent(&dev->kobj, KOBJ_ONLINE);
}

/*
 * Take online non-primary SMT threads offline.
 *
 * @ctrlval: Value stored in cpu_smt_control if all CPUs went down
 *
 * Returns 0 on success or the error of the first CPU which failed to go
 * offline; in that case the loop stops and cpu_smt_control is left as is.
 */
int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
{
        int cpu, ret = 0;

        cpu_maps_update_begin();
        for_each_online_cpu(cpu) {
                /*
                 * Primary threads always stay. Threads which are still
                 * allowed stay as well when this is called with
                 * CPU_SMT_ENABLED, which happens when going from a higher
                 * to a lower number of SMT threads per core.
                 */
                if (topology_is_primary_thread(cpu) ||
                    (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)))
                        continue;

                ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
                if (ret)
                        break;
                /*
                 * device_offline() cannot be used here: it ends up in
                 * cpu_down(), which takes the cpu maps lock, and that lock
                 * is already held. It has to be held because this might
                 * race against in kernel abusers of the hotplug machinery
                 * (thermal management).
                 *
                 * Consequently nothing would update the device:offline
                 * state, which would leave the sysfs entry stale and
                 * prevent onlining after smt control has been changed to
                 * 'off' again. As this runs under the sysfs hotplug lock,
                 * it is properly serialized against the regular offline
                 * usage.
                 */
                cpuhp_offline_cpu_device(cpu);
        }
        if (ret == 0)
                cpu_smt_control = ctrlval;
        cpu_maps_update_done();
        return ret;
}

/* Check if the core a CPU belongs to is online */
#if !defined(topology_is_core_online)
/* Fallback used when topology_is_core_online is not defined as a macro: every core counts as online. */
static inline bool topology_is_core_online(unsigned int cpu)
{
        return true;
}
#endif

/*
 * Set cpu_smt_control to CPU_SMT_ENABLED and bring up the present CPUs which
 * are eligible.
 *
 * Returns 0 on success or the error of the first CPU which failed to come
 * up; the loop stops at that CPU. cpu_smt_control is set before the loop
 * and is not reverted on failure.
 */
int cpuhp_smt_enable(void)
{
        int cpu, ret = 0;

        cpu_maps_update_begin();
        cpu_smt_control = CPU_SMT_ENABLED;
        for_each_present_cpu(cpu) {
                /*
                 * Nothing to do for CPUs which are online already, which
                 * sit on an offline node, which are not an allowed SMT
                 * thread or whose core is not online.
                 */
                if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)) ||
                    !cpu_smt_thread_allowed(cpu) || !topology_is_core_online(cpu))
                        continue;

                ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
                if (ret)
                        break;
                /* See comment in cpuhp_smt_disable() */
                cpuhp_online_cpu_device(cpu);
        }
        cpu_maps_update_done();
        return ret;
}
#endif

#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
/*
 * sysfs show handler of the read-only "state" attribute: prints the current
 * hotplug state number of the CPU identified by dev->id as a decimal
 * followed by a newline.
 */
static ssize_t state_show(struct device *dev,
                          struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sprintf(buf, "%d\n", st->state);
}
static DEVICE_ATTR_RO(state);

/*
 * sysfs store handler of the "target" attribute: move the CPU identified by
 * dev->id to the hotplug state written by user space.
 *
 * The input is parsed as a decimal integer. With
 * CONFIG_CPU_HOTPLUG_STATE_CONTROL every state from CPUHP_OFFLINE to
 * CPUHP_ONLINE is accepted, otherwise only those two. States without a name
 * or marked cant_stop are rejected.
 *
 * Returns @count on success or a negative error code.
 */
static ssize_t target_store(struct device *dev, struct device_attribute *attr,
                            const char *buf, size_t count)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        int target, ret;

        ret = kstrtoint(buf, 10, &target);
        if (ret)
                return ret;

#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL
        if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE)
                return -EINVAL;
#else
        if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE)
                return -EINVAL;
#endif

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        /* The step table is only inspected with the state mutex held */
        mutex_lock(&cpuhp_state_mutex);
        step = cpuhp_get_step(target);
        if (!step->name || step->cant_stop)
                ret = -EINVAL;
        else
                ret = 0;
        mutex_unlock(&cpuhp_state_mutex);
        if (ret)
                goto out;

        if (st->state < target)
                ret = cpu_up(dev->id, target);
        else if (st->state > target)
                ret = cpu_down(dev->id, target);
        else if (WARN_ON(st->target != target))
                st->target = target;
out:
        unlock_device_hotplug();
        if (ret)
                return ret;
        return count;
}

/*
 * sysfs show handler of the "target" attribute: prints the target state
 * number of the CPU identified by dev->id as a decimal followed by a newline.
 */
static ssize_t target_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sprintf(buf, "%d\n", st->target);
}
static DEVICE_ATTR_RW(target);

/*
 * sysfs "fail" store: record in st->fail the hotplug state parsed from @buf
 * for the CPU identified by dev->id.
 *
 * Writing CPUHP_INVALID is always accepted. Any other value must lie in
 * [CPUHP_OFFLINE, CPUHP_ONLINE] and pass the checks below.
 *
 * Returns @count on success or a negative errno.
 */
static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
                          const char *buf, size_t count)
{
        struct cpuhp_cpu_state *cpu_st = per_cpu_ptr(&cpuhp_state, dev->id);
        struct cpuhp_step *step;
        int fail, err;

        err = kstrtoint(buf, 10, &fail);
        if (err)
                return err;

        if (fail != CPUHP_INVALID) {
                if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
                        return -EINVAL;

                /* STARTING/DYING callbacks cannot be made to fail. */
                if (cpuhp_is_atomic_state(fail))
                        return -EINVAL;

                /*
                 * DEAD callbacks cannot fail, and neither can
                 * CPUHP_BRINGUP_CPU during hotunplug: it triggers STARTING
                 * callbacks, so a failure there would hinder rollback.
                 */
                if (fail <= CPUHP_BRINGUP_CPU &&
                    cpu_st->state > CPUHP_BRINGUP_CPU)
                        return -EINVAL;

                /* A state without any callback cannot fail either. */
                mutex_lock(&cpuhp_state_mutex);
                step = cpuhp_get_step(fail);
                if (!(step->startup.single || step->teardown.single))
                        err = -EINVAL;
                mutex_unlock(&cpuhp_state_mutex);
                if (err)
                        return err;
        }

        cpu_st->fail = fail;
        return count;
}

/*
 * sysfs "fail" show: print the st->fail value of the CPU identified by
 * dev->id, followed by a newline.
 *
 * Formats with sysfs_emit(), as control_show() and active_show() in this
 * file already do, instead of a raw sprintf() into the sysfs buffer.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t fail_show(struct device *dev,
                         struct device_attribute *attr, char *buf)
{
        struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);

        return sysfs_emit(buf, "%d\n", st->fail);
}

static DEVICE_ATTR_RW(fail);

/*
 * Per-CPU hotplug attributes ("state", "target", "fail"). The group below
 * is created under the name "hotplug" on each CPU device by
 * cpuhp_sysfs_init().
 */
static struct attribute *cpuhp_cpu_attrs[] = {
        &dev_attr_state.attr,
        &dev_attr_target.attr,
        &dev_attr_fail.attr,
        NULL
};

static const struct attribute_group cpuhp_cpu_attr_group = {
        .attrs = cpuhp_cpu_attrs,
        .name = "hotplug",
};

/*
 * sysfs "states" show: list every hotplug step that has a name, one per
 * line, formatted as "<state number>: <name>".
 *
 * The step table is walked with cpuhp_state_mutex held. Each line is
 * appended with sysfs_emit_at(), which is aware of the sysfs buffer size,
 * so a long list is truncated instead of being written past the end of
 * @buf as the previous unbounded sprintf() accumulation could do.
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t states_show(struct device *dev,
                                 struct device_attribute *attr, char *buf)
{
        int i, len = 0;

        mutex_lock(&cpuhp_state_mutex);
        for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) {
                struct cpuhp_step *sp = cpuhp_get_step(i);

                if (sp->name)
                        len += sysfs_emit_at(buf, len, "%3d: %s\n", i, sp->name);
        }
        mutex_unlock(&cpuhp_state_mutex);
        return len;
}
static DEVICE_ATTR_RO(states);

/*
 * Root-level hotplug attributes (currently only "states"). The group below
 * is created under the name "hotplug" on the CPU subsystem root device by
 * cpuhp_sysfs_init().
 */
static struct attribute *cpuhp_cpu_root_attrs[] = {
        &dev_attr_states.attr,
        NULL
};

static const struct attribute_group cpuhp_cpu_root_attr_group = {
        .attrs = cpuhp_cpu_root_attrs,
        .name = "hotplug",
};

#ifdef CONFIG_HOTPLUG_SMT

/*
 * Check whether @threads is an acceptable SMT thread count.
 *
 * With CONFIG_SMT_NUM_THREADS_DYNAMIC any value from 1 up to
 * cpu_smt_max_threads is valid; otherwise only the two extremes
 * (1 or cpu_smt_max_threads) are.
 */
static bool cpu_smt_num_threads_valid(unsigned int threads)
{
        if (!IS_ENABLED(CONFIG_SMT_NUM_THREADS_DYNAMIC))
                return threads == 1 || threads == cpu_smt_max_threads;

        return threads >= 1 && threads <= cpu_smt_max_threads;
}

/*
 * Backend of the SMT "control" store: translate the written string into a
 * control value plus a thread count, and apply it.
 *
 * Accepted input is "on", "off", "forceoff", or a decimal thread count that
 * is either 1 or passes cpu_smt_num_threads_valid().
 *
 * Returns @count on success; -EPERM when SMT is already force-disabled,
 * -ENODEV when SMT is not supported, -EINVAL for unrecognised input, or the
 * error returned by lock_device_hotplug_sysfs() / the enable or disable
 * helper.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        int ctrlval, ret, num_threads, orig_threads;
        bool force_off;

        if (cpu_smt_control == CPU_SMT_FORCE_DISABLED)
                return -EPERM;

        if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
                return -ENODEV;

        if (sysfs_streq(buf, "on")) {
                ctrlval = CPU_SMT_ENABLED;
                num_threads = cpu_smt_max_threads;
        } else if (sysfs_streq(buf, "off")) {
                ctrlval = CPU_SMT_DISABLED;
                num_threads = 1;
        } else if (sysfs_streq(buf, "forceoff")) {
                ctrlval = CPU_SMT_FORCE_DISABLED;
                num_threads = 1;
        } else if (kstrtoint(buf, 10, &num_threads) == 0) {
                /* Numeric input: 1 means "off", other valid counts mean "on". */
                if (num_threads == 1)
                        ctrlval = CPU_SMT_DISABLED;
                else if (cpu_smt_num_threads_valid(num_threads))
                        ctrlval = CPU_SMT_ENABLED;
                else
                        return -EINVAL;
        } else {
                return -EINVAL;
        }

        ret = lock_device_hotplug_sysfs();
        if (ret)
                return ret;

        /*
         * The new thread count is published before the helpers run and is
         * not restored here if they fail.
         * NOTE(review): presumably the helpers consult cpu_smt_num_threads;
         * confirm against cpuhp_smt_enable()/cpuhp_smt_disable().
         */
        orig_threads = cpu_smt_num_threads;
        cpu_smt_num_threads = num_threads;

        /* "forceoff" requested while the control value is something else. */
        force_off = ctrlval != cpu_smt_control && ctrlval == CPU_SMT_FORCE_DISABLED;

        /* Nothing is called when the count is unchanged and !force_off. */
        if (num_threads > orig_threads)
                ret = cpuhp_smt_enable();
        else if (num_threads < orig_threads || force_off)
                ret = cpuhp_smt_disable(ctrlval);

        unlock_device_hotplug();
        return ret ? ret : count;
}

#else /* !CONFIG_HOTPLUG_SMT */
/*
 * Stub used when CONFIG_HOTPLUG_SMT is not set: every write to the SMT
 * "control" attribute fails with -ENODEV.
 */
static ssize_t
__store_smt_control(struct device *dev, struct device_attribute *attr,
                    const char *buf, size_t count)
{
        return -ENODEV;
}
#endif /* CONFIG_HOTPLUG_SMT */

/*
 * Text shown by control_show() for each cpu_smt_control value; the table is
 * indexed directly by that value.
 */
static const char *smt_states[] = {
        [CPU_SMT_ENABLED]                = "on",
        [CPU_SMT_DISABLED]                = "off",
        [CPU_SMT_FORCE_DISABLED]        = "forceoff",
        [CPU_SMT_NOT_SUPPORTED]                = "notsupported",
        [CPU_SMT_NOT_IMPLEMENTED]        = "notimplemented",
};

/*
 * sysfs "control" show for the SMT group.
 *
 * With CONFIG_HOTPLUG_SMT, when SMT is enabled but fewer than
 * cpu_smt_max_threads threads are in use, the thread count is printed.
 * In every other case the smt_states[] name of the current control value
 * is printed ("on" when all threads are enabled).
 *
 * Returns the number of bytes written to @buf.
 */
static ssize_t control_show(struct device *dev,
                            struct device_attribute *attr, char *buf)
{
        const char *name = smt_states[cpu_smt_control];

#ifdef CONFIG_HOTPLUG_SMT
        if (cpu_smt_control != CPU_SMT_ENABLED ||
            cpu_smt_num_threads == cpu_smt_max_threads)
                return sysfs_emit(buf, "%s\n", name);

        /* Enabled, but only partially: report the number of threads. */
        return sysfs_emit(buf, "%d\n", cpu_smt_num_threads);
#else
        return sysfs_emit(buf, "%s\n", name);
#endif
}

/*
 * sysfs "control" store for the SMT group: forwards the write unchanged to
 * __store_smt_control() and returns its result.
 */
static ssize_t control_store(struct device *dev, struct device_attribute *attr,
                             const char *buf, size_t count)
{
        return __store_smt_control(dev, attr, buf, count);
}
static DEVICE_ATTR_RW(control);

/*
 * sysfs "active" show for the SMT group: prints the value returned by
 * sched_smt_active() as a decimal, followed by a newline.
 */
static ssize_t active_show(struct device *dev,
                           struct device_attribute *attr, char *buf)
{
        return sysfs_emit(buf, "%d\n", sched_smt_active());
}
static DEVICE_ATTR_RO(active);

/*
 * SMT attributes ("control", "active"). The group below is created under
 * the name "smt" on the CPU subsystem root device by cpu_smt_sysfs_init().
 */
static struct attribute *cpuhp_smt_attrs[] = {
        &dev_attr_control.attr,
        &dev_attr_active.attr,
        NULL
};

static const struct attribute_group cpuhp_smt_attr_group = {
        .attrs = cpuhp_smt_attrs,
        .name = "smt",
};

/*
 * Create the "smt" attribute group on the CPU subsystem root device.
 *
 * Returns -ENODEV when the subsystem has no root device, otherwise the
 * result of sysfs_create_group(). The reference taken on the root device
 * is dropped before returning.
 */
static int __init cpu_smt_sysfs_init(void)
{
        struct device *root = bus_get_dev_root(&cpu_subsys);
        int err;

        if (!root)
                return -ENODEV;

        err = sysfs_create_group(&root->kobj, &cpuhp_smt_attr_group);
        put_device(root);
        return err;
}

/*
 * Register the hotplug sysfs interface: the SMT group, the root "hotplug"
 * group (when the CPU subsystem has a root device), and the per-CPU
 * "hotplug" group on every possible CPU that has a device.
 *
 * Stops at, and returns, the first error; groups created before the
 * failure are left in place. Returns 0 on success.
 */
static int __init cpuhp_sysfs_init(void)
{
        struct device *root;
        int cpu, err;

        err = cpu_smt_sysfs_init();
        if (err)
                return err;

        root = bus_get_dev_root(&cpu_subsys);
        if (root) {
                err = sysfs_create_group(&root->kobj, &cpuhp_cpu_root_attr_group);
                put_device(root);
                if (err)
                        return err;
        }

        for_each_possible_cpu(cpu) {
                struct device *cpu_dev = get_cpu_device(cpu);

                /* Possible CPUs without a device are silently skipped. */
                if (cpu_dev) {
                        err = sysfs_create_group(&cpu_dev->kobj,
                                                 &cpuhp_cpu_attr_group);
                        if (err)
                                return err;
                }
        }
        return 0;
}
device_initcall(cpuhp_sysfs_init);
#endif /* CONFIG_SYSFS && CONFIG_HOTPLUG_CPU */

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents the single-bit values 1 << nr for all NR_CPUS bits.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 *
 * Row x+1 has 1UL << x in its first word and zeroes elsewhere, for x in
 * [0, BITS_PER_LONG).
 */

/* cpu_bit_bitmap[0] is empty - so we can back into it */
/* Each MASK_DECLARE_n(x) emits the initialisers for n consecutive rows. */
#define MASK_DECLARE_1(x)        [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)        MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)        MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)        MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

        MASK_DECLARE_8(0),        MASK_DECLARE_8(8),
        MASK_DECLARE_8(16),        MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
        /* Rows for bits 32..63 exist only when a long is wider than 32 bits. */
        MASK_DECLARE_8(32),        MASK_DECLARE_8(40),
        MASK_DECLARE_8(48),        MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

/* Constant bitmap of NR_CPUS bits, initialised from CPU_BITS_ALL. */
const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

/*
 * Possible-CPU mask. With CONFIG_INIT_ALL_POSSIBLE it starts out as
 * CPU_BITS_ALL; otherwise it has no initialiser here and is filled in at
 * run time (see init_cpu_possible() and boot_cpu_init() in this file).
 */
#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __ro_after_init
        = {CPU_BITS_ALL};
#else
struct cpumask __cpu_possible_mask __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);

/* Online-CPU mask; updated by set_cpu_online(). */
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);

struct cpumask __cpu_enabled_mask __read_mostly;
EXPORT_SYMBOL(__cpu_enabled_mask);

/* Present-CPU mask; bulk-initialised by init_cpu_present(). */
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);

struct cpumask __cpu_active_mask __read_mostly;
EXPORT_SYMBOL(__cpu_active_mask);

struct cpumask __cpu_dying_mask __read_mostly;
EXPORT_SYMBOL(__cpu_dying_mask);

/* Counter kept in step with __cpu_online_mask by set_cpu_online(). */
atomic_t __num_online_cpus __read_mostly;
EXPORT_SYMBOL(__num_online_cpus);

/* Overwrite the present-CPU mask with a copy of @src. */
void init_cpu_present(const struct cpumask *src)
{
        cpumask_copy(&__cpu_present_mask, src);
}

/* Overwrite the possible-CPU mask with a copy of @src. */
void init_cpu_possible(const struct cpumask *src)
{
        cpumask_copy(&__cpu_possible_mask, src);
}

/*
 * Set or clear @cpu in the online mask and keep __num_online_cpus in step.
 *
 * The counter only changes when the mask bit actually flips, so repeated
 * calls with the same @online value do not skew it.
 */
void set_cpu_online(unsigned int cpu, bool online)
{
        /*
         * The counter is updated with atomic_inc/dec() because the reboot
         * and kexec code (ab)use this function from IPI/NMI broadcasts when
         * shutting down CPUs. Regular CPU hotplug callers are properly
         * serialized.
         *
         * Note that __num_online_cpus being an atomic_t does not protect
         * readers which are not serialized against concurrent hotplug
         * operations.
         */
        if (!online) {
                if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask))
                        atomic_dec(&__num_online_cpus);
                return;
        }

        if (!cpumask_test_and_set_cpu(cpu, &__cpu_online_mask))
                atomic_inc(&__num_online_cpus);
}

/*
 * Activate the first processor.
 *
 * Marks the CPU this runs on as online, active, present and possible, and
 * on SMP builds records its number in __boot_cpu_id.
 */
void __init boot_cpu_init(void)
{
        int cpu = smp_processor_id();

        /* Mark the boot cpu "present", "online" etc for SMP and UP case */
        set_cpu_online(cpu, true);
        set_cpu_active(cpu, true);
        set_cpu_present(cpu, true);
        set_cpu_possible(cpu, true);

#ifdef CONFIG_SMP
        __boot_cpu_id = cpu;
#endif
}

/*
 * Must be called _AFTER_ setting up the per_cpu areas
 *
 * Initialises the per-CPU hotplug state of the CPU this runs on: state and
 * target are both set to CPUHP_ONLINE. On SMP builds the CPU is also added
 * to cpus_booted_once_mask and its ap_sync_state is set to
 * SYNC_STATE_ONLINE.
 */
void __init boot_cpu_hotplug_init(void)
{
#ifdef CONFIG_SMP
        cpumask_set_cpu(smp_processor_id(), &cpus_booted_once_mask);
        atomic_set(this_cpu_ptr(&cpuhp_state.ap_sync_state), SYNC_STATE_ONLINE);
#endif
        this_cpu_write(cpuhp_state.state, CPUHP_ONLINE);
        this_cpu_write(cpuhp_state.target, CPUHP_ONLINE);
}

#ifdef CONFIG_CPU_MITIGATIONS
/*
 * These are used for a global "mitigations=" cmdline option for toggling
 * optional CPU mitigations.
 */
enum cpu_mitigations {
        CPU_MITIGATIONS_OFF,            /* selected by "mitigations=off" */
        CPU_MITIGATIONS_AUTO,           /* selected by "mitigations=auto" */
        CPU_MITIGATIONS_AUTO_NOSMT,     /* selected by "mitigations=auto,nosmt" */
};

/* Current selection; CPU_MITIGATIONS_AUTO unless the command line changes it. */
static enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO;

/*
 * Parse the "mitigations=" early parameter and record the selection in
 * cpu_mitigations.
 *
 * @arg: text following "mitigations="; NULL when the option was given
 *       without a value.
 *
 * Recognised values are "off", "auto" and "auto,nosmt". Any other value is
 * reported with pr_crit() and leaves the current setting untouched.
 *
 * Returns 0 for any non-NULL @arg, or -EINVAL when no value was supplied.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        /*
         * Early parameter handlers receive NULL for a bare "mitigations"
         * with no "=value"; reject it rather than passing NULL to strcmp().
         */
        if (!arg)
                return -EINVAL;

        if (!strcmp(arg, "off"))
                cpu_mitigations = CPU_MITIGATIONS_OFF;
        else if (!strcmp(arg, "auto"))
                cpu_mitigations = CPU_MITIGATIONS_AUTO;
        else if (!strcmp(arg, "auto,nosmt"))
                cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT;
        else
                pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n",
                        arg);

        return 0;
}

/* mitigations=off: returns true when cpu_mitigations is CPU_MITIGATIONS_OFF. */
bool cpu_mitigations_off(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_OFF;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_off);

/*
 * mitigations=auto,nosmt: returns true when cpu_mitigations is
 * CPU_MITIGATIONS_AUTO_NOSMT.
 */
bool cpu_mitigations_auto_nosmt(void)
{
        return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT;
}
EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt);
#else
/*
 * Variant used when CONFIG_CPU_MITIGATIONS is not set: @arg is ignored, a
 * critical message is logged, and 0 is returned.
 */
static int __init mitigations_parse_cmdline(char *arg)
{
        pr_crit("Kernel compiled without mitigations, ignoring 'mitigations'; system may still be vulnerable\n");
        return 0;
}
#endif
/* Hook whichever mitigations_parse_cmdline() was built to "mitigations". */
early_param("mitigations", mitigations_parse_cmdline);









































 1506 


 1498 



 1506 
















 1498 









 1504 

































































 1498 








 1506 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// SPDX-License-Identifier: GPL-2.0

#include <linux/compiler.h>
#include <linux/context_tracking.h>
#include <linux/errno.h>
#include <linux/nospec.h>
#include <linux/ptrace.h>
#include <linux/randomize_kstack.h>
#include <linux/syscalls.h>

#include <asm/debug-monitors.h>
#include <asm/exception.h>
#include <asm/fpsimd.h>
#include <asm/syscall.h>
#include <asm/thread_info.h>
#include <asm/unistd.h>
#include <asm/unistd_compat_32.h>

/* Handlers called by do_ni_syscall(); their definitions are not in this file. */
long compat_arm_syscall(struct pt_regs *regs, int scno);
long sys_ni_syscall(void);

/*
 * Handle a syscall number that is outside the syscall table.
 *
 * Compat tasks get a first chance through compat_arm_syscall(); any result
 * other than -ENOSYS is returned as is. Everything else ends up in
 * sys_ni_syscall().
 */
static long do_ni_syscall(struct pt_regs *regs, int scno)
{
        long compat_ret;

        if (!is_compat_task())
                return sys_ni_syscall();

        compat_ret = compat_arm_syscall(regs, scno);
        if (compat_ret == -ENOSYS)
                return sys_ni_syscall();

        return compat_ret;
}

/* Call @syscall_fn with @regs and return its result. */
static long __invoke_syscall(struct pt_regs *regs, syscall_fn_t syscall_fn)
{
        return syscall_fn(regs);
}

/*
 * Dispatch one system call and store its result in @regs.
 *
 * @regs:          saved user register state, handed to the handler
 * @scno:          syscall number to dispatch
 * @sc_nr:         bound that @scno must be below to index @syscall_table
 * @syscall_table: table of handlers
 *
 * Numbers at or above @sc_nr are routed to do_ni_syscall(). The call is
 * bracketed by add_random_kstack_offset() before and
 * choose_random_kstack_offset() after.
 */
static void invoke_syscall(struct pt_regs *regs, unsigned int scno,
                           unsigned int sc_nr,
                           const syscall_fn_t syscall_table[])
{
        long ret;

        add_random_kstack_offset();

        if (scno < sc_nr) {
                syscall_fn_t syscall_fn;
                /* Index is range-checked above, then passed through array_index_nospec(). */
                syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)];
                ret = __invoke_syscall(regs, syscall_fn);
        } else {
                ret = do_ni_syscall(regs, scno);
        }

        syscall_set_return_value(current, regs, 0, ret);

        /*
         * This value will get limited by KSTACK_OFFSET_MAX(), which is 10
         * bits. The actual entropy will be further reduced by the compiler
         * when applying stack alignment constraints: the AAPCS mandates a
         * 16-byte aligned SP at function boundaries, which will remove the
         * 4 low bits from any entropy chosen here.
         *
         * The resulting 6 bits of entropy is seen in SP[9:4].
         */
        choose_random_kstack_offset(get_random_u16());
}

/*
 * Return true when any _TIF_SYSCALL_WORK bit is set in @flags. The test is
 * annotated unlikely().
 */
static inline bool has_syscall_work(unsigned long flags)
{
        return unlikely(flags & _TIF_SYSCALL_WORK);
}

/*
 * Common SVC handling shared by do_el0_svc() and do_el0_svc_compat().
 *
 * @regs:          saved user register state
 * @scno:          syscall number supplied by the caller
 * @sc_nr:         bound used to range-check the number against the table
 * @syscall_table: table to dispatch through
 *
 * Records orig_x0 and the syscall number in @regs, bails out early with
 * -ERESTARTNOINTR if an asynchronous MTE tag check fault is pending, runs
 * the entry trace hook when syscall work is flagged, invokes the syscall,
 * and finally runs the exit trace hook when required.
 */
static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr,
                           const syscall_fn_t syscall_table[])
{
        unsigned long flags = read_thread_flags();

        regs->orig_x0 = regs->regs[0];
        regs->syscallno = scno;

        /*
         * BTI note:
         * The architecture does not guarantee that SPSR.BTYPE is zero
         * on taking an SVC, so we could return to userspace with a
         * non-zero BTYPE after the syscall.
         *
         * This shouldn't matter except when userspace is explicitly
         * doing something stupid, such as setting PROT_BTI on a page
         * that lacks conforming BTI/PACIxSP instructions, falling
         * through from one executable page to another with differing
         * PROT_BTI, or messing with BTYPE via ptrace: in such cases,
         * userspace should not be surprised if a SIGILL occurs on
         * syscall return.
         *
         * So, don't touch regs->pstate & PSR_BTYPE_MASK here.
         * (Similarly for HVC and SMC elsewhere.)
         */

        if (flags & _TIF_MTE_ASYNC_FAULT) {
                /*
                 * Process the asynchronous tag check fault before the actual
                 * syscall. do_notify_resume() will send a signal to userspace
                 * before the syscall is restarted.
                 */
                syscall_set_return_value(current, regs, -ERESTARTNOINTR, 0);
                return;
        }

        if (has_syscall_work(flags)) {
                /*
                 * The de-facto standard way to skip a system call using ptrace
                 * is to set the system call to -1 (NO_SYSCALL) and set x0 to a
                 * suitable error code for consumption by userspace. However,
                 * this cannot be distinguished from a user-issued syscall(-1)
                 * and so we must set x0 to -ENOSYS here in case the tracer doesn't
                 * issue the skip and we fall into trace_exit with x0 preserved.
                 *
                 * This is slightly odd because it also means that if a tracer
                 * sets the system call number to -1 but does not initialise x0,
                 * then x0 will be preserved for all system calls apart from a
                 * user-issued syscall(-1). However, requesting a skip and not
                 * setting the return value is unlikely to do anything sensible
                 * anyway.
                 */
                if (scno == NO_SYSCALL)
                        syscall_set_return_value(current, regs, -ENOSYS, 0);
                /* The entry hook's return value replaces the syscall number. */
                scno = syscall_trace_enter(regs);
                if (scno == NO_SYSCALL)
                        goto trace_exit;
        }

        invoke_syscall(regs, scno, sc_nr, syscall_table);

        /*
         * The tracing status may have changed under our feet, so we have to
         * check again. However, if we were tracing entry, then we always trace
         * exit regardless, as the old entry assembly did.
         */
        if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) {
                flags = read_thread_flags();
                if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP))
                        return;
        }

trace_exit:
        syscall_trace_exit(regs);
}

/*
 * Native SVC entry: the syscall number is taken from regs->regs[8] and
 * dispatched through sys_call_table, bounded by __NR_syscalls.
 */
void do_el0_svc(struct pt_regs *regs)
{
        el0_svc_common(regs, regs->regs[8], __NR_syscalls, sys_call_table);
}

#ifdef CONFIG_COMPAT
/*
 * Compat SVC entry: the syscall number is taken from regs->regs[7] and
 * dispatched through compat_sys_call_table, bounded by
 * __NR_compat32_syscalls.
 */
void do_el0_svc_compat(struct pt_regs *regs)
{
        el0_svc_common(regs, regs->regs[7], __NR_compat32_syscalls,
                       compat_sys_call_table);
}
#endif














































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
// SPDX-License-Identifier: GPL-2.0+
/*
 * ext4_jbd2.h
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
 *
 * Copyright 1998--1999 Red Hat corp --- All Rights Reserved
 *
 * Ext4-specific journaling extensions.
 */

#ifndef _EXT4_JBD2_H
#define _EXT4_JBD2_H

#include <linux/fs.h>
#include <linux/jbd2.h>
#include "ext4.h"

/* The s_journal field of the ext4 superblock info for @inode's filesystem. */
#define EXT4_JOURNAL(inode)        (EXT4_SB((inode)->i_sb)->s_journal)

/* Define the number of blocks we need to account to a transaction to
 * modify one block of data.
 *
 * We may have to touch one inode, one bitmap buffer, up to three
 * indirection blocks, the group and superblock summaries, and the data
 * block to complete the transaction.
 *
 * For an extents-enabled fs we may have to allocate and modify up to
 * 5 levels of tree and the data block (for each of these we need bitmap +
 * group summaries), the root which is stored in the inode, and the sb.
 *
 * Evaluates to 20 when the extents feature is set on @sb, 8 otherwise.
 */

#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)                                \
        (ext4_has_feature_extents(sb) ? 20U : 8U)

/* Extended attribute operations touch at most two data buffers,
 * two bitmap buffers, and two group summaries (six blocks in total), in
 * addition to the inode and the superblock, which are already accounted
 * for. */

#define EXT4_XATTR_TRANS_BLOCKS                6U

/* Define the minimum size for a transaction which modifies data.  This
 * needs to take into account the fact that we may end up modifying two
 * quota files too (one for the group, one for the user quota).  The
 * superblock only gets updated once, of course, so don't bother
 * counting that again for the quota updates.
 *
 * NOTE(review): the "- 2" presumably removes blocks that would otherwise
 * be counted twice when the single-data and xattr figures are added;
 * confirm which two. */

#define EXT4_DATA_TRANS_BLOCKS(sb)        (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
                                         EXT4_XATTR_TRANS_BLOCKS - 2 + \
                                         EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/*
 * Define the number of metadata blocks we need to account to modify data.
 *
 * This includes the super block, inode block, quota blocks and xattr blocks.
 */
#define EXT4_META_TRANS_BLOCKS(sb)        (EXT4_XATTR_TRANS_BLOCKS + \
                                        EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))

/* Define an arbitrary limit for the amount of data we will anticipate
 * writing to any given transaction.  For unbounded transactions such as
 * write(2) and truncate(2) we can write more than this, but we always
 * start off at the maximum transaction size and grow the transaction
 * optimistically as we go. */

#define EXT4_MAX_TRANS_DATA                64U

/* We break up a large truncate or write transaction once the handle's
 * buffer credits get this low: at that point we need either to extend the
 * transaction or to start a new one.  Reserve enough space here for
 * inode, bitmap, superblock, group and indirection updates for at least
 * one block, plus two quota updates.  Quota allocations are not
 * needed. */

#define EXT4_RESERVE_TRANS_BLOCKS        12U

/*
 * Number of credits needed if we need to insert an entry into a
 * directory.  For each new index block, we need 4 blocks (old index
 * block, new index block, bitmap block, bg summary).  For normal
 * htree directories there are 2 levels; if the largedir feature is
 * enabled there are 3 levels (3 * 4 = 12).
 */
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS        12U

#ifdef CONFIG_QUOTA
/* Number of blocks needed for a quota update - we know that the structure was
 * allocated so we need to update only the data block. Zero when @sb is not
 * quota capable. */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((ext4_quota_capable(sb)) ? 1 : 0)
/* Number of blocks needed for quota insert/delete - we do some block writes
 * but inode, sb and group updates are done only once. Zero when @sb is not
 * quota capable. */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_INIT_REWRITE) : 0)

#define EXT4_QUOTA_DEL_BLOCKS(sb) ((ext4_quota_capable(sb)) ?\
                (DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
                 +3+DQUOT_DEL_REWRITE) : 0)
#else
/* Without CONFIG_QUOTA no quota credits are ever needed. */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
#endif
/* The per-quota figures above, scaled by EXT4_MAXQUOTAS. */
#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))

/*
 * Ext4 handle operation types -- for logging purposes.
 *
 * These are the values passed as the "type" argument of the
 * ext4_journal_start*() macros below. EXT4_HT_MAX is one past the last
 * real type.
 */
#define EXT4_HT_MISC             0
#define EXT4_HT_INODE            1
#define EXT4_HT_WRITE_PAGE       2
#define EXT4_HT_MAP_BLOCKS       3
#define EXT4_HT_DIR              4
#define EXT4_HT_TRUNCATE         5
#define EXT4_HT_QUOTA            6
#define EXT4_HT_RESIZE           7
#define EXT4_HT_MIGRATE          8
#define EXT4_HT_MOVE_EXTENTS     9
#define EXT4_HT_XATTR           10
#define EXT4_HT_EXT_CONVERT     11
#define EXT4_HT_MAX             12

/* Inode-dirtying helpers; their definitions are not in this header. */
int
ext4_mark_iloc_dirty(handle_t *handle,
                     struct inode *inode,
                     struct ext4_iloc *iloc);

/*
 * On success, we end up with an outstanding reference count against
 * iloc->bh.  This _must_ be cleaned up later.
 */

int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
                        struct ext4_iloc *iloc);

/* Wrapper that passes the caller's function name and line number along. */
#define ext4_mark_inode_dirty(__h, __i)                                        \
                __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__)
int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode,
                                const char *func, unsigned int line);

int ext4_expand_extra_isize(struct inode *inode,
                            unsigned int new_extra_isize,
                            struct ext4_iloc *iloc);
/*
 * Wrapper functions with which ext4 calls into JBD.
 *
 * Each __ext4_*() function takes the caller's location as its leading
 * @where / @line arguments; the macros further down fill those in with
 * __func__ and __LINE__.
 */
int __ext4_journal_get_write_access(const char *where, unsigned int line,
                                    handle_t *handle, struct super_block *sb,
                                    struct buffer_head *bh,
                                    enum ext4_journal_trigger_type trigger_type);

int __ext4_forget(const char *where, unsigned int line, handle_t *handle,
                  int is_metadata, struct inode *inode,
                  struct buffer_head *bh, ext4_fsblk_t blocknr);

int __ext4_journal_get_create_access(const char *where, unsigned int line,
                                handle_t *handle, struct super_block *sb,
                                struct buffer_head *bh,
                                enum ext4_journal_trigger_type trigger_type);

int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 handle_t *handle, struct inode *inode,
                                 struct buffer_head *bh);

/* Call-site wrappers: supply __func__ / __LINE__ as @where / @line. */
#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \
                                        (bh), (trigger_type))
#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
        __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \
                      (bh), (block_nr))
#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \
        __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \
                                         (bh), (trigger_type))
#define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                     (bh))

/* Low-level start/stop entry points used by the ext4_journal_start*() and
 * ext4_journal_stop() macros below. */
handle_t *__ext4_journal_start_sb(struct inode *inode, struct super_block *sb,
                                  unsigned int line, int type, int blocks,
                                  int rsv_blocks, int revoke_creds);
int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);

/* Handle pointer values below this are treated as "no journal" by
 * ext4_handle_valid(). */
#define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)

/* Tell whether a properly allocated handle is backed by a journal.
 *
 * Note: do not use this for NULL handles.
 *
 * Returns 0 when the handle's pointer value is below
 * EXT4_NOJOURNAL_MAX_REF_COUNT, 1 otherwise. */
static inline int ext4_handle_valid(handle_t *handle)
{
        return (unsigned long)handle >= EXT4_NOJOURNAL_MAX_REF_COUNT;
}

/* Set h_sync on @handle; a no-op for handles that are not journal-backed. */
static inline void ext4_handle_sync(handle_t *handle)
{
        if (!ext4_handle_valid(handle))
                return;

        handle->h_sync = 1;
}

/* Return is_handle_aborted() for a journal-backed @handle, 0 otherwise. */
static inline int ext4_handle_is_aborted(handle_t *handle)
{
        return ext4_handle_valid(handle) ? is_handle_aborted(handle) : 0;
}

/*
 * Revoke credits to budget for freeing @blocks metadata blocks on @sb:
 * @blocks multiplied by the filesystem's s_cluster_ratio.
 */
static inline int ext4_free_metadata_revoke_credits(struct super_block *sb,
                                                    int blocks)
{
        /* Freeing each metadata block can result in freeing one cluster */
        return blocks * EXT4_SB(sb)->s_cluster_ratio;
}

/*
 * Default revoke credit count used by the ext4_journal_start*() macros:
 * the figure for freeing 8 metadata blocks on @sb.
 */
static inline int ext4_trans_default_revoke_credits(struct super_block *sb)
{
        return ext4_free_metadata_revoke_credits(sb, 8);
}

/*
 * Handle-start wrappers. All of them record the call site's __LINE__.
 *
 * ext4_journal_start_sb()           - by superblock, no inode, no reserve
 * ext4_journal_start()              - by inode, no reserve
 * ext4_journal_start_with_reserve() - by inode, with @rsv_blocks reserved
 * ext4_journal_start_with_revoke()  - by inode, caller-supplied revoke
 *                                     credits instead of the default
 */
#define ext4_journal_start_sb(sb, type, nblocks)                        \
        __ext4_journal_start_sb(NULL, (sb), __LINE__, (type), (nblocks), 0,\
                                ext4_trans_default_revoke_credits(sb))

#define ext4_journal_start(inode, type, nblocks)                        \
        __ext4_journal_start((inode), __LINE__, (type), (nblocks), 0,        \
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks)\
        __ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks),\
                             ext4_trans_default_revoke_credits((inode)->i_sb))

#define ext4_journal_start_with_revoke(inode, type, blocks, revoke_creds) \
        __ext4_journal_start((inode), __LINE__, (type), (blocks), 0,        \
                             (revoke_creds))

/*
 * Inode-based front end for __ext4_journal_start_sb(): passes @inode and
 * its i_sb along with the remaining arguments unchanged. @inode is
 * dereferenced, so it must not be NULL.
 */
static inline handle_t *__ext4_journal_start(struct inode *inode,
                                             unsigned int line, int type,
                                             int blocks, int rsv_blocks,
                                             int revoke_creds)
{
        return __ext4_journal_start_sb(inode, inode->i_sb, line, type, blocks,
                                       rsv_blocks, revoke_creds);
}

/* Stop @handle, recording the caller's __func__ and __LINE__. */
#define ext4_journal_stop(handle) \
        __ext4_journal_stop(__func__, __LINE__, (handle))

/* Call __ext4_journal_start_reserved(), recording the caller's __LINE__. */
#define ext4_journal_start_reserved(handle, type) \
        __ext4_journal_start_reserved((handle), __LINE__, (type))

handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
                                        int type);

/*
 * ext4 spelling of journal_current_handle(): returns whatever that helper
 * returns, with no additional checks.
 */
static inline handle_t *ext4_journal_current_handle(void)
{
        return journal_current_handle();
}

/*
 * Forward an extend request (@nblocks buffer credits, @revoke revoke
 * credits) for @handle to jbd2_journal_extend() and return its result.
 * When the handle is not valid nothing is done and 0 is returned.
 */
static inline int ext4_journal_extend(handle_t *handle, int nblocks, int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_extend(handle, nblocks, revoke);
}

/*
 * Restart @handle with @nblocks buffer credits and @revoke revoke credits
 * by calling jbd2__journal_restart() with GFP_NOFS, returning its result.
 * When the handle is not valid nothing is done and 0 is returned.
 */
static inline int ext4_journal_restart(handle_t *handle, int nblocks,
                                       int revoke)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2__journal_restart(handle, nblocks, revoke, GFP_NOFS);
}

/*
 * First step of ext4_journal_ensure_credits_fn(); defined elsewhere.
 * That macro treats a result <= 0 as final (error or nothing more to do)
 * and a positive result as "the transaction has to be restarted".
 */
int __ext4_journal_ensure_credits(handle_t *handle, int check_cred,
                                  int extend_cred, int revoke_cred);


/*
 * Ensure @handle has at least @check_cred credits available. If not,
 * transaction will be extended or restarted to contain at least @extend_cred
 * credits. Before restarting transaction @fn is executed to allow for cleanup
 * before the transaction is restarted.
 *
 * @revoke_cred is handed both to __ext4_journal_ensure_credits() and to
 * ext4_journal_restart(). @fn is an expression that is evaluated at most
 * once, and only when a restart is needed; if it yields a negative value
 * that value becomes the result and no restart is attempted. @fn is expanded
 * inside the scope of the local "err" declared below, so it must not refer
 * to a caller variable of that name. @handle, @extend_cred and @revoke_cred
 * may each be expanded twice.
 *
 * The return value is < 0 in case of error, 0 in case the handle has enough
 * credits or transaction extension succeeded, 1 in case transaction had to be
 * restarted.
 */
#define ext4_journal_ensure_credits_fn(handle, check_cred, extend_cred,        \
                                       revoke_cred, fn) \
({                                                                        \
        __label__ __ensure_end;                                                \
        int err = __ext4_journal_ensure_credits((handle), (check_cred),        \
                                        (extend_cred), (revoke_cred));        \
                                                                        \
        if (err <= 0)                                                        \
                goto __ensure_end;                                        \
        err = (fn);                                                        \
        if (err < 0)                                                        \
                goto __ensure_end;                                        \
        err = ext4_journal_restart((handle), (extend_cred), (revoke_cred)); \
        if (err == 0)                                                        \
                err = 1;                                                \
__ensure_end:                                                                \
        err;                                                                \
})

/*
 * Ensure given handle has at least requested amount of credits available,
 * possibly restarting transaction if needed. @credits is used both as the
 * amount to check for and as the amount to extend/restart with, and
 * @revoke_creds is passed through as the revoke credit request. No cleanup
 * step runs before a restart (the @fn argument is the constant 0).
 *
 * NOTE(review): this comment used to say the transaction is also given space
 * for at least ext4_trans_default_revoke_credits(sb) revoke records because
 * freeing one or two blocks is a very common pattern. Nothing in this wrapper
 * adds that default; it only forwards @revoke_creds. Confirm against
 * __ext4_journal_ensure_credits() and the callers.
 *
 * Return value is that of ext4_journal_ensure_credits_fn(): < 0 on error,
 * 0 if no restart was needed, 1 if the transaction was restarted.
 */
static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
                                              int revoke_creds)
{
        return ext4_journal_ensure_credits_fn(handle, credits, credits,
                                revoke_creds, 0);
}

/*
 * Return jbd2_journal_blocks_per_page() for @inode, or 0 when
 * EXT4_JOURNAL(@inode) is NULL.
 */
static inline int ext4_journal_blocks_per_page(struct inode *inode)
{
        if (EXT4_JOURNAL(inode) == NULL)
                return 0;

        return jbd2_journal_blocks_per_page(inode);
}

/*
 * Call jbd2_journal_force_commit() on @journal and return its result.
 * A NULL @journal is accepted and yields 0 without doing anything.
 */
static inline int ext4_journal_force_commit(journal_t *journal)
{
        if (!journal)
                return 0;

        return jbd2_journal_force_commit(journal);
}

/*
 * Hand the byte range [@start_byte, @start_byte + @length) of @inode to
 * jbd2_journal_inode_ranged_write() using the inode's jinode, and return
 * that call's result. With an invalid @handle this is a no-op returning 0.
 */
static inline int ext4_jbd2_inode_add_write(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_write(handle, EXT4_I(inode)->jinode,
                                               start_byte, length);
}

/*
 * Hand the byte range [@start_byte, @start_byte + @length) of @inode to
 * jbd2_journal_inode_ranged_wait() using the inode's jinode, and return
 * that call's result. With an invalid @handle this is a no-op returning 0.
 */
static inline int ext4_jbd2_inode_add_wait(handle_t *handle,
                struct inode *inode, loff_t start_byte, loff_t length)
{
        if (!ext4_handle_valid(handle))
                return 0;

        return jbd2_journal_inode_ranged_wait(handle, EXT4_I(inode)->jinode,
                                              start_byte, length);
}

/*
 * Record the tid of @handle's transaction in @inode's i_sync_tid and, when
 * @datasync is nonzero, in i_datasync_tid as well. Nothing is recorded if
 * the handle is not valid or has been aborted.
 */
static inline void ext4_update_inode_fsync_trans(handle_t *handle,
                                                 struct inode *inode,
                                                 int datasync)
{
        struct ext4_inode_info *ei = EXT4_I(inode);

        if (!ext4_handle_valid(handle) || is_handle_aborted(handle))
                return;

        ei->i_sync_tid = handle->h_transaction->t_tid;
        if (datasync)
                ei->i_datasync_tid = handle->h_transaction->t_tid;
}

/* super.c */
/* Declared here because ext4_journal_destroy() below calls it. */
int ext4_force_commit(struct super_block *sb);

/*
 * Ext4 inode journal modes
 *
 * These are distinct bit values; the ext4_should_*_data() helpers below
 * test the result of ext4_inode_journal_mode() against them with '&'.
 */
#define EXT4_INODE_JOURNAL_DATA_MODE        0x01 /* journal data mode */
#define EXT4_INODE_ORDERED_DATA_MODE        0x02 /* ordered data mode */
#define EXT4_INODE_WRITEBACK_DATA_MODE        0x04 /* writeback data mode */

/* Returns the journal mode of @inode, expressed with the bits above. */
int ext4_inode_journal_mode(struct inode *inode);

/* Nonzero when @inode's journal mode has the journal-data bit set. */
static inline int ext4_should_journal_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_JOURNAL_DATA_MODE;
}

/* Nonzero when @inode's journal mode has the ordered-data bit set. */
static inline int ext4_should_order_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_ORDERED_DATA_MODE;
}

/* Nonzero when @inode's journal mode has the writeback-data bit set. */
static inline int ext4_should_writeback_data(struct inode *inode)
{
        int mode = ext4_inode_journal_mode(inode);

        return mode & EXT4_INODE_WRITEBACK_DATA_MODE;
}

/*
 * Revoke credits to reserve for freeing @blocks data blocks of @inode.
 *
 * Returns 0 when the filesystem's DATA_FLAGS mount option equals
 * EXT4_MOUNT_JOURNAL_DATA, or when ext4_should_journal_data() is false for
 * the inode. Otherwise returns @blocks plus an allowance for two partial
 * clusters.
 */
static inline int ext4_free_data_revoke_credits(struct inode *inode, int blocks)
{
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
            !ext4_should_journal_data(inode))
                return 0;

        /*
         * The data blocks of one extent are contiguous, so only the partial
         * clusters at the two extent boundaries need extra accounting.
         */
        return blocks + 2*(EXT4_SB(inode->i_sb)->s_cluster_ratio - 1);
}

/*
 * Decide whether the dioread_nolock code paths may be used for @inode;
 * those paths make it safe to skip i_rwsem for direct I/O reads.
 *
 * Returns 1 only if all of the following hold, 0 otherwise:
 *  - the DIOREAD_NOLOCK mount option is set,
 *  - the inode is a regular file,
 *  - the inode is extent-based (EXT4_INODE_EXTENTS),
 *  - data journaling is not in effect for the inode (dioread_nolock uses
 *    b_private to pass information back to the I/O completion handler,
 *    which conflicts with the jbd's use of b_private),
 *  - the DELALLOC mount option is set (temporary fix to prevent
 *    generic/422 test failures).
 */
static inline int ext4_should_dioread_nolock(struct inode *inode)
{
        if (!test_opt(inode->i_sb, DIOREAD_NOLOCK) ||
            !S_ISREG(inode->i_mode) ||
            !(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ||
            ext4_should_journal_data(inode) ||
            !test_opt(inode->i_sb, DELALLOC))
                return 0;

        return 1;
}

/*
 * Tear down @journal for the filesystem described by @sbi.
 *
 * Pass journal explicitly as it may not be cached in the sbi->s_journal in some
 * cases
 *
 * Returns the result of jbd2_journal_destroy(). sbi->s_journal is set to
 * NULL regardless of that result. The return value of ext4_force_commit()
 * is not checked.
 */
static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal)
{
        int err = 0;

        /*
         * At this point only two things can be operating on the journal.
         * JBD2 thread performing transaction commit and s_sb_upd_work
         * issuing sb update through the journal. Once we set
         * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not
         * queue s_sb_upd_work and ext4_force_commit() makes sure any
         * ext4_handle_error() calls from the running transaction commit are
         * finished. Hence no new s_sb_upd_work can be queued after we
         * flush it here.
         */
        ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY);

        /* Order matters: flag first, then commit, then flush the work item. */
        ext4_force_commit(sbi->s_sb);
        flush_work(&sbi->s_sb_upd_work);

        err = jbd2_journal_destroy(journal);
        sbi->s_journal = NULL;

        return err;
}

#endif        /* _EXT4_JBD2_H */









































  131 
  132 





















   16 























   13 



    1 
    1 
    1 

    1 
    1 

   14 













   37 

   37 

   37 
   37 





   35 


   35 







    1 

   32 





   35 

   35 






    4 
   32 
   32 
   32 




   32 


























   61 





   26 






   16 

   37 

   35 


    5 
   57 




  179 




  151 
   60 
  179 






















   16 





   16 
   16 

   16 



   16 
    1 
   16 

   16 







   24 

   24 



   24 




   24 
    1 
   24 

   24 







   31 








   31 
    2 
   31 

   31 











































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
// SPDX-License-Identifier: GPL-2.0
/*
 *        linux/mm/mlock.c
 *
 *  (C) Copyright 1995 Linus Torvalds
 *  (C) Copyright 2002 Christoph Hellwig
 */

#include <linux/capability.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/sched/user.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/pagewalk.h>
#include <linux/mempolicy.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include <linux/export.h>
#include <linux/rmap.h>
#include <linux/mmzone.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>

#include "internal.h"

/*
 * Per-CPU batch of folios queued for mlock/munlock processing.
 * @lock:   local lock taken around every access to this CPU's @fbatch by
 *          mlock_drain_local(), mlock_folio(), mlock_new_folio() and
 *          munlock_folio().
 * @fbatch: the queued folio pointers; their low bits may carry the
 *          LRU_FOLIO / NEW_FOLIO flags until mlock_folio_batch() strips them.
 */
struct mlock_fbatch {
        local_lock_t lock;
        struct folio_batch fbatch;
};

/* One batch per CPU; only the lock needs an explicit initializer. */
static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

/*
 * Whether the current task may lock memory at all: true if its
 * RLIMIT_MEMLOCK is nonzero or, failing that, if it has CAP_IPC_LOCK.
 * The capability is only checked when the rlimit is zero.
 */
bool can_do_mlock(void)
{
        return rlimit(RLIMIT_MEMLOCK) != 0 || capable(CAP_IPC_LOCK);
}
EXPORT_SYMBOL(can_do_mlock);

/*
 * Mlocked folios are marked with the PG_mlocked flag for efficient testing
 * in vmscan and, possibly, the fault path; and to support semi-accurate
 * statistics.
 *
 * An mlocked folio [folio_test_mlocked(folio)] is unevictable.  As such, it
 * will be ostensibly placed on the LRU "unevictable" list (actually no such
 * list exists), rather than the [in]active lists. PG_unevictable is set to
 * indicate the unevictable state.
 */

/*
 * Process one batch entry that was queued with the LRU_FOLIO flag (see
 * mlock_folio()): try to move the folio to the unevictable state.
 *
 * @folio:  folio from the batch, flag bits already stripped.
 * @lruvec: lruvec lock currently held by the caller, or NULL if none.
 *
 * Returns the lruvec lock held on exit so the caller can carry it over to
 * the next folio; it is returned unchanged when the folio's LRU flag could
 * not be cleared.
 */
static struct lruvec *__mlock_folio(struct folio *folio, struct lruvec *lruvec)
{
        /* There is nothing more we can do while it's off LRU */
        if (!folio_test_clear_lru(folio))
                return lruvec;

        /* LRU flag is now clear; it is set again at "out". */
        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        if (unlikely(folio_evictable(folio))) {
                /*
                 * This is a little surprising, but quite possible: PG_mlocked
                 * must have got cleared already by another CPU.  Could this
                 * folio be unevictable?  I'm not sure, but move it now if so.
                 */
                if (folio_test_unevictable(folio)) {
                        lruvec_del_folio(lruvec, folio);
                        folio_clear_unevictable(folio);
                        lruvec_add_folio(lruvec, folio);

                        __count_vm_events(UNEVICTABLE_PGRESCUED,
                                          folio_nr_pages(folio));
                }
                goto out;
        }

        /* Already unevictable: only bump the count, and only if still mlocked. */
        if (folio_test_unevictable(folio)) {
                if (folio_test_mlocked(folio))
                        folio->mlock_count++;
                goto out;
        }

        /*
         * First transition to unevictable: delete and re-add around the flag
         * changes, and seed mlock_count with 1 if mlocked, else 0.
         */
        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_set_unevictable(folio);
        folio->mlock_count = !!folio_test_mlocked(folio);
        lruvec_add_folio(lruvec, folio);
        __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
        folio_set_lru(folio);
        return lruvec;
}

/*
 * Process one batch entry that was queued with the NEW_FOLIO flag (see
 * mlock_new_folio()): add a folio that is not yet on the LRU, marking it
 * unevictable first unless it turns out to be evictable.
 *
 * @folio:  folio from the batch, flag bits already stripped; must not have
 *          its LRU flag set (checked by VM_BUG_ON_FOLIO).
 * @lruvec: lruvec lock currently held by the caller, or NULL if none.
 *
 * Returns the lruvec lock held on exit.
 */
static struct lruvec *__mlock_new_folio(struct folio *folio, struct lruvec *lruvec)
{
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        /* As above, this is a little surprising, but possible */
        if (unlikely(folio_evictable(folio)))
                goto out;

        folio_set_unevictable(folio);
        /* Seed the count: 1 if the folio is mlocked, else 0. */
        folio->mlock_count = !!folio_test_mlocked(folio);
        __count_vm_events(UNEVICTABLE_PGCULLED, folio_nr_pages(folio));
out:
        /* Both paths add the folio to the lruvec and set its LRU flag. */
        lruvec_add_folio(lruvec, folio);
        folio_set_lru(folio);
        return lruvec;
}

/*
 * Process one batch entry that was queued without flag bits (see
 * munlock_folio()): drop one mlock from the folio and, if that was the
 * last one, clear its mlocked state and rescue it from the unevictable
 * state when possible.
 *
 * @folio:  folio from the batch.
 * @lruvec: lruvec lock currently held by the caller, or NULL if none.
 *
 * Returns the lruvec lock held on exit; unchanged if the folio's LRU flag
 * could not be cleared, in which case no lruvec manipulation is done.
 */
static struct lruvec *__munlock_folio(struct folio *folio, struct lruvec *lruvec)
{
        int nr_pages = folio_nr_pages(folio);
        /* True once we have cleared the LRU flag and hold the folio's lruvec. */
        bool isolated = false;

        if (!folio_test_clear_lru(folio))
                goto munlock;

        isolated = true;
        lruvec = folio_lruvec_relock_irq(folio, lruvec);

        if (folio_test_unevictable(folio)) {
                /* Then mlock_count is maintained, but might undercount */
                if (folio->mlock_count)
                        folio->mlock_count--;
                /* Still mlocked by someone else: leave the flags alone. */
                if (folio->mlock_count)
                        goto out;
        }
        /* else assume that was the last mlock: reclaim will fix it if not */

munlock:
        if (folio_test_clear_mlocked(folio)) {
                __zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
                /*
                 * Counted as stranded when we could not isolate the folio
                 * and it is still marked unevictable.
                 */
                if (isolated || !folio_test_unevictable(folio))
                        __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
                else
                        __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
        }

        /* folio_evictable() has to be checked *after* clearing Mlocked */
        if (isolated && folio_test_unevictable(folio) && folio_evictable(folio)) {
                lruvec_del_folio(lruvec, folio);
                folio_clear_unevictable(folio);
                lruvec_add_folio(lruvec, folio);
                __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        }
out:
        /* Only restore the LRU flag if this function cleared it. */
        if (isolated)
                folio_set_lru(folio);
        return lruvec;
}

/*
 * Tag values carried in the low bits of a struct folio pointer while it sits
 * on the mlock_fbatch.
 */
#define LRU_FOLIO 0x1
#define NEW_FOLIO 0x2

/* Return @folio's address with the LRU_FOLIO tag added to it. */
static inline struct folio *mlock_lru(struct folio *folio)
{
        unsigned long tagged = (unsigned long)folio;

        tagged += LRU_FOLIO;
        return (struct folio *)tagged;
}

/* Return @folio's address with the NEW_FOLIO tag added to it. */
static inline struct folio *mlock_new(struct folio *folio)
{
        unsigned long tagged = (unsigned long)folio;

        tagged += NEW_FOLIO;
        return (struct folio *)tagged;
}

/*
 * mlock_folio_batch() is derived from folio_batch_move_lru(): perhaps that can
 * make use of such folio pointer flags in future, but for now just keep it for
 * mlock.  We could use three separate folio batches instead, but one feels
 * better (munlocking a full folio batch does not need to drain mlocking folio
 * batches first).
 *
 * Processes every entry of @fbatch in order: the flag bits are stripped from
 * the stored pointer (and the clean pointer written back), then the entry is
 * dispatched to __mlock_folio(), __mlock_new_folio() or __munlock_folio().
 * The lruvec lock is carried from one entry to the next and released once at
 * the end, after which folios_put() is called on the batch.
 */
static void mlock_folio_batch(struct folio_batch *fbatch)
{
        struct lruvec *lruvec = NULL;
        unsigned long mlock;
        struct folio *folio;
        int i;

        for (i = 0; i < folio_batch_count(fbatch); i++) {
                folio = fbatch->folios[i];
                /* Split the tagged pointer into flag bits and real address. */
                mlock = (unsigned long)folio & (LRU_FOLIO | NEW_FOLIO);
                folio = (struct folio *)((unsigned long)folio - mlock);
                /* Store the untagged pointer back for folios_put() below. */
                fbatch->folios[i] = folio;

                if (mlock & LRU_FOLIO)
                        lruvec = __mlock_folio(folio, lruvec);
                else if (mlock & NEW_FOLIO)
                        lruvec = __mlock_new_folio(folio, lruvec);
                else
                        lruvec = __munlock_folio(folio, lruvec);
        }

        if (lruvec)
                unlock_page_lruvec_irq(lruvec);
        folios_put(fbatch);
}

/*
 * Process whatever is queued on the current CPU's mlock batch, under the
 * batch's local lock. Does nothing beyond lock/unlock if the batch is empty.
 */
void mlock_drain_local(void)
{
        struct folio_batch *batch;

        local_lock(&mlock_fbatch.lock);

        batch = this_cpu_ptr(&mlock_fbatch.fbatch);
        if (folio_batch_count(batch) != 0)
                mlock_folio_batch(batch);

        local_unlock(&mlock_fbatch.lock);
}

/*
 * Process whatever is queued on @cpu's mlock batch. No local lock is taken;
 * a one-time warning is raised if @cpu is still online.
 */
void mlock_drain_remote(int cpu)
{
        struct folio_batch *batch;

        WARN_ON_ONCE(cpu_online(cpu));

        batch = &per_cpu(mlock_fbatch.fbatch, cpu);
        if (folio_batch_count(batch) != 0)
                mlock_folio_batch(batch);
}

/* True if @cpu's mlock batch currently holds at least one folio. */
bool need_mlock_drain(int cpu)
{
        struct folio_batch *batch = &per_cpu(mlock_fbatch.fbatch, cpu);

        return folio_batch_count(batch) != 0;
}

/**
 * mlock_folio - mlock a folio already on (or temporarily off) LRU
 * @folio: folio to be mlocked.
 *
 * Sets the mlocked flag and, only if it was not already set, updates the
 * NR_MLOCK and UNEVICTABLE_PGMLOCKED statistics. The folio is then queued,
 * tagged LRU_FOLIO, on this CPU's mlock batch with an extra reference; the
 * LRU manipulation itself happens when the batch is processed.
 */
void mlock_folio(struct folio *folio)
{
        struct folio_batch *fbatch;

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);

        if (!folio_test_set_mlocked(folio)) {
                int nr_pages = folio_nr_pages(folio);

                zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
                __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
        }

        /* Reference held while queued; mlock_folio_batch() ends in folios_put(). */
        folio_get(folio);
        /*
         * Process the batch right away if adding left no room, if the folio
         * is large, or if the LRU cache is disabled.
         */
        if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
            folio_test_large(folio) || lru_cache_disabled())
                mlock_folio_batch(fbatch);
        local_unlock(&mlock_fbatch.lock);
}

/**
 * mlock_new_folio - mlock a newly allocated folio not yet on LRU
 * @folio: folio to be mlocked, either normal or a THP head.
 *
 * Unlike mlock_folio(), the mlocked flag is set and the NR_MLOCK and
 * UNEVICTABLE_PGMLOCKED statistics are updated unconditionally. The folio is
 * then queued, tagged NEW_FOLIO, on this CPU's mlock batch with an extra
 * reference.
 */
void mlock_new_folio(struct folio *folio)
{
        struct folio_batch *fbatch;
        int nr_pages = folio_nr_pages(folio);

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
        folio_set_mlocked(folio);

        zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
        __count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);

        /* Reference held while queued; mlock_folio_batch() ends in folios_put(). */
        folio_get(folio);
        /*
         * Process the batch right away if adding left no room, if the folio
         * is large, or if the LRU cache is disabled.
         */
        if (!folio_batch_add(fbatch, mlock_new(folio)) ||
            folio_test_large(folio) || lru_cache_disabled())
                mlock_folio_batch(fbatch);
        local_unlock(&mlock_fbatch.lock);
}

/**
 * munlock_folio - munlock a folio
 * @folio: folio to be munlocked, either normal or a THP head.
 *
 * Queues the folio, without any flag bits, on this CPU's mlock batch with an
 * extra reference. No folio flags or statistics are touched here; that is
 * done by __munlock_folio() when the batch is processed.
 */
void munlock_folio(struct folio *folio)
{
        struct folio_batch *fbatch;

        local_lock(&mlock_fbatch.lock);
        fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
        /*
         * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
         * which will check whether the folio is multiply mlocked.
         */
        folio_get(folio);
        /*
         * Process the batch right away if adding left no room, if the folio
         * is large, or if the LRU cache is disabled.
         */
        if (!folio_batch_add(fbatch, folio) ||
            folio_test_large(folio) || lru_cache_disabled())
                mlock_folio_batch(fbatch);
        local_unlock(&mlock_fbatch.lock);
}

/*
 * Number of consecutive PTEs, starting at @pte / @addr, that the caller's
 * loop may treat as one unit for @folio.
 *
 * Returns 1 for a small folio. For a large folio returns the result of
 * folio_pte_batch() limited to the pages between @addr and @end, with dirty
 * and soft-dirty differences between PTEs ignored.
 */
static inline unsigned int folio_mlock_step(struct folio *folio,
                pte_t *pte, unsigned long addr, unsigned long end)
{
        const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
        /* Upper bound: pages left in [addr, end). */
        unsigned int count = (end - addr) >> PAGE_SHIFT;
        pte_t ptent = ptep_get(pte);

        if (!folio_test_large(folio))
                return 1;

        return folio_pte_batch(folio, addr, pte, ptent, count, fpb_flags, NULL,
                               NULL, NULL);
}

/*
 * Decide whether mlock_pte_range() should mlock/munlock @folio.
 *
 * @folio: folio mapped at the current PTE.
 * @vma:   vma being walked.
 * @start: start of the range being walked.
 * @end:   end of the range being walked.
 * @step:  number of PTEs batched for this folio by folio_mlock_step().
 *
 * Returns true when the operation is allowed, false to skip the folio.
 */
static inline bool allow_mlock_munlock(struct folio *folio,
                struct vm_area_struct *vma, unsigned long start,
                unsigned long end, unsigned int step)
{
        /*
         * Munlock (vma not VM_LOCKED) is always allowed, even for a large
         * folio only partially mapped by the vma: the folio may have been
         * mlocked before the vma was split. Under memory pressure such a
         * large folio can be split, and its pages outside VM_LOCKED vmas
         * can then be reclaimed.
         *
         * Small folios are always allowed as well: folio_within_range()
         * cannot take KSM, but any small folio is OK.
         */
        if (!(vma->vm_flags & VM_LOCKED) || !folio_test_large(folio))
                return true;

        /*
         * Mlock of a large folio: it must lie within [start, end) and be
         * fully mapped (step covers every page), otherwise skip it.
         */
        return folio_within_range(folio, vma, start, end) &&
               step == folio_nr_pages(folio);
}

/*
 * Page-walk callback (installed as .pmd_entry by mlock_vma_pages_range())
 * that mlocks or munlocks the folios mapped under one pmd, depending on
 * whether walk->vma has VM_LOCKED set.
 *
 * @pmd:  pmd entry covering [@addr, @end).
 * @addr: start address of the range under this pmd.
 * @end:  end address of the range under this pmd.
 * @walk: page-walk state; walk->vma supplies the vma.
 *
 * Always returns 0. If the PTE table cannot be mapped and locked,
 * walk->action is set to ACTION_AGAIN before returning.
 */
static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
                           unsigned long end, struct mm_walk *walk)

{
        struct vm_area_struct *vma = walk->vma;
        spinlock_t *ptl;
        pte_t *start_pte, *pte;
        pte_t ptent;
        struct folio *folio;
        unsigned int step = 1;
        unsigned long start = addr;

        /* A non-NULL lock means the pmd is handled as a single huge entry. */
        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                if (!pmd_present(*pmd))
                        goto out;
                if (is_huge_zero_pmd(*pmd))
                        goto out;
                folio = pmd_folio(*pmd);
                if (folio_is_zone_device(folio))
                        goto out;
                if (vma->vm_flags & VM_LOCKED)
                        mlock_folio(folio);
                else
                        munlock_folio(folio);
                goto out;
        }

        start_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        if (!start_pte) {
                walk->action = ACTION_AGAIN;
                return 0;
        }

        for (pte = start_pte; addr != end; pte++, addr += PAGE_SIZE) {
                ptent = ptep_get(pte);
                if (!pte_present(ptent))
                        continue;
                folio = vm_normal_folio(vma, addr, ptent);
                if (!folio || folio_is_zone_device(folio))
                        continue;

                step = folio_mlock_step(folio, pte, addr, end);
                if (!allow_mlock_munlock(folio, vma, start, end, step))
                        goto next_entry;

                if (vma->vm_flags & VM_LOCKED)
                        mlock_folio(folio);
                else
                        munlock_folio(folio);

next_entry:
                /*
                 * Skip the rest of the batch; the loop increment accounts
                 * for the remaining one entry.
                 */
                pte += step - 1;
                addr += (step - 1) << PAGE_SHIFT;
        }
        pte_unmap(start_pte);
out:
        spin_unlock(ptl);
        cond_resched();
        return 0;
}

/*
 * mlock_vma_pages_range() - mlock any pages already in the range,
 *                           or munlock all pages in the range.
 * @vma - vma containing range to be mlock()ed or munlock()ed
 * @start - start address in @vma of the range
 * @end - end of range in @vma
 * @newflags - the new set of flags for @vma.
 *
 * Called for mlock(), mlock2() and mlockall(), to set @vma VM_LOCKED;
 * called for munlock() and munlockall(), to clear VM_LOCKED from @vma.
 *
 * The vma's flags are replaced with @newflags (temporarily including VM_IO
 * when VM_LOCKED is being set), the range is walked with mlock_pte_range()
 * between two lru_add_drain() calls, and finally VM_IO is dropped again if
 * it was added. The return value of walk_page_range() is not checked.
 */
static void mlock_vma_pages_range(struct vm_area_struct *vma,
        unsigned long start, unsigned long end, vm_flags_t newflags)
{
        static const struct mm_walk_ops mlock_walk_ops = {
                .pmd_entry = mlock_pte_range,
                .walk_lock = PGWALK_WRLOCK_VERIFY,
        };

        /*
         * There is a slight chance that concurrent page migration,
         * or page reclaim finding a page of this now-VM_LOCKED vma,
         * will call mlock_vma_folio() and raise page's mlock_count:
         * double counting, leaving the page unevictable indefinitely.
         * Communicate this danger to mlock_vma_folio() with VM_IO,
         * which is a VM_SPECIAL flag not allowed on VM_LOCKED vmas.
         * mmap_lock is held in write mode here, so this weird
         * combination should not be visible to other mmap_lock users;
         * but WRITE_ONCE so rmap walkers must see VM_IO if VM_LOCKED.
         */
        if (newflags & VM_LOCKED)
                newflags |= VM_IO;
        vma_start_write(vma);
        vm_flags_reset_once(vma, newflags);

        lru_add_drain();
        walk_page_range(vma->vm_mm, start, end, &mlock_walk_ops, NULL);
        lru_add_drain();

        /* Undo the temporary VM_IO marker now that the walk is complete. */
        if (newflags & VM_IO) {
                newflags &= ~VM_IO;
                vm_flags_reset_once(vma, newflags);
        }
}

/*
 * mlock_fixup  - handle mlock[all]/munlock[all] requests.
 *
 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
 * munlock is a no-op.  However, for some special vmas, we go ahead and
 * populate the ptes.
 *
 * For vmas that pass the filters, merge/split as appropriate.
 *
 * @vmi:      vma iterator positioned at @vma.
 * @vma:      vma to operate on.
 * @prev:     in: previous vma; out: set to the vma this call ended up with.
 * @start:    start of the range within @vma.
 * @end:      end of the range within @vma.
 * @newflags: flags the range should end up with.
 *
 * Returns 0 on success (including the filtered-out no-op cases) or the
 * error encoded in the pointer returned by vma_modify_flags(). On that
 * error path *@prev receives the error-encoded pointer; the caller visible
 * here, apply_vma_lock_flags(), returns immediately on error.
 */
static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
               struct vm_area_struct **prev, unsigned long start,
               unsigned long end, vm_flags_t newflags)
{
        struct mm_struct *mm = vma->vm_mm;
        int nr_pages;
        int ret = 0;
        vm_flags_t oldflags = vma->vm_flags;

        if (newflags == oldflags || (oldflags & VM_SPECIAL) ||
            is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
            vma_is_dax(vma) || vma_is_secretmem(vma) || (oldflags & VM_DROPPABLE))
                /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
                goto out;

        vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
        if (IS_ERR(vma)) {
                ret = PTR_ERR(vma);
                goto out;
        }

        /*
         * Keep track of amount of locked VM.
         *
         * Unlocking subtracts the range, locking an unlocked range adds it,
         * and a range that stays VM_LOCKED leaves the count unchanged.
         */
        nr_pages = (end - start) >> PAGE_SHIFT;
        if (!(newflags & VM_LOCKED))
                nr_pages = -nr_pages;
        else if (oldflags & VM_LOCKED)
                nr_pages = 0;
        mm->locked_vm += nr_pages;

        /*
         * vm_flags is protected by the mmap_lock held in write mode.
         * It's okay if try_to_unmap_one unmaps a page just after we
         * set VM_LOCKED, populate_vma_page_range will bring it back.
         */
        if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) {
                /* No work to do, and mlocking twice would be wrong */
                vma_start_write(vma);
                vm_flags_reset(vma, newflags);
        } else {
                mlock_vma_pages_range(vma, start, end, newflags);
        }
out:
        *prev = vma;
        return ret;
}

/*
 * Apply lock @flags to every vma of current->mm covering
 * [@start, @start + @len), replacing each vma's VM_LOCKED_MASK bits.
 *
 * @start: page-aligned start address (checked with VM_BUG_ON).
 * @len:   page-aligned length (checked with VM_BUG_ON).
 * @flags: the lock bits to set; pass 0 of those bits to unlock.
 *
 * Returns 0 on success or for an empty range, -EINVAL if @start + @len
 * wraps, -ENOMEM if there is no vma at @start, if the vmas in the range are
 * not contiguous, or if they end before the range does, or the error
 * returned by mlock_fixup().
 */
static int apply_vma_lock_flags(unsigned long start, size_t len,
                                vm_flags_t flags)
{
        unsigned long nstart, end, tmp;
        struct vm_area_struct *vma, *prev;
        VMA_ITERATOR(vmi, current->mm, start);

        VM_BUG_ON(offset_in_page(start));
        VM_BUG_ON(len != PAGE_ALIGN(len));
        end = start + len;
        if (end < start)
                return -EINVAL;
        if (end == start)
                return 0;
        vma = vma_iter_load(&vmi);
        if (!vma)
                return -ENOMEM;

        /* If the range starts inside the first vma, that vma is its own prev. */
        prev = vma_prev(&vmi);
        if (start > vma->vm_start)
                prev = vma;

        nstart = start;
        /* tmp tracks where the next vma must begin for the range to be gap-free. */
        tmp = vma->vm_start;
        for_each_vma_range(vmi, vma, end) {
                int error;
                vm_flags_t newflags;

                if (vma->vm_start != tmp)
                        return -ENOMEM;

                newflags = vma->vm_flags & ~VM_LOCKED_MASK;
                newflags |= flags;
                /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
                tmp = vma->vm_end;
                if (tmp > end)
                        tmp = end;
                error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
                if (error)
                        return error;
                /* Continue from where the iterator now ends. */
                tmp = vma_iter_end(&vmi);
                nstart = tmp;
        }

        /* The vmas ran out before the end of the requested range. */
        if (tmp < end)
                return -ENOMEM;

        return 0;
}

/*
 * Go through vma areas and sum size of mlocked
 * vma pages, as return value.
 * Note deferred memory locking case(mlock2(,,MLOCK_ONFAULT)
 * is also counted.
 * Return value: previously mlocked page counts
 *
 * @mm:    address space to scan.
 * @start: start of the range.
 * @len:   length of the range in bytes; the end is clamped to ULONG_MAX if
 *         @start + @len would overflow.
 *
 * Only vmas with VM_LOCKED set contribute, and only the part of each vma
 * that lies inside the range.
 */
static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
                unsigned long start, size_t len)
{
        struct vm_area_struct *vma;
        unsigned long count = 0;
        unsigned long end;
        VMA_ITERATOR(vmi, mm, start);

        /* Don't overflow past ULONG_MAX */
        if (unlikely(ULONG_MAX - len < start))
                end = ULONG_MAX;
        else
                end = start + len;

        for_each_vma_range(vmi, vma, end) {
                if (vma->vm_flags & VM_LOCKED) {
                        /*
                         * Discount the part of the vma before @start. count
                         * is unsigned, so this may wrap until the addition
                         * below brings it back.
                         */
                        if (start > vma->vm_start)
                                count -= (start - vma->vm_start);
                        /* Range ends inside this vma: count up to @end and stop. */
                        if (end < vma->vm_end) {
                                count += end - vma->vm_start;
                                break;
                        }
                        count += vma->vm_end - vma->vm_start;
                }
        }

        /* Bytes to pages. */
        return count >> PAGE_SHIFT;
}

/*
 * Translate a get_user_pages() style return value into the error that
 * POSIX mlock() is expected to report:
 *   -EFAULT becomes -ENOMEM,
 *   -ENOMEM becomes -EAGAIN,
 *   anything else is passed through unchanged.
 */
static int __mlock_posix_error_return(long retval)
{
        switch (retval) {
        case -EFAULT:
                return -ENOMEM;
        case -ENOMEM:
                return -EAGAIN;
        default:
                return retval;
        }
}

/*
 * do_mlock - common worker behind the mlock() and mlock2() system calls
 * @start: user address of the range; need not be page aligned
 * @len:   length of the range in bytes
 * @flags: VM_LOCKED, optionally combined with VM_LOCKONFAULT, as supplied by
 *         the syscall entry points
 *
 * Page-aligns the range, checks the request against RLIMIT_MEMLOCK (unless
 * the caller has CAP_IPC_LOCK), applies the lock flags to the covered VMAs
 * under the mmap write lock and finally calls __mm_populate() on the range.
 *
 * Return: 0 on success; -EPERM if can_do_mlock() refuses; -EINTR if the
 * killable mmap write lock was not acquired; -ENOMEM if the limit check
 * fails; otherwise the error from apply_vma_lock_flags(), or the error from
 * __mm_populate() translated by __mlock_posix_error_return().
 */
static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
{
        unsigned long locked;
        unsigned long lock_limit;
        /* Stays -ENOMEM if the resource-limit check below fails. */
        int error = -ENOMEM;

        start = untagged_addr(start);

        if (!can_do_mlock())
                return -EPERM;

        /*
         * Grow len by start's offset within its page and round it up to whole
         * pages, then mask start with PAGE_MASK so the range is page aligned.
         */
        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        /* Express both the limit and the request in pages. */
        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = len >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        /* Requested pages plus what this mm already has locked. */
        locked += current->mm->locked_vm;
        if ((locked > lock_limit) && (!capable(CAP_IPC_LOCK))) {
                /*
                 * It is possible that the regions requested intersect with
                 * previously mlocked areas, that part area in "mm->locked_vm"
                 * should not be counted to new mlock increment count. So check
                 * and adjust locked count if necessary.
                 */
                locked -= count_mm_mlocked_page_nr(current->mm,
                                start, len);
        }

        /* check against resource limits */
        if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
                error = apply_vma_lock_flags(start, len, flags);

        mmap_write_unlock(current->mm);
        if (error)
                return error;

        /* Runs after the mmap write lock has been dropped. */
        error = __mm_populate(start, len, 0);
        if (error)
                return __mlock_posix_error_return(error);
        return 0;
}

/*
 * mlock() system call entry point: forwards @start and @len to do_mlock()
 * with VM_LOCKED only (no VM_LOCKONFAULT) and returns its result.
 */
SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
{
        return do_mlock(start, len, VM_LOCKED);
}

/*
 * mlock2() system call entry point.
 *
 * MLOCK_ONFAULT is the only accepted bit in @flags; any other bit yields
 * -EINVAL.  The range is always locked with VM_LOCKED, and VM_LOCKONFAULT is
 * added when MLOCK_ONFAULT was requested.  The work itself is done by
 * do_mlock(), whose result is returned.
 */
SYSCALL_DEFINE3(mlock2, unsigned long, start, size_t, len, int, flags)
{
        if (flags & ~MLOCK_ONFAULT)
                return -EINVAL;

        return do_mlock(start, len,
                        (flags & MLOCK_ONFAULT) ?
                        (VM_LOCKED | VM_LOCKONFAULT) : VM_LOCKED);
}

/*
 * munlock() system call entry point.
 *
 * Page-aligns the range the same way do_mlock() does and, under the mmap
 * write lock, calls apply_vma_lock_flags() with no lock flags set.
 *
 * Returns -EINTR if the killable mmap write lock was not acquired, otherwise
 * the result of apply_vma_lock_flags().
 */
SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
{
        int err;

        start = untagged_addr(start);

        /* Cover start's in-page offset, round up to pages, align start down */
        len = PAGE_ALIGN(len + (offset_in_page(start)));
        start &= PAGE_MASK;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        err = apply_vma_lock_flags(start, len, 0);

        mmap_write_unlock(current->mm);
        return err;
}

/*
 * Take the MCL_* flags passed into mlockall (or 0 if called from munlockall)
 * and translate into the appropriate modifications to mm->def_flags and/or the
 * flags for all current VMAs.
 *
 * There are a couple of subtleties with this.  If mlockall() is called multiple
 * times with different flags, the values do not necessarily stack.  If mlockall
 * is called once including the MCL_FUTURE flag and then a second time without
 * it, VM_LOCKED and VM_LOCKONFAULT will be cleared from mm->def_flags.
 */
static int apply_mlockall_flags(int flags)
{
        VMA_ITERATOR(vmi, current->mm, 0);
        struct vm_area_struct *vma, *prev = NULL;
        vm_flags_t to_add = 0;

        current->mm->def_flags &= ~VM_LOCKED_MASK;
        if (flags & MCL_FUTURE) {
                current->mm->def_flags |= VM_LOCKED;

                if (flags & MCL_ONFAULT)
                        current->mm->def_flags |= VM_LOCKONFAULT;

                if (!(flags & MCL_CURRENT))
                        goto out;
        }

        if (flags & MCL_CURRENT) {
                to_add |= VM_LOCKED;
                if (flags & MCL_ONFAULT)
                        to_add |= VM_LOCKONFAULT;
        }

        for_each_vma(vmi, vma) {
                int error;
                vm_flags_t newflags;

                newflags = vma->vm_flags & ~VM_LOCKED_MASK;
                newflags |= to_add;

                error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end,
                                    newflags);
                /* Ignore errors, but prev needs fixing up. */
                if (error)
                        prev = vma;
                cond_resched();
        }
out:
        return 0;
}

/*
 * mlockall() system call entry point.
 *
 * @flags must be a non-empty combination of MCL_CURRENT, MCL_FUTURE and
 * MCL_ONFAULT, and MCL_ONFAULT may not be given alone; anything else is
 * -EINVAL.  When MCL_CURRENT is requested, the mm's total_vm must fit within
 * RLIMIT_MEMLOCK (in pages) unless the caller has CAP_IPC_LOCK, otherwise
 * -ENOMEM is returned.  After a successful MCL_CURRENT request the whole
 * address space up to TASK_SIZE is passed to mm_populate() once the mmap
 * write lock has been released.
 */
SYSCALL_DEFINE1(mlockall, int, flags)
{
        unsigned long lock_limit;
        int ret = -ENOMEM;

        if (!flags || flags == MCL_ONFAULT ||
            (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
                return -EINVAL;

        if (!can_do_mlock())
                return -EPERM;

        /* Limit expressed in pages, to compare against total_vm */
        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;

        if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
            capable(CAP_IPC_LOCK))
                ret = apply_mlockall_flags(flags);

        mmap_write_unlock(current->mm);

        if (ret == 0 && (flags & MCL_CURRENT))
                mm_populate(0, TASK_SIZE);

        return ret;
}

/*
 * munlockall() system call entry point: under the mmap write lock, calls
 * apply_mlockall_flags() with no MCL_* flags and returns its result.
 * Returns -EINTR if the killable mmap write lock was not acquired.
 */
SYSCALL_DEFINE0(munlockall)
{
        int ret;

        if (mmap_write_lock_killable(current->mm))
                return -EINTR;
        ret = apply_mlockall_flags(0);
        mmap_write_unlock(current->mm);
        return ret;
}

/*
 * Objects with different lifetime than processes (SHM_LOCK and SHM_HUGETLB
 * shm segments) are not charged to a process; user_shm_lock() and
 * user_shm_unlock() account them against a struct ucounts
 * (UCOUNT_RLIMIT_MEMLOCK) instead.  shmlock_user_lock is held around that
 * accounting in both functions.
 */
static DEFINE_SPINLOCK(shmlock_user_lock);

/*
 * user_shm_lock - charge a locked shm segment against a ucounts
 * @size:    size of the segment in bytes (rounded up to whole pages)
 * @ucounts: accounting object to charge
 *
 * Under shmlock_user_lock, adds the page count to UCOUNT_RLIMIT_MEMLOCK.  The
 * charge is undone and the request refused when the counter saturated at
 * LONG_MAX or exceeds RLIMIT_MEMLOCK while the caller lacks CAP_IPC_LOCK, or
 * when get_ucounts() fails.  On success a reference on @ucounts obtained via
 * get_ucounts() is kept; user_shm_unlock() drops it.
 *
 * Returns 1 if the charge was accepted, 0 otherwise.
 */
int user_shm_lock(size_t size, struct ucounts *ucounts)
{
        unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
        unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK);
        long memlock;
        int allowed = 0;

        /* RLIM_INFINITY is kept as is; any other limit is converted to pages */
        if (lock_limit != RLIM_INFINITY)
                lock_limit >>= PAGE_SHIFT;

        spin_lock(&shmlock_user_lock);
        memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);

        /*
         * Either failure undoes the charge.  get_ucounts() is only tried when
         * the limit check did not already refuse the request.
         */
        if (((memlock == LONG_MAX || memlock > lock_limit) &&
             !capable(CAP_IPC_LOCK)) ||
            !get_ucounts(ucounts))
                dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
        else
                allowed = 1;

        spin_unlock(&shmlock_user_lock);
        return allowed;
}

/*
 * user_shm_unlock - undo the accounting done by user_shm_lock()
 * @size:    size of the segment in bytes (rounded up to whole pages)
 * @ucounts: accounting object that was charged
 *
 * Subtracts the page count from UCOUNT_RLIMIT_MEMLOCK under shmlock_user_lock
 * and then drops a reference on @ucounts.
 */
void user_shm_unlock(size_t size, struct ucounts *ucounts)
{
        unsigned long nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        spin_lock(&shmlock_user_lock);
        dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, nr_pages);
        spin_unlock(&shmlock_user_lock);

        put_ucounts(ucounts);
}






























    1 


  573 








   11 


































    2 


















































  682 















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
/* SPDX-License-Identifier: GPL-2.0-only */

#ifndef __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H
#define __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H

#include <linux/bits.h>
#include <asm/barrier.h>

#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif

/*
 * Generic definitions for bit operations, should not be used in regular code
 * directly.
 */

/**
 * generic___set_bit - Set a bit in memory
 * @nr: the bit to set
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of set_bit(): the update is a plain
 * read-modify-write that may be reordered.  If it is called on the same
 * region of memory simultaneously, the effect may be that only one
 * operation succeeds.
 */
static __always_inline void
generic___set_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);

        *word |= BIT_MASK(nr);
}

/**
 * generic___clear_bit - Clear a bit in memory
 * @nr: the bit to clear
 * @addr: the address to start counting from
 *
 * Like generic___set_bit(), this is a plain, non-atomic read-modify-write.
 * If it is called on the same region of memory simultaneously, the effect
 * may be that only one operation succeeds.
 */
static __always_inline void
generic___clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);

        *word &= ~BIT_MASK(nr);
}

/**
 * generic___change_bit - Toggle a bit in memory
 * @nr: the bit to change
 * @addr: the address to start counting from
 *
 * Non-atomic counterpart of change_bit(): the update is a plain
 * read-modify-write that may be reordered.  If it is called on the same
 * region of memory simultaneously, the effect may be that only one
 * operation succeeds.
 */
static __always_inline void
generic___change_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);

        *word ^= BIT_MASK(nr);
}

/**
 * generic___test_and_set_bit - Set a bit and return its old value
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * Return: true if the bit was already set before the call.
 *
 * This operation is non-atomic and can be reordered.  If two instances of
 * this operation race, one can appear to succeed but actually fail.  You
 * must protect multiple accesses with a lock.
 */
static __always_inline bool
generic___test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);
        unsigned long bit = BIT_MASK(nr);
        unsigned long prev = *word;

        *word = prev | bit;
        return !!(prev & bit);
}

/**
 * generic___test_and_clear_bit - Clear a bit and return its old value
 * @nr: Bit to clear
 * @addr: Address to count from
 *
 * Return: true if the bit was set before the call.
 *
 * This operation is non-atomic and can be reordered.  If two instances of
 * this operation race, one can appear to succeed but actually fail.  You
 * must protect multiple accesses with a lock.
 */
static __always_inline bool
generic___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);
        unsigned long bit = BIT_MASK(nr);
        unsigned long prev = *word;

        *word = prev & ~bit;
        return !!(prev & bit);
}

/**
 * generic___test_and_change_bit - Toggle a bit and return its old value
 * @nr: Bit to toggle
 * @addr: Address to count from
 *
 * Return: true if the bit was set before the call.
 *
 * WARNING: non atomic and it can be reordered!
 */
static __always_inline bool
generic___test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);
        unsigned long bit = BIT_MASK(nr);
        unsigned long prev = *word;

        *word = prev ^ bit;
        return !!(prev & bit);
}

/**
 * generic_test_bit - Determine whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 *
 * Return: true if bit @nr is set.
 */
static __always_inline bool
generic_test_bit(unsigned long nr, const volatile unsigned long *addr)
{
        /*
         * Unlike the bitops with the '__' prefix above, this one *is* atomic,
         * so the word must be read through a `volatile`-qualified pointer
         * with no cast-aways. See `Documentation/atomic_bitops.txt` for the
         * details.
         */
        const volatile unsigned long *word = addr + BIT_WORD(nr);

        return (*word >> (nr & (BITS_PER_LONG-1))) & 1UL;
}

/**
 * generic_test_bit_acquire - Determine, with acquire semantics, whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 *
 * The word containing the bit is loaded with smp_load_acquire().
 *
 * Return: true if bit @nr is set.
 */
static __always_inline bool
generic_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
{
        unsigned long *word = (unsigned long *)addr + BIT_WORD(nr);
        unsigned long val = smp_load_acquire(word);

        return (val >> (nr & (BITS_PER_LONG-1))) & 1UL;
}

/*
 * const_*() definitions provide good compile-time optimizations when
 * the passed arguments can be resolved at compile time.
 *
 * Every operation listed here is a plain alias of its generic_*()
 * counterpart.  test_bit is the exception: const_test_bit() has its own
 * definition in this header instead of aliasing generic_test_bit().
 */
#define const___set_bit                        generic___set_bit
#define const___clear_bit                generic___clear_bit
#define const___change_bit                generic___change_bit
#define const___test_and_set_bit        generic___test_and_set_bit
#define const___test_and_clear_bit        generic___test_and_clear_bit
#define const___test_and_change_bit        generic___test_and_change_bit
#define const_test_bit_acquire                generic_test_bit_acquire

/**
 * const_test_bit - Determine whether a bit is set
 * @nr: bit number to test
 * @addr: Address to start counting from
 *
 * A version of generic_test_bit() which discards the `volatile` qualifier to
 * allow a compiler to optimize code harder. Non-atomic and to be called only
 * for testing compile-time constants, e.g. by the corresponding macros, not
 * directly from "regular" code.
 *
 * Return: true if bit @nr is set.
 */
static __always_inline bool
const_test_bit(unsigned long nr, const volatile unsigned long *addr)
{
        const unsigned long *word = (const unsigned long *)addr + BIT_WORD(nr);

        return (*word & BIT_MASK(nr)) != 0;
}

#endif /* __ASM_GENERIC_BITOPS_GENERIC_NON_ATOMIC_H */








































































































  317 





  317 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  963 





  964 






































































































































































































































  317 



  318 
  318 






  317 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *                      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *                       Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"

/* Forward declaration: called by __blkg_release() ahead of its definition. */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
 */
static DEFINE_MUTEX(blkcg_pol_register_mutex);
static DEFINE_MUTEX(blkcg_pol_mutex);

/* The root blkcg, exported for use by modules. */
struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);

/* Constant pointer to the css embedded in blkcg_root. */
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);

/*
 * Policy table; slot i corresponds to blkg->pd[i] (see blkg_free_workfn()).
 */
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];

static LIST_HEAD(all_blkcgs);                /* protected by blkcg_pol_mutex */

bool blkcg_debug_stats = false;

/*
 * NOTE(review): described in __blkg_release() as serializing blkg stat
 * updates; the users are outside this part of the file - confirm there.
 */
static DEFINE_RAW_SPINLOCK(blkg_stat_lock);

/* NOTE(review): presumably a per-batch cap used when destroying blkgs - confirm at the use site. */
#define BLKG_DESTROY_BATCH_SIZE  64

/*
 * Lockless lists for tracking IO stats update
 *
 * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
 * There are multiple blkg's (one for each block device) attached to each
 * blkcg. The rstat code keeps track of which cpu has IO stats updated,
 * but it doesn't know which blkg has the updated stats. If there are many
 * block devices in a system, the cost of iterating all the blkg's to flush
 * out the IO stats can be high. To reduce such overhead, a set of percpu
 * lockless lists (lhead) per blkcg are used to track the set of recently
 * updated iostat_cpu's since the last flush. An iostat_cpu will be put
 * onto the lockless list on the update side [blk_cgroup_bio_start()] if
 * not there yet and then removed when being flushed [blkcg_rstat_flush()].
 * References to blkg are gotten and then put back in the process to
 * protect against blkg removal.
 *
 * Return: 0 if successful or -ENOMEM if allocation fails.
 */
static int init_blkcg_llists(struct blkcg *blkcg)
{
        int cpu;

        blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
        if (!blkcg->lhead)
                return -ENOMEM;

        for_each_possible_cpu(cpu)
                init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
        return 0;
}

/**
 * blkcg_css - find the current css
 *
 * Prefer the css reported by kthread_blkcg(); when that is NULL, fall back to
 * the io controller css of the current task.
 * This may return a dying css, so it is up to the caller to use tryget logic
 * to confirm it is alive and well.
 */
static struct cgroup_subsys_state *blkcg_css(void)
{
        struct cgroup_subsys_state *kthread_css = kthread_blkcg();

        return kthread_css ? kthread_css : task_css(current, io_cgrp_id);
}

static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
{
        return pol && test_bit(pol->plid, q->blkcg_pols);
}

/*
 * blkg_free_workfn - work item performing the final teardown of a blkg
 * @work: the free_work member embedded in the blkg being freed
 *
 * Scheduled by blkg_free().  Under q->blkcg_mutex it hands every attached
 * policy data to that policy's pd_free_fn(), puts the parent blkg (if any)
 * and unlinks the blkg from q_node under q->queue_lock.  After dropping the
 * mutex it puts the request queue, frees the percpu iostat, exits the percpu
 * refcount and frees the blkg itself.
 */
static void blkg_free_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             free_work);
        struct request_queue *q = blkg->q;
        int i;

        /*
         * pd_free_fn() can also be called from blkcg_deactivate_policy(),
         * in order to make sure pd_free_fn() is called in order, the deletion
         * of the list blkg->q_node is delayed to here from blkg_destroy(), and
         * blkcg_mutex is used to synchronize blkg_free_workfn() and
         * blkcg_deactivate_policy().
         */
        mutex_lock(&q->blkcg_mutex);
        for (i = 0; i < BLKCG_MAX_POLS; i++)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
        if (blkg->parent)
                blkg_put(blkg->parent);
        spin_lock_irq(&q->queue_lock);
        list_del_init(&blkg->q_node);
        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        /* q was cached above, so it is still usable once blkg is torn down. */
        blk_put_queue(q);
        free_percpu(blkg->iostat_cpu);
        percpu_ref_exit(&blkg->refcnt);
        kfree(blkg);
}

/**
 * blkg_free - free a blkg
 * @blkg: blkg to free, may be NULL
 *
 * Free @blkg which may be partially allocated.  A NULL @blkg is a no-op.
 * The actual teardown runs asynchronously in blkg_free_workfn().
 */
static void blkg_free(struct blkcg_gq *blkg)
{
        if (blkg) {
                /*
                 * Both ->pd_free_fn() and request queue's release handler
                 * may sleep, so defer the freeing to a work item.
                 */
                INIT_WORK(&blkg->free_work, blkg_free_workfn);
                schedule_work(&blkg->free_work);
        }
}

static void __blkg_release(struct rcu_head *rcu)
{
        struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
        struct blkcg *blkcg = blkg->blkcg;
        int cpu;

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        WARN_ON(!bio_list_empty(&blkg->async_bios));
#endif
        /*
         * Flush all the non-empty percpu lockless lists before releasing
         * us, given these stat belongs to us.
         *
         * blkg_stat_lock is for serializing blkg stat update
         */
        for_each_possible_cpu(cpu)
                __blkcg_rstat_flush(blkcg, cpu);

        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
        blkg_free(blkg);
}

/*
 * blkg_release - percpu_ref release callback for a blkg's refcnt
 * @ref: the refcnt member embedded in the blkg
 *
 * Defers the actual release to __blkg_release() via call_rcu().
 *
 * A group is RCU protected, but having an rcu lock does not mean that one
 * can access all the fields of blkg and assume these are valid.  For
 * example, don't try to follow throtl_data and request queue links.
 *
 * Having a reference to blkg under an rcu allows accesses to only values
 * local to groups like group stats and group rate limits.
 */
static void blkg_release(struct percpu_ref *ref)
{
        struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt);

        call_rcu(&blkg->rcu_head, __blkg_release);
}

#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
static struct workqueue_struct *blkcg_punt_bio_wq;

static void blkg_async_bio_workfn(struct work_struct *work)
{
        struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
                                             async_bio_work);
        struct bio_list bios = BIO_EMPTY_LIST;
        struct bio *bio;
        struct blk_plug plug;
        bool need_plug = false;

        /* as long as there are pending bios, @blkg can't go away */
        spin_lock(&blkg->async_bio_lock);
        bio_list_merge_init(&bios, &blkg->async_bios);
        spin_unlock(&blkg->async_bio_lock);

        /* start plug only when bio_list contains at least 2 bios */
        if (bios.head && bios.head->bi_next) {
                need_plug = true;
                blk_start_plug(&plug);
        }
        while ((bio = bio_list_pop(&bios)))
                submit_bio(bio);
        if (need_plug)
                blk_finish_plug(&plug);
}

/*
 * When a shared kthread issues a bio for a cgroup, doing so synchronously can
 * lead to priority inversions as the kthread can be trapped waiting for that
 * cgroup.  Use this helper instead of submit_bio() to punt the actual issuing
 * to the work item embedded in the bio's blkg and avoid such inversions.
 *
 * Bios whose blkg has no parent (the root) are submitted directly.
 */
void blkcg_punt_bio_submit(struct bio *bio)
{
        struct blkcg_gq *blkg = bio->bi_blkg;

        /* never bounce for the root cgroup */
        if (!blkg->parent) {
                submit_bio(bio);
                return;
        }

        /* queue the bio on the blkg and kick its work item */
        spin_lock(&blkg->async_bio_lock);
        bio_list_add(&blkg->async_bios, bio);
        spin_unlock(&blkg->async_bio_lock);

        queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
}
EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);

/*
 * Create the workqueue used by blkcg_punt_bio_submit().
 * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
 */
static int __init blkcg_punt_bio_init(void)
{
        const unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE |
                                      WQ_UNBOUND | WQ_SYSFS;

        blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", wq_flags, 0);

        return blkcg_punt_bio_wq ? 0 : -ENOMEM;
}
subsys_initcall(blkcg_punt_bio_init);
#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */

/**
 * bio_blkcg_css - return the blkcg CSS associated with a bio
 * @bio: target bio, may be %NULL
 *
 * Return: the CSS of the blkcg that @bio's blkg belongs to, or %NULL when
 * @bio is %NULL or has no blkg attached.  Callers must either cope with
 * %NULL or know that the association has already been made.
 */
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
{
        if (bio && bio->bi_blkg)
                return &bio->bi_blkg->blkcg->css;

        return NULL;
}
EXPORT_SYMBOL_GPL(bio_blkcg_css);

/**
 * blkcg_parent - get the parent of a blkcg
 * @blkcg: blkcg of interest
 *
 * Return: the blkcg obtained by converting the parent of @blkcg's css with
 * css_to_blkcg().  Can be called anytime.
 */
static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
{
        struct cgroup_subsys_state *parent_css = blkcg->css.parent;

        return css_to_blkcg(parent_css);
}

/**
 * blkg_alloc - allocate a blkg
 * @blkcg: block cgroup the new blkg is associated with
 * @disk: gendisk the new blkg is associated with
 * @gfp_mask: allocation mask to use
 *
 * Allocate a new blkg associating @blkcg and @disk.  Besides the blkg itself
 * this initializes its percpu refcount (released through blkg_release()),
 * allocates the percpu iostat, gets @disk->queue via blk_get_queue() and
 * allocates policy data for every policy enabled on that queue.  The blkg is
 * not inserted into any tree or list here.
 *
 * Return: the new blkg, or %NULL if any step fails, in which case everything
 * acquired so far is undone.
 */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
                                   gfp_t gfp_mask)
{
        struct blkcg_gq *blkg;
        int i, cpu;

        /* alloc and init base part */
        blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
        if (!blkg)
                return NULL;
        if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
                goto out_free_blkg;
        blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
        if (!blkg->iostat_cpu)
                goto out_exit_refcnt;
        /* matched by blk_put_queue() below on failure */
        if (!blk_get_queue(disk->queue))
                goto out_free_iostat;

        blkg->q = disk->queue;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
        /* back-pointer used when the global iostat is found on a flush list */
        blkg->iostat.blkg = blkg;
#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
        spin_lock_init(&blkg->async_bio_lock);
        bio_list_init(&blkg->async_bios);
        INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
#endif

        /* init the stat sync state and give every percpu set its owner */
        u64_stats_init(&blkg->iostat.sync);
        for_each_possible_cpu(cpu) {
                u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
                per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
        }

        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];
                struct blkg_policy_data *pd;

                if (!blkcg_policy_enabled(disk->queue, pol))
                        continue;

                /* alloc per-policy data and attach it to blkg */
                pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
                if (!pd)
                        goto out_free_pds;
                blkg->pd[i] = pd;
                pd->blkg = blkg;
                pd->plid = i;
                pd->online = false;
        }

        return blkg;

        /* unwind in reverse order of acquisition */
out_free_pds:
        /* only slots below the failing index can hold policy data */
        while (--i >= 0)
                if (blkg->pd[i])
                        blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
        blk_put_queue(disk->queue);
out_free_iostat:
        free_percpu(blkg->iostat_cpu);
out_exit_refcnt:
        percpu_ref_exit(&blkg->refcnt);
out_free_blkg:
        kfree(blkg);
        return NULL;
}

/*
 * Create and link the blkg for the @blkcg - @disk pair.  Must be called with
 * @disk->queue->queue_lock held.
 *
 * If @new_blkg is %NULL, this function tries to allocate a new one as
 * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
 *
 * Returns the new blkg on success.  On failure an ERR_PTR() is returned:
 * -ENODEV if the queue is dying, @blkcg's css cannot be obtained or the
 * parent blkg is missing, -ENOMEM if the allocation fails, or the error
 * from radix_tree_insert().
 */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
                                    struct blkcg_gq *new_blkg)
{
        struct blkcg_gq *blkg;
        int i, ret;

        lockdep_assert_held(&disk->queue->queue_lock);

        /* request_queue is dying, do not create/recreate a blkg */
        if (blk_queue_dying(disk->queue)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
                ret = -ENODEV;
                goto err_free_blkg;
        }

        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto err_put_css;
                }
        }
        blkg = new_blkg;

        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -ENODEV;
                        goto err_put_css;
                }
                /* the child pins its parent blkg */
                blkg_get(blkg->parent);
        }

        /* invoke per-policy init */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && pol->pd_init_fn)
                        pol->pd_init_fn(blkg->pd[i]);
        }

        /* insert */
        spin_lock(&blkcg->lock);
        ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
        if (likely(!ret)) {
                hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
                list_add(&blkg->q_node, &disk->queue->blkg_list);

                /* bring the attached policy data online */
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];

                        if (blkg->pd[i]) {
                                if (pol->pd_online_fn)
                                        pol->pd_online_fn(blkg->pd[i]);
                                blkg->pd[i]->online = true;
                        }
                }
        }
        /* set whether or not the insertion succeeded */
        blkg->online = true;
        spin_unlock(&blkcg->lock);

        if (!ret)
                return blkg;

        /*
         * Insertion failed but @blkg is otherwise fully initialized, so use
         * the usual release path instead of the error labels below.
         */
        blkg_put(blkg);
        return ERR_PTR(ret);

err_put_css:
        css_put(&blkcg->css);
err_free_blkg:
        /* @new_blkg is consumed even on failure */
        if (new_blkg)
                blkg_free(new_blkg);
        return ERR_PTR(ret);
}

/**
 * blkg_lookup_create - lookup blkg, try to create one if not there
 * @blkcg: blkcg of interest
 * @disk: gendisk of interest
 *
 * Lookup blkg for the @blkcg - @disk pair.  If it doesn't exist, try to
 * create one.  blkg creation is performed recursively from blkcg_root such
 * that all non-root blkg's have access to the parent blkg.  This function
 * should be called under RCU read lock and takes @disk->queue->queue_lock.
 *
 * Returns the blkg or the closest blkg if blkg_create() fails as it walks
 * down from root.
 */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        unsigned long flags;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* fast path: the blkg already exists, queue_lock is not needed */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                return blkg;

        spin_lock_irqsave(&q->queue_lock, flags);
        /* look again now that queue_lock is held */
        blkg = blkg_lookup(blkcg, q);
        if (blkg) {
                /* refresh the lookup hint, except on the root blkcg */
                if (blkcg != &blkcg_root &&
                    blkg != rcu_dereference(blkcg->blkg_hint))
                        rcu_assign_pointer(blkcg->blkg_hint, blkg);
                goto found;
        }

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.  Returns the closest
         * blkg to the intended blkg should blkg_create() fail.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent = blkcg_parent(blkcg);
                struct blkcg_gq *ret_blkg = q->root_blkg;

                /*
                 * Walk up from @blkcg until an ancestor with a blkg is
                 * found; @pos ends up as the topmost blkcg still lacking one.
                 */
                while (parent) {
                        blkg = blkg_lookup(parent, q);
                        if (blkg) {
                                /* remember closest blkg */
                                ret_blkg = blkg;
                                break;
                        }
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                blkg = blkg_create(pos, disk, NULL);
                if (IS_ERR(blkg)) {
                        /* fall back to the closest existing ancestor blkg */
                        blkg = ret_blkg;
                        break;
                }
                /* done once the blkg for @blkcg itself has been created */
                if (pos == blkcg)
                        break;
        }

found:
        spin_unlock_irqrestore(&q->queue_lock, flags);
        return blkg;
}

/*
 * Take @blkg offline and unlink it from its blkcg.  Both @blkg->q->queue_lock
 * and @blkg->blkcg->lock must be held.
 *
 * Policy data is offlined, the blkg is removed from the blkcg's radix tree
 * and hlist, the lookup hint is cleared if it points here, and the refcount
 * is killed.  The blkg is not removed from the queue's list here.  Calling
 * this again on an already destroyed blkg is a no-op.
 */
static void blkg_destroy(struct blkcg_gq *blkg)
{
        struct blkcg *blkcg = blkg->blkcg;
        int i;

        lockdep_assert_held(&blkg->q->queue_lock);
        lockdep_assert_held(&blkcg->lock);

        /*
         * blkg stays on the queue list until blkg_free_workfn(), see details in
         * blkg_free_workfn(), hence this function can be called from
         * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
         * blkg_free_workfn().
         */
        if (hlist_unhashed(&blkg->blkcg_node))
                return;

        /* offline each online policy data; the flag is cleared first */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (blkg->pd[i] && blkg->pd[i]->online) {
                        blkg->pd[i]->online = false;
                        if (pol->pd_offline_fn)
                                pol->pd_offline_fn(blkg->pd[i]);
                }
        }

        blkg->online = false;

        /* unlink from the blkcg; this also makes repeat calls return early */
        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
        hlist_del_init_rcu(&blkg->blkcg_node);

        /*
         * Both setting lookup hint to and clearing it from @blkg are done
         * under queue_lock.  If it's not pointing to @blkg now, it never
         * will.  Hint assignment itself can race safely.
         */
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);

        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
         */
        percpu_ref_kill(&blkg->refcnt);
}

/*
 * Destroy every blkg still linked on @disk's queue.
 *
 * The queue's blkg list is walked under queue_lock and each blkg that is
 * still hashed on its blkcg is destroyed under that blkcg's lock.  After
 * BLKG_DESTROY_BATCH_SIZE destroyed blkgs the queue_lock is dropped, the task
 * may reschedule, and the walk restarts from the head of the list.  Once the
 * walk completes, all policy bits on the queue are cleared and
 * @q->root_blkg is reset to %NULL.
 */
static void blkg_destroy_all(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        int count = BLKG_DESTROY_BATCH_SIZE;
        int i;

restart:
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;

                /* already destroyed but not yet removed from the queue list */
                if (hlist_unhashed(&blkg->blkcg_node))
                        continue;

                spin_lock(&blkcg->lock);
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);

                /*
                 * in order to avoid holding the spin lock for too long, release
                 * it when a batch of blkgs are destroyed.
                 */
                if (!(--count)) {
                        count = BLKG_DESTROY_BATCH_SIZE;
                        spin_unlock_irq(&q->queue_lock);
                        cond_resched();
                        goto restart;
                }
        }

        /*
         * Mark policy deactivated since policy offline has been done, and
         * the free is scheduled, so future blkcg_deactivate_policy() can
         * be bypassed
         */
        for (i = 0; i < BLKCG_MAX_POLS; i++) {
                struct blkcg_policy *pol = blkcg_policy[i];

                if (pol)
                        __clear_bit(pol->plid, q->blkcg_pols);
        }

        q->root_blkg = NULL;
        spin_unlock_irq(&q->queue_lock);
}

/* Copy every byte and io counter of @src into @dst. */
static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] = src->bytes[idx];
                dst->ios[idx] = src->ios[idx];
                idx++;
        }
}

static void __blkg_clear_stat(struct blkg_iostat_set *bis)
{
        struct blkg_iostat cur = {0};
        unsigned long flags;

        flags = u64_stats_update_begin_irqsave(&bis->sync);
        blkg_iostat_set(&bis->cur, &cur);
        blkg_iostat_set(&bis->last, &cur);
        u64_stats_update_end_irqrestore(&bis->sync, flags);
}

static void blkg_clear_stat(struct blkcg_gq *blkg)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct blkg_iostat_set *s = per_cpu_ptr(blkg->iostat_cpu, cpu);

                __blkg_clear_stat(s);
        }
        __blkg_clear_stat(&blkg->iostat);
}

/*
 * Reset the stats of every blkg of the blkcg behind @css, including the
 * stats of each attached policy that provides ->pd_reset_stats_fn().
 * @val is unused.  Always returns 0.
 */
static int blkcg_reset_stats(struct cgroup_subsys_state *css,
                             struct cftype *cftype, u64 val)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
        int plid;

        pr_info_once("blkio.%s is deprecated\n", cftype->name);

        mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);

        /*
         * Stat reset is racy: it is not synchronized against stat updates.
         * This is a debug feature which shouldn't exist anyway.  If you get
         * hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                blkg_clear_stat(blkg);

                for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                        struct blkcg_policy *pol = blkcg_policy[plid];

                        /* pol is only dereferenced when policy data exists */
                        if (!blkg->pd[plid] || !pol->pd_reset_stats_fn)
                                continue;

                        pol->pd_reset_stats_fn(blkg->pd[plid]);
                }
        }

        spin_unlock_irq(&blkcg->lock);
        mutex_unlock(&blkcg_pol_mutex);

        return 0;
}

/*
 * Return the bdi device name of the disk behind @blkg's queue, or NULL when
 * the queue has no disk attached.
 */
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
        if (blkg->q->disk)
                return bdi_dev_name(blkg->q->disk->bdi);

        return NULL;
}

/**
 * blkcg_print_blkgs - helper for printing per-blkg data
 * @sf: seq_file to print to
 * @blkcg: blkcg of interest
 * @prfill: fill function to print out a blkg
 * @pol: policy in question
 * @data: data to be passed to @prfill
 * @show_total: to print out sum of prfill return values or not
 *
 * Walk the blkgs of @blkcg under RCU and, for each one whose queue has @pol
 * enabled, call @prfill with @sf, the blkg's policy data for @pol and @data
 * while holding that queue's lock.  If @show_total is %true, the sum of the
 * values returned by @prfill is printed with a "Total" label at the end.
 *
 * This is to be used to construct print functions for
 * cftype->read_seq_string method.
 */
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total)
{
        struct blkcg_gq *blkg;
        u64 sum = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                struct request_queue *q = blkg->q;

                spin_lock_irq(&q->queue_lock);
                if (blkcg_policy_enabled(q, pol))
                        sum += prfill(sf, blkg->pd[pol->plid], data);
                spin_unlock_irq(&q->queue_lock);
        }
        rcu_read_unlock();

        if (!show_total)
                return;

        seq_printf(sf, "Total %llu\n", (unsigned long long)sum);
}
EXPORT_SYMBOL_GPL(blkcg_print_blkgs);

/**
 * __blkg_prfill_u64 - prfill helper for a single u64 value
 * @sf: seq_file to print to
 * @pd: policy private data of interest
 * @v: value to print
 *
 * Print "<device name> <v>" to @sf for the device associated with @pd.
 *
 * Return: @v, or 0 without printing anything when the device has no name.
 */
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
        const char *dname = blkg_dev_name(pd->blkg);

        if (dname) {
                seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v);
                return v;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);

/**
 * blkg_conf_init - initialize a blkg_conf_ctx
 * @ctx: blkg_conf_ctx to initialize
 * @input: input string
 *
 * Initialize @ctx which can be used to parse blkg config input string @input.
 * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
 * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
 */
void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
        *ctx = (struct blkg_conf_ctx){ .input = input };
}
EXPORT_SYMBOL_GPL(blkg_conf_init);

/**
 * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from
 * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
 * set to point past the device node prefix.
 *
 * This function may be called multiple times on @ctx and the extra calls become
 * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
 * explicitly if bdev access is needed without resolving the blkcg / policy part
 * of @ctx->input.
 *
 * On success the bdev's queue rq_qos_mutex is left held; blkg_conf_exit()
 * releases it together with the bdev.
 *
 * Return: 0 on success, -EINVAL if the MAJ:MIN prefix is malformed or not
 * followed by whitespace, -ENODEV if the device does not exist, is a
 * partition, or its disk is not live.
 */
int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
        char *input = ctx->input;
        unsigned int major, minor;
        struct block_device *bdev;
        int key_len;

        /* already opened by an earlier call */
        if (ctx->bdev)
                return 0;

        if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
                return -EINVAL;

        /* the prefix must be separated from the body by whitespace */
        input += key_len;
        if (!isspace(*input))
                return -EINVAL;
        input = skip_spaces(input);

        bdev = blkdev_get_no_open(MKDEV(major, minor), false);
        if (!bdev)
                return -ENODEV;
        if (bdev_is_partition(bdev)) {
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        mutex_lock(&bdev->bd_queue->rq_qos_mutex);
        if (!disk_live(bdev->bd_disk)) {
                /*
                 * Unlock before dropping the bdev reference: the mutex is
                 * reached through @bdev, which must not be dereferenced once
                 * our reference is gone.  Same order as blkg_conf_exit().
                 */
                mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
                blkdev_put_no_open(bdev);
                return -ENODEV;
        }

        ctx->body = input;
        ctx->bdev = bdev;
        return 0;
}
/*
 * Similar to blkg_conf_open_bdev(), but additionally freezes the queue,
 * acquires q->elevator_lock, and ensures the correct locking order
 * between q->elevator_lock and q->rq_qos_mutex.
 *
 * @ctx must not have a bdev opened yet; -EINVAL is returned if it does.
 *
 * This function returns a negative error on failure. On success it returns
 * memflags which must be saved and later passed to blkg_conf_exit_frozen()
 * for restoring the memalloc scope.
 */
unsigned long __must_check blkg_conf_open_bdev_frozen(struct blkg_conf_ctx *ctx)
{
        struct request_queue *q;
        unsigned long memflags;
        int ret;

        if (ctx->bdev)
                return -EINVAL;

        ret = blkg_conf_open_bdev(ctx);
        if (ret < 0)
                return ret;

        q = ctx->bdev->bd_queue;

        /*
         * Nothing related to QoS is being protected yet, so drop the
         * q->rq_qos_mutex taken by blkg_conf_open_bdev().  It is re-acquired
         * below, after freezing the queue and taking q->elevator_lock, to
         * keep the correct locking order.
         */
        mutex_unlock(&q->rq_qos_mutex);

        memflags = blk_mq_freeze_queue(q);
        mutex_lock(&q->elevator_lock);
        mutex_lock(&q->rq_qos_mutex);

        return memflags;
}

/**
 * blkg_conf_prep - parse and prepare for per-blkg config update
 * @blkcg: target block cgroup
 * @pol: target policy
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Parse per-blkg config update from @ctx->input and initialize @ctx
 * accordingly. On success, @ctx->body points to the part of @ctx->input
 * following MAJ:MIN, @ctx->bdev points to the target block device and
 * @ctx->blkg to the blkg being configured.
 *
 * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
 * function returns with queue lock held and must be followed by
 * blkg_conf_exit().
 *
 * On failure the queue lock is not held and @ctx->blkg is left untouched,
 * but a bdev opened by blkg_conf_open_bdev() stays in @ctx.
 *
 * Return: 0 on success, -EOPNOTSUPP if @pol is not enabled on the queue,
 * -ENOMEM on allocation failure, or the error from blkg_conf_open_bdev(),
 * blk_queue_enter() or blkg_create().  -EBUSY is turned into a syscall
 * restart after a short sleep.
 */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   struct blkg_conf_ctx *ctx)
        __acquires(&bdev->bd_queue->queue_lock)
{
        struct gendisk *disk;
        struct request_queue *q;
        struct blkcg_gq *blkg;
        int ret;

        ret = blkg_conf_open_bdev(ctx);
        if (ret)
                return ret;

        disk = ctx->bdev->bd_disk;
        q = disk->queue;

        /*
         * blkcg_deactivate_policy() requires queue to be frozen, we can grab
         * q_usage_counter to prevent concurrent with blkcg_deactivate_policy().
         */
        ret = blk_queue_enter(q, 0);
        if (ret)
                goto fail;

        spin_lock_irq(&q->queue_lock);

        if (!blkcg_policy_enabled(q, pol)) {
                ret = -EOPNOTSUPP;
                goto fail_unlock;
        }

        /* fast path: the blkg already exists */
        blkg = blkg_lookup(blkcg, q);
        if (blkg)
                goto success;

        /*
         * Create blkgs walking down from blkcg_root to @blkcg, so that all
         * non-root blkgs have access to their parents.
         */
        while (true) {
                struct blkcg *pos = blkcg;
                struct blkcg *parent;
                struct blkcg_gq *new_blkg;

                /* find the topmost ancestor (or @blkcg) still lacking a blkg */
                parent = blkcg_parent(blkcg);
                while (parent && !blkg_lookup(parent, q)) {
                        pos = parent;
                        parent = blkcg_parent(parent);
                }

                /* Drop locks to do new blkg allocation with GFP_KERNEL. */
                spin_unlock_irq(&q->queue_lock);

                new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
                        goto fail_exit_queue;
                }

                if (radix_tree_preload(GFP_KERNEL)) {
                        blkg_free(new_blkg);
                        ret = -ENOMEM;
                        goto fail_exit_queue;
                }

                spin_lock_irq(&q->queue_lock);

                /* the lock was dropped above, so check the policy again */
                if (!blkcg_policy_enabled(q, pol)) {
                        blkg_free(new_blkg);
                        ret = -EOPNOTSUPP;
                        goto fail_preloaded;
                }

                /* a blkg for @pos may have appeared while unlocked */
                blkg = blkg_lookup(pos, q);
                if (blkg) {
                        blkg_free(new_blkg);
                } else {
                        /* blkg_create() consumes @new_blkg even on failure */
                        blkg = blkg_create(pos, disk, new_blkg);
                        if (IS_ERR(blkg)) {
                                ret = PTR_ERR(blkg);
                                goto fail_preloaded;
                        }
                }

                radix_tree_preload_end();

                if (pos == blkcg)
                        goto success;
        }
success:
        /* queue_lock stays held; blkg_conf_exit() releases it */
        blk_queue_exit(q);
        ctx->blkg = blkg;
        return 0;

fail_preloaded:
        radix_tree_preload_end();
fail_unlock:
        spin_unlock_irq(&q->queue_lock);
fail_exit_queue:
        blk_queue_exit(q);
fail:
        /*
         * If queue was bypassing, we should retry.  Do so after a
         * short msleep().  It isn't strictly necessary but queue
         * can be bypassing for some time and it's always nice to
         * avoid busy looping.
         */
        if (ret == -EBUSY) {
                msleep(10);
                ret = restart_syscall();
        }
        return ret;
}
EXPORT_SYMBOL_GPL(blkg_conf_prep);

/**
 * blkg_conf_exit - clean up per-blkg config update
 * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
 *
 * Undo whatever blkg_conf_open_bdev() and blkg_conf_prep() left in @ctx:
 * release the queue lock if a blkg is set, then unlock rq_qos_mutex and put
 * the bdev if one is open.  This function must be called on all
 * blkg_conf_ctx's initialized with blkg_conf_init().
 */
void blkg_conf_exit(struct blkg_conf_ctx *ctx)
        __releases(&ctx->bdev->bd_queue->queue_lock)
        __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
        struct block_device *bdev = ctx->bdev;

        /* a set blkg means blkg_conf_prep() left the queue lock held */
        if (ctx->blkg) {
                spin_unlock_irq(&bdev_get_queue(bdev)->queue_lock);
                ctx->blkg = NULL;
        }

        if (!bdev)
                return;

        mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
        blkdev_put_no_open(bdev);
        ctx->body = NULL;
        ctx->bdev = NULL;
}
EXPORT_SYMBOL_GPL(blkg_conf_exit);

/*
 * Counterpart of blkg_conf_open_bdev_frozen(): does what blkg_conf_exit()
 * does, then releases q->elevator_lock and unfreezes the queue using
 * @memflags.  Does nothing when @ctx has no bdev.
 */
void blkg_conf_exit_frozen(struct blkg_conf_ctx *ctx, unsigned long memflags)
{
        struct request_queue *q;

        if (!ctx->bdev)
                return;

        /* grab the queue first: blkg_conf_exit() clears ctx->bdev */
        q = ctx->bdev->bd_queue;

        blkg_conf_exit(ctx);
        mutex_unlock(&q->elevator_lock);
        blk_mq_unfreeze_queue(q, memflags);
}

/* Add every byte and io counter of @src onto the matching counter of @dst. */
static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] += src->bytes[idx];
                dst->ios[idx] += src->ios[idx];
                idx++;
        }
}

/* Subtract every byte and io counter of @src from the matching counter of @dst. */
static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
{
        int idx = 0;

        while (idx < BLKG_IOSTAT_NR) {
                dst->bytes[idx] -= src->bytes[idx];
                dst->ios[idx] -= src->ios[idx];
                idx++;
        }
}

static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
                                struct blkg_iostat *last)
{
        struct blkg_iostat delta;
        unsigned long flags;

        /* propagate percpu delta to global */
        flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
        blkg_iostat_set(&delta, cur);
        blkg_iostat_sub(&delta, last);
        blkg_iostat_add(&blkg->iostat.cur, &delta);
        blkg_iostat_add(last, &delta);
        u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}

/*
 * Flush the iostat sets queued on @blkcg's lockless list for @cpu.
 *
 * The whole list is detached with llist_del_all().  For each queued percpu
 * set, the difference since the last flush is folded into the owning blkg's
 * global iostat.  The blkg's global delta is then propagated to its parent
 * blkg, unless the parent has no parent itself, and the parent's global
 * iostat is queued on the parent blkcg's list for @cpu so a later flush of
 * that blkcg can carry it further up.  A queued entry that is a blkg's global
 * iostat (queued this way) only gets the propagate-up step.
 *
 * Runs under rcu_read_lock(); the update loop holds blkg_stat_lock.
 */
static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
{
        struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
        struct llist_node *lnode;
        struct blkg_iostat_set *bisc, *next_bisc;
        unsigned long flags;

        rcu_read_lock();

        /* take the whole list at once; nothing to do if it is empty */
        lnode = llist_del_all(lhead);
        if (!lnode)
                goto out;

        /*
         * For covering concurrent parent blkg update from blkg_release().
         *
         * When flushing from cgroup, cgroup_rstat_lock is always held, so
         * this lock won't cause contention most of time.
         */
        raw_spin_lock_irqsave(&blkg_stat_lock, flags);

        /*
         * Iterate only the iostat_cpu's queued in the lockless list.
         */
        llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
                struct blkcg_gq *blkg = bisc->blkg;
                struct blkcg_gq *parent = blkg->parent;
                struct blkg_iostat cur;
                unsigned int seq;

                /*
                 * Order assignment of `next_bisc` from `bisc->lnode.next` in
                 * llist_for_each_entry_safe and clearing `bisc->lqueued` for
                 * avoiding to assign `next_bisc` with new next pointer added
                 * in blk_cgroup_bio_start() in case of re-ordering.
                 *
                 * The pair barrier is implied in llist_add() in blk_cgroup_bio_start().
                 */
                smp_mb();

                WRITE_ONCE(bisc->lqueued, false);
                if (bisc == &blkg->iostat)
                        goto propagate_up; /* propagate up to parent only */

                /* fetch the current per-cpu values */
                do {
                        seq = u64_stats_fetch_begin(&bisc->sync);
                        blkg_iostat_set(&cur, &bisc->cur);
                } while (u64_stats_fetch_retry(&bisc->sync, seq));

                /* fold the percpu delta into the blkg's global iostat */
                blkcg_iostat_update(blkg, &cur, &bisc->last);

propagate_up:
                /* propagate global delta to parent (unless that's root) */
                if (parent && parent->parent) {
                        blkcg_iostat_update(parent, &blkg->iostat.cur,
                                            &blkg->iostat.last);
                        /*
                         * Queue parent->iostat to its blkcg's lockless
                         * list to propagate up to the grandparent if the
                         * iostat hasn't been queued yet.
                         */
                        if (!parent->iostat.lqueued) {
                                struct llist_head *plhead;

                                plhead = per_cpu_ptr(parent->blkcg->lhead, cpu);
                                llist_add(&parent->iostat.lnode, plhead);
                                parent->iostat.lqueued = true;
                        }
                }
        }
        raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
out:
        rcu_read_unlock();
}

/*
 * Flush the blkcg behind @css for @cpu.  A css whose cgroup has no parent is
 * skipped, since root-level stats are sourced from system-wide IO stats.
 */
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
        if (!cgroup_parent(css->cgroup))
                return;

        __blkcg_rstat_flush(css_to_blkcg(css), cpu);
}

/*
 * We source root cgroup stats from the system-wide stats to avoid
 * tracking the same information twice and incurring overhead when no
 * cgroups are defined. For that reason, cgroup_rstat_flush in
 * blkcg_print_stat does not actually fill out the iostat in the root
 * cgroup's blkcg_gq.
 *
 * However, we would like to re-use the printing code between the root and
 * non-root cgroups to the extent possible. For that reason, we simulate
 * flushing the root cgroup's stats by explicitly filling in the iostat
 * with disk level statistics.
 */
static void blkcg_fill_root_iostats(void)
{
        struct class_dev_iter iter;
        struct device *dev;

        class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
        while ((dev = class_dev_iter_next(&iter))) {
                struct block_device *bdev = dev_to_bdev(dev);
                struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
                struct blkg_iostat tmp;
                int cpu;
                unsigned long flags;

                memset(&tmp, 0, sizeof(tmp));
                for_each_possible_cpu(cpu) {
                        struct disk_stats *cpu_dkstats;

                        cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
                        tmp.ios[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->ios[STAT_READ];
                        tmp.ios[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->ios[STAT_WRITE];
                        tmp.ios[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->ios[STAT_DISCARD];
                        // convert sectors to bytes
                        tmp.bytes[BLKG_IOSTAT_READ] +=
                                cpu_dkstats->sectors[STAT_READ] << 9;
                        tmp.bytes[BLKG_IOSTAT_WRITE] +=
                                cpu_dkstats->sectors[STAT_WRITE] << 9;
                        tmp.bytes[BLKG_IOSTAT_DISCARD] +=
                                cpu_dkstats->sectors[STAT_DISCARD] << 9;
                }

                flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
                blkg_iostat_set(&blkg->iostat.cur, &tmp);
                u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
        }
        class_dev_iter_exit(&iter);
}

/*
 * Emit one line for @blkg into @s: the device name, the IO counters, the
 * optional debug delay values and whatever each attached policy's
 * pd_stat_fn adds.  Nothing is printed for an offline blkg or for one
 * whose device name is unavailable.
 */
static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
{
        struct blkg_iostat_set *bis = &blkg->iostat;
        u64 rbytes, wbytes, rios, wios, dbytes, dios;
        const char *dname;
        unsigned seq;
        int plid;

        if (!blkg->online)
                return;

        dname = blkg_dev_name(blkg);
        if (!dname)
                return;

        seq_printf(s, "%s ", dname);

        /* Snapshot the counters; go around again if the fetch must be retried. */
        for (;;) {
                seq = u64_stats_fetch_begin(&bis->sync);

                rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
                wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
                dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
                rios = bis->cur.ios[BLKG_IOSTAT_READ];
                wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
                dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];

                if (!u64_stats_fetch_retry(&bis->sync, seq))
                        break;
        }

        /* The counter fields are only shown once some read or write was seen. */
        if (rbytes || wbytes || rios || wios) {
                seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
                        rbytes, wbytes, rios, wios,
                        dbytes, dios);
        }

        if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
                seq_printf(s, " use_delay=%d delay_nsec=%llu",
                        atomic_read(&blkg->use_delay),
                        atomic64_read(&blkg->delay_nsec));
        }

        /* Let each policy that has data on this blkg append its own fields. */
        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];

                if (blkg->pd[plid] && pol->pd_stat_fn)
                        pol->pd_stat_fn(blkg->pd[plid], s);
        }

        seq_puts(s, "\n");
}

static int blkcg_print_stat(struct seq_file *sf, void *v)
{
        struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
        struct blkcg_gq *blkg;

        if (!seq_css(sf)->parent)
                blkcg_fill_root_iostats();
        else
                cgroup_rstat_flush(blkcg->css.cgroup);

        rcu_read_lock();
        hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
                spin_lock_irq(&blkg->q->queue_lock);
                blkcg_print_one_stat(blkg, sf);
                spin_unlock_irq(&blkg->q->queue_lock);
        }
        rcu_read_unlock();
        return 0;
}

/*
 * Interface files registered through io_cgrp_subsys.dfl_cftypes: "stat",
 * rendered by blkcg_print_stat().
 */
static struct cftype blkcg_files[] = {
        { .name = "stat", .seq_show = blkcg_print_stat },
        { }        /* terminate */
};

/*
 * Interface files registered through io_cgrp_subsys.legacy_cftypes:
 * "reset_stats", whose u64 writes are handled by blkcg_reset_stats().
 */
static struct cftype blkcg_legacy_files[] = {
        { .name = "reset_stats", .write_u64 = blkcg_reset_stats },
        { }        /* terminate */
};

#ifdef CONFIG_CGROUP_WRITEBACK
/* Return the address of the cgwb_list member of the blkcg that embeds @css. */
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);

        return &blkcg->cgwb_list;
}
#endif

/*
 * blkcg destruction is a three-stage process.
 *
 * 1. Destruction starts.  The blkcg_css_offline() callback is invoked
 *    which offlines writeback.  Here we tie the next stage of blkg destruction
 *    to the completion of writeback associated with the blkcg.  This lets us
 *    avoid punting potentially large amounts of outstanding writeback to root
 *    while maintaining any ongoing policies.  The next stage is triggered when
 *    the nr_cgwbs count goes to zero.
 *
 * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called
 *    and handles the destruction of blkgs.  Here the css reference held by
 *    the blkg is put back eventually allowing blkcg_css_free() to be called.
 *    This work may occur in cgwb_release_workfn() on the cgwb_release
 *    workqueue.  Any submitted ios that fail to get the blkg ref will be
 *    punted to the root_blkg.
 *
 * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called.
 *    This finally frees the blkcg.
 */

/**
 * blkcg_destroy_blkgs - responsible for shooting down blkgs
 * @blkcg: blkcg of interest
 *
 * blkgs should be removed while holding both q and blkcg locks.  As blkcg lock
 * is nested inside q lock, this function performs reverse double lock dancing.
 * Destroying the blkgs releases the reference held on the blkcg's css allowing
 * blkcg_css_free to eventually be called.
 *
 * This is the blkcg counterpart of ioc_release_fn().
 */
static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
        might_sleep();

        spin_lock_irq(&blkcg->lock);

        /* Always work on the current list head until the list is empty. */
        while (!hlist_empty(&blkcg->blkg_list)) {
                struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first,
                                                struct blkcg_gq, blkcg_node);
                struct request_queue *q = blkg->q;

                /*
                 * blkcg->lock is already held, so the queue lock (which
                 * normally nests outside of it) may only be try-locked here.
                 */
                if (need_resched() || !spin_trylock(&q->queue_lock)) {
                        /*
                         * Given that the system can accumulate a huge number
                         * of blkgs in pathological cases, check to see if we
                         * need to reschedule to avoid a softlockup.
                         */
                        spin_unlock_irq(&blkcg->lock);
                        cond_resched();
                        spin_lock_irq(&blkcg->lock);
                        /* Re-read the list head; it is looked up afresh each pass. */
                        continue;
                }

                blkg_destroy(blkg);
                /* plain unlock: blkcg->lock was taken with spin_lock_irq() and is still held */
                spin_unlock(&q->queue_lock);
        }

        spin_unlock_irq(&blkcg->lock);
}

/**
 * blkcg_pin_online - pin online state
 * @blkcg_css: blkcg of interest
 *
 * While pinned, a blkcg is kept online.  This is primarily used to
 * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
 * while an associated cgwb is still active.
 */
void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
{
        refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
}

/**
 * blkcg_unpin_online - unpin online state
 * @blkcg_css: blkcg of interest
 *
 * This is primarily used to impedance-match blkg and cgwb lifetimes so
 * that blkg doesn't go offline while an associated cgwb is still active.
 * When this count goes to zero, all active cgwbs have finished so the
 * blkcg can continue destruction by calling blkcg_destroy_blkgs().
 */
void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
{
        struct blkcg *blkcg = css_to_blkcg(blkcg_css);

        /*
         * Drop one pin on @blkcg.  Whenever that was the last pin, destroy
         * the blkgs of that blkcg and repeat the same step on its parent.
         */
        for (;;) {
                struct blkcg *parent;

                if (!refcount_dec_and_test(&blkcg->online_pin))
                        return;

                /* Look the parent up before the blkgs are torn down. */
                parent = blkcg_parent(blkcg);
                blkcg_destroy_blkgs(blkcg);

                if (!parent)
                        return;
                blkcg = parent;
        }
}

/**
 * blkcg_css_offline - cgroup css_offline callback
 * @css: css of interest
 *
 * This function is called when @css is about to go away.  Here the cgwbs are
 * offlined first and only once writeback associated with the blkcg has
 * finished do we start step 2 (see above).
 */
static void blkcg_css_offline(struct cgroup_subsys_state *css)
{
        /* this prevents anyone from attaching or migrating to this blkcg */
        wb_blkcg_offline(css);

        /*
         * put the base online pin allowing step 2 to be triggered; this is
         * the count of 1 that blkcg_css_alloc() stored in online_pin
         */
        blkcg_unpin_online(css);
}

/*
 * css_free callback: unlink the blkcg behind @css from all_blkcgs, release
 * its per-policy cpd data and finally free the blkcg itself.
 */
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        int plid;

        mutex_lock(&blkcg_pol_mutex);

        list_del(&blkcg->all_blkcgs_node);

        /* Hand every cpd that was installed back to its policy. */
        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                if (!blkcg->cpd[plid])
                        continue;
                blkcg_policy[plid]->cpd_free_fn(blkcg->cpd[plid]);
        }

        mutex_unlock(&blkcg_pol_mutex);

        free_percpu(blkcg->lhead);
        kfree(blkcg);
}

/*
 * css_alloc callback: set up a blkcg below @parent_css (or the static
 * blkcg_root when @parent_css is NULL) together with the cpd of every
 * registered policy that has a cpd_alloc_fn.  Returns the embedded css, or
 * ERR_PTR(-ENOMEM) if any allocation fails.
 */
static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct blkcg *blkcg;
        int plid;

        mutex_lock(&blkcg_pol_mutex);

        if (parent_css) {
                blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
                if (!blkcg)
                        goto unlock;
        } else {
                blkcg = &blkcg_root;
        }

        if (init_blkcg_llists(blkcg))
                goto free_blkcg;

        for (plid = 0; plid < BLKCG_MAX_POLS; plid++) {
                struct blkcg_policy *pol = blkcg_policy[plid];
                struct blkcg_policy_data *cpd;

                /*
                 * An empty slot means the policy is not attached yet, so
                 * there is nothing to do for it now.  A policy without
                 * cpd_alloc_fn needs no per-cgroup data either.
                 */
                if (!pol || !pol->cpd_alloc_fn)
                        continue;

                cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                if (!cpd)
                        goto free_pd_blkcg;

                cpd->blkcg = blkcg;
                cpd->plid = plid;
                blkcg->cpd[plid] = cpd;
        }

        spin_lock_init(&blkcg->lock);
        /* base pin; blkcg_css_offline() drops it via blkcg_unpin_online() */
        refcount_set(&blkcg->online_pin, 1);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&blkcg->cgwb_list);
#endif
        list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);

        mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;

free_pd_blkcg:
        /* Undo the cpds installed for the slots below the one that failed. */
        while (--plid >= 0) {
                if (blkcg->cpd[plid])
                        blkcg_policy[plid]->cpd_free_fn(blkcg->cpd[plid]);
        }
        free_percpu(blkcg->lhead);
free_blkcg:
        /* blkcg_root is static storage and must never be kfree()d. */
        if (blkcg != &blkcg_root)
                kfree(blkcg);
unlock:
        mutex_unlock(&blkcg_pol_mutex);
        return ERR_PTR(-ENOMEM);
}

/* css_online callback: pin the parent blkcg of @css, if any.  Returns 0. */
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg *parent = blkcg_parent(blkcg);

        if (!parent)
                return 0;

        /*
         * blkcg_pin_online() delays blkcg offline so that blkgs stay online
         * while cgwbs are still active on them.  Holding a pin on the parent
         * makes offlining always proceed towards the root.
         */
        blkcg_pin_online(&parent->css);
        return 0;
}

/* Initialize the blkcg-related members of @q: blkcg_mutex and blkg_list. */
void blkg_init_queue(struct request_queue *q)
{
        mutex_init(&q->blkcg_mutex);
        INIT_LIST_HEAD(&q->blkg_list);
}

/*
 * Create the root blkg of @disk and store it in the queue's root_blkg.
 * Returns 0 on success, -ENOMEM if the blkg cannot be allocated, or the
 * error reported by blkg_create().
 */
int blkcg_init_disk(struct gendisk *disk)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *new_blkg, *blkg;
        bool preloaded;
        int ret = 0;

        new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
        if (!new_blkg)
                return -ENOMEM;

        /* radix_tree_preload() returns 0 on success */
        preloaded = !radix_tree_preload(GFP_KERNEL);

        /*
         * Make sure the root blkg exists.  spin_lock_irq can serve as RCU
         * read-side critical section.
         */
        spin_lock_irq(&q->queue_lock);
        blkg = blkg_create(&blkcg_root, disk, new_blkg);
        if (IS_ERR(blkg))
                ret = PTR_ERR(blkg);
        else
                q->root_blkg = blkg;
        spin_unlock_irq(&q->queue_lock);

        if (preloaded)
                radix_tree_preload_end();

        return ret;
}

/*
 * Tear down the blkcg state of @disk: destroy all of its blkgs with
 * blkg_destroy_all(), then call blk_throtl_exit() for the disk.
 */
void blkcg_exit_disk(struct gendisk *disk)
{
        blkg_destroy_all(disk);
        blk_throtl_exit(disk);
}

/*
 * Subsystem exit hook: drop the disk reference that @tsk may still hold in
 * throttle_disk and clear the pointer.
 */
static void blkcg_exit(struct task_struct *tsk)
{
        struct gendisk *disk = tsk->throttle_disk;

        if (disk)
                put_disk(disk);
        tsk->throttle_disk = NULL;
}

/*
 * cgroup subsystem descriptor of the io controller.  It wires up the css
 * lifecycle callbacks, the rstat flush hook, the per-task exit hook and the
 * interface file tables defined in this file.
 */
struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_online = blkcg_css_online,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .css_rstat_flush = blkcg_rstat_flush,
        /* "stat" */
        .dfl_cftypes = blkcg_files,
        /* "reset_stats" */
        .legacy_cftypes = blkcg_legacy_files,
        .legacy_name = "blkio",
        .exit = blkcg_exit,
#ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
         * together on the default hierarchy so that the owner cgroup can
         * be retrieved from writeback pages.
         */
        .depends_on = 1 << memory_cgrp_id,
#endif
};
EXPORT_SYMBOL_GPL(io_cgrp_subsys);

/**
 * blkcg_activate_policy - activate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to activate
 *
 * Activate @pol on @disk.  Requires %GFP_KERNEL context.  @disk goes through
 * bypass mode to populate its blkgs with policy_data for @pol.
 *
 * Activation happens with @disk bypassed, so nobody would be accessing blkgs
 * from IO path.  Update of each blkg is protected by both queue and blkcg
 * locks so that holding either lock and testing blkcg_policy_enabled() is
 * always enough for dereferencing policy data.
 *
 * The caller is responsible for synchronizing [de]activations and policy
 * [un]registrations.  Returns 0 on success, -errno on failure.
 */
int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        /* pd allocated with GFP_KERNEL outside the lock, reserved for pinned_blkg */
        struct blkg_policy_data *pd_prealloc = NULL;
        /* blkg that pd_prealloc was allocated for; a reference is held on it */
        struct blkcg_gq *blkg, *pinned_blkg = NULL;
        /* only written and read when queue_is_mq(q) */
        unsigned int memflags;
        int ret;

        /* Already active on this queue: nothing to do. */
        if (blkcg_policy_enabled(q, pol))
                return 0;

        /*
         * Policy is allowed to be registered without pd_alloc_fn/pd_free_fn,
         * for example, ioprio. Such policy will work on blkcg level, not disk
         * level, and don't need to be activated.
         */
        if (WARN_ON_ONCE(!pol->pd_alloc_fn || !pol->pd_free_fn))
                return -EINVAL;

        if (queue_is_mq(q))
                memflags = blk_mq_freeze_queue(q);
retry:
        /*
         * Every retry restarts the walk from the beginning; blkgs that
         * already received their pd on an earlier pass are skipped below.
         */
        spin_lock_irq(&q->queue_lock);

        /* blkg_list is pushed at the head, reverse walk to initialize parents first */
        list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
                struct blkg_policy_data *pd;

                if (blkg->pd[pol->plid])
                        continue;

                /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
                if (blkg == pinned_blkg) {
                        pd = pd_prealloc;
                        pd_prealloc = NULL;
                } else {
                        pd = pol->pd_alloc_fn(disk, blkg->blkcg,
                                              GFP_NOWAIT | __GFP_NOWARN);
                }

                if (!pd) {
                        /*
                         * GFP_NOWAIT failed.  Free the existing one and
                         * prealloc for @blkg w/ GFP_KERNEL.
                         */
                        if (pinned_blkg)
                                blkg_put(pinned_blkg);
                        /*
                         * Take a reference before the queue lock is dropped;
                         * @blkg is dereferenced again below without the lock.
                         */
                        blkg_get(blkg);
                        pinned_blkg = blkg;

                        spin_unlock_irq(&q->queue_lock);

                        if (pd_prealloc)
                                pol->pd_free_fn(pd_prealloc);
                        pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
                                                       GFP_KERNEL);
                        if (pd_prealloc)
                                goto retry;
                        else
                                goto enomem;
                }

                /* Install and bring the pd online under the blkcg lock as well. */
                spin_lock(&blkg->blkcg->lock);

                pd->blkg = blkg;
                pd->plid = pol->plid;
                blkg->pd[pol->plid] = pd;

                if (pol->pd_init_fn)
                        pol->pd_init_fn(pd);

                if (pol->pd_online_fn)
                        pol->pd_online_fn(pd);
                pd->online = true;

                spin_unlock(&blkg->blkcg->lock);
        }

        /* All blkgs carry policy data now: mark the policy enabled on @q. */
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;

        spin_unlock_irq(&q->queue_lock);
out:
        /* Common exit for success and -ENOMEM: undo freeze, pin and prealloc. */
        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q, memflags);
        if (pinned_blkg)
                blkg_put(pinned_blkg);
        if (pd_prealloc)
                pol->pd_free_fn(pd_prealloc);
        return ret;

enomem:
        /* alloc failed, take down everything */
        spin_lock_irq(&q->queue_lock);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&blkcg->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        /* offline first (if it was onlined), then free */
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pd->online = false;
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }
        spin_unlock_irq(&q->queue_lock);
        ret = -ENOMEM;
        goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);

/**
 * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
 * @disk: gendisk of interest
 * @pol: blkcg policy to deactivate
 *
 * Deactivate @pol on @disk.  Follows the same synchronization rules as
 * blkcg_activate_policy().
 */
void blkcg_deactivate_policy(struct gendisk *disk,
                             const struct blkcg_policy *pol)
{
        struct request_queue *q = disk->queue;
        struct blkcg_gq *blkg;
        unsigned int memflags;

        /* Nothing to undo when the policy is not enabled on this queue. */
        if (!blkcg_policy_enabled(q, pol))
                return;

        if (queue_is_mq(q))
                memflags = blk_mq_freeze_queue(q);

        mutex_lock(&q->blkcg_mutex);
        spin_lock_irq(&q->queue_lock);

        __clear_bit(pol->plid, q->blkcg_pols);

        /* Offline and free the policy data of every blkg on the queue. */
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                struct blkcg *blkcg = blkg->blkcg;
                struct blkg_policy_data *pd;

                spin_lock(&blkcg->lock);
                pd = blkg->pd[pol->plid];
                if (pd) {
                        if (pd->online && pol->pd_offline_fn)
                                pol->pd_offline_fn(pd);
                        pol->pd_free_fn(pd);
                        blkg->pd[pol->plid] = NULL;
                }
                spin_unlock(&blkcg->lock);
        }

        spin_unlock_irq(&q->queue_lock);
        mutex_unlock(&q->blkcg_mutex);

        if (queue_is_mq(q))
                blk_mq_unfreeze_queue(q, memflags);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);

/*
 * Free @pol's cpd on every blkcg linked on all_blkcgs and clear the slot.
 * Both callers in this file check pol->cpd_free_fn and hold blkcg_pol_mutex.
 */
static void blkcg_free_all_cpd(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;

        list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                if (!blkcg->cpd[pol->plid])
                        continue;

                pol->cpd_free_fn(blkcg->cpd[pol->plid]);
                blkcg->cpd[pol->plid] = NULL;
        }
}

/**
 * blkcg_policy_register - register a blkcg policy
 * @pol: blkcg policy to register
 *
 * Register @pol with blkcg core.  Might sleep and @pol may be modified on
 * successful registration.  Returns 0 on success and -errno on failure.
 */
int blkcg_policy_register(struct blkcg_policy *pol)
{
        struct blkcg *blkcg;
        int slot, ret;

        /*
         * cpd_alloc_fn/cpd_free_fn and pd_alloc_fn/pd_free_fn must each be
         * supplied as a pair.  A policy without pd_alloc_fn/pd_free_fn can't
         * be activated.
         */
        if ((!pol->cpd_alloc_fn != !pol->cpd_free_fn) ||
            (!pol->pd_alloc_fn != !pol->pd_free_fn))
                return -EINVAL;

        mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);

        /* look for the first unused policy slot */
        slot = 0;
        while (slot < BLKCG_MAX_POLS && blkcg_policy[slot])
                slot++;
        if (slot >= BLKCG_MAX_POLS) {
                pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n");
                ret = -ENOSPC;
                goto err_unlock;
        }

        /* claim the slot for @pol */
        pol->plid = slot;
        blkcg_policy[pol->plid] = pol;

        /* give every existing blkcg its cpd for the new policy */
        if (pol->cpd_alloc_fn) {
                list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
                        struct blkcg_policy_data *cpd;

                        cpd = pol->cpd_alloc_fn(GFP_KERNEL);
                        if (!cpd) {
                                ret = -ENOMEM;
                                goto err_free_cpds;
                        }

                        cpd->blkcg = blkcg;
                        cpd->plid = pol->plid;
                        blkcg->cpd[pol->plid] = cpd;
                }
        }

        mutex_unlock(&blkcg_pol_mutex);

        /* the policy is fully set up; now expose its interface files */
        if (pol->dfl_cftypes == pol->legacy_cftypes) {
                WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys,
                                           pol->dfl_cftypes));
        } else {
                WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
                                               pol->dfl_cftypes));
                WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
                                                  pol->legacy_cftypes));
        }
        mutex_unlock(&blkcg_pol_register_mutex);
        return 0;

err_free_cpds:
        /* roll back the cpds installed so far and release the slot */
        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        blkcg_policy[pol->plid] = NULL;
err_unlock:
        mutex_unlock(&blkcg_pol_mutex);
        mutex_unlock(&blkcg_pol_register_mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(blkcg_policy_register);

/**
 * blkcg_policy_unregister - unregister a blkcg policy
 * @pol: blkcg policy to unregister
 *
 * Undo blkcg_policy_register(@pol).  Might sleep.
 */
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
        mutex_lock(&blkcg_pol_register_mutex);

        /* Warn and bail out if @pol is not what is registered in its slot. */
        if (WARN_ON(blkcg_policy[pol->plid] != pol)) {
                mutex_unlock(&blkcg_pol_register_mutex);
                return;
        }

        /* the interface files go away first */
        if (pol->dfl_cftypes)
                cgroup_rm_cftypes(pol->dfl_cftypes);
        if (pol->legacy_cftypes)
                cgroup_rm_cftypes(pol->legacy_cftypes);

        /* then drop the cpds and release the policy slot */
        mutex_lock(&blkcg_pol_mutex);

        if (pol->cpd_free_fn)
                blkcg_free_all_cpd(pol);

        blkcg_policy[pol->plid] = NULL;

        mutex_unlock(&blkcg_pol_mutex);
        mutex_unlock(&blkcg_pol_register_mutex);
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);

/*
 * Scale the accumulated delay based on how long it has been since we updated
 * the delay.  We only call this when we are adding delay, in case it's been a
 * while since we added delay, and when we are checking to see if we need to
 * delay a task, to account for any delays that may have occurred.
 */
static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
        /* start of the current delay window, as last recorded */
        u64 old = atomic64_read(&blkg->delay_start);

        /* negative use_delay means no scaling, see blkcg_set_delay() */
        if (atomic_read(&blkg->use_delay) < 0)
                return;

        /*
         * We only want to scale down every second.  The idea here is that we
         * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
         * time window.  We only want to throttle tasks for recent delay that
         * has occurred, in 1 second time windows since that's the maximum
         * things can be throttled.  We save the current delay window in
         * blkg->last_delay so we know what amount is still left to be charged
         * to the blkg from this point onward.  blkg->last_use keeps track of
         * the use_delay counter.  The idea is if we're unthrottling the blkg we
         * are ok with whatever is happening now, and we can take away more of
         * the accumulated delay as we've already throttled enough that
         * everybody is happy with their IO latencies.
         *
         * The cmpxchg only succeeds if delay_start still holds the value read
         * above, so a caller that lost the race to move the window skips the
         * scaling.
         */
        if (time_before64(old + NSEC_PER_SEC, now) &&
            atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
                u64 cur = atomic64_read(&blkg->delay_nsec);
                /* never subtract more than last_delay or the elapsed time */
                u64 sub = min_t(u64, blkg->last_delay, now - old);
                int cur_use = atomic_read(&blkg->use_delay);

                /*
                 * We've been unthrottled, subtract a larger chunk of our
                 * accumulated delay.
                 */
                if (cur_use < blkg->last_use)
                        sub = max_t(u64, sub, blkg->last_delay >> 1);

                /*
                 * This shouldn't happen, but handle it anyway.  Our delay_nsec
                 * should only ever be growing except here where we subtract out
                 * min(last_delay, 1 second), but lord knows bugs happen and I'd
                 * rather not end up with negative numbers.
                 */
                if (unlikely(cur < sub)) {
                        atomic64_set(&blkg->delay_nsec, 0);
                        blkg->last_delay = 0;
                } else {
                        atomic64_sub(sub, &blkg->delay_nsec);
                        blkg->last_delay = cur - sub;
                }
                /* remember use_delay so the next window can tell if it dropped */
                blkg->last_use = cur_use;
        }
}

/*
 * This is called when we want to actually walk up the hierarchy and check to
 * see if we need to throttle, and then actually throttle if there is some
 * accumulated delay.  This should only be called upon return to user space so
 * we're not holding some lock that would induce a priority inversion.
 */
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
        unsigned long pflags;
        /* only meaningful once delay_nsec has been set to a non-zero value */
        bool clamp;
        u64 now = blk_time_get_ns();
        u64 exp;
        u64 delay_nsec = 0;
        int tok;

        /*
         * Walk from @blkg towards the root (the blkg without a parent is not
         * examined) and remember the largest delay_nsec among the levels that
         * have use_delay set, along with whether that level's use_delay was
         * positive.
         */
        while (blkg->parent) {
                int use_delay = atomic_read(&blkg->use_delay);

                if (use_delay) {
                        u64 this_delay;

                        blkcg_scale_delay(blkg, now);
                        this_delay = atomic64_read(&blkg->delay_nsec);
                        if (this_delay > delay_nsec) {
                                delay_nsec = this_delay;
                                clamp = use_delay > 0;
                        }
                }
                blkg = blkg->parent;
        }

        /* No level had any delay to serve (clamp was never written either). */
        if (!delay_nsec)
                return;

        /*
         * Let's not sleep for all eternity if we've amassed a huge delay.
         * Swapping or metadata IO can accumulate 10's of seconds worth of
         * delay, and we want userspace to be able to do _something_ so cap the
         * delays at 0.25s. If there's 10's of seconds worth of delay then the
         * tasks will be delayed for 0.25 second for every syscall. If
         * blkcg_set_delay() was used as indicated by negative use_delay, the
         * caller is responsible for regulating the range.
         */
        if (clamp)
                delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);

        if (use_memdelay)
                psi_memstall_enter(&pflags);

        /* Sleep until the absolute expiry @exp, in killable state. */
        exp = ktime_add_ns(now, delay_nsec);
        tok = io_schedule_prepare();
        do {
                __set_current_state(TASK_KILLABLE);
                /* a zero return ends the wait; otherwise go back to sleep */
                if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS))
                        break;
        } while (!fatal_signal_pending(current));
        io_schedule_finish(tok);

        if (use_memdelay)
                psi_memstall_leave(&pflags);
}

/**
 * blkcg_maybe_throttle_current - throttle the current task if it has been marked
 *
 * This is only called if we've been marked with set_notify_resume().  Obviously
 * we can be set_notify_resume() for reasons other than blkcg throttling, so we
 * check to see if current->throttle_disk is set and if not this doesn't do
 * anything.  This should only ever be called by the resume code, it's not meant
 * to be called by people willy-nilly as it will actually do the work to
 * throttle the task if it is setup for throttling.
 */
void blkcg_maybe_throttle_current(void)
{
        struct gendisk *disk = current->throttle_disk;
        struct blkcg *blkcg;
        struct blkcg_gq *blkg;
        bool use_memdelay = current->use_memdelay;

        if (!disk)
                return;

        /*
         * Take over the request from the task.  From here on this function
         * owns the disk reference that blkcg_schedule_throttle() took when it
         * stored @disk in current->throttle_disk, and must drop it on every
         * path.
         */
        current->throttle_disk = NULL;
        current->use_memdelay = false;

        rcu_read_lock();
        blkcg = css_to_blkcg(blkcg_css());
        if (!blkcg)
                goto out;
        blkg = blkg_lookup(blkcg, disk->queue);
        if (!blkg)
                goto out;
        if (!blkg_tryget(blkg))
                goto out;
        rcu_read_unlock();

        blkcg_maybe_throttle_blkg(blkg, use_memdelay);
        blkg_put(blkg);
        put_disk(disk);
        return;
out:
        rcu_read_unlock();
        /*
         * Nothing to throttle, but the disk reference still has to go: the
         * pointer was already cleared above, so nobody else will put it.
         */
        put_disk(disk);
}

/**
 * blkcg_schedule_throttle - this task needs to check for throttling
 * @disk: disk to throttle
 * @use_memdelay: do we charge this to memory delay for PSI
 *
 * This is called by the IO controller when we know there's delay accumulated
 * for the blkg for this task.  We do not pass the blkg because there are places
 * we call this that may not have that information, the swapping code for
 * instance will only have a block_device at that point.  This sets the
 * notify_resume for the task to check and see if it requires throttling before
 * returning to user space.
 *
 * We will only schedule once per syscall.  You can call this over and over
 * again and it will only do the check once upon return to user space, and only
 * throttle once.  If the task needs to be throttled again it'll need to be
 * re-set at the next time we see the task.
 */
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
        struct gendisk *prev;

        /* Tasks with PF_KTHREAD set are never marked for throttling. */
        if (unlikely(current->flags & PF_KTHREAD))
                return;

        prev = current->throttle_disk;
        if (prev != disk) {
                /* Do not start tracking a disk that is flagged GD_DEAD. */
                if (test_bit(GD_DEAD, &disk->state))
                        return;
                get_device(disk_to_dev(disk));

                /* Swap in the new disk, releasing the one recorded before. */
                if (prev)
                        put_disk(prev);
                current->throttle_disk = disk;
        }

        /* use_memdelay is only ever raised here, never cleared. */
        if (use_memdelay)
                current->use_memdelay = true;
        set_notify_resume(current);
}

/**
 * blkcg_add_delay - add delay to this blkg
 * @blkg: blkg of interest
 * @now: the current time in nanoseconds
 * @delta: how many nanoseconds of delay to add
 *
 * Charge @delta to the blkg's current delay accumulation.  This is used to
 * throttle tasks if an IO controller thinks we need more throttling.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
        /* A negative use_delay is unexpected here: warn once and charge nothing. */
        if (!WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) {
                /* Rescale the accumulated delay before adding the new charge. */
                blkcg_scale_delay(blkg, now);
                atomic64_add(delta, &blkg->delay_nsec);
        }
}

/**
 * blkg_tryget_closest - try and get a blkg ref on the closest blkg
 * @bio: target bio
 * @css: target css
 *
 * As the failure mode here is to walk up the blkg tree, this ensures that the
 * blkg->parent pointers are always valid.  This returns the blkg that it ended
 * up taking a reference on or %NULL if no reference was taken.
 */
static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
                struct cgroup_subsys_state *css)
{
        struct blkcg_gq *pos, *found = NULL;

        rcu_read_lock();
        /*
         * Start at the blkg looked up (or created) for this css/disk pair and
         * climb towards the root until a reference can be taken.
         */
        for (pos = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk);
             pos; pos = pos->parent) {
                if (blkg_tryget(pos)) {
                        found = pos;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/**
 * bio_associate_blkg_from_css - associate a bio with a specified css
 * @bio: target bio
 * @css: target css
 *
 * Associate @bio with the blkg found by combining the css's blkg and the
 * request_queue of the @bio.  An association failure is handled by walking up
 * the blkg tree.  Therefore, the blkg associated can be anything between @blkg
 * and q->root_blkg.  This situation only happens when a cgroup is dying and
 * then the remaining bios will spill to the closest alive blkg.
 *
 * A reference will be taken on the blkg and will be released when @bio is
 * freed.
 */
void bio_associate_blkg_from_css(struct bio *bio,
                                 struct cgroup_subsys_state *css)
{
        struct blkcg_gq *root;

        /* Release whatever blkg the bio was associated with before. */
        if (bio->bi_blkg)
                blkg_put(bio->bi_blkg);

        /* A css with a parent is a non-root css: find the closest live blkg. */
        if (css && css->parent) {
                bio->bi_blkg = blkg_tryget_closest(bio, css);
                return;
        }

        /* No css, or a css without a parent: use the queue's root blkg. */
        root = bdev_get_queue(bio->bi_bdev)->root_blkg;
        blkg_get(root);
        bio->bi_blkg = root;
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);

/**
 * bio_associate_blkg - associate a bio with a blkg
 * @bio: target bio
 *
 * Associate @bio with the blkg found from the bio's css and request_queue.
 * If one is not found, bio_lookup_blkg() creates the blkg.  If a blkg is
 * already associated, the css is reused and association redone as the
 * request_queue may have changed.
 */
void bio_associate_blkg(struct bio *bio)
{
        struct cgroup_subsys_state *css;

        /* Passthrough operations are left without an association. */
        if (blk_op_is_passthrough(bio->bi_opf))
                return;

        rcu_read_lock();
        /* Reuse the css of an existing association, else take the current one. */
        css = bio->bi_blkg ? bio_blkcg_css(bio) : blkcg_css();
        bio_associate_blkg_from_css(bio, css);
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(bio_associate_blkg);

/**
 * bio_clone_blkg_association - clone blkg association from src to dst bio
 * @dst: destination bio
 * @src: source bio
 */
void bio_clone_blkg_association(struct bio *dst, struct bio *src)
{
        /* Nothing to clone when the source bio has no blkg. */
        if (!src->bi_blkg)
                return;

        bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
}
EXPORT_SYMBOL_GPL(bio_clone_blkg_association);

/* Map a bio's operation to its BLKG_IOSTAT_* index; discard is tested first. */
static int blk_cgroup_io_type(struct bio *bio)
{
        if (op_is_discard(bio->bi_opf))
                return BLKG_IOSTAT_DISCARD;

        return op_is_write(bio->bi_opf) ? BLKG_IOSTAT_WRITE : BLKG_IOSTAT_READ;
}

/*
 * blk_cgroup_bio_start - account a bio in the per-cpu iostat of its blkg
 * @bio: bio being started; bio->bi_blkg is dereferenced unconditionally
 *
 * Does nothing unless the io controller is on the default hierarchy and the
 * bio's blkcg has a parent cgroup.  Otherwise bumps the io count (and, the
 * first time the bio is seen, the byte count) for the bio's read/write/discard
 * class, queues the per-cpu stat on the blkcg's per-cpu lockless list if it is
 * not queued yet, and reports the update via cgroup_rstat_updated().
 */
void blk_cgroup_bio_start(struct bio *bio)
{
        struct blkcg *blkcg = bio->bi_blkg->blkcg;
        int rwd = blk_cgroup_io_type(bio), cpu;
        struct blkg_iostat_set *bis;
        unsigned long flags;

        if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
                return;

        /* Root-level stats are sourced from system-wide IO stats */
        if (!cgroup_parent(blkcg->css.cgroup))
                return;

        /* The per-cpu update below is bracketed by get_cpu()/put_cpu(). */
        cpu = get_cpu();
        bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
        flags = u64_stats_update_begin_irqsave(&bis->sync);

        /*
         * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
         * bio and we would have already accounted for the size of the bio.
         */
        if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
                bio_set_flag(bio, BIO_CGROUP_ACCT);
                bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
        }
        /* The io count is bumped on every call, flagged or not. */
        bis->cur.ios[rwd]++;

        /*
         * If the iostat_cpu isn't in a lockless list, put it into the
         * list to indicate that a stat update is pending.
         */
        if (!READ_ONCE(bis->lqueued)) {
                struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);

                llist_add(&bis->lnode, lhead);
                WRITE_ONCE(bis->lqueued, true);
        }

        u64_stats_update_end_irqrestore(&bis->sync, flags);
        cgroup_rstat_updated(blkcg->css.cgroup, cpu);
        put_cpu();
}

/*
 * Report whether the current blkcg or any of its ancestors has a non-zero
 * congestion_count.
 */
bool blk_cgroup_congested(void)
{
        struct blkcg *pos;
        bool congested = false;

        rcu_read_lock();
        pos = css_to_blkcg(blkcg_css());
        while (pos) {
                if (atomic_read(&pos->congestion_count)) {
                        congested = true;
                        break;
                }
                pos = blkcg_parent(pos);
        }
        rcu_read_unlock();

        return congested;
}

/* Expose blkcg_debug_stats as a bool module parameter with permissions 0644. */
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * descriptor table internals; you almost certainly want file.h instead.
 */

#ifndef __LINUX_FDTABLE_H
#define __LINUX_FDTABLE_H

#include <linux/posix_types.h>
#include <linux/compiler.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/nospec.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/fs.h>

#include <linux/atomic.h>

/*
 * The default fd array needs to be at least BITS_PER_LONG,
 * as this is the granularity returned by copy_fdset().
 */
#define NR_OPEN_DEFAULT BITS_PER_LONG

/*
 * One descriptor table: the array of file pointers plus the bitmaps that
 * describe it.  Reached through files_struct::fdt.
 */
struct fdtable {
        unsigned int max_fds;        /* bound used when indexing fd[] (see files_lookup_fd_raw()) */
        struct file __rcu **fd;      /* current fd array */
        unsigned long *close_on_exec;        /* bitmap tested per fd by close_on_exec() */
        unsigned long *open_fds;        /* presumably one bit per open fd - TODO confirm */
        unsigned long *full_fds_bits;        /* NOTE(review): looks like a summary bitmap over open_fds - confirm */
        struct rcu_head rcu;        /* presumably for RCU-deferred freeing - confirm */
};

/*
 * Open file table structure
 */
struct files_struct {
  /*
   * read mostly part
   */
        atomic_t count;
        bool resize_in_progress;
        wait_queue_head_t resize_wait;

        /* RCU-annotated pointer to the table in use; read via files_fdtable() */
        struct fdtable __rcu *fdt;
        /* embedded table - presumably what fdt points at initially; confirm */
        struct fdtable fdtab;
  /*
   * written part on a separate cache line in SMP
   */
        /* lock named in the lockdep checks of the fdtable accessors below */
        spinlock_t file_lock ____cacheline_aligned_in_smp;
        unsigned int next_fd;
        /*
         * NOTE(review): the *_init members and fd_array look like the inline
         * backing storage for fdtab's bitmaps and fd array - confirm.
         */
        unsigned long close_on_exec_init[1];
        unsigned long open_fds_init[1];
        unsigned long full_fds_bits_init[1];
        struct file __rcu * fd_array[NR_OPEN_DEFAULT];
};

struct file_operations;
struct vfsmount;
struct dentry;

/*
 * Dereference the RCU-annotated pointer @fdtfd; holding @files->file_lock is
 * passed to rcu_dereference_check() as the accepted lockdep condition.
 */
#define rcu_dereference_check_fdtable(files, fdtfd) \
        rcu_dereference_check((fdtfd), lockdep_is_held(&(files)->file_lock))

/* Fetch @files->fdt through rcu_dereference_check_fdtable(). */
#define files_fdtable(files) \
        rcu_dereference_check_fdtable((files), (files)->fdt)

/*
 * The caller must ensure that fd table isn't shared or hold rcu or file lock
 */
static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
{
        struct fdtable *table = rcu_dereference_raw(files->fdt);
        unsigned long ok = array_index_mask_nospec(fd, table->max_fds);
        struct file *slot;

        /*
         * 'ok' is all ones when fd is within max_fds and zero otherwise, so
         * 'fd & ok' indexes either the requested slot or slot 0.
         *
         * Reading slot 0 for an out-of-bounds fd is harmless, but its value
         * must not leak to the caller: masking the result with 'ok' turns it
         * into NULL in that case.
         */
        slot = rcu_dereference_raw(table->fd[fd & ok]);

        return (struct file *)((unsigned long)slot & ok);
}

/* Look up @fd in @files; warns via lockdep if files->file_lock is not held. */
static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
{
        struct file *file;

        RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
                           "suspicious rcu_dereference_check() usage");
        file = files_lookup_fd_raw(files, fd);

        return file;
}

static inline bool close_on_exec(unsigned int fd, const struct files_struct *files)
{
        return test_bit(fd, files_fdtable(files)->close_on_exec);
}

struct task_struct;

/* Prototypes only; the definitions are not part of this header. */
void put_files_struct(struct files_struct *fs);
int unshare_files(void);
/* A pair of fd numbers; presumably the bounds of a range - confirm inclusivity with dup_fd() */
struct fd_range {
        unsigned int from, to;
};
struct files_struct *dup_fd(struct files_struct *, struct fd_range *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
/* Takes a callback receiving (const void *, struct file *, unsigned) and returning int. */
int iterate_fd(struct files_struct *, unsigned,
                int (*)(const void *, struct file *, unsigned),
                const void *);

extern int close_fd(unsigned int fd);
extern struct file *file_close_fd(unsigned int fd);

extern struct kmem_cache *files_cachep;

#endif /* __LINUX_FDTABLE_H */

























    2 


  476 














































  275 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_LOCK_H_
#define _ASM_GENERIC_BITOPS_LOCK_H_

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <asm/barrier.h>

/**
 * arch_test_and_set_bit_lock - Set a bit and return its old value, for lock
 * @nr: Bit to set
 * @p: Address to count from
 *
 * This operation is atomic and provides acquire barrier semantics if
 * the returned value is 0.
 * It can be used to implement bit locks.
 */
static __always_inline int
arch_test_and_set_bit_lock(unsigned int nr, volatile unsigned long *p)
{
        volatile unsigned long *word = p + BIT_WORD(nr);
        unsigned long bit = BIT_MASK(nr);

        /* Bit already set: report that without issuing the atomic fetch-or. */
        if (READ_ONCE(*word) & bit)
                return 1;

        /* The fetch-or returns the old word; extract the bit's old value. */
        return (raw_atomic_long_fetch_or_acquire(bit, (atomic_long_t *)word) & bit) != 0;
}


/**
 * arch_clear_bit_unlock - Clear a bit in memory, for unlock
 * @nr: the bit to clear
 * @p: the address to start counting from
 *
 * This operation is atomic and provides release barrier semantics.
 */
static __always_inline void
arch_clear_bit_unlock(unsigned int nr, volatile unsigned long *p)
{
        p += BIT_WORD(nr);
        raw_atomic_long_fetch_andnot_release(BIT_MASK(nr), (atomic_long_t *)p);
}

/**
 * arch___clear_bit_unlock - Clear a bit in memory, for unlock
 * @nr: the bit to clear
 * @p: the address to start counting from
 *
 * A weaker form of clear_bit_unlock() as used by __bit_lock_unlock(). If all
 * the bits in the word are protected by this lock some archs can use weaker
 * ops to safely unlock.
 *
 * See for example x86's implementation.
 */
static inline void
arch___clear_bit_unlock(unsigned int nr, volatile unsigned long *p)
{
        volatile unsigned long *word = p + BIT_WORD(nr);
        unsigned long cleared;

        /* Plain read-modify, then publish the result with a release store. */
        cleared = READ_ONCE(*word) & ~BIT_MASK(nr);
        raw_atomic_long_set_release((atomic_long_t *)word, cleared);
}

#ifndef arch_xor_unlock_is_negative_byte
/*
 * XOR @mask into *@p with release ordering and report whether bit 7 was set
 * in the word before the XOR.
 */
static inline bool arch_xor_unlock_is_negative_byte(unsigned long mask,
                volatile unsigned long *p)
{
        long prev = raw_atomic_long_fetch_xor_release(mask, (atomic_long_t *)p);

        return (prev & BIT(7)) != 0;
}
#endif

#include <asm-generic/bitops/instrumented-lock.h>

#endif /* _ASM_GENERIC_BITOPS_LOCK_H_ */




































 1070 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/fault-inject.h>
#include <linux/fault-inject-usercopy.h>

/*
 * Fault-injection state for usercopy: a single fault_attr, started from
 * FAULT_ATTR_INITIALIZER and shared by the boot option, the debugfs entry
 * and should_fail_usercopy() in this file.
 */
static struct {
        struct fault_attr attr;
} fail_usercopy = {
        .attr = FAULT_ATTR_INITIALIZER,
};

/* Handler for the "fail_usercopy=" boot option: hand @str to setup_fault_attr(). */
static int __init setup_fail_usercopy(char *str)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return setup_fault_attr(attr, str);
}
__setup("fail_usercopy=", setup_fail_usercopy);

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

/*
 * Create the "fail_usercopy" fault-attr debugfs entry (NULL parent).
 * Returns 0 on success or the error carried by the returned pointer.
 */
static int __init fail_usercopy_debugfs(void)
{
        struct dentry *dir = fault_create_debugfs_attr("fail_usercopy", NULL,
                                                       &fail_usercopy.attr);

        return IS_ERR(dir) ? PTR_ERR(dir) : 0;
}

late_initcall(fail_usercopy_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

/* Ask the fault-injection core, with a size argument of 1, whether to fail. */
bool should_fail_usercopy(void)
{
        struct fault_attr *attr = &fail_usercopy.attr;

        return should_fail(attr, 1);
}
EXPORT_SYMBOL_GPL(should_fail_usercopy);





















  444 














































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>

/*
 * Access bits, OR-ed together and passed as the 'access' argument of
 * devcgroup_check_permission().
 */
#define DEVCG_ACC_MKNOD 1
#define DEVCG_ACC_READ  2
#define DEVCG_ACC_WRITE 4
#define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE)

/* Device type values passed as the 'type' argument of devcgroup_check_permission(). */
#define DEVCG_DEV_BLOCK 1
#define DEVCG_DEV_CHAR  2
#define DEVCG_DEV_ALL   4  /* this represents all devices */


#if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF)
int devcgroup_check_permission(short type, u32 major, u32 minor,
                               short access);
/*
 * Check a MAY_READ/MAY_WRITE @mask on a device inode against the device
 * cgroup.  Inodes with no i_rdev, or that are neither block nor character
 * devices, are always allowed (0).
 */
static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
        short dev_type;
        short wanted = 0;

        if (likely(!inode->i_rdev))
                return 0;

        if (S_ISBLK(inode->i_mode))
                dev_type = DEVCG_DEV_BLOCK;
        else if (S_ISCHR(inode->i_mode))
                dev_type = DEVCG_DEV_CHAR;
        else
                return 0;

        /* Translate the VFS permission mask into device-cgroup access bits. */
        if (mask & MAY_READ)
                wanted |= DEVCG_ACC_READ;
        if (mask & MAY_WRITE)
                wanted |= DEVCG_ACC_WRITE;

        return devcgroup_check_permission(dev_type, imajor(inode), iminor(inode),
                                          wanted);
}

/*
 * Check whether creating a device node of @mode for @dev is permitted.
 * Non-device modes and the character WHITEOUT_DEV are always allowed (0).
 */
static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
        short dev_type;

        if (S_ISCHR(mode)) {
                if (dev == WHITEOUT_DEV)
                        return 0;
                dev_type = DEVCG_DEV_CHAR;
        } else if (S_ISBLK(mode)) {
                dev_type = DEVCG_DEV_BLOCK;
        } else {
                return 0;
        }

        return devcgroup_check_permission(dev_type, MAJOR(dev), MINOR(dev),
                                          DEVCG_ACC_MKNOD);
}

#else
/* Stubs for builds with neither CONFIG_CGROUP_DEVICE nor CONFIG_CGROUP_BPF: always allow. */
static inline int devcgroup_check_permission(short type, u32 major, u32 minor,
                               short access)
{
        return 0;
}

static inline int devcgroup_inode_permission(struct inode *inode, int mask)
{
        return 0;
}

static inline int devcgroup_inode_mknod(int mode, dev_t dev)
{
        return 0;
}
#endif






































































































































































































































































































































































































































































































































































































































    4 












    4 

    4 





    4 

































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "internal.h"

/*
 *                Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list.  Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail.  Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *                Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory.  In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be used more, hopefully less frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages.  But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot.  This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicate the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * If we have swap we should also take NR_inactive_anon and
 * NR_active_anon into account, so for page cache and anonymous respectively:
 *
 *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
 *   + NR_inactive_anon + NR_active_anon
 *
 *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
 *   + NR_inactive_file + NR_active_file
 *
 * Which can be further simplified to:
 *
 *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
 *
 *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache).  If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead.  And on a full system,
 * the only thing eating into inactive list space is active pages.
 *
 *
 *                Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past.  This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) pages in the userspace workingset, the refaulting page
 * is activated optimistically in the hope that (R - E) pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *                Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *                Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page.  This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

/*
 * Shadow entry layout.  WORKINGSET_SHIFT is the width of the workingset
 * flag.  EVICTION_SHIFT adds up every bit of an unsigned long that is not
 * available to the eviction value: the bits an xarray value cannot hold plus
 * the workingset, node id and memcg id fields.  EVICTION_MASK keeps the
 * remaining low bits.
 */
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT        ((BITS_PER_LONG - BITS_PER_XA_VALUE) +        \
                         WORKINGSET_SHIFT + NODES_SHIFT + \
                         MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK        (~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;

/*
 * Build a shadow entry.  From the low bits up it holds the workingset flag,
 * the node id, the memcg id and the (masked) eviction value.
 */
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
                         bool workingset)
{
        unsigned long entry = eviction & EVICTION_MASK;

        entry = (entry << MEM_CGROUP_ID_SHIFT) | memcgid;
        entry = (entry << NODES_SHIFT) | pgdat->node_id;
        entry = (entry << WORKINGSET_SHIFT) | workingset;

        return xa_mk_value(entry);
}

/*
 * Split a shadow entry back into its fields, peeling them off from the low
 * bits upwards: workingset flag, node id, memcg id, then the eviction value.
 */
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
                          unsigned long *evictionp, bool *workingsetp)
{
        unsigned long bits = xa_to_value(shadow);
        int nid;

        *workingsetp = bits & ((1UL << WORKINGSET_SHIFT) - 1);
        bits >>= WORKINGSET_SHIFT;

        nid = bits & ((1UL << NODES_SHIFT) - 1);
        *pgdat = NODE_DATA(nid);
        bits >>= NODES_SHIFT;

        *memcgidp = bits & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
        bits >>= MEM_CGROUP_ID_SHIFT;

        /* Whatever remains is the eviction value. */
        *evictionp = bits;
}

#ifdef CONFIG_LRU_GEN

/*
 * Record the eviction of @folio in its lruvec's lrugen counters and return
 * the shadow entry describing it.
 *
 * The token stored in the shadow is the lruvec's current min_seq for the
 * folio's type, shifted left by LRU_REFS_WIDTH, OR-ed with max(refs - 1, 0).
 */
static void *lru_gen_eviction(struct folio *folio)
{
        int hist;
        unsigned long token;
        unsigned long min_seq;
        struct lruvec *lruvec;
        struct lru_gen_folio *lrugen;
        int type = folio_is_file_lru(folio);
        int delta = folio_nr_pages(folio);
        int refs = folio_lru_refs(folio);
        bool workingset = folio_test_workingset(folio);
        int tier = lru_tier_from_refs(refs, workingset);
        struct mem_cgroup *memcg = folio_memcg(folio);
        struct pglist_data *pgdat = folio_pgdat(folio);

        /* Compile-time check that the token fits in the eviction bits. */
        BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);

        lruvec = mem_cgroup_lruvec(memcg, pgdat);
        lrugen = &lruvec->lrugen;
        min_seq = READ_ONCE(lrugen->min_seq[type]);
        token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

        /* Charge the folio's pages to the evicted counter for this hist/type/tier. */
        hist = lru_hist_from_seq(min_seq);
        atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

        return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}

/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
                                unsigned long *token, bool *workingset)
{
        int id;
        unsigned long cur_seq, evicted_seq;
        struct pglist_data *pgdat;

        unpack_shadow(shadow, &id, &pgdat, token, workingset);
        *lruvec = mem_cgroup_lruvec(mem_cgroup_from_id(id), pgdat);

        /* Truncate max_seq to the width the token's sequence part can hold. */
        cur_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
        cur_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;

        /* Drop the refs bits to recover the sequence stored at eviction. */
        evicted_seq = *token >> LRU_REFS_WIDTH;

        return abs_diff(cur_seq, evicted_seq) < MAX_NR_GENS;
}

/*
 * Account a refault of @folio described by @shadow.
 *
 * Nothing is counted when the lruvec recorded in the shadow differs from the
 * folio's current lruvec.  Otherwise the refault is always added to
 * WORKINGSET_REFAULT_BASE + type; the remaining accounting and the folio flag
 * updates only happen when lru_gen_test_recent() judged the eviction recent.
 */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
        bool recent;
        int hist, tier, refs;
        bool workingset;
        unsigned long token;
        struct lruvec *lruvec;
        struct lru_gen_folio *lrugen;
        int type = folio_is_file_lru(folio);
        int delta = folio_nr_pages(folio);

        rcu_read_lock();

        recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
        if (lruvec != folio_lruvec(folio))
                goto unlock;

        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

        if (!recent)
                goto unlock;

        lrugen = &lruvec->lrugen;

        hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
        /* Undo the "refs - 1" encoding applied to the token at eviction time. */
        refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
        tier = lru_tier_from_refs(refs, workingset);

        atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);

        /* see folio_add_lru() where folio_set_active() will be called */
        if (lru_gen_in_fault())
                mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

        /*
         * A folio evicted with the workingset flag gets it back; otherwise
         * the stored reference count is written into the folio's flags.
         */
        if (workingset) {
                folio_set_workingset(folio);
                mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
        } else
                set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
        rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

static void *lru_gen_eviction(struct folio *folio)
{
        return NULL;
}

/*
 * !CONFIG_LRU_GEN stub: always reports "not recent". None of the output
 * parameters (@lruvec, @token, @workingset) is written.
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
                                unsigned long *token, bool *workingset)
{
        return false;
}

/* !CONFIG_LRU_GEN stub: refaults are not accounted here, does nothing. */
static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * Non-resident pages have to age together with the in-memory ones,
 * otherwise refault distances measured later cannot be compared against
 * in-memory sizes. Reclaim and LRU operations call this to advance the
 * non-resident age in step with their own progress.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
        struct lruvec *cur = lruvec;

        /*
         * A cgroup is reclaimed by going round-robin over its children,
         * so its LRU order is made up of the LRU orders of those children
         * and every page has a position in the owning cgroup as well as in
         * each of that cgroup's ancestors.
         *
         * Aging the physical inactive list of a leaf therefore also ages
         * the virtual inactive list of every parent, up to and including
         * the root cgroup. Walk the chain and bump each level.
         */
        for (;;) {
                atomic_long_add(nr_pages, &cur->nonresident_age);
                cur = parent_lruvec(cur);
                if (!cur)
                        break;
        }
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @folio: the folio being evicted
 * @target_memcg: the cgroup that is causing the reclaim
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
        struct pglist_data *pgdat = folio_pgdat(folio);
        struct lruvec *target_lruvec;
        unsigned long age;
        int id;

        /* Folio is fully exclusive and pins folio's memory cgroup pointer */
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* The multi-gen LRU builds its own shadow entries. */
        if (lru_gen_enabled())
                return lru_gen_eviction(folio);

        target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);

        /* XXX: target_memcg can be NULL, go through lruvec */
        id = mem_cgroup_id(lruvec_memcg(target_lruvec));

        /* Sample the age before this eviction advances it. */
        age = atomic_long_read(&target_lruvec->nonresident_age);
        age >>= bucket_order;

        workingset_age_nonresident(target_lruvec, folio_nr_pages(folio));

        return pack_shadow(id, pgdat, age, folio_test_workingset(folio));
}

/**
 * workingset_test_recent - check whether a shadow entry belongs to a folio
 * that was evicted recently, and report the workingset bit stored in it.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 * @flush: whether to flush cgroup rstat.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
                                bool flush)
{
        struct mem_cgroup *eviction_memcg;
        struct lruvec *eviction_lruvec;
        struct pglist_data *pgdat;
        unsigned long eviction, refault;
        unsigned long distance, ws_size;
        int memcgid;
        bool recent;

        if (lru_gen_enabled()) {
                rcu_read_lock();
                recent = lru_gen_test_recent(shadow, &eviction_lruvec,
                                             &eviction, workingset);
                rcu_read_unlock();
                return recent;
        }

        rcu_read_lock();
        unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
        eviction <<= bucket_order;

        /*
         * Resolve the stored ID back to a memcg. The memcg may have been
         * deleted after the folio was evicted.
         *
         * In rare cases the ID has been recycled for a new cgroup that is
         * now refaulting a shared folio; the data at hand cannot tell the
         * two apart. That is a rare and limited disturbance, activations
         * are speculative anyway, and it is ultimately up to the aging
         * algorithm to shake out the minimum access frequency for the
         * active cache.
         *
         * XXX: On !CONFIG_MEMCG, this will always return NULL; it
         * would be better if the root_mem_cgroup existed in all
         * configurations instead.
         */
        eviction_memcg = mem_cgroup_from_id(memcgid);
        if (!mem_cgroup_tryget(eviction_memcg))
                eviction_memcg = NULL;
        rcu_read_unlock();

        if (!eviction_memcg && !mem_cgroup_disabled())
                return false;

        /*
         * Stats are flushed (which may sleep) only after leaving the RCU
         * read section above.
         *
         * workingset_test_recent() may itself be called from inside an RCU
         * read section (e.g. by cachestat); such callers have to pass
         * @flush as false to skip the flush.
         *
         * XXX: With per-memcg flushing and thresholding, is ratelimiting
         * still needed here?
         */
        if (flush)
                mem_cgroup_flush_stats_ratelimited(eviction_memcg);

        eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
        refault = atomic_long_read(&eviction_lruvec->nonresident_age);

        /*
         * Refault distance.
         *
         * Unsigned subtraction yields the right distance across
         * nonresident_age overflows in most cases. The exception: shadow
         * entries normally live only briefly - they are refaulted or
         * reclaimed with the inode before getting too old - but
         * nonresident_age can still lap an entry in the field. The distance
         * then comes out falsely small and causes a false activation if
         * that old entry does refault. Earlier kernels deactivated
         * unconditionally on *every* reclaim invocation for the longest
         * time, so an occasional inappropriate activation putting pressure
         * on the active list is not a problem.
         */
        distance = (refault - eviction) & EVICTION_MASK;

        /*
         * Measure the distance against the current workingset size. Pages
         * that could not have stayed resident even with all memory given
         * to the workingset are not activated. Anon only takes part in the
         * competition when there is free swap space.
         */
        ws_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
        if (!file)
                ws_size += lruvec_page_state(eviction_lruvec,
                                             NR_INACTIVE_FILE);
        if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
                ws_size += lruvec_page_state(eviction_lruvec, NR_ACTIVE_ANON);
                if (file)
                        ws_size += lruvec_page_state(eviction_lruvec,
                                                     NR_INACTIVE_ANON);
        }

        /* Drop the reference taken by mem_cgroup_tryget() above. */
        mem_cgroup_put(eviction_memcg);

        recent = distance <= ws_size;
        return recent;
}

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Works out the refault distance of the evicted folio and judges it in
 * the context of the node and the memcg whose memory pressure caused the
 * eviction.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
        bool file = folio_is_file_lru(folio);
        struct lruvec *lruvec;
        bool workingset;
        long nr;

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (lru_gen_enabled()) {
                lru_gen_refault(folio, shadow);
                return;
        }

        /*
         * Whether to activate is decided at the level where the eviction
         * happened, because that is where the LRU order during folio
         * reclaim is determined.
         *
         * The cgroup that ends up owning the folio, though, is the one
         * actually seeing the refault. The folio has to be locked so that
         * folio_memcg() stays stable throughout.
         */
        nr = folio_nr_pages(folio);
        lruvec = mem_cgroup_lruvec(folio_memcg(folio), folio_pgdat(folio));

        mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

        if (!workingset_test_recent(shadow, file, &workingset, true))
                return;

        folio_set_active(folio);
        workingset_age_nonresident(lruvec, nr);
        mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

        /* Nothing more to do unless the folio was active prior to eviction */
        if (!workingset)
                return;

        folio_set_workingset(folio);
        /*
         * XXX: Move to folio_add_lru() when it supports new vs
         * putback
         */
        lru_note_cost_refault(folio);
        mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 *
 * Advances the non-resident age of the folio's lruvec by the folio's size.
 */
void workingset_activation(struct folio *folio)
{
        /*
         * Non-memcg pages are filtered out here: unmap, for example, can
         * call mark_page_accessed() on VDSO pages.
         */
        if (!mem_cgroup_disabled() && !folio_memcg_charged(folio))
                return;

        workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

/* xa_nodes holding only shadow entries; see workingset_update_node(). */
struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
        struct address_space *mapping;
        struct page *page = virt_to_page(node);

        /*
         * Track non-empty nodes that contain only shadow entries;
         * unlink those that contain pages or are being freed.
         *
         * Avoid acquiring the list_lru lock when the nodes are
         * already where they should be. The list_empty() test is safe
         * as node->private_list is protected by the i_pages lock.
         */
        mapping = container_of(node->array, struct address_space, i_pages);
        lockdep_assert_held(&mapping->i_pages.xa_lock);

        if (node->count && node->count == node->nr_values) {
                if (list_empty(&node->private_list)) {
                        list_lru_add_obj(&shadow_nodes, &node->private_list);
                        __inc_node_page_state(page, WORKINGSET_NODES);
                }
        } else {
                if (!list_empty(&node->private_list)) {
                        list_lru_del_obj(&shadow_nodes, &node->private_list);
                        __dec_node_page_state(page, WORKINGSET_NODES);
                }
        }
}

/*
 * count_shadow_nodes - shrinker ->count_objects callback for shadow nodes
 *
 * Return: SHRINK_EMPTY when no shadow nodes are listed for @sc, 0 when the
 * number of nodes is within the computed limit, otherwise the excess.
 */
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
                                        struct shrink_control *sc)
{
        unsigned long nr_nodes;
        unsigned long nr_pages;
        unsigned long limit;

        nr_nodes = list_lru_shrink_count(&shadow_nodes, sc);
        if (!nr_nodes)
                return SHRINK_EMPTY;

        /*
         * Pick a reasonable cap on the number of nodes holding shadow
         * entries. There is no point in keeping more shadow entries than
         * there can be pages on the active list, because refault
         * distances larger than that are dismissed.
         *
         * As memory grows the active list converges toward 100% of the
         * page cache with only a tiny inactive list, so the total cache
         * size is assumed for it.
         *
         * Nodes can be sparse, down to a single shadow entry. One node per
         * eligible shadow entry is obviously not affordable, so the
         * compromise is a worst-case density of 1/8th; below that not all
         * eligible refaults can be detected anymore.
         *
         * On 64-bit with 7 xa_nodes per page and 64 slots
         * each, this will reclaim shadow entries when they consume
         * ~1.8% of available memory:
         *
         * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
         */
#ifdef CONFIG_MEMCG
        if (sc->memcg) {
                struct lruvec *lruvec;
                int lru;

                mem_cgroup_flush_stats_ratelimited(sc->memcg);
                lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));

                /* LRU pages plus slab (kept in bytes) local to this memcg. */
                nr_pages = 0;
                for (lru = 0; lru < NR_LRU_LISTS; lru++)
                        nr_pages += lruvec_page_state_local(lruvec,
                                                            NR_LRU_BASE + lru);
                nr_pages += lruvec_page_state_local(
                        lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
                nr_pages += lruvec_page_state_local(
                        lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
        } else
#endif
                nr_pages = node_present_pages(sc->nid);

        limit = nr_pages >> (XA_CHUNK_SHIFT - 3);

        return nr_nodes > limit ? nr_nodes - limit : 0;
}

/*
 * shadow_lru_isolate - list_lru walk callback that reclaims one shadow node
 * @item: list entry embedded in the xa_node (its private_list)
 * @lru: the list being walked; its lock is held on entry
 * @arg: unused
 *
 * Return: LRU_RETRY if the i_pages lock or the host inode's i_lock could not
 * be taken with a trylock, LRU_REMOVED_RETRY once the node has been taken off
 * the list. On every path lru->lock has been dropped before returning.
 */
static enum lru_status shadow_lru_isolate(struct list_head *item,
                                          struct list_lru_one *lru,
                                          void *arg) __must_hold(lru->lock)
{
        struct xa_node *node = container_of(item, struct xa_node, private_list);
        struct address_space *mapping;
        int ret;

        /*
         * Page cache insertions and deletions synchronously maintain
         * the shadow node LRU under the i_pages lock and the
         * &lru->lock. Because the page cache tree is emptied before
         * the inode can be destroyed, holding the &lru->lock pins any
         * address_space that has nodes on the LRU.
         *
         * We can then safely transition to the i_pages lock to
         * pin only the address_space of the particular node we want
         * to reclaim, take the node off-LRU, and drop the &lru->lock.
         */

        mapping = container_of(node->array, struct address_space, i_pages);

        /* Coming from the list, invert the lock order */
        if (!xa_trylock(&mapping->i_pages)) {
                /* Could not get i_pages: give up the list lock and retry. */
                spin_unlock_irq(&lru->lock);
                ret = LRU_RETRY;
                goto out;
        }

        /* For page cache we need to hold i_lock */
        if (mapping->host != NULL) {
                if (!spin_trylock(&mapping->host->i_lock)) {
                        /* Back out of both locks taken so far and retry. */
                        xa_unlock(&mapping->i_pages);
                        spin_unlock_irq(&lru->lock);
                        ret = LRU_RETRY;
                        goto out;
                }
        }

        list_lru_isolate(lru, item);
        __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);

        /*
         * Plain spin_unlock() here; the _irq variant is used further down
         * when the i_pages lock is released.
         */
        spin_unlock(&lru->lock);

        /*
         * The nodes should only contain one or more shadow entries,
         * no pages, so we expect to be able to remove them all and
         * delete and free the empty node afterwards.
         */
        if (WARN_ON_ONCE(!node->nr_values))
                goto out_invalid;
        if (WARN_ON_ONCE(node->count != node->nr_values))
                goto out_invalid;
        xa_delete_node(node, workingset_update_node);
        __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);

out_invalid:
        /* Reached on success as well as from the WARN paths above. */
        xa_unlock_irq(&mapping->i_pages);
        if (mapping->host != NULL) {
                if (mapping_shrinkable(mapping))
                        inode_add_lru(mapping->host);
                spin_unlock(&mapping->host->i_lock);
        }
        ret = LRU_REMOVED_RETRY;
out:
        cond_resched();
        return ret;
}

/*
 * scan_shadow_nodes - shrinker ->scan_objects callback for shadow nodes
 *
 * Walks shadow_nodes for @sc with shadow_lru_isolate() as the per-item
 * callback and hands back whatever the walk returns.
 */
static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
                                       struct shrink_control *sc)
{
        unsigned long ret;

        /* list_lru lock nests inside the IRQ-safe i_pages lock */
        ret = list_lru_shrink_walk_irq(&shadow_nodes, sc,
                                       shadow_lru_isolate, NULL);
        return ret;
}

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 *
 * This key is handed to list_lru_init_memcg_key() in workingset_init()
 * when shadow_nodes is set up.
 */
static struct lock_class_key shadow_nodes_key;

/*
 * workingset_init - size the eviction timestamp buckets and register the
 * shadow node shrinker.
 *
 * Return: 0 on success, -ENOMEM if the shrinker cannot be allocated, or the
 * error returned by list_lru_init_memcg_key().
 */
static int __init workingset_init(void)
{
        struct shrinker *workingset_shadow_shrinker;
        unsigned int timestamp_bits;
        unsigned int max_order;
        int ret = -ENOMEM;

        BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
        /*
         * Calculate the eviction bucket size to cover the longest
         * actionable refault distance, which is currently half of
         * memory (totalram_pages/2). However, memory hotplug may add
         * some more pages at runtime, so keep working with up to
         * double the initial memory by using totalram_pages as-is.
         */
        timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
        max_order = fls_long(totalram_pages() - 1);
        /* bucket_order is only raised when the timestamp bits fall short. */
        if (max_order > timestamp_bits)
                bucket_order = max_order - timestamp_bits;
        /* All three values are unsigned int, so print them with %u. */
        pr_info("workingset: timestamp_bits=%u max_order=%u bucket_order=%u\n",
               timestamp_bits, max_order, bucket_order);

        workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
                                                    SHRINKER_MEMCG_AWARE,
                                                    "mm-shadow");
        if (!workingset_shadow_shrinker)
                goto err;

        ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker,
                                      &shadow_nodes_key);
        if (ret)
                goto err_list_lru;

        workingset_shadow_shrinker->count_objects = count_shadow_nodes;
        workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
        /* ->count reports only fully expendable nodes */
        workingset_shadow_shrinker->seeks = 0;

        shrinker_register(workingset_shadow_shrinker);
        return 0;
err_list_lru:
        /* The shrinker was allocated but never registered; just free it. */
        shrinker_free(workingset_shadow_shrinker);
err:
        return ret;
}
module_init(workingset_init);





















































































































































































































































































































































 1257 


























 1259 




 1257 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/audit.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/**
 * tomoyo_print_bprm - Print "struct linux_binprm" for auditing.
 *
 * @bprm: Pointer to "struct linux_binprm".
 * @dump: Pointer to "struct tomoyo_page_dump".
 *
 * Returns the contents of @bprm on success, NULL if the buffer cannot be
 * allocated. If tomoyo_dump_page() fails, a placeholder string with "..."
 * in place of the arguments and environment is returned instead.
 *
 * The output has the form argv[]={ "a" "b" } envp[]={ "X=1" }. Bytes in the
 * range '!' to '~' are copied as-is except that a backslash is doubled; any
 * other non-NUL byte is written as a backslash followed by three octal
 * digits. Once the buffer is nearly full, the remaining strings of the
 * current section are dropped and "... " is emitted in their place.
 *
 * This function uses kzalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_bprm(struct linux_binprm *bprm,
                               struct tomoyo_page_dump *dump)
{
        static const int tomoyo_buffer_len = 4096 * 2;
        char *buffer = kzalloc(tomoyo_buffer_len, GFP_NOFS);
        char *cp;
        char *last_start;
        int len;
        unsigned long pos = bprm->p;
        int offset = pos % PAGE_SIZE;
        int argv_count = bprm->argc;
        int envp_count = bprm->envc;
        bool truncated = false;

        if (!buffer)
                return NULL;
        len = snprintf(buffer, tomoyo_buffer_len - 1, "argv[]={ ");
        cp = buffer + len;
        /* No arguments at all: close argv[] and open envp[] right away. */
        if (!argv_count) {
                memmove(cp, "} envp[]={ ", 11);
                cp += 11;
        }
        /* last_start marks where the string currently being copied begins. */
        last_start = cp;
        while (argv_count || envp_count) {
                if (!tomoyo_dump_page(bprm, pos, dump))
                        goto out;
                /* Advance to the start of the next page for the next dump. */
                pos += PAGE_SIZE - offset;
                /* Read. */
                while (offset < PAGE_SIZE) {
                        const char *kaddr = dump->data;
                        const unsigned char c = kaddr[offset++];

                        /* First byte of a string: emit the opening quote. */
                        if (cp == last_start)
                                *cp++ = '"';
                        if (cp >= buffer + tomoyo_buffer_len - 32) {
                                /* Reserve some room for "..." string. */
                                truncated = true;
                        } else if (c == '\\') {
                                *cp++ = '\\';
                                *cp++ = '\\';
                        } else if (c > ' ' && c < 127) {
                                *cp++ = c;
                        } else if (!c) {
                                /* End of string: closing quote and separator. */
                                *cp++ = '"';
                                *cp++ = ' ';
                                last_start = cp;
                        } else {
                                /* Everything else is written as \ooo (octal). */
                                *cp++ = '\\';
                                *cp++ = (c >> 6) + '0';
                                *cp++ = ((c >> 3) & 7) + '0';
                                *cp++ = (c & 7) + '0';
                        }
                        /* The counters below only change at a terminating NUL. */
                        if (c)
                                continue;
                        if (argv_count) {
                                if (--argv_count == 0) {
                                        /*
                                         * Rewind to the start of the string
                                         * that did not fit and mark the cut.
                                         */
                                        if (truncated) {
                                                cp = last_start;
                                                memmove(cp, "... ", 4);
                                                cp += 4;
                                        }
                                        memmove(cp, "} envp[]={ ", 11);
                                        cp += 11;
                                        last_start = cp;
                                        /* envp[] starts with a clean truncation state. */
                                        truncated = false;
                                }
                        } else if (envp_count) {
                                if (--envp_count == 0) {
                                        if (truncated) {
                                                cp = last_start;
                                                memmove(cp, "... ", 4);
                                                cp += 4;
                                        }
                                }
                        }
                        /* All strings consumed; the rest of the page is ignored. */
                        if (!argv_count && !envp_count)
                                break;
                }
                /* Subsequent pages are read from their beginning. */
                offset = 0;
        }
        *cp++ = '}';
        *cp = '\0';
        return buffer;
out:
        /* Page dump failed: overwrite whatever was built with a placeholder. */
        snprintf(buffer, tomoyo_buffer_len - 1,
                 "argv[]={ ... } envp[]= { ... }");
        return buffer;
}

/**
 * tomoyo_filetype - Get string representation of file type.
 *
 * @mode: Mode value for stat().
 *
 * Returns the tomoyo_condition_keyword[] entry matching the S_IFMT bits of
 * @mode. A mode with no type bits set is reported like a regular file.
 */
static inline const char *tomoyo_filetype(const umode_t mode)
{
        int keyword;

        switch (mode & S_IFMT) {
        case 0:
        case S_IFREG:
                keyword = TOMOYO_TYPE_IS_FILE;
                break;
        case S_IFDIR:
                keyword = TOMOYO_TYPE_IS_DIRECTORY;
                break;
        case S_IFLNK:
                keyword = TOMOYO_TYPE_IS_SYMLINK;
                break;
        case S_IFIFO:
                keyword = TOMOYO_TYPE_IS_FIFO;
                break;
        case S_IFSOCK:
                keyword = TOMOYO_TYPE_IS_SOCKET;
                break;
        case S_IFBLK:
                keyword = TOMOYO_TYPE_IS_BLOCK_DEV;
                break;
        case S_IFCHR:
                keyword = TOMOYO_TYPE_IS_CHAR_DEV;
                break;
        default:
                return "unknown"; /* This should not happen. */
        }
        return tomoyo_condition_keyword[keyword];
}

/**
 * tomoyo_print_header - Get header line of audit log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns string representation, or NULL if the buffer cannot be allocated
 * or the formatted header does not fit into it.
 *
 * This function uses kmalloc(), so caller must kfree() if this function
 * didn't return NULL.
 */
static char *tomoyo_print_header(struct tomoyo_request_info *r)
{
        static const int tomoyo_buffer_len = 4096;
        struct tomoyo_obj_info *obj = r->obj;
        const pid_t gpid = task_pid_nr(current);
        struct tomoyo_time stamp;
        char *buffer;
        int pos;
        u8 i;

        buffer = kmalloc(tomoyo_buffer_len, GFP_NOFS);
        if (!buffer)
                return NULL;

        tomoyo_convert_time(ktime_get_real_seconds(), &stamp);

        /* Timestamp, profile/mode/result and the credentials of current. */
        pos = snprintf(buffer, tomoyo_buffer_len - 1,
                       "#%04u/%02u/%02u %02u:%02u:%02u# profile=%u mode=%s granted=%s (global-pid=%u) task={ pid=%u ppid=%u uid=%u gid=%u euid=%u egid=%u suid=%u sgid=%u fsuid=%u fsgid=%u }",
                       stamp.year, stamp.month, stamp.day, stamp.hour,
                       stamp.min, stamp.sec, r->profile, tomoyo_mode[r->mode],
                       str_yes_no(r->granted), gpid, tomoyo_sys_getpid(),
                       tomoyo_sys_getppid(),
                       from_kuid(&init_user_ns, current_uid()),
                       from_kgid(&init_user_ns, current_gid()),
                       from_kuid(&init_user_ns, current_euid()),
                       from_kgid(&init_user_ns, current_egid()),
                       from_kuid(&init_user_ns, current_suid()),
                       from_kgid(&init_user_ns, current_sgid()),
                       from_kuid(&init_user_ns, current_fsuid()),
                       from_kgid(&init_user_ns, current_fsgid()));
        if (!obj)
                goto check_len;

        /* Fetch the object attributes once per tomoyo_obj_info. */
        if (!obj->validate_done) {
                tomoyo_get_attributes(obj);
                obj->validate_done = true;
        }

        for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) {
                const struct tomoyo_mini_stat *stat = &obj->stat[i];
                const unsigned int path_no = (i >> 1) + 1;
                unsigned int dev;
                umode_t mode;

                if (!obj->stat_valid[i])
                        continue;

                if (i & 1) {
                        /* Odd slots are printed as the parent of pathN. */
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
                                        " path%u.parent={ uid=%u gid=%u ino=%lu perm=0%o }",
                                        path_no,
                                        from_kuid(&init_user_ns, stat->uid),
                                        from_kgid(&init_user_ns, stat->gid),
                                        (unsigned long)stat->ino,
                                        stat->mode & S_IALLUGO);
                } else {
                        /* Even slots are printed as pathN itself. */
                        dev = stat->dev;
                        mode = stat->mode;
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos,
                                        " path%u={ uid=%u gid=%u ino=%lu major=%u minor=%u perm=0%o type=%s",
                                        path_no,
                                        from_kuid(&init_user_ns, stat->uid),
                                        from_kgid(&init_user_ns, stat->gid),
                                        (unsigned long)stat->ino,
                                        MAJOR(dev), MINOR(dev),
                                        mode & S_IALLUGO,
                                        tomoyo_filetype(mode));
                        /* Device nodes additionally report their rdev. */
                        if (S_ISCHR(mode) || S_ISBLK(mode)) {
                                dev = stat->rdev;
                                pos += snprintf(buffer + pos,
                                                tomoyo_buffer_len - 1 - pos,
                                                " dev_major=%u dev_minor=%u",
                                                MAJOR(dev), MINOR(dev));
                        }
                        pos += snprintf(buffer + pos,
                                        tomoyo_buffer_len - 1 - pos, " }");
                }
        }
check_len:
        /* snprintf() reports the untruncated length; reject overlong output. */
        if (pos >= tomoyo_buffer_len - 1) {
                kfree(buffer);
                return NULL;
        }
        return buffer;
}

/**
 * tomoyo_init_log - Build the text of one audit log entry.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * The entry consists of the header from tomoyo_print_header(), an optional
 * " exec={ ... }" or " symlink.target=..." section, the domainname on its
 * own line, and finally the text produced from @fmt and @args.
 *
 * Returns pointer to the kzalloc()ed buffer, NULL on failure.
 * Caller must kfree() the buffer if this function didn't return NULL.
 */
char *tomoyo_init_log(struct tomoyo_request_info *r, int len, const char *fmt,
                      va_list args)
{
        const char *domainname = r->domain->domainname->name;
        const char *header = tomoyo_print_header(r);
        const char *symlink = NULL;
        char *realpath = NULL;
        char *bprm_info = NULL;
        char *buf = NULL;
        int pos;

        if (!header)
                return NULL;
        /* +10 is for '\n' etc. and '\0'. */
        len += 10 + strlen(header) + strlen(domainname);
        if (r->ee) {
                struct linux_binprm *bprm = r->ee->bprm;

                realpath = tomoyo_realpath_from_path(&bprm->file->f_path);
                bprm_info = tomoyo_print_bprm(bprm, &r->ee->dump);
                if (!(realpath && bprm_info))
                        goto out;
                /* +80 is for " exec={ realpath=\"%s\" argc=%d envc=%d %s }" */
                len += 80 + strlen(realpath) + strlen(bprm_info);
        } else if (r->obj && r->obj->symlink_target) {
                symlink = r->obj->symlink_target->name;
                /* +18 is for " symlink.target=\"%s\"" */
                len += strlen(symlink) + 18;
        }
        len = kmalloc_size_roundup(len);
        buf = kzalloc(len, GFP_NOFS);
        if (!buf)
                goto out;
        /* The buffer is zeroed; never writing its last byte keeps it '\0'. */
        len--;
        pos = snprintf(buf, len, "%s", header);
        if (realpath)
                pos += snprintf(buf + pos, len - pos,
                                " exec={ realpath=\"%s\" argc=%d envc=%d %s }",
                                realpath, r->ee->bprm->argc,
                                r->ee->bprm->envc, bprm_info);
        else if (symlink)
                pos += snprintf(buf + pos, len - pos, " symlink.target=\"%s\"",
                                symlink);
        pos += snprintf(buf + pos, len - pos, "\n%s\n", domainname);
        vsnprintf(buf + pos, len - pos, fmt, args);
out:
        /* The temporaries are released on both success and failure. */
        kfree(header);
        kfree(bprm_info);
        kfree(realpath);
        return buf;
}

/*
 * Wait queue for /sys/kernel/security/tomoyo/audit.
 * Woken by tomoyo_write_log2() after an entry is queued and registered
 * with poll_wait() in tomoyo_poll_log().
 */
static DECLARE_WAIT_QUEUE_HEAD(tomoyo_log_wait);

/* Structure for audit log. One instance per queued audit log entry. */
struct tomoyo_log {
        /* Linkage on the "tomoyo_log" list. */
        struct list_head list;
        /* Text built by tomoyo_init_log(); handed to the reader's read_buf. */
        char *log;
        /* Bytes charged to tomoyo_memory_used[TOMOYO_MEMORY_AUDIT]. */
        int size;
};

/*
 * The list for "struct tomoyo_log".
 * tomoyo_write_log2() appends at the tail, tomoyo_read_log() removes from
 * the head.
 */
static LIST_HEAD(tomoyo_log);

/*
 * Lock for "struct list_head tomoyo_log".
 * Also held while tomoyo_log_count and the TOMOYO_MEMORY_AUDIT usage
 * counter are updated.
 */
static DEFINE_SPINLOCK(tomoyo_log_lock);

/*
 * Length of "struct list_head tomoyo_log".
 * Updated under tomoyo_log_lock; tomoyo_get_audit() and tomoyo_poll_log()
 * read it without taking the lock.
 */
static unsigned int tomoyo_log_count;

/**
 * tomoyo_get_audit - Decide whether a request should be logged.
 *
 * @ns:          Pointer to "struct tomoyo_policy_namespace".
 * @profile:     Profile number.
 * @index:       Index number of functionality.
 * @matched_acl: Pointer to "struct tomoyo_acl_info". Maybe NULL.
 * @is_granted:  True if granted log, false otherwise.
 *
 * Nothing is audited before the policy is loaded or once the number of
 * queued entries reaches the profile's TOMOYO_PREF_MAX_AUDIT_LOG value.
 * For granted requests an explicit grant_log setting on the matched ACL's
 * condition takes precedence over the profile configuration.
 *
 * Returns true if this request should be audited, false otherwise.
 */
static bool tomoyo_get_audit(const struct tomoyo_policy_namespace *ns,
                             const u8 profile, const u8 index,
                             const struct tomoyo_acl_info *matched_acl,
                             const bool is_granted)
{
        const u8 category = tomoyo_index2category[index] +
                TOMOYO_MAX_MAC_INDEX;
        struct tomoyo_profile *prof;
        u8 mode;

        if (!tomoyo_policy_loaded)
                return false;
        prof = tomoyo_profile(ns, profile);
        if (tomoyo_log_count >= prof->pref[TOMOYO_PREF_MAX_AUDIT_LOG])
                return false;
        if (is_granted && matched_acl && matched_acl->cond) {
                if (matched_acl->cond->grant_log != TOMOYO_GRANTLOG_AUTO)
                        return matched_acl->cond->grant_log ==
                                TOMOYO_GRANTLOG_YES;
        }
        /* Fall back: per-index -> per-category -> profile default. */
        mode = prof->config[index];
        if (mode == TOMOYO_CONFIG_USE_DEFAULT) {
                mode = prof->config[category];
                if (mode == TOMOYO_CONFIG_USE_DEFAULT)
                        mode = prof->default_config;
        }
        return mode & (is_granted ? TOMOYO_CONFIG_WANT_GRANT_LOG :
                       TOMOYO_CONFIG_WANT_REJECT_LOG);
}

/**
 * tomoyo_write_log2 - Queue an audit log entry.
 *
 * @r:    Pointer to "struct tomoyo_request_info".
 * @len:  Buffer size needed for @fmt and @args.
 * @fmt:  The printf()'s format string.
 * @args: va_list structure for @fmt.
 *
 * The entry is silently dropped when auditing is not wanted for this
 * request, when memory allocation fails, or when queueing it would reach
 * the TOMOYO_MEMORY_AUDIT quota.
 *
 * Returns nothing.
 */
void tomoyo_write_log2(struct tomoyo_request_info *r, int len, const char *fmt,
                       va_list args)
{
        struct tomoyo_log *entry;
        bool queued;
        char *buf;

        if (!tomoyo_get_audit(r->domain->ns, r->profile, r->type,
                              r->matched_acl, r->granted))
                return;
        buf = tomoyo_init_log(r, len, fmt, args);
        if (!buf)
                return;
        entry = kzalloc(sizeof(*entry), GFP_NOFS);
        if (!entry) {
                kfree(buf);
                return;
        }
        entry->log = buf;
        len = kmalloc_size_roundup(strlen(buf) + 1);
        /*
         * The entry->size is used for memory quota checks.
         * Don't go beyond strlen(entry->log).
         */
        entry->size = len + kmalloc_size_roundup(sizeof(*entry));
        spin_lock(&tomoyo_log_lock);
        /* A quota of 0 means "no limit". */
        queued = !tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT] ||
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] + entry->size <
                tomoyo_memory_quota[TOMOYO_MEMORY_AUDIT];
        if (queued) {
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] += entry->size;
                list_add_tail(&entry->list, &tomoyo_log);
                tomoyo_log_count++;
        }
        spin_unlock(&tomoyo_log_lock);
        if (!queued) {
                kfree(buf);
                kfree(entry);
                return;
        }
        wake_up(&tomoyo_log_wait);
}

/**
 * tomoyo_write_log - Write an audit log.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @fmt: The printf()'s format string, followed by parameters.
 *
 * Measures the formatted length of the message first, then passes that
 * length and the arguments to tomoyo_write_log2().
 *
 * Returns nothing.
 */
void tomoyo_write_log(struct tomoyo_request_info *r, const char *fmt, ...)
{
        va_list args;
        va_list sizing;
        int len;

        va_start(args, fmt);
        /* Dry run on a copy of the arguments; +1 is for '\0'. */
        va_copy(sizing, args);
        len = 1 + vsnprintf(NULL, 0, fmt, sizing);
        va_end(sizing);
        tomoyo_write_log2(r, len, fmt, args);
        va_end(args);
}

/**
 * tomoyo_read_log - Read an audit log.
 *
 * @head: Pointer to "struct tomoyo_io_buffer".
 *
 * Returns nothing.
 */
void tomoyo_read_log(struct tomoyo_io_buffer *head)
{
        struct tomoyo_log *ptr = NULL;

        if (head->r.w_pos)
                return;
        kfree(head->read_buf);
        head->read_buf = NULL;
        spin_lock(&tomoyo_log_lock);
        if (!list_empty(&tomoyo_log)) {
                ptr = list_entry(tomoyo_log.next, typeof(*ptr), list);
                list_del(&ptr->list);
                tomoyo_log_count--;
                tomoyo_memory_used[TOMOYO_MEMORY_AUDIT] -= ptr->size;
        }
        spin_unlock(&tomoyo_log_lock);
        if (ptr) {
                head->read_buf = ptr->log;
                head->r.w[head->r.w_pos++] = head->read_buf;
                kfree(ptr);
        }
}

/**
 * tomoyo_poll_log - Wait for an audit log.
 *
 * @file: Pointer to "struct file".
 * @wait: Pointer to "poll_table". Maybe NULL.
 *
 * The queue length is checked once before and once after registering on
 * the audit wait queue.
 *
 * Returns EPOLLIN | EPOLLRDNORM when ready to read an audit log.
 */
__poll_t tomoyo_poll_log(struct file *file, poll_table *wait)
{
        if (!tomoyo_log_count) {
                poll_wait(file, &tomoyo_log_wait, wait);
                if (!tomoyo_log_count)
                        return 0;
        }
        return EPOLLIN | EPOLLRDNORM;
}














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 






    3 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        IP multicast routing support for mrouted 3.6/3.8
 *
 *                (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *          Linux Consultancy and Custom Driver Development
 *
 *        Fixes:
 *        Michael Chastain        :        Incorrect size of copying.
 *        Alan Cox                :        Added the cache manager code
 *        Alan Cox                :        Fixed the clone/copy bug and device race.
 *        Mike McLagan                :        Routing by source
 *        Malcolm Beattie                :        Buffer handling fixes.
 *        Alexey Kuznetsov        :        Double buffer free and other fixes.
 *        SVR Anand                :        Fixed several multicast bugs and problems.
 *        Alexey Kuznetsov        :        Status, optimisations and more.
 *        Brad Parker                :        Better behaviour on mrouted upcall
 *                                        overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *        Pavlin Ivanov Radoslavov:        PIMv2 Registers must checksum only PIM header
 *                                        Relax this requirement to work with older peers.
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/cache.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
#include <linux/rhashtable.h>
#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/rtnh.h>
#include <net/inet_dscp.h>

#include <linux/nospec.h>

/* IPMR policy rule: the generic fib_rule with no extra fields. Its size is
 * what ipmr_rules_ops_template reports as .rule_size.
 */
struct ipmr_rule {
        struct fib_rule                common;
};

/* Lookup result: ipmr_rule_action() stores the selected table here and
 * ipmr_fib_lookup() hands it back to its caller.
 */
struct ipmr_result {
        struct mr_table                *mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
 * Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_SPINLOCK(mrt_lock);

/* Fetch the net_device attached to @vif through rcu_dereference().
 * NOTE(review): callers presumably run inside an RCU read-side critical
 * section - confirm at the call sites.
 */
static struct net_device *vif_dev_read(const struct vif_device *vif)
{
        struct net_device *dev = rcu_dereference(vif->dev);

        return dev;
}

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We have returned to Alan's original scheme. The hash table of resolved
 * entries is changed only in process context and is protected by the
 * weak lock mrt_lock. The queue of unresolved entries is protected by
 * the strong spinlock mfc_unres_lock.
 *
 * As a result, the data path is entirely free of exclusive locks.
 */

/* Slab cache pointer, read-only after init.
 * NOTE(review): presumably the cache for struct mfc_cache entries -
 * confirm where it is created.
 */
static struct kmem_cache *mrt_cachep __ro_after_init;

/* Forward declarations for functions defined further down in this file. */
static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);

static void ip_mr_forward(struct net *net, struct mr_table *mrt,
                          struct net_device *dev, struct sk_buff *skb,
                          struct mfc_cache *cache, int local);
static int ipmr_cache_report(const struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
                                 int cmd);
static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
/* Walk every mr_table linked on net->ipv4.mr_tables, with @mrt as cursor.
 * The last argument to list_for_each_entry_rcu() is true when RTNL is held
 * or the table list is empty.
 * NOTE(review): presumably this is the lockdep condition that permits the
 * walk outside an RCU read-side section - confirm against the
 * list_for_each_entry_rcu() documentation.
 */
#define ipmr_for_each_table(mrt, net)                                        \
        list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list,        \
                                lockdep_rtnl_is_held() ||                \
                                list_empty(&net->ipv4.mr_tables))

/* A table may be freed unless @net both passes check_net() and is reported
 * as initialized by net_initialized().
 */
static bool ipmr_can_free_table(struct net *net)
{
        return !(check_net(net) && net_initialized(net));
}

static struct mr_table *ipmr_mr_table_iter(struct net *net,
                                           struct mr_table *mrt)
{
        struct mr_table *ret;

        if (!mrt)
                ret = list_entry_rcu(net->ipv4.mr_tables.next,
                                     struct mr_table, list);
        else
                ret = list_entry_rcu(mrt->list.next,
                                     struct mr_table, list);

        if (&ret->list == &net->ipv4.mr_tables)
                return NULL;
        return ret;
}

/* Find the table of @net whose id equals @id by walking the table list
 * with ipmr_for_each_table(). Returns NULL when no table matches.
 */
static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
{
        struct mr_table *found = NULL;
        struct mr_table *mrt;

        ipmr_for_each_table(mrt, net) {
                if (mrt->id != id)
                        continue;
                found = mrt;
                break;
        }
        return found;
}

/* Same lookup as __ipmr_get_table(), performed under rcu_read_lock().
 * NOTE(review): the pointer is returned after rcu_read_unlock(), so callers
 * presumably keep the table alive by other means - confirm.
 */
static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        struct mr_table *table;

        rcu_read_lock();
        table = __ipmr_get_table(net, id);
        rcu_read_unlock();

        return table;
}

/* Select the multicast routing table for flow @flp4 by running the IPMR
 * policy rules of @net. On success the table is stored in @mrt and 0 is
 * returned; a negative value from fib_rules_lookup() is passed through.
 */
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
                           struct mr_table **mrt)
{
        struct flowi *fl = flowi4_to_flowi(flp4);
        struct ipmr_result res;
        struct fib_lookup_arg arg = {
                .flags = FIB_LOOKUP_NOREF,
                .result = &res,
        };
        int err;

        /* update flow if oif or iif point to device enslaved to l3mdev */
        l3mdev_update_flow(net, fl);

        err = fib_rules_lookup(net->ipv4.mr_rules_ops, fl, 0, &arg);
        if (err < 0)
                return err;

        /* res.mrt was filled in by ipmr_rule_action() */
        *mrt = res.mrt;
        return 0;
}

/* fib_rules "action" callback.  Only FR_ACT_TO_TBL leads to a table lookup;
 * unreachable and prohibit rules map to -ENETUNREACH and -EACCES, anything
 * else (blackhole included) to -EINVAL.  When the selected table does not
 * exist -EAGAIN is returned, otherwise the table is stored in the lookup
 * result and 0 is returned.
 */
static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
                            int flags, struct fib_lookup_arg *arg)
{
        struct ipmr_result *res = arg->result;
        struct mr_table *table;

        switch (rule->action) {
        case FR_ACT_UNREACHABLE:
                return -ENETUNREACH;
        case FR_ACT_PROHIBIT:
                return -EACCES;
        case FR_ACT_TO_TBL:
                break;
        case FR_ACT_BLACKHOLE:
        default:
                return -EINVAL;
        }

        arg->table = fib_rule_get_table(rule, arg);

        table = __ipmr_get_table(rule->fr_net, arg->table);
        if (table) {
                res->mrt = table;
                return 0;
        }
        return -EAGAIN;
}

/* fib_rules "match" callback: IPMR has no selectors of its own, so every
 * rule matches (always returns 1).
 */
static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
        return 1;
}

/* fib_rules "configure" callback: there is nothing IPMR-specific to parse,
 * so this always succeeds with 0.
 */
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                               struct fib_rule_hdr *frh, struct nlattr **tb,
                               struct netlink_ext_ack *extack)
{
        return 0;
}

/* fib_rules "compare" callback: always returns 1, none of the arguments
 * is inspected.
 */
static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                             struct nlattr **tb)
{
        return 1;
}

/* fib_rules "fill" callback: zero the address-length and tos fields of the
 * rule header and report success.
 */
static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                          struct fib_rule_hdr *frh)
{
        frh->tos = 0;
        frh->src_len = 0;
        frh->dst_len = 0;

        return 0;
}

/* Template of the IPMR policy-rule operations.  ipmr_rules_init() hands it
 * to fib_rules_register() for each network namespace.
 */
static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
        .family                = RTNL_FAMILY_IPMR,
        .rule_size        = sizeof(struct ipmr_rule),
        .addr_size        = sizeof(u32),
        .action                = ipmr_rule_action,
        .match                = ipmr_rule_match,
        .configure        = ipmr_rule_configure,
        .compare        = ipmr_rule_compare,
        .fill                = ipmr_rule_fill,
        .nlgroup        = RTNLGRP_IPV4_RULE,
        .owner                = THIS_MODULE,
};

/* Per-netns setup for the multiple-tables configuration: register the IPMR
 * rule ops, create the RT_TABLE_DEFAULT table and add a default rule
 * (preference 0x7fff) that points at it.
 *
 * Returns 0 on success or a negative errno; on failure whatever was set up
 * so far is undone before returning.
 */
static int __net_init ipmr_rules_init(struct net *net)
{
        struct fib_rules_ops *ops;
        struct mr_table *mrt;
        int err;

        ops = fib_rules_register(&ipmr_rules_ops_template, net);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        /* The list head must be valid before the first table is created */
        INIT_LIST_HEAD(&net->ipv4.mr_tables);

        mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        if (IS_ERR(mrt)) {
                err = PTR_ERR(mrt);
                goto err1;
        }

        err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT);
        if (err < 0)
                goto err2;

        /* Publish the ops only once everything above has succeeded */
        net->ipv4.mr_rules_ops = ops;
        return 0;

err2:
        /* The table is freed with RTNL held */
        rtnl_lock();
        ipmr_free_table(mrt);
        rtnl_unlock();
err1:
        fib_rules_unregister(ops);
        return err;
}

/* Per-netns teardown for the multiple-tables configuration: unlink and free
 * every table, then unregister the rule ops.  RTNL must be held.
 */
static void __net_exit ipmr_rules_exit(struct net *net)
{
        struct mr_table *table, *tmp;

        ASSERT_RTNL();

        list_for_each_entry_safe(table, tmp, &net->ipv4.mr_tables, list) {
                list_del(&table->list);
                ipmr_free_table(table);
        }

        fib_rules_unregister(net->ipv4.mr_rules_ops);
}

/* Dump the IPMR-family rules of @net to notifier @nb; the result of
 * fib_rules_dump() is passed through unchanged.
 */
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
                           struct netlink_ext_ack *extack)
{
        int err = fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);

        return err;
}

/* Return the value fib_rules_seq_read() reports for the IPMR family
 * in @net.
 */
static unsigned int ipmr_rules_seq_read(const struct net *net)
{
        unsigned int seq = fib_rules_seq_read(net, RTNL_FAMILY_IPMR);

        return seq;
}

bool ipmr_rule_default(const struct fib_rule *rule)
{
        return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
}
EXPORT_SYMBOL(ipmr_rule_default);
#else
/* Single-table variant: the loop body runs once for net->ipv4.mrt when that
 * pointer is non-NULL, and not at all otherwise.
 */
#define ipmr_for_each_table(mrt, net) \
        for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

/* Single-table variant: freeing is expected only once check_net() fails
 * for @net.
 */
static bool ipmr_can_free_table(struct net *net)
{
        bool net_ok = check_net(net);

        return !net_ok;
}

/* Single-table variant of the iterator: the only table is returned for a
 * NULL cursor, and the iteration ends right after it.
 */
static struct mr_table *ipmr_mr_table_iter(struct net *net,
                                           struct mr_table *mrt)
{
        return mrt ? NULL : net->ipv4.mrt;
}

/* Single-table variant: @id is ignored and the one table of @net is
 * returned.
 */
static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        struct mr_table *table = net->ipv4.mrt;

        return table;
}

/* No separate lockless flavour is needed with a single table */
#define __ipmr_get_table ipmr_get_table

/* Single-table variant: no rules are consulted, @flp4 is unused and the
 * lookup always succeeds with the one table of @net.
 */
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
                           struct mr_table **mrt)
{
        struct mr_table *table = net->ipv4.mrt;

        *mrt = table;
        return 0;
}

/* Single-table variant of the per-netns setup: create the RT_TABLE_DEFAULT
 * table and remember it in net->ipv4.mrt.  Returns 0 or the error encoded
 * in the pointer returned by ipmr_new_table().
 */
static int __net_init ipmr_rules_init(struct net *net)
{
        struct mr_table *table = ipmr_new_table(net, RT_TABLE_DEFAULT);

        if (IS_ERR(table))
                return PTR_ERR(table);

        net->ipv4.mrt = table;
        return 0;
}

/* Single-table variant of the per-netns teardown: free the table and clear
 * the pointer to it.  RTNL must be held.
 */
static void __net_exit ipmr_rules_exit(struct net *net)
{
        struct mr_table *table;

        ASSERT_RTNL();

        table = net->ipv4.mrt;
        ipmr_free_table(table);
        net->ipv4.mrt = NULL;
}

/* Single-table variant: there are no rules to dump, so this is a no-op
 * that reports success.
 */
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
                           struct netlink_ext_ack *extack)
{
        return 0;
}

/* Single-table variant: without rules the sequence value is always 0. */
static unsigned int ipmr_rules_seq_read(const struct net *net)
{
        return 0;
}

/* Single-table variant: every rule is reported as the default one; @rule
 * is not inspected.
 */
bool ipmr_rule_default(const struct fib_rule *rule)
{
        return true;
}
EXPORT_SYMBOL(ipmr_rule_default);
#endif

/* rhashtable object comparison: returns 0 when the (group, origin) pair of
 * the lookup key equals that of the cache entry @ptr, and 1 otherwise.
 */
static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
                                const void *ptr)
{
        const struct mfc_cache_cmp_arg *key = arg->key;
        const struct mfc_cache *entry = ptr;

        if (key->mfc_mcastgrp != entry->mfc_mcastgrp)
                return 1;

        return key->mfc_origin != entry->mfc_origin;
}

/* Hash table parameters for the MFC cache: entries are linked through
 * mr_mfc::mnode, keyed by the mfc_cache_cmp_arg stored in mfc_cache::cmparg
 * and compared with ipmr_hash_cmp().
 */
static const struct rhashtable_params ipmr_rht_params = {
        .head_offset = offsetof(struct mr_mfc, mnode),
        .key_offset = offsetof(struct mfc_cache, cmparg),
        .key_len = sizeof(struct mfc_cache_cmp_arg),
        .nelem_hint = 3,
        .obj_cmpfn = ipmr_hash_cmp,
        .automatic_shrinking = true,
};

/* Callback handed to mr_table_alloc() by ipmr_new_table(): with multiple
 * tables enabled, append @mrt to the table list of @net; otherwise it does
 * nothing.
 */
static void ipmr_new_table_set(struct mr_table *mrt,
                               struct net *net)
{
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
        list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
}

/* Comparison key with both group and origin set to INADDR_ANY; referenced
 * as .cmparg_any by ipmr_mr_table_ops.
 */
static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
        .mfc_mcastgrp = htonl(INADDR_ANY),
        .mfc_origin = htonl(INADDR_ANY),
};

/* IPv4 table operations passed to mr_table_alloc(): the hash parameters and
 * the wildcard comparison key.
 */
static struct mr_table_ops ipmr_mr_table_ops = {
        .rht_params = &ipmr_rht_params,
        .cmparg_any = &ipmr_mr_table_ops_cmparg_any,
};

/* Return the table with id @id of @net, allocating it when it does not
 * exist yet.  Ids of 1000000000 and above are rejected with
 * ERR_PTR(-EINVAL) (RT_TABLE_DEFAULT is exempt from that check).
 */
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
        struct mr_table *existing;

        /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
        if (id >= 1000000000 && id != RT_TABLE_DEFAULT)
                return ERR_PTR(-EINVAL);

        existing = __ipmr_get_table(net, id);
        if (existing)
                return existing;

        return mr_table_alloc(net, id, &ipmr_mr_table_ops,
                              ipmr_expire_process, ipmr_new_table_set);
}

/* Tear down and free @mrt.
 *
 * Warns once if ipmr_can_free_table() says the owning netns is not in a
 * state where freeing is expected.  The expire timer is shut down first,
 * then all VIFs and MFC entries (static ones included) are flushed, the
 * MFC hash table is destroyed and finally the table memory is released.
 */
static void ipmr_free_table(struct mr_table *mrt)
{
        struct net *net = read_pnet(&mrt->net);

        WARN_ON_ONCE(!ipmr_can_free_table(net));

        timer_shutdown_sync(&mrt->ipmr_expire_timer);
        mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
                                 MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC);
        rhltable_destroy(&mrt->mfc_hash);
        kfree(mrt);
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

/* Initialize ipmr pimreg/tunnel in_device */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
        struct in_device *in_dev;

        ASSERT_RTNL();

        in_dev = __in_dev_get_rtnl(dev);
        if (!in_dev)
                return false;
        ipv4_devconf_setall(in_dev);
        neigh_parms_data_state_setall(in_dev->arp_parms);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

        return true;
}

/* Create and bring up the "dvmrp<vifi>" IPIP tunnel device for a DVMRP VIF.
 *
 * The tunnel is created through the ndo_tunnel_ctl(SIOCADDTUNNEL) operation
 * of the "tunl0" device of @net, using the local/remote addresses from @v.
 *
 * Returns the new device with a reference taken by dev_hold(), or an
 * ERR_PTR(): -ENOBUFS for every setup failure up to and including
 * dev_open(), or the error reported by dev_set_allmulti().
 */
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
        struct net_device *tunnel_dev, *new_dev;
        struct ip_tunnel_parm_kern p = { };
        int err;

        tunnel_dev = __dev_get_by_name(net, "tunl0");
        if (!tunnel_dev)
                goto out;

        p.iph.daddr = v->vifc_rmt_addr.s_addr;
        p.iph.saddr = v->vifc_lcl_addr.s_addr;
        p.iph.version = 4;
        p.iph.ihl = 5;
        p.iph.protocol = IPPROTO_IPIP;
        /* Bounded formatting: never write past the end of p.name */
        snprintf(p.name, sizeof(p.name), "dvmrp%d", v->vifc_vifi);

        if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl)
                goto out;
        err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
                        SIOCADDTUNNEL);
        if (err)
                goto out;

        /* Look the freshly created device up by the name we asked for */
        new_dev = __dev_get_by_name(net, p.name);
        if (!new_dev)
                goto out;

        new_dev->flags |= IFF_MULTICAST;
        if (!ipmr_init_vif_indev(new_dev))
                goto out_unregister;
        if (dev_open(new_dev, NULL))
                goto out_unregister;
        dev_hold(new_dev);
        err = dev_set_allmulti(new_dev, 1);
        if (err) {
                /* Undo open, tunnel creation and the reference, in that order */
                dev_close(new_dev);
                tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
                                SIOCDELTUNNEL);
                dev_put(new_dev);
                new_dev = ERR_PTR(err);
        }
        return new_dev;

out_unregister:
        unregister_netdevice(new_dev);
out:
        return ERR_PTR(-ENOBUFS);
}

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
/* ndo_start_xmit of the PIM register device.
 *
 * Looks up the multicast routing table for the packet and reports the whole
 * packet to the mroute socket as IGMPMSG_WHOLEPKT.  The skb is always freed
 * here.  Returns NETDEV_TX_OK, or the negative error from ipmr_fib_lookup()
 * when no table could be resolved.
 */
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        struct flowi4 fl4 = {
                .flowi4_oif        = dev->ifindex,
                .flowi4_iif        = skb->skb_iif ? : LOOPBACK_IFINDEX,
                .flowi4_mark        = skb->mark,
        };
        int err;

        err = ipmr_fib_lookup(net, &fl4, &mrt);
        if (err < 0) {
                kfree_skb(skb);
                return err;
        }

        /* Account the packet as transmitted on the register device */
        DEV_STATS_ADD(dev, tx_bytes, skb->len);
        DEV_STATS_INC(dev, tx_packets);
        rcu_read_lock();

        /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
        ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
                          IGMPMSG_WHOLEPKT);

        rcu_read_unlock();
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

/* ndo_get_iflink of the PIM register device: always reports 0 as the link
 * ifindex.
 */
static int reg_vif_get_iflink(const struct net_device *dev)
{
        return 0;
}

/* Device operations of the PIM register device, installed by
 * reg_vif_setup().
 */
static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit        = reg_vif_xmit,
        .ndo_get_iflink = reg_vif_get_iflink,
};

/* alloc_netdev() setup callback for the PIM register device. */
static void reg_vif_setup(struct net_device *dev)
{
        dev->netdev_ops = &reg_vif_netdev_ops;
        dev->type = ARPHRD_PIMREG;
        dev->flags = IFF_NOARP;

        /* Ethernet payload minus one IP header and 8 further bytes
         * (presumably the PIM register header -- TODO confirm)
         */
        dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 8;

        dev->netns_immutable = true;
        dev->needs_free_netdev = true;
}

/* Create, register and bring up the PIM register device of @mrt in @net.
 *
 * The device is named "pimreg" for RT_TABLE_DEFAULT and "pimreg<id>" for
 * any other table.  On success the device is returned with a reference
 * taken by dev_hold(); NULL is returned on any failure.
 */
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        struct net_device *dev;
        char name[IFNAMSIZ];

        /* Bounded formatting: the write can never run past name[], even if
         * a table id wider than expected ever reaches this point.
         */
        if (mrt->id == RT_TABLE_DEFAULT)
                snprintf(name, sizeof(name), "pimreg");
        else
                snprintf(name, sizeof(name), "pimreg%u", mrt->id);

        dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);

        if (!dev)
                return NULL;

        dev_net_set(dev, net);

        /* Not registered yet: a plain free_netdev() is the right cleanup */
        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }

        if (!ipmr_init_vif_indev(dev))
                goto failure;
        if (dev_open(dev, NULL))
                goto failure;

        dev_hold(dev);

        return dev;

failure:
        unregister_netdevice(dev);
        return NULL;
}

/* Decapsulate a PIM register packet and re-inject the inner IP packet as if
 * it had been received on the register VIF device of @mrt.
 *
 * @pimlen is the offset from the transport header to the encapsulated IP
 * header.  Returns 1 when the packet is not accepted (the skb is left
 * untouched in that case) and NET_RX_SUCCESS once the inner packet has been
 * handed to netif_rx().
 *
 * called with rcu_read_lock()
 */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
                     unsigned int pimlen)
{
        struct net_device *reg_dev = NULL;
        struct iphdr *encap;
        int vif_num;

        encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
        /* Check that:
         * a. packet is really sent to a multicast group
         * b. packet is not a NULL-REGISTER
         * c. packet is not truncated
         */
        if (!ipv4_is_multicast(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + pimlen > skb->len)
                return 1;

        /* Pairs with WRITE_ONCE() in vif_add()/vif_delete() */
        vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
        if (vif_num >= 0)
                reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
        if (!reg_dev)
                return 1;

        /* Strip the outer headers so that the inner IP header becomes the
         * network header of the skb.
         */
        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8 *)encap - skb->data);
        skb_reset_network_header(skb);
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = CHECKSUM_NONE;

        skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

        netif_rx(skb);

        return NET_RX_SUCCESS;
}
#else
/* Stub used when neither PIM-SM version is configured: a register device
 * can never be created, so NULL is always returned.
 */
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        return NULL;
}
#endif

/* Forward a VIF event to mr_call_vif_notifiers() for the IPMR family,
 * passing the ipmr_seq counter of @net along.  The callee's return value is
 * passed through unchanged.
 */
static int call_ipmr_vif_entry_notifiers(struct net *net,
                                         enum fib_event_type event_type,
                                         struct vif_device *vif,
                                         struct net_device *vif_dev,
                                         vifi_t vif_index, u32 tb_id)
{
        int ret;

        ret = mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
                                    vif, vif_dev, vif_index, tb_id,
                                    &net->ipv4.ipmr_seq);
        return ret;
}

/* Forward an MFC entry event to mr_call_mfc_notifiers() for the IPMR
 * family, passing the generic part of @mfc and the ipmr_seq counter of
 * @net.  The callee's return value is passed through unchanged.
 */
static int call_ipmr_mfc_entry_notifiers(struct net *net,
                                         enum fib_event_type event_type,
                                         struct mfc_cache *mfc, u32 tb_id)
{
        int ret;

        ret = mr_call_mfc_notifiers(net, RTNL_FAMILY_IPMR, event_type,
                                    &mfc->_c, tb_id, &net->ipv4.ipmr_seq);
        return ret;
}

/**
 *        vif_delete - Delete a VIF entry
 *        @mrt: Table to delete from
 *        @vifi: VIF identifier to delete
 *        @notify: Set to 1, if the caller is a notifier_call
 *        @head: if unregistering the VIF, place it on this queue
 *
 *        Uses rtnl_dereference() and __in_dev_get_rtnl(), so the caller is
 *        expected to hold RTNL.
 *
 *        Return: 0 on success, -EADDRNOTAVAIL if @vifi is out of range or
 *        the slot has no device attached.
 */
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
                      struct list_head *head)
{
        struct net *net = read_pnet(&mrt->net);
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= mrt->maxvif)
                return -EADDRNOTAVAIL;

        v = &mrt->vif_table[vifi];

        dev = rtnl_dereference(v->dev);
        if (!dev)
                return -EADDRNOTAVAIL;

        /* Notify and detach the device from the slot under mrt_lock */
        spin_lock(&mrt_lock);
        call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
                                      vifi, mrt->id);
        RCU_INIT_POINTER(v->dev, NULL);

        if (vifi == mrt->mroute_reg_vif_num) {
                /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
                WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
        }
        /* Deleting the highest VIF: shrink maxvif down to just above the
         * highest slot that still exists.
         */
        if (vifi + 1 == mrt->maxvif) {
                int tmp;

                for (tmp = vifi - 1; tmp >= 0; tmp--) {
                        if (VIF_EXISTS(mrt, tmp))
                                break;
                }
                WRITE_ONCE(mrt->maxvif, tmp + 1);
        }

        spin_unlock(&mrt_lock);

        /* Drop the allmulti count taken when the VIF was added */
        dev_set_allmulti(dev, -1);

        in_dev = __in_dev_get_rtnl(dev);
        if (in_dev) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                            NETCONFA_MC_FORWARDING,
                                            dev->ifindex, &in_dev->cnf);
                ip_rt_multicast_event(in_dev);
        }

        /* Tunnel and register devices are unregistered here, unless this
         * call comes from a notifier (@notify set).
         */
        if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
                unregister_netdevice_queue(dev, head);

        netdev_put(dev, &v->dev_tracker);
        return 0;
}

static void ipmr_cache_free_rcu(struct rcu_head *head)
{
        struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);

        kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
}

/* Free cache entry @c through call_rcu(); the actual release happens in
 * ipmr_cache_free_rcu().
 */
static void ipmr_cache_free(struct mfc_cache *c)
{
        struct rcu_head *rcu = &c->_c.rcu;

        call_rcu(rcu, ipmr_cache_free_rcu);
}

/* Destroy an unresolved cache entry, killing queued skbs
 * and reporting error to netlink readers.
 *
 * Decrements the table's cache_resolve_queue_len, drains the entry's
 * unresolved queue and finally frees the entry via ipmr_cache_free().
 */
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&mrt->cache_resolve_queue_len);

        while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
                /* An IP version of 0 marks a queued netlink request rather
                 * than a real packet: turn it into an NLMSG_ERROR reply
                 * carrying -ETIMEDOUT and send it back to the requester.
                 */
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
                                                        sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = nlmsg_data(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
                } else {
                        kfree_skb(skb);
                }
        }

        ipmr_cache_free(c);
}

/* Timer process for the unresolved queue.
 *
 * Destroys every unresolved entry whose expiry time has passed and re-arms
 * the timer for the earliest remaining expiry (at most 10 seconds ahead)
 * while entries are left on the queue.
 */
static void ipmr_expire_process(struct timer_list *t)
{
        struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
        struct mr_mfc *c, *next;
        unsigned long expires;
        unsigned long now;

        /* Lock is busy: do not spin here, retry in HZ/10 jiffies */
        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (list_empty(&mrt->mfc_unres_queue))
                goto out;

        now = jiffies;
        expires = 10*HZ;

        list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                /* Not expired yet: only track the nearest deadline */
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        continue;
                }

                list_del(&c->list);
                mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE);
                ipmr_destroy_unres(mrt, (struct mfc_cache *)c);
        }

        if (!list_empty(&mrt->mfc_unres_queue))
                mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Rebuild the per-VIF TTL thresholds of @cache from @ttls and recompute the
 * [minvif, maxvif) range of VIFs that carry a usable threshold (a value
 * strictly between 0 and 255 on an existing VIF).  Also refreshes the
 * entry's lastuse stamp.  It is called under locked mrt_lock.
 */
static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
                                   unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                unsigned char ttl;

                if (!VIF_EXISTS(mrt, vifi))
                        continue;

                ttl = ttls[vifi];
                if (ttl == 0 || ttl >= 255)
                        continue;

                cache->mfc_un.res.ttls[vifi] = ttl;
                if (vifi < cache->mfc_un.res.minvif)
                        cache->mfc_un.res.minvif = vifi;
                if (vifi >= cache->mfc_un.res.maxvif)
                        cache->mfc_un.res.maxvif = vifi + 1;
        }

        WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}

/* Install the virtual interface described by @vifc in table @mrt.
 *
 * Depending on vifc->vifc_flags the VIF is backed by a new PIM register
 * device, a new DVMRP tunnel device, or an existing device found by
 * ifindex or by local address.  When @mrtsock is zero the VIF is
 * additionally flagged VIFF_STATIC.
 *
 * Uses __in_dev_get_rtnl(), so the caller is expected to hold RTNL.
 *
 * Returns 0 on success, or -EADDRINUSE (slot busy / register VIF already
 * present), -EINVAL (unsupported flags or PIM-SM disabled), -ENOBUFS,
 * -EADDRNOTAVAIL, or the error from dev_set_allmulti()/ipmr_new_tunnel().
 */
static int vif_add(struct net *net, struct mr_table *mrt,
                   struct vifctl *vifc, int mrtsock)
{
        struct netdev_phys_item_id ppid = { };
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &mrt->vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;
        int err;

        /* Is vif busy ? */
        if (VIF_EXISTS(mrt, vifi))
                return -EADDRINUSE;

        /* Every successful case below leaves the switch holding a
         * reference on dev and with allmulti raised on it.
         * NOTE(review): for VIFF_TUNNEL both are done inside
         * ipmr_new_tunnel().
         */
        switch (vifc->vifc_flags) {
        case VIFF_REGISTER:
                if (!ipmr_pimsm_enabled())
                        return -EINVAL;
                /* Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (mrt->mroute_reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif(net, mrt);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        unregister_netdevice(dev);
                        dev_put(dev);
                        return err;
                }
                break;
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(net, vifc);
                if (IS_ERR(dev))
                        return PTR_ERR(dev);
                break;
        case VIFF_USE_IFINDEX:
        case 0:
                if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
                        dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
                        if (dev && !__in_dev_get_rtnl(dev)) {
                                dev_put(dev);
                                return -EADDRNOTAVAIL;
                        }
                } else {
                        dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
                }
                if (!dev)
                        return -EADDRNOTAVAIL;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        dev_put(dev);
                        return err;
                }
                break;
        default:
                return -EINVAL;
        }

        in_dev = __in_dev_get_rtnl(dev);
        if (!in_dev) {
                dev_put(dev);
                return -EADDRNOTAVAIL;
        }
        /* Count this VIF in the device's MC_FORWARDING setting and let
         * netconf listeners and the routing code know.
         */
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
                                    dev->ifindex, &in_dev->cnf);
        ip_rt_multicast_event(in_dev);

        /* Fill in the VIF structures */
        vif_device_init(v, dev, vifc->vifc_rate_limit,
                        vifc->vifc_threshold,
                        vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
                        (VIFF_TUNNEL | VIFF_REGISTER));

        /* Record the port parent id when the device reports one */
        err = dev_get_port_parent_id(dev, &ppid, true);
        if (err == 0) {
                memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
                v->dev_parent_id.id_len = ppid.id_len;
        } else {
                v->dev_parent_id.id_len = 0;
        }

        v->local = vifc->vifc_lcl_addr.s_addr;
        v->remote = vifc->vifc_rmt_addr.s_addr;

        /* And finish update writing critical data */
        spin_lock(&mrt_lock);
        rcu_assign_pointer(v->dev, dev);
        netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
        if (v->flags & VIFF_REGISTER) {
                /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
                WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
        }
        if (vifi+1 > mrt->maxvif)
                WRITE_ONCE(mrt->maxvif, vifi + 1);
        spin_unlock(&mrt_lock);
        call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
                                      vifi, mrt->id);
        return 0;
}

/* Look up the (S,G) entry for @origin/@mcastgrp in @mrt via mr_mfc_find().
 *
 * called with rcu_read_lock()
 */
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
                                         __be32 origin,
                                         __be32 mcastgrp)
{
        struct mfc_cache_cmp_arg key = {
                .mfc_mcastgrp = mcastgrp,
                .mfc_origin = origin,
        };

        return mr_mfc_find(mrt, &key);
}

/* Look for a (*,G) entry for @mcastgrp and @vifi.  When @mcastgrp itself is
 * INADDR_ANY the lookup is delegated to mr_mfc_find_any_parent().
 */
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
                                             __be32 mcastgrp, int vifi)
{
        struct mfc_cache_cmp_arg key = {
                .mfc_mcastgrp = mcastgrp,
                .mfc_origin = htonl(INADDR_ANY),
        };

        if (mcastgrp != htonl(INADDR_ANY))
                return mr_mfc_find_any(mrt, vifi, &key);

        return mr_mfc_find_any_parent(mrt, vifi);
}

/* Look for a (S,G,iif) entry if parent != -1; the lookup itself is done by
 * mr_mfc_find_parent().
 */
static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
                                                __be32 origin, __be32 mcastgrp,
                                                int parent)
{
        struct mfc_cache_cmp_arg key = {
                .mfc_mcastgrp = mcastgrp,
                .mfc_origin = origin,
        };

        return mr_mfc_find_parent(mrt, &key, parent);
}

/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);

        if (c) {
                c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
                c->_c.mfc_un.res.minvif = MAXVIFS;
                c->_c.free = ipmr_cache_free_rcu;
                refcount_set(&c->_c.mfc_un.res.refcount, 1);
        }
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);

        if (c) {
                skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
                c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
        }
        return c;
}

/* A cache entry has gone into a resolved state from queued.
 *
 * Drains the skbs queued on the unresolved entry @uc, handling each one
 * with the resolved entry @c: queued netlink requests are answered, real
 * packets are forwarded.
 */
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
                               struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /* Play the pending entries through our router */
        while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
                /* An IP version of 0 marks a queued netlink request */
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = skb_pull(skb,
                                                        sizeof(struct iphdr));

                        if (mr_fill_mroute(mrt, skb, &c->_c,
                                           nlmsg_data(nlh)) > 0) {
                                nlh->nlmsg_len = skb_tail_pointer(skb) -
                                                 (u8 *)nlh;
                        } else {
                                /* Route did not fit: reply with -EMSGSIZE */
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = nlmsg_data(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
                } else {
                        rcu_read_lock();
                        ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
                        rcu_read_unlock();
                }
        }
}

/* Bounce a cache query up to mrouted and netlink.
 *
 * Builds an igmpmsg for @pkt of type @assert and queues it on the mroute
 * socket of @mrt.  For IGMPMSG_WHOLEPKT and IGMPMSG_WRVIFWHOLE the whole
 * packet is copied behind a duplicated IP header; for every other type only
 * the IP header plus an igmphdr is sent.  @pkt itself is not consumed.
 *
 * Returns -EINVAL when no mroute socket is attached, -ENOBUFS when the
 * report skb cannot be allocated, otherwise the result of
 * sock_queue_rcv_skb().
 *
 * Called under rcu_read_lock().
 */
static int ipmr_cache_report(const struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert)
{
        const int ihl = ip_hdrlen(pkt);
        struct sock *mroute_sk;
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        struct sk_buff *skb;
        int ret;

        mroute_sk = rcu_dereference(mrt->mroute_sk);
        if (!mroute_sk)
                return -EINVAL;

        if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

        if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
                /* Ugly, but we have no choice with this interface.
                 * Duplicate old header, fix ihl, length etc.
                 * And all this only to mangle msg->im_msgtype and
                 * to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = assert;
                msg->im_mbz = 0;
                if (assert == IGMPMSG_WRVIFWHOLE) {
                        msg->im_vif = vifi;
                        msg->im_vif_hi = vifi >> 8;
                } else {
                        /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
                        int vif_num = READ_ONCE(mrt->mroute_reg_vif_num);

                        msg->im_vif = vif_num;
                        msg->im_vif_hi = vif_num >> 8;
                }
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else {
                /* Copy the IP header */
                skb_set_network_header(skb, skb->len);
                skb_put(skb, ihl);
                skb_copy_to_linear_data(skb, pkt->data, ihl);
                /* Flag to the kernel this is a route add */
                ip_hdr(skb)->protocol = 0;
                msg = (struct igmpmsg *)skb_network_header(skb);
                msg->im_vif = vifi;
                msg->im_vif_hi = vifi >> 8;
                ipv4_pktinfo_prepare(mroute_sk, pkt, false);
                memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
                /* Add our header */
                igmp = skb_put(skb, sizeof(struct igmphdr));
                igmp->type = assert;
                msg->im_msgtype = assert;
                igmp->code = 0;
                ip_hdr(skb)->tot_len = htons(skb->len);        /* Fix the length */
                skb->transport_header = skb->network_header;
        }

        igmpmsg_netlink_event(mrt, skb);

        /* Deliver to mrouted */
        ret = sock_queue_rcv_skb(mroute_sk, skb);

        /* Queueing failed: the report skb is still ours to free */
        if (ret < 0) {
                net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
                kfree_skb(skb);
        }

        return ret;
}

/* Queue a packet for resolution. It gets locked cache entry!
 *
 * Finds or creates the unresolved entry for the packet's (source, group)
 * pair and appends @skb to its queue.  A newly created entry is first
 * reported to the daemon as IGMPMSG_NOCACHE.  @skb is consumed on every
 * path: it is either queued or freed.
 *
 * Returns 0 when the packet was queued, -ENOBUFS when no entry could be
 * allocated or the entry's queue is full, or the error returned by
 * ipmr_cache_report().
 *
 * Called under rcu_read_lock()
 */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
                                 struct sk_buff *skb, struct net_device *dev)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct mfc_cache *c;
        bool found = false;
        int err;

        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                /* Create a new entry if allowable */
                c = ipmr_cache_alloc_unres();
                if (!c) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /* Fill in the new cache entry */
                c->_c.mfc_parent = -1;
                c->mfc_origin        = iph->saddr;
                c->mfc_mcastgrp        = iph->daddr;

                /* Reflect first query at mrouted. */
                err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);

                if (err < 0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_cache_free(c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&mrt->cache_resolve_queue_len);
                list_add(&c->_c.list, &mrt->mfc_unres_queue);
                mroute_netlink_event(mrt, c, RTM_NEWROUTE);

                /* First unresolved entry of the table: start the expiry timer */
                if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
                        mod_timer(&mrt->ipmr_expire_timer,
                                  c->_c.mfc_un.unres.expires);
        }

        /* See if we can append the packet */
        if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                if (dev) {
                        skb->dev = dev;
                        skb->skb_iif = dev->ifindex;
                }
                skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/* MFC cache manipulation by user space mroute daemon */

/* Delete the MFC entry that ipmr_cache_find_parent() returns for the
 * origin/group in @mfc and the given @parent.
 *
 * Returns 0 once the entry has been unlinked and released, or -ENOENT when
 * the lookup finds nothing.
 */
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
        struct net *net = read_pnet(&mrt->net);
        struct mfc_cache *entry;

        /* The entries are added/deleted only under RTNL */
        rcu_read_lock();
        entry = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
                                       mfc->mfcc_mcastgrp.s_addr, parent);
        rcu_read_unlock();
        if (!entry)
                return -ENOENT;

        /* Unlink from the hash table and the cache list first ... */
        rhltable_remove(&mrt->mfc_hash, &entry->_c.mnode, ipmr_rht_params);
        list_del_rcu(&entry->_c.list);

        /* ... then announce the removal and drop the reference. */
        call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, entry, mrt->id);
        mroute_netlink_event(mrt, entry, RTM_DELROUTE);
        mr_cache_put(&entry->_c);

        return 0;
}

/* Add the MFC entry described by @mfc to @mrt, or update it in place when
 * ipmr_cache_find_parent() already finds one for (origin, group, @parent).
 *
 * When @mrtsock is zero the entry is flagged MFC_STATIC.
 *
 * Returns 0 on success, -ENFILE when mfcc_parent is not below MAXVIFS,
 * -EINVAL when the group is neither INADDR_ANY nor a multicast address,
 * -ENOMEM when the entry cannot be allocated, or the error returned by
 * rhltable_insert_key().
 */
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
                        struct mfcctl *mfc, int mrtsock, int parent)
{
        struct mfc_cache *uc, *c;
        struct mr_mfc *_uc;
        bool found;
        int ret;

        if (mfc->mfcc_parent >= MAXVIFS)
                return -ENFILE;

        /* The entries are added/deleted only under RTNL */
        rcu_read_lock();
        c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
                                   mfc->mfcc_mcastgrp.s_addr, parent);
        rcu_read_unlock();
        if (c) {
                /* Existing entry: rewrite parent, thresholds and flags under
                 * mrt_lock, then report it as a replace.
                 */
                spin_lock(&mrt_lock);
                c->_c.mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->_c.mfc_flags |= MFC_STATIC;
                spin_unlock(&mrt_lock);
                call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
                                              mrt->id);
                mroute_netlink_event(mrt, c, RTM_NEWROUTE);
                return 0;
        }

        /* A new entry must name a multicast group or the INADDR_ANY wildcard */
        if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
            !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c = ipmr_cache_alloc();
        if (!c)
                return -ENOMEM;

        c->mfc_origin = mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
        c->_c.mfc_parent = mfc->mfcc_parent;
        ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->_c.mfc_flags |= MFC_STATIC;

        ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
                                  ipmr_rht_params);
        if (ret) {
                pr_err("ipmr: rhtable insert error %d\n", ret);
                ipmr_cache_free(c);
                return ret;
        }
        list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
        /* Check to see if we resolved a queued list. If so we
         * need to send on the frames and tidy up.
         */
        found = false;
        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
                uc = (struct mfc_cache *)_uc;
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        /* Take the matching unresolved entry off the queue;
                         * uc stays valid for use after the lock is dropped.
                         */
                        list_del(&_uc->list);
                        atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
        }
        /* Stop the expire timer once the unresolved queue is empty */
        if (list_empty(&mrt->mfc_unres_queue))
                timer_delete(&mrt->ipmr_expire_timer);
        spin_unlock_bh(&mfc_unres_lock);

        /* Outside mfc_unres_lock: pass the dequeued entry to
         * ipmr_cache_resolve() together with the new entry, then free it.
         */
        if (found) {
                ipmr_cache_resolve(net, mrt, uc, c);
                ipmr_cache_free(uc);
        }
        call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
        mroute_netlink_event(mrt, c, RTM_NEWROUTE);
        return 0;
}

/* Close the multicast socket, and clear the vif tables etc.
 *
 * @flags is a mask of MRT_FLUSH_* bits: MRT_FLUSH_VIFS / MRT_FLUSH_VIFS_STATIC
 * select non-static / static vifs, MRT_FLUSH_MFC / MRT_FLUSH_MFC_STATIC
 * select non-static / static cache entries. The unresolved queue is emptied
 * only when MRT_FLUSH_MFC is set.
 */
static void mroute_clean_tables(struct mr_table *mrt, int flags)
{
        struct net *net = read_pnet(&mrt->net);
        struct mr_mfc *entry, *next;
        struct mfc_cache *cache;
        LIST_HEAD(list);
        int vifi;

        /* Shut down all active vif entries */
        if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) {
                for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                        int selected;

                        /* Static and non-static vifs have their own bit */
                        if (mrt->vif_table[vifi].flags & VIFF_STATIC)
                                selected = flags & MRT_FLUSH_VIFS_STATIC;
                        else
                                selected = flags & MRT_FLUSH_VIFS;
                        if (!selected)
                                continue;

                        vif_delete(mrt, vifi, 0, &list);
                }
                unregister_netdevice_many(&list);
        }

        /* Wipe the cache */
        if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) {
                list_for_each_entry_safe(entry, next, &mrt->mfc_cache_list, list) {
                        int selected;

                        /* Same split for static and non-static entries */
                        if (entry->mfc_flags & MFC_STATIC)
                                selected = flags & MRT_FLUSH_MFC_STATIC;
                        else
                                selected = flags & MRT_FLUSH_MFC;
                        if (!selected)
                                continue;

                        rhltable_remove(&mrt->mfc_hash, &entry->mnode,
                                        ipmr_rht_params);
                        list_del_rcu(&entry->list);
                        cache = (struct mfc_cache *)entry;
                        call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
                                                      cache, mrt->id);
                        mroute_netlink_event(mrt, cache, RTM_DELROUTE);
                        mr_cache_put(entry);
                }
        }

        /* Unresolved entries are only dropped with MRT_FLUSH_MFC */
        if (!(flags & MRT_FLUSH_MFC))
                return;
        if (atomic_read(&mrt->cache_resolve_queue_len) == 0)
                return;

        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry_safe(entry, next, &mrt->mfc_unres_queue, list) {
                list_del(&entry->list);
                cache = (struct mfc_cache *)entry;
                mroute_netlink_event(mrt, cache, RTM_DELROUTE);
                ipmr_destroy_unres(mrt, cache);
        }
        spin_unlock_bh(&mfc_unres_lock);
}

/* Called from ip_ra_control(), before an RCU grace period, so there is no
 * need to call synchronize_rcu() here.
 *
 * For every table whose mroute socket is @sk: drop the MC_FORWARDING count,
 * send the netconf notification, clear the socket pointer and flush the
 * non-static vifs and MFC entries.
 */
static void mrtsock_destruct(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        rtnl_lock();
        ipmr_for_each_table(mrt, net) {
                /* Tables owned by some other socket are left alone */
                if (rtnl_dereference(mrt->mroute_sk) != sk)
                        continue;

                IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
                inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                            NETCONFA_MC_FORWARDING,
                                            NETCONFA_IFINDEX_ALL,
                                            net->ipv4.devconf_all);
                RCU_INIT_POINTER(mrt->mroute_sk, NULL);
                mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC);
        }
        rtnl_unlock();
}

/* Socket options and virtual interface manipulation. The whole
 * virtual interface system is a complete heap, but unfortunately
 * that's how BSD mrouted happens to think. Maybe one day with a proper
 * MOSPF/PIM router set up we can clean this up.
 */

/* Handle the MRT_* setsockopt() options for socket @sk.
 *
 * Only SOCK_RAW sockets whose inet_num is IPPROTO_IGMP are accepted
 * (-EOPNOTSUPP otherwise). For every option except MRT_INIT the caller must
 * be the table's mroute socket or hold CAP_NET_ADMIN in the netns' user
 * namespace (-EACCES otherwise).
 *
 * RTNL is taken here and released on every path; MRT_DONE releases it early,
 * before calling ip_ra_control().
 *
 * Returns 0 on success or a negative errno; unknown options give
 * -ENOPROTOOPT.
 */
int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
                         unsigned int optlen)
{
        struct net *net = sock_net(sk);
        int val, ret = 0, parent = 0;
        struct mr_table *mrt;
        struct vifctl vif;
        struct mfcctl mfc;
        bool do_wrvifwhole;
        u32 uval;

        /* There's one exception to the lock - MRT_DONE which needs to unlock */
        rtnl_lock();
        if (sk->sk_type != SOCK_RAW ||
            inet_sk(sk)->inet_num != IPPROTO_IGMP) {
                ret = -EOPNOTSUPP;
                goto out_unlock;
        }

        /* Table selected earlier via MRT_TABLE, else the default table */
        mrt = __ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (!mrt) {
                ret = -ENOENT;
                goto out_unlock;
        }
        if (optname != MRT_INIT) {
                if (sk != rcu_access_pointer(mrt->mroute_sk) &&
                    !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
                        ret = -EACCES;
                        goto out_unlock;
                }
        }

        switch (optname) {
        case MRT_INIT:
                if (optlen != sizeof(int)) {
                        ret = -EINVAL;
                        break;
                }
                /* Only one mroute socket per table */
                if (rtnl_dereference(mrt->mroute_sk)) {
                        ret = -EADDRINUSE;
                        break;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        rcu_assign_pointer(mrt->mroute_sk, sk);
                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
                        inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                    NETCONFA_MC_FORWARDING,
                                                    NETCONFA_IFINDEX_ALL,
                                                    net->ipv4.devconf_all);
                }
                break;
        case MRT_DONE:
                if (sk != rcu_access_pointer(mrt->mroute_sk)) {
                        ret = -EACCES;
                } else {
                        /* We need to unlock here because mrtsock_destruct takes
                         * care of rtnl itself and we can't change that due to
                         * the IP_ROUTER_ALERT setsockopt which runs without it.
                         */
                        rtnl_unlock();
                        ret = ip_ra_control(sk, 0, NULL);
                        goto out;
                }
                break;
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen != sizeof(vif)) {
                        ret = -EINVAL;
                        break;
                }
                if (copy_from_sockptr(&vif, optval, sizeof(vif))) {
                        ret = -EFAULT;
                        break;
                }
                if (vif.vifc_vifi >= MAXVIFS) {
                        ret = -ENFILE;
                        break;
                }
                if (optname == MRT_ADD_VIF) {
                        ret = vif_add(net, mrt, &vif,
                                      sk == rtnl_dereference(mrt->mroute_sk));
                } else {
                        ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
                }
                break;
        /* Manipulate the forwarding caches. These live
         * in a sort of kernel/user symbiosis.
         */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                /* Non-proxy variants always use parent -1; the proxy
                 * variants arrive below with parent still 0 and take it
                 * from mfc.mfcc_parent.
                 */
                parent = -1;
                fallthrough;
        case MRT_ADD_MFC_PROXY:
        case MRT_DEL_MFC_PROXY:
                if (optlen != sizeof(mfc)) {
                        ret = -EINVAL;
                        break;
                }
                if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) {
                        ret = -EFAULT;
                        break;
                }
                if (parent == 0)
                        parent = mfc.mfcc_parent;
                if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
                        ret = ipmr_mfc_delete(mrt, &mfc, parent);
                else
                        ret = ipmr_mfc_add(net, mrt, &mfc,
                                           sk == rtnl_dereference(mrt->mroute_sk),
                                           parent);
                break;
        case MRT_FLUSH:
                if (optlen != sizeof(val)) {
                        ret = -EINVAL;
                        break;
                }
                if (copy_from_sockptr(&val, optval, sizeof(val))) {
                        ret = -EFAULT;
                        break;
                }
                /* val is passed through as the MRT_FLUSH_* flag mask */
                mroute_clean_tables(mrt, val);
                break;
        /* Control PIM assert. */
        case MRT_ASSERT:
                if (optlen != sizeof(val)) {
                        ret = -EINVAL;
                        break;
                }
                if (copy_from_sockptr(&val, optval, sizeof(val))) {
                        ret = -EFAULT;
                        break;
                }
                mrt->mroute_do_assert = val;
                break;
        case MRT_PIM:
                if (!ipmr_pimsm_enabled()) {
                        ret = -ENOPROTOOPT;
                        break;
                }
                if (optlen != sizeof(val)) {
                        ret = -EINVAL;
                        break;
                }
                if (copy_from_sockptr(&val, optval, sizeof(val))) {
                        ret = -EFAULT;
                        break;
                }

                /* Remember whether IGMPMSG_WRVIFWHOLE was requested before
                 * val is collapsed to 0/1. The table is only written when
                 * the resulting on/off value differs from mroute_do_pim.
                 */
                do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
                val = !!val;
                if (val != mrt->mroute_do_pim) {
                        mrt->mroute_do_pim = val;
                        mrt->mroute_do_assert = val;
                        mrt->mroute_do_wrvifwhole = do_wrvifwhole;
                }
                break;
        case MRT_TABLE:
                if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
                        ret = -ENOPROTOOPT;
                        break;
                }
                if (optlen != sizeof(uval)) {
                        ret = -EINVAL;
                        break;
                }
                if (copy_from_sockptr(&uval, optval, sizeof(uval))) {
                        ret = -EFAULT;
                        break;
                }

                /* A socket that is already the mroute socket of its current
                 * table may not switch tables.
                 */
                if (sk == rtnl_dereference(mrt->mroute_sk)) {
                        ret = -EBUSY;
                } else {
                        mrt = ipmr_new_table(net, uval);
                        if (IS_ERR(mrt))
                                ret = PTR_ERR(mrt);
                        else
                                raw_sk(sk)->ipmr_table = uval;
                }
                break;
        /* Spurious command, or MRT_VERSION which you cannot set. */
        default:
                ret = -ENOPROTOOPT;
        }
out_unlock:
        rtnl_unlock();
out:
        return ret;
}

/* Execute @cmd if it is one of the special mroute ioctls.
 *
 * SIOCGETVIFCNT and SIOCGETSGCNT are handed to sock_ioctl_inout() with a
 * request buffer of the matching type and its result is returned. Any other
 * command yields 1.
 */
int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        /* These userspace buffers will be consumed by ipmr_ioctl() */
        if (cmd == SIOCGETVIFCNT) {
                struct sioc_vif_req vif_req;

                return sock_ioctl_inout(sk, cmd, arg, &vif_req,
                                        sizeof(vif_req));
        }
        if (cmd == SIOCGETSGCNT) {
                struct sioc_sg_req sg_req;

                return sock_ioctl_inout(sk, cmd, arg, &sg_req,
                                        sizeof(sg_req));
        }

        /* return code > 0 means that the ioctl was not executed */
        return 1;
}

/* Getsockopt support for the multicast routing system.
 *
 * Handles MRT_VERSION, MRT_PIM and MRT_ASSERT; any other option returns
 * -ENOPROTOOPT. The value is an int; the length read from @optlen is clamped
 * to sizeof(int), written back to @optlen, and that many bytes are copied to
 * @optval.
 *
 * Returns 0 on success, -EOPNOTSUPP for a socket that is not SOCK_RAW with
 * inet_num IPPROTO_IGMP, -ENOENT when the table does not exist, -EINVAL for
 * a negative length and -EFAULT when a copy fails.
 */
int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
                         sockptr_t optlen)
{
        struct net *net = sock_net(sk);
        struct mr_table *mrt;
        int value;
        int len;

        if (sk->sk_type != SOCK_RAW ||
            inet_sk(sk)->inet_num != IPPROTO_IGMP)
                return -EOPNOTSUPP;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (!mrt)
                return -ENOENT;

        /* Pick the value first so unknown options fail before any copy */
        if (optname == MRT_VERSION) {
                value = 0x0305;
        } else if (optname == MRT_PIM) {
                if (!ipmr_pimsm_enabled())
                        return -ENOPROTOOPT;
                value = mrt->mroute_do_pim;
        } else if (optname == MRT_ASSERT) {
                value = mrt->mroute_do_assert;
        } else {
                return -ENOPROTOOPT;
        }

        if (copy_from_sockptr(&len, optlen, sizeof(int)))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Never hand back more than the int we have */
        len = min_t(unsigned int, len, sizeof(int));

        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;
        if (copy_to_sockptr(optval, &value, len))
                return -EFAULT;

        return 0;
}

/* The IP multicast ioctl support routines.
 *
 * @arg points to a struct sioc_vif_req (SIOCGETVIFCNT) or a struct
 * sioc_sg_req (SIOCGETSGCNT) that is filled in place with the counters.
 *
 * Returns 0 on success, -ENOENT when the table does not exist, -EINVAL for a
 * vif index that is not below maxvif, -EADDRNOTAVAIL when the vif or cache
 * entry is not found, and -ENOIOCTLCMD for any other command.
 */
int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{
        struct net *net = sock_net(sk);
        struct sioc_vif_req *vif_req;
        struct sioc_sg_req *sg_req;
        struct vif_device *vif;
        struct mfc_cache *cache;
        struct mr_table *mrt;
        int ret;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (!mrt)
                return -ENOENT;

        switch (cmd) {
        case SIOCGETVIFCNT:
                vif_req = (struct sioc_vif_req *)arg;
                if (vif_req->vifi >= mrt->maxvif)
                        return -EINVAL;
                vif_req->vifi = array_index_nospec(vif_req->vifi, mrt->maxvif);

                ret = -EADDRNOTAVAIL;
                rcu_read_lock();
                vif = &mrt->vif_table[vif_req->vifi];
                if (VIF_EXISTS(mrt, vif_req->vifi)) {
                        vif_req->icount = READ_ONCE(vif->pkt_in);
                        vif_req->ocount = READ_ONCE(vif->pkt_out);
                        vif_req->ibytes = READ_ONCE(vif->bytes_in);
                        vif_req->obytes = READ_ONCE(vif->bytes_out);
                        ret = 0;
                }
                rcu_read_unlock();
                return ret;

        case SIOCGETSGCNT:
                sg_req = (struct sioc_sg_req *)arg;

                ret = -EADDRNOTAVAIL;
                rcu_read_lock();
                cache = ipmr_cache_find(mrt, sg_req->src.s_addr,
                                        sg_req->grp.s_addr);
                if (cache) {
                        sg_req->pktcnt = atomic_long_read(&cache->_c.mfc_un.res.pkt);
                        sg_req->bytecnt = atomic_long_read(&cache->_c.mfc_un.res.bytes);
                        sg_req->wrong_if = atomic_long_read(&cache->_c.mfc_un.res.wrong_if);
                        ret = 0;
                }
                rcu_read_unlock();
                return ret;

        default:
                return -ENOIOCTLCMD;
        }
}

#ifdef CONFIG_COMPAT
/* SIOCGETSGCNT request as exchanged with userspace by ipmr_compat_ioctl():
 * the source/group pair to look up, followed by counters stored as
 * compat_ulong_t.
 */
struct compat_sioc_sg_req {
        struct in_addr src;
        struct in_addr grp;
        compat_ulong_t pktcnt;
        compat_ulong_t bytecnt;
        compat_ulong_t wrong_if;
};

/* SIOCGETVIFCNT request as exchanged with userspace by ipmr_compat_ioctl():
 * the vif index to query, followed by its packet/byte counters stored as
 * compat_ulong_t.
 */
struct compat_sioc_vif_req {
        vifi_t        vifi;                /* Which iface */
        compat_ulong_t icount;
        compat_ulong_t ocount;
        compat_ulong_t ibytes;
        compat_ulong_t obytes;
};

/* Compat variant of the mroute counter ioctls.
 *
 * The request is copied in from @arg, filled with the vif (SIOCGETVIFCNT) or
 * cache entry (SIOCGETSGCNT) counters and copied back out.
 *
 * Returns 0 on success, -ENOENT when the table does not exist, -EFAULT when
 * a user copy fails, -EINVAL for a vif index that is not below maxvif,
 * -EADDRNOTAVAIL when the vif or cache entry is not found, and -ENOIOCTLCMD
 * for any other command.
 */
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        struct net *net = sock_net(sk);
        struct compat_sioc_vif_req vif_req;
        struct compat_sioc_sg_req sg_req;
        struct vif_device *vif;
        struct mfc_cache *cache;
        struct mr_table *mrt;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (!mrt)
                return -ENOENT;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vif_req, arg, sizeof(vif_req)))
                        return -EFAULT;
                if (vif_req.vifi >= mrt->maxvif)
                        return -EINVAL;
                vif_req.vifi = array_index_nospec(vif_req.vifi, mrt->maxvif);

                rcu_read_lock();
                vif = &mrt->vif_table[vif_req.vifi];
                if (!VIF_EXISTS(mrt, vif_req.vifi)) {
                        rcu_read_unlock();
                        return -EADDRNOTAVAIL;
                }
                vif_req.icount = READ_ONCE(vif->pkt_in);
                vif_req.ocount = READ_ONCE(vif->pkt_out);
                vif_req.ibytes = READ_ONCE(vif->bytes_in);
                vif_req.obytes = READ_ONCE(vif->bytes_out);
                rcu_read_unlock();

                /* Copy out only after leaving the RCU read section */
                if (copy_to_user(arg, &vif_req, sizeof(vif_req)))
                        return -EFAULT;
                return 0;

        case SIOCGETSGCNT:
                if (copy_from_user(&sg_req, arg, sizeof(sg_req)))
                        return -EFAULT;

                rcu_read_lock();
                cache = ipmr_cache_find(mrt, sg_req.src.s_addr,
                                        sg_req.grp.s_addr);
                if (!cache) {
                        rcu_read_unlock();
                        return -EADDRNOTAVAIL;
                }
                sg_req.pktcnt = atomic_long_read(&cache->_c.mfc_un.res.pkt);
                sg_req.bytecnt = atomic_long_read(&cache->_c.mfc_un.res.bytes);
                sg_req.wrong_if = atomic_long_read(&cache->_c.mfc_un.res.wrong_if);
                rcu_read_unlock();

                /* Copy out only after leaving the RCU read section */
                if (copy_to_user(arg, &sg_req, sizeof(sg_req)))
                        return -EFAULT;
                return 0;

        default:
                return -ENOIOCTLCMD;
        }
}
#endif

/* Netdevice notifier callback: on NETDEV_UNREGISTER, delete every vif in
 * every table of the device's netns that refers to the device. All other
 * events are ignored. Always returns NOTIFY_DONE.
 */
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        int vifi;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        ipmr_for_each_table(mrt, net) {
                for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                        if (rcu_access_pointer(mrt->vif_table[vifi].dev) != dev)
                                continue;
                        vif_delete(mrt, vifi, 1, NULL);
                }
        }
        return NOTIFY_DONE;
}

/* Notifier block whose callback is ipmr_device_event() */
static struct notifier_block ip_mr_notifier = {
        .notifier_call = ipmr_device_event,
};

/* Encapsulate a packet by attaching a valid IPIP header to it.
 * This avoids tunnel drivers and other mess and gives us the speed so
 * important for multicast video.
 *
 * A new IPv4 header with protocol IPPROTO_IPIP and addresses @saddr/@daddr
 * is pushed in front of the current one; tos and ttl are copied from the
 * current header, which becomes the transport header.
 */
static void ip_encap(struct net *net, struct sk_buff *skb,
                     __be32 saddr, __be32 daddr)
{
        const struct iphdr *inner = ip_hdr(skb);
        struct iphdr *outer;

        /* Make room for the outer header and re-point the header offsets */
        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        outer = ip_hdr(skb);

        outer->version = 4;
        outer->ihl = 5;
        outer->tos = inner->tos;
        outer->tot_len = htons(skb->len);
        outer->frag_off = 0;
        outer->ttl = inner->ttl;
        outer->protocol = IPPROTO_IPIP;
        outer->saddr = saddr;
        outer->daddr = daddr;

        /* Ident and checksum go last, once every other field is in place */
        ip_select_ident(net, skb, NULL);
        ip_send_check(outer);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset_ct(skb);
}

/* Final step after the NF_INET_FORWARD hook: count the forwarded datagram,
 * call ip_forward_options() when the skb carries IP options, and return the
 * result of dst_output().
 */
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
                                      struct sk_buff *skb)
{
        IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(IPCB(skb)->opt.optlen))
                ip_forward_options(skb);

        return dst_output(net, sk, skb);
}

#ifdef CONFIG_NET_SWITCHDEV
/* Returns true when @skb has offload_l3_fwd_mark set and both the input and
 * the output vif carry a non-empty dev_parent_id that compares equal.
 */
static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
                                   int in_vifi, int out_vifi)
{
        struct vif_device *egress = &mrt->vif_table[out_vifi];
        struct vif_device *ingress = &mrt->vif_table[in_vifi];

        if (skb->offload_l3_fwd_mark &&
            egress->dev_parent_id.id_len && ingress->dev_parent_id.id_len)
                return netdev_phys_item_id_same(&egress->dev_parent_id,
                                                &ingress->dev_parent_id);

        return false;
}
#else
/* Without CONFIG_NET_SWITCHDEV nothing is ever reported as offloaded */
static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
                                   int in_vifi, int out_vifi)
{
        return false;
}
#endif

/* Processing handlers for ipmr_forward, under rcu_read_lock() */

/* Send @skb out through vif @vifi of @mrt. @in_vifi is only passed on to
 * ipmr_forward_offloaded().
 *
 * @skb is consumed on every path: it is either handed to the
 * NF_INET_FORWARD hook or freed.
 */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
                            int in_vifi, struct sk_buff *skb, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &mrt->vif_table[vifi];
        struct net_device *vif_dev;
        struct net_device *dev;
        struct rtable *rt;
        struct flowi4 fl4;
        int    encap = 0;

        vif_dev = vif_dev_read(vif);
        if (!vif_dev)
                goto out_free;

        /* Register vif: account the packet, report it with IGMPMSG_WHOLEPKT
         * through ipmr_cache_report() and drop the skb.
         */
        if (vif->flags & VIFF_REGISTER) {
                WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
                WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
                DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
                DEV_STATS_INC(vif_dev, tx_packets);
                ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
                goto out_free;
        }

        /* Drop our copy when this in/out vif pair is reported as offloaded */
        if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
                goto out_free;

        if (vif->flags & VIFF_TUNNEL) {
                /* Tunnel vif: route towards the tunnel endpoint */
                rt = ip_route_output_ports(net, &fl4, NULL,
                                           vif->remote, vif->local,
                                           0, 0,
                                           IPPROTO_IPIP,
                                           iph->tos & INET_DSCP_MASK, vif->link);
                if (IS_ERR(rt))
                        goto out_free;
                /* Room for the outer header that ip_encap() pushes */
                encap = sizeof(struct iphdr);
        } else {
                /* Plain vif: route on the packet's own destination */
                rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
                                           0, 0,
                                           IPPROTO_IPIP,
                                           iph->tos & INET_DSCP_MASK, vif->link);
                if (IS_ERR(rt))
                        goto out_free;
        }

        dev = rt->dst.dev;

        if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                 * allow to send ICMP, so that packets will disappear
                 * to blackhole.
                 */
                IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        /* Total headroom: link-layer reserve plus dst header_len, on top of
         * the outer IP header counted above for tunnels.
         */
        encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
        WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);

        /* Replace the skb's dst with the route just looked up */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->dst);
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR
         */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(net, skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                DEV_STATS_INC(vif_dev, tx_packets);
                DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
         * not only before forwarding, but after forwarding on all output
         * interfaces. It is clear, if mrouter runs a multicasting
         * program, it should receive packets not depending to what interface
         * program is joined.
         * If we will not make it, the program will have to join on all
         * interfaces. On the other hand, multihoming host (or router, but
         * not mrouter) cannot join to more than one interface - it will
         * result in receiving multiple packets.
         */
        NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
                net, NULL, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
}

/* Called with mrt_lock or rcu_read_lock().
 *
 * Scans the vif table from the highest index downwards and returns the index
 * of the first vif whose device is @dev; the loop index that fell off the
 * bottom (negative) is returned when there is no match.
 */
static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
{
        /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */
        int vifi = READ_ONCE(mrt->maxvif);

        while (--vifi >= 0) {
                if (rcu_access_pointer(mrt->vif_table[vifi].dev) == dev)
                        return vifi;
        }
        return vifi;
}

/* "local" means that we should preserve one skb (for local delivery) */
/* Called under rcu_read_lock() */
/* Forward @skb, received on @dev, according to cache entry @c of @mrt.
 *
 * When @local is non-zero the caller keeps @skb, so only clones are
 * transmitted and @skb is never freed here. When @local is zero @skb is
 * consumed: it is either passed to ipmr_queue_xmit() or freed.
 */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
                          struct net_device *dev, struct sk_buff *skb,
                          struct mfc_cache *c, int local)
{
        /* vif index of @dev as found by ipmr_find_vif(); negative if none */
        int true_vifi = ipmr_find_vif(mrt, dev);
        int psend = -1;
        int vif, ct;

        /* Account the packet against the cache entry before any checks */
        vif = c->_c.mfc_parent;
        atomic_long_inc(&c->_c.mfc_un.res.pkt);
        atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
        WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);

        if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
                struct mfc_cache *cache_proxy;

                /* For an (*,G) entry, we only check that the incoming
                 * interface is part of the static tree.
                 */
                cache_proxy = mr_mfc_find_any_parent(mrt, vif);
                if (cache_proxy &&
                    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
                        goto forward;
        }

        /* Wrong interface: drop packet and (maybe) send PIM assert. */
        if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
                if (rt_is_output_route(skb_rtable(skb))) {
                        /* It is our own packet, looped back.
                         * Very complicated situation...
                         *
                         * The best workaround until routing daemons will be
                         * fixed is not to redistribute packet, if it was
                         * send through wrong interface. It means, that
                         * multicast applications WILL NOT work for
                         * (S,G), which have default multicast route pointing
                         * to wrong oif. In any case, it is not a good
                         * idea to use multicasting applications on router.
                         */
                        goto dont_forward;
                }

                atomic_long_inc(&c->_c.mfc_un.res.wrong_if);

                if (true_vifi >= 0 && mrt->mroute_do_assert &&
                    /* pimsm uses asserts, when switching from RPT to SPT,
                     * so that we cannot check that packet arrived on an oif.
                     * It is bad, but otherwise we would need to move pretty
                     * large chunk of pimd to kernel. Ough... --ANK
                     */
                    (mrt->mroute_do_pim ||
                     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               c->_c.mfc_un.res.last_assert +
                               MFC_ASSERT_THRESH)) {
                        /* Reports are sent at most once per MFC_ASSERT_THRESH */
                        c->_c.mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
                        if (mrt->mroute_do_wrvifwhole)
                                ipmr_cache_report(mrt, skb, true_vifi,
                                                  IGMPMSG_WRVIFWHOLE);
                }
                goto dont_forward;
        }

forward:
        /* Input accounting is charged to the entry's parent vif */
        WRITE_ONCE(mrt->vif_table[vif].pkt_in,
                   mrt->vif_table[vif].pkt_in + 1);
        WRITE_ONCE(mrt->vif_table[vif].bytes_in,
                   mrt->vif_table[vif].bytes_in + skb->len);

        /* Forward the frame */
        if (c->mfc_origin == htonl(INADDR_ANY) &&
            c->mfc_mcastgrp == htonl(INADDR_ANY)) {
                if (true_vifi >= 0 &&
                    true_vifi != c->_c.mfc_parent &&
                    ip_hdr(skb)->ttl >
                                c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
                        /* It's an (*,*) entry and the packet is not coming from
                         * the upstream: forward the packet to the upstream
                         * only.
                         */
                        psend = c->_c.mfc_parent;
                        goto last_forward;
                }
                goto dont_forward;
        }
        /* psend trails one match behind: each earlier match gets a clone,
         * and the last match is left for last_forward, where the original
         * skb can be sent when no local copy is needed.
         */
        for (ct = c->_c.mfc_un.res.maxvif - 1;
             ct >= c->_c.mfc_un.res.minvif; ct--) {
                /* For (*,G) entry, don't forward to the incoming interface */
                if ((c->mfc_origin != htonl(INADDR_ANY) ||
                     ct != true_vifi) &&
                    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

                                if (skb2)
                                        ipmr_queue_xmit(net, mrt, true_vifi,
                                                        skb2, psend);
                        }
                        psend = ct;
                }
        }
last_forward:
        if (psend != -1) {
                if (local) {
                        /* Caller keeps skb: transmit a clone instead */
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

                        if (skb2)
                                ipmr_queue_xmit(net, mrt, true_vifi, skb2,
                                                psend);
                } else {
                        /* ipmr_queue_xmit() takes over skb */
                        ipmr_queue_xmit(net, mrt, true_vifi, skb, psend);
                        return;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
}

static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph = ip_hdr(skb);
        struct flowi4 fl4 = {
                .daddr = iph->daddr,
                .saddr = iph->saddr,
                .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
                .flowi4_oif = (rt_is_output_route(rt) ?
                               skb->dev->ifindex : 0),
                .flowi4_iif = (rt_is_output_route(rt) ?
                               LOOPBACK_IFINDEX :
                               skb->dev->ifindex),
                .flowi4_mark = skb->mark,
        };
        struct mr_table *mrt;
        int err;

        err = ipmr_fib_lookup(net, &fl4, &mrt);
        if (err)
                return ERR_PTR(err);
        return mrt;
}

/* Multicast packets for forwarding arrive here
 * Called with rcu_read_lock();
 *
 * Returns 0 when the packet was fully handled here, the result of
 * ip_local_deliver() or ipmr_cache_unresolved() when the packet is finally
 * handed to one of them, or a negative errno (-ENODEV, -ENOBUFS, or the
 * table lookup error).
 */
int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        struct net *net = dev_net(skb->dev);
        /* Non-zero when the route attached to the skb carries RTCF_LOCAL */
        int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
        struct mr_table *mrt;
        struct net_device *dev;

        /* skb->dev passed in is the loX master dev for vrfs.
         * As there are no vifs associated with loopback devices,
         * get the proper interface that does have a vif associated with it.
         */
        dev = skb->dev;
        if (netif_is_l3_master(skb->dev)) {
                dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
                if (!dev) {
                        kfree_skb(skb);
                        return -ENODEV;
                }
        }

        /* Packet is looped back after forward, it should not be
         * forwarded second time, but still can be delivered locally.
         */
        if (IPCB(skb)->flags & IPSKB_FORWARDED)
                goto dont_forward;

        mrt = ipmr_rt_fib_lookup(net, skb);
        if (IS_ERR(mrt)) {
                kfree_skb(skb);
                return PTR_ERR(mrt);
        }
        if (!local) {
                if (IPCB(skb)->opt.router_alert) {
                        /* A non-zero result ends processing of this skb */
                        if (ip_call_ra_chain(skb))
                                return 0;
                } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
                        /* IGMPv1 (and broken IGMPv2 implementations such as
                         * Cisco IOS <= 11.2(8)) do not put the router alert
                         * option into IGMP packets destined to routable
                         * groups. This is very bad, because it means
                         * that we can forward NO IGMP messages.
                         */
                        struct sock *mroute_sk;

                        /* Hand the IGMP packet to the mroute socket, if any */
                        mroute_sk = rcu_dereference(mrt->mroute_sk);
                        if (mroute_sk) {
                                nf_reset_ct(skb);
                                raw_rcv(mroute_sk, skb);
                                return 0;
                        }
                }
        }

        /* already under rcu_read_lock() */
        cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
        if (!cache) {
                /* No exact match: retry with a lookup keyed on the group
                 * and the incoming vif.
                 */
                int vif = ipmr_find_vif(mrt, dev);

                if (vif >= 0)
                        cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
                                                    vif);
        }

        /* No usable cache entry */
        if (!cache) {
                int vif;

                if (local) {
                        /* Deliver the original locally and keep a clone for
                         * the unresolved path below; the clone is taken
                         * before the original is handed off.
                         */
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (!skb2)
                                return -ENOBUFS;
                        skb = skb2;
                }

                vif = ipmr_find_vif(mrt, dev);
                if (vif >= 0)
                        return ipmr_cache_unresolved(mrt, vif, skb, dev);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(net, mrt, dev, skb, cache, local);

        /* For a local packet the skb is still delivered here afterwards */
        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/* Handle IGMP messages of PIMv1.
 *
 * The skb is freed unless __pim_rcv() returns 0. Always returns 0.
 */
int pim_rcv_v1(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        struct mr_table *mrt;
        struct igmphdr *hdr;

        if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct iphdr)))
                goto drop;

        hdr = igmp_hdr(skb);

        mrt = ipmr_rt_fib_lookup(net, skb);
        if (IS_ERR(mrt))
                goto drop;

        /* Only PIMv1 REGISTER messages are accepted, and only when PIM
         * processing is enabled on the table.
         */
        if (!mrt->mroute_do_pim)
                goto drop;
        if (hdr->group != PIM_V1_VERSION || hdr->code != PIM_V1_REGISTER)
                goto drop;

        if (!__pim_rcv(mrt, skb, sizeof(*hdr)))
                return 0;

drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
/* Receive handler for PIMv2 packets.
 *
 * Only non-null REGISTER messages are accepted. A message is dropped on
 * checksum grounds only when both the header-only checksum and the
 * whole-packet checksum fail. The skb is freed unless __pim_rcv() returns 0.
 * Always returns 0.
 */
static int pim_rcv(struct sk_buff *skb)
{
        struct net *net = dev_net(skb->dev);
        struct pimreghdr *reg;
        struct mr_table *mrt;

        if (!pskb_may_pull(skb, sizeof(*reg) + sizeof(struct iphdr)))
                goto drop;

        reg = (struct pimreghdr *)skb_transport_header(skb);

        if (reg->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)))
                goto drop;
        if (reg->flags & PIM_NULL_REGISTER)
                goto drop;
        if (ip_compute_csum((void *)reg, sizeof(*reg)) != 0 &&
            csum_fold(skb_checksum(skb, 0, skb->len, 0)))
                goto drop;

        mrt = ipmr_rt_fib_lookup(net, skb);
        if (IS_ERR(mrt))
                goto drop;

        if (!__pim_rcv(mrt, skb, sizeof(*reg)))
                return 0;

drop:
        kfree_skb(skb);
        return 0;
}
#endif

/* Fill @skb with the multicast route for (@saddr, @daddr) from the default
 * table.
 *
 * When a cache entry exists (exact match, or a match keyed on the group and
 * the vif of skb->dev), it is written out through mr_fill_mroute(). When none
 * exists, a copy of @skb with a minimal IPv4 header pushed in front is queued
 * through ipmr_cache_unresolved().
 *
 * Returns the result of mr_fill_mroute() or ipmr_cache_unresolved(), or
 * -ENOENT (no default table), -ENODEV (no vif for skb->dev) or -ENOMEM.
 */
int ipmr_get_route(struct net *net, struct sk_buff *skb,
                   __be32 saddr, __be32 daddr,
                   struct rtmsg *rtm, u32 portid)
{
        struct net_device *dev;
        struct mfc_cache *entry;
        struct sk_buff *query;
        struct mr_table *mrt;
        struct iphdr *iph;
        int vif = -1;
        int err;

        rcu_read_lock();

        mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
        if (!mrt) {
                err = -ENOENT;
                goto unlock;
        }

        entry = ipmr_cache_find(mrt, saddr, daddr);
        if (!entry && skb->dev) {
                int in_vif = ipmr_find_vif(mrt, skb->dev);

                if (in_vif >= 0)
                        entry = ipmr_cache_find_any(mrt, daddr, in_vif);
        }
        if (entry) {
                err = mr_fill_mroute(mrt, skb, &entry->_c, rtm);
                goto unlock;
        }

        /* No cache entry: queue the request as an unresolved route */
        dev = skb->dev;
        if (dev)
                vif = ipmr_find_vif(mrt, dev);
        if (vif < 0) {
                err = -ENODEV;
                goto unlock;
        }

        query = skb_realloc_headroom(skb, sizeof(struct iphdr));
        if (!query) {
                err = -ENOMEM;
                goto unlock;
        }

        NETLINK_CB(query).portid = portid;
        skb_push(query, sizeof(struct iphdr));
        skb_reset_network_header(query);
        iph = ip_hdr(query);
        /* NOTE(review): version 0 presumably tags this header as a
         * netlink-originated query rather than a real packet - confirm
         * against the code that resolves queued entries.
         */
        iph->version = 0;
        iph->ihl = sizeof(struct iphdr) >> 2;
        iph->saddr = saddr;
        iph->daddr = daddr;
        err = ipmr_cache_unresolved(mrt, vif, query, dev);

unlock:
        rcu_read_unlock();
        return err;
}

/* Append one multicast route message describing @c to @skb.
 *
 * The message carries an rtmsg header followed by RTA_TABLE, RTA_SRC and
 * RTA_DST, plus whatever mr_fill_mroute() adds. An -ENOENT result from
 * mr_fill_mroute() is tolerated so that unresolved entries do not abort a
 * dump.
 *
 * Returns 0 on success, or -EMSGSIZE after cancelling the partial message.
 */
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                            u32 portid, u32 seq, struct mfc_cache *c, int cmd,
                            int flags)
{
        struct nlmsghdr *nlh;
        struct rtmsg *hdr;
        int rc;

        nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(struct rtmsg), flags);
        if (!nlh)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->rtm_family = RTNL_FAMILY_IPMR;
        hdr->rtm_dst_len = 32;
        hdr->rtm_src_len = 32;
        hdr->rtm_tos = 0;
        hdr->rtm_table = mrt->id;
        hdr->rtm_type = RTN_MULTICAST;
        hdr->rtm_scope = RT_SCOPE_UNIVERSE;
        hdr->rtm_protocol = (c->_c.mfc_flags & MFC_STATIC) ?
                            RTPROT_STATIC : RTPROT_MROUTED;
        hdr->rtm_flags = 0;

        if (nla_put_u32(skb, RTA_TABLE, mrt->id) ||
            nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
            nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
                goto cancel;

        rc = mr_fill_mroute(mrt, skb, &c->_c, hdr);
        /* do not break the dump if cache is unresolved */
        if (rc < 0 && rc != -ENOENT)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Adapter taking a generic struct mr_mfc: casts @c to the IPv4 cache entry
 * type and forwards every argument to ipmr_fill_mroute().
 */
static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                             u32 portid, u32 seq, struct mr_mfc *c, int cmd,
                             int flags)
{
        struct mfc_cache *entry = (struct mfc_cache *)c;

        return ipmr_fill_mroute(mrt, skb, portid, seq, entry, cmd, flags);
}

/* Compute the payload size to reserve for one multicast route notification.
 *
 * Every message has the rtmsg header plus RTA_TABLE, RTA_SRC and RTA_DST.
 * Resolved entries additionally reserve room for RTA_IIF, an RTA_MULTIPATH
 * nest with @maxvif nexthops, and RTA_MFC_STATS.
 */
static size_t mroute_msgsize(bool unresolved, int maxvif)
{
        size_t len = NLMSG_ALIGN(sizeof(struct rtmsg));

        len += nla_total_size(4);        /* RTA_TABLE */
        len += nla_total_size(4);        /* RTA_SRC */
        len += nla_total_size(4);        /* RTA_DST */

        if (unresolved)
                return len;

        len += nla_total_size(4);        /* RTA_IIF */
        len += nla_total_size(0);        /* RTA_MULTIPATH */
        len += maxvif * NLA_ALIGN(sizeof(struct rtnexthop));
                                         /* RTA_MFC_STATS */
        len += nla_total_size_64bit(sizeof(struct rta_mfc_stats));

        return len;
}

static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
                                 int cmd)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
        int err = -ENOBUFS;

        skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS,
                                       mrt->maxvif),
                        GFP_ATOMIC);
        if (!skb)
                goto errout;

        err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
        if (err < 0)
                goto errout;

        rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
        return;

errout:
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}

/* Compute the payload size to reserve for a cache-report notification that
 * carries @payloadlen bytes of packet data in IPMRA_CREPORT_PKT.
 */
static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
{
        size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg));

        len += nla_total_size(1);        /* IPMRA_CREPORT_MSGTYPE */
        len += nla_total_size(4);        /* IPMRA_CREPORT_VIF_ID */
        len += nla_total_size(4);        /* IPMRA_CREPORT_SRC_ADDR */
        len += nla_total_size(4);        /* IPMRA_CREPORT_DST_ADDR */
        len += nla_total_size(4);        /* IPMRA_CREPORT_TABLE */
        len += nla_total_size(payloadlen);        /* IPMRA_CREPORT_PKT */

        return len;
}

/* Send an RTM_NEWCACHEREPORT notification built from @pkt to the
 * RTNLGRP_IPV4_MROUTE_R group.
 *
 * The struct igmpmsg at the network header of @pkt supplies the message type,
 * vif id and addresses; everything in @pkt after sizeof(struct igmpmsg) is
 * copied into IPMRA_CREPORT_PKT. On any failure -ENOBUFS is recorded on the
 * group through rtnl_set_sk_err().
 */
static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
        struct net *net = read_pnet(&mrt->net);
        struct nlmsghdr *nlh;
        struct rtgenmsg *rtgenm;
        struct igmpmsg *msg;
        struct sk_buff *skb;
        struct nlattr *nla;
        int payloadlen;

        /* NOTE(review): assumes pkt->len >= sizeof(struct igmpmsg); the
         * callers are not visible here - confirm.
         */
        payloadlen = pkt->len - sizeof(struct igmpmsg);
        msg = (struct igmpmsg *)skb_network_header(pkt);

        skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
        if (!skb)
                goto errout;

        nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
                        sizeof(struct rtgenmsg), 0);
        if (!nlh)
                goto errout;
        rtgenm = nlmsg_data(nlh);
        rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
        /* The vif id is reassembled from its low and high bytes */
        if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
            nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
            nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
                            msg->im_src.s_addr) ||
            nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
                            msg->im_dst.s_addr) ||
            nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
                goto nla_put_failure;

        /* Reserve the attribute, then copy the packet bytes straight into it */
        nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
        if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
                                  nla_data(nla), payloadlen))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);

        rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
        return;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
errout:
        /* skb may be NULL here when the allocation itself failed */
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
}

/* Validate a multicast route get request and parse its attributes into @tb.
 *
 * Without strict checking only the header length is verified before the
 * attributes are parsed. With strict checking the rtmsg header must be clean
 * (source/destination length 0 or 32, all other checked fields zero), an
 * address attribute requires its matching length to be set, and only RTA_SRC,
 * RTA_DST and RTA_TABLE are accepted.
 *
 * Returns 0 on success or a negative errno, with @extack set for the
 * failures detected here.
 */
static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct rtmsg *hdr;
        int attr, rc;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*hdr), tb, RTA_MAX,
                                              rtm_ipv4_policy, extack);

        hdr = nlmsg_data(nlh);
        if ((hdr->rtm_src_len && hdr->rtm_src_len != 32) ||
            (hdr->rtm_dst_len && hdr->rtm_dst_len != 32) ||
            hdr->rtm_tos || hdr->rtm_table || hdr->rtm_protocol ||
            hdr->rtm_scope || hdr->rtm_type || hdr->rtm_flags) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request");
                return -EINVAL;
        }

        rc = nlmsg_parse_deprecated_strict(nlh, sizeof(*hdr), tb, RTA_MAX,
                                           rtm_ipv4_policy, extack);
        if (rc)
                return rc;

        if ((tb[RTA_SRC] && !hdr->rtm_src_len) ||
            (tb[RTA_DST] && !hdr->rtm_dst_len)) {
                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
                return -EINVAL;
        }

        /* Reject any attribute other than the three supported ones */
        for (attr = 0; attr <= RTA_MAX; attr++) {
                if (!tb[attr])
                        continue;
                if (attr == RTA_SRC || attr == RTA_DST || attr == RTA_TABLE)
                        continue;

                NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request");
                return -EINVAL;
        }

        return 0;
}

/* Handle a multicast route get request: look up the (source, group) entry in
 * the requested table (RT_TABLE_DEFAULT when none is given) and unicast an
 * RTM_NEWROUTE reply to the requester.
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno:
 * the validation error, -ENOENT when the table or entry does not exist,
 * -ENOBUFS when the reply cannot be allocated, or the fill error.
 */
static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX + 1];
        struct sk_buff *reply = NULL;
        struct mfc_cache *entry;
        struct mr_table *mrt;
        __be32 src, grp;
        u32 tableid;
        int err;

        err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        src = nla_get_in_addr_default(tb[RTA_SRC], 0);
        grp = nla_get_in_addr_default(tb[RTA_DST], 0);
        tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
        if (!tableid)
                tableid = RT_TABLE_DEFAULT;

        err = -ENOENT;
        mrt = __ipmr_get_table(net, tableid);
        if (!mrt)
                goto free_reply;

        /* entries are added/deleted only under RTNL */
        rcu_read_lock();
        entry = ipmr_cache_find(mrt, src, grp);
        rcu_read_unlock();
        if (!entry)
                goto free_reply;

        err = -ENOBUFS;
        reply = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
        if (!reply)
                goto free_reply;

        err = ipmr_fill_mroute(mrt, reply, NETLINK_CB(in_skb).portid,
                               nlh->nlmsg_seq, entry,
                               RTM_NEWROUTE, 0);
        if (err < 0)
                goto free_reply;

        return rtnl_unicast(reply, net, NETLINK_CB(in_skb).portid);

free_reply:
        /* reply is still NULL on the early failure paths */
        kfree_skb(reply);
        return err;
}

/* Dump multicast routes.
 *
 * With strict checking the request is validated first, which may fill in a
 * table filter. Without a table filter all tables are dumped. With one, only
 * that table is dumped; if it does not exist, requests for another family
 * just get the current skb length back while RTNL_FAMILY_IPMR requests fail
 * with -ENOENT.
 */
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct fib_dump_filter filter = {
                .rtnl_held = true,
        };
        struct mr_table *mrt;
        int err;

        if (cb->strict_check) {
                err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
                                            &filter, cb);
                if (err < 0)
                        return err;
        }

        if (!filter.table_id)
                return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
                                        _ipmr_fill_mroute, &mfc_unres_lock,
                                        &filter);

        mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id);
        if (mrt) {
                err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
                                    &mfc_unres_lock, &filter);
                /* Report progress if anything was written, else the error */
                if (skb->len)
                        return skb->len;
                return err;
        }

        if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR)
                return skb->len;

        NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
        return -ENOENT;
}

/* Attribute policy used by rtm_to_ipmr_mfcc() to validate multicast route
 * add/delete requests: addresses, input interface and table id are u32
 * attributes; RTA_MULTIPATH is constrained through the size of one
 * struct rtnexthop.
 */
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
        [RTA_SRC]        = { .type = NLA_U32 },
        [RTA_DST]        = { .type = NLA_U32 },
        [RTA_IIF]        = { .type = NLA_U32 },
        [RTA_TABLE]        = { .type = NLA_U32 },
        [RTA_MULTIPATH]        = { .len = sizeof(struct rtnexthop) },
};

/* Only RTPROT_STATIC and RTPROT_MROUTED are accepted as route protocol */
static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
{
        return rtm_protocol == RTPROT_STATIC ||
               rtm_protocol == RTPROT_MROUTED;
}

/* Copy the hop thresholds of the nexthops in an RTA_MULTIPATH attribute into
 * @mfcc->mfcc_ttls, one slot per nexthop in attribute order.
 *
 * Returns the number of nexthops consumed (0..MAXVIFS), or -EINVAL when the
 * attribute holds more than MAXVIFS nexthops or ends in bytes that do not
 * form a valid struct rtnexthop.
 */
static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
{
        struct rtnexthop *rtnh = nla_data(nla);
        int remaining = nla_len(nla), vifi = 0;

        while (rtnh_ok(rtnh, remaining)) {
                /* Check the bound before storing, so that a list of exactly
                 * MAXVIFS nexthops is fully consumed and accepted; only a
                 * further nexthop, which would overflow mfcc_ttls[], is an
                 * error.
                 */
                if (vifi == MAXVIFS)
                        return -EINVAL;
                mfcc->mfcc_ttls[vifi++] = rtnh->rtnh_hops;
                rtnh = rtnh_next(rtnh, &remaining);
        }

        /* Leftover bytes mean a truncated or malformed trailing nexthop */
        return remaining > 0 ? -EINVAL : vifi;
}

/* Translate a netlink multicast route request into a struct mfcctl.
 *
 * @mfcc is zeroed and filled from the attributes in @nlh, *@mrtret receives
 * the table named by RTA_TABLE (RT_TABLE_DEFAULT when absent) and *@mrtsock
 * is set to 1 when the route protocol is RTPROT_MROUTED, 0 otherwise. The
 * output parameters are only written on success.
 *
 * returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY
 */
static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
                            struct mfcctl *mfcc, int *mrtsock,
                            struct mr_table **mrtret,
                            struct netlink_ext_ack *extack)
{
        struct net_device *dev = NULL;
        u32 tblid = RT_TABLE_DEFAULT;
        struct mr_table *mrt;
        struct nlattr *attr;
        struct rtmsg *rtm;
        int ret, rem;

        ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
                                        rtm_ipmr_policy, extack);
        if (ret < 0)
                goto out;
        rtm = nlmsg_data(nlh);

        /* Header sanity: IPMR family, /32 group, multicast type, universe
         * scope and a supported protocol.
         */
        ret = -EINVAL;
        if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
            rtm->rtm_type != RTN_MULTICAST ||
            rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
            !ipmr_rtm_validate_proto(rtm->rtm_protocol))
                goto out;

        memset(mfcc, 0, sizeof(*mfcc));
        /* Parent stays -1 unless RTA_IIF names a device (resolved below) */
        mfcc->mfcc_parent = -1;
        ret = 0;
        nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
                switch (nla_type(attr)) {
                case RTA_SRC:
                        mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
                        break;
                case RTA_DST:
                        mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
                        break;
                case RTA_IIF:
                        dev = __dev_get_by_index(net, nla_get_u32(attr));
                        if (!dev) {
                                ret = -ENODEV;
                                goto out;
                        }
                        break;
                case RTA_MULTIPATH:
                        if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
                                ret = -EINVAL;
                                goto out;
                        }
                        break;
                case RTA_PREFSRC:
                        /* Presence of this attribute selects the proxy
                         * variant; its value is not read here.
                         */
                        ret = 1;
                        break;
                case RTA_TABLE:
                        tblid = nla_get_u32(attr);
                        break;
                }
        }
        mrt = __ipmr_get_table(net, tblid);
        if (!mrt) {
                ret = -ENOENT;
                goto out;
        }
        *mrtret = mrt;
        *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
        /* The vif lookup needs the table, so it is done after the loop */
        if (dev)
                mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);

out:
        return ret;
}

/* Netlink handler shared by route add and route delete requests.
 *
 * The request is decoded with rtm_to_ipmr_mfcc(); a positive result from it
 * means the decoded parent vif is passed on, otherwise -1 is used.
 * RTM_NEWROUTE adds the entry, any other message type deletes it.
 */
static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct mr_table *tbl = NULL;
        struct mfcctl mfcc;
        int mrtsock = 0;
        int parent = -1;
        int ret;

        ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
        if (ret < 0)
                return ret;
        if (ret)
                parent = mfcc.mfcc_parent;

        if (nlh->nlmsg_type != RTM_NEWROUTE)
                return ipmr_mfc_delete(tbl, &mfcc, parent);

        return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
}

/* Emit the per-table attributes of @mrt into @skb: table id, length of the
 * unresolved queue, register vif number and the assert/PIM/wrvifwhole flags.
 *
 * Returns false as soon as one attribute does not fit, true otherwise.
 */
static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
{
        u32 pending = atomic_read(&mrt->cache_resolve_queue_len);

        if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id))
                return false;
        if (nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, pending))
                return false;
        if (nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
                        mrt->mroute_reg_vif_num))
                return false;
        if (nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
                       mrt->mroute_do_assert))
                return false;
        if (nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
                return false;
        if (nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
                       mrt->mroute_do_wrvifwhole))
                return false;

        return true;
}

/* Emit one IPMRA_VIF nest describing vif @vifid of @mrt into @skb.
 *
 * A slot without a device is skipped and reported as success. Returns false
 * when the nest cannot be started or an attribute does not fit; in the
 * latter case the partial nest is cancelled.
 */
static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
        struct vif_device *vif = &mrt->vif_table[vifid];
        struct net_device *dev = rtnl_dereference(vif->dev);
        struct nlattr *nest;

        /* if the VIF doesn't exist just continue */
        if (!dev)
                return true;

        nest = nla_nest_start_noflag(skb, IPMRA_VIF);
        if (!nest)
                return false;

        /* Identity and flags */
        if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, dev->ifindex) ||
            nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
            nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags))
                goto cancel;

        /* Traffic counters */
        if (nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
                              IPMRA_VIFA_PAD) ||
            nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
                              IPMRA_VIFA_PAD) ||
            nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
                              IPMRA_VIFA_PAD) ||
            nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
                              IPMRA_VIFA_PAD))
                goto cancel;

        /* Endpoint addresses */
        if (nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
            nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote))
                goto cancel;

        nla_nest_end(skb, nest);
        return true;

cancel:
        nla_nest_cancel(skb, nest);
        return false;
}

/* Strict validation of an ipmr link dump request: the message must hold a
 * complete ifinfomsg header, no attributes after it, and every checked
 * header field must be zero.
 *
 * Returns 0 when valid, -EINVAL (with @extack set) otherwise.
 */
static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
                               struct netlink_ext_ack *extack)
{
        struct ifinfomsg *hdr;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*hdr))) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
                return -EINVAL;
        }

        if (nlmsg_attrlen(nlh, sizeof(*hdr))) {
                NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
                return -EINVAL;
        }

        hdr = nlmsg_data(nlh);
        if (!hdr->__ifi_pad && !hdr->ifi_type && !hdr->ifi_flags &&
            !hdr->ifi_change && !hdr->ifi_index)
                return 0;

        NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
        return -EINVAL;
}

/* Dump every multicast routing table and its vifs as RTM_NEWLINK messages.
 *
 * Each table yields one message whose IFLA_AF_SPEC nest holds the table
 * attributes followed by an IPMRA_TABLE_VIFS nest of vifs. The dump is
 * resumable: cb->args[0] is the index of the table to continue with and
 * cb->args[1] the index of the vif to continue with inside that table.
 *
 * Returns skb->len, or the validation error when strict checking rejects the
 * request.
 */
static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct nlmsghdr *nlh = NULL;
        /* t/e: current table/vif index; s_t/s_e: resume positions */
        unsigned int t = 0, s_t;
        unsigned int e = 0, s_e;
        struct mr_table *mrt;

        if (cb->strict_check) {
                int err = ipmr_valid_dumplink(cb->nlh, cb->extack);

                if (err < 0)
                        return err;
        }

        s_t = cb->args[0];
        s_e = cb->args[1];

        ipmr_for_each_table(mrt, net) {
                struct nlattr *vifs, *af;
                struct ifinfomsg *hdr;
                u32 i;

                /* Tables before the resume point were already dumped */
                if (t < s_t)
                        goto skip_table;
                nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, RTM_NEWLINK,
                                sizeof(*hdr), NLM_F_MULTI);
                if (!nlh)
                        break;

                hdr = nlmsg_data(nlh);
                memset(hdr, 0, sizeof(*hdr));
                hdr->ifi_family = RTNL_FAMILY_IPMR;

                af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
                if (!af) {
                        nlmsg_cancel(skb, nlh);
                        goto out;
                }

                if (!ipmr_fill_table(mrt, skb)) {
                        nlmsg_cancel(skb, nlh);
                        goto out;
                }

                vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS);
                if (!vifs) {
                        /* The table attributes fit, so the message is
                         * finalised rather than cancelled.
                         */
                        nla_nest_end(skb, af);
                        nlmsg_end(skb, nlh);
                        goto out;
                }
                for (i = 0; i < mrt->maxvif; i++) {
                        /* Vifs before the resume point were already dumped */
                        if (e < s_e)
                                goto skip_entry;
                        if (!ipmr_fill_vif(mrt, i, skb)) {
                                /* Out of room: close what has been written;
                                 * e is left at this vif so that it is
                                 * retried on the next call.
                                 */
                                nla_nest_end(skb, vifs);
                                nla_nest_end(skb, af);
                                nlmsg_end(skb, nlh);
                                goto out;
                        }
skip_entry:
                        e++;
                }
                /* Table complete: the next table starts from its first vif */
                s_e = 0;
                e = 0;
                nla_nest_end(skb, vifs);
                nla_nest_end(skb, af);
                nlmsg_end(skb, nlh);
skip_table:
                t++;
        }

out:
        /* Save the cursor for the next invocation of the dump */
        cb->args[1] = e;
        cb->args[0] = t;

        return skb->len;
}

#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing :
 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
 */

/* seq_file ->start() for /proc/net/ip_mr_vif.
 *
 * Takes the RCU read lock, binds the iterator to the default table and
 * delegates positioning to mr_vif_seq_start(). The lock is released by
 * ipmr_vif_seq_stop().
 *
 * Returns the first element to show, or ERR_PTR(-ENOENT) when the default
 * table does not exist. The RCU read lock is held on return in both cases.
 */
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct mr_vif_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);
        struct mr_table *mrt;

        rcu_read_lock();
        mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
        /* seq_file calls ->stop() even after ->start() returned an error,
         * and ipmr_vif_seq_stop() drops the RCU read lock unconditionally.
         * Keep the lock held here so the two stay balanced.
         */
        if (!mrt)
                return ERR_PTR(-ENOENT);

        iter->mrt = mrt;

        return mr_vif_seq_start(seq, pos);
}

/* seq_file ->stop() for /proc/net/ip_mr_vif: drops the RCU read lock.
 * The lock is dropped unconditionally; @seq and @v are not examined.
 */
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

/* seq_file ->show() for /proc/net/ip_mr_vif.
 *
 * Prints the column header for the start token, otherwise one line for the
 * vif @v: its index in the table, device name ("none" when no device is
 * attached), traffic counters, flags and local/remote addresses.
 */
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        struct mr_vif_iter *iter = seq->private;
        struct mr_table *mrt = iter->mrt;
        const struct net_device *dev;
        const struct vif_device *vif;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
                return 0;
        }

        vif = v;
        dev = vif_dev_read(vif);
        seq_printf(seq,
                   "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                   vif - mrt->vif_table,
                   dev ? dev->name : "none",
                   vif->bytes_in, vif->pkt_in,
                   vif->bytes_out, vif->pkt_out,
                   vif->flags, vif->local, vif->remote);

        return 0;
}

/* Iterator callbacks for the vif listing; registered as "ip_mr_vif" under
 * the per-net proc directory.
 */
static const struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = mr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

/* seq_file ->start() for /proc/net/ip_mr_cache: iterate the cache of the
 * default table through mr_mfc_seq_start(), or fail with ERR_PTR(-ENOENT)
 * when that table does not exist.
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct mr_table *mrt;

        mrt = ipmr_get_table(seq_file_net(seq), RT_TABLE_DEFAULT);
        if (mrt)
                return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);

        return ERR_PTR(-ENOENT);
}

/* seq_file ->show() for /proc/net/ip_mr_cache.
 *
 * Prints the column header for the start token, otherwise one line for the
 * cache entry @v: group, origin and parent vif, followed by the packet, byte
 * and wrong-interface counters and, for resolved entries, a "vif:ttl" pair
 * for every existing vif in [minvif, maxvif) whose threshold is below 255.
 * Entries taken from the unresolved queue print zero counters.
 */
static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        const struct mr_mfc_iter *it;
        const struct mfc_cache *mfc;
        const struct mr_table *mrt;
        int vif;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
                return 0;
        }

        mfc = v;
        it = seq->private;
        mrt = it->mrt;

        seq_printf(seq, "%08X %08X %-3hd",
                   (__force u32) mfc->mfc_mcastgrp,
                   (__force u32) mfc->mfc_origin,
                   mfc->_c.mfc_parent);

        if (it->cache == &mrt->mfc_unres_queue) {
                /* unresolved mfc_caches don't contain
                 * pkt, bytes and wrong_if values
                 */
                seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
        } else {
                seq_printf(seq, " %8lu %8lu %8lu",
                           atomic_long_read(&mfc->_c.mfc_un.res.pkt),
                           atomic_long_read(&mfc->_c.mfc_un.res.bytes),
                           atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));

                vif = mfc->_c.mfc_un.res.minvif;
                while (vif < mfc->_c.mfc_un.res.maxvif) {
                        if (VIF_EXISTS(mrt, vif) &&
                            mfc->_c.mfc_un.res.ttls[vif] < 255)
                                seq_printf(seq,
                                   " %2d:%-3d",
                                   vif, mfc->_c.mfc_un.res.ttls[vif]);
                        vif++;
                }
        }

        seq_putc(seq, '\n');
        return 0;
}

/*
 * seq_file operations for the forwarding cache listing. Only ->start and
 * ->show are IPv4 specific; ->next and ->stop are the mr_mfc_seq_* helpers.
 */
static const struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = mr_mfc_seq_next,
        .stop  = mr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
/* Protocol descriptor added for IPPROTO_PIM in ip_mr_init(); handler is pim_rcv(). */
static const struct net_protocol pim_protocol = {
        .handler        =        pim_rcv,
};
#endif

/*
 * FIB notifier sequence callback: returns the sum of the per-netns IPMR
 * sequence counter and the value reported by ipmr_rules_seq_read().
 */
static unsigned int ipmr_seq_read(const struct net *net)
{
        unsigned int seq = READ_ONCE(net->ipv4.ipmr_seq);

        seq += ipmr_rules_seq_read(net);

        return seq;
}

/*
 * FIB notifier dump callback: forwards to mr_dump() for the IPMR family,
 * passing the IPv4 rules dump helper and table iterator, and returns its
 * result unchanged.
 */
static int ipmr_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack)
{
        return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
                       ipmr_mr_table_iter, extack);
}

/*
 * Template handed to fib_notifier_ops_register() by ipmr_notifier_init()
 * for each network namespace.
 */
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
        .family                = RTNL_FAMILY_IPMR,
        .fib_seq_read        = ipmr_seq_read,
        .fib_dump        = ipmr_dump,
        .owner                = THIS_MODULE,
};

/*
 * Reset the per-netns IPMR sequence counter and register the FIB notifier
 * ops for @net.
 *
 * Returns 0 on success, or the error encoded in the pointer returned by
 * fib_notifier_ops_register().
 */
static int __net_init ipmr_notifier_init(struct net *net)
{
        struct fib_notifier_ops *registered;

        net->ipv4.ipmr_seq = 0;

        registered = fib_notifier_ops_register(&ipmr_notifier_ops_template,
                                               net);
        if (!IS_ERR(registered)) {
                net->ipv4.ipmr_notifier_ops = registered;
                return 0;
        }

        return PTR_ERR(registered);
}

/*
 * Unregister the FIB notifier ops stored for @net and clear the stored
 * pointer afterwards.
 */
static void __net_exit ipmr_notifier_exit(struct net *net)
{
        fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
        net->ipv4.ipmr_notifier_ops = NULL;
}

/*
 * Setup for IP multicast routing in one network namespace.
 *
 * Registers the FIB notifier, initialises the rules/tables and, when
 * CONFIG_PROC_FS is set, creates the "ip_mr_vif" and "ip_mr_cache" proc
 * entries. On failure everything set up so far is undone in reverse order
 * and the error is returned; returns 0 on success.
 */
static int __net_init ipmr_net_init(struct net *net)
{
        int err;

        err = ipmr_notifier_init(net);
        if (err)
                goto ipmr_notifier_fail;

        err = ipmr_rules_init(net);
        if (err < 0)
                goto ipmr_rules_fail;

#ifdef CONFIG_PROC_FS
        /* proc_create_net() only reports failure as NULL; use -ENOMEM. */
        err = -ENOMEM;
        if (!proc_create_net("ip_mr_vif", 0, net->proc_net, &ipmr_vif_seq_ops,
                        sizeof(struct mr_vif_iter)))
                goto proc_vif_fail;
        if (!proc_create_net("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops,
                        sizeof(struct mr_mfc_iter)))
                goto proc_cache_fail;
#endif
        return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
        remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
        /* ipmr_rules_exit() is called with the RTNL lock held here. */
        rtnl_lock();
        ipmr_rules_exit(net);
        rtnl_unlock();
#endif
ipmr_rules_fail:
        ipmr_notifier_exit(net);
ipmr_notifier_fail:
        return err;
}

/*
 * Per-netns exit: remove the proc entries created by ipmr_net_init() (when
 * CONFIG_PROC_FS is set) and unregister the FIB notifier. The rules are
 * torn down separately in ipmr_net_exit_batch().
 */
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
        remove_proc_entry("ip_mr_cache", net->proc_net);
        remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
        ipmr_notifier_exit(net);
}

/*
 * Batched per-netns exit: call ipmr_rules_exit() for every namespace on
 * @net_list, taking the RTNL lock once around the whole walk.
 */
static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
{
        struct net *net;

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list)
                ipmr_rules_exit(net);
        rtnl_unlock();
}

/* Per-netns hooks registered by ip_mr_init() via register_pernet_subsys(). */
static struct pernet_operations ipmr_net_ops = {
        .init = ipmr_net_init,
        .exit = ipmr_net_exit,
        .exit_batch = ipmr_net_exit_batch,
};

/*
 * rtnetlink handlers for RTNL_FAMILY_IPMR, registered in one go by
 * ip_mr_init(). RTM_NEWROUTE and RTM_DELROUTE share ipmr_rtm_route().
 */
static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = {
        {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK,
         .dumpit = ipmr_rtm_dumplink},
        {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE,
         .doit = ipmr_rtm_route},
        {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE,
         .doit = ipmr_rtm_route},
        {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE,
         .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute},
};

/*
 * Boot-time initialisation of IPv4 multicast routing.
 *
 * Creates the mfc_cache slab, registers the per-netns operations and the
 * netdevice notifier, adds the PIM protocol handler when
 * CONFIG_IP_PIMSM_V2 is set, and registers the rtnetlink handlers.
 *
 * Returns 0 on success. On failure the steps already completed are undone
 * in reverse order and a negative errno is returned.
 */
int __init ip_mr_init(void)
{
        int err;

        /* SLAB_PANIC: no NULL check is done on the returned cache. */
        mrt_cachep = KMEM_CACHE(mfc_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);

        err = register_pernet_subsys(&ipmr_net_ops);
        if (err)
                goto reg_pernet_fail;

        err = register_netdevice_notifier(&ip_mr_notifier);
        if (err)
                goto reg_notif_fail;
#ifdef CONFIG_IP_PIMSM_V2
        if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
                pr_err("%s: can't add PIM protocol\n", __func__);
                err = -EAGAIN;
                goto add_proto_fail;
        }
#endif
        /* NOTE(review): the result of rtnl_register_many() is not checked. */
        rtnl_register_many(ipmr_rtnl_msg_handlers);

        return 0;

#ifdef CONFIG_IP_PIMSM_V2
add_proto_fail:
        unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
        unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
        kmem_cache_destroy(mrt_cachep);
        return err;
}














































   95 

























   95 































































































































































































   95 







   95 
   95 
   95 



   95 












   95 








   35 





   95 





























































































   95 









   35 
   35 




   95 
   95 































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
/*
 *
 * Copyright IBM Corporation, 2012
 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
 *
 * Cgroup v2
 * Copyright (C) 2019 Red Hat, Inc.
 * Author: Giuseppe Scrivano <gscrivan@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2.1 of the GNU Lesser General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

#include <linux/cgroup.h>
#include <linux/page_counter.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

/*
 * cftype->private packs two 16-bit values: MEMFILE_PRIVATE() places @x in
 * bits 16-31 and @val in bits 0-15. In this file the upper half is read
 * back with MEMFILE_IDX() and used as the hstate index, and the lower half
 * is read back with MEMFILE_ATTR() and compared against the RES_* values.
 */
#define MEMFILE_PRIVATE(x, val)        (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)        ((val) & 0xffff)

/*
 * MEMFILE_OFFSET() packs the offset of member @m0 within type @t into
 * bits 16-31 and the size of that member into bits 0-15;
 * MEMFILE_OFFSET0() and MEMFILE_FIELD_SIZE() extract the two halves.
 */
#define MEMFILE_OFFSET(t, m0)        (((offsetof(t, m0) << 16) | sizeof_field(t, m0)))
#define MEMFILE_OFFSET0(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_FIELD_SIZE(val)        ((val) & 0xffff)

/* Number of entries in the default-hierarchy and legacy cftype templates. */
#define DFL_TMPL_SIZE                ARRAY_SIZE(hugetlb_dfl_tmpl)
#define LEGACY_TMPL_SIZE        ARRAY_SIZE(hugetlb_legacy_tmpl)

/* Set by hugetlb_cgroup_css_alloc() when a css is allocated without a parent. */
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
/* NOTE(review): presumably the cftype arrays built from the templates — confirm. */
static struct cftype *dfl_files;
static struct cftype *legacy_files;

/*
 * Return the page counter of @h_cg for hstate @idx: the reservation counter
 * when @rsvd is true, the regular (fault) counter otherwise.
 */
static inline struct page_counter *
__hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx,
                                     bool rsvd)
{
        return rsvd ? &h_cg->rsvd_hugepage[idx] : &h_cg->hugepage[idx];
}

/* Return the regular (non-reservation) page counter of @h_cg for hstate @idx. */
static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx)
{
        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false);
}

/* Return the reservation page counter of @h_cg for hstate @idx. */
static inline struct page_counter *
hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx)
{
        return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true);
}

/*
 * Map a cgroup_subsys_state to the hugetlb_cgroup that embeds it.
 * A NULL @s yields NULL.
 */
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
{
        if (!s)
                return NULL;

        return container_of(s, struct hugetlb_cgroup, css);
}

/* Return the hugetlb_cgroup behind @task's css for the hugetlb controller. */
static inline
struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
{
        return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
}

/* True when @h_cg is the cgroup recorded in root_h_cgroup. */
static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
{
        return (h_cg == root_h_cgroup);
}

/* Return the parent of @h_cg, or NULL when its css has no parent. */
static inline struct hugetlb_cgroup *
parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg)
{
        return hugetlb_cgroup_from_css(h_cg->css.parent);
}

/*
 * Report whether @h_cg still has a non-zero regular (fault) page counter
 * for any hstate. Reservation counters are not consulted.
 */
static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
{
        struct hstate *hs;

        for_each_hstate(hs) {
                int idx = hstate_index(hs);

                if (page_counter_read(
                    hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) != 0)
                        return true;
        }

        return false;
}

/*
 * Initialise the fault and reservation page counters of @h_cgroup for every
 * possible hstate.
 *
 * Each counter is chained to the matching counter of @parent_h_cgroup, or
 * has no parent when @parent_h_cgroup is NULL. On the legacy hierarchy the
 * counters also track their fail count. The initial limit of each counter
 * is PAGE_COUNTER_MAX rounded down to a multiple of the huge page size,
 * which is the value hugetlb_cgroup_read_u64_max() reports as "max".
 */
static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
                                struct hugetlb_cgroup *parent_h_cgroup)
{
        int idx;

        for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
                struct page_counter *fault, *fault_parent = NULL;
                struct page_counter *rsvd, *rsvd_parent = NULL;
                unsigned long limit;
                int ret;

                if (parent_h_cgroup) {
                        fault_parent = hugetlb_cgroup_counter_from_cgroup(
                                parent_h_cgroup, idx);
                        rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd(
                                parent_h_cgroup, idx);
                }
                fault = hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx);
                rsvd = hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx);

                page_counter_init(fault, fault_parent, false);
                page_counter_init(rsvd, rsvd_parent, false);

                if (!cgroup_subsys_on_dfl(hugetlb_cgrp_subsys)) {
                        fault->track_failcnt = true;
                        rsvd->track_failcnt = true;
                }

                limit = round_down(PAGE_COUNTER_MAX,
                                   pages_per_huge_page(&hstates[idx]));

                /*
                 * Keep the calls outside VM_BUG_ON(): its argument is not
                 * evaluated when CONFIG_DEBUG_VM is disabled, which would
                 * silently skip setting the limit on non-debug kernels.
                 */
                ret = page_counter_set_max(fault, limit);
                VM_BUG_ON(ret);
                ret = page_counter_set_max(rsvd, limit);
                VM_BUG_ON(ret);
        }
}

/*
 * Free @h_cgroup together with the per-node info allocated for every
 * possible node. Entries that were never allocated are NULL (the structure
 * is zero-allocated in hugetlb_cgroup_css_alloc()), so this is also used on
 * that function's partial-allocation error path.
 */
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
        int node;

        for_each_node(node)
                kfree(h_cgroup->nodeinfo[node]);
        kfree(h_cgroup);
}

/*
 * Allocate and initialise a hugetlb_cgroup below @parent_css.
 *
 * The structure is sized for nr_node_ids per-node pointers, and a per-node
 * info block is allocated for every possible node. A css allocated without
 * a parent is recorded as root_h_cgroup.
 *
 * Returns the embedded css, or ERR_PTR(-ENOMEM) if any allocation fails.
 */
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
        struct hugetlb_cgroup *h_cgroup;
        int node;

        h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
                           GFP_KERNEL);

        if (!h_cgroup)
                return ERR_PTR(-ENOMEM);

        if (!parent_h_cgroup)
                root_h_cgroup = h_cgroup;

        /*
         * TODO: this routine can waste much memory for nodes which will
         * never be onlined. It's better to use memory hotplug callback
         * function.
         */
        for_each_node(node) {
                /*
                 * Allocate on the node itself only when it has normal
                 * memory; otherwise let the allocator pick (NUMA_NO_NODE).
                 */
                int node_to_alloc =
                        node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE;
                h_cgroup->nodeinfo[node] =
                        kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
                                     GFP_KERNEL, node_to_alloc);
                if (!h_cgroup->nodeinfo[node])
                        goto fail_alloc_nodeinfo;
        }

        hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
        return &h_cgroup->css;

fail_alloc_nodeinfo:
        /* Frees whatever per-node blocks were allocated, then h_cgroup. */
        hugetlb_cgroup_free(h_cgroup);
        return ERR_PTR(-ENOMEM);
}

/* Free the hugetlb_cgroup that embeds @css. */
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
        hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}

/*
 * Move @folio's charge for hstate @idx from @h_cg to its parent cgroup, or
 * to the root cgroup when @h_cg has no parent. Folios not charged to @h_cg
 * are left alone.
 *
 * Should be called with hugetlb_lock held. Since the lock is held, pages
 * cannot get moved from the active list or uncharged from the cgroup, so
 * there is no need to take a page reference or test for page active here.
 * This function cannot fail.
 */
static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg,
                                       struct folio *folio)
{
        struct hugetlb_cgroup *target = parent_hugetlb_cgroup(h_cg);
        struct hugetlb_cgroup *owner = hugetlb_cgroup_from_folio(folio);
        unsigned int nr_pages;

        /*
         * We can have pages in active list without any cgroup
         * ie, hugepage with less than 3 pages. We can safely
         * ignore those pages.
         */
        if (!owner || owner != h_cg)
                return;

        nr_pages = folio_nr_pages(folio);
        if (!target) {
                target = root_h_cgroup;
                /* root has no limit */
                page_counter_charge(&target->hugepage[idx], nr_pages);
        }

        /* Take the pages off the local counter */
        page_counter_cancel(&h_cg->hugepage[idx], nr_pages);

        set_hugetlb_cgroup(folio, target);
}

/*
 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
 * the parent cgroup.
 *
 * For each hstate, every folio on the active list is offered to
 * hugetlb_cgroup_move_parent() under hugetlb_lock. The whole pass is
 * repeated, with a cond_resched() in between, until
 * hugetlb_cgroup_have_usage() reports no remaining usage.
 */
static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
        struct hstate *h;
        struct folio *folio;

        do {
                for_each_hstate(h) {
                        spin_lock_irq(&hugetlb_lock);
                        list_for_each_entry(folio, &h->hugepage_activelist, lru)
                                hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio);

                        spin_unlock_irq(&hugetlb_lock);
                }
                cond_resched();
        } while (hugetlb_cgroup_have_usage(h_cg));
}

/*
 * Record @event for hstate @idx.
 *
 * The local event counter of @hugetlb is bumped once. The hierarchical
 * counter is then bumped for @hugetlb and for each ancestor, stopping
 * before the root cgroup. The matching cgroup file is notified after every
 * increment.
 */
static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx,
                                 enum hugetlb_memory_event event)
{
        atomic_long_inc(&hugetlb->events_local[idx][event]);
        cgroup_file_notify(&hugetlb->events_local_file[idx]);

        do {
                atomic_long_inc(&hugetlb->events[idx][event]);
                cgroup_file_notify(&hugetlb->events_file[idx]);
        } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) &&
                 !hugetlb_cgroup_is_root(hugetlb));
}

/*
 * Try to charge @nr_pages for hstate @idx to the hugetlb cgroup of the
 * current task, against the reservation counter when @rsvd is true and the
 * fault counter otherwise.
 *
 * *@ptr is always written: NULL when the controller is disabled, otherwise
 * the cgroup that was looked up (also on failure).
 *
 * Returns 0 on success (or when the controller is disabled) and -ENOMEM
 * when the charge does not fit; in that case a HUGETLB_MAX event is
 * recorded.
 */
static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                          struct hugetlb_cgroup **ptr,
                                          bool rsvd)
{
        int ret = 0;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = NULL;

        if (hugetlb_cgroup_disabled())
                goto done;
again:
        /* Look the cgroup up again until a css reference can be taken. */
        rcu_read_lock();
        h_cg = hugetlb_cgroup_from_task(current);
        if (!css_tryget(&h_cg->css)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        if (!page_counter_try_charge(
                    __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
                    nr_pages, &counter)) {
                ret = -ENOMEM;
                hugetlb_event(h_cg, idx, HUGETLB_MAX);
                css_put(&h_cg->css);
                goto done;
        }
        /* Reservations take a reference to the css because they do not get
         * reparented.
         */
        if (!rsvd)
                css_put(&h_cg->css);
done:
        *ptr = h_cg;
        return ret;
}

/*
 * Charge @nr_pages for hstate @idx against the fault counter of the current
 * task's cgroup. See __hugetlb_cgroup_charge_cgroup() for *@ptr and the
 * return value.
 */
int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
                                 struct hugetlb_cgroup **ptr)
{
        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false);
}

/*
 * Charge @nr_pages for hstate @idx against the reservation counter of the
 * current task's cgroup. On success the css reference taken during the
 * charge is kept.
 */
int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                      struct hugetlb_cgroup **ptr)
{
        return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true);
}

/*
 * Record @h_cg as the cgroup charged for @folio (reservation charge when
 * @rsvd is true, fault charge otherwise). For fault charges the per-node
 * usage of @h_cg for hstate @idx is also raised by @nr_pages.
 *
 * Does nothing when the controller is disabled or @h_cg is NULL.
 * Should be called with hugetlb_lock held.
 */
static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                           struct hugetlb_cgroup *h_cg,
                                           struct folio *folio, bool rsvd)
{
        unsigned long usage;
        int nid;

        if (hugetlb_cgroup_disabled())
                return;
        if (!h_cg)
                return;

        lockdep_assert_held(&hugetlb_lock);
        __set_hugetlb_cgroup(folio, h_cg, rsvd);

        /* Per-node usage is only tracked for fault charges. */
        if (rsvd)
                return;

        nid = folio_nid(folio);
        usage = h_cg->nodeinfo[nid]->usage[idx];
        /*
         * The read-modify-write below is not atomic, which is fine because
         * callers hold hugetlb_lock.
         */
        WRITE_ONCE(h_cg->nodeinfo[nid]->usage[idx], usage + nr_pages);
}

/* Commit a fault charge of @nr_pages for @folio to @h_cg. */
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
                                  struct hugetlb_cgroup *h_cg,
                                  struct folio *folio)
{
        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false);
}

/* Commit a reservation charge of @nr_pages for @folio to @h_cg. */
void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages,
                                       struct hugetlb_cgroup *h_cg,
                                       struct folio *folio)
{
        __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true);
}

/*
 * Undo the charge recorded on @folio: clear the folio's cgroup pointer
 * (reservation or fault, selected by @rsvd) and uncharge @nr_pages from the
 * matching counter. A reservation uncharge also drops a css reference; a
 * fault uncharge lowers the per-node usage instead.
 *
 * Does nothing when the controller is disabled or the folio has no cgroup
 * recorded. Should be called with hugetlb_lock held.
 */
static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
                                           struct folio *folio, bool rsvd)
{
        struct hugetlb_cgroup *owner;
        unsigned long usage;
        int nid;

        if (hugetlb_cgroup_disabled())
                return;

        lockdep_assert_held(&hugetlb_lock);

        owner = __hugetlb_cgroup_from_folio(folio, rsvd);
        if (unlikely(!owner))
                return;

        __set_hugetlb_cgroup(folio, NULL, rsvd);

        page_counter_uncharge(
                __hugetlb_cgroup_counter_from_cgroup(owner, idx, rsvd),
                nr_pages);

        if (rsvd) {
                css_put(&owner->css);
                return;
        }

        nid = folio_nid(folio);
        usage = owner->nodeinfo[nid]->usage[idx];
        /*
         * The read-modify-write below is not atomic, which is fine because
         * callers hold hugetlb_lock.
         */
        WRITE_ONCE(owner->nodeinfo[nid]->usage[idx], usage - nr_pages);
}

/* Undo the fault charge of @nr_pages recorded on @folio. */
void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages,
                                  struct folio *folio)
{
        __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false);
}

/* Undo the reservation charge of @nr_pages recorded on @folio. */
void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages,
                                       struct folio *folio)
{
        __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true);
}

/*
 * Uncharge @nr_pages for hstate @idx directly from @h_cg, using the
 * reservation counter when @rsvd is true and the fault counter otherwise.
 * A reservation uncharge also drops a css reference.
 *
 * Does nothing when the controller is disabled or @h_cg is NULL.
 */
static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                             struct hugetlb_cgroup *h_cg,
                                             bool rsvd)
{
        if (hugetlb_cgroup_disabled() || !h_cg)
                return;

        page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx,
                                                                   rsvd),
                              nr_pages);

        if (rsvd)
                css_put(&h_cg->css);
}

/* Uncharge @nr_pages for hstate @idx from the fault counter of @h_cg. */
void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
                                    struct hugetlb_cgroup *h_cg)
{
        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false);
}

/*
 * Uncharge @nr_pages for hstate @idx from the reservation counter of @h_cg
 * and drop a css reference.
 */
void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages,
                                         struct hugetlb_cgroup *h_cg)
{
        __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true);
}

/*
 * Uncharge the huge pages in [@start, @end) from the reservation counter
 * stored on @resv and drop the css reference stored there.
 *
 * Does nothing when the controller is disabled, @resv is NULL, or @resv
 * has no reservation counter or css.
 */
void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start,
                                     unsigned long end)
{
        if (hugetlb_cgroup_disabled() || !resv)
                return;
        if (!resv->reservation_counter || !resv->css)
                return;

        /* The range is in huge pages; the counter is in base pages. */
        page_counter_uncharge(resv->reservation_counter,
                              (end - start) * resv->pages_per_hpage);

        css_put(resv->css);
}

/*
 * Uncharge @nr_pages huge pages from the reservation counter stored on the
 * file region @rg.
 *
 * Nothing happens when the controller is disabled, when @resv or @rg is
 * NULL, or when @nr_pages is 0. The uncharge is also skipped unless @rg has
 * a counter, @resv->pages_per_hpage is non-zero and @resv itself has no
 * reservation counter.
 */
void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv,
                                         struct file_region *rg,
                                         unsigned long nr_pages,
                                         bool region_del)
{
        if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages)
                return;

        if (!rg->reservation_counter || !resv->pages_per_hpage ||
            resv->reservation_counter)
                return;

        page_counter_uncharge(rg->reservation_counter,
                              nr_pages * resv->pages_per_hpage);

        /*
         * One file_region must hold exactly one css reference, so it is
         * only dropped when the entire region is deleted.
         */
        if (region_del)
                css_put(rg->css);
}

/*
 * Attribute selectors stored in the low 16 bits of cftype->private.
 * The RSVD variants select the reservation counter instead of the fault
 * counter.
 */
enum {
        RES_USAGE,              /* current counter value */
        RES_RSVD_USAGE,
        RES_LIMIT,              /* counter->max */
        RES_RSVD_LIMIT,
        RES_MAX_USAGE,          /* counter->watermark */
        RES_RSVD_MAX_USAGE,
        RES_FAILCNT,            /* counter->failcnt */
        RES_RSVD_FAILCNT,
};

/*
 * seq_file show handler for the per-node usage statistics of one hstate.
 *
 * On the legacy hierarchy a first line with this cgroup's own total and
 * per-node usage is printed. A second line (the only one on the default
 * hierarchy) reports the hierarchical total and per-node usage, the latter
 * summed over this cgroup and all of its descendants. All values are
 * printed in bytes. Always returns 0.
 */
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
        int nid;
        struct cftype *cft = seq_cft(seq);
        int idx = MEMFILE_IDX(cft->private);
        bool legacy = !cgroup_subsys_on_dfl(hugetlb_cgrp_subsys);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
        struct cgroup_subsys_state *css;
        unsigned long usage;

        if (legacy) {
                /* Add up usage across all nodes for the non-hierarchical total. */
                usage = 0;
                for_each_node_state(nid, N_MEMORY)
                        usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
                seq_printf(seq, "total=%lu", usage * PAGE_SIZE);

                /* Print this cgroup's own (non-hierarchical) usage per node. */
                for_each_node_state(nid, N_MEMORY)
                        seq_printf(seq, " N%d=%lu", nid,
                                   READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
                                           PAGE_SIZE);
                seq_putc(seq, '\n');
        }

        /*
         * The hierarchical total is pretty much the value recorded by the
         * counter, so use that.
         */
        seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
                   page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);

        /*
         * For each node, traverse the css tree to obtain the hierarchical
         * node usage.
         */
        for_each_node_state(nid, N_MEMORY) {
                usage = 0;
                rcu_read_lock();
                css_for_each_descendant_pre(css, &h_cg->css) {
                        usage += READ_ONCE(hugetlb_cgroup_from_css(css)
                                                   ->nodeinfo[nid]
                                                   ->usage[idx]);
                }
                rcu_read_unlock();
                seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
        }

        seq_putc(seq, '\n');

        return 0;
}

static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
                                   struct cftype *cft)
{
        struct page_counter *counter;
        struct page_counter *rsvd_counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);

        counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)];
        rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)];

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_RSVD_USAGE:
                return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->max * PAGE_SIZE;
        case RES_RSVD_LIMIT:
                return (u64)rsvd_counter->max * PAGE_SIZE;
        case RES_MAX_USAGE:
                return (u64)counter->watermark * PAGE_SIZE;
        case RES_RSVD_MAX_USAGE:
                return (u64)rsvd_counter->watermark * PAGE_SIZE;
        case RES_FAILCNT:
                return counter->failcnt;
        case RES_RSVD_FAILCNT:
                return rsvd_counter->failcnt;
        default:
                BUG();
        }
}

/*
 * seq_file show handler for usage and limit files that may print "max".
 *
 * Usage attributes print the counter value in bytes. Limit attributes print
 * "max" when the limit equals PAGE_COUNTER_MAX rounded down to a multiple
 * of the huge page size, and the limit in bytes otherwise. Any other
 * attribute triggers BUG(). Returns 0.
 */
static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v)
{
        int idx;
        u64 val;
        struct cftype *cft = seq_cft(seq);
        unsigned long limit;
        struct page_counter *counter;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));

        idx = MEMFILE_IDX(cft->private);
        /* Default to the fault counter; the RSVD cases below override it. */
        counter = &h_cg->hugepage[idx];

        limit = round_down(PAGE_COUNTER_MAX,
                           pages_per_huge_page(&hstates[idx]));

        switch (MEMFILE_ATTR(cft->private)) {
        case RES_RSVD_USAGE:
                counter = &h_cg->rsvd_hugepage[idx];
                fallthrough;
        case RES_USAGE:
                val = (u64)page_counter_read(counter);
                seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        case RES_RSVD_LIMIT:
                counter = &h_cg->rsvd_hugepage[idx];
                fallthrough;
        case RES_LIMIT:
                val = (u64)counter->max;
                if (val == limit)
                        seq_puts(seq, "max\n");
                else
                        seq_printf(seq, "%llu\n", val * PAGE_SIZE);
                break;
        default:
                BUG();
        }

        return 0;
}

/* Held around page_counter_set_max() in hugetlb_cgroup_write(). */
static DEFINE_MUTEX(hugetlb_limit_mutex);

/*
 * Common write handler for the limit files.
 *
 * @buf is parsed with page_counter_memparse(), where @max is the string
 * that stands for "no limit". The parsed page count is rounded down to a
 * multiple of the huge page size and applied to the fault or reservation
 * counter under hugetlb_limit_mutex.
 *
 * Returns @nbytes on success. Returns -EINVAL for the root cgroup or an
 * attribute that is not a limit, and otherwise the error from parsing or
 * from page_counter_set_max().
 */
static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off,
                                    const char *max)
{
        int ret, idx;
        unsigned long nr_pages;
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
        bool rsvd = false;

        if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */
                return -EINVAL;

        buf = strstrip(buf);
        ret = page_counter_memparse(buf, max, &nr_pages);
        if (ret)
                return ret;

        idx = MEMFILE_IDX(of_cft(of)->private);
        nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx]));

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_RSVD_LIMIT:
                rsvd = true;
                fallthrough;
        case RES_LIMIT:
                mutex_lock(&hugetlb_limit_mutex);
                ret = page_counter_set_max(
                        __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd),
                        nr_pages);
                mutex_unlock(&hugetlb_limit_mutex);
                break;
        default:
                ret = -EINVAL;
                break;
        }
        return ret ?: nbytes;
}

/* Limit write handler that passes "-1" as the "no limit" string. */
static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "-1");
}

/* Limit write handler that passes "max" as the "no limit" string. */
static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of,
                                        char *buf, size_t nbytes, loff_t off)
{
        return hugetlb_cgroup_write(of, buf, nbytes, off, "max");
}

/*
 * Write handler that resets a statistic: the watermark or the fail count of
 * the fault or reservation counter, as selected by the attribute encoded in
 * the file's private value. The written data itself is ignored.
 *
 * Returns @nbytes on success and -EINVAL for any other attribute.
 */
static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of,
                                    char *buf, size_t nbytes, loff_t off)
{
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of));
        int idx = MEMFILE_IDX(of_cft(of)->private);
        struct page_counter *fault_cnt = &h_cg->hugepage[idx];
        struct page_counter *rsvd_cnt = &h_cg->rsvd_hugepage[idx];

        switch (MEMFILE_ATTR(of_cft(of)->private)) {
        case RES_MAX_USAGE:
                page_counter_reset_watermark(fault_cnt);
                return nbytes;
        case RES_RSVD_MAX_USAGE:
                page_counter_reset_watermark(rsvd_cnt);
                return nbytes;
        case RES_FAILCNT:
                fault_cnt->failcnt = 0;
                return nbytes;
        case RES_RSVD_FAILCNT:
                rsvd_cnt->failcnt = 0;
                return nbytes;
        default:
                return -EINVAL;
        }
}

/*
 * Format a size in bytes as a short human-readable string.
 * @buf:   destination buffer
 * @size:  capacity of @buf in bytes
 * @hsize: size in bytes to format
 *
 * The value is expressed in the largest of KB/MB/GB that does not exceed
 * it, using integer division (any remainder is dropped).
 *
 * Returns @buf.
 */
static char *mem_fmt(char *buf, int size, unsigned long hsize)
{
        if (hsize < SZ_1M) {
                snprintf(buf, size, "%luKB", hsize / SZ_1K);
                return buf;
        }

        if (hsize < SZ_1G) {
                snprintf(buf, size, "%luMB", hsize / SZ_1M);
                return buf;
        }

        snprintf(buf, size, "%luGB", hsize / SZ_1G);
        return buf;
}

/*
 * Print the "max" event counter of a hugetlb cgroup for one hstate.
 * @seq:   seq_file of the cgroup file being read
 * @local: true to report the events_local[] counter, false to report the
 *         events[] counter
 *
 * The hstate index is taken from the ->private of the cftype behind @seq.
 *
 * Always returns 0.
 */
static int __hugetlb_events_show(struct seq_file *seq, bool local)
{
        struct cftype *cft = seq_cft(seq);
        struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
        int idx = MEMFILE_IDX(cft->private);
        /* unsigned so the argument type matches the "%lu" conversion below */
        unsigned long max;

        if (local)
                max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]);
        else
                max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]);

        seq_printf(seq, "max %lu\n", max);

        return 0;
}

/*
 * seq_show callback of the "events" file: reports the non-local counter
 * via __hugetlb_events_show().  @v is unused.
 */
static int hugetlb_events_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, false);
}

/*
 * seq_show callback of the "events.local" file: reports the local counter
 * via __hugetlb_events_show().  @v is unused.
 */
static int hugetlb_events_local_show(struct seq_file *seq, void *v)
{
        return __hugetlb_events_show(seq, true);
}

/*
 * Template entries for the default-hierarchy control files.
 *
 * These entries are never registered directly.  For every hstate,
 * hugetlb_cgroup_cfttypes_init() copies each one into dfl_files, prefixes
 * .name with the formatted huge page size, and folds the hstate index into
 * .private and (when set) .file_offset.  Every entry is flagged
 * CFTYPE_NOT_ON_ROOT.
 */
static struct cftype hugetlb_dfl_tmpl[] = {
        {
                .name = "max",
                .private = RES_LIMIT,
                .seq_show = hugetlb_cgroup_read_u64_max,
                .write = hugetlb_cgroup_write_dfl,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "rsvd.max",
                .private = RES_RSVD_LIMIT,
                .seq_show = hugetlb_cgroup_read_u64_max,
                .write = hugetlb_cgroup_write_dfl,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /* read-only: no .write handler */
                .name = "current",
                .private = RES_USAGE,
                .seq_show = hugetlb_cgroup_read_u64_max,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /* read-only: no .write handler */
                .name = "rsvd.current",
                .private = RES_RSVD_USAGE,
                .seq_show = hugetlb_cgroup_read_u64_max,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /* .file_offset is rebased per hstate when the entry is copied */
                .name = "events",
                .seq_show = hugetlb_events_show,
                .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_file[0]),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                /* .file_offset is rebased per hstate when the entry is copied */
                .name = "events.local",
                .seq_show = hugetlb_events_local_show,
                .file_offset = MEMFILE_OFFSET(struct hugetlb_cgroup, events_local_file[0]),
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        {
                .name = "numa_stat",
                .seq_show = hugetlb_cgroup_read_numa_stat,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
        /* don't need terminator here */
};

/*
 * Template entries for the legacy-hierarchy control files.
 *
 * These entries are never registered directly.  For every hstate,
 * hugetlb_cgroup_cfttypes_init() copies each one into legacy_files,
 * prefixes .name with the formatted huge page size, and folds the hstate
 * index into .private.
 */
static struct cftype hugetlb_legacy_tmpl[] = {
        {
                .name = "limit_in_bytes",
                .private = RES_LIMIT,
                .read_u64 = hugetlb_cgroup_read_u64,
                .write = hugetlb_cgroup_write_legacy,
        },
        {
                .name = "rsvd.limit_in_bytes",
                .private = RES_RSVD_LIMIT,
                .read_u64 = hugetlb_cgroup_read_u64,
                .write = hugetlb_cgroup_write_legacy,
        },
        {
                /* read-only: no .write handler */
                .name = "usage_in_bytes",
                .private = RES_USAGE,
                .read_u64 = hugetlb_cgroup_read_u64,
        },
        {
                /* read-only: no .write handler */
                .name = "rsvd.usage_in_bytes",
                .private = RES_RSVD_USAGE,
                .read_u64 = hugetlb_cgroup_read_u64,
        },
        {
                /* writing resets the watermark, see hugetlb_cgroup_reset() */
                .name = "max_usage_in_bytes",
                .private = RES_MAX_USAGE,
                .write = hugetlb_cgroup_reset,
                .read_u64 = hugetlb_cgroup_read_u64,
        },
        {
                /* writing resets the watermark, see hugetlb_cgroup_reset() */
                .name = "rsvd.max_usage_in_bytes",
                .private = RES_RSVD_MAX_USAGE,
                .write = hugetlb_cgroup_reset,
                .read_u64 = hugetlb_cgroup_read_u64,
        },
        {
                /* writing zeroes the failure count, see hugetlb_cgroup_reset() */
                .name = "failcnt",
                .private = RES_FAILCNT,
                .write = hugetlb_cgroup_reset,
                .read_u64 = hugetlb_cgroup_read_u64,
        },
        {
                /* writing zeroes the failure count, see hugetlb_cgroup_reset() */
                .name = "rsvd.failcnt",
                .private = RES_RSVD_FAILCNT,
                .write = hugetlb_cgroup_reset,
                .read_u64 = hugetlb_cgroup_read_u64,
        },
        {
                .name = "numa_stat",
                .seq_show = hugetlb_cgroup_read_numa_stat,
        },
        /* don't need terminator here */
};

/*
 * Instantiate a cftype template array for one hstate.
 * @h:         hstate the files are created for
 * @cft:       destination, room for at least @tmpl_size entries
 * @tmpl:      template entries to copy
 * @tmpl_size: number of entries in @tmpl
 *
 * Each destination entry starts out as a copy of its template and is then
 * specialised: the name becomes "<huge page size>.<template name>", the
 * hstate index is packed into ->private, a non-zero ->file_offset is
 * advanced to the slot belonging to this hstate, and the entry's lockdep
 * key is registered.
 */
static void __init
hugetlb_cgroup_cfttypes_init(struct hstate *h, struct cftype *cft,
                             struct cftype *tmpl, int tmpl_size)
{
        int idx = hstate_index(h);
        char size_str[32];
        int n;

        /* e.g. "2MB" or "1GB"; shared by every file of this hstate */
        mem_fmt(size_str, sizeof(size_str), huge_page_size(h));

        for (n = 0; n < tmpl_size; n++) {
                struct cftype *dst = &cft[n];
                struct cftype *src = &tmpl[n];

                *dst = *src;

                snprintf(dst->name, MAX_CFTYPE_NAME, "%s.%s", size_str,
                         src->name);
                dst->private = MEMFILE_PRIVATE(idx, src->private);

                /* only templates that carry an offset get one rebuilt */
                if (src->file_offset) {
                        unsigned int offset = src->file_offset;

                        dst->file_offset = MEMFILE_OFFSET0(offset) +
                                           MEMFILE_FIELD_SIZE(offset) * idx;
                }

                lockdep_register_key(&dst->lockdep_key);
        }
}

/*
 * Fill the slice of dfl_files that belongs to hstate @h from
 * hugetlb_dfl_tmpl.  Each hstate owns DFL_TMPL_SIZE consecutive entries.
 */
static void __init __hugetlb_cgroup_file_dfl_init(struct hstate *h)
{
        int idx = hstate_index(h);
        struct cftype *slice = dfl_files + idx * DFL_TMPL_SIZE;

        hugetlb_cgroup_cfttypes_init(h, slice, hugetlb_dfl_tmpl,
                                     DFL_TMPL_SIZE);
}

/*
 * Fill the slice of legacy_files that belongs to hstate @h from
 * hugetlb_legacy_tmpl.  Each hstate owns LEGACY_TMPL_SIZE consecutive
 * entries.
 */
static void __init __hugetlb_cgroup_file_legacy_init(struct hstate *h)
{
        int idx = hstate_index(h);
        struct cftype *slice = legacy_files + idx * LEGACY_TMPL_SIZE;

        hugetlb_cgroup_cfttypes_init(h, slice, hugetlb_legacy_tmpl,
                                     LEGACY_TMPL_SIZE);
}

/*
 * Build both the default-hierarchy and the legacy-hierarchy cftype entries
 * for hstate @h.
 */
static void __init __hugetlb_cgroup_file_init(struct hstate *h)
{
        __hugetlb_cgroup_file_dfl_init(h);
        __hugetlb_cgroup_file_legacy_init(h);
}

/*
 * Allocate the zero-initialised dfl_files and legacy_files arrays.
 *
 * Each array holds one template's worth of entries per hstate plus one
 * extra entry that stays zeroed and serves as the terminator.  Allocation
 * failure triggers BUG_ON().
 */
static void __init __hugetlb_cgroup_file_pre_init(void)
{
        int nr_dfl = hugetlb_max_hstate * DFL_TMPL_SIZE + 1; /* add terminator */
        int nr_legacy = hugetlb_max_hstate * LEGACY_TMPL_SIZE + 1; /* add terminator */

        dfl_files = kcalloc(nr_dfl, sizeof(struct cftype), GFP_KERNEL);
        BUG_ON(!dfl_files);

        legacy_files = kcalloc(nr_legacy, sizeof(struct cftype), GFP_KERNEL);
        BUG_ON(!legacy_files);
}

/*
 * Register the populated dfl_files and legacy_files arrays with the hugetlb
 * cgroup subsystem.  A non-zero result from either registration call only
 * triggers a WARN_ON(); it is not propagated.
 */
static void __init __hugetlb_cgroup_file_post_init(void)
{
        WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
                                       dfl_files));
        WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
                                          legacy_files));
}

/*
 * Create the hugetlb cgroup control files for every hstate: allocate the
 * cftype arrays, fill in the per-hstate entries, then register the arrays.
 */
void __init hugetlb_cgroup_file_init(void)
{
        struct hstate *h;

        __hugetlb_cgroup_file_pre_init();
        for_each_hstate(h)
                __hugetlb_cgroup_file_init(h);
        __hugetlb_cgroup_file_post_init();
}

/*
 * Transfer the hugetlb cgroup association (both the regular and the rsvd
 * one) from @old_folio to @new_folio.
 *
 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
 * when we migrate hugepages
 */
void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio)
{
        struct hugetlb_cgroup *h_cg;
        struct hugetlb_cgroup *h_cg_rsvd;
        struct hstate *h = folio_hstate(old_folio);

        /* nothing is tracked when the hugetlb cgroup is disabled */
        if (hugetlb_cgroup_disabled())
                return;

        spin_lock_irq(&hugetlb_lock);
        /* detach both cgroup pointers from the old folio ... */
        h_cg = hugetlb_cgroup_from_folio(old_folio);
        h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio);
        set_hugetlb_cgroup(old_folio, NULL);
        set_hugetlb_cgroup_rsvd(old_folio, NULL);

        /* move the h_cg details to new cgroup */
        set_hugetlb_cgroup(new_folio, h_cg);
        set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd);
        /* ... and put the new folio on the hstate's active list */
        list_move(&new_folio->lru, &h->hugepage_activelist);
        spin_unlock_irq(&hugetlb_lock);
}

/*
 * Terminator-only cftype array used for both .dfl_cftypes and
 * .legacy_cftypes of hugetlb_cgrp_subsys.  The real per-hstate files are
 * registered separately by __hugetlb_cgroup_file_post_init().
 */
static struct cftype hugetlb_files[] = {
        {} /* terminate */
};

/*
 * The hugetlb cgroup subsystem descriptor: css lifetime callbacks plus the
 * (empty) static cftype arrays.
 */
struct cgroup_subsys hugetlb_cgrp_subsys = {
        .css_alloc        = hugetlb_cgroup_css_alloc,
        .css_offline        = hugetlb_cgroup_css_offline,
        .css_free        = hugetlb_cgroup_css_free,
        .dfl_cftypes        = hugetlb_files,
        .legacy_cftypes        = hugetlb_files,
};

























































































































































































































































































































  476 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/mmu_context.h
 *
 * Copyright (C) 1996 Russell King.
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_MMU_CONTEXT_H
#define __ASM_MMU_CONTEXT_H

#ifndef __ASSEMBLY__

#include <linux/compiler.h>
#include <linux/sched.h>
#include <linux/sched/hotplug.h>
#include <linux/mm_types.h>
#include <linux/pgtable.h>
#include <linux/pkeys.h>

#include <asm/cacheflush.h>
#include <asm/cpufeature.h>
#include <asm/daifflags.h>
#include <asm/gcs.h>
#include <asm/proc-fns.h>
#include <asm/cputype.h>
#include <asm/sysreg.h>
#include <asm/tlbflush.h>

extern bool rodata_full;

/*
 * Write the PID of @next into CONTEXTIDR_EL1, followed by an ISB.
 * Does nothing unless CONFIG_PID_IN_CONTEXTIDR is enabled.
 */
static inline void contextidr_thread_switch(struct task_struct *next)
{
        if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) {
                write_sysreg(task_pid_nr(next), contextidr_el1);
                isb();
        }
}

/*
 * Set TTBR0 to reserved_pg_dir. No translations will be possible via TTBR0.
 *
 * No ISB is issued here; cpu_set_reserved_ttbr0() is the variant that
 * follows the write with one.
 */
static inline void cpu_set_reserved_ttbr0_nosync(void)
{
        /* TTBR value built from the physical address of reserved_pg_dir */
        unsigned long ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));

        write_sysreg(ttbr, ttbr0_el1);
}

/*
 * Point TTBR0 at reserved_pg_dir and follow the register write with an ISB.
 */
static inline void cpu_set_reserved_ttbr0(void)
{
        cpu_set_reserved_ttbr0_nosync();
        isb();
}

void cpu_do_switch_mm(phys_addr_t pgd_phys, struct mm_struct *mm);

/*
 * Hand the physical address of @pgd, together with @mm, to
 * cpu_do_switch_mm().  Passing swapper_pg_dir is a bug and trips BUG_ON().
 */
static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm)
{
        BUG_ON(pgd == swapper_pg_dir);
        cpu_do_switch_mm(virt_to_phys(pgd),mm);
}

/*
 * TCR.T0SZ value to use when the ID map is active.
 */
#define idmap_t0sz        TCR_T0SZ(IDMAP_VA_BITS)

/*
 * Make the T0SZ field of TCR_EL1 equal to @t0sz.
 *
 * @t0sz must already be positioned within TCR_T0SZ_MASK.  The register is
 * written (and an ISB issued) only when the field actually differs.
 */
static inline void __cpu_set_tcr_t0sz(unsigned long t0sz)
{
        unsigned long cur = read_sysreg(tcr_el1);
        unsigned long updated = (cur & ~TCR_T0SZ_MASK) | t0sz;

        /* field already holds the requested value: skip the write */
        if ((cur & TCR_T0SZ_MASK) == t0sz)
                return;

        write_sysreg(updated, tcr_el1);
        isb();
}

/*
 * Convenience wrappers around __cpu_set_tcr_t0sz(): one selects the T0SZ
 * derived from vabits_actual, the other the T0SZ used while the ID map is
 * active (idmap_t0sz).
 */
#define cpu_set_default_tcr_t0sz()        __cpu_set_tcr_t0sz(TCR_T0SZ(vabits_actual))
#define cpu_set_idmap_tcr_t0sz()        __cpu_set_tcr_t0sz(idmap_t0sz)

/*
 * Remove the idmap from TTBR0_EL1 and install the pgd of the active mm.
 *
 * The idmap lives in the same VA range as userspace, but uses global entries
 * and may use a different TCR_EL1.T0SZ. To avoid issues resulting from
 * speculative TLB fetches, we must temporarily install the reserved page
 * tables while we invalidate the TLBs and set up the correct TCR_EL1.T0SZ.
 *
 * If current is not a user task, the mm covers the TTBR1_EL1 page tables,
 * which should not be installed in TTBR0_EL1. In this case we can leave the
 * reserved page tables in place.
 */
static inline void cpu_uninstall_idmap(void)
{
        struct mm_struct *active = current->active_mm;

        /* park TTBR0 on the reserved tables while TLB and T0SZ are fixed up */
        cpu_set_reserved_ttbr0();
        local_flush_tlb_all();
        cpu_set_default_tcr_t0sz();

        /* init_mm or SW TTBR0 PAN in use: leave the reserved tables in place */
        if (active == &init_mm || system_uses_ttbr0_pan())
                return;

        cpu_switch_mm(active->pgd, active);
}

/*
 * Install the idmap page tables via cpu_switch_mm(), using init_mm as the mm.
 *
 * TTBR0 is first pointed at the reserved tables and the local TLB is
 * flushed, then T0SZ is set to the idmap value, and only then is
 * idmap_pg_dir switched in.
 */
static inline void cpu_install_idmap(void)
{
        cpu_set_reserved_ttbr0();
        local_flush_tlb_all();
        cpu_set_idmap_tcr_t0sz();

        cpu_switch_mm(lm_alias(idmap_pg_dir), &init_mm);
}

/*
 * Load our new page tables. A strict BBM approach requires that we ensure that
 * TLBs are free of any entries that may overlap with the global mappings we are
 * about to install.
 *
 * For a real hibernate/resume/kexec cycle TTBR0 currently points to a zero
 * page, but TLBs may contain stale ASID-tagged entries (e.g. for EFI runtime
 * services), while for a userspace-driven test_resume cycle it points to
 * userspace page tables (and we must point it at a zero page ourselves).
 *
 * We change T0SZ as part of installing the idmap. This is undone by
 * cpu_uninstall_idmap() in __cpu_suspend_exit().
 */
static inline void cpu_install_ttbr0(phys_addr_t ttbr0, unsigned long t0sz)
{
        /* reserved tables + local TLB flush before anything new goes in */
        cpu_set_reserved_ttbr0();
        local_flush_tlb_all();
        /* caller-chosen T0SZ, applied before the new TTBR0 is written */
        __cpu_set_tcr_t0sz(t0sz);

        /* avoid cpu_switch_mm() and its SW-PAN and CNP interactions */
        write_sysreg(ttbr0, ttbr0_el1);
        isb();
}

void __cpu_replace_ttbr1(pgd_t *pgdp, bool cnp);

/*
 * Call __cpu_replace_ttbr1() on lm_alias(swapper_pg_dir) with its cnp
 * argument set to true.
 */
static inline void cpu_enable_swapper_cnp(void)
{
        __cpu_replace_ttbr1(lm_alias(swapper_pg_dir), true);
}

/*
 * Replace TTBR1 with @pgdp through __cpu_replace_ttbr1(), with its cnp
 * argument set to false.  Warns if called once system capabilities have
 * been finalized.
 */
static inline void cpu_replace_ttbr1(pgd_t *pgdp)
{
        /*
         * Only for early TTBR1 replacement before cpucaps are finalized and
         * before we've decided whether to use CNP.
         */
        WARN_ON(system_capabilities_finalized());
        __cpu_replace_ttbr1(pgdp, false);
}

/*
 * It would be nice to return ASIDs back to the allocator, but unfortunately
 * that introduces a race with a generation rollover where we could erroneously
 * free an ASID allocated in a future generation. We could work around this by
 * freeing the ASID from the context of the dying mm (e.g. in arch_exit_mmap),
 * but we'd then need to make sure that we didn't dirty any TLBs afterwards.
 * Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you
 * take CPU migration into account.
 */
void check_and_switch_context(struct mm_struct *mm);

/*
 * Initialise the arch-specific context of a new mm: context id set to 0,
 * pinned refcount set to 0, and only pkey 0 marked in the pkey allocation
 * map.  @tsk is unused.  Always returns 0.
 */
#define init_new_context(tsk, mm) init_new_context(tsk, mm)
static inline int
init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
        atomic64_set(&mm->context.id, 0);
        refcount_set(&mm->context.pinned, 0);

        /* pkey 0 is the default, so always reserve it. */
        mm->context.pkey_allocation_map = BIT(0);

        return 0;
}

/*
 * Copy the pkey allocation map from @oldmm's context into @mm's context.
 */
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
                                  struct mm_struct *mm)
{
        /* Duplicate the oldmm pkey state in mm: */
        mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
}

/*
 * Arch hook for duplicating an mm: the only arch state carried over from
 * @oldmm to @mm here is the pkey allocation map.  Always returns 0.
 */
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
        arch_dup_pkeys(oldmm, mm);

        return 0;
}

/* Intentionally empty: nothing is done for @mm in this hook. */
static inline void arch_exit_mmap(struct mm_struct *mm)
{
}

/* Intentionally empty: nothing is done for the range [@start, @end) of @mm. */
static inline void arch_unmap(struct mm_struct *mm,
                        unsigned long start, unsigned long end)
{
}

#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
 * Refresh the TTBR0 value saved in the thread_info of @tsk.
 *
 * For init_mm the saved value refers to reserved_pg_dir; for any other mm
 * it is built from the mm's pgd with the mm's ASID shifted into bits 48
 * and up.  Nothing is stored when the system does not use TTBR0 PAN.
 */
static inline void update_saved_ttbr0(struct task_struct *tsk,
                                      struct mm_struct *mm)
{
        u64 ttbr;

        if (system_uses_ttbr0_pan()) {
                if (mm != &init_mm)
                        ttbr = phys_to_ttbr(virt_to_phys(mm->pgd)) | ASID(mm) << 48;
                else
                        ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));

                WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr);
        }
}
#else
/* Without CONFIG_ARM64_SW_TTBR0_PAN there is no saved TTBR0 to update. */
static inline void update_saved_ttbr0(struct task_struct *tsk,
                                      struct mm_struct *mm)
{
}
#endif

/*
 * Lazy-TLB entry hook: updates the saved TTBR0 of @tsk as if its mm were
 * init_mm.  The @mm argument itself is not used.
 */
#define enter_lazy_tlb enter_lazy_tlb
static inline void
enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
        /*
         * We don't actually care about the ttbr0 mapping, so point it at the
         * zero page.
         */
        update_saved_ttbr0(tsk, &init_mm);
}

/*
 * Switch the CPU over to @next.
 *
 * Any mm other than init_mm goes through check_and_switch_context().
 * init_mm has no user mappings and is always active for kernel addresses
 * via TTBR1, so for it installing the reserved TTBR0 is all that is done.
 */
static inline void __switch_mm(struct mm_struct *next)
{
        if (next != &init_mm) {
                check_and_switch_context(next);
                return;
        }

        cpu_set_reserved_ttbr0();
}

static inline void
switch_mm(struct mm_struct *prev, struct mm_struct *next,
          struct task_struct *tsk)
{
        if (prev != next)
                __switch_mm(next);

        /*
         * Update the saved TTBR0_EL1 of the scheduled-in task as the previous
         * value may have not been initialised yet (activate_mm caller) or the
         * ASID has changed since the last run (following the context switch
         * of another thread of the same process).
         */
        update_saved_ttbr0(tsk, next);
}

/*
 * Pick the CPU mask that applies to task @p.
 *
 * Returns system_32bit_el0_cpumask() when the arm64_mismatched_32bit_el0
 * static key is enabled and @p is a compat thread; otherwise returns the
 * caller-supplied @mask unchanged.
 */
static inline const struct cpumask *
__task_cpu_possible_mask(struct task_struct *p, const struct cpumask *mask)
{
        if (static_branch_unlikely(&arm64_mismatched_32bit_el0) &&
            is_compat_thread(task_thread_info(p)))
                return system_32bit_el0_cpumask();

        return mask;
}

/*
 * Same as __task_cpu_possible_mask() with cpu_possible_mask as the default
 * mask.
 */
static inline const struct cpumask *
task_cpu_possible_mask(struct task_struct *p)
{
        return __task_cpu_possible_mask(p, cpu_possible_mask);
}
#define task_cpu_possible_mask        task_cpu_possible_mask

const struct cpumask *task_cpu_fallback_mask(struct task_struct *p);

void verify_cpu_asid_bits(void);
void post_ttbr_update_workaround(void);

unsigned long arm64_mm_context_get(struct mm_struct *mm);
void arm64_mm_context_put(struct mm_struct *mm);

/*
 * Return a mask with every bit set except the top eight bits of an
 * unsigned long.  The value does not depend on @mm.
 */
#define mm_untag_mask mm_untag_mask
static inline unsigned long mm_untag_mask(struct mm_struct *mm)
{
        const unsigned long all_ones = ~0UL;

        return all_ones >> 8;
}

/*
 * Decide whether a protection-key check allows an access to @vma.
 *
 * Protection keys are only enforced against the current process, because
 * there is no user context from which POR_EL0 could be consulted for
 * another address space.  The access is therefore always permitted when
 * POE is not supported, when the caller marks the access as @foreign, or
 * when the VMA does not belong to this process; in every other case the
 * answer comes from POR_EL0 for the VMA's pkey.
 */
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
                bool write, bool execute, bool foreign)
{
        if (!system_supports_poe() || foreign || vma_is_foreign(vma))
                return true;

        return por_el0_allows_pkey(vma_pkey(vma), write, execute);
}

/*
 * mm deactivation hook: the only arch work is gcs_free() on @tsk.  @mm is
 * not used.
 */
#define deactivate_mm deactivate_mm
static inline void deactivate_mm(struct task_struct *tsk,
                        struct mm_struct *mm)
{
        gcs_free(tsk);
}


#include <asm-generic/mmu_context.h>

#endif /* !__ASSEMBLY__ */

#endif /* !__ASM_MMU_CONTEXT_H */







































































































































    1 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/cacheflush.h
 *
 * Copyright (C) 1999-2002 Russell King.
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_CACHEFLUSH_H
#define __ASM_CACHEFLUSH_H

#include <linux/kgdb.h>
#include <linux/mm.h>

/*
 * This flag is used to indicate that the page pointed to by a pte is clean
 * and does not require cleaning before returning it to the user.
 */
#define PG_dcache_clean PG_arch_1

/*
 *        MM Cache Management
 *        ===================
 *
 *        The arch/arm64/mm/cache.S implements these methods.
 *
 *        Start addresses are inclusive and end addresses are exclusive; start
 *        addresses should be rounded down, end addresses up.
 *
 *        See Documentation/core-api/cachetlb.rst for more information. Please note that
 *        the implementation assumes non-aliasing VIPT D-cache and (aliasing)
 *        VIPT I-cache.
 *
 *        All functions below apply to the interval [start, end)
 *                - start  - virtual start address (inclusive)
 *                - end    - virtual end address (exclusive)
 *
 *        caches_clean_inval_pou(start, end)
 *
 *                Ensure coherency between the I-cache and the D-cache region to
 *                the Point of Unification.
 *
 *        caches_clean_inval_user_pou(start, end)
 *
 *                Ensure coherency between the I-cache and the D-cache region to
 *                the Point of Unification.
 *                Use only if the region might access user memory.
 *
 *        icache_inval_pou(start, end)
 *
 *                Invalidate I-cache region to the Point of Unification.
 *
 *        dcache_clean_inval_poc(start, end)
 *
 *                Clean and invalidate D-cache region to the Point of Coherency.
 *
 *        dcache_inval_poc(start, end)
 *
 *                Invalidate D-cache region to the Point of Coherency.
 *
 *        dcache_clean_poc(start, end)
 *
 *                Clean D-cache region to the Point of Coherency.
 *
 *        dcache_clean_pop(start, end)
 *
 *                Clean D-cache region to the Point of Persistence.
 *
 *        dcache_clean_pou(start, end)
 *
 *                Clean D-cache region to the Point of Unification.
 */
extern void caches_clean_inval_pou(unsigned long start, unsigned long end);
extern void icache_inval_pou(unsigned long start, unsigned long end);
extern void dcache_clean_inval_poc(unsigned long start, unsigned long end);
extern void dcache_inval_poc(unsigned long start, unsigned long end);
extern void dcache_clean_poc(unsigned long start, unsigned long end);
extern void dcache_clean_pop(unsigned long start, unsigned long end);
extern void dcache_clean_pou(unsigned long start, unsigned long end);
extern long caches_clean_inval_user_pou(unsigned long start, unsigned long end);
extern void sync_icache_aliases(unsigned long start, unsigned long end);

/*
 * Make the range [@start, @end) coherent between D-cache and I-cache, then
 * force every CPU to notice.
 */
static inline void flush_icache_range(unsigned long start, unsigned long end)
{
        caches_clean_inval_pou(start, end);

        /*
         * Normally all online CPUs are IPI'd so that each goes through a
         * context synchronization event and refetches the new instructions.
         *
         * KGDB is the exception: it does its cache maintenance with
         * interrupts disabled, so IPI-ing the secondaries from here would
         * deadlock. Setting CACHE_FLUSH_IS_SAFE to 0 would in theory avoid
         * that, but then KGDB would skip the maintenance entirely. KGDB
         * already rounds up the secondary CPUs with IPIs while patching, so
         * no additional IPIs are needed in that case; the KGDB-specific
         * bodge is simply to skip the kick.
         */
        if (!in_dbg_master())
                kick_all_cpus_sync();
}
#define flush_icache_range flush_icache_range

/*
 * Copy user data from/to a page which is mapped into a different
 * process's address space.  Really, we want to allow our "user
 * space" model to handle this.
 */
extern void copy_to_user_page(struct vm_area_struct *, struct page *,
        unsigned long, void *, const void *, unsigned long);
#define copy_to_user_page copy_to_user_page

/*
 * flush_dcache_folio is used when the kernel has written to the page
 * cache page at virtual address page->virtual.
 *
 * If this page isn't mapped (ie, folio_mapping == NULL), or it might
 * have userspace mappings, then we _must_ always clean + invalidate
 * the dcache entries associated with the kernel mapping.
 *
 * Otherwise we can defer the operation, and clean the cache when we are
 * about to change to user space.  This is the same method as used on SPARC64.
 * See update_mmu_cache for the user space part.
 */
#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
extern void flush_dcache_page(struct page *);
void flush_dcache_folio(struct folio *);
#define flush_dcache_folio flush_dcache_folio

/*
 * Issue an "ic ialluis" instruction followed by dsb(ish).  The whole
 * operation is skipped on systems with the ARM64_HAS_CACHE_DIC capability.
 */
static __always_inline void icache_inval_all_pou(void)
{
        /* capability present: no explicit I-cache invalidation is issued */
        if (alternative_has_cap_unlikely(ARM64_HAS_CACHE_DIC))
                return;

        asm("ic        ialluis");
        dsb(ish);
}

#include <asm-generic/cacheflush.h>

#endif /* __ASM_CACHEFLUSH_H */


































































































































   31 























































































   31 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_IVERSION_H
#define _LINUX_IVERSION_H

#include <linux/fs.h>

/*
 * The inode->i_version field:
 * ---------------------------
 * The change attribute (i_version) is mandated by NFSv4 and is mostly for
 * knfsd, but is also used for other purposes (e.g. IMA). The i_version must
 * appear larger to observers if there was an explicit change to the inode's
 * data or metadata since it was last queried.
 *
 * An explicit change is one that would ordinarily result in a change to the
 * inode status change time (aka ctime). i_version must appear to change, even
 * if the ctime does not (since the whole point is to avoid missing updates due
 * to timestamp granularity). If POSIX or other relevant spec mandates that the
 * ctime must change due to an operation, then the i_version counter must be
 * incremented as well.
 *
 * Making the i_version update completely atomic with the operation itself would
 * be prohibitively expensive. Traditionally the kernel has updated the times on
 * directories after an operation that changes its contents. For regular files,
 * the ctime is usually updated before the data is copied into the cache for a
 * write. This means that there is a window of time when an observer can
 * associate a new timestamp with old file contents. Since the purpose of the
 * i_version is to allow for better cache coherency, the i_version must always
 * be updated after the results of the operation are visible. Updating it before
 * and after a change is also permitted. (Note that no filesystems currently do
 * this. Fixing that is a work-in-progress).
 *
 * Observers see the i_version as a 64-bit number that never decreases. If it
 * remains the same since it was last checked, then nothing has changed in the
 * inode. If it's different then something has changed. Observers cannot infer
 * anything about the nature or magnitude of the changes from the value, only
 * that the inode has changed in some fashion.
 *
 * Not all filesystems properly implement the i_version counter. Subsystems that
 * want to use i_version field on an inode should first check whether the
 * filesystem sets the SB_I_VERSION flag (usually via the IS_I_VERSION macro).
 *
 * Those that set SB_I_VERSION will automatically have their i_version counter
 * incremented on writes to normal files. If the SB_I_VERSION is not set, then
 * the VFS will not touch it on writes, and the filesystem can use it how it
 * wishes. Note that the filesystem is always responsible for updating the
 * i_version on namespace changes in directories (mkdir, rmdir, unlink, etc.).
 * We consider these sorts of filesystems to have a kernel-managed i_version.
 *
 * It may be impractical for filesystems to keep i_version updates atomic with
 * respect to the changes that cause them.  They should, however, guarantee
 * that i_version updates are never visible before the changes that caused
 * them.  Also, i_version updates should never be delayed longer than it takes
 * the original change to reach disk.
 *
 * This implementation uses the low bit in the i_version field as a flag to
 * track when the value has been queried. If it has not been queried since it
 * was last incremented, we can skip the increment in most cases.
 *
 * In the event that we're updating the ctime, we will usually go ahead and
 * bump the i_version anyway. Since that has to go to stable storage in some
 * fashion, we might as well increment it as well.
 *
 * With this implementation, the value should always appear to observers to
 * increase over time if the file has changed. It's recommended to use
 * inode_eq_iversion() helper to compare values.
 *
 * Note that some filesystems (e.g. NFS and AFS) just use the field to store
 * a server-provided value (for the most part). For that reason, those
 * filesystems do not set SB_I_VERSION. These filesystems are considered to
 * have a self-managed i_version.
 *
 * Persistently storing the i_version
 * ----------------------------------
 * Queries of the i_version field are not gated on them hitting the backing
 * store. It's always possible that the host could crash after allowing
 * a query of the value but before it has made it to disk.
 *
 * To mitigate this problem, filesystems should always use
 * inode_set_iversion_queried when loading an existing inode from disk. This
 * ensures that the next attempted inode increment will result in the value
 * changing.
 *
 * Storing the value to disk therefore does not count as a query, so those
 * filesystems should use inode_peek_iversion to grab the value to be stored.
 * There is no need to flag the value as having been queried in that case.
 */

/*
 * We borrow the lowest bit in the i_version to use as a flag to tell whether
 * it has been queried since we last incremented it. If it has, then we must
 * increment it on the next change. After that, we can clear the flag and
 * avoid incrementing it again until it has again been queried.
 *
 * Layout of the raw 64-bit value: bit 0 is the QUERIED flag, the remaining
 * bits hold the counter (stored shifted left by I_VERSION_QUERIED_SHIFT).
 */
/* Number of low bits set aside for the flag; the counter starts above them. */
#define I_VERSION_QUERIED_SHIFT        (1)
/* Mask selecting the QUERIED flag (bit 0 of the raw value). */
#define I_VERSION_QUERIED        (1ULL << (I_VERSION_QUERIED_SHIFT - 1))
/* Raw-value step that equals +1 on the counter and leaves the flag bit alone. */
#define I_VERSION_INCREMENT        (1ULL << I_VERSION_QUERIED_SHIFT)

/**
 * inode_set_iversion_raw - set i_version to the specified raw value
 * @inode: inode to set
 * @val: new i_version value to set
 *
 * Store @val into @inode's i_version field verbatim: no shifting and no
 * QUERIED-flag handling is applied. Meant for filesystems that self-manage
 * the i_version.
 *
 * The NFS client keeps its NFSv4 change attribute here this way, and the
 * AFS client does the same with the data_version it gets from the server.
 */
static inline void inode_set_iversion_raw(struct inode *inode, u64 val)
{
        /* Plain atomic store of the caller's value. */
        atomic64_set(&inode->i_version, val);
}

/**
 * inode_peek_iversion_raw - grab a "raw" iversion value
 * @inode: inode from which i_version should be read
 *
 * Read inode->i_version and hand it back exactly as stored: the value is
 * neither flagged as queried nor shifted. Mostly useful for a self-managed
 * i_version, where the field is to be treated as an entirely opaque value.
 */
static inline u64 inode_peek_iversion_raw(const struct inode *inode)
{
        u64 raw = atomic64_read(&inode->i_version);

        return raw;
}

/**
 * inode_set_max_iversion_raw - update i_version if the new value is larger
 * @inode: inode to set
 * @val: new i_version to set
 *
 * Some self-managed filesystems (e.g. Ceph) only want the i_version to move
 * forward. Install @val unless the value currently stored is already
 * greater than @val.
 */
static inline void
inode_set_max_iversion_raw(struct inode *inode, u64 val)
{
        u64 seen = inode_peek_iversion_raw(inode);

        /*
         * A failed cmpxchg refreshes 'seen' with the value that is actually
         * stored, so the comparison is redone against fresh data each round.
         */
        while (seen <= val) {
                if (atomic64_try_cmpxchg(&inode->i_version, &seen, val))
                        break;
        }
}

/**
 * inode_set_iversion - set i_version to a particular value
 * @inode: inode to set
 * @val: new i_version value to set
 *
 * Set @inode's i_version counter to @val. For filesystems with a
 * kernel-managed i_version, when initializing a newly-created inode from
 * scratch.
 *
 * The QUERIED flag is left clear, since a brand-new value cannot have been
 * queried yet.
 */
static inline void inode_set_iversion(struct inode *inode, u64 val)
{
        /* Move the counter above the flag bit; the flag bit ends up 0. */
        u64 raw = val << I_VERSION_QUERIED_SHIFT;

        inode_set_iversion_raw(inode, raw);
}

/**
 * inode_set_iversion_queried - set i_version to a particular value as queried
 * @inode: inode to set
 * @val: new i_version value to set
 *
 * Set @inode's i_version counter to @val and mark it as queried, so that
 * the next change is forced to increment it.
 *
 * Filesystems that persistently store the i_version should use this when
 * loading an existing inode from disk: there is no way to know whether the
 * stored value was observed before it was written out, so it must be
 * assumed that it was. Otherwise the same value could be handed out for
 * different versions of the same inode.
 */
static inline void inode_set_iversion_queried(struct inode *inode, u64 val)
{
        u64 raw = val << I_VERSION_QUERIED_SHIFT;

        raw |= I_VERSION_QUERIED;
        inode_set_iversion_raw(inode, raw);
}

bool inode_maybe_inc_iversion(struct inode *inode, bool force);

/**
 * inode_inc_iversion - forcibly increment i_version
 * @inode: inode that needs to be updated
 *
 * Forcibly increment the i_version field by calling
 * inode_maybe_inc_iversion() with force set. This always results in a
 * change to the observable value.
 */
static inline void inode_inc_iversion(struct inode *inode)
{
        /* The "did it change" result is not needed when forcing. */
        (void)inode_maybe_inc_iversion(inode, true);
}

/**
 * inode_iversion_need_inc - is the i_version in need of being incremented?
 * @inode: inode to check
 *
 * Returns whether the inode->i_version counter needs incrementing on the next
 * change. Just fetch the raw value and test the QUERIED flag; nothing is
 * modified, so @inode is taken as a pointer to const like the other
 * read-only accessors (inode_peek_iversion*(), inode_eq_iversion*()).
 *
 * Return: true if the QUERIED flag is set, false otherwise.
 */
static inline bool
inode_iversion_need_inc(const struct inode *inode)
{
        return inode_peek_iversion_raw(inode) & I_VERSION_QUERIED;
}

/**
 * inode_inc_iversion_raw - forcibly increment raw i_version
 * @inode: inode that needs to be updated
 *
 * Forcibly increment the raw i_version field by one. This always results in
 * a change to the raw value.
 *
 * NFS uses the i_version field to hold the value from the server and mostly
 * treats it as opaque, but while it holds a write delegation it has to bump
 * the value itself. This function does that.
 */
static inline void inode_inc_iversion_raw(struct inode *inode)
{
        /* Raw +1: no shift and no QUERIED-flag handling. */
        atomic64_inc(&inode->i_version);
}

/**
 * inode_peek_iversion - read i_version without flagging it to be incremented
 * @inode: inode from which i_version should be read
 *
 * Return the i_version counter of @inode without registering the read as a
 * query.
 *
 * Typically used by local filesystems that need to store an i_version on
 * disk. Flagging the value as viewed is unnecessary there, because the
 * result is not used to gauge changes from that point on.
 */
static inline u64 inode_peek_iversion(const struct inode *inode)
{
        u64 raw = inode_peek_iversion_raw(inode);

        /* Drop the flag bit so only the counter is returned. */
        return raw >> I_VERSION_QUERIED_SHIFT;
}

/*
 * For filesystems without any sort of change attribute, the best we can
 * do is fake one up from the ctime:
 */
static inline u64 time_to_chattr(const struct timespec64 *t)
{
        u64 chattr = t->tv_sec;

        chattr <<= 32;
        chattr += t->tv_nsec;
        return chattr;
}

u64 inode_query_iversion(struct inode *inode);

/**
 * inode_eq_iversion_raw - check whether the raw i_version counter has changed
 * @inode: inode to check
 * @old: old value to check against its i_version
 *
 * Compare the current raw i_version value with a previously obtained one.
 * Returns true if they are the same or false if they are different.
 */
static inline bool inode_eq_iversion_raw(const struct inode *inode, u64 old)
{
        u64 now = inode_peek_iversion_raw(inode);

        return now == old;
}

/**
 * inode_eq_iversion - check whether the i_version counter has changed
 * @inode: inode to check
 * @old: old value to check against its i_version
 *
 * Compare the current i_version counter (flag bit stripped) with a
 * previously obtained one. Returns true if they are the same, and false if
 * they are different.
 *
 * The QUERIED flag does not need to be set here, because the value in the
 * inode is not being recorded for later use.
 */
static inline bool inode_eq_iversion(const struct inode *inode, u64 old)
{
        u64 now = inode_peek_iversion(inode);

        return now == old;
}
#endif





































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_IP6_ROUTE_H
#define _NET_IP6_ROUTE_H

#include <net/addrconf.h>
#include <net/flow.h>
#include <net/ip6_fib.h>
#include <net/sock.h>
#include <net/lwtunnel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/route.h>
#include <net/nexthop.h>

/*
 * Fixed header followed by a variable-length prefix.
 * NOTE(review): presumably the on-wire Route Information option carried in
 * Router Advertisements (RFC 4191) - confirm against rt6_route_rcv().
 */
struct route_info {
        __u8                        type;
        __u8                        length;
        __u8                        prefix_len;
        /*
         * One byte split 3/2/3. The two declaration orders below keep
         * route_pref in the middle two bits for either bitfield endianness.
         */
#if defined(__BIG_ENDIAN_BITFIELD)
        __u8                        reserved_h:3,
                                route_pref:2,
                                reserved_l:3;
#elif defined(__LITTLE_ENDIAN_BITFIELD)
        __u8                        reserved_l:3,
                                route_pref:2,
                                reserved_h:3;
#endif
        __be32                        lifetime;        /* big-endian */
        __u8                        prefix[];        /* 0,8 or 16 */
};

/*
 * Bit flags for the 'flags' argument taken by the route lookup helpers in
 * this header. The three SRCPREF bits are the IPV6_PREFER_SRC_* socket
 * option values shifted left by 3 (see rt6_srcprefs2flags()).
 * RT6_LOOKUP_F_DST_NOREF is the bit tested by ip6_rt_put_flags().
 */
#define RT6_LOOKUP_F_IFACE                0x00000001
#define RT6_LOOKUP_F_REACHABLE                0x00000002
#define RT6_LOOKUP_F_HAS_SADDR                0x00000004
#define RT6_LOOKUP_F_SRCPREF_TMP        0x00000008
#define RT6_LOOKUP_F_SRCPREF_PUBLIC        0x00000010
#define RT6_LOOKUP_F_SRCPREF_COA        0x00000020
#define RT6_LOOKUP_F_IGNORE_LINKSTATE        0x00000040
#define RT6_LOOKUP_F_DST_NOREF                0x00000080

/* We do not (yet ?) support IPv6 jumbograms (RFC 2675)
 * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header
 *
 * The limit is therefore the largest 16-bit length (0xFFFF) plus the size
 * of struct ipv6hdr. Because of the sizeof operand the expression has type
 * size_t rather than int.
 */
#define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr))

/*
 * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate
 * between IPV6_ADDR_PREFERENCES socket option values
 *        IPV6_PREFER_SRC_TMP    = 0x1
 *        IPV6_PREFER_SRC_PUBLIC = 0x2
 *        IPV6_PREFER_SRC_COA    = 0x4
 * and above RT6_LOOKUP_F_SRCPREF_xxx flags.
 */
/* Socket-option preference bits -> RT6_LOOKUP_F_SRCPREF_xxx (shift up by 3). */
static inline int rt6_srcprefs2flags(unsigned int srcprefs)
{
        return (IPV6_PREFER_SRC_MASK & srcprefs) << 3;
}

/* RT6_LOOKUP_F_SRCPREF_xxx -> socket-option preference bits (shift down by 3). */
static inline unsigned int rt6_flags2srcprefs(int flags)
{
        return IPV6_PREFER_SRC_MASK & (flags >> 3);
}

/*
 * True when @daddr is a multicast, link-local or loopback address, i.e. when
 * its address type has any of those three bits set.
 */
static inline bool rt6_need_strict(const struct in6_addr *daddr)
{
        int type = ipv6_addr_type(daddr);

        return type & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL |
                       IPV6_ADDR_LOOPBACK);
}

/* fib entries using a nexthop object can not be coalesced into
 * a multipath route; neither can entries flagged RTF_ADDRCONF or
 * entries whose nexthop has no gateway family set.
 */
static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
{
        /* the RTF_ADDRCONF flag filters out RA's */
        if (f6i->fib6_flags & RTF_ADDRCONF)
                return false;

        /* backed by a nexthop object */
        if (f6i->nh)
                return false;

        return f6i->fib6_nh->fib_nh_gw_family;
}

void ip6_route_input(struct sk_buff *skb);
struct dst_entry *ip6_route_input_lookup(struct net *net,
                                         struct net_device *dev,
                                         struct flowi6 *fl6,
                                         const struct sk_buff *skb, int flags);

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
                                         struct flowi6 *fl6, int flags);

/* Convenience wrapper: ip6_route_output_flags() with no lookup flags set. */
static inline struct dst_entry *ip6_route_output(struct net *net,
                                                 const struct sock *sk,
                                                 struct flowi6 *fl6)
{
        const int no_flags = 0;

        return ip6_route_output_flags(net, sk, fl6, no_flags);
}

/* Release @rt unless the lookup was done with RT6_LOOKUP_F_DST_NOREF and
 * the dst is not on the uncached list. In every other case (flag clear, or
 * dst in uncached_list) the reference is dropped with ip6_rt_put().
 */
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
        if ((flags & RT6_LOOKUP_F_DST_NOREF) &&
            list_empty(&rt->dst.rt_uncached))
                return;

        ip6_rt_put(rt);
}

struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb, int flags);
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
                               int ifindex, struct flowi6 *fl6,
                               const struct sk_buff *skb, int flags);

void ip6_route_init_special_entries(void);
int ip6_route_init(void);
void ip6_route_cleanup(void);

int ipv6_route_ioctl(struct net *net, unsigned int cmd,
                struct in6_rtmsg *rtmsg);

int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
                  struct netlink_ext_ack *extack);
int ip6_ins_rt(struct net *net, struct fib6_info *f6i);
int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify);

void rt6_flush_exceptions(struct fib6_info *f6i);
void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args,
                        unsigned long now);

/*
 * Pick a source address for traffic to @daddr routed via @f6i.
 *
 * @net:          network namespace used for the device and address lookups
 * @f6i:          fib entry for the route, may be NULL
 * @daddr:        destination address
 * @prefs:        source address preferences, passed to ipv6_dev_get_saddr()
 * @l3mdev_index: ifindex looked up with dev_get_by_index_rcu(); when it
 *                resolves to a device, the route's device must have that
 *                device as its l3 master for the route's prefsrc to be used
 * @saddr:        output, receives the chosen address
 *
 * If @f6i carries a preferred source (fib6_prefsrc.plen != 0) and the VRF
 * check passes, that address is copied to @saddr and 0 is returned.
 * Otherwise the result of ipv6_dev_get_saddr() is returned.
 * The whole body runs under rcu_read_lock().
 */
static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i,
                                      const struct in6_addr *daddr,
                                      unsigned int prefs, int l3mdev_index,
                                      struct in6_addr *saddr)
{
        struct net_device *l3mdev;
        struct net_device *dev;
        bool same_vrf;
        int err = 0;

        rcu_read_lock();

        l3mdev = dev_get_by_index_rcu(net, l3mdev_index);
        /*
         * dev is assigned only when it can be needed: no usable prefsrc, or
         * an l3mdev was found. In the remaining case (prefsrc set and no
         * l3mdev) same_vrf short-circuits to true and the prefsrc branch is
         * taken, so dev is never read while unassigned.
         */
        if (!f6i || !f6i->fib6_prefsrc.plen || l3mdev)
                dev = f6i ? fib6_info_nh_dev(f6i) : NULL;
        /*
         * NOTE(review): with f6i == NULL and l3mdev != NULL, dev is NULL
         * when handed to l3mdev_master_dev_rcu() - confirm that helper
         * tolerates a NULL device.
         */
        same_vrf = !l3mdev || l3mdev_master_dev_rcu(dev) == l3mdev;
        if (f6i && f6i->fib6_prefsrc.plen && same_vrf)
                *saddr = f6i->fib6_prefsrc.addr;
        else
                /* fall back to address selection on dev, or on l3mdev if the VRF differs */
                err = ipv6_dev_get_saddr(net, same_vrf ? dev : l3mdev, daddr, prefs, saddr);

        rcu_read_unlock();

        return err;
}

struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif,
                            const struct sk_buff *skb, int flags);
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
                       const struct sk_buff *skb, struct flow_keys *hkeys);

struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);

void fib6_force_start_gc(struct net *net);

struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev,
                                     const struct in6_addr *addr, bool anycast,
                                     gfp_t gfp_flags, struct netlink_ext_ack *extack);

struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
                               int flags);

/*
 *        support functions for ND
 *
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
                                     const struct in6_addr *addr,
                                     struct net_device *dev);
struct fib6_info *rt6_add_dflt_router(struct net *net,
                                     const struct in6_addr *gwaddr,
                                     struct net_device *dev, unsigned int pref,
                                     u32 defrtr_usr_metric,
                                     int lifetime);

void rt6_purge_dflt_routers(struct net *net);

int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr);

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif,
                     u32 mark, kuid_t uid);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu);
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
                  kuid_t uid);
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif);
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk);

struct netlink_callback;

/*
 * Bundle of state for a route dump.
 * NOTE(review): presumably what rt6_dump_route() receives through its
 * void *p_arg parameter - confirm against the definition.
 */
struct rt6_rtnl_dump_arg {
        struct sk_buff *skb;                /* NOTE(review): presumably the buffer being filled - confirm */
        struct netlink_callback *cb;        /* netlink dump callback state */
        struct net *net;                /* network namespace */
        struct fib_dump_filter filter;        /* embedded by value, not a pointer */
};

int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip);
void rt6_mtu_change(struct net_device *dev, unsigned int mtu);
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp);
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway);
void rt6_sync_up(struct net_device *dev, unsigned char nh_flags);
void rt6_disable_ip(struct net_device *dev, unsigned long event);
void rt6_sync_down_dev(struct net_device *dev, unsigned long event);
void rt6_multipath_rebalance(struct fib6_info *f6i);

void rt6_uncached_list_add(struct rt6_info *rt);
void rt6_uncached_list_del(struct rt6_info *rt);

/*
 * Return the rt6_info behind @skb's dst entry, or NULL when the skb has no
 * dst attached.
 */
static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb)
{
        const struct dst_entry *entry = skb_dst(skb);

        return entry ? dst_rt6_info(entry) : NULL;
}

/*
 *        Store a destination cache entry in a socket
 *
 *        Records the route cookie and the daddr pointer in the socket's
 *        ipv6_pinfo (plus the saddr pointer when CONFIG_IPV6_SUBTREES is
 *        enabled) and passes @dst to sk_setup_caps().
 */
static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst,
                                 const struct in6_addr *daddr,
                                 const struct in6_addr *saddr)
{
        struct ipv6_pinfo *pinfo = inet6_sk(sk);

        /* cookie first, then the dst itself, then the cached addresses */
        pinfo->dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
        sk_setup_caps(sk, dst);
        pinfo->daddr_cache = daddr;
#ifdef CONFIG_IPV6_SUBTREES
        pinfo->saddr_cache = saddr;
#endif
}

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
                           const struct flowi6 *fl6);

/* True when the route attached to @skb has RTF_LOCAL set in rt6i_flags. */
static inline bool ipv6_unicast_destination(const struct sk_buff *skb)
{
        const struct dst_entry *entry = skb_dst(skb);
        const struct rt6_info *route = dst_rt6_info(entry);

        return (route->rt6i_flags & RTF_LOCAL) != 0;
}

/*
 * Decide whether @daddr is an anycast destination for the route behind @dst.
 *
 * True if the route is flagged RTF_ANYCAST, or if all of the following hold:
 * the route's prefix is shorter than 127 bits, the route is neither
 * RTF_GATEWAY nor RTF_NONEXTHOP, and @daddr equals the route's prefix
 * address.
 */
static inline bool ipv6_anycast_destination(const struct dst_entry *dst,
                                            const struct in6_addr *daddr)
{
        const struct rt6_info *route = dst_rt6_info(dst);

        if (route->rt6i_flags & RTF_ANYCAST)
                return true;

        if (route->rt6i_dst.plen >= 127)
                return false;

        if (route->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP))
                return false;

        return ipv6_addr_equal(&route->rt6i_dst.addr, daddr);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
                 int (*output)(struct net *, struct sock *, struct sk_buff *));

/*
 * MTU to use for @skb.
 *
 * If the skb has a socket, we are not in device recursion, and the socket's
 * pmtudisc setting is IPV6_PMTUDISC_PROBE or higher, use the device MTU
 * minus the lwtunnel headroom. Otherwise use dst_mtu().
 */
static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb)
{
        const struct dst_entry *dst = skb_dst(skb);
        const struct ipv6_pinfo *np = NULL;
        unsigned int mtu;

        if (skb->sk && !dev_recursion_level())
                np = inet6_sk(skb->sk);

        if (!np || READ_ONCE(np->pmtudisc) < IPV6_PMTUDISC_PROBE)
                return dst_mtu(dst);

        mtu = READ_ONCE(dst->dev->mtu);
        mtu -= lwtunnel_headroom(dst->lwtstate, mtu);

        return mtu;
}

/*
 * False when the socket's pmtudisc mode is IPV6_PMTUDISC_INTERFACE or
 * IPV6_PMTUDISC_OMIT, true for every other mode.
 */
static inline bool ip6_sk_accept_pmtu(const struct sock *sk)
{
        u8 mode = READ_ONCE(inet6_sk(sk)->pmtudisc);

        switch (mode) {
        case IPV6_PMTUDISC_INTERFACE:
        case IPV6_PMTUDISC_OMIT:
                return false;
        default:
                return true;
        }
}

/*
 * True when the socket's pmtudisc mode is IPV6_PMTUDISC_OMIT or is
 * numerically below IPV6_PMTUDISC_DO.
 */
static inline bool ip6_sk_ignore_df(const struct sock *sk)
{
        u8 mode = READ_ONCE(inet6_sk(sk)->pmtudisc);

        if (mode == IPV6_PMTUDISC_OMIT)
                return true;

        return mode < IPV6_PMTUDISC_DO;
}

/*
 * Next-hop address for @rt: the gateway for RTF_GATEWAY routes, the route's
 * own destination address for RTF_CACHE routes, and @daddr otherwise.
 */
static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt,
                                                 const struct in6_addr *daddr)
{
        if (rt->rt6i_flags & RTF_GATEWAY)
                return &rt->rt6i_gateway;

        if (unlikely(rt->rt6i_flags & RTF_CACHE))
                return &rt->rt6i_dst.addr;

        return daddr;
}

/*
 * Do @a and @b use the same next hop?
 *
 * If either entry uses a nexthop object, the answer is nexthop_cmp() on the
 * two objects. Otherwise the embedded fib6_nh entries are compared: same
 * device, same gateway address, and lwtunnel_cmp_encap() reporting 0.
 */
static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
{
        struct fib6_nh *first, *second;

        if (a->nh || b->nh)
                return nexthop_cmp(a->nh, b->nh);

        first = a->fib6_nh;
        second = b->fib6_nh;

        if (first->fib_nh_dev != second->fib_nh_dev)
                return false;

        if (!ipv6_addr_equal(&first->fib_nh_gw6, &second->fib_nh_gw6))
                return false;

        return !lwtunnel_cmp_encap(first->fib_nh_lws, second->fib_nh_lws);
}

/*
 * MTU for @dst, minus the lwtunnel headroom.
 *
 * The RTAX_MTU route metric is consulted only when not forwarding or when
 * that metric is locked; a non-zero metric is used as is. In every other
 * case the MTU is the device's cnf.mtu6, or IPV6_MIN_MTU if the device has
 * no inet6_dev.
 */
static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst,
                                                     bool forwarding)
{
        unsigned int mtu = 0;

        if (!forwarding || dst_metric_locked(dst, RTAX_MTU))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (!mtu) {
                struct inet6_dev *idev;

                mtu = IPV6_MIN_MTU;
                rcu_read_lock();
                idev = __in6_dev_get(dst->dev);
                if (idev)
                        mtu = READ_ONCE(idev->cnf.mtu6);
                rcu_read_unlock();
        }

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

u32 ip6_mtu_from_fib6(const struct fib6_result *res,
                      const struct in6_addr *daddr,
                      const struct in6_addr *saddr);

struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
                                   struct net_device *dev, struct sk_buff *skb,
                                   const void *daddr);
#endif










































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/vfio.h>
#include <linux/iommufd.h>
#include <linux/anon_inodes.h>
#include "vfio.h"

/*
 * File-scope state for the VFIO group code; a single static instance named
 * 'vfio'.
 */
static struct vfio {
        struct class                        *class;                /* NOTE(review): presumably the device class for group nodes - confirm */
        struct list_head                group_list;
        struct mutex                        group_lock; /* locks group_list */
        struct ida                        group_ida;        /* NOTE(review): presumably allocates group minor numbers - confirm */
        dev_t                                group_devt;        /* NOTE(review): presumably the base dev_t of the group chardev region - confirm */
} vfio;

/*
 * Find the device in @group that matches @buf and take a registration
 * reference on it.
 *
 * A device is matched through its ops->match() callback when it has one,
 * otherwise by comparing @buf against dev_name(). A negative value from
 * ->match() stops the search and is returned as an ERR_PTR. A device that
 * matches but whose registration reference cannot be taken is skipped.
 *
 * Returns the device, or ERR_PTR(-ENODEV) if nothing matched. The walk is
 * done under group->device_lock.
 */
static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
                                                     char *buf)
{
        struct vfio_device *cur, *found = ERR_PTR(-ENODEV);

        mutex_lock(&group->device_lock);
        list_for_each_entry(cur, &group->device_list, group_next) {
                int matched;

                if (!cur->ops->match) {
                        matched = !strcmp(dev_name(cur->dev), buf);
                } else {
                        matched = cur->ops->match(cur, buf);
                        if (matched < 0) {
                                found = ERR_PTR(matched);
                                break;
                        }
                }

                if (!matched)
                        continue;
                if (!vfio_device_try_get_registration(cur))
                        continue;

                found = cur;
                break;
        }
        mutex_unlock(&group->device_lock);

        return found;
}

/*
 * VFIO Group fd, /dev/vfio/$GROUP
 */
/*
 * Does @group currently have an IOMMU backend, i.e. either a container or
 * an iommufd context? Caller must hold group->group_lock.
 */
static bool vfio_group_has_iommu(struct vfio_group *group)
{
        lockdep_assert_held(&group->group_lock);

        /*
         * Container and container_users go together: users can only exist
         * with a container, and a container implies at least one user.
         */
        WARN_ON(!group->container != !group->container_users);

        if (group->container)
                return true;

        return group->iommufd != NULL;
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 *
 * Returns 0 on success, -EINVAL if the group has neither a container nor
 * an iommufd, and -EBUSY if the container has users other than the group
 * itself.
 */
static int vfio_group_ioctl_unset_container(struct vfio_group *group)
{
        int ret = 0;

        mutex_lock(&group->group_lock);

        if (!vfio_group_has_iommu(group)) {
                ret = -EINVAL;
        } else if (group->container && group->container_users != 1) {
                ret = -EBUSY;
        } else {
                if (group->container)
                        vfio_group_detach_container(group);

                if (group->iommufd) {
                        iommufd_ctx_put(group->iommufd);
                        group->iommufd = NULL;
                }
        }

        mutex_unlock(&group->group_lock);
        return ret;
}

/*
 * Attach an IOMMU backend to @group.
 *
 * @arg points to a user-space int holding a file descriptor. The file may
 * be either a VFIO container or an iommufd context; anything else is
 * rejected.
 *
 * Returns 0 on success, or:
 *   -EFAULT  the fd could not be read from user space
 *   -EBADF   the fd does not refer to an open file
 *   -EINVAL  the group already has a container or iommufd
 *   -ENODEV  the group has no iommu_group
 *   -EBADFD  the file is neither a container nor an iommufd
 *   or the error from the attach / compat setup helper.
 */
static int vfio_group_ioctl_set_container(struct vfio_group *group,
                                          int __user *arg)
{
        struct vfio_container *container;
        struct iommufd_ctx *iommufd;
        int ret;
        int fd;

        if (get_user(fd, arg))
                return -EFAULT;

        /*
         * Scoped fd reference: there is no explicit release in this
         * function, so presumably CLASS() drops it when 'f' goes out of
         * scope - confirm against the CLASS(fd) definition.
         */
        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return -EBADF;

        mutex_lock(&group->group_lock);
        /* Only one backend at a time. */
        if (vfio_group_has_iommu(group)) {
                ret = -EINVAL;
                goto out_unlock;
        }
        if (!group->iommu_group) {
                ret = -ENODEV;
                goto out_unlock;
        }

        /* First try to interpret the file as a VFIO container... */
        container = vfio_container_from_file(fd_file(f));
        if (container) {
                ret = vfio_container_attach_group(container, group);
                goto out_unlock;
        }

        /* ...then as an iommufd context. */
        iommufd = iommufd_ctx_from_file(fd_file(f));
        if (!IS_ERR(iommufd)) {
                if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) &&
                    group->type == VFIO_NO_IOMMU)
                        ret = iommufd_vfio_compat_set_no_iommu(iommufd);
                else
                        ret = iommufd_vfio_compat_ioas_create(iommufd);

                if (ret) {
                        /* Compat setup failed: drop the context reference. */
                        iommufd_ctx_put(iommufd);
                        goto out_unlock;
                }

                /* The group now holds the iommufd reference. */
                group->iommufd = iommufd;
                goto out_unlock;
        }

        /* The FD passed is not recognized. */
        ret = -EBADFD;

out_unlock:
        mutex_unlock(&group->group_lock);
        return ret;
}

static void vfio_device_group_get_kvm_safe(struct vfio_device *device)
{
        spin_lock(&device->group->kvm_ref_lock);
        vfio_device_get_kvm_safe(device, device->group->kvm);
        spin_unlock(&device->group->kvm_ref_lock);
}

/*
 * Open the device behind @df through the group path.
 *
 * Takes group->group_lock, then dev_set->lock (in that order), requires the
 * group to have an IOMMU backend, opens the device with vfio_df_open() and,
 * for the first open with an iommufd backend, attaches the compat IOAS.
 * On success df->access_granted is published with release semantics.
 *
 * Returns 0 on success, -EINVAL if the group has no IOMMU backend, -EPERM
 * from the noiommu check, or the error from vfio_df_open() /
 * vfio_iommufd_compat_attach_ioas().
 */
static int vfio_df_group_open(struct vfio_device_file *df)
{
        struct vfio_device *device = df->device;
        int ret;

        mutex_lock(&device->group->group_lock);
        if (!vfio_group_has_iommu(device->group)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        mutex_lock(&device->dev_set->lock);

        /*
         * Before the first device open, get the KVM pointer currently
         * associated with the group (if there is one) and obtain a reference
         * now that will be held until the open_count reaches 0 again.  Save
         * the pointer in the device for use by drivers.
         */
        if (device->open_count == 0)
                vfio_device_group_get_kvm_safe(device);

        df->iommufd = device->group->iommufd;
        if (df->iommufd && vfio_device_is_noiommu(device) && device->open_count == 0) {
                /*
                 * Require no compat ioas to be assigned to proceed.  The basic
                 * statement is that the user cannot have done something that
                 * implies they expected translation to exist
                 */
                /*
                 * NOTE(review): both outcomes jump to out_put_kvm. On the
                 * ret == 0 side that clears df->iommufd, skips
                 * vfio_df_open() and never sets access_granted, yet 0 is
                 * returned to the caller - confirm this is intended.
                 */
                if (!capable(CAP_SYS_RAWIO) ||
                    vfio_iommufd_device_has_compat_ioas(device, df->iommufd))
                        ret = -EPERM;
                else
                        ret = 0;
                goto out_put_kvm;
        }

        ret = vfio_df_open(df);
        if (ret)
                goto out_put_kvm;

        /* Only the first open attaches the compat IOAS. */
        if (df->iommufd && device->open_count == 1) {
                ret = vfio_iommufd_compat_attach_ioas(device, df->iommufd);
                if (ret)
                        goto out_close_device;
        }

        /*
         * Paired with smp_load_acquire() in vfio_device_fops::ioctl/
         * read/write/mmap and vfio_file_has_device_access()
         */
        smp_store_release(&df->access_granted, true);

        mutex_unlock(&device->dev_set->lock);
        mutex_unlock(&device->group->group_lock);
        return 0;

        /* Error unwind, in reverse order of setup. */
out_close_device:
        vfio_df_close(df);
out_put_kvm:
        df->iommufd = NULL;
        /* Drop the KVM reference only if no open of the device remains. */
        if (device->open_count == 0)
                vfio_device_put_kvm(device);
        mutex_unlock(&device->dev_set->lock);
out_unlock:
        mutex_unlock(&device->group->group_lock);
        return ret;
}

/*
 * Close a device file that was opened through the group path.
 *
 * Takes the group lock, then the dev_set lock, closes @df and clears its
 * iommufd pointer.  When the device's open_count is back at zero the KVM
 * reference held for the device is put.
 */
void vfio_df_group_close(struct vfio_device_file *df)
{
        struct vfio_device *vdev = df->device;

        mutex_lock(&vdev->group->group_lock);
        mutex_lock(&vdev->dev_set->lock);

        vfio_df_close(df);
        df->iommufd = NULL;

        /* Nobody has the device open any more: let go of the KVM pointer */
        if (!vdev->open_count)
                vfio_device_put_kvm(vdev);

        mutex_unlock(&vdev->dev_set->lock);
        mutex_unlock(&vdev->group->group_lock);
}

/*
 * Build the anonymous-inode file representing an open of @device through
 * its group.
 *
 * Returns the new struct file on success, or an ERR_PTR() on failure.  On
 * failure the device file allocation and the group open performed here are
 * undone before returning.
 */
static struct file *vfio_device_open_file(struct vfio_device *device)
{
        struct vfio_device_file *df;
        struct file *file;
        int err;

        df = vfio_allocate_device_file(device);
        if (IS_ERR(df)) {
                err = PTR_ERR(df);
                return ERR_PTR(err);
        }

        df->group = device->group;

        err = vfio_df_group_open(df);
        if (err) {
                kfree(df);
                return ERR_PTR(err);
        }

        file = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops,
                                        df, O_RDWR,
                                        FMODE_PREAD | FMODE_PWRITE);
        if (IS_ERR(file)) {
                err = PTR_ERR(file);
                /* Unwind in reverse order: close the device, then free df */
                vfio_df_group_close(df);
                kfree(df);
                return ERR_PTR(err);
        }

        /*
         * All mmaps of this device share the address space of the device's
         * pseudo fs inode, so every vma tied to the device can be torn
         * down with unmap_mapping_range().
         */
        file->f_mapping = device->inode->i_mapping;

        if (device->group->type == VFIO_NO_IOMMU)
                dev_warn(device->dev, "vfio-noiommu device opened by user "
                         "(%s:%d)\n", current->comm, task_pid_nr(current));

        /*
         * From here on the device reference belongs to the file; it is put
         * in vfio_device_fops_release().
         */
        return file;
}

/*
 * VFIO_GROUP_GET_DEVICE_FD handler: look up the device named by the user
 * string at @arg inside @group and hand back a new file descriptor for it.
 *
 * Returns the descriptor number on success or a negative errno.  On failure
 * the reserved fd (if any) and the device registration reference obtained
 * by the lookup are released.
 */
static int vfio_group_ioctl_get_device_fd(struct vfio_group *group,
                                          char __user *arg)
{
        struct vfio_device *device;
        struct file *file;
        char *name;
        int fd;

        /* The name is copied from userspace, bounded to PAGE_SIZE */
        name = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(name))
                return PTR_ERR(name);

        device = vfio_device_get_from_name(group, name);
        kfree(name);
        if (IS_ERR(device))
                return PTR_ERR(device);

        fd = get_unused_fd_flags(O_CLOEXEC);
        if (fd < 0) {
                vfio_device_put_registration(device);
                return fd;
        }

        file = vfio_device_open_file(device);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                vfio_device_put_registration(device);
                return PTR_ERR(file);
        }

        fd_install(fd, file);
        return fd;
}

/*
 * VFIO_GROUP_GET_STATUS handler: report the status flags of @group.
 *
 * Returns 0 on success, -EFAULT when the user buffer cannot be read or
 * written, -EINVAL when the supplied argsz is smaller than the structure up
 * to and including the flags field, and -ENODEV when the group no longer
 * has an iommu_group.
 */
static int vfio_group_ioctl_get_status(struct vfio_group *group,
                                       struct vfio_group_status __user *arg)
{
        unsigned long min_size = offsetofend(struct vfio_group_status, flags);
        struct vfio_group_status st;

        if (copy_from_user(&st, arg, min_size))
                return -EFAULT;

        if (st.argsz < min_size)
                return -EINVAL;

        /* Flags are output only; whatever the user passed in is discarded */
        st.flags = 0;

        mutex_lock(&group->group_lock);
        if (!group->iommu_group) {
                mutex_unlock(&group->group_lock);
                return -ENODEV;
        }

        /*
         * With a container FD, iommu_group_claim_dma_owner() happens at
         * SET_CONTAINER time, whereas with IOMMUFD it happens at
         * VFIO_GROUP_GET_DEVICE_FD time.  So with iommufd
         * VFIO_GROUP_FLAGS_VIABLE may be reported here and GET_DEVICE_FD
         * can still fail on viability afterwards.
         */
        if (vfio_group_has_iommu(group)) {
                st.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;
                st.flags |= VFIO_GROUP_FLAGS_VIABLE;
        } else if (!iommu_group_dma_owner_claimed(group->iommu_group)) {
                st.flags |= VFIO_GROUP_FLAGS_VIABLE;
        }
        mutex_unlock(&group->group_lock);

        return copy_to_user(arg, &st, min_size) ? -EFAULT : 0;
}

static long vfio_group_fops_unl_ioctl(struct file *filep,
                                      unsigned int cmd, unsigned long arg)
{
        struct vfio_group *group = filep->private_data;
        void __user *uarg = (void __user *)arg;

        switch (cmd) {
        case VFIO_GROUP_GET_DEVICE_FD:
                return vfio_group_ioctl_get_device_fd(group, uarg);
        case VFIO_GROUP_GET_STATUS:
                return vfio_group_ioctl_get_status(group, uarg);
        case VFIO_GROUP_SET_CONTAINER:
                return vfio_group_ioctl_set_container(group, uarg);
        case VFIO_GROUP_UNSET_CONTAINER:
                return vfio_group_ioctl_unset_container(group);
        default:
                return -ENOTTY;
        }
}

/*
 * Account one more cdev open of @device against its group.
 *
 * Fails with -EBUSY when the group file is currently open; otherwise bumps
 * cdev_device_open_cnt (which in turn makes vfio_group_fops_open() refuse
 * new opens of the group file) and returns 0.
 */
int vfio_device_block_group(struct vfio_device *device)
{
        struct vfio_group *grp = device->group;
        int err = 0;

        mutex_lock(&grp->group_lock);
        if (grp->opened_file)
                err = -EBUSY;
        else
                grp->cdev_device_open_cnt++;
        mutex_unlock(&grp->group_lock);

        return err;
}

/*
 * Undo vfio_device_block_group(): drop one count from the group's
 * cdev_device_open_cnt under the group lock.
 */
void vfio_device_unblock_group(struct vfio_device *device)
{
        struct vfio_group *grp = device->group;

        mutex_lock(&grp->group_lock);
        grp->cdev_device_open_cnt--;
        mutex_unlock(&grp->group_lock);
}

/*
 * open() of the group character device.
 *
 * Only a single open of the group file is allowed at a time.  Returns 0 on
 * success, -ENODEV when no driver is bound to the group any more, -EPERM
 * for a no-iommu group without CAP_SYS_RAWIO, and -EBUSY when the group is
 * already open or a device of it is open through the cdev path.
 */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
        struct vfio_group *grp =
                container_of(inode->i_cdev, struct vfio_group, cdev);
        int err = 0;

        mutex_lock(&grp->group_lock);

        if (refcount_read(&grp->drivers) == 0) {
                /*
                 * drivers can be zero if this races with
                 * vfio_device_remove_group(); it stays at 0 while
                 * group_lock is held.
                 */
                err = -ENODEV;
        } else if (grp->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) {
                err = -EPERM;
        } else if (grp->cdev_device_open_cnt) {
                err = -EBUSY;
        } else if (grp->opened_file) {
                /* Multiple instances of the group open are not needed */
                err = -EBUSY;
        } else {
                grp->opened_file = filep;
                filep->private_data = grp;
        }

        mutex_unlock(&grp->group_lock);
        return err;
}

/*
 * release() of the group character device: detach whatever IOMMU backend
 * (container or iommufd) is still attached and mark the group as no longer
 * open.  Always returns 0.
 */
static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
        struct vfio_group *grp = filep->private_data;

        filep->private_data = NULL;

        mutex_lock(&grp->group_lock);

        /*
         * Device FDs keep a reference on the group file, so release only
         * runs once no device of the group is open any more.
         */
        WARN_ON(grp->notifier.head);

        if (grp->container)
                vfio_group_detach_container(grp);

        if (grp->iommufd) {
                iommufd_ctx_put(grp->iommufd);
                grp->iommufd = NULL;
        }

        grp->opened_file = NULL;
        mutex_unlock(&grp->group_lock);

        return 0;
}

/*
 * File operations of the group character device.  Installed on the group's
 * cdev by cdev_init() in vfio_group_alloc() and used by
 * vfio_group_from_file() to recognise group files.
 */
static const struct file_operations vfio_group_fops = {
        .owner                = THIS_MODULE,
        .unlocked_ioctl        = vfio_group_fops_unl_ioctl,
        .compat_ioctl        = compat_ptr_ioctl,
        .open                = vfio_group_fops_open,
        .release        = vfio_group_fops_release,
};

/*
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *
vfio_group_find_from_iommu(struct iommu_group *iommu_group)
{
        struct vfio_group *group;

        lockdep_assert_held(&vfio.group_lock);

        /*
         * group->iommu_group from the vfio.group_list cannot be NULL
         * under the vfio.group_lock.
         */
        list_for_each_entry(group, &vfio.group_list, vfio_next) {
                if (group->iommu_group == iommu_group)
                        return group;
        }
        return NULL;
}

/*
 * struct device release callback of the group device (set up in
 * vfio_group_alloc()): frees the minor number and the group itself.
 *
 * By this point the iommu_group pointer must already have been cleared and
 * no cdev opens may be outstanding; both are checked with WARN_ON().
 */
static void vfio_group_release(struct device *dev)
{
        struct vfio_group *grp = container_of(dev, struct vfio_group, dev);

        WARN_ON(grp->iommu_group);
        WARN_ON(grp->cdev_device_open_cnt);

        mutex_destroy(&grp->device_lock);
        mutex_destroy(&grp->group_lock);

        ida_free(&vfio.group_ida, MINOR(grp->dev.devt));
        kfree(grp);
}

/*
 * Allocate and initialise a vfio_group wrapping @iommu_group.
 *
 * The group gets a minor number from vfio.group_ida, an initialised (but
 * not yet added) struct device and cdev, a drivers refcount of 1 and its
 * own reference on @iommu_group.
 *
 * Returns the group, or ERR_PTR(-ENOMEM) / the ida error on failure.
 */
static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
                                           enum vfio_group_type type)
{
        struct vfio_group *grp;
        int id;

        grp = kzalloc(sizeof(*grp), GFP_KERNEL);
        if (!grp)
                return ERR_PTR(-ENOMEM);

        id = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL);
        if (id < 0) {
                kfree(grp);
                return ERR_PTR(id);
        }

        /* Character device and struct device plumbing */
        device_initialize(&grp->dev);
        grp->dev.devt = MKDEV(MAJOR(vfio.group_devt), id);
        grp->dev.class = vfio.class;
        grp->dev.release = vfio_group_release;
        cdev_init(&grp->cdev, &vfio_group_fops);
        grp->cdev.owner = THIS_MODULE;

        /* Locks, lists and counters */
        refcount_set(&grp->drivers, 1);
        mutex_init(&grp->group_lock);
        spin_lock_init(&grp->kvm_ref_lock);
        INIT_LIST_HEAD(&grp->device_list);
        mutex_init(&grp->device_lock);

        /*
         * Take our own reference on the iommu_group; it is dropped again in
         * vfio_device_remove_group() when the group is torn down.
         */
        grp->iommu_group = iommu_group;
        iommu_group_ref_get(iommu_group);
        grp->type = type;
        BLOCKING_INIT_NOTIFIER_HEAD(&grp->notifier);

        return grp;
}

static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
                enum vfio_group_type type)
{
        struct vfio_group *group;
        struct vfio_group *ret;
        int err;

        lockdep_assert_held(&vfio.group_lock);

        group = vfio_group_alloc(iommu_group, type);
        if (IS_ERR(group))
                return group;

        err = dev_set_name(&group->dev, "%s%d",
                           group->type == VFIO_NO_IOMMU ? "noiommu-" : "",
                           iommu_group_id(iommu_group));
        if (err) {
                ret = ERR_PTR(err);
                goto err_put;
        }

        err = cdev_device_add(&group->cdev, &group->dev);
        if (err) {
                ret = ERR_PTR(err);
                goto err_put;
        }

        list_add(&group->vfio_next, &vfio.group_list);

        return group;

err_put:
        put_device(&group->dev);
        return ret;
}

/*
 * Create a fresh iommu_group named "vfio-noiommu" for @dev, add @dev to it
 * and wrap it in a new vfio_group of the given @type.
 *
 * Returns the vfio_group or an ERR_PTR().  The local iommu_group reference
 * is always dropped before returning; on success the vfio_group keeps its
 * own reference.
 */
static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev,
                enum vfio_group_type type)
{
        struct iommu_group *igroup;
        struct vfio_group *vgroup;
        int err;

        igroup = iommu_group_alloc();
        if (IS_ERR(igroup))
                return ERR_CAST(igroup);

        err = iommu_group_set_name(igroup, "vfio-noiommu");
        if (!err)
                err = iommu_group_add_device(igroup, dev);
        if (err) {
                vgroup = ERR_PTR(err);
                goto put_group;
        }

        mutex_lock(&vfio.group_lock);
        vgroup = vfio_create_group(igroup, type);
        mutex_unlock(&vfio.group_lock);

        /* Creation failed: take the device back out of the iommu_group */
        if (IS_ERR(vgroup))
                iommu_group_remove_device(dev);

put_group:
        iommu_group_put(igroup);
        return vgroup;
}

/*
 * Return true when a vfio_device backed by @dev is already on @group's
 * device list.  The list is walked under group->device_lock.
 */
static bool vfio_group_has_device(struct vfio_group *group, struct device *dev)
{
        struct vfio_device *cur;
        bool found = false;

        mutex_lock(&group->device_lock);
        list_for_each_entry(cur, &group->device_list, group_next) {
                if (cur->dev == dev) {
                        found = true;
                        break;
                }
        }
        mutex_unlock(&group->device_lock);

        return found;
}

/*
 * Find the vfio_group for @dev's iommu_group, creating it when it does not
 * exist yet.
 *
 * When @dev has no iommu_group, a no-iommu group is created if vfio_noiommu
 * is enabled, otherwise ERR_PTR(-EINVAL) is returned.  For an existing
 * group the drivers refcount is incremented, unless @dev is unexpectedly
 * already part of the group, which yields ERR_PTR(-EINVAL) and a warning.
 */
static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
{
        struct iommu_group *igroup;
        struct vfio_group *vgroup;

        igroup = iommu_group_get(dev);
        if (!igroup) {
                if (!vfio_noiommu)
                        return ERR_PTR(-EINVAL);

                /*
                 * With noiommu enabled, devices without an IOMMU group get
                 * one created for them; the absence implies there is no
                 * IOMMU hardware/driver.  The kernel is tainted because a
                 * DMA capable device is about to be handed to a user
                 * without IOMMU protection.
                 */
                vgroup = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU);
                if (IS_ERR(vgroup))
                        return vgroup;

                add_taint(TAINT_USER, LOCKDEP_STILL_OK);
                dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n");
                return vgroup;
        }

        mutex_lock(&vfio.group_lock);
        vgroup = vfio_group_find_from_iommu(igroup);
        if (!vgroup)
                vgroup = vfio_create_group(igroup, VFIO_IOMMU);
        else if (WARN_ON(vfio_group_has_device(vgroup, dev)))
                vgroup = ERR_PTR(-EINVAL);
        else
                refcount_inc(&vgroup->drivers);
        mutex_unlock(&vfio.group_lock);

        /* The vfio_group carries its own reference on the iommu_group */
        iommu_group_put(igroup);
        return vgroup;
}

/*
 * Attach @device to a vfio_group according to @type: VFIO_IOMMU devices
 * join (or create) the group of their iommu_group, every other type gets a
 * newly allocated no-iommu style group.
 *
 * Returns 0 on success or the negative errno from the group lookup or
 * allocation.
 */
int vfio_device_set_group(struct vfio_device *device,
                          enum vfio_group_type type)
{
        struct vfio_group *grp;

        grp = (type == VFIO_IOMMU) ?
                vfio_group_find_or_alloc(device->dev) :
                vfio_noiommu_group_alloc(device->dev, type);
        if (IS_ERR(grp))
                return PTR_ERR(grp);

        /* The group reference obtained above now belongs to the device */
        device->group = grp;
        return 0;
}

/*
 * Drop @device's hold on its vfio_group.  When this was the last driver
 * reference on the group, the group is unlinked from vfio.group_list, its
 * character device is removed, any attached container is detached and the
 * group's references on the iommu_group and on its own struct device are
 * put.
 */
void vfio_device_remove_group(struct vfio_device *device)
{
        struct vfio_group *group = device->group;
        struct iommu_group *iommu_group;

        /* These group types had the device added by vfio_noiommu_group_alloc() */
        if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
                iommu_group_remove_device(device->dev);

        /*
         * Pairs with the drivers count set up in vfio_group_alloc() and
         * incremented in vfio_group_find_or_alloc().  Only the final
         * decrement continues, with vfio.group_lock held.
         */
        if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock))
                return;
        list_del(&group->vfio_next);

        /*
         * We could concurrently probe another driver in the group that might
         * race vfio_device_remove_group() with vfio_group_find_or_alloc(), so
         * we have to ensure that the sysfs is all cleaned up under lock
         * otherwise the cdev_device_add() will fail due to the name already
         * existing.
         */
        cdev_device_del(&group->cdev, &group->dev);

        mutex_lock(&group->group_lock);
        /*
         * These data structures all have paired operations that can only be
         * undone when the caller holds a live reference on the device. Since
         * all pairs must be undone these WARN_ON's indicate some caller did not
         * properly hold the group reference.
         */
        WARN_ON(!list_empty(&group->device_list));
        WARN_ON(group->notifier.head);

        /*
         * Revoke all users of group->iommu_group. At this point we know there
         * are no devices active because we are unplugging the last one. Setting
         * iommu_group to NULL blocks all new users.
         */
        if (group->container)
                vfio_group_detach_container(group);
        iommu_group = group->iommu_group;
        group->iommu_group = NULL;
        mutex_unlock(&group->group_lock);
        mutex_unlock(&vfio.group_lock);

        /* Drop the reference taken in vfio_group_alloc(), then the group device */
        iommu_group_put(iommu_group);
        put_device(&group->dev);
}

void vfio_device_group_register(struct vfio_device *device)
{
        mutex_lock(&device->group->device_lock);
        list_add(&device->group_next, &device->group->device_list);
        mutex_unlock(&device->group->device_lock);
}

void vfio_device_group_unregister(struct vfio_device *device)
{
        mutex_lock(&device->group->device_lock);
        list_del(&device->group_next);
        mutex_unlock(&device->group->device_lock);
}

/*
 * Start using the group's container on behalf of @device and register the
 * device with it.
 *
 * The caller must hold the group's group_lock.  Returns 0 on success,
 * -EINVAL (with a warning) when the group has no container, or the error
 * from vfio_group_use_container().
 */
int vfio_device_group_use_iommu(struct vfio_device *device)
{
        struct vfio_group *grp = device->group;
        int err;

        lockdep_assert_held(&grp->group_lock);

        if (WARN_ON(!grp->container))
                return -EINVAL;

        err = vfio_group_use_container(grp);
        if (!err)
                vfio_device_container_register(device);

        return err;
}

/*
 * Counterpart of vfio_device_group_use_iommu(): unregister @device from
 * the group's container and stop using the container.
 *
 * The caller must hold the group's group_lock.  Warns and does nothing
 * when the group has no container.
 */
void vfio_device_group_unuse_iommu(struct vfio_device *device)
{
        struct vfio_group *grp = device->group;

        lockdep_assert_held(&grp->group_lock);

        if (!WARN_ON(!grp->container)) {
                vfio_device_container_unregister(device);
                vfio_group_unuse_container(grp);
        }
}

bool vfio_device_has_container(struct vfio_device *device)
{
        return device->group->container;
}

/*
 * Return the vfio_group behind @file, or NULL when @file is not a vfio
 * group file (its f_op is not vfio_group_fops).
 */
struct vfio_group *vfio_group_from_file(struct file *file)
{
        if (file->f_op == &vfio_group_fops)
                return file->private_data;

        return NULL;
}

/**
 * vfio_file_iommu_group - Get the iommu_group behind a vfio group file
 * @file: VFIO group file
 *
 * Deprecated: only the SPAPR path in kvm should call this.
 *
 * Returns the iommu_group with an extra reference taken on it, or NULL when
 * CONFIG_SPAPR_TCE_IOMMU is disabled, @file is not a vfio group file, or
 * the group no longer has an iommu_group.  The returned iommu_group is
 * valid as long as a reference is held on the file.
 */
struct iommu_group *vfio_file_iommu_group(struct file *file)
{
        struct vfio_group *grp = vfio_group_from_file(file);
        struct iommu_group *igroup;

        if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU) || !grp)
                return NULL;

        mutex_lock(&grp->group_lock);
        igroup = grp->iommu_group;
        if (igroup)
                iommu_group_ref_get(igroup);
        mutex_unlock(&grp->group_lock);

        return igroup;
}
EXPORT_SYMBOL_GPL(vfio_file_iommu_group);

/**
 * vfio_file_is_group - True if the file is a vfio group file
 * @file: VFIO group file
 */
bool vfio_file_is_group(struct file *file)
{
        return vfio_group_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_group);

/*
 * Return true when every device on @group's device list reports
 * IOMMU_CAP_ENFORCE_CACHE_COHERENCY (vacuously true for an empty list),
 * false as soon as one device does not.
 */
bool vfio_group_enforced_coherent(struct vfio_group *group)
{
        struct vfio_device *cur;
        bool coherent = true;

        /*
         * A device lacking IOMMU_CAP_ENFORCE_CACHE_COHERENCY means no domain
         * attached to it later will support it either.  When the cap is
         * present, the iommu_domain eventually attached to the device/group
         * must use a domain with enforce_cache_coherency().
         */
        mutex_lock(&group->device_lock);
        list_for_each_entry(cur, &group->device_list, group_next) {
                coherent = device_iommu_capable(cur->dev,
                                        IOMMU_CAP_ENFORCE_CACHE_COHERENCY);
                if (!coherent)
                        break;
        }
        mutex_unlock(&group->device_lock);

        return coherent;
}

/*
 * Record @kvm as the KVM instance associated with @group (NULL clears the
 * association).  The pointer is updated under group->kvm_ref_lock; no
 * reference on @kvm is taken here.
 */
void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm)
{
        spin_lock(&group->kvm_ref_lock);
        group->kvm = kvm;
        spin_unlock(&group->kvm_ref_lock);
}

/**
 * vfio_file_has_dev - Test whether a VFIO group file covers a device
 * @file: VFIO file to check
 * @device: Device that must be part of the file
 *
 * Returns true when @file is a vfio group file and its group is the group
 * @device belongs to, i.e. the file has permission to manipulate the
 * device.  Returns false for any file that is not a vfio group file.
 */
bool vfio_file_has_dev(struct file *file, struct vfio_device *device)
{
        struct vfio_group *grp = vfio_group_from_file(file);

        return grp && grp == device->group;
}
EXPORT_SYMBOL_GPL(vfio_file_has_dev);

/*
 * devnode callback of the vfio class (installed in vfio_group_init()):
 * names the device node "vfio/<device name>".  @mode is left untouched.
 * Returns the string allocated by kasprintf().
 */
static char *vfio_devnode(const struct device *dev, umode_t *mode)
{
        return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}

/*
 * Module init for the group interface: set up the global group ida, lock
 * and list, initialise the container code, create the "vfio" class and
 * reserve the character device region for group minors.
 *
 * Returns 0 on success or a negative errno; on failure everything set up
 * so far by the container and class steps is torn down again.
 */
int __init vfio_group_init(void)
{
        int err;

        ida_init(&vfio.group_ida);
        mutex_init(&vfio.group_lock);
        INIT_LIST_HEAD(&vfio.group_list);

        err = vfio_container_init();
        if (err)
                return err;

        /* /dev/vfio/$GROUP */
        vfio.class = class_create("vfio");
        if (IS_ERR(vfio.class)) {
                err = PTR_ERR(vfio.class);
                vfio_container_cleanup();
                return err;
        }

        vfio.class->devnode = vfio_devnode;

        err = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio");
        if (err) {
                class_destroy(vfio.class);
                vfio.class = NULL;
                vfio_container_cleanup();
        }

        return err;
}

/*
 * Module exit counterpart of vfio_group_init(): releases the ida, the
 * character device region and the class, then cleans up the container
 * code.  Warns if any group is still on vfio.group_list.
 */
void vfio_group_cleanup(void)
{
        WARN_ON(!list_empty(&vfio.group_list));
        ida_destroy(&vfio.group_ida);
        unregister_chrdev_region(vfio.group_devt, MINORMASK + 1);
        class_destroy(vfio.class);
        vfio.class = NULL;
        vfio_container_cleanup();
}































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PATH_H
#define _LINUX_PATH_H

struct dentry;
struct vfsmount;

/*
 * A (vfsmount, dentry) pair.  Two paths compare equal when both pointers
 * match, see path_equal().
 */
struct path {
        struct vfsmount *mnt;
        struct dentry *dentry;
} __randomize_layout;

extern void path_get(const struct path *);
extern void path_put(const struct path *);

/*
 * Compare two paths by identity: returns 1 when both the mount and the
 * dentry pointers are the same, 0 otherwise.
 */
static inline int path_equal(const struct path *path1, const struct path *path2)
{
        if (path1->mnt != path2->mnt)
                return 0;

        return path1->dentry == path2->dentry;
}

/*
 * Cleanup macro for use with __free(path_put). Avoids dereference and
 * copying @path unlike DEFINE_FREE(). path_put() will handle the empty
 * path correctly just ensure @path is initialized:
 *
 * struct path path __free(path_put) = {};
 */
#define __free_path_put path_put

#endif  /* _LINUX_PATH_H */





































































































































































































































































































































































































































  249 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2015, 2016 ARM Ltd.
 */
#ifndef __KVM_ARM_VGIC_H
#define __KVM_ARM_VGIC_H

#include <linux/bits.h>
#include <linux/kvm.h>
#include <linux/irqreturn.h>
#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/static_key.h>
#include <linux/types.h>
#include <linux/xarray.h>
#include <kvm/iodev.h>
#include <linux/list.h>
#include <linux/jump_label.h>

#include <linux/irqchip/arm-gic-v4.h>

/* Maximum number of VCPUs per vGIC model */
#define VGIC_V3_MAX_CPUS        512
#define VGIC_V2_MAX_CPUS        8
#define VGIC_NR_IRQS_LEGACY     256
/* INTIDs 0..15 are SGIs, 16..31 are PPIs; together the private IRQs */
#define VGIC_NR_SGIS                16
#define VGIC_NR_PPIS                16
#define VGIC_NR_PRIVATE_IRQS        (VGIC_NR_SGIS + VGIC_NR_PPIS)
/* Highest SPI INTID accepted by irq_is_spi() */
#define VGIC_MAX_SPI                1019
#define VGIC_MAX_RESERVED        1023
#define VGIC_MIN_LPI                8192
/* NOTE(review): presumably the SPI count (1020 minus the 32 private IRQs) - confirm */
#define KVM_IRQCHIP_NUM_PINS        (1020 - 32)

/*
 * INTID range predicates.  Both macros evaluate @irq twice, so do not pass
 * an expression with side effects.
 *
 * irq_is_ppi(): VGIC_NR_SGIS <= irq < VGIC_NR_PRIVATE_IRQS
 * irq_is_spi(): VGIC_NR_PRIVATE_IRQS <= irq <= VGIC_MAX_SPI
 */
#define irq_is_ppi(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
#define irq_is_spi(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \
                         (irq) <= VGIC_MAX_SPI)

/* GIC model; used for the host GIC type in struct vgic_global */
enum vgic_type {
        VGIC_V2,                /* Good ol' GICv2 */
        VGIC_V3,                /* New fancy GICv3 */
};

/*
 * Global vGIC state: the same for all guests, as it depends only on the
 * _host's_ GIC model.  A single instance is declared as
 * kvm_vgic_global_state.
 */
struct vgic_global {
        /* type of the host GIC */
        enum vgic_type                type;

        /* Physical address of vgic virtual cpu interface */
        phys_addr_t                vcpu_base;

        /* GICV mapping, kernel VA */
        void __iomem                *vcpu_base_va;
        /* GICV mapping, HYP VA */
        void __iomem                *vcpu_hyp_va;

        /* virtual control interface mapping, kernel VA */
        void __iomem                *vctrl_base;
        /* virtual control interface mapping, HYP VA */
        void __iomem                *vctrl_hyp;

        /* Number of implemented list registers */
        int                        nr_lr;

        /* Maintenance IRQ number */
        unsigned int                maint_irq;

        /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
        int                        max_gic_vcpus;

        /* Only needed for the legacy KVM_CREATE_IRQCHIP */
        bool                        can_emulate_gicv2;

        /* Hardware has GICv4? */
        bool                        has_gicv4;
        /* NOTE(review): presumably set when the hardware has GICv4.1 - confirm */
        bool                        has_gicv4_1;

        /* Pseudo GICv3 from outer space */
        bool                        no_hw_deactivation;

        /* GIC system register CPU interface */
        struct static_key_false gicv3_cpuif;

        /* NOTE(review): presumably a cached copy of ICH_VTR_EL2 - confirm */
        u32                        ich_vtr_el2;
};

extern struct vgic_global kvm_vgic_global_state;

/* Upper bounds on the number of list registers per GIC model */
#define VGIC_V2_MAX_LRS                (1 << 6)
#define VGIC_V3_MAX_LRS                16
/*
 * Map list register number @lr to its reversed index, i.e. lr 0 maps to
 * VGIC_V3_MAX_LRS - 1 and lr VGIC_V3_MAX_LRS - 1 maps to 0.  The argument
 * is parenthesized so that compound expressions such as (a + b) expand
 * correctly.
 */
#define VGIC_V3_LR_INDEX(lr)        (VGIC_V3_MAX_LRS - 1 - (lr))

/* Trigger configuration of an interrupt, stored in vgic_irq::config */
enum vgic_irq_config {
        VGIC_CONFIG_EDGE = 0,
        VGIC_CONFIG_LEVEL
};

/*
 * Per-irq ops overriding some common behaviours.
 *
 * Always called in non-preemptible section and the functions can use
 * kvm_arm_get_running_vcpu() to get the vcpu pointer for private IRQs.
 */
struct irq_ops {
        /* Per interrupt flags for special-cased interrupts */
        unsigned long flags;

#define VGIC_IRQ_SW_RESAMPLE        BIT(0)        /* Clear the active state for resampling */

        /*
         * Callback function pointer to in-kernel devices that can tell us the
         * state of the input level of mapped level-triggered IRQ faster than
         * peeking into the physical GIC.
         */
        bool (*get_input_level)(int vintid);
};

/*
 * State of a single virtual interrupt.  The content is protected by
 * irq_lock.
 */
struct vgic_irq {
        raw_spinlock_t irq_lock;        /* Protects the content of the struct */
        struct rcu_head rcu;
        struct list_head ap_list;

        struct kvm_vcpu *vcpu;                /* SGIs and PPIs: The VCPU
                                         * SPIs and LPIs: The VCPU whose ap_list
                                         * this is queued on.
                                         */

        struct kvm_vcpu *target_vcpu;        /* The VCPU that this interrupt should
                                         * be sent to, as a result of the
                                         * targets reg (v2) or the
                                         * affinity reg (v3).
                                         */

        u32 intid;                        /* Guest visible INTID */
        bool line_level;                /* Level only */
        bool pending_latch;                /* The pending latch state used to calculate
                                         * the pending state for both level
                                         * and edge triggered IRQs. */
        bool active;                        /* not used for LPIs */
        bool enabled;
        bool hw;                        /* Tied to HW IRQ */
        struct kref refcount;                /* Used for LPIs */
        u32 hwintid;                        /* HW INTID number */
        unsigned int host_irq;                /* linux irq corresponding to hwintid */
        union {
                u8 targets;                        /* GICv2 target VCPUs mask */
                u32 mpidr;                        /* GICv3 target VCPU */
        };
        u8 source;                        /* GICv2 SGIs only */
        u8 active_source;                /* GICv2 SGIs only */
        u8 priority;
        u8 group;                        /* 0 == group 0, 1 == group 1 */
        enum vgic_irq_config config;        /* Level or edge */

        struct irq_ops *ops;                /* Optional per-irq overrides, may be NULL */

        void *owner;                        /* Opaque pointer to reserve an interrupt
                                           for in-kernel devices. */
};

static inline bool vgic_irq_needs_resampling(struct vgic_irq *irq)
{
        return irq->ops && (irq->ops->flags & VGIC_IRQ_SW_RESAMPLE);
}

struct vgic_register_region;
struct vgic_its;

/* Kind of emulated register frame, stored in vgic_io_device::iodev_type */
enum iodev_type {
        IODEV_CPUIF,
        IODEV_DIST,
        IODEV_REDIST,
        IODEV_ITS
};

/*
 * One emulated vGIC register frame: its base address, the table of
 * register regions it is made of and the embedded kvm_io_device.
 */
struct vgic_io_device {
        gpa_t base_addr;
        /* NOTE(review): which member is valid presumably depends on iodev_type - confirm */
        union {
                struct kvm_vcpu *redist_vcpu;
                struct vgic_its *its;
        };
        const struct vgic_register_region *regions;
        enum iodev_type iodev_type;
        int nr_regions;                        /* presumably the number of entries in regions - confirm */
        struct kvm_io_device dev;
};

/* State of one emulated ITS instance */
struct vgic_its {
        /* The base address of the ITS control register frame */
        gpa_t                        vgic_its_base;

        bool                        enabled;
        struct vgic_io_device        iodev;
        struct kvm_device        *dev;

        /* These registers correspond to GITS_BASER{0,1} */
        u64                        baser_device_table;
        u64                        baser_coll_table;

        /* Protects the command queue */
        struct mutex                cmd_lock;
        u64                        cbaser;
        u32                        creadr;
        u32                        cwriter;

        /* migration ABI revision in use */
        u32                        abi_rev;

        /* Protects the device and collection lists */
        struct mutex                its_lock;
        struct list_head        device_list;
        struct list_head        collection_list;

        /*
         * Caches the (device_id, event_id) -> vgic_irq translation for
         * LPIs that are mapped and enabled.
         */
        struct xarray                translation_cache;
};

struct vgic_state_iter;

/*
 * One region of GICv3 redistributors.
 * NOTE(review): @list presumably links the region into
 * vgic_dist.rd_regions -- confirm against the code that adds regions.
 */
struct vgic_redist_region {
        u32 index;
        gpa_t base;
        u32 count; /* number of redistributors or 0 if single region */
        u32 free_index; /* index of the next free redistributor */
        struct list_head list;
};

/*
 * Per-VM vgic (distributor) state. It is reached as (kvm)->arch.vgic by
 * the irqchip_in_kernel(), vgic_initialized(), vgic_ready() and
 * vgic_valid_spi() helper macros in this header.
 */
struct vgic_dist {
        /* Tested by irqchip_in_kernel(), vgic_ready() and vgic_initialized() */
        bool                        in_kernel;
        bool                        ready;
        bool                        initialized;

        /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
        u32                        vgic_model;

        /* Implementation revision as reported in the GICD_IIDR */
        u32                        implementation_rev;
#define KVM_VGIC_IMP_REV_2        2 /* GICv2 restorable groups */
#define KVM_VGIC_IMP_REV_3        3 /* GICv3 GICR_CTLR.{IW,CES,RWP} */
#define KVM_VGIC_IMP_REV_LATEST        KVM_VGIC_IMP_REV_3

        /* Userspace can write to GICv2 IGROUPR */
        bool                        v2_groups_user_writable;

        /* Do injected MSIs require an additional device ID? */
        bool                        msis_require_devid;

        /*
         * Number of SPIs; vgic_valid_spi() accepts INTIDs in
         * [VGIC_NR_PRIVATE_IRQS, VGIC_NR_PRIVATE_IRQS + nr_spis).
         */
        int                        nr_spis;

        /* The GIC maintenance IRQ for nested hypervisors. */
        u32                        mi_intid;

        /* base addresses in guest physical address space: */
        gpa_t                        vgic_dist_base;                /* distributor */
        union {
                /* either a GICv2 CPU interface */
                gpa_t                        vgic_cpu_base;
                /* or a number of GICv3 redistributor regions */
                struct list_head rd_regions;
        };

        /* distributor enabled */
        bool                        enabled;

        /* Wants SGIs without active state */
        bool                        nassgireq;

        /* presumably an array of nr_spis entries -- TODO confirm at allocation site */
        struct vgic_irq                *spis;

        struct vgic_io_device        dist_iodev;

        bool                        has_its;
        bool                        table_write_in_progress;

        /*
         * Contains the attributes and gpa of the LPI configuration table.
         * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
         * one address across all redistributors.
         * GICv3 spec: IHI 0069E 6.1.1 "LPI Configuration tables"
         */
        u64                        propbaser;

#define LPI_XA_MARK_DEBUG_ITER        XA_MARK_0
        struct xarray                lpi_xa;

        /* used by vgic-debug */
        struct vgic_state_iter *iter;

        /*
         * GICv4 ITS per-VM data, containing the IRQ domain, the VPE
         * array, the property table pointer as well as allocation
         * data. This essentially ties the Linux IRQ core and ITS
         * together, and avoids leaking KVM's data structures anywhere
         * else.
         */
        struct its_vm                its_vm;
};

/*
 * GICv2 flavour of the CPU interface state held in struct vgic_cpu.
 * The list register array is sized by VGIC_V2_MAX_LRS.
 */
struct vgic_v2_cpu_if {
        u32                vgic_hcr;
        u32                vgic_vmcr;
        u32                vgic_apr;
        u32                vgic_lr[VGIC_V2_MAX_LRS];

        /* presumably the number of vgic_lr[] entries currently in use -- TODO confirm */
        unsigned int used_lrs;
};

/*
 * GICv3 flavour of the CPU interface state held in struct vgic_cpu.
 * The list register array is sized by VGIC_V3_MAX_LRS and holds 64-bit
 * entries, unlike the 32-bit entries of the GICv2 variant.
 */
struct vgic_v3_cpu_if {
        u32                vgic_hcr;
        u32                vgic_vmcr;
        u32                vgic_sre;        /* Restored only, change ignored */
        u32                vgic_ap0r[4];
        u32                vgic_ap1r[4];
        u64                vgic_lr[VGIC_V3_MAX_LRS];

        /*
         * GICv4 ITS per-VPE data, containing the doorbell IRQ, the
         * pending table pointer, the its_vm pointer and a few other
         * HW specific things. As for the its_vm structure, this is
         * linking the Linux IRQ subsystem and the ITS together.
         */
        struct its_vpe        its_vpe;

        /* presumably the number of vgic_lr[] entries currently in use -- TODO confirm */
        unsigned int used_lrs;
};

/*
 * Per-VCPU vgic state: the CPU interface registers (GICv2 or GICv3 layout
 * sharing one union), the private IRQs, the AP list with its lock, and the
 * GICv3-only redistributor state.
 */
struct vgic_cpu {
        /* CPU vif control registers for world switch */
        union {
                struct vgic_v2_cpu_if        vgic_v2;
                struct vgic_v3_cpu_if        vgic_v3;
        };

        struct vgic_irq *private_irqs;

        raw_spinlock_t ap_list_lock;        /* Protects the ap_list */

        /*
         * List of IRQs that this VCPU should consider because they are either
         * Active or Pending (hence the name; AP list), or because they recently
         * were one of the two and need to be migrated off this list to another
         * VCPU.
         */
        struct list_head ap_list_head;

        /*
         * Members below are used with GICv3 emulation only and represent
         * parts of the redistributor.
         */
        struct vgic_io_device        rd_iodev;
        /* NOTE(review): presumably the region this redistributor belongs to, and its slot in it -- confirm */
        struct vgic_redist_region *rdreg;
        u32 rdreg_index;
        atomic_t syncr_busy;

        /* Contains the attributes and gpa of the LPI pending tables. */
        u64 pendbaser;
        /* GICR_CTLR.{ENABLE_LPIS,RWP} */
        atomic_t ctlr;

        /* Cache guest priority bits */
        u32 num_pri_bits;

        /* Cache guest interrupt ID bits */
        u32 num_id_bits;
};

extern struct static_key_false vgic_v2_cpuif_trap;
extern struct static_key_false vgic_v3_cpuif_trap;

int kvm_set_legacy_vgic_v2_addr(struct kvm *kvm, struct kvm_arm_device_addr *dev_addr);
void kvm_vgic_early_init(struct kvm *kvm);
int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu);
int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu);
int kvm_vgic_create(struct kvm *kvm, u32 type);
void kvm_vgic_destroy(struct kvm *kvm);
void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
int kvm_vgic_map_resources(struct kvm *kvm);
int kvm_vgic_hyp_init(void);
void kvm_vgic_init_cpu_hardware(void);

int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
                        unsigned int intid, bool level, void *owner);
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
                          u32 vintid, struct irq_ops *ops);
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid);
int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid);
bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid);

int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);

void kvm_vgic_load(struct kvm_vcpu *vcpu);
void kvm_vgic_put(struct kvm_vcpu *vcpu);

u16 vgic_v3_get_eisr(struct kvm_vcpu *vcpu);
u16 vgic_v3_get_elrsr(struct kvm_vcpu *vcpu);
u64 vgic_v3_get_misr(struct kvm_vcpu *vcpu);

/*
 * Accessors for the per-VM vgic state at (k)->arch.vgic, where k is a
 * struct kvm pointer. irqchip_in_kernel() normalises its result to 0/1
 * with !!; the other two return the raw flag.
 */
#define irqchip_in_kernel(k)        (!!((k)->arch.vgic.in_kernel))
#define vgic_initialized(k)        ((k)->arch.vgic.initialized)
#define vgic_ready(k)                ((k)->arch.vgic.ready)
/*
 * True if INTID i lies in the half-open range
 * [VGIC_NR_PRIVATE_IRQS, VGIC_NR_PRIVATE_IRQS + nr_spis).
 * Note that i is evaluated twice.
 */
#define vgic_valid_spi(k, i)        (((i) >= VGIC_NR_PRIVATE_IRQS) && \
                        ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))

bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid);

void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg, bool allow_group1);

/**
 * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
 *
 * The host's GIC naturally limits the maximum number of VCPUs a guest
 * can use.
 */
static inline int kvm_vgic_get_max_vcpus(void)
{
        /* The limit is read from the global vgic state. */
        const int max_vcpus = kvm_vgic_global_state.max_gic_vcpus;

        return max_vcpus;
}

/**
 * kvm_vgic_setup_default_irq_routing:
 * Setup a default flat gsi routing table mapping all SPIs
 */
int kvm_vgic_setup_default_irq_routing(struct kvm *kvm);

int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner);

struct kvm_kernel_irq_routing_entry;

int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int irq,
                               struct kvm_kernel_irq_routing_entry *irq_entry);

int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int irq,
                                 struct kvm_kernel_irq_routing_entry *irq_entry);

int vgic_v4_load(struct kvm_vcpu *vcpu);
void vgic_v4_commit(struct kvm_vcpu *vcpu);
int vgic_v4_put(struct kvm_vcpu *vcpu);

bool vgic_state_is_nested(struct kvm_vcpu *vcpu);

/* CPU HP callbacks */
void kvm_vgic_cpu_up(void);
void kvm_vgic_cpu_down(void);

#endif /* __KVM_ARM_VGIC_H */






































































































































































































































  894 





























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/memory.h
 *
 * Copyright (C) 2000-2002 Russell King
 * Copyright (C) 2012 ARM Ltd.
 *
 * Note: this file should not be included by non-asm/.h files
 */
#ifndef __ASM_MEMORY_H
#define __ASM_MEMORY_H

#include <linux/const.h>
#include <linux/sizes.h>
#include <asm/page-def.h>

/*
 * Size of the PCI I/O space. This must remain a power of two so that
 * IO_SPACE_LIMIT acts as a mask for the low bits of I/O addresses.
 */
#define PCI_IO_SIZE                SZ_16M

/*
 * VMEMMAP_SIZE - allows the whole linear region to be covered by
 *                a struct page array
 *
 * If we are configured with a 52-bit kernel VA then our VMEMMAP_SIZE
 * needs to cover the memory region from the beginning of the 52-bit
 * PAGE_OFFSET all the way to PAGE_END for 48-bit. This allows us to
 * keep a constant PAGE_OFFSET and "fallback" to using the higher end
 * of the VMEMMAP where 52-bit support is not available in hardware.
 */
#define VMEMMAP_RANGE        (_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET)
#define VMEMMAP_SIZE        ((VMEMMAP_RANGE >> PAGE_SHIFT) * sizeof(struct page))

/*
 * PAGE_OFFSET - the virtual address of the start of the linear map, at the
 *               start of the TTBR1 address space.
 * PAGE_END - the end of the linear map, where all other kernel mappings begin.
 * KIMAGE_VADDR - the virtual address of the start of the kernel image.
 * VA_BITS - the maximum number of bits for virtual addresses.
 */
#define VA_BITS                        (CONFIG_ARM64_VA_BITS)
/* -(2^va): with unsigned wraparound, the base of the topmost 2^va bytes. */
#define _PAGE_OFFSET(va)        (-(UL(1) << (va)))
#define PAGE_OFFSET                (_PAGE_OFFSET(VA_BITS))
/*
 * Several macros below refer to names that are only #defined further down
 * (MODULES_END, VMEMMAP_END, VA_BITS_MIN, _PAGE_END). This works because a
 * macro body is expanded at its point of use, not at its definition.
 */
#define KIMAGE_VADDR                (MODULES_END)
#define MODULES_END                (MODULES_VADDR + MODULES_VSIZE)
#define MODULES_VADDR                (_PAGE_END(VA_BITS_MIN))
#define MODULES_VSIZE                (SZ_2G)
/* The vmemmap array ends 1 GiB below the top of the address space. */
#define VMEMMAP_START                (VMEMMAP_END - VMEMMAP_SIZE)
#define VMEMMAP_END                (-UL(SZ_1G))
/* PCI I/O space starts 8 MiB above VMEMMAP_END and spans PCI_IO_SIZE. */
#define PCI_IO_START                (VMEMMAP_END + SZ_8M)
#define PCI_IO_END                (PCI_IO_START + PCI_IO_SIZE)
/* The fixmap tops out 8 MiB below the top of the address space. */
#define FIXADDR_TOP                (-UL(SZ_8M))

#if VA_BITS > 48
#ifdef CONFIG_ARM64_16K_PAGES
#define VA_BITS_MIN                (47)
#else
#define VA_BITS_MIN                (48)
#endif
#else
#define VA_BITS_MIN                (VA_BITS)
#endif

#define _PAGE_END(va)                (-(UL(1) << ((va) - 1)))

#define KERNEL_START                _text
#define KERNEL_END                _end

/*
 * Generic and Software Tag-Based KASAN modes require 1/8th and 1/16th of the
 * kernel virtual address space for storing the shadow memory respectively.
 *
 * The mapping between a virtual memory address and its corresponding shadow
 * memory address is defined based on the formula:
 *
 *     shadow_addr = (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET
 *
 * where KASAN_SHADOW_SCALE_SHIFT is the order of the number of bits that map
 * to a single shadow byte and KASAN_SHADOW_OFFSET is a constant that offsets
 * the mapping. Note that KASAN_SHADOW_OFFSET does not point to the start of
 * the shadow memory region.
 *
 * Based on this mapping, we define two constants:
 *
 *     KASAN_SHADOW_START: the start of the shadow memory region;
 *     KASAN_SHADOW_END: the end of the shadow memory region.
 *
 * KASAN_SHADOW_END is defined first as the shadow address that corresponds to
 * the upper bound of possible virtual kernel memory addresses UL(1) << 64
 * according to the mapping formula.
 *
 * KASAN_SHADOW_START is defined second based on KASAN_SHADOW_END. The shadow
 * memory start must map to the lowest possible kernel virtual memory address
 * and thus it depends on the actual bitness of the address space.
 *
 * As KASAN inserts redzones between stack variables, this increases the stack
 * memory usage significantly. Thus, we double the (minimum) stack size.
 */
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_SHADOW_OFFSET        _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
/* Shadow address of (UL(1) << 64) under the mapping formula above. */
#define KASAN_SHADOW_END        ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) + KASAN_SHADOW_OFFSET)
/* Shadow region covering a va-bit address space ends at KASAN_SHADOW_END. */
#define _KASAN_SHADOW_START(va)        (KASAN_SHADOW_END - (UL(1) << ((va) - KASAN_SHADOW_SCALE_SHIFT)))
#define KASAN_SHADOW_START        _KASAN_SHADOW_START(vabits_actual)
/* With shadow memory present, the linear map ends where the shadow begins. */
#define PAGE_END                KASAN_SHADOW_START
/* Adds one to MIN_THREAD_SHIFT, i.e. doubles the minimum stack size. */
#define KASAN_THREAD_SHIFT        1
#else
#define KASAN_THREAD_SHIFT        0
#define PAGE_END                (_PAGE_END(VA_BITS_MIN))
#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */

#define DIRECT_MAP_PHYSMEM_END        __pa(PAGE_END - 1)

#define MIN_THREAD_SHIFT        (14 + KASAN_THREAD_SHIFT)

/*
 * VMAP'd stacks are allocated at page granularity, so we must ensure that such
 * stacks are a multiple of page size.
 */
#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT)
#define THREAD_SHIFT                PAGE_SHIFT
#else
#define THREAD_SHIFT                MIN_THREAD_SHIFT
#endif

#if THREAD_SHIFT >= PAGE_SHIFT
#define THREAD_SIZE_ORDER        (THREAD_SHIFT - PAGE_SHIFT)
#endif

#define THREAD_SIZE                (UL(1) << THREAD_SHIFT)

/*
 * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
 * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
 * assembly.
 */
#ifdef CONFIG_VMAP_STACK
#define THREAD_ALIGN                (2 * THREAD_SIZE)
#else
#define THREAD_ALIGN                THREAD_SIZE
#endif

#define IRQ_STACK_SIZE                THREAD_SIZE

#define OVERFLOW_STACK_SIZE        SZ_4K

#define NVHE_STACK_SHIFT       PAGE_SHIFT
#define NVHE_STACK_SIZE        (UL(1) << NVHE_STACK_SHIFT)

/*
 * With the minimum frame size of [x29, x30], exactly half the combined
 * sizes of the hyp and overflow stacks is the maximum size needed to
 * save the unwound stacktrace; plus an additional entry to delimit the
 * end.
 */
#define NVHE_STACKTRACE_SIZE        ((OVERFLOW_STACK_SIZE + NVHE_STACK_SIZE) / 2 + sizeof(long))

/*
 * Alignment of kernel segments (e.g. .text, .data).
 *
 *  4 KB granule:  16 level 3 entries, with contiguous bit
 * 16 KB granule:   4 level 3 entries, without contiguous bit
 * 64 KB granule:   1 level 3 entry
 */
#define SEGMENT_ALIGN                SZ_64K

/*
 * Memory types available.
 *
 * IMPORTANT: MT_NORMAL must be index 0 since vm_get_page_prot() may 'or' in
 *              the MT_NORMAL_TAGGED memory type for PROT_MTE mappings. Note
 *              that protection_map[] only contains MT_NORMAL attributes.
 */
#define MT_NORMAL                0
#define MT_NORMAL_TAGGED        1
#define MT_NORMAL_NC                2
#define MT_DEVICE_nGnRnE        3
#define MT_DEVICE_nGnRE                4

/*
 * Memory types for Stage-2 translation
 */
#define MT_S2_NORMAL                0xf
#define MT_S2_NORMAL_NC                0x5
#define MT_S2_DEVICE_nGnRE        0x1

/*
 * Memory types for Stage-2 translation when ID_AA64MMFR2_EL1.FWB is 0001
 * Stage-2 enforces Normal-WB and Device-nGnRE
 */
#define MT_S2_FWB_NORMAL        6
#define MT_S2_FWB_NORMAL_NC        5
#define MT_S2_FWB_DEVICE_nGnRE        1

#ifdef CONFIG_ARM64_4K_PAGES
#define IOREMAP_MAX_ORDER        (PUD_SHIFT)
#else
#define IOREMAP_MAX_ORDER        (PMD_SHIFT)
#endif

/*
 *  Open-coded (swapper_pg_dir - reserved_pg_dir) as this cannot be calculated
 *  until link time.
 */
#define RESERVED_SWAPPER_OFFSET        (PAGE_SIZE)

/*
 *  Open-coded (swapper_pg_dir - tramp_pg_dir) as this cannot be calculated
 *  until link time.
 */
#define TRAMP_SWAPPER_OFFSET        (2 * PAGE_SIZE)

#ifndef __ASSEMBLY__

#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/mmdebug.h>
#include <linux/types.h>
#include <asm/boot.h>
#include <asm/bug.h>
#include <asm/sections.h>
#include <asm/sysreg.h>

/*
 * Read the TCR_EL1 system register.
 *
 * The read uses a plain (non-volatile) asm statement and the function is
 * marked __pure, rather than going through read_sysreg() and its asm
 * volatile. NOTE(review): presumably this is so the compiler may merge or
 * drop repeated reads -- confirm against the commit that introduced it.
 *
 * Return: the 64-bit value of TCR_EL1.
 */
static inline u64 __pure read_tcr(void)
{
        u64  tcr;

        // read_sysreg() uses asm volatile, so avoid it here
        asm("mrs %0, tcr_el1" : "=r"(tcr));
        return tcr;
}

/*
 * vabits_actual - number of virtual address bits in use.
 *
 * When VA_BITS > 48 it is derived at run time as 64 minus the 6-bit field
 * at bit 16 of TCR_EL1 (the T1SZ field, per the note below). Otherwise it
 * is the compile-time constant VA_BITS.
 */
#if VA_BITS > 48
// For reasons of #include hell, we can't use TCR_T1SZ_OFFSET/TCR_T1SZ_MASK here
#define vabits_actual                (64 - ((read_tcr() >> 16) & 63))
#else
#define vabits_actual                ((u64)VA_BITS)
#endif

extern s64                        memstart_addr;
/* PHYS_OFFSET - the physical address of the start of memory. */
#define PHYS_OFFSET                ({ VM_BUG_ON(memstart_addr & 1); memstart_addr; })

/* the offset between the kernel virtual and physical mappings */
extern u64                        kimage_voffset;

static inline unsigned long kaslr_offset(void)
{
        return (u64)&_text - KIMAGE_VADDR;
}

/*
 * KASLR hooks. With CONFIG_RANDOMIZE_BASE, kaslr_init() is provided
 * elsewhere and kaslr_enabled() reports the __kaslr_is_enabled flag.
 * Without it, kaslr_init() is an empty stub and kaslr_enabled() is
 * always false.
 */
#ifdef CONFIG_RANDOMIZE_BASE
void kaslr_init(void);
static inline bool kaslr_enabled(void)
{
        /* Declared at function scope so the flag is not visible to other users of this header. */
        extern bool __kaslr_is_enabled;
        return __kaslr_is_enabled;
}
#else
static inline void kaslr_init(void) { }
static inline bool kaslr_enabled(void) { return false; }
#endif

/*
 * Allow all memory at the discovery stage. We will clip it later.
 */
#define MIN_MEMBLOCK_ADDR        0
#define MAX_MEMBLOCK_ADDR        U64_MAX

/*
 * PFNs are used to describe any physical page; this means
 * PFN 0 == physical address 0.
 *
 * This is the PFN of the first RAM page in the kernel
 * direct-mapped view.  We assume this is the first page
 * of RAM in the mem_map as well.
 */
#define PHYS_PFN_OFFSET        (PHYS_OFFSET >> PAGE_SHIFT)

/*
 * When dealing with data aborts, watchpoints, or instruction traps we may end
 * up with a tagged userland pointer. Clear the tag to get a sane pointer to
 * pass on to access_ok(), for instance.
 */
/*
 * __untagged_addr() replaces the bits above bit 55 of addr with copies of
 * bit 55 (assuming sign_extend64(value, index) sign-extends from bit
 * 'index' -- see its definition), keeping the type of addr.
 */
#define __untagged_addr(addr)        \
        ((__force __typeof__(addr))sign_extend64((__force u64)(addr), 55))

/*
 * untagged_addr() ANDs the address with its sign-extended form. If bit 55
 * is clear the bits above it are cleared; if bit 55 is set the AND leaves
 * the address, including its top bits, unchanged. addr is evaluated once.
 */
#define untagged_addr(addr)        ({                                        \
        u64 __addr = (__force u64)(addr);                                        \
        __addr &= __untagged_addr(__addr);                                \
        (__force __typeof__(addr))__addr;                                \
})

/*
 * Pointer tag helpers. With a tag-based KASAN mode the tag occupies bits
 * 63:56 of the address: __tag_shifted() moves a tag into that position,
 * __tag_get() extracts it and __tag_reset() strips it via
 * __untagged_addr(). Otherwise the helpers collapse to no-ops (tag 0,
 * address unchanged).
 */
#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
#define __tag_shifted(tag)        ((u64)(tag) << 56)
#define __tag_reset(addr)        __untagged_addr(addr)
#define __tag_get(addr)                (__u8)((u64)(addr) >> 56)
#else
#define __tag_shifted(tag)        0UL
#define __tag_reset(addr)        (addr)
#define __tag_get(addr)                0
#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */

/*
 * Return @addr with its tag bits (as positioned by __tag_shifted())
 * replaced by @tag. When __tag_shifted() evaluates to 0 the address is
 * returned unchanged.
 */
static inline const void *__tag_set(const void *addr, u8 tag)
{
        u64 bits = (u64)addr;

        /* Drop whatever tag is currently present, then merge in the new one. */
        bits &= ~__tag_shifted(0xff);
        bits |= __tag_shifted(tag);

        return (const void *)bits;
}

#ifdef CONFIG_KASAN_HW_TAGS
#define arch_enable_tag_checks_sync()                mte_enable_kernel_sync()
#define arch_enable_tag_checks_async()                mte_enable_kernel_async()
#define arch_enable_tag_checks_asymm()                mte_enable_kernel_asymm()
#define arch_suppress_tag_checks_start()        mte_enable_tco()
#define arch_suppress_tag_checks_stop()                mte_disable_tco()
#define arch_force_async_tag_fault()                mte_check_tfsr_exit()
#define arch_get_random_tag()                        mte_get_random_tag()
#define arch_get_mem_tag(addr)                        mte_get_mem_tag(addr)
#define arch_set_mem_tag_range(addr, size, tag, init)        \
                        mte_set_mem_tag_range((addr), (size), (tag), (init))
#endif /* CONFIG_KASAN_HW_TAGS */

/*
 * Physical vs virtual RAM address space conversion.  These are
 * private definitions which should NOT be used outside memory.h
 * files.  Use virt_to_phys/phys_to_virt/__pa/__va instead.
 */


/*
 * Check whether an arbitrary address is within the linear map, which
 * lives in the [PAGE_OFFSET, PAGE_END) interval at the bottom of the
 * kernel's TTBR1 address range.
 */
/*
 * A single unsigned comparison covers both bounds: an address below
 * PAGE_OFFSET wraps around to a huge value on subtraction and fails the
 * "<" test, just like an address at or above PAGE_END.
 */
#define __is_lm_address(addr)        (((u64)(addr) - PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET))

/* Linear-map VA -> PA: offset into the linear map plus PHYS_OFFSET. */
#define __lm_to_phys(addr)        (((addr) - PAGE_OFFSET) + PHYS_OFFSET)
/* Kernel-image VA -> PA: subtract the image's virtual/physical offset. */
#define __kimg_to_phys(addr)        ((addr) - kimage_voffset)

/*
 * Strip any tag from x, then translate it as a linear-map address if it
 * lies in the linear map and as a kernel-image address otherwise.
 */
#define __virt_to_phys_nodebug(x) ({                                        \
        phys_addr_t __x = (phys_addr_t)(__tag_reset(x));                \
        __is_lm_address(__x) ? __lm_to_phys(__x) : __kimg_to_phys(__x);        \
})

/* Symbol addresses are always translated as kernel-image addresses. */
#define __pa_symbol_nodebug(x)        __kimg_to_phys((phys_addr_t)(x))

#ifdef CONFIG_DEBUG_VIRTUAL
extern phys_addr_t __virt_to_phys(unsigned long x);
extern phys_addr_t __phys_addr_symbol(unsigned long x);
#else
#define __virt_to_phys(x)        __virt_to_phys_nodebug(x)
#define __phys_addr_symbol(x)        __pa_symbol_nodebug(x)
#endif /* CONFIG_DEBUG_VIRTUAL */

/* PA -> linear-map VA: offset from PHYS_OFFSET, OR-ed with PAGE_OFFSET. */
#define __phys_to_virt(x)        ((unsigned long)((x) - PHYS_OFFSET) | PAGE_OFFSET)
/* PA -> kernel-image VA: add the image's virtual/physical offset. */
#define __phys_to_kimg(x)        ((unsigned long)((x) + kimage_voffset))

/*
 * Note: Drivers should NOT use these.  They are the wrong
 * translation for translating DMA addresses.  Use the driver
 * DMA support - see dma-mapping.h.
 */
#define virt_to_phys virt_to_phys
/* Translate the kernel virtual address @x to a physical address. */
static inline phys_addr_t virt_to_phys(const volatile void *x)
{
        unsigned long vaddr = (unsigned long)(x);

        return __virt_to_phys(vaddr);
}

#define phys_to_virt phys_to_virt
/* Translate the physical address @x to its linear-map virtual address. */
static inline void *phys_to_virt(phys_addr_t x)
{
        unsigned long vaddr = __phys_to_virt(x);

        return (void *)vaddr;
}

/* Needed already here for resolving __phys_to_pfn() in virt_to_pfn() */
#include <asm-generic/memory_model.h>

static inline unsigned long virt_to_pfn(const void *kaddr)
{
        return __phys_to_pfn(virt_to_phys(kaddr));
}

/*
 * Drivers should NOT use these either.
 */
/* VA -> PA for a linear-map or kernel-image address. */
#define __pa(x)                        __virt_to_phys((unsigned long)(x))
/* VA -> PA for a kernel symbol; RELOC_HIDE() is applied to the address before it is translated. */
#define __pa_symbol(x)                __phys_addr_symbol(RELOC_HIDE((unsigned long)(x), 0))
/* As __pa(), but always the unchecked macro variant. */
#define __pa_nodebug(x)                __virt_to_phys_nodebug((unsigned long)(x))
/* PA -> linear-map VA, as a void pointer. */
#define __va(x)                        ((void *)__phys_to_virt((phys_addr_t)(x)))
#define pfn_to_kaddr(pfn)        __va((pfn) << PAGE_SHIFT)
#define sym_to_pfn(x)                __phys_to_pfn(__pa_symbol(x))

/*
 *  virt_to_page(x)        convert a _valid_ virtual address to struct page *
 *  virt_addr_valid(x)        indicates whether a virtual address is valid
 */
#define ARCH_PFN_OFFSET                ((unsigned long)PHYS_PFN_OFFSET)

/*
 * struct page <-> linear-map address conversions.
 *
 * With CONFIG_DEBUG_VIRTUAL both directions go through the physical
 * address (page_to_phys()/__va() and virt_to_pfn()/pfn_to_page()).
 *
 * Otherwise they are pure arithmetic: the index of a struct page within
 * the array at VMEMMAP_START equals the index of its page within the
 * linear map at PAGE_OFFSET. virt_to_page() strips any tag from the
 * address first. In both configurations page_to_virt() applies the
 * page's KASAN tag to the result.
 */
#if defined(CONFIG_DEBUG_VIRTUAL)
#define page_to_virt(x)        ({                                                \
        __typeof__(x) __page = x;                                        \
        void *__addr = __va(page_to_phys(__page));                        \
        (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\
})
#define virt_to_page(x)                pfn_to_page(virt_to_pfn(x))
#else
#define page_to_virt(x)        ({                                                \
        __typeof__(x) __page = x;                                        \
        u64 __idx = ((u64)__page - VMEMMAP_START) / sizeof(struct page);\
        u64 __addr = PAGE_OFFSET + (__idx * PAGE_SIZE);                        \
        (void *)__tag_set((const void *)__addr, page_kasan_tag(__page));\
})

#define virt_to_page(x)        ({                                                \
        u64 __idx = (__tag_reset((u64)x) - PAGE_OFFSET) / PAGE_SIZE;        \
        u64 __addr = VMEMMAP_START + (__idx * sizeof(struct page));        \
        (struct page *)__addr;                                                \
})
#endif /* CONFIG_DEBUG_VIRTUAL */

/*
 * True if addr, with any tag stripped, lies inside the linear map and
 * pfn_is_map_memory() accepts its page frame number. The linear-map test
 * comes first, so virt_to_pfn() is only evaluated for linear-map addresses.
 */
#define virt_addr_valid(addr)        ({                                        \
        __typeof__(addr) __addr = __tag_reset(addr);                        \
        __is_lm_address(__addr) && pfn_is_map_memory(virt_to_pfn(__addr));        \
})

void dump_mem_limit(void);
#endif /* !__ASSEMBLY__ */

/*
 * Given that the GIC architecture permits ITS implementations that can only be
 * configured with a LPI table address once, GICv3 systems with many CPUs may
 * end up reserving a lot of different regions after a kexec for their LPI
 * tables (one per CPU), as we are forced to reuse the same memory after kexec
 * (and thus reserve it persistently with EFI beforehand)
 */
#if defined(CONFIG_EFI) && defined(CONFIG_ARM_GIC_V3_ITS)
# define INIT_MEMBLOCK_RESERVED_REGIONS        (INIT_MEMBLOCK_REGIONS + NR_CPUS + 1)
#endif

/*
 * Memory regions marked with the MEMBLOCK_NOMAP flag (for example, memory
 * of the EFI_UNUSABLE_MEMORY type) may split a contiguous memory block into
 * multiple parts. As a result, the number of memory regions can be large.
 */
#ifdef CONFIG_EFI
#define INIT_MEMBLOCK_MEMORY_REGIONS        (INIT_MEMBLOCK_REGIONS * 8)
#endif


#endif /* __ASM_MEMORY_H */























































































































































   24 



























































































   24 



















































   24 
























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM net

#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NET_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tracepoint.h>

/*
 * net_dev_start_xmit - trace event snapshotting an skb together with a device.
 *
 * @skb: buffer whose metadata is copied into the trace record.
 * @dev: device whose name is recorded.
 *
 * Recorded fields: device name, queue mapping, the skb address (printed with
 * %p, never dereferenced at print time), VLAN tag presence/proto/TCI,
 * protocol, ip_summed, len, data_len, network and transport header offsets,
 * and tx_flags plus gso_size/gso_segs/gso_type read from skb_shinfo(skb).
 */
TRACE_EVENT(net_dev_start_xmit,

        TP_PROTO(const struct sk_buff *skb, const struct net_device *dev),

        TP_ARGS(skb, dev),

        TP_STRUCT__entry(
                __string(        name,                        dev->name        )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        int,                        network_offset        )
                __field(        bool,                        transport_offset_valid)
                __field(        int,                        transport_offset)
                __field(        u8,                        tx_flags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_segs        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name);
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                /* vlan_proto and protocol are converted with ntohs() before storing. */
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->network_offset = skb_network_offset(skb);
                __entry->transport_offset_valid =
                        skb_transport_header_was_set(skb);
                /*
                 * transport_offset is stored as 0 when the transport header
                 * was not set; transport_offset_valid tells the two apart.
                 */
                __entry->transport_offset = skb_transport_header_was_set(skb) ?
                        skb_transport_offset(skb) : 0;
                __entry->tx_flags = skb_shinfo(skb)->tx_flags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_segs = skb_shinfo(skb)->gso_segs;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
                  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
                  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
                  __entry->protocol, __entry->ip_summed, __entry->len,
                  __entry->data_len,
                  __entry->network_offset, __entry->transport_offset_valid,
                  __entry->transport_offset, __entry->tx_flags,
                  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
);

/*
 * net_dev_xmit - trace event recording a result code for an skb on a device.
 *
 * @skb:     buffer being traced; only its address is stored, it is not
 *           dereferenced by this event.
 * @rc:      result code stored verbatim in the record.
 * @dev:     device whose name is recorded.
 * @skb_len: length to record for the buffer.
 */
TRACE_EVENT(net_dev_xmit,

        TP_PROTO(struct sk_buff *skb,
                 int rc,
                 struct net_device *dev,
                 unsigned int skb_len),

        TP_ARGS(skb, rc, dev, skb_len),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __field(        int,                rc                )
                __string(        name,                dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                /*
                 * len comes from the skb_len argument, not from skb->len.
                 * NOTE(review): presumably because the skb may no longer be
                 * safe to read when this event fires - confirm against callers.
                 */
                __entry->len = skb_len;
                __entry->rc = rc;
                __assign_str(name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
                __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
);

/*
 * net_dev_xmit_timeout - trace event recording a device, its driver name and
 * a queue index.
 *
 * @dev:         device whose name is recorded; the driver string is obtained
 *               from netdev_drivername(dev).
 * @queue_index: queue index stored verbatim and printed as "queue=".
 */
TRACE_EVENT(net_dev_xmit_timeout,

        TP_PROTO(struct net_device *dev,
                 int queue_index),

        TP_ARGS(dev, queue_index),

        TP_STRUCT__entry(
                __string(        name,                dev->name        )
                __string(        driver,                netdev_drivername(dev))
                __field(        int,                queue_index        )
        ),

        TP_fast_assign(
                __assign_str(name);
                __assign_str(driver);
                __entry->queue_index = queue_index;
        ),

        TP_printk("dev=%s driver=%s queue=%d",
                __get_str(name), __get_str(driver), __entry->queue_index)
);

/*
 * net_dev_template - event class recording the minimal identity of an skb:
 * its address, its length (skb->len) and the name of skb->dev.
 *
 * @skb: buffer being traced. skb->dev is dereferenced unconditionally to
 *       fetch the device name, so it must be set when the event fires.
 */
DECLARE_EVENT_CLASS(net_dev_template,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __field(        void *,                skbaddr                )
                __field(        unsigned int,        len                )
                __string(        name,                skb->dev->name        )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = skb->len;
                __assign_str(name);
        ),

        TP_printk("dev=%s skbaddr=%p len=%u",
                __get_str(name), __entry->skbaddr, __entry->len)
)

/* net_dev_queue: event of class net_dev_template (dev name, skb address, len). */
DEFINE_EVENT(net_dev_template, net_dev_queue,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/* netif_receive_skb: event of class net_dev_template (dev name, skb address, len). */
DEFINE_EVENT(net_dev_template, netif_receive_skb,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/* netif_rx: event of class net_dev_template (dev name, skb address, len). */
DEFINE_EVENT(net_dev_template, netif_rx,

        TP_PROTO(struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_verbose_template - event class recording a detailed snapshot of
 * an skb's metadata.
 *
 * @skb: buffer being traced. skb->dev is dereferenced unconditionally to
 *       fetch the device name, so it must be set when the event fires.
 *
 * Recorded fields: device name, napi_id, queue mapping, skb address, VLAN tag
 * presence/proto/TCI, protocol, ip_summed, hash and l4_hash, len, data_len,
 * truesize, MAC header validity and offset, and nr_frags/gso_size/gso_type
 * read from skb_shinfo(skb).
 */
DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb),

        TP_STRUCT__entry(
                __string(        name,                        skb->dev->name        )
                __field(        unsigned int,                napi_id                )
                __field(        u16,                        queue_mapping        )
                __field(        const void *,                skbaddr                )
                __field(        bool,                        vlan_tagged        )
                __field(        u16,                        vlan_proto        )
                __field(        u16,                        vlan_tci        )
                __field(        u16,                        protocol        )
                __field(        u8,                        ip_summed        )
                __field(        u32,                        hash                )
                __field(        bool,                        l4_hash                )
                __field(        unsigned int,                len                )
                __field(        unsigned int,                data_len        )
                __field(        unsigned int,                truesize        )
                __field(        bool,                        mac_header_valid)
                __field(        int,                        mac_header        )
                __field(        unsigned char,                nr_frags        )
                __field(        u16,                        gso_size        )
                __field(        u16,                        gso_type        )
        ),

        TP_fast_assign(
                __assign_str(name);
                /* napi_id is read from the skb only with CONFIG_NET_RX_BUSY_POLL; otherwise 0. */
#ifdef CONFIG_NET_RX_BUSY_POLL
                __entry->napi_id = skb->napi_id;
#else
                __entry->napi_id = 0;
#endif
                __entry->queue_mapping = skb->queue_mapping;
                __entry->skbaddr = skb;
                __entry->vlan_tagged = skb_vlan_tag_present(skb);
                /* vlan_proto and protocol are converted with ntohs() before storing. */
                __entry->vlan_proto = ntohs(skb->vlan_proto);
                __entry->vlan_tci = skb_vlan_tag_get(skb);
                __entry->protocol = ntohs(skb->protocol);
                __entry->ip_summed = skb->ip_summed;
                __entry->hash = skb->hash;
                __entry->l4_hash = skb->l4_hash;
                __entry->len = skb->len;
                __entry->data_len = skb->data_len;
                __entry->truesize = skb->truesize;
                __entry->mac_header_valid = skb_mac_header_was_set(skb);
                /*
                 * mac_header is the MAC header position relative to skb->data.
                 * It is computed even when mac_header_valid is false, so
                 * consumers should check mac_header_valid before trusting it.
                 */
                __entry->mac_header = skb_mac_header(skb) - skb->data;
                __entry->nr_frags = skb_shinfo(skb)->nr_frags;
                __entry->gso_size = skb_shinfo(skb)->gso_size;
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),

        TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
                  __get_str(name), __entry->napi_id, __entry->queue_mapping,
                  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
                  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
                  __entry->hash, __entry->l4_hash, __entry->len,
                  __entry->data_len, __entry->truesize,
                  __entry->mac_header_valid, __entry->mac_header,
                  __entry->nr_frags, __entry->gso_size, __entry->gso_type)
);

/* napi_gro_frags_entry: event of class net_dev_rx_verbose_template (detailed skb snapshot). */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/* napi_gro_receive_entry: event of class net_dev_rx_verbose_template (detailed skb snapshot). */
DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/* netif_receive_skb_entry: event of class net_dev_rx_verbose_template (detailed skb snapshot). */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * netif_receive_skb_list_entry: event of class net_dev_rx_verbose_template
 * (detailed skb snapshot). It takes a single skb, like the other events of
 * this class.
 */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/* netif_rx_entry: event of class net_dev_rx_verbose_template (detailed skb snapshot). */
DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry,

        TP_PROTO(const struct sk_buff *skb),

        TP_ARGS(skb)
);

/*
 * net_dev_rx_exit_template - event class recording a single integer result.
 *
 * @ret: value stored verbatim in the record and printed as "ret=%d".
 */
DECLARE_EVENT_CLASS(net_dev_rx_exit_template,

        TP_PROTO(int ret),

        TP_ARGS(ret),

        TP_STRUCT__entry(
                __field(int,        ret)
        ),

        TP_fast_assign(
                __entry->ret = ret;
        ),

        TP_printk("ret=%d", __entry->ret)
);

/* napi_gro_frags_exit: event of class net_dev_rx_exit_template (records ret). */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* napi_gro_receive_exit: event of class net_dev_rx_exit_template (records ret). */
DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* netif_receive_skb_exit: event of class net_dev_rx_exit_template (records ret). */
DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* netif_rx_exit: event of class net_dev_rx_exit_template (records ret). */
DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

/* netif_receive_skb_list_exit: event of class net_dev_rx_exit_template (records ret). */
DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit,

        TP_PROTO(int ret),

        TP_ARGS(ret)
);

#endif /* _TRACE_NET_H */

/* This part must be outside protection */
#include <trace/define_trace.h>


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 































   24 
























































































































































































































































   24 



   24 
   24 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   57 










   56 





































   56 









   57 












   56 











   57 




















































































































































    3 












































































































































































































































































































































































































































































































































































































































































































































































































































    3 














































































    3 


















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Definitions for the Interfaces handler.
 *
 * Version:        @(#)dev.h        1.0.10        08/12/93
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Corey Minyard <wf-rch!minyard@relay.EU.net>
 *                Donald J. Becker, <becker@cesdis.gsfc.nasa.gov>
 *                Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *                Bjorn Ekwall. <bj0rn@blox.se>
 *              Pekka Riikonen <priikone@poseidon.pspt.fi>
 *
 *                Moved to /usr/include/linux for NET3
 */
#ifndef _LINUX_NETDEVICE_H
#define _LINUX_NETDEVICE_H

#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/prefetch.h>
#include <asm/cache.h>
#include <asm/byteorder.h>
#include <asm/local.h>

#include <linux/percpu.h>
#include <linux/rculist.h>
#include <linux/workqueue.h>
#include <linux/dynamic_queue_limits.h>

#include <net/net_namespace.h>
#ifdef CONFIG_DCB
#include <net/dcbnl.h>
#endif
#include <net/netprio_cgroup.h>
#include <linux/netdev_features.h>
#include <linux/neighbour.h>
#include <linux/netdevice_xmit.h>
#include <uapi/linux/netdevice.h>
#include <uapi/linux/if_bonding.h>
#include <uapi/linux/pkt_cls.h>
#include <uapi/linux/netdev.h>
#include <linux/hashtable.h>
#include <linux/rbtree.h>
#include <net/net_trackers.h>
#include <net/net_debug.h>
#include <net/dropreason-core.h>
#include <net/neighbour_tables.h>

struct netpoll_info;
struct device;
struct ethtool_ops;
struct kernel_hwtstamp_config;
struct phy_device;
struct dsa_port;
struct ip_tunnel_parm_kern;
struct macsec_context;
struct macsec_ops;
struct netdev_config;
struct netdev_name_node;
struct sd_flow_limit;
struct sfp_bus;
/* 802.11 specific */
struct wireless_dev;
/* 802.15.4 specific */
struct wpan_dev;
struct mpls_dev;
/* UDP Tunnel offloads */
struct udp_tunnel_info;
struct udp_tunnel_nic_info;
struct udp_tunnel_nic;
struct bpf_prog;
struct xdp_buff;
struct xdp_frame;
struct xdp_metadata_ops;
struct xdp_md;
struct ethtool_netdev_state;
struct phy_link_topology;
struct hwtstamp_provider;

typedef u32 xdp_features_t;

void synchronize_net(void);
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops);
void netdev_sw_irq_coalesce_default_on(struct net_device *dev);

/* Backlog congestion levels: the two NET_RX_* result codes. */
#define NET_RX_SUCCESS                0        /* keep 'em coming, baby */
#define NET_RX_DROP                1        /* packet dropped */

/*
 * NOTE(review): presumably the maximum nesting depth of stacked devices;
 * the users of this limit are outside this part of the file - confirm.
 */
#define MAX_NEST_DEV 8

/*
 * Transmit return codes: transmit return codes originate from three different
 * namespaces:
 *
 * - qdisc return codes
 * - driver transmit return codes
 * - errno values
 *
 * Drivers are allowed to return any one of those in their hard_start_xmit()
 * function. Real network devices commonly used with qdiscs should only return
 * the driver transmit return codes though - when qdiscs are used, the actual
 * transmission happens asynchronously, so the value is not propagated to
 * higher layers. Virtual network devices transmit synchronously; in this case
 * the driver transmit return codes are consumed by dev_queue_xmit(), and all
 * others are propagated to higher layers.
 */

/* Return codes of qdisc ->enqueue(). */
#define NET_XMIT_SUCCESS        0x00
#define NET_XMIT_DROP           0x01    /* skb dropped */
#define NET_XMIT_CN             0x02    /* congestion notification */
#define NET_XMIT_MASK           0x0f    /* qdisc flags in net/sch_generic.h */

/*
 * NET_XMIT_CN is special: it does not guarantee that this packet was lost.
 * It signals that the device will soon be dropping packets, or already
 * drops some packets of the same priority, prompting us to send less
 * aggressively.
 *
 * net_xmit_eval()  maps NET_XMIT_CN to 0 and passes any other code through.
 * net_xmit_errno() maps NET_XMIT_CN to 0 and any other code to -ENOBUFS.
 */
#define net_xmit_eval(e)        ((e) != NET_XMIT_CN ? (e) : 0)
#define net_xmit_errno(e)       ((e) == NET_XMIT_CN ? 0 : -ENOBUFS)

/* Mask covering the driver transmit return codes below. */
#define NETDEV_TX_MASK                0xf0

/* Driver transmit return codes. */
typedef enum netdev_tx {
        /* INT_MIN enumerator makes sure the enum is signed */
        __NETDEV_TX_MIN        = INT_MIN,
        /* driver took care of packet */
        NETDEV_TX_OK        = 0x00,
        /* driver tx path was busy */
        NETDEV_TX_BUSY        = 0x10,
} netdev_tx_t;

/*
 * The ordering NETDEV_TX_MASK > NET_XMIT_MASK >= 0 is significant: a
 * hard_start_xmit() return value below NET_XMIT_MASK means the skb was
 * consumed.
 */
static inline bool dev_xmit_complete(int rc)
{
        /*
         * The skb has been consumed by a driver in each of these cases:
         * - the transmission succeeded (rc == NETDEV_TX_OK)
         * - transmitting failed (rc < 0)
         * - queueing to a different device failed (rc & NET_XMIT_MASK)
         * All of them satisfy rc < NET_XMIT_MASK, which is the expected
         * (likely) outcome.
         */
        return likely(rc < NET_XMIT_MASK);
}

/*
 *        Worst-case header length for the configured protocols:
 *        LL_MAX_HEADER is 128 with CONFIG_HYPERV_NET, or with
 *        CONFIG_MAC80211_MESH on top of CONFIG_WLAN / CONFIG_AX25;
 *        96 with CONFIG_WLAN / CONFIG_AX25 alone; 32 otherwise.
 */

#if defined(CONFIG_HYPERV_NET)
# define LL_MAX_HEADER 128
#elif defined(CONFIG_WLAN) || IS_ENABLED(CONFIG_AX25)
# ifdef CONFIG_MAC80211_MESH
#  define LL_MAX_HEADER 128
# else
#  define LL_MAX_HEADER 96
# endif
#else
# define LL_MAX_HEADER 32
#endif

/*
 *        MAX_HEADER adds 48 bytes to LL_MAX_HEADER as soon as any one of
 *        CONFIG_NET_IPIP, CONFIG_NET_IPGRE, CONFIG_IPV6_SIT or
 *        CONFIG_IPV6_TUNNEL is enabled.
 */
#if IS_ENABLED(CONFIG_NET_IPIP) || IS_ENABLED(CONFIG_NET_IPGRE) || \
    IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_IPV6_TUNNEL)
#define MAX_HEADER (LL_MAX_HEADER + 48)
#else
#define MAX_HEADER LL_MAX_HEADER
#endif

/*
 *        Old network device statistics. Fields are native words
 *        (unsigned long) so they can be read and written atomically.
 */

/*
 * NET_DEV_STAT(FIELD) declares one counter as an anonymous union, so the
 * same word is reachable both as a plain unsigned long named FIELD and as
 * an atomic_long_t named __FIELD.
 */
#define NET_DEV_STAT(FIELD)                        \
        union {                                        \
                unsigned long FIELD;                \
                atomic_long_t __##FIELD;        \
        }

/*
 * Every member below is one NET_DEV_STAT() union.  The member order defines
 * the structure layout, so it must not be rearranged casually.
 */
struct net_device_stats {
        NET_DEV_STAT(rx_packets);
        NET_DEV_STAT(tx_packets);
        NET_DEV_STAT(rx_bytes);
        NET_DEV_STAT(tx_bytes);
        NET_DEV_STAT(rx_errors);
        NET_DEV_STAT(tx_errors);
        NET_DEV_STAT(rx_dropped);
        NET_DEV_STAT(tx_dropped);
        NET_DEV_STAT(multicast);
        NET_DEV_STAT(collisions);
        NET_DEV_STAT(rx_length_errors);
        NET_DEV_STAT(rx_over_errors);
        NET_DEV_STAT(rx_crc_errors);
        NET_DEV_STAT(rx_frame_errors);
        NET_DEV_STAT(rx_fifo_errors);
        NET_DEV_STAT(rx_missed_errors);
        NET_DEV_STAT(tx_aborted_errors);
        NET_DEV_STAT(tx_carrier_errors);
        NET_DEV_STAT(tx_fifo_errors);
        NET_DEV_STAT(tx_heartbeat_errors);
        NET_DEV_STAT(tx_window_errors);
        NET_DEV_STAT(rx_compressed);
        NET_DEV_STAT(tx_compressed);
};
/* The helper macro is only meant for the structure above. */
#undef NET_DEV_STAT

/* per-cpu stats, allocated on demand.
 * Try to fit them in a single cache line, for dev_get_stats() sake.
 *
 * The structure holds four unsigned long counters and is aligned to their
 * combined size (4 * sizeof(unsigned long)).
 */
struct net_device_core_stats {
        unsigned long        rx_dropped;
        unsigned long        tx_dropped;
        unsigned long        rx_nohandler;
        unsigned long        rx_otherhost_dropped;
} __aligned(4 * sizeof(unsigned long));

#include <linux/cache.h>
#include <linux/skbuff.h>

struct neighbour;
struct neigh_parms;
struct sk_buff;

/*
 * One hardware address entry.  It carries both a list_head and an rb_node,
 * matching the list and tree members of struct netdev_hw_addr_list.
 */
struct netdev_hw_addr {
        struct list_head        list;
        struct rb_node                node;
        /* raw address bytes, up to MAX_ADDR_LEN of them */
        unsigned char                addr[MAX_ADDR_LEN];
        /* presumably one of the NETDEV_HW_ADDR_T_* values below - confirm */
        unsigned char                type;
#define NETDEV_HW_ADDR_T_LAN                1
#define NETDEV_HW_ADDR_T_SAN                2
#define NETDEV_HW_ADDR_T_UNICAST        3
#define NETDEV_HW_ADDR_T_MULTICAST        4
        bool                        global_use;
        /* a non-zero value selects the entry in the
         * netdev_for_each_synced_{uc,mc}_addr() iterators
         */
        int                        sync_cnt;
        int                        refcount;
        int                        synced;
        /* NOTE(review): presumably used to free the entry via RCU - confirm */
        struct rcu_head                rcu_head;
};

/*
 * A collection of hardware addresses: a list head with an entry count,
 * read through netdev_hw_addr_list_count(), plus an auxiliary rbtree.
 */
struct netdev_hw_addr_list {
        struct list_head        list;
        int                        count;

        /* Auxiliary tree for faster lookup on addition and deletion */
        struct rb_root                tree;
};

/* Accessors for a struct netdev_hw_addr_list: entry count and emptiness. */
#define netdev_hw_addr_list_count(l) ((l)->count)
#define netdev_hw_addr_list_empty(l) (netdev_hw_addr_list_count(l) == 0)
/* Iterate @ha over every entry linked on (l)->list. */
#define netdev_hw_addr_list_for_each(ha, l) \
        list_for_each_entry(ha, &(l)->list, list)

/* The same helpers applied to a device's (dev)->uc list. */
#define netdev_uc_count(dev) netdev_hw_addr_list_count(&(dev)->uc)
#define netdev_uc_empty(dev) netdev_hw_addr_list_empty(&(dev)->uc)
#define netdev_for_each_uc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->uc)
/*
 * Visit only the (dev)->uc entries whose sync_cnt is non-zero.
 *
 * The filter is spelled "if (!cond) {} else" so that the statement supplied
 * by the caller becomes the else branch.  With a bare trailing "if (cond)",
 * an "else" written by the caller after the loop would silently pair with
 * the if hidden inside this macro.
 */
#define netdev_for_each_synced_uc_addr(_ha, _dev) \
        netdev_for_each_uc_addr((_ha), (_dev)) \
                if (!(_ha)->sync_cnt) {} else

/* The same helpers applied to a device's (dev)->mc list. */
#define netdev_mc_count(dev) netdev_hw_addr_list_count(&(dev)->mc)
#define netdev_mc_empty(dev) netdev_hw_addr_list_empty(&(dev)->mc)
#define netdev_for_each_mc_addr(ha, dev) \
        netdev_hw_addr_list_for_each(ha, &(dev)->mc)
/* Filtered variant; see netdev_for_each_synced_uc_addr() for the idiom. */
#define netdev_for_each_synced_mc_addr(_ha, _dev) \
        netdev_for_each_mc_addr((_ha), (_dev)) \
                if (!(_ha)->sync_cnt) {} else

/*
 * Cached hardware (link-layer) header.
 *
 * hh_data is an array of unsigned long large enough for LL_MAX_HEADER bytes
 * rounded up to a multiple of HH_DATA_MOD.
 * NOTE(review): hh_len is presumably the length of the cached header and
 * hh_lock presumably guards updates of it - confirm against the users.
 */
struct hh_cache {
        unsigned int        hh_len;
        seqlock_t        hh_lock;

        /* cached hardware header; allow for machine alignment needs.        */
#define HH_DATA_MOD        16
/*
 * Number of padding bytes that bring __len up to the next multiple of
 * HH_DATA_MOD (0 when it already is one).  The argument is parenthesized so
 * that expressions with low-precedence operators expand correctly.
 */
#define HH_DATA_OFF(__len) \
        (HH_DATA_MOD - ((((__len) - 1) & (HH_DATA_MOD - 1)) + 1))
/* __len rounded up to a multiple of HH_DATA_MOD. */
#define HH_DATA_ALIGN(__len) \
        (((__len)+(HH_DATA_MOD-1))&~(HH_DATA_MOD - 1))
        unsigned long        hh_data[HH_DATA_ALIGN(LL_MAX_HEADER) / sizeof(long)];
};

/*
 * Headroom to reserve for the link-layer header of @dev.
 *
 * hard_header_len plus needed_headroom (plus @extra for the _EXTRA variant)
 * is rounded down to a multiple of HH_DATA_MOD and one further HH_DATA_MOD
 * is added, so the result is HH_DATA_MOD aligned and never smaller than the
 * requested amount.  An alternative would be:
 *   dev->hard_header_len ? (dev->hard_header_len +
 *                           (HH_DATA_MOD - 1)) & ~(HH_DATA_MOD - 1) : 0
 *
 * Other alignment values could be used, but the relationship
 * HH alignment <= LL alignment must be maintained.
 */
#define LL_RESERVED_SPACE_EXTRA(dev,extra) \
        (HH_DATA_MOD + \
         (((dev)->hard_header_len + READ_ONCE((dev)->needed_headroom) + (extra)) \
          & ~(HH_DATA_MOD - 1)))
/* Plain variant: no extra bytes requested. */
#define LL_RESERVED_SPACE(dev) LL_RESERVED_SPACE_EXTRA(dev, 0)

/*
 * Table of link-layer header callbacks.
 *
 * NOTE(review): the per-callback remarks below are inferred from the
 * prototypes alone; the implementations and callers are outside this part
 * of the file, so confirm before relying on them.
 */
struct header_ops {
        /* presumably builds the header for @skb from @type, @daddr, @saddr */
        int        (*create) (struct sk_buff *skb, struct net_device *dev,
                           unsigned short type, const void *daddr,
                           const void *saddr, unsigned int len);
        /* presumably extracts a hardware address from @skb into @haddr */
        int        (*parse)(const struct sk_buff *skb, unsigned char *haddr);
        /* presumably fills the header cache @hh for @neigh */
        int        (*cache)(const struct neighbour *neigh, struct hh_cache *hh, __be16 type);
        /* presumably refreshes @hh after the address @haddr changed */
        void        (*cache_update)(struct hh_cache *hh,
                                const struct net_device *dev,
                                const unsigned char *haddr);
        /* takes a raw header of @len bytes and returns a bool verdict */
        bool        (*validate)(const char *ll_header, unsigned int len);
        /* returns a big-endian 16-bit value derived from @skb */
        __be16        (*parse_protocol)(const struct sk_buff *skb);
};

/*
 * Link state flag bits.  They belong to the generic network queueing layer
 * alone; no other code may reference them explicitly.
 */

enum netdev_state_t {
        __LINK_STATE_START                = 0,
        __LINK_STATE_PRESENT                = 1,
        __LINK_STATE_NOCARRIER                = 2,
        __LINK_STATE_LINKWATCH_PENDING        = 3,
        __LINK_STATE_DORMANT                = 4,
        __LINK_STATE_TESTING                = 5,
};

/* Element type of the gro_node::hash array: a list head plus a counter. */
struct gro_list {
        struct list_head        list;
        /* NOTE(review): presumably the number of entries on @list - confirm */
        int                        count;
};

/*
 * Number of GRO hash buckets (the length of gro_node::hash); must be <= the
 * number of bits in gro_node::bitmask.
 */
#define GRO_HASH_BUCKETS        8

/**
 * struct gro_node - structure to support Generic Receive Offload
 * @bitmask: bitmask to indicate used buckets in @hash
 * @hash: hashtable of pending aggregated skbs, separated by flows; it has
 *        GRO_HASH_BUCKETS buckets
 * @rx_list: list of pending ``GRO_NORMAL`` skbs
 * @rx_count: cached current length of @rx_list
 * @cached_napi_id: napi_struct::napi_id cached for hotpath, 0 for standalone
 *
 * An instance is embedded in struct napi_struct as its ``gro`` member.
 */
struct gro_node {
        unsigned long                bitmask;
        struct gro_list                hash[GRO_HASH_BUCKETS];
        struct list_head        rx_list;
        u32                        rx_count;
        u32                        cached_napi_id;
};

/*
 * Structure for per-NAPI config
 *
 * gro_flush_timeout, irq_suspend_timeout and defer_hard_irqs share their
 * names with members of struct napi_struct, which in turn points at one of
 * these through napi_struct::config.
 * NOTE(review): presumably this is where those settings are kept
 * independently of a napi_struct instance - confirm against the code that
 * reads and writes it.
 */
struct napi_config {
        u64 gro_flush_timeout;
        u64 irq_suspend_timeout;
        u32 defer_hard_irqs;
        cpumask_t affinity_mask;
        unsigned int napi_id;
};

/*
 * Structure for NAPI scheduling similar to tasklet but with weighting
 *
 * The member order defines the layout and groups the fields by how they are
 * protected (see the markers inside), so members must not be reordered
 * casually.
 */
struct napi_struct {
        /* The poll_list must only be managed by the entity which
         * changes the state of the NAPI_STATE_SCHED bit.  This means
         * whoever atomically sets that bit can add this napi_struct
         * to the per-CPU poll_list, and whoever clears that bit
         * can remove from the list right before clearing the bit.
         */
        struct list_head        poll_list;

        /* bit field indexed by the NAPI_STATE_* bit numbers */
        unsigned long                state;
        int                        weight;
        u32                        defer_hard_irqs_count;
        /* poll callback; takes this napi_struct and an int, returns an int */
        int                        (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
        /* CPU actively polling if netpoll is configured */
        int                        poll_owner;
#endif
        /* CPU on which NAPI has been scheduled for processing */
        int                        list_owner;
        struct net_device        *dev;
        struct sk_buff                *skb;
        /* embedded Generic Receive Offload state, see struct gro_node */
        struct gro_node                gro;
        struct hrtimer                timer;
        /* all fields past this point are write-protected by netdev_lock */
        struct task_struct        *thread;
        unsigned long                gro_flush_timeout;
        unsigned long                irq_suspend_timeout;
        u32                        defer_hard_irqs;
        /* control-path-only fields follow */
        u32                        napi_id;
        struct list_head        dev_list;
        struct hlist_node        napi_hash_node;
        int                        irq;
        struct irq_affinity_notify notify;
        int                        napi_rmap_idx;
        int                        index;
        /* per-NAPI configuration, see struct napi_config */
        struct napi_config        *config;
};

/* Bit numbers used with napi_struct::state. */
enum {
        /* Poll is scheduled */
        NAPI_STATE_SCHED                = 0,
        /* reschedule a napi */
        NAPI_STATE_MISSED                = 1,
        /* Disable pending */
        NAPI_STATE_DISABLE                = 2,
        /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_NPSVC                = 3,
        /* NAPI added to system lists */
        NAPI_STATE_LISTED                = 4,
        /* Do not add in napi_hash, no busy polling */
        NAPI_STATE_NO_BUSY_POLL                = 5,
        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_IN_BUSY_POLL                = 6,
        /* prefer busy-polling over softirq processing */
        NAPI_STATE_PREFER_BUSY_POLL        = 7,
        /* The poll is performed inside its own thread */
        NAPI_STATE_THREADED                = 8,
        /* Napi is currently scheduled in threaded mode */
        NAPI_STATE_SCHED_THREADED        = 9,
        /* Napi has an IRQ notifier */
        NAPI_STATE_HAS_NOTIFIER                = 10,
};

/*
 * Mask form of the NAPI_STATE_* bit numbers: every NAPIF_STATE_x is
 * BIT(NAPI_STATE_x).  The masks are meant for working on a whole
 * napi_struct::state word at once, as napi_if_scheduled_mark_missed() does.
 * Keep this list in step with the NAPI_STATE_* enum.
 */
enum {
        NAPIF_STATE_SCHED                = BIT(NAPI_STATE_SCHED),
        NAPIF_STATE_MISSED                = BIT(NAPI_STATE_MISSED),
        NAPIF_STATE_DISABLE                = BIT(NAPI_STATE_DISABLE),
        NAPIF_STATE_NPSVC                = BIT(NAPI_STATE_NPSVC),
        NAPIF_STATE_LISTED                = BIT(NAPI_STATE_LISTED),
        NAPIF_STATE_NO_BUSY_POLL        = BIT(NAPI_STATE_NO_BUSY_POLL),
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL        = BIT(NAPI_STATE_PREFER_BUSY_POLL),
        NAPIF_STATE_THREADED                = BIT(NAPI_STATE_THREADED),
        NAPIF_STATE_SCHED_THREADED        = BIT(NAPI_STATE_SCHED_THREADED),
        NAPIF_STATE_HAS_NOTIFIER        = BIT(NAPI_STATE_HAS_NOTIFIER),
};

/* GRO result codes; gro_result_t is the typedef'd spelling of the enum. */
typedef enum gro_result {
        GRO_MERGED        = 0,
        GRO_MERGED_FREE        = 1,
        GRO_HELD        = 2,
        GRO_NORMAL        = 3,
        GRO_CONSUMED        = 4,
} gro_result_t;

/*
 * enum rx_handler_result - Possible return values for rx_handlers.
 * @RX_HANDLER_CONSUMED: skb was consumed by rx_handler, do not process it
 * further.
 * @RX_HANDLER_ANOTHER: Do another round in receive path. This is indicated in
 * case skb->dev was changed by rx_handler.
 * @RX_HANDLER_EXACT: Force exact delivery, no wildcard.
 * @RX_HANDLER_PASS: Do nothing, pass the skb as if no rx_handler was called.
 *
 * rx_handlers are functions called from inside __netif_receive_skb(), to do
 * special processing of the skb, prior to delivery to protocol handlers.
 *
 * Currently, a net_device can only have a single rx_handler registered. Trying
 * to register a second rx_handler will return -EBUSY.
 *
 * To register a rx_handler on a net_device, use netdev_rx_handler_register().
 * To unregister a rx_handler on a net_device, use
 * netdev_rx_handler_unregister().
 *
 * Upon return, rx_handler is expected to tell __netif_receive_skb() what to
 * do with the skb.
 *
 * If the rx_handler consumed the skb in some way, it should return
 * RX_HANDLER_CONSUMED. This is appropriate when the rx_handler arranged for
 * the skb to be delivered in some other way.
 *
 * If the rx_handler changed skb->dev, to divert the skb to another
 * net_device, it should return RX_HANDLER_ANOTHER. The rx_handler for the
 * new device will be called if it exists.
 *
 * If the rx_handler decides the skb should be ignored, it should return
 * RX_HANDLER_EXACT. The skb will only be delivered to protocol handlers that
 * are registered on exact device (ptype->dev == skb->dev).
 *
 * If the rx_handler didn't change skb->dev, but wants the skb to be normally
 * delivered, it should return RX_HANDLER_PASS.
 *
 * A device without a registered rx_handler will behave as if rx_handler
 * returned RX_HANDLER_PASS.
 */

typedef enum rx_handler_result {
        /* skb was consumed by the rx_handler, do not process it further */
        RX_HANDLER_CONSUMED        = 0,
        /* skb->dev was changed, do another round in the receive path */
        RX_HANDLER_ANOTHER        = 1,
        /* force exact delivery, no wildcard */
        RX_HANDLER_EXACT        = 2,
        /* do nothing, pass the skb as if no rx_handler was called */
        RX_HANDLER_PASS                = 3,
} rx_handler_result_t;

/* Signature of an rx_handler: it is handed a pointer to the skb pointer. */
typedef rx_handler_result_t rx_handler_func_t(struct sk_buff **pskb);

void __napi_schedule(struct napi_struct *n);
void __napi_schedule_irqoff(struct napi_struct *n);

/**
 * napi_disable_pending - test the NAPI_STATE_DISABLE bit
 * @n: NAPI context
 *
 * Return: true if NAPI_STATE_DISABLE is set in @n->state, false otherwise.
 */
static inline bool napi_disable_pending(struct napi_struct *n)
{
        const unsigned long *state = &n->state;

        return test_bit(NAPI_STATE_DISABLE, state);
}

/**
 * napi_prefer_busy_poll - test the NAPI_STATE_PREFER_BUSY_POLL bit
 * @n: NAPI context
 *
 * Return: true if NAPI_STATE_PREFER_BUSY_POLL is set in @n->state, false
 * otherwise.
 */
static inline bool napi_prefer_busy_poll(struct napi_struct *n)
{
        const unsigned long *state = &n->state;

        return test_bit(NAPI_STATE_PREFER_BUSY_POLL, state);
}

/**
 * napi_is_scheduled - test if NAPI is scheduled
 * @n: NAPI context
 *
 * This is a "best-effort" check only. No locking is involved, so the NAPI
 * may get scheduled, or may terminate, right after the bit was sampled and
 * the answer can already be stale.
 *
 * NAPI_STATE_SCHED is internal state: normally napi_schedule() should be
 * used rather than this helper. Keep it for drivers that really need the
 * answer, for example a delayed timer that can be skipped while a NAPI is
 * already scheduled.
 *
 * Return: True if NAPI is scheduled, False otherwise.
 */
static inline bool napi_is_scheduled(struct napi_struct *n)
{
        const unsigned long *state = &n->state;

        return test_bit(NAPI_STATE_SCHED, state);
}

bool napi_schedule_prep(struct napi_struct *n);

/**
 *        napi_schedule - schedule NAPI poll
 *        @n: NAPI context
 *
 * Ask for the NAPI poll routine to be called, unless it is already running.
 * __napi_schedule() is only invoked when napi_schedule_prep() agrees; see
 * napi_schedule_prep() for the reasons why a NAPI might not be scheduled.
 * Return: true if we schedule a NAPI or false if not.
 */
static inline bool napi_schedule(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return false;

        __napi_schedule(n);
        return true;
}

/**
 *        napi_schedule_irqoff - schedule NAPI poll
 *        @n: NAPI context
 *
 * Same as napi_schedule() but for callers that have hard irqs masked, and
 * without a return value: __napi_schedule_irqoff() is called only when
 * napi_schedule_prep() agrees.
 */
static inline void napi_schedule_irqoff(struct napi_struct *n)
{
        if (!napi_schedule_prep(n))
                return;

        __napi_schedule_irqoff(n);
}

/**
 * napi_complete_done - NAPI processing complete
 * @n: NAPI context
 * @work_done: number of packets processed
 *
 * Mark NAPI processing as complete. Should only be called if poll budget
 * has not been completely consumed.
 * Prefer over napi_complete().
 * Return: false if device should avoid rearming interrupts.
 */
bool napi_complete_done(struct napi_struct *n, int work_done);

/*
 * Convenience wrapper: napi_complete_done() with a work_done count of 0.
 * Returns whatever napi_complete_done() returns.
 */
static inline bool napi_complete(struct napi_struct *n)
{
        return napi_complete_done(n, 0);
}

int dev_set_threaded(struct net_device *dev, bool threaded);

void napi_disable(struct napi_struct *n);
void napi_disable_locked(struct napi_struct *n);

void napi_enable(struct napi_struct *n);
void napi_enable_locked(struct napi_struct *n);

/**
 * napi_synchronize - wait until NAPI is not running
 * @n: NAPI context
 *
 * Wait until NAPI is done being scheduled on this context.
 * Waits till any outstanding processing completes but
 * does not disable future activations.
 *
 * With CONFIG_SMP the function sleeps in 1 ms steps for as long as
 * NAPI_STATE_SCHED is set; without it only a compiler barrier is issued.
 */
static inline void napi_synchronize(const struct napi_struct *n)
{
        if (!IS_ENABLED(CONFIG_SMP)) {
                barrier();
                return;
        }

        while (test_bit(NAPI_STATE_SCHED, &n->state))
                msleep(1);
}

/**
 *        napi_if_scheduled_mark_missed - if napi is running, set the
 *        NAPIF_STATE_MISSED
 *        @n: NAPI context
 *
 * If NAPIF_STATE_SCHED is set in @n->state, atomically set
 * NAPIF_STATE_MISSED as well and return true.
 * If NAPIF_STATE_DISABLE is set, return true without modifying the state.
 * Otherwise (neither DISABLE nor SCHED set) return false and leave the
 * state untouched.
 */
static inline bool napi_if_scheduled_mark_missed(struct napi_struct *n)
{
        unsigned long val, new;

        val = READ_ONCE(n->state);
        do {
                /* DISABLE is checked first and wins over the SCHED test. */
                if (val & NAPIF_STATE_DISABLE)
                        return true;

                if (!(val & NAPIF_STATE_SCHED))
                        return false;

                new = val | NAPIF_STATE_MISSED;
                /*
                 * val is passed by address so a failed exchange can retry
                 * the checks above against the value actually found.
                 */
        } while (!try_cmpxchg(&n->state, &val, new));

        return true;
}

/*
 * Bit numbers (0, 1, 2) for the per-queue state flags. The QUEUE_STATE_*
 * masks are built from these with (1 << bit).
 */
enum netdev_queue_state_t {
        __QUEUE_STATE_DRV_XOFF,
        __QUEUE_STATE_STACK_XOFF,
        __QUEUE_STATE_FROZEN,
};

/* Single-bit masks, one per enum netdev_queue_state_t bit number. */
#define QUEUE_STATE_DRV_XOFF        (1 << __QUEUE_STATE_DRV_XOFF)
#define QUEUE_STATE_STACK_XOFF        (1 << __QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_FROZEN        (1 << __QUEUE_STATE_FROZEN)

/* Combined masks for testing several of the bits above at once. */
#define QUEUE_STATE_ANY_XOFF        (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF)
#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \
                                        QUEUE_STATE_FROZEN)
#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \
                                        QUEUE_STATE_FROZEN)

/*
 * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue.  The
 * netif_tx_* functions below are used to manipulate this flag.  The
 * __QUEUE_STATE_STACK_XOFF flag is used by the stack to stop the transmit
 * queue independently.  The netif_xmit_*stopped functions below are called
 * to check if the queue has been stopped by the driver or stack (either
 * of the XOFF bits is set in the state).  Drivers should not need to call
 * the netif_xmit*stopped functions; they should only be using netif_tx_*.
 */

/*
 * Per-queue transmit state. Members are grouped by access pattern
 * (read-mostly, write-mostly, slow/control path) as marked below; the
 * write-mostly group starts on its own cache line on SMP builds via
 * ____cacheline_aligned_in_smp on _xmit_lock.
 */
struct netdev_queue {
/*
 * read-mostly part
 */
        /* Device this queue belongs to, and its reference tracker */
        struct net_device        *dev;
        netdevice_tracker        dev_tracker;

        /* RCU-managed qdisc pointers */
        struct Qdisc __rcu        *qdisc;
        struct Qdisc __rcu        *qdisc_sleeping;
#ifdef CONFIG_SYSFS
        struct kobject                kobj;
        const struct attribute_group        **groups;
#endif
        unsigned long                tx_maxrate;
        /*
         * Number of TX timeouts for this queue
         * (/sys/class/net/DEV/Q/trans_timeout)
         */
        atomic_long_t                trans_timeout;

        /* Subordinate device that the queue has been assigned to */
        struct net_device        *sb_dev;
#ifdef CONFIG_XDP_SOCKETS
        struct xsk_buff_pool    *pool;
#endif

/*
 * write-mostly part
 */
#ifdef CONFIG_BQL
        struct dql                dql;
#endif
        spinlock_t                _xmit_lock ____cacheline_aligned_in_smp;
        /* NOTE(review): presumably the CPU currently holding _xmit_lock - confirm */
        int                        xmit_lock_owner;
        /*
         * Time (in jiffies) of last Tx
         */
        unsigned long                trans_start;

        /* Flag word holding the XOFF bits described in the comment above */
        unsigned long                state;

/*
 * slow- / control-path part
 */
        /* NAPI instance for the queue
         * "ops protected", see comment about net_device::lock
         */
        struct napi_struct        *napi;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        /* Accessed through netdev_queue_numa_node_read()/_write() */
        int                        numa_node;
#endif
} ____cacheline_aligned_in_smp;

extern int sysctl_fb_tunnels_only_for_init_net;
extern int sysctl_devconf_inherit_init_net;

/*
 * Decide whether @net gets fallback tunnel devices, based on
 * sysctl_fb_tunnels_only_for_init_net:
 *   0 - every netns does
 *   1 - only the initial netns does
 *   2 - no netns does
 * Without CONFIG_SYSCTL the answer is always true.
 */
static inline bool net_has_fallback_tunnels(const struct net *net)
{
#if IS_ENABLED(CONFIG_SYSCTL)
        int mode = READ_ONCE(sysctl_fb_tunnels_only_for_init_net);

        if (!mode)
                return true;

        return net_eq(net, &init_net) && mode == 1;
#else
        return true;
#endif
}

/*
 * Current value of sysctl_devconf_inherit_init_net, or 0 when the kernel
 * is built without CONFIG_SYSCTL.
 */
static inline int net_inherit_devconf(void)
{
        int inherit = 0;

#if IS_ENABLED(CONFIG_SYSCTL)
        inherit = READ_ONCE(sysctl_devconf_inherit_init_net);
#endif
        return inherit;
}

/*
 * NUMA node recorded for @q, or NUMA_NO_NODE when the kernel is not built
 * with both CONFIG_XPS and CONFIG_NUMA (the field does not exist then).
 */
static inline int netdev_queue_numa_node_read(const struct netdev_queue *q)
{
        int node = NUMA_NO_NODE;

#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        node = q->numa_node;
#endif
        return node;
}

/*
 * Record @node in q->numa_node. Compiles to a no-op unless both
 * CONFIG_XPS and CONFIG_NUMA are defined.
 */
static inline void netdev_queue_numa_node_write(struct netdev_queue *q, int node)
{
#if defined(CONFIG_XPS) && defined(CONFIG_NUMA)
        q->numa_node = node;
#endif
}

#ifdef CONFIG_RFS_ACCEL
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, u32 flow_id,
                         u16 filter_id);
#endif

/* XPS map type and offset of the xps map within net_device->xps_maps[].
 * XPS_MAPS_MAX is not a map type; it equals the number of types (2).
 */
enum xps_map_type {
        XPS_CPUS = 0,
        XPS_RXQS,
        XPS_MAPS_MAX,
};

#ifdef CONFIG_XPS
/*
 * This structure holds an XPS map which can be of variable length.  The
 * map is an array of queues.
 *
 * queues[] is a flexible array member, so the structure must be allocated
 * with room for the entries (see XPS_MAP_SIZE()).
 * NOTE(review): len/alloc_len look like used vs. allocated entry counts -
 * confirm against the allocator.
 */
struct xps_map {
        unsigned int len;
        unsigned int alloc_len;
        struct rcu_head rcu;
        u16 queues[];
};
/* Bytes needed for a struct xps_map holding _num queue entries. */
#define XPS_MAP_SIZE(_num) (sizeof(struct xps_map) + ((_num) * sizeof(u16)))
/*
 * Number of u16 entries that fit once the offset just past queues[0] is
 * rounded up with L1_CACHE_ALIGN() and the header size is subtracted.
 */
#define XPS_MIN_MAP_ALLOC ((L1_CACHE_ALIGN(offsetof(struct xps_map, queues[1])) \
       - sizeof(struct xps_map)) / sizeof(u16))

/*
 * This structure holds all XPS maps for device.  Maps are indexed by CPU
 * or, for the RXQs variant, by receive queue (see attr_map below).
 *
 * We keep track of the number of cpus/rxqs used when the struct is allocated,
 * in nr_ids. This helps avoid out-of-bounds memory accesses.
 *
 * We keep track of the number of traffic classes used when the struct is
 * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
 * not crossing its upper bound, as the original dev->num_tc can be updated in
 * the meantime.
 */
struct xps_dev_maps {
        struct rcu_head rcu;
        unsigned int nr_ids;
        s16 num_tc;
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
};

/*
 * Bytes needed for a struct xps_dev_maps whose attr_map[] holds one
 * pointer per (CPU id, traffic class) pair: nr_cpu_ids * _tcs entries.
 */
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) +        \
        (nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))

/*
 * Bytes needed for a struct xps_dev_maps whose attr_map[] holds one
 * pointer per (rx queue, traffic class) pair: _rxqs * _tcs entries.
 * Both arguments are parenthesized so that expression arguments
 * (e.g. "a + b") are multiplied as a whole.
 */
#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
        ((_rxqs) * (_tcs) * sizeof(struct xps_map *)))

#endif /* CONFIG_XPS */

/* TC_BITMASK is TC_MAX_QUEUE - 1, i.e. a mask over the low four bits. */
#define TC_MAX_QUEUE        16
#define TC_BITMASK        15
/* HW offloaded queuing disciplines txq count and offset maps */
struct netdev_tc_txq {
        u16 count;
        u16 offset;
};

#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/*
 * This structure is to hold information about the device
 * configured to run FCoE protocol stack.
 *
 * Every member is a fixed-size character buffer (64 or 256 bytes).
 * NOTE(review): whether the strings must be NUL-terminated is not
 * visible here - confirm with the users of ndo_fcoe_get_hbainfo.
 */
struct netdev_fcoe_hbainfo {
        char        manufacturer[64];
        char        serial_number[64];
        char        hardware_version[64];
        char        driver_version[64];
        char        optionrom_version[64];
        char        firmware_version[64];
        char        model[256];
        char        model_description[256];
};
#endif

#define MAX_PHYS_ITEM_ID_LEN 32

/* This structure holds a unique identifier to identify some
 * physical item (port for example) used by a netdevice.
 * Only the first id_len bytes of id are compared by
 * netdev_phys_item_id_same().
 */
struct netdev_phys_item_id {
        unsigned char id[MAX_PHYS_ITEM_ID_LEN];
        unsigned char id_len;
};

/**
 * netdev_phys_item_id_same - compare two physical item identifiers
 * @a: first identifier
 * @b: second identifier
 *
 * Neither identifier is modified, so both are taken as pointers to const;
 * callers holding non-const pointers are unaffected.
 *
 * Return: true if both identifiers have the same id_len and their first
 * id_len bytes of id are equal, false otherwise.
 */
static inline bool netdev_phys_item_id_same(const struct netdev_phys_item_id *a,
                                            const struct netdev_phys_item_id *b)
{
        return a->id_len == b->id_len &&
               memcmp(a->id, b->id, a->id_len) == 0;
}

/*
 * Pointer to a function with the same parameters and return type as
 * net_device_ops::ndo_select_queue (device, skb, subordinate device -> u16).
 */
typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);

/* Kind of hop described by a struct net_device_path (its 'type' member). */
enum net_device_path_type {
        DEV_PATH_ETHERNET = 0,
        DEV_PATH_VLAN,
        DEV_PATH_BRIDGE,
        DEV_PATH_PPPOE,
        DEV_PATH_DSA,
        DEV_PATH_MTK_WDMA,
};

/*
 * One entry of a forwarding path (filled in by ndo_fill_forward_path).
 * The anonymous union carries type-specific data.
 * NOTE(review): which union member goes with which 'type' value is
 * inferred from the member names (bridge, dsa, mtk_wdma, encap) - confirm
 * against the producers of this structure.
 */
struct net_device_path {
        enum net_device_path_type        type;
        const struct net_device                *dev;
        union {
                /* Encapsulation data: id, protocol, destination MAC */
                struct {
                        u16                id;
                        __be16                proto;
                        u8                h_dest[ETH_ALEN];
                } encap;
                /* Bridge data: VLAN handling mode plus VLAN id/protocol */
                struct {
                        enum {
                                DEV_PATH_BR_VLAN_KEEP,
                                DEV_PATH_BR_VLAN_TAG,
                                DEV_PATH_BR_VLAN_UNTAG,
                                DEV_PATH_BR_VLAN_UNTAG_HW,
                        }                vlan_mode;
                        u16                vlan_id;
                        __be16                vlan_proto;
                } bridge;
                /* DSA data: port and protocol */
                struct {
                        int port;
                        u16 proto;
                } dsa;
                /* MTK WDMA data */
                struct {
                        u8 wdma_idx;
                        u8 queue;
                        u16 wcid;
                        u8 bss;
                        u8 amsdu;
                } mtk_wdma;
        };
};

/* Capacity of net_device_path_stack::path[] */
#define NET_DEVICE_PATH_STACK_MAX        5
/* Capacity of net_device_path_ctx::vlan[] */
#define NET_DEVICE_PATH_VLAN_MAX        2

/*
 * Fixed-capacity array of path entries.
 * NOTE(review): num_paths is presumably the number of valid entries in
 * path[] - confirm.
 */
struct net_device_path_stack {
        int                        num_paths;
        struct net_device_path        path[NET_DEVICE_PATH_STACK_MAX];
};

/*
 * Context handed to ndo_fill_forward_path: a device, a destination
 * hardware address, and up to NET_DEVICE_PATH_VLAN_MAX VLAN (id, proto)
 * pairs.
 * NOTE(review): num_vlans is presumably the count of valid vlan[] entries
 * - confirm.
 */
struct net_device_path_ctx {
        const struct net_device *dev;
        u8                        daddr[ETH_ALEN];

        int                        num_vlans;
        struct {
                u16                id;
                __be16                proto;
        } vlan[NET_DEVICE_PATH_VLAN_MAX];
};

/*
 * Values for the 'type' argument of net_device_ops::ndo_setup_tc, which
 * is called to set up a 'tc' scheduler, classifier or action on a device.
 * Enumerators are implicitly numbered from 0 in declaration order.
 */
enum tc_setup_type {
        TC_QUERY_CAPS,
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
        TC_SETUP_CLSFLOWER,
        TC_SETUP_CLSMATCHALL,
        TC_SETUP_CLSBPF,
        TC_SETUP_BLOCK,
        TC_SETUP_QDISC_CBS,
        TC_SETUP_QDISC_RED,
        TC_SETUP_QDISC_PRIO,
        TC_SETUP_QDISC_MQ,
        TC_SETUP_QDISC_ETF,
        TC_SETUP_ROOT_QDISC,
        TC_SETUP_QDISC_GRED,
        TC_SETUP_QDISC_TAPRIO,
        TC_SETUP_FT,
        TC_SETUP_QDISC_ETS,
        TC_SETUP_QDISC_TBF,
        TC_SETUP_QDISC_FIFO,
        TC_SETUP_QDISC_HTB,
        TC_SETUP_ACT,
};

/* These structures hold the attributes of bpf state that are being passed
 * to the netdevice through the bpf op.
 *
 * enum bpf_netdev_command selects the operation and is stored in
 * netdev_bpf::command.
 */
enum bpf_netdev_command {
        /* Set or clear a bpf program used in the earliest stages of packet
         * rx. The prog will have been loaded as BPF_PROG_TYPE_XDP. The callee
         * is responsible for calling bpf_prog_put on any old progs that are
         * stored. In case of error, the callee need not release the new prog
         * reference, but on success it takes ownership and must bpf_prog_put
         * when it is no longer used.
         */
        XDP_SETUP_PROG,
        XDP_SETUP_PROG_HW,
        /* BPF program for offload callbacks, invoked at program load time. */
        /* NOTE(review): the comment above does not obviously describe the two
         * map commands that follow; it may be stale - confirm.
         */
        BPF_OFFLOAD_MAP_ALLOC,
        BPF_OFFLOAD_MAP_FREE,
        XDP_SETUP_XSK_POOL,
};

struct bpf_prog_offload_ops;
struct netlink_ext_ack;
struct xdp_umem;
struct xdp_dev_bulk_queue;
struct bpf_xdp_link;

/*
 * XDP attachment modes. __MAX_XDP_MODE is not a mode; it equals the
 * number of modes (3).
 */
enum bpf_xdp_mode {
        XDP_MODE_SKB = 0,
        XDP_MODE_DRV = 1,
        XDP_MODE_HW = 2,
        __MAX_XDP_MODE
};

/* Pairs a BPF program pointer with a bpf_xdp_link pointer. */
struct bpf_xdp_entity {
        struct bpf_prog *prog;
        struct bpf_xdp_link *link;
};

/*
 * Argument block for net_device_ops::ndo_bpf. 'command' selects the
 * operation; the union member to use for each command is named in the
 * per-member comments below.
 */
struct netdev_bpf {
        enum bpf_netdev_command command;
        union {
                /* XDP_SETUP_PROG */
                struct {
                        u32 flags;
                        struct bpf_prog *prog;
                        struct netlink_ext_ack *extack;
                };
                /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */
                struct {
                        struct bpf_offloaded_map *offmap;
                };
                /* XDP_SETUP_XSK_POOL */
                struct {
                        struct xsk_buff_pool *pool;
                        u16 queue_id;
                } xsk;
        };
};

/* Flags for ndo_xsk_wakeup. Distinct bits, so they may be OR-ed together. */
#define XDP_WAKEUP_RX (1 << 0)
#define XDP_WAKEUP_TX (1 << 1)

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Callback table for xfrm offload, available only with
 * CONFIG_XFRM_OFFLOAD. The callbacks fall into two groups by argument
 * type: xdo_dev_state_* (and xdo_dev_offload_ok) operate on a
 * struct xfrm_state, xdo_dev_policy_* on a struct xfrm_policy.
 */
struct xfrmdev_ops {
        int        (*xdo_dev_state_add) (struct xfrm_state *x, struct netlink_ext_ack *extack);
        void        (*xdo_dev_state_delete) (struct xfrm_state *x);
        void        (*xdo_dev_state_free) (struct xfrm_state *x);
        bool        (*xdo_dev_offload_ok) (struct sk_buff *skb,
                                       struct xfrm_state *x);
        void        (*xdo_dev_state_advance_esn) (struct xfrm_state *x);
        void        (*xdo_dev_state_update_stats) (struct xfrm_state *x);
        int        (*xdo_dev_policy_add) (struct xfrm_policy *x, struct netlink_ext_ack *extack);
        void        (*xdo_dev_policy_delete) (struct xfrm_policy *x);
        void        (*xdo_dev_policy_free) (struct xfrm_policy *x);
};
#endif

/*
 * Interface alias text stored in a flexible array member, preceded by an
 * rcu_head.
 * NOTE(review): presumably freed via RCU and NUL-terminated - confirm.
 */
struct dev_ifalias {
        struct rcu_head rcuhead;
        char ifalias[];
};

struct devlink;
struct tlsdev_ops;

/* List node that references a notifier_block. */
struct netdev_net_notifier {
        struct list_head list;
        struct notifier_block *nb;
};

/*
 * This structure defines the management hooks for network devices.
 * The following hooks can be defined; unless noted otherwise, they are
 * optional and can be filled with a null pointer.
 *
 * int (*ndo_init)(struct net_device *dev);
 *     This function is called once when a network device is registered.
 *     The network device can use this for any late stage initialization
 *     or semantic validation. It can fail with an error code which will
 *     be propagated back to register_netdev.
 *
 * void (*ndo_uninit)(struct net_device *dev);
 *     This function is called when device is unregistered or when registration
 *     fails. It is not called if init fails.
 *
 * int (*ndo_open)(struct net_device *dev);
 *     This function is called when a network device transitions to the up
 *     state.
 *
 * int (*ndo_stop)(struct net_device *dev);
 *     This function is called when a network device transitions to the down
 *     state.
 *
 * netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
 *                               struct net_device *dev);
 *        Called when a packet needs to be transmitted.
 *        Returns NETDEV_TX_OK.  Can return NETDEV_TX_BUSY, but you should stop
 *        the queue before that can happen; it's for obsolete devices and weird
 *        corner cases, but the stack really does a non-trivial amount
 *        of useless work if you return NETDEV_TX_BUSY.
 *        Required; cannot be NULL.
 *
 * netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
 *                                           struct net_device *dev,
 *                                           netdev_features_t features);
 *        Called by core transmit path to determine if device is capable of
 *        performing offload operations on a given packet. This is to give
 *        the device an opportunity to implement any restrictions that cannot
 *        be otherwise expressed by feature flags. The check is called with
 *        the set of features that the stack has calculated and it returns
 *        those the driver believes to be appropriate.
 *
 * u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
 *                         struct net_device *sb_dev);
 *        Called to decide which queue to use when device supports multiple
 *        transmit queues.
 *
 * void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
 *        This function is called to allow device receiver to make
 *        changes to configuration when multicast or promiscuous is enabled.
 *
 * void (*ndo_set_rx_mode)(struct net_device *dev);
 *        This function is called when the device changes its address list
 *        filtering. If the driver handles unicast address filtering, it
 *        should set IFF_UNICAST_FLT in its priv_flags.
 *
 * int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
 *        This function is called when the Media Access Control address
 *        needs to be changed. If this interface is not defined, the
 *        MAC address cannot be changed.
 *
 * int (*ndo_validate_addr)(struct net_device *dev);
 *        Test if Media Access Control address is valid for the device.
 *
 * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Old-style ioctl entry point. This is used internally by the
 *        ieee802154 subsystem but is no longer called by the device
 *        ioctl handler.
 *
 * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Used by the bonding driver for its device specific ioctls:
 *        SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
 *        SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
 *
 * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 *        Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
 *        SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
 *
 * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
 *        Used to set network devices bus interface parameters. This interface
 *        is retained for legacy reasons; new devices should use the bus
 *        interface (PCI) for low level management.
 *
 * int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
 *        Called when a user wants to change the Maximum Transfer Unit
 *        of a device.
 *
 * void (*ndo_tx_timeout)(struct net_device *dev, unsigned int txqueue);
 *        Callback used when the transmitter has not made any progress
 *        for dev->watchdog ticks.
 *
 * void (*ndo_get_stats64)(struct net_device *dev,
 *                         struct rtnl_link_stats64 *storage);
 * struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
 *        Called when a user wants to get the network device usage
 *        statistics. Drivers must do one of the following:
 *        1. Define @ndo_get_stats64 to fill in a zero-initialised
 *           rtnl_link_stats64 structure passed by the caller.
 *        2. Define @ndo_get_stats to update a net_device_stats structure
 *           (which should normally be dev->stats) and return a pointer to
 *           it. The structure may be changed asynchronously only if each
 *           field is written atomically.
 *        3. Update dev->stats asynchronously and atomically, and define
 *           neither operation.
 *
 * bool (*ndo_has_offload_stats)(const struct net_device *dev, int attr_id)
 *        Return true if this device supports offload stats of this attr_id.
 *
 * int (*ndo_get_offload_stats)(int attr_id, const struct net_device *dev,
 *        void *attr_data)
 *        Get statistics for offload operations by attr_id. Write it into the
 *        attr_data pointer.
 *
 * int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is registered.
 *
 * int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
 *        If device supports VLAN filtering this function is called when a
 *        VLAN id is unregistered.
 *
 * void (*ndo_poll_controller)(struct net_device *dev);
 *
 *        SR-IOV management functions.
 * int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
 * int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan,
 *                          u8 qos, __be16 proto);
 * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
 *                          int max_tx_rate);
 * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_config)(struct net_device *dev,
 *                            int vf, struct ifla_vf_info *ivf);
 * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
 * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
 *                          struct nlattr *port[]);
 *
 *      Enable or disable the VF ability to query its RSS Redirection Table and
 *      Hash Key. This is needed since on some devices VF share this information
 *      with PF and querying it may introduce a theoretical security risk.
 * int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
 * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
 * int (*ndo_setup_tc)(struct net_device *dev, enum tc_setup_type type,
 *                       void *type_data);
 *        Called to setup any 'tc' scheduler, classifier or action on @dev.
 *        This is always called from the stack with the rtnl lock held and netif
 *        tx queues stopped. This allows the netdevice to perform queue
 *        management safely.
 *
 *        Fiber Channel over Ethernet (FCoE) offload functions.
 * int (*ndo_fcoe_enable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to start using LLD for FCoE
 *        so the underlying device can perform whatever needed configuration or
 *        initialization to support acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_disable)(struct net_device *dev);
 *        Called when the FCoE protocol stack wants to stop using LLD for FCoE
 *        so the underlying device can perform whatever needed clean-ups to
 *        stop supporting acceleration of FCoE traffic.
 *
 * int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
 *                             struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Initiator wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_ddp_done)(struct net_device *dev,  u16 xid);
 *        Called when the FCoE Initiator/Target is done with the DDPed I/O as
 *        indicated by the FC exchange id 'xid', so the underlying device can
 *        clean up and reuse resources for later DDP requests.
 *
 * int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
 *                              struct scatterlist *sgl, unsigned int sgc);
 *        Called when the FCoE Target wants to initialize an I/O that
 *        is a possible candidate for Direct Data Placement (DDP). The LLD can
 *        perform necessary setup and returns 1 to indicate the device is set up
 *        successfully to perform DDP on this I/O, otherwise this returns 0.
 *
 * int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
 *                               struct netdev_fcoe_hbainfo *hbainfo);
 *        Called when the FCoE Protocol stack wants information on the underlying
 *        device. This information is utilized by the FCoE protocol stack to
 *        register attributes with Fiber Channel management service as per the
 *        FC-GS Fabric Device Management Information(FDMI) specification.
 *
 * int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
 *        Called when the underlying device wants to override default World Wide
 *        Name (WWN) generation mechanism in FCoE protocol stack to pass its own
 *        World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
 *        protocol stack to use.
 *
 *        RFS acceleration.
 * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
 *                            u16 rxq_index, u32 flow_id);
 *        Set hardware filter for RFS.  rxq_index is the target queue index;
 *        flow_id is a flow ID to be passed to rps_may_expire_flow() later.
 *        Return the filter ID on success, or a negative error code.
 *
 *        Slave management functions (for bridge, bonding, etc).
 * int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to make another netdev an underling.
 *
 * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
 *        Called to release previously enslaved netdev.
 *
 * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
 *                                            struct sk_buff *skb,
 *                                            bool all_slaves);
 *        Get the xmit slave of master device. If all_slaves is true, the
 *        function assumes all the slaves can transmit.
 *
 *      Feature/offload setting functions.
 * netdev_features_t (*ndo_fix_features)(struct net_device *dev,
 *                netdev_features_t features);
 *        Adjusts the requested feature flags according to device-specific
 *        constraints, and returns the resulting flags. Must not modify
 *        the device state.
 *
 * int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
 *        Called to update device configuration to new features. Passed
 *        feature set might be less than what was returned by ndo_fix_features().
 *        Must return >0 or -errno if it changed dev->features itself.
 *
 * int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid, u16 flags,
 *                      bool *notified, struct netlink_ext_ack *extack);
 *        Adds an FDB entry to dev for addr.
 *        Callee shall set *notified to true if it sent any appropriate
 *        notification(s). Otherwise core will send a generic one.
 * int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
 *                      struct net_device *dev,
 *                      const unsigned char *addr, u16 vid,
 *                      bool *notified, struct netlink_ext_ack *extack);
 *        Deletes the FDB entry from dev corresponding to addr.
 *        Callee shall set *notified to true if it sent any appropriate
 *        notification(s). Otherwise core will send a generic one.
 * int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh, struct net_device *dev,
 *                           struct netlink_ext_ack *extack);
 * int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
 *                       struct net_device *dev, struct net_device *filter_dev,
 *                       int *idx)
 *        Used to add FDB entries to dump requests. Implementers should add
 *        entries to skb and update idx with the number of entries.
 *
 * int (*ndo_mdb_add)(struct net_device *dev, struct nlattr *tb[],
 *                      u16 nlmsg_flags, struct netlink_ext_ack *extack);
 *        Adds an MDB entry to dev.
 * int (*ndo_mdb_del)(struct net_device *dev, struct nlattr *tb[],
 *                      struct netlink_ext_ack *extack);
 *        Deletes the MDB entry from dev.
 * int (*ndo_mdb_del_bulk)(struct net_device *dev, struct nlattr *tb[],
 *                           struct netlink_ext_ack *extack);
 *        Bulk deletes MDB entries from dev.
 * int (*ndo_mdb_dump)(struct net_device *dev, struct sk_buff *skb,
 *                       struct netlink_callback *cb);
 *        Dumps MDB entries from dev. The first argument (marker) in the netlink
 *        callback is used by core rtnetlink code.
 *
 * int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags, struct netlink_ext_ack *extack)
 * int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
 *                             struct net_device *dev, u32 filter_mask,
 *                             int nlflags)
 * int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
 *                             u16 flags);
 *
 * int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
 *        Called to change device carrier. Soft-devices (like dummy, team, etc)
 *        which do not represent real hardware may define this to allow their
 *        userspace components to manage their virtual carrier state. Devices
 *        that determine carrier state from physical hardware properties (eg
 *        network cables) or protocol-dependent mechanisms (eg
 *        USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
 *
 * int (*ndo_get_phys_port_id)(struct net_device *dev,
 *                               struct netdev_phys_item_id *ppid);
 *        Called to get ID of physical port of this device. If driver does
 *        not implement this, it is assumed that the hw is not able to have
 *        multiple net devices on single physical port.
 *
 * int (*ndo_get_port_parent_id)(struct net_device *dev,
 *                                 struct netdev_phys_item_id *ppid)
 *        Called to get the parent ID of the physical port of this device.
 *
 * void* (*ndo_dfwd_add_station)(struct net_device *pdev,
 *                                 struct net_device *dev)
 *        Called by upper layer devices to accelerate switching or other
 *        station functionality into hardware. 'pdev' is the lowerdev
 *        to use for the offload and 'dev' is the net device that will
 *        back the offload. Returns a pointer to the private structure
 *        the upper layer will maintain.
 * void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
 *        Called by upper layer device to delete the station created
 *        by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
 *        the station and priv is the structure returned by the add
 *        operation.
 * int (*ndo_set_tx_maxrate)(struct net_device *dev,
 *                             int queue_index, u32 maxrate);
 *        Called when a user wants to set a max-rate limitation of specific
 *        TX queue.
 * int (*ndo_get_iflink)(const struct net_device *dev);
 *        Called to get the iflink value of this device.
 * int (*ndo_fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb);
 *        This function is used to get egress tunnel information for given skb.
 *        This is useful for retrieving outer tunnel header parameters while
 *        sampling packet.
 * void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
 *        This function is used to specify the headroom that the skb must
 *        consider when allocating an skb during packet reception. Setting
 *        appropriate rx headroom value allows avoiding skb head copy on
 *        forward. Setting a negative value resets the rx headroom to the
 *        default value.
 * int (*ndo_bpf)(struct net_device *dev, struct netdev_bpf *bpf);
 *        This function is used to set or query state related to XDP on the
 *        netdevice and manage BPF offload. See definition of
 *        enum bpf_netdev_command for details.
 * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp,
 *                        u32 flags);
 *        This function is used to submit @n XDP packets for transmit on a
 *        netdevice. Returns number of frames successfully transmitted, frames
 *        that got dropped are freed/returned via xdp_return_frame().
 *        A negative return value means a general error invoking the ndo:
 *        no frames were xmit'ed and the core caller will free all frames.
 * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
 *                                                struct xdp_buff *xdp);
 *      Get the xmit slave of master device based on the xdp_buff.
 * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
 *      This function is used to wake up the softirq, ksoftirqd or kthread
 *        responsible for sending and/or receiving packets on a specific
 *        queue id bound to an AF_XDP socket. The flags field specifies if
 *        only RX, only Tx, or both should be woken up using the flags
 *        XDP_WAKEUP_RX and XDP_WAKEUP_TX.
 * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm_kern *p,
 *                         int cmd);
 *        Add, change, delete or get information on an IPv4 tunnel.
 * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
 *        If a device is paired with a peer device, return the peer instance.
 *        The caller must be under RCU read context.
 * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
 *     Get the forwarding path to reach the real device from the HW destination address
 * ktime_t (*ndo_get_tstamp)(struct net_device *dev,
 *                             const struct skb_shared_hwtstamps *hwtstamps,
 *                             bool cycles);
 *        Get hardware timestamp based on normal/adjustable time or free running
 *        cycle counter. This function is required if physical clock supports a
 *        free running cycle counter.
 *
 * int (*ndo_hwtstamp_get)(struct net_device *dev,
 *                           struct kernel_hwtstamp_config *kernel_config);
 *        Get the currently configured hardware timestamping parameters for the
 *        NIC device.
 *
 * int (*ndo_hwtstamp_set)(struct net_device *dev,
 *                           struct kernel_hwtstamp_config *kernel_config,
 *                           struct netlink_ext_ack *extack);
 *        Change the hardware timestamping parameters for NIC device.
 */
struct net_device_ops {
        /*
         * Table of ndo_* callback pointers. The per-callback contracts are
         * described in the comment block that precedes this structure; the
         * short notes below only group related members.
         */

        /* ndo_init .. ndo_tx_timeout */
        int (*ndo_init)(struct net_device *dev);
        void (*ndo_uninit)(struct net_device *dev);
        int (*ndo_open)(struct net_device *dev);
        int (*ndo_stop)(struct net_device *dev);
        netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
                        struct net_device *dev);
        netdev_features_t (*ndo_features_check)(struct sk_buff *skb,
                        struct net_device *dev,
                        netdev_features_t features);
        u16 (*ndo_select_queue)(struct net_device *dev,
                        struct sk_buff *skb,
                        struct net_device *sb_dev);
        void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
        void (*ndo_set_rx_mode)(struct net_device *dev);
        int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
        int (*ndo_validate_addr)(struct net_device *dev);
        int (*ndo_do_ioctl)(struct net_device *dev,
                        struct ifreq *ifr, int cmd);
        int (*ndo_eth_ioctl)(struct net_device *dev,
                        struct ifreq *ifr, int cmd);
        int (*ndo_siocbond)(struct net_device *dev,
                        struct ifreq *ifr, int cmd);
        int (*ndo_siocwandev)(struct net_device *dev,
                        struct if_settings *ifs);
        int (*ndo_siocdevprivate)(struct net_device *dev,
                        struct ifreq *ifr,
                        void __user *data, int cmd);
        int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
        int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
        int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *);
        void (*ndo_tx_timeout)(struct net_device *dev,
                        unsigned int txqueue);

        /* Statistics retrieval callbacks */
        void (*ndo_get_stats64)(struct net_device *dev,
                        struct rtnl_link_stats64 *storage);
        bool (*ndo_has_offload_stats)(const struct net_device *dev,
                        int attr_id);
        int (*ndo_get_offload_stats)(int attr_id,
                        const struct net_device *dev,
                        void *attr_data);
        struct net_device_stats *(*ndo_get_stats)(struct net_device *dev);

        /* ndo_vlan_rx_* callbacks */
        int (*ndo_vlan_rx_add_vid)(struct net_device *dev,
                        __be16 proto, u16 vid);
        int (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
                        __be16 proto, u16 vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
        /* Only present when CONFIG_NET_POLL_CONTROLLER is defined */
        void (*ndo_poll_controller)(struct net_device *dev);
        int (*ndo_netpoll_setup)(struct net_device *dev);
        void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
        /* ndo_{set,get}_vf_* callbacks */
        int (*ndo_set_vf_mac)(struct net_device *dev, int queue, u8 *mac);
        int (*ndo_set_vf_vlan)(struct net_device *dev,
                        int queue, u16 vlan,
                        u8 qos, __be16 proto);
        int (*ndo_set_vf_rate)(struct net_device *dev,
                        int vf, int min_tx_rate,
                        int max_tx_rate);
        int (*ndo_set_vf_spoofchk)(struct net_device *dev,
                        int vf, bool setting);
        int (*ndo_set_vf_trust)(struct net_device *dev,
                        int vf, bool setting);
        int (*ndo_get_vf_config)(struct net_device *dev,
                        int vf, struct ifla_vf_info *ivf);
        int (*ndo_set_vf_link_state)(struct net_device *dev,
                        int vf, int link_state);
        int (*ndo_get_vf_stats)(struct net_device *dev,
                        int vf, struct ifla_vf_stats *vf_stats);
        int (*ndo_set_vf_port)(struct net_device *dev,
                        int vf, struct nlattr *port[]);
        int (*ndo_get_vf_port)(struct net_device *dev,
                        int vf, struct sk_buff *skb);
        int (*ndo_get_vf_guid)(struct net_device *dev,
                        int vf,
                        struct ifla_vf_guid *node_guid,
                        struct ifla_vf_guid *port_guid);
        int (*ndo_set_vf_guid)(struct net_device *dev,
                        int vf, u64 guid, int guid_type);
        int (*ndo_set_vf_rss_query_en)(struct net_device *dev,
                        int vf, bool setting);
        int (*ndo_setup_tc)(struct net_device *dev,
                        enum tc_setup_type type,
                        void *type_data);
#if IS_ENABLED(CONFIG_FCOE)
        /* Only present when CONFIG_FCOE is enabled */
        int (*ndo_fcoe_enable)(struct net_device *dev);
        int (*ndo_fcoe_disable)(struct net_device *dev);
        int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
                        u16 xid,
                        struct scatterlist *sgl,
                        unsigned int sgc);
        int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid);
        int (*ndo_fcoe_ddp_target)(struct net_device *dev,
                        u16 xid,
                        struct scatterlist *sgl,
                        unsigned int sgc);
        int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
                        struct netdev_fcoe_hbainfo *hbainfo);
#endif

#if IS_ENABLED(CONFIG_LIBFCOE)
        /* Constants defined next to ndo_fcoe_get_wwn(), which takes a 'type' */
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
        int (*ndo_fcoe_get_wwn)(struct net_device *dev,
                        u64 *wwn, int type);
#endif

#ifdef CONFIG_RFS_ACCEL
        /* Only present when CONFIG_RFS_ACCEL is defined */
        int (*ndo_rx_flow_steer)(struct net_device *dev,
                        const struct sk_buff *skb,
                        u16 rxq_index,
                        u32 flow_id);
#endif
        /* Slave / lower-device handling, features and neighbour hooks */
        int (*ndo_add_slave)(struct net_device *dev,
                        struct net_device *slave_dev,
                        struct netlink_ext_ack *extack);
        int (*ndo_del_slave)(struct net_device *dev,
                        struct net_device *slave_dev);
        struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev,
                        struct sk_buff *skb,
                        bool all_slaves);
        struct net_device *(*ndo_sk_get_lower_dev)(struct net_device *dev,
                        struct sock *sk);
        netdev_features_t (*ndo_fix_features)(struct net_device *dev,
                        netdev_features_t features);
        int (*ndo_set_features)(struct net_device *dev,
                        netdev_features_t features);
        int (*ndo_neigh_construct)(struct net_device *dev,
                        struct neighbour *n);
        void (*ndo_neigh_destroy)(struct net_device *dev,
                        struct neighbour *n);

        /* ndo_fdb_* callbacks */
        int (*ndo_fdb_add)(struct ndmsg *ndm,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid,
                        u16 flags,
                        bool *notified,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_del)(struct ndmsg *ndm,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid,
                        bool *notified,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_del_bulk)(struct nlmsghdr *nlh,
                        struct net_device *dev,
                        struct netlink_ext_ack *extack);
        int (*ndo_fdb_dump)(struct sk_buff *skb,
                        struct netlink_callback *cb,
                        struct net_device *dev,
                        struct net_device *filter_dev,
                        int *idx);
        int (*ndo_fdb_get)(struct sk_buff *skb,
                        struct nlattr *tb[],
                        struct net_device *dev,
                        const unsigned char *addr,
                        u16 vid, u32 portid, u32 seq,
                        struct netlink_ext_ack *extack);

        /* ndo_mdb_* callbacks */
        int (*ndo_mdb_add)(struct net_device *dev,
                        struct nlattr *tb[],
                        u16 nlmsg_flags,
                        struct netlink_ext_ack *extack);
        int (*ndo_mdb_del)(struct net_device *dev,
                        struct nlattr *tb[],
                        struct netlink_ext_ack *extack);
        int (*ndo_mdb_del_bulk)(struct net_device *dev,
                        struct nlattr *tb[],
                        struct netlink_ext_ack *extack);
        int (*ndo_mdb_dump)(struct net_device *dev,
                        struct sk_buff *skb,
                        struct netlink_callback *cb);
        int (*ndo_mdb_get)(struct net_device *dev,
                        struct nlattr *tb[], u32 portid,
                        u32 seq,
                        struct netlink_ext_ack *extack);

        /* ndo_bridge_* callbacks */
        int (*ndo_bridge_setlink)(struct net_device *dev,
                        struct nlmsghdr *nlh,
                        u16 flags,
                        struct netlink_ext_ack *extack);
        int (*ndo_bridge_getlink)(struct sk_buff *skb,
                        u32 pid, u32 seq,
                        struct net_device *dev,
                        u32 filter_mask,
                        int nlflags);
        int (*ndo_bridge_dellink)(struct net_device *dev,
                        struct nlmsghdr *nlh,
                        u16 flags);

        /* Carrier, physical port identification and dfwd stations */
        int (*ndo_change_carrier)(struct net_device *dev,
                        bool new_carrier);
        int (*ndo_get_phys_port_id)(struct net_device *dev,
                        struct netdev_phys_item_id *ppid);
        int (*ndo_get_port_parent_id)(struct net_device *dev,
                        struct netdev_phys_item_id *ppid);
        int (*ndo_get_phys_port_name)(struct net_device *dev,
                        char *name, size_t len);
        void *(*ndo_dfwd_add_station)(struct net_device *pdev,
                        struct net_device *dev);
        void (*ndo_dfwd_del_station)(struct net_device *pdev,
                        void *priv);

        int (*ndo_set_tx_maxrate)(struct net_device *dev,
                        int queue_index,
                        u32 maxrate);
        int (*ndo_get_iflink)(const struct net_device *dev);
        int (*ndo_fill_metadata_dst)(struct net_device *dev,
                        struct sk_buff *skb);
        void (*ndo_set_rx_headroom)(struct net_device *dev,
                        int needed_headroom);

        /* BPF / XDP / AF_XDP callbacks */
        int (*ndo_bpf)(struct net_device *dev,
                        struct netdev_bpf *bpf);
        int (*ndo_xdp_xmit)(struct net_device *dev, int n,
                        struct xdp_frame **xdp,
                        u32 flags);
        /* Get the xmit slave of the master device based on the xdp_buff */
        struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
                        struct xdp_buff *xdp);
        /*
         * Wake up RX and/or TX processing for a queue id bound to an AF_XDP
         * socket; flags are XDP_WAKEUP_RX and XDP_WAKEUP_TX.
         */
        int (*ndo_xsk_wakeup)(struct net_device *dev,
                        u32 queue_id, u32 flags);

        /* Add, change, delete or get information on an IPv4 tunnel */
        int (*ndo_tunnel_ctl)(struct net_device *dev,
                        struct ip_tunnel_parm_kern *p,
                        int cmd);
        /* Return the paired peer device; caller must be under RCU read context */
        struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
        /* Get the forwarding path to the real device from the HW destination address */
        int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
                        struct net_device_path *path);

        /*
         * Hardware timestamping: read a timestamp (normal/adjustable time or
         * free running cycle counter), and get/set the timestamping
         * configuration of the NIC.
         */
        ktime_t (*ndo_get_tstamp)(struct net_device *dev,
                        const struct skb_shared_hwtstamps *hwtstamps,
                        bool cycles);
        int (*ndo_hwtstamp_get)(struct net_device *dev,
                        struct kernel_hwtstamp_config *kernel_config);
        int (*ndo_hwtstamp_set)(struct net_device *dev,
                        struct kernel_hwtstamp_config *kernel_config,
                        struct netlink_ext_ack *extack);

#if IS_ENABLED(CONFIG_NET_SHAPER)
        /**
         * @net_shaper_ops: Device shaping offload operations;
         * see include/net/net_shapers.h
         */
        const struct net_shaper_ops *net_shaper_ops;
#endif
};

/**
 * enum netdev_priv_flags - &struct net_device priv_flags
 *
 * These are the &struct net_device private flags. They are only set
 * internally by drivers and used in the kernel. These flags are invisible
 * to userspace; this means that the order of these flags can change
 * during any kernel release.
 *
 * You should add bitfield booleans after either net_device::priv_flags
 * (hotpath) or ::threaded (slowpath) instead of extending these flags.
 *
 * @IFF_802_1Q_VLAN: 802.1Q VLAN device
 * @IFF_EBRIDGE: Ethernet bridging device
 * @IFF_BONDING: bonding master or slave
 * @IFF_ISATAP: ISATAP interface (RFC4214)
 * @IFF_WAN_HDLC: WAN HDLC device
 * @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
 *        release skb->dst
 * @IFF_DONT_BRIDGE: disallow bridging this ether dev
 * @IFF_DISABLE_NETPOLL: disable netpoll at run-time
 * @IFF_MACVLAN_PORT: device used as macvlan port
 * @IFF_BRIDGE_PORT: device used as bridge port
 * @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
 * @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
 * @IFF_UNICAST_FLT: Supports unicast filtering
 * @IFF_TEAM_PORT: device used as team port
 * @IFF_SUPP_NOFCS: device supports sending custom FCS
 * @IFF_LIVE_ADDR_CHANGE: device supports hardware address
 *        change when it's running
 * @IFF_MACVLAN: Macvlan device
 * @IFF_XMIT_DST_RELEASE_PERM: IFF_XMIT_DST_RELEASE not taking into account
 *        underlying stacked devices
 * @IFF_L3MDEV_MASTER: device is an L3 master device
 * @IFF_NO_QUEUE: device can run without qdisc attached
 * @IFF_OPENVSWITCH: device is an Open vSwitch master
 * @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
 * @IFF_TEAM: device is a team device
 * @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
 * @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
 *        entity (i.e. the master device for bridged veth)
 * @IFF_MACSEC: device is a MACsec device
 * @IFF_NO_RX_HANDLER: device doesn't support the rx_handler hook
 * @IFF_FAILOVER: device is a failover master device
 * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
 * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
 * @IFF_NO_ADDRCONF: prevent ipv6 addrconf
 * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
 *        skb_headlen(skb) == 0 (data starts from frag0)
 */
enum netdev_priv_flags {
        /*
         * One bit per flag, bits 0..31. Bits 30 and 31 are spelled with
         * BIT_ULL() rather than a plain int shift.
         * NOTE(review): presumably because 1 << 31 does not fit in a signed
         * int - confirm before changing the spelling of any entry.
         */
        IFF_802_1Q_VLAN           = 1 << 0,
        IFF_EBRIDGE               = 1 << 1,
        IFF_BONDING               = 1 << 2,
        IFF_ISATAP                = 1 << 3,
        IFF_WAN_HDLC              = 1 << 4,
        IFF_XMIT_DST_RELEASE      = 1 << 5,
        IFF_DONT_BRIDGE           = 1 << 6,
        IFF_DISABLE_NETPOLL       = 1 << 7,
        IFF_MACVLAN_PORT          = 1 << 8,
        IFF_BRIDGE_PORT           = 1 << 9,
        IFF_OVS_DATAPATH          = 1 << 10,
        IFF_TX_SKB_SHARING        = 1 << 11,
        IFF_UNICAST_FLT           = 1 << 12,
        IFF_TEAM_PORT             = 1 << 13,
        IFF_SUPP_NOFCS            = 1 << 14,
        IFF_LIVE_ADDR_CHANGE      = 1 << 15,
        IFF_MACVLAN               = 1 << 16,
        IFF_XMIT_DST_RELEASE_PERM = 1 << 17,
        IFF_L3MDEV_MASTER         = 1 << 18,
        IFF_NO_QUEUE              = 1 << 19,
        IFF_OPENVSWITCH           = 1 << 20,
        IFF_L3MDEV_SLAVE          = 1 << 21,
        IFF_TEAM                  = 1 << 22,
        IFF_RXFH_CONFIGURED       = 1 << 23,
        IFF_PHONY_HEADROOM        = 1 << 24,
        IFF_MACSEC                = 1 << 25,
        IFF_NO_RX_HANDLER         = 1 << 26,
        IFF_FAILOVER              = 1 << 27,
        IFF_FAILOVER_SLAVE        = 1 << 28,
        IFF_L3MDEV_RX_HANDLER     = 1 << 29,
        IFF_NO_ADDRCONF           = BIT_ULL(30),
        IFF_TX_SKB_NO_LINEAR      = BIT_ULL(31),
};

/*
 * Discriminator for the struct net_device::ml_priv pointer: records which
 * kind of data that pointer currently refers to.
 */
enum netdev_ml_priv_type {
        ML_PRIV_NONE = 0,
        ML_PRIV_CAN  = 1,
};

enum netdev_stat_type {
        /* No per-CPU statistics type selected */
        NETDEV_PCPU_STAT_NONE   = 0,
        /* struct pcpu_lstats */
        NETDEV_PCPU_STAT_LSTATS = 1,
        /* struct pcpu_sw_netstats */
        NETDEV_PCPU_STAT_TSTATS = 2,
        /* struct pcpu_dstats */
        NETDEV_PCPU_STAT_DSTATS = 3,
};

enum netdev_reg_state {
        /* Initial state */
        NETREG_UNINITIALIZED = 0,
        /* completed register_netdevice */
        NETREG_REGISTERED    = 1,
        /* called unregister_netdevice */
        NETREG_UNREGISTERING = 2,
        /* completed unregister todo */
        NETREG_UNREGISTERED  = 3,
        /* called free_netdev */
        NETREG_RELEASED      = 4,
        /* dummy device for NAPI poll */
        NETREG_DUMMY         = 5,
};

/**
 *        struct net_device - The DEVICE structure.
 *
 *        Actually, this whole structure is a big mistake.  It mixes I/O
 *        data with strictly "high-level" data, and it has to know about
 *        almost every data structure used in the INET module.
 *
 *        @priv_flags:        flags invisible to userspace defined as bits, see
 *                        enum netdev_priv_flags for the definitions
 *        @lltx:                device supports lockless Tx. Deprecated for real HW
 *                        drivers. Mainly used by logical interfaces, such as
 *                        bonding and tunnels
 *
 *        @name:        This is the first field of the "visible" part of this structure
 *                (i.e. as seen by users in the "Space.c" file).  It is the name
 *                of the interface.
 *
 *        @name_node:        Name hashlist node
 *        @ifalias:        SNMP alias
 *        @mem_end:        Shared memory end
 *        @mem_start:        Shared memory start
 *        @base_addr:        Device I/O address
 *        @irq:                Device IRQ number
 *
 *        @state:                Generic network queuing layer state, see netdev_state_t
 *        @dev_list:        The global list of network devices
 *        @napi_list:        List entry used for polling NAPI devices
 *        @unreg_list:        List entry when we are unregistering the
 *                        device; see the function unregister_netdev
 *        @close_list:        List entry used when we are closing the device
 *        @ptype_all:     Device-specific packet handlers for all protocols
 *        @ptype_specific: Device-specific, protocol-specific packet handlers
 *
 *        @adj_list:        Directly linked devices, like slaves for bonding
 *        @features:        Currently active device features
 *        @hw_features:        User-changeable features
 *
 *        @wanted_features:        User-requested features
 *        @vlan_features:                Mask of features inheritable by VLAN devices
 *
 *        @hw_enc_features:        Mask of features inherited by encapsulating devices
 *                                This field indicates what encapsulation
 *                                offloads the hardware is capable of doing,
 *                                and drivers will need to set them appropriately.
 *
 *        @mpls_features:        Mask of features inheritable by MPLS
 *        @gso_partial_features: value(s) from NETIF_F_GSO\*
 *
 *        @ifindex:        interface index
 *        @group:                The group the device belongs to
 *
 *        @stats:                Statistics struct, which was left as a legacy, use
 *                        rtnl_link_stats64 instead
 *
 *        @core_stats:        core networking counters,
 *                        do not use this in drivers
 *        @carrier_up_count:        Number of times the carrier has been up
 *        @carrier_down_count:        Number of times the carrier has been down
 *
 *        @wireless_handlers:        List of functions to handle Wireless Extensions,
 *                                instead of ioctl,
 *                                see <net/iw_handler.h> for details.
 *
 *        @netdev_ops:        Includes several pointers to callbacks,
 *                        if one wants to override the ndo_*() functions
 *        @xdp_metadata_ops:        Includes pointers to XDP metadata callbacks.
 *        @xsk_tx_metadata_ops:        Includes pointers to AF_XDP TX metadata callbacks.
 *        @ethtool_ops:        Management operations
 *        @l3mdev_ops:        Layer 3 master device operations
 *        @ndisc_ops:        Includes callbacks for different IPv6 neighbour
 *                        discovery handling. Necessary for e.g. 6LoWPAN.
 *        @xfrmdev_ops:        Transformation offload operations
 *        @tlsdev_ops:        Transport Layer Security offload operations
 *        @header_ops:        Includes callbacks for creating, parsing, caching, etc.
 *                        of Layer 2 headers.
 *
 *        @flags:                Interface flags (a la BSD)
 *        @xdp_features:        XDP capability supported by the device
 *        @gflags:        Global flags ( kept as legacy )
 *        @priv_len:        Size of the ->priv flexible array
 *        @priv:                Flexible array containing private data
 *        @operstate:        RFC2863 operstate
 *        @link_mode:        Mapping policy to operstate
 *        @if_port:        Selectable AUI, TP, ...
 *        @dma:                DMA channel
 *        @mtu:                Interface MTU value
 *        @min_mtu:        Interface Minimum MTU value
 *        @max_mtu:        Interface Maximum MTU value
 *        @type:                Interface hardware type
 *        @hard_header_len: Maximum hardware header length.
 *        @min_header_len:  Minimum hardware header length
 *
 *        @needed_headroom: Extra headroom the hardware may need, but not in all
 *                          cases can this be guaranteed
 *        @needed_tailroom: Extra tailroom the hardware may need, but not in all
 *                          cases can this be guaranteed. Some cases also use
 *                          LL_MAX_HEADER instead to allocate the skb
 *
 *        interface address info:
 *
 *         @perm_addr:                Permanent hw address
 *         @addr_assign_type:        Hw address assignment type
 *         @addr_len:                Hardware address length
 *        @upper_level:                Maximum depth level of upper devices.
 *        @lower_level:                Maximum depth level of lower devices.
 *        @neigh_priv_len:        Used in neigh_alloc()
 *         @dev_id:                Used to differentiate devices that share
 *                                 the same link layer address
 *         @dev_port:                Used to differentiate devices that share
 *                                 the same function
 *        @addr_list_lock:        XXX: need comments on this one
 *        @name_assign_type:        network interface name assignment type
 *        @uc_promisc:                Counter that indicates promiscuous mode
 *                                has been enabled due to the need to listen to
 *                                additional unicast addresses in a device that
 *                                does not implement ndo_set_rx_mode()
 *        @uc:                        unicast mac addresses
 *        @mc:                        multicast mac addresses
 *        @dev_addrs:                list of device hw addresses
 *        @queues_kset:                Group of all Kobjects in the Tx and RX queues
 *        @promiscuity:                Number of times the NIC is told to work in
 *                                promiscuous mode; if it becomes 0 the NIC will
 *                                exit promiscuous mode
 *        @allmulti:                Counter, enables or disables allmulticast mode
 *
 *        @vlan_info:        VLAN info
 *        @dsa_ptr:        dsa specific data
 *        @tipc_ptr:        TIPC specific data
 *        @atalk_ptr:        AppleTalk link
 *        @ip_ptr:        IPv4 specific data
 *        @ip6_ptr:        IPv6 specific data
 *        @ax25_ptr:        AX.25 specific data
 *        @ieee80211_ptr:        IEEE 802.11 specific data, assign before registering
 *        @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
 *                         device struct
 *        @mpls_ptr:        mpls_dev struct pointer
 *        @mctp_ptr:        MCTP specific data
 *
 *        @dev_addr:        Hw address (before bcast,
 *                        because most packets are unicast)
 *
 *        @_rx:                        Array of RX queues
 *        @num_rx_queues:                Number of RX queues
 *                                allocated at register_netdev() time
 *        @real_num_rx_queues:         Number of RX queues currently active in device
 *        @xdp_prog:                XDP sockets filter program pointer
 *
 *        @rx_handler:                handler for received packets
 *        @rx_handler_data:         XXX: need comments on this one
 *        @tcx_ingress:                BPF & clsact qdisc specific data for ingress processing
 *        @ingress_queue:                XXX: need comments on this one
 *        @nf_hooks_ingress:        netfilter hooks executed for ingress packets
 *        @broadcast:                hw bcast address
 *
 *        @rx_cpu_rmap:        CPU reverse-mapping for RX completion interrupts,
 *                        indexed by RX queue number. Assigned by driver.
 *                        This must only be set if the ndo_rx_flow_steer
 *                        operation is defined
 *        @index_hlist:                Device index hash chain
 *
 *        @_tx:                        Array of TX queues
 *        @num_tx_queues:                Number of TX queues allocated at alloc_netdev_mq() time
 *        @real_num_tx_queues:         Number of TX queues currently active in device
 *        @qdisc:                        Root qdisc from userspace point of view
 *        @tx_queue_len:                Max frames per queue allowed
 *        @tx_global_lock:         XXX: need comments on this one
 *        @xdp_bulkq:                XDP device bulk queue
 *        @xps_maps:                all CPUs/RXQs maps for XPS device
 *
 *        @tcx_egress:                BPF & clsact qdisc specific data for egress processing
 *        @nf_hooks_egress:        netfilter hooks executed for egress packets
 *        @qdisc_hash:                qdisc hash table
 *        @watchdog_timeo:        Represents the timeout that is used by
 *                                the watchdog (see dev_watchdog())
 *        @watchdog_timer:        Timer used by the watchdog (see dev_watchdog())
 *
 *        @proto_down_reason:        reason a netdev interface is held down
 *        @pcpu_refcnt:                Number of references to this device
 *        @dev_refcnt:                Number of references to this device
 *        @refcnt_tracker:        Tracker directory for tracked references to this device
 *        @todo_list:                Delayed register/unregister
 *        @link_watch_list:        XXX: need comments on this one
 *
 *        @reg_state:                Register/unregister state machine
 *        @dismantle:                Device is going to be freed
 *        @rtnl_link_state:        This enum represents the phases of creating
 *                                a new link
 *
 *        @needs_free_netdev:        Should unregister perform free_netdev?
 *        @priv_destructor:        Called from unregister
 *        @npinfo:                XXX: need comments on this one
 *         @nd_net:                Network namespace this network device is inside
 *
 *         @ml_priv:        Mid-layer private
 *        @ml_priv_type:  Mid-layer private type
 *
 *        @pcpu_stat_type:        Type of device statistics which the core should
 *                                allocate/free: none, lstats, tstats, dstats. none
 *                                means the driver is handling statistics allocation/
 *                                freeing internally.
 *        @lstats:                Loopback statistics: packets, bytes
 *        @tstats:                Tunnel statistics: RX/TX packets, RX/TX bytes
 *        @dstats:                Dummy statistics: RX/TX/drop packets, RX/TX bytes
 *
 *        @garp_port:        GARP
 *        @mrp_port:        MRP
 *
 *        @dm_private:        Drop monitor private
 *
 *        @dev:                Class/net/name entry
 *        @sysfs_groups:        Space for optional device, statistics and wireless
 *                        sysfs groups
 *
 *        @sysfs_rx_queue_group:        Space for optional per-rx queue attributes
 *        @rtnl_link_ops:        Rtnl_link_ops
 *        @stat_ops:        Optional ops for queue-aware statistics
 *        @queue_mgmt_ops:        Optional ops for queue management
 *
 *        @gso_max_size:        Maximum size of generic segmentation offload
 *        @tso_max_size:        Device (as in HW) limit on the max TSO request size
 *        @gso_max_segs:        Maximum number of segments that can be passed to the
 *                        NIC for GSO
 *        @tso_max_segs:        Device (as in HW) limit on the max TSO segment count
 *         @gso_ipv4_max_size:        Maximum size of generic segmentation offload,
 *                                 for IPv4.
 *
 *        @dcbnl_ops:        Data Center Bridging netlink ops
 *        @num_tc:        Number of traffic classes in the net device
 *        @tc_to_txq:        XXX: need comments on this one
 *        @prio_tc_map:        XXX: need comments on this one
 *
 *        @fcoe_ddp_xid:        Max exchange id for FCoE LRO by ddp
 *
 *        @priomap:        XXX: need comments on this one
 *        @link_topo:        Physical link topology tracking attached PHYs
 *        @phydev:        Physical device may attach itself
 *                        for hardware timestamping
 *        @sfp_bus:        attached &struct sfp_bus structure.
 *
 *        @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
 *
 *        @proto_down:        protocol port state information can be sent to the
 *                        switch driver and used to set the phys state of the
 *                        switch port.
 *
 *        @threaded:        napi threaded mode is enabled
 *
 *        @irq_affinity_auto: driver wants the core to store and re-assign the IRQ
 *                            affinity. Set by netif_enable_irq_affinity(), then
 *                            the driver must create a persistent napi by
 *                            netif_napi_add_config() and finally bind the napi to
 *                            IRQ (via netif_napi_set_irq()).
 *
 *        @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap.
 *                           Set by calling netif_enable_cpu_rmap().
 *
 *        @see_all_hwtstamp_requests: device wants to see calls to
 *                        ndo_hwtstamp_set() for all timestamp requests
 *                        regardless of source, even if those aren't
 *                        HWTSTAMP_SOURCE_NETDEV
 *        @change_proto_down: device supports setting carrier via IFLA_PROTO_DOWN
 *        @netns_immutable: interface can't change network namespaces
 *        @fcoe_mtu:        device supports maximum FCoE MTU, 2158 bytes
 *
 *        @net_notifier_list:        List of per-net netdev notifier block
 *                                that follow this device when it is moved
 *                                to another network namespace.
 *
 *        @macsec_ops:    MACsec offloading ops
 *
 *        @udp_tunnel_nic_info:        static structure describing the UDP tunnel
 *                                offload capabilities of the device
 *        @udp_tunnel_nic:        UDP tunnel offload state
 *        @ethtool:        ethtool related state
 *        @xdp_state:                stores info on attached XDP BPF programs
 *
 *        @nested_level:        Used as a parameter of spin_lock_nested() of
 *                        dev->addr_list_lock.
 *        @unlink_list:        As netif_addr_lock() can be called recursively,
 *                        keep a list of interfaces to be deleted.
 *        @gro_max_size:        Maximum size of aggregated packet in generic
 *                        receive offload (GRO)
 *         @gro_ipv4_max_size:        Maximum size of aggregated packet in generic
 *                                 receive offload (GRO), for IPv4.
 *        @xdp_zc_max_segs:        Maximum number of segments supported by AF_XDP
 *                                zero copy driver
 *
 *        @dev_addr_shadow:        Copy of @dev_addr to catch direct writes.
 *        @linkwatch_dev_tracker:        refcount tracker used by linkwatch.
 *        @watchdog_dev_tracker:        refcount tracker used by watchdog.
 *        @dev_registered_tracker:        tracker for reference held while
 *                                        registered
 *        @offload_xstats_l3:        L3 HW stats for this netdevice.
 *
 *        @devlink_port:        Pointer to related devlink port structure.
 *                        Assigned by a driver before netdev registration using
 *                        SET_NETDEV_DEVLINK_PORT macro. This pointer is static
 *                        during the time netdevice is registered.
 *
 *        @dpll_pin: Pointer to the SyncE source pin of a DPLL subsystem,
 *                   where the clock is recovered.
 *
 *        @max_pacing_offload_horizon: max EDT offload horizon in nsec.
 *        @napi_config: An array of napi_config structures containing per-NAPI
 *                      settings.
 *        @gro_flush_timeout:        timeout for GRO layer in NAPI
 *        @napi_defer_hard_irqs:        If not zero, provides a counter that would
 *                                allow to avoid NIC hard IRQ, on busy queues.
 *
 *        @neighbours:        List heads pointing to this device's neighbours'
 *                        dev_list, one per address-family.
 *        @hwprov: Tracks which PTP performs hardware packet time stamping.
 *
 *        FIXME: cleanup struct net_device such that network protocol info
 *        moves out.
 */

struct net_device {
        /* Cacheline organization can be found documented in
         * Documentation/networking/net_cachelines/net_device.rst.
         * Please update the document when adding new fields.
         */

        /* TX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_tx);
        struct_group(priv_flags_fast,
                unsigned long                priv_flags:32;
                unsigned long                lltx:1;
        );
        const struct net_device_ops *netdev_ops;
        const struct header_ops *header_ops;
        struct netdev_queue        *_tx;
        netdev_features_t        gso_partial_features;
        unsigned int                real_num_tx_queues;
        unsigned int                gso_max_size;
        unsigned int                gso_ipv4_max_size;
        u16                        gso_max_segs;
        /* Signed: netdev_get_sb_channel() returns -num_tc when num_tc is
         * negative, and 0 otherwise.
         */
        s16                        num_tc;
        /* Note : dev->mtu is often read without holding a lock.
         * Writers usually hold RTNL.
         * It is recommended to use READ_ONCE() to annotate the reads,
         * and to use WRITE_ONCE() to annotate the writes.
         */
        unsigned int                mtu;
        unsigned short                needed_headroom;
        struct netdev_tc_txq        tc_to_txq[TC_MAX_QUEUE];
#ifdef CONFIG_XPS
        struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        struct nf_hook_entries __rcu *nf_hooks_egress;
#endif
#ifdef CONFIG_NET_XGRESS
        struct bpf_mprog_entry __rcu *tcx_egress;
#endif
        __cacheline_group_end(net_device_read_tx);

        /* TXRX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_txrx);
        /* Per-CPU statistics pointer; the three members share storage.
         * The kind of statistics the core allocates/frees is described by
         * @pcpu_stat_type (none, lstats, tstats, dstats).
         */
        union {
                struct pcpu_lstats __percpu                *lstats;
                struct pcpu_sw_netstats __percpu        *tstats;
                struct pcpu_dstats __percpu                *dstats;
        };
        unsigned long                state;
        unsigned int                flags;
        unsigned short                hard_header_len;
        netdev_features_t        features;
        struct inet6_dev __rcu        *ip6_ptr;
        __cacheline_group_end(net_device_read_txrx);

        /* RX read-mostly hotpath */
        __cacheline_group_begin(net_device_read_rx);
        struct bpf_prog __rcu        *xdp_prog;
        struct list_head        ptype_specific;
        int                        ifindex;
        unsigned int                real_num_rx_queues;
        struct netdev_rx_queue        *_rx;
        unsigned int                gro_max_size;
        unsigned int                gro_ipv4_max_size;
        rx_handler_func_t __rcu        *rx_handler;
        void __rcu                *rx_handler_data;
        possible_net_t                        nd_net;
#ifdef CONFIG_NETPOLL
        struct netpoll_info __rcu        *npinfo;
#endif
#ifdef CONFIG_NET_XGRESS
        struct bpf_mprog_entry __rcu *tcx_ingress;
#endif
        __cacheline_group_end(net_device_read_rx);

        /* Members from here on are outside the three cacheline groups above. */
        char                        name[IFNAMSIZ];
        struct netdev_name_node        *name_node;
        struct dev_ifalias        __rcu *ifalias;
        /*
         *        I/O specific fields
         *        FIXME: Merge these and struct ifmap into one
         */
        unsigned long                mem_end;
        unsigned long                mem_start;
        unsigned long                base_addr;

        /*
         *        Some hardware also needs these fields (state,dev_list,
         *        napi_list,unreg_list,close_list) but they are not
         *        part of the usual set specified in Space.c.
         */


        struct list_head        dev_list;
        struct list_head        napi_list;
        struct list_head        unreg_list;
        struct list_head        close_list;
        struct list_head        ptype_all;

        struct {
                struct list_head upper;
                struct list_head lower;
        } adj_list;

        /* Read-mostly cache-line for fast-path access */
        xdp_features_t                xdp_features;
        const struct xdp_metadata_ops *xdp_metadata_ops;
        const struct xsk_tx_metadata_ops *xsk_tx_metadata_ops;
        unsigned short                gflags;

        unsigned short                needed_tailroom;

        netdev_features_t        hw_features;
        netdev_features_t        wanted_features;
        netdev_features_t        vlan_features;
        netdev_features_t        hw_enc_features;
        netdev_features_t        mpls_features;

        unsigned int                min_mtu;
        unsigned int                max_mtu;
        unsigned short                type;
        unsigned char                min_header_len;
        unsigned char                name_assign_type;

        int                        group;

        struct net_device_stats        stats; /* not used by modern drivers */

        struct net_device_core_stats __percpu *core_stats;

        /* Stats to monitor link on/off, flapping */
        atomic_t                carrier_up_count;
        atomic_t                carrier_down_count;

#ifdef CONFIG_WIRELESS_EXT
        const struct iw_handler_def *wireless_handlers;
#endif
        const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_L3_MASTER_DEV
        const struct l3mdev_ops        *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
        const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM_OFFLOAD
        const struct xfrmdev_ops *xfrmdev_ops;
#endif

#if IS_ENABLED(CONFIG_TLS_DEVICE)
        const struct tlsdev_ops *tlsdev_ops;
#endif

        unsigned int                operstate;
        unsigned char                link_mode;

        unsigned char                if_port;
        unsigned char                dma;

        /* Interface address info. */
        unsigned char                perm_addr[MAX_ADDR_LEN];
        unsigned char                addr_assign_type;
        unsigned char                addr_len;
        unsigned char                upper_level;
        unsigned char                lower_level;

        unsigned short                neigh_priv_len;
        unsigned short          dev_id;
        unsigned short          dev_port;
        int                        irq;
        /* Element count of the trailing @priv array (see __counted_by below). */
        u32                        priv_len;

        spinlock_t                addr_list_lock;

        struct netdev_hw_addr_list        uc;
        struct netdev_hw_addr_list        mc;
        struct netdev_hw_addr_list        dev_addrs;

#ifdef CONFIG_SYSFS
        struct kset                *queues_kset;
#endif
#ifdef CONFIG_LOCKDEP
        struct list_head        unlink_list;
#endif
        unsigned int                promiscuity;
        unsigned int                allmulti;
        bool                        uc_promisc;
#ifdef CONFIG_LOCKDEP
        unsigned char                nested_level;
#endif


        /* Protocol-specific pointers */
        struct in_device __rcu        *ip_ptr;
        /** @fib_nh_head: nexthops associated with this netdev */
        struct hlist_head        fib_nh_head;

#if IS_ENABLED(CONFIG_VLAN_8021Q)
        struct vlan_info __rcu        *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
        struct dsa_port                *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
        struct tipc_bearer __rcu *tipc_ptr;
#endif
#if IS_ENABLED(CONFIG_ATALK)
        void                         *atalk_ptr;
#endif
#if IS_ENABLED(CONFIG_AX25)
        struct ax25_dev        __rcu        *ax25_ptr;
#endif
#if IS_ENABLED(CONFIG_CFG80211)
        struct wireless_dev        *ieee80211_ptr;
#endif
#if IS_ENABLED(CONFIG_IEEE802154) || IS_ENABLED(CONFIG_6LOWPAN)
        struct wpan_dev                *ieee802154_ptr;
#endif
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
        struct mpls_dev __rcu        *mpls_ptr;
#endif
#if IS_ENABLED(CONFIG_MCTP)
        struct mctp_dev __rcu        *mctp_ptr;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
        /* Interface address info used in eth_type_trans() */
        const unsigned char        *dev_addr;

        unsigned int                num_rx_queues;
/* The GRO_* macros below are plain preprocessor constants that happen to be
 * defined inside the struct body; they are not struct members.
 */
#define GRO_LEGACY_MAX_SIZE        65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
 */
#define GRO_MAX_SIZE                (8 * 65535u)
        unsigned int                xdp_zc_max_segs;
        struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
        struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif

        unsigned char                broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap                *rx_cpu_rmap;
#endif
        struct hlist_node        index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
        unsigned int                num_tx_queues;
        struct Qdisc __rcu        *qdisc;
        unsigned int                tx_queue_len;
        spinlock_t                tx_global_lock;

        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;

#ifdef CONFIG_NET_SCHED
        DECLARE_HASHTABLE        (qdisc_hash, 4);
#endif
        /* These may be needed for future network-power-down code. */
        struct timer_list        watchdog_timer;
        int                        watchdog_timeo;

        u32                     proto_down_reason;

        struct list_head        todo_list;

        /* Reference count: a per-CPU counter when CONFIG_PCPU_DEV_REFCNT is
         * set, a single refcount_t otherwise.
         */
#ifdef CONFIG_PCPU_DEV_REFCNT
        int __percpu                *pcpu_refcnt;
#else
        refcount_t                dev_refcnt;
#endif
        struct ref_tracker_dir        refcnt_tracker;

        struct list_head        link_watch_list;

        /* Register/unregister state machine; compared against
         * NETREG_UNINITIALIZED by SET_NETDEV_DEVLINK_PORT().
         */
        u8 reg_state;

        /* Device is going to be freed. */
        bool dismantle;

        /* Phase of creating a new link. */
        enum {
                RTNL_LINK_INITIALIZED,
                RTNL_LINK_INITIALIZING,
        } rtnl_link_state:16;

        bool needs_free_netdev;
        void (*priv_destructor)(struct net_device *dev);

        /* mid-layer private */
        void                                *ml_priv;
        /* Tag for @ml_priv; netdev_get_ml_priv() returns NULL on a mismatch. */
        enum netdev_ml_priv_type        ml_priv_type;

        enum netdev_stat_type                pcpu_stat_type:8;

#if IS_ENABLED(CONFIG_GARP)
        struct garp_port __rcu        *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
        struct mrp_port __rcu        *mrp_port;
#endif
#if IS_ENABLED(CONFIG_NET_DROP_MONITOR)
        struct dm_hw_stat_delta __rcu *dm_private;
#endif
        /* Embedded device; to_net_dev() maps it back to this net_device. */
        struct device                dev;
        const struct attribute_group *sysfs_groups[4];
        const struct attribute_group *sysfs_rx_queue_group;

        const struct rtnl_link_ops *rtnl_link_ops;

        const struct netdev_stat_ops *stat_ops;

        const struct netdev_queue_mgmt_ops *queue_mgmt_ops;

        /* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SEGS                65535u
#define GSO_LEGACY_MAX_SIZE        65536u
/* TCP minimal MSS is 8 (TCP_MIN_GSO_SIZE),
 * and shinfo->gso_segs is a 16bit field.
 */
#define GSO_MAX_SIZE                (8 * GSO_MAX_SEGS)

#define TSO_LEGACY_MAX_SIZE        65536
#define TSO_MAX_SIZE                UINT_MAX
        unsigned int                tso_max_size;
#define TSO_MAX_SEGS                U16_MAX
        u16                        tso_max_segs;

#ifdef CONFIG_DCB
        const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
        /* Priority -> traffic class map, indexed by (prio & TC_BITMASK);
         * accessed through netdev_get_prio_tc_map() and
         * netdev_set_prio_tc_map().
         */
        u8                        prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
        unsigned int                fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
        struct netprio_map __rcu *priomap;
#endif
        struct phy_link_topology        *link_topo;
        struct phy_device        *phydev;
        struct sfp_bus                *sfp_bus;
        struct lock_class_key        *qdisc_tx_busylock;
        bool                        proto_down;
        bool                        threaded;
        bool                        irq_affinity_auto;
        bool                        rx_cpu_rmap_auto;

        /* priv_flags_slow, ungrouped to save space */
        unsigned long                see_all_hwtstamp_requests:1;
        unsigned long                change_proto_down:1;
        unsigned long                netns_immutable:1;
        unsigned long                fcoe_mtu:1;

        struct list_head        net_notifier_list;

#if IS_ENABLED(CONFIG_MACSEC)
        /* MACsec management functions */
        const struct macsec_ops *macsec_ops;
#endif
        const struct udp_tunnel_nic_info        *udp_tunnel_nic_info;
        struct udp_tunnel_nic        *udp_tunnel_nic;

        /** @cfg: net_device queue-related configuration */
        struct netdev_config        *cfg;
        /**
         * @cfg_pending: same as @cfg but when device is being actively
         *        reconfigured includes any changes to the configuration
         *        requested by the user, but which may or may not be rejected.
         */
        struct netdev_config        *cfg_pending;
        struct ethtool_netdev_state *ethtool;

        /* protected by rtnl_lock */
        struct bpf_xdp_entity        xdp_state[__MAX_XDP_MODE];

        u8 dev_addr_shadow[MAX_ADDR_LEN];
        netdevice_tracker        linkwatch_dev_tracker;
        netdevice_tracker        watchdog_dev_tracker;
        netdevice_tracker        dev_registered_tracker;
        struct rtnl_hw_stats64        *offload_xstats_l3;

        struct devlink_port        *devlink_port;

#if IS_ENABLED(CONFIG_DPLL)
        struct dpll_pin        __rcu        *dpll_pin;
#endif
#if IS_ENABLED(CONFIG_PAGE_POOL)
        /** @page_pools: page pools created for this netdevice */
        struct hlist_head        page_pools;
#endif

        /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
        struct dim_irq_moder        *irq_moder;

        u64                        max_pacing_offload_horizon;
        /* Array of per-NAPI settings; netif_napi_add_config_locked() points
         * napi->config at the element selected by its index argument.
         */
        struct napi_config        *napi_config;
        unsigned long                gro_flush_timeout;
        u32                        napi_defer_hard_irqs;

        /**
         * @up: copy of @state's IFF_UP, but safe to read with just @lock.
         *        May report false negatives while the device is being opened
         *        or closed (@lock does not protect .ndo_open, or .ndo_close).
         */
        bool                        up;

        /**
         * @request_ops_lock: request the core to run all @netdev_ops and
         * @ethtool_ops under the @lock.
         */
        bool                        request_ops_lock;

        /**
         * @lock: netdev-scope lock, protects a small selection of fields.
         * Should always be taken using netdev_lock() / netdev_unlock() helpers.
         * Drivers are free to use it for other protection.
         *
         * For the drivers that implement shaper or queue API, the scope
         * of this lock is expanded to cover most ndo/queue/ethtool/sysfs
         * operations. Drivers may opt-in to this behavior by setting
         * @request_ops_lock.
         *
         * @lock protection mixes with rtnl_lock in multiple ways, fields are
         * either:
         *
         * - simply protected by the instance @lock;
         *
         * - double protected - writers hold both locks, readers hold either;
         *
         * - ops protected - protected by the lock held around the NDOs
         *   and other callbacks, that is the instance lock on devices for
         *   which netdev_need_ops_lock() returns true, otherwise by rtnl_lock;
         *
         * - double ops protected - always protected by rtnl_lock but for
         *   devices for which netdev_need_ops_lock() returns true - also
         *   the instance lock.
         *
         * Simply protects:
         *        @gro_flush_timeout, @napi_defer_hard_irqs, @napi_list,
         *        @net_shaper_hierarchy, @reg_state, @threaded
         *
         * Double protects:
         *        @up
         *
         * Double ops protects:
         *        @real_num_rx_queues, @real_num_tx_queues
         *
         * Also protects some fields in:
         *        struct napi_struct, struct netdev_queue, struct netdev_rx_queue
         *
         * Ordering: take after rtnl_lock.
         */
        struct mutex                lock;

#if IS_ENABLED(CONFIG_NET_SHAPER)
        /**
         * @net_shaper_hierarchy: data tracking the current shaper status
         *  see include/net/net_shapers.h
         */
        struct net_shaper_hierarchy *net_shaper_hierarchy;
#endif

        struct hlist_head neighbours[NEIGH_NR_TABLES];

        struct hwtstamp_provider __rcu        *hwprov;

        /* Private data area returned by netdev_priv(): a cacheline-aligned
         * flexible array annotated with __counted_by(priv_len).
         */
        u8                        priv[] ____cacheline_aligned
                                       __counted_by(priv_len);
} ____cacheline_aligned;
/* Map a pointer to the embedded @dev member back to its struct net_device. */
#define to_net_dev(d) container_of(d, struct net_device, dev)

/*
 * Drivers should use this to assign a devlink port instance to a netdevice
 * before registering the netdevice. Therefore devlink_port is static
 * during the netdev lifetime after it is registered.
 *
 * Triggers WARN_ON() if the device's reg_state is not NETREG_UNINITIALIZED
 * at the time of the call; the assignment is performed regardless.
 */
#define SET_NETDEV_DEVLINK_PORT(dev, port)                        \
({                                                                \
        WARN_ON((dev)->reg_state != NETREG_UNINITIALIZED);        \
        ((dev)->devlink_port = (port));                                \
})

/* Tell whether GRO should be skipped for @dev: true when NETIF_F_GRO is not
 * set in dev->features, or when an XDP program is attached (dev->xdp_prog).
 */
static inline bool netif_elide_gro(const struct net_device *dev)
{
        return !(dev->features & NETIF_F_GRO) || dev->xdp_prog;
}

/* NOTE(review): alignment constant whose users are outside this part of the
 * header; presumably a byte count - confirm against the callers.
 */
#define        NETDEV_ALIGN                32

/* Look up the traffic class stored for @prio in dev->prio_tc_map.
 * Only the bits of @prio covered by TC_BITMASK select the table entry.
 */
static inline
int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio)
{
        u32 slot = prio & TC_BITMASK;

        return dev->prio_tc_map[slot];
}

/* Record traffic class @tc for priority @prio in dev->prio_tc_map.
 *
 * Returns 0 on success, or -EINVAL (leaving the map untouched) when @tc is
 * not below dev->num_tc. Both the index and the stored value are masked
 * with TC_BITMASK.
 */
static inline
int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc)
{
        if (tc < dev->num_tc) {
                u8 slot = prio & TC_BITMASK;

                dev->prio_tc_map[slot] = tc & TC_BITMASK;
                return 0;
        }

        return -EINVAL;
}

/* Traffic-class (TC) helpers operating on a net_device.
 * Only declared here; the definitions live outside this header section.
 */
int netdev_txq_to_tc(struct net_device *dev, unsigned int txq);
void netdev_reset_tc(struct net_device *dev);
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset);
int netdev_set_num_tc(struct net_device *dev, u8 num_tc);

/* Return dev->num_tc (a signed 16-bit member) widened to int. */
static inline
int netdev_get_num_tc(struct net_device *dev)
{
        return dev->num_tc;
}

static inline void net_prefetch(void *p)
{
        prefetch(p);
#if L1_CACHE_BYTES < 128
        prefetch((u8 *)p + L1_CACHE_BYTES);
#endif
}

static inline void net_prefetchw(void *p)
{
        prefetchw(p);
#if L1_CACHE_BYTES < 128
        prefetchw((u8 *)p + L1_CACHE_BYTES);
#endif
}

/* Helpers relating a device to an @sb_dev channel and its queues.
 * Only declared here; the definitions live outside this header section.
 * NOTE(review): "sb" presumably stands for "subordinate" - confirm against
 * the definitions.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev);
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset);
int netdev_set_sb_channel(struct net_device *dev, u16 channel);
static inline int netdev_get_sb_channel(struct net_device *dev)
{
        return max_t(int, -dev->num_tc, 0);
}

/* Return a pointer to entry @index of the dev->_tx array.
 * DEBUG_NET_WARN_ON_ONCE() flags an @index that is not below
 * dev->num_tx_queues; the pointer is returned either way.
 */
static inline
struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev,
                                         unsigned int index)
{
        struct netdev_queue *txq;

        DEBUG_NET_WARN_ON_ONCE(index >= dev->num_tx_queues);
        txq = dev->_tx + index;
        return txq;
}

/* Return the TX queue of @dev selected by skb_get_queue_mapping(@skb). */
static inline struct netdev_queue *skb_get_tx_queue(const struct net_device *dev,
                                                    const struct sk_buff *skb)
{
        unsigned int queue = skb_get_queue_mapping(skb);

        return netdev_get_tx_queue(dev, queue);
}

/* Invoke @f once for each of the dev->num_tx_queues entries of dev->_tx, in
 * ascending index order, passing @dev, the queue and the opaque @arg.
 */
static inline void netdev_for_each_tx_queue(struct net_device *dev,
                                            void (*f)(struct net_device *,
                                                      struct netdev_queue *,
                                                      void *),
                                            void *arg)
{
        unsigned int qidx = 0;

        while (qidx < dev->num_tx_queues) {
                f(dev, dev->_tx + qidx, arg);
                qidx++;
        }
}

/* TX queue selection helpers.
 * Only declared here; the definitions live outside this header section.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev);

/* Return the headroom that the master device needs to take into account
 * when forwarding to this dev: 0 when IFF_PHONY_HEADROOM is set in
 * dev->priv_flags, dev->needed_headroom otherwise.
 */
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
        if (dev->priv_flags & IFF_PHONY_HEADROOM)
                return 0;

        return dev->needed_headroom;
}

static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
        if (dev->netdev_ops->ndo_set_rx_headroom)
                dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}

/* Set the device rx headroom to the dev's default.
 * This is requested by passing -1 to netdev_set_rx_headroom().
 */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
        netdev_set_rx_headroom(dev, -1);
}

/* Return dev->ml_priv if it is tagged with @type, NULL otherwise. */
static inline void *netdev_get_ml_priv(struct net_device *dev,
                                       enum netdev_ml_priv_type type)
{
        return dev->ml_priv_type == type ? dev->ml_priv : NULL;
}

static inline void netdev_set_ml_priv(struct net_device *dev,
                                      void *ml_priv,
                                      enum netdev_ml_priv_type type)
{
        WARN(dev->ml_priv_type && dev->ml_priv_type != type,
             "Overwriting already set ml_priv_type (%u) with different ml_priv_type (%u)!\n",
             dev->ml_priv_type, type);
        WARN(!dev->ml_priv_type && dev->ml_priv,
             "Overwriting already set ml_priv and ml_priv_type is ML_PRIV_NONE!\n");

        dev->ml_priv = ml_priv;
        dev->ml_priv_type = type;
}

/*
 * Net namespace inlines
 */
/* Return the network namespace recorded in dev->nd_net, via read_pnet(). */
static inline
struct net *dev_net(const struct net_device *dev)
{
        return read_pnet(&dev->nd_net);
}

/* Same lookup as dev_net(), but reads dev->nd_net with read_pnet_rcu().
 * NOTE(review): presumably intended for callers inside an RCU read-side
 * section - confirm against read_pnet_rcu().
 */
static inline
struct net *dev_net_rcu(const struct net_device *dev)
{
        return read_pnet_rcu(&dev->nd_net);
}

/* Record @net as the network namespace of @dev (dev->nd_net), via
 * write_pnet().
 */
static inline
void dev_net_set(struct net_device *dev, struct net *net)
{
        write_pnet(&dev->nd_net, net);
}

/**
 *        netdev_priv - access network device private data
 *        @dev: network device
 *
 * Get network device private data
 */
static inline void *netdev_priv(const struct net_device *dev)
{
        return (void *)dev->priv;
}

/* Set the sysfs physical device reference for the network logical device.
 * If set prior to registration, it will cause a symlink to be created
 * during initialization. Stores @pdev in (net)->dev.parent.
 */
#define SET_NETDEV_DEV(net, pdev)        ((net)->dev.parent = (pdev))

/* Set the sysfs device type for the network logical device to allow
 * fine-grained identification of different network device types, for
 * example Ethernet, Wireless LAN, Bluetooth, WiMAX, etc.
 * Stores @devtype in (net)->dev.type.
 */
#define SET_NETDEV_DEVTYPE(net, devtype)        ((net)->dev.type = (devtype))

/* NOTE(review): judging by the signature, this associates @napi with queue
 * @queue_index of kind @type on @dev; the definition is outside this header
 * section - confirm there.
 */
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
                          enum netdev_queue_type type,
                          struct napi_struct *napi);

/* Acquire the netdev instance lock (the dev->lock mutex). */
static inline void netdev_lock(struct net_device *dev)
{
        mutex_lock(&dev->lock);
}

/* Release the netdev instance lock (the dev->lock mutex). */
static inline void netdev_unlock(struct net_device *dev)
{
        mutex_unlock(&dev->lock);
}
/* Additional netdev_lock()-related helpers are in net/netdev_lock.h */

/* Variant of netif_napi_set_irq() that does not take the netdev instance
 * lock itself; netif_napi_set_irq() calls it with that lock held.
 */
void netif_napi_set_irq_locked(struct napi_struct *napi, int irq);

/* Call netif_napi_set_irq_locked() for @napi and @irq while holding the
 * instance lock of the device that owns @napi (napi->dev).
 */
static inline void netif_napi_set_irq(struct napi_struct *napi, int irq)
{
        struct net_device *owner = napi->dev;

        netdev_lock(owner);
        netif_napi_set_irq_locked(napi, irq);
        netdev_unlock(owner);
}

/* Default NAPI poll() weight.
 * Device drivers are strongly advised not to use a bigger value.
 * Passed as the weight by netif_napi_add(), netif_napi_add_locked() and
 * netif_napi_add_config_locked().
 */
#define NAPI_POLL_WEIGHT 64

/* Prototype only. The inline netif_napi_add*() helpers in this header
 * funnel into this function; the non-_locked ones call it under
 * netdev_lock().
 */
void netif_napi_add_weight_locked(struct net_device *dev,
                                  struct napi_struct *napi,
                                  int (*poll)(struct napi_struct *, int),
                                  int weight);

/*
 * Locking wrapper: calls netif_napi_add_weight_locked() with the same
 * arguments while holding netdev_lock(@dev).
 */
static inline void
netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
                      int (*poll)(struct napi_struct *, int), int weight)
{
        netdev_lock(dev);
        netif_napi_add_weight_locked(dev, napi, poll, weight);
        netdev_unlock(dev);
}

/**
 * netif_napi_add() - initialize a NAPI context
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * netif_napi_add() must be used to initialize a NAPI context prior to calling
 * *any* of the other NAPI-related functions.
 *
 * Equivalent to netif_napi_add_weight() with a weight of NAPI_POLL_WEIGHT.
 */
static inline void netif_napi_add(struct net_device *dev,
                                  struct napi_struct *napi,
                                  int (*poll)(struct napi_struct *, int))
{
        netif_napi_add_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/*
 * Like netif_napi_add(), but calls netif_napi_add_weight_locked()
 * directly and therefore takes no lock itself.
 */
static inline void netif_napi_add_locked(struct net_device *dev,
                                         struct napi_struct *napi,
                                         int (*poll)(struct napi_struct *, int))
{
        netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/*
 * Add a NAPI context with an explicit @weight after first setting
 * NAPI_STATE_NO_BUSY_POLL in napi->state.
 */
static inline void netif_napi_add_tx_weight(struct net_device *dev,
                                            struct napi_struct *napi,
                                            int (*poll)(struct napi_struct *, int),
                                            int weight)
{
        /* The flag must be in place before the context is added. */
        set_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state);

        netif_napi_add_weight(dev, napi, poll, weight);
}

/*
 * Bind @napi to slot @index of dev->napi_config, record the index in
 * napi->index, then add the context with the default weight. Takes no
 * lock itself; @index is not range-checked here.
 */
static inline void netif_napi_add_config_locked(struct net_device *dev,
                                                struct napi_struct *napi,
                                                int (*poll)(struct napi_struct *, int),
                                                int index)
{
        napi->config = &dev->napi_config[index];
        napi->index = index;

        netif_napi_add_weight_locked(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/**
 * netif_napi_add_config - initialize a NAPI context with persistent config
 * @dev: network device
 * @napi: NAPI context
 * @poll: polling function
 * @index: the NAPI index
 *
 * Calls netif_napi_add_config_locked() while holding netdev_lock(@dev).
 */
static inline void
netif_napi_add_config(struct net_device *dev, struct napi_struct *napi,
                      int (*poll)(struct napi_struct *, int), int index)
{
        netdev_lock(dev);
        netif_napi_add_config_locked(dev, napi, poll, index);
        netdev_unlock(dev);
}

/**
 * netif_napi_add_tx() - initialize a NAPI context to be used for Tx only
 * @dev:  network device
 * @napi: NAPI context
 * @poll: polling function
 *
 * This variant of netif_napi_add() should be used by drivers that use NAPI
 * exclusively to poll a TX queue.
 * It avoids adding the context to napi_hash[], which would pollute that
 * hash table.
 */
static inline void
netif_napi_add_tx(struct net_device *dev, struct napi_struct *napi,
                  int (*poll)(struct napi_struct *, int))
{
        netif_napi_add_tx_weight(dev, napi, poll, NAPI_POLL_WEIGHT);
}

/* Prototype only. __netif_napi_del() calls this under netdev_lock(napi->dev). */
void __netif_napi_del_locked(struct napi_struct *napi);

/**
 *  __netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 * Warning: caller must observe RCU grace period before freeing memory
 * containing @napi. Drivers might want to call this helper to combine
 * all the needed RCU grace periods into a single one.
 *
 * Runs __netif_napi_del_locked() under the lock of napi->dev. Unlike
 * netif_napi_del(), it does not call synchronize_net().
 */
static inline void __netif_napi_del(struct napi_struct *napi)
{
        netdev_lock(napi->dev);
        __netif_napi_del_locked(napi);
        /* napi->dev is read again here, i.e. after the removal call */
        netdev_unlock(napi->dev);
}

/*
 * Remove @napi via __netif_napi_del_locked() and then wait with
 * synchronize_net(). Takes no lock itself.
 * NOTE(review): presumably the caller already holds netdev_lock() - confirm.
 */
static inline void netif_napi_del_locked(struct napi_struct *napi)
{
        __netif_napi_del_locked(napi);
        synchronize_net();
}

/**
 *  netif_napi_del - remove a NAPI context
 *  @napi: NAPI context
 *
 *  netif_napi_del() removes a NAPI context from the network device NAPI list.
 *  It calls __netif_napi_del() (which takes the device lock) and then
 *  synchronize_net(), after the lock has been dropped.
 */
static inline void netif_napi_del(struct napi_struct *napi)
{
        __netif_napi_del(napi);
        synchronize_net();
}

/* Prototypes only; definitions are not part of this header section. */
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs);
void netif_set_affinity_auto(struct net_device *dev);

/*
 * Describes one packet handler. Instances are passed to dev_add_pack() /
 * dev_remove_pack(), which are declared later in this header.
 */
struct packet_type {
        __be16                        type;        /* This is really htons(ether_type). */
        bool                        ignore_outgoing;
        struct net_device        *dev;        /* NULL is wildcarded here             */
        netdevice_tracker        dev_tracker;
        /* per-skb handler; takes two net_device pointers and this packet_type */
        int                        (*func) (struct sk_buff *,
                                         struct net_device *,
                                         struct packet_type *,
                                         struct net_device *);
        /* list-based handler; receives a list_head instead of a single skb */
        void                        (*list_func) (struct list_head *,
                                              struct packet_type *,
                                              struct net_device *);
        /* predicate taking this packet_type and a socket */
        bool                        (*id_match)(struct packet_type *ptype,
                                            struct sock *sk);
        struct net                *af_packet_net;
        void                        *af_packet_priv;
        struct list_head        list;
};

/* Callback table embedded in struct packet_offload (GSO segment, GRO receive/complete). */
struct offload_callbacks {
        struct sk_buff                *(*gso_segment)(struct sk_buff *skb,
                                                netdev_features_t features);
        struct sk_buff                *(*gro_receive)(struct list_head *head,
                                                struct sk_buff *skb);
        int                        (*gro_complete)(struct sk_buff *skb, int nhoff);
};

/*
 * One offload handler entry: protocol @type, a @priority, the callback
 * table and a list linkage. Passed to dev_add_offload() /
 * dev_remove_offload(), which are declared later in this header.
 */
struct packet_offload {
        __be16                         type;        /* This is really htons(ether_type). */
        u16                         priority;
        struct offload_callbacks callbacks;
        struct list_head         list;
};

/* often modified stats are per-CPU, other are shared (netdev->stats)
 *
 * rx/tx packet and byte counters plus the u64_stats sync object that the
 * dev_sw_netstats_*_add() helpers bracket their updates with. The struct
 * is aligned to 4 * sizeof(u64).
 */
struct pcpu_sw_netstats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        struct u64_stats_sync   syncp;
} __aligned(4 * sizeof(u64));

/*
 * Per-CPU counters that additionally track rx/tx drops; updated by the
 * dev_dstats_*() helpers. The struct is aligned to 8 * sizeof(u64).
 */
struct pcpu_dstats {
        u64_stats_t                rx_packets;
        u64_stats_t                rx_bytes;
        u64_stats_t                tx_packets;
        u64_stats_t                tx_bytes;
        u64_stats_t                rx_drops;
        u64_stats_t                tx_drops;
        struct u64_stats_sync        syncp;
} __aligned(8 * sizeof(u64));

/*
 * Per-CPU packet/byte counter pair with no rx/tx split; updated by
 * dev_lstats_add(). The struct is aligned to 2 * sizeof(u64).
 */
struct pcpu_lstats {
        u64_stats_t packets;
        u64_stats_t bytes;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/* Prototype only; results are returned through the @packets and @bytes pointers. */
void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes);

/*
 * Account one received packet of @len bytes in this CPU's dev->tstats
 * entry: rx_bytes += @len, rx_packets += 1, inside a u64_stats update
 * section.
 */
static inline void dev_sw_netstats_rx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_sw_netstats *stats;

        stats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->rx_bytes, len);
        u64_stats_inc(&stats->rx_packets);
        u64_stats_update_end(&stats->syncp);
}

/*
 * Account @packets transmitted packets totalling @len bytes in this CPU's
 * dev->tstats entry, inside a u64_stats update section.
 */
static inline void
dev_sw_netstats_tx_add(struct net_device *dev, unsigned int packets,
                       unsigned int len)
{
        struct pcpu_sw_netstats *stats;

        stats = this_cpu_ptr(dev->tstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->tx_bytes, len);
        u64_stats_add(&stats->tx_packets, packets);
        u64_stats_update_end(&stats->syncp);
}

/*
 * Account one packet of @len bytes in this CPU's dev->lstats entry:
 * bytes += @len, packets += 1, inside a u64_stats update section.
 */
static inline void dev_lstats_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_lstats *stats;

        stats = this_cpu_ptr(dev->lstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_add(&stats->bytes, len);
        u64_stats_inc(&stats->packets);
        u64_stats_update_end(&stats->syncp);
}

/*
 * Account one received packet of @len bytes in this CPU's dev->dstats
 * entry: rx_packets += 1, rx_bytes += @len.
 */
static inline void dev_dstats_rx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->rx_packets);
        u64_stats_add(&stats->rx_bytes, len);
        u64_stats_update_end(&stats->syncp);
}

/* Bump the rx_drops counter of this CPU's dev->dstats entry by one. */
static inline void dev_dstats_rx_dropped(struct net_device *dev)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->rx_drops);
        u64_stats_update_end(&stats->syncp);
}

/*
 * Account one transmitted packet of @len bytes in this CPU's dev->dstats
 * entry: tx_packets += 1, tx_bytes += @len.
 */
static inline void dev_dstats_tx_add(struct net_device *dev, unsigned int len)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->tx_packets);
        u64_stats_add(&stats->tx_bytes, len);
        u64_stats_update_end(&stats->syncp);
}

/* Bump the tx_drops counter of this CPU's dev->dstats entry by one. */
static inline void dev_dstats_tx_dropped(struct net_device *dev)
{
        struct pcpu_dstats *stats;

        stats = this_cpu_ptr(dev->dstats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->tx_drops);
        u64_stats_update_end(&stats->syncp);
}

/*
 * Allocate a per-CPU object of @type with alloc_percpu_gfp(@type, @gfp).
 * On success, u64_stats_init() is run on the ->syncp member of every
 * possible CPU's copy, so @type must have a member named syncp.
 * The statement expression evaluates to the per-CPU pointer, i.e. to
 * whatever alloc_percpu_gfp() returned.
 */
#define __netdev_alloc_pcpu_stats(type, gfp)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = alloc_percpu_gfp(type, gfp);\
        if (pcpu_stats)        {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* __netdev_alloc_pcpu_stats() with the allocation flags fixed to GFP_KERNEL. */
#define netdev_alloc_pcpu_stats(type)                                        \
        __netdev_alloc_pcpu_stats(type, GFP_KERNEL)

/*
 * Same as __netdev_alloc_pcpu_stats(), except that the per-CPU memory
 * comes from devm_alloc_percpu(@dev, @type). On success every possible
 * CPU's ->syncp is initialised with u64_stats_init(); the expression
 * evaluates to the per-CPU pointer returned by the allocator.
 */
#define devm_netdev_alloc_pcpu_stats(dev, type)                                \
({                                                                        \
        typeof(type) __percpu *pcpu_stats = devm_alloc_percpu(dev, type);\
        if (pcpu_stats) {                                                \
                int __cpu;                                                \
                for_each_possible_cpu(__cpu) {                                \
                        typeof(type) *stat;                                \
                        stat = per_cpu_ptr(pcpu_stats, __cpu);                \
                        u64_stats_init(&stat->syncp);                        \
                }                                                        \
        }                                                                \
        pcpu_stats;                                                        \
})

/* Transmit policy of a LAG upper device; stored in netdev_lag_upper_info.tx_type. */
enum netdev_lag_tx_type {
        NETDEV_LAG_TX_TYPE_UNKNOWN,
        NETDEV_LAG_TX_TYPE_RANDOM,
        NETDEV_LAG_TX_TYPE_BROADCAST,
        NETDEV_LAG_TX_TYPE_ROUNDROBIN,
        NETDEV_LAG_TX_TYPE_ACTIVEBACKUP,
        NETDEV_LAG_TX_TYPE_HASH,
};

/* Hash policy of a LAG upper device; stored in netdev_lag_upper_info.hash_type. */
enum netdev_lag_hash {
        NETDEV_LAG_HASH_NONE,
        NETDEV_LAG_HASH_L2,
        NETDEV_LAG_HASH_L34,
        NETDEV_LAG_HASH_L23,
        NETDEV_LAG_HASH_E23,
        NETDEV_LAG_HASH_E34,
        NETDEV_LAG_HASH_VLAN_SRCMAC,
        NETDEV_LAG_HASH_UNKNOWN,
};

/* Pairs the transmit type and the hash type of a LAG upper device. */
struct netdev_lag_upper_info {
        enum netdev_lag_tx_type tx_type;
        enum netdev_lag_hash hash_type;
};

/* Two one-bit flags packed into a u8: link state and whether tx is enabled. */
struct netdev_lag_lower_state_info {
        u8 link_up : 1,
           tx_enabled : 1;
};

#include <linux/notifier.h>

/* netdevice notifier chain. Please remember to update netdev_cmd_to_name()
 * and the rtnetlink notification exclusion list in rtnetlink_event() when
 * adding new types.
 *
 * Values are consecutive, starting at NETDEV_UP = 1.
 */
enum netdev_cmd {
        NETDEV_UP        = 1,        /* For now you can't veto a device up/down */
        NETDEV_DOWN,
        NETDEV_REBOOT,                /* Tell a protocol stack a network interface
                                   detected a hardware crash and restarted
                                   - we can use this eg to kick tcp sessions
                                   once done */
        NETDEV_CHANGE,                /* Notify device state change */
        NETDEV_REGISTER,
        NETDEV_UNREGISTER,
        NETDEV_CHANGEMTU,        /* notify after mtu change happened */
        NETDEV_CHANGEADDR,        /* notify after the address change */
        NETDEV_PRE_CHANGEADDR,        /* notify before the address change */
        NETDEV_GOING_DOWN,
        NETDEV_CHANGENAME,
        NETDEV_FEAT_CHANGE,
        NETDEV_BONDING_FAILOVER,
        NETDEV_PRE_UP,
        NETDEV_PRE_TYPE_CHANGE,
        NETDEV_POST_TYPE_CHANGE,
        NETDEV_POST_INIT,
        NETDEV_PRE_UNINIT,
        NETDEV_RELEASE,
        NETDEV_NOTIFY_PEERS,
        NETDEV_JOIN,
        NETDEV_CHANGEUPPER,
        NETDEV_RESEND_IGMP,
        NETDEV_PRECHANGEMTU,        /* notify before mtu change happened */
        NETDEV_CHANGEINFODATA,
        NETDEV_BONDING_INFO,
        NETDEV_PRECHANGEUPPER,
        NETDEV_CHANGELOWERSTATE,
        NETDEV_UDP_TUNNEL_PUSH_INFO,
        NETDEV_UDP_TUNNEL_DROP_INFO,
        NETDEV_CHANGE_TX_QUEUE_LEN,
        NETDEV_CVLAN_FILTER_PUSH_INFO,
        NETDEV_CVLAN_FILTER_DROP_INFO,
        NETDEV_SVLAN_FILTER_PUSH_INFO,
        NETDEV_SVLAN_FILTER_DROP_INFO,
        NETDEV_OFFLOAD_XSTATS_ENABLE,
        NETDEV_OFFLOAD_XSTATS_DISABLE,
        NETDEV_OFFLOAD_XSTATS_REPORT_USED,
        NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
        NETDEV_XDP_FEAT_CHANGE,
};
/* Maps an enum netdev_cmd value to a string; must be kept in sync with the enum. */
const char *netdev_cmd_to_name(enum netdev_cmd cmd);

/*
 * Notifier (un)registration prototypes. The _net variants take an explicit
 * struct net; the _dev_net variants take a device plus a
 * struct netdev_net_notifier.
 */
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb);
int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb);
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn);
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn);

/*
 * Base notifier payload: the device concerned and an optional extack.
 * The specialised netdev_notifier_*_info structs embed this as their
 * first member.
 */
struct netdev_notifier_info {
        struct net_device        *dev;
        struct netlink_ext_ack        *extack;
};

/* Base info plus a union of extra data; the only member so far is an MTU value. */
struct netdev_notifier_info_ext {
        struct netdev_notifier_info info; /* must be first */
        union {
                u32 mtu;
        } ext;
};

/* Base info plus a flags_changed word. */
struct netdev_notifier_change_info {
        struct netdev_notifier_info info; /* must be first */
        unsigned int flags_changed;
};

/* Base info plus the description of an upper-device link or unlink. */
struct netdev_notifier_changeupper_info {
        struct netdev_notifier_info info; /* must be first */
        struct net_device *upper_dev; /* new upper dev */
        bool master; /* is upper dev master */
        bool linking; /* is the notification for link or unlink */
        void *upper_info; /* upper dev info */
};

/* Base info plus an untyped pointer to the lower device's state. */
struct netdev_notifier_changelowerstate_info {
        struct netdev_notifier_info info; /* must be first */
        void *lower_state_info; /* is lower dev state */
};

/* Base info plus a read-only pointer to a device address. */
struct netdev_notifier_pre_changeaddr_info {
        struct netdev_notifier_info info; /* must be first */
        const unsigned char *dev_addr;
};

/* Kinds of offload xstats; only L3 is defined, and numbering starts at 1. */
enum netdev_offload_xstats_type {
        NETDEV_OFFLOAD_XSTATS_TYPE_L3 = 1,
};

/*
 * Base info plus the xstats type. Which member of the anonymous union is
 * meaningful depends on the notifier command, as annotated on each member.
 */
struct netdev_notifier_offload_xstats_info {
        struct netdev_notifier_info info; /* must be first */
        enum netdev_offload_xstats_type type;

        union {
                /* NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */
                struct netdev_notifier_offload_xstats_rd *report_delta;
                /* NETDEV_OFFLOAD_XSTATS_REPORT_USED */
                struct netdev_notifier_offload_xstats_ru *report_used;
        };
};

/*
 * Offload xstats API prototypes; the definitions are not part of this
 * header section. The report_delta/report_used functions take the
 * rd/ru objects that struct netdev_notifier_offload_xstats_info carries.
 */
int netdev_offload_xstats_enable(struct net_device *dev,
                                 enum netdev_offload_xstats_type type,
                                 struct netlink_ext_ack *extack);
int netdev_offload_xstats_disable(struct net_device *dev,
                                  enum netdev_offload_xstats_type type);
bool netdev_offload_xstats_enabled(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type);
int netdev_offload_xstats_get(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_hw_stats64 *stats, bool *used,
                              struct netlink_ext_ack *extack);
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *rd,
                                   const struct rtnl_hw_stats64 *stats);
void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *ru);
void netdev_offload_xstats_push_delta(struct net_device *dev,
                                      enum netdev_offload_xstats_type type,
                                      const struct rtnl_hw_stats64 *stats);

/* Initialise @info for @dev: the device pointer is stored and extack is cleared. */
static inline void
netdev_notifier_info_init(struct netdev_notifier_info *info,
                          struct net_device *dev)
{
        info->extack = NULL;
        info->dev = dev;
}

/* Accessor: return the device stored in @info. */
static inline struct net_device *
netdev_notifier_info_to_dev(const struct netdev_notifier_info *info)
{
        struct net_device *ndev = info->dev;

        return ndev;
}

/* Accessor: return the extack stored in @info (NULL after netdev_notifier_info_init()). */
static inline struct netlink_ext_ack *
netdev_notifier_info_to_extack(const struct netdev_notifier_info *info)
{
        struct netlink_ext_ack *ack = info->extack;

        return ack;
}

/*
 * Prototypes for invoking the netdevice notifier chain; the _info variant
 * takes a caller-prepared struct netdev_notifier_info instead of a bare
 * device pointer.
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev);
int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info);

/*
 * Iterators over the devices of a namespace. All of them walk the list
 * headed at (net)->dev_base_head, linked through the dev_list member, and
 * are thin wrappers around the matching list_for_each_entry*() macro.
 */
#define for_each_netdev(net, d)                \
                list_for_each_entry(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_reverse(net, d)        \
                list_for_each_entry_reverse(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_rcu(net, d)                \
                list_for_each_entry_rcu(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_safe(net, d, n)        \
                list_for_each_entry_safe(d, n, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue(net, d)                \
                list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list)
#define for_each_netdev_continue_reverse(net, d)                \
                list_for_each_entry_continue_reverse(d, &(net)->dev_base_head, \
                                                     dev_list)
#define for_each_netdev_continue_rcu(net, d)                \
        list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list)
/*
 * Walks init_net only and filters on the master upper device being @bond.
 * The expansion ends in a bare "if", so an "else" directly after the loop
 * body would bind to that if.
 */
#define for_each_netdev_in_bond_rcu(bond, slave)        \
                for_each_netdev_rcu(&init_net, slave)        \
                        if (netdev_master_upper_dev_get_rcu(slave) == (bond))
/* Convert a dev_list list_head pointer back to its struct net_device. */
#define net_device_entry(lh)        list_entry(lh, struct net_device, dev_list)

/*
 * Iterate over the entries of (net)->dev_by_index, starting the search at
 * the current value of @ifindex. On every step @d receives the result of
 * xa_find() and the loop stops when that result is NULL; @ifindex is
 * incremented after each iteration, so it must be a modifiable lvalue
 * whose address can be passed to xa_find().
 *
 * @d and @ifindex are parenthesized so that lvalue expressions such as
 * *ptr expand correctly ("(*ptr)++" rather than "*ptr++").
 */
#define for_each_netdev_dump(net, d, ifindex)                                \
        for (; ((d) = xa_find(&(net)->dev_by_index, &(ifindex),                \
                              ULONG_MAX, XA_PRESENT)); (ifindex)++)

/*
 * Return the device following @dev on its namespace's device list, or
 * NULL when @dev is the last entry (its successor is the list head).
 */
static inline struct net_device *next_net_device(struct net_device *dev)
{
        struct net *ns = dev_net(dev);
        struct list_head *next = dev->dev_list.next;

        if (next == &ns->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/*
 * Variant of next_net_device() that fetches the successor pointer with
 * rcu_dereference(list_next_rcu()). Returns NULL at the end of the list.
 */
static inline struct net_device *next_net_device_rcu(struct net_device *dev)
{
        struct net *ns = dev_net(dev);
        struct list_head *next = rcu_dereference(list_next_rcu(&dev->dev_list));

        if (next == &ns->dev_base_head)
                return NULL;

        return net_device_entry(next);
}

/* Return the first device on @net's device list, or NULL if the list is empty. */
static inline struct net_device *first_net_device(struct net *net)
{
        if (list_empty(&net->dev_base_head))
                return NULL;

        return net_device_entry(net->dev_base_head.next);
}

/*
 * Variant of first_net_device() that reads the head's successor with
 * rcu_dereference(list_next_rcu()). Returns NULL if the successor is the
 * list head itself, i.e. the list is empty.
 */
static inline struct net_device *first_net_device_rcu(struct net *net)
{
        struct list_head *first;

        first = rcu_dereference(list_next_rcu(&net->dev_base_head));
        if (first == &net->dev_base_head)
                return NULL;

        return net_device_entry(first);
}

/*
 * Prototypes only: boot-setup check, lookups by hardware address/type, and
 * add/remove of struct packet_type and struct packet_offload entries.
 */
int netdev_boot_setup_check(struct net_device *dev);
struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
                                   const char *hwaddr);
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *hwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
void dev_add_pack(struct packet_type *pt);
void dev_remove_pack(struct packet_type *pt);
void __dev_remove_pack(struct packet_type *pt);
void dev_add_offload(struct packet_offload *po);
void dev_remove_offload(struct packet_offload *po);

/*
 * Prototypes only: device lookup by flags or name, name allocation,
 * open/close, LRO disabling and transmit helpers. Several come as
 * netif_*() / dev_*() pairs with identical signatures.
 * NOTE(review): the pairs presumably differ in locking - confirm at the
 * definitions.
 */
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
                          struct net_device_path_stack *stack);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
                                      unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name);
struct net_device *__dev_get_by_name(struct net *net, const char *name);
bool netdev_name_in_use(struct net *net, const char *name);
int dev_alloc_name(struct net_device *dev, const char *name);
int netif_open(struct net_device *dev, struct netlink_ext_ack *extack);
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack);
void netif_close(struct net_device *dev);
void dev_close(struct net_device *dev);
void dev_close_many(struct list_head *head, bool unlink);
void netif_disable_lro(struct net_device *dev);
void dev_disable_lro(struct net_device *dev);
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *newskb);
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev);

/* Out-of-line transmit entry points used by the inline wrappers that follow. */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev);
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id);

/* Transmit @skb via __dev_queue_xmit() with a NULL sb_dev; returns its result. */
static inline int dev_queue_xmit(struct sk_buff *skb)
{
        int rc = __dev_queue_xmit(skb, NULL);

        return rc;
}

/* Transmit @skb via __dev_queue_xmit(), passing @sb_dev through unchanged. */
static inline int
dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
{
        int rc = __dev_queue_xmit(skb, sb_dev);

        return rc;
}

/*
 * Transmit @skb on queue @queue_id via __dev_direct_xmit(). When
 * dev_xmit_complete() reports that the transmit did not complete, the skb
 * is freed here with kfree_skb(). The return code of __dev_direct_xmit()
 * is returned in both cases.
 */
static inline int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        int rc = __dev_direct_xmit(skb, queue_id);

        if (dev_xmit_complete(rc))
                return rc;

        kfree_skb(skb);
        return rc;
}

/*
 * Prototypes only. unregister_netdevice_queue() takes an optional list
 * head; the inline unregister_netdevice() passes NULL for it.
 */
int register_netdevice(struct net_device *dev);
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head);
void unregister_netdevice_many(struct list_head *head);
/* Shorthand for unregister_netdevice_queue(@dev, NULL), i.e. with no list head. */
static inline void unregister_netdevice(struct net_device *dev)
{
        unregister_netdevice_queue(dev, NULL);
}

/* Prototypes only; definitions are not part of this header section. */
int netdev_refcnt_read(const struct net_device *dev);
void free_netdev(struct net_device *dev);

/*
 * Prototypes only: xmit-slave / lowest-device lookup and lookups by
 * ifindex or name. The netdev_get_by_*() variants additionally take a
 * netdevice_tracker and gfp flags.
 */
struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves);
struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
                                            struct sock *sk);
struct net_device *dev_get_by_index(struct net *net, int ifindex);
struct net_device *__dev_get_by_index(struct net *net, int ifindex);
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
                                       netdevice_tracker *tracker, gfp_t gfp);
struct net_device *netdev_get_by_name(struct net *net, const char *name,
                                      netdevice_tracker *tracker, gfp_t gfp);
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex);
void netdev_copy_name(struct net_device *dev, char *name);

/*
 * Build the link-layer header for @skb through dev->header_ops->create().
 * Returns 0 without touching @skb when the device has no header_ops or no
 * create callback; otherwise returns whatever the callback returns.
 */
static inline int
dev_hard_header(struct sk_buff *skb, struct net_device *dev,
                unsigned short type, const void *daddr, const void *saddr,
                unsigned int len)
{
        if (dev->header_ops && dev->header_ops->create)
                return dev->header_ops->create(skb, dev, type, daddr,
                                               saddr, len);

        return 0;
}

/*
 * Run the parse callback of skb->dev's header_ops on @skb, passing @haddr
 * through. Returns 0 when the device has no header_ops or no parse
 * callback; otherwise returns the callback's result.
 */
static inline int
dev_parse_header(const struct sk_buff *skb, unsigned char *haddr)
{
        const struct net_device *ndev = skb->dev;

        if (ndev->header_ops && ndev->header_ops->parse)
                return ndev->header_ops->parse(skb, haddr);

        return 0;
}

/*
 * Run the parse_protocol callback of skb->dev's header_ops on @skb.
 * Returns 0 when the device has no header_ops or no parse_protocol
 * callback; otherwise returns the callback's result.
 */
static inline __be16 dev_parse_header_protocol(const struct sk_buff *skb)
{
        const struct net_device *ndev = skb->dev;

        if (ndev->header_ops && ndev->header_ops->parse_protocol)
                return ndev->header_ops->parse_protocol(skb);

        return 0;
}

/*
 * Check whether a link-layer header of @len bytes is acceptable for @dev.
 * ll_header must have at least hard_header_len allocated.
 *
 *  - @len >= dev->hard_header_len: accepted.
 *  - @len <  dev->min_header_len:  rejected.
 *  - in between: accepted for CAP_SYS_RAWIO callers, after zero-filling the
 *    missing tail up to hard_header_len; otherwise the decision is left to
 *    header_ops->validate() if the device provides one, else rejected.
 */
static inline bool dev_validate_header(const struct net_device *dev,
                                       char *ll_header, int len)
{
        if (likely(len >= dev->hard_header_len))
                return true;

        if (len >= dev->min_header_len) {
                if (capable(CAP_SYS_RAWIO)) {
                        /* pad the short header with zeroes */
                        memset(ll_header + len, 0,
                               dev->hard_header_len - len);
                        return true;
                }

                if (!dev->header_ops || !dev->header_ops->validate)
                        return false;

                return dev->header_ops->validate(ll_header, len);
        }

        return false;
}

/* True when @dev has header_ops with a create callback. */
static inline bool dev_has_header(const struct net_device *dev)
{
        if (!dev->header_ops)
                return false;

        return dev->header_ops->create != NULL;
}

/*
 * Incoming packets are placed on per-CPU queues
 *
 * One instance per CPU is declared right after this definition with
 * DECLARE_PER_CPU_ALIGNED(). Several members are explicitly placed on their
 * own cache line with ____cacheline_aligned_in_smp; some members only exist
 * under CONFIG_RPS, CONFIG_NET_FLOW_LIMIT or CONFIG_XFRM_OFFLOAD.
 */
struct softnet_data {
        struct list_head        poll_list;
        struct sk_buff_head        process_queue;
        local_lock_t                process_queue_bh_lock;

        /* stats */
        unsigned int                processed;
        unsigned int                time_squeeze;
#ifdef CONFIG_RPS
        struct softnet_data        *rps_ipi_list;
#endif

        unsigned int                received_rps;
        bool                        in_net_rx_action;
        bool                        in_napi_threaded_poll;

#ifdef CONFIG_NET_FLOW_LIMIT
        struct sd_flow_limit __rcu *flow_limit;
#endif
        struct Qdisc                *output_queue;
        struct Qdisc                **output_queue_tailp;
        struct sk_buff                *completion_queue;
#ifdef CONFIG_XFRM_OFFLOAD
        struct sk_buff_head        xfrm_backlog;
#endif
        /* written and read only by owning cpu: */
        struct netdev_xmit xmit;
#ifdef CONFIG_RPS
        /* input_queue_head should be written by cpu owning this struct,
         * and only read by other cpus. Worth using a cache line.
         */
        unsigned int                input_queue_head ____cacheline_aligned_in_smp;

        /* Elements below can be accessed between CPUs for RPS/RFS */
        call_single_data_t        csd ____cacheline_aligned_in_smp;
        struct softnet_data        *rps_ipi_next;
        unsigned int                cpu;
        unsigned int                input_queue_tail;
#endif
        struct sk_buff_head        input_pkt_queue;
        struct napi_struct        backlog;

        /* kept on its own cache line */
        atomic_t                dropped ____cacheline_aligned_in_smp;

        /* Another possibly contended cache line */
        spinlock_t                defer_lock ____cacheline_aligned_in_smp;
        int                        defer_count;
        int                        defer_ipi_scheduled;
        struct sk_buff                *defer_list;
        call_single_data_t        defer_csd;
};

/* Per-CPU variable declarations: one softnet_data and one page_pool pointer per CPU. */
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
DECLARE_PER_CPU(struct page_pool *, system_page_pool);

/*
 * dev_recursion_level() returns the current xmit recursion counter.
 * Without CONFIG_PREEMPT_RT it is read from this CPU's
 * softnet_data.xmit.recursion; with CONFIG_PREEMPT_RT it is read from the
 * current task's net_xmit.recursion instead.
 */
#ifndef CONFIG_PREEMPT_RT
static inline int dev_recursion_level(void)
{
        return this_cpu_read(softnet_data.xmit.recursion);
}
#else
static inline int dev_recursion_level(void)
{
        return current->net_xmit.recursion;
}

#endif

/* Prototypes only: schedule a qdisc, or schedule by tx queue. */
void __netif_schedule(struct Qdisc *q);
void netif_schedule_queue(struct netdev_queue *txq);

/* Call netif_schedule_queue() on each of @dev's num_tx_queues tx queues. */
static inline void netif_tx_schedule_all(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++) {
                struct netdev_queue *txq = netdev_get_tx_queue(dev, qidx);

                netif_schedule_queue(txq);
        }
}

/* Allow transmit on @dev_queue by clearing __QUEUE_STATE_DRV_XOFF in its state word. */
static __always_inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
        clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_start_queue - allow transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Acts on tx queue 0 of @dev only.
 */
static inline void netif_start_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_start_queue(txq);
}

/* Call netif_tx_start_queue() on each of @dev's num_tx_queues tx queues. */
static inline void netif_tx_start_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_start_queue(netdev_get_tx_queue(dev, qidx));
}

/* Prototype only; netif_wake_queue() and netif_tx_wake_all_queues() call it per queue. */
void netif_tx_wake_queue(struct netdev_queue *dev_queue);

/**
 *        netif_wake_queue - restart transmit
 *        @dev: network device
 *
 *        Allow upper layers to call the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are available.
 *        Acts on tx queue 0 of @dev only.
 */
static inline void netif_wake_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_wake_queue(txq);
}

/* Call netif_tx_wake_queue() on each of @dev's num_tx_queues tx queues. */
static inline void netif_tx_wake_all_queues(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++)
                netif_tx_wake_queue(netdev_get_tx_queue(dev, qidx));
}

/*
 * Stop transmit on @dev_queue. The order of the three steps matters:
 * trans_start is refreshed first, then a full barrier is issued, and only
 * then is __QUEUE_STATE_DRV_XOFF set, so that dev_watchdog() (see the
 * pairing notes below) does not see the XOFF bit with a stale trans_start.
 */
static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
        /* Paired with READ_ONCE() from dev_watchdog() */
        WRITE_ONCE(dev_queue->trans_start, jiffies);

        /* This barrier is paired with smp_mb() from dev_watchdog() */
        smp_mb__before_atomic();

        /* Must be an atomic op see netif_txq_try_stop() */
        set_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);
}

/**
 *        netif_stop_queue - stop transmitted packets
 *        @dev: network device
 *
 *        Stop upper layers calling the device hard_start_xmit routine.
 *        Used for flow control when transmit resources are unavailable.
 *        Acts on tx queue 0 of @dev only.
 */
static inline void netif_stop_queue(struct net_device *dev)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netif_tx_stop_queue(txq);
}

void netif_tx_stop_all_queues(struct net_device *dev);

/* True when __QUEUE_STATE_DRV_XOFF is set in @dev_queue's state word. */
static inline bool netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
        bool stopped = test_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state);

        return stopped;
}

/**
 *        netif_queue_stopped - test if transmit queue is flowblocked
 *        @dev: network device
 *
 *        Test if transmit queue on device is currently unable to send.
 *        Only tx queue 0 of @dev is examined.
 */
static inline bool netif_queue_stopped(const struct net_device *dev)
{
        const struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        return netif_tx_queue_stopped(txq);
}

/* True when any bit of QUEUE_STATE_ANY_XOFF is set in @dev_queue->state. */
static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_ANY_XOFF) != 0;
}

/* True when any bit of QUEUE_STATE_ANY_XOFF_OR_FROZEN is set in @dev_queue->state. */
static inline bool
netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN) != 0;
}

/* True when any QUEUE_STATE_DRV_XOFF_OR_FROZEN bit is set in the state. */
static inline bool
netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue)
{
        return (dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN) != 0;
}

/**
 *        netdev_queue_set_dql_min_limit - set dql minimum limit
 *        @dev_queue: pointer to transmit queue
 *        @min_limit: dql minimum limit
 *
 * Forces xmit_more() to return true until the minimum threshold
 * defined by @min_limit is reached (or until the tx queue is
 * empty). Warning: to be used with care, misuse will impact the
 * latency.
 *
 * Stores @min_limit into the queue's dql.min_limit; compiles to a
 * no-op when CONFIG_BQL is not set.
 */
static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
                                                  unsigned int min_limit)
{
#ifdef CONFIG_BQL
        dev_queue->dql.min_limit = min_limit;
#endif
}

/*
 * Return dql_avail() for the queue's dql when CONFIG_BQL is set,
 * and 0 when it is not.
 */
static inline int netdev_queue_dql_avail(const struct netdev_queue *txq)
{
        int avail = 0;

#ifdef CONFIG_BQL
        /* Non-BQL migrated drivers will return 0, too. */
        avail = dql_avail(&txq->dql);
#endif
        return avail;
}

/**
 *        netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their ndo_start_xmit(),
 * to give appropriate hint to the CPU.
 *
 * Issues prefetchw() on the queue's dql.num_queued field; compiles to a
 * no-op when CONFIG_BQL is not set.
 */
static inline void netdev_txq_bql_enqueue_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.num_queued);
#endif
}

/**
 *        netdev_txq_bql_complete_prefetchw - prefetch bql data for write
 *        @dev_queue: pointer to transmit queue
 *
 * BQL enabled drivers might use this helper in their TX completion path,
 * to give appropriate hint to the CPU.
 *
 * Issues prefetchw() on the queue's dql.limit field; compiles to a
 * no-op when CONFIG_BQL is not set.
 */
static inline void netdev_txq_bql_complete_prefetchw(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_BQL
        prefetchw(&dev_queue->dql.limit);
#endif
}

/**
 *        netdev_tx_sent_queue - report the number of bytes queued to a given tx queue
 *        @dev_queue: network device queue
 *        @bytes: number of bytes queued to the device queue
 *
 *        Report the number of bytes queued for sending/completion to the network
 *        device hardware queue. @bytes should be a good approximation and should
 *        exactly match netdev_completed_queue() @bytes.
 *        This is typically called once per packet, from ndo_start_xmit().
 *
 *        When the dql budget goes negative after accounting @bytes, the queue
 *        gets __QUEUE_STATE_STACK_XOFF set; the budget is then re-checked and
 *        the bit cleared again if room has appeared in the meantime.
 *        Compiles to a no-op when CONFIG_BQL is not set.
 */
static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                        unsigned int bytes)
{
#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);

        /* Fast path: still within the limit, nothing else to do. */
        if (likely(dql_avail(&dev_queue->dql) >= 0))
                return;

        /* Paired with READ_ONCE() from dev_watchdog() */
        WRITE_ONCE(dev_queue->trans_start, jiffies);

        /* This barrier is paired with smp_mb() from dev_watchdog() */
        smp_mb__before_atomic();

        set_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);

        /*
         * The XOFF flag must be set before checking the dql_avail below,
         * because in netdev_tx_completed_queue we update the dql_completed
         * before checking the XOFF flag.
         */
        smp_mb__after_atomic();

        /* check again in case another CPU has just made room avail */
        if (unlikely(dql_avail(&dev_queue->dql) >= 0))
                clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state);
#endif
}

/* Batch-aware flavour of netdev_tx_sent_queue(), for drivers that know
 * they should not test BQL status themselves.
 * __QUEUE_STATE_STACK_XOFF is only touched for the last skb of a batch
 * (@xmit_more false); earlier skbs are merely accounted in the dql.
 * Returns true if the doorbell must be used to kick the NIC.
 */
static inline bool __netdev_tx_sent_queue(struct netdev_queue *dev_queue,
                                          unsigned int bytes,
                                          bool xmit_more)
{
        if (!xmit_more) {
                /* Last skb of the batch: full accounting, always kick. */
                netdev_tx_sent_queue(dev_queue, bytes);
                return true;
        }

#ifdef CONFIG_BQL
        dql_queued(&dev_queue->dql, bytes);
#endif
        /* More skbs follow: kick only if the driver stopped the queue. */
        return netif_tx_queue_stopped(dev_queue);
}

/**
 *        netdev_sent_queue - report the number of bytes queued to hardware
 *        @dev: network device
 *        @bytes: number of bytes queued to the hardware device queue
 *
 *        Single-queue wrapper: forwards @bytes to netdev_tx_sent_queue() for
 *        hardware queue#0 of @dev. @bytes should be a good approximation and
 *        should exactly match netdev_completed_queue() @bytes.
 *        This is typically called once per packet, from ndo_start_xmit().
 */
static inline void netdev_sent_queue(struct net_device *dev, unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netdev_tx_sent_queue(txq, bytes);
}

/*
 * Single-queue wrapper around __netdev_tx_sent_queue() for TX queue 0 of
 * @dev; returns that helper's result unchanged.
 */
static inline bool __netdev_sent_queue(struct net_device *dev,
                                       unsigned int bytes,
                                       bool xmit_more)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        return __netdev_tx_sent_queue(txq, bytes, xmit_more);
}

/**
 *        netdev_tx_completed_queue - report number of packets/bytes at TX completion.
 *        @dev_queue: network device queue
 *        @pkts: number of packets (currently ignored)
 *        @bytes: number of bytes dequeued from the device queue
 *
 *        Must be called at most once per TX completion round (and not per
 *        individual packet), so that BQL can adjust its limits appropriately.
 *
 *        Does nothing when @bytes is 0 or when CONFIG_BQL is not set.
 *        Otherwise the completion is accounted in the dql and, if budget is
 *        available again and __QUEUE_STATE_STACK_XOFF was set, the bit is
 *        cleared and the queue is rescheduled.
 */
static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue,
                                             unsigned int pkts, unsigned int bytes)
{
#ifdef CONFIG_BQL
        if (unlikely(!bytes))
                return;

        dql_completed(&dev_queue->dql, bytes);

        /*
         * Without the memory barrier there is a small possibility that
         * netdev_tx_sent_queue will miss the update and cause the queue to
         * be stopped forever
         */
        smp_mb(); /* NOTE: netdev_txq_completed_mb() assumes this exists */

        /* Still over the limit: leave the XOFF bit as it is. */
        if (unlikely(dql_avail(&dev_queue->dql) < 0))
                return;

        /* Only reschedule if this call is the one that cleared the bit. */
        if (test_and_clear_bit(__QUEUE_STATE_STACK_XOFF, &dev_queue->state))
                netif_schedule_queue(dev_queue);
#endif
}

/**
 *         netdev_completed_queue - report bytes and packets completed by device
 *         @dev: network device
 *         @pkts: actual number of packets sent over the medium
 *         @bytes: actual number of bytes sent over the medium
 *
 *         Single-queue wrapper: forwards @pkts and @bytes to
 *         netdev_tx_completed_queue() for hardware queue 0 of @dev.
 *         @bytes must exactly match the @bytes amount passed to
 *         netdev_sent_queue()
 */
static inline void netdev_completed_queue(struct net_device *dev,
                                          unsigned int pkts, unsigned int bytes)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);

        netdev_tx_completed_queue(txq, pkts, bytes);
}

/*
 * Reset the BQL state of queue @q: clear __QUEUE_STATE_STACK_XOFF and then
 * call dql_reset() on its dql. Compiles to a no-op when CONFIG_BQL is not
 * set.
 */
static inline void netdev_tx_reset_queue(struct netdev_queue *q)
{
#ifdef CONFIG_BQL
        clear_bit(__QUEUE_STATE_STACK_XOFF, &q->state);
        dql_reset(&q->dql);
#endif
}

/**
 * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue
 * @dev: network device
 * @qid: stack index of the queue to reset
 *
 * Looks up TX queue @qid of @dev and hands it to netdev_tx_reset_queue().
 */
static inline void netdev_tx_reset_subqueue(const struct net_device *dev,
                                            u32 qid)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev, qid);

        netdev_tx_reset_queue(txq);
}

/**
 *         netdev_reset_queue - reset the packets and bytes count of a network device
 *         @dev_queue: network device (despite the name, not a queue)
 *
 *         Resets the BQL accounting of TX queue 0 of the device and clears
 *         the software flow control OFF bit for that queue.
 */
static inline void netdev_reset_queue(struct net_device *dev_queue)
{
        struct netdev_queue *txq = netdev_get_tx_queue(dev_queue, 0);

        netdev_tx_reset_queue(txq);
}

/**
 *         netdev_cap_txqueue - check if selected tx queue exceeds device queues
 *         @dev: network device
 *         @queue_index: given tx queue index
 *
 *         Returns @queue_index unchanged when it is below
 *         @dev->real_num_tx_queues. Otherwise emits a rate-limited warning
 *         and returns 0.
 */
static inline u16 netdev_cap_txqueue(struct net_device *dev, u16 queue_index)
{
        /* Expected case: the index is in range. */
        if (likely(queue_index < dev->real_num_tx_queues))
                return queue_index;

        net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
                             dev->name, queue_index,
                             dev->real_num_tx_queues);
        return 0;
}

/**
 *        netif_running - test if up
 *        @dev: network device
 *
 *        Test if the device has been brought up.
 */
static inline bool netif_running(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_START, &dev->state);
}

/*
 * Routines to manage the subqueues on a device.  We only need start,
 * stop, and a check if it's stopped.  All other device management is
 * done at the overall netdevice level.
 * Also test the device if we're multiqueue.
 */

/**
 *        netif_start_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Calls netif_tx_start_queue() on TX queue @queue_index of a device with
 * multiple transmit queues.
 */
static inline void netif_start_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_start_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_stop_subqueue - stop sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Calls netif_tx_stop_queue() on TX queue @queue_index of a device with
 * multiple transmit queues.
 */
static inline void netif_stop_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_stop_queue(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        __netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Returns the netif_tx_queue_stopped() status of TX queue @queue_index of
 * a device with multiple transmit queues.
 */
static inline bool __netif_subqueue_stopped(const struct net_device *dev,
                                            u16 queue_index)
{
        return netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue_index));
}

/**
 *        netif_subqueue_stopped - test status of subqueue
 *        @dev: network device
 *        @skb: sub queue buffer pointer
 *
 * Check individual transmit queue of a device with multiple transmit queues.
 */
static inline bool netif_subqueue_stopped(const struct net_device *dev,
                                          struct sk_buff *skb)
{
        return __netif_subqueue_stopped(dev, skb_get_queue_mapping(skb));
}

/**
 *        netif_wake_subqueue - allow sending packets on subqueue
 *        @dev: network device
 *        @queue_index: sub queue index
 *
 * Calls netif_tx_wake_queue() on TX queue @queue_index of a device with
 * multiple transmit queues.
 */
static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
        netif_tx_wake_queue(netdev_get_tx_queue(dev, queue_index));
}

#ifdef CONFIG_XPS
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, enum xps_map_type type);

/**
 *        netif_attr_test_mask - Test a CPU or Rx queue set in a mask
 *        @j: CPU/Rx queue index
 *        @mask: bitmask of all cpus/rx queues
 *        @nr_bits: number of bits in the bitmask
 *
 * Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
 */
static inline bool netif_attr_test_mask(unsigned long j,
                                        const unsigned long *mask,
                                        unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);
        return test_bit(j, mask);
}

/**
 *        netif_attr_test_online - Test for online CPU/Rx queue
 *        @j: CPU/Rx queue index
 *        @online_mask: bitmask for CPUs/Rx queues that are online
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns: true if a CPU/Rx queue is online. Without an @online_mask,
 * every index below @nr_bits counts as online.
 */
static inline bool netif_attr_test_online(unsigned long j,
                                          const unsigned long *online_mask,
                                          unsigned int nr_bits)
{
        cpu_max_bits_warn(j, nr_bits);

        /* No mask supplied: fall back to a plain range check. */
        if (!online_mask)
                return j < nr_bits;

        return test_bit(j, online_mask);
}

/**
 *        netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
 *        @n: CPU/Rx queue index
 *        @srcp: the cpumask/Rx queue mask pointer
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns: next (after n) CPU/Rx queue index in the mask;
 * >= nr_bits if no further CPUs/Rx queues set. With a NULL @srcp the
 * result is simply @n + 1.
 */
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
                                               unsigned int nr_bits)
{
        /* -1 is a legal starting value and is not range-checked. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (!srcp)
                return n + 1;

        return find_next_bit(srcp, nr_bits, n + 1);
}

/**
 *        netif_attrmask_next_and - get the next CPU/Rx queue in \*src1p & \*src2p
 *        @n: CPU/Rx queue index
 *        @src1p: the first CPUs/Rx queues mask pointer
 *        @src2p: the second CPUs/Rx queues mask pointer
 *        @nr_bits: number of bits in the bitmask
 *
 * Returns: next (after n) CPU/Rx queue index set in both masks;
 * >= nr_bits if no further CPUs/Rx queues set in both. If only one mask
 * is non-NULL, that mask alone is searched; if both are NULL the result
 * is @n + 1.
 */
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
                                          const unsigned long *src2p,
                                          unsigned int nr_bits)
{
        const unsigned long *single;

        /* -1 is a legal starting value and is not range-checked. */
        if (n != -1)
                cpu_max_bits_warn(n, nr_bits);

        if (src1p && src2p)
                return find_next_and_bit(src1p, src2p, nr_bits, n + 1);

        /* At most one mask is present from here on; prefer the first. */
        single = src1p ? src1p : src2p;
        if (single)
                return find_next_bit(single, nr_bits, n + 1);

        return n + 1;
}
#else
/* Stub used when CONFIG_XPS is not set: ignores its arguments, returns 0. */
static inline int netif_set_xps_queue(struct net_device *dev,
                                      const struct cpumask *mask,
                                      u16 index)
{
        return 0;
}

/* Stub used when CONFIG_XPS is not set: ignores its arguments, returns 0. */
static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
                                        u16 index, enum xps_map_type type)
{
        return 0;
}
#endif

/**
 *        netif_is_multiqueue - test if device has multiple transmit queues
 *        @dev: network device
 *
 * Check if device has multiple transmit queues
 */
static inline bool netif_is_multiqueue(const struct net_device *dev)
{
        return dev->num_tx_queues > 1;
}

int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq);
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq);
int netif_set_real_num_queues(struct net_device *dev,
                              unsigned int txq, unsigned int rxq);

int netif_get_num_default_rss_queues(void);

void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason);
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason);

/*
 * It is not allowed to call kfree_skb() or consume_skb() from hardware
 * interrupt context or with hardware interrupts being disabled.
 * (in_hardirq() || irqs_disabled())
 *
 * We provide four helpers that can be used in the following contexts:
 *
 * dev_kfree_skb_irq(skb) when caller drops a packet from irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_irq(skb) when caller consumes a packet from irq context.
 *  Typically used in place of consume_skb(skb) in TX completion path
 *
 * dev_kfree_skb_any(skb) when caller doesn't know its current irq context,
 *  replacing kfree_skb(skb)
 *
 * dev_consume_skb_any(skb) when caller doesn't know its current irq context,
 *  and consumed a packet. Used in place of consume_skb(skb)
 */
static inline void dev_kfree_skb_irq(struct sk_buff *skb)
{
        dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

static inline void dev_consume_skb_irq(struct sk_buff *skb)
{
        dev_kfree_skb_irq_reason(skb, SKB_CONSUMED);
}

static inline void dev_kfree_skb_any(struct sk_buff *skb)
{
        dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
}

static inline void dev_consume_skb_any(struct sk_buff *skb)
{
        dev_kfree_skb_any_reason(skb, SKB_CONSUMED);
}

u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             const struct bpf_prog *xdp_prog);
void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog);
int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb);
int netif_rx(struct sk_buff *skb);
int __netif_rx(struct sk_buff *skb);

int netif_receive_skb(struct sk_buff *skb);
int netif_receive_skb_core(struct sk_buff *skb);
void netif_receive_skb_list_internal(struct list_head *head);
void netif_receive_skb_list(struct list_head *head);
gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb);

/* Feed @skb to gro_receive_skb() using the GRO node embedded in @napi. */
static inline gro_result_t napi_gro_receive(struct napi_struct *napi,
                                            struct sk_buff *skb)
{
        struct gro_node *gro = &napi->gro;

        return gro_receive_skb(gro, skb);
}

struct sk_buff *napi_get_frags(struct napi_struct *napi);
gro_result_t napi_gro_frags(struct napi_struct *napi);

/* Free the skb currently stored in @napi->skb and reset the slot to NULL. */
static inline void napi_free_frags(struct napi_struct *napi)
{
        struct sk_buff *pending = napi->skb;

        kfree_skb(pending);
        napi->skb = NULL;
}

bool netdev_is_rx_handler_busy(struct net_device *dev);
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data);
void netdev_rx_handler_unregister(struct net_device *dev);

bool dev_valid_name(const char *name);
static inline bool is_socket_ioctl_cmd(unsigned int cmd)
{
        return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
}
int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
int put_user_ifreq(struct ifreq *ifr, void __user *arg);
int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
                void __user *data, bool *need_copyout);
int dev_ifconf(struct net *net, struct ifconf __user *ifc);
int dev_eth_ioctl(struct net_device *dev,
                  struct ifreq *ifr, unsigned int cmd);
int generic_hwtstamp_get_lower(struct net_device *dev,
                               struct kernel_hwtstamp_config *kernel_cfg);
int generic_hwtstamp_set_lower(struct net_device *dev,
                               struct kernel_hwtstamp_config *kernel_cfg,
                               struct netlink_ext_ack *extack);
int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
unsigned int dev_get_flags(const struct net_device *);
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int netif_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack);
int netif_set_alias(struct net_device *dev, const char *alias, size_t len);
int dev_set_alias(struct net_device *, const char *, size_t);
int dev_get_alias(const struct net_device *, char *, size_t);
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
                               const char *pat, int new_ifindex,
                               struct netlink_ext_ack *extack);
int dev_change_net_namespace(struct net_device *dev, struct net *net,
                             const char *pat);
int __dev_set_mtu(struct net_device *, int);
int netif_set_mtu(struct net_device *dev, int new_mtu);
int dev_set_mtu(struct net_device *, int);
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                              struct netlink_ext_ack *extack);
int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                          struct netlink_ext_ack *extack);
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack);
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack);
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name);
int dev_get_port_parent_id(struct net_device *dev,
                           struct netdev_phys_item_id *ppid, bool recurse);
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b);

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again);
struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret);

int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
u8 dev_xdp_prog_count(struct net_device *dev);
int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf);
int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf);
u8 dev_xdp_sb_prog_count(struct net_device *dev);
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);

u32 dev_get_min_mp_channel_count(const struct net_device *dev);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb);
bool is_skb_forwardable(const struct net_device *dev,
                        const struct sk_buff *skb);

/*
 * Decide whether @skb may be forwarded to @dev.
 *
 * The device must have IFF_UP set. When @check_mtu is false that is the
 * only condition. Otherwise the skb must either fit within
 * mtu + hard_header_len + one VLAN header, or be a GSO skb.
 */
static __always_inline bool __is_skb_forwardable(const struct net_device *dev,
                                                 const struct sk_buff *skb,
                                                 const bool check_mtu)
{
        const u32 vlan_hdr_len = 4; /* VLAN_HLEN */
        unsigned int max_len;

        if (!(dev->flags & IFF_UP))
                return false;

        if (!check_mtu)
                return true;

        max_len = dev->mtu + dev->hard_header_len + vlan_hdr_len;

        /* if TSO is enabled, we don't care about the length as the packet
         * could be forwarded without being segmented before
         */
        return skb->len <= max_len || skb_is_gso(skb);
}

void netdev_core_stats_inc(struct net_device *dev, u32 offset);

/*
 * Generator for the dev_core_stats_<FIELD>_inc() helpers. Each generated
 * function calls netdev_core_stats_inc() with the offset of FIELD inside
 * struct net_device_core_stats. The macro is undefined again right after
 * the four instantiations below.
 */
#define DEV_CORE_STATS_INC(FIELD)                                                \
static inline void dev_core_stats_##FIELD##_inc(struct net_device *dev)                \
{                                                                                \
        netdev_core_stats_inc(dev,                                                \
                        offsetof(struct net_device_core_stats, FIELD));                \
}
DEV_CORE_STATS_INC(rx_dropped)
DEV_CORE_STATS_INC(tx_dropped)
DEV_CORE_STATS_INC(rx_nohandler)
DEV_CORE_STATS_INC(rx_otherhost_dropped)
#undef DEV_CORE_STATS_INC

/*
 * Prepare @skb for forwarding to @dev.
 *
 * On success the skb is scrubbed (a full scrub when it crosses network
 * namespaces), its priority is reset to 0 and 0 is returned. If orphaning
 * the frags fails or the skb is not forwardable, the rx_dropped core stat
 * is bumped, the skb is freed and NET_RX_DROP is returned.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
                                               struct sk_buff *skb,
                                               const bool check_mtu)
{
        if (!skb_orphan_frags(skb, GFP_ATOMIC) &&
            likely(__is_skb_forwardable(dev, skb, check_mtu))) {
                skb_scrub_packet(skb, !net_eq(dev_net(dev), dev_net(skb->dev)));
                skb->priority = 0;
                return 0;
        }

        dev_core_stats_rx_dropped_inc(dev);
        kfree_skb(skb);
        return NET_RX_DROP;
}

bool dev_nit_active_rcu(const struct net_device *dev);
/* Wrapper that calls dev_nit_active_rcu() inside an RCU read-side section. */
static inline bool dev_nit_active(const struct net_device *dev)
{
        bool active;

        rcu_read_lock();
        active = dev_nit_active_rcu(dev);
        rcu_read_unlock();

        return active;
}

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);

/*
 * Drop one reference on @dev without touching the reference tracker.
 * A NULL @dev is accepted and ignored. The counter used depends on
 * CONFIG_PCPU_DEV_REFCNT.
 */
static inline void __dev_put(struct net_device *dev)
{
        if (!dev)
                return;

#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_dec(*dev->pcpu_refcnt);
#else
        refcount_dec(&dev->dev_refcnt);
#endif
}

/*
 * Take one reference on @dev without touching the reference tracker.
 * A NULL @dev is accepted and ignored. The counter used depends on
 * CONFIG_PCPU_DEV_REFCNT.
 */
static inline void __dev_hold(struct net_device *dev)
{
        if (!dev)
                return;

#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_inc(*dev->pcpu_refcnt);
#else
        refcount_inc(&dev->dev_refcnt);
#endif
}

/*
 * Register @tracker with @dev's reference tracker using allocation flags
 * @gfp. Compiles to a no-op when CONFIG_NET_DEV_REFCNT_TRACKER is not set.
 * Does not take a device reference itself.
 */
static inline void __netdev_tracker_alloc(struct net_device *dev,
                                          netdevice_tracker *tracker,
                                          gfp_t gfp)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        ref_tracker_alloc(&dev->refcnt_tracker, tracker, gfp);
#endif
}

/* netdev_tracker_alloc() can upgrade a prior untracked reference
 * taken by dev_get_by_name()/dev_get_by_index() to a tracked one.
 *
 * It decrements the tracker's no_tracker count and then registers
 * @tracker through __netdev_tracker_alloc(). Compiles to a no-op when
 * CONFIG_NET_DEV_REFCNT_TRACKER is not set.
 */
static inline void netdev_tracker_alloc(struct net_device *dev,
                                        netdevice_tracker *tracker, gfp_t gfp)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        refcount_dec(&dev->refcnt_tracker.no_tracker);
        __netdev_tracker_alloc(dev, tracker, gfp);
#endif
}

/*
 * Release @tracker from @dev's reference tracker. Compiles to a no-op when
 * CONFIG_NET_DEV_REFCNT_TRACKER is not set. Does not drop a device
 * reference itself.
 */
static inline void netdev_tracker_free(struct net_device *dev,
                                       netdevice_tracker *tracker)
{
#ifdef CONFIG_NET_DEV_REFCNT_TRACKER
        ref_tracker_free(&dev->refcnt_tracker, tracker);
#endif
}

/*
 * Take a reference on @dev and register @tracker for it.
 * A NULL @dev is accepted and ignored.
 */
static inline void netdev_hold(struct net_device *dev,
                               netdevice_tracker *tracker, gfp_t gfp)
{
        if (!dev)
                return;

        __dev_hold(dev);
        __netdev_tracker_alloc(dev, tracker, gfp);
}

/*
 * Release @tracker and then drop the matching reference on @dev.
 * A NULL @dev is accepted and ignored.
 */
static inline void netdev_put(struct net_device *dev,
                              netdevice_tracker *tracker)
{
        if (!dev)
                return;

        netdev_tracker_free(dev, tracker);
        __dev_put(dev);
}

/**
 *        dev_hold - get reference to device
 *        @dev: network device (may be NULL, in which case nothing happens)
 *
 * Hold reference to device to keep it from being freed.
 * Implemented as netdev_hold() with a NULL tracker and GFP_ATOMIC.
 * Try using netdev_hold() instead.
 */
static inline void dev_hold(struct net_device *dev)
{
        netdev_hold(dev, NULL, GFP_ATOMIC);
}

/**
 *        dev_put - release reference to device
 *        @dev: network device (may be NULL, in which case nothing happens)
 *
 * Release reference to device to allow it to be freed.
 * Implemented as netdev_put() with a NULL tracker.
 * Try using netdev_put() instead.
 */
static inline void dev_put(struct net_device *dev)
{
        netdev_put(dev, NULL);
}

/*
 * Cleanup-helper registration named dev_put for struct net_device
 * pointers: the cleanup expression calls dev_put() on a non-NULL pointer.
 * NOTE(review): presumably consumed via __free(dev_put) - confirm against
 * the DEFINE_FREE() definition.
 */
DEFINE_FREE(dev_put, struct net_device *, if (_T) dev_put(_T))

/*
 * Move a tracked reference from @odev to @ndev, reusing @tracker.
 *
 * Either pointer may be NULL. The tracker entry for @odev is released
 * first; the reference on @ndev is taken before the one on @odev is
 * dropped; finally @tracker is registered against @ndev using @gfp.
 */
static inline void netdev_ref_replace(struct net_device *odev,
                                      struct net_device *ndev,
                                      netdevice_tracker *tracker,
                                      gfp_t gfp)
{
        if (odev)
                netdev_tracker_free(odev, tracker);

        /* Both helpers tolerate NULL; hold the new device before the put. */
        __dev_hold(ndev);
        __dev_put(odev);

        if (ndev)
                __netdev_tracker_alloc(ndev, tracker, gfp);
}

/* Carrier loss detection, dial on demand. The functions netif_carrier_on
 * and _off may be called from IRQ context, but it is caller
 * who is responsible for serialization of these calls.
 *
 * The name carrier is inappropriate, these functions should really be
 * called netif_lowerlayer_*() because they represent the state of any
 * kind of lower layer not just hardware media.
 */
void linkwatch_fire_event(struct net_device *dev);

/**
 * linkwatch_sync_dev - sync linkwatch for the given device
 * @dev: network device to sync linkwatch for
 *
 * Sync linkwatch for the given device, removing it from the
 * pending work list (if queued).
 */
void linkwatch_sync_dev(struct net_device *dev);
void __linkwatch_sync_dev(struct net_device *dev);

/**
 *        netif_carrier_ok - test if carrier present
 *        @dev: network device
 *
 * Check if carrier is present on device
 */
static inline bool netif_carrier_ok(const struct net_device *dev)
{
        return !test_bit(__LINK_STATE_NOCARRIER, &dev->state);
}

unsigned long dev_trans_start(struct net_device *dev);

void netdev_watchdog_up(struct net_device *dev);

void netif_carrier_on(struct net_device *dev);
void netif_carrier_off(struct net_device *dev);
void netif_carrier_event(struct net_device *dev);

/**
 *        netif_dormant_on - mark device as dormant.
 *        @dev: network device
 *
 * Mark device as dormant (as per RFC2863).
 *
 * The dormant state indicates that the relevant interface is not
 * actually in a condition to pass packets (i.e., it is not 'up') but is
 * in a "pending" state, waiting for some external event.  For "on-
 * demand" interfaces, this new state identifies the situation where the
 * interface is waiting for events to place it in the up state.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_dormant_on(struct net_device *dev)
{
        /* Already dormant: no state change, so no event. */
        if (test_and_set_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant_off - set device as not dormant.
 *        @dev: network device
 *
 * Clears the dormant state of the device. A linkwatch event is fired
 * only when the bit was actually set beforehand.
 */
static inline void netif_dormant_off(struct net_device *dev)
{
        /* Was not dormant: no state change, so no event. */
        if (!test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_dormant - test if device is dormant
 *        @dev: network device
 *
 * Check if device is dormant.
 */
static inline bool netif_dormant(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_DORMANT, &dev->state);
}


/**
 *        netif_testing_on - mark device as under test.
 *        @dev: network device
 *
 * Mark device as under test (as per RFC2863).
 *
 * The testing state indicates that some test(s) must be performed on
 * the interface. After completion of the test, the interface state
 * will change to up, dormant, or down, as appropriate.
 *
 * A linkwatch event is fired only when the bit was not already set.
 */
static inline void netif_testing_on(struct net_device *dev)
{
        /* Already under test: no state change, so no event. */
        if (test_and_set_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing_off - set device as not under test.
 *        @dev: network device
 *
 * Clears the testing state of the device. A linkwatch event is fired
 * only when the bit was actually set beforehand.
 */
static inline void netif_testing_off(struct net_device *dev)
{
        /* Was not under test: no state change, so no event. */
        if (!test_and_clear_bit(__LINK_STATE_TESTING, &dev->state))
                return;

        linkwatch_fire_event(dev);
}

/**
 *        netif_testing - test if device is under test
 *        @dev: network device
 *
 * Check if device is under test
 */
static inline bool netif_testing(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_TESTING, &dev->state);
}


/**
 *        netif_oper_up - test if device is operational
 *        @dev: network device
 *
 * Check if carrier is operational
 */
static inline bool netif_oper_up(const struct net_device *dev)
{
        unsigned int operstate = READ_ONCE(dev->operstate);

        return        operstate == IF_OPER_UP ||
                operstate == IF_OPER_UNKNOWN /* backward compat */;
}

/**
 *        netif_device_present - is device available or removed
 *        @dev: network device
 *
 * Check if device has not been removed from system.
 */
static inline bool netif_device_present(const struct net_device *dev)
{
        return test_bit(__LINK_STATE_PRESENT, &dev->state);
}

void netif_device_detach(struct net_device *dev);

void netif_device_attach(struct net_device *dev);

/*
 * Network interface message level settings
 */

/*
 * Bit indices of the message classes. The NETIF_MSG_* masks are derived
 * from these indices, so the order of the enumerators fixes the bit layout.
 */
enum {
        NETIF_MSG_DRV_BIT,
        NETIF_MSG_PROBE_BIT,
        NETIF_MSG_LINK_BIT,
        NETIF_MSG_TIMER_BIT,
        NETIF_MSG_IFDOWN_BIT,
        NETIF_MSG_IFUP_BIT,
        NETIF_MSG_RX_ERR_BIT,
        NETIF_MSG_TX_ERR_BIT,
        NETIF_MSG_TX_QUEUED_BIT,
        NETIF_MSG_INTR_BIT,
        NETIF_MSG_TX_DONE_BIT,
        NETIF_MSG_RX_STATUS_BIT,
        NETIF_MSG_PKTDATA_BIT,
        NETIF_MSG_HW_BIT,
        NETIF_MSG_WOL_BIT,

        /* When you add a new bit above, update netif_msg_class_names array
         * in net/ethtool/common.c
         */
        NETIF_MSG_CLASS_COUNT,        /* number of classes listed above */
};
/* Both ethtool_ops interface and internal driver implementation use u32 */
static_assert(NETIF_MSG_CLASS_COUNT <= 32);

/* Turn a bit index into a single-bit u32 mask. */
#define __NETIF_MSG_BIT(bit)        ((u32)1 << (bit))
/* Build the mask for class @name by pasting it into NETIF_MSG_<name>_BIT. */
#define __NETIF_MSG(name)        __NETIF_MSG_BIT(NETIF_MSG_ ## name ## _BIT)

/* One single-bit mask per message class. */
#define NETIF_MSG_DRV                __NETIF_MSG(DRV)
#define NETIF_MSG_PROBE                __NETIF_MSG(PROBE)
#define NETIF_MSG_LINK                __NETIF_MSG(LINK)
#define NETIF_MSG_TIMER                __NETIF_MSG(TIMER)
#define NETIF_MSG_IFDOWN        __NETIF_MSG(IFDOWN)
#define NETIF_MSG_IFUP                __NETIF_MSG(IFUP)
#define NETIF_MSG_RX_ERR        __NETIF_MSG(RX_ERR)
#define NETIF_MSG_TX_ERR        __NETIF_MSG(TX_ERR)
#define NETIF_MSG_TX_QUEUED        __NETIF_MSG(TX_QUEUED)
#define NETIF_MSG_INTR                __NETIF_MSG(INTR)
#define NETIF_MSG_TX_DONE        __NETIF_MSG(TX_DONE)
#define NETIF_MSG_RX_STATUS        __NETIF_MSG(RX_STATUS)
#define NETIF_MSG_PKTDATA        __NETIF_MSG(PKTDATA)
#define NETIF_MSG_HW                __NETIF_MSG(HW)
#define NETIF_MSG_WOL                __NETIF_MSG(WOL)

/*
 * Class tests: @p is a pointer to any object with a msg_enable member.
 * Each macro evaluates to the masked bit itself (zero or the class mask),
 * not to a normalized 0/1 value.
 */
#define netif_msg_drv(p)        ((p)->msg_enable & NETIF_MSG_DRV)
#define netif_msg_probe(p)        ((p)->msg_enable & NETIF_MSG_PROBE)
#define netif_msg_link(p)        ((p)->msg_enable & NETIF_MSG_LINK)
#define netif_msg_timer(p)        ((p)->msg_enable & NETIF_MSG_TIMER)
#define netif_msg_ifdown(p)        ((p)->msg_enable & NETIF_MSG_IFDOWN)
#define netif_msg_ifup(p)        ((p)->msg_enable & NETIF_MSG_IFUP)
#define netif_msg_rx_err(p)        ((p)->msg_enable & NETIF_MSG_RX_ERR)
#define netif_msg_tx_err(p)        ((p)->msg_enable & NETIF_MSG_TX_ERR)
#define netif_msg_tx_queued(p)        ((p)->msg_enable & NETIF_MSG_TX_QUEUED)
#define netif_msg_intr(p)        ((p)->msg_enable & NETIF_MSG_INTR)
#define netif_msg_tx_done(p)        ((p)->msg_enable & NETIF_MSG_TX_DONE)
#define netif_msg_rx_status(p)        ((p)->msg_enable & NETIF_MSG_RX_STATUS)
#define netif_msg_pktdata(p)        ((p)->msg_enable & NETIF_MSG_PKTDATA)
#define netif_msg_hw(p)                ((p)->msg_enable & NETIF_MSG_HW)
#define netif_msg_wol(p)        ((p)->msg_enable & NETIF_MSG_WOL)

/*
 * Translate a numeric debug level into a message-enable mask:
 *   0                      -> no classes enabled
 *   1 .. (bits in u32) - 1 -> the low @debug_value bits set
 *   anything else          -> @default_msg_enable_bits
 */
static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits)
{
        /* no output */
        if (debug_value == 0)
                return 0;

        /* set low N bits */
        if (debug_value > 0 && debug_value < (sizeof(u32) * 8))
                return (1U << debug_value) - 1;

        /* use default */
        return default_msg_enable_bits;
}

/* Take the queue's _xmit_lock, then record @cpu as the lock owner. */
static inline void __netif_tx_lock(struct netdev_queue *txq, int cpu)
{
        spin_lock(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, cpu);
}

/*
 * Counterpart of the lock helpers that never touches the spinlock: it only
 * applies __acquire() to _xmit_lock and always reports success.
 * NOTE(review): __acquire() is presumably a static-checker annotation with
 * no runtime effect - confirm.
 */
static inline bool __netif_tx_acquire(struct netdev_queue *txq)
{
        __acquire(&txq->_xmit_lock);
        return true;
}

/*
 * Undo __netif_tx_acquire(): applies __release() to _xmit_lock without
 * touching the spinlock itself.
 */
static inline void __netif_tx_release(struct netdev_queue *txq)
{
        __release(&txq->_xmit_lock);
}

/*
 * Take the queue's _xmit_lock with spin_lock_bh(), then record the current
 * CPU as the lock owner.
 */
static inline void __netif_tx_lock_bh(struct netdev_queue *txq)
{
        spin_lock_bh(&txq->_xmit_lock);
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
}

static inline bool __netif_tx_trylock(struct netdev_queue *txq)
{
        bool ok = spin_trylock(&txq->_xmit_lock);

        if (likely(ok)) {
                /* Pairs with READ_ONCE() in __dev_queue_xmit() */
                WRITE_ONCE(txq->xmit_lock_owner, smp_processor_id());
        }
        return ok;
}

/* Reset the recorded owner to -1 (no owner), then drop the queue's _xmit_lock. */
static inline void __netif_tx_unlock(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock(&txq->_xmit_lock);
}

/*
 * Reset the recorded owner to -1 (no owner), then drop the queue's
 * _xmit_lock with spin_unlock_bh().
 */
static inline void __netif_tx_unlock_bh(struct netdev_queue *txq)
{
        /* Pairs with READ_ONCE() in __dev_queue_xmit() */
        WRITE_ONCE(txq->xmit_lock_owner, -1);
        spin_unlock_bh(&txq->_xmit_lock);
}

/*
 * txq->trans_start can be read locklessly from dev_watchdog()
 */
static inline void txq_trans_update(struct netdev_queue *txq)
{
        /*
         * Stamp trans_start only while an owner is recorded for the queue
         * (xmit_lock_owner != -1).
         *
         * xmit_lock_owner is written with WRITE_ONCE() by the
         * __netif_tx_lock*()/__netif_tx_unlock*() helpers. A caller that
         * went through the lltx branch of HARD_TX_LOCK() does not hold
         * _xmit_lock, so this read can race with those writers; mark it
         * with READ_ONCE() to match them.
         */
        if (READ_ONCE(txq->xmit_lock_owner) != -1)
                WRITE_ONCE(txq->trans_start, jiffies);
}

/*
 * Refresh txq->trans_start to the current jiffies value, skipping the
 * store when the field already holds that value.
 */
static inline void txq_trans_cond_update(struct netdev_queue *txq)
{
        const unsigned long stamp = jiffies;

        if (READ_ONCE(txq->trans_start) == stamp)
                return;

        WRITE_ONCE(txq->trans_start, stamp);
}

/* legacy drivers only, netdev_start_xmit() sets txq->trans_start */
static inline void netif_trans_update(struct net_device *dev)
{
        /* Only TX queue 0 is refreshed here. */
        txq_trans_cond_update(netdev_get_tx_queue(dev, 0));
}

/**
 *        netif_tx_lock - grab network device transmit lock
 *        @dev: network device
 *
 * Get network device transmit lock
 */
void netif_tx_lock(struct net_device *dev);

/* Disable bottom halves first, then take the device transmit lock. */
static inline void netif_tx_lock_bh(struct net_device *dev)
{
        local_bh_disable();
        netif_tx_lock(dev);
}

void netif_tx_unlock(struct net_device *dev);

/*
 * Reverse of netif_tx_lock_bh(): drop the device transmit lock, then
 * re-enable bottom halves.
 */
static inline void netif_tx_unlock_bh(struct net_device *dev)
{
        netif_tx_unlock(dev);
        local_bh_enable();
}

/*
 * Lock @txq for transmit on behalf of @cpu. Devices with ->lltx set skip
 * the real lock and only go through __netif_tx_acquire().
 */
#define HARD_TX_LOCK(dev, txq, cpu) {                        \
        if ((dev)->lltx) {                                \
                __netif_tx_acquire(txq);                \
        } else {                                        \
                __netif_tx_lock(txq, cpu);                \
        }                                                \
}

/*
 * Non-blocking variant of HARD_TX_LOCK(). Evaluates to the result of
 * __netif_tx_trylock(), or of __netif_tx_acquire() when ->lltx is set.
 */
#define HARD_TX_TRYLOCK(dev, txq)                        \
        ((dev)->lltx ?                                        \
                __netif_tx_acquire(txq) :                \
                __netif_tx_trylock(txq))

/*
 * Release what HARD_TX_LOCK()/HARD_TX_TRYLOCK() took. Devices with ->lltx
 * set only go through __netif_tx_release().
 */
#define HARD_TX_UNLOCK(dev, txq) {                        \
        if ((dev)->lltx) {                                \
                __netif_tx_release(txq);                \
        } else {                                        \
                __netif_tx_unlock(txq);                        \
        }                                                \
}

/*
 * Stop every TX queue of @dev. Runs with bottom halves disabled and
 * tx_global_lock held; each queue is stopped under its own _xmit_lock.
 */
static inline void netif_tx_disable(struct net_device *dev)
{
        unsigned int qidx;
        int this_cpu;

        local_bh_disable();
        this_cpu = smp_processor_id();
        spin_lock(&dev->tx_global_lock);

        for (qidx = 0; qidx < dev->num_tx_queues; qidx++) {
                struct netdev_queue *queue = netdev_get_tx_queue(dev, qidx);

                __netif_tx_lock(queue, this_cpu);
                netif_tx_stop_queue(queue);
                __netif_tx_unlock(queue);
        }

        spin_unlock(&dev->tx_global_lock);
        local_bh_enable();
}

/*
 * Take dev->addr_list_lock. The lock subclass is dev->nested_level when
 * CONFIG_LOCKDEP is set and 0 otherwise.
 */
static inline void netif_addr_lock(struct net_device *dev)
{
#ifdef CONFIG_LOCKDEP
        unsigned char subclass = dev->nested_level;
#else
        unsigned char subclass = 0;
#endif

        spin_lock_nested(&dev->addr_list_lock, subclass);
}

/*
 * Disable bottom halves, then take dev->addr_list_lock. The lock subclass
 * is dev->nested_level when CONFIG_LOCKDEP is set and 0 otherwise.
 */
static inline void netif_addr_lock_bh(struct net_device *dev)
{
#ifdef CONFIG_LOCKDEP
        unsigned char subclass = dev->nested_level;
#else
        unsigned char subclass = 0;
#endif

        local_bh_disable();
        spin_lock_nested(&dev->addr_list_lock, subclass);
}

static inline void netif_addr_unlock(struct net_device *dev)
{
        spin_unlock(&dev->addr_list_lock);
}

static inline void netif_addr_unlock_bh(struct net_device *dev)
{
        spin_unlock_bh(&dev->addr_list_lock);
}

/*
 * dev_addrs walker. Should be used only for read access. Call with
 * rcu_read_lock held.
 *
 * @dev: pointer expression whose target has a dev_addrs.list member; it is
 *       parenthesized so that any pointer-valued expression can be passed
 * @ha:  cursor handed to list_for_each_entry_rcu()
 */
#define for_each_dev_addr(dev, ha) \
                list_for_each_entry_rcu(ha, &(dev)->dev_addrs.list, list)

/* These functions live elsewhere (drivers/net/net_init.c, but related) */

void ether_setup(struct net_device *dev);

/* Allocate dummy net_device */
struct net_device *alloc_netdev_dummy(int sizeof_priv);

/* Support for loadable net-drivers */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                                    unsigned char name_assign_type,
                                    void (*setup)(struct net_device *),
                                    unsigned int txqs, unsigned int rxqs);
/* Shorthand for alloc_netdev_mqs() with one TX queue and one RX queue. */
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)

/*
 * Shorthand for alloc_netdev_mqs() using @count for both the TX and the RX
 * queue count. @count appears twice in the expansion.
 */
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
        alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
                         count)

int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);

int devm_register_netdev(struct device *dev, struct net_device *ndev);

/* General hardware address lists handling functions */
int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
                   struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
                            struct netdev_hw_addr_list *from_list,
                            int addr_len);
void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
                      struct netdev_hw_addr_list *from_list, int addr_len);
int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
                       struct net_device *dev,
                       int (*sync)(struct net_device *, const unsigned char *),
                       int (*unsync)(struct net_device *,
                                     const unsigned char *));
int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
                           struct net_device *dev,
                           int (*sync)(struct net_device *,
                                       const unsigned char *, int),
                           int (*unsync)(struct net_device *,
                                         const unsigned char *, int));
void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
                              struct net_device *dev,
                              int (*unsync)(struct net_device *,
                                            const unsigned char *, int));
void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list,
                          struct net_device *dev,
                          int (*unsync)(struct net_device *,
                                        const unsigned char *));
void __hw_addr_init(struct netdev_hw_addr_list *list);

/* Functions used for device addresses handling */
void dev_addr_mod(struct net_device *dev, unsigned int offset,
                  const void *addr, size_t len);

/* Write @len bytes from @addr at offset 0 of the device address. */
static inline void
__dev_addr_set(struct net_device *dev, const void *addr, size_t len)
{
        const unsigned int start = 0;

        dev_addr_mod(dev, start, addr, len);
}

static inline void dev_addr_set(struct net_device *dev, const u8 *addr)
{
        __dev_addr_set(dev, addr, dev->addr_len);
}

int dev_addr_add(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);
int dev_addr_del(struct net_device *dev, const unsigned char *addr,
                 unsigned char addr_type);

/* Functions used for unicast addresses handling */
int dev_uc_add(struct net_device *dev, const unsigned char *addr);
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_uc_del(struct net_device *dev, const unsigned char *addr);
int dev_uc_sync(struct net_device *to, struct net_device *from);
int dev_uc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_uc_unsync(struct net_device *to, struct net_device *from);
void dev_uc_flush(struct net_device *dev);
void dev_uc_init(struct net_device *dev);

/**
 *  __dev_uc_sync - Synchronize device's unicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 */
static inline int __dev_uc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        /* Operates on the device's unicast list. */
        struct netdev_hw_addr_list *uc_list = &dev->uc;

        return __hw_addr_sync_dev(uc_list, dev, sync, unsync);
}

/**
 *  __dev_uc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by __dev_uc_sync().
 */
static inline void __dev_uc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        /* Operates on the device's unicast list. */
        struct netdev_hw_addr_list *uc_list = &dev->uc;

        __hw_addr_unsync_dev(uc_list, dev, unsync);
}

/* Functions used for multicast addresses handling */
int dev_mc_add(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr);
int dev_mc_del(struct net_device *dev, const unsigned char *addr);
int dev_mc_del_global(struct net_device *dev, const unsigned char *addr);
int dev_mc_sync(struct net_device *to, struct net_device *from);
int dev_mc_sync_multiple(struct net_device *to, struct net_device *from);
void dev_mc_unsync(struct net_device *to, struct net_device *from);
void dev_mc_flush(struct net_device *dev);
void dev_mc_init(struct net_device *dev);

/**
 *  __dev_mc_sync - Synchronize device's multicast list
 *  @dev:  device to sync
 *  @sync: function to call if address should be added
 *  @unsync: function to call if address should be removed
 *
 *  Add newly added addresses to the interface, and release
 *  addresses that have been deleted.
 */
static inline int __dev_mc_sync(struct net_device *dev,
                                int (*sync)(struct net_device *,
                                            const unsigned char *),
                                int (*unsync)(struct net_device *,
                                              const unsigned char *))
{
        /* Operates on the device's multicast list. */
        struct netdev_hw_addr_list *mc_list = &dev->mc;

        return __hw_addr_sync_dev(mc_list, dev, sync, unsync);
}

/**
 *  __dev_mc_unsync - Remove synchronized addresses from device
 *  @dev:  device to sync
 *  @unsync: function to call if address should be removed
 *
 *  Remove all addresses that were added to the device by __dev_mc_sync().
 */
static inline void __dev_mc_unsync(struct net_device *dev,
                                   int (*unsync)(struct net_device *,
                                                 const unsigned char *))
{
        /* Operates on the device's multicast list. */
        struct netdev_hw_addr_list *mc_list = &dev->mc;

        __hw_addr_unsync_dev(mc_list, dev, unsync);
}

/* Functions used for secondary unicast and multicast support */
void dev_set_rx_mode(struct net_device *dev);
int dev_set_promiscuity(struct net_device *dev, int inc);
int netif_set_allmulti(struct net_device *dev, int inc, bool notify);
int dev_set_allmulti(struct net_device *dev, int inc);
void netif_state_change(struct net_device *dev);
void netdev_state_change(struct net_device *dev);
void __netdev_notify_peers(struct net_device *dev);
void netdev_notify_peers(struct net_device *dev);
void netdev_features_change(struct net_device *dev);
/* Load a device via the kmod */
void dev_load(struct net *net, const char *name);
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage);
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats);
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats);
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s);

/* Bit indices for the NESTED_SYNC_* masks defined below. */
enum {
        NESTED_SYNC_IMM_BIT,
        NESTED_SYNC_TODO_BIT,
};

/* Turn a bit index into a single-bit u32 mask. */
#define __NESTED_SYNC_BIT(bit)        ((u32)1 << (bit))
/* Build the mask for @name by pasting it into NESTED_SYNC_<name>_BIT. */
#define __NESTED_SYNC(name)        __NESTED_SYNC_BIT(NESTED_SYNC_ ## name ## _BIT)

#define NESTED_SYNC_IMM                __NESTED_SYNC(IMM)
#define NESTED_SYNC_TODO        __NESTED_SYNC(TODO)

/*
 * Context handed by pointer to the callbacks of the
 * netdev_walk_all_{upper,lower}_dev*() helpers.
 */
struct netdev_nested_priv {
        /* NOTE(review): presumably holds NESTED_SYNC_* bits - confirm */
        unsigned char flags;
        /* opaque pointer; its meaning is chosen by the user of the walk */
        void *data;
};

bool netdev_has_upper_dev(struct net_device *dev, struct net_device *upper_dev);
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                     struct list_head **iter);

/* iterate through upper list, must be called under RCU read lock
 *
 * @iter starts at the head of @dev's adj_list.upper and is advanced by
 * netdev_upper_get_next_dev_rcu(); the loop ends when that returns NULL.
 * @dev, @updev and @iter are each evaluated more than once.
 */
#define netdev_for_each_upper_dev_rcu(dev, updev, iter) \
        for (iter = &(dev)->adj_list.upper, \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)); \
             updev; \
             updev = netdev_upper_get_next_dev_rcu(dev, &(iter)))

int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *upper_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev);

bool netdev_has_any_upper_dev(struct net_device *dev);

void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter);
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter);

/*
 * Walk the values returned by netdev_lower_get_next_private() for @dev.
 * @iter starts at the first entry of adj_list.lower (the head's ->next);
 * the loop ends when the getter returns NULL.
 */
#define netdev_for_each_lower_private(dev, priv, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             priv = netdev_lower_get_next_private(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private(dev, &(iter)))

/*
 * Variant built on netdev_lower_get_next_private_rcu(). Unlike the non-RCU
 * macro, @iter starts at the list head itself (&adj_list.lower); the loop
 * ends when the getter returns NULL.
 */
#define netdev_for_each_lower_private_rcu(dev, priv, iter) \
        for (iter = &(dev)->adj_list.lower, \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)); \
             priv; \
             priv = netdev_lower_get_next_private_rcu(dev, &(iter)))

void *netdev_lower_get_next(struct net_device *dev,
                                struct list_head **iter);

/*
 * Walk the values returned by netdev_lower_get_next() for @dev. @iter starts
 * at the first entry of adj_list.lower (the head's ->next); the loop ends
 * when the getter returns NULL.
 */
#define netdev_for_each_lower_dev(dev, ldev, iter) \
        for (iter = (dev)->adj_list.lower.next, \
             ldev = netdev_lower_get_next(dev, &(iter)); \
             ldev; \
             ldev = netdev_lower_get_next(dev, &(iter)))

struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter);
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *lower_dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv);
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *lower_dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv);

void *netdev_adjacent_get_private(struct list_head *adj_list);
void *netdev_lower_get_first_private_rcu(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get(struct net_device *dev);
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev);
int netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev,
                          struct netlink_ext_ack *extack);
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack);
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev);
int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack);
void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev);
void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev);
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname);
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev);
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info);

/* RSS keys are 40 or 52 bytes long */
#define NETDEV_RSS_KEY_LEN 52
extern u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len);

int skb_checksum_help(struct sk_buff *skb);
int skb_crc32c_csum_help(struct sk_buff *skb);
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features);

/* Pairs one ifslave record with one ifbond record. */
struct netdev_bonding_info {
        ifslave        slave;
        ifbond        master;
};

/* Notifier payload: the common notifier info followed by the bonding info. */
struct netdev_notifier_bonding_info {
        struct netdev_notifier_info info; /* must be first */
        struct netdev_bonding_info  bonding_info;
};

void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info);

/*
 * ethtool_notify() is declared here only when CONFIG_ETHTOOL_NETLINK is
 * enabled; otherwise it is an empty inline stub so callers need no #ifdef.
 */
#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data);
#else
static inline void ethtool_notify(struct net_device *dev, unsigned int cmd,
                                  const void *data)
{
}
#endif

__be16 skb_network_protocol(struct sk_buff *skb, int *depth);

/*
 * Report whether @features allows checksum offload for @protocol:
 *   FCoE  -> NETIF_F_FCOE_CRC only
 *   other -> NETIF_F_HW_CSUM covers everything; failing that,
 *            NETIF_F_IP_CSUM for IPv4, NETIF_F_IPV6_CSUM for IPv6
 */
static inline bool can_checksum_protocol(netdev_features_t features,
                                         __be16 protocol)
{
        if (protocol == htons(ETH_P_FCOE))
                return (features & NETIF_F_FCOE_CRC) != 0;

        /* Assume this is an IP checksum (not SCTP CRC) */

        /* Can checksum everything */
        if (features & NETIF_F_HW_CSUM)
                return true;

        if (protocol == htons(ETH_P_IP))
                return (features & NETIF_F_IP_CSUM) != 0;

        if (protocol == htons(ETH_P_IPV6))
                return (features & NETIF_F_IPV6_CSUM) != 0;

        return false;
}

/*
 * netdev_rx_csum_fault() is declared here only when CONFIG_BUG is set;
 * otherwise it is an empty inline stub.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb);
#else
static inline void netdev_rx_csum_fault(struct net_device *dev,
                                        struct sk_buff *skb)
{
}
#endif
/* rx skb timestamps */
void net_enable_timestamp(void);
void net_disable_timestamp(void);

/*
 * Return the timestamp for @hwtstamps: the driver's ndo_get_tstamp() result
 * when the driver provides that hook, hwtstamps->hwtstamp otherwise.
 */
static inline ktime_t netdev_get_tstamp(struct net_device *dev,
                                        const struct skb_shared_hwtstamps *hwtstamps,
                                        bool cycles)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        /* No driver hook: use the stored timestamp as is. */
        if (!ops->ndo_get_tstamp)
                return hwtstamps->hwtstamp;

        return ops->ndo_get_tstamp(dev, hwtstamps, cycles);
}

/*
 * Accessors for the "xmit more" flag. Without CONFIG_PREEMPT_RT the flag is
 * kept in the per-CPU softnet_data (softnet_data.xmit.more); with
 * CONFIG_PREEMPT_RT it is kept in the current task (current->net_xmit.more).
 */
#ifndef CONFIG_PREEMPT_RT
/* Store @more in this CPU's softnet_data. */
static inline void netdev_xmit_set_more(bool more)
{
        __this_cpu_write(softnet_data.xmit.more, more);
}

/* Read the flag back from this CPU's softnet_data. */
static inline bool netdev_xmit_more(void)
{
        return __this_cpu_read(softnet_data.xmit.more);
}
#else
/* Store @more in the current task. */
static inline void netdev_xmit_set_more(bool more)
{
        current->net_xmit.more = more;
}

/* Read the flag back from the current task. */
static inline bool netdev_xmit_more(void)
{
        return current->net_xmit.more;
}
#endif

/*
 * Record @more with netdev_xmit_set_more(), then hand @skb to the driver's
 * ndo_start_xmit() and return its result. The flag is set before the driver
 * runs.
 */
static inline netdev_tx_t __netdev_start_xmit(const struct net_device_ops *ops,
                                              struct sk_buff *skb, struct net_device *dev,
                                              bool more)
{
        netdev_xmit_set_more(more);
        return ops->ndo_start_xmit(skb, dev);
}

/*
 * Transmit @skb through @dev's ndo_start_xmit() and, when the driver
 * returns NETDEV_TX_OK, update @txq via txq_trans_update(). The driver's
 * return code is passed through unchanged.
 */
static inline netdev_tx_t netdev_start_xmit(struct sk_buff *skb, struct net_device *dev,
                                            struct netdev_queue *txq, bool more)
{
        netdev_tx_t status = __netdev_start_xmit(dev->netdev_ops, skb, dev, more);

        if (status != NETDEV_TX_OK)
                return status;

        txq_trans_update(txq);
        return status;
}

int netdev_class_create_file_ns(const struct class_attribute *class_attr,
                                const void *ns);
void netdev_class_remove_file_ns(const struct class_attribute *class_attr,
                                 const void *ns);

extern const struct kobj_ns_type_operations net_ns_type_operations;

const char *netdev_drivername(const struct net_device *dev);

/*
 * Intersect two feature sets. When exactly one side has NETIF_F_HW_CSUM,
 * that side is first widened with NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM so
 * those bits survive the AND if the other side has them.
 */
static inline netdev_features_t netdev_intersect_features(netdev_features_t f1,
                                                          netdev_features_t f2)
{
        /* Both or neither side has HW_CSUM: plain intersection. */
        if (!((f1 ^ f2) & NETIF_F_HW_CSUM))
                return f1 & f2;

        if (f1 & NETIF_F_HW_CSUM)
                f1 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        else
                f2 |= (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);

        return f1 & f2;
}

static inline netdev_features_t netdev_get_wanted_features(
        struct net_device *dev)
{
        /* Bits currently set in features that are not part of hw_features. */
        netdev_features_t outside_hw = dev->features & ~dev->hw_features;

        return outside_hw | dev->wanted_features;
}
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask);

/* Allow TSO to be used on a stacked device:
 * performing the GSO segmentation before the last device
 * is a performance improvement.
 */
static inline netdev_features_t netdev_add_tso_features(netdev_features_t features,
                                                        netdev_features_t mask)
{
        netdev_features_t merged;

        /* Fold NETIF_F_ALL_TSO into @features, restricted by @mask. */
        merged = netdev_increment_features(features, NETIF_F_ALL_TSO, mask);

        return merged;
}

int __netdev_update_features(struct net_device *dev);
void netdev_update_features(struct net_device *dev);
void netdev_change_features(struct net_device *dev);

void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                        struct net_device *dev);

netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features);
netdev_features_t netif_skb_features(struct sk_buff *skb);
void skb_warn_bad_offload(const struct sk_buff *skb);

/*
 * Report whether @features contains every feature bit that corresponds to
 * the SKB_GSO_* bits in @gso_type. The two bit spaces are related by a
 * shift of NETIF_F_GSO_SHIFT, which the build-time checks below enforce.
 */
static inline bool net_gso_ok(netdev_features_t features, int gso_type)
{
        const netdev_features_t wanted =
                (netdev_features_t)gso_type << NETIF_F_GSO_SHIFT;

        /* check flags correspondence */
        BUILD_BUG_ON(SKB_GSO_TCPV4   != (NETIF_F_TSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_DODGY   != (NETIF_F_GSO_ROBUST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ECN != (NETIF_F_TSO_ECN >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID != (NETIF_F_TSO_MANGLEID >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCPV6   != (NETIF_F_TSO6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FCOE    != (NETIF_F_FSO >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE     != (NETIF_F_GSO_GRE >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_GRE_CSUM != (NETIF_F_GSO_GRE_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP4  != (NETIF_F_GSO_IPXIP4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_IPXIP6  != (NETIF_F_GSO_IPXIP6 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL != (NETIF_F_GSO_UDP_TUNNEL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_TUNNEL_CSUM != (NETIF_F_GSO_UDP_TUNNEL_CSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP != (NETIF_F_GSO_UDP >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_UDP_L4 != (NETIF_F_GSO_UDP_L4 >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_FRAGLIST != (NETIF_F_GSO_FRAGLIST >> NETIF_F_GSO_SHIFT));
        BUILD_BUG_ON(SKB_GSO_TCP_ACCECN !=
                     (NETIF_F_GSO_ACCECN >> NETIF_F_GSO_SHIFT));

        /* OK only when no wanted bit is missing from @features. */
        return !(wanted & ~features);
}

/*
 * Report whether @features covers @skb's GSO type and, if @skb carries a
 * frag list, also includes NETIF_F_FRAGLIST.
 */
static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
{
        if (!net_gso_ok(features, skb_shinfo(skb)->gso_type))
                return false;

        if (!skb_has_frag_list(skb))
                return true;

        return (features & NETIF_F_FRAGLIST) != 0;
}

/*
 * True for a GSO skb when either @features fails skb_gso_ok(), or the skb's
 * ip_summed is neither CHECKSUM_PARTIAL nor CHECKSUM_UNNECESSARY.
 */
static inline bool netif_needs_gso(struct sk_buff *skb,
                                   netdev_features_t features)
{
        if (!skb_is_gso(skb))
                return false;

        if (!skb_gso_ok(skb, features))
                return true;

        return unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
                        (skb->ip_summed != CHECKSUM_UNNECESSARY));
}

void netif_set_tso_max_size(struct net_device *dev, unsigned int size);
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs);
void netif_inherit_tso_max(struct net_device *to,
                           const struct net_device *from);

/*
 * Return the GRO size limit for @skb: gro_max_size when the skb protocol is
 * IPv6, gro_ipv4_max_size for every other protocol.
 */
static inline unsigned int
netif_get_gro_max_size(const struct net_device *dev, const struct sk_buff *skb)
{
        /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
        if (skb->protocol == htons(ETH_P_IPV6))
                return READ_ONCE(dev->gro_max_size);

        return READ_ONCE(dev->gro_ipv4_max_size);
}

/*
 * Return the GSO size limit for @skb: gso_max_size when the skb protocol is
 * IPv6, gso_ipv4_max_size for every other protocol.
 */
static inline unsigned int
netif_get_gso_max_size(const struct net_device *dev, const struct sk_buff *skb)
{
        /* pairs with WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
        if (skb->protocol == htons(ETH_P_IPV6))
                return READ_ONCE(dev->gso_max_size);

        return READ_ONCE(dev->gso_ipv4_max_size);
}

/* True when IFF_MACSEC is set in dev->priv_flags. */
static inline bool netif_is_macsec(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACSEC);
}

/* True when IFF_MACVLAN is set in dev->priv_flags. */
static inline bool netif_is_macvlan(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACVLAN);
}

/* True when IFF_MACVLAN_PORT is set in dev->priv_flags. */
static inline bool netif_is_macvlan_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_MACVLAN_PORT);
}

static inline bool netif_is_bond_master(const struct net_device *dev)
{
        return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
}

static inline bool netif_is_bond_slave(const struct net_device *dev)
{
        return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING;
}

static inline bool netif_supports_nofcs(struct net_device *dev)
{
        return dev->priv_flags & IFF_SUPP_NOFCS;
}

/* True when IFF_L3MDEV_RX_HANDLER is set in dev->priv_flags. */
static inline bool netif_has_l3_rx_handler(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_RX_HANDLER);
}

/* True when IFF_L3MDEV_MASTER is set in dev->priv_flags. */
static inline bool netif_is_l3_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_MASTER);
}

/* True when IFF_L3MDEV_SLAVE is set in dev->priv_flags. */
static inline bool netif_is_l3_slave(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_L3MDEV_SLAVE);
}

/*
 * Return dev->ifindex when CONFIG_NET_L3_MASTER_DEV is enabled and @dev is
 * an L3 slave; return 0 in every other case.
 */
static inline int dev_sdif(const struct net_device *dev)
{
        int sdif = 0;

#ifdef CONFIG_NET_L3_MASTER_DEV
        if (netif_is_l3_slave(dev))
                sdif = dev->ifindex;
#endif
        return sdif;
}

/* True when IFF_EBRIDGE is set in dev->priv_flags. */
static inline bool netif_is_bridge_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_EBRIDGE);
}

/* True when IFF_BRIDGE_PORT is set in dev->priv_flags. */
static inline bool netif_is_bridge_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_BRIDGE_PORT);
}

/* Report whether IFF_OPENVSWITCH is set in @dev's priv_flags. */
static inline bool netif_is_ovs_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_OPENVSWITCH);
}

/* Report whether IFF_OVS_DATAPATH is set in @dev's priv_flags. */
static inline bool netif_is_ovs_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_OVS_DATAPATH);
}

/*
 * Report whether @dev satisfies netif_is_bridge_master() or, failing that,
 * netif_is_ovs_master().
 */
static inline bool netif_is_any_bridge_master(const struct net_device *dev)
{
        if (netif_is_bridge_master(dev))
                return true;

        return netif_is_ovs_master(dev);
}

/*
 * Report whether @dev satisfies netif_is_bridge_port() or, failing that,
 * netif_is_ovs_port().
 */
static inline bool netif_is_any_bridge_port(const struct net_device *dev)
{
        if (netif_is_bridge_port(dev))
                return true;

        return netif_is_ovs_port(dev);
}

/* Report whether IFF_TEAM is set in @dev's priv_flags. */
static inline bool netif_is_team_master(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_TEAM);
}

/* Report whether IFF_TEAM_PORT is set in @dev's priv_flags. */
static inline bool netif_is_team_port(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_TEAM_PORT);
}

/*
 * Report whether @dev satisfies netif_is_bond_master() or, failing that,
 * netif_is_team_master().
 */
static inline bool netif_is_lag_master(const struct net_device *dev)
{
        if (netif_is_bond_master(dev))
                return true;

        return netif_is_team_master(dev);
}

/*
 * Report whether @dev satisfies netif_is_bond_slave() or, failing that,
 * netif_is_team_port().
 */
static inline bool netif_is_lag_port(const struct net_device *dev)
{
        if (netif_is_bond_slave(dev))
                return true;

        return netif_is_team_port(dev);
}

/* Report whether IFF_RXFH_CONFIGURED is set in @dev's priv_flags. */
static inline bool netif_is_rxfh_configured(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_RXFH_CONFIGURED);
}

/* Report whether IFF_FAILOVER is set in @dev's priv_flags. */
static inline bool netif_is_failover(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_FAILOVER);
}

/* Report whether IFF_FAILOVER_SLAVE is set in @dev's priv_flags. */
static inline bool netif_is_failover_slave(const struct net_device *dev)
{
        return !!(dev->priv_flags & IFF_FAILOVER_SLAVE);
}

/*
 * This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit():
 * clear both IFF_XMIT_DST_RELEASE and IFF_XMIT_DST_RELEASE_PERM in
 * @dev's priv_flags with a single store.
 */
static inline void netif_keep_dst(struct net_device *dev)
{
        dev->priv_flags = dev->priv_flags &
                          ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}

/*
 * Return true if @dev can't cope with MTU-sized frames that need VLAN tag
 * insertion. The answer is currently taken solely from netif_is_macsec().
 *
 * @dev is only read, so it is taken as a pointer to const, in line with the
 * other predicates in this header. Existing callers that pass a non-const
 * pointer are unaffected.
 */
static inline bool netif_reduces_vlan_mtu(const struct net_device *dev)
{
        /* TODO: reserve and use an additional IFF bit, if we get more users */
        return netif_is_macsec(dev);
}

extern struct pernet_operations __net_initdata loopback_net_ops;

/* Logging, debugging and troubleshooting/diagnostic helpers. */

/* netdev_printk helpers, similar to dev_printk */

/*
 * Return a printable name for @dev: dev->name when it is non-empty and
 * contains no '%' character, otherwise the fixed placeholder string.
 */
static inline const char *netdev_name(const struct net_device *dev)
{
        if (dev->name[0] && !strchr(dev->name, '%'))
                return dev->name;

        return "(unnamed net_device)";
}

/*
 * Map @dev's reg_state (sampled once with READ_ONCE()) to a suffix string
 * for log messages: the empty string for NETREG_REGISTERED, a parenthesised
 * label for the other known states. A value outside the known set fires
 * WARN_ONCE() and yields " (unknown)".
 */
static inline const char *netdev_reg_state(const struct net_device *dev)
{
        const u8 state = READ_ONCE(dev->reg_state);

        if (state == NETREG_UNINITIALIZED)
                return " (uninitialized)";
        if (state == NETREG_REGISTERED)
                return "";
        if (state == NETREG_UNREGISTERING)
                return " (unregistering)";
        if (state == NETREG_UNREGISTERED)
                return " (unregistered)";
        if (state == NETREG_RELEASED)
                return " (released)";
        if (state == NETREG_DUMMY)
                return " (dummy)";

        WARN_ONCE(1, "%s: unknown reg_state %d\n", dev->name, state);
        return " (unknown)";
}

/*
 * Expands to MODULE_ALIAS() with the alias string "netdev-" followed by
 * @device. @device must be a string literal, since it is joined to the
 * prefix by adjacent-literal concatenation.
 */
#define MODULE_ALIAS_NETDEV(device) \
        MODULE_ALIAS("netdev-" device)

/*
 * netdev_WARN() acts like dev_printk(), but with the key difference
 * of using a WARN/WARN_ON to get the message out, including the
 * file/line information and a backtrace.
 *
 * The message is prefixed with "netdevice: <name><state>: ", where <name>
 * comes from netdev_name(dev) and <state> from netdev_reg_state(dev).
 * @format must be a string literal because it is concatenated onto that
 * prefix; @args are the values consumed by @format.
 */
#define netdev_WARN(dev, format, args...)                        \
        WARN(1, "netdevice: %s%s: " format, netdev_name(dev),        \
             netdev_reg_state(dev), ##args)

/*
 * Same message layout as netdev_WARN() ("netdevice: <name><state>: " plus
 * @format), but issued through WARN_ONCE() instead of WARN().
 */
#define netdev_WARN_ONCE(dev, format, args...)                                \
        WARN_ONCE(1, "netdevice: %s%s: " format, netdev_name(dev),        \
                  netdev_reg_state(dev), ##args)

/*
 *        The list of packet types we will receive (as opposed to discard)
 *        and the routines to invoke.
 *
 *        Why 16? Because with 16 the only overlap we get on a hash of the
 *        low nibble of the protocol value is RARP/SNAP/X.25 (all three
 *        values below end in 5).
 *
 *                0800        IP
 *                0001        802.3
 *                0002        AX.25
 *                0004        802.2
 *                8035        RARP
 *                0005        SNAP
 *                0805        X.25
 *                0806        ARP
 *                8137        IPX
 *                0009        Localtalk
 *                86DD        IPv6
 *
 *        PTYPE_HASH_MASK is PTYPE_HASH_SIZE - 1, i.e. 0xf.
 */
#define PTYPE_HASH_SIZE        (16)
#define PTYPE_HASH_MASK        (PTYPE_HASH_SIZE - 1)

/* Array of PTYPE_HASH_SIZE list heads; defined outside this header. */
extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

extern struct net_device *blackhole_netdev;

/*
 * Atomic accessors for the counters in (DEV)->stats. FIELD is token-pasted
 * after "__", so DEV_STATS_INC(dev, foo) operates on dev->stats.__foo.
 *
 *   DEV_STATS_INC(DEV, FIELD)       - atomic_long_inc() on the counter
 *   DEV_STATS_ADD(DEV, FIELD, VAL)  - atomic_long_add() of VAL to the counter
 *   DEV_STATS_READ(DEV, FIELD)      - atomic_long_read() of the counter
 */
/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. */
#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD)
#define DEV_STATS_ADD(DEV, FIELD, VAL)         \
                atomic_long_add((VAL), &(DEV)->stats.__##FIELD)
#define DEV_STATS_READ(DEV, FIELD) atomic_long_read(&(DEV)->stats.__##FIELD)

#endif        /* _LINUX_NETDEVICE_H */



















































































































































































































































































































































































































    3 
    3 















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                ROUTE - implementation of the IP router.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Alan Cox, <gw4pts@gw4pts.ampr.org>
 *                Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *                Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *                Alan Cox        :        Verify area fixes.
 *                Alan Cox        :        cli() protects routing changes
 *                Rui Oliveira        :        ICMP routing table updates
 *                (rco@di.uminho.pt)        Routing table insertion and update
 *                Linus Torvalds        :        Rewrote bits to be sensible
 *                Alan Cox        :        Added BSD route gw semantics
 *                Alan Cox        :        Super /proc >4K
 *                Alan Cox        :        MTU in route table
 *                Alan Cox        :        MSS actually. Also added the window
 *                                        clamper.
 *                Sam Lantinga        :        Fixed route matching in rt_del()
 *                Alan Cox        :        Routing cache support.
 *                Alan Cox        :        Removed compatibility cruft.
 *                Alan Cox        :        RTF_REJECT support.
 *                Alan Cox        :        TCP irtt support.
 *                Jonathan Naylor        :        Added Metric support.
 *        Miquel van Smoorenburg        :        BSD API fixes.
 *        Miquel van Smoorenburg        :        Metrics.
 *                Alan Cox        :        Use __u32 properly
 *                Alan Cox        :        Aligned routing errors more closely with BSD
 *                                        our system is still very different.
 *                Alan Cox        :        Faster /proc handling
 *        Alexey Kuznetsov        :        Massive rework to support tree based routing,
 *                                        routing caches and better behaviour.
 *
 *                Olaf Erb        :        irtt wasn't being copied right.
 *                Bjorn Ekwall        :        Kerneld route support.
 *                Alan Cox        :        Multicast fixed (I hope)
 *                Pavel Krauz        :        Limited broadcast fixed
 *                Mike McLagan        :        Routing by source
 *        Alexey Kuznetsov        :        End of old history. Split to fib.c and
 *                                        route.c and rewritten from scratch.
 *                Andi Kleen        :        Load-limit warning messages.
 *        Vitaly E. Lavrov        :        Transparent proxy revived after year coma.
 *        Vitaly E. Lavrov        :        Race condition in ip_route_input_slow.
 *        Tobias Ringstrom        :        Uninitialized res.type in ip_route_output_slow.
 *        Vladimir V. Ivanov        :        IP rule info (flowid) is really useful.
 *                Marc Boucher        :        routing by fwmark
 *        Robert Olsson                :        Added rt_cache statistics
 *        Arnaldo C. Melo                :        Convert proc stuff to seq_file
 *        Eric Dumazet                :        hashed spinlocks and rt_check_expire() fixes.
 *        Ilia Sotnikov                :        Ignore TOS on PMTUD and Redirect
 *        Ilia Sotnikov                :        Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>

#include "fib_lookup.h"

/*
 * Default values for the routing tunables below. Anything multiplied by HZ
 * is a duration expressed in jiffies (HZ jiffies per second).
 */
#define RT_GC_TIMEOUT (300*HZ)

/* NOTE(review): presumably 512 bytes of payload plus two 20-byte headers - confirm. */
#define DEFAULT_MIN_PMTU (512 + 20 + 20)
#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
#define DEFAULT_MIN_ADVMSS 256
/* No initializer: starts at 0 like any static object. */
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly        = 9;
static int ip_rt_redirect_load __read_mostly        = HZ / 50;
/* Numerically ip_rt_redirect_load << (ip_rt_redirect_number + 1) for the defaults above. */
static int ip_rt_redirect_silence __read_mostly        = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly        = HZ;
static int ip_rt_error_burst __read_mostly        = 5 * HZ;

static int ip_rt_gc_timeout __read_mostly        = RT_GC_TIMEOUT;

/*
 *        Interface to generic destination cache.
 *
 *        Forward declarations of the callbacks that are installed in the
 *        ipv4_dst_ops table further down in this file.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry        *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int         ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int                ipv4_mtu(const struct dst_entry *dst);
static void                ipv4_negative_advice(struct sock *sk,
                                             struct dst_entry *dst);
static void                 ipv4_link_failure(struct sk_buff *skb);
static void                 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void                 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void                ipv4_dst_destroy(struct dst_entry *dst);

/*
 * Installed as the .cow_metrics hook of ipv4_dst_ops. It never hands out a
 * metrics array: every call trips WARN_ON() and returns NULL. Both
 * arguments are ignored.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

/*
 * Destination-cache operations for AF_INET routes. Each hook points at the
 * IPv4 implementation declared above or defined later in this file;
 * .local_out uses __ip_local_out, which is not defined in this file.
 */
static struct dst_ops ipv4_dst_ops = {
        .family =                AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =        ipv4_default_advmss,
        .mtu =                        ipv4_mtu,
        .cow_metrics =                ipv4_cow_metrics,
        .destroy =                ipv4_dst_destroy,
        .negative_advice =        ipv4_negative_advice,
        .link_failure =                ipv4_link_failure,
        .update_pmtu =                ip_rt_update_pmtu,
        .redirect =                ip_do_redirect,
        .local_out =                __ip_local_out,
        .neigh_lookup =                ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)        TC_PRIO_##class

/*
 * Sixteen TC_PRIO_* values, exported for use outside this file.
 * ECN_OR_COST(class) expands to TC_PRIO_<class>, so every odd entry is
 * identical to the even entry right before it.
 * NOTE(review): presumably indexed by the TOS bits of the IP header --
 * confirm against the users of this table.
 */
const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start for the "rt_cache" entry in net->proc_net: position 0
 * yields the header token, every other position ends the sequence.
 */
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        return *pos ? NULL : SEQ_START_TOKEN;
}

/*
 * seq_file ->next for "rt_cache": there are no records after the header,
 * so just step the position and report end of sequence.
 */
static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        *pos += 1;
        return NULL;
}

/*
 * seq_file ->stop for "rt_cache": nothing to undo, since
 * rt_cache_seq_start() takes no lock and no reference.
 */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

/*
 * seq_file ->show for "rt_cache": the only thing ever printed is the
 * column header, left-justified and padded to 127 characters. Any record
 * other than the start token produces no output. Always returns 0.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v != SEQ_START_TOKEN)
                return 0;

        seq_printf(seq, "%-127s\n",
                   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                   "HHUptod\tSpecDst");
        return 0;
}

/*
 * seq_file callbacks behind the "rt_cache" entry that
 * ip_rt_do_proc_init() creates in net->proc_net. With these callbacks the
 * file consists of the header line printed by rt_cache_seq_show() only.
 */
static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

/*
 * seq_file ->start for the per-CPU "rt_cache" statistics file.
 *
 * Position 0 is the header token. Position N > 0 refers to CPU N - 1; if
 * that CPU is not possible, the next possible CPU is used instead and *pos
 * is moved to match. Returns NULL once no possible CPU is left.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (!*pos)
                return SEQ_START_TOKEN;

        cpu = *pos - 1;
        while (cpu < nr_cpu_ids && !cpu_possible(cpu))
                cpu++;

        if (cpu >= nr_cpu_ids)
                return NULL;

        *pos = cpu + 1;
        return &per_cpu(rt_cache_stat, cpu);
}

/*
 * seq_file ->next for the per-CPU statistics file.
 *
 * *pos is one past the CPU shown last (or 0 after the header), so the
 * search for the next possible CPU starts at *pos itself. When the CPUs
 * are exhausted *pos is still advanced before NULL is returned.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu = *pos;

        while (cpu < nr_cpu_ids && !cpu_possible(cpu))
                cpu++;

        if (cpu < nr_cpu_ids) {
                *pos = cpu + 1;
                return &per_cpu(rt_cache_stat, cpu);
        }

        *pos += 1;
        return NULL;
}

/*
 * seq_file ->stop for the per-CPU statistics file: nothing to undo, the
 * start/next callbacks take no lock and no reference.
 */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "entries  in_hit   in_slow_tot in_slow_mc in_no_route in_brd   in_martian_dst in_martian_src out_hit  out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x %08x %08x    %08x   %08x    %08x %08x       "
                        "%08x       %08x %08x     %08x    %08x %08x   "
                        "%08x     %08x        %08x        %08x\n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

/*
 * seq_file callbacks behind the "rt_cache" entry that
 * ip_rt_do_proc_init() creates in net->proc_net_stat: a header line
 * followed by one line per possible CPU.
 */
static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Show callback for the "rt_acct" proc entry.
 *
 * Sums the 256 per-CPU struct ip_rt_acct slots over every possible CPU
 * into a temporary zeroed array and writes that array to the seq_file as
 * raw binary data.
 *
 * Returns 0 on success, -ENOMEM if the temporary array cannot be
 * allocated.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        const unsigned int nr_slots = 256;
        struct ip_rt_acct *total;
        unsigned int cpu;

        total = kcalloc(nr_slots, sizeof(*total), GFP_KERNEL);
        if (!total)
                return -ENOMEM;

        for_each_possible_cpu(cpu) {
                const struct ip_rt_acct *percpu;
                unsigned int slot;

                percpu = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, cpu);
                for (slot = 0; slot < nr_slots; slot++) {
                        total[slot].o_bytes   += percpu[slot].o_bytes;
                        total[slot].o_packets += percpu[slot].o_packets;
                        total[slot].i_bytes   += percpu[slot].i_bytes;
                        total[slot].i_packets += percpu[slot].i_packets;
                }
        }

        seq_write(m, total, nr_slots * sizeof(*total));
        kfree(total);
        return 0;
}
#endif

/*
 * Per-namespace setup of the routing proc entries:
 *   - "rt_cache" in net->proc_net (header-only file),
 *   - "rt_cache" in net->proc_net_stat (per-CPU statistics),
 *   - "rt_acct" in net->proc_net, only with CONFIG_IP_ROUTE_CLASSID.
 *
 * If any entry cannot be created, those already created are removed again
 * and -ENOMEM is returned. Returns 0 on success.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
        if (!proc_create_seq("rt_cache", 0444, net->proc_net,
                             &rt_cache_seq_ops))
                goto fail;

        if (!proc_create_seq("rt_cache", 0444, net->proc_net_stat,
                             &rt_cpu_seq_ops))
                goto undo_cache;

#ifdef CONFIG_IP_ROUTE_CLASSID
        if (!proc_create_single("rt_acct", 0, net->proc_net,
                                rt_acct_proc_show))
                goto undo_stat;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
undo_stat:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
undo_cache:
        remove_proc_entry("rt_cache", net->proc_net);
fail:
        return -ENOMEM;
}

/*
 * Per-namespace teardown: remove the proc entries created by
 * ip_rt_do_proc_init() ("rt_acct" only with CONFIG_IP_ROUTE_CLASSID).
 */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

/*
 * Per-network-namespace hooks that create and remove the routing proc
 * entries; registered by ip_rt_proc_init().
 */
static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

/*
 * CONFIG_PROC_FS variant: register ip_rt_proc_ops as a pernet subsystem
 * and return the result of register_pernet_subsys().
 */
static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

/*
 * Tell whether @rth was created under an older IPv4 route generation.
 *
 * Returns true when the generation id stored in the route differs from
 * the current one of the namespace owning rth->dst.dev. The namespace is
 * looked up inside an RCU read-side section.
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
        const struct net *net;
        bool stale;

        rcu_read_lock();
        net = dev_net_rcu(rth->dst.dev);
        stale = rt_genid_ipv4(net) != rth->rt_genid;
        rcu_read_unlock();

        return stale;
}

/*
 * Bump the IPv4 route generation id of @net. rt_is_expired() compares a
 * route's stored rt_genid against rt_genid_ipv4(), so (assuming the bump
 * changes that value) routes created before the call then read as
 * expired.
 */
void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

/*
 * .neigh_lookup hook of ipv4_dst_ops.
 *
 * Picks the neighbour key from the route:
 *   - IPv4 gateway:  rt_gw4
 *   - IPv6 gateway:  rt_gw6
 *   - otherwise:     the destination address from @skb's IP header when
 *                    @skb is given, else the __be32 that @daddr points to
 *
 * The lookup runs inside an RCU read-side section. A neighbour that is
 * not an error pointer is returned with its refcount raised; if the
 * refcount had already dropped to zero, NULL is returned instead. An
 * error pointer from the lookup is passed through unchanged.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;

        rcu_read_lock();

        if (likely(rt->rt_gw_family == AF_INET)) {
                neigh = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                neigh = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 key;

                if (skb)
                        key = ip_hdr(skb)->daddr;
                else
                        key = *((__be32 *) daddr);
                neigh = ip_neigh_gw4(dev, key);
        }

        if (!IS_ERR(neigh)) {
                if (!refcount_inc_not_zero(&neigh->refcnt))
                        neigh = NULL;
        }

        rcu_read_unlock();

        return neigh;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
/*
 * Reserve @segs consecutive identifiers from the generator selected by
 * @hash and return the first one.
 *
 * Each generator is a counter in ip_idents[] paired with a timestamp in
 * ip_tstamps[]. If the stored timestamp differs from the current jiffies
 * value and this caller wins the cmpxchg() that refreshes it, the counter
 * is additionally advanced by a random amount below the number of jiffies
 * elapsed. Callers that lose the race add no perturbation.
 */
static u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 now = (u32)jiffies;
        u32 slot = hash & ip_idents_mask;
        u32 *stamp = &ip_tstamps[slot];
        atomic_t *counter = &ip_idents[slot];
        u32 last = READ_ONCE(*stamp);
        u32 jitter = 0;

        if (last != now) {
                if (cmpxchg(stamp, last, now) == last)
                        jitter = get_random_u32_below(now - last);
        }

        /* The addition below may wrap. Before reporting a UBSAN splat
         * here, check that the compiler supports -fno-strict-overflow:
         * such a report was a UBSAN bug, fixed in GCC-8.
         */
        return atomic_add_return(segs + jitter, counter) - segs;
}

/*
 * Fill in iph->id for a packet made of @segs segments.
 *
 * The identifier generator is selected by a keyed siphash over the
 * destination address, source address and protocol of @iph. The
 * per-namespace key is filled with random bytes the first time it is
 * found to be all zero.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash;

        /* The lazy key setup below is racy; that is tolerated here. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);

        iph->id = htons(ip_idents_reserve(hash, segs));
}
EXPORT_SYMBOL(__ip_select_ident);

/*
 * Initialise @fl4 as an output flow key for the addresses found in @iph.
 *
 * When @sk is given, its bound device, mark, TOS, scope and protocol
 * replace the @oif, @mark, @tos and @prot arguments (IPPROTO_RAW is used
 * for sockets with HDRINCL set). Without a socket the arguments are used
 * as passed and the scope is RT_SCOPE_UNIVERSE. In both cases the TOS is
 * masked with INET_DSCP_MASK.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk, const struct iphdr *iph,
                             int oif, __u8 tos, u8 prot, u32 mark,
                             int flow_flags)
{
        __u8 scope = RT_SCOPE_UNIVERSE;

        if (sk != NULL) {
                oif = sk->sk_bound_dev_if;
                mark = READ_ONCE(sk->sk_mark);
                tos = ip_sock_rt_tos(sk);
                scope = ip_sock_rt_scope(sk);
                if (inet_test_bit(HDRINCL, sk))
                        prot = IPPROTO_RAW;
                else
                        prot = sk->sk_protocol;
        }

        tos &= INET_DSCP_MASK;
        flowi4_init_output(fl4, oif, mark, tos, scope, prot, flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

/*
 * Build an output flow key from a packet: namespace and output interface
 * come from skb->dev, the mark from the skb, and TOS, protocol and
 * addresses from its IP header. A non-NULL @sk overrides several of
 * these, see __build_flow_key().
 */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);

        __build_flow_key(dev_net(skb->dev), fl4, sk, iph,
                         skb->dev->ifindex, iph->tos, iph->protocol,
                         skb->mark, 0);
}

/*
 * Build an output flow key purely from socket state.
 *
 * The destination is inet_daddr, unless the socket's IP options have srr
 * set, in which case opt.faddr is used instead. The options are read
 * under rcu_read_lock(). The protocol is IPPROTO_RAW for HDRINCL sockets
 * and sk_protocol otherwise.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *opts;
        __be32 dst_addr = inet->inet_daddr;

        rcu_read_lock();

        opts = rcu_dereference(inet->inet_opt);
        if (opts) {
                if (opts->opt.srr)
                        dst_addr = opts->opt.faddr;
        }

        flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
                           ip_sock_rt_tos(sk),
                           ip_sock_rt_scope(sk),
                           inet_test_bit(HDRINCL, sk) ?
                                IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           dst_addr, inet->inet_saddr, 0, 0, sk->sk_uid);

        rcu_read_unlock();
}

/*
 * Build a flow key from the packet when one is available, otherwise from
 * the socket alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (!skb) {
                build_sk_flow_key(fl4, sk);
                return;
        }

        build_skb_flow_key(fl4, skb, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

/* Detach a formerly cached route from its device and drop one reference. */
static void fnhe_put_cached_rt(struct rtable *rt)
{
        dst_dev_put(&rt->dst);
        dst_release(&rt->dst);
}

/*
 * Drop the input and output routes cached in @fnhe, if any. Each pointer
 * is cleared in the exception before the route itself is released.
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *cached;

        cached = rcu_dereference(fnhe->fnhe_rth_input);
        if (cached != NULL) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                fnhe_put_cached_rt(cached);
        }

        cached = rcu_dereference(fnhe->fnhe_rth_output);
        if (cached != NULL) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                fnhe_put_cached_rt(cached);
        }
}

/*
 * Unlink and free the entry with the earliest fnhe_stamp in @hash's chain.
 *
 * The chain is walked with rcu_dereference_protected() under
 * lockdep_is_held(&fnhe_lock), so the caller is expected to hold
 * fnhe_lock. The chain must not be empty: with an empty chain 'oldest'
 * stays NULL and is dereferenced after the loop.
 * NOTE(review): the only caller visible in this file invokes this while
 * the measured chain depth exceeds a positive limit -- confirm no other
 * caller exists.
 */
static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
        struct fib_nh_exception *fnhe, *oldest = NULL;

        /* fnhe_p is the link that points at the current entry, so the
         * winner can later be unlinked by rewriting that link in place.
         */
        for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
                fnhe = rcu_dereference_protected(*fnhe_p,
                                                 lockdep_is_held(&fnhe_lock));
                if (!fnhe)
                        break;
                if (!oldest ||
                    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
                        oldest = fnhe;
                        oldest_p = fnhe_p;
                }
        }
        /* Release its cached routes, unlink it, then free it via kfree_rcu(). */
        fnhe_flush_routes(oldest);
        *oldest_p = oldest->fnhe_next;
        kfree_rcu(oldest, rcu);
}

/*
 * Map @daddr to a bucket index of FNHE_HASH_SHIFT bits. The address is
 * run through siphash with a key that is randomly initialised once, on
 * first use, and the 64-bit result is folded with hash_64().
 */
static u32 fnhe_hashfun(__be32 daddr)
{
        static siphash_aligned_key_t fnhe_hash_key;

        net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));

        return hash_64(siphash_1u32((__force u32)daddr, &fnhe_hash_key),
                       FNHE_HASH_SHIFT);
}

/*
 * Copy the state recorded in exception @fnhe into route @rt: PMTU, MTU
 * lock flag and expiry are always copied. If the exception carries a
 * gateway, the route is also marked as redirected and switched to that
 * IPv4 gateway.
 */
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (!fnhe->fnhe_gw)
                return;

        rt->rt_flags |= RTCF_REDIRECTED;
        rt->rt_uses_gateway = 1;
        rt->rt_gw_family = AF_INET;
        rt->rt_gw4 = fnhe->fnhe_gw;
}

/*
 * Record a next-hop exception for @daddr on nexthop @nhc.
 *
 * @gw:      redirected gateway, 0 to leave an existing gateway untouched
 * @pmtu:    learned path MTU, 0 to leave an existing PMTU untouched
 * @lock:    value for fnhe_mtu_locked, stored together with @pmtu
 * @expires: expiry in jiffies; stored as at least 1
 *
 * Everything happens under fnhe_lock with bottom halves disabled. The
 * per-nexthop hash table is allocated on first use. An existing entry for
 * @daddr is updated in place and its cached input/output routes are
 * refreshed from it. Otherwise a new entry is inserted at the head of the
 * bucket, after trimming the bucket if it has grown too deep, and the
 * routes cached on the nexthop are marked DST_OBSOLETE_KILL.
 *
 * Allocation failures (GFP_ATOMIC) are not reported: the function simply
 * returns without recording anything.
 */
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        /* From here on 'hash' is the bucket for daddr, not the table. */
        hash += hval;

        /* Look for an existing entry; depth counts the entries that did
         * not match (the whole chain length when nothing matches).
         */
        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                /* Randomize max depth to avoid some side channels attacks. */
                int max_depth = FNHE_RECLAIM_DEPTH +
                                get_random_u32_below(FNHE_RECLAIM_DEPTH);

                /* Evict oldest entries until the chain is short enough. */
                while (depth > max_depth) {
                        fnhe_remove_oldest(hash);
                        depth--;
                }

                fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                if (!fnhe)
                        goto out_unlock;

                fnhe->fnhe_next = hash->chain;

                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                /* Publish only after the entry is fully initialised. */
                rcu_assign_pointer(hash->chain, fnhe);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        /* Refresh the stamp that fnhe_remove_oldest() compares. */
        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

/*
 * Process the ICMP redirect carried in @skb for route @rt.
 *
 * @fl4:        flow key used to look up the nexthop the exception is
 *              attached to
 * @kill_route: also mark @rt itself DST_OBSOLETE_KILL once the redirect
 *              is accepted
 *
 * The redirect is dropped silently unless its code is one of the four
 * ICMP_REDIR_* values, @rt currently uses an IPv4 gateway equal to the
 * sender of the redirect (the saddr of the IP header), and the receiving
 * device has an in_device. It is rejected, and optionally logged, when
 * the advertised gateway equals the current one, is multicast, limited
 * broadcast or zeronet, when the device does not accept redirects, or
 * when the on-link / address-type checks fail.
 *
 * For an accepted redirect the neighbour entry of the new gateway is
 * looked up or created. If it is not yet NUD_VALID, only neighbour
 * resolution is kicked off. If it is valid, a next-hop exception pointing
 * at the new gateway is recorded with a lifetime of ip_rt_gc_timeout and
 * NETEVENT_NEIGH_UPDATE is raised.
 *
 * NOTE(review): uses __in_dev_get_rcu(), so callers presumably run inside
 * an RCU read-side section -- confirm.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        /* Only the gateway currently in use may redirect us. */
        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
                        /* New gateway not resolved yet: start resolution
                         * and record nothing for now.
                         */
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        /* Empty statement: the label needs one when the block above is
         * compiled out.
         */
        ;
}

/*
 * .redirect hook of ipv4_dst_ops: build the flow key from the IP header
 * at skb->data (overridden by @sk when given) and process the redirect
 * against @dst, asking for @dst to be marked obsolete if the redirect is
 * accepted.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;

        __build_flow_key(dev_net(skb->dev), &fl4, sk, iph,
                         skb->dev->ifindex, iph->tos, iph->protocol,
                         skb->mark, 0);
        __ip_do_redirect(dst_rtable(dst), skb, &fl4, true);
}

/*
 * .negative_advice hook of ipv4_dst_ops: reset the socket's cached route
 * when @dst is obsolete (obsolete > 0), was installed by a redirect, or
 * carries an expiry time. Any other route is left in place.
 */
static void ipv4_negative_advice(struct sock *sk,
                                 struct dst_entry *dst)
{
        struct rtable *rt = dst_rtable(dst);

        if (dst->obsolete <= 0 &&
            !(rt->rt_flags & RTCF_REDIRECTED) &&
            !rt->dst.expires)
                return;

        sk_dst_reset(sk);
}

/*
 * Algorithm:
 *        1. The first ip_rt_redirect_number redirects are sent
 *           with exponential backoff, then we stop sending them at all,
 *           assuming that the host ignores our redirects.
 *        2. If we did not see packets requiring redirects
 *           during ip_rt_redirect_silence, we assume that the host
 *           has forgotten the redirected route and start sending
 *           redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

/*
 * Send an ICMP host redirect to the sender of @skb, rate limited per peer.
 *
 * Nothing is sent when the outgoing device has no in_device or has
 * sending of redirects disabled. If no inet_peer entry can be obtained
 * for the sender, the redirect is sent without any rate limiting.
 * Otherwise the peer's n_redirects / rate_last fields drive the backoff
 * implemented below.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
        if (!peer) {
                /* No per-peer state available: send unthrottled. */
                rcu_read_unlock();
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything and
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_unlock;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect. The required gap doubles with every redirect sent:
         * ip_rt_redirect_load << n_redirects.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
                /* Log once, when the limit has just been reached. */
                if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
        }
out_unlock:
        rcu_read_unlock();
}

/*
 * Handle a packet whose attached route records an error in rt->dst.error.
 *
 * The packet is always freed (with the drop reason collected along the
 * way) and 0 is always returned. Before that:
 *   - If skb->dev is an L3 master device, the device is looked up from
 *     IPCB(skb)->iif; a failed lookup or a device without in_device just
 *     drops the packet.
 *   - If forwarding is disabled on the device, only the matching MIB
 *     counter is bumped; no ICMP error is generated.
 *   - Otherwise EHOSTUNREACH, ENETUNREACH and EACCES are mapped to an ICMP
 *     destination-unreachable code; any other error drops silently.
 *
 * ICMP errors are rate limited per sender with a token bucket kept in the
 * inet_peer entry: tokens grow with elapsed jiffies up to
 * ip_rt_error_burst, and each ICMP costs ip_rt_error_cost. If no peer
 * entry is available the ICMP is sent without limiting.
 *
 * NOTE(review): uses __dev_get_by_index() and __in_dev_get_rcu() before
 * taking rcu_read_lock() itself, so callers presumably already provide
 * the needed protection -- confirm.
 */
static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        SKB_DR(reason);
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                /* Not forwarding: account the drop, send nothing. */
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        SKB_DR_SET(reason, IP_INADDRERRORS);
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        SKB_DR_SET(reason, IP_INNOROUTES);
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                SKB_DR_SET(reason, IP_INNOROUTES);
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        rcu_read_lock();
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex_rcu(skb->dev));
        send = true;
        if (peer) {
                /* Token bucket: refill by elapsed time, cap, then pay. */
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
        }
        rcu_read_unlock();

        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:        kfree_skb_reason(skb, reason);
        return 0;
}

/*
 * Record a newly learned path MTU @mtu for the flow @fl4 routed via @rt.
 *
 * Nothing is done when the route's MTU is locked or when @mtu is larger
 * than the MTU currently in effect. A value below the namespace's
 * ip_rt_min_pmtu is replaced by min(current MTU, ip_rt_min_pmtu) and the
 * resulting exception is marked as locked.
 *
 * The update is skipped when the route already carries this PMTU, no lock
 * is needed, and the current time is still before
 * dst->expires - ip_rt_mtu_expires / 2.
 *
 * Otherwise a FIB lookup finds the nexthop(s) and a next-hop exception
 * with lifetime ip_rt_mtu_expires is created or refreshed. With
 * CONFIG_IP_ROUTE_MULTIPATH and more than one path, this is done on every
 * path of the route rather than only on the selected one.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;
        bool lock = false;
        struct net *net;
        u32 old_mtu;

        if (ip_mtu_locked(dst))
                return;

        /* Only ever lower the MTU from here. */
        old_mtu = ipv4_mtu(dst);
        if (old_mtu < mtu)
                return;

        rcu_read_lock();
        net = dev_net_rcu(dst->dev);
        if (mtu < net->ipv4.ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2))
                goto out;

        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                if (fib_info_num_path(res.fi) > 1) {
                        int nhsel;

                        /* Multipath: record the exception on every path. */
                        for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
                                nhc = fib_info_nhc(res.fi, nhsel);
                                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                                      jiffies + net->ipv4.ip_rt_mtu_expires);
                        }
                        goto out;
                }
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + net->ipv4.ip_rt_mtu_expires);
        }
out:
        rcu_read_unlock();
}

/*
 * .update_pmtu hook of ipv4_dst_ops: build the flow key from @skb (or from
 * @sk when no packet is given) and record @mtu for @dst.
 *
 * @confirm_neigh is not used by this implementation.
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);

        /* A packet that came in on a bridge port must not pin the lookup
         * to that port, or bridged encapsulations would fail to resolve.
         */
        if (skb != NULL) {
                if (netif_is_any_bridge_port(skb->dev))
                        fl4.flowi4_oif = 0;
        }

        __ip_rt_update_pmtu(dst_rtable(dst), &fl4, mtu);
}

/*
 * Record path MTU @mtu for the flow described by the IP header found at
 * skb->data, without involving a socket.
 *
 * The flow key uses @oif and @protocol as given, plus the mark computed
 * by IP4_REPLY_MARK() from skb->mark. The route is looked up, updated and
 * released again. A failed lookup is ignored.
 */
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol,
                         IP4_REPLY_MARK(net, skb->mark), 0);

        rt = __ip_route_output_key(net, &fl4);
        if (IS_ERR(rt))
                return;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
        ip_rt_put(rt);
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

/*
 * Socket flavour of the PMTU update that performs a fresh output route
 * lookup: the flow key is built from @sk and the IPv4 header at skb->data,
 * and @mtu is handed to __ip_rt_update_pmtu() if the lookup succeeds.
 */
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct net *net = sock_net(sk);
        struct rtable *route;
        struct flowi4 key;

        __build_flow_key(net, &key, sk, iph, 0, 0, 0, 0, 0);

        /* No mark in the key yet: fall back to the reply mark of the skb. */
        if (!key.flowi4_mark)
                key.flowi4_mark = IP4_REPLY_MARK(net, skb->mark);

        route = __ip_route_output_key(net, &key);
        if (IS_ERR(route))
                return;

        __ip_rt_update_pmtu(route, &key, mtu);
        ip_rt_put(route);
}

/*
 * Apply a path MTU update for socket @sk, using the IPv4 header found at
 * skb->data to build the flow key.
 *
 * Runs with the socket bh-locked. If the socket does not accept PMTU
 * updates nothing is done. If the socket is owned by user context or has
 * no cached dst, the update is delegated to __ipv4_sk_update_pmtu().
 * Otherwise the socket's cached route is updated, re-looked-up when it is
 * no longer valid, and a newly obtained route is installed on the socket.
 */
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        /* true once rt comes from a fresh lookup instead of being odst */
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        /* Reference on the socket's cached dst; released at out:. */
        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = dst_rtable(odst);
        /* Cached dst failed its validity check: get a fresh route. */
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        /* The update is applied to the path dst returned by xfrm_dst_path(). */
        __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);

        /* If rt is no longer valid after the update, look it up again,
         * dropping the reference of a route obtained above first.
         */
        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        /* Only a freshly looked-up route is installed on the socket. */
        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        /* odst is still NULL when we left before sk_dst_get();
         * NOTE(review): relies on dst_release() accepting NULL - confirm.
         */
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

/*
 * Look up the output route for the IPv4 header at skb->data in @net and
 * run __ip_do_redirect() on it. Nothing is done if the lookup fails.
 */
void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct rtable *route;
        struct flowi4 key;

        __build_flow_key(net, &key, NULL, iph, oif, iph->tos, protocol, 0, 0);

        route = __ip_route_output_key(net, &key);
        if (IS_ERR(route))
                return;

        __ip_do_redirect(route, skb, &key, false);
        ip_rt_put(route);
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

/*
 * Validate @dst: returns @dst while it is still usable, NULL otherwise.
 * @cookie is not referenced.
 */
INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
                                                         u32 cookie)
{
        struct rtable *rt = dst_rtable(dst);

        /* Every IPv4 dst is created with ->obsolete set to
         * DST_OBSOLETE_FORCE_CHK, which forces validation calls down
         * into this function every time.
         *
         * A route invalidated by a PMTU/redirect information update has
         * ->obsolete set to DST_OBSOLETE_KILL or DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK)
                return NULL;

        return rt_is_expired(rt) ? NULL : dst;
}
EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);

/*
 * Send an ICMP destination-unreachable / host-unreachable message for
 * @skb, after re-validating the IPv4 header and recompiling its options.
 */
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        struct net_device *dev;
        unsigned int hdr_len;
        int err;

        /* IPCB may not be valid anymore, so the IP options are recompiled
         * below. First make sure a sane IPv4 header is present.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)))
                return;
        if (ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));

        hdr_len = ip_hdr(skb)->ihl * 4;
        if (hdr_len > sizeof(struct iphdr)) {
                /* Header carries options: pull them in and compile them. */
                if (!pskb_network_may_pull(skb, hdr_len))
                        return;
                opt.optlen = hdr_len - sizeof(struct iphdr);

                rcu_read_lock();
                dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
                err = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (err)
                        return;
        }

        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

/*
 * Link failure handling for @skb: emit the destination-unreachable ICMP
 * and, if the skb has a route attached, set that route's expiry to 0.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *route;

        ipv4_send_dest_unreach(skb);

        route = skb_rtable(skb);
        if (!route)
                return;

        dst_set_expires(&route->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by the IP RR, TS and SRR options,
 * so it is off the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (!rt_is_output_route(rt)) {
                /* Input route: pick the source via a FIB lookup on the
                 * packet's flow, falling back to an address selected on
                 * the route's device when the lookup fails.
                 */
                struct iphdr *iph = ip_hdr(skb);
                struct fib_result res;
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) != 0)
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                else
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                rcu_read_unlock();
        } else {
                /* Output route: the header already carries our source. */
                src = ip_hdr(skb)->saddr;
        }

        /* memcpy because @addr may be unaligned. */
        memcpy(addr, &src, sizeof(src));
}

#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid, one 16-bit half at a time: a half
 * of tclassid that is already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss;
        struct net *net;

        rcu_read_lock();
        net = dev_net_rcu(dst->dev);
        advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                   net->ipv4.ip_rt_min_advmss);
        rcu_read_unlock();

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        return ip_dst_mtu_maybe_forward(dst, false);
}
EXPORT_INDIRECT_CALLABLE(ipv4_mtu);

/*
 * Remove the nexthop exception for @daddr, if one exists, from the
 * exception hash table of @nhc. The lookup and unlink are done under
 * fnhe_lock; the entry itself is freed via kfree_rcu().
 */
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        /* Move to the bucket selected by the hash of @daddr.
         * NOTE(review): no NULL check on the table here - presumably the
         * caller guarantees it exists; confirm.
         */
        hash += hval;

        /* fnhe_p always points at the link that references fnhe, so the
         * entry can be unlinked by rewriting *fnhe_p.
         */
        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        /* Unlink: make the previous link skip this entry. */
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

/*
 * Find the nexthop exception for @daddr in @nhc's exception table.
 *
 * Returns the matching entry, or NULL when there is no table, no match,
 * or the match has expired (in which case it is also deleted).
 */
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *table = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *entry;
        u32 bucket;

        if (!table)
                return NULL;

        bucket = fnhe_hashfun(daddr);

        entry = rcu_dereference(table[bucket].chain);
        while (entry) {
                if (entry->fnhe_daddr != daddr) {
                        entry = rcu_dereference(entry->fnhe_next);
                        continue;
                }

                /* A non-zero expiry that lies in the past: drop the entry. */
                if (entry->fnhe_expires &&
                    time_after(jiffies, entry->fnhe_expires)) {
                        ip_del_fnhe(nhc, daddr);
                        return NULL;
                }

                return entry;
        }

        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        struct fib_nh_exception *fnhe;
        u32 mtu = 0;

        /* Route MTU is taken when ip_fwd_use_pmtu is set or the MTU metric
         * is locked on the route.
         */
        if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
            (fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)))
                mtu = fi->fib_mtu;

        /* Otherwise try a not-yet-expired nexthop exception. */
        if (likely(!mtu)) {
                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        /* Last resort: the egress device MTU, capped at IP_MAX_MTU. */
        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        /* Leave room for the lightweight tunnel headroom, if any. */
        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

/*
 * Bind route @rt to nexthop exception @fnhe under fnhe_lock.
 *
 * If @fnhe still belongs to @daddr, the exception data is copied into @rt
 * via fill_route_from_fnhe() and, when @do_cache is set, @rt replaces the
 * route cached in the exception's input or output slot. If @fnhe no longer
 * matches @daddr nothing is done.
 *
 * Returns true only when @rt was stored in the exception.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                /* Select the cache slot matching the route direction. */
                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                /* Generation id changed: reset the exception's learned
                 * state and flush its cached routes. orig is cleared so it
                 * is not released a second time below.
                 */
                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                /* No gateway set on the route: use the destination itself. */
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        /* The slot holds its own reference on rt. */
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

/*
 * Try to install @rt as the cached route of nexthop @nhc: in the input
 * slot for input routes, otherwise in the current CPU's output slot.
 *
 * Returns true when @rt was installed. Returns false when the slot changed
 * between the read and the cmpxchg(); the reference taken for the slot is
 * dropped again in that case.
 */
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                /* Swap succeeded: the displaced route moves to the uncached
                 * list and the slot's reference on it is released.
                 */
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                /* Lost the race: undo the reference taken above. */
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}

/* A list of routes that are not held in any route cache slot. */
struct uncached_list {
        /* taken (bh-disabling) around every modification of head */
        spinlock_t                lock;
        /* routes are linked here through dst.rt_uncached */
        struct list_head        head;
};

/* One uncached list per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

/*
 * Append @rt to the current CPU's uncached list and remember that list in
 * the route so it can later be removed from the right one.
 */
void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *list = raw_cpu_ptr(&rt_uncached_list);

        rt->dst.rt_uncached_list = list;

        spin_lock_bh(&list->lock);
        list_add_tail(&rt->dst.rt_uncached, &list->head);
        spin_unlock_bh(&list->lock);
}

/*
 * Unlink @rt from the uncached list it was added to, if it is on one.
 */
void rt_del_uncached_list(struct rtable *rt)
{
        struct uncached_list *list;

        if (list_empty(&rt->dst.rt_uncached))
                return;

        list = rt->dst.rt_uncached_list;

        spin_lock_bh(&list->lock);
        list_del_init(&rt->dst.rt_uncached);
        spin_unlock_bh(&list->lock);
}

/* Destroy hook for IPv4 dsts: drop the metrics and leave the uncached list. */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *route = dst_rtable(dst);

        ip_dst_metrics_put(dst);
        rt_del_uncached_list(route);
}

/*
 * Detach every uncached route from @dev: on each possible CPU's uncached
 * list, routes whose device is @dev are re-pointed at blackhole_netdev
 * (moving the device reference along) and removed from the list.
 */
void rt_flush_dev(struct net_device *dev)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *list = &per_cpu(rt_uncached_list, cpu);
                struct rtable *route, *next;

                /* Unlocked peek to skip empty lists cheaply. */
                if (list_empty(&list->head))
                        continue;

                spin_lock_bh(&list->lock);
                list_for_each_entry_safe(route, next, &list->head,
                                         dst.rt_uncached) {
                        if (route->dst.dev == dev) {
                                route->dst.dev = blackhole_netdev;
                                netdev_ref_replace(dev, blackhole_netdev,
                                                   &route->dst.dev_tracker,
                                                   GFP_ATOMIC);
                                list_del_init(&route->dst.rt_uncached);
                        }
                }
                spin_unlock_bh(&list->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return        rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

/*
 * Fill in the nexthop-related state of a freshly allocated route @rt.
 *
 * With a fib_info (@fi) the gateway, metrics, classid and lwtunnel state
 * are taken from the nexthop of @res, and the route is then either bound
 * to exception @fnhe or, when @do_cache is set, stored in the nexthop's
 * route cache. A route that ends up in neither place is put on the
 * uncached list, as is any route created without a fib_info.
 *
 * @type is not referenced in this function body.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag,
                           const bool do_cache)
{
        bool cached = false;

        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                /* Nexthop has a link-scope gateway: record it on the route. */
                if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
                        rt->rt_uses_gateway = 1;
                        rt->rt_gw_family = nhc->nhc_gw_family;
                        /* only INET and INET6 are supported */
                        if (likely(nhc->nhc_gw_family == AF_INET))
                                rt->rt_gw4 = nhc->nhc_gw.ipv4;
                        else
                                rt->rt_gw6 = nhc->nhc_gw.ipv6;
                }

                ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
                /* The classid lives in struct fib_nh, i.e. IPv4 nexthops only. */
                if (nhc->nhc_family == AF_INET) {
                        struct fib_nh *nh;

                        nh = container_of(nhc, struct fib_nh, nh_common);
                        rt->dst.tclassid = nh->nh_tclassid;
                }
#endif
                rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
                else if (do_cache)
                        cached = rt_cache_route(nhc, rt);
                if (unlikely(!cached)) {
                        /* The route could not be (or was not meant to be)
                         * stored in a nexthop exception or in the FIB
                         * nexthop cache: make sure it carries a gateway
                         * address and track it on the uncached list.
                         */
                        if (!rt->rt_gw4) {
                                rt->rt_gw_family = AF_INET;
                                rt->rt_gw4 = daddr;
                        }
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

/*
 * Allocate an IPv4 route on @dev and initialise its rtable fields.
 *
 * The dst is created with DST_OBSOLETE_FORCE_CHK, and with DST_NOXFRM when
 * @noxfrm is set. Output goes through ip_output(); the input handler is
 * set to ip_local_deliver() only for RTCF_LOCAL routes.
 *
 * Returns the new route, or NULL if the allocation failed.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
                            unsigned int flags, u16 type,
                            bool noxfrm)
{
        struct rtable *route;

        route = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
                          (noxfrm ? DST_NOXFRM : 0));
        if (!route)
                return NULL;

        route->rt_genid = rt_genid_ipv4(dev_net(dev));
        route->rt_flags = flags;
        route->rt_type = type;
        route->rt_is_input = 0;
        route->rt_iif = 0;
        route->rt_pmtu = 0;
        route->rt_mtu_locked = 0;
        route->rt_uses_gateway = 0;
        route->rt_gw_family = 0;
        route->rt_gw4 = 0;

        route->dst.output = ip_output;
        if (flags & RTCF_LOCAL)
                route->dst.input = ip_local_deliver;

        return route;
}
EXPORT_SYMBOL(rt_dst_alloc);

struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
        struct rtable *new_rt;

        new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
                           rt->dst.flags);

        if (new_rt) {
                new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
                new_rt->rt_flags = rt->rt_flags;
                new_rt->rt_type = rt->rt_type;
                new_rt->rt_is_input = rt->rt_is_input;
                new_rt->rt_iif = rt->rt_iif;
                new_rt->rt_pmtu = rt->rt_pmtu;
                new_rt->rt_mtu_locked = rt->rt_mtu_locked;
                new_rt->rt_gw_family = rt->rt_gw_family;
                if (rt->rt_gw_family == AF_INET)
                        new_rt->rt_gw4 = rt->rt_gw4;
                else if (rt->rt_gw_family == AF_INET6)
                        new_rt->rt_gw6 = rt->rt_gw6;

                new_rt->dst.input = rt->dst.input;
                new_rt->dst.output = rt->dst.output;
                new_rt->dst.error = rt->dst.error;
                new_rt->dst.lastuse = jiffies;
                new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
        }
        return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/*
 * Sanity-check the source address of a multicast packet.
 * Returns SKB_NOT_DROPPED_YET when the source is acceptable, otherwise the
 * drop reason.
 *
 * called in rcu_read_lock() section
 */
enum skb_drop_reason
ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                      dscp_t dscp, struct net_device *dev,
                      struct in_device *in_dev, u32 *itag)
{
        enum skb_drop_reason reason;

        /* Primary sanity checks. */
        if (!in_dev)
                return SKB_DROP_REASON_NOT_SPECIFIED;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                return SKB_DROP_REASON_IP_INVALID_SOURCE;

        if (skb->protocol != htons(ETH_P_IP))
                return SKB_DROP_REASON_INVALID_PROTO;

        if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
                return SKB_DROP_REASON_IP_LOCALNET;

        /* Any non-zeronet source goes through FIB source validation. */
        if (!ipv4_is_zeronet(saddr)) {
                reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
                                                    dev, in_dev, itag);
                if (reason)
                        return reason;
                return SKB_NOT_DROPPED_YET;
        }

        /* A zeronet source is accepted only for local multicast
         * destinations or IGMP.
         */
        if (ipv4_is_local_multicast(daddr) ||
            ip_hdr(skb)->protocol == IPPROTO_IGMP)
                return SKB_NOT_DROPPED_YET;

        return SKB_DROP_REASON_IP_INVALID_SOURCE;
}

/*
 * Build and attach an input route for a multicast packet received on @dev.
 * @our selects whether the route is also flagged RTCF_LOCAL.
 * Returns SKB_NOT_DROPPED_YET on success, otherwise the drop reason.
 *
 * called in rcu_read_lock() section
 */
static enum skb_drop_reason
ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                  dscp_t dscp, struct net_device *dev, int our)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        unsigned int rt_flags = RTCF_MULTICAST;
        enum skb_drop_reason reason;
        struct rtable *route;
        u32 itag = 0;

        reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev,
                                       &itag);
        if (reason)
                return reason;

        if (our)
                rt_flags |= RTCF_LOCAL;

        if (IN_DEV_ORCONF(in_dev, NOPOLICY))
                IPCB(skb)->flags |= IPSKB_NOPOLICY;

        /* The route is allocated on the namespace's loopback device. */
        route = rt_dst_alloc(dev_net(dev)->loopback_dev, rt_flags,
                             RTN_MULTICAST, false);
        if (!route)
                return SKB_DROP_REASON_NOMEM;

#ifdef CONFIG_IP_ROUTE_CLASSID
        route->dst.tclassid = itag;
#endif
        /* An input route must never be used for output. */
        route->dst.output = ip_rt_bug;
        route->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                route->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        /* Replace whatever dst the skb carried with the new route. */
        skb_dst_drop(skb);
        skb_dst_set(skb, &route->dst);

        return SKB_NOT_DROPPED_YET;
}


/*
 * Account a packet with a martian source address and, when verbose route
 * logging is built in and enabled for @in_dev, log it (rate limited)
 * together with the link-layer header if one is available.
 */
static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (!IN_DEV_LOG_MARTIANS(in_dev) || !net_ratelimit())
                return;

        /* RFC1812 recommendation: if the source is martian, the only
         * hint is the MAC header.
         */
        pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                &daddr, &saddr, dev->name);

        if (!dev->hard_header_len || !skb_mac_header_was_set(skb))
                return;

        print_hex_dump(KERN_WARNING, "ll header: ",
                       DUMP_PREFIX_OFFSET, 16, 1,
                       skb_mac_header(skb),
                       dev->hard_header_len, false);
#endif
}

/*
 * Create (or reuse from cache) the forwarding input route for @skb using
 * the nexthop selected in @res, and attach it to the skb.
 *
 * Returns SKB_NOT_DROPPED_YET on success, otherwise the drop reason.
 *
 * called in rcu_read_lock() section
 */
static enum skb_drop_reason
__mkroute_input(struct sk_buff *skb, const struct fib_result *res,
                struct in_device *in_dev, __be32 daddr,
                __be32 saddr, dscp_t dscp)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct fib_nh_common *nhc = FIB_RES_NHC(*res);
        struct net_device *dev = nhc->nhc_dev;
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        bool do_cache;
        u32 itag = 0;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(dev);
        if (!out_dev) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return reason;
        }

        /* A negative result is the negated drop reason. */
        err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                reason = -err;
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        /* Only routes with a fib_info and no input tag are cacheable. */
        do_cache = res->fi && !itag;
        /* Packet would leave through the interface it arrived on: flag it
         * for an ICMP redirect when redirects are enabled and the medium is
         * shared or the gateway is on-link for the sender.
         */
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            skb->protocol == htons(ETH_P_IP)) {
                __be32 gw;

                gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
                if (IN_DEV_SHARED_MEDIA(out_dev) ||
                    inet_addr_onlink(out_dev, saddr, gw))
                        IPCB(skb)->flags |= IPSKB_DOREDIRECT;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back to the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE;
                        goto cleanup;
                }
        }

        if (IN_DEV_ORCONF(in_dev, NOPOLICY))
                IPCB(skb)->flags |= IPSKB_NOPOLICY;

        /* Fast path: reuse a still-valid cached input route, taken from the
         * nexthop exception when one exists, else from the nexthop itself.
         */
        fnhe = find_exception(nhc, daddr);
        if (do_cache) {
                if (fnhe)
                        rth = rcu_dereference(fnhe->fnhe_rth_input);
                else
                        rth = rcu_dereference(nhc->nhc_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        /* Slow path: allocate a new route on the output device. */
        rth = rt_dst_alloc(out_dev->dev, 0, res->type,
                           IN_DEV_ORCONF(out_dev, NOXFRM));
        if (!rth) {
                reason = SKB_DROP_REASON_NOMEM;
                goto cleanup;
        }

        rth->rt_is_input = 1;
        RT_CACHE_STAT_INC(in_slow_tot);

        rth->dst.input = ip_forward;

        rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
                       do_cache);
        lwtunnel_set_redirect(&rth->dst);
        skb_dst_set(skb, &rth->dst);
out:
        reason = SKB_NOT_DROPPED_YET;
cleanup:
        return reason;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 *
 * The outer header's addresses are used unless the packet is an
 * unfragmented-offset ICMP error whose embedded IPv4 header can be read,
 * in which case the embedded header's addresses are used instead.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
                                 struct flow_keys *hash_keys)
{
        const struct iphdr *outer_iph = ip_hdr(skb);
        const struct iphdr *key_iph = outer_iph;
        const struct iphdr *inner_iph;
        const struct icmphdr *icmph;
        struct iphdr _inner_iph;
        struct icmphdr _icmph;

        if (unlikely(outer_iph->protocol == IPPROTO_ICMP) &&
            likely((outer_iph->frag_off & htons(IP_OFFSET)) == 0)) {
                icmph = skb_header_pointer(skb, outer_iph->ihl * 4,
                                           sizeof(_icmph), &_icmph);
                if (icmph && icmp_is_err(icmph->type)) {
                        /* The offending packet's header follows the ICMP
                         * header.
                         */
                        inner_iph = skb_header_pointer(skb,
                                                       outer_iph->ihl * 4 + sizeof(_icmph),
                                                       sizeof(_inner_iph),
                                                       &_inner_iph);
                        if (inner_iph)
                                key_iph = inner_iph;
                }
        }

        hash_keys->addrs.v4addrs.src = key_iph->saddr;
        hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/*
 * Hash the outer-header fields of @skb selected by the
 * fib_multipath_hash_fields sysctl.
 *
 * Returns 0 without touching *@p_has_inner when no outer field is
 * selected. Otherwise *@p_has_inner reports whether the dissector saw an
 * encapsulation.
 */
static u32 fib_multipath_custom_hash_outer(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool *p_has_inner)
{
        u32 fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
        struct flow_keys dissected, selected;

        if (!(fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
                return 0;

        memset(&selected, 0, sizeof(selected));
        skb_flow_dissect_flow_keys(skb, &dissected,
                                   FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        /* Copy over only the fields the administrator asked for. */
        selected.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                selected.addrs.v4addrs.src = dissected.addrs.v4addrs.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                selected.addrs.v4addrs.dst = dissected.addrs.v4addrs.dst;
        if (fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                selected.basic.ip_proto = dissected.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                selected.ports.src = dissected.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                selected.ports.dst = dissected.ports.dst;

        *p_has_inner = !!(dissected.control.flags & FLOW_DIS_ENCAPSULATION);

        return fib_multipath_hash_from_keys(net, &selected);
}

/*
 * Hash the inner-header fields of @skb selected by the
 * fib_multipath_hash_fields sysctl.
 *
 * Returns 0 when @has_inner is false, when no inner field is selected, or
 * when the full dissection finds no encapsulation.
 */
static u32 fib_multipath_custom_hash_inner(const struct net *net,
                                           const struct sk_buff *skb,
                                           bool has_inner)
{
        u32 fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
        struct flow_keys dissected, selected;

        /* We assume the packet carries an encapsulation, but if none was
         * encountered during dissection of the outer flow, then there is no
         * point in calling the flow dissector again.
         */
        if (!has_inner || !(fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
                return 0;

        memset(&selected, 0, sizeof(selected));
        skb_flow_dissect_flow_keys(skb, &dissected, 0);

        if (!(dissected.control.flags & FLOW_DIS_ENCAPSULATION))
                return 0;

        /* Inner addresses may be IPv4 or IPv6; other types contribute no
         * address fields.
         */
        switch (dissected.control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                selected.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        selected.addrs.v4addrs.src = dissected.addrs.v4addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        selected.addrs.v4addrs.dst = dissected.addrs.v4addrs.dst;
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                selected.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
                        selected.addrs.v6addrs.src = dissected.addrs.v6addrs.src;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
                        selected.addrs.v6addrs.dst = dissected.addrs.v6addrs.dst;
                if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
                        selected.tags.flow_label = dissected.tags.flow_label;
                break;
        default:
                break;
        }

        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
                selected.basic.ip_proto = dissected.basic.ip_proto;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
                selected.ports.src = dissected.ports.src;
        if (fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
                selected.ports.dst = dissected.ports.dst;

        return fib_multipath_hash_from_keys(net, &selected);
}

/*
 * Custom multipath hash for a packet: combine the outer-field hash and the
 * inner-field hash of @skb with jhash_2words().
 */
static u32 fib_multipath_custom_hash_skb(const struct net *net,
                                         const struct sk_buff *skb)
{
        /* Starts out true so the inner hash is still attempted when the
         * outer step returns early without writing the flag.
         */
        bool encap_seen = true;
        u32 outer_hash, inner_hash;

        outer_hash = fib_multipath_custom_hash_outer(net, skb, &encap_seen);
        inner_hash = fib_multipath_custom_hash_inner(net, skb, encap_seen);

        return jhash_2words(outer_hash, inner_hash, 0);
}

static u32 fib_multipath_custom_hash_fl4(const struct net *net,
                                         const struct flowi4 *fl4)
{
        u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
        struct flow_keys hash_keys;

        if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
                return 0;

        memset(&hash_keys, 0, sizeof(hash_keys));
        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
                hash_keys.addrs.v4addrs.src = fl4->saddr;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
                hash_keys.addrs.v4addrs.dst = fl4->daddr;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
                hash_keys.basic.ip_proto = fl4->flowi4_proto;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
                hash_keys.ports.src = fl4->fl4_sport;
        if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
                hash_keys.ports.dst = fl4->fl4_dport;

        return fib_multipath_hash_from_keys(net, &hash_keys);
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
                       const struct sk_buff *skb, struct flow_keys *flkeys)
{
        u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
        struct flow_keys hash_keys;
        u32 mhash = 0;

        switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
        case 0:
                memset(&hash_keys, 0, sizeof(hash_keys));
                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                if (skb) {
                        ip_multipath_l3_keys(skb, &hash_keys);
                } else {
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 1:
                /* skb is currently provided only when forwarding */
                if (skb) {
                        unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
                        struct flow_keys keys;

                        /* short-circuit if we already have L4 hash present */
                        if (skb->l4_hash)
                                return skb_get_hash_raw(skb) >> 1;

                        memset(&hash_keys, 0, sizeof(hash_keys));

                        if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }

                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
                        hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
                        hash_keys.ports.src = flkeys->ports.src;
                        hash_keys.ports.dst = flkeys->ports.dst;
                        hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
                } else {
                        memset(&hash_keys, 0, sizeof(hash_keys));
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                        hash_keys.ports.src = fl4->fl4_sport;
                        hash_keys.ports.dst = fl4->fl4_dport;
                        hash_keys.basic.ip_proto = fl4->flowi4_proto;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 2:
                memset(&hash_keys, 0, sizeof(hash_keys));
                /* skb is currently provided only when forwarding */
                if (skb) {
                        struct flow_keys keys;

                        skb_flow_dissect_flow_keys(skb, &keys, 0);
                        /* Inner can be v4 or v6 */
                        if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
                                hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
                        } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                                hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
                                hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
                                hash_keys.tags.flow_label = keys.tags.flow_label;
                                hash_keys.basic.ip_proto = keys.basic.ip_proto;
                        } else {
                                /* Same as case 0 */
                                hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                                ip_multipath_l3_keys(skb, &hash_keys);
                        }
                } else {
                        /* Same as case 0 */
                        hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                        hash_keys.addrs.v4addrs.src = fl4->saddr;
                        hash_keys.addrs.v4addrs.dst = fl4->daddr;
                }
                mhash = fib_multipath_hash_from_keys(net, &hash_keys);
                break;
        case 3:
                if (skb)
                        mhash = fib_multipath_custom_hash_skb(net, skb);
                else
                        mhash = fib_multipath_custom_hash_fl4(net, fl4);
                break;
        }

        if (multipath_hash)
                mhash = jhash_2words(mhash, multipath_hash, 0);

        return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Build the input route for @skb from the FIB result @res.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a result that has more than one path
 * first gets one path selected from the multipath hash of the packet
 * (@hkeys are passed through to the hash), and the skb is flagged with
 * IPSKB_MULTIPATH. The route itself is then created by __mkroute_input(),
 * whose drop reason is returned.
 */
static enum skb_drop_reason
ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
                 struct in_device *in_dev, __be32 daddr,
                 __be32 saddr, dscp_t dscp, struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (res->fi && fib_info_num_path(res->fi) > 1) {
                int mp_hash;

                mp_hash = fib_multipath_hash(res->fi->fib_net, NULL, skb,
                                             hkeys);
                fib_select_multipath(res, mp_hash);
                IPCB(skb)->flags |= IPSKB_MULTIPATH;
        }
#endif

        /* create a routing cache entry */
        return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp);
}

/* Route @skb by reusing the dst attached to @hint instead of performing a
 * route lookup.
 *
 * Implements all the saddr-related checks as ip_route_input_slow(),
 * assuming daddr is valid and the destination is not a local broadcast one.
 *
 * Returns SKB_NOT_DROPPED_YET once the hint's dst has been copied to @skb,
 * or a drop reason when the device has no in_device or the source address
 * is rejected.
 */
enum skb_drop_reason
ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                  dscp_t dscp, struct net_device *dev,
                  const struct sk_buff *hint)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct rtable *hint_rt = skb_rtable(hint);
        struct net *net = dev_net(dev);
        u32 itag = 0;

        if (!in_dev)
                return reason;

        /* multicast, limited broadcast and zeronet sources are all invalid */
        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            ipv4_is_zeronet(saddr)) {
                reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
                goto martian_source;
        }

        if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
                reason = SKB_DROP_REASON_IP_LOCALNET;
                goto martian_source;
        }

        /* the source is validated against the FIB only for RTN_LOCAL hints */
        if (hint_rt->rt_type == RTN_LOCAL) {
                reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0,
                                                    dev, in_dev, &itag);
                if (reason)
                        goto martian_source;
        }

        skb_dst_copy(skb, hint);
        return SKB_NOT_DROPPED_YET;

martian_source:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        return reason;
}

/* Pick the device to hand to dst_alloc for local routes: the L3 master
 * device of the result's nexthop device when there is one, otherwise the
 * namespace loopback device.
 */
static struct net_device *ip_rt_get_dev(struct net *net,
                                        const struct fib_result *res)
{
        struct fib_nh_common *nhc;
        struct net_device *master;

        /* res->nhc is only consulted when the result carries a fib_info */
        if (!res->fi)
                return net->loopback_dev;

        nhc = res->nhc;
        if (!nhc)
                return net->loopback_dev;

        master = l3mdev_master_dev_rcu(nhc->nhc_dev);
        if (master)
                return master;

        return net->loopback_dev;
}

/*
 *        Slow-path input route resolution for a received IPv4 packet.
 *
 *        NOTE. We drop all the packets that have local source
 *        addresses, because every properly looped back packet
 *        must have correct destination already attached by output routine.
 *        Changes in the enforced policies must be applied also to
 *        ip_route_use_hint().
 *
 *        Such approach solves two big problems:
 *        1. Not simplex devices are handled properly.
 *        2. IP spoofing attempts are filtered with 100% of guarantee.
 *
 *        @res is filled in by the FIB lookup and is also overwritten on the
 *        broadcast and no-route paths.
 *
 *        Returns SKB_NOT_DROPPED_YET when a local/broadcast/unreachable dst
 *        has been attached to @skb, the result of ip_mkroute_input() for
 *        packets that go to make_route, or a drop reason otherwise.
 *
 *        Called with rcu_read_lock().
 */

static enum skb_drop_reason
ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                    dscp_t dscp, struct net_device *dev,
                    struct fib_result *res)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flow_keys *flkeys = NULL, _flkeys;
        struct net    *net = dev_net(dev);
        struct ip_tunnel_info *tun_info;
        int                err = -EINVAL;
        unsigned int        flags = 0;
        u32                itag = 0;
        struct rtable        *rth;
        struct flowi4        fl4;
        bool do_cache = true;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the most weird martians, which can be not detected
         * by fib_lookup.
         */

        /* Seed the flow's tunnel id from RX tunnel metadata, if any, before
         * the skb's current dst is dropped.
         */
        tun_info = skb_tunnel_info(skb);
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
        else
                fl4.flowi4_tun_key.tun_id = 0;
        skb_dst_drop(skb);

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
                reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
                goto martian_source;
        }

        /* brd_input and local_input read res->fi, so clear it before any
         * jump that bypasses fib_lookup().
         */
        res->fi = NULL;
        res->table = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only to limited broadcast;
         * I even do not know to fix it or not. Waiting for complains :-)
         */
        if (ipv4_is_zeronet(saddr)) {
                reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
                goto martian_source;
        }

        if (ipv4_is_zeronet(daddr)) {
                reason = SKB_DROP_REASON_IP_INVALID_DEST;
                goto martian_destination;
        }

        /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
         * and call it once if daddr or/and saddr are loopback addresses
         */
        if (ipv4_is_loopback(daddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
                        reason = SKB_DROP_REASON_IP_LOCALNET;
                        goto martian_destination;
                }
        } else if (ipv4_is_loopback(saddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
                        reason = SKB_DROP_REASON_IP_LOCALNET;
                        goto martian_source;
                }
        }

        /*
         *        Now we are ready to route packet.
         */
        fl4.flowi4_l3mdev = 0;
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.flowi4_flags = 0;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        fl4.flowi4_uid = sock_net_uid(net, NULL);
        fl4.flowi4_multipath_hash = 0;

        /* When the early dissect reports success, keep the dissected keys so
         * they can be passed on to ip_mkroute_input(); otherwise zero the L4
         * fields of the flow here.
         */
        if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
                flkeys = &_flkeys;
        } else {
                fl4.flowi4_proto = 0;
                fl4.fl4_sport = 0;
                fl4.fl4_dport = 0;
        }

        err = fib_lookup(net, &fl4, res, 0);
        if (err != 0) {
                /* with forwarding disabled, report host-unreachable instead
                 * of the lookup error
                 */
                if (!IN_DEV_FORWARD(in_dev))
                        err = -EHOSTUNREACH;
                goto no_route;
        }

        if (res->type == RTN_BROADCAST) {
                if (IN_DEV_BFORWARD(in_dev))
                        goto make_route;
                /* not do cache if bc_forwarding is enabled */
                if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
                        do_cache = false;
                goto brd_input;
        }

        err = -EINVAL;
        if (res->type == RTN_LOCAL) {
                reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
                                                    0, dev, in_dev, &itag);
                if (reason)
                        goto martian_source;
                goto local_input;
        }

        if (!IN_DEV_FORWARD(in_dev)) {
                err = -EHOSTUNREACH;
                goto no_route;
        }
        if (res->type != RTN_UNICAST) {
                reason = SKB_DROP_REASON_IP_INVALID_DEST;
                goto martian_destination;
        }

make_route:
        reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
                                  flkeys);

out:
        return reason;

brd_input:
        /* broadcast input is only accepted for ETH_P_IP frames */
        if (skb->protocol != htons(ETH_P_IP)) {
                reason = SKB_DROP_REASON_INVALID_PROTO;
                goto out;
        }

        /* a zeronet source skips source validation on this path */
        if (!ipv4_is_zeronet(saddr)) {
                reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
                                                    dev, in_dev, &itag);
                if (reason)
                        goto martian_source;
        }
        flags |= RTCF_BROADCAST;
        res->type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        if (IN_DEV_ORCONF(in_dev, NOPOLICY))
                IPCB(skb)->flags |= IPSKB_NOPOLICY;

        /* the per-nexthop input cache is usable only with a fib_info and a
         * zero itag
         */
        do_cache &= res->fi && !itag;
        if (do_cache) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                rth = rcu_dereference(nhc->nhc_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        reason = SKB_NOT_DROPPED_YET;
                        goto out;
                }
        }

        rth = rt_dst_alloc(ip_rt_get_dev(net, res),
                           flags | RTCF_LOCAL, res->type, false);
        if (!rth)
                goto e_nobufs;

        rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->rt_is_input = 1;

        RT_CACHE_STAT_INC(in_slow_tot);
        if (res->type == RTN_UNREACHABLE) {
                /* err holds a negative errno here; dst.error stores it
                 * negated
                 */
                rth->dst.input= ip_error;
                rth->dst.error= -err;
                rth->rt_flags        &= ~RTCF_LOCAL;
        }

        if (do_cache) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);

                rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
                if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
                        WARN_ON(rth->dst.input == lwtunnel_input);
                        rth->dst.lwtstate->orig_input = rth->dst.input;
                        rth->dst.input = lwtunnel_input;
                }

                /* if the route could not be stored in the nexthop cache, put
                 * it on the uncached list instead
                 */
                if (unlikely(!rt_cache_route(nhc, rth)))
                        rt_add_uncached_list(rth);
        }
        skb_dst_set(skb, &rth->dst);
        reason = SKB_NOT_DROPPED_YET;
        goto out;

no_route:
        /* turn the failure into an RTN_UNREACHABLE result and build a dst
         * for it via local_input
         */
        RT_CACHE_STAT_INC(in_no_route);
        res->type = RTN_UNREACHABLE;
        res->fi = NULL;
        res->table = NULL;
        goto local_input;

        /*
         *        Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev))
                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
                                     &daddr, &saddr, dev->name);
#endif
        goto out;

e_nobufs:
        reason = SKB_DROP_REASON_NOMEM;
        goto out;

martian_source:
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
}

/* Resolve the input route for @skb: multicast destinations are handled
 * here, everything else goes to ip_route_input_slow().
 *
 * called with rcu_read_lock held
 */
static enum skb_drop_reason
ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                   dscp_t dscp, struct net_device *dev,
                   struct fib_result *res)
{
        enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
        struct in_device *in_dev;
        int accept;
        int our;

        if (!ipv4_is_multicast(daddr))
                return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res);

        /* Multicast recognition logic is moved from route cache to here.
         * The problem was that too many Ethernet cards have broken/missing
         * hardware multicast filters :-( As result the host on multicasting
         * network acquires a lot of useless route cache entries, sort of
         * SDR messages from all the world. Now we try to get rid of them.
         * Really, provided software IP multicast filter is organized
         * reasonably (at least, hashed), it does not result in a slowdown
         * comparing with route cache reject entries.
         * Note, that multicast routers are not affected, because
         * route cache entry is created eventually.
         */
        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return reason;

        our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol);

        /* check l3 master if no match yet */
        if (!our && netif_is_l3_slave(dev)) {
                struct in_device *l3_in_dev = __in_dev_get_rcu(skb->dev);

                if (l3_in_dev)
                        our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
                                              ip_hdr(skb)->protocol);
        }

        /* The packet is routed when the group is ours or, with
         * CONFIG_IP_MROUTE, when multicast forwarding is enabled on the
         * device and the group is not a local multicast one.
         */
        accept = our;
#ifdef CONFIG_IP_MROUTE
        if (!accept && !ipv4_is_local_multicast(daddr) &&
            IN_DEV_MFORWARD(in_dev))
                accept = 1;
#endif
        if (accept)
                reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our);

        return reason;
}

/* Resolve the input route for @skb under rcu_read_lock().
 *
 * Thin wrapper around ip_route_input_rcu() that supplies a scratch
 * fib_result and takes the RCU read lock around the call. Returns the
 * drop reason reported by ip_route_input_rcu().
 */
enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr,
                                          __be32 saddr, dscp_t dscp,
                                          struct net_device *dev)
{
        struct fib_result fib_res;
        enum skb_drop_reason ret;

        rcu_read_lock();
        ret = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &fib_res);
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* Return an output route for flow @fl4 leaving through @dev_out, either a
 * cached one or a newly allocated one.
 *
 * @res:      FIB lookup result (type, fib_info, prefix length, nexthop)
 * @fl4:      flow being routed
 * @orig_oif: stored in the new route's rt_iif; also compared with
 *            dev_out->ifindex to decide whether RTN_LOCAL routes are cached
 * @dev_out:  output device
 * @flags:    initial RTCF_* flags, extended below
 *
 * Returns a cached rtable with a reference taken, a newly allocated rtable,
 * ERR_PTR(-EINVAL) for a missing in_device or a rejected address, or
 * ERR_PTR(-ENOBUFS) when allocation fails.
 *
 * called with rcu_read_lock()
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
                                       const struct flowi4 *fl4, int orig_oif,
                                       struct net_device *dev_out,
                                       unsigned int flags)
{
        struct fib_info *fi = res->fi;
        struct fib_nh_exception *fnhe;
        struct in_device *in_dev;
        u16 type = res->type;
        struct rtable *rth;
        bool do_cache;

        in_dev = __in_dev_get_rcu(dev_out);
        if (!in_dev)
                return ERR_PTR(-EINVAL);

        /* Unless route_localnet is set, a loopback source is only allowed
         * out of a loopback or L3 master device.
         */
        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(fl4->saddr) &&
                    !(dev_out->flags & IFF_LOOPBACK) &&
                    !netif_is_l3_master(dev_out))
                        return ERR_PTR(-EINVAL);

        /* the destination address class overrides the type from the lookup */
        if (ipv4_is_lbcast(fl4->daddr))
                type = RTN_BROADCAST;
        else if (ipv4_is_multicast(fl4->daddr))
                type = RTN_MULTICAST;
        else if (ipv4_is_zeronet(fl4->daddr))
                return ERR_PTR(-EINVAL);

        if (dev_out->flags & IFF_LOOPBACK)
                flags |= RTCF_LOCAL;

        do_cache = true;
        if (type == RTN_BROADCAST) {
                flags |= RTCF_BROADCAST | RTCF_LOCAL;
                fi = NULL;
        } else if (type == RTN_MULTICAST) {
                flags |= RTCF_MULTICAST | RTCF_LOCAL;
                /* RTCF_LOCAL is kept only when ip_check_mc_rcu() matches the
                 * group on this device, and such routes are not cached.
                 */
                if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
                                     fl4->flowi4_proto))
                        flags &= ~RTCF_LOCAL;
                else
                        do_cache = false;
                /* If multicast route do not exist use
                 * default one, but do not gateway in this case.
                 * Yes, it is hack.
                 */
                if (fi && res->prefixlen < 4)
                        fi = NULL;
        } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
                   (orig_oif != dev_out->ifindex)) {
                /* For local routes that require a particular output interface
                 * we do not want to cache the result.  Caching the result
                 * causes incorrect behaviour when there are multiple source
                 * addresses on the interface, the end result being that if the
                 * intended recipient is waiting on that interface for the
                 * packet he won't receive it because it will be delivered on
                 * the loopback interface and the IP_PKTINFO ipi_ifindex will
                 * be set to the loopback interface as well.
                 */
                do_cache = false;
        }

        fnhe = NULL;
        /* caching needs a fib_info to hang the route on */
        do_cache &= fi != NULL;
        if (fi) {
                struct fib_nh_common *nhc = FIB_RES_NHC(*res);
                struct rtable __rcu **prth;

                /* The exception is looked up even when caching is off: it is
                 * passed to rt_set_nexthop() below.
                 */
                fnhe = find_exception(nhc, fl4->daddr);
                if (!do_cache)
                        goto add;
                if (fnhe) {
                        prth = &fnhe->fnhe_rth_output;
                } else {
                        /* With FLOWI_FLAG_KNOWN_NH, the per-cpu cache is used
                         * only for nexthops that have a gateway with link
                         * scope.
                         */
                        if (unlikely(fl4->flowi4_flags &
                                     FLOWI_FLAG_KNOWN_NH &&
                                     !(nhc->nhc_gw_family &&
                                       nhc->nhc_scope == RT_SCOPE_LINK))) {
                                do_cache = false;
                                goto add;
                        }
                        prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
                }
                /* reuse the cached route if it is valid and a reference can
                 * still be taken on it
                 */
                rth = rcu_dereference(*prth);
                if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
                        return rth;
        }

add:
        rth = rt_dst_alloc(dev_out, flags, type,
                           IN_DEV_ORCONF(in_dev, NOXFRM));
        if (!rth)
                return ERR_PTR(-ENOBUFS);

        rth->rt_iif = orig_oif;

        RT_CACHE_STAT_INC(out_slow_tot);

        if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                /* locally relevant broadcast/multicast leaving a
                 * non-loopback device is sent through ip_mc_output
                 */
                if (flags & RTCF_LOCAL &&
                    !(dev_out->flags & IFF_LOOPBACK)) {
                        rth->dst.output = ip_mc_output;
                        RT_CACHE_STAT_INC(out_slow_mc);
                }
#ifdef CONFIG_IP_MROUTE
                if (type == RTN_MULTICAST) {
                        if (IN_DEV_MFORWARD(in_dev) &&
                            !ipv4_is_local_multicast(fl4->daddr)) {
                                rth->dst.input = ip_mr_input;
                                rth->dst.output = ip_mc_output;
                        }
                }
#endif
        }

        rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
        lwtunnel_set_redirect(&rth->dst);

        return rth;
}

/*
 * Major route resolver routine.
 *
 * Prepares @fl4 for an output lookup (input interface set to
 * LOOPBACK_IFINDEX, flowi4_tos masked with INET_DSCP_MASK) and runs
 * ip_route_output_key_hash_rcu() under rcu_read_lock() with a fresh
 * fib_result. Returns whatever that function returns.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
                                        const struct sk_buff *skb)
{
        struct fib_result res = {
                .tclassid        = 0,
                .table                = NULL,
                .fi                = NULL,
                .type                = RTN_UNSPEC,
        };
        struct rtable *rt;

        fl4->flowi4_tos &= INET_DSCP_MASK;
        fl4->flowi4_iif = LOOPBACK_IFINDEX;

        rcu_read_lock();
        rt = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
        rcu_read_unlock();

        return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

/* Resolve the output route for flow @fl4.
 *
 * @net: network namespace used for device and FIB lookups
 * @fl4: flow to route; saddr, daddr and flowi4_oif may be filled in or
 *       rewritten here
 * @res: FIB result storage; written by fib_lookup() and on the shortcut
 *       paths below
 * @skb: passed through to fib_select_path()
 *
 * Returns the rtable produced by __mkroute_output(), or an ERR_PTR:
 * -EINVAL for a multicast/broadcast/zeronet source, -ENETUNREACH when the
 * source address is not found locally or the output device is unusable,
 * -ENODEV when flowi4_oif names no device, or the fib_lookup() error.
 *
 * NOTE(review): the name and the rcu accessors used suggest the caller must
 * hold rcu_read_lock(), as ip_route_output_key_hash() does -- confirm.
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
                                            struct fib_result *res,
                                            const struct sk_buff *skb)
{
        struct net_device *dev_out = NULL;
        int orig_oif = fl4->flowi4_oif;
        unsigned int flags = 0;
        struct rtable *rth;
        int err;

        if (fl4->saddr) {
                if (ipv4_is_multicast(fl4->saddr) ||
                    ipv4_is_lbcast(fl4->saddr) ||
                    ipv4_is_zeronet(fl4->saddr)) {
                        rth = ERR_PTR(-EINVAL);
                        goto out;
                }

                /* default error for the "goto out" exits in this branch */
                rth = ERR_PTR(-ENETUNREACH);

                /* I removed check for oif == dev_out->oif here.
                 * It was wrong for two reasons:
                 * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
                 *    is assigned to multiple interfaces.
                 * 2. Moreover, we are allowed to send packets with saddr
                 *    of another iface. --ANK
                 */

                if (fl4->flowi4_oif == 0 &&
                    (ipv4_is_multicast(fl4->daddr) ||
                     ipv4_is_lbcast(fl4->daddr))) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        dev_out = __ip_dev_find(net, fl4->saddr, false);
                        if (!dev_out)
                                goto out;

                        /* Special hack: user can direct multicasts
                         * and limited broadcast via necessary interface
                         * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
                         * This hack is not just for fun, it allows
                         * vic,vat and friends to work.
                         * They bind socket to loopback, set ttl to zero
                         * and expect that it will work.
                         * From the viewpoint of routing cache they are broken,
                         * because we are not allowed to build multicast path
                         * with loopback source addr (look, routing cache
                         * cannot know, that ttl is zero, so that packet
                         * will not leave this host and route is valid).
                         * Luckily, this hack is good workaround.
                         */

                        fl4->flowi4_oif = dev_out->ifindex;
                        goto make_route;
                }

                if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
                        /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
                        if (!__ip_dev_find(net, fl4->saddr, false))
                                goto out;
                }
        }


        if (fl4->flowi4_oif) {
                dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
                rth = ERR_PTR(-ENODEV);
                if (!dev_out)
                        goto out;

                /* RACE: Check return value of inet_select_addr instead. */
                if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
                        rth = ERR_PTR(-ENETUNREACH);
                        goto out;
                }
                /* Local multicast, limited broadcast and IGMP go straight
                 * out of the requested device without a FIB lookup.
                 */
                if (ipv4_is_local_multicast(fl4->daddr) ||
                    ipv4_is_lbcast(fl4->daddr) ||
                    fl4->flowi4_proto == IPPROTO_IGMP) {
                        if (!fl4->saddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        goto make_route;
                }
                if (!fl4->saddr) {
                        if (ipv4_is_multicast(fl4->daddr))
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              fl4->flowi4_scope);
                        else if (!fl4->daddr)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_HOST);
                }
        }

        /* No destination: route to the source address, or to loopback when
         * that is unset too, through the loopback device.
         */
        if (!fl4->daddr) {
                fl4->daddr = fl4->saddr;
                if (!fl4->daddr)
                        fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
                dev_out = net->loopback_dev;
                fl4->flowi4_oif = LOOPBACK_IFINDEX;
                res->type = RTN_LOCAL;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        err = fib_lookup(net, fl4, res, 0);
        if (err) {
                res->fi = NULL;
                res->table = NULL;
                if (fl4->flowi4_oif &&
                    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
                        /* Apparently, routing tables are wrong. Assume,
                         * that the destination is on link.
                         *
                         * WHY? DW.
                         * Because we are allowed to send to iface
                         * even if it has NO routes and NO assigned
                         * addresses. When oif is specified, routing
                         * tables are looked up with only one purpose:
                         * to catch if destination is gatewayed, rather than
                         * direct. Moreover, if MSG_DONTROUTE is set,
                         * we send packet, ignoring both routing tables
                         * and ifaddr state. --ANK
                         *
                         *
                         * We could make it even if oif is unknown,
                         * likely IPv6, but we do not.
                         */

                        if (fl4->saddr == 0)
                                fl4->saddr = inet_select_addr(dev_out, 0,
                                                              RT_SCOPE_LINK);
                        res->type = RTN_UNICAST;
                        goto make_route;
                }
                rth = ERR_PTR(err);
                goto out;
        }

        if (res->type == RTN_LOCAL) {
                /* pick a source: the route's preferred source if set,
                 * otherwise the destination itself
                 */
                if (!fl4->saddr) {
                        if (res->fi->fib_prefsrc)
                                fl4->saddr = res->fi->fib_prefsrc;
                        else
                                fl4->saddr = fl4->daddr;
                }

                /* L3 master device is the loopback for that domain */
                dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
                        net->loopback_dev;

                /* make sure orig_oif points to fib result device even
                 * though packet rx/tx happens over loopback or l3mdev
                 */
                orig_oif = FIB_RES_OIF(*res);

                fl4->flowi4_oif = dev_out->ifindex;
                flags |= RTCF_LOCAL;
                goto make_route;
        }

        fib_select_path(net, res, fl4, skb);

        dev_out = FIB_RES_DEV(*res);

make_route:
        rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
        return rth;
}

/* dst_ops attached to the entries allocated by ipv4_blackhole_route().
 * The check, metrics, PMTU, redirect and MTU callbacks use the generic
 * dst_blackhole_* helpers; advmss and neighbour lookup use the regular
 * IPv4 helpers.  kmem_cachep is assigned later, in ip_rt_init().
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
        .family = AF_INET,
        .check = dst_blackhole_check,
        .mtu = dst_blackhole_mtu,
        .default_advmss = ipv4_default_advmss,
        .neigh_lookup = ipv4_neigh_lookup,
        .cow_metrics = dst_blackhole_cow_metrics,
        .update_pmtu = dst_blackhole_update_pmtu,
        .redirect = dst_blackhole_redirect,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
        struct rtable *ort = dst_rtable(dst_orig);
        struct rtable *rt;

        rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
        if (rt) {
                struct dst_entry *new = &rt->dst;

                new->__use = 1;
                new->input = dst_discard;
                new->output = dst_discard_out;

                new->dev = net->loopback_dev;
                netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);

                rt->rt_is_input = ort->rt_is_input;
                rt->rt_iif = ort->rt_iif;
                rt->rt_pmtu = ort->rt_pmtu;
                rt->rt_mtu_locked = ort->rt_mtu_locked;

                rt->rt_genid = rt_genid_ipv4(net);
                rt->rt_flags = ort->rt_flags;
                rt->rt_type = ort->rt_type;
                rt->rt_uses_gateway = ort->rt_uses_gateway;
                rt->rt_gw_family = ort->rt_gw_family;
                if (rt->rt_gw_family == AF_INET)
                        rt->rt_gw4 = ort->rt_gw4;
                else if (rt->rt_gw_family == AF_INET6)
                        rt->rt_gw6 = ort->rt_gw6;
        }

        dst_release(dst_orig);

        return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

/* Resolve an output route for @flp4 and, when the flow names a protocol,
 * pass the result through xfrm_lookup_route().
 *
 * A lookup error from __ip_route_output_key() is returned unchanged.  For
 * flows with a non-zero flowi4_proto, flowi4_oif is overwritten with the
 * ifindex of the resolved device before the xfrm lookup.
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
                                    const struct sock *sk)
{
        struct dst_entry *xdst;
        struct rtable *rt;

        rt = __ip_route_output_key(net, flp4);
        if (IS_ERR(rt) || !flp4->flowi4_proto)
                return rt;

        flp4->flowi4_oif = rt->dst.dev->ifindex;
        xdst = xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0);

        return dst_rtable(xdst);
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

/* Append one RTM_NEWROUTE message describing the cached route @rt to @skb.
 *
 * @dst/@src:  addresses reported as RTA_DST / RTA_SRC (RTA_SRC only if set).
 * @table_id:  reported in RTA_TABLE and, when below 256, in rtm_table.
 * @dscp:      converted to the rtm_tos field.
 * @fl4:       optional flow (fnhe_dump_bucket() passes NULL); when set it
 *             supplies the preferred source, mark, uid and input interface.
 * @portid, @seq, @flags: forwarded to nlmsg_put() for the message header.
 *
 * Returns 0 on success and -EMSGSIZE when nlmsg_put() or any attribute
 * helper fails; in the latter case the partial message is cancelled.
 *
 * Called with rcu_read_lock held.
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
                        struct rtable *rt, u32 table_id, dscp_t dscp,
                        struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
                        u32 seq, unsigned int flags)
{
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        unsigned long expires = 0;
        u32 error;
        u32 metrics[RTAX_MAX];

        nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
        if (!nlh)
                return -EMSGSIZE;

        r = nlmsg_data(nlh);
        r->rtm_family         = AF_INET;
        r->rtm_dst_len        = 32;
        r->rtm_src_len        = 0;
        r->rtm_tos        = inet_dscp_to_dsfield(dscp);
        /* Ids of 256 and above are reported as RT_TABLE_COMPAT in the header;
         * the full id is always carried in RTA_TABLE.
         */
        r->rtm_table        = table_id < 256 ? table_id : RT_TABLE_COMPAT;
        if (nla_put_u32(skb, RTA_TABLE, table_id))
                goto nla_put_failure;
        r->rtm_type        = rt->rt_type;
        r->rtm_scope        = RT_SCOPE_UNIVERSE;
        r->rtm_protocol = RTPROT_UNSPEC;
        /* Drop the low 16 bits of rt_flags and mark the route as cloned. */
        r->rtm_flags        = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
        if (rt->rt_flags & RTCF_NOTIFY)
                r->rtm_flags |= RTM_F_NOTIFY;
        if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
                r->rtm_flags |= RTCF_DOREDIRECT;

        if (nla_put_in_addr(skb, RTA_DST, dst))
                goto nla_put_failure;
        if (src) {
                r->rtm_src_len = 32;
                if (nla_put_in_addr(skb, RTA_SRC, src))
                        goto nla_put_failure;
        }
        if (rt->dst.dev &&
            nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
                goto nla_put_failure;
        if (rt->dst.lwtstate &&
            lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
                goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
        if (rt->dst.tclassid &&
            nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
                goto nla_put_failure;
#endif
        /* Output routes only: report the flow's source as the preferred
         * source when it differs from the source passed in @src.
         */
        if (fl4 && !rt_is_input_route(rt) &&
            fl4->saddr != src) {
                if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
                        goto nla_put_failure;
        }
        if (rt->rt_uses_gateway) {
                if (rt->rt_gw_family == AF_INET &&
                    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
                        goto nla_put_failure;
                } else if (rt->rt_gw_family == AF_INET6) {
                        /* IPv6 gateway: emitted as RTA_VIA, sized as the
                         * address plus 2 bytes ahead of rtvia_addr.
                         */
                        int alen = sizeof(struct in6_addr);
                        struct nlattr *nla;
                        struct rtvia *via;

                        nla = nla_reserve(skb, RTA_VIA, alen + 2);
                        if (!nla)
                                goto nla_put_failure;

                        via = nla_data(nla);
                        via->rtvia_family = AF_INET6;
                        memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
                }
        }

        /* Turn the absolute expiry into a remaining time; 0 means either
         * "no expiry set" or "already expired".
         */
        expires = rt->dst.expires;
        if (expires) {
                unsigned long now = jiffies;

                if (time_before(now, expires))
                        expires -= now;
                else
                        expires = 0;
        }

        /* Work on a local copy so the overrides below do not touch the
         * metrics stored on the dst.  The route's own PMTU and MTU lock are
         * reported only while the expiry is still in the future.
         */
        memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
        if (rt->rt_pmtu && expires)
                metrics[RTAX_MTU - 1] = rt->rt_pmtu;
        if (rt->rt_mtu_locked && expires)
                metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
        if (rtnetlink_put_metrics(skb, metrics) < 0)
                goto nla_put_failure;

        if (fl4) {
                if (fl4->flowi4_mark &&
                    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
                        goto nla_put_failure;

                if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
                    nla_put_u32(skb, RTA_UID,
                                from_kuid_munged(current_user_ns(),
                                                 fl4->flowi4_uid)))
                        goto nla_put_failure;

                if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
                        if (ipv4_is_multicast(dst) &&
                            !ipv4_is_local_multicast(dst) &&
                            IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
                                int err = ipmr_get_route(net, skb,
                                                         fl4->saddr, fl4->daddr,
                                                         r, portid);

                                /* err == 0 returns success without calling
                                 * nlmsg_end(); err < 0 cancels the message.
                                 * NOTE(review): presumably ipmr_get_route()
                                 * owns completion of the reply when it
                                 * returns 0 -- confirm against ipmr.
                                 */
                                if (err <= 0) {
                                        if (err == 0)
                                                return 0;
                                        goto nla_put_failure;
                                }
                        } else
#endif
                                if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
                                        goto nla_put_failure;
                }
        }

        error = rt->dst.error;

        if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Dump every eligible next-hop exception in @bucket as a route message.
 *
 * An exception is emitted when its running index (*@fa_index) has reached
 * @fa_start, its generation id equals @genid, it has not expired, and it
 * has a cached input or output route (input is preferred).  *@fa_index is
 * advanced for every exception visited, emitted or not; it is not advanced
 * for an exception whose rt_fill_info() call failed.
 *
 * Returns 0, or the first non-zero value returned by rt_fill_info().
 */
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
                            struct netlink_callback *cb, u32 table_id,
                            struct fnhe_hash_bucket *bucket, int genid,
                            int *fa_index, int fa_start, unsigned int flags)
{
        int slot;

        for (slot = 0; slot < FNHE_HASH_SIZE; slot++) {
                struct fib_nh_exception *fnhe;

                for (fnhe = rcu_dereference(bucket[slot].chain); fnhe;
                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
                        struct rtable *rt = NULL;

                        /* Resolve a route only for entries that pass all
                         * of the index, generation and expiry filters.
                         */
                        if (*fa_index >= fa_start &&
                            fnhe->fnhe_genid == genid &&
                            !(fnhe->fnhe_expires &&
                              time_after(jiffies, fnhe->fnhe_expires))) {
                                rt = rcu_dereference(fnhe->fnhe_rth_input);
                                if (!rt)
                                        rt = rcu_dereference(fnhe->fnhe_rth_output);
                        }

                        if (rt) {
                                int err;

                                err = rt_fill_info(net, fnhe->fnhe_daddr, 0,
                                                   rt, table_id, 0, NULL, skb,
                                                   NETLINK_CB(cb->skb).portid,
                                                   cb->nlh->nlmsg_seq, flags);
                                if (err)
                                        return err;
                        }

                        (*fa_index)++;
                }
        }

        return 0;
}

/* Dump the next-hop exceptions of every path of @fi that is not flagged
 * RTNH_F_DEAD.
 *
 * Each path's exception table is walked under its own rcu_read_lock()
 * section.  The exception generation id is sampled once, before the walk.
 *
 * Returns 0, or the first non-zero value returned by fnhe_dump_bucket().
 */
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
                       u32 table_id, struct fib_info *fi,
                       int *fa_index, int fa_start, unsigned int flags)
{
        struct net *net = sock_net(cb->skb->sk);
        int genid = fnhe_genid(net);
        int nhsel;

        for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
                struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
                struct fnhe_hash_bucket *bucket;
                int err = 0;

                if (nhc->nhc_flags & RTNH_F_DEAD)
                        continue;

                rcu_read_lock();
                bucket = rcu_dereference(nhc->nhc_exceptions);
                if (bucket)
                        err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
                                               genid, fa_index, fa_start,
                                               flags);
                rcu_read_unlock();

                if (err)
                        return err;
        }

        return 0;
}

/* Allocate the dummy packet used by inet_rtm_getroute().
 *
 * The skb carries a minimal IPv4 header (version 4, ihl 5, no fragment
 * bits) with the given addresses and protocol, followed by a zeroed UDP,
 * TCP or ICMP header when @ip_proto is one of those.  Other protocols get
 * the IP header only.
 *
 * NOTE(review): the IP header is reserved with skb_put() while the
 * transport headers use skb_put_zero(); confirm that the IP header fields
 * not assigned here are never read by the routing code.
 *
 * Returns the skb, or NULL if the allocation fails.
 */
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
                                                   u8 ip_proto, __be16 sport,
                                                   __be16 dport)
{
        struct sk_buff *skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        struct iphdr *iph;

        if (!skb)
                return NULL;

        /* Reserve room for dummy headers, this skb can pass
         * through good chunk of routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);
        skb->protocol = htons(ETH_P_IP);

        iph = skb_put(skb, sizeof(struct iphdr));
        iph->version = 0x4;
        iph->ihl = 0x5;
        iph->frag_off = 0;
        iph->protocol = ip_proto;
        iph->saddr = src;
        iph->daddr = dst;

        /* The transport header starts right after the IP header. */
        skb_set_transport_header(skb, skb->len);

        if (iph->protocol == IPPROTO_UDP) {
                struct udphdr *udph = skb_put_zero(skb, sizeof(struct udphdr));

                udph->source = sport;
                udph->dest = dport;
                udph->len = htons(sizeof(struct udphdr));
                udph->check = 0;
        } else if (iph->protocol == IPPROTO_TCP) {
                struct tcphdr *tcph = skb_put_zero(skb, sizeof(struct tcphdr));

                tcph->source = sport;
                tcph->dest = dport;
                tcph->doff = sizeof(struct tcphdr) / 4;
                tcph->rst = 1;
                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
                                            src, dst, 0);
        } else if (iph->protocol == IPPROTO_ICMP) {
                struct icmphdr *icmph = skb_put_zero(skb, sizeof(struct icmphdr));

                icmph->type = ICMP_ECHO;
                icmph->code = 0;
        }

        return skb;
}

/* Validate an RTM_GETROUTE request and parse its attributes into @tb.
 *
 * When netlink_strict_get_check() is false for @skb, only the header length
 * is checked and the attributes go through the lenient
 * nlmsg_parse_deprecated().  Otherwise the rtmsg header fields, the
 * rtm_flags and the set of attributes present are all restricted to what a
 * route get request may carry.
 *
 * Returns 0 on success or a negative errno, with a message set in @extack
 * for the checks performed here.
 */
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        int attr, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG(extack,
                               "ipv4: Invalid header for route get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                              rtm_ipv4_policy, extack);

        rtm = nlmsg_data(nlh);

        /* Prefix lengths may only be 0 or 32. */
        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32))
                goto bad_header;

        /* These header fields must all be zero in a request. */
        if (rtm->rtm_table || rtm->rtm_protocol ||
            rtm->rtm_scope || rtm->rtm_type)
                goto bad_header;

        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
                               RTM_F_LOOKUP_TABLE |
                               RTM_F_FIB_MATCH)) {
                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
                                            rtm_ipv4_policy, extack);
        if (err)
                return err;

        /* An address attribute requires its prefix length to be set. */
        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
                return -EINVAL;
        }

        /* Reject any attribute outside the supported set. */
        for (attr = 0; attr <= RTA_MAX; attr++) {
                if (!tb[attr])
                        continue;

                switch (attr) {
                case RTA_IIF:
                case RTA_OIF:
                case RTA_SRC:
                case RTA_DST:
                case RTA_IP_PROTO:
                case RTA_SPORT:
                case RTA_DPORT:
                case RTA_MARK:
                case RTA_UID:
                        continue;
                }

                NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
                return -EINVAL;
        }

        return 0;

bad_header:
        NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
        return -EINVAL;
}

/* RTM_GETROUTE handler (registered in ip_rt_rtnl_msg_handlers).
 *
 * Resolves the route for the flow described by the request, either as if a
 * packet had been received on RTA_IIF or as locally generated output, and
 * unicasts an RTM_NEWROUTE reply to the requesting port.
 *
 * Returns 0 or a negative errno; on the error paths taken after the dummy
 * skb has been allocated, that skb is freed here.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        u32 table_id = RT_TABLE_MAIN;
        __be16 sport = 0, dport = 0;
        struct fib_result res = {};
        u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi4 fl4 = {};
        __be32 dst = 0;
        __be32 src = 0;
        dscp_t dscp;
        kuid_t uid;
        u32 iif;
        int err;
        int mark;

        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        rtm = nlmsg_data(nlh);
        src = nla_get_in_addr_default(tb[RTA_SRC], 0);
        dst = nla_get_in_addr_default(tb[RTA_DST], 0);
        iif = nla_get_u32_default(tb[RTA_IIF], 0);
        mark = nla_get_u32_default(tb[RTA_MARK], 0);
        dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
        /* Without RTA_UID, input lookups (iif set) use INVALID_UID and
         * output lookups use the uid of the calling task.
         */
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &ip_proto, AF_INET, extack);
                if (err)
                        return err;
        }

        if (tb[RTA_SPORT])
                sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                dport = nla_get_be16(tb[RTA_DPORT]);

        /* Dummy packet for the lookup; the same skb is later trimmed and
         * reused as the netlink reply buffer.
         */
        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
        if (!skb)
                return -ENOBUFS;

        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
        fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
        if (sport)
                fl4.fl4_sport = sport;
        if (dport)
                fl4.fl4_dport = dport;
        fl4.flowi4_proto = ip_proto;

        rcu_read_lock();

        if (iif) {
                /* Input path: route the dummy packet as if it had been
                 * received on device iif.
                 */
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_rcu;
                }

                fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark        = mark;
                /* Any non-zero lookup result is reported as -EINVAL. */
                err = ip_route_input_rcu(skb, dst, src, dscp, dev,
                                         &res) ? -EINVAL : 0;

                /* A successful lookup may still yield an error route;
                 * report its dst.error, negated.
                 */
                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                /* Output path: look the flow up as locally generated. */
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                skb->dev = net->loopback_dev;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_rcu;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        /* On request, report the table of the lookup result (0 when the
         * result has no table) instead of the RT_TABLE_MAIN default.
         */
        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        /* reset skb for netlink reply msg */
        skb_trim(skb, 0);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_header(skb);

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                /* Report the matching FIB entry rather than the route. */
                struct fib_rt_info fri;

                if (!res.fi) {
                        /* No fib_info: use the error recorded for the
                         * result type, falling back to -EHOSTUNREACH.
                         */
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_rcu;
                }
                fri.fi = res.fi;
                fri.tb_id = table_id;
                fri.dst = res.prefix;
                fri.dst_len = res.prefixlen;
                fri.dscp = res.dscp;
                fri.type = rt->rt_type;
                fri.offload = 0;
                fri.trap = 0;
                fri.offload_failed = 0;
                if (res.fa_head) {
                        struct fib_alias *fa;

                        /* Copy the offload state from the first alias that
                         * matches the result on every compared field.
                         */
                        hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
                                u8 slen = 32 - fri.dst_len;

                                if (fa->fa_slen == slen &&
                                    fa->tb_id == fri.tb_id &&
                                    fa->fa_dscp == fri.dscp &&
                                    fa->fa_info == res.fi &&
                                    fa->fa_type == fri.type) {
                                        fri.offload = READ_ONCE(fa->offload);
                                        fri.trap = READ_ONCE(fa->trap);
                                        fri.offload_failed =
                                                READ_ONCE(fa->offload_failed);
                                        break;
                                }
                        }
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
        } else {
                err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
                                   skb, NETLINK_CB(in_skb).portid,
                                   nlh->nlmsg_seq, 0);
        }
        if (err < 0)
                goto errout_rcu;

        rcu_read_unlock();

        /* NOTE(review): skb is not freed here after this call; presumably
         * rtnl_unicast() takes ownership of it -- confirm.
         */
        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

        /* Common return point; despite its name nothing is freed here. */
errout_free:
        return err;
        /* Failure inside the RCU section: unlock and free the dummy skb. */
errout_rcu:
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout_free;
}

/* Flush the route cache of the namespace that owns @in_dev's device. */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        struct net *net = dev_net(in_dev->dev);

        rt_cache_flush(net);
}

#ifdef CONFIG_SYSCTL
/* Backing storage for entries of ipv4_route_table ("net/ipv4/route"). */
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
/* Used as extra1 (lower bound) of the per-netns "min_pmtu" sysctl. */
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;

/* Handler for the write-only "flush" sysctl.
 *
 * A write flushes the route cache and bumps the next-hop exception
 * generation id of the namespace stored in @__ctl->extra1.  The written
 * data is ignored.  Reads are rejected with -EINVAL.
 */
static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net;

        if (!write)
                return -EINVAL;

        net = (struct net *)__ctl->extra1;
        rt_cache_flush(net);
        fnhe_genid_bump(net);

        return 0;
}

/* Global route sysctls, registered for init_net under "net/ipv4/route" by
 * ip_static_sysctl_init().  All entries are int-sized and mode 0644.
 */
static struct ctl_table ipv4_route_table[] = {
        {
                .procname = "gc_thresh",
                .data = &ipv4_dst_ops.gc_thresh,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname = "max_size",
                .data = &ip_rt_max_size,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .procname = "gc_min_interval",
                .data = &ip_rt_gc_min_interval,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
        {
                /* Same variable as gc_min_interval, different handler. */
                .procname = "gc_min_interval_ms",
                .data = &ip_rt_gc_min_interval,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec_ms_jiffies,
        },
        {
                .procname = "gc_timeout",
                .data = &ip_rt_gc_timeout,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
        {
                .procname = "gc_interval",
                .data = &ip_rt_gc_interval,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
        {
                .procname = "redirect_load",
                .data = &ip_rt_redirect_load,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname = "redirect_number",
                .data = &ip_rt_redirect_number,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname = "redirect_silence",
                .data = &ip_rt_redirect_silence,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname = "error_cost",
                .data = &ip_rt_error_cost,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname = "error_burst",
                .data = &ip_rt_error_burst,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
        {
                .procname = "gc_elasticity",
                .data = &ip_rt_gc_elasticity,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
};

/* Name of the flush entry; sysctl_route_net_init() compares this pointer
 * to recognise entry 0 of the table.
 */
static const char ipv4_route_flush_procname[] = "flush";

/* Per-namespace route sysctls.  This is the template used directly for
 * init_net and duplicated for every other namespace.
 *
 * Entry 0 must stay the "flush" entry: sysctl_route_net_init() stores the
 * namespace in its extra1 and rebases .data of entries 1 onwards only.
 */
static struct ctl_table ipv4_route_netns_table[] = {
        {
                .procname = ipv4_route_flush_procname,
                .maxlen = sizeof(int),
                .mode = 0200,
                .proc_handler = ipv4_sysctl_rtcache_flush,
        },
        {
                .procname = "min_pmtu",
                .data = &init_net.ipv4.ip_rt_min_pmtu,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec_minmax,
                .extra1 = &ip_min_valid_pmtu,
        },
        {
                .procname = "mtu_expires",
                .data = &init_net.ipv4.ip_rt_mtu_expires,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec_jiffies,
        },
        {
                .procname = "min_adv_mss",
                .data = &init_net.ipv4.ip_rt_min_advmss,
                .maxlen = sizeof(int),
                .mode = 0644,
                .proc_handler = proc_dointvec,
        },
};

/* Register the per-namespace "net/ipv4/route" sysctls for @net.
 *
 * init_net uses ipv4_route_netns_table directly.  Any other namespace gets
 * a private copy whose .data pointers (entries 1 onwards) are shifted from
 * init_net to @net.  Entry 0 ("flush") has no .data; it receives @net via
 * extra1, which ipv4_sysctl_rtcache_flush() reads back.
 *
 * Returns 0 on success, -ENOMEM if the copy or the registration fails.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
        size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
        struct ctl_table *tbl = ipv4_route_netns_table;

        if (!net_eq(net, &init_net)) {
                int i;

                tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
                if (!tbl)
                        return -ENOMEM;

                /* Don't export non-whitelisted sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns &&
                    tbl[0].procname != ipv4_route_flush_procname)
                        table_size = 0;

                /* Rebase every entry but the first (flush) so that it
                 * points into this namespace's struct net.
                 */
                for (i = 1; i < table_size; i++)
                        tbl[i].data += (void *)net - (void *)&init_net;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
                                                     tbl, table_size);
        if (net->ipv4.route_hdr)
                return 0;

        /* Registration failed: free the copy, never the static template. */
        if (tbl != ipv4_route_netns_table)
                kfree(tbl);

        return -ENOMEM;
}

/* Unregister the per-namespace route sysctls of @net and free the table
 * copy made by sysctl_route_net_init().
 */
static __net_exit void sysctl_route_net_exit(struct net *net)
{
        /* Fetch the table pointer before the header is unregistered. */
        const struct ctl_table *tbl = net->ipv4.route_hdr->ctl_table_arg;

        unregister_net_sysctl_table(net->ipv4.route_hdr);

        /* The static template must never reach kfree(). */
        BUG_ON(tbl == ipv4_route_netns_table);
        kfree(tbl);
}

/* Per-namespace hooks for the route sysctls; registered by ip_rt_init(). */
static __net_initdata struct pernet_operations sysctl_route_ops = {
        .exit = sysctl_route_net_exit,
        .init = sysctl_route_net_init,
};
#endif

/* Give the namespaced route sysctls of @net their default values.
 * Always returns 0.
 */
static __net_init int netns_ip_rt_init(struct net *net)
{
        net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
        net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
        net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;

        return 0;
}

/* Per-namespace hook that seeds the route sysctl defaults (init only). */
static struct pernet_operations __net_initdata ip_rt_ops = {
        .init   = netns_ip_rt_init,
};

/* Reset the generation counters of @net: the route and next-hop exception
 * ids start at 0, the device-address id starts at a random value.
 * Always returns 0.
 */
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());

        return 0;
}

/* Per-namespace hook that resets the generation counters (init only). */
static __net_initdata struct pernet_operations rt_genid_ops = {
        .init   = rt_genid_init,
};

/* Allocate and initialise the inet_peer base of @net.
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *base;

        base = kmalloc(sizeof(*base), GFP_KERNEL);
        if (!base)
                return -ENOMEM;

        inet_peer_base_init(base);
        net->ipv4.peers = base;

        return 0;
}

/* Tear down the inet_peer base of @net: the namespace pointer is cleared
 * first, then the tree is invalidated and the base freed.
 */
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *base = net->ipv4.peers;

        net->ipv4.peers = NULL;

        inetpeer_invalidate_tree(base);
        kfree(base);
}

/* Per-namespace lifetime hooks for the inet_peer base. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .exit = ipv4_inetpeer_exit,
        .init = ipv4_inetpeer_init,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; ip_rt_init() allocates it as 256
 * struct ip_rt_acct entries and panics if that allocation fails.
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

/* rtnetlink handlers registered by ip_rt_init(): PF_INET RTM_GETROUTE is
 * served by inet_rtm_getroute() with RTNL_FLAG_DOIT_UNLOCKED set.
 */
static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = {
        {
                .protocol = PF_INET,
                .msgtype = RTM_GETROUTE,
                .doit = inet_rtm_getroute,
                .flags = RTNL_FLAG_DOIT_UNLOCKED,
        },
};

/* Boot-time initialisation of the IPv4 routing layer.
 *
 * Sets up the IP ident/timestamp hash, the per-CPU uncached route lists,
 * the rtable slab cache and dst counters, then initialises the dependent
 * subsystems and registers the rtnetlink handler and the per-namespace
 * operations.
 *
 * Always returns 0: the failures checked here panic, except a proc-file
 * failure, which is only logged.  The return values of the
 * register_pernet_subsys() calls are not checked.
 */
int __init ip_rt_init(void)
{
        void *idents_hash;
        int cpu;

        /* One allocation holds both arrays: each bucket is sized for one
         * ip_idents element plus one ip_tstamps element.
         */
        /* For modern hosts, this will use 2 MB of memory */
        idents_hash = alloc_large_system_hash("IP idents",
                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
                                              0,
                                              16, /* one bucket per 64 KB */
                                              HASH_ZERO,
                                              NULL,
                                              &ip_idents_mask,
                                              2048,
                                              256*1024);

        ip_idents = idents_hash;

        /* Only the ident array is filled with random bytes. */
        get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

        /* The timestamp array starts right after the ident array. */
        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
                                              SLAB_HWCACHE_ALIGN | SLAB_PANIC);

        /* Blackhole routes are allocated from the same rtable cache. */
        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        /* Defaults for the gc_thresh and max_size sysctls. */
        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register_many(ip_rt_rtnl_msg_handlers);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&ip_rt_ops);
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers ipv4_route_table under the "net/ipv4/route" sysctl path for
 * init_net.  The value returned by register_net_sysctl() is not checked
 * here.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif














































































































    1 
    1 


    1 












    1 




    1 












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    1 


    1 

























































    1 




    1 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/super.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  super.c contains code to handle: - mount structures
 *                                   - super-block tables
 *                                   - filesystem drivers list
 *                                   - mount system call
 *                                   - umount system call
 *                                   - ustat system call
 *
 * GK 2/5/95  -  Changed to support mounting the root fs via NFS
 *
 *  Added kerneld support: Jacques Gelinas and Bjorn Ekwall
 *  Added change_root: Werner Almesberger & Hans Lermen, Feb '96
 *  Added options to /proc/mounts:
 *    Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996.
 *  Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998
 *  Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000
 */

#include <linux/export.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/writeback.h>                /* for the emergency remount stuff */
#include <linux/idr.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/rculist_bl.h>
#include <linux/fscrypt.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
#include <linux/user_namespace.h>
#include <linux/fs_context.h>
#include <uapi/linux/mount.h>
#include "internal.h"

/* Forward declaration of a file-local helper. */
static int thaw_super_locked(struct super_block *sb, enum freeze_holder who);

/*
 * List head for superblocks.
 * NOTE(review): presumably linked through ->s_list, which __put_super()
 * unlinks with list_del_init() - confirm against the code that adds to it.
 */
static LIST_HEAD(super_blocks);
/*
 * Held by put_super() around the ->s_count drop in __put_super() and by
 * kill_super_notify() around the ->s_instances unlink; grab_super() is
 * entered with it held and releases it.
 */
static DEFINE_SPINLOCK(sb_lock);

/*
 * Names handed to __percpu_init_rwsem() by alloc_super() for the
 * s_writers.rw_sem[] semaphores, one entry per freeze level.  The table
 * is only ever read, so both the strings and the pointers are const.
 */
static const char * const sb_writers_name[SB_FREEZE_LEVELS] = {
        "sb_writers",
        "sb_pagefaults",
        "sb_internal",
};

/*
 * Acquire @sb->s_umount: for writing when @excl is true, for reading
 * otherwise.  No superblock state is checked before or after.
 */
static inline void __super_lock(struct super_block *sb, bool excl)
{
        if (!excl) {
                down_read(&sb->s_umount);
                return;
        }

        down_write(&sb->s_umount);
}

/*
 * Release @sb->s_umount.  @excl must say how the lock was taken: true
 * for a write lock, false for a read lock.
 */
static inline void super_unlock(struct super_block *sb, bool excl)
{
        if (!excl) {
                up_read(&sb->s_umount);
                return;
        }

        up_write(&sb->s_umount);
}

/* Take @sb->s_umount for writing; shorthand for __super_lock(sb, true). */
static inline void __super_lock_excl(struct super_block *sb)
{
        __super_lock(sb, true);
}

/* Drop a write-held @sb->s_umount; shorthand for super_unlock(sb, true). */
static inline void super_unlock_excl(struct super_block *sb)
{
        super_unlock(sb, true);
}

/* Drop a read-held @sb->s_umount; shorthand for super_unlock(sb, false). */
static inline void super_unlock_shared(struct super_block *sb)
{
        super_unlock(sb, false);
}

/*
 * Report whether any bit of @flags is set in @sb->s_flags.
 *
 * The read is an acquire load that pairs with the smp_store_release()
 * in super_wake(), which ensures that we see @flags after we're woken.
 */
static bool super_flags(const struct super_block *sb, unsigned int flags)
{
        return (smp_load_acquire(&sb->s_flags) & flags) != 0;
}

/**
 * super_lock - wait for superblock to become ready and lock it
 * @sb: superblock to wait for
 * @excl: whether exclusive access is required
 *
 * If the superblock has passed through neither vfs_get_tree() nor
 * generic_shutdown_super() yet, wait for that to happen. Either
 * superblock creation will succeed and SB_BORN is set by vfs_get_tree()
 * or we're woken and we'll see SB_DYING.
 *
 * The caller must have acquired a temporary reference on @sb->s_count
 * and must not already hold @sb->s_umount.
 *
 * Return: The function returns true if SB_BORN was set and with
 *         s_umount held. The function returns false if SB_DYING was
 *         set and without s_umount held.
 */
static __must_check bool super_lock(struct super_block *sb, bool excl)
{
        lockdep_assert_not_held(&sb->s_umount);

        /* wait until the superblock is ready or dying */
        wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING));

        /* Don't pointlessly acquire s_umount. */
        if (super_flags(sb, SB_DYING))
                return false;

        __super_lock(sb, excl);

        /*
         * Has gone through generic_shutdown_super() in the meantime.
         * @sb->s_root is NULL and @sb->s_active is 0. No one needs to
         * grab a reference to this. Tell them so.
         *
         * This second check is made with s_umount held, so a plain
         * read of s_flags is used instead of super_flags().
         */
        if (sb->s_flags & SB_DYING) {
                super_unlock(sb, excl);
                return false;
        }

        /* Not dying, so the wait above must have ended because of SB_BORN. */
        WARN_ON_ONCE(!(sb->s_flags & SB_BORN));
        return true;
}

/*
 * Wait and try to acquire the read side of @sb->s_umount.
 * Returns what super_lock() returns: true with the lock held, false
 * without it.
 */
static inline bool super_lock_shared(struct super_block *sb)
{
        return super_lock(sb, false);
}

/*
 * Wait and try to acquire the write side of @sb->s_umount.
 * Returns what super_lock() returns: true with the lock held, false
 * without it.
 */
static inline bool super_lock_excl(struct super_block *sb)
{
        return super_lock(sb, true);
}

/*
 * Wake waiters: set exactly one of the SUPER_WAKE_FLAGS in @sb->s_flags
 * and wake everyone sleeping in wait_var_event() on @sb->s_flags.
 * Passing a flag outside SUPER_WAKE_FLAGS, or more than one of them,
 * triggers a one-time warning.
 */
#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD)
static void super_wake(struct super_block *sb, unsigned int flag)
{
        WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS));
        WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1);

        /*
         * Pairs with smp_load_acquire() in super_flags() (as used by
         * super_lock()) to make sure all initializations in the
         * superblock are seen by the user seeing SB_BORN set.
         */
        smp_store_release(&sb->s_flags, sb->s_flags | flag);
        /*
         * Pairs with the barrier in prepare_to_wait_event() to make sure
         * ___wait_var_event() either sees SB_BORN set or
         * waitqueue_active() check in wake_up_var() sees the waiter.
         */
        smp_mb();
        wake_up_var(&sb->s_flags);
}

/*
 * Shrinker ->scan_objects callback for a superblock: split
 * sc->nr_to_scan between the dentry cache, the inode cache and the
 * filesystem's private cache in proportion to their sizes, prune each,
 * and return the number of objects freed (or SHRINK_STOP if the scan
 * cannot run).
 *
 * One thing we have to be careful of with a per-sb shrinker is that we
 * don't drop the last active reference to the superblock from within
 * the shrinker.  If that happens we could trigger unregistering the
 * shrinker from within the shrinker path and that leads to deadlock on
 * the shrinker_mutex.  Hence no active reference is taken here; the
 * superblock is only pinned by try-locking s_umount through
 * super_trylock_shared().
 */
static unsigned long super_cache_scan(struct shrinker *shrink,
                                      struct shrink_control *sc)
{
        struct super_block *sb = shrink->private_data;
        long nr_fs = 0;
        long nr_dentries;
        long nr_inodes;
        long nr_total;
        long reclaimed;

        /*
         * Deadlock avoidance.  We may hold various FS locks, and we don't
         * want to recurse into the FS that called us in clear_inode() and
         * friends..
         */
        if (!(sc->gfp_mask & __GFP_FS))
                return SHRINK_STOP;

        if (!super_trylock_shared(sb))
                return SHRINK_STOP;

        if (sb->s_op->nr_cached_objects)
                nr_fs = sb->s_op->nr_cached_objects(sb, sc);

        nr_inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
        nr_dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);

        /* The divisor used below must never be zero. */
        nr_total = nr_dentries + nr_inodes + nr_fs + 1;
        if (nr_total == 0)
                nr_total = 1;

        /*
         * Proportion the scan between the caches.  All three shares are
         * derived from the caller's nr_to_scan before it is overwritten.
         */
        nr_dentries = mult_frac(sc->nr_to_scan, nr_dentries, nr_total);
        nr_inodes = mult_frac(sc->nr_to_scan, nr_inodes, nr_total);
        nr_fs = mult_frac(sc->nr_to_scan, nr_fs, nr_total);

        /*
         * Prune the dcache first as the icache is pinned by it, then
         * prune the icache, followed by the filesystem specific caches.
         *
         * Ensure that we always scan at least one object - memcg kmem
         * accounting uses this to fully empty the caches.
         */
        sc->nr_to_scan = nr_dentries + 1;
        reclaimed = prune_dcache_sb(sb, sc);

        sc->nr_to_scan = nr_inodes + 1;
        reclaimed += prune_icache_sb(sb, sc);

        if (nr_fs != 0) {
                sc->nr_to_scan = nr_fs + 1;
                reclaimed += sb->s_op->free_cached_objects(sb, sc);
        }

        super_unlock_shared(sb);
        return reclaimed;
}

/*
 * Shrinker ->count_objects callback for a superblock: report how many
 * cached objects (filesystem private + dentries + inodes) could be
 * scanned, scaled by vfs_pressure_ratio().  Returns 0 while the
 * superblock is not SB_BORN and SHRINK_EMPTY when nothing is cached.
 */
static unsigned long super_cache_count(struct shrinker *shrink,
                                       struct shrink_control *sc)
{
        struct super_block *sb = shrink->private_data;
        long nr_objects = 0;

        /*
         * We don't call super_trylock_shared() here as it is a scalability
         * bottleneck, so we're exposed to partial setup state. The shrinker
         * rwsem does not protect filesystem operations backing
         * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can
         * change between super_cache_count and super_cache_scan, so we
         * really don't need locks here.
         *
         * However, if we are currently mounting the superblock, the
         * underlying filesystem might be in a state of partial
         * construction and hence it is dangerous to access it.
         * super_trylock_shared() uses a SB_BORN check to avoid this
         * situation, so do the same here. The read barrier below orders
         * the SB_BORN check before the accesses that follow, as we don't
         * hold locks here.
         * NOTE(review): the earlier wording paired this barrier with
         * mount_fs(); presumably the pairing site is wherever SB_BORN is
         * published - confirm.
         */
        if (!(sb->s_flags & SB_BORN))
                return 0;
        smp_rmb();

        if (sb->s_op && sb->s_op->nr_cached_objects)
                nr_objects = sb->s_op->nr_cached_objects(sb, sc);

        nr_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
        nr_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

        if (nr_objects == 0)
                return SHRINK_EMPTY;

        nr_objects = vfs_pressure_ratio(nr_objects);
        return nr_objects;
}

/*
 * Final teardown of a superblock, run from its ->destroy_work item (or
 * called directly by destroy_unused_super()): release the fsnotify and
 * security state, the user namespace reference, the subtype string and
 * the per-level s_writers semaphores, then free the superblock itself.
 */
static void destroy_super_work(struct work_struct *work)
{
        struct super_block *sb;
        int level;

        sb = container_of(work, struct super_block, destroy_work);

        fsnotify_sb_free(sb);
        security_sb_free(sb);
        put_user_ns(sb->s_user_ns);
        kfree(sb->s_subtype);

        for (level = 0; level < SB_FREEZE_LEVELS; level++)
                percpu_free_rwsem(&sb->s_writers.rw_sem[level]);

        /* Must come last: everything above dereferences sb. */
        kfree(sb);
}

/*
 * RCU callback queued by __put_super() via call_rcu().  It does not
 * free anything itself; it hands the superblock to destroy_super_work()
 * through schedule_work().
 */
static void destroy_super_rcu(struct rcu_head *head)
{
        struct super_block *s = container_of(head, struct super_block, rcu);
        INIT_WORK(&s->destroy_work, destroy_super_work);
        schedule_work(&s->destroy_work);
}

/*
 * Free a superblock that has never been seen by anyone.
 *
 * @s may be NULL, in which case nothing is done.  Otherwise @s must be
 * write-locked on s_umount (as alloc_super() leaves it); the lock is
 * dropped here before the superblock is torn down synchronously.
 */
static void destroy_unused_super(struct super_block *s)
{
        if (!s)
                return;
        super_unlock_excl(s);
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);
        shrinker_free(s->s_shrink);
        /* no delays needed */
        destroy_super_work(&s->destroy_work);
}

/**
 *        alloc_super        -        create new superblock
 *        @type:        filesystem type superblock should belong to
 *        @flags: the mount flags
 *        @user_ns: User namespace for the super_block
 *
 *        Allocates and initializes a new &struct super_block.  alloc_super()
 *        returns a pointer to the new superblock or %NULL if allocation had
 *        failed.  On success the superblock is returned with s_umount held
 *        for writing, s_count set to 1 and s_active set to 1.
 */
static struct super_block *alloc_super(struct file_system_type *type, int flags,
                                       struct user_namespace *user_ns)
{
        struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL);
        /* All-zero operations table used until the filesystem sets its own. */
        static const struct super_operations default_op;
        int i;

        if (!s)
                return NULL;

        INIT_LIST_HEAD(&s->s_mounts);
        s->s_user_ns = get_user_ns(user_ns);
        init_rwsem(&s->s_umount);
        lockdep_set_class(&s->s_umount, &type->s_umount_key);
        /*
         * sget() can have s_umount recursion.
         *
         * When it cannot find a suitable sb, it allocates a new
         * one (this one), and tries again to find a suitable old
         * one.
         *
         * In case that succeeds, it will acquire the s_umount
         * lock of the old one. Since these are clearly distinct
         * locks, and this object isn't exposed yet, there's no
         * risk of deadlocks.
         *
         * Annotate this by putting this lock in a different
         * subclass.
         */
        down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);

        if (security_sb_alloc(s))
                goto fail;

        for (i = 0; i < SB_FREEZE_LEVELS; i++) {
                if (__percpu_init_rwsem(&s->s_writers.rw_sem[i],
                                        sb_writers_name[i],
                                        &type->s_writers_key[i]))
                        goto fail;
        }
        s->s_bdi = &noop_backing_dev_info;
        s->s_flags = flags;
        /* Superblocks created outside the initial user namespace get SB_I_NODEV. */
        if (s->s_user_ns != &init_user_ns)
                s->s_iflags |= SB_I_NODEV;
        INIT_HLIST_NODE(&s->s_instances);
        INIT_HLIST_BL_HEAD(&s->s_roots);
        mutex_init(&s->s_sync_lock);
        INIT_LIST_HEAD(&s->s_inodes);
        spin_lock_init(&s->s_inode_list_lock);
        INIT_LIST_HEAD(&s->s_inodes_wb);
        spin_lock_init(&s->s_inode_wblist_lock);

        s->s_count = 1;
        atomic_set(&s->s_active, 1);
        mutex_init(&s->s_vfs_rename_mutex);
        lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
        init_rwsem(&s->s_dquot.dqio_sem);
        s->s_maxbytes = MAX_NON_LFS;
        s->s_op = &default_op;
        s->s_time_gran = 1000000000;
        s->s_time_min = TIME64_MIN;
        s->s_time_max = TIME64_MAX;

        s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
                                     "sb-%s", type->name);
        if (!s->s_shrink)
                goto fail;

        /* Wire the per-superblock shrinker to the callbacks in this file. */
        s->s_shrink->scan_objects = super_cache_scan;
        s->s_shrink->count_objects = super_cache_count;
        s->s_shrink->batch = 1024;
        s->s_shrink->private_data = s;

        if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink))
                goto fail;
        if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
                goto fail;
        return s;

fail:
        /* Drops s_umount and frees whatever was set up so far. */
        destroy_unused_super(s);
        return NULL;
}

/* Superblock refcounting  */

/*
 * Drop a superblock's refcount.  The caller must hold sb_lock.
 *
 * When the count reaches zero the superblock is unlinked from its
 * s_list and its freeing is deferred through call_rcu().
 */
static void __put_super(struct super_block *s)
{
        s->s_count--;
        if (s->s_count)
                return;

        list_del_init(&s->s_list);
        WARN_ON(s->s_dentry_lru.node);
        WARN_ON(s->s_inode_lru.node);
        WARN_ON(!list_empty(&s->s_mounts));
        call_rcu(&s->rcu, destroy_super_rcu);
}

/**
 *        put_super        -        drop a temporary reference to superblock
 *        @sb: superblock in question
 *
 *        Drops a temporary reference, frees superblock if there's no
 *        references left.  Takes and releases sb_lock itself, so the
 *        caller must not hold it.
 */
void put_super(struct super_block *sb)
{
        spin_lock(&sb_lock);
        __put_super(sb);
        spin_unlock(&sb_lock);
}

/*
 * Unlink @sb from its ->s_instances list and wake waiters with SB_DEAD.
 * Must be called without @sb->s_umount held.  Does nothing if SB_DEAD
 * is already set, so repeated calls are harmless.
 */
static void kill_super_notify(struct super_block *sb)
{
        lockdep_assert_not_held(&sb->s_umount);

        /* already notified earlier */
        if (sb->s_flags & SB_DEAD)
                return;

        /*
         * Remove it from @fs_supers so it isn't found by new
         * sget{_fc}() walkers anymore. Any concurrent mounter still
         * managing to grab a temporary reference is guaranteed to
         * already see SB_DYING and will wait until we notify them about
         * SB_DEAD.
         */
        spin_lock(&sb_lock);
        hlist_del_init(&sb->s_instances);
        spin_unlock(&sb_lock);

        /*
         * Let concurrent mounts know that this thing is really dead.
         * We don't need @sb->s_umount here as every concurrent caller
         * will see SB_DYING and either discard the superblock or wait
         * for SB_DEAD.
         */
        super_wake(sb, SB_DEAD);
}

/**
 *        deactivate_locked_super        -        drop an active reference to superblock
 *        @s: superblock to deactivate
 *
 *        Drops an active reference to superblock, converting it into a temporary
 *        one if there is no other active references left.  In that case we
 *        tell fs driver to shut it down and drop the temporary reference we
 *        had just acquired.
 *
 *        Caller holds exclusive lock on superblock.  If other active
 *        references remain, the lock is released here; otherwise the
 *        superblock is handed to the filesystem's ->kill_sb().
 */
void deactivate_locked_super(struct super_block *s)
{
        struct file_system_type *fs = s->s_type;

        /* Not the last active reference: just unlock and leave. */
        if (!atomic_dec_and_test(&s->s_active)) {
                super_unlock_excl(s);
                return;
        }

        shrinker_free(s->s_shrink);
        fs->kill_sb(s);

        kill_super_notify(s);

        /*
         * Since list_lru_destroy() may sleep, we cannot call it from
         * put_super(), where we hold the sb_lock. Therefore we destroy
         * the lru lists right now.
         */
        list_lru_destroy(&s->s_dentry_lru);
        list_lru_destroy(&s->s_inode_lru);

        put_filesystem(fs);
        put_super(s);
}

EXPORT_SYMBOL(deactivate_locked_super);

/**
 *        deactivate_super        -        drop an active reference to superblock
 *        @s: superblock to deactivate
 *
 *        Variant of deactivate_locked_super(), except that superblock is *not*
 *        locked by caller.  If we are going to drop the final active reference,
 *        lock will be acquired prior to that.
 */
void deactivate_super(struct super_block *s)
{
        /* Fast path: s_active was not 1, so it was decremented lock-free. */
        if (atomic_add_unless(&s->s_active, -1, 1))
                return;

        __super_lock_excl(s);
        deactivate_locked_super(s);
}

EXPORT_SYMBOL(deactivate_super);

/**
 * grab_super - acquire an active reference to a superblock
 * @sb: superblock to acquire
 *
 * Acquire a temporary reference on a superblock and try to trade it for
 * an active reference. This is used in sget{_fc}() to wait for a
 * superblock to either become SB_BORN or for it to pass through
 * ->kill_sb() and be marked as SB_DEAD.
 *
 * Entered with sb_lock held; sb_lock is released before waiting and is
 * not held on return.
 *
 * Return: This returns true if an active reference could be acquired
 *         (s_umount is then held for writing), false if not.
 */
static bool grab_super(struct super_block *sb)
{
        /* Temporary reference keeps @sb allocated while we wait. */
        sb->s_count++;
        spin_unlock(&sb_lock);

        if (super_lock_excl(sb)) {
                if (atomic_inc_not_zero(&sb->s_active)) {
                        put_super(sb);
                        return true;
                }
                super_unlock_excl(sb);
        }

        /* No active reference to be had: wait until it is marked dead. */
        wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD));
        put_super(sb);
        return false;
}

/*
 *        super_trylock_shared - try to grab ->s_umount shared
 *        @sb: reference we are trying to grab
 *
 *        Try to prevent fs shutdown.  This is used in places where we
 *        cannot take an active reference but we need to ensure that the
 *        filesystem is not shut down while we are working on it. It returns
 *        false if we cannot acquire s_umount or if we lose the race and
 *        filesystem already got into shutdown, and returns true with the
 *        s_umount lock held in read mode in case of success. On successful
 *        return, the caller must drop the s_umount lock when done.
 *
 *        Note that unlike get_super() et.al. this one does *not* bump
 *        ->s_count.  The reason why it's safe is that we are OK with doing
 *        trylock instead of down_read().  There's a couple of places that
 *        are OK with that, but it's very much not a general-purpose
 *        interface.
 */
bool super_trylock_shared(struct super_block *sb)
{
        if (!down_read_trylock(&sb->s_umount))
                return false;

        /* Usable only if not dying, with a root, and fully born. */
        if (!(sb->s_flags & SB_DYING) && sb->s_root &&
            (sb->s_flags & SB_BORN))
                return true;

        super_unlock_shared(sb);
        return false;
}

/**
 *        retire_super        -        prevents superblock from being reused
 *        @sb: superblock to retire
 *
 *        The function marks superblock to be ignored in superblock test, which
 *        prevents it from being reused for any new mounts.  If the superblock has
 *        a private bdi, it also unregisters it, but doesn't reduce the refcount
 *        of the superblock to prevent potential races.  The refcount is reduced
 *        by generic_shutdown_super().  The function can not be called
 *        concurrently with generic_shutdown_super().  It is safe to call the
 *        function multiple times, subsequent calls have no effect.
 *
 *        The marker will affect the re-use only for block-device-based
 *        superblocks.  Other superblocks will still get marked if this function
 *        is used, but that will not affect their reusability.
 *
 *        Takes and releases @sb->s_umount for writing, so the caller must not
 *        hold it.
 */
void retire_super(struct super_block *sb)
{
        /* Warn when there is no block device; the marking still happens. */
        WARN_ON(!sb->s_bdev);
        __super_lock_excl(sb);
        if (sb->s_iflags & SB_I_PERSB_BDI) {
                bdi_unregister(sb->s_bdi);
                /* Clearing the flag keeps a repeated call from unregistering again. */
                sb->s_iflags &= ~SB_I_PERSB_BDI;
        }
        sb->s_iflags |= SB_I_RETIRED;
        super_unlock_excl(sb);
}
EXPORT_SYMBOL(retire_super);

/**
 *        generic_shutdown_super        -        common helper for ->kill_sb()
 *        @sb: superblock to kill
 *
 *        generic_shutdown_super() does all fs-independent work on superblock
 *        shutdown.  Typical ->kill_sb() should pick all fs-specific objects
 *        that need destruction out of superblock, call generic_shutdown_super()
 *        and release aforementioned objects.  Note: dentries and inodes _are_
 *        taken care of and do not need specific handling.
 *
 *        Upon calling this function, the filesystem may no longer alter or
 *        rearrange the set of dentries belonging to this super_block, nor may it
 *        change the attachments of dentries to inodes.
 *
 *        NOTE(review): s_umount is released here via super_unlock_excl() without
 *        being taken in this function, so the caller presumably enters with it
 *        held exclusively -- confirm against the ->kill_sb() call path.
 */
void generic_shutdown_super(struct super_block *sb)
{
        const struct super_operations *sop = sb->s_op;

        /* The teardown below only applies to a superblock that got a root. */
        if (sb->s_root) {
                shrink_dcache_for_umount(sb);
                sync_filesystem(sb);
                sb->s_flags &= ~SB_ACTIVE;

                cgroup_writeback_umount(sb);

                /* Evict all inodes with zero refcount. */
                evict_inodes(sb);

                /*
                 * Clean up and evict any inodes that still have references due
                 * to fsnotify or the security policy.
                 */
                fsnotify_sb_delete(sb);
                security_sb_delete(sb);

                if (sb->s_dio_done_wq) {
                        destroy_workqueue(sb->s_dio_done_wq);
                        sb->s_dio_done_wq = NULL;
                }

                /* Filesystem-specific teardown hook, if the fs provides one. */
                if (sop->put_super)
                        sop->put_super(sb);

                /*
                 * Now that all potentially-encrypted inodes have been evicted,
                 * the fscrypt keyring can be destroyed.
                 */
                fscrypt_destroy_keyring(sb);

                if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), NULL,
                                "VFS: Busy inodes after unmount of %s (%s)",
                                sb->s_id, sb->s_type->name)) {
                        /*
                         * Adding a proper bailout path here would be hard, but
                         * we can at least make it more likely that a later
                         * iput_final() or such crashes cleanly.
                         */
                        struct inode *inode;

                        spin_lock(&sb->s_inode_list_lock);
                        list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
                                inode->i_op = VFS_PTR_POISON;
                                inode->i_sb = VFS_PTR_POISON;
                                inode->i_mapping = VFS_PTR_POISON;
                        }
                        spin_unlock(&sb->s_inode_list_lock);
                }
        }
        /*
         * Broadcast to everyone that grabbed a temporary reference to this
         * superblock before we removed it from @fs_supers that the superblock
         * is dying. Every walker of @fs_supers outside of sget{_fc}() will now
         * discard this superblock and treat it as dead.
         *
         * We leave the superblock on @fs_supers so it can be found by
         * sget{_fc}() until we passed sb->kill_sb().
         */
        super_wake(sb, SB_DYING);
        super_unlock_excl(sb);
        /*
         * Drop the backing-dev reference and point s_bdi back at the no-op
         * bdi. A per-superblock bdi is additionally unregistered, unless
         * retire_super() already did so and cleared SB_I_PERSB_BDI.
         */
        if (sb->s_bdi != &noop_backing_dev_info) {
                if (sb->s_iflags & SB_I_PERSB_BDI)
                        bdi_unregister(sb->s_bdi);
                bdi_put(sb->s_bdi);
                sb->s_bdi = &noop_backing_dev_info;
        }
}

EXPORT_SYMBOL(generic_shutdown_super);

/**
 * mount_capable - check whether the caller may mount through this context
 * @fc: filesystem context of the mount being attempted
 *
 * Filesystem types flagged FS_USERNS_MOUNT are checked for CAP_SYS_ADMIN
 * relative to @fc->user_ns via ns_capable(); every other type is checked
 * with plain capable().
 *
 * Return: the result of the capability check.
 */
bool mount_capable(struct fs_context *fc)
{
        if (fc->fs_type->fs_flags & FS_USERNS_MOUNT)
                return ns_capable(fc->user_ns, CAP_SYS_ADMIN);

        return capable(CAP_SYS_ADMIN);
}

/**
 * sget_fc - Find or create a superblock
 * @fc:        Filesystem context.
 * @test: Comparison callback
 * @set: Setup callback
 *
 * Create a new superblock or find an existing one.
 *
 * The @test callback is used to find a matching existing superblock.
 * Whether or not the requested parameters in @fc are taken into account
 * is specific to the @test callback that is used. They may even be
 * completely ignored. If @test is NULL no lookup is done and a new
 * superblock is always created.
 *
 * If an extant superblock is matched, it will be returned unless:
 *
 * (1) the namespace the filesystem context @fc and the extant
 *     superblock's namespace differ
 *
 * (2) the filesystem context @fc has requested that reusing an extant
 *     superblock is not allowed
 *
 * In both cases EBUSY will be returned.
 *
 * If no match is made, a new superblock will be allocated and basic
 * initialisation will be performed (s_type, s_fs_info and s_id will be
 * set and the @set callback will be invoked), the superblock will be
 * published and it will be returned in a partially constructed state
 * with SB_BORN and SB_ACTIVE as yet unset.
 *
 * Return: On success, an extant or newly created superblock is
 *         returned. On failure an error pointer is returned: -EPERM for
 *         a non-initial user namespace on a filesystem type without
 *         FS_USERNS_MOUNT, -ENOMEM if allocation fails, -EBUSY as
 *         described above, or the error returned by @set.
 */
struct super_block *sget_fc(struct fs_context *fc,
                            int (*test)(struct super_block *, struct fs_context *),
                            int (*set)(struct super_block *, struct fs_context *))
{
        struct super_block *s = NULL;
        struct super_block *old;
        struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns;
        int err;

        /*
         * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is
         * not set, as the filesystem is likely unprepared to handle it.
         * This can happen when fsconfig() is called from init_user_ns with
         * an fs_fd opened in another user namespace.
         */
        if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) {
                errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed");
                return ERR_PTR(-EPERM);
        }

        /*
         * The search is (re)started here: on first entry, after a candidate
         * superblock has been allocated, and when grabbing a match failed.
         */
retry:
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) {
                        if (test(old, fc))
                                goto share_extant_sb;
                }
        }
        /*
         * Nothing matched and there is no candidate yet: allocate one with
         * sb_lock dropped, then rescan from the top.
         */
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        /* @set runs under sb_lock and sees s_fs_info already filled in. */
        s->s_fs_info = fc->s_fs_info;
        err = set(s, fc);
        if (err) {
                s->s_fs_info = NULL;
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        /* The superblock carries s_fs_info now; clear the context's copy. */
        fc->s_fs_info = NULL;
        s->s_type = fc->fs_type;
        s->s_iflags |= fc->s_iflags;
        strscpy(s->s_id, s->s_type->name, sizeof(s->s_id));
        /*
         * Make the superblock visible on @super_blocks and @fs_supers.
         * It's in a nascent state and users should wait on SB_BORN or
         * SB_DYING to be set.
         */
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &s->s_type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(s->s_type);
        shrinker_register(s->s_shrink);
        return s;

share_extant_sb:
        /*
         * @s is still NULL here if the match was found on the first pass;
         * NOTE(review): destroy_unused_super() is presumably a no-op for
         * NULL -- confirm.
         */
        if (user_ns != old->s_user_ns || fc->exclusive) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                if (fc->exclusive)
                        warnfc(fc, "reusing existing filesystem not allowed");
                else
                        warnfc(fc, "reusing existing filesystem in another namespace not allowed");
                return ERR_PTR(-EBUSY);
        }
        /*
         * NOTE(review): sb_lock is not released on this path, yet "retry"
         * takes it again, so grab_super() presumably drops it in both
         * outcomes -- confirm.
         */
        if (!grab_super(old))
                goto retry;
        destroy_unused_super(s);
        return old;
}
EXPORT_SYMBOL(sget_fc);

/**
 *        sget        -        find or create a superblock
 *        @type:          filesystem type superblock should belong to
 *        @test:          comparison callback
 *        @set:          setup callback
 *        @flags:          mount flags
 *        @data:          argument to each of them
 *
 *        If @test is non-NULL, the superblocks of @type are searched for one
 *        that @test accepts.  A match living in a different user namespace
 *        than the caller's yields -EBUSY.  Without a match a new superblock is
 *        allocated, initialised through @set and published.
 *
 *        Return: an existing or newly created superblock on success, or an
 *        error pointer (-EBUSY, -ENOMEM or the error returned by @set).
 */
struct super_block *sget(struct file_system_type *type,
                        int (*test)(struct super_block *,void *),
                        int (*set)(struct super_block *,void *),
                        int flags,
                        void *data)
{
        struct user_namespace *user_ns = current_user_ns();
        struct super_block *s = NULL;
        struct super_block *old;
        int err;

        /* We don't yet pass the user namespace of the parent
         * mount through to here so always use &init_user_ns
         * until that changes.
         */
        if (flags & SB_SUBMOUNT)
                user_ns = &init_user_ns;

        /* Restarted after allocating a candidate or when grab_super() fails. */
retry:
        spin_lock(&sb_lock);
        if (test) {
                hlist_for_each_entry(old, &type->fs_supers, s_instances) {
                        if (!test(old, data))
                                continue;
                        if (user_ns != old->s_user_ns) {
                                spin_unlock(&sb_lock);
                                destroy_unused_super(s);
                                return ERR_PTR(-EBUSY);
                        }
                        /*
                         * NOTE(review): sb_lock is not released here before
                         * retrying or returning, so grab_super() presumably
                         * drops it -- confirm.
                         */
                        if (!grab_super(old))
                                goto retry;
                        destroy_unused_super(s);
                        return old;
                }
        }
        /* No match and no candidate yet: allocate unlocked, then rescan. */
        if (!s) {
                spin_unlock(&sb_lock);
                s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
                if (!s)
                        return ERR_PTR(-ENOMEM);
                goto retry;
        }

        /* @set is invoked with sb_lock held. */
        err = set(s, data);
        if (err) {
                spin_unlock(&sb_lock);
                destroy_unused_super(s);
                return ERR_PTR(err);
        }
        s->s_type = type;
        strscpy(s->s_id, type->name, sizeof(s->s_id));
        /* Publish the new superblock on the global and per-type lists. */
        list_add_tail(&s->s_list, &super_blocks);
        hlist_add_head(&s->s_instances, &type->fs_supers);
        spin_unlock(&sb_lock);
        get_filesystem(type);
        shrinker_register(s->s_shrink);
        return s;
}
EXPORT_SYMBOL(sget);

/**
 *        drop_super - release a superblock held with a shared s_umount
 *        @sb: superblock to release
 *
 *        Drops the shared s_umount lock on @sb and then puts the superblock
 *        via put_super().
 */
void drop_super(struct super_block *sb)
{
        super_unlock_shared(sb);
        put_super(sb);
}

EXPORT_SYMBOL(drop_super);

/**
 *        drop_super_exclusive - release a superblock held with an exclusive s_umount
 *        @sb: superblock to release
 *
 *        Drops the exclusive s_umount lock on @sb and then puts the superblock
 *        via put_super().
 */
void drop_super_exclusive(struct super_block *sb)
{
        super_unlock_excl(sb);
        put_super(sb);
}
EXPORT_SYMBOL(drop_super_exclusive);

/*
 * Walk @super_blocks and call @f on every superblock that is not flagged
 * SB_DYING.  @f runs with sb_lock dropped and without this function taking
 * s_umount; the callback does whatever locking it needs itself.
 */
static void __iterate_supers(void (*f)(struct super_block *))
{
        struct super_block *sb;
        struct super_block *prev = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (super_flags(sb, SB_DYING))
                        continue;

                /* Hold a temporary reference across the unlocked section. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                f(sb);

                spin_lock(&sb_lock);
                /*
                 * The reference on the previous superblock is only dropped
                 * now; the current one is kept until the next round.
                 */
                if (prev)
                        __put_super(prev);
                prev = sb;
        }
        /* Release the reference still held on the last superblock visited. */
        if (prev)
                __put_super(prev);
        spin_unlock(&sb_lock);
}
/**
 *        iterate_supers - call function for all active superblocks
 *        @f: function to call
 *        @arg: argument to pass to it
 *
 *        Walks the global superblock list.  For each entry whose s_umount can
 *        be taken shared and which still has ->s_root set, @f is invoked with
 *        that lock held and with @arg passed through.
 */
void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb;
        struct super_block *prev = NULL;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                /* Hold a temporary reference while sb_lock is dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                if (super_lock_shared(sb)) {
                        if (sb->s_root)
                                f(sb, arg);
                        super_unlock_shared(sb);
                }

                spin_lock(&sb_lock);
                /* Drop the previous entry's reference, keep the current one. */
                if (prev)
                        __put_super(prev);
                prev = sb;
        }
        if (prev)
                __put_super(prev);
        spin_unlock(&sb_lock);
}

/**
 *        iterate_supers_type - call function for superblocks of given type
 *        @type: fs type
 *        @f: function to call
 *        @arg: argument to pass to it
 *
 *        Walks the superblocks on @type's ->fs_supers list.  For each entry
 *        whose s_umount can be taken shared and which still has ->s_root set,
 *        @f is invoked with that lock held and with @arg passed through.
 */
void iterate_supers_type(struct file_system_type *type,
        void (*f)(struct super_block *, void *), void *arg)
{
        struct super_block *sb;
        struct super_block *prev = NULL;

        spin_lock(&sb_lock);
        hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
                /* Hold a temporary reference while sb_lock is dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                if (super_lock_shared(sb)) {
                        if (sb->s_root)
                                f(sb, arg);
                        super_unlock_shared(sb);
                }

                spin_lock(&sb_lock);
                /* Drop the previous entry's reference, keep the current one. */
                if (prev)
                        __put_super(prev);
                prev = sb;
        }
        if (prev)
                __put_super(prev);
        spin_unlock(&sb_lock);
}

EXPORT_SYMBOL(iterate_supers_type);

/**
 * user_get_super - look up a live superblock by device number
 * @dev: device number to search for
 * @excl: take s_umount exclusively if true, shared otherwise
 *
 * Only the first superblock on the global list whose ->s_dev equals @dev is
 * considered.
 *
 * Return: that superblock with s_umount held in the requested mode and its
 * ->s_count raised, or NULL if there is no such superblock, it could not be
 * locked, or it no longer has a root.
 */
struct super_block *user_get_super(dev_t dev, bool excl)
{
        struct super_block *sb;

        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (sb->s_dev != dev)
                        continue;

                /* Hold a temporary reference while sb_lock is dropped. */
                sb->s_count++;
                spin_unlock(&sb_lock);

                /* still alive? */
                if (super_lock(sb, excl)) {
                        if (sb->s_root)
                                return sb;
                        super_unlock(sb, excl);
                }

                /* nope, got unmounted */
                spin_lock(&sb_lock);
                __put_super(sb);
                break;
        }
        spin_unlock(&sb_lock);
        return NULL;
}

/**
 * reconfigure_super - asks filesystem to change superblock parameters
 * @fc: The superblock and configuration
 *
 * Alters the configuration parameters of a live superblock.
 *
 * NOTE(review): s_umount is dropped and re-taken exclusively around
 * group_pin_kill(), so the caller presumably holds it exclusively on
 * entry -- confirm against callers.
 *
 * Return: 0 on success (also when the superblock lost its root while the
 * lock was dropped), -EINVAL for flags outside MS_RMT_MASK, -EBUSY if the
 * superblock is frozen, -EACCES when asking for read-write on a read-only
 * block device, or an error from the security hook, the read-only
 * preparation or the filesystem's ->reconfigure() (the latter is ignored
 * for forced remounts).
 */
int reconfigure_super(struct fs_context *fc)
{
        struct super_block *sb = fc->root->d_sb;
        int retval;
        bool remount_ro = false;
        bool remount_rw = false;
        bool force = fc->sb_flags & SB_FORCE;

        if (fc->sb_flags_mask & ~MS_RMT_MASK)
                return -EINVAL;
        if (sb->s_writers.frozen != SB_UNFROZEN)
                return -EBUSY;

        retval = security_sb_remount(sb, fc->security);
        if (retval)
                return retval;

        /* Work out whether the RO/RW state actually changes. */
        if (fc->sb_flags_mask & SB_RDONLY) {
#ifdef CONFIG_BLOCK
                if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev &&
                    bdev_read_only(sb->s_bdev))
                        return -EACCES;
#endif
                remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb);
                remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
        }

        if (remount_ro) {
                if (!hlist_empty(&sb->s_pins)) {
                        /*
                         * group_pin_kill() runs without s_umount; the state
                         * checked above may have changed by the time the
                         * lock is re-taken, so it is re-validated below.
                         */
                        super_unlock_excl(sb);
                        group_pin_kill(&sb->s_pins);
                        __super_lock_excl(sb);
                        if (!sb->s_root)
                                return 0;
                        if (sb->s_writers.frozen != SB_UNFROZEN)
                                return -EBUSY;
                        remount_ro = !sb_rdonly(sb);
                }
        }
        shrink_dcache_sb(sb);

        /* If we are reconfiguring to RDONLY and current sb is read/write,
         * make sure there are no files open for writing.
         */
        if (remount_ro) {
                if (force) {
                        sb_start_ro_state_change(sb);
                } else {
                        retval = sb_prepare_remount_readonly(sb);
                        if (retval)
                                return retval;
                }
        } else if (remount_rw) {
                /*
                 * Protect filesystem's reconfigure code from writes from
                 * userspace until reconfigure finishes.
                 */
                sb_start_ro_state_change(sb);
        }

        if (fc->ops->reconfigure) {
                retval = fc->ops->reconfigure(fc);
                if (retval) {
                        if (!force)
                                goto cancel_readonly;
                        /* If forced remount, go ahead despite any errors */
                        WARN(1, "forced remount of a %s fs returned %i\n",
                             sb->s_type->name, retval);
                }
        }

        /* Replace only the flag bits selected by sb_flags_mask. */
        WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) |
                                 (fc->sb_flags & fc->sb_flags_mask)));
        sb_end_ro_state_change(sb);

        /*
         * Some filesystems modify their metadata via some other path than the
         * bdev buffer cache (eg. use a private mapping, or directories in
         * pagecache, etc). Also file data modifications go via their own
         * mappings. So If we try to mount readonly then copy the filesystem
         * from bdev, we could get stale data, so invalidate it to give a best
         * effort at coherency.
         */
        if (remount_ro && sb->s_bdev)
                invalidate_bdev(sb->s_bdev);
        return 0;

        /* ->reconfigure() failed on a non-forced remount: end the RO state change. */
cancel_readonly:
        sb_end_ro_state_change(sb);
        return retval;
}

/*
 * Per-superblock step of the emergency remount: force a read-only
 * reconfigure of every writable, block-device-backed superblock that still
 * has a root.  Failures are ignored.
 */
static void do_emergency_remount_callback(struct super_block *sb)
{
        struct fs_context *fc;

        if (!super_lock_excl(sb))
                return;

        if (sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
                fc = fs_context_for_reconfigure(sb->s_root,
                                        SB_RDONLY | SB_FORCE, SB_RDONLY);
                if (!IS_ERR(fc)) {
                        if (parse_monolithic_mount_data(fc, NULL) == 0)
                                (void)reconfigure_super(fc);
                        put_fs_context(fc);
                }
        }

        super_unlock_excl(sb);
}

/*
 * Work item body queued by emergency_remount(): remount every eligible
 * superblock read-only, then free the work item allocated by the caller.
 */
static void do_emergency_remount(struct work_struct *work)
{
        __iterate_supers(do_emergency_remount_callback);
        /* @work was kmalloc()ed in emergency_remount() and is owned by us now. */
        kfree(work);
        printk("Emergency Remount complete\n");
}

/**
 * emergency_remount - schedule a forced read-only remount of all filesystems
 *
 * Allocates a work item with GFP_ATOMIC and queues do_emergency_remount()
 * on it.  If the allocation fails nothing is scheduled and no error is
 * reported.
 */
void emergency_remount(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        INIT_WORK(work, do_emergency_remount);
        schedule_work(work);
}

/*
 * Per-superblock step of the emergency thaw: thaw the backing block device
 * (when CONFIG_BLOCK is enabled) and then the superblock itself.
 */
static void do_thaw_all_callback(struct super_block *sb)
{
        if (!super_lock_excl(sb))
                return;

        if (!sb->s_root) {
                super_unlock_excl(sb);
                return;
        }

        if (IS_ENABLED(CONFIG_BLOCK)) {
                /* Repeat for as long as bdev_thaw() keeps returning 0. */
                while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
                        pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
        }

        /*
         * No unlock follows on this path; NOTE(review): thaw_super_locked()
         * presumably releases s_umount itself -- confirm.
         */
        thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
}

/*
 * Work item body queued by emergency_thaw_all(): thaw every superblock,
 * then free the work item allocated by the caller.
 */
static void do_thaw_all(struct work_struct *work)
{
        __iterate_supers(do_thaw_all_callback);
        /* @work was kmalloc()ed in emergency_thaw_all() and is owned by us now. */
        kfree(work);
        printk(KERN_WARNING "Emergency Thaw complete\n");
}

/**
 * emergency_thaw_all -- forcibly thaw every frozen filesystem
 *
 * Used for emergency unfreeze of all filesystems via SysRq.  The actual
 * work is deferred to a work item allocated with GFP_ATOMIC; if that
 * allocation fails nothing is scheduled.
 */
void emergency_thaw_all(void)
{
        struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

        if (!work)
                return;

        INIT_WORK(work, do_thaw_all);
        schedule_work(work);
}

/* Allocator for the minor numbers handed out by get_anon_bdev(). */
static DEFINE_IDA(unnamed_dev_ida);

/**
 * get_anon_bdev - Allocate a block device for filesystems which don't have one.
 * @p: Pointer to a dev_t.
 *
 * Filesystems which don't use real block devices can call this function
 * to allocate a virtual block device.  On success *@p is set to a device
 * number with major 0 and a minor in the range 1 .. (1 << MINORBITS) - 1.
 *
 * Context: Any context.  Frequently called while holding sb_lock.
 * Return: 0 on success, -EMFILE if there are no anonymous bdevs left
 * or -ENOMEM if memory allocation failed.
 */
int get_anon_bdev(dev_t *p)
{
        /*
         * Many userspace utilities consider an FSID of 0 invalid, so the
         * allocation range starts at 1.
         */
        int minor = ida_alloc_range(&unnamed_dev_ida, 1,
                                    (1 << MINORBITS) - 1, GFP_ATOMIC);

        /* An exhausted range is reported to callers as -EMFILE. */
        if (minor < 0)
                return minor == -ENOSPC ? -EMFILE : minor;

        *p = MKDEV(0, minor);
        return 0;
}
EXPORT_SYMBOL(get_anon_bdev);

/**
 * free_anon_bdev - release a device number obtained from get_anon_bdev()
 * @dev: device number to release
 *
 * Returns the minor of @dev to the anonymous device allocator.
 */
void free_anon_bdev(dev_t dev)
{
        ida_free(&unnamed_dev_ida, MINOR(dev));
}
EXPORT_SYMBOL(free_anon_bdev);

/**
 * set_anon_super - sget() @set callback assigning an anonymous device
 * @s: superblock being set up
 * @data: unused
 *
 * Return: the result of get_anon_bdev() on @s->s_dev.
 */
int set_anon_super(struct super_block *s, void *data)
{
        return get_anon_bdev(&s->s_dev);
}
EXPORT_SYMBOL(set_anon_super);

/**
 * kill_anon_super - shut down a superblock that uses an anonymous device
 * @sb: superblock to kill
 *
 * Runs generic_shutdown_super() and kill_super_notify() on @sb and then
 * returns its device number to the anonymous device allocator.
 */
void kill_anon_super(struct super_block *sb)
{
        /* Sampled up front; the number is released only after the shutdown calls. */
        dev_t dev = sb->s_dev;
        generic_shutdown_super(sb);
        kill_super_notify(sb);
        free_anon_bdev(dev);
}
EXPORT_SYMBOL(kill_anon_super);

/**
 * kill_litter_super - kill_anon_super() preceded by d_genocide() on the root
 * @sb: superblock to kill
 *
 * If @sb has a root dentry, d_genocide() is run on it before the
 * superblock is handed to kill_anon_super().
 */
void kill_litter_super(struct super_block *sb)
{
        if (sb->s_root)
                d_genocide(sb->s_root);
        kill_anon_super(sb);
}
EXPORT_SYMBOL(kill_litter_super);

/**
 * set_anon_super_fc - sget_fc() @set callback assigning an anonymous device
 * @sb: superblock being set up
 * @fc: filesystem context (unused)
 *
 * Return: the result of set_anon_super() on @sb.
 */
int set_anon_super_fc(struct super_block *sb, struct fs_context *fc)
{
        return set_anon_super(sb, NULL);
}
EXPORT_SYMBOL(set_anon_super_fc);

/*
 * sget_fc() @test callback: a superblock matches when its s_fs_info pointer
 * equals the one stored in the context (see get_tree_keyed()).
 */
static int test_keyed_super(struct super_block *sb, struct fs_context *fc)
{
        return sb->s_fs_info == fc->s_fs_info;
}

/*
 * sget_fc() @test callback that accepts any superblock, so the first
 * existing superblock of the filesystem type is always treated as a match.
 */
static int test_single_super(struct super_block *s, struct fs_context *fc)
{
        return 1;
}

/*
 * Common helper behind get_tree_nodev(), get_tree_single() and
 * get_tree_keyed(): obtain a superblock through sget_fc() with an anonymous
 * device, fill it in if it has no root yet, and attach its root to @fc.
 *
 * Returns 0 on success, the error from sget_fc(), or the error from
 * @fill_super (in which case the superblock is deactivated again).
 */
static int vfs_get_super(struct fs_context *fc,
                int (*test)(struct super_block *, struct fs_context *),
                int (*fill_super)(struct super_block *sb,
                                  struct fs_context *fc))
{
        struct super_block *sb = sget_fc(fc, test, set_anon_super_fc);
        int err;

        if (IS_ERR(sb))
                return PTR_ERR(sb);

        /* A superblock without a root has not been filled in yet. */
        if (!sb->s_root) {
                err = fill_super(sb, fc);
                if (err) {
                        deactivate_locked_super(sb);
                        return err;
                }

                sb->s_flags |= SB_ACTIVE;
        }

        fc->root = dget(sb->s_root);
        return 0;
}

/**
 * get_tree_nodev - get a new superblock that is never shared
 * @fc: filesystem context
 * @fill_super: helper to initialise the new superblock
 *
 * No @test callback is passed down, so sget_fc() does not look for an
 * existing superblock and a new one is created on every call.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int get_tree_nodev(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        return vfs_get_super(fc, NULL, fill_super);
}
EXPORT_SYMBOL(get_tree_nodev);

/**
 * get_tree_single - get the single superblock of a filesystem type
 * @fc: filesystem context
 * @fill_super: helper to initialise a new superblock
 *
 * Uses a @test callback that matches any existing superblock of the
 * filesystem type, so a new one is created only when none exists.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int get_tree_single(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc))
{
        return vfs_get_super(fc, test_single_super, fill_super);
}
EXPORT_SYMBOL(get_tree_single);

/**
 * get_tree_keyed - get a superblock identified by a key pointer
 * @fc: filesystem context
 * @fill_super: helper to initialise a new superblock
 * @key: key stored in @fc->s_fs_info
 *
 * An existing superblock matches when its ->s_fs_info equals @key;
 * otherwise a new superblock is created.
 *
 * Return: 0 on success, a negative error code otherwise.
 */
int get_tree_keyed(struct fs_context *fc,
                  int (*fill_super)(struct super_block *sb,
                                    struct fs_context *fc),
                void *key)
{
        /* test_keyed_super() compares against this; sget_fc() copies it to the sb. */
        fc->s_fs_info = key;
        return vfs_get_super(fc, test_keyed_super, fill_super);
}
EXPORT_SYMBOL(get_tree_keyed);

/*
 * @set callback: store the device number that @data points to (a dev_t) in
 * the superblock.  Always returns 0.
 */
static int set_bdev_super(struct super_block *s, void *data)
{
        s->s_dev = *(dev_t *)data;
        return 0;
}

/*
 * sget_fc() @set callback: set s_dev from the dev_t that fc->sget_key
 * points to.
 */
static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
{
        return set_bdev_super(s, fc->sget_key);
}

/*
 * sget_fc() @test callback: match on device number, but never match a
 * superblock that retire_super() has flagged SB_I_RETIRED.
 */
static int super_s_dev_test(struct super_block *s, struct fs_context *fc)
{
        if (s->s_iflags & SB_I_RETIRED)
                return 0;

        return s->s_dev == *(dev_t *)fc->sget_key;
}

/**
 * sget_dev - Find or create a superblock by device number
 * @fc: Filesystem context.
 * @dev: device number
 *
 * Find or create a superblock using the provided device number that
 * will be stored in fc->sget_key.
 *
 * If an extant superblock is matched, then that will be returned with
 * an elevated reference count that the caller must transfer or discard.
 *
 * If no match is made, a new superblock will be allocated and basic
 * initialisation will be performed (s_type, s_fs_info, s_id, s_dev will
 * be set). The superblock will be published and it will be returned in
 * a partially constructed state with SB_BORN and SB_ACTIVE as yet
 * unset.
 *
 * Return: an existing or newly created superblock on success, an error
 *         pointer on failure.
 */
struct super_block *sget_dev(struct fs_context *fc, dev_t dev)
{
        /*
         * The key is the address of this function's own @dev parameter, so
         * fc->sget_key is only meaningful for the duration of the sget_fc()
         * call below and must not be dereferenced after we return.
         */
        fc->sget_key = &dev;
        return sget_fc(fc, super_s_dev_test, super_s_dev_set);
}
EXPORT_SYMBOL(sget_dev);

#ifdef CONFIG_BLOCK
/*
 * Lock the superblock that is holder of the bdev. Returns the superblock
 * pointer if we successfully locked the superblock and it is alive. Otherwise
 * we return NULL and just unlock bdev->bd_holder_lock.
 *
 * The function must be called with bdev->bd_holder_lock and releases it.
 *
 * @excl selects an exclusive (true) or shared (false) s_umount; on success
 * the caller owns that lock and must release it in the matching mode.
 */
static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
        __releases(&bdev->bd_holder_lock)
{
        struct super_block *sb = bdev->bd_holder;
        bool locked;

        lockdep_assert_held(&bdev->bd_holder_lock);
        lockdep_assert_not_held(&sb->s_umount);
        lockdep_assert_not_held(&bdev->bd_disk->open_mutex);

        /* Make sure sb doesn't go away from under us */
        spin_lock(&sb_lock);
        sb->s_count++;
        spin_unlock(&sb_lock);

        /* bd_holder_lock is dropped before s_umount is acquired below. */
        mutex_unlock(&bdev->bd_holder_lock);

        locked = super_lock(sb, excl);

        /*
         * If the superblock wasn't already SB_DYING then we hold
         * s_umount and can safely drop our temporary reference.
         */
        put_super(sb);

        if (!locked)
                return NULL;

        /* Locked, but not (or no longer) a live, active superblock. */
        if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
                super_unlock(sb, excl);
                return NULL;
        }

        return sb;
}

/*
 * ->mark_dead() holder operation: the block device backing a filesystem is
 * going away.  If the owning superblock is still alive, optionally sync it,
 * drop its dentries and inodes, and call the filesystem's ->shutdown()
 * if it has one.
 *
 * @surprise: when true the sync_filesystem() call is skipped.
 */
static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
        struct super_block *sb;

        /* Takes s_umount shared and releases bdev->bd_holder_lock. */
        sb = bdev_super_lock(bdev, false);
        if (!sb)
                return;

        if (!surprise)
                sync_filesystem(sb);
        shrink_dcache_sb(sb);
        evict_inodes(sb);
        if (sb->s_op->shutdown)
                sb->s_op->shutdown(sb);

        super_unlock_shared(sb);
}

/*
 * ->sync() holder operation: sync the filesystem that owns @bdev, if its
 * superblock is still alive.
 */
static void fs_bdev_sync(struct block_device *bdev)
{
        struct super_block *sb;

        /* Takes s_umount shared and releases bdev->bd_holder_lock. */
        sb = bdev_super_lock(bdev, false);
        if (!sb)
                return;

        sync_filesystem(sb);
        super_unlock_shared(sb);
}

/*
 * Get an active reference on the superblock that holds @bdev.
 *
 * The superblock is locked exclusively via bdev_super_lock() (which also
 * releases bdev->bd_holder_lock), ->s_active is raised unless it already
 * dropped to zero, and the lock is released again.
 *
 * Returns the superblock with ->s_active elevated, or NULL if it is not
 * alive or no active reference could be taken.
 */
static struct super_block *get_bdev_super(struct block_device *bdev)
{
        struct super_block *sb = bdev_super_lock(bdev, true);
        bool active;

        if (!sb)
                return NULL;

        active = atomic_inc_not_zero(&sb->s_active);
        super_unlock_excl(sb);

        return active ? sb : NULL;
}

/**
 * fs_bdev_freeze - freeze owning filesystem of block device
 * @bdev: block device
 *
 * Freeze the filesystem that owns this block device if it is still
 * active.
 *
 * A filesystem that owns multiple block devices may be frozen from each
 * block device and won't be unfrozen until all block devices are
 * unfrozen. Each block device can only freeze the filesystem once as we
 * nest freezes for block devices in the block layer.
 *
 * Context: Caller must hold @bdev->bd_fsfreeze_mutex.
 * Return: If the freeze was successful zero is returned. If the freeze
 *         failed a negative error code is returned; -EINVAL means no
 *         active superblock could be obtained for @bdev.
 */
static int fs_bdev_freeze(struct block_device *bdev)
{
        struct super_block *sb;
        int error = 0;

        lockdep_assert_held(&bdev->bd_fsfreeze_mutex);

        sb = get_bdev_super(bdev);
        if (!sb)
                return -EINVAL;

        /* Prefer the filesystem's own freeze method when it provides one. */
        if (sb->s_op->freeze_super)
                error = sb->s_op->freeze_super(sb,
                                FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
        else
                error = freeze_super(sb,
                                FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
        /* Once frozen, also write out the block device itself. */
        if (!error)
                error = sync_blockdev(bdev);
        /* Drop the active reference taken by get_bdev_super(). */
        deactivate_super(sb);
        return error;
}

/**
 * fs_bdev_thaw - thaw owning filesystem of block device
 * @bdev: block device
 *
 * Thaw the filesystem that owns this block device.
 *
 * A filesystem that owns multiple block devices may be frozen from each
 * block device and won't be unfrozen until all block devices are
 * unfrozen. Each block device can only freeze the filesystem once as we
 * nest freezes for block devices in the block layer.
 *
 * Context: Caller must hold @bdev->bd_fsfreeze_mutex.
 * Return: If the thaw was successful zero is returned. If the thaw
 *         failed a negative error code is returned. If this function
 *         returns zero it doesn't mean that the filesystem is unfrozen
 *         as it may have been frozen multiple times (kernel may hold a
 *         freeze or might be frozen from other block devices).
 */
static int fs_bdev_thaw(struct block_device *bdev)
{
        struct super_block *sb;
        int error;

        lockdep_assert_held(&bdev->bd_fsfreeze_mutex);

        /*
         * The block device may have been frozen before it was claimed by a
         * filesystem. Concurrently another process might try to mount that
         * frozen block device and has temporarily claimed the block device for
         * that purpose causing a concurrent fs_bdev_thaw() to end up here. The
         * mounter is already about to abort mounting because they still saw an
         * elevated bdev->bd_fsfreeze_count so get_bdev_super() will return
         * NULL in that case.
         */
        sb = get_bdev_super(bdev);
        if (!sb)
                return -EINVAL;

        /* Prefer the filesystem's own thaw method when it provides one. */
        if (sb->s_op->thaw_super)
                error = sb->s_op->thaw_super(sb,
                                FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
        else
                error = thaw_super(sb,
                                FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
        /* Drop the active reference taken by get_bdev_super(). */
        deactivate_super(sb);
        return error;
}

/*
 * Holder operations passed to bdev_file_open_by_dev() by setup_bdev_super(),
 * with the superblock as holder.  They route block-device events
 * (device death, sync, freeze, thaw) to the owning filesystem.
 */
const struct blk_holder_ops fs_holder_ops = {
        .mark_dead                = fs_bdev_mark_dead,
        .sync                        = fs_bdev_sync,
        .freeze                        = fs_bdev_freeze,
        .thaw                        = fs_bdev_thaw,
};
EXPORT_SYMBOL_GPL(fs_holder_ops);

/**
 * setup_bdev_super - open the block device of a superblock and attach it
 * @sb: superblock whose ->s_dev names the device to open
 * @sb_flags: superblock flags from which the open mode is derived
 * @fc: filesystem context used for error/warning logging, may be NULL
 *
 * Opens the device with @sb as holder and fs_holder_ops as holder
 * operations, then sets ->s_bdev_file, ->s_bdev, ->s_bdi, ->s_id and the
 * block size of @sb from it.
 *
 * Return: 0 on success; the error from opening the device, -EACCES if a
 * writable open was requested on a read-only device, or -EBUSY if the
 * device is currently frozen.
 */
int setup_bdev_super(struct super_block *sb, int sb_flags,
                struct fs_context *fc)
{
        blk_mode_t mode = sb_open_mode(sb_flags);
        struct file *bdev_file;
        struct block_device *bdev;

        bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
        if (IS_ERR(bdev_file)) {
                if (fc)
                        errorf(fc, "%s: Can't open blockdev", fc->source);
                return PTR_ERR(bdev_file);
        }
        bdev = file_bdev(bdev_file);

        /*
         * This really should be in blkdev_get_by_dev, but right now can't due
         * to legacy issues that require us to allow opening a block device node
         * writable from userspace even for a read-only block device.
         */
        if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
                bdev_fput(bdev_file);
                return -EACCES;
        }

        /*
         * It is enough to check bdev was not frozen before we set
         * s_bdev as freezing will wait until SB_BORN is set.
         */
        if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
                if (fc)
                        warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
                bdev_fput(bdev_file);
                return -EBUSY;
        }
        /* The device-related superblock fields are published under sb_lock. */
        spin_lock(&sb_lock);
        sb->s_bdev_file = bdev_file;
        sb->s_bdev = bdev;
        sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
        if (bdev_stable_writes(bdev))
                sb->s_iflags |= SB_I_STABLE_WRITES;
        spin_unlock(&sb_lock);

        /* s_id becomes the device name; the shrinker's debugfs name follows it. */
        snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
        shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
                                sb->s_id);
        sb_set_blocksize(sb, block_size(bdev));
        return 0;
}
EXPORT_SYMBOL_GPL(setup_bdev_super);

/**
 * get_tree_bdev_flags - Get a superblock based on a single block device
 * @fc: The filesystem context holding the parameters
 * @fill_super: Helper to initialise a new superblock
 * @flags: GET_TREE_BDEV_* flags
 *
 * Looks up the block device named by @fc->source and obtains a superblock
 * for it through sget_dev().  A superblock that has no root yet is set up
 * with setup_bdev_super() and @fill_super and then marked SB_ACTIVE; one
 * that already has a root is only accepted if its read-only state matches
 * the requested one.
 *
 * Return: 0 with a referenced root dentry stored in @fc->root, or the error
 * reported by the step that failed (-EBUSY on a read-only state mismatch).
 */
int get_tree_bdev_flags(struct fs_context *fc,
                int (*fill_super)(struct super_block *sb,
                                  struct fs_context *fc), unsigned int flags)
{
        struct super_block *sb;
        dev_t devt;
        int err;

        if (!fc->source)
                return invalf(fc, "No source specified");

        err = lookup_bdev(fc->source, &devt);
        if (err) {
                /* Callers may ask for lookup failures to go unreported. */
                if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP))
                        errorf(fc, "%s: Can't lookup blockdev", fc->source);
                return err;
        }

        fc->sb_flags |= SB_NOSEC;
        sb = sget_dev(fc, devt);
        if (IS_ERR(sb))
                return PTR_ERR(sb);

        if (!sb->s_root) {
                /* No root yet: attach the device and let the fs fill it in. */
                err = setup_bdev_super(sb, fc->sb_flags, fc);
                if (err == 0)
                        err = fill_super(sb, fc);
                if (err)
                        goto out_deactivate;
                sb->s_flags |= SB_ACTIVE;
        } else if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) {
                /* Don't summarily change the RO/RW state. */
                warnf(fc, "%pg: Can't mount, would change RO state", sb->s_bdev);
                err = -EBUSY;
                goto out_deactivate;
        }

        BUG_ON(fc->root);
        fc->root = dget(sb->s_root);
        return 0;

out_deactivate:
        deactivate_locked_super(sb);
        return err;
}
EXPORT_SYMBOL_GPL(get_tree_bdev_flags);

/**
 * get_tree_bdev - Get a superblock based on a single block device
 * @fc: The filesystem context holding the parameters
 * @fill_super: Helper to initialise a new superblock
 *
 * Convenience form of get_tree_bdev_flags() that passes no flags.
 *
 * Return: whatever get_tree_bdev_flags() returns.
 */
int get_tree_bdev(struct fs_context *fc,
                int (*fill_super)(struct super_block *,
                                  struct fs_context *))
{
        const unsigned int no_flags = 0;

        return get_tree_bdev_flags(fc, fill_super, no_flags);
}
EXPORT_SYMBOL(get_tree_bdev);

static int test_bdev_super(struct super_block *s, void *data)
{
        return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data;
}

/*
 * Obtain a superblock of type @fs_type for the block device named by
 * @dev_name and return a referenced root dentry.
 *
 * A superblock without a root is set up with setup_bdev_super() and the
 * caller's @fill_super (which receives @data and a "silent" flag derived
 * from SB_SILENT) and then marked SB_ACTIVE.  A superblock that already has
 * a root is only accepted if its read-only state matches @flags.
 *
 * Returns the root dentry on success or an ERR_PTR() on failure.
 */
struct dentry *mount_bdev(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *sb;
        dev_t devt;
        int err;

        err = lookup_bdev(dev_name, &devt);
        if (err)
                return ERR_PTR(err);

        flags |= SB_NOSEC;
        sb = sget(fs_type, test_bdev_super, set_bdev_super, flags, &devt);
        if (IS_ERR(sb))
                return ERR_CAST(sb);

        if (!sb->s_root) {
                err = setup_bdev_super(sb, flags, NULL);
                if (err == 0)
                        err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
                if (err)
                        goto out_deactivate;

                sb->s_flags |= SB_ACTIVE;
        } else if ((flags ^ sb->s_flags) & SB_RDONLY) {
                /* Refuse to flip the RO/RW state of an existing superblock. */
                err = -EBUSY;
                goto out_deactivate;
        }

        return dget(sb->s_root);

out_deactivate:
        deactivate_locked_super(sb);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(mount_bdev);

/*
 * Shut down @sb and, if it had a block device attached, sync that device
 * and drop the superblock's block device file.
 */
void kill_block_super(struct super_block *sb)
{
        /* Sample s_bdev before the generic shutdown runs. */
        struct block_device *bdev = sb->s_bdev;

        generic_shutdown_super(sb);
        if (!bdev)
                return;

        sync_blockdev(bdev);
        bdev_fput(sb->s_bdev_file);
}

EXPORT_SYMBOL(kill_block_super);
#endif

/*
 * Create a superblock of type @fs_type through sget() with set_anon_super,
 * have @fill_super populate it (passing @data and a "silent" flag derived
 * from SB_SILENT), mark it SB_ACTIVE and return a referenced root dentry.
 *
 * Returns an ERR_PTR() if sget() or @fill_super fails.
 */
struct dentry *mount_nodev(struct file_system_type *fs_type,
        int flags, void *data,
        int (*fill_super)(struct super_block *, void *, int))
{
        struct super_block *sb;
        int err;

        sb = sget(fs_type, NULL, set_anon_super, flags, NULL);
        if (IS_ERR(sb))
                return ERR_CAST(sb);

        err = fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
        if (err) {
                deactivate_locked_super(sb);
                return ERR_PTR(err);
        }

        sb->s_flags |= SB_ACTIVE;
        return dget(sb->s_root);
}
EXPORT_SYMBOL(mount_nodev);

/**
 * vfs_get_tree - Get the mountable root
 * @fc: The superblock configuration context.
 *
 * The filesystem is invoked to get or create a superblock which can then later
 * be used for mounting.  The filesystem places a pointer to the root to be
 * used for mounting in @fc->root.
 *
 * Return: 0 on success, -EBUSY if @fc already has a root, or the error
 * returned by the filesystem's ->get_tree() or by
 * security_sb_set_mnt_opts().
 */
int vfs_get_tree(struct fs_context *fc)
{
        struct super_block *sb;
        int ret;

        if (fc->root)
                return -EBUSY;

        /*
         * Ask the filesystem for the mountable root.  On success fc->root
         * holds a ref on the root and a ref on the superblock.
         */
        ret = fc->ops->get_tree(fc);
        if (ret < 0)
                return ret;

        if (!fc->root) {
                pr_err("Filesystem %s get_tree() didn't set fc->root, returned %i\n",
                       fc->fs_type->name, ret);
                /*
                 * The locking state of the superblock (if there even is
                 * one) is unknown at this point.
                 */
                BUG();
        }

        sb = fc->root->d_sb;
        WARN_ON(!sb->s_bdi);

        /*
         * super_wake() contains a memory barrier which also takes care of
         * ordering for super_cache_count(). We place it before setting
         * SB_BORN as the data dependency between the two functions is
         * the superblock structure contents that we just set up, not
         * the SB_BORN flag.
         */
        super_wake(sb, SB_BORN);

        ret = security_sb_set_mnt_opts(sb, fc->security, 0, NULL);
        if (unlikely(ret)) {
                fc_drop_locked(fc);
                return ret;
        }

        /*
         * Filesystems should never set s_maxbytes larger than
         * MAX_LFS_FILESIZE, but s_maxbytes was an unsigned long long for
         * many releases.  Warn for a while to catch filesystems that
         * violate this rule.
         */
        WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to "
                "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes);

        return 0;
}
EXPORT_SYMBOL(vfs_get_tree);

/*
 * Set up a private BDI for the given superblock, registered under the name
 * produced from @fmt and the following arguments.  It gets automatically
 * cleaned up in generic_shutdown_super().
 *
 * Returns 0 on success, -ENOMEM if the BDI cannot be allocated, or the
 * error returned by bdi_register_va().
 */
int super_setup_bdi_name(struct super_block *sb, char *fmt, ...)
{
        struct backing_dev_info *new_bdi;
        va_list ap;
        int ret;

        new_bdi = bdi_alloc(NUMA_NO_NODE);
        if (!new_bdi)
                return -ENOMEM;

        va_start(ap, fmt);
        ret = bdi_register_va(new_bdi, fmt, ap);
        va_end(ap);
        if (ret)
                goto out_put;

        /* The superblock is expected to still point at the no-op BDI. */
        WARN_ON(sb->s_bdi != &noop_backing_dev_info);
        sb->s_bdi = new_bdi;
        sb->s_iflags |= SB_I_PERSB_BDI;

        return 0;

out_put:
        bdi_put(new_bdi);
        return ret;
}
EXPORT_SYMBOL(super_setup_bdi_name);

/*
 * Set up a private BDI for the given superblock, named after the filesystem
 * type (at most 28 characters of it) plus a sequence number.  It gets
 * automatically cleaned up in generic_shutdown_super().
 */
int super_setup_bdi(struct super_block *sb)
{
        static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
        long seq = atomic_long_inc_return(&bdi_seq);

        return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, seq);
}
EXPORT_SYMBOL(super_setup_bdi);

/**
 * sb_wait_write - wait until all writers to given file system finish
 * @sb: the super for which we wait
 * @level: type of writers we wait for (normal vs page fault)
 *
 * This function waits until there are no writers of given type to given file
 * system.  @level is 1-based; it selects entry @level - 1 of
 * sb->s_writers.rw_sem, which is taken for writing.
 */
static void sb_wait_write(struct super_block *sb, int level)
{
        percpu_down_write(&sb->s_writers.rw_sem[level - 1]);
}

/*
 * We are going to return to userspace and forget about these locks, the
 * ownership goes to the caller of thaw_super() which does unlock().
 * Tell lockdep so, walking the freeze levels from highest to lowest.
 */
static void lockdep_sb_freeze_release(struct super_block *sb)
{
        int level = SB_FREEZE_LEVELS;

        while (level-- > 0)
                percpu_rwsem_release(&sb->s_writers.rw_sem[level], _THIS_IP_);
}

/*
 * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb).
 * The freeze levels are walked from lowest to highest.
 */
static void lockdep_sb_freeze_acquire(struct super_block *sb)
{
        int level = 0;

        while (level < SB_FREEZE_LEVELS) {
                percpu_rwsem_acquire(&sb->s_writers.rw_sem[level], 0, _THIS_IP_);
                level++;
        }
}

/*
 * Release the write side of the first @level freeze semaphores, starting
 * with entry @level - 1 and finishing with entry 0.
 */
static void sb_freeze_unlock(struct super_block *sb, int level)
{
        while (--level >= 0)
                percpu_up_write(&sb->s_writers.rw_sem[level]);
}

/*
 * Wait, with s_umount dropped, until sb->s_writers.frozen reaches either
 * SB_UNFROZEN or SB_FREEZE_COMPLETE.  Called and returns with s_umount held
 * for writing.
 *
 * Returns 0 once one of those states is observed, or the non-zero result of
 * wait_var_event_killable() if the wait ended early.
 */
static int wait_for_partially_frozen(struct super_block *sb)
{
        int ret;

        for (;;) {
                unsigned short seen = sb->s_writers.frozen;

                /* Sleep without s_umount until the state moves on. */
                up_write(&sb->s_umount);
                ret = wait_var_event_killable(&sb->s_writers.frozen,
                                               sb->s_writers.frozen != seen);
                down_write(&sb->s_umount);

                if (ret != 0)
                        break;
                if (sb->s_writers.frozen == SB_UNFROZEN ||
                    sb->s_writers.frozen == SB_FREEZE_COMPLETE)
                        break;
        }

        return ret;
}

/* Mask of the bits that identify who holds a freeze. */
#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
/* Every bit a freeze/thaw caller may pass: the holder bits plus FREEZE_MAY_NEST. */
#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)

/*
 * Account one more freeze for the holder named in @who.
 *
 * Warns once if @who carries bits outside FREEZE_FLAGS or names more than
 * one holder.  Returns the combined kernel + userspace freeze count after
 * the update.
 */
static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
{
        WARN_ON_ONCE((who & ~FREEZE_FLAGS));
        WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);

        if (who & FREEZE_HOLDER_KERNEL)
                sb->s_writers.freeze_kcount++;

        if (who & FREEZE_HOLDER_USERSPACE)
                sb->s_writers.freeze_ucount++;

        return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
}

/*
 * Drop one freeze for the holder named in @who; a counter that is already
 * zero is left alone.
 *
 * Warns once if @who carries bits outside FREEZE_FLAGS or names more than
 * one holder.  Returns the combined kernel + userspace freeze count after
 * the update.
 */
static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
{
        WARN_ON_ONCE((who & ~FREEZE_FLAGS));
        WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);

        if (who & FREEZE_HOLDER_KERNEL) {
                if (sb->s_writers.freeze_kcount)
                        sb->s_writers.freeze_kcount--;
        }

        if (who & FREEZE_HOLDER_USERSPACE) {
                if (sb->s_writers.freeze_ucount)
                        sb->s_writers.freeze_ucount--;
        }

        return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
}

/*
 * Decide whether the holder named in @who may take another freeze: allowed
 * when FREEZE_MAY_NEST is set or when that holder's counter is still zero.
 * The kernel holder bit is checked before the userspace one; if neither is
 * set the answer is false.
 *
 * Warns once if @who carries bits outside FREEZE_FLAGS or names more than
 * one holder.
 */
static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
{
        bool may_nest;

        WARN_ON_ONCE((who & ~FREEZE_FLAGS));
        WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);

        may_nest = who & FREEZE_MAY_NEST;

        if (who & FREEZE_HOLDER_KERNEL)
                return may_nest || !sb->s_writers.freeze_kcount;

        if (who & FREEZE_HOLDER_USERSPACE)
                return may_nest || !sb->s_writers.freeze_ucount;

        return false;
}

/**
 * freeze_super - lock the filesystem and force it into a consistent state
 * @sb: the super to lock
 * @who: context that wants to freeze
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs may return
 * -EBUSY.
 *
 * @who should be:
 * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
 * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
 * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed.
 *
 * The @who argument distinguishes between the kernel and userspace trying to
 * freeze the filesystem.  Although there cannot be multiple kernel freezes or
 * multiple userspace freezes in effect at any given time, the kernel and
 * userspace can both hold a filesystem frozen.  The filesystem remains frozen
 * until there are no kernel or userspace freezes in effect.
 *
 * A filesystem may hold multiple devices and thus a filesystem may be
 * frozen through the block layer via multiple block devices. In this
 * case the request is marked as being allowed to nest by passing
 * FREEZE_MAY_NEST. The filesystem remains frozen until all block
 * devices are unfrozen. If multiple freezes are attempted without
 * FREEZE_MAY_NEST -EBUSY will be returned.
 *
 * During this function, sb->s_writers.frozen goes through these values:
 *
 * SB_UNFROZEN: File system is normal, all writes progress as usual.
 *
 * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
 * writes should be blocked, though page faults are still allowed. We wait for
 * all writes to complete and then proceed to the next stage.
 *
 * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
 * but internal fs threads can still modify the filesystem (although they
 * should not dirty new pages or inodes), writeback can run etc. After waiting
 * for all running page faults we sync the filesystem which will clean all
 * dirty pages and inodes (no new dirty pages or inodes can be created when
 * sync is running).
 *
 * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
 * modification are blocked (e.g. XFS preallocation truncation on inode
 * reclaim). This is usually implemented by blocking new transactions for
 * filesystems that have them and need this additional guard. After all
 * internal writers are finished we call ->freeze_fs() to finish filesystem
 * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
 * mostly auxiliary for filesystems to verify they do not modify frozen fs.
 *
 * sb->s_writers.frozen is protected by sb->s_umount.
 *
 * Return: If the freeze was successful zero is returned. If the freeze
 *         failed a negative error code is returned.
 */
int freeze_super(struct super_block *sb, enum freeze_holder who)
{
        int ret;

        if (!super_lock_excl(sb)) {
                /* A string literal is never NULL, so this warns (once) whenever reached. */
                WARN_ON_ONCE("Dying superblock while freezing!");
                return -EINVAL;
        }
        /*
         * Active reference for this freeze attempt: the failure paths below
         * give it up through deactivate_locked_super(), the success paths
         * only unlock and leave it in place.
         */
        atomic_inc(&sb->s_active);

retry:
        if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
                /*
                 * Already frozen: just account the extra freeze if allowed.
                 * The count is expected to exceed 1 here; if it is exactly 1
                 * the warning fires and ret becomes 1.
                 */
                if (may_freeze(sb, who))
                        ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
                else
                        ret = -EBUSY;
                /* All freezers share a single active reference. */
                deactivate_locked_super(sb);
                return ret;
        }

        if (sb->s_writers.frozen != SB_UNFROZEN) {
                /* A freeze or thaw is in flight: wait for it, then re-check. */
                ret = wait_for_partially_frozen(sb);
                if (ret) {
                        deactivate_locked_super(sb);
                        return ret;
                }

                goto retry;
        }

        if (sb_rdonly(sb)) {
                /* Nothing to do really... */
                WARN_ON_ONCE(freeze_inc(sb, who) > 1);
                sb->s_writers.frozen = SB_FREEZE_COMPLETE;
                wake_up_var(&sb->s_writers.frozen);
                super_unlock_excl(sb);
                return 0;
        }

        sb->s_writers.frozen = SB_FREEZE_WRITE;
        /* Release s_umount to preserve sb_start_write -> s_umount ordering */
        super_unlock_excl(sb);
        sb_wait_write(sb, SB_FREEZE_WRITE);
        __super_lock_excl(sb);

        /* Now we go and block page faults... */
        sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
        sb_wait_write(sb, SB_FREEZE_PAGEFAULT);

        /* All writers are done so after syncing there won't be dirty data */
        ret = sync_filesystem(sb);
        if (ret) {
                /* Undo the two levels taken so far and wake anyone waiting on the state. */
                sb->s_writers.frozen = SB_UNFROZEN;
                sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT);
                wake_up_var(&sb->s_writers.frozen);
                deactivate_locked_super(sb);
                return ret;
        }

        /* Now wait for internal filesystem counter */
        sb->s_writers.frozen = SB_FREEZE_FS;
        sb_wait_write(sb, SB_FREEZE_FS);

        if (sb->s_op->freeze_fs) {
                ret = sb->s_op->freeze_fs(sb);
                if (ret) {
                        printk(KERN_ERR
                                "VFS:Filesystem freeze failed\n");
                        /* Undo all three levels and wake anyone waiting on the state. */
                        sb->s_writers.frozen = SB_UNFROZEN;
                        sb_freeze_unlock(sb, SB_FREEZE_FS);
                        wake_up_var(&sb->s_writers.frozen);
                        deactivate_locked_super(sb);
                        return ret;
                }
        }
        /*
         * For debugging purposes so that fs can warn if it sees write activity
         * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
         */
        WARN_ON_ONCE(freeze_inc(sb, who) > 1);
        sb->s_writers.frozen = SB_FREEZE_COMPLETE;
        wake_up_var(&sb->s_writers.frozen);
        /* The freeze semaphores stay held; only lockdep is told they were released. */
        lockdep_sb_freeze_release(sb);
        super_unlock_excl(sb);
        return 0;
}
EXPORT_SYMBOL(freeze_super);

/*
 * Undoes the effect of a freeze_super_locked call.  If the filesystem is
 * frozen both by userspace and the kernel, a thaw call from either source
 * removes that state without releasing the other state or unlocking the
 * filesystem.
 *
 * NOTE(review): no freeze_super_locked() is visible in this part of the
 * file; the counterpart seen here is freeze_super() - confirm.
 *
 * The caller holds the exclusive superblock lock (see thaw_super()); every
 * path through this function releases it, either with super_unlock_excl()
 * or with deactivate_locked_super().
 *
 * Returns 0 when the filesystem was actually thawed, the error from
 * ->unfreeze_fs() if that failed, and -EINVAL when the superblock is not
 * in SB_FREEZE_COMPLETE state or when freezes remain after this one was
 * dropped.
 */
static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
{
        int error = -EINVAL;

        if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
                goto out_unlock;

        /*
         * All freezers share a single active reference.
         * So just unlock in case there are any left.
         */
        if (freeze_dec(sb, who))
                goto out_unlock;

        if (sb_rdonly(sb)) {
                /* Read-only: skip ->unfreeze_fs() and the semaphore unlock below. */
                sb->s_writers.frozen = SB_UNFROZEN;
                wake_up_var(&sb->s_writers.frozen);
                goto out_deactivate;
        }

        lockdep_sb_freeze_acquire(sb);

        if (sb->s_op->unfreeze_fs) {
                error = sb->s_op->unfreeze_fs(sb);
                if (error) {
                        pr_err("VFS: Filesystem thaw failed\n");
                        /* Still frozen: restore the count and the lockdep state. */
                        freeze_inc(sb, who);
                        lockdep_sb_freeze_release(sb);
                        goto out_unlock;
                }
        }

        sb->s_writers.frozen = SB_UNFROZEN;
        wake_up_var(&sb->s_writers.frozen);
        sb_freeze_unlock(sb, SB_FREEZE_FS);
out_deactivate:
        deactivate_locked_super(sb);
        return 0;

out_unlock:
        super_unlock_excl(sb);
        return error;
}

/**
 * thaw_super -- unlock filesystem
 * @sb: the super to thaw
 * @who: context that wants to thaw
 *
 * Unlocks the filesystem and marks it writeable again after freeze_super()
 * if there are no remaining freezes on the filesystem.
 *
 * @who should be:
 * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
 * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
 * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed
 *
 * A filesystem may hold multiple devices and thus a filesystem may
 * have been frozen through the block layer via multiple block devices.
 * The filesystem remains frozen until all block devices are unfrozen.
 *
 * Return: the result of thaw_super_locked(), or -EINVAL if the superblock
 * could not be locked.
 */
int thaw_super(struct super_block *sb, enum freeze_holder who)
{
        if (super_lock_excl(sb))
                return thaw_super_locked(sb, who);

        WARN_ON_ONCE("Dying superblock while thawing!");
        return -EINVAL;
}
EXPORT_SYMBOL(thaw_super);

/*
 * Create workqueue for deferred direct IO completions. We allocate the
 * workqueue when it's first needed. This avoids creating workqueue for
 * filesystems that don't need it and also allows us to create the workqueue
 * late enough so that we can include s_id in the name of the workqueue.
 *
 * Returns 0 once sb->s_dio_done_wq is set (by this call or by a concurrent
 * one), or -ENOMEM if the workqueue could not be allocated.
 */
int sb_init_dio_done_wq(struct super_block *sb)
{
        struct workqueue_struct *new_wq;

        new_wq = alloc_workqueue("dio/%s", WQ_MEM_RECLAIM, 0, sb->s_id);
        if (!new_wq)
                return -ENOMEM;

        /*
         * More DIOs can race to create the workqueue, so it is installed
         * atomically; if someone beat us to it, free ours.
         */
        if (cmpxchg(&sb->s_dio_done_wq, NULL, new_wq))
                destroy_workqueue(new_wq);

        return 0;
}
EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);


















    5 







    4 






    4 






    2 









































    6 




    2 


































    6 




    6 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2012 ARM Ltd.
 */

#ifndef __ASM_PMUV3_H
#define __ASM_PMUV3_H

#include <asm/kvm_host.h>

#include <asm/cpufeature.h>
#include <asm/sysreg.h>

/* Per-index body handed to PMEVN_SWITCH(): return the value of pmevcntr<n>_el0. */
#define RETURN_READ_PMEVCNTRN(n) \
        return read_sysreg(pmevcntr##n##_el0)
/*
 * Read the pmevcntr<n>_el0 register selected by @n.
 * PMEVN_SWITCH is defined elsewhere - presumably it dispatches on @n and
 * applies the macro above for the matching index (confirm).  Returns 0 if
 * that expansion does not return.
 */
static inline unsigned long read_pmevcntrn(int n)
{
        PMEVN_SWITCH(n, RETURN_READ_PMEVCNTRN);
        return 0;
}

/* Per-index body handed to PMEVN_SWITCH(): write the local 'val' to pmevcntr<n>_el0. */
#define WRITE_PMEVCNTRN(n) \
        write_sysreg(val, pmevcntr##n##_el0)
/*
 * Write @val to the pmevcntr<n>_el0 register selected by @n.
 * The macro above picks up @val by name, so the parameter must keep it.
 */
static inline void write_pmevcntrn(int n, unsigned long val)
{
        PMEVN_SWITCH(n, WRITE_PMEVCNTRN);
}

/* Per-index body handed to PMEVN_SWITCH(): write the local 'val' to pmevtyper<n>_el0. */
#define WRITE_PMEVTYPERN(n) \
        write_sysreg(val, pmevtyper##n##_el0)
/*
 * Write @val to the pmevtyper<n>_el0 register selected by @n.
 * The macro above picks up @val by name, so the parameter must keep it.
 */
static inline void write_pmevtypern(int n, unsigned long val)
{
        PMEVN_SWITCH(n, WRITE_PMEVTYPERN);
}

/* Per-index body handed to PMEVN_SWITCH(): return the value of pmevtyper<n>_el0. */
#define RETURN_READ_PMEVTYPERN(n) \
        return read_sysreg(pmevtyper##n##_el0)
/*
 * Read the pmevtyper<n>_el0 register selected by @n.
 * Returns 0 if the PMEVN_SWITCH expansion does not return.
 */
static inline unsigned long read_pmevtypern(int n)
{
        PMEVN_SWITCH(n, RETURN_READ_PMEVTYPERN);
        return 0;
}

/* Return the value of PMMIR_EL1 as obtained through read_cpuid(). */
static inline unsigned long read_pmmir(void)
{
        unsigned long pmmir = read_cpuid(PMMIR_EL1);

        return pmmir;
}

/* Return the PMUVer field extracted from the id_aa64dfr0_el1 register. */
static inline u32 read_pmuver(void)
{
        u64 reg = read_sysreg(id_aa64dfr0_el1);
        u32 pmuver;

        pmuver = cpuid_feature_extract_unsigned_field(reg,
                        ID_AA64DFR0_EL1_PMUVer_SHIFT);
        return pmuver;
}

/* True when the PMICNTR field of id_aa64dfr1_el1 is non-zero. */
static inline bool pmuv3_has_icntr(void)
{
        u64 reg = read_sysreg(id_aa64dfr1_el1);

        return cpuid_feature_extract_unsigned_field(reg,
                        ID_AA64DFR1_EL1_PMICNTR_SHIFT) != 0;
}

/* Write @val to the pmcr_el0 system register. */
static inline void write_pmcr(u64 val)
{
        write_sysreg(val, pmcr_el0);
}

/* Return the current value of the pmcr_el0 system register. */
static inline u64 read_pmcr(void)
{
        u64 pmcr = read_sysreg(pmcr_el0);

        return pmcr;
}

/* Write the 32-bit @val to the pmselr_el0 system register. */
static inline void write_pmselr(u32 val)
{
        write_sysreg(val, pmselr_el0);
}

/* Write @val to the pmccntr_el0 system register. */
static inline void write_pmccntr(u64 val)
{
        write_sysreg(val, pmccntr_el0);
}

/* Return the current value of the pmccntr_el0 system register. */
static inline u64 read_pmccntr(void)
{
        u64 count = read_sysreg(pmccntr_el0);

        return count;
}

/* Write @val to the register encoded as SYS_PMICNTR_EL0, using write_sysreg_s(). */
static inline void write_pmicntr(u64 val)
{
        write_sysreg_s(val, SYS_PMICNTR_EL0);
}

/* Return the value of the register encoded as SYS_PMICNTR_EL0. */
static inline u64 read_pmicntr(void)
{
        u64 count = read_sysreg_s(SYS_PMICNTR_EL0);

        return count;
}

/* Write @val to the pmcntenset_el0 system register. */
static inline void write_pmcntenset(u64 val)
{
        write_sysreg(val, pmcntenset_el0);
}

/* Write @val to the pmcntenclr_el0 system register. */
static inline void write_pmcntenclr(u64 val)
{
        write_sysreg(val, pmcntenclr_el0);
}

/* Write @val to the pmintenset_el1 system register. */
static inline void write_pmintenset(u64 val)
{
        write_sysreg(val, pmintenset_el1);
}

/* Write @val to the pmintenclr_el1 system register. */
static inline void write_pmintenclr(u64 val)
{
        write_sysreg(val, pmintenclr_el1);
}

/* Write @val to the pmccfiltr_el0 system register. */
static inline void write_pmccfiltr(u64 val)
{
        write_sysreg(val, pmccfiltr_el0);
}

/* Return the current value of the pmccfiltr_el0 system register. */
static inline u64 read_pmccfiltr(void)
{
        u64 filter = read_sysreg(pmccfiltr_el0);

        return filter;
}

/* Write @val to the register encoded as SYS_PMICFILTR_EL0, using write_sysreg_s(). */
static inline void write_pmicfiltr(u64 val)
{
        write_sysreg_s(val, SYS_PMICFILTR_EL0);
}

/* Return the value of the register encoded as SYS_PMICFILTR_EL0. */
static inline u64 read_pmicfiltr(void)
{
        u64 filter = read_sysreg_s(SYS_PMICFILTR_EL0);

        return filter;
}

/* Write @val to the pmovsclr_el0 system register. */
static inline void write_pmovsclr(u64 val)
{
        write_sysreg(val, pmovsclr_el0);
}

/* Return the current value of the pmovsclr_el0 system register. */
static inline u64 read_pmovsclr(void)
{
        u64 overflow = read_sysreg(pmovsclr_el0);

        return overflow;
}

/* Write the 32-bit @val to the pmuserenr_el0 system register. */
static inline void write_pmuserenr(u32 val)
{
        write_sysreg(val, pmuserenr_el0);
}

/* Write @val to the register encoded as SYS_PMUACR_EL1, using write_sysreg_s(). */
static inline void write_pmuacr(u64 val)
{
        write_sysreg_s(val, SYS_PMUACR_EL1);
}

/* Return the current value of the pmceid0_el0 system register. */
static inline u64 read_pmceid0(void)
{
        u64 ceid = read_sysreg(pmceid0_el0);

        return ceid;
}

/* Return the current value of the pmceid1_el0 system register. */
static inline u64 read_pmceid1(void)
{
        u64 ceid = read_sysreg(pmceid1_el0);

        return ceid;
}

/*
 * True unless @pmuver is one of the two values that do not denote a PMUv3
 * implementation here: ID_AA64DFR0_EL1_PMUVer_IMP_DEF or
 * ID_AA64DFR0_EL1_PMUVer_NI.
 */
static inline bool pmuv3_implemented(int pmuver)
{
        return pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF &&
               pmuver != ID_AA64DFR0_EL1_PMUVer_NI;
}

static inline bool is_pmuv3p4(int pmuver)
{
        return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P4;
}

static inline bool is_pmuv3p5(int pmuver)
{
        return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5;
}

static inline bool is_pmuv3p9(int pmuver)
{
        return pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P9;
}

#endif











































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * A policy database (policydb) specifies the
 * configuration data for the security policy.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */

/*
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *          Support for enhanced MLS infrastructure.
 *          Copyright (C) 2004-2005 Trusted Computer Solutions, Inc.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and
 *          Karl MacMillan <kmacmillan@tresys.com>
 *          Added conditional policy language extensions
 *          Copyright (C) 2003-2004 Tresys Technology, LLC
 */

#ifndef _SS_POLICYDB_H_
#define _SS_POLICYDB_H_

#include "symtab.h"
#include "avtab.h"
#include "sidtab.h"
#include "ebitmap.h"
#include "mls_types.h"
#include "context.h"
#include "constraint.h"

/*
 * A datum type is defined for each kind of symbol
 * in the configuration data:  individual permissions,
 * common prefixes for access vectors, classes,
 * users, roles, types, sensitivities, categories, etc.
 */

/* Permission attributes: the datum kept for an individual permission symbol */
struct perm_datum {
        u32 value; /* permission bit + 1 */
};

/*
 * Attributes of a common prefix for access vectors: an internal value plus
 * the symbol table of the permissions that make up the prefix.
 */
struct common_datum {
        u32 value; /* internal common value */
        struct symtab permissions; /* common permissions */
};

/*
 * Class attributes: the class value, its optional common prefix (by name
 * and by datum), its own permissions, its constraints, and the options
 * that decide how the user, role, type and range of a new object are
 * chosen.
 */
struct class_datum {
        u32 value; /* class value */
        char *comkey; /* common name */
        struct common_datum *comdatum; /* common datum */
        struct symtab permissions; /* class-specific permission symbol table */
        struct constraint_node *constraints; /* constraints on class perms */
        struct constraint_node *validatetrans; /* special transition rules */
/* Options how a new object user, role, and type should be decided */
#define DEFAULT_SOURCE 1
#define DEFAULT_TARGET 2
        char default_user; /* one of the DEFAULT_SOURCE/DEFAULT_TARGET options */
        char default_role; /* one of the DEFAULT_SOURCE/DEFAULT_TARGET options */
        char default_type; /* one of the DEFAULT_SOURCE/DEFAULT_TARGET options */
/* Options how a new object range should be decided */
#define DEFAULT_SOURCE_LOW        1
#define DEFAULT_SOURCE_HIGH        2
#define DEFAULT_SOURCE_LOW_HIGH 3
#define DEFAULT_TARGET_LOW        4
#define DEFAULT_TARGET_HIGH        5
#define DEFAULT_TARGET_LOW_HIGH 6
#define DEFAULT_GLBLUB                7
        char default_range; /* one of the range options listed just above */
};

/*
 * Role attributes: internal value, bounding role, and two bitmaps giving
 * the roles this role dominates and the types it is authorized for.
 */
struct role_datum {
        u32 value; /* internal role value */
        u32 bounds; /* boundary of role */
        struct ebitmap dominates; /* set of roles dominated by this role */
        struct ebitmap types; /* set of authorized types for role */
};

/*
 * (role, type, class) triple.
 * NOTE(review): presumably the lookup key paired with struct
 * role_trans_datum for role transition rules - confirm against the users.
 */
struct role_trans_key {
        u32 role; /* current role */
        u32 type; /* program executable type, or new object type */
        u32 tclass; /* process class, or new object class */
};

/* Holds the new role; see the NOTE on struct role_trans_key about the pairing. */
struct role_trans_datum {
        u32 new_role; /* new role */
};

/*
 * (parent type, object class, name) triple.
 * NOTE(review): presumably the lookup key paired with struct
 * filename_trans_datum - confirm against the users.
 */
struct filename_trans_key {
        u32 ttype; /* parent dir context */
        u16 tclass; /* class of new object */
        const char *name; /* last path component */
};

/*
 * One resulting object type together with the source types it applies to.
 * Records are chained through @next, one per distinct otype.
 */
struct filename_trans_datum {
        struct ebitmap stypes; /* bitmap of source types for this otype */
        u32 otype; /* resulting type of new object */
        struct filename_trans_datum *next; /* record for next otype*/
};

/* One (current role, new role) pair; entries form a singly linked list via @next. */
struct role_allow {
        u32 role; /* current role */
        u32 new_role; /* new role */
        struct role_allow *next; /* next entry in the list */
};

/*
 * Type attributes: internal value, bounding type, and two flags recording
 * whether the name is a primary name and whether it is an attribute.
 */
struct type_datum {
        u32 value; /* internal type value */
        u32 bounds; /* boundary of type */
        unsigned char primary; /* primary name? */
        unsigned char attribute; /* attribute ?*/
};

/*
 * User attributes: internal value, bounding user, authorized roles, and the
 * user's MLS range and default login level.
 */
struct user_datum {
        u32 value; /* internal user value */
        u32 bounds; /* bounds of user */
        struct ebitmap roles; /* set of authorized roles for user */
        struct mls_range range; /* MLS range (min - max) for user */
        struct mls_level dfltlevel; /* default login MLS level for user */
};

/* Sensitivity attributes: the MLS level itself plus an alias marker */
struct level_datum {
        struct mls_level level; /* sensitivity and associated categories */
        unsigned char isalias; /* is this sensitivity an alias for another? */
};

/* Category attributes: the category value plus an alias marker */
struct cat_datum {
        u32 value; /* internal category bit + 1 */
        unsigned char isalias; /* is this category an alias for another? */
};

/*
 * (source type, target type, target class) triple.
 * NOTE(review): presumably the key identifying a range transition rule -
 * confirm against the users of this struct.
 */
struct range_trans {
        u32 source_type;
        u32 target_type;
        u32 target_class;
};

/* Boolean data type: the datum kept for a policy boolean symbol */
struct cond_bool_datum {
        u32 value; /* internal type value */
        int state; /* NOTE(review): presumably the boolean's current value - confirm */
};

/* Forward declaration only; no definition of struct cond_node appears in this part of the header. */
struct cond_node;

/*
 * type set preserves data needed to determine constraint info from
 * policy source. This is not used by the kernel policy but allows
 * utilities such as audit2allow to determine constraint denials.
 *
 * It consists of two type bitmaps (@types and @negset) and a flags word.
 */
struct type_set {
        struct ebitmap types;
        struct ebitmap negset; /* NOTE(review): presumably the negated types - confirm */
        u32 flags;
};

/*
 * The configuration data includes security contexts for
 * initial SIDs, unlabeled file systems, TCP and UDP port numbers,
 * network interfaces, and nodes.  This structure stores the
 * relevant data for one such entry.  Entries of the same kind
 * (e.g. all initial SIDs) are linked together into a list.
 *
 * @u holds the kind-specific identification of the entry and @v a
 * kind-specific extra value; only one member of each union is meaningful
 * for a given entry.
 */
struct ocontext {
        union {
                char *name; /* name of initial SID, fs, netif, fstype, path */
                struct {
                        u8 protocol;
                        u16 low_port;
                        u16 high_port;
                } port; /* TCP or UDP port information */
                struct {
                        u32 addr;
                        u32 mask;
                } node; /* node information */
                struct {
                        u32 addr[4];
                        u32 mask[4];
                } node6; /* IPv6 node information */
                struct {
                        u64 subnet_prefix;
                        u16 low_pkey;
                        u16 high_pkey;
                } ibpkey; /* Infiniband PKey range information */
                struct {
                        char *dev_name;
                        u8 port;
                } ibendport; /* Infiniband end port information */
        } u;
        union {
                u32 sclass; /* security class for genfs */
                u32 behavior; /* labeling behavior for fs_use */
        } v;
        struct context context[2]; /* security context(s) */
        u32 sid[2]; /* SID(s) */
        struct ocontext *next; /* next entry of the same kind */
};

/*
 * One node of the list that struct policydb keeps in its 'genfs' member.
 * Each node names a filesystem type and heads a list of struct ocontext.
 */
struct genfs {
        char *fstype; /* filesystem type name */
        struct ocontext *head; /* first ocontext entry for this fstype */
        struct genfs *next; /* next filesystem type */
};

/*
 * symbol table array indices
 *
 * These index policydb.symtab[] and policydb.sym_val_to_name[];
 * SYM_NUM is the number of tables and sizes both arrays.
 */
#define SYM_COMMONS 0
#define SYM_CLASSES 1
#define SYM_ROLES   2
#define SYM_TYPES   3
#define SYM_USERS   4
#define SYM_BOOLS   5
#define SYM_LEVELS  6
#define SYM_CATS    7
#define SYM_NUM            8

/*
 * object context array indices
 *
 * These index policydb.ocontexts[]; OCON_NUM is the number of kinds and
 * sizes that array.
 */
#define OCON_ISID      0 /* initial SIDs */
#define OCON_FS               1 /* unlabeled file systems (deprecated) */
#define OCON_PORT      2 /* TCP and UDP port numbers */
#define OCON_NETIF     3 /* network interfaces */
#define OCON_NODE      4 /* nodes */
#define OCON_FSUSE     5 /* fs_use */
#define OCON_NODE6     6 /* IPv6 nodes */
#define OCON_IBPKEY    7 /* Infiniband PKeys */
#define OCON_IBENDPORT 8 /* Infiniband end ports */
#define OCON_NUM       9

/*
 * The policy database
 *
 * The structure is marked __randomize_layout, so code must not rely on
 * the order of the members below.
 */
struct policydb {
        int mls_enabled; /* NOTE(review): presumably non-zero when MLS is on - confirm */

        /* symbol tables */
        struct symtab symtab[SYM_NUM];
/* shorthand member names for the individual symtab[] slots (p->p_types, ...) */
#define p_commons symtab[SYM_COMMONS]
#define p_classes symtab[SYM_CLASSES]
#define p_roles          symtab[SYM_ROLES]
#define p_types          symtab[SYM_TYPES]
#define p_users          symtab[SYM_USERS]
#define p_bools          symtab[SYM_BOOLS]
#define p_levels  symtab[SYM_LEVELS]
#define p_cats          symtab[SYM_CATS]

        /* symbol names indexed by (value - 1) */
        char **sym_val_to_name[SYM_NUM];

        /* class, role, user, and type attributes indexed by (value - 1) */
        struct class_datum **class_val_to_struct;
        struct role_datum **role_val_to_struct;
        struct user_datum **user_val_to_struct;
        struct type_datum **type_val_to_struct;

        /* type enforcement access vectors and transitions */
        struct avtab te_avtab;

        /* role transitions */
        struct hashtab role_tr;

        /* file transitions with the last path component */
        /* quickly exclude lookups when parent ttype has no rules */
        struct ebitmap filename_trans_ttypes;
        /* actual set of filename_trans rules */
        struct hashtab filename_trans;
        /* only used if policyvers < POLICYDB_VERSION_COMP_FTRANS */
        u32 compat_filename_trans_count;

        /* bools indexed by (value - 1) */
        struct cond_bool_datum **bool_val_to_struct;
        /* type enforcement conditional access vectors and transitions */
        struct avtab te_cond_avtab;
        /* array indexing te_cond_avtab by conditional */
        struct cond_node *cond_list;
        u32 cond_list_len; /* number of entries in cond_list */

        /* role allows (head of a singly linked list) */
        struct role_allow *role_allow;

        /* security contexts of initial SIDs, unlabeled file systems,
           TCP or UDP port numbers, network interfaces and nodes;
           one list head per OCON_* kind */
        struct ocontext *ocontexts[OCON_NUM];

        /* security contexts for files in filesystems that cannot support
           a persistent label mapping or use another
           fixed labeling behavior. */
        struct genfs *genfs;

        /* range transitions table (range_trans_key -> mls_range) */
        struct hashtab range_tr;

        /* type -> attribute reverse mapping */
        struct ebitmap *type_attr_map_array;

        struct ebitmap policycaps;

        struct ebitmap permissive_map;

        /* length of this policy when it was loaded */
        size_t len;

        unsigned int policyvers; /* policy version, compared against POLICYDB_VERSION_* */

        /* NOTE(review): presumably mirror REJECT_UNKNOWN / ALLOW_UNKNOWN - confirm */
        unsigned int reject_unknown : 1;
        unsigned int allow_unknown : 1;

        u16 process_class;
        u32 process_trans_perms;
} __randomize_layout;

/*
 * Cursor over an in-memory policy image.  next_entry() and put_entry()
 * advance 'data' and shrink 'len' by the number of bytes they consume.
 */
struct policy_file {
        char *data; /* current read/write position */
        size_t len; /* bytes remaining from 'data' onwards */
};

extern void policydb_destroy(struct policydb *p);
extern int policydb_load_isids(struct policydb *p, struct sidtab *s);
extern int policydb_context_isvalid(struct policydb *p, struct context *c);
extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
extern int policydb_read(struct policydb *p, struct policy_file *fp);
extern int policydb_write(struct policydb *p, struct policy_file *fp);

extern struct filename_trans_datum *
policydb_filenametr_search(struct policydb *p, struct filename_trans_key *key);

extern struct mls_range *policydb_rangetr_search(struct policydb *p,
                                                 struct range_trans *key);

extern struct role_trans_datum *
policydb_roletr_search(struct policydb *p, struct role_trans_key *key);

/* config flag, mask 0x1 */
#define POLICYDB_CONFIG_MLS 1

/*
 * the config flags related to unknown classes/perms are bits 2 and 3
 * (counting the least significant bit as bit 1: masks 0x2 and 0x4)
 */
#define REJECT_UNKNOWN 0x00000002
#define ALLOW_UNKNOWN  0x00000004

/* name and value of the object_r role */
#define OBJECT_R     "object_r"
#define OBJECT_R_VAL 1

/* magic number and identification string for a policy image */
#define POLICYDB_MAGIC        SELINUX_MAGIC
#define POLICYDB_STRING "SE Linux"

/* Bundles a policy database with the policy file cursor used alongside it. */
struct policy_data {
        struct policydb *p; /* policy database */
        struct policy_file *fp; /* policy image cursor */
};

/*
 * Copy the next @bytes bytes of the policy image behind @fp into @buf and
 * move the cursor past them.
 *
 * Returns 0 on success.  Returns -EINVAL, leaving @fp and @buf untouched,
 * when fewer than @bytes bytes remain.
 */
static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
{
        if (fp->len >= bytes) {
                memcpy(buf, fp->data, bytes);
                fp->len -= bytes;
                fp->data += bytes;
                return 0;
        }

        return -EINVAL;
}

/*
 * Write @num items of @bytes bytes each from @buf into the policy image
 * behind @fp and move the cursor past them.
 *
 * Returns 0 on success.  Returns -EINVAL, leaving @fp untouched, when
 * bytes * num overflows size_t or exceeds the space remaining in @fp.
 */
static inline int put_entry(const void *buf, size_t bytes, size_t num,
                            struct policy_file *fp)
{
        size_t total;

        /* total = bytes * num, rejected if the product does not fit */
        if (unlikely(check_mul_overflow(bytes, num, &total)))
                return -EINVAL;
        if (fp->len < total)
                return -EINVAL;

        memcpy(fp->data, buf, total);
        fp->len -= total;
        fp->data += total;
        return 0;
}

/*
 * Look up a symbol name.
 *
 * @sym_num selects the symbol table (one of the SYM_* indices) and
 * @element_nr is the zero-based position within it; sym_val_to_name is
 * indexed by (value - 1).  No bounds checking is done here.
 */
static inline char *sym_name(struct policydb *p, unsigned int sym_num,
                             unsigned int element_nr)
{
        char **names = p->sym_val_to_name[sym_num];

        return names[element_nr];
}

extern int str_read(char **strp, gfp_t flags, struct policy_file *fp, u32 len);

extern u16 string_to_security_class(struct policydb *p, const char *name);
extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);

#endif /* _SS_POLICYDB_H_ */





























































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
/*
 * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef _TLS_OFFLOAD_H
#define _TLS_OFFLOAD_H

#include <linux/types.h>
#include <asm/byteorder.h>
#include <linux/crypto.h>
#include <linux/socket.h>
#include <linux/tcp.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>

#include <net/net_namespace.h>
#include <net/tcp.h>
#include <net/strparser.h>
#include <crypto/aead.h>
#include <uapi/linux/tls.h>

struct tls_rec;

/* Maximum data size carried in a TLS record (1 << 14 = 16384 bytes) */
#define TLS_MAX_PAYLOAD_SIZE                ((size_t)1 << 14)

/* The nonce offset equals the header size, i.e. it follows the header */
#define TLS_HEADER_SIZE                        5
#define TLS_NONCE_OFFSET                TLS_HEADER_SIZE

/* Evaluates to the cipher_type member, so it is true once that is non-zero */
#define TLS_CRYPTO_INFO_READY(info)        ((info)->cipher_type)

#define TLS_HANDSHAKE_KEYUPDATE                24        /* rfc8446 B.3: Key update */

#define TLS_AAD_SPACE_SIZE                13

/* Upper bounds used to size the buffers in struct cipher_context */
#define TLS_MAX_IV_SIZE                        16
#define TLS_MAX_SALT_SIZE                4
#define TLS_TAG_SIZE                        16
#define TLS_MAX_REC_SEQ_SIZE                8
#define TLS_MAX_AAD_SIZE                TLS_AAD_SPACE_SIZE

/* For CCM mode, the full 16-bytes of IV is made of '4' fields of given sizes.
 *
 * IV[16] = b0[1] || implicit nonce[4] || explicit nonce[8] || length[3]
 *
 * The field 'length' is encoded in field 'b0' as '(length width - 1)'.
 * Hence b0 contains (3 - 1) = 2.
 */
#define TLS_AES_CCM_IV_B0_BYTE                2
#define TLS_SM4_CCM_IV_B0_BYTE                2

/*
 * Configuration values; tls_context.rx_conf is compared against TLS_HW in
 * tls_is_sk_rx_device_offloaded().
 * NOTE(review): tls_context.tx_conf presumably holds the same values - confirm.
 */
enum {
        TLS_BASE,
        TLS_SW,
        TLS_HW,
        TLS_HW_RECORD,
        TLS_NUM_CONFIG, /* number of configurations above, not a configuration */
};

/* A delayed work item paired with a socket pointer. */
struct tx_work {
        struct delayed_work work;
        struct sock *sk;
};

/*
 * Software TX context.  tls_sw_ctx_tx() obtains it by casting
 * tls_context.priv_ctx_tx.
 */
struct tls_sw_context_tx {
        struct crypto_aead *aead_send;
        struct crypto_wait async_wait;
        struct tx_work tx_work;
        struct tls_rec *open_rec;
        struct list_head tx_list;
        atomic_t encrypt_pending;
        u8 async_capable:1;

/* bit numbers within tx_bitmask */
#define BIT_TX_SCHEDULED        0
#define BIT_TX_CLOSING                1
        unsigned long tx_bitmask;
};

/* Stream parser state, embedded in struct tls_sw_context_rx as 'strp'. */
struct tls_strparser {
        struct sock *sk;

        /* the three bitfields below share one u32 */
        u32 mark : 8;
        u32 stopped : 1;
        u32 copy_mode : 1;
        u32 mixed_decrypted : 1;

        bool msg_ready;

        struct strp_msg stm;

        struct sk_buff *anchor;
        struct work_struct work;
};

/*
 * Software RX context.  tls_sw_ctx_rx() obtains it by casting
 * tls_context.priv_ctx_rx.  It is also the first member ('sw') of
 * struct tls_offload_context_rx.
 */
struct tls_sw_context_rx {
        struct crypto_aead *aead_recv;
        struct crypto_wait async_wait;
        struct sk_buff_head rx_list;        /* list of decrypted 'data' records */
        void (*saved_data_ready)(struct sock *sk);

        u8 reader_present;
        u8 async_capable:1;
        u8 zc_capable:1;
        u8 reader_contended:1;
        bool key_update_pending;

        struct tls_strparser strp;

        atomic_t decrypt_pending;
        struct sk_buff_head async_hold;
        struct wait_queue_head wq;
};

/*
 * One record tracked on a list.  Its first sequence number is
 * end_seq - len (see tls_record_start_seq()); a record with len == 0 is a
 * start marker (see tls_record_is_start_marker()).
 */
struct tls_record_info {
        struct list_head list;
        u32 end_seq;
        int len;
        int num_frags; /* NOTE(review): presumably the number of frags[] in use - confirm */
        skb_frag_t frags[MAX_SKB_FRAGS];
};

/* size in bytes of tls_offload_context_tx.driver_state */
#define TLS_DRIVER_STATE_SIZE_TX        16
/*
 * Device-offload TX context.  tls_offload_ctx_tx() obtains it by casting
 * tls_context.priv_ctx_tx.
 */
struct tls_offload_context_tx {
        struct crypto_aead *aead_send;
        spinlock_t lock;        /* protects records list */
        struct list_head records_list;
        struct tls_record_info *open_record;
        struct tls_record_info *retransmit_hint;
        u64 hint_record_sn;
        u64 unacked_record_sn;

        struct scatterlist sg_tx_data[MAX_SKB_FRAGS];
        void (*sk_destruct)(struct sock *sk);
        struct work_struct destruct_work;
        struct tls_context *ctx;
        /* The TLS layer reserves room for driver specific state
         * Currently the belief is that there is not enough
         * driver specific state to justify another layer of indirection
         *
         * __tls_driver_ctx() returns a pointer to this array for the
         * TX direction.
         */
        u8 driver_state[TLS_DRIVER_STATE_SIZE_TX] __aligned(8);
};

/*
 * Bit numbers for tls_context.flags; they are used with the bitops
 * helpers, e.g. test_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags).
 */
enum tls_context_flags {
        /* tls_device_down was called after the netdev went down, device state
         * was released, and kTLS works in software, even though rx_conf is
         * still TLS_HW (needed for transition).
         */
        TLS_RX_DEV_DEGRADED = 0,
        /* Unlike RX where resync is driven entirely by the core in TX only
         * the driver knows when things went out of sync, so we need the flag
         * to be atomic.
         */
        TLS_TX_SYNC_SCHED = 1,
        /* tls_dev_del was called for the RX side, device state was released,
         * but tls_ctx->netdev might still be kept, because TX-side driver
         * resources might not be released yet. Used to prevent the second
         * tls_dev_del call in tls_device_down if it happens simultaneously.
         */
        TLS_RX_DEV_CLOSED = 2,
};

/*
 * Cipher state for one direction; struct tls_context holds one for TX
 * and one for RX.  Both buffers are sized for the largest supported values.
 */
struct cipher_context {
        char iv[TLS_MAX_IV_SIZE + TLS_MAX_SALT_SIZE]; /* room for salt + IV */
        char rec_seq[TLS_MAX_REC_SEQ_SIZE]; /* record sequence number */
};

/*
 * Storage large enough for any of the listed cipher-specific crypto info
 * structures.  'info' overlays the same storage with the generic
 * struct tls_crypto_info.
 */
union tls_crypto_context {
        struct tls_crypto_info info;
        union {
                struct tls12_crypto_info_aes_gcm_128 aes_gcm_128;
                struct tls12_crypto_info_aes_gcm_256 aes_gcm_256;
                struct tls12_crypto_info_chacha20_poly1305 chacha20_poly1305;
                struct tls12_crypto_info_sm4_gcm sm4_gcm;
                struct tls12_crypto_info_sm4_ccm sm4_ccm;
        };
};

/*
 * Protocol parameters; struct tls_context embeds one as 'prot_info' in
 * what it labels its read-only cache line.
 * NOTE(review): the *_size members are presumably byte counts - confirm.
 */
struct tls_prot_info {
        u16 version;
        u16 cipher_type;
        u16 prepend_size;
        u16 tag_size;
        u16 overhead_size;
        u16 iv_size;
        u16 salt_size;
        u16 rec_seq_size;
        u16 aad_size;
        u16 tail_size;
};

/*
 * TLS context of a socket.  tls_get_ctx() retrieves it from the socket's
 * icsk_ulp_data pointer.
 */
struct tls_context {
        /* read-only cache line */
        struct tls_prot_info prot_info;

        /* rx_conf is compared against TLS_HW in tls_is_sk_rx_device_offloaded() */
        u8 tx_conf:3;
        u8 rx_conf:3;
        u8 zerocopy_sendfile:1;
        u8 rx_no_pad:1;

        int (*push_pending_record)(struct sock *sk, int flags);
        void (*sk_write_space)(struct sock *sk);

        /* cast by tls_sw_ctx_tx() / tls_offload_ctx_tx() */
        void *priv_ctx_tx;
        /* cast by tls_sw_ctx_rx() / tls_offload_ctx_rx() */
        void *priv_ctx_rx;

        struct net_device __rcu *netdev;

        /* rw cache line */
        struct cipher_context tx;
        struct cipher_context rx;

        struct scatterlist *partially_sent_record;
        u16 partially_sent_offset;

        bool splicing_pages;
        bool pending_open_record_frags;

        struct mutex tx_lock; /* protects partially_sent_* fields and
                               * per-type TX fields
                               */
        /* bit numbers come from enum tls_context_flags */
        unsigned long flags;

        /* cache cold stuff */
        struct proto *sk_proto;
        struct sock *sk;

        void (*sk_destruct)(struct sock *sk);

        union tls_crypto_context crypto_send;
        union tls_crypto_context crypto_recv;

        struct list_head list;
        refcount_t refcount;
        struct rcu_head rcu;
};

/*
 * Direction selector passed to the tlsdev_ops callbacks and to
 * __tls_driver_ctx() / tls_driver_ctx().
 */
enum tls_offload_ctx_dir {
        TLS_OFFLOAD_CTX_DIR_RX,
        TLS_OFFLOAD_CTX_DIR_TX,
};

/*
 * Table of device callbacks.  Each callback receives the net_device and the
 * direction it applies to.
 * NOTE(review): the int-returning callbacks presumably return 0 or a negative
 * errno - confirm against the callers.
 */
struct tlsdev_ops {
        int (*tls_dev_add)(struct net_device *netdev, struct sock *sk,
                           enum tls_offload_ctx_dir direction,
                           struct tls_crypto_info *crypto_info,
                           u32 start_offload_tcp_sn);
        void (*tls_dev_del)(struct net_device *netdev,
                            struct tls_context *ctx,
                            enum tls_offload_ctx_dir direction);
        int (*tls_dev_resync)(struct net_device *netdev,
                              struct sock *sk, u32 seq, u8 *rcd_sn,
                              enum tls_offload_ctx_dir direction);
};

/*
 * RX resync strategy, stored in tls_offload_context_rx.resync_type and set
 * through tls_offload_rx_resync_set_type().  Each value selects one member
 * of the union inside struct tls_offload_context_rx.
 */
enum tls_offload_sync_type {
        TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ = 0,
        TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT = 1,
        TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC = 2,
};

#define TLS_DEVICE_RESYNC_NH_START_IVAL                2
#define TLS_DEVICE_RESYNC_NH_MAX_IVAL                128

/* number of entries in tls_offload_resync_async.log[] */
#define TLS_DEVICE_RESYNC_ASYNC_LOGMAX                13
/*
 * State for TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC resync.
 *
 * tls_offload_rx_resync_async_request_start() stores
 * (seq << 32) | (len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC in 'req' and
 * zeroes 'loglen' and 'rcd_delta'.
 * tls_offload_rx_resync_async_request_end() stores (seq << 32) | RESYNC_REQ.
 */
struct tls_offload_resync_async {
        atomic64_t req;
        u16 loglen;
        u16 rcd_delta;
        u32 log[TLS_DEVICE_RESYNC_ASYNC_LOGMAX];
};

/* size in bytes of tls_offload_context_rx.driver_state */
#define TLS_DRIVER_STATE_SIZE_RX        8
/*
 * Device-offload RX context.  tls_offload_ctx_rx() obtains it by casting
 * tls_context.priv_ctx_rx.
 */
struct tls_offload_context_rx {
        /* sw must be the first member of tls_offload_context_rx */
        struct tls_sw_context_rx sw;
        enum tls_offload_sync_type resync_type;
        /* this member is set regardless of resync_type, to avoid branches */
        u8 resync_nh_reset:1;
        /* CORE_NEXT_HINT-only member, but use the hole here */
        u8 resync_nh_do_now:1;
        union {
                /* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ */
                struct {
                        /* set to (seq << 32) | RESYNC_REQ by
                         * tls_offload_rx_resync_request()
                         */
                        atomic64_t resync_req;
                };
                /* TLS_OFFLOAD_SYNC_TYPE_CORE_NEXT_HINT */
                struct {
                        u32 decrypted_failed;
                        u32 decrypted_tgt;
                } resync_nh;
                /* TLS_OFFLOAD_SYNC_TYPE_DRIVER_REQ_ASYNC */
                struct {
                        struct tls_offload_resync_async *resync_async;
                };
        };
        /* The TLS layer reserves room for driver specific state
         * Currently the belief is that there is not enough
         * driver specific state to justify another layer of indirection
         *
         * __tls_driver_ctx() returns a pointer to this array for the
         * RX direction.
         */
        u8 driver_state[TLS_DRIVER_STATE_SIZE_RX] __aligned(8);
};

struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
                                       u32 seq, u64 *p_record_sn);

/* A record counts as a start marker when its length is zero. */
static inline bool tls_record_is_start_marker(struct tls_record_info *rec)
{
        return !rec->len;
}

/* First sequence number of @rec: its end sequence minus its length. */
static inline u32 tls_record_start_seq(struct tls_record_info *rec)
{
        u32 end = rec->end_seq;

        return end - rec->len;
}

struct sk_buff *
tls_validate_xmit_skb(struct sock *sk, struct net_device *dev,
                      struct sk_buff *skb);
struct sk_buff *
tls_validate_xmit_skb_sw(struct sock *sk, struct net_device *dev,
                         struct sk_buff *skb);

/*
 * Report whether @skb belongs to a full socket whose sk_validate_xmit_skb
 * hook is tls_validate_xmit_skb.  Always false without CONFIG_TLS_DEVICE.
 */
static inline bool tls_is_skb_tx_device_offloaded(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
        struct sock *owner = skb->sk;

        if (!owner || !sk_fullsock(owner))
                return false;

        return smp_load_acquire(&owner->sk_validate_xmit_skb) ==
               &tls_validate_xmit_skb;
#else
        return false;
#endif
}

static inline struct tls_context *tls_get_ctx(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        /* Use RCU on icsk_ulp_data only for sock diag code,
         * TLS data path doesn't need rcu_dereference().
         */
        return (__force void *)icsk->icsk_ulp_data;
}

/* View @tls_ctx's private RX context as a software RX context. */
static inline struct tls_sw_context_rx *tls_sw_ctx_rx(
                const struct tls_context *tls_ctx)
{
        struct tls_sw_context_rx *rx = tls_ctx->priv_ctx_rx;

        return rx;
}

/* View @tls_ctx's private TX context as a software TX context. */
static inline struct tls_sw_context_tx *tls_sw_ctx_tx(
                const struct tls_context *tls_ctx)
{
        struct tls_sw_context_tx *tx = tls_ctx->priv_ctx_tx;

        return tx;
}

/* View @tls_ctx's private TX context as a device-offload TX context. */
static inline struct tls_offload_context_tx *
tls_offload_ctx_tx(const struct tls_context *tls_ctx)
{
        struct tls_offload_context_tx *tx = tls_ctx->priv_ctx_tx;

        return tx;
}

/*
 * True when @sk is an inet connection socket that has a TLS context whose
 * private TX context pointer is non-NULL.
 */
static inline bool tls_sw_has_ctx_tx(const struct sock *sk)
{
        struct tls_context *ctx;

        if (sk_is_inet(sk) && inet_test_bit(IS_ICSK, sk)) {
                ctx = tls_get_ctx(sk);
                return ctx && tls_sw_ctx_tx(ctx);
        }

        return false;
}

/*
 * True when @sk is an inet connection socket that has a TLS context whose
 * private RX context pointer is non-NULL.
 */
static inline bool tls_sw_has_ctx_rx(const struct sock *sk)
{
        struct tls_context *ctx;

        if (sk_is_inet(sk) && inet_test_bit(IS_ICSK, sk)) {
                ctx = tls_get_ctx(sk);
                return ctx && tls_sw_ctx_rx(ctx);
        }

        return false;
}

/* View @tls_ctx's private RX context as a device-offload RX context. */
static inline struct tls_offload_context_rx *
tls_offload_ctx_rx(const struct tls_context *tls_ctx)
{
        struct tls_offload_context_rx *rx = tls_ctx->priv_ctx_rx;

        return rx;
}

/*
 * Return the driver-private state area of @tls_ctx for @direction: the TX
 * offload context's driver_state for TLS_OFFLOAD_CTX_DIR_TX, the RX offload
 * context's driver_state for any other value.
 */
static inline void *__tls_driver_ctx(struct tls_context *tls_ctx,
                                     enum tls_offload_ctx_dir direction)
{
        if (direction != TLS_OFFLOAD_CTX_DIR_TX)
                return tls_offload_ctx_rx(tls_ctx)->driver_state;

        return tls_offload_ctx_tx(tls_ctx)->driver_state;
}

/* Socket-based convenience wrapper around __tls_driver_ctx(). */
static inline void *
tls_driver_ctx(const struct sock *sk, enum tls_offload_ctx_dir direction)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);

        return __tls_driver_ctx(tls_ctx, direction);
}

#define RESYNC_REQ BIT(0)
#define RESYNC_REQ_ASYNC BIT(1)
/*
 * Post an RX resync request: store @seq, converted to host byte order, in
 * the upper 32 bits of resync_req, with RESYNC_REQ set in the low bits.
 *
 * The TLS context is valid until sk_destruct is called
 */
static inline void tls_offload_rx_resync_request(struct sock *sk, __be32 seq)
{
        struct tls_offload_context_rx *rx_ctx =
                tls_offload_ctx_rx(tls_get_ctx(sk));
        u64 req = ((u64)ntohl(seq) << 32) | RESYNC_REQ;

        atomic64_set(&rx_ctx->resync_req, req);
}

/*
 * Log all TLS record header TCP sequences in [seq, seq+len]
 *
 * Stores (seq << 32) | (len << 16) | RESYNC_REQ | RESYNC_REQ_ASYNC in the
 * async request word, then resets the log length and record delta.
 */
static inline void
tls_offload_rx_resync_async_request_start(struct sock *sk, __be32 seq, u16 len)
{
        struct tls_offload_context_rx *rx_ctx =
                tls_offload_ctx_rx(tls_get_ctx(sk));
        u64 req = ((u64)ntohl(seq) << 32) | ((u64)len << 16) |
                  RESYNC_REQ | RESYNC_REQ_ASYNC;

        atomic64_set(&rx_ctx->resync_async->req, req);
        rx_ctx->resync_async->loglen = 0;
        rx_ctx->resync_async->rcd_delta = 0;
}

/*
 * Replace the async request word with (seq << 32) | RESYNC_REQ, which
 * leaves the RESYNC_REQ_ASYNC bit and the length field clear.
 */
static inline void
tls_offload_rx_resync_async_request_end(struct sock *sk, __be32 seq)
{
        struct tls_offload_context_rx *rx_ctx =
                tls_offload_ctx_rx(tls_get_ctx(sk));
        u64 req = ((u64)ntohl(seq) << 32) | RESYNC_REQ;

        atomic64_set(&rx_ctx->resync_async->req, req);
}

static inline void
tls_offload_rx_resync_set_type(struct sock *sk, enum tls_offload_sync_type type)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);

        tls_offload_ctx_rx(tls_ctx)->resync_type = type;
}

/*
 * Driver's seq tracking has to be disabled until resync succeeded
 *
 * Returns whether TLS_TX_SYNC_SCHED is set in the context flags.  The bit
 * is sampled first and smp_mb__after_atomic() is issued before returning.
 */
static inline bool tls_offload_tx_resync_pending(struct sock *sk)
{
        struct tls_context *tls_ctx = tls_get_ctx(sk);
        bool pending = test_bit(TLS_TX_SYNC_SCHED, &tls_ctx->flags);

        smp_mb__after_atomic();
        return pending;
}

struct sk_buff *tls_encrypt_skb(struct sk_buff *skb);

#ifdef CONFIG_TLS_DEVICE
void tls_device_sk_destruct(struct sock *sk);
void tls_offload_tx_resync_request(struct sock *sk, u32 got_seq, u32 exp_seq);

/*
 * True when @sk is a full socket whose destructor is
 * tls_device_sk_destruct and whose TLS context has rx_conf == TLS_HW.
 */
static inline bool tls_is_sk_rx_device_offloaded(struct sock *sk)
{
        if (!sk_fullsock(sk))
                return false;
        if (smp_load_acquire(&sk->sk_destruct) != tls_device_sk_destruct)
                return false;

        return tls_get_ctx(sk)->rx_conf == TLS_HW;
}
#endif
#endif /* _TLS_OFFLOAD_H */





























































    2 







    1 













    1 


















































    2 




    1 









    1 
    2 
    1 

    1 











    2 












    2 















    1 















    1 





















    4 







   12 






    9 




    5 






















    3 


    5 










    1 
































   11 








    1 

























































    1 







    1 




































    9 











    2 































   13 

   13 


   12 







   11 









    2 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <linux/arm-smccc.h>
#include <linux/preempt.h>
#include <linux/kvm_host.h>
#include <linux/uaccess.h>
#include <linux/wait.h>

#include <asm/cputype.h>
#include <asm/kvm_emulate.h>

#include <kvm/arm_psci.h>
#include <kvm/arm_hypercalls.h>

/*
 * This is an implementation of the Power State Coordination Interface
 * as described in ARM document number ARM DEN 0022A.
 */

/*
 * Mask with every bit below affinity level @level cleared.  The whole
 * expansion is parenthesized so it stays one operand wherever it is used.
 */
#define AFFINITY_MASK(level)        (~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1))

/*
 * Return the MPIDR bits that identify a CPU at @affinity_level and above.
 *
 * Levels 0-3 yield MPIDR_HWID_BITMASK with the lower-level bits cleared.
 * Any other level yields 0, which kvm_psci_vcpu_affinity_info() reports as
 * PSCI_RET_INVALID_PARAMS.
 */
static unsigned long psci_affinity_mask(unsigned long affinity_level)
{
        if (affinity_level <= 3)
                return MPIDR_HWID_BITMASK & AFFINITY_MASK(affinity_level);

        return 0;
}

/*
 * Emulate the PSCI suspend call for @vcpu.  Always returns
 * PSCI_RET_SUCCESS.
 */
static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
{
        /*
         * NOTE: VCPU suspend is deliberately emulated exactly like WFI
         * (Wait-for-interrupt) to keep things simple.
         *
         * Interrupts are therefore the wakeup events for KVM, which is
         * consistent with the intended use of StateID described in
         * section 5.4.1 of the PSCI v0.2 specification (ARM DEN 0022A).
         *
         * A power-down request is handled the same way as a stand-by
         * request, as per section 5.4.2 clause 3 of the PSCI v0.2
         * specification (ARM DEN 0022A).  Every suspend state for KVM
         * thus preserves the register state.
         */
        kvm_vcpu_wfi(vcpu);

        return PSCI_RET_SUCCESS;
}

/*
 * An affinity value is valid when it has no bits set outside
 * MPIDR_HWID_BITMASK.  @vcpu is not used.
 */
static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu,
                                           unsigned long affinity)
{
        unsigned long stray_bits = affinity & ~MPIDR_HWID_BITMASK;

        return stray_bits == 0;
}

/*
 * Handle a CPU_ON request issued by @source_vcpu.
 *
 * SMCCC argument 1 is the target CPU id, argument 2 the entry point and
 * argument 3 the value placed in the target's r0/x0.
 *
 * Returns PSCI_RET_SUCCESS once the target has been marked runnable and
 * woken up.  Returns PSCI_RET_INVALID_PARAMS if the id has bits outside
 * MPIDR_HWID_BITMASK or matches no VCPU.  If the target is not stopped,
 * returns PSCI_RET_ALREADY_ON, or PSCI_RET_INVALID_PARAMS when the caller
 * uses PSCI v0.1.
 */
static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
{
        struct vcpu_reset_state *reset_state;
        struct kvm *kvm = source_vcpu->kvm;
        struct kvm_vcpu *vcpu = NULL;
        int ret = PSCI_RET_SUCCESS;
        unsigned long cpu_id;

        cpu_id = smccc_get_arg1(source_vcpu);
        if (!kvm_psci_valid_affinity(source_vcpu, cpu_id))
                return PSCI_RET_INVALID_PARAMS;

        vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id);

        /*
         * Make sure the caller requested a valid CPU and that the CPU is
         * turned off.
         */
        if (!vcpu)
                return PSCI_RET_INVALID_PARAMS;

        /* The stopped check and the state change below happen under the lock */
        spin_lock(&vcpu->arch.mp_state_lock);
        if (!kvm_arm_vcpu_stopped(vcpu)) {
                if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
                        ret = PSCI_RET_ALREADY_ON;
                else
                        ret = PSCI_RET_INVALID_PARAMS;

                goto out_unlock;
        }

        reset_state = &vcpu->arch.reset_state;

        /* Entry point requested by the caller */
        reset_state->pc = smccc_get_arg2(source_vcpu);

        /* Propagate caller endianness */
        reset_state->be = kvm_vcpu_is_be(source_vcpu);

        /*
         * NOTE: We always update r0 (or x0) because for PSCI v0.1
         * the general purpose registers are undefined upon CPU_ON.
         */
        reset_state->r0 = smccc_get_arg3(source_vcpu);

        reset_state->reset = true;
        kvm_make_request(KVM_REQ_VCPU_RESET, vcpu);

        /*
         * Make sure the reset request is observed if the RUNNABLE mp_state is
         * observed.
         */
        smp_wmb();

        WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
        kvm_vcpu_wake_up(vcpu);

out_unlock:
        spin_unlock(&vcpu->arch.mp_state_lock);
        /* int status is converted to the unsigned long return type */
        return ret;
}

/*
 * Handle AFFINITY_INFO: report whether any vCPU matching the target affinity
 * (argument 1), compared at the affinity level given in argument 2, is
 * running.
 *
 * Returns PSCI_0_2_AFFINITY_LEVEL_ON if at least one matching vCPU is not
 * stopped, PSCI_0_2_AFFINITY_LEVEL_OFF if all matching vCPUs are stopped,
 * and PSCI_RET_INVALID_PARAMS for an invalid affinity value, an affinity
 * level for which psci_affinity_mask() yields no mask, or no match at all.
 */
static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
{
        unsigned long wanted = smccc_get_arg1(vcpu);
        unsigned long level = smccc_get_arg2(vcpu);
        unsigned long mask, idx, aff;
        struct kvm_vcpu *candidate;
        int any_match = 0;

        if (!kvm_psci_valid_affinity(vcpu, wanted))
                return PSCI_RET_INVALID_PARAMS;

        /* Work out which affinity bits take part in the comparison. */
        mask = psci_affinity_mask(level);
        if (mask == 0)
                return PSCI_RET_INVALID_PARAMS;

        /* Bits outside the mask play no role in the match. */
        wanted &= mask;

        /* A single running match is enough to report ON. */
        kvm_for_each_vcpu(idx, candidate, vcpu->kvm) {
                aff = kvm_vcpu_get_mpidr_aff(candidate);
                if ((aff & mask) != wanted)
                        continue;

                any_match = 1;
                if (!kvm_arm_vcpu_stopped(candidate))
                        return PSCI_0_2_AFFINITY_LEVEL_ON;
        }

        if (any_match)
                return PSCI_0_2_AFFINITY_LEVEL_OFF;

        return PSCI_RET_INVALID_PARAMS;
}

/*
 * Stop every vCPU of the VM and fill in @vcpu's run structure so that the
 * exit is reported as a KVM_EXIT_SYSTEM_EVENT of the given @type, with
 * @flags as its single data word.
 */
static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type, u64 flags)
{
        struct kvm_vcpu *each;
        unsigned long idx;

        /*
         * The KVM ABI allows userspace to call KVM_RUN again after a system
         * event exit and to carry out the shutdown/reboot later than when
         * the request was actually made. A PSCI caller, however, expects
         * the system to shut down or reboot immediately, so make sure no
         * vCPU runs between the handling of this call and the
         * re-initialization of the vCPUs.
         */
        kvm_for_each_vcpu(idx, each, vcpu->kvm) {
                spin_lock(&each->arch.mp_state_lock);
                WRITE_ONCE(each->arch.mp_state.mp_state, KVM_MP_STATE_STOPPED);
                spin_unlock(&each->arch.mp_state_lock);
        }
        kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);

        /* Describe the event for userspace, starting from a clean slate. */
        vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
        memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
        vcpu->run->system_event.data[0] = flags;
        vcpu->run->system_event.ndata = 1;
        vcpu->run->system_event.type = type;
}

/* Prepare a KVM_SYSTEM_EVENT_SHUTDOWN system event exit with no flags. */
static void kvm_psci_system_off(struct kvm_vcpu *vcpu)
{
        kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN, 0);
}

/*
 * Prepare a KVM_SYSTEM_EVENT_SHUTDOWN system event exit flagged with
 * KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2.
 */
static void kvm_psci_system_off2(struct kvm_vcpu *vcpu)
{
        kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_SHUTDOWN,
                                 KVM_SYSTEM_EVENT_SHUTDOWN_FLAG_PSCI_OFF2);
}

/* Prepare a KVM_SYSTEM_EVENT_RESET system event exit with no flags. */
static void kvm_psci_system_reset(struct kvm_vcpu *vcpu)
{
        kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET, 0);
}

/*
 * Prepare a KVM_SYSTEM_EVENT_RESET system event exit flagged with
 * KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2.
 */
static void kvm_psci_system_reset2(struct kvm_vcpu *vcpu)
{
        kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET,
                                 KVM_SYSTEM_EVENT_RESET_FLAG_PSCI_RESET2);
}

/*
 * Fill in @vcpu's run structure for a KVM_SYSTEM_EVENT_SUSPEND system event
 * exit. Unlike kvm_prepare_system_event(), no vCPU is stopped here and no
 * data word is supplied.
 */
static void kvm_psci_system_suspend(struct kvm_vcpu *vcpu)
{
        struct kvm_run *run = vcpu->run;

        run->exit_reason = KVM_EXIT_SYSTEM_EVENT;

        /* Start from an all-zero event, then set its type. */
        memset(&run->system_event, 0, sizeof(run->system_event));
        run->system_event.type = KVM_SYSTEM_EVENT_SUSPEND;
}

/*
 * Truncate argument registers 1 to 3 of @vcpu to their lower 32 bits.
 */
static void kvm_psci_narrow_to_32bit(struct kvm_vcpu *vcpu)
{
        int reg = 1;

        /*
         * Only the upper halves of the input registers are cleared. The
         * registers get fully zeroed on exit anyway, so rewriting them in
         * place is harmless.
         */
        while (reg < 4) {
                vcpu_set_reg(vcpu, reg, lower_32_bits(vcpu_get_reg(vcpu, reg)));
                reg++;
        }
}

/*
 * Decide whether @vcpu may invoke PSCI function @fn.
 *
 * Returns 0 when the call is allowed, or PSCI_RET_NOT_SUPPORTED when a vCPU
 * in 32-bit mode asks for a function ID carrying the PSCI_0_2_64BIT bit.
 */
static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32 fn)
{
        /* Function IDs without the 64-bit marker are open to everyone. */
        if (!(fn & PSCI_0_2_64BIT))
                return 0;

        /* 64-bit function IDs are fine unless the caller runs in 32-bit mode. */
        if (!vcpu_mode_is_32bit(vcpu))
                return 0;

        return PSCI_RET_NOT_SUPPORTED;
}

/*
 * Dispatch a PSCI call using the PSCI v0.2 function IDs.
 *
 * The PSCI result is written to the caller's return registers with
 * smccc_set_retval(); function IDs not listed here get
 * PSCI_RET_NOT_SUPPORTED.
 *
 * Returns 1 for every call except SYSTEM_OFF and SYSTEM_RESET, which prepare
 * a system event exit and return 0.
 */
static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
{
        u32 psci_fn = smccc_get_function(vcpu);
        unsigned long val;
        int ret = 1;

        switch (psci_fn) {
        case PSCI_0_2_FN_PSCI_VERSION:
                /*
                 * Bits[31:16] = Major Version = 0
                 * Bits[15:0] = Minor Version = 2
                 */
                val = KVM_ARM_PSCI_0_2;
                break;
        case PSCI_0_2_FN_CPU_SUSPEND:
        case PSCI_0_2_FN64_CPU_SUSPEND:
                val = kvm_psci_vcpu_suspend(vcpu);
                break;
        case PSCI_0_2_FN_CPU_OFF:
                kvm_arm_vcpu_power_off(vcpu);
                val = PSCI_RET_SUCCESS;
                break;
        case PSCI_0_2_FN_CPU_ON:
                /*
                 * Non-FN64 variant: drop the upper 32 bits of the argument
                 * registers, then share the FN64 handling below.
                 */
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_0_2_FN64_CPU_ON:
                val = kvm_psci_vcpu_on(vcpu);
                break;
        case PSCI_0_2_FN_AFFINITY_INFO:
                /* Same narrowing as for the non-FN64 CPU_ON above. */
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_0_2_FN64_AFFINITY_INFO:
                val = kvm_psci_vcpu_affinity_info(vcpu);
                break;
        case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
                /*
                 * Trusted OS is MP hence does not require migration
                 * or
                 * Trusted OS is not present
                 */
                val = PSCI_0_2_TOS_MP;
                break;
        case PSCI_0_2_FN_SYSTEM_OFF:
                kvm_psci_system_off(vcpu);
                /*
                 * We shouldn't be going back to guest VCPU after
                 * receiving SYSTEM_OFF request.
                 *
                 * If user space accidentally/deliberately resumes
                 * guest VCPU after SYSTEM_OFF request then guest
                 * VCPU should see internal failure from PSCI return
                 * value. To achieve this, we preload r0 (or x0) with
                 * PSCI return value INTERNAL_FAILURE.
                 */
                val = PSCI_RET_INTERNAL_FAILURE;
                ret = 0;
                break;
        case PSCI_0_2_FN_SYSTEM_RESET:
                kvm_psci_system_reset(vcpu);
                /*
                 * Same reason as SYSTEM_OFF for preloading r0 (or x0)
                 * with PSCI return value INTERNAL_FAILURE.
                 */
                val = PSCI_RET_INTERNAL_FAILURE;
                ret = 0;
                break;
        default:
                val = PSCI_RET_NOT_SUPPORTED;
                break;
        }

        smccc_set_retval(vcpu, val, 0, 0, 0);
        return ret;
}

/*
 * Dispatch a PSCI call for a guest using PSCI v1.@minor.
 *
 * PSCI_VERSION, PSCI_FEATURES, SYSTEM_SUSPEND, SYSTEM_RESET2 and SYSTEM_OFF2
 * are handled here; every other function ID is forwarded to
 * kvm_psci_0_2_call(). Unless a path returns early, the result (which
 * defaults to PSCI_RET_NOT_SUPPORTED) is written with smccc_set_retval().
 *
 * Returns 1 normally, 0 on the SYSTEM_SUSPEND, SYSTEM_RESET2 and SYSTEM_OFF2
 * paths that prepare a system event exit, or whatever kvm_psci_0_2_call()
 * returns for forwarded calls.
 */
static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
{
        unsigned long val = PSCI_RET_NOT_SUPPORTED;
        u32 psci_fn = smccc_get_function(vcpu);
        struct kvm *kvm = vcpu->kvm;
        u32 arg;
        int ret = 1;

        switch(psci_fn) {
        case PSCI_0_2_FN_PSCI_VERSION:
                val = PSCI_VERSION(1, minor);
                break;
        case PSCI_1_0_FN_PSCI_FEATURES:
                /* Argument 1 is the function ID being queried. */
                arg = smccc_get_arg1(vcpu);
                val = kvm_psci_check_allowed_function(vcpu, arg);
                if (val)
                        break;

                val = PSCI_RET_NOT_SUPPORTED;

                /*
                 * Report 0 for the functions that are always available;
                 * the remaining ones depend on a VM flag or on @minor.
                 */
                switch(arg) {
                case PSCI_0_2_FN_PSCI_VERSION:
                case PSCI_0_2_FN_CPU_SUSPEND:
                case PSCI_0_2_FN64_CPU_SUSPEND:
                case PSCI_0_2_FN_CPU_OFF:
                case PSCI_0_2_FN_CPU_ON:
                case PSCI_0_2_FN64_CPU_ON:
                case PSCI_0_2_FN_AFFINITY_INFO:
                case PSCI_0_2_FN64_AFFINITY_INFO:
                case PSCI_0_2_FN_MIGRATE_INFO_TYPE:
                case PSCI_0_2_FN_SYSTEM_OFF:
                case PSCI_0_2_FN_SYSTEM_RESET:
                case PSCI_1_0_FN_PSCI_FEATURES:
                case ARM_SMCCC_VERSION_FUNC_ID:
                        val = 0;
                        break;
                case PSCI_1_0_FN_SYSTEM_SUSPEND:
                case PSCI_1_0_FN64_SYSTEM_SUSPEND:
                        if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags))
                                val = 0;
                        break;
                case PSCI_1_1_FN_SYSTEM_RESET2:
                case PSCI_1_1_FN64_SYSTEM_RESET2:
                        if (minor >= 1)
                                val = 0;
                        break;
                case PSCI_1_3_FN_SYSTEM_OFF2:
                case PSCI_1_3_FN64_SYSTEM_OFF2:
                        /* The feature word advertises the supported off type. */
                        if (minor >= 3)
                                val = PSCI_1_3_OFF_TYPE_HIBERNATE_OFF;
                        break;
                }
                break;
        case PSCI_1_0_FN_SYSTEM_SUSPEND:
                /* Non-FN64 variant: narrow the arguments, then share the path. */
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_1_0_FN64_SYSTEM_SUSPEND:
                /*
                 * Return directly to userspace without changing the vCPU's
                 * registers. Userspace depends on reading the SMCCC parameters
                 * to implement SYSTEM_SUSPEND.
                 */
                if (test_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags)) {
                        kvm_psci_system_suspend(vcpu);
                        return 0;
                }
                /* Not enabled for this VM: val is still NOT_SUPPORTED. */
                break;
        case PSCI_1_1_FN_SYSTEM_RESET2:
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_1_1_FN64_SYSTEM_RESET2:
                if (minor >= 1) {
                        arg = smccc_get_arg1(vcpu);

                        /*
                         * Accept reset types up to SYSTEM_WARM_RESET and
                         * anything from VENDOR_START upwards.
                         */
                        if (arg <= PSCI_1_1_RESET_TYPE_SYSTEM_WARM_RESET ||
                            arg >= PSCI_1_1_RESET_TYPE_VENDOR_START) {
                                kvm_psci_system_reset2(vcpu);
                                /*
                                 * Only register 0 is written here; the
                                 * smccc_set_retval() call at the end of
                                 * the function is bypassed.
                                 */
                                vcpu_set_reg(vcpu, 0, PSCI_RET_INTERNAL_FAILURE);
                                return 0;
                        }

                        val = PSCI_RET_INVALID_PARAMS;
                        break;
                }
                break;
        case PSCI_1_3_FN_SYSTEM_OFF2:
                kvm_psci_narrow_to_32bit(vcpu);
                fallthrough;
        case PSCI_1_3_FN64_SYSTEM_OFF2:
                /* Below v1.3 the call stays NOT_SUPPORTED. */
                if (minor < 3)
                        break;

                arg = smccc_get_arg1(vcpu);
                /*
                 * SYSTEM_OFF2 defaults to HIBERNATE_OFF if arg1 is zero. arg2
                 * must be zero.
                 */
                if ((arg && arg != PSCI_1_3_OFF_TYPE_HIBERNATE_OFF) ||
                    smccc_get_arg2(vcpu) != 0) {
                        val = PSCI_RET_INVALID_PARAMS;
                        break;
                }
                kvm_psci_system_off2(vcpu);
                /*
                 * We shouldn't be going back to the guest after receiving a
                 * SYSTEM_OFF2 request. Preload a return value of
                 * INTERNAL_FAILURE should userspace ignore the exit and resume
                 * the vCPU.
                 */
                val = PSCI_RET_INTERNAL_FAILURE;
                ret = 0;
                break;
        default:
                /* Everything else is shared with the v0.2 handler. */
                return kvm_psci_0_2_call(vcpu);
        }

        smccc_set_retval(vcpu, val, 0, 0, 0);
        return ret;
}

/*
 * Dispatch a PSCI v0.1 call. Only CPU_OFF and CPU_ON are implemented; any
 * other function ID yields PSCI_RET_NOT_SUPPORTED. The result is written
 * with smccc_set_retval() and the function always returns 1.
 */
static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
{
        unsigned long result = PSCI_RET_NOT_SUPPORTED;
        u32 fn = smccc_get_function(vcpu);

        if (fn == KVM_PSCI_FN_CPU_OFF) {
                kvm_arm_vcpu_power_off(vcpu);
                result = PSCI_RET_SUCCESS;
        } else if (fn == KVM_PSCI_FN_CPU_ON) {
                result = kvm_psci_vcpu_on(vcpu);
        }

        smccc_set_retval(vcpu, result, 0, 0, 0);
        return 1;
}

/**
 * kvm_psci_call - dispatch a guest PSCI call to the matching version handler
 * @vcpu: Pointer to the VCPU struct
 *
 * Handle PSCI calls from guests through traps from HVC instructions.
 * The calling convention is similar to SMC calls to the secure world
 * where the function number is placed in r0.
 *
 * A call that kvm_psci_check_allowed_function() rejects, or a PSCI version
 * this function does not know about, is answered directly in the guest's
 * return registers and the guest is resumed.
 *
 * Return: > 0 when the call was handled and the guest can be resumed, 0 when
 * it was handled but requires an exit to user space. No path visible in this
 * function produces a negative value.
 */
int kvm_psci_call(struct kvm_vcpu *vcpu)
{
        u32 fn = smccc_get_function(vcpu);
        int version = kvm_psci_version(vcpu);
        unsigned long denied = kvm_psci_check_allowed_function(vcpu, fn);
        u32 minor;

        if (denied) {
                smccc_set_retval(vcpu, denied, 0, 0, 0);
                return 1;
        }

        /* v0.x has dedicated handlers; all v1.x share one, keyed by minor. */
        switch (version) {
        case KVM_ARM_PSCI_0_1:
                return kvm_psci_0_1_call(vcpu);
        case KVM_ARM_PSCI_0_2:
                return kvm_psci_0_2_call(vcpu);
        case KVM_ARM_PSCI_1_0:
                minor = 0;
                break;
        case KVM_ARM_PSCI_1_1:
                minor = 1;
                break;
        case KVM_ARM_PSCI_1_2:
                minor = 2;
                break;
        case KVM_ARM_PSCI_1_3:
                minor = 3;
                break;
        default:
                WARN_ONCE(1, "Unknown PSCI version %d", version);
                smccc_set_retval(vcpu, SMCCC_RET_NOT_SUPPORTED, 0, 0, 0);
                return 1;
        }

        return kvm_psci_1_x_call(vcpu, minor);
}










































































  818 
   18 




















































































   18 






































   39 
































   39 


































































































  441 
  322 






























  442 







  653 






















  553 
  253 








  656 
























  527 
  654 
  657 


  307 
  654 


















































































































































































































































































































  309 








































   82 
  159 






   82 




  503 




  503 


















































   54 











   96 




   96 






































































































































































  818 





































  818 






















  816 

  818 





  811 









































































































































































































  112 

  117 








  110 













   55 





















































































































































































































































































  650 


  215 

  105 






  698 



































  723 










  401 


























  380 
  653 
  266 
  656 

















  110 
  110 













   55 
   55 











   84 
   84 



































































   17 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_PGTABLE_H
#define __ASM_PGTABLE_H

#include <asm/bug.h>
#include <asm/proc-fns.h>

#include <asm/memory.h>
#include <asm/mte.h>
#include <asm/pgtable-hwdef.h>
#include <asm/pgtable-prot.h>
#include <asm/tlbflush.h>

/*
 * VMALLOC range.
 *
 * VMALLOC_START: beginning of the kernel vmalloc space
 * VMALLOC_END: extends to the available space below vmemmap
 *
 * In both configurations VMALLOC_END stops SZ_8M short of the limit it is
 * derived from.
 */
#define VMALLOC_START                (MODULES_END)
#if VA_BITS == VA_BITS_MIN
#define VMALLOC_END                (VMEMMAP_START - SZ_8M)
#else
/*
 * Number of pages between PAGE_OFFSET and _PAGE_OFFSET(vabits_actual).
 * VMALLOC_END is moved up past that many struct page slots from
 * VMEMMAP_START.
 * NOTE(review): presumably these are vmemmap entries that go unused when the
 * runtime VA size is smaller than VA_BITS - confirm.
 */
#define VMEMMAP_UNUSED_NPAGES        ((_PAGE_OFFSET(vabits_actual) - PAGE_OFFSET) >> PAGE_SHIFT)
#define VMALLOC_END                (VMEMMAP_START + VMEMMAP_UNUSED_NPAGES * sizeof(struct page) - SZ_8M)
#endif

/*
 * struct page pointer derived from VMEMMAP_START, moved down by the page
 * frame number of memstart_addr.
 */
#define vmemmap                        ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))

#ifndef __ASSEMBLY__

#include <asm/cmpxchg.h>
#include <asm/fixmap.h>
#include <asm/por.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/page_table_check.h>

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE

/*
 * Set stride and tlb_level in flush_*_tlb_range: the PMD variant passes
 * PMD_SIZE and level 2, the PUD variant passes PUD_SIZE and level 1.
 * NOTE(review): the `false` argument is presumably the "last level only"
 * selector of __flush_tlb_range() - confirm against its definition.
 */
#define flush_pmd_tlb_range(vma, addr, end)        \
        __flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
#define flush_pud_tlb_range(vma, addr, end)        \
        __flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Outside of a few very special situations (e.g. hibernation), we always
 * use broadcast TLB invalidation instructions, therefore a spurious page
 * fault on one CPU which has been handled concurrently by another CPU
 * does not need to perform additional invalidation.
 *
 * The macro therefore expands to an empty statement.
 */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)

/*
 * ZERO_PAGE is a global shared page that is always zero: used
 * for zero-mapped memory areas etc..
 *
 * The vaddr argument is not used: every address maps to the struct page of
 * empty_zero_page.
 */
extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
#define ZERO_PAGE(vaddr)        phys_to_page(__pa_symbol(empty_zero_page))

/* Log the source location and the raw value of a bad pte via pr_err(). */
#define pte_ERROR(e)        \
        pr_err("%s:%d: bad pte %016llx.\n", __FILE__, __LINE__, pte_val(e))

#ifdef CONFIG_ARM64_PA_BITS_52
/*
 * Extract the physical address from a pte when 52-bit physical addresses
 * are configured: the PTE_ADDR_HIGH field is shifted up by
 * PTE_ADDR_HIGH_SHIFT and merged with the PTE_ADDR_LOW field.
 */
static inline phys_addr_t __pte_to_phys(pte_t pte)
{
        /* PTE_MAYBE_SHARED bits must not leak into the address. */
        pteval_t raw = pte_val(pte) & ~PTE_MAYBE_SHARED;
        pteval_t low = raw & PTE_ADDR_LOW;
        pteval_t high = raw & PTE_ADDR_HIGH;

        return low | (high << PTE_ADDR_HIGH_SHIFT);
}

/*
 * Inverse of __pte_to_phys(): fold the high physical address bits down by
 * PTE_ADDR_HIGH_SHIFT and keep only the bits in PHYS_TO_PTE_ADDR_MASK.
 */
static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
{
        phys_addr_t folded = phys | (phys >> PTE_ADDR_HIGH_SHIFT);

        return folded & PHYS_TO_PTE_ADDR_MASK;
}
#else
/* Without 52-bit PAs the address is simply the PTE_ADDR_LOW field. */
static inline phys_addr_t __pte_to_phys(pte_t pte)
{
        pteval_t raw = pte_val(pte);

        return raw & PTE_ADDR_LOW;
}

/* Without 52-bit PAs a physical address is used in a pte unchanged. */
static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
{
        return phys;
}
#endif

#define pte_pfn(pte)                (__pte_to_phys(pte) >> PAGE_SHIFT)
#define pfn_pte(pfn,prot)        \
        __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))

#define pte_none(pte)                (!pte_val(pte))
#define __pte_clear(mm, addr, ptep) \
                                __set_pte(ptep, __pte(0))
#define pte_page(pte)                (pfn_to_page(pte_pfn(pte)))

/*
 * The following only work if pte_present(). Undefined behaviour otherwise.
 *
 * pte_present() itself is true for a valid pte and for a "present but
 * invalid" one. Each remaining accessor tests a single bit of the raw pte
 * value, except pte_tagged(), which compares the attribute index field.
 * Note that pte_user_exec() is true when PTE_UXN is clear.
 */
#define pte_present(pte)        (pte_valid(pte) || pte_present_invalid(pte))
#define pte_young(pte)                (!!(pte_val(pte) & PTE_AF))
#define pte_special(pte)        (!!(pte_val(pte) & PTE_SPECIAL))
#define pte_write(pte)                (!!(pte_val(pte) & PTE_WRITE))
#define pte_rdonly(pte)                (!!(pte_val(pte) & PTE_RDONLY))
#define pte_user(pte)                (!!(pte_val(pte) & PTE_USER))
#define pte_user_exec(pte)        (!(pte_val(pte) & PTE_UXN))
#define pte_cont(pte)                (!!(pte_val(pte) & PTE_CONT))
#define pte_devmap(pte)                (!!(pte_val(pte) & PTE_DEVMAP))
#define pte_tagged(pte)                ((pte_val(pte) & PTE_ATTRINDX_MASK) == \
                                 PTE_ATTRINDX(MT_NORMAL_TAGGED))

/*
 * Return the end of the current contiguous-pte range: the next
 * CONT_PTE_SIZE boundary above addr, or end if that comes first. Both sides
 * of the comparison are decremented before comparing, so a boundary that
 * wrapped around to 0 (unsigned long arithmetic) compares as the largest
 * value and end is returned. Note that end is evaluated more than once.
 */
#define pte_cont_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK;        \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);                        \
})

/* Same as pte_cont_addr_end(), using CONT_PMD_SIZE / CONT_PMD_MASK. */
#define pmd_cont_addr_end(addr, end)                                                \
({        unsigned long __boundary = ((addr) + CONT_PMD_SIZE) & CONT_PMD_MASK;        \
        (__boundary - 1 < (end) - 1) ? __boundary : (end);                        \
})

/*
 * Dirty state: "hardware dirty" means writable and not read-only, "software
 * dirty" means the PTE_DIRTY bit is set; pte_dirty() is true for either.
 */
#define pte_hw_dirty(pte)        (pte_write(pte) && !pte_rdonly(pte))
#define pte_sw_dirty(pte)        (!!(pte_val(pte) & PTE_DIRTY))
#define pte_dirty(pte)                (pte_sw_dirty(pte) || pte_hw_dirty(pte))

#define pte_valid(pte)                (!!(pte_val(pte) & PTE_VALID))
/* PTE_PRESENT_INVALID is set while PTE_VALID is clear. */
#define pte_present_invalid(pte) \
        ((pte_val(pte) & (PTE_VALID | PTE_PRESENT_INVALID)) == PTE_PRESENT_INVALID)
/*
 * Execute-only user mappings do not have the PTE_USER bit set. All valid
 * kernel mappings have the PTE_UXN bit set.
 */
#define pte_valid_not_user(pte) \
        ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN))
/*
 * Returns true if the pte is valid and has the contiguous bit set.
 */
#define pte_valid_cont(pte)        (pte_valid(pte) && pte_cont(pte))
/*
 * Could the pte be present in the TLB? We must check mm_tlb_flush_pending
 * so that we don't erroneously return false for pages that have been
 * remapped as PROT_NONE but are yet to be flushed from the TLB.
 * Note that we can't make any assumptions based on the state of the access
 * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the
 * TLB.
 */
#define pte_accessible(mm, pte)        \
        (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))

/*
 * Check the current POR_EL0 value for the access being attempted with
 * @pkey. @write takes precedence over @execute; if neither is set the read
 * permission is checked. Always true when the system does not support POE.
 */
static inline bool por_el0_allows_pkey(u8 pkey, bool write, bool execute)
{
        u64 overlay;
        bool allowed;

        if (!system_supports_poe())
                return true;

        overlay = read_sysreg_s(SYS_POR_EL0);

        if (write)
                allowed = por_elx_allows_write(overlay, pkey);
        else if (execute)
                allowed = por_elx_allows_exec(overlay, pkey);
        else
                allowed = por_elx_allows_read(overlay, pkey);

        return allowed;
}

/*
 * p??_access_permitted() is true for valid user mappings (PTE_USER
 * bit set, subject to the write permission check). It must return false
 * for execute-only mappings, such as PROT_EXEC with EPAN (neither PTE_USER
 * nor PTE_UXN set). PROT_NONE mappings do not have the PTE_VALID bit set.
 *
 * pte_access_permitted_no_overlay() performs only the checks above.
 * pte_access_permitted() additionally consults por_el0_allows_pkey() with
 * the overlay index taken from the PTE_PO_IDX_MASK field; execute is always
 * passed as false. The pmd/pud variants convert to a pte and reuse the pte
 * check.
 */
#define pte_access_permitted_no_overlay(pte, write) \
        (((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) && (!(write) || pte_write(pte)))
#define pte_access_permitted(pte, write) \
        (pte_access_permitted_no_overlay(pte, write) && \
        por_el0_allows_pkey(FIELD_GET(PTE_PO_IDX_MASK, pte_val(pte)), write, false))
#define pmd_access_permitted(pmd, write) \
        (pte_access_permitted(pmd_pte(pmd), (write)))
#define pud_access_permitted(pud, write) \
        (pte_access_permitted(pud_pte(pud), (write)))

/* Return a copy of @pte with every bit present in @prot cleared. */
static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot)
{
        pte_val(pte) = pte_val(pte) & ~pgprot_val(prot);

        return pte;
}

/* Return a copy of @pte with every bit present in @prot set. */
static inline pte_t set_pte_bit(pte_t pte, pgprot_t prot)
{
        pte_val(pte) = pte_val(pte) | pgprot_val(prot);

        return pte;
}

/* Return a copy of @pmd with every bit present in @prot cleared. */
static inline pmd_t clear_pmd_bit(pmd_t pmd, pgprot_t prot)
{
        pmd_val(pmd) = pmd_val(pmd) & ~pgprot_val(prot);

        return pmd;
}

/* Return a copy of @pmd with every bit present in @prot set. */
static inline pmd_t set_pmd_bit(pmd_t pmd, pgprot_t prot)
{
        pmd_val(pmd) = pmd_val(pmd) | pgprot_val(prot);

        return pmd;
}

/* Make @pte writable: set PTE_WRITE, then clear PTE_RDONLY. */
static inline pte_t pte_mkwrite_novma(pte_t pte)
{
        pte_t writable = set_pte_bit(pte, __pgprot(PTE_WRITE));

        return clear_pte_bit(writable, __pgprot(PTE_RDONLY));
}

/* Mark @pte clean: clear PTE_DIRTY, then set PTE_RDONLY. */
static inline pte_t pte_mkclean(pte_t pte)
{
        pte_t undirtied = clear_pte_bit(pte, __pgprot(PTE_DIRTY));

        return set_pte_bit(undirtied, __pgprot(PTE_RDONLY));
}

/*
 * Mark 'pte' dirty: always set the software PTE_DIRTY bit and, when the
 * entry is writable, also clear PTE_RDONLY.
 */
static inline pte_t pte_mkdirty(pte_t pte)
{
        pte_t dirty = set_pte_bit(pte, __pgprot(PTE_DIRTY));

        if (!pte_write(dirty))
                return dirty;

        return clear_pte_bit(dirty, __pgprot(PTE_RDONLY));
}

/*
 * Write-protect 'pte': clear PTE_WRITE and set PTE_RDONLY. Dirty state held
 * only in the hardware encoding is first copied into the software bit so it
 * is not lost.
 */
static inline pte_t pte_wrprotect(pte_t pte)
{
        pte_t prot = pte;

        /*
         * Hardware-dirty means PTE_WRITE/DBM set and PTE_RDONLY clear;
         * record that as PTE_DIRTY before both bits are rewritten below.
         */
        if (pte_hw_dirty(pte))
                prot = set_pte_bit(prot, __pgprot(PTE_DIRTY));

        prot = clear_pte_bit(prot, __pgprot(PTE_WRITE));

        return set_pte_bit(prot, __pgprot(PTE_RDONLY));
}

/* Clear the access flag (PTE_AF) in 'pte'. */
static inline pte_t pte_mkold(pte_t pte)
{
        pte = clear_pte_bit(pte, __pgprot(PTE_AF));

        return pte;
}

/* Set the access flag (PTE_AF) in 'pte'. */
static inline pte_t pte_mkyoung(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_AF));

        return pte;
}

/* Set PTE_SPECIAL in 'pte'. */
static inline pte_t pte_mkspecial(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_SPECIAL));

        return pte;
}

/* Set the contiguous hint bit (PTE_CONT) in 'pte'. */
static inline pte_t pte_mkcont(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_CONT));

        return pte;
}

/* Clear the contiguous hint bit (PTE_CONT) in 'pte'. */
static inline pte_t pte_mknoncont(pte_t pte)
{
        pte = clear_pte_bit(pte, __pgprot(PTE_CONT));

        return pte;
}

/* Set PTE_VALID in 'pte'. */
static inline pte_t pte_mkvalid(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_VALID));

        return pte;
}

/*
 * Turn 'pte' into a present-but-invalid entry: set PTE_PRESENT_INVALID and
 * clear PTE_VALID.
 */
static inline pte_t pte_mkinvalid(pte_t pte)
{
        pte_t marked = set_pte_bit(pte, __pgprot(PTE_PRESENT_INVALID));

        return clear_pte_bit(marked, __pgprot(PTE_VALID));
}

/* Set the contiguous hint bit (PMD_SECT_CONT) in 'pmd'. */
static inline pmd_t pmd_mkcont(pmd_t pmd)
{
        pmd_val(pmd) |= PMD_SECT_CONT;

        return pmd;
}

/* Set both PTE_DEVMAP and PTE_SPECIAL in 'pte'. */
static inline pte_t pte_mkdevmap(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL));

        return pte;
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Return 1 if PTE_UFFD_WP is set in 'pte', 0 otherwise. */
static inline int pte_uffd_wp(pte_t pte)
{
        return (pte_val(pte) & PTE_UFFD_WP) != 0;
}

/* Set PTE_UFFD_WP in 'pte' and write-protect the result. */
static inline pte_t pte_mkuffd_wp(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_UFFD_WP));

        return pte_wrprotect(pte);
}

/* Clear PTE_UFFD_WP in 'pte'; the write permission bits are left alone. */
static inline pte_t pte_clear_uffd_wp(pte_t pte)
{
        pte = clear_pte_bit(pte, __pgprot(PTE_UFFD_WP));

        return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

/*
 * Store 'pte' into '*ptep' with WRITE_ONCE(). No barrier is issued here;
 * __set_pte() wraps this and adds barriers for the case that needs them.
 */
static inline void __set_pte_nosync(pte_t *ptep, pte_t pte)
{
        WRITE_ONCE(*ptep, pte);
}

/*
 * Store 'pte' into '*ptep' and, for a valid kernel (non-user) entry, follow
 * the store with dsb(ishst) + isb().
 */
static inline void __set_pte(pte_t *ptep, pte_t pte)
{
        __set_pte_nosync(ptep, pte);

        /*
         * Barriers are only needed here when the new pte is valid and
         * kernel; in every other case TLB maintenance or update_mmu_cache()
         * already provide them.
         */
        if (!pte_valid_not_user(pte))
                return;

        dsb(ishst);
        isb();
}

/* Read the entry at 'ptep' exactly once (READ_ONCE) and return it by value. */
static inline pte_t __ptep_get(pte_t *ptep)
{
        return READ_ONCE(*ptep);
}

/* Declaration only; called from __sync_cache_and_tags() for user-exec ptes. */
extern void __sync_icache_dcache(pte_t pteval);
/* Declaration only; used by __check_safe_pte_update() on old -> new values. */
bool pgattr_change_is_safe(pteval_t old, pteval_t new);

/*
 * PTE bits configuration in the presence of hardware Dirty Bit Management
 * (PTE_WRITE == PTE_DBM):
 *
 * Dirty  Writable | PTE_RDONLY  PTE_WRITE  PTE_DIRTY (sw)
 *   0      0      |   1           0          0
 *   0      1      |   1           1          0
 *   1      0      |   1           0          1
 *   1      1      |   0           1          x
 *
 * When hardware DBM is not present, the software PTE_DIRTY bit is updated via
 * the page fault mechanism. Checking the dirty status of a pte becomes:
 *
 *   PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY)
 */

/*
 * Debug-only (CONFIG_DEBUG_VM) sanity check run before one valid pte is
 * overwritten by another valid pte. Warns once if the new value clears the
 * access flag, drops dirty state from a previously writable entry, or makes
 * an attribute change that pgattr_change_is_safe() rejects.
 */
static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep,
                                           pte_t pte)
{
        pte_t old_pte;

        if (!IS_ENABLED(CONFIG_DEBUG_VM))
                return;

        old_pte = __ptep_get(ptep);

        /*
         * Nothing to check unless both the old and the new entry are valid.
         * Also skip an mm that is not the active one and has at most one
         * user.
         */
        if (!pte_valid(old_pte) || !pte_valid(pte) ||
            (mm != current->active_mm && atomic_read(&mm->mm_users) <= 1))
                return;

        /*
         * Check for potential race with hardware updates of the pte
         * (__ptep_set_access_flags safely changes valid ptes without going
         * through an invalid entry).
         */
        VM_WARN_ONCE(!pte_young(pte),
                     "%s: racy access flag clearing: 0x%016llx -> 0x%016llx",
                     __func__, pte_val(old_pte), pte_val(pte));
        VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte),
                     "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx",
                     __func__, pte_val(old_pte), pte_val(pte));
        VM_WARN_ONCE(!pgattr_change_is_safe(pte_val(old_pte), pte_val(pte)),
                     "%s: unsafe attribute change: 0x%016llx -> 0x%016llx",
                     __func__, pte_val(old_pte), pte_val(pte));
}

/*
 * Cache and MTE tag maintenance performed before 'pte' (covering 'nr_pages'
 * pages) is installed.
 */
static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages)
{
        /* Present, user-executable, non-special mappings get an I/D sync. */
        if (pte_present(pte)) {
                if (pte_user_exec(pte) && !pte_special(pte))
                        __sync_icache_dcache(pte);
        }

        /*
         * If the PTE would provide user space access to the tags associated
         * with it then ensure that the MTE tags are synchronised.  Although
         * pte_access_permitted_no_overlay() returns false for exec only
         * mappings, they don't expose tags (instruction fetches don't check
         * tags).
         */
        if (!system_supports_mte())
                return;
        if (!pte_access_permitted_no_overlay(pte, false))
                return;
        if (pte_special(pte) || !pte_tagged(pte))
                return;

        mte_sync_tags(pte, nr_pages);
}

/*
 * Select all bits except the pfn: build a pte that carries only the pfn of
 * 'pte' and XOR it with the original so the pfn bits cancel out.
 */
#define pte_pgprot pte_pgprot
static inline pgprot_t pte_pgprot(pte_t pte)
{
        unsigned long pfn = pte_pfn(pte);
        pte_t pfn_only = pfn_pte(pfn, __pgprot(0));

        return __pgprot(pte_val(pfn_only) ^ pte_val(pte));
}

/* Return a copy of 'pte' whose pfn is advanced by 'nr'; other bits are kept. */
#define pte_advance_pfn pte_advance_pfn
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        pte_t next = pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte));

        return next;
}

/*
 * Install 'nr' consecutive ptes starting at 'ptep'. The first entry is 'pte'
 * and each following entry has its pfn advanced by one. The page-table check
 * hook and cache/tag maintenance run once for the whole range, up front.
 *
 * The first entry is always written before 'nr' is tested.
 */
static inline void __set_ptes(struct mm_struct *mm,
                              unsigned long __always_unused addr,
                              pte_t *ptep, pte_t pte, unsigned int nr)
{
        page_table_check_ptes_set(mm, ptep, pte, nr);
        __sync_cache_and_tags(pte, nr);

        __check_safe_pte_update(mm, ptep, pte);
        __set_pte(ptep, pte);

        while (--nr != 0) {
                ptep++;
                pte = pte_advance_pfn(pte, 1);

                __check_safe_pte_update(mm, ptep, pte);
                __set_pte(ptep, pte);
        }
}

/*
 * Hugetlb definitions.
 *
 * HPAGE_SHIFT is PMD_SHIFT, so HPAGE_SIZE/HPAGE_MASK/HUGETLB_PAGE_ORDER
 * below all describe a PMD-sized huge page.
 */
#define HUGE_MAX_HSTATE                4
#define HPAGE_SHIFT                PMD_SHIFT
#define HPAGE_SIZE                (_AC(1, UL) << HPAGE_SHIFT)
#define HPAGE_MASK                (~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER        (HPAGE_SHIFT - PAGE_SHIFT)

/* Reinterpret the raw value of a pgd entry as a pte_t. */
static inline pte_t pgd_pte(pgd_t pgd)
{
        pte_t pte = __pte(pgd_val(pgd));

        return pte;
}

/* Reinterpret the raw value of a p4d entry as a pte_t. */
static inline pte_t p4d_pte(p4d_t p4d)
{
        pte_t pte = __pte(p4d_val(p4d));

        return pte;
}

/* Reinterpret the raw value of a pud entry as a pte_t. */
static inline pte_t pud_pte(pud_t pud)
{
        pte_t pte = __pte(pud_val(pud));

        return pte;
}

/* Reinterpret the raw value of a pte as a pud_t. */
static inline pud_t pte_pud(pte_t pte)
{
        pud_t pud = __pud(pte_val(pte));

        return pud;
}

/* Reinterpret the raw value of a pud entry as a pmd_t. */
static inline pmd_t pud_pmd(pud_t pud)
{
        pmd_t pmd = __pmd(pud_val(pud));

        return pmd;
}

/* Reinterpret the raw value of a pmd entry as a pte_t. */
static inline pte_t pmd_pte(pmd_t pmd)
{
        pte_t pte = __pte(pmd_val(pmd));

        return pte;
}

/* Reinterpret the raw value of a pte as a pmd_t. */
static inline pmd_t pte_pmd(pte_t pte)
{
        pmd_t pmd = __pmd(pte_val(pte));

        return pmd;
}

/* Replace the PUD_TYPE_MASK field of 'prot' with PUD_TYPE_SECT. */
static inline pgprot_t mk_pud_sect_prot(pgprot_t prot)
{
        pgprot_t sect = __pgprot((pgprot_val(prot) & ~PUD_TYPE_MASK) |
                                 PUD_TYPE_SECT);

        return sect;
}

/* Replace the PMD_TYPE_MASK field of 'prot' with PMD_TYPE_SECT. */
static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot)
{
        pgprot_t sect = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) |
                                 PMD_TYPE_SECT);

        return sect;
}

/* Set PTE_SWP_EXCLUSIVE in a swap pte. */
static inline pte_t pte_swp_mkexclusive(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));

        return pte;
}

/*
 * Report whether PTE_SWP_EXCLUSIVE is set in a swap pte.
 *
 * The masked value is normalised with !! so the result is exactly 0 or 1
 * and does not depend on the bit surviving the narrowing conversion to int.
 * This matches how pte_swp_uffd_wp() and pte_uffd_wp() report their bits.
 */
static inline int pte_swp_exclusive(pte_t pte)
{
        return !!(pte_val(pte) & PTE_SWP_EXCLUSIVE);
}

/* Clear PTE_SWP_EXCLUSIVE in a swap pte. */
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
{
        pte = clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));

        return pte;
}

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* Set PTE_SWP_UFFD_WP in a swap pte. */
static inline pte_t pte_swp_mkuffd_wp(pte_t pte)
{
        pte = set_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP));

        return pte;
}

/* Return 1 if PTE_SWP_UFFD_WP is set in a swap pte, 0 otherwise. */
static inline int pte_swp_uffd_wp(pte_t pte)
{
        return (pte_val(pte) & PTE_SWP_UFFD_WP) != 0;
}

/* Clear PTE_SWP_UFFD_WP in a swap pte. */
static inline pte_t pte_swp_clear_uffd_wp(pte_t pte)
{
        pte = clear_pte_bit(pte, __pgprot(PTE_SWP_UFFD_WP));

        return pte;
}
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#ifdef CONFIG_NUMA_BALANCING
/*
 * See the comment in include/linux/pgtable.h
 */
static inline int pte_protnone(pte_t pte)
{
        /*
         * pte_present_invalid() tells us that the pte is invalid from the
         * HW perspective but present from the SW perspective, so the fields
         * are to be interpreted as per the HW layout.
         */
        if (!pte_present_invalid(pte))
                return 0;

        /*
         * "Neither user nor user-exec" is the unique encoding that we use
         * for PROT_NONE. The check above is not sufficient on its own
         * because we share the same encoding scheme with pmds which support
         * pmd_mkinvalid(), so can be present-invalid without being
         * PROT_NONE.
         */
        return !pte_user(pte) && !pte_user_exec(pte);
}

/* PMD-level PROT_NONE test: convert to a pte and reuse pte_protnone(). */
static inline int pmd_protnone(pmd_t pmd)
{
        int protnone = pte_protnone(pmd_pte(pmd));

        return protnone;
}
#endif

/*
 * PMD-level predicates and modifiers. Each one converts the pmd to a pte
 * with pmd_pte(), applies the matching pte_*() helper and, for modifiers,
 * converts the result back with pte_pmd().
 */
#define pmd_present(pmd)        pte_present(pmd_pte(pmd))
#define pmd_dirty(pmd)                pte_dirty(pmd_pte(pmd))
#define pmd_young(pmd)                pte_young(pmd_pte(pmd))
#define pmd_valid(pmd)                pte_valid(pmd_pte(pmd))
#define pmd_user(pmd)                pte_user(pmd_pte(pmd))
#define pmd_user_exec(pmd)        pte_user_exec(pmd_pte(pmd))
#define pmd_cont(pmd)                pte_cont(pmd_pte(pmd))
#define pmd_wrprotect(pmd)        pte_pmd(pte_wrprotect(pmd_pte(pmd)))
#define pmd_mkold(pmd)                pte_pmd(pte_mkold(pmd_pte(pmd)))
#define pmd_mkwrite_novma(pmd)        pte_pmd(pte_mkwrite_novma(pmd_pte(pmd)))
#define pmd_mkclean(pmd)        pte_pmd(pte_mkclean(pmd_pte(pmd)))
#define pmd_mkdirty(pmd)        pte_pmd(pte_mkdirty(pmd_pte(pmd)))
#define pmd_mkyoung(pmd)        pte_pmd(pte_mkyoung(pmd_pte(pmd)))
#define pmd_mkinvalid(pmd)        pte_pmd(pte_mkinvalid(pmd_pte(pmd)))
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
/* userfaultfd write-protect helpers, for present and swap pmds. */
#define pmd_uffd_wp(pmd)        pte_uffd_wp(pmd_pte(pmd))
#define pmd_mkuffd_wp(pmd)        pte_pmd(pte_mkuffd_wp(pmd_pte(pmd)))
#define pmd_clear_uffd_wp(pmd)        pte_pmd(pte_clear_uffd_wp(pmd_pte(pmd)))
#define pmd_swp_uffd_wp(pmd)        pte_swp_uffd_wp(pmd_pte(pmd))
#define pmd_swp_mkuffd_wp(pmd)        pte_pmd(pte_swp_mkuffd_wp(pmd_pte(pmd)))
#define pmd_swp_clear_uffd_wp(pmd) \
                                pte_pmd(pte_swp_clear_uffd_wp(pmd_pte(pmd)))
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#define pmd_write(pmd)                pte_write(pmd_pte(pmd))

/*
 * Give 'pmd' the section (block) type encoding while leaving PTE_VALID
 * exactly as it was on entry.
 */
static inline pmd_t pmd_mkhuge(pmd_t pmd)
{
        /*
         * It's possible that the pmd is present-invalid on entry
         * and in that case it needs to remain present-invalid on
         * exit. So PTE_VALID is excluded from both the bits that are
         * cleared and the bits that are set.
         */
        const pmdval_t type_bits = PMD_TYPE_MASK & ~PTE_VALID;
        const pmdval_t sect_bits = PMD_TYPE_SECT & ~PTE_VALID;
        pmdval_t raw = pmd_val(pmd);

        raw &= ~type_bits;
        raw |= sect_bits;

        return __pmd(raw);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* PMD-level devmap test, delegating to pte_devmap(). */
#define pmd_devmap(pmd)                pte_devmap(pmd_pte(pmd))
#endif
/* Set PTE_DEVMAP in 'pmd' (unlike pte_mkdevmap(), PTE_SPECIAL is not set). */
static inline pmd_t pmd_mkdevmap(pmd_t pmd)
{
        pmd = pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));

        return pmd;
}

#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
/* Evaluates to 1 if PTE_SPECIAL is set in 'pmd', 0 otherwise. */
#define pmd_special(pmd)        (!!(pmd_val(pmd) & PTE_SPECIAL))
/* Set PTE_SPECIAL in 'pmd'. */
static inline pmd_t pmd_mkspecial(pmd_t pmd)
{
        pmd = set_pmd_bit(pmd, __pgprot(PTE_SPECIAL));

        return pmd;
}
#endif

/*
 * PMD <-> physical address / pfn conversions, built on the pte helpers.
 * pmd_pfn() masks the physical address with PMD_MASK before shifting, so
 * the result is the pfn of the start of the PMD-sized region.
 */
#define __pmd_to_phys(pmd)        __pte_to_phys(pmd_pte(pmd))
#define __phys_to_pmd_val(phys)        __phys_to_pte_val(phys)
#define pmd_pfn(pmd)                ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT)
#define pfn_pmd(pfn,prot)        __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
#define mk_pmd(page,prot)        pfn_pmd(page_to_pfn(page),prot)

/* PUD-level wrappers around the matching pte helpers. */
#define pud_young(pud)                pte_young(pud_pte(pud))
#define pud_mkyoung(pud)        pte_pud(pte_mkyoung(pud_pte(pud)))
#define pud_write(pud)                pte_write(pud_pte(pud))

/*
 * Give 'pud' the section (block) type encoding while leaving PTE_VALID
 * exactly as it was on entry.
 */
static inline pud_t pud_mkhuge(pud_t pud)
{
        /*
         * It's possible that the pud is present-invalid on entry
         * and in that case it needs to remain present-invalid on
         * exit. So PTE_VALID is excluded from both the bits that are
         * cleared and the bits that are set.
         */
        const pudval_t type_bits = PUD_TYPE_MASK & ~PTE_VALID;
        const pudval_t sect_bits = PUD_TYPE_SECT & ~PTE_VALID;
        pudval_t raw = pud_val(pud);

        raw &= ~type_bits;
        raw |= sect_bits;

        return __pud(raw);
}

/*
 * PUD <-> physical address / pfn conversions, built on the pte helpers.
 * pud_pfn() masks the physical address with PUD_MASK before shifting.
 */
#define __pud_to_phys(pud)        __pte_to_phys(pud_pte(pud))
#define __phys_to_pud_val(phys)        __phys_to_pte_val(phys)
#define pud_pfn(pud)                ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
#define pfn_pud(pfn,prot)        __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))

/*
 * Return 'pmd' with the bits that pfn_pmd(pmd_pfn(pmd), 0) reproduces
 * cancelled out: a pfn-only pmd is XORed with the original.
 */
#define pmd_pgprot pmd_pgprot
static inline pgprot_t pmd_pgprot(pmd_t pmd)
{
        unsigned long pfn = pmd_pfn(pmd);
        pmd_t pfn_only = pfn_pmd(pfn, __pgprot(0));

        return __pgprot(pmd_val(pfn_only) ^ pmd_val(pmd));
}

/*
 * Return 'pud' with the bits that pfn_pud(pud_pfn(pud), 0) reproduces
 * cancelled out: a pfn-only pud is XORed with the original.
 */
#define pud_pgprot pud_pgprot
static inline pgprot_t pud_pgprot(pud_t pud)
{
        unsigned long pfn = pud_pfn(pud);
        pud_t pfn_only = pfn_pud(pfn, __pgprot(0));

        return __pgprot(pud_val(pfn_only) ^ pud_val(pud));
}

/*
 * Install a single entry at 'ptep'. 'nr' is the number of pages the entry
 * covers and is only passed on to __sync_cache_and_tags(); exactly one
 * entry is written. 'addr' is unused.
 */
static inline void __set_pte_at(struct mm_struct *mm,
                                unsigned long __always_unused addr,
                                pte_t *ptep, pte_t pte, unsigned int nr)
{
        /* Cache/tag maintenance first, then the debug check, then the store. */
        __sync_cache_and_tags(pte, nr);
        __check_safe_pte_update(mm, ptep, pte);
        __set_pte(ptep, pte);
}

/*
 * Install a PMD-level entry: run the page-table check hook, then store the
 * entry through __set_pte_at(), passing the number of base pages that a PMD
 * spans (PMD_SIZE >> PAGE_SHIFT).
 *
 * __set_pte_at() returns void, so it is called as a plain statement rather
 * than as the operand of 'return'.
 */
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
                              pmd_t *pmdp, pmd_t pmd)
{
        page_table_check_pmd_set(mm, pmdp, pmd);
        __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd),
                     PMD_SIZE >> PAGE_SHIFT);
}

/*
 * Install a PUD-level entry: run the page-table check hook, then store the
 * entry through __set_pte_at(), passing the number of base pages that a PUD
 * spans (PUD_SIZE >> PAGE_SHIFT).
 *
 * __set_pte_at() returns void, so it is called as a plain statement rather
 * than as the operand of 'return'.
 */
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
                              pud_t *pudp, pud_t pud)
{
        page_table_check_pud_set(mm, pudp, pud);
        __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud),
                     PUD_SIZE >> PAGE_SHIFT);
}

/* P4D/PGD <-> physical address conversions, built on the pte helpers. */
#define __p4d_to_phys(p4d)        __pte_to_phys(p4d_pte(p4d))
#define __phys_to_p4d_val(phys)        __phys_to_pte_val(phys)

#define __pgd_to_phys(pgd)        __pte_to_phys(pgd_pte(pgd))
#define __phys_to_pgd_val(phys)        __phys_to_pte_val(phys)

/* Clear the 'mask' bits of 'prot', then OR in 'bits'. */
#define __pgprot_modify(prot,mask,bits) \
        __pgprot((pgprot_val(prot) & ~(mask)) | (bits))

/* Clear PTE_MAYBE_GP and set PTE_PXN. */
#define pgprot_nx(prot) \
        __pgprot_modify(prot, PTE_MAYBE_GP, PTE_PXN)

/* Set or clear the PROT_NS_SHARED bits. */
#define pgprot_decrypted(prot) \
        __pgprot_modify(prot, PROT_NS_SHARED, PROT_NS_SHARED)
#define pgprot_encrypted(prot) \
        __pgprot_modify(prot, PROT_NS_SHARED, 0)

/*
 * Memory-type selectors. Each macro below replaces the PTE_ATTRINDX_MASK
 * field of 'prot' with the index of the named memory type; all except
 * pgprot_tagged() also set PTE_PXN | PTE_UXN.
 *
 * pgprot_noncached(): mark the prot value as uncacheable and unbufferable.
 */
#define pgprot_noncached(prot) \
        __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRnE) | PTE_PXN | PTE_UXN)
#define pgprot_writecombine(prot) \
        __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)
#define pgprot_device(prot) \
        __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_DEVICE_nGnRE) | PTE_PXN | PTE_UXN)
#define pgprot_tagged(prot) \
        __pgprot_modify(prot, PTE_ATTRINDX_MASK, PTE_ATTRINDX(MT_NORMAL_TAGGED))
#define pgprot_mhp        pgprot_tagged
/*
 * DMA allocations for non-coherent devices use what the Arm architecture calls
 * "Normal non-cacheable" memory, which permits speculation, unaligned accesses
 * and merging of writes.  This is different from "Device-nGnR[nE]" memory which
 * is intended for MMIO and thus forbids speculation, preserves access size,
 * requires strict alignment and can also force write responses to come from the
 * endpoint.
 */
#define pgprot_dmacoherent(prot) \
        __pgprot_modify(prot, PTE_ATTRINDX_MASK, \
                        PTE_ATTRINDX(MT_NORMAL_NC) | PTE_PXN | PTE_UXN)

/*
 * Announce and declare phys_mem_access_prot(). Only the prototype lives
 * here; 'struct file' is forward-declared because only a pointer is needed.
 */
#define __HAVE_PHYS_MEM_ACCESS_PROT
struct file;
extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
                                     unsigned long size, pgprot_t vma_prot);

/* A pmd whose raw value is zero is empty. */
#define pmd_none(pmd)                (!pmd_val(pmd))

/*
 * Type tests on the PMD_TYPE_MASK field. pmd_leaf() is any present entry
 * that is not a table; pmd_bad() is any entry that is not a table.
 */
#define pmd_table(pmd)                ((pmd_val(pmd) & PMD_TYPE_MASK) == \
                                 PMD_TYPE_TABLE)
#define pmd_sect(pmd)                ((pmd_val(pmd) & PMD_TYPE_MASK) == \
                                 PMD_TYPE_SECT)
#define pmd_leaf(pmd)                (pmd_present(pmd) && !pmd_table(pmd))
#define pmd_bad(pmd)                (!pmd_table(pmd))

/* Size mapped by a leaf entry, accounting for the contiguous bit. */
#define pmd_leaf_size(pmd)        (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)
#define pte_leaf_size(pte)        (pte_cont(pte) ? CONT_PTE_SIZE : PAGE_SIZE)

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Return non-zero if 'pmd' is a non-empty, present entry that is not a
 * table.
 */
static inline int pmd_trans_huge(pmd_t pmd)
{
        if (!pmd_val(pmd) || !pmd_present(pmd))
                return 0;

        /*
         * If pmd is present-invalid, pmd_table() won't detect it
         * as a table, so force the valid bit for the comparison.
         */
        return !pmd_table(__pmd(pmd_val(pmd) | PTE_VALID));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * PUD type tests. With 64K pages or fewer than 3 page-table levels they are
 * constant: a pud is never a section and always treated as a table.
 * Otherwise they test the PUD_TYPE_MASK field.
 */
#if defined(CONFIG_ARM64_64K_PAGES) || CONFIG_PGTABLE_LEVELS < 3
static inline bool pud_sect(pud_t pud) { return false; }
static inline bool pud_table(pud_t pud) { return true; }
#else
#define pud_sect(pud)                ((pud_val(pud) & PUD_TYPE_MASK) == \
                                 PUD_TYPE_SECT)
#define pud_table(pud)                ((pud_val(pud) & PUD_TYPE_MASK) == \
                                 PUD_TYPE_TABLE)
#endif

/* Top-level page-table arrays; declarations only. */
extern pgd_t init_pg_dir[];
extern pgd_t init_pg_end[];
extern pgd_t swapper_pg_dir[];
extern pgd_t idmap_pg_dir[];
extern pgd_t tramp_pg_dir[];
extern pgd_t reserved_pg_dir[];

/* Used by the set_p?d() helpers for entries that live in swapper_pg_dir. */
extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd);

/* Return true if 'addr' lies in the same page as swapper_pg_dir. */
static inline bool in_swapper_pgdir(void *addr)
{
        unsigned long addr_page = (unsigned long)addr & PAGE_MASK;
        unsigned long swapper_page = (unsigned long)swapper_pg_dir & PAGE_MASK;

        return addr_page == swapper_page;
}

/*
 * Store 'pmd' at 'pmdp'. When the pmd level is folded and the slot lives in
 * swapper_pg_dir the store is delegated to set_swapper_pgd(). Otherwise the
 * entry is written with WRITE_ONCE() and, if it is valid, followed by
 * dsb(ishst) + isb().
 */
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
#ifdef __PAGETABLE_PMD_FOLDED
        if (in_swapper_pgdir(pmdp)) {
                set_swapper_pgd((pgd_t *)pmdp, __pgd(pmd_val(pmd)));
                return;
        }
#endif /* __PAGETABLE_PMD_FOLDED */

        WRITE_ONCE(*pmdp, pmd);

        /* No barriers are issued for an entry that is not valid. */
        if (!pmd_valid(pmd))
                return;

        dsb(ishst);
        isb();
}

/* Write an all-zero pmd to 'pmdp' via set_pmd(). */
static inline void pmd_clear(pmd_t *pmdp)
{
        pmd_t empty = __pmd(0);

        set_pmd(pmdp, empty);
}

/* Physical address encoded in 'pmd'. */
static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
{
        phys_addr_t paddr = __pmd_to_phys(pmd);

        return paddr;
}

/* __va() translation of the physical address encoded in 'pmd'. */
static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
        unsigned long vaddr = (unsigned long)__va(pmd_page_paddr(pmd));

        return vaddr;
}

/*
 * Find an entry in the third-level page table: physical address of the pte
 * slot for 'addr' in the table that the pmd at 'dir' points to.
 */
#define pte_offset_phys(dir,addr)        (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t))

/* Map and unmap a pte table through the FIX_PTE fixmap slot. */
#define pte_set_fixmap(addr)                ((pte_t *)set_fixmap_offset(FIX_PTE, addr))
#define pte_set_fixmap_offset(pmd, addr)        pte_set_fixmap(pte_offset_phys(pmd, addr))
#define pte_clear_fixmap()                clear_fixmap(FIX_PTE)

#define pmd_page(pmd)                        phys_to_page(__pmd_to_phys(pmd))

/* use ONLY for statically allocated translation tables */
#define pte_offset_kimg(dir,addr)        ((pte_t *)__phys_to_kimg(pte_offset_phys((dir), (addr))))

/*
 * Conversion functions: convert a page and protection to a page entry,
 * and a page entry and page directory to the page they refer to.
 */
#define mk_pte(page,prot)        pfn_pte(page_to_pfn(page),prot)

#if CONFIG_PGTABLE_LEVELS > 2

/* Log a bad pmd value together with the reporting file and line. */
#define pmd_ERROR(e)        \
        pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e))

/*
 * PUD predicates. pud_bad() is any entry whose type field is not a table.
 * pud_leaf() is constant false when the pmd level is folded.
 */
#define pud_none(pud)                (!pud_val(pud))
#define pud_bad(pud)                ((pud_val(pud) & PUD_TYPE_MASK) != \
                                 PUD_TYPE_TABLE)
#define pud_present(pud)        pte_present(pud_pte(pud))
#ifndef __PAGETABLE_PMD_FOLDED
#define pud_leaf(pud)                (pud_present(pud) && !pud_table(pud))
#else
#define pud_leaf(pud)                false
#endif
#define pud_valid(pud)                pte_valid(pud_pte(pud))
#define pud_user(pud)                pte_user(pud_pte(pud))
#define pud_user_exec(pud)        pte_user_exec(pud_pte(pud))

/* Forward declaration; set_pud() below calls it before it is defined. */
static inline bool pgtable_l4_enabled(void);

/*
 * Store 'pud' at 'pudp'. When the fourth level is not enabled and the slot
 * lives in swapper_pg_dir the store is delegated to set_swapper_pgd().
 * Otherwise the entry is written with WRITE_ONCE() and, if it is valid,
 * followed by dsb(ishst) + isb().
 */
static inline void set_pud(pud_t *pudp, pud_t pud)
{
        if (!pgtable_l4_enabled() && in_swapper_pgdir(pudp)) {
                set_swapper_pgd((pgd_t *)pudp, __pgd(pud_val(pud)));
                return;
        }

        WRITE_ONCE(*pudp, pud);

        /* No barriers are issued for an entry that is not valid. */
        if (!pud_valid(pud))
                return;

        dsb(ishst);
        isb();
}

/* Write an all-zero pud to 'pudp' via set_pud(). */
static inline void pud_clear(pud_t *pudp)
{
        pud_t empty = __pud(0);

        set_pud(pudp, empty);
}

/* Physical address encoded in 'pud'. */
static inline phys_addr_t pud_page_paddr(pud_t pud)
{
        phys_addr_t paddr = __pud_to_phys(pud);

        return paddr;
}

/* __va() pointer to the pmd table that 'pud' refers to. */
static inline pmd_t *pud_pgtable(pud_t pud)
{
        pmd_t *table = (pmd_t *)__va(pud_page_paddr(pud));

        return table;
}

/*
 * Find an entry in the second-level page table: physical address of the pmd
 * slot for 'addr' in the table that the pud at 'dir' points to.
 */
#define pmd_offset_phys(dir, addr)        (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t))

/* Map and unmap a pmd table through the FIX_PMD fixmap slot. */
#define pmd_set_fixmap(addr)                ((pmd_t *)set_fixmap_offset(FIX_PMD, addr))
#define pmd_set_fixmap_offset(pud, addr)        pmd_set_fixmap(pmd_offset_phys(pud, addr))
#define pmd_clear_fixmap()                clear_fixmap(FIX_PMD)

#define pud_page(pud)                        phys_to_page(__pud_to_phys(pud))

/* use ONLY for statically allocated translation tables */
#define pmd_offset_kimg(dir,addr)        ((pmd_t *)__phys_to_kimg(pmd_offset_phys((dir), (addr))))

#else

/*
 * Fallbacks for CONFIG_PGTABLE_LEVELS <= 2. pud_page_paddr() must never be
 * reached in this configuration, so any use breaks the build.
 */
#define pud_valid(pud)                false
#define pud_page_paddr(pud)        ({ BUILD_BUG(); 0; })
#define pud_user_exec(pud)        pud_user(pud) /* Always 0 with folding */

/* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */
#define pmd_set_fixmap(addr)                NULL
#define pmd_set_fixmap_offset(pudp, addr)        ((pmd_t *)pudp)
#define pmd_clear_fixmap()

#define pmd_offset_kimg(dir,addr)        ((pmd_t *)dir)

#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

/*
 * Report whether the fourth page-table level (pud) is in use at run time.
 *
 * It always is when more than four levels are configured or when
 * CONFIG_ARM64_LPA2 is off. Otherwise the answer comes from the
 * ARM64_HAS_VA52 capability or, while ARM64_ALWAYS_BOOT is not yet
 * reported, from comparing vabits_actual with VA_BITS.
 * NOTE(review): presumably the ARM64_ALWAYS_BOOT test covers code that runs
 * before alternatives are applied -- confirm.
 */
static __always_inline bool pgtable_l4_enabled(void)
{
        if (CONFIG_PGTABLE_LEVELS > 4 || !IS_ENABLED(CONFIG_ARM64_LPA2))
                return true;

        if (alternative_has_cap_likely(ARM64_ALWAYS_BOOT))
                return alternative_has_cap_unlikely(ARM64_HAS_VA52);

        return vabits_actual == VA_BITS;
}

/* The pud level is folded exactly when the fourth level is not enabled. */
static inline bool mm_pud_folded(const struct mm_struct *mm)
{
        bool l4_enabled = pgtable_l4_enabled();

        return !l4_enabled;
}
#define mm_pud_folded  mm_pud_folded

/* Log a bad pud value together with the reporting file and line. */
#define pud_ERROR(e)        \
        pr_err("%s:%d: bad pud %016llx.\n", __FILE__, __LINE__, pud_val(e))

/*
 * P4D predicates. When the fourth level is not enabled a p4d is never
 * "none" and never "bad", and is therefore always present.
 */
#define p4d_none(p4d)                (pgtable_l4_enabled() && !p4d_val(p4d))
#define p4d_bad(p4d)                (pgtable_l4_enabled() && \
                                ((p4d_val(p4d) & P4D_TYPE_MASK) != \
                                 P4D_TYPE_TABLE))
#define p4d_present(p4d)        (!p4d_none(p4d))

/*
 * Store 'p4d' at 'p4dp'. Slots inside swapper_pg_dir are written through
 * set_swapper_pgd(); all others are written with WRITE_ONCE() followed
 * unconditionally by dsb(ishst) + isb().
 */
static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
{
        if (in_swapper_pgdir(p4dp)) {
                set_swapper_pgd((pgd_t *)p4dp, __pgd(p4d_val(p4d)));
        } else {
                WRITE_ONCE(*p4dp, p4d);
                dsb(ishst);
                isb();
        }
}

/* Zero the p4d at 'p4dp'; does nothing when the fourth level is disabled. */
static inline void p4d_clear(p4d_t *p4dp)
{
        if (!pgtable_l4_enabled())
                return;

        set_p4d(p4dp, __p4d(0));
}

/* Physical address encoded in 'p4d'. */
static inline phys_addr_t p4d_page_paddr(p4d_t p4d)
{
        phys_addr_t paddr = __p4d_to_phys(p4d);

        return paddr;
}

/* Index of the pud slot for 'addr' within a pud table. */
#define pud_index(addr)                (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))

/*
 * Folded-pud lookup: treat the page containing 'p4dp' as the pud table and
 * return the slot in it selected by pud_index(addr).
 */
static inline pud_t *p4d_to_folded_pud(p4d_t *p4dp, unsigned long addr)
{
        pud_t *table;

        /* Ensure that 'p4dp' indexes a page table according to 'addr' */
        VM_BUG_ON(((addr >> P4D_SHIFT) ^ ((u64)p4dp >> 3)) % PTRS_PER_P4D);

        table = (pud_t *)PTR_ALIGN_DOWN(p4dp, PAGE_SIZE);

        return table + pud_index(addr);
}

/* __va() pointer to the pud table that 'p4d' refers to. */
static inline pud_t *p4d_pgtable(p4d_t p4d)
{
        pud_t *table = (pud_t *)__va(p4d_page_paddr(p4d));

        return table;
}

/*
 * Physical address of the pud slot for 'addr' in the table that '*p4dp'
 * points to. Only meaningful with the fourth level enabled; BUG()s
 * otherwise.
 */
static inline phys_addr_t pud_offset_phys(p4d_t *p4dp, unsigned long addr)
{
        phys_addr_t table;

        BUG_ON(!pgtable_l4_enabled());

        table = p4d_page_paddr(READ_ONCE(*p4dp));

        return table + pud_index(addr) * sizeof(pud_t);
}

/*
 * Locate the pud slot for 'addr' given an already-read p4d value 'p4d'.
 * With the fourth level enabled the slot is found through the table that
 * 'p4d' points to; otherwise the pud is folded and the slot is derived from
 * 'p4dp' itself.
 */
static inline
pud_t *pud_offset_lockless(p4d_t *p4dp, p4d_t p4d, unsigned long addr)
{
        if (pgtable_l4_enabled())
                return (pud_t *)__va(p4d_page_paddr(p4d)) + pud_index(addr);

        return p4d_to_folded_pud(p4dp, addr);
}
#define pud_offset_lockless pud_offset_lockless

/* Read '*p4dp' once and locate the pud slot for 'addr' from that value. */
static inline pud_t *pud_offset(p4d_t *p4dp, unsigned long addr)
{
        p4d_t p4d = READ_ONCE(*p4dp);

        return pud_offset_lockless(p4dp, p4d, addr);
}
#define pud_offset        pud_offset

/*
 * Map 'addr' through the FIX_PUD fixmap slot and return the pointer, or
 * NULL when the fourth level is not enabled.
 */
static inline pud_t *pud_set_fixmap(unsigned long addr)
{
        if (pgtable_l4_enabled())
                return (pud_t *)set_fixmap_offset(FIX_PUD, addr);

        return NULL;
}

/*
 * Fixmap the pud slot for 'addr' below '*p4dp'. With a folded pud no
 * mapping is created and the slot is derived from 'p4dp' directly.
 */
static inline pud_t *pud_set_fixmap_offset(p4d_t *p4dp, unsigned long addr)
{
        if (pgtable_l4_enabled())
                return pud_set_fixmap(pud_offset_phys(p4dp, addr));

        return p4d_to_folded_pud(p4dp, addr);
}

/* Clear the FIX_PUD fixmap slot; a no-op when the fourth level is disabled. */
static inline void pud_clear_fixmap(void)
{
        if (!pgtable_l4_enabled())
                return;

        clear_fixmap(FIX_PUD);
}

/*
 * use ONLY for statically allocated translation tables
 *
 * Returns the pud slot for 'addr' through __phys_to_kimg(), or the folded
 * slot derived from 'p4dp' when the fourth level is not enabled.
 */
static inline pud_t *pud_offset_kimg(p4d_t *p4dp, u64 addr)
{
        if (pgtable_l4_enabled())
                return (pud_t *)__phys_to_kimg(pud_offset_phys(p4dp, addr));

        return p4d_to_folded_pud(p4dp, addr);
}

/* struct page for the physical address encoded in 'p4d'. */
#define p4d_page(p4d)                pfn_to_page(__phys_to_pfn(__p4d_to_phys(p4d)))

#else

/*
 * Fallbacks for CONFIG_PGTABLE_LEVELS <= 3: the fourth level never exists,
 * and p4d_page_paddr() must never be reached (any use breaks the build).
 */
static inline bool pgtable_l4_enabled(void) { return false; }

#define p4d_page_paddr(p4d)        ({ BUILD_BUG(); 0;})

/* Match pud_offset folding in <asm/generic/pgtable-nopud.h> */
#define pud_set_fixmap(addr)                NULL
#define pud_set_fixmap_offset(pgdp, addr)        ((pud_t *)pgdp)
#define pud_clear_fixmap()

#define pud_offset_kimg(dir,addr)        ((pud_t *)dir)

#endif  /* CONFIG_PGTABLE_LEVELS > 3 */

#if CONFIG_PGTABLE_LEVELS > 4

/*
 * Report whether the fifth page-table level (p4d) is in use at run time.
 *
 * The answer comes from the ARM64_HAS_VA52 capability or, while
 * ARM64_ALWAYS_BOOT is not yet reported, from comparing vabits_actual with
 * VA_BITS.
 * NOTE(review): presumably the ARM64_ALWAYS_BOOT test covers code that runs
 * before alternatives are applied -- confirm.
 */
static __always_inline bool pgtable_l5_enabled(void)
{
        if (alternative_has_cap_likely(ARM64_ALWAYS_BOOT))
                return alternative_has_cap_unlikely(ARM64_HAS_VA52);

        return vabits_actual == VA_BITS;
}

/* The p4d level is folded exactly when the fifth level is not enabled. */
static inline bool mm_p4d_folded(const struct mm_struct *mm)
{
        bool l5_enabled = pgtable_l5_enabled();

        return !l5_enabled;
}
#define mm_p4d_folded  mm_p4d_folded

/* Log a bad p4d value together with the reporting file and line. */
#define p4d_ERROR(e)        \
        pr_err("%s:%d: bad p4d %016llx.\n", __FILE__, __LINE__, p4d_val(e))

/*
 * PGD predicates. When the fifth level is not enabled a pgd is never "none"
 * and never "bad", and is therefore always present.
 */
#define pgd_none(pgd)                (pgtable_l5_enabled() && !pgd_val(pgd))
#define pgd_bad(pgd)                (pgtable_l5_enabled() && \
                                ((pgd_val(pgd) & PGD_TYPE_MASK) != \
                                 PGD_TYPE_TABLE))
#define pgd_present(pgd)        (!pgd_none(pgd))

/*
 * Store 'pgd' at 'pgdp'. Slots inside swapper_pg_dir are written through
 * set_swapper_pgd(); all others are written with WRITE_ONCE() followed
 * unconditionally by dsb(ishst) + isb().
 */
static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
{
        if (in_swapper_pgdir(pgdp)) {
                set_swapper_pgd(pgdp, __pgd(pgd_val(pgd)));
        } else {
                WRITE_ONCE(*pgdp, pgd);
                dsb(ishst);
                isb();
        }
}

/* Zero the pgd at 'pgdp'; does nothing when the fifth level is disabled. */
static inline void pgd_clear(pgd_t *pgdp)
{
        if (!pgtable_l5_enabled())
                return;

        set_pgd(pgdp, __pgd(0));
}

/* Physical address encoded in 'pgd'. */
static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
{
        phys_addr_t paddr = __pgd_to_phys(pgd);

        return paddr;
}

/* Index of the p4d slot for 'addr' within a p4d table. */
#define p4d_index(addr)                (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))

/*
 * Folded-p4d lookup: treat the page containing 'pgdp' as the p4d table and
 * return the slot in it selected by p4d_index(addr).
 */
static inline p4d_t *pgd_to_folded_p4d(pgd_t *pgdp, unsigned long addr)
{
        p4d_t *table;

        /* Ensure that 'pgdp' indexes a page table according to 'addr' */
        VM_BUG_ON(((addr >> PGDIR_SHIFT) ^ ((u64)pgdp >> 3)) % PTRS_PER_PGD);

        table = (p4d_t *)PTR_ALIGN_DOWN(pgdp, PAGE_SIZE);

        return table + p4d_index(addr);
}

/*
 * Physical address of the p4d slot for 'addr' in the table that '*pgdp'
 * points to. Only meaningful with the fifth level enabled; BUG()s otherwise.
 */
static inline phys_addr_t p4d_offset_phys(pgd_t *pgdp, unsigned long addr)
{
        phys_addr_t table;

        BUG_ON(!pgtable_l5_enabled());

        table = pgd_page_paddr(READ_ONCE(*pgdp));

        return table + p4d_index(addr) * sizeof(p4d_t);
}

/*
 * Locate the p4d slot for 'addr' given an already-read pgd value 'pgd'.
 * With the fifth level enabled the slot is found through the table that
 * 'pgd' points to; otherwise the p4d is folded and the slot is derived from
 * 'pgdp' itself.
 */
static inline
p4d_t *p4d_offset_lockless(pgd_t *pgdp, pgd_t pgd, unsigned long addr)
{
        if (pgtable_l5_enabled())
                return (p4d_t *)__va(pgd_page_paddr(pgd)) + p4d_index(addr);

        return pgd_to_folded_p4d(pgdp, addr);
}
#define p4d_offset_lockless p4d_offset_lockless

/* Read '*pgdp' once and locate the p4d slot for 'addr' from that value. */
static inline p4d_t *p4d_offset(pgd_t *pgdp, unsigned long addr)
{
        pgd_t pgd = READ_ONCE(*pgdp);

        return p4d_offset_lockless(pgdp, pgd, addr);
}

/*
 * Map 'addr' through the FIX_P4D fixmap slot and return the pointer, or
 * NULL when the fifth level is not enabled.
 */
static inline p4d_t *p4d_set_fixmap(unsigned long addr)
{
        if (pgtable_l5_enabled())
                return (p4d_t *)set_fixmap_offset(FIX_P4D, addr);

        return NULL;
}

/*
 * Fixmap the p4d slot for 'addr' below '*pgdp'. With a folded p4d no
 * mapping is created and the slot is derived from 'pgdp' directly.
 */
static inline p4d_t *p4d_set_fixmap_offset(pgd_t *pgdp, unsigned long addr)
{
        if (pgtable_l5_enabled())
                return p4d_set_fixmap(p4d_offset_phys(pgdp, addr));

        return pgd_to_folded_p4d(pgdp, addr);
}

/* Clear the FIX_P4D fixmap slot; a no-op when the fifth level is disabled. */
static inline void p4d_clear_fixmap(void)
{
        if (!pgtable_l5_enabled())
                return;

        clear_fixmap(FIX_P4D);
}

/*
 * use ONLY for statically allocated translation tables
 *
 * Returns the p4d slot for 'addr' through __phys_to_kimg(), or the folded
 * slot derived from 'pgdp' when the fifth level is not enabled.
 */
static inline p4d_t *p4d_offset_kimg(pgd_t *pgdp, u64 addr)
{
        if (pgtable_l5_enabled())
                return (p4d_t *)__phys_to_kimg(p4d_offset_phys(pgdp, addr));

        return pgd_to_folded_p4d(pgdp, addr);
}

/* struct page for the physical address encoded in 'pgd'. */
#define pgd_page(pgd)                pfn_to_page(__phys_to_pfn(__pgd_to_phys(pgd)))

#else

/* Fallbacks for CONFIG_PGTABLE_LEVELS <= 4: the fifth level never exists. */
static inline bool pgtable_l5_enabled(void) { return false; }

/* Index of the p4d slot for 'addr' within a p4d table. */
#define p4d_index(addr)                (((addr) >> P4D_SHIFT) & (PTRS_PER_P4D - 1))

/* Match p4d_offset folding in <asm/generic/pgtable-nop4d.h> */
#define p4d_set_fixmap(addr)                NULL
#define p4d_set_fixmap_offset(p4dp, addr)        ((p4d_t *)p4dp)
#define p4d_clear_fixmap()

#define p4d_offset_kimg(dir,addr)        ((p4d_t *)dir)

/*
 * p4d_offset_lockless() for configurations where the p4d level is folded at
 * build time. The 'pgd' value argument is deliberately ignored; the result
 * is always derived from the 'pgdp' pointer.
 */
static inline
p4d_t *p4d_offset_lockless_folded(pgd_t *pgdp, pgd_t pgd, unsigned long addr)
{
        p4d_t *p4dp;

        /*
         * With runtime folding of the pud, pud_offset_lockless() passes
         * the 'pgd_t *' we return here to p4d_to_folded_pud(), which
         * will offset the pointer assuming that it points into
         * a page-table page. However, the fast GUP path passes us a
         * pgd_t allocated on the stack and so we must use the original
         * pointer in 'pgdp' to construct the p4d pointer instead of
         * using the generic p4d_offset_lockless() implementation.
         *
         * Note: reusing the original pointer means that we may
         * dereference the same (live) page-table entry multiple times.
         * This is safe because it is still only loaded once in the
         * context of each level and the CPU guarantees same-address
         * read-after-read ordering.
         */
        p4dp = p4d_offset(pgdp, addr);

        return p4dp;
}
#define p4d_offset_lockless p4d_offset_lockless_folded

#endif  /* CONFIG_PGTABLE_LEVELS > 4 */

/* Log a bad pgd value together with the reporting file and line. */
#define pgd_ERROR(e)        \
        pr_err("%s:%d: bad pgd %016llx.\n", __FILE__, __LINE__, pgd_val(e))

/* Map and unmap a pgd table through the FIX_PGD fixmap slot. */
#define pgd_set_fixmap(addr)        ((pgd_t *)set_fixmap_offset(FIX_PGD, addr))
#define pgd_clear_fixmap()        clear_fixmap(FIX_PGD)

/*
 * Apply the protection bits of 'newprot' to 'pte'. Only the bits listed in
 * 'mask' are taken from 'newprot'; every other bit of 'pte' is kept. Dirty
 * state is carried across the change.
 */
static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
{
        /*
         * Normal and Normal-Tagged are two different memory types and indices
         * in MAIR_EL1. The mask below has to include PTE_ATTRINDX_MASK.
         */
        const pteval_t mask = PTE_USER | PTE_PXN | PTE_UXN | PTE_RDONLY |
                              PTE_PRESENT_INVALID | PTE_VALID | PTE_WRITE |
                              PTE_GP | PTE_ATTRINDX_MASK | PTE_PO_IDX_MASK;

        /*
         * Preserve the hardware dirty information: it is encoded in bits
         * that are about to be replaced, so record it in PTE_DIRTY first.
         */
        if (pte_hw_dirty(pte))
                pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));

        pte_val(pte) &= ~mask;
        pte_val(pte) |= pgprot_val(newprot) & mask;

        /*
         * If we end up clearing hw dirtiness for a sw-dirty PTE, set hardware
         * dirtiness again.
         */
        return pte_sw_dirty(pte) ? pte_mkdirty(pte) : pte;
}

/* PMD-level pte_modify(): convert to a pte, modify, convert back. */
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
{
        pmd = pte_pmd(pte_modify(pmd_pte(pmd), newprot));

        return pmd;
}

/* Declaration only; pmdp_set_access_flags() also funnels into this. */
extern int __ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
/*
 * PMD-level access-flag update: treat the pmd slot and the new entry as
 * ptes and defer to __ptep_set_access_flags(), returning its result.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        pte_t *ptep = (pte_t *)pmdp;

        return __ptep_set_access_flags(vma, address, ptep, pmd_pte(entry),
                                       dirty);
}

/* Always returns 0; @pud is not inspected. */
static inline int pud_devmap(pud_t pud)
{
        return 0;
}

/* Always returns 0; @pgd is not inspected. */
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif

#ifdef CONFIG_PAGE_TABLE_CHECK
/*
 * A pte is user accessible when it is valid and carries either the user
 * permission or the user-exec permission.
 */
static inline bool pte_user_accessible_page(pte_t pte)
{
        if (!pte_valid(pte))
                return false;

        return pte_user(pte) || pte_user_exec(pte);
}

/*
 * A pmd is user accessible when it is valid, is not a table entry, and
 * carries either the user permission or the user-exec permission.
 */
static inline bool pmd_user_accessible_page(pmd_t pmd)
{
        if (!pmd_valid(pmd) || pmd_table(pmd))
                return false;

        return pmd_user(pmd) || pmd_user_exec(pmd);
}

/*
 * A pud is user accessible when it is valid, is not a table entry, and
 * carries either the user permission or the user-exec permission.
 */
static inline bool pud_user_accessible_page(pud_t pud)
{
        if (!pud_valid(pud) || pud_table(pud))
                return false;

        return pud_user(pud) || pud_user_exec(pud);
}
#endif

/*
 * Atomic pte/pmd modifications.
 */
/*
 * Clear the young state of the entry at @ptep and report whether it was set.
 *
 * The update is a relaxed cmpxchg loop: if the entry changed between the read
 * and the exchange, the value returned by cmpxchg is used for the next
 * attempt. @vma and @address are not used by this helper.
 *
 * Returns the pte_young() state of the entry as it was just before the
 * successful exchange.
 */
static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma,
                                              unsigned long address,
                                              pte_t *ptep)
{
        pte_t old_pte, pte;

        pte = __ptep_get(ptep);
        do {
                old_pte = pte;
                pte = pte_mkold(pte);
                /* cmpxchg returns what was in memory; a mismatch means retry */
                pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
                                               pte_val(old_pte), pte_val(pte));
        } while (pte_val(pte) != pte_val(old_pte));

        return pte_young(pte);
}

/*
 * Clear the young state of the entry at @ptep and, if it had been young,
 * queue a non-synchronous TLB invalidation for @address.
 *
 * Returns the result of __ptep_test_and_clear_young().
 */
static inline int __ptep_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pte_t *ptep)
{
        int was_young = __ptep_test_and_clear_young(vma, address, ptep);

        if (!was_young)
                return was_young;

        /*
         * The trailing DSB is deliberately left out. The worst outcome is
         * that a CPU keeps using the young entry from its TLB and the
         * associated page is reclaimed by mistake. That window is bounded by
         * the next context-switch, which provides a DSB to complete the TLB
         * invalidation.
         */
        flush_tlb_page_nosync(vma, address);

        return was_young;
}

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
/*
 * PMD variant of the young test-and-clear: the pmd slot is reinterpreted as a
 * pte slot and passed to __ptep_test_and_clear_young(), whose result is
 * returned.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        /* Operation applies to PMD table entry only if FEAT_HAFT is enabled */
        VM_WARN_ON(pmd_table(READ_ONCE(*pmdp)) && !system_supports_haft());
        return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp);
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */

/*
 * Replace the entry at @ptep with 0 using a relaxed xchg and return the value
 * that was there. The old value is also reported to the page-table checker.
 * @address is not used by this helper.
 */
static inline pte_t __ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address, pte_t *ptep)
{
        pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));

        page_table_check_pte_clear(mm, pte);

        return pte;
}

/*
 * Clear @nr consecutive entries starting at @ptep / @addr, discarding the old
 * values. The first entry is always cleared before @nr is examined. @full is
 * not used by this helper.
 */
static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, unsigned int nr, int full)
{
        do {
                __ptep_get_and_clear(mm, addr, ptep);
                ptep++;
                addr += PAGE_SIZE;
        } while (--nr != 0);
}

/*
 * Clear @nr consecutive entries starting at @ptep / @addr.
 *
 * Returns the old value of the first entry, marked dirty and/or young if any
 * of the following entries in the batch was dirty or young. @full is not used
 * by this helper.
 */
static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep,
                                unsigned int nr, int full)
{
        pte_t first = __ptep_get_and_clear(mm, addr, ptep);

        for (nr--; nr != 0; nr--) {
                pte_t cur;

                ptep++;
                addr += PAGE_SIZE;
                cur = __ptep_get_and_clear(mm, addr, ptep);

                /* fold this entry's dirty/young state into the result */
                if (pte_dirty(cur))
                        first = pte_mkdirty(first);
                if (pte_young(cur))
                        first = pte_mkyoung(first);
        }

        return first;
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Replace the pmd at @pmdp with 0 using a relaxed xchg and return the value
 * that was there. The old value is also reported to the page-table checker.
 * @address is not used by this helper.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address, pmd_t *pmdp)
{
        pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0));

        page_table_check_pmd_clear(mm, pmd);

        return pmd;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Write-protect the entry at @ptep, starting from the caller-supplied
 * snapshot @pte of that entry.
 *
 * The update is a relaxed cmpxchg loop: when the entry in memory differs from
 * the expected value, the value returned by cmpxchg becomes the new starting
 * point and the attempt is repeated. @mm and @address are not used here.
 */
static inline void ___ptep_set_wrprotect(struct mm_struct *mm,
                                        unsigned long address, pte_t *ptep,
                                        pte_t pte)
{
        pte_t old_pte;

        do {
                old_pte = pte;
                pte = pte_wrprotect(pte);
                /* cmpxchg returns what was in memory; a mismatch means retry */
                pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
                                               pte_val(old_pte), pte_val(pte));
        } while (pte_val(pte) != pte_val(old_pte));
}

/*
 * __ptep_set_wrprotect - mark read-only while transferring potential hardware
 * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
 *
 * Reads the current entry and hands it to ___ptep_set_wrprotect() as the
 * starting snapshot.
 */
static inline void __ptep_set_wrprotect(struct mm_struct *mm,
                                        unsigned long address, pte_t *ptep)
{
        pte_t snapshot = __ptep_get(ptep);

        ___ptep_set_wrprotect(mm, address, ptep, snapshot);
}

/*
 * Write-protect @nr consecutive entries starting at @ptep / @address, one
 * entry at a time. Does nothing when @nr is 0.
 */
static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address,
                                pte_t *ptep, unsigned int nr)
{
        while (nr != 0) {
                __ptep_set_wrprotect(mm, address, ptep);
                ptep++;
                address += PAGE_SIZE;
                nr--;
        }
}

/*
 * Clear the young and/or dirty state of the entry at @ptep, as selected by
 * @flags (CYDP_CLEAR_YOUNG, CYDP_CLEAR_DIRTY), starting from the
 * caller-supplied snapshot @pte.
 *
 * The update is a relaxed cmpxchg loop that restarts from the value found in
 * memory whenever the entry changed underneath it. @vma and @addr are not
 * used here.
 */
static inline void __clear_young_dirty_pte(struct vm_area_struct *vma,
                                           unsigned long addr, pte_t *ptep,
                                           pte_t pte, cydp_t flags)
{
        pte_t old_pte;

        do {
                old_pte = pte;

                if (flags & CYDP_CLEAR_YOUNG)
                        pte = pte_mkold(pte);
                if (flags & CYDP_CLEAR_DIRTY)
                        pte = pte_mkclean(pte);

                /* cmpxchg returns what was in memory; a mismatch means retry */
                pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
                                               pte_val(old_pte), pte_val(pte));
        } while (pte_val(pte) != pte_val(old_pte));
}

/*
 * Clear young and/or dirty state, as selected by @flags, on @nr consecutive
 * entries starting at @ptep / @addr.
 *
 * When both CYDP_CLEAR_YOUNG and CYDP_CLEAR_DIRTY are requested (and nothing
 * else), the entry is rewritten directly with __set_pte(). Any other @flags
 * value goes through the cmpxchg-based __clear_young_dirty_pte(). The first
 * entry is always processed before @nr is examined.
 */
static inline void __clear_young_dirty_ptes(struct vm_area_struct *vma,
                                            unsigned long addr, pte_t *ptep,
                                            unsigned int nr, cydp_t flags)
{
        do {
                pte_t cur = __ptep_get(ptep);

                if (flags != (CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY))
                        __clear_young_dirty_pte(vma, addr, ptep, cur, flags);
                else
                        __set_pte(ptep, pte_mkclean(pte_mkold(cur)));

                ptep++;
                addr += PAGE_SIZE;
        } while (--nr != 0);
}

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
/*
 * PMD variant of write-protect: the pmd slot is reinterpreted as a pte slot
 * and passed to __ptep_set_wrprotect().
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        pte_t *slot = (pte_t *)pmdp;

        __ptep_set_wrprotect(mm, address, slot);
}

#define pmdp_establish pmdp_establish
/*
 * Install @pmd at @pmdp with a relaxed xchg and return the value that was
 * there before. The new value is reported to the page-table checker first.
 * @vma is only used for its vm_mm; @address is not used.
 */
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        page_table_check_pmd_set(vma->vm_mm, pmdp, pmd);
        return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
}
#endif

/*
 * Encode and decode a swap entry:
 *        bits 0-1:        present (must be zero)
 *        bit  2:                remember PG_anon_exclusive
 *        bit  3:                remember uffd-wp state
 *        bits 6-10:        swap type
 *        bit  11:        PTE_PRESENT_INVALID (must be zero)
 *        bits 12-61:        swap offset
 */
/* Position and width of the type and offset fields described above. */
#define __SWP_TYPE_SHIFT        6
#define __SWP_TYPE_BITS                5
#define __SWP_TYPE_MASK                ((1 << __SWP_TYPE_BITS) - 1)
#define __SWP_OFFSET_SHIFT        12
#define __SWP_OFFSET_BITS        50
#define __SWP_OFFSET_MASK        ((1UL << __SWP_OFFSET_BITS) - 1)

/*
 * Field accessors and constructor. __swp_entry() shifts @type and @offset
 * into place without masking them.
 */
#define __swp_type(x)                (((x).val >> __SWP_TYPE_SHIFT) & __SWP_TYPE_MASK)
#define __swp_offset(x)                (((x).val >> __SWP_OFFSET_SHIFT) & __SWP_OFFSET_MASK)
#define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) })

/* A swap entry and a pte carry the same raw value; these only change the type. */
#define __pte_to_swp_entry(pte)        ((swp_entry_t) { pte_val(pte) })
#define __swp_entry_to_pte(swp)        ((pte_t) { (swp).val })

#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
/* PMD counterparts of the pte/swap-entry conversions: same raw value, different type. */
#define __pmd_to_swp_entry(pmd)                ((swp_entry_t) { pmd_val(pmd) })
#define __swp_entry_to_pmd(swp)                __pmd((swp).val)
#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */

/*
 * Ensure that there are not more swap files than can be encoded in the kernel
 * PTEs: the build fails if MAX_SWAPFILES_SHIFT exceeds the width of the swap
 * type field (__SWP_TYPE_BITS).
 */
#define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)

#ifdef CONFIG_ARM64_MTE

#define __HAVE_ARCH_PREPARE_TO_SWAP
/* Implemented out of line; only declared when CONFIG_ARM64_MTE is set. */
extern int arch_prepare_to_swap(struct folio *folio);

#define __HAVE_ARCH_SWAP_INVALIDATE
/*
 * Invalidate the MTE tags recorded for swap slot (@type, @offset). Does
 * nothing when the system does not support MTE.
 */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
        if (!system_supports_mte())
                return;

        mte_invalidate_tags(type, offset);
}

/*
 * Invalidate the MTE tags recorded for the whole swap area @type. Does
 * nothing when the system does not support MTE.
 */
static inline void arch_swap_invalidate_area(int type)
{
        if (!system_supports_mte())
                return;

        mte_invalidate_tags_area(type);
}

#define __HAVE_ARCH_SWAP_RESTORE
/* Implemented out of line; only declared when CONFIG_ARM64_MTE is set. */
extern void arch_swap_restore(swp_entry_t entry, struct folio *folio);

#endif /* CONFIG_ARM64_MTE */

/*
 * On AArch64, the cache coherency is handled via the __set_ptes() function.
 *
 * This hook is therefore intentionally empty; none of its parameters are
 * used.
 */
static inline void update_mmu_cache_range(struct vm_fault *vmf,
                struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
                unsigned int nr)
{
        /*
         * We don't do anything here, so there's a very small chance of
         * us retaking a user fault which we just fixed up. The alternative
         * is doing a dsb(ishst), but that penalises the fastpath.
         */
}

/*
 * Single-entry form of update_mmu_cache_range() (NULL vmf, nr == 1), and a
 * PMD variant that expands to an empty statement.
 */
#define update_mmu_cache(vma, addr, ptep) \
        update_mmu_cache_range(NULL, vma, addr, ptep, 1)
#define update_mmu_cache_pmd(vma, address, pmd) do { } while (0)

/*
 * Convert a physical address into the value written to a TTBR.
 *
 * With CONFIG_ARM64_PA_BITS_52 the address is OR-ed with a copy of itself
 * shifted right by 46 bits and then masked with TTBR_BADDR_MASK_52; otherwise
 * the address is used unchanged. Note that the 52-bit variant expands @addr
 * twice, so the argument should be free of side effects.
 */
#ifdef CONFIG_ARM64_PA_BITS_52
#define phys_to_ttbr(addr)        (((addr) | ((addr) >> 46)) & TTBR_BADDR_MASK_52)
#else
#define phys_to_ttbr(addr)        (addr)
#endif

/*
 * On arm64 without hardware Access Flag, copying from user will fail because
 * the pte is old and cannot be marked young. So we always end up with zeroed
 * page after fork() + CoW for pfn mappings. We don't always have a
 * hardware-managed access flag on arm64, so this is an alias for the
 * cpu_has_hw_af check rather than a constant.
 */
#define arch_has_hw_pte_young                cpu_has_hw_af

/* The non-leaf PMD variant is an alias for system_supports_haft. */
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
#define arch_has_hw_nonleaf_pmd_young        system_supports_haft
#endif

/*
 * Experimentally, it's cheap to set the access flag in hardware and we
 * benefit from prefaulting mappings as 'old' to start with.
 */
#define arch_wants_old_prefaulted_pte        cpu_has_hw_af

/* Returns true only when PAGE_SIZE equals SZ_4K. */
static inline bool pud_sect_supported(void)
{
        return PAGE_SIZE == SZ_4K;
}


/*
 * arm64 provides its own ptep_modify_prot_start()/ptep_modify_prot_commit()
 * pair. Both are implemented out of line and only declared here.
 */
#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
#define ptep_modify_prot_start ptep_modify_prot_start
extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                                    unsigned long addr, pte_t *ptep);

#define ptep_modify_prot_commit ptep_modify_prot_commit
extern void ptep_modify_prot_commit(struct vm_area_struct *vma,
                                    unsigned long addr, pte_t *ptep,
                                    pte_t old_pte, pte_t new_pte);

#ifdef CONFIG_ARM64_CONTPTE

/*
 * The contpte APIs are used to transparently manage the contiguous bit in ptes
 * where it is possible and makes sense to do so. The PTE_CONT bit is considered
 * a private implementation detail of the public ptep API (see below).
 *
 * The helpers declared here are implemented out of line; the inline wrappers
 * further down in this header decide when to call them.
 */
extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, pte_t pte);
extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, pte_t pte);
extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte);
extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep);
extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, pte_t pte, unsigned int nr);
extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, unsigned int nr, int full);
extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep,
                                unsigned int nr, int full);
extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep);
extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep);
extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, unsigned int nr);
extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep,
                                pte_t entry, int dirty);
extern void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep,
                                unsigned int nr, cydp_t flags);

/*
 * Cheap inline filter in front of __contpte_try_fold(): the out-of-line
 * helper is only called when @addr and the pfn of @pte both land on the last
 * slot of a contig range and @pte is valid, not already contiguous and not
 * special.
 */
static __always_inline void contpte_try_fold(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep, pte_t pte)
{
        /*
         * Only bother trying if both the virtual and physical addresses are
         * aligned and correspond to the last entry in a contig range. The core
         * code mostly modifies ranges from low to high, so this is likely the
         * last modification in the contig range, and a good time to fold.
         * We can't fold special mappings, because there is no associated folio.
         */
        const unsigned long last_slot = CONT_PTES - 1;

        if (unlikely(((addr >> PAGE_SHIFT) & last_slot) == last_slot)) {
                if (unlikely((pte_pfn(pte) & last_slot) == last_slot &&
                    pte_valid(pte) && !pte_cont(pte) && !pte_special(pte)))
                        __contpte_try_fold(mm, addr, ptep, pte);
        }
}

/*
 * Cheap inline filter in front of __contpte_try_unfold(): the out-of-line
 * helper is only called when @pte is a valid contiguous entry.
 */
static __always_inline void contpte_try_unfold(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep, pte_t pte)
{
        if (unlikely(pte_valid_cont(pte)))
                __contpte_try_unfold(mm, addr, ptep, pte);
}

#define pte_batch_hint pte_batch_hint
/*
 * Returns 1 when @pte is not a valid contiguous entry. Otherwise returns the
 * number of entries from @ptep up to and including the last slot of its
 * CONT_PTES-sized group.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
        unsigned long slot;

        if (!pte_valid_cont(pte))
                return 1;

        /* slot index of ptep within its group; >> 3 presumes 8-byte entries */
        slot = ((unsigned long)ptep >> 3) & (CONT_PTES - 1);

        return CONT_PTES - slot;
}

/*
 * The below functions constitute the public API that arm64 presents to the
 * core-mm to manipulate PTE entries within their page tables (or at least this
 * is the subset of the API that arm64 needs to implement). These public
 * versions will automatically and transparently apply the contiguous bit where
 * it makes sense to do so. Therefore any users that are contig-aware (e.g.
 * hugetlb, kernel mapper) should NOT use these APIs, but instead use the
 * private versions, which are prefixed with double underscore. All of these
 * APIs except for ptep_get_lockless() are expected to be called with the PTL
 * held. Although the contiguous bit is considered private to the
 * implementation, it is deliberately allowed to leak through the getters (e.g.
 * ptep_get()), back to core code. This is required so that pte_leaf_size() can
 * provide an accurate size for perf_get_pgtable_size(). But this leakage means
 * it's possible a pte will be passed to a setter with the contiguous bit set, so
 * we explicitly clear the contiguous bit in those cases to prevent accidentally
 * setting it in the pgtable.
 */

#define ptep_get ptep_get
/*
 * Read the entry at @ptep. An entry that is not a valid contiguous one is
 * returned as read; otherwise the read is completed by contpte_ptep_get().
 */
static inline pte_t ptep_get(pte_t *ptep)
{
        pte_t raw = __ptep_get(ptep);

        return likely(!pte_valid_cont(raw)) ? raw : contpte_ptep_get(ptep, raw);
}

#define ptep_get_lockless ptep_get_lockless
/*
 * Lockless read of the entry at @ptep. An entry that is not a valid
 * contiguous one is returned as read; otherwise the read is redone by
 * contpte_ptep_get_lockless().
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t raw = __ptep_get(ptep);

        return likely(!pte_valid_cont(raw)) ? raw :
                                              contpte_ptep_get_lockless(ptep);
}

/*
 * Write @pte, with its contiguous bit cleared, to @ptep. Warns once if the
 * entry being overwritten is a valid contiguous one.
 */
static inline void set_pte(pte_t *ptep, pte_t pte)
{
        /*
         * Without the mm or vaddr we cannot unfold contig entries (that
         * requires tlb maintenance). set_pte() is not used in core code, so
         * this should never even be called. Still, service the call as best
         * we can and emit a warning on any attempt to set a pte on top of an
         * existing contig range.
         */
        WARN_ON_ONCE(pte_valid_cont(__ptep_get(ptep)));

        pte = pte_mknoncont(pte);
        __set_pte(ptep, pte);
}

#define set_ptes set_ptes
/*
 * Install @nr consecutive entries starting at @ptep / @addr. The contiguous
 * bit is always stripped from @pte first.
 *
 * A single entry is handled inline: unfold the existing entry if needed, set
 * the new one, then try to fold. Any other @nr goes to contpte_set_ptes().
 */
static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, pte_t pte, unsigned int nr)
{
        pte = pte_mknoncont(pte);

        if (likely(nr == 1)) {
                pte_t current_pte = __ptep_get(ptep);

                contpte_try_unfold(mm, addr, ptep, current_pte);
                __set_ptes(mm, addr, ptep, pte, 1);
                contpte_try_fold(mm, addr, ptep, pte);
                return;
        }

        contpte_set_ptes(mm, addr, ptep, pte, nr);
}

/* Clear the entry at @ptep, unfolding it first if it is a contiguous entry. */
static inline void pte_clear(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep)
{
        pte_t current_pte = __ptep_get(ptep);

        contpte_try_unfold(mm, addr, ptep, current_pte);
        __pte_clear(mm, addr, ptep);
}

#define clear_full_ptes clear_full_ptes
/*
 * Clear @nr consecutive entries starting at @ptep / @addr. A single entry is
 * unfolded if needed and cleared inline; any other @nr goes to
 * contpte_clear_full_ptes().
 */
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, unsigned int nr, int full)
{
        if (likely(nr == 1)) {
                pte_t current_pte = __ptep_get(ptep);

                contpte_try_unfold(mm, addr, ptep, current_pte);
                __clear_full_ptes(mm, addr, ptep, nr, full);
                return;
        }

        contpte_clear_full_ptes(mm, addr, ptep, nr, full);
}

#define get_and_clear_full_ptes get_and_clear_full_ptes
/*
 * Clear @nr consecutive entries starting at @ptep / @addr and return the
 * value produced by the underlying helper. A single entry is unfolded if
 * needed and cleared inline; any other @nr goes to
 * contpte_get_and_clear_full_ptes().
 */
static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep,
                                unsigned int nr, int full)
{
        if (likely(nr == 1)) {
                contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));
                return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
        }

        return contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full);
}

#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Clear the entry at @ptep, unfolding it first if it is a contiguous entry,
 * and return the value __ptep_get_and_clear() reports.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep)
{
        pte_t current_pte = __ptep_get(ptep);

        contpte_try_unfold(mm, addr, ptep, current_pte);

        return __ptep_get_and_clear(mm, addr, ptep);
}

#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * Test and clear the young state of the entry at @ptep. Entries that are not
 * valid contiguous ones use the plain helper; valid contiguous entries go to
 * contpte_ptep_test_and_clear_young().
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep)
{
        if (likely(!pte_valid_cont(__ptep_get(ptep))))
                return __ptep_test_and_clear_young(vma, addr, ptep);

        return contpte_ptep_test_and_clear_young(vma, addr, ptep);
}

#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
/*
 * Clear the young state of the entry at @ptep and flush as the underlying
 * helper sees fit. Entries that are not valid contiguous ones use the plain
 * helper; valid contiguous entries go to contpte_ptep_clear_flush_young().
 */
static inline int ptep_clear_flush_young(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep)
{
        if (likely(!pte_valid_cont(__ptep_get(ptep))))
                return __ptep_clear_flush_young(vma, addr, ptep);

        return contpte_ptep_clear_flush_young(vma, addr, ptep);
}

#define wrprotect_ptes wrprotect_ptes
/*
 * Write-protect @nr consecutive entries starting at @ptep / @addr.
 *
 * A single entry is handled inline, reusing one read of the entry for both
 * the unfold decision and the write-protect. Any other @nr is passed to
 * contpte_wrprotect_ptes().
 */
static __always_inline void wrprotect_ptes(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep, unsigned int nr)
{
        if (likely(nr == 1)) {
                /*
                 * Optimization: wrprotect_ptes() can only be called for present
                 * ptes so we only need to check contig bit as condition for
                 * unfold, and we can remove the contig bit from the pte we read
                 * to avoid re-reading. This speeds up fork() which is sensitive
                 * for order-0 folios. Equivalent to contpte_try_unfold().
                 */
                pte_t orig_pte = __ptep_get(ptep);

                if (unlikely(pte_cont(orig_pte))) {
                        __contpte_try_unfold(mm, addr, ptep, orig_pte);
                        orig_pte = pte_mknoncont(orig_pte);
                }
                ___ptep_set_wrprotect(mm, addr, ptep, orig_pte);
        } else {
                contpte_wrprotect_ptes(mm, addr, ptep, nr);
        }
}

#define __HAVE_ARCH_PTEP_SET_WRPROTECT
/* Write-protect the single entry at @ptep; shorthand for wrprotect_ptes() with nr == 1. */
static inline void ptep_set_wrprotect(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep)
{
        wrprotect_ptes(mm, addr, ptep, 1);
}

#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Update the access flags of the entry at @ptep from @entry, whose contiguous
 * bit is stripped first. Entries that are not valid contiguous ones use the
 * plain helper; valid contiguous entries go to
 * contpte_ptep_set_access_flags(). The helper's return value is passed back.
 */
static inline int ptep_set_access_flags(struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep,
                                pte_t entry, int dirty)
{
        pte_t current_pte = __ptep_get(ptep);
        pte_t new_entry = pte_mknoncont(entry);

        if (likely(!pte_valid_cont(current_pte)))
                return __ptep_set_access_flags(vma, addr, ptep, new_entry,
                                               dirty);

        return contpte_ptep_set_access_flags(vma, addr, ptep, new_entry, dirty);
}

#define clear_young_dirty_ptes clear_young_dirty_ptes
/*
 * Clear young and/or dirty state, as selected by @flags, on @nr consecutive
 * entries starting at @ptep / @addr. The plain helper is used only for a
 * single entry whose contiguous bit is clear; everything else goes to
 * contpte_clear_young_dirty_ptes().
 */
static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep,
                                          unsigned int nr, cydp_t flags)
{
        if (likely(nr == 1 && !pte_cont(__ptep_get(ptep)))) {
                __clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
                return;
        }

        contpte_clear_young_dirty_ptes(vma, addr, ptep, nr, flags);
}

#else /* CONFIG_ARM64_CONTPTE */

/*
 * Without CONFIG_ARM64_CONTPTE each public name below is a plain alias for
 * its double-underscore implementation.
 */
#define ptep_get                                __ptep_get
#define set_pte                                        __set_pte
#define set_ptes                                __set_ptes
#define pte_clear                                __pte_clear
#define clear_full_ptes                                __clear_full_ptes
#define get_and_clear_full_ptes                        __get_and_clear_full_ptes
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
#define ptep_get_and_clear                        __ptep_get_and_clear
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young                __ptep_test_and_clear_young
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young                        __ptep_clear_flush_young
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
#define ptep_set_wrprotect                        __ptep_set_wrprotect
#define wrprotect_ptes                                __wrprotect_ptes
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
#define ptep_set_access_flags                        __ptep_set_access_flags
#define clear_young_dirty_ptes                        __clear_young_dirty_ptes

#endif /* CONFIG_ARM64_CONTPTE */

#endif /* !__ASSEMBLY__ */

#endif /* __ASM_PGTABLE_H */





































































































































































































































   34 




















   34 







































































































 1251 



 1249 
 1250 
 1253 

 1247 






 1257 




















































 1255 















  194 







  196 
  195 

  184 


  137 







  137 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  165 










  166 


























































































































   34 










   34 
   34 
   34 









 1251 


 1255 


























 1248 




















 1254 


























 1253 
 1253 
































































  165 




  166 
  166 

  166 
  165 






































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
// SPDX-License-Identifier: GPL-2.0-only
/*
 * FP/SIMD context switching and fault handling
 *
 * Copyright (C) 2012 ARM Ltd.
 * Author: Catalin Marinas <catalin.marinas@arm.com>
 */

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/bottom_half.h>
#include <linux/bug.h>
#include <linux/cache.h>
#include <linux/compat.h>
#include <linux/compiler.h>
#include <linux/cpu.h>
#include <linux/cpu_pm.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/linkage.h>
#include <linux/irqflags.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/prctl.h>
#include <linux/preempt.h>
#include <linux/ptrace.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/signal.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/sysctl.h>
#include <linux/swab.h>

#include <asm/esr.h>
#include <asm/exception.h>
#include <asm/fpsimd.h>
#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/neon.h>
#include <asm/processor.h>
#include <asm/simd.h>
#include <asm/sigcontext.h>
#include <asm/sysreg.h>
#include <asm/traps.h>
#include <asm/virt.h>

/*
 * Single-bit FPEXC_* flag masks (bits 0-4 and bit 7).
 *
 * NOTE(review): the code consuming these is not in the visible part of
 * this file; presumably they decode the exception-type bits reported for
 * a trapped floating-point exception -- confirm against the FP exception
 * handler before relying on this.
 */
#define FPEXC_IOF        (1 << 0)
#define FPEXC_DZF        (1 << 1)
#define FPEXC_OFF        (1 << 2)
#define FPEXC_UFF        (1 << 3)
#define FPEXC_IXF        (1 << 4)
#define FPEXC_IDF        (1 << 7)

/*
 * (Note: in this discussion, statements about FPSIMD apply equally to SVE.)
 *
 * In order to reduce the number of times the FPSIMD state is needlessly saved
 * and restored, we need to keep track of two things:
 * (a) for each task, we need to remember which CPU was the last one to have
 *     the task's FPSIMD state loaded into its FPSIMD registers;
 * (b) for each CPU, we need to remember which task's userland FPSIMD state has
 *     been loaded into its FPSIMD registers most recently, or whether it has
 *     been used to perform kernel mode NEON in the meantime.
 *
 * For (a), we add a fpsimd_cpu field to thread_struct, which gets updated to
 * the id of the current CPU every time the state is loaded onto a CPU. For (b),
 * we add the per-cpu variable 'fpsimd_last_state' (below), which contains the
 * address of the userland FPSIMD state of the task that was loaded onto the CPU
 * the most recently, or NULL if kernel mode NEON has been performed after that.
 *
 * With this in place, we no longer have to restore the next FPSIMD state right
 * when switching between tasks. Instead, we can defer this check to userland
 * resume, at which time we verify whether the CPU's fpsimd_last_state and the
 * task's fpsimd_cpu are still mutually in sync. If this is the case, we
 * can omit the FPSIMD restore.
 *
 * As an optimization, we use the thread_info flag TIF_FOREIGN_FPSTATE to
 * indicate whether or not the userland FPSIMD state of the current task is
 * present in the registers. The flag is set unless the FPSIMD registers of this
 * CPU currently contain the most recent userland FPSIMD state of the current
 * task. If the task is behaving as a VMM, then this will be managed by
 * KVM which will clear it to indicate that the vcpu FPSIMD state is currently
 * loaded on the CPU, allowing the state to be saved if a FPSIMD-aware
 * softirq kicks in. Upon vcpu_put(), KVM will save the vcpu FP state and
 * flag the register state as invalid.
 *
 * In order to allow softirq handlers to use FPSIMD, kernel_neon_begin() may be
 * called from softirq context, which will save the task's FPSIMD context back
 * to task_struct. To prevent this from racing with the manipulation of the
 * task's FPSIMD state from task context and thereby corrupting the state, it
 * is necessary to protect any manipulation of a task's fpsimd_state or
 * TIF_FOREIGN_FPSTATE flag with get_cpu_fpsimd_context(), which will suspend
 * softirq servicing entirely until put_cpu_fpsimd_context() is called.
 *
 * For a certain task, the sequence may look something like this:
 * - the task gets scheduled in; if both the task's fpsimd_cpu field
 *   contains the id of the current CPU, and the CPU's fpsimd_last_state per-cpu
 *   variable points to the task's fpsimd_state, the TIF_FOREIGN_FPSTATE flag is
 *   cleared, otherwise it is set;
 *
 * - the task returns to userland; if TIF_FOREIGN_FPSTATE is set, the task's
 *   userland FPSIMD state is copied from memory to the registers, the task's
 *   fpsimd_cpu field is set to the id of the current CPU, the current
 *   CPU's fpsimd_last_state pointer is set to this task's fpsimd_state and the
 *   TIF_FOREIGN_FPSTATE flag is cleared;
 *
 * - the task executes an ordinary syscall; upon return to userland, the
 *   TIF_FOREIGN_FPSTATE flag will still be cleared, so no FPSIMD state is
 *   restored;
 *
 * - the task executes a syscall which executes some NEON instructions; this is
 *   preceded by a call to kernel_neon_begin(), which copies the task's FPSIMD
 *   register contents to memory, clears the fpsimd_last_state per-cpu variable
 *   and sets the TIF_FOREIGN_FPSTATE flag;
 *
 * - the task gets preempted after kernel_neon_end() is called; as we have not
 *   returned from the 2nd syscall yet, TIF_FOREIGN_FPSTATE is still set so
 *   whatever is in the FPSIMD registers is not saved to memory, but discarded.
 */

/*
 * Per-CPU description of the FP state most recently bound to this CPU's
 * registers.  fpsimd_save_user_state() writes the live registers back
 * through the pointers held here (st, sve_state, sme_state, svcr, fpmr,
 * fp_type) and consults sve_vl, sme_vl and to_save to pick the format.
 */
static DEFINE_PER_CPU(struct cpu_fp_state, fpsimd_last_state);

/*
 * Vector length information for each vector type, indexed by
 * enum vec_type.  An entry is only initialised when the matching
 * extension is enabled in the kernel configuration.
 *
 * The SVE entry starts with min_vl, max_vl and max_virtualisable_vl all
 * equal to SVE_VL_MIN; the SME entry sets only its type and name, so its
 * remaining fields start out zero.
 *
 * NOTE(review): the table is __ro_after_init, so it is presumably filled
 * in during boot by code outside the visible part of this file.
 */
__ro_after_init struct vl_info vl_info[ARM64_VEC_MAX] = {
#ifdef CONFIG_ARM64_SVE
        [ARM64_VEC_SVE] = {
                .type                        = ARM64_VEC_SVE,
                .name                        = "SVE",
                .min_vl                        = SVE_VL_MIN,
                .max_vl                        = SVE_VL_MIN,
                .max_virtualisable_vl        = SVE_VL_MIN,
        },
#endif
#ifdef CONFIG_ARM64_SME
        [ARM64_VEC_SME] = {
                .type                        = ARM64_VEC_SME,
                .name                        = "SME",
        },
#endif
};

/*
 * Return the TIF_*_VL_INHERIT thread flag that corresponds to @type.
 *
 * An unrecognised vector type triggers a one-time warning and yields 0.
 */
static unsigned int vec_vl_inherit_flag(enum vec_type type)
{
        if (type == ARM64_VEC_SVE)
                return TIF_SVE_VL_INHERIT;

        if (type == ARM64_VEC_SME)
                return TIF_SME_VL_INHERIT;

        /* Not a vector type we know about. */
        WARN_ON_ONCE(1);
        return 0;
}

/*
 * Runtime-adjustable configuration for one vector type.
 *
 * __default_vl is not meant to be touched directly: readers go through
 * get_default_vl() (READ_ONCE) and writers through set_default_vl()
 * (WRITE_ONCE).
 */
struct vl_config {
        int __default_vl;                /* Default VL for tasks */
};

/* One configuration slot per vector type, indexed by enum vec_type. */
static struct vl_config vl_config[ARM64_VEC_MAX];

/*
 * Return the default vector length currently configured for @type.
 *
 * READ_ONCE() pairs with the WRITE_ONCE() in set_default_vl(); the value
 * can be rewritten through the sysctl handler vec_proc_do_default_vl().
 */
static inline int get_default_vl(enum vec_type type)
{
        return READ_ONCE(vl_config[type].__default_vl);
}

#ifdef CONFIG_ARM64_SVE

/* Return the default vector length configured for SVE. */
static inline int get_sve_default_vl(void)
{
        return get_default_vl(ARM64_VEC_SVE);
}

/*
 * Set the default vector length for @type to @val.
 *
 * WRITE_ONCE() pairs with the READ_ONCE() in get_default_vl().  No
 * validation is done here; @val is stored as given.
 */
static inline void set_default_vl(enum vec_type type, int val)
{
        WRITE_ONCE(vl_config[type].__default_vl, val);
}

/* Set the default vector length for SVE to @val (stored unvalidated). */
static inline void set_sve_default_vl(int val)
{
        set_default_vl(ARM64_VEC_SVE, val);
}

/*
 * Per-CPU pointer, only defined when CONFIG_ARM64_SVE is enabled.
 * NOTE(review): not referenced in the visible part of this file;
 * presumably backing storage for SVE state around EFI calls -- confirm.
 */
static void __percpu *efi_sve_state;

#else /* ! CONFIG_ARM64_SVE */

/*
 * Dummy declaration for code that will be optimised out: without
 * CONFIG_ARM64_SVE there is no definition of this symbol in this file,
 * so any surviving reference would be left unresolved.
 */
extern void __percpu *efi_sve_state;

#endif /* ! CONFIG_ARM64_SVE */

#ifdef CONFIG_ARM64_SME

/* Return the default vector length configured for SME. */
static int get_sme_default_vl(void)
{
        return get_default_vl(ARM64_VEC_SME);
}

/* Set the default vector length for SME to @val (stored unvalidated). */
static void set_sme_default_vl(int val)
{
        set_default_vl(ARM64_VEC_SME, val);
}

/*
 * Forward declaration for the CONFIG_ARM64_SME build; the definition is
 * not in the visible part of this file.
 */
static void sme_free(struct task_struct *);

#else

/* Without CONFIG_ARM64_SME, sme_free() is an empty stub. */
static inline void sme_free(struct task_struct *t) { }

#endif

/*
 * Forward declaration; the definition is not in the visible part of this
 * file.  Per the comment in fpsimd_save_user_state(), this is one of the
 * functions that set up the per-CPU fpsimd_last_state record.
 */
static void fpsimd_bind_task_to_cpu(void);

/*
 * Take ownership of this CPU's FPSIMD context on behalf of the caller.
 *
 * Until the matching put_cpu_fpsimd_context(), the caller may freely
 * manipulate the FPSIMD context metadata.
 *
 * Non-RT kernels get this by disabling bottom halves.  On PREEMPT_RT,
 * local_bh_disable() merely serializes softirq sections through a local
 * lock and leaves the caller preemptible, which is not enough; there
 * preemption is disabled instead.  Since bottom halves always run in
 * thread context on RT, that implicitly keeps them out as well.
 */
static void get_cpu_fpsimd_context(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                preempt_disable();
                return;
        }

        local_bh_disable();
}

/*
 * Give up ownership of this CPU's FPSIMD context.
 *
 * Undoes exactly what get_cpu_fpsimd_context() did, so it must only be
 * called from a context that previously called
 * get_cpu_fpsimd_context() and has not called put_cpu_fpsimd_context()
 * since.
 */
static void put_cpu_fpsimd_context(void)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                preempt_enable();
                return;
        }

        local_bh_enable();
}

unsigned int task_get_vl(const struct task_struct *task, enum vec_type type)
{
        return task->thread.vl[type];
}

void task_set_vl(struct task_struct *task, enum vec_type type,
                 unsigned long vl)
{
        task->thread.vl[type] = vl;
}

unsigned int task_get_vl_onexec(const struct task_struct *task,
                                enum vec_type type)
{
        return task->thread.vl_onexec[type];
}

void task_set_vl_onexec(struct task_struct *task, enum vec_type type,
                        unsigned long vl)
{
        task->thread.vl_onexec[type] = vl;
}

/*
 * TIF_SME controls whether a task can use SME without trapping while
 * in userspace, when TIF_SME is set then we must have storage
 * allocated in sve_state and sme_state to store the contents of both ZA
 * and the SVE registers for both streaming and non-streaming modes.
 *
 * If both SVCR.ZA and SVCR.SM are disabled then at any point we
 * may disable TIF_SME and reenable traps.
 */


/*
 * TIF_SVE controls whether a task can use SVE without trapping while
 * in userspace, and also (together with TIF_SME) the way a task's
 * FPSIMD/SVE state is stored in thread_struct.
 *
 * The kernel uses this flag to track whether a user task is actively
 * using SVE, and therefore whether full SVE register state needs to
 * be tracked.  If not, the cheaper FPSIMD context handling code can
 * be used instead of the more costly SVE equivalents.
 *
 *  * TIF_SVE or SVCR.SM set:
 *
 *    The task can execute SVE instructions while in userspace without
 *    trapping to the kernel.
 *
 *    During any syscall, the kernel may optionally clear TIF_SVE and
 *    discard the vector state except for the FPSIMD subset.
 *
 *  * TIF_SVE clear:
 *
 *    An attempt by the user task to execute an SVE instruction causes
 *    do_sve_acc() to be called, which does some preparation and then
 *    sets TIF_SVE.
 *
 * During any syscall, the kernel may optionally clear TIF_SVE and
 * discard the vector state except for the FPSIMD subset.
 *
 * The data will be stored in one of two formats:
 *
 *  * FPSIMD only - FP_STATE_FPSIMD:
 *
 *    When only the FPSIMD state is stored, task->thread.fp_type is set
 *    to FP_STATE_FPSIMD and the FPSIMD registers V0-V31 are encoded in
 *    task->thread.uw.fpsimd_state; bits [max : 128] for each of Z0-Z31 are
 *    logically zero but not stored anywhere; P0-P15 and FFR are not
 *    stored and have unspecified values from userspace's point of
 *    view.  For hygiene purposes, the kernel zeroes them on next use,
 *    but userspace is discouraged from relying on this.
 *
 *    task->thread.sve_state does not need to be non-NULL, valid or any
 *    particular size: it must not be dereferenced and any data stored
 *    there should be considered stale and not referenced.
 *
 *  * SVE state - FP_STATE_SVE:
 *
 *    When the full SVE state is stored, task->thread.fp_type is set to
 *    FP_STATE_SVE and Z0-Z31 (incorporating Vn in bits[127:0] of the
 *    corresponding Zn), P0-P15 and FFR are encoded in
 *    task->thread.sve_state, formatted appropriately for vector
 *    length task->thread.sve_vl or, if SVCR.SM is set,
 *    task->thread.sme_vl. The storage for the vector registers in
 *    task->thread.uw.fpsimd_state should be ignored.
 *
 *    task->thread.sve_state must point to a valid buffer at least
 *    sve_state_size(task) bytes in size. The data stored in
 *    task->thread.uw.fpsimd_state.vregs should be considered stale
 *    and not referenced.
 *
 *  * FPSR and FPCR are always stored in task->thread.uw.fpsimd_state
 *    irrespective of whether TIF_SVE is clear or set, since these are
 *    not vector length dependent.
 */

/*
 * Update current's FPSIMD/SVE registers from thread_struct.
 *
 * This function should be called only when the FPSIMD/SVE state in
 * thread_struct is known to be up to date, when preparing to enter
 * userspace.
 *
 * The recorded format (current->thread.fp_type) decides whether the
 * vector registers are reloaded from sve_state or from uw.fpsimd_state,
 * and TIF_SVE is brought in line with that format along the way.
 * Warns if FPSIMD is unsupported, if called while preemptible, or if
 * TIF_KERNEL_FPSTATE is set.
 */
static void task_fpsimd_load(void)
{
        bool restore_sve_regs = false;
        /* Only read when restore_sve_regs is true. */
        bool restore_ffr;

        WARN_ON(!system_supports_fpsimd());
        WARN_ON(preemptible());
        WARN_ON(test_thread_flag(TIF_KERNEL_FPSTATE));

        if (system_supports_fpmr())
                write_sysreg_s(current->thread.uw.fpmr, SYS_FPMR);

        if (system_supports_sve() || system_supports_sme()) {
                switch (current->thread.fp_type) {
                case FP_STATE_FPSIMD:
                        /* Stop tracking SVE for this task until next use. */
                        if (test_and_clear_thread_flag(TIF_SVE))
                                sve_user_disable();
                        break;
                case FP_STATE_SVE:
                        /*
                         * Outside streaming mode, SVE-format state is
                         * expected to come with TIF_SVE already set;
                         * if it was not, warn once and leave user SVE
                         * access disabled (the flag is set regardless).
                         */
                        if (!thread_sm_enabled(&current->thread) &&
                            !WARN_ON_ONCE(!test_and_set_thread_flag(TIF_SVE)))
                                sve_user_enable();

                        /* Program the task's SVE vector length. */
                        if (test_thread_flag(TIF_SVE))
                                sve_set_vq(sve_vq_from_vl(task_get_sve_vl(current)) - 1);

                        restore_sve_regs = true;
                        restore_ffr = true;
                        break;
                default:
                        /*
                         * This indicates either a bug in
                         * fpsimd_save_user_state() or memory corruption, we
                         * should always record an explicit format
                         * when we save. We always at least have the
                         * memory allocated for FPSIMD registers so
                         * try that and hope for the best.
                         */
                        WARN_ON_ONCE(1);
                        clear_thread_flag(TIF_SVE);
                        break;
                }
        }

        /* Restore SME, override SVE register configuration if needed */
        if (system_supports_sme()) {
                unsigned long sme_vl = task_get_sme_vl(current);

                /* Ensure VL is set up for restoring data */
                if (test_thread_flag(TIF_SME))
                        sme_set_vq(sve_vq_from_vl(sme_vl) - 1);

                write_sysreg_s(current->thread.svcr, SYS_SVCR);

                /* ZA contents are only reloaded when ZA is enabled. */
                if (thread_za_enabled(&current->thread))
                        sme_load_state(current->thread.sme_state,
                                       system_supports_sme2());

                /* In streaming mode, FFR is only restored with FA64. */
                if (thread_sm_enabled(&current->thread))
                        restore_ffr = system_supports_fa64();
        }

        if (restore_sve_regs) {
                WARN_ON_ONCE(current->thread.fp_type != FP_STATE_SVE);
                sve_load_state(sve_pffr(&current->thread),
                               &current->thread.uw.fpsimd_state.fpsr,
                               restore_ffr);
        } else {
                WARN_ON_ONCE(current->thread.fp_type != FP_STATE_FPSIMD);
                fpsimd_load_state(&current->thread.uw.fpsimd_state);
        }
}

/*
 * Ensure FPSIMD/SVE storage in memory for the loaded context is up to
 * date with respect to the CPU registers. Note carefully that the
 * current context is the context last bound to the CPU stored in
 * last, if KVM is involved this may be the guest VM context rather
 * than the host thread for the VM pointed to by current. This means
 * that we must always reference the state storage via last rather
 * than via current, if we are saving KVM state then it will have
 * ensured that the type of registers to save is set in last->to_save.
 *
 * Does nothing when TIF_FOREIGN_FPSTATE is set.  Otherwise the registers
 * are written out in either SVE or FPSIMD format and *last->fp_type is
 * updated to say which.  If the SVE format is required but the hardware
 * vector length differs from the expected one, nothing is saved and
 * SIGKILL is injected instead.
 */
static void fpsimd_save_user_state(void)
{
        struct cpu_fp_state const *last =
                this_cpu_ptr(&fpsimd_last_state);
        /* set by fpsimd_bind_task_to_cpu() or fpsimd_bind_state_to_cpu() */
        bool save_sve_regs = false;
        /* save_ffr and vl are only assigned, and only read, on SVE saves. */
        bool save_ffr;
        unsigned int vl;

        WARN_ON(!system_supports_fpsimd());
        WARN_ON(preemptible());

        /* The registers do not hold the state described by last. */
        if (test_thread_flag(TIF_FOREIGN_FPSTATE))
                return;

        if (system_supports_fpmr())
                *(last->fpmr) = read_sysreg_s(SYS_FPMR);

        /*
         * If a task is in a syscall the ABI allows us to only
         * preserve the state shared with FPSIMD so don't bother
         * saving the full SVE state in that case.
         */
        if ((last->to_save == FP_STATE_CURRENT && test_thread_flag(TIF_SVE) &&
             !in_syscall(current_pt_regs())) ||
            last->to_save == FP_STATE_SVE) {
                save_sve_regs = true;
                save_ffr = true;
                vl = last->sve_vl;
        }

        if (system_supports_sme()) {
                u64 *svcr = last->svcr;

                /* Capture the live SVCR; the checks below use this copy. */
                *svcr = read_sysreg_s(SYS_SVCR);

                if (*svcr & SVCR_ZA_MASK)
                        sme_save_state(last->sme_state,
                                       system_supports_sme2());

                /* If we are in streaming mode override regular SVE. */
                if (*svcr & SVCR_SM_MASK) {
                        save_sve_regs = true;
                        save_ffr = system_supports_fa64();
                        vl = last->sme_vl;
                }
        }

        if (IS_ENABLED(CONFIG_ARM64_SVE) && save_sve_regs) {
                /* Get the configured VL from RDVL, will account for SM */
                if (WARN_ON(sve_get_vl() != vl)) {
                        /*
                         * Can't save the user regs, so current would
                         * re-enter user with corrupt state.
                         * There's no way to recover, so kill it:
                         */
                        force_signal_inject(SIGKILL, SI_KERNEL, 0, 0);
                        return;
                }

                /* The save routine is handed the FFR slot's address. */
                sve_save_state((char *)last->sve_state +
                                        sve_ffr_offset(vl),
                               &last->st->fpsr, save_ffr);
                *last->fp_type = FP_STATE_SVE;
        } else {
                fpsimd_save_state(last->st);
                *last->fp_type = FP_STATE_FPSIMD;
        }
}

/*
 * Every vector length requested from userspace is funnelled through
 * this function.  It is a slow path, so it can afford sanity checks:
 * a failing check means there is a bug elsewhere, in which case we warn
 * and fall back to a safe value rather than give up.
 *
 * @vl is clamped to [min_vl, max_vl] for @type, then mapped to the
 * length of the first bit found in the type's vq_map at or after the
 * bit for the clamped length.
 */
static unsigned int find_supported_vector_length(enum vec_type type,
                                                 unsigned int vl)
{
        struct vl_info *vec = &vl_info[type];
        int upper = vec->max_vl;
        int found;

        /* An invalid request falls back to the minimum length. */
        if (WARN_ON(!sve_vl_valid(vl)))
                vl = vec->min_vl;

        /* So does an invalid recorded maximum. */
        if (WARN_ON(!sve_vl_valid(upper)))
                upper = vec->min_vl;

        /* Clamp: the upper bound first, then the lower bound. */
        if (vl > upper)
                vl = upper;
        if (vl < vec->min_vl)
                vl = vec->min_vl;

        found = find_next_bit(vec->vq_map, SVE_VQ_MAX,
                              __vq_to_bit(sve_vq_from_vl(vl)));

        return sve_vl_from_vq(__bit_to_vq(found));
}

#if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL)

/*
 * sysctl handler shared by the default-vector-length controls.
 *
 * table->extra1 points at the vl_info entry of the vector type being
 * configured.  The integer is parsed/formatted through a temporary
 * ctl_table that points at a local copy of the current default, so the
 * real default is only updated once a written value has been checked.
 *
 * Returns the proc_dointvec() result for reads and for failed parses,
 * -EINVAL if the written value is not a valid vector length, and 0 once
 * the new default (rounded to a supported length) has been stored.
 */
static int vec_proc_do_default_vl(const struct ctl_table *table, int write,
                                  void *buffer, size_t *lenp, loff_t *ppos)
{
        struct vl_info *vec = table->extra1;
        enum vec_type vtype = vec->type;
        int new_vl = get_default_vl(vtype);
        struct ctl_table shadow = {
                .data = &new_vl,
                .maxlen = sizeof(new_vl),
        };
        int err;

        err = proc_dointvec(&shadow, write, buffer, lenp, ppos);
        if (err)
                return err;

        /* A successful read has nothing left to do. */
        if (!write)
                return 0;

        /* Writing -1 has the special meaning "set to max": */
        if (new_vl == -1)
                new_vl = vec->max_vl;

        if (!sve_vl_valid(new_vl))
                return -EINVAL;

        set_default_vl(vtype, find_supported_vector_length(vtype, new_vl));

        return 0;
}

/*
 * sysctl entry "sve_default_vector_length" (mode 0644).  Reads and
 * writes go through vec_proc_do_default_vl(), which finds the SVE
 * vl_info entry via extra1.  Registered under "abi" by
 * sve_sysctl_init().
 */
static const struct ctl_table sve_default_vl_table[] = {
        {
                .procname        = "sve_default_vector_length",
                .mode                = 0644,
                .proc_handler        = vec_proc_do_default_vl,
                .extra1                = &vl_info[ARM64_VEC_SVE],
        },
};

/*
 * Register the SVE default vector length sysctl under "abi".
 *
 * Nothing is registered, and 0 is returned, when the system does not
 * support SVE.  Returns -EINVAL if registration fails, 0 otherwise.
 */
static int __init sve_sysctl_init(void)
{
        if (!system_supports_sve())
                return 0;

        if (!register_sysctl("abi", sve_default_vl_table))
                return -EINVAL;

        return 0;
}

#else /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */
/* No SVE sysctl in this configuration: nothing to register. */
static int __init sve_sysctl_init(void) { return 0; }
#endif /* ! (CONFIG_ARM64_SVE && CONFIG_SYSCTL) */

#if defined(CONFIG_ARM64_SME) && defined(CONFIG_SYSCTL)
/*
 * sysctl entry "sme_default_vector_length" (mode 0644).  Reads and
 * writes go through vec_proc_do_default_vl(), which finds the SME
 * vl_info entry via extra1.  Registered under "abi" by
 * sme_sysctl_init().
 */
static const struct ctl_table sme_default_vl_table[] = {
        {
                .procname        = "sme_default_vector_length",
                .mode                = 0644,
                .proc_handler        = vec_proc_do_default_vl,
                .extra1                = &vl_info[ARM64_VEC_SME],
        },
};

/*
 * Register the SME default vector length sysctl under "abi".
 *
 * Nothing is registered, and 0 is returned, when the system does not
 * support SME.  Returns -EINVAL if registration fails, 0 otherwise.
 */
static int __init sme_sysctl_init(void)
{
        if (!system_supports_sme())
                return 0;

        if (!register_sysctl("abi", sme_default_vl_table))
                return -EINVAL;

        return 0;
}

#else /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */
/* No SME sysctl in this configuration: nothing to register. */
static int __init sme_sysctl_init(void) { return 0; }
#endif /* ! (CONFIG_ARM64_SME && CONFIG_SYSCTL) */

/*
 * Address (as char *) of the storage for register Z<n> inside an
 * sve_state buffer laid out for vector quadword count @vq.  The offset
 * is the signal-frame offset of Z<n> rebased so that the register area
 * starts at offset 0 of the buffer.
 */
#define ZREG(sve_state, vq, n) ((char *)(sve_state) +                \
        (SVE_SIG_ZREG_OFFSET(vq, n) - SVE_SIG_REGS_OFFSET))

/*
 * Convert a 128-bit value from CPU byte order to little-endian.
 *
 * On big-endian kernels each 64-bit half is byte-swapped and the two
 * halves trade places; on little-endian kernels the value is returned
 * unchanged.
 */
static __uint128_t arm64_cpu_to_le128(__uint128_t x)
{
#ifdef CONFIG_CPU_BIG_ENDIAN
        u64 low_swapped = swab64(x);
        u64 high_swapped = swab64(x >> 64);

        /* The swapped low half becomes the new high half. */
        return ((__uint128_t)low_swapped << 64) | high_swapped;
#else
        return x;
#endif
}

/*
 * Little-endian to CPU order is the same transformation as CPU order to
 * little-endian (identity, or a full 128-bit byte reversal that is its
 * own inverse), so reuse arm64_cpu_to_le128().
 */
#define arm64_le128_to_cpu(x) arm64_cpu_to_le128(x)

/*
 * Copy each FPSIMD vector register in @fst into the first 128 bits of
 * the matching Z register slot of the SVE buffer @sst, which is laid out
 * for @vq quadwords per vector.  Values are stored little-endian.  Bytes
 * of @sst outside those 128-bit slots are not touched.
 */
static void __fpsimd_to_sve(void *sst, struct user_fpsimd_state const *fst,
                            unsigned int vq)
{
        unsigned int reg;

        for (reg = 0; reg < SVE_NUM_ZREGS; reg++) {
                __uint128_t *zlow = (__uint128_t *)ZREG(sst, vq, reg);

                *zlow = arm64_cpu_to_le128(fst->vregs[reg]);
        }
}

/*
 * Copy the FPSIMD registers held in task->thread.uw.fpsimd_state into
 * task->thread.sve_state, using the task's current vector length for
 * the layout.  Does nothing if the system supports neither SVE nor SME.
 *
 * @task may be current or a non-runnable task.  For current, the caller
 * must own the cpu FPSIMD context.
 *
 * Preconditions:
 *  - task->thread.sve_state points to at least sve_state_size(task)
 *    bytes of allocated kernel memory;
 *  - task->thread.uw.fpsimd_state is up to date.
 */
static void fpsimd_to_sve(struct task_struct *task)
{
        unsigned int vq;

        if (!(system_supports_sve() || system_supports_sme()))
                return;

        vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread));

        __fpsimd_to_sve(task->thread.sve_state,
                        &task->thread.uw.fpsimd_state, vq);
}

/*
 * Copy the low 128 bits of every Z register held in
 * task->thread.sve_state into task->thread.uw.fpsimd_state, using the
 * task's current vector length for the layout.  Does nothing if the
 * system supports neither SVE nor SME.
 *
 * @task may be current or a non-runnable task.  For current, the caller
 * must own the cpu FPSIMD context.
 *
 * Preconditions:
 *  - task->thread.sve_state points to at least sve_state_size(task)
 *    bytes of allocated kernel memory;
 *  - task->thread.sve_state is up to date.
 */
static void sve_to_fpsimd(struct task_struct *task)
{
        struct user_fpsimd_state *dst = &task->thread.uw.fpsimd_state;
        void const *src = task->thread.sve_state;
        unsigned int cur_vl, vq;
        unsigned int reg;

        if (!(system_supports_sve() || system_supports_sme()))
                return;

        cur_vl = thread_get_cur_vl(&task->thread);
        vq = sve_vq_from_vl(cur_vl);

        for (reg = 0; reg < SVE_NUM_ZREGS; reg++) {
                __uint128_t const *zlow =
                        (__uint128_t const *)ZREG(src, vq, reg);

                dst->vregs[reg] = arm64_le128_to_cpu(*zlow);
        }
}

/*
 * Set the EnFPM bit(s) in SCTLR_EL1 with a read-modify-write, leaving
 * the rest of the register as it was.  The capability argument is
 * unused.
 */
void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__always_unused p)
{
        u64 sctlr = read_sysreg_s(SYS_SCTLR_EL1);

        sctlr |= SCTLR_EL1_EnFPM_MASK;
        write_sysreg_s(sctlr, SYS_SCTLR_EL1);
}

#ifdef CONFIG_ARM64_SVE
/*
 * Free @task's SVE state buffer and clear the pointer so it cannot be
 * used or freed again.
 *
 * Call __sve_free() directly only if you know task can't be scheduled
 * or preempted.
 */
static void __sve_free(struct task_struct *task)
{
        kfree(task->thread.sve_state);
        task->thread.sve_state = NULL;
}

/*
 * Free @task's SVE state buffer.  Warns if TIF_SVE is still set on the
 * task, but frees the buffer regardless.
 */
static void sve_free(struct task_struct *task)
{
        WARN_ON(test_tsk_thread_flag(task, TIF_SVE));

        __sve_free(task);
}

/*
 * Return the number of bytes needed to hold the full SVE state of
 * @task at its currently configured vector length(s).
 *
 * The buffer is sized for the larger of the task's SVE and SME vector
 * lengths, considering only the extensions the system supports.
 */
size_t sve_state_size(struct task_struct const *task)
{
        unsigned int longest_vl;

        longest_vl = system_supports_sve() ? task_get_sve_vl(task) : 0;

        if (system_supports_sme())
                longest_vl = max(longest_vl, task_get_sme_vl(task));

        return SVE_SIG_REGS_SIZE(sve_vq_from_vl(longest_vl));
}

/*
 * Make sure task->thread.sve_state is allocated and large enough.
 *
 * Intended only as preparation for overwriting task->thread.sve_state
 * with new data.  A fresh buffer is always zeroed; an existing one is
 * zeroed only when @flush is true.  Zeroing keeps stale data from
 * showing through, which is done for testability and predictability:
 * apart from the do_sve_acc() case, there is no ABI requirement to hide
 * stale data written previously by task.
 *
 * The allocation result is not checked here, so on failure
 * task->thread.sve_state is left NULL.
 */
void sve_alloc(struct task_struct *task, bool flush)
{
        void *existing = task->thread.sve_state;

        if (!existing) {
                /*
                 * This is a small allocation (maximum ~8KB) and Should
                 * Not Fail.
                 */
                task->thread.sve_state =
                        kzalloc(sve_state_size(task), GFP_KERNEL);
                return;
        }

        if (flush)
                memset(existing, 0, sve_state_size(task));
}


/*
 * Force the FPSIMD state shared with SVE to be updated in the SVE state
 * even if the SVE state is the current active state.
 *
 * Unlike fpsimd_sync_to_sve(), the copy is unconditional: TIF_SVE and
 * streaming mode are not consulted.
 *
 * This should only be called by ptrace.  task must be non-runnable.
 * task->thread.sve_state must point to at least sve_state_size(task)
 * bytes of allocated kernel memory.
 */
void fpsimd_force_sync_to_sve(struct task_struct *task)
{
        fpsimd_to_sve(task);
}

/*
 * Bring task->thread.sve_state up to date with respect to the user
 * task, irrespective of whether SVE is in use or not.
 *
 * If the task has TIF_SVE set or is in streaming mode nothing is
 * copied; otherwise the FPSIMD registers are copied into sve_state.
 *
 * This should only be called by ptrace.  task must be non-runnable.
 * task->thread.sve_state must point to at least sve_state_size(task)
 * bytes of allocated kernel memory.
 */
void fpsimd_sync_to_sve(struct task_struct *task)
{
        if (test_tsk_thread_flag(task, TIF_SVE))
                return;

        if (thread_sm_enabled(&task->thread))
                return;

        fpsimd_to_sve(task);
}

/*
 * Bring task->thread.uw.fpsimd_state up to date with respect to the
 * user task, irrespective of whether SVE is in use or not.
 *
 * A copy from sve_state is only made when the task's state is recorded
 * in SVE format (fp_type == FP_STATE_SVE); otherwise nothing is done.
 *
 * This should only be called by ptrace.  task must be non-runnable.
 * task->thread.sve_state must point to at least sve_state_size(task)
 * bytes of allocated kernel memory.
 */
void sve_sync_to_fpsimd(struct task_struct *task)
{
        if (task->thread.fp_type != FP_STATE_SVE)
                return;

        sve_to_fpsimd(task);
}

/*
 * Rebuild task->thread.sve_state from task->thread.uw.fpsimd_state,
 * zeroing the rest of the SVE register block.
 *
 * This should only be called by ptrace to merge new FPSIMD register
 * values into a task for which SVE is currently active.  If TIF_SVE is
 * clear and thread_sm_enabled() is false, the function does nothing.
 *
 * Requirements on the caller:
 *  - task must be non-runnable;
 *  - task->thread.sve_state must point to at least sve_state_size(task)
 *    bytes of allocated kernel memory;
 *  - task->thread.uw.fpsimd_state must already have been initialised with
 *    the new FPSIMD register values to be merged in.
 */
void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
{
        struct user_fpsimd_state const *fpsimd = &task->thread.uw.fpsimd_state;
        void *sve_regs = task->thread.sve_state;
        unsigned int vq;

        if (test_tsk_thread_flag(task, TIF_SVE) ||
            thread_sm_enabled(&task->thread)) {
                vq = sve_vq_from_vl(thread_get_cur_vl(&task->thread));

                /* Zero SVE_SIG_REGS_SIZE(vq) bytes, then copy FPSIMD in. */
                memset(sve_regs, 0, SVE_SIG_REGS_SIZE(vq));
                __fpsimd_to_sve(sve_regs, fpsimd, vq);
        }
}

/*
 * Set the vector length of @task for the vector type @type.
 *
 * @task:  task to modify; may be current or another task.
 * @type:  which vector extension the length applies to.
 * @vl:    requested vector length.  It must pass sve_vl_valid(); it is
 *         then clamped to VL_ARCH_MAX and mapped to a supported length by
 *         find_supported_vector_length().
 * @flags: any combination of PR_SVE_VL_INHERIT and PR_SVE_SET_VL_ONEXEC.
 *
 * Returns 0 on success, or -EINVAL if @flags has any other bit set or
 * @vl is not valid.
 *
 * With PR_SVE_SET_VL_ONEXEC only the on-exec length is recorded.  Without
 * it, and only if the length really changes, the task is converted back
 * to plain FPSIMD state, its SVE buffer is freed and reallocated, and its
 * SME buffer is freed when the SME length changed or SME was not in use.
 * In all successful cases the inherit flag for @type is updated from
 * PR_SVE_VL_INHERIT.
 */
int vec_set_vector_length(struct task_struct *task, enum vec_type type,
                          unsigned long vl, unsigned long flags)
{
        bool free_sme = false;

        /* Reject any flag bit other than the two understood here. */
        if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
                                     PR_SVE_SET_VL_ONEXEC))
                return -EINVAL;

        if (!sve_vl_valid(vl))
                return -EINVAL;

        /*
         * Clamp to the maximum vector length that VL-agnostic code
         * can work with.  A flag may be assigned in the future to
         * allow setting of larger vector lengths without confusing
         * older software.
         */
        if (vl > VL_ARCH_MAX)
                vl = VL_ARCH_MAX;

        vl = find_supported_vector_length(type, vl);

        if (flags & (PR_SVE_VL_INHERIT |
                     PR_SVE_SET_VL_ONEXEC))
                task_set_vl_onexec(task, type, vl);
        else
                /* Reset VL to system default on next exec: */
                task_set_vl_onexec(task, type, 0);

        /* Only actually set the VL if not deferred: */
        if (flags & PR_SVE_SET_VL_ONEXEC)
                goto out;

        /* Nothing more to do if the live VL is already the one requested. */
        if (vl == task_get_vl(task, type))
                goto out;

        /*
         * To ensure the FPSIMD bits of the SVE vector registers are preserved,
         * write any live register state back to task_struct, and convert to a
         * regular FPSIMD thread.
         */
        if (task == current) {
                get_cpu_fpsimd_context();

                fpsimd_save_user_state();
        }

        fpsimd_flush_task_state(task);
        if (test_and_clear_tsk_thread_flag(task, TIF_SVE) ||
            thread_sm_enabled(&task->thread)) {
                sve_to_fpsimd(task);
                task->thread.fp_type = FP_STATE_FPSIMD;
        }

        if (system_supports_sme()) {
                if (type == ARM64_VEC_SME ||
                    !(task->thread.svcr & (SVCR_SM_MASK | SVCR_ZA_MASK))) {
                        /*
                         * We are changing the SME VL or weren't using
                         * SME anyway, discard the state and force a
                         * reallocation.
                         */
                        task->thread.svcr &= ~(SVCR_SM_MASK |
                                               SVCR_ZA_MASK);
                        clear_tsk_thread_flag(task, TIF_SME);
                        free_sme = true;
                }
        }

        /* Pairs with the get_cpu_fpsimd_context() taken above for current. */
        if (task == current)
                put_cpu_fpsimd_context();

        task_set_vl(task, type, vl);

        /*
         * Free the changed states if they are not in use, SME will be
         * reallocated to the correct size on next use and we just
         * allocate SVE now in case it is needed for use in streaming
         * mode.
         *
         * NOTE(review): the result of sve_alloc() is not checked here, so
         * task->thread.sve_state may be left NULL if the allocation fails.
         */
        sve_free(task);
        sve_alloc(task, true);

        if (free_sme)
                sme_free(task);

out:
        /* Set or clear the inherit flag to match PR_SVE_VL_INHERIT. */
        update_tsk_thread_flag(task, vec_vl_inherit_flag(type),
                               flags & PR_SVE_VL_INHERIT);

        return 0;
}

/*
 * Build the prctl() return value for the current task: a vector length
 * with the PR_SVE_VL_INHERIT bit ORed in when the task's inherit flag for
 * @type is set.
 *
 * This is only required for prctl(): ptrace has separate fields.
 * SVE and SME use the same bits for _ONEXEC and _INHERIT.
 *
 * @flags is as for vec_set_vector_length(): with PR_SVE_SET_VL_ONEXEC set
 * the on-exec vector length is reported, otherwise the live one.
 */
static int vec_prctl_status(enum vec_type type, unsigned long flags)
{
        int status = (flags & PR_SVE_SET_VL_ONEXEC) ?
                        task_get_vl_onexec(current, type) :
                        task_get_vl(current, type);

        if (test_thread_flag(vec_vl_inherit_flag(type)))
                status |= PR_SVE_VL_INHERIT;

        return status;
}

/*
 * PR_SVE_SET_VL
 *
 * @arg packs the requested vector length (the PR_SVE_VL_LEN_MASK bits)
 * together with the flags (every remaining bit).
 *
 * Returns -EINVAL if SVE is not supported or the caller is a compat task,
 * the error from vec_set_vector_length() if that fails, and otherwise the
 * encoded status from vec_prctl_status().
 */
int sve_set_current_vl(unsigned long arg)
{
        const unsigned long vl = arg & PR_SVE_VL_LEN_MASK;
        const unsigned long flags = arg & ~vl;
        int err;

        if (!system_supports_sve() || is_compat_task())
                return -EINVAL;

        err = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags);

        return err ? err : vec_prctl_status(ARM64_VEC_SVE, flags);
}

/*
 * PR_SVE_GET_VL
 *
 * Returns the encoded live SVE vector length and inherit flag of the
 * current task, or -EINVAL if SVE is not supported or the caller is a
 * compat task.
 */
int sve_get_current_vl(void)
{
        if (system_supports_sve() && !is_compat_task())
                return vec_prctl_status(ARM64_VEC_SVE, 0);

        return -EINVAL;
}

#ifdef CONFIG_ARM64_SME
/*
 * PR_SME_SET_VL
 *
 * @arg packs the requested vector length (the PR_SME_VL_LEN_MASK bits)
 * together with the flags (every remaining bit).
 *
 * Returns -EINVAL if SME is not supported or the caller is a compat task,
 * the error from vec_set_vector_length() if that fails, and otherwise the
 * encoded status from vec_prctl_status().
 */
int sme_set_current_vl(unsigned long arg)
{
        const unsigned long vl = arg & PR_SME_VL_LEN_MASK;
        const unsigned long flags = arg & ~vl;
        int err;

        if (!system_supports_sme() || is_compat_task())
                return -EINVAL;

        err = vec_set_vector_length(current, ARM64_VEC_SME, vl, flags);

        return err ? err : vec_prctl_status(ARM64_VEC_SME, flags);
}

/*
 * PR_SME_GET_VL
 *
 * Returns the encoded live SME vector length and inherit flag of the
 * current task, or -EINVAL if SME is not supported or the caller is a
 * compat task.
 */
int sme_get_current_vl(void)
{
        if (system_supports_sme() && !is_compat_task())
                return vec_prctl_status(ARM64_VEC_SME, 0);

        return -EINVAL;
}
#endif /* CONFIG_ARM64_SME */

/*
 * Probe which vector quanta (VQs) the current CPU provides for the vector
 * type described by @info, and record them in @map.
 *
 * @map is a bitmap of SVE_VQ_MAX bits and is zeroed before probing.
 * Candidates are tried from SVE_VQ_MAX downwards: each is requested with
 * write_vl() and the vector length actually in effect is read back.  The
 * VQ read back is recorded and probing continues below it; the loop stops
 * once the value read back is larger than the one requested.
 */
static void vec_probe_vqs(struct vl_info *info,
                          DECLARE_BITMAP(map, SVE_VQ_MAX))
{
        unsigned int vq, vl;

        bitmap_zero(map, SVE_VQ_MAX);

        for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) {
                write_vl(info->type, vq - 1); /* self-syncing */

                /* Read back the vector length now in effect for this type. */
                switch (info->type) {
                case ARM64_VEC_SVE:
                        vl = sve_get_vl();
                        break;
                case ARM64_VEC_SME:
                        vl = sme_get_vl();
                        break;
                default:
                        vl = 0;
                        break;
                }

                /* Minimum VL identified? */
                if (sve_vq_from_vl(vl) > vq)
                        break;

                vq = sve_vq_from_vl(vl); /* skip intervening lengths */
                set_bit(__vq_to_bit(vq), map);
        }
}

/*
 * Initialise the set of known supported VQs for the boot CPU.
 * This is called during kernel boot, before secondary CPUs are brought up.
 *
 * The probe result is stored in vq_map and then copied to vq_partial_map,
 * so both maps start out identical.
 */
void __init vec_init_vq_map(enum vec_type type)
{
        struct vl_info *const vi = &vl_info[type];

        vec_probe_vqs(vi, vi->vq_map);
        bitmap_copy(vi->vq_partial_map, vi->vq_map, SVE_VQ_MAX);
}

/*
 * If we haven't committed to the set of supported VQs yet, filter out
 * those not supported by the current CPU.
 * This function is called during the bring-up of early secondary CPUs only.
 */
void vec_update_vq_map(enum vec_type type)
{
        struct vl_info *info = &vl_info[type];
        DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);

        vec_probe_vqs(info, tmp_map);
        bitmap_and(info->vq_map, info->vq_map, tmp_map, SVE_VQ_MAX);
        bitmap_or(info->vq_partial_map, info->vq_partial_map, tmp_map,
                  SVE_VQ_MAX);
}

/*
 * Check whether the current CPU supports all VQs in the committed set.
 * This function is called during the bring-up of late secondary CPUs only.
 *
 * Returns 0 if this CPU is acceptable.  Returns -EINVAL (after a warning)
 * if it lacks a VQ present in vq_map, or, when KVM is enabled and hyp mode
 * is available, if it supports a VQ outside vq_map whose vector length
 * does not exceed max_virtualisable_vl.
 */
int vec_verify_vq_map(enum vec_type type)
{
        struct vl_info *info = &vl_info[type];
        DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
        unsigned long b;

        vec_probe_vqs(info, tmp_map);

        /* Invert in place: tmp_map now marks the VQs this CPU lacks. */
        bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX);
        if (bitmap_intersects(tmp_map, info->vq_map, SVE_VQ_MAX)) {
                pr_warn("%s: cpu%d: Required vector length(s) missing\n",
                        info->name, smp_processor_id());
                return -EINVAL;
        }

        /* The remaining checks only matter when KVM can be used. */
        if (!IS_ENABLED(CONFIG_KVM) || !is_hyp_mode_available())
                return 0;

        /*
         * For KVM, it is necessary to ensure that this CPU doesn't
         * support any vector length that guests may have probed as
         * unsupported.
         */

        /* Recover the set of supported VQs: */
        bitmap_complement(tmp_map, tmp_map, SVE_VQ_MAX);
        /* Find VQs supported that are not globally supported: */
        bitmap_andnot(tmp_map, tmp_map, info->vq_map, SVE_VQ_MAX);

        /* Find the lowest such VQ, if any: */
        b = find_last_bit(tmp_map, SVE_VQ_MAX);
        if (b >= SVE_VQ_MAX)
                return 0; /* no mismatches */

        /*
         * Mismatches above sve_max_virtualisable_vl are fine, since
         * no guest is allowed to configure ZCR_EL2.LEN to exceed this:
         */
        if (sve_vl_from_vq(__bit_to_vq(b)) <= info->max_virtualisable_vl) {
                pr_warn("%s: cpu%d: Unsupported vector length(s) present\n",
                        info->name, smp_processor_id());
                return -EINVAL;
        }

        return 0;
}

/*
 * Allocate efi_sve_state, the per-CPU buffer named in the panic message
 * below as being for EFI SVE save/restore.
 *
 * Does nothing when CONFIG_EFI is disabled.  The buffer is sized for the
 * largest max_vl found across all vl_info[] entries.  If that vector
 * length is not valid, or the allocation fails, the kernel panics.
 */
static void __init sve_efi_setup(void)
{
        int largest_vl = 0;
        int idx;

        if (!IS_ENABLED(CONFIG_EFI))
                return;

        for (idx = 0; idx < ARRAY_SIZE(vl_info); idx++)
                largest_vl = max(vl_info[idx].max_vl, largest_vl);

        /*
         * alloc_percpu() warns and prints a backtrace if this goes wrong.
         * This is evidence of a crippled system and we are returning void,
         * so the only handling of failure is the panic below.
         */
        if (sve_vl_valid(largest_vl)) {
                efi_sve_state = __alloc_percpu(
                        SVE_SIG_REGS_SIZE(sve_vq_from_vl(largest_vl)),
                        SVE_VQ_BYTES);
                if (efi_sve_state)
                        return;
        }

        panic("Cannot allocate percpu memory for EFI SVE save/restore");
}

/*
 * Enable SVE on the calling CPU.  The capability argument is unused.
 * (Presumably registered as a CPU feature enable callback; confirm at
 * the capability table.)
 */
void cpu_enable_sve(const struct arm64_cpu_capabilities *__always_unused p)
{
        /* Set CPACR_EL1_ZEN_EL1EN and synchronise before touching ZCR_EL1. */
        write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_ZEN_EL1EN, CPACR_EL1);
        isb();

        /* Start with every ZCR_EL1 bit cleared. */
        write_sysreg_s(0, SYS_ZCR_EL1);
}

/*
 * Boot-time finalisation of the SVE vector length configuration.
 *
 * Does nothing unless system_supports_sve().  Otherwise, working from the
 * VQ maps in vl_info[ARM64_VEC_SVE], it derives max_vl, the default VL
 * and max_virtualisable_vl, logs the results, and finally calls
 * sve_efi_setup().
 */
void __init sve_setup(void)
{
        struct vl_info *info = &vl_info[ARM64_VEC_SVE];
        DECLARE_BITMAP(tmp_map, SVE_VQ_MAX);
        unsigned long b;
        int max_bit;

        if (!system_supports_sve())
                return;

        /*
         * The SVE architecture mandates support for 128-bit vectors,
         * so sve_vq_map must have at least SVE_VQ_MIN set.
         * If something went wrong, at least try to patch it up:
         */
        if (WARN_ON(!test_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map)))
                set_bit(__vq_to_bit(SVE_VQ_MIN), info->vq_map);

        /* The first set bit of vq_map is taken as the largest supported VQ. */
        max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX);
        info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit));

        /*
         * For the default VL, pick the maximum supported value <= 64.
         * VL == 64 is guaranteed not to grow the signal frame.
         */
        set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, 64));

        /* tmp_map = VQs present in vq_partial_map but absent from vq_map. */
        bitmap_andnot(tmp_map, info->vq_partial_map, info->vq_map,
                      SVE_VQ_MAX);

        /*
         * NOTE(review): the first two branches below store SVE_VQ_MAX and
         * SVE_VQ_MIN, which are VQ counts, in max_virtualisable_vl, while
         * the third branch and the clamp that follows treat the field as
         * a vector length in bytes.  Confirm this is intentional.
         */
        b = find_last_bit(tmp_map, SVE_VQ_MAX);
        if (b >= SVE_VQ_MAX)
                /* No non-virtualisable VLs found */
                info->max_virtualisable_vl = SVE_VQ_MAX;
        else if (WARN_ON(b == SVE_VQ_MAX - 1))
                /* No virtualisable VLs?  This is architecturally forbidden. */
                info->max_virtualisable_vl = SVE_VQ_MIN;
        else /* b + 1 < SVE_VQ_MAX */
                info->max_virtualisable_vl = sve_vl_from_vq(__bit_to_vq(b + 1));

        /* Never report more as virtualisable than is available at all. */
        if (info->max_virtualisable_vl > info->max_vl)
                info->max_virtualisable_vl = info->max_vl;

        pr_info("%s: maximum available vector length %u bytes per vector\n",
                info->name, info->max_vl);
        pr_info("%s: default vector length %u bytes per vector\n",
                info->name, get_sve_default_vl());

        /* KVM decides whether to support mismatched systems. Just warn here: */
        if (sve_max_virtualisable_vl() < sve_max_vl())
                pr_warn("%s: unvirtualisable vector lengths present\n",
                        info->name);

        sve_efi_setup();
}

/*
 * Release the vector state buffers owned by a dead task.
 *
 * Called from the put_task_struct() path, which cannot get here
 * unless dead_task is really dead and not schedulable.
 */
void fpsimd_release_task(struct task_struct *dead_task)
{
        /* SVE state goes through __sve_free(), SME state through sme_free(). */
        __sve_free(dead_task);
        sme_free(dead_task);
}

#endif /* CONFIG_ARM64_SVE */

#ifdef CONFIG_ARM64_SME

/*
 * Make sure task->thread.sme_state is allocated and sufficiently large.
 *
 * This function should be used only in preparation for replacing
 * task->thread.sme_state with new data.  A newly allocated buffer is
 * always zeroed; an existing buffer is zeroed only when @flush is true.
 * Zeroing prevents stale data from showing through: this is done in
 * the interest of testability and predictability, the architecture
 * guarantees that when ZA is enabled it will be zeroed.
 *
 * On allocation failure task->thread.sme_state is left NULL; callers
 * must check it.
 */
void sme_alloc(struct task_struct *task, bool flush)
{
        if (!task->thread.sme_state) {
                /* This could potentially be up to 64K. */
                task->thread.sme_state =
                        kzalloc(sme_state_size(task), GFP_KERNEL);
                return;
        }

        if (flush)
                memset(task->thread.sme_state, 0, sme_state_size(task));
}

/*
 * Free the task's SME state buffer and clear the pointer, so that a later
 * sme_alloc() sees no buffer and allocates a fresh one.
 */
static void sme_free(struct task_struct *task)
{
        kfree(task->thread.sme_state);
        task->thread.sme_state = NULL;
}

/*
 * Enable SME on the calling CPU.  The capability argument is unused.
 * (Presumably registered as a CPU feature enable callback; confirm at
 * the capability table.)
 *
 * In order: clears the SMPRI_EL1 priority field, sets
 * CPACR_EL1_SMEN_EL1EN, zeroes SMCR_EL1, and sets SCTLR_ELx_ENTP2 in
 * SCTLR_EL1, with an ISB after each of the CPACR_EL1 and SCTLR_EL1 writes.
 */
void cpu_enable_sme(const struct arm64_cpu_capabilities *__always_unused p)
{
        /* Set priority for all PEs to architecturally defined minimum */
        write_sysreg_s(read_sysreg_s(SYS_SMPRI_EL1) & ~SMPRI_EL1_PRIORITY_MASK,
                       SYS_SMPRI_EL1);

        /* Allow SME in kernel */
        write_sysreg(read_sysreg(CPACR_EL1) | CPACR_EL1_SMEN_EL1EN, CPACR_EL1);
        isb();

        /* Ensure all bits in SMCR are set to known values */
        write_sysreg_s(0, SYS_SMCR_EL1);

        /* Allow EL0 to access TPIDR2 */
        write_sysreg(read_sysreg(SCTLR_EL1) | SCTLR_ELx_ENTP2, SCTLR_EL1);
        isb();
}

/*
 * Enable SME2 on the calling CPU by setting SMCR_ELx_EZT0_MASK in
 * SMCR_EL1.  The capability argument is unused.
 */
void cpu_enable_sme2(const struct arm64_cpu_capabilities *__always_unused p)
{
        u64 smcr;

        /* This must be enabled after SME */
        BUILD_BUG_ON(ARM64_SME2 <= ARM64_SME);

        /* Allow use of ZT0 */
        smcr = read_sysreg_s(SYS_SMCR_EL1);
        smcr |= SMCR_ELx_EZT0_MASK;
        write_sysreg_s(smcr, SYS_SMCR_EL1);
}

/*
 * Enable FA64 on the calling CPU by setting SMCR_ELx_FA64_MASK in
 * SMCR_EL1.  The capability argument is unused.
 */
void cpu_enable_fa64(const struct arm64_cpu_capabilities *__always_unused p)
{
        u64 smcr;

        /* This must be enabled after SME */
        BUILD_BUG_ON(ARM64_SME_FA64 <= ARM64_SME);

        /* Allow use of FA64 */
        smcr = read_sysreg_s(SYS_SMCR_EL1);
        smcr |= SMCR_ELx_FA64_MASK;
        write_sysreg_s(smcr, SYS_SMCR_EL1);
}

/*
 * Boot-time finalisation of the SME vector length configuration.
 *
 * Does nothing unless system_supports_sme().  Otherwise it derives min_vl
 * and max_vl from vl_info[ARM64_VEC_SME].vq_map, sets the default SME
 * vector length, and logs all three values.
 */
void __init sme_setup(void)
{
        struct vl_info *info = &vl_info[ARM64_VEC_SME];
        int min_bit, max_bit;

        if (!system_supports_sme())
                return;

        /*
         * SME doesn't require any particular vector length be
         * supported but it does require at least one.  We should have
         * disabled the feature entirely while bringing up CPUs but
         * let's double check here.  The bitmap is SVE_VQ_MAX sized for
         * sharing with SVE.
         */
        WARN_ON(bitmap_empty(info->vq_map, SVE_VQ_MAX));

        /* The last set bit is taken as the smallest VQ, the first as the largest. */
        min_bit = find_last_bit(info->vq_map, SVE_VQ_MAX);
        info->min_vl = sve_vl_from_vq(__bit_to_vq(min_bit));

        max_bit = find_first_bit(info->vq_map, SVE_VQ_MAX);
        info->max_vl = sve_vl_from_vq(__bit_to_vq(max_bit));

        WARN_ON(info->min_vl > info->max_vl);

        /*
         * For the default VL, pick the maximum supported value <= 32
         * (256 bits) if there is one since this is guaranteed not to
         * grow the signal frame when in streaming mode, otherwise the
         * minimum available VL will be used.
         */
        set_sme_default_vl(find_supported_vector_length(ARM64_VEC_SME, 32));

        pr_info("SME: minimum available vector length %u bytes per vector\n",
                info->min_vl);
        pr_info("SME: maximum available vector length %u bytes per vector\n",
                info->max_vl);
        pr_info("SME: default vector length %u bytes per vector\n",
                get_sme_default_vl());
}

/*
 * Reprogram SMCR_EL1 and SMPRI_EL1 on the calling CPU (going by the name,
 * on the way out of suspend; confirm at the call site).
 *
 * Does nothing unless system_supports_sme().  SMCR_EL1 is written with
 * SMCR_ELx_FA64 and/or SMCR_ELx_EZT0 set according to
 * system_supports_fa64() and system_supports_sme2(), with all other bits
 * clear; SMPRI_EL1 is written as zero.
 */
void sme_suspend_exit(void)
{
        u64 smcr;

        if (!system_supports_sme())
                return;

        smcr = system_supports_fa64() ? SMCR_ELx_FA64 : 0;
        if (system_supports_sme2())
                smcr |= SMCR_ELx_EZT0;

        write_sysreg_s(smcr, SYS_SMCR_EL1);
        write_sysreg_s(0, SYS_SMPRI_EL1);
}

#endif /* CONFIG_ARM64_SME */

/*
 * Set up current's SVE register state from its FPSIMD state after an SVE
 * access trap.  do_sve_acc() calls this between get_cpu_fpsimd_context()
 * and put_cpu_fpsimd_context(), after it has set TIF_SVE.
 */
static void sve_init_regs(void)
{
        /*
         * Convert the FPSIMD state to SVE, zeroing all the state that
         * is not shared with FPSIMD. If (as is likely) the current
         * state is live in the registers then do this there and
         * update our metadata for the current task including
         * disabling the trap, otherwise update our in-memory copy.
         * We are guaranteed to not be in streaming mode, we can only
         * take a SVE trap when not in streaming mode and we can't be
         * in streaming mode when taking a SME trap.
         */
        if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
                /* Live in registers: set the VQ, flush, and rebind. */
                unsigned long vq_minus_one =
                        sve_vq_from_vl(task_get_sve_vl(current)) - 1;
                sve_set_vq(vq_minus_one);
                sve_flush_live(true, vq_minus_one);
                fpsimd_bind_task_to_cpu();
        } else {
                /* Not live: convert the saved copy and mark it as SVE. */
                fpsimd_to_sve(current);
                current->thread.fp_type = FP_STATE_SVE;
                fpsimd_flush_task_state(current);
        }
}

/*
 * Trapped SVE access
 *
 * Storage is allocated for the full SVE state, the current FPSIMD
 * register contents are migrated across, and the access trap is
 * disabled.
 *
 * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state()
 * would have disabled the SVE access trap for userspace during
 * ret_to_user, making an SVE access trap impossible in that case.
 *
 * @esr is not used here; @regs supplies the PC reported with SIGILL.
 * The task gets SIGILL if SVE is unsupported or it is a compat task, and
 * SIGKILL if the SVE state buffer cannot be allocated.
 */
void do_sve_acc(unsigned long esr, struct pt_regs *regs)
{
        /* Even if we chose not to use SVE, the hardware could still trap: */
        if (unlikely(!system_supports_sve()) || WARN_ON(is_compat_task())) {
                force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
                return;
        }

        /* Allocate the buffer, or zero an existing one (flush == true). */
        sve_alloc(current, true);
        if (!current->thread.sve_state) {
                force_sig(SIGKILL);
                return;
        }

        get_cpu_fpsimd_context();

        if (test_and_set_thread_flag(TIF_SVE))
                WARN_ON(1); /* SVE access shouldn't have trapped */

        /*
         * Even if the task can have used streaming mode we can only
         * generate SVE access traps in normal SVE mode and
         * transitioning out of streaming mode may discard any
         * streaming mode state.  Always clear the high bits to avoid
         * any potential errors tracking what is properly initialised.
         */
        sve_init_regs();

        put_cpu_fpsimd_context();
}

/*
 * Trapped SME access
 *
 * Storage is allocated for the full SVE and SME state, the current
 * FPSIMD register contents are migrated to SVE if SVE is not already
 * active, and the access trap is disabled.
 *
 * TIF_SME should be clear on entry: otherwise, fpsimd_restore_current_state()
 * would have disabled the SME access trap for userspace during
 * ret_to_user, making an SME access trap impossible in that case.
 *
 * @esr is checked to confirm the trap was caused by SME being disabled;
 * @regs supplies the PC reported with SIGILL.  The task gets SIGILL if
 * SME is unsupported, it is a compat task, or the trap has another cause,
 * and SIGKILL if either state buffer cannot be allocated.
 */
void do_sme_acc(unsigned long esr, struct pt_regs *regs)
{
        /* Even if we chose not to use SME, the hardware could still trap: */
        if (unlikely(!system_supports_sme()) || WARN_ON(is_compat_task())) {
                force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
                return;
        }

        /*
         * If this not a trap due to SME being disabled then something
         * is being used in the wrong mode, report as SIGILL.
         */
        if (ESR_ELx_ISS(esr) != ESR_ELx_SME_ISS_SME_DISABLED) {
                force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
                return;
        }

        /* An existing SVE buffer is left as is; an SME buffer is zeroed. */
        sve_alloc(current, false);
        sme_alloc(current, true);
        if (!current->thread.sve_state || !current->thread.sme_state) {
                force_sig(SIGKILL);
                return;
        }

        get_cpu_fpsimd_context();

        /* With TIF_SME userspace shouldn't generate any traps */
        if (test_and_set_thread_flag(TIF_SME))
                WARN_ON(1);

        /* Only touch the hardware if current's state is live on this CPU. */
        if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) {
                unsigned long vq_minus_one =
                        sve_vq_from_vl(task_get_sme_vl(current)) - 1;
                sme_set_vq(vq_minus_one);

                fpsimd_bind_task_to_cpu();
        }

        put_cpu_fpsimd_context();
}

/*
 * Trapped FP/ASIMD access.
 *
 * Sends SIGILL to the current task when FPSIMD is not supported by the
 * system; in any other case the trap is treated as a kernel bug.
 */
void do_fpsimd_acc(unsigned long esr, struct pt_regs *regs)
{
        if (system_supports_fpsimd()) {
                /*
                 * When FPSIMD is enabled, we should never take a trap
                 * unless something has gone very wrong.
                 */
                BUG();
        }

        /* Even if we chose not to use FPSIMD, the hardware could still trap: */
        force_signal_inject(SIGILL, ILL_ILLOPC, regs->pc, 0);
}

/*
 * Raise a SIGFPE for the current process.
 *
 * The si_code defaults to FPE_FLTUNK.  When ESR_ELx_FP_EXC_TFV is set in
 * @esr, it is replaced by the code for the first exception flag found
 * set, checked in the order IOF, DZF, OFF, UFF, IXF.  The fault address
 * reported is the instruction pointer from @regs.
 */
void do_fpsimd_exc(unsigned long esr, struct pt_regs *regs)
{
        /* Ordered by priority: the first matching flag wins. */
        static const struct {
                unsigned long flag;
                unsigned int code;
        } fpexc_codes[] = {
                { FPEXC_IOF, FPE_FLTINV },
                { FPEXC_DZF, FPE_FLTDIV },
                { FPEXC_OFF, FPE_FLTOVF },
                { FPEXC_UFF, FPE_FLTUND },
                { FPEXC_IXF, FPE_FLTRES },
        };
        unsigned int si_code = FPE_FLTUNK;
        unsigned int i;

        if (esr & ESR_ELx_FP_EXC_TFV) {
                for (i = 0; i < ARRAY_SIZE(fpexc_codes); i++) {
                        if (esr & fpexc_codes[i].flag) {
                                si_code = fpexc_codes[i].code;
                                break;
                        }
                }
        }

        send_sig_fault(SIGFPE, si_code,
                       (void __user *)instruction_pointer(regs),
                       current);
}

/*
 * Load @task's kernel mode FPSIMD state into the registers, unless this
 * CPU already holds it.
 */
static void fpsimd_load_kernel_state(struct task_struct *task)
{
        struct cpu_fp_state *cpu_last = this_cpu_ptr(&fpsimd_last_state);

        /*
         * The load is only needed if this CPU does not hold the most
         * recent kernel mode FPSIMD context of the task.
         */
        if (cpu_last->st != &task->thread.kernel_fpsimd_state ||
            task->thread.kernel_fpsimd_cpu != smp_processor_id())
                fpsimd_load_state(&task->thread.kernel_fpsimd_state);
}

/*
 * Save the registers into @task's kernel mode FPSIMD state and record
 * this CPU as the holder of that state, so that a later
 * fpsimd_load_kernel_state() on the same CPU can skip the reload.
 */
static void fpsimd_save_kernel_state(struct task_struct *task)
{
        /* Descriptor installed as this CPU's last-loaded state below. */
        struct cpu_fp_state cpu_fp_state = {
                .st                = &task->thread.kernel_fpsimd_state,
                .to_save        = FP_STATE_FPSIMD,
        };

        fpsimd_save_state(&task->thread.kernel_fpsimd_state);
        fpsimd_bind_state_to_cpu(&cpu_fp_state);

        task->thread.kernel_fpsimd_cpu = smp_processor_id();
}

/*
 * Invalidate any task's FPSIMD state that is present on this cpu.
 * The FPSIMD context should be acquired with get_cpu_fpsimd_context()
 * before calling this function.
 *
 * Clears this CPU's fpsimd_last_state.st, stops SME if the system
 * supports it, and sets TIF_FOREIGN_FPSTATE on the current thread.
 */
static void fpsimd_flush_cpu_state(void)
{
        WARN_ON(!system_supports_fpsimd());
        __this_cpu_write(fpsimd_last_state.st, NULL);

        /*
         * Leaving streaming mode enabled will cause issues for any kernel
         * NEON and leaving streaming mode or ZA enabled may increase power
         * consumption.
         */
        if (system_supports_sme())
                sme_smstop();

        set_thread_flag(TIF_FOREIGN_FPSTATE);
}

/*
 * FP/SIMD part of switching from current to @next.
 *
 * Does nothing if the system has no FPSIMD support.  Interrupts are
 * expected to be disabled (warned about once otherwise).  The outgoing
 * task's kernel or user state is saved; then either @next's kernel mode
 * state is loaded, or @next's TIF_FOREIGN_FPSTATE is recomputed.
 */
void fpsimd_thread_switch(struct task_struct *next)
{
        bool wrong_task, wrong_cpu;

        if (!system_supports_fpsimd())
                return;

        WARN_ON_ONCE(!irqs_disabled());

        /* Save unsaved fpsimd state, if any: */
        if (test_thread_flag(TIF_KERNEL_FPSTATE))
                fpsimd_save_kernel_state(current);
        else
                fpsimd_save_user_state();

        if (test_tsk_thread_flag(next, TIF_KERNEL_FPSTATE)) {
                /* Load next's kernel state, then invalidate the CPU's view. */
                fpsimd_load_kernel_state(next);
                fpsimd_flush_cpu_state();
        } else {
                /*
                 * Fix up TIF_FOREIGN_FPSTATE to correctly describe next's
                 * state.  For kernel threads, FPSIMD registers are never
                 * loaded with user mode FPSIMD state and so wrong_task and
                 * wrong_cpu will always be true.
                 */
                wrong_task = __this_cpu_read(fpsimd_last_state.st) !=
                        &next->thread.uw.fpsimd_state;
                wrong_cpu = next->thread.fpsimd_cpu != smp_processor_id();

                update_tsk_thread_flag(next, TIF_FOREIGN_FPSTATE,
                                       wrong_task || wrong_cpu);
        }
}

/*
 * Reset current's vector length for @type.
 *
 * The new length is the task's on-exec length if one is set, otherwise
 * the default for @type.  An invalid or unsupported value triggers a
 * warning and is replaced.  Unless the inherit flag for @type is set,
 * the on-exec length is then cleared.
 */
static void fpsimd_flush_thread_vl(enum vec_type type)
{
        int vl, supported_vl;

        /*
         * Reset the task vector length as required.  This is where we
         * ensure that all user tasks have a valid vector length
         * configured: no kernel task can become a user task without
         * an exec and hence a call to this function.  By the time the
         * first call to this function is made, all early hardware
         * probing is complete, so __sve_default_vl should be valid.
         * If a bug causes this to go wrong, we make some noise and
         * try to fudge thread.sve_vl to a safe value here.
         */
        vl = task_get_vl_onexec(current, type);
        if (!vl)
                vl = get_default_vl(type);

        /* Fall back to the type's minimum VL if the value is not valid. */
        if (WARN_ON(!sve_vl_valid(vl)))
                vl = vl_info[type].min_vl;

        /* Fall back to the supported length found if it differs. */
        supported_vl = find_supported_vector_length(type, vl);
        if (WARN_ON(supported_vl != vl))
                vl = supported_vl;

        task_set_vl(current, type, vl);

        /*
         * If the task is not set to inherit, ensure that the vector
         * length will be reset by a subsequent exec:
         */
        if (!test_thread_flag(vec_vl_inherit_flag(type)))
                task_set_vl_onexec(current, type, 0);
}

/*
 * Reset current's FP/SIMD, SVE and SME state to a clean FPSIMD-only state.
 *
 * Does nothing if the system has no FPSIMD support.  The FPSIMD registers
 * in memory are zeroed, TIF_SVE and TIF_SME are cleared, the vector
 * lengths are reset via fpsimd_flush_thread_vl(), and the SVE and SME
 * buffers are detached and freed.
 */
void fpsimd_flush_thread(void)
{
        /* Buffers detached under the context and freed after it is dropped. */
        void *sve_state = NULL;
        void *sme_state = NULL;

        if (!system_supports_fpsimd())
                return;

        get_cpu_fpsimd_context();

        fpsimd_flush_task_state(current);
        memset(&current->thread.uw.fpsimd_state, 0,
               sizeof(current->thread.uw.fpsimd_state));

        if (system_supports_sve()) {
                clear_thread_flag(TIF_SVE);

                /* Defer kfree() while in atomic context */
                sve_state = current->thread.sve_state;
                current->thread.sve_state = NULL;

                fpsimd_flush_thread_vl(ARM64_VEC_SVE);
        }

        if (system_supports_sme()) {
                clear_thread_flag(TIF_SME);

                /* Defer kfree() while in atomic context */
                sme_state = current->thread.sme_state;
                current->thread.sme_state = NULL;

                fpsimd_flush_thread_vl(ARM64_VEC_SME);
                current->thread.svcr = 0;
        }

        current->thread.fp_type = FP_STATE_FPSIMD;

        put_cpu_fpsimd_context();
        /* kfree(NULL) is a no-op, so no checks are needed here. */
        kfree(sve_state);
        kfree(sme_state);
}

/*
 * Save the userland FPSIMD state of 'current' to memory, but only if the
 * state currently held in the registers does in fact belong to 'current'.
 *
 * Does nothing on systems without FPSIMD support.  The save is done with
 * the CPU FPSIMD context held.
 */
void fpsimd_preserve_current_state(void)
{
        if (system_supports_fpsimd()) {
                get_cpu_fpsimd_context();
                fpsimd_save_user_state();
                put_cpu_fpsimd_context();
        }
}

/*
 * Like fpsimd_preserve_current_state(), but ensure that
 * current->thread.uw.fpsimd_state is updated so that it can be copied to
 * the signal frame.
 *
 * After the save, the state is converted with sve_to_fpsimd() if it was
 * recorded as FP_STATE_SVE.
 */
void fpsimd_signal_preserve_current_state(void)
{
        fpsimd_preserve_current_state();

        if (current->thread.fp_type != FP_STATE_SVE)
                return;

        sve_to_fpsimd(current);
}

/*
 * Associate current's FPSIMD context with this cpu
 * The caller must have ownership of the cpu FPSIMD context before calling
 * this function.
 *
 * This CPU's fpsimd_last_state is filled in from current's thread state,
 * current->thread.fpsimd_cpu is set to this CPU, and the userspace SVE and
 * SME enables are set to match TIF_SVE and TIF_SME.
 */
static void fpsimd_bind_task_to_cpu(void)
{
        struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state);

        WARN_ON(!system_supports_fpsimd());
        last->st = &current->thread.uw.fpsimd_state;
        last->sve_state = current->thread.sve_state;
        last->sme_state = current->thread.sme_state;
        last->sve_vl = task_get_sve_vl(current);
        last->sme_vl = task_get_sme_vl(current);
        last->svcr = &current->thread.svcr;
        last->fpmr = &current->thread.uw.fpmr;
        last->fp_type = &current->thread.fp_type;
        last->to_save = FP_STATE_CURRENT;
        current->thread.fpsimd_cpu = smp_processor_id();

        /*
         * Toggle SVE and SME trapping for userspace if needed, these
         * are serialised by ret_to_user().
         */
        if (system_supports_sme()) {
                if (test_thread_flag(TIF_SME))
                        sme_user_enable();
                else
                        sme_user_disable();
        }

        if (system_supports_sve()) {
                if (test_thread_flag(TIF_SVE))
                        sve_user_enable();
                else
                        sve_user_disable();
        }
}

/*
 * Install *@state as this CPU's fpsimd_last_state record.
 *
 * The structure is copied by value.  A warning is raised if FPSIMD is not
 * supported, or if the caller is neither in softirq context nor running
 * with interrupts disabled.
 */
void fpsimd_bind_state_to_cpu(struct cpu_fp_state *state)
{
        struct cpu_fp_state *last = this_cpu_ptr(&fpsimd_last_state);

        WARN_ON(!system_supports_fpsimd());
        WARN_ON(!in_softirq() && !irqs_disabled());

        *last = *state;
}

/*
 * Load the userland FPSIMD state of 'current' from memory, but only if the
 * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
 * state of 'current'.  This is called when we are preparing to return to
 * userspace to ensure that userspace sees a good register state.
 *
 * On systems without FPSIMD support the only effect is to clear
 * TIF_FOREIGN_FPSTATE.
 */
void fpsimd_restore_current_state(void)
{
        /*
         * TIF_FOREIGN_FPSTATE is set on the init task and copied by
         * arch_dup_task_struct() regardless of whether FP/SIMD is detected.
         * Thus user threads can have this set even when FP/SIMD hasn't been
         * detected.
         *
         * When FP/SIMD is detected, begin_new_exec() will set
         * TIF_FOREIGN_FPSTATE via flush_thread() -> fpsimd_flush_thread(),
         * and fpsimd_thread_switch() will set TIF_FOREIGN_FPSTATE when
         * switching tasks. We detect FP/SIMD before we exec the first user
         * process, ensuring this has TIF_FOREIGN_FPSTATE set and
         * do_notify_resume() will call fpsimd_restore_current_state() to
         * install the user FP/SIMD context.
         *
         * When FP/SIMD is not detected, nothing else will clear or set
         * TIF_FOREIGN_FPSTATE prior to the first return to userspace, and
         * we must clear TIF_FOREIGN_FPSTATE to avoid do_notify_resume()
         * looping forever calling fpsimd_restore_current_state().
         */
        if (!system_supports_fpsimd()) {
                clear_thread_flag(TIF_FOREIGN_FPSTATE);
                return;
        }

        get_cpu_fpsimd_context();

        /* Reload and rebind only if the flag was set; it is cleared here. */
        if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) {
                task_fpsimd_load();
                fpsimd_bind_task_to_cpu();
        }

        put_cpu_fpsimd_context();
}

/*
 * Load an updated userland FPSIMD state for 'current' from memory and set the
 * flag that indicates that the FPSIMD register contents are the most recent
 * FPSIMD state of 'current'. This is used by the signal code to restore the
 * register state when returning from a signal handler in FPSIMD only cases,
 * any SVE context will be discarded.
 *
 * @state is copied into current->thread.uw.fpsimd_state.  The function
 * warns and returns without doing anything if FPSIMD is not supported.
 */
void fpsimd_update_current_state(struct user_fpsimd_state const *state)
{
        if (WARN_ON(!system_supports_fpsimd()))
                return;

        get_cpu_fpsimd_context();

        current->thread.uw.fpsimd_state = *state;
        /* With TIF_SVE set, also rebuild the SVE copy from the new values. */
        if (test_thread_flag(TIF_SVE))
                fpsimd_to_sve(current);

        task_fpsimd_load();
        fpsimd_bind_task_to_cpu();

        /* The registers now hold current's latest state. */
        clear_thread_flag(TIF_FOREIGN_FPSTATE);

        put_cpu_fpsimd_context();
}

/*
 * Invalidate live CPU copies of task t's FPSIMD state
 *
 * This function may be called with preemption enabled.  The first
 * barrier() ensures that the assignment to fpsimd_cpu is visible to any
 * preemption/softirq that could race with set_tsk_thread_flag(), so
 * that TIF_FOREIGN_FPSTATE cannot be spuriously re-cleared.
 *
 * The final barrier ensures that TIF_FOREIGN_FPSTATE is seen set by any
 * subsequent code.
 *
 * On systems without FPSIMD only fpsimd_cpu is reset; the thread flag is
 * left untouched.
 */
void fpsimd_flush_task_state(struct task_struct *t)
{
        /*
         * NR_CPUS is used as the "not loaded on any CPU" marker
         * (presumably no valid CPU number equals it).
         */
        t->thread.fpsimd_cpu = NR_CPUS;
        /*
         * If we don't support fpsimd, bail out once fpsimd_cpu has been
         * reset for this task; TIF_FOREIGN_FPSTATE is not modified on
         * this path.
         */
        if (!system_supports_fpsimd())
                return;
        /* Order the fpsimd_cpu store before the flag is set. */
        barrier();
        set_tsk_thread_flag(t, TIF_FOREIGN_FPSTATE);

        /* Keep later code from being reordered before the flag is set. */
        barrier();
}

/*
 * Save the FPSIMD state to memory and invalidate cpu view.
 * This function must be called with preemption disabled.
 */
void fpsimd_save_and_flush_cpu_state(void)
{
        unsigned long flags;

        if (!system_supports_fpsimd())
                return;
        WARN_ON(preemptible());
        local_irq_save(flags);
        fpsimd_save_user_state();
        fpsimd_flush_cpu_state();
        local_irq_restore(flags);
}

#ifdef CONFIG_KERNEL_MODE_NEON

/*
 * Kernel-side NEON support functions
 */

/*
 * kernel_neon_begin(): obtain the CPU FPSIMD registers for use by the calling
 * context
 *
 * Must not be called unless may_use_simd() returns true.
 * Task context in the FPSIMD registers is saved back to memory as necessary.
 *
 * A matching call to kernel_neon_end() must be made before returning from the
 * calling context.
 *
 * The caller may freely use the FPSIMD registers until kernel_neon_end() is
 * called.
 */
void kernel_neon_begin(void)
{
        /* Without FPSIMD there are no registers to hand out: warn and bail. */
        if (WARN_ON(!system_supports_fpsimd()))
                return;

        /* Enforce the documented precondition on the calling context. */
        BUG_ON(!may_use_simd());

        get_cpu_fpsimd_context();

        /* Save unsaved fpsimd state, if any: */
        if (test_thread_flag(TIF_KERNEL_FPSTATE)) {
                /*
                 * The flag is already set, so this is a nested use on top of
                 * a task-level kernel mode FPSIMD section.  That is only
                 * expected from softirq context on non-PREEMPT_RT; anything
                 * else is a bug.
                 */
                BUG_ON(IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq());
                fpsimd_save_kernel_state(current);
        } else {
                fpsimd_save_user_state();

                /*
                 * Set the thread flag so that the kernel mode FPSIMD state
                 * will be context switched along with the rest of the task
                 * state.
                 *
                 * On non-PREEMPT_RT, softirqs may interrupt task level kernel
                 * mode FPSIMD, but the task will not be preemptible so setting
                 * TIF_KERNEL_FPSTATE for those would be both wrong (as it
                 * would mark the task context FPSIMD state as requiring a
                 * context switch) and unnecessary.
                 *
                 * On PREEMPT_RT, softirqs are serviced from a separate thread,
                 * which is scheduled as usual, and this guarantees that these
                 * softirqs are not interrupting use of the FPSIMD in kernel
                 * mode in task context. So in this case, setting the flag here
                 * is always appropriate.
                 */
                if (IS_ENABLED(CONFIG_PREEMPT_RT) || !in_serving_softirq())
                        set_thread_flag(TIF_KERNEL_FPSTATE);
        }

        /* Invalidate any task state remaining in the fpsimd regs: */
        fpsimd_flush_cpu_state();

        put_cpu_fpsimd_context();
}
EXPORT_SYMBOL_GPL(kernel_neon_begin);

/*
 * kernel_neon_end(): give the CPU FPSIMD registers back to the current task
 *
 * Must be called from a context in which kernel_neon_begin() was previously
 * called, with no call to kernel_neon_end() in the meantime.
 *
 * The caller must not use the FPSIMD registers after this function is called,
 * unless kernel_neon_begin() is called again in the meantime.
 */
void kernel_neon_end(void)
{
        if (!system_supports_fpsimd())
                return;

        /*
         * If we are returning from a nested use of kernel mode FPSIMD, restore
         * the task context kernel mode FPSIMD state. This can only happen when
         * running in softirq context on non-PREEMPT_RT.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && in_serving_softirq() &&
            test_thread_flag(TIF_KERNEL_FPSTATE))
                fpsimd_load_kernel_state(current);
        else
                clear_thread_flag(TIF_KERNEL_FPSTATE);
}
EXPORT_SYMBOL_GPL(kernel_neon_end);

#ifdef CONFIG_EFI

static DEFINE_PER_CPU(struct user_fpsimd_state, efi_fpsimd_state);
static DEFINE_PER_CPU(bool, efi_fpsimd_state_used);
static DEFINE_PER_CPU(bool, efi_sve_state_used);
static DEFINE_PER_CPU(bool, efi_sm_state);

/*
 * EFI runtime services support functions
 *
 * The ABI for EFI runtime services allows EFI to use FPSIMD during the call.
 * This means that for EFI (and only for EFI), we have to assume that FPSIMD
 * is always used rather than being an optional accelerator.
 *
 * These functions provide the necessary support for ensuring FPSIMD
 * save/restore in the contexts from which EFI is used.
 *
 * Do not use them for any other purpose -- if tempted to do so, you are
 * either doing something wrong or you need to propose some refactoring.
 */

/*
 * __efi_fpsimd_begin(): prepare FPSIMD for making an EFI runtime services call
 *
 * If may_use_simd() allows it, the registers are claimed with
 * kernel_neon_begin().  Otherwise the live register contents are stashed in
 * the per-CPU efi_* buffers and efi_fpsimd_state_used is set so that
 * __efi_fpsimd_end() knows to restore them.
 *
 * Warns if called while preemptible.  Does nothing without FPSIMD support.
 */
void __efi_fpsimd_begin(void)
{
        if (!system_supports_fpsimd())
                return;

        WARN_ON(preemptible());

        if (may_use_simd()) {
                kernel_neon_begin();
        } else {
                /*
                 * If !efi_sve_state, SVE can't be in use yet and doesn't need
                 * preserving:
                 */
                if (system_supports_sve() && likely(efi_sve_state)) {
                        char *sve_state = this_cpu_ptr(efi_sve_state);
                        bool ffr = true;
                        u64 svcr;

                        /* Tell __efi_fpsimd_end() to take the SVE restore path. */
                        __this_cpu_write(efi_sve_state_used, true);

                        if (system_supports_sme()) {
                                svcr = read_sysreg_s(SYS_SVCR);

                                /* Remember streaming mode for __efi_fpsimd_end(). */
                                __this_cpu_write(efi_sm_state,
                                                 svcr & SVCR_SM_MASK);

                                /*
                                 * Unless we have FA64 FFR does not
                                 * exist in streaming mode.
                                 */
                                if (!system_supports_fa64())
                                        ffr = !(svcr & SVCR_SM_MASK);
                        }

                        sve_save_state(sve_state + sve_ffr_offset(sve_max_vl()),
                                       &this_cpu_ptr(&efi_fpsimd_state)->fpsr,
                                       ffr);

                        /* Clear SVCR.SM, i.e. leave streaming mode, for the call. */
                        if (system_supports_sme())
                                sysreg_clear_set_s(SYS_SVCR,
                                                   SVCR_SM_MASK, 0);

                } else {
                        fpsimd_save_state(this_cpu_ptr(&efi_fpsimd_state));
                }

                /* Per-CPU buffers now hold state that must be restored. */
                __this_cpu_write(efi_fpsimd_state_used, true);
        }
}

/*
 * __efi_fpsimd_end(): clean up FPSIMD after an EFI runtime services call
 *
 * Undoes __efi_fpsimd_begin(): if that function stashed register state in the
 * per-CPU buffers (efi_fpsimd_state_used was set) the state is loaded back,
 * otherwise kernel_neon_end() is called.  Does nothing without FPSIMD support.
 */
void __efi_fpsimd_end(void)
{
        if (!system_supports_fpsimd())
                return;

        /* Read and reset the "state was stashed" marker in one step. */
        if (!__this_cpu_xchg(efi_fpsimd_state_used, false)) {
                kernel_neon_end();
        } else {
                if (system_supports_sve() &&
                    likely(__this_cpu_read(efi_sve_state_used))) {
                        char const *sve_state = this_cpu_ptr(efi_sve_state);
                        bool ffr = true;

                        /*
                         * Restore streaming mode; EFI calls are
                         * normal function calls so should not return in
                         * streaming mode.
                         */
                        if (system_supports_sme()) {
                                if (__this_cpu_read(efi_sm_state)) {
                                        /* Set SVCR.SM again before loading. */
                                        sysreg_clear_set_s(SYS_SVCR,
                                                           0,
                                                           SVCR_SM_MASK);

                                        /*
                                         * Unless we have FA64 FFR does not
                                         * exist in streaming mode.
                                         */
                                        if (!system_supports_fa64())
                                                ffr = false;
                                }
                        }

                        sve_load_state(sve_state + sve_ffr_offset(sve_max_vl()),
                                       &this_cpu_ptr(&efi_fpsimd_state)->fpsr,
                                       ffr);

                        __this_cpu_write(efi_sve_state_used, false);
                } else {
                        fpsimd_load_state(this_cpu_ptr(&efi_fpsimd_state));
                }
        }
}

#endif /* CONFIG_EFI */

#endif /* CONFIG_KERNEL_MODE_NEON */

#ifdef CONFIG_CPU_PM
/*
 * CPU PM notifier callback: on CPU_PM_ENTER, save the FPSIMD state and
 * invalidate the CPU's view of it.  CPU_PM_EXIT needs no work.  Every other
 * command, including CPU_PM_ENTER_FAILED, is reported as not handled.
 */
static int fpsimd_cpu_pm_notifier(struct notifier_block *self,
                                  unsigned long cmd, void *v)
{
        if (cmd == CPU_PM_ENTER) {
                fpsimd_save_and_flush_cpu_state();
                return NOTIFY_OK;
        }

        if (cmd == CPU_PM_EXIT)
                return NOTIFY_OK;

        return NOTIFY_DONE;
}

/* Notifier block registered by fpsimd_pm_init(); dispatches to fpsimd_cpu_pm_notifier(). */
static struct notifier_block fpsimd_cpu_pm_notifier_block = {
        .notifier_call = fpsimd_cpu_pm_notifier,
};

/*
 * Register the FPSIMD CPU PM notifier.  The return value of
 * cpu_pm_register_notifier() is not checked.
 */
static void __init fpsimd_pm_init(void)
{
        cpu_pm_register_notifier(&fpsimd_cpu_pm_notifier_block);
}

#else
/* !CONFIG_CPU_PM: there is no notifier to register. */
static inline void fpsimd_pm_init(void) { }
#endif /* CONFIG_CPU_PM */

#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU hotplug "dead" callback: forget which FPSIMD state was last loaded on
 * @cpu by clearing its per-CPU fpsimd_last_state.st pointer.  Always
 * returns 0.
 */
static int fpsimd_cpu_dead(unsigned int cpu)
{
        per_cpu(fpsimd_last_state.st, cpu) = NULL;
        return 0;
}

/*
 * Install fpsimd_cpu_dead() as the CPUHP_ARM64_FPSIMD_DEAD teardown callback
 * (no startup callback).  The return value of cpuhp_setup_state_nocalls() is
 * not checked.
 */
static inline void fpsimd_hotplug_init(void)
{
        cpuhp_setup_state_nocalls(CPUHP_ARM64_FPSIMD_DEAD, "arm64/fpsimd:dead",
                                  NULL, fpsimd_cpu_dead);
}

#else
/* !CONFIG_HOTPLUG_CPU: no hotplug callback is needed. */
static inline void fpsimd_hotplug_init(void) { }
#endif

/*
 * Set the EL1 and EL0 FPEN enable bits in CPACR_EL1, keeping every other bit
 * of the register as it was, then issue an ISB.  The capability argument is
 * unused.
 */
void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__always_unused p)
{
        unsigned long cpacr = read_sysreg(CPACR_EL1);

        cpacr |= CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN;
        write_sysreg(cpacr, CPACR_EL1);
        isb();
}

/*
 * FP/SIMD support code initialisation.
 *
 * With FP present, hook up the CPU PM notifier and the CPU hotplug callback;
 * otherwise just report the missing feature.  A missing Advanced SIMD is
 * reported as well.  The SVE and SME sysctl setup always runs.  Always
 * returns 0.
 */
static int __init fpsimd_init(void)
{
        if (!cpu_have_named_feature(FP)) {
                pr_notice("Floating-point is not implemented\n");
        } else {
                fpsimd_pm_init();
                fpsimd_hotplug_init();
        }

        if (!cpu_have_named_feature(ASIMD))
                pr_notice("Advanced SIMD is not implemented\n");

        sve_sysctl_init();
        sme_sysctl_init();

        return 0;
}
core_initcall(fpsimd_init);



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>

struct blk_mq_tags;
struct blk_flush_queue;

/*
 * Minimum and default request counts.
 * NOTE(review): presumably the per-queue number of requests (queue depth);
 * confirm against the users of these constants.
 */
#define BLKDEV_MIN_RQ        4
#define BLKDEV_DEFAULT_RQ        128

/*
 * Return value of an rq_end_io_fn completion callback.
 * NOTE(review): presumably RQ_END_IO_FREE asks the caller to free the
 * request while RQ_END_IO_NONE asks for no further action - confirm against
 * the callers of ->end_io.
 */
enum rq_end_io_ret {
        RQ_END_IO_NONE = 0,
        RQ_END_IO_FREE = 1,
};

/*
 * Completion callback type: takes the request and a blk_status_t and returns
 * an enum rq_end_io_ret.  Stored in struct request as ->end_io and
 * ->flush.saved_end_io.
 */
typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * Request flags: type of struct request::rq_flags, holding RQF_* bits.
 */
typedef __u32 __bitwise req_flags_t;

/*
 * Bit positions of the request flags; the RQF_* masks are built from these
 * as (1 << __RQF_*).
 *
 * Keep rqf_name[] in sync with the definitions below.
 */
enum rqf_flags {
        /* drive already may have started this one */
        __RQF_STARTED,
        /* request for flush sequence */
        __RQF_FLUSH_SEQ,
        /* merge of different types, fail separately */
        __RQF_MIXED_MERGE,
        /* don't call prep for this one */
        __RQF_DONTPREP,
        /* use hctx->sched_tags */
        __RQF_SCHED_TAGS,
        /* use an I/O scheduler for this request */
        __RQF_USE_SCHED,
        /* vaguely specified driver internal error.  Ignored by block layer */
        __RQF_FAILED,
        /* don't warn about errors */
        __RQF_QUIET,
        /* account into disk and partition IO statistics */
        __RQF_IO_STAT,
        /* runtime pm request */
        __RQF_PM,
        /* on IO scheduler merge hash */
        __RQF_HASHED,
        /* track IO completion time */
        __RQF_STATS,
        /*
         * Look at ->special_vec for the actual data payload instead of the
         * bio chain.
         */
        __RQF_SPECIAL_PAYLOAD,
        /* request completion needs to be signaled to zone write plugging. */
        __RQF_ZONE_WRITE_PLUGGING,
        /* ->timeout has been called, don't expire again */
        __RQF_TIMED_OUT,
        /* NOTE(review): presumably a request using a reserved tag - confirm */
        __RQF_RESV,
        /* number of flag bits defined above; must remain the last entry */
        __RQF_BITS
};

/*
 * req_flags_t masks, one per enum rqf_flags bit position.  The __force cast
 * converts the plain integer to the __bitwise req_flags_t type.
 */
#define RQF_STARTED                ((__force req_flags_t)(1 << __RQF_STARTED))
#define RQF_FLUSH_SEQ                ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ))
#define RQF_MIXED_MERGE                ((__force req_flags_t)(1 << __RQF_MIXED_MERGE))
#define RQF_DONTPREP                ((__force req_flags_t)(1 << __RQF_DONTPREP))
#define RQF_SCHED_TAGS                ((__force req_flags_t)(1 << __RQF_SCHED_TAGS))
#define RQF_USE_SCHED                ((__force req_flags_t)(1 << __RQF_USE_SCHED))
#define RQF_FAILED                ((__force req_flags_t)(1 << __RQF_FAILED))
#define RQF_QUIET                ((__force req_flags_t)(1 << __RQF_QUIET))
#define RQF_IO_STAT                ((__force req_flags_t)(1 << __RQF_IO_STAT))
#define RQF_PM                        ((__force req_flags_t)(1 << __RQF_PM))
#define RQF_HASHED                ((__force req_flags_t)(1 << __RQF_HASHED))
#define RQF_STATS                ((__force req_flags_t)(1 << __RQF_STATS))
#define RQF_SPECIAL_PAYLOAD        \
                        ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD))
#define RQF_ZONE_WRITE_PLUGGING        \
                        ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING))
#define RQF_TIMED_OUT                ((__force req_flags_t)(1 << __RQF_TIMED_OUT))
#define RQF_RESV                ((__force req_flags_t)(1 << __RQF_RESV))

/* flags that prevent us from merging requests: */
#define RQF_NOMERGE_FLAGS \
        (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)

/*
 * Request state as stored in struct request::state.
 * NOTE(review): presumably a request moves idle -> in flight -> complete;
 * confirm the exact transitions against the blk-mq core.
 */
enum mq_rq_state {
        MQ_RQ_IDLE                = 0,
        MQ_RQ_IN_FLIGHT                = 1,
        MQ_RQ_COMPLETE                = 2,
};

/*
 * Try to put the fields that are referenced together in the same cacheline.
 *
 * If you modify this structure, make sure to update blk_rq_init() and
 * especially blk_mq_rq_ctx_init() to take care of the added fields.
 */
struct request {
        struct request_queue *q;
        struct blk_mq_ctx *mq_ctx;
        struct blk_mq_hw_ctx *mq_hctx;

        blk_opf_t cmd_flags;                /* op and common flags */
        req_flags_t rq_flags;                /* RQF_* flags */

        int tag;
        int internal_tag;

        unsigned int timeout;

        /* the following two fields are internal, NEVER access directly */
        unsigned int __data_len;        /* total data len */
        sector_t __sector;                /* sector cursor */

        struct bio *bio;
        struct bio *biotail;

        /*
         * List linkage.  rq_next is the link used by the singly linked
         * struct rq_list helpers (rq_list_add_tail(), rq_list_pop(), ...);
         * it shares storage with queuelist, so only one of the two can be
         * in use at a time.
         */
        union {
                struct list_head queuelist;
                struct request *rq_next;
        };

        struct block_device *part;
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
        /* Time that the first bio started allocating this request. */
        u64 alloc_time_ns;
#endif
        /* Time that this request was allocated for this IO. */
        u64 start_time_ns;
        /* Time that I/O was submitted to the device. */
        u64 io_start_time_ns;

#ifdef CONFIG_BLK_WBT
        unsigned short wbt_flags;
#endif
        /*
         * rq sectors used for blk stats. It has the same value as
         * blk_rq_sectors(rq), except that it is never zeroed by
         * completion.
         */
        unsigned short stats_sectors;

        /*
         * Number of scatter-gather DMA addr+len pairs after
         * physical address coalescing is performed.
         */
        unsigned short nr_phys_segments;
        unsigned short nr_integrity_segments;

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
        struct bio_crypt_ctx *crypt_ctx;
        struct blk_crypto_keyslot *crypt_keyslot;
#endif

        enum mq_rq_state state;
        atomic_t ref;

        unsigned long deadline;

        /*
         * The hash is used inside the scheduler, and killed once the
         * request reaches the dispatch list. The ipi_list is only used
         * to queue the request for softirq completion, which is long
         * after the request has been unhashed (and even removed from
         * the dispatch list).
         */
        union {
                struct hlist_node hash;        /* merge hash */
                struct llist_node ipi_list;
        };

        /*
         * The rb_node is only used inside the io scheduler, requests
         * are pruned when moved to the dispatch queue. special_vec must
         * only be used if RQF_SPECIAL_PAYLOAD is set, and such requests
         * cannot be inserted into an IO scheduler.
         */
        union {
                struct rb_node rb_node;        /* sort/lookup */
                struct bio_vec special_vec;
        };

        /*
         * Three pointers are available for the IO schedulers, if they need
         * more they have to dynamically allocate it.
         */
        struct {
                struct io_cq                *icq;
                void                        *priv[2];
        } elv;

        /* Flush state, including the end_io callback saved in saved_end_io. */
        struct {
                unsigned int                seq;
                rq_end_io_fn                *saved_end_io;
        } flush;

        u64 fifo_time;

        /*
         * completion callback.
         */
        rq_end_io_fn *end_io;
        void *end_io_data;
};

static inline enum req_op req_op(const struct request *req)
{
        return req->cmd_flags & REQ_OP_MASK;
}

static inline bool blk_rq_is_passthrough(struct request *rq)
{
        return blk_op_is_passthrough(rq->cmd_flags);
}

static inline unsigned short req_get_ioprio(struct request *req)
{
        if (req->bio)
                return req->bio->bi_ioprio;
        return 0;
}

/* Data direction of @rq: WRITE if its op is a write op, READ otherwise. */
#define rq_data_dir(rq)                (op_is_write(req_op(rq)) ? WRITE : READ)

/* DMA direction matching rq_data_dir(): write ops move data to the device. */
#define rq_dma_dir(rq) \
        (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE)

/* Return 1 if @rl holds no request (head is NULL), 0 otherwise. */
static inline int rq_list_empty(const struct rq_list *rl)
{
        return !rl->head;
}

/* Reset @rl to the empty list: no head and no tail. */
static inline void rq_list_init(struct rq_list *rl)
{
        rl->head = rl->tail = NULL;
}

/* Append @rq to the end of @rl; @rq becomes the new tail. */
static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq)
{
        struct request *last = rl->tail;

        rq->rq_next = NULL;
        if (!last)
                rl->head = rq;                /* list was empty */
        else
                last->rq_next = rq;
        rl->tail = rq;
}

static inline void rq_list_add_head(struct rq_list *rl, struct request *rq)
{
        rq->rq_next = rl->head;
        rl->head = rq;
        if (!rl->tail)
                rl->tail = rq;
}

/*
 * Remove and return the first request of @rl, or NULL if the list is empty.
 * The returned request has its rq_next link cleared.
 */
static inline struct request *rq_list_pop(struct rq_list *rl)
{
        struct request *first = rl->head;

        if (!first)
                return NULL;

        rl->head = first->rq_next;
        if (!rl->head)
                rl->tail = NULL;        /* popped the last element */
        first->rq_next = NULL;

        return first;
}

/* Return the first request of @rl without removing it; NULL if empty. */
static inline struct request *rq_list_peek(struct rq_list *rl)
{
        return rl->head;
}

/*
 * Iterate over every request in @rl, with @pos as the cursor.  The next
 * element is read from pos->rq_next after the body runs, so the body must
 * not unlink @pos.
 */
#define rq_list_for_each(rl, pos)                                        \
        for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next)

/*
 * Iterate over every request in @rl, with @pos as the cursor and @nxt as
 * scratch storage for the following element.  @nxt is fetched before the
 * body runs, so the body may unlink @pos or overwrite pos->rq_next.
 *
 * @pos is checked for NULL before pos->rq_next is read, including in the
 * initialiser, so iterating an empty list is safe.
 */
#define rq_list_for_each_safe(rl, pos, nxt)                                \
        for (pos = rq_list_peek((rl)), nxt = pos ? pos->rq_next : NULL;        \
                pos; pos = nxt, nxt = pos ? pos->rq_next : NULL)

/**
 * enum blk_eh_timer_return - How the timeout handler should proceed
 * @BLK_EH_DONE: The block driver has completed the command, or will complete
 *        it at a later time.
 * @BLK_EH_RESET_TIMER: Re-arm the request timer and keep waiting for the
 *        request to complete.
 */
enum blk_eh_timer_return {
        BLK_EH_DONE = 0,
        BLK_EH_RESET_TIMER = 1,
};

/**
 * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
 * block device
 */
struct blk_mq_hw_ctx {
        /*
         * Dispatch lock, dispatch list and queue state are grouped in their
         * own anonymous struct, marked ____cacheline_aligned_in_smp.
         */
        struct {
                /** @lock: Protects the dispatch list. */
                spinlock_t                lock;
                /**
                 * @dispatch: Used for requests that are ready to be
                 * dispatched to the hardware but for some reason (e.g. lack of
                 * resources) could not be sent to the hardware. As soon as the
                 * driver can send new requests, requests at this list will
                 * be sent first for a fairer dispatch.
                 */
                struct list_head        dispatch;
                /**
                 * @state: BLK_MQ_S_* flags. Defines the state of the hw
                 * queue (active, scheduled to restart, stopped).
                 */
                unsigned long                state;
        } ____cacheline_aligned_in_smp;

        /**
         * @run_work: Used for scheduling a hardware queue run at a later time.
         */
        struct delayed_work        run_work;
        /** @cpumask: Map of available CPUs where this hctx can run. */
        cpumask_var_t                cpumask;
        /**
         * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
         * selection from @cpumask.
         */
        int                        next_cpu;
        /**
         * @next_cpu_batch: Counter of how many works left in the batch before
         * changing to the next CPU.
         */
        int                        next_cpu_batch;

        /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
        unsigned long                flags;

        /**
         * @sched_data: Pointer owned by the IO scheduler attached to a request
         * queue. It's up to the IO scheduler how to use this pointer.
         */
        void                        *sched_data;
        /**
         * @queue: Pointer to the request queue that owns this hardware context.
         */
        struct request_queue        *queue;
        /** @fq: Queue of requests that need to perform a flush operation. */
        struct blk_flush_queue        *fq;

        /**
         * @driver_data: Pointer to data owned by the block driver that created
         * this hctx
         */
        void                        *driver_data;

        /**
         * @ctx_map: Bitmap for each software queue. If bit is on, there is a
         * pending request in that software queue.
         */
        struct sbitmap                ctx_map;

        /**
         * @dispatch_from: Software queue to be used when no scheduler was
         * selected.
         */
        struct blk_mq_ctx        *dispatch_from;
        /**
         * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
         * decide if the hw_queue is busy using Exponential Weighted Moving
         * Average algorithm.
         */
        unsigned int                dispatch_busy;

        /** @type: HCTX_TYPE_* flags. Type of hardware queue. */
        unsigned short                type;
        /** @nr_ctx: Number of software queues. */
        unsigned short                nr_ctx;
        /** @ctxs: Array of software queues. */
        struct blk_mq_ctx        **ctxs;

        /** @dispatch_wait_lock: Lock for dispatch_wait queue. */
        spinlock_t                dispatch_wait_lock;
        /**
         * @dispatch_wait: Waitqueue to put requests when there is no tag
         * available at the moment, to wait for another try in the future.
         */
        wait_queue_entry_t        dispatch_wait;

        /**
         * @wait_index: Index of next available dispatch_wait queue to insert
         * requests.
         */
        atomic_t                wait_index;

        /**
         * @tags: Tags owned by the block driver. A tag at this set is only
         * assigned when a request is dispatched from a hardware queue.
         */
        struct blk_mq_tags        *tags;
        /**
         * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
         * scheduler associated with a request queue, a tag is assigned when
         * that request is allocated. Else, this member is not used.
         */
        struct blk_mq_tags        *sched_tags;

        /** @numa_node: NUMA node the storage adapter has been connected to. */
        unsigned int                numa_node;
        /** @queue_num: Index of this hardware queue. */
        unsigned int                queue_num;

        /**
         * @nr_active: Number of active requests. Only used when a tag set is
         * shared across request queues.
         */
        atomic_t                nr_active;

        /** @cpuhp_online: List to store request if CPU is going to die */
        struct hlist_node        cpuhp_online;
        /** @cpuhp_dead: List to store request if some CPU dies. */
        struct hlist_node        cpuhp_dead;
        /** @kobj: Kernel object for sysfs. */
        struct kobject                kobj;

#ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @debugfs_dir: debugfs directory for this hardware queue. Named
         * as cpu<cpu_number>.
         */
        struct dentry                *debugfs_dir;
        /** @sched_debugfs_dir:        debugfs directory for the scheduler. */
        struct dentry                *sched_debugfs_dir;
#endif

        /**
         * @hctx_list: if this hctx is not in use, this is an entry in
         * q->unused_hctx_list.
         */
        struct list_head        hctx_list;
};

/**
 * struct blk_mq_queue_map - Map software queues to hardware queues
 * @mq_map:       CPU ID to hardware queue index map. This is an array
 *                with nr_cpu_ids elements. Each element has a value in the
 *                range [@queue_offset, @queue_offset + @nr_queues).
 * @nr_queues:    Number of hardware queues to map CPU IDs onto.
 * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
 *                driver to map each hardware queue type (enum hctx_type) onto
 *                a distinct set of hardware queues.
 */
struct blk_mq_queue_map {
        unsigned int *mq_map;
        unsigned int nr_queues;
        unsigned int queue_offset;
};

/**
 * enum hctx_type - Type of hardware queue
 * @HCTX_TYPE_DEFAULT:        All I/O not otherwise accounted for.
 * @HCTX_TYPE_READ:        Just for READ I/O.
 * @HCTX_TYPE_POLL:        Polled I/O of any kind.
 * @HCTX_MAX_TYPES:        Number of types of hctx. Must stay the last
 *                        enumerator: it is used as the size of the
 *                        blk_mq_tag_set.map[] array.
 */
enum hctx_type {
        HCTX_TYPE_DEFAULT,
        HCTX_TYPE_READ,
        HCTX_TYPE_POLL,

        HCTX_MAX_TYPES,
};

/**
 * struct blk_mq_tag_set - tag set that can be shared between request queues
 * @ops:           Pointers to functions that implement block driver behavior.
 * @map:           One or more ctx -> hctx mappings. One map exists for each
 *                 hardware queue type (enum hctx_type) that the driver wishes
 *                 to support. There are no restrictions on maps being of the
 *                 same size, and it's perfectly legal to share maps between
 *                 types.
 * @nr_maps:       Number of elements in the @map array. A number in the range
 *                 [1, HCTX_MAX_TYPES].
 * @nr_hw_queues:  Number of hardware queues supported by the block driver that
 *                 owns this data structure.
 * @queue_depth:   Number of tags per hardware queue, reserved tags included.
 * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
 *                 allocations.
 * @cmd_size:      Number of additional bytes to allocate per request. The block
 *                 driver owns these additional bytes.
 * @numa_node:     NUMA node the storage adapter has been connected to.
 * @timeout:       Request processing timeout in jiffies.
 * @flags:         Zero or more BLK_MQ_F_* flags.
 * @driver_data:   Pointer to data owned by the block driver that created this
 *                 tag set.
 * @tags:          Tag sets. One tag set per hardware queue. Has @nr_hw_queues
 *                 elements.
 * @shared_tags:   Shared set of tags. Has @nr_hw_queues elements. If set,
 *                 shared by all @tags.
 *                 NOTE(review): this member is a single pointer, unlike @tags;
 *                 the "Has @nr_hw_queues elements" wording should be confirmed.
 * @tag_list_lock: Serializes tag_list accesses.
 * @tag_list:      List of the request queues that use this tag set. See also
 *                 request_queue.tag_set_list.
 * @srcu:          Use as lock when type of the request queue is blocking
 *                 (BLK_MQ_F_BLOCKING).
 */
struct blk_mq_tag_set {
        const struct blk_mq_ops        *ops;
        struct blk_mq_queue_map        map[HCTX_MAX_TYPES];
        unsigned int                nr_maps;
        unsigned int                nr_hw_queues;
        unsigned int                queue_depth;
        unsigned int                reserved_tags;
        unsigned int                cmd_size;
        int                        numa_node;
        unsigned int                timeout;
        unsigned int                flags;
        void                        *driver_data;

        struct blk_mq_tags        **tags;

        struct blk_mq_tags        *shared_tags;

        struct mutex                tag_list_lock;
        struct list_head        tag_list;
        struct srcu_struct        *srcu;
};

/**
 * struct blk_mq_queue_data - Data about a request inserted in a queue
 *
 * Passed (as a const pointer) to the &blk_mq_ops.queue_rq callback.
 *
 * @rq:   Request pointer.
 * @last: True if this is the last request in the queue.
 */
struct blk_mq_queue_data {
        struct request *rq;
        bool last;
};

/*
 * Callback type accepted (as a pointer) by blk_mq_tagset_busy_iter(), which
 * also takes an opaque void * argument alongside it.
 * NOTE(review): the meaning of the bool return value is defined by the
 * iterator implementation, which is not visible in this header.
 */
typedef bool (busy_tag_iter_fn)(struct request *, void *);

/**
 * struct blk_mq_ops - Callback functions that implement block driver
 * behaviour.
 */
struct blk_mq_ops {
        /**
         * @queue_rq: Queue a new request from block IO.
         */
        blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *,
                                 const struct blk_mq_queue_data *);

        /**
         * @commit_rqs: If a driver uses bd->last to judge when to submit
         * requests to hardware, it must define this function. In case of errors
         * that make us stop issuing further requests, this hook serves the
         * purpose of kicking the hardware (which the last request otherwise
         * would have done).
         */
        void (*commit_rqs)(struct blk_mq_hw_ctx *);

        /**
         * @queue_rqs: Queue a list of new requests. Driver is guaranteed
         * that each request belongs to the same queue. If the driver doesn't
         * empty the @rqlist completely, then the rest will be queued
         * individually by the block layer upon return.
         */
        void (*queue_rqs)(struct rq_list *rqlist);

        /**
         * @get_budget: Reserve budget before queueing a request. Once
         * .queue_rq is run, it is the driver's responsibility to release the
         * reserved budget. The failure case of .get_budget must also be
         * handled to avoid I/O deadlock.
         */
        int (*get_budget)(struct request_queue *);

        /**
         * @put_budget: Release the reserved budget.
         */
        void (*put_budget)(struct request_queue *, int);

        /**
         * @set_rq_budget_token: Store rq's budget token.
         */
        void (*set_rq_budget_token)(struct request *, int);
        /**
         * @get_rq_budget_token: Retrieve rq's budget token.
         */
        int (*get_rq_budget_token)(struct request *);

        /**
         * @timeout: Called on request timeout.
         */
        enum blk_eh_timer_return (*timeout)(struct request *);

        /**
         * @poll: Called to poll for completion of a specific tag.
         */
        int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *);

        /**
         * @complete: Mark the request as complete.
         */
        void (*complete)(struct request *);

        /**
         * @init_hctx: Called when the block layer side of a hardware queue has
         * been set up, allowing the driver to allocate/init matching
         * structures.
         */
        int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int);
        /**
         * @exit_hctx: Ditto for exit/teardown.
         */
        void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);

        /**
         * @init_request: Called for every command allocated by the block layer
         * to allow the driver to set up driver specific data.
         *
         * A tag greater than or equal to queue_depth is for setting up the
         * flush request.
         */
        int (*init_request)(struct blk_mq_tag_set *set, struct request *,
                            unsigned int, unsigned int);
        /**
         * @exit_request: Ditto for exit/teardown.
         */
        void (*exit_request)(struct blk_mq_tag_set *set, struct request *,
                             unsigned int);

        /**
         * @cleanup_rq: Called before freeing one request which isn't completed
         * yet, and usually for freeing the driver private data. Optional:
         * blk_mq_cleanup_rq() only invokes it when it is non-NULL.
         */
        void (*cleanup_rq)(struct request *);

        /**
         * @busy: If set, returns whether or not this queue currently is busy.
         */
        bool (*busy)(struct request_queue *);

        /**
         * @map_queues: This allows drivers to specify their own queue mapping
         * by overriding the setup-time function that builds the mq_map.
         */
        void (*map_queues)(struct blk_mq_tag_set *set);

#ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @show_rq: Used by the debugfs implementation to show driver-specific
         * information about a request.
         */
        void (*show_rq)(struct seq_file *m, struct request *rq);
#endif
};

/*
 * BLK_MQ_F_* flags, stored in blk_mq_tag_set.flags.
 *
 * Keep hctx_flag_name[] in sync with the definitions below.
 * Note that bit 0 is not assigned by this enum; the first flag is 1 << 1.
 */
enum {
        BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1,
        /*
         * Set when this device requires underlying blk-mq device for
         * completing IO:
         */
        BLK_MQ_F_STACKING        = 1 << 2,
        BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3,
        BLK_MQ_F_BLOCKING        = 1 << 4,

        /*
         * Alloc tags on a round-robin base instead of the first available one.
         */
        BLK_MQ_F_TAG_RR                = 1 << 5,

        /*
         * Select 'none' during queue registration in case of a single hwq
         * or shared hwqs instead of 'mq-deadline'.
         */
        BLK_MQ_F_NO_SCHED_BY_DEFAULT        = 1 << 6,

        /* One past the highest flag bit defined above. */
        BLK_MQ_F_MAX = 1 << 7,
};

/* NOTE(review): presumably the upper bound on a tag set's queue depth - confirm. */
#define BLK_MQ_MAX_DEPTH        (10240)
/*
 * All-ones unsigned value (-1U).
 * NOTE(review): by its name, a sentinel meaning "no hardware queue index" -
 * confirm against the users.
 */
#define BLK_MQ_NO_HCTX_IDX        (-1U)

/*
 * BLK_MQ_S_* hardware queue state values, numbered consecutively from 0.
 * NOTE(review): presumably used as bit numbers in a per-hctx state word -
 * confirm against the users.
 */
enum {
        /* Keep hctx_state_name[] in sync with the definitions below */
        BLK_MQ_S_STOPPED,
        BLK_MQ_S_TAG_ACTIVE,
        BLK_MQ_S_SCHED_RESTART,
        /* hw queue is inactive after all its CPUs become offline */
        BLK_MQ_S_INACTIVE,
        /* Number of state values defined above. */
        BLK_MQ_S_MAX
};

/*
 * Disk and request queue allocation/teardown entry points. Only the
 * prototypes are visible in this header.
 */
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata,
                struct lock_class_key *lkclass);
/*
 * Wrapper around __blk_mq_alloc_disk() that supplies the lock class key
 * argument: every expansion declares its own static struct lock_class_key
 * inside the statement expression and passes its address. The expression
 * evaluates to the value returned by __blk_mq_alloc_disk().
 */
#define blk_mq_alloc_disk(set, lim, queuedata)                                \
({                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __blk_mq_alloc_disk(set, lim, queuedata, &__key);                \
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
                struct lock_class_key *lkclass);
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);

/* Tag set allocation and release (declarations only). */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
                const struct blk_mq_ops *ops, unsigned int queue_depth,
                unsigned int set_flags);
void blk_mq_free_tag_set(struct blk_mq_tag_set *set);

/* Request release and polling (declarations only). */
void blk_mq_free_request(struct request *rq);
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
                unsigned int poll_flags);

bool blk_mq_queue_inflight(struct request_queue *q);

/*
 * Request allocation flags of type blk_mq_req_flags_t, as taken by
 * blk_mq_alloc_request() and blk_mq_alloc_request_hctx(). Each value is a
 * distinct bit, so flags can be OR-ed together.
 */
enum {
        /* return when out of requests */
        BLK_MQ_REQ_NOWAIT        = (__force blk_mq_req_flags_t)(1 << 0),
        /* allocate from reserved pool */
        BLK_MQ_REQ_RESERVED        = (__force blk_mq_req_flags_t)(1 << 1),
        /* set RQF_PM */
        BLK_MQ_REQ_PM                = (__force blk_mq_req_flags_t)(1 << 2),
};

/*
 * Request allocation (declarations only). @flags takes BLK_MQ_REQ_* values.
 * The _hctx variant additionally takes a hardware queue index.
 */
struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
                blk_mq_req_flags_t flags);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
                blk_opf_t opf, blk_mq_req_flags_t flags,
                unsigned int hctx_idx);

/*
 * Tag address space map.
 *
 * @rqs is indexed by tag; blk_mq_tag_to_rq() only dereferences it for tags
 * below @nr_tags.
 * NOTE(review): the exact roles of @static_rqs, @page_list and
 * @active_queues are not visible in this header - see the tag allocator.
 */
struct blk_mq_tags {
        unsigned int nr_tags;
        unsigned int nr_reserved_tags;
        unsigned int active_queues;

        struct sbitmap_queue bitmap_tags;
        struct sbitmap_queue breserved_tags;

        struct request **rqs;
        struct request **static_rqs;
        struct list_head page_list;

        /*
         * used to clear request reference in rqs[] before freeing one
         * request pool
         */
        spinlock_t lock;
};

/*
 * Look up the request recorded for @tag in @tags->rqs[].
 *
 * Returns NULL when @tag is not below @tags->nr_tags. Otherwise the slot is
 * handed to prefetch() and its value is returned.
 */
static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
                                               unsigned int tag)
{
        /* Out-of-range tags never touch the rqs[] array. */
        if (tag >= tags->nr_tags)
                return NULL;

        prefetch(tags->rqs[tag]);
        return tags->rqs[tag];
}

/*
 * Layout of the 32-bit "unique tag": the low BLK_MQ_UNIQUE_TAG_BITS bits are
 * extracted by blk_mq_unique_tag_to_tag() and the bits above them by
 * blk_mq_unique_tag_to_hwq().
 */
enum {
        BLK_MQ_UNIQUE_TAG_BITS = 16,
        BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1,
};

/* Builds the unique tag for @rq (declaration only). */
u32 blk_mq_unique_tag(struct request *rq);

/*
 * Extract the upper part of @unique_tag, i.e. the bits above the low
 * BLK_MQ_UNIQUE_TAG_BITS bits, truncated to u16.
 */
static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag)
{
        u16 hwq = unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;

        return hwq;
}

/*
 * Extract the low BLK_MQ_UNIQUE_TAG_BITS bits of @unique_tag by masking with
 * BLK_MQ_UNIQUE_TAG_MASK.
 */
static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
{
        u16 tag = unique_tag & BLK_MQ_UNIQUE_TAG_MASK;

        return tag;
}

/**
 * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
 * @rq: target request.
 */
static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
{
        return READ_ONCE(rq->state);
}

/*
 * Returns 1 when the state of @rq is anything other than MQ_RQ_IDLE,
 * 0 otherwise.
 */
static inline int blk_mq_request_started(struct request *rq)
{
        if (blk_mq_rq_state(rq) == MQ_RQ_IDLE)
                return 0;

        return 1;
}

/* Returns 1 when the state of @rq is MQ_RQ_COMPLETE, 0 otherwise. */
static inline int blk_mq_request_completed(struct request *rq)
{
        if (blk_mq_rq_state(rq) != MQ_RQ_COMPLETE)
                return 0;

        return 1;
}

/*
 * Set the state to complete when completing a request from inside ->queue_rq.
 * This is used by drivers that want to ensure special complete actions that
 * need access to the request are called on failure, e.g. by nvme for
 * multipathing.
 *
 * The state is stored with WRITE_ONCE().
 */
static inline void blk_mq_set_request_complete(struct request *rq)
{
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
}

/*
 * Complete the request directly instead of deferring it to softirq or
 * completing it on another CPU. Useful in preemptible context instead of
 * interrupt context.
 *
 * The request state is set to MQ_RQ_COMPLETE first, then the caller-supplied
 * @complete handler is invoked on @rq.
 */
static inline void blk_mq_complete_request_direct(struct request *rq,
                   void (*complete)(struct request *rq))
{
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
        complete(rq);
}

/* Request start/end entry points (declarations only). */
void blk_mq_start_request(struct request *rq);
void blk_mq_end_request(struct request *rq, blk_status_t error);
void __blk_mq_end_request(struct request *rq, blk_status_t error);
void blk_mq_end_request_batch(struct io_comp_batch *ib);

/*
 * Only need start/end time stamping if we have iostat or
 * blk stats enabled, or using an IO scheduler.
 */
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
        return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED));
}

static inline bool blk_mq_is_reserved_rq(struct request *rq)
{
        return rq->rq_flags & RQF_RESV;
}

/**
 * blk_mq_add_to_batch() - add a request to the completion batch
 * @req: The request to add to batch
 * @iob: The batch to add the request
 * @is_error: Specify true if the request failed with an error
 * @complete: The completion handler for the request
 *
 * Batched completions only work when there is no I/O error and no special
 * ->end_io handler.
 *
 * Return: true when the request was added to the batch, otherwise false
 */
static inline bool blk_mq_add_to_batch(struct request *req,
                                       struct io_comp_batch *iob, bool is_error,
                                       void (*complete)(struct io_comp_batch *))
{
        /* No batch container, or scheduler data attached: cannot batch. */
        if (!iob || (req->rq_flags & RQF_SCHED_TAGS))
                return false;

        /*
         * A non-passthrough request cannot be batched when it has an end_io
         * handler set or when it failed with an error.
         */
        if (!blk_rq_is_passthrough(req) && (req->end_io || is_error))
                return false;

        /*
         * The first request added decides the batch's completion handler;
         * later requests must use the same one.
         */
        if (iob->complete) {
                if (iob->complete != complete)
                        return false;
        } else {
                iob->complete = complete;
        }

        iob->need_ts |= blk_mq_need_time_stamp(req);
        rq_list_add_tail(&iob->req_list, req);

        return true;
}

/*
 * Requeue, completion, hardware queue start/stop/run, quiesce and tag set
 * iteration entry points (declarations only; semantics are defined by the
 * implementations, which are not part of this header).
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
void blk_mq_complete_request(struct request *rq);
bool blk_mq_complete_request_remote(struct request *rq);
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
void blk_mq_stop_hw_queues(struct request_queue *q);
void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_run_hw_queues(struct request_queue *q, bool async);
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs);
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv);
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset);
/*
 * Freeze/unfreeze primitives that do not touch the memalloc_noio state;
 * blk_mq_freeze_queue() and blk_mq_unfreeze_queue() wrap them.
 */
void blk_mq_freeze_queue_nomemsave(struct request_queue *q);
void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q);
/*
 * Freeze @q with the memalloc_noio state saved first.
 *
 * Calls memalloc_noio_save() and then blk_mq_freeze_queue_nomemsave().
 *
 * Return: the value returned by memalloc_noio_save(); the caller must hand it
 * back to blk_mq_unfreeze_queue() (hence __must_check).
 */
static inline unsigned int __must_check
blk_mq_freeze_queue(struct request_queue *q)
{
        unsigned int saved_flags;

        saved_flags = memalloc_noio_save();
        blk_mq_freeze_queue_nomemsave(q);

        return saved_flags;
}
/*
 * Counterpart of blk_mq_freeze_queue(): unfreezes @q first, then restores the
 * memalloc_noio state from @memflags (the value blk_mq_freeze_queue()
 * returned).
 */
static inline void
blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags)
{
        blk_mq_unfreeze_queue_nomemrestore(q);
        memalloc_noio_restore(memflags);
}
/* Lower-level freeze helpers (declarations only). */
void blk_freeze_queue_start(struct request_queue *q);
void blk_mq_freeze_queue_wait(struct request_queue *q);
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout);
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q);
void blk_freeze_queue_start_non_owner(struct request_queue *q);

/* Queue mapping helpers operating on a struct blk_mq_queue_map. */
void blk_mq_map_queues(struct blk_mq_queue_map *qmap);
void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap,
                          struct device *dev, unsigned int offset);
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

void blk_mq_quiesce_queue_nowait(struct request_queue *q);

unsigned int blk_mq_rq_cpu(struct request *rq);

/* Out-of-line part of blk_should_fake_timeout(). */
bool __blk_should_fake_timeout(struct request_queue *q);
/*
 * Decide whether a timeout should be faked for @q.
 *
 * Always false unless CONFIG_FAIL_IO_TIMEOUT is enabled and
 * QUEUE_FLAG_FAIL_IO is set in @q->queue_flags; in that case the decision is
 * delegated to __blk_should_fake_timeout().
 */
static inline bool blk_should_fake_timeout(struct request_queue *q)
{
        if (!IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT))
                return false;
        if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
                return false;

        return __blk_should_fake_timeout(q);
}

/**
 * blk_mq_rq_from_pdu - cast a PDU to a request
 * @pdu: the PDU (Protocol Data Unit) to be casted
 *
 * Return: request
 *
 * Driver command data is immediately after the request. So subtract request
 * size to get back to the original request.
 */
static inline struct request *blk_mq_rq_from_pdu(void *pdu)
{
        return pdu - sizeof(struct request);
}

/**
 * blk_mq_rq_to_pdu - cast a request to a PDU
 * @rq: the request to be cast
 *
 * Return: pointer to the PDU
 *
 * Driver command data is immediately after the request, so the PDU starts at
 * the first byte past the end of @rq.
 */
static inline void *blk_mq_rq_to_pdu(struct request *rq)
{
        void *pdu = rq + 1;

        return pdu;
}

/*
 * Iterate over the hardware queues of request queue @q: a thin wrapper around
 * xa_for_each() on (q)->hctx_table, with @i as the index cursor and @hctx as
 * the entry cursor.
 */
#define queue_for_each_hw_ctx(q, hctx, i)                                \
        xa_for_each(&(q)->hctx_table, (i), (hctx))

/*
 * Iterate over the software queues of @hctx: @i runs from 0 up to (but not
 * including) (hctx)->nr_ctx, and @ctx is assigned (hctx)->ctxs[i] as part of
 * the loop condition. The statement expression always evaluates to 1, so the
 * loop ends only on the index bound. @ctx is used unparenthesized and must be
 * a plain lvalue.
 */
#define hctx_for_each_ctx(hctx, ctx, i)                                        \
        for ((i) = 0; (i) < (hctx)->nr_ctx &&                                \
             ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++)

/*
 * Invoke the driver's optional ->cleanup_rq callback for @rq. Does nothing
 * when the queue's mq_ops leaves that callback unset.
 */
static inline void blk_mq_cleanup_rq(struct request *rq)
{
        if (!rq->q->mq_ops->cleanup_rq)
                return;

        rq->q->mq_ops->cleanup_rq(rq);
}

/*
 * Declaration only.
 * NOTE(review): by its name, assigns the lockdep class @key to a flush-queue
 * lock of @hctx - confirm against the definition.
 */
void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
                struct lock_class_key *key);

/* Returns the result of op_is_sync() applied to @rq->cmd_flags. */
static inline bool rq_is_sync(struct request *rq)
{
        return op_is_sync(rq->cmd_flags);
}

/*
 * Request initialisation and cloning entry points (declarations only).
 * blk_rq_prep_clone() takes a caller-supplied @bio_ctr callback together with
 * an opaque @data pointer.
 */
void blk_rq_init(struct request_queue *q, struct request *rq);
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                struct bio_set *bs, gfp_t gfp_mask,
                int (*bio_ctr)(struct bio *, struct bio *, void *), void *data);
void blk_rq_unprep_clone(struct request *rq);
blk_status_t blk_insert_cloned_request(struct request *rq);

/*
 * Optional mapping description accepted by the blk_rq_map_user*() family
 * (see the prototypes taking a struct rq_map_data *).
 * NOTE(review): field semantics are defined by the mapping code, which is not
 * visible here; @pages / @page_order / @nr_entries presumably describe a
 * caller-provided page array - confirm.
 */
struct rq_map_data {
        struct page **pages;
        unsigned long offset;
        unsigned short page_order;
        unsigned short nr_entries;
        bool null_mapped;
        bool from_user;
};

/*
 * Mapping of user/kernel buffers into a request, and request execution
 * (declarations only; parameter names are omitted in several prototypes, so
 * consult the definitions for the meaning of each argument).
 */
int blk_rq_map_user(struct request_queue *, struct request *,
                struct rq_map_data *, void __user *, unsigned long, gfp_t);
int blk_rq_map_user_io(struct request *, struct rq_map_data *,
                void __user *, unsigned long, gfp_t, bool, int, bool, int);
int blk_rq_map_user_iov(struct request_queue *, struct request *,
                struct rq_map_data *, const struct iov_iter *, gfp_t);
int blk_rq_unmap_user(struct bio *);
int blk_rq_map_kern(struct request_queue *, struct request *, void *,
                unsigned int, gfp_t);
int blk_rq_append_bio(struct request *rq, struct bio *bio);
void blk_execute_rq_nowait(struct request *rq, bool at_head);
blk_status_t blk_execute_rq(struct request *rq, bool at_head);
bool blk_rq_is_poll(struct request *rq);

/*
 * Cursor used by rq_for_each_segment() and rq_for_each_bvec(): @bio is the
 * bio currently being walked and @iter the position within that bio.
 */
struct req_iterator {
        struct bvec_iter iter;
        struct bio *bio;
};

/*
 * Iterate over every bio chained to @rq: @_bio starts at (rq)->bio and
 * follows ->bi_next until it becomes NULL. Nothing is executed when the
 * request has no bio.
 *
 * @rq is fully parenthesized so that any pointer expression (e.g. &req) can
 * be passed. @_bio is used unparenthesized and must be a plain lvalue.
 *
 * The expansion is an unbraced "if (...) for (...)", so do not follow the
 * loop body with an "else".
 */
#define __rq_for_each_bio(_bio, rq)        \
        if (((rq)->bio))                        \
                for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)

/*
 * Walk every segment of every bio in @_rq: the outer loop advances
 * _iter.bio over the request's bios, the inner bio_for_each_segment() walks
 * the current bio using _iter.iter. @_iter is a struct req_iterator.
 */
#define rq_for_each_segment(bvl, _rq, _iter)                        \
        __rq_for_each_bio(_iter.bio, _rq)                        \
                bio_for_each_segment(bvl, _iter.bio, _iter.iter)

/* Same as rq_for_each_segment() but iterates with bio_for_each_bvec(). */
#define rq_for_each_bvec(bvl, _rq, _iter)                        \
        __rq_for_each_bio(_iter.bio, _rq)                        \
                bio_for_each_bvec(bvl, _iter.bio, _iter.iter)

/*
 * True when @_iter is at the end of the request: the current bio has no
 * successor and bio_iter_last() reports the last position within that bio.
 */
#define rq_iter_last(bvec, _iter)                                \
                (_iter.bio->bi_next == NULL &&                        \
                 bio_iter_last(bvec, _iter.iter))

/*
 * blk_rq_pos()                        : the current sector
 * blk_rq_bytes()                : bytes left in the entire request
 * blk_rq_cur_bytes()                : bytes left in the current segment
 * blk_rq_sectors()                : sectors left in the entire request
 * blk_rq_cur_sectors()                : sectors left in the current segment
 * blk_rq_stats_sectors()        : sectors of the entire request used for stats
 */

/* Returns @rq->__sector. */
static inline sector_t blk_rq_pos(const struct request *rq)
{
        return rq->__sector;
}

/* Returns @rq->__data_len, the bytes left in the entire request. */
static inline unsigned int blk_rq_bytes(const struct request *rq)
{
        return rq->__data_len;
}

static inline int blk_rq_cur_bytes(const struct request *rq)
{
        if (!rq->bio)
                return 0;
        if (!bio_has_data(rq->bio))        /* dataless requests such as discard */
                return rq->bio->bi_iter.bi_size;
        return bio_iovec(rq->bio).bv_len;
}

/* blk_rq_bytes() converted to sectors by shifting right by SECTOR_SHIFT. */
static inline unsigned int blk_rq_sectors(const struct request *rq)
{
        unsigned int bytes = blk_rq_bytes(rq);

        return bytes >> SECTOR_SHIFT;
}

/* blk_rq_cur_bytes() converted to sectors by shifting right by SECTOR_SHIFT. */
static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
{
        int bytes = blk_rq_cur_bytes(rq);

        return bytes >> SECTOR_SHIFT;
}

/* Returns @rq->stats_sectors, the sector count used for statistics. */
static inline unsigned int blk_rq_stats_sectors(const struct request *rq)
{
        return rq->stats_sectors;
}

/*
 * Some commands like WRITE SAME have a payload or data transfer size which
 * is different from the size of the request.  Any driver that supports such
 * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to
 * calculate the data transfer size.
 *
 * Returns the length of @rq->special_vec when RQF_SPECIAL_PAYLOAD is set,
 * and blk_rq_bytes() otherwise.
 */
static inline unsigned int blk_rq_payload_bytes(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return blk_rq_bytes(rq);

        return rq->special_vec.bv_len;
}

/*
 * Return the first full biovec in the request.  The caller needs to check that
 * there are any bvecs before calling this helper.
 *
 * With RQF_SPECIAL_PAYLOAD set this is @rq->special_vec; otherwise it is the
 * multi-page bvec at the current iterator position of @rq->bio.
 */
static inline struct bio_vec req_bvec(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter);

        return rq->special_vec;
}

static inline unsigned int blk_rq_count_bios(struct request *rq)
{
        unsigned int nr_bios = 0;
        struct bio *bio;

        __rq_for_each_bio(bio, rq)
                nr_bios++;

        return nr_bios;
}

/* Declaration only; takes a bio_list and the request to operate on. */
void blk_steal_bios(struct bio_list *list, struct request *rq);

/*
 * Request completion related functions.
 *
 * blk_update_request() completes given number of bytes and updates
 * the request without completing it.
 */
bool blk_update_request(struct request *rq, blk_status_t error,
                               unsigned int nr_bytes);
void blk_abort_request(struct request *);

/*
 * Number of physical segments as sent to the device.
 *
 * Normally this is the number of discontiguous data segments sent by the
 * submitter.  But for data-less commands like discard we might have no
 * actual data segments submitted, but the driver might have to add its
 * own special payload.  In that case we still return 1 here so that this
 * special payload will be mapped.
 */
static inline unsigned short blk_rq_nr_phys_segments(struct request *rq)
{
        if (!(rq->rq_flags & RQF_SPECIAL_PAYLOAD))
                return rq->nr_phys_segments;

        return 1;
}

/*
 * Number of discard segments (or ranges) the driver needs to fill in.
 * Each discard bio merged into a request is counted as one segment.
 *
 * The result is @rq->nr_phys_segments clamped to a minimum of 1.
 */
static inline unsigned short blk_rq_nr_discard_segments(struct request *rq)
{
        return max_t(unsigned short, rq->nr_phys_segments, 1);
}

int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist,
                struct scatterlist **last_sg);

/*
 * Convenience wrapper around __blk_rq_map_sg() for callers that do not need
 * the last-entry pointer: a local, NULL-initialised pointer is passed for
 * @last_sg and discarded afterwards.
 *
 * Returns whatever __blk_rq_map_sg() returns.
 */
static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist)
{
        struct scatterlist *tail = NULL;

        return __blk_rq_map_sg(rq, sglist, &tail);
}
/*
 * Declaration only.
 * NOTE(review): by its name, a debugging aid that dumps a request's flags;
 * the char * argument is presumably a message prefix - confirm.
 */
void blk_dump_rq_flags(struct request *, char *);

#endif /* BLK_MQ_H */























  237 















































  263 





  224 




  224 




  223 
  223 


















  335 
  334 
  333 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/interval_tree.c - interval tree for mapping->i_mmap
 *
 * Copyright (C) 2012, Michel Lespinasse <walken@google.com>
 */

#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/rmap.h>
#include <linux/interval_tree_generic.h>

/* Interval start for @v in the i_mmap tree: its vm_pgoff. */
static inline unsigned long vma_start_pgoff(struct vm_area_struct *v)
{
        return v->vm_pgoff;
}

/*
 * Interval end for @v (inclusive): vm_pgoff plus the number of pages
 * reported by vma_pages(), minus one.
 */
static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
{
        return v->vm_pgoff + vma_pages(v) - 1;
}

/*
 * Instantiate the generic interval tree for struct vm_area_struct, keyed by
 * [vma_start_pgoff(), vma_last_pgoff()], using shared.rb as the tree node and
 * shared.rb_subtree_last as the augmented value. The generated symbols carry
 * the vma_interval_tree prefix (vma_interval_tree_augment is used below) and
 * no storage-class qualifier is passed.
 */
INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
                     unsigned long, shared.rb_subtree_last,
                     vma_start_pgoff, vma_last_pgoff, /* empty */, vma_interval_tree)

/*
 * Insert node immediately after prev in the interval tree.
 *
 * @node must have the same start pgoff as @prev (checked with
 * VM_BUG_ON_VMA). The insertion slot is the leftmost free position in
 * @prev's right subtree: @prev's right link itself when it is empty,
 * otherwise the left link at the bottom of the left spine of that subtree.
 * Every node passed on the way down has its rb_subtree_last raised to
 * @node's last pgoff where needed, before the node is linked and the tree is
 * rebalanced with rb_insert_augmented().
 */
void vma_interval_tree_insert_after(struct vm_area_struct *node,
                                    struct vm_area_struct *prev,
                                    struct rb_root_cached *root)
{
        struct rb_node **link;
        struct vm_area_struct *parent;
        unsigned long last = vma_last_pgoff(node);

        VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);

        /* Start at prev's right link, then keep descending to the left. */
        parent = prev;
        link = &prev->shared.rb.rb_right;
        while (*link) {
                parent = rb_entry(*link, struct vm_area_struct, shared.rb);
                if (parent->shared.rb_subtree_last < last)
                        parent->shared.rb_subtree_last = last;
                link = &parent->shared.rb.rb_left;
        }

        node->shared.rb_subtree_last = last;
        rb_link_node(&node->shared.rb, &parent->shared.rb, link);
        rb_insert_augmented(&node->shared.rb, &root->rb_root,
                            &vma_interval_tree_augment);
}

/* Interval start for @avc: the start pgoff of the VMA it points to. */
static inline unsigned long avc_start_pgoff(struct anon_vma_chain *avc)
{
        return vma_start_pgoff(avc->vma);
}

/* Interval end (inclusive) for @avc: the last pgoff of the VMA it points to. */
static inline unsigned long avc_last_pgoff(struct anon_vma_chain *avc)
{
        return vma_last_pgoff(avc->vma);
}

/*
 * Instantiate the generic interval tree for struct anon_vma_chain, keyed by
 * [avc_start_pgoff(), avc_last_pgoff()], with rb as the tree node and
 * rb_subtree_last as the augmented value. The generated functions are
 * "static inline" and carry the __anon_vma_interval_tree prefix; the public
 * anon_vma_interval_tree_*() wrappers below call them.
 */
INTERVAL_TREE_DEFINE(struct anon_vma_chain, rb, unsigned long, rb_subtree_last,
                     avc_start_pgoff, avc_last_pgoff,
                     static inline, __anon_vma_interval_tree)

/*
 * Insert @node into the anon_vma interval tree at @root.
 *
 * With CONFIG_DEBUG_VM_RB the node's current start/last pgoff are cached in
 * the node first, so that anon_vma_interval_tree_verify() can later detect
 * whether they changed while the node was in the tree.
 */
void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
#ifdef CONFIG_DEBUG_VM_RB
        node->cached_vma_start = avc_start_pgoff(node);
        node->cached_vma_last = avc_last_pgoff(node);
#endif
        __anon_vma_interval_tree_insert(node, root);
}

/*
 * Remove @node from the anon_vma interval tree at @root. Non-inline wrapper
 * around the generated __anon_vma_interval_tree_remove().
 */
void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
                                   struct rb_root_cached *root)
{
        __anon_vma_interval_tree_remove(node, root);
}

/*
 * Start an iteration over the tree at @root for the range [@first, @last].
 * Non-inline wrapper that returns whatever the generated
 * __anon_vma_interval_tree_iter_first() returns.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_first(struct rb_root_cached *root,
                                  unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_first(root, first, last);
}

/*
 * Continue an iteration for the range [@first, @last] from @node.
 * Non-inline wrapper that returns whatever the generated
 * __anon_vma_interval_tree_iter_next() returns.
 */
struct anon_vma_chain *
anon_vma_interval_tree_iter_next(struct anon_vma_chain *node,
                                 unsigned long first, unsigned long last)
{
        return __anon_vma_interval_tree_iter_next(node, first, last);
}

#ifdef CONFIG_DEBUG_VM_RB
/*
 * Debug check: warn (once per call site) if the start or last pgoff of
 * @node's VMA no longer matches the value cached in the node by
 * anon_vma_interval_tree_insert().
 */
void anon_vma_interval_tree_verify(struct anon_vma_chain *node)
{
        WARN_ON_ONCE(node->cached_vma_start != avc_start_pgoff(node));
        WARN_ON_ONCE(node->cached_vma_last != avc_last_pgoff(node));
}
#endif




















































































































































































    7 




























    7 







    7 




    7 










    7 




































    7 








    7 












    7 







    7 



















































































    7 













    7 

    7 






    7 




    6 







    7 


    7 













    7 















































































































    7 














































































    7 













    7 

















    7 
    7 


    7 




    7 



    7 
















    7 
    7 












    7 
















    7 



































































































































































































































    7 


















































































































































































    7 






    7 
    7 
    7 















    7 









    7 







    7 


    7 





































































    6 






    6 






















    6 























   68 





    6 


































   67 







   68 

































   68 
   68 




   67 

   32 






   68 


   68 



   68 















   33 






   35 



















   33 













































































































































































































































































   34 























  178 




























  179 

  179 


  179 





























  179 


  179 


  179 














  179 




  179 

















  179 













  179 














  155 
   34 













  179 











  179 








  179 





  178 





















  179 


  179 






































































































































































































































































































































































































































































  274 




















































































































































































































































































































































































































































































  274 







  274 


  274 


  274 


















  274 
















  274 









  274 
  274 


  274 
  274 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Implementation of the security services.
 *
 * Authors : Stephen Smalley, <stephen.smalley.work@gmail.com>
 *             James Morris <jmorris@redhat.com>
 *
 * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com>
 *
 *        Support for enhanced MLS infrastructure.
 *        Support for context based audit filters.
 *
 * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *        Added conditional policy language extensions
 *
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *
 *      Added support for NetLabel
 *      Added support for the policy capability bitmap
 *
 * Updated: Chad Sellers <csellers@tresys.com>
 *
 *  Added validation of kernel classes and permissions
 *
 * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com>
 *
 *  Added support for bounds domain and audit messages on masked permissions
 *
 * Updated: Guido Trentalancia <guido@trentalancia.com>
 *
 *  Added support for runtime switching of the policy type
 *
 * Copyright (C) 2008, 2009 NEC Corporation
 * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P.
 * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc.
 * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/sched.h>
#include <linux/audit.h>
#include <linux/parser.h>
#include <linux/vmalloc.h>
#include <linux/lsm_hooks.h>
#include <net/netlabel.h>

#include "flask.h"
#include "avc.h"
#include "avc_ss.h"
#include "security.h"
#include "context.h"
#include "policydb.h"
#include "sidtab.h"
#include "services.h"
#include "conditional.h"
#include "mls.h"
#include "objsec.h"
#include "netlabel.h"
#include "xfrm.h"
#include "ebitmap.h"
#include "audit.h"
#include "policycap_names.h"
#include "ima.h"

/*
 * Bundles the context-conversion arguments together with the sidtab
 * conversion parameters so both can be carried around as one object.
 * NOTE(review): presumably used while converting SIDs to a newly loaded
 * policy; the users are outside this part of the file -- confirm.
 */
struct selinux_policy_convert_data {
        /* Arguments for the context conversion step. */
        struct convert_context_args args;
        /* Parameters for the sidtab conversion. */
        struct sidtab_convert_params sidtab_params;
};

/*
 * Forward declarations of static helpers that are used before their
 * definitions in this file.
 */
static int context_struct_to_string(struct policydb *policydb,
                                    struct context *context,
                                    char **scontext,
                                    u32 *scontext_len);

static int sidtab_entry_to_string(struct policydb *policydb,
                                  struct sidtab *sidtab,
                                  struct sidtab_entry *entry,
                                  char **scontext,
                                  u32 *scontext_len);

static void context_struct_compute_av(struct policydb *policydb,
                                      struct context *scontext,
                                      struct context *tcontext,
                                      u16 tclass,
                                      struct av_decision *avd,
                                      struct extended_perms *xperms);

/*
 * selinux_set_mapping - build the kernel-to-policy class/permission table
 * @pol: policy database used to look up class and permission names
 * @map: table of kernel classes, terminated by an entry with a NULL name
 * @out_map: receives the newly allocated mapping array and its size
 *
 * The output array has one slot per input class plus a leading slot for
 * class zero, so the class at position N in @map lands in slot N + 1.
 * Each slot records the policy value of the class and, per permission
 * position, the policy value of that permission.  Classes or permissions
 * with an empty name are skipped and keep their zero-initialised values.
 *
 * A class or permission that the policy does not define is logged; it is
 * fatal only when @pol->reject_unknown is set.
 *
 * Returns 0 on success, -EINVAL if @map is NULL or an unknown name was
 * rejected (the array is freed and out_map->mapping reset to NULL), or
 * -ENOMEM if the array could not be allocated.
 */
static int selinux_set_mapping(struct policydb *pol,
                               const struct security_class_mapping *map,
                               struct selinux_map *out_map)
{
        u16 nslots = 0, cls;
        bool saw_unknown = false;

        if (!map)
                return -EINVAL;

        /* Count the input classes, then reserve one more slot for class zero */
        while (map[nslots].name)
                nslots++;
        nslots++;

        out_map->mapping = kcalloc(nslots, sizeof(*out_map->mapping),
                                   GFP_ATOMIC);
        if (!out_map->mapping)
                return -ENOMEM;

        /* Resolve every class and permission name to its raw policy value */
        for (cls = 0; map[cls].name; cls++) {
                const struct security_class_mapping *src = &map[cls];
                const u16 slot = cls + 1;
                struct selinux_mapping *dst = &out_map->mapping[slot];
                u16 perm;

                /* A class with an empty name is a placeholder */
                if (src->name[0] == '\0') {
                        dst->num_perms = 0;
                        continue;
                }

                dst->value = string_to_security_class(pol, src->name);
                if (!dst->value) {
                        pr_info("SELinux:  Class %s not defined in policy.\n",
                                src->name);
                        if (pol->reject_unknown)
                                goto err;
                        dst->num_perms = 0;
                        saw_unknown = true;
                        continue;
                }

                for (perm = 0; src->perms[perm]; perm++) {
                        /* A permission with an empty name is a placeholder */
                        if (!*src->perms[perm])
                                continue;

                        dst->perms[perm] = string_to_av_perm(pol, dst->value,
                                                             src->perms[perm]);
                        if (dst->perms[perm])
                                continue;

                        pr_info("SELinux:  Permission %s in class %s not defined in policy.\n",
                                src->perms[perm], src->name);
                        if (pol->reject_unknown)
                                goto err;
                        saw_unknown = true;
                }
                dst->num_perms = perm;
        }

        if (saw_unknown)
                pr_info("SELinux: the above unknown classes and permissions will be %s\n",
                        pol->allow_unknown ? "allowed" : "denied");

        out_map->size = nslots;
        return 0;

err:
        kfree(out_map->mapping);
        out_map->mapping = NULL;
        return -EINVAL;
}

/*
 * Translate a kernel class value into the value stored for it in the
 * mapping table.  A value at or beyond the end of the table is handed
 * back unchanged.
 */
static u16 unmap_class(struct selinux_map *map, u16 tclass)
{
        if (tclass >= map->size)
                return tclass;

        return map->mapping[tclass].value;
}

/*
 * Reverse lookup: find the kernel class whose mapping entry holds the
 * policy value @pol_value.  Slot zero is never searched.  Returns
 * SECCLASS_NULL when no entry matches.
 */
static u16 map_class(struct selinux_map *map, u16 pol_value)
{
        u16 kclass = 1;

        while (kclass < map->size) {
                if (map->mapping[kclass].value == pol_value)
                        return kclass;
                kclass++;
        }

        return SECCLASS_NULL;
}

/*
 * map_decision - rewrite an access decision from policy bits to kernel bits
 * @map: class/permission mapping table
 * @tclass: kernel class value indexing @map
 * @avd: decision to convert in place
 * @allow_unknown: non-zero if permissions missing from the policy are allowed
 *
 * For each permission position of the class, the corresponding kernel bit
 * is set in allowed/auditallow/auditdeny when the decision has the mapped
 * policy bit set.  A position whose mapped value is zero (not defined in
 * the policy) is additionally marked allowed when @allow_unknown is set,
 * and marked auditdeny otherwise.
 *
 * Nothing is changed when @tclass lies outside the table.
 */
static void map_decision(struct selinux_map *map,
                         u16 tclass, struct av_decision *avd,
                         int allow_unknown)
{
        struct selinux_mapping *entry;
        unsigned int bit, nperms;
        u32 allowed = 0, auditallow = 0, auditdeny = 0;

        if (tclass >= map->size)
                return;

        entry = &map->mapping[tclass];
        nperms = entry->num_perms;

        for (bit = 0; bit < nperms; bit++) {
                const u32 kbit = (u32)1 << bit;
                const int unmapped = !entry->perms[bit];

                if (avd->allowed & entry->perms[bit])
                        allowed |= kbit;
                if (allow_unknown && unmapped)
                        allowed |= kbit;

                if (avd->auditallow & entry->perms[bit])
                        auditallow |= kbit;

                if (avd->auditdeny & entry->perms[bit])
                        auditdeny |= kbit;
                if (!allow_unknown && unmapped)
                        auditdeny |= kbit;
        }

        /*
         * In case the kernel has a bug and requests a permission
         * between num_perms and the maximum permission number, we
         * should audit that denial
         */
        while (bit < (sizeof(u32) * 8)) {
                auditdeny |= (u32)1 << bit;
                bit++;
        }

        avd->allowed = allowed;
        avd->auditallow = auditallow;
        avd->auditdeny = auditdeny;
}

/*
 * Report the mls_enabled setting of the currently active policy.
 *
 * Returns 0 when selinux_initialized() is false; otherwise the
 * policydb.mls_enabled value read from selinux_state.policy inside an
 * RCU read-side critical section.
 */
int security_mls_enabled(void)
{
        int enabled = 0;

        if (selinux_initialized()) {
                struct selinux_policy *cur;

                rcu_read_lock();
                cur = rcu_dereference(selinux_state.policy);
                enabled = cur->policydb.mls_enabled;
                rcu_read_unlock();
        }

        return enabled;
}

/*
 * Return the boolean value of a constraint expression
 * when it is applied to the specified source and target
 * security contexts.
 *
 * The expression is a linked list of nodes evaluated with a small value
 * stack: CEXPR_ATTR and CEXPR_NAMES nodes push one result each, CEXPR_NOT
 * negates the top entry, and CEXPR_AND/CEXPR_OR combine the top two
 * entries into one.  The value left at s[0] is returned.  If a push would
 * exceed CEXPR_MAXDEPTH entries, 0 is returned instead.
 *
 * xcontext is a special beast...  It is used by the validatetrans rules
 * only.  For these rules, scontext is the context before the transition,
 * tcontext is the context after the transition, and xcontext is the context
 * of the process performing the transition.  All other callers of
 * constraint_expr_eval should pass in NULL for xcontext.
 */
static int constraint_expr_eval(struct policydb *policydb,
                                struct context *scontext,
                                struct context *tcontext,
                                struct context *xcontext,
                                struct constraint_expr *cexpr)
{
        u32 val1, val2;
        struct context *c;
        struct role_datum *r1, *r2;
        struct mls_level *l1, *l2;
        struct constraint_expr *e;
        /* evaluation stack and index of its top entry (-1 == empty) */
        int s[CEXPR_MAXDEPTH];
        int sp = -1;

        for (e = cexpr; e; e = e->next) {
                switch (e->expr_type) {
                case CEXPR_NOT:
                        /* unary: needs at least one operand on the stack */
                        BUG_ON(sp < 0);
                        s[sp] = !s[sp];
                        break;
                case CEXPR_AND:
                        /* binary: pop two operands, push their conjunction */
                        BUG_ON(sp < 1);
                        sp--;
                        s[sp] &= s[sp + 1];
                        break;
                case CEXPR_OR:
                        /* binary: pop two operands, push their disjunction */
                        BUG_ON(sp < 1);
                        sp--;
                        s[sp] |= s[sp + 1];
                        break;
                case CEXPR_ATTR:
                        /* stack is full: give up with 0 instead of overrunning s[] */
                        if (sp == (CEXPR_MAXDEPTH - 1))
                                return 0;
                        switch (e->attr) {
                        case CEXPR_USER:
                                val1 = scontext->user;
                                val2 = tcontext->user;
                                break;
                        case CEXPR_TYPE:
                                val1 = scontext->type;
                                val2 = tcontext->type;
                                break;
                        case CEXPR_ROLE:
                                val1 = scontext->role;
                                val2 = tcontext->role;
                                r1 = policydb->role_val_to_struct[val1 - 1];
                                r2 = policydb->role_val_to_struct[val2 - 1];
                                /*
                                 * Dominance operators push their result here
                                 * and "continue" to the next node, skipping
                                 * the generic EQ/NEQ switch further down.
                                 */
                                switch (e->op) {
                                case CEXPR_DOM:
                                        s[++sp] = ebitmap_get_bit(&r1->dominates,
                                                                  val2 - 1);
                                        continue;
                                case CEXPR_DOMBY:
                                        s[++sp] = ebitmap_get_bit(&r2->dominates,
                                                                  val1 - 1);
                                        continue;
                                case CEXPR_INCOMP:
                                        s[++sp] = (!ebitmap_get_bit(&r1->dominates,
                                                                    val2 - 1) &&
                                                   !ebitmap_get_bit(&r2->dominates,
                                                                    val1 - 1));
                                        continue;
                                default:
                                        /* any other op is handled by the val1/val2 comparison below */
                                        break;
                                }
                                break;
                        /*
                         * MLS attributes: select the two levels to compare
                         * (index 0 and 1 of each context's range.level[])
                         * and jump to the shared operator switch at mls_ops.
                         */
                        case CEXPR_L1L2:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(tcontext->range.level[0]);
                                goto mls_ops;
                        case CEXPR_L1H2:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_H1L2:
                                l1 = &(scontext->range.level[1]);
                                l2 = &(tcontext->range.level[0]);
                                goto mls_ops;
                        case CEXPR_H1H2:
                                l1 = &(scontext->range.level[1]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_L1H1:
                                l1 = &(scontext->range.level[0]);
                                l2 = &(scontext->range.level[1]);
                                goto mls_ops;
                        case CEXPR_L2H2:
                                l1 = &(tcontext->range.level[0]);
                                l2 = &(tcontext->range.level[1]);
                                goto mls_ops;
mls_ops:
                                /* every valid op pushes a result and moves to the next node */
                                switch (e->op) {
                                case CEXPR_EQ:
                                        s[++sp] = mls_level_eq(l1, l2);
                                        continue;
                                case CEXPR_NEQ:
                                        s[++sp] = !mls_level_eq(l1, l2);
                                        continue;
                                case CEXPR_DOM:
                                        s[++sp] = mls_level_dom(l1, l2);
                                        continue;
                                case CEXPR_DOMBY:
                                        s[++sp] = mls_level_dom(l2, l1);
                                        continue;
                                case CEXPR_INCOMP:
                                        s[++sp] = mls_level_incomp(l2, l1);
                                        continue;
                                default:
                                        BUG();
                                        return 0;
                                }
                                break;
                        default:
                                BUG();
                                return 0;
                        }

                        /* generic comparison of the user/type/role values chosen above */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = (val1 == val2);
                                break;
                        case CEXPR_NEQ:
                                s[++sp] = (val1 != val2);
                                break;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                case CEXPR_NAMES:
                        /* stack is full: give up with 0 instead of overrunning s[] */
                        if (sp == (CEXPR_MAXDEPTH-1))
                                return 0;
                        /* choose which context supplies the value; source is the default */
                        c = scontext;
                        if (e->attr & CEXPR_TARGET)
                                c = tcontext;
                        else if (e->attr & CEXPR_XTARGET) {
                                c = xcontext;
                                /* an XTARGET node requires the caller to pass xcontext */
                                if (!c) {
                                        BUG();
                                        return 0;
                                }
                        }
                        if (e->attr & CEXPR_USER)
                                val1 = c->user;
                        else if (e->attr & CEXPR_ROLE)
                                val1 = c->role;
                        else if (e->attr & CEXPR_TYPE)
                                val1 = c->type;
                        else {
                                BUG();
                                return 0;
                        }

                        /* test membership of the (1-based) value in the node's name set */
                        switch (e->op) {
                        case CEXPR_EQ:
                                s[++sp] = ebitmap_get_bit(&e->names, val1 - 1);
                                break;
                        case CEXPR_NEQ:
                                s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1);
                                break;
                        default:
                                BUG();
                                return 0;
                        }
                        break;
                default:
                        BUG();
                        return 0;
                }
        }

        /* a well-formed expression leaves exactly one value on the stack */
        BUG_ON(sp != 0);
        return s[0];
}

/*
 * security_dump_masked_av - dumps masked permissions during
 * security_compute_av due to RBAC, MLS/Constraint and Type bounds.
 */
/*
 * hashtab_map() callback used by security_dump_masked_av(): store the
 * permission name (the hash key @k) in the name table passed via @args,
 * at the slot selected by the permission's value.  Always returns 0.
 */
static int dump_masked_av_helper(void *k, void *d, void *args)
{
        char **names = args;
        struct perm_datum *perm = d;

        /* values are 1-based slot numbers; anything outside 1..32 is a bug */
        BUG_ON(perm->value < 1 || perm->value > 32);

        names[perm->value - 1] = (char *)k;
        return 0;
}

/*
 * Emit an AUDIT_SELINUX_ERR record listing the permissions in @permissions
 * that were masked out of an access decision for (@scontext, @tcontext,
 * @tclass).  @reason is copied verbatim into the record.
 *
 * Nothing is logged when @permissions is empty, or when building the
 * permission-name table, converting either context to text, or starting
 * the audit record fails.
 */
static void security_dump_masked_av(struct policydb *policydb,
                                    struct context *scontext,
                                    struct context *tcontext,
                                    u16 tclass,
                                    u32 permissions,
                                    const char *reason)
{
        struct common_datum *common_dat;
        struct class_datum *tclass_dat;
        struct audit_buffer *ab;
        char *tclass_name;
        char *scontext_name = NULL;
        char *tcontext_name = NULL;
        /*
         * Must start out zeroed: only slots for permissions the class (or
         * its common) actually defines are filled in below, and the audit
         * loop relies on a NULL entry to print "????" for the rest.
         */
        char *permission_names[32] = { NULL };
        int index;
        u32 length;
        bool need_comma = false;

        if (!permissions)
                return;

        tclass_name = sym_name(policydb, SYM_CLASSES, tclass - 1);
        tclass_dat = policydb->class_val_to_struct[tclass - 1];
        common_dat = tclass_dat->comdatum;

        /* init permission_names: inherited (common) permissions first */
        if (common_dat &&
            hashtab_map(&common_dat->permissions.table,
                        dump_masked_av_helper, permission_names) < 0)
                goto out;

        /* then the permissions defined by the class itself */
        if (hashtab_map(&tclass_dat->permissions.table,
                        dump_masked_av_helper, permission_names) < 0)
                goto out;

        /* get scontext/tcontext in text form */
        if (context_struct_to_string(policydb, scontext,
                                     &scontext_name, &length) < 0)
                goto out;

        if (context_struct_to_string(policydb, tcontext,
                                     &tcontext_name, &length) < 0)
                goto out;

        /* audit a message */
        ab = audit_log_start(audit_context(),
                             GFP_ATOMIC, AUDIT_SELINUX_ERR);
        if (!ab)
                goto out;

        audit_log_format(ab, "op=security_compute_av reason=%s "
                         "scontext=%s tcontext=%s tclass=%s perms=",
                         reason, scontext_name, tcontext_name, tclass_name);

        for (index = 0; index < 32; index++) {
                /* unsigned shift: 1 << 31 on a signed int is undefined */
                u32 mask = (u32)1 << index;

                if ((mask & permissions) == 0)
                        continue;

                audit_log_format(ab, "%s%s",
                                 need_comma ? "," : "",
                                 permission_names[index]
                                 ? permission_names[index] : "????");
                need_comma = true;
        }
        audit_log_end(ab);
out:
        /* release scontext/tcontext */
        kfree(tcontext_name);
        kfree(scontext_name);
}

/*
 * type_attribute_bounds_av - drop permissions that violate a type bounds
 * constraint.
 *
 * When the source type has a bounding type, the decision for the bounding
 * source (and for the target's bounding type, if it has one) is computed,
 * and every permission in @avd->allowed that the bounding pair is not
 * allowed is cleared.  Cleared permissions are reported through
 * security_dump_masked_av() with reason "bounds".
 */
static void type_attribute_bounds_av(struct policydb *policydb,
                                     struct context *scontext,
                                     struct context *tcontext,
                                     u16 tclass,
                                     struct av_decision *avd)
{
        struct type_datum *stype;
        struct type_datum *ttype;
        struct context bound_src;
        struct context bound_tgt;
        struct context *tgt = tcontext;
        struct av_decision bound_avd;
        u32 violated = 0;

        stype = policydb->type_val_to_struct[scontext->type - 1];
        BUG_ON(!stype);

        /* unbounded source type: nothing to enforce */
        if (!stype->bounds)
                return;

        ttype = policydb->type_val_to_struct[tcontext->type - 1];
        BUG_ON(!ttype);

        memset(&bound_avd, 0, sizeof(bound_avd));

        /* same source context, but with the bounding type substituted */
        memcpy(&bound_src, scontext, sizeof(bound_src));
        bound_src.type = stype->bounds;

        /* substitute the target's bounding type only if it has one */
        if (ttype->bounds) {
                memcpy(&bound_tgt, tcontext, sizeof(bound_tgt));
                bound_tgt.type = ttype->bounds;
                tgt = &bound_tgt;
        }

        context_struct_compute_av(policydb, &bound_src, tgt, tclass,
                                  &bound_avd, NULL);

        /* permissions granted to the child but not to the bounding pair */
        violated = avd->allowed & ~bound_avd.allowed;
        if (likely(!violated))
                return;

        avd->allowed &= ~violated;

        security_dump_masked_av(policydb, scontext, tcontext,
                                tclass, violated, "bounds");
}

/*
 * Record in @xperms which drivers have extended permissions according to
 * @node, and which base permission (ioctl or nlmsg) the rule covers.
 * @xperms->len is set to 1 unconditionally.
 */
void services_compute_xperms_drivers(
                struct extended_perms *xperms,
                struct avtab_node *node)
{
        unsigned int kind = node->datum.u.xperms->specified;
        unsigned int word;

        if (kind == AVTAB_XPERMS_IOCTLDRIVER) {
                xperms->base_perms |= AVC_EXT_IOCTL;
                /* whole-driver rule: merge the rule's driver bitmap */
                for (word = 0; word < ARRAY_SIZE(xperms->drivers.p); word++)
                        xperms->drivers.p[word] |= node->datum.u.xperms->perms.p[word];
        } else if (kind == AVTAB_XPERMS_IOCTLFUNCTION) {
                xperms->base_perms |= AVC_EXT_IOCTL;
                /* per-function rule: flag just the driver it belongs to */
                security_xperm_set(xperms->drivers.p,
                                        node->datum.u.xperms->driver);
        } else if (kind == AVTAB_XPERMS_NLMSG) {
                xperms->base_perms |= AVC_EXT_NLMSG;
                /* nlmsg rule: flag just the driver it belongs to */
                security_xperm_set(xperms->drivers.p,
                                        node->datum.u.xperms->driver);
        }

        xperms->len = 1;
}

/*
 * Compute the access vector decision (and, when @xperms is non-NULL, the
 * extended-permission driver summary) for the given source/target context
 * pair and class.
 *
 * @avd starts out as "nothing allowed, nothing audit-allowed, every denial
 * audited".  Type enforcement rules are then accumulated, after which
 * constraints, the role-allow check for process transitions, and type
 * bounds each remove permissions from @avd->allowed.
 */
static void context_struct_compute_av(struct policydb *policydb,
                                      struct context *scontext,
                                      struct context *tcontext,
                                      u16 tclass,
                                      struct av_decision *avd,
                                      struct extended_perms *xperms)
{
        struct class_datum *tclass_datum;
        struct constraint_node *constraint;
        struct avtab_node *node;
        struct avtab_key avkey;
        struct role_allow *ra;
        struct ebitmap *sattr, *tattr;
        struct ebitmap_node *snode, *tnode;
        unsigned int i, j;
        bool role_change_allowed = false;

        avd->allowed = 0;
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
        if (xperms)
                memset(xperms, 0, sizeof(*xperms));

        if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
                pr_warn_ratelimited("SELinux:  Invalid class %u\n", tclass);
                return;
        }

        tclass_datum = policydb->class_val_to_struct[tclass - 1];

        /*
         * Gather every type enforcement rule that applies to any
         * (source attribute, target attribute) pair for this class.
         */
        avkey.target_class = tclass;
        avkey.specified = AVTAB_AV | AVTAB_XPERMS;
        sattr = &policydb->type_attr_map_array[scontext->type - 1];
        tattr = &policydb->type_attr_map_array[tcontext->type - 1];
        ebitmap_for_each_positive_bit(sattr, snode, i) {
                ebitmap_for_each_positive_bit(tattr, tnode, j) {
                        avkey.source_type = i + 1;
                        avkey.target_type = j + 1;

                        node = avtab_search_node(&policydb->te_avtab, &avkey);
                        while (node) {
                                switch (node->key.specified) {
                                case AVTAB_ALLOWED:
                                        avd->allowed |= node->datum.u.data;
                                        break;
                                case AVTAB_AUDITALLOW:
                                        avd->auditallow |= node->datum.u.data;
                                        break;
                                case AVTAB_AUDITDENY:
                                        avd->auditdeny &= node->datum.u.data;
                                        break;
                                default:
                                        if (xperms &&
                                            (node->key.specified & AVTAB_XPERMS))
                                                services_compute_xperms_drivers(xperms, node);
                                        break;
                                }
                                node = avtab_search_node_next(node, avkey.specified);
                        }

                        /* conditional rules may contribute further permissions */
                        cond_compute_av(&policydb->te_cond_avtab, &avkey,
                                        avd, xperms);
                }
        }

        /*
         * Strip permissions forbidden by a constraint (MLS policy
         * included).  A constraint is only evaluated when it covers at
         * least one permission that is still allowed.
         */
        for (constraint = tclass_datum->constraints; constraint;
             constraint = constraint->next) {
                if (!(constraint->permissions & (avd->allowed)))
                        continue;
                if (constraint_expr_eval(policydb, scontext, tcontext, NULL,
                                         constraint->expr))
                        continue;
                avd->allowed &= ~(constraint->permissions);
        }

        /*
         * A process transition that changes role is only permitted when a
         * role-allow rule exists for the (current_role, new_role) pair.
         */
        if (tclass == policydb->process_class &&
            (avd->allowed & policydb->process_trans_perms) &&
            scontext->role != tcontext->role) {
                for (ra = policydb->role_allow; ra; ra = ra->next) {
                        if (ra->role == scontext->role &&
                            ra->new_role == tcontext->role) {
                                role_change_allowed = true;
                                break;
                        }
                }
                if (!role_change_allowed)
                        avd->allowed &= ~policydb->process_trans_perms;
        }

        /*
         * Finally apply type bounds: violated permissions are masked and
         * reported to userspace via audit.
         */
        type_attribute_bounds_av(policydb, scontext, tcontext,
                                 tclass, avd);
}

/*
 * Handle a failed validatetrans constraint: log the old, new and task
 * contexts to audit (only if all three can be rendered as text), then
 * return -EPERM when enforcing or 0 otherwise.
 */
static int security_validtrans_handle_fail(struct selinux_policy *policy,
                                        struct sidtab_entry *oentry,
                                        struct sidtab_entry *nentry,
                                        struct sidtab_entry *tentry,
                                        u16 tclass)
{
        struct sidtab *sidtab = policy->sidtab;
        struct policydb *p = &policy->policydb;
        char *old_str = NULL, *new_str = NULL, *task_str = NULL;
        u32 old_len, new_len, task_len;

        /* stop converting at the first failure; skip the audit record then */
        if (!sidtab_entry_to_string(p, sidtab, oentry, &old_str, &old_len) &&
            !sidtab_entry_to_string(p, sidtab, nentry, &new_str, &new_len) &&
            !sidtab_entry_to_string(p, sidtab, tentry, &task_str, &task_len))
                audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "op=security_validate_transition seresult=denied"
                          " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s",
                          old_str, new_str, task_str,
                          sym_name(p, SYM_CLASSES, tclass-1));

        kfree(old_str);
        kfree(new_str);
        kfree(task_str);

        return enforcing_enabled() ? -EPERM : 0;
}

/*
 * Evaluate the validatetrans constraints of a class for a transition from
 * @oldsid to @newsid performed by @tasksid.
 *
 * @orig_tclass: class value; translated with unmap_class() unless @user
 * @user: true when the class value is used as given and a failed
 *        constraint simply yields -EPERM; false when the failure is
 *        passed to security_validtrans_handle_fail()
 *
 * Returns 0 if SELinux is not initialized or every constraint holds,
 * -EINVAL for an out-of-range class or an unrecognized SID, and otherwise
 * the result of the first failing constraint as described above.
 */
static int security_compute_validatetrans(u32 oldsid, u32 newsid, u32 tasksid,
                                          u16 orig_tclass, bool user)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *oentry;
        struct sidtab_entry *nentry;
        struct sidtab_entry *tentry;
        struct class_datum *tclass_datum;
        struct constraint_node *constraint;
        u16 tclass;
        int rc = 0;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();

        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (!user)
                tclass = unmap_class(&policy->map, orig_tclass);
        else
                tclass = orig_tclass;

        if (!tclass || tclass > policydb->p_classes.nprim) {
                rc = -EINVAL;
                goto out;
        }
        tclass_datum = policydb->class_val_to_struct[tclass - 1];

        /* SIDs are u32, so they are printed with %u */
        oentry = sidtab_search_entry(sidtab, oldsid);
        if (!oentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, oldsid);
                rc = -EINVAL;
                goto out;
        }

        nentry = sidtab_search_entry(sidtab, newsid);
        if (!nentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, newsid);
                rc = -EINVAL;
                goto out;
        }

        tentry = sidtab_search_entry(sidtab, tasksid);
        if (!tentry) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, tasksid);
                rc = -EINVAL;
                goto out;
        }

        /* the first constraint that evaluates false decides the outcome */
        constraint = tclass_datum->validatetrans;
        while (constraint) {
                if (!constraint_expr_eval(policydb, &oentry->context,
                                          &nentry->context, &tentry->context,
                                          constraint->expr)) {
                        if (user)
                                rc = -EPERM;
                        else
                                rc = security_validtrans_handle_fail(policy,
                                                                oentry,
                                                                nentry,
                                                                tentry,
                                                                tclass);
                        goto out;
                }
                constraint = constraint->next;
        }

out:
        rcu_read_unlock();
        return rc;
}

/*
 * Validate a transition with the "user" flag set: @tclass is handed to
 * security_compute_validatetrans() to be used as given, and a failed
 * constraint is reported as -EPERM.
 */
int security_validate_transition_user(u32 oldsid, u32 newsid, u32 tasksid,
                                      u16 tclass)
{
        int rc;

        rc = security_compute_validatetrans(oldsid, newsid, tasksid,
                                            tclass, true);
        return rc;
}

/*
 * Validate a transition with the "user" flag clear: @orig_tclass is
 * translated by security_compute_validatetrans() before use, and a failed
 * constraint goes through security_validtrans_handle_fail().
 */
int security_validate_transition(u32 oldsid, u32 newsid, u32 tasksid,
                                 u16 orig_tclass)
{
        int rc;

        rc = security_compute_validatetrans(oldsid, newsid, tasksid,
                                            orig_tclass, false);
        return rc;
}

/*
 * security_bounded_transition - check whether a transition stays within
 * type bounds.
 *
 * @old_sid : current security identifier
 * @new_sid : destination security identifier
 *
 * Returns 0 if SELinux is not initialized, if both SIDs have the same
 * type, or if the type of @old_sid is found while walking the chain of
 * bounding types starting at the type of @new_sid.  Returns -EINVAL for
 * an unrecognized SID, and -EPERM (after logging an audit record) when
 * the chain ends without reaching the type of @old_sid.
 */
int security_bounded_transition(u32 old_sid, u32 new_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct sidtab_entry *from, *to;
        struct type_datum *cur;
        char *from_name = NULL;
        char *to_name = NULL;
        u32 tval;
        u32 len;
        int rc = -EINVAL;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        from = sidtab_search_entry(sidtab, old_sid);
        if (!from) {
                pr_err("SELinux: %s: unrecognized SID %u\n",
                       __func__, old_sid);
                goto out;
        }

        to = sidtab_search_entry(sidtab, new_sid);
        if (!to) {
                pr_err("SELinux: %s: unrecognized SID %u\n",
                       __func__, new_sid);
                goto out;
        }

        /* type/domain unchanged */
        if (from->context.type == to->context.type) {
                rc = 0;
                goto out;
        }

        /* climb the bounds chain from the new type towards the old one */
        for (tval = to->context.type; ; tval = cur->bounds) {
                cur = policydb->type_val_to_struct[tval - 1];
                BUG_ON(!cur);

                if (!cur->bounds) {
                        /* chain ended without reaching the old type */
                        rc = -EPERM;
                        break;
                }
                if (cur->bounds == from->context.type) {
                        /* new type is bounded by the old type */
                        rc = 0;
                        break;
                }
        }

        if (!rc)
                goto out;

        /* denied: log it, provided both contexts can be rendered as text */
        if (!sidtab_entry_to_string(policydb, sidtab, from,
                                    &from_name, &len) &&
            !sidtab_entry_to_string(policydb, sidtab, to,
                                    &to_name, &len)) {
                audit_log(audit_context(),
                          GFP_ATOMIC, AUDIT_SELINUX_ERR,
                          "op=security_bounded_transition "
                          "seresult=denied "
                          "oldcontext=%s newcontext=%s",
                          from_name, to_name);
        }
        kfree(to_name);
        kfree(from_name);
out:
        rcu_read_unlock();

        return rc;
}

/*
 * Reset @avd to its default state: nothing allowed, nothing
 * audit-allowed, every denial audited, no flags.  The sequence number is
 * taken from @policy->latest_granting, or 0 when @policy is NULL.
 */
static void avd_init(struct selinux_policy *policy, struct av_decision *avd)
{
        avd->flags = 0;
        avd->seqno = policy ? policy->latest_granting : 0;
        avd->auditdeny = 0xffffffff;
        avd->auditallow = 0;
        avd->allowed = 0;
}

/*
 * Fold one rule's extended permission data into @xp_data.
 *
 * A whole-driver ioctl rule sets every bit of @xp_data; a per-function
 * ioctl rule or an nlmsg rule ORs in the bits from @from.  Any other
 * @specified value leaves @xp_data untouched.
 */
static void update_xperms_extended_data(u8 specified,
                                        const struct extended_perms_data *from,
                                        struct extended_perms_data *xp_data)
{
        unsigned int word;

        if (specified == AVTAB_XPERMS_IOCTLDRIVER) {
                memset(xp_data->p, 0xff, sizeof(xp_data->p));
                return;
        }

        if (specified != AVTAB_XPERMS_IOCTLFUNCTION &&
            specified != AVTAB_XPERMS_NLMSG)
                return;

        for (word = 0; word < ARRAY_SIZE(xp_data->p); word++)
                xp_data->p[word] |= from->p[word];
}

void services_compute_xperms_decision(struct extended_perms_decision *xpermd,
                                        struct avtab_node *node)
{
        u16 specified;

        switch (node->datum.u.xperms->specified) {
        case AVTAB_XPERMS_IOCTLFUNCTION:
                if (xpermd->base_perm != AVC_EXT_IOCTL ||
                    xpermd->driver != node->datum.u.xperms->driver)
                        return;
                break;
        case AVTAB_XPERMS_IOCTLDRIVER:
                if (xpermd->base_perm != AVC_EXT_IOCTL ||
                    !security_xperm_test(node->datum.u.xperms->perms.p,
                                         xpermd->driver))
                        return;
                break;
        case AVTAB_XPERMS_NLMSG:
                if (xpermd->base_perm != AVC_EXT_NLMSG ||
                    xpermd->driver != node->datum.u.xperms->driver)
                        return;
                break;
        default:
                pr_warn_once(
                        "SELinux: unknown extended permission (%u) will be ignored\n",
                        node->datum.u.xperms->specified);
                return;
        }

        specified = node->key.specified & ~(AVTAB_ENABLED | AVTAB_ENABLED_OLD);

        if (specified == AVTAB_XPERMS_ALLOWED) {
                xpermd->used |= XPERMS_ALLOWED;
                update_xperms_extended_data(node->datum.u.xperms->specified,
                                            &node->datum.u.xperms->perms,
                                            xpermd->allowed);
        } else if (specified == AVTAB_XPERMS_AUDITALLOW) {
                xpermd->used |= XPERMS_AUDITALLOW;
                update_xperms_extended_data(node->datum.u.xperms->specified,
                                            &node->datum.u.xperms->perms,
                                            xpermd->auditallow);
        } else if (specified == AVTAB_XPERMS_DONTAUDIT) {
                xpermd->used |= XPERMS_DONTAUDIT;
                update_xperms_extended_data(node->datum.u.xperms->specified,
                                            &node->datum.u.xperms->perms,
                                            xpermd->dontaudit);
        } else {
                pr_warn_once("SELinux: unknown specified key (%u)\n",
                             node->key.specified);
        }
}

/*
 * Compute the extended-permission decision for one driver of a base
 * permission, for the SID pair (@ssid, @tsid) and class @orig_tclass.
 *
 * @xpermd is reset first (base_perm/driver recorded, used cleared, all
 * three permission sets zeroed).  If SELinux is not initialized, or the
 * class is unknown to the policy and the policy allows unknown classes,
 * every bit of the allowed set is turned on.  An unrecognized SID or an
 * invalid class leaves the zeroed decision in place.  Otherwise all
 * matching unconditional and conditional xperms rules are applied.
 */
void security_compute_xperms_decision(u32 ssid,
                                      u32 tsid,
                                      u16 orig_tclass,
                                      u8 driver,
                                      u8 base_perm,
                                      struct extended_perms_decision *xpermd)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        u16 tclass;
        struct context *scontext, *tcontext;
        struct avtab_key avkey;
        struct avtab_node *node;
        struct ebitmap *sattr, *tattr;
        struct ebitmap_node *snode, *tnode;
        unsigned int i, j;

        xpermd->base_perm = base_perm;
        xpermd->driver = driver;
        xpermd->used = 0;
        memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p));
        memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p));
        memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p));

        rcu_read_lock();
        if (!selinux_initialized())
                goto allow;

        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* SIDs are u32, so they are printed with %u */
        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, ssid);
                goto out;
        }

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, tsid);
                goto out;
        }

        /* a non-zero class that does not map is unknown to the policy */
        tclass = unmap_class(&policy->map, orig_tclass);
        if (unlikely(orig_tclass && !tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }

        if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) {
                pr_warn_ratelimited("SELinux:  Invalid class %hu\n", tclass);
                goto out;
        }

        /* walk every (source attribute, target attribute) pair */
        avkey.target_class = tclass;
        avkey.specified = AVTAB_XPERMS;
        sattr = &policydb->type_attr_map_array[scontext->type - 1];
        tattr = &policydb->type_attr_map_array[tcontext->type - 1];
        ebitmap_for_each_positive_bit(sattr, snode, i) {
                ebitmap_for_each_positive_bit(tattr, tnode, j) {
                        avkey.source_type = i + 1;
                        avkey.target_type = j + 1;
                        for (node = avtab_search_node(&policydb->te_avtab,
                                                      &avkey);
                             node;
                             node = avtab_search_node_next(node, avkey.specified))
                                services_compute_xperms_decision(xpermd, node);

                        cond_compute_xperms(&policydb->te_cond_avtab,
                                                &avkey, xpermd);
                }
        }
out:
        rcu_read_unlock();
        return;
allow:
        /* grant everything, then leave through the common unlock path */
        memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p));
        goto out;
}

/**
 * security_compute_av - Compute access vector decisions.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @orig_tclass: target security class
 * @avd: access vector decisions
 * @xperms: extended permissions
 *
 * Compute a set of access vector decisions based on the
 * SID pair (@ssid, @tsid) for the permissions in @orig_tclass.
 *
 * @avd is reset with avd_init() and @xperms->len is cleared first.  If
 * SELinux is not initialized, or @orig_tclass is unknown to the policy
 * and the policy allows unknown classes, every permission is allowed.
 * An unrecognized SID leaves the reset decision in place.
 */
void security_compute_av(u32 ssid,
                         u32 tsid,
                         u16 orig_tclass,
                         struct av_decision *avd,
                         struct extended_perms *xperms)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        u16 tclass;
        struct context *scontext = NULL, *tcontext = NULL;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        /* avd_init() accepts a NULL policy, so this is safe before the check below */
        avd_init(policy, avd);
        xperms->len = 0;
        if (!selinux_initialized())
                goto allow;

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* SIDs are u32, so they are printed with %u */
        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, ssid);
                goto out;
        }

        /* permissive domain? */
        if (ebitmap_get_bit(&policydb->permissive_map, scontext->type))
                avd->flags |= AVD_FLAGS_PERMISSIVE;

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, tsid);
                goto out;
        }

        /* a non-zero class that does not map is unknown to the policy */
        tclass = unmap_class(&policy->map, orig_tclass);
        if (unlikely(orig_tclass && !tclass)) {
                if (policydb->allow_unknown)
                        goto allow;
                goto out;
        }
        context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
                                  xperms);
        /* translate the policy's permission bits back for @orig_tclass */
        map_decision(&policy->map, orig_tclass, avd,
                     policydb->allow_unknown);
out:
        rcu_read_unlock();
        return;
allow:
        /* grant everything, then leave through the common unlock path */
        avd->allowed = 0xffffffff;
        goto out;
}

/*
 * Variant of security_compute_av() in which @tclass is handed to
 * context_struct_compute_av() as-is (no unmap_class()/map_decision()
 * translation) and no extended permissions are collected.
 *
 * All permissions are reported as allowed when no policy has been
 * initialized yet, or when @tclass is zero and the policy sets
 * allow_unknown.  An unrecognized SID is logged and no permissions are
 * computed.
 */
void security_compute_av_user(u32 ssid,
                              u32 tsid,
                              u16 tclass,
                              struct av_decision *avd)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *scontext, *tcontext;
        bool grant_all = false;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        avd_init(policy, avd);

        /* Nothing to evaluate against until a policy has been set up. */
        if (!selinux_initialized()) {
                grant_all = true;
                goto out;
        }

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        scontext = sidtab_search(sidtab, ssid);
        if (!scontext) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, ssid);
                goto out;
        }

        /* Flag the decision when the source type is in the permissive map. */
        if (ebitmap_get_bit(&policydb->permissive_map, scontext->type))
                avd->flags |= AVD_FLAGS_PERMISSIVE;

        tcontext = sidtab_search(sidtab, tsid);
        if (!tcontext) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, tsid);
                goto out;
        }

        if (unlikely(!tclass)) {
                /* No class given: defer to allow_unknown. */
                grant_all = policydb->allow_unknown;
                goto out;
        }

        context_struct_compute_av(policydb, scontext, tcontext, tclass, avd,
                                  NULL);
out:
        if (grant_all)
                avd->allowed = 0xffffffff;
        rcu_read_unlock();
}

/*
 * Produce the string representation of the context structure @context.
 *
 * @scontext_len always receives the length of the representation
 * (including the terminating NUL for the "user:role:type[mls]" form).
 * If @scontext is non-NULL it receives a freshly allocated (GFP_ATOMIC)
 * copy of the string, which the caller must kfree(); if it is NULL only
 * the length is computed.
 *
 * A context that carries its own string (context->len != 0) is
 * returned verbatim via kstrdup().
 *
 * Returns 0 on success or -ENOMEM if the allocation fails.
 */
static int context_struct_to_string(struct policydb *p,
                                    struct context *context,
                                    char **scontext, u32 *scontext_len)
{
        const char *user, *role, *type;
        char *buf, *cursor;
        u32 total;

        if (scontext)
                *scontext = NULL;
        *scontext_len = 0;

        /* The context already holds a string: hand back a copy of it. */
        if (context->len) {
                *scontext_len = context->len;
                if (!scontext)
                        return 0;
                *scontext = kstrdup(context->str, GFP_ATOMIC);
                return *scontext ? 0 : -ENOMEM;
        }

        user = sym_name(p, SYM_USERS, context->user - 1);
        role = sym_name(p, SYM_ROLES, context->role - 1);
        type = sym_name(p, SYM_TYPES, context->type - 1);

        /*
         * Size of the result: each name plus one byte for its trailing
         * separator (or the final NUL), followed by the MLS portion.
         */
        total = strlen(user) + 1;
        total += strlen(role) + 1;
        total += strlen(type) + 1;
        total += mls_compute_context_len(p, context);
        *scontext_len = total;

        if (!scontext)
                return 0;

        /* The caller owns and must free this buffer. */
        buf = kmalloc(*scontext_len, GFP_ATOMIC);
        if (!buf)
                return -ENOMEM;
        *scontext = buf;

        /* user:role:type first, then let the MLS code append its part. */
        cursor = buf + sprintf(buf, "%s:%s:%s", user, role, type);
        mls_sid_to_context(p, context, &cursor);
        *cursor = 0;

        return 0;
}

/*
 * Obtain the string form of the context held by sidtab entry @entry.
 *
 * sidtab_sid2str_get() is consulted first; only when it reports -ENOENT
 * is the string built with context_struct_to_string(), and a successfully
 * built string is then offered back through sidtab_sid2str_put().
 *
 * Returns 0 on success or a negative errno.
 */
static int sidtab_entry_to_string(struct policydb *p,
                                  struct sidtab *sidtab,
                                  struct sidtab_entry *entry,
                                  char **scontext, u32 *scontext_len)
{
        int rc;

        rc = sidtab_sid2str_get(sidtab, entry, scontext, scontext_len);
        if (rc == -ENOENT) {
                rc = context_struct_to_string(p, &entry->context, scontext,
                                              scontext_len);
                if (rc == 0 && scontext)
                        sidtab_sid2str_put(sidtab, entry, *scontext,
                                           *scontext_len);
        }

        return rc;
}

#include "initial_sid_to_string.h"

/*
 * Write the hash statistics of the active policy's sidtab into @page by
 * calling sidtab_hash_stats() under the RCU read lock.
 *
 * Returns the value of sidtab_hash_stats(), or -EINVAL (after logging an
 * error) when no policy has been initialized yet.
 */
int security_sidtab_hash_stats(char *page)
{
        struct selinux_policy *policy;
        int rc = -EINVAL;

        if (selinux_initialized()) {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                rc = sidtab_hash_stats(policy->sidtab, page);
                rcu_read_unlock();
        } else {
                pr_err("SELinux: %s:  called before initial load_policy\n",
                       __func__);
        }

        return rc;
}

/*
 * Look up the initial_sid_to_string[] entry for @sid.
 *
 * Returns NULL when @sid is greater than SECINITSID_NUM; otherwise
 * returns the table entry for @sid.
 */
const char *security_get_initial_sid_context(u32 sid)
{
        const char *name;

        if (unlikely(sid > SECINITSID_NUM))
                name = NULL;
        else
                name = initial_sid_to_string[sid];

        return name;
}

/*
 * Common implementation behind the security_sid_to_context*() family.
 *
 * @sid: SID to translate
 * @scontext: receives an allocated string (caller frees), may be NULL when
 *            only the length is wanted
 * @scontext_len: receives the length of the string
 * @force: non-zero to look the SID up with sidtab_search_entry_force()
 *         instead of sidtab_search_entry()
 * @only_invalid: non-zero to produce output only for entries whose context
 *                carries a string (context.len != 0); other entries yield
 *                success with *@scontext == NULL and *@scontext_len == 0
 *
 * Before a policy is initialized only SIDs up to SECINITSID_NUM are
 * accepted, and they are translated through initial_sid_to_string[].
 *
 * Returns 0 on success, -EINVAL for an unknown SID, -ENOMEM on allocation
 * failure, or an error from sidtab_entry_to_string().
 */
static int security_sid_to_context_core(u32 sid, char **scontext,
                                        u32 *scontext_len, int force,
                                        int only_invalid)
{
        struct selinux_policy *policy;
        struct sidtab_entry *entry;
        struct sidtab *sidtab;
        int rc;

        if (scontext)
                *scontext = NULL;
        *scontext_len  = 0;

        if (!selinux_initialized()) {
                const char *name;
                char *copy;

                if (sid > SECINITSID_NUM) {
                        pr_err("SELinux: %s:  called before initial "
                               "load_policy on unknown SID %d\n", __func__, sid);
                        return -EINVAL;
                }

                /*
                 * Before the policy is loaded, translate
                 * SECINITSID_INIT to "kernel", because systemd and
                 * libselinux < 2.6 take a getcon_raw() result that is
                 * both non-null and not "kernel" to mean that a policy
                 * is already loaded.
                 */
                if (sid == SECINITSID_INIT)
                        sid = SECINITSID_KERNEL;

                name = initial_sid_to_string[sid];
                if (!name)
                        return -EINVAL;

                /* Reported length includes the terminating NUL. */
                *scontext_len = strlen(name) + 1;
                if (!scontext)
                        return 0;

                copy = kmemdup(name, *scontext_len, GFP_ATOMIC);
                if (!copy)
                        return -ENOMEM;
                *scontext = copy;
                return 0;
        }

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        sidtab = policy->sidtab;

        entry = force ? sidtab_search_entry_force(sidtab, sid)
                      : sidtab_search_entry(sidtab, sid);

        if (!entry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                        __func__, sid);
                rc = -EINVAL;
        } else if (only_invalid && !entry->context.len) {
                /* Entry has no stored string: nothing to report. */
                rc = 0;
        } else {
                rc = sidtab_entry_to_string(&policy->policydb, sidtab, entry,
                                            scontext, scontext_len);
        }

        rcu_read_unlock();
        return rc;
}

/**
 * security_sid_to_context - Obtain a context for a given SID.
 * @sid: security identifier, SID
 * @scontext: security context
 * @scontext_len: length in bytes
 *
 * Write the string representation of the context associated with @sid
 * into a dynamically allocated string of the correct size.  Set @scontext
 * to point to this string and set @scontext_len to the length of the string.
 *
 * Return: the result of security_sid_to_context_core() called without the
 * force and only-invalid options.
 */
int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len)
{
        const int force = 0;
        const int only_invalid = 0;

        return security_sid_to_context_core(sid, scontext, scontext_len,
                                            force, only_invalid);
}

/*
 * Same as security_sid_to_context(), except that the core routine is asked
 * to look the SID up in "force" mode (sidtab_search_entry_force()).
 */
int security_sid_to_context_force(u32 sid,
                                  char **scontext, u32 *scontext_len)
{
        const int force = 1;
        const int only_invalid = 0;

        return security_sid_to_context_core(sid, scontext, scontext_len,
                                            force, only_invalid);
}

/**
 * security_sid_to_context_inval - Obtain a context for a given SID if it
 *                                 is invalid.
 * @sid: security identifier, SID
 * @scontext: security context
 * @scontext_len: length in bytes
 *
 * Write the string representation of the context associated with @sid
 * into a dynamically allocated string of the correct size, but only if the
 * context is invalid in the current policy.  Set @scontext to point to
 * this string (or NULL if the context is valid) and set @scontext_len to
 * the length of the string (or 0 if the context is valid).
 *
 * Return: the result of security_sid_to_context_core() called with both the
 * force and only-invalid options enabled.
 */
int security_sid_to_context_inval(u32 sid,
                                  char **scontext, u32 *scontext_len)
{
        const int force = 1;
        const int only_invalid = 1;

        return security_sid_to_context_core(sid, scontext, scontext_len,
                                            force, only_invalid);
}

/*
 * Parse the context string @scontext ("user:role:type" followed by an
 * optional MLS part) into the context structure @ctx using policy @pol.
 *
 * @sidtabp and @def_sid are only passed through to mls_context_to_sid().
 *
 * Caveat:  Mutates scontext.  The ':' separators (and whatever character
 * ends the type field) are overwritten with NUL bytes while parsing, so the
 * buffer cannot be parsed a second time without first being restored.
 *
 * Returns 0 on success.  Returns -EINVAL if a field is missing, a name is
 * not found in the policy, the type names an attribute, or the resulting
 * context is not valid; an error from mls_context_to_sid() is returned
 * as-is.  On any failure @ctx is destroyed before returning.
 */
static int string_to_context_struct(struct policydb *pol,
                                    struct sidtab *sidtabp,
                                    char *scontext,
                                    struct context *ctx,
                                    u32 def_sid)
{
        struct role_datum *role;
        struct type_datum *typdatum;
        struct user_datum *usrdatum;
        char *scontextp, *p, oldc;
        int rc = 0;

        context_init(ctx);

        /* Parse the security context. */

        /* Every early exit below is a parse failure unless rc is reset. */
        rc = -EINVAL;
        scontextp = scontext;

        /* Extract the user. */
        p = scontextp;
        while (*p && *p != ':')
                p++;

        /* The user must be followed by a ':'. */
        if (*p == 0)
                goto out;

        /* Terminate the user name in place and step past the separator. */
        *p++ = 0;

        usrdatum = symtab_search(&pol->p_users, scontextp);
        if (!usrdatum)
                goto out;

        ctx->user = usrdatum->value;

        /* Extract role. */
        scontextp = p;
        while (*p && *p != ':')
                p++;

        /* The role must be followed by a ':' as well. */
        if (*p == 0)
                goto out;

        *p++ = 0;

        role = symtab_search(&pol->p_roles, scontextp);
        if (!role)
                goto out;
        ctx->role = role->value;

        /* Extract type. */
        scontextp = p;
        while (*p && *p != ':')
                p++;
        /*
         * The type may end the string, so remember the character that
         * terminated it; it is handed to the MLS parser below.
         */
        oldc = *p;
        *p++ = 0;

        /* Attributes are stored in the type table but are not valid here. */
        typdatum = symtab_search(&pol->p_types, scontextp);
        if (!typdatum || typdatum->attribute)
                goto out;

        ctx->type = typdatum->value;

        /* Let the MLS code handle whatever follows the type. */
        rc = mls_context_to_sid(pol, oldc, p, ctx, sidtabp, def_sid);
        if (rc)
                goto out;

        /* Check the validity of the new context. */
        rc = -EINVAL;
        if (!policydb_context_isvalid(pol, ctx))
                goto out;
        rc = 0;
out:
        /* Do not hand a partially built context back to the caller. */
        if (rc)
                context_destroy(ctx);
        return rc;
}

/*
 * Common implementation behind the security_context_to_sid*() family.
 *
 * @scontext: context string; need not be NUL-terminated
 * @scontext_len: number of bytes of @scontext to use
 * @sid: receives the resulting SID
 * @def_sid: default SID handed to the MLS parsing code
 * @gfp_flags: allocation flags for the temporary string copies
 * @force: non-zero to accept a string that cannot be parsed under the
 *         current policy; it is then stored in uninterpreted form
 *
 * Before a policy is initialized the string is only compared against the
 * initial SID names; anything that does not match maps to
 * SECINITSID_KERNEL.
 *
 * Returns 0 on success, -EINVAL for an empty or invalid context, -ENOMEM
 * on allocation failure, or another error from the parsing/sidtab code.
 */
static int security_context_to_sid_core(const char *scontext, u32 scontext_len,
                                        u32 *sid, u32 def_sid, gfp_t gfp_flags,
                                        int force)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        char *scontext2, *str = NULL;
        struct context context;
        int rc = 0;

        /* An empty security context is never valid. */
        if (!scontext_len)
                return -EINVAL;

        /* Copy the string to allow changes and ensure a NUL terminator */
        scontext2 = kmemdup_nul(scontext, scontext_len, gfp_flags);
        if (!scontext2)
                return -ENOMEM;

        if (!selinux_initialized()) {
                u32 i;

                for (i = 1; i < SECINITSID_NUM; i++) {
                        const char *s = initial_sid_to_string[i];

                        if (s && !strcmp(s, scontext2)) {
                                *sid = i;
                                goto out;
                        }
                }
                *sid = SECINITSID_KERNEL;
                goto out;
        }
        *sid = SECSID_NULL;

        if (force) {
                /* Save another copy for storing in uninterpreted form */
                rc = -ENOMEM;
                str = kstrdup(scontext2, gfp_flags);
                if (!str)
                        goto out;
        }
retry:
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;
        rc = string_to_context_struct(policydb, sidtab, scontext2,
                                      &context, def_sid);
        if (rc == -EINVAL && force) {
                /* Keep the raw string; ownership of str moves to context. */
                context.str = str;
                context.len = strlen(str) + 1;
                str = NULL;
        } else if (rc)
                goto out_unlock;
        rc = sidtab_context_to_sid(sidtab, &context, sid);
        if (rc == -ESTALE) {
                rcu_read_unlock();
                /* Take the raw string back so context_destroy() keeps it. */
                if (context.str) {
                        str = context.str;
                        context.str = NULL;
                }
                context_destroy(&context);
                /*
                 * string_to_context_struct() splits scontext2 in place
                 * (separators are overwritten with NUL), so the working
                 * copy must be restored from the caller's buffer before it
                 * is parsed again; otherwise the retry would see only the
                 * user field and fail with -EINVAL.  The terminating NUL
                 * added by kmemdup_nul() is still in place.
                 */
                memcpy(scontext2, scontext, scontext_len);
                goto retry;
        }
        context_destroy(&context);
out_unlock:
        rcu_read_unlock();
out:
        kfree(scontext2);
        kfree(str);
        return rc;
}

/**
 * security_context_to_sid - Obtain a SID for a given security context.
 * @scontext: security context
 * @scontext_len: length in bytes
 * @sid: security identifier, SID
 * @gfp: context for the allocation
 *
 * Obtains a SID associated with the security context that
 * has the string representation specified by @scontext.
 *
 * Return: -%EINVAL if the context is invalid, -%ENOMEM if insufficient
 * memory is available, or 0 on success.
 */
int security_context_to_sid(const char *scontext, u32 scontext_len, u32 *sid,
                            gfp_t gfp)
{
        const u32 def_sid = SECSID_NULL;
        const int force = 0;

        return security_context_to_sid_core(scontext, scontext_len, sid,
                                            def_sid, gfp, force);
}

/*
 * Convenience form of security_context_to_sid() for a NUL-terminated
 * @scontext: the length is taken with strlen().
 */
int security_context_str_to_sid(const char *scontext, u32 *sid, gfp_t gfp)
{
        u32 scontext_len = strlen(scontext);

        return security_context_to_sid(scontext, scontext_len, sid, gfp);
}

/**
 * security_context_to_sid_default - Obtain a SID for a given security context,
 * falling back to specified default if needed.
 *
 * @scontext: security context
 * @scontext_len: length in bytes
 * @sid: security identifier, SID
 * @def_sid: default SID to assign on error
 * @gfp_flags: the allocator get-free-page (GFP) flags
 *
 * Obtains a SID associated with the security context that
 * has the string representation specified by @scontext.
 * The default SID is passed to the MLS layer to be used to allow
 * kernel labeling of the MLS field if the MLS field is not present
 * (for upgrading to MLS without full relabel).
 * Implicitly forces adding of the context even if it cannot be mapped yet.
 *
 * Return: -%EINVAL if the context is invalid, -%ENOMEM if insufficient
 * memory is available, or 0 on success.
 */
int security_context_to_sid_default(const char *scontext, u32 scontext_len,
                                    u32 *sid, u32 def_sid, gfp_t gfp_flags)
{
        const int force = 1;

        return security_context_to_sid_core(scontext, scontext_len, sid,
                                            def_sid, gfp_flags, force);
}

/*
 * Like security_context_to_sid(), but in "force" mode: the core routine is
 * asked to accept the string even if it cannot be parsed under the current
 * policy.  Allocations use GFP_KERNEL and no default SID is supplied.
 */
int security_context_to_sid_force(const char *scontext, u32 scontext_len,
                                  u32 *sid)
{
        const u32 def_sid = SECSID_NULL;
        const int force = 1;

        return security_context_to_sid_core(scontext, scontext_len, sid,
                                            def_sid, GFP_KERNEL, force);
}

/*
 * Report that security_compute_sid() produced a context that is not valid
 * under the policy.
 *
 * An AUDIT_SELINUX_ERR record naming the invalid context, the source and
 * target contexts and the class is emitted on a best-effort basis: if any
 * of the strings or the audit buffer cannot be obtained, the record is
 * silently skipped.
 *
 * Returns -EACCES when enforcing is enabled, 0 otherwise.
 */
static int compute_sid_handle_invalid_context(
        struct selinux_policy *policy,
        struct sidtab_entry *sentry,
        struct sidtab_entry *tentry,
        u16 tclass,
        struct context *newcontext)
{
        struct policydb *policydb = &policy->policydb;
        struct sidtab *sidtab = policy->sidtab;
        char *s = NULL, *t = NULL, *n = NULL;
        u32 slen, tlen, nlen;
        struct audit_buffer *ab = NULL;
        int failed;

        /* Short-circuit: stop converting at the first failure. */
        failed = sidtab_entry_to_string(policydb, sidtab, sentry, &s, &slen) ||
                 sidtab_entry_to_string(policydb, sidtab, tentry, &t, &tlen) ||
                 context_struct_to_string(policydb, newcontext, &n, &nlen);

        if (!failed)
                ab = audit_log_start(audit_context(), GFP_ATOMIC,
                                     AUDIT_SELINUX_ERR);

        if (ab) {
                audit_log_format(ab,
                                 "op=security_compute_sid invalid_context=");
                /* no need to record the NUL with untrusted strings */
                audit_log_n_untrustedstring(ab, n, nlen - 1);
                audit_log_format(ab, " scontext=%s tcontext=%s tclass=%s",
                                 s, t,
                                 sym_name(policydb, SYM_CLASSES, tclass-1));
                audit_log_end(ab);
        }

        kfree(s);
        kfree(t);
        kfree(n);

        return enforcing_enabled() ? -EACCES : 0;
}

/*
 * Apply a filename type-transition rule, if one matches.
 *
 * Searches the policy's filename transitions for (@ttype, @tclass,
 * @objname) and, on the first datum whose source-type bitmap contains
 * @stype, stores that datum's output type in @newcontext->type.
 * @newcontext is left untouched when nothing matches.
 */
static void filename_compute_type(struct policydb *policydb,
                                  struct context *newcontext,
                                  u32 stype, u32 ttype, u16 tclass,
                                  const char *objname)
{
        struct filename_trans_datum *match;
        struct filename_trans_key key;

        /*
         * Most filename trans rules are going to live in specific directories
         * like /dev or /var/run.  This bitmap will quickly skip rule searches
         * if the ttype does not contain any rules.
         */
        if (!ebitmap_get_bit(&policydb->filename_trans_ttypes, ttype))
                return;

        key.ttype = ttype;
        key.tclass = tclass;
        key.name = objname;

        for (match = policydb_filenametr_search(policydb, &key);
             match;
             match = match->next) {
                if (!ebitmap_get_bit(&match->stypes, stype - 1))
                        continue;

                newcontext->type = match->otype;
                break;
        }
}

/*
 * Compute the SID for a new or relabeled subject/object.
 *
 * @ssid: source SID
 * @tsid: target SID
 * @orig_tclass: object class; translated with unmap_class() when @kern is
 *               true, used as-is otherwise
 * @specified: which rule kind to apply (AVTAB_TRANSITION, AVTAB_CHANGE or
 *             AVTAB_MEMBER)
 * @objname: object name for filename transitions, or NULL
 * @out_sid: receives the computed SID
 * @kern: true when @orig_tclass must be translated with unmap_class()
 *
 * The new context is assembled field by field (user, role, type, then the
 * MLS part) from the source and target contexts, the class defaults and
 * any matching transition rules, validated, and finally converted to a SID
 * via the sidtab.  Before a policy is initialized the result is simply
 * @ssid for SECCLASS_PROCESS and @tsid for every other class.
 *
 * Returns 0 on success, -EINVAL for an unrecognized SID, or another
 * negative errno from the MLS, validation or sidtab code.
 */
static int security_compute_sid(u32 ssid,
                                u32 tsid,
                                u16 orig_tclass,
                                u16 specified,
                                const char *objname,
                                u32 *out_sid,
                                bool kern)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct class_datum *cladatum;
        struct context *scontext, *tcontext, newcontext;
        struct sidtab_entry *sentry, *tentry;
        struct avtab_key avkey;
        struct avtab_node *avnode, *node;
        u16 tclass;
        int rc = 0;
        bool sock;

        /* Without a policy there are no rules to apply. */
        if (!selinux_initialized()) {
                switch (orig_tclass) {
                case SECCLASS_PROCESS: /* kernel value */
                        *out_sid = ssid;
                        break;
                default:
                        *out_sid = tsid;
                        break;
                }
                goto out;
        }

retry:
        /* Everything below is redone from scratch on -ESTALE. */
        cladatum = NULL;
        context_init(&newcontext);

        rcu_read_lock();

        policy = rcu_dereference(selinux_state.policy);

        /*
         * The socket test is always made on the kernel-side class value:
         * @orig_tclass directly when @kern is set, otherwise the result of
         * map_class() on the supplied value.
         */
        if (kern) {
                tclass = unmap_class(&policy->map, orig_tclass);
                sock = security_is_socket_class(orig_tclass);
        } else {
                tclass = orig_tclass;
                sock = security_is_socket_class(map_class(&policy->map,
                                                          tclass));
        }

        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        sentry = sidtab_search_entry(sidtab, ssid);
        if (!sentry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, ssid);
                rc = -EINVAL;
                goto out_unlock;
        }
        tentry = sidtab_search_entry(sidtab, tsid);
        if (!tentry) {
                pr_err("SELinux: %s:  unrecognized SID %d\n",
                       __func__, tsid);
                rc = -EINVAL;
                goto out_unlock;
        }

        scontext = &sentry->context;
        tcontext = &tentry->context;

        /* cladatum stays NULL when the class is outside the policy's range. */
        if (tclass && tclass <= policydb->p_classes.nprim)
                cladatum = policydb->class_val_to_struct[tclass - 1];

        /* Set the user identity. */
        switch (specified) {
        case AVTAB_TRANSITION:
        case AVTAB_CHANGE:
                if (cladatum && cladatum->default_user == DEFAULT_TARGET) {
                        newcontext.user = tcontext->user;
                } else {
                        /* notice this gets both DEFAULT_SOURCE and unset */
                        /* Use the process user identity. */
                        newcontext.user = scontext->user;
                }
                break;
        case AVTAB_MEMBER:
                /* Use the related object owner. */
                newcontext.user = tcontext->user;
                break;
        }

        /* Set the role to default values. */
        if (cladatum && cladatum->default_role == DEFAULT_SOURCE) {
                newcontext.role = scontext->role;
        } else if (cladatum && cladatum->default_role == DEFAULT_TARGET) {
                newcontext.role = tcontext->role;
        } else {
                /* No class default: processes and sockets keep the source role. */
                if ((tclass == policydb->process_class) || sock)
                        newcontext.role = scontext->role;
                else
                        newcontext.role = OBJECT_R_VAL;
        }

        /* Set the type.
         * Look for a type transition/member/change rule.
         */
        avkey.source_type = scontext->type;
        avkey.target_type = tcontext->type;
        avkey.target_class = tclass;
        avkey.specified = specified;
        avnode = avtab_search_node(&policydb->te_avtab, &avkey);

        /* If no permanent rule, also check for enabled conditional rules */
        if (!avnode) {
                node = avtab_search_node(&policydb->te_cond_avtab, &avkey);
                for (; node; node = avtab_search_node_next(node, specified)) {
                        if (node->key.specified & AVTAB_ENABLED) {
                                avnode = node;
                                break;
                        }
                }
        }

        /* If a rule was found (permanent, or an enabled conditional
         * one), use the type from the type transition/member/change
         * rule. Otherwise, set the type to its default values.
         */
        if (avnode) {
                newcontext.type = avnode->datum.u.data;
        } else if (cladatum && cladatum->default_type == DEFAULT_SOURCE) {
                newcontext.type = scontext->type;
        } else if (cladatum && cladatum->default_type == DEFAULT_TARGET) {
                newcontext.type = tcontext->type;
        } else {
                if ((tclass == policydb->process_class) || sock) {
                        /* Use the type of process. */
                        newcontext.type = scontext->type;
                } else {
                        /* Use the type of the related object. */
                        newcontext.type = tcontext->type;
                }
        }

        /*
         * If we have an objname this is a file transition check, so consult
         * the filename transition rules; a match overrides the type chosen
         * above.
         */
        if (objname)
                filename_compute_type(policydb, &newcontext, scontext->type,
                                      tcontext->type, tclass, objname);

        /* Check for class-specific changes. */
        if (specified & AVTAB_TRANSITION) {
                /* Look for a role transition rule. */
                struct role_trans_datum *rtd;
                struct role_trans_key rtk = {
                        .role = scontext->role,
                        .type = tcontext->type,
                        .tclass = tclass,
                };

                rtd = policydb_roletr_search(policydb, &rtk);
                if (rtd)
                        newcontext.role = rtd->new_role;
        }

        /* Set the MLS attributes.
           This is done last because it may allocate memory. */
        rc = mls_compute_sid(policydb, scontext, tcontext, tclass, specified,
                             &newcontext, sock);
        if (rc)
                goto out_unlock;

        /* Check the validity of the context. */
        if (!policydb_context_isvalid(policydb, &newcontext)) {
                /* The handler returns 0 when not enforcing; then carry on. */
                rc = compute_sid_handle_invalid_context(policy, sentry,
                                                        tentry, tclass,
                                                        &newcontext);
                if (rc)
                        goto out_unlock;
        }
        /* Obtain the sid for the context. */
        rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid);
        if (rc == -ESTALE) {
                /* Drop the lock and the partial result, then start over. */
                rcu_read_unlock();
                context_destroy(&newcontext);
                goto retry;
        }
out_unlock:
        rcu_read_unlock();
        context_destroy(&newcontext);
out:
        return rc;
}

/**
 * security_transition_sid - Compute the SID for a new subject/object.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @qstr: object name, may be NULL
 * @out_sid: security identifier for new subject/object
 *
 * Compute a SID to use for labeling a new subject or object in the
 * class @tclass based on a SID pair (@ssid, @tsid).
 *
 * Return: -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the new SID was
 * computed successfully.
 */
int security_transition_sid(u32 ssid, u32 tsid, u16 tclass,
                            const struct qstr *qstr, u32 *out_sid)
{
        const char *objname = NULL;

        /* Only the name component of @qstr is of interest. */
        if (qstr)
                objname = qstr->name;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_TRANSITION,
                                    objname, out_sid, true);
}

/*
 * Variant of security_transition_sid() that takes the object name as a
 * plain string and asks security_compute_sid() to use @tclass without
 * class translation (kern == false).
 */
int security_transition_sid_user(u32 ssid, u32 tsid, u16 tclass,
                                 const char *objname, u32 *out_sid)
{
        const u16 specified = AVTAB_TRANSITION;

        return security_compute_sid(ssid, tsid, tclass, specified,
                                    objname, out_sid, false);
}

/**
 * security_member_sid - Compute the SID for member selection.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @out_sid: security identifier for selected member
 *
 * Compute a SID to use when selecting a member of a polyinstantiated
 * object of class @tclass based on a SID pair (@ssid, @tsid).
 *
 * Return: -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the SID was
 * computed successfully.
 */
int security_member_sid(u32 ssid,
                        u32 tsid,
                        u16 tclass,
                        u32 *out_sid)
{
        const char *no_objname = NULL;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_MEMBER,
                                    no_objname, out_sid, false);
}

/**
 * security_change_sid - Compute the SID for object relabeling.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @out_sid: security identifier to use for the relabeled object
 *
 * Compute a SID to use for relabeling an object of class @tclass
 * based on a SID pair (@ssid, @tsid).
 *
 * Return: -%EINVAL if any of the parameters are invalid, -%ENOMEM
 * if insufficient memory is available, or %0 if the SID was
 * computed successfully.
 */
int security_change_sid(u32 ssid,
                        u32 tsid,
                        u16 tclass,
                        u32 *out_sid)
{
        const char *no_objname = NULL;

        return security_compute_sid(ssid, tsid, tclass, AVTAB_CHANGE,
                                    no_objname, out_sid, false);
}

/*
 * Decide what to do with a context that is invalid under @policydb.
 *
 * Returns -EINVAL when enforcing is enabled.  Otherwise a warning naming
 * the context is printed (skipped if its string form cannot be produced)
 * and 0 is returned.
 */
static inline int convert_context_handle_invalid_context(
        struct policydb *policydb,
        struct context *context)
{
        char *name;
        u32 name_len;

        if (enforcing_enabled())
                return -EINVAL;

        /* Warning is best effort: stay quiet if the string is unavailable. */
        if (context_struct_to_string(policydb, context, &name, &name_len))
                return 0;

        pr_warn("SELinux:  Context %s would be invalid if enforcing\n",
                name);
        kfree(name);

        return 0;
}

/**
 * services_convert_context - Convert a security context across policies.
 * @args: populated convert_context_args struct
 * @oldc: original context
 * @newc: converted context
 * @gfp_flags: allocation flags
 *
 * Convert the values in the security context structure @oldc from the values
 * specified in the policy @args->oldp to the values specified in the policy
 * @args->newp, storing the new context in @newc, and verifying that the
 * context is valid under the new policy.
 *
 * A context that cannot be expressed in the new policy is not an error:
 * @newc then carries only the string form (@newc->str / @newc->len) and
 * 0 is returned.  A negative errno is returned only when the string copy
 * fails (-ENOMEM), when string_to_context_struct() fails with something
 * other than -EINVAL, or when the old context cannot be rendered as a
 * string on the fallback path.
 */
int services_convert_context(struct convert_context_args *args,
                             struct context *oldc, struct context *newc,
                             gfp_t gfp_flags)
{
        struct ocontext *oc;
        struct role_datum *role;
        struct type_datum *typdatum;
        struct user_datum *usrdatum;
        char *s;
        u32 len;
        int rc;

        /*
         * The old context is held only as a string: try to parse that
         * string against the new policy.
         */
        if (oldc->str) {
                /* Parse a private copy; the parser may modify its input. */
                s = kstrdup(oldc->str, gfp_flags);
                if (!s)
                        return -ENOMEM;

                rc = string_to_context_struct(args->newp, NULL, s, newc, SECSID_NULL);
                if (rc == -EINVAL) {
                        /*
                         * Retain string representation for later mapping.
                         *
                         * IMPORTANT: We need to copy the contents of oldc->str
                         * back into s again because string_to_context_struct()
                         * may have garbled it.
                         */
                        memcpy(s, oldc->str, oldc->len);
                        context_init(newc);
                        /* Ownership of s passes to newc here. */
                        newc->str = s;
                        newc->len = oldc->len;
                        return 0;
                }
                /* In every other outcome the working copy is no longer needed. */
                kfree(s);
                if (rc) {
                        /* Other error condition, e.g. ENOMEM. */
                        pr_err("SELinux:   Unable to map context %s, rc = %d.\n",
                               oldc->str, -rc);
                        return rc;
                }
                pr_info("SELinux:  Context %s became valid (mapped).\n",
                        oldc->str);
                return 0;
        }

        context_init(newc);

        /*
         * Each component below is translated by name: the name is fetched
         * from the old policy (index is the old value minus one) and looked
         * up in the new policy's symbol table.  A missing name sends us to
         * the string fallback at "bad".
         */

        /* Convert the user. */
        usrdatum = symtab_search(&args->newp->p_users,
                                 sym_name(args->oldp, SYM_USERS, oldc->user - 1));
        if (!usrdatum)
                goto bad;
        newc->user = usrdatum->value;

        /* Convert the role. */
        role = symtab_search(&args->newp->p_roles,
                             sym_name(args->oldp, SYM_ROLES, oldc->role - 1));
        if (!role)
                goto bad;
        newc->role = role->value;

        /* Convert the type. */
        typdatum = symtab_search(&args->newp->p_types,
                                 sym_name(args->oldp, SYM_TYPES, oldc->type - 1));
        if (!typdatum)
                goto bad;
        newc->type = typdatum->value;

        /* Convert the MLS fields if dealing with MLS policies */
        if (args->oldp->mls_enabled && args->newp->mls_enabled) {
                rc = mls_convert_context(args->oldp, args->newp, oldc, newc);
                if (rc)
                        goto bad;
        } else if (!args->oldp->mls_enabled && args->newp->mls_enabled) {
                /*
                 * Switching between non-MLS and MLS policy:
                 * ensure that the MLS fields of the context for all
                 * existing entries in the sidtab are filled in with a
                 * suitable default value, likely taken from one of the
                 * initial SIDs.
                 */
                /* Find the initial-SID entry for SECINITSID_UNLABELED. */
                oc = args->newp->ocontexts[OCON_ISID];
                while (oc && oc->sid[0] != SECINITSID_UNLABELED)
                        oc = oc->next;
                if (!oc) {
                        pr_err("SELinux:  unable to look up"
                                " the initial SIDs list\n");
                        goto bad;
                }
                /* Borrow that entry's MLS range as the default. */
                rc = mls_range_set(newc, &oc->context[0].range);
                if (rc)
                        goto bad;
        }

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(args->newp, newc)) {
                /* Note: the helper is given the OLD policy and context. */
                rc = convert_context_handle_invalid_context(args->oldp, oldc);
                if (rc)
                        goto bad;
        }

        return 0;
bad:
        /* Map old representation to string and save it. */
        rc = context_struct_to_string(args->oldp, oldc, &s, &len);
        if (rc)
                return rc;
        /* Drop whatever was built in newc so far; keep only the string. */
        context_destroy(newc);
        newc->str = s;
        newc->len = len;
        pr_info("SELinux:  Context %s became invalid (unmapped).\n",
                newc->str);
        return 0;
}

/*
 * Copy the policy capability bits of @policy into selinux_state.policycap
 * and log them: one line per named capability, plus one line for every
 * set bit that has no entry in selinux_policycap_names.
 */
static void security_load_policycaps(struct selinux_policy *policy)
{
        struct policydb *db = &policy->policydb;
        struct ebitmap_node *pos;
        unsigned int cap;

        /* Mirror each capability bit into the global state. */
        for (cap = 0; cap < ARRAY_SIZE(selinux_state.policycap); cap++) {
                WRITE_ONCE(selinux_state.policycap[cap],
                        ebitmap_get_bit(&db->policycaps, cap));
        }

        /* Report the value of every capability that has a name. */
        for (cap = 0; cap < ARRAY_SIZE(selinux_policycap_names); cap++) {
                pr_info("SELinux:  policy capability %s=%d\n",
                        selinux_policycap_names[cap],
                        ebitmap_get_bit(&db->policycaps, cap));
        }

        /* Report set bits that lie beyond the table of names. */
        ebitmap_for_each_positive_bit(&db->policycaps, pos, cap) {
                if (cap < ARRAY_SIZE(selinux_policycap_names))
                        continue;
                pr_info("SELinux:  unknown policy capability %u\n",
                        cap);
        }
}

static int security_preserve_bools(struct selinux_policy *oldpolicy,
                                struct selinux_policy *newpolicy);

static void selinux_policy_free(struct selinux_policy *policy)
{
        if (!policy)
                return;

        sidtab_destroy(policy->sidtab);
        kfree(policy->map.mapping);
        policydb_destroy(&policy->policydb);
        kfree(policy->sidtab);
        kfree(policy);
}

/*
 * Release a policy whose policydb was set up as a conditional duplicate:
 * only the duplicated parts are destroyed (cond_policydb_destroy_dup())
 * before the policy structure itself is freed.
 */
static void selinux_policy_cond_free(struct selinux_policy *policy)
{
        struct policydb *db = &policy->policydb;

        cond_policydb_destroy_dup(db);
        kfree(policy);
}

/**
 * selinux_policy_cancel - Abandon a staged policy load.
 * @load_state: load state previously filled in by security_load_policy()
 *
 * Cancel any SID table conversion that was started against the currently
 * installed policy, then free the staged policy and its conversion data.
 *
 * The installed policy pointer is read under selinux_state.policy_mutex
 * (checked via lockdep).  On the very first policy load there is no
 * installed policy yet and security_load_policy() starts no conversion,
 * so the cancel step is skipped instead of dereferencing a NULL policy.
 */
void selinux_policy_cancel(struct selinux_load_state *load_state)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *oldpolicy;

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* No installed policy means no conversion can be in flight. */
        if (oldpolicy)
                sidtab_cancel_convert(oldpolicy->sidtab);

        /* selinux_policy_free() and kfree() both tolerate NULL. */
        selinux_policy_free(load_state->policy);
        kfree(load_state->convert_data);
}

/*
 * Tell the rest of the system that a policy change with sequence number
 * @seqno has taken effect.  The hooks are invoked in a fixed order; the
 * first three receive @seqno, the remaining ones take no argument.
 */
static void selinux_notify_policy_change(u32 seqno)
{
        /* Flush external caches and notify userspace of policy load */
        avc_ss_reset(seqno);
        selnl_notify_policyload(seqno);
        selinux_status_update_policyload(seqno);
        selinux_netlbl_cache_invalidate();
        selinux_xfrm_notify_policyload();
        /* NOTE(review): "_locked" suggests policy_mutex must be held - confirm. */
        selinux_ima_measure_state_locked();
}

/*
 * selinux_policy_commit - Install a staged policy as the active one.
 * @load_state: load state holding the new policy and its conversion data
 *
 * Publishes load_state->policy through selinux_state.policy, refreshes the
 * policy capabilities, completes initialization on the first load, frees
 * the previous policy after an RCU grace period and finally notifies other
 * subsystems.  The current policy pointer is read under
 * selinux_state.policy_mutex (checked via lockdep).
 */
void selinux_policy_commit(struct selinux_load_state *load_state)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *oldpolicy, *newpolicy = load_state->policy;
        unsigned long flags;
        u32 seqno;

        /* NULL here means this is the first policy load. */
        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* If switching between different policy types, log MLS status */
        if (oldpolicy) {
                if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled)
                        pr_info("SELinux: Disabling MLS support...\n");
                else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled)
                        pr_info("SELinux: Enabling MLS support...\n");
        }

        /* Set latest granting seqno for new policy. */
        if (oldpolicy)
                newpolicy->latest_granting = oldpolicy->latest_granting + 1;
        else
                newpolicy->latest_granting = 1;
        /* Keep a copy for the notification at the end. */
        seqno = newpolicy->latest_granting;

        /* Install the new policy. */
        if (oldpolicy) {
                /*
                 * The pointer swap is bracketed by a freeze of the old
                 * SID table.
                 */
                sidtab_freeze_begin(oldpolicy->sidtab, &flags);
                rcu_assign_pointer(state->policy, newpolicy);
                sidtab_freeze_end(oldpolicy->sidtab, &flags);
        } else {
                rcu_assign_pointer(state->policy, newpolicy);
        }

        /* Load the policycaps from the new policy */
        security_load_policycaps(newpolicy);

        if (!selinux_initialized()) {
                /*
                 * After first policy load, the security server is
                 * marked as initialized and ready to handle requests and
                 * any objects created prior to policy load are then labeled.
                 */
                selinux_mark_initialized();
                selinux_complete_init();
        }

        /* Free the old policy */
        /* Wait for RCU readers that may still hold the old pointer. */
        synchronize_rcu();
        selinux_policy_free(oldpolicy);
        kfree(load_state->convert_data);

        /* Notify others of the policy change */
        selinux_notify_policy_change(seqno);
}

/**
 * security_load_policy - Load a security policy configuration.
 * @data: binary policy data
 * @len: length of data in bytes
 * @load_state: policy load state
 *
 * Load a new set of security policy configuration data,
 * validate it and convert the SID table as necessary.
 *
 * On success the new policy is only staged: it is stored in
 * @load_state->policy (with @load_state->convert_data set to the
 * conversion state, or NULL on the first load) and still has to be
 * installed with selinux_policy_commit() or dropped with
 * selinux_policy_cancel().  This function itself does not publish the
 * policy or notify anyone.
 *
 * Return 0 on success or a negative errno; on failure everything
 * allocated here has been released and @load_state is left untouched.
 */
int security_load_policy(void *data, size_t len,
                         struct selinux_load_state *load_state)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *newpolicy, *oldpolicy;
        struct selinux_policy_convert_data *convert_data;
        int rc = 0;
        struct policy_file file = { data, len }, *fp = &file;

        newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL);
        if (!newpolicy)
                return -ENOMEM;

        newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL);
        if (!newpolicy->sidtab) {
                rc = -ENOMEM;
                goto err_policy;
        }

        /* Parse the binary image into the policy database. */
        rc = policydb_read(&newpolicy->policydb, fp);
        if (rc)
                goto err_sidtab;

        newpolicy->policydb.len = len;
        /* Build the class/permission mapping from secclass_map. */
        rc = selinux_set_mapping(&newpolicy->policydb, secclass_map,
                                &newpolicy->map);
        if (rc)
                goto err_policydb;

        rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab);
        if (rc) {
                pr_err("SELinux:  unable to load the initial SIDs\n");
                goto err_mapping;
        }

        if (!selinux_initialized()) {
                /* First policy load, so no need to preserve state from old policy */
                load_state->policy = newpolicy;
                load_state->convert_data = NULL;
                return 0;
        }

        /* Read under policy_mutex (checked via lockdep). */
        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* Preserve active boolean values from the old policy */
        rc = security_preserve_bools(oldpolicy, newpolicy);
        if (rc) {
                pr_err("SELinux:  unable to preserve booleans\n");
                goto err_free_isids;
        }

        /*
         * Convert the internal representations of contexts
         * in the new SID table.
         */

        convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL);
        if (!convert_data) {
                rc = -ENOMEM;
                goto err_free_isids;
        }

        /* kmalloc() does not zero: every field used is set explicitly. */
        convert_data->args.oldp = &oldpolicy->policydb;
        convert_data->args.newp = &newpolicy->policydb;

        convert_data->sidtab_params.args = &convert_data->args;
        convert_data->sidtab_params.target = newpolicy->sidtab;

        rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params);
        if (rc) {
                pr_err("SELinux:  unable to convert the internal"
                        " representation of contexts in the new SID"
                        " table\n");
                goto err_free_convert_data;
        }

        load_state->policy = newpolicy;
        load_state->convert_data = convert_data;
        return 0;

        /*
         * Error unwinding: the labels undo the setup steps in reverse
         * order, each one falling through to the next.
         */
err_free_convert_data:
        kfree(convert_data);
err_free_isids:
        sidtab_destroy(newpolicy->sidtab);
err_mapping:
        kfree(newpolicy->map.mapping);
err_policydb:
        policydb_destroy(&newpolicy->policydb);
err_sidtab:
        kfree(newpolicy->sidtab);
err_policy:
        kfree(newpolicy);

        return rc;
}

/**
 * ocontext_to_sid - Helper to safely get sid for an ocontext
 * @sidtab: SID table
 * @c: ocontext structure
 * @index: index of the context entry (0 or 1)
 * @out_sid: pointer to the resulting SID value
 *
 * For all ocontexts except OCON_ISID the SID fields are populated
 * on-demand when needed. Since updating the SID value is an SMP-sensitive
 * operation, this helper must be used to do that safely.
 *
 * A stored SID of zero is treated as "not populated yet": the SID is then
 * obtained from sidtab_context_to_sid() and cached in @c->sid[@index].
 *
 * Return 0 with *@out_sid set on success, or the error returned by
 * sidtab_context_to_sid() (in which case *@out_sid is not written).
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c,
                           size_t index, u32 *out_sid)
{
        int rc;
        u32 sid;

        /* Ensure the associated sidtab entry is visible to this thread. */
        sid = smp_load_acquire(&c->sid[index]);
        if (!sid) {
                /* Not cached yet: resolve the context to a SID now. */
                rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid);
                if (rc)
                        return rc;

                /*
                 * Ensure the new sidtab entry is visible to other threads
                 * when they see the SID.
                 */
                smp_store_release(&c->sid[index], sid);
        }
        *out_sid = sid;
        return 0;
}

/**
 * security_port_sid - Obtain the SID for a port.
 * @protocol: protocol number
 * @port: port number
 * @out_sid: security identifier
 *
 * Walk the policy's port contexts and use the first entry whose protocol
 * equals @protocol and whose [low_port, high_port] range contains @port.
 * SECINITSID_PORT is used when no entry matches or when no policy has
 * been loaded yet.
 *
 * Return 0 on success or the error reported by ocontext_to_sid().
 */
int security_port_sid(u8 protocol, u16 port, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_PORT;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);

                for (entry = policy->policydb.ocontexts[OCON_PORT]; entry;
                     entry = entry->next) {
                        if (entry->u.port.protocol != protocol)
                                continue;
                        if (port < entry->u.port.low_port ||
                            port > entry->u.port.high_port)
                                continue;
                        break;
                }

                if (entry)
                        rc = ocontext_to_sid(policy->sidtab, entry, 0, out_sid);
                else
                        *out_sid = SECINITSID_PORT;

                rcu_read_unlock();
                /* -ESTALE asks us to re-read the policy pointer and retry. */
        } while (rc == -ESTALE);

        return rc;
}

/**
 * security_ib_pkey_sid - Obtain the SID for a pkey.
 * @subnet_prefix: Subnet Prefix
 * @pkey_num: pkey number
 * @out_sid: security identifier
 *
 * Walk the policy's IB pkey contexts and use the first entry whose
 * [low_pkey, high_pkey] range contains @pkey_num and whose subnet prefix
 * equals @subnet_prefix.  SECINITSID_UNLABELED is used when no entry
 * matches or when no policy has been loaded yet.
 *
 * Return 0 on success or the error reported by ocontext_to_sid().
 */
int security_ib_pkey_sid(u64 subnet_prefix, u16 pkey_num, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_UNLABELED;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);

                for (entry = policy->policydb.ocontexts[OCON_IBPKEY]; entry;
                     entry = entry->next) {
                        if (pkey_num < entry->u.ibpkey.low_pkey ||
                            pkey_num > entry->u.ibpkey.high_pkey)
                                continue;
                        if (entry->u.ibpkey.subnet_prefix != subnet_prefix)
                                continue;
                        break;
                }

                if (entry)
                        rc = ocontext_to_sid(policy->sidtab, entry, 0, out_sid);
                else
                        *out_sid = SECINITSID_UNLABELED;

                rcu_read_unlock();
                /* -ESTALE asks us to re-read the policy pointer and retry. */
        } while (rc == -ESTALE);

        return rc;
}

/**
 * security_ib_endport_sid - Obtain the SID for a subnet management interface.
 * @dev_name: device name
 * @port_num: port number
 * @out_sid: security identifier
 *
 * Walk the policy's IB endport contexts and use the first entry whose
 * port equals @port_num and whose device name matches @dev_name (compared
 * over at most IB_DEVICE_NAME_MAX characters).  SECINITSID_UNLABELED is
 * used when no entry matches or when no policy has been loaded yet.
 *
 * Return 0 on success or the error reported by ocontext_to_sid().
 */
int security_ib_endport_sid(const char *dev_name, u8 port_num, u32 *out_sid)
{
        struct selinux_policy *policy;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_UNLABELED;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);

                for (entry = policy->policydb.ocontexts[OCON_IBENDPORT]; entry;
                     entry = entry->next) {
                        if (entry->u.ibendport.port != port_num)
                                continue;
                        if (strncmp(entry->u.ibendport.dev_name, dev_name,
                                    IB_DEVICE_NAME_MAX) != 0)
                                continue;
                        break;
                }

                if (entry)
                        rc = ocontext_to_sid(policy->sidtab, entry, 0, out_sid);
                else
                        *out_sid = SECINITSID_UNLABELED;

                rcu_read_unlock();
                /* -ESTALE asks us to re-read the policy pointer and retry. */
        } while (rc == -ESTALE);

        return rc;
}

/**
 * security_netif_sid - Obtain the SID for a network interface.
 * @name: interface name
 * @if_sid: interface SID
 *
 * Walk the policy's network interface contexts and use the first entry
 * whose name matches @name.  When the policy sets the
 * POLICYDB_CAP_NETIF_WILDCARD capability the entry name is matched with
 * match_wildcard(); otherwise an exact string comparison is used.
 * SECINITSID_NETIF is used when no entry matches or when no policy has
 * been loaded yet.
 *
 * Return 0 on success or the error reported by ocontext_to_sid().
 */
int security_netif_sid(const char *name, u32 *if_sid)
{
        struct selinux_policy *policy;
        struct policydb *db;
        struct ocontext *entry;
        bool use_wildcards;
        int rc;

        if (!selinux_initialized()) {
                *if_sid = SECINITSID_NETIF;
                return 0;
        }

        do {
                rc = 0;
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                db = &policy->policydb;
                use_wildcards = ebitmap_get_bit(&db->policycaps, POLICYDB_CAP_NETIF_WILDCARD);

                for (entry = db->ocontexts[OCON_NETIF]; entry;
                     entry = entry->next) {
                        bool hit;

                        if (use_wildcards)
                                hit = match_wildcard(entry->u.name, name);
                        else
                                hit = (strcmp(entry->u.name, name) == 0);

                        if (hit)
                                break;
                }

                if (entry)
                        rc = ocontext_to_sid(policy->sidtab, entry, 0, if_sid);
                else
                        *if_sid = SECINITSID_NETIF;

                rcu_read_unlock();
                /* -ESTALE asks us to re-read the policy pointer and retry. */
        } while (rc == -ESTALE);

        return rc;
}

/*
 * Return true when @input, masked word-by-word with @mask, equals @addr
 * in all four 32-bit words; false as soon as one word differs.
 */
static bool match_ipv6_addrmask(const u32 input[4], const u32 addr[4], const u32 mask[4])
{
        unsigned int word = 0;

        while (word < 4 && (input[word] & mask[word]) == addr[word])
                word++;

        /* Reaching the end means every word matched. */
        return word == 4;
}

/**
 * security_node_sid - Obtain the SID for a node (host).
 * @domain: communication domain aka address family
 * @addrp: address
 * @addrlen: address length in bytes
 * @out_sid: security identifier
 *
 * For AF_INET and AF_INET6 the matching node context list is walked and
 * the first entry whose address equals the masked input address is used.
 * SECINITSID_NODE is used when no entry matches, for any other address
 * family, and when no policy has been loaded yet.
 *
 * Return 0 on success, -EINVAL when @addrlen does not fit the address
 * family, or the error reported by ocontext_to_sid().
 */
int security_node_sid(u16 domain,
                      void *addrp,
                      u32 addrlen,
                      u32 *out_sid)
{
        struct selinux_policy *policy;
        struct policydb *db;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                *out_sid = SECINITSID_NODE;
                return 0;
        }

retry:
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        db = &policy->policydb;

        if (domain == AF_INET) {
                u32 addr4;

                if (addrlen != sizeof(u32)) {
                        rc = -EINVAL;
                        goto out;
                }

                addr4 = *((u32 *)addrp);

                for (entry = db->ocontexts[OCON_NODE]; entry;
                     entry = entry->next) {
                        if ((addr4 & entry->u.node.mask) == entry->u.node.addr)
                                break;
                }
        } else if (domain == AF_INET6) {
                if (addrlen != sizeof(u64) * 2) {
                        rc = -EINVAL;
                        goto out;
                }

                for (entry = db->ocontexts[OCON_NODE6]; entry;
                     entry = entry->next) {
                        if (match_ipv6_addrmask(addrp, entry->u.node6.addr,
                                                entry->u.node6.mask))
                                break;
                }
        } else {
                /* Any other address family gets the default node SID. */
                *out_sid = SECINITSID_NODE;
                rc = 0;
                goto out;
        }

        if (!entry) {
                /* Nothing in the policy covers this address. */
                *out_sid = SECINITSID_NODE;
                rc = 0;
                goto out;
        }

        rc = ocontext_to_sid(policy->sidtab, entry, 0, out_sid);
        if (rc == -ESTALE) {
                /* Re-read the policy pointer and start over. */
                rcu_read_unlock();
                goto retry;
        }

out:
        rcu_read_unlock();
        return rc;
}

#define SIDS_NEL 25

/**
 * security_get_user_sids - Obtain reachable SIDs for a user.
 * @fromsid: starting SID
 * @username: username
 * @sids: array of reachable SIDs for user
 * @nel: number of elements in @sids
 *
 * Generate the set of SIDs for legal security contexts
 * for a given user that can be reached by @fromsid.
 * Set *@sids to point to a dynamically allocated
 * array containing the set of SIDs.  Set *@nel to the
 * number of elements in the array.
 *
 * Return 0 on success.  *@sids is NULL and *@nel is 0 when no policy is
 * loaded yet or no candidate SID could be generated.  Return -EINVAL when
 * @fromsid or @username is not found, -ENOMEM on allocation failure, or
 * the error from sidtab_context_to_sid().  On success with a non-NULL
 * *@sids, the array is handed to the caller.
 */

int security_get_user_sids(u32 fromsid,
                           const char *username,
                           u32 **sids,
                           u32 *nel)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *fromcon, usercon;
        u32 *mysids = NULL, *mysids2, sid;
        u32 i, j, mynel, maxnel = SIDS_NEL;
        struct user_datum *user;
        struct role_datum *role;
        struct ebitmap_node *rnode, *tnode;
        int rc;

        *sids = NULL;
        *nel = 0;

        if (!selinux_initialized())
                return 0;

        /* Initial candidate array; allocated before entering the RCU section. */
        mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL);
        if (!mysids)
                return -ENOMEM;

retry:
        /* A retry discards the candidates collected so far but keeps the array. */
        mynel = 0;
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        context_init(&usercon);

        rc = -EINVAL;
        fromcon = sidtab_search(sidtab, fromsid);
        if (!fromcon)
                goto out_unlock;

        rc = -EINVAL;
        user = symtab_search(&policydb->p_users, username);
        if (!user)
                goto out_unlock;

        usercon.user = user->value;

        /*
         * Enumerate every (role, type) pair: roles come from the user's
         * role set, types from each role's type set.  Bit positions are
         * zero-based, the stored role/type values are bit + 1.
         */
        ebitmap_for_each_positive_bit(&user->roles, rnode, i) {
                role = policydb->role_val_to_struct[i];
                usercon.role = i + 1;
                ebitmap_for_each_positive_bit(&role->types, tnode, j) {
                        usercon.type = j + 1;

                        /* Skip pairs for which no MLS range can be set up. */
                        if (mls_setup_user_range(policydb, fromcon, user,
                                                 &usercon))
                                continue;

                        rc = sidtab_context_to_sid(sidtab, &usercon, &sid);
                        if (rc == -ESTALE) {
                                /* Re-read the policy pointer and start over. */
                                rcu_read_unlock();
                                goto retry;
                        }
                        if (rc)
                                goto out_unlock;
                        if (mynel < maxnel) {
                                mysids[mynel++] = sid;
                        } else {
                                /*
                                 * Array is full: grow it by SIDS_NEL entries.
                                 * GFP_ATOMIC is used because we are inside
                                 * the rcu_read_lock() section.
                                 */
                                rc = -ENOMEM;
                                maxnel += SIDS_NEL;
                                mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC);
                                if (!mysids2)
                                        goto out_unlock;
                                memcpy(mysids2, mysids, mynel * sizeof(*mysids2));
                                kfree(mysids);
                                mysids = mysids2;
                                mysids[mynel++] = sid;
                        }
                }
        }
        rc = 0;
out_unlock:
        rcu_read_unlock();
        /* Error, or nothing collected: no array is returned. */
        if (rc || !mynel) {
                kfree(mysids);
                return rc;
        }

        rc = -ENOMEM;
        mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL);
        if (!mysids2) {
                kfree(mysids);
                return rc;
        }
        /*
         * Keep only the candidates for which avc_has_perm_noaudit()
         * returns 0 for the process "transition" permission from @fromsid.
         */
        for (i = 0, j = 0; i < mynel; i++) {
                struct av_decision dummy_avd;
                rc = avc_has_perm_noaudit(fromsid, mysids[i],
                                          SECCLASS_PROCESS, /* kernel value */
                                          PROCESS__TRANSITION, AVC_STRICT,
                                          &dummy_avd);
                if (!rc)
                        mysids2[j++] = mysids[i];
                /* Yield between checks. */
                cond_resched();
        }
        kfree(mysids);
        *sids = mysids2;
        *nel = j;
        return 0;
}

/**
 * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem
 * @policy: policy
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Obtain a SID to use for a file in a filesystem that
 * cannot support xattr or use a fixed labeling behavior like
 * transition SIDs or task SIDs.
 *
 * *@sid is preset to SECINITSID_UNLABELED.  Return -ENOENT when the
 * policy has no genfs entry for @fstype or no context entry whose name is
 * a prefix of @path with a matching (or unset) class; otherwise return
 * the result of ocontext_to_sid().
 *
 * WARNING: This function may return -ESTALE, indicating that the caller
 * must retry the operation after re-acquiring the policy pointer!
 */
static inline int __security_genfs_sid(struct selinux_policy *policy,
                                       const char *fstype,
                                       const char *path,
                                       u16 orig_sclass,
                                       u32 *sid)
{
        struct genfs *fs_entry;
        struct ocontext *entry;
        u16 cls;
        int order = 0;

        /* Reduce a run of leading slashes to a single one. */
        while (path[0] == '/' && path[1] == '/')
                path++;

        cls = unmap_class(&policy->map, orig_sclass);
        *sid = SECINITSID_UNLABELED;

        /*
         * Stop at the first entry that does not compare below @fstype
         * (presumably the list is kept sorted by fstype - TODO confirm).
         */
        fs_entry = policy->policydb.genfs;
        while (fs_entry) {
                order = strcmp(fstype, fs_entry->fstype);
                if (order <= 0)
                        break;
                fs_entry = fs_entry->next;
        }

        if (!fs_entry || order != 0)
                return -ENOENT;

        /* First entry whose name prefixes @path and whose class fits wins. */
        for (entry = fs_entry->head; entry; entry = entry->next) {
                size_t prefix_len = strlen(entry->u.name);

                if (entry->v.sclass && entry->v.sclass != cls)
                        continue;
                if (strncmp(entry->u.name, path, prefix_len) != 0)
                        continue;
                break;
        }

        if (!entry)
                return -ENOENT;

        return ocontext_to_sid(policy->sidtab, entry, 0, sid);
}

/**
 * security_genfs_sid - Obtain a SID for a file in a filesystem
 * @fstype: filesystem type
 * @path: path from root of mount
 * @orig_sclass: file security class
 * @sid: SID for path
 *
 * Look up the active policy inside an RCU read-side critical section and
 * call __security_genfs_sid() on it, repeating the whole lookup for as
 * long as that helper returns -ESTALE.  When no policy has been loaded
 * yet, *@sid is set to SECINITSID_UNLABELED and 0 is returned.
 */
int security_genfs_sid(const char *fstype,
                       const char *path,
                       u16 orig_sclass,
                       u32 *sid)
{
        struct selinux_policy *policy;
        int retval;

        if (!selinux_initialized()) {
                *sid = SECINITSID_UNLABELED;
                return 0;
        }

        for (;;) {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                retval = __security_genfs_sid(policy, fstype, path,
                                              orig_sclass, sid);
                rcu_read_unlock();

                if (retval != -ESTALE)
                        break;
        }

        return retval;
}

/*
 * Variant of the genfs SID lookup that works on an explicitly supplied
 * @policy instead of the active one.  The result of
 * __security_genfs_sid() is returned unchanged (including -ESTALE).
 */
int selinux_policy_genfs_sid(struct selinux_policy *policy,
                        const char *fstype,
                        const char *path,
                        u16 orig_sclass,
                        u32 *sid)
{
        int rc;

        /* no lock required, policy is not yet accessible by other threads */
        rc = __security_genfs_sid(policy, fstype, path, orig_sclass, sid);

        return rc;
}

/**
 * security_fs_use - Determine how to handle labeling for a filesystem.
 * @sb: superblock in question
 *
 * Fill in the labeling behavior and SID of @sb's security struct:
 *  - no policy loaded: SECURITY_FS_USE_NONE / SECINITSID_UNLABELED;
 *  - an fs_use entry names the filesystem type: that entry's behavior
 *    and SID;
 *  - otherwise a genfs lookup of "/" decides: SECURITY_FS_USE_GENFS when
 *    it succeeds, SECURITY_FS_USE_NONE when it fails.
 *
 * Return 0 on success or the error reported by ocontext_to_sid() for an
 * fs_use entry.  A failed genfs lookup is not reported as an error.
 */
int security_fs_use(struct super_block *sb)
{
        struct superblock_security_struct *sbsec = selinux_superblock(sb);
        const char *fstype = sb->s_type->name;
        struct selinux_policy *policy;
        struct ocontext *entry;
        int rc;

        if (!selinux_initialized()) {
                sbsec->behavior = SECURITY_FS_USE_NONE;
                sbsec->sid = SECINITSID_UNLABELED;
                return 0;
        }

        do {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);

                for (entry = policy->policydb.ocontexts[OCON_FSUSE]; entry;
                     entry = entry->next) {
                        if (strcmp(fstype, entry->u.name) == 0)
                                break;
                }

                if (entry) {
                        sbsec->behavior = entry->v.behavior;
                        rc = ocontext_to_sid(policy->sidtab, entry, 0,
                                             &sbsec->sid);
                } else {
                        rc = __security_genfs_sid(policy, fstype, "/",
                                                SECCLASS_DIR, &sbsec->sid);
                        if (rc == 0) {
                                sbsec->behavior = SECURITY_FS_USE_GENFS;
                        } else if (rc != -ESTALE) {
                                /* No genfs label either: not an error. */
                                sbsec->behavior = SECURITY_FS_USE_NONE;
                                rc = 0;
                        }
                }

                rcu_read_unlock();
                /* -ESTALE asks us to re-read the policy pointer and retry. */
        } while (rc == -ESTALE);

        return rc;
}

/*
 * security_get_bools - Export the booleans of a policy.
 * @policy: policy to read from
 * @len: set to the number of booleans
 * @names: set to a newly allocated array of newly allocated name strings
 * @values: set to a newly allocated array holding each boolean's state
 *
 * Return 0 on success; when the policy has no booleans, *@names and
 * *@values are NULL and *@len is 0.  Return -ENOMEM on allocation
 * failure, in which case nothing is left allocated and *@len is 0 with
 * *@names and *@values NULL.  The returned arrays and strings are handed
 * to the caller.
 */
int security_get_bools(struct selinux_policy *policy,
                       u32 *len, char ***names, int **values)
{
        struct policydb *db = &policy->policydb;
        char **name_tab = NULL;
        int *value_tab = NULL;
        u32 count, idx;

        *names = NULL;
        *values = NULL;

        count = db->p_bools.nprim;
        *len = count;
        if (count == 0)
                return 0;

        name_tab = kcalloc(count, sizeof(char *), GFP_ATOMIC);
        if (!name_tab)
                goto fail;

        value_tab = kcalloc(count, sizeof(int), GFP_ATOMIC);
        if (!value_tab)
                goto fail;

        for (idx = 0; idx < count; idx++) {
                value_tab[idx] = db->bool_val_to_struct[idx]->state;

                name_tab[idx] = kstrdup(sym_name(db, SYM_BOOLS, idx),
                                        GFP_ATOMIC);
                if (!name_tab[idx])
                        goto fail;
        }

        *names = name_tab;
        *values = value_tab;
        return 0;

fail:
        /* Unfilled slots are NULL (zeroed array), which kfree() accepts. */
        if (name_tab) {
                for (idx = 0; idx < count; idx++)
                        kfree(name_tab[idx]);
                kfree(name_tab);
        }
        kfree(value_tab);
        *len = 0;
        return -ENOMEM;
}


/*
 * security_set_bools() - install a policy copy carrying new boolean states.
 * @len: number of entries in @values; must equal the number of booleans in
 *       the active policy (p_bools.nprim).
 * @values: requested states; each entry is normalised to 0 or 1.
 *
 * The active policy is read with rcu_dereference_protected() against
 * state->policy_mutex, so the caller is expected to hold that mutex.
 *
 * Returns 0 on success, -EINVAL if SELinux is not initialized or @len does
 * not match, and -ENOMEM if either copy step fails.
 */
int security_set_bools(u32 len, const int *values)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *newpolicy, *oldpolicy;
        int rc;
        u32 i, seqno = 0;

        if (!selinux_initialized())
                return -EINVAL;

        oldpolicy = rcu_dereference_protected(state->policy,
                                        lockdep_is_held(&state->policy_mutex));

        /* Consistency check on number of booleans, should never fail */
        if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim))
                return -EINVAL;

        /* Shallow copy first; the conditional parts are deep-copied below. */
        newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL);
        if (!newpolicy)
                return -ENOMEM;

        /*
         * Deep copy only the parts of the policydb that might be
         * modified as a result of changing booleans.
         */
        rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb);
        if (rc) {
                /* Any failure here is reported to the caller as -ENOMEM. */
                kfree(newpolicy);
                return -ENOMEM;
        }

        /* Update the boolean states in the copy */
        for (i = 0; i < len; i++) {
                int new_state = !!values[i];
                int old_state = newpolicy->policydb.bool_val_to_struct[i]->state;

                /* Only booleans that actually change are audited and written. */
                if (new_state != old_state) {
                        audit_log(audit_context(), GFP_ATOMIC,
                                AUDIT_MAC_CONFIG_CHANGE,
                                "bool=%s val=%d old_val=%d auid=%u ses=%u",
                                sym_name(&newpolicy->policydb, SYM_BOOLS, i),
                                new_state,
                                old_state,
                                from_kuid(&init_user_ns, audit_get_loginuid(current)),
                                audit_get_sessionid(current));
                        newpolicy->policydb.bool_val_to_struct[i]->state = new_state;
                }
        }

        /* Re-evaluate the conditional rules in the copy */
        evaluate_cond_nodes(&newpolicy->policydb);

        /* Set latest granting seqno for new policy */
        newpolicy->latest_granting = oldpolicy->latest_granting + 1;
        seqno = newpolicy->latest_granting;

        /* Install the new policy */
        rcu_assign_pointer(state->policy, newpolicy);

        /*
         * Free the conditional portions of the old policydb
         * that were copied for the new policy, and the oldpolicy
         * structure itself but not what it references.
         */
        /* synchronize_rcu() runs before the old copy is released. */
        synchronize_rcu();
        selinux_policy_cond_free(oldpolicy);

        /* Notify others of the policy change */
        selinux_notify_policy_change(seqno);
        return 0;
}

/*
 * security_get_bool_value() - read the current state of one boolean.
 * @index: zero-based index into the active policy's boolean table.
 *
 * Returns the boolean's state, 0 when SELinux is not yet initialized, or
 * -EFAULT when @index is not below the number of booleans.
 */
int security_get_bool_value(u32 index)
{
        struct selinux_policy *policy;
        struct policydb *db;
        int state;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        db = &policy->policydb;

        if (index < db->p_bools.nprim)
                state = db->bool_val_to_struct[index]->state;
        else
                state = -EFAULT;

        rcu_read_unlock();
        return state;
}

/*
 * security_preserve_bools() - carry boolean states from @oldpolicy over to
 * @newpolicy.
 *
 * Every boolean of @oldpolicy whose name is also found in @newpolicy gets
 * its old state; booleans with no match are left alone.  The conditional
 * nodes of @newpolicy are then re-evaluated.
 *
 * Returns 0 on success or the error from security_get_bools().
 */
static int security_preserve_bools(struct selinux_policy *oldpolicy,
                                struct selinux_policy *newpolicy)
{
        struct cond_bool_datum *datum;
        char **saved_names = NULL;
        int *saved_values = NULL;
        u32 count = 0, n;
        int rc;

        rc = security_get_bools(oldpolicy, &count, &saved_names, &saved_values);
        if (!rc) {
                for (n = 0; n < count; n++) {
                        datum = symtab_search(&newpolicy->policydb.p_bools,
                                              saved_names[n]);
                        if (!datum)
                                continue;
                        datum->state = saved_values[n];
                }
                evaluate_cond_nodes(&newpolicy->policydb);
        }

        /* Release the temporary snapshot regardless of the outcome. */
        if (saved_names) {
                for (n = 0; n < count; n++)
                        kfree(saved_names[n]);
        }
        kfree(saved_names);
        kfree(saved_values);
        return rc;
}

/*
 * security_sid_mls_copy() - computes a new sid based on the given
 * sid and the mls portion of mls_sid.
 * @sid: SID supplying the user, role and type of the new context.
 * @mls_sid: SID supplying the MLS portion of the new context.
 * @new_sid: out parameter receiving the resulting SID.
 *
 * When SELinux is not initialized or the policy has MLS disabled, @sid is
 * passed through unchanged and 0 is returned.  Otherwise returns 0 on
 * success, -EINVAL when either SID is not in the sidtab, or the error
 * produced while copying, validating or mapping the combined context.
 * The whole operation is restarted when the sidtab lookup reports -ESTALE.
 */
int security_sid_mls_copy(u32 sid, u32 mls_sid, u32 *new_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        struct context *context1;
        struct context *context2;
        struct context newcon;
        char *s;
        u32 len;
        int rc;

        if (!selinux_initialized()) {
                *new_sid = sid;
                return 0;
        }

retry:
        rc = 0;
        context_init(&newcon);

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        if (!policydb->mls_enabled) {
                *new_sid = sid;
                goto out_unlock;
        }

        rc = -EINVAL;
        context1 = sidtab_search(sidtab, sid);
        if (!context1) {
                /* SIDs are u32: print them unsigned so bogus values stay readable. */
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, sid);
                goto out_unlock;
        }

        context2 = sidtab_search(sidtab, mls_sid);
        if (!context2) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                        __func__, mls_sid);
                goto out_unlock;
        }

        /* user/role/type come from @sid, the MLS range from @mls_sid. */
        newcon.user = context1->user;
        newcon.role = context1->role;
        newcon.type = context1->type;
        rc = mls_context_cpy(&newcon, context2);
        if (rc)
                goto out_unlock;

        /* Check the validity of the new context. */
        if (!policydb_context_isvalid(policydb, &newcon)) {
                rc = convert_context_handle_invalid_context(policydb,
                                                        &newcon);
                if (rc) {
                        /* Audit the rejected context if it can be rendered. */
                        if (!context_struct_to_string(policydb, &newcon, &s,
                                                      &len)) {
                                struct audit_buffer *ab;

                                ab = audit_log_start(audit_context(),
                                                     GFP_ATOMIC,
                                                     AUDIT_SELINUX_ERR);
                                audit_log_format(ab,
                                                 "op=security_sid_mls_copy invalid_context=");
                                /* don't record NUL with untrusted strings */
                                audit_log_n_untrustedstring(ab, s, len - 1);
                                audit_log_end(ab);
                                kfree(s);
                        }
                        goto out_unlock;
                }
        }
        rc = sidtab_context_to_sid(sidtab, &newcon, new_sid);
        if (rc == -ESTALE) {
                /* Drop the lock and the half-built context, then start over. */
                rcu_read_unlock();
                context_destroy(&newcon);
                goto retry;
        }
out_unlock:
        rcu_read_unlock();
        context_destroy(&newcon);
        return rc;
}

/**
 * security_net_peersid_resolve - Compare and resolve two network peer SIDs
 * @nlbl_sid: NetLabel SID
 * @nlbl_type: NetLabel labeling protocol type
 * @xfrm_sid: XFRM SID
 * @peer_sid: network peer sid
 *
 * Description:
 * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be
 * resolved into a single SID it is returned via @peer_sid and the function
 * returns zero.  Otherwise @peer_sid is set to SECSID_NULL and the function
 * returns a negative value.  A table summarizing the behavior is below:
 *
 *                                 | function return |    @peer_sid
 *   ------------------------------+-----------------+-----------------
 *   no peer labels                |        0        |    SECSID_NULL
 *   single peer label             |        0        |    <peer_label>
 *   multiple, consistent labels   |        0        |    <peer_label>
 *   multiple, inconsistent labels |    -<errno>     |    SECSID_NULL
 *
 * When both labels are present but SELinux is not initialized, or the
 * loaded policy has MLS disabled, zero is returned and @peer_sid is left
 * at SECSID_NULL.
 *
 */
int security_net_peersid_resolve(u32 nlbl_sid, u32 nlbl_type,
                                 u32 xfrm_sid,
                                 u32 *peer_sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        int rc;
        struct context *nlbl_ctx;
        struct context *xfrm_ctx;

        *peer_sid = SECSID_NULL;

        /* handle the common (which also happens to be the set of easy) cases
         * right away, these two if statements catch everything involving a
         * single or absent peer SID/label */
        if (xfrm_sid == SECSID_NULL) {
                *peer_sid = nlbl_sid;
                return 0;
        }
        /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label
         * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label
         * is present */
        if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) {
                *peer_sid = xfrm_sid;
                return 0;
        }

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /*
         * Without MLS there is nothing to compare: report success and
         * leave @peer_sid at SECSID_NULL.
         */
        if (!policydb->mls_enabled) {
                rc = 0;
                goto out;
        }

        rc = -EINVAL;
        nlbl_ctx = sidtab_search(sidtab, nlbl_sid);
        if (!nlbl_ctx) {
                /* SIDs are u32: print them unsigned so bogus values stay readable. */
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, nlbl_sid);
                goto out;
        }
        xfrm_ctx = sidtab_search(sidtab, xfrm_sid);
        if (!xfrm_ctx) {
                pr_err("SELinux: %s:  unrecognized SID %u\n",
                       __func__, xfrm_sid);
                goto out;
        }
        rc = (mls_context_equal(nlbl_ctx, xfrm_ctx) ? 0 : -EACCES);
        if (rc)
                goto out;

        /* at present NetLabel SIDs/labels really only carry MLS
         * information so if the MLS portion of the NetLabel SID
         * matches the MLS portion of the labeled XFRM SID/label
         * then pass along the XFRM SID as it is the most
         * expressive */
        *peer_sid = xfrm_sid;
out:
        rcu_read_unlock();
        return rc;
}

/*
 * hashtab_map() callback: store a copy of the class name @k in the output
 * array @args, at the slot given by the class value minus one.
 * Returns 0, or -ENOMEM when the name cannot be duplicated.
 */
static int get_classes_callback(void *k, void *d, void *args)
{
        struct class_datum *cls = d;
        char **out = args;
        char *class_name = k;
        u32 slot = cls->value - 1;

        out[slot] = kstrdup(class_name, GFP_ATOMIC);

        return out[slot] ? 0 : -ENOMEM;
}

/*
 * security_get_classes() - return copies of all class names in @policy.
 * @policy: policy to query.
 * @classes: out parameter; receives an array of *nclasses strings, each a
 *           kstrdup() copy, indexed by class value minus one.
 * @nclasses: out parameter; receives the number of classes.
 *
 * Returns 0 on success.  On failure returns -ENOMEM or the error from
 * hashtab_map(), frees everything allocated here, and resets the outputs
 * to NULL/0 so no dangling pointer is handed back (matching what
 * security_get_bools() does on its error path).
 */
int security_get_classes(struct selinux_policy *policy,
                         char ***classes, u32 *nclasses)
{
        struct policydb *policydb;
        u32 i;
        int rc;

        policydb = &policy->policydb;

        *nclasses = policydb->p_classes.nprim;
        *classes = kcalloc(*nclasses, sizeof(**classes), GFP_ATOMIC);
        if (!*classes) {
                rc = -ENOMEM;
                goto err_reset;
        }

        rc = hashtab_map(&policydb->p_classes.table, get_classes_callback,
                         *classes);
        if (rc)
                goto err_free;

        return 0;

err_free:
        /* The array is zero-filled, so unset slots are kfree(NULL). */
        for (i = 0; i < *nclasses; i++)
                kfree((*classes)[i]);
        kfree(*classes);
err_reset:
        *classes = NULL;
        *nclasses = 0;
        return rc;
}

/*
 * hashtab_map() callback: store a copy of the permission name @k in the
 * output array @args, at the slot given by the permission value minus one.
 * Returns 0, or -ENOMEM when the name cannot be duplicated.
 */
static int get_permissions_callback(void *k, void *d, void *args)
{
        struct perm_datum *perm = d;
        char **out = args;
        char *perm_name = k;
        u32 slot = perm->value - 1;

        out[slot] = kstrdup(perm_name, GFP_ATOMIC);

        return out[slot] ? 0 : -ENOMEM;
}

/*
 * security_get_permissions() - return copies of the permission names of
 * one class.
 * @policy: policy to query.
 * @class: name of the class to look up.
 * @perms: out parameter; receives an array of *nperms strings, each a
 *         kstrdup() copy, indexed by permission value minus one.  Names
 *         from the class's common datum (if any) are stored first, then
 *         the class's own permissions.
 * @nperms: out parameter; receives match->permissions.nprim.
 *
 * Returns 0 on success, -EINVAL when @class is unknown, -ENOMEM or the
 * error from hashtab_map() otherwise.  On every failure the outputs are
 * NULL/0, so callers never see a freed or uninitialized pointer.
 */
int security_get_permissions(struct selinux_policy *policy,
                             const char *class, char ***perms, u32 *nperms)
{
        struct policydb *policydb;
        u32 i;
        int rc;
        struct class_datum *match;

        policydb = &policy->policydb;

        /* Give the outputs a defined value on every early exit. */
        *perms = NULL;
        *nperms = 0;

        match = symtab_search(&policydb->p_classes, class);
        if (!match) {
                pr_err("SELinux: %s:  unrecognized class %s\n",
                        __func__, class);
                return -EINVAL;
        }

        *nperms = match->permissions.nprim;
        *perms = kcalloc(*nperms, sizeof(**perms), GFP_ATOMIC);
        if (!*perms) {
                rc = -ENOMEM;
                goto err_reset;
        }

        if (match->comdatum) {
                rc = hashtab_map(&match->comdatum->permissions.table,
                                 get_permissions_callback, *perms);
                if (rc)
                        goto err_free;
        }

        rc = hashtab_map(&match->permissions.table, get_permissions_callback,
                         *perms);
        if (rc)
                goto err_free;

        return 0;

err_free:
        /* The array is zero-filled, so unset slots are kfree(NULL). */
        for (i = 0; i < *nperms; i++)
                kfree((*perms)[i]);
        kfree(*perms);
err_reset:
        *perms = NULL;
        *nperms = 0;
        return rc;
}

/*
 * Return the reject_unknown setting of the active policy, or 0 when
 * SELinux is not yet initialized.
 */
int security_get_reject_unknown(void)
{
        struct selinux_policy *policy;
        int reject = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                reject = policy->policydb.reject_unknown;
                rcu_read_unlock();
        }

        return reject;
}

/*
 * Return the allow_unknown setting of the active policy, or 0 when
 * SELinux is not yet initialized.
 */
int security_get_allow_unknown(void)
{
        struct selinux_policy *policy;
        int allow = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                allow = policy->policydb.allow_unknown;
                rcu_read_unlock();
        }

        return allow;
}

/**
 * security_policycap_supported - Check for a specific policy capability
 * @req_cap: capability
 *
 * Description:
 * Look up bit @req_cap in the policy capability bitmap of the currently
 * loaded policy.  Returns the value of that bit, or zero when SELinux has
 * not been initialized yet.
 *
 */
int security_policycap_supported(unsigned int req_cap)
{
        struct selinux_policy *policy;
        int supported = 0;

        if (selinux_initialized()) {
                rcu_read_lock();
                policy = rcu_dereference(selinux_state.policy);
                supported = ebitmap_get_bit(&policy->policydb.policycaps,
                                            req_cap);
                rcu_read_unlock();
        }

        return supported;
}

/*
 * Private representation of an audit rule field handled by SELinux; built
 * by selinux_audit_rule_init() and released by selinux_audit_rule_free().
 */
struct selinux_audit_rule {
        /* policy->latest_granting at rule creation; match reports -ESTALE if older */
        u32 au_seqno;
        /* holds the user, role, type or MLS level the rule compares against */
        struct context au_ctxt;
};

/*
 * Release an audit rule created by selinux_audit_rule_init().
 * A NULL @vrule is accepted and ignored.
 */
void selinux_audit_rule_free(void *vrule)
{
        struct selinux_audit_rule *rule = vrule;

        if (!rule)
                return;

        context_destroy(&rule->au_ctxt);
        kfree(rule);
}

/*
 * selinux_audit_rule_init() - translate an audit rule field into an
 * SELinux rule object.
 * @field: one of the AUDIT_SUBJ_* / AUDIT_OBJ_* field identifiers.
 * @op: comparison operator; user/role/type fields accept only
 *      Audit_equal and Audit_not_equal.
 * @rulestr: name (user, role, type) or MLS level string to match; level
 *           strings must not contain '-'.
 * @vrule: out parameter; receives the new rule, or NULL on failure.
 * @gfp: allocation flags for the rule object itself.
 *
 * Returns 0 on success, -EOPNOTSUPP if SELinux is not initialized,
 * -EINVAL for an unsupported field/op combination or an unknown name,
 * -ENOMEM on allocation failure, or the error from mls_from_string().
 */
int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule,
                            gfp_t gfp)
{
        struct selinux_audit_rule **out = (struct selinux_audit_rule **)vrule;
        struct selinux_audit_rule *newrule;
        struct selinux_policy *policy;
        struct policydb *db;
        struct user_datum *user;
        struct role_datum *role;
        struct type_datum *type;
        int err = 0;

        *out = NULL;

        if (!selinux_initialized())
                return -EOPNOTSUPP;

        /* Reject unsupported field/operator combinations before allocating. */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
                /* only 'equals' and 'not equals' fit user, role, and type */
                if (!(op == Audit_equal || op == Audit_not_equal))
                        return -EINVAL;
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                /* we do not allow a range, indicated by the presence of '-' */
                if (strchr(rulestr, '-') != NULL)
                        return -EINVAL;
                break;
        default:
                /* only the above fields are valid */
                return -EINVAL;
        }

        newrule = kzalloc(sizeof(struct selinux_audit_rule), gfp);
        if (newrule == NULL)
                return -ENOMEM;
        context_init(&newrule->au_ctxt);

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        db = &policy->policydb;
        newrule->au_seqno = policy->latest_granting;

        /* Resolve the rule string against the active policy. */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER:
                user = symtab_search(&db->p_users, rulestr);
                if (user)
                        newrule->au_ctxt.user = user->value;
                else
                        err = -EINVAL;
                break;
        case AUDIT_SUBJ_ROLE:
        case AUDIT_OBJ_ROLE:
                role = symtab_search(&db->p_roles, rulestr);
                if (role)
                        newrule->au_ctxt.role = role->value;
                else
                        err = -EINVAL;
                break;
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_TYPE:
                type = symtab_search(&db->p_types, rulestr);
                if (type)
                        newrule->au_ctxt.type = type->value;
                else
                        err = -EINVAL;
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                err = mls_from_string(db, rulestr, &newrule->au_ctxt,
                                      GFP_ATOMIC);
                break;
        }
        rcu_read_unlock();

        if (err) {
                selinux_audit_rule_free(newrule);
                *out = NULL;
                return err;
        }

        *out = newrule;
        return 0;
}

/*
 * Return 1 if any field of @rule has one of the SELinux subject/object
 * field types, 0 otherwise.
 */
int selinux_audit_rule_known(struct audit_krule *rule)
{
        u32 idx = 0;

        while (idx < rule->field_count) {
                struct audit_field *fld = &rule->fields[idx];

                switch (fld->type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        return 1;
                default:
                        break;
                }
                idx++;
        }

        return 0;
}

/*
 * selinux_audit_rule_match() - test a security ID against an audit rule.
 * @prop: LSM property whose selinux.secid is looked up in the sidtab.
 * @field: AUDIT_SUBJ_* / AUDIT_OBJ_* field selecting what to compare.
 * @op: comparison operator (Audit_equal, Audit_not_equal, and for MLS
 *      levels also Audit_lt/le/gt/ge).
 * @vrule: rule created by selinux_audit_rule_init().
 *
 * Returns the result of the comparison (non-zero on match, 0 otherwise;
 * 0 also when SELinux is not initialized or the field/op pair is not
 * handled), -ENOENT when @vrule is NULL or the SID is unknown, and
 * -ESTALE when the rule was created under an older policy sequence number.
 */
int selinux_audit_rule_match(struct lsm_prop *prop, u32 field, u32 op, void *vrule)
{
        struct selinux_state *state = &selinux_state;
        struct selinux_policy *policy;
        struct context *ctxt;
        struct mls_level *level;
        struct selinux_audit_rule *rule = vrule;
        int match = 0;

        if (unlikely(!rule)) {
                WARN_ONCE(1, "selinux_audit_rule_match: missing rule\n");
                return -ENOENT;
        }

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();

        policy = rcu_dereference(state->policy);

        /* The rule holds values resolved against an older policy. */
        if (rule->au_seqno < policy->latest_granting) {
                match = -ESTALE;
                goto out;
        }

        ctxt = sidtab_search(policy->sidtab, prop->selinux.secid);
        if (unlikely(!ctxt)) {
                /* secid is u32: print it unsigned so bogus values stay readable. */
                WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %u\n",
                          prop->selinux.secid);
                match = -ENOENT;
                goto out;
        }

        /* a field/op pair that is not caught here will simply fall through
           without a match */
        switch (field) {
        case AUDIT_SUBJ_USER:
        case AUDIT_OBJ_USER:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->user == rule->au_ctxt.user);
                        break;
                case Audit_not_equal:
                        match = (ctxt->user != rule->au_ctxt.user);
                        break;
                }
                break;
        case AUDIT_SUBJ_ROLE:
        case AUDIT_OBJ_ROLE:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->role == rule->au_ctxt.role);
                        break;
                case Audit_not_equal:
                        match = (ctxt->role != rule->au_ctxt.role);
                        break;
                }
                break;
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_TYPE:
                switch (op) {
                case Audit_equal:
                        match = (ctxt->type == rule->au_ctxt.type);
                        break;
                case Audit_not_equal:
                        match = (ctxt->type != rule->au_ctxt.type);
                        break;
                }
                break;
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                /* SEN and LEV_LOW use range.level[0]; CLR and LEV_HIGH use level[1]. */
                level = ((field == AUDIT_SUBJ_SEN ||
                          field == AUDIT_OBJ_LEV_LOW) ?
                         &ctxt->range.level[0] : &ctxt->range.level[1]);
                switch (op) {
                case Audit_equal:
                        match = mls_level_eq(&rule->au_ctxt.range.level[0],
                                             level);
                        break;
                case Audit_not_equal:
                        match = !mls_level_eq(&rule->au_ctxt.range.level[0],
                                              level);
                        break;
                case Audit_lt:
                        match = (mls_level_dom(&rule->au_ctxt.range.level[0],
                                               level) &&
                                 !mls_level_eq(&rule->au_ctxt.range.level[0],
                                               level));
                        break;
                case Audit_le:
                        match = mls_level_dom(&rule->au_ctxt.range.level[0],
                                              level);
                        break;
                case Audit_gt:
                        match = (mls_level_dom(level,
                                              &rule->au_ctxt.range.level[0]) &&
                                 !mls_level_eq(level,
                                               &rule->au_ctxt.range.level[0]));
                        break;
                case Audit_ge:
                        match = mls_level_dom(level,
                                              &rule->au_ctxt.range.level[0]);
                        break;
                }
        }

out:
        rcu_read_unlock();
        return match;
}

/*
 * AVC callback: on AVC_CALLBACK_RESET ask the audit subsystem to update
 * its LSM rules; every other event is ignored and reported as success.
 */
static int aurule_avc_callback(u32 event)
{
        if (event != AVC_CALLBACK_RESET)
                return 0;

        return audit_update_lsm_rules();
}

/*
 * Register aurule_avc_callback() for AVC reset events.  A registration
 * failure leads to panic().
 */
static int __init aurule_init(void)
{
        int rc = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET);

        if (rc != 0)
                panic("avc_add_callback() failed, error %d\n", rc);

        return rc;
}
__initcall(aurule_init);

#ifdef CONFIG_NETLABEL
/**
 * security_netlbl_cache_add - Add an entry to the NetLabel cache
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Attempt to attach @sid to @secattr as a NetLabel cache entry and set the
 * NETLBL_SECATTR_CACHE flag.  This is best effort: if either allocation
 * fails the function returns without setting the flag.  This function
 * assumes @secattr has already been initialized.
 *
 */
static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr,
                                      u32 sid)
{
        u32 *cached_sid = kmalloc(sizeof(*cached_sid), GFP_ATOMIC);

        if (!cached_sid)
                return;

        secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC);
        if (!secattr->cache) {
                kfree(cached_sid);
                return;
        }

        /* The cache entry takes the SID copy and frees it with kfree(). */
        *cached_sid = sid;
        secattr->cache->data = cached_sid;
        secattr->cache->free = kfree;
        secattr->flags |= NETLBL_SECATTR_CACHE;
}

/**
 * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID
 * @secattr: the NetLabel packet security attributes
 * @sid: the SELinux SID
 *
 * Description:
 * Convert the given NetLabel security attributes in @secattr into a
 * SELinux SID.  If the @secattr field does not contain a full SELinux
 * SID/context then use SECINITSID_NETMSG as the foundation.  If possible the
 * 'cache' field of @secattr is set and the CACHE flag is set; this is to
 * allow the @secattr to be used by NetLabel to cache the secattr to SID
 * conversion for future lookups.  Returns zero on success, negative values on
 * failure.  When SELinux is not initialized, or @secattr carries none of
 * the CACHE, SECID or MLS_LVL flags, @sid is set to SECSID_NULL and zero is
 * returned.
 *
 */
int security_netlbl_secattr_to_sid(struct netlbl_lsm_secattr *secattr,
                                   u32 *sid)
{
        struct selinux_policy *policy;
        struct policydb *policydb;
        struct sidtab *sidtab;
        int rc;
        struct context *ctx;
        struct context ctx_new;

        if (!selinux_initialized()) {
                *sid = SECSID_NULL;
                return 0;
        }

retry:
        rc = 0;
        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        policydb = &policy->policydb;
        sidtab = policy->sidtab;

        /* Cheapest sources first: a cached SID, then an explicit secid. */
        if (secattr->flags & NETLBL_SECATTR_CACHE)
                *sid = *(u32 *)secattr->cache->data;
        else if (secattr->flags & NETLBL_SECATTR_SECID)
                *sid = secattr->attr.secid;
        else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) {
                /* -EIDRM is reported when SECINITSID_NETMSG has no context. */
                rc = -EIDRM;
                ctx = sidtab_search(sidtab, SECINITSID_NETMSG);
                if (ctx == NULL)
                        goto out;

                /* user/role/type come from the NETMSG context, MLS from @secattr. */
                context_init(&ctx_new);
                ctx_new.user = ctx->user;
                ctx_new.role = ctx->role;
                ctx_new.type = ctx->type;
                mls_import_netlbl_lvl(policydb, &ctx_new, secattr);
                if (secattr->flags & NETLBL_SECATTR_MLS_CAT) {
                        rc = mls_import_netlbl_cat(policydb, &ctx_new, secattr);
                        if (rc)
                                goto out;
                }
                /* An invalid MLS portion is also reported as -EIDRM. */
                rc = -EIDRM;
                if (!mls_context_isvalid(policydb, &ctx_new)) {
                        ebitmap_destroy(&ctx_new.range.level[0].cat);
                        goto out;
                }

                rc = sidtab_context_to_sid(sidtab, &ctx_new, sid);
                /* The level[0] category bitmap is destroyed whatever the result. */
                ebitmap_destroy(&ctx_new.range.level[0].cat);
                if (rc == -ESTALE) {
                        /* Drop the RCU read lock and redo the whole conversion. */
                        rcu_read_unlock();
                        goto retry;
                }
                if (rc)
                        goto out;

                /* Best effort: remember the result for later lookups. */
                security_netlbl_cache_add(secattr, *sid);
        } else
                *sid = SECSID_NULL;

out:
        rcu_read_unlock();
        return rc;
}

/**
 * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr
 * @sid: the SELinux SID
 * @secattr: the NetLabel packet security attributes
 *
 * Description:
 * Fill @secattr from the context of @sid: the domain is a copy of the
 * context's type name, the secid is @sid, and the MLS level and categories
 * are exported from the context.  Returns zero on success (and when SELinux
 * is not initialized), -ENOENT if @sid is unknown, -ENOMEM if the domain
 * string cannot be allocated, or the result of the category export.
 *
 */
int security_netlbl_sid_to_secattr(u32 sid, struct netlbl_lsm_secattr *secattr)
{
        struct selinux_policy *policy;
        struct policydb *db;
        struct context *sid_ctx;
        int err;

        if (!selinux_initialized())
                return 0;

        rcu_read_lock();
        policy = rcu_dereference(selinux_state.policy);
        db = &policy->policydb;

        sid_ctx = sidtab_search(policy->sidtab, sid);
        if (!sid_ctx) {
                err = -ENOENT;
                goto unlock;
        }

        secattr->domain = kstrdup(sym_name(db, SYM_TYPES, sid_ctx->type - 1),
                                  GFP_ATOMIC);
        if (!secattr->domain) {
                err = -ENOMEM;
                goto unlock;
        }

        secattr->attr.secid = sid;
        secattr->flags |= NETLBL_SECATTR_DOMAIN_CPY | NETLBL_SECATTR_SECID;
        mls_export_netlbl_lvl(db, sid_ctx, secattr);
        err = mls_export_netlbl_cat(db, sid_ctx, secattr);

unlock:
        rcu_read_unlock();
        return err;
}
#endif /* CONFIG_NETLABEL */

/**
 * __security_read_policy - read the policy.
 * @policy: SELinux policy
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Hands @data and *@len to policydb_write() through a struct policy_file.
 * On success *@len is updated to the distance the file's data pointer
 * advanced from @data and zero is returned; otherwise the error from
 * policydb_write() is returned and *@len is left unchanged.
 */
static int __security_read_policy(struct selinux_policy *policy,
                                  void *data, size_t *len)
{
        struct policy_file fp;
        int rc;

        fp.data = data;
        fp.len = *len;

        rc = policydb_write(&policy->policydb, &fp);
        if (!rc)
                *len = (unsigned long)fp.data - (unsigned long)data;

        return rc;
}

/**
 * security_read_policy - read the policy.
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Allocates a buffer of policydb.len bytes with vmalloc_user() and fills
 * it via __security_read_policy().  The active policy is read with
 * rcu_dereference_protected() against policy_mutex.
 *
 * Returns zero on success, -EINVAL if no policy is loaded, -ENOMEM if the
 * buffer cannot be allocated, or the error from __security_read_policy().
 * In that last case the buffer is NOT freed here and stays in *@data.
 */
int security_read_policy(void **data, size_t *len)
{
        struct selinux_policy *policy;

        policy = rcu_dereference_protected(
                        selinux_state.policy,
                        lockdep_is_held(&selinux_state.policy_mutex));
        if (policy == NULL)
                return -EINVAL;

        *len = policy->policydb.len;
        *data = vmalloc_user(*len);
        if (*data == NULL)
                return -ENOMEM;

        return __security_read_policy(policy, *data, len);
}

/**
 * security_read_state_kernel - read the policy.
 * @data: binary policy data
 * @len: length of data in bytes
 *
 * Allocates kernel memory for reading SELinux policy.
 * This function is for internal use only and should not
 * be used for returning data to user space.
 *
 * This function must be called with policy_mutex held.
 *
 * Returns zero on success, -EINVAL if no policy is loaded, -ENOMEM if the
 * buffer cannot be allocated, or the error from __security_read_policy();
 * in that last case the buffer is freed and *@data / *@len are reset to
 * NULL / 0.
 */
int security_read_state_kernel(void **data, size_t *len)
{
        struct selinux_policy *policy;
        int err;

        policy = rcu_dereference_protected(
                        selinux_state.policy,
                        lockdep_is_held(&selinux_state.policy_mutex));
        if (policy == NULL)
                return -EINVAL;

        *len = policy->policydb.len;
        *data = vmalloc(*len);
        if (*data == NULL)
                return -ENOMEM;

        err = __security_read_policy(policy, *data, len);
        if (!err)
                return 0;

        vfree(*data);
        *data = NULL;
        *len = 0;
        return err;
}













































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stack tracing support
 *
 * Copyright (C) 2012 ARM Ltd.
 */
#include <linux/kernel.h>
#include <linux/efi.h>
#include <linux/export.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>
#include <linux/stacktrace.h>

#include <asm/efi.h>
#include <asm/irq.h>
#include <asm/stack_pointer.h>
#include <asm/stacktrace.h>

/*
 * Where the PC of the current unwind entry was obtained from.
 *
 * UNKNOWN is the value installed by kunwind_init() before a starting point
 * has been chosen; the remaining values are set by the kunwind_init_from_*()
 * helpers and by the frame-record / pt_regs stepping code.
 */
enum kunwind_source {
        KUNWIND_SOURCE_UNKNOWN = 0,     /* no starting point chosen yet */
        KUNWIND_SOURCE_FRAME   = 1,     /* PC read from a frame record */
        KUNWIND_SOURCE_CALLER  = 2,     /* PC is the caller's return address */
        KUNWIND_SOURCE_TASK    = 3,     /* PC saved for a blocked task */
        KUNWIND_SOURCE_REGS_PC = 4,     /* PC taken from a pt_regs */
};

/*
 * Per-entry unwind flags.
 *
 * @all aliases the individual bits so that every flag can be tested or
 * cleared with a single access.
 */
union unwind_flags {
        unsigned long all;
        struct {
                /* PC was recovered from the function graph return stack. */
                unsigned long fgraph : 1;
                /* PC was recovered from a kretprobe instance. */
                unsigned long kretprobe : 1;
        };
};

/*
 * Kernel unwind state
 *
 * @common:      Common unwind state.
 * @task:        The task being unwound.
 * @graph_idx:   Used by ftrace_graph_ret_addr() for optimized stack unwinding.
 * @kr_cur:      When KRETPROBES is selected, holds the kretprobe instance
 *               associated with the most recently encountered replacement lr
 *               value.
 * @source:      Where the PC of the current entry was obtained from (see
 *               enum kunwind_source).
 * @flags:       Flags describing how the PC of the current entry was
 *               recovered (function graph tracer / kretprobe). Cleared at the
 *               start of every kunwind_next() step.
 * @regs:        Set to the starting pt_regs by kunwind_init_from_regs();
 *               NULL after kunwind_init() and after kunwind_next_regs_pc().
 */
struct kunwind_state {
        struct unwind_state common;
        struct task_struct *task;
        int graph_idx;
#ifdef CONFIG_KRETPROBES
        struct llist_node *kr_cur;
#endif
        enum kunwind_source source;
        union unwind_flags flags;
        struct pt_regs *regs;
};

/*
 * Reset @state to a known baseline for unwinding @task.
 *
 * The common unwind state is initialised, the source is marked as unknown and
 * the flags and regs pointer are cleared. The starting fp/pc are left for the
 * caller to fill in.
 */
static __always_inline void
kunwind_init(struct kunwind_state *state,
             struct task_struct *task)
{
        unwind_init_common(&state->common);

        state->regs = NULL;
        state->flags.all = 0;
        state->source = KUNWIND_SOURCE_UNKNOWN;
        state->task = task;
}

/*
 * Begin an unwind from a saved register set.
 *
 * The first entry reported is the PC held in @regs, and the starting frame
 * pointer is regs->regs[29].
 *
 * @regs must be on a stack currently owned by the calling task; the unwind
 * is always set up on behalf of current.
 */
static __always_inline void
kunwind_init_from_regs(struct kunwind_state *state,
                       struct pt_regs *regs)
{
        kunwind_init(state, current);

        state->source = KUNWIND_SOURCE_REGS_PC;
        state->regs = regs;
        state->common.pc = regs->pc;
        state->common.fp = regs->regs[29];
}

/*
 * Begin an unwind at the caller of the function this is inlined into.
 *
 * The starting PC is that function's return address and the starting frame
 * pointer is its caller's frame address, so the function itself does not
 * appear in the trace.
 *
 * The function which invokes this must be noinline, otherwise the builtins
 * below would describe some other frame.
 */
static __always_inline void
kunwind_init_from_caller(struct kunwind_state *state)
{
        kunwind_init(state, current);

        state->source = KUNWIND_SOURCE_CALLER;
        state->common.pc = (unsigned long)__builtin_return_address(0);
        state->common.fp = (unsigned long)__builtin_frame_address(1);
}

/*
 * Begin an unwind of a blocked task.
 *
 * The starting fp/pc are the values saved for @task (i.e. the caller of
 * cpu_switch_to()).
 *
 * The caller must keep @task blocked in cpu_switch_to() for as long as the
 * unwind runs, otherwise the result is bogus. This must never be used on the
 * current task.
 */
static __always_inline void
kunwind_init_from_task(struct kunwind_state *state,
                       struct task_struct *task)
{
        kunwind_init(state, task);

        state->source = KUNWIND_SOURCE_TASK;
        state->common.pc = thread_saved_pc(task);
        state->common.fp = thread_saved_fp(task);
}

/*
 * Replace a trampoline PC with the return address it is standing in for.
 *
 * When the function graph tracer or a kretprobe has rewritten a return
 * address, the PC found on the stack is that of the trampoline
 * (return_to_handler or the kretprobe trampoline). Look up the original
 * return address, store it in state->common.pc and record in state->flags
 * which mechanism it was recovered from.
 *
 * Returns 0 on success (including when the PC needed no fixup), or -EINVAL
 * when the original return address cannot be found. In the failure case the
 * PC is left untouched so that a bogus address is never reported.
 */
static __always_inline int
kunwind_recover_return_address(struct kunwind_state *state)
{
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
        if (state->task->ret_stack &&
            (state->common.pc == (unsigned long)return_to_handler)) {
                unsigned long orig_pc;
                orig_pc = ftrace_graph_ret_addr(state->task, &state->graph_idx,
                                                state->common.pc,
                                                (void *)state->common.fp);
                /* An unchanged PC means no matching entry was found. */
                if (state->common.pc == orig_pc) {
                        WARN_ON_ONCE(state->task == current);
                        return -EINVAL;
                }
                state->common.pc = orig_pc;
                state->flags.fgraph = 1;
        }
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */

#ifdef CONFIG_KRETPROBES
        if (is_kretprobe_trampoline(state->common.pc)) {
                unsigned long orig_pc;
                orig_pc = kretprobe_find_ret_addr(state->task,
                                                  (void *)state->common.fp,
                                                  &state->kr_cur);
                /*
                 * A zero result means no kretprobe instance matched this
                 * frame; fail the unwind rather than report a NULL PC,
                 * mirroring the function graph case above.
                 */
                if (!orig_pc) {
                        WARN_ON_ONCE(state->task == current);
                        return -EINVAL;
                }
                state->common.pc = orig_pc;
                state->flags.kretprobe = 1;
        }
#endif /* CONFIG_KRETPROBES */

        return 0;
}

/*
 * Step across a pt_regs whose embedded frame record state->common.fp points
 * at: continue the unwind from the PC and frame pointer saved in that
 * pt_regs.
 *
 * Returns 0 on success, or -EINVAL if the pt_regs does not lie entirely
 * within one of the stacks still available to the unwinder.
 */
static __always_inline
int kunwind_next_regs_pc(struct kunwind_state *state)
{
        struct pt_regs *regs = container_of((u64 *)state->common.fp,
                                            struct pt_regs,
                                            stackframe.record.fp);
        unsigned long regs_addr = (unsigned long)regs;
        struct stack_info *info;

        info = unwind_find_stack(&state->common, regs_addr, sizeof(*regs));
        if (!info)
                return -EINVAL;

        unwind_consume_stack(&state->common, info, regs_addr, sizeof(*regs));

        state->common.pc = regs->pc;
        state->common.fp = regs->regs[29];
        state->source = KUNWIND_SOURCE_REGS_PC;
        /*
         * NOTE(review): state->regs used to be assigned 'regs' here and then
         * immediately cleared again, so it always ends up NULL on return.
         * Confirm whether it was meant to stay pointing at 'regs'.
         */
        state->regs = NULL;
        return 0;
}

/*
 * Handle a metadata frame record, i.e. one reached when the fp and lr of the
 * record at state->common.fp are both zero.
 *
 * Returns:
 *   -ENOENT  the record is the final one embedded in the task's pt_regs;
 *   -EINVAL  the record is not on an accessible stack, is a final record
 *            found anywhere else, or has an unrecognised type;
 *   otherwise the result of stepping across the pt_regs it belongs to.
 */
static __always_inline int
kunwind_next_frame_record_meta(struct kunwind_state *state)
{
        struct frame_record_meta *meta;
        struct task_struct *tsk = state->task;
        unsigned long fp = state->common.fp;

        if (!unwind_find_stack(&state->common, fp, sizeof(*meta)))
                return -EINVAL;

        meta = (struct frame_record_meta *)fp;
        switch (READ_ONCE(meta->type)) {
        case FRAME_META_TYPE_PT_REGS:
                return kunwind_next_regs_pc(state);
        case FRAME_META_TYPE_FINAL:
                if (meta == &task_pt_regs(tsk)->stackframe)
                        return -ENOENT;
                /* A final record anywhere else is unexpected. */
                WARN_ON_ONCE(tsk == current);
                return -EINVAL;
        default:
                WARN_ON_ONCE(tsk == current);
                return -EINVAL;
        }
}

/*
 * Advance from the frame record at state->common.fp to the one it links to.
 *
 * Returns 0 once fp/pc have been updated from the record, -EINVAL if the
 * record is misaligned or not on an accessible stack, or whatever
 * kunwind_next_frame_record_meta() returns when the record holds metadata
 * rather than a caller.
 */
static __always_inline int
kunwind_next_frame_record(struct kunwind_state *state)
{
        unsigned long cur_fp = state->common.fp;
        unsigned long next_fp, next_pc;
        struct frame_record *rec;
        struct stack_info *stack;

        /* The record address must be 8-byte aligned. */
        if (cur_fp & 0x7)
                return -EINVAL;

        stack = unwind_find_stack(&state->common, cur_fp, sizeof(*rec));
        if (!stack)
                return -EINVAL;

        rec = (struct frame_record *)cur_fp;
        next_fp = READ_ONCE(rec->fp);
        next_pc = READ_ONCE(rec->lr);

        /* A record with both fields zero is handled as metadata. */
        if (!(next_fp || next_pc))
                return kunwind_next_frame_record_meta(state);

        unwind_consume_stack(&state->common, stack, cur_fp, sizeof(*rec));

        state->source = KUNWIND_SOURCE_FRAME;
        state->common.pc = next_pc;
        state->common.fp = next_fp;

        return 0;
}

/*
 * Unwind from one frame record (A) to the next frame record (B).
 *
 * The unwind stops early if the location of B indicates a malformed chain of
 * frame records (e.g. a cycle); this is decided from the location and fp
 * value of A and the location (but not the fp value) of B.
 *
 * The per-entry flags are cleared first. On success the new PC has its
 * pointer authentication code stripped and any tracer trampoline address
 * replaced by the original return address.
 *
 * Returns 0 on success or a negative error code when the unwind must stop.
 */
static __always_inline int
kunwind_next(struct kunwind_state *state)
{
        int ret;

        state->flags.all = 0;

        /* Only a state with a known source can be advanced. */
        switch (state->source) {
        case KUNWIND_SOURCE_FRAME:
        case KUNWIND_SOURCE_CALLER:
        case KUNWIND_SOURCE_TASK:
        case KUNWIND_SOURCE_REGS_PC:
                break;
        default:
                return -EINVAL;
        }

        ret = kunwind_next_frame_record(state);
        if (ret)
                return ret;

        state->common.pc = ptrauth_strip_kernel_insn_pac(state->common.pc);

        return kunwind_recover_return_address(state);
}

/*
 * Callback invoked by do_kunwind() for each unwind state it visits. @cookie
 * is passed through unchanged; returning false stops the unwind.
 */
typedef bool (*kunwind_consume_fn)(const struct kunwind_state *state, void *cookie);

/*
 * Run the unwind loop on an already initialised @state.
 *
 * The starting PC is fixed up first; if that fails nothing is reported.
 * After that each state is handed to @consume_state, and the loop ends when
 * the callback returns false or when the unwinder cannot take another step.
 */
static __always_inline void
do_kunwind(struct kunwind_state *state, kunwind_consume_fn consume_state,
           void *cookie)
{
        if (kunwind_recover_return_address(state) != 0)
                return;

        while (consume_state(state, cookie)) {
                if (kunwind_next(state) < 0)
                        break;
        }
}

/*
 * Per-cpu stacks are only accessible when unwinding the current task in a
 * non-preemptible context.
 *
 * Evaluates to stackinfo_get_<name>() in that case and to
 * stackinfo_get_unknown() otherwise. The expansion refers to a variable
 * called 'task', which must be in scope at the point of use.
 */
#define STACKINFO_CPU(name)                                        \
        ({                                                        \
                ((task == current) && !preemptible())                \
                        ? stackinfo_get_##name()                \
                        : stackinfo_get_unknown();                \
        })

/*
 * SDEI stacks are only accessible when unwinding the current task in an NMI
 * context.
 *
 * Evaluates to stackinfo_get_sdei_<name>() in that case and to
 * stackinfo_get_unknown() otherwise. The expansion refers to a variable
 * called 'task', which must be in scope at the point of use.
 */
#define STACKINFO_SDEI(name)                                        \
        ({                                                        \
                ((task == current) && in_nmi())                        \
                        ? stackinfo_get_sdei_##name()                \
                        : stackinfo_get_unknown();                \
        })

/*
 * The EFI stack is only made available when unwinding the current task while
 * current_in_efi() is true; otherwise this evaluates to
 * stackinfo_get_unknown(). The expansion refers to a variable called 'task',
 * which must be in scope at the point of use.
 */
#define STACKINFO_EFI                                                \
        ({                                                        \
                ((task == current) && current_in_efi())                \
                        ? stackinfo_get_efi()                        \
                        : stackinfo_get_unknown();                \
        })

/*
 * Unwind the kernel stack of @task, passing every unwind state to
 * @consume_state together with @cookie.
 *
 * Starting point:
 *   - @regs given:      the PC in @regs; only supported for the current task,
 *                       for any other task nothing is reported;
 *   - @task is current: the caller of the function this is inlined into;
 *   - otherwise:        the saved context of the blocked @task.
 */
static __always_inline void
kunwind_stack_walk(kunwind_consume_fn consume_state,
                   void *cookie, struct task_struct *task,
                   struct pt_regs *regs)
{
        /* The STACKINFO_*() macros below pick up the 'task' parameter. */
        struct stack_info stacks[] = {
                stackinfo_get_task(task),
                STACKINFO_CPU(irq),
#if defined(CONFIG_VMAP_STACK)
                STACKINFO_CPU(overflow),
#endif
#if defined(CONFIG_VMAP_STACK) && defined(CONFIG_ARM_SDE_INTERFACE)
                STACKINFO_SDEI(normal),
                STACKINFO_SDEI(critical),
#endif
#ifdef CONFIG_EFI
                STACKINFO_EFI,
#endif
        };
        struct kunwind_state state = {
                .common = {
                        .nr_stacks = ARRAY_SIZE(stacks),
                        .stacks = stacks,
                },
        };

        if (!regs) {
                if (task == current)
                        kunwind_init_from_caller(&state);
                else
                        kunwind_init_from_task(&state, task);
        } else if (task == current) {
                kunwind_init_from_regs(&state, regs);
        } else {
                /* Unwinding from regs is only done for the current task. */
                return;
        }

        do_kunwind(&state, consume_state, cookie);
}

/*
 * Cookie handed to arch_kunwind_consume_entry(): bundles the generic
 * stack-trace callback with the cookie that callback expects.
 */
struct kunwind_consume_entry_data {
        stack_trace_consume_fn consume_entry;
        void *cookie;
};

/*
 * Adapter between the kunwind callback signature and a generic stack-trace
 * callback: forwards only the PC of @state to the wrapped callback and
 * returns its verdict on whether to continue.
 */
static __always_inline bool
arch_kunwind_consume_entry(const struct kunwind_state *state, void *cookie)
{
        struct kunwind_consume_entry_data *wrapped = cookie;

        return wrapped->consume_entry(wrapped->cookie, state->common.pc);
}

/*
 * Walk the kernel stack of @task, calling @consume_entry with @cookie and
 * the PC of each entry until the callback returns false or the unwind ends.
 * See kunwind_stack_walk() for how @task and @regs select the starting
 * point.
 */
noinline noinstr void arch_stack_walk(stack_trace_consume_fn consume_entry,
                              void *cookie, struct task_struct *task,
                              struct pt_regs *regs)
{
        struct kunwind_consume_entry_data data;

        data.consume_entry = consume_entry;
        data.cookie = cookie;

        kunwind_stack_walk(arch_kunwind_consume_entry, &data, task, regs);
}

/*
 * Cookie handed to arch_bpf_unwind_consume_entry(): bundles the BPF
 * callback (which takes ip, sp and fp) with the cookie it expects.
 */
struct bpf_unwind_consume_entry_data {
        bool (*consume_entry)(void *cookie, u64 ip, u64 sp, u64 fp);
        void *cookie;
};

/*
 * Adapter between the kunwind callback signature and the BPF callback:
 * forwards the PC and frame pointer of @state. The stack pointer argument
 * is always passed as 0.
 */
static bool
arch_bpf_unwind_consume_entry(const struct kunwind_state *state, void *cookie)
{
        struct bpf_unwind_consume_entry_data *wrapped = cookie;

        return wrapped->consume_entry(wrapped->cookie,
                                      state->common.pc,
                                      0,
                                      state->common.fp);
}

/*
 * Walk the current task's kernel stack starting at the caller of this
 * function, passing ip/sp/fp of each entry to @consume_entry until it
 * returns false or the unwind ends. sp is always reported as 0.
 */
noinline noinstr void arch_bpf_stack_walk(bool (*consume_entry)(void *cookie, u64 ip, u64 sp,
                                                                u64 fp), void *cookie)
{
        struct bpf_unwind_consume_entry_data data;

        data.consume_entry = consume_entry;
        data.cookie = cookie;

        kunwind_stack_walk(arch_bpf_unwind_consume_entry, &data, current, NULL);
}

/*
 * Map the source of @state to the one-letter tag used in backtrace output.
 * Entries that came from an ordinary frame record get no tag (NULL).
 */
static const char *state_source_string(const struct kunwind_state *state)
{
        const char *tag;

        switch (state->source) {
        case KUNWIND_SOURCE_FRAME:
                tag = NULL;
                break;
        case KUNWIND_SOURCE_CALLER:
                tag = "C";
                break;
        case KUNWIND_SOURCE_TASK:
                tag = "T";
                break;
        case KUNWIND_SOURCE_REGS_PC:
                tag = "P";
                break;
        default:
                tag = "U";
                break;
        }

        return tag;
}

/*
 * kunwind callback that prints one backtrace line.
 *
 * @arg is the log level string. The symbolised PC is followed, in
 * parentheses, by the source tag and the F/K flag letters when any of them
 * apply. Always returns true so that the unwind continues.
 */
static bool dump_backtrace_entry(const struct kunwind_state *state, void *arg)
{
        const char *src_tag = state_source_string(state);
        union unwind_flags flags = state->flags;
        const char *open_paren = "";
        const char *close_paren = "";
        char *loglvl = arg;

        /* Only emit the parentheses when there is something to put in them. */
        if (src_tag || flags.all) {
                open_paren = " (";
                close_paren = ")";
        }
        if (!src_tag)
                src_tag = "";

        printk("%s %pSb%s%s%s%s%s\n", loglvl,
                (void *)state->common.pc,
                open_paren,
                src_tag,
                flags.fgraph ? "F" : "",
                flags.kretprobe ? "K" : "",
                close_paren);

        return true;
}

/*
 * Print a "Call trace:" header followed by the kernel backtrace of @tsk at
 * log level @loglvl.
 *
 * @regs: optional starting register set; nothing is printed if it describes
 *        user mode.
 * @tsk:  task to unwind; NULL means current.
 *
 * Nothing is printed if a reference to the task's stack cannot be taken.
 */
void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
                    const char *loglvl)
{
        pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);

        if (regs && user_mode(regs))
                return;

        tsk = tsk ? tsk : current;

        /* Hold the stack for the duration of the walk. */
        if (try_get_task_stack(tsk)) {
                printk("%sCall trace:\n", loglvl);
                kunwind_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs);
                put_task_stack(tsk);
        }
}

/*
 * Print the kernel backtrace of @tsk at log level @loglvl by calling
 * dump_backtrace() without a register set. @sp is not used.
 */
void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
{
        dump_backtrace(NULL, tsk, loglvl);
        /* NOTE(review): presumably keeps the call above from becoming a tail call - confirm. */
        barrier();
}

/*
 * The struct defined for userspace stack frame in AARCH64 mode.
 *
 * @fp: user pointer to the next frame record in the chain.
 * @lr: return address stored in this frame record.
 */
struct frame_tail {
        struct frame_tail        __user *fp;
        unsigned long                lr;
} __attribute__((packed));

/*
 * Report the return address held in the user frame record at @tail and
 * return a pointer to the next record.
 *
 * Returns NULL to end the walk when the record cannot be read, when
 * @consume_entry asks to stop, or when the next frame pointer does not lie
 * strictly above @tail.
 */
static struct frame_tail __user *
unwind_user_frame(struct frame_tail __user *tail, void *cookie,
               stack_trace_consume_fn consume_entry)
{
        struct frame_tail record;
        unsigned long uncopied;

        /* The whole record must be an acceptable user range. */
        if (!access_ok(tail, sizeof(record)))
                return NULL;

        /* Copy with page faults disabled. */
        pagefault_disable();
        uncopied = __copy_from_user_inatomic(&record, tail, sizeof(record));
        pagefault_enable();

        if (uncopied)
                return NULL;

        /* Strip any pointer authentication code before reporting. */
        if (!consume_entry(cookie, ptrauth_strip_user_insn_pac(record.lr)))
                return NULL;

        /*
         * Frame pointers should strictly progress back up the stack
         * (towards higher addresses).
         */
        if (tail >= record.fp)
                return NULL;

        return record.fp;
}

#ifdef CONFIG_COMPAT
/*
 * The registers we're interested in are at the end of the variable
 * length saved register structure. The fp points at the end of this
 * structure so the address of this struct is:
 * (struct compat_frame_tail *)(xxx->fp)-1
 *
 * Of the three fields, unwind_compat_user_frame() reads only fp and lr;
 * sp is copied along with them but not otherwise used there.
 *
 * This code has been adapted from the ARM OProfile support.
 */
struct compat_frame_tail {
        compat_uptr_t        fp; /* a (struct compat_frame_tail *) in compat mode */
        u32                sp;
        u32                lr;
} __attribute__((packed));

/*
 * Report the return address held in the compat user frame at @tail and
 * return a pointer to the next frame tail, which sits immediately below the
 * address held in the saved fp.
 *
 * Returns NULL to end the walk when the frame cannot be read, when
 * @consume_entry asks to stop, or when the saved fp does not lie strictly
 * above the end of the current frame tail.
 */
static struct compat_frame_tail __user *
unwind_compat_user_frame(struct compat_frame_tail __user *tail, void *cookie,
                                stack_trace_consume_fn consume_entry)
{
        struct compat_frame_tail __user *next;
        struct compat_frame_tail record;
        unsigned long uncopied;

        /* The whole frame tail must be an acceptable user range. */
        if (!access_ok(tail, sizeof(record)))
                return NULL;

        /* Copy with page faults disabled. */
        pagefault_disable();
        uncopied = __copy_from_user_inatomic(&record, tail, sizeof(record));
        pagefault_enable();

        if (uncopied)
                return NULL;

        if (!consume_entry(cookie, record.lr))
                return NULL;

        /*
         * Frame pointers should strictly progress back up the stack
         * (towards higher addresses).
         */
        next = (struct compat_frame_tail __user *)compat_ptr(record.fp);
        if (tail + 1 >= next)
                return NULL;

        return next - 1;
}
#endif /* CONFIG_COMPAT */


/*
 * Walk the user stack described by @regs.
 *
 * The PC in @regs is reported first. After that the frame-pointer chain is
 * followed, in either AArch64 or (when CONFIG_COMPAT is enabled) AArch32
 * compat layout, until @consume_entry returns false, the chain ends, or a
 * frame pointer is misaligned.
 */
void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
                                        const struct pt_regs *regs)
{
        if (!consume_entry(cookie, regs->pc))
                return;

        if (compat_user_mode(regs)) {
#ifdef CONFIG_COMPAT
                /* AARCH32 compat mode */
                struct compat_frame_tail __user *ctail;

                ctail = (struct compat_frame_tail __user *)regs->compat_fp - 1;
                /* Compat frame tails must be 4-byte aligned. */
                while (ctail && ((unsigned long)ctail & 0x3) == 0)
                        ctail = unwind_compat_user_frame(ctail, cookie, consume_entry);
#endif
        } else {
                /* AARCH64 mode */
                struct frame_tail __user *tail;

                tail = (struct frame_tail __user *)regs->regs[29];
                /* Frame records must be 8-byte aligned. */
                while (tail && ((unsigned long)tail & 0x7) == 0)
                        tail = unwind_user_frame(tail, cookie, consume_entry);
        }
}

















































































































  196 

  196 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/exec.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * #!-checking implemented by tytso.
 */
/*
 * Demand-loading implemented 01.12.91 - no need to read anything but
 * the header into memory. The inode of the executable is put into
 * "current->executable", and page faults do the actual loading. Clean.
 *
 * Once more I can proudly say that linux stood up to being changed: it
 * was less than 2 hours work to get demand-loading completely implemented.
 *
 * Demand loading changed July 1993 by Eric Youngdale.   Use mmap instead,
 * current->executable is only used by the procfs.  This allows a dispatch
 * table to check for several different types  of binary formats.  We keep
 * trying until we recognize the file or we run out of supported binary
 * formats.
 */

#include <linux/kernel_read_file.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/stat.h>
#include <linux/fcntl.h>
#include <linux/swap.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/signal.h>
#include <linux/sched/numa_balancing.h>
#include <linux/sched/task.h>
#include <linux/pagemap.h>
#include <linux/perf_event.h>
#include <linux/highmem.h>
#include <linux/spinlock.h>
#include <linux/key.h>
#include <linux/personality.h>
#include <linux/binfmts.h>
#include <linux/utsname.h>
#include <linux/pid_namespace.h>
#include <linux/module.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/audit.h>
#include <linux/kmod.h>
#include <linux/fsnotify.h>
#include <linux/fs_struct.h>
#include <linux/oom.h>
#include <linux/compat.h>
#include <linux/vmalloc.h>
#include <linux/io_uring.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
#include <linux/rseq.h>
#include <linux/ksm.h>

#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

#include <trace/events/task.h>
#include "internal.h"

#include <trace/events/sched.h>

/* Forward declaration: begin_new_exec() calls this before its definition. */
static int bprm_creds_from_file(struct linux_binprm *bprm);

int suid_dumpable = 0;

/* Registered binary format handlers (struct linux_binfmt, linked via ->lh). */
static LIST_HEAD(formats);
/* Guards 'formats': taken for write to add/remove, for read to walk it. */
static DEFINE_RWLOCK(binfmt_lock);

/*
 * Link @fmt into the list of binary format handlers while holding
 * binfmt_lock for writing.  A non-zero @insert places the handler at the
 * head of the list; zero appends it at the tail.
 */
void __register_binfmt(struct linux_binfmt * fmt, int insert)
{
        write_lock(&binfmt_lock);
        if (insert)
                list_add(&fmt->lh, &formats);
        else
                list_add_tail(&fmt->lh, &formats);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(__register_binfmt);

/*
 * Unlink @fmt from the list of binary format handlers while holding
 * binfmt_lock for writing.
 */
void unregister_binfmt(struct linux_binfmt * fmt)
{
        write_lock(&binfmt_lock);
        list_del(&fmt->lh);
        write_unlock(&binfmt_lock);
}

EXPORT_SYMBOL(unregister_binfmt);

/*
 * Drop the reference on the module that owns @fmt.  In uselib() this
 * balances the try_module_get() taken before calling into the handler.
 */
static inline void put_binfmt(struct linux_binfmt * fmt)
{
        module_put(fmt->module);
}

/*
 * Report whether execution is forbidden for @path: true if either the
 * mount carries MNT_NOEXEC or its superblock carries SB_I_NOEXEC.
 */
bool path_noexec(const struct path *path)
{
        if (path->mnt->mnt_flags & MNT_NOEXEC)
                return true;
        if (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)
                return true;
        return false;
}

#ifdef CONFIG_USELIB
/*
 * uselib() - load a shared library through the registered binary formats.
 *
 * For security reasons a shared library must be both readable and
 * executable.  The load address is taken from the file itself.
 *
 * Each registered format that provides ->load_shlib is tried in list order
 * until one returns something other than -ENOEXEC.  Returns the handler's
 * result, -ENOEXEC if no handler accepted the file, -EACCES if the file is
 * not a regular file or lives on a noexec mount, or the error from
 * getname()/do_filp_open().
 */
SYSCALL_DEFINE1(uselib, const char __user *, library)
{
        static const struct open_flags uselib_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY,
                .acc_mode = MAY_READ | MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };
        struct linux_binfmt *fmt;
        struct filename *tmp;
        struct file *file;
        int error;

        tmp = getname(library);
        if (IS_ERR(tmp))
                return PTR_ERR(tmp);

        file = do_filp_open(AT_FDCWD, tmp, &uselib_flags);
        putname(tmp);
        if (IS_ERR(file))
                return PTR_ERR(file);

        /*
         * Check do_open_execat() for an explanation.
         */
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
            path_noexec(&file->f_path)) {
                error = -EACCES;
                goto exit;
        }

        error = -ENOEXEC;

        read_lock(&binfmt_lock);
        list_for_each_entry(fmt, &formats, lh) {
                /* Skip formats without a loader or whose module is going away. */
                if (!fmt->load_shlib || !try_module_get(fmt->module))
                        continue;

                /* The handler runs without the list lock held. */
                read_unlock(&binfmt_lock);
                error = fmt->load_shlib(file);
                read_lock(&binfmt_lock);
                put_binfmt(fmt);

                if (error != -ENOEXEC)
                        break;
        }
        read_unlock(&binfmt_lock);
exit:
        fput(file);
        return error;
}
#endif /* #ifdef CONFIG_USELIB */

#ifdef CONFIG_MMU
/*
 * The nascent bprm->mm is not visible until exec_mmap() but it can
 * use a lot of memory, so account these pages in current->mm temporarily
 * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
 * change the counter back via acct_arg_size(0).
 *
 * @pages is the new total; only the difference from bprm->vma_pages is
 * applied to the MM_ANONPAGES counter.
 */
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
        struct mm_struct *mm = current->mm;
        long delta;

        if (!mm)
                return;

        delta = (long)(pages - bprm->vma_pages);
        if (delta == 0)
                return;

        bprm->vma_pages = pages;
        add_mm_counter(mm, MM_ANONPAGES, delta);
}

/*
 * Look up (and pin) the page backing address @pos in the new mm's
 * temporary stack.  With @write set the page is requested writable and
 * the argument-page accounting is refreshed from the stack VMA size.
 *
 * Returns the page, to be released with put_arg_page(), or NULL on failure.
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        struct mm_struct *mm = bprm->mm;
        struct vm_area_struct *vma = bprm->vma;
        unsigned int gup_flags = 0;
        struct page *page;
        int nr_pinned;

        if (write)
                gup_flags = FOLL_WRITE;

        /*
         * Avoid relying on expanding the stack down in GUP (which
         * does not work for STACK_GROWSUP anyway), and just do it
         * ahead of time.
         */
        if (!mmap_read_lock_maybe_expand(mm, vma, pos, write))
                return NULL;

        /*
         * We are doing an exec().  'current' is the process
         * doing the exec and 'mm' is the new process's mm.
         */
        nr_pinned = get_user_pages_remote(mm, pos, 1, gup_flags, &page, NULL);
        mmap_read_unlock(mm);
        if (nr_pinned <= 0)
                return NULL;

        if (write)
                acct_arg_size(bprm, vma_pages(vma));

        return page;
}

/* Release a page obtained from get_arg_page() by dropping its reference. */
static void put_arg_page(struct page *page)
{
        put_page(page);
}

/*
 * Nothing to free in the CONFIG_MMU build: this variant keeps no page
 * array of its own (contrast with the !CONFIG_MMU version below).
 */
static void free_arg_pages(struct linux_binprm *bprm)
{
}

/*
 * Flush the cache for the argument page @page, which is mapped at address
 * @pos in the temporary stack VMA.
 */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
        flush_cache_page(bprm->vma, pos, page_to_pfn(page));
}

/*
 * CONFIG_MMU part of bprm_mm_init(): allocate a one-page anonymous stack
 * VMA ending at STACK_TOP_MAX, insert it into bprm->mm and point bprm->p
 * just below its end.
 *
 * Returns 0 on success, -ENOMEM if the VMA cannot be allocated, -EINTR if
 * taking the mmap write lock was interrupted, or the error returned by
 * ksm_execve() / insert_vm_struct().  On failure bprm->vma is reset to NULL
 * and the VMA is freed.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        int err;
        struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = bprm->mm;

        bprm->vma = vma = vm_area_alloc(mm);
        if (!vma)
                return -ENOMEM;
        vma_set_anonymous(vma);

        if (mmap_write_lock_killable(mm)) {
                err = -EINTR;
                goto err_free;
        }

        /*
         * Need to be called with mmap write lock
         * held, to avoid race with ksmd.
         */
        err = ksm_execve(mm);
        if (err)
                goto err_ksm;

        /*
         * Place the stack at the largest stack address the architecture
         * supports. Later, we'll move this to an appropriate place. We don't
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
        BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
        vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP);
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);

        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;

        /* The single stack page is the only mapping so far. */
        mm->stack_vm = mm->total_vm = 1;
        mmap_write_unlock(mm);
        /* Start just below the top of the stack, leaving one pointer-sized slot. */
        bprm->p = vma->vm_end - sizeof(void *);
        return 0;
        /* Unwind in reverse order of setup. */
err:
        ksm_exit(mm);
err_ksm:
        mmap_write_unlock(mm);
err_free:
        bprm->vma = NULL;
        vm_area_free(vma);
        return err;
}

/*
 * CONFIG_MMU variant: a single argument string is acceptable as long as
 * its length does not exceed MAX_ARG_STRLEN.  @bprm is unused here.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= MAX_ARG_STRLEN;
}

#else

/* !CONFIG_MMU: argument pages are not accounted, so this is a no-op. */
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
{
}

/*
 * !CONFIG_MMU variant: return the holding page that covers offset @pos in
 * bprm->page[].  If the slot is empty and @write is set, a zeroed page is
 * allocated and stored in the slot first.
 *
 * Returns the page, or NULL if the slot is empty (read access) or the
 * allocation failed (write access).
 */
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
                int write)
{
        unsigned long idx = pos / PAGE_SIZE;
        struct page *page = bprm->page[idx];

        /* Already populated, or a read of an empty slot: nothing to allocate. */
        if (page || !write)
                return page;

        page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
        if (page)
                bprm->page[idx] = page;

        return page;
}

/*
 * !CONFIG_MMU: get_arg_page() hands out pages straight from bprm->page[]
 * without taking a reference, so there is nothing to release here.
 */
static void put_arg_page(struct page *page)
{
}

/* Free holding page number @i of @bprm, if present, and clear its slot. */
static void free_arg_page(struct linux_binprm *bprm, int i)
{
        if (!bprm->page[i])
                return;

        __free_page(bprm->page[i]);
        bprm->page[i] = NULL;
}

/* Free every holding page in bprm->page[] (all MAX_ARG_PAGES slots). */
static void free_arg_pages(struct linux_binprm *bprm)
{
        int idx = 0;

        while (idx < MAX_ARG_PAGES) {
                free_arg_page(bprm, idx);
                idx++;
        }
}

/* !CONFIG_MMU: no cache flush is performed for argument pages. */
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
                struct page *page)
{
}

/*
 * !CONFIG_MMU part of bprm_mm_init(): arguments are collected in the
 * bprm->page[] holding pages, so just point bprm->p at the top of that
 * MAX_ARG_PAGES-sized area, leaving one pointer-sized slot.  Always
 * returns 0.
 */
static int __bprm_mm_init(struct linux_binprm *bprm)
{
        bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
        return 0;
}

/*
 * !CONFIG_MMU variant: a string fits only if it is no longer than the
 * space still available below bprm->p in the holding pages.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
        return len <= bprm->p;
}

#endif /* CONFIG_MMU */

/*
 * Create a new mm_struct and populate it with a temporary stack
 * vm_area_struct.  We don't have enough context at this point to set the stack
 * flags, permissions, and offset, so we use temporary values.  We'll update
 * them later in setup_arg_pages().
 *
 * Returns 0 on success, -ENOMEM if the mm cannot be allocated, or the error
 * from __bprm_mm_init().  On failure bprm->mm is left NULL.
 */
static int bprm_mm_init(struct linux_binprm *bprm)
{
        struct mm_struct *mm;
        int err;

        mm = mm_alloc();
        bprm->mm = mm;
        if (!mm)
                return -ENOMEM;

        /* Save current stack limit for all calculations made during exec. */
        task_lock(current->group_leader);
        bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
        task_unlock(current->group_leader);

        err = __bprm_mm_init(bprm);
        if (!err)
                return 0;

        /* Setup failed: detach the mm from bprm and drop it. */
        bprm->mm = NULL;
        mmdrop(mm);

        return err;
}

/*
 * A user-space argv/envp pointer array that may be in either native or
 * compat layout.  get_user_arg_ptr() uses is_compat to choose which union
 * member to read through.
 */
struct user_arg_ptr {
#ifdef CONFIG_COMPAT
        /* True when the array holds compat_uptr_t entries (ptr.compat). */
        bool is_compat;
#endif
        union {
                /* Array of native user pointers to strings. */
                const char __user *const __user *native;
#ifdef CONFIG_COMPAT
                /* Array of compat user pointers to strings. */
                const compat_uptr_t __user *compat;
#endif
        } ptr;
};

/*
 * Fetch entry @nr of the user pointer array @argv, handling both the
 * native and (with CONFIG_COMPAT) the compat layout.
 *
 * Returns the user pointer stored in that slot (possibly NULL), or
 * ERR_PTR(-EFAULT) if the slot cannot be read.
 */
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
{
        const char __user *uptr;

#ifdef CONFIG_COMPAT
        if (unlikely(argv.is_compat)) {
                compat_uptr_t cptr;

                if (get_user(cptr, argv.ptr.compat + nr))
                        return ERR_PTR(-EFAULT);

                return compat_ptr(cptr);
        }
#endif

        if (get_user(uptr, argv.ptr.native + nr))
                return ERR_PTR(-EFAULT);

        return uptr;
}

/*
 * count() counts the number of strings in array ARGV.
 */
static int count(struct user_arg_ptr argv, int max)
{
        int i = 0;

        if (argv.ptr.native != NULL) {
                for (;;) {
                        const char __user *p = get_user_arg_ptr(argv, i);

                        if (!p)
                                break;

                        if (IS_ERR(p))
                                return -EFAULT;

                        if (i >= max)
                                return -E2BIG;
                        ++i;

                        if (fatal_signal_pending(current))
                                return -ERESTARTNOHAND;
                        cond_resched();
                }
        }
        return i;
}

/*
 * Count the strings in the NULL-terminated kernel array @argv.
 *
 * Returns the count (0 for a NULL array), -E2BIG if an entry exists at
 * index MAX_ARG_STRINGS or beyond, or -ERESTARTNOHAND if a fatal signal is
 * pending.
 */
static int count_strings_kernel(const char *const *argv)
{
        int n = 0;

        if (!argv)
                return 0;

        while (argv[n]) {
                if (n >= MAX_ARG_STRINGS)
                        return -E2BIG;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
                n++;
        }

        return n;
}

/*
 * Record the lowest address argument strings may reach: with CONFIG_MMU,
 * bprm->argmin is set @limit bytes below the current bprm->p.
 *
 * Returns -E2BIG if bprm->p is smaller than @limit (the subtraction would
 * underflow), 0 otherwise.  Without CONFIG_MMU this does nothing and
 * returns 0.
 */
static inline int bprm_set_stack_limit(struct linux_binprm *bprm,
                                       unsigned long limit)
{
#ifdef CONFIG_MMU
        /* Avoid a pathological bprm->p. */
        if (bprm->p < limit)
                return -E2BIG;
        bprm->argmin = bprm->p - limit;
#endif
        return 0;
}
/*
 * Report whether bprm->p has dropped below the bprm->argmin floor set by
 * bprm_set_stack_limit().  Always false without CONFIG_MMU.
 */
static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm)
{
#ifdef CONFIG_MMU
        return bprm->p < bprm->argmin;
#else
        return false;
#endif
}

/*
 * Calculate bprm->argmin from:
 * - _STK_LIM
 * - ARG_MAX
 * - bprm->rlim_stack.rlim_cur
 * - bprm->argc
 * - bprm->envc
 * - bprm->p
 *
 * Returns 0 on success, or -E2BIG if the counts are negative, the pointer
 * space computation overflows, the pointers alone consume the whole limit,
 * or bprm_set_stack_limit() rejects the result.
 */
static int bprm_stack_limits(struct linux_binprm *bprm)
{
        unsigned long limit, ptr_size;

        /*
         * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM
         * (whichever is smaller) for the argv+env strings.
         * This ensures that:
         *  - the remaining binfmt code will not run out of stack space,
         *  - the program will have a reasonable amount of stack left
         *    to work from.
         */
        limit = _STK_LIM / 4 * 3;
        limit = min(limit, bprm->rlim_stack.rlim_cur / 4);
        /*
         * We've historically supported up to 32 pages (ARG_MAX)
         * of argument strings even with small stacks
         */
        limit = max_t(unsigned long, limit, ARG_MAX);
        /* Reject totally pathological counts. */
        if (bprm->argc < 0 || bprm->envc < 0)
                return -E2BIG;
        /*
         * We must account for the size of all the argv and envp pointers to
         * the argv and envp strings, since they will also take up space in
         * the stack. They aren't stored until much later when we can't
         * signal to the parent that the child has run out of stack space.
         * Instead, calculate it here so it's possible to fail gracefully.
         *
         * In the case of argc = 0, make sure there is space for adding an
         * empty string (which will bump argc to 1), to ensure confused
         * userspace programs don't start processing from argv[1], thinking
         * argc can never be 0, to keep them from walking envp by accident.
         * See do_execveat_common().
         */
        if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) ||
            check_mul_overflow(ptr_size, sizeof(void *), &ptr_size))
                return -E2BIG;
        /* The pointers must leave at least some room for the strings. */
        if (limit <= ptr_size)
                return -E2BIG;
        limit -= ptr_size;

        return bprm_set_stack_limit(bprm, limit);
}

/*
 * 'copy_strings()' copies argument/environment strings from the old
 * process's memory to the new process's stack.  The call to get_user_pages()
 * ensures the destination page is created and not swapped out.
 *
 * The @argc entries of @argv are processed from last to first, and each
 * string is copied from its end towards its start, so bprm->p moves
 * downwards as strings are added.
 *
 * Returns 0 on success; -EFAULT if a pointer or string cannot be read from
 * user space; -E2BIG if a string is too long, the stack limit is hit or a
 * destination page cannot be obtained; -ERESTARTNOHAND if a fatal signal is
 * pending.
 */
static int copy_strings(int argc, struct user_arg_ptr argv,
                        struct linux_binprm *bprm)
{
        /* Currently mapped destination page, its mapping and page-aligned address. */
        struct page *kmapped_page = NULL;
        char *kaddr = NULL;
        unsigned long kpos = 0;
        int ret;

        while (argc-- > 0) {
                const char __user *str;
                int len;
                unsigned long pos;

                ret = -EFAULT;
                str = get_user_arg_ptr(argv, argc);
                if (IS_ERR(str))
                        goto out;

                /*
                 * A zero result is treated as a fault.  NOTE(review): len is
                 * used below as the full byte count to copy, so it presumably
                 * includes the terminating NUL - confirm against
                 * strnlen_user().
                 */
                len = strnlen_user(str, MAX_ARG_STRLEN);
                if (!len)
                        goto out;

                ret = -E2BIG;
                if (!valid_arg_len(bprm, len))
                        goto out;

                /* We're going to work our way backwards. */
                pos = bprm->p;
                str += len;
                bprm->p -= len;
                if (bprm_hit_stack_limit(bprm))
                        goto out;

                while (len > 0) {
                        int offset, bytes_to_copy;

                        if (fatal_signal_pending(current)) {
                                ret = -ERESTARTNOHAND;
                                goto out;
                        }
                        cond_resched();

                        /*
                         * Bytes available below 'pos' within its page; a
                         * page-aligned 'pos' means a whole page is available.
                         */
                        offset = pos % PAGE_SIZE;
                        if (offset == 0)
                                offset = PAGE_SIZE;

                        bytes_to_copy = offset;
                        if (bytes_to_copy > len)
                                bytes_to_copy = len;

                        /* Step source and destination back by this chunk. */
                        offset -= bytes_to_copy;
                        pos -= bytes_to_copy;
                        str -= bytes_to_copy;
                        len -= bytes_to_copy;

                        /* Switch the mapping when the chunk lands in a different page. */
                        if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
                                struct page *page;

                                page = get_arg_page(bprm, pos, 1);
                                if (!page) {
                                        ret = -E2BIG;
                                        goto out;
                                }

                                if (kmapped_page) {
                                        flush_dcache_page(kmapped_page);
                                        kunmap_local(kaddr);
                                        put_arg_page(kmapped_page);
                                }
                                kmapped_page = page;
                                kaddr = kmap_local_page(kmapped_page);
                                kpos = pos & PAGE_MASK;
                                flush_arg_page(bprm, kpos, kmapped_page);
                        }
                        if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
                                ret = -EFAULT;
                                goto out;
                        }
                }
        }
        ret = 0;
out:
        /* Release whichever destination page is still mapped. */
        if (kmapped_page) {
                flush_dcache_page(kmapped_page);
                kunmap_local(kaddr);
                put_arg_page(kmapped_page);
        }
        return ret;
}

/*
 * Copy an argument/environment string from the kernel to the process's stack.
 *
 * The string, including its terminating NUL, is copied from its end
 * towards its start to the area just below bprm->p, and bprm->p is lowered
 * by the copied length.
 *
 * Returns 0 on success, -EFAULT if the computed length is zero, or -E2BIG
 * if the string is too long, the stack limit is hit or a destination page
 * cannot be obtained.
 */
int copy_string_kernel(const char *arg, struct linux_binprm *bprm)
{
        int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */;
        unsigned long pos = bprm->p;

        if (len == 0)
                return -EFAULT;
        if (!valid_arg_len(bprm, len))
                return -E2BIG;

        /* We're going to work our way backwards. */
        arg += len;
        bprm->p -= len;
        if (bprm_hit_stack_limit(bprm))
                return -E2BIG;

        while (len > 0) {
                /*
                 * Copy at most up to the start of the page containing the
                 * byte below 'pos'; a page-aligned 'pos' allows a full page.
                 */
                unsigned int bytes_to_copy = min_t(unsigned int, len,
                                min_not_zero(offset_in_page(pos), PAGE_SIZE));
                struct page *page;

                pos -= bytes_to_copy;
                arg -= bytes_to_copy;
                len -= bytes_to_copy;

                page = get_arg_page(bprm, pos, 1);
                if (!page)
                        return -E2BIG;
                flush_arg_page(bprm, pos & PAGE_MASK, page);
                memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy);
                put_arg_page(page);
        }

        return 0;
}
EXPORT_SYMBOL(copy_string_kernel);

/*
 * Copy the first @argc strings of the kernel array @argv onto the new
 * stack, last string first.
 *
 * Returns 0 on success, the negative error from copy_string_kernel(), or
 * -ERESTARTNOHAND if a fatal signal is pending.
 */
static int copy_strings_kernel(int argc, const char *const *argv,
                               struct linux_binprm *bprm)
{
        int idx;

        for (idx = argc - 1; idx >= 0; idx--) {
                int err = copy_string_kernel(argv[idx], bprm);

                if (err < 0)
                        return err;
                if (fatal_signal_pending(current))
                        return -ERESTARTNOHAND;
                cond_resched();
        }

        return 0;
}

#ifdef CONFIG_MMU

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 *
 * @bprm:             binprm whose temporary stack VMA (bprm->vma) is finalized
 * @stack_top:        requested top of the final stack
 * @executable_stack: EXSTACK_ENABLE_X forces VM_EXEC on, EXSTACK_DISABLE_X
 *                    forces it off, any other value keeps the default from
 *                    VM_STACK_FLAGS
 *
 * Returns 0 on success; -ENOMEM if the stack does not fit at the requested
 * location; -EINTR if taking the mmap write lock was interrupted; the error
 * from mprotect_fixup() or relocate_vma_down(); or -EFAULT if the final
 * stack expansion fails.
 */
int setup_arg_pages(struct linux_binprm *bprm,
                    unsigned long stack_top,
                    int executable_stack)
{
        unsigned long ret;
        unsigned long stack_shift;
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = bprm->vma;
        struct vm_area_struct *prev = NULL;
        unsigned long vm_flags;
        unsigned long stack_base;
        unsigned long stack_size;
        unsigned long stack_expand;
        unsigned long rlim_stack;
        struct mmu_gather tlb;
        struct vma_iterator vmi;

#ifdef CONFIG_STACK_GROWSUP
        /* Limit stack size */
        stack_base = bprm->rlim_stack.rlim_max;

        stack_base = calc_max_stack_size(stack_base);

        /* Add space for stack randomization. */
        if (current->flags & PF_RANDOMIZE)
                stack_base += (STACK_RND_MASK << PAGE_SHIFT);

        /* Make sure we didn't let the argument array grow too large. */
        if (vma->vm_end - vma->vm_start > stack_base)
                return -ENOMEM;

        stack_base = PAGE_ALIGN(stack_top - stack_base);

        stack_shift = vma->vm_start - stack_base;
        mm->arg_start = bprm->p - stack_shift;
        bprm->p = vma->vm_end - stack_shift;
#else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);

        /* The whole VMA must fit between mmap_min_addr and stack_top. */
        if (unlikely(stack_top < mmap_min_addr) ||
            unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
                return -ENOMEM;

        /* Distance the temporary stack has to move down. */
        stack_shift = vma->vm_end - stack_top;

        bprm->p -= stack_shift;
        mm->arg_start = bprm->p;
#endif

        /* Apply the same shift to bprm->exec as to the stack contents. */
        bprm->exec -= stack_shift;

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        vm_flags = VM_STACK_FLAGS;

        /*
         * Adjust stack execute permissions; explicitly enable for
         * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
         * (arch default) otherwise.
         */
        if (unlikely(executable_stack == EXSTACK_ENABLE_X))
                vm_flags |= VM_EXEC;
        else if (executable_stack == EXSTACK_DISABLE_X)
                vm_flags &= ~VM_EXEC;
        vm_flags |= mm->def_flags;
        /* Keep the "incomplete" marker until the relocation below is done. */
        vm_flags |= VM_STACK_INCOMPLETE_SETUP;

        vma_iter_init(&vmi, mm, vma->vm_start);

        tlb_gather_mmu(&tlb, mm);
        ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end,
                        vm_flags);
        tlb_finish_mmu(&tlb);

        if (ret)
                goto out_unlock;
        BUG_ON(prev != vma);

        if (unlikely(vm_flags & VM_EXEC)) {
                pr_warn_once("process '%pD4' started with executable stack\n",
                             bprm->file);
        }

        /* Move stack pages down in memory. */
        if (stack_shift) {
                /*
                 * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX.  Once
                 * the binfmt code determines where the new stack should reside, we shift it to
                 * its final location.
                 */
                ret = relocate_vma_down(vma, stack_shift);
                if (ret)
                        goto out_unlock;
        }

        /* mprotect_fixup is overkill to remove the temporary stack flags */
        vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);

        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
        stack_size = vma->vm_end - vma->vm_start;
        /*
         * Align this down to a page boundary as expand_stack
         * will align it up.
         */
        rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;

        /* Grow by the extra space, but never beyond the stack rlimit. */
        stack_expand = min(rlim_stack, stack_size + stack_expand);

#ifdef CONFIG_STACK_GROWSUP
        stack_base = vma->vm_start + stack_expand;
#else
        stack_base = vma->vm_end - stack_expand;
#endif
        current->mm->start_stack = bprm->p;
        ret = expand_stack_locked(vma, stack_base);
        if (ret)
                ret = -EFAULT;

out_unlock:
        mmap_write_unlock(mm);
        return ret;
}
EXPORT_SYMBOL(setup_arg_pages);

#else

/*
 * Transfer the program arguments and environment from the holding pages
 * onto the stack. The provided stack pointer is adjusted accordingly.
 *
 * @bprm:        binprm whose ->page[] array holds the strings; ->p is the
 *               offset of the lowest byte in use within that area
 * @sp_location: in: user stack address to copy below; out: lowest address
 *               written (updated only on success)
 *
 * Returns 0 on success or -EFAULT if a copy to user space fails.
 */
int transfer_args_to_stack(struct linux_binprm *bprm,
                           unsigned long *sp_location)
{
        unsigned long index, stop, sp;
        int ret = 0;

        stop = bprm->p >> PAGE_SHIFT;
        sp = *sp_location;

        /*
         * Walk from the last holding page down to page 'stop' inclusive.
         * The index is unsigned, so test before decrementing: when the
         * arguments reach into the first page (stop == 0) a condition of
         * the form "index >= stop" can never become false and the index
         * would wrap around past page[0].
         */
        for (index = MAX_ARG_PAGES; index-- > stop; ) {
                /* Only the lowest page in use can be partially filled. */
                unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
                char *src = kmap_local_page(bprm->page[index]) + offset;

                sp -= PAGE_SIZE - offset;
                if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
                        ret = -EFAULT;
                kunmap_local(src);
                if (ret)
                        goto out;
        }

        /* Rebase bprm->exec from holding-area offset to the user stack. */
        bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
        *sp_location = sp;

out:
        return ret;
}
EXPORT_SYMBOL(transfer_args_to_stack);

#endif /* CONFIG_MMU */

/*
 * Open @name relative to @fd for execution.
 *
 * @flags may contain AT_SYMLINK_NOFOLLOW, AT_EMPTY_PATH and AT_EXECVE_CHECK;
 * any other bit yields -EINVAL.
 *
 * Returns the opened file with write access denied, or an ERR_PTR:
 * the error from do_filp_open(), -EACCES for a non-regular file or a
 * noexec mount, or the error from exe_file_deny_write_access().
 *
 * On success, caller must call do_close_execat() on the returned
 * struct file to close it.
 */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
        int err;
        /* Declared with __free(fput); handed to the caller via no_free_ptr(). */
        struct file *file __free(fput) = NULL;
        struct open_flags open_exec_flags = {
                .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
                .acc_mode = MAY_EXEC,
                .intent = LOOKUP_OPEN,
                .lookup_flags = LOOKUP_FOLLOW,
        };

        if ((flags &
             ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0)
                return ERR_PTR(-EINVAL);
        if (flags & AT_SYMLINK_NOFOLLOW)
                open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW;
        if (flags & AT_EMPTY_PATH)
                open_exec_flags.lookup_flags |= LOOKUP_EMPTY;

        file = do_filp_open(fd, name, &open_exec_flags);
        if (IS_ERR(file))
                return file;

        /*
         * In the past the regular type check was here. It moved to may_open() in
         * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is
         * an invariant that all non-regular files error out before we get here.
         */
        if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) ||
            path_noexec(&file->f_path))
                return ERR_PTR(-EACCES);

        err = exe_file_deny_write_access(file);
        if (err)
                return ERR_PTR(err);

        return no_free_ptr(file);
}

/**
 * open_exec - Open a path name for execution
 *
 * @name: path name to open with the intent of executing it.
 *
 * Returns ERR_PTR on failure or allocated struct file on success.
 *
 * This is a thin wrapper around the internal do_open_execat(), resolving
 * @name relative to the current working directory with no extra flags.
 * Callers must call exe_file_allow_write_access() before fput() on release.
 * Also see do_close_execat().
 */
struct file *open_exec(const char *name)
{
        struct filename *filename = getname_kernel(name);
        struct file *f;

        if (IS_ERR(filename))
                return ERR_CAST(filename);

        f = do_open_execat(AT_FDCWD, filename, 0);
        putname(filename);

        return f;
}
EXPORT_SYMBOL(open_exec);

#if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC)
/*
 * Read @len bytes of @file, starting at offset @pos, into the user address
 * @addr, then flush the instruction cache for [addr, addr + len) if any
 * bytes were read.  Returns the vfs_read() result unchanged.
 */
ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
{
        ssize_t nread;

        nread = vfs_read(file, (void __user *)addr, len, &pos);
        if (nread <= 0)
                return nread;

        flush_icache_user_range(addr, addr + len);
        return nread;
}
EXPORT_SYMBOL(read_code);
#endif

/*
 * Maps the mm_struct mm into the current task struct.
 * On success, this function returns with exec_update_lock
 * held for writing.
 *
 * Returns 0 on success, or the error from down_write_killable() /
 * mmap_read_lock_killable() if a lock acquisition was interrupted; in
 * the error cases exec_update_lock is not held on return.
 */
static int exec_mmap(struct mm_struct *mm)
{
        struct task_struct *tsk;
        struct mm_struct *old_mm, *active_mm;
        int ret;

        /* Notify parent that we're no longer interested in the old VM */
        tsk = current;
        old_mm = current->mm;
        exec_mm_release(tsk, old_mm);

        ret = down_write_killable(&tsk->signal->exec_update_lock);
        if (ret)
                return ret;

        if (old_mm) {
                /*
                 * If there is a pending fatal signal perhaps a signal
                 * whose default action is to create a coredump get
                 * out and die instead of going through with the exec.
                 */
                ret = mmap_read_lock_killable(old_mm);
                if (ret) {
                        up_write(&tsk->signal->exec_update_lock);
                        return ret;
                }
        }

        task_lock(tsk);
        membarrier_exec_mmap(mm);

        /* Switch tsk->mm and tsk->active_mm to the new mm with IRQs off. */
        local_irq_disable();
        active_mm = tsk->active_mm;
        tsk->active_mm = mm;
        tsk->mm = mm;
        mm_init_cid(mm, tsk);
        /*
         * This prevents preemption while active_mm is being loaded and
         * it and mm are being updated, which could cause problems for
         * lazy tlb mm refcounting when these are updated by context
         * switches. Not all architectures can handle irqs off over
         * activate_mm yet.
         */
        if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        activate_mm(active_mm, mm);
        if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
                local_irq_enable();
        lru_gen_add_mm(mm);
        task_unlock(tsk);
        lru_gen_use_mm(mm);
        if (old_mm) {
                /* Had a user mm: release the read lock and drop our reference. */
                mmap_read_unlock(old_mm);
                BUG_ON(active_mm != old_mm);
                setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
                mm_update_next_owner(old_mm);
                mmput(old_mm);
                return 0;
        }
        /* No previous mm of our own: only the lazy-TLB active_mm reference is dropped. */
        mmdrop_lazy_tlb(active_mm);
        return 0;
}

/*
 * Make @tsk the only thread in its thread group and the group leader.
 *
 * All other threads are zapped and waited for.  If @tsk is not the current
 * leader, it waits for the leader to exit and then takes over its PID,
 * start time and position in the task lists before releasing it.
 *
 * Returns 0 on success, or -EAGAIN if another group exit/exec is already in
 * progress or a fatal signal arrives while waiting.
 */
static int de_thread(struct task_struct *tsk)
{
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *oldsighand = tsk->sighand;
        spinlock_t *lock = &oldsighand->siglock;

        if (thread_group_empty(tsk))
                goto no_thread_group;

        /*
         * Kill all other threads in the thread group.
         */
        spin_lock_irq(lock);
        if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) {
                /*
                 * Another group action in progress, just
                 * return so that the signal is processed.
                 */
                spin_unlock_irq(lock);
                return -EAGAIN;
        }

        sig->group_exec_task = tsk;
        sig->notify_count = zap_other_threads(tsk);
        /* A non-leader caller handles the leader separately below. */
        if (!thread_group_leader(tsk))
                sig->notify_count--;

        /* Sleep (killable) until notify_count reaches zero. */
        while (sig->notify_count) {
                __set_current_state(TASK_KILLABLE);
                spin_unlock_irq(lock);
                schedule();
                if (__fatal_signal_pending(tsk))
                        goto killed;
                spin_lock_irq(lock);
        }
        spin_unlock_irq(lock);

        /*
         * At this point all other threads have exited, all we have to
         * do is to wait for the thread group leader to become inactive,
         * and to assume its PID:
         */
        if (!thread_group_leader(tsk)) {
                struct task_struct *leader = tsk->group_leader;

                /*
                 * Loop until the leader has an exit_state.  The loop is left
                 * via 'break' with tasklist_lock and the cgroup threadgroup
                 * change section still held; both are released further down.
                 */
                for (;;) {
                        cgroup_threadgroup_change_begin(tsk);
                        write_lock_irq(&tasklist_lock);
                        /*
                         * Do this under tasklist_lock to ensure that
                         * exit_notify() can't miss ->group_exec_task
                         */
                        sig->notify_count = -1;
                        if (likely(leader->exit_state))
                                break;
                        __set_current_state(TASK_KILLABLE);
                        write_unlock_irq(&tasklist_lock);
                        cgroup_threadgroup_change_end(tsk);
                        schedule();
                        if (__fatal_signal_pending(tsk))
                                goto killed;
                }

                /*
                 * The only record we have of the real-time age of a
                 * process, regardless of execs it's done, is start_time.
                 * All the past CPU time is accumulated in signal_struct
                 * from sister threads now dead.  But in this non-leader
                 * exec, nothing survives from the original leader thread,
                 * whose birth marks the true age of this process now.
                 * When we take on its identity by switching to its PID, we
                 * also take its birthdate (always earlier than our own).
                 */
                tsk->start_time = leader->start_time;
                tsk->start_boottime = leader->start_boottime;

                BUG_ON(!same_thread_group(leader, tsk));
                /*
                 * An exec() starts a new thread group with the
                 * TGID of the previous thread group. Rehash the
                 * two threads with a switched PID, and release
                 * the former thread group leader:
                 */

                /* Become a process group leader with the old leader's pid.
                 * The old leader becomes a thread of this thread group.
                 */
                exchange_tids(tsk, leader);
                transfer_pid(leader, tsk, PIDTYPE_TGID);
                transfer_pid(leader, tsk, PIDTYPE_PGID);
                transfer_pid(leader, tsk, PIDTYPE_SID);

                /* Take the old leader's place in the task and sibling lists. */
                list_replace_rcu(&leader->tasks, &tsk->tasks);
                list_replace_init(&leader->sibling, &tsk->sibling);

                tsk->group_leader = tsk;
                leader->group_leader = tsk;

                tsk->exit_signal = SIGCHLD;
                leader->exit_signal = -1;

                BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                leader->exit_state = EXIT_DEAD;
                /*
                 * We are going to release_task()->ptrace_unlink() silently,
                 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
                 * the tracer won't block again waiting for this thread.
                 */
                if (unlikely(leader->ptrace))
                        __wake_up_parent(leader, leader->parent);
                write_unlock_irq(&tasklist_lock);
                cgroup_threadgroup_change_end(tsk);

                release_task(leader);
        }

        /* The group exec is complete. */
        sig->group_exec_task = NULL;
        sig->notify_count = 0;

no_thread_group:
        /* we have changed execution domain */
        tsk->exit_signal = SIGCHLD;

        BUG_ON(!thread_group_leader(tsk));
        return 0;

killed:
        /* protects against exit_notify() and __exit_signal() */
        read_lock(&tasklist_lock);
        sig->group_exec_task = NULL;
        sig->notify_count = 0;
        read_unlock(&tasklist_lock);
        return -EAGAIN;
}


/*
 * This function makes sure the current process has its own signal table,
 * so that flush_signal_handlers can later reset the handlers without
 * disturbing other processes.  (Other processes might share the signal
 * table via the CLONE_SIGHAND option to clone().)
 *
 * Returns 0 on success (including when the table was not shared), or
 * -ENOMEM if a private copy cannot be allocated.
 */
static int unshare_sighand(struct task_struct *me)
{
        struct sighand_struct *oldsighand = me->sighand;
        struct sighand_struct *newsighand;

        /* Sole user of the table: nothing to unshare. */
        if (refcount_read(&oldsighand->count) == 1)
                return 0;

        /*
         * This ->sighand is shared with the CLONE_SIGHAND
         * but not CLONE_THREAD task, switch to the new one.
         */
        newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
        if (!newsighand)
                return -ENOMEM;

        refcount_set(&newsighand->count, 1);

        /* Copy the handlers and publish the private table under both locks. */
        write_lock_irq(&tasklist_lock);
        spin_lock(&oldsighand->siglock);
        memcpy(newsighand->action, oldsighand->action,
               sizeof(newsighand->action));
        rcu_assign_pointer(me->sighand, newsighand);
        spin_unlock(&oldsighand->siglock);
        write_unlock_irq(&tasklist_lock);

        /* Drop our reference on the table we no longer use. */
        __cleanup_sighand(oldsighand);
        return 0;
}

/*
 * Copy @buf into @tsk->comm, truncating it to fit and zero-filling the
 * remainder of the array.  @exec is passed through to perf_event_comm().
 *
 * This is unlocked -- the string will always be NUL-terminated, but a
 * reader racing with the update may see a mix of old and new contents.
 */
void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec)
{
        const size_t room = sizeof(tsk->comm) - 1;
        size_t n = strlen(buf);

        /* Always leave space for at least one terminating NUL. */
        if (n > room)
                n = room;

        trace_task_rename(tsk, buf);
        memcpy(tsk->comm, buf, n);
        memset(tsk->comm + n, 0, sizeof(tsk->comm) - n);
        perf_event_comm(tsk, exec);
}

/*
 * begin_new_exec - tear down the old execution context of the current task
 * @bprm: the binary being executed
 *
 * Calling this is the point of no return: bprm->point_of_no_return is set
 * right after the credentials have been computed.  Failures after that are
 * never seen by userspace, since either the process is already taking a
 * fatal signal (via de_thread() or coredump), or bprm_execve() (see below)
 * forces SIGSEGV when it sees an error with ->point_of_no_return set.
 * Only a failure of bprm_creds_from_file(), which happens before the flag
 * is set, is returned as an ordinary error.
 *
 * On the out_unlock error path ->exec_update_lock is released
 * (NOTE(review): presumably taken inside exec_mmap() -- the acquisition is
 * not visible in this function), and ->cred_guard_mutex is released only if
 * the new credentials have already been committed (bprm->cred == NULL);
 * otherwise free_bprm() unlocks it.
 *
 * Returns 0 on success or a negative errno.
 */
int begin_new_exec(struct linux_binprm * bprm)
{
        struct task_struct *me = current;
        int retval;

        /* Once we are committed compute the creds */
        retval = bprm_creds_from_file(bprm);
        if (retval)
                return retval;

        /*
         * This tracepoint marks the point before flushing the old exec where
         * the current task is still unchanged, but errors are fatal (point of
         * no return). The later "sched_process_exec" tracepoint is called after
         * the current task has successfully switched to the new exec.
         */
        trace_sched_prepare_exec(current, bprm);

        /*
         * Ensure all future errors are fatal.
         */
        bprm->point_of_no_return = true;

        /* Make this the only thread in the thread group */
        retval = de_thread(me);
        if (retval)
                goto out;
        /* see the comment in check_unsafe_exec() */
        current->fs->in_exec = 0;
        /*
         * Cancel any io_uring activity across execve
         */
        io_uring_task_cancel();

        /* Ensure the files table is not shared. */
        retval = unshare_files();
        if (retval)
                goto out;

        /*
         * Must be called _before_ exec_mmap() as bprm->mm is
         * not visible until then. Doing it here also ensures
         * we don't race against replace_mm_exe_file().
         */
        retval = set_mm_exe_file(bprm->mm, bprm->file);
        if (retval)
                goto out;

        /* If the binary is not readable then enforce mm->dumpable=0 */
        would_dump(bprm, bprm->file);
        if (bprm->have_execfd)
                would_dump(bprm, bprm->executable);

        /*
         * Release all of the old mmap stuff
         */
        acct_arg_size(bprm, 0);
        retval = exec_mmap(bprm->mm);
        if (retval)
                goto out;

        /*
         * The mm now belongs to the task; clearing the pointer keeps
         * free_bprm() from dropping it.  Errors from here on use out_unlock.
         */
        bprm->mm = NULL;

        retval = exec_task_namespaces();
        if (retval)
                goto out_unlock;

#ifdef CONFIG_POSIX_TIMERS
        spin_lock_irq(&me->sighand->siglock);
        posix_cpu_timers_exit(me);
        spin_unlock_irq(&me->sighand->siglock);
        exit_itimers(me);
        flush_itimer_signals();
#endif

        /*
         * Make the signal table private.
         */
        retval = unshare_sighand(me);
        if (retval)
                goto out_unlock;

        me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC |
                                        PF_NOFREEZE | PF_NO_SETAFFINITY);
        flush_thread();
        me->personality &= ~bprm->per_clear;

        clear_syscall_work_syscall_user_dispatch(me);

        /*
         * We have to apply CLOEXEC before we change whether the process is
         * dumpable (in setup_new_exec) to avoid a race with a process in userspace
         * trying to access the should-be-closed file descriptors of a process
         * undergoing exec(2).
         */
        do_close_on_exec(me->files);

        if (bprm->secureexec) {
                /* Make sure parent cannot signal privileged process. */
                me->pdeath_signal = 0;

                /*
                 * For secureexec, reset the stack limit to sane default to
                 * avoid bad behavior from the prior rlimits. This has to
                 * happen before arch_pick_mmap_layout(), which examines
                 * RLIMIT_STACK, but after the point of no return to avoid
                 * needing to clean up the change on failure.
                 */
                if (bprm->rlim_stack.rlim_cur > _STK_LIM)
                        bprm->rlim_stack.rlim_cur = _STK_LIM;
        }

        /* Drop any alternate signal stack configured by the old image. */
        me->sas_ss_sp = me->sas_ss_size = 0;

        /*
         * Figure out dumpability. Note that this checking only of current
         * is wrong, but userspace depends on it. This should be testing
         * bprm->secureexec instead.
         */
        if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
            !(uid_eq(current_euid(), current_uid()) &&
              gid_eq(current_egid(), current_gid())))
                set_dumpable(current->mm, suid_dumpable);
        else
                set_dumpable(current->mm, SUID_DUMP_USER);

        perf_event_exec();

        /*
         * If the original filename was empty, alloc_bprm() made up a path
         * that will probably not be useful to admins running ps or similar.
         * Let's fix it up to be something reasonable.
         */
        if (bprm->comm_from_dentry) {
                /*
                 * Hold RCU lock to keep the name from being freed behind our back.
                 * Use acquire semantics to make sure the terminating NUL from
                 * __d_alloc() is seen.
                 *
                 * Note, we're deliberately sloppy here. We don't need to care about
                 * detecting a concurrent rename and just want a terminated name.
                 */
                rcu_read_lock();
                __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name),
                                true);
                rcu_read_unlock();
        } else {
                __set_task_comm(me, kbasename(bprm->filename), true);
        }

        /* An exec changes our domain. We are no longer part of the thread
           group */
        WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
        flush_signal_handlers(me, 0);

        retval = set_cred_ucounts(bprm->cred);
        if (retval < 0)
                goto out_unlock;

        /*
         * install the new credentials for this executable
         */
        security_bprm_committing_creds(bprm);

        /*
         * After the commit bprm->cred is cleared, which tells both the
         * out_unlock path below and free_bprm() that the creds are no
         * longer theirs to abort.
         */
        commit_creds(bprm->cred);
        bprm->cred = NULL;

        /*
         * Disable monitoring for regular users
         * when executing setuid binaries. Must
         * wait until new credentials are committed
         * by commit_creds() above
         */
        if (get_dumpable(me->mm) != SUID_DUMP_USER)
                perf_event_exit_task(me);
        /*
         * cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked.
         */
        security_bprm_committed_creds(bprm);

        /* Pass the opened binary to the interpreter. */
        if (bprm->have_execfd) {
                retval = get_unused_fd_flags(0);
                if (retval < 0)
                        goto out_unlock;
                fd_install(retval, bprm->executable);
                bprm->executable = NULL;
                bprm->execfd = retval;
        }
        return 0;

out_unlock:
        up_write(&me->signal->exec_update_lock);
        /* Creds already committed: nobody else will drop the mutex for us. */
        if (!bprm->cred)
                mutex_unlock(&me->signal->cred_guard_mutex);

out:
        return retval;
}
EXPORT_SYMBOL(begin_new_exec);

/*
 * would_dump - account for an executable the caller may not read
 * @bprm: exec state to update
 * @file: file that is about to be mapped or handed to an interpreter
 *
 * If @file cannot be read (MAY_READ is denied), set
 * BINPRM_FLAGS_ENFORCE_NONDUMP, which begin_new_exec() consults when it
 * chooses the dumpable setting, and move bprm->mm->user_ns up the
 * namespace hierarchy to the first namespace that is privileged over the
 * inode, stopping at init_user_ns.
 */
void would_dump(struct linux_binprm *bprm, struct file *file)
{
        struct inode *inode = file_inode(file);
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct user_namespace *orig_ns, *ns;

        /* Readable binaries need no special treatment. */
        if (inode_permission(idmap, inode, MAY_READ) >= 0)
                return;

        bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;

        /* Ensure mm->user_ns contains the executable */
        orig_ns = bprm->mm->user_ns;
        for (ns = orig_ns; ns != &init_user_ns; ns = ns->parent) {
                if (privileged_wrt_inode_uidgid(ns, idmap, inode))
                        break;
        }

        if (ns == orig_ns)
                return;

        /* Take the new reference before dropping the old one. */
        bprm->mm->user_ns = get_user_ns(ns);
        put_user_ns(orig_ns);
}
EXPORT_SYMBOL(would_dump);

/*
 * setup_new_exec - finish the personality-dependent part of exec setup
 * @bprm: exec state; only its stack rlimit is consulted here
 *
 * Picks the mmap layout, runs the architecture hook, records the task
 * size in the new mm and finally releases ->exec_update_lock and
 * ->cred_guard_mutex.
 */
void setup_new_exec(struct linux_binprm * bprm)
{
        struct task_struct *tsk = current;

        /* Things below may depend upon the personality. */
        arch_pick_mmap_layout(tsk->mm, &bprm->rlim_stack);

        arch_setup_new_exec();

        /*
         * The task size is set this late because it may depend on
         * TIF_32BIT, which some architectures (powerpc, for one) only
         * update in flush_thread().
         */
        tsk->mm->task_size = TASK_SIZE;

        up_write(&tsk->signal->exec_update_lock);
        mutex_unlock(&tsk->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);

/*
 * finalize_exec - runs immediately before start_thread() takes over
 * @bprm: exec state carrying the stack rlimit to publish
 *
 * Stores any stack rlimit change made during exec into the signal struct,
 * holding the group leader's task lock across the update.
 */
void finalize_exec(struct linux_binprm *bprm)
{
        struct task_struct *leader = current->group_leader;

        task_lock(leader);
        current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
        task_unlock(leader);
}
EXPORT_SYMBOL(finalize_exec);

/*
 * Lock ->cred_guard_mutex and prepare the credentials for the exec.
 *
 * On success the mutex stays locked.  begin_new_exec() commits the new
 * creds; the mutex is then dropped by setup_new_exec(), or by
 * begin_new_exec() itself if it fails after the commit.  If exec fails
 * before the commit, free_bprm() releases ->cred and unlocks.
 *
 * Returns 0 on success, -ERESTARTNOINTR if the wait for the mutex was
 * interrupted, or -ENOMEM if the creds could not be allocated (the mutex
 * is not held in either failure case).
 */
static int prepare_bprm_creds(struct linux_binprm *bprm)
{
        struct mutex *guard = &current->signal->cred_guard_mutex;

        if (mutex_lock_interruptible(guard))
                return -ERESTARTNOINTR;

        bprm->cred = prepare_exec_creds();
        if (unlikely(!bprm->cred)) {
                mutex_unlock(guard);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Matches do_open_execat(): re-allow write access to @file and drop the
 * reference.  A NULL @file is silently ignored.
 */
static void do_close_execat(struct file *file)
{
        if (file) {
                exe_file_allow_write_access(file);
                fput(file);
        }
}

/*
 * Release everything @bprm still owns and free the structure itself.
 *
 * Fields that were handed over during a successful exec have been set to
 * NULL by then (begin_new_exec() clears ->mm, ->cred and ->executable as
 * it consumes them), so each resource is only torn down here if it is
 * still attached.
 */
static void free_bprm(struct linux_binprm *bprm)
{
        /* The new mm was never installed: undo the accounting and drop it. */
        if (bprm->mm) {
                acct_arg_size(bprm, 0);
                mmput(bprm->mm);
        }
        free_arg_pages(bprm);
        /*
         * Creds still attached means they were never committed, so
         * ->cred_guard_mutex taken by prepare_bprm_creds() is still held.
         */
        if (bprm->cred) {
                /* in case exec fails before de_thread() succeeds */
                current->fs->in_exec = 0;
                mutex_unlock(&current->signal->cred_guard_mutex);
                abort_creds(bprm->cred);
        }
        do_close_execat(bprm->file);
        if (bprm->executable)
                fput(bprm->executable);
        /* If a binfmt changed the interp, free it. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        kfree(bprm->fdpath);
        kfree(bprm);
}

/*
 * Open the executable named by @fd/@filename and allocate a zeroed
 * linux_binprm for it.
 *
 * bprm->filename is @filename->name when @fd is AT_FDCWD or the name is
 * absolute; otherwise a "/dev/fd/..." path is built in bprm->fdpath and
 * used instead.  bprm->interp starts out equal to bprm->filename.
 *
 * Returns the new bprm, or an ERR_PTR() if the open, an allocation or
 * bprm_mm_init() fails.
 */
static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{
        struct linux_binprm *bprm;
        struct file *exec_file;
        int err;

        exec_file = do_open_execat(fd, filename, flags);
        if (IS_ERR(exec_file))
                return ERR_CAST(exec_file);

        bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
        if (!bprm) {
                do_close_execat(exec_file);
                return ERR_PTR(-ENOMEM);
        }

        /* From here on free_bprm() is responsible for closing the file. */
        bprm->file = exec_file;

        if (fd != AT_FDCWD && filename->name[0] != '/') {
                const char *rel = filename->name;

                if (rel[0] != '\0') {
                        bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s",
                                                  fd, rel);
                } else {
                        /*
                         * Empty name: the made-up path is of little use as
                         * a task name, so begin_new_exec() takes the comm
                         * from the dentry instead.
                         */
                        bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd);
                        bprm->comm_from_dentry = 1;
                }
                if (!bprm->fdpath) {
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 * A name derived from an O_CLOEXEC fd will be inaccessible
                 * after exec.  Recording that lets the exec code choose to
                 * fail when the executable is neither mmaped into the
                 * interpreter nor passed to it as an open descriptor,
                 * which is a better user experience than an interpreter
                 * that starts and immediately fails to find its input.
                 */
                if (get_close_on_exec(fd))
                        bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;

                bprm->filename = bprm->fdpath;
        } else {
                bprm->filename = filename->name;
        }
        bprm->interp = bprm->filename;

        /*
         * security_file_open() has already been called at this point (with
         * __FMODE_EXEC).  For AT_EXECVE_CHECK the access checks stop just
         * after security_bprm_creds_for_exec() in bprm_execve(): the kernel
         * must neither parse the file with exec_binprm() nor change the
         * calling thread, so these hooks are not called:
         * - security_bprm_check()
         * - security_bprm_creds_from_file()
         * - security_bprm_committing_creds()
         * - security_bprm_committed_creds()
         */
        bprm->is_check = !!(flags & AT_EXECVE_CHECK);

        err = bprm_mm_init(bprm);
        if (err)
                goto fail;

        return bprm;

fail:
        free_bprm(bprm);
        return ERR_PTR(err);
}

int bprm_change_interp(const char *interp, struct linux_binprm *bprm)
{
        /* If a binfmt changed the interp, free it first. */
        if (bprm->interp != bprm->filename)
                kfree(bprm->interp);
        bprm->interp = kstrdup(interp, GFP_KERNEL);
        if (!bprm->interp)
                return -ENOMEM;
        return 0;
}
EXPORT_SYMBOL(bprm_change_interp);

/*
 * Determine how safe it is to execute the proposed program and record the
 * findings as LSM_UNSAFE_* bits in bprm->unsafe.
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH or seccomp thread-sync
 */
static void check_unsafe_exec(struct linux_binprm *bprm)
{
        struct task_struct *me = current, *other;
        unsigned int fs_sharers = 1;    /* counts ourselves */

        if (me->ptrace)
                bprm->unsafe |= LSM_UNSAFE_PTRACE;

        /*
         * Not strictly necessary, but it makes it harder for LSMs to
         * mess up.
         */
        if (task_no_new_privs(current))
                bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

        /*
         * A suid exec is not safe while another task shares our fs: the
         * differently privileged task could manipulate the current
         * directory, etc.  It would be nice to force an unshare instead...
         *
         * If only our own threads share it, set fs->in_exec = 1 to deny
         * clone(CLONE_FS) from another sub-thread until de_thread()
         * succeeds; this state is protected by the cred_guard_mutex we
         * hold.
         */
        spin_lock(&me->fs->lock);
        rcu_read_lock();
        for_other_threads(me, other) {
                if (other->fs == me->fs)
                        fs_sharers++;
        }
        rcu_read_unlock();

        /* "users" and "in_exec" locked for copy_fs() */
        if (me->fs->users <= fs_sharers)
                me->fs->in_exec = 1;
        else
                bprm->unsafe |= LSM_UNSAFE_SHARE;
        spin_unlock(&me->fs->lock);
}

/*
 * Apply the setuid/setgid bits of @file to bprm->cred.
 *
 * Nothing is changed when the mount does not allow suid, the task has
 * no_new_privs set, the file has neither bit set, exec permission is
 * denied on the re-check, or the file's owner or group has no mapping in
 * the creds' user namespace.
 */
static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file)
{
        struct inode *inode = file_inode(file);
        struct mnt_idmap *idmap;
        unsigned int mode;
        vfsuid_t owner;
        vfsgid_t group;
        int denied;

        if (!mnt_may_suid(file->f_path.mnt) || task_no_new_privs(current))
                return;

        /* Cheap unlocked peek so the common case never takes the lock. */
        mode = READ_ONCE(inode->i_mode);
        if ((mode & (S_ISUID | S_ISGID)) == 0)
                return;

        idmap = file_mnt_idmap(file);

        /*
         * Be careful if suid/sgid is set: reload mode/uid/gid and check
         * them together now that the inode lock is held.
         */
        inode_lock(inode);
        mode = inode->i_mode;
        owner = i_uid_into_vfsuid(idmap, inode);
        group = i_gid_into_vfsgid(idmap, inode);
        denied = inode_permission(idmap, inode, MAY_EXEC);
        inode_unlock(inode);

        /* Did the exec bit vanish out from under us? Give up. */
        if (denied)
                return;

        /* We ignore suid/sgid if there are no mappings for them in the ns */
        if (!vfsuid_has_mapping(bprm->cred->user_ns, owner))
                return;
        if (!vfsgid_has_mapping(bprm->cred->user_ns, group))
                return;

        if (mode & S_ISUID) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->euid = vfsuid_into_kuid(owner);
        }

        /* The setgid bit only counts together with group-execute. */
        if ((mode & S_ISGID) && (mode & S_IXGRP)) {
                bprm->per_clear |= PER_CLEAR_ON_SETID;
                bprm->cred->egid = vfsgid_into_kgid(group);
        }
}

/*
 * Compute bprm->cred based upon the final binary.
 *
 * The creds come from bprm->executable when ->execfd_creds is set and
 * from bprm->file otherwise.  Returns the result of
 * security_bprm_creds_from_file().
 */
static int bprm_creds_from_file(struct linux_binprm *bprm)
{
        struct file *source;

        /* Compute creds based on which file? */
        if (bprm->execfd_creds)
                source = bprm->executable;
        else
                source = bprm->file;

        bprm_fill_uid(bprm, source);
        return security_bprm_creds_from_file(bprm, source);
}

/*
 * Load the start of bprm->file into bprm->buf: the buffer is cleared and
 * then up to BINPRM_BUF_SIZE bytes are read from offset 0.
 *
 * This may be called multiple times for binary chains (scripts for example).
 *
 * Returns whatever kernel_read() returns; the caller treats a negative
 * value as an error.
 */
static int prepare_binprm(struct linux_binprm *bprm)
{
        loff_t start = 0;

        /* Zero first so a short read leaves no stale bytes behind. */
        memset(bprm->buf, 0, BINPRM_BUF_SIZE);

        return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &start);
}

/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered, and decrement bprm->argc.
 *
 * Returns 0 on success (also when there are no arguments at all) or
 * -EFAULT if an argument page cannot be obtained.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
        unsigned long off;
        struct page *pg;
        char *map;

        if (bprm->argc == 0)
                return 0;

        /* The string may span pages; keep going while a page ends first. */
        do {
                off = bprm->p & ~PAGE_MASK;
                pg = get_arg_page(bprm, bprm->p, 0);
                if (!pg)
                        return -EFAULT;
                map = kmap_local_page(pg);

                while (off < PAGE_SIZE && map[off]) {
                        off++;
                        bprm->p++;
                }

                kunmap_local(map);
                put_arg_page(pg);
        } while (off == PAGE_SIZE);

        /* Step over the terminating NUL itself. */
        bprm->p++;
        bprm->argc--;

        return 0;
}
EXPORT_SYMBOL(remove_arg_zero);

/*
 * Cycle through the list of binary format handlers until one recognizes
 * the image.
 *
 * Returns the result of the first handler that does not answer -ENOEXEC,
 * or of whichever handler ran last once the point of no return has been
 * passed; -ENOEXEC if every handler declined; or an earlier error from
 * prepare_binprm() or security_bprm_check().
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
        struct linux_binfmt *handler;
        int ret;

        ret = prepare_binprm(bprm);
        if (ret < 0)
                return ret;

        ret = security_bprm_check(bprm);
        if (ret)
                return ret;

        read_lock(&binfmt_lock);
        list_for_each_entry(handler, &formats, lh) {
                /* Skip handlers whose module reference cannot be taken. */
                if (!try_module_get(handler->module))
                        continue;

                /* The lock is dropped while the handler runs. */
                read_unlock(&binfmt_lock);

                ret = handler->load_binary(bprm);

                read_lock(&binfmt_lock);
                put_binfmt(handler);

                /* Only "not my format" before the point of no return retries. */
                if (!bprm->point_of_no_return && ret == -ENOEXEC)
                        continue;

                read_unlock(&binfmt_lock);
                return ret;
        }
        read_unlock(&binfmt_lock);

        return -ENOEXEC;
}

/*
 * Run the binfmt handlers on @bprm, following interpreter rewrites, and
 * emit the exec notifications once an image has been loaded.
 *
 * binfmt handlers will call back into begin_new_exec() on success.
 *
 * Returns 0 on success, -ELOOP if the rewrite chain is too long, -ENOEXEC
 * if a second rewrite happens while an execfd is in use, or the error from
 * search_binary_handler().
 */
static int exec_binprm(struct linux_binprm *bprm)
{
        pid_t old_pid, old_vpid;
        int ret, depth;

        /* Need to fetch pid before load_binary changes it */
        old_pid = current->pid;
        /* old_vpid is our pid as numbered in the parent's active pid namespace. */
        rcu_read_lock();
        old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
        rcu_read_unlock();

        /*
         * Bound the chain of binfmt rewrites: search_binary_handler() is
         * run for depth 0..5, and a rewrite that would need a further
         * pass fails hard with -ELOOP.
         */
        for (depth = 0;; depth++) {
                struct file *exec;
                if (depth > 5)
                        return -ELOOP;

                ret = search_binary_handler(bprm);
                if (ret < 0)
                        return ret;
                /* No interpreter requested: the image itself was loaded. */
                if (!bprm->interpreter)
                        break;

                /* Restart the search with the interpreter as the file. */
                exec = bprm->file;
                bprm->file = bprm->interpreter;
                bprm->interpreter = NULL;

                exe_file_allow_write_access(exec);
                if (unlikely(bprm->have_execfd)) {
                        /* Only one file can be kept as ->executable. */
                        if (bprm->executable) {
                                fput(exec);
                                return -ENOEXEC;
                        }
                        bprm->executable = exec;
                } else
                        fput(exec);
        }

        audit_bprm(bprm);
        trace_sched_process_exec(current, old_pid, bprm);
        ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
        proc_exec_connector(current);
        return 0;
}

/*
 * Execute the program described by a fully populated @bprm.
 *
 * Takes ->cred_guard_mutex via prepare_bprm_creds(), checks for unsafe
 * execution states, lets the LSM set up the creds and then runs
 * exec_binprm().  For an AT_EXECVE_CHECK request (bprm->is_check) it stops
 * after security_bprm_creds_for_exec() and returns that hook's result.
 *
 * Returns 0 on success or a negative errno.  If the failure happened past
 * the point of no return the task is also sent a fatal SIGSEGV (unless a
 * fatal signal is already pending), so the error never reaches the old
 * userspace image.
 */
static int bprm_execve(struct linux_binprm *bprm)
{
        int retval;

        retval = prepare_bprm_creds(bprm);
        if (retval)
                return retval;

        /*
         * Check for unsafe execution states before exec_binprm(), which
         * will call back into begin_new_exec(), into bprm_creds_from_file(),
         * where setuid-ness is evaluated.
         */
        check_unsafe_exec(bprm);
        current->in_execve = 1;
        sched_mm_cid_before_execve(current);

        sched_exec();

        /* Set the unchanging part of bprm->cred */
        retval = security_bprm_creds_for_exec(bprm);
        /* is_check requests share the "out" path even when retval is 0. */
        if (retval || bprm->is_check)
                goto out;

        retval = exec_binprm(bprm);
        if (retval < 0)
                goto out;

        sched_mm_cid_after_execve(current);
        rseq_execve(current);
        /* execve succeeded */
        current->in_execve = 0;
        user_events_execve(current);
        acct_update_integrals(current);
        task_numa_free(current, false);
        return retval;

out:
        /*
         * If past the point of no return ensure the code never
         * returns to the userspace process.  Use an existing fatal
         * signal if present otherwise terminate the process with
         * SIGSEGV.
         */
        if (bprm->point_of_no_return && !fatal_signal_pending(current))
                force_fatal_sig(SIGSEGV);

        sched_mm_cid_after_execve(current);
        rseq_set_notify_resume(current);
        current->in_execve = 0;

        return retval;
}

/*
 * Common implementation behind the execve()/execveat() entry points.
 *
 * @filename may be an ERR_PTR(), in which case its error is returned
 * directly; otherwise it is always released with putname() before
 * returning.  The strings are copied onto the new stack in the order
 * filename, envp, argv.
 *
 * Returns the result of bprm_execve() or a negative errno from the setup
 * steps.
 */
static int do_execveat_common(int fd, struct filename *filename,
                              struct user_arg_ptr argv,
                              struct user_arg_ptr envp,
                              int flags)
{
        struct linux_binprm *bprm;
        int ret;

        if (IS_ERR(filename))
                return PTR_ERR(filename);

        /*
         * The actual failure for exceeding RLIMIT_NPROC was moved from
         * set*uid() to execve(), because too many poorly written programs
         * don't check the setuid() return code.  Recheck here whether the
         * limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
            is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                ret = -EAGAIN;
                goto out_name;
        }

        /*
         * We're below the limit (still or again), so further execve()
         * calls should not fail because of it.
         */
        current->flags &= ~PF_NPROC_EXCEEDED;

        bprm = alloc_bprm(fd, filename, flags);
        if (IS_ERR(bprm)) {
                ret = PTR_ERR(bprm);
                goto out_name;
        }

        ret = count(argv, MAX_ARG_STRINGS);
        if (ret < 0)
                goto out_bprm;
        bprm->argc = ret;

        ret = count(envp, MAX_ARG_STRINGS);
        if (ret < 0)
                goto out_bprm;
        bprm->envc = ret;

        ret = bprm_stack_limits(bprm);
        if (ret < 0)
                goto out_bprm;

        ret = copy_string_kernel(bprm->filename, bprm);
        if (ret < 0)
                goto out_bprm;
        bprm->exec = bprm->p;

        ret = copy_strings(bprm->envc, envp, bprm);
        if (ret < 0)
                goto out_bprm;

        ret = copy_strings(bprm->argc, argv, bprm);
        if (ret < 0)
                goto out_bprm;

        /*
         * With an empty argv, add an empty string ("") as argv[0] so that
         * confused userspace programs which start processing from argv[1]
         * won't end up walking envp. See also bprm_stack_limits().
         */
        if (!bprm->argc) {
                ret = copy_string_kernel("", bprm);
                if (ret < 0)
                        goto out_bprm;
                bprm->argc = 1;

                pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
                             current->comm, bprm->filename);
        }

        ret = bprm_execve(bprm);

out_bprm:
        free_bprm(bprm);

out_name:
        putname(filename);
        return ret;
}

/*
 * kernel_execve - exec a program using kernel-space strings
 * @kernel_filename: path of the program, resolved relative to AT_FDCWD
 * @argv: NULL-terminated argument vector; must not be empty
 * @envp: NULL-terminated environment vector
 *
 * Counterpart of do_execveat_common() for in-kernel callers: the same
 * steps in the same order, but counting and copying kernel strings.
 *
 * Returns the result of bprm_execve(), or a negative errno; -EINVAL is
 * returned (with a one-time warning) for kernel threads and for an empty
 * @argv.
 */
int kernel_execve(const char *kernel_filename,
                  const char *const *argv, const char *const *envp)
{
        struct filename *filename;
        struct linux_binprm *bprm;
        int fd = AT_FDCWD;
        int retval;

        /* It is non-sense for kernel threads to call execve */
        if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
                return -EINVAL;

        filename = getname_kernel(kernel_filename);
        if (IS_ERR(filename))
                return PTR_ERR(filename);

        bprm = alloc_bprm(fd, filename, 0);
        if (IS_ERR(bprm)) {
                retval = PTR_ERR(bprm);
                goto out_ret;
        }

        /* Unlike the userspace path, an empty argv is rejected outright. */
        retval = count_strings_kernel(argv);
        if (WARN_ON_ONCE(retval == 0))
                retval = -EINVAL;
        if (retval < 0)
                goto out_free;
        bprm->argc = retval;

        retval = count_strings_kernel(envp);
        if (retval < 0)
                goto out_free;
        bprm->envc = retval;

        retval = bprm_stack_limits(bprm);
        if (retval < 0)
                goto out_free;

        /* Strings go onto the new stack in the order filename, envp, argv. */
        retval = copy_string_kernel(bprm->filename, bprm);
        if (retval < 0)
                goto out_free;
        bprm->exec = bprm->p;

        retval = copy_strings_kernel(bprm->envc, envp, bprm);
        if (retval < 0)
                goto out_free;

        retval = copy_strings_kernel(bprm->argc, argv, bprm);
        if (retval < 0)
                goto out_free;

        retval = bprm_execve(bprm);
out_free:
        free_bprm(bprm);
out_ret:
        putname(filename);
        return retval;
}

/*
 * Native execve(): wrap the user pointers and run the common code with
 * AT_FDCWD and no flags.
 */
static int do_execve(struct filename *filename,
        const char __user *const __user *__argv,
        const char __user *const __user *__envp)
{
        struct user_arg_ptr args = { .ptr.native = __argv };
        struct user_arg_ptr env = { .ptr.native = __envp };

        return do_execveat_common(AT_FDCWD, filename, args, env, 0);
}

/*
 * Native execveat(): wrap the user pointers and pass @fd and @flags
 * through to the common code.
 */
static int do_execveat(int fd, struct filename *filename,
                const char __user *const __user *__argv,
                const char __user *const __user *__envp,
                int flags)
{
        struct user_arg_ptr args = { .ptr.native = __argv };
        struct user_arg_ptr env = { .ptr.native = __envp };

        return do_execveat_common(fd, filename, args, env, flags);
}

#ifdef CONFIG_COMPAT
/*
 * Compat execve(): like do_execve(), but the vectors hold compat user
 * pointers, which is flagged through ->is_compat.
 */
static int compat_do_execve(struct filename *filename,
        const compat_uptr_t __user *__argv,
        const compat_uptr_t __user *__envp)
{
        struct user_arg_ptr args = { .is_compat = true, .ptr.compat = __argv };
        struct user_arg_ptr env = { .is_compat = true, .ptr.compat = __envp };

        return do_execveat_common(AT_FDCWD, filename, args, env, 0);
}

/*
 * Compat execveat(): like do_execveat(), but the vectors hold compat user
 * pointers, which is flagged through ->is_compat.
 */
static int compat_do_execveat(int fd, struct filename *filename,
                              const compat_uptr_t __user *__argv,
                              const compat_uptr_t __user *__envp,
                              int flags)
{
        struct user_arg_ptr args = { .is_compat = true, .ptr.compat = __argv };
        struct user_arg_ptr env = { .is_compat = true, .ptr.compat = __envp };

        return do_execveat_common(fd, filename, args, env, flags);
}
#endif

/*
 * set_binfmt - record @new as the binary format of the current mm
 * @new: the format to install, or NULL for none
 *
 * The module reference held for the previous format is dropped and one is
 * taken for @new.
 */
void set_binfmt(struct linux_binfmt *new)
{
        struct mm_struct *mm = current->mm;
        struct linux_binfmt *old = mm->binfmt;

        if (old)
                module_put(old->module);

        mm->binfmt = new;

        if (new)
                __module_get(new->module);
}
EXPORT_SYMBOL(set_binfmt);

/*
 * set_dumpable stores the three-value SUID_DUMP_* setting @value into the
 * MMF_DUMPABLE_MASK bits of mm->flags.
 *
 * A @value above SUID_DUMP_ROOT (negative values included, because of the
 * unsigned comparison) triggers a warning and leaves the flags untouched.
 */
void set_dumpable(struct mm_struct *mm, int value)
{
        if (!WARN_ON((unsigned)value > SUID_DUMP_ROOT))
                set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value);
}

/*
 * execve() system call.  A failed getname() yields an ERR_PTR(), which is
 * passed on and turned into the return value by do_execveat_common().
 */
SYSCALL_DEFINE3(execve,
                const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp)
{
        struct filename *name = getname(filename);

        return do_execve(name, argv, envp);
}

/*
 * execveat() system call.  @flags is used both for the name lookup and
 * for the exec itself.  A failed lookup yields an ERR_PTR(), which is
 * passed on and turned into the return value by do_execveat_common().
 */
SYSCALL_DEFINE5(execveat,
                int, fd, const char __user *, filename,
                const char __user *const __user *, argv,
                const char __user *const __user *, envp,
                int, flags)
{
        struct filename *name = getname_uflags(filename, flags);

        return do_execveat(fd, name, argv, envp, flags);
}

#ifdef CONFIG_COMPAT
/*
 * Compat execve() system call; error handling for the name lookup is the
 * same as in the native entry point.
 */
COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename,
        const compat_uptr_t __user *, argv,
        const compat_uptr_t __user *, envp)
{
        struct filename *name = getname(filename);

        return compat_do_execve(name, argv, envp);
}

/*
 * Compat execveat() system call; @flags is used both for the name lookup
 * and for the exec itself.
 */
COMPAT_SYSCALL_DEFINE5(execveat, int, fd,
                       const char __user *, filename,
                       const compat_uptr_t __user *, argv,
                       const compat_uptr_t __user *, envp,
                       int,  flags)
{
        struct filename *name = getname_uflags(filename, flags);

        return compat_do_execveat(fd, name, argv, envp, flags);
}
#endif

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler: run the generic proc_dointvec_minmax() and, only when it
 * reports success, call validate_coredump_safety(). The error code of
 * proc_dointvec_minmax() is returned unchanged.
 */
static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int ret;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        validate_coredump_safety();
        return 0;
}

/*
 * sysctl entries owned by this file; registered under "fs" by
 * init_fs_exec_sysctls().
 *
 * "suid_dumpable" is an int backed by the suid_dumpable variable, mode 0644.
 * Accesses go through proc_dointvec_minmax_coredump(); extra1/extra2 are
 * presumably the lower/upper bounds (0..2) enforced by the minmax handler.
 */
static const struct ctl_table fs_exec_sysctls[] = {
        {
                .procname        = "suid_dumpable",
                .data                = &suid_dumpable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax_coredump,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

/*
 * Register fs_exec_sysctls under the "fs" sysctl directory. Always returns
 * 0; hooked up as an fs_initcall below.
 */
static int __init init_fs_exec_sysctls(void)
{
        register_sysctl_init("fs", fs_exec_sysctls);
        return 0;
}

fs_initcall(init_fs_exec_sysctls);
#endif /* CONFIG_SYSCTL */

#ifdef CONFIG_EXEC_KUNIT_TEST
#include "tests/exec_kunit.c"
#endif




























  220 























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM pagemap

#if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PAGEMAP_H

#include <linux/tracepoint.h>
#include <linux/mm.h>

/*
 * Bits of the 'flags' field recorded by the mm_lru_insertion tracepoint.
 * They are composed by trace_pagemap_flags() and decoded into single
 * characters by that event's TP_printk().
 */
#define        PAGEMAP_MAPPED                0x0001u
#define PAGEMAP_ANONYMOUS        0x0002u
#define PAGEMAP_FILE                0x0004u
#define PAGEMAP_SWAPCACHE        0x0008u
#define PAGEMAP_SWAPBACKED        0x0010u
#define PAGEMAP_MAPPEDDISK        0x0020u
#define PAGEMAP_BUFFERS                0x0040u

/*
 * Fold the state of @folio into a PAGEMAP_* bitmask, listed in bit order.
 * Exactly one of PAGEMAP_ANONYMOUS / PAGEMAP_FILE is always set; every
 * other bit is set only when the matching folio test is true.
 */
#define trace_pagemap_flags(folio) ( \
        (folio_mapped(folio)                ? PAGEMAP_MAPPED     : 0) | \
        (folio_test_anon(folio)                ? PAGEMAP_ANONYMOUS  : PAGEMAP_FILE) | \
        (folio_test_swapcache(folio)        ? PAGEMAP_SWAPCACHE  : 0) | \
        (folio_test_swapbacked(folio)        ? PAGEMAP_SWAPBACKED : 0) | \
        (folio_test_mappedtodisk(folio)        ? PAGEMAP_MAPPEDDISK : 0) | \
        (folio_test_private(folio)        ? PAGEMAP_BUFFERS    : 0) \
        )

/*
 * mm_lru_insertion tracepoint.
 *
 * Records, for the given folio: the folio pointer, its pfn (folio_pfn()),
 * the LRU list index (folio_lru_list()) and a PAGEMAP_* bitmask built by
 * trace_pagemap_flags(). When printed, each flag bit becomes one character
 * ("M", "a"/"f", "s", "b", "d", "B"), with a blank for an unset bit.
 */
TRACE_EVENT(mm_lru_insertion,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(struct folio *,        folio        )
                __field(unsigned long,        pfn        )
                __field(enum lru_list,        lru        )
                __field(unsigned long,        flags        )
        ),

        TP_fast_assign(
                __entry->folio        = folio;
                __entry->pfn        = folio_pfn(folio);
                __entry->lru        = folio_lru_list(folio);
                __entry->flags        = trace_pagemap_flags(folio);
        ),

        /* Flag format is based on page-types.c formatting for pagemap */
        TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s",
                        __entry->folio,
                        __entry->pfn,
                        __entry->lru,
                        __entry->flags & PAGEMAP_MAPPED                ? "M" : " ",
                        __entry->flags & PAGEMAP_ANONYMOUS        ? "a" : "f",
                        __entry->flags & PAGEMAP_SWAPCACHE        ? "s" : " ",
                        __entry->flags & PAGEMAP_SWAPBACKED        ? "b" : " ",
                        __entry->flags & PAGEMAP_MAPPEDDISK        ? "d" : " ",
                        __entry->flags & PAGEMAP_BUFFERS        ? "B" : " ")
);

/*
 * mm_lru_activate tracepoint.
 *
 * Records only the folio pointer and its pfn (folio_pfn()) and prints them
 * as "folio=%p pfn=0x%lx".
 */
TRACE_EVENT(mm_lru_activate,

        TP_PROTO(struct folio *folio),

        TP_ARGS(folio),

        TP_STRUCT__entry(
                __field(struct folio *,        folio        )
                __field(unsigned long,        pfn        )
        ),

        TP_fast_assign(
                __entry->folio        = folio;
                __entry->pfn        = folio_pfn(folio);
        ),

        TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn)
);

#endif /* _TRACE_PAGEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



































































   34 















   34 





   34 
   34 






























































   34 
   34 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _LINUX_RCUREF_H
#define _LINUX_RCUREF_H

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/limits.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
#include <linux/rcupdate.h>

/*
 * Raw counter values used by the rcuref helpers. The stored counter is
 * biased by one (rcuref_init() stores cnt - 1, rcuref_read() returns c + 1),
 * so RCUREF_ONEREF (0) represents a single held reference. Stored values at
 * or above RCUREF_RELEASED are reported as "0 references" by rcuref_read().
 * The exact meaning of the saturated/released/dead ranges is presumably
 * described in lib/rcuref.c, which the rcuref_get() comment refers to.
 */
#define RCUREF_ONEREF                0x00000000U
#define RCUREF_MAXREF                0x7FFFFFFFU
#define RCUREF_SATURATED        0xA0000000U
#define RCUREF_RELEASED                0xC0000000U
#define RCUREF_DEAD                0xE0000000U
#define RCUREF_NOREF                0xFFFFFFFFU

/**
 * rcuref_init - Set up a rcuref with an initial number of references
 * @ref:        Pointer to the reference count
 * @cnt:        Number of references to start with, typically '1'
 *
 * The counter is stored biased by one, i.e. a stored value of 0 stands for
 * one held reference.
 */
static inline void rcuref_init(rcuref_t *ref, unsigned int cnt)
{
        unsigned int biased = cnt - 1;

        atomic_set(&ref->refcnt, biased);
}

/**
 * rcuref_read - Report how many references a rcuref currently holds
 * @ref:        Pointer to the reference count
 *
 * Return: The number of held references (0 ... N)
 */
static inline unsigned int rcuref_read(rcuref_t *ref)
{
        unsigned int raw = atomic_read(&ref->refcnt);

        /* Anything at or beyond RCUREF_RELEASED counts as no reference. */
        if (raw >= RCUREF_RELEASED)
                return 0;

        /* Undo the bias of one applied when the counter is stored. */
        return raw + 1;
}

/* Out-of-line handler for rcuref_get() when the fast path does not apply. */
extern __must_check bool rcuref_get_slowpath(rcuref_t *ref);

/**
 * rcuref_get - Acquire one reference on a rcuref reference count
 * @ref:        Pointer to the reference count
 *
 * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF.
 *
 * Provides no memory ordering, it is assumed the caller has guaranteed the
 * object memory to be stable (RCU, etc.). It does provide a control dependency
 * and thereby orders future stores. See documentation in lib/rcuref.c
 *
 * Return:
 *        False if the attempt to acquire a reference failed. This happens
 *        when the last reference has been put already
 *
 *        True if a reference was successfully acquired
 */
static inline __must_check bool rcuref_get(rcuref_t *ref)
{
        /*
         * Unconditionally increase the reference count. The saturation and
         * dead zones provide enough tolerance for this.
         *
         * Fast path: the incremented counter is not negative when viewed as
         * a signed value, so the reference was acquired.
         */
        if (likely(!atomic_add_negative_relaxed(1, &ref->refcnt)))
                return true;

        /* Handle the cases inside the saturation and dead zones */
        return rcuref_get_slowpath(ref);
}

/*
 * Out-of-line handler for __rcuref_put(); receives the counter value that
 * resulted from the decrement.
 */
extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt);

/*
 * Internal helper. Do not invoke directly.
 *
 * Drops one reference with release ordering. Returns false on the fast
 * path (counter still non-negative after the decrement); otherwise returns
 * whatever rcuref_put_slowpath() decides.
 */
static __always_inline __must_check bool __rcuref_put(rcuref_t *ref)
{
        /* Signed on purpose: values in the upper half compare as negative. */
        int cnt;

        /* Complain when called preemptible and outside rcu_read_lock(). */
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(),
                         "suspicious rcuref_put_rcusafe() usage");
        /*
         * Unconditionally decrease the reference count. The saturation and
         * dead zones provide enough tolerance for this.
         */
        cnt = atomic_sub_return_release(1, &ref->refcnt);
        if (likely(cnt >= 0))
                return false;

        /*
         * Handle the last reference drop and cases inside the saturation
         * and dead zones.
         */
        return rcuref_put_slowpath(ref, cnt);
}

/**
 * rcuref_put_rcusafe -- Drop one reference of a rcuref from an RCU safe context
 * @ref:        Pointer to the reference count
 *
 * Release memory ordering is provided, so prior loads and stores are
 * completed before the decrement. On success acquire ordering is provided
 * so that free() must come after.
 *
 * Only to be used in contexts which guarantee that no grace period can
 * elapse that would free the object concurrently when the decrement drops
 * the last reference and the slowpath races against a concurrent get() and
 * put() pair. rcu_read_lock()'ed and atomic contexts qualify.
 *
 * Return:
 *        True if this was the last reference and no future references are
 *        possible. The caller may then safely release the object protected
 *        by the reference counter.
 *
 *        False if active references remain or the put() raced with a
 *        concurrent get()/put() pair. The caller must not release the
 *        protected object.
 */
static inline __must_check bool rcuref_put_rcusafe(rcuref_t *ref)
{
        bool last_ref = __rcuref_put(ref);

        return last_ref;
}

/**
 * rcuref_put -- Drop one reference of a rcuref
 * @ref:        Pointer to the reference count
 *
 * Usable from any context: preemption is disabled around the decrement.
 *
 * Release memory ordering is provided, so prior loads and stores are
 * completed before the decrement. On success acquire ordering is provided
 * so that free() must come after.
 *
 * Return:
 *
 *        True if this was the last reference and no future references are
 *        possible. The caller may then safely schedule the object protected
 *        by the reference counter for deconstruction.
 *
 *        False if active references remain or the put() raced with a
 *        concurrent get()/put() pair. The caller must not deconstruct the
 *        protected object.
 */
static inline __must_check bool rcuref_put(rcuref_t *ref)
{
        bool last_ref;

        preempt_disable();
        last_ref = __rcuref_put(ref);
        preempt_enable();

        return last_ref;
}

#endif



































































































































































































































































































































































































































































































































































    3 


    3 





















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * xfrm_device.c - IPsec device offloading code.
 *
 * Copyright (c) 2015 secunet Security Networks AG
 *
 * Author:
 * Steffen Klassert <steffen.klassert@secunet.com>
 */

#include <linux/errno.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <net/dst.h>
#include <net/gso.h>
#include <net/xfrm.h>
#include <linux/notifier.h>

#ifdef CONFIG_XFRM_OFFLOAD
/*
 * Transport mode: refresh mac_len, move the transport header back by the
 * state's header length for GSO segments, then pull everything up to and
 * including that header. @hsize is not used in this mode.
 */
static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb,
                                  unsigned int hsize)
{
        const int esp_hdr_len = x->props.header_len;
        struct xfrm_offload *offload = xfrm_offload(skb);

        skb_reset_mac_len(skb);

        if (offload->flags & XFRM_GSO_SEGMENT)
                skb->transport_header -= esp_hdr_len;

        pskb_pull(skb, skb_transport_offset(skb) + esp_hdr_len);
}

/*
 * Tunnel mode: for GSO segments place the transport header @hsize bytes
 * after the network header, refresh mac_len and pull the MAC header plus
 * the state's header length minus its enc_hdr_len.
 */
static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
                                    unsigned int hsize)
{
        struct xfrm_offload *offload = xfrm_offload(skb);
        int pull_len;

        if (offload->flags & XFRM_GSO_SEGMENT)
                skb->transport_header = skb->network_header + hsize;

        skb_reset_mac_len(skb);

        /* Must be computed after mac_len has been refreshed. */
        pull_len = skb->mac_len + x->props.header_len - x->props.enc_hdr_len;
        pskb_pull(skb, pull_len);
}

/*
 * BEET mode: same transport header fixup as tunnel mode for GSO segments.
 * The pull length is the MAC header, @hsize and the state's header length
 * reduced by 'phlen'. phlen is 0 for an IPv6 selector; otherwise it is
 * IPV4_BEET_PHMAXLEN, grown by the IPv6/IPv4 header size difference when
 * the outer family is IPv6.
 */
static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
                                  unsigned int hsize)
{
        struct xfrm_offload *offload = xfrm_offload(skb);
        int phlen;

        if (offload->flags & XFRM_GSO_SEGMENT)
                skb->transport_header = skb->network_header + hsize;

        skb_reset_mac_len(skb);

        if (x->sel.family == AF_INET6)
                phlen = 0;
        else if (x->outer_mode.family == AF_INET6)
                phlen = IPV4_BEET_PHMAXLEN +
                        sizeof(struct ipv6hdr) - sizeof(struct iphdr);
        else
                phlen = IPV4_BEET_PHMAXLEN;

        pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen));
}

/*
 * Adjust pointers into the packet when IPsec is done at layer2.
 *
 * The outer family selects the IP header size handed to the per-mode
 * helper; families other than AF_INET/AF_INET6 leave the skb untouched, as
 * do the route-optimization and in-trigger modes.
 */
static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
{
        unsigned int hsize;

        switch (x->outer_mode.family) {
        case AF_INET:
                hsize = sizeof(struct iphdr);
                break;
        case AF_INET6:
                hsize = sizeof(struct ipv6hdr);
                break;
        default:
                return;
        }

        switch (x->outer_mode.encap) {
        case XFRM_MODE_IPTFS:
        case XFRM_MODE_TUNNEL:
                __xfrm_mode_tunnel_prep(x, skb, hsize);
                break;
        case XFRM_MODE_TRANSPORT:
                __xfrm_transport_prep(x, skb, hsize);
                break;
        case XFRM_MODE_BEET:
                __xfrm_mode_beet_prep(x, skb, hsize);
                break;
        case XFRM_MODE_ROUTEOPTIMIZATION:
        case XFRM_MODE_IN_TRIGGER:
                break;
        }
}

/*
 * Return true when adding the skb's gso_segs to the low 32 bits of the
 * offload sequence number wraps around.
 */
static inline bool xmit_xfrm_check_overflow(struct sk_buff *skb)
{
        struct xfrm_offload *offload = xfrm_offload(skb);
        __u32 first = offload->seq.low;
        __u32 last = first;

        last += skb_shinfo(skb)->gso_segs;

        /* A sum smaller than its starting point means the u32 wrapped. */
        return unlikely(last < first);
}

/*
 * validate_xmit_xfrm - run the xfrm offload xmit step on an outgoing skb.
 * @skb:      packet (or, after segmentation, list of packets) to process
 * @features: device features offered for this transmit
 * @again:    set to true when the per-CPU xfrm backlog is not empty and the
 *            skb is returned unprocessed
 *
 * Returns the skb (or remaining skb list) to transmit, or NULL when the
 * packet was dropped or is being completed asynchronously (-EINPROGRESS
 * from the type_offload xmit callback).
 */
struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again)
{
        int err;
        unsigned long flags;
        struct xfrm_state *x;
        struct softnet_data *sd;
        struct sk_buff *skb2, *nskb, *pskb = NULL;
        netdev_features_t esp_features = features;
        struct xfrm_offload *xo = xfrm_offload(skb);
        struct net_device *dev = skb->dev;
        struct sec_path *sp;

        /* Nothing to do without offload state or when already handled. */
        if (!xo || (xo->flags & XFRM_XMIT))
                return skb;

        /* No ESP hardware support: don't advertise SG/checksum to ESP. */
        if (!(features & NETIF_F_HW_ESP))
                esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);

        /* The last state in the sec_path is the one applied here. */
        sp = skb_sec_path(skb);
        x = sp->xvec[sp->len - 1];
        if (xo->flags & XFRM_GRO || x->xso.dir == XFRM_DEV_OFFLOAD_IN)
                return skb;

        /* The packet was sent to HW IPsec packet offload engine,
         * but to wrong device. Drop the packet, so it won't skip
         * XFRM stack.
         */
        if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->xso.dev != dev) {
                kfree_skb(skb);
                dev_core_stats_tx_dropped_inc(dev);
                return NULL;
        }

        /* This skb was already validated on the upper/virtual dev */
        if ((x->xso.dev != dev) && (x->xso.real_dev == dev))
                return skb;

        /* Check this CPU's xfrm backlog with interrupts disabled. */
        local_irq_save(flags);
        sd = this_cpu_ptr(&softnet_data);
        err = !skb_queue_empty(&sd->xfrm_backlog);
        local_irq_restore(flags);

        if (err) {
                *again = true;
                return skb;
        }

        if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) ||
                                unlikely(xmit_xfrm_check_overflow(skb)))) {
                struct sk_buff *segs;

                /* Packet got rerouted, fixup features and segment it. */
                esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP);

                segs = skb_gso_segment(skb, esp_features);
                if (IS_ERR(segs)) {
                        kfree_skb(skb);
                        dev_core_stats_tx_dropped_inc(dev);
                        return NULL;
                } else {
                        consume_skb(skb);
                        skb = segs;
                }
        }

        /* Single skb: no list handling needed. */
        if (!skb->next) {
                esp_features |= skb->dev->gso_partial_features;
                xfrm_outer_mode_prep(x, skb);

                xo->flags |= XFRM_DEV_RESUME;

                err = x->type_offload->xmit(x, skb, esp_features);
                if (err) {
                        /* Asynchronous completion: the skb is not ours anymore. */
                        if (err == -EINPROGRESS)
                                return NULL;

                        XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
                        kfree_skb(skb);
                        return NULL;
                }

                /* Restore skb->data to the MAC header. */
                skb_push(skb, skb->data - skb_mac_header(skb));

                return skb;
        }

        /*
         * Segment list: process each segment detached from the list, then
         * relink it. 'pskb' tracks the last segment kept on the list so
         * that segments completing asynchronously can be unlinked.
         */
        skb_list_walk_safe(skb, skb2, nskb) {
                esp_features |= skb->dev->gso_partial_features;
                skb_mark_not_on_list(skb2);

                xo = xfrm_offload(skb2);
                xo->flags |= XFRM_DEV_RESUME;

                xfrm_outer_mode_prep(x, skb2);

                err = x->type_offload->xmit(x, skb2, esp_features);
                if (!err) {
                        skb2->next = nskb;
                } else if (err != -EINPROGRESS) {
                        /*
                         * Hard error: free this segment and everything
                         * after it.
                         * NOTE(review): segments already kept on the list
                         * (skb .. pskb) are not freed here - confirm
                         * whether that is intended.
                         */
                        XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
                        skb2->next = nskb;
                        kfree_skb_list(skb2);
                        return NULL;
                } else {
                        /* -EINPROGRESS: drop skb2 from the returned list. */
                        if (skb == skb2)
                                skb = nskb;
                        else
                                pskb->next = nskb;

                        continue;
                }

                skb_push(skb2, skb2->data - skb_mac_header(skb2));
                pskb = skb2;
        }

        return skb;
}
EXPORT_SYMBOL_GPL(validate_xmit_xfrm);

int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
                       struct xfrm_user_offload *xuo,
                       struct netlink_ext_ack *extack)
{
        int err;
        struct dst_entry *dst;
        struct net_device *dev;
        struct xfrm_dev_offload *xso = &x->xso;
        xfrm_address_t *saddr;
        xfrm_address_t *daddr;
        bool is_packet_offload;

        if (xuo->flags &
            ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) {
                NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
                return -EINVAL;
        }

        if ((xuo->flags & XFRM_OFFLOAD_INBOUND && x->dir == XFRM_SA_DIR_OUT) ||
            (!(xuo->flags & XFRM_OFFLOAD_INBOUND) && x->dir == XFRM_SA_DIR_IN)) {
                NL_SET_ERR_MSG(extack, "Mismatched SA and offload direction");
                return -EINVAL;
        }

        is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET;

        /* We don't yet support TFC padding. */
        if (x->tfcpad) {
                NL_SET_ERR_MSG(extack, "TFC padding can't be offloaded");
                return -EINVAL;
        }

        dev = dev_get_by_index(net, xuo->ifindex);
        if (!dev) {
                struct xfrm_dst_lookup_params params;

                if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) {
                        saddr = &x->props.saddr;
                        daddr = &x->id.daddr;
                } else {
                        saddr = &x->id.daddr;
                        daddr = &x->props.saddr;
                }

                memset(&params, 0, sizeof(params));
                params.net = net;
                params.saddr = saddr;
                params.daddr = daddr;
                params.mark = xfrm_smark_get(0, x);
                dst = __xfrm_dst_lookup(x->props.family, &params);
                if (IS_ERR(dst))
                        return (is_packet_offload) ? -EINVAL : 0;

                dev = dst->dev;

                dev_hold(dev);
                dst_release(dst);
        }

        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
                xso->dev = NULL;
                dev_put(dev);
                return (is_packet_offload) ? -EINVAL : 0;
        }

        if (!is_packet_offload && x->props.flags & XFRM_STATE_ESN &&
            !dev->xfrmdev_ops->xdo_dev_state_advance_esn) {
                NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN");
                xso->dev = NULL;
                dev_put(dev);
                return -EINVAL;
        }

        xfrm_set_type_offload(x);
        if (!x->type_offload) {
                NL_SET_ERR_MSG(extack, "Type doesn't support offload");
                dev_put(dev);
                return -EINVAL;
        }

        xso->dev = dev;
        netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC);
        xso->real_dev = dev;

        if (xuo->flags & XFRM_OFFLOAD_INBOUND)
                xso->dir = XFRM_DEV_OFFLOAD_IN;
        else
                xso->dir = XFRM_DEV_OFFLOAD_OUT;

        if (is_packet_offload)
                xso->type = XFRM_DEV_OFFLOAD_PACKET;
        else
                xso->type = XFRM_DEV_OFFLOAD_CRYPTO;

        err = dev->xfrmdev_ops->xdo_dev_state_add(x, extack);
        if (err) {
                xso->dev = NULL;
                xso->dir = 0;
                xso->real_dev = NULL;
                netdev_put(dev, &xso->dev_tracker);
                xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;

                xfrm_unset_type_offload(x);
                /* User explicitly requested packet offload mode and configured
                 * policy in addition to the XFRM state. So be civil to users,
                 * and return an error instead of taking fallback path.
                 */
                if ((err != -EOPNOTSUPP && !is_packet_offload) || is_packet_offload) {
                        NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this state");
                        return err;
                }
        }

        return 0;
}
EXPORT_SYMBOL_GPL(xfrm_dev_state_add);

/*
 * xfrm_dev_policy_add - bind an xfrm policy to a device for packet offload.
 * @net:    namespace used to look up the device by index
 * @xp:     policy to offload; xp->xdo is filled in on success
 * @xuo:    user offload request (flags, ifindex)
 * @dir:    policy direction: XFRM_POLICY_IN, XFRM_POLICY_OUT or XFRM_POLICY_FWD
 * @extack: netlink extended ack used for error messages
 *
 * Only packet offload is supported, so xuo->flags must be exactly
 * XFRM_OFFLOAD_PACKET. Every failure after the device has been recorded in
 * xp->xdo goes through one cleanup path, which resets dev, real_dev, type
 * and dir and drops the tracked device reference, so no stale device
 * pointer is left behind.
 *
 * Return: 0 on success, -EINVAL for bad flags, unknown device, missing
 * driver support or unknown direction, otherwise the error returned by the
 * driver's xdo_dev_policy_add().
 */
int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp,
                        struct xfrm_user_offload *xuo, u8 dir,
                        struct netlink_ext_ack *extack)
{
        struct xfrm_dev_offload *xdo = &xp->xdo;
        struct net_device *dev;
        int err;

        if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) {
                /* We support only packet offload mode and it means
                 * that user must set XFRM_OFFLOAD_PACKET bit.
                 */
                NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
                return -EINVAL;
        }

        dev = dev_get_by_index(net, xuo->ifindex);
        if (!dev)
                return -EINVAL;

        if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) {
                xdo->dev = NULL;
                dev_put(dev);
                NL_SET_ERR_MSG(extack, "Policy offload is not supported");
                return -EINVAL;
        }

        xdo->dev = dev;
        netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC);
        xdo->real_dev = dev;
        xdo->type = XFRM_DEV_OFFLOAD_PACKET;
        switch (dir) {
        case XFRM_POLICY_IN:
                xdo->dir = XFRM_DEV_OFFLOAD_IN;
                break;
        case XFRM_POLICY_OUT:
                xdo->dir = XFRM_DEV_OFFLOAD_OUT;
                break;
        case XFRM_POLICY_FWD:
                xdo->dir = XFRM_DEV_OFFLOAD_FWD;
                break;
        default:
                NL_SET_ERR_MSG(extack, "Unrecognized offload direction");
                err = -EINVAL;
                goto err_unbind;
        }

        err = dev->xfrmdev_ops->xdo_dev_policy_add(xp, extack);
        if (err) {
                NL_SET_ERR_MSG_WEAK(extack, "Device failed to offload this policy");
                goto err_unbind;
        }

        return 0;

err_unbind:
        /* Leave xp->xdo fully reset before dropping the device reference. */
        xdo->dev = NULL;
        xdo->real_dev = NULL;
        xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
        xdo->dir = 0;
        netdev_put(dev, &xdo->dev_tracker);
        return err;
}
EXPORT_SYMBOL_GPL(xfrm_dev_policy_add);

/*
 * xfrm_dev_offload_ok - decide whether @skb may use the device offload of @x.
 *
 * Offload is refused when the state has no offload type, when the packet is
 * not routed through the offload device or further xfrm transformations
 * follow, when the packet does not fit the state MTU (unless it is GSO and
 * its segments fit), when the IP header carries IPv4 options or IPv6
 * extension headers, or when the tunnel size check fails for packet offload
 * in tunnel mode. If the driver provides xdo_dev_offload_ok(), it has the
 * final say.
 */
bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
{
        struct dst_entry *dst = skb_dst(skb);
        struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
        struct net_device *dev = x->xso.dev;
        bool check_tunnel_size;
        int mtu;

        if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED)
                return false;

        if (dev != xfrm_dst_path(dst)->dev || xdst->child->xfrm)
                return false;

        mtu = xfrm_state_mtu(x, xdst->child_mtu_cached);
        if (skb->len > mtu &&
            !(skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)))
                return false;

        check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET &&
                            x->props.mode == XFRM_MODE_TUNNEL;

        if (x->props.family == AF_INET) {
                /* Check for IPv4 options */
                if (ip_hdr(skb)->ihl != 5)
                        return false;
                if (check_tunnel_size && xfrm4_tunnel_check_size(skb))
                        return false;
        } else if (x->props.family == AF_INET6) {
                /* Check for IPv6 extensions */
                if (ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr))
                        return false;
                if (check_tunnel_size && xfrm6_tunnel_check_size(skb))
                        return false;
        }

        if (!dev->xfrmdev_ops->xdo_dev_offload_ok)
                return true;

        return dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x);
}
EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok);

/*
 * xfrm_dev_resume - (re)try to transmit an skb on its device.
 *
 * Picks a tx queue and, unless that queue is frozen or stopped, hands the
 * skb to dev_hard_start_xmit() under the tx lock. If the transmit did not
 * complete (ret starts out as NETDEV_TX_BUSY, so this presumably includes
 * the case where no attempt was made), the skb is appended to this CPU's
 * xfrm_backlog and NET_TX_SOFTIRQ is raised.
 */
void xfrm_dev_resume(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        int ret = NETDEV_TX_BUSY;
        struct netdev_queue *txq;
        struct softnet_data *sd;
        unsigned long flags;

        rcu_read_lock();
        txq = netdev_core_pick_tx(dev, skb, NULL);

        HARD_TX_LOCK(dev, txq, smp_processor_id());
        if (!netif_xmit_frozen_or_stopped(txq))
                skb = dev_hard_start_xmit(skb, dev, txq, &ret);
        HARD_TX_UNLOCK(dev, txq);

        if (!dev_xmit_complete(ret)) {
                /* Per-CPU data: queue and raise the softirq with irqs off. */
                local_irq_save(flags);
                sd = this_cpu_ptr(&softnet_data);
                skb_queue_tail(&sd->xfrm_backlog, skb);
                raise_softirq_irqoff(NET_TX_SOFTIRQ);
                local_irq_restore(flags);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(xfrm_dev_resume);

void xfrm_dev_backlog(struct softnet_data *sd)
{
        struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog;
        struct sk_buff_head list;
        struct sk_buff *skb;

        if (skb_queue_empty(xfrm_backlog))
                return;

        __skb_queue_head_init(&list);

        spin_lock(&xfrm_backlog->lock);
        skb_queue_splice_init(xfrm_backlog, &list);
        spin_unlock(&xfrm_backlog->lock);

        while (!skb_queue_empty(&list)) {
                skb = __skb_dequeue(&list);
                xfrm_dev_resume(skb);
        }

}
#endif

/*
 * Sanity check a device's ESP offload feature bits.
 *
 * With CONFIG_XFRM_OFFLOAD: ESP tx checksum offload requires ESP offload,
 * and ESP offload requires xfrmdev_ops with both the state add and state
 * delete callbacks. Without it, neither feature bit may be set.
 *
 * Return: NOTIFY_BAD for an inconsistent device, NOTIFY_DONE otherwise.
 */
static int xfrm_api_check(struct net_device *dev)
{
#ifdef CONFIG_XFRM_OFFLOAD
        if (!(dev->features & NETIF_F_HW_ESP)) {
                if (dev->features & NETIF_F_HW_ESP_TX_CSUM)
                        return NOTIFY_BAD;

                return NOTIFY_DONE;
        }

        if (!dev->xfrmdev_ops ||
            !dev->xfrmdev_ops->xdo_dev_state_add ||
            !dev->xfrmdev_ops->xdo_dev_state_delete)
                return NOTIFY_BAD;
#else
        if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM))
                return NOTIFY_BAD;
#endif

        return NOTIFY_DONE;
}

/*
 * Flush the offloaded states and policies of a device that advertises
 * NETIF_F_HW_ESP. Always returns NOTIFY_DONE.
 */
static int xfrm_dev_down(struct net_device *dev)
{
        if (!(dev->features & NETIF_F_HW_ESP))
                return NOTIFY_DONE;

        xfrm_dev_state_flush(dev_net(dev), dev, true);
        xfrm_dev_policy_flush(dev_net(dev), dev, true);

        return NOTIFY_DONE;
}

/*
 * Netdevice notifier callback: validate the offload features when a device
 * registers or changes features, flush its offloads when it goes down or
 * unregisters, and ignore every other event.
 */
static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        switch (event) {
        case NETDEV_REGISTER:
        case NETDEV_FEAT_CHANGE:
                return xfrm_api_check(dev);

        case NETDEV_DOWN:
        case NETDEV_UNREGISTER:
                return xfrm_dev_down(dev);

        default:
                return NOTIFY_DONE;
        }
}

/* Notifier block that routes netdevice events to xfrm_dev_event(). */
static struct notifier_block xfrm_dev_notifier = {
        .notifier_call        = xfrm_dev_event,
};

/*
 * Register xfrm_dev_notifier so that xfrm_dev_event() receives netdevice
 * events.
 *
 * NOTE(review): the return value of register_netdevice_notifier() is not
 * checked here - confirm that ignoring a registration failure is intended.
 */
void __init xfrm_dev_init(void)
{
        register_netdevice_notifier(&xfrm_dev_notifier);
}

















   34 




























































   34 




















































































































































    1 










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SIGNAL_H
#define _LINUX_SIGNAL_H

#include <linux/bug.h>
#include <linux/list.h>
#include <linux/signal_types.h>
#include <linux/string.h>

struct task_struct;

/* for sysctl */
extern int print_fatal_signals;

/* Copy the complete kernel siginfo at @from into @to, byte for byte. */
static inline void copy_siginfo(kernel_siginfo_t *to,
                                const kernel_siginfo_t *from)
{
        memcpy(to, from, sizeof(kernel_siginfo_t));
}

/* Zero every byte of the kernel siginfo at @info. */
static inline void clear_siginfo(kernel_siginfo_t *info)
{
        memset(info, 0, sizeof(kernel_siginfo_t));
}

/* Number of bytes by which struct siginfo exceeds struct kernel_siginfo. */
#define SI_EXPANSION_SIZE (sizeof(struct siginfo) - sizeof(struct kernel_siginfo))

/*
 * Fill the full-size siginfo at @to from the kernel siginfo at @from:
 * the kernel-sized prefix is copied and the SI_EXPANSION_SIZE bytes that
 * follow it are zeroed.
 */
static inline void copy_siginfo_to_external(siginfo_t *to,
                                            const kernel_siginfo_t *from)
{
        char *expansion = ((char *)to) + sizeof(struct kernel_siginfo);

        memcpy(to, from, sizeof(*from));
        memset(expansion, 0, SI_EXPANSION_SIZE);
}

int copy_siginfo_to_user(siginfo_t __user *to, const kernel_siginfo_t *from);
int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from);

/*
 * Classification returned by siginfo_layout() for a (signal, si_code) pair.
 * NOTE(review): each value presumably names the siginfo union member layout
 * that is valid for that pair - confirm against the siginfo definition.
 * Enumerator order determines the numeric values.
 */
enum siginfo_layout {
        SIL_KILL,
        SIL_TIMER,
        SIL_POLL,
        SIL_FAULT,
        SIL_FAULT_TRAPNO,
        SIL_FAULT_MCEERR,
        SIL_FAULT_BNDERR,
        SIL_FAULT_PKUERR,
        SIL_FAULT_PERF_EVENT,
        SIL_CHLD,
        SIL_RT,
        SIL_SYS,
};

enum siginfo_layout siginfo_layout(unsigned sig, int si_code);

/*
 * Define some primitives to manipulate sigset_t.
 */

#ifndef __HAVE_ARCH_SIG_BITOPS
#include <linux/bitops.h>

/* We don't use <linux/bitops.h> for these because there is no need to
   be atomic.  */
/*
 * Add signal @_sig to @set (non-atomically). Signal numbers are 1-based,
 * so signal N lives in bit N - 1 of the set.
 */
static inline void sigaddset(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] |= 1UL << (bit % _NSIG_BPW);
                return;
        }

        set->sig[0] |= 1UL << bit;
}

/*
 * Remove signal @_sig from @set (non-atomically). Signal numbers are
 * 1-based, so signal N lives in bit N - 1 of the set.
 */
static inline void sigdelset(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1) {
                set->sig[bit / _NSIG_BPW] &= ~(1UL << (bit % _NSIG_BPW));
                return;
        }

        set->sig[0] &= ~(1UL << bit);
}

/*
 * Test whether signal @_sig is a member of @set.
 * Returns 1 when bit @_sig - 1 is set and 0 otherwise.
 */
static inline int sigismember(sigset_t *set, int _sig)
{
        unsigned long bit = _sig - 1;

        if (_NSIG_WORDS != 1)
                return (set->sig[bit / _NSIG_BPW] >> (bit % _NSIG_BPW)) & 1;

        return (set->sig[0] >> bit) & 1;
}

#endif /* __HAVE_ARCH_SIG_BITOPS */

/*
 * Return 1 when no signal is present in @set and 0 otherwise.
 * Only 1, 2 and 4 word sets are supported; any other _NSIG_WORDS value
 * fails the build.
 */
static inline int sigisemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                return !set->sig[0];
        case 2:
                return !(set->sig[1] | set->sig[0]);
        case 4:
                return !(set->sig[3] | set->sig[2] |
                         set->sig[1] | set->sig[0]);
        default:
                BUILD_BUG();
                return 0;
        }
}

/*
 * Return 1 when @set1 and @set2 hold exactly the same signals, 0 otherwise.
 * Words are compared from the highest index down. A word count other than
 * 1, 2 or 4 yields 0.
 */
static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2)
{
        switch (_NSIG_WORDS) {
        case 4:
                if (set1->sig[3] != set2->sig[3] ||
                    set1->sig[2] != set2->sig[2])
                        return 0;
                return set1->sig[1] == set2->sig[1] &&
                       set1->sig[0] == set2->sig[0];
        case 2:
                if (set1->sig[1] != set2->sig[1])
                        return 0;
                return set1->sig[0] == set2->sig[0];
        case 1:
                return set1->sig[0] == set2->sig[0];
        default:
                return 0;
        }
}

/* Single-word bit mask for the 1-based signal number @sig (bit @sig - 1 of an unsigned long). */
#define sigmask(sig)        (1UL << ((sig) - 1))

#ifndef __HAVE_ARCH_SIG_SETOPS

/*
 * _SIG_SET_BINOP(name, op) - generate "void name(r, a, b)" which stores
 * op(a, b), applied word by word, into @r.
 *
 * For _NSIG_WORDS == 4 words 3 and 2 are handled first and the code then
 * falls through to the 2-word and 1-word cases; any other word count fails
 * the build. Each pair of input words is read into locals before the
 * corresponding result word is written.
 * NOTE(review): presumably this is what allows @r to alias @a or @b -
 * confirm with callers.
 */
#define _SIG_SET_BINOP(name, op)                                        \
static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \
{                                                                        \
        unsigned long a0, a1, a2, a3, b0, b1, b2, b3;                        \
                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:                                                                \
                a3 = a->sig[3]; a2 = a->sig[2];                                \
                b3 = b->sig[3]; b2 = b->sig[2];                                \
                r->sig[3] = op(a3, b3);                                        \
                r->sig[2] = op(a2, b2);                                        \
                fallthrough;                                                \
        case 2:                                                                \
                a1 = a->sig[1]; b1 = b->sig[1];                                \
                r->sig[1] = op(a1, b1);                                        \
                fallthrough;                                                \
        case 1:                                                                \
                a0 = a->sig[0]; b0 = b->sig[0];                                \
                r->sig[0] = op(a0, b0);                                        \
                break;                                                        \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* sigorsets(r, a, b): r = a | b (union). */
#define _sig_or(x,y)        ((x) | (y))
_SIG_SET_BINOP(sigorsets, _sig_or)

/* sigandsets(r, a, b): r = a & b (intersection). */
#define _sig_and(x,y)        ((x) & (y))
_SIG_SET_BINOP(sigandsets, _sig_and)

/* sigandnsets(r, a, b): r = a & ~b (signals in a that are not in b). */
#define _sig_andn(x,y)        ((x) & ~(y))
_SIG_SET_BINOP(sigandnsets, _sig_andn)

/* The generator and its operator helpers are private to this header. */
#undef _SIG_SET_BINOP
#undef _sig_or
#undef _sig_and
#undef _sig_andn

/*
 * _SIG_SET_OP(name, op) - generate "void name(set)" which replaces every
 * word of @set with op(word), in place.
 *
 * Handles 4, 2 and 1 word sets via fallthrough; any other word count fails
 * the build.
 */
#define _SIG_SET_OP(name, op)                                                \
static inline void name(sigset_t *set)                                        \
{                                                                        \
        switch (_NSIG_WORDS) {                                                \
        case 4:        set->sig[3] = op(set->sig[3]);                                \
                set->sig[2] = op(set->sig[2]);                                \
                fallthrough;                                                \
        case 2:        set->sig[1] = op(set->sig[1]);                                \
                fallthrough;                                                \
        case 1:        set->sig[0] = op(set->sig[0]);                                \
                    break;                                                \
        default:                                                        \
                BUILD_BUG();                                                \
        }                                                                \
}

/* signotset(set): complement every bit of @set in place. */
#define _sig_not(x)        (~(x))
_SIG_SET_OP(signotset, _sig_not)

/* The generator and its operator helper are private to this header. */
#undef _SIG_SET_OP
#undef _sig_not

/*
 * Clear every signal in @set. One- and two-word sets are cleared with
 * direct stores; every other size goes through memset().
 */
static inline void sigemptyset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                set->sig[0] = 0;
                break;
        case 2:
                set->sig[1] = 0;
                set->sig[0] = 0;
                break;
        default:
                memset(set, 0, sizeof(sigset_t));
                break;
        }
}

/*
 * Set every bit in @set. One- and two-word sets are filled with direct
 * stores; every other size goes through memset().
 */
static inline void sigfillset(sigset_t *set)
{
        switch (_NSIG_WORDS) {
        case 1:
                set->sig[0] = -1;
                break;
        case 2:
                set->sig[1] = -1;
                set->sig[0] = -1;
                break;
        default:
                memset(set, -1, sizeof(sigset_t));
                break;
        }
}

/* Some extensions for manipulating the low 32 signals in particular.  */

/* OR the bits of @mask into the first word of @set; other words are untouched. */
static inline void sigaddsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] | mask;
}

/* Clear the bits of @mask in the first word of @set; other words are untouched. */
static inline void sigdelsetmask(sigset_t *set, unsigned long mask)
{
        set->sig[0] = set->sig[0] & ~mask;
}

/* Return 1 when any bit of @mask is set in the first word of @set, else 0. */
static inline int sigtestsetmask(sigset_t *set, unsigned long mask)
{
        return !!(set->sig[0] & mask);
}

/*
 * Initialise @set so that its first word equals @mask and every remaining
 * word is zero.
 */
static inline void siginitset(sigset_t *set, unsigned long mask)
{
        set->sig[0] = mask;

        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2) {
                set->sig[1] = 0;
                return;
        }

        memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1));
}

/*
 * Initialise @set to the complement of @mask: the first word becomes ~@mask
 * and every remaining word has all bits set.
 */
static inline void siginitsetinv(sigset_t *set, unsigned long mask)
{
        set->sig[0] = ~mask;

        if (_NSIG_WORDS == 1)
                return;

        if (_NSIG_WORDS == 2) {
                set->sig[1] = -1;
                return;
        }

        memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1));
}

#endif /* __HAVE_ARCH_SIG_SETOPS */

/* Reset @sig to an empty state: no signals in the pending set and an empty list head. */
static inline void init_sigpending(struct sigpending *sig)
{
        sigemptyset(&sig->signal);
        INIT_LIST_HEAD(&sig->list);
}

extern void flush_sigqueue(struct sigpending *queue);

/*
 * Return 1 when @sig is within 0.._NSIG inclusive and 0 otherwise.
 * Use this instead of comparing against _NSIG directly.
 */
static inline int valid_signal(unsigned long sig)
{
        return sig <= _NSIG;
}

struct timespec;
struct pt_regs;
enum pid_type;

extern int next_signal(struct sigpending *pending, sigset_t *mask);
extern int do_send_sig_info(int sig, struct kernel_siginfo *info,
                                struct task_struct *p, enum pid_type type);
extern int group_send_sig_info(int sig, struct kernel_siginfo *info,
                               struct task_struct *p, enum pid_type type);
extern int send_signal_locked(int sig, struct kernel_siginfo *info,
                              struct task_struct *p, enum pid_type type);
extern int sigprocmask(int, sigset_t *, sigset_t *);
extern void set_current_blocked(sigset_t *);
extern void __set_current_blocked(const sigset_t *);
extern int show_unhandled_signals;

extern bool get_signal(struct ksignal *ksig);
extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping);
extern void exit_signals(struct task_struct *tsk);
extern void kernel_sigaction(int, __sighandler_t);

#define SIG_KTHREAD ((__force __sighandler_t)2)
#define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3)

/* Install SIG_KTHREAD as the action for @sig via kernel_sigaction(). */
static inline void allow_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know it'll be handled, so that they don't get converted to
         * SIGKILL or just silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD);
}

/* Install SIG_KTHREAD_KERNEL as the action for @sig via kernel_sigaction(). */
static inline void allow_kernel_signal(int sig)
{
        /*
         * Kernel threads handle their own signals. Let the signal code
         * know signals sent by the kernel will be handled, so that they
         * don't get silently dropped.
         */
        kernel_sigaction(sig, SIG_KTHREAD_KERNEL);
}

/* Set the action for @sig to SIG_IGN via kernel_sigaction(). */
static inline void disallow_signal(int sig)
{
        kernel_sigaction(sig, SIG_IGN);
}

extern struct kmem_cache *sighand_cachep;

extern bool unhandled_signal(struct task_struct *tsk, int sig);

/*
 * In POSIX a signal is sent either to a specific thread (Linux task)
 * or to the process as a whole (Linux thread group).  How the signal
 * is sent determines whether it's to one thread or the whole group,
 * which determines which signal mask(s) are involved in blocking it
 * from being delivered until later.  When the signal is delivered,
 * either it's caught or ignored by a user handler or it has a default
 * effect that applies to the whole thread group (POSIX process).
 *
 * The possible effects an unblocked signal set to SIG_DFL can have are:
 *   ignore        - Nothing Happens
 *   terminate        - kill the process, i.e. all threads in the group,
 *                   similar to exit_group.  The group leader (only) reports
 *                  WIFSIGNALED status to its parent.
 *   coredump        - write a core dump file describing all threads using
 *                  the same mm and then kill all those threads
 *   stop         - stop all the threads in the group, i.e. TASK_STOPPED state
 *
 * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored.
 * Other signals, when not blocked and set to SIG_DFL, behave as follows.
 * The job control signals also have other special effects.
 *
 *        +--------------------+------------------+
 *        |  POSIX signal      |  default action  |
 *        +--------------------+------------------+
 *        |  SIGHUP            |  terminate        |
 *        |  SIGINT            |        terminate        |
 *        |  SIGQUIT           |        coredump         |
 *        |  SIGILL            |        coredump         |
 *        |  SIGTRAP           |        coredump         |
 *        |  SIGABRT/SIGIOT    |        coredump         |
 *        |  SIGBUS            |        coredump         |
 *        |  SIGFPE            |        coredump         |
 *        |  SIGKILL           |        terminate(+)        |
 *        |  SIGUSR1           |        terminate        |
 *        |  SIGSEGV           |        coredump         |
 *        |  SIGUSR2           |        terminate        |
 *        |  SIGPIPE           |        terminate        |
 *        |  SIGALRM           |        terminate        |
 *        |  SIGTERM           |        terminate        |
 *        |  SIGCHLD           |        ignore           |
 *        |  SIGCONT           |        ignore(*)        |
 *        |  SIGSTOP           |        stop(*)(+)          |
 *        |  SIGTSTP           |        stop(*)          |
 *        |  SIGTTIN           |        stop(*)          |
 *        |  SIGTTOU           |        stop(*)          |
 *        |  SIGURG            |        ignore           |
 *        |  SIGXCPU           |        coredump         |
 *        |  SIGXFSZ           |        coredump         |
 *        |  SIGVTALRM         |        terminate        |
 *        |  SIGPROF           |        terminate        |
 *        |  SIGPOLL/SIGIO     |        terminate        |
 *        |  SIGSYS/SIGUNUSED  |        coredump         |
 *        |  SIGSTKFLT         |        terminate        |
 *        |  SIGWINCH          |        ignore           |
 *        |  SIGPWR            |        terminate        |
 *        |  SIGRTMIN-SIGRTMAX |        terminate       |
 *        +--------------------+------------------+
 *        |  non-POSIX signal  |  default action  |
 *        +--------------------+------------------+
 *        |  SIGEMT            |  coredump        |
 *        +--------------------+------------------+
 *
 * (+) For SIGKILL and SIGSTOP the action is "always", not just "default".
 * (*) Special job control effects:
 * When SIGCONT is sent, it resumes the process (all threads in the group)
 * from TASK_STOPPED state and also clears any pending/queued stop signals
 * (any of those marked with "stop(*)").  This happens regardless of blocking,
 * catching, or ignoring SIGCONT.  When any stop signal is sent, it clears
 * any pending/queued SIGCONT signals; this happens regardless of blocking,
 * catching, or ignoring the stop signal, though (except for SIGSTOP) the
 * default action of stopping the process may happen later or never.
 */

/* Mask bit for SIGEMT where that signal is defined, otherwise 0. */
#ifdef SIGEMT
#define SIGEMT_MASK        rt_sigmask(SIGEMT)
#else
#define SIGEMT_MASK        0
#endif

/*
 * rt_sigmask(sig) - mask bit for signal @sig. When SIGRTMIN exceeds
 * BITS_PER_LONG a 64-bit (unsigned long long) mask is used; otherwise
 * this is simply sigmask().
 */
#if SIGRTMIN > BITS_PER_LONG
#define rt_sigmask(sig)        (1ULL << ((sig)-1))
#else
#define rt_sigmask(sig)        sigmask(sig)
#endif

/*
 * siginmask(sig, mask) - true when @sig lies in 1..SIGRTMIN-1 and its bit
 * is set in @mask. Note that @sig is evaluated more than once.
 */
#define siginmask(sig, mask) \
        ((sig) > 0 && (sig) < SIGRTMIN && (rt_sigmask(sig) & (mask)))

/* SIGKILL and SIGSTOP. */
#define SIG_KERNEL_ONLY_MASK (\
        rt_sigmask(SIGKILL)   |  rt_sigmask(SIGSTOP))

/* The four stop signals: SIGSTOP, SIGTSTP, SIGTTIN and SIGTTOU. */
#define SIG_KERNEL_STOP_MASK (\
        rt_sigmask(SIGSTOP)   |  rt_sigmask(SIGTSTP)   | \
        rt_sigmask(SIGTTIN)   |  rt_sigmask(SIGTTOU)   )

/* Signals tested by sig_kernel_coredump(); includes SIGEMT where defined. */
#define SIG_KERNEL_COREDUMP_MASK (\
        rt_sigmask(SIGQUIT)   |  rt_sigmask(SIGILL)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGABRT)   | \
        rt_sigmask(SIGFPE)    |  rt_sigmask(SIGSEGV)   | \
        rt_sigmask(SIGBUS)    |  rt_sigmask(SIGSYS)    | \
        rt_sigmask(SIGXCPU)   |  rt_sigmask(SIGXFSZ)   | \
        SIGEMT_MASK                                       )

/* Signals tested by sig_kernel_ignore(): SIGCONT, SIGCHLD, SIGWINCH and SIGURG. */
#define SIG_KERNEL_IGNORE_MASK (\
        rt_sigmask(SIGCONT)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGWINCH)  |  rt_sigmask(SIGURG)    )

/*
 * Signals tested by sig_specific_sicodes().
 * NOTE(review): presumably the signals that have signal-specific si_code
 * values - confirm against siginfo_layout().
 */
#define SIG_SPECIFIC_SICODES_MASK (\
        rt_sigmask(SIGILL)    |  rt_sigmask(SIGFPE)    | \
        rt_sigmask(SIGSEGV)   |  rt_sigmask(SIGBUS)    | \
        rt_sigmask(SIGTRAP)   |  rt_sigmask(SIGCHLD)   | \
        rt_sigmask(SIGPOLL)   |  rt_sigmask(SIGSYS)    | \
        SIGEMT_MASK                                    )

/* Membership tests of @sig against the corresponding SIG_*_MASK set. */
#define sig_kernel_only(sig)                siginmask(sig, SIG_KERNEL_ONLY_MASK)
#define sig_kernel_coredump(sig)        siginmask(sig, SIG_KERNEL_COREDUMP_MASK)
#define sig_kernel_ignore(sig)                siginmask(sig, SIG_KERNEL_IGNORE_MASK)
#define sig_kernel_stop(sig)                siginmask(sig, SIG_KERNEL_STOP_MASK)
#define sig_specific_sicodes(sig)        siginmask(sig, SIG_SPECIFIC_SICODES_MASK)

/*
 * sig_fatal(t, signr) - true when @signr is in neither the ignore mask nor
 * the stop mask and task @t has SIG_DFL installed as its handler for it.
 * @signr is evaluated more than once.
 */
#define sig_fatal(t, signr) \
        (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \
         (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL)

void signals_init(void);

int restore_altstack(const stack_t __user *);
int __save_altstack(stack_t __user *, unsigned long);

/*
 * unsafe_save_altstack - store current's alternate signal stack in *@uss
 * @uss:   user pointer to the stack_t to fill in (evaluated exactly once)
 * @sp:    not referenced by this macro; kept for interface compatibility
 * @label: label passed to unsafe_put_user() for its fault path
 *
 * Writes current->sas_ss_sp, ->sas_ss_flags and ->sas_ss_size with
 * unsafe_put_user().
 * NOTE(review): presumably must be used inside a user access section like
 * other unsafe_*() helpers - confirm with callers.
 *
 * The expansion intentionally carries no trailing semicolon, so the macro
 * is a single statement and is safe in an unbraced if/else. Locals use a
 * double-underscore prefix to avoid clashing with caller identifiers.
 */
#define unsafe_save_altstack(uss, sp, label) do { \
        stack_t __user *__uss = (uss); \
        struct task_struct *__t = current; \
        unsafe_put_user((void __user *)__t->sas_ss_sp, &__uss->ss_sp, label); \
        unsafe_put_user(__t->sas_ss_flags, &__uss->ss_flags, label); \
        unsafe_put_user(__t->sas_ss_size, &__uss->ss_size, label); \
} while (0)

/*
 * sigaltstack_size_valid - check a requested alternate signal stack size.
 * With CONFIG_DYNAMIC_SIGFRAME the check is implemented out of line;
 * without it every size is accepted.
 */
#ifdef CONFIG_DYNAMIC_SIGFRAME
bool sigaltstack_size_valid(size_t ss_size);
#else
static inline bool sigaltstack_size_valid(size_t size) { return true; }
#endif /* !CONFIG_DYNAMIC_SIGFRAME */

#ifdef CONFIG_PROC_FS
struct seq_file;
extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
#endif

#ifndef arch_untagged_si_addr
/*
 * Given a fault address and a signal and si_code which correspond to the
 * _sigfault union member, returns the address that must appear in si_addr if
 * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags.
 *
 * This is the generic fallback, used only when no arch_untagged_si_addr is
 * already defined: it returns @addr unchanged and ignores @sig and @si_code.
 */
static inline void __user *arch_untagged_si_addr(void __user *addr,
                                                 unsigned long sig,
                                                 unsigned long si_code)
{
        return addr;
}
#endif

#endif /* _LINUX_SIGNAL_H */




































































































































































































































































































































































  413 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/processor.h
 *
 * Copyright (C) 1995-1999 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_PROCESSOR_H
#define __ASM_PROCESSOR_H

/*
 * On arm64 systems, unaligned accesses by the CPU are cheap, and so there is
 * no point in shifting all network buffers by 2 bytes just to make some IP
 * header fields appear aligned in memory, potentially sacrificing some DMA
 * performance on some platforms.
 */
#define NET_IP_ALIGN        0

/*
 * MTE control word layout: a 16-bit field at bit 0 (GCR_USER_EXCL) followed
 * by three single-bit TCF flags at bits 16-18.
 * NOTE(review): presumably this describes thread_struct.mte_ctrl, with the
 * low field being the user tag exclusion mask and the TCF bits selecting
 * sync/async/asymmetric tag check fault modes - confirm against the MTE code.
 */
#define MTE_CTRL_GCR_USER_EXCL_SHIFT        0
#define MTE_CTRL_GCR_USER_EXCL_MASK        0xffff

#define MTE_CTRL_TCF_SYNC                (1UL << 16)
#define MTE_CTRL_TCF_ASYNC                (1UL << 17)
#define MTE_CTRL_TCF_ASYMM                (1UL << 18)

#ifndef __ASSEMBLY__

#include <linux/build_bug.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/thread_info.h>

#include <vdso/processor.h>

#include <asm/alternative.h>
#include <asm/cpufeature.h>
#include <asm/hw_breakpoint.h>
#include <asm/kasan.h>
#include <asm/lse.h>
#include <asm/pgtable-hwdef.h>
#include <asm/pointer_auth.h>
#include <asm/ptrace.h>
#include <asm/spectre.h>
#include <asm/types.h>

/*
 * TASK_SIZE - the maximum size of a user space task.
 * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area.
 */

/*
 * 64-bit limits, all powers of two:
 *   DEFAULT_MAP_WINDOW_64 - 1 << VA_BITS_MIN
 *   TASK_SIZE_64          - 1 << vabits_actual (a runtime value)
 *   TASK_SIZE_MAX         - 1 << VA_BITS
 */
#define DEFAULT_MAP_WINDOW_64        (UL(1) << VA_BITS_MIN)
#define TASK_SIZE_64                (UL(1) << vabits_actual)
#define TASK_SIZE_MAX                (UL(1) << VA_BITS)

/*
 * With CONFIG_COMPAT, TASK_SIZE and DEFAULT_MAP_WINDOW depend on whether the
 * thread has TIF_32BIT set; 32-bit threads get TASK_SIZE_32 for both.
 */
#ifdef CONFIG_COMPAT
#if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS)
/*
 * With CONFIG_ARM64_64K_PAGES enabled, the last page is occupied
 * by the compat vectors page.
 */
#define TASK_SIZE_32                UL(0x100000000)
#else
#define TASK_SIZE_32                (UL(0x100000000) - PAGE_SIZE)
#endif /* CONFIG_ARM64_64K_PAGES && CONFIG_KUSER_HELPERS */
#define TASK_SIZE                (test_thread_flag(TIF_32BIT) ? \
                                TASK_SIZE_32 : TASK_SIZE_64)
#define TASK_SIZE_OF(tsk)        (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
                                TASK_SIZE_32 : TASK_SIZE_64)
#define DEFAULT_MAP_WINDOW        (test_thread_flag(TIF_32BIT) ? \
                                TASK_SIZE_32 : DEFAULT_MAP_WINDOW_64)
#else
#define TASK_SIZE                TASK_SIZE_64
#define DEFAULT_MAP_WINDOW        DEFAULT_MAP_WINDOW_64
#endif /* CONFIG_COMPAT */

/*
 * With CONFIG_ARM64_FORCE_52BIT the stack top limit and the mmap base are
 * derived from the full task size; otherwise both are derived from the
 * default map window.
 */
#ifdef CONFIG_ARM64_FORCE_52BIT
#define STACK_TOP_MAX                TASK_SIZE_64
#define TASK_UNMAPPED_BASE        (PAGE_ALIGN(TASK_SIZE / 4))
#else
#define STACK_TOP_MAX                DEFAULT_MAP_WINDOW_64
#define TASK_UNMAPPED_BASE        (PAGE_ALIGN(DEFAULT_MAP_WINDOW / 4))
#endif /* CONFIG_ARM64_FORCE_52BIT */

/* Threads with TIF_32BIT set place the stack top at AARCH32_VECTORS_BASE. */
#ifdef CONFIG_COMPAT
#define AARCH32_VECTORS_BASE        0xffff0000
#define STACK_TOP                (test_thread_flag(TIF_32BIT) ? \
                                AARCH32_VECTORS_BASE : STACK_TOP_MAX)
#else
#define STACK_TOP                STACK_TOP_MAX
#endif /* CONFIG_COMPAT */

#ifndef CONFIG_ARM64_FORCE_52BIT
/*
 * arch_get_mmap_end(addr, len, flags) - upper limit for a mapping search.
 * Yields TASK_SIZE when the hint @addr lies above DEFAULT_MAP_WINDOW and
 * DEFAULT_MAP_WINDOW otherwise. @len and @flags are not used.
 */
#define arch_get_mmap_end(addr, len, flags) \
                (((addr) > DEFAULT_MAP_WINDOW) ? TASK_SIZE : DEFAULT_MAP_WINDOW)

/*
 * arch_get_mmap_base(addr, base) - mmap base to use for the hint @addr.
 * Yields @base shifted up by (TASK_SIZE - DEFAULT_MAP_WINDOW) when @addr
 * lies above DEFAULT_MAP_WINDOW, and @base unchanged otherwise.
 *
 * Both arguments are parenthesized so that expressions containing
 * low-precedence operators (?:, &, |, ...) are evaluated as a unit.
 */
#define arch_get_mmap_base(addr, base) (((addr) > DEFAULT_MAP_WINDOW) ? \
                                        (base) + TASK_SIZE - DEFAULT_MAP_WINDOW :\
                                        (base))
#endif /* !CONFIG_ARM64_FORCE_52BIT */

/* ARCH_LOW_ADDRESS_LIMIT is the last address below arm64_dma_phys_limit (defined elsewhere). */
extern phys_addr_t arm64_dma_phys_limit;
#define ARCH_LOW_ADDRESS_LIMIT        (arm64_dma_phys_limit - 1)

/*
 * Per-thread debug state, embedded in struct thread_struct as 'debug'.
 * The structure is empty unless CONFIG_HAVE_HW_BREAKPOINT is enabled.
 */
struct debug_info {
#ifdef CONFIG_HAVE_HW_BREAKPOINT
        /* Have we suspended stepping by a debugger? */
        int                        suspended_step;
        /* Allow breakpoints and watchpoints to be disabled for this thread. */
        int                        bps_disabled;
        int                        wps_disabled;
        /* Hardware breakpoints pinned to this task. */
        struct perf_event        *hbp_break[ARM_MAX_BRP];
        struct perf_event        *hbp_watch[ARM_MAX_WRP];
#endif
};

/*
 * Vector extension selector. The values index the per-type arrays
 * thread_struct.vl[] and thread_struct.vl_onexec[]; ARM64_VEC_MAX is the
 * number of types and sizes those arrays.
 */
enum vec_type {
        ARM64_VEC_SVE = 0,
        ARM64_VEC_SME,
        ARM64_VEC_MAX,
};

/*
 * Kind of floating-point register state, stored in thread_struct.fp_type.
 * NOTE(review): FP_STATE_FPSIMD and FP_STATE_SVE presumably name the format
 * in which the state is saved - confirm against the fpsimd code.
 */
enum fp_type {
        FP_STATE_CURRENT,        /* Save based on current task state. */
        FP_STATE_FPSIMD,
        FP_STATE_SVE,
};

/*
 * Register context kept in thread_struct.cpu_context: x19-x28, fp, sp and pc.
 * NOTE(review): presumably the callee-saved registers saved and restored by
 * the context-switch assembly, in which case the field order is part of that
 * contract - confirm before reordering.
 */
struct cpu_context {
        unsigned long x19;
        unsigned long x20;
        unsigned long x21;
        unsigned long x22;
        unsigned long x23;
        unsigned long x24;
        unsigned long x25;
        unsigned long x26;
        unsigned long x27;
        unsigned long x28;
        unsigned long fp;
        unsigned long sp;
        unsigned long pc;
};

/*
 * Architecture-specific per-thread state.
 *
 * The anonymous 'uw' member is the region reported by
 * arch_thread_struct_whitelist(), which also build-checks that it contains
 * no padding.
 */
struct thread_struct {
        struct cpu_context        cpu_context;        /* cpu context */

        /*
         * Whitelisted fields for hardened usercopy:
         * Maintainers must ensure manually that this contains no
         * implicit padding.
         */
        struct {
                unsigned long        tp_value;        /* TLS register */
                /* TLS slot that task_user_tls() selects for compat threads */
                unsigned long        tp2_value;
                u64                fpmr;
                /* counted by the no-padding check in arch_thread_struct_whitelist() */
                unsigned long        pad;
                struct user_fpsimd_state fpsimd_state;
        } uw;

        enum fp_type                fp_type;        /* registers FPSIMD or SVE? */
        /* INIT_THREAD initialises this to NR_CPUS */
        unsigned int                fpsimd_cpu;
        void                        *sve_state;        /* SVE registers, if any */
        void                        *sme_state;        /* ZA and ZT state, if any */
        unsigned int                vl[ARM64_VEC_MAX];        /* vector length */
        unsigned int                vl_onexec[ARM64_VEC_MAX]; /* vl after next exec */
        unsigned long                fault_address;        /* fault info */
        unsigned long                fault_code;        /* ESR_EL1 value */
        struct debug_info        debug;                /* debugging */

        /* NOTE(review): presumably FPSIMD state for kernel-mode use - confirm */
        struct user_fpsimd_state        kernel_fpsimd_state;
        unsigned int                        kernel_fpsimd_cpu;
#ifdef CONFIG_ARM64_PTR_AUTH
        struct ptrauth_keys_user        keys_user;
#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL
        struct ptrauth_keys_kernel        keys_kernel;
#endif
#endif
#ifdef CONFIG_ARM64_MTE
        /* NOTE(review): presumably holds MTE_CTRL_* bits - confirm */
        u64                        mte_ctrl;
#endif
        /* NOTE(review): presumably limited to SCTLR_USER_MASK bits - confirm */
        u64                        sctlr_user;
        /* thread_get_cur_vl() tests SVCR_SM_MASK in this value */
        u64                        svcr;
        u64                        tpidr2_el0;
        u64                        por_el0;
#ifdef CONFIG_ARM64_GCS
        unsigned int                gcs_el0_mode;
        unsigned int                gcs_el0_locked;
        u64                        gcspr_el0;
        u64                        gcs_base;
        u64                        gcs_size;
#endif
};

/*
 * Return the vector length recorded in @thread for vector type @type.
 * @type indexes thread->vl[] directly and is not range-checked here.
 */
static inline unsigned int thread_get_vl(struct thread_struct *thread,
                                         enum vec_type type)
{
        return thread->vl[type];
}

/* Return the SVE vector length recorded in @thread. */
static inline unsigned int thread_get_sve_vl(struct thread_struct *thread)
{
        return thread_get_vl(thread, ARM64_VEC_SVE);
}

/* Return the SME vector length recorded in @thread. */
static inline unsigned int thread_get_sme_vl(struct thread_struct *thread)
{
        return thread_get_vl(thread, ARM64_VEC_SME);
}

/*
 * Return the vector length currently in effect for @thread: the SME length
 * when the system supports SME and SVCR_SM_MASK is set in thread->svcr,
 * and the SVE length in every other case.
 */
static inline unsigned int thread_get_cur_vl(struct thread_struct *thread)
{
        if (!system_supports_sme() || !(thread->svcr & SVCR_SM_MASK))
                return thread_get_sve_vl(thread);

        return thread_get_sme_vl(thread);
}

unsigned int task_get_vl(const struct task_struct *task, enum vec_type type);
void task_set_vl(struct task_struct *task, enum vec_type type,
                 unsigned long vl);
void task_set_vl_onexec(struct task_struct *task, enum vec_type type,
                        unsigned long vl);
unsigned int task_get_vl_onexec(const struct task_struct *task,
                                enum vec_type type);

/* Return @task's SVE vector length; forwards to task_get_vl(). */
static inline unsigned int task_get_sve_vl(const struct task_struct *task)
{
        return task_get_vl(task, ARM64_VEC_SVE);
}

/* Return @task's SME vector length; forwards to task_get_vl(). */
static inline unsigned int task_get_sme_vl(const struct task_struct *task)
{
        return task_get_vl(task, ARM64_VEC_SME);
}

/* Set @task's SVE vector length to @vl; forwards to task_set_vl(). */
static inline void task_set_sve_vl(struct task_struct *task, unsigned long vl)
{
        task_set_vl(task, ARM64_VEC_SVE, vl);
}

/* Return @task's on-exec SVE vector length; forwards to task_get_vl_onexec(). */
static inline unsigned int task_get_sve_vl_onexec(const struct task_struct *task)
{
        return task_get_vl_onexec(task, ARM64_VEC_SVE);
}

/* Set @task's on-exec SVE vector length to @vl; forwards to task_set_vl_onexec(). */
static inline void task_set_sve_vl_onexec(struct task_struct *task,
                                          unsigned long vl)
{
        task_set_vl_onexec(task, ARM64_VEC_SVE, vl);
}

/*
 * SCTLR bits grouped under the "user" mask: the four ENIA/ENIB/ENDA/ENDB
 * enable bits plus the TCF0 field.
 * NOTE(review): presumably the bits that may differ per task via
 * thread_struct.sctlr_user - confirm against the users of this mask.
 */
#define SCTLR_USER_MASK                                                        \
        (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | SCTLR_ELx_ENDA | SCTLR_ELx_ENDB |   \
         SCTLR_EL1_TCF0_MASK)

/*
 * Report the location of the whitelisted part of struct thread_struct:
 * *@offset receives the offset of the uw member and *@size its size.
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        /*
         * The listed members of uw must add up to exactly sizeof(uw);
         * padding between them would break the equality and fail the build.
         */
        BUILD_BUG_ON(sizeof_field(struct thread_struct, uw.tp_value) +
                     sizeof_field(struct thread_struct, uw.tp2_value) +
                     sizeof_field(struct thread_struct, uw.fpmr) +
                     sizeof_field(struct thread_struct, uw.pad) +
                     sizeof_field(struct thread_struct, uw.fpsimd_state) !=
                     sizeof_field(struct thread_struct, uw));

        *offset = offsetof(struct thread_struct, uw);
        *size = sizeof_field(struct thread_struct, uw);
}

/*
 * task_user_tls(t) - pointer to the TLS value slot of task @t.
 *
 * With CONFIG_COMPAT, a thread for which is_compat_thread() is true uses
 * thread.uw.tp2_value and every other thread uses thread.uw.tp_value.
 * Without CONFIG_COMPAT the macro always yields &thread.uw.tp_value.
 * Note that the CONFIG_COMPAT variant expands @t more than once.
 */
#ifdef CONFIG_COMPAT
#define task_user_tls(t)                                                \
({                                                                        \
        unsigned long *__tls;                                                \
        if (is_compat_thread(task_thread_info(t)))                        \
                __tls = &(t)->thread.uw.tp2_value;                        \
        else                                                                \
                __tls = &(t)->thread.uw.tp_value;                        \
        __tls;                                                                \
 })
#else
#define task_user_tls(t)        (&(t)->thread.uw.tp_value)
#endif

/* Sync TPIDR_EL0 back to thread_struct for current */
void tls_preserve_current_state(void);

/*
 * Designated initializer that sets only .fpsimd_cpu (to NR_CPUS); every
 * member not named is implicitly zero-initialized.
 * NOTE(review): presumably initializes a struct thread_struct, with NR_CPUS
 * serving as an out-of-range "no CPU" marker - confirm.
 */
#define INIT_THREAD {                                \
        .fpsimd_cpu = NR_CPUS,                        \
}

static inline void start_thread_common(struct pt_regs *regs, unsigned long pc,
                                       unsigned long pstate)
{
        /*
         * Ensure all GPRs are zeroed, and initialize PC + PSTATE.
         * The SP (or compat SP) will be initialized later.
         */
        regs->user_regs = (struct user_pt_regs) {
                .pc = pc,
                .pstate = pstate,
        };

        /*
         * To allow the syscalls:sys_exit_execve tracepoint we need to preserve
         * syscallno, but do not need orig_x0 or the original GPRs.
         */
        regs->orig_x0 = 0;

        /*
         * An exec from a kernel thread won't have an existing PMR value.
         */
        if (system_uses_irq_prio_masking())
                regs->pmr = GIC_PRIO_IRQON;

        /*
         * The pt_regs::stackframe field must remain valid throughout this
         * function as a stacktrace can be taken at any time. Any user or
         * kernel task should have a valid final frame.
         */
        WARN_ON_ONCE(regs->stackframe.record.fp != 0);
        WARN_ON_ONCE(regs->stackframe.record.lr != 0);
        WARN_ON_ONCE(regs->stackframe.type != FRAME_META_TYPE_FINAL);
}

/*
 * Prepare @regs for starting a native (EL0t) user thread at @pc with stack
 * pointer @sp.
 */
static inline void start_thread(struct pt_regs *regs, unsigned long pc,
                                unsigned long sp)
{
        /* Clears the user registers, so regs->sp must be written afterwards. */
        start_thread_common(regs, pc, PSR_MODE_EL0t);
        spectre_v4_enable_task_mitigation(current);
        regs->sp = sp;
}

#ifdef CONFIG_COMPAT
/*
 * Prepare @regs for starting a compat (AArch32 user mode) thread at @pc
 * with stack pointer @sp.
 */
static inline void compat_start_thread(struct pt_regs *regs, unsigned long pc,
                                       unsigned long sp)
{
        unsigned long pstate = PSR_AA32_MODE_USR;

        /* Bit 0 of the entry address selects the T bit. */
        pstate |= (pc & 1) ? PSR_AA32_T_BIT : 0;
        /* Big-endian kernels start the thread with the E bit set. */
        pstate |= IS_ENABLED(CONFIG_CPU_BIG_ENDIAN) ? PSR_AA32_E_BIT : 0;

        /* Clears the user registers, so compat_sp is written afterwards. */
        start_thread_common(regs, pc, pstate);
        spectre_v4_enable_task_mitigation(current);
        regs->compat_sp = sp;
}
#endif

/* Return true when @addr lies below TASK_SIZE. */
static __always_inline bool is_ttbr0_addr(unsigned long addr)
{
        /* No tag stripping needed: entry assembly clears tags for TTBR0 addrs */
        return !(addr >= TASK_SIZE);
}

/* Return true when @addr, with any tag reset, is at or above PAGE_OFFSET. */
static __always_inline bool is_ttbr1_addr(unsigned long addr)
{
        /* With KASAN_SW_TAGS in use a TTBR1 address may carry a tag */
        return !(arch_kasan_reset_tag(addr) < PAGE_OFFSET);
}

/* Forward declaration so the prototypes below can take struct task_struct pointers */
struct task_struct;

unsigned long __get_wchan(struct task_struct *p);

void update_sctlr_el1(u64 sctlr);

/* Thread switching */
extern struct task_struct *cpu_switch_to(struct task_struct *prev,
                                         struct task_struct *next);

/*
 * The task's struct pt_regs is the last struct pt_regs-sized slot of its
 * stack: THREAD_SIZE past task_stack_page(p), stepped back by one element.
 */
#define task_pt_regs(p) \
        ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1)

/* PC and user stack pointer as recorded in the task's saved pt_regs. */
#define KSTK_EIP(tsk)        ((unsigned long)task_pt_regs(tsk)->pc)
#define KSTK_ESP(tsk)        user_stack_pointer(task_pt_regs(tsk))

/*
 * Prefetching support
 */
#define ARCH_HAS_PREFETCH
/*
 * Issue a "prfm pldl1keep" hint for the address @ptr. The asm has no
 * outputs and takes @ptr as an address operand only.
 */
static inline void prefetch(const void *ptr)
{
        asm volatile("prfm pldl1keep, %a0\n" : : "p" (ptr));
}

#define ARCH_HAS_PREFETCHW
/*
 * Issue a "prfm pstl1keep" hint for the address @ptr, i.e. the store
 * flavour of prefetch(). The asm has no outputs and takes @ptr as an
 * address operand only.
 */
static inline void prefetchw(const void *ptr)
{
        asm volatile("prfm pstl1keep, %a0\n" : : "p" (ptr));
}

/*
 * signal_minsigstksz is marked __ro_after_init and minsigstksz_setup() is
 * an __init routine.
 * NOTE(review): presumably the setup routine computes the value once at
 * boot - confirm against its definition.
 */
extern unsigned long __ro_after_init signal_minsigstksz; /* sigframe size */
extern void __init minsigstksz_setup(void);

/*
 * Not at the top of the file due to a direct #include cycle between
 * <asm/fpsimd.h> and <asm/processor.h>.  Deferring this #include
 * ensures that contents of processor.h are visible to fpsimd.h even if
 * processor.h is included first.
 *
 * These prctl helpers are the only things in this file that require
 * fpsimd.h.  The core code expects them to be in this header.
 */
#include <asm/fpsimd.h>

/*
 * Userspace interface for PR_S[MV]E_{SET,GET}_VL prctl()s.
 * Each macro simply forwards to the matching *_current_vl() helper.
 */
#define SVE_SET_VL(arg)        sve_set_current_vl(arg)
#define SVE_GET_VL()        sve_get_current_vl()
#define SME_SET_VL(arg)        sme_set_current_vl(arg)
#define SME_GET_VL()        sme_get_current_vl()

/* PR_PAC_RESET_KEYS prctl */
#define PAC_RESET_KEYS(tsk, arg)        ptrauth_prctl_reset_keys(tsk, arg)

/* PR_PAC_{SET,GET}_ENABLED_KEYS prctl */
#define PAC_SET_ENABLED_KEYS(tsk, keys, enabled)                                \
        ptrauth_set_enabled_keys(tsk, keys, enabled)
#define PAC_GET_ENABLED_KEYS(tsk) ptrauth_get_enabled_keys(tsk)

#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
/* PR_{SET,GET}_TAGGED_ADDR_CTRL prctl; the macros always act on current */
long set_tagged_addr_ctrl(struct task_struct *task, unsigned long arg);
long get_tagged_addr_ctrl(struct task_struct *task);
#define SET_TAGGED_ADDR_CTRL(arg)        set_tagged_addr_ctrl(current, arg)
#define GET_TAGGED_ADDR_CTRL()                get_tagged_addr_ctrl(current)
#endif

/*
 * NOTE(review): presumably the hooks behind the PR_{GET,SET}_TSC prctl()s -
 * confirm against the generic prctl code.
 */
int get_tsc_mode(unsigned long adr);
int set_tsc_mode(unsigned int val);
#define GET_TSC_CTL(adr)        get_tsc_mode((adr))
#define SET_TSC_CTL(val)        set_tsc_mode((val))

#endif /* __ASSEMBLY__ */
#endif /* __ASM_PROCESSOR_H */

































   26 



































   26 






















































































































































































































































































































































   26 





   26 




   26 

















































































































































































































































































































































































































































































































































































































































































































































































   26 
   26 

















   35 
































   35 












   35 


























































   26 





   26 







































































































































































































   35 

   35 










  122 













   50 


   50 


   50 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
// SPDX-License-Identifier: GPL-2.0-only
/*
  File: fs/xattr.c

  Extended attribute handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com>
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
 */
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fsnotify.h>
#include <linux/audit.h>
#include <linux/vmalloc.h>
#include <linux/posix_acl_xattr.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * If @a starts with @a_prefix, return a pointer to the first character of
 * @a after the prefix (which may be the terminating NUL); otherwise return
 * NULL. An empty prefix matches everything and yields @a itself.
 */
static const char *
strcmp_prefix(const char *a, const char *a_prefix)
{
        size_t i;

        for (i = 0; a_prefix[i] != '\0'; i++) {
                /* Also catches @a ending before the prefix does. */
                if (a[i] != a_prefix[i])
                        return NULL;
        }
        return a + i;
}

/*
 * In order to implement different sets of xattr operations for each xattr
 * prefix, a filesystem should create a null-terminated array of struct
 * xattr_handler (one for each prefix) and hang a pointer to it off of the
 * s_xattr field of the superblock.
 */
/*
 * Iterate @handler over the NULL-terminated array @handlers; a NULL
 * @handlers skips the loop entirely.
 *
 * The macro post-increments @handlers itself, so the caller's pointer is
 * consumed by the walk. It expands to "if (...) for (...)", so a following
 * "else" would bind to the hidden "if".
 */
#define for_each_xattr_handler(handlers, handler)                \
        if (handlers)                                                \
                for ((handler) = *(handlers)++;                        \
                        (handler) != NULL;                        \
                        (handler) = *(handlers)++)

/*
 * Find the xattr_handler with the matching prefix.
 */
/*
 * Walk the superblock's handler table and return the first handler that
 * accepts *@name. On success *@name is advanced past the matched part.
 *
 * Returns ERR_PTR(-EIO) for a bad inode without IOP_XATTR,
 * ERR_PTR(-EOPNOTSUPP) when xattrs are unsupported or nothing matches, and
 * ERR_PTR(-EINVAL) when *@name equals a handler prefix with nothing after it.
 */
static const struct xattr_handler *
xattr_resolve_name(struct inode *inode, const char **name)
{
        const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
        const struct xattr_handler *handler;

        if (!(inode->i_opflags & IOP_XATTR))
                return ERR_PTR(unlikely(is_bad_inode(inode)) ? -EIO
                                                             : -EOPNOTSUPP);

        for_each_xattr_handler(handlers, handler) {
                const char *rest = strcmp_prefix(*name, xattr_prefix(handler));

                if (!rest)
                        continue;

                if (handler->prefix) {
                        /* A prefix handler needs a non-empty remainder. */
                        if (!*rest)
                                return ERR_PTR(-EINVAL);
                } else if (*rest) {
                        /*
                         * A handler without ->prefix only matches when
                         * nothing is left over; keep looking otherwise.
                         */
                        continue;
                }

                *name = rest;
                return handler;
        }
        return ERR_PTR(-EOPNOTSUPP);
}

/**
 * may_write_xattr - check whether inode allows writing xattr
 * @idmap: idmap of the mount the inode was found from
 * @inode: the inode on which to set an xattr
 *
 * Check whether the inode allows writing xattrs. Specifically, we can never
 * set or remove an extended attribute on a read-only filesystem or on an
 * immutable / append-only inode.
 *
 * We also need to ensure that the inode has a mapping in the mount to
 * not risk writing back invalid i_{g,u}id values.
 *
 * Return: On success zero is returned. On error a negative errno is returned.
 */
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode)
{
        /*
         * Immutable inodes, append-only inodes and inodes whose ids have no
         * mapping in @idmap all refuse xattr writes with the same error.
         */
        if (IS_IMMUTABLE(inode) || IS_APPEND(inode) ||
            HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;

        return 0;
}

/*
 * Check permissions for extended attribute access.  This is a bit complicated
 * because different namespaces have very different rules.
 */
static int
xattr_permission(struct mnt_idmap *idmap, struct inode *inode,
                 const char *name, int mask)
{
        const int want_write = mask & MAY_WRITE;
        /* Error used when a namespace rule rejects the access outright. */
        const int denied = want_write ? -EPERM : -ENODATA;

        if (want_write) {
                int ret = may_write_xattr(idmap, inode);

                if (ret)
                        return ret;
        }

        /*
         * security.* and system.*: the VFS imposes nothing here; the
         * filesystem / security module decides.
         */
        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) == 0 ||
            strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) == 0)
                return 0;

        /* trusted.*: privileged users only. */
        if (strncmp(name, XATTR_TRUSTED_PREFIX,
                    XATTR_TRUSTED_PREFIX_LEN) == 0)
                return capable(CAP_SYS_ADMIN) ? 0 : denied;

        /*
         * user.*: only regular files and directories may carry these, and
         * on a sticky directory writes are reserved for the owner and
         * privileged users.
         */
        if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) {
                if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
                        return denied;
                if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) &&
                    want_write && !inode_owner_or_capable(idmap, inode))
                        return -EPERM;
        }

        return inode_permission(idmap, inode, mask);
}

/*
 * Look for any handler that deals with the specified namespace.
 */
int
xattr_supports_user_prefix(struct inode *inode)
{
        const struct xattr_handler * const *handlers = inode->i_sb->s_xattr;
        const struct xattr_handler *handler;

        /* Inodes without IOP_XATTR: -EIO if bad, else unsupported. */
        if (!(inode->i_opflags & IOP_XATTR))
                return unlikely(is_bad_inode(inode)) ? -EIO : -EOPNOTSUPP;

        /* Succeed as soon as one handler's prefix begins with "user.". */
        for_each_xattr_handler(handlers, handler) {
                const char *prefix = xattr_prefix(handler);

                if (strncmp(prefix, XATTR_USER_PREFIX,
                            XATTR_USER_PREFIX_LEN) == 0)
                        return 0;
        }

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(xattr_supports_user_prefix);

/*
 * Set xattr @name on @inode through the matching handler's ->set().
 *
 * POSIX ACL names are rejected with -EOPNOTSUPP, as is a handler that has
 * no ->set(). Errors from handler resolution are passed through.
 */
int
__vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
               struct inode *inode, const char *name, const void *value,
               size_t size, int flags)
{
        const struct xattr_handler *handler;
        const void *payload;

        if (is_posix_acl_xattr(name))
                return -EOPNOTSUPP;

        handler = xattr_resolve_name(inode, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);
        if (!handler->set)
                return -EOPNOTSUPP;

        /* A zero-sized value is an empty EA, not a removal request. */
        payload = size ? value : "";

        return handler->set(handler, idmap, dentry, inode, name, payload,
                            size, flags);
}
EXPORT_SYMBOL(__vfs_setxattr);

/**
 *  __vfs_setxattr_noperm - perform setxattr operation without performing
 *  permission checks.
 *
 *  @idmap: idmap of the mount the inode was found from
 *  @dentry: object to perform setxattr on
 *  @name: xattr name to set
 *  @value: value to set @name to
 *  @size: size of @value
 *  @flags: flags to pass into filesystem operations
 *
 *  returns the result of the internal setxattr or setsecurity operations.
 *
 *  This function requires the caller to lock the inode's i_mutex before it
 *  is executed. It also assumes that the caller will make the appropriate
 *  permission checks.
 */
int __vfs_setxattr_noperm(struct mnt_idmap *idmap,
                          struct dentry *dentry, const char *name,
                          const void *value, size_t size, int flags)
{
        struct inode *inode = dentry->d_inode;
        /*
         * -EAGAIN doubles as a sentinel: it survives only if no handler was
         * tried (no IOP_XATTR) or the handler itself returned -EAGAIN, and
         * selects the fallback path below.
         */
        int error = -EAGAIN;
        int issec = !strncmp(name, XATTR_SECURITY_PREFIX,
                                   XATTR_SECURITY_PREFIX_LEN);

        /*
         * NOTE(review): presumably S_NOSEC caches "no security xattrs" and
         * must be dropped before one is written - confirm.
         */
        if (issec)
                inode->i_flags &= ~S_NOSEC;
        if (inode->i_opflags & IOP_XATTR) {
                error = __vfs_setxattr(idmap, dentry, inode, name, value,
                                       size, flags);
                /* Notifications and the LSM post hook fire only on success. */
                if (!error) {
                        fsnotify_xattr(dentry);
                        security_inode_post_setxattr(dentry, name, value,
                                                     size, flags);
                }
        } else {
                if (unlikely(is_bad_inode(inode)))
                        return -EIO;
        }
        if (error == -EAGAIN) {
                /* Default result when the fallback does not apply. */
                error = -EOPNOTSUPP;

                /* security.* names fall back to the LSM, minus the prefix. */
                if (issec) {
                        const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;

                        error = security_inode_setsecurity(inode, suffix, value,
                                                           size, flags);
                        if (!error)
                                fsnotify_xattr(dentry);
                }
        }

        return error;
}

/**
 * __vfs_setxattr_locked - set an extended attribute while holding the inode
 * lock
 *
 *  @idmap: idmap of the mount of the target inode
 *  @dentry: object to perform setxattr on
 *  @name: xattr name to set
 *  @value: value to set @name to
 *  @size: size of @value
 *  @flags: flags to pass into filesystem operations
 *  @delegated_inode: on return, will contain an inode pointer that
 *  a delegation was broken on, NULL if none.
 */
int
__vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry,
                      const char *name, const void *value, size_t size,
                      int flags, struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* VFS-level permission check for writing this name. */
        error = xattr_permission(idmap, inode, name, MAY_WRITE);
        if (error)
                return error;

        /* Let the security module veto the change. */
        error = security_inode_setxattr(idmap, dentry, name, value, size,
                                        flags);
        if (error)
                return error;

        /* May hand an inode back through @delegated_inode. */
        error = try_break_deleg(inode, delegated_inode);
        if (error)
                return error;

        return __vfs_setxattr_noperm(idmap, dentry, name, value,
                                     size, flags);
}
EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);

/*
 * Set xattr @name on @dentry's inode: takes the inode lock, runs the
 * permission-checked setter and retries after waiting for a broken
 * delegation. Returns 0 or a negative errno.
 */
int
vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
             const char *name, const void *value, size_t size, int flags)
{
        struct inode *inode = dentry->d_inode;
        struct inode *delegated_inode = NULL;
        const void *orig_value = value;
        int error;

        /*
         * A non-empty capability xattr is converted first; the conversion
         * may replace @value and reports the new size.
         */
        if (size && !strcmp(name, XATTR_NAME_CAPS)) {
                error = cap_convert_nscap(idmap, dentry, &value, size);
                if (error < 0)
                        return error;
                size = error;
        }

        for (;;) {
                inode_lock(inode);
                error = __vfs_setxattr_locked(idmap, dentry, name, value,
                                              size, flags, &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Wait with the lock dropped, then try again on success. */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        /* Free the buffer only if the conversion above substituted one. */
        if (value != orig_value)
                kfree(value);

        return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);

/*
 * Fetch security attribute @name of @inode from the security module.
 *
 * With no buffer (@value NULL or @size 0) only the hook's result is
 * returned and nothing is allocated. Otherwise the value is copied into
 * @value, or -ERANGE is returned when @size is too small.
 */
static ssize_t
xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode,
                  const char *name, void *value, size_t size)
{
        void *buffer = NULL;
        ssize_t len;

        /* Query only: alloc is false, so there is nothing to free. */
        if (!value || !size)
                return security_inode_getsecurity(idmap, inode, name,
                                                  &buffer, false);

        len = security_inode_getsecurity(idmap, inode, name, &buffer,
                                         true);
        if (len < 0)
                return len;

        if (size >= len)
                memcpy(value, buffer, len);
        else
                len = -ERANGE;

        kfree(buffer);
        return len;
}

/*
 * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr
 *
 * Allocate memory, if not already allocated, or re-allocate correct size,
 * before retrieving the extended attribute.  The xattr value buffer should
 * always be freed by the caller, even on error.
 *
 * Returns the result of alloc, if failed, or the getxattr operation.
 */
int
vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *name, char **xattr_value, size_t xattr_size,
                   gfp_t flags)
{
        struct inode *inode = dentry->d_inode;
        const struct xattr_handler *handler;
        char *buf = *xattr_value;
        int len;

        len = xattr_permission(idmap, inode, name, MAY_READ);
        if (len)
                return len;

        handler = xattr_resolve_name(inode, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);
        if (!handler->get)
                return -EOPNOTSUPP;

        /* First pass with no buffer: ask the handler for the value size. */
        len = handler->get(handler, dentry, inode, name, NULL, 0);
        if (len < 0)
                return len;

        /*
         * (Re)allocate when the caller gave no buffer or one that is too
         * small. One extra byte is allocated and the buffer is zeroed.
         */
        if (!buf || (len > xattr_size)) {
                buf = krealloc(*xattr_value, len + 1, flags);
                if (!buf)
                        return -ENOMEM;
                memset(buf, 0, len + 1);
        }

        /* Second pass: fetch the value itself. */
        len = handler->get(handler, dentry, inode, name, buf, len);
        *xattr_value = buf;
        return len;
}

/*
 * Read xattr @name of @inode through the matching handler's ->get().
 *
 * POSIX ACL names are rejected with -EOPNOTSUPP, as is a handler that has
 * no ->get(). Errors from handler resolution are passed through.
 */
ssize_t
__vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name,
               void *value, size_t size)
{
        const struct xattr_handler *handler;

        if (is_posix_acl_xattr(name))
                return -EOPNOTSUPP;

        handler = xattr_resolve_name(inode, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);

        return handler->get ?
                handler->get(handler, dentry, inode, name, value, size) :
                -EOPNOTSUPP;
}
EXPORT_SYMBOL(__vfs_getxattr);

/*
 * Read xattr @name of @dentry's inode after the VFS and security-module
 * permission checks. security.* names are offered to the security module
 * first; everything else goes to the filesystem handler.
 */
ssize_t
vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry,
             const char *name, void *value, size_t size)
{
        struct inode *inode = dentry->d_inode;
        int error;

        error = xattr_permission(idmap, inode, name, MAY_READ);
        if (error)
                return error;

        error = security_inode_getxattr(dentry);
        if (error)
                return error;

        if (strncmp(name, XATTR_SECURITY_PREFIX,
                    XATTR_SECURITY_PREFIX_LEN) == 0) {
                const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
                int ret = xattr_getsecurity(idmap, inode, suffix, value,
                                            size);

                /*
                 * -EOPNOTSUPP means no security module handled it, so
                 * fall through to the filesystem. Any other result is
                 * final.
                 */
                if (ret != -EOPNOTSUPP)
                        return ret;
        }

        return __vfs_getxattr(dentry, inode, name, value, size);
}
EXPORT_SYMBOL_GPL(vfs_getxattr);

/**
 * vfs_listxattr - retrieve \0 separated list of xattr names
 * @dentry: the dentry from whose inode the xattr names are retrieved
 * @list: buffer to store xattr names into
 * @size: size of the buffer
 *
 * This function returns the names of all xattrs associated with the
 * inode of @dentry.
 *
 * Note, for legacy reasons the vfs_listxattr() function lists POSIX
 * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the
 * vfs_listxattr() function doesn't check for this flag since a
 * filesystem could implement POSIX ACLs without implementing any other
 * xattrs.
 *
 * This is safe because all codepaths that remove IOP_XATTR also assign
 * inode operations that either don't implement ->listxattr() or implement
 * it as a stub.
 *
 * Return: On success, the size of the buffer that was used. On error a
 *         negative error code.
 */
ssize_t
vfs_listxattr(struct dentry *dentry, char *list, size_t size)
{
        struct inode *inode = d_inode(dentry);
        ssize_t error;

        /* Give the security module a chance to veto the listing. */
        error = security_inode_listxattr(dentry);
        if (error)
                return error;

        if (inode->i_op->listxattr) {
                error = inode->i_op->listxattr(dentry, list, size);
        } else {
                /* No ->listxattr(): list only the security module's names. */
                error = security_inode_listsecurity(inode, list, size);
                /*
                 * Only a non-negative length can overflow the buffer.
                 * Comparing a negative ssize_t against the size_t @size
                 * would convert it to a huge unsigned value and rewrite
                 * every error from the hook into -ERANGE, so test the sign
                 * first and compare in size_t explicitly.
                 */
                if (error > 0 && size && (size_t)error > size)
                        error = -ERANGE;
        }
        return error;
}
EXPORT_SYMBOL_GPL(vfs_listxattr);

/*
 * Remove xattr @name from @dentry's inode through the matching handler.
 *
 * Removal is expressed as a ->set() call with a NULL value, zero size and
 * XATTR_REPLACE. POSIX ACL names and handlers without ->set() yield
 * -EOPNOTSUPP.
 */
int
__vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                  const char *name)
{
        struct inode *inode = d_inode(dentry);
        const struct xattr_handler *handler;

        if (is_posix_acl_xattr(name))
                return -EOPNOTSUPP;

        handler = xattr_resolve_name(inode, &name);
        if (IS_ERR(handler))
                return PTR_ERR(handler);

        return handler->set ?
                handler->set(handler, idmap, dentry, inode, name, NULL, 0,
                             XATTR_REPLACE) :
                -EOPNOTSUPP;
}
EXPORT_SYMBOL(__vfs_removexattr);

/**
 * __vfs_removexattr_locked - remove an extended attribute while holding the
 * inode lock
 *
 *  @idmap: idmap of the mount of the target inode
 *  @dentry: object to perform removexattr on
 *  @name: name of xattr to remove
 *  @delegated_inode: on return, will contain an inode pointer that
 *  a delegation was broken on, NULL if none.
 */
int
__vfs_removexattr_locked(struct mnt_idmap *idmap,
                         struct dentry *dentry, const char *name,
                         struct inode **delegated_inode)
{
        struct inode *inode = dentry->d_inode;
        int error;

        /* Each step runs only if every earlier one succeeded. */
        error = xattr_permission(idmap, inode, name, MAY_WRITE);
        if (!error)
                error = security_inode_removexattr(idmap, dentry, name);
        if (!error)
                error = try_break_deleg(inode, delegated_inode);
        if (!error)
                error = __vfs_removexattr(idmap, dentry, name);
        if (error)
                return error;

        /* The attribute is gone: notify watchers and the security module. */
        fsnotify_xattr(dentry);
        security_inode_post_removexattr(dentry, name);

        return 0;
}
EXPORT_SYMBOL_GPL(__vfs_removexattr_locked);

/*
 * Remove xattr @name from @dentry's inode: takes the inode lock, runs the
 * permission-checked removal and retries after waiting for a broken
 * delegation. Returns 0 or a negative errno.
 */
int
vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *name)
{
        struct inode *inode = dentry->d_inode;
        struct inode *delegated_inode = NULL;
        int error;

        for (;;) {
                inode_lock(inode);
                error = __vfs_removexattr_locked(idmap, dentry,
                                                 name, &delegated_inode);
                inode_unlock(inode);

                if (!delegated_inode)
                        break;

                /* Wait with the lock dropped, then try again on success. */
                error = break_deleg_wait(&delegated_inode);
                if (error)
                        break;
        }

        return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);

/*
 * Copy an xattr name from userspace into @kname->name.
 *
 * Returns 0 on success, the negative error from strncpy_from_user(), or
 * -ERANGE when the reported length is 0 or equals sizeof(kname->name).
 */
int import_xattr_name(struct xattr_name *kname, const char __user *name)
{
        int len = strncpy_from_user(kname->name, name, sizeof(kname->name));

        if (len < 0)
                return len;
        if (!len || len == sizeof(kname->name))
                return -ERANGE;
        return 0;
}

/*
 * Extended attribute SET operations
 */

/*
 * Bring the userspace arguments of a set operation into @ctx: reject unknown
 * flags, import the attribute name and, for a non-zero size, duplicate the
 * user value into ctx->kvalue.
 *
 * Returns 0 on success or a negative error; on a failed duplication
 * ctx->kvalue is reset to NULL.
 */
int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx)
{
        int error;

        if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
                return -EINVAL;

        error = import_xattr_name(ctx->kname, name);
        if (error)
                return error;

        /* A zero size means there is no value to copy. */
        if (!ctx->size)
                return 0;
        if (ctx->size > XATTR_SIZE_MAX)
                return -E2BIG;

        ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size);
        if (!IS_ERR(ctx->kvalue))
                return 0;

        /* Never leave an ERR_PTR behind in the context. */
        error = PTR_ERR(ctx->kvalue);
        ctx->kvalue = NULL;
        return error;
}

/*
 * Apply the set request in @ctx to @dentry.  POSIX ACL attribute names are
 * handed to do_set_acl(); every other name goes through vfs_setxattr().
 */
static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
                struct kernel_xattr_ctx *ctx)
{
        const char *kname = ctx->kname->name;

        if (!is_posix_acl_xattr(kname))
                return vfs_setxattr(idmap, dentry, kname,
                                    ctx->kvalue, ctx->size, ctx->flags);

        return do_set_acl(idmap, dentry, kname, ctx->kvalue, ctx->size);
}

int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx)
{
        int error = mnt_want_write_file(f);

        if (!error) {
                audit_file(f);
                error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
                mnt_drop_write_file(f);
        }
        return error;
}

/* unconditionally consumes filename */
int filename_setxattr(int dfd, struct filename *filename,
                      unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
{
        struct path path;
        int error;

retry:
        error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
        if (error)
                goto out;
        error = mnt_want_write(path.mnt);
        if (!error) {
                error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx);
                mnt_drop_write(path.mnt);
        }
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }

out:
        putname(filename);
        return error;
}

/*
 * Common backend of the setxattr syscall family.
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @at_flags.
 * When getname_maybe_null() yields no filename and @dfd is non-negative the
 * attribute is set on the file referred to by @dfd; otherwise the path is
 * looked up.  The kernel copy of the value is freed before returning.
 */
static int path_setxattrat(int dfd, const char __user *pathname,
                           unsigned int at_flags, const char __user *name,
                           const void __user *value, size_t size, int flags)
{
        struct xattr_name kname;
        struct kernel_xattr_ctx ctx = {
                .kname        = &kname,
                .cvalue        = value,
                .kvalue        = NULL,
                .size        = size,
                .flags        = flags,
        };
        struct filename *filename;
        unsigned int lookup_flags;
        int error;

        if (at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;

        error = setxattr_copy(name, &ctx);
        if (error)
                return error;

        filename = getname_maybe_null(pathname, at_flags);
        if (filename || dfd < 0) {
                error = filename_setxattr(dfd, filename, lookup_flags, &ctx);
        } else {
                CLASS(fd, f)(dfd);

                error = fd_empty(f) ? -EBADF : file_setxattr(fd_file(f), &ctx);
        }
        kvfree(ctx.kvalue);
        return error;
}

/*
 * setxattrat(2): the value pointer, size and flags arrive in a versioned
 * struct xattr_args of @usize bytes.  Sizes below XATTR_ARGS_SIZE_VER0 are
 * rejected with -EINVAL and sizes above PAGE_SIZE with -E2BIG.
 */
SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
                const char __user *, name, const struct xattr_args __user *, uargs,
                size_t, usize)
{
        int error;
        struct xattr_args args = {};

        /* Compile-time checks on the layout size of struct xattr_args. */
        BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);

        if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
                return -EINVAL;
        if (usize > PAGE_SIZE)
                return -E2BIG;

        error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
        if (!error)
                error = path_setxattrat(dfd, pathname, at_flags, name,
                                        u64_to_user_ptr(args.value),
                                        args.size, args.flags);
        return error;
}

/*
 * setxattr(2): passes AT_FDCWD as the directory fd and no AT_* flags, so
 * path_setxattrat() performs the lookup with LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
                const char __user *, name, const void __user *, value,
                size_t, size, int, flags)
{
        return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags);
}

/*
 * lsetxattr(2): like setxattr(2) but passes AT_SYMLINK_NOFOLLOW, so
 * path_setxattrat() does not add LOOKUP_FOLLOW to the lookup.
 */
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
                const char __user *, name, const void __user *, value,
                size_t, size, int, flags)
{
        return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
                               value, size, flags);
}

/*
 * fsetxattr(2): passes @fd with a NULL pathname and AT_EMPTY_PATH; when
 * path_setxattrat() gets no filename back for that, it operates on the file
 * referred to by @fd.
 */
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                const void __user *,value, size_t, size, int, flags)
{
        return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name,
                               value, size, flags);
}

/*
 * Extended attribute GET operations
 */
static ssize_t
do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
        struct kernel_xattr_ctx *ctx)
{
        ssize_t error;
        char *kname = ctx->kname->name;
        void *kvalue = NULL;

        if (ctx->size) {
                if (ctx->size > XATTR_SIZE_MAX)
                        ctx->size = XATTR_SIZE_MAX;
                kvalue = kvzalloc(ctx->size, GFP_KERNEL);
                if (!kvalue)
                        return -ENOMEM;
        }

        if (is_posix_acl_xattr(kname))
                error = do_get_acl(idmap, d, kname, kvalue, ctx->size);
        else
                error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size);
        if (error > 0) {
                if (ctx->size && copy_to_user(ctx->value, kvalue, error))
                        error = -EFAULT;
        } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
                /* The file system tried to returned a value bigger
                   than XATTR_SIZE_MAX bytes. Not possible. */
                error = -E2BIG;
        }

        kvfree(kvalue);
        return error;
}

/* Read an xattr from the object behind the open file @f. */
ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx)
{
        struct mnt_idmap *idmap;

        audit_file(f);
        idmap = file_mnt_idmap(f);
        return do_getxattr(idmap, f->f_path.dentry, ctx);
}

/* unconditionally consumes filename */
ssize_t filename_getxattr(int dfd, struct filename *filename,
                          unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
{
        struct path path;
        ssize_t error;
retry:
        error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
        if (error)
                goto out;
        error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx);
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        putname(filename);
        return error;
}

/*
 * Common backend of the getxattr syscall family.
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @at_flags.
 * When getname_maybe_null() yields no filename and @dfd is non-negative the
 * attribute is read from the file referred to by @dfd; otherwise the path is
 * looked up, with LOOKUP_FOLLOW unless AT_SYMLINK_NOFOLLOW was given.
 */
static ssize_t path_getxattrat(int dfd, const char __user *pathname,
                               unsigned int at_flags, const char __user *name,
                               void __user *value, size_t size)
{
        struct xattr_name kname;
        struct kernel_xattr_ctx ctx = {
                .kname    = &kname,
                .value    = value,
                .size     = size,
                .flags    = 0,
        };
        struct filename *filename;
        ssize_t error;

        if (at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        error = import_xattr_name(&kname, name);
        if (error)
                return error;

        filename = getname_maybe_null(pathname, at_flags);
        if (filename || dfd < 0) {
                int lookup_flags;

                lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
                return filename_getxattr(dfd, filename, lookup_flags, &ctx);
        } else {
                CLASS(fd, f)(dfd);

                if (fd_empty(f))
                        return -EBADF;
                return file_getxattr(fd_file(f), &ctx);
        }
}

/*
 * getxattrat(2): the value pointer and size arrive in a versioned
 * struct xattr_args of @usize bytes; its flags field must be zero.  Sizes
 * below XATTR_ARGS_SIZE_VER0 are rejected with -EINVAL and sizes above
 * PAGE_SIZE with -E2BIG.
 */
SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
                const char __user *, name, struct xattr_args __user *, uargs, size_t, usize)
{
        int error;
        struct xattr_args args = {};

        /* Compile-time checks on the layout size of struct xattr_args. */
        BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
        BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);

        if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
                return -EINVAL;
        if (usize > PAGE_SIZE)
                return -E2BIG;

        error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
        if (!error && args.flags != 0)
                error = -EINVAL;
        if (error)
                return error;

        return path_getxattrat(dfd, pathname, at_flags, name,
                               u64_to_user_ptr(args.value), args.size);
}

/*
 * getxattr(2): passes AT_FDCWD as the directory fd and no AT_* flags, so
 * path_getxattrat() performs the lookup with LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
                const char __user *, name, void __user *, value, size_t, size)
{
        return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size);
}

/*
 * lgetxattr(2): like getxattr(2) but passes AT_SYMLINK_NOFOLLOW, so
 * path_getxattrat() does not add LOOKUP_FOLLOW to the lookup.
 */
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
                const char __user *, name, void __user *, value, size_t, size)
{
        return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
                               value, size);
}

/*
 * fgetxattr(2): passes @fd with a NULL pathname and AT_EMPTY_PATH; when
 * path_getxattrat() gets no filename back for that, it reads from the file
 * referred to by @fd.
 */
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
                void __user *, value, size_t, size)
{
        return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size);
}

/*
 * Extended attribute LIST operations
 */
/*
 * List the xattr names of @d into the user buffer @list.
 *
 * A non-zero @size is clamped to XATTR_LIST_MAX and a kernel buffer of that
 * size is used as staging area; with a zero size no buffer is allocated and
 * nothing is copied to userspace.
 */
static ssize_t
listxattr(struct dentry *d, char __user *list, size_t size)
{
        char *klist = NULL;
        ssize_t error;

        if (size) {
                if (size > XATTR_LIST_MAX)
                        size = XATTR_LIST_MAX;
                klist = kvmalloc(size, GFP_KERNEL);
                if (!klist)
                        return -ENOMEM;
        }

        error = vfs_listxattr(d, klist, size);
        if (error > 0 && size) {
                if (copy_to_user(list, klist, error))
                        error = -EFAULT;
        } else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
                /*
                 * The file system tried to return a list bigger than
                 * XATTR_LIST_MAX bytes. Not possible.
                 */
                error = -E2BIG;
        }

        kvfree(klist);

        return error;
}

/* List the xattr names of the object behind the open file @f. */
static
ssize_t file_listxattr(struct file *f, char __user *list, size_t size)
{
        struct dentry *dentry;

        audit_file(f);
        dentry = f->f_path.dentry;
        return listxattr(dentry, list, size);
}

/*
 * Look up @filename relative to @dfd and list the xattr names of the result,
 * repeating the lookup with LOOKUP_REVAL when retry_estale() asks for it.
 *
 * unconditionally consumes filename
 */
static
ssize_t filename_listxattr(int dfd, struct filename *filename,
                           unsigned int lookup_flags,
                           char __user *list, size_t size)
{
        struct path path;
        ssize_t error;

        for (;;) {
                error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
                if (error)
                        break;

                error = listxattr(path.dentry, list, size);
                path_put(&path);

                if (!retry_estale(error, lookup_flags))
                        break;
                lookup_flags |= LOOKUP_REVAL;
        }

        putname(filename);
        return error;
}

/*
 * Common backend of the listxattr syscall family.
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @at_flags.
 * When getname_maybe_null() yields no filename the names are listed for the
 * file referred to by @dfd; otherwise the path is looked up, with
 * LOOKUP_FOLLOW unless AT_SYMLINK_NOFOLLOW was given.
 */
static ssize_t path_listxattrat(int dfd, const char __user *pathname,
                                unsigned int at_flags, char __user *list,
                                size_t size)
{
        struct filename *filename;
        int lookup_flags;

        if (at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        filename = getname_maybe_null(pathname, at_flags);
        if (filename) {
                lookup_flags = LOOKUP_FOLLOW;
                if (at_flags & AT_SYMLINK_NOFOLLOW)
                        lookup_flags = 0;
                return filename_listxattr(dfd, filename, lookup_flags,
                                          list, size);
        } else {
                CLASS(fd, f)(dfd);

                if (fd_empty(f))
                        return -EBADF;
                return file_listxattr(fd_file(f), list, size);
        }
}

/* listxattrat(2): forwards all arguments unchanged to path_listxattrat(). */
SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname,
                unsigned int, at_flags,
                char __user *, list, size_t, size)
{
        return path_listxattrat(dfd, pathname, at_flags, list, size);
}

/*
 * listxattr(2): passes AT_FDCWD as the directory fd and no AT_* flags, so
 * path_listxattrat() performs the lookup with LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
                size_t, size)
{
        return path_listxattrat(AT_FDCWD, pathname, 0, list, size);
}

/*
 * llistxattr(2): like listxattr(2) but passes AT_SYMLINK_NOFOLLOW, so
 * path_listxattrat() performs the lookup without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
                size_t, size)
{
        return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size);
}

/*
 * flistxattr(2): passes @fd with a NULL pathname and AT_EMPTY_PATH; when
 * path_listxattrat() gets no filename back for that, it lists the names of
 * the file referred to by @fd.
 */
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
        return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size);
}

/*
 * Extended attribute REMOVE operations
 */
/*
 * Remove the attribute @name from @d.  POSIX ACL attribute names are handed
 * to vfs_remove_acl(); every other name goes through vfs_removexattr().
 */
static long
removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name)
{
        if (!is_posix_acl_xattr(name))
                return vfs_removexattr(idmap, d, name);

        return vfs_remove_acl(idmap, d, name);
}

static int file_removexattr(struct file *f, struct xattr_name *kname)
{
        int error = mnt_want_write_file(f);

        if (!error) {
                audit_file(f);
                error = removexattr(file_mnt_idmap(f),
                                    f->f_path.dentry, kname->name);
                mnt_drop_write_file(f);
        }
        return error;
}

/* unconditionally consumes filename */
static int filename_removexattr(int dfd, struct filename *filename,
                                unsigned int lookup_flags, struct xattr_name *kname)
{
        struct path path;
        int error;

retry:
        error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
        if (error)
                goto out;
        error = mnt_want_write(path.mnt);
        if (!error) {
                error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name);
                mnt_drop_write(path.mnt);
        }
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out:
        putname(filename);
        return error;
}

/*
 * Common backend of the removexattr syscall family.
 *
 * Only AT_SYMLINK_NOFOLLOW and AT_EMPTY_PATH are accepted in @at_flags.
 * When getname_maybe_null() yields no filename the attribute is removed from
 * the file referred to by @dfd; otherwise the path is looked up, with
 * LOOKUP_FOLLOW unless AT_SYMLINK_NOFOLLOW was given.
 */
static int path_removexattrat(int dfd, const char __user *pathname,
                              unsigned int at_flags, const char __user *name)
{
        struct xattr_name kname;
        struct filename *filename;
        unsigned int lookup_flags;
        int error;

        if (at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))
                return -EINVAL;

        error = import_xattr_name(&kname, name);
        if (error)
                return error;

        filename = getname_maybe_null(pathname, at_flags);
        if (filename) {
                lookup_flags = LOOKUP_FOLLOW;
                if (at_flags & AT_SYMLINK_NOFOLLOW)
                        lookup_flags = 0;
                return filename_removexattr(dfd, filename, lookup_flags,
                                            &kname);
        } else {
                CLASS(fd, f)(dfd);

                if (fd_empty(f))
                        return -EBADF;
                return file_removexattr(fd_file(f), &kname);
        }
}

/* removexattrat(2): forwards all arguments unchanged to path_removexattrat(). */
SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname,
                unsigned int, at_flags, const char __user *, name)
{
        return path_removexattrat(dfd, pathname, at_flags, name);
}

/*
 * removexattr(2): passes AT_FDCWD as the directory fd and no AT_* flags, so
 * path_removexattrat() performs the lookup with LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
                const char __user *, name)
{
        return path_removexattrat(AT_FDCWD, pathname, 0, name);
}

/*
 * lremovexattr(2): like removexattr(2) but passes AT_SYMLINK_NOFOLLOW, so
 * path_removexattrat() performs the lookup without LOOKUP_FOLLOW.
 */
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
                const char __user *, name)
{
        return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name);
}

/*
 * fremovexattr(2): passes @fd with a NULL pathname and AT_EMPTY_PATH; when
 * path_removexattrat() gets no filename back for that, it operates on the
 * file referred to by @fd.
 */
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
        return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name);
}

/*
 * Account for one NUL-terminated xattr name in a listing.
 *
 * With a non-NULL *@buffer the name (terminator included) is copied there and
 * *@buffer is advanced past it; -ERANGE is returned, with nothing modified,
 * when *@remaining_size is too small.  With a NULL *@buffer nothing is copied
 * and only the size accounting is done.  In both successful cases
 * *@remaining_size is reduced by the length of the name plus terminator.
 */
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
{
        size_t len = strlen(name) + 1;
        char *dst = *buffer;

        if (dst) {
                if (*remaining_size < len)
                        return -ERANGE;
                memcpy(dst, name, len);
                *buffer = dst + len;
        }
        *remaining_size -= len;
        return 0;
}

/**
 * generic_listxattr - run through a dentry's xattr list() operations
 * @dentry: dentry to list the xattrs
 * @buffer: result buffer
 * @buffer_size: size of @buffer
 *
 * Walk the superblock's xattr_handler stack and emit the name of every
 * handler that has one, skipping handlers whose list() operation exists and
 * returns false for @dentry.
 *
 * Note that this will not include the entries for POSIX ACLs.
 *
 * Return: the number of bytes accounted for by the emitted names, or the
 * error returned by xattr_list_one().
 */
ssize_t
generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
        const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
        ssize_t remaining_size = buffer_size;

        for_each_xattr_handler(handlers, handler) {
                int err;

                if (!handler->name)
                        continue;
                if (handler->list && !handler->list(dentry))
                        continue;

                err = xattr_list_one(&buffer, &remaining_size, handler->name);
                if (err)
                        return err;
        }

        return buffer_size - remaining_size;
}
EXPORT_SYMBOL(generic_listxattr);

/**
 * xattr_full_name  -  Compute full attribute name from suffix
 *
 * @handler:        handler of the xattr_handler operation
 * @name:        name passed to the xattr_handler operation
 *
 * The get and set xattr handler operations are called with the remainder of
 * the attribute name after skipping the handler's prefix: for example, "foo"
 * is passed to the get operation of a handler with prefix "user." to get
 * attribute "user.foo".  The prefix still sits directly in front of @name in
 * memory, so the full name is recovered by stepping back over it.
 *
 * Note: the list xattr handler operation when called from the vfs is passed a
 * NULL name; some file systems use this operation internally, with varying
 * semantics.
 */
const char *xattr_full_name(const struct xattr_handler *handler,
                            const char *name)
{
        return name - strlen(xattr_prefix(handler));
}
EXPORT_SYMBOL(xattr_full_name);

/**
 * simple_xattr_space - estimate the memory used by a simple xattr
 * @name: the full name of the xattr
 * @size: the size of its value
 *
 * The estimate deliberately ignores how much larger the backing slab objects
 * really are: that depends on the slab implementation, whereas callers need
 * a deterministic number that grows with name length, value size and count.
 *
 * Return: The approximate number of bytes of memory used by such an xattr.
 */
size_t simple_xattr_space(const char *name, size_t size)
{
        /*
         * The fixed part is "40" rather than sizeof(struct simple_xattr) so
         * the result is the same on 32-bit and 64-bit, and stays the same
         * even if simple_xattr grows.
         */
        size_t name_len = strlen(name);

        return name_len + size + 40;
}

/**
 * simple_xattr_free - free an xattr object
 * @xattr: the xattr object
 *
 * Free the xattr object. Can handle @xattr being NULL.
 */
void simple_xattr_free(struct simple_xattr *xattr)
{
        if (xattr)
                kfree(xattr->name);
        kvfree(xattr);
}

/**
 * simple_xattr_alloc - allocate new xattr object
 * @value: value of the xattr object
 * @size: size of @value
 *
 * Allocate an xattr object large enough for @size bytes of value, record the
 * size and copy @value in. The name is not touched here; the caller is
 * responsible for handling the name of the xattr.
 *
 * Return: On success a new xattr object is returned. On failure (size
 * overflow or allocation failure) NULL is returned.
 */
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
{
        struct simple_xattr *xattr;
        size_t total = sizeof(*xattr) + size;

        /* Refuse sizes that wrap around when the header is added. */
        if (total < sizeof(*xattr))
                return NULL;

        xattr = kvmalloc(total, GFP_KERNEL_ACCOUNT);
        if (xattr) {
                xattr->size = size;
                memcpy(xattr->value, value, size);
        }
        return xattr;
}

/**
 * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry
 * @key: xattr name
 * @node: current node
 *
 * Compare the name of the xattr attached to @node with the name in @key;
 * the result is strcmp(name of @node, @key).
 *
 * Return: Negative value if continuing left, positive if continuing right, 0
 * if the xattr attached to @node matches @key.
 */
static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node)
{
        const struct simple_xattr *xattr =
                rb_entry(node, struct simple_xattr, rb_node);
        const char *wanted = key;

        return strcmp(xattr->name, wanted);
}

/**
 * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes
 * @new_node: new node
 * @node: current node
 *
 * Use the name of the xattr attached to @new_node as the key and compare it
 * against the xattr attached to @node.
 *
 * Return: Negative value if continuing left, positive if continuing right, 0
 * if the xattr attached to @new_node matches the xattr attached to @node.
 */
static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node,
                                        const struct rb_node *node)
{
        struct simple_xattr *new_xattr =
                rb_entry(new_node, struct simple_xattr, rb_node);

        return rbtree_simple_xattr_cmp(new_xattr->name, node);
}

/**
 * simple_xattr_get - get an xattr object
 * @xattrs: the header of the xattr object
 * @name: the name of the xattr to retrieve
 * @buffer: the buffer to store the value into
 * @size: the size of @buffer
 *
 * Look up the xattr called @name under the read lock. If @buffer is provided
 * the value is copied into it, otherwise only the length is reported. The
 * size of @buffer is limited to XATTR_SIZE_MAX which currently is 65536.
 *
 * Return: On success the length of the xattr value is returned. On error a
 * negative error code is returned: -ENODATA if no such xattr exists, -ERANGE
 * if @buffer is too small for the value.
 */
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
                     void *buffer, size_t size)
{
        struct simple_xattr *xattr = NULL;
        struct rb_node *rbp;
        int ret = -ENODATA;

        read_lock(&xattrs->lock);
        rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp);
        if (rbp) {
                xattr = rb_entry(rbp, struct simple_xattr, rb_node);
                if (buffer && size < xattr->size) {
                        ret = -ERANGE;
                } else {
                        ret = xattr->size;
                        if (buffer)
                                memcpy(buffer, xattr->value, xattr->size);
                }
        }
        read_unlock(&xattrs->lock);
        return ret;
}

/**
 * simple_xattr_set - set an xattr object
 * @xattrs: the header of the xattr object
 * @name: the name of the xattr to set
 * @value: the value to store along the xattr
 * @size: the size of @value
 * @flags: the flags determining how to set the xattr
 *
 * Set a new xattr object.
 * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE
 * is specified in @flags a matching xattr object for @name must already exist.
 * If it does it will be replaced with the new xattr object. If it doesn't we
 * fail. If XATTR_CREATE is specified and a matching xattr does already exist
 * we fail. If it doesn't we create a new xattr. If @flags is zero we simply
 * insert the new xattr replacing any existing one.
 *
 * If @value is empty and a matching xattr object is found we delete it if
 * XATTR_REPLACE is specified in @flags or @flags is zero.
 *
 * If @value is empty and no matching xattr object for @name is found we do
 * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For
 * XATTR_REPLACE we fail as mentioned above.
 *
 * Return: On success, the removed or replaced xattr is returned, to be freed
 * by the caller; or NULL if none. On failure an ERR_PTR() carrying a negative
 * error code is returned.
 */
struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
                                      const char *name, const void *value,
                                      size_t size, int flags)
{
        struct simple_xattr *old_xattr = NULL, *new_xattr = NULL;
        struct rb_node *parent = NULL, **rbp;
        int err = 0, ret;

        /* value == NULL means remove */
        if (value) {
                new_xattr = simple_xattr_alloc(value, size);
                if (!new_xattr)
                        return ERR_PTR(-ENOMEM);

                new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT);
                if (!new_xattr->name) {
                        simple_xattr_free(new_xattr);
                        return ERR_PTR(-ENOMEM);
                }
        }

        write_lock(&xattrs->lock);

        /*
         * Descend the tree; on a miss, rbp and parent describe the slot a
         * new node would be linked into.
         */
        rbp = &xattrs->rb_root.rb_node;
        while (*rbp) {
                parent = *rbp;
                ret = rbtree_simple_xattr_cmp(name, *rbp);
                if (!ret) {
                        old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node);
                        break;
                }
                rbp = ret < 0 ? &(*rbp)->rb_left : &(*rbp)->rb_right;
        }

        if (old_xattr && (flags & XATTR_CREATE)) {
                /* Fail if XATTR_CREATE is requested and the xattr exists. */
                err = -EEXIST;
        } else if (!old_xattr && (flags & XATTR_REPLACE)) {
                /* Fail if XATTR_REPLACE is requested but no xattr is found. */
                err = -ENODATA;
        } else if (old_xattr) {
                /* Existing entry: swap in the new value or drop the entry. */
                if (new_xattr)
                        rb_replace_node(&old_xattr->rb_node,
                                        &new_xattr->rb_node, &xattrs->rb_root);
                else
                        rb_erase(&old_xattr->rb_node, &xattrs->rb_root);
        } else if (new_xattr) {
                /*
                 * If XATTR_CREATE or no flags are specified together with a
                 * new value simply insert it.
                 */
                rb_link_node(&new_xattr->rb_node, parent, rbp);
                rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root);
        }
        /*
         * Remaining case: XATTR_CREATE or no flags, and neither an old nor
         * a new xattr exists, so there is nothing to do.
         */

        write_unlock(&xattrs->lock);

        if (err) {
                simple_xattr_free(new_xattr);
                return ERR_PTR(err);
        }
        return old_xattr;
}

/* True when @name starts with XATTR_TRUSTED_PREFIX. */
static bool xattr_is_trusted(const char *name)
{
        return strncmp(name, XATTR_TRUSTED_PREFIX,
                       XATTR_TRUSTED_PREFIX_LEN) == 0;
}

/**
 * simple_xattr_list - list all xattr objects
 * @inode: inode from which to get the xattrs
 * @xattrs: the header of the xattr object
 * @buffer: the buffer to store all xattrs into
 * @size: the size of @buffer
 *
 * List all xattrs associated with @inode. If @buffer is NULL the required
 * size of the buffer is returned. If @buffer is provided the xattr names are
 * stored into it, provided it is big enough. POSIX ACL entries are emitted
 * first via posix_acl_listxattr(); names with the "trusted." prefix are left
 * out unless the caller has CAP_SYS_ADMIN in the initial user namespace.
 *
 * Note, the number of xattr names that can be listed with listxattr(2) is
 * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed
 * then the listxattr(2) path caps it to XATTR_LIST_MAX and if more xattr
 * names are found it will return -E2BIG.
 *
 * Return: On success the required size or the size of the copied xattrs is
 * returned. On error a negative error code is returned.
 */
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
                          char *buffer, size_t size)
{
        bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
        ssize_t remaining_size = size;
        struct rb_node *rbp;
        int err;

        err = posix_acl_listxattr(inode, &buffer, &remaining_size);
        if (err)
                return err;

        read_lock(&xattrs->lock);
        rbp = rb_first(&xattrs->rb_root);
        while (rbp) {
                struct simple_xattr *xattr;

                xattr = rb_entry(rbp, struct simple_xattr, rb_node);
                rbp = rb_next(rbp);

                /* skip "trusted." attributes for unprivileged callers */
                if (!trusted && xattr_is_trusted(xattr->name))
                        continue;

                err = xattr_list_one(&buffer, &remaining_size, xattr->name);
                if (err)
                        break;
        }
        read_unlock(&xattrs->lock);

        if (err)
                return err;
        return size - remaining_size;
}

/**
 * rbtree_simple_xattr_less - compare two xattr rbtree nodes
 * @new_node: new node
 * @node: current node
 *
 * Ordering predicate built on rbtree_simple_xattr_node_cmp(): true when that
 * comparison of @new_node against @node is negative.
 * Note that this function technically tolerates duplicate entries.
 *
 * Return: True if insertion point in the rbtree is found.
 */
static bool rbtree_simple_xattr_less(struct rb_node *new_node,
                                     const struct rb_node *node)
{
        int cmp = rbtree_simple_xattr_node_cmp(new_node, node);

        return cmp < 0;
}

/**
 * simple_xattr_add - add xattr objects
 * @xattrs: the header of the xattr object
 * @new_xattr: the xattr object to add
 *
 * Insert @new_xattr into the tree of @xattrs under the write lock. No lookup
 * for an existing entry is done, so this assumes no replacement or removal
 * of matching xattrs is wanted. Should only be called during inode
 * initialization when a few distinct initial xattrs are supposed to be set.
 */
void simple_xattr_add(struct simple_xattrs *xattrs,
                      struct simple_xattr *new_xattr)
{
        struct rb_root *root = &xattrs->rb_root;

        write_lock(&xattrs->lock);
        rb_add(&new_xattr->rb_node, root, rbtree_simple_xattr_less);
        write_unlock(&xattrs->lock);
}

/**
 * simple_xattrs_init - initialize new xattr header
 * @xattrs: header to initialize
 *
 * Set up an empty rbtree root and initialize the lock of an xattr header.
 */
void simple_xattrs_init(struct simple_xattrs *xattrs)
{
        rwlock_init(&xattrs->lock);
        xattrs->rb_root = RB_ROOT;
}

/**
 * simple_xattrs_free - free xattrs
 * @xattrs: xattr header whose xattrs to destroy
 * @freed_space: approximate number of bytes of memory freed from @xattrs
 *
 * Destroy all xattrs in @xattrs. When this is called no one can hold a
 * reference to any of the xattrs anymore. If @freed_space is not NULL it is
 * reset to zero and then accumulates simple_xattr_space() of every xattr
 * that is freed.
 */
void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space)
{
        struct rb_node *rbp, *next;

        if (freed_space)
                *freed_space = 0;

        for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = next) {
                struct simple_xattr *xattr =
                        rb_entry(rbp, struct simple_xattr, rb_node);

                /* Fetch the successor before the node is unlinked. */
                next = rb_next(rbp);
                rb_erase(&xattr->rb_node, &xattrs->rb_root);
                if (freed_space)
                        *freed_space += simple_xattr_space(xattr->name,
                                                           xattr->size);
                simple_xattr_free(xattr);
        }
}




































































































































































































































































































































































































































































































































































































































































   25 

















  247 



















































































































































































































































   25 






















  247 


  307 



























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk).
 *
 * (C) SGI 2006, Christoph Lameter
 *         Cleaned up and restructured to ease the addition of alternative
 *         implementations of SLAB allocators.
 * (C) Linux Foundation 2008-2013
 *      Unified interface for all slab allocators
 */

#ifndef _LINUX_SLAB_H
#define        _LINUX_SLAB_H

#include <linux/cache.h>
#include <linux/gfp.h>
#include <linux/overflow.h>
#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/percpu-refcount.h>
#include <linux/cleanup.h>
#include <linux/hash.h>

/*
 * Bit numbers from which the SLAB_* flag macros are built via
 * __SLAB_FLAG_BIT(). Enumerators wrapped in #ifdef/#ifndef only exist, and
 * only consume a bit number, when their config condition holds, so the value
 * of every later enumerator depends on the kernel configuration.
 */
enum _slab_flag_bits {
        _SLAB_CONSISTENCY_CHECKS,
        _SLAB_RED_ZONE,
        _SLAB_POISON,
        _SLAB_KMALLOC,
        _SLAB_HWCACHE_ALIGN,
        _SLAB_CACHE_DMA,
        _SLAB_CACHE_DMA32,
        _SLAB_STORE_USER,
        _SLAB_PANIC,
        _SLAB_TYPESAFE_BY_RCU,
        _SLAB_TRACE,
#ifdef CONFIG_DEBUG_OBJECTS
        _SLAB_DEBUG_OBJECTS,
#endif
        _SLAB_NOLEAKTRACE,
        _SLAB_NO_MERGE,
#ifdef CONFIG_FAILSLAB
        _SLAB_FAILSLAB,
#endif
#ifdef CONFIG_MEMCG
        _SLAB_ACCOUNT,
#endif
#ifdef CONFIG_KASAN_GENERIC
        _SLAB_KASAN,
#endif
        _SLAB_NO_USER_FLAGS,
#ifdef CONFIG_KFENCE
        _SLAB_SKIP_KFENCE,
#endif
#ifndef CONFIG_SLUB_TINY
        _SLAB_RECLAIM_ACCOUNT,
#endif
        _SLAB_OBJECT_POISON,
        _SLAB_CMPXCHG_DOUBLE,
#ifdef CONFIG_SLAB_OBJ_EXT
        _SLAB_NO_OBJ_EXT,
#endif
        /* Terminator: equals the number of bit numbers assigned above. */
        _SLAB_FLAGS_LAST_BIT
};

#define __SLAB_FLAG_BIT(nr)        ((slab_flags_t __force)(1U << (nr)))
#define __SLAB_FLAG_UNUSED        ((slab_flags_t __force)(0U))

/*
 * Flags to pass to kmem_cache_create().
 * The ones marked DEBUG need CONFIG_SLUB_DEBUG enabled, otherwise are no-op
 */
/* DEBUG: Perform (expensive) checks on alloc/free */
#define SLAB_CONSISTENCY_CHECKS        __SLAB_FLAG_BIT(_SLAB_CONSISTENCY_CHECKS)
/* DEBUG: Red zone objs in a cache */
#define SLAB_RED_ZONE                __SLAB_FLAG_BIT(_SLAB_RED_ZONE)
/* DEBUG: Poison objects */
#define SLAB_POISON                __SLAB_FLAG_BIT(_SLAB_POISON)
/* Indicate a kmalloc slab */
#define SLAB_KMALLOC                __SLAB_FLAG_BIT(_SLAB_KMALLOC)
/**
 * define SLAB_HWCACHE_ALIGN - Align objects on cache line boundaries.
 *
 * Sufficiently large objects are aligned on cache line boundary. For object
 * size smaller than a half of cache line size, the alignment is on the half of
 * cache line size. In general, if object size is smaller than 1/2^n of cache
 * line size, the alignment is adjusted to 1/2^n.
 *
 * If explicit alignment is also requested by the respective
 * &struct kmem_cache_args field, the greater of both alignments is applied.
 */
#define SLAB_HWCACHE_ALIGN        __SLAB_FLAG_BIT(_SLAB_HWCACHE_ALIGN)
/* Use GFP_DMA memory */
#define SLAB_CACHE_DMA                __SLAB_FLAG_BIT(_SLAB_CACHE_DMA)
/* Use GFP_DMA32 memory */
#define SLAB_CACHE_DMA32        __SLAB_FLAG_BIT(_SLAB_CACHE_DMA32)
/* DEBUG: Store the last owner for bug hunting */
#define SLAB_STORE_USER                __SLAB_FLAG_BIT(_SLAB_STORE_USER)
/* Panic if kmem_cache_create() fails */
#define SLAB_PANIC                __SLAB_FLAG_BIT(_SLAB_PANIC)
/**
 * define SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS!
 *
 * This delays freeing the SLAB page by a grace period, it does _NOT_
 * delay object freeing. This means that if you do kmem_cache_free()
 * that memory location is free to be reused at any time. Thus it may
 * be possible to see another object there in the same RCU grace period.
 *
 * This feature only ensures the memory location backing the object
 * stays valid, the trick to using this is relying on an independent
 * object validation pass. Something like:
 *
 * ::
 *
 *  begin:
 *   rcu_read_lock();
 *   obj = lockless_lookup(key);
 *   if (obj) {
 *     if (!try_get_ref(obj)) { // might fail for free objects
 *       rcu_read_unlock();
 *       goto begin;
 *     }
 *
 *     if (obj->key != key) { // not the object we expected
 *       put_ref(obj);
 *       rcu_read_unlock();
 *       goto begin;
 *     }
 *   }
 *  rcu_read_unlock();
 *
 * This is useful if we need to approach a kernel structure obliquely,
 * from its address obtained without the usual locking. We can lock
 * the structure to stabilize it and check it's still at the given address,
 * only if we can be sure that the memory has not been meanwhile reused
 * for some other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * Note that object identity check has to be done *after* acquiring a
 * reference, therefore user has to ensure proper ordering for loads.
 * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU,
 * the newly allocated object has to be fully initialized *before* its
 * refcount gets initialized and proper ordering for stores is required.
 * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are
 * designed with the proper fences required for reference counting objects
 * allocated with SLAB_TYPESAFE_BY_RCU.
 *
 * Note that it is not possible to acquire a lock within a structure
 * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference
 * as described above.  The reason is that SLAB_TYPESAFE_BY_RCU pages
 * are not zeroed before being given to the slab, which means that any
 * locks must be initialized after each and every kmem_cache_alloc().
 * Alternatively, make the ctor passed to kmem_cache_create() initialize
 * the locks at page-allocation time, as is done in __i915_request_ctor(),
 * sighand_ctor(), and anon_vma_ctor().  Such a ctor permits readers
 * to safely acquire those ctor-initialized locks under rcu_read_lock()
 * protection.
 *
 * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU.
 */
#define SLAB_TYPESAFE_BY_RCU        __SLAB_FLAG_BIT(_SLAB_TYPESAFE_BY_RCU)
/* Trace allocations and frees */
#define SLAB_TRACE                __SLAB_FLAG_BIT(_SLAB_TRACE)

/* Flag to prevent checks on free */
#ifdef CONFIG_DEBUG_OBJECTS
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_BIT(_SLAB_DEBUG_OBJECTS)
#else
# define SLAB_DEBUG_OBJECTS        __SLAB_FLAG_UNUSED
#endif

/* Avoid kmemleak tracing */
#define SLAB_NOLEAKTRACE        __SLAB_FLAG_BIT(_SLAB_NOLEAKTRACE)

/*
 * Prevent merging with compatible kmem caches. This flag should be used
 * cautiously. Valid use cases:
 *
 * - caches created for self-tests (e.g. kunit)
 * - general caches created and used by a subsystem, only when a
 *   (subsystem-specific) debug option is enabled
 * - performance critical caches, should be very rare and consulted with slab
 *   maintainers, and not used together with CONFIG_SLUB_TINY
 */
#define SLAB_NO_MERGE                __SLAB_FLAG_BIT(_SLAB_NO_MERGE)

/* Fault injection mark */
#ifdef CONFIG_FAILSLAB
# define SLAB_FAILSLAB                __SLAB_FLAG_BIT(_SLAB_FAILSLAB)
#else
# define SLAB_FAILSLAB                __SLAB_FLAG_UNUSED
#endif
/**
 * define SLAB_ACCOUNT - Account allocations to memcg.
 *
 * All object allocations from this cache will be memcg accounted, regardless of
 * __GFP_ACCOUNT being or not being passed to individual allocations.
 */
#ifdef CONFIG_MEMCG
# define SLAB_ACCOUNT                __SLAB_FLAG_BIT(_SLAB_ACCOUNT)
#else
# define SLAB_ACCOUNT                __SLAB_FLAG_UNUSED
#endif

#ifdef CONFIG_KASAN_GENERIC
#define SLAB_KASAN                __SLAB_FLAG_BIT(_SLAB_KASAN)
#else
#define SLAB_KASAN                __SLAB_FLAG_UNUSED
#endif

/*
 * Ignore user specified debugging flags.
 * Intended for caches created for self-tests so they have only flags
 * specified in the code and other flags are ignored.
 */
#define SLAB_NO_USER_FLAGS        __SLAB_FLAG_BIT(_SLAB_NO_USER_FLAGS)

#ifdef CONFIG_KFENCE
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_BIT(_SLAB_SKIP_KFENCE)
#else
#define SLAB_SKIP_KFENCE        __SLAB_FLAG_UNUSED
#endif

/* The following flags affect the page allocator grouping pages by mobility */
/**
 * define SLAB_RECLAIM_ACCOUNT - Objects are reclaimable.
 *
 * Use this flag for caches that have an associated shrinker. As a result, slab
 * pages are allocated with __GFP_RECLAIMABLE, which affects grouping pages by
 * mobility, and are accounted in SReclaimable counter in /proc/meminfo
 */
#ifndef CONFIG_SLUB_TINY
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_BIT(_SLAB_RECLAIM_ACCOUNT)
#else
#define SLAB_RECLAIM_ACCOUNT        __SLAB_FLAG_UNUSED
#endif
#define SLAB_TEMPORARY                SLAB_RECLAIM_ACCOUNT        /* Objects are short-lived */

/* Slab created using create_boot_cache */
#ifdef CONFIG_SLAB_OBJ_EXT
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
#else
#define SLAB_NO_OBJ_EXT                __SLAB_FLAG_UNUSED
#endif

/*
 * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
 *
 * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault.
 *
 * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can.
 * Both make kfree a no-op.
 */
#define ZERO_SIZE_PTR ((void *)16)

#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
                                (unsigned long)ZERO_SIZE_PTR)

#include <linux/kasan.h>

struct list_lru;
struct mem_cgroup;
/*
 * struct kmem_cache related prototypes
 */
bool slab_is_available(void);

/**
 * struct kmem_cache_args - Less common arguments for kmem_cache_create()
 *
 * Any uninitialized fields of the structure are interpreted as unused. The
 * exception is @freeptr_offset where %0 is a valid value, so
 * @use_freeptr_offset must be also set to %true in order to interpret the field
 * as used. For @useroffset %0 is also valid, but only with non-%0
 * @usersize.
 *
 * When %NULL args is passed to kmem_cache_create(), it is equivalent to all
 * fields unused.
 */
struct kmem_cache_args {
        /**
         * @align: The required alignment for the objects.
         *
         * %0 means no specific alignment is requested.
         */
        unsigned int align;
        /**
         * @useroffset: Usercopy region offset.
         *
         * %0 is a valid offset, when @usersize is non-%0
         */
        unsigned int useroffset;
        /**
         * @usersize: Usercopy region size.
         *
         * %0 means no usercopy region is specified.
         */
        unsigned int usersize;
        /**
         * @freeptr_offset: Custom offset for the free pointer
         * in &SLAB_TYPESAFE_BY_RCU caches
         *
         * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
         * outside of the object. This might cause the object to grow in size.
         * Cache creators that have a reason to avoid this can specify a custom
         * free pointer offset in their struct where the free pointer will be
         * placed.
         *
         * Note that placing the free pointer inside the object requires the
         * caller to ensure that no fields are invalidated that are required to
         * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
         * details).
         *
         * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
         * is specified, @use_freeptr_offset must be set to %true.
         *
         * Note that @ctor currently isn't supported with custom free pointers
         * as a @ctor requires an external free pointer.
         */
        unsigned int freeptr_offset;
        /**
         * @use_freeptr_offset: Whether a @freeptr_offset is used.
         */
        bool use_freeptr_offset;
        /**
         * @ctor: A constructor for the objects.
         *
         * The constructor is invoked for each object in a newly allocated slab
         * page. It is the cache user's responsibility to free objects in the
         * same state as after calling the constructor, or deal appropriately
         * with any differences between a freshly constructed and a reallocated
         * object.
         *
         * %NULL means no constructor.
         */
        void (*ctor)(void *);
};

/*
 * Cache creation entry point, defined outside this header. The inline
 * wrappers and the KMEM_CACHE*() macros in this header all forward to it
 * with a populated (or all-zero) struct kmem_cache_args.
 */
struct kmem_cache *__kmem_cache_create_args(const char *name,
                                            unsigned int object_size,
                                            struct kmem_cache_args *args,
                                            slab_flags_t flags);
/*
 * Legacy five-argument form of cache creation: stores @align and @ctor in a
 * struct kmem_cache_args whose remaining fields stay zero (which means
 * "unused") and forwards to __kmem_cache_create_args().
 */
static inline struct kmem_cache *
__kmem_cache_create(const char *name, unsigned int size, unsigned int align,
                    slab_flags_t flags, void (*ctor)(void *))
{
        struct kmem_cache_args legacy_args = {};

        legacy_args.align = align;
        legacy_args.ctor = ctor;

        return __kmem_cache_create_args(name, size, &legacy_args, flags);
}

/**
 * kmem_cache_create_usercopy - Create a kmem cache with a region suitable
 * for copying to userspace.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @useroffset: Offset of the usercopy region within the object
 * @usersize: Size of the usercopy region
 * @ctor: A constructor for the objects, or %NULL.
 *
 * Legacy wrapper that fills a &struct kmem_cache_args from its parameters
 * and hands it to __kmem_cache_create_args(). New code should use either
 * KMEM_CACHE_USERCOPY() if whitelisting a single field is sufficient, or
 * kmem_cache_create() with the necessary parameters passed via the args
 * parameter (see &struct kmem_cache_args)
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
static inline struct kmem_cache *
kmem_cache_create_usercopy(const char *name, unsigned int size,
                           unsigned int align, slab_flags_t flags,
                           unsigned int useroffset, unsigned int usersize,
                           void (*ctor)(void *))
{
        struct kmem_cache_args usercopy_args = {};

        usercopy_args.align = align;
        usercopy_args.useroffset = useroffset;
        usercopy_args.usersize = usersize;
        usercopy_args.ctor = ctor;

        return __kmem_cache_create_args(name, size, &usercopy_args, flags);
}

/*
 * If NULL is passed for @args, use this variant with default arguments.
 *
 * kmem_cache_create() selects this function when @args has type void *,
 * and creates the cache from an all-zero struct kmem_cache_args.
 *
 * A non-NULL @args arriving here is a caller bug, so warn once and fail.
 * Failure is reported as NULL, which is the documented failure value of
 * kmem_cache_create() and the other creation wrappers; callers that check
 * for NULL must not end up dereferencing an error pointer.
 */
static inline struct kmem_cache *
__kmem_cache_default_args(const char *name, unsigned int size,
                          struct kmem_cache_args *args,
                          slab_flags_t flags)
{
        struct kmem_cache_args kmem_default_args = {};

        /* Make sure we don't get passed garbage. */
        if (WARN_ON_ONCE(args))
                return NULL;

        return __kmem_cache_create_args(name, size, &kmem_default_args, flags);
}

/**
 * kmem_cache_create - Create a kmem cache.
 * @__name: A string which is used in /proc/slabinfo to identify this cache.
 * @__object_size: The size of objects to be created in this cache.
 * @__args: Optional arguments, see &struct kmem_cache_args. Passing %NULL
 *            means defaults will be used for all the arguments.
 *
 * This is currently implemented as a macro using ``_Generic()`` to call
 * either the new variant of the function, or a legacy one.
 *
 * The new variant has 4 parameters:
 * ``kmem_cache_create(name, object_size, args, flags)``
 *
 * See __kmem_cache_create_args() which implements this.
 *
 * The legacy variant has 5 parameters:
 * ``kmem_cache_create(name, object_size, align, flags, ctor)``
 *
 * The align and ctor parameters map to the respective fields of
 * &struct kmem_cache_args
 *
 * Context: Cannot be called within an interrupt, but can be interrupted.
 *
 * Return: a pointer to the cache on success, NULL on failure.
 */
#define kmem_cache_create(__name, __object_size, __args, ...)           \
        _Generic((__args),                                              \
                struct kmem_cache_args *: __kmem_cache_create_args,        \
                void *: __kmem_cache_default_args,                        \
                default: __kmem_cache_create)(__name, __object_size, __args, __VA_ARGS__)

void kmem_cache_destroy(struct kmem_cache *s);
int kmem_cache_shrink(struct kmem_cache *s);

/*
 * Please use this macro to create slab caches. Simply specify the
 * name of the structure and maybe some flags that are listed above.
 *
 * The alignment of the struct determines object alignment. If you
 * f.e. add ____cacheline_aligned_in_smp to the struct declaration
 * then the objects will be properly aligned in SMP configurations.
 */
#define KMEM_CACHE(__struct, __flags)                                   \
        __kmem_cache_create_args(#__struct, sizeof(struct __struct),    \
                        &(struct kmem_cache_args) {                        \
                                .align        = __alignof__(struct __struct), \
                        }, (__flags))

/*
 * To whitelist a single field for copying to/from userspace, use this
 * macro instead of KMEM_CACHE() above.
 */
#define KMEM_CACHE_USERCOPY(__struct, __flags, __field)                                                \
        __kmem_cache_create_args(#__struct, sizeof(struct __struct),                                \
                        &(struct kmem_cache_args) {                                                \
                                .align                = __alignof__(struct __struct),                        \
                                .useroffset        = offsetof(struct __struct, __field),                \
                                .usersize        = sizeof_field(struct __struct, __field),        \
                        }, (__flags))

/*
 * Common kmalloc functions provided by all allocators
 */
void * __must_check krealloc_noprof(const void *objp, size_t new_size,
                                    gfp_t flags) __realloc_size(2);
#define krealloc(...)                                alloc_hooks(krealloc_noprof(__VA_ARGS__))

void kfree(const void *objp);
void kfree_sensitive(const void *objp);
size_t __ksize(const void *objp);

DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))

/**
 * ksize - Report actual allocation size of associated object
 *
 * @objp: Pointer returned from a prior kmalloc()-family allocation.
 *
 * This should not be used for writing beyond the originally requested
 * allocation size. Either use krealloc() or round up the allocation size
 * with kmalloc_size_roundup() prior to allocation. If this is used to
 * access beyond the originally requested allocation size, UBSAN_BOUNDS
 * and/or FORTIFY_SOURCE may trip, since they only know about the
 * originally allocated size via the __alloc_size attribute.
 */
size_t ksize(const void *objp);

#ifdef CONFIG_PRINTK
bool kmem_dump_obj(void *object);
#else
static inline bool kmem_dump_obj(void *object) { return false; }
#endif

/*
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * Setting ARCH_DMA_MINALIGN in arch headers allows that.
 */
#ifdef ARCH_HAS_DMA_MINALIGN
#if ARCH_DMA_MINALIGN > 8 && !defined(ARCH_KMALLOC_MINALIGN)
#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN
#endif
#endif

#ifndef ARCH_KMALLOC_MINALIGN
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#elif ARCH_KMALLOC_MINALIGN > 8
#define KMALLOC_MIN_SIZE ARCH_KMALLOC_MINALIGN
#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE)
#endif

/*
 * Architectures may override ARCH_SLAB_MINALIGN in their headers to request
 * a different alignment. This is meant for arches that take misalignment
 * faults even on buffers aligned for a 64 bit integer.
 */
#if !defined(ARCH_SLAB_MINALIGN)
#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
#endif

/*
 * An architecture that needs to pick the minimum slab alignment at runtime
 * can supply its own arch_slab_minalign(). Whatever it returns must be a
 * power of two and must not be smaller than ARCH_SLAB_MINALIGN. The default
 * below simply reports the compile-time constant.
 */
#if !defined(arch_slab_minalign)
static inline unsigned int arch_slab_minalign(void)
{
        const unsigned int min_align = ARCH_SLAB_MINALIGN;

        return min_align;
}
#endif

/*
 * kmem_cache_alloc and friends return pointers aligned to ARCH_SLAB_MINALIGN.
 * kmalloc and friends return pointers aligned to both ARCH_KMALLOC_MINALIGN
 * and ARCH_SLAB_MINALIGN, but here we only assume the former alignment.
 */
#define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN)
#define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN)
#define __assume_page_alignment __assume_aligned(PAGE_SIZE)

/*
 * Kmalloc array related definitions
 */

/*
 * SLUB directly allocates requests fitting in to an order-1 page
 * (PAGE_SIZE*2).  Larger requests are passed to the page allocator.
 */
#define KMALLOC_SHIFT_HIGH        (PAGE_SHIFT + 1)
#define KMALLOC_SHIFT_MAX        (MAX_PAGE_ORDER + PAGE_SHIFT)
#ifndef KMALLOC_SHIFT_LOW
#define KMALLOC_SHIFT_LOW        3
#endif

/* Maximum allocatable size */
#define KMALLOC_MAX_SIZE        (1UL << KMALLOC_SHIFT_MAX)
/* Maximum size for which we actually use a slab cache */
#define KMALLOC_MAX_CACHE_SIZE        (1UL << KMALLOC_SHIFT_HIGH)
/* Maximum order allocatable via the slab allocator */
#define KMALLOC_MAX_ORDER        (KMALLOC_SHIFT_MAX - PAGE_SHIFT)

/*
 * Kmalloc subsystem.
 */
#ifndef KMALLOC_MIN_SIZE
#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
#endif

/*
 * This restriction comes from byte sized index implementation.
 * Page size is normally 2^12 bytes and, in this case, if we want to use
 * byte sized index which can represent 2^8 entries, the size of the object
 * should be equal or greater to 2^12 / 2^8 = 2^4 = 16.
 * If minimum size of kmalloc is less than 16, we use it as minimum object
 * size and give up to use byte sized index.
 */
#define SLAB_OBJ_MIN_SIZE      (KMALLOC_MIN_SIZE < 16 ? \
                               (KMALLOC_MIN_SIZE) : 16)

#ifdef CONFIG_RANDOM_KMALLOC_CACHES
#define RANDOM_KMALLOC_CACHES_NR        15 // # of cache copies
#else
#define RANDOM_KMALLOC_CACHES_NR        0
#endif

/*
 * Whenever changing this, take care that kmalloc_type() and
 * create_kmalloc_caches() still work as intended.
 *
 * KMALLOC_NORMAL can contain only unaccounted objects whereas KMALLOC_CGROUP
 * is for accounted but unreclaimable and non-dma objects. All the other
 * kmem caches can have both accounted and unaccounted objects.
 *
 * A type whose config option is disabled (or, for KMALLOC_RECLAIM, when
 * CONFIG_SLUB_TINY is enabled) is defined as an alias of KMALLOC_NORMAL
 * instead of getting a value of its own.
 */
enum kmalloc_cache_type {
        KMALLOC_NORMAL = 0,
#ifndef CONFIG_ZONE_DMA
        KMALLOC_DMA = KMALLOC_NORMAL,
#endif
#ifndef CONFIG_MEMCG
        KMALLOC_CGROUP = KMALLOC_NORMAL,
#endif
        /*
         * The random range starts at KMALLOC_NORMAL itself and spans
         * RANDOM_KMALLOC_CACHES_NR further values (none when that is 0).
         */
        KMALLOC_RANDOM_START = KMALLOC_NORMAL,
        KMALLOC_RANDOM_END = KMALLOC_RANDOM_START + RANDOM_KMALLOC_CACHES_NR,
#ifdef CONFIG_SLUB_TINY
        KMALLOC_RECLAIM = KMALLOC_NORMAL,
#else
        KMALLOC_RECLAIM,
#endif
#ifdef CONFIG_ZONE_DMA
        KMALLOC_DMA,
#endif
#ifdef CONFIG_MEMCG
        KMALLOC_CGROUP,
#endif
        /* One past the highest type value; dimensions kmalloc_caches[]. */
        NR_KMALLOC_TYPES
};

typedef struct kmem_cache * kmem_buckets[KMALLOC_SHIFT_HIGH + 1];

extern kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES];

/*
 * Define gfp bits that should not be set for KMALLOC_NORMAL.
 */
#define KMALLOC_NOT_NORMAL_BITS                                        \
        (__GFP_RECLAIMABLE |                                        \
        (IS_ENABLED(CONFIG_ZONE_DMA)   ? __GFP_DMA : 0) |        \
        (IS_ENABLED(CONFIG_MEMCG) ? __GFP_ACCOUNT : 0))

extern unsigned long random_kmalloc_seed;

/*
 * Pick the kmalloc cache type for an allocation from its gfp @flags and,
 * with CONFIG_RANDOM_KMALLOC_CACHES, from the @caller address.
 */
static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags, unsigned long caller)
{
        /*
         * KMALLOC_NORMAL is the most common outcome, so every flag that
         * would select another type is checked with one single branch.
         */
        if (likely(!(flags & KMALLOC_NOT_NORMAL_BITS))) {
#ifdef CONFIG_RANDOM_KMALLOC_CACHES
                /* RANDOM_KMALLOC_CACHES_NR (=15) copies + the KMALLOC_NORMAL */
                return KMALLOC_RANDOM_START + hash_64(caller ^ random_kmalloc_seed,
                                                      ilog2(RANDOM_KMALLOC_CACHES_NR + 1));
#else
                return KMALLOC_NORMAL;
#endif
        }

        /*
         * From here on at least one of the flags is set. They are honoured
         * in this order of decreasing priority:
         *  1) __GFP_DMA
         *  2) __GFP_RECLAIMABLE
         *  3) __GFP_ACCOUNT
         */
        if (IS_ENABLED(CONFIG_ZONE_DMA) && (flags & __GFP_DMA))
                return KMALLOC_DMA;

        return (!IS_ENABLED(CONFIG_MEMCG) || (flags & __GFP_RECLAIMABLE)) ?
                KMALLOC_RECLAIM : KMALLOC_CGROUP;
}

/*
 * Figure out which kmalloc slab an allocation of a certain size
 * belongs to.
 * 0 = zero alloc
 * 1 =  65 .. 96 bytes
 * 2 = 129 .. 192 bytes
 * n = 2^(n-1)+1 .. 2^n
 *
 * Note: __kmalloc_index() is compile-time optimized, and not runtime optimized;
 * typical usage is via kmalloc_index() and therefore evaluated at compile-time.
 * Callers where !size_is_constant should only be test modules, where runtime
 * overheads of __kmalloc_index() can be tolerated.  Also see kmalloc_slab().
 */
static __always_inline unsigned int __kmalloc_index(size_t size,
                                                    bool size_is_constant)
{
        /* Index 0 is reserved for zero-sized requests. */
        if (!size)
                return 0;

        /* Everything up to the minimum object size shares the smallest cache. */
        if (size <= KMALLOC_MIN_SIZE)
                return KMALLOC_SHIFT_LOW;

        /*
         * Indices 1 and 2 are the two non-power-of-two buckets (96 and 192
         * bytes); they are only used when KMALLOC_MIN_SIZE is small enough.
         */
        if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96)
                return 1;
        if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192)
                return 2;
        /* Otherwise the index is log2 of @size rounded up to a power of two. */
        if (size <=          8) return 3;
        if (size <=         16) return 4;
        if (size <=         32) return 5;
        if (size <=         64) return 6;
        if (size <=        128) return 7;
        if (size <=        256) return 8;
        if (size <=        512) return 9;
        if (size <=       1024) return 10;
        if (size <=   2 * 1024) return 11;
        if (size <=   4 * 1024) return 12;
        if (size <=   8 * 1024) return 13;
        if (size <=  16 * 1024) return 14;
        if (size <=  32 * 1024) return 15;
        if (size <=  64 * 1024) return 16;
        if (size <= 128 * 1024) return 17;
        if (size <= 256 * 1024) return 18;
        if (size <= 512 * 1024) return 19;
        if (size <= 1024 * 1024) return 20;
        if (size <=  2 * 1024 * 1024) return 21;

        /*
         * @size is larger than 2 MiB. For a constant size this is a build
         * failure (unless CONFIG_PROFILE_ALL_BRANCHES is enabled); for a
         * runtime size it is a BUG().
         */
        if (!IS_ENABLED(CONFIG_PROFILE_ALL_BRANCHES) && size_is_constant)
                BUILD_BUG_ON_MSG(1, "unexpected size in kmalloc_index()");
        else
                BUG();

        /* Will never be reached. Needed because the compiler may complain */
        return -1;
}
/* Build-time check that PAGE_SHIFT does not exceed 20. */
static_assert(PAGE_SHIFT <= 20);
/* Compile-time form: tells __kmalloc_index() that the size is a constant. */
#define kmalloc_index(s) __kmalloc_index(s, true)

#include <linux/alloc_tag.h>

/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.
 * See kmem_cache_zalloc() for a shortcut of adding __GFP_ZERO to flags.
 *
 * Return: pointer to the new object or %NULL in case of error
 */
void *kmem_cache_alloc_noprof(struct kmem_cache *cachep,
                              gfp_t flags) __assume_slab_alignment __malloc;
/* Callers use this macro, which routes the _noprof call through alloc_hooks(). */
#define kmem_cache_alloc(...)                        alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__))

/*
 * Variant of kmem_cache_alloc_noprof() that additionally takes a list_lru.
 * NOTE(review): presumably @lru is the list the object will later be placed
 * on - confirm against the implementation.
 */
void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
                            gfp_t gfpflags) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_lru(...)        alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))

/**
 * kmem_cache_charge - memcg charge an already allocated slab memory
 * @objp: address of the slab object to memcg charge
 * @gfpflags: describe the allocation context
 *
 * kmem_cache_charge allows charging a slab object to the current memcg,
 * primarily in cases where charging at allocation time might not be possible
 * because the target memcg is not known (i.e. softirq context)
 *
 * The objp should be pointer returned by the slab allocator functions like
 * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge
 * behavior can be controlled through gfpflags parameter, which affects how the
 * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes
 * that overcharging is requested instead of failure, but is not applied for the
 * internal metadata allocation.
 *
 * There are several cases where it will return true even if the charging was
 * not done. More specifically:
 *
 * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems.
 * 2. Already charged slab objects.
 * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc()
 *    without __GFP_ACCOUNT
 * 4. Allocating internal metadata has failed
 *
 * Return: true if charge was successful otherwise false.
 */
bool kmem_cache_charge(void *objp, gfp_t gfpflags);
/*
 * NOTE(review): presumably releases @objp back to cache @s; the
 * implementation is not part of this header.
 */
void kmem_cache_free(struct kmem_cache *s, void *objp);

/*
 * Create a kmem_buckets set. The arguments follow the usual cache-creation
 * parameters (name, slab flags, usercopy window, constructor).
 * NOTE(review): return value on failure is not visible here - confirm.
 */
kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,
                                  unsigned int useroffset, unsigned int usersize,
                                  void (*ctor)(void *));

/*
 * Bulk allocation and freeing operations. These are accelerated in an
 * allocator specific way to avoid taking locks repeatedly or building
 * metadata structures unnecessarily.
 *
 * Note that interrupts must be enabled when calling these functions.
 */
/*
 * NOTE(review): @size looks like the number of pointers in @p rather than a
 * byte count - confirm against the implementation.
 */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p);

/* Bulk allocation counterpart; callers use the alloc_hooks() wrapper below. */
int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p);
#define kmem_cache_alloc_bulk(...)        alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__))

/*
 * kfree_bulk - bulk free without naming a cache
 * @size: passed through unchanged to kmem_cache_free_bulk()
 * @p: array of object pointers to free
 *
 * Forwards to kmem_cache_free_bulk() with a NULL cache pointer.
 */
static __always_inline void kfree_bulk(size_t size, void **p)
{
        kmem_cache_free_bulk(NULL, size, p);
}

/*
 * Like kmem_cache_alloc_noprof() but with an explicit @node argument.
 * NOTE(review): presumably @node is the preferred NUMA node - confirm.
 */
void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags,
                                   int node) __assume_slab_alignment __malloc;
#define kmem_cache_alloc_node(...)        alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__))

/*
 * These macros allow declaring a kmem_buckets * parameter alongside size, which
 * can be compiled out with CONFIG_SLAB_BUCKETS=n so that a large number of call
 * sites don't have to pass NULL.
 */
#ifdef CONFIG_SLAB_BUCKETS
/* Buckets enabled: the bucket pointer is a real parameter next to the size. */
#define DECL_BUCKET_PARAMS(_size, _b)        size_t (_size), kmem_buckets *(_b)
#define PASS_BUCKET_PARAMS(_size, _b)        (_size), (_b)
#define PASS_BUCKET_PARAM(_b)                (_b)
#else
/* Buckets disabled: only the size is declared/passed; the bucket becomes NULL. */
#define DECL_BUCKET_PARAMS(_size, _b)        size_t (_size)
#define PASS_BUCKET_PARAMS(_size, _b)        (_size)
#define PASS_BUCKET_PARAM(_b)                NULL
#endif

/*
 * The following functions are not to be used directly and are intended only
 * for internal use from kmalloc() and kmalloc_node()
 * with the exception of kunit tests
 */

/* Generic path used by kmalloc_noprof() when @size is not a non-zero constant. */
void *__kmalloc_noprof(size_t size, gfp_t flags)
                                __assume_kmalloc_alignment __alloc_size(1);

/* Node-aware generic path; the bucket parameter exists only with CONFIG_SLAB_BUCKETS. */
void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
                                __assume_kmalloc_alignment __alloc_size(1);

/* Allocate from an already selected cache @s; @size is the third argument. */
void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t flags, size_t size)
                                __assume_kmalloc_alignment __alloc_size(3);

/* Node-aware variant of __kmalloc_cache_noprof(); @size is the fourth argument. */
void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags,
                                  int node, size_t size)
                                __assume_kmalloc_alignment __alloc_size(4);

/* Used by kmalloc_noprof() for constant sizes above KMALLOC_MAX_CACHE_SIZE. */
void *__kmalloc_large_noprof(size_t size, gfp_t flags)
                                __assume_page_alignment __alloc_size(1);

/* Used by kmalloc_node_noprof() for constant sizes above KMALLOC_MAX_CACHE_SIZE. */
void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
                                __assume_page_alignment __alloc_size(1);

/**
 * kmalloc - allocate kernel memory
 * @size: how many bytes of memory are required.
 * @flags: describe the allocation context
 *
 * kmalloc is the normal method of allocating memory
 * for objects smaller than page size in the kernel.
 *
 * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN
 * bytes. For @size of power of two bytes, the alignment is also guaranteed
 * to be at least to the size. For other sizes, the alignment is guaranteed to
 * be at least the largest power-of-two divisor of @size.
 *
 * The @flags argument may be one of the GFP flags defined at
 * include/linux/gfp_types.h and described at
 * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>`
 *
 * The recommended usage of the @flags is described at
 * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>`
 *
 * Below is a brief outline of the most useful GFP flags
 *
 * %GFP_KERNEL
 *        Allocate normal kernel ram. May sleep.
 *
 * %GFP_NOWAIT
 *        Allocation will not sleep.
 *
 * %GFP_ATOMIC
 *        Allocation will not sleep.  May use emergency pools.
 *
 * Also it is possible to set different flags by OR'ing
 * in one or more of the following additional @flags:
 *
 * %__GFP_ZERO
 *        Zero the allocated memory before returning. Also see kzalloc().
 *
 * %__GFP_HIGH
 *        This allocation has high priority and may use emergency pools.
 *
 * %__GFP_NOFAIL
 *        Indicate that this allocation is in no way allowed to fail
 *        (think twice before using).
 *
 * %__GFP_NORETRY
 *        If memory is not immediately available,
 *        then give up at once.
 *
 * %__GFP_NOWARN
 *        If allocation fails, don't issue any warnings.
 *
 * %__GFP_RETRY_MAYFAIL
 *        Try really hard to succeed the allocation but fail
 *        eventually.
 */
static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags)
{
        unsigned int slot;

        /* Runtime-sized and zero-sized requests go through the generic path. */
        if (!__builtin_constant_p(size) || !size)
                return __kmalloc_noprof(size, flags);

        /* Constant sizes too big for any kmalloc cache use the large path. */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return __kmalloc_large_noprof(size, flags);

        /* Constant size: the cache slot is resolved at compile time. */
        slot = kmalloc_index(size);
        return __kmalloc_cache_noprof(
                        kmalloc_caches[kmalloc_type(flags, _RET_IP_)][slot],
                        flags, size);
}
#define kmalloc(...)                                alloc_hooks(kmalloc_noprof(__VA_ARGS__))

/* Allocate @_size bytes from bucket set @_b on any node (NUMA_NO_NODE). */
#define kmem_buckets_alloc(_b, _size, _flags)        \
        alloc_hooks(__kmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))

/* Same, but passes _RET_IP_ as the caller address to the track_caller variant. */
#define kmem_buckets_alloc_track_caller(_b, _size, _flags)        \
        alloc_hooks(__kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE, _RET_IP_))

/*
 * Node-aware counterpart of kmalloc_noprof(): same size-based dispatch, with
 * @node forwarded to whichever backend ends up serving the request.
 */
static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node)
{
        unsigned int slot;

        /* Runtime-sized and zero-sized requests go through the generic path. */
        if (!__builtin_constant_p(size) || !size)
                return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node);

        /* Constant sizes too big for any kmalloc cache use the large path. */
        if (size > KMALLOC_MAX_CACHE_SIZE)
                return __kmalloc_large_node_noprof(size, flags, node);

        /* Constant size: the cache slot is resolved at compile time. */
        slot = kmalloc_index(size);
        return __kmalloc_cache_node_noprof(
                        kmalloc_caches[kmalloc_type(flags, _RET_IP_)][slot],
                        flags, node, size);
}
#define kmalloc_node(...)                        alloc_hooks(kmalloc_node_noprof(__VA_ARGS__))

/**
 * kmalloc_array - allocate memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags)
{
        size_t total;

        /* Refuse the request when n * size does not fit in a size_t. */
        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        return kmalloc_noprof(total, flags);
}
#define kmalloc_array(...)                        alloc_hooks(kmalloc_array_noprof(__VA_ARGS__))

/**
 * krealloc_array - reallocate memory for an array.
 * @p: pointer to the memory chunk to reallocate
 * @new_n: new number of elements to alloc
 * @new_size: new size of a single member of the array
 * @flags: the type of memory to allocate (see kmalloc)
 *
 * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
 * initial memory allocation, every subsequent call to this API for the same
 * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
 * __GFP_ZERO is not fully honored by this API.
 *
 * See krealloc_noprof() for further details.
 *
 * In any case, the contents of the object pointed to are preserved up to the
 * lesser of the new and old sizes.
 */
static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p,
                                                                       size_t new_n,
                                                                       size_t new_size,
                                                                       gfp_t flags)
{
        size_t total;

        /* Refuse the request when new_n * new_size does not fit in a size_t. */
        if (unlikely(check_mul_overflow(new_n, new_size, &total)))
                return NULL;

        return krealloc_noprof(p, total, flags);
}
#define krealloc_array(...)                        alloc_hooks(krealloc_array_noprof(__VA_ARGS__))

/**
 * kcalloc - allocate memory for an array. The memory is set to zero.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate (see kmalloc).
 */
/* kmalloc_array() with __GFP_ZERO OR'ed into the flags. */
#define kcalloc(n, size, flags)                kmalloc_array(n, size, (flags) | __GFP_ZERO)

/*
 * Node-aware allocation that takes the address to attribute the allocation
 * to as an explicit @caller argument.
 */
void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node,
                                         unsigned long caller) __alloc_size(1);
/* Bucket-less form: always passes a NULL bucket. */
#define kmalloc_node_track_caller_noprof(size, flags, node, caller) \
        __kmalloc_node_track_caller_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node, caller)
/* Public form: appends _RET_IP_ as the caller address. */
#define kmalloc_node_track_caller(...)                \
        alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_))

/*
 * kmalloc_track_caller is a special version of kmalloc that records the
 * calling function of the routine calling it for slab leak tracking instead
 * of just the calling function (confusing, eh?).
 * It's useful when the call to kmalloc comes from a widely-used standard
 * allocator where we care about the real place the memory allocation
 * request comes from.
 */
/* Expands to kmalloc_node_track_caller() with NUMA_NO_NODE as the node. */
#define kmalloc_track_caller(...)                kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE)

/* _noprof form: NUMA_NO_NODE as the node and _RET_IP_ as the caller address. */
#define kmalloc_track_caller_noprof(...)        \
                kmalloc_node_track_caller_noprof(__VA_ARGS__, NUMA_NO_NODE, _RET_IP_)

/*
 * kmalloc_array_node - allocate memory for an array with an explicit node.
 * Returns NULL when n * size overflows.
 */
static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags,
                                                          int node)
{
        size_t total;

        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        /* Without compile-time constants there is nothing to gain from the inline path. */
        if (!__builtin_constant_p(n) || !__builtin_constant_p(size))
                return __kmalloc_node_noprof(PASS_BUCKET_PARAMS(total, NULL), flags, node);

        return kmalloc_node_noprof(total, flags, node);
}
#define kmalloc_array_node(...)                        alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__))

/* kmalloc_array_node() with __GFP_ZERO OR'ed into the flags. */
#define kcalloc_node(_n, _size, _flags, _node)        \
        kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node)

/*
 * Shortcuts
 */
/* kmem_cache_alloc() with __GFP_ZERO OR'ed into the flags. */
#define kmem_cache_zalloc(_k, _flags)                kmem_cache_alloc(_k, (_flags)|__GFP_ZERO)

/**
 * kzalloc - allocate memory. The memory is set to zero.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate (see kmalloc).
 */
static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags)
{
        /* Same request as kmalloc(), with zeroing added to the caller's flags. */
        const gfp_t zeroing_flags = flags | __GFP_ZERO;

        return kmalloc_noprof(size, zeroing_flags);
}
#define kzalloc(...)                                alloc_hooks(kzalloc_noprof(__VA_ARGS__))
#define kzalloc_node(_size, _flags, _node)        kmalloc_node(_size, (_flags)|__GFP_ZERO, _node)

/*
 * Backend of the kvmalloc() family; the bucket parameter exists only with
 * CONFIG_SLAB_BUCKETS.
 */
void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) __alloc_size(1);
/* Bucket-less form: always passes a NULL bucket. */
#define kvmalloc_node_noprof(size, flags, node)        \
        __kvmalloc_node_noprof(PASS_BUCKET_PARAMS(size, NULL), flags, node)
#define kvmalloc_node(...)                        alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__))

/* Node-agnostic forms: NUMA_NO_NODE is supplied as the node. */
#define kvmalloc(_size, _flags)                        kvmalloc_node(_size, _flags, NUMA_NO_NODE)
#define kvmalloc_noprof(_size, _flags)                kvmalloc_node_noprof(_size, _flags, NUMA_NO_NODE)
/* Zeroing forms: __GFP_ZERO is OR'ed into the flags. */
#define kvzalloc(_size, _flags)                        kvmalloc(_size, (_flags)|__GFP_ZERO)

#define kvzalloc_node(_size, _flags, _node)        kvmalloc_node(_size, (_flags)|__GFP_ZERO, _node)
/* kvmalloc from bucket set @_b on any node (NUMA_NO_NODE). */
#define kmem_buckets_valloc(_b, _size, _flags)        \
        alloc_hooks(__kvmalloc_node_noprof(PASS_BUCKET_PARAMS(_size, _b), _flags, NUMA_NO_NODE))

/*
 * Array form of kvmalloc_node_noprof(): allocates n * size bytes, or returns
 * NULL when that product overflows.
 */
static inline __alloc_size(1, 2) void *
kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node)
{
        size_t total;

        if (unlikely(check_mul_overflow(n, size, &total)))
                return NULL;

        return kvmalloc_node_noprof(total, flags, node);
}

/* _noprof building blocks: NUMA_NO_NODE and/or __GFP_ZERO are filled in here. */
#define kvmalloc_array_noprof(...)                kvmalloc_array_node_noprof(__VA_ARGS__, NUMA_NO_NODE)
#define kvcalloc_node_noprof(_n,_s,_f,_node)        kvmalloc_array_node_noprof(_n,_s,(_f)|__GFP_ZERO,_node)
#define kvcalloc_noprof(...)                        kvcalloc_node_noprof(__VA_ARGS__, NUMA_NO_NODE)

/* Public forms: the _noprof variants wrapped in alloc_hooks(). */
#define kvmalloc_array(...)                        alloc_hooks(kvmalloc_array_noprof(__VA_ARGS__))
#define kvcalloc_node(...)                        alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__))
#define kvcalloc(...)                                alloc_hooks(kvcalloc_noprof(__VA_ARGS__))

/* Resize counterpart of kvmalloc(); @size (argument 2) is the new size. */
void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
                __realloc_size(2);
#define kvrealloc(...)                                alloc_hooks(kvrealloc_noprof(__VA_ARGS__))

extern void kvfree(const void *addr);
/* Scope-based cleanup: calls kvfree() unless the pointer is NULL or an error pointer. */
DEFINE_FREE(kvfree, void *, if (!IS_ERR_OR_NULL(_T)) kvfree(_T))

/*
 * NOTE(review): presumably clears @len bytes before freeing - confirm against
 * the implementation.
 */
extern void kvfree_sensitive(const void *addr, size_t len);

/* NOTE(review): presumably reports the object size of cache @s - confirm. */
unsigned int kmem_cache_size(struct kmem_cache *s);

#ifndef CONFIG_KVFREE_RCU_BATCHED
/* Without batching, the barrier is simply an rcu_barrier(). */
static inline void kvfree_rcu_barrier(void)
{
        rcu_barrier();
}

/* Nothing to do when batching is disabled. */
static inline void kfree_rcu_scheduler_running(void) { }
#else
/* With batching enabled, both are implemented out of line. */
void kvfree_rcu_barrier(void);

void kfree_rcu_scheduler_running(void);
#endif

/**
 * kmalloc_size_roundup - Report allocation bucket size for the given size
 *
 * @size: Number of bytes to round up from.
 *
 * This returns the number of bytes that would be available in a kmalloc()
 * allocation of @size bytes. For example, a 126 byte request would be
 * rounded up to the next sized kmalloc bucket, 128 bytes. (This is strictly
 * for the general-purpose kmalloc()-based allocations, and is not for the
 * pre-sized kmem_cache_alloc()-based allocations.)
 *
 * Use this to kmalloc() the full bucket size ahead of time instead of using
 * ksize() to query the size after an allocation.
 */
size_t kmalloc_size_roundup(size_t size);

/* Init-time hooks (marked __init); implemented outside this header. */
void __init kmem_cache_init_late(void);
void __init kvfree_rcu_init(void);

#endif        /* _LINUX_SLAB_H */































    2 




    3 





    1 


    2 






    8 





    3 


    1 

    1 









    1 



    2 


    5 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2020 Arm Ltd.

#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>

#include <asm/kvm_emulate.h>

#include <kvm/arm_hypercalls.h>

/* Value returned for ARM_SMCCC_TRNG_VERSION. */
#define ARM_SMCCC_TRNG_VERSION_1_0        0x10000UL

/* Those values are deliberately separate from the generic SMCCC definitions. */
#define TRNG_SUCCESS                        0UL
#define TRNG_NOT_SUPPORTED                ((unsigned long)-1)
#define TRNG_INVALID_PARAMETER                ((unsigned long)-2)
#define TRNG_NO_ENTROPY                        ((unsigned long)-3)

/*
 * Size in bits of the scratch bitmap in kvm_trng_do_rnd(); matches the
 * 3 * 64 limit applied to 64-bit requests.
 */
#define TRNG_MAX_BITS64                        192

/*
 * UUID reported by ARM_SMCCC_TRNG_GET_UUID. It is read back as four 32-bit
 * little-endian words in kvm_trng_call(), hence the 4-byte alignment.
 */
static const uuid_t arm_smc_trng_uuid __aligned(4) = UUID_INIT(
        0x0d21e000, 0x4384, 0x11eb, 0x80, 0x70, 0x52, 0x44, 0x55, 0x4e, 0x5a, 0x4c);

/*
 * kvm_trng_do_rnd - service a TRNG_RND32/TRNG_RND64 request from the guest
 * @vcpu: vCPU that issued the call; argument 1 holds the number of bits wanted
 * @size: register width of the call, 32 or 64
 *
 * Up to 3 * @size bits can be returned, spread over the three result
 * registers that follow the status. A request for zero bits or for more than
 * that maximum is answered with TRNG_INVALID_PARAMETER, as the Arm TRNG
 * firmware interface requires.
 *
 * Return: always 1.
 */
static int kvm_trng_do_rnd(struct kvm_vcpu *vcpu, int size)
{
        DECLARE_BITMAP(bits, TRNG_MAX_BITS64);
        u32 num_bits = smccc_get_arg1(vcpu);
        int i;

        /* N == 0 and N > MAX_BITS are both invalid requests. */
        if (!num_bits || num_bits > 3 * size) {
                smccc_set_retval(vcpu, TRNG_INVALID_PARAMETER, 0, 0, 0);
                return 1;
        }

        /* get as many bits as we need to fulfil the request */
        for (i = 0; i < DIV_ROUND_UP(num_bits, BITS_PER_LONG); i++)
                bits[i] = get_random_long();

        /* Zero everything above the requested number of bits. */
        bitmap_clear(bits, num_bits, TRNG_MAX_BITS64 - num_bits);

        /* The least significant bits go into the last result register. */
        if (size == 32)
                smccc_set_retval(vcpu, TRNG_SUCCESS, lower_32_bits(bits[1]),
                                 upper_32_bits(bits[0]), lower_32_bits(bits[0]));
        else
                smccc_set_retval(vcpu, TRNG_SUCCESS, bits[2], bits[1], bits[0]);

        /* Do not leave the random data behind on the stack. */
        memzero_explicit(bits, sizeof(bits));
        return 1;
}

/*
 * kvm_trng_call - dispatch a guest TRNG SMCCC call
 * @vcpu: vCPU that issued the call
 *
 * Unknown functions, and FEATURES queries about unknown functions, are
 * answered with TRNG_NOT_SUPPORTED.
 *
 * Return: always 1, either directly or via kvm_trng_do_rnd().
 */
int kvm_trng_call(struct kvm_vcpu *vcpu)
{
        const __le32 *uuid_words = (__le32 *)arm_smc_trng_uuid.b;
        unsigned long status = TRNG_NOT_SUPPORTED;

        switch (smccc_get_function(vcpu)) {
        case ARM_SMCCC_TRNG_RND32:
                return kvm_trng_do_rnd(vcpu, 32);
        case ARM_SMCCC_TRNG_RND64:
                return kvm_trng_do_rnd(vcpu, 64);
        case ARM_SMCCC_TRNG_GET_UUID:
                /* The UUID fills all four result registers; no status word. */
                smccc_set_retval(vcpu, le32_to_cpu(uuid_words[0]),
                                 le32_to_cpu(uuid_words[1]),
                                 le32_to_cpu(uuid_words[2]),
                                 le32_to_cpu(uuid_words[3]));
                return 1;
        case ARM_SMCCC_TRNG_VERSION:
                status = ARM_SMCCC_TRNG_VERSION_1_0;
                break;
        case ARM_SMCCC_TRNG_FEATURES:
                /* Report success for exactly the functions handled here. */
                switch (smccc_get_arg1(vcpu)) {
                case ARM_SMCCC_TRNG_VERSION:
                case ARM_SMCCC_TRNG_FEATURES:
                case ARM_SMCCC_TRNG_GET_UUID:
                case ARM_SMCCC_TRNG_RND32:
                case ARM_SMCCC_TRNG_RND64:
                        status = TRNG_SUCCESS;
                        break;
                default:
                        break;
                }
                break;
        default:
                break;
        }

        smccc_set_retval(vcpu, status, 0, 0, 0);
        return 1;
}



































































































































































































  651 







  572 






























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
/* SPDX-License-Identifier: GPL-2.0 */
/* rwsem.h: R/W semaphores, public interface
 *
 * Written by David Howells (dhowells@redhat.com).
 * Derived from asm-i386/semaphore.h
 */

#ifndef _LINUX_RWSEM_H
#define _LINUX_RWSEM_H

#include <linux/linkage.h>

#include <linux/types.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/cleanup.h>

/*
 * Static initializer fragment for the lockdep map of an rwsem; expands to
 * nothing when CONFIG_DEBUG_LOCK_ALLOC is disabled.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define __RWSEM_DEP_MAP_INIT(lockname)                        \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_SLEEP,        \
        },
#else
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

#ifndef CONFIG_PREEMPT_RT

#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#include <linux/osq_lock.h>
#endif

/*
 * For an uncontended rwsem, count and owner are the only fields a task
 * needs to touch when acquiring the rwsem. So they are put next to each
 * other to increase the chance that they will share the same cacheline.
 *
 * In a contended rwsem, the owner is likely the most frequently accessed
 * field in the structure as the optimistic waiter that holds the osq lock
 * will spin on owner. For an embedded rwsem, other hot fields in the
 * containing structure should be moved further away from the rwsem to
 * reduce the chance that they will share the same cacheline causing
 * cacheline bouncing problem.
 */
struct rw_semaphore {
        /* RWSEM_UNLOCKED_VALUE when free; bit 0 (RWSEM_WRITER_LOCKED) marks a writer. */
        atomic_long_t count;
        /*
         * Write owner or one of the read owners as well flags regarding
         * the current state of the rwsem. Can be used as a speculative
         * check to see if the write owner is running on the cpu.
         */
        atomic_long_t owner;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
        struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
        /* NOTE(review): presumably protects wait_list - confirm in the implementation. */
        raw_spinlock_t wait_lock;
        /* Non-empty while the rwsem is contended, see rwsem_is_contended(). */
        struct list_head wait_list;
#ifdef CONFIG_DEBUG_RWSEMS
        /* Statically initialized to the address of the rwsem itself. */
        void *magic;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
};

/* Value of ->count when nobody holds the rwsem. */
#define RWSEM_UNLOCKED_VALUE                0UL
/* Bit of ->count that is set while a writer holds the rwsem. */
#define RWSEM_WRITER_LOCKED                (1UL << 0)
/* Static initializer fragment: start out unlocked. */
#define __RWSEM_COUNT_INIT(name)        .count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)

static inline int rwsem_is_locked(struct rw_semaphore *sem)
{
        return atomic_long_read(&sem->count) != RWSEM_UNLOCKED_VALUE;
}

/* Warn if @sem is not held at all (count still equals RWSEM_UNLOCKED_VALUE). */
static inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(atomic_long_read(&sem->count) == RWSEM_UNLOCKED_VALUE);
}

/* Warn if the RWSEM_WRITER_LOCKED bit is not set in the count of @sem. */
static inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(!(atomic_long_read(&sem->count) & RWSEM_WRITER_LOCKED));
}

/* Common initializer macros and functions */

/* ->magic points back at the lock itself; only present with CONFIG_DEBUG_RWSEMS. */
#ifdef CONFIG_DEBUG_RWSEMS
# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
#else
# define __RWSEM_DEBUG_INIT(lockname)
#endif

/* ->osq only exists with CONFIG_RWSEM_SPIN_ON_OWNER. */
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
#else
#define __RWSEM_OPT_INIT(lockname)
#endif

/* Static initializer: unlocked count, no owner, empty wait list. */
#define __RWSEM_INITIALIZER(name)                                \
        { __RWSEM_COUNT_INIT(name),                                \
          .owner = ATOMIC_LONG_INIT(0),                                \
          __RWSEM_OPT_INIT(name)                                \
          .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
          .wait_list = LIST_HEAD_INIT((name).wait_list),        \
          __RWSEM_DEBUG_INIT(name)                                \
          __RWSEM_DEP_MAP_INIT(name) }

/* Define a statically initialized rwsem called @name. */
#define DECLARE_RWSEM(name) \
        struct rw_semaphore name = __RWSEM_INITIALIZER(name)

/* Runtime initializer; takes the lock name and the lock class key to use. */
extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
                         struct lock_class_key *key);

/*
 * Initialize @sem at runtime. Each expansion site gets its own static
 * lock_class_key, and the stringified argument is used as the lock name.
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/*
 * This is the same regardless of which rwsem implementation that is being used.
 * It is just a heuristic meant to be called by somebody already holding the
 * rwsem to see if somebody from an incompatible type is wanting access to the
 * lock.
 */
static inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        /* Contended means the wait list is not empty. */
        return !list_empty(&sem->wait_list);
}

#else /* !CONFIG_PREEMPT_RT */

#include <linux/rwbase_rt.h>

/* PREEMPT_RT flavour: the rwsem is a thin wrapper around struct rwbase_rt. */
struct rw_semaphore {
        struct rwbase_rt        rwbase;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
#endif
};

/* Static initializer for the PREEMPT_RT flavour. */
#define __RWSEM_INITIALIZER(name)                                \
        {                                                        \
                .rwbase = __RWBASE_INITIALIZER(name),                \
                __RWSEM_DEP_MAP_INIT(name)                        \
        }

/* Define a statically initialized rwsem called @lockname. */
#define DECLARE_RWSEM(lockname) \
        struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)

/* Runtime initializer; takes the lock name and the lock class key to use. */
extern void  __init_rwsem(struct rw_semaphore *rwsem, const char *name,
                          struct lock_class_key *key);

/*
 * Initialize @sem at runtime. Each expansion site gets its own static
 * lock_class_key, and the stringified argument is used as the lock name.
 */
#define init_rwsem(sem)                                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __init_rwsem((sem), #sem, &__key);                        \
} while (0)

/* PREEMPT_RT: the lock state queries are delegated to the rwbase helpers. */
static __always_inline int rwsem_is_locked(const struct rw_semaphore *sem)
{
        return rw_base_is_locked(&sem->rwbase);
}

/* Warn if @sem is not held at all. */
static __always_inline void rwsem_assert_held_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(!rwsem_is_locked(sem));
}

/* Warn if @sem is not write-locked. */
static __always_inline void rwsem_assert_held_write_nolockdep(const struct rw_semaphore *sem)
{
        WARN_ON(!rw_base_is_write_locked(&sem->rwbase));
}

/* Contention check, answered by rw_base_is_contended(). */
static __always_inline int rwsem_is_contended(struct rw_semaphore *sem)
{
        return rw_base_is_contended(&sem->rwbase);
}

#endif /* CONFIG_PREEMPT_RT */

/*
 * The functions below are the same for all rwsem implementations including
 * the RT specific variant.
 */

/*
 * Assert that @sem is held: through lockdep when CONFIG_LOCKDEP is enabled,
 * otherwise through the implementation's own state check.
 */
static inline void rwsem_assert_held(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_nolockdep(sem);
                return;
        }

        lockdep_assert_held(sem);
}

/*
 * Assert that @sem is held for writing: through lockdep when CONFIG_LOCKDEP
 * is enabled, otherwise through the implementation's own state check.
 */
static inline void rwsem_assert_held_write(const struct rw_semaphore *sem)
{
        if (!IS_ENABLED(CONFIG_LOCKDEP)) {
                rwsem_assert_held_write_nolockdep(sem);
                return;
        }

        lockdep_assert_held_write(sem);
}

/*
 * lock for reading
 */
/* The __must_check variants can fail; 0 is treated as success by the guards. */
extern void down_read(struct rw_semaphore *sem);
extern int __must_check down_read_interruptible(struct rw_semaphore *sem);
extern int __must_check down_read_killable(struct rw_semaphore *sem);

/*
 * trylock for reading -- returns 1 if successful, 0 if contention
 */
extern int down_read_trylock(struct rw_semaphore *sem);

/*
 * lock for writing
 */
extern void down_write(struct rw_semaphore *sem);
/* Can fail, so the result must be checked. */
extern int __must_check down_write_killable(struct rw_semaphore *sem);

/*
 * trylock for writing -- returns 1 if successful, 0 if contention
 */
extern int down_write_trylock(struct rw_semaphore *sem);

/*
 * release a read lock
 */
extern void up_read(struct rw_semaphore *sem);

/*
 * release a write lock
 */
extern void up_write(struct rw_semaphore *sem);

/*
 * Scope-based guards: the read guard pairs down_read() with up_read(). The
 * conditional variants are taken only if the trylock succeeds (_try) or if
 * down_read_interruptible() returns 0 (_intr).
 */
DEFINE_GUARD(rwsem_read, struct rw_semaphore *, down_read(_T), up_read(_T))
DEFINE_GUARD_COND(rwsem_read, _try, down_read_trylock(_T))
DEFINE_GUARD_COND(rwsem_read, _intr, down_read_interruptible(_T) == 0)

/* Write guard pairs down_write() with up_write(); _try uses down_write_trylock(). */
DEFINE_GUARD(rwsem_write, struct rw_semaphore *, down_write(_T), up_write(_T))
DEFINE_GUARD_COND(rwsem_write, _try, down_write_trylock(_T))

/*
 * downgrade write lock to read lock
 */
extern void downgrade_write(struct rw_semaphore *sem);

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * nested locking. NOTE: rwsems are not allowed to recurse
 * (which occurs if the same task tries to acquire the same
 * lock instance multiple times), but multiple locks of the
 * same lock class might be taken, if the order of the locks
 * is always the same. This ordering rule can be expressed
 * to lockdep via the _nested() APIs, but enumerating the
 * subclasses that are used. (If the nesting relationship is
 * static then another method for expressing nested locking is
 * the explicit definition of lock class keys and the use of
 * lockdep_set_class() at lock initialization time.
 * See Documentation/locking/lockdep-design.rst for more details.)
 */
extern void down_read_nested(struct rw_semaphore *sem, int subclass);
extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass);
extern void down_write_nested(struct rw_semaphore *sem, int subclass);
extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass);
extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock);

/*
 * Take @sem for writing while naming @nest_lock as the outer lock. The
 * typecheck() makes sure @nest_lock has a dep_map of type struct lockdep_map.
 */
# define down_write_nest_lock(sem, nest_lock)                        \
do {                                                                \
        typecheck(struct lockdep_map *, &(nest_lock)->dep_map);        \
        _down_write_nest_lock(sem, &(nest_lock)->dep_map);        \
} while (0)

/*
 * Take/release a lock when not the owner will release it.
 *
 * [ This API should be avoided as much as possible - the
 *   proper abstraction for this case is completions. ]
 */
extern void down_read_non_owner(struct rw_semaphore *sem);
extern void up_read_non_owner(struct rw_semaphore *sem);
#else
/* Without CONFIG_DEBUG_LOCK_ALLOC these all map to the plain lock operations. */
# define down_read_nested(sem, subclass)                down_read(sem)
# define down_read_killable_nested(sem, subclass)        down_read_killable(sem)
# define down_write_nest_lock(sem, nest_lock)        down_write(sem)
# define down_write_nested(sem, subclass)        down_write(sem)
# define down_write_killable_nested(sem, subclass)        down_write_killable(sem)
# define down_read_non_owner(sem)                down_read(sem)
# define up_read_non_owner(sem)                        up_read(sem)
#endif

#endif /* _LINUX_RWSEM_H */




























































































  155 






















  154 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Derived from arch/ppc/mm/extable.c and arch/i386/mm/extable.c.
 *
 * Copyright (C) 2004 Paul Mackerras, IBM Corp.
 */

#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sort.h>
#include <linux/uaccess.h>
#include <linux/extable.h>

/*
 * ex_to_insn() yields the instruction address recorded in an exception
 * table entry.  With absolute tables that is the ->insn field itself;
 * with relative tables ->insn is an offset from the address of the
 * field.
 */
#ifndef ARCH_HAS_RELATIVE_EXTABLE
#define ex_to_insn(x)        ((x)->insn)
#else
static inline unsigned long ex_to_insn(const struct exception_table_entry *x)
{
        unsigned long field_addr = (unsigned long)&x->insn;

        /* ->insn is relative to its own location in the table */
        return field_addr + x->insn;
}
#endif

/*
 * swap_ex is the swap callback handed to sort() by sort_extable().
 * With absolute tables no custom swap is supplied (NULL).  With relative
 * tables each field stores an offset from its own address, so moving an
 * entry by @delta bytes requires adjusting the stored offsets by the same
 * amount in the opposite direction.
 */
#ifndef ARCH_HAS_RELATIVE_EXTABLE
#define swap_ex                NULL
#else
static void swap_ex(void *a, void *b, int size)
{
        struct exception_table_entry *x = a, *y = b, tmp;
        /* byte distance between the two slots */
        int delta = b - a;

        /* keep a copy of *x: its fields are overwritten before being reused */
        tmp = *x;
        x->insn = y->insn + delta;
        y->insn = tmp.insn - delta;

        /* an architecture may supply its own fixup-swapping hook */
#ifdef swap_ex_entry_fixup
        swap_ex_entry_fixup(x, y, tmp, delta);
#else
        x->fixup = y->fixup + delta;
        y->fixup = tmp.fixup - delta;
#endif
}
#endif /* ARCH_HAS_RELATIVE_EXTABLE */

/*
 * Ordering callback for sort_extable(): orders two exception table
 * entries by the instruction address they describe.  The binary search
 * in search_extable() relies on the table being sorted this way, both
 * for the kernel's own table and for the tables of loaded modules.
 *
 * Returns -1, 0 or 1.
 */
static int cmp_ex_sort(const void *a, const void *b)
{
        const struct exception_table_entry *lhs = a;
        const struct exception_table_entry *rhs = b;

        /* compare explicitly; subtracting the addresses could overflow */
        if (ex_to_insn(lhs) < ex_to_insn(rhs))
                return -1;
        return ex_to_insn(lhs) > ex_to_insn(rhs);
}

void sort_extable(struct exception_table_entry *start,
                  struct exception_table_entry *finish)
{
        sort(start, finish - start, sizeof(struct exception_table_entry),
             cmp_ex_sort, swap_ex);
}

#ifdef CONFIG_MODULES
/*
 * If the exception table is sorted, any entries referring to the module
 * init section will be at the beginning or the end.  Drop them from both
 * ends by advancing m->extable and shrinking m->num_exentries.
 */
void trim_init_extable(struct module *m)
{
        /* strip leading entries that point into module init */
        for (; m->num_exentries; m->extable++, m->num_exentries--) {
                if (!within_module_init(ex_to_insn(m->extable), m))
                        break;
        }

        /* strip trailing entries that point into module init */
        while (m->num_exentries) {
                if (!within_module_init(
                            ex_to_insn(m->extable + m->num_exentries - 1), m))
                        break;
                m->num_exentries--;
        }
}
#endif /* CONFIG_MODULES */

/*
 * bsearch() callback: @key points to the instruction address being looked
 * up, @elt to a candidate exception table entry.  Returns -1, 0 or 1.
 */
static int cmp_ex_search(const void *key, const void *elt)
{
        const struct exception_table_entry *entry = elt;
        const unsigned long addr = *(const unsigned long *)key;

        /* compare explicitly; subtracting the addresses could overflow */
        if (addr < ex_to_insn(entry))
                return -1;
        return addr > ex_to_insn(entry);
}

/*
 * Search one exception table for an entry corresponding to the
 * given instruction address, and return the address of the entry,
 * or NULL if none is found.
 * We use a binary search, and thus we assume that the table is
 * already sorted.
 */
const struct exception_table_entry *
search_extable(const struct exception_table_entry *base,
               const size_t num,
               unsigned long value)
{
        return bsearch(&value, base, num,
                       sizeof(struct exception_table_entry), cmp_ex_search);
}
























   25 
































  269 









































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM skb

#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SKB_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/tracepoint.h>

/*
 * First pass: FN(reason) expands to
 * TRACE_DEFINE_ENUM(SKB_DROP_REASON_<reason>); and DEFINE_DROP_REASON()
 * is given FN for both of its arguments.
 */
#undef FN
#define FN(reason)        TRACE_DEFINE_ENUM(SKB_DROP_REASON_##reason);
DEFINE_DROP_REASON(FN, FN)

/*
 * Second pass: FN/FNe are redefined to expand to { value, "name" }
 * initializers (FN with a trailing comma, FNe without one).  They build
 * the table passed to __print_symbolic() in the kfree_skb event.
 */
#undef FN
#undef FNe
#define FN(reason)        { SKB_DROP_REASON_##reason, #reason },
#define FNe(reason)        { SKB_DROP_REASON_##reason, #reason }

/*
 * Tracepoint for freeing an sk_buff.
 *
 * Records the skb pointer, the @location pointer (printed symbolically
 * with %pS), the @rx_sk pointer, skb->protocol converted with ntohs(),
 * and the drop reason, which is printed by name through the
 * DEFINE_DROP_REASON(FN, FNe) table.
 */
TRACE_EVENT(kfree_skb,

        TP_PROTO(struct sk_buff *skb, void *location,
                 enum skb_drop_reason reason, struct sock *rx_sk),

        TP_ARGS(skb, location, reason, rx_sk),

        TP_STRUCT__entry(
                __field(void *,                skbaddr)
                __field(void *,                location)
                __field(void *,                rx_sk)
                __field(unsigned short,        protocol)
                __field(enum skb_drop_reason,        reason)
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
                __entry->rx_sk = rx_sk;
                __entry->protocol = ntohs(skb->protocol);
                __entry->reason = reason;
        ),

        TP_printk("skbaddr=%p rx_sk=%p protocol=%u location=%pS reason: %s",
                  __entry->skbaddr, __entry->rx_sk, __entry->protocol,
                  __entry->location,
                  __print_symbolic(__entry->reason,
                                   DEFINE_DROP_REASON(FN, FNe)))
);

#undef FN
#undef FNe

/*
 * Tracepoint consume_skb: records the skb pointer and the @location
 * pointer (printed symbolically with %pS).
 */
TRACE_EVENT(consume_skb,

        TP_PROTO(struct sk_buff *skb, void *location),

        TP_ARGS(skb, location),

        TP_STRUCT__entry(
                __field(        void *,        skbaddr)
                __field(        void *,        location)
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->location = location;
        ),

        TP_printk("skbaddr=%p location=%pS", __entry->skbaddr, __entry->location)
);

/*
 * Tracepoint skb_copy_datagram_iovec: records the skb pointer and the
 * @len value passed by the caller.
 */
TRACE_EVENT(skb_copy_datagram_iovec,

        TP_PROTO(const struct sk_buff *skb, int len),

        TP_ARGS(skb, len),

        TP_STRUCT__entry(
                __field(        const void *,                skbaddr                )
                __field(        int,                        len                )
        ),

        TP_fast_assign(
                __entry->skbaddr = skb;
                __entry->len = len;
        ),

        TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len)
);

#endif /* _TRACE_SKB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>










































































































































































































































































    3 








    3 




















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/core/netprio_cgroup.c        Priority Control Group
 *
 * Authors:        Neil Horman <nhorman@tuxdriver.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <linux/sched/task.h>

#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
#include <net/netprio_cgroup.h>

#include <linux/fdtable.h>

/*
 * netprio allocates per-net_device priomap array which is indexed by
 * css->id.  Limiting css ID to 16bits doesn't lose anything.
 */
#define NETPRIO_ID_MAX                USHRT_MAX

#define PRIOMAP_MIN_SZ                128

/*
 * Grow @dev->priomap until it can hold index @target_idx.  On successful
 * return @dev->priomap->priomap_len > @target_idx.  Must be called under
 * rtnl lock.
 *
 * Returns 0 on success (including when the map is already big enough),
 * -ENOSPC if the size computation overflows, -ENOMEM if allocation fails.
 */
static int extend_netdev_table(struct net_device *dev, u32 target_idx)
{
        struct netprio_map *cur, *grown;
        size_t bytes, entries;

        /* nothing to do when the current map already covers the index */
        cur = rtnl_dereference(dev->priomap);
        if (cur && cur->priomap_len > target_idx)
                return 0;

        /*
         * Pick the allocation size: begin at PRIOMAP_MIN_SZ and keep
         * doubling (so it stays a power of two) until the number of
         * entries that fit exceeds @target_idx.
         */
        bytes = PRIOMAP_MIN_SZ;
        for (;;) {
                entries = (bytes - offsetof(struct netprio_map, priomap)) /
                        sizeof(grown->priomap[0]);
                if (entries > target_idx)
                        break;
                bytes *= 2;
                /* a wrapped size ends up below the minimum */
                if (WARN_ON(bytes < PRIOMAP_MIN_SZ))
                        return -ENOSPC;
        }

        /* zeroed allocation, then carry the old priorities over */
        grown = kzalloc(bytes, GFP_KERNEL);
        if (!grown)
                return -ENOMEM;

        if (cur)
                memcpy(grown->priomap, cur->priomap,
                       cur->priomap_len * sizeof(cur->priomap[0]));

        grown->priomap_len = entries;

        /* publish the new map, then free the old one after a grace period */
        rcu_assign_pointer(dev->priomap, grown);
        if (cur)
                kfree_rcu(cur, rcu);
        return 0;
}

/**
 * netprio_prio - return the effective netprio of a cgroup-net_device pair
 * @css: css part of the target pair
 * @dev: net_device part of the target pair
 *
 * Returns the priority stored at index @css->id in @dev's priomap, or 0
 * when @dev has no priomap or the map is too short for that index.
 *
 * Should be called under RCU read or rtnl lock.
 */
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
{
        struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
        int id = css->id;

        if (!map || id >= map->priomap_len)
                return 0;

        return map->priomap[id];
}

/**
 * netprio_set_prio - set netprio on a cgroup-net_device pair
 * @css: css part of the target pair
 * @dev: net_device part of the target pair
 * @prio: prio to set
 *
 * Set netprio to @prio on @css-@dev pair.  Should be called under rtnl
 * lock and may fail under memory pressure for non-zero @prio.
 *
 * Returns 0 on success or the error from extend_netdev_table().
 */
static int netprio_set_prio(struct cgroup_subsys_state *css,
                            struct net_device *dev, u32 prio)
{
        struct netprio_map *map = rtnl_dereference(dev->priomap);
        int id = css->id;
        int err;

        /* a zero write to a slot that doesn't exist needs no allocation */
        if (!prio) {
                if (!map)
                        return 0;
                if (map->priomap_len <= id)
                        return 0;
        }

        err = extend_netdev_table(dev, id);
        if (err)
                return err;

        /* the map pointer may have been replaced by the extension */
        map = rtnl_dereference(dev->priomap);
        map->priomap[id] = prio;
        return 0;
}

/*
 * css_alloc callback: allocate a zeroed cgroup_subsys_state.
 * @parent_css is not used.  Returns the new css, or ERR_PTR(-ENOMEM) if
 * the allocation fails.
 */
static struct cgroup_subsys_state *
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct cgroup_subsys_state *new_css = kzalloc(sizeof(*new_css),
                                                      GFP_KERNEL);

        if (new_css)
                return new_css;

        return ERR_PTR(-ENOMEM);
}

/*
 * css_online callback.  Rejects css IDs above NETPRIO_ID_MAX with
 * -ENOSPC.  For a css with a parent, copies the parent's priority for
 * every net_device in init_net, stopping at the first failure.
 */
static int cgrp_css_online(struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *parent = css->parent;
        struct net_device *netdev;
        int err = 0;

        if (css->id > NETPRIO_ID_MAX)
                return -ENOSPC;

        /* a css without a parent has nothing to inherit */
        if (!parent)
                return 0;

        rtnl_lock();
        /*
         * Inherit prios from the parent.  As all prios are set during
         * onlining, there is no need to clear them on offline.
         */
        for_each_netdev(&init_net, netdev) {
                err = netprio_set_prio(css, netdev,
                                       netprio_prio(parent, netdev));
                if (err)
                        break;
        }
        rtnl_unlock();

        return err;
}

/* css_free callback: release the css allocated by cgrp_css_alloc(). */
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
        kfree(css);
}

/* read_u64 handler for the "prioidx" file: reports @css->id. */
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
{
        return css->id;
}

static int read_priomap(struct seq_file *sf, void *v)
{
        struct net_device *dev;

        rcu_read_lock();
        for_each_netdev_rcu(&init_net, dev)
                seq_printf(sf, "%s %u\n", dev->name,
                           netprio_prio(seq_css(sf), dev));
        rcu_read_unlock();
        return 0;
}

/*
 * write handler for the "ifpriomap" file.  Parses "<devname> <prio>" from
 * @buf, looks the device up in init_net and stores the priority for the
 * writing file's css.
 *
 * Returns @nbytes on success, -EINVAL if the input does not parse,
 * -ENODEV if no such device exists, or the error from netprio_set_prio().
 */
static ssize_t write_priomap(struct kernfs_open_file *of,
                             char *buf, size_t nbytes, loff_t off)
{
        char devname[IFNAMSIZ + 1];
        struct net_device *netdev;
        u32 prio;
        int parsed;
        int err;

        parsed = sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", devname, &prio);
        if (parsed != 2)
                return -EINVAL;

        /* takes a device reference, dropped by dev_put() below */
        netdev = dev_get_by_name(&init_net, devname);
        if (!netdev)
                return -ENODEV;

        rtnl_lock();
        err = netprio_set_prio(of_css(of), netdev, prio);
        rtnl_unlock();

        dev_put(netdev);

        if (err)
                return err;
        return nbytes;
}

/*
 * iterate_fd() callback: when @file is a socket, set the prioidx in its
 * sk_cgrp_data to the value carried in @v.  Always returns 0.
 */
static int update_netprio(const void *v, struct file *file, unsigned n)
{
        struct socket *sock = sock_from_file(file);

        /* not a socket: nothing to update */
        if (!sock)
                return 0;

        sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data, (unsigned long)v);
        return 0;
}

/*
 * attach callback: for each task in @tset, walk its open files under
 * task_lock() and pass the destination css id to update_netprio().
 */
static void net_prio_attach(struct cgroup_taskset *tset)
{
        struct cgroup_subsys_state *css;
        struct task_struct *task;

        cgroup_taskset_for_each(task, css, tset) {
                task_lock(task);
                /* the css id travels to the callback inside the pointer */
                iterate_fd(task->files, 0, update_netprio,
                           (void *)(unsigned long)css->id);
                task_unlock(task);
        }
}

/*
 * Control files of this controller: "prioidx" is read through
 * read_prioidx(); "ifpriomap" is shown by read_priomap() and written
 * through write_priomap().
 */
static struct cftype ss_files[] = {
        {
                .name = "prioidx",
                .read_u64 = read_prioidx,
        },
        {
                .name = "ifpriomap",
                .seq_show = read_priomap,
                .write = write_priomap,
        },
        { }        /* terminate */
};

/*
 * Subsystem descriptor: wires up the css alloc/online/free and attach
 * callbacks defined above and installs ss_files as the legacy cftypes.
 */
struct cgroup_subsys net_prio_cgrp_subsys = {
        .css_alloc        = cgrp_css_alloc,
        .css_online        = cgrp_css_online,
        .css_free        = cgrp_css_free,
        .attach                = net_prio_attach,
        .legacy_cftypes        = ss_files,
};

/*
 * Netdevice notifier callback.  On NETDEV_UNREGISTER the device's priomap
 * pointer is cleared and the old map is freed after an RCU grace period;
 * every other event is ignored.  Always returns NOTIFY_DONE.
 */
static int netprio_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct netprio_map *stale;

        /*
         * Note this is called with rtnl_lock held so we have update side
         * protection on our rcu assignments
         */
        if (event == NETDEV_UNREGISTER) {
                stale = rtnl_dereference(dev->priomap);
                RCU_INIT_POINTER(dev->priomap, NULL);
                if (stale)
                        kfree_rcu(stale, rcu);
        }

        return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to netprio_device_event(). */
static struct notifier_block netprio_device_notifier = {
        .notifier_call = netprio_device_event
};

/*
 * Register the netdevice notifier that tears down a device's priomap
 * when the device is unregistered.
 *
 * Returns the result of register_netdevice_notifier(), so a failed
 * registration is reported to the initcall machinery instead of being
 * silently treated as success.
 */
static int __init init_cgroup_netprio(void)
{
        return register_netdevice_notifier(&netprio_device_notifier);
}
subsys_initcall(init_cgroup_netprio);
















































































































































































































































































































































































































































































































































































































































































































    3 
    3 






























































































































































































































































































































































































































    3 



    3 


    3 
































































































































































































































































































































































































    3 



    3 
    3 
    3 

    3 

















    3 
    3 






    3 




























































































































































































































































































































































































































































































































































































































































    3 








    3 





























































































































































































































































   23 
    3 

   18 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   23 








   24 

   24 



   24 








   24 








   23 













   24 

    5 









   23 






   23 







   23 

















   23 


   18 
    4 





    4 



   23 
   22 




















   23 

    2 




    2 

































   23 








   23 
   23 


   23 


   23 




    5 
   18 

































    6 
   18 





























































































   23 


   24 















   24 



































































   23 




   24 

   24 











   24 




















































   23 


   24 

   24 
   24 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 

    3 































































    1 



    1 
































































































































































































































    1 













































































































































































































































































































    3 


    3 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 











    3 



















































































































































































































































































































































    3 


    3 














    3 



    3 


















    3 






    3 






    3 








































































































































































    3 


    3 





















































































































































































































































































































































































































































































































































































































    1 






    1 




    1 






    1 














    1 





    1 






    1 












    1 











    1 




    1 




    1 




    1 








    1 





    1 
    1 




    1 


    1 


    1 


    1 
























    1 


    1 



















































    1 
    1 

































































































































































































































































































































































































































































































































































































   27 








   27 













   27 


   27 













   28 
















































































    3 





























   14 




   14 






   14 















    3 























    3 












    3 




















    3 


    3 
































    3 





























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *      NET3    Protocol independent device support routines.
 *
 *        Derived from the non IP parts of dev.c 1.0.19
 *              Authors:        Ross Biro
 *                                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                                Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *        Additional Authors:
 *                Florian la Roche <rzsfl@rz.uni-sb.de>
 *                Alan Cox <gw4pts@gw4pts.ampr.org>
 *                David Hinds <dahinds@users.sourceforge.net>
 *                Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *                Adam Sulmicki <adam@cfar.umd.edu>
 *              Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *        Changes:
 *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
 *                                      to 2 if register_netdev gets called
 *                                      before net_dev_init & also removed a
 *                                      few lines of code in the process.
 *                Alan Cox        :        device private ioctl copies fields back.
 *                Alan Cox        :        Transmit queue code does relevant
 *                                        stunts to keep the queue safe.
 *                Alan Cox        :        Fixed double lock.
 *                Alan Cox        :        Fixed promisc NULL pointer trap
 *                ????????        :        Support the full private ioctl range
 *                Alan Cox        :        Moved ioctl permission check into
 *                                        drivers
 *                Tim Kordas        :        SIOCADDMULTI/SIOCDELMULTI
 *                Alan Cox        :        100 backlog just doesn't cut it when
 *                                        you start doing multicast video 8)
 *                Alan Cox        :        Rewrote net_bh and list manager.
 *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
 *                Alan Cox        :        Took out transmit every packet pass
 *                                        Saved a few bytes in the ioctl handler
 *                Alan Cox        :        Network driver sets packet type before
 *                                        calling netif_rx. Saves a function
 *                                        call a packet.
 *                Alan Cox        :        Hashed net_bh()
 *                Richard Kooijman:        Timestamp fixes.
 *                Alan Cox        :        Wrong field in SIOCGIFDSTADDR
 *                Alan Cox        :        Device lock protection.
 *              Alan Cox        :       Fixed nasty side effect of device close
 *                                        changes.
 *                Rudi Cilibrasi        :        Pass the right thing to
 *                                        set_mac_address()
 *                Dave Miller        :        32bit quantity for the device lock to
 *                                        make it work out on a Sparc.
 *                Bjorn Ekwall        :        Added KERNELD hack.
 *                Alan Cox        :        Cleaned up the backlog initialise.
 *                Craig Metz        :        SIOCGIFCONF fix if space for under
 *                                        1 device.
 *            Thomas Bogendoerfer :        Return ENODEV for dev_open, if there
 *                                        is no device open function.
 *                Andi Kleen        :        Fix error reporting for SIOCGIFCONF
 *            Michael Chastain        :        Fix signed/unsigned for SIOCGIFCONF
 *                Cyrus Durgin        :        Cleaned for KMOD
 *                Adam Sulmicki   :        Bug Fix : Network Device Unload
 *                                        A network device unload needs to purge
 *                                        the backlog queue.
 *        Paul Rusty Russell        :        SIOCSIFNAME
 *              Pekka Riikonen  :        Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *                                      indefinitely on dev->refcnt
 *              J Hadi Salim    :       - Backlog queue sampling
 *                                        - netif_rx() feedback
 */

#include <linux/uaccess.h>
#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
#include <linux/smpboot.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <trace/events/qdisc.h>
#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
#include <linux/indirect_call_wrapper.h>
#include <net/devlink.h>
#include <linux/pm_runtime.h>
#include <linux/prandom.h>
#include <linux/once_lite.h>
#include <net/netdev_lock.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/memory_provider.h>
#include <net/rps.h>
#include <linux/phy_link_topology.h>

#include "dev.h"
#include "devmem.h"
#include "net-sysfs.h"

/* Taken by dev_add_pack() and __dev_remove_pack() around every modification
 * of a packet-type list.
 */
static DEFINE_SPINLOCK(ptype_lock);
/* Hash table of protocol handlers that are bound neither to a device nor to
 * a network namespace; ptype_head() selects the bucket from the protocol type.
 */
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;

/* Forward declarations; the definitions are not in this part of the file. */
static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack);

/* NOTE(review): users are outside this part of the file; presumably
 * serializes updates of a device's ifalias - confirm.
 */
static DEFINE_MUTEX(ifalias_mutex);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

/* NOTE(review): starts at NR_CPUS, presumably so that smaller IDs stay
 * reserved - confirm against the code that allocates NAPI IDs.
 */
static unsigned int napi_gen_id = NR_CPUS;
/* NAPI instances hashed by napi_id (see napi_by_id()); 8 hash bits. */
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);

/* Advance the per-namespace device-list generation counter.
 * The value 0 is never stored: when the increment wraps to 0 the counter
 * is set to 1 instead.
 */
static inline void dev_base_seq_inc(struct net *net)
{
        unsigned int next = net->dev_base_seq + 1;

        if (!next)
                next = 1;

        WRITE_ONCE(net->dev_base_seq, next);
}

/* Return the bucket of the per-namespace name hash table that @name falls
 * into. At most IFNAMSIZ bytes of @name take part in the hash.
 */
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
        unsigned int len = strnlen(name, IFNAMSIZ);
        unsigned int bucket;

        bucket = hash_32(full_name_hash(net, name, len), NETDEV_HASHBITS);

        return &net->dev_name_head[bucket];
}

/* Return the bucket of the per-namespace ifindex hash table for @ifindex;
 * the bucket number is @ifindex masked with NETDEV_HASHENTRIES - 1.
 */
static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
        int bucket = ifindex & (NETDEV_HASHENTRIES - 1);

        return &net->dev_index_head[bucket];
}

#ifndef CONFIG_PREEMPT_RT

/* Off by default; enabled by the "thread_backlog_napi" early parameter. */
static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);

/* Early-parameter handler: @arg is ignored, the mere presence of
 * "thread_backlog_napi" on the command line enables the static key.
 * Always returns 0.
 */
static int __init setup_backlog_napi_threads(char *arg)
{
        static_branch_enable(&use_backlog_threads_key);
        return 0;
}
early_param("thread_backlog_napi", setup_backlog_napi_threads);

/* Report whether use_backlog_threads_key has been enabled. */
static bool use_backlog_threads(void)
{
        return static_branch_unlikely(&use_backlog_threads_key);
}

#else

/* With CONFIG_PREEMPT_RT there is no switch: this always reports true. */
static bool use_backlog_threads(void)
{
        return true;
}

#endif

/* Enter the backlog critical section of @sd, saving the IRQ state in @flags.
 * With CONFIG_RPS or backlog threads the input_pkt_queue spinlock is taken
 * (IRQ-saving variant); otherwise only local interrupts are disabled.
 */
static inline void backlog_lock_irq_save(struct softnet_data *sd,
                                         unsigned long *flags)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_save(*flags);
                return;
        }

        spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
}

/* Enter the backlog critical section of @sd without saving the IRQ state.
 * With CONFIG_RPS or backlog threads the input_pkt_queue spinlock is taken
 * with interrupts disabled; otherwise only local interrupts are disabled.
 */
static inline void backlog_lock_irq_disable(struct softnet_data *sd)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_disable();
                return;
        }

        spin_lock_irq(&sd->input_pkt_queue.lock);
}

/* Leave the backlog critical section entered with backlog_lock_irq_save(),
 * restoring the IRQ state from @flags. The same condition as on the lock
 * side decides between the spinlock and the plain IRQ restore.
 */
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
                                              unsigned long *flags)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_restore(*flags);
                return;
        }

        spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
}

/* Leave the backlog critical section entered with backlog_lock_irq_disable()
 * and re-enable interrupts. The same condition as on the lock side decides
 * between the spinlock and the plain IRQ enable.
 */
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
{
        if (!IS_ENABLED(CONFIG_RPS) && !use_backlog_threads()) {
                local_irq_enable();
                return;
        }

        spin_unlock_irq(&sd->input_pkt_queue.lock);
}

/* Allocate a name node that ties @name to @dev.
 * The node stores the @name pointer as is, the string is not copied.
 * Returns the new node (not yet hashed), or NULL if the allocation fails.
 */
static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
                                                       const char *name)
{
        struct netdev_name_node *node = kmalloc(sizeof(*node), GFP_KERNEL);

        if (node) {
                INIT_HLIST_NODE(&node->hlist);
                node->dev = dev;
                node->name = name;
        }

        return node;
}

/* Allocate the name node for dev->name and initialise its ->list as an empty
 * list head. Returns NULL if the allocation fails.
 */
static struct netdev_name_node *
netdev_name_node_head_alloc(struct net_device *dev)
{
        struct netdev_name_node *head = netdev_name_node_alloc(dev, dev->name);

        if (head)
                INIT_LIST_HEAD(&head->list);

        return head;
}

/* Free the node itself. The string behind ->name is not touched here;
 * netdev_name_node_alt_free() releases it separately for alternative names.
 */
static void netdev_name_node_free(struct netdev_name_node *name_node)
{
        kfree(name_node);
}

/* Insert @name_node into the name hash table of @net, using the RCU-aware
 * list insertion so that RCU readers may walk the bucket concurrently.
 */
static void netdev_name_node_add(struct net *net,
                                 struct netdev_name_node *name_node)
{
        struct hlist_head *bucket = dev_name_hash(net, name_node->name);

        hlist_add_head_rcu(&name_node->hlist, bucket);
}

/* Unlink @name_node from the name hash table with the RCU-aware removal.
 * The node is neither freed nor removed from its per-device ->list here.
 */
static void netdev_name_node_del(struct netdev_name_node *name_node)
{
        hlist_del_rcu(&name_node->hlist);
}

static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
                                                        const char *name)
{
        struct hlist_head *head = dev_name_hash(net, name);
        struct netdev_name_node *name_node;

        hlist_for_each_entry(name_node, head, hlist)
                if (!strcmp(name_node->name, name))
                        return name_node;
        return NULL;
}

static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
                                                            const char *name)
{
        struct hlist_head *head = dev_name_hash(net, name);
        struct netdev_name_node *name_node;

        hlist_for_each_entry_rcu(name_node, head, hlist)
                if (!strcmp(name_node->name, name))
                        return name_node;
        return NULL;
}

/* Report whether some name node in @net already carries @name. */
bool netdev_name_in_use(struct net *net, const char *name)
{
        return netdev_name_node_lookup(net, name) != NULL;
}
EXPORT_SYMBOL(netdev_name_in_use);

/* Register @name as an additional name of @dev.
 *
 * The @name pointer is stored in the new node, not copied.
 * Returns 0 on success, -EEXIST if @name is already present in the
 * namespace of @dev, or -ENOMEM if the node cannot be allocated.
 */
int netdev_name_node_alt_create(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);
        struct netdev_name_node *alt;

        if (netdev_name_node_lookup(net, name))
                return -EEXIST;

        alt = netdev_name_node_alloc(dev, name);
        if (!alt)
                return -ENOMEM;

        netdev_name_node_add(net, alt);
        /* The node that holds dev->name is the head of the per-device list. */
        list_add_tail_rcu(&alt->list, &dev->name_node->list);

        return 0;
}

/* Release an alternative name node given its embedded rcu_head: both the
 * name string and the node are freed. Used as a call_rcu() callback and
 * also called directly by netdev_name_node_alt_flush().
 */
static void netdev_name_node_alt_free(struct rcu_head *head)
{
        struct netdev_name_node *node;

        node = container_of(head, struct netdev_name_node, rcu);

        kfree(node->name);
        netdev_name_node_free(node);
}

/* Remove one alternative name: unlink the node from the name hash and from
 * its device's list, then free it (and its name string) from an RCU callback.
 */
static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
{
        netdev_name_node_del(name_node);
        list_del(&name_node->list);
        /* Freeing is deferred through call_rcu(). */
        call_rcu(&name_node->rcu, netdev_name_node_alt_free);
}

/* Remove the alternative name @name from @dev.
 *
 * Returns 0 on success, -ENOENT if @name does not exist in the namespace of
 * @dev, or -EINVAL if @name is the primary name of @dev or belongs to a
 * different device.
 */
int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
{
        struct netdev_name_node *found;

        found = netdev_name_node_lookup(dev_net(dev), name);
        if (!found)
                return -ENOENT;

        /* The lookup may have hit a name owned by another device ... */
        if (found->dev != dev)
                return -EINVAL;
        /* ... or our own primary name, which is not an alternative name. */
        if (found == dev->name_node)
                return -EINVAL;

        __netdev_name_node_alt_destroy(found);
        return 0;
}

/* Drop every alternative name of @dev: each node is taken off the per-device
 * list and freed immediately (no call_rcu()). The nodes are not unhashed
 * here.
 */
static void netdev_name_node_alt_flush(struct net_device *dev)
{
        struct netdev_name_node *cur, *next;

        list_for_each_entry_safe(cur, next, &dev->name_node->list, list) {
                list_del(&cur->list);
                netdev_name_node_alt_free(&cur->rcu);
        }
}

/* Device list insertion.
 *
 * Makes @dev reachable through every lookup structure of its namespace:
 * the device list, the name hash (primary and alternative names), the
 * ifindex hash and the dev_by_index xarray. Must be called with RTNL held.
 */
static void list_netdevice(struct net_device *dev)
{
        struct netdev_name_node *name_node;
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        /* RCU-aware insertions: lockless readers may see the device from here on */
        list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
        netdev_name_node_add(net, dev->name_node);
        hlist_add_head_rcu(&dev->index_hlist,
                           dev_index_hash(net, dev->ifindex));

        /* Hash every alternative name as well */
        netdev_for_each_altname(dev, name_node)
                netdev_name_node_add(net, name_node);

        /* We reserved the ifindex, this can't fail */
        WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));

        /* Bump the namespace's device-list generation counter */
        dev_base_seq_inc(net);
}

/* Device list removal.
 *
 * Takes @dev out of every lookup structure of its namespace: the
 * dev_by_index xarray, the name hash (alternative and primary names), the
 * device list and the ifindex hash. Must be called with RTNL held.
 *
 * The caller must respect an RCU grace period before freeing/reusing dev.
 */
static void unlist_netdevice(struct net_device *dev)
{
        struct netdev_name_node *name_node;
        struct net *net = dev_net(dev);

        ASSERT_RTNL();

        xa_erase(&net->dev_by_index, dev->ifindex);

        /* Unhash the alternative names; the nodes themselves are kept */
        netdev_for_each_altname(dev, name_node)
                netdev_name_node_del(name_node);

        /* Unlink dev from the device chain */
        list_del_rcu(&dev->dev_list);
        netdev_name_node_del(dev->name_node);
        hlist_del_rcu(&dev->index_hlist);

        /* Bump the namespace's device-list generation counter */
        dev_base_seq_inc(dev_net(dev));
}

/*
 *        Our notifier list: the raw notifier chain for network device events.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *        Device drivers call our routines to queue packets here. We empty the
 *        queue in the local softnet handler.
 *
 *        One cacheline-aligned instance per CPU; only the local lock
 *        process_queue_bh_lock needs an explicit static initializer.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
        .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
};
EXPORT_PER_CPU_SYMBOL(softnet_data);

/* Per-CPU pointer to a page_pool.
 * Page_pool has a lockless array/stack to alloc/recycle pages.
 * PP consumers must take care to run the APIs in the appropriate context
 * (e.g. NAPI context).
 */
DEFINE_PER_CPU(struct page_pool *, system_page_pool);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel arrays:
 * netdev_lock_pos() returns one index that is used for both of them and for
 * the two lock_class_key arrays below, so entries must stay in the same
 * order. The last entry (ARPHRD_NONE) doubles as the fallback for device
 * types that are not listed.
 */
static const unsigned short netdev_lock_type[] = {
         ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-matched with netdev_lock_type[]. */
static const char *const netdev_lock_name[] = {
        "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
        "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
        "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
        "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
        "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
        "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
        "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
        "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
        "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
        "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
        "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
        "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
        "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
        "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
        "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per listed device type, for the xmit and the addr-list locks. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

/* Translate a device type into its index in netdev_lock_type[].
 * Types that are not listed map to the last index.
 */
static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
        int idx = 0;

        while (idx < ARRAY_SIZE(netdev_lock_type)) {
                if (netdev_lock_type[idx] == dev_type)
                        return idx;
                idx++;
        }

        /* the last key is used by default */
        return ARRAY_SIZE(netdev_lock_type) - 1;
}

/* Assign @lock the xmit lockdep class and name that belong to @dev_type. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
        int pos = netdev_lock_pos(dev_type);

        lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[pos],
                                   netdev_lock_name[pos]);
}

/* Assign dev->addr_list_lock the addr lockdep class that belongs to
 * dev->type. The class name is taken from the same table as for the xmit
 * locks.
 */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
        int pos = netdev_lock_pos(dev->type);

        lockdep_set_class_and_name(&dev->addr_list_lock,
                                   &netdev_addr_lock_key[pos],
                                   netdev_lock_name[pos]);
}
#else
/* Without CONFIG_LOCKDEP there are no lock classes to set: empty stub. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}

/* Without CONFIG_LOCKDEP there are no lock classes to set: empty stub. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************
 *
 *                Protocol management and registration routines
 *
 *******************************************************************************/


/*
 *        Add a protocol ID to the list. Now that the input handler is
 *        smarter we can dispense with all the messy stuff that used to be
 *        here.
 *
 *        BEWARE!!! Protocol handlers that mangle input packets
 *        MUST BE last in their hash buckets, and the walk over protocol
 *        handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *        This is true now, do not change it.
 *        Explanation: if a protocol handler that mangles packets were
 *        first on the list, it could not tell that the packet is cloned
 *        and must be copied-on-write, so it would modify the packet in
 *        place and subsequent readers would get a broken packet.
 *                                                        --ANK (980803)
 */

/* Select the list a packet_type belongs on.
 *
 * ETH_P_ALL handlers go on the ptype_all list of their device or, failing
 * that, of their af_packet_net; with neither set there is no list and NULL
 * is returned. All other handlers go on the ptype_specific list of their
 * device or af_packet_net, or into the global ptype_base[] hash when they
 * are bound to neither.
 */
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
        if (pt->type != htons(ETH_P_ALL)) {
                if (pt->dev)
                        return &pt->dev->ptype_specific;
                if (pt->af_packet_net)
                        return &pt->af_packet_net->ptype_specific;

                return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
        }

        if (pt->dev)
                return &pt->dev->ptype_all;
        if (pt->af_packet_net)
                return &pt->af_packet_net->ptype_all;

        return NULL;
}

/**
 *        dev_add_pack - add packet handler
 *        @pt: packet type declaration
 *
 *        Link @pt into the list selected by ptype_head(). The passed
 *        &packet_type stays on that kernel list and must not be freed until
 *        it has been removed from it again. If no list exists for @pt, a
 *        one-time warning is emitted and nothing is added.
 *
 *        This call does not sleep, so it cannot guarantee that CPUs which
 *        are in the middle of receiving packets will see the new packet
 *        type before their next received packet.
 */

void dev_add_pack(struct packet_type *pt)
{
        struct list_head *bucket;

        bucket = ptype_head(pt);
        if (!WARN_ON_ONCE(!bucket)) {
                spin_lock(&ptype_lock);
                list_add_rcu(&pt->list, bucket);
                spin_unlock(&ptype_lock);
        }
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *        __dev_remove_pack         - remove packet handler
 *        @pt: packet type declaration
 *
 *        Unlink a protocol handler that was previously added to the kernel
 *        protocol handlers by dev_add_pack(). If @pt is not on the list it
 *        belongs to, a warning is logged and nothing is changed.
 *
 *        This function does not wait: the packet type might still be in use
 *        by receivers when it returns, so it must not be freed or reused
 *        until all CPUs have gone through a quiescent state. Use
 *        dev_remove_pack() to have that wait done for you.
 */
void __dev_remove_pack(struct packet_type *pt)
{
        struct list_head *head = ptype_head(pt);
        struct packet_type *cur;

        if (!head)
                return;

        spin_lock(&ptype_lock);

        list_for_each_entry(cur, head, list) {
                if (cur != pt)
                        continue;

                list_del_rcu(&pt->list);
                goto unlock;
        }

        pr_warn("dev_remove_pack: %p not found\n", pt);
unlock:
        spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *        dev_remove_pack         - remove packet handler
 *        @pt: packet type declaration
 *
 *        Remove a protocol handler that was previously added to the kernel
 *        protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *        from the kernel lists and can be freed or reused once this function
 *        returns.
 *
 *        This call sleeps to guarantee that no CPU is looking at the packet
 *        type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
        __dev_remove_pack(pt);

        /* Wait until no receiver can still be using @pt. */
        synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/*******************************************************************************
 *
 *                            Device Interface Subroutines
 *
 *******************************************************************************/

/**
 *        dev_get_iflink        - get 'iflink' value of an interface
 *        @dev: targeted interface
 *
 *        Indicates the ifindex the interface is linked to.
 *        The value comes from the device's ndo_get_iflink() callback when
 *        one is provided; otherwise the device's own ifindex is returned,
 *        so physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
        if (!dev->netdev_ops || !dev->netdev_ops->ndo_get_iflink)
                return READ_ONCE(dev->ifindex);

        return dev->netdev_ops->ndo_get_iflink(dev);
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *        dev_fill_metadata_dst - Retrieve tunnel egress information.
 *        @dev: targeted interface
 *        @skb: The packet.
 *
 *        For better visibility of tunnel traffic OVS needs to retrieve
 *        egress tunnel information for a packet. This API lets a user
 *        obtain that information.
 *
 *        Return: the result of the device's ndo_fill_metadata_dst()
 *        callback; -EINVAL if the device has no such callback or the
 *        packet's tunnel info is not in TX mode; -ENOMEM if the tunnel info
 *        cannot be uncloned.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct ip_tunnel_info *tun_info;

        if (!dev->netdev_ops)
                return -EINVAL;
        if (!dev->netdev_ops->ndo_fill_metadata_dst)
                return -EINVAL;

        tun_info = skb_tunnel_info_unclone(skb);
        if (!tun_info)
                return -ENOMEM;

        if (unlikely((tun_info->mode & IP_TUNNEL_INFO_TX) == 0))
                return -EINVAL;

        return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/* Reserve the next free entry of @stack and return it.
 *
 * Returns NULL, after a one-time warning, when NET_DEVICE_PATH_STACK_MAX
 * entries are already in use. The count is advanced only when an entry is
 * actually handed out, so a failed reservation leaves stack->num_paths at
 * NET_DEVICE_PATH_STACK_MAX instead of pushing it past that bound.
 */
static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
{
        int k = stack->num_paths;

        if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
                return NULL;

        stack->num_paths++;

        return &stack->path[k];
}

/* Record in @stack the chain of devices a frame for @daddr traverses,
 * starting at @dev.
 *
 * While the current device implements ndo_fill_forward_path(), a stack
 * entry is reserved and zeroed and the callback fills it; the callback is
 * expected to move ctx.dev on to the next device. If the walk ends on a
 * device without the callback, a final DEV_PATH_ETHERNET entry is added for
 * that device.
 *
 * Returns the value of the last callback (0 if none ran) on success, or -1
 * if the stack is full, a callback fails, or a callback leaves ctx.dev
 * unchanged.
 */
int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
                          struct net_device_path_stack *stack)
{
        struct net_device_path_ctx ctx = {
                .dev        = dev,
        };
        const struct net_device *prev;
        struct net_device_path *hop;
        int err = 0;

        memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
        stack->num_paths = 0;

        for (;;) {
                /* Walk ended without a terminating device */
                if (!ctx.dev)
                        return err;
                if (!ctx.dev->netdev_ops->ndo_fill_forward_path)
                        break;

                prev = ctx.dev;
                hop = dev_fwd_path(stack);
                if (!hop)
                        return -1;

                memset(hop, 0, sizeof(*hop));
                err = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, hop);
                if (err < 0)
                        return -1;

                /* A callback that does not advance would loop forever */
                if (WARN_ON_ONCE(prev == ctx.dev))
                        return -1;
        }

        hop = dev_fwd_path(stack);
        if (!hop)
                return -1;

        hop->type = DEV_PATH_ETHERNET;
        hop->dev = ctx.dev;

        return err;
}
EXPORT_SYMBOL_GPL(dev_fill_forward_path);

/* must be called under rcu_read_lock(), as we dont take a reference */
static struct napi_struct *napi_by_id(unsigned int napi_id)
{
        unsigned int hash = napi_id % HASH_SIZE(napi_hash);
        struct napi_struct *napi;

        hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
                if (napi->napi_id == napi_id)
                        return napi;

        return NULL;
}

/* Look up a NAPI instance by ID, restricted to devices of @net.
 * Must be called under rcu_read_lock(), as we don't take a reference.
 * Returns NULL if the ID is unknown, if the instance has no device (which
 * also triggers a one-time warning), or if its device lives in another
 * namespace.
 */
static struct napi_struct *
netdev_napi_by_id(struct net *net, unsigned int napi_id)
{
        struct napi_struct *napi = napi_by_id(napi_id);

        if (!napi || WARN_ON_ONCE(!napi->dev) ||
            !net_eq(net, dev_net(napi->dev)))
                return NULL;

        return napi;
}

/**
 *        netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
 *        @net: the applicable net namespace
 *        @napi_id: ID of a NAPI of a target device
 *
 *        Find a NAPI instance with @napi_id. Lock its device.
 *        The device must be in %NETREG_REGISTERED state for lookup to succeed.
 *        netdev_unlock() must be called to release it.
 *
 *        Return: pointer to NAPI, its device with lock held, NULL if not found.
 */
struct napi_struct *
netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
{
        struct napi_struct *napi;
        struct net_device *dev;

        /* First lookup: only to find the device and take a reference on it */
        rcu_read_lock();
        napi = netdev_napi_by_id(net, napi_id);
        if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
                rcu_read_unlock();
                return NULL;
        }

        dev = napi->dev;
        dev_hold(dev);
        rcu_read_unlock();

        /* NOTE(review): __netdev_put_lock() is defined elsewhere; presumably
         * it drops the reference taken above and returns the device locked,
         * or NULL when the device can no longer be used - confirm.
         */
        dev = __netdev_put_lock(dev);
        if (!dev)
                return NULL;

        /* Second lookup: the ID must still resolve to a NAPI of this device */
        rcu_read_lock();
        napi = netdev_napi_by_id(net, napi_id);
        if (napi && napi->dev != dev)
                napi = NULL;
        rcu_read_unlock();

        /* On failure do not leave the device locked */
        if (!napi)
                netdev_unlock(dev);
        return napi;
}

/**
 *        __dev_get_by_name        - find a device by its name
 *        @net: the applicable net namespace
 *        @name: name to find
 *
 *        Find an interface by name. Must be called under RTNL semaphore.
 *        Returns a pointer to the device if the name is found and %NULL
 *        otherwise. The reference counters are not incremented, so the
 *        caller must be careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
        struct netdev_name_node *node = netdev_name_node_lookup(net, name);

        if (!node)
                return NULL;

        return node->dev;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 * dev_get_by_name_rcu        - find a device by its name
 * @net: the applicable net namespace
 * @name: name to find
 *
 * Find an interface by name.
 * Returns a pointer to the device if the name is found and %NULL otherwise.
 * The reference counters are not incremented, so the caller must be
 * careful with locks. The caller must hold the RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
        struct netdev_name_node *node = netdev_name_node_lookup_rcu(net, name);

        if (!node)
                return NULL;

        return node->dev;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/* Find a device by name and take a reference on it; returns NULL if there
 * is no such device.
 * Deprecated for new users, call netdev_get_by_name() instead.
 */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
        struct net_device *found;

        rcu_read_lock();
        found = dev_get_by_name_rcu(net, name);
        /* Called unconditionally, i.e. also when nothing was found */
        dev_hold(found);
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 *        netdev_get_by_name() - find a device by its name
 *        @net: the applicable net namespace
 *        @name: name to find
 *        @tracker: tracking object for the acquired reference
 *        @gfp: allocation flags for the tracker
 *
 *        Find an interface by name. This can be called from any
 *        context and does its own locking. The returned handle has
 *        the usage count incremented, and the caller must use netdev_put()
 *        to release it when it is no longer needed. %NULL is returned if no
 *        matching device is found; @tracker is left untouched in that case.
 */
struct net_device *netdev_get_by_name(struct net *net, const char *name,
                                      netdevice_tracker *tracker, gfp_t gfp)
{
        struct net_device *dev = dev_get_by_name(net, name);

        if (!dev)
                return NULL;

        netdev_tracker_alloc(dev, tracker, gfp);
        return dev;
}
EXPORT_SYMBOL(netdev_get_by_name);

/**
 *        __dev_get_by_index - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not
 *        had its reference counter increased so the caller must be careful
 *        about locking. The caller must hold the RTNL semaphore.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *        dev_get_by_index_rcu - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not
 *        had its reference counter increased so the caller must be careful
 *        about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
        struct net_device *dev;
        struct hlist_head *head = dev_index_hash(net, ifindex);

        hlist_for_each_entry_rcu(dev, head, index_hlist)
                if (dev->ifindex == ifindex)
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);

/* Deprecated for new users, call netdev_get_by_index() instead */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
        struct net_device *found;

        /* The lookup result is passed to dev_hold() before the RCU
         * read-side section ends; dev_hold() is called unconditionally,
         * including when the lookup returned NULL.
         */
        rcu_read_lock();
        found = dev_get_by_index_rcu(net, ifindex);
        dev_hold(found);
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *        netdev_get_by_index() - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *        @tracker: tracking object for the acquired reference
 *        @gfp: allocation flags for the tracker
 *
 *        Search for an interface by index. Returns NULL if the device
 *        is not found or a pointer to the device. The device returned has
 *        had a reference added and the pointer is safe until the user calls
 *        netdev_put() to indicate they have finished with it.
 */
struct net_device *netdev_get_by_index(struct net *net, int ifindex,
                                       netdevice_tracker *tracker, gfp_t gfp)
{
        struct net_device *found = dev_get_by_index(net, ifindex);

        if (!found)
                return NULL;

        /* Register @tracker for the device returned by the lookup. */
        netdev_tracker_alloc(found, tracker, gfp);
        return found;
}
EXPORT_SYMBOL(netdev_get_by_index);

/**
 *        dev_get_by_napi_id - find a device by napi_id
 *        @napi_id: ID of the NAPI struct
 *
 *        Search for an interface by NAPI ID. Returns %NULL if the device
 *        is not found or a pointer to the device. The device has not had
 *        its reference counter increased so the caller must be careful
 *        about locking. The caller must hold RCU lock.
 */
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
        struct napi_struct *napi;

        /* Complain once if the caller is not in an RCU read-side section. */
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (!napi_id_valid(napi_id))
                return NULL;

        napi = napi_by_id(napi_id);
        if (!napi)
                return NULL;

        return napi->dev;
}

/* Release the held reference on the net_device, and if the net_device
 * is still registered try to lock the instance lock. If device is being
 * unregistered NULL will be returned (but the reference has been released,
 * either way!)
 *
 * This helper is intended for locking net_device after it has been looked up
 * using a lockless lookup helper. Lock prevents the instance from going away.
 */
struct net_device *__netdev_put_lock(struct net_device *dev)
{
        bool past_registered;

        netdev_lock(dev);

        /* Any reg_state beyond NETREG_REGISTERED is treated as "going away":
         * drop the instance lock again and report failure.
         */
        past_registered = dev->reg_state > NETREG_REGISTERED;
        if (past_registered)
                netdev_unlock(dev);

        /* The reference is released on both outcomes. */
        dev_put(dev);

        return past_registered ? NULL : dev;
}

/**
 *        netdev_get_by_index_lock() - find a device by its ifindex
 *        @net: the applicable net namespace
 *        @ifindex: index of device
 *
 *        Search for an interface by index. If a valid device
 *        with @ifindex is found it will be returned with netdev->lock held.
 *        netdev_unlock() must be called to release it.
 *
 *        Return: pointer to a device with lock held, NULL if not found.
 */
struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
{
        struct net_device *held = dev_get_by_index(net, ifindex);

        /* Trade the reference for the instance lock when a device was found. */
        return held ? __netdev_put_lock(held) : NULL;
}

/*
 * Iterator step over net->dev_by_index that hands back devices with their
 * instance lock held.
 *
 * @net: namespace whose dev_by_index xarray is searched
 * @dev: device returned by the previous step (it is unlocked here), or NULL
 * @index: in/out search position passed to xa_find(); incremented past
 *         entries that could not be locked
 *
 * Returns the next device for which __netdev_put_lock() succeeded, or NULL
 * when xa_find() finds no further entry.
 */
struct net_device *
netdev_xa_find_lock(struct net *net, struct net_device *dev,
                    unsigned long *index)
{
        if (dev)
                netdev_unlock(dev);

        for (;;) {
                /* Take a reference under RCU so the entry can be used after
                 * the read-side section ends.
                 */
                rcu_read_lock();
                dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
                if (dev)
                        dev_hold(dev);
                rcu_read_unlock();

                if (!dev)
                        return NULL;

                dev = __netdev_put_lock(dev);
                if (dev)
                        return dev;

                /* Could not lock this entry; move on to the next index. */
                (*index)++;
        }
}

/* Seqlock covering updates of dev->name: netif_change_name() takes the write
 * side around its writes to the name, netdev_copy_name() is the reader.
 */
static DEFINE_SEQLOCK(netdev_rename_lock);

/*
 * Copy dev->name into @name (at most IFNAMSIZ bytes via strscpy()).
 * The copy is repeated until it completes without a concurrent write
 * section on netdev_rename_lock.
 */
void netdev_copy_name(struct net_device *dev, char *name)
{
        unsigned int seq;

        for (;;) {
                seq = read_seqbegin(&netdev_rename_lock);
                strscpy(name, dev->name, IFNAMSIZ);

                /* Done once no writer ran during the copy. */
                if (!read_seqretry(&netdev_rename_lock, seq))
                        break;
        }
}

/**
 *        netdev_get_name - get a netdevice name, knowing its ifindex.
 *        @net: network namespace
 *        @name: a pointer to the buffer where the name will be stored.
 *        @ifindex: the ifindex of the interface to get the name from.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
        struct net_device *dev;
        int ret = -ENODEV;

        rcu_read_lock();

        /* Returns -ENODEV unless a device with @ifindex is found. */
        dev = dev_get_by_index_rcu(net, ifindex);
        if (dev) {
                netdev_copy_name(dev, name);
                ret = 0;
        }

        rcu_read_unlock();
        return ret;
}

/*
 * Return true when @dev has media type @type and its dev_addr matches @ha
 * over the first dev->addr_len bytes.
 */
static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
                         const char *ha)
{
        if (dev->type != type)
                return false;

        return memcmp(dev->dev_addr, ha, dev->addr_len) == 0;
}

/**
 *        dev_getbyhwaddr_rcu - find a device by its hardware address
 *        @net: the applicable net namespace
 *        @type: media type of device
 *        @ha: hardware address
 *
 *        Search for an interface by MAC address. Returns NULL if the device
 *        is not found or a pointer to the device.
 *        The caller must hold RCU.
 *        The returned device has not had its ref count increased
 *        and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *ha)
{
        struct net_device *dev;

        for_each_netdev_rcu(net, dev)
                if (dev_addr_cmp(dev, type, ha))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

/**
 * dev_getbyhwaddr() - find a device by its hardware address
 * @net: the applicable net namespace
 * @type: media type of device
 * @ha: hardware address
 *
 * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold
 * rtnl_lock.
 *
 * Context: rtnl_lock() must be held.
 * Return: pointer to the net_device, or NULL if not found
 */
struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
                                   const char *ha)
{
        struct net_device *dev;

        ASSERT_RTNL();
        for_each_netdev(net, dev)
                if (dev_addr_cmp(dev, type, ha))
                        return dev;

        return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
        struct net_device *dev, *ret = NULL;

        rcu_read_lock();
        for_each_netdev_rcu(net, dev)
                if (dev->type == type) {
                        dev_hold(dev);
                        ret = dev;
                        break;
                }
        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *        __dev_get_by_flags - find any device with given flags
 *        @net: the applicable net namespace
 *        @if_flags: IFF_* values
 *        @mask: bitmask of bits in if_flags to check
 *
 *        Search for any interface with the given flags. Returns NULL if a device
 *        is not found or a pointer to the device. Must be called inside
 *        rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
                                      unsigned short mask)
{
        struct net_device *cur;

        ASSERT_RTNL();

        for_each_netdev(net, cur) {
                /* Match when no bit selected by @mask differs from @if_flags. */
                if (!((cur->flags ^ if_flags) & mask))
                        return cur;
        }

        return NULL;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *        dev_valid_name - check if name is okay for network device
 *        @name: name string
 *
 *        Network device names need to be valid file names to
 *        allow sysfs to work.  We also disallow any kind of
 *        whitespace.
 */
bool dev_valid_name(const char *name)
{
        const char *c;

        /* Reject the empty string. */
        if (*name == '\0')
                return false;

        /* Reject names with no NUL within the first IFNAMSIZ bytes. */
        if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
                return false;

        /* "." and ".." are not usable as file names. */
        if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
                return false;

        /* No path separator, colon or whitespace anywhere in the name. */
        for (c = name; *c; c++) {
                if (*c == '/' || *c == ':' || isspace(*c))
                        return false;
        }

        return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *        __dev_alloc_name - allocate a name for a device
 *        @net: network namespace to allocate the device name in
 *        @name: name format string
 *        @res: result name string
 *
 *        Passed a format string - eg "lt%d" it will try and find a suitable
 *        id. It scans list of devices to build up a free map, then chooses
 *        the first empty slot. The caller must hold the dev_base or rtnl lock
 *        while allocating the name and adding the device in order to avoid
 *        duplicates.
 *        Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *        Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
        const int max_netdevices = 8*PAGE_SIZE;
        struct net_device *cur;
        unsigned long *inuse;
        char buf[IFNAMSIZ];
        const char *pct;
        int unit = 0;

        /* Verify the string as this thing may have come from the user.
         * There must be one "%d" and no other "%" characters.
         */
        pct = strchr(name, '%');
        if (!pct)
                return -EINVAL;
        if (pct[1] != 'd' || strchr(pct + 2, '%'))
                return -EINVAL;

        /* Bit array with one bit per candidate unit number */
        inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
        if (!inuse)
                return -ENOMEM;

        for_each_netdev(net, cur) {
                struct netdev_name_node *alt;

                /* Alternative names occupy unit numbers as well. */
                netdev_for_each_altname(cur, alt) {
                        if (sscanf(alt->name, name, &unit) &&
                            unit >= 0 && unit < max_netdevices) {
                                /* avoid cases where sscanf is not exact inverse of printf */
                                snprintf(buf, IFNAMSIZ, name, unit);
                                if (!strncmp(buf, alt->name, IFNAMSIZ))
                                        __set_bit(unit, inuse);
                        }
                }

                if (sscanf(cur->name, name, &unit) &&
                    unit >= 0 && unit < max_netdevices) {
                        /* avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, unit);
                        if (!strncmp(buf, cur->name, IFNAMSIZ))
                                __set_bit(unit, inuse);
                }
        }

        /* Lowest unit number not claimed by any scanned name. */
        unit = find_first_zero_bit(inuse, max_netdevices);
        bitmap_free(inuse);
        if (unit == max_netdevices)
                return -ENFILE;

        /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
        strscpy(buf, name, IFNAMSIZ);
        snprintf(res, IFNAMSIZ, buf, unit);
        return unit;
}

/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
static int dev_prep_valid_name(struct net *net, struct net_device *dev,
                               const char *want_name, char *out_name,
                               int dup_errno)
{
        if (!dev_valid_name(want_name))
                return -EINVAL;

        /* A '%' marks a format string: let __dev_alloc_name() pick the unit. */
        if (strchr(want_name, '%') != NULL)
                return __dev_alloc_name(net, want_name, out_name);

        /* Literal name: it must be free; the caller chooses the errno. */
        if (netdev_name_in_use(net, want_name))
                return -dup_errno;

        /* Nothing to copy when both pointers name the same buffer. */
        if (out_name == want_name)
                return 0;

        strscpy(out_name, want_name, IFNAMSIZ);
        return 0;
}

/**
 *        dev_alloc_name - allocate a name for a device
 *        @dev: device
 *        @name: name format string
 *
 *        Passed a format string - eg "lt%d" it will try and find a suitable
 *        id. It scans list of devices to build up a free map, then chooses
 *        the first empty slot. The caller must hold the dev_base or rtnl lock
 *        while allocating the name and adding the device in order to avoid
 *        duplicates.
 *        Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *        Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
        struct net *net = dev_net(dev);

        /* The result is written to dev->name; a name already in use
         * yields -ENFILE.
         */
        return dev_prep_valid_name(net, dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);

/*
 * Validate @name (or expand it if it is a format) into dev->name.
 * Returns 0 on success or a negative errno; a name already in use
 * yields -EEXIST. The unit number reported by dev_prep_valid_name()
 * is not passed on.
 */
static int dev_get_valid_name(struct net *net, struct net_device *dev,
                              const char *name)
{
        int unit = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);

        if (unit < 0)
                return unit;

        return 0;
}

/*
 * Rename @dev to @newname.
 *
 * The new name is validated and written to dev->name, then the change is
 * propagated via device_rename(), netdev_adjacent_rename_links(), the name
 * node and the NETDEV_CHANGENAME notifiers. If the notifiers reject the
 * change, the old name is put back and the same sequence is run once more
 * through the "rollback" label.
 *
 * Returns 0 on success (including when @newname equals the current name)
 * or a negative errno.
 */
int netif_change_name(struct net_device *dev, const char *newname)
{
        struct net *net = dev_net(dev);
        unsigned char old_assign_type;
        char oldname[IFNAMSIZ];
        int err = 0;
        int ret;

        ASSERT_RTNL_NET(net);

        /* Nothing to do when the name does not change. */
        if (!strncmp(newname, dev->name, IFNAMSIZ))
                return 0;

        /* Remember the current name so it can be restored on failure. */
        memcpy(oldname, dev->name, IFNAMSIZ);

        /* dev_get_valid_name() writes dev->name, so run it inside the
         * write side of the seqlock that netdev_copy_name() reads under.
         */
        write_seqlock_bh(&netdev_rename_lock);
        err = dev_get_valid_name(net, dev, newname);
        write_sequnlock_bh(&netdev_rename_lock);

        if (err < 0)
                return err;

        if (oldname[0] && !strchr(oldname, '%'))
                netdev_info(dev, "renamed from %s%s\n", oldname,
                            dev->flags & IFF_UP ? " (while UP)" : "");

        old_assign_type = dev->name_assign_type;
        WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);

        /* Entered once for the forward rename and, if the notifiers fail,
         * a second time with the old name already restored in dev->name.
         */
rollback:
        ret = device_rename(&dev->dev, dev->name);
        if (ret) {
                /* Put back the name and assign type saved before this pass. */
                write_seqlock_bh(&netdev_rename_lock);
                memcpy(dev->name, oldname, IFNAMSIZ);
                write_sequnlock_bh(&netdev_rename_lock);
                WRITE_ONCE(dev->name_assign_type, old_assign_type);
                return ret;
        }

        netdev_adjacent_rename_links(dev, oldname);

        /* Remove the name node, wait with synchronize_net(), then add it
         * back.
         */
        netdev_name_node_del(dev->name_node);

        synchronize_net();

        netdev_name_node_add(net, dev->name_node);

        ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
        ret = notifier_to_errno(ret);

        if (ret) {
                /* err >= 0 after dev_alloc_name() or stores the first errno */
                if (err >= 0) {
                        /* First failure: record it, restore the old name and
                         * swap the saved name/assign type so the second pass
                         * treats @newname as the name being replaced.
                         */
                        err = ret;
                        write_seqlock_bh(&netdev_rename_lock);
                        memcpy(dev->name, oldname, IFNAMSIZ);
                        write_sequnlock_bh(&netdev_rename_lock);
                        memcpy(oldname, newname, IFNAMSIZ);
                        WRITE_ONCE(dev->name_assign_type, old_assign_type);
                        old_assign_type = NET_NAME_RENAMED;
                        goto rollback;
                } else {
                        /* The notifiers failed on the rollback pass too; only
                         * log it, the first errno in err is what is returned.
                         */
                        netdev_err(dev, "name change rollback failed: %d\n",
                                   ret);
                }
        }

        return err;
}

/*
 * Replace the ifalias of @dev with the first @len bytes of @alias.
 * A @len of 0 installs a NULL alias.
 *
 * Returns @len on success, -EINVAL if @len >= IFALIASZ, or -ENOMEM if the
 * new alias cannot be allocated.
 */
int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        struct dev_ifalias *fresh = NULL;
        struct dev_ifalias *stale;

        if (len >= IFALIASZ)
                return -EINVAL;

        if (len != 0) {
                /* One extra byte for the terminating NUL added below. */
                fresh = kmalloc(sizeof(*fresh) + len + 1, GFP_KERNEL);
                if (!fresh)
                        return -ENOMEM;

                memcpy(fresh->ifalias, alias, len);
                fresh->ifalias[len] = 0;
        }

        /* Swap the pointer while holding ifalias_mutex. */
        mutex_lock(&ifalias_mutex);
        stale = rcu_replace_pointer(dev->ifalias, fresh,
                                    mutex_is_locked(&ifalias_mutex));
        mutex_unlock(&ifalias_mutex);

        /* The previous alias, if any, is freed through kfree_rcu(). */
        if (stale)
                kfree_rcu(stale, rcuhead);

        return len;
}

/**
 *        dev_get_alias - get ifalias of a device
 *        @dev: device
 *        @name: buffer to store name of ifalias
 *        @len: size of buffer
 *
 *        get ifalias for a device.  Caller must make sure dev cannot go
 *        away,  e.g. rcu read lock or own a reference count to device.
 */
int dev_get_alias(const struct net_device *dev, char *name, size_t len)
{
        const struct dev_ifalias *cur;
        int n = 0;

        rcu_read_lock();
        cur = rcu_dereference(dev->ifalias);

        /* With no alias set, @name is left untouched and 0 is returned;
         * otherwise the result of snprintf() is returned.
         */
        if (cur)
                n = snprintf(name, len, "%s", cur->ifalias);
        rcu_read_unlock();

        return n;
}

/**
 *        netdev_features_change - device changes features
 *        @dev: device to cause notification
 *
 *        Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
        /* Fire NETDEV_FEAT_CHANGE; the notifier result is not checked. */
        call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/*
 * Announce a state change of @dev: if the device is up, fire the
 * NETDEV_CHANGE notifiers and send an RTM_NEWLINK message. Nothing is
 * sent for a device that is not IFF_UP.
 */
void netif_state_change(struct net_device *dev)
{
        struct netdev_notifier_change_info change_info = {
                .info.dev = dev,
        };

        netdev_ops_assert_locked_or_invisible(dev);

        if (!(dev->flags & IFF_UP))
                return;

        call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
        rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}

/**
 * __netdev_notify_peers - notify network peers about existence of @dev,
 * to be called when rtnl lock is already held.
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void __netdev_notify_peers(struct net_device *dev)
{
        ASSERT_RTNL();
        /* Two events are fired in this order; their results are not checked. */
        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
        call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
}
EXPORT_SYMBOL(__netdev_notify_peers);

/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
        /* Take RTNL around __netdev_notify_peers(), which asserts it. */
        rtnl_lock();
        __netdev_notify_peers(dev);
        rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

/* Forward declaration: thread function handed to kthread_run() in
 * napi_kthread_create().
 */
static int napi_threaded_poll(void *data);

/*
 * Start the kthread for NAPI instance @n, named "napi/<dev name>-<napi id>".
 * On success n->thread holds the new task and 0 is returned; on failure
 * n->thread is reset to NULL and the error from kthread_run() is returned.
 */
static int napi_kthread_create(struct napi_struct *n)
{
        int err;

        /* Create and wake up the kthread once to put it in
         * TASK_INTERRUPTIBLE mode to avoid the blocked task
         * warning and work with loadavg.
         */
        n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
                                n->dev->name, n->napi_id);
        if (!IS_ERR(n->thread))
                return 0;

        err = PTR_ERR(n->thread);
        pr_err("kthread_run failed with err %d\n", err);
        n->thread = NULL;

        return err;
}

/*
 * Bring @dev up.
 *
 * @dev: device to open
 * @extack: netlink extended ack handed to the NETDEV_PRE_UP notifiers
 *
 * Runs the NETDEV_PRE_UP notifiers, sets __LINK_STATE_START, then calls the
 * driver's ndo_validate_addr and ndo_open (when provided). On success the
 * device is marked up, its rx mode is set and it is activated. On failure
 * of the driver callbacks __LINK_STATE_START is cleared again.
 *
 * The caller must hold RTNL (asserted). netpoll is disabled for the
 * duration of the open and re-enabled on every exit path taken after it
 * was disabled.
 *
 * Returns 0 on success, -ENODEV if the device is still not present after
 * resuming its parent, or the error reported by the PRE_UP notifiers,
 * ndo_validate_addr or ndo_open.
 */
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int ret;

        ASSERT_RTNL();
        dev_addr_check(dev);

        if (!netif_device_present(dev)) {
                /* may be detached because parent is runtime-suspended */
                if (dev->dev.parent)
                        pm_runtime_resume(dev->dev.parent);
                if (!netif_device_present(dev))
                        return -ENODEV;
        }

        /* Block netpoll from trying to do any rx path servicing.
         * If we don't do this there is a chance ndo_poll_controller
         * or ndo_poll may be running while we open the device
         */
        netpoll_poll_disable(dev);

        ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* Balance the netpoll_poll_disable() above; without this
                 * a vetoed open would leave netpoll disabled for @dev.
                 */
                netpoll_poll_enable(dev);
                return ret;
        }

        set_bit(__LINK_STATE_START, &dev->state);

        netdev_ops_assert_locked(dev);

        if (ops->ndo_validate_addr)
                ret = ops->ndo_validate_addr(dev);

        /* Only call into ndo_open if address validation did not fail. */
        if (!ret && ops->ndo_open)
                ret = ops->ndo_open(dev);

        netpoll_poll_enable(dev);

        if (ret)
                clear_bit(__LINK_STATE_START, &dev->state);
        else {
                netif_set_up(dev, true);
                dev_set_rx_mode(dev);
                dev_activate(dev);
                add_device_randomness(dev->dev_addr, dev->addr_len);
        }

        return ret;
}

/*
 * Open @dev unless it is already up.
 *
 * Returns 0 if the device was already IFF_UP. Otherwise returns the result
 * of __dev_open(); when that is not negative, an RTM_NEWLINK message is
 * sent and the NETDEV_UP notifiers are called.
 */
int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        int ret;

        if (dev->flags & IFF_UP)
                return 0;

        ret = __dev_open(dev, extack);
        if (ret >= 0) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING,
                             GFP_KERNEL, 0, NULL);
                call_netdevice_notifiers(NETDEV_UP, dev);
        }

        return ret;
}

/*
 * Close every device linked on @head through its close_list member.
 *
 * Works in three phases: (1) per device, disable netpoll, fire
 * NETDEV_GOING_DOWN and clear __LINK_STATE_START; (2) dev_deactivate_many()
 * on the whole list; (3) per device, call ndo_stop (if provided), mark the
 * device down and re-enable netpoll.
 *
 * The caller must hold RTNL (asserted); the function may sleep.
 */
static void __dev_close_many(struct list_head *head)
{
        struct net_device *dev;

        ASSERT_RTNL();
        might_sleep();

        list_for_each_entry(dev, head, close_list) {
                /* Temporarily disable netpoll until the interface is down */
                netpoll_poll_disable(dev);

                call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

                clear_bit(__LINK_STATE_START, &dev->state);

                /* Synchronize to scheduled poll. We cannot touch poll list, it
                 * can be even on different cpu. So just clear netif_running().
                 *
                 * dev->stop() will invoke napi_disable() on all of its
                 * napi_struct instances on this device.
                 */
                smp_mb__after_atomic(); /* Commit netif_running(). */
        }

        dev_deactivate_many(head);

        list_for_each_entry(dev, head, close_list) {
                const struct net_device_ops *ops = dev->netdev_ops;

                /*
                 *        Call the device specific close. This cannot fail.
                 *        Only if device is UP
                 *
                 *        We allow it to be called even after a DETACH hot-plug
                 *        event.
                 */

                netdev_ops_assert_locked(dev);

                if (ops->ndo_stop)
                        ops->ndo_stop(dev);

                netif_set_up(dev, false);
                /* Pairs with the netpoll_poll_disable() in the first loop. */
                netpoll_poll_enable(dev);
        }
}

/* Close a single device by running __dev_close_many() on a one-entry list. */
static void __dev_close(struct net_device *dev)
{
        LIST_HEAD(single);

        list_add(&dev->close_list, &single);
        __dev_close_many(&single);
        /* Unlink the on-stack list head before it goes out of scope. */
        list_del(&single);
}

/*
 * Close all devices linked on @head through close_list.
 *
 * @head: list of devices to close
 * @unlink: when true, each closed device is removed from @head afterwards
 *
 * Devices that are not IFF_UP are dropped from the list first. Each
 * remaining device gets an RTM_NEWLINK message and a NETDEV_DOWN
 * notification after it has been closed.
 */
void dev_close_many(struct list_head *head, bool unlink)
{
        struct net_device *dev, *tmp;

        /* Remove the devices that don't need to be closed */
        list_for_each_entry_safe(dev, tmp, head, close_list) {
                if (dev->flags & IFF_UP)
                        continue;

                list_del_init(&dev->close_list);
        }

        __dev_close_many(head);

        list_for_each_entry_safe(dev, tmp, head, close_list) {
                rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING,
                             GFP_KERNEL, 0, NULL);
                call_netdevice_notifiers(NETDEV_DOWN, dev);

                if (!unlink)
                        continue;

                list_del_init(&dev->close_list);
        }
}
EXPORT_SYMBOL(dev_close_many);

/*
 * Close @dev if it is up, by running dev_close_many() on a one-entry list.
 * Does nothing for a device that is not IFF_UP.
 */
void netif_close(struct net_device *dev)
{
        LIST_HEAD(single);

        if (!(dev->flags & IFF_UP))
                return;

        list_add(&dev->close_list, &single);
        dev_close_many(&single, true);
        list_del(&single);
}
EXPORT_SYMBOL(netif_close);

/*
 * Turn off NETIF_F_LRO on @dev and, recursively, on every lower device.
 * A warning is emitted if the feature is still set after the update.
 */
void netif_disable_lro(struct net_device *dev)
{
        struct net_device *lower;
        struct list_head *iter;

        /* Stop requesting LRO, then recompute the active feature set. */
        dev->wanted_features &= ~NETIF_F_LRO;
        netdev_update_features(dev);

        if (unlikely(dev->features & NETIF_F_LRO))
                netdev_WARN(dev, "failed to disable LRO!\n");

        /* Each lower device is handled under its own ops lock. */
        netdev_for_each_lower_dev(dev, lower, iter) {
                netdev_lock_ops(lower);
                netif_disable_lro(lower);
                netdev_unlock_ops(lower);
        }
}
EXPORT_IPV6_MOD(netif_disable_lro);

/**
 *        dev_disable_gro_hw - disable HW Generic Receive Offload on a device
 *        @dev: device
 *
 *        Disable HW Generic Receive Offload (GRO_HW) on a net device.  Must be
 *        called under RTNL.  This is needed if Generic XDP is installed on
 *        the device.
 */
static void dev_disable_gro_hw(struct net_device *dev)
{
        /* Stop requesting GRO_HW, then recompute the active feature set. */
        dev->wanted_features &= ~NETIF_F_GRO_HW;
        netdev_update_features(dev);

        /* Warn if the feature is still set after the update. */
        if (unlikely(dev->features & NETIF_F_GRO_HW))
                netdev_WARN(dev, "failed to disable GRO_HW!\n");
}

/*
 * Map a netdev notifier event to its name as a string literal, e.g.
 * NETDEV_UP -> "NETDEV_UP". Events not listed in the switch yield
 * "UNKNOWN_NETDEV_EVENT".
 */
const char *netdev_cmd_to_name(enum netdev_cmd cmd)
{
/* N(val) expands to a case label for NETDEV_<val> returning its name. */
#define N(val)                                                 \
        case NETDEV_##val:                                \
                return "NETDEV_" __stringify(val);
        switch (cmd) {
        N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
        N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
        N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
        N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
        N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
        N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
        N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
        N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
        N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
        N(XDP_FEAT_CHANGE)
        }
#undef N
        /* Reached only when no case above matched. */
        return "UNKNOWN_NETDEV_EVENT";
}
EXPORT_SYMBOL_GPL(netdev_cmd_to_name);

/*
 * Deliver event @val for @dev to the single notifier block @nb and return
 * the callback's result unchanged.
 */
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
                                   struct net_device *dev)
{
        struct netdev_notifier_info info = { .dev = dev };

        return nb->notifier_call(nb, val, &info);
}

/*
 * Replay registration of @dev to @nb: NETDEV_REGISTER always, followed by
 * NETDEV_UP when the device is up.
 *
 * Returns the errno from the NETDEV_REGISTER callback, or 0. The result of
 * the NETDEV_UP callback is not checked.
 */
static int call_netdevice_register_notifiers(struct notifier_block *nb,
                                             struct net_device *dev)
{
        int err;

        err = notifier_to_errno(call_netdevice_notifier(nb, NETDEV_REGISTER,
                                                        dev));
        if (err)
                return err;

        if (dev->flags & IFF_UP)
                call_netdevice_notifier(nb, NETDEV_UP, dev);

        return 0;
}

/*
 * Replay removal of @dev to @nb: NETDEV_GOING_DOWN and NETDEV_DOWN when
 * the device is up, then NETDEV_UNREGISTER in every case.
 */
static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
                                                struct net_device *dev)
{
        const bool is_up = dev->flags & IFF_UP;

        if (is_up) {
                call_netdevice_notifier(nb, NETDEV_GOING_DOWN, dev);
                call_netdevice_notifier(nb, NETDEV_DOWN, dev);
        }

        call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
}

/*
 * Replay registration events for every device of @net to @nb.
 *
 * If a device's replay fails, the devices walked before it get the
 * unregister replay (in reverse order) and the error is returned.
 * Returns 0 when all devices were replayed.
 */
static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
                                                 struct net *net)
{
        struct net_device *dev;
        int err;

        for_each_netdev(net, dev) {
                netdev_lock_ops(dev);
                err = call_netdevice_register_notifiers(nb, dev);
                netdev_unlock_ops(dev);
                if (!err)
                        continue;

                /* Unwind, continuing backwards from the failing device. */
                for_each_netdev_continue_reverse(net, dev)
                        call_netdevice_unregister_notifiers(nb, dev);
                return err;
        }

        return 0;
}

static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
                                                    struct net *net)
{
        struct net_device *dev;

        for_each_netdev(net, dev)
                call_netdevice_unregister_notifiers(nb, dev);
}

/* Starts out as 1. While it is non-zero, notifier registration only links
 * the notifier block and skips replaying events for existing devices.
 */
static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow device to have a race free
 * view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
        struct net *net;
        int err;

        /* Close race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);

        /* When RTNL is removed, we need protection for netdev_chain. */
        rtnl_lock();

        err = raw_notifier_chain_register(&netdev_chain, nb);
        if (err)
                goto unlock;
        /* During boot phase only link the notifier; no events are replayed. */
        if (dev_boot_phase)
                goto unlock;
        /* Replay register/up events for each namespace under its own lock. */
        for_each_net(net) {
                __rtnl_net_lock(net);
                err = call_netdevice_register_net_notifiers(nb, net);
                __rtnl_net_unlock(net);
                if (err)
                        goto rollback;
        }

        /* Common exit: err is 0 on success or the first error seen. */
unlock:
        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
        return err;

        /* Undo the replay for the namespaces walked before the failing one
         * (the failing namespace unwinds its own devices), then unlink the
         * notifier again.
         */
rollback:
        for_each_net_continue_reverse(net) {
                __rtnl_net_lock(net);
                call_netdevice_unregister_net_notifiers(nb, net);
                __rtnl_net_unlock(net);
        }

        raw_notifier_chain_unregister(&netdev_chain, nb);
        goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
        struct net *ns;
        int ret;

        /* Close race with setup_net() and cleanup_net() */
        down_write(&pernet_ops_rwsem);
        rtnl_lock();

        ret = raw_notifier_chain_unregister(&netdev_chain, nb);
        if (!ret) {
                /*
                 * nb is off the global chain: issue the unregister calls
                 * for every netns, each under that netns' own lock.
                 */
                for_each_net(ns) {
                        __rtnl_net_lock(ns);
                        call_netdevice_unregister_net_notifiers(nb, ns);
                        __rtnl_net_unlock(ns);
                }
        }

        rtnl_unlock();
        up_write(&pernet_ops_rwsem);
        return ret;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/*
 * Link @nb into @net's notifier chain and, unless dev_boot_phase is set,
 * replay events to it via call_netdevice_register_net_notifiers().
 *
 * If the replay fails and @ignore_call_fail is false, @nb is unlinked again
 * and the replay error is returned. With @ignore_call_fail true a replay
 * failure is swallowed and 0 is returned. A chain registration failure is
 * always returned as is. No locking is done here.
 */
static int __register_netdevice_notifier_net(struct net *net,
                                             struct notifier_block *nb,
                                             bool ignore_call_fail)
{
        int ret = raw_notifier_chain_register(&net->netdev_chain, nb);

        if (ret)
                return ret;
        if (dev_boot_phase)
                return 0;

        ret = call_netdevice_register_net_notifiers(nb, net);
        if (!ret || ignore_call_fail)
                return 0;

        /* Replay failed and the caller cares: take nb off the chain again. */
        raw_notifier_chain_unregister(&net->netdev_chain, nb);
        return ret;
}

/*
 * Unlink @nb from @net's notifier chain and, if that succeeded, issue the
 * unregister calls for @net to it. Returns the result of the chain
 * unregistration. No locking is done here.
 */
static int __unregister_netdevice_notifier_net(struct net *net,
                                               struct notifier_block *nb)
{
        int ret = raw_notifier_chain_unregister(&net->netdev_chain, nb);

        if (!ret)
                call_netdevice_unregister_net_notifiers(nb, net);

        return ret;
}

/**
 * register_netdevice_notifier_net - register a per-netns network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered all registration and up events are replayed
 * to the new notifier to allow it to have a race-free
 * view of the network device list.
 */

int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
        int err;

        /* The registration helper does no locking; hold @net's lock here. */
        rtnl_net_lock(net);
        /* false: a failed replay unlinks nb again and the error is returned. */
        err = __register_netdevice_notifier_net(net, nb, false);
        rtnl_net_unlock(net);

        return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);

/**
 * unregister_netdevice_notifier_net - unregister a per-netns
 *                                     network notifier block
 * @net: network namespace
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier_net(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
 */

int unregister_netdevice_notifier_net(struct net *net,
                                      struct notifier_block *nb)
{
        int err;

        /* The unregistration helper does no locking; hold @net's lock here. */
        rtnl_net_lock(net);
        err = __unregister_netdevice_notifier_net(net, nb);
        rtnl_net_unlock(net);

        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);

/*
 * Move @nb from @src_net's notifier chain to @dst_net's.
 *
 * The return values of both steps are ignored. Registration passes
 * ignore_call_fail == true, so a failed replay on @dst_net does not unlink
 * @nb again. No locking is done here.
 */
static void __move_netdevice_notifier_net(struct net *src_net,
                                          struct net *dst_net,
                                          struct notifier_block *nb)
{
        __unregister_netdevice_notifier_net(src_net, nb);
        __register_netdevice_notifier_net(dst_net, nb, true);
}

/*
 * Lock the netns that @dev currently belongs to and take a passive
 * reference on it. Paired with rtnl_net_dev_unlock().
 *
 * With CONFIG_NET_NS the device may change netns between the lookup and the
 * lock acquisition; in that case the lock and the reference are dropped and
 * the whole sequence is retried.
 */
static void rtnl_net_dev_lock(struct net_device *dev)
{
        struct net *net;

        for (;;) {
                /* netns might be being dismantled. */
                rcu_read_lock();
                net = dev_net_rcu(dev);
                net_passive_inc(net);
                rcu_read_unlock();

                rtnl_net_lock(net);

#ifdef CONFIG_NET_NS
                /* dev might have been moved to another netns. */
                if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
                        rtnl_net_unlock(net);
                        net_passive_dec(net);
                        continue;
                }
#endif
                break;
        }
}

/*
 * Counterpart of rtnl_net_dev_lock(): unlock @dev's netns and drop the
 * passive reference on it.
 */
static void rtnl_net_dev_unlock(struct net_device *dev)
{
        /* Looked up once and used for both the unlock and the decrement. */
        struct net *net = dev_net(dev);

        rtnl_net_unlock(net);
        net_passive_dec(net);
}

/*
 * Register @nb on the notifier chain of the netns @dev currently lives in
 * and, on success, record it in @nn and link @nn into
 * dev->net_notifier_list (the list walked by
 * move_netdevice_notifiers_dev_net()).
 *
 * Returns 0 on success or the error from the registration; on error @nn is
 * left untouched.
 */
int register_netdevice_notifier_dev_net(struct net_device *dev,
                                        struct notifier_block *nb,
                                        struct netdev_net_notifier *nn)
{
        int ret;

        rtnl_net_dev_lock(dev);

        ret = __register_netdevice_notifier_net(dev_net(dev), nb, false);
        if (ret)
                goto out_unlock;

        nn->nb = nb;
        list_add(&nn->list, &dev->net_notifier_list);

out_unlock:
        rtnl_net_dev_unlock(dev);
        return ret;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);

/*
 * Undo register_netdevice_notifier_dev_net(): unlink @nn from its list and
 * unregister @nb from the notifier chain of @dev's current netns.
 *
 * Returns the result of the chain unregistration. @nn is unlinked
 * unconditionally, before the unregistration is attempted.
 */
int unregister_netdevice_notifier_dev_net(struct net_device *dev,
                                          struct notifier_block *nb,
                                          struct netdev_net_notifier *nn)
{
        int err;

        rtnl_net_dev_lock(dev);
        list_del(&nn->list);
        err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
        rtnl_net_dev_unlock(dev);

        return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);

/*
 * Move every notifier recorded on dev->net_notifier_list from @dev's
 * current netns to @net. No locking is done here.
 */
static void move_netdevice_notifiers_dev_net(struct net_device *dev,
                                             struct net *net)
{
        struct netdev_net_notifier *nn;

        list_for_each_entry(nn, &dev->net_notifier_list, list)
                __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
}

/**
 *        call_netdevice_notifiers_info - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @info: notifier information data
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers_info(unsigned long val,
                                  struct netdev_notifier_info *info)
{
        struct net *net = dev_net(info->dev);
        int ret;

        ASSERT_RTNL();

        /* Run per-netns notifier block chain first, then run the global one.
         * Hopefully, one day, the global one is going to be removed after
         * all notifier block registrators get converted to be per-netns.
         */
        ret = raw_notifier_call_chain(&net->netdev_chain, val, info);

        /* The global chain only runs if the per-netns one did not ask to stop. */
        if (!(ret & NOTIFY_STOP_MASK))
                ret = raw_notifier_call_chain(&netdev_chain, val, info);

        return ret;
}

/**
 *        call_netdevice_notifiers_info_robust - call per-netns notifier blocks
 *                                               and roll back on error
 *        @val_up: value passed unmodified to notifier function
 *        @val_down: value passed unmodified to the notifier function when
 *                   recovering from an error on @val_up
 *        @info: notifier information data
 *
 *        Call all per-netns network notifier blocks, but not notifier blocks on
 *        the global notifier chain. Parameters and return value are as for
 *        raw_notifier_call_chain_robust().
 */

static int
call_netdevice_notifiers_info_robust(unsigned long val_up,
                                     unsigned long val_down,
                                     struct netdev_notifier_info *info)
{
        /* Only the chain of the netns owning info->dev is called. */
        struct net *net = dev_net(info->dev);

        ASSERT_RTNL();

        return raw_notifier_call_chain_robust(&net->netdev_chain,
                                              val_up, val_down, info);
}

/*
 * Build a netdev_notifier_info for @dev carrying @extack and pass it to
 * call_netdevice_notifiers_info() together with @val. Returns that
 * function's result.
 */
static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_info info = {
                .dev = dev,
                .extack = extack,
        };

        return call_netdevice_notifiers_info(val, &info);
}

/**
 *        call_netdevice_notifiers - call all network notifier blocks
 *      @val: value passed unmodified to notifier function
 *      @dev: net_device pointer passed unmodified to notifier function
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
        /* Same as the extack variant, with no extended ack supplied. */
        return call_netdevice_notifiers_extack(val, dev, NULL);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *        call_netdevice_notifiers_mtu - call all network notifier blocks
 *        @val: value passed unmodified to notifier function
 *        @dev: net_device pointer passed unmodified to notifier function
 *        @arg: additional u32 argument passed to the notifier function
 *
 *        Call all network notifier blocks.  Parameters and return value
 *        are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
                                        struct net_device *dev, u32 arg)
{
        struct netdev_notifier_info_ext info = {
                .info = { .dev = dev },
                .ext = { .mtu = arg },
        };

        /*
         * Only the embedded base info is handed to the chain, so it must
         * sit at offset 0 of the extended structure.
         */
        BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

        return call_netdevice_notifiers_info(val, &info.info);
}

#ifdef CONFIG_NET_INGRESS
/* Static key, off by default, counted up/down by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);

/* Take one count on ingress_needed_key. */
void net_inc_ingress_queue(void)
{
        static_branch_inc(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

/* Drop one count on ingress_needed_key. */
void net_dec_ingress_queue(void)
{
        static_branch_dec(&ingress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif

#ifdef CONFIG_NET_EGRESS
/* Static key, off by default, counted up/down by the two helpers below. */
static DEFINE_STATIC_KEY_FALSE(egress_needed_key);

/* Take one count on egress_needed_key. */
void net_inc_egress_queue(void)
{
        static_branch_inc(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_inc_egress_queue);

/* Drop one count on egress_needed_key. */
void net_dec_egress_queue(void)
{
        static_branch_dec(&egress_needed_key);
}
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif

#ifdef CONFIG_NET_CLS_ACT
/* Exported static key, off by default; not otherwise used in this part of the file. */
DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
EXPORT_SYMBOL(tcf_sw_enabled_key);
#endif

/*
 * Exported static key, off by default, gating packet timestamping: tested
 * by net_timestamp_set() and net_timestamp_check(), and switched through
 * net_enable_timestamp() / net_disable_timestamp().
 */
DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
EXPORT_SYMBOL(netstamp_needed_key);
#ifdef CONFIG_JUMP_LABEL
/* Adjustments queued by net_{enable,disable}_timestamp() for the work item. */
static atomic_t netstamp_needed_deferred;
/* Current number of timestamp users. */
static atomic_t netstamp_wanted;

/*
 * Work handler: fold the queued adjustments into netstamp_wanted and set
 * netstamp_needed_key to match (enabled while the count is positive).
 */
static void netstamp_clear(struct work_struct *work)
{
        int pending = atomic_xchg(&netstamp_needed_deferred, 0);

        if (atomic_add_return(pending, &netstamp_wanted) > 0)
                static_branch_enable(&netstamp_needed_key);
        else
                static_branch_disable(&netstamp_needed_key);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

/*
 * Add one timestamp user.
 *
 * With CONFIG_JUMP_LABEL: if the user count is already positive it is
 * bumped in place with a cmpxchg loop; otherwise the increment is queued
 * in netstamp_needed_deferred and netstamp_work is scheduled to apply it.
 * Without CONFIG_JUMP_LABEL the static key is incremented directly.
 */
void net_enable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
        int cur;

        for (cur = atomic_read(&netstamp_wanted); cur > 0; ) {
                if (atomic_try_cmpxchg(&netstamp_wanted, &cur, cur + 1))
                        return;
        }

        atomic_inc(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_branch_inc(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

/*
 * Remove one timestamp user.
 *
 * With CONFIG_JUMP_LABEL: while the user count is above one it is lowered
 * in place with a cmpxchg loop; otherwise the decrement is queued in
 * netstamp_needed_deferred and netstamp_work is scheduled to apply it.
 * Without CONFIG_JUMP_LABEL the static key is decremented directly.
 */
void net_disable_timestamp(void)
{
#ifdef CONFIG_JUMP_LABEL
        int cur;

        for (cur = atomic_read(&netstamp_wanted); cur > 1; ) {
                if (atomic_try_cmpxchg(&netstamp_wanted, &cur, cur - 1))
                        return;
        }

        atomic_dec(&netstamp_needed_deferred);
        schedule_work(&netstamp_work);
#else
        static_branch_dec(&netstamp_needed_key);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);

/*
 * Reset @skb's timestamp: the type becomes SKB_CLOCK_REALTIME and the value
 * becomes the current real time if netstamp_needed_key is enabled, or 0
 * otherwise.
 */
static inline void net_timestamp_set(struct sk_buff *skb)
{
        skb->tstamp_type = SKB_CLOCK_REALTIME;

        if (static_branch_unlikely(&netstamp_needed_key))
                skb->tstamp = ktime_get_real();
        else
                skb->tstamp = 0;
}

/*
 * net_timestamp_check - stamp @SKB if timestamping is on and it is unstamped
 * @COND: extra condition; evaluated only when netstamp_needed_key is enabled
 * @SKB:  skb pointer expression; may be evaluated more than once
 *
 * Sets (SKB)->tstamp to ktime_get_real() when netstamp_needed_key is
 * enabled, @COND is true and the skb carries no timestamp yet.
 *
 * Wrapped in do { } while (0) so the expansion is exactly one statement:
 * it needs a terminating semicolon and is safe in unbraced if/else bodies.
 */
#define net_timestamp_check(COND, SKB)                                        \
        do {                                                                \
                if (static_branch_unlikely(&netstamp_needed_key)) {        \
                        if ((COND) && !(SKB)->tstamp)                        \
                                (SKB)->tstamp = ktime_get_real();        \
                }                                                        \
        } while (0)

/*
 * Exported wrapper around __is_skb_forwardable() with its third argument
 * fixed to true (presumably the MTU check flag - confirm against the
 * helper's declaration).
 */
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
        return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

/*
 * Run ____dev_forward_skb() on @skb for @dev and, if it returns 0, set
 * skb->protocol from eth_type_trans() and adjust the receive checksum for
 * the ETH_HLEN bytes at eth_hdr(skb).
 *
 * Returns 0 on success, otherwise the non-zero result of
 * ____dev_forward_skb(); in that case @skb is not touched further here.
 */
static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
                              bool check_mtu)
{
        int ret;

        ret = ____dev_forward_skb(dev, skb, check_mtu);
        if (unlikely(ret))
                return ret;

        skb->protocol = eth_type_trans(skb, dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

        return 0;
}

/* Same as __dev_forward_skb2() with check_mtu fixed to true. */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        return __dev_forward_skb2(dev, skb, true);
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *        NET_RX_SUCCESS        (no congestion)
 *        NET_RX_DROP     (packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
        int ret = __dev_forward_skb(dev, skb);

        /* Only hand the skb to the receive path if forwarding prep passed. */
        if (ret)
                return ret;

        return netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

/*
 * Variant of dev_forward_skb() that passes check_mtu == false to
 * __dev_forward_skb2(). Returns its non-zero result on failure, otherwise
 * the result of netif_rx_internal().
 */
int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
{
        int ret = __dev_forward_skb2(dev, skb, false);

        if (ret)
                return ret;

        return netif_rx_internal(skb);
}

/*
 * Hand @skb to the handler of @pt_prev while the caller keeps using it.
 *
 * Returns -ENOMEM if skb_orphan_frags_rx() fails (the handler is then not
 * called and no reference is taken); otherwise takes an extra reference on
 * @skb and returns the handler's result.
 */
static inline int deliver_skb(struct sk_buff *skb,
                              struct packet_type *pt_prev,
                              struct net_device *orig_dev)
{
        if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                return -ENOMEM;
        /* Extra reference for the handler; the caller's own one is kept. */
        refcount_inc(&skb->users);
        return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

/*
 * Walk @ptype_list and deliver @skb to the handlers whose type equals
 * @type, always running one handler behind: each match first delivers to
 * the previously pending handler (initially *@pt) and then becomes the
 * pending one itself. The last pending handler is stored back in *@pt and
 * is NOT delivered to here.
 */
static inline void deliver_ptype_list_skb(struct sk_buff *skb,
                                          struct packet_type **pt,
                                          struct net_device *orig_dev,
                                          __be16 type,
                                          struct list_head *ptype_list)
{
        struct packet_type *pending = *pt;
        struct packet_type *cur;

        list_for_each_entry_rcu(cur, ptype_list, list) {
                if (cur->type == type) {
                        if (pending)
                                deliver_skb(skb, pending, orig_dev);
                        pending = cur;
                }
        }

        *pt = pending;
}

/*
 * Tell whether @skb was sent by the socket behind @ptype.
 *
 * Returns false when @ptype has no af_packet_priv or @skb has no socket.
 * Otherwise defers to ptype->id_match() if set, else compares
 * af_packet_priv with skb->sk directly.
 */
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
        if (!ptype->af_packet_priv || !skb->sk)
                return false;

        if (ptype->id_match)
                return ptype->id_match(ptype, skb->sk);

        return (struct sock *)ptype->af_packet_priv == skb->sk;
}

/**
 * dev_nit_active_rcu - return true if any network interface taps are in use
 *
 * The caller must hold the RCU lock
 *
 * @dev: network device to check for the presence of taps
 */
bool dev_nit_active_rcu(const struct net_device *dev)
{
        /* Callers may hold either RCU or RCU BH lock */
        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

        return !list_empty(&dev_net(dev)->ptype_all) ||
               !list_empty(&dev->ptype_all);
}
EXPORT_SYMBOL_GPL(dev_nit_active_rcu);

/*
 *        Support routine. Sends outgoing frames to any network
 *        taps currently in use.
 */

void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
        /* pt_prev: tap whose delivery is still pending (one behind ptype). */
        struct packet_type *ptype, *pt_prev = NULL;
        struct list_head *ptype_list;
        /* skb2: the single clone of skb that all taps receive. */
        struct sk_buff *skb2 = NULL;

        rcu_read_lock();
        /* First pass: taps of the device's netns; second pass: dev's own. */
        ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
        list_for_each_entry_rcu(ptype, ptype_list, list) {
                if (READ_ONCE(ptype->ignore_outgoing))
                        continue;

                /* Never send packets back to the socket
                 * they originated from - MvS (miquels@drinkel.ow.org)
                 */
                if (skb_loop_sk(ptype, skb))
                        continue;

                /*
                 * A tap is already pending, so skb2 exists: deliver to the
                 * pending tap and make this one pending instead.
                 */
                if (pt_prev) {
                        deliver_skb(skb2, pt_prev, skb->dev);
                        pt_prev = ptype;
                        continue;
                }

                /* need to clone skb, done only once */
                skb2 = skb_clone(skb, GFP_ATOMIC);
                /* pt_prev is NULL here, so out_unlock delivers nothing. */
                if (!skb2)
                        goto out_unlock;

                net_timestamp_set(skb2);

                /* skb->nh should be correctly
                 * set by sender, so that the second statement is
                 * just protection against buggy protocols.
                 */
                skb_reset_mac_header(skb2);

                if (skb_network_header(skb2) < skb2->data ||
                    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
                        net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
                                             ntohs(skb2->protocol),
                                             dev->name);
                        skb_reset_network_header(skb2);
                }

                skb2->transport_header = skb2->network_header;
                skb2->pkt_type = PACKET_OUTGOING;
                pt_prev = ptype;
        }

        /* Switch to the per-device tap list once, then fall through. */
        if (ptype_list != &dev->ptype_all) {
                ptype_list = &dev->ptype_all;
                goto again;
        }
out_unlock:
        /*
         * Final delivery: the last pending tap gets skb2 without an extra
         * reference (unlike deliver_skb()); if orphaning the frags fails
         * the clone is freed instead.
         */
        if (pt_prev) {
                if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
                        pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
                else
                        kfree_skb(skb2);
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
        struct netdev_tc_txq *range = &dev->tc_to_txq[0];
        int prio;

        /* If TC0 is invalidated disable TC mapping */
        if (range->offset + range->count > txq) {
                netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
                dev->num_tc = 0;
                return;
        }

        /* Invalidated prio to tc mappings set to TC0 */
        for (prio = 1; prio <= TC_BITMASK; prio++) {
                int tc_idx = netdev_get_prio_tc_map(dev, prio);

                range = &dev->tc_to_txq[tc_idx];
                /* Queue range of this TC still fits below txq: keep it. */
                if (range->offset + range->count <= txq)
                        continue;

                netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
                            prio, tc_idx);
                netdev_set_prio_tc_map(dev, prio, 0);
        }
}

int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
{
        if (dev->num_tc) {
                struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
                int i;

                /* walk through the TCs and see if it falls into any of them */
                for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
                        if ((txq - tc->offset) < tc->count)
                                return i;
                }

                /* didn't find it, just return -1 to indicate no match */
                return -1;
        }

        return 0;
}
EXPORT_SYMBOL(netdev_txq_to_tc);

#ifdef CONFIG_XPS
/*
 * xps_needed: incremented when a device gets its first map of a type and
 * decremented in reset_xps_maps(); tested in netif_reset_xps_queues().
 * xps_rxqs_needed: the same, but only for XPS_RXQS maps.
 */
static struct static_key xps_needed __read_mostly;
static struct static_key xps_rxqs_needed __read_mostly;
/* Held around all XPS map updates in this file. */
static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an RCU-protected XPS pointer; xps_map_mutex must be held. */
#define xmap_dereference(P)                \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

/*
 * Remove tx queue @index from the map at dev_maps->attr_map[@tci].
 *
 * If @index is the only entry, the map itself is released: the slot is
 * cleared in @dev_maps (and in @old_maps when given), the map is freed via
 * kfree_rcu() and false is returned. False is also returned when the slot
 * holds no map. In all other cases (queue removed from a larger map, or not
 * present at all) the map stays in place and true is returned.
 */
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
                             struct xps_dev_maps *old_maps, int tci, u16 index)
{
        struct xps_map *map = xmap_dereference(dev_maps->attr_map[tci]);
        int pos;

        if (!map)
                return false;

        /* Scan from the last entry down to the first. */
        pos = map->len;
        while (pos--) {
                if (map->queues[pos] == index) {
                        if (map->len <= 1) {
                                if (old_maps)
                                        RCU_INIT_POINTER(old_maps->attr_map[tci],
                                                         NULL);
                                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                                kfree_rcu(map, rcu);
                                return false;
                        }

                        /* Fill the hole with the last entry and shrink. */
                        map->queues[pos] = map->queues[--map->len];
                        break;
                }
        }

        return true;
}

/*
 * Remove tx queues @offset .. @offset + @count - 1 from every per-TC map
 * that belongs to attribute index @cpu in @dev_maps.
 *
 * Returns true if, for at least one traffic class, every
 * remove_xps_queue() call over the queue range returned true (i.e. that
 * class's map was not released). @dev is not used here.
 */
static bool remove_xps_queue_cpu(struct net_device *dev,
                                 struct xps_dev_maps *dev_maps,
                                 int cpu, u16 offset, u16 count)
{
        /* Local copy; consumed as the outer loop's countdown. */
        int num_tc = dev_maps->num_tc;
        bool active = false;
        int tci;

        /* The maps of one attribute index are num_tc consecutive slots. */
        for (tci = cpu * num_tc; num_tc--; tci++) {
                int i, j;

                /* Stop early for this TC once its map has been released. */
                for (i = count, j = offset; i--; j++) {
                        if (!remove_xps_queue(dev_maps, NULL, tci, j))
                                break;
                }

                /* i only reaches -1 when the loop ran to completion. */
                active |= i < 0;
        }

        return active;
}

/*
 * Drop @dev's XPS maps of the given @type: decrement the static key(s)
 * (xps_needed always, xps_rxqs_needed for XPS_RXQS), clear
 * dev->xps_maps[@type] and free @dev_maps via kfree_rcu().
 *
 * Uses the _cpuslocked static key variants - presumably the caller must
 * hold cpus_read_lock(), as netif_reset_xps_queues() does.
 */
static void reset_xps_maps(struct net_device *dev,
                           struct xps_dev_maps *dev_maps,
                           enum xps_map_type type)
{
        static_key_slow_dec_cpuslocked(&xps_needed);
        if (type == XPS_RXQS)
                static_key_slow_dec_cpuslocked(&xps_rxqs_needed);

        RCU_INIT_POINTER(dev->xps_maps[type], NULL);

        kfree_rcu(dev_maps, rcu);
}

/*
 * Remove tx queues @offset .. @offset + @count - 1 from all of @dev's XPS
 * maps of the given @type. If no map remains in use afterwards the whole
 * map set is dropped with reset_xps_maps(). For XPS_CPUS the NUMA node of
 * each affected tx queue is additionally reset to NUMA_NO_NODE.
 *
 * Does nothing if @dev has no maps of this type. Dereferences the maps
 * with xmap_dereference(), so xps_map_mutex must be held.
 */
static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
                           u16 offset, u16 count)
{
        struct xps_dev_maps *dev_maps = xmap_dereference(dev->xps_maps[type]);
        bool still_used = false;
        int id, qi;

        if (!dev_maps)
                return;

        for (id = 0; id < dev_maps->nr_ids; id++)
                still_used |= remove_xps_queue_cpu(dev, dev_maps, id,
                                                   offset, count);
        if (!still_used)
                reset_xps_maps(dev, dev_maps, type);

        if (type != XPS_CPUS)
                return;

        /* Walk the queue range from its last index down to @offset. */
        for (qi = offset + (count - 1); count--; qi--)
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, qi),
                                             NUMA_NO_NODE);
}

/*
 * Remove tx queues @offset .. @offset + @count - 1 of @dev from its XPS
 * maps (rx-queue based maps first, if any are in use, then CPU based ones).
 * Returns immediately when the xps_needed key is off.
 */
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
                                   u16 count)
{
        if (!static_key_false(&xps_needed))
                return;

        /* cpus_read_lock() is taken first, then xps_map_mutex. */
        cpus_read_lock();
        mutex_lock(&xps_map_mutex);

        if (static_key_false(&xps_rxqs_needed))
                clean_xps_maps(dev, XPS_RXQS, offset, count);

        clean_xps_maps(dev, XPS_CPUS, offset, count);

        mutex_unlock(&xps_map_mutex);
        cpus_read_unlock();
}

/*
 * Reset XPS state for every tx queue of @dev from @index up to
 * dev->num_tx_queues - 1.
 */
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
        netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
}

/*
 * Make sure there is a map with room for tx queue @index. @index itself is
 * not stored here.
 *
 * Returns @map unchanged if it already lists @index or still has a free
 * slot. Otherwise allocates a new map (XPS_MIN_MAP_ALLOC entries when @map
 * is NULL, twice the old allocation otherwise), copies the existing entries
 * into it and returns it; @map is neither freed nor modified. Returns NULL
 * if the allocation fails.
 *
 * For CPU maps (@is_rxqs_map false) the allocation is made on the node of
 * CPU @attr_index.
 */
static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
                                      u16 index, bool is_rxqs_map)
{
        int new_alloc_len = XPS_MIN_MAP_ALLOC;
        struct xps_map *grown;
        int used = 0;
        int i;

        if (map) {
                /* Nothing to do if the queue is already listed. */
                for (used = 0; used < map->len; used++) {
                        if (map->queues[used] == index)
                                return map;
                }

                /* Need to add tx-queue to this CPU's/rx-queue's existing map */
                if (used < map->alloc_len)
                        return map;

                new_alloc_len = map->alloc_len * 2;
        }

        /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
         *  map
         */
        if (is_rxqs_map)
                grown = kzalloc(XPS_MAP_SIZE(new_alloc_len), GFP_KERNEL);
        else
                grown = kzalloc_node(XPS_MAP_SIZE(new_alloc_len), GFP_KERNEL,
                                     cpu_to_node(attr_index));
        if (!grown)
                return NULL;

        /* Carry over the existing entries; a no-op when map is NULL. */
        for (i = 0; i < used; i++)
                grown->queues[i] = map->queues[i];
        grown->alloc_len = new_alloc_len;
        grown->len = used;

        return grown;
}

/*
 * Copy the per-TC map pointers of attribute @index from @dev_maps into the
 * same slots of @new_dev_maps. The maps are shared, not duplicated. When
 * @skip_tc is true the slot of traffic class @tc is left untouched.
 * Uses xmap_dereference(), so xps_map_mutex must be held.
 */
static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
                              struct xps_dev_maps *new_dev_maps, int index,
                              int tc, bool skip_tc)
{
        int tci = index * dev_maps->num_tc;
        int i;

        /* copy maps belonging to foreign traffic classes */
        for (i = 0; i < dev_maps->num_tc; i++, tci++) {
                if (!skip_tc || i != tc) {
                        struct xps_map *shared;

                        /* fill in the new device map from the old device map */
                        shared = xmap_dereference(dev_maps->attr_map[tci]);
                        RCU_INIT_POINTER(new_dev_maps->attr_map[tci], shared);
                }
        }
}

/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
                          u16 index, enum xps_map_type type)
{
        struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
        const unsigned long *online_mask = NULL;
        bool active = false, copy = false;
        int i, j, tci, numa_node_id = -2;
        int maps_sz, num_tc = 1, tc = 0;
        struct xps_map *map, *new_map;
        unsigned int nr_ids;

        WARN_ON_ONCE(index >= dev->num_tx_queues);

        if (dev->num_tc) {
                /* Do not allow XPS on subordinate device directly */
                num_tc = dev->num_tc;
                if (num_tc < 0)
                        return -EINVAL;

                /* If queue belongs to subordinate dev use its map */
                dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;

                tc = netdev_txq_to_tc(dev, index);
                if (tc < 0)
                        return -EINVAL;
        }

        mutex_lock(&xps_map_mutex);

        dev_maps = xmap_dereference(dev->xps_maps[type]);
        if (type == XPS_RXQS) {
                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
                nr_ids = dev->num_rx_queues;
        } else {
                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
                if (num_possible_cpus() > 1)
                        online_mask = cpumask_bits(cpu_online_mask);
                nr_ids = nr_cpu_ids;
        }

        if (maps_sz < L1_CACHE_BYTES)
                maps_sz = L1_CACHE_BYTES;

        /* The old dev_maps could be larger or smaller than the one we're
         * setting up now, as dev->num_tc or nr_ids could have been updated in
         * between. We could try to be smart, but let's be safe instead and only
         * copy foreign traffic classes if the two map sizes match.
         */
        if (dev_maps &&
            dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
                copy = true;

        /* allocate memory for queue storage */
        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
             j < nr_ids;) {
                if (!new_dev_maps) {
                        new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                        if (!new_dev_maps) {
                                mutex_unlock(&xps_map_mutex);
                                return -ENOMEM;
                        }

                        new_dev_maps->nr_ids = nr_ids;
                        new_dev_maps->num_tc = num_tc;
                }

                tci = j * num_tc + tc;
                map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;

                map = expand_xps_map(map, j, index, type == XPS_RXQS);
                if (!map)
                        goto error;

                RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
        }

        if (!new_dev_maps)
                goto out_no_new_maps;

        if (!dev_maps) {
                /* Increment static keys at most once per type */
                static_key_slow_inc_cpuslocked(&xps_needed);
                if (type == XPS_RXQS)
                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
        }

        for (j = 0; j < nr_ids; j++) {
                bool skip_tc = false;

                tci = j * num_tc + tc;
                if (netif_attr_test_mask(j, mask, nr_ids) &&
                    netif_attr_test_online(j, online_mask, nr_ids)) {
                        /* add tx-queue to CPU/rx-queue maps */
                        int pos = 0;

                        skip_tc = true;

                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;

                        if (pos == map->len)
                                map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
                        if (type == XPS_CPUS) {
                                if (numa_node_id == -2)
                                        numa_node_id = cpu_to_node(j);
                                else if (numa_node_id != cpu_to_node(j))
                                        numa_node_id = -1;
                        }
#endif
                }

                if (copy)
                        xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
                                          skip_tc);
        }

        rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);

        /* Cleanup old maps */
        if (!dev_maps)
                goto out_no_old_maps;

        for (j = 0; j < dev_maps->nr_ids; j++) {
                for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
                        map = xmap_dereference(dev_maps->attr_map[tci]);
                        if (!map)
                                continue;

                        if (copy) {
                                new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                                if (map == new_map)
                                        continue;
                        }

                        RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                        kfree_rcu(map, rcu);
                }
        }

        old_dev_maps = dev_maps;

out_no_old_maps:
        dev_maps = new_dev_maps;
        active = true;

out_no_new_maps:
        if (type == XPS_CPUS)
                /* update Tx queue numa node */
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                             (numa_node_id >= 0) ?
                                             numa_node_id : NUMA_NO_NODE);

        if (!dev_maps)
                goto out_no_maps;

        /* removes tx-queue from unused CPUs/rx-queues */
        for (j = 0; j < dev_maps->nr_ids; j++) {
                tci = j * dev_maps->num_tc;

                for (i = 0; i < dev_maps->num_tc; i++, tci++) {
                        if (i == tc &&
                            netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
                            netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
                                continue;

                        active |= remove_xps_queue(dev_maps,
                                                   copy ? old_dev_maps : NULL,
                                                   tci, index);
                }
        }

        if (old_dev_maps)
                kfree_rcu(old_dev_maps, rcu);

        /* free map if not active */
        if (!active)
                reset_xps_maps(dev, dev_maps, type);

out_no_maps:
        mutex_unlock(&xps_map_mutex);

        return 0;
error:
        /* remove any maps that we added */
        for (j = 0; j < nr_ids; j++) {
                for (i = num_tc, tci = j * num_tc; i--; tci++) {
                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        map = copy ?
                              xmap_dereference(dev_maps->attr_map[tci]) :
                              NULL;
                        if (new_map && new_map != map)
                                kfree(new_map);
                }
        }

        mutex_unlock(&xps_map_mutex);

        kfree(new_dev_maps);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__netif_set_xps_queue);

/*
 * Set the XPS CPU map for Tx queue @index of @dev from @mask.
 *
 * Thin wrapper around __netif_set_xps_queue() for the XPS_CPUS map type;
 * the call is made between cpus_read_lock() and cpus_read_unlock().
 * Returns whatever __netif_set_xps_queue() returns.
 */
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index)
{
        const unsigned long *cpu_bits = cpumask_bits(mask);
        int rc;

        cpus_read_lock();
        rc = __netif_set_xps_queue(dev, cpu_bits, index, XPS_CPUS);
        cpus_read_unlock();

        return rc;
}
EXPORT_SYMBOL(netif_set_xps_queue);

#endif
/*
 * Walk the Tx queues of @dev from the last allocated one down to the
 * first and unbind every subordinate device still attached to a queue.
 */
static void netdev_unbind_all_sb_channels(struct net_device *dev)
{
        unsigned int idx = dev->num_tx_queues;

        while (idx-- > 0) {
                struct netdev_queue *queue = &dev->_tx[idx];

                if (!queue->sb_dev)
                        continue;

                netdev_unbind_sb_channel(dev, queue->sb_dev);
        }
}

/*
 * Remove the traffic-class configuration from @dev.
 *
 * With CONFIG_XPS the XPS queue state is reset first (from queue 0 up),
 * then every subordinate channel is unbound, and finally num_tc and both
 * mapping tables (tc_to_txq, prio_tc_map) are cleared.
 */
void netdev_reset_tc(struct net_device *dev)
{
#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
#endif
        netdev_unbind_all_sb_channels(dev);

        /* Reset TC configuration of device */
        /* num_tc is zeroed before the tables it governs are wiped */
        dev->num_tc = 0;
        memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
        memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
}
EXPORT_SYMBOL(netdev_reset_tc);

/*
 * Assign the Tx queue window [@offset, @offset + @count) to traffic
 * class @tc of @dev.
 *
 * Returns -EINVAL when @tc is not below dev->num_tc, 0 otherwise.
 * With CONFIG_XPS the XPS state of the affected queues is reset before
 * the new window is recorded.
 */
int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
{
        if (dev->num_tc <= tc)
                return -EINVAL;

#ifdef CONFIG_XPS
        netif_reset_xps_queues(dev, offset, count);
#endif
        /* Record the window for this class */
        dev->tc_to_txq[tc].count = count;
        dev->tc_to_txq[tc].offset = offset;

        return 0;
}
EXPORT_SYMBOL(netdev_set_tc_queue);

/*
 * Set the number of traffic classes of @dev to @num_tc.
 *
 * Returns -EINVAL when @num_tc exceeds TC_MAX_QUEUE, 0 otherwise.
 * XPS state (with CONFIG_XPS) is reset and all subordinate channels are
 * unbound before the new class count is stored.
 */
int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
{
        if (TC_MAX_QUEUE < num_tc)
                return -EINVAL;

#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, 0);
#endif
        netdev_unbind_all_sb_channels(dev);

        dev->num_tc = num_tc;

        return 0;
}
EXPORT_SYMBOL(netdev_set_num_tc);

/*
 * Detach subordinate device @sb_dev from @dev.
 *
 * Clears the queue and priority maps held by @sb_dev (and, with
 * CONFIG_XPS, its XPS state), then drops the back pointer from every
 * Tx queue of @dev that still references @sb_dev.
 */
void netdev_unbind_sb_channel(struct net_device *dev,
                              struct net_device *sb_dev)
{
        unsigned int idx = dev->num_tx_queues;

#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(sb_dev, 0);
#endif
        memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
        memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));

        /* Scan from the last allocated queue down to queue 0 */
        while (idx-- > 0) {
                struct netdev_queue *queue = &dev->_tx[idx];

                if (queue->sb_dev != sb_dev)
                        continue;

                queue->sb_dev = NULL;
        }
}
EXPORT_SYMBOL(netdev_unbind_sb_channel);

/*
 * Bind the Tx queue window [@offset, @offset + @count) of @dev to
 * subordinate device @sb_dev for traffic class @tc.
 *
 * Returns -EINVAL when @sb_dev does not have a negative num_tc (the
 * value netdev_set_sb_channel() stores for a subordinate channel), when
 * @tc is not below dev->num_tc, or when the window extends past
 * dev->real_num_tx_queues. Returns 0 on success.
 */
int netdev_bind_sb_channel_queue(struct net_device *dev,
                                 struct net_device *sb_dev,
                                 u8 tc, u16 count, u16 offset)
{
        unsigned int remaining;

        /* Both the subordinate and the parent must be configured already */
        if (sb_dev->num_tc >= 0)
                return -EINVAL;
        if (tc >= dev->num_tc)
                return -EINVAL;

        /* Queues that do not exist cannot be handed out */
        if ((offset + count) > dev->real_num_tx_queues)
                return -EINVAL;

        /* Remember the window on the subordinate device */
        sb_dev->tc_to_txq[tc].count = count;
        sb_dev->tc_to_txq[tc].offset = offset;

        /* Point each queue in the window (highest first) back at the
         * subordinate device so the Tx queue can find its tc_to_txq or
         * XPS map.
         */
        for (remaining = count; remaining > 0; remaining--)
                netdev_get_tx_queue(dev, offset + remaining - 1)->sb_dev = sb_dev;

        return 0;
}
EXPORT_SYMBOL(netdev_bind_sb_channel_queue);

/*
 * Mark @dev as subordinate channel @channel by storing -@channel in
 * dev->num_tc.
 *
 * Returns -ENODEV for a multiqueue device, -EINVAL when @channel is
 * larger than S16_MAX, 0 otherwise.
 */
int netdev_set_sb_channel(struct net_device *dev, u16 channel)
{
        int err = 0;

        if (netif_is_multiqueue(dev)) {
                /* A multiqueue device may not represent a subordinate
                 * channel.
                 */
                err = -ENODEV;
        } else if (channel > S16_MAX) {
                /* Channels 1 - 32767 are usable for subordinate channels.
                 * Channel 0 means "native" mode and represents only the
                 * main root device; writing 0 returns the device to normal
                 * mode after it was used as a subordinate channel.
                 */
                err = -EINVAL;
        } else {
                dev->num_tc = -channel;
        }

        return err;
}
EXPORT_SYMBOL(netdev_set_sb_channel);

/*
 * Set the number of Tx queues of @dev that are actually in use.
 *
 * Returns -EINVAL when @txq is 0 or larger than dev->num_tx_queues, or
 * the error from netdev_queue_update_kobjects(); 0 on success.
 *
 * When the count shrinks on a registered (or unregistering) device,
 * stale skbs sitting on qdiscs of queues at or above @txq are flushed so
 * that no skb stays mapped to a queue beyond real_num_tx_queues.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
        bool disabling = txq < dev->real_num_tx_queues;
        int rc;

        if (txq < 1 || txq > dev->num_tx_queues)
                return -EINVAL;

        /* In any other registration state only the count is recorded */
        if (dev->reg_state != NETREG_REGISTERED &&
            dev->reg_state != NETREG_UNREGISTERING) {
                dev->real_num_tx_queues = txq;
                return 0;
        }

        ASSERT_RTNL();
        netdev_ops_assert_locked(dev);

        rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, txq);
        if (rc)
                return rc;

        if (dev->num_tc)
                netif_setup_tc(dev, txq);

        net_shaper_set_real_num_tx_queues(dev, txq);

        dev_qdisc_change_real_num_tx(dev, txq);

        dev->real_num_tx_queues = txq;

        if (!disabling)
                return 0;

        /* Shrinking: wait, then clean up the queues that went away */
        synchronize_net();
        qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
        netif_reset_xps_queues_gt(dev, txq);
#endif

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);

/**
 *        netif_set_real_num_rx_queues - set actual number of RX queues used
 *        @dev: Network device
 *        @rxq: Actual number of RX queues
 *
 *        This must be called either with the rtnl_lock held or before
 *        registration of the net device.  Returns 0 on success, or a
 *        negative error code: -EINVAL when @rxq is 0 or exceeds
 *        dev->num_rx_queues, or the error from updating the queue
 *        kobjects on a registered device.  Before registration only the
 *        range check can fail.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
        int err = 0;

        if (rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        if (dev->reg_state == NETREG_REGISTERED) {
                ASSERT_RTNL();
                netdev_ops_assert_locked(dev);

                err = net_rx_queue_update_kobjects(dev,
                                                   dev->real_num_rx_queues,
                                                   rxq);
        }

        /* Commit the new count only if nothing failed above */
        if (!err)
                dev->real_num_rx_queues = rxq;

        return err;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);

/**
 *        netif_set_real_num_queues - set actual number of RX and TX queues used
 *        @dev: Network device
 *        @txq: Actual number of TX queues
 *        @rxq: Actual number of RX queues
 *
 *        Set the real number of both TX and RX queues.
 *        Does nothing if the number of queues is already correct.
 *
 *        Returns 0 on success, -EINVAL when either count is 0 or above
 *        the allocated number of queues, or the error from growing one
 *        of the two counts.
 */
int netif_set_real_num_queues(struct net_device *dev,
                              unsigned int txq, unsigned int rxq)
{
        unsigned int old_rxq = dev->real_num_rx_queues;
        int err;

        if (txq < 1 || txq > dev->num_tx_queues)
                return -EINVAL;
        if (rxq < 1 || rxq > dev->num_rx_queues)
                return -EINVAL;

        /* Handle increases first, so that undoing a failure only ever
         * needs a decrease - and decreases can't fail.
         */
        if (rxq > dev->real_num_rx_queues) {
                err = netif_set_real_num_rx_queues(dev, rxq);
                if (err)
                        return err;
        }
        if (txq > dev->real_num_tx_queues) {
                err = netif_set_real_num_tx_queues(dev, txq);
                if (err) {
                        /* Put the RX count back to where it started */
                        WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq));
                        return err;
                }
        }

        /* Now the decreases; a failure here is unexpected */
        if (rxq < dev->real_num_rx_queues)
                WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
        if (txq < dev->real_num_tx_queues)
                WARN_ON(netif_set_real_num_tx_queues(dev, txq));

        return 0;
}
EXPORT_SYMBOL(netif_set_real_num_queues);

/**
 * netif_set_tso_max_size() - set the max size of TSO frames supported
 * @dev:        netdev to update
 * @size:        max skb->len of a TSO frame
 *
 * Set the limit on the size of TSO super-frames the device can handle;
 * the stored limit is capped at %GSO_MAX_SIZE. The device's current GSO
 * size limits are lowered to @size when they are above it.
 * Unless explicitly set the stack will assume the value of
 * %GSO_LEGACY_MAX_SIZE.
 */
void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
{
        dev->tso_max_size = min(GSO_MAX_SIZE, size);

        /* Do not leave a GSO limit above the new TSO limit */
        if (READ_ONCE(dev->gso_max_size) > size)
                netif_set_gso_max_size(dev, size);

        if (READ_ONCE(dev->gso_ipv4_max_size) > size)
                netif_set_gso_ipv4_max_size(dev, size);
}
EXPORT_SYMBOL(netif_set_tso_max_size);

/**
 * netif_set_tso_max_segs() - set the max number of segs supported for TSO
 * @dev:        netdev to update
 * @segs:        max number of TCP segments
 *
 * Set the limit on the number of TCP segments the device can generate from
 * a single TSO super-frame. The device's current GSO segment limit is
 * lowered to @segs when it is above it.
 * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
 */
void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
{
        dev->tso_max_segs = segs;

        /* Do not leave the GSO limit above the new TSO limit */
        if (READ_ONCE(dev->gso_max_segs) > segs)
                netif_set_gso_max_segs(dev, segs);
}
EXPORT_SYMBOL(netif_set_tso_max_segs);

/**
 * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
 * @to:                netdev to update
 * @from:        netdev from which to copy the limits
 *
 * Applies @from's tso_max_size and tso_max_segs to @to through
 * netif_set_tso_max_size() and netif_set_tso_max_segs(), so the side
 * effects of those setters on @to's GSO limits apply here as well.
 */
void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
{
        netif_set_tso_max_size(to, from->tso_max_size);
        netif_set_tso_max_segs(to, from->tso_max_segs);
}
EXPORT_SYMBOL(netif_inherit_tso_max);

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * Counts one CPU per group of sibling threads among the online CPUs.
 * Default value is that number of physical cores if there are only 1 or 2,
 * or half of it (rounded up) if there are more. Returns 1 in a kdump
 * kernel or when the temporary cpumask cannot be allocated.
 */
int netif_get_num_default_rss_queues(void)
{
        cpumask_var_t pending;
        int cpu, cores = 0;

        if (unlikely(is_kdump_kernel()))
                return 1;
        if (unlikely(!zalloc_cpumask_var(&pending, GFP_KERNEL)))
                return 1;

        cpumask_copy(pending, cpu_online_mask);
        for_each_cpu(cpu, pending) {
                cores++;
                /* Drop this CPU's siblings so each core is counted once */
                cpumask_andnot(pending, pending, topology_sibling_cpumask(cpu));
        }
        free_cpumask_var(pending);

        if (cores <= 2)
                return cores;

        return DIV_ROUND_UP(cores, 2);
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);

/*
 * Append @q to the tail of this CPU's softnet_data output queue and
 * raise NET_TX_SOFTIRQ. The list update runs with local interrupts
 * disabled.
 */
static void __netif_reschedule(struct Qdisc *q)
{
        struct softnet_data *sd;
        unsigned long flags;

        local_irq_save(flags);
        sd = this_cpu_ptr(&softnet_data);
        /* @q becomes the last element of the singly linked list */
        q->next_sched = NULL;
        /* Store @q where the tail pointer currently points ... */
        *sd->output_queue_tailp = q;
        /* ... and move the tail pointer to @q's own link field */
        sd->output_queue_tailp = &q->next_sched;
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}

/*
 * Queue @q for a later run unless it is already marked as scheduled.
 * The __QDISC_STATE_SCHED bit is set atomically; only the caller that
 * flips it from clear to set goes on to __netif_reschedule().
 */
void __netif_schedule(struct Qdisc *q)
{
        if (test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
                return;

        __netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

/*
 * Layout placed over skb->cb (see get_kfree_skb_cb()) to carry the drop
 * reason of an skb whose release has been deferred.
 */
struct dev_kfree_skb_cb {
        enum skb_drop_reason reason;        /* reason recorded at deferral time */
};

/* View the control buffer (skb->cb) of @skb as a struct dev_kfree_skb_cb. */
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
        const char *scratch = skb->cb;

        return (struct dev_kfree_skb_cb *)scratch;
}

/*
 * Schedule the qdisc attached to @txq, unless transmission on the queue
 * is currently stopped. The qdisc pointer is read and used inside an
 * RCU read-side critical section.
 */
void netif_schedule_queue(struct netdev_queue *txq)
{
        rcu_read_lock();
        if (!netif_xmit_stopped(txq))
                __netif_schedule(rcu_dereference(txq->qdisc));
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/*
 * Clear the driver XOFF state of @dev_queue and, if it was actually set,
 * schedule the queue's qdisc. Calling this on a queue whose
 * __QUEUE_STATE_DRV_XOFF bit is already clear does nothing.
 */
void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
        if (!test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state))
                return;

        rcu_read_lock();
        __netif_schedule(rcu_dereference(dev_queue->qdisc));
        rcu_read_unlock();
}
EXPORT_SYMBOL(netif_tx_wake_queue);

/*
 * Drop one reference on @skb and, if it was the last one, defer the
 * release: @reason is stored in skb->cb, the skb is pushed onto this
 * CPU's softnet_data.completion_queue and NET_TX_SOFTIRQ is raised.
 * dev_kfree_skb_any_reason() uses this when running in hard IRQ context
 * or with interrupts disabled.
 *
 * A NULL @skb is accepted and ignored.
 * NOTE(review): the consumer of completion_queue is outside this chunk;
 * presumably the NET_TX softirq handler does the actual free - confirm.
 */
void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        unsigned long flags;

        if (unlikely(!skb))
                return;

        if (likely(refcount_read(&skb->users) == 1)) {
                /* Only one reference left: zero the count directly instead
                 * of doing an atomic decrement.
                 * NOTE(review): what this read barrier pairs with is not
                 * visible here - confirm before touching.
                 */
                smp_rmb();
                refcount_set(&skb->users, 0);
        } else if (likely(!refcount_dec_and_test(&skb->users))) {
                /* Other references remain, nothing to queue */
                return;
        }
        /* Last reference gone: remember why, then queue for later release */
        get_kfree_skb_cb(skb)->reason = reason;
        local_irq_save(flags);
        /* Push onto the head of the per-CPU completion list */
        skb->next = __this_cpu_read(softnet_data.completion_queue);
        __this_cpu_write(softnet_data.completion_queue, skb);
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_restore(flags);
}
EXPORT_SYMBOL(dev_kfree_skb_irq_reason);

/*
 * Release @skb with drop reason @reason from any context: directly via
 * kfree_skb_reason() when neither in hard IRQ context nor running with
 * interrupts disabled, otherwise through the deferred path of
 * dev_kfree_skb_irq_reason().
 */
void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
        if (!in_hardirq() && !irqs_disabled()) {
                kfree_skb_reason(skb, reason);
                return;
        }

        dev_kfree_skb_irq_reason(skb, reason);
}
EXPORT_SYMBOL(dev_kfree_skb_any_reason);


/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 * If the device was marked present and is running, all of its Tx queues
 * are stopped.
 */
void netif_device_detach(struct net_device *dev)
{
        /* Nothing to do if the device was not marked present */
        if (!test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state))
                return;

        if (netif_running(dev))
                netif_tx_stop_all_queues(dev);
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached to the system and restart it if needed: when
 * the device was not already marked present and is running, all of its
 * Tx queues are woken and the watchdog is brought up.
 */
void netif_device_attach(struct net_device *dev)
{
        /* Nothing to do if the device was already marked present */
        if (test_and_set_bit(__LINK_STATE_PRESENT, &dev->state))
                return;

        if (netif_running(dev)) {
                netif_tx_wake_all_queues(dev);
                netdev_watchdog_up(dev);
        }
}
EXPORT_SYMBOL(netif_device_attach);

/*
 * Returns a Tx queue index for the given packet descriptor.
 *
 * The distribution range is all real Tx queues of @dev or, when @dev has
 * traffic classes configured, the offset/count window that @sb_dev holds
 * for the class mapped from skb->priority. A window with a zero count is
 * reported (rate limited) and replaced by the full range. A recorded Rx
 * queue is folded into the range; otherwise the skb hash is scaled onto
 * it.
 */
static u16 skb_tx_hash(const struct net_device *dev,
                       const struct net_device *sb_dev,
                       struct sk_buff *skb)
{
        u16 qcount = dev->real_num_tx_queues;
        u16 qoffset = 0;
        u32 rxq;

        if (dev->num_tc) {
                u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

                qoffset = sb_dev->tc_to_txq[tc].offset;
                qcount = sb_dev->tc_to_txq[tc].count;
                if (unlikely(!qcount)) {
                        net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
                                             sb_dev->name, qoffset, tc);
                        qoffset = 0;
                        qcount = dev->real_num_tx_queues;
                }
        }

        /* No Rx queue recorded: spread by flow hash over the range */
        if (!skb_rx_queue_recorded(skb))
                return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;

        DEBUG_NET_WARN_ON_ONCE(qcount == 0);
        rxq = skb_get_rx_queue(skb);
        if (rxq >= qoffset)
                rxq -= qoffset;
        /* Reduce into [0, qcount) by repeated subtraction */
        while (unlikely(rxq >= qcount))
                rxq -= qcount;

        return rxq + qoffset;
}

/*
 * Emit a rate-limited warning about an skb with inconsistent offload
 * state: the skb is dumped at KERN_WARNING and a WARN is raised showing
 * the features of skb->dev and the route capabilities of skb->sk (an
 * all-zero feature set stands in for whichever is missing). The name
 * printed is the parent device's driver string when the netdev has a
 * parent, the netdev name otherwise, or "" without a device.
 */
void skb_warn_bad_offload(const struct sk_buff *skb)
{
        static const netdev_features_t null_features;
        const netdev_features_t *dev_caps = &null_features;
        const netdev_features_t *sk_caps = &null_features;
        struct net_device *dev = skb->dev;
        const char *name = "";

        if (!net_ratelimit())
                return;

        if (dev) {
                name = dev->dev.parent ? dev_driver_string(dev->dev.parent)
                                       : netdev_name(dev);
                dev_caps = &dev->features;
        }
        if (skb->sk)
                sk_caps = &skb->sk->sk_route_caps;

        skb_dump(KERN_WARNING, skb, false);
        WARN(1, "%s: caps=(%pNF, %pNF)\n", name, dev_caps, sk_caps);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 *
 * For a CHECKSUM_COMPLETE skb only ip_summed is reset to CHECKSUM_NONE.
 * Otherwise the checksum is computed in software from the checksum start
 * offset to the end of the packet and stored at csum_offset past that
 * start, after which ip_summed becomes CHECKSUM_NONE.
 *
 * Returns 0 on success. Returns -EINVAL for a GSO skb or when the
 * checksum start or the checksum field lies outside the linear head,
 * -EFAULT when the frags are not readable, or the error reported by
 * __skb_linearize() / skb_ensure_writable(). On every error path
 * ip_summed is left untouched.
 */
int skb_checksum_help(struct sk_buff *skb)
{
        __wsum csum;
        int ret = 0, offset;

        if (skb->ip_summed == CHECKSUM_COMPLETE)
                goto out_set_summed;

        /* A GSO skb must not get here; complain and refuse */
        if (unlikely(skb_is_gso(skb))) {
                skb_warn_bad_offload(skb);
                return -EINVAL;
        }

        if (!skb_frags_readable(skb)) {
                return -EFAULT;
        }

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (skb_has_shared_frag(skb)) {
                ret = __skb_linearize(skb);
                if (ret)
                        goto out;
        }

        offset = skb_checksum_start_offset(skb);
        /* Default result for the two sanity checks below */
        ret = -EINVAL;
        if (unlikely(offset >= skb_headlen(skb))) {
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
                WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
                          offset, skb_headlen(skb));
                goto out;
        }
        csum = skb_checksum(skb, offset, skb->len - offset, 0);

        /* From here on, offset addresses the checksum field itself */
        offset += skb->csum_offset;
        if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
                DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
                WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
                          offset + sizeof(__sum16), skb_headlen(skb));
                goto out;
        }
        ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
        if (ret)
                goto out;

        /* A folded checksum of zero is stored as CSUM_MANGLED_0 instead */
        *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
        skb->ip_summed = CHECKSUM_NONE;
out:
        return ret;
}
EXPORT_SYMBOL(skb_checksum_help);

/*
 * Complete a pending CRC32c checksum in software and store it in the
 * checksum field of the SCTP header located at the checksum start offset.
 *
 * Does nothing and returns 0 unless the skb is CHECKSUM_PARTIAL and not
 * GSO. Returns -EINVAL (with a one-time warning) when the checksum field
 * starts outside the linear head, or the error reported by
 * __skb_linearize() / skb_ensure_writable(); 0 on success.
 */
int skb_crc32c_csum_help(struct sk_buff *skb)
{
        __le32 crc32c_csum;
        int err, offset, start;

        if (skb->ip_summed != CHECKSUM_PARTIAL || unlikely(skb_is_gso(skb)))
                return 0;

        /* Before computing a checksum, we should make sure no frag could
         * be modified by an external entity : checksum could be wrong.
         */
        if (unlikely(skb_has_shared_frag(skb))) {
                err = __skb_linearize(skb);
                if (err)
                        return err;
        }

        start = skb_checksum_start_offset(skb);
        offset = start + offsetof(struct sctphdr, checksum);
        if (WARN_ON_ONCE(offset >= skb_headlen(skb)))
                return -EINVAL;

        err = skb_ensure_writable(skb, offset + sizeof(__le32));
        if (err)
                return err;

        crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
                                                  skb->len - start, ~(__u32)0,
                                                  crc32c_csum_stub));
        *(__le32 *)(skb->data + offset) = crc32c_csum;
        skb_reset_csum_not_inet(skb);

        return 0;
}
EXPORT_SYMBOL(skb_crc32c_csum_help);

__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
        __be16 type = skb->protocol;

        /* Tunnel gso handlers can set protocol to ethernet. */
        if (type == htons(ETH_P_TEB)) {
                struct ethhdr *eth;

                if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
                        return 0;

                eth = (struct ethhdr *)skb->data;
                type = eth->h_proto;
        }

        return vlan_get_protocol_and_depth(skb, type, depth);
}


/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
/*
 * Report a hardware receive checksum failure on @dev: log an error line,
 * dump @skb at KERN_ERR (third skb_dump() argument set to true) and print
 * the current stack trace.
 */
static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
        netdev_err(dev, "hw csum failure\n");
        skb_dump(KERN_ERR, skb, true);
        dump_stack();
}

/*
 * Exported entry point for reporting a hardware receive checksum fault.
 * The report itself is produced by do_netdev_rx_csum_fault(), invoked
 * through DO_ONCE_LITE().
 */
void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
        DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif

/*
 * Return 1 when @skb carries a page fragment in high memory while @dev
 * does not advertise NETIF_F_HIGHDMA, 0 otherwise. Always 0 when the
 * kernel is built without CONFIG_HIGHMEM.
 *
 * XXX: check that highmem exists at all on the given machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
        int i;

        /* A device that can DMA from high memory accepts any fragment */
        if (dev->features & NETIF_F_HIGHDMA)
                return 0;

        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                struct page *page = skb_frag_page(&skb_shinfo(skb)->frags[i]);

                if (page && PageHighMem(page))
                        return 1;
        }
#endif
        return 0;
}

/* For an MPLS protocol @type, restrict @features to the MPLS features of
 * skb->dev instead of the standard features for the netdev. Without
 * CONFIG_NET_MPLS_GSO the features are returned unchanged.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
{
        return eth_p_mpls(type) ? features & skb->dev->mpls_features
                                : features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
                                           netdev_features_t features,
                                           __be16 type)
{
        /* MPLS GSO support is compiled out: nothing to restrict */
        return features;
}
#endif

/*
 * Final pass over the feature set for @skb: apply the MPLS restriction
 * for the skb's network protocol, drop checksum and GSO features when
 * the protocol cannot be checksummed with what is left, and drop
 * scatter-gather when the skb holds fragments the device cannot DMA from.
 */
static netdev_features_t harmonize_features(struct sk_buff *skb,
        netdev_features_t features)
{
        __be16 proto = skb_network_protocol(skb, NULL);

        features = net_mpls_features(skb, features, proto);

        if (skb->ip_summed != CHECKSUM_NONE) {
                if (!can_checksum_protocol(features, proto))
                        features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
        }

        if (illegal_highdma(skb->dev, skb))
                features &= ~NETIF_F_SG;

        return features;
}

/*
 * Feature-check helper that imposes no restriction: @features is
 * returned unchanged; @skb and @dev are not looked at.
 */
netdev_features_t passthru_features_check(struct sk_buff *skb,
                                          struct net_device *dev,
                                          netdev_features_t features)
{
        return features;
}
EXPORT_SYMBOL(passthru_features_check);

/*
 * Default feature check, used by netif_skb_features() when the device
 * provides no ndo_features_check callback: defers to
 * vlan_features_check(). @dev is not used.
 */
static netdev_features_t dflt_features_check(struct sk_buff *skb,
                                             struct net_device *dev,
                                             netdev_features_t features)
{
        return vlan_features_check(skb, features);
}

/*
 * Restrict @features for a GSO @skb about to be sent on @dev.
 *
 * All GSO features are removed when the skb has more segments than
 * dev->gso_max_segs, when skb->len reaches the device's GSO size limit,
 * or when no gso_type is set (which is also reported as a bad offload).
 * Otherwise the partial-GSO and TSO ID-mangling features are trimmed as
 * described in the body.
 */
static netdev_features_t gso_features_check(const struct sk_buff *skb,
                                            struct net_device *dev,
                                            netdev_features_t features)
{
        const netdev_features_t no_gso = features & ~NETIF_F_GSO_MASK;
        u16 gso_segs = skb_shinfo(skb)->gso_segs;

        if (gso_segs > READ_ONCE(dev->gso_max_segs) ||
            unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
                return no_gso;

        if (!skb_shinfo(skb)->gso_type) {
                skb_warn_bad_offload(skb);
                return no_gso;
        }

        /* Support for GSO partial features requires software
         * intervention before we can actually process the packets
         * so we need to strip support for any partial features now
         * and we can pull them back in after we have partially
         * segmented the frame.
         */
        if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
                features &= ~dev->gso_partial_features;

        /* Make sure to clear the IPv4 ID mangling feature if the
         * IPv4 header has the potential to be fragmented.
         */
        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
                struct iphdr *iph;

                if (skb->encapsulation)
                        iph = inner_ip_hdr(skb);
                else
                        iph = ip_hdr(skb);

                if (!(iph->frag_off & htons(IP_DF)))
                        features &= ~NETIF_F_TSO_MANGLEID;
        }

        return features;
}

/*
 * Compute the set of device features usable for transmitting @skb on
 * skb->dev. Starts from dev->features and narrows it for GSO,
 * encapsulation and VLAN-tagged packets, then applies the driver's
 * ndo_features_check callback (or the default check) and finally
 * harmonize_features().
 */
netdev_features_t netif_skb_features(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        netdev_features_t features = dev->features;
        netdev_features_t allowed;

        if (skb_is_gso(skb))
                features = gso_features_check(skb, dev, features);

        /* If encapsulation offload request, verify we are testing
         * hardware encapsulation features instead of standard
         * features for the netdev
         */
        if (skb->encapsulation)
                features &= dev->hw_enc_features;

        if (skb_vlan_tagged(skb)) {
                netdev_features_t vlan_caps = dev->vlan_features |
                                              NETIF_F_HW_VLAN_CTAG_TX |
                                              NETIF_F_HW_VLAN_STAG_TX;

                features = netdev_intersect_features(features, vlan_caps);
        }

        /* Let the driver veto features for this skb, if it wants to */
        if (dev->netdev_ops->ndo_features_check)
                allowed = dev->netdev_ops->ndo_features_check(skb, dev,
                                                              features);
        else
                allowed = dflt_features_check(skb, dev, features);
        features &= allowed;

        return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

/*
 * Hand a single @skb to the driver of @dev on queue @txq.
 *
 * If dev_nit_active_rcu() reports active taps, the skb is first passed
 * to dev_queue_xmit_nit(). @more is forwarded to netdev_start_xmit().
 * Returns the value of netdev_start_xmit().
 */
static int xmit_one(struct sk_buff *skb, struct net_device *dev,
                    struct netdev_queue *txq, bool more)
{
        unsigned int len;
        int rc;

        if (dev_nit_active_rcu(dev))
                dev_queue_xmit_nit(skb, dev);

        /* Length is sampled before the transmit call for the trace below.
         * NOTE(review): presumably because skb->len must not be read once
         * the driver owns the skb - confirm.
         */
        len = skb->len;
        trace_net_dev_start_xmit(skb, dev);
        rc = netdev_start_xmit(skb, dev, txq, more);
        trace_net_dev_xmit(skb, rc, dev, len);

        return rc;
}

/*
 * Transmit the skb list starting at @first on queue @txq of @dev, one
 * skb at a time.
 *
 * Stops early when a transmit does not complete (the unsent skb is
 * re-linked to the rest of the list) or when the queue is found stopped
 * while skbs remain (status NETDEV_TX_BUSY). The last status is stored
 * in *@ret. Returns the first skb that was not sent, or NULL when the
 * whole list went out.
 */
struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
                                    struct netdev_queue *txq, int *ret)
{
        struct sk_buff *skb = first;
        int rc = NETDEV_TX_OK;

        while (skb) {
                struct sk_buff *next = skb->next;

                /* Detach the skb; tell the driver whether more follow */
                skb_mark_not_on_list(skb);
                rc = xmit_one(skb, dev, txq, next != NULL);
                if (unlikely(!dev_xmit_complete(rc))) {
                        /* Not sent: put it back in front of the remainder */
                        skb->next = next;
                        break;
                }

                skb = next;
                if (skb && netif_tx_queue_stopped(txq)) {
                        rc = NETDEV_TX_BUSY;
                        break;
                }
        }

        *ret = rc;
        return skb;
}

/*
 * If @skb carries a VLAN tag that @features cannot offload for the skb's
 * VLAN protocol, move the tag into the packet via
 * __vlan_hwaccel_push_inside() and return its result; otherwise return
 * @skb untouched.
 */
static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
                                          netdev_features_t features)
{
        if (!skb_vlan_tag_present(skb))
                return skb;

        if (vlan_hw_offload_capable(features, skb->vlan_proto))
                return skb;

        return __vlan_hwaccel_push_inside(skb);
}

/*
 * Decide whether the pending checksum of @skb can be left to hardware
 * given @features, and compute it in software when it cannot.
 *
 * Returns 0 when the checksum is left for the device, otherwise the
 * result of skb_crc32c_csum_help() (SCTP) or skb_checksum_help().
 */
int skb_csum_hwoffload_help(struct sk_buff *skb,
                            const netdev_features_t features)
{
        if (unlikely(skb_csum_is_sctp(skb))) {
                if (features & NETIF_F_SCTP_CRC)
                        return 0;

                return skb_crc32c_csum_help(skb);
        }

        if (features & NETIF_F_HW_CSUM)
                return 0;

        if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
                /* IPv6 whose network header is not exactly a bare IPv6
                 * header (and is not the hop-by-hop jumbo case) always
                 * takes the software path.
                 */
                bool v6_needs_sw =
                        vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
                        skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
                        !ipv6_has_hopopt_jumbo(skb);

                /* Only TCP/UDP checksum field offsets are left to hardware */
                if (!v6_needs_sw &&
                    (skb->csum_offset == offsetof(struct tcphdr, check) ||
                     skb->csum_offset == offsetof(struct udphdr, check)))
                        return 0;
        }

        return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);

/*
 * Prepare a single @skb for transmission on @dev: push the VLAN tag
 * inside when it cannot be offloaded, run the socket-level validation,
 * segment in software when the device cannot do the required GSO,
 * otherwise linearize and/or complete the checksum as needed, and
 * finally run the xfrm validation (which receives @again).
 *
 * Returns the skb (or segment list) to transmit, or NULL. On the drop
 * paths in this function the device's tx_dropped core stat is bumped;
 * a NULL coming back from validate_xmit_xfrm() is returned as is.
 */
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
        netdev_features_t features;

        /* Unreadable frags: drop */
        if (!skb_frags_readable(skb))
                goto out_kfree_skb;

        features = netif_skb_features(skb);
        skb = validate_xmit_vlan(skb, features);
        /* NULL here means the helper left nothing for us to free */
        if (unlikely(!skb))
                goto out_null;

        skb = sk_validate_xmit_skb(skb, dev);
        if (unlikely(!skb))
                goto out_null;

        if (netif_needs_gso(skb, features)) {
                struct sk_buff *segs;

                segs = skb_gso_segment(skb, features);
                if (IS_ERR(segs)) {
                        goto out_kfree_skb;
                } else if (segs) {
                        /* Segmentation produced a list: it replaces the
                         * original skb, which is released. A NULL result
                         * leaves the original skb in place.
                         */
                        consume_skb(skb);
                        skb = segs;
                }
        } else {
                if (skb_needs_linearize(skb, features) &&
                    __skb_linearize(skb))
                        goto out_kfree_skb;

                /* If packet is not checksummed and device does not
                 * support checksumming for this protocol, complete
                 * checksumming here.
                 */
                if (skb->ip_summed == CHECKSUM_PARTIAL) {
                        /* Point the (inner) transport header at the
                         * checksum start before asking for help.
                         */
                        if (skb->encapsulation)
                                skb_set_inner_transport_header(skb,
                                                               skb_checksum_start_offset(skb));
                        else
                                skb_set_transport_header(skb,
                                                         skb_checksum_start_offset(skb));
                        if (skb_csum_hwoffload_help(skb, features))
                                goto out_kfree_skb;
                }
        }

        skb = validate_xmit_xfrm(skb, features, again);

        return skb;

out_kfree_skb:
        /* Drop paths that still own the skb free it here ... */
        kfree_skb(skb);
out_null:
        /* ... and every drop path is counted */
        dev_core_stats_tx_dropped_inc(dev);
        return NULL;
}

/* Validate every buffer of the ->next linked list starting at @skb and
 * return a new list made of the buffers (or segment lists) that passed
 * validate_xmit_skb(). Buffers for which validation returns NULL are
 * simply left out. Returns NULL when nothing survived.
 */
struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
{
        struct sk_buff *first = NULL, *last, *pending;

        while (skb) {
                struct sk_buff *ok;

                /* Detach the current buffer, remembering its successor. */
                pending = skb->next;
                skb_mark_not_on_list(skb);

                /* A buffer that does not get segmented is its own tail. */
                skb->prev = skb;

                ok = validate_xmit_skb(skb, dev, again);
                skb = pending;
                if (!ok)
                        continue;

                if (first)
                        last->next = ok;
                else
                        first = ok;

                /* After segmentation ->prev designates the final segment;
                 * otherwise it still designates the buffer itself.
                 */
                last = ok->prev;
        }

        return first;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

static void qdisc_pkt_len_init(struct sk_buff *skb)
{
        const struct skb_shared_info *shinfo = skb_shinfo(skb);

        qdisc_skb_cb(skb)->pkt_len = skb->len;

        /* To get more precise estimation of bytes sent on wire,
         * we add to pkt_len the headers size of all segments
         */
        if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
                u16 gso_segs = shinfo->gso_segs;
                unsigned int hdr_len;

                /* mac layer + network layer */
                hdr_len = skb_transport_offset(skb);

                /* + transport layer */
                if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
                        const struct tcphdr *th;
                        struct tcphdr _tcphdr;

                        th = skb_header_pointer(skb, hdr_len,
                                                sizeof(_tcphdr), &_tcphdr);
                        if (likely(th))
                                hdr_len += __tcp_hdrlen(th);
                } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
                        struct udphdr _udphdr;

                        if (skb_header_pointer(skb, hdr_len,
                                               sizeof(_udphdr), &_udphdr))
                                hdr_len += sizeof(struct udphdr);
                }

                if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
                        int payload = skb->len - hdr_len;

                        /* Malicious packet. */
                        if (payload <= 0)
                                return;
                        gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
                }
                qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
        }
}

/* Call the enqueue operation of @q for @skb and return its result masked
 * with NET_XMIT_MASK. The qdisc_enqueue tracepoint is emitted only when
 * the masked result is NET_XMIT_SUCCESS.
 */
static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
                             struct sk_buff **to_free,
                             struct netdev_queue *txq)
{
        const int verdict = q->enqueue(skb, q, to_free) & NET_XMIT_MASK;

        if (verdict != NET_XMIT_SUCCESS)
                return verdict;

        trace_qdisc_enqueue(q, txq, skb);
        return verdict;
}

/* Hand @skb to qdisc @q of @txq on @dev, either by enqueueing it or, when
 * the qdisc allows bypass and is empty and idle, by transmitting it
 * directly through sch_direct_xmit().
 *
 * Qdiscs flagged TCQ_F_NOLOCK are handled without taking the root lock.
 * All other qdiscs are handled under qdisc_lock(q), optionally preceded
 * by q->busylock when the qdisc is already running (or always on
 * PREEMPT_RT).
 *
 * Packets that the qdisc placed on the local to_free list are freed
 * before returning. Returns a NET_XMIT_* code.
 */
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
                                 struct net_device *dev,
                                 struct netdev_queue *txq)
{
        spinlock_t *root_lock = qdisc_lock(q);
        struct sk_buff *to_free = NULL;
        bool contended;
        int rc;

        qdisc_calculate_pkt_len(skb, q);

        tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);

        if (q->flags & TCQ_F_NOLOCK) {
                if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
                    qdisc_run_begin(q)) {
                        /* Retest nolock_qdisc_is_empty() within the protection
                         * of q->seqlock to protect from racing with requeuing.
                         */
                        if (unlikely(!nolock_qdisc_is_empty(q))) {
                                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                                __qdisc_run(q);
                                qdisc_run_end(q);

                                goto no_lock_out;
                        }

                        /* Empty and idle: bypass the queue entirely. */
                        qdisc_bstats_cpu_update(q, skb);
                        if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
                            !nolock_qdisc_is_empty(q))
                                __qdisc_run(q);

                        qdisc_run_end(q);
                        return NET_XMIT_SUCCESS;
                }

                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                qdisc_run(q);

no_lock_out:
                if (unlikely(to_free))
                        kfree_skb_list_reason(to_free,
                                              tcf_get_drop_reason(to_free));
                return rc;
        }

        /* q->owner is set to the current CPU around dev_qdisc_enqueue()
         * below; seeing our own id here means this CPU re-entered the
         * same qdisc, so the packet is dropped.
         */
        if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) {
                kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP);
                return NET_XMIT_DROP;
        }
        /*
         * Heuristic to force contended enqueues to serialize on a
         * separate lock before trying to get qdisc main lock.
         * This permits qdisc->running owner to get the lock more
         * often and dequeue packets faster.
         * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit
         * and then other tasks will only enqueue packets. The packets will be
         * sent after the qdisc owner is scheduled again. To prevent this
         * scenario the task always serialize on the lock.
         */
        contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT);
        if (unlikely(contended))
                spin_lock(&q->busylock);

        spin_lock(root_lock);
        if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
                __qdisc_drop(skb, &to_free);
                rc = NET_XMIT_DROP;
        } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
                   qdisc_run_begin(q)) {
                /*
                 * This is a work-conserving queue; there are no old skbs
                 * waiting to be sent out; and the qdisc is not running -
                 * xmit the skb directly.
                 */

                qdisc_bstats_update(q, skb);

                if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
                        /* Release busylock before running the qdisc;
                         * clearing the flag avoids a second unlock at
                         * the end of the function.
                         */
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                }

                qdisc_run_end(q);
                rc = NET_XMIT_SUCCESS;
        } else {
                /* Mark this CPU as owner for the duration of the enqueue
                 * so that re-entry is caught by the check above.
                 */
                WRITE_ONCE(q->owner, smp_processor_id());
                rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
                WRITE_ONCE(q->owner, -1);
                if (qdisc_run_begin(q)) {
                        if (unlikely(contended)) {
                                spin_unlock(&q->busylock);
                                contended = false;
                        }
                        __qdisc_run(q);
                        qdisc_run_end(q);
                }
        }
        spin_unlock(root_lock);
        /* Free dropped packets only after the root lock is released. */
        if (unlikely(to_free))
                kfree_skb_list_reason(to_free,
                                      tcf_get_drop_reason(to_free));
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
        const struct netprio_map *map;
        const struct sock *sk;
        unsigned int prioidx;

        if (skb->priority)
                return;
        map = rcu_dereference_bh(skb->dev->priomap);
        if (!map)
                return;
        sk = skb_to_full_sk(skb);
        if (!sk)
                return;

        prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);

        if (prioidx < map->priomap_len)
                skb->priority = map->priomap[prioidx];
}
#else
#define skb_update_prio(skb)
#endif

/**
 *        dev_loopback_xmit - loop back @skb
 *        @net: network namespace this loopback is happening in
 *        @sk:  sk needed to be a netfilter okfn
 *        @skb: buffer to transmit
 *
 *        Turns @skb into a PACKET_LOOPBACK packet and feeds it to
 *        netif_rx(). @net and @sk are not used by the body.
 *
 *        Return: always 0.
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        skb_reset_mac_header(skb);
        /* Strip everything in front of the network header. */
        __skb_pull(skb, skb_network_offset(skb));
        skb->pkt_type = PACKET_LOOPBACK;
        /* A packet that carries no checksum information is marked as not
         * needing verification.
         */
        if (skb->ip_summed == CHECKSUM_NONE)
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
        skb_dst_force(skb);
        netif_rx(skb);
        return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

#ifdef CONFIG_NET_EGRESS
/* Return the tx queue of @dev designated by the queue mapping already
 * stored in @skb, after passing it through netdev_cap_txqueue().
 */
static struct netdev_queue *
netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
{
        unsigned int idx = netdev_cap_txqueue(dev, skb_get_queue_mapping(skb));

        return netdev_get_tx_queue(dev, idx);
}

#ifndef CONFIG_PREEMPT_RT
/* !PREEMPT_RT variant: report this CPU's softnet_data.xmit.skip_txqueue
 * flag, as last written by netdev_xmit_skip_txqueue().
 */
static bool netdev_xmit_txqueue_skipped(void)
{
        return __this_cpu_read(softnet_data.xmit.skip_txqueue);
}

/* !PREEMPT_RT variant: store @skip in this CPU's
 * softnet_data.xmit.skip_txqueue flag. __dev_queue_xmit() clears the flag
 * before running the egress hook and reads it back afterwards to decide
 * whether to reuse the queue mapping already stored in the skb.
 */
void netdev_xmit_skip_txqueue(bool skip)
{
        __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);

#else
/* PREEMPT_RT variant: the flag is kept in the current task
 * (current->net_xmit.skip_txqueue) rather than in per-CPU data.
 */
static bool netdev_xmit_txqueue_skipped(void)
{
        return current->net_xmit.skip_txqueue;
}

/* PREEMPT_RT variant: store @skip in the current task's
 * net_xmit.skip_txqueue flag.
 */
void netdev_xmit_skip_txqueue(bool skip)
{
        current->net_xmit.skip_txqueue = skip;
}
EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
#endif
#endif /* CONFIG_NET_EGRESS */

#ifdef CONFIG_NET_XGRESS
/* Run the tc classifier chain reachable through @entry's mini-Qdisc on
 * @skb.
 *
 * Returns TC_ACT_UNSPEC without classifying when CONFIG_NET_CLS_ACT is
 * disabled, when no mini-Qdisc is attached, or when one of the two bypass
 * checks fires. Otherwise returns the verdict of tcf_classify().
 *
 * @drop_reason is an in/out parameter: its value is stored in the skb
 * before classification and, for a TC_ACT_SHOT verdict, it is overwritten
 * with the reason read back from the skb.
 */
static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
                  enum skb_drop_reason *drop_reason)
{
        int ret = TC_ACT_UNSPEC;
#ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
        struct tcf_result res;

        if (!miniq)
                return ret;

        /* Global bypass */
        if (!static_branch_likely(&tcf_sw_enabled_key))
                return ret;

        /* Block-wise bypass */
        if (tcf_block_bypass_sw(miniq->block))
                return ret;

        /* Reset the per-packet classifier state kept in the skb cb. */
        tc_skb_cb(skb)->mru = 0;
        tc_skb_cb(skb)->post_ct = false;
        tcf_set_drop_reason(skb, *drop_reason);

        mini_qdisc_bstats_cpu_update(miniq, skb);
        ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
        /* Only tcf related quirks below. */
        switch (ret) {
        case TC_ACT_SHOT:
                *drop_reason = tcf_get_drop_reason(skb);
                mini_qdisc_qstats_cpu_drop(miniq);
                break;
        case TC_ACT_OK:
        case TC_ACT_RECLASSIFY:
                /* Keep the minor part of the matched class id. */
                skb->tc_index = TC_H_MIN(res.classid);
                break;
        }
#endif /* CONFIG_NET_CLS_ACT */
        return ret;
}

/* Static key, initially false, adjusted by tcx_inc()/tcx_dec().
 * sch_handle_ingress() and sch_handle_egress() call tcx_run() only while
 * it is enabled.
 */
static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);

/* Take one reference on tcx_needed_key via static_branch_inc(). */
void tcx_inc(void)
{
        static_branch_inc(&tcx_needed_key);
}

/* Drop one reference on tcx_needed_key via static_branch_dec(). */
void tcx_dec(void)
{
        static_branch_dec(&tcx_needed_key);
}

/* Run the BPF programs of @entry on @skb, in order, stopping at the first
 * one whose return value is not TCX_NEXT.
 *
 * When @needs_mac is true, skb->mac_len bytes are pushed in front of the
 * data before the programs run and pulled again afterwards.
 *
 * Returns the last program's return value (TCX_NEXT if no program ran)
 * translated by tcx_action_code().
 */
static __always_inline enum tcx_action_base
tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
        const bool needs_mac)
{
        const struct bpf_mprog_fp *fp;
        const struct bpf_prog *prog;
        int ret = TCX_NEXT;

        if (needs_mac)
                __skb_push(skb, skb->mac_len);
        bpf_mprog_foreach_prog(entry, fp, prog) {
                /* Recomputed before every program run. */
                bpf_compute_data_pointers(skb);
                ret = bpf_prog_run(prog, skb);
                if (ret != TCX_NEXT)
                        break;
        }
        if (needs_mac)
                __skb_pull(skb, skb->mac_len);
        return tcx_action_code(skb, ret);
}

/* Run the ingress tcx/tc hooks of skb->dev on @skb.
 *
 * A pending *@pt_prev handler is delivered to first (and cleared) so that
 * it sees the packet before the hooks can change it.
 *
 * Returns @skb when processing should continue: no hook entry, a verdict
 * not handled by the switch below, or a redirect that came back with
 * -EAGAIN, in which case *@another is set to true. Returns NULL when the
 * packet was redirected, dropped or consumed; *@ret then holds the
 * NET_RX_* result.
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
{
        struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int sch_ret;

        if (!entry)
                return skb;

        /* Every return path below must pair this with bpf_net_ctx_clear(). */
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        qdisc_skb_cb(skb)->pkt_len = skb->len;
        tcx_set_ingress(skb, true);

        /* tcx programs run first; tc_run() is consulted only when they
         * did not produce a verdict.
         */
        if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, true);
                if (sch_ret != TC_ACT_UNSPEC)
                        goto ingress_verdict;
        }
        sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
ingress_verdict:
        switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* skb_mac_header check was done by BPF, so we can safely
                 * push the L2 header back before redirecting to another
                 * netdev.
                 */
                __skb_push(skb, skb->mac_len);
                if (skb_do_redirect(skb) == -EAGAIN) {
                        /* Undo the push and let the caller go round again. */
                        __skb_pull(skb, skb->mac_len);
                        *another = true;
                        break;
                }
                *ret = NET_RX_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        case TC_ACT_SHOT:
                kfree_skb_reason(skb, drop_reason);
                *ret = NET_RX_DROP;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        /* used by tc_run */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                fallthrough;
        case TC_ACT_CONSUMED:
                /* No consume_skb() on this path, unlike the cases above. */
                *ret = NET_RX_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        }
        bpf_net_ctx_clear(bpf_net_ctx);

        return skb;
}

/* Run the egress tcx/tc hooks of @dev on @skb.
 *
 * Returns @skb when transmission should continue (no hook entry, or a
 * verdict not handled by the switch below). Returns NULL when the packet
 * was redirected, dropped or consumed; *@ret then holds the NET_XMIT_*
 * result.
 */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
        struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
        enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int sch_ret;

        if (!entry)
                return skb;

        /* Every return path below must pair this with bpf_net_ctx_clear(). */
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
         * already set by the caller.
         */
        if (static_branch_unlikely(&tcx_needed_key)) {
                sch_ret = tcx_run(entry, skb, false);
                if (sch_ret != TC_ACT_UNSPEC)
                        goto egress_verdict;
        }
        sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
egress_verdict:
        switch (sch_ret) {
        case TC_ACT_REDIRECT:
                /* No need to push/pop skb's mac_header here on egress! */
                /* The result of skb_do_redirect() is not inspected here. */
                skb_do_redirect(skb);
                *ret = NET_XMIT_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        case TC_ACT_SHOT:
                kfree_skb_reason(skb, drop_reason);
                *ret = NET_XMIT_DROP;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        /* used by tc_run */
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
        case TC_ACT_TRAP:
                consume_skb(skb);
                fallthrough;
        case TC_ACT_CONSUMED:
                /* No consume_skb() on this path, unlike the cases above. */
                *ret = NET_XMIT_SUCCESS;
                bpf_net_ctx_clear(bpf_net_ctx);
                return NULL;
        }
        bpf_net_ctx_clear(bpf_net_ctx);

        return skb;
}
#else
/* CONFIG_NET_XGRESS disabled: no ingress hook to run, @skb is passed
 * through unchanged and no output parameter is touched.
 */
static __always_inline struct sk_buff *
sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                   struct net_device *orig_dev, bool *another)
{
        return skb;
}

/* CONFIG_NET_XGRESS disabled: no egress hook to run, @skb is passed
 * through unchanged and *@ret is not touched.
 */
static __always_inline struct sk_buff *
sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
{
        return skb;
}
#endif /* CONFIG_NET_XGRESS */

#ifdef CONFIG_XPS
/* Look up a tx queue for @skb in @dev_maps.
 *
 * @tci is the id (row) to look up; the column is the traffic class that
 * @dev maps skb->priority to. When the selected map holds several
 * queues, one is chosen by scaling the skb hash to the map length.
 *
 * Returns the queue index, or -1 when the id or traffic class is outside
 * the map, no map is installed for the slot, or the chosen queue is not
 * below dev->real_num_tx_queues.
 */
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
                               struct xps_dev_maps *dev_maps, unsigned int tci)
{
        const int tc = netdev_get_prio_tc_map(dev, skb->priority);
        struct xps_map *qmap;
        unsigned int slot;
        int picked;

        if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
                return -1;

        /* Row-major index: one row of num_tc entries per id. */
        slot = tci * dev_maps->num_tc + tc;

        qmap = rcu_dereference(dev_maps->attr_map[slot]);
        if (!qmap)
                return -1;

        picked = qmap->len == 1 ?
                 qmap->queues[0] :
                 qmap->queues[reciprocal_scale(skb_get_hash(skb), qmap->len)];

        if (unlikely(picked >= dev->real_num_tx_queues))
                return -1;

        return picked;
}
#endif

/* Pick a tx queue for @skb from the XPS maps of @sb_dev.
 *
 * The rx-queue based map is tried first (only while xps_rxqs_needed is
 * enabled and the socket has a recorded rx queue); the CPU based map,
 * indexed by skb->sender_cpu - 1, is the fallback.
 *
 * Returns the queue index, or -1 when XPS is compiled out, not enabled,
 * or yields no usable queue.
 */
static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
                         struct sk_buff *skb)
{
#ifdef CONFIG_XPS
        struct xps_dev_maps *maps;
        struct sock *owner = skb->sk;
        int picked = -1;

        if (!static_key_false(&xps_needed))
                return -1;

        rcu_read_lock();

        if (static_key_false(&xps_rxqs_needed)) {
                maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
                if (maps) {
                        int rxq = sk_rx_queue_get(owner);

                        if (rxq >= 0)
                                picked = __get_xps_queue_idx(dev, skb, maps,
                                                             rxq);
                }
        }

        /* Fall back to the per-CPU map when the rx-queue map gave nothing. */
        if (picked < 0) {
                maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
                if (maps) {
                        unsigned int cpu_id = skb->sender_cpu - 1;

                        picked = __get_xps_queue_idx(dev, skb, maps,
                                                     cpu_id);
                }
        }

        rcu_read_unlock();

        return picked;
#else
        return -1;
#endif
}

/* Queue selection helper that always selects tx queue 0; none of the
 * arguments are examined.
 */
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev)
{
        return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);

/* Default tx queue selection for @skb on @dev.
 *
 * The queue cached in the socket is reused as long as it is valid for
 * @dev and the skb does not have ooo_okay set. Otherwise a queue is taken
 * from XPS or, failing that, from skb_tx_hash(), and cached back into the
 * socket when that is a full socket with a dst cache entry.
 *
 * @sb_dev defaults to @dev when NULL.
 */
u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
                     struct net_device *sb_dev)
{
        struct sock *owner = skb->sk;
        int cached = sk_tx_queue_get(owner);
        int chosen;

        if (!sb_dev)
                sb_dev = dev;

        /* Fast path: the socket's cached queue is still usable. */
        if (cached >= 0 && !skb->ooo_okay &&
            cached < dev->real_num_tx_queues)
                return cached;

        chosen = get_xps_queue(dev, sb_dev, skb);
        if (chosen < 0)
                chosen = skb_tx_hash(dev, sb_dev, skb);

        if (cached != chosen && owner &&
            sk_fullsock(owner) &&
            rcu_access_pointer(owner->sk_dst_cache))
                sk_tx_queue_set(owner, chosen);

        return chosen;
}
EXPORT_SYMBOL(netdev_pick_tx);

/* Select the tx queue of @dev for @skb, record the choice in the skb's
 * queue mapping and return the queue.
 *
 * Devices with exactly one real tx queue always get queue 0. Otherwise
 * the driver's ndo_select_queue() is used when provided, netdev_pick_tx()
 * when not, and the result is passed through netdev_cap_txqueue().
 */
struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
                                         struct sk_buff *skb,
                                         struct net_device *sb_dev)
{
        int txq_idx = 0;

#ifdef CONFIG_XPS
        u32 cpu_idx = skb->sender_cpu - 1;

        /* sender_cpu holds cpu + 1; a value that does not decode to a
         * CPU below NR_CPUS is replaced with the current CPU.
         */
        if (cpu_idx >= (u32)NR_CPUS)
                skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

        if (dev->real_num_tx_queues != 1) {
                const struct net_device_ops *ops = dev->netdev_ops;

                txq_idx = ops->ndo_select_queue ?
                          ops->ndo_select_queue(dev, skb, sb_dev) :
                          netdev_pick_tx(dev, skb, sb_dev);

                txq_idx = netdev_cap_txqueue(dev, txq_idx);
        }

        skb_set_queue_mapping(skb, txq_idx);
        return netdev_get_tx_queue(dev, txq_idx);
}

/**
 * __dev_queue_xmit() - transmit a buffer
 * @skb:        buffer to transmit
 * @sb_dev:        subordinate device used for L2 forwarding offload
 *
 * Queue a buffer for transmission to a network device. The caller must
 * have set the device and priority and built the buffer before calling
 * this function. The function can be called from an interrupt.
 *
 * When calling this method, interrupts MUST be enabled. This is because
 * the BH enable code must have IRQs enabled so that it will not deadlock.
 *
 * Regardless of the return value, the skb is consumed, so it is currently
 * difficult to retry a send to this method. (You can bump the ref count
 * before sending to hold a reference for retry if you are careful.)
 *
 * Return:
 * * 0                                - buffer successfully transmitted
 * * positive qdisc return code        - NET_XMIT_DROP etc.
 * * negative errno                - other errors
 */
int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq = NULL;
        struct Qdisc *q;
        int rc = -ENOMEM;
        bool again = false;

        skb_reset_mac_header(skb);
        skb_assert_len(skb);

        if (unlikely(skb_shinfo(skb)->tx_flags &
                     (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
                __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        qdisc_pkt_len_init(skb);
        tcx_set_ingress(skb, false);
#ifdef CONFIG_NET_EGRESS
        if (static_branch_unlikely(&egress_needed_key)) {
                if (nf_hook_egress_active()) {
                        skb = nf_hook_egress(skb, &rc, dev);
                        if (!skb)
                                goto out;
                }

                /* Clear the flag so that the check after the egress hook
                 * only sees a request made while the hook ran.
                 */
                netdev_xmit_skip_txqueue(false);

                nf_skip_egress(skb, true);
                skb = sch_handle_egress(skb, &rc, dev);
                if (!skb)
                        goto out;
                nf_skip_egress(skb, false);

                /* Queue selection was requested to be skipped: use the
                 * queue mapping already stored in the skb.
                 */
                if (netdev_xmit_txqueue_skipped())
                        txq = netdev_tx_queue_mapping(dev, skb);
        }
#endif
        /* If device/qdisc don't need skb->dst, release it right now while
         * it's hot in this cpu cache.
         */
        if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
                skb_dst_drop(skb);
        else
                skb_dst_force(skb);

        if (!txq)
                txq = netdev_core_pick_tx(dev, skb, sb_dev);

        q = rcu_dereference_bh(txq->qdisc);

        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue operation takes the packet from here. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
         * loopback, all the sorts of tunnels...

         * Really, it is unlikely that netif_tx_lock protection is necessary
         * here.  (f.e. loopback and IP tunnels are clean ignoring statistics
         * counters.)
         * However, it is possible, that they rely on protection
         * made by us here.

         * Check this and shot the lock. It is not prone from deadlocks.
         * Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Other cpus might concurrently change txq->xmit_lock_owner
                 * to -1 or to their cpu id, but not to our id.
                 */
                if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
                        if (dev_xmit_recursion())
                                goto recursion_alert;

                        skb = validate_xmit_skb(skb, dev, &again);
                        if (!skb)
                                goto out;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                dev_xmit_recursion_inc();
                                skb = dev_hard_start_xmit(skb, dev, txq, &rc);
                                dev_xmit_recursion_dec();
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /* Failure path: device down, queue stopped, transmit not
         * completed, or recursion. Whatever is still held in skb is
         * freed and accounted as dropped.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(__dev_queue_xmit);

/* Transmit @skb on tx queue @queue_id of skb->dev by calling
 * netdev_start_xmit() directly under the queue's tx lock; no qdisc is
 * involved.
 *
 * The packet is dropped (freed, tx_dropped bumped, NET_XMIT_DROP
 * returned) when the device is not running or has no carrier, or when
 * validation returns anything other than the original skb, which covers
 * both a NULL result and a segmented replacement.
 *
 * Otherwise returns the result of netdev_start_xmit(), or NETDEV_TX_BUSY
 * when the queue is frozen or stopped by the driver; the skb is not freed
 * here on that path.
 */
int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
        struct net_device *dev = skb->dev;
        struct sk_buff *orig_skb = skb;
        struct netdev_queue *txq;
        int ret = NETDEV_TX_BUSY;
        bool again = false;

        if (unlikely(!netif_running(dev) ||
                     !netif_carrier_ok(dev)))
                goto drop;

        skb = validate_xmit_skb_list(skb, dev, &again);
        if (skb != orig_skb)
                goto drop;

        skb_set_queue_mapping(skb, queue_id);
        txq = skb_get_tx_queue(dev, skb);

        local_bh_disable();

        dev_xmit_recursion_inc();
        HARD_TX_LOCK(dev, txq, smp_processor_id());
        if (!netif_xmit_frozen_or_drv_stopped(txq))
                ret = netdev_start_xmit(skb, dev, txq, false);
        HARD_TX_UNLOCK(dev, txq);
        dev_xmit_recursion_dec();

        local_bh_enable();
        return ret;
drop:
        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return NET_XMIT_DROP;
}
EXPORT_SYMBOL(__dev_direct_xmit);

/*************************************************************************
 *                        Receiver routines
 *************************************************************************/
/* Per-CPU task pointer. ____napi_schedule() compares it with
 * napi->thread to recognise this CPU's backlog thread when
 * use_backlog_threads() is true.
 */
static DEFINE_PER_CPU(struct task_struct *, backlog_napi);

int weight_p __read_mostly = 64;           /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1;  /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1;  /* bias for output_queue quota */

/* Schedule @napi for polling. Called with irq disabled.
 *
 * A threaded NAPI instance with a thread attached is handed to that
 * thread (NAPI_STATE_SCHED_THREADED is set and the thread is woken),
 * unless the thread is this CPU's backlog thread while backlog threads
 * are in use. In every other case @napi is appended to @sd->poll_list and
 * NET_RX_SOFTIRQ is raised if we are not already inside net_rx_action().
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        struct task_struct *thread;

        lockdep_assert_irqs_disabled();

        if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
                /* Paired with smp_mb__before_atomic() in
                 * napi_enable()/dev_set_threaded().
                 * Use READ_ONCE() to guarantee a complete
                 * read on napi->thread. Only call
                 * wake_up_process() when it's not NULL.
                 */
                thread = READ_ONCE(napi->thread);
                if (thread) {
                        if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
                                goto use_local_napi;

                        set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
                        wake_up_process(thread);
                        return;
                }
        }

use_local_napi:
        list_add_tail(&napi->poll_list, &sd->poll_list);
        /* Record which CPU's poll list now holds this instance. */
        WRITE_ONCE(napi->list_owner, smp_processor_id());
        /* If not called from net_rx_action()
         * we have to raise NET_RX_SOFTIRQ.
         */
        if (!sd->in_net_rx_action)
                raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* Exported static keys for RPS and RFS.
 * NOTE(review): presumably enabled while RPS maps / RFS flow tables are
 * configured; the code that toggles them is not visible here - confirm.
 */
struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);

/* Map a flow @hash to a slot of @flow_table by hashing it down to
 * flow_table->log bits with hash_32().
 */
static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
{
        const u32 slot = hash_32(hash, flow_table->log);

        return slot;
}

/* Record @next_cpu as the CPU handling the flow described by @rflow and
 * return the flow entry that describes the flow from now on.
 *
 * When @next_cpu is a valid CPU id, the current input_queue_head of that
 * CPU's softnet_data is saved in the entry's last_qtail. With
 * CONFIG_RFS_ACCEL the driver may additionally be asked, through
 * ndo_rx_flow_steer(), to steer the flow to the rx queue looked up for
 * @next_cpu; on success the returned entry is the one in that queue's
 * flow table rather than the @rflow passed in.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        if (next_cpu < nr_cpu_ids) {
                u32 head;
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u16 rxq_index;
                u32 flow_id;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the queue chosen for next_cpu. */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = rfs_slot(skb_get_hash(skb), flow_table);
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                /* Switch to the entry in the target queue's table and
                 * store the value returned by the driver as its filter.
                 */
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                WRITE_ONCE(rflow->filter, rc);
                /* The previous entry must not keep claiming that filter. */
                if (old_rflow->filter == rc)
                        WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
        out:
#endif
                head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
                rps_input_queue_tail_save(&rflow->last_qtail, head);
        }

        WRITE_ONCE(rflow->cpu, next_cpu);
        return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * Lookup order:
 *   1. RFS: if both the queue's flow table and the global socket flow table
 *      exist and the socket flow table entry matches the skb hash, use the
 *      CPU recorded there (subject to the in-order checks below).
 *   2. RPS: otherwise pick a CPU from the queue's rps_map using the hash.
 *
 * *rflowp is written only when step 1 selects a CPU.
 *
 * Returns the selected online CPU, or -1 when no steering applies (queue
 * index out of range, neither RFS nor RPS configured on the queue, zero
 * hash, or the candidate CPU is not online).
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                       struct rps_dev_flow **rflowp)
{
        const struct rps_sock_flow_table *sock_flow_table;
        struct netdev_rx_queue *rxqueue = dev->_rx;
        struct rps_dev_flow_table *flow_table;
        struct rps_map *map;
        int cpu = -1;
        u32 tcpu;
        u32 hash;

        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);

                if (unlikely(index >= dev->real_num_rx_queues)) {
                        /* Warn only when the device really is multiqueue */
                        WARN_ONCE(dev->real_num_rx_queues > 1,
                                  "%s received packet on queue %u, but number "
                                  "of RX queues is %u\n",
                                  dev->name, index, dev->real_num_rx_queues);
                        goto done;
                }
                rxqueue += index;
        }

        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */

        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        map = rcu_dereference(rxqueue->rps_map);
        if (!flow_table && !map)
                goto done;

        skb_reset_network_header(skb);
        hash = skb_get_hash(skb);
        /* A zero hash is treated as "no hash": do not steer */
        if (!hash)
                goto done;

        sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
                struct rps_dev_flow *rflow;
                u32 next_cpu;
                u32 ident;

                /* First check into global flow table if there is a match.
                 * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
                 */
                ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
                /* Bits outside rps_cpu_mask must equal the hash's bits,
                 * otherwise the entry does not match this hash.
                 */
                if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
                        goto try_rps;

                /* The bits inside rps_cpu_mask carry the desired CPU */
                next_cpu = ident & net_hotdata.rps_cpu_mask;

                /* OK, now we know there is a match,
                 * we can look at the local (per receive queue) flow table
                 */
                rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
                tcpu = rflow->cpu;

                /*
                 * If the desired CPU (where last recvmsg was done) is
                 * different from current CPU (one in the rx-queue flow
                 * table entry), switch if one of the following holds:
                 *   - Current CPU is unset (>= nr_cpu_ids).
                 *   - Current CPU is offline.
                 *   - The current CPU's queue tail has advanced beyond the
                 *     last packet that was enqueued using this table entry.
                 *     This guarantees that all previous packets for the flow
                 *     have been dequeued, thus preserving in order delivery.
                 */
                if (unlikely(tcpu != next_cpu) &&
                    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
                     ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
                      rflow->last_qtail)) >= 0)) {
                        tcpu = next_cpu;
                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                }

                /* Otherwise fall through to plain RPS below */
                if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
                        *rflowp = rflow;
                        cpu = tcpu;
                        goto done;
                }
        }

try_rps:

        if (map) {
                /* Index into the map is derived from the hash and map->len */
                tcpu = map->cpus[reciprocal_scale(hash, map->len)];
                if (cpu_online(tcpu)) {
                        cpu = tcpu;
                        goto done;
                }
        }

done:
        return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 *
 * Return: %false only while the flow table entry for @flow_id still carries
 * @filter_id, names a valid CPU, and that CPU's input queue head has advanced
 * by less than 10 times the flow table size since the entry's last_qtail was
 * recorded; %true in every other case.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
                         u32 flow_id, u16 filter_id)
{
        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
        struct rps_dev_flow_table *table;
        struct rps_dev_flow *entry;
        bool expire = true;
        unsigned int cpu;

        rcu_read_lock();

        table = rcu_dereference(rxqueue->rps_flow_table);
        if (!table || flow_id >= (1UL << table->log))
                goto unlock;

        entry = &table->flows[flow_id];
        cpu = READ_ONCE(entry->cpu);
        if (READ_ONCE(entry->filter) != filter_id || cpu >= nr_cpu_ids)
                goto unlock;

        /* Signed difference keeps the comparison valid across wraparound */
        if ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
                  READ_ONCE(entry->last_qtail)) <
            (int)(10 << table->log))
                expire = false;

unlock:
        rcu_read_unlock();
        return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* IPI callback, called from hardirq context.
 * @data is the softnet_data whose backlog NAPI gets scheduled; its
 * received_rps counter is bumped afterwards.
 */
static void rps_trigger_softirq(void *data)
{
        struct softnet_data *target = data;

        ____napi_schedule(target, &target->backlog);
        target->received_rps++;
}

#endif /* CONFIG_RPS */

/* IPI callback, called from hardirq context.
 * Raises NET_RX_SOFTIRQ, then clears defer_ipi_scheduled with release
 * semantics so that kick_defer_list_purge() can arm a new IPI.
 */
static void trigger_rx_softirq(void *data)
{
        struct softnet_data *owner = data;

        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        smp_store_release(&owner->defer_ipi_scheduled, 0);
}

/*
 * Called once a packet has been queued into sd->input_pkt_queue, to make
 * sure that queue gets serviced soon.
 *
 * - Remote CPU queue with backlog threads: schedule the remote backlog
 *   NAPI directly.
 *
 * - Remote CPU queue otherwise: link @sd on our rps_ipi_list; unless we are
 *   already inside net_rx_action() or napi_threaded_poll(), also raise
 *   NET_RX_SOFTIRQ so that the list gets processed.
 *
 * - Our own queue: NAPI schedule our backlog, which also raises
 *   NET_RX_SOFTIRQ.
 */
static void napi_schedule_rps(struct softnet_data *sd)
{
        struct softnet_data *local_sd = this_cpu_ptr(&softnet_data);

#ifdef CONFIG_RPS
        if (sd != local_sd) {
                bool in_rx_path;

                if (use_backlog_threads()) {
                        __napi_schedule_irqoff(&sd->backlog);
                        return;
                }

                /* Push the remote sd on the head of our pending IPI list */
                sd->rps_ipi_next = local_sd->rps_ipi_list;
                local_sd->rps_ipi_list = sd;

                in_rx_path = local_sd->in_net_rx_action ||
                             local_sd->in_napi_threaded_poll;
                if (!in_rx_path)
                        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                return;
        }
#endif /* CONFIG_RPS */
        __napi_schedule_irqoff(&local_sd->backlog);
}

/*
 * Get the softnet_data @sd, which belongs to @cpu, serviced.
 *
 * Without backlog threads: send an async IPI (sd->defer_csd) to @cpu, but
 * only if defer_ipi_scheduled could be flipped from 0 to 1, i.e. no IPI is
 * already pending.
 *
 * With backlog threads: under the backlog lock, schedule sd->backlog unless
 * NAPI_STATE_SCHED was already set.
 */
void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
{
        unsigned long flags;

        if (!use_backlog_threads()) {
                if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
                        smp_call_function_single_async(cpu, &sd->defer_csd);
                return;
        }

        backlog_lock_irq_save(sd, &flags);

        /* Non-atomic bit op; it runs with the backlog lock held */
        if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
                __napi_schedule_irqoff(&sd->backlog);

        backlog_unlock_irq_restore(sd, &flags);
}

#ifdef CONFIG_NET_FLOW_LIMIT
/* Flow limit table length, 4096 (1 << 12) by default.
 * NOTE(review): presumably the bucket count used when a per-CPU
 * sd_flow_limit table is set up (cf. fl->num_buckets) - confirm at the user.
 */
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

/*
 * skb_flow_limit - decide whether @skb should be dropped by the flow limit.
 *
 * Only active with CONFIG_NET_FLOW_LIMIT, once @qlen has reached half of
 * net_hotdata.max_backlog, and when this CPU has a flow_limit table.
 * The table keeps a ring (history) of the last FLOW_LIMIT_HISTORY bucket
 * ids plus a per-bucket counter; the bucket of the entry falling out of
 * the ring is decremented and the bucket of @skb is incremented.
 *
 * Returns true when the bucket of @skb now holds more than half of the
 * history (fl->count is bumped in that case), false otherwise.
 */
static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
        unsigned int evicted, incoming;
        struct sd_flow_limit *fl;
        struct softnet_data *sd;
        bool over_limit = false;

        if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1))
                return false;

        sd = this_cpu_ptr(&softnet_data);

        rcu_read_lock();
        fl = rcu_dereference(sd->flow_limit);
        if (!fl)
                goto unlock;

        /* Replace the oldest history slot with the bucket of this skb */
        incoming = skb_get_hash(skb) & (fl->num_buckets - 1);
        evicted = fl->history[fl->history_head];
        fl->history[fl->history_head] = incoming;

        /* Advance the ring index, wrapping at FLOW_LIMIT_HISTORY */
        fl->history_head++;
        fl->history_head &= FLOW_LIMIT_HISTORY - 1;

        if (likely(fl->buckets[evicted]))
                fl->buckets[evicted]--;

        if (++fl->buckets[incoming] > (FLOW_LIMIT_HISTORY >> 1)) {
                fl->count++;
                over_limit = true;
        }

unlock:
        rcu_read_unlock();
        return over_limit;
#else
        return false;
#endif
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * @skb:   packet to queue; it is freed here on every drop path
 * @cpu:   CPU whose softnet_data input_pkt_queue receives the packet
 * @qtail: on success, receives (via rps_input_queue_tail_save()) the value
 *         returned by rps_input_queue_tail_incr() for the target queue
 *
 * Returns NET_RX_SUCCESS when the skb was queued, NET_RX_DROP otherwise.
 * Drop reasons: SKB_DROP_REASON_DEV_READY when skb->dev is not running,
 * SKB_DROP_REASON_CPU_BACKLOG when the queue is longer than
 * net_hotdata.max_backlog or skb_flow_limit() rejects the packet.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                              unsigned int *qtail)
{
        enum skb_drop_reason reason;
        struct softnet_data *sd;
        unsigned long flags;
        unsigned int qlen;
        int max_backlog;
        u32 tail;

        reason = SKB_DROP_REASON_DEV_READY;
        if (!netif_running(skb->dev))
                goto bad_dev;

        reason = SKB_DROP_REASON_CPU_BACKLOG;
        sd = &per_cpu(softnet_data, cpu);

        /* Lockless pre-check: drop without taking the backlog lock when
         * the queue is already over the limit.
         */
        qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
        max_backlog = READ_ONCE(net_hotdata.max_backlog);
        if (unlikely(qlen > max_backlog))
                goto cpu_backlog_drop;
        backlog_lock_irq_save(sd, &flags);
        /* Re-read the length now that the lock is held */
        qlen = skb_queue_len(&sd->input_pkt_queue);
        if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) {
                if (!qlen) {
                        /* Schedule NAPI for backlog device. We can use
                         * non atomic operation as we own the queue lock.
                         */
                        if (!__test_and_set_bit(NAPI_STATE_SCHED,
                                                &sd->backlog.state))
                                napi_schedule_rps(sd);
                }
                __skb_queue_tail(&sd->input_pkt_queue, skb);
                tail = rps_input_queue_tail_incr(sd);
                backlog_unlock_irq_restore(sd, &flags);

                /* save the tail outside of the critical section */
                rps_input_queue_tail_save(qtail, tail);
                return NET_RX_SUCCESS;
        }

        backlog_unlock_irq_restore(sd, &flags);

cpu_backlog_drop:
        /* Per-CPU drop counter; not bumped for the device-not-running case */
        atomic_inc(&sd->dropped);
bad_dev:
        dev_core_stats_rx_dropped_inc(skb->dev);
        kfree_skb_reason(skb, reason);
        return NET_RX_DROP;
}

/*
 * netif_get_rxqueue - find the RX queue structure @skb was received on.
 *
 * Returns the device's first RX queue when the skb has no recorded queue or
 * when the recorded index is not below real_num_rx_queues (a one-time
 * warning is emitted in the latter case if the device has more than one
 * queue); otherwise returns the queue at the recorded index.
 */
static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_rx_queue *first = dev->_rx;
        u16 index;

        if (!skb_rx_queue_recorded(skb))
                return first;

        index = skb_get_rx_queue(skb);
        if (likely(index < dev->real_num_rx_queues))
                return first + index;

        WARN_ONCE(dev->real_num_rx_queues > 1,
                  "%s received packet on queue %u, but number "
                  "of RX queues is %u\n",
                  dev->name, index, dev->real_num_rx_queues);

        /* Out-of-range index: fall back to the first rxqueue */
        return first;
}

/*
 * bpf_prog_run_generic_xdp - run an XDP program on an skb (generic XDP).
 *
 * @skb:      packet; skb->data is mac_len bytes past the MAC header on entry
 * @xdp:      caller-provided xdp_buff, initialised here from @skb
 * @xdp_prog: program to run
 *
 * Builds @xdp so that it covers the packet starting at the MAC header, runs
 * @xdp_prog, then propagates changes the program made (head/tail
 * adjustment, frags size, Ethernet header, metadata) back into @skb.
 *
 * Returns the action returned by the program.  The skb is never freed here;
 * see the comment above the final switch for the caller's obligations.
 */
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
                             const struct bpf_prog *xdp_prog)
{
        void *orig_data, *orig_data_end, *hard_start;
        struct netdev_rx_queue *rxqueue;
        bool orig_bcast, orig_host;
        u32 mac_len, frame_sz;
        __be16 orig_eth_type;
        struct ethhdr *eth;
        u32 metalen, act;
        int off;

        /* The XDP program wants to see the packet starting at the MAC
         * header.
         */
        mac_len = skb->data - skb_mac_header(skb);
        hard_start = skb->data - skb_headroom(skb);

        /* SKB "head" area always has tailroom for skb_shared_info */
        frame_sz = (void *)skb_end_pointer(skb) - hard_start;
        frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));

        rxqueue = netif_get_rxqueue(skb);
        xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
        xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
                         skb_headlen(skb) + mac_len, true);
        if (skb_is_nonlinear(skb)) {
                skb_shinfo(skb)->xdp_frags_size = skb->data_len;
                xdp_buff_set_frags_flag(xdp);
        } else {
                xdp_buff_clear_frags_flag(xdp);
        }

        /* Snapshot the state needed to detect what the program changed */
        orig_data_end = xdp->data_end;
        orig_data = xdp->data;
        eth = (struct ethhdr *)xdp->data;
        orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
        orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
        orig_eth_type = eth->h_proto;

        act = bpf_prog_run_xdp(xdp_prog, xdp);

        /* check if bpf_xdp_adjust_head was used */
        off = xdp->data - orig_data;
        if (off) {
                if (off > 0)
                        __skb_pull(skb, off);
                else if (off < 0)
                        __skb_push(skb, -off);

                skb->mac_header += off;
                skb_reset_network_header(skb);
        }

        /* check if bpf_xdp_adjust_tail was used */
        off = xdp->data_end - orig_data_end;
        if (off != 0) {
                skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
                skb->len += off; /* positive on grow, negative on shrink */
        }

        /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
         * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
         */
        if (xdp_buff_has_frags(xdp))
                skb->data_len = skb_shinfo(skb)->xdp_frags_size;
        else
                skb->data_len = 0;

        /* check if XDP changed the eth hdr such that the SKB needs an update */
        eth = (struct ethhdr *)xdp->data;
        if ((orig_eth_type != eth->h_proto) ||
            (orig_host != ether_addr_equal_64bits(eth->h_dest,
                                                  skb->dev->dev_addr)) ||
            (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
                __skb_push(skb, ETH_HLEN);
                skb->pkt_type = PACKET_HOST;
                skb->protocol = eth_type_trans(skb, skb->dev);
        }

        /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
         * before calling us again on redirect path. We do not call do_redirect
         * as we leave that up to the caller.
         *
         * Caller is responsible for managing lifetime of skb (i.e. calling
         * kfree_skb in response to actions it cannot handle/XDP_DROP).
         */
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
                __skb_push(skb, mac_len);
                break;
        case XDP_PASS:
                /* Record the metadata area size, if the program left one */
                metalen = xdp->data - xdp->data_meta;
                if (metalen)
                        skb_metadata_set(skb, metalen);
                break;
        }

        return act;
}

/*
 * netif_skb_check_for_xdp - make *pskb usable for generic XDP.
 *
 * First tries skb_cow_data_for_xdp() using this CPU's system_page_pool; if
 * that returns 0 nothing more is needed.  Otherwise the skb head is expanded
 * once (enough headroom for XDP_PACKET_HEADROOM, enough tailroom to hold the
 * paged data plus 128 bytes) and the skb is linearized.
 *
 * Returns 0 on success, or the error from pskb_expand_head() or
 * skb_linearize().
 */
static int
netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
        struct sk_buff *skb = *pskb;
        int head_short, tail_short;
        int grow_head, grow_tail;
        int err;

        err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog);
        if (!err)
                return 0;

        /* In case we have to go down the path and also linearize,
         * then let's do the pskb_expand_head() work just once here.
         */
        head_short = XDP_PACKET_HEADROOM - skb_headroom(skb);
        tail_short = skb->tail + skb->data_len - skb->end;

        grow_head = head_short > 0 ? ALIGN(head_short, NET_SKB_PAD) : 0;
        grow_tail = tail_short > 0 ? tail_short + 128 : 0;

        err = pskb_expand_head(skb, grow_head, grow_tail, GFP_ATOMIC);
        if (err)
                return err;

        return skb_linearize(skb);
}

/*
 * netif_receive_generic_xdp - prepare *pskb and run @xdp_prog on it.
 *
 * Returns XDP_PASS without running the program for redirected skbs.
 * Otherwise returns the program's action; XDP_DROP is also returned when
 * preparing the skb fails.  The skb is freed here for XDP_DROP,
 * XDP_ABORTED and unknown actions; for XDP_PASS, XDP_TX and XDP_REDIRECT
 * it is left to the caller.
 */
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
                                     struct xdp_buff *xdp,
                                     const struct bpf_prog *xdp_prog)
{
        struct sk_buff *skb = *pskb;
        u32 mac_len, act = XDP_DROP;

        /* Reinjected packets coming from act_mirred or similar should
         * not get XDP generic processing.
         */
        if (skb_is_redirected(skb))
                return XDP_PASS;

        /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
         * bytes. This is the guarantee that also native XDP provides,
         * thus we need to do it here as well.
         */
        mac_len = skb->data - skb_mac_header(skb);
        /* Expose the MAC header while the skb is checked/reshaped */
        __skb_push(skb, mac_len);

        if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
            skb_headroom(skb) < XDP_PACKET_HEADROOM) {
                if (netif_skb_check_for_xdp(pskb, xdp_prog))
                        goto do_drop;
        }

        /* *pskb is used from here on: the helper above received pskb and
         * may have replaced the skb - TODO confirm against
         * skb_cow_data_for_xdp().
         */
        __skb_pull(*pskb, mac_len);

        act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
        switch (act) {
        case XDP_REDIRECT:
        case XDP_TX:
        case XDP_PASS:
                break;
        default:
                bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_ABORTED:
                trace_xdp_exception((*pskb)->dev, xdp_prog, act);
                fallthrough;
        case XDP_DROP:
        /* Also reached by goto on preparation failure, with act == XDP_DROP */
        do_drop:
                kfree_skb(*pskb);
                break;
        }

        return act;
}

/* Transmit an skb on behalf of generic XDP (XDP_TX).
 *
 * To match in-driver XDP behavior, generic XDP has to bypass the qdisc
 * layer and the network taps.  As a consequence XDP packets are able to
 * starve other packets going through a qdisc, and DDOS attacks will be more
 * effective.  In-driver XDP uses dedicated TX queues and therefore does not
 * have this starvation issue.
 *
 * If the queue is frozen/stopped or the driver does not complete the
 * transmit, the skb is freed here, after an xdp_exception trace event and a
 * tx_dropped increment.
 */
void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        bool sent = false;
        int cpu;

        txq = netdev_core_pick_tx(dev, skb, NULL);
        cpu = smp_processor_id();

        HARD_TX_LOCK(dev, txq, cpu);
        if (!netif_xmit_frozen_or_drv_stopped(txq)) {
                int rc = netdev_start_xmit(skb, dev, txq, 0);

                sent = dev_xmit_complete(rc);
        }
        HARD_TX_UNLOCK(dev, txq);

        if (sent)
                return;

        trace_xdp_exception(dev, xdp_prog, XDP_TX);
        dev_core_stats_tx_dropped_inc(dev);
        kfree_skb(skb);
}

/* Static key, off by default.  When enabled, __netif_receive_skb_core()
 * runs do_xdp_generic() on every received skb.
 */
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);

/*
 * do_xdp_generic - run generic XDP on *pskb and act on the verdict.
 *
 * Returns XDP_PASS when @xdp_prog is NULL or the verdict is XDP_PASS; the
 * caller keeps processing *pskb.  Returns XDP_DROP for every other verdict:
 * the skb has then been transmitted (XDP_TX), redirected (XDP_REDIRECT, or
 * freed with SKB_DROP_REASON_XDP if the redirect failed), or already been
 * freed by netif_receive_generic_xdp().
 *
 * A bpf_net_context is installed around the program run and cleared on
 * every exit path.
 */
int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct xdp_buff xdp;
        u32 act;
        int err;

        if (!xdp_prog)
                return XDP_PASS;

        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
        act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);

        switch (act) {
        case XDP_PASS:
                bpf_net_ctx_clear(bpf_net_ctx);
                return XDP_PASS;
        case XDP_REDIRECT:
                err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
                                              &xdp, xdp_prog);
                if (err) {
                        /* Redirect failed: the skb is still ours to free */
                        bpf_net_ctx_clear(bpf_net_ctx);
                        kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
                        return XDP_DROP;
                }
                break;
        case XDP_TX:
                generic_xdp_tx(*pskb, xdp_prog);
                break;
        }

        bpf_net_ctx_clear(bpf_net_ctx);
        return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);

/*
 * netif_rx_internal - queue @skb on a per-CPU backlog.
 *
 * With CONFIG_RPS and the rps_needed key enabled, the target CPU comes from
 * get_rps_cpu() (falling back to the current CPU when it returns a negative
 * value); otherwise the skb is queued on the current CPU.
 *
 * Returns the result of enqueue_to_backlog().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
        unsigned int qtail;

        net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        trace_netif_rx(skb);

#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                /* voidflow absorbs the tail when no flow entry was matched */
                struct rps_dev_flow voidflow, *rflow = &voidflow;
                int cpu, ret;

                rcu_read_lock();

                cpu = get_rps_cpu(skb->dev, skb, &rflow);
                if (cpu < 0)
                        cpu = smp_processor_id();

                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

                rcu_read_unlock();
                return ret;
        }
#endif
        return enqueue_to_backlog(skb, smp_processor_id(), &qtail);
}

/**
 * __netif_rx - Slightly optimized version of netif_rx
 * @skb: buffer to post
 *
 * Behaves like netif_rx() except that bottom halves are not disabled.
 * Consequently this function may only be invoked from interrupt context
 * (either hard or soft interrupt); this is checked with a lockdep
 * assertion.
 *
 * Return: the result of netif_rx_internal().
 */
int __netif_rx(struct sk_buff *skb)
{
        int status;

        lockdep_assert_once(hardirq_count() | softirq_count());

        trace_netif_rx_entry(skb);
        status = netif_rx_internal(skb);
        trace_netif_rx_exit(status);

        return status;
}
EXPORT_SYMBOL(__netif_rx);

/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * Receives a packet from a device driver and queues it for the upper
 * (protocol) levels to process via the backlog NAPI device.  It always
 * succeeds.  The buffer may be dropped during processing for congestion
 * control or by the protocol layers.  Modern NIC drivers should use NAPI
 * and GRO instead.
 *
 * This function can be used from interrupt and from process context.  A
 * caller in process context must not disable interrupts before invoking
 * this function.  Bottom halves are disabled around the enqueue when the
 * caller is in neither hardirq nor softirq context.
 *
 * Return:
 * NET_RX_SUCCESS (no congestion) or
 * NET_RX_DROP (packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
        const bool must_disable_bh = !(hardirq_count() | softirq_count());
        int status;

        if (must_disable_bh)
                local_bh_disable();

        trace_netif_rx_entry(skb);
        status = netif_rx_internal(skb);
        trace_netif_rx_exit(status);

        if (must_disable_bh)
                local_bh_enable();

        return status;
}
EXPORT_SYMBOL(netif_rx);

/*
 * net_tx_action - process this CPU's pending transmit-side work.
 * NOTE(review): presumably registered as the NET_TX_SOFTIRQ handler; the
 * registration is not visible here.
 *
 * Three steps, all on this CPU's softnet_data:
 *   1. completion_queue: detach the list with IRQs disabled, then trace and
 *      free every skb on it;
 *   2. output_queue: detach the list of scheduled qdiscs with IRQs
 *      disabled, then run each one under rcu_read_lock();
 *   3. xfrm_dev_backlog().
 */
static __latent_entropy void net_tx_action(void)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);

        if (sd->completion_queue) {
                struct sk_buff *clist;

                /* Take the whole list; it is walked with IRQs enabled again */
                local_irq_disable();
                clist = sd->completion_queue;
                sd->completion_queue = NULL;
                local_irq_enable();

                while (clist) {
                        struct sk_buff *skb = clist;

                        clist = clist->next;

                        /* skbs queued here are expected to have no users left */
                        WARN_ON(refcount_read(&skb->users));
                        if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
                                trace_consume_skb(skb, net_tx_action);
                        else
                                trace_kfree_skb(skb, net_tx_action,
                                                get_kfree_skb_cb(skb)->reason, NULL);

                        if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
                                __kfree_skb(skb);
                        else
                                __napi_kfree_skb(skb,
                                                 get_kfree_skb_cb(skb)->reason);
                }
        }

        if (sd->output_queue) {
                struct Qdisc *head;

                /* Detach the list and reset the tail pointer to empty */
                local_irq_disable();
                head = sd->output_queue;
                sd->output_queue = NULL;
                sd->output_queue_tailp = &sd->output_queue;
                local_irq_enable();

                rcu_read_lock();

                while (head) {
                        struct Qdisc *q = head;
                        spinlock_t *root_lock = NULL;

                        head = head->next_sched;

                        /* We need to make sure head->next_sched is read
                         * before clearing __QDISC_STATE_SCHED
                         */
                        smp_mb__before_atomic();

                        /* Qdiscs without TCQ_F_NOLOCK run under their lock */
                        if (!(q->flags & TCQ_F_NOLOCK)) {
                                root_lock = qdisc_lock(q);
                                spin_lock(root_lock);
                        } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
                                                     &q->state))) {
                                /* There is a synchronize_net() between
                                 * STATE_DEACTIVATED flag being set and
                                 * qdisc_reset()/some_qdisc_is_busy() in
                                 * dev_deactivate(), so we can safely bail out
                                 * early here to avoid data race between
                                 * qdisc_deactivate() and some_qdisc_is_busy()
                                 * for lockless qdisc.
                                 */
                                clear_bit(__QDISC_STATE_SCHED, &q->state);
                                continue;
                        }

                        clear_bit(__QDISC_STATE_SCHED, &q->state);
                        qdisc_run(q);
                        if (root_lock)
                                spin_unlock(root_lock);
                }

                rcu_read_unlock();
        }

        xfrm_dev_backlog(sd);
}

#if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
/* This hook is defined here for ATM LANE.
 * It is a function pointer taking a device and an address, built only when
 * both CONFIG_BRIDGE and CONFIG_ATM_LANE are enabled, and exported to GPL
 * modules.
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

/**
 *        netdev_is_rx_handler_busy - check if receive handler is registered
 *        @dev: device to check
 *
 *        Check if a receive handler is already registered for a given device.
 *        Return true if there one.
 *
 *        The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
        ASSERT_RTNL();
        return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 *        netdev_rx_handler_register - register receive handler
 *        @dev: device to register a handler for
 *        @rx_handler: receive handler to register
 *        @rx_handler_data: data pointer that is used by rx handler
 *
 *        Register a receive handler for a device. This handler will then be
 *        called from __netif_receive_skb. A negative errno code is returned
 *        on a failure.
 *
 *        The caller must hold the rtnl_mutex.
 *
 *        For a general description of rx_handler, see enum rx_handler_result.
 *
 *        Return: 0 on success, -EBUSY if a handler is already registered on
 *        @dev, -EINVAL if @dev has IFF_NO_RX_HANDLER set.
 */
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data)
{
        if (netdev_is_rx_handler_busy(dev))
                return -EBUSY;

        /* NOTE(review): @dev is dereferenced here, so unlike the busy check
         * above this function requires a non-NULL @dev.
         */
        if (dev->priv_flags & IFF_NO_RX_HANDLER)
                return -EINVAL;

        /* Note: rx_handler_data must be set before rx_handler */
        rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
        rcu_assign_pointer(dev->rx_handler, rx_handler);

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *        netdev_rx_handler_unregister - unregister receive handler
 *        @dev: device to unregister a handler from
 *
 *        Unregister a receive handler from a device. The handler pointer is
 *        cleared first; the handler data pointer is cleared only after
 *        synchronize_net() has returned.
 *
 *        The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{

        ASSERT_RTNL();
        RCU_INIT_POINTER(dev->rx_handler, NULL);
        /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
         * section has a guarantee to see a non NULL rx_handler_data
         * as well.
         */
        synchronize_net();
        RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
        switch (skb->protocol) {
        case htons(ETH_P_ARP):
        case htons(ETH_P_IP):
        case htons(ETH_P_IPV6):
        case htons(ETH_P_8021Q):
        case htons(ETH_P_8021AD):
                return true;
        default:
                return false;
        }
}

/*
 * nf_ingress - run the netfilter ingress hook on @skb, if one is active.
 *
 * When the hook is active, a pending packet_type in *pt_prev is delivered
 * first (its result stored in *ret, *pt_prev reset to NULL), then the hook
 * runs under rcu_read_lock().
 *
 * Returns the hook's verdict, or 0 when no ingress hook is active.
 */
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
                             int *ret, struct net_device *orig_dev)
{
        int verdict;

        if (!nf_hook_ingress_active(skb))
                return 0;

        if (*pt_prev) {
                *ret = deliver_skb(skb, *pt_prev, orig_dev);
                *pt_prev = NULL;
        }

        rcu_read_lock();
        verdict = nf_hook_ingress(skb);
        rcu_read_unlock();

        return verdict;
}

static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
                                    struct packet_type **ppt_prev)
{
        struct packet_type *ptype, *pt_prev;
        rx_handler_func_t *rx_handler;
        struct sk_buff *skb = *pskb;
        struct net_device *orig_dev;
        bool deliver_exact = false;
        int ret = NET_RX_DROP;
        __be16 type;

        net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        trace_netif_receive_skb(skb);

        orig_dev = skb->dev;

        skb_reset_network_header(skb);
#if !defined(CONFIG_DEBUG_NET)
        /* We plan to no longer reset the transport header here.
         * Give some time to fuzzers and dev build to catch bugs
         * in network stacks.
         */
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
#endif
        skb_reset_mac_len(skb);

        pt_prev = NULL;

another_round:
        skb->skb_iif = skb->dev->ifindex;

        __this_cpu_inc(softnet_data.processed);

        if (static_branch_unlikely(&generic_xdp_needed_key)) {
                int ret2;

                migrate_disable();
                ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
                                      &skb);
                migrate_enable();

                if (ret2 != XDP_PASS) {
                        ret = NET_RX_DROP;
                        goto out;
                }
        }

        if (eth_type_vlan(skb->protocol)) {
                skb = skb_vlan_untag(skb);
                if (unlikely(!skb))
                        goto out;
        }

        if (skb_skip_tc_classify(skb))
                goto skip_classify;

        if (pfmemalloc)
                goto skip_taps;

        list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
                                list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

        list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
        }

skip_taps:
#ifdef CONFIG_NET_INGRESS
        if (static_branch_unlikely(&ingress_needed_key)) {
                bool another = false;

                nf_skip_egress(skb, true);
                skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
                                         &another);
                if (another)
                        goto another_round;
                if (!skb)
                        goto out;

                nf_skip_egress(skb, false);
                if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
                        goto out;
        }
#endif
        skb_reset_redirect(skb);
skip_classify:
        if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
                goto drop;

        if (skb_vlan_tag_present(skb)) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                if (vlan_do_receive(&skb))
                        goto another_round;
                else if (unlikely(!skb))
                        goto out;
        }

        rx_handler = rcu_dereference(skb->dev->rx_handler);
        if (rx_handler) {
                if (pt_prev) {
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                        pt_prev = NULL;
                }
                switch (rx_handler(&skb)) {
                case RX_HANDLER_CONSUMED:
                        ret = NET_RX_SUCCESS;
                        goto out;
                case RX_HANDLER_ANOTHER:
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
                        break;
                case RX_HANDLER_PASS:
                        break;
                default:
                        BUG();
                }
        }

        if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
check_vlan_id:
                if (skb_vlan_tag_get_id(skb)) {
                        /* Vlan id is non 0 and vlan_do_receive() above couldn't
                         * find vlan device.
                         */
                        skb->pkt_type = PACKET_OTHERHOST;
                } else if (eth_type_vlan(skb->protocol)) {
                        /* Outer header is 802.1P with vlan 0, inner header is
                         * 802.1Q or 802.1AD and vlan_do_receive() above could
                         * not find vlan dev for vlan id 0.
                         */
                        __vlan_hwaccel_clear_tag(skb);
                        skb = skb_vlan_untag(skb);
                        if (unlikely(!skb))
                                goto out;
                        if (vlan_do_receive(&skb))
                                /* After stripping off 802.1P header with vlan 0
                                 * vlan dev is found for inner header.
                                 */
                                goto another_round;
                        else if (unlikely(!skb))
                                goto out;
                        else
                                /* We have stripped outer 802.1P vlan 0 header.
                                 * But could not find vlan dev.
                                 * check again for vlan id to set OTHERHOST.
                                 */
                                goto check_vlan_id;
                }
                /* Note: we might in the future use prio bits
                 * and set skb->priority like in vlan_do_receive()
                 * For the time being, just ignore Priority Code Point
                 */
                __vlan_hwaccel_clear_tag(skb);
        }

        type = skb->protocol;

        /* deliver only exact match when indicated */
        if (likely(!deliver_exact)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);

                /* orig_dev and skb->dev could belong to different netns;
                 * Even in such case we need to traverse only the list
                 * coming from skb->dev, as the ptype owner (packet socket)
                 * will use dev_net(skb->dev) to do namespace filtering.
                 */
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &dev_net_rcu(skb->dev)->ptype_specific);
        }

        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                               &orig_dev->ptype_specific);

        if (unlikely(skb->dev != orig_dev)) {
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &skb->dev->ptype_specific);
        }

        if (pt_prev) {
                if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
                        goto drop;
                *ppt_prev = pt_prev;
        } else {
drop:
                if (!deliver_exact)
                        dev_core_stats_rx_dropped_inc(skb->dev);
                else
                        dev_core_stats_rx_nohandler_inc(skb->dev);
                kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
                /* Jamal, now you will not able to escape explaining
                 * me how you were going to use this. :-)
                 */
                ret = NET_RX_DROP;
        }

out:
        /* The invariant here is that if *ppt_prev is not NULL
         * then skb should also be non-NULL.
         *
         * Apparently *ppt_prev assignment above holds this invariant due to
         * skb dereferencing near it.
         */
        *pskb = skb;
        return ret;
}

/* Feed one skb through __netif_receive_skb_core() and, if the core leaves a
 * final packet_type pending, hand the skb to that handler.
 *
 * skb->dev is sampled before the core runs: the core is given &skb, and the
 * handler receives both skb->dev as it stands afterwards and the device that
 * was seen on entry.
 *
 * Returns the handler's result when one is invoked, otherwise the value
 * returned by the core.
 */
static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
{
        struct packet_type *last_ptype = NULL;
        struct net_device *ingress_dev = skb->dev;
        int rc;

        rc = __netif_receive_skb_core(&skb, pfmemalloc, &last_ptype);
        if (!last_ptype)
                return rc;

        return INDIRECT_CALL_INET(last_ptype->func, ipv6_rcv, ip_rcv, skb,
                                  skb->dev, last_ptype, ingress_dev);
}

/**
 * netif_receive_skb_core - special purpose version of netif_receive_skb
 * @skb: buffer to process
 *
 * A more direct receive entry point than netif_receive_skb(): the skb goes
 * straight to __netif_receive_skb_one_core() under rcu_read_lock(), without
 * the timestamp handling and RPS steering done by
 * netif_receive_skb_internal().  The skb is always processed as
 * non-pfmemalloc, so the caller must take care of handling
 * ``(page_is_)pfmemalloc`` itself.
 *
 * NOTE(review): this comment historically also said the function skips
 * Generic XDP, but __netif_receive_skb_core() runs do_xdp_generic() when
 * generic_xdp_needed_key is enabled -- confirm the intended contract.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb_core(struct sk_buff *skb)
{
        int rc;

        rcu_read_lock();
        rc = __netif_receive_skb_one_core(skb, false);
        rcu_read_unlock();
        return rc;
}
EXPORT_SYMBOL(netif_receive_skb_core);

/* Deliver every skb on @head to @pt_prev.
 *
 * Nothing happens when there is no packet_type or the list is empty.  A
 * packet_type that provides ->list_func gets the whole list in one call;
 * otherwise each skb is unlinked and passed to ->func individually.
 */
static inline void __netif_receive_skb_list_ptype(struct list_head *head,
                                                  struct packet_type *pt_prev,
                                                  struct net_device *orig_dev)
{
        struct sk_buff *cur, *tmp;

        if (!pt_prev || list_empty(head))
                return;

        if (pt_prev->list_func) {
                INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
                                   ip_list_rcv, head, pt_prev, orig_dev);
                return;
        }

        /* No batched handler: fall back to per-skb delivery. */
        list_for_each_entry_safe(cur, tmp, head, list) {
                skb_list_del_init(cur);
                pt_prev->func(cur, cur->dev, pt_prev, orig_dev);
        }
}

/* Run every skb on @head through __netif_receive_skb_core() and deliver the
 * survivors to their final packet_type in batches.
 *
 * The fast path assumes there is no RX handler and that exactly one
 * packet_type matches.  When that does not hold, some per-packet work is
 * done in-line by the core and only the 'last ptype' is handled per batch.
 * Ordering towards any single ptype is still preserved: the 'last ptype' is
 * constant within a batch and every other ptype is served per packet.
 */
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
{
        /* packet_type and original device shared by the current batch */
        struct packet_type *batch_ptype = NULL;
        struct net_device *batch_dev = NULL;
        struct sk_buff *skb, *tmp;
        LIST_HEAD(batch);

        list_for_each_entry_safe(skb, tmp, head, list) {
                struct net_device *ingress_dev = skb->dev;
                struct packet_type *last_ptype = NULL;

                skb_list_del_init(skb);
                __netif_receive_skb_core(&skb, pfmemalloc, &last_ptype);
                if (!last_ptype)
                        continue;

                if (batch_ptype != last_ptype || batch_dev != ingress_dev) {
                        /* key changed: flush what we have, start afresh */
                        __netif_receive_skb_list_ptype(&batch, batch_ptype,
                                                       batch_dev);
                        INIT_LIST_HEAD(&batch);
                        batch_ptype = last_ptype;
                        batch_dev = ingress_dev;
                }
                list_add_tail(&skb->list, &batch);
        }

        /* flush whatever is left in the last batch */
        __netif_receive_skb_list_ptype(&batch, batch_ptype, batch_dev);
}

/* Receive one skb, wrapping the core call in a PF_MEMALLOC section when the
 * skb is a pfmemalloc one and SOCK_MEMALLOC sockets exist.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
        unsigned int saved_flag;
        int rc;

        if (!sk_memalloc_socks() || !skb_pfmemalloc(skb))
                return __netif_receive_skb_one_core(skb, false);

        /*
         * PFMEMALLOC skbs are special, they should
         * - be delivered to SOCK_MEMALLOC sockets only
         * - stay away from userspace
         * - have bounded memory usage
         *
         * PF_MEMALLOC is used here so the allocation context does not have
         * to be propagated down to every allocation site.
         */
        saved_flag = memalloc_noreclaim_save();
        rc = __netif_receive_skb_one_core(skb, true);
        memalloc_noreclaim_restore(saved_flag);

        return rc;
}

/* Split @head into consecutive runs of skbs that share the same
 * "pfmemalloc" classification and pass each run to
 * __netif_receive_skb_list_core().
 *
 * An skb counts as pfmemalloc when sk_memalloc_socks() and
 * skb_pfmemalloc(skb) are both true.  While a pfmemalloc run is being
 * processed the task runs between memalloc_noreclaim_save() and
 * memalloc_noreclaim_restore().
 */
static void __netif_receive_skb_list(struct list_head *head)
{
        unsigned long noreclaim_flag = 0;
        struct sk_buff *skb, *next;
        bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */

        list_for_each_entry_safe(skb, next, head, list) {
                /* Classification flips at this skb: everything before it
                 * on @head forms one complete run.
                 */
                if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
                        struct list_head sublist;

                        /* Handle the previous sublist */
                        list_cut_before(&sublist, head, &skb->list);
                        if (!list_empty(&sublist))
                                __netif_receive_skb_list_core(&sublist, pfmemalloc);
                        pfmemalloc = !pfmemalloc;
                        /* See comments in __netif_receive_skb */
                        if (pfmemalloc)
                                noreclaim_flag = memalloc_noreclaim_save();
                        else
                                memalloc_noreclaim_restore(noreclaim_flag);
                }
        }
        /* Handle the remaining sublist */
        if (!list_empty(head))
                __netif_receive_skb_list_core(head, pfmemalloc);
        /* Restore pflags */
        if (pfmemalloc)
                memalloc_noreclaim_restore(noreclaim_flag);
}

/* Install, replace or remove the generic XDP program of @dev.
 *
 * Only XDP_SETUP_PROG is understood; any other command yields -EINVAL.
 * The new program pointer is published with rcu_assign_pointer() and the
 * reference on the previous program, if any, is dropped.
 * generic_xdp_needed_key is adjusted only on attach (no program before) and
 * on detach (no program after); on attach LRO and hardware GRO are disabled
 * on the device as well.
 *
 * Returns 0 on success or -EINVAL for an unsupported command.
 */
static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
{
        struct bpf_prog *prev_prog = rtnl_dereference(dev->xdp_prog);
        struct bpf_prog *next_prog = xdp->prog;

        if (xdp->command != XDP_SETUP_PROG)
                return -EINVAL;

        rcu_assign_pointer(dev->xdp_prog, next_prog);
        if (prev_prog)
                bpf_prog_put(prev_prog);

        if (prev_prog && !next_prog) {
                /* detach */
                static_branch_dec(&generic_xdp_needed_key);
        } else if (next_prog && !prev_prog) {
                /* first attach */
                static_branch_inc(&generic_xdp_needed_key);
                netif_disable_lro(dev);
                dev_disable_gro_hw(dev);
        }

        return 0;
}

/* Common receive entry for a single skb.
 *
 * Runs net_timestamp_check(), returns NET_RX_SUCCESS early when
 * skb_defer_rx_timestamp() takes the skb, and otherwise processes it under
 * rcu_read_lock().  With CONFIG_RPS and rps_needed enabled, an skb for which
 * get_rps_cpu() selects a CPU is queued on that CPU's backlog instead of
 * being processed here.
 *
 * Returns the result of enqueue_to_backlog() or __netif_receive_skb().
 */
static int netif_receive_skb_internal(struct sk_buff *skb)
{
        int rc;

        net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);

        if (skb_defer_rx_timestamp(skb))
                return NET_RX_SUCCESS;

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                struct rps_dev_flow dummy_flow, *flow = &dummy_flow;
                int target_cpu = get_rps_cpu(skb->dev, skb, &flow);

                if (target_cpu >= 0) {
                        rc = enqueue_to_backlog(skb, target_cpu,
                                                &flow->last_qtail);
                        rcu_read_unlock();
                        return rc;
                }
        }
#endif
        rc = __netif_receive_skb(skb);
        rcu_read_unlock();
        return rc;
}

void netif_receive_skb_list_internal(struct list_head *head)
{
        struct sk_buff *skb, *next;
        LIST_HEAD(sublist);

        list_for_each_entry_safe(skb, next, head, list) {
                net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
                                    skb);
                skb_list_del_init(skb);
                if (!skb_defer_rx_timestamp(skb))
                        list_add_tail(&skb->list, &sublist);
        }
        list_splice_init(&sublist, head);

        rcu_read_lock();
#ifdef CONFIG_RPS
        if (static_branch_unlikely(&rps_needed)) {
                list_for_each_entry_safe(skb, next, head, list) {
                        struct rps_dev_flow voidflow, *rflow = &voidflow;
                        int cpu = get_rps_cpu(skb->dev, skb, &rflow);

                        if (cpu >= 0) {
                                /* Will be handled, remove from list */
                                skb_list_del_init(skb);
                                enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
                        }
                }
        }
#endif
        __netif_receive_skb_list(head);
        rcu_read_unlock();
}

/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * Main receive data processing function.  It always succeeds; the buffer
 * may nevertheless be dropped during processing, either for congestion
 * control or by the protocol layers.  Entry and exit are reported through
 * the netif_receive_skb tracepoints.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
        int rc;

        trace_netif_receive_skb_entry(skb);
        rc = netif_receive_skb_internal(skb);
        trace_netif_receive_skb_exit(rc);

        return rc;
}
EXPORT_SYMBOL(netif_receive_skb);

/**
 * netif_receive_skb_list - process many receive buffers from network
 * @head: list of skbs to process.
 *
 * The return value of netif_receive_skb() is normally ignored and would
 * not be meaningful for a whole list, so this function returns void.
 * An empty list is a no-op (no tracepoints fire).
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 */
void netif_receive_skb_list(struct list_head *head)
{
        struct sk_buff *skb;

        if (list_empty(head))
                return;

        /* Only walk the list for tracing when the tracepoint is active. */
        if (trace_netif_receive_skb_list_entry_enabled())
                list_for_each_entry(skb, head, list)
                        trace_netif_receive_skb_list_entry(skb);

        netif_receive_skb_list_internal(head);
        trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);

/* Network device is going away, flush any packets still pending.
 *
 * Work handler (queued per CPU by flush_all_backlogs()).  On the CPU it
 * runs on, every skb whose device is in NETREG_UNREGISTERING state is moved
 * from input_pkt_queue and process_queue onto a private list, which is then
 * purged with SKB_DROP_REASON_DEV_READY.  @work itself is not examined.
 */
static void flush_backlog(struct work_struct *work)
{
        struct sk_buff *skb, *tmp;
        struct sk_buff_head list;
        struct softnet_data *sd;

        __skb_queue_head_init(&list);
        local_bh_disable();
        sd = this_cpu_ptr(&softnet_data);

        /* input_pkt_queue is walked under the backlog lock with irqs off */
        backlog_lock_irq_disable(sd);
        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
                if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->input_pkt_queue);
                        __skb_queue_tail(&list, skb);
                        rps_input_queue_head_incr(sd);
                }
        }
        backlog_unlock_irq_enable(sd);

        /* process_queue is walked under its own nested-BH local lock */
        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
                if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->process_queue);
                        __skb_queue_tail(&list, skb);
                        rps_input_queue_head_incr(sd);
                }
        }
        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
        local_bh_enable();

        /* Free the collected skbs only after all locks are released */
        __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}

/* Tell whether the backlog of @cpu needs to be flushed.
 *
 * With CONFIG_RPS the answer is true when either input_pkt_queue or
 * process_queue of that CPU is non-empty.  Without RPS the answer is
 * always true.
 */
static bool flush_required(int cpu)
{
#if IS_ENABLED(CONFIG_RPS)
        struct softnet_data *sd = &per_cpu(softnet_data, cpu);
        bool pending;

        backlog_lock_irq_disable(sd);

        /* Insertion into process_queue happens with the rps lock held, so
         * access to process_queue can race only with dequeue.
         */
        pending = !skb_queue_empty(&sd->input_pkt_queue) ||
                  !skb_queue_empty_lockless(&sd->process_queue);
        backlog_unlock_irq_enable(sd);

        return pending;
#else
        /* Without RPS input_pkt_queue cannot be checked safely: during a
         * concurrent remote skb_queue_splice() both input_pkt_queue and
         * process_queue may look empty even though the latter could end up
         * holding a lot of packets.
         */
        return true;
#endif
}

/* Bookkeeping for one flush_all_backlogs() run. */
struct flush_backlogs {
        /* CPUs on which a flush work item was queued during this run */
        cpumask_t                flush_cpus;
        /* One work item per CPU, indexed by CPU number; sized for
         * nr_cpu_ids entries by flush_backlogs_alloc().
         */
        struct work_struct        w[];
};

/* Allocate a flush_backlogs with room for nr_cpu_ids work items.
 * Returns whatever kmalloc(GFP_KERNEL) returns, i.e. possibly NULL.
 */
static struct flush_backlogs *flush_backlogs_alloc(void)
{
        struct flush_backlogs *fb;

        fb = kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
                     GFP_KERNEL);
        return fb;
}

/* Fallback instance used by flush_all_backlogs() when flush_backlogs_alloc()
 * fails.  NOTE(review): where it is allocated is not visible in this part
 * of the file -- confirm it is set up before the first flush can run.
 */
static struct flush_backlogs *flush_backlogs_fallback;
/* Held by flush_all_backlogs() for as long as it uses the fallback above. */
static DEFINE_MUTEX(flush_backlogs_mutex);

/* Queue flush_backlog() on every online CPU that flush_required() selects
 * and wait for all of those work items to finish.
 *
 * A freshly allocated flush_backlogs is used when possible; if the
 * allocation fails, the shared fallback is used instead, serialized by
 * flush_backlogs_mutex for the whole operation.
 */
static void flush_all_backlogs(void)
{
        struct flush_backlogs *fb = flush_backlogs_alloc();
        unsigned int cpu;

        if (!fb) {
                mutex_lock(&flush_backlogs_mutex);
                fb = flush_backlogs_fallback;
        }
        cpumask_clear(&fb->flush_cpus);

        cpus_read_lock();

        for_each_online_cpu(cpu) {
                if (!flush_required(cpu))
                        continue;

                INIT_WORK(&fb->w[cpu], flush_backlog);
                queue_work_on(cpu, system_highpri_wq, &fb->w[cpu]);
                __cpumask_set_cpu(cpu, &fb->flush_cpus);
        }

        /* There can be in-flight packet[s] on the CPUs that are not being
         * flushed; synchronize_net() in unregister_netdevice_many() will
         * take care of them.
         */
        for_each_cpu(cpu, &fb->flush_cpus)
                flush_work(&fb->w[cpu]);

        cpus_read_unlock();

        if (fb == flush_backlogs_fallback)
                mutex_unlock(&flush_backlogs_mutex);
        else
                kfree(fb);
}

/* Walk the rps_ipi_next chain starting at @remsd and fire an async
 * single-CPU function call at every entry whose CPU is online.
 * Compiles to nothing without CONFIG_RPS.
 */
static void net_rps_send_ipi(struct softnet_data *remsd)
{
#ifdef CONFIG_RPS
        struct softnet_data *next;

        for (; remsd; remsd = next) {
                /* Fetch the link before the call is issued */
                next = remsd->rps_ipi_next;

                if (cpu_online(remsd->cpu))
                        smp_call_function_single_async(remsd->cpu, &remsd->csd);
        }
#endif
}

/*
 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
 * Note: called with local irqs disabled, but exits with local irqs enabled.
 *
 * The pending list is detached from @sd while irqs are still off; the IPIs
 * themselves are sent after irqs have been re-enabled.  Nothing is sent
 * when backlog threads are in use.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *pending = sd->rps_ipi_list;

        if (!use_backlog_threads() && pending) {
                sd->rps_ipi_list = NULL;

                local_irq_enable();

                /* Kick RPS processing on the remote CPUs. */
                net_rps_send_ipi(pending);
                return;
        }
#endif
        local_irq_enable();
}

/* True when @sd has RPS IPIs queued and backlog threads are not in use.
 * Always false without CONFIG_RPS.
 */
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        if (use_backlog_threads())
                return false;

        return !!sd->rps_ipi_list;
#else
        return false;
#endif
}

/* Poll routine for the backlog NAPI embedded in struct softnet_data
 * (@napi is sd->backlog).
 *
 * Dequeues skbs from sd->process_queue and feeds them to
 * __netif_receive_skb() until either @quota packets have been handled or
 * both process_queue and input_pkt_queue are empty.  Whenever process_queue
 * drains, input_pkt_queue is spliced onto it under the backlog lock.
 *
 * Returns the number of skbs processed.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
        bool again = true;
        int work = 0;

        /* Check if we have pending IPIs; it's better to send them now
         * than to wait for the end of net_rx_action().
         */
        if (sd_has_rps_ipi_waiting(sd)) {
                local_irq_disable();
                net_rps_action_and_irq_enable(sd);
        }

        napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
        while (again) {
                struct sk_buff *skb;

                /* The process_queue lock is held only around the dequeue;
                 * it is dropped while the skb is being processed.
                 */
                local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                while ((skb = __skb_dequeue(&sd->process_queue))) {
                        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
                        rcu_read_lock();
                        __netif_receive_skb(skb);
                        rcu_read_unlock();
                        if (++work >= quota) {
                                /* Quota exhausted: account the work and
                                 * return without clearing napi->state.
                                 */
                                rps_input_queue_head_add(sd, work);
                                return work;
                        }

                        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                }
                local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);

                backlog_lock_irq_disable(sd);
                if (skb_queue_empty(&sd->input_pkt_queue)) {
                        /*
                         * Inline a custom version of __napi_complete().
                         * Only the current cpu owns and manipulates this napi,
                         * and NAPI_STATE_SCHED is the only possible flag set
                         * on backlog.
                         * We can use a plain write instead of clear_bit(),
                         * and we don't need an smp_mb() memory barrier.
                         */
                        napi->state &= NAPIF_STATE_THREADED;
                        again = false;
                } else {
                        /* Refill process_queue and go around again */
                        local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);
                        local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
                }
                backlog_unlock_irq_enable(sd);
        }

        if (work)
                rps_input_queue_head_add(sd, work);
        return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * Schedules the entry's receive function to run by handing @n to
 * ____napi_schedule() for the current CPU's softnet_data, with local
 * interrupts saved and disabled around the call.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
        local_irq_restore(irq_flags);
}
EXPORT_SYMBOL(__napi_schedule);

/**
 *        napi_schedule_prep - check if napi can be scheduled
 *        @n: napi context
 *
 * Test if NAPI routine is already running, and if not mark
 * it as running.  This is used as a condition variable to
 * ensure only one NAPI poll instance runs.  We also make
 * sure there is no pending NAPI disable.
 *
 * Return: true if this call set NAPIF_STATE_SCHED (it was clear before),
 * false if NAPIF_STATE_DISABLE is set or NAPIF_STATE_SCHED was already set
 * (in which case NAPIF_STATE_MISSED has been set as well).
 */
bool napi_schedule_prep(struct napi_struct *n)
{
        unsigned long new, val = READ_ONCE(n->state);

        /* Retry until n->state is updated atomically from the value the
         * decision was based on; try_cmpxchg() refreshes val on failure.
         */
        do {
                if (unlikely(val & NAPIF_STATE_DISABLE))
                        return false;
                new = val | NAPIF_STATE_SCHED;

                /* Sets STATE_MISSED bit if STATE_SCHED was already set
                 * This was suggested by Alexander Duyck, as compiler
                 * emits better code than :
                 * if (val & NAPIF_STATE_SCHED)
                 *     new |= NAPIF_STATE_MISSED;
                 */
                new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
                                                   NAPIF_STATE_MISSED;
        } while (!try_cmpxchg(&n->state, &val, new));

        return !(val & NAPIF_STATE_SCHED);
}
EXPORT_SYMBOL(napi_schedule_prep);

/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() for callers that have hard irqs masked.
 *
 * On PREEMPT_RT enabled kernels this simply calls __napi_schedule(),
 * because the interrupt disabled assumption might not be true
 * due to force-threaded interrupts and spinlock substitution.
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                __napi_schedule(n);
                return;
        }

        ____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

/**
 * napi_complete_done - finish a NAPI poll cycle
 * @n: napi context
 * @work_done: amount of work reported for this poll; only tested for
 *             zero / non-zero here
 *
 * Flushes GRO state, removes @n from the poll list, and clears the
 * scheduling bits in n->state.  If NAPIF_STATE_MISSED was set in the
 * meantime, NAPIF_STATE_SCHED is kept and @n is rescheduled instead.
 * When a GRO flush timeout applies, n->timer is armed before returning.
 *
 * Return: false if nothing was done because @n is in NPSVC or busy-poll
 * state, if @n was rescheduled because of NAPIF_STATE_MISSED, or if
 * hard-irq deferral is active with a non-zero timeout; true otherwise.
 */
bool napi_complete_done(struct napi_struct *n, int work_done)
{
        unsigned long flags, val, new, timeout = 0;
        bool ret = true;

        /*
         * 1) Don't let napi dequeue from the cpu poll list
         *    just in case its running on a different cpu.
         * 2) If we are busy polling, do nothing here, we have
         *    the guarantee we will be called later.
         */
        if (unlikely(n->state & (NAPIF_STATE_NPSVC |
                                 NAPIF_STATE_IN_BUSY_POLL)))
                return false;

        if (work_done) {
                /* Work was done: reload the deferral counter, and use the
                 * GRO flush timeout only if GRO currently holds packets.
                 */
                if (n->gro.bitmask)
                        timeout = napi_get_gro_flush_timeout(n);
                n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
        }
        if (n->defer_hard_irqs_count > 0) {
                /* Consume one deferral; with a non-zero timeout the caller
                 * is told (ret = false) that completion is being deferred.
                 */
                n->defer_hard_irqs_count--;
                timeout = napi_get_gro_flush_timeout(n);
                if (timeout)
                        ret = false;
        }

        /*
         * When the NAPI instance uses a timeout and keeps postponing
         * it, we need to bound somehow the time packets are kept in
         * the GRO layer.
         */
        gro_flush(&n->gro, !!timeout);
        gro_normal_list(&n->gro);

        if (unlikely(!list_empty(&n->poll_list))) {
                /* If n->poll_list is not empty, we need to mask irqs */
                local_irq_save(flags);
                list_del_init(&n->poll_list);
                local_irq_restore(flags);
        }
        WRITE_ONCE(n->list_owner, -1);

        val = READ_ONCE(n->state);
        do {
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));

                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
                              NAPIF_STATE_SCHED_THREADED |
                              NAPIF_STATE_PREFER_BUSY_POLL);

                /* If STATE_MISSED was set, leave STATE_SCHED set,
                 * because we will call napi->poll() one more time.
                 * This C code was suggested by Alexander Duyck to help gcc.
                 */
                new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
                                                    NAPIF_STATE_SCHED;
        } while (!try_cmpxchg(&n->state, &val, new));

        /* val holds the state that was replaced by the successful cmpxchg */
        if (unlikely(val & NAPIF_STATE_MISSED)) {
                __napi_schedule(n);
                return false;
        }

        if (timeout)
                hrtimer_start(&n->timer, ns_to_ktime(timeout),
                              HRTIMER_MODE_REL_PINNED);
        return ret;
}
EXPORT_SYMBOL(napi_complete_done);

/* Release every skb parked on @sd's deferred-free list.
 *
 * The list is detached (and defer_count reset) under sd->defer_lock; the
 * skbs are then released one by one with napi_consume_skb() after the lock
 * has been dropped.  A lockless peek skips the lock when the list is empty.
 */
static void skb_defer_free_flush(struct sk_buff *skb_unused_doc_anchor);

#if defined(CONFIG_NET_RX_BUSY_POLL)

/* Second half of busy_poll_stop().
 *
 * With @skip_schedule set, GRO is flushed and NAPI_STATE_SCHED is cleared
 * without rescheduling @napi.  Otherwise the pending GRO normal list is
 * pushed up and @napi is rescheduled.
 */
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
        if (skip_schedule) {
                /* Flush too old packets. If HZ < 1000, flush all packets */
                gro_flush(&napi->gro, HZ >= 1000);
                gro_normal_list(&napi->gro);

                clear_bit(NAPI_STATE_SCHED, &napi->state);
                return;
        }

        gro_normal_list(&napi->gro);
        __napi_schedule(napi);
}

/* Bit flags carried in the 'flags' argument of the busy-poll helpers. */
enum {
        /* Tested by busy_poll_stop() and __napi_busy_loop() */
        NAPI_F_PREFER_BUSY_POLL        = 1,
        /* NOTE(review): no user is visible in this part of the file */
        NAPI_F_END_ON_RESCHED        = 2,
};

/* Leave busy-poll mode for @napi.
 *
 * Clears NAPI_STATE_MISSED and NAPI_STATE_IN_BUSY_POLL, runs napi->poll()
 * once more with @budget, releases the netpoll poll lock identified by
 * @have_poll_lock and, if the poll used its whole budget, finishes via
 * __busy_poll_stop().
 *
 * @flags: NAPI_F_* bits; only NAPI_F_PREFER_BUSY_POLL is examined here.
 */
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
                           unsigned flags, u16 budget)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        bool skip_schedule = false;
        unsigned long timeout;
        int rc;

        /* Busy polling means there is a high chance device driver hard irq
         * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
         * set in napi_schedule_prep().
         * Since we are about to call napi->poll() once more, we can safely
         * clear NAPI_STATE_MISSED.
         *
         * Note: x86 could use a single "lock and ..." instruction
         * to perform these two clear_bit()
         */
        clear_bit(NAPI_STATE_MISSED, &napi->state);
        clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);

        local_bh_disable();
        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

        if (flags & NAPI_F_PREFER_BUSY_POLL) {
                /* Reload the deferral counter; if deferral and a timeout
                 * are both configured, arm the timer and skip the
                 * reschedule in __busy_poll_stop().
                 */
                napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
                timeout = napi_get_gro_flush_timeout(napi);
                if (napi->defer_hard_irqs_count && timeout) {
                        hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
                        skip_schedule = true;
                }
        }

        /* All we really want here is to re-enable device interrupts.
         * Ideally, a new ndo_busy_poll_stop() could avoid another round.
         */
        rc = napi->poll(napi, budget);
        /* We can't gro_normal_list() here, because napi->poll() might have
         * rearmed the napi (napi_complete_done()) in which case it could
         * already be running on another CPU.
         */
        trace_napi_poll(napi, rc, budget);
        netpoll_poll_unlock(have_poll_lock);
        if (rc == budget)
                __busy_poll_stop(napi, skip_schedule);
        bpf_net_ctx_clear(bpf_net_ctx);
        local_bh_enable();
}

/* Core busy-poll loop for the NAPI instance identified by @napi_id.
 *
 * @loop_end:     optional predicate called as loop_end(@loop_end_arg,
 *                start_time); the loop stops once it returns true. When NULL
 *                a single iteration is performed.
 * @loop_end_arg: opaque argument forwarded to @loop_end.
 * @flags:        NAPI_F_* bits (NAPI_F_PREFER_BUSY_POLL,
 *                NAPI_F_END_ON_RESCHED).
 * @budget:       budget handed to each poll call.
 *
 * Must be called inside an RCU read-side section (a one-time warning is
 * emitted otherwise). Unless NAPI_F_END_ON_RESCHED is set, the RCU read lock
 * is dropped and re-taken around cond_resched().
 */
static void __napi_busy_loop(unsigned int napi_id,
                      bool (*loop_end)(void *, unsigned long),
                      void *loop_end_arg, unsigned flags, u16 budget)
{
        unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
        int (*napi_poll)(struct napi_struct *napi, int budget);
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        void *have_poll_lock = NULL;
        struct napi_struct *napi;

        WARN_ON_ONCE(!rcu_read_lock_held());

restart:
        /* napi_poll == NULL means "this thread does not own the NAPI yet". */
        napi_poll = NULL;

        napi = napi_by_id(napi_id);
        if (!napi)
                return;

        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_disable();
        for (;;) {
                int work = 0;

                local_bh_disable();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
                if (!napi_poll) {
                        unsigned long val = READ_ONCE(napi->state);

                        /* If multiple threads are competing for this napi,
                         * we avoid dirtying napi->state as much as we can.
                         */
                        if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
                                   NAPIF_STATE_IN_BUSY_POLL)) {
                                if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        /* Try to become the owner by atomically setting
                         * IN_BUSY_POLL and SCHED; losing the race is handled
                         * like the "already busy" case above.
                         */
                        if (cmpxchg(&napi->state, val,
                                    val | NAPIF_STATE_IN_BUSY_POLL |
                                          NAPIF_STATE_SCHED) != val) {
                                if (flags & NAPI_F_PREFER_BUSY_POLL)
                                        set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
                                goto count;
                        }
                        have_poll_lock = netpoll_poll_lock(napi);
                        napi_poll = napi->poll;
                }
                work = napi_poll(napi, budget);
                trace_napi_poll(napi, work, budget);
                gro_normal_list(&napi->gro);
count:
                /* work is still 0 when ownership could not be taken. */
                if (work > 0)
                        __NET_ADD_STATS(dev_net(napi->dev),
                                        LINUX_MIB_BUSYPOLLRXPACKETS, work);
                skb_defer_free_flush(this_cpu_ptr(&softnet_data));
                bpf_net_ctx_clear(bpf_net_ctx);
                local_bh_enable();

                if (!loop_end || loop_end(loop_end_arg, start_time))
                        break;

                if (unlikely(need_resched())) {
                        if (flags & NAPI_F_END_ON_RESCHED)
                                break;
                        /* Give up ownership, preemption protection and the
                         * RCU read lock before yielding the CPU, then look
                         * the NAPI up again from scratch.
                         */
                        if (napi_poll)
                                busy_poll_stop(napi, have_poll_lock, flags, budget);
                        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                                preempt_enable();
                        rcu_read_unlock();
                        cond_resched();
                        rcu_read_lock();
                        /* loop_end is non-NULL here: a NULL loop_end already
                         * left the loop above.
                         */
                        if (loop_end(loop_end_arg, start_time))
                                return;
                        goto restart;
                }
                cpu_relax();
        }
        /* Normal exit: release ownership if we held it. */
        if (napi_poll)
                busy_poll_stop(napi, have_poll_lock, flags, budget);
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                preempt_enable();
}

/* Busy-poll entry point that does not take rcu_read_lock() itself; the loop
 * it calls warns when no RCU read-side section is active. The loop is always
 * told to stop on a pending reschedule (NAPI_F_END_ON_RESCHED).
 */
void napi_busy_loop_rcu(unsigned int napi_id,
                        bool (*loop_end)(void *, unsigned long),
                        void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
        unsigned flags;

        flags = NAPI_F_END_ON_RESCHED |
                (prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0);

        __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
}

/* Busy-poll the NAPI instance identified by @napi_id, holding the RCU read
 * lock around the whole loop on behalf of the caller.
 */
void napi_busy_loop(unsigned int napi_id,
                    bool (*loop_end)(void *, unsigned long),
                    void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{
        unsigned flags = 0;

        if (prefer_busy_poll)
                flags |= NAPI_F_PREFER_BUSY_POLL;

        rcu_read_lock();
        __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
        rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);

/* Arm the timer of the NAPI identified by @napi_id with its
 * irq_suspend_timeout, provided that timeout is non-zero. Nothing happens
 * when no NAPI has this id.
 */
void napi_suspend_irqs(unsigned int napi_id)
{
        struct napi_struct *napi;
        unsigned long timeout;

        rcu_read_lock();

        napi = napi_by_id(napi_id);
        if (!napi)
                goto out_unlock;

        timeout = napi_get_irq_suspend_timeout(napi);
        if (timeout)
                hrtimer_start(&napi->timer, ns_to_ktime(timeout),
                              HRTIMER_MODE_REL_PINNED);

out_unlock:
        rcu_read_unlock();
}

/* Schedule the NAPI identified by @napi_id (with bottom halves disabled)
 * if it exists and currently has a non-zero irq_suspend_timeout.
 */
void napi_resume_irqs(unsigned int napi_id)
{
        struct napi_struct *napi;

        rcu_read_lock();

        napi = napi_by_id(napi_id);

        /* If irq_suspend_timeout is set to 0 between the call to
         * napi_suspend_irqs and now, the original value still
         * determines the safety timeout as intended and napi_watchdog
         * will resume irq processing.
         */
        if (napi && napi_get_irq_suspend_timeout(napi)) {
                local_bh_disable();
                napi_schedule(napi);
                local_bh_enable();
        }

        rcu_read_unlock();
}

#endif /* CONFIG_NET_RX_BUSY_POLL */

/* Store @napi_id on @napi (both the GRO-cached copy and napi->napi_id) and
 * link the instance into the matching napi_hash bucket. No locking is done
 * here; the callers in this file take napi_hash_lock first.
 */
static void __napi_hash_add_with_id(struct napi_struct *napi,
                                    unsigned int napi_id)
{
        struct hlist_head *bucket;

        napi->gro.cached_napi_id = napi_id;
        WRITE_ONCE(napi->napi_id, napi_id);

        bucket = &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)];
        hlist_add_head_rcu(&napi->napi_hash_node, bucket);
}

/* Hash @napi under the caller-chosen @napi_id, holding napi_hash_lock with
 * interrupts disabled. Warns once if the id is already present.
 */
static void napi_hash_add_with_id(struct napi_struct *napi,
                                  unsigned int napi_id)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&napi_hash_lock, irq_flags);

        /* A hit here means two instances would share one id. */
        WARN_ON_ONCE(napi_by_id(napi_id));
        __napi_hash_add_with_id(napi, napi_id);

        spin_unlock_irqrestore(&napi_hash_lock, irq_flags);
}

/* Allocate a fresh NAPI id for @napi and hash the instance under it.
 * Instances flagged NAPI_STATE_NO_BUSY_POLL are left unhashed.
 */
static void napi_hash_add(struct napi_struct *napi)
{
        unsigned long irq_flags;

        if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
                return;

        spin_lock_irqsave(&napi_hash_lock, irq_flags);

        /* 0..NR_CPUS range is reserved for sender_cpu use.
         * Advance the generator, wrapping to MIN_NAPI_ID when it leaves the
         * valid range, until an id that is not in use is found.
         */
        for (;;) {
                if (unlikely(!napi_id_valid(++napi_gen_id)))
                        napi_gen_id = MIN_NAPI_ID;
                if (!napi_by_id(napi_gen_id))
                        break;
        }

        __napi_hash_add_with_id(napi, napi_gen_id);

        spin_unlock_irqrestore(&napi_hash_lock, irq_flags);
}

/* Unlink @napi from the NAPI hash table under napi_hash_lock.
 *
 * Warning: the caller must make sure an RCU grace period elapses before the
 * memory containing @napi is freed.
 */
static void napi_hash_del(struct napi_struct *napi)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&napi_hash_lock, irq_flags);
        hlist_del_init_rcu(&napi->napi_hash_node);
        spin_unlock_irqrestore(&napi_hash_lock, irq_flags);
}

/* hrtimer callback for napi->timer: schedule the owning NAPI unless it is
 * being disabled or is already scheduled. The timer is never re-armed from
 * here.
 */
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
        struct napi_struct *napi =
                container_of(timer, struct napi_struct, timer);

        if (napi_disable_pending(napi))
                return HRTIMER_NORESTART;

        /* Note : we use a relaxed variant of napi_schedule_prep() not setting
         * NAPI_STATE_MISSED, since we do not react to a device IRQ.
         */
        if (test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
                return HRTIMER_NORESTART;

        clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
        __napi_schedule_irqoff(napi);

        return HRTIMER_NORESTART;
}

/* Switch all NAPI instances of @dev to (@threaded == true) or away from
 * (@threaded == false) threaded polling.
 *
 * When enabling, a kthread is created for every NAPI on dev->napi_list that
 * does not have one yet. If one creation fails the request is turned into
 * "disable": dev->threaded and every NAPI's THREADED bit end up cleared and
 * the creation error is returned.
 *
 * Returns 0 on success (including when the mode is already as requested),
 * otherwise the error from napi_kthread_create().
 */
int dev_set_threaded(struct net_device *dev, bool threaded)
{
        struct napi_struct *napi;
        int err = 0;

        netdev_assert_locked_or_invisible(dev);

        if (dev->threaded == threaded)
                return 0;

        if (threaded) {
                list_for_each_entry(napi, &dev->napi_list, dev_list) {
                        if (!napi->thread) {
                                err = napi_kthread_create(napi);
                                if (err) {
                                        /* Fall back to non-threaded mode for
                                         * the whole device.
                                         */
                                        threaded = false;
                                        break;
                                }
                        }
                }
        }

        WRITE_ONCE(dev->threaded, threaded);

        /* Make sure kthread is created before THREADED bit
         * is set.
         */
        smp_mb__before_atomic();

        /* Setting/unsetting threaded mode on a napi might not immediately
         * take effect, if the current napi instance is actively being
         * polled. In this case, the switch between threaded mode and
         * softirq mode will happen in the next round of napi_schedule().
         * This should not cause hiccups/stalls to the live traffic.
         */
        list_for_each_entry(napi, &dev->napi_list, dev_list)
                assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);

        return err;
}
EXPORT_SYMBOL(dev_set_threaded);

/**
 * netif_queue_set_napi - Associate queue with the napi
 * @dev: device to which NAPI and queue belong
 * @queue_index: Index of queue
 * @type: queue type as RX or TX
 * @napi: NAPI context, pass NULL to clear previously set NAPI
 *
 * Set queue with its corresponding napi context. This should be done after
 * registering the NAPI handler for the queue-vector and the queues have been
 * mapped to the corresponding interrupt vector.
 *
 * A non-NULL @napi without a device triggers a one-time warning and is
 * ignored; queue types other than RX and TX are ignored silently.
 */
void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
                          enum netdev_queue_type type, struct napi_struct *napi)
{
        if (WARN_ON_ONCE(napi && !napi->dev))
                return;
        netdev_ops_assert_locked_or_invisible(dev);

        if (type == NETDEV_QUEUE_TYPE_RX) {
                struct netdev_rx_queue *rxq;

                rxq = __netif_get_rx_queue(dev, queue_index);
                rxq->napi = napi;
        } else if (type == NETDEV_QUEUE_TYPE_TX) {
                struct netdev_queue *txq;

                txq = netdev_get_tx_queue(dev, queue_index);
                txq->napi = napi;
        }
}
EXPORT_SYMBOL(netif_queue_set_napi);

/* IRQ affinity notifier callback installed by netif_napi_set_irq_locked().
 *
 * @notify: notifier embedded in the owning napi_struct.
 * @mask:   the new affinity mask of the IRQ.
 *
 * Records @mask in the NAPI's config when the device uses automatic IRQ
 * affinity and, with CONFIG_RFS_ACCEL, updates the device's CPU rmap entry
 * for this NAPI when the device uses an automatically managed rmap.
 */
static void
netif_napi_irq_notify(struct irq_affinity_notify *notify,
                      const cpumask_t *mask)
{
        struct napi_struct *napi =
                container_of(notify, struct napi_struct, notify);
#ifdef CONFIG_RFS_ACCEL
        struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
        int err;
#endif

        /* Remember the mask in the NAPI's config entry. */
        if (napi->config && napi->dev->irq_affinity_auto)
                cpumask_copy(&napi->config->affinity_mask, mask);

#ifdef CONFIG_RFS_ACCEL
        if (napi->dev->rx_cpu_rmap_auto) {
                /* A failed update is only reported, not propagated. */
                err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
                if (err)
                        netdev_warn(napi->dev, "RMAP update failed (%d)\n",
                                    err);
        }
#endif
}

#ifdef CONFIG_RFS_ACCEL
/* Release callback of the NAPI's IRQ affinity notifier (CONFIG_RFS_ACCEL).
 *
 * @ref: kref embedded in napi->notify.
 *
 * Asserts that the netdev lock is held and warns if NAPI_STATE_HAS_NOTIFIER
 * was still set (the bit is cleared either way). When the device uses an
 * automatically managed CPU rmap, the NAPI's slot in it is cleared and one
 * rmap reference is dropped.
 */
static void netif_napi_affinity_release(struct kref *ref)
{
        struct napi_struct *napi =
                container_of(ref, struct napi_struct, notify.kref);
        struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;

        netdev_assert_locked(napi->dev);
        WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
                                   &napi->state));

        if (!napi->dev->rx_cpu_rmap_auto)
                return;
        /* Forget this NAPI in the rmap and mark its index as unused. */
        rmap->obj[napi->napi_rmap_idx] = NULL;
        napi->napi_rmap_idx = -1;
        cpu_rmap_put(rmap);
}

/* Allocate an IRQ CPU rmap sized for @num_irqs and mark it as automatically
 * managed on @dev. Calling this again once enabled is a no-op.
 *
 * Returns 0 on success or -ENOMEM when the rmap cannot be allocated.
 */
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
{
        if (!dev->rx_cpu_rmap_auto) {
                dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
                if (!dev->rx_cpu_rmap)
                        return -ENOMEM;

                dev->rx_cpu_rmap_auto = true;
        }

        return 0;
}
EXPORT_SYMBOL(netif_enable_cpu_rmap);

/* Undo netif_enable_cpu_rmap(): put the device's rmap and clear the
 * bookkeeping fields. Does nothing unless the rmap is automatically managed.
 */
static void netif_del_cpu_rmap(struct net_device *dev)
{
        if (dev->rx_cpu_rmap_auto) {
                /* Release the rmap */
                cpu_rmap_put(dev->rx_cpu_rmap);
                dev->rx_cpu_rmap = NULL;
                dev->rx_cpu_rmap_auto = false;
        }
}

#else
/* Variant built when CONFIG_RFS_ACCEL is not set: there is nothing to
 * release, so this is an empty stub.
 */
static void netif_napi_affinity_release(struct kref *ref)
{
}

/* Variant built when CONFIG_RFS_ACCEL is not set: no rmap is allocated and
 * success (0) is always reported.
 */
int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
{
        return 0;
}
EXPORT_SYMBOL(netif_enable_cpu_rmap);

/* Variant built when CONFIG_RFS_ACCEL is not set: nothing to free. */
static void netif_del_cpu_rmap(struct net_device *dev)
{
}
#endif

/* Turn on automatic IRQ affinity handling for @dev.
 *
 * For every index below max(num_tx_queues, num_rx_queues), one CPU chosen by
 * cpumask_local_spread() relative to the device's NUMA node is added to the
 * affinity mask of the matching napi_config entry.
 */
void netif_set_affinity_auto(struct net_device *dev)
{
        unsigned int i, maxqs, numa;

        numa = dev_to_node(&dev->dev);
        maxqs = max(dev->num_tx_queues, dev->num_rx_queues);

        for (i = 0; i < maxqs; i++) {
                unsigned int cpu = cpumask_local_spread(i, numa);

                cpumask_set_cpu(cpu, &dev->napi_config[i].affinity_mask);
        }

        dev->irq_affinity_auto = true;
}
EXPORT_SYMBOL(netif_set_affinity_auto);

/* Bind @napi to interrupt @irq (a negative @irq means "no IRQ").
 *
 * Any affinity notifier registered for the previous IRQ is removed first.
 * A new notifier is installed only when @irq is non-negative and the device
 * uses an automatically managed CPU rmap and/or automatic IRQ affinity.
 * With CONFIG_RFS_ACCEL and an auto rmap, the NAPI is also added to the rmap.
 *
 * Failures are not reported to the caller: the function returns early (or
 * unwinds the rmap registration) and leaves NAPI_STATE_HAS_NOTIFIER clear.
 */
void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
{
        int rc;

        netdev_assert_locked_or_invisible(napi->dev);

        if (napi->irq == irq)
                return;

        /* Remove existing resources */
        if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
                irq_set_affinity_notifier(napi->irq, NULL);

        napi->irq = irq;
        /* No notifier is needed without an IRQ or without any auto feature. */
        if (irq < 0 ||
            (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
                return;

        /* Abort for buggy drivers */
        if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
                return;

#ifdef CONFIG_RFS_ACCEL
        if (napi->dev->rx_cpu_rmap_auto) {
                /* cpu_rmap_add() yields the slot index or a negative error. */
                rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
                if (rc < 0)
                        return;

                cpu_rmap_get(napi->dev->rx_cpu_rmap);
                napi->napi_rmap_idx = rc;
        }
#endif

        /* Use core IRQ notifier */
        napi->notify.notify = netif_napi_irq_notify;
        napi->notify.release = netif_napi_affinity_release;
        rc = irq_set_affinity_notifier(irq, &napi->notify);
        if (rc) {
                netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
                            rc);
                goto put_rmap;
        }

        set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
        return;

put_rmap:
        /* Undo the rmap registration done above and detach the callbacks. */
#ifdef CONFIG_RFS_ACCEL
        if (napi->dev->rx_cpu_rmap_auto) {
                napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
                cpu_rmap_put(napi->dev->rx_cpu_rmap);
                napi->napi_rmap_idx = -1;
        }
#endif
        napi->notify.notify = NULL;
        napi->notify.release = NULL;
}
EXPORT_SYMBOL(netif_napi_set_irq_locked);

/* Load the persistent per-NAPI settings from n->config into @n and hash the
 * instance. The caller must only use this when n->config is set.
 */
static void napi_restore_config(struct napi_struct *n)
{
        n->defer_hard_irqs = n->config->defer_hard_irqs;
        n->gro_flush_timeout = n->config->gro_flush_timeout;
        n->irq_suspend_timeout = n->config->irq_suspend_timeout;

        /* Re-apply the remembered affinity mask to the NAPI's IRQ. */
        if (n->dev->irq_affinity_auto &&
            test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
                irq_set_affinity(n->irq, &n->config->affinity_mask);

        /* Without a stored NAPI ID let napi_hash_add generate one and
         * remember it in the config; otherwise reuse the stored ID.
         */
        if (!n->config->napi_id) {
                napi_hash_add(n);
                n->config->napi_id = n->napi_id;
        } else {
                napi_hash_add_with_id(n, n->config->napi_id);
        }
}

/* Copy the tunables of @n back into n->config and remove the instance from
 * the NAPI hash. The caller must only use this when n->config is set.
 */
static void napi_save_config(struct napi_struct *n)
{
        n->config->irq_suspend_timeout = n->irq_suspend_timeout;
        n->config->gro_flush_timeout = n->gro_flush_timeout;
        n->config->defer_hard_irqs = n->defer_hard_irqs;

        napi_hash_del(n);
}

/* Netlink wants the NAPI list to be sorted by ID, so when the new NAPI will
 * inherit an ID from its config, try to link it at the matching position.
 * A NAPI without a stored ID is treated as having ID UINT_MAX.
 */
static void
netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
{
        struct list_head *higher = &dev->napi_list;
        unsigned int new_id = UINT_MAX;
        struct napi_struct *pos;
        unsigned int pos_id;

        if (napi->config && napi->config->napi_id)
                new_id = napi->config->napi_id;

        list_for_each_entry(pos, &dev->napi_list, dev_list) {
                /* Effective ID of this entry: its live ID if valid, else
                 * the one stored in its config, else UINT_MAX.
                 */
                pos_id = UINT_MAX;
                if (napi_id_valid(pos->napi_id))
                        pos_id = pos->napi_id;
                else if (pos->config)
                        pos_id = pos->config->napi_id;

                /* Stop at the first entry that does not rank above us. */
                if (pos_id <= new_id)
                        break;

                higher = &pos->dev_list;
        }

        /* list_add_rcu() links the new node right after @higher. */
        list_add_rcu(&napi->dev_list, higher);
}

/* Sanity check run at NAPI registration: napi_get_frags() must hand out
 * skbs whose skb->head is backed by slab, not by a page fragment.
 * This guards against a regression of the bug fixed in 3226b158e67c
 * ("net: avoid 32 x truesize under-estimation for tiny skbs").
 */
static void napi_get_frags_check(struct napi_struct *napi)
{
        struct sk_buff *probe;

        local_bh_disable();

        probe = napi_get_frags(napi);
        WARN_ON_ONCE(probe && probe->head_frag);

        /* Give the probe skb back. */
        napi_free_frags(napi);

        local_bh_enable();
}

/* Initialise @napi and register it with @dev; the netdev lock must be held.
 *
 * @dev:    device the NAPI belongs to.
 * @napi:   NAPI context to initialise.
 * @poll:   poll callback stored in napi->poll.
 * @weight: poll weight; values above NAPI_POLL_WEIGHT are accepted but
 *          reported once via netdev_err_once().
 *
 * The NAPI is left with NAPI_STATE_SCHED and NAPI_STATE_NPSVC set and with
 * no IRQ assigned (-1). Registering an already listed NAPI triggers a
 * warning and does nothing else.
 */
void netif_napi_add_weight_locked(struct net_device *dev,
                                  struct napi_struct *napi,
                                  int (*poll)(struct napi_struct *, int),
                                  int weight)
{
        netdev_assert_locked(dev);
        /* LISTED doubles as the "already added" guard. */
        if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
                return;

        INIT_LIST_HEAD(&napi->poll_list);
        INIT_HLIST_NODE(&napi->napi_hash_node);
        hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
        gro_init(&napi->gro);
        napi->skb = NULL;
        napi->poll = poll;
        if (weight > NAPI_POLL_WEIGHT)
                netdev_err_once(dev, "%s() called with weight %d\n", __func__,
                                weight);
        napi->weight = weight;
        napi->dev = dev;
#ifdef CONFIG_NETPOLL
        napi->poll_owner = -1;
#endif
        napi->list_owner = -1;
        /* These are the bits napi_enable_locked() clears later on. */
        set_bit(NAPI_STATE_SCHED, &napi->state);
        set_bit(NAPI_STATE_NPSVC, &napi->state);
        netif_napi_dev_list_add(dev, napi);

        /* default settings from sysfs are applied to all NAPIs. any per-NAPI
         * configuration will be loaded in napi_enable
         */
        napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
        napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));

        napi_get_frags_check(napi);
        /* Create kthread for this napi if dev->threaded is set.
         * Clear dev->threaded if kthread creation failed so that
         * threaded mode will not be enabled in napi_enable().
         */
        if (dev->threaded && napi_kthread_create(napi))
                dev->threaded = false;
        netif_napi_set_irq_locked(napi, -1);
}
EXPORT_SYMBOL(netif_napi_add_weight_locked);

/* Disable @n with the netdev lock already held. May sleep.
 *
 * Sets NAPI_STATE_DISABLE, waits until neither SCHED nor NPSVC is set, then
 * atomically sets both of them while clearing THREADED and PREFER_BUSY_POLL.
 * Afterwards the NAPI timer is cancelled, the config is saved (or the NAPI
 * is just unhashed when it has no config) and DISABLE is cleared again.
 */
void napi_disable_locked(struct napi_struct *n)
{
        unsigned long val, new;

        might_sleep();
        netdev_assert_locked(n->dev);

        set_bit(NAPI_STATE_DISABLE, &n->state);

        val = READ_ONCE(n->state);
        do {
                /* Sleep-poll until SCHED and NPSVC are both clear. */
                while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
                        usleep_range(20, 200);
                        val = READ_ONCE(n->state);
                }

                /* Claim SCHED and NPSVC ourselves so that the NAPI stays
                 * unschedulable until napi_enable_locked() clears them.
                 */
                new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
                new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
        } while (!try_cmpxchg(&n->state, &val, new));

        hrtimer_cancel(&n->timer);

        if (n->config)
                napi_save_config(n);
        else
                napi_hash_del(n);

        clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable_locked);

/**
 * napi_disable() - prevent NAPI from scheduling
 * @n: NAPI context
 *
 * Stop NAPI from being scheduled on this context and wait until any
 * outstanding processing completes.
 * Takes netdev_lock() for the associated net_device around the operation.
 */
void napi_disable(struct napi_struct *n)
{
        struct net_device *dev = n->dev;

        netdev_lock(dev);
        napi_disable_locked(n);
        netdev_unlock(dev);
}
EXPORT_SYMBOL(napi_disable);

/* Enable @n; callers in this file hold the netdev lock.
 *
 * Restores the per-NAPI config (or just hashes the NAPI when it has none),
 * then atomically clears SCHED and NPSVC in n->state, additionally setting
 * THREADED when the device is in threaded mode and the NAPI has a kthread.
 * BUGs if SCHED is not set, i.e. if the NAPI is not in the disabled state.
 */
void napi_enable_locked(struct napi_struct *n)
{
        /* The state word is sampled before the config restore below. */
        unsigned long new, val = READ_ONCE(n->state);

        if (n->config)
                napi_restore_config(n);
        else
                napi_hash_add(n);

        do {
                BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));

                new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
                if (n->dev->threaded && n->thread)
                        new |= NAPIF_STATE_THREADED;
        } while (!try_cmpxchg(&n->state, &val, new));
}
EXPORT_SYMBOL(napi_enable_locked);

/**
 * napi_enable() - enable NAPI scheduling
 * @n: NAPI context
 *
 * Enable scheduling of a NAPI instance.
 * Must be paired with napi_disable().
 * Takes netdev_lock() for the associated net_device around the operation.
 */
void napi_enable(struct napi_struct *n)
{
        struct net_device *dev = n->dev;

        netdev_lock(dev);
        napi_enable_locked(n);
        netdev_unlock(dev);
}
EXPORT_SYMBOL(napi_enable);

/* Unregister @napi from its device; the netdev lock must be held.
 * Must be called in process context.
 *
 * Does nothing when the NAPI is not listed. Otherwise removes the IRQ
 * affinity notifier (if any), detaches the config, unlinks the NAPI from the
 * device list, frees cached frag skbs and GRO state, and stops the NAPI
 * kthread if one exists.
 */
void __netif_napi_del_locked(struct napi_struct *napi)
{
        netdev_assert_locked(napi->dev);

        /* Clearing LISTED makes a repeated delete a no-op. */
        if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
                return;

        /* Make sure NAPI is disabled (or was never enabled). */
        WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));

        if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
                irq_set_affinity_notifier(napi->irq, NULL);

        if (napi->config) {
                napi->index = -1;
                napi->config = NULL;
        }

        list_del_rcu(&napi->dev_list);
        napi_free_frags(napi);

        gro_cleanup(&napi->gro);

        if (napi->thread) {
                kthread_stop(napi->thread);
                napi->thread = NULL;
        }
}
EXPORT_SYMBOL(__netif_napi_del_locked);

/* Run one poll round on @n with its configured weight.
 *
 * @n:      NAPI instance to poll.
 * @repoll: set to true when the instance used its whole weight and should be
 *          polled again by the caller; left untouched otherwise.
 *
 * Returns the amount of work reported by n->poll(), or 0 when the instance
 * was not scheduled and the callback was therefore not invoked.
 */
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
        int work, weight;

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidentally calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (napi_is_scheduled(n)) {
                work = n->poll(n, weight);
                trace_napi_poll(n, work, weight);

                xdp_do_check_flushed(n);
        }

        if (unlikely(work > weight))
                netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
                                n->poll, work, weight);

        /* Less than the full weight used: nothing more to do here. */
        if (likely(work < weight))
                return work;

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(napi_disable_pending(n))) {
                napi_complete(n);
                return work;
        }

        /* The NAPI context has more processing work, but busy-polling
         * is preferred. Exit early.
         */
        if (napi_prefer_busy_poll(n)) {
                if (napi_complete_done(n, work)) {
                        /* If timeout is not set, we need to make sure
                         * that the NAPI is re-scheduled.
                         */
                        napi_schedule(n);
                }
                return work;
        }

        /* Flush too old packets. If HZ < 1000, flush all packets */
        gro_flush(&n->gro, HZ >= 1000);
        gro_normal_list(&n->gro);

        /* Some drivers may have called napi_schedule
         * prior to exhausting their budget.
         */
        if (unlikely(!list_empty(&n->poll_list))) {
                pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
                             n->dev ? n->dev->name : "backlog");
                return work;
        }

        *repoll = true;

        return work;
}

/* Poll @n once on behalf of net_rx_action().
 *
 * The instance is unlinked from the list it is currently on, polled under
 * the netpoll poll lock, and appended to @repoll when __napi_poll() asks for
 * another round. Returns the work done by the poll.
 */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
        bool poll_again = false;
        void *lock_token;
        int done;

        list_del_init(&n->poll_list);

        lock_token = netpoll_poll_lock(n);

        done = __napi_poll(n, &poll_again);
        if (poll_again)
                list_add_tail(&n->poll_list, repoll);

        netpoll_poll_unlock(lock_token);

        return done;
}

/* Sleep in the NAPI kthread until there is work for @napi or the thread is
 * asked to stop.
 *
 * Returns 0 when NAPI_STATE_SCHED_THREADED is set (the caller should poll),
 * or -1 when kthread_should_stop() became true. The task state is
 * TASK_RUNNING on return in both cases.
 */
static int napi_thread_wait(struct napi_struct *napi)
{
        /* Go INTERRUPTIBLE before testing the conditions below. */
        set_current_state(TASK_INTERRUPTIBLE);

        while (!kthread_should_stop()) {
                /* Testing SCHED_THREADED bit here to make sure the current
                 * kthread owns this napi and could poll on this napi.
                 * Testing SCHED bit is not enough because SCHED bit might be
                 * set by some other busy poll thread or by napi_disable().
                 */
                if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
                        WARN_ON(!list_empty(&napi->poll_list));
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }

                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);

        return -1;
}

/* Poll @napi from its kthread until __napi_poll() no longer requests a
 * repoll.
 *
 * Each round runs with bottom halves disabled and a BPF net context
 * installed; sd->in_napi_threaded_poll is set for the duration of the poll
 * itself. Between rounds the thread reports an RCU quiescent state
 * periodically and offers to reschedule.
 */
static void napi_threaded_poll_loop(struct napi_struct *napi)
{
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct softnet_data *sd;
        unsigned long last_qs = jiffies;

        for (;;) {
                bool repoll = false;
                void *have;

                local_bh_disable();
                bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);

                /* Looked up again on every round, after BHs are disabled. */
                sd = this_cpu_ptr(&softnet_data);
                sd->in_napi_threaded_poll = true;

                have = netpoll_poll_lock(napi);
                __napi_poll(napi, &repoll);
                netpoll_poll_unlock(have);

                sd->in_napi_threaded_poll = false;
                /* Compiler barrier between clearing the flag and the RPS
                 * IPI check below.
                 */
                barrier();

                if (sd_has_rps_ipi_waiting(sd)) {
                        /* The helper is entered with IRQs off and, per its
                         * name, re-enables them.
                         */
                        local_irq_disable();
                        net_rps_action_and_irq_enable(sd);
                }
                skb_defer_free_flush(sd);
                bpf_net_ctx_clear(bpf_net_ctx);
                local_bh_enable();

                if (!repoll)
                        break;

                rcu_softirq_qs_periodic(last_qs);
                cond_resched();
        }
}

/* Thread function of a NAPI kthread; @data is the napi_struct it serves.
 * Alternates between waiting for work and polling until
 * napi_thread_wait() reports that the thread should stop. Always returns 0.
 */
static int napi_threaded_poll(void *data)
{
        struct napi_struct *napi = data;

        for (;;) {
                if (napi_thread_wait(napi))
                        break;
                napi_threaded_poll_loop(napi);
        }

        return 0;
}

/* RX processing routine for this CPU's softnet_data: polls the NAPI
 * instances queued on sd->poll_list until the list is drained, the packet
 * budget (netdev_budget) is used up, or the time limit
 * (netdev_budget_usecs) expires. Unfinished instances are put back on
 * sd->poll_list and NET_RX_SOFTIRQ is raised again.
 */
static __latent_entropy void net_rx_action(void)
{
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
        unsigned long time_limit = jiffies +
                usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        int budget = READ_ONCE(net_hotdata.netdev_budget);
        LIST_HEAD(list);
        LIST_HEAD(repoll);

        bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
start:
        sd->in_net_rx_action = true;
        /* Take the pending instances onto a private list with IRQs off. */
        local_irq_disable();
        list_splice_init(&sd->poll_list, &list);
        local_irq_enable();

        for (;;) {
                struct napi_struct *n;

                skb_defer_free_flush(sd);

                if (list_empty(&list)) {
                        if (list_empty(&repoll)) {
                                sd->in_net_rx_action = false;
                                barrier();
                                /* We need to check if ____napi_schedule()
                                 * had refilled poll_list while
                                 * sd->in_net_rx_action was true.
                                 */
                                if (!list_empty(&sd->poll_list))
                                        goto start;
                                /* Nothing left at all: skip the requeue and
                                 * RPS handling below.
                                 */
                                if (!sd_has_rps_ipi_waiting(sd))
                                        goto end;
                        }
                        break;
                }

                n = list_first_entry(&list, struct napi_struct, poll_list);
                budget -= napi_poll(n, &repoll);

                /* If softirq window is exhausted then punt.
                 * Allow this to run for 2 jiffies, which will allow
                 * an average latency of 1.5/HZ.
                 */
                if (unlikely(budget <= 0 ||
                             time_after_eq(jiffies, time_limit))) {
                        sd->time_squeeze++;
                        break;
                }
        }

        local_irq_disable();

        /* Rebuild sd->poll_list as: not-yet-polled entries, then entries
         * scheduled meanwhile, then entries that asked for a repoll.
         */
        list_splice_tail_init(&sd->poll_list, &list);
        list_splice_tail(&repoll, &list);
        list_splice(&list, &sd->poll_list);
        if (!list_empty(&sd->poll_list))
                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        else
                sd->in_net_rx_action = false;

        /* Entered with IRQs disabled; per its name the helper re-enables
         * them.
         */
        net_rps_action_and_irq_enable(sd);
end:
        bpf_net_ctx_clear(bpf_net_ctx);
}

/* One entry in a device's list of adjacent (upper or lower) devices. */
struct netdev_adjacent {
        /* the adjacent device this entry refers to */
        struct net_device *dev;
        /* NOTE(review): presumably tracks the reference held on @dev - confirm */
        netdevice_tracker dev_tracker;

        /* upper master flag, there can only be one master device per list */
        bool master;

        /* lookup ignore flag */
        bool ignore;

        /* counter for the number of times this device was added to us */
        u16 ref_nr;

        /* private field for the users */
        void *private;

        /* linkage into the owning adjacency list */
        struct list_head list;
        /* NOTE(review): presumably used for RCU-deferred freeing - confirm */
        struct rcu_head rcu;
};

static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
                                                 struct list_head *adj_list)
{
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, adj_list, list) {
                if (adj->dev == adj_dev)
                        return adj;
        }
        return NULL;
}

/* Walker callback: returns non-zero when the visited @upper_dev is the
 * device stored in priv->data.
 */
static int ____netdev_has_upper_dev(struct net_device *upper_dev,
                                    struct netdev_nested_priv *priv)
{
        struct net_device *wanted = priv->data;

        return wanted == upper_dev;
}

/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
                          struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = (void *)upper_dev,
        };

        ASSERT_RTNL();

        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                             &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev);

/**
 * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks the entire upper device chain.
 * The caller must hold rcu lock.
 */

bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
                                  struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = (void *)upper_dev,
        };

        /* Any non-zero walker result means the device was found. */
        return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
                                             &priv) != 0;
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);

/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Return true if the upper adjacency list of @dev is not empty, false
 * otherwise. The caller must hold the RTNL lock.
 */
bool netdev_has_any_upper_dev(struct net_device *dev)
{
        bool no_uppers;

        ASSERT_RTNL();

        no_uppers = list_empty(&dev->adj_list.upper);
        return !no_uppers;
}
EXPORT_SYMBOL(netdev_has_any_upper_dev);

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Look at the first entry of the upper adjacency list of @dev and return its
 * device if that entry is flagged as master; return NULL when the list is
 * empty or the first entry is not a master. The caller must hold the RTNL
 * lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
        struct netdev_adjacent *first;

        ASSERT_RTNL();

        if (!list_empty(&dev->adj_list.upper)) {
                first = list_first_entry(&dev->adj_list.upper,
                                         struct netdev_adjacent, list);
                if (likely(first->master))
                        return first->dev;
        }

        return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

/* Same lookup as netdev_master_upper_dev_get(), except that a master entry
 * whose ignore flag is set is treated as absent. Caller must hold RTNL.
 */
static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
{
        struct netdev_adjacent *first;

        ASSERT_RTNL();

        if (!list_empty(&dev->adj_list.upper)) {
                first = list_first_entry(&dev->adj_list.upper,
                                         struct netdev_adjacent, list);
                if (likely(first->master) && !first->ignore)
                        return first->dev;
        }

        return NULL;
}

/**
 * netdev_has_any_lower_dev - Check if device is linked to some device
 * @dev: device
 *
 * Return true if the lower adjacency list of @dev is not empty, false
 * otherwise. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_lower_dev(struct net_device *dev)
{
        bool no_lowers;

        ASSERT_RTNL();

        no_lowers = list_empty(&dev->adj_list.lower);
        return !no_lowers;
}

/* Return the ->private pointer of the netdev_adjacent entry that embeds the
 * list node @adj_list.
 */
void *netdev_adjacent_get_private(struct list_head *adj_list)
{
        return list_entry(adj_list, struct netdev_adjacent, list)->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);

/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Step @iter to the entry following the current position in the upper list
 * of @dev and return that entry's device, or NULL (leaving @iter untouched)
 * once the list head is reached. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
                                                 struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.upper;
        struct netdev_adjacent *entry;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

        entry = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        return entry->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);

/* Step @iter to the next entry of @dev's upper list. Returns that entry's
 * device and stores its ignore flag in *@ignore; returns NULL, without
 * touching @iter or @ignore, when the list head is reached.
 */
static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
                                                  struct list_head **iter,
                                                  bool *ignore)
{
        struct list_head *head = &dev->adj_list.upper;
        struct netdev_adjacent *entry;

        entry = list_entry((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        *ignore = entry->ignore;
        return entry->dev;
}

/* RCU flavour of the upper-list iterator: step @iter to the next entry of
 * @dev's upper list and return its device, or NULL at the list head.
 * Warns once if neither the RCU read lock nor RTNL is held.
 */
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
                                                    struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.upper;
        struct netdev_adjacent *entry;

        WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

        entry = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        return entry->dev;
}

/* Iterative depth-first walk over the upper devices of @dev, skipping
 * adjacency entries flagged as ignored. @fn is called for the current
 * device whenever it differs from @dev; this includes each time the walk
 * steps back to a device after finishing one of its upper branches. A
 * non-zero return from @fn aborts the walk and is returned; otherwise 0.
 */
static int __netdev_walk_all_upper_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.upper;
        struct net_device *cur_dev = dev;
        struct net_device *upper;
        int depth = 0;
        bool ignore;
        int err;

        for (;;) {
                if (cur_dev != dev) {
                        err = fn(cur_dev, priv);
                        if (err)
                                return err;
                }

                /* Advance to the next entry that is not flagged ignore. */
                do {
                        upper = __netdev_next_upper_dev(cur_dev, &pos,
                                                        &ignore);
                } while (upper && ignore);

                if (upper) {
                        /* Remember where to resume, then descend. */
                        dev_stack[depth] = cur_dev;
                        iter_stack[depth] = pos;
                        depth++;

                        cur_dev = upper;
                        pos = &upper->adj_list.upper;
                        continue;
                }

                if (!depth)
                        break;

                /* Branch exhausted: resume in the saved parent. */
                depth--;
                cur_dev = dev_stack[depth];
                pos = iter_stack[depth];
        }

        return 0;
}

/* Iterative depth-first walk over all upper devices of @dev using the RCU
 * list iterator. @fn is called for the current device whenever it differs
 * from @dev; this includes each time the walk steps back to a device after
 * finishing one of its upper branches. A non-zero return from @fn aborts
 * the walk and is returned; otherwise 0.
 */
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.upper;
        struct net_device *cur_dev = dev;
        struct net_device *upper;
        int depth = 0;
        int err;

        for (;;) {
                if (cur_dev != dev) {
                        err = fn(cur_dev, priv);
                        if (err)
                                return err;
                }

                upper = netdev_next_upper_dev_rcu(cur_dev, &pos);
                if (upper) {
                        /* Remember where to resume, then descend. */
                        dev_stack[depth] = cur_dev;
                        iter_stack[depth] = pos;
                        depth++;

                        cur_dev = upper;
                        pos = &upper->adj_list.upper;
                        continue;
                }

                if (!depth)
                        break;

                /* Branch exhausted: resume in the saved parent. */
                depth--;
                cur_dev = dev_stack[depth];
                pos = iter_stack[depth];
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);

/* Return true if @upper_dev is reachable from @dev through upper links that
 * are not flagged as ignored. Caller must hold RTNL.
 */
static bool __netdev_has_upper_dev(struct net_device *dev,
                                   struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = (void *)upper_dev,
        };
        int found;

        ASSERT_RTNL();

        found = __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
                                            &priv);
        return found != 0;
}

/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *                                   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold either hold the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
                                    struct list_head **iter)
{
        struct netdev_adjacent *lower;

        lower = list_entry(*iter, struct netdev_adjacent, list);

        if (&lower->list == &dev->adj_list.lower)
                return NULL;

        *iter = lower->list.next;

        return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *                                       lower neighbour list, RCU
 *                                       variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Step *@iter to the entry following the current position in @dev's lower
 * neighbour list and return that entry's ->private, or NULL (leaving *@iter
 * untouched) once the list head is reached. The caller must hold RCU read
 * lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
                                        struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.lower;
        struct netdev_adjacent *entry;

        WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());

        entry = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        return entry->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *                         list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
        struct netdev_adjacent *lower;

        lower = list_entry(*iter, struct netdev_adjacent, list);

        if (&lower->list == &dev->adj_list.lower)
                return NULL;

        *iter = lower->list.next;

        return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);

/* Step @iter to the next entry of @dev's lower list and return its device,
 * or NULL (leaving @iter untouched) when the list head is reached.
 */
static struct net_device *netdev_next_lower_dev(struct net_device *dev,
                                                struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.lower;
        struct netdev_adjacent *entry;

        entry = list_entry((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        return entry->dev;
}

/* Step @iter to the next entry of @dev's lower list. Returns that entry's
 * device and stores its ignore flag in *@ignore; returns NULL, without
 * touching @iter or @ignore, when the list head is reached.
 */
static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
                                                  struct list_head **iter,
                                                  bool *ignore)
{
        struct list_head *head = &dev->adj_list.lower;
        struct netdev_adjacent *entry;

        entry = list_entry((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        *ignore = entry->ignore;
        return entry->dev;
}

/* Iterative depth-first walk over all lower devices of @dev. @fn is called
 * for the current device whenever it differs from @dev; this includes each
 * time the walk steps back to a device after finishing one of its lower
 * branches. A non-zero return from @fn aborts the walk and is returned;
 * otherwise 0.
 */
int netdev_walk_all_lower_dev(struct net_device *dev,
                              int (*fn)(struct net_device *dev,
                                        struct netdev_nested_priv *priv),
                              struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *cur_dev = dev;
        struct net_device *lower;
        int depth = 0;
        int err;

        for (;;) {
                if (cur_dev != dev) {
                        err = fn(cur_dev, priv);
                        if (err)
                                return err;
                }

                lower = netdev_next_lower_dev(cur_dev, &pos);
                if (lower) {
                        /* Remember where to resume, then descend. */
                        dev_stack[depth] = cur_dev;
                        iter_stack[depth] = pos;
                        depth++;

                        cur_dev = lower;
                        pos = &lower->adj_list.lower;
                        continue;
                }

                if (!depth)
                        break;

                /* Branch exhausted: resume in the saved parent. */
                depth--;
                cur_dev = dev_stack[depth];
                pos = iter_stack[depth];
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);

/* Iterative depth-first walk over the lower devices of @dev, skipping
 * adjacency entries flagged as ignored. @fn is called for the current
 * device whenever it differs from @dev; this includes each time the walk
 * steps back to a device after finishing one of its lower branches. A
 * non-zero return from @fn aborts the walk and is returned; otherwise 0.
 */
static int __netdev_walk_all_lower_dev(struct net_device *dev,
                                       int (*fn)(struct net_device *dev,
                                         struct netdev_nested_priv *priv),
                                       struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *cur_dev = dev;
        struct net_device *lower;
        int depth = 0;
        bool ignore;
        int err;

        for (;;) {
                if (cur_dev != dev) {
                        err = fn(cur_dev, priv);
                        if (err)
                                return err;
                }

                /* Advance to the next entry that is not flagged ignore. */
                do {
                        lower = __netdev_next_lower_dev(cur_dev, &pos,
                                                        &ignore);
                } while (lower && ignore);

                if (lower) {
                        /* Remember where to resume, then descend. */
                        dev_stack[depth] = cur_dev;
                        iter_stack[depth] = pos;
                        depth++;

                        cur_dev = lower;
                        pos = &lower->adj_list.lower;
                        continue;
                }

                if (!depth)
                        break;

                /* Branch exhausted: resume in the saved parent. */
                depth--;
                cur_dev = dev_stack[depth];
                pos = iter_stack[depth];
        }

        return 0;
}

/* RCU flavour of the lower-list iterator: step @iter to the next entry of
 * @dev's lower list and return its device, or NULL (leaving @iter
 * untouched) when the list head is reached.
 */
struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
                                             struct list_head **iter)
{
        struct list_head *head = &dev->adj_list.lower;
        struct netdev_adjacent *entry;

        entry = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
        if (&entry->list == head)
                return NULL;

        *iter = &entry->list;
        return entry->dev;
}
EXPORT_SYMBOL(netdev_next_lower_dev_rcu);

static u8 __netdev_upper_depth(struct net_device *dev)
{
        struct net_device *udev;
        struct list_head *iter;
        u8 max_depth = 0;
        bool ignore;

        for (iter = &dev->adj_list.upper,
             udev = __netdev_next_upper_dev(dev, &iter, &ignore);
             udev;
             udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
                if (ignore)
                        continue;
                if (max_depth < udev->upper_level)
                        max_depth = udev->upper_level;
        }

        return max_depth;
}

static u8 __netdev_lower_depth(struct net_device *dev)
{
        struct net_device *ldev;
        struct list_head *iter;
        u8 max_depth = 0;
        bool ignore;

        for (iter = &dev->adj_list.lower,
             ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
             ldev;
             ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
                if (ignore)
                        continue;
                if (max_depth < ldev->lower_level)
                        max_depth = ldev->lower_level;
        }

        return max_depth;
}

static int __netdev_update_upper_level(struct net_device *dev,
                                       struct netdev_nested_priv *__unused)
{
        dev->upper_level = __netdev_upper_depth(dev) + 1;
        return 0;
}

#ifdef CONFIG_LOCKDEP
/* Devices queued by net_unlink_todo(); only built with CONFIG_LOCKDEP. */
static LIST_HEAD(net_unlink_list);

/* Append @dev to net_unlink_list unless its unlink_list node is already
 * linked into a list.
 */
static void net_unlink_todo(struct net_device *dev)
{
        if (!list_empty(&dev->unlink_list))
                return;

        list_add_tail(&dev->unlink_list, &net_unlink_list);
}
#endif

/* Walker callback: set @dev->lower_level to one more than the depth reported
 * by __netdev_lower_depth(). With CONFIG_LOCKDEP and a non-NULL @priv, the
 * flags additionally select whether nested_level is synced right away
 * (NESTED_SYNC_IMM) and whether @dev is queued via net_unlink_todo()
 * (NESTED_SYNC_TODO). Always returns 0.
 */
static int __netdev_update_lower_level(struct net_device *dev,
                                       struct netdev_nested_priv *priv)
{
        dev->lower_level = __netdev_lower_depth(dev) + 1;

#ifdef CONFIG_LOCKDEP
        if (priv) {
                if (priv->flags & NESTED_SYNC_IMM)
                        dev->nested_level = dev->lower_level - 1;
                if (priv->flags & NESTED_SYNC_TODO)
                        net_unlink_todo(dev);
        }
#endif
        return 0;
}

/* Iterative depth-first walk over all lower devices of @dev using the RCU
 * list iterator. @fn is called for the current device whenever it differs
 * from @dev; this includes each time the walk steps back to a device after
 * finishing one of its lower branches. A non-zero return from @fn aborts
 * the walk and is returned; otherwise 0.
 */
int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
                                  int (*fn)(struct net_device *dev,
                                            struct netdev_nested_priv *priv),
                                  struct netdev_nested_priv *priv)
{
        struct net_device *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *iter_stack[MAX_NEST_DEV + 1];
        struct list_head *pos = &dev->adj_list.lower;
        struct net_device *cur_dev = dev;
        struct net_device *lower;
        int depth = 0;
        int err;

        for (;;) {
                if (cur_dev != dev) {
                        err = fn(cur_dev, priv);
                        if (err)
                                return err;
                }

                lower = netdev_next_lower_dev_rcu(cur_dev, &pos);
                if (lower) {
                        /* Remember where to resume, then descend. */
                        dev_stack[depth] = cur_dev;
                        iter_stack[depth] = pos;
                        depth++;

                        cur_dev = lower;
                        pos = &lower->adj_list.lower;
                        continue;
                }

                if (!depth)
                        break;

                /* Branch exhausted: resume in the saved parent. */
                depth--;
                cur_dev = dev_stack[depth];
                pos = iter_stack[depth];
        }

        return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);

/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *                                       lower neighbour list, RCU
 *                                       variant
 * @dev: device
 *
 * Return the ->private of the first netdev_adjacent entry on @dev's lower
 * neighbour list, or NULL if the list is empty. The caller must hold RCU
 * read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
        struct netdev_adjacent *first;

        first = list_first_or_null_rcu(&dev->adj_list.lower,
                                       struct netdev_adjacent, list);

        return first ? first->private : NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Return the device of the first entry on @dev's upper list if that entry
 * is flagged as master; return NULL when the list is empty or the first
 * entry is not a master. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
        struct netdev_adjacent *first;

        first = list_first_or_null_rcu(&dev->adj_list.upper,
                                       struct netdev_adjacent, list);
        if (!first)
                return NULL;

        if (likely(first->master))
                return first->dev;

        return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);

/* Create a sysfs symlink from @dev's kobject to @adj_dev's kobject. The link
 * is named "upper_<name>" when @dev_list is @dev's upper list and
 * "lower_<name>" otherwise, where <name> is @adj_dev->name.
 *
 * Returns the result of sysfs_create_link().
 */
static int netdev_adjacent_sysfs_add(struct net_device *dev,
                              struct net_device *adj_dev,
                              struct list_head *dev_list)
{
        char linkname[IFNAMSIZ+7];

        /* Bound the write to the buffer instead of trusting the name length. */
        snprintf(linkname, sizeof(linkname),
                 dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s",
                 adj_dev->name);
        return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
                                 linkname);
}
/* Remove the "upper_<name>" / "lower_<name>" sysfs symlink from @dev's
 * kobject. The prefix is "upper_" when @dev_list is @dev's upper list and
 * "lower_" otherwise; @name supplies the <name> part.
 */
static void netdev_adjacent_sysfs_del(struct net_device *dev,
                               char *name,
                               struct list_head *dev_list)
{
        char linkname[IFNAMSIZ+7];

        /* @name is caller-supplied; bound the write to the buffer size. */
        snprintf(linkname, sizeof(linkname),
                 dev_list == &dev->adj_list.upper ? "upper_%s" : "lower_%s",
                 name);
        sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
                                                 struct net_device *adj_dev,
                                                 struct list_head *dev_list)
{
        return (dev_list == &dev->adj_list.upper ||
                dev_list == &dev->adj_list.lower) &&
                net_eq(dev_net(dev), dev_net(adj_dev));
}

/* Record @adj_dev as adjacent to @dev on @dev_list.
 *
 * If an entry for @adj_dev already exists on the list only its ref_nr is
 * bumped. Otherwise a new entry is allocated, a tracked reference on
 * @adj_dev is taken, the sysfs links are created, and the entry is added to
 * the list (at the head for a master, at the tail otherwise).
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error
 * returned by the sysfs link creation.
 */
static int __netdev_adjacent_dev_insert(struct net_device *dev,
                                        struct net_device *adj_dev,
                                        struct list_head *dev_list,
                                        void *private, bool master)
{
        struct netdev_adjacent *adj;
        int ret;

        adj = __netdev_find_adj(adj_dev, dev_list);

        /* Already adjacent: just count the additional reference. */
        if (adj) {
                adj->ref_nr += 1;
                pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
                         dev->name, adj_dev->name, adj->ref_nr);

                return 0;
        }

        adj = kmalloc(sizeof(*adj), GFP_KERNEL);
        if (!adj)
                return -ENOMEM;

        adj->dev = adj_dev;
        adj->master = master;
        adj->ref_nr = 1;
        adj->private = private;
        adj->ignore = false;
        /* Released by netdev_put() on the error path below or on removal. */
        netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);

        pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
                 dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);

        /* upper_<name>/lower_<name> links are only created for same-netns
         * neighbour lists.
         */
        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
                ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
                if (ret)
                        goto free_adj;
        }

        /* Ensure that master link is always the first item in list. */
        if (master) {
                ret = sysfs_create_link(&(dev->dev.kobj),
                                        &(adj_dev->dev.kobj), "master");
                if (ret)
                        goto remove_symlinks;

                list_add_rcu(&adj->list, dev_list);
        } else {
                list_add_tail_rcu(&adj->list, dev_list);
        }

        return 0;

        /* Unwind in reverse order of setup. */
remove_symlinks:
        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
        netdev_put(adj_dev, &adj->dev_tracker);
        kfree(adj);

        return ret;
}

/* Drop @ref_nr references from the adjacency of @adj_dev on @dev_list.
 *
 * If the entry holds more than @ref_nr references only the counter is
 * decreased. Otherwise the sysfs links are removed, the entry is unlinked
 * from the list, the reference on @adj_dev is released and the entry is
 * freed after an RCU grace period. A missing entry is reported with
 * pr_err() and WARN_ON().
 */
static void __netdev_adjacent_dev_remove(struct net_device *dev,
                                         struct net_device *adj_dev,
                                         u16 ref_nr,
                                         struct list_head *dev_list)
{
        struct netdev_adjacent *adj;

        pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
                 dev->name, adj_dev->name, ref_nr);

        adj = __netdev_find_adj(adj_dev, dev_list);

        if (!adj) {
                pr_err("Adjacency does not exist for device %s from %s\n",
                       dev->name, adj_dev->name);
                WARN_ON(1);
                return;
        }

        /* Other references remain: keep the entry, only adjust the count. */
        if (adj->ref_nr > ref_nr) {
                pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
                         dev->name, adj_dev->name, ref_nr,
                         adj->ref_nr - ref_nr);
                adj->ref_nr -= ref_nr;
                return;
        }

        if (adj->master)
                sysfs_remove_link(&(dev->dev.kobj), "master");

        if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
                netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

        list_del_rcu(&adj->list);
        pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
                 adj_dev->name, dev->name, adj_dev->name);
        netdev_put(adj_dev, &adj->dev_tracker);
        /* The entry was unlinked with list_del_rcu(); defer the free. */
        kfree_rcu(adj, rcu);
}

/* Insert the adjacency in both directions: @upper_dev on @up_list (as
 * master if requested) and @dev on @down_list (never as master). If the
 * second insert fails the first one is undone. Returns 0 or the error of
 * the failing insert.
 */
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
                                            struct net_device *upper_dev,
                                            struct list_head *up_list,
                                            struct list_head *down_list,
                                            void *private, bool master)
{
        int err;

        err = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
                                           private, master);
        if (err)
                return err;

        err = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
                                           private, false);
        if (err)
                __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);

        return err;
}

/* Drop @ref_nr references from both directions of the adjacency: the entry
 * for @upper_dev on @up_list and the entry for @dev on @down_list.
 */
static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
                                               struct net_device *upper_dev,
                                               u16 ref_nr,
                                               struct list_head *up_list,
                                               struct list_head *down_list)
{
        __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
        __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}

/* Link @dev and @upper_dev as direct neighbours, using @dev's upper list
 * and @upper_dev's lower list. Returns 0 or a negative errno from
 * __netdev_adjacent_dev_link_lists().
 */
static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
                                                struct net_device *upper_dev,
                                                void *private, bool master)
{
        return __netdev_adjacent_dev_link_lists(dev, upper_dev,
                                                &dev->adj_list.upper,
                                                &upper_dev->adj_list.lower,
                                                private, master);
}

/* Drop one reference from the direct-neighbour adjacency between @dev and
 * @upper_dev (@dev's upper list and @upper_dev's lower list).
 */
static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
                                                   struct net_device *upper_dev)
{
        __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
                                           &dev->adj_list.upper,
                                           &upper_dev->adj_list.lower);
}

/* Link @upper_dev as an upper device of @dev.
 *
 * Validates the request, sends NETDEV_PRECHANGEUPPER, creates the
 * adjacency, sends NETDEV_CHANGEUPPER and finally refreshes the
 * upper_level/lower_level values of the affected devices. If the
 * NETDEV_CHANGEUPPER notification returns an error the adjacency is removed
 * again.
 *
 * Returns 0 on success, or:
 *   -EBUSY   @dev == @upper_dev, the link would create a loop, or @dev
 *            already has a different master
 *   -EMLINK  the combined nesting depth would exceed MAX_NEST_DEV
 *   -EEXIST  the link (or the same master link) already exists
 *   another negative errno from a notifier or from the adjacency insert.
 *
 * Caller must hold RTNL.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
                                   struct net_device *upper_dev, bool master,
                                   void *upper_priv, void *upper_info,
                                   struct netdev_nested_priv *priv,
                                   struct netlink_ext_ack *extack)
{
        struct netdev_notifier_changeupper_info changeupper_info = {
                .info = {
                        .dev = dev,
                        .extack = extack,
                },
                .upper_dev = upper_dev,
                .master = master,
                .linking = true,
                .upper_info = upper_info,
        };
        struct net_device *master_dev;
        int ret = 0;

        ASSERT_RTNL();

        if (dev == upper_dev)
                return -EBUSY;

        /* To prevent loops, check if dev is not upper device to upper_dev. */
        if (__netdev_has_upper_dev(upper_dev, dev))
                return -EBUSY;

        /* Refuse links that would nest deeper than MAX_NEST_DEV. */
        if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
                return -EMLINK;

        if (!master) {
                if (__netdev_has_upper_dev(dev, upper_dev))
                        return -EEXIST;
        } else {
                /* Only one master per device. */
                master_dev = __netdev_master_upper_dev_get(dev);
                if (master_dev)
                        return master_dev == upper_dev ? -EEXIST : -EBUSY;
        }

        /* Give notifier listeners a chance to veto before anything changes. */
        ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
                                            &changeupper_info.info);
        ret = notifier_to_errno(ret);
        if (ret)
                return ret;

        ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
                                                   master);
        if (ret)
                return ret;

        ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
                                            &changeupper_info.info);
        ret = notifier_to_errno(ret);
        if (ret)
                goto rollback;

        /* Recompute nesting levels for dev and everything below it, then for
         * upper_dev and everything above it.
         */
        __netdev_update_upper_level(dev, NULL);
        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);

        __netdev_update_lower_level(upper_dev, priv);
        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
                                    priv);

        return 0;

rollback:
        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        return ret;
}

/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @extack: netlink extended ack
 *
 * Adds a non-master link from @dev to @upper_dev, which becomes an upper
 * device of @dev. The caller must hold the RTNL lock. On a failure a
 * negative errno code is returned. On success the reference counts are
 * adjusted and the function returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
                          struct net_device *upper_dev,
                          struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .data = NULL,
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
        };
        int err;

        err = __netdev_upper_dev_link(dev, upper_dev, false,
                                      NULL, NULL, &priv, extack);
        return err;
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 * @upper_priv: upper device private
 * @upper_info: upper info to be passed down via notifier
 * @extack: netlink extended ack
 *
 * Adds a master link from @dev to @upper_dev, which becomes an upper device
 * of @dev. Only one master upper device can be linked, although other
 * non-master devices might be linked as well. The caller must hold the RTNL
 * lock. On a failure a negative errno code is returned. On success the
 * reference counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
                                 struct net_device *upper_dev,
                                 void *upper_priv, void *upper_info,
                                 struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .data = NULL,
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
        };
        int err;

        err = __netdev_upper_dev_link(dev, upper_dev, true,
                                      upper_priv, upper_info, &priv, extack);
        return err;
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

/* Remove the link between @dev and its upper device @upper_dev.
 *
 * Sends NETDEV_PRECHANGEUPPER, drops one reference from the adjacency,
 * sends NETDEV_CHANGEUPPER and then refreshes the upper_level/lower_level
 * values of the affected devices. The return values of the notifier calls
 * are not checked here. Caller must hold RTNL.
 */
static void __netdev_upper_dev_unlink(struct net_device *dev,
                                      struct net_device *upper_dev,
                                      struct netdev_nested_priv *priv)
{
        struct netdev_notifier_changeupper_info changeupper_info = {
                .info = {
                        .dev = dev,
                },
                .upper_dev = upper_dev,
                .linking = false,
        };

        ASSERT_RTNL();

        /* Evaluated before the adjacency is removed below. */
        changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;

        call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
                                      &changeupper_info.info);

        __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

        call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
                                      &changeupper_info.info);

        /* Recompute nesting levels for dev and everything below it, then for
         * upper_dev and everything above it.
         */
        __netdev_update_upper_level(dev, NULL);
        __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);

        __netdev_update_lower_level(upper_dev, priv);
        __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
                                    priv);
}

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: upper device to unlink from @dev
 *
 * Removes the link from @dev to @upper_dev, a device which is upper to
 * @dev. The caller must hold the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
                             struct net_device *upper_dev)
{
        struct netdev_nested_priv priv = {
                .data = NULL,
                .flags = NESTED_SYNC_TODO,
        };

        __netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

/*
 * Write @val into the 'ignore' field of both halves of the
 * @upper_dev <-> @lower_dev adjacency: the entry for @lower_dev in
 * @upper_dev's lower list and the entry for @upper_dev in @lower_dev's
 * upper list. A half that cannot be found is skipped.
 */
static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
                                      struct net_device *lower_dev,
                                      bool val)
{
        struct netdev_adjacent *in_lower_list;
        struct netdev_adjacent *in_upper_list;

        in_lower_list = __netdev_find_adj(lower_dev,
                                          &upper_dev->adj_list.lower);
        if (in_lower_list != NULL)
                in_lower_list->ignore = val;

        in_upper_list = __netdev_find_adj(upper_dev,
                                          &lower_dev->adj_list.upper);
        if (in_upper_list != NULL)
                in_upper_list->ignore = val;
}

static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
                                        struct net_device *lower_dev)
{
        __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
}

static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
                                       struct net_device *lower_dev)
{
        __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
}

int netdev_adjacent_change_prepare(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev,
                                   struct netlink_ext_ack *extack)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = NULL,
        };
        int err;

        if (!new_dev)
                return 0;

        if (old_dev && new_dev != old_dev)
                netdev_adjacent_dev_disable(dev, old_dev);
        err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
                                      extack);
        if (err) {
                if (old_dev && new_dev != old_dev)
                        netdev_adjacent_dev_enable(dev, old_dev);
                return err;
        }

        return 0;
}
EXPORT_SYMBOL(netdev_adjacent_change_prepare);

void netdev_adjacent_change_commit(struct net_device *old_dev,
                                   struct net_device *new_dev,
                                   struct net_device *dev)
{
        struct netdev_nested_priv priv = {
                .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
                .data = NULL,
        };

        if (!new_dev || !old_dev)
                return;

        if (new_dev == old_dev)
                return;

        netdev_adjacent_dev_enable(dev, old_dev);
        __netdev_upper_dev_unlink(old_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_commit);

void netdev_adjacent_change_abort(struct net_device *old_dev,
                                  struct net_device *new_dev,
                                  struct net_device *dev)
{
        struct netdev_nested_priv priv = {
                .flags = 0,
                .data = NULL,
        };

        if (!new_dev)
                return;

        if (old_dev && new_dev != old_dev)
                netdev_adjacent_dev_enable(dev, old_dev);

        __netdev_upper_dev_unlink(new_dev, dev, &priv);
}
EXPORT_SYMBOL(netdev_adjacent_change_abort);

/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
                                struct netdev_bonding_info *bonding_info)
{
        struct netdev_notifier_bonding_info info = {
                .info.dev = dev,
        };

        memcpy(&info.bonding_info, bonding_info,
               sizeof(struct netdev_bonding_info));
        call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
                                      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);

static int netdev_offload_xstats_enable_l3(struct net_device *dev,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
        };
        int err;
        int rc;

        dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
                                         GFP_KERNEL);
        if (!dev->offload_xstats_l3)
                return -ENOMEM;

        rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
                                                  NETDEV_OFFLOAD_XSTATS_DISABLE,
                                                  &info.info);
        err = notifier_to_errno(rc);
        if (err)
                goto free_stats;

        return 0;

free_stats:
        kfree(dev->offload_xstats_l3);
        dev->offload_xstats_l3 = NULL;
        return err;
}

/**
 * netdev_offload_xstats_enable - enable one type of offload xstats
 * @dev: device
 * @type: offload xstats type to enable
 * @extack: netlink extended ack
 *
 * Asserts that the RTNL lock is held.
 *
 * Return: -EALREADY if @type is already enabled on @dev, -EINVAL (after a
 * warning) for a type not handled here, otherwise the result of the
 * type-specific enable helper.
 */
int netdev_offload_xstats_enable(struct net_device *dev,
                                 enum netdev_offload_xstats_type type,
                                 struct netlink_ext_ack *extack)
{
        int err;

        ASSERT_RTNL();

        if (netdev_offload_xstats_enabled(dev, type))
                return -EALREADY;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                err = netdev_offload_xstats_enable_l3(dev, extack);
                return err;
        }

        /* Reached only for a type value without a case above. */
        WARN_ON(1);
        return -EINVAL;
}
EXPORT_SYMBOL(netdev_offload_xstats_enable);

static void netdev_offload_xstats_disable_l3(struct net_device *dev)
{
        struct netdev_notifier_offload_xstats_info info = {
                .info.dev = dev,
                .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
        };

        call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
                                      &info.info);
        kfree(dev->offload_xstats_l3);
        dev->offload_xstats_l3 = NULL;
}

int netdev_offload_xstats_disable(struct net_device *dev,
                                  enum netdev_offload_xstats_type type)
{
        ASSERT_RTNL();

        if (!netdev_offload_xstats_enabled(dev, type))
                return -EALREADY;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                netdev_offload_xstats_disable_l3(dev);
                return 0;
        }

        WARN_ON(1);
        return -EINVAL;
}
EXPORT_SYMBOL(netdev_offload_xstats_disable);

static void netdev_offload_xstats_disable_all(struct net_device *dev)
{
        netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
}

/*
 * Map an offload xstats @type to the stats storage pointer kept in @dev.
 * Returns that pointer as stored (NULL when nothing is allocated), or warns
 * and returns NULL for a type without a case below.
 */
static struct rtnl_hw_stats64 *
netdev_offload_xstats_get_ptr(const struct net_device *dev,
                              enum netdev_offload_xstats_type type)
{
        struct rtnl_hw_stats64 *stats;

        switch (type) {
        case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
                stats = dev->offload_xstats_l3;
                return stats;
        }

        WARN_ON(1);
        return NULL;
}

bool netdev_offload_xstats_enabled(const struct net_device *dev,
                                   enum netdev_offload_xstats_type type)
{
        ASSERT_RTNL();

        return netdev_offload_xstats_get_ptr(dev, type);
}
EXPORT_SYMBOL(netdev_offload_xstats_enabled);

/*
 * "Report used" payload for NETDEV_OFFLOAD_XSTATS_REPORT_USED.
 * Zero-initialized by netdev_offload_xstats_get_used();
 * netdev_offload_xstats_report_used() sets @used to true.
 */
struct netdev_notifier_offload_xstats_ru {
        bool used;
};

/*
 * "Report delta" payload for NETDEV_OFFLOAD_XSTATS_REPORT_DELTA.
 * Zero-initialized by netdev_offload_xstats_get_stats();
 * netdev_offload_xstats_report_delta() adds a delta into @stats and sets
 * @used to true.
 */
struct netdev_notifier_offload_xstats_rd {
        struct rtnl_hw_stats64 stats;
        bool used;
};

/* Add every counter of @src onto the matching counter of @dest. */
static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
                                  const struct rtnl_hw_stats64 *src)
{
        /* Packet and byte counts. */
        dest->rx_packets += src->rx_packets;
        dest->tx_packets += src->tx_packets;
        dest->rx_bytes += src->rx_bytes;
        dest->tx_bytes += src->tx_bytes;

        /* Error and drop counts. */
        dest->rx_errors += src->rx_errors;
        dest->tx_errors += src->tx_errors;
        dest->rx_dropped += src->rx_dropped;
        dest->tx_dropped += src->tx_dropped;

        dest->multicast += src->multicast;
}

/*
 * Send NETDEV_OFFLOAD_XSTATS_REPORT_USED for @type on @dev and store in
 * @p_used whether a listener reported the stats as used. Warns if @type is
 * not enabled on @dev but still sends the notification. @p_used is written
 * even when the notifier call reports an error.
 *
 * Returns the errno derived from the notifier result.
 */
static int netdev_offload_xstats_get_used(struct net_device *dev,
                                          enum netdev_offload_xstats_type type,
                                          bool *p_used,
                                          struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_ru usage = {};
        struct netdev_notifier_offload_xstats_info info = {
                .info = {
                        .dev = dev,
                        .extack = extack,
                },
                .type = type,
                .report_used = &usage,
        };
        int notifier_rc;

        WARN_ON(!netdev_offload_xstats_enabled(dev, type));

        notifier_rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
                                                    &info.info);
        *p_used = usage.used;

        return notifier_to_errno(notifier_rc);
}

/*
 * Send NETDEV_OFFLOAD_XSTATS_REPORT_DELTA for @type on @dev, fold the delta
 * that was reported into the stats cached in @dev, and hand the results back.
 *
 * @p_stats, when non-NULL, receives a copy of the updated cached stats.
 * @p_used receives whether a listener reported a delta.
 *
 * Returns -EINVAL (after a warning) if @dev has no storage for @type,
 * otherwise the errno derived from the notifier result.
 */
static int netdev_offload_xstats_get_stats(struct net_device *dev,
                                           enum netdev_offload_xstats_type type,
                                           struct rtnl_hw_stats64 *p_stats,
                                           bool *p_used,
                                           struct netlink_ext_ack *extack)
{
        struct netdev_notifier_offload_xstats_rd delta = {};
        struct netdev_notifier_offload_xstats_info info = {
                .info = {
                        .dev = dev,
                        .extack = extack,
                },
                .type = type,
                .report_delta = &delta,
        };
        struct rtnl_hw_stats64 *cached;
        int notifier_rc;

        cached = netdev_offload_xstats_get_ptr(dev, type);
        if (WARN_ON(!cached))
                return -EINVAL;

        notifier_rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
                                                    &info.info);

        /* Keep whatever was reported even on error, so that deltas from
         * listeners that did succeed are not lost.
         */
        netdev_hw_stats64_add(cached, &delta.stats);

        if (p_stats)
                *p_stats = *cached;
        *p_used = delta.used;

        return notifier_to_errno(notifier_rc);
}

/**
 * netdev_offload_xstats_get - query offload xstats or just their usage
 * @dev: device
 * @type: offload xstats type to query
 * @p_stats: where to store the stats, or NULL to query usage only
 * @p_used: where to store whether the stats are reported as used
 * @extack: netlink extended ack
 *
 * Asserts that the RTNL lock is held.
 *
 * Return: result of the stats query when @p_stats is set, otherwise the
 * result of the usage-only query.
 */
int netdev_offload_xstats_get(struct net_device *dev,
                              enum netdev_offload_xstats_type type,
                              struct rtnl_hw_stats64 *p_stats, bool *p_used,
                              struct netlink_ext_ack *extack)
{
        ASSERT_RTNL();

        /* No stats buffer: the caller only wants the "used" indication. */
        if (!p_stats)
                return netdev_offload_xstats_get_used(dev, type, p_used,
                                                      extack);

        return netdev_offload_xstats_get_stats(dev, type, p_stats, p_used,
                                               extack);
}
EXPORT_SYMBOL(netdev_offload_xstats_get);

/**
 * netdev_offload_xstats_report_delta - add a stats delta to a report
 * @report_delta: report being filled in
 * @stats: delta to add
 *
 * Adds @stats onto the stats accumulated in @report_delta and marks the
 * report as used.
 */
void
netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
                                   const struct rtnl_hw_stats64 *stats)
{
        struct rtnl_hw_stats64 *total = &report_delta->stats;

        report_delta->used = true;
        netdev_hw_stats64_add(total, stats);
}
EXPORT_SYMBOL(netdev_offload_xstats_report_delta);

void
netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
{
        report_used->used = true;
}
EXPORT_SYMBOL(netdev_offload_xstats_report_used);

/**
 * netdev_offload_xstats_push_delta - add a delta to a device's cached stats
 * @dev: device
 * @type: offload xstats type the delta belongs to
 * @p_stats: delta to add
 *
 * Adds @p_stats onto the stats cached in @dev for @type. Warns and does
 * nothing if @dev has no storage for @type. Asserts that the RTNL lock is
 * held.
 */
void netdev_offload_xstats_push_delta(struct net_device *dev,
                                      enum netdev_offload_xstats_type type,
                                      const struct rtnl_hw_stats64 *p_stats)
{
        struct rtnl_hw_stats64 *cached;

        ASSERT_RTNL();

        cached = netdev_offload_xstats_get_ptr(dev, type);
        if (WARN_ON(!cached))
                return;

        netdev_hw_stats64_add(cached, p_stats);
}
EXPORT_SYMBOL(netdev_offload_xstats_push_delta);

/**
 * netdev_get_xmit_slave - Get the xmit slave of master device
 * @dev: device
 * @skb: The packet
 * @all_slaves: assume all the slaves are active
 *
 * Asks the device's ndo_get_xmit_slave callback, if it has one.
 * The reference counters are not incremented so the caller must be
 * careful with locks. The caller must hold RCU lock.
 * %NULL is returned if the device has no such callback or no slave is found.
 */

struct net_device *netdev_get_xmit_slave(struct net_device *dev,
                                         struct sk_buff *skb,
                                         bool all_slaves)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_get_xmit_slave)
                return ops->ndo_get_xmit_slave(dev, skb, all_slaves);

        return NULL;
}
EXPORT_SYMBOL(netdev_get_xmit_slave);

/*
 * Ask @dev's ndo_sk_get_lower_dev callback for the lower device to use for
 * @sk. Returns NULL when the device does not implement the callback,
 * otherwise whatever the callback returns.
 */
static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
                                                  struct sock *sk)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (ops->ndo_sk_get_lower_dev)
                return ops->ndo_sk_get_lower_dev(dev, sk);

        return NULL;
}

/**
 * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
 * @dev: device
 * @sk: the socket
 *
 * Follows the chain of lower devices for @sk, starting at @dev, until a
 * device reports no further lower device.
 *
 * Return: the last device reached; this is @dev itself when @dev has no
 * lower device for @sk.
 */

struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
                                            struct sock *sk)
{
        struct net_device *next;

        for (;;) {
                next = netdev_sk_get_lower_dev(dev, sk);
                if (!next)
                        break;
                dev = next;
        }

        return dev;
}
EXPORT_SYMBOL(netdev_sk_get_lowest_dev);

/*
 * Add the adjacency sysfs entries between @dev and each of its upper and
 * lower neighbours, in both directions. Neighbours living in a different
 * network namespace than @dev are skipped.
 */
static void netdev_adjacent_add_links(struct net_device *dev)
{
        struct net *own_net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                if (net_eq(own_net, dev_net(adj->dev))) {
                        netdev_adjacent_sysfs_add(adj->dev, dev,
                                                  &adj->dev->adj_list.lower);
                        netdev_adjacent_sysfs_add(dev, adj->dev,
                                                  &dev->adj_list.upper);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                if (net_eq(own_net, dev_net(adj->dev))) {
                        netdev_adjacent_sysfs_add(adj->dev, dev,
                                                  &adj->dev->adj_list.upper);
                        netdev_adjacent_sysfs_add(dev, adj->dev,
                                                  &dev->adj_list.lower);
                }
        }
}

/*
 * Remove the adjacency sysfs entries between @dev and each of its upper and
 * lower neighbours, in both directions. Neighbours living in a different
 * network namespace than @dev are skipped.
 */
static void netdev_adjacent_del_links(struct net_device *dev)
{
        struct net *own_net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                if (net_eq(own_net, dev_net(adj->dev))) {
                        netdev_adjacent_sysfs_del(adj->dev, dev->name,
                                                  &adj->dev->adj_list.lower);
                        netdev_adjacent_sysfs_del(dev, adj->dev->name,
                                                  &dev->adj_list.upper);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                if (net_eq(own_net, dev_net(adj->dev))) {
                        netdev_adjacent_sysfs_del(adj->dev, dev->name,
                                                  &adj->dev->adj_list.upper);
                        netdev_adjacent_sysfs_del(dev, adj->dev->name,
                                                  &dev->adj_list.lower);
                }
        }
}

/**
 * netdev_adjacent_rename_links - refresh neighbours' sysfs links after rename
 * @dev: device
 * @oldname: name used for the existing links to @dev
 *
 * For every upper and lower neighbour of @dev in the same network namespace,
 * deletes the neighbour's sysfs entry named @oldname and adds a new entry
 * for @dev. Only the neighbours' entries are touched, not @dev's own.
 */
void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
        struct net *own_net = dev_net(dev);
        struct netdev_adjacent *adj;

        list_for_each_entry(adj, &dev->adj_list.upper, list) {
                if (net_eq(own_net, dev_net(adj->dev))) {
                        netdev_adjacent_sysfs_del(adj->dev, oldname,
                                                  &adj->dev->adj_list.lower);
                        netdev_adjacent_sysfs_add(adj->dev, dev,
                                                  &adj->dev->adj_list.lower);
                }
        }

        list_for_each_entry(adj, &dev->adj_list.lower, list) {
                if (net_eq(own_net, dev_net(adj->dev))) {
                        netdev_adjacent_sysfs_del(adj->dev, oldname,
                                                  &adj->dev->adj_list.upper);
                        netdev_adjacent_sysfs_add(adj->dev, dev,
                                                  &adj->dev->adj_list.upper);
                }
        }
}

/**
 * netdev_lower_dev_get_private - get the private pointer of a lower link
 * @dev: upper device
 * @lower_dev: lower device, may be NULL
 *
 * Return: the private pointer stored in the entry for @lower_dev in @dev's
 * lower adjacency list, or NULL if @lower_dev is NULL or has no such entry.
 */
void *netdev_lower_dev_get_private(struct net_device *dev,
                                   struct net_device *lower_dev)
{
        struct netdev_adjacent *entry;

        if (!lower_dev)
                return NULL;

        entry = __netdev_find_adj(lower_dev, &dev->adj_list.lower);

        return entry ? entry->private : NULL;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);


/**
 * netdev_lower_state_changed - Dispatch event about lower device state change
 * @lower_dev: device
 * @lower_state_info: state to dispatch
 *
 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_lower_state_changed(struct net_device *lower_dev,
                                void *lower_state_info)
{
        struct netdev_notifier_changelowerstate_info changelowerstate_info = {
                .info.dev = lower_dev,
        };

        ASSERT_RTNL();
        changelowerstate_info.lower_state_info = lower_state_info;
        call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
                                      &changelowerstate_info.info);
}
EXPORT_SYMBOL(netdev_lower_state_changed);

/*
 * Call the device's ndo_change_rx_flags callback with @flags, if the device
 * implements it.
 */
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_rx_flags)
                return;

        ops->ndo_change_rx_flags(dev, flags);
}

/*
 * Adjust the promiscuity count of @dev by @inc and keep IFF_PROMISC in
 * dev->flags in step with it: the flag is set while the count is non-zero
 * and cleared when it reaches zero.
 *
 * When the flag actually changes, the change is logged, an audit record is
 * emitted if auditing is enabled, and the device is told through
 * dev_change_rx_flags(). When @notify is true, __dev_notify_flags() is
 * called with IFF_PROMISC as the changed-flags argument whether or not the
 * flag changed.
 *
 * Asserts that the RTNL lock is held.
 *
 * Returns 0 on success, or -EOVERFLOW if a positive @inc would wrap the
 * counter to zero; in that case nothing is modified.
 */
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
        unsigned int old_flags = dev->flags;
        unsigned int promiscuity, flags;
        kuid_t uid;
        kgid_t gid;

        ASSERT_RTNL();

        promiscuity = dev->promiscuity + inc;
        if (promiscuity == 0) {
                /*
                 * Avoid overflow.
                 * A zero result after a positive increment means the
                 * unsigned counter wrapped: leave promiscuity untouched
                 * and return an error.
                 */
                if (unlikely(inc > 0)) {
                        netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
                        return -EOVERFLOW;
                }
                flags = old_flags & ~IFF_PROMISC;
        } else {
                flags = old_flags | IFF_PROMISC;
        }
        WRITE_ONCE(dev->promiscuity, promiscuity);
        if (flags != old_flags) {
                WRITE_ONCE(dev->flags, flags);
                netdev_info(dev, "%s promiscuous mode\n",
                            dev->flags & IFF_PROMISC ? "entered" : "left");
                if (audit_enabled) {
                        /* Record who caused the change of promiscuous mode. */
                        current_uid_gid(&uid, &gid);
                        audit_log(audit_context(), GFP_ATOMIC,
                                  AUDIT_ANOM_PROMISCUOUS,
                                  "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
                                  dev->name, (dev->flags & IFF_PROMISC),
                                  (old_flags & IFF_PROMISC),
                                  from_kuid(&init_user_ns, audit_get_loginuid(current)),
                                  from_kuid(&init_user_ns, uid),
                                  from_kgid(&init_user_ns, gid),
                                  audit_get_sessionid(current));
                }

                dev_change_rx_flags(dev, IFF_PROMISC);
        }
        /* Not conditional on the flag having changed. */
        if (notify)
                __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
        return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device. The RX mode is
 * reprogrammed when the device flags changed as a result.
 *
 * Return: 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
        unsigned int flags_before = dev->flags;
        int err;

        err = __dev_set_promiscuity(dev, inc, true);
        if (err < 0)
                return err;

        /* Push the new filtering state to the device if anything changed. */
        if (flags_before != dev->flags)
                dev_set_rx_mode(dev);

        return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);

/**
 * netif_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 * @notify: whether to send a flags notification when IFF_ALLMULTI changes
 *
 * Adjusts the allmulti count by @inc. IFF_ALLMULTI is set in dev->flags
 * while the count is non-zero and cleared when it reaches zero. When the
 * flag changes, the change is logged, the device is told through
 * dev_change_rx_flags() and its RX mode is reprogrammed.
 * Asserts that the RTNL lock is held.
 *
 * Return: 0 on success, -EOVERFLOW if a positive @inc would wrap the
 * counter to zero (nothing is modified in that case).
 */
int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
        unsigned int flags_before = dev->flags;
        unsigned int gflags_before = dev->gflags;
        unsigned int count, flags_after;

        ASSERT_RTNL();

        count = dev->allmulti + inc;
        if (count != 0) {
                flags_after = flags_before | IFF_ALLMULTI;
        } else if (unlikely(inc > 0)) {
                /*
                 * Avoid overflow: zero after a positive increment means the
                 * counter wrapped. Leave allmulti untouched and fail.
                 */
                netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
                return -EOVERFLOW;
        } else {
                flags_after = flags_before & ~IFF_ALLMULTI;
        }

        WRITE_ONCE(dev->allmulti, count);
        if (flags_after == flags_before)
                return 0;

        WRITE_ONCE(dev->flags, flags_after);
        netdev_info(dev, "%s allmulticast mode\n",
                    dev->flags & IFF_ALLMULTI ? "entered" : "left");
        dev_change_rx_flags(dev, IFF_ALLMULTI);
        dev_set_rx_mode(dev);

        if (notify)
                __dev_notify_flags(dev, flags_before,
                                   dev->gflags ^ gflags_before, 0, NULL);

        return 0;
}

/*
 *        Upload unicast and multicast address lists to device and
 *        configure RX filtering. When the device doesn't support unicast
 *        filtering it is put in promiscuous mode while unicast addresses
 *        are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        /* dev_open will call this function so the list will stay sane. */
        if (!(dev->flags&IFF_UP))
                return;

        if (!netif_device_present(dev))
                return;

        if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
                /* Unicast addresses changes may only happen under the rtnl,
                 * therefore calling __dev_set_promiscuity here is safe.
                 */
                if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
                        __dev_set_promiscuity(dev, 1, false);
                        dev->uc_promisc = true;
                } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
                        __dev_set_promiscuity(dev, -1, false);
                        dev->uc_promisc = false;
                }
        }

        if (ops->ndo_set_rx_mode)
                ops->ndo_set_rx_mode(dev);
}

/*
 * Locked wrapper around __dev_set_rx_mode(): takes the device address lock
 * (BH variant) for the duration of the call.
 */
void dev_set_rx_mode(struct net_device *dev)
{
        netif_addr_lock_bh(dev);

        __dev_set_rx_mode(dev);

        netif_addr_unlock_bh(dev);
}

/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 * IFF_PROMISC and IFF_ALLMULTI are taken from dev->gflags rather than
 * dev->flags; IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are derived from
 * the device state, and only while the device is running.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
        /* Bits of dev->flags that are not reported as stored there. */
        const unsigned int masked = IFF_PROMISC | IFF_ALLMULTI | IFF_RUNNING |
                                    IFF_LOWER_UP | IFF_DORMANT;
        /* Bits reported from dev->gflags instead. */
        const unsigned int from_gflags = IFF_PROMISC | IFF_ALLMULTI;
        unsigned int flags;

        flags = READ_ONCE(dev->flags) & ~masked;
        flags |= READ_ONCE(dev->gflags) & from_gflags;

        if (!netif_running(dev))
                return flags;

        if (netif_oper_up(dev))
                flags |= IFF_RUNNING;
        if (netif_carrier_ok(dev))
                flags |= IFF_LOWER_UP;
        if (netif_dormant(dev))
                flags |= IFF_DORMANT;

        return flags;
}
EXPORT_SYMBOL(dev_get_flags);

/*
 * Apply the flags requested in @flags to @dev without sending flag-change
 * notifications from this function itself.
 *
 * Steps, in order:
 *  - the directly settable bits are copied from @flags into dev->flags,
 *    while IFF_UP, IFF_VOLATILE, IFF_PROMISC and IFF_ALLMULTI keep their
 *    current values;
 *  - the RX mode is reprogrammed;
 *  - a change of IFF_UP closes or opens the device;
 *  - IFF_PROMISC and IFF_ALLMULTI are synchronized with dev->gflags through
 *    the promiscuity/allmulti counters.
 *
 * Asserts that the RTNL lock is held.
 *
 * Returns 0, or the result of __dev_open() when the device was brought up.
 * Failures of the promiscuity/allmulti updates are not reported.
 */
int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack)
{
        unsigned int old_flags = dev->flags;
        int ret;

        ASSERT_RTNL();

        /*
         * Set the flags on our device: take the settable bits from the
         * request and preserve the ones managed elsewhere.
         */

        dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
                               IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
                               IFF_AUTOMEDIA)) |
                     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
                                    IFF_ALLMULTI));

        /*
         * Load in the correct multicast list now the flags have changed.
         */

        if ((old_flags ^ flags) & IFF_MULTICAST)
                dev_change_rx_flags(dev, IFF_MULTICAST);

        dev_set_rx_mode(dev);

        /*
         * Have we downed the interface. We handle IFF_UP ourselves
         * according to user attempts to set it, rather than blindly
         * setting it.
         */

        ret = 0;
        if ((old_flags ^ flags) & IFF_UP) {
                if (old_flags & IFF_UP)
                        __dev_close(dev);
                else
                        ret = __dev_open(dev, extack);
        }

        /* The request differs from gflags: toggle the bit, adjust the count. */
        if ((flags ^ dev->gflags) & IFF_PROMISC) {
                int inc = (flags & IFF_PROMISC) ? 1 : -1;
                old_flags = dev->flags;

                dev->gflags ^= IFF_PROMISC;

                /* Reprogram RX filtering only if dev->flags really changed. */
                if (__dev_set_promiscuity(dev, inc, false) >= 0)
                        if (dev->flags != old_flags)
                                dev_set_rx_mode(dev);
        }

        /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
         * is important. Some (broken) drivers set IFF_PROMISC, when
         * IFF_ALLMULTI is requested not asking us and not reporting.
         */
        if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
                int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

                dev->gflags ^= IFF_ALLMULTI;
                netif_set_allmulti(dev, inc, false);
        }

        return ret;
}

/**
 * __dev_notify_flags - send notifications for a device flags change
 * @dev: device
 * @old_flags: value of dev->flags before the change
 * @gchanges: changed flags to report in the RTM_NEWLINK message; no message
 *            is sent when zero
 * @portid: passed through to rtmsg_ifinfo()
 * @nlh: passed through to rtmsg_ifinfo()
 *
 * Sends, in this order: an RTM_NEWLINK message if @gchanges is non-zero,
 * NETDEV_UP or NETDEV_DOWN if IFF_UP changed, and NETDEV_CHANGE if the
 * device is up and any flag other than IFF_UP, IFF_PROMISC, IFF_ALLMULTI
 * or the IFF_VOLATILE set changed.
 */
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
                        unsigned int gchanges, u32 portid,
                        const struct nlmsghdr *nlh)
{
        /* Flags not reported through NETDEV_CHANGE. */
        const unsigned int silent = IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
                                    IFF_VOLATILE;
        unsigned int delta = dev->flags ^ old_flags;

        if (gchanges)
                rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);

        if (delta & IFF_UP) {
                if (dev->flags & IFF_UP)
                        call_netdevice_notifiers(NETDEV_UP, dev);
                else
                        call_netdevice_notifiers(NETDEV_DOWN, dev);
        }

        if (!(dev->flags & IFF_UP))
                return;

        if (delta & ~silent) {
                struct netdev_notifier_change_info change_info = {
                        .info = {
                                .dev = dev,
                        },
                        .flags_changed = delta,
                };

                call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
        }
}

/**
 * netif_change_flags - change device flags and notify about it
 * @dev: device
 * @flags: requested flags
 * @extack: netlink extended ack
 *
 * Applies @flags with __dev_change_flags() and, unless that failed, sends
 * the notifications for everything that changed in dev->flags or
 * dev->gflags.
 *
 * Return: the result of __dev_change_flags().
 */
int netif_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack)
{
        unsigned int flags_before = dev->flags;
        unsigned int gflags_before = dev->gflags;
        unsigned int changed;
        int ret;

        ret = __dev_change_flags(dev, flags, extack);
        if (ret < 0)
                return ret;

        changed = flags_before ^ dev->flags;
        changed |= gflags_before ^ dev->gflags;
        __dev_notify_flags(dev, flags_before, changed, 0, NULL);

        return ret;
}

/**
 * __dev_set_mtu - set the MTU without validation or notification
 * @dev: device
 * @new_mtu: new MTU value
 *
 * Delegates to the device's ndo_change_mtu callback when it has one;
 * otherwise stores @new_mtu in dev->mtu directly.
 *
 * Return: the callback's result, or 0 when dev->mtu was written directly.
 */
int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
        const struct net_device_ops *ops = dev->netdev_ops;

        if (!ops->ndo_change_mtu) {
                /* Pairs with all the lockless reads of dev->mtu in the stack */
                WRITE_ONCE(dev->mtu, new_mtu);
                return 0;
        }

        return ops->ndo_change_mtu(dev, new_mtu);
}
EXPORT_SYMBOL(__dev_set_mtu);

int dev_validate_mtu(struct net_device *dev, int new_mtu,
                     struct netlink_ext_ack *extack)
{
        /* MTU must be positive, and in range */
        if (new_mtu < 0 || new_mtu < dev->min_mtu) {
                NL_SET_ERR_MSG(extack, "mtu less than device minimum");
                return -EINVAL;
        }

        if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
                NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
                return -EINVAL;
        }
        return 0;
}

/**
 * netif_set_mtu_ext - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 * @extack: netlink extended ack
 *
 * Change the maximum transfer size of the network device. The value is
 * validated, NETDEV_PRECHANGEMTU listeners may veto it, and after the
 * change NETDEV_CHANGEMTU is sent. If a NETDEV_CHANGEMTU listener fails,
 * the previous MTU is restored and NETDEV_CHANGEMTU is sent again.
 *
 * Return: 0 on success or if the MTU is already @new_mtu, -ENODEV if the
 * device is not present, otherwise the error from validation, a notifier
 * or __dev_set_mtu().
 */
int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
                      struct netlink_ext_ack *extack)
{
        int orig_mtu;
        int err;

        if (new_mtu == dev->mtu)
                return 0;

        err = dev_validate_mtu(dev, new_mtu, extack);
        if (err)
                return err;

        if (!netif_device_present(dev))
                return -ENODEV;

        err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
        err = notifier_to_errno(err);
        if (err)
                return err;

        orig_mtu = dev->mtu;
        err = __dev_set_mtu(dev, new_mtu);
        if (err)
                return err;

        err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, orig_mtu);
        err = notifier_to_errno(err);
        if (!err)
                return 0;

        /* setting mtu back and notifying everyone again,
         * so that they have a chance to revert changes.
         */
        __dev_set_mtu(dev, orig_mtu);
        call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, new_mtu);

        return err;
}

/**
 * netif_set_mtu - change the MTU, logging the reason on failure
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Calls netif_set_mtu_ext() with a local extended ack; if that fails and
 * left a message, the message is logged (rate limited) with the device
 * name.
 *
 * Return: the result of netif_set_mtu_ext().
 */
int netif_set_mtu(struct net_device *dev, int new_mtu)
{
        struct netlink_ext_ack extack;
        int err;

        memset(&extack, 0, sizeof(extack));

        err = netif_set_mtu_ext(dev, new_mtu, &extack);
        if (!err)
                return 0;

        if (extack._msg)
                net_err_ratelimited("%s: %s\n", dev->name, extack._msg);

        return err;
}
EXPORT_SYMBOL(netif_set_mtu);

/**
 * netif_change_tx_queue_len - change the TX queue length of a device
 * @dev: device
 * @new_len: new TX queue length
 *
 * Stores @new_len in dev->tx_queue_len, sends NETDEV_CHANGE_TX_QUEUE_LEN
 * and calls dev_qdisc_change_tx_queue_len(). If the notifier or the qdisc
 * update fails, an error is logged and the previous length is restored.
 *
 * Return: 0 on success or if the length is unchanged, -ERANGE if @new_len
 * does not fit in an unsigned int, otherwise the error from the notifier
 * or from dev_qdisc_change_tx_queue_len().
 */
int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
        unsigned int orig_len = dev->tx_queue_len;
        int res;

        /* Reject values that would be truncated by the unsigned int field. */
        if (new_len != (unsigned int)new_len)
                return -ERANGE;

        if (new_len == orig_len)
                return 0;

        WRITE_ONCE(dev->tx_queue_len, new_len);

        res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
        res = notifier_to_errno(res);
        if (!res)
                res = dev_qdisc_change_tx_queue_len(dev);
        if (!res)
                return 0;

        /* Somebody refused the change: roll back to the old length. */
        netdev_err(dev, "refused to change device tx_queue_len\n");
        WRITE_ONCE(dev->tx_queue_len, orig_len);

        return res;
}

/**
 * netif_set_group - set the group of a device
 * @dev: device
 * @new_group: value to store in dev->group
 *
 * Plain assignment; no validation or notification is done here.
 */
void netif_set_group(struct net_device *dev, int new_group)
{
        int group = new_group;

        dev->group = group;
}

/**
 *        dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
 *        @dev: device
 *        @addr: new address
 *        @extack: netlink extended ack
 */
int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
                              struct netlink_ext_ack *extack)
{
        struct netdev_notifier_pre_changeaddr_info info = {
                .info.dev = dev,
                .info.extack = extack,
                .dev_addr = addr,
        };
        int rc;

        rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
        return notifier_to_errno(rc);
}
EXPORT_SYMBOL(dev_pre_changeaddr_notify);

/**
 *        netif_set_mac_address - change a device's hardware address
 *        @dev: device
 *        @sa: new address; sa_family must equal the device type
 *        @extack: netlink extended ack
 *
 *        Runs the pre-change notifiers, calls the driver only when the
 *        requested address differs from the current one, then marks the
 *        address as NET_ADDR_SET, sends NETDEV_CHANGEADDR and feeds the
 *        address into the randomness pool.
 *
 *        Return: 0 on success, -EOPNOTSUPP without ndo_set_mac_address,
 *        -EINVAL on family mismatch, -ENODEV when the device is not
 *        present, or the error from the notifiers / driver.
 */
int netif_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                          struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ndo = dev->netdev_ops;
        int ret;

        if (!ndo->ndo_set_mac_address)
                return -EOPNOTSUPP;

        if (dev->type != sa->sa_family)
                return -EINVAL;

        if (!netif_device_present(dev))
                return -ENODEV;

        ret = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
        if (ret)
                return ret;

        /* the driver is only involved when the address really changes */
        if (memcmp(dev->dev_addr, sa->sa_data, dev->addr_len) != 0) {
                ret = ndo->ndo_set_mac_address(dev, sa);
                if (ret)
                        return ret;
        }

        dev->addr_assign_type = NET_ADDR_SET;
        call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
        add_device_randomness(dev->dev_addr, dev->addr_len);

        return 0;
}

/* Held for read by dev_get_mac_address() while a device address is copied out. */
DECLARE_RWSEM(dev_addr_sem);

/**
 *        dev_get_mac_address - copy out the hardware address of a named device
 *        @sa: destination; sa_data and sa_family are filled in
 *        @net: namespace to look the device up in
 *        @dev_name: device name
 *
 *        The lookup and copy happen with dev_addr_sem held for read and
 *        inside an RCU read-side section. At most sizeof(sa->sa_data_min)
 *        bytes are written to sa_data: zeroes when the device has no
 *        address, otherwise the leading bytes of the device address.
 *
 *        Return: 0 on success, -ENODEV when no such device exists.
 */
int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
{
        const size_t room = sizeof(sa->sa_data_min);
        struct net_device *dev;
        int err = -ENODEV;

        down_read(&dev_addr_sem);
        rcu_read_lock();

        dev = dev_get_by_name_rcu(net, dev_name);
        if (dev) {
                if (dev->addr_len)
                        memcpy(sa->sa_data, dev->dev_addr,
                               min_t(size_t, room, dev->addr_len));
                else
                        memset(sa->sa_data, 0, room);
                sa->sa_family = dev->type;
                err = 0;
        }

        rcu_read_unlock();
        up_read(&dev_addr_sem);
        return err;
}
EXPORT_SYMBOL(dev_get_mac_address);

/**
 *        netif_change_carrier - ask the driver to change the carrier state
 *        @dev: device
 *        @new_carrier: requested carrier state
 *
 *        Return: -EOPNOTSUPP when the driver has no ndo_change_carrier,
 *        -ENODEV when the device is not present, otherwise the driver's
 *        return value.
 */
int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
        const struct net_device_ops *ndo = dev->netdev_ops;

        if (!ndo->ndo_change_carrier)
                return -EOPNOTSUPP;

        if (netif_device_present(dev))
                return ndo->ndo_change_carrier(dev, new_carrier);

        return -ENODEV;
}

/**
 *        dev_get_phys_port_id - Get device physical port ID
 *        @dev: device
 *        @ppid: port ID
 *
 *        Forwards the request to the driver's ndo_get_phys_port_id.
 *
 *        Return: the driver's result, or -EOPNOTSUPP when the driver does
 *        not implement the callback.
 */
int dev_get_phys_port_id(struct net_device *dev,
                         struct netdev_phys_item_id *ppid)
{
        const struct net_device_ops *ndo = dev->netdev_ops;

        if (ndo->ndo_get_phys_port_id)
                return ndo->ndo_get_phys_port_id(dev, ppid);

        return -EOPNOTSUPP;
}

/**
 *        dev_get_phys_port_name - Get device physical port name
 *        @dev: device
 *        @name: port name
 *        @len: limit of bytes to copy to name
 *
 *        The driver's ndo_get_phys_port_name is tried first. When it is
 *        missing or answers -EOPNOTSUPP, the devlink compat helper is
 *        used instead.
 *
 *        Return: result of whichever source answered.
 */
int dev_get_phys_port_name(struct net_device *dev,
                           char *name, size_t len)
{
        const struct net_device_ops *ndo = dev->netdev_ops;
        int ret = -EOPNOTSUPP;

        if (ndo->ndo_get_phys_port_name)
                ret = ndo->ndo_get_phys_port_name(dev, name, len);

        if (ret == -EOPNOTSUPP)
                ret = devlink_compat_phys_port_name_get(dev, name, len);

        return ret;
}

/**
 *        dev_get_port_parent_id - Get the device's port parent identifier
 *        @dev: network device
 *        @ppid: pointer to a storage for the port's parent identifier
 *        @recurse: allow/disallow recursion to lower devices
 *
 *        Get the device's port parent identifier. The driver callback is
 *        asked first, then the devlink compat helper. If both answer
 *        -EOPNOTSUPP and @recurse is set, every lower device is queried
 *        recursively and all of them must report the same identifier.
 *
 *        Return: 0 on success, -EOPNOTSUPP when no identifier is available
 *        or the lower devices disagree, or another error from a callback.
 */
int dev_get_port_parent_id(struct net_device *dev,
                           struct netdev_phys_item_id *ppid,
                           bool recurse)
{
        const struct net_device_ops *ndo = dev->netdev_ops;
        struct netdev_phys_item_id first = { };
        struct net_device *lower_dev;
        struct list_head *iter;
        int ret = -EOPNOTSUPP;

        if (ndo->ndo_get_port_parent_id)
                ret = ndo->ndo_get_port_parent_id(dev, ppid);
        if (ret != -EOPNOTSUPP)
                return ret;

        ret = devlink_compat_switch_id_get(dev, ppid);
        if (ret != -EOPNOTSUPP || !recurse)
                return ret;

        netdev_for_each_lower_dev(dev, lower_dev, iter) {
                ret = dev_get_port_parent_id(lower_dev, ppid, true);
                if (ret)
                        return ret;

                /* remember the first answer, compare all later ones to it */
                if (!first.id_len) {
                        first = *ppid;
                        continue;
                }
                if (memcmp(&first, ppid, sizeof(*ppid)))
                        return -EOPNOTSUPP;
        }

        return ret;
}
EXPORT_SYMBOL(dev_get_port_parent_id);

/**
 *        netdev_port_same_parent_id - Indicate if two network devices have
 *        the same port parent identifier
 *        @a: first network device
 *        @b: second network device
 */
bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
{
        struct netdev_phys_item_id a_id = { };
        struct netdev_phys_item_id b_id = { };

        if (dev_get_port_parent_id(a, &a_id, true) ||
            dev_get_port_parent_id(b, &b_id, true))
                return false;

        return netdev_phys_item_id_same(&a_id, &b_id);
}
EXPORT_SYMBOL(netdev_port_same_parent_id);

/**
 *        netif_change_proto_down - set or clear the proto_down state
 *        @dev: device
 *        @proto_down: new proto_down state
 *
 *        Turns the carrier off when @proto_down is set and on when it is
 *        cleared, then records the new state in @dev->proto_down.
 *
 *        Return: 0 on success, -EOPNOTSUPP when the device does not allow
 *        changing proto_down, -ENODEV when the device is not present.
 */
int netif_change_proto_down(struct net_device *dev, bool proto_down)
{
        if (!dev->change_proto_down)
                return -EOPNOTSUPP;

        if (!netif_device_present(dev))
                return -ENODEV;

        if (!proto_down)
                netif_carrier_on(dev);
        else
                netif_carrier_off(dev);

        WRITE_ONCE(dev->proto_down, proto_down);

        return 0;
}

/**
 *        netdev_change_proto_down_reason_locked - proto down reason
 *
 *        @dev: device
 *        @mask: proto down mask
 *        @value: proto down value
 *
 *        With a zero @mask the stored reason is replaced by @value.
 *        Otherwise only the bits selected by the low 32 bits of @mask are
 *        taken from @value; every other bit of the stored reason keeps its
 *        current state.
 */
void netdev_change_proto_down_reason_locked(struct net_device *dev,
                                            unsigned long mask, u32 value)
{
        u32 proto_down_reason;

        if (!mask) {
                proto_down_reason = value;
        } else {
                /* The reason is a u32, so only bits 0..31 of @mask apply.
                 * Merging with an unsigned mask avoids evaluating the
                 * signed expression (1 << 31) once per bit.
                 */
                u32 bits = mask;

                proto_down_reason = (dev->proto_down_reason & ~bits) |
                                    (value & bits);
        }
        WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
}

/* XDP flavour of a BPF link. The callbacks in this file recover it from the
 * embedded &struct bpf_link with container_of().
 */
struct bpf_xdp_link {
        struct bpf_link link;   /* generic link; link.prog is the attached program */
        struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
        int flags;              /* attach flags; set from link_create.flags at creation */
};

/* Map XDP attach @flags to a mode. An explicit HW, DRV or SKB flag wins (in
 * that priority); without any of them the mode is DRV when the driver
 * provides ndo_bpf and SKB otherwise.
 */
static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
{
        enum bpf_xdp_mode mode;

        if (flags & XDP_FLAGS_HW_MODE)
                mode = XDP_MODE_HW;
        else if (flags & XDP_FLAGS_DRV_MODE)
                mode = XDP_MODE_DRV;
        else if (flags & XDP_FLAGS_SKB_MODE)
                mode = XDP_MODE_SKB;
        else if (dev->netdev_ops->ndo_bpf)
                mode = XDP_MODE_DRV;
        else
                mode = XDP_MODE_SKB;

        return mode;
}

/* Pick the install handler for @mode: generic_xdp_install for SKB mode, the
 * driver's ndo_bpf (which may be NULL) for DRV and HW mode, NULL for any
 * other value.
 */
static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
{
        if (mode == XDP_MODE_SKB)
                return generic_xdp_install;

        if (mode == XDP_MODE_DRV || mode == XDP_MODE_HW)
                return dev->netdev_ops->ndo_bpf;

        return NULL;
}

/* Return the link pointer stored in @dev->xdp_state[@mode]; callers in this
 * file treat NULL as "no link attached for this mode".
 */
static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
                                         enum bpf_xdp_mode mode)
{
        return dev->xdp_state[mode].link;
}

/* Return the effective program for @mode: the program of the attached link
 * when there is one, otherwise the directly attached program (may be NULL).
 */
static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
                                     enum bpf_xdp_mode mode)
{
        struct bpf_xdp_link *xdp_link = dev_xdp_link(dev, mode);

        return xdp_link ? xdp_link->link.prog : dev->xdp_state[mode].prog;
}

/**
 *        dev_xdp_prog_count - count XDP modes that have something attached
 *        @dev: device
 *
 *        Return: the number of modes whose xdp_state slot holds either a
 *        program or a link.
 */
u8 dev_xdp_prog_count(struct net_device *dev)
{
        u8 nr = 0;
        int mode;

        for (mode = 0; mode < __MAX_XDP_MODE; mode++) {
                if (!dev->xdp_state[mode].prog && !dev->xdp_state[mode].link)
                        continue;
                nr++;
        }

        return nr;
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);

/**
 *        dev_xdp_sb_prog_count - count attached programs without frags support
 *        @dev: device
 *
 *        Only the prog pointer of each xdp_state slot is inspected.
 *
 *        Return: the number of modes whose program has aux->xdp_has_frags
 *        unset.
 */
u8 dev_xdp_sb_prog_count(struct net_device *dev)
{
        u8 nr = 0;
        int mode;

        for (mode = 0; mode < __MAX_XDP_MODE; mode++) {
                if (!dev->xdp_state[mode].prog)
                        continue;
                if (dev->xdp_state[mode].prog->aux->xdp_has_frags)
                        continue;
                nr++;
        }

        return nr;
}

/**
 *        netif_xdp_propagate - pass an XDP command on to a device's driver
 *        @dev: device that should receive the command
 *        @bpf: command to forward; its extack receives error messages
 *
 *        Return: -EOPNOTSUPP when the driver has no ndo_bpf; -EBUSY when
 *        tcp-data-split is enabled and a program without frags support is
 *        being set up, or when dev_get_min_mp_channel_count() is non-zero;
 *        otherwise the result of the driver's ndo_bpf.
 */
int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
{
        if (!dev->netdev_ops->ndo_bpf)
                return -EOPNOTSUPP;

        if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
            bpf->command == XDP_SETUP_PROG) {
                /* only a program lacking frags support is refused */
                if (bpf->prog && !bpf->prog->aux->xdp_has_frags) {
                        NL_SET_ERR_MSG(bpf->extack,
                                       "unable to propagate XDP to device using tcp-data-split");
                        return -EBUSY;
                }
        }

        if (dev_get_min_mp_channel_count(dev) != 0) {
                NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
                return -EBUSY;
        }

        return dev->netdev_ops->ndo_bpf(dev, bpf);
}

/**
 *        dev_xdp_prog_id - id of the effective XDP program for a mode
 *        @dev: device
 *        @mode: XDP mode to query
 *
 *        Return: aux->id of the program attached for @mode, or 0 when no
 *        program is attached.
 */
u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
{
        struct bpf_prog *attached = dev_xdp_prog(dev, mode);

        if (!attached)
                return 0;

        return attached->aux->id;
}

/* Record @link (may be NULL) in the @mode slot and clear the slot's prog
 * pointer, so a slot never holds both at once.
 */
static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
                             struct bpf_xdp_link *link)
{
        dev->xdp_state[mode].link = link;
        dev->xdp_state[mode].prog = NULL;
}

/* Record @prog (may be NULL) in the @mode slot and clear the slot's link
 * pointer, so a slot never holds both at once.
 */
static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
                             struct bpf_prog *prog)
{
        dev->xdp_state[mode].link = NULL;
        dev->xdp_state[mode].prog = prog;
}

/* Hand @prog (NULL to remove) to @bpf_op for @mode on @dev.
 *
 * The xdp_state bookkeeping is NOT updated here; callers do that with
 * dev_xdp_set_link()/dev_xdp_set_prog() afterwards.
 *
 * Return: 0 on success; -EBUSY when tcp-data-split is enabled and @prog has
 * no frags support, or when dev_get_min_mp_channel_count() is non-zero;
 * otherwise the error returned by @bpf_op.
 */
static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
                           bpf_op_t bpf_op, struct netlink_ext_ack *extack,
                           u32 flags, struct bpf_prog *prog)
{
        struct netdev_bpf xdp;
        int err;

        netdev_ops_assert_locked(dev);

        if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
            prog && !prog->aux->xdp_has_frags) {
                NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
                return -EBUSY;
        }

        if (dev_get_min_mp_channel_count(dev)) {
                NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
                return -EBUSY;
        }

        /* build the command passed to the driver / generic handler */
        memset(&xdp, 0, sizeof(xdp));
        xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
        xdp.extack = extack;
        xdp.flags = flags;
        xdp.prog = prog;

        /* Drivers assume refcnt is already incremented (i.e, prog pointer is
         * "moved" into driver), so they don't increment it on their own, but
         * they do decrement refcnt when program is detached or replaced.
         * Given net_device also owns link/prog, we need to bump refcnt here
         * to prevent drivers from underflowing it.
         */
        if (prog)
                bpf_prog_inc(prog);
        err = bpf_op(dev, &xdp);
        if (err) {
                /* the handler did not take the program: undo the bump above */
                if (prog)
                        bpf_prog_put(prog);
                return err;
        }

        /* dev_xdp_prog() still returns the previous program at this point */
        if (mode != XDP_MODE_HW)
                bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);

        return 0;
}

/* Remove whatever XDP program or link is recorded for every mode of @dev.
 * Must be called with RTNL held. A failure of the removal in the underlying
 * handler only triggers a WARN; the bookkeeping is cleared regardless.
 */
static void dev_xdp_uninstall(struct net_device *dev)
{
        struct bpf_xdp_link *link;
        struct bpf_prog *prog;
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;

        ASSERT_RTNL();

        for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
                /* nothing attached for this mode */
                prog = dev_xdp_prog(dev, mode);
                if (!prog)
                        continue;

                bpf_op = dev_xdp_bpf_op(dev, mode);
                if (!bpf_op)
                        continue;

                /* installing a NULL program detaches the current one */
                WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));

                /* auto-detach link from net device */
                link = dev_xdp_link(dev, mode);
                if (link)
                        link->dev = NULL;
                else
                        bpf_prog_put(prog);

                /* clears both the link and the prog pointer of the slot */
                dev_xdp_set_link(dev, mode, NULL);
        }
}

/* Attach, replace or detach an XDP program on @dev.
 *
 * @dev: target device
 * @extack: receives a human readable reason on failure
 * @link: BPF link to attach, or NULL for a plain program attachment
 * @new_prog: program to attach (NULL detaches); must be NULL with @link
 * @old_prog: program the caller expects to be attached; requires
 *            XDP_FLAGS_REPLACE in @flags; must be NULL with @link
 * @flags: XDP_FLAGS_* controlling mode and replace/update semantics
 *
 * Must be called with RTNL held. All validation happens before the
 * driver is touched; the xdp_state bookkeeping is updated only after
 * dev_xdp_install() succeeded (or was skipped because nothing changes).
 *
 * Return: 0 on success, or -EINVAL / -EBUSY / -EEXIST / -EOPNOTSUPP from
 * the checks below, or the error from dev_xdp_install().
 */
static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
                          struct bpf_xdp_link *link, struct bpf_prog *new_prog,
                          struct bpf_prog *old_prog, u32 flags)
{
        unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
        struct bpf_prog *cur_prog;
        struct net_device *upper;
        struct list_head *iter;
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;
        int err;

        ASSERT_RTNL();

        /* either link or prog attachment, never both */
        if (link && (new_prog || old_prog))
                return -EINVAL;
        /* link supports only XDP mode flags */
        if (link && (flags & ~XDP_FLAGS_MODES)) {
                NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
                return -EINVAL;
        }
        /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
        if (num_modes > 1) {
                NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
                return -EINVAL;
        }
        /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
        if (!num_modes && dev_xdp_prog_count(dev) > 1) {
                NL_SET_ERR_MSG(extack,
                               "More than one program loaded, unset mode is ambiguous");
                return -EINVAL;
        }
        /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
        if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
                NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
                return -EINVAL;
        }

        mode = dev_xdp_mode(dev, flags);
        /* can't replace attached link */
        if (dev_xdp_link(dev, mode)) {
                NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
                return -EBUSY;
        }

        /* don't allow if an upper device already has a program */
        netdev_for_each_upper_dev_rcu(dev, upper, iter) {
                if (dev_xdp_prog_count(upper) > 0) {
                        NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
                        return -EEXIST;
                }
        }

        cur_prog = dev_xdp_prog(dev, mode);
        /* can't replace attached prog with link */
        if (link && cur_prog) {
                NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
                return -EBUSY;
        }
        /* with REPLACE the attached program must be the one the caller named */
        if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
                NL_SET_ERR_MSG(extack, "Active program does not match expected");
                return -EEXIST;
        }

        /* put effective new program into new_prog */
        if (link)
                new_prog = link->link.prog;

        /* the remaining checks only apply when something gets attached */
        if (new_prog) {
                bool offload = mode == XDP_MODE_HW;
                enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
                                               ? XDP_MODE_DRV : XDP_MODE_SKB;

                if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
                        NL_SET_ERR_MSG(extack, "XDP program already attached");
                        return -EBUSY;
                }
                if (!offload && dev_xdp_prog(dev, other_mode)) {
                        NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
                        return -EEXIST;
                }
                if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
                        NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
                        return -EINVAL;
                }
                if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
                        NL_SET_ERR_MSG(extack, "Program bound to different device");
                        return -EINVAL;
                }
                if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
                        NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
                        return -EINVAL;
                }
                if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
                        NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
                        return -EINVAL;
                }
                if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
                        NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
                        return -EINVAL;
                }
        }

        /* don't call drivers if the effective program didn't change */
        if (new_prog != cur_prog) {
                bpf_op = dev_xdp_bpf_op(dev, mode);
                if (!bpf_op) {
                        NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
                        return -EOPNOTSUPP;
                }

                err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
                if (err)
                        return err;
        }

        /* record the new attachment; each setter clears the other pointer */
        if (link)
                dev_xdp_set_link(dev, mode, link);
        else
                dev_xdp_set_prog(dev, mode, new_prog);
        /* the previous program is no longer recorded in xdp_state: drop it */
        if (cur_prog)
                bpf_prog_put(cur_prog);

        return 0;
}

/* Attach @link to @dev using the flags stored in the link; thin wrapper
 * around dev_xdp_attach() with no plain programs supplied.
 */
static int dev_xdp_attach_link(struct net_device *dev,
                               struct netlink_ext_ack *extack,
                               struct bpf_xdp_link *link)
{
        return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
}

/* Detach @link from @dev. Must be called with RTNL held.
 *
 * Return: 0 on success, -EINVAL when @link is not the link recorded for
 * its mode on @dev. A failure to uninstall the program only triggers a
 * WARN; the link is unrecorded regardless.
 */
static int dev_xdp_detach_link(struct net_device *dev,
                               struct netlink_ext_ack *extack,
                               struct bpf_xdp_link *link)
{
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;

        ASSERT_RTNL();

        mode = dev_xdp_mode(dev, link->flags);
        if (dev_xdp_link(dev, mode) != link)
                return -EINVAL;

        bpf_op = dev_xdp_bpf_op(dev, mode);
        /* installing a NULL program removes the one currently attached */
        WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
        dev_xdp_set_link(dev, mode, NULL);
        return 0;
}

/* bpf_link_ops.release callback: detach the link from its device, if it
 * still has one, under RTNL and the device's ops lock.
 */
static void bpf_xdp_link_release(struct bpf_link *link)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);

        rtnl_lock();

        /* if racing with net_device's tear down, xdp_link->dev might be
         * already NULL, in which case link was already auto-detached
         */
        if (!xdp_link->dev)
                goto out_unlock;

        netdev_lock_ops(xdp_link->dev);
        WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
        netdev_unlock_ops(xdp_link->dev);
        xdp_link->dev = NULL;

out_unlock:
        rtnl_unlock();
}

/* bpf_link_ops.detach callback: same work as release; always returns 0. */
static int bpf_xdp_link_detach(struct bpf_link *link)
{
        bpf_xdp_link_release(link);
        return 0;
}

/* bpf_link_ops.dealloc callback: free the bpf_xdp_link that embeds @link. */
static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
        kfree(container_of(link, struct bpf_xdp_link, link));
}

/* bpf_link_ops.show_fdinfo callback: print the ifindex of the device the
 * link is attached to, or 0 when it is no longer attached.
 */
static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
                                     struct seq_file *seq)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        u32 ifindex;

        /* xdp_link->dev is only stable under RTNL */
        rtnl_lock();
        ifindex = xdp_link->dev ? xdp_link->dev->ifindex : 0;
        rtnl_unlock();

        seq_printf(seq, "ifindex:\t%u\n", ifindex);
}

/* bpf_link_ops.fill_link_info callback: report the ifindex of the device
 * the link is attached to (0 when detached). Always returns 0.
 */
static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
                                       struct bpf_link_info *info)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        u32 ifindex;

        /* xdp_link->dev is only stable under RTNL */
        rtnl_lock();
        ifindex = xdp_link->dev ? xdp_link->dev->ifindex : 0;
        rtnl_unlock();

        info->xdp.ifindex = ifindex;

        return 0;
}

/* bpf_link_ops.update_prog callback: swap the program of an attached XDP
 * link for @new_prog.
 *
 * @link: link being updated
 * @new_prog: replacement program
 * @old_prog: if non-NULL, the program the caller expects to be attached
 *
 * Return: 0 on success (including the no-op case), -ENOLINK when the link
 * is no longer attached to a device, -EPERM when @old_prog does not match
 * the attached program, -EINVAL on program type / expected attach type
 * mismatch, or the error from dev_xdp_install().
 */
static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
                               struct bpf_prog *old_prog)
{
        struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
        enum bpf_xdp_mode mode;
        bpf_op_t bpf_op;
        int err = 0;

        rtnl_lock();

        /* link might have been auto-released already, so fail */
        if (!xdp_link->dev) {
                err = -ENOLINK;
                goto out_unlock;
        }

        if (old_prog && link->prog != old_prog) {
                err = -EPERM;
                goto out_unlock;
        }
        /* from here on old_prog is the program actually attached */
        old_prog = link->prog;
        if (old_prog->type != new_prog->type ||
            old_prog->expected_attach_type != new_prog->expected_attach_type) {
                err = -EINVAL;
                goto out_unlock;
        }

        if (old_prog == new_prog) {
                /* no-op, don't disturb drivers */
                bpf_prog_put(new_prog);
                goto out_unlock;
        }

        netdev_lock_ops(xdp_link->dev);
        mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
        bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
        err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
                              xdp_link->flags, new_prog);
        netdev_unlock_ops(xdp_link->dev);
        if (err)
                goto out_unlock;

        /* publish the new program in the link and drop the replaced one */
        old_prog = xchg(&link->prog, new_prog);
        bpf_prog_put(old_prog);

out_unlock:
        rtnl_unlock();
        return err;
}

/* Link operations for XDP links; handed to bpf_link_init() by
 * bpf_xdp_link_attach().
 */
static const struct bpf_link_ops bpf_xdp_link_lops = {
        .release = bpf_xdp_link_release,
        .dealloc = bpf_xdp_link_dealloc,
        .detach = bpf_xdp_link_detach,
        .show_fdinfo = bpf_xdp_link_show_fdinfo,
        .fill_link_info = bpf_xdp_link_fill_link_info,
        .update_prog = bpf_xdp_link_update,
};

/**
 *        bpf_xdp_link_attach - create a BPF link attaching @prog to a device
 *        @attr: link_create attributes; target_ifindex selects the device
 *               in the caller's network namespace, flags become the link's
 *               XDP flags
 *        @prog: XDP program to attach
 *
 *        Return: the new link's file descriptor (as returned by
 *        bpf_link_settle()) on success; -EINVAL when the device does not
 *        exist, -ENOMEM on allocation failure, or the error from
 *        bpf_link_prime() / dev_xdp_attach_link().
 */
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
{
        struct net *net = current->nsproxy->net_ns;
        struct bpf_link_primer link_primer;
        struct netlink_ext_ack extack = {};
        struct bpf_xdp_link *link;
        struct net_device *dev;
        int err, fd;

        rtnl_lock();
        /* takes a device reference, released by dev_put() on every exit path */
        dev = dev_get_by_index(net, attr->link_create.target_ifindex);
        if (!dev) {
                rtnl_unlock();
                return -EINVAL;
        }

        link = kzalloc(sizeof(*link), GFP_USER);
        if (!link) {
                err = -ENOMEM;
                goto unlock;
        }

        bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
        link->dev = dev;
        link->flags = attr->link_create.flags;

        err = bpf_link_prime(&link->link, &link_primer);
        if (err) {
                /* not primed: the link is still ours to free directly */
                kfree(link);
                goto unlock;
        }

        netdev_lock_ops(dev);
        err = dev_xdp_attach_link(dev, &extack, link);
        netdev_unlock_ops(dev);
        rtnl_unlock();

        if (err) {
                /* clear dev first so the release callback has nothing to detach */
                link->dev = NULL;
                bpf_link_cleanup(&link_primer);
                trace_bpf_xdp_link_attach_failed(extack._msg);
                goto out_put_dev;
        }

        fd = bpf_link_settle(&link_primer);
        /* link itself doesn't hold dev's refcnt to not complicate shutdown */
        dev_put(dev);
        return fd;

unlock:
        rtnl_unlock();

out_put_dev:
        dev_put(dev);
        return err;
}

/**
 *        dev_change_xdp_fd - set or clear a bpf program for a device rx path
 *        @dev: device
 *        @extack: netlink extended ack
 *        @fd: new program fd or negative value to clear
 *        @expected_fd: old program fd that userspace expects to replace or clear
 *        @flags: xdp-related flags
 *
 *        Set or clear a bpf program for a device. Must be called with RTNL
 *        held.
 *
 *        Return: 0 on success, the PTR_ERR() of a failed program lookup, or
 *        the error from dev_xdp_attach().
 */
int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags)
{
        enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
        struct bpf_prog *new_prog = NULL, *old_prog = NULL;
        int err;

        ASSERT_RTNL();

        /* resolve the fds into programs; both lookups take a reference */
        if (fd >= 0) {
                new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
                                                 mode != XDP_MODE_SKB);
                if (IS_ERR(new_prog))
                        return PTR_ERR(new_prog);
        }

        if (expected_fd >= 0) {
                old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
                                                 mode != XDP_MODE_SKB);
                if (IS_ERR(old_prog)) {
                        err = PTR_ERR(old_prog);
                        /* don't bpf_prog_put() an error pointer below */
                        old_prog = NULL;
                        goto err_out;
                }
        }

        err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);

err_out:
        /* on success new_prog stays referenced; it is now recorded on the device */
        if (err && new_prog)
                bpf_prog_put(new_prog);
        /* old_prog was only needed for the comparison */
        if (old_prog)
                bpf_prog_put(old_prog);
        return err;
}

/**
 *        dev_get_min_mp_channel_count - channels needed to keep mp queues
 *        @dev: device; its ops lock must be held
 *
 *        Scans the real RX queues from the highest index down for one with
 *        mp_params.mp_priv set.
 *
 *        Return: index of the highest such queue plus one, or 0 when no
 *        RX queue has mp_priv set.
 */
u32 dev_get_min_mp_channel_count(const struct net_device *dev)
{
        int idx;

        netdev_ops_assert_locked(dev);

        idx = dev->real_num_rx_queues;
        while (idx-- > 0) {
                /* The channel count is the idx plus 1. */
                if (dev->_rx[idx].mp_params.mp_priv)
                        return idx + 1;
        }

        return 0;
}

/**
 * dev_index_reserve() - allocate an ifindex in a namespace
 * @net: the applicable net namespace
 * @ifindex: requested ifindex, pass %0 to get one allocated
 *
 * Allocate an ifindex for a new device. Caller must either use the ifindex
 * to store the device (via list_netdevice()) or call dev_index_release()
 * to give the index up.
 *
 * The index is reserved by storing a NULL entry in @net->dev_by_index:
 * a requested index is inserted as-is, an unspecified one is allocated
 * cyclically.
 *
 * Return: a suitable unique value for a new device interface number or -errno.
 */
static int dev_index_reserve(struct net *net, u32 ifindex)
{
        int ret;

        if (ifindex > INT_MAX) {
                DEBUG_NET_WARN_ON_ONCE(1);
                return -EINVAL;
        }

        if (ifindex)
                ret = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
        else
                ret = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
                                      xa_limit_31b, &net->ifindex, GFP_KERNEL);

        /* only negative values are errors; anything else means reserved */
        if (ret < 0)
                return ret;

        return ifindex;
}

/* Give up an ifindex reserved in @net->dev_by_index. Warns when the erased
 * slot held a non-NULL entry.
 */
static void dev_index_release(struct net *net, int ifindex)
{
        /* Expect only unused indexes, unlist_netdevice() removes the used */
        WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}

/* Tell whether the current task is cleanup_net_task. Always false when the
 * kernel is built without CONFIG_NET_NS.
 */
static bool from_cleanup_net(void)
{
        bool in_cleanup = false;

#ifdef CONFIG_NET_NS
        in_cleanup = (current == cleanup_net_task);
#endif
        return in_cleanup;
}

/* Delayed registration/unregistration */
/* devices are appended to this list by net_set_todo() */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
atomic_t dev_unreg_count = ATOMIC_INIT(0);

/* Queue @dev at the tail of net_todo_list via its todo_list node. */
static void net_set_todo(struct net_device *dev)
{
        list_add_tail(&dev->todo_list, &net_todo_list);
}

/* Remove from @features every NETIF_F_UPPER_DISABLES feature that @upper
 * does not have in its wanted_features, and return the result. @lower is
 * only used for the debug message.
 */
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
        struct net_device *upper, netdev_features_t features)
{
        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
        netdev_features_t feature;
        int feature_bit;

        for_each_netdev_feature(upper_disables, feature_bit) {
                feature = __NETIF_F_BIT(feature_bit);

                /* wanted by the upper device, or not requested anyway */
                if ((upper->wanted_features & feature) || !(features & feature))
                        continue;

                netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
                           &feature, upper->name);
                features &= ~feature;
        }

        return features;
}

/* For every NETIF_F_UPPER_DISABLES feature that is off in @features but
 * still on in @lower, clear it from @lower's wanted_features and re-run
 * the feature update on @lower. Warns if the feature stays enabled,
 * otherwise announces the change on @lower.
 */
static void netdev_sync_lower_features(struct net_device *upper,
        struct net_device *lower, netdev_features_t features)
{
        netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
        netdev_features_t feature;
        int feature_bit;

        for_each_netdev_feature(upper_disables, feature_bit) {
                feature = __NETIF_F_BIT(feature_bit);

                /* still on for the upper device, or already off below */
                if ((features & feature) || !(lower->features & feature))
                        continue;

                netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
                           &feature, lower->name);
                lower->wanted_features &= ~feature;
                __netdev_update_features(lower);

                if (unlikely(lower->features & feature)) {
                        netdev_WARN(upper, "failed to disable %pNF on %s!\n",
                                    &feature, lower->name);
                        continue;
                }

                netdev_features_change(lower);
        }
}

/* True when @features contains NETIF_F_HW_CSUM, or both NETIF_F_IP_CSUM
 * and NETIF_F_IPV6_CSUM.
 */
static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
{
        const netdev_features_t both_ip_csum = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;

        if (features & NETIF_F_HW_CSUM)
                return true;

        return (features & both_ip_csum) == both_ip_csum;
}

/* Drop from @features every bit whose dependencies are not satisfied by the
 * rest of @features, and return the corrected set. @dev is used for log
 * messages and for its gso_partial_features mask. The checks run in order
 * and later ones see the result of earlier ones.
 */
static netdev_features_t netdev_fix_features(struct net_device *dev,
        netdev_features_t features)
{
        /* Fix illegal checksum combinations */
        if ((features & NETIF_F_HW_CSUM) &&
            (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
                netdev_warn(dev, "mixed HW and IP checksum settings.\n");
                features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
        }

        /* TSO requires that SG is present as well. */
        if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
                netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
                features &= ~NETIF_F_ALL_TSO;
        }

        /* IPv4 TSO needs HW or IPv4 checksum offload; TSO_ECN goes with it */
        if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
                                        !(features & NETIF_F_IP_CSUM)) {
                netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
                features &= ~NETIF_F_TSO;
                features &= ~NETIF_F_TSO_ECN;
        }

        /* IPv6 TSO needs HW or IPv6 checksum offload */
        if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
                                         !(features & NETIF_F_IPV6_CSUM)) {
                netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
                features &= ~NETIF_F_TSO6;
        }

        /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
        if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
                features &= ~NETIF_F_TSO_MANGLEID;

        /* TSO ECN requires that TSO is present as well. */
        if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
                features &= ~NETIF_F_TSO_ECN;

        /* Software GSO depends on SG. */
        if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
                netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
                features &= ~NETIF_F_GSO;
        }

        /* GSO partial features require GSO partial be set */
        if ((features & dev->gso_partial_features) &&
            !(features & NETIF_F_GSO_PARTIAL)) {
                netdev_dbg(dev,
                           "Dropping partially supported GSO features since no GSO partial.\n");
                features &= ~dev->gso_partial_features;
        }

        if (!(features & NETIF_F_RXCSUM)) {
                /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
                 * successfully merged by hardware must also have the
                 * checksum verified by hardware.  If the user does not
                 * want to enable RXCSUM, logically, we should disable GRO_HW.
                 */
                if (features & NETIF_F_GRO_HW) {
                        netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
                        features &= ~NETIF_F_GRO_HW;
                }
        }

        /* LRO/HW-GRO features cannot be combined with RX-FCS */
        if (features & NETIF_F_RXFCS) {
                if (features & NETIF_F_LRO) {
                        netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
                        features &= ~NETIF_F_LRO;
                }

                if (features & NETIF_F_GRO_HW) {
                        netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
                        features &= ~NETIF_F_GRO_HW;
                }
        }

        /* when both are requested, HW-GRO wins over LRO */
        if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
                netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
                features &= ~NETIF_F_LRO;
        }

        /* TLS TX offload needs HW csum or both IP csum offloads */
        if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
                netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
                features &= ~NETIF_F_HW_TLS_TX;
        }

        /* TLS RX offload needs RX checksum offload */
        if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
                netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
                features &= ~NETIF_F_HW_TLS_RX;
        }

        /* UDP segmentation offload needs HW csum or both IP csum offloads */
        if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
                netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
                features &= ~NETIF_F_GSO_UDP_L4;
        }

        return features;
}

/**
 * __netdev_update_features - recompute and apply the feature set of a device
 * @dev: device whose features are recalculated
 *
 * Starts from netdev_get_wanted_features(), lets the driver adjust the set
 * through ndo_fix_features (when provided), then applies
 * netdev_fix_features() and netdev_sync_upper_features() for every upper
 * device. If the result differs from dev->features it is handed to
 * ndo_set_features (when provided). Lower devices are synced whether or not
 * the set changed.
 *
 * Asserts that RTNL and the netdev ops lock are held.
 *
 * Return: -1 if ndo_set_features() returned a negative error; 0 if the
 * computed set equals dev->features, or if a VLAN filter refresh left @err
 * negative; 1 otherwise. Callers in this file treat non-zero as "send a
 * feature-change notification".
 */
int __netdev_update_features(struct net_device *dev)
{
        struct net_device *upper, *lower;
        netdev_features_t features;
        struct list_head *iter;
        /* stays -1 when the "nothing changed" shortcut below is taken */
        int err = -1;

        ASSERT_RTNL();
        netdev_ops_assert_locked(dev);

        features = netdev_get_wanted_features(dev);

        if (dev->netdev_ops->ndo_fix_features)
                features = dev->netdev_ops->ndo_fix_features(dev, features);

        /* driver might be less strict about feature dependencies */
        features = netdev_fix_features(dev, features);

        /* some features can't be enabled if they're off on an upper device */
        netdev_for_each_upper_dev_rcu(dev, upper, iter)
                features = netdev_sync_upper_features(dev, upper, features);

        /* nothing to apply on this device; lower devices are still synced */
        if (dev->features == features)
                goto sync_lower;

        netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
                &dev->features, &features);

        if (dev->netdev_ops->ndo_set_features)
                err = dev->netdev_ops->ndo_set_features(dev, features);
        else
                err = 0;

        if (unlikely(err < 0)) {
                netdev_err(dev,
                        "set_features() failed (%d); wanted %pNF, left %pNF\n",
                        err, &features, &dev->features);
                /* return non-0 since some features might have changed and
                 * it's better to fire a spurious notification than miss it
                 */
                return -1;
        }

sync_lower:
        /* some features must be disabled on lower devices when disabled
         * on an upper device (think: bonding master or bridge)
         */
        netdev_for_each_lower_dev(dev, lower, iter)
                netdev_sync_lower_features(dev, lower, features);

        /* Only err == 0 lets the core store the new set. A positive return
         * from ndo_set_features skips this block - presumably the driver
         * has updated dev->features itself in that case (TODO confirm).
         */
        if (!err) {
                netdev_features_t diff = features ^ dev->features;

                if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
                        /* udp_tunnel_{get,drop}_rx_info both need
                         * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
                         * device, or they won't do anything.
                         * Thus we need to update dev->features
                         * *before* calling udp_tunnel_get_rx_info,
                         * but *after* calling udp_tunnel_drop_rx_info.
                         */
                        if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
                                dev->features = features;
                                udp_tunnel_get_rx_info(dev);
                        } else {
                                udp_tunnel_drop_rx_info(dev);
                        }
                }

                /* Same ordering as above for the C-tag VLAN filter: store
                 * the new features before fetching filter info, drop the
                 * info before the final store when the bit is cleared.
                 */
                if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
                        if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
                                dev->features = features;
                                err |= vlan_get_rx_ctag_filter_info(dev);
                        } else {
                                vlan_drop_rx_ctag_filter_info(dev);
                        }
                }

                /* ... and for the S-tag VLAN filter */
                if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
                        if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
                                dev->features = features;
                                err |= vlan_get_rx_stag_filter_info(dev);
                        } else {
                                vlan_drop_rx_stag_filter_info(dev);
                        }
                }

                dev->features = features;
        }

        /* err is still -1 on the "unchanged" path, which maps to 0 here */
        return err < 0 ? 0 : 1;
}

/**
 *        netdev_update_features - recalculate device features
 *        @dev: the device to check
 *
 *        Re-evaluate the dev->features set through
 *        __netdev_update_features() and emit a feature-change
 *        notification only when that helper reports a non-zero result.
 *        Intended to be called after driver or hardware dependent
 *        conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
        int changed = __netdev_update_features(dev);

        if (!changed)
                return;

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);

/**
 *        netdev_change_features - recalculate device features
 *        @dev: the device to check
 *
 *        Re-evaluate the dev->features set and always send a
 *        feature-change notification, whether or not the set changed.
 *        Use this instead of netdev_update_features() when
 *        dev->vlan_features might have changed too, so the change can be
 *        propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
        /* The "did anything change" result is deliberately ignored. */
        (void)__netdev_update_features(dev);

        netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);

/**
 *        netif_stacked_transfer_operstate - transfer operstate
 *        @rootdev: the root or lower level device to transfer state from
 *        @dev: the device to transfer operstate to
 *
 *        Mirror the dormant, testing and carrier state of @rootdev onto
 *        @dev. Normally used when a stacking relationship exists between
 *        the root device and @dev (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
                                      struct net_device *dev)
{
        /* dormant state */
        if (rootdev->operstate != IF_OPER_DORMANT)
                netif_dormant_off(dev);
        else
                netif_dormant_on(dev);

        /* testing state */
        if (rootdev->operstate != IF_OPER_TESTING)
                netif_testing_off(dev);
        else
                netif_testing_on(dev);

        /* carrier state */
        if (!netif_carrier_ok(rootdev))
                netif_carrier_off(dev);
        else
                netif_carrier_on(dev);
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

/* Allocate dev->_rx with dev->num_rx_queues zeroed entries and register the
 * XDP RX-queue info of each one.
 *
 * Returns 0 on success, -ENOMEM if the array cannot be allocated, or the
 * negative error from xdp_rxq_info_reg(). On failure every queue registered
 * so far is unregistered again, the array is freed and dev->_rx is NULL.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_rx_queues;
        struct netdev_rx_queue *queues;
        unsigned int idx;
        int rc;

        BUG_ON(nqueues < 1);

        queues = kvzalloc(nqueues * sizeof(*queues),
                          GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!queues)
                return -ENOMEM;

        dev->_rx = queues;

        for (idx = 0; idx < nqueues; idx++) {
                queues[idx].dev = dev;

                /* XDP RX-queue setup */
                rc = xdp_rxq_info_reg(&queues[idx].xdp_rxq, dev, idx, 0);
                if (rc < 0) {
                        /* Undo the registrations that succeeded (entries
                         * below idx), then release the array itself.
                         */
                        while (idx--)
                                xdp_rxq_info_unreg(&queues[idx].xdp_rxq);
                        kvfree(dev->_rx);
                        dev->_rx = NULL;
                        return rc;
                }
        }

        return 0;
}

static void netif_free_rx_queues(struct net_device *dev)
{
        unsigned int i, count = dev->num_rx_queues;

        /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
        if (!dev->_rx)
                return;

        for (i = 0; i < count; i++)
                xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);

        kvfree(dev->_rx);
}

/* Bring a single netdev_queue into its initial state: back-pointer to @dev,
 * xmit lock (with its per-device-type lockdep class), no lock owner, no
 * NUMA node and, with CONFIG_BQL, the dynamic queue limit state.
 * @_unused exists so the function matches the netdev_for_each_tx_queue()
 * callback signature.
 */
static void netdev_init_one_queue(struct net_device *dev,
                                  struct netdev_queue *queue, void *_unused)
{
        queue->dev = dev;
        queue->xmit_lock_owner = -1;

        /* The lockdep class can only be assigned once the lock exists */
        spin_lock_init(&queue->_xmit_lock);
        netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);

        netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
#ifdef CONFIG_BQL
        dql_init(&queue->dql, HZ);
#endif
}

/* Release the TX queue array installed by netif_alloc_netdev_queues(). */
static void netif_free_tx_queues(struct net_device *dev)
{
        struct netdev_queue *txqs = dev->_tx;

        kvfree(txqs);
}

/* Allocate and initialise the TX queue array (dev->_tx) together with the
 * device-wide tx_global_lock.
 *
 * Returns 0 on success, -EINVAL when dev->num_tx_queues is zero or larger
 * than 0xffff, and -ENOMEM when the array cannot be allocated.
 */
static int netif_alloc_netdev_queues(struct net_device *dev)
{
        unsigned int nqueues = dev->num_tx_queues;
        struct netdev_queue *queues;

        if (nqueues == 0 || nqueues > 0xffff)
                return -EINVAL;

        queues = kvzalloc(nqueues * sizeof(*queues),
                          GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!queues)
                return -ENOMEM;

        dev->_tx = queues;

        /* dev->_tx must be set before the iterator walks the queues */
        netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
        spin_lock_init(&dev->tx_global_lock);

        return 0;
}

/* Call netif_tx_stop_queue() on each of the dev->num_tx_queues TX queues
 * of @dev, in index order.
 */
void netif_tx_stop_all_queues(struct net_device *dev)
{
        unsigned int idx = 0;

        while (idx < dev->num_tx_queues) {
                netif_tx_stop_queue(netdev_get_tx_queue(dev, idx));
                idx++;
        }
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);

static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
{
        void __percpu *v;

        /* Drivers implementing ndo_get_peer_dev must support tstat
         * accounting, so that skb_do_redirect() can bump the dev's
         * RX stats upon network namespace switch.
         */
        if (dev->netdev_ops->ndo_get_peer_dev &&
            dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
                return -EOPNOTSUPP;

        switch (dev->pcpu_stat_type) {
        case NETDEV_PCPU_STAT_NONE:
                return 0;
        case NETDEV_PCPU_STAT_LSTATS:
                v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
                break;
        case NETDEV_PCPU_STAT_TSTATS:
                v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
                break;
        case NETDEV_PCPU_STAT_DSTATS:
                v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
                break;
        default:
                return -EINVAL;
        }

        return v ? 0 : -ENOMEM;
}

static void netdev_do_free_pcpu_stats(struct net_device *dev)
{
        switch (dev->pcpu_stat_type) {
        case NETDEV_PCPU_STAT_NONE:
                return;
        case NETDEV_PCPU_STAT_LSTATS:
                free_percpu(dev->lstats);
                break;
        case NETDEV_PCPU_STAT_TSTATS:
                free_percpu(dev->tstats);
                break;
        case NETDEV_PCPU_STAT_DSTATS:
                free_percpu(dev->dstats);
                break;
        }
}

/* Tear down dev->link_topo: destroy its PHY xarray, free the structure and
 * clear the pointer. Does nothing when CONFIG_PHYLIB is disabled or when no
 * topology is attached.
 */
static void netdev_free_phy_link_topology(struct net_device *dev)
{
        struct phy_link_topology *topo = dev->link_topo;

        if (!IS_ENABLED(CONFIG_PHYLIB) || !topo)
                return;

        xa_destroy(&topo->phys);
        kfree(topo);
        dev->link_topo = NULL;
}

/**
 * register_netdevice() - register a network device
 * @dev: device to register
 *
 * Take a prepared network device structure and make it externally accessible.
 * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
 * Callers must hold the rtnl lock - you may want register_netdev()
 * instead of this.
 *
 * Setup steps that fail before the device is listed are unwound in reverse
 * order through the err_* labels at the end of the function. A failure
 * reported by the %NETDEV_REGISTER notifiers is instead handled by queueing
 * a regular unregister of the device.
 *
 * Return: 0 on success, otherwise the error code of the step that failed
 * (a positive ndo_init() result is reported as -EIO).
 */
int register_netdevice(struct net_device *dev)
{
        int ret;
        struct net *net = dev_net(dev);

        BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
                     NETDEV_FEATURE_COUNT);
        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        might_sleep();

        /* When net_device's are persistent, this will be fatal. */
        BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
        BUG_ON(!net);

        /* nothing has been set up yet, so a plain return is enough here */
        ret = ethtool_check_ops(dev->ethtool_ops);
        if (ret)
                return ret;

        /* rss ctx ID 0 is reserved for the default context, start from 1 */
        xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
        mutex_init(&dev->ethtool->rss_lock);

        spin_lock_init(&dev->addr_list_lock);
        netdev_set_addr_lockdep_class(dev);

        ret = dev_get_valid_name(net, dev, dev->name);
        if (ret < 0)
                goto out;

        ret = -ENOMEM;
        dev->name_node = netdev_name_node_head_alloc(dev);
        if (!dev->name_node)
                goto out;

        /* Init, if this function is available */
        if (dev->netdev_ops->ndo_init) {
                ret = dev->netdev_ops->ndo_init(dev);
                if (ret) {
                        /* normalise a positive driver result to an errno */
                        if (ret > 0)
                                ret = -EIO;
                        goto err_free_name;
                }
        }

        /* a driver advertising C-tag VLAN filtering must provide both the
         * add and the kill callback
         */
        if (((dev->hw_features | dev->features) &
             NETIF_F_HW_VLAN_CTAG_FILTER) &&
            (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
             !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
                netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
                ret = -EINVAL;
                goto err_uninit;
        }

        ret = netdev_do_alloc_pcpu_stats(dev);
        if (ret)
                goto err_uninit;

        /* on success the return value is the ifindex that was reserved */
        ret = dev_index_reserve(net, dev->ifindex);
        if (ret < 0)
                goto err_free_pcpu;
        dev->ifindex = ret;

        /* Transfer changeable features to wanted_features and enable
         * software offloads (GSO and GRO).
         */
        dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
        dev->features |= NETIF_F_SOFT_FEATURES;

        if (dev->udp_tunnel_nic_info) {
                dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
                dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
        }

        dev->wanted_features = dev->features & dev->hw_features;

        /* added after wanted_features was computed, so it is user
         * changeable without being part of the initial wanted set
         */
        if (!(dev->flags & IFF_LOOPBACK))
                dev->hw_features |= NETIF_F_NOCACHE_COPY;

        /* If IPv4 TCP segmentation offload is supported we should also
         * allow the device to enable segmenting the frame with the option
         * of ignoring a static IP ID value.  This doesn't enable the
         * feature itself but allows the user to enable it later.
         */
        if (dev->hw_features & NETIF_F_TSO)
                dev->hw_features |= NETIF_F_TSO_MANGLEID;
        if (dev->vlan_features & NETIF_F_TSO)
                dev->vlan_features |= NETIF_F_TSO_MANGLEID;
        if (dev->mpls_features & NETIF_F_TSO)
                dev->mpls_features |= NETIF_F_TSO_MANGLEID;
        if (dev->hw_enc_features & NETIF_F_TSO)
                dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;

        /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
         */
        dev->vlan_features |= NETIF_F_HIGHDMA;

        /* Make NETIF_F_SG inheritable to tunnel devices.
         */
        dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;

        /* Make NETIF_F_SG inheritable to MPLS.
         */
        dev->mpls_features |= NETIF_F_SG;

        ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
        ret = notifier_to_errno(ret);
        if (ret)
                goto err_ifindex_release;

        ret = netdev_register_kobject(dev);

        /* reg_state is published under the instance lock either way; the
         * error itself is handled right after the unlock
         */
        netdev_lock(dev);
        WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
        netdev_unlock(dev);

        if (ret)
                goto err_uninit_notify;

        /* the "changed" result is not needed during registration */
        netdev_lock_ops(dev);
        __netdev_update_features(dev);
        netdev_unlock_ops(dev);

        /*
         *        Default initial state at registry is that the
         *        device is present.
         */

        set_bit(__LINK_STATE_PRESENT, &dev->state);

        linkwatch_init_dev(dev);

        dev_init_scheduler(dev);

        netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
        list_netdevice(dev);

        add_device_randomness(dev->dev_addr, dev->addr_len);

        /* If the device has permanent device address, driver should
         * set dev_addr and also addr_assign_type should be set to
         * NET_ADDR_PERM (default value).
         */
        if (dev->addr_assign_type == NET_ADDR_PERM)
                memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

        /* Notify protocols, that a new device appeared. */
        netdev_lock_ops(dev);
        ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
        netdev_unlock_ops(dev);
        ret = notifier_to_errno(ret);
        if (ret) {
                /* The device is already listed at this point, so it is torn
                 * down through the regular unregister path rather than the
                 * err_* labels below.
                 */
                /* Expect explicit free_netdev() on failure */
                dev->needs_free_netdev = false;
                unregister_netdevice_queue(dev, NULL);
                goto out;
        }
        /*
         *        Prevent userspace races by waiting until the network
         *        device is fully setup before sending notifications.
         */
        if (!dev->rtnl_link_ops ||
            dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
                rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

out:
        return ret;

        /* Error unwinding: each label undoes one setup step and falls
         * through to the labels for the steps that were taken before it.
         */
err_uninit_notify:
        call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
err_ifindex_release:
        dev_index_release(net, dev->ifindex);
err_free_pcpu:
        netdev_do_free_pcpu_stats(dev);
err_uninit:
        if (dev->netdev_ops->ndo_uninit)
                dev->netdev_ops->ndo_uninit(dev);
        if (dev->priv_destructor)
                dev->priv_destructor(dev);
err_free_name:
        netdev_name_node_free(dev->name_node);
        goto out;
}
EXPORT_SYMBOL(register_netdevice);

/* Initialize the core of a dummy net device.
 * Performs the setup steps that normal netdevs get by going through
 * register_netdevice(), which dummy netdevs never do.
 */
static void init_dummy_netdev(struct net_device *dev)
{
        unsigned long *link_state = &dev->state;

        /* NETREG_DUMMY makes the standard register/unregister code paths
         * BUG if they are ever reached with this device.
         */
        dev->reg_state = NETREG_DUMMY;

        /* A dummy interface counts as present and started by default. */
        set_bit(__LINK_STATE_PRESENT, link_state);
        set_bit(__LINK_STATE_START, link_state);

        /* Note: pcpu_refcnt is not allocated for dummy devices, because
         * users of this 'device' don't need to change its refcount.
         */
}

/**
 *        register_netdev        - register a network device
 *        @dev: device to register
 *
 *        Take a completed network device structure and add it to the kernel
 *        interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *        chain. 0 is returned on success. A negative errno code is returned
 *        on a failure to set up the device, or if the name is a duplicate;
 *        -EINTR is returned when taking the rtnl lock is interrupted.
 *
 *        This is a wrapper around register_netdevice that takes the rtnl
 *        semaphore and expands the device name if you passed a format string
 *        to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        int err = -EINTR;

        /* Register only if the lock was obtained; otherwise report -EINTR */
        if (!rtnl_net_lock_killable(net)) {
                err = register_netdevice(dev);
                rtnl_net_unlock(net);
        }

        return err;
}
EXPORT_SYMBOL(register_netdev);

/* Return the current reference count of @dev.
 *
 * With CONFIG_PCPU_DEV_REFCNT the count is the sum of the per-CPU counters
 * over all possible CPUs; otherwise it is read from dev->dev_refcnt.
 */
int netdev_refcnt_read(const struct net_device *dev)
{
#ifdef CONFIG_PCPU_DEV_REFCNT
        int total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                const int *cnt = per_cpu_ptr(dev->pcpu_refcnt, cpu);

                total += *cnt;
        }
        return total;
#else
        return refcount_read(&dev->dev_refcnt);
#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);

/* Interval, in seconds, between the "waiting for %s to become free"
 * warnings printed by netdev_wait_allrefs_any().
 */
int netdev_unregister_timeout_secs __read_mostly = 10;

/* Lower and upper bound, in milliseconds, of the sleep back-off that
 * netdev_wait_allrefs_any() uses between reference count polls.
 */
#define WAIT_REFS_MIN_MSECS 1
#define WAIT_REFS_MAX_MSECS 250
/**
 * netdev_wait_allrefs_any - wait until all references are gone.
 * @list: list of net_devices to wait on
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 *
 * The function polls the devices on @list, sleeping between polls with a
 * delay that doubles from WAIT_REFS_MIN_MSECS up to WAIT_REFS_MAX_MSECS.
 * While waiting it rebroadcasts %NETDEV_UNREGISTER once more than a second
 * has passed since the previous rebroadcast, and prints a warning for every
 * device still on the list each netdev_unregister_timeout_secs seconds.
 *
 * Return: the first device found on @list whose netdev_refcnt_read() is 1.
 * The function does not return until such a device exists.
 */
static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
        unsigned long rebroadcast_time, warning_time;
        struct net_device *dev;
        /* current back-off in msecs; 0 means "no sleep on the first pass" */
        int wait = 0;

        rebroadcast_time = warning_time = jiffies;

        /* fast path: a device may already be down to its last reference */
        list_for_each_entry(dev, list, todo_list)
                if (netdev_refcnt_read(dev) == 1)
                        return dev;

        while (true) {
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();

                        /* Rebroadcast unregister notification */
                        list_for_each_entry(dev, list, todo_list)
                                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                        /* rcu_barrier() is called with the rtnl lock
                         * dropped; the lock is re-taken for the linkwatch
                         * check below
                         */
                        __rtnl_unlock();
                        rcu_barrier();
                        rtnl_lock();

                        list_for_each_entry(dev, list, todo_list)
                                if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
                                             &dev->state)) {
                                        /* We must not have linkwatch events
                                         * pending on unregister. If this
                                         * happens, we simply run the queue
                                         * unscheduled, resulting in a noop
                                         * for this device.
                                         */
                                        linkwatch_run_queue();
                                        break;
                                }

                        __rtnl_unlock();

                        rebroadcast_time = jiffies;
                }

                rcu_barrier();

                /* exponential back-off, capped at WAIT_REFS_MAX_MSECS */
                if (!wait) {
                        wait = WAIT_REFS_MIN_MSECS;
                } else {
                        msleep(wait);
                        wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
                }

                list_for_each_entry(dev, list, todo_list)
                        if (netdev_refcnt_read(dev) == 1)
                                return dev;

                /* periodic report of the devices that are still referenced */
                if (time_after(jiffies, warning_time +
                               READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
                        list_for_each_entry(dev, list, todo_list) {
                                pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
                                         dev->name, netdev_refcnt_read(dev));
                                ref_tracker_dir_print(&dev->refcnt_tracker, 10);
                        }

                        warning_time = jiffies;
                }
        }
}

/* The sequence is:
 *
 *        rtnl_lock();
 *        ...
 *        register_netdevice(x1);
 *        register_netdevice(x2);
 *        ...
 *        unregister_netdevice(y1);
 *        unregister_netdevice(y2);
 *        ...
 *        rtnl_unlock();
 *        free_netdev(y1);
 *        free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev references to be released
 *    (netdev_wait_allrefs_any() waits for a count of 1).
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * The function itself drops the rtnl lock via __rtnl_unlock() after taking
 * a private snapshot of the global todo list.
 */
void netdev_run_todo(void)
{
        struct net_device *dev, *tmp;
        struct list_head list;
        /* number of devices finished here, subtracted from dev_unreg_count */
        int cnt;
#ifdef CONFIG_LOCKDEP
        struct list_head unlink_list;

        list_replace_init(&net_unlink_list, &unlink_list);

        /* refresh nested_level of every device queued on net_unlink_list */
        while (!list_empty(&unlink_list)) {
                dev = list_first_entry(&unlink_list, struct net_device,
                                       unlink_list);
                list_del_init(&dev->unlink_list);
                dev->nested_level = dev->lower_level - 1;
        }
#endif

        /* Snapshot list, allow later requests */
        list_replace_init(&net_todo_list, &list);

        __rtnl_unlock();

        /* Wait for rcu callbacks to finish before next phase */
        if (!list_empty(&list))
                rcu_barrier();

        /* Phase 1: move every device to NETREG_UNREGISTERED; devices in an
         * unexpected state are warned about and dropped from the snapshot.
         */
        list_for_each_entry_safe(dev, tmp, &list, todo_list) {
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
                        netdev_WARN(dev, "run_todo but not unregistering\n");
                        list_del(&dev->todo_list);
                        continue;
                }

                netdev_lock(dev);
                WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
                netdev_unlock(dev);
                linkwatch_sync_dev(dev);
        }

        /* Phase 2: release devices one at a time, in whatever order their
         * reference counts reach 1.
         */
        cnt = 0;
        while (!list_empty(&list)) {
                dev = netdev_wait_allrefs_any(&list);
                list_del(&dev->todo_list);

                /* paranoia */
                BUG_ON(netdev_refcnt_read(dev) != 1);
                BUG_ON(!list_empty(&dev->ptype_all));
                BUG_ON(!list_empty(&dev->ptype_specific));
                WARN_ON(rcu_access_pointer(dev->ip_ptr));
                WARN_ON(rcu_access_pointer(dev->ip6_ptr));

                netdev_do_free_pcpu_stats(dev);
                if (dev->priv_destructor)
                        dev->priv_destructor(dev);
                if (dev->needs_free_netdev)
                        free_netdev(dev);

                cnt++;

                /* Free network device */
                kobject_put(&dev->dev.kobj);
        }
        /* wake waiters once dev_unreg_count has dropped to zero */
        if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
                wake_up(&netdev_unregistering_wq);
}

/* Collate per-cpu network dstats statistics
 *
 * Walk every possible CPU, take a consistent snapshot of that CPU's
 * pcpu_dstats counters and add the packet, byte and drop counts for both
 * directions to the matching fields of @s. @s is accumulated into, not
 * reset.
 */
static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
                             const struct pcpu_dstats __percpu *dstats)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_dstats *pcpu = per_cpu_ptr(dstats, cpu);
                u64 rx_pkts, rx_octets, rx_lost;
                u64 tx_pkts, tx_octets, tx_lost;
                unsigned int seq;

                /* Re-read until the sequence shows no concurrent update */
                for (;;) {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts   = u64_stats_read(&pcpu->rx_packets);
                        rx_octets = u64_stats_read(&pcpu->rx_bytes);
                        rx_lost   = u64_stats_read(&pcpu->rx_drops);
                        tx_pkts   = u64_stats_read(&pcpu->tx_packets);
                        tx_octets = u64_stats_read(&pcpu->tx_bytes);
                        tx_lost   = u64_stats_read(&pcpu->tx_drops);
                        if (!u64_stats_fetch_retry(&pcpu->syncp, seq))
                                break;
                }

                s->rx_packets += rx_pkts;
                s->rx_bytes   += rx_octets;
                s->rx_dropped += rx_lost;
                s->tx_packets += tx_pkts;
                s->tx_bytes   += tx_octets;
                s->tx_dropped += tx_lost;
        }
}

/* ndo_get_stats64 implementation for dtstats-based accounting.
 *
 * Populate @s from dev->stats and dev->dstats. This is used internally by the
 * core for NETDEV_PCPU_STAT_DSTAT-type stats collection.
 */
static void dev_get_dstats64(const struct net_device *dev,
                             struct rtnl_link_stats64 *s)
{
        netdev_stats_to_stats64(s, &dev->stats);
        dev_fetch_dstats(s, dev->dstats);
}

/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
 * all the same fields in the same order as net_device_stats, with only
 * the type differing, but rtnl_link_stats64 may have additional fields
 * at the end for newer counters.
 *
 * @stats64 is fully overwritten: the leading counters are copied from
 * @netdev_stats and every remaining byte is set to zero.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
                             const struct net_device_stats *netdev_stats)
{
        /* n = number of atomic_long_t sized slots in net_device_stats; both
         * structures are walked as flat arrays of counters
         */
        size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
        const atomic_long_t *src = (atomic_long_t *)netdev_stats;
        u64 *dst = (u64 *)stats64;

        /* the destination must have room for every source counter */
        BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
        /* go through unsigned long so the value is zero-extended to u64 */
        for (i = 0; i < n; i++)
                dst[i] = (unsigned long)atomic_long_read(&src[i]);
        /* zero out counters that only exist in rtnl_link_stats64 */
        memset((char *)stats64 + n * sizeof(u64), 0,
               sizeof(*stats64) - n * sizeof(u64));
}
EXPORT_SYMBOL(netdev_stats_to_stats64);

/* Allocate the per-cpu core stats of @dev on first use.
 *
 * A freshly allocated block is installed with cmpxchg(); if dev->core_stats
 * was already set by someone else, the new block is freed again. Returns
 * whatever dev->core_stats holds afterwards, which is NULL only when the
 * allocation failed and nobody else installed a block.
 */
static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
                struct net_device *dev)
{
        struct net_device_core_stats __percpu *fresh;

        fresh = alloc_percpu_gfp(struct net_device_core_stats,
                                 GFP_ATOMIC | __GFP_NOWARN);
        if (fresh) {
                /* A non-NULL old value means the slot was already taken */
                if (cmpxchg(&dev->core_stats, NULL, fresh))
                        free_percpu(fresh);
        }

        /* This READ_ONCE() pairs with the cmpxchg() above */
        return READ_ONCE(dev->core_stats);
}

/* Increment, on the current CPU, the core stats counter located @offset
 * bytes into struct net_device_core_stats. The per-cpu block is allocated
 * on first use; if that allocation fails the increment is silently dropped.
 */
noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
{
        /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
        struct net_device_core_stats __percpu *stats = READ_ONCE(dev->core_stats);
        unsigned long __percpu *counter;

        if (unlikely(!stats)) {
                stats = netdev_core_stats_alloc(dev);
                if (!stats)
                        return;
        }

        counter = (unsigned long __percpu *)((void __percpu *)stats + offset);
        this_cpu_inc(*counter);
}
EXPORT_SYMBOL_GPL(netdev_core_stats_inc);

/**
 *        dev_get_stats        - get network device statistics
 *        @dev: device to get statistics from
 *        @storage: place to store stats
 *
 *        Get network statistics from device. Return @storage.
 *        The device driver may provide its own method by setting
 *        dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *        otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        const struct net_device_core_stats __percpu *p;

        /*
         * IPv{4,6} and udp tunnels share common stat helpers and use
         * different stat type (NETDEV_PCPU_STAT_TSTATS vs
         * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
         */
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
                     offsetof(struct pcpu_dstats, rx_bytes));
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
                     offsetof(struct pcpu_dstats, rx_packets));
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
                     offsetof(struct pcpu_dstats, tx_bytes));
        BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
                     offsetof(struct pcpu_dstats, tx_packets));

        if (ops->ndo_get_stats64) {
                memset(storage, 0, sizeof(*storage));
                ops->ndo_get_stats64(dev, storage);
        } else if (ops->ndo_get_stats) {
                netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
        } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
                dev_get_tstats64(dev, storage);
        } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
                dev_get_dstats64(dev, storage);
        } else {
                netdev_stats_to_stats64(storage, &dev->stats);
        }

        /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
        p = READ_ONCE(dev->core_stats);
        if (p) {
                const struct net_device_core_stats *core_stats;
                int i;

                for_each_possible_cpu(i) {
                        core_stats = per_cpu_ptr(p, i);
                        storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
                        storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
                        storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
                        storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
                }
        }
        return storage;
}
EXPORT_SYMBOL(dev_get_stats);

/**
 *        dev_fetch_sw_netstats - get per-cpu network device statistics
 *        @s: place to store stats
 *        @netstats: per-cpu network stats to read from
 *
 *        Walk every possible CPU, take a consistent snapshot of its
 *        packet and byte counters and add them to the related fields
 *        in @s. @s is accumulated into, not reset.
 */
void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
                           const struct pcpu_sw_netstats __percpu *netstats)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pcpu_sw_netstats *pcpu = per_cpu_ptr(netstats, cpu);
                u64 rx_pkts, rx_octets, tx_pkts, tx_octets;
                unsigned int seq;

                /* Re-read until the sequence shows no concurrent update */
                for (;;) {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts   = u64_stats_read(&pcpu->rx_packets);
                        rx_octets = u64_stats_read(&pcpu->rx_bytes);
                        tx_pkts   = u64_stats_read(&pcpu->tx_packets);
                        tx_octets = u64_stats_read(&pcpu->tx_bytes);
                        if (!u64_stats_fetch_retry(&pcpu->syncp, seq))
                                break;
                }

                s->rx_packets += rx_pkts;
                s->rx_bytes   += rx_octets;
                s->tx_packets += tx_pkts;
                s->tx_bytes   += tx_octets;
        }
}
EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);

/**
 *        dev_get_tstats64 - ndo_get_stats64 implementation
 *        @dev: device to get statistics from
 *        @s: place to store stats
 *
 *        Populate @s from dev->stats, then add the per-cpu counters found in
 *        dev->tstats on top of the rx/tx packet and byte fields. Can be used
 *        as ndo_get_stats64() callback.
 */
void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
{
        /* Start from the counters kept in dev->stats ... */
        netdev_stats_to_stats64(s, &dev->stats);
        /* ... then accumulate the per-cpu software counters into @s. */
        dev_fetch_sw_netstats(s, dev->tstats);
}
EXPORT_SYMBOL_GPL(dev_get_tstats64);

/*
 * Return the ingress queue of @dev.
 *
 * With CONFIG_NET_CLS_ACT a missing queue is allocated on demand, set up
 * with noop_qdisc for both qdisc pointers and published through
 * dev->ingress_queue; NULL is returned when that allocation fails.
 * Without CONFIG_NET_CLS_ACT the current value (possibly NULL) is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
        struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
        if (!ingress) {
                ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
                if (ingress) {
                        netdev_init_one_queue(dev, ingress, NULL);
                        RCU_INIT_POINTER(ingress->qdisc, &noop_qdisc);
                        RCU_INIT_POINTER(ingress->qdisc_sleeping, &noop_qdisc);
                        /* Publish only once the queue is fully set up. */
                        rcu_assign_pointer(dev->ingress_queue, ingress);
                }
        }
#endif
        return ingress;
}

/* Placeholder ops installed by alloc_netdev_mqs() when setup() provided none. */
static const struct ethtool_ops default_ethtool_ops;

/*
 * Install @ops as the ethtool operations of @dev, but only while the device
 * still points at the built-in placeholder; ops chosen explicitly by the
 * driver are left alone.
 */
void netdev_set_default_ethtool_ops(struct net_device *dev,
                                    const struct ethtool_ops *ops)
{
        if (dev->ethtool_ops != &default_ethtool_ops)
                return;

        dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);

/**
 * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
 * @dev: netdev to enable the IRQ coalescing on
 *
 * Applies conservative software IRQ coalescing defaults (GRO flush timeout
 * of 20000 and one deferred hard IRQ). Nothing is changed on PREEMPT_RT
 * kernels. Warns if @dev is already registered. Users can use sysfs
 * attributes to override the default values.
 */
void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
{
        WARN_ON(dev->reg_state == NETREG_REGISTERED);

        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                return;

        netdev_set_gro_flush_timeout(dev, 20000);
        netdev_set_defer_hard_irqs(dev, 1);
}
EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);

/**
 * alloc_netdev_mqs - allocate network device
 * @sizeof_priv: size of private data to allocate space for
 * @name: device name format string
 * @name_assign_type: origin of device name
 * @setup: callback to initialize device
 * @txqs: the number of TX subqueues to allocate
 * @rxqs: the number of RX subqueues to allocate
 *
 * Allocates a struct net_device with private data area for driver use
 * and performs basic initialization.  Also allocates subqueue structs
 * for each queue on the device.
 *
 * @name must be shorter than sizeof(dev->name); a longer name triggers
 * BUG_ON(). Both @txqs and @rxqs must be at least 1.
 *
 * Return: the new device holding one reference, or NULL if a queue count
 * is zero or any allocation fails.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
                unsigned char name_assign_type,
                void (*setup)(struct net_device *),
                unsigned int txqs, unsigned int rxqs)
{
        struct net_device *dev;
        size_t napi_config_sz;
        unsigned int maxqs;

        /* sizeof() does not evaluate dev, so using it uninitialized is fine */
        BUG_ON(strlen(name) >= sizeof(dev->name));

        if (txqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
                return NULL;
        }

        if (rxqs < 1) {
                pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
                return NULL;
        }

        /* napi_config below is sized for the larger of the two queue counts */
        maxqs = max(txqs, rxqs);

        /* Device and driver private area live in one zeroed allocation */
        dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
                       GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
        if (!dev)
                return NULL;

        dev->priv_len = sizeof_priv;

        ref_tracker_dir_init(&dev->refcnt_tracker, 128, name);
        /* Either way the device starts out with a single reference */
#ifdef CONFIG_PCPU_DEV_REFCNT
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
        __dev_hold(dev);
#else
        refcount_set(&dev->dev_refcnt, 1);
#endif

        if (dev_addr_init(dev))
                goto free_pcpu;

        dev_mc_init(dev);
        dev_uc_init(dev);

        dev_net_set(dev, &init_net);

        /* Defaults that setup() below is free to override */
        dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
        dev->xdp_zc_max_segs = 1;
        dev->gso_max_segs = GSO_MAX_SEGS;
        dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
        dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
        dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
        dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
        dev->tso_max_segs = TSO_MAX_SEGS;
        dev->upper_level = 1;
        dev->lower_level = 1;
#ifdef CONFIG_LOCKDEP
        dev->nested_level = 0;
        INIT_LIST_HEAD(&dev->unlink_list);
#endif

        INIT_LIST_HEAD(&dev->napi_list);
        INIT_LIST_HEAD(&dev->unreg_list);
        INIT_LIST_HEAD(&dev->close_list);
        INIT_LIST_HEAD(&dev->link_watch_list);
        INIT_LIST_HEAD(&dev->adj_list.upper);
        INIT_LIST_HEAD(&dev->adj_list.lower);
        INIT_LIST_HEAD(&dev->ptype_all);
        INIT_LIST_HEAD(&dev->ptype_specific);
        INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
        hash_init(dev->qdisc_hash);
#endif

        mutex_init(&dev->lock);

        dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
        /* Type/driver specific initialization supplied by the caller */
        setup(dev);

        /* setup() left tx_queue_len at zero: flag the device as queue-less
         * and fall back to the default length.
         */
        if (!dev->tx_queue_len) {
                dev->priv_flags |= IFF_NO_QUEUE;
                dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
        }

        dev->num_tx_queues = txqs;
        dev->real_num_tx_queues = txqs;
        if (netif_alloc_netdev_queues(dev))
                goto free_all;

        dev->num_rx_queues = rxqs;
        dev->real_num_rx_queues = rxqs;
        if (netif_alloc_rx_queues(dev))
                goto free_all;
        dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
        if (!dev->ethtool)
                goto free_all;

        dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
        if (!dev->cfg)
                goto free_all;
        /* free_netdev() warns if cfg and cfg_pending ever differ */
        dev->cfg_pending = dev->cfg;

        napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
        dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
        if (!dev->napi_config)
                goto free_all;

        strscpy(dev->name, name);
        dev->name_assign_type = name_assign_type;
        dev->group = INIT_NETDEV_GROUP;
        /* Keep ethtool_ops non-NULL when setup() did not install any */
        if (!dev->ethtool_ops)
                dev->ethtool_ops = &default_ethtool_ops;

        nf_hook_netdev_init(dev);

        return dev;

        /* Failures after setup(): hand the whole device to free_netdev() */
free_all:
        free_netdev(dev);
        return NULL;

        /* Early failures: only the refcount storage and dev itself exist */
free_pcpu:
#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
free_dev:
#endif
        kvfree(dev);
        return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);

/*
 * Delete every NAPI instance still linked on dev->napi_list (under the
 * device lock, followed by synchronize_net()), then free the per-queue
 * NAPI configuration array. With an empty list only the array is freed.
 */
static void netdev_napi_exit(struct net_device *dev)
{
        struct napi_struct *napi, *next;

        if (list_empty(&dev->napi_list))
                goto free_config;

        netdev_lock(dev);
        list_for_each_entry_safe(napi, next, &dev->napi_list, dev_list)
                __netif_napi_del_locked(napi);
        netdev_unlock(dev);

        synchronize_net();

free_config:
        kvfree(dev->napi_config);
}

/**
 * free_netdev - free network device
 * @dev: device
 *
 * This function does the last stage of destroying an allocated device
 * interface. The reference to the device object is released. If this
 * is the last reference then it will be freed. Must be called in process
 * context.
 *
 * If @dev is still in NETREG_UNREGISTERING state the free is deferred by
 * setting dev->needs_free_netdev instead. A device that never got past
 * NETREG_UNINITIALIZED (or is NETREG_DUMMY) is freed directly with
 * kvfree(); otherwise the device must be NETREG_UNREGISTERED and the final
 * free happens through put_device().
 */
void free_netdev(struct net_device *dev)
{
        might_sleep();

        /* When called immediately after register_netdevice() failed the unwind
         * handling may still be dismantling the device. Handle that case by
         * deferring the free.
         */
        if (dev->reg_state == NETREG_UNREGISTERING) {
                ASSERT_RTNL();
                dev->needs_free_netdev = true;
                return;
        }

        /* cfg and cfg_pending are expected to alias one object here */
        WARN_ON(dev->cfg != dev->cfg_pending);
        kfree(dev->cfg);
        kfree(dev->ethtool);
        netif_free_tx_queues(dev);
        netif_free_rx_queues(dev);

        kfree(rcu_dereference_protected(dev->ingress_queue, 1));

        /* Flush device addresses */
        dev_addr_flush(dev);

        /* Delete leftover NAPI instances and free dev->napi_config */
        netdev_napi_exit(dev);

        netif_del_cpu_rmap(dev);

        ref_tracker_dir_exit(&dev->refcnt_tracker);
        /* Per-cpu pointers are cleared after freeing so stale values are
         * not left behind in the structure.
         */
#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;
#endif
        free_percpu(dev->core_stats);
        dev->core_stats = NULL;
        free_percpu(dev->xdp_bulkq);
        dev->xdp_bulkq = NULL;

        netdev_free_phy_link_topology(dev);

        mutex_destroy(&dev->lock);

        /*  Compatibility with error handling in drivers */
        if (dev->reg_state == NETREG_UNINITIALIZED ||
            dev->reg_state == NETREG_DUMMY) {
                kvfree(dev);
                return;
        }

        /* Any other state than fully unregistered is a caller bug */
        BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
        WRITE_ONCE(dev->reg_state, NETREG_RELEASED);

        /* will free via device release */
        put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 * alloc_netdev_dummy - Allocate and initialize a dummy net device.
 * @sizeof_priv: size of private data to allocate space for
 *
 * Thin wrapper around alloc_netdev() that uses the name pattern "dummy#",
 * NET_NAME_UNKNOWN as the name origin and init_dummy_netdev() as the setup
 * callback.
 *
 * Return: the allocated net_device on success, NULL otherwise
 */
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
        return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
                            init_dummy_netdev);
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);

/**
 *        synchronize_net -  Synchronize with packet receive processing
 *
 *        Wait for packets currently being received to be done.
 *        Does not block later packets from starting.
 *
 *        The expedited RCU grace period is used when running from netns
 *        cleanup or while RTNL is held; otherwise a normal grace period
 *        is waited for. May sleep.
 */
void synchronize_net(void)
{
        might_sleep();

        if (!from_cleanup_net() && !rtnl_is_locked()) {
                synchronize_rcu();
                return;
        }

        synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);

/*
 * Remove every RSS context still stored in dev->ethtool->rss_ctx.
 *
 * Under rss_lock, each context is erased from the xarray, the driver is
 * asked to drop it (remove_rxfh_context() when the driver implements
 * create_rxfh_context(), otherwise set_rxfh() with rss_delete set), and
 * the context memory is freed. The xarray itself is destroyed at the end.
 */
static void netdev_rss_contexts_free(struct net_device *dev)
{
        struct ethtool_rxfh_context *ctx;
        unsigned long context;

        mutex_lock(&dev->ethtool->rss_lock);
        xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
                /* Members not named here are zero-filled, so the driver's
                 * set_rxfh() callback never sees indeterminate stack data.
                 */
                struct ethtool_rxfh_param rxfh = {
                        .indir = ethtool_rxfh_context_indir(ctx),
                        .key = ethtool_rxfh_context_key(ctx),
                        .hfunc = ctx->hfunc,
                        .input_xfrm = ctx->input_xfrm,
                        .rss_context = context,
                        .rss_delete = true,
                };

                xa_erase(&dev->ethtool->rss_ctx, context);
                if (dev->ethtool_ops->create_rxfh_context)
                        dev->ethtool_ops->remove_rxfh_context(dev, ctx,
                                                              context, NULL);
                else
                        dev->ethtool_ops->set_rxfh(dev, &rxfh, NULL);
                kfree(ctx);
        }
        xa_destroy(&dev->ethtool->rss_ctx);
        mutex_unlock(&dev->ethtool->rss_lock);
}

/**
 *        unregister_netdevice_queue - remove device from the kernel
 *        @dev: device
 *        @head: list
 *
 *        Shut down a device interface and remove it from the kernel
 *        tables. When @head is not NULL the device is only moved to the
 *        tail of that list, to be unregistered later by the caller;
 *        otherwise it is unregistered right away.
 *
 *        Callers must hold the rtnl semaphore.  You may want
 *        unregister_netdev() instead of this.
 */

void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
        LIST_HEAD(single);

        ASSERT_RTNL();

        if (head) {
                list_move_tail(&dev->unreg_list, head);
                return;
        }

        /* No batch supplied: unregister through a private one-entry list. */
        list_add(&dev->unreg_list, &single);
        unregister_netdevice_many(&single);
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/*
 * Invoke the memory provider's uninstall() hook, where one is set, for
 * each of the device's real_num_rx_queues RX queues.
 */
static void dev_memory_provider_uninstall(struct net_device *dev)
{
        unsigned int qidx;

        for (qidx = 0; qidx < dev->real_num_rx_queues; qidx++) {
                struct netdev_rx_queue *rxq = &dev->_rx[qidx];
                struct pp_memory_provider_params *mp = &rxq->mp_params;

                if (!mp->mp_ops || !mp->mp_ops->uninstall)
                        continue;

                mp->mp_ops->uninstall(mp->mp_priv, rxq);
        }
}

/*
 * Unregister every device queued on @head (linked via dev->unreg_list).
 *
 * @head:   list of devices to unregister; list_del() is applied to it
 *          before returning
 * @portid: forwarded to rtmsg_ifinfo_build_skb() / rtmsg_ifinfo_send()
 *          for the RTM_DELLINK message
 * @nlh:    forwarded alongside @portid; may be NULL
 *
 * Must be called with RTNL held and not during the boot phase. Devices in
 * NETREG_UNINITIALIZED state are warned about and dropped from the list;
 * every remaining device must be NETREG_REGISTERED. The devices are closed,
 * unlisted, moved to NETREG_UNREGISTERING, dismantled one by one and finally
 * handed to net_set_todo().
 */
void unregister_netdevice_many_notify(struct list_head *head,
                                      u32 portid, const struct nlmsghdr *nlh)
{
        struct net_device *dev, *tmp;
        LIST_HEAD(close_head);
        int cnt = 0;

        BUG_ON(dev_boot_phase);
        ASSERT_RTNL();

        if (list_empty(head))
                return;

        list_for_each_entry_safe(dev, tmp, head, unreg_list) {
                /* Some devices call without registering
                 * for initialization unwind. Remove those
                 * devices and proceed with the remaining.
                 */
                if (dev->reg_state == NETREG_UNINITIALIZED) {
                        pr_debug("unregister_netdevice: device %s/%p never was registered\n",
                                 dev->name, dev);

                        WARN_ON(1);
                        list_del(&dev->unreg_list);
                        continue;
                }
                dev->dismantle = true;
                BUG_ON(dev->reg_state != NETREG_REGISTERED);
        }

        /* If device is running, close it first. Start with ops locked... */
        list_for_each_entry(dev, head, unreg_list) {
                if (netdev_need_ops_lock(dev)) {
                        list_add_tail(&dev->close_list, &close_head);
                        netdev_lock(dev);
                }
        }
        dev_close_many(&close_head, true);
        /* ... now unlock them and go over the rest. */
        list_for_each_entry(dev, head, unreg_list) {
                if (netdev_need_ops_lock(dev))
                        netdev_unlock(dev);
                else
                        list_add_tail(&dev->close_list, &close_head);
        }
        dev_close_many(&close_head, true);

        list_for_each_entry(dev, head, unreg_list) {
                /* And unlink it from device chain. */
                unlist_netdevice(dev);
                /* The state change is made under the device lock */
                netdev_lock(dev);
                WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
                netdev_unlock(dev);
        }
        flush_all_backlogs();

        synchronize_net();

        list_for_each_entry(dev, head, unreg_list) {
                struct sk_buff *skb = NULL;

                /* Shutdown queueing discipline. */
                dev_shutdown(dev);
                dev_tcx_uninstall(dev);
                netdev_lock_ops(dev);
                dev_xdp_uninstall(dev);
                dev_memory_provider_uninstall(dev);
                netdev_unlock_ops(dev);
                bpf_dev_bound_netdev_unregister(dev);

                netdev_offload_xstats_disable_all(dev);

                /* Notify protocols, that we are about to destroy
                 * this device. They should clean all the things.
                 */
                call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

                /* Build the RTM_DELLINK message now; it is only sent
                 * further down, after ndo_uninit() has run.
                 */
                if (!dev->rtnl_link_ops ||
                    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
                        skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
                                                     GFP_KERNEL, NULL, 0,
                                                     portid, nlh);

                /*
                 *        Flush the unicast and multicast chains
                 */
                dev_uc_flush(dev);
                dev_mc_flush(dev);

                netdev_name_node_alt_flush(dev);
                netdev_name_node_free(dev->name_node);

                netdev_rss_contexts_free(dev);

                call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);

                if (dev->netdev_ops->ndo_uninit)
                        dev->netdev_ops->ndo_uninit(dev);

                /* rss_lock was last taken by netdev_rss_contexts_free() */
                mutex_destroy(&dev->ethtool->rss_lock);

                net_shaper_flush_netdev(dev);

                if (skb)
                        rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);

                /* Notifier chain MUST detach us all upper devices. */
                WARN_ON(netdev_has_any_upper_dev(dev));
                WARN_ON(netdev_has_any_lower_dev(dev));

                /* Remove entries from kobject tree */
                netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
                /* Remove XPS queueing entries */
                netif_reset_xps_queues_gt(dev, 0);
#endif
        }

        synchronize_net();

        list_for_each_entry(dev, head, unreg_list) {
                /* Drop the dev_registered_tracker reference and queue the
                 * device for the todo processing.
                 */
                netdev_put(dev, &dev->dev_registered_tracker);
                net_set_todo(dev);
                cnt++;
        }
        atomic_add(cnt, &dev_unreg_count);

        /* Detach @head itself; callers commonly pass a stack list head */
        list_del(head);
}

/**
 *        unregister_netdevice_many - unregister many devices
 *        @head: list of devices
 *
 *        Wrapper around unregister_netdevice_many_notify() that passes a
 *        zero portid and no netlink message header.
 *
 *  Note: As most callers use a stack allocated list_head,
 *  we force a list_del() to make sure stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
        unregister_netdevice_many_notify(head, 0, NULL);
}
EXPORT_SYMBOL(unregister_netdevice_many);

/**
 *        unregister_netdev - remove device from the kernel
 *        @dev: device
 *
 *        This function shuts down a device interface and removes it
 *        from the kernel tables.
 *
 *        This is just a wrapper for unregister_netdevice that takes
 *        the rtnl semaphore.  In general you want to use this and not
 *        unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        /* Locking goes through the rtnl_net_dev_lock()/unlock() helpers */
        rtnl_net_dev_lock(dev);
        unregister_netdevice(dev);
        rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);

/*
 * Move @dev from its current network namespace to @net.
 *
 * @dev:         device to move; must be NETREG_REGISTERED and must not have
 *               netns_immutable set
 * @net:         destination namespace
 * @pat:         name pattern used only when dev->name is already taken in
 *               @net; may be NULL, in which case such a clash is an error
 * @new_ifindex: ifindex to use in @net, or 0 to keep the current one when
 *               it is free there and otherwise have a new one allocated
 * @extack:      netlink extended ack that receives the error message
 *
 * Caller must hold RTNL. All checks that can fail are performed before the
 * device is touched; after that the function behaves like a reduced
 * unregister/register cycle during which dev->reg_state stays
 * NETREG_REGISTERED.
 *
 * Returns 0 on success or when @dev already lives in @net, -EINVAL for an
 * immutable or unregistered device, -EEXIST on a name or altname clash,
 * or the negative error from dev_prep_valid_name() / dev_index_reserve().
 */
int __dev_change_net_namespace(struct net_device *dev, struct net *net,
                               const char *pat, int new_ifindex,
                               struct netlink_ext_ack *extack)
{
        struct netdev_name_node *name_node;
        struct net *net_old = dev_net(dev);
        /* Stays empty unless a replacement name has to be generated */
        char new_name[IFNAMSIZ] = {};
        int err, new_nsid;

        ASSERT_RTNL();

        /* Don't allow namespace local devices to be moved. */
        err = -EINVAL;
        if (dev->netns_immutable) {
                NL_SET_ERR_MSG(extack, "The interface netns is immutable");
                goto out;
        }

        /* Ensure the device has been registered */
        if (dev->reg_state != NETREG_REGISTERED) {
                NL_SET_ERR_MSG(extack, "The interface isn't registered");
                goto out;
        }

        /* Get out if there is nothing todo */
        err = 0;
        if (net_eq(net_old, net))
                goto out;

        /* Pick the destination device name, and ensure
         * we can use it in the destination network namespace.
         */
        err = -EEXIST;
        if (netdev_name_in_use(net, dev->name)) {
                /* We get here if we can't use the current device name */
                if (!pat) {
                        NL_SET_ERR_MSG(extack,
                                       "An interface with the same name exists in the target netns");
                        goto out;
                }
                err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
                if (err < 0) {
                        NL_SET_ERR_MSG_FMT(extack,
                                           "Unable to use '%s' for the new interface name in the target netns",
                                           pat);
                        goto out;
                }
        }
        /* Check that none of the altnames conflicts. */
        err = -EEXIST;
        netdev_for_each_altname(dev, name_node) {
                if (netdev_name_in_use(net, name_node->name)) {
                        NL_SET_ERR_MSG_FMT(extack,
                                           "An interface with the altname %s exists in the target netns",
                                           name_node->name);
                        goto out;
                }
        }

        /* Check that new_ifindex isn't used yet. */
        if (new_ifindex) {
                err = dev_index_reserve(net, new_ifindex);
                if (err < 0) {
                        NL_SET_ERR_MSG_FMT(extack,
                                           "The ifindex %d is not available in the target netns",
                                           new_ifindex);
                        goto out;
                }
        } else {
                /* If there is an ifindex conflict assign a new one */
                err = dev_index_reserve(net, dev->ifindex);
                if (err == -EBUSY)
                        err = dev_index_reserve(net, 0);
                if (err < 0) {
                        NL_SET_ERR_MSG(extack,
                                       "Unable to allocate a new ifindex in the target netns");
                        goto out;
                }
                /* A non-negative result is the reserved ifindex */
                new_ifindex = err;
        }

        /*
         * And now a mini version of register_netdevice unregister_netdevice.
         */

        netdev_lock_ops(dev);
        /* If device is running close it first. */
        netif_close(dev);
        /* And unlink it from device chain */
        unlist_netdevice(dev);
        netdev_unlock_ops(dev);

        synchronize_net();

        /* Shutdown queueing discipline. */
        dev_shutdown(dev);

        /* Notify protocols, that we are about to destroy
         * this device. They should clean all the things.
         *
         * Note that dev->reg_state stays at NETREG_REGISTERED.
         * This is wanted because this way 8021q and macvlan know
         * the device is just moving and can keep their slaves up.
         */
        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
        rcu_barrier();

        new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);

        /* Announce the removal in the old namespace, including the id of
         * the target namespace and the ifindex used there.
         */
        rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
                            new_ifindex);

        /*
         *        Flush the unicast and multicast chains
         */
        dev_uc_flush(dev);
        dev_mc_flush(dev);

        /* Send a netdev-removed uevent to the old namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
        netdev_adjacent_del_links(dev);

        /* Move per-net netdevice notifiers that are following the netdevice */
        move_netdevice_notifiers_dev_net(dev, net);

        /* Actually switch the network namespace */
        dev_net_set(dev, net);
        dev->ifindex = new_ifindex;

        if (new_name[0]) {
                /* Rename the netdev to prepared name */
                write_seqlock_bh(&netdev_rename_lock);
                strscpy(dev->name, new_name, IFNAMSIZ);
                write_sequnlock_bh(&netdev_rename_lock);
        }

        /* Fixup kobjects */
        dev_set_uevent_suppress(&dev->dev, 1);
        err = device_rename(&dev->dev, dev->name);
        dev_set_uevent_suppress(&dev->dev, 0);
        /* Past the point of no return: failures are only warned about */
        WARN_ON(err);

        /* Send a netdev-add uevent to the new namespace */
        kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
        netdev_adjacent_add_links(dev);

        /* Adapt owner in case owning user namespace of target network
         * namespace is different from the original one.
         */
        err = netdev_change_owner(dev, net_old, net);
        WARN_ON(err);

        netdev_lock_ops(dev);
        /* Add the device back in the hashes */
        list_netdevice(dev);
        /* Notify protocols, that a new device appeared. */
        call_netdevice_notifiers(NETDEV_REGISTER, dev);
        netdev_unlock_ops(dev);

        /*
         *        Prevent userspace races by waiting until the network
         *        device is fully setup before sending notifications.
         */
        rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);

        synchronize_net();
        /* Errors from the rename/owner steps above are not propagated */
        err = 0;
out:
        return err;
}

/*
 * Take over the networking softirq work of CPU @oldcpu on the CPU running
 * this function: completion queue, output queue, NAPI poll list, pending
 * RPS IPIs and the backlog packet queues. Always returns 0.
 *
 * NOTE(review): by its name this is the CPU hotplug "dead" callback; the
 * registration is not visible in this part of the file.
 */
static int dev_cpu_dead(unsigned int oldcpu)
{
        struct sk_buff **list_skb;
        struct sk_buff *skb;
        unsigned int cpu;
        struct softnet_data *sd, *oldsd, *remsd = NULL;

        /* The queue splicing below runs with local interrupts disabled */
        local_irq_disable();
        cpu = smp_processor_id();
        sd = &per_cpu(softnet_data, cpu);
        oldsd = &per_cpu(softnet_data, oldcpu);

        /* Find end of our completion_queue. */
        list_skb = &sd->completion_queue;
        while (*list_skb)
                list_skb = &(*list_skb)->next;
        /* Append completion queue from offline CPU. */
        *list_skb = oldsd->completion_queue;
        oldsd->completion_queue = NULL;

        /* Append output queue from offline CPU. */
        if (oldsd->output_queue) {
                *sd->output_queue_tailp = oldsd->output_queue;
                sd->output_queue_tailp = oldsd->output_queue_tailp;
                oldsd->output_queue = NULL;
                /* Leave the old queue empty, tail pointing at its head */
                oldsd->output_queue_tailp = &oldsd->output_queue;
        }
        /* Append NAPI poll list from offline CPU, with one exception :
         * process_backlog() must be called by cpu owning percpu backlog.
         * We properly handle process_queue & input_pkt_queue later.
         */
        while (!list_empty(&oldsd->poll_list)) {
                struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
                                                            struct napi_struct,
                                                            poll_list);

                list_del_init(&napi->poll_list);
                if (napi->poll == process_backlog)
                        /* Backlog NAPI is not migrated: clear every state
                         * bit except NAPIF_STATE_THREADED.
                         */
                        napi->state &= NAPIF_STATE_THREADED;
                else
                        ____napi_schedule(sd, napi);
        }

        /* Have this CPU process the queues just inherited */
        raise_softirq_irqoff(NET_TX_SOFTIRQ);
        local_irq_enable();

        if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
                remsd = oldsd->rps_ipi_list;
                oldsd->rps_ipi_list = NULL;
#endif
                /* send out pending IPI's on offline CPU */
                net_rps_send_ipi(remsd);
        }

        /* Re-inject the offline CPU's pending packets via netif_rx():
         * first its process_queue, then its input_pkt_queue.
         */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
                netif_rx(skb);
                rps_input_queue_head_incr(oldsd);
        }
        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
                netif_rx(skb);
                rps_input_queue_head_incr(oldsd);
        }

        return 0;
}

/**
 *        netdev_increment_features - increment feature set by one
 *        @all: current feature set
 *        @one: new feature set
 *        @mask: mask feature set
 *
 *        Computes the feature set of a master device currently at @all
 *        after a device with feature set @one is added to it. Features
 *        that are off in @mask are never enabled, except that
 *        NETIF_F_VLAN_CHALLENGED is always let through and a @mask
 *        containing NETIF_F_HW_CSUM admits the whole checksum group.
 *        Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
        netdev_features_t one, netdev_features_t mask)
{
        netdev_features_t allowed = mask | NETIF_F_VLAN_CHALLENGED;
        netdev_features_t merged;

        if (mask & NETIF_F_HW_CSUM)
                allowed |= NETIF_F_CSUM_MASK;

        /* One-for-all and checksum bits are OR-ed in, subject to the mask */
        merged = all | (one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & allowed);
        /* All-for-all bits survive only if the new device has them too */
        merged &= one | ~NETIF_F_ALL_FOR_ALL;

        /* If one device supports hw checksumming, set for all. */
        if (merged & NETIF_F_HW_CSUM)
                merged &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

        return merged;
}
EXPORT_SYMBOL(netdev_increment_features);

/*
 * Allocate a table of NETDEV_HASHENTRIES hash list heads and initialise
 * each one as empty. Returns NULL if the allocation fails.
 */
static struct hlist_head * __net_init netdev_create_hash(void)
{
        struct hlist_head *table;
        int bucket;

        table = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*table), GFP_KERNEL);
        if (!table)
                return NULL;

        for (bucket = 0; bucket < NETDEV_HASHENTRIES; bucket++)
                INIT_HLIST_HEAD(&table[bucket]);

        return table;
}

/*
 * Initialize per network namespace state: the device list head, the
 * name and ifindex hash tables, the ifindex xarray and the notifier
 * chain. Returns 0 on success or -ENOMEM if a hash table cannot be
 * allocated.
 */
static int __net_init netdev_init(struct net *net)
{
        BUILD_BUG_ON(GRO_HASH_BUCKETS >
                     BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));

        INIT_LIST_HEAD(&net->dev_base_head);

        net->dev_name_head = netdev_create_hash();
        if (!net->dev_name_head)
                return -ENOMEM;

        net->dev_index_head = netdev_create_hash();
        if (!net->dev_index_head) {
                /* Undo the first allocation before bailing out. */
                kfree(net->dev_name_head);
                return -ENOMEM;
        }

        xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);

        RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);

        return 0;
}

/**
 *        netdev_drivername - network driver for the device
 *        @dev: network device
 *
 *        Return the name of the driver bound to the device's parent, or
 *        an empty string when there is no parent, no driver, or the
 *        driver has no name.
 */
const char *netdev_drivername(const struct net_device *dev)
{
        const struct device *parent = dev->dev.parent;
        const struct device_driver *drv;
        const char *none = "";

        if (!parent)
                return none;

        drv = parent->driver;
        if (!drv || !drv->name)
                return none;

        return drv->name;
}

/*
 * Common backend of the netdev_*() logging helpers.
 *
 * A NULL @dev is logged with a "(NULL net_device)" tag. A device without
 * a parent is logged through printk() with its name and registration
 * state. Otherwise the message goes through dev_printk_emit() on the
 * parent, prefixed with the parent's driver string and device name; the
 * numeric log level is taken from the second character of @level.
 */
static void __netdev_printk(const char *level, const struct net_device *dev,
                            struct va_format *vaf)
{
        if (!dev) {
                printk("%s(NULL net_device): %pV", level, vaf);
                return;
        }

        if (!dev->dev.parent) {
                printk("%s%s%s: %pV",
                       level, netdev_name(dev), netdev_reg_state(dev), vaf);
                return;
        }

        dev_printk_emit(level[1] - '0',
                        dev->dev.parent,
                        "%s %s %s%s: %pV",
                        dev_driver_string(dev->dev.parent),
                        dev_name(dev->dev.parent),
                        netdev_name(dev), netdev_reg_state(dev),
                        vaf);
}

void netdev_printk(const char *level, const struct net_device *dev,
                   const char *format, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, format);

        vaf.fmt = format;
        vaf.va = &args;

        __netdev_printk(level, dev, &vaf);

        va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

/*
 * Generate and export a fixed-level logging function named @func that
 * forwards a printf-style message for a net_device to __netdev_printk()
 * using the printk level string @level. One function is instantiated
 * below for each level from KERN_EMERG to KERN_INFO.
 */
#define define_netdev_printk_level(func, level)                        \
void func(const struct net_device *dev, const char *fmt, ...)        \
{                                                                \
        struct va_format vaf;                                        \
        va_list args;                                                \
                                                                \
        va_start(args, fmt);                                        \
                                                                \
        vaf.fmt = fmt;                                                \
        vaf.va = &args;                                                \
                                                                \
        __netdev_printk(level, dev, &vaf);                        \
                                                                \
        va_end(args);                                                \
}                                                                \
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

/*
 * Per-namespace exit hook (see netdev_net_ops).
 *
 * Frees net->dev_name_head and net->dev_index_head and destroys the
 * dev_by_index xarray.  For every namespace except init_net, warns once
 * if the namespace's device list is not empty at this point.
 */
static void __net_exit netdev_exit(struct net *net)
{
        kfree(net->dev_name_head);
        kfree(net->dev_index_head);
        xa_destroy(&net->dev_by_index);

        /* The list check is skipped entirely for init_net. */
        WARN_ON_ONCE(net != &init_net && !list_empty(&net->dev_base_head));
}

/*
 * Per-network-namespace operations for the core netdev state: netdev_init()
 * as the init hook and netdev_exit() as the exit hook.  Registered from
 * net_dev_init() with register_pernet_subsys().
 */
static struct pernet_operations __net_initdata netdev_net_ops = {
        .init = netdev_init,
        .exit = netdev_exit,
};

/*
 * Move every migratable device of @net back into init_net.
 *
 * Must be called with RTNL held (asserted below).  Devices flagged
 * netns_immutable are left alone, and so are devices that have
 * rtnl_link_ops without netns_refund set.  A failed move is fatal: it is
 * logged at emergency level and followed by BUG().
 */
static void __net_exit default_device_exit_net(struct net *net)
{
        struct netdev_name_node *alt, *alt_next;
        struct net_device *dev, *next_dev;

        ASSERT_RTNL();

        for_each_netdev_safe(net, dev, next_dev) {
                char fallback[IFNAMSIZ];
                int rc;

                /* Unmoveable devices (i.e. loopback) stay where they are. */
                if (dev->netns_immutable)
                        continue;

                /* Virtual devices are left for the generic cleanup. */
                if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
                        continue;

                /*
                 * Everything else goes to init_net.  Try "dev<ifindex>"
                 * first; if init_net already uses that name, pass the
                 * literal pattern "dev%d" instead.
                 */
                snprintf(fallback, sizeof(fallback), "dev%d", dev->ifindex);
                if (netdev_name_in_use(&init_net, fallback))
                        snprintf(fallback, sizeof(fallback), "dev%%d");

                /* Drop alternative names that are already in use in init_net. */
                netdev_for_each_altname_safe(dev, alt, alt_next) {
                        if (netdev_name_in_use(&init_net, alt->name))
                                __netdev_name_node_alt_destroy(alt);
                }

                rc = dev_change_net_namespace(dev, &init_net, fallback);
                if (!rc)
                        continue;

                pr_emerg("%s: failed to move %s to init_net: %d\n",
                         __func__, dev->name, rc);
                BUG();
        }
}

/*
 * Batched exit hook for a list of dying network namespaces.
 *
 * At exit all network devices must be removed from a network namespace.
 * This is done in the reverse order of registration, and across as many
 * namespaces as possible in one go to improve batching efficiency.
 *
 * Pass 1 pushes migratable devices back to init_net; pass 2 queues every
 * remaining device for unregistration; a single
 * unregister_netdevice_many() call then processes the queue.  The whole
 * sequence runs under RTNL.
 */
static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
        LIST_HEAD(kill_list);
        struct net_device *dev;
        struct net *net;

        rtnl_lock();

        list_for_each_entry(net, net_list, exit_list) {
                default_device_exit_net(net);
                cond_resched();
        }

        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        const struct rtnl_link_ops *ops = dev->rtnl_link_ops;

                        /* Use the link type's own dellink when it has one. */
                        if (!ops || !ops->dellink)
                                unregister_netdevice_queue(dev, &kill_list);
                        else
                                ops->dellink(dev, &kill_list);
                }
        }

        unregister_netdevice_many(&kill_list);
        rtnl_unlock();
}

/*
 * Per-namespace device cleanup: only a batched exit hook is provided.
 * Registered from net_dev_init() with register_pernet_device(), after
 * loopback_net_ops.
 */
static struct pernet_operations __net_initdata default_device_ops = {
        .exit_batch = default_device_exit_batch,
};

/*
 * Layout checks for the hot-path field groups of struct net_device.
 *
 * Each CACHELINE_ASSERT_GROUP_MEMBER() names a field expected to belong to
 * the given group (net_device_read_tx, net_device_read_txrx or
 * net_device_read_rx), and each CACHELINE_ASSERT_GROUP_SIZE() states the
 * expected size of that group.  Fields that only exist under a config
 * option are checked under the same #ifdef.
 *
 * NOTE(review): the assertions are presumably evaluated at build time;
 * confirm against the macro definitions.  Called once from net_dev_init().
 */
static void __init net_dev_struct_check(void)
{
        /* TX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
#ifdef CONFIG_XPS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
#endif
#ifdef CONFIG_NETFILTER_EGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
#endif
#ifdef CONFIG_NET_XGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
#endif
        /* Expected size of the TX group. */
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);

        /* TXRX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
        /* Expected size of the TXRX group. */
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);

        /* RX read-mostly hotpath */
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
#ifdef CONFIG_NETPOLL
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
#endif
#ifdef CONFIG_NET_XGRESS
        CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
#endif
        /* Expected size of the RX group. */
        CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
}

/*
 *        Initialize the DEV module. At boot time this walks the device list and
 *        unhooks any devices that fail to initialise (normally hardware not
 *        present) and leaves us with a valid list of present and active devices.
 *
 */

/* 1 MiB worth of pages per CPU, i.e. 256 pages when PAGE_SHIFT is 12. */
#define SYSTEM_PERCPU_PAGE_POOL_SIZE        ((1 << 20) / PAGE_SIZE)

/*
 * Create the system page pool for CPU @cpuid and register it with XDP.
 *
 * With CONFIG_PAGE_POOL disabled this does nothing and returns 0.
 * Otherwise the pool is created on the memory node reported by
 * cpu_to_mem(@cpuid), registered through xdp_reg_page_pool() and stored in
 * the per-CPU system_page_pool slot.
 *
 * Returns 0 on success, -ENOMEM if the pool could not be created, or the
 * error from xdp_reg_page_pool() (the pool is destroyed in that case).
 */
static int net_page_pool_create(int cpuid)
{
#if IS_ENABLED(CONFIG_PAGE_POOL)
        struct page_pool_params params = {
                .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
                .flags = PP_FLAG_SYSTEM_POOL,
                .nid = cpu_to_mem(cpuid),
        };
        struct page_pool *pool;
        int rc;

        pool = page_pool_create_percpu(&params, cpuid);
        if (IS_ERR(pool))
                return -ENOMEM;

        rc = xdp_reg_page_pool(pool);
        if (!rc) {
                per_cpu(system_page_pool, cpuid) = pool;
                return 0;
        }

        /* Registration failed: do not leave the pool behind. */
        page_pool_destroy(pool);
        return rc;
#else
        return 0;
#endif
}

/*
 * thread_should_run callback of backlog_threads: reports whether
 * NAPI_STATE_SCHED_THREADED is set on the backlog NAPI of @cpu.
 */
static int backlog_napi_should_run(unsigned int cpu)
{
        struct napi_struct *backlog = &per_cpu_ptr(&softnet_data, cpu)->backlog;

        return test_bit(NAPI_STATE_SCHED_THREADED, &backlog->state);
}

/*
 * thread_fn callback of backlog_threads: runs the threaded NAPI poll loop
 * on the backlog NAPI of @cpu.
 */
static void run_backlog_napi(unsigned int cpu)
{
        napi_threaded_poll_loop(&per_cpu_ptr(&softnet_data, cpu)->backlog);
}

/*
 * setup callback of backlog_threads: binds the backlog NAPI of @cpu to the
 * task read from the per-CPU backlog_napi slot and flags it as threaded.
 */
static void backlog_napi_setup(unsigned int cpu)
{
        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);

        sd->backlog.thread = this_cpu_read(backlog_napi);
        set_bit(NAPI_STATE_THREADED, &sd->backlog.state);
}

/*
 * Per-CPU "backlog_napi/<cpu>" thread description.  Registered from
 * net_dev_init() through smpboot_register_percpu_thread() when
 * use_backlog_threads() is true.
 */
static struct smp_hotplug_thread backlog_threads = {
        .store                        = &backlog_napi,
        .thread_should_run        = backlog_napi_should_run,
        .thread_fn                = run_backlog_napi,
        .thread_comm                = "backlog_napi/%u",
        .setup                        = backlog_napi_setup,
};

/*
 *       Boot-time initialisation of the core network device layer.
 *
 *       This is called single threaded during boot, so no need
 *       to take the rtnl semaphore.
 *
 *       Returns 0 on success or -ENOMEM if any setup step fails.  On
 *       failure the per-CPU system page pools created so far are
 *       unregistered and destroyed; nothing else is rolled back here.
 */
static int __init net_dev_init(void)
{
        int i, rc = -ENOMEM;

        BUG_ON(!dev_boot_phase);

        net_dev_struct_check();

        if (dev_proc_init())
                goto out;

        if (netdev_kobject_init())
                goto out;

        /* Start with empty packet-type handler lists. */
        for (i = 0; i < PTYPE_HASH_SIZE; i++)
                INIT_LIST_HEAD(&ptype_base[i]);

        if (register_pernet_subsys(&netdev_net_ops))
                goto out;

        /*
         *        Initialise the packet receive queues.
         */

        flush_backlogs_fallback = flush_backlogs_alloc();
        if (!flush_backlogs_fallback)
                goto out;

        /* Set up the softnet_data of every possible CPU. */
        for_each_possible_cpu(i) {
                struct softnet_data *sd = &per_cpu(softnet_data, i);

                skb_queue_head_init(&sd->input_pkt_queue);
                skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
                skb_queue_head_init(&sd->xfrm_backlog);
#endif
                INIT_LIST_HEAD(&sd->poll_list);
                sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
                INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
                sd->cpu = i;
#endif
                INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
                spin_lock_init(&sd->defer_lock);

                /* The backlog NAPI polls with process_backlog(). */
                gro_init(&sd->backlog.gro);
                sd->backlog.poll = process_backlog;
                sd->backlog.weight = weight_p;
                INIT_LIST_HEAD(&sd->backlog.poll_list);

                if (net_page_pool_create(i))
                        goto out;
        }
        if (use_backlog_threads())
                smpboot_register_percpu_thread(&backlog_threads);

        dev_boot_phase = 0;

        /* The loopback device is special: if any other network device
         * is present in a network namespace, the loopback device must
         * be present too. Since we now dynamically allocate and free the
         * loopback device, ensure this invariant is maintained by
         * keeping the loopback device as the first device on the
         * list of network devices. This ensures the loopback device
         * is the first device that appears and the last network device
         * that disappears.
         */
        if (register_pernet_device(&loopback_net_ops))
                goto out;

        if (register_pernet_device(&default_device_ops))
                goto out;

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);

        /* A failure here only warns; initialisation still succeeds. */
        rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
                                       NULL, dev_cpu_dead);
        WARN_ON(rc < 0);
        rc = 0;

        /* avoid static key IPIs to isolated CPUs */
        if (housekeeping_enabled(HK_TYPE_MISC))
                net_enable_timestamp();
out:
        /* On error, tear down whatever system page pools were created. */
        if (rc < 0) {
                for_each_possible_cpu(i) {
                        struct page_pool *pp_ptr;

                        pp_ptr = per_cpu(system_page_pool, i);
                        if (!pp_ptr)
                                continue;

                        xdp_unreg_page_pool(pp_ptr);
                        page_pool_destroy(pp_ptr);
                        per_cpu(system_page_pool, i) = NULL;
                }
        }

        return rc;
}

subsys_initcall(net_dev_init);


















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/* SPDX-License-Identifier: GPL-2.0 */
/*
 *        Routines to manage notifier chains for passing status changes to any
 *        interested routines. We need this instead of hard coded call lists so
 *        that modules can poke their nose into the innards. The network devices
 *        needed them so here they are for the rest of you.
 *
 *                                Alan Cox <Alan.Cox@linux.org>
 */
 
#ifndef _LINUX_NOTIFIER_H
#define _LINUX_NOTIFIER_H
#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/srcu.h>

/*
 * Notifier chains are of four types:
 *
 *        Atomic notifier chains: Chain callbacks run in interrupt/atomic
 *                context. Callouts are not allowed to block.
 *        Blocking notifier chains: Chain callbacks run in process context.
 *                Callouts are allowed to block.
 *        Raw notifier chains: There are no restrictions on callbacks,
 *                registration, or unregistration.  All locking and protection
 *                must be provided by the caller.
 *        SRCU notifier chains: A variant of blocking notifier chains, with
 *                the same restrictions.
 *
 * atomic_notifier_chain_register() may be called from an atomic context,
 * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
 * must be called from a process context.  Ditto for the corresponding
 * _unregister() routines.
 *
 * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
 * and srcu_notifier_chain_unregister() _must not_ be called from within
 * the call chain.
 *
 * SRCU notifier chains are an alternative form of blocking notifier chains.
 * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
 * protection of the chain links.  This means there is _very_ low overhead
 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
 * SRCU notifier chains should be used when the chain will be called very
 * often but notifier_blocks will seldom be removed.
 */

struct notifier_block;

/*
 * Notifier callback: receives the block it was registered with, an action
 * code and an opaque data pointer, and returns an int (see the NOTIFY_*
 * values in this header).
 */
typedef        int (*notifier_fn_t)(struct notifier_block *nb,
                        unsigned long action, void *data);

/* One entry of a notifier chain. */
struct notifier_block {
        notifier_fn_t notifier_call;        /* callback for this entry */
        struct notifier_block __rcu *next;        /* next entry in the chain */
        int priority;        /* presumably orders entries within a chain; confirm in the chain implementation */
};

/* Head of an atomic notifier chain: a spinlock plus the first entry. */
struct atomic_notifier_head {
        spinlock_t lock;
        struct notifier_block __rcu *head;
};

/* Head of a blocking notifier chain: an rw-semaphore plus the first entry. */
struct blocking_notifier_head {
        struct rw_semaphore rwsem;
        struct notifier_block __rcu *head;
};

/*
 * Head of a raw notifier chain: only the first entry.  All locking and
 * protection must be provided by the caller.
 */
struct raw_notifier_head {
        struct notifier_block __rcu *head;
};

/*
 * Head of an SRCU notifier chain: a mutex, the SRCU state (usage and
 * srcu_struct) that protects the chain links, and the first entry.
 */
struct srcu_notifier_head {
        struct mutex mutex;
        struct srcu_usage srcuu;
        struct srcu_struct srcu;
        struct notifier_block __rcu *head;
};

/*
 * Run-time initialisers.  Each takes a pointer to the head, initialises
 * its lock (if it has one) and sets the chain to empty.
 */
#define ATOMIC_INIT_NOTIFIER_HEAD(name) do {        \
                spin_lock_init(&(name)->lock);        \
                (name)->head = NULL;                \
        } while (0)
#define BLOCKING_INIT_NOTIFIER_HEAD(name) do {        \
                init_rwsem(&(name)->rwsem);        \
                (name)->head = NULL;                \
        } while (0)
#define RAW_INIT_NOTIFIER_HEAD(name) do {        \
                (name)->head = NULL;                \
        } while (0)

/*
 * srcu_notifier_heads must be cleaned up dynamically.
 *
 * srcu_init_notifier_head() initialises @nh at run time;
 * srcu_cleanup_notifier_head() takes a pointer to the head and releases
 * its srcu_struct via cleanup_srcu_struct().
 *
 * NOTE(review): the cleanup macro expands with its own trailing ';', so
 * "srcu_cleanup_notifier_head(x);" yields an extra empty statement and the
 * macro cannot be used as the unbraced body of an if/else.
 */
extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
#define srcu_cleanup_notifier_head(name)        \
                cleanup_srcu_struct(&(name)->srcu);

/*
 * Static initialisers.  @name is the head variable itself (not a pointer);
 * the result is a brace-enclosed initialiser with an empty chain.
 */
#define ATOMIC_NOTIFIER_INIT(name) {                                \
                .lock = __SPIN_LOCK_UNLOCKED(name.lock),        \
                .head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) {                                \
                .rwsem = __RWSEM_INITIALIZER((name).rwsem),        \
                .head = NULL }
#define RAW_NOTIFIER_INIT(name)        {                                \
                .head = NULL }

/*
 * Static initialiser for an SRCU head.  @pcpu is passed through as the
 * third argument of __SRCU_STRUCT_INIT().
 */
#define SRCU_NOTIFIER_INIT(name, pcpu)                                \
        {                                                        \
                .mutex = __MUTEX_INITIALIZER(name.mutex),        \
                .head = NULL,                                        \
                .srcuu = __SRCU_USAGE_INIT(name.srcuu),                \
                .srcu = __SRCU_STRUCT_INIT(name.srcu, name.srcuu, pcpu), \
        }

/*
 * Define a statically initialised, empty notifier head called @name.
 * No storage class is supplied, so the caller may prefix one.
 */
#define ATOMIC_NOTIFIER_HEAD(name)                                \
        struct atomic_notifier_head name =                        \
                ATOMIC_NOTIFIER_INIT(name)
#define BLOCKING_NOTIFIER_HEAD(name)                                \
        struct blocking_notifier_head name =                        \
                BLOCKING_NOTIFIER_INIT(name)
#define RAW_NOTIFIER_HEAD(name)                                        \
        struct raw_notifier_head name =                                \
                RAW_NOTIFIER_INIT(name)

/*
 * Define a statically initialised SRCU notifier head called @name, with
 * @mod as its storage-class prefix (may be empty).
 *
 * With CONFIG_TREE_SRCU a static per-CPU struct srcu_data named
 * <name>_head_srcu_data is defined as well and handed to
 * SRCU_NOTIFIER_INIT(); otherwise @name itself is passed in that slot.
 */
#ifdef CONFIG_TREE_SRCU
#define _SRCU_NOTIFIER_HEAD(name, mod)                                \
        static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \
        mod struct srcu_notifier_head name =                        \
                        SRCU_NOTIFIER_INIT(name, name##_head_srcu_data)

#else
#define _SRCU_NOTIFIER_HEAD(name, mod)                                \
        mod struct srcu_notifier_head name =                        \
                        SRCU_NOTIFIER_INIT(name, name)

#endif

/* Externally visible SRCU notifier head. */
#define SRCU_NOTIFIER_HEAD(name)                                \
        _SRCU_NOTIFIER_HEAD(name, /* not static */)

/* File-local (static) SRCU notifier head. */
#define SRCU_NOTIFIER_HEAD_STATIC(name)                                \
        _SRCU_NOTIFIER_HEAD(name, static)

#ifdef __KERNEL__

/*
 * Registration: add @nb to the chain headed by @nh.  The atomic variant
 * may be called from atomic context; the blocking and SRCU variants must
 * be called from process context.
 */
extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
                struct notifier_block *nb);
extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh,
                struct notifier_block *nb);
extern int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *nb);
extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh,
                struct notifier_block *nb);

/*
 * Registration variants; presumably they refuse a block whose priority is
 * already present in the chain. TODO confirm against the implementation.
 */
extern int atomic_notifier_chain_register_unique_prio(
                struct atomic_notifier_head *nh, struct notifier_block *nb);
extern int blocking_notifier_chain_register_unique_prio(
                struct blocking_notifier_head *nh, struct notifier_block *nb);

/*
 * Unregistration: same context rules as registration.  The atomic,
 * blocking and SRCU variants must not be called from within the call
 * chain.
 */
extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
                struct notifier_block *nb);
extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh,
                struct notifier_block *nb);
extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh,
                struct notifier_block *nb);
extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh,
                struct notifier_block *nb);

/* Invoke the chain headed by @nh with action value @val and data @v. */
extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
                unsigned long val, void *v);
extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh,
                unsigned long val, void *v);
extern int raw_notifier_call_chain(struct raw_notifier_head *nh,
                unsigned long val, void *v);
extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh,
                unsigned long val, void *v);

/*
 * "Robust" call variants taking two action values.  Presumably @val_up is
 * delivered first and @val_down is used to roll back after a failure.
 * TODO confirm against the implementation.
 */
extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);
extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh,
                unsigned long val_up, unsigned long val_down, void *v);

/* Emptiness query for an atomic chain. */
extern bool atomic_notifier_call_chain_is_empty(struct atomic_notifier_head *nh);

/* Return values of notifier callbacks. */
#define NOTIFY_DONE                0x0000                /* Don't care */
#define NOTIFY_OK                0x0001                /* Suits me */
#define NOTIFY_STOP_MASK        0x8000                /* Don't call further */
/* Bad/Veto action: stops the chain. */
#define NOTIFY_BAD                (NOTIFY_STOP_MASK|0x0002)
/* Clean way to return from the notifier and stop further calls. */
#define NOTIFY_STOP                (NOTIFY_OK|NOTIFY_STOP_MASK)

/*
 * Wrap a (negative) errno value into a notifier return value.
 *
 * Zero maps to NOTIFY_OK.  Any other value is stored as NOTIFY_OK - err
 * with NOTIFY_STOP_MASK set, so -EPERM encodes to exactly NOTIFY_BAD.
 */
static inline int notifier_from_errno(int err)
{
        return err ? (NOTIFY_STOP_MASK | (NOTIFY_OK - err)) : NOTIFY_OK;
}

/*
 * Recover the (negative) errno value from a notifier return value.
 *
 * The stop bit is ignored; anything that then does not exceed NOTIFY_OK
 * carries no error and yields 0.
 */
static inline int notifier_to_errno(int ret)
{
        int code = ret & ~NOTIFY_STOP_MASK;

        if (code <= NOTIFY_OK)
                return 0;

        return NOTIFY_OK - code;
}

/*
 *        Declared notifiers so far. I can imagine quite a few more chains
 *        over time (eg laptop power reset chains, reboot chain (to clean 
 *        device units up), device [un]mount chain, module load/unload chain,
 *        low memory chain, screenblank chain (for plug in modular screenblankers) 
 *        VC switch chains (for loadable kernel svgalib VC switch helpers) etc...
 */
 
/* CPU notifiers are defined in include/linux/cpu.h. */

/* netdevice notifiers are defined in include/linux/netdevice.h */

/* reboot notifiers are defined in include/linux/reboot.h. */

/* Hibernation and suspend events are defined in include/linux/suspend.h. */

/* Virtual Terminal events are defined in include/linux/vt.h. */

/* Netlink notifier event. */
#define NETLINK_URELEASE        0x0001        /* Unicast netlink socket released */

/*
 * Console keyboard events.
 * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and
 * KBD_KEYSYM.
 */
#define KBD_KEYCODE                0x0001 /* Keyboard keycode, called before any other */
#define KBD_UNBOUND_KEYCODE        0x0002 /* Keyboard keycode which is not bound to any other */
#define KBD_UNICODE                0x0003 /* Keyboard unicode */
#define KBD_KEYSYM                0x0004 /* Keyboard keysym */
#define KBD_POST_KEYSYM                0x0005 /* Called after keyboard keysym interpretation */

#endif /* __KERNEL__ */
#endif /* _LINUX_NOTIFIER_H */

































































































    4 

    4 











































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __BEN_VLAN_802_1Q_INC__
#define __BEN_VLAN_802_1Q_INC__

#include <linux/if_vlan.h>
#include <linux/u64_stats_sync.h>
#include <linux/list.h>

/* The per-protocol device table is split into VLAN_GROUP_ARRAY_SPLIT_PARTS
 * parts of VLAN_GROUP_ARRAY_PART_LEN entries each, together covering all
 * VLAN_N_VID identifiers.
 *
 * If this changes, the algorithm will have to be reworked because it
 * depends on completely exhausting the VLAN identifier space.  Thus
 * it gives constant time look-up, but in many cases it wastes memory.
 */
#define VLAN_GROUP_ARRAY_SPLIT_PARTS  8
#define VLAN_GROUP_ARRAY_PART_LEN     (VLAN_N_VID/VLAN_GROUP_ARRAY_SPLIT_PARTS)

/*
 * Index of a VLAN protocol in vlan_group.vlan_devices_arrays; produced by
 * vlan_proto_idx().  VLAN_PROTO_NUM is the number of protocols, not a
 * protocol itself.
 */
enum vlan_protos {
        VLAN_PROTO_8021Q        = 0,
        VLAN_PROTO_8021AD,
        VLAN_PROTO_NUM,
};

/*
 * VLAN devices of one group, looked up by protocol index and VLAN id.
 *
 * vlan_devices_arrays[proto][part] points to an array of
 * VLAN_GROUP_ARRAY_PART_LEN device pointers, or is NULL (readers such as
 * __vlan_group_get_device() check for that).
 */
struct vlan_group {
        unsigned int                nr_vlan_devs;
        struct hlist_node        hlist;        /* linked list */
        struct net_device **vlan_devices_arrays[VLAN_PROTO_NUM]
                                               [VLAN_GROUP_ARRAY_SPLIT_PARTS];
};

/*
 * VLAN state attached to a real device; vlan_find_dev() reaches it through
 * real_dev->vlan_info under RCU or RTNL.
 */
struct vlan_info {
        struct net_device        *real_dev; /* The ethernet(like) device
                                            * the vlan is attached to.
                                            */
        struct vlan_group        grp;        /* per-protocol/VID device table */
        struct list_head        vid_list;
        unsigned int                nr_vids;
        struct rcu_head                rcu;
};

/*
 * Map a VLAN ethertype (network byte order) to its enum vlan_protos index.
 *
 * Returns VLAN_PROTO_8021Q or VLAN_PROTO_8021AD for the two supported
 * ethertypes.  Any other value triggers a WARN and yields -EINVAL.
 */
static inline int vlan_proto_idx(__be16 proto)
{
        if (proto == htons(ETH_P_8021Q))
                return VLAN_PROTO_8021Q;

        if (proto == htons(ETH_P_8021AD))
                return VLAN_PROTO_8021AD;

        WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
        return -EINVAL;
}

/*
 * Look up the device stored for protocol index @pidx and VLAN id @vlan_id.
 *
 * @pidx is used as an array index without validation.  Returns NULL when
 * the part of the table covering @vlan_id is not present; otherwise the
 * stored pointer, which may itself be NULL.
 */
static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
                                                         unsigned int pidx,
                                                         u16 vlan_id)
{
        struct net_device **part;

        part = vg->vlan_devices_arrays[pidx]
                                      [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];

        /* paired with smp_wmb() in vlan_group_prealloc_vid() */
        smp_rmb();

        if (!part)
                return NULL;

        return part[vlan_id % VLAN_GROUP_ARRAY_PART_LEN];
}

/*
 * Look up the device for (@vlan_proto, @vlan_id) in @vg.
 *
 * Returns NULL when @vlan_proto is not a supported protocol (in which case
 * vlan_proto_idx() has already warned) or when no device is stored.
 */
static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
                                                       __be16 vlan_proto,
                                                       u16 vlan_id)
{
        int pidx = vlan_proto_idx(vlan_proto);

        return pidx < 0 ? NULL : __vlan_group_get_device(vg, pidx, vlan_id);
}

/*
 * Store @dev for (@vlan_proto, @vlan_id) in @vg.
 *
 * Does nothing when @vg is NULL or @vlan_proto is unsupported.  The
 * protocol is translated before @vg is checked, so an unsupported protocol
 * warns even for a NULL group.  The part of the table covering @vlan_id is
 * dereferenced without a NULL check here.
 */
static inline void vlan_group_set_device(struct vlan_group *vg,
                                         __be16 vlan_proto, u16 vlan_id,
                                         struct net_device *dev)
{
        int pidx = vlan_proto_idx(vlan_proto);

        if (vg && pidx >= 0) {
                struct net_device **part;

                part = vg->vlan_devices_arrays[pidx]
                                              [vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
                part[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
        }
}

/*
 * Find the VLAN device for (@vlan_proto, @vlan_id) on top of @real_dev.
 * Must be invoked with rcu_read_lock or with RTNL.
 *
 * Returns NULL when @real_dev has no vlan_info or no matching device.
 */
static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
                                               __be16 vlan_proto, u16 vlan_id)
{
        struct vlan_info *info = rcu_dereference_rtnl(real_dev->vlan_info);

        if (!info)
                return NULL;

        return vlan_group_get_device(&info->grp, vlan_proto, vlan_id);
}

/*
 * Derive a feature set from @real_dev's hw_enc_features.
 *
 * Only checksum, software-GSO and GSO-encapsulation bits are considered.
 * If at least one GSO-encapsulation bit and at least one checksum bit are
 * present, the result is the masked set with its checksum bits replaced by
 * NETIF_F_HW_CSUM; otherwise the result is 0.
 */
static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
{
        netdev_features_t feats = real_dev->hw_enc_features &
                                  (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE |
                                   NETIF_F_GSO_ENCAP_ALL);

        if (!(feats & NETIF_F_GSO_ENCAP_ALL) || !(feats & NETIF_F_CSUM_MASK))
                return 0;

        return (feats & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
}

/*
 * vlan_group_for_each_dev - visit every non-NULL device of a VLAN group
 * @grp: pointer to the struct vlan_group to walk
 * @i:   integer lvalue used as the running index; it ranges over
 *       0 .. VLAN_PROTO_NUM * VLAN_N_VID - 1, with i / VLAN_N_VID being the
 *       protocol index and i % VLAN_N_VID the VLAN id
 * @dev: lvalue that receives each device found
 *
 * Every use of @i is parenthesized so that an argument such as "*p" is
 * tested and incremented as a whole, rather than expanding to "*p++".
 *
 * The macro expands to "for (...) if (...)", so the statement that follows
 * is the body; an "else" placed directly after that body would bind to the
 * macro's hidden "if".
 */
#define vlan_group_for_each_dev(grp, i, dev) \
        for ((i) = 0; (i) < VLAN_PROTO_NUM * VLAN_N_VID; (i)++) \
                if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
                                                            (i) % VLAN_N_VID)))

/* VLAN filter helpers operating on all VIDs of @vlan_info for @proto. */
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto);
void vlan_filter_drop_vids(struct vlan_info *vlan_info, __be16 proto);

/* found in vlan_dev.c */
void vlan_dev_set_ingress_priority(const struct net_device *dev,
                                   u32 skb_prio, u16 vlan_prio);
int vlan_dev_set_egress_priority(const struct net_device *dev,
                                 u32 skb_prio, u16 vlan_prio);
void vlan_dev_free_egress_priority(const struct net_device *dev);
int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
/* Writes into the caller-supplied buffer @result of @size bytes. */
void vlan_dev_get_realdev_name(const struct net_device *dev, char *result,
                               size_t size);

/* VLAN device setup, registration and teardown entry points. */
int vlan_check_real_dev(struct net_device *real_dev,
                        __be16 protocol, u16 vlan_id,
                        struct netlink_ext_ack *extack);
void vlan_setup(struct net_device *dev);
int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack);
void unregister_vlan_dev(struct net_device *dev, struct list_head *head);
bool vlan_dev_inherit_address(struct net_device *dev,
                              struct net_device *real_dev);

/*
 * Translate the priority bits of @vlan_tci into a priority value using the
 * ingress priority map of VLAN device @dev.
 *
 * The 3-bit field at VLAN_PRIO_SHIFT selects one of the eight map entries.
 */
static inline u32 vlan_get_ingress_priority(struct net_device *dev,
                                            u16 vlan_tci)
{
        unsigned int prio = (vlan_tci >> VLAN_PRIO_SHIFT) & 0x7;

        return vlan_dev_priv(dev)->ingress_priority_map[prio];
}

/*
 * GVRP hooks.  With CONFIG_VLAN_8021Q_GVRP the real implementations are
 * declared; without it, inline stubs are provided whose int-returning
 * variants return 0 and whose void variants do nothing.
 */
#ifdef CONFIG_VLAN_8021Q_GVRP
int vlan_gvrp_request_join(const struct net_device *dev);
void vlan_gvrp_request_leave(const struct net_device *dev);
int vlan_gvrp_init_applicant(struct net_device *dev);
void vlan_gvrp_uninit_applicant(struct net_device *dev);
int vlan_gvrp_init(void);
void vlan_gvrp_uninit(void);
#else
static inline int vlan_gvrp_request_join(const struct net_device *dev) { return 0; }
static inline void vlan_gvrp_request_leave(const struct net_device *dev) {}
static inline int vlan_gvrp_init_applicant(struct net_device *dev) { return 0; }
static inline void vlan_gvrp_uninit_applicant(struct net_device *dev) {}
static inline int vlan_gvrp_init(void) { return 0; }
static inline void vlan_gvrp_uninit(void) {}
#endif

/*
 * MVRP hooks.  With CONFIG_VLAN_8021Q_MVRP the real implementations are
 * declared; without it, inline stubs are provided whose int-returning
 * variants return 0 and whose void variants do nothing.
 */
#ifdef CONFIG_VLAN_8021Q_MVRP
int vlan_mvrp_request_join(const struct net_device *dev);
void vlan_mvrp_request_leave(const struct net_device *dev);
int vlan_mvrp_init_applicant(struct net_device *dev);
void vlan_mvrp_uninit_applicant(struct net_device *dev);
int vlan_mvrp_init(void);
void vlan_mvrp_uninit(void);
#else
static inline int vlan_mvrp_request_join(const struct net_device *dev) { return 0; }
static inline void vlan_mvrp_request_leave(const struct net_device *dev) {}
static inline int vlan_mvrp_init_applicant(struct net_device *dev) { return 0; }
static inline void vlan_mvrp_uninit_applicant(struct net_device *dev) {}
static inline int vlan_mvrp_init(void) { return 0; }
static inline void vlan_mvrp_uninit(void) {}
#endif

/* Module identification strings, defined elsewhere. */
extern const char vlan_fullname[];
extern const char vlan_version[];
/* Netlink interface setup and teardown. */
int vlan_netlink_init(void);
void vlan_netlink_fini(void);

/* rtnl link operations for VLAN devices, defined elsewhere. */
extern struct rtnl_link_ops vlan_link_ops;

/* Presumably the pernet id used to find struct vlan_net; confirm at the definition. */
extern unsigned int vlan_net_id;

/* Only pointers to proc entries are stored here, so a forward declaration suffices. */
struct proc_dir_entry;

/* VLAN state kept per network namespace (procfs entries and naming scheme). */
struct vlan_net {
        /* /proc/net/vlan */
        struct proc_dir_entry *proc_vlan_dir;
        /* /proc/net/vlan/config */
        struct proc_dir_entry *proc_vlan_conf;
        /* Determines interface naming scheme. */
        unsigned short name_type;
};

#endif /* !(__BEN_VLAN_802_1Q_INC__) */


































































































































































































































































































































































































































































































































































































   35 






   34 




    3 

   32 

















































    1 













   35 

   35 



   34 






   34 







    1 









































































































































































































  248 





    3 














  247 




















  248 
  248 
  247 




    3 


    3 

    3 

    3 








   26 


    1 
    3 















































   29 







    7 










   34 













































   35 












   35 





























































   34 



   34 




   35 





























   35 

















































































































































   35 


   34 














































































    3 

















    3 

    3 
    3 






















    3 














































   26 









   26 
   26 































































































































   26 









   26 


   26 





   26 











   25 



















































   26 








































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
// SPDX-License-Identifier: GPL-2.0
/*
 *  Kernel internal timers
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
 *
 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
 *              serialize accesses to xtime/lost_ticks).
 *                              Copyright (C) 1998  Andrea Arcangeli
 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
 *  2002-05-31        Move sys_sysinfo here and make its locking sane, Robert Love
 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
 */

#include <linux/kernel_stat.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pid_namespace.h>
#include <linux/notifier.h>
#include <linux/thread_info.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/posix-timers.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/delay.h>
#include <linux/tick.h>
#include <linux/kallsyms.h>
#include <linux/irq_work.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/slab.h>
#include <linux/compat.h>
#include <linux/random.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>

#include "tick-internal.h"
#include "timer_migration.h"

#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>

/*
 * The 64-bit jiffies counter. It starts at INITIAL_JIFFIES rather than 0,
 * is cacheline aligned on SMP builds and is exported to modules.
 */
__visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;

EXPORT_SYMBOL(jiffies_64);

/*
 * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
 * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
 * level has a different granularity.
 *
 * The level granularity is:                LVL_CLK_DIV ^ level
 * The level clock frequency is:        HZ / (LVL_CLK_DIV ^ level)
 *
 * The array level of a newly armed timer depends on the relative expiry
 * time. The farther the expiry time is away the higher the array level and
 * therefore the granularity becomes.
 *
 * Contrary to the original timer wheel implementation, which aims for 'exact'
 * expiry of the timers, this implementation removes the need for recascading
 * the timers into the lower array levels. The previous 'classic' timer wheel
 * implementation of the kernel already violated the 'exact' expiry by adding
 * slack to the expiry time to provide batched expiration. The granularity
 * levels provide implicit batching.
 *
 * This is an optimization of the original timer wheel implementation for the
 * majority of the timer wheel use cases: timeouts. The vast majority of
 * timeout timers (networking, disk I/O ...) are canceled before expiry. If
 * the timeout expires it indicates that normal operation is disturbed, so it
 * does not matter much whether the timeout comes with a slight delay.
 *
 * The only exception to this are networking timers with a small expiry
 * time. They rely on the granularity. Those fit into the first wheel level,
 * which has HZ granularity.
 *
 * We don't have cascading anymore. Timers with an expiry time above the
 * capacity of the last wheel level are force expired at the maximum timeout
 * value of the last wheel level. From data sampling we know that the maximum
 * value observed is 5 days (network connection tracking), so this should not
 * be an issue.
 *
 * The currently chosen array constants values are a good compromise between
 * array size and granularity.
 *
 * This results in the following granularity and range levels:
 *
 * HZ 1000 steps
 * Level Offset  Granularity            Range
 *  0      0         1 ms                0 ms -         63 ms
 *  1     64         8 ms               64 ms -        511 ms
 *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s)
 *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s)
 *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m)
 *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m)
 *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h)
 *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d)
 *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d)
 *
 * HZ  300
 * Level Offset  Granularity            Range
 *  0           0         3 ms                0 ms -        210 ms
 *  1          64        26 ms              213 ms -       1703 ms (213ms - ~1s)
 *  2         128       213 ms             1706 ms -      13650 ms (~1s - ~13s)
 *  3         192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m)
 *  4         256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m)
 *  5         320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h)
 *  6         384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h)
 *  7         448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d)
 *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
 *
 * HZ  250
 * Level Offset  Granularity            Range
 *  0           0         4 ms                0 ms -        255 ms
 *  1          64        32 ms              256 ms -       2047 ms (256ms - ~2s)
 *  2         128       256 ms             2048 ms -      16383 ms (~2s - ~16s)
 *  3         192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m)
 *  4         256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m)
 *  5         320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h)
 *  6         384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h)
 *  7         448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d)
 *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
 *
 * HZ  100
 * Level Offset  Granularity            Range
 *  0           0         10 ms               0 ms -        630 ms
 *  1          64         80 ms             640 ms -       5110 ms (640ms - ~5s)
 *  2         128        640 ms            5120 ms -      40950 ms (~5s - ~40s)
 *  3         192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m)
 *  4         256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m)
 *  5         320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h)
 *  6         384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d)
 *  7         448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
 */

/*
 * Clock divisor for the next level: the clock of each level runs
 * LVL_CLK_DIV (1 << LVL_CLK_SHIFT, i.e. 8) times slower than the clock of
 * the level below it.
 *
 * LVL_CLK_MASK - low bits of a clock value that are below one LVL_CLK_DIV step
 * LVL_SHIFT(n) - bit shift corresponding to the granularity of level n
 * LVL_GRAN(n)  - granularity of level n in jiffies (LVL_CLK_DIV ^ n)
 */
#define LVL_CLK_SHIFT        3
#define LVL_CLK_DIV        (1UL << LVL_CLK_SHIFT)
#define LVL_CLK_MASK        (LVL_CLK_DIV - 1)
#define LVL_SHIFT(n)        ((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)        (1UL << LVL_SHIFT(n))

/*
 * The time start value for each level to select the bucket at enqueue
 * time. We start from the last possible delta of the previous level
 * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
 *
 * The value is (LVL_SIZE - 1) scaled by the granularity of level n - 1.
 * Note that n == 0 would produce a negative shift count, so the macro is
 * only meaningful for n >= 1.
 */
#define LVL_START(n)        ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))

/*
 * Size of each clock level: every level consists of LVL_SIZE
 * (1 << LVL_BITS, i.e. 64) buckets.
 *
 * LVL_MASK    - mask that reduces a value to a bucket number within a level
 * LVL_OFFS(n) - index of the first bucket of level n in the flat wheel array
 */
#define LVL_BITS        6
#define LVL_SIZE        (1UL << LVL_BITS)
#define LVL_MASK        (LVL_SIZE - 1)
#define LVL_OFFS(n)        ((n) * LVL_SIZE)

/*
 * Level depth: the number of levels in the wheel. Configurations with
 * HZ > 100 use 9 levels, all others 8 (compare the granularity tables in
 * the comment at the top of this file).
 */
#if HZ > 100
# define LVL_DEPTH        9
# else
# define LVL_DEPTH        8
#endif

/*
 * The cutoff (max. capacity of the wheel), in jiffies.
 *
 * WHEEL_TIMEOUT_CUTOFF is the start value of the (non-existent) level after
 * the last one. WHEEL_TIMEOUT_MAX is one granule of the last level below
 * the cutoff.
 */
#define WHEEL_TIMEOUT_CUTOFF        (LVL_START(LVL_DEPTH))
#define WHEEL_TIMEOUT_MAX        (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))

/*
 * The resulting wheel size: the number of buckets in one timer base
 * (LVL_SIZE buckets for each of the LVL_DEPTH levels). Each CPU gets
 * NR_BASES such wheels; if NOHZ is configured these are separate wheels
 * (BASE_LOCAL, BASE_GLOBAL, BASE_DEF), so the deferrable timers have
 * storage of their own.
 */
#define WHEEL_SIZE        (LVL_SIZE * LVL_DEPTH)

/*
 * Number of timer bases per CPU and the index of each base.
 *
 * With CONFIG_NO_HZ_COMMON there are three distinct bases. Without it there
 * is a single base and all three BASE_* indices refer to it (index 0).
 */
#ifdef CONFIG_NO_HZ_COMMON
/*
 * If multiple bases need to be locked, use the base ordering for lock
 * nesting, i.e. lowest number first.
 */
# define NR_BASES        3
# define BASE_LOCAL        0
# define BASE_GLOBAL        1
# define BASE_DEF        2
#else
# define NR_BASES        1
# define BASE_LOCAL        0
# define BASE_GLOBAL        0
# define BASE_DEF        0
#endif

/**
 * struct timer_base - Per CPU timer base (number of bases depends on config)
 * @lock:               Lock protecting the timer_base
 * @running_timer:      When expiring timers, the lock is dropped. To make
 *                      sure not to race against deleting/modifying a
 *                      currently running timer, the pointer is set to the
 *                      timer which expires at the moment. If no timer is
 *                      running, the pointer is NULL.
 * @expiry_lock:        PREEMPT_RT only: Lock is taken in softirq around
 *                      timer expiry callback execution and when trying to
 *                      delete a running timer and it wasn't successful at
 *                      first glance. It prevents priority inversion
 *                      when the callback was preempted on a remote CPU and a
 *                      caller tries to delete the running timer. It also
 *                      prevents a live lock when the task which tries to
 *                      delete a timer preempted the softirq thread which
 *                      is running the timer callback function.
 * @timer_waiters:      PREEMPT_RT only: Tells if there is a waiter
 *                      waiting for the end of the timer callback function
 *                      execution.
 * @clk:                Clock of the timer base; it is updated before enqueue
 *                      of a timer; during expiry, it is 1 offset ahead of
 *                      jiffies to avoid endless requeuing to current
 *                      jiffies.
 * @next_expiry:        Expiry value of the first timer; it is updated when
 *                      finding the next timer and during enqueue; the
 *                      value is not valid when next_expiry_recalc is set.
 * @cpu:                Number of the CPU the timer base belongs to
 * @next_expiry_recalc: States whether a recalculation of next_expiry is
 *                      required. The value is set to true when a timer was
 *                      deleted.
 * @is_idle:            Is set when the timer_base is idle. It is triggered by
 *                      NOHZ code. This state is only used in the standard
 *                      base. Deferrable timers which are enqueued remotely
 *                      never wake up an idle CPU, so there is no need to
 *                      support it for that base.
 * @timers_pending:     Is set when a timer is pending in the base. It is only
 *                      reliable when next_expiry_recalc is not set.
 * @pending_map:        Bitmap of the timer wheel; each bit reflects a
 *                      bucket of the wheel. When a bit is set, at least a
 *                      single timer is enqueued in the related bucket.
 * @vectors:            Array of lists; each array member reflects a bucket
 *                      of the timer wheel. The list contains all timers
 *                      which are enqueued into a specific bucket.
 *
 * Both @pending_map and @vectors hold WHEEL_SIZE entries, one per bucket.
 */
struct timer_base {
        raw_spinlock_t                lock;
        struct timer_list        *running_timer;
#ifdef CONFIG_PREEMPT_RT
        spinlock_t                expiry_lock;
        atomic_t                timer_waiters;
#endif
        unsigned long                clk;
        unsigned long                next_expiry;
        unsigned int                cpu;
        bool                        next_expiry_recalc;
        bool                        is_idle;
        bool                        timers_pending;
        DECLARE_BITMAP(pending_map, WHEEL_SIZE);
        struct hlist_head        vectors[WHEEL_SIZE];
} ____cacheline_aligned;

/* Per-CPU array of NR_BASES timer bases, indexed by the BASE_* constants. */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);

#ifdef CONFIG_NO_HZ_COMMON

/* Static key, initially false; switched on by timer_update_keys(). */
static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
/* Serializes the static key updates done by the work item and the sysctl. */
static DEFINE_MUTEX(timer_keys_mutex);

/* Work item that runs timer_update_keys(); queued by timers_update_nohz(). */
static void timer_update_keys(struct work_struct *work);
static DECLARE_WORK(timer_update_work, timer_update_keys);

#ifdef CONFIG_SMP
static unsigned int sysctl_timer_migration = 1;

DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);

/*
 * Bring the timers_migration_enabled static key in line with the current
 * configuration: it is enabled only while both the timer_migration sysctl
 * is non-zero and tick_nohz_active is set, and disabled otherwise.
 */
static void timers_update_migration(void)
{
        if (!sysctl_timer_migration || !tick_nohz_active) {
                static_branch_disable(&timers_migration_enabled);
                return;
        }

        static_branch_enable(&timers_migration_enabled);
}

#ifdef CONFIG_SYSCTL
/*
 * proc handler of the "timer_migration" sysctl.
 *
 * The value is read or written through proc_dointvec_minmax(). After a
 * successful write the migration static key is re-evaluated. Everything
 * runs under timer_keys_mutex so that it cannot interleave with
 * timer_update_keys().
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int timer_migration_handler(const struct ctl_table *table, int write,
                            void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        mutex_lock(&timer_keys_mutex);

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        /* Only a successful write can have changed the setting. */
        if (write && err == 0)
                timers_update_migration();

        mutex_unlock(&timer_keys_mutex);

        return err;
}

/*
 * sysctl table with the single "timer_migration" entry. The entry is backed
 * by sysctl_timer_migration, has mode 0644 and is handled by
 * timer_migration_handler(); extra1/extra2 bound the accepted values to the
 * range SYSCTL_ZERO..SYSCTL_ONE.
 */
static const struct ctl_table timer_sysctl[] = {
        {
                .procname        = "timer_migration",
                .data                = &sysctl_timer_migration,
                .maxlen                = sizeof(unsigned int),
                .mode                = 0644,
                .proc_handler        = timer_migration_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
};

/*
 * Register the timer sysctl table under the "kernel" directory. Runs as a
 * device initcall. The result of register_sysctl() is not checked; the
 * function always returns 0.
 */
static int __init timer_sysctl_init(void)
{
        register_sysctl("kernel", timer_sysctl);
        return 0;
}
device_initcall(timer_sysctl_init);
#endif /* CONFIG_SYSCTL */
#else /* CONFIG_SMP */
/* !CONFIG_SMP: there is no migration static key to update, so do nothing. */
static inline void timers_update_migration(void) { }
#endif /* !CONFIG_SMP */

/*
 * Work function behind timer_update_work. Under timer_keys_mutex it
 * re-evaluates the migration static key and enables the timers_nohz_active
 * static key.
 *
 * @work: the work item; not used by this function.
 */
static void timer_update_keys(struct work_struct *work)
{
        mutex_lock(&timer_keys_mutex);
        timers_update_migration();
        static_branch_enable(&timers_nohz_active);
        mutex_unlock(&timer_keys_mutex);
}

/*
 * Request an update of the timer static keys. The update itself is done
 * later by timer_update_keys(), which is queued here as a work item rather
 * than being called directly.
 */
void timers_update_nohz(void)
{
        schedule_work(&timer_update_work);
}

/*
 * Report whether the timers_nohz_active static key is enabled. The check is
 * marked as unlikely.
 */
static inline bool is_timers_nohz_active(void)
{
        return static_branch_unlikely(&timers_nohz_active);
}
#else
/* Without CONFIG_NO_HZ_COMMON there is no such key; always report false. */
static inline bool is_timers_nohz_active(void) { return false; }
#endif /* NO_HZ_COMMON */

/*
 * Round the absolute jiffies value @j to a whole second, using a per-CPU
 * skew so that different CPUs do not all end up on the same jiffy.
 *
 * @j:        absolute time in jiffies
 * @cpu:      CPU number used to compute the skew
 * @force_up: if true, never round down
 *
 * Returns the rounded value, or the unmodified @j when the rounded value
 * would not be in the future anymore.
 */
static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
{
        const unsigned long unrounded = j;
        /*
         * Each extra CPU is skewed by another 3 jiffies so that the CPUs do
         * not all fire their timers at once and hit the same lock or
         * cachelines. The value 3 originally came from the mm/ code, which
         * already did this.
         */
        const unsigned long skew = cpu * 3;
        int rem;

        /* Apply the skew, round, and take the skew off again afterwards. */
        j += skew;

        /* Drop down to the whole second first ... */
        rem = j % HZ;
        j -= rem;

        /*
         * ... and go up to the next one unless the target was just after a
         * whole second (which can happen due to timer irq delays, long irq
         * off times and the like). A quarter of a second is used as an
         * extreme upper bound for "just after". With @force_up set, the
         * result is always rounded up.
         */
        if (force_up || rem >= HZ/4)
                j += HZ;

        j -= skew;

        /* Only hand out the rounded value if it is still in the future. */
        if (time_is_after_jiffies(j))
                return j;

        return unrounded;
}

/**
 * __round_jiffies - function to round jiffies to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * __round_jiffies() rounds an absolute time in the future (in jiffies)
 * up or down to (approximately) full seconds. This is useful for timers
 * for which the exact time they fire does not matter too much, as long as
 * they fire approximately every X seconds.
 *
 * By rounding these timers to whole seconds, all such timers will fire
 * at the same time, rather than at various times spread out. The goal
 * of this is to have the CPU wake up less, which saves power.
 *
 * The exact rounding is skewed for each processor to avoid all
 * processors firing at the exact same time, which could lead
 * to lock contention or spurious cache line bouncing.
 *
 * Return: the rounded version of the @j parameter. Values less than a
 * quarter of a second past a (skewed) whole second are rounded down, all
 * others are rounded up. If the rounded value would no longer be in the
 * future, @j is returned unmodified.
 */
unsigned long __round_jiffies(unsigned long j, int cpu)
{
        return round_jiffies_common(j, cpu, false);
}
EXPORT_SYMBOL_GPL(__round_jiffies);

/**
 * __round_jiffies_relative - round a relative jiffies value to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * Takes a time delta into the future, expressed in jiffies, and moves the
 * resulting expiry up or down to (approximately) a whole second. Timers
 * that only need to fire roughly every X seconds benefit from this: once
 * they are all aligned to whole seconds they expire together instead of at
 * scattered times, so the CPU wakes up less often, which saves power.
 *
 * The rounding is skewed per processor so that the CPUs do not all fire at
 * the very same moment, which could lead to lock contention or spurious
 * cache line bouncing.
 *
 * Return: the rounded version of @j, again as a relative value.
 */
unsigned long __round_jiffies_relative(unsigned long j, int cpu)
{
        /*
         * Sample jiffies exactly once: it may advance while we run, and the
         * same base has to be added and subtracted.
         */
        const unsigned long now = jiffies;
        unsigned long abs_rounded;

        abs_rounded = round_jiffies_common(now + j, cpu, false);

        return abs_rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_relative);

/**
 * round_jiffies - round an absolute jiffies value to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * Takes an absolute point in time in the future, expressed in jiffies, and
 * moves it up or down to (approximately) a whole second. Timers that only
 * need to fire roughly every X seconds benefit from this: once they are all
 * aligned to whole seconds they expire together instead of at scattered
 * times, so the CPU wakes up less often, which saves power.
 *
 * The CPU number used for the rounding is taken from
 * raw_smp_processor_id().
 *
 * Return: the rounded version of @j.
 */
unsigned long round_jiffies(unsigned long j)
{
        int this_cpu = raw_smp_processor_id();

        return round_jiffies_common(j, this_cpu, false);
}
EXPORT_SYMBOL_GPL(round_jiffies);

/**
 * round_jiffies_relative - round a relative jiffies value to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * Takes a time delta into the future, expressed in jiffies, and moves the
 * resulting expiry up or down to (approximately) a whole second. Timers
 * that only need to fire roughly every X seconds benefit from this: once
 * they are all aligned to whole seconds they expire together instead of at
 * scattered times, so the CPU wakes up less often, which saves power.
 *
 * This is __round_jiffies_relative() with the CPU number taken from
 * raw_smp_processor_id().
 *
 * Return: the rounded version of @j, again as a relative value.
 */
unsigned long round_jiffies_relative(unsigned long j)
{
        int this_cpu = raw_smp_processor_id();

        return __round_jiffies_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_relative);

/**
 * __round_jiffies_up - round an absolute jiffies value up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * Behaves like __round_jiffies() with the one difference that the value is
 * never rounded down. Use it for timeouts where the exact moment of firing
 * is not important as long as they do not fire too early.
 *
 * Return: the rounded version of @j.
 */
unsigned long __round_jiffies_up(unsigned long j, int cpu)
{
        /* force_up == true: rounding down is not permitted */
        unsigned long rounded = round_jiffies_common(j, cpu, true);

        return rounded;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up);

/**
 * __round_jiffies_up_relative - round a relative jiffies value up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 * @cpu: the processor number on which the timeout will happen
 *
 * Behaves like __round_jiffies_relative() with the one difference that the
 * value is never rounded down. Use it for timeouts where the exact moment
 * of firing is not important as long as they do not fire too early.
 *
 * Return: the rounded version of @j, again as a relative value.
 */
unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
{
        /*
         * Sample jiffies exactly once: it may advance while we run, and the
         * same base has to be added and subtracted.
         */
        const unsigned long now = jiffies;
        unsigned long abs_rounded;

        abs_rounded = round_jiffies_common(now + j, cpu, true);

        return abs_rounded - now;
}
EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);

/**
 * round_jiffies_up - round an absolute jiffies value up to a full second
 * @j: the time in (absolute) jiffies that should be rounded
 *
 * Behaves like round_jiffies() with the one difference that the value is
 * never rounded down. Use it for timeouts where the exact moment of firing
 * is not important as long as they do not fire too early.
 *
 * Return: the rounded version of @j.
 */
unsigned long round_jiffies_up(unsigned long j)
{
        int this_cpu = raw_smp_processor_id();

        return round_jiffies_common(j, this_cpu, true);
}
EXPORT_SYMBOL_GPL(round_jiffies_up);

/**
 * round_jiffies_up_relative - round a relative jiffies value up to a full second
 * @j: the time in (relative) jiffies that should be rounded
 *
 * Behaves like round_jiffies_relative() with the one difference that the
 * value is never rounded down. Use it for timeouts where the exact moment
 * of firing is not important as long as they do not fire too early.
 *
 * Return: the rounded version of @j, again as a relative value.
 */
unsigned long round_jiffies_up_relative(unsigned long j)
{
        int this_cpu = raw_smp_processor_id();

        return __round_jiffies_up_relative(j, this_cpu);
}
EXPORT_SYMBOL_GPL(round_jiffies_up_relative);


/*
 * Read back the array index that timer_set_idx() stored in the
 * TIMER_ARRAYMASK bits of @timer->flags.
 */
static inline unsigned int timer_get_idx(struct timer_list *timer)
{
        unsigned int idx_bits = timer->flags & TIMER_ARRAYMASK;

        return idx_bits >> TIMER_ARRAYSHIFT;
}

/*
 * Store the array index @idx in the TIMER_ARRAYMASK bits of @timer->flags,
 * leaving all other flag bits as they are.
 */
static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
{
        unsigned int other_bits = timer->flags & ~TIMER_ARRAYMASK;
        unsigned int idx_bits = idx << TIMER_ARRAYSHIFT;

        /* One assignment so the flags word is updated in a single step */
        timer->flags = other_bits | idx_bits;
}

/*
 * Compute the array index for expiry time @expires in wheel level @lvl
 * and report, via @bucket_expiry, the time at which that bucket expires.
 */
static inline unsigned calc_index(unsigned long expires, unsigned lvl,
                                  unsigned long *bucket_expiry)
{
        const unsigned int shift = LVL_SHIFT(lvl);
        unsigned long slot;

        /*
         * A timer must never fire early. That could happen when:
         * - the timer is armed at the edge of a tick
         * - the expiry time is truncated in the outer wheel levels
         *
         * Rounding up to the next slot of this level's granularity
         * prevents both.
         */
        slot = (expires >> shift) + 1;

        *bucket_expiry = slot << shift;

        return LVL_OFFS(lvl) + (slot & LVL_MASK);
}

/*
 * Pick the wheel level for a timer from its distance to @clk and return
 * the array index within that level. *@bucket_expiry is set to the time
 * at which the chosen bucket expires.
 */
static int calc_wheel_index(unsigned long expires, unsigned long clk,
                            unsigned long *bucket_expiry)
{
        const unsigned long delta = expires - clk;

        /* Walk the levels from finest to coarsest; first match wins */
        if (delta < LVL_START(1))
                return calc_index(expires, 0, bucket_expiry);
        if (delta < LVL_START(2))
                return calc_index(expires, 1, bucket_expiry);
        if (delta < LVL_START(3))
                return calc_index(expires, 2, bucket_expiry);
        if (delta < LVL_START(4))
                return calc_index(expires, 3, bucket_expiry);
        if (delta < LVL_START(5))
                return calc_index(expires, 4, bucket_expiry);
        if (delta < LVL_START(6))
                return calc_index(expires, 5, bucket_expiry);
        if (delta < LVL_START(7))
                return calc_index(expires, 6, bucket_expiry);
        if (LVL_DEPTH > 8 && delta < LVL_START(8))
                return calc_index(expires, 7, bucket_expiry);

        /* @expires lies before @clk: use the bucket that belongs to @clk */
        if ((long) delta < 0) {
                *bucket_expiry = clk;
                return clk & LVL_MASK;
        }

        /*
         * Force expire obscene large timeouts to expire at the
         * capacity limit of the wheel.
         */
        if (delta >= WHEEL_TIMEOUT_CUTOFF)
                expires = clk + WHEEL_TIMEOUT_MAX;

        return calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
}

/*
 * Wake up the CPU owning @base via wake_up_nohz_cpu() if that base is
 * marked idle and @timer is one that has to be taken into account.
 */
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
        /* Nothing to do unless nohz handling of timers is active */
        if (!is_timers_nohz_active())
                return;

        /*
         * Deferrable timers do not prevent the CPU from entering dynticks
         * and are not taken into account on the idle/nohz_full path. An
         * IPI for a newly enqueued deferrable timer would wake the remote
         * CPU, but nothing would be done with the deferrable timer base.
         * So skip the remote IPI for deferrable timers completely.
         */
        if (timer->flags & TIMER_DEFERRABLE)
                return;

        /*
         * We might have to IPI the remote CPU if the base is idle and the
         * timer is pinned. A non pinned timer is only queued on the remote
         * CPU when the timer was running during queueing, in which case
         * everything is handled by the remote CPU anyway. If the other CPU
         * is on the way to idle then it can't set base->is_idle as we hold
         * the base lock:
         */
        if (!base->is_idle)
                return;

        WARN_ON_ONCE(!(timer->flags & TIMER_PINNED ||
                       tick_nohz_full_cpu(base->cpu)));
        wake_up_nohz_cpu(base->cpu);
}

/*
 * Put @timer into bucket @idx of @base: link it into the hash bucket,
 * set the bucket's bit in the pending bitmap and record the index in the
 * timer flags. If the bucket expires before the currently known next
 * expiry of the base, update that and wake up the target CPU if needed.
 */
static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
                          unsigned int idx, unsigned long bucket_expiry)
{
        hlist_add_head(&timer->entry, &base->vectors[idx]);
        __set_bit(idx, base->pending_map);
        timer_set_idx(timer, idx);

        trace_timer_start(timer, bucket_expiry);

        /*
         * Only a new first expiring timer needs further work. The check
         * has to use the effective expiry time of the timer
         * (bucket_expiry), not timer->expires.
         */
        if (!time_before(bucket_expiry, base->next_expiry))
                return;

        /*
         * Set the next expiry time and kick the CPU so it can
         * reevaluate the wheel:
         */
        WRITE_ONCE(base->next_expiry, bucket_expiry);
        base->timers_pending = true;
        base->next_expiry_recalc = false;
        trigger_dyntick_cpu(base, timer);
}

/*
 * Queue @timer on @base: compute the wheel index for timer->expires
 * relative to base->clk, then enqueue the timer into that bucket.
 */
static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
{
        unsigned long effective_expiry;
        unsigned int slot;

        slot = calc_wheel_index(timer->expires, base->clk, &effective_expiry);
        enqueue_timer(base, timer, slot, effective_expiry);
}

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

/*
 * Forward declaration: the fixup callbacks below pass this descriptor to
 * the debug_object_*() functions before its initializer appears.
 */
static const struct debug_obj_descr timer_debug_descr;

/*
 * One entry of the timer_hints[] table consulted by timer_debug_hint().
 */
struct timer_hint {
        /* Timer callback this entry applies to (compared with timer->function) */
        void        (*function)(struct timer_list *t);
        /* Byte offset from the timer to the function pointer to report instead */
        long        offset;
};

/*
 * Build a struct timer_hint initializer for timers embedded in @container:
 * @fn is the timer callback to match, @timr the name of the timer member
 * and @hintfn the member holding the function pointer to report. The
 * offset is the distance from @timr to @hintfn inside @container.
 */
#define TIMER_HINT(fn, container, timr, hintfn)                        \
        {                                                        \
                .function = fn,                                        \
                .offset          = offsetof(container, hintfn) -        \
                            offsetof(container, timr)                \
        }

/*
 * Timer callbacks for which timer_debug_hint() reports the work function
 * of the containing object instead of the timer callback itself.
 */
static const struct timer_hint timer_hints[] = {
        TIMER_HINT(delayed_work_timer_fn,
                   struct delayed_work, timer, work.func),
        TIMER_HINT(kthread_delayed_work_timer_fn,
                   struct kthread_delayed_work, timer, work.func),
};

static void *timer_debug_hint(void *addr)
{
        struct timer_list *timer = addr;
        int i;

        for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
                if (timer_hints[i].function == timer->function) {
                        void (**fn)(void) = addr + timer_hints[i].offset;

                        return *fn;
                }
        }

        return timer->function;
}

static bool timer_is_static_object(void *addr)
{
        struct timer_list *timer = addr;

        return (timer->entry.pprev == NULL &&
                timer->entry.next == TIMER_ENTRY_STATIC);
}

/*
 * timer_fixup_init is called when:
 * - an active object is initialized
 *
 * For an active timer the timer is deleted synchronously and then
 * initialized in the debug object tracker; true is returned. Any other
 * state is left alone and false is returned.
 */
static bool timer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        timer_delete_sync(timer);
        debug_object_init(timer, &timer_debug_descr);
        return true;
}

/*
 * Stub timer callback for improperly used timers. The fixup handlers in
 * this file install it via timer_setup() on timers that were not
 * available to the debug object tracker; it only emits a warning.
 */
static void stub_timer(struct timer_list *unused)
{
        WARN_ON(1);
}

/*
 * timer_fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 *
 * An unknown timer is set up with stub_timer() as its callback and true
 * is returned. Activating an already active timer triggers a warning and
 * returns false, as does every other state (without the warning).
 */
static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state == ODEBUG_STATE_NOTAVAILABLE) {
                timer_setup(timer, stub_timer, 0);
                return true;
        }

        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * timer_fixup_free is called when:
 * - an active object is freed
 *
 * For an active timer the timer is deleted synchronously and then freed
 * in the debug object tracker; true is returned. Any other state is left
 * alone and false is returned.
 */
static bool timer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        timer_delete_sync(timer);
        debug_object_free(timer, &timer_debug_descr);
        return true;
}

/*
 * timer_fixup_assert_init is called when:
 * - an untracked/uninit-ed object is found
 *
 * Such a timer is set up with stub_timer() as its callback and true is
 * returned. Any other state is left alone and false is returned.
 */
static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
{
        struct timer_list *timer = addr;

        if (state != ODEBUG_STATE_NOTAVAILABLE)
                return false;

        timer_setup(timer, stub_timer, 0);
        return true;
}

/*
 * Debug object descriptor for struct timer_list. It names the object type
 * and wires up the hint, static-object and fixup callbacks defined above.
 */
static const struct debug_obj_descr timer_debug_descr = {
        .name                        = "timer_list",
        .debug_hint                = timer_debug_hint,
        .is_static_object        = timer_is_static_object,
        .fixup_init                = timer_fixup_init,
        .fixup_activate                = timer_fixup_activate,
        .fixup_free                = timer_fixup_free,
        .fixup_assert_init        = timer_fixup_assert_init,
};

/* Report initialization of @timer to the debug object tracker. */
static inline void debug_timer_init(struct timer_list *timer)
{
        debug_object_init(timer, &timer_debug_descr);
}

/* Report activation of @timer to the debug object tracker. */
static inline void debug_timer_activate(struct timer_list *timer)
{
        debug_object_activate(timer, &timer_debug_descr);
}

/* Report deactivation of @timer to the debug object tracker. */
static inline void debug_timer_deactivate(struct timer_list *timer)
{
        debug_object_deactivate(timer, &timer_debug_descr);
}

/* Have the debug object tracker check that @timer has been initialized. */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
        debug_object_assert_init(timer, &timer_debug_descr);
}

/*
 * Prototype only: init_timer_on_stack_key() below calls do_init_timer()
 * before its definition, which follows after the #endif.
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key);

/*
 * Initialize a timer that lives on the stack.
 *
 * Registers @timer with the debug object tracker as an on-stack object
 * and then performs the regular initialization via do_init_timer(), which
 * receives @func, @flags, @name and @key unchanged.
 */
void init_timer_on_stack_key(struct timer_list *timer,
                             void (*func)(struct timer_list *),
                             unsigned int flags,
                             const char *name, struct lock_class_key *key)
{
        debug_object_init_on_stack(timer, &timer_debug_descr);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);

/*
 * Tell the debug object tracker that @timer is going away by freeing
 * its tracking object.
 */
void destroy_timer_on_stack(struct timer_list *timer)
{
        debug_object_free(timer, &timer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_timer_on_stack);

#else
/* No-op variants used when CONFIG_DEBUG_OBJECTS_TIMERS is not set */
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif

/*
 * Initialization hook: notify the debug object code via debug_timer_init()
 * and call trace_timer_init() for @timer.
 */
static inline void debug_init(struct timer_list *timer)
{
        debug_timer_init(timer);
        trace_timer_init(timer);
}

/*
 * Deactivation hook: notify the debug object code via
 * debug_timer_deactivate() and call trace_timer_cancel() for @timer.
 */
static inline void debug_deactivate(struct timer_list *timer)
{
        debug_timer_deactivate(timer);
        trace_timer_cancel(timer);
}

/* Check via debug_timer_assert_init() that @timer has been initialized. */
static inline void debug_assert_init(struct timer_list *timer)
{
        debug_timer_assert_init(timer);
}

/*
 * Common timer initialization.
 *
 * Marks @timer as not pending (entry.pprev = NULL), stores the callback
 * @func, and sets the timer flags to @flags combined with the number of
 * the current CPU. Flag bits outside TIMER_INIT_FLAGS trigger a one-time
 * warning and are dropped. @name and @key are handed to
 * lockdep_init_map() for the timer's lockdep map.
 */
static void do_init_timer(struct timer_list *timer,
                          void (*func)(struct timer_list *),
                          unsigned int flags,
                          const char *name, struct lock_class_key *key)
{
        unsigned int cpu_bits;

        timer->entry.pprev = NULL;
        timer->function = func;

        /* Complain about, then strip, flags that are not valid at init */
        if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
                flags &= TIMER_INIT_FLAGS;

        cpu_bits = raw_smp_processor_id();
        timer->flags = flags | cpu_bits;

        lockdep_init_map(&timer->lockdep_map, name, key, 0);
}

/**
 * init_timer_key - initialize a timer
 * @timer: the timer to be initialized
 * @func: timer callback function
 * @flags: timer flags
 * @name: name of the timer
 * @key: lockdep class key of the fake lock used for tracking timer
 *       sync lock dependencies
 *
 * init_timer_key() must be done to a timer prior to calling *any* of the
 * other timer functions.
 *
 * The debug/trace initialization hook debug_init() runs first, then
 * do_init_timer() sets up the timer with the given arguments.
 */
void init_timer_key(struct timer_list *timer,
                    void (*func)(struct timer_list *), unsigned int flags,
                    const char *name, struct lock_class_key *key)
{
        debug_init(timer);
        do_init_timer(timer, func, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);

/*
 * Unlink @timer from the bucket list it is queued on.
 *
 * The deactivation debug/trace hook runs first. After unlinking, the
 * entry's pprev pointer is reset to NULL only when @clear_pending is
 * true; the next pointer is always poisoned with LIST_POISON2.
 */
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
        struct hlist_node *node = &timer->entry;

        debug_deactivate(timer);
        __hlist_del(node);

        if (clear_pending)
                node->pprev = NULL;

        node->next = LIST_POISON2;
}

/*
 * Remove @timer from @base if it is pending.
 *
 * When the timer is the only entry in its bucket, the bucket's bit in the
 * pending bitmap is cleared and the base is flagged so that its next
 * expiry gets recalculated. @clear_pending is passed on to
 * detach_timer().
 *
 * Returns 0 if the timer was not pending, 1 if it was detached.
 */
static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
                             bool clear_pending)
{
        unsigned bucket = timer_get_idx(timer);

        if (!timer_pending(timer))
                return 0;

        /* Last timer in this bucket: the bucket becomes empty */
        if (hlist_is_singular_node(&timer->entry, &base->vectors[bucket])) {
                __clear_bit(bucket, base->pending_map);
                base->next_expiry_recalc = true;
        }

        detach_timer(timer, clear_pending);

        return 1;
}

/*
 * Select the timer base of CPU @cpu that matches the timer flags @tflags:
 * the deferrable base for deferrable timers when NO_HZ_COMMON is enabled,
 * otherwise the local base for pinned timers and the global base for all
 * others.
 */
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
        int index;

        /*
         * The deferrable check takes precedence over the pinned check,
         * but only if NO_HZ_COMMON is set.
         */
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return per_cpu_ptr(&timer_bases[index], cpu);
}

/*
 * Select the timer base of the current CPU that matches the timer flags
 * @tflags: the deferrable base for deferrable timers when NO_HZ_COMMON is
 * enabled, otherwise the local base for pinned timers and the global base
 * for all others.
 */
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
        int index;

        /*
         * The deferrable check takes precedence over the pinned check,
         * but only if NO_HZ_COMMON is set.
         */
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
                index = BASE_DEF;
        else if (tflags & TIMER_PINNED)
                index = BASE_LOCAL;
        else
                index = BASE_GLOBAL;

        return this_cpu_ptr(&timer_bases[index]);
}

/*
 * Look up the timer base described by the timer flags @tflags; the CPU
 * number is taken from the TIMER_CPUMASK bits of the flags.
 */
static inline struct timer_base *get_timer_base(u32 tflags)
{
        u32 cpu = tflags & TIMER_CPUMASK;

        return get_timer_cpu_base(tflags, cpu);
}

/*
 * Advance base->clk towards @basej, but never beyond the next expiry of
 * the base and never backwards.
 */
static inline void __forward_timer_base(struct timer_base *base,
                                        unsigned long basej)
{
        /*
         * Forwarding is only possible when @basej is past base->clk;
         * anything else would rewind base->clk.
         */
        if (time_before_eq(basej, base->clk))
                return;

        /* Next expiry lies after @basej: fast forward all the way */
        if (time_after(base->next_expiry, basej)) {
                base->clk = basej;
                return;
        }

        /* Otherwise forward only up to the next expiry value */
        if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
                return;

        base->clk = base->next_expiry;
}

/* Forward @base towards the current jiffies value, read once. */
static inline void forward_timer_base(struct timer_base *base)
{
        unsigned long now = READ_ONCE(jiffies);

        __forward_timer_base(base, now);
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
 * that all timers which are tied to this base are locked, and the base itself
 * is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found in the base->vectors array.
 *
 * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
 * to wait until the migration is done.
 *
 * @timer: the timer whose base should be locked
 * @flags: storage for the interrupt state saved by raw_spin_lock_irqsave()
 *
 * Loops until the lock of the base that timer->flags refers to is held:
 * the flags are sampled, the base is derived from the sample and locked,
 * and the flags are compared with the sample again under the lock. If they
 * differ, the lock is dropped and the sequence is retried.
 *
 * Returns the locked base. The lock is released by the caller, in this
 * file with raw_spin_unlock_irqrestore() and the same @flags.
 */
static struct timer_base *lock_timer_base(struct timer_list *timer,
                                          unsigned long *flags)
        __acquires(timer->base->lock)
{
        for (;;) {
                struct timer_base *base;
                u32 tf;

                /*
                 * We need to use READ_ONCE() here, otherwise the compiler
                 * might re-read @tf between the check for TIMER_MIGRATING
                 * and spin_lock().
                 */
                tf = READ_ONCE(timer->flags);

                if (!(tf & TIMER_MIGRATING)) {
                        base = get_timer_base(tf);
                        raw_spin_lock_irqsave(&base->lock, *flags);
                        /* Flags unchanged: @base is the timer's base */
                        if (timer->flags == tf)
                                return base;
                        /* Flags changed meanwhile: unlock and retry */
                        raw_spin_unlock_irqrestore(&base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * Option bits for the @options argument of __mod_timer():
 * MOD_TIMER_PENDING_ONLY - do not arm a timer that was not pending
 * MOD_TIMER_REDUCE       - only change a pending timer if that makes it
 *                          expire earlier
 * MOD_TIMER_NOTPENDING   - skip the fast path for already pending timers
 */
#define MOD_TIMER_PENDING_ONLY                0x01
#define MOD_TIMER_REDUCE                0x02
#define MOD_TIMER_NOTPENDING                0x04

/*
 * __mod_timer - common implementation behind mod_timer(),
 * mod_timer_pending(), timer_reduce(), add_timer(), add_timer_local()
 * and add_timer_global()
 * @timer:   the timer to (re)arm
 * @expires: new absolute expiry time in jiffies
 * @options: bitwise OR of MOD_TIMER_* option bits:
 *           MOD_TIMER_PENDING_ONLY - bail out without arming when @timer
 *                                    turns out not to be pending
 *           MOD_TIMER_REDUCE       - leave a pending timer untouched unless
 *                                    @expires is earlier than its expiry
 *           MOD_TIMER_NOTPENDING   - skip the pending-timer fast path
 *
 * Returns 1 if the timer was found pending (it was then requeued, updated
 * in place, or left as is), otherwise 0. 0 is also returned when
 * @timer->function is NULL, in which case nothing is armed.
 */
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
{
        /* idx == UINT_MAX means: no wheel index has been precomputed */
        unsigned long clk = 0, flags, bucket_expiry;
        struct timer_base *base, *new_base;
        unsigned int idx = UINT_MAX;
        int ret = 0;

        debug_assert_init(timer);

        /*
         * This is a common optimization triggered by the networking code - if
         * the timer is re-modified to have the same timeout or ends up in the
         * same array bucket then just return:
         */
        if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
                /*
                 * The downside of this optimization is that it can result in
                 * larger granularity than you would get from adding a new
                 * timer with this expiry.
                 */
                long diff = timer->expires - expires;

                /* Lockless early exits: nothing would change */
                if (!diff)
                        return 1;
                if (options & MOD_TIMER_REDUCE && diff <= 0)
                        return 1;

                /*
                 * We lock timer base and calculate the bucket index right
                 * here. If the timer ends up in the same bucket, then we
                 * just update the expiry time and avoid the whole
                 * dequeue/enqueue dance.
                 */
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);

                /* Repeat the REDUCE check, this time under the base lock */
                if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
                    time_before_eq(timer->expires, expires)) {
                        ret = 1;
                        goto out_unlock;
                }

                clk = base->clk;
                idx = calc_wheel_index(expires, clk, &bucket_expiry);

                /*
                 * Retrieve and compare the array index of the pending
                 * timer. If it matches set the expiry to the new value so a
                 * subsequent call will exit in the expires check above.
                 */
                if (idx == timer_get_idx(timer)) {
                        if (!(options & MOD_TIMER_REDUCE))
                                timer->expires = expires;
                        else if (time_after(timer->expires, expires))
                                timer->expires = expires;
                        ret = 1;
                        goto out_unlock;
                }
        } else {
                base = lock_timer_base(timer, &flags);
                /*
                 * Has @timer been shutdown? This needs to be evaluated
                 * while holding base lock to prevent a race against the
                 * shutdown code.
                 */
                if (!timer->function)
                        goto out_unlock;

                forward_timer_base(base);
        }

        /* ret records whether the timer was pending before this call */
        ret = detach_if_pending(timer, base, false);
        if (!ret && (options & MOD_TIMER_PENDING_ONLY))
                goto out_unlock;

        new_base = get_timer_this_cpu_base(timer->flags);

        if (base != new_base) {
                /*
                 * We are trying to schedule the timer on the new base.
                 * However we can't change timer's base while it is running,
                 * otherwise timer_delete_sync() can't detect that the timer's
                 * handler yet has not finished. This also guarantees that the
                 * timer is serialized wrt itself.
                 */
                if (likely(base->running_timer != timer)) {
                        /* See the comment in lock_timer_base() */
                        timer->flags |= TIMER_MIGRATING;

                        /*
                         * Switch locks without restoring the interrupt
                         * state saved in @flags; that happens at out_unlock.
                         */
                        raw_spin_unlock(&base->lock);
                        base = new_base;
                        raw_spin_lock(&base->lock);
                        /* Storing the new CPU also clears TIMER_MIGRATING */
                        WRITE_ONCE(timer->flags,
                                   (timer->flags & ~TIMER_BASEMASK) | base->cpu);
                        forward_timer_base(base);
                }
        }

        debug_timer_activate(timer);

        timer->expires = expires;
        /*
         * If 'idx' was calculated above and the base time did not advance
         * between calculating 'idx' and possibly switching the base, only
         * enqueue_timer() is required. Otherwise we need to (re)calculate
         * the wheel index via internal_add_timer().
         */
        if (idx != UINT_MAX && clk == base->clk)
                enqueue_timer(base, timer, idx, bucket_expiry);
        else
                internal_add_timer(base, timer);

out_unlock:
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * mod_timer_pending - Modify a pending timer's timeout
 * @timer:        The pending timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * For a pending timer this does the same as mod_timer(). An inactive
 * timer, however, is left inactive and is not started.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and not modified or was in
 *          shutdown state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires
 */
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
        int ret = __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);

        return ret;
}
EXPORT_SYMBOL(mod_timer_pending);

/**
 * mod_timer - Modify a timer's timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * Calling mod_timer(timer, expires) has the same effect as the open coded
 * sequence
 *
 *     timer_delete(timer); timer->expires = expires; add_timer(timer);
 *
 * but is more efficient. For an inactive timer the timer_delete() part is
 * a NOP. In either case the timer ends up activated with the new expiry
 * time @expires.
 *
 * When several unserialized users operate on the same timer concurrently,
 * mod_timer() is the only safe way to change the timeout, because
 * add_timer() cannot modify an already running timer.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded. In this case the return value is 0 and meaningless.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires did
 *          not change the effective expiry time
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
        /* No MOD_TIMER_* option bits: plain modify-or-start */
        int ret = __mod_timer(timer, expires, 0);

        return ret;
}
EXPORT_SYMBOL(mod_timer);

/**
 * timer_reduce - Modify a timer's timeout if it would reduce the timeout
 * @timer:        The timer to be modified
 * @expires:        New absolute timeout in jiffies
 *
 * Works much like mod_timer(), but an enqueued timer is only modified
 * when doing so makes it expire earlier. A timer that is not enqueued is
 * started.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * Return:
 * * %0 - The timer was inactive and started or was in shutdown
 *          state and the operation was discarded
 * * %1 - The timer was active and requeued to expire at @expires or
 *          the timer was active and not modified because @expires
 *          did not change the effective expiry time such that the
 *          timer would expire earlier than already scheduled
 */
int timer_reduce(struct timer_list *timer, unsigned long expires)
{
        int ret = __mod_timer(timer, expires, MOD_TIMER_REDUCE);

        return ret;
}
EXPORT_SYMBOL(timer_reduce);

/**
 * add_timer - Start a timer
 * @timer:        The timer to be started
 *
 * Arms @timer so that it expires at @timer->expires, an absolute time
 * measured in 'jiffies'. On expiry timer->function(timer) is invoked from
 * soft interrupt context.
 *
 * Both @timer->expires and @timer->function must have been set before
 * this function is called.
 *
 * If @timer->function == NULL then the start operation is silently
 * discarded.
 *
 * If @timer->expires is already in the past @timer will be queued to
 * expire at the next timer tick.
 *
 * This can only operate on an inactive timer. Attempts to invoke this on
 * an active timer are rejected with a warning.
 */
void add_timer(struct timer_list *timer)
{
        /* Warn (once) and do nothing if the timer is already pending */
        if (!WARN_ON_ONCE(timer_pending(timer)))
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
}
EXPORT_SYMBOL(add_timer);

/**
 * add_timer_local() - Start a timer on the local CPU
 * @timer:        The timer to be started
 *
 * Does what add_timer() does and additionally sets the TIMER_PINNED flag
 * of @timer before starting it.
 *
 * See add_timer() for further details.
 */
void add_timer_local(struct timer_list *timer)
{
        /* Warn (once) and do nothing if the timer is already pending */
        if (!WARN_ON_ONCE(timer_pending(timer))) {
                timer->flags |= TIMER_PINNED;
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
        }
}
EXPORT_SYMBOL(add_timer_local);

/**
 * add_timer_global() - Start a timer without TIMER_PINNED flag set
 * @timer:        The timer to be started
 *
 * Does what add_timer() does and additionally clears the TIMER_PINNED
 * flag of @timer before starting it.
 *
 * See add_timer() for further details.
 */
void add_timer_global(struct timer_list *timer)
{
        /* Warn (once) and do nothing if the timer is already pending */
        if (!WARN_ON_ONCE(timer_pending(timer))) {
                timer->flags &= ~TIMER_PINNED;
                __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
        }
}
EXPORT_SYMBOL(add_timer_global);

/**
 * add_timer_on - Start a timer on a particular CPU
 * @timer:        The timer to be started
 * @cpu:        The CPU to start it on
 *
 * Same as add_timer() except that it starts the timer on the given CPU and
 * the TIMER_PINNED flag is set. When timer shouldn't be a pinned timer in
 * the next round, add_timer_global() should be used instead as it unsets
 * the TIMER_PINNED flag.
 *
 * See add_timer() for further details.
 */
void add_timer_on(struct timer_list *timer, int cpu)
{
        struct timer_base *new_base, *base;
        unsigned long flags;

        debug_assert_init(timer);

        /* Only an inactive timer may be started; warn once and bail otherwise. */
        if (WARN_ON_ONCE(timer_pending(timer)))
                return;

        /* Make sure timer flags have TIMER_PINNED flag set */
        timer->flags |= TIMER_PINNED;

        /* Target base on @cpu, selected with the just updated flags. */
        new_base = get_timer_cpu_base(timer->flags, cpu);

        /*
         * If @timer was on a different CPU, it should be migrated with the
         * old base locked to prevent other operations proceeding with the
         * wrong base locked.  See lock_timer_base().
         */
        base = lock_timer_base(timer, &flags);
        /*
         * Has @timer been shutdown? This needs to be evaluated while
         * holding base lock to prevent a race against the shutdown code.
         */
        if (!timer->function)
                goto out_unlock;

        if (base != new_base) {
                /*
                 * Flag the timer as migrating before the old base lock is
                 * dropped, then switch over to the new base lock.
                 */
                timer->flags |= TIMER_MIGRATING;

                raw_spin_unlock(&base->lock);
                base = new_base;
                raw_spin_lock(&base->lock);
                /*
                 * Record @cpu in the flags with the new base lock held.
                 * NOTE(review): presumably TIMER_BASEMASK also covers
                 * TIMER_MIGRATING so this store ends the migrating state
                 * as well - confirm against the flag definitions.
                 */
                WRITE_ONCE(timer->flags,
                           (timer->flags & ~TIMER_BASEMASK) | cpu);
        }
        forward_timer_base(base);

        debug_timer_activate(timer);
        internal_add_timer(base, timer);
out_unlock:
        /*
         * Reached with base->lock held, both from the shutdown bail-out
         * and after the timer was enqueued. Interrupt state saved by
         * lock_timer_base() is restored here.
         */
        raw_spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);

/**
 * __timer_delete - Internal function: Deactivate a timer
 * @timer:        The timer to be deactivated
 * @shutdown:        If true, this indicates that the timer is about to be
 *                shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. In that
 * case any attempt to rearm @timer after this function returns will be
 * silently ignored.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
static int __timer_delete(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long flags;
        int was_pending;

        debug_assert_init(timer);

        /*
         * Lockless fast path: a timer which is not pending and is not
         * being shut down needs no further work.
         *
         * For a shutdown the base lock must be taken even when the timer
         * does not look pending. A concurrent rearm could slip in between
         * the lockless check and the lock acquisition; taking the lock
         * guarantees that such a freshly enqueued timer is dequeued and
         * can never reach the expiry code with timer->function == NULL.
         *
         * If timer->function is executing right now, this also ensures
         * that the callback cannot requeue the timer.
         */
        if (!timer_pending(timer) && !shutdown)
                return 0;

        base = lock_timer_base(timer, &flags);
        was_pending = detach_if_pending(timer, base, true);
        if (shutdown)
                timer->function = NULL;
        raw_spin_unlock_irqrestore(&base->lock, flags);

        return was_pending;
}

/**
 * timer_delete - Deactivate a timer
 * @timer:        The timer to be deactivated
 *
 * The function only deactivates a pending timer, but contrary to
 * timer_delete_sync() it does not take into account whether the timer's
 * callback function is concurrently executed on a different CPU or not.
 * Nor does it prevent rearming of the timer.  If @timer can be rearmed
 * concurrently then the return value of this function is meaningless.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending and deactivated
 */
int timer_delete(struct timer_list *timer)
{
        return __timer_delete(timer, false);
}
EXPORT_SYMBOL(timer_delete);

/**
 * timer_shutdown - Deactivate a timer and prevent rearming
 * @timer:        The timer to be deactivated
 *
 * The function does not wait for an eventually running timer callback on a
 * different CPU but it prevents rearming of the timer. Any attempt to arm
 * @timer after this function returns will be silently ignored.
 *
 * This function is useful for teardown code and should only be used when
 * timer_shutdown_sync() cannot be invoked due to locking or context constraints.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown(struct timer_list *timer)
{
        return __timer_delete(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown);

/**
 * __try_to_del_timer_sync - Internal function: Try to deactivate a timer
 * @timer:        Timer to deactivate
 * @shutdown:        If true, this indicates that the timer is about to be
 *                shutdown permanently.
 *
 * If @shutdown is true then @timer->function is set to NULL under the
 * timer base lock which prevents further rearming of the timer. Any
 * attempt to rearm @timer after this function returns will be silently
 * ignored.
 *
 * This function cannot guarantee that the timer cannot be rearmed
 * right after dropping the base lock if @shutdown is false. That
 * needs to be prevented by the calling code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown)
{
        struct timer_base *base;
        unsigned long flags;
        int ret;

        debug_assert_init(timer);

        base = lock_timer_base(timer, &flags);

        /*
         * A timer whose callback is currently running on this base cannot
         * be deactivated here; report that with -1 so the caller can wait
         * and retry. Otherwise dequeue it if it is pending.
         */
        if (base->running_timer == timer)
                ret = -1;
        else
                ret = detach_if_pending(timer, base, true);

        /* The shutdown marker is set in both cases, with the base lock held. */
        if (shutdown)
                timer->function = NULL;

        raw_spin_unlock_irqrestore(&base->lock, flags);

        return ret;
}

/**
 * try_to_del_timer_sync - Try to deactivate a timer
 * @timer:        Timer to deactivate
 *
 * This function tries to deactivate a timer. On success the timer is not
 * queued and the timer callback function is not running on any CPU.
 *
 * This function does not guarantee that the timer cannot be rearmed right
 * after dropping the base lock. That needs to be prevented by the calling
 * code if necessary.
 *
 * Return:
 * * %0  - The timer was not pending
 * * %1  - The timer was pending and deactivated
 * * %-1 - The timer callback function is running on a different CPU
 */
int try_to_del_timer_sync(struct timer_list *timer)
{
        return __try_to_del_timer_sync(timer, false);
}
EXPORT_SYMBOL(try_to_del_timer_sync);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT only: initialize the expiry lock of @base (init-time code). */
static __init void timer_base_init_expiry_lock(struct timer_base *base)
{
        spin_lock_init(&base->expiry_lock);
}

/*
 * PREEMPT_RT only: take @base->expiry_lock. del_timer_wait_running()
 * blocks on this lock to wait for a running timer callback.
 */
static inline void timer_base_lock_expiry(struct timer_base *base)
{
        spin_lock(&base->expiry_lock);
}

/* PREEMPT_RT only: release @base->expiry_lock taken by timer_base_lock_expiry(). */
static inline void timer_base_unlock_expiry(struct timer_base *base)
{
        spin_unlock(&base->expiry_lock);
}

/*
 * The counterpart to del_timer_wait_running().
 *
 * If there is a waiter for base->expiry_lock, then it was waiting for the
 * timer callback to finish. Drop expiry_lock and reacquire it. That allows
 * the waiter to acquire the lock and make progress.
 */
static void timer_sync_wait_running(struct timer_base *base)
        __releases(&base->lock) __releases(&base->expiry_lock)
        __acquires(&base->expiry_lock) __acquires(&base->lock)
{
        /* Nobody is waiting on the expiry lock: keep both locks held. */
        if (!atomic_read(&base->timer_waiters))
                return;

        /*
         * Drop base->lock and then expiry_lock so that a waiter can get
         * hold of expiry_lock, then take both again in reverse order.
         */
        raw_spin_unlock_irq(&base->lock);
        spin_unlock(&base->expiry_lock);
        spin_lock(&base->expiry_lock);
        raw_spin_lock_irq(&base->lock);
}

/*
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion, if the softirq thread on a remote CPU
 * got preempted, and it prevents a live lock when the task which tries to
 * delete a timer preempted the softirq thread running the timer callback
 * function.
 */
static void del_timer_wait_running(struct timer_list *timer)
{
        struct timer_base *base;
        u32 tf = READ_ONCE(timer->flags);

        /* Timers which are migrating or irqsafe are not waited for here. */
        if (tf & (TIMER_MIGRATING | TIMER_IRQSAFE))
                return;

        base = get_timer_base(tf);

        /*
         * Announce a waiter on the base, then block on the expiry lock
         * which the softirq holds across the timer callback. Once it has
         * been acquired, retract the waiter count and release the lock
         * right away so the softirq can move on to the next timer. The
         * timer may in theory be running again by then; that is very
         * unlikely and merely costs the caller one more wait iteration.
         */
        atomic_inc(&base->timer_waiters);
        spin_lock_bh(&base->expiry_lock);
        atomic_dec(&base->timer_waiters);
        spin_unlock_bh(&base->expiry_lock);
}
#else
/*
 * !CONFIG_PREEMPT_RT: the expiry lock helpers and the wait-for-running
 * helpers are empty stubs, so their callers need no conditional code.
 */
static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
static inline void timer_base_lock_expiry(struct timer_base *base) { }
static inline void timer_base_unlock_expiry(struct timer_base *base) { }
static inline void timer_sync_wait_running(struct timer_base *base) { }
static inline void del_timer_wait_running(struct timer_list *timer) { }
#endif

/**
 * __timer_delete_sync - Internal function: Deactivate a timer and wait
 *                         for the handler to finish.
 * @timer:        The timer to be deactivated
 * @shutdown:        If true, @timer->function will be set to NULL under the
 *                timer base lock which prevents rearming of @timer
 *
 * If @shutdown is not set the timer can be rearmed later. If the timer can
 * be rearmed concurrently, i.e. after dropping the base lock then the
 * return value is meaningless.
 *
 * If @shutdown is set then @timer->function is set to NULL under timer
 * base lock which prevents rearming of the timer. Any attempt to rearm
 * a shutdown timer is silently ignored.
 *
 * If the timer should be reused after shutdown it has to be initialized
 * again.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
static int __timer_delete_sync(struct timer_list *timer, bool shutdown)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * Acquire and immediately release the timer's lockdep map with
         * interrupts disabled. A lockdep backtrace pointing here means
         * that the synchronization rules documented at
         * timer_delete_sync() have been violated.
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * Calling this from hard interrupt context can deadlock, so it is
         * only tolerated for irqsafe timers.
         */
        WARN_ON(in_hardirq() && !(timer->flags & TIMER_IRQSAFE));

        /*
         * The slowpath in del_timer_wait_running() requires that the
         * caller is able to sleep on PREEMPT_RT.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
                lockdep_assert_preemption_enabled();

        /* Retry for as long as the callback is found running. */
        for (;;) {
                ret = __try_to_del_timer_sync(timer, shutdown);

                if (unlikely(ret < 0)) {
                        del_timer_wait_running(timer);
                        cpu_relax();
                        continue;
                }

                return ret;
        }
}

/**
 * timer_delete_sync - Deactivate a timer and wait for the handler to finish.
 * @timer:        The timer to be deactivated
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's callback
 * function. The timer's handler must not call add_timer_on(). Upon exit
 * the timer is not queued and the handler is not running on any CPU.
 *
 * For !irqsafe timers, the caller must not hold locks that are held in
 * interrupt context. Even if the lock has nothing to do with the timer in
 * question.  Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                       call_timer_fn();
 *                                       base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    timer_delete_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now timer_delete_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but it has
 * interrupted the softirq that CPU0 is waiting to finish.
 *
 * This function cannot guarantee that the timer is not rearmed again by
 * some concurrent or preempting code, right after it dropped the base
 * lock. If there is the possibility of a concurrent rearm then the return
 * value of the function is meaningless.
 *
 * If such a guarantee is needed, e.g. for teardown situations then use
 * timer_shutdown_sync() instead.
 *
 * Return:
 * * %0        - The timer was not pending
 * * %1        - The timer was pending and deactivated
 */
int timer_delete_sync(struct timer_list *timer)
{
        return __timer_delete_sync(timer, false);
}
EXPORT_SYMBOL(timer_delete_sync);

/**
 * timer_shutdown_sync - Shutdown a timer and prevent rearming
 * @timer: The timer to be shutdown
 *
 * When the function returns it is guaranteed that:
 *   - @timer is not queued
 *   - The callback function of @timer is not running
 *   - @timer cannot be enqueued again. Any attempt to rearm
 *     @timer is silently ignored.
 *
 * See timer_delete_sync() for synchronization rules.
 *
 * This function is useful for final teardown of an infrastructure where
 * the timer is subject to a circular dependency problem.
 *
 * A common pattern for this is a timer and a workqueue where the timer can
 * schedule work and work can arm the timer. On shutdown the workqueue must
 * be destroyed and the timer must be prevented from rearming. Unless the
 * code has conditionals like 'if (mything->in_shutdown)' to prevent that
 * there is no way to get this correct with timer_delete_sync().
 *
 * timer_shutdown_sync() is solving the problem. The correct ordering of
 * calls in this case is:
 *
 *        timer_shutdown_sync(&mything->timer);
 *        workqueue_destroy(&mything->workqueue);
 *
 * After this 'mything' can be safely freed.
 *
 * This obviously implies that the timer is not required to be functional
 * for the rest of the shutdown operation.
 *
 * Return:
 * * %0 - The timer was not pending
 * * %1 - The timer was pending
 */
int timer_shutdown_sync(struct timer_list *timer)
{
        return __timer_delete_sync(timer, true);
}
EXPORT_SYMBOL_GPL(timer_shutdown_sync);

static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        /* Preempt count on entry, compared against the value after fn(). */
        int entry_count = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * The callback is allowed to free the timer it was invoked for.
         * Lockdep must cope with that as well, so work on a private copy
         * of the map. That avoids bogus "held lock freed" warnings and
         * avoids looking into timer->lockdep_map after the callback ran.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Hold the lock_map across fn() so that this lock chain is
         * coupled with the one established in timer_delete_sync().
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        /* Balanced preempt count: nothing left to do. */
        if (preempt_count() == entry_count)
                return;

        WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                  fn, entry_count, preempt_count());
        /*
         * Put the preempt count back to the entry value. That gives a
         * decent chance to survive and extract information. If the
         * callback left a lock held, bad luck, but that is no worse than
         * the BUG() which used to be here.
         */
        preempt_count_set(entry_count);
}

/*
 * Run the callbacks of all timers queued on @head, one at a time. Each
 * callback is invoked with base->lock dropped; the lock is held again
 * when the function returns.
 */
static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * This value is required only for tracing. base->clk was
         * incremented directly before expire_timers was called. But expiry
         * is related to the old base->clk value.
         */
        unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer;
                void (*fn)(struct timer_list *);

                timer = hlist_entry(head->first, struct timer_list, entry);

                /*
                 * Mark the timer as running before it is taken off the
                 * list. __try_to_del_timer_sync() compares against
                 * base->running_timer with the base lock held.
                 */
                base->running_timer = timer;
                detach_timer(timer, true);

                /* Sample the callback while base->lock is still held. */
                fn = timer->function;

                if (WARN_ON_ONCE(!fn)) {
                        /* Should never happen. Emphasis on should! */
                        base->running_timer = NULL;
                        continue;
                }

                if (timer->flags & TIMER_IRQSAFE) {
                        /*
                         * Irqsafe timer: only the lock is dropped around
                         * the callback, the plain unlock/lock variants
                         * are used.
                         */
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                } else {
                        /*
                         * Regular timer: the _irq unlock/lock variants
                         * are used around the callback.
                         */
                        raw_spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
                        base->running_timer = NULL;
                        /*
                         * Let a task blocked in del_timer_wait_running()
                         * make progress. Empty stub unless PREEMPT_RT.
                         */
                        timer_sync_wait_running(base);
                }
        }
}

/*
 * Advance base->clk to base->next_expiry and move every pending bucket
 * which matches that clock, level by level, onto consecutive entries of
 * @heads. Returns the number of entries of @heads that were filled.
 */
static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long lvl_clk;
        unsigned int bucket;
        int lvl, collected = 0;

        base->clk = base->next_expiry;
        lvl_clk = base->clk;

        /* On each further level the clock is shifted to that level's granularity. */
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, lvl_clk >>= LVL_CLK_SHIFT) {
                bucket = (lvl_clk & LVL_MASK) + lvl * LVL_SIZE;

                if (__test_and_clear_bit(bucket, base->pending_map)) {
                        hlist_move_list(base->vectors + bucket, heads++);
                        collected++;
                }

                /* Higher levels are only due when the low clock bits are all zero. */
                if (lvl_clk & LVL_CLK_MASK)
                        break;
        }

        return collected;
}

/*
 * Find the next pending bucket of a level. Search from level start (@offset)
 * + @clk upwards and if nothing there, search from start of the level
 * (@offset) up to @offset + clk.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned first = offset + clk;
        unsigned lvl_end = offset + LVL_SIZE;
        unsigned bit;

        /* First pass: from the current position up to the end of the level. */
        bit = find_next_bit(base->pending_map, lvl_end, first);
        if (bit < lvl_end)
                return bit - first;

        /* Second pass: wrap around and scan from the level start up to the current position. */
        bit = find_next_bit(base->pending_map, first, offset);
        if (bit < first)
                return bit + LVL_SIZE - first;

        /* Nothing pending in this level. */
        return -1;
}

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry.
 */
static void timer_recalc_next_expiry(struct timer_base *base)
{
        unsigned long clk, next, adj;
        unsigned lvl, offset = 0;

        /*
         * Start from the farthest value. If no level has a pending bucket
         * this value survives the loop and marks the base as having no
         * pending timers at the end of the function.
         */
        next = base->clk + NEXT_TIMER_MAX_DELTA;
        clk = base->clk;
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                /* Bucket distance to the next pending bucket, -1 if the level is empty. */
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
                unsigned long lvl_clk = clk & LVL_CLK_MASK;

                if (pos >= 0) {
                        unsigned long tmp = clk + (unsigned long) pos;

                        /* Scale by the level's shift before comparing against @next. */
                        tmp <<= LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * If the next expiration happens before we reach
                         * the next level, no need to check further.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }
                /*
                 * Clock for the next level. If the current level clock lower
                 * bits are zero, we look at the next level as is. If not we
                 * need to advance it by one because that's going to be the
                 * next expiring bucket in that level. base->clk is the next
                 * expiring jiffy. So in case of:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * we have to look at all levels @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * LVL0 has the next expiring bucket @index 2. The upper
                 * levels have the next expiring bucket @index 1.
                 *
                 * In case that the propagation wraps the next level the same
                 * rules apply:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * So after looking at LVL0 we get:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * So no propagation from LVL1 to LVL2 because that happened
                 * with the add already, but then we need to propagate further
                 * from LVL2 to LVL3.
                 *
                 * So the simple check whether the lower bits of the current
                 * level are 0 or not is sufficient for all cases.
                 */
                adj = lvl_clk ? 1 : 0;
                clk >>= LVL_CLK_SHIFT;
                clk += adj;
        }

        /*
         * Store the result and clear the recalc request. timers_pending
         * ends up false exactly when @next still holds the initial
         * base->clk + NEXT_TIMER_MAX_DELTA value.
         */
        WRITE_ONCE(base->next_expiry, next);
        base->next_expiry_recalc = false;
        base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Check, if the next hrtimer event is before the next timer wheel
 * event:
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        u64 hr_next = hrtimer_get_next_event();

        /*
         * The timer wheel event is not later than the next hrtimer event.
         * With high resolution timers enabled hrtimer_get_next_event()
         * returns KTIME_MAX, so this branch is taken.
         */
        if (hr_next >= expires)
                return expires;

        /*
         * The hrtimer event is already due: hand back the tick base time
         * so that the tick fires immediately.
         */
        if (hr_next <= basem)
                return basem;

        /*
         * High resolution timers are off, so hrtimers are expired from
         * the tick. Round up to the next jiffy to make sure that this
         * tick really expires the timer; otherwise the nohz stop code
         * would ping pong.
         *
         * DIV_ROUND_UP_ULL is used to keep gcc from calling __divdi3.
         */
        return DIV_ROUND_UP_ULL(hr_next, TICK_NSEC) * TICK_NSEC;
}

/*
 * Return the next expiry of @base in jiffies, recalculating it first if
 * that was requested. @basej is the current base time in jiffies.
 */
static unsigned long next_timer_interrupt(struct timer_base *base,
                                          unsigned long basej)
{
        if (base->next_expiry_recalc)
                timer_recalc_next_expiry(base);

        if (base->timers_pending)
                return base->next_expiry;

        /*
         * Empty base: push next_expiry out into the future. This avoids
         * raising the timer softirq for nothing once the stale value is
         * reached, and keeps the next_expiry values of different bases
         * directly comparable when looking for the first pending timer.
         */
        WRITE_ONCE(base->next_expiry, basej + NEXT_TIMER_MAX_DELTA);

        return base->next_expiry;
}

/*
 * Determine the first expiry (in jiffies) of the local and global base
 * and store the matching clock monotonic values into @tevt. Fields of
 * @tevt which are not written keep the value supplied by the caller.
 */
static unsigned long fetch_next_timer_interrupt(unsigned long basej, u64 basem,
                                                struct timer_base *base_local,
                                                struct timer_base *base_global,
                                                struct timer_events *tevt)
{
        unsigned long exp_local, exp_global, first;
        bool local_first;

        exp_local = next_timer_interrupt(base_local, basej);
        exp_global = next_timer_interrupt(base_global, basej);

        /* On a tie the local base wins. */
        local_first = time_before_eq(exp_local, exp_global);
        if (local_first)
                first = exp_local;
        else
                first = exp_global;

        if (!time_before_eq(first, basej + 1)) {
                /*
                 * First event is more than one tick away. Update the
                 * tevt.* values:
                 *
                 * When the local queue expires first the global event
                 * can be ignored. An empty queue leaves its field alone.
                 */
                if (!local_first && base_global->timers_pending)
                        tevt->global = basem + (u64)(exp_global - basej) * TICK_NSEC;

                if (base_local->timers_pending)
                        tevt->local = basem + (u64)(exp_local - basej) * TICK_NSEC;

                return first;
        }

        /*
         * The first event is at most one tick away: use it and store it
         * as the local expiry value.
         */

        /* A tick was missed already, force a delta of 0 */
        if (time_before(first, basej))
                first = basej;
        tevt->local = basem + (u64)(first - basej) * TICK_NSEC;

        /*
         * Only the remote check needs this, but doing it for both call
         * sites does no harm:
         *
         * * Remote callers only take care of global timers, local timers
         *   are handled by the CPU itself. Without propagating the
         *   already missed first global timer into tevt->global it
         *   could be missed completely.
         *
         * * Local callers ignore tevt->global anyway when the first
         *   event is at most one tick away.
         */
        if (!local_first)
                tevt->global = tevt->local;

        return first;
}

# ifdef CONFIG_SMP
/**
 * fetch_next_timer_interrupt_remote() - Store next timers into @tevt
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @tevt:        Pointer to the storage for the expiry values
 * @cpu:        Remote CPU
 *
 * Stores the next pending local and global timer expiry values in the
 * struct pointed to by @tevt. If a queue is empty the corresponding
 * field is set to KTIME_MAX. If local event expires before global
 * event, global event is set to KTIME_MAX as well.
 *
 * Caller needs to make sure timer base locks are held (use
 * timer_lock_remote_bases() for this purpose).
 */
void fetch_next_timer_interrupt_remote(unsigned long basej, u64 basem,
                                       struct timer_events *tevt,
                                       unsigned int cpu)
{
        struct timer_base *local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        /* Both base locks of the remote CPU have to be held by the caller. */
        lockdep_assert_held(&local->lock);
        lockdep_assert_held(&global->lock);

        /* Start out with "no event" for both queues. */
        tevt->global = KTIME_MAX;
        tevt->local = KTIME_MAX;

        fetch_next_timer_interrupt(basej, basem, local, global, tevt);
}

/**
 * timer_unlock_remote_bases - unlock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Unlocks the remote timer bases.
 */
void timer_unlock_remote_bases(unsigned int cpu)
        __releases(timer_bases[BASE_LOCAL]->lock)
        __releases(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        /* Release in the reverse order of timer_lock_remote_bases(). */
        raw_spin_unlock(&global->lock);
        raw_spin_unlock(&local->lock);
}

/**
 * timer_lock_remote_bases - lock timer bases of cpu
 * @cpu:        Remote CPU
 *
 * Locks the remote timer bases.
 */
void timer_lock_remote_bases(unsigned int cpu)
        __acquires(timer_bases[BASE_LOCAL]->lock)
        __acquires(timer_bases[BASE_GLOBAL]->lock)
{
        struct timer_base *local = per_cpu_ptr(&timer_bases[BASE_LOCAL], cpu);
        struct timer_base *global = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        /* The plain lock variants are used, so interrupts must be off already. */
        lockdep_assert_irqs_disabled();

        /* Local base first, the global base nested inside of it. */
        raw_spin_lock(&local->lock);
        raw_spin_lock_nested(&global->lock, SINGLE_DEPTH_NESTING);
}

/**
 * timer_base_is_idle() - Return whether timer base is set idle
 *
 * Return: The is_idle value of the local timer base of this CPU.
 */
bool timer_base_is_idle(void)
{
        /* Idle state is read from the local base of the current CPU. */
        bool idle = __this_cpu_read(timer_bases[BASE_LOCAL].is_idle);

        return idle;
}

static void __run_timer_base(struct timer_base *base);

/**
 * timer_expire_remote() - expire global timers of cpu
 * @cpu:        Remote CPU
 *
 * Expire timers of global base of remote CPU.
 */
void timer_expire_remote(unsigned int cpu)
{
        struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_GLOBAL], cpu);

        __run_timer_base(base);
}

/*
 * Feed the next global event into the timer migration hierarchy and, if
 * the hierarchy hands back an event which is earlier than the local one,
 * update *@nextevt and @tevt->local accordingly.
 */
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        u64 next_tmigr, delta_ticks;

        /*
         * Select the hierarchy function by call site. Note that the
         * @tick_stop_path pointer itself is tested, not the value it
         * points to.
         */
        if (timer_base_idle)
                next_tmigr = tmigr_cpu_new_timer(tevt->global);
        else if (tick_stop_path)
                next_tmigr = tmigr_cpu_deactivate(tevt->global);
        else
                next_tmigr = tmigr_quick_check(tevt->global);

        /*
         * next_tmigr == KTIME_MAX if other CPUs are still active. Only
         * when this CPU is the last one going idle in the timer migration
         * hierarchy does it have to wake up in time for remote timers.
         */
        if (next_tmigr >= tevt->local)
                return;

        /* A tick was missed already, force a delta of 0 */
        if (next_tmigr < basem)
                next_tmigr = basem;

        delta_ticks = div_u64(next_tmigr - basem, TICK_NSEC);

        *nextevt = basej + (unsigned long)delta_ticks;
        tevt->local = next_tmigr;
}
# else
static void timer_use_tmigr(unsigned long basej, u64 basem,
                            unsigned long *nextevt, bool *tick_stop_path,
                            bool timer_base_idle, struct timer_events *tevt)
{
        /*
         * On !SMP systems the earlier of the two events has to end up in
         * tevt->local, otherwise a timer could be missed.
         */
        u64 first = min_t(u64, tevt->local, tevt->global);

        tevt->local = first;
}
# endif /* CONFIG_SMP */

/*
 * Common worker for get_next_timer_interrupt() and
 * timer_base_try_to_set_idle().
 *
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @idle:        NULL when only the next event is queried; otherwise the idle
 *                state of the local and global base may be updated and
 *                base_local->is_idle is stored in *idle
 *
 * The locks of the local and of the global base of this CPU are held while
 * the next event is evaluated. If this CPU is offline, a warning is issued
 * once, *idle (if provided) is set to true and KTIME_MAX is returned.
 *
 * Otherwise returns the result of cmp_next_hrtimer_event() for the first
 * local event.
 */
static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem,
                                             bool *idle)
{
        struct timer_events tevt = { .local = KTIME_MAX, .global = KTIME_MAX };
        struct timer_base *base_local, *base_global;
        unsigned long nextevt;
        bool idle_is_possible;

        /*
         * When the CPU is offline, the tick is cancelled and nothing is supposed
         * to try to stop it.
         */
        if (WARN_ON_ONCE(cpu_is_offline(smp_processor_id()))) {
                if (idle)
                        *idle = true;
                return tevt.local;
        }

        base_local = this_cpu_ptr(&timer_bases[BASE_LOCAL]);
        base_global = this_cpu_ptr(&timer_bases[BASE_GLOBAL]);

        /* Lock order: local base first, then the global base (nested) */
        raw_spin_lock(&base_local->lock);
        raw_spin_lock_nested(&base_global->lock, SINGLE_DEPTH_NESTING);

        nextevt = fetch_next_timer_interrupt(basej, basem, base_local,
                                             base_global, &tevt);

        /*
         * If the next event is only one jiffy ahead there is no need to call
         * timer migration hierarchy related functions. The value for the next
         * global timer in @tevt struct equals then KTIME_MAX. This is also
         * true, when the timer base is idle.
         *
         * The proper timer migration hierarchy function depends on the callsite
         * and whether timer base is idle or not. @nextevt will be updated when
         * this CPU needs to handle the first timer migration hierarchy
         * event. See timer_use_tmigr() for detailed information.
         */
        idle_is_possible = time_after(nextevt, basej + 1);
        if (idle_is_possible)
                timer_use_tmigr(basej, basem, &nextevt, idle,
                                base_local->is_idle, &tevt);

        /*
         * We have a fresh next event. Check whether we can forward the
         * base.
         */
        __forward_timer_base(base_local, basej);
        __forward_timer_base(base_global, basej);

        /*
         * Set base->is_idle only when caller is timer_base_try_to_set_idle()
         */
        if (idle) {
                /*
                 * Bases are idle if the next event is more than a tick
                 * away. Caution: @nextevt could have changed by enqueueing a
                 * global timer into timer migration hierarchy. Therefore a new
                 * check is required here.
                 *
                 * If the base is marked idle then any timer add operation must
                 * forward the base clk itself to keep granularity small. This
                 * idle logic is only maintained for the BASE_LOCAL and
                 * BASE_GLOBAL base, deferrable timers may still see large
                 * granularity skew (by design).
                 */
                if (!base_local->is_idle && time_after(nextevt, basej + 1)) {
                        base_local->is_idle = true;
                        /*
                         * Global timers queued locally while running in a task
                         * in nohz_full mode need a self-IPI to kick reprogramming
                         * in IRQ tail.
                         */
                        if (tick_nohz_full_cpu(base_local->cpu))
                                base_global->is_idle = true;
                        trace_timer_base_idle(true, base_local->cpu);
                }
                /* Report the resulting idle state of the local base to the caller */
                *idle = base_local->is_idle;

                /*
                 * When timer base is not set idle, undo the effect of
                 * tmigr_cpu_deactivate() to prevent inconsistent states - active
                 * timer base but inactive timer migration hierarchy.
                 *
                 * When timer base was already marked idle, nothing will be
                 * changed here.
                 */
                if (!base_local->is_idle && idle_is_possible)
                        tmigr_cpu_activate();
        }

        /* Release in reverse order of acquisition */
        raw_spin_unlock(&base_global->lock);
        raw_spin_unlock(&base_local->lock);

        return cmp_next_hrtimer_event(basem, tevt.local);
}

/**
 * get_next_timer_interrupt() - return the time (clock mono) of the next timer
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 *
 * Query-only variant: no idle pointer is handed to the common worker, so the
 * idle state of the timer bases is not modified.
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. If timer of global base was queued into
 * timer migration hierarchy, first global timer is not taken into account. If
 * it was the last CPU of timer migration hierarchy going idle, first global
 * event is taken into account.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        bool *no_idle_update = NULL;

        return __get_next_timer_interrupt(basej, basem, no_idle_update);
}

/**
 * timer_base_try_to_set_idle() - Try to set the idle state of the timer bases
 * @basej:        base time jiffies
 * @basem:        base time clock monotonic
 * @idle:        in: whether the tick was already stopped;
 *                out: the value of timer_base->is_idle (only written when the
 *                tick was not already stopped)
 *
 * Returns the tick aligned clock monotonic time of the next pending timer or
 * KTIME_MAX if no timer is pending. When tick was already stopped KTIME_MAX is
 * returned as well.
 */
u64 timer_base_try_to_set_idle(unsigned long basej, u64 basem, bool *idle)
{
        /* Only evaluate the bases when the tick is not stopped yet */
        if (!*idle)
                return __get_next_timer_interrupt(basej, basem, idle);

        return KTIME_MAX;
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        const int this_cpu = smp_processor_id();

        /*
         * The idle flags are cleared without taking the base lock. The
         * worst outcome is a remote pinned timer enqueue sending a
         * pointless IPI. Taking the lock would only shrink the window for
         * that IPI by a few instructions, at the cost of a lock
         * acquisition in the exit from idle path.
         *
         * The flag of the global base is cleared on nohz_full CPUs only.
         */
        __this_cpu_write(timer_bases[BASE_LOCAL].is_idle, false);
        if (tick_nohz_full_cpu(this_cpu))
                __this_cpu_write(timer_bases[BASE_GLOBAL].is_idle, false);
        trace_timer_base_idle(false, this_cpu);

        /* Activate without holding the timer_base->lock */
        tmigr_cpu_activate();
}
#endif

/**
 * __run_timers - run all expired timers (if any) of a timer base.
 * @base: the timer vector to be processed.
 *
 * Must be called with @base->lock held. Does nothing when
 * @base->running_timer is set.
 */
static inline void __run_timers(struct timer_base *base)
{
        struct hlist_head heads[LVL_DEPTH];
        int levels;

        lockdep_assert_held(&base->lock);

        if (base->running_timer)
                return;

        /*
         * Advance base->clk one step at a time for as long as both
         * base->clk and base->next_expiry have been reached by jiffies.
         */
        while (time_after_eq(jiffies, base->clk) &&
               time_after_eq(jiffies, base->next_expiry)) {
                levels = collect_expired_timers(base, heads);
                /*
                 * The two possible reasons for not finding any expired
                 * timer at this clk are that all matching timers have been
                 * dequeued or no timer has been queued since
                 * base::next_expiry was set to base::clk +
                 * NEXT_TIMER_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                /*
                 * While executing timers, base->clk is set 1 offset ahead of
                 * jiffies to avoid endless requeuing to current jiffies.
                 */
                base->clk++;
                timer_recalc_next_expiry(base);

                /* Expire the collected lists, highest index in heads[] first */
                while (levels--)
                        expire_timers(base, heads + levels);
        }
}

/*
 * Run the expired timers of @base, if its next expiry has been reached.
 */
static void __run_timer_base(struct timer_base *base)
{
        /*
         * Lockless pre-check. It can race against a remote CPU updating
         * next_expiry under the lock.
         */
        if (time_after_eq(jiffies, READ_ONCE(base->next_expiry))) {
                timer_base_lock_expiry(base);
                raw_spin_lock_irq(&base->lock);
                __run_timers(base);
                raw_spin_unlock_irq(&base->lock);
                timer_base_unlock_expiry(base);
        }
}

static void run_timer_base(int index)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[index]);

        __run_timer_base(base);
}

/*
 * This function runs timers and the timer-tq in bottom half context.
 *
 * The local base is always processed. The global and the deferrable base,
 * as well as remote timer handling, are only processed when
 * CONFIG_NO_HZ_COMMON is enabled.
 */
static __latent_entropy void run_timer_softirq(void)
{
        run_timer_base(BASE_LOCAL);

        if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                return;

        run_timer_base(BASE_GLOBAL);
        run_timer_base(BASE_DEF);

        if (is_timers_nohz_active())
                tmigr_handle_remote();
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 *
 * Runs the hrtimer queues and raises the timer softirq at most once, as soon
 * as one of this CPU's timer bases has reached its next expiry or remote
 * timer handling is required.
 */
static void run_local_timers(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_LOCAL]);

        hrtimer_run_queues();

        /* @base starts at BASE_LOCAL and is advanced together with @i */
        for (int i = 0; i < NR_BASES; i++, base++) {
                /*
                 * Raise the softirq only if required.
                 *
                 * timer_base::next_expiry can be written by a remote CPU while
                 * holding the lock. If this write happens at the same time as
                 * the lockless local read, a sanity checker could complain
                 * about data corruption.
                 *
                 * There are two possible situations where
                 * timer_base::next_expiry is written by a remote CPU:
                 *
                 * 1. Remote CPU expires global timers of this CPU and updates
                 * timer_base::next_expiry of BASE_GLOBAL afterwards in
                 * next_timer_interrupt() or timer_recalc_next_expiry(). The
                 * worst outcome is a superfluous raise of the timer softirq
                 * when the not yet updated value is read.
                 *
                 * 2. A new first pinned timer is enqueued by a remote CPU
                 * and therefore timer_base::next_expiry of BASE_LOCAL is
                 * updated. When this update is missed, this isn't a
                 * problem, as an IPI is executed nevertheless when the CPU
                 * was idle before. When the CPU wasn't idle but the update
                 * is missed, then the timer would expire one jiffy late -
                 * bad luck.
                 *
                 * Those unlikely corner cases, where the worst outcome is only
                 * a one jiffy delay or a superfluous raise of the softirq, are
                 * not as expensive as always doing the check while holding
                 * the lock.
                 *
                 * Possible remote writers are using WRITE_ONCE(). The local
                 * reader therefore uses READ_ONCE().
                 */
                if (time_after_eq(jiffies, READ_ONCE(base->next_expiry)) ||
                    (i == BASE_DEF && tmigr_requires_handle_remote())) {
                        raise_timer_softirq(TIMER_SOFTIRQ);
                        return;
                }
        }
}

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process.  user_tick is 1 if the tick is user time, 0 for system.
 *
 * Besides the accounting, the per tick work of the timers, RCU, irq_work,
 * the scheduler and (if enabled) the POSIX CPU timers is invoked from here.
 */
void update_process_times(int user_tick)
{
        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(current, user_tick);

        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif
        sched_tick();

        if (!IS_ENABLED(CONFIG_POSIX_TIMERS))
                return;

        run_posix_cpu_timers();
}

#ifdef CONFIG_HOTPLUG_CPU
/*
 * Drain the list @head: detach every timer on it, retarget the timer to the
 * CPU of @new_base and enqueue it there.
 */
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        const int target_cpu = new_base->cpu;

        while (!hlist_empty(head)) {
                struct timer_list *first;

                first = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(first, false);
                /* Replace the base bits of the flags in a single store */
                first->flags = (first->flags & ~TIMER_BASEMASK) | target_cpu;
                internal_add_timer(new_base, first);
        }
}

/*
 * Reset all timer bases of @cpu: clk is set to the current jiffies,
 * next_expiry to clk + NEXT_TIMER_MAX_DELTA and the recalc, pending and idle
 * state is cleared. Always returns 0.
 */
int timers_prepare_cpu(unsigned int cpu)
{
        for (int idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *base = per_cpu_ptr(&timer_bases[idx], cpu);

                /* next_expiry is derived from the clk value stored just before */
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;

                base->next_expiry_recalc = false;
                base->timers_pending = false;
                base->is_idle = false;
        }

        return 0;
}

/*
 * Move all timers which are still queued on the bases of @cpu to the
 * corresponding bases of the CPU executing this function. Always returns 0.
 */
int timers_dead_cpu(unsigned int cpu)
{
        struct timer_base *old_base;
        struct timer_base *new_base;
        int b, i;

        for (b = 0; b < NR_BASES; b++) {
                old_base = per_cpu_ptr(&timer_bases[b], cpu);
                /* Paired with put_cpu_ptr() at the end of this iteration */
                new_base = get_cpu_ptr(&timer_bases[b]);
                /*
                 * The caller is globally serialized and nobody else
                 * takes two locks at once, deadlock is not possible.
                 */
                raw_spin_lock_irq(&new_base->lock);
                raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The current CPUs base clock might be stale. Update it
                 * before moving the timers over.
                 */
                forward_timer_base(new_base);

                /* A running timer on the old base is unexpected; clear it anyway */
                WARN_ON_ONCE(old_base->running_timer);
                old_base->running_timer = NULL;

                /* Walk every wheel vector of the old base */
                for (i = 0; i < WHEEL_SIZE; i++)
                        migrate_timer_list(new_base, old_base->vectors + i);

                raw_spin_unlock(&old_base->lock);
                raw_spin_unlock_irq(&new_base->lock);
                put_cpu_ptr(&timer_bases);
        }
        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot time setup of all timer bases of @cpu: owner CPU, base lock, clk,
 * next_expiry and the expiry lock.
 */
static void __init init_timer_cpu(int cpu)
{
        for (int idx = 0; idx < NR_BASES; idx++) {
                struct timer_base *base = per_cpu_ptr(&timer_bases[idx], cpu);

                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);

                /* next_expiry is derived from the clk value stored just before */
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;

                timer_base_init_expiry_lock(base);
        }
}

/* Initialize the timer bases of every possible CPU. */
static void __init init_timer_cpus(void)
{
        int cpu_id;

        for_each_possible_cpu(cpu_id) {
                init_timer_cpu(cpu_id);
        }
}

/*
 * Boot time initialization of the timer wheel: set up the timer bases of all
 * possible CPUs, call posix_cputimers_init_work() and hand
 * run_timer_softirq() to open_softirq() for TIMER_SOFTIRQ.
 */
void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}













































































































































































































  254 


























  255 

























  255 































  254 


































































































































































































  255 



  255 

  255 



















































































































































































































































































































































































































































































































































































































  255 
  255 
  254 































  255 










  255 












  255 
























  255 


















































































  255 

  255 








  255 
















































  255 









  255 





  255 


















  365 










  365 





  255 






















































































































  175 





  175 

























































































   72 













   72 


























   72 





   72 




















   72 






































  365 

  254 



  175 



















































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
// SPDX-License-Identifier: GPL-2.0
/*
 *  Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de>
 *  Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
 *  Copyright(C) 2006-2007  Timesys Corp., Thomas Gleixner
 *
 *  High-resolution kernel timers
 *
 *  In contrast to the low-resolution timeout API, aka timer wheel,
 *  hrtimers provide finer resolution and accuracy depending on system
 *  configuration and capabilities.
 *
 *  Started by: Thomas Gleixner and Ingo Molnar
 *
 *  Credits:
 *        Based on the original timer wheel code
 *
 *        Help, testing, suggestions, bugfixes, improvements were
 *        provided by:
 *
 *        George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
 *        et. al.
 */

#include <linux/cpu.h>
#include <linux/export.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
#include <linux/notifier.h>
#include <linux/syscalls.h>
#include <linux/interrupt.h>
#include <linux/tick.h>
#include <linux/err.h>
#include <linux/debugobjects.h>
#include <linux/sched/signal.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/sched/nohz.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/timer.h>
#include <linux/freezer.h>
#include <linux/compat.h>

#include <linux/uaccess.h>

#include <trace/events/timer.h>

#include "tick-internal.h"

/*
 * Masks for selecting the soft and hard context timers from
 * cpu_base->active
 *
 * HRTIMER_ACTIVE_HARD covers the bits below MASK_SHIFT, HRTIMER_ACTIVE_SOFT
 * is the same mask shifted up by MASK_SHIFT and HRTIMER_ACTIVE_ALL is the
 * union of both.
 */
#define MASK_SHIFT                (HRTIMER_BASE_MONOTONIC_SOFT)
#define HRTIMER_ACTIVE_HARD        ((1U << MASK_SHIFT) - 1)
#define HRTIMER_ACTIVE_SOFT        (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL        (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)

/* Forward declaration: referenced by the .csd initializer of hrtimer_bases. */
static void retrigger_next_event(void *arg);

/*
 * The timer bases:
 *
 * There are more clockids than hrtimer bases. Thus, we index
 * into the timer bases by the hrtimer_base_type enum. When trying
 * to reach a base using a clockid, hrtimer_clockid_to_base()
 * is used to convert from clockid to the proper hrtimer_base_type.
 *
 * Each clock base entry below sets its own index, its clockid and the
 * function used to read the current time of that clock.
 */
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
        .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
        .clock_base =
        {
                {
                        .index = HRTIMER_BASE_MONOTONIC,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
                /*
                 * The *_SOFT bases use the same clockids and get_time
                 * callbacks as the four bases above.
                 */
                {
                        .index = HRTIMER_BASE_MONOTONIC_SOFT,
                        .clockid = CLOCK_MONOTONIC,
                        .get_time = &ktime_get,
                },
                {
                        .index = HRTIMER_BASE_REALTIME_SOFT,
                        .clockid = CLOCK_REALTIME,
                        .get_time = &ktime_get_real,
                },
                {
                        .index = HRTIMER_BASE_BOOTTIME_SOFT,
                        .clockid = CLOCK_BOOTTIME,
                        .get_time = &ktime_get_boottime,
                },
                {
                        .index = HRTIMER_BASE_TAI_SOFT,
                        .clockid = CLOCK_TAI,
                        .get_time = &ktime_get_clocktai,
                },
        },
        /* Call single data which invokes retrigger_next_event() with a NULL argument */
        .csd = CSD_INIT(retrigger_next_event, NULL)
};

/*
 * Report whether @base is online. Without CONFIG_HOTPLUG_CPU this is always
 * true, otherwise base->online decides.
 */
static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
{
        if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
                return likely(base->online);

        return true;
}

/*
 * Functions and macros which are different for UP/SMP systems are kept in a
 * single place
 */
#ifdef CONFIG_SMP

/*
 * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
 * such that hrtimer_callback_running() can unconditionally dereference
 * timer->base->cpu_base
 */
static struct hrtimer_cpu_base migration_cpu_base = {
        /* Only clock_base[0] is initialized; migration_base aliases it. */
        .clock_base = { {
                /* Self reference, so base->cpu_base is valid for this placeholder. */
                .cpu_base = &migration_cpu_base,
                .seq      = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq,
                                                     &migration_cpu_base.lock),
        }, },
};

/* Placeholder stored in timer->base while switch_hrtimer_base() moves a timer. */
#define migration_base        migration_cpu_base.clock_base[0]

/*
 * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
 * means that all timers which are tied to this base via timer->base are
 * locked, and the base itself is locked too.
 *
 * So __run_timers/migrate_timers can safely modify all timers which could
 * be found on the lists/queues.
 *
 * When the timer's base is locked, and the timer removed from list, it is
 * possible to set timer->base = &migration_base and drop the lock: the timer
 * remains locked.
 */
static
struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
                                             unsigned long *flags)
        __acquires(&timer->base->lock)
{
        struct hrtimer_clock_base *base;

        /*
         * Retry until the cpu_base lock is held for the base the timer
         * is actually attached to. On return interrupts are disabled and
         * the previous interrupt state is stored in *flags.
         */
        for (;;) {
                /* Snapshot the base; it can change until the lock is held. */
                base = READ_ONCE(timer->base);
                /* &migration_base means a base switch is in progress: spin. */
                if (likely(base != &migration_base)) {
                        raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
                        /* Recheck under the lock that the snapshot is still current. */
                        if (likely(base == timer->base))
                                return base;
                        /* The timer has migrated to another CPU: */
                        raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
                }
                cpu_relax();
        }
}

/*
 * Check if the elected target is suitable considering its next
 * event and the hotplug state of the current CPU.
 *
 * If the elected target is remote and its next event is after the timer
 * to queue, then a remote reprogram is necessary. However there is no
 * guarantee the IPI handling the operation would arrive in time to meet
 * the high resolution deadline. In this case the local CPU becomes a
 * preferred target, unless it is offline.
 *
 * High and low resolution modes are handled the same way for simplicity.
 *
 * Called with cpu_base->lock of target cpu held.
 */
static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
                                    struct hrtimer_cpu_base *new_cpu_base,
                                    struct hrtimer_cpu_base *this_cpu_base)
{
        ktime_t expires;

        /*
         * Two cases need no expiry comparison:
         *
         *  - The target is the local CPU: its clockevent can be
         *    reprogrammed directly, and get_target_base() guarantees
         *    it is online.
         *
         *  - The local CPU is offline: it cannot serve as the fallback
         *    target, so the elected base is kept even if the timer
         *    expires before the target's next event. An IPI will be
         *    issued to reprogram it as a last resort.
         */
        if (new_cpu_base == this_cpu_base || !hrtimer_base_is_online(this_cpu_base))
                return true;

        /*
         * Remote target: suitable only if the timer does not expire
         * before the event the remote CPU is already programmed for.
         */
        expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);

        return !(expires < new_base->cpu_base->expires_next);
}

/*
 * Pick the cpu_base a timer should be queued on.
 *
 * @base:   cpu_base of the calling CPU
 * @pinned: non-zero when the timer must not be moved by timer migration
 *
 * An offline @base is replaced by the base of an online housekeeping
 * CPU. Otherwise @base is returned unless NOHZ timer migration is
 * enabled and the timer is not pinned.
 */
static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
        int cpu;

        if (hrtimer_base_is_online(base)) {
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
                if (static_branch_likely(&timers_migration_enabled) && !pinned)
                        return &per_cpu(hrtimer_bases, get_nohz_timer_target());
#endif
                return base;
        }

        /* @base is offline: fall back to an online housekeeping CPU. */
        cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));

        return &per_cpu(hrtimer_bases, cpu);
}

/*
 * We switch the timer base to a power-optimized selected CPU target,
 * if:
 *        - NO_HZ_COMMON is enabled
 *        - timer migration is enabled
 *        - the timer callback is not running
 *        - the timer is not the first expiring timer on the new target
 *
 * If one of the above requirements is not fulfilled we move the timer
 * to the current CPU or leave it on the previously assigned CPU if
 * the timer callback is currently running.
 */
static inline struct hrtimer_clock_base *
switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
{
        struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
        struct hrtimer_clock_base *new_base;
        /* The clock base index is kept; only the CPU may change. */
        int basenum = base->index;

        this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        new_cpu_base = get_target_base(this_cpu_base, pinned);
again:
        new_base = &new_cpu_base->clock_base[basenum];

        if (base != new_base) {
                /*
                 * We are trying to move timer to new_base.
                 * However we can't change timer's base while it is running,
                 * so we keep it on the same CPU. No hassle vs. reprogramming
                 * the event source in the high resolution case. The softirq
                 * code will take care of this when the timer function has
                 * completed. There is no conflict as we hold the lock until
                 * the timer is enqueued.
                 */
                if (unlikely(hrtimer_callback_running(timer)))
                        return base;

                /* See the comment in lock_hrtimer_base() */
                WRITE_ONCE(timer->base, &migration_base);
                /* Hand over from the old cpu_base lock to the new one. */
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);

                if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
                                             this_cpu_base)) {
                        /*
                         * Target rejected: retake the original lock,
                         * restore the original base and retry with the
                         * local CPU as target.
                         */
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
                        new_cpu_base = this_cpu_base;
                        WRITE_ONCE(timer->base, base);
                        goto again;
                }
                WRITE_ONCE(timer->base, new_base);
        } else {
                /* Same base and lock; only the target suitability is checked. */
                if (!hrtimer_suitable_target(timer, new_base,  new_cpu_base, this_cpu_base)) {
                        new_cpu_base = this_cpu_base;
                        goto again;
                }
        }
        /* The cpu_base lock of the returned base is held here. */
        return new_base;
}

#else /* CONFIG_SMP */

/*
 * !CONFIG_SMP variant: take the cpu_base lock of the timer's clock base
 * with interrupts disabled (previous state saved in *flags) and return
 * that clock base.
 */
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __acquires(&timer->base->cpu_base->lock)
{
        struct hrtimer_clock_base *clock_base;

        clock_base = timer->base;
        raw_spin_lock_irqsave(&clock_base->cpu_base->lock, *flags);

        return clock_base;
}

/* !CONFIG_SMP: no base switching, the current base @b is used as is. */
# define switch_hrtimer_base(t, b, p)        (b)

#endif        /* !CONFIG_SMP */

/*
 * Functions for the union type storage format of ktime_t which are
 * too large for inlining:
 */
#if BITS_PER_LONG < 64
/*
 * Divide a ktime value by a nanosecond value
 *
 * @kt:  dividend
 * @div: divisor in nanoseconds
 *
 * The division is done on the magnitude of @kt and the sign is applied
 * afterwards. A divisor which does not fit into 32 bits is shifted right
 * until it does and the dividend is shifted by the same amount, so the
 * result is an approximation in that case.
 *
 * NOTE(review): assumes @div is positive; the shift loop below does not
 * terminate for a negative divisor when >> is an arithmetic shift.
 */
s64 __ktime_divns(const ktime_t kt, s64 div)
{
        int sft = 0;
        s64 dclc;
        u64 tmp;

        dclc = ktime_to_ns(kt);
        /*
         * Negate in unsigned arithmetic: -dclc on the signed type
         * overflows for the most negative value.
         */
        tmp = dclc < 0 ? -(u64)dclc : (u64)dclc;

        /* Make sure the divisor is less than 2^32: */
        while (div >> 32) {
                sft++;
                div >>= 1;
        }
        tmp >>= sft;
        /* do_div() leaves the quotient in tmp. */
        do_div(tmp, (u32) div);
        return dclc < 0 ? -tmp : tmp;
}
EXPORT_SYMBOL_GPL(__ktime_divns);
#endif /* BITS_PER_LONG < 64 */

/*
 * Add two ktime values and do a safety check for overflow:
 */
ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs)
{
        const ktime_t sum = ktime_add_unsafe(lhs, rhs);

        /*
         * A sum which is negative or smaller than either operand is
         * treated as overflow. Saturate to KTIME_SEC_MAX, the maximum
         * timeout which can be returned to user space in a timespec.
         */
        if (sum < 0 || sum < lhs || sum < rhs)
                return ktime_set(KTIME_SEC_MAX, 0);

        return sum;
}

EXPORT_SYMBOL_GPL(ktime_add_safe);

#ifdef CONFIG_DEBUG_OBJECTS_TIMERS

static const struct debug_obj_descr hrtimer_debug_descr;

/* debug_hint callback: report the timer's callback function pointer. */
static void *hrtimer_debug_hint(void *addr)
{
        struct hrtimer *timer = addr;

        return ACCESS_PRIVATE(timer, function);
}

/*
 * fixup_init is called when:
 * - an active object is initialized
 */
static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        /* Only an active timer can be fixed up here. */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Cancel the timer first, then redo the initialization tracking. */
        hrtimer_cancel(timer);
        debug_object_init(timer, &hrtimer_debug_descr);

        return true;
}

/*
 * fixup_activate is called when:
 * - an active object is activated
 * - an unknown non-static object is activated
 */
static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state)
{
        /* Activating an already active timer is reported but never fixed up. */
        if (state == ODEBUG_STATE_ACTIVE)
                WARN_ON(1);

        return false;
}

/*
 * fixup_free is called when:
 * - an active object is freed
 */
static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
{
        struct hrtimer *timer = addr;

        /* Only an active timer can be fixed up here. */
        if (state != ODEBUG_STATE_ACTIVE)
                return false;

        /* Cancel the timer first, then drop it from object tracking. */
        hrtimer_cancel(timer);
        debug_object_free(timer, &hrtimer_debug_descr);

        return true;
}

/* Debug objects descriptor for hrtimers: name, hint and fixup callbacks. */
static const struct debug_obj_descr hrtimer_debug_descr = {
        .name                = "hrtimer",
        /* Returns the timer's callback function pointer. */
        .debug_hint        = hrtimer_debug_hint,
        /* Cancels an active timer which is initialized again. */
        .fixup_init        = hrtimer_fixup_init,
        /* Warns on activation of an active timer; performs no fixup. */
        .fixup_activate        = hrtimer_fixup_activate,
        /* Cancels an active timer which is freed. */
        .fixup_free        = hrtimer_fixup_free,
};

/* Pass @timer to debug_object_init() with the hrtimer descriptor. */
static inline void debug_hrtimer_init(struct hrtimer *timer)
{
        debug_object_init(timer, &hrtimer_debug_descr);
}

/* Pass @timer to debug_object_init_on_stack() with the hrtimer descriptor. */
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer)
{
        debug_object_init_on_stack(timer, &hrtimer_debug_descr);
}

/*
 * Pass @timer to debug_object_activate() with the hrtimer descriptor.
 * @mode is not used by this variant.
 */
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode)
{
        debug_object_activate(timer, &hrtimer_debug_descr);
}

/* Pass @timer to debug_object_deactivate() with the hrtimer descriptor. */
static inline void debug_hrtimer_deactivate(struct hrtimer *timer)
{
        debug_object_deactivate(timer, &hrtimer_debug_descr);
}

/*
 * Pass @timer to debug_object_free() with the hrtimer descriptor.
 * Only built with CONFIG_DEBUG_OBJECTS_TIMERS.
 */
void destroy_hrtimer_on_stack(struct hrtimer *timer)
{
        debug_object_free(timer, &hrtimer_debug_descr);
}
EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);

#else

/* !CONFIG_DEBUG_OBJECTS_TIMERS: the debug object hooks are empty. */
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
static inline void debug_hrtimer_init_on_stack(struct hrtimer *timer) { }
static inline void debug_hrtimer_activate(struct hrtimer *timer,
                                          enum hrtimer_mode mode) { }
static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { }
#endif

/* Timer setup hook: debug object init followed by the hrtimer_setup tracepoint. */
static inline void debug_setup(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode)
{
        debug_hrtimer_init(timer);
        trace_hrtimer_setup(timer, clockid, mode);
}

/* Same as debug_setup(), but uses the on-stack debug object init. */
static inline void debug_setup_on_stack(struct hrtimer *timer, clockid_t clockid,
                                        enum hrtimer_mode mode)
{
        debug_hrtimer_init_on_stack(timer);
        trace_hrtimer_setup(timer, clockid, mode);
}

/* Timer start hook: debug object activation followed by the hrtimer_start tracepoint. */
static inline void debug_activate(struct hrtimer *timer,
                                  enum hrtimer_mode mode)
{
        debug_hrtimer_activate(timer, mode);
        trace_hrtimer_start(timer, mode);
}

/* Timer removal hook: debug object deactivation followed by the hrtimer_cancel tracepoint. */
static inline void debug_deactivate(struct hrtimer *timer)
{
        debug_hrtimer_deactivate(timer);
        trace_hrtimer_cancel(timer);
}

/*
 * Return the clock base for the lowest set bit in *@active and clear
 * that bit, or NULL when no bit is left.
 */
static struct hrtimer_clock_base *
__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active)
{
        unsigned int pending = *active;
        unsigned int idx;

        if (pending == 0)
                return NULL;

        idx = __ffs(pending);
        *active = pending & ~(1U << idx);

        return &cpu_base->clock_base[idx];
}

/*
 * Iterate @base over the clock bases of @cpu_base whose bits are set in
 * @active. @active is consumed: each step clears the handled bit.
 */
#define for_each_active_base(base, cpu_base, active)        \
        while ((base = __next_base((cpu_base), &(active))))

/*
 * Find the earliest expiry among the clock bases selected by @active.
 *
 * @cpu_base:     per CPU base to scan
 * @exclude:      timer to skip, or NULL. When set, cpu_base->next_timer
 *                and cpu_base->softirq_next_timer are left untouched.
 * @active:       bitmask of clock bases to look at
 * @expires_next: starting minimum; only earlier expiries replace it
 *
 * Returns the minimum with the base offsets subtracted, clamped to 0.
 */
static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
                                         const struct hrtimer *exclude,
                                         unsigned int active,
                                         ktime_t expires_next)
{
        struct hrtimer_clock_base *base;
        ktime_t expires;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;
                struct hrtimer *timer;

                /* First queued timer of this base. */
                next = timerqueue_getnext(&base->active);
                timer = container_of(next, struct hrtimer, node);
                if (timer == exclude) {
                        /* Get to the next timer in the queue. */
                        next = timerqueue_iterate_next(next);
                        if (!next)
                                continue;

                        timer = container_of(next, struct hrtimer, node);
                }
                expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
                if (expires < expires_next) {
                        expires_next = expires;

                        /* Skip cpu_base update if a timer is being excluded. */
                        if (exclude)
                                continue;

                        /* Remember which timer produced the new minimum. */
                        if (timer->is_soft)
                                cpu_base->softirq_next_timer = timer;
                        else
                                cpu_base->next_timer = timer;
                }
        }
        /*
         * clock_was_set() might have changed base->offset of any of
         * the clock bases so the result might be negative. Fix it up
         * to prevent a false positive in clockevents_program_event().
         */
        if (expires_next < 0)
                expires_next = 0;
        return expires_next;
}

/*
 * Recomputes cpu_base::*next_timer and returns the earliest expires_next
 * but does not set cpu_base::*expires_next, that is done by
 * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating
 * cpu_base::*expires_next right away, reprogramming logic would no longer
 * work.
 *
 * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases,
 * those timers will get run whenever the softirq gets handled, at the end of
 * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases.
 *
 * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases.
 * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual
 * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD.
 *
 * @active_mask must be one of:
 *  - HRTIMER_ACTIVE_ALL,
 *  - HRTIMER_ACTIVE_SOFT, or
 *  - HRTIMER_ACTIVE_HARD.
 */
static ktime_t
__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask)
{
        unsigned int active;
        struct hrtimer *next_timer = NULL;
        ktime_t expires_next = KTIME_MAX;

        /* Soft bases are scanned only while the softirq is not activated. */
        if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                /* Reset, so the scan below recomputes softirq_next_timer. */
                cpu_base->softirq_next_timer = NULL;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL,
                                                         active, KTIME_MAX);

                next_timer = cpu_base->softirq_next_timer;
        }

        if (active_mask & HRTIMER_ACTIVE_HARD) {
                active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
                /*
                 * Seed next_timer with the soft result (or NULL). The hard
                 * scan overwrites it only if a hard timer expires earlier
                 * than the minimum found so far.
                 */
                cpu_base->next_timer = next_timer;
                expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
                                                         expires_next);
        }

        return expires_next;
}

/*
 * Recompute the next expiry of @cpu_base over the hard bases and, unless
 * the softirq is already activated, the soft bases. Updates
 * cpu_base->softirq_expires_next and the next_timer pointers and returns
 * the earlier of the two expiry times.
 */
static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base)
{
        ktime_t soft_next = KTIME_MAX;
        ktime_t hard_next;

        /*
         * An already activated soft interrupt handles the soft bases
         * itself, so they are ignored here. Otherwise refresh the soft
         * expiry time; clock_settime() might have affected it.
         */
        if (!cpu_base->softirq_activated) {
                soft_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);
                cpu_base->softirq_expires_next = soft_next;
        }

        hard_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD);
        if (hard_next <= soft_next)
                return hard_next;

        /*
         * A softirq timer expires first: make it cpu_base->next_timer
         * and program the hardware with the soft expiry time.
         */
        cpu_base->next_timer = cpu_base->softirq_next_timer;

        return soft_next;
}

/*
 * Refresh the REALTIME, BOOTTIME and TAI offsets of @base from the
 * timekeeping code, mirror them into the matching soft clock bases and
 * return the time reported by ktime_get_update_offsets_now().
 */
static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
        struct hrtimer_clock_base *cb = base->clock_base;
        ktime_t now;

        now = ktime_get_update_offsets_now(&base->clock_was_set_seq,
                                           &cb[HRTIMER_BASE_REALTIME].offset,
                                           &cb[HRTIMER_BASE_BOOTTIME].offset,
                                           &cb[HRTIMER_BASE_TAI].offset);

        /* The soft bases use the same offsets as their hard counterparts. */
        cb[HRTIMER_BASE_REALTIME_SOFT].offset = cb[HRTIMER_BASE_REALTIME].offset;
        cb[HRTIMER_BASE_BOOTTIME_SOFT].offset = cb[HRTIMER_BASE_BOOTTIME].offset;
        cb[HRTIMER_BASE_TAI_SOFT].offset = cb[HRTIMER_BASE_TAI].offset;

        return now;
}

/*
 * Is the high resolution mode active ?
 */
static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
{
        /* Without CONFIG_HIGH_RES_TIMERS the answer is a constant 0. */
        if (!IS_ENABLED(CONFIG_HIGH_RES_TIMERS))
                return 0;

        return cpu_base->hres_active;
}

/*
 * Record @expires_next in @cpu_base and, when permitted, program the
 * tick device with it. @next_timer is not used by the body.
 */
static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base,
                                struct hrtimer *next_timer,
                                ktime_t expires_next)
{
        cpu_base->expires_next = expires_next;

        /*
         * The hardware is only touched when high resolution mode is
         * active and no hang was detected in the last timer interrupt.
         *
         * Without hres there is nothing to reprogram yet.
         *
         * After a detected hang the hang delay stays armed in the
         * hardware so the system makes progress. It also avoids this
         * scenario:
         * T1 expires 50ms from now
         * T2 expires 5s from now
         *
         * T1 is removed, so this code is called and would reprogram
         * the hardware to 5s from now. Any hrtimer_start after that
         * will not reprogram the hardware due to hang_detected being
         * set. So we'd effectively block all timers until the T2 event
         * fires.
         */
        if (hrtimer_hres_active(cpu_base) && !cpu_base->hang_detected)
                tick_program_event(expires_next, 1);
}

/*
 * Reprogram the event source with checking both queues for the
 * next event
 * Called with interrupts disabled and base->lock held
 */
static void
hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
{
        const ktime_t expires_next = hrtimer_update_next_event(cpu_base);

        /*
         * With @skip_equal set, an unchanged expiry time needs no
         * reprogramming.
         */
        if (!skip_equal || expires_next != cpu_base->expires_next)
                __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next);
}

/* High resolution timer related functions */
#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer enabled ?
 */
/* Defaults to true; overridden by the "highres=" boot parameter. */
static bool hrtimer_hres_enabled __read_mostly  = true;
/* Starts at LOW_RES_NSEC; hrtimer_switch_to_hres() sets HIGH_RES_NSEC. */
unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
EXPORT_SYMBOL_GPL(hrtimer_resolution);

/*
 * Enable / Disable high resolution mode
 */
/*
 * "highres=" boot parameter handler: parse @str as a boolean into
 * hrtimer_hres_enabled. Returns 1 when parsing succeeded, 0 otherwise.
 */
static int __init setup_hrtimer_hres(char *str)
{
        int err = kstrtobool(str, &hrtimer_hres_enabled);

        return !err;
}

__setup("highres=", setup_hrtimer_hres);

/*
 * hrtimer_is_hres_enabled - query, if the highres mode is enabled
 *
 * Returns the value of hrtimer_hres_enabled, which reflects the
 * "highres=" boot parameter.
 */
static inline int hrtimer_is_hres_enabled(void)
{
        return hrtimer_hres_enabled;
}

/*
 * Switch to high resolution mode
 */
static void hrtimer_switch_to_hres(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        int err = tick_init_highres();

        /* The tick layer could not switch; leave this CPU as it is. */
        if (err) {
                pr_warn("Could not switch to high resolution mode on CPU %u\n",
                        cpu_base->cpu);
                return;
        }

        cpu_base->hres_active = 1;
        hrtimer_resolution = HIGH_RES_NSEC;

        tick_setup_sched_timer(true);

        /* "Retrigger" the interrupt to get things going */
        retrigger_next_event(NULL);
}

#else

/* !CONFIG_HIGH_RES_TIMERS: high resolution mode is never enabled. */
static inline int hrtimer_is_hres_enabled(void) { return 0; }
static inline void hrtimer_switch_to_hres(void) { }

#endif /* CONFIG_HIGH_RES_TIMERS */
/*
 * Retrigger next event is called after clock was set with interrupts
 * disabled through an SMP function call or directly from low level
 * resume code.
 *
 * This is only invoked when:
 *        - CONFIG_HIGH_RES_TIMERS is enabled.
 *        - CONFIG_NO_HZ_COMMON is enabled
 *
 * For the other cases this function is empty and because the call sites
 * are optimized out it vanishes as well, i.e. no need for lots of
 * #ifdeffery.
 *
 * @arg is not used.
 */
static void retrigger_next_event(void *arg)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        /*
         * When high resolution mode or nohz is active, then the offsets of
         * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the
         * next tick will take care of that.
         *
         * If high resolution mode is active then the next expiring timer
         * must be reevaluated and the clock event device reprogrammed if
         * necessary.
         *
         * In the NOHZ case the update of the offset and the reevaluation
         * of the next expiring timer is enough. The return from the SMP
         * function call will take care of the reprogramming in case the
         * CPU was in a NOHZ idle sleep.
         */
        if (!hrtimer_hres_active(base) && !tick_nohz_active)
                return;

        raw_spin_lock(&base->lock);
        /* Refresh the clock offsets first; the reevaluation depends on them. */
        hrtimer_update_base(base);
        if (hrtimer_hres_active(base))
                hrtimer_force_reprogram(base, 0);
        else
                hrtimer_update_next_event(base);
        raw_spin_unlock(&base->lock);
}

/*
 * When a timer is enqueued and expires earlier than the already enqueued
 * timers, we have to check, whether it expires earlier than the timer for
 * which the clock event device was armed.
 *
 * Called with interrupts disabled and base->cpu_base.lock held
 */
static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *base = timer->base;
        /* Expiry with the clock base offset subtracted. */
        ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);

        WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);

        /*
         * CLOCK_REALTIME timer might be requested with an absolute
         * expiry time which is less than base->offset. Set it to 0.
         */
        if (expires < 0)
                expires = 0;

        if (timer->is_soft) {
                /*
                 * soft hrtimer could be started on a remote CPU. In this
                 * case softirq_expires_next needs to be updated on the
                 * remote CPU. The soft hrtimer will not expire before the
                 * first hard hrtimer on the remote CPU -
                 * hrtimer_suitable_target() prevents this case.
                 */
                struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base;

                /* The activated softirq handles the soft bases itself. */
                if (timer_cpu_base->softirq_activated)
                        return;

                /* Not earlier than the recorded soft expiry: nothing changes. */
                if (!ktime_before(expires, timer_cpu_base->softirq_expires_next))
                        return;

                timer_cpu_base->softirq_next_timer = timer;
                timer_cpu_base->softirq_expires_next = expires;

                /*
                 * Only continue to the hardware reprogramming below when
                 * the soft timer is earlier than the programmed event and
                 * the caller asked for reprogramming.
                 */
                if (!ktime_before(expires, timer_cpu_base->expires_next) ||
                    !reprogram)
                        return;
        }

        /*
         * If the timer is not on the current cpu, we cannot reprogram
         * the other cpus clock event device.
         */
        if (base->cpu_base != cpu_base)
                return;

        /* The device is already armed for this or an earlier event. */
        if (expires >= cpu_base->expires_next)
                return;

        /*
         * If the hrtimer interrupt is running, then it will reevaluate the
         * clock bases and reprogram the clock event device.
         */
        if (cpu_base->in_hrtirq)
                return;

        cpu_base->next_timer = timer;

        __hrtimer_reprogram(cpu_base, timer, expires);
}

/*
 * Update the clock offsets of @cpu_base and decide whether its CPU has to
 * be interrupted to reevaluate and reprogram its next event.
 *
 * @cpu_base: per CPU base to check; the caller holds cpu_base->lock
 * @active:   bitmask of clock bases affected by the clock change
 *
 * Returns true when an affected timer now expires before the event (or,
 * for soft bases, the soft expiry) recorded for that CPU.
 */
static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base,
                             unsigned int active)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;
        ktime_t expires;

        /*
         * Update the base offsets unconditionally so the following
         * checks whether the SMP function call is required works.
         *
         * The update is safe even when the remote CPU is in the hrtimer
         * interrupt or the hrtimer soft interrupt and expiring affected
         * bases. Either it will see the update before handling a base or
         * it will see it when it finishes the processing and reevaluates
         * the next expiring timer.
         */
        seq = cpu_base->clock_was_set_seq;
        hrtimer_update_base(cpu_base);

        /*
         * If the sequence did not change over the update then the
         * remote CPU already handled it.
         */
        if (seq == cpu_base->clock_was_set_seq)
                return false;

        /*
         * If the remote CPU is currently handling an hrtimer interrupt, it
         * will reevaluate the first expiring timer of all clock bases
         * before reprogramming. Nothing to do here.
         */
        if (cpu_base->in_hrtirq)
                return false;

        /*
         * Walk the affected clock bases and check whether the first expiring
         * timer in a clock base is moving ahead of the first expiring timer of
         * @cpu_base. If so, the IPI must be invoked because per CPU clock
         * event devices cannot be remotely reprogrammed.
         */
        active &= cpu_base->active_bases;

        for_each_active_base(base, cpu_base, active) {
                struct timerqueue_node *next;

                next = timerqueue_getnext(&base->active);
                expires = ktime_sub(next->expires, base->offset);
                if (expires < cpu_base->expires_next)
                        return true;

                /*
                 * Extra check for softirq clock bases. This has to test
                 * the base index: clockid holds a CLOCK_* id, which is
                 * the same for the hard and the soft variant of a base.
                 */
                if (base->index < HRTIMER_BASE_MONOTONIC_SOFT)
                        continue;
                if (cpu_base->softirq_activated)
                        continue;
                if (expires < cpu_base->softirq_expires_next)
                        return true;
        }
        return false;
}

/*
 * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and
 * CLOCK_BOOTTIME (for late sleep time injection).
 *
 * This requires to update the offsets for these clocks
 * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this
 * also requires to eventually reprogram the per CPU clock event devices
 * when the change moves an affected timer ahead of the first expiring
 * timer on that CPU. Obviously remote per CPU clock event devices cannot
 * be reprogrammed. The other reason why an IPI has to be sent is when the
 * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets
 * in the tick, which obviously might be stopped, so this has to bring out
 * the remote CPU which might sleep in idle to get this sorted.
 */
void clock_was_set(unsigned int bases)
{
        struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases);
        cpumask_var_t mask;
        int cpu;

        /* Neither hres nor NOHZ active: only timerfd has to be notified. */
        if (!hrtimer_hres_active(cpu_base) && !tick_nohz_active)
                goto out_timerfd;

        /* No mask available: fall back to retriggering on every CPU. */
        if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
                on_each_cpu(retrigger_next_event, NULL, 1);
                goto out_timerfd;
        }

        /* Avoid interrupting CPUs if possible */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                unsigned long flags;

                cpu_base = &per_cpu(hrtimer_bases, cpu);
                raw_spin_lock_irqsave(&cpu_base->lock, flags);

                /* Updates the offsets of that CPU and checks if it needs the IPI. */
                if (update_needs_ipi(cpu_base, bases))
                        cpumask_set_cpu(cpu, mask);

                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        }

        /* Run retrigger_next_event() on the collected CPUs and wait for it. */
        preempt_disable();
        smp_call_function_many(mask, retrigger_next_event, NULL, 1);
        preempt_enable();
        cpus_read_unlock();
        free_cpumask_var(mask);

out_timerfd:
        timerfd_clock_was_set();
}

/* Work callback: run clock_was_set() for the CLOCK_SET_WALL bases. */
static void clock_was_set_work(struct work_struct *work)
{
        clock_was_set(CLOCK_SET_WALL);
}

/* Work item scheduled by clock_was_set_delayed(). */
static DECLARE_WORK(hrtimer_work, clock_was_set_work);

/*
 * Called from timekeeping code to reprogram the hrtimer interrupt device
 * on all cpus and to notify timerfd.
 */
void clock_was_set_delayed(void)
{
        /* Defer the clock_was_set() processing to hrtimer_work. */
        schedule_work(&hrtimer_work);
}

/*
 * Called during resume either directly via timekeeping_resume()
 * or in the case of s2idle from tick_unfreeze() to ensure that the
 * hrtimers are up to date.
 *
 * Must be called with interrupts disabled.
 */
void hrtimers_resume_local(void)
{
        lockdep_assert_irqs_disabled();
        /* Retrigger on the local CPU */
        retrigger_next_event(NULL);
}

/*
 * Counterpart to lock_hrtimer_base above:
 */
static inline
void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
        __releases(&timer->base->cpu_base->lock)
{
        /* Drop the cpu_base lock and restore the interrupt state from *flags. */
        raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
}

/**
 * hrtimer_forward() - forward the timer expiry
 * @timer:        hrtimer to forward
 * @now:        forward past this time
 * @interval:        the interval to forward
 *
 * Forward the timer expiry so it will expire in the future.
 *
 * .. note::
 *  This only updates the timer expiry value and does not requeue the timer.
 *
 * There is also a variant of the function hrtimer_forward_now().
 *
 * Context: Can be safely called from the callback function of @timer. If called
 *          from other contexts @timer must neither be enqueued nor running the
 *          callback and the caller needs to take care of serialization.
 *
 * Return: The number of overruns are returned.
 */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
{
        u64 orun = 1;
        ktime_t delta;

        /* How far @now is past the current expiry. */
        delta = ktime_sub(now, hrtimer_get_expires(timer));

        /* The timer has not expired yet: nothing to forward. */
        if (delta < 0)
                return 0;

        /* An enqueued timer is not modified; warn and bail out. */
        if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
                return 0;

        /* Never forward by less than the timer resolution. */
        if (interval < hrtimer_resolution)
                interval = hrtimer_resolution;

        if (unlikely(delta >= interval)) {
                s64 incr = ktime_to_ns(interval);

                /* Skip all whole intervals contained in delta at once. */
                orun = ktime_divns(delta, incr);
                hrtimer_add_expires_ns(timer, incr * orun);
                if (hrtimer_get_expires_tv64(timer) > now)
                        return orun;
                /*
                 * This (and the hrtimer_add_expires() below) is the
                 * correction for exact:
                 */
                orun++;
        }
        hrtimer_add_expires(timer, interval);

        return orun;
}
EXPORT_SYMBOL_GPL(hrtimer_forward);

/*
 * enqueue_hrtimer - internal function to (re)start a timer
 *
 * The timer is inserted in expiry order. Insertion into the
 * red black tree is O(log(n)). Must hold the base lock.
 *
 * Returns true when the new timer is the leftmost timer in the tree.
 */
static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
                            enum hrtimer_mode mode)
{
        debug_activate(timer, mode);
        WARN_ON_ONCE(!base->cpu_base->online);

        base->cpu_base->active_bases |= 1 << base->index;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED);

        return timerqueue_add(&base->active, &timer->node);
}

/*
 * __remove_hrtimer - internal function to remove a timer
 *
 * Caller must hold the base lock.
 *
 * The timer state is always set to @newstate. If the timer was enqueued it
 * is taken out of the timer queue of @base, and the base is marked inactive
 * when the queue became empty.
 *
 * High resolution timer mode reprograms the clock event device when the
 * timer is the one which expires next. The caller can disable this by setting
 * @reprogram to zero. This is useful when the context does a reprogramming
 * anyway (e.g. timer interrupt).
 */
static void __remove_hrtimer(struct hrtimer *timer,
                             struct hrtimer_clock_base *base,
                             u8 newstate, int reprogram)
{
        struct hrtimer_cpu_base *cpu_base = base->cpu_base;
        const bool was_queued = timer->state & HRTIMER_STATE_ENQUEUED;

        /* Pairs with the lockless read in hrtimer_is_queued() */
        WRITE_ONCE(timer->state, newstate);
        if (!was_queued)
                return;

        /* Last timer of this clock base is gone: clear its active bit. */
        if (!timerqueue_del(&base->active, &timer->node))
                cpu_base->active_bases &= ~(1 << base->index);

        /*
         * Note: Without @reprogram cpu_base->next_timer is left alone.
         * That happens when the first timer on a remote CPU is removed.
         * It is harmless because cpu_base->next_timer is never
         * dereferenced; at worst the remote CPU does a superfluous
         * hrtimer_force_reprogram() later on if the same timer gets
         * enqueued again.
         */
        if (!reprogram || timer != cpu_base->next_timer)
                return;

        hrtimer_force_reprogram(cpu_base, 1);
}

/*
 * remove_hrtimer - dequeue a timer if it is enqueued
 *
 * Called with the base lock held.
 *
 * @restart:        the timer is about to be enqueued again; its state is kept
 * @keep_local:        only evaluated with @restart: suppress the reprogramming
 *                on removal because the caller reprograms after the enqueue
 *
 * Returns 1 when the timer was enqueued and has been removed, 0 otherwise.
 */
static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
               bool restart, bool keep_local)
{
        u8 newstate = timer->state;
        bool reprogram;

        if (!(newstate & HRTIMER_STATE_ENQUEUED))
                return 0;

        /*
         * Reprogramming is only done when the timer sits on the current
         * CPU. For a timer on another CPU it is skipped: the event on that
         * CPU fires and the reprogramming happens in its interrupt
         * handler. That is rare and cheaper than a smp call.
         */
        debug_deactivate(timer);
        reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);

        /*
         * A local timer which is restarted right away does not need to be
         * programmed twice (on removal and a moment later on requeue). A
         * timer which is not restarted becomes inactive and a local one
         * requires the reprogramming.
         */
        if (restart)
                reprogram &= !keep_local;
        else
                newstate = HRTIMER_STATE_INACTIVE;

        __remove_hrtimer(timer, base, newstate, reprogram);
        return 1;
}

/*
 * Adjust the expiry time of a timer for CONFIG_TIME_LOW_RES systems.
 *
 * Without CONFIG_TIME_LOW_RES this returns @tim unchanged and does not
 * touch @timer.
 */
static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
                                            const enum hrtimer_mode mode)
{
#ifdef CONFIG_TIME_LOW_RES
        /*
         * CONFIG_TIME_LOW_RES indicates that the system has no way to return
         * granular time values. Remember whether the timer is relative and,
         * if so, add hrtimer_resolution (i.e. one jiffy) to prevent short
         * timeouts.
         */
        timer->is_rel = mode & HRTIMER_MODE_REL;
        if (!timer->is_rel)
                return tim;

        return ktime_add_safe(tim, hrtimer_resolution);
#else
        return tim;
#endif
}

/*
 * Re-evaluate the next SOFT expiry of @cpu_base and hand the first expiring
 * soft timer, if there is one, to hrtimer_reprogram() along with @reprogram.
 */
static void
hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram)
{
        /* Find the next SOFT expiration. */
        ktime_t next_soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT);

        /*
         * Unless no soft timer is pending, reprogramming needs to be
         * triggered even if the next soft hrtimer expires at the same time
         * as the next hard hrtimer: cpu_base->softirq_expires_next needs to
         * be updated!
         *
         * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event()
         * cpu_base->*expires_next is only set by hrtimer_reprogram()
         */
        if (next_soft != KTIME_MAX)
                hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram);
}

/*
 * __hrtimer_start_range_ns - (re)start a timer with the base lock held
 * @timer:        the timer to be (re)started
 * @tim:        expiry time, relative when HRTIMER_MODE_REL is set in @mode
 * @delta_ns:        "slack" range handed to hrtimer_set_expires_range_ns()
 * @mode:        timer mode
 * @base:        the clock base @timer is currently attached to
 *
 * Dequeues @timer if it is queued, computes the new expiry, optionally
 * switches the clock base and enqueues the timer again.
 *
 * Returns the result of enqueue_hrtimer() (nonzero if @timer became the
 * first expiring timer of its base) when the timer was not forced to stay
 * local and the current CPU base is online; the caller then has to invoke
 * hrtimer_reprogram(). Returns 0 in all other cases, where the required
 * reprogramming or remote kick has already been done here.
 */
static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                                    u64 delta_ns, const enum hrtimer_mode mode,
                                    struct hrtimer_clock_base *base)
{
        struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
        struct hrtimer_clock_base *new_base;
        bool force_local, first;

        /*
         * If the timer is on the local cpu base and is the first expiring
         * timer then this might end up reprogramming the hardware twice
         * (on removal and on enqueue). To avoid that, prevent the
         * reprogram on removal, keep the timer local to the current CPU
         * and enforce reprogramming after it is queued no matter whether
         * it is the new first expiring timer again or not.
         */
        force_local = base->cpu_base == this_cpu_base;
        force_local &= base->cpu_base->next_timer == timer;

        /*
         * Don't force local queuing if this enqueue happens on a unplugged
         * CPU after hrtimer_cpu_dying() has been invoked.
         */
        force_local &= this_cpu_base->online;

        /*
         * Remove an active timer from the queue. In case it is not queued
         * on the current CPU, make sure that remove_hrtimer() updates the
         * remote data correctly.
         *
         * If it's on the current CPU and the first expiring timer, then
         * skip reprogramming, keep the timer local and enforce
         * reprogramming later if it was the first expiring timer.  This
         * avoids programming the underlying clock event twice (once at
         * removal and once after enqueue).
         */
        remove_hrtimer(timer, base, true, force_local);

        /* Convert a relative expiry into an absolute one on this clock. */
        if (mode & HRTIMER_MODE_REL)
                tim = ktime_add_safe(tim, base->get_time());

        tim = hrtimer_update_lowres(timer, tim, mode);

        hrtimer_set_expires_range_ns(timer, tim, delta_ns);

        /* Switch the timer base, if necessary: */
        if (!force_local) {
                new_base = switch_hrtimer_base(timer, base,
                                               mode & HRTIMER_MODE_PINNED);
        } else {
                new_base = base;
        }

        first = enqueue_hrtimer(timer, new_base, mode);
        if (!force_local) {
                /*
                 * If the current CPU base is online, then the timer is
                 * never queued on a remote CPU if it would be the first
                 * expiring timer there.
                 */
                if (hrtimer_base_is_online(this_cpu_base))
                        return first;

                /*
                 * Timer was enqueued remote because the current base is
                 * already offline. If the timer is the first to expire,
                 * kick the remote CPU to reprogram the clock event.
                 */
                if (first) {
                        struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;

                        smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
                }
                return 0;
        }

        /*
         * Timer was forced to stay on the current CPU to avoid
         * reprogramming on removal and enqueue. Force reprogram the
         * hardware by evaluating the new first expiring timer.
         */
        hrtimer_force_reprogram(new_base->cpu_base, 1);
        return 0;
}

/**
 * hrtimer_start_range_ns - (re)start an hrtimer
 * @timer:        the timer to be added
 * @tim:        expiry time
 * @delta_ns:        "slack" range for the timer
 * @mode:        timer mode: absolute (HRTIMER_MODE_ABS) or
 *                relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED);
 *                softirq based mode is considered for debug purpose only!
 *
 * Takes the base lock of @timer, (re)queues the timer and reprograms the
 * event device when __hrtimer_start_range_ns() asks for it.
 */
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                            u64 delta_ns, const enum hrtimer_mode mode)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;

        /*
         * Sanity check the expiry context requested in @mode against the
         * one the timer was set up with. With PREEMPT_RT check the hard
         * expiry mode because unmarked timers are moved to softirq expiry.
         * On CONFIG_PREEMPT_RT = n the HRTIMER_MODE_SOFT bit and
         * hrtimer.is_soft have to match.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT))
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard);
        else
                WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft);

        base = lock_hrtimer_base(timer, &flags);

        if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base))
                hrtimer_reprogram(timer, true);

        unlock_hrtimer_base(timer, &flags);
}
EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);

/**
 * hrtimer_try_to_cancel - try to deactivate a timer
 * @timer:        hrtimer to stop
 *
 * Returns:
 *
 *  *  0 when the timer was not active
 *  *  1 when the timer was active
 *  * -1 when the timer is currently executing the callback function and
 *    cannot be stopped
 */
int hrtimer_try_to_cancel(struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned long flags;
        int ret;

        /*
         * Lockless check first. A timer which is neither enqueued nor
         * running its callback needs no work. The base lock does not
         * serialize against a concurrent enqueue, so taking it can be
         * avoided here.
         */
        if (!hrtimer_active(timer))
                return 0;

        base = lock_hrtimer_base(timer, &flags);

        /* A timer whose callback is running cannot be stopped. */
        if (hrtimer_callback_running(timer))
                ret = -1;
        else
                ret = remove_hrtimer(timer, base, false, false);

        unlock_hrtimer_base(timer, &flags);

        return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);

#ifdef CONFIG_PREEMPT_RT
/* PREEMPT_RT: initialize the softirq expiry lock of @base. */
static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base)
{
        spin_lock_init(&base->softirq_expiry_lock);
}

/*
 * PREEMPT_RT: acquire the softirq expiry lock of @base. Paired with
 * hrtimer_cpu_base_unlock_expiry().
 */
static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base)
        __acquires(&base->softirq_expiry_lock)
{
        spin_lock(&base->softirq_expiry_lock);
}

/*
 * PREEMPT_RT: release the softirq expiry lock of @base taken by
 * hrtimer_cpu_base_lock_expiry().
 */
static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base)
        __releases(&base->softirq_expiry_lock)
{
        spin_unlock(&base->softirq_expiry_lock);
}

/*
 * The counterpart to hrtimer_cancel_wait_running().
 *
 * If there is a waiter for cpu_base->softirq_expiry_lock, then it was
 * waiting for the timer callback to finish. Drop softirq_expiry_lock and
 * reacquire it. That allows the waiter to acquire the lock and make
 * progress.
 *
 * Expects cpu_base->lock and cpu_base->softirq_expiry_lock to be held by
 * the caller. @flags is passed to raw_spin_unlock_irqrestore() when
 * cpu_base->lock is dropped; the lock is taken again with
 * raw_spin_lock_irq() before returning.
 */
static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
                                      unsigned long flags)
{
        if (atomic_read(&cpu_base->timer_waiters)) {
                /* Release in reverse order of acquisition, then retake. */
                raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
                spin_unlock(&cpu_base->softirq_expiry_lock);
                spin_lock(&cpu_base->softirq_expiry_lock);
                raw_spin_lock_irq(&cpu_base->lock);
        }
}

/*
 * is_migration_base - check whether @base is the migration_base
 *
 * Without CONFIG_SMP this is always false.
 */
#ifdef CONFIG_SMP
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return base == &migration_base;
}
#else
static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
{
        return false;
}
#endif

/*
 * hrtimer_cancel_wait_running - wait for a running timer callback
 * @timer:        the timer whose callback is being waited for
 *
 * This function is called on PREEMPT_RT kernels when the fast path
 * deletion of a timer failed because the timer callback function was
 * running.
 *
 * This prevents priority inversion: if the soft irq thread is preempted
 * in the middle of a timer callback, then calling hrtimer_cancel() can
 * lead to two issues:
 *
 *  - If the caller is on a remote CPU then it has to spin wait for the timer
 *    handler to complete. This can result in unbound priority inversion.
 *
 *  - If the caller originates from the task which preempted the timer
 *    handler on the same CPU, then spin waiting for the timer handler to
 *    complete is never going to end.
 */
void hrtimer_cancel_wait_running(const struct hrtimer *timer)
{
        /* Lockless read. Prevent the compiler from reloading it below */
        struct hrtimer_clock_base *base = READ_ONCE(timer->base);

        /*
         * Just relax if the timer expires in hard interrupt context or if
         * it is currently on the migration base.
         */
        if (!timer->is_soft || is_migration_base(base)) {
                cpu_relax();
                return;
        }

        /*
         * Mark the base as contended and grab the expiry lock, which is
         * held by the softirq across the timer callback. Drop the lock
         * immediately so the softirq can expire the next timer. In theory
         * the timer could already be running again, but that's more than
         * unlikely and just causes another wait loop.
         *
         * The timer_waiters count is what hrtimer_sync_wait_running()
         * checks to decide whether it has to drop the expiry lock.
         */
        atomic_inc(&base->cpu_base->timer_waiters);
        spin_lock_bh(&base->cpu_base->softirq_expiry_lock);
        atomic_dec(&base->cpu_base->timer_waiters);
        spin_unlock_bh(&base->cpu_base->softirq_expiry_lock);
}
#else
/*
 * !CONFIG_PREEMPT_RT: the expiry lock helpers and
 * hrtimer_sync_wait_running() are empty stubs.
 */
static inline void
hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { }
static inline void
hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { }
static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base,
                                             unsigned long flags) { }
#endif

/**
 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
 * @timer:        the timer to be cancelled
 *
 * Retries hrtimer_try_to_cancel() for as long as it reports that the
 * callback of @timer is running, waiting via hrtimer_cancel_wait_running()
 * between the attempts.
 *
 * Returns:
 *  0 when the timer was not active
 *  1 when the timer was active
 */
int hrtimer_cancel(struct hrtimer *timer)
{
        int ret;

        /* A negative result means the callback is currently executing. */
        while ((ret = hrtimer_try_to_cancel(timer)) < 0)
                hrtimer_cancel_wait_running(timer);

        return ret;
}
EXPORT_SYMBOL_GPL(hrtimer_cancel);

/**
 * __hrtimer_get_remaining - get remaining time for the timer
 * @timer:        the timer to read
 * @adjust:        adjust relative timers when CONFIG_TIME_LOW_RES=y
 *
 * The remaining time is read with the base lock of @timer held.
 *
 * Return: The remaining time until @timer expires.
 */
ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
        /* The adjusted variant is only used on CONFIG_TIME_LOW_RES=y. */
        const bool lowres_adjust = IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust;
        unsigned long flags;
        ktime_t remaining;

        lock_hrtimer_base(timer, &flags);
        remaining = lowres_adjust ? hrtimer_expires_remaining_adjusted(timer)
                                  : hrtimer_expires_remaining(timer);
        unlock_hrtimer_base(timer, &flags);

        return remaining;
}
EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);

#ifdef CONFIG_NO_HZ_COMMON
/**
 * hrtimer_get_next_event - get the next expiry time of the current CPU
 *
 * Returns the next expiry time or KTIME_MAX if no timer is pending.
 * KTIME_MAX is also returned when high resolution mode is active on this
 * CPU, as the timer queues are not evaluated in that case.
 */
u64 hrtimer_get_next_event(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        u64 next;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        next = hrtimer_hres_active(cpu_base) ? KTIME_MAX :
                __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL);

        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return next;
}

/**
 * hrtimer_next_event_without - time until next expiry event w/o one timer
 * @exclude:        timer to exclude
 *
 * Returns the next expiry time over all timers except for the @exclude one or
 * KTIME_MAX if none of them is pending. KTIME_MAX is also returned when
 * high resolution mode is not active on this CPU.
 */
u64 hrtimer_next_event_without(const struct hrtimer *exclude)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned int bases;
        unsigned long flags;
        u64 expires = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        if (!hrtimer_hres_active(cpu_base))
                goto unlock;

        /* The soft bases are only evaluated while the softirq is not activated. */
        if (!cpu_base->softirq_activated) {
                bases = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
                expires = __hrtimer_next_event_base(cpu_base, exclude,
                                                    bases, KTIME_MAX);
        }

        /* Fold in the hard bases, starting from the soft result (if any). */
        bases = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
        expires = __hrtimer_next_event_base(cpu_base, exclude, bases,
                                            expires);
unlock:
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        return expires;
}
#endif

/*
 * Map a clock id to the index of the matching hrtimer clock base.
 *
 * An unsupported @clock_id triggers a warning and is treated like
 * CLOCK_MONOTONIC.
 */
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
{
        if (clock_id == CLOCK_REALTIME)
                return HRTIMER_BASE_REALTIME;
        if (clock_id == CLOCK_MONOTONIC)
                return HRTIMER_BASE_MONOTONIC;
        if (clock_id == CLOCK_BOOTTIME)
                return HRTIMER_BASE_BOOTTIME;
        if (clock_id == CLOCK_TAI)
                return HRTIMER_BASE_TAI;

        WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
        return HRTIMER_BASE_MONOTONIC;
}

/*
 * Common initialization for hrtimer_setup() and hrtimer_setup_on_stack().
 *
 * Zeroes @timer, selects the clock base on the current CPU according to
 * @clock_id and the expiry context requested in @mode, and installs
 * @function as the callback. A NULL @function triggers a warning and is
 * replaced by hrtimer_dummy_timeout.
 */
static void __hrtimer_setup(struct hrtimer *timer,
                            enum hrtimer_restart (*function)(struct hrtimer *),
                            clockid_t clock_id, enum hrtimer_mode mode)
{
        struct hrtimer_cpu_base *cpu_base;
        bool soft_expiry;
        int idx;

        /*
         * On PREEMPT_RT enabled kernels hrtimers which are not explicitly
         * marked for hard interrupt expiry mode are moved into soft
         * interrupt context for latency reasons and because the callbacks
         * can invoke functions which might sleep on RT, e.g. spin_lock().
         */
        soft_expiry = (mode & HRTIMER_MODE_SOFT) ||
                      (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD));

        memset(timer, 0, sizeof(*timer));

        cpu_base = raw_cpu_ptr(&hrtimer_bases);

        /*
         * POSIX magic: Relative CLOCK_REALTIME timers are not affected by
         * clock modifications, so they need to become CLOCK_MONOTONIC to
         * ensure POSIX compliance.
         */
        if (clock_id == CLOCK_REALTIME && (mode & HRTIMER_MODE_REL))
                clock_id = CLOCK_MONOTONIC;

        /* The soft bases follow the hard bases in clock_base[]. */
        idx = hrtimer_clockid_to_base(clock_id);
        if (soft_expiry)
                idx += HRTIMER_MAX_CLOCK_BASES / 2;

        timer->is_soft = soft_expiry;
        timer->is_hard = !!(mode & HRTIMER_MODE_HARD);
        timer->base = &cpu_base->clock_base[idx];
        timerqueue_init(&timer->node);

        if (WARN_ON_ONCE(!function))
                function = hrtimer_dummy_timeout;
        ACCESS_PRIVATE(timer, function) = function;
}

/**
 * hrtimer_setup - initialize a timer to the given clock
 * @timer:        the timer to be initialized
 * @function:        the callback function
 * @clock_id:        the clock to be used
 * @mode:       The modes which are relevant for initialization:
 *              HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT,
 *              HRTIMER_MODE_REL_SOFT
 *
 *              The PINNED variants of the above can be handed in,
 *              but the PINNED bit is ignored as pinning happens
 *              when the hrtimer is started
 *
 * Invokes the debug setup hook and then performs the common initialization
 * in __hrtimer_setup().
 */
void hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*function)(struct hrtimer *),
                   clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_setup(timer, clock_id, mode);
        __hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup);

/**
 * hrtimer_setup_on_stack - initialize a timer on stack memory
 * @timer:        The timer to be initialized
 * @function:        the callback function
 * @clock_id:        The clock to be used
 * @mode:       The timer mode
 *
 * Similar to hrtimer_setup(), except that this one must be used if struct
 * hrtimer is in stack memory. The only difference visible here is that the
 * on-stack variant of the debug setup hook is invoked.
 */
void hrtimer_setup_on_stack(struct hrtimer *timer,
                            enum hrtimer_restart (*function)(struct hrtimer *),
                            clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_setup_on_stack(timer, clock_id, mode);
        __hrtimer_setup(timer, function, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_on_stack);

/*
 * hrtimer_active - lockless check whether a timer is active
 *
 * A timer is active, when it is enqueued into the rbtree or the
 * callback function is running or it's in the state of being migrated
 * to another cpu.
 *
 * It is important for this function to not return a false negative.
 *
 * Returns true when the timer is active, false otherwise.
 */
bool hrtimer_active(const struct hrtimer *timer)
{
        struct hrtimer_clock_base *base;
        unsigned int seq;

        do {
                base = READ_ONCE(timer->base);
                seq = raw_read_seqcount_begin(&base->seq);

                /* Enqueued (or otherwise not inactive), or callback running. */
                if (timer->state != HRTIMER_STATE_INACTIVE ||
                    base->running == timer)
                        return true;

                /*
                 * Retry if the sequence count moved or the timer changed
                 * its base while both values were sampled.
                 */
        } while (read_seqcount_retry(&base->seq, seq) ||
                 base != READ_ONCE(timer->base));

        return false;
}
EXPORT_SYMBOL_GPL(hrtimer_active);

/*
 * The raw_write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
 * distinct sections:
 *
 *  - queued:        the timer is queued
 *  - callback:        the timer is being run
 *  - post:        the timer is inactive or (re)queued
 *
 * On the read side we ensure we observe timer->state and base->running
 * from the same section; if anything changed while we looked at it, we retry.
 * This includes timer->base changing because sequence numbers alone are
 * insufficient for that.
 *
 * The sequence numbers are required because otherwise we could still observe
 * a false negative if the read side got smeared over multiple consecutive
 * __run_hrtimer() invocations.
 */

/*
 * __run_hrtimer - run the callback of an expired timer
 * @cpu_base:        the CPU base the timer is queued on
 * @base:        the clock base the timer is queued on
 * @timer:        the expired timer
 * @now:        current time of @base, handed to the expiry tracepoint
 * @flags:        interrupt flags used when cpu_base->lock is dropped
 *
 * Called with cpu_base->lock held. The timer is dequeued and marked as
 * running, the lock is dropped across the callback and taken again
 * afterwards. If the callback asks for a restart and the timer has not
 * been enqueued in the meantime, it is enqueued again on @base.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
                          struct hrtimer_clock_base *base,
                          struct hrtimer *timer, ktime_t *now,
                          unsigned long flags) __must_hold(&cpu_base->lock)
{
        enum hrtimer_restart (*fn)(struct hrtimer *);
        bool expires_in_hardirq;
        int restart;

        lockdep_assert_held(&cpu_base->lock);

        debug_deactivate(timer);
        base->running = timer;

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        /* No reprogramming here; see the note further down. */
        __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
        fn = ACCESS_PRIVATE(timer, function);

        /*
         * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
         * timer is restarted with a period then it becomes an absolute
         * timer. If its not restarted it does not matter.
         */
        if (IS_ENABLED(CONFIG_TIME_LOW_RES))
                timer->is_rel = false;

        /*
         * The timer is marked as running in the CPU base, so it is
         * protected against migration to a different CPU even if the lock
         * is dropped.
         */
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        trace_hrtimer_expire_entry(timer, now);
        expires_in_hardirq = lockdep_hrtimer_enter(timer);

        restart = fn(timer);

        lockdep_hrtimer_exit(expires_in_hardirq);
        trace_hrtimer_expire_exit(timer);
        raw_spin_lock_irq(&cpu_base->lock);

        /*
         * Note: We clear the running state after enqueue_hrtimer and
         * we do not reprogram the event hardware. Happens either in
         * hrtimer_start_range_ns() or in hrtimer_interrupt()
         *
         * Note: Because we dropped the cpu_base->lock above,
         * hrtimer_start_range_ns() can have popped in and enqueued the timer
         * for us already.
         */
        if (restart != HRTIMER_NORESTART &&
            !(timer->state & HRTIMER_STATE_ENQUEUED))
                enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);

        /*
         * Separate the ->running assignment from the ->state assignment.
         *
         * As with a regular write barrier, this ensures the read side in
         * hrtimer_active() cannot observe base->running == NULL &&
         * timer->state == INACTIVE.
         */
        raw_write_seqcount_barrier(&base->seq);

        WARN_ON_ONCE(base->running != timer);
        base->running = NULL;
}

/*
 * Expire all timers on the active clock bases of @cpu_base selected by
 * @active_mask whose soft expiry time is not after @now.
 *
 * @flags is passed on to __run_hrtimer() and hrtimer_sync_wait_running(),
 * which drop and retake cpu_base->lock.
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
                                 unsigned long flags, unsigned int active_mask)
{
        unsigned int pending = cpu_base->active_bases & active_mask;
        struct hrtimer_clock_base *base;

        for_each_active_base(base, cpu_base, pending) {
                /* @now translated into the time domain of this clock base. */
                ktime_t basenow = ktime_add(now, base->offset);

                for (;;) {
                        struct timerqueue_node *head = timerqueue_getnext(&base->active);
                        struct hrtimer *timer;

                        if (!head)
                                break;

                        timer = container_of(head, struct hrtimer, node);

                        /*
                         * The soft expiry is used to minimize wakeups,
                         * not to run timers at the earliest interrupt
                         * after their soft expiration. That way the
                         * simple BST which is already there is good
                         * enough and no Priority Search Tree, which
                         * could answer a stabbing query for overlapping
                         * intervals, is required.
                         * Delaying timers which are right-of a not yet
                         * expired timer does not add extra wakeups,
                         * because that timer has to trigger a wakeup
                         * anyway.
                         */
                        if (basenow < hrtimer_get_softexpires_tv64(timer))
                                break;

                        __run_hrtimer(cpu_base, base, timer, &basenow, flags);
                        if (active_mask == HRTIMER_ACTIVE_SOFT)
                                hrtimer_sync_wait_running(cpu_base, flags);
                }
        }
}

/*
 * Expire the soft hrtimers of the current CPU.
 *
 * NOTE(review): presumably installed as the HRTIMER_SOFTIRQ handler; the
 * registration is not visible here.
 */
static __latent_entropy void hrtimer_run_softirq(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        ktime_t now;

        /* The expiry lock is taken before, and released after, the base lock. */
        hrtimer_cpu_base_lock_expiry(cpu_base);
        raw_spin_lock_irqsave(&cpu_base->lock, flags);

        now = hrtimer_update_base(cpu_base);
        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT);

        /* Soft expiry is done; re-evaluate the next soft timer. */
        cpu_base->softirq_activated = 0;
        hrtimer_update_softirq_timer(cpu_base, true);

        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
        hrtimer_cpu_base_unlock_expiry(cpu_base);
}

#ifdef CONFIG_HIGH_RES_TIMERS

/*
 * High resolution timer interrupt
 * Called with interrupts disabled
 *
 * Expires the hard timers of the current CPU, raises the timer softirq if
 * soft timers are due and programs @dev for the next expiry. If the next
 * event has already expired by the time it is programmed, the expiry is
 * retried; after the third failed attempt a hang is recorded and the next
 * event is pushed out so the system can make progress.
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        ktime_t expires_next, now, entry_time, delta;
        unsigned long flags;
        int retries = 0;

        BUG_ON(!cpu_base->hres_active);
        cpu_base->nr_events++;
        dev->next_event = KTIME_MAX;

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        entry_time = now = hrtimer_update_base(cpu_base);
retry:
        cpu_base->in_hrtirq = 1;
        /*
         * We set expires_next to KTIME_MAX here with cpu_base->lock
         * held to prevent that a timer is enqueued in our queue via
         * the migration code. This does not affect enqueueing of
         * timers which run their callback and need to be requeued on
         * this CPU.
         */
        cpu_base->expires_next = KTIME_MAX;

        /* Soft timers are due: let the softirq expire them. */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_timer_softirq(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

        /* Reevaluate the clock bases for the [soft] next expiry */
        expires_next = hrtimer_update_next_event(cpu_base);
        /*
         * Store the new expiry value so the migration code can verify
         * against it.
         */
        cpu_base->expires_next = expires_next;
        cpu_base->in_hrtirq = 0;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        /* Reprogramming necessary ? */
        if (!tick_program_event(expires_next, 0)) {
                cpu_base->hang_detected = 0;
                return;
        }

        /*
         * The next timer was already expired due to:
         * - tracing
         * - long lasting callbacks
         * - being scheduled away when running in a VM
         *
         * We need to prevent that we loop forever in the hrtimer
         * interrupt routine. We give it 3 attempts to avoid
         * overreacting on some spurious event.
         *
         * Acquire base lock for updating the offsets and retrieving
         * the current time.
         */
        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);
        cpu_base->nr_retries++;
        if (++retries < 3)
                goto retry;
        /*
         * Give the system a chance to do something else than looping
         * here. We stored the entry time, so we know exactly how long
         * we spent here. We schedule the next event this amount of
         * time away.
         */
        cpu_base->nr_hangs++;
        cpu_base->hang_detected = 1;
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

        /* Track the longest time spent in here, truncated to unsigned int. */
        delta = ktime_sub(now, entry_time);
        if ((unsigned int)delta > cpu_base->max_hang_time)
                cpu_base->max_hang_time = (unsigned int) delta;
        /*
         * Limit the enforced delay to a sensible value: the next event
         * is programmed the time spent in here away, but at most 100ms.
         */
        if (delta > 100 * NSEC_PER_MSEC)
                expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
        else
                expires_next = ktime_add(now, delta);
        tick_program_event(expires_next, 1);
        pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
#endif /* CONFIG_HIGH_RES_TIMERS */

/*
 * Called from run_local_timers in hardirq context every jiffy
 *
 * Does nothing when high resolution mode is active on this CPU. Otherwise
 * it checks whether a switch to high resolution mode is possible and, if
 * no switch happened, expires the hard timers and raises the timer softirq
 * when soft timers are due.
 */
void hrtimer_run_queues(void)
{
        struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
        unsigned long flags;
        ktime_t now;

        if (hrtimer_hres_active(cpu_base))
                return;

        /*
         * This _is_ ugly: We have to check periodically, whether we
         * can switch to highres and / or nohz mode. The clocksource
         * switch happens with xtime_lock held. Notification from
         * there only sets the check bit in the tick_oneshot code,
         * otherwise we might deadlock vs. xtime_lock.
         */
        if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
                hrtimer_switch_to_hres();
                return;
        }

        raw_spin_lock_irqsave(&cpu_base->lock, flags);
        now = hrtimer_update_base(cpu_base);

        /* Soft timers are due: let the softirq expire them. */
        if (!ktime_before(now, cpu_base->softirq_expires_next)) {
                cpu_base->softirq_expires_next = KTIME_MAX;
                cpu_base->softirq_activated = 1;
                raise_timer_softirq(HRTIMER_SOFTIRQ);
        }

        __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
        raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}

/*
 * Sleep related functions:
 */
/*
 * hrtimer callback used by hrtimer_sleeper timers: detach the task recorded
 * in the sleeper and wake it. The timer is never restarted from here.
 */
static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer)
{
        struct hrtimer_sleeper *sleeper;
        struct task_struct *waiter;

        sleeper = container_of(timer, struct hrtimer_sleeper, timer);
        waiter = sleeper->task;

        /* A NULL ->task is what do_nanosleep() tests to detect expiry */
        sleeper->task = NULL;
        if (waiter != NULL)
                wake_up_process(waiter);

        return HRTIMER_NORESTART;
}

/**
 * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer
 * @sl:                sleeper to be started
 * @mode:        timer mode abs/rel
 *
 * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers
 * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context)
 */
void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl,
                                   enum hrtimer_mode mode)
{
        /*
         * Make the enqueue delivery mode check work on RT. If the sleeper
         * was initialized for hard interrupt delivery, force the mode bit.
         * This is a special case for hrtimer_sleepers because
         * __hrtimer_setup_sleeper() determines the delivery mode on RT so the
         * fiddling with this decision is avoided at the call sites.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard)
                mode |= HRTIMER_MODE_HARD;

        hrtimer_start_expires(&sl->timer, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires);

/*
 * Common setup for hrtimer_sleeper timers: choose the delivery mode,
 * install hrtimer_wakeup() as the callback and record the current task as
 * the one to wake.
 */
static void __hrtimer_setup_sleeper(struct hrtimer_sleeper *sl,
                                    clockid_t clock_id, enum hrtimer_mode mode)
{
        /*
         * PREEMPT_RT moves every hrtimer which is not explicitly marked for
         * hard interrupt expiry into soft interrupt context, either for
         * latency reasons or because the callback takes regular spinlocks
         * or calls functions which must not run in hard interrupt context
         * on PREEMPT_RT.
         *
         * The sleeper callback itself is fine in hard interrupt context on
         * RT. The concern is latency: untrusted userspace can create many
         * threads arming timers for the same expiry time on the same CPU,
         * and waking all of them at once causes a latency spike.
         *
         * Privileged real-time applications on the other hand depend on
         * the low latency of hard interrupt wakeups. So when the current
         * task has a real-time or deadline policy and soft expiry was not
         * requested, select hard interrupt expiry.
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
            rt_or_dl_task_policy(current) &&
            !(mode & HRTIMER_MODE_SOFT))
                mode |= HRTIMER_MODE_HARD;

        __hrtimer_setup(&sl->timer, hrtimer_wakeup, clock_id, mode);
        sl->task = current;
}

/**
 * hrtimer_setup_sleeper_on_stack - initialize a sleeper in stack memory
 * @sl:                sleeper to be initialized
 * @clock_id:        the clock to be used
 * @mode:        timer mode abs/rel
 *
 * Registers the embedded timer via debug_setup_on_stack() and then performs
 * the common sleeper setup, which records the calling task in @sl.
 */
void hrtimer_setup_sleeper_on_stack(struct hrtimer_sleeper *sl,
                                    clockid_t clock_id, enum hrtimer_mode mode)
{
        debug_setup_on_stack(&sl->timer, clock_id, mode);
        __hrtimer_setup_sleeper(sl, clock_id, mode);
}
EXPORT_SYMBOL_GPL(hrtimer_setup_sleeper_on_stack);

/*
 * Copy the remaining sleep time @ts out to the user buffer recorded in
 * @restart, using the layout selected by restart->nanosleep.type.
 *
 * Returns -EFAULT when the copy fails and -ERESTART_RESTARTBLOCK otherwise.
 * Any type other than TT_NATIVE (or TT_COMPAT when CONFIG_COMPAT_32BIT_TIME
 * is set) hits BUG().
 */
int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
        int fault = 0;

        switch (restart->nanosleep.type) {
#ifdef CONFIG_COMPAT_32BIT_TIME
        case TT_COMPAT:
                fault = put_old_timespec32(ts, restart->nanosleep.compat_rmtp);
                break;
#endif
        case TT_NATIVE:
                fault = put_timespec64(ts, restart->nanosleep.rmtp);
                break;
        default:
                BUG();
        }

        if (fault)
                return -EFAULT;

        return -ERESTART_RESTARTBLOCK;
}

/*
 * do_nanosleep - sleep until the sleeper timer expires or a signal is pending
 * @t:                prepared sleeper; its expiry has already been set by the caller
 * @mode:        timer mode used for the first start of the timer
 *
 * Returns 0 when the timer expired (t->task was cleared by hrtimer_wakeup())
 * or when no time remains. Otherwise returns the result of
 * nanosleep_copyout() when a remaining-time buffer was registered in the
 * restart block, or -ERESTART_RESTARTBLOCK when none was (type == TT_NONE).
 */
static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode)
{
        struct restart_block *restart;

        do {
                /* Set the task state before arming so a wakeup is not lost */
                set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
                hrtimer_sleeper_start_expires(t, mode);

                /* t->task is already NULL if the timer fired right away */
                if (likely(t->task))
                        schedule();

                hrtimer_cancel(&t->timer);
                /*
                 * Any further iteration re-arms in absolute mode.
                 * NOTE(review): presumably the stored expiry is absolute
                 * after the first start - confirm.
                 */
                mode = HRTIMER_MODE_ABS;

        } while (t->task && !signal_pending(current));

        __set_current_state(TASK_RUNNING);

        /* Timer expired: the full sleep completed */
        if (!t->task)
                return 0;

        /* Woken early: report the remaining time if the caller asked for it */
        restart = &current->restart_block;
        if (restart->nanosleep.type != TT_NONE) {
                ktime_t rem = hrtimer_expires_remaining(&t->timer);
                struct timespec64 rmt;

                if (rem <= 0)
                        return 0;
                rmt = ktime_to_timespec64(rem);

                return nanosleep_copyout(restart, &rmt);
        }
        return -ERESTART_RESTARTBLOCK;
}

/*
 * Restart handler for an interrupted nanosleep: rebuild an on-stack sleeper
 * from the clock id and absolute expiry saved in @restart and sleep again.
 * Returns whatever do_nanosleep() returns.
 */
static long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
{
        struct hrtimer_sleeper sleeper;
        int rc;

        hrtimer_setup_sleeper_on_stack(&sleeper, restart->nanosleep.clockid,
                                       HRTIMER_MODE_ABS);
        hrtimer_set_expires_tv64(&sleeper.timer, restart->nanosleep.expires);

        rc = do_nanosleep(&sleeper, HRTIMER_MODE_ABS);

        destroy_hrtimer_on_stack(&sleeper.timer);
        return rc;
}

/*
 * hrtimer_nanosleep - sleep on @clockid until @rqtp
 * @rqtp:        requested expiry, interpreted according to @mode
 * @mode:        timer mode abs/rel
 * @clockid:        the clock to be used
 *
 * Arms an on-stack sleeper with the current task's timer slack and sleeps.
 * If the sleep was interrupted and do_nanosleep() asked for a restart, an
 * absolute sleep returns -ERESTARTNOHAND, while any other mode stores the
 * clock id and expiry in the restart block and installs
 * hrtimer_nanosleep_restart(). Every other do_nanosleep() result is
 * returned unchanged.
 */
long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode,
                       const clockid_t clockid)
{
        struct hrtimer_sleeper sleeper;
        int err;

        hrtimer_setup_sleeper_on_stack(&sleeper, clockid, mode);
        hrtimer_set_expires_range_ns(&sleeper.timer, rqtp, current->timer_slack_ns);

        err = do_nanosleep(&sleeper, mode);
        if (err == -ERESTART_RESTARTBLOCK) {
                if (mode == HRTIMER_MODE_ABS) {
                        /* Absolute timers do not update the rmtp value and restart */
                        err = -ERESTARTNOHAND;
                } else {
                        struct restart_block *restart = &current->restart_block;

                        restart->nanosleep.clockid = sleeper.timer.base->clockid;
                        restart->nanosleep.expires = hrtimer_get_expires_tv64(&sleeper.timer);
                        set_restart_fn(restart, hrtimer_nanosleep_restart);
                }
        }

        destroy_hrtimer_on_stack(&sleeper.timer);
        return err;
}

#ifdef CONFIG_64BIT

/*
 * nanosleep(2): relative sleep on CLOCK_MONOTONIC.
 *
 * Returns -EFAULT when @rqtp cannot be read, -EINVAL when the requested time
 * fails timespec64_valid(), otherwise the result of hrtimer_nanosleep().
 * A non-NULL @rmtp is recorded in the restart block as a native timespec.
 */
SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp,
                struct __kernel_timespec __user *, rmtp)
{
        struct restart_block *restart = &current->restart_block;
        struct timespec64 req;

        if (get_timespec64(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        restart->fn = do_no_restart_syscall;
        restart->nanosleep.rmtp = rmtp;
        restart->nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}

#endif

#ifdef CONFIG_COMPAT_32BIT_TIME

/*
 * 32-bit time variant of nanosleep(2): relative sleep on CLOCK_MONOTONIC.
 *
 * Returns -EFAULT when @rqtp cannot be read, -EINVAL when the requested time
 * fails timespec64_valid(), otherwise the result of hrtimer_nanosleep().
 * A non-NULL @rmtp is recorded in the restart block as a compat timespec.
 */
SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp,
                       struct old_timespec32 __user *, rmtp)
{
        struct restart_block *restart = &current->restart_block;
        struct timespec64 req;

        if (get_old_timespec32(&req, rqtp))
                return -EFAULT;
        if (!timespec64_valid(&req))
                return -EINVAL;

        restart->fn = do_no_restart_syscall;
        restart->nanosleep.compat_rmtp = rmtp;
        restart->nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;

        return hrtimer_nanosleep(timespec64_to_ktime(req), HRTIMER_MODE_REL,
                                 CLOCK_MONOTONIC);
}
#endif

/*
 * Functions related to boot-time initialization:
 */
/*
 * Set up the per-CPU hrtimer base of @cpu: link each clock base to the CPU
 * base, initialize its sequence counter and timer queue, record the CPU
 * number and initialize the expiry lock. Always returns 0.
 */
int hrtimers_prepare_cpu(unsigned int cpu)
{
        struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
        int idx = 0;

        while (idx < HRTIMER_MAX_CLOCK_BASES) {
                struct hrtimer_clock_base *cb = &base->clock_base[idx];

                cb->cpu_base = base;
                seqcount_raw_spinlock_init(&cb->seq, &base->lock);
                timerqueue_init_head(&cb->active);
                idx++;
        }

        base->cpu = cpu;
        hrtimer_cpu_base_init_expiry_lock(base);

        return 0;
}

/*
 * Reset the hrtimer base of the CPU this runs on and mark it online.
 * @cpu is not used; the base is looked up with this_cpu_ptr().
 * Always returns 0.
 */
int hrtimers_cpu_starting(unsigned int cpu)
{
        struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);

        /* Drop whatever state a previous CPU down operation left behind */
        base->next_timer = NULL;
        base->softirq_next_timer = NULL;
        base->expires_next = KTIME_MAX;
        base->softirq_expires_next = KTIME_MAX;
        base->active_bases = 0;
        base->hres_active = 0;
        base->hang_detected = 0;

        base->online = 1;

        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Move every timer queued on @old_base over to @new_base, one at a time,
 * until @old_base's queue is empty. hrtimers_cpu_dying() calls this with
 * the locks of both CPU bases held.
 */
static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
                                struct hrtimer_clock_base *new_base)
{
        struct hrtimer *timer;
        struct timerqueue_node *node;

        while ((node = timerqueue_getnext(&old_base->active))) {
                timer = container_of(node, struct hrtimer, node);
                /* A timer whose callback is running must not be migrated */
                BUG_ON(hrtimer_callback_running(timer));
                debug_deactivate(timer);

                /*
                 * Mark it as ENQUEUED not INACTIVE otherwise the
                 * timer could be seen as !active and just vanish away
                 * under us on another CPU
                 */
                __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
                timer->base = new_base;
                /*
                 * Enqueue the timers on the new cpu. This does not
                 * reprogram the event device in case the timer
                 * expires before the earliest on that CPU;
                 * hrtimers_cpu_dying() asks the target CPU to retrigger
                 * its next event once everything has been migrated.
                 */
                enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
        }
}

/*
 * Hand all hrtimers of the CPU this runs on over to another CPU.
 *
 * The target is any CPU that is both active and in the HK_TYPE_TIMER
 * housekeeping mask. @dying_cpu is not used; the outgoing base is looked up
 * with this_cpu_ptr(). Always returns 0.
 */
int hrtimers_cpu_dying(unsigned int dying_cpu)
{
        int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
        struct hrtimer_cpu_base *old_base, *new_base;

        old_base = this_cpu_ptr(&hrtimer_bases);
        new_base = &per_cpu(hrtimer_bases, ncpu);

        /*
         * The caller is globally serialized and nobody else
         * takes two locks at once, deadlock is not possible.
         */
        raw_spin_lock(&old_base->lock);
        raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);

        for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
                migrate_hrtimer_list(&old_base->clock_base[i],
                                     &new_base->clock_base[i]);
        }

        /*
         * The migration might have changed the first expiring softirq
         * timer on the target CPU. Update it.
         */
        __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
        /* Tell the other CPU to retrigger the next event */
        smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);

        raw_spin_unlock(&new_base->lock);
        /* Mark the outgoing base offline while its lock is still held */
        old_base->online = 0;
        raw_spin_unlock(&old_base->lock);

        return 0;
}

#endif /* CONFIG_HOTPLUG_CPU */

/*
 * Boot-time setup: prepare and start the hrtimer base of the CPU this runs
 * on, then register hrtimer_run_softirq() as the HRTIMER_SOFTIRQ handler.
 */
void __init hrtimers_init(void)
{
        hrtimers_prepare_cpu(smp_processor_id());
        hrtimers_cpu_starting(smp_processor_id());
        open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}

















































  117 
























  654 











































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * Copyright (c) 2021, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 */
#ifndef __LINUX_PAGE_TABLE_CHECK_H
#define __LINUX_PAGE_TABLE_CHECK_H

#ifdef CONFIG_PAGE_TABLE_CHECK
#include <linux/jump_label.h>

extern struct static_key_true page_table_check_disabled;
extern struct page_ext_operations page_table_check_ops;

void __page_table_check_zero(struct page *page, unsigned int order);
void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte);
void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd);
void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud);
void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte,
                unsigned int nr);
void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd);
void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud);
void __page_table_check_pte_clear_range(struct mm_struct *mm,
                                        unsigned long addr,
                                        pmd_t pmd);

/*
 * Forward @page and @order to __page_table_check_zero(), unless the
 * page_table_check_disabled static key is set.
 */
static inline void page_table_check_alloc(struct page *page, unsigned int order)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_zero(page, order);
}

/*
 * Forward @page and @order to __page_table_check_zero(), unless the
 * page_table_check_disabled static key is set. Uses the same helper as
 * page_table_check_alloc().
 */
static inline void page_table_check_free(struct page *page, unsigned int order)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_zero(page, order);
}

/*
 * Forward @mm and @pte to __page_table_check_pte_clear(), unless the
 * page_table_check_disabled static key is set.
 */
static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pte_clear(mm, pte);
}

/*
 * Forward @mm and @pmd to __page_table_check_pmd_clear(), unless the
 * page_table_check_disabled static key is set.
 */
static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pmd_clear(mm, pmd);
}

/*
 * Forward @mm and @pud to __page_table_check_pud_clear(), unless the
 * page_table_check_disabled static key is set.
 */
static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pud_clear(mm, pud);
}

/*
 * Forward @mm, @ptep, @pte and the entry count @nr to
 * __page_table_check_ptes_set(), unless the page_table_check_disabled
 * static key is set.
 */
static inline void page_table_check_ptes_set(struct mm_struct *mm,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_ptes_set(mm, ptep, pte, nr);
}

/*
 * Forward @mm, @pmdp and @pmd to __page_table_check_pmd_set(), unless the
 * page_table_check_disabled static key is set.
 */
static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
                                            pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pmd_set(mm, pmdp, pmd);
}

/*
 * Forward @mm, @pudp and @pud to __page_table_check_pud_set(), unless the
 * page_table_check_disabled static key is set.
 */
static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
                                            pud_t pud)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pud_set(mm, pudp, pud);
}

/*
 * Forward @mm, @addr and @pmd to __page_table_check_pte_clear_range(),
 * unless the page_table_check_disabled static key is set.
 */
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    pmd_t pmd)
{
        if (!static_branch_likely(&page_table_check_disabled))
                __page_table_check_pte_clear_range(mm, addr, pmd);
}

#else

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_alloc(struct page *page, unsigned int order)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_free(struct page *page, unsigned int order)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_ptes_set(struct mm_struct *mm,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp,
                                            pmd_t pmd)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp,
                                            pud_t pud)
{
}

/* No-op stub used when CONFIG_PAGE_TABLE_CHECK is not set. */
static inline void page_table_check_pte_clear_range(struct mm_struct *mm,
                                                    unsigned long addr,
                                                    pmd_t pmd)
{
}

#endif /* CONFIG_PAGE_TABLE_CHECK */
#endif /* __LINUX_PAGE_TABLE_CHECK_H */


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2018 - Arm Ltd */

#ifndef __ARM64_KVM_RAS_H__
#define __ARM64_KVM_RAS_H__

#include <linux/acpi.h>
#include <linux/errno.h>
#include <linux/types.h>

#include <asm/acpi.h>

/*
 * Was this synchronous external abort a RAS notification?
 *
 * Asserts that interrupts are enabled and then passes the abort to
 * apei_claim_sea() with a NULL argument.
 *
 * Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
 */
static inline int kvm_handle_guest_sea(void)
{
        /* apei_claim_sea(NULL) expects to mask interrupts itself */
        lockdep_assert_irqs_enabled();

        return apei_claim_sea(NULL);
}

#endif /* __ARM64_KVM_RAS_H__ */
























































































































































































































    1 



















































































































































































































































































































































































    3 















































































































































































































































































    3 




    3 




    3 



















    5 











































































































































































   24 




















   23 




































   24 




   24 










   24 

   24 

   24 































   24 

   24 










   24 












   17 




    1 
    6 








    5 












    5 


    3 


    1 




    1 


































































    5 


    1 

    4 








    4 









    4 








    4 





























































































































    3 





















































   11 



    4 













    2 
















    1 








































































































    1 







    3 



    3 




















   24 



























































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/gre.h>
#include <net/pptp.h>
#include <net/tipc.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_labels.h>
#endif
#include <linux/bpf-netns.h>

/* Record @key_id as in use by setting its bit in @flow_dissector->used_keys. */
static void dissector_set_key(struct flow_dissector *flow_dissector,
                              enum flow_dissector_key_id key_id)
{
        flow_dissector->used_keys = flow_dissector->used_keys | (1ULL << key_id);
}

void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
                             const struct flow_dissector_key *key,
                             unsigned int key_count)
{
        unsigned int i;

        memset(flow_dissector, 0, sizeof(*flow_dissector));

        for (i = 0; i < key_count; i++, key++) {
                /* User should make sure that every key target offset is within
                 * boundaries of unsigned short.
                 */
                BUG_ON(key->offset > USHRT_MAX);
                BUG_ON(dissector_uses_key(flow_dissector,
                                          key->key_id));

                dissector_set_key(flow_dissector, key->key_id);
                flow_dissector->offset[key->key_id] = key->offset;
        }

        /* Ensure that the dissector always includes control and basic key.
         * That way we are able to avoid handling lack of these in fast path.
         */
        BUG_ON(!dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_CONTROL));
        BUG_ON(!dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_BASIC));
}
EXPORT_SYMBOL(skb_flow_dissector_init);

#ifdef CONFIG_BPF_SYSCALL
/* Check whether a BPF flow dissector may be attached to @net.
 *
 * Returns -EEXIST when attaching would conflict with a program that is
 * already attached (root vs. non-root namespaces), 0 otherwise.
 * @prog is not inspected here.
 */
int flow_dissector_bpf_prog_attach_check(struct net *net,
                                         struct bpf_prog *prog)
{
        enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
        struct net *ns;

        if (net != &init_net) {
                /* Attaching to a non-root namespace is refused while the
                 * root namespace has a flow dissector attached.
                 */
                return rcu_access_pointer(init_net.bpf.run_array[type]) ?
                       -EEXIST : 0;
        }

        /* A BPF flow dissector in the root namespace overrides any
         * per-net-namespace one, so attaching to root is refused while
         * any non-root namespace has a program attached.
         */
        for_each_net(ns) {
                if (ns != &init_net &&
                    rcu_access_pointer(ns->bpf.run_array[type]))
                        return -EEXIST;
        }

        return 0;
}
#endif /* CONFIG_BPF_SYSCALL */

/**
 * skb_flow_get_ports - fetch the upper layer ports of a packet
 * @skb: sk_buff to read the ports from
 * @thoff: transport header offset
 * @ip_proto: protocol whose port offset is looked up
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The ports are read at offset thoff + poff, where poff is the protocol
 * port offset returned by proto_ports_offset().
 *
 * Return: the 32-bit ports word, or 0 when the protocol has no port offset
 * (negative poff) or the word cannot be read.
 */
__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
                          const void *data, int hlen)
{
        const int port_off = proto_ports_offset(ip_proto);
        __be32 port_buf;
        __be32 *port_ptr;

        if (!data) {
                data = skb->data;
                hlen = skb_headlen(skb);
        }

        if (port_off < 0)
                return 0;

        port_ptr = __skb_header_pointer(skb, thoff + port_off,
                                        sizeof(port_buf), data, hlen,
                                        &port_buf);

        return port_ptr ? *port_ptr : 0;
}
EXPORT_SYMBOL(skb_flow_get_ports);

/* Tell whether @type is one of the ICMP/ICMPv6 echo or ICMP timestamp
 * request/reply types, i.e. the types for which callers read the echo id.
 */
static bool icmp_has_id(u8 type)
{
        switch (type) {
        case ICMP_ECHO:
        case ICMP_ECHOREPLY:
        case ICMP_TIMESTAMP:
        case ICMP_TIMESTAMPREPLY:
        case ICMPV6_ECHO_REQUEST:
        case ICMPV6_ECHO_REPLY:
                return true;
        default:
                return false;
        }
}

/**
 * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
 * @skb: sk_buff to extract from
 * @key_icmp: struct flow_dissector_key_icmp to fill
 * @data: raw buffer pointer to the packet
 * @thoff: offset to extract at
 * @hlen: packet header length
 */
void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
                           const void *data, int thoff, int hlen)
{
        struct icmphdr *ih, _ih;

        ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
        if (!ih)
                return;

        key_icmp->type = ih->type;
        key_icmp->code = ih->code;

        /* As we use 0 to signal that the Id field is not present,
         * avoid confusion with packets without such field
         */
        if (icmp_has_id(ih->type))
                key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
        else
                key_icmp->id = 0;
}
EXPORT_SYMBOL(skb_flow_get_icmp_tci);

/* Fill the ICMP key through skb_flow_get_icmp_tci(), but only when the
 * dissector was set up with FLOW_DISSECTOR_KEY_ICMP.
 */
static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
                                    void *target_container, const void *data,
                                    int thoff, int hlen)
{
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP)) {
                struct flow_dissector_key_icmp *icmp_key;

                icmp_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ICMP,
                                                     target_container);

                skb_flow_get_icmp_tci(skb, icmp_key, data, thoff, hlen);
        }
}

static void __skb_flow_dissect_ah(const struct sk_buff *skb,
                                  struct flow_dissector *flow_dissector,
                                  void *target_container, const void *data,
                                  int nhoff, int hlen)
{
        struct flow_dissector_key_ipsec *key_ah;
        struct ip_auth_hdr _hdr, *hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
                return;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
        if (!hdr)
                return;

        key_ah = skb_flow_dissector_target(flow_dissector,
                                           FLOW_DISSECTOR_KEY_IPSEC,
                                           target_container);

        key_ah->spi = hdr->spi;
}

static void __skb_flow_dissect_esp(const struct sk_buff *skb,
                                   struct flow_dissector *flow_dissector,
                                   void *target_container, const void *data,
                                   int nhoff, int hlen)
{
        struct flow_dissector_key_ipsec *key_esp;
        struct ip_esp_hdr _hdr, *hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
                return;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
        if (!hdr)
                return;

        key_esp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_IPSEC,
                                            target_container);

        key_esp->spi = hdr->spi;
}

/* Copy the 32-bit session id located at @nhoff into the L2TPV3 key.
 * Does nothing when the dissector does not use FLOW_DISSECTOR_KEY_L2TPV3 or
 * when the session id cannot be read.
 */
static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
                                      struct flow_dissector *flow_dissector,
                                      void *target_container, const void *data,
                                      int nhoff, int hlen)
{
        struct flow_dissector_key_l2tpv3 *l2tp_key;
        __be32 session_buf;
        const __be32 *session_id;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
                return;

        /* Only the leading session id word of the header is consulted. */
        session_id = __skb_header_pointer(skb, nhoff, sizeof(session_buf),
                                          data, hlen, &session_buf);
        if (!session_id)
                return;

        l2tp_key = skb_flow_dissector_target(flow_dissector,
                                             FLOW_DISSECTOR_KEY_L2TPV3,
                                             target_container);
        l2tp_key->session_id = *session_id;
}

/* Fill the META key from @skb: the ingress interface index always, and,
 * with CONFIG_NET_TC_SKB_EXT, the l2_miss bit of the TC skb extension when
 * tc_skb_ext_tc_enabled() is true and the extension is present.
 * Does nothing when the dissector does not use FLOW_DISSECTOR_KEY_META.
 */
void skb_flow_dissect_meta(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container)
{
        struct flow_dissector_key_meta *key_meta;
#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        struct tc_skb_ext *tc_ext;
#endif

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
                return;

        key_meta = skb_flow_dissector_target(flow_dissector,
                                             FLOW_DISSECTOR_KEY_META,
                                             target_container);
        key_meta->ingress_ifindex = skb->skb_iif;

#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
        if (!tc_skb_ext_tc_enabled())
                return;

        tc_ext = skb_ext_find(skb, TC_SKB_EXT);
        if (tc_ext)
                key_meta->l2_miss = tc_ext->l2_miss;
#endif
}
EXPORT_SYMBOL(skb_flow_dissect_meta);

/* Record the address type and control flags of the encapsulation in the
 * ENC_CONTROL key, when the dissector uses that key.
 */
static void
skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type,
                                 u32 ctrl_flags,
                                 struct flow_dissector *flow_dissector,
                                 void *target_container)
{
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_ENC_CONTROL)) {
                struct flow_dissector_key_control *enc_ctrl;

                enc_ctrl = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_ENC_CONTROL,
                                                     target_container);
                enc_ctrl->addr_type = type;
                enc_ctrl->flags = ctrl_flags;
        }
}

/* Fill the CT key from the conntrack entry attached to @skb.
 *
 * Compiles to an empty function without CONFIG_NF_CONNTRACK. Otherwise:
 *  - nothing is written when the dissector does not use
 *    FLOW_DISSECTOR_KEY_CT, or when there is no conntrack entry and
 *    @post_ct is false;
 *  - with no conntrack entry but @post_ct set, the state is reported as
 *    tracked + invalid and @zone is used as the zone;
 *  - otherwise the state comes from @ctinfo_map (only when ctinfo is below
 *    @mapsize), and zone, mark and labels are copied from the entry where
 *    the matching config options / label extension are available.
 */
void
skb_flow_dissect_ct(const struct sk_buff *skb,
                    struct flow_dissector *flow_dissector,
                    void *target_container, u16 *ctinfo_map,
                    size_t mapsize, bool post_ct, u16 zone)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
        struct flow_dissector_key_ct *ct_key;
        enum ip_conntrack_info ctinfo;
        struct nf_conn_labels *labels;
        struct nf_conn *conn;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
                return;

        conn = nf_ct_get(skb, &ctinfo);
        if (!conn) {
                if (!post_ct)
                        return;

                ct_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_CT,
                                                   target_container);
                ct_key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
                                   TCA_FLOWER_KEY_CT_FLAGS_INVALID;
                ct_key->ct_zone = zone;
                return;
        }

        ct_key = skb_flow_dissector_target(flow_dissector,
                                           FLOW_DISSECTOR_KEY_CT,
                                           target_container);

        /* Guard the table lookup against an out-of-range ctinfo. */
        if (ctinfo < mapsize)
                ct_key->ct_state = ctinfo_map[ctinfo];
#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
        ct_key->ct_zone = conn->zone.id;
#endif
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
        ct_key->ct_mark = READ_ONCE(conn->mark);
#endif

        labels = nf_ct_labels_find(conn);
        if (labels)
                memcpy(ct_key->ct_labels, labels->bits,
                       sizeof(ct_key->ct_labels));
#endif /* CONFIG_NF_CONNTRACK */
}
EXPORT_SYMBOL(skb_flow_dissect_ct);

/* Copy the tunnel metadata attached to @skb into the ENC_* keys that
 * @flow_dissector uses.
 *
 * Returns without writing anything when none of the ENC_* keys checked
 * below is in use, or when skb_tunnel_info() yields no tunnel info.
 * Each key is only written when the dissector uses it.
 */
void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
                             struct flow_dissector *flow_dissector,
                             void *target_container)
{
        struct ip_tunnel_info *info;
        struct ip_tunnel_key *key;
        u32 ctrl_flags = 0;

        /* A quick check to see if there might be something to do. */
        if (!dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_KEYID) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_PORTS) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_IP) &&
            !dissector_uses_key(flow_dissector,
                                FLOW_DISSECTOR_KEY_ENC_OPTS))
                return;

        info = skb_tunnel_info(skb);
        if (!info)
                return;

        key = &info->key;

        /* Translate the tunnel flag bits into flow dissector control flags. */
        if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags))
                ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM;
        if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
                ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT;
        if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags))
                ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM;
        if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags))
                ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT;

        /* The enc control key is set for every address family (with address
         * type 0 for anything other than AF_INET/AF_INET6); the outer
         * addresses are only copied for the two known families.
         */
        switch (ip_tunnel_info_af(info)) {
        case AF_INET:
                skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                 ctrl_flags, flow_dissector,
                                                 target_container);
                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
                        struct flow_dissector_key_ipv4_addrs *ipv4;

                        ipv4 = skb_flow_dissector_target(flow_dissector,
                                                         FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS,
                                                         target_container);
                        ipv4->src = key->u.ipv4.src;
                        ipv4->dst = key->u.ipv4.dst;
                }
                break;
        case AF_INET6:
                skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                 ctrl_flags, flow_dissector,
                                                 target_container);
                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
                        struct flow_dissector_key_ipv6_addrs *ipv6;

                        ipv6 = skb_flow_dissector_target(flow_dissector,
                                                         FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
                                                         target_container);
                        ipv6->src = key->u.ipv6.src;
                        ipv6->dst = key->u.ipv6.dst;
                }
                break;
        default:
                skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector,
                                                 target_container);
                break;
        }

        /* Tunnel id, converted to a 32-bit key. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
                struct flow_dissector_key_keyid *keyid;

                keyid = skb_flow_dissector_target(flow_dissector,
                                                  FLOW_DISSECTOR_KEY_ENC_KEYID,
                                                  target_container);
                keyid->keyid = tunnel_id_to_key32(key->tun_id);
        }

        /* Transport ports stored in the tunnel key. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_PORTS)) {
                struct flow_dissector_key_ports *tp;

                tp = skb_flow_dissector_target(flow_dissector,
                                               FLOW_DISSECTOR_KEY_ENC_PORTS,
                                               target_container);
                tp->src = key->tp_src;
                tp->dst = key->tp_dst;
        }

        /* TOS and TTL stored in the tunnel key. */
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
                struct flow_dissector_key_ip *ip;

                ip = skb_flow_dissector_target(flow_dissector,
                                               FLOW_DISSECTOR_KEY_ENC_IP,
                                               target_container);
                ip->tos = key->tos;
                ip->ttl = key->ttl;
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
                struct flow_dissector_key_enc_opts *enc_opt;
                IP_TUNNEL_DECLARE_FLAGS(flags) = { };
                u32 val;

                enc_opt = skb_flow_dissector_target(flow_dissector,
                                                    FLOW_DISSECTOR_KEY_ENC_OPTS,
                                                    target_container);

                /* No options: leave enc_opt untouched. This return exits the
                 * whole function, which is fine only because this is the
                 * last section.
                 */
                if (!info->options_len)
                        return;

                enc_opt->len = info->options_len;
                ip_tunnel_info_opts_get(enc_opt->data, info);

                /* Keep only the "options present" bits that are also set in
                 * the tunnel flags, then report the first such bit at or
                 * after IP_TUNNEL_GENEVE_OPT_BIT as the option type (0 when
                 * none is set).
                 */
                ip_tunnel_set_options_present(flags);
                ip_tunnel_flags_and(flags, info->key.tun_flags, flags);

                val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
                                    IP_TUNNEL_GENEVE_OPT_BIT);
                enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
        }
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);

/* Store skb_get_hash_raw(@skb) in the HASH key, when the dissector uses
 * FLOW_DISSECTOR_KEY_HASH.
 */
void skb_flow_dissect_hash(const struct sk_buff *skb,
                           struct flow_dissector *flow_dissector,
                           void *target_container)
{
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH)) {
                struct flow_dissector_key_hash *hash_key;

                hash_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_HASH,
                                                     target_container);
                hash_key->hash = skb_get_hash_raw(skb);
        }
}
EXPORT_SYMBOL(skb_flow_dissect_hash);

/* Dissect the MPLS label stack entry at @nhoff.
 *
 * @lse_index is the position of this entry in the label stack and selects
 * the slot of the MPLS key that is filled. @entropy_label carries state
 * between calls: on entry it tells whether the previous entry was the
 * entropy label indicator, on exit whether this one is.
 *
 * Returns FLOW_DISSECT_RET_OUT_GOOD when neither MPLS key is used, when
 * @lse_index is beyond FLOW_DIS_MPLS_MAX or when the bottom-of-stack bit is
 * set; FLOW_DISSECT_RET_OUT_BAD when the entry cannot be read; otherwise
 * FLOW_DISSECT_RET_PROTO_AGAIN so that the next entry gets dissected.
 */
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data, int nhoff,
                        int hlen, int lse_index, bool *entropy_label)
{
        const bool want_entropy =
                dissector_uses_key(flow_dissector,
                                   FLOW_DISSECTOR_KEY_MPLS_ENTROPY);
        const bool want_mpls =
                dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS);
        struct mpls_label lse_buf;
        const struct mpls_label *lse_hdr;
        u32 word, label, bos;

        if (!(want_entropy || want_mpls))
                return FLOW_DISSECT_RET_OUT_GOOD;

        if (lse_index >= FLOW_DIS_MPLS_MAX)
                return FLOW_DISSECT_RET_OUT_GOOD;

        lse_hdr = __skb_header_pointer(skb, nhoff, sizeof(lse_buf), data,
                                       hlen, &lse_buf);
        if (!lse_hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        word = ntohl(lse_hdr->entry);
        label = (word & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
        bos = (word & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;

        if (want_mpls) {
                struct flow_dissector_key_mpls *mpls_key;
                struct flow_dissector_mpls_lse *slot;

                mpls_key = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_MPLS,
                                                     target_container);
                slot = &mpls_key->ls[lse_index];

                slot->mpls_label = label;
                slot->mpls_tc = (word & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
                slot->mpls_bos = bos;
                slot->mpls_ttl = (word & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
                dissector_set_mpls_lse(mpls_key, lse_index);
        }

        /* The previous entry was the entropy label indicator, so this
         * entry's label is the entropy value itself.
         */
        if (*entropy_label && want_entropy) {
                struct flow_dissector_key_keyid *entropy_key;

                entropy_key = skb_flow_dissector_target(flow_dissector,
                                                        FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
                                                        target_container);
                entropy_key->keyid = cpu_to_be32(label);
        }

        /* Remember for the next call whether this entry is the indicator. */
        *entropy_label = label == MPLS_LABEL_ENTROPY;

        if (bos)
                return FLOW_DISSECT_RET_OUT_GOOD;

        return FLOW_DISSECT_RET_PROTO_AGAIN;
}

static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int nhoff, int hlen)
{
        struct flow_dissector_key_arp *key_arp;
        struct {
                unsigned char ar_sha[ETH_ALEN];
                unsigned char ar_sip[4];
                unsigned char ar_tha[ETH_ALEN];
                unsigned char ar_tip[4];
        } *arp_eth, _arp_eth;
        const struct arphdr *arp;
        struct arphdr _arp;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP))
                return FLOW_DISSECT_RET_OUT_GOOD;

        arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data,
                                   hlen, &_arp);
        if (!arp)
                return FLOW_DISSECT_RET_OUT_BAD;

        if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
            arp->ar_pro != htons(ETH_P_IP) ||
            arp->ar_hln != ETH_ALEN ||
            arp->ar_pln != 4 ||
            (arp->ar_op != htons(ARPOP_REPLY) &&
             arp->ar_op != htons(ARPOP_REQUEST)))
                return FLOW_DISSECT_RET_OUT_BAD;

        arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
                                       sizeof(_arp_eth), data,
                                       hlen, &_arp_eth);
        if (!arp_eth)
                return FLOW_DISSECT_RET_OUT_BAD;

        key_arp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_ARP,
                                            target_container);

        memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip));
        memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip));

        /* Only store the lower byte of the opcode;
         * this covers ARPOP_REPLY and ARPOP_REQUEST.
         */
        key_arp->op = ntohs(arp->ar_op) & 0xff;

        ether_addr_copy(key_arp->sha, arp_eth->ar_sha);
        ether_addr_copy(key_arp->tha, arp_eth->ar_tha);

        return FLOW_DISSECT_RET_OUT_GOOD;
}

static enum flow_dissect_ret
__skb_flow_dissect_cfm(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int nhoff, int hlen)
{
        struct flow_dissector_key_cfm *key, *hdr, _hdr;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
                return FLOW_DISSECT_RET_OUT_GOOD;

        hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
                                        target_container);

        key->mdl_ver = hdr->mdl_ver;
        key->opcode = hdr->opcode;

        return FLOW_DISSECT_RET_OUT_GOOD;
}

/* Dissect a GRE header (version 0, or version 1 as used by PPTP) located at
 * *@p_nhoff and step over it.
 *
 * @p_proto: updated with the protocol to dissect next
 * @p_nhoff: advanced past the GRE header (and any inner Ethernet or PPP
 *           header) on success
 * @p_hlen:  may be reduced after an inner Ethernet header, see below
 *
 * The GRE key, when present, is stored in the GRE_KEYID key if the
 * dissector uses it.
 *
 * Returns FLOW_DISSECT_RET_OUT_BAD when a required header cannot be read,
 * FLOW_DISSECT_RET_OUT_GOOD when dissection stops here (routing present,
 * unsupported version, version 1 that is not PPTP, or
 * FLOW_DISSECTOR_F_STOP_AT_ENCAP requested), and
 * FLOW_DISSECT_RET_PROTO_AGAIN when the inner protocol should be dissected.
 */
static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
                       struct flow_dissector_key_control *key_control,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       __be16 *p_proto, int *p_nhoff, int *p_hlen,
                       unsigned int flags)
{
        struct flow_dissector_key_keyid *key_keyid;
        struct gre_base_hdr *hdr, _hdr;
        /* Running length of the GRE header and what follows it; added to
         * *p_nhoff only once everything has been parsed successfully.
         */
        int offset = 0;
        u16 gre_ver;

        hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr),
                                   data, *p_hlen, &_hdr);
        if (!hdr)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Only look inside GRE without routing */
        if (hdr->flags & GRE_ROUTING)
                return FLOW_DISSECT_RET_OUT_GOOD;

        /* Only look inside GRE for version 0 and 1 */
        gre_ver = ntohs(hdr->flags & GRE_VERSION);
        if (gre_ver > 1)
                return FLOW_DISSECT_RET_OUT_GOOD;

        /* Note: *p_proto is already overwritten here, even if one of the
         * checks below ends the dissection early.
         */
        *p_proto = hdr->protocol;
        if (gre_ver) {
                /* Version1 must be PPTP, and check the flags */
                if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY)))
                        return FLOW_DISSECT_RET_OUT_GOOD;
        }

        offset += sizeof(struct gre_base_hdr);

        /* Optional checksum + reserved1 fields are skipped, not verified. */
        if (hdr->flags & GRE_CSUM)
                offset += sizeof_field(struct gre_full_hdr, csum) +
                          sizeof_field(struct gre_full_hdr, reserved1);

        if (hdr->flags & GRE_KEY) {
                const __be32 *keyid;
                __be32 _keyid;

                keyid = __skb_header_pointer(skb, *p_nhoff + offset,
                                             sizeof(_keyid),
                                             data, *p_hlen, &_keyid);
                if (!keyid)
                        return FLOW_DISSECT_RET_OUT_BAD;

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_GRE_KEYID)) {
                        key_keyid = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_GRE_KEYID,
                                                              target_container);
                        /* Version 0 stores the whole key word; version 1
                         * (PPTP) keeps only the bits selected by
                         * GRE_PPTP_KEY_MASK.
                         */
                        if (gre_ver == 0)
                                key_keyid->keyid = *keyid;
                        else
                                key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
                }
                offset += sizeof_field(struct gre_full_hdr, key);
        }

        /* Optional sequence number is skipped. */
        if (hdr->flags & GRE_SEQ)
                offset += sizeof_field(struct pptp_gre_header, seq);

        if (gre_ver == 0) {
                /* Transparent Ethernet bridging: an Ethernet header follows,
                 * and its h_proto becomes the next protocol.
                 */
                if (*p_proto == htons(ETH_P_TEB)) {
                        const struct ethhdr *eth;
                        struct ethhdr _eth;

                        eth = __skb_header_pointer(skb, *p_nhoff + offset,
                                                   sizeof(_eth),
                                                   data, *p_hlen, &_eth);
                        if (!eth)
                                return FLOW_DISSECT_RET_OUT_BAD;
                        *p_proto = eth->h_proto;
                        offset += sizeof(*eth);

                        /* Cap headers that we access via pointers at the
                         * end of the Ethernet header as our maximum alignment
                         * at that point is only 2 bytes.
                         */
                        if (NET_IP_ALIGN)
                                *p_hlen = *p_nhoff + offset;
                }
        } else { /* version 1, must be PPTP */
                u8 _ppp_hdr[PPP_HDRLEN];
                u8 *ppp_hdr;

                /* Optional acknowledgment number is skipped. */
                if (hdr->flags & GRE_ACK)
                        offset += sizeof_field(struct pptp_gre_header, ack);

                ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
                                               sizeof(_ppp_hdr),
                                               data, *p_hlen, _ppp_hdr);
                if (!ppp_hdr)
                        return FLOW_DISSECT_RET_OUT_BAD;

                /* Map the PPP protocol to an ethertype; for any other PPP
                 * protocol *p_proto keeps the GRE protocol value
                 * (GRE_PROTO_PPP, as checked above).
                 */
                switch (PPP_PROTOCOL(ppp_hdr)) {
                case PPP_IP:
                        *p_proto = htons(ETH_P_IP);
                        break;
                case PPP_IPV6:
                        *p_proto = htons(ETH_P_IPV6);
                        break;
                default:
                        /* Could probably catch some more like MPLS */
                        break;
                }

                offset += PPP_HDRLEN;
        }

        /* Everything parsed: commit the new offset and flag encapsulation. */
        *p_nhoff += offset;
        key_control->flags |= FLOW_DIS_ENCAPSULATION;
        if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP)
                return FLOW_DISSECT_RET_OUT_GOOD;

        return FLOW_DISSECT_RET_PROTO_AGAIN;
}

/**
 * __skb_flow_dissect_batadv() - dissect batman-adv header
 * @skb: sk_buff holding the batman-adv header
 * @key_control: flow dissector's control key
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @p_proto: pointer used to update the protocol to process next
 * @p_nhoff: pointer used to update inner network header offset
 * @hlen: packet header length
 * @flags: any combination of FLOW_DISSECTOR_F_*
 *
 * Tries to dissect ETH_P_BATMAN packets. Only &struct batadv_unicast_packet
 * packets are processed, because they carry an inner ethernet header that is
 * usually followed by the actual network header, which lets the flow
 * dissector continue with the packet.
 *
 * Return: FLOW_DISSECT_RET_PROTO_AGAIN when a batman-adv unicast packet was
 *  found, FLOW_DISSECT_RET_OUT_GOOD when the dissector should stop after the
 *  encapsulation, otherwise FLOW_DISSECT_RET_OUT_BAD
 */
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
                          struct flow_dissector_key_control *key_control,
                          const void *data, __be16 *p_proto, int *p_nhoff,
                          int hlen, unsigned int flags)
{
        /* batman-adv unicast header immediately followed by the inner
         * ethernet header.
         */
        struct {
                struct batadv_unicast_packet batadv_unicast;
                struct ethhdr eth;
        } *encap, encap_buf;

        encap = __skb_header_pointer(skb, *p_nhoff, sizeof(encap_buf), data,
                                     hlen, &encap_buf);
        if (!encap)
                return FLOW_DISSECT_RET_OUT_BAD;

        /* Accept only unicast packets of the supported compat version. */
        if (encap->batadv_unicast.version != BATADV_COMPAT_VERSION ||
            encap->batadv_unicast.packet_type != BATADV_UNICAST)
                return FLOW_DISSECT_RET_OUT_BAD;

        *p_proto = encap->eth.h_proto;
        *p_nhoff += sizeof(encap_buf);

        key_control->flags |= FLOW_DIS_ENCAPSULATION;

        return (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) ?
               FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}

static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
                       void *target_container, const void *data,
                       int thoff, int hlen)
{
        struct flow_dissector_key_tcp *key_tcp;
        struct tcphdr *th, _th;

        if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TCP))
                return;

        th = __skb_header_pointer(skb, thoff, sizeof(_th), data, hlen, &_th);
        if (!th)
                return;

        if (unlikely(__tcp_hdrlen(th) < sizeof(_th)))
                return;

        key_tcp = skb_flow_dissector_target(flow_dissector,
                                            FLOW_DISSECTOR_KEY_TCP,
                                            target_container);
        key_tcp->flags = (*(__be16 *) &tcp_flag_word(th) & htons(0x0FFF));
}

/* Read the transport ports once via skb_flow_get_ports() and store them in
 * the PORTS and/or PORTS_RANGE keys, whichever the dissector uses.
 * The packet is not touched when neither key is in use.
 */
static void
__skb_flow_dissect_ports(const struct sk_buff *skb,
                         struct flow_dissector *flow_dissector,
                         void *target_container, const void *data,
                         int nhoff, u8 ip_proto, int hlen)
{
        struct flow_dissector_key_ports_range *range_key = NULL;
        struct flow_dissector_key_ports *ports_key = NULL;
        __be32 ports;

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE))
                range_key = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_PORTS_RANGE,
                                                      target_container);

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
                ports_key = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_PORTS,
                                                      target_container);

        if (!(ports_key || range_key))
                return;

        ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);

        if (ports_key)
                ports_key->ports = ports;

        if (range_key)
                range_key->tp.ports = ports;
}

/* Copy TOS and TTL of the IPv4 header @iph into the IP key, when the
 * dissector uses FLOW_DISSECTOR_KEY_IP. @skb and @data are not used here.
 */
static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        const struct iphdr *iph)
{
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) {
                struct flow_dissector_key_ip *ip_key;

                ip_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_IP,
                                                   target_container);
                ip_key->tos = iph->tos;
                ip_key->ttl = iph->ttl;
        }
}

/*
 * Fill the FLOW_DISSECTOR_KEY_IP key from the IPv6 header @iph, if the
 * dissector uses that key: tos receives ipv6_get_dsfield(iph) and ttl
 * receives the hop_limit field.
 */
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        const struct ipv6hdr *iph)
{
        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IP)) {
                struct flow_dissector_key_ip *ip_key;

                ip_key = skb_flow_dissector_target(flow_dissector,
                                                   FLOW_DISSECTOR_KEY_IP,
                                                   target_container);
                ip_key->tos = ipv6_get_dsfield(iph);
                ip_key->ttl = iph->hop_limit;
        }
}

/* Upper bound on the number of protocol headers that can be parsed in
 * __skb_flow_dissect
 */
#define MAX_FLOW_DISSECT_HDRS        15

/* Bump the header counter at @num_hdrs by one and report whether the new
 * count is still within MAX_FLOW_DISSECT_HDRS.
 */
static bool skb_flow_dissect_allowed(int *num_hdrs)
{
        const int parsed = *num_hdrs + 1;

        *num_hdrs = parsed;

        return parsed <= MAX_FLOW_DISSECT_HDRS;
}

static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
                                     struct flow_dissector *flow_dissector,
                                     void *target_container)
{
        struct flow_dissector_key_ports_range *key_ports_range = NULL;
        struct flow_dissector_key_ports *key_ports = NULL;
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
        struct flow_dissector_key_addrs *key_addrs;
        struct flow_dissector_key_tags *key_tags;

        key_control = skb_flow_dissector_target(flow_dissector,
                                                FLOW_DISSECTOR_KEY_CONTROL,
                                                target_container);
        key_control->thoff = flow_keys->thoff;
        if (flow_keys->is_frag)
                key_control->flags |= FLOW_DIS_IS_FRAGMENT;
        if (flow_keys->is_first_frag)
                key_control->flags |= FLOW_DIS_FIRST_FRAG;
        if (flow_keys->is_encap)
                key_control->flags |= FLOW_DIS_ENCAPSULATION;

        key_basic = skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_BASIC,
                                              target_container);
        key_basic->n_proto = flow_keys->n_proto;
        key_basic->ip_proto = flow_keys->ip_proto;

        if (flow_keys->addr_proto == ETH_P_IP &&
            dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
                key_addrs = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                      target_container);
                key_addrs->v4addrs.src = flow_keys->ipv4_src;
                key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
                key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
        } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
                   dissector_uses_key(flow_dissector,
                                      FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
                key_addrs = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                      target_container);
                memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
                       sizeof(key_addrs->v6addrs.src));
                memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
                       sizeof(key_addrs->v6addrs.dst));
                key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        }

        if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
                key_ports = skb_flow_dissector_target(flow_dissector,
                                                      FLOW_DISSECTOR_KEY_PORTS,
                                                      target_container);
                key_ports->src = flow_keys->sport;
                key_ports->dst = flow_keys->dport;
        }
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
                key_ports_range = skb_flow_dissector_target(flow_dissector,
                                                            FLOW_DISSECTOR_KEY_PORTS_RANGE,
                                                            target_container);
                key_ports_range->tp.src = flow_keys->sport;
                key_ports_range->tp.dst = flow_keys->dport;
        }

        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                key_tags = skb_flow_dissector_target(flow_dissector,
                                                     FLOW_DISSECTOR_KEY_FLOW_LABEL,
                                                     target_container);
                key_tags->flow_label = ntohl(flow_keys->flow_label);
        }
}

/*
 * Run the BPF flow dissector program @prog on @ctx.
 *
 * ctx->flow_keys is zeroed and seeded with @proto, @nhoff (also used as the
 * initial thoff) and @flags before the program runs. Afterwards the offsets
 * found in flow_keys are clamped: nhoff into [@nhoff, @hlen] and thoff into
 * [clamped nhoff, @hlen], both evaluated as u16.
 *
 * Returns the value produced by bpf_prog_run_pin_on_cpu().
 */
u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
                     __be16 proto, int nhoff, int hlen, unsigned int flags)
{
        struct bpf_flow_keys *keys = ctx->flow_keys;
        u32 verdict;

        /* @flags is handed over verbatim, so both flag sets must agree */
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
                     (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
                     (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
        BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
                     (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);

        /* Inputs for the BPF program */
        memset(keys, 0, sizeof(*keys));
        keys->n_proto = proto;
        keys->nhoff = nhoff;
        keys->thoff = keys->nhoff;
        keys->flags = flags;

        verdict = bpf_prog_run_pin_on_cpu(prog, ctx);

        /* Do not trust the offsets the program wrote back */
        keys->nhoff = clamp_t(u16, keys->nhoff, nhoff, hlen);
        keys->thoff = clamp_t(u16, keys->thoff, keys->nhoff, hlen);

        return verdict;
}

static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
{
        return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0;
}

/**
 * __skb_flow_dissect - extract the flow_keys struct and return it
 * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
 * @flow_dissector: list of keys to dissect
 * @target_container: target structure to put dissected values into
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 * @flags: flags that control the dissection process, e.g.
 *         FLOW_DISSECTOR_F_STOP_AT_ENCAP.
 *
 * The function will try to retrieve individual keys into target specified
 * by flow_dissector from either the skbuff or a raw buffer specified by the
 * rest parameters.
 *
 * Caller must take care of zeroing target container memory.
 *
 * Return: true if dissection completed without hitting a header that could
 * not be read or an unhandled ethertype (this includes stopping early on
 * request via @flags or on reaching MAX_FLOW_DISSECT_HDRS), false otherwise.
 * When an attached BPF flow dissector program returns anything other than
 * BPF_FLOW_DISSECTOR_CONTINUE, the result is true only if that value is
 * BPF_OK.
 */
bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
                        void *target_container, const void *data,
                        __be16 proto, int nhoff, int hlen, unsigned int flags)
{
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
        struct flow_dissector_key_addrs *key_addrs;
        struct flow_dissector_key_tags *key_tags;
        struct flow_dissector_key_vlan *key_vlan;
        enum flow_dissect_ret fdret;
        /* FLOW_DISSECTOR_KEY_MAX here means "no VLAN tag seen yet" */
        enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
        bool mpls_el = false;
        int mpls_lse = 0;
        int num_hdrs = 0;
        u8 ip_proto = 0;
        bool ret;

        /* No raw buffer supplied: take data, proto, nhoff and hlen from skb */
        if (!data) {
                data = skb->data;
                proto = skb_vlan_tag_present(skb) ?
                         skb->vlan_proto : skb->protocol;
                nhoff = skb_network_offset(skb);
                hlen = skb_headlen(skb);
#if IS_ENABLED(CONFIG_NET_DSA)
                if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
                             proto == htons(ETH_P_XDSA))) {
                        struct metadata_dst *md_dst = skb_metadata_dst(skb);
                        const struct dsa_device_ops *ops;
                        int offset = 0;

                        ops = skb->dev->dsa_ptr->tag_ops;
                        /* Only DSA header taggers break flow dissection */
                        if (ops->needed_headroom &&
                            (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) {
                                if (ops->flow_dissect)
                                        ops->flow_dissect(skb, &proto, &offset);
                                else
                                        dsa_tag_generic_flow_dissect(skb,
                                                                     &proto,
                                                                     &offset);
                                /* Step over the tag: shrink hlen and advance
                                 * nhoff by the offset reported above.
                                 */
                                hlen -= offset;
                                nhoff += offset;
                        }
                }
#endif
        }

        /* It is ensured by skb_flow_dissector_init() that control key will
         * be always present.
         */
        key_control = skb_flow_dissector_target(flow_dissector,
                                                FLOW_DISSECTOR_KEY_CONTROL,
                                                target_container);

        /* It is ensured by skb_flow_dissector_init() that basic key will
         * be always present.
         */
        key_basic = skb_flow_dissector_target(flow_dissector,
                                              FLOW_DISSECTOR_KEY_BASIC,
                                              target_container);

        rcu_read_lock();

        /* Derive the namespace from the skb only if the caller gave none:
         * the device is tried first, then the socket.
         */
        if (skb) {
                if (!net) {
                        if (skb->dev)
                                net = dev_net_rcu(skb->dev);
                        else if (skb->sk)
                                net = sock_net(skb->sk);
                }
        }

        DEBUG_NET_WARN_ON_ONCE(!net);
        if (net) {
                enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
                struct bpf_prog_array *run_array;

                /* A program array on init_net is preferred; the one on @net
                 * is consulted only when init_net has none.
                 */
                run_array = rcu_dereference(init_net.bpf.run_array[type]);
                if (!run_array)
                        run_array = rcu_dereference(net->bpf.run_array[type]);

                if (run_array) {
                        struct bpf_flow_keys flow_keys;
                        struct bpf_flow_dissector ctx = {
                                .flow_keys = &flow_keys,
                                .data = data,
                                .data_end = data + hlen,
                        };
                        __be16 n_proto = proto;
                        struct bpf_prog *prog;
                        u32 result;

                        if (skb) {
                                ctx.skb = skb;
                                /* we can't use 'proto' in the skb case
                                 * because it might be set to skb->vlan_proto
                                 * which has been pulled from the data
                                 */
                                n_proto = skb->protocol;
                        }

                        /* Only the first program of the array is run */
                        prog = READ_ONCE(run_array->items[0].prog);
                        result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
                                                  hlen, flags);
                        /* Any result other than CONTINUE ends dissection
                         * here with the BPF-produced keys; CONTINUE falls
                         * through to the built-in dissector below.
                         */
                        if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
                                __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
                                                         target_container);
                                rcu_read_unlock();
                                return result == BPF_OK;
                        }
                }
        }

        rcu_read_unlock();

        /* The Ethernet addresses key is copied straight from eth_hdr(skb) */
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
                struct ethhdr *eth = eth_hdr(skb);
                struct flow_dissector_key_eth_addrs *key_eth_addrs;

                key_eth_addrs = skb_flow_dissector_target(flow_dissector,
                                                          FLOW_DISSECTOR_KEY_ETH_ADDRS,
                                                          target_container);
                memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
        }

        /* Start the VLAN counter at zero; the VLAN case below increments it */
        if (dissector_uses_key(flow_dissector,
                               FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
                struct flow_dissector_key_num_of_vlans *key_num_of_vlans;

                key_num_of_vlans = skb_flow_dissector_target(flow_dissector,
                                                             FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
                                                             target_container);
                key_num_of_vlans->num_of_vlans = 0;
        }

        /* First stage: dispatch on the ethertype currently held in proto.
         * Each case sets fdret to say how to proceed and, where it consumes
         * a header, advances nhoff past it.
         */
proto_again:
        fdret = FLOW_DISSECT_RET_CONTINUE;

        switch (proto) {
        case htons(ETH_P_IP): {
                const struct iphdr *iph;
                struct iphdr _iph;

                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                /* An ihl below 5 is rejected as a bad header */
                if (!iph || iph->ihl < 5) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* ihl counts 4-byte units */
                nhoff += iph->ihl * 4;

                ip_proto = iph->protocol;

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                                                              target_container);

                        memcpy(&key_addrs->v4addrs.src, &iph->saddr,
                               sizeof(key_addrs->v4addrs.src));
                        memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
                               sizeof(key_addrs->v4addrs.dst));
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                }

                __skb_flow_dissect_ipv4(skb, flow_dissector,
                                        target_container, data, iph);

                if (ip_is_fragment(iph)) {
                        key_control->flags |= FLOW_DIS_IS_FRAGMENT;

                        /* Non-zero fragment offset: stop here. A first
                         * fragment is parsed further only when the caller
                         * set FLOW_DISSECTOR_F_PARSE_1ST_FRAG.
                         */
                        if (iph->frag_off & htons(IP_OFFSET)) {
                                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                break;
                        } else {
                                key_control->flags |= FLOW_DIS_FIRST_FRAG;
                                if (!(flags &
                                      FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) {
                                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                        break;
                                }
                        }
                }

                break;
        }
        case htons(ETH_P_IPV6): {
                const struct ipv6hdr *iph;
                struct ipv6hdr _iph;

                iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
                if (!iph) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                ip_proto = iph->nexthdr;
                nhoff += sizeof(struct ipv6hdr);

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                                                              target_container);

                        memcpy(&key_addrs->v6addrs.src, &iph->saddr,
                               sizeof(key_addrs->v6addrs.src));
                        memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
                               sizeof(key_addrs->v6addrs.dst));
                        key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                }

                /* Only a non-zero flow label is recorded, and only a
                 * non-zero label can trigger the STOP_AT_FLOW_LABEL exit.
                 */
                if ((dissector_uses_key(flow_dissector,
                                        FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
                     (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
                    ip6_flowlabel(iph)) {
                        __be32 flow_label = ip6_flowlabel(iph);

                        if (dissector_uses_key(flow_dissector,
                                               FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
                                key_tags = skb_flow_dissector_target(flow_dissector,
                                                                     FLOW_DISSECTOR_KEY_FLOW_LABEL,
                                                                     target_container);
                                key_tags->flow_label = ntohl(flow_label);
                        }
                        if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) {
                                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                                break;
                        }
                }

                __skb_flow_dissect_ipv6(skb, flow_dissector,
                                        target_container, data, iph);

                break;
        }
        case htons(ETH_P_8021AD):
        case htons(ETH_P_8021Q): {
                const struct vlan_hdr *vlan = NULL;
                struct vlan_hdr _vlan;
                __be16 saved_vlan_tpid = proto;

                /* For the first tag, when the skb carries a VLAN tag in its
                 * own fields, nothing is read from the packet and nhoff is
                 * left alone; vlan stays NULL to mark this case. Otherwise
                 * the tag is read from the packet at nhoff.
                 */
                if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX &&
                    skb && skb_vlan_tag_present(skb)) {
                        proto = skb->protocol;
                } else {
                        vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan),
                                                    data, hlen, &_vlan);
                        if (!vlan) {
                                fdret = FLOW_DISSECT_RET_OUT_BAD;
                                break;
                        }

                        proto = vlan->h_vlan_encapsulated_proto;
                        nhoff += sizeof(*vlan);
                }

                /* Tags seen after FLOW_DIS_ENCAPSULATION is set are not
                 * counted.
                 */
                if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
                    !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
                        struct flow_dissector_key_num_of_vlans *key_nvs;

                        key_nvs = skb_flow_dissector_target(flow_dissector,
                                                            FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
                                                            target_container);
                        key_nvs->num_of_vlans++;
                }

                /* First tag goes to the VLAN key, second to the CVLAN key;
                 * any further tag is skipped without being recorded.
                 */
                if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
                        dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
                } else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
                        dissector_vlan = FLOW_DISSECTOR_KEY_CVLAN;
                } else {
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                        break;
                }

                if (dissector_uses_key(flow_dissector, dissector_vlan)) {
                        key_vlan = skb_flow_dissector_target(flow_dissector,
                                                             dissector_vlan,
                                                             target_container);

                        if (!vlan) {
                                key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
                                key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
                        } else {
                                key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
                                        VLAN_VID_MASK;
                                key_vlan->vlan_priority =
                                        (ntohs(vlan->h_vlan_TCI) &
                                         VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
                        }
                        key_vlan->vlan_tpid = saved_vlan_tpid;
                        key_vlan->vlan_eth_type = proto;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;
        }
        case htons(ETH_P_PPP_SES): {
                struct {
                        struct pppoe_hdr hdr;
                        __be16 proto;
                } *hdr, _hdr;
                u16 ppp_proto;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* least significant bit of the most significant octet
                 * indicates if protocol field was compressed
                 */
                ppp_proto = ntohs(hdr->proto);
                if (ppp_proto & 0x0100) {
                        /* Compressed: protocol is one byte, so the header
                         * is one byte shorter.
                         */
                        ppp_proto = ppp_proto >> 8;
                        nhoff += PPPOE_SES_HLEN - 1;
                } else {
                        nhoff += PPPOE_SES_HLEN;
                }

                /* Map the PPP protocol to an ethertype and go round again
                 * where one is handled here; other valid PPP protocols end
                 * dissection successfully, invalid ones fail it.
                 */
                if (ppp_proto == PPP_IP) {
                        proto = htons(ETH_P_IP);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_IPV6) {
                        proto = htons(ETH_P_IPV6);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_MPLS_UC) {
                        proto = htons(ETH_P_MPLS_UC);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto == PPP_MPLS_MC) {
                        proto = htons(ETH_P_MPLS_MC);
                        fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                } else if (ppp_proto_is_valid(ppp_proto)) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                } else {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_PPPOE)) {
                        struct flow_dissector_key_pppoe *key_pppoe;

                        key_pppoe = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_PPPOE,
                                                              target_container);
                        key_pppoe->session_id = hdr->hdr.sid;
                        key_pppoe->ppp_proto = htons(ppp_proto);
                        key_pppoe->type = htons(ETH_P_PPP_SES);
                }
                break;
        }
        case htons(ETH_P_TIPC): {
                struct tipc_basic_hdr *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr),
                                           data, hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                if (dissector_uses_key(flow_dissector,
                                       FLOW_DISSECTOR_KEY_TIPC)) {
                        key_addrs = skb_flow_dissector_target(flow_dissector,
                                                              FLOW_DISSECTOR_KEY_TIPC,
                                                              target_container);
                        key_addrs->tipckey.key = tipc_hdr_rps_key(hdr);
                        key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC;
                }
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }

        case htons(ETH_P_MPLS_UC):
        case htons(ETH_P_MPLS_MC):
                /* One label stack entry per pass: nhoff and the entry index
                 * advance regardless of what the helper returned.
                 */
                fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
                                                target_container, data,
                                                nhoff, hlen, mpls_lse,
                                                &mpls_el);
                nhoff += sizeof(struct mpls_label);
                mpls_lse++;
                break;
        case htons(ETH_P_FCOE):
                /* Only checks that a full FCoE header fits, then skips it */
                if ((hlen - nhoff) < FCOE_HEADER_LEN) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += FCOE_HEADER_LEN;
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;

        case htons(ETH_P_ARP):
        case htons(ETH_P_RARP):
                fdret = __skb_flow_dissect_arp(skb, flow_dissector,
                                               target_container, data,
                                               nhoff, hlen);
                break;

        case htons(ETH_P_BATMAN):
                fdret = __skb_flow_dissect_batadv(skb, key_control, data,
                                                  &proto, &nhoff, hlen, flags);
                break;

        case htons(ETH_P_1588): {
                struct ptp_header *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
                                           hlen, &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                nhoff += sizeof(struct ptp_header);
                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }

        case htons(ETH_P_PRP):
        case htons(ETH_P_HSR): {
                struct hsr_tag *hdr, _hdr;

                hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
                                           &_hdr);
                if (!hdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* Continue with the protocol carried inside the tag */
                proto = hdr->encap_proto;
                nhoff += HSR_HLEN;
                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;
        }

        case htons(ETH_P_CFM):
                fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
                                               target_container, data,
                                               nhoff, hlen);
                break;

        default:
                /* Ethertypes not handled above fail the dissection */
                fdret = FLOW_DISSECT_RET_OUT_BAD;
                break;
        }

        /* Process result of proto processing */
        switch (fdret) {
        case FLOW_DISSECT_RET_OUT_GOOD:
                goto out_good;
        case FLOW_DISSECT_RET_PROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto proto_again;
                /* Header limit reached: stop, but report success */
                goto out_good;
        case FLOW_DISSECT_RET_CONTINUE:
        case FLOW_DISSECT_RET_IPPROTO_AGAIN:
                break;
        case FLOW_DISSECT_RET_OUT_BAD:
        default:
                goto out_bad;
        }

        /* Second stage: dispatch on the IP protocol / IPv6 next-header
         * value currently held in ip_proto.
         */
ip_proto_again:
        fdret = FLOW_DISSECT_RET_CONTINUE;

        switch (ip_proto) {
        case IPPROTO_GRE:
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
                                               target_container, data,
                                               &proto, &nhoff, &hlen, flags);
                break;

        case NEXTHDR_HOP:
        case NEXTHDR_ROUTING:
        case NEXTHDR_DEST: {
                u8 _opthdr[2], *opthdr;

                /* These values are treated as extension headers only when
                 * the enclosing protocol is IPv6.
                 */
                if (proto != htons(ETH_P_IPV6))
                        break;

                opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr),
                                              data, hlen, &_opthdr);
                if (!opthdr) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                /* Byte 0 is the next header; byte 1 gives the length as
                 * (value + 1) units of 8 bytes.
                 */
                ip_proto = opthdr[0];
                nhoff += (opthdr[1] + 1) << 3;

                fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
                break;
        }
        case NEXTHDR_FRAGMENT: {
                struct frag_hdr _fh, *fh;

                if (proto != htons(ETH_P_IPV6))
                        break;

                fh = __skb_header_pointer(skb, nhoff, sizeof(_fh),
                                          data, hlen, &_fh);

                if (!fh) {
                        fdret = FLOW_DISSECT_RET_OUT_BAD;
                        break;
                }

                key_control->flags |= FLOW_DIS_IS_FRAGMENT;

                nhoff += sizeof(_fh);
                ip_proto = fh->nexthdr;

                /* Parsing continues past the fragment header only for a
                 * first fragment and only with
                 * FLOW_DISSECTOR_F_PARSE_1ST_FRAG set.
                 */
                if (!(fh->frag_off & htons(IP6_OFFSET))) {
                        key_control->flags |= FLOW_DIS_FIRST_FRAG;
                        if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
                                fdret = FLOW_DISSECT_RET_IPPROTO_AGAIN;
                                break;
                        }
                }

                fdret = FLOW_DISSECT_RET_OUT_GOOD;
                break;
        }
        case IPPROTO_IPIP:
                /* STOP_BEFORE_ENCAP exits with proto and flags untouched;
                 * STOP_AT_ENCAP exits after switching proto to the inner
                 * protocol and setting FLOW_DIS_ENCAPSULATION.
                 */
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                proto = htons(ETH_P_IP);

                key_control->flags |= FLOW_DIS_ENCAPSULATION;
                if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;

        case IPPROTO_IPV6:
                /* Same handling as IPPROTO_IPIP, with an IPv6 inner header */
                if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                proto = htons(ETH_P_IPV6);

                key_control->flags |= FLOW_DIS_ENCAPSULATION;
                if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) {
                        fdret = FLOW_DISSECT_RET_OUT_GOOD;
                        break;
                }

                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;


        case IPPROTO_MPLS:
                proto = htons(ETH_P_MPLS_UC);
                fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
                break;

        /* The helpers below return nothing, so fdret stays at
         * FLOW_DISSECT_RET_CONTINUE for these protocols.
         */
        case IPPROTO_TCP:
                __skb_flow_dissect_tcp(skb, flow_dissector, target_container,
                                       data, nhoff, hlen);
                break;

        case IPPROTO_ICMP:
        case IPPROTO_ICMPV6:
                __skb_flow_dissect_icmp(skb, flow_dissector, target_container,
                                        data, nhoff, hlen);
                break;
        case IPPROTO_L2TP:
                __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
                                          data, nhoff, hlen);
                break;
        case IPPROTO_ESP:
                __skb_flow_dissect_esp(skb, flow_dissector, target_container,
                                       data, nhoff, hlen);
                break;
        case IPPROTO_AH:
                __skb_flow_dissect_ah(skb, flow_dissector, target_container,
                                      data, nhoff, hlen);
                break;
        default:
                break;
        }

        /* Ports are looked up for every packet not flagged as a fragment,
         * whatever the switch above decided.
         */
        if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT))
                __skb_flow_dissect_ports(skb, flow_dissector, target_container,
                                         data, nhoff, ip_proto, hlen);

        /* Process result of IP proto processing */
        switch (fdret) {
        case FLOW_DISSECT_RET_PROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto proto_again;
                /* Header limit reached: fall out to out_good */
                break;
        case FLOW_DISSECT_RET_IPPROTO_AGAIN:
                if (skb_flow_dissect_allowed(&num_hdrs))
                        goto ip_proto_again;
                /* Header limit reached: fall out to out_good */
                break;
        case FLOW_DISSECT_RET_OUT_GOOD:
        case FLOW_DISSECT_RET_CONTINUE:
                break;
        case FLOW_DISSECT_RET_OUT_BAD:
        default:
                goto out_bad;
        }

out_good:
        ret = true;

        /* Common exit for success and failure: the control and basic keys
         * always receive the final offset and protocol values. thoff is
         * the smaller of nhoff and the packet length (skb->len when an skb
         * is present, otherwise hlen), both compared as u16.
         */
out:
        key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen);
        key_basic->n_proto = proto;
        key_basic->ip_proto = ip_proto;

        return ret;

out_bad:
        ret = false;
        goto out;
}
EXPORT_SYMBOL(__skb_flow_dissect);

/* File-wide siphash key used by the default (unseeded) flow hash helpers. */
static siphash_aligned_key_t hashrnd;
/* Seed hashrnd through net_get_random_once().  Every user of &hashrnd in
 * this file calls this first.
 * NOTE(review): relies on net_get_random_once() only filling the key on its
 * first invocation - confirm against its definition.
 */
static __always_inline void __flow_hash_secret_init(void)
{
        net_get_random_once(&hashrnd, sizeof(hashrnd));
}

/* Return the address of the first member of @flow that is fed to the hash
 * (FLOW_KEYS_HASH_START_FIELD).  The build fails if FLOW_KEYS_HASH_OFFSET is
 * not a multiple of SIPHASH_ALIGNMENT.
 */
static const void *flow_keys_hash_start(const struct flow_keys *flow)
{
        BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
        return &flow->FLOW_KEYS_HASH_START_FIELD;
}

/* Number of bytes of @flow that take part in the hash.
 *
 * The result is sizeof(*flow) minus FLOW_KEYS_HASH_OFFSET minus whatever
 * part of flow->addrs is not covered by the address variant selected via
 * control.addr_type.  For an addr_type other than IPv4, IPv6 or TIPC the
 * whole addrs member is excluded.
 */
static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
{
        size_t addr_bytes;

        BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));

        /* How much of the addrs member is actually in use? */
        switch (flow->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                addr_bytes = sizeof(flow->addrs.v4addrs);
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                addr_bytes = sizeof(flow->addrs.v6addrs);
                break;
        case FLOW_DISSECTOR_KEY_TIPC:
                addr_bytes = sizeof(flow->addrs.tipckey);
                break;
        default:
                addr_bytes = 0;
                break;
        }

        return sizeof(*flow) - FLOW_KEYS_HASH_OFFSET -
               (sizeof(flow->addrs) - addr_bytes);
}

/* Reduce the source address stored in @flow to a single 32-bit value:
 * the IPv4 source as-is, ipv6_addr_hash() of the IPv6 source, or the TIPC
 * key.  Any other address type yields 0.
 */
__be32 flow_get_u32_src(const struct flow_keys *flow)
{
        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS)
                return flow->addrs.v4addrs.src;

        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS)
                return (__force __be32)ipv6_addr_hash(&flow->addrs.v6addrs.src);

        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_TIPC)
                return flow->addrs.tipckey.key;

        return 0;
}
EXPORT_SYMBOL(flow_get_u32_src);

/* Reduce the destination address stored in @flow to a single 32-bit value:
 * the IPv4 destination as-is, or ipv6_addr_hash() of the IPv6 destination.
 * Any other address type (including TIPC) yields 0.
 */
__be32 flow_get_u32_dst(const struct flow_keys *flow)
{
        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS)
                return flow->addrs.v4addrs.dst;

        if (flow->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS)
                return (__force __be32)ipv6_addr_hash(&flow->addrs.v6addrs.dst);

        return 0;
}
EXPORT_SYMBOL(flow_get_u32_dst);

/* Put @keys into a direction-independent form so that both directions of
 * a flow hash to the same value: after this call src <= dst holds for the
 * address pair and, separately, for the port pair.
 *
 * Only IPv4 and IPv6 keys are touched; for every other address type the
 * keys (ports included) are left exactly as they were.
 */
static inline void __flow_hash_consistentify(struct flow_keys *keys)
{
        int word;

        switch (keys->control.addr_type) {
        case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
                if ((__force u32)keys->addrs.v4addrs.src >
                    (__force u32)keys->addrs.v4addrs.dst)
                        swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
                break;
        case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
                /* Byte-wise comparison; exchange the addresses one 32-bit
                 * word at a time when dst sorts before src.
                 */
                if (memcmp(&keys->addrs.v6addrs.dst,
                           &keys->addrs.v6addrs.src,
                           sizeof(keys->addrs.v6addrs.dst)) < 0) {
                        for (word = 0; word < 4; word++)
                                swap(keys->addrs.v6addrs.src.s6_addr32[word],
                                     keys->addrs.v6addrs.dst.s6_addr32[word]);
                }
                break;
        default:
                /* Not an IP flow: nothing is reordered, not even the ports. */
                return;
        }

        /* Ports are ordered independently of the addresses. */
        if ((__force u16)keys->ports.src > (__force u16)keys->ports.dst)
                swap(keys->ports.src, keys->ports.dst);
}

/* Hash @keys with siphash keyed by @keyval.
 *
 * @keys is modified: it is first brought into canonical (direction
 * independent) order.  The returned value is never 0; a computed 0 is
 * replaced by 1.
 */
static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
                                        const siphash_key_t *keyval)
{
        u32 folded;

        /* Must happen before hashing: the hashed bytes are the sorted ones. */
        __flow_hash_consistentify(keys);

        /* Stored in a u32, so a wider siphash() result is truncated here. */
        folded = siphash(flow_keys_hash_start(keys),
                         flow_keys_hash_length(keys), keyval);

        return folded ? folded : 1;
}

/* Hash @keys with the file-wide key hashrnd, seeding that key first.
 * @keys is modified by __flow_hash_from_keys() (canonical ordering) and the
 * result is never 0.
 */
u32 flow_hash_from_keys(struct flow_keys *keys)
{
        __flow_hash_secret_init();
        return __flow_hash_from_keys(keys, &hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);

/* Same as flow_hash_from_keys(), but hashes with the caller-supplied siphash
 * key @keyval instead of hashrnd.  @keys is modified (canonical ordering).
 */
u32 flow_hash_from_keys_seed(struct flow_keys *keys,
                             const siphash_key_t *keyval)
{
        return __flow_hash_from_keys(keys, keyval);
}
EXPORT_SYMBOL(flow_hash_from_keys_seed);

/* Dissect @skb into @keys, passing FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL to
 * the dissector, then hash the resulting keys with @keyval.
 * NOTE(review): skb_get_hash_perturb() passes an uninitialized @keys, so
 * skb_flow_dissect_flow_keys() is presumably responsible for clearing it -
 * confirm against its definition.
 */
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
                                  struct flow_keys *keys,
                                  const siphash_key_t *keyval)
{
        skb_flow_dissect_flow_keys(skb, keys,
                                   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

        return __flow_hash_from_keys(keys, keyval);
}

/* Layout written into the start of a struct flow_keys_digest by
 * make_flow_keys_digest().
 */
struct _flow_keys_digest_data {
        __be16        n_proto;        /* copied from flow->basic.n_proto */
        u8        ip_proto;        /* copied from flow->basic.ip_proto */
        u8        padding;        /* never assigned; left 0 by the memset */
        __be32        ports;        /* copied from flow->ports.ports */
        __be32        src;        /* copied from flow->addrs.v4addrs.src */
        __be32        dst;        /* copied from flow->addrs.v4addrs.dst */
};

void make_flow_keys_digest(struct flow_keys_digest *digest,
                           const struct flow_keys *flow)
{
        struct _flow_keys_digest_data *data =
            (struct _flow_keys_digest_data *)digest;

        BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));

        memset(digest, 0, sizeof(*digest));

        data->n_proto = flow->basic.n_proto;
        data->ip_proto = flow->basic.ip_proto;
        data->ports = flow->ports.ports;
        data->src = flow->addrs.v4addrs.src;
        data->dst = flow->addrs.v4addrs.dst;
}
EXPORT_SYMBOL(make_flow_keys_digest);

/* Dissector built from flow_keys_dissector_symmetric_keys by
 * init_default_flow_dissectors().
 */
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;

/* Compute a flow hash for @skb using the symmetric dissector and the
 * file-wide key hashrnd.  Unlike __skb_get_hash_net() the result is only
 * returned; nothing is stored in @skb (it is const here).
 */
u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
{
        struct flow_keys keys;

        __flow_hash_secret_init();

        /* Zero the keys up front: the hash covers raw bytes of the struct. */
        memset(&keys, 0, sizeof(keys));
        __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
                           &keys, NULL, 0, 0, 0, 0);

        return __flow_hash_from_keys(&keys, &hashrnd);
}
EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);

/**
 * __skb_get_hash_net - calculate a flow hash
 * @net: associated network namespace, derived from @skb if NULL
 * @skb: sk_buff to calculate flow hash from
 *
 * This function calculates a flow hash based on src/dst addresses
 * and src/dst port numbers.  Sets hash in skb to non-zero hash value
 * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
 * if hash is a canonical 4-tuple hash over transport ports.
 *
 * NOTE(review): the return value of __skb_flow_dissect() is not checked
 * here and __flow_hash_from_keys() never returns 0, so as written this
 * function always stores a non-zero hash.
 */
void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
        struct flow_keys keys;
        u32 hash;

        /* Zero the keys up front: the hash covers raw bytes of the struct. */
        memset(&keys, 0, sizeof(keys));

        __skb_flow_dissect(net, skb, &flow_keys_dissector,
                           &keys, NULL, 0, 0, 0,
                           FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);

        __flow_hash_secret_init();

        hash = __flow_hash_from_keys(&keys, &hashrnd);

        /* flow_keys_have_l4() supplies the l4 indication stored with the hash. */
        __skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
EXPORT_SYMBOL(__skb_get_hash_net);

/* Compute a flow hash for @skb keyed by the caller-supplied siphash key
 * @perturb instead of the file-wide hashrnd.  The result is returned only;
 * @skb is not modified.
 */
__u32 skb_get_hash_perturb(const struct sk_buff *skb,
                           const siphash_key_t *perturb)
{
        /* Scratch keys; filled in by ___skb_get_hash(). */
        struct flow_keys keys;

        return ___skb_get_hash(skb, &keys, perturb);
}
EXPORT_SYMBOL(skb_get_hash_perturb);

/* Compute the payload offset from already dissected @keys.
 *
 * Starts from keys->control.thoff and adds the transport header length for
 * the protocols known below.  The offset is returned unchanged for
 * non-first fragments, for unknown protocols, and for TCP when the data
 * offset byte cannot be read.
 *
 * @skb, @data, @hlen: handed to __skb_header_pointer() to read the TCP
 * data offset byte.
 */
u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen)
{
        bool later_fragment = (keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
                              !(keys->control.flags & FLOW_DIS_FIRST_FRAG);
        u32 poff = keys->control.thoff;
        u32 l4_len = 0;

        /* Fragments after the first do not carry the L4 header. */
        if (later_fragment)
                return poff;

        switch (keys->basic.ip_proto) {
        case IPPROTO_TCP: {
                /* Read the data offset as a single byte (at offset 12 of
                 * the TCP header) to avoid an unaligned access.
                 */
                u8 _doff;
                const u8 *doff = __skb_header_pointer(skb, poff + 12,
                                                      sizeof(_doff), data,
                                                      hlen, &_doff);

                if (!doff)
                        return poff;

                /* Upper nibble counts 32-bit words; never go below the
                 * fixed header size.
                 */
                l4_len = max_t(u32, sizeof(struct tcphdr),
                               (*doff & 0xF0) >> 2);
                break;
        }
        case IPPROTO_UDP:
        case IPPROTO_UDPLITE:
                l4_len = sizeof(struct udphdr);
                break;
        /* The remaining protocols only account for their fixed header;
         * header extensions are not considered.
         */
        case IPPROTO_ICMP:
                l4_len = sizeof(struct icmphdr);
                break;
        case IPPROTO_ICMPV6:
                l4_len = sizeof(struct icmp6hdr);
                break;
        case IPPROTO_IGMP:
                l4_len = sizeof(struct igmphdr);
                break;
        case IPPROTO_DCCP:
                l4_len = sizeof(struct dccp_hdr);
                break;
        case IPPROTO_SCTP:
                l4_len = sizeof(struct sctphdr);
                break;
        default:
                break;
        }

        return poff + l4_len;
}

/**
 * skb_get_poff - get the offset to the payload
 * @skb: sk_buff to get the payload offset from
 *
 * Dissects @skb into basic flow keys and lets __skb_get_poff() work out how
 * far the headers reach, as far as they could be dissected.  The main user
 * is currently BPF, which uses this to truncate packets dynamically so that
 * only headers, not payload, need to be pushed to user space.
 *
 * Return: the payload offset, or 0 if dissection failed.
 */
u32 skb_get_poff(const struct sk_buff *skb)
{
        struct flow_keys_basic keys;

        if (skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
                                             NULL, 0, 0, 0, 0))
                return __skb_get_poff(skb, skb->data, &keys,
                                      skb_headlen(skb));

        return 0;
}

/* Build flow keys from the IPv6 flow description @fl6 and hash them.
 *
 * @keys is caller-provided storage; it is overwritten completely and, as a
 * side effect of flow_hash_from_keys(), left in canonical (sorted) order.
 */
__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys)
{
        /* Anything not set below must hash as zero. */
        memset(keys, 0, sizeof(*keys));

        /* L3: IPv6 address pair. */
        keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
        memcpy(&keys->addrs.v6addrs.src, &fl6->saddr,
               sizeof(keys->addrs.v6addrs.src));
        memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr,
               sizeof(keys->addrs.v6addrs.dst));

        /* L4: protocol and ports. */
        keys->basic.ip_proto = fl6->flowi6_proto;
        keys->ports.src = fl6->fl6_sport;
        keys->ports.dst = fl6->fl6_dport;

        /* Extra discriminators: flow label and GRE key. */
        keys->tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
        keys->keyid.keyid = fl6->fl6_gre_key;

        return flow_hash_from_keys(keys);
}
EXPORT_SYMBOL(__get_hash_from_flowi6);

/* Key table for flow_keys_dissector: each entry pairs a dissector key id
 * with the offset of the struct flow_keys member that receives it.  The
 * three address keys all point into the shared addrs member.
 */
static const struct flow_dissector_key flow_keys_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_TIPC,
                .offset = offsetof(struct flow_keys, addrs.tipckey),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_VLAN,
                .offset = offsetof(struct flow_keys, vlan),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
                .offset = offsetof(struct flow_keys, tags),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
                .offset = offsetof(struct flow_keys, keyid),
        },
};

/* Key table for flow_keys_dissector_symmetric: control, basic, IPv4/IPv6
 * addresses and ports only.  Compared with flow_keys_dissector_keys it has
 * no TIPC, VLAN, flow label or GRE key id entries.
 */
static const struct flow_dissector_key flow_keys_dissector_symmetric_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
};

/* Key table for flow_keys_basic_dissector: control and basic keys only.
 * NOTE(review): offsets are taken from struct flow_keys, not struct
 * flow_keys_basic - presumably both structs start with the same two
 * members; confirm against their definitions.
 */
static const struct flow_dissector_key flow_keys_basic_dissector_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
};

/* Exported dissector set up from flow_keys_dissector_keys by
 * init_default_flow_dissectors(); used by __skb_get_hash_net().
 */
struct flow_dissector flow_keys_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_dissector);

/* Exported dissector set up from flow_keys_basic_dissector_keys by
 * init_default_flow_dissectors().
 */
struct flow_dissector flow_keys_basic_dissector __read_mostly;
EXPORT_SYMBOL(flow_keys_basic_dissector);

/* Initialize the three built-in dissectors (full, symmetric, basic), each
 * from its key table defined above.  Always returns 0.
 */
static int __init init_default_flow_dissectors(void)
{
        skb_flow_dissector_init(&flow_keys_dissector,
                                flow_keys_dissector_keys,
                                ARRAY_SIZE(flow_keys_dissector_keys));
        skb_flow_dissector_init(&flow_keys_dissector_symmetric,
                                flow_keys_dissector_symmetric_keys,
                                ARRAY_SIZE(flow_keys_dissector_symmetric_keys));
        skb_flow_dissector_init(&flow_keys_basic_dissector,
                                flow_keys_basic_dissector_keys,
                                ARRAY_SIZE(flow_keys_basic_dissector_keys));
        return 0;
}
core_initcall(init_default_flow_dissectors);
































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 

    3 


    3 





















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
// SPDX-License-Identifier: GPL-2.0

/* net/sched/sch_taprio.c         Time Aware Priority Scheduler
 *
 * Authors:        Vinicius Costa Gomes <vinicius.gomes@intel.com>
 *
 */

#include <linux/ethtool.h>
#include <linux/ethtool_netlink.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/math64.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/time.h>
#include <net/gso.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/sch_generic.h>
#include <net/sock.h>
#include <net/tcp.h>

#define TAPRIO_STAT_NOT_SET        (~0ULL)

#include "sch_mqprio_lib.h"

/* File-wide list head; struct taprio_sched has a taprio_list member,
 * presumably the link into this list (TODO confirm at the list_add site).
 */
static LIST_HEAD(taprio_list);
/* Static keys; their users are outside this view. */
static struct static_key_false taprio_have_broken_mqprio;
static struct static_key_false taprio_have_working_mqprio;

/* Special value meaning that every gate is open. */
#define TAPRIO_ALL_GATES_OPEN -1

/* Non-zero when the respective TCA_TAPRIO_ATTR_FLAG_* bit is set in @flags. */
#define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST)
#define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
/* Mask of all flag bits this file knows about. */
#define TAPRIO_SUPPORTED_FLAGS \
        (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)
/* Sentinel flags value; presumably marks flags as not (validly) set. */
#define TAPRIO_FLAGS_INVALID U32_MAX

/* One entry of a gate control list (GCL); entries are chained through
 * @list on sched_gate_list.entries.
 */
struct sched_entry {
        /* Durations between this GCL entry and the GCL entry where the
         * respective traffic class gate closes
         * (filled in by taprio_calculate_gate_durations()).
         */
        u64 gate_duration[TC_MAX_QUEUE];
        atomic_t budget[TC_MAX_QUEUE];
        /* The qdisc makes some effort so that no packet leaves
         * after this time
         */
        ktime_t gate_close_time[TC_MAX_QUEUE];
        /* Link in sched_gate_list.entries */
        struct list_head list;
        /* Used to calculate when to advance the schedule */
        ktime_t end_time;
        ktime_t next_txtime;
        int index;
        /* Bit tc set: the gate of traffic class tc is open in this entry */
        u32 gate_mask;
        /* Length of this entry; summed into gate_duration[] */
        u32 interval;
        u8 command;
};

/* A complete schedule: the list of sched_entry items plus data derived
 * from it.
 */
struct sched_gate_list {
        /* Longest non-zero contiguous gate durations per traffic class,
         * or 0 if a traffic class gate never opens during the schedule.
         * Computed by taprio_calculate_gate_durations().
         */
        u64 max_open_gate_duration[TC_MAX_QUEUE];
        u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */
        u32 max_sdu[TC_MAX_QUEUE]; /* for dump */
        struct rcu_head rcu;
        /* List of struct sched_entry, linked through sched_entry.list */
        struct list_head entries;
        size_t num_entries;
        ktime_t cycle_end_time;
        s64 cycle_time;
        s64 cycle_time_extension;
        s64 base_time;
};

/* Per-qdisc private state of a taprio instance. */
struct taprio_sched {
        struct Qdisc **qdiscs;
        /* Qdisc whose net_device is looked up via qdisc_dev(q->root) */
        struct Qdisc *root;
        /* NOTE(review): presumably TCA_TAPRIO_ATTR_FLAG_* bits - confirm */
        u32 flags;
        enum tk_offsets tk_offset;
        int clockid;
        bool offloaded;
        bool detected_mqprio;
        bool broken_mqprio;
        atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+
                                    * speeds it's sub-nanoseconds per byte
                                    */

        /* Protects the update side of the RCU protected current_entry */
        spinlock_t current_entry_lock;
        struct sched_entry __rcu *current_entry;
        /* Two RCU-managed schedules; going by the names, the operational
         * one and the pending administrative one (TODO confirm).
         */
        struct sched_gate_list __rcu *oper_sched;
        struct sched_gate_list __rcu *admin_sched;
        struct hrtimer advance_timer;
        struct list_head taprio_list;
        int cur_txq[TC_MAX_QUEUE];
        u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */
        u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */
        u32 txtime_delay;
};

/* Wrapper that pairs a struct tc_taprio_qopt_offload with a reference
 * count.
 */
struct __tc_taprio_qopt_offload {
        refcount_t users;        /* reference count for this wrapper */
        struct tc_taprio_qopt_offload offload;
};

/* For every entry of @sched and every traffic class whose gate is open in
 * that entry, accumulate into entry->gate_duration[tc] the time the gate
 * stays continuously open (walking the entry list circularly), and record
 * the per-TC maximum in sched->max_open_gate_duration[].
 */
static void taprio_calculate_gate_durations(struct taprio_sched *q,
                                            struct sched_gate_list *sched)
{
        struct net_device *dev = qdisc_dev(q->root);
        int num_tc = netdev_get_num_tc(dev);
        struct sched_entry *entry, *cur;
        int tc;

        list_for_each_entry(entry, &sched->entries, list) {
                u32 open_mask = entry->gate_mask;

                /* Walk forward from this entry, one full lap at most, and
                 * stop tracking a TC at the first entry that closes it.
                 */
                cur = entry;

                while (open_mask) {
                        for (tc = 0; tc < num_tc; tc++) {
                                if (!(open_mask & BIT(tc)))
                                        continue;

                                if (!(cur->gate_mask & BIT(tc))) {
                                        open_mask &= ~BIT(tc);
                                        continue;
                                }

                                entry->gate_duration[tc] += cur->interval;
                        }

                        cur = list_next_entry_circular(cur, &sched->entries, list);
                        if (cur == entry)
                                break;
                }

                /* A zero duration means "closed in this entry" and must not
                 * take part in the maximum, so that a TC which is only
                 * temporarily closed is not mistaken for one that never
                 * opens.
                 */
                for (tc = 0; tc < num_tc; tc++) {
                        if (!entry->gate_duration[tc])
                                continue;

                        if (sched->max_open_gate_duration[tc] < entry->gate_duration[tc])
                                sched->max_open_gate_duration[tc] = entry->gate_duration[tc];
                }
        }
}

/* True when a transmission ending at @skb_end_time finishes strictly before
 * the gate of traffic class @tc closes in @entry.
 */
static bool taprio_entry_allows_tx(ktime_t skb_end_time,
                                   struct sched_entry *entry, int tc)
{
        ktime_t close_time = entry->gate_close_time[tc];

        return ktime_before(skb_end_time, close_time);
}

/* Base time of @sched as a ktime_t, or KTIME_MAX when there is no schedule. */
static ktime_t sched_base_time(const struct sched_gate_list *sched)
{
        return sched ? ns_to_ktime(sched->base_time) : KTIME_MAX;
}

/* Convert a CLOCK_MONOTONIC timestamp into the clock selected for this
 * qdisc. A tk_offset of TK_OFFS_MAX means no conversion is applied.
 */
static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono)
{
        /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */
        enum tk_offsets tk_offset = READ_ONCE(q->tk_offset);

        if (tk_offset == TK_OFFS_MAX)
                return mono;

        return ktime_mono_to_any(mono, tk_offset);
}

/* Current time expressed in the clock selected for this qdisc. */
static ktime_t taprio_get_time(const struct taprio_sched *q)
{
        ktime_t mono_now = ktime_get();

        return taprio_mono_to_any(q, mono_now);
}

/* RCU callback: free every entry of the schedule embedding @head, then the
 * schedule itself.
 */
static void taprio_free_sched_cb(struct rcu_head *head)
{
        struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu);
        struct sched_entry *pos, *tmp;

        /* _safe variant: each entry is unlinked and freed while iterating */
        list_for_each_entry_safe(pos, tmp, &sched->entries, list) {
                list_del(&pos->list);
                kfree(pos);
        }

        kfree(sched);
}

/* Promote the admin schedule to operational: publish *admin as
 * q->oper_sched, clear q->admin_sched, and queue the previous operational
 * schedule (if any) for freeing after an RCU grace period. The caller's
 * local pointers are updated to match (*oper = old *admin, *admin = NULL).
 */
static void switch_schedules(struct taprio_sched *q,
                             struct sched_gate_list **admin,
                             struct sched_gate_list **oper)
{
        struct sched_gate_list *new_oper = *admin;
        struct sched_gate_list *old_oper = *oper;

        rcu_assign_pointer(q->oper_sched, new_oper);
        rcu_assign_pointer(q->admin_sched, NULL);

        if (old_oper)
                call_rcu(&old_oper->rcu, taprio_free_sched_cb);

        *oper = new_oper;
        *admin = NULL;
}

/* Return how far @time is into its cycle, i.e. the remainder of
 * (time - sched->base_time) divided by sched->cycle_time.
 */
static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time)
{
        s32 into_cycle;

        div_s64_rem(ktime_sub(time, sched->base_time), sched->cycle_time,
                    &into_cycle);

        return into_cycle;
}

/* Compute when the interval of @entry that starts at @intv_start ends.
 *
 * The result is the natural interval end if that falls before the end of
 * the current cycle. Otherwise it is the cycle end, unless a different
 * @admin schedule starts strictly between the cycle end and the cycle end
 * plus cycle_time_extension, in which case it is @admin's base time.
 */
static ktime_t get_interval_end_time(struct sched_gate_list *sched,
                                     struct sched_gate_list *admin,
                                     struct sched_entry *entry,
                                     ktime_t intv_start)
{
        s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start);
        ktime_t intv_end, cycle_ext_end, cycle_end;

        intv_end = ktime_add_ns(intv_start, entry->interval);
        cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed);

        if (ktime_before(intv_end, cycle_end))
                return intv_end;

        /* No distinct pending schedule: the cycle boundary is the limit */
        if (!admin || admin == sched)
                return cycle_end;

        cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension);

        if (ktime_after(admin->base_time, cycle_end) &&
            ktime_before(admin->base_time, cycle_ext_end))
                return admin->base_time;

        return cycle_end;
}

/* Time in nanoseconds needed to transmit @len bytes at the current
 * q->picos_per_byte rate.
 */
static int length_to_duration(struct taprio_sched *q, int len)
{
        int duration;

        duration = div_u64(atomic64_read(&q->picos_per_byte) * len,
                           PSEC_PER_NSEC);

        return duration;
}

/* Number of bytes that can be transmitted in @duration nanoseconds at the
 * current q->picos_per_byte rate.
 */
static int duration_to_length(struct taprio_sched *q, u64 duration)
{
        int length;

        length = div_u64(PSEC_PER_NSEC * duration,
                         atomic64_read(&q->picos_per_byte));

        return length;
}

/* For each traffic class, store in sched->max_sdu[] and sched->max_frm_len[]
 * the smaller of the user-requested q->max_sdu[] and the limit implied by
 * the longest open gate window at the current link speed.
 *
 * "No limit" is stored as max_sdu = 0 and max_frm_len = U32_MAX.
 */
static void taprio_update_queue_max_sdu(struct taprio_sched *q,
                                        struct sched_gate_list *sched,
                                        struct qdisc_size_table *stab)
{
        struct net_device *dev = qdisc_dev(q->root);
        int num_tc = netdev_get_num_tc(dev);
        int tc;

        for (tc = 0; tc < num_tc; tc++) {
                /* A user value of 0 means "unlimited" */
                u32 max_sdu_from_user = q->max_sdu[tc] ? q->max_sdu[tc] : U32_MAX;
                /* Default: TC gate never closes => keep the queueMaxSDU
                 * selected by the user
                 */
                u32 max_sdu_dynamic = U32_MAX;
                u32 max_sdu;

                if (sched->max_open_gate_duration[tc] != sched->cycle_time) {
                        u32 max_frm_len;

                        max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]);
                        /* Compensate for L1 overhead from size table,
                         * but don't let the frame size go negative
                         */
                        if (stab) {
                                max_frm_len -= stab->szopts.overhead;
                                max_frm_len = max_t(int, max_frm_len,
                                                    dev->hard_header_len + 1);
                        }
                        max_sdu_dynamic = max_frm_len - dev->hard_header_len;
                        /* A window larger than the device MTU imposes no
                         * restriction
                         */
                        if (max_sdu_dynamic > dev->max_mtu)
                                max_sdu_dynamic = U32_MAX;
                }

                max_sdu = min(max_sdu_dynamic, max_sdu_from_user);

                if (max_sdu == U32_MAX) {
                        sched->max_frm_len[tc] = U32_MAX; /* never oversized */
                        sched->max_sdu[tc] = 0;
                } else {
                        sched->max_frm_len[tc] = max_sdu + dev->hard_header_len;
                        sched->max_sdu[tc] = max_sdu;
                }
        }
}

/* Returns the entry corresponding to next available interval. If
 * validate_interval is set, it only validates whether the timestamp occurs
 * when the gate corresponding to the skb's traffic class is open.
 *
 * @skb: packet to place; its priority selects the traffic class and its
 *       qdisc packet length determines the transmission duration.
 * @sched: schedule to search; NULL yields a NULL result.
 * @admin: pending schedule, only forwarded to get_interval_end_time().
 * @time: reference time (earliest allowed txtime, or the skb timestamp
 *        when validating).
 * @interval_start/@interval_end: set to the bounds of the chosen interval;
 *        both are zeroed on entry and stay 0 if nothing is found.
 *
 * Returns the chosen entry, or NULL if no entry of the schedule can carry
 * the packet.
 */
static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb,
                                                  struct Qdisc *sch,
                                                  struct sched_gate_list *sched,
                                                  struct sched_gate_list *admin,
                                                  ktime_t time,
                                                  ktime_t *interval_start,
                                                  ktime_t *interval_end,
                                                  bool validate_interval)
{
        ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time;
        ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time;
        struct sched_entry *entry = NULL, *entry_found = NULL;
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        bool entry_available = false;
        s32 cycle_elapsed;
        int tc, n;

        tc = netdev_get_prio_tc_map(dev, skb->priority);
        packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb));

        *interval_start = 0;
        *interval_end = 0;

        if (!sched)
                return NULL;

        /* Start the walk at the beginning of the cycle that contains @time;
         * curr_intv_end is seeded with that cycle start so the first loop
         * iteration picks it up as its interval start.
         */
        cycle = sched->cycle_time;
        cycle_elapsed = get_cycle_time_elapsed(sched, time);
        curr_intv_end = ktime_sub_ns(time, cycle_elapsed);
        cycle_end = ktime_add_ns(curr_intv_end, cycle);

        list_for_each_entry(entry, &sched->entries, list) {
                curr_intv_start = curr_intv_end;
                curr_intv_end = get_interval_end_time(sched, admin, entry,
                                                      curr_intv_start);

                if (ktime_after(curr_intv_start, cycle_end))
                        break;

                /* Skip entries whose gate is closed for this TC or that are
                 * too short to ever fit the packet.
                 */
                if (!(entry->gate_mask & BIT(tc)) ||
                    packet_transmit_time > entry->interval)
                        continue;

                txtime = entry->next_txtime;

                if (ktime_before(txtime, time) || validate_interval) {
                        transmit_end_time = ktime_add_ns(time, packet_transmit_time);
                        /* Accept either an interval already running at @time
                         * in which the packet still fits, or (when not
                         * validating) an interval that starts after @time.
                         */
                        if ((ktime_before(curr_intv_start, time) &&
                             ktime_before(transmit_end_time, curr_intv_end)) ||
                            (ktime_after(curr_intv_start, time) && !validate_interval)) {
                                entry_found = entry;
                                *interval_start = curr_intv_start;
                                *interval_end = curr_intv_end;
                                break;
                        } else if (!entry_available && !validate_interval) {
                                /* Here, we are just trying to find out the
                                 * first available interval in the next cycle.
                                 */
                                entry_available = true;
                                entry_found = entry;
                                *interval_start = ktime_add_ns(curr_intv_start, cycle);
                                *interval_end = ktime_add_ns(curr_intv_end, cycle);
                        }
                } else if (ktime_before(txtime, earliest_txtime) &&
                           !entry_available) {
                        /* The entry already has transmissions booked past
                         * @time: remember the one with the earliest
                         * next_txtime and shift its interval by the whole
                         * number of cycles separating it from that txtime.
                         */
                        earliest_txtime = txtime;
                        entry_found = entry;
                        n = div_s64(ktime_sub(txtime, curr_intv_start), cycle);
                        *interval_start = ktime_add(curr_intv_start, n * cycle);
                        *interval_end = ktime_add(curr_intv_end, n * cycle);
                }
        }

        return entry_found;
}

/* Check whether skb->tstamp falls inside an interval of the operational
 * schedule in which the gate of the skb's traffic class is open and the
 * packet fits. Returns false when there is no operational schedule.
 */
static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct sched_gate_list *oper, *admin;
        ktime_t intv_start, intv_end;
        bool found;

        rcu_read_lock();
        oper = rcu_dereference(q->oper_sched);
        admin = rcu_dereference(q->admin_sched);

        /* Only whether an entry exists matters; it is not dereferenced */
        found = find_entry_to_transmit(skb, sch, oper, admin, skb->tstamp,
                                       &intv_start, &intv_end, true);
        rcu_read_unlock();

        return found;
}

/* Return skb->skb_mstamp_ns converted to the qdisc's clock, or 0 when the
 * packet is recognised as IPv4/IPv6 (including IPv6-in-IPv4) but does not
 * carry TCP, or when its network header cannot be read.
 */
static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb)
{
        unsigned int offset = skb_network_offset(skb);
        const struct ipv6hdr *ipv6h;
        const struct iphdr *iph;
        struct ipv6hdr _ipv6h;

        /* Read enough bytes for an IPv6 header; the version field decides
         * how the bytes are interpreted.
         */
        ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
        if (!ipv6h)
                return 0;

        switch (ipv6h->version) {
        case 4:
                iph = (struct iphdr *)ipv6h;

                /* special-case 6in4 tunnelling, as that is a common way to get
                 * v6 connectivity in the home
                 */
                if (iph->protocol == IPPROTO_IPV6) {
                        /* Step over the outer IPv4 header before re-reading */
                        offset += iph->ihl * 4;
                        ipv6h = skb_header_pointer(skb, offset,
                                                   sizeof(_ipv6h), &_ipv6h);
                        if (!ipv6h)
                                return 0;
                        if (ipv6h->nexthdr != IPPROTO_TCP)
                                return 0;
                } else if (iph->protocol != IPPROTO_TCP) {
                        return 0;
                }
                break;
        case 6:
                if (ipv6h->nexthdr != IPPROTO_TCP)
                        return 0;
                break;
        default:
                /* Any other version falls through to the conversion below */
                break;
        }

        return taprio_mono_to_any(q, skb->skb_mstamp_ns);
}

/* There are a few scenarios where we will have to modify the txtime from
 * what is read from next_txtime in sched_entry. They are:
 * 1. If txtime is in the past,
 *    a. The gate for the traffic class is currently open and packet can be
 *       transmitted before it closes, schedule the packet right away.
 *    b. If the gate corresponding to the traffic class is going to open later
 *       in the cycle, set the txtime of packet to the interval start.
 * 2. If txtime is in the future, there are packets corresponding to the
 *    current traffic class waiting to be transmitted. So, the following
 *    possibilities exist:
 *    a. We can transmit the packet before the window containing the txtime
 *       closes.
 *    b. The window might close before the transmission can be completed
 *       successfully. So, schedule the packet in the next open window.
 *
 * Returns the transmission time chosen for @skb, or 0 if no schedule entry
 * can carry it (the caller drops the packet in that case).
 *
 * The return type is ktime_t rather than long: the value is a 64-bit
 * nanosecond timestamp that the caller stores in skb->tstamp, and a long
 * return would truncate it wherever long is 32 bits wide.
 */
static ktime_t get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch)
{
        ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp;
        struct taprio_sched *q = qdisc_priv(sch);
        struct sched_gate_list *sched, *admin;
        ktime_t minimum_time, now, txtime;
        int len, packet_transmit_time;
        struct sched_entry *entry;
        bool sched_changed;

        /* Earliest acceptable txtime: now plus the configured delay, but
         * never before the timestamp derived from the skb for TCP.
         */
        now = taprio_get_time(q);
        minimum_time = ktime_add_ns(now, q->txtime_delay);

        tcp_tstamp = get_tcp_tstamp(q, skb);
        minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp);

        rcu_read_lock();
        admin = rcu_dereference(q->admin_sched);
        sched = rcu_dereference(q->oper_sched);
        /* The pending schedule has already started: make it operational */
        if (admin && ktime_after(minimum_time, admin->base_time))
                switch_schedules(q, &admin, &sched);

        /* Until the schedule starts, all the queues are open */
        if (!sched || ktime_before(minimum_time, sched->base_time)) {
                txtime = minimum_time;
                goto done;
        }

        len = qdisc_pkt_len(skb);
        packet_transmit_time = length_to_duration(q, len);

        do {
                sched_changed = false;

                entry = find_entry_to_transmit(skb, sch, sched, admin,
                                               minimum_time,
                                               &interval_start, &interval_end,
                                               false);
                if (!entry) {
                        txtime = 0;
                        goto done;
                }

                txtime = entry->next_txtime;
                txtime = max_t(ktime_t, txtime, minimum_time);
                txtime = max_t(ktime_t, txtime, interval_start);

                /* The candidate lies beyond the start of the pending
                 * schedule: search again using that schedule. The loop
                 * condition short-circuits on sched_changed, so
                 * transmit_end_time is not read on this path.
                 */
                if (admin && admin != sched &&
                    ktime_after(txtime, admin->base_time)) {
                        sched = admin;
                        sched_changed = true;
                        continue;
                }

                transmit_end_time = ktime_add(txtime, packet_transmit_time);
                minimum_time = transmit_end_time;

                /* Update the txtime of current entry to the next time it's
                 * interval starts.
                 */
                if (ktime_after(transmit_end_time, interval_end))
                        entry->next_txtime = ktime_add(interval_start, sched->cycle_time);
        } while (sched_changed || ktime_after(transmit_end_time, interval_end));

        /* Book the slot: the next packet of this entry starts after ours */
        entry->next_txtime = transmit_end_time;

done:
        rcu_read_unlock();
        return txtime;
}

/* Tell whether @skb is longer than the maximum frame length recorded for
 * its traffic class in the operational schedule. Without an operational
 * schedule nothing is considered oversized.
 *
 * Devices with full offload are expected to honor this in hardware.
 */
static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch,
                                             struct sk_buff *skb)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct sched_gate_list *oper;
        int prio = skb->priority;
        bool oversized;
        u8 tc;

        tc = netdev_get_prio_tc_map(dev, prio);

        rcu_read_lock();
        oper = rcu_dereference(q->oper_sched);
        oversized = oper && skb->len > oper->max_frm_len[tc];
        rcu_read_unlock();

        return oversized;
}

/* Enqueue a single skb on @child.
 *
 * Packets from sockets with SOCK_TXTIME set must carry a timestamp that
 * falls in a valid interval; otherwise, in txtime-assist mode, a txtime is
 * computed and stored in skb->tstamp. The packet is dropped when either
 * step fails. Backlog and queue length of @sch are bumped before handing
 * the skb to the child.
 */
static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch,
                              struct Qdisc *child, struct sk_buff **to_free)
{
        struct taprio_sched *q = qdisc_priv(sch);
        int sock_sets_txtime;

        /* sk_flags are only safe to use on full sockets. */
        sock_sets_txtime = skb->sk && sk_fullsock(skb->sk) &&
                           sock_flag(skb->sk, SOCK_TXTIME);

        if (sock_sets_txtime) {
                if (!is_valid_interval(skb, sch))
                        return qdisc_drop(skb, sch, to_free);
        } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
                skb->tstamp = get_packet_txtime(skb, sch);
                /* A zero txtime means no interval could be found */
                if (!skb->tstamp)
                        return qdisc_drop(skb, sch, to_free);
        }

        qdisc_qstats_backlog_inc(sch, skb);
        sch->q.qlen++;

        return qdisc_enqueue(skb, child, to_free);
}

/* Software-segment a GSO skb and enqueue each resulting segment on @child.
 *
 * Segments that still exceed the per-TC maximum frame length are dropped.
 * Returns NET_XMIT_SUCCESS if at least one segment was queued and
 * NET_XMIT_DROP otherwise; the original skb is consumed after a successful
 * segmentation and dropped when segmentation fails.
 */
static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch,
                                    struct Qdisc *child,
                                    struct sk_buff **to_free)
{
        unsigned int seg_bytes = 0, queued = 0, orig_len = qdisc_pkt_len(skb);
        netdev_features_t features = netif_skb_features(skb);
        struct sk_buff *segs, *nskb;
        int ret;

        segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
        if (IS_ERR_OR_NULL(segs))
                return qdisc_drop(skb, sch, to_free);

        skb_list_walk_safe(segs, segs, nskb) {
                skb_mark_not_on_list(segs);
                qdisc_skb_cb(segs)->pkt_len = segs->len;
                seg_bytes += segs->len;

                /* FIXME: we should be segmenting to a smaller size
                 * rather than dropping these
                 */
                if (taprio_skb_exceeds_queue_max_sdu(sch, segs))
                        ret = qdisc_drop(segs, sch, to_free);
                else
                        ret = taprio_enqueue_one(segs, sch, child, to_free);

                if (ret == NET_XMIT_SUCCESS)
                        queued++;
                else if (net_xmit_drop_count(ret))
                        qdisc_qstats_drop(sch);
        }

        /* One skb became several: fix up the counters of the parents */
        if (queued > 1)
                qdisc_tree_reduce_backlog(sch, 1 - queued, orig_len - seg_bytes);
        consume_skb(skb);

        if (queued > 0)
                return NET_XMIT_SUCCESS;

        return NET_XMIT_DROP;
}

/* Enqueue entry point: route @skb to the child qdisc of its TX queue.
 *
 * Will not be called in the full offload case, since the TX queues are
 * attached to the Qdisc created using qdisc_create_dflt()
 */
static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
                          struct sk_buff **to_free)
{
        struct taprio_sched *q = qdisc_priv(sch);
        int queue = skb_get_queue_mapping(skb);
        struct Qdisc *child = q->qdiscs[queue];

        if (unlikely(!child))
                return qdisc_drop(skb, sch, to_free);

        /* Common case: the packet fits within the per-TC frame limit */
        if (!taprio_skb_exceeds_queue_max_sdu(sch, skb))
                return taprio_enqueue_one(skb, sch, child, to_free);

        /* Large packets might not be transmitted when the transmission
         * duration exceeds any configured interval. Therefore, segment
         * the skb into smaller chunks. Drivers with full offload are
         * expected to handle this in hardware.
         */
        if (!skb_is_gso(skb))
                return qdisc_drop(skb, sch, to_free);

        return taprio_enqueue_segmented(skb, sch, child, to_free);
}

/* peek() is intentionally unimplemented: it warns once and always returns
 * NULL.
 */
static struct sk_buff *taprio_peek(struct Qdisc *sch)
{
        WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented");
        return NULL;
}

/* Initialise entry->budget[] for every traffic class: the number of bytes
 * that fit in the TC's open-gate duration at the current link speed, or
 * INT_MAX ("infinite") when the gate stays open for the whole cycle.
 */
static void taprio_set_budgets(struct taprio_sched *q,
                               struct sched_gate_list *sched,
                               struct sched_entry *entry)
{
        struct net_device *dev = qdisc_dev(q->root);
        int num_tc = netdev_get_num_tc(dev);
        int tc, budget;

        for (tc = 0; tc < num_tc; tc++) {
                /* Traffic classes which never close have infinite budget */
                if (entry->gate_duration[tc] == sched->cycle_time) {
                        atomic_set(&entry->budget[tc], INT_MAX);
                        continue;
                }

                budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC,
                                   atomic64_read(&q->picos_per_byte));
                atomic_set(&entry->budget[tc], budget);
        }
}

/* Charge @len bytes against the budget of every traffic class of @entry
 * (a transmitted skb occupies the link for all of them), skipping classes
 * with an infinite (INT_MAX) budget.
 *
 * Returns the resulting budget of @tc_consumed; 0 if @tc_consumed is not
 * below @num_tc.
 */
static int taprio_update_budgets(struct sched_entry *entry, size_t len,
                                 int tc_consumed, int num_tc)
{
        int tc, new_budget = 0;

        for (tc = 0; tc < num_tc; tc++) {
                int budget = atomic_read(&entry->budget[tc]);
                int is_sender = (tc == tc_consumed);

                /* Don't consume from infinite budget */
                if (budget == INT_MAX) {
                        if (is_sender)
                                new_budget = INT_MAX;
                        continue;
                }

                if (!is_sender) {
                        atomic_sub(len, &entry->budget[tc]);
                        continue;
                }

                new_budget = atomic_sub_return(len, &entry->budget[tc]);
        }

        return new_budget;
}

/* Try to dequeue one skb from the child qdisc of TX queue @txq.
 *
 * Unless txtime-assist mode is enabled, the head packet is first peeked
 * and must pass three checks: its traffic class gate is open in
 * @gate_mask, and - when a schedule entry is in effect - it finishes
 * before the gate closes and leaves a non-negative budget.
 *
 * Returns the dequeued skb, or NULL if nothing may be sent from this queue
 * right now.
 */
static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
                                               struct sched_entry *entry,
                                               u32 gate_mask)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct Qdisc *child = q->qdiscs[txq];
        int num_tc = netdev_get_num_tc(dev);
        struct sk_buff *skb;

        if (unlikely(!child))
                return NULL;

        if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
                ktime_t guard;
                int prio;
                int len;
                u8 tc;

                skb = child->ops->peek(child);
                if (!skb)
                        return NULL;

                prio = skb->priority;
                tc = netdev_get_prio_tc_map(dev, prio);

                if (!(gate_mask & BIT(tc)))
                        return NULL;

                /* Time at which this packet would finish transmitting */
                len = qdisc_pkt_len(skb);
                guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len));

                /* In the case that there's no gate entry, there's no
                 * guard band and no budget.
                 */
                if (gate_mask != TAPRIO_ALL_GATES_OPEN) {
                        if (!taprio_entry_allows_tx(guard, entry, tc))
                                return NULL;

                        if (taprio_update_budgets(entry, len, tc, num_tc) < 0)
                                return NULL;
                }
        }

        skb = child->ops->dequeue(child);
        if (unlikely(!skb))
                return NULL;

        qdisc_bstats_update(sch, skb);
        qdisc_qstats_backlog_dec(sch, skb);
        sch->q.qlen--;

        return skb;
}

/* Advance *@txq to the next TX queue of traffic class @tc, wrapping from
 * the last queue of the class (offset + count - 1) back to its first.
 */
static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq)
{
        int first = dev->tc_to_txq[tc].offset;
        int past_last = first + dev->tc_to_txq[tc].count;
        int next = *txq + 1;

        *txq = (next == past_last) ? first : next;
}

/* Dequeue in traffic class priority order: scan the TCs from the highest
 * to the lowest, and within one TC rotate round-robin over its TX queues,
 * resuming from the per-TC cursor q->cur_txq[].
 */
static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch,
                                                  struct sched_entry *entry,
                                                  u32 gate_mask)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        int num_tc = netdev_get_num_tc(dev);
        struct sk_buff *skb;
        int tc = num_tc;

        while (tc-- > 0) {
                int *cursor = &q->cur_txq[tc];
                int first_txq = *cursor;

                if (!(gate_mask & BIT(tc)))
                        continue;

                do {
                        skb = taprio_dequeue_from_txq(sch, *cursor,
                                                      entry, gate_mask);

                        /* Move on even on success, so the next call starts
                         * from the following queue.
                         */
                        taprio_next_tc_txq(dev, tc, cursor);

                        /* Never leave the cursor past the device's queues */
                        if (*cursor >= dev->num_tx_queues)
                                *cursor = first_txq;

                        if (skb)
                                return skb;
                } while (*cursor != first_txq);
        }

        return NULL;
}

/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic
 * class other than to determine whether the gate is open or not.
 *
 * Returns the first skb obtained when trying the TX queues in ascending
 * index order, or NULL if none yields a packet.
 */
static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch,
                                                   struct sched_entry *entry,
                                                   u32 gate_mask)
{
        struct net_device *dev = qdisc_dev(sch);
        struct sk_buff *skb = NULL;
        int txq;

        for (txq = 0; !skb && txq < dev->num_tx_queues; txq++)
                skb = taprio_dequeue_from_txq(sch, txq, entry, gate_mask);

        return skb;
}

/* Dequeue entry point: pick the next skb allowed by the current schedule
 * entry, using either TC-priority or TXQ-order selection.
 *
 * Will not be called in the full offload case, since the TX queues are
 * attached to the Qdisc created using qdisc_create_dflt()
 */
static struct sk_buff *taprio_dequeue(struct Qdisc *sch)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct sk_buff *skb = NULL;
        struct sched_entry *entry;
        u32 gate_mask;

        rcu_read_lock();
        entry = rcu_dereference(q->current_entry);
        /* if there's no entry, it means that the schedule didn't
         * start yet, so force all gates to be open, this is in
         * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5
         * "AdminGateStates"
         */
        if (entry)
                gate_mask = entry->gate_mask;
        else
                gate_mask = TAPRIO_ALL_GATES_OPEN;

        /* With every gate closed there is nothing to look for */
        if (gate_mask) {
                if (static_branch_unlikely(&taprio_have_broken_mqprio) &&
                    !static_branch_likely(&taprio_have_working_mqprio)) {
                        /* Single NIC kind which is broken */
                        skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
                } else if (static_branch_likely(&taprio_have_working_mqprio) &&
                           !static_branch_unlikely(&taprio_have_broken_mqprio)) {
                        /* Single NIC kind which prioritizes properly */
                        skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
                } else if (q->broken_mqprio) {
                        /* Mixed NIC kinds present in system, need dynamic
                         * testing
                         */
                        skb = taprio_dequeue_txq_priority(sch, entry, gate_mask);
                } else {
                        skb = taprio_dequeue_tc_priority(sch, entry, gate_mask);
                }
        }

        rcu_read_unlock();

        return skb;
}

/* The cycle restarts after @entry when it is the last entry of @oper's
 * list, or when it ends exactly at the current cycle end time.
 */
static bool should_restart_cycle(const struct sched_gate_list *oper,
                                 const struct sched_entry *entry)
{
        return list_is_last(&entry->list, &oper->entries) ||
               ktime_compare(entry->end_time, oper->cycle_end_time) == 0;
}

/* Decide whether the pending @admin schedule should replace @oper, given
 * that the upcoming entry would end at @end_time. Always false when there
 * is no admin schedule.
 */
static bool should_change_schedules(const struct sched_gate_list *admin,
                                    const struct sched_gate_list *oper,
                                    ktime_t end_time)
{
        ktime_t next_base_time, extension_time;

        if (!admin)
                return false;

        next_base_time = sched_base_time(admin);

        /* This is the simple case, the end_time would fall after
         * the next schedule base_time.
         */
        if (ktime_compare(next_base_time, end_time) <= 0)
                return true;

        /* This is the cycle_time_extension case, if the end_time
         * plus the amount that can be extended would fall after the
         * next schedule base_time, we can extend the current schedule
         * for that amount.
         */
        extension_time = ktime_add_ns(end_time, oper->cycle_time_extension);

        /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about
         * how precisely the extension should be made. So after
         * conformance testing, this logic may change.
         */
        return ktime_compare(next_base_time, extension_time) <= 0;
}

/* hrtimer callback that moves the qdisc to the next schedule entry.
 *
 * Under q->current_entry_lock it selects the next entry (wrapping to the
 * first one when the cycle restarts), computes that entry's end time, gate
 * close times and budgets, switches to the admin schedule when
 * should_change_schedules() says so, and publishes the entry through
 * q->current_entry. The timer is then re-armed for the entry's end time
 * and the root qdisc is scheduled for transmission.
 *
 * Always returns HRTIMER_RESTART.
 */
static enum hrtimer_restart advance_sched(struct hrtimer *timer)
{
        struct taprio_sched *q = container_of(timer, struct taprio_sched,
                                              advance_timer);
        struct net_device *dev = qdisc_dev(q->root);
        struct sched_gate_list *oper, *admin;
        int num_tc = netdev_get_num_tc(dev);
        struct sched_entry *entry, *next;
        struct Qdisc *sch = q->root;
        ktime_t end_time;
        int tc;

        spin_lock(&q->current_entry_lock);
        entry = rcu_dereference_protected(q->current_entry,
                                          lockdep_is_held(&q->current_entry_lock));
        oper = rcu_dereference_protected(q->oper_sched,
                                         lockdep_is_held(&q->current_entry_lock));
        admin = rcu_dereference_protected(q->admin_sched,
                                          lockdep_is_held(&q->current_entry_lock));

        /* NOTE(review): oper is dereferenced below, so this assumes that at
         * least one of oper/admin is non-NULL whenever the timer fires -
         * confirm against the code that starts the timer.
         */
        if (!oper)
                switch_schedules(q, &admin, &oper);

        /* This can happen in two cases: 1. this is the very first run
         * of this function (i.e. we weren't running any schedule
         * previously); 2. The previous schedule just ended. The first
         * entry of all schedules are pre-calculated during the
         * schedule initialization.
         */
        if (unlikely(!entry || entry->end_time == oper->base_time)) {
                next = list_first_entry(&oper->entries, struct sched_entry,
                                        list);
                end_time = next->end_time;
                goto first_run;
        }

        if (should_restart_cycle(oper, entry)) {
                next = list_first_entry(&oper->entries, struct sched_entry,
                                        list);
                oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
                                                    oper->cycle_time);
        } else {
                next = list_next_entry(entry, list);
        }

        /* The next entry runs for its interval, truncated at the cycle end */
        end_time = ktime_add_ns(entry->end_time, next->interval);
        end_time = min_t(ktime_t, end_time, oper->cycle_end_time);

        /* A gate open for the whole cycle never closes; otherwise it closes
         * gate_duration[tc] after the next entry starts.
         */
        for (tc = 0; tc < num_tc; tc++) {
                if (next->gate_duration[tc] == oper->cycle_time)
                        next->gate_close_time[tc] = KTIME_MAX;
                else
                        next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
                                                                 next->gate_duration[tc]);
        }

        if (should_change_schedules(admin, oper, end_time)) {
                /* Set things so the next time this runs, the new
                 * schedule runs.
                 */
                end_time = sched_base_time(admin);
                switch_schedules(q, &admin, &oper);
        }

        next->end_time = end_time;
        taprio_set_budgets(q, oper, next);

first_run:
        rcu_assign_pointer(q->current_entry, next);
        spin_unlock(&q->current_entry_lock);

        hrtimer_set_expires(&q->advance_timer, end_time);

        rcu_read_lock();
        __netif_schedule(sch);
        rcu_read_unlock();

        return HRTIMER_RESTART;
}

/* Netlink validation policy for the attributes nested inside one schedule
 * entry; used when parsing each TCA_TAPRIO_SCHED_ENTRY.
 */
static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
        [TCA_TAPRIO_SCHED_ENTRY_INDEX]     = { .type = NLA_U32 },
        [TCA_TAPRIO_SCHED_ENTRY_CMD]       = { .type = NLA_U8 },
        [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 },
        [TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
};

/* Netlink validation policy for one nested per-traffic-class entry:
 * the TC index is capped at TC_QOPT_MAX_QUEUE and the frame preemption
 * value is limited to the TC_FP_EXPRESS..TC_FP_PREEMPTIBLE range.
 */
static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = {
        [TCA_TAPRIO_TC_ENTRY_INDEX]   = NLA_POLICY_MAX(NLA_U32,
                                                       TC_QOPT_MAX_QUEUE),
        [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 },
        [TCA_TAPRIO_TC_ENTRY_FP]      = NLA_POLICY_RANGE(NLA_U32,
                                                         TC_FP_EXPRESS,
                                                         TC_FP_PREEMPTIBLE),
};

/* Accepted range for the cycle time attribute: 0 up to INT_MAX. */
static const struct netlink_range_validation_signed taprio_cycle_time_range = {
        .max = INT_MAX,
        .min = 0,
};

/* Netlink validation policy for the top-level taprio attributes. */
static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
        [TCA_TAPRIO_ATTR_PRIOMAP] = {
                .len = sizeof(struct tc_mqprio_qopt)
        },
        [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED },
        [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 },
        [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED },
        [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 },
        /* Cycle time is range-checked against taprio_cycle_time_range. */
        [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] =
                NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range),
        [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 },
        /* Only bits within TAPRIO_SUPPORTED_FLAGS are accepted. */
        [TCA_TAPRIO_ATTR_FLAGS] =
                NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS),
        [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 },
        [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED },
};

/* Populate @entry from the already-parsed attribute table @tb.
 *
 * The command and gate mask are copied only when present. The interval
 * defaults to 0 when absent and must be at least the time needed to send a
 * minimum-sized ethernet frame (ETH_ZLEN), otherwise -EINVAL is returned
 * with a message in @extack. Returns 0 on success.
 */
static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb,
                            struct sched_entry *entry,
                            struct netlink_ext_ack *extack)
{
        int shortest = length_to_duration(q, ETH_ZLEN);
        const struct nlattr *cmd_attr = tb[TCA_TAPRIO_SCHED_ENTRY_CMD];
        const struct nlattr *mask_attr = tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK];
        const struct nlattr *ival_attr = tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL];
        u32 duration = 0;

        if (cmd_attr)
                entry->command = nla_get_u8(cmd_attr);

        if (mask_attr)
                entry->gate_mask = nla_get_u32(mask_attr);

        if (ival_attr)
                duration = nla_get_u32(ival_attr);

        /* Accept only intervals long enough for the smallest frame. */
        if (duration >= shortest) {
                entry->interval = duration;
                return 0;
        }

        NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry");
        return -EINVAL;
}

/* Parse the nested attribute @n against entry_policy and fill @entry,
 * recording @index as its position in the schedule.
 *
 * Any parse failure is reported as -EINVAL with a generic message in
 * @extack; otherwise the result of fill_sched_entry() is returned.
 */
static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n,
                             struct sched_entry *entry, int index,
                             struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { };

        if (nla_parse_nested_deprecated(attrs, TCA_TAPRIO_SCHED_ENTRY_MAX, n,
                                        entry_policy, NULL) < 0) {
                NL_SET_ERR_MSG(extack, "Could not parse nested entry");
                return -EINVAL;
        }

        entry->index = index;

        return fill_sched_entry(q, attrs, entry, extack);
}

/* Walk the nested entry list @list and append one allocated sched_entry per
 * TCA_TAPRIO_SCHED_ENTRY attribute to @sched->entries.
 *
 * Attributes of any other type are skipped (a message is left in @extack).
 * Returns the number of entries added (also stored in @sched->num_entries),
 * -EINVAL if @list is NULL, -ENOMEM on allocation failure, or the error from
 * parse_sched_entry(). On error, entries appended so far stay on the list.
 */
static int parse_sched_list(struct taprio_sched *q, struct nlattr *list,
                            struct sched_gate_list *sched,
                            struct netlink_ext_ack *extack)
{
        struct sched_entry *entry;
        struct nlattr *attr;
        int remaining, ret;
        int count = 0;

        if (!list)
                return -EINVAL;

        nla_for_each_nested(attr, list, remaining) {
                if (nla_type(attr) != TCA_TAPRIO_SCHED_ENTRY) {
                        NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'");
                        continue;
                }

                entry = kzalloc(sizeof(*entry), GFP_KERNEL);
                if (!entry) {
                        NL_SET_ERR_MSG(extack, "Not enough memory for entry");
                        return -ENOMEM;
                }

                ret = parse_sched_entry(q, attr, entry, count, extack);
                if (ret < 0) {
                        /* Not linked yet, so it must be released here. */
                        kfree(entry);
                        return ret;
                }

                list_add_tail(&entry->list, &sched->entries);
                count++;
        }

        sched->num_entries = count;

        return count;
}

static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
                                 struct sched_gate_list *new,
                                 struct netlink_ext_ack *extack)
{
        int err = 0;

        if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) {
                NL_SET_ERR_MSG(extack, "Adding a single entry is not supported");
                return -ENOTSUPP;
        }

        if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME])
                new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]);

        if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION])
                new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]);

        if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME])
                new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]);

        if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST])
                err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST],
                                       new, extack);
        if (err < 0)
                return err;

        if (!new->cycle_time) {
                struct sched_entry *entry;
                ktime_t cycle = 0;

                list_for_each_entry(entry, &new->entries, list)
                        cycle = ktime_add_ns(cycle, entry->interval);

                if (cycle < 0 || cycle > INT_MAX) {
                        NL_SET_ERR_MSG(extack, "'cycle_time' is too big");
                        return -EINVAL;
                }

                new->cycle_time = cycle;
        }

        if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) {
                NL_SET_ERR_MSG(extack, "'cycle_time' is too small");
                return -EINVAL;
        }

        taprio_calculate_gate_durations(q, new);

        return 0;
}

/* Validate the mqprio configuration @qopt for @dev.
 *
 * A missing @qopt is accepted only when the device already has traffic
 * classes configured. Otherwise the number of traffic classes must not
 * exceed the number of TX queues, and the remaining checks are delegated to
 * mqprio_validate_qopt(). Returns 0 on success or a negative errno.
 */
static int taprio_parse_mqprio_opt(struct net_device *dev,
                                   struct tc_mqprio_qopt *qopt,
                                   struct netlink_ext_ack *extack,
                                   u32 taprio_flags)
{
        const bool overlap_ok = TXTIME_ASSIST_IS_ENABLED(taprio_flags);

        if (!qopt) {
                if (dev->num_tc)
                        return 0;

                NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
                return -EINVAL;
        }

        /* taprio imposes that traffic classes map 1:n to tx queues */
        if (qopt->num_tc > dev->num_tx_queues) {
                NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
                return -EINVAL;
        }

        /* In txtime-assist mode the TXQ ranges of different TCs are allowed
         * to overlap; only the ranges themselves are validated.
         */
        return mqprio_validate_qopt(dev, qopt, true, overlap_ok, extack);
}

/* Compute in *@start the time at which @sched should begin running.
 *
 * If the schedule's base time is still in the future it is used as is.
 * Otherwise the start is the beginning of the next full cycle after "now",
 * counted from the base time. Returns 0 on success, or -EFAULT (with a
 * warning) if the schedule has a zero cycle time.
 */
static int taprio_get_start_time(struct Qdisc *sch,
                                 struct sched_gate_list *sched,
                                 ktime_t *start)
{
        struct taprio_sched *q = qdisc_priv(sch);
        ktime_t base = sched_base_time(sched);
        ktime_t now = taprio_get_time(q);
        s64 elapsed_cycles;
        ktime_t cycle;

        if (ktime_after(base, now)) {
                *start = base;
                return 0;
        }

        cycle = sched->cycle_time;

        /* A schedule always has at least one entry and every entry has a
         * non-zero interval, so a zero cycle time means the state is
         * inconsistent: warn and bail out.
         */
        if (WARN_ON(!cycle))
                return -EFAULT;

        /* Round up to the start of the next cycle. */
        elapsed_cycles = div64_s64(ktime_sub_ns(now, base), cycle);
        *start = ktime_add_ns(base, (elapsed_cycles + 1) * cycle);

        return 0;
}

/* Pre-compute the timing state of the first entry of @sched for a schedule
 * starting at @base: the cycle end time, the first entry's end time, its
 * budgets and its per-traffic-class gate close times. The current entry
 * pointer is reset to NULL.
 */
static void setup_first_end_time(struct taprio_sched *q,
                                 struct sched_gate_list *sched, ktime_t base)
{
        struct net_device *dev = qdisc_dev(q->root);
        int num_tc = netdev_get_num_tc(dev);
        struct sched_entry *head;
        int tc;

        head = list_first_entry(&sched->entries, struct sched_entry, list);

        /* FIXME: find a better place to do this */
        sched->cycle_end_time = ktime_add_ns(base, sched->cycle_time);

        head->end_time = ktime_add_ns(base, head->interval);
        taprio_set_budgets(q, sched, head);

        for (tc = 0; tc < num_tc; tc++) {
                /* A gate that is open for the whole cycle never closes. */
                head->gate_close_time[tc] =
                        head->gate_duration[tc] == sched->cycle_time ?
                        KTIME_MAX :
                        ktime_add_ns(base, head->gate_duration[tc]);
        }

        rcu_assign_pointer(q->current_entry, NULL);
}

/* Arm the advance timer for the schedule @new starting at @start.
 *
 * Nothing is done in full offload mode. If the timer already has an
 * earlier expiry programmed, that earlier time is kept.
 */
static void taprio_start_sched(struct Qdisc *sch,
                               ktime_t start, struct sched_gate_list *new)
{
        struct taprio_sched *q = qdisc_priv(sch);
        ktime_t pending;

        if (FULL_OFFLOAD_IS_ENABLED(q->flags))
                return;

        /* An expiry of zero is treated as "no expiry programmed". */
        pending = hrtimer_get_expires(&q->advance_timer);
        if (pending == 0)
                pending = KTIME_MAX;

        /* If the new schedule starts before the next expiration, we
         * reprogram it to the earliest one, so we change the admin
         * schedule to the operational one at the right time.
         */
        if (pending < start)
                start = pending;

        hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS);
}

/* Refresh q->picos_per_byte from the current link speed of @dev.
 *
 * The link settings are queried through ethtool. If the query fails, or the
 * reported speed is zero or SPEED_UNKNOWN, SPEED_10 is assumed.
 */
static void taprio_set_picos_per_byte(struct net_device *dev,
                                      struct taprio_sched *q)
{
        struct ethtool_link_ksettings ecmd;
        int speed = SPEED_10;
        int picos_per_byte;
        int err;

        /* Only look at ecmd when the query succeeded: on failure its
         * contents are not initialized.
         */
        err = __ethtool_get_link_ksettings(dev, &ecmd);
        if (err >= 0 && ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN)
                speed = ecmd.base.speed;

        picos_per_byte = (USEC_PER_SEC * 8) / speed;

        atomic64_set(&q->picos_per_byte, picos_per_byte);

        /* Log the speed actually used for the computation. Reading
         * ecmd.base.speed here would print uninitialized stack data whenever
         * the ethtool query above failed.
         */
        netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n",
                   dev->name, (long long)atomic64_read(&q->picos_per_byte),
                   speed);
}

/* Netdevice notifier callback: on NETDEV_UP or NETDEV_CHANGE, find the
 * taprio instance attached to the reporting device (if any), refresh its
 * picos-per-byte value and update the queue max SDU of its operational and
 * admin schedules. Must be called with RTNL held. Always returns
 * NOTIFY_DONE.
 */
static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
                               void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct qdisc_size_table *stab;
        struct sched_gate_list *sched;
        struct taprio_sched *q;

        ASSERT_RTNL();

        switch (event) {
        case NETDEV_UP:
        case NETDEV_CHANGE:
                break;
        default:
                return NOTIFY_DONE;
        }

        list_for_each_entry(q, &taprio_list, taprio_list) {
                if (qdisc_dev(q->root) != dev)
                        continue;

                taprio_set_picos_per_byte(dev, q);

                stab = rtnl_dereference(q->root->stab);

                sched = rtnl_dereference(q->oper_sched);
                if (sched)
                        taprio_update_queue_max_sdu(q, sched, stab);

                sched = rtnl_dereference(q->admin_sched);
                if (sched)
                        taprio_update_queue_max_sdu(q, sched, stab);

                /* Only the first matching instance is handled. */
                break;
        }

        return NOTIFY_DONE;
}

/* Set next_txtime of every entry in @sched to @base plus the sum of the
 * intervals of all entries that precede it in the list.
 */
static void setup_txtime(struct taprio_sched *q,
                         struct sched_gate_list *sched, ktime_t base)
{
        struct sched_entry *cur;
        u64 elapsed = 0;

        list_for_each_entry(cur, &sched->entries, list) {
                cur->next_txtime = ktime_add_ns(base, elapsed);
                elapsed += cur->interval;
        }
}

/* Allocate a zeroed offload structure with room for @num_entries schedule
 * entries. The returned object starts with a reference count of one.
 * Returns NULL if the allocation fails.
 */
static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries)
{
        struct __tc_taprio_qopt_offload *wrapper;

        wrapper = kzalloc(struct_size(wrapper, offload.entries, num_entries),
                          GFP_KERNEL);
        if (wrapper) {
                refcount_set(&wrapper->users, 1);
                return &wrapper->offload;
        }

        return NULL;
}

/* Take an additional reference on @offload and return it. */
struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload
                                                  *offload)
{
        struct __tc_taprio_qopt_offload *wrapper =
                container_of(offload, struct __tc_taprio_qopt_offload,
                             offload);

        refcount_inc(&wrapper->users);

        return offload;
}
EXPORT_SYMBOL_GPL(taprio_offload_get);

/* Drop one reference on @offload; the containing object is freed when the
 * last reference goes away.
 */
void taprio_offload_free(struct tc_taprio_qopt_offload *offload)
{
        struct __tc_taprio_qopt_offload *wrapper =
                container_of(offload, struct __tc_taprio_qopt_offload,
                             offload);

        if (refcount_dec_and_test(&wrapper->users))
                kfree(wrapper);
}
EXPORT_SYMBOL_GPL(taprio_offload_free);

/* Keep the "oper" and "admin" schedule pointers consistent with their base
 * times, so that a user calling dump() looks at the right schedules.
 *
 * With full offload, the admin configuration is promoted to oper at
 * base_time in the PHC time domain. Since the system time is not necessarily
 * in sync with the PHC, an hrtimer cannot be used to call switch_schedules()
 * at the right hardware time.
 *
 * For now this is called by hand right away from taprio. In the future it
 * would be useful to have a mechanism for drivers to notify taprio of the
 * offload state (PENDING, ACTIVE, INACTIVE) so it can be made visible in
 * dump(). This is left as TODO.
 */
static void taprio_offload_config_changed(struct taprio_sched *q)
{
        struct sched_gate_list *oper = rtnl_dereference(q->oper_sched);
        struct sched_gate_list *admin = rtnl_dereference(q->admin_sched);

        switch_schedules(q, &admin, &oper);
}

/* Translate a bitmask of traffic classes into a bitmask of TX queues: for
 * every traffic class set in @tc_mask, all queues in that class' offset/count
 * range on @dev are set in the result.
 */
static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
{
        u32 result = 0;
        u32 tc;

        for (tc = 0; tc < dev->num_tc; tc++) {
                u32 first, last;

                if (tc_mask & BIT(tc)) {
                        first = dev->tc_to_txq[tc].offset;
                        last = first + dev->tc_to_txq[tc].count - 1;

                        result |= GENMASK(last, first);
                }
        }

        return result;
}

/* Copy the schedule @sched into the offload structure @offload.
 *
 * Base time, cycle time and cycle time extension are copied verbatim. Each
 * entry's gate mask is converted from traffic classes to TX queues when the
 * device capabilities in @caps ask for per-TXQ gate masks. The caller must
 * have sized @offload->entries for all entries of @sched.
 */
static void taprio_sched_to_offload(struct net_device *dev,
                                    struct sched_gate_list *sched,
                                    struct tc_taprio_qopt_offload *offload,
                                    const struct tc_taprio_caps *caps)
{
        struct tc_taprio_sched_entry *dst;
        struct sched_entry *src;
        int filled = 0;

        offload->base_time = sched->base_time;
        offload->cycle_time = sched->cycle_time;
        offload->cycle_time_extension = sched->cycle_time_extension;

        list_for_each_entry(src, &sched->entries, list) {
                dst = &offload->entries[filled];

                dst->command = src->command;
                dst->interval = src->interval;
                dst->gate_mask = caps->gate_mask_per_txq ?
                                 tc_map_to_queue_mask(dev, src->gate_mask) :
                                 src->gate_mask;

                filled++;
        }

        offload->num_entries = filled;
}

/* Query the device's taprio capabilities, remember whether it reports the
 * "broken mqprio" behaviour, and bump the matching static branch counter.
 * Marks the detection as done so the cleanup helper can undo it.
 */
static void taprio_detect_broken_mqprio(struct taprio_sched *q)
{
        struct tc_taprio_caps caps;

        qdisc_offload_query_caps(qdisc_dev(q->root), TC_SETUP_QDISC_TAPRIO,
                                 &caps, sizeof(caps));

        q->broken_mqprio = caps.broken_mqprio;

        if (!q->broken_mqprio)
                static_branch_inc(&taprio_have_working_mqprio);
        else
                static_branch_inc(&taprio_have_broken_mqprio);

        q->detected_mqprio = true;
}

/* Undo the static branch increment done at detection time. Does nothing if
 * detection never ran for this instance.
 */
static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
{
        if (q->detected_mqprio) {
                if (!q->broken_mqprio)
                        static_branch_dec(&taprio_have_working_mqprio);
                else
                        static_branch_dec(&taprio_have_broken_mqprio);
        }
}

/* Hand the schedule @sched to the device driver for offloading.
 *
 * Builds a TAPRIO_CMD_REPLACE request from @sched, the device's mqprio
 * configuration and the per-TC max SDU / frame preemption settings stored in
 * @q, then passes it to the driver's ndo_setup_tc().
 *
 * Returns -EOPNOTSUPP if the device has no ndo_setup_tc() or cannot handle a
 * configured max SDU, -ENOMEM if the request cannot be allocated, otherwise
 * the value returned by the driver. q->offloaded is set only when the driver
 * call did not fail.
 */
static int taprio_enable_offload(struct net_device *dev,
                                 struct taprio_sched *q,
                                 struct sched_gate_list *sched,
                                 struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        struct tc_taprio_qopt_offload *offload;
        struct tc_taprio_caps caps;
        int tc, err = 0;

        if (!ops->ndo_setup_tc) {
                NL_SET_ERR_MSG(extack,
                               "Device does not support taprio offload");
                return -EOPNOTSUPP;
        }

        qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
                                 &caps, sizeof(caps));

        /* Refuse any non-zero max SDU if the device cannot enforce it. */
        if (!caps.supports_queue_max_sdu) {
                for (tc = 0; tc < TC_MAX_QUEUE; tc++) {
                        if (q->max_sdu[tc]) {
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "Device does not handle queueMaxSDU");
                                return -EOPNOTSUPP;
                        }
                }
        }

        offload = taprio_offload_alloc(sched->num_entries);
        if (!offload) {
                NL_SET_ERR_MSG(extack,
                               "Not enough memory for enabling offload mode");
                return -ENOMEM;
        }
        /* Fill in the request: command, mqprio state, schedule, FP, max SDU. */
        offload->cmd = TAPRIO_CMD_REPLACE;
        offload->extack = extack;
        mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
        offload->mqprio.extack = extack;
        taprio_sched_to_offload(dev, sched, offload, &caps);
        mqprio_fp_to_offload(q->fp, &offload->mqprio);

        for (tc = 0; tc < TC_MAX_QUEUE; tc++)
                offload->max_sdu[tc] = q->max_sdu[tc];

        err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
        if (err < 0) {
                NL_SET_ERR_MSG_WEAK(extack,
                                    "Device failed to setup taprio offload");
                goto done;
        }

        q->offloaded = true;

done:
        /* The offload structure may linger around via a reference taken by the
         * device driver, so clear up the netlink extack pointer so that the
         * driver isn't tempted to dereference data which stopped being valid
         */
        offload->extack = NULL;
        offload->mqprio.extack = NULL;
        /* Drop our own reference; the object is freed once no user remains. */
        taprio_offload_free(offload);

        return err;
}

/* Ask the device driver to tear down a previously installed taprio offload.
 *
 * Does nothing (and returns 0) when no offload is active. Returns -ENOMEM if
 * the request cannot be allocated, otherwise the value returned by the
 * driver's ndo_setup_tc(). q->offloaded is cleared only when the driver call
 * did not fail.
 */
static int taprio_disable_offload(struct net_device *dev,
                                  struct taprio_sched *q,
                                  struct netlink_ext_ack *extack)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        struct tc_taprio_qopt_offload *request;
        int ret;

        if (!q->offloaded)
                return 0;

        request = taprio_offload_alloc(0);
        if (!request) {
                NL_SET_ERR_MSG(extack,
                               "Not enough memory to disable offload mode");
                return -ENOMEM;
        }
        request->cmd = TAPRIO_CMD_DESTROY;

        ret = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, request);
        if (ret < 0) {
                NL_SET_ERR_MSG(extack,
                               "Device failed to disable offload");
        } else {
                q->offloaded = false;
        }

        taprio_offload_free(request);

        return ret;
}

/* If full offload is enabled, the only possible clockid is the net device's
 * PHC. For that reason, specifying a clockid through netlink is incorrect.
 * For txtime-assist, it is implicitly assumed that the device's PHC is kept
 * in sync with the specified clockid via a user space daemon such as phc2sys.
 * For both software taprio and txtime-assist, the clockid is used for the
 * hrtimer that advances the schedule and hence mandatory.
 *
 * Returns 0 on success. On failure a message is left in @extack and a
 * negative errno is returned: -EINVAL for a clockid given in full offload
 * mode, an unknown clockid or a missing mandatory clockid, and -ENOTSUPP
 * when the device has no PTP clock or the clockid would change.
 */
static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb,
                                struct netlink_ext_ack *extack)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        /* Default result for every "goto out" that does not set err itself. */
        int err = -EINVAL;

        if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
                const struct ethtool_ops *ops = dev->ethtool_ops;
                struct kernel_ethtool_ts_info info = {
                        .cmd = ETHTOOL_GET_TS_INFO,
                        .phc_index = -1,
                };

                if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
                        NL_SET_ERR_MSG(extack,
                                       "The 'clockid' cannot be specified for full offload");
                        goto out;
                }

                /* Without a get_ts_info() hook err keeps its -EINVAL default,
                 * so the check below fails as well.
                 */
                if (ops && ops->get_ts_info)
                        err = ops->get_ts_info(dev, &info);

                if (err || info.phc_index < 0) {
                        NL_SET_ERR_MSG(extack,
                                       "Device does not have a PTP clock");
                        err = -ENOTSUPP;
                        goto out;
                }
        } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) {
                int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]);
                enum tk_offsets tk_offset;

                /* We only support static clockids and we don't allow
                 * for it to be modified after the first init.
                 */
                if (clockid < 0 ||
                    (q->clockid != -1 && q->clockid != clockid)) {
                        NL_SET_ERR_MSG(extack,
                                       "Changing the 'clockid' of a running schedule is not supported");
                        err = -ENOTSUPP;
                        goto out;
                }

                /* Map the clockid to the matching timekeeping offset.
                 * NOTE(review): TK_OFFS_MAX for CLOCK_MONOTONIC presumably
                 * acts as a "no offset" marker - confirm against
                 * taprio_mono_to_any().
                 */
                switch (clockid) {
                case CLOCK_REALTIME:
                        tk_offset = TK_OFFS_REAL;
                        break;
                case CLOCK_MONOTONIC:
                        tk_offset = TK_OFFS_MAX;
                        break;
                case CLOCK_BOOTTIME:
                        tk_offset = TK_OFFS_BOOT;
                        break;
                case CLOCK_TAI:
                        tk_offset = TK_OFFS_TAI;
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "Invalid 'clockid'");
                        err = -EINVAL;
                        goto out;
                }
                /* This pairs with READ_ONCE() in taprio_mono_to_any */
                WRITE_ONCE(q->tk_offset, tk_offset);

                q->clockid = clockid;
        } else {
                NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory");
                goto out;
        }

        /* Everything went ok, return success. */
        err = 0;

out:
        return err;
}

/* Parse one nested per-traffic-class entry @opt.
 *
 * The entry must carry a TC index below TC_QOPT_MAX_QUEUE that has not been
 * seen before in this request (tracked in the @seen_tcs bitmap). When present,
 * the max SDU (bounded by the device's max MTU) and the frame preemption
 * value are stored in @max_sdu / @fp at that index.
 *
 * Returns 0 on success, -EINVAL for a missing or duplicate index, -ERANGE
 * for an out-of-range index or max SDU, or the netlink parse error.
 */
static int taprio_parse_tc_entry(struct Qdisc *sch,
                                 struct nlattr *opt,
                                 u32 max_sdu[TC_QOPT_MAX_QUEUE],
                                 u32 fp[TC_QOPT_MAX_QUEUE],
                                 unsigned long *seen_tcs,
                                 struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { };
        struct net_device *dev = qdisc_dev(sch);
        const struct nlattr *sdu_attr, *fp_attr;
        int ret, tc;
        u32 sdu;

        ret = nla_parse_nested(attrs, TCA_TAPRIO_TC_ENTRY_MAX, opt,
                               taprio_tc_policy, extack);
        if (ret < 0)
                return ret;

        if (!attrs[TCA_TAPRIO_TC_ENTRY_INDEX]) {
                NL_SET_ERR_MSG_MOD(extack, "TC entry index missing");
                return -EINVAL;
        }

        tc = nla_get_u32(attrs[TCA_TAPRIO_TC_ENTRY_INDEX]);
        if (tc >= TC_QOPT_MAX_QUEUE) {
                NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range");
                return -ERANGE;
        }

        /* Each traffic class may appear at most once per request. */
        if (*seen_tcs & BIT(tc)) {
                NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry");
                return -EINVAL;
        }

        *seen_tcs |= BIT(tc);

        sdu_attr = attrs[TCA_TAPRIO_TC_ENTRY_MAX_SDU];
        fp_attr = attrs[TCA_TAPRIO_TC_ENTRY_FP];

        if (sdu_attr) {
                sdu = nla_get_u32(sdu_attr);
                if (sdu > dev->max_mtu) {
                        NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU");
                        return -ERANGE;
                }

                max_sdu[tc] = sdu;
        }

        if (fp_attr)
                fp[tc] = nla_get_u32(fp_attr);

        return 0;
}

/* Parse all per-traffic-class entries found in @opt and, if the resulting
 * configuration is acceptable, store it in the qdisc.
 *
 * Parsing works on local copies seeded from the current per-TC max SDU and
 * frame preemption settings, so traffic classes not mentioned in the request
 * keep their values. Any preemptible traffic class requires full offload
 * mode and a device that supports preemption.
 *
 * The qdisc state is written only after every check has passed; a rejected
 * request leaves q->max_sdu[] and q->fp[] untouched.
 *
 * Returns 0 on success, -EOPNOTSUPP if preemption is requested but cannot be
 * honoured, or the error from taprio_parse_tc_entry().
 */
static int taprio_parse_tc_entries(struct Qdisc *sch,
                                   struct nlattr *opt,
                                   struct netlink_ext_ack *extack)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        u32 max_sdu[TC_QOPT_MAX_QUEUE];
        bool have_preemption = false;
        unsigned long seen_tcs = 0;
        u32 fp[TC_QOPT_MAX_QUEUE];
        struct nlattr *n;
        int tc, rem;
        int err;

        for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
                max_sdu[tc] = q->max_sdu[tc];
                fp[tc] = q->fp[tc];
        }

        nla_for_each_nested_type(n, TCA_TAPRIO_ATTR_TC_ENTRY, opt, rem) {
                err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs,
                                            extack);
                if (err)
                        return err;
        }

        /* Validate the candidate configuration before committing it, so
         * that a rejected request cannot leave a preemptible setting stored
         * in the qdisc and make every later request fail the same check.
         */
        for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
                if (fp[tc] != TC_FP_EXPRESS) {
                        have_preemption = true;
                        break;
                }
        }

        if (have_preemption) {
                if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) {
                        NL_SET_ERR_MSG(extack,
                                       "Preemption only supported with full offload");
                        return -EOPNOTSUPP;
                }

                if (!ethtool_dev_mm_supported(dev)) {
                        NL_SET_ERR_MSG(extack,
                                       "Device does not support preemption");
                        return -EOPNOTSUPP;
                }
        }

        /* All checks passed: commit. */
        for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
                q->max_sdu[tc] = max_sdu[tc];
                q->fp[tc] = fp[tc];
        }

        return 0;
}

/* Compare the mqprio configuration currently applied to @dev with @mqprio.
 *
 * Returns 0 when the number of traffic classes, every per-TC queue
 * count/offset and the whole priority-to-TC map match; returns -1 otherwise,
 * including when @mqprio is NULL.
 */
static int taprio_mqprio_cmp(const struct net_device *dev,
                             const struct tc_mqprio_qopt *mqprio)
{
        int tc, prio;

        if (!mqprio)
                return -1;

        if (mqprio->num_tc != dev->num_tc)
                return -1;

        for (tc = 0; tc < mqprio->num_tc; tc++) {
                if (dev->tc_to_txq[tc].count != mqprio->count[tc])
                        return -1;
                if (dev->tc_to_txq[tc].offset != mqprio->offset[tc])
                        return -1;
        }

        for (prio = 0; prio <= TC_BITMASK; prio++) {
                if (dev->prio_tc_map[prio] != mqprio->prio_tc_map[prio])
                        return -1;
        }

        return 0;
}

/* Apply a netlink configuration to a taprio qdisc (also called at the end
 * of taprio_init()).
 *
 * @sch:    the taprio qdisc being configured
 * @opt:    nested TCA_OPTIONS attribute, parsed against taprio_policy
 * @extack: extended ack used to report error/warning strings
 *
 * A fresh schedule (new_admin) is allocated, filled from the attributes and
 * then installed either as the operational schedule (txtime-assist mode with
 * no schedule yet) or as the admin schedule. On every failure path the
 * not-yet-installed schedule is released through call_rcu().
 *
 * Returns 0 on success or a negative errno.
 */
static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
                         struct netlink_ext_ack *extack)
{
        struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
        struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
        struct sched_gate_list *oper, *admin, *new_admin;
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct tc_mqprio_qopt *mqprio = NULL;
        unsigned long flags;
        u32 taprio_flags;
        ktime_t start;
        int i, err;

        err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt,
                                          taprio_policy, extack);
        if (err < 0)
                return err;

        if (tb[TCA_TAPRIO_ATTR_PRIOMAP])
                mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]);

        /* The semantics of the 'flags' argument in relation to 'change()'
         * requests, are interpreted following two rules (which are applied in
         * this order): (1) an omitted 'flags' argument is interpreted as
         * zero; (2) the 'flags' of a "running" taprio instance cannot be
         * changed.
         */
        taprio_flags = nla_get_u32_default(tb[TCA_TAPRIO_ATTR_FLAGS], 0);

        /* txtime-assist and full offload are mutually exclusive */
        if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) &&
            (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) {
                NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS],
                                    "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive");
                return -EINVAL;
        }

        /* TAPRIO_FLAGS_INVALID is the value taprio_init() stores before the
         * first change(); any other stored value must match the request.
         */
        if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Changing 'flags' of a running schedule is not supported");
                return -EOPNOTSUPP;
        }
        q->flags = taprio_flags;

        /* Needed for length_to_duration() during netlink attribute parsing */
        taprio_set_picos_per_byte(dev, q);

        err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags);
        if (err < 0)
                return err;

        err = taprio_parse_tc_entries(sch, opt, extack);
        if (err)
                return err;

        /* From here on, errors must go through free_sched (or unlock). */
        new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL);
        if (!new_admin) {
                NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule");
                return -ENOMEM;
        }
        INIT_LIST_HEAD(&new_admin->entries);

        oper = rtnl_dereference(q->oper_sched);
        admin = rtnl_dereference(q->admin_sched);

        /* no changes - no new mqprio settings */
        if (!taprio_mqprio_cmp(dev, mqprio))
                mqprio = NULL;

        if (mqprio && (oper || admin)) {
                NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported");
                err = -ENOTSUPP;
                goto free_sched;
        }

        if (mqprio) {
                err = netdev_set_num_tc(dev, mqprio->num_tc);
                if (err)
                        goto free_sched;
                for (i = 0; i < mqprio->num_tc; i++) {
                        netdev_set_tc_queue(dev, i,
                                            mqprio->count[i],
                                            mqprio->offset[i]);
                        q->cur_txq[i] = mqprio->offset[i];
                }

                /* Always use supplied priority mappings */
                for (i = 0; i <= TC_BITMASK; i++)
                        netdev_set_prio_tc_map(dev, i,
                                               mqprio->prio_tc_map[i]);
        }

        err = parse_taprio_schedule(q, tb, new_admin, extack);
        if (err < 0)
                goto free_sched;

        if (new_admin->num_entries == 0) {
                NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule");
                err = -EINVAL;
                goto free_sched;
        }

        err = taprio_parse_clockid(sch, tb, extack);
        if (err < 0)
                goto free_sched;

        taprio_update_queue_max_sdu(q, new_admin, stab);

        if (FULL_OFFLOAD_IS_ENABLED(q->flags))
                err = taprio_enable_offload(dev, q, new_admin, extack);
        else
                err = taprio_disable_offload(dev, q, extack);
        if (err)
                goto free_sched;

        /* Protects against enqueue()/dequeue() */
        spin_lock_bh(qdisc_lock(sch));

        if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) {
                if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) {
                        NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled");
                        err = -EINVAL;
                        goto unlock;
                }

                q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]);
        }

        /* Pure software mode only: (re)initialise the advance timer with the
         * configured clockid, unless it is currently active.
         */
        if (!TXTIME_ASSIST_IS_ENABLED(q->flags) &&
            !FULL_OFFLOAD_IS_ENABLED(q->flags) &&
            !hrtimer_active(&q->advance_timer)) {
                hrtimer_setup(&q->advance_timer, advance_sched, q->clockid, HRTIMER_MODE_ABS);
        }

        err = taprio_get_start_time(sch, new_admin, &start);
        if (err < 0) {
                NL_SET_ERR_MSG(extack, "Internal error: failed get start time");
                goto unlock;
        }

        setup_txtime(q, new_admin, start);

        if (TXTIME_ASSIST_IS_ENABLED(q->flags)) {
                /* First schedule ever: install it directly as operational.
                 * Clearing new_admin keeps free_sched from releasing it.
                 */
                if (!oper) {
                        rcu_assign_pointer(q->oper_sched, new_admin);
                        err = 0;
                        new_admin = NULL;
                        goto unlock;
                }

                /* Not going to race against advance_sched(), but still */
                admin = rcu_replace_pointer(q->admin_sched, new_admin,
                                            lockdep_rtnl_is_held());
                if (admin)
                        call_rcu(&admin->rcu, taprio_free_sched_cb);
        } else {
                setup_first_end_time(q, new_admin, start);

                /* Protects against advance_sched() */
                spin_lock_irqsave(&q->current_entry_lock, flags);

                taprio_start_sched(sch, start, new_admin);

                /* The previous admin schedule, if any, is freed after a
                 * grace period.
                 */
                admin = rcu_replace_pointer(q->admin_sched, new_admin,
                                            lockdep_rtnl_is_held());
                if (admin)
                        call_rcu(&admin->rcu, taprio_free_sched_cb);

                spin_unlock_irqrestore(&q->current_entry_lock, flags);

                if (FULL_OFFLOAD_IS_ENABLED(q->flags))
                        taprio_offload_config_changed(q);
        }

        /* Ownership of the schedule has moved to q; do not free it below. */
        new_admin = NULL;
        err = 0;

        /* This only sets an extack message; the call still returns 0. */
        if (!stab)
                NL_SET_ERR_MSG_MOD(extack,
                                   "Size table not specified, frame length estimations may be inaccurate");

unlock:
        spin_unlock_bh(qdisc_lock(sch));

free_sched:
        if (new_admin)
                call_rcu(&new_admin->rcu, taprio_free_sched_cb);

        return err;
}

/* Reset the qdisc: stop the advance timer and reset every child qdisc that
 * is currently present in q->qdiscs.
 */
static void taprio_reset(struct Qdisc *sch)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        int txq;

        hrtimer_cancel(&q->advance_timer);

        /* The child array may not have been allocated */
        if (!q->qdiscs)
                return;

        for (txq = 0; txq < dev->num_tx_queues; txq++) {
                struct Qdisc *child = q->qdiscs[txq];

                if (child)
                        qdisc_reset(child);
        }
}

/* Tear down a taprio qdisc.
 *
 * Unlinks the instance from the global taprio_list, stops the advance
 * timer, disables any offload, drops the references on all child qdiscs
 * and frees the child array, resets the device's traffic class
 * configuration and queues both schedules (oper and admin) for freeing
 * after an RCU grace period.
 */
static void taprio_destroy(struct Qdisc *sch)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct sched_gate_list *oper, *admin;
        unsigned int i;

        list_del(&q->taprio_list);

        /* Note that taprio_reset() might not be called if an error
         * happens in qdisc_create(), after taprio_init() has been called.
         */
        hrtimer_cancel(&q->advance_timer);
        qdisc_synchronize(sch);

        taprio_disable_offload(dev, q, NULL);

        /* q->qdiscs is NULL when taprio_init() failed before allocating it */
        if (q->qdiscs) {
                for (i = 0; i < dev->num_tx_queues; i++)
                        qdisc_put(q->qdiscs[i]);

                kfree(q->qdiscs);
        }
        q->qdiscs = NULL;

        netdev_reset_tc(dev);

        oper = rtnl_dereference(q->oper_sched);
        admin = rtnl_dereference(q->admin_sched);

        if (oper)
                call_rcu(&oper->rcu, taprio_free_sched_cb);

        if (admin)
                call_rcu(&admin->rcu, taprio_free_sched_cb);

        taprio_cleanup_broken_mqprio(q);
}

/* Initialise a new taprio qdisc instance.
 *
 * @sch:    the qdisc being created
 * @opt:    netlink options; required (NULL yields -EINVAL)
 * @extack: extended ack for error messages
 *
 * Sets up the lock, timer and defaults, registers the instance on the
 * global taprio_list, creates one default pfifo child per device TX queue
 * and finally hands @opt to taprio_change().
 *
 * Returns 0 on success or a negative errno. The early-return error paths
 * do not undo the list_add()/kcalloc() done here.
 * NOTE(review): presumably the caller invokes taprio_destroy() on failure,
 * which performs that cleanup - confirm against qdisc_create().
 */
static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
                       struct netlink_ext_ack *extack)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        int i, tc;

        spin_lock_init(&q->current_entry_lock);

        hrtimer_setup(&q->advance_timer, advance_sched, CLOCK_TAI, HRTIMER_MODE_ABS);

        q->root = sch;

        /* We only support static clockids. Use an invalid value as default
         * and get the valid one on taprio_change().
         */
        q->clockid = -1;
        q->flags = TAPRIO_FLAGS_INVALID;

        list_add(&q->taprio_list, &taprio_list);

        if (sch->parent != TC_H_ROOT) {
                NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc");
                return -EOPNOTSUPP;
        }

        if (!netif_is_multiqueue(dev)) {
                NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required");
                return -EOPNOTSUPP;
        }

        /* One child slot per TX queue, zero-initialised */
        q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]),
                            GFP_KERNEL);
        if (!q->qdiscs)
                return -ENOMEM;

        if (!opt)
                return -EINVAL;

        for (i = 0; i < dev->num_tx_queues; i++) {
                struct netdev_queue *dev_queue;
                struct Qdisc *qdisc;

                /* Child handle: same major as the root, minor = queue + 1 */
                dev_queue = netdev_get_tx_queue(dev, i);
                qdisc = qdisc_create_dflt(dev_queue,
                                          &pfifo_qdisc_ops,
                                          TC_H_MAKE(TC_H_MAJ(sch->handle),
                                                    TC_H_MIN(i + 1)),
                                          extack);
                if (!qdisc)
                        return -ENOMEM;

                if (i < dev->real_num_tx_queues)
                        qdisc_hash_add(qdisc, false);

                q->qdiscs[i] = qdisc;
        }

        /* Every traffic class starts out as express (no frame preemption) */
        for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
                q->fp[tc] = TC_FP_EXPRESS;

        taprio_detect_broken_mqprio(q);

        return taprio_change(sch, opt, extack);
}

/* Graft qdiscs onto every TX queue of the device.
 *
 * In full offload mode each queue gets its own child from q->qdiscs; in
 * software mode every queue gets the root taprio qdisc itself. The grafted
 * qdisc's refcount is raised once per queue, and whatever was previously
 * attached to the queue is released.
 */
static void taprio_attach(struct Qdisc *sch)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        unsigned int ntx;

        /* Attach underlying qdisc */
        for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
                struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx);
                struct Qdisc *old, *dev_queue_qdisc;

                if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
                        struct Qdisc *qdisc = q->qdiscs[ntx];

                        /* In offload mode, the root taprio qdisc is bypassed
                         * and the netdev TX queues see the children directly
                         */
                        qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
                        dev_queue_qdisc = qdisc;
                } else {
                        /* In software mode, attach the root taprio qdisc
                         * to all netdev TX queues, so that dev_qdisc_enqueue()
                         * goes through taprio_enqueue().
                         */
                        dev_queue_qdisc = sch;
                }
                old = dev_graft_qdisc(dev_queue, dev_queue_qdisc);
                /* The qdisc's refcount requires to be elevated once
                 * for each netdev TX queue it is grafted onto
                 */
                qdisc_refcount_inc(dev_queue_qdisc);
                if (old)
                        qdisc_put(old);
        }
}

/* Translate a 1-based class id into the matching netdev TX queue.
 * Returns NULL when the id does not name one of the device's TX queues.
 */
static struct netdev_queue *taprio_queue_get(struct Qdisc *sch,
                                             unsigned long cl)
{
        struct net_device *dev = qdisc_dev(sch);
        unsigned long idx = cl - 1;

        /* cl == 0 wraps around to the largest unsigned long and is
         * therefore rejected by the same bound check.
         */
        return idx < dev->num_tx_queues ? netdev_get_tx_queue(dev, idx) : NULL;
}

/* Replace the child qdisc of class @cl with @new.
 *
 * @sch:    the taprio root qdisc
 * @cl:     1-based class id (TX queue index + 1)
 * @new:    qdisc to install; the NULL checks below show it may be NULL
 * @old:    out parameter, receives the previously stored child
 * @extack: unused here
 *
 * The device is deactivated around the swap when it is up. Returns 0, or
 * -EINVAL when @cl does not map to a TX queue.
 */
static int taprio_graft(struct Qdisc *sch, unsigned long cl,
                        struct Qdisc *new, struct Qdisc **old,
                        struct netlink_ext_ack *extack)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct netdev_queue *dev_queue = taprio_queue_get(sch, cl);

        if (!dev_queue)
                return -EINVAL;

        if (dev->flags & IFF_UP)
                dev_deactivate(dev);

        /* In offload mode, the child Qdisc is directly attached to the netdev
         * TX queue, and thus, we need to keep its refcount elevated in order
         * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue.
         * However, save the reference to the new qdisc in the private array in
         * both software and offload cases, to have an up-to-date reference to
         * our children.
         */
        *old = q->qdiscs[cl - 1];
        if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
                /* The qdisc grafted on the queue is expected to be the one
                 * tracked in q->qdiscs; warn once if they disagree.
                 */
                WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old);
                if (new)
                        qdisc_refcount_inc(new);
                if (*old)
                        qdisc_put(*old);
        }

        q->qdiscs[cl - 1] = new;
        if (new)
                new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

        if (dev->flags & IFF_UP)
                dev_activate(dev);

        return 0;
}

/* Emit one schedule entry as a nested TCA_TAPRIO_SCHED_ENTRY attribute
 * holding its index, command, gate mask and interval.
 *
 * Returns the result of nla_nest_end() on success, -ENOSPC when the nest
 * cannot be opened, and -1 (after cancelling the nest) when any field does
 * not fit.
 */
static int dump_entry(struct sk_buff *msg,
                      const struct sched_entry *entry)
{
        struct nlattr *nest;

        nest = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY);
        if (!nest)
                return -ENOSPC;

        if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index) ||
            nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command) ||
            nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK,
                        entry->gate_mask) ||
            nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL,
                        entry->interval)) {
                nla_nest_cancel(msg, nest);
                return -1;
        }

        return nla_nest_end(msg, nest);
}

/* Dump one gate list (base time, cycle time, cycle time extension and the
 * list of entries) into @msg.
 *
 * @msg:  netlink message being built
 * @root: schedule to dump
 *
 * Returns 0 on success and -1 when the message runs out of room. The entry
 * list nest is cancelled only if it was actually opened; a failed
 * nla_nest_start_noflag() returns straight away instead of handing a NULL
 * nest to nla_nest_cancel().
 */
static int dump_schedule(struct sk_buff *msg,
                         const struct sched_gate_list *root)
{
        struct nlattr *entry_list;
        struct sched_entry *entry;

        if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME,
                        root->base_time, TCA_TAPRIO_PAD))
                return -1;

        if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME,
                        root->cycle_time, TCA_TAPRIO_PAD))
                return -1;

        if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION,
                        root->cycle_time_extension, TCA_TAPRIO_PAD))
                return -1;

        entry_list = nla_nest_start_noflag(msg,
                                           TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST);
        if (!entry_list)
                return -1;

        list_for_each_entry(entry, &root->entries, list) {
                if (dump_entry(msg, entry) < 0)
                        goto error_nest;
        }

        nla_nest_end(msg, entry_list);
        return 0;

error_nest:
        /* Drop the partially filled entry list */
        nla_nest_cancel(msg, entry_list);
        return -1;
}

/* Emit one TCA_TAPRIO_ATTR_TC_ENTRY nest per traffic class (TC_MAX_QUEUE of
 * them), each carrying the class index, the max SDU taken from @sched and
 * the frame preemption setting taken from @q.
 *
 * Returns 0 on success or -EMSGSIZE when the skb has no room left; the nest
 * being filled at that point is cancelled.
 */
static int taprio_dump_tc_entries(struct sk_buff *skb,
                                  struct taprio_sched *q,
                                  struct sched_gate_list *sched)
{
        struct nlattr *nest;
        int tc = 0;

        while (tc < TC_MAX_QUEUE) {
                nest = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY);
                if (!nest)
                        return -EMSGSIZE;

                if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc) ||
                    nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
                                sched->max_sdu[tc]) ||
                    nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) {
                        nla_nest_cancel(skb, nest);
                        return -EMSGSIZE;
                }

                nla_nest_end(skb, nest);
                tc++;
        }

        return 0;
}

/* Append one 64-bit offload counter to @skb under @attrtype.
 * A value equal to TAPRIO_STAT_NOT_SET is silently left out.
 * Returns 0 on success (or when skipped) and -EMSGSIZE if it does not fit.
 */
static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
{
        int failed;

        if (val == TAPRIO_STAT_NOT_SET)
                return 0;

        failed = nla_put_u64_64bit(skb, attrtype, val,
                                   TCA_TAPRIO_OFFLOAD_STATS_PAD);

        return failed ? -EMSGSIZE : 0;
}

static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d,
                              struct tc_taprio_qopt_offload *offload,
                              struct tc_taprio_qopt_stats *stats)
{
        struct net_device *dev = qdisc_dev(sch);
        const struct net_device_ops *ops;
        struct sk_buff *skb = d->skb;
        struct nlattr *xstats;
        int err;

        ops = qdisc_dev(sch)->netdev_ops;

        /* FIXME I could use qdisc_offload_dump_helper(), but that messes
         * with sch->flags depending on whether the device reports taprio
         * stats, and I'm not sure whether that's a good idea, considering
         * that stats are optional to the offload itself
         */
        if (!ops->ndo_setup_tc)
                return 0;

        memset(stats, 0xff, sizeof(*stats));

        err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload);
        if (err == -EOPNOTSUPP)
                return 0;
        if (err)
                return err;

        xstats = nla_nest_start(skb, TCA_STATS_APP);
        if (!xstats)
                goto err;

        if (taprio_put_stat(skb, stats->window_drops,
                            TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) ||
            taprio_put_stat(skb, stats->tx_overruns,
                            TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS))
                goto err_cancel;

        nla_nest_end(skb, xstats);

        return 0;

err_cancel:
        nla_nest_cancel(skb, xstats);
err:
        return -EMSGSIZE;
}

/* Qdisc-level stats dump: issue a TAPRIO_CMD_STATS request and let
 * taprio_dump_xstats() do the work.
 */
static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
        struct tc_taprio_qopt_offload offload = {
                .cmd = TAPRIO_CMD_STATS,
        };
        struct tc_taprio_qopt_stats *stats = &offload.stats;

        return taprio_dump_xstats(sch, d, &offload, stats);
}

/* Dump the qdisc configuration into @skb as a TCA_OPTIONS nest.
 *
 * Emits the reconstructed mqprio priomap, the clockid (except in full
 * offload mode), the flags and txtime delay when non-zero, then, if an
 * operational schedule exists, the per-TC entries and that schedule, and
 * finally the admin schedule (if any) inside TCA_TAPRIO_ATTR_ADMIN_SCHED.
 *
 * Returns the result of nla_nest_end() on success; on any failure the
 * TCA_OPTIONS nest is cancelled and -ENOSPC is returned.
 */
static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        struct sched_gate_list *oper, *admin;
        struct tc_mqprio_qopt opt = { 0 };
        struct nlattr *nest, *sched_nest;

        mqprio_qopt_reconstruct(dev, &opt);

        nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
        if (!nest)
                goto start_error;

        if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt))
                goto options_error;

        if (!FULL_OFFLOAD_IS_ENABLED(q->flags) &&
            nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid))
                goto options_error;

        if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags))
                goto options_error;

        if (q->txtime_delay &&
            nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
                goto options_error;

        /* From here on, error exits must go through options_error_rcu */
        rcu_read_lock();

        oper = rtnl_dereference(q->oper_sched);
        admin = rtnl_dereference(q->admin_sched);

        if (oper && taprio_dump_tc_entries(skb, q, oper))
                goto options_error_rcu;

        if (oper && dump_schedule(skb, oper))
                goto options_error_rcu;

        if (!admin)
                goto done;

        sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED);
        if (!sched_nest)
                goto options_error_rcu;

        if (dump_schedule(skb, admin))
                goto admin_error;

        nla_nest_end(skb, sched_nest);

done:
        rcu_read_unlock();
        return nla_nest_end(skb, nest);

admin_error:
        nla_nest_cancel(skb, sched_nest);

options_error_rcu:
        rcu_read_unlock();

options_error:
        nla_nest_cancel(skb, nest);

start_error:
        return -ENOSPC;
}

/* Return the child qdisc stored for class @cl (1-based), or NULL when @cl
 * does not correspond to one of the device's TX queues.
 */
static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl)
{
        struct taprio_sched *q = qdisc_priv(sch);
        struct net_device *dev = qdisc_dev(sch);
        unsigned int idx = cl - 1;

        return idx < dev->num_tx_queues ? q->qdiscs[idx] : NULL;
}

/* Look up a class by classid: the minor number is the class id. Returns it
 * when it maps to a TX queue, 0 otherwise.
 */
static unsigned long taprio_find(struct Qdisc *sch, u32 classid)
{
        unsigned int minor = TC_H_MIN(classid);

        return taprio_queue_get(sch, minor) ? minor : 0;
}

/* Fill in the tcmsg describing class @cl.
 *
 * @sch: the taprio root qdisc
 * @cl:  1-based class id
 * @skb: unused here
 * @tcm: message header to fill: parent is the root, the class minor is
 *       OR-ed into the handle, and tcm_info carries the child's handle
 *
 * Always returns 0.
 */
static int taprio_dump_class(struct Qdisc *sch, unsigned long cl,
                             struct sk_buff *skb, struct tcmsg *tcm)
{
        struct Qdisc *child = taprio_leaf(sch, cl);

        tcm->tcm_parent = TC_H_ROOT;
        tcm->tcm_handle |= TC_H_MIN(cl);
        /* taprio_leaf() returns NULL for an out-of-range class, and
         * taprio_graft() stores whatever child it is given (possibly NULL),
         * so report "no child" as 0 instead of dereferencing NULL.
         */
        tcm->tcm_info = child ? child->handle : 0;

        return 0;
}

/* Per-class stats dump: copy the child qdisc's basic and queue stats, then
 * append the driver's per-queue offload counters for queue @cl - 1.
 *
 * Returns -1 when copying the child's stats fails, otherwise whatever
 * taprio_dump_xstats() returns.
 */
static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                   struct gnet_dump *d)
        __releases(d->lock)
        __acquires(d->lock)
{
        struct tc_taprio_qopt_offload offload = {
                .cmd = TAPRIO_CMD_QUEUE_STATS,
                .queue_stats = {
                        .queue = cl - 1,
                },
        };
        struct Qdisc *leaf = taprio_leaf(sch, cl);

        /* NOTE(review): leaf is dereferenced without a NULL check; confirm
         * callers only pass classes that currently have a child.
         */
        if (gnet_stats_copy_basic(d, NULL, &leaf->bstats, true) < 0)
                return -1;

        if (qdisc_qstats_copy(d, leaf) < 0)
                return -1;

        return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats);
}

/* Walk the classes (one per TX queue), starting after the arg->skip first
 * ones, and stop as soon as tc_qdisc_stats_dump() returns false.
 */
static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct net_device *dev = qdisc_dev(sch);
        unsigned long ntx;

        if (arg->stop)
                return;

        arg->count = arg->skip;
        ntx = arg->skip;

        while (ntx < dev->num_tx_queues) {
                /* Class ids are 1-based */
                if (!tc_qdisc_stats_dump(sch, ntx + 1, arg))
                        break;
                ntx++;
        }
}

/* Pick the TX queue named by the minor number of the message's parent. */
static struct netdev_queue *taprio_select_queue(struct Qdisc *sch,
                                                struct tcmsg *tcm)
{
        unsigned long cl = TC_H_MIN(tcm->tcm_parent);

        return taprio_queue_get(sch, cl);
}

/* Class operations: taprio exposes one class per netdev TX queue. */
static const struct Qdisc_class_ops taprio_class_ops = {
        .graft                = taprio_graft,
        .leaf                = taprio_leaf,
        .find                = taprio_find,
        .walk                = taprio_walk,
        .dump                = taprio_dump_class,
        .dump_stats        = taprio_dump_class_stats,
        .select_queue        = taprio_select_queue,
};

/* Qdisc operations table registered under the "taprio" id by
 * taprio_module_init().
 */
static struct Qdisc_ops taprio_qdisc_ops __read_mostly = {
        .cl_ops                = &taprio_class_ops,
        .id                = "taprio",
        .priv_size        = sizeof(struct taprio_sched),
        .init                = taprio_init,
        .change                = taprio_change,
        .destroy        = taprio_destroy,
        .reset                = taprio_reset,
        .attach                = taprio_attach,
        .peek                = taprio_peek,
        .dequeue        = taprio_dequeue,
        .enqueue        = taprio_enqueue,
        .dump                = taprio_dump,
        .dump_stats        = taprio_dump_stats,
        .owner                = THIS_MODULE,
};
MODULE_ALIAS_NET_SCH("taprio");

/* Netdevice notifier; events are delivered to taprio_dev_notifier(). */
static struct notifier_block taprio_device_notifier = {
        .notifier_call = taprio_dev_notifier,
};

/* Module entry point: register the netdevice notifier, then the qdisc ops.
 *
 * If the qdisc registration fails, the notifier registered just before is
 * removed again so that a failed module load does not leave a notifier
 * block pointing into this module on the chain.
 *
 * Returns 0 on success or the negative errno of the step that failed.
 */
static int __init taprio_module_init(void)
{
        int err;

        err = register_netdevice_notifier(&taprio_device_notifier);
        if (err)
                return err;

        err = register_qdisc(&taprio_qdisc_ops);
        if (err)
                unregister_netdevice_notifier(&taprio_device_notifier);

        return err;
}

/* Module exit: undo taprio_module_init() in reverse order - the qdisc ops
 * are unregistered first, then the netdevice notifier.
 */
static void __exit taprio_module_exit(void)
{
        unregister_qdisc(&taprio_qdisc_ops);
        unregister_netdevice_notifier(&taprio_device_notifier);
}

module_init(taprio_module_init);
module_exit(taprio_module_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Time Aware Priority qdisc");
























































































































    8 
























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H

/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */

#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL

/*
 * Argument bundle for creating a new task; it is the parameter type of
 * kernel_clone(), copy_process(), copy_thread() and sched_cgroup_fork()
 * declared in this header.
 */
struct kernel_clone_args {
        u64 flags;
        /* User-space pointers (marked __user) */
        int __user *pidfd;
        int __user *child_tid;
        int __user *parent_tid;
        const char *name;
        int exit_signal;
        /* Single-bit flags packed into one u32 */
        u32 kthread:1;
        u32 io_thread:1;
        u32 user_worker:1;
        u32 no_files:1;
        unsigned long stack;
        unsigned long stack_size;
        unsigned long tls;
        pid_t *set_tid;
        /* Number of elements in *set_tid */
        size_t set_tid_size;
        int cgroup;
        int idle;
        /* Function pointer and its argument */
        int (*fn)(void *);
        void *fn_arg;
        struct cgroup *cgrp;
        struct css_set *cset;
        unsigned int kill_seq;
};

/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;

extern union thread_union init_thread_union;
extern struct task_struct init_task;

extern int lockdep_tasklist_lock_is_held(void);

extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);

extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
extern void sched_cancel_fork(struct task_struct *p);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);

void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);

extern void mm_cache_init(void);
extern void proc_caches_init(void);

extern void fork_init(void);

extern void release_task(struct task_struct * p);

extern int copy_thread(struct task_struct *, const struct kernel_clone_args *);

extern void flush_thread(void);

#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
/* No-op stub used when CONFIG_HAVE_EXIT_THREAD is not set. */
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern __noreturn void do_group_exit(int);

extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);

extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *copy_process(struct pid *pid, int trace, int node,
                                 struct kernel_clone_args *args);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
                            unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);

extern void free_task(struct task_struct *tsk);

/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
#define sched_exec()   {}
#endif

/* Take a reference on @t by incrementing t->usage; returns @t. */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
        refcount_inc(&t->usage);
        return t;
}

/*
 * Take a reference on @t unless its usage count has already dropped to
 * zero. Returns @t when the reference was obtained, NULL otherwise.
 */
static inline struct task_struct *tryget_task_struct(struct task_struct *t)
{
        if (!refcount_inc_not_zero(&t->usage))
                return NULL;

        return t;
}

extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);

/*
 * Drop a reference on @t. When the last reference goes away the task is
 * released through __put_task_struct(), either directly or, for
 * PREEMPT_RT in non-preemptible context, deferred via call_rcu().
 */
static inline void put_task_struct(struct task_struct *t)
{
        if (!refcount_dec_and_test(&t->usage))
                return;

        /*
         * In !RT, it is always safe to call __put_task_struct().
         * Under RT, we can only call it in preemptible context.
         */
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) {
                static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP);

                lock_map_acquire_try(&put_task_map);
                __put_task_struct(t);
                lock_map_release(&put_task_map);
                return;
        }

        /*
         * under PREEMPT_RT, we can't call put_task_struct
         * in atomic context because it will indirectly
         * acquire sleeping locks.
         *
         * call_rcu() will schedule __put_task_struct_rcu_cb()
         * to be called in process context.
         *
         * __put_task_struct() is called when
         * refcount_dec_and_test(&t->usage) succeeds.
         *
         * This means that it can't "conflict" with
         * put_task_struct_rcu_user() which abuses ->rcu the same
         * way; rcu_users has a reference so task->usage can't be
         * zero after rcu_users 1 -> 0 transition.
         *
         * delayed_free_task() also uses ->rcu, but it is only called
         * when it fails to fork a process. Therefore, there is no
         * way it can conflict with put_task_struct().
         */
        call_rcu(&t->rcu, __put_task_struct_rcu_cb);
}

DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))

/*
 * Drop @nr references on @t at once; the task is released through
 * __put_task_struct() when that brings the usage count to zero.
 */
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
        if (!refcount_sub_and_test(nr, &t->usage))
                return;

        __put_task_struct(t);
}

void put_task_struct_rcu_user(struct task_struct *task);

/* Free all architecture-specific resources held by a thread. */
void release_thread(struct task_struct *dead_task);

#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif

#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * Fallback used when the architecture has not declared a thread_struct
 * whitelist: assume anything in there may need to be copied to userspace.
 *
 * Stores 0 in *@offset and, in *@size, the number of bytes from the start
 * of the ->thread member to the end of the (possibly dynamically sized)
 * task_struct.
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                unsigned long *size)
{
        const size_t thread_start = offsetof(struct task_struct, thread);

        *offset = 0;
        *size = arch_task_struct_size - thread_start;
}
#endif

/*
 * Return @t's stack_vm_area when CONFIG_VMAP_STACK is enabled; without
 * that option the helper always returns NULL.
 */
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
#ifdef CONFIG_VMAP_STACK
        return t->stack_vm_area;
#else
        return NULL;
#endif
}

/*
 * task_lock() - take @p's alloc_lock spinlock; pair with task_unlock().
 *
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4().  Also used in procfs.  Also
 * pins the final release of task.io_context.  Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 * neither inside nor outside.
 */
static inline void task_lock(struct task_struct *p)
{
        spin_lock(&p->alloc_lock);
}

/* Release @p's alloc_lock spinlock, the counterpart of task_lock(). */
static inline void task_unlock(struct task_struct *p)
{
        spin_unlock(&p->alloc_lock);
}

/*
 * Declare a "task_lock" guard through DEFINE_GUARD(): the lock expression
 * is task_lock(_T) and the matching unlock expression is task_unlock(_T).
 */
DEFINE_GUARD(task_lock, struct task_struct *, task_lock(_T), task_unlock(_T))

#endif /* _LINUX_SCHED_TASK_H */








































































































    3 




    3 




















    2 






   72 


   72 
   72 



   72 


   72 
   72 





   72 








   72 















































   94 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Derived from arch/arm/kvm/coproc.h
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Authors: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__
#define __ARM64_KVM_SYS_REGS_LOCAL_H__

#include <linux/bsearch.h>

/*
 * Build the sys_reg() encoding from the Op0/Op1/CRn/CRm/Op2 members of the
 * object @x points to.  Works for any pointer whose target has those five
 * members; this file uses it with both struct sys_reg_params and
 * struct sys_reg_desc.  Note that @x is evaluated five times.
 */
#define reg_to_encoding(x)                                                \
        sys_reg((u32)(x)->Op0, (u32)(x)->Op1,                                \
                (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2)

/*
 * Parameters of one system register access: the five encoding fields, the
 * value associated with the access and its direction.
 */
struct sys_reg_params {
        u8        Op0;
        u8        Op1;
        u8        CRn;
        u8        CRm;
        u8        Op2;
        u64        regval;                /* value for the access; read_zero() stores 0 here */
        bool        is_write;        /* the esr_*_to_params() macros set this when ESR bit 0 is clear */
};

/*
 * Build a struct sys_reg_params compound literal from encoding @reg by
 * splitting it with the sys_reg_Op0()/Op1()/CRn()/CRm()/Op2() accessors.
 * regval and is_write are not named in the initializer and are therefore
 * zero.  @reg is evaluated five times.
 */
#define encoding_to_params(reg)                                                \
        ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg),                \
                                  .Op1 = sys_reg_Op1(reg),                \
                                  .CRn = sys_reg_CRn(reg),                \
                                  .CRm = sys_reg_CRm(reg),                \
                                  .Op2 = sys_reg_Op2(reg) })

/*
 * Decode @esr into a struct sys_reg_params compound literal:
 *   Op0 = bits [21:20], Op2 = bits [19:17], Op1 = bits [16:14],
 *   CRn = bits [13:10], CRm = bits [4:1],
 *   is_write = true when bit 0 is clear.
 * regval is left zero.  @esr is evaluated several times.
 */
#define esr_sys64_to_params(esr)                                               \
        ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3,                    \
                                  .Op1 = ((esr) >> 14) & 0x7,                  \
                                  .CRn = ((esr) >> 10) & 0xf,                  \
                                  .CRm = ((esr) >> 1) & 0xf,                   \
                                  .Op2 = ((esr) >> 17) & 0x7,                  \
                                  .is_write = !((esr) & 1) })

/*
 * Same bit layout as esr_sys64_to_params() for Op1, CRn, CRm, Op2 and
 * is_write, but Op0 is not decoded and therefore stays zero (as does
 * regval).  @esr is evaluated several times.
 */
#define esr_cp1x_32_to_params(esr)                                                \
        ((struct sys_reg_params){ .Op1 = ((esr) >> 14) & 0x7,                        \
                                  .CRn = ((esr) >> 10) & 0xf,                        \
                                  .CRm = ((esr) >> 1) & 0xf,                        \
                                  .Op2 = ((esr) >> 17) & 0x7,                        \
                                  .is_write = !((esr) & 1) })

/*
 * Descriptor for one system register: its name and encoding plus optional
 * trap, reset, userspace access and visibility callbacks.  Tables of these
 * descriptors are searched by find_reg().
 */
struct sys_reg_desc {
        /* Sysreg string for debug */
        const char *name;

        /*
         * AArch32 mapping kind, set through AA32() or CP15_SYS_DESC().
         * NOTE(review): the exact meaning of LO/HI is not visible in this
         * header - confirm against the users of this field.
         */
        enum {
                AA32_DIRECT,
                AA32_LO,
                AA32_HI,
        } aarch32_map;

        /* MRS/MSR instruction which accesses it. */
        u8        Op0;
        u8        Op1;
        u8        CRn;
        u8        CRm;
        u8        Op2;

        /* Trapped access from guest, if non-NULL. */
        bool (*access)(struct kvm_vcpu *,
                       struct sys_reg_params *,
                       const struct sys_reg_desc *);

        /*
         * Initialization for vcpu. Return initialized value, or KVM
         * sanitized value for ID registers.
         */
        u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *);

        /* Index into sys_reg[], or 0 if we don't need to save it. */
        int reg;

        /* Value (usually reset value), or write mask for idregs */
        u64 val;

        /* Custom get/set_user functions, fallback to generic if NULL */
        int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                        u64 *val);
        int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
                        u64 val);

        /* Return mask of REG_* runtime visibility overrides */
        unsigned int (*visibility)(const struct kvm_vcpu *vcpu,
                                   const struct sys_reg_desc *rd);
};

/*
 * Bit flags returned by a sys_reg_desc ->visibility() callback; tested by
 * sysreg_hidden(), sysreg_visible_as_raz() and sysreg_user_write_ignore().
 */
#define REG_HIDDEN                (1 << 0) /* hidden from userspace and guest */
#define REG_RAZ                        (1 << 1) /* RAZ from userspace and guest */
#define REG_USER_WI                (1 << 2) /* WI from userspace only */

/*
 * Emit one kvm_pr_unimpl() line consisting of the caller's printf-style
 * text (@fmt and its arguments, forwarded through %pV) followed by the
 * Op0/Op1/CRn/CRm/Op2 fields of @p and "func_write" or "func_read"
 * depending on p->is_write.
 *
 * @p:   access parameters to print.
 * @fmt: printf-style format for the leading text; it is only read, hence
 *       const-qualified.
 */
static __printf(2, 3)
inline void print_sys_reg_msg(const struct sys_reg_params *p,
                              const char *fmt, ...)
{
        va_list va;

        va_start(va, fmt);
        /* Look, we even formatted it for you to paste into the table! */
        kvm_pr_unimpl("%pV { Op0(%2u), Op1(%2u), CRn(%2u), CRm(%2u), Op2(%2u), func_%s },\n",
                      &(struct va_format){ fmt, &va },
                      p->Op0, p->Op1, p->CRn, p->CRm, p->Op2, p->is_write ? "write" : "read");
        va_end(va);
}

/*
 * Print only the encoding/direction part of the message for @p, i.e.
 * print_sys_reg_msg() with an empty leading text.
 */
static inline void print_sys_reg_instr(const struct sys_reg_params *p)
{
        /* GCC warns on an empty format string */
        print_sys_reg_msg(p, "%s", "");
}

/* Do nothing with @vcpu or @p and unconditionally return true. */
static inline bool ignore_write(struct kvm_vcpu *vcpu,
                                const struct sys_reg_params *p)
{
        return true;
}

/* Store 0 in p->regval and return true; @vcpu is not used. */
static inline bool read_zero(struct kvm_vcpu *vcpu,
                             struct sys_reg_params *p)
{
        p->regval = 0;
        return true;
}

/* Reset functions */
static inline u64 reset_unknown(struct kvm_vcpu *vcpu,
                                 const struct sys_reg_desc *r)
{
        BUG_ON(!r->reg);
        BUG_ON(r->reg >= NR_SYS_REGS);
        __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL;
        return __vcpu_sys_reg(vcpu, r->reg);
}

/*
 * Store the descriptor's ->val into the vcpu's copy of register r->reg and
 * return the value read back from it.  r->reg must be a non-zero index
 * below NR_SYS_REGS; anything else trips BUG_ON().
 */
static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
{
        const int idx = r->reg;

        BUG_ON(!idx);
        BUG_ON(idx >= NR_SYS_REGS);

        __vcpu_sys_reg(vcpu, idx) = r->val;

        return __vcpu_sys_reg(vcpu, idx);
}

/*
 * Return the REG_* mask produced by @r's ->visibility() callback, or 0
 * when the descriptor has no such callback (the expected common case).
 */
static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu,
                                             const struct sys_reg_desc *r)
{
        return likely(!r->visibility) ? 0 : r->visibility(vcpu, r);
}

/* True when @r's visibility mask for @vcpu has REG_HIDDEN set. */
static inline bool sysreg_hidden(const struct kvm_vcpu *vcpu,
                                 const struct sys_reg_desc *r)
{
        const unsigned int flags = sysreg_visibility(vcpu, r);

        return flags & REG_HIDDEN;
}

/* True when @r's visibility mask for @vcpu has REG_RAZ set. */
static inline bool sysreg_visible_as_raz(const struct kvm_vcpu *vcpu,
                                         const struct sys_reg_desc *r)
{
        const unsigned int flags = sysreg_visibility(vcpu, r);

        return flags & REG_RAZ;
}

/* True when @r's visibility mask for @vcpu has REG_USER_WI set. */
static inline bool sysreg_user_write_ignore(const struct kvm_vcpu *vcpu,
                                            const struct sys_reg_desc *r)
{
        const unsigned int flags = sysreg_visibility(vcpu, r);

        return flags & REG_USER_WI;
}

/*
 * Three-way comparison of two descriptors by encoding, most significant
 * field first: Op0, Op1, CRn, CRm, then Op2.  Returns a negative, zero or
 * positive value.
 *
 * A NULL descriptor compares greater than a non-NULL one.  Passing the
 * same pointer twice (including two NULLs) trips BUG_ON().
 */
static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
                              const struct sys_reg_desc *i2)
{
        int delta;

        BUG_ON(i1 == i2);

        if (!i1)
                return 1;
        if (!i2)
                return -1;

        delta = i1->Op0 - i2->Op0;
        if (delta)
                return delta;

        delta = i1->Op1 - i2->Op1;
        if (delta)
                return delta;

        delta = i1->CRn - i2->CRn;
        if (delta)
                return delta;

        delta = i1->CRm - i2->CRm;
        if (delta)
                return delta;

        return i1->Op2 - i2->Op2;
}

/*
 * Comparison callback handed to __inline_bsearch() by find_reg().
 *
 * @key: the wanted encoding, carried in the pointer value itself.
 * @elt: the struct sys_reg_desc table element being examined.
 *
 * Returns the wanted encoding minus the element's encoding, converted to
 * int: zero on a match, otherwise the sign gives the search direction.
 */
static inline int match_sys_reg(const void *key, const void *elt)
{
        const struct sys_reg_desc *desc = elt;
        const unsigned long wanted = (unsigned long)key;

        return wanted - reg_to_encoding(desc);
}

/*
 * Look up the descriptor whose encoding equals that of @params in @table
 * (@num entries) using __inline_bsearch() with match_sys_reg().  Being a
 * binary search, it relies on @table being ordered by encoding.
 *
 * Returns whatever __inline_bsearch() returns (presumably NULL when no
 * entry matches - confirm against its definition).
 */
static inline const struct sys_reg_desc *
find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[],
         unsigned int num)
{
        /* The encoding travels to the callback inside the key pointer. */
        const unsigned long encoding = reg_to_encoding(params);

        return __inline_bsearch((void *)encoding, table, num, sizeof(*table),
                                match_sys_reg);
}

const struct sys_reg_desc *get_reg_by_id(u64 id,
                                         const struct sys_reg_desc table[],
                                         unsigned int num);

int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
int kvm_sys_reg_get_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
                         const struct sys_reg_desc table[], unsigned int num);
int kvm_sys_reg_set_user(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg,
                         const struct sys_reg_desc table[], unsigned int num);

bool triage_sysreg_trap(struct kvm_vcpu *vcpu, int *sr_index);

int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu);

/*
 * Designated-initializer shorthands for struct sys_reg_desc entries:
 * AA32(x) sets .aarch32_map to AA32_<x>; the others set the encoding
 * field of the same name.
 */
#define AA32(_x)        .aarch32_map = AA32_##_x
#define Op0(_x)         .Op0 = _x
#define Op1(_x)         .Op1 = _x
#define CRn(_x)                .CRn = _x
#define CRm(_x)         .CRm = _x
#define Op2(_x)         .Op2 = _x

/*
 * Initializer fragment for a struct sys_reg_desc: .name is the stringified
 * @reg token and the five encoding fields are extracted from @reg with the
 * sys_reg_Op0()/Op1()/CRn()/CRm()/Op2() accessors.
 */
#define SYS_DESC(reg)                                        \
        .name = #reg,                                        \
        Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)),        \
        CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)),        \
        Op2(sys_reg_Op2(reg))

/*
 * Like SYS_DESC(), except that Op0 is forced to 0 and .aarch32_map is set
 * to AA32_DIRECT.
 */
#define CP15_SYS_DESC(reg)                                \
        .name = #reg,                                        \
        .aarch32_map = AA32_DIRECT,                        \
        Op0(0), Op1(sys_reg_Op1(reg)),                        \
        CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)),        \
        Op2(sys_reg_Op2(reg))

/*
 * Clamp field @field of register @reg inside @val so that it does not
 * exceed the named value @limit: the field is extracted with FIELD_GET(),
 * cleared in @val, and rewritten with the smaller of its current value and
 * SYS_FIELD_VALUE(reg, field, limit), compared as u64.
 *
 * @val is modified in place, so it must be an lvalue, and it is evaluated
 * more than once.  The macro evaluates to the updated @val.
 */
#define ID_REG_LIMIT_FIELD_ENUM(val, reg, field, limit)                               \
({                                                                               \
        u64 __f_val = FIELD_GET(reg##_##field##_MASK, val);                       \
        (val) &= ~reg##_##field##_MASK;                                               \
        (val) |= FIELD_PREP(reg##_##field##_MASK,                               \
                            min(__f_val,                                       \
                                (u64)SYS_FIELD_VALUE(reg, field, limit)));     \
        (val);                                                                       \
})

#endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




























































































































    3 























    3 



















































































































































































    3 





















































































































































































    3 

    3 
























































































































































































    3 






























































































































































































































































































































































































































































































































































































































    3 













    3 





























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_NETLINK_H
#define __NET_NETLINK_H

#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/jiffies.h>
#include <linux/in6.h>

/* ========================================================================
 *         Netlink Messages and Attributes Interface (As Seen On TV)
 * ------------------------------------------------------------------------
 *                          Messages Interface
 * ------------------------------------------------------------------------
 *
 * Message Format:
 *    <--- nlmsg_total_size(payload)  --->
 *    <-- nlmsg_msg_size(payload) ->
 *   +----------+- - -+-------------+- - -+-------- - -
 *   | nlmsghdr | Pad |   Payload   | Pad | nlmsghdr
 *   +----------+- - -+-------------+- - -+-------- - -
 *   nlmsg_data(nlh)---^                   ^
 *   nlmsg_next(nlh)-----------------------+
 *
 * Payload Format:
 *    <---------------------- nlmsg_len(nlh) --------------------->
 *    <------ hdrlen ------>       <- nlmsg_attrlen(nlh, hdrlen) ->
 *   +----------------------+- - -+--------------------------------+
 *   |     Family Header    | Pad |           Attributes           |
 *   +----------------------+- - -+--------------------------------+
 *   nlmsg_attrdata(nlh, hdrlen)---^
 *
 * Data Structures:
 *   struct nlmsghdr                        netlink message header
 *
 * Message Construction:
 *   nlmsg_new()                        create a new netlink message
 *   nlmsg_put()                        add a netlink message to an skb
 *   nlmsg_put_answer()                        callback based nlmsg_put()
 *   nlmsg_end()                        finalize netlink message
 *   nlmsg_get_pos()                        return current position in message
 *   nlmsg_trim()                        trim part of message
 *   nlmsg_cancel()                        cancel message construction
 *   nlmsg_consume()                        free a netlink message (expected)
 *   nlmsg_free()                        free a netlink message (drop)
 *
 * Message Sending:
 *   nlmsg_multicast()                        multicast message to several groups
 *   nlmsg_unicast()                        unicast a message to a single socket
 *   nlmsg_notify()                        send notification message
 *
 * Message Length Calculations:
 *   nlmsg_msg_size(payload)                length of message w/o padding
 *   nlmsg_total_size(payload)                length of message w/ padding
 *   nlmsg_padlen(payload)                length of padding at tail
 *
 * Message Payload Access:
 *   nlmsg_data(nlh)                        head of message payload
 *   nlmsg_len(nlh)                        length of message payload
 *   nlmsg_attrdata(nlh, hdrlen)        head of attributes data
 *   nlmsg_attrlen(nlh, hdrlen)                length of attributes data
 *
 * Message Parsing:
 *   nlmsg_ok(nlh, remaining)                does nlh fit into remaining bytes?
 *   nlmsg_next(nlh, remaining)                get next netlink message
 *   nlmsg_parse()                        parse attributes of a message
 *   nlmsg_find_attr()                        find an attribute in a message
 *   nlmsg_for_each_msg()                loop over all messages
 *   nlmsg_validate()                        validate netlink message incl. attrs
 *   nlmsg_for_each_attr()                loop over all attributes
 *
 * Misc:
 *   nlmsg_report()                        report back to application?
 *
 * ------------------------------------------------------------------------
 *                          Attributes Interface
 * ------------------------------------------------------------------------
 *
 * Attribute Format:
 *    <------- nla_total_size(payload) ------->
 *    <---- nla_attr_size(payload) ----->
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *   |  Header  | Pad |     Payload      | Pad |  Header
 *   +----------+- - -+- - - - - - - - - +- - -+-------- - -
 *                     <- nla_len(nla) ->      ^
 *   nla_data(nla)----^                        |
 *   nla_next(nla)-----------------------------'
 *
 * Data Structures:
 *   struct nlattr                        netlink attribute header
 *
 * Attribute Construction:
 *   nla_reserve(skb, type, len)        reserve room for an attribute
 *   nla_reserve_nohdr(skb, len)        reserve room for an attribute w/o hdr
 *   nla_put(skb, type, len, data)        add attribute to skb
 *   nla_put_nohdr(skb, len, data)        add attribute w/o hdr
 *   nla_append(skb, len, data)                append data to skb
 *
 * Attribute Construction for Basic Types:
 *   nla_put_u8(skb, type, value)        add u8 attribute to skb
 *   nla_put_u16(skb, type, value)        add u16 attribute to skb
 *   nla_put_u32(skb, type, value)        add u32 attribute to skb
 *   nla_put_u64_64bit(skb, type,
 *                     value, padattr)        add u64 attribute to skb
 *   nla_put_s8(skb, type, value)        add s8 attribute to skb
 *   nla_put_s16(skb, type, value)        add s16 attribute to skb
 *   nla_put_s32(skb, type, value)        add s32 attribute to skb
 *   nla_put_s64(skb, type, value,
 *               padattr)                add s64 attribute to skb
 *   nla_put_string(skb, type, str)        add string attribute to skb
 *   nla_put_flag(skb, type)                add flag attribute to skb
 *   nla_put_msecs(skb, type, jiffies,
 *                 padattr)                add msecs attribute to skb
 *   nla_put_in_addr(skb, type, addr)        add IPv4 address attribute to skb
 *   nla_put_in6_addr(skb, type, addr)        add IPv6 address attribute to skb
 *
 * Nested Attributes Construction:
 *   nla_nest_start(skb, type)                start a nested attribute
 *   nla_nest_end(skb, nla)                finalize a nested attribute
 *   nla_nest_cancel(skb, nla)                cancel nested attribute construction
 *   nla_put_empty_nest(skb, type)        create an empty nest
 *
 * Attribute Length Calculations:
 *   nla_attr_size(payload)                length of attribute w/o padding
 *   nla_total_size(payload)                length of attribute w/ padding
 *   nla_padlen(payload)                length of padding
 *
 * Attribute Payload Access:
 *   nla_data(nla)                        head of attribute payload
 *   nla_len(nla)                        length of attribute payload
 *
 * Attribute Payload Access for Basic Types:
 *   nla_get_uint(nla)                        get payload for a uint attribute
 *   nla_get_sint(nla)                        get payload for a sint attribute
 *   nla_get_u8(nla)                        get payload for a u8 attribute
 *   nla_get_u16(nla)                        get payload for a u16 attribute
 *   nla_get_u32(nla)                        get payload for a u32 attribute
 *   nla_get_u64(nla)                        get payload for a u64 attribute
 *   nla_get_s8(nla)                        get payload for a s8 attribute
 *   nla_get_s16(nla)                        get payload for a s16 attribute
 *   nla_get_s32(nla)                        get payload for a s32 attribute
 *   nla_get_s64(nla)                        get payload for a s64 attribute
 *   nla_get_flag(nla)                        return 1 if flag is true
 *   nla_get_msecs(nla)                        get payload for a msecs attribute
 *
 *   The same functions also exist with _default().
 *
 * Attribute Misc:
 *   nla_memcpy(dest, nla, count)        copy attribute into memory
 *   nla_memcmp(nla, data, size)        compare attribute with memory area
 *   nla_strscpy(dst, nla, size)        copy attribute to a sized string
 *   nla_strcmp(nla, str)                compare attribute with string
 *
 * Attribute Parsing:
 *   nla_ok(nla, remaining)                does nla fit into remaining bytes?
 *   nla_next(nla, remaining)                get next netlink attribute
 *   nla_validate()                        validate a stream of attributes
 *   nla_validate_nested()                validate a stream of nested attributes
 *   nla_find()                                find attribute in stream of attributes
 *   nla_find_nested()                        find attribute in nested attributes
 *   nla_parse()                        parse and validate stream of attrs
 *   nla_parse_nested()                        parse nested attributes
 *   nla_for_each_attr()                loop over all attributes
 *   nla_for_each_attr_type()                loop over all attributes with the
 *                                        given type
 *   nla_for_each_nested()                loop over the nested attributes
 *   nla_for_each_nested_type()                loop over the nested attributes with
 *                                        the given type
 *=========================================================================
 */

 /*
  * Standard attribute types to specify validation policy.
  *
  * The enumerators are numbered consecutively starting at NLA_UNSPEC (0).
  * __NLA_TYPE_MAX is a trailing sentinel rather than a type of its own;
  * NLA_TYPE_MAX is derived from it.
  */
enum {
        NLA_UNSPEC,
        NLA_U8,
        NLA_U16,
        NLA_U32,
        NLA_U64,
        NLA_STRING,
        NLA_FLAG,
        NLA_MSECS,
        NLA_NESTED,
        NLA_NESTED_ARRAY,
        NLA_NUL_STRING,
        NLA_BINARY,
        NLA_S8,
        NLA_S16,
        NLA_S32,
        NLA_S64,
        NLA_BITFIELD32,
        NLA_REJECT,
        NLA_BE16,
        NLA_BE32,
        NLA_SINT,
        NLA_UINT,
        __NLA_TYPE_MAX,        /* sentinel: one past the last attribute type */
};

#define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1)

/*
 * Full-width unsigned min/max pair. A policy entry points at one of these
 * through its `range' member when NLA_POLICY_FULL_RANGE() is used, i.e. when
 * the limits do not fit the s16 min/max stored inline in struct nla_policy.
 */
struct netlink_range_validation {
        u64 min, max;
};

/*
 * Signed counterpart of struct netlink_range_validation, referenced through
 * the `range_signed' member by NLA_POLICY_FULL_RANGE_SIGNED().
 */
struct netlink_range_validation_signed {
        s64 min, max;
};

/*
 * Extra validation performed in addition to the type-specific checks; stored
 * in the `validation_type' member of struct nla_policy. Each value is set by
 * the NLA_POLICY_*() initializer macro named next to it.
 */
enum nla_policy_validation {
        NLA_VALIDATE_NONE,
        NLA_VALIDATE_RANGE,                     /* NLA_POLICY_RANGE(): uses min and max */
        NLA_VALIDATE_RANGE_WARN_TOO_LONG,       /* NLA_POLICY_EXACT_LEN_WARN() */
        NLA_VALIDATE_MIN,                       /* NLA_POLICY_MIN(): uses min only */
        NLA_VALIDATE_MAX,                       /* NLA_POLICY_MAX(): uses max only */
        NLA_VALIDATE_MASK,                      /* NLA_POLICY_MASK(): uses mask */
        NLA_VALIDATE_RANGE_PTR,                 /* NLA_POLICY_FULL_RANGE{,_SIGNED}(): uses range pointer */
        NLA_VALIDATE_FUNCTION,                  /* NLA_POLICY_VALIDATE_FN(): uses validate callback */
};

/**
 * struct nla_policy - attribute validation policy
 * @type: Type of attribute or NLA_UNSPEC
 * @validation_type: type of attribute validation done in addition to
 *        type-specific validation (e.g. range, function call), see
 *        &enum nla_policy_validation
 * @len: Type specific length of payload
 *
 * Policies are defined as arrays of this struct, the array must be
 * accessible by attribute type up to the highest identifier to be expected.
 *
 * Meaning of `len' field:
 *    NLA_STRING           Maximum length of string
 *    NLA_NUL_STRING       Maximum length of string (excluding NUL)
 *    NLA_FLAG             Unused
 *    NLA_BINARY           Maximum length of attribute payload
 *                         (but see also below with the validation type)
 *    NLA_NESTED,
 *    NLA_NESTED_ARRAY     Length verification is done by checking len of
 *                         nested header (or empty); len field is used if
 *                         nested_policy is also used, for the max attr
 *                         number in the nested policy.
 *    NLA_SINT, NLA_UINT,
 *    NLA_U8, NLA_U16,
 *    NLA_U32, NLA_U64,
 *    NLA_S8, NLA_S16,
 *    NLA_S32, NLA_S64,
 *    NLA_BE16, NLA_BE32,
 *    NLA_MSECS            Leaving the length field zero will verify the
 *                         given type fits, using it verifies minimum length
 *                         just like "All other"
 *    NLA_BITFIELD32       Unused
 *    NLA_REJECT           Unused
 *    All other            Minimum length of attribute payload
 *
 * Meaning of validation union:
 *    NLA_BITFIELD32       This is a 32-bit bitmap/bitselector attribute and
 *                         `bitfield32_valid' is the u32 value of valid flags
 *    NLA_REJECT           This attribute is always rejected and `reject_message'
 *                         may point to a string to report as the error instead
 *                         of the generic one in extended ACK.
 *    NLA_NESTED           `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED() macro.
 *                         Note that nla_parse() will validate, but of course not
 *                         parse, the nested sub-policies.
 *    NLA_NESTED_ARRAY     `nested_policy' points to a nested policy to validate,
 *                         must also set `len' to the max attribute number. Use
 *                         the provided NLA_POLICY_NESTED_ARRAY() macro.
 *                         The difference to NLA_NESTED is the structure:
 *                         NLA_NESTED has the nested attributes directly inside
 *                         while an array has the nested attributes at another
 *                         level down and the attribute types directly in the
 *                         nesting don't matter.
 *    NLA_UINT,
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64,
 *    NLA_BE16,
 *    NLA_BE32,
 *    NLA_SINT,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              The `min' and `max' fields are used depending on the
 *                         validation_type field, if that is min/max/range then
 *                         the min, max or both are used (respectively) to check
 *                         the value of the integer attribute.
 *                         Note that in the interest of code simplicity and
 *                         struct size both limits are s16, so you cannot
 *                         enforce a range that doesn't fall within the range
 *                         of s16 - do that using the NLA_POLICY_FULL_RANGE()
 *                         or NLA_POLICY_FULL_RANGE_SIGNED() macros instead.
 *                         Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and
 *                         NLA_POLICY_RANGE() macros.
 *    NLA_UINT,
 *    NLA_U8,
 *    NLA_U16,
 *    NLA_U32,
 *    NLA_U64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range' must be a pointer
 *                         to a struct netlink_range_validation that indicates
 *                         the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE().
 *    NLA_SINT,
 *    NLA_S8,
 *    NLA_S16,
 *    NLA_S32,
 *    NLA_S64              If the validation_type field instead is set to
 *                         NLA_VALIDATE_RANGE_PTR, `range_signed' must be a
 *                         pointer to a struct netlink_range_validation_signed
 *                         that indicates the min/max values.
 *                         Use NLA_POLICY_FULL_RANGE_SIGNED().
 *
 *    NLA_BINARY           If the validation type is like the ones for integers
 *                         above, then the min/max length (not value like for
 *                         integers) of the attribute is enforced.
 *
 *    All other            Unused - but note that it's a union
 *
 * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN:
 *    NLA_BINARY           Validation function called for the attribute.
 *    All other            Unused - but note that it's a union
 *
 * Example:
 *
 * static const u32 myvalidflags = 0xff231023;
 *
 * static const struct nla_policy my_policy[ATTR_MAX+1] = {
 *        [ATTR_FOO] = { .type = NLA_U16 },
 *        [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ },
 *        [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)),
 *        [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags),
 * };
 */
struct nla_policy {
        u8                type;                 /* one of the NLA_* attribute types */
        u8                validation_type;      /* enum nla_policy_validation value */
        u16                len;                 /* meaning depends on type */
        /*
         * Exactly one member of this union is meaningful for a given entry;
         * which one depends on `type' and `validation_type'.
         */
        union {
                /**
                 * @strict_start_type: first attribute to validate strictly
                 *
                 * This entry is special, and used for the attribute at index 0
                 * only, and specifies special data about the policy, namely it
                 * specifies the "boundary type" where strict length validation
                 * starts for any attribute types >= this value, also, strict
                 * nesting validation starts here.
                 *
                 * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT
                 * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to
                 * get the previous pure { .len = xyz } behaviour. The advantage
                 * of this is that types not specified in the policy will be
                 * rejected.
                 *
                 * For completely new families it should be set to 1 so that the
                 * validation is enforced for all attributes. For existing ones
                 * it should be set at least when new attributes are added to
                 * the enum used by the policy, and be set to the new value that
                 * was added to enforce strict validation from thereon.
                 */
                u16 strict_start_type;

                /* private: use NLA_POLICY_*() to set */
                /* NLA_BITFIELD32: u32 value of valid flags */
                const u32 bitfield32_valid;
                /* NLA_VALIDATE_MASK: set by NLA_POLICY_MASK() */
                const u32 mask;
                /* NLA_REJECT: optional string reported via extended ACK */
                const char *reject_message;
                /* NLA_NESTED / NLA_NESTED_ARRAY: policy for the nested attributes */
                const struct nla_policy *nested_policy;
                /* NLA_VALIDATE_RANGE_PTR with an unsigned type */
                const struct netlink_range_validation *range;
                /* NLA_VALIDATE_RANGE_PTR with a signed type */
                const struct netlink_range_validation_signed *range_signed;
                /* NLA_VALIDATE_MIN / _MAX / _RANGE: inline limits, s16 only */
                struct {
                        s16 min, max;
                };
                /* NLA_VALIDATE_FUNCTION: set by NLA_POLICY_VALIDATE_FN() */
                int (*validate)(const struct nlattr *attr,
                                struct netlink_ext_ack *extack);
        };
};

/*
 * Binary attribute whose payload length is bounded to ETH_ALEN on both sides.
 * The _COMPAT form uses NLA_POLICY_EXACT_LEN_WARN() instead.
 * NOTE(review): the warn-vs-reject handling lives in the validator, which is
 * not visible here - confirm before relying on it.
 */
#define NLA_POLICY_ETH_ADDR                NLA_POLICY_EXACT_LEN(ETH_ALEN)
#define NLA_POLICY_ETH_ADDR_COMPAT        NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN)

/*
 * Initializers for nested attributes. The underscore variants take the
 * maximum attribute number explicitly; the plain variants derive it from the
 * policy array as ARRAY_SIZE(policy) - 1, so `policy' must be a real array
 * (not a pointer) at the point of use.
 */
#define _NLA_POLICY_NESTED(maxattr, policy) \
        { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr }
#define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \
        { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr }
#define NLA_POLICY_NESTED(policy) \
        _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy)
#define NLA_POLICY_NESTED_ARRAY(policy) \
        _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy)
/* Initializer for an NLA_BITFIELD32 attribute; `valid' is the u32 value of valid flags. */
#define NLA_POLICY_BITFIELD32(valid) \
        { .type = NLA_BITFIELD32, .bitfield32_valid = valid }

/*
 * Type predicates used by the NLA_ENSURE_*() build-time checks below.
 * Every use of the argument is parenthesized so that an expression argument
 * (for example a conditional expression) is compared as a whole.
 */

/* True when @tp is one of the unsigned (or big-endian) integer types. */
#define __NLA_IS_UINT_TYPE(tp)                                          \
        ((tp) == NLA_U8 || (tp) == NLA_U16 || (tp) == NLA_U32 ||        \
         (tp) == NLA_U64 || (tp) == NLA_UINT ||                         \
         (tp) == NLA_BE16 || (tp) == NLA_BE32)

/* True when @tp is one of the signed integer types. */
#define __NLA_IS_SINT_TYPE(tp)                                          \
        ((tp) == NLA_S8 || (tp) == NLA_S16 || (tp) == NLA_S32 ||        \
         (tp) == NLA_S64 || (tp) == NLA_SINT)

/*
 * Build-time assertion usable inside an expression. The NLA_ENSURE_*()
 * macros add its result to @tp, so it is expected to evaluate to 0 when
 * @condition holds (BUILD_BUG_ON_ZERO() is defined elsewhere).
 */
#define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition))

/* Evaluates to @tp; rejects anything but an unsigned integer type. */
#define NLA_ENSURE_UINT_TYPE(tp)                                \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + (tp))

/* Evaluates to @tp; accepts unsigned integer types, NLA_MSECS and NLA_BINARY. */
#define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp)                      \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||                 \
                      (tp) == NLA_MSECS ||                      \
                      (tp) == NLA_BINARY) + (tp))

/* Evaluates to @tp; rejects anything but a signed integer type. */
#define NLA_ENSURE_SINT_TYPE(tp)                                \
        (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + (tp))

/* Evaluates to @tp; accepts any integer type, NLA_MSECS and NLA_BINARY. */
#define NLA_ENSURE_INT_OR_BINARY_TYPE(tp)                       \
        (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) ||                 \
                      __NLA_IS_SINT_TYPE(tp) ||                 \
                      (tp) == NLA_MSECS ||                      \
                      (tp) == NLA_BINARY) + (tp))

/*
 * Evaluates to @tp; rejects the types whose policy entry already uses the
 * validation union for something else (bitfield mask, reject message,
 * nested policy), since a validate callback would alias that member.
 */
#define NLA_ENSURE_NO_VALIDATION_PTR(tp)                        \
        (__NLA_ENSURE((tp) != NLA_BITFIELD32 &&                 \
                      (tp) != NLA_REJECT &&                     \
                      (tp) != NLA_NESTED &&                     \
                      (tp) != NLA_NESTED_ARRAY) + (tp))

/*
 * Initializers for struct nla_policy entries. Each one checks the attribute
 * type at build time through an NLA_ENSURE_*() macro and selects the matching
 * validation_type. For NLA_BINARY the limits apply to the payload length, for
 * integer types to the value. The inline min/max members are s16.
 */

/* Limit to both _min and _max (inline s16 limits). */
#define NLA_POLICY_RANGE(tp, _min, _max) {                \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE,                \
        .min = _min,                                        \
        .max = _max                                        \
}

/* Limit via a pointer to a struct netlink_range_validation (unsigned). */
#define NLA_POLICY_FULL_RANGE(tp, _range) {                \
        .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range = _range,                                \
}

/* Limit via a pointer to a struct netlink_range_validation_signed. */
#define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) {        \
        .type = NLA_ENSURE_SINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_RANGE_PTR,        \
        .range_signed = _range,                                \
}

/* Lower limit only. */
#define NLA_POLICY_MIN(tp, _min) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MIN,                \
        .min = _min,                                        \
}

/* Upper limit only. */
#define NLA_POLICY_MAX(tp, _max) {                        \
        .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp),        \
        .validation_type = NLA_VALIDATE_MAX,                \
        .max = _max,                                        \
}

/* Mask validation for unsigned integer types. */
#define NLA_POLICY_MASK(tp, _mask) {                        \
        .type = NLA_ENSURE_UINT_TYPE(tp),                \
        .validation_type = NLA_VALIDATE_MASK,                \
        .mask = _mask,                                        \
}

/*
 * Validation through a callback. The optional trailing argument becomes
 * `len'; when it is omitted the expression collapses to "+ 0", i.e. len = 0.
 */
#define NLA_POLICY_VALIDATE_FN(tp, fn, ...) {                \
        .type = NLA_ENSURE_NO_VALIDATION_PTR(tp),        \
        .validation_type = NLA_VALIDATE_FUNCTION,        \
        .validate = fn,                                        \
        .len = __VA_ARGS__ + 0,                                \
}

/* Binary attribute length helpers built on the macros above. */
#define NLA_POLICY_EXACT_LEN(_len)        NLA_POLICY_RANGE(NLA_BINARY, _len, _len)
/*
 * Same limits as NLA_POLICY_EXACT_LEN() but with
 * NLA_VALIDATE_RANGE_WARN_TOO_LONG as the validation type.
 * NOTE(review): presumably over-long payloads only trigger a warning -
 * confirm against the validator implementation.
 */
#define NLA_POLICY_EXACT_LEN_WARN(_len) {                        \
        .type = NLA_BINARY,                                        \
        .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG,        \
        .min = _len,                                                \
        .max = _len                                                \
}
#define NLA_POLICY_MIN_LEN(_len)        NLA_POLICY_MIN(NLA_BINARY, _len)
#define NLA_POLICY_MAX_LEN(_len)        NLA_POLICY_MAX(NLA_BINARY, _len)

/**
 * struct nl_info - netlink source information
 * @nlh: Netlink message header of original request
 * @nl_net: Network namespace
 * @portid: Netlink PORTID of requesting application
 * @skip_notify: Skip netlink notifications to user space
 * @skip_notify_kernel: Skip selected in-kernel notifications
 */
struct nl_info {
        struct nlmsghdr                *nlh;
        struct net                *nl_net;
        u32                        portid;
        /* two single-bit flags sharing one u8 */
        u8                        skip_notify:1,
                                skip_notify_kernel:1;
};

/**
 * enum netlink_validation - netlink message/attribute validation levels
 * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about
 *        extra data at the end of the message, attributes being longer than
 *        they should be, or unknown attributes being present.
 * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing.
 * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING
 *        this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict().
 * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy.
 *        This can safely be set by the kernel when the given policy has no
 *        NLA_UNSPEC anymore, and can thus be used to ensure policy entries
 *        are enforced going forward.
 * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g.
 *        U8, U16, U32 must have exact size, etc.)
 * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY)
 *        and unset for other policies.
 *
 * Apart from NL_VALIDATE_LIBERAL (zero), every value is a distinct bit so
 * the levels can be OR-ed into the `validate' mask taken by __nla_parse()
 * and __nla_validate().
 */
enum netlink_validation {
        NL_VALIDATE_LIBERAL = 0,
        NL_VALIDATE_TRAILING = BIT(0),
        NL_VALIDATE_MAXTYPE = BIT(1),
        NL_VALIDATE_UNSPEC = BIT(2),
        NL_VALIDATE_STRICT_ATTRS = BIT(3),
        NL_VALIDATE_NESTED = BIT(4),
};

/* Mask used by the *_deprecated_strict() helpers: trailing data and maxtype only. */
#define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\
                                       NL_VALIDATE_MAXTYPE)
/* Mask with every validation bit set; used by nla_parse(), nla_validate() and nlmsg_parse(). */
#define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\
                            NL_VALIDATE_MAXTYPE |\
                            NL_VALIDATE_UNSPEC |\
                            NL_VALIDATE_STRICT_ATTRS |\
                            NL_VALIDATE_NESTED)

/*
 * Out-of-line helpers. Only the prototypes are visible in this header; the
 * one-line descriptions below are limited to what the prototypes and the
 * inline wrappers in this file show.
 */

/* Message reception and notification. */
int netlink_rcv_skb(struct sk_buff *skb,
                    int (*cb)(struct sk_buff *, struct nlmsghdr *,
                              struct netlink_ext_ack *));
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
                 unsigned int group, int report, gfp_t flags);

/*
 * Core validate/parse entry points. `validate' is a mask of
 * enum netlink_validation bits; the nla_parse*(), nla_validate*() and
 * nlmsg_parse*() inline wrappers select it for the caller.
 */
int __nla_validate(const struct nlattr *head, int len, int maxtype,
                   const struct nla_policy *policy, unsigned int validate,
                   struct netlink_ext_ack *extack);
int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
                int len, const struct nla_policy *policy, unsigned int validate,
                struct netlink_ext_ack *extack);
int nla_policy_len(const struct nla_policy *, int);
/* Attribute lookup and payload copy/compare helpers. */
struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype);
ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize);
char *nla_strdup(const struct nlattr *nla, gfp_t flags);
int nla_memcpy(void *dest, const struct nlattr *src, int count);
int nla_memcmp(const struct nlattr *nla, const void *data, size_t size);
int nla_strcmp(const struct nlattr *nla, const char *str);
/* Attribute construction: reserve room in an skb. */
struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                   int attrlen, int padattr);
void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen);
struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype,
                                 int attrlen, int padattr);
void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen);
/* Attribute construction: copy data into an skb. */
void __nla_put(struct sk_buff *skb, int attrtype, int attrlen,
               const void *data);
void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                     const void *data, int padattr);
void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data);
int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen,
                  const void *data, int padattr);
int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data);
int nla_append(struct sk_buff *skb, int attrlen, const void *data);

/**************************************************************************
 * Netlink Messages
 **************************************************************************/

/**
 * nlmsg_msg_size - size of a netlink message without trailing padding
 * @payload: length of message payload
 *
 * Returns: @payload plus the netlink header length (NLMSG_HDRLEN).
 */
static inline int nlmsg_msg_size(int payload)
{
        int unpadded = payload;

        unpadded += NLMSG_HDRLEN;

        return unpadded;
}

/**
 * nlmsg_total_size - size of a netlink message including trailing padding
 * @payload: length of message payload
 *
 * Returns: nlmsg_msg_size(@payload) rounded up with NLMSG_ALIGN().
 */
static inline int nlmsg_total_size(int payload)
{
        int unpadded = nlmsg_msg_size(payload);

        return NLMSG_ALIGN(unpadded);
}

/**
 * nlmsg_padlen - number of padding bytes at the tail of a message
 * @payload: length of message payload
 *
 * Returns: the difference between the aligned and the unaligned message size.
 */
static inline int nlmsg_padlen(int payload)
{
        int padded = nlmsg_total_size(payload);
        int unpadded = nlmsg_msg_size(payload);

        return padded - unpadded;
}

/**
 * nlmsg_data - start of the message payload
 * @nlh: netlink message header
 *
 * Returns: pointer to the byte located NLMSG_HDRLEN bytes after @nlh.
 */
static inline void *nlmsg_data(const struct nlmsghdr *nlh)
{
        unsigned char *base = (unsigned char *) nlh;

        return base + NLMSG_HDRLEN;
}

/**
 * nlmsg_len - length of the message payload
 * @nlh: netlink message header
 *
 * Returns: the nlmsg_len field of @nlh minus the header length.
 */
static inline int nlmsg_len(const struct nlmsghdr *nlh)
{
        /* Subtract directly from the header field to keep its arithmetic type. */
        return nlh->nlmsg_len - (NLMSG_HDRLEN);
}

/**
 * nlmsg_attrdata - start of the attribute stream
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 *
 * Returns: pointer to the first attribute, i.e. the payload start advanced
 * by the aligned family header length.
 */
static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh,
                                            int hdrlen)
{
        unsigned char *cursor = nlmsg_data(nlh);

        cursor += NLMSG_ALIGN(hdrlen);

        return (struct nlattr *) cursor;
}

/**
 * nlmsg_attrlen - length of the attribute stream
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 *
 * Returns: the payload length minus the aligned family header length.
 */
static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen)
{
        int payload_len = nlmsg_len(nlh);

        return payload_len - NLMSG_ALIGN(hdrlen);
}

/**
 * nlmsg_ok - check if the netlink message fits into the remaining bytes
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Returns: non-zero when @remaining can hold a header, the message claims at
 * least a header's worth of bytes, and the claimed length does not exceed
 * @remaining; zero otherwise.
 */
static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining)
{
        /* Not even a header left: @nlh must not be dereferenced. */
        if (remaining < (int) sizeof(struct nlmsghdr))
                return 0;

        /* The message must at least cover its own header. */
        if (nlh->nlmsg_len < sizeof(struct nlmsghdr))
                return 0;

        return nlh->nlmsg_len <= remaining;
}

/**
 * nlmsg_next - next netlink message in message stream
 * @nlh: netlink message header
 * @remaining: number of bytes remaining in message stream
 *
 * Steps over the current message using its aligned length. No bounds check
 * is done here.
 *
 * Returns: the next netlink message in the message stream and
 * decrements remaining by the size of the current message.
 */
static inline struct nlmsghdr *
nlmsg_next(const struct nlmsghdr *nlh, int *remaining)
{
        int step = NLMSG_ALIGN(nlh->nlmsg_len);
        unsigned char *cursor = (unsigned char *) nlh;

        *remaining = *remaining - step;

        return (struct nlmsghdr *) (cursor + step);
}

/**
 * nla_parse - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Runs __nla_parse() with NL_VALIDATE_STRICT: attributes with a type
 * exceeding maxtype will be rejected, policy must be specified, and
 * attributes are validated in the strictest way possible.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_parse(struct nlattr **tb, int maxtype,
                            const struct nlattr *head, int len,
                            const struct nla_policy *policy,
                            struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Runs __nla_parse() with NL_VALIDATE_LIBERAL: attributes with a type
 * exceeding maxtype will be ignored and attributes from the policy are not
 * always strictly validated (only for new attributes).
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype,
                                       const struct nlattr *head, int len,
                                       const struct nla_policy *policy,
                                       struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @policy: validation policy
 * @extack: extended ACK pointer
 *
 * Stores a pointer to each parsed attribute in @tb, indexed by attribute
 * type. Runs __nla_parse() with NL_VALIDATE_DEPRECATED_STRICT: attributes
 * with a type exceeding maxtype will be rejected as well as trailing data,
 * but the policy is not completely strictly validated (only for new
 * attributes).
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype,
                                              const struct nlattr *head,
                                              int len,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nla_parse(tb, maxtype, head, len, policy, validate, extack);
}

/**
 * __nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Fails with -EINVAL and an extack message when the message is too short to
 * contain the family specific header; otherwise hands the attribute stream
 * to __nla_parse().
 *
 * See nla_parse()
 */
static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                                struct nlattr *tb[], int maxtype,
                                const struct nla_policy *policy,
                                unsigned int validate,
                                struct netlink_ext_ack *extack)
{
        /* Common case first: the message covers the family header. */
        if (nlh->nlmsg_len >= nlmsg_msg_size(hdrlen)) {
                const struct nlattr *attrs = nlmsg_attrdata(nlh, hdrlen);
                int attrs_len = nlmsg_attrlen(nlh, hdrlen);

                return __nla_parse(tb, maxtype, attrs, attrs_len, policy,
                                   validate, extack);
        }

        NL_SET_ERR_MSG(extack, "Invalid header length");
        return -EINVAL;
}

/**
 * nlmsg_parse - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Runs __nlmsg_parse() with NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 */
static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Runs __nlmsg_parse() with NL_VALIDATE_LIBERAL.
 *
 * See nla_parse_deprecated()
 */
static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen,
                                         struct nlattr *tb[], int maxtype,
                                         const struct nla_policy *policy,
                                         struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_parse_deprecated_strict - parse attributes of a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Runs __nlmsg_parse() with NL_VALIDATE_DEPRECATED_STRICT.
 *
 * See nla_parse_deprecated_strict()
 */
static inline int
nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen,
                              struct nlattr *tb[], int maxtype,
                              const struct nla_policy *policy,
                              struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_DEPRECATED_STRICT;

        return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, validate,
                             extack);
}

/**
 * nlmsg_find_attr - find a specific attribute in a netlink message
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @attrtype: type of attribute to look for
 *
 * Searches the attribute stream that follows the family specific header
 * using nla_find().
 *
 * Returns: the first attribute which matches the specified type.
 */
static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh,
                                             int hdrlen, int attrtype)
{
        struct nlattr *first = nlmsg_attrdata(nlh, hdrlen);
        int span = nlmsg_attrlen(nlh, hdrlen);

        return nla_find(first, span, attrtype);
}

/**
 * nla_validate_deprecated - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy by running
 * __nla_validate() with NL_VALIDATE_LIBERAL (liberal mode).
 * See documentation of struct nla_policy for more details.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_validate_deprecated(const struct nlattr *head, int len,
                                          int maxtype,
                                          const struct nla_policy *policy,
                                          struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_LIBERAL;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nla_validate - Validate a stream of attributes
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Checks every attribute in the stream against @policy by running
 * __nla_validate() with NL_VALIDATE_STRICT (strict mode).
 * See documentation of struct nla_policy for more details.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int nla_validate(const struct nlattr *head, int len, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        const unsigned int validate = NL_VALIDATE_STRICT;

        return __nla_validate(head, len, maxtype, policy, validate, extack);
}

/**
 * nlmsg_validate_deprecated - validate a netlink message including attributes
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Validates the attributes that follow the family specific header against
 * @policy in liberal mode (NL_VALIDATE_LIBERAL).
 *
 * Returns: 0 on success or a negative error code; -EINVAL with an extack
 * message when the message is too short to hold the family specific header.
 */
static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh,
                                            int hdrlen, int maxtype,
                                            const struct nla_policy *policy,
                                            struct netlink_ext_ack *extack)
{
        if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) {
                /* Same diagnostic as __nlmsg_parse() so the failure is not silent. */
                NL_SET_ERR_MSG(extack, "Invalid header length");
                return -EINVAL;
        }

        return __nla_validate(nlmsg_attrdata(nlh, hdrlen),
                              nlmsg_attrlen(nlh, hdrlen), maxtype,
                              policy, NL_VALIDATE_LIBERAL, extack);
}



/**
 * nlmsg_report - need to report back to application?
 * @nlh: netlink message header, may be NULL
 *
 * Returns: 1 if @nlh is non-NULL and has NLM_F_ECHO set in nlmsg_flags,
 * 0 otherwise.
 */
static inline int nlmsg_report(const struct nlmsghdr *nlh)
{
        if (!nlh)
                return 0;

        return (nlh->nlmsg_flags & NLM_F_ECHO) != 0;
}

/**
 * nlmsg_seq - return the seq number of netlink message
 * @nlh: netlink message header, may be NULL
 *
 * Returns: the nlmsg_seq field of @nlh, or 0 if netlink message is NULL
 */
static inline u32 nlmsg_seq(const struct nlmsghdr *nlh)
{
        if (!nlh)
                return 0;

        return nlh->nlmsg_seq;
}

/**
 * nlmsg_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @nlh: netlink message header
 * @hdrlen: length of family specific header
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() over the attribute stream of @nlh.
 * @nlh and @hdrlen each appear twice in the expansion, so pass expressions
 * without side effects.
 */
#define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \
        nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \
                          nlmsg_attrlen(nlh, hdrlen), rem)

/**
 * nlmsg_put - Add a new netlink message to an skb
 * @skb: socket buffer to store message in
 * @portid: netlink PORTID of requesting application
 * @seq: sequence number of message
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Checks the tailroom against the padded message size and then defers to
 * __nlmsg_put().
 *
 * Returns: NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
                                         int type, int payload, int flags)
{
        const int needed = nlmsg_total_size(payload);

        if (unlikely(skb_tailroom(skb) < needed))
                return NULL;

        return __nlmsg_put(skb, portid, seq, type, payload, flags);
}

/**
 * nlmsg_append - Add more data to a nlmsg in a skb
 * @skb: socket buffer to store message in
 * @size: length of message payload
 *
 * Append data to an existing nlmsg, used when constructing a message
 * with multiple fixed-format headers (which is rare). The alignment
 * padding that follows the @size bytes is zeroed before the tail is
 * extended by the aligned size.
 *
 * Returns: NULL if the tailroom of the skb is insufficient to store
 * the extra payload.
 */
static inline void *nlmsg_append(struct sk_buff *skb, u32 size)
{
        const u32 aligned = NLMSG_ALIGN(size);
        const u32 pad = aligned - size;

        if (unlikely(skb_tailroom(skb) < aligned))
                return NULL;

        /* Clear the padding bytes located right after the new data. */
        if (pad != 0)
                memset(skb_tail_pointer(skb) + size, 0, pad);

        return __skb_put(skb, aligned);
}

/**
 * nlmsg_put_answer - Add a new callback based netlink message to an skb
 * @skb: socket buffer to store message in
 * @cb: netlink callback
 * @type: message type
 * @payload: length of message payload
 * @flags: message flags
 *
 * Returns: NULL if the tailroom of the skb is insufficient to store
 * the message header and payload.
 */
static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb,
                                                struct netlink_callback *cb,
                                                int type, int payload,
                                                int flags)
{
        return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                         type, payload, flags);
}

/**
 * nlmsg_new - Allocate a new netlink message
 * @payload: size of the message payload
 * @flags: the type of memory to allocate.
 *
 * The skb is sized with nlmsg_total_size(@payload) and obtained from
 * alloc_skb().
 *
 * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known
 * and a good default is needed.
 */
static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags)
{
        const int total = nlmsg_total_size(payload);

        return alloc_skb(total, flags);
}

/**
 * nlmsg_new_large - Allocate a new netlink message with non-contiguous
 * physical memory
 * @payload: size of the message payload
 *
 * The skb is sized with nlmsg_total_size(@payload) and requested from
 * netlink_alloc_large_skb() with 0 as its second argument.
 *
 * The allocated skb is unable to have frag page for shinfo->frags*,
 * as the NULL setting for skb->head in netlink_skb_destructor() will
 * bypass most of the handling in skb_release_data()
 */
static inline struct sk_buff *nlmsg_new_large(size_t payload)
{
        const int total = nlmsg_total_size(payload);

        return netlink_alloc_large_skb(total, 0);
}

/**
 * nlmsg_end - Finalize a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Corrects the netlink message header to include the appended
 * attributes. Only necessary if attributes have been added to
 * the message.
 */
static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
}

/**
 * nlmsg_get_pos - return current position in netlink message
 * @skb: socket buffer the message is stored in
 *
 * Returns: a pointer to the current tail of the message, as reported
 * by skb_tail_pointer().
 */
static inline void *nlmsg_get_pos(struct sk_buff *skb)
{
        void *tail = skb_tail_pointer(skb);

        return tail;
}

/**
 * nlmsg_trim - Trim message to a mark
 * @skb: socket buffer the message is stored in
 * @mark: mark to trim to, may be NULL
 *
 * Trims the message to the provided mark. A NULL @mark is a no-op.
 * A mark that lies before @skb->data triggers WARN_ON().
 */
static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
        const unsigned char *pos = mark;

        if (!pos)
                return;

        WARN_ON(pos < skb->data);
        skb_trim(skb, pos - skb->data);
}

/**
 * nlmsg_cancel - Cancel construction of a netlink message
 * @skb: socket buffer the message is stored in
 * @nlh: netlink message header
 *
 * Trims @skb back to the start of @nlh via nlmsg_trim(), which removes
 * the complete netlink message including all attributes from the
 * socket buffer again.
 */
static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh)
{
        const void *start = nlh;

        nlmsg_trim(skb, start);
}

/**
 * nlmsg_free - drop a netlink message
 * @skb: socket buffer of netlink message
 *
 * Releases @skb through kfree_skb().
 */
static inline void nlmsg_free(struct sk_buff *skb)
{
        kfree_skb(skb);
}

/**
 * nlmsg_consume - free a netlink message
 * @skb: socket buffer of netlink message
 *
 * Releases @skb through consume_skb().
 */
static inline void nlmsg_consume(struct sk_buff *skb)
{
        consume_skb(skb);
}

/**
 * nlmsg_multicast_filtered - multicast a netlink message with filter function
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 * @filter: filter function
 * @filter_data: filter function private data
 *
 * Records @group as the destination group in NETLINK_CB(@skb) and then
 * broadcasts through netlink_broadcast_filtered().
 *
 * Return: 0 on success, negative error code for failure.
 */
static inline int nlmsg_multicast_filtered(struct sock *sk, struct sk_buff *skb,
                                           u32 portid, unsigned int group,
                                           gfp_t flags,
                                           netlink_filter_fn filter,
                                           void *filter_data)
{
        int ret;

        NETLINK_CB(skb).dst_group = group;

        ret = netlink_broadcast_filtered(sk, skb, portid, group, flags,
                                         filter, filter_data);

        /* positive results are reported as plain success */
        return ret > 0 ? 0 : ret;
}

/**
 * nlmsg_multicast - multicast a netlink message
 * @sk: netlink socket to spread messages to
 * @skb: netlink message as socket buffer
 * @portid: own netlink portid to avoid sending to yourself
 * @group: multicast group id
 * @flags: allocation flags
 *
 * Return: the result of nlmsg_multicast_filtered() called without a
 * filter function or filter data.
 */
static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb,
                                  u32 portid, unsigned int group, gfp_t flags)
{
        return nlmsg_multicast_filtered(sk, skb, portid, group, flags,
                                        NULL, NULL);
}

/**
 * nlmsg_unicast - unicast a netlink message
 * @sk: netlink socket to spread message to
 * @skb: netlink message as socket buffer
 * @portid: netlink portid of the destination socket
 *
 * Sends through netlink_unicast() with MSG_DONTWAIT.
 *
 * Return: 0 if netlink_unicast() returned a positive value, otherwise
 * the value returned by netlink_unicast().
 */
static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid)
{
        const int ret = netlink_unicast(sk, skb, portid, MSG_DONTWAIT);

        return ret > 0 ? 0 : ret;
}

/**
 * nlmsg_for_each_msg - iterate over a stream of messages
 * @pos: loop counter, set to current message
 * @head: head of message stream
 * @len: length of message stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * The loop runs while nlmsg_ok() accepts the current message and
 * advances with nlmsg_next(). @pos and @rem are expanded several times.
 */
#define nlmsg_for_each_msg(pos, head, len, rem)                 \
        for ((pos) = (head), (rem) = (len);                     \
             nlmsg_ok(pos, rem);                                \
             (pos) = nlmsg_next(pos, &(rem)))

/**
 * nl_dump_check_consistent - check if sequence is consistent and advertise if not
 * @cb: netlink callback structure that stores the sequence number
 * @nlh: netlink message header to write the flag to
 *
 * Compares @cb->seq with the previously recorded @cb->prev_seq. If a
 * previous value exists and differs, NLM_F_DUMP_INTR is set in the
 * message header. The current sequence is then remembered in
 * @cb->prev_seq.
 *
 * The correct way to use it is to set cb->seq to the generation counter when
 * all locks for dumping have been acquired, and then call this function for
 * each message that is generated.
 *
 * Note that due to initialisation concerns, 0 is an invalid sequence number
 * and must not be used by code that uses this functionality.
 */
static inline void
nl_dump_check_consistent(struct netlink_callback *cb,
                         struct nlmsghdr *nlh)
{
        /* prev_seq == 0 means nothing has been recorded yet */
        const int interrupted = cb->prev_seq && cb->seq != cb->prev_seq;

        if (interrupted)
                nlh->nlmsg_flags |= NLM_F_DUMP_INTR;

        cb->prev_seq = cb->seq;
}

/**************************************************************************
 * Netlink Attributes
 **************************************************************************/

/**
 * nla_attr_size - length of attribute not including padding
 * @payload: length of payload
 *
 * Returns: @payload plus NLA_HDRLEN.
 */
static inline int nla_attr_size(int payload)
{
        const int size = NLA_HDRLEN + payload;

        return size;
}

/**
 * nla_total_size - total length of attribute including padding
 * @payload: length of payload
 *
 * Returns: nla_attr_size(@payload) rounded up with NLA_ALIGN().
 */
static inline int nla_total_size(int payload)
{
        const int unpadded = nla_attr_size(payload);

        return NLA_ALIGN(unpadded);
}

/**
 * nla_padlen - length of padding at the tail of attribute
 * @payload: length of payload
 *
 * Returns: the difference between nla_total_size(@payload) and
 * nla_attr_size(@payload).
 */
static inline int nla_padlen(int payload)
{
        const int padded = nla_total_size(payload);
        const int unpadded = nla_attr_size(payload);

        return padded - unpadded;
}

/**
 * nla_type - attribute type
 * @nla: netlink attribute
 *
 * Returns: the nla_type field of @nla masked with NLA_TYPE_MASK.
 */
static inline int nla_type(const struct nlattr *nla)
{
        const int raw = nla->nla_type;

        return raw & NLA_TYPE_MASK;
}

/**
 * nla_data - head of payload
 * @nla: netlink attribute
 *
 * Returns: the address NLA_HDRLEN bytes past the start of @nla.
 */
static inline void *nla_data(const struct nlattr *nla)
{
        char *base = (char *) nla;

        return base + NLA_HDRLEN;
}

/**
 * nla_len - length of payload
 * @nla: netlink attribute
 *
 * Returns: the nla_len field of @nla minus NLA_HDRLEN.
 */
static inline u16 nla_len(const struct nlattr *nla)
{
        const int payload = nla->nla_len - NLA_HDRLEN;

        return payload;
}

/**
 * nla_ok - check if the netlink attribute fits into the remaining bytes
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns: 1 if @remaining can hold an attribute header and the nla_len
 * of @nla is at least the header size and at most @remaining, 0 otherwise.
 */
static inline int nla_ok(const struct nlattr *nla, int remaining)
{
        /* @nla must not be read unless a whole header is available */
        if (remaining < (int) sizeof(*nla))
                return 0;

        if (nla->nla_len < sizeof(*nla))
                return 0;

        return nla->nla_len <= remaining;
}

/**
 * nla_next - next netlink attribute in attribute stream
 * @nla: netlink attribute
 * @remaining: number of bytes remaining in attribute stream
 *
 * Returns: the next netlink attribute in the attribute stream and
 * decrements remaining by the size of the current attribute. The size
 * used is NLA_ALIGN() of the current attribute's nla_len.
 */
static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining)
{
        const unsigned int step = NLA_ALIGN(nla->nla_len);
        char *cur = (char *) nla;

        *remaining -= step;

        return (struct nlattr *) (cur + step);
}

/**
 * nla_find_nested - find attribute in a set of nested attributes
 * @nla: attribute containing the nested attributes
 * @attrtype: type of attribute to look for
 *
 * Runs nla_find() over the payload of @nla.
 *
 * Returns: the first attribute which matches the specified type.
 */
static inline struct nlattr *
nla_find_nested(const struct nlattr *nla, int attrtype)
{
        const struct nlattr *head = nla_data(nla);
        const int len = nla_len(nla);

        return nla_find(head, len, attrtype);
}

/**
 * nla_parse_nested - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Requires NLA_F_NESTED to be set in the type of @nla; otherwise an
 * error message is recorded in @extack and -EINVAL is returned. The
 * payload is parsed by __nla_parse() with NL_VALIDATE_STRICT.
 *
 * See nla_parse()
 */
static inline int nla_parse_nested(struct nlattr *tb[], int maxtype,
                                   const struct nlattr *nla,
                                   const struct nla_policy *policy,
                                   struct netlink_ext_ack *extack)
{
        if (nla->nla_type & NLA_F_NESTED)
                return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla),
                                   policy, NL_VALIDATE_STRICT, extack);

        NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing");
        return -EINVAL;
}

/**
 * nla_parse_nested_deprecated - parse nested attributes
 * @tb: destination array with maxtype+1 elements
 * @maxtype: maximum attribute type to be expected
 * @nla: attribute containing the nested attributes
 * @policy: validation policy
 * @extack: extended ACK report struct
 *
 * Parses the payload of @nla with __nla_parse() using
 * NL_VALIDATE_LIBERAL; NLA_F_NESTED is not checked here.
 *
 * See nla_parse_deprecated()
 */
static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype,
                                              const struct nlattr *nla,
                                              const struct nla_policy *policy,
                                              struct netlink_ext_ack *extack)
{
        const struct nlattr *head = nla_data(nla);
        const int len = nla_len(nla);

        return __nla_parse(tb, maxtype, head, len, policy,
                           NL_VALIDATE_LIBERAL, extack);
}

/**
 * nla_put_u8 - Add a u8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value)
{
        /* local copy is a workaround for GCC PR81715 with asan-stack=1 */
        u8 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u16 - Add a u16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value)
{
        u16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value)
{
        __be16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be16() with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 */
static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value)
{
        const int flagged = attrtype | NLA_F_NET_BYTEORDER;
        __be16 copy = value;

        return nla_put_be16(skb, flagged, copy);
}

/**
 * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value)
{
        __le16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u32 - Add a u32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value)
{
        u32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_uint - Add a variable-size unsigned int to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * A value that survives truncation to u32 unchanged is emitted through
 * nla_put_u32(); any other value is emitted as a full u64 payload.
 */
static inline int nla_put_uint(struct sk_buff *skb, int attrtype, u64 value)
{
        u64 wide = value;
        u32 narrow = value;

        if (wide != narrow)
                return nla_put(skb, attrtype, sizeof(wide), &wide);

        return nla_put_u32(skb, attrtype, narrow);
}

/**
 * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value)
{
        __be32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Same as nla_put_be32() with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 */
static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value)
{
        const int flagged = attrtype | NLA_F_NET_BYTEORDER;
        __be32 copy = value;

        return nla_put_be32(skb, flagged, copy);
}

/**
 * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value)
{
        __le32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns: the result of nla_put_64bit().
 */
static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype,
                                    u64 value, int padattr)
{
        u64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns: the result of nla_put_64bit().
 */
static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value,
                               int padattr)
{
        __be64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Same as nla_put_be64() with NLA_F_NET_BYTEORDER or'ed into @attrtype.
 */
static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value,
                                int padattr)
{
        const int flagged = attrtype | NLA_F_NET_BYTEORDER;
        __be64 copy = value;

        return nla_put_be64(skb, flagged, copy, padattr);
}

/**
 * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns: the result of nla_put_64bit().
 */
static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value,
                               int padattr)
{
        __le64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_s8 - Add a s8 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value)
{
        s8 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s16 - Add a s16 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value)
{
        s16 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s32 - Add a s32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * Returns: the result of nla_put().
 */
static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value)
{
        s32 copy = value;

        return nla_put(skb, attrtype, sizeof(copy), &copy);
}

/**
 * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 * @padattr: attribute type for the padding
 *
 * Returns: the result of nla_put_64bit().
 */
static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value,
                              int padattr)
{
        s64 copy = value;

        return nla_put_64bit(skb, attrtype, sizeof(copy), &copy, padattr);
}

/**
 * nla_put_sint - Add a variable-size signed int to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: numeric value
 *
 * A value that survives conversion to s32 unchanged is emitted through
 * nla_put_s32(); any other value is emitted as a full s64 payload.
 */
static inline int nla_put_sint(struct sk_buff *skb, int attrtype, s64 value)
{
        s64 wide = value;
        s32 narrow = value;

        if (wide != narrow)
                return nla_put(skb, attrtype, sizeof(wide), &wide);

        return nla_put_s32(skb, attrtype, narrow);
}

/**
 * nla_put_string - Add a string netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @str: NUL terminated string
 *
 * The payload length is strlen(@str) + 1, i.e. the terminating NUL is
 * part of the attribute.
 */
static inline int nla_put_string(struct sk_buff *skb, int attrtype,
                                 const char *str)
{
        const size_t len = strlen(str) + 1;

        return nla_put(skb, attrtype, len, str);
}

/**
 * nla_put_flag - Add a flag netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 *
 * The attribute is added with a zero-length payload and no data.
 */
static inline int nla_put_flag(struct sk_buff *skb, int attrtype)
{
        return nla_put(skb, attrtype, 0, NULL);
}

/**
 * nla_put_msecs - Add a msecs netlink attribute to a skb and align it
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @njiffies: number of jiffies to convert to msecs
 * @padattr: attribute type for the padding
 *
 * @njiffies is converted with jiffies_to_msecs() and stored as a u64
 * payload through nla_put_64bit().
 */
static inline int nla_put_msecs(struct sk_buff *skb, int attrtype,
                                unsigned long njiffies, int padattr)
{
        u64 msecs = jiffies_to_msecs(njiffies);

        return nla_put_64bit(skb, attrtype, sizeof(msecs), &msecs, padattr);
}

/**
 * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv4 address
 *
 * Returns: the result of nla_put_be32().
 */
static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype,
                                  __be32 addr)
{
        __be32 copy = addr;

        return nla_put_be32(skb, attrtype, copy);
}

/**
 * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket
 * buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @addr: IPv6 address
 */
static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
                                   const struct in6_addr *addr)
{
        return nla_put(skb, attrtype, sizeof(*addr), addr);
}

/**
 * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer
 * @skb: socket buffer to add attribute to
 * @attrtype: attribute type
 * @value: value carrying bits
 * @selector: selector of valid bits
 *
 * Builds a struct nla_bitfield32 from @value and @selector and adds it
 * with nla_put().
 */
static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype,
                                     __u32 value, __u32 selector)
{
        struct nla_bitfield32 bits = {
                .value = value,
                .selector = selector,
        };

        return nla_put(skb, attrtype, sizeof(bits), &bits);
}

/**
 * nla_get_u32 - return payload of u32 attribute
 * @nla: u32 netlink attribute
 *
 * Reads the payload of @nla directly as a u32.
 */
static inline u32 nla_get_u32(const struct nlattr *nla)
{
        const u32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u32_default - return payload of u32 attribute or default
 * @nla: u32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u32 nla_get_u32_default(const struct nlattr *nla, u32 defvalue)
{
        return nla ? nla_get_u32(nla) : defvalue;
}

/**
 * nla_get_be32 - return payload of __be32 attribute
 * @nla: __be32 netlink attribute
 *
 * Reads the payload of @nla directly as a __be32.
 */
static inline __be32 nla_get_be32(const struct nlattr *nla)
{
        const __be32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be32_default - return payload of be32 attribute or default
 * @nla: __be32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be32 nla_get_be32_default(const struct nlattr *nla,
                                          __be32 defvalue)
{
        return nla ? nla_get_be32(nla) : defvalue;
}

/**
 * nla_get_le32 - return payload of __le32 attribute
 * @nla: __le32 netlink attribute
 *
 * Reads the payload of @nla directly as a __le32.
 */
static inline __le32 nla_get_le32(const struct nlattr *nla)
{
        const __le32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le32_default - return payload of le32 attribute or default
 * @nla: __le32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __le32 nla_get_le32_default(const struct nlattr *nla,
                                          __le32 defvalue)
{
        return nla ? nla_get_le32(nla) : defvalue;
}

/**
 * nla_get_u16 - return payload of u16 attribute
 * @nla: u16 netlink attribute
 *
 * Reads the payload of @nla directly as a u16.
 */
static inline u16 nla_get_u16(const struct nlattr *nla)
{
        const u16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u16_default - return payload of u16 attribute or default
 * @nla: u16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u16 nla_get_u16_default(const struct nlattr *nla, u16 defvalue)
{
        return nla ? nla_get_u16(nla) : defvalue;
}

/**
 * nla_get_be16 - return payload of __be16 attribute
 * @nla: __be16 netlink attribute
 *
 * Reads the payload of @nla directly as a __be16.
 */
static inline __be16 nla_get_be16(const struct nlattr *nla)
{
        const __be16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_be16_default - return payload of be16 attribute or default
 * @nla: __be16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be16 nla_get_be16_default(const struct nlattr *nla,
                                          __be16 defvalue)
{
        return nla ? nla_get_be16(nla) : defvalue;
}

/**
 * nla_get_le16 - return payload of __le16 attribute
 * @nla: __le16 netlink attribute
 *
 * Reads the payload of @nla directly as a __le16.
 */
static inline __le16 nla_get_le16(const struct nlattr *nla)
{
        const __le16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_le16_default - return payload of le16 attribute or default
 * @nla: __le16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __le16 nla_get_le16_default(const struct nlattr *nla,
                                          __le16 defvalue)
{
        return nla ? nla_get_le16(nla) : defvalue;
}

/**
 * nla_get_u8 - return payload of u8 attribute
 * @nla: u8 netlink attribute
 *
 * Reads the first payload byte of @nla.
 */
static inline u8 nla_get_u8(const struct nlattr *nla)
{
        const u8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_u8_default - return payload of u8 attribute or default
 * @nla: u8 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u8 nla_get_u8_default(const struct nlattr *nla, u8 defvalue)
{
        return nla ? nla_get_u8(nla) : defvalue;
}

/**
 * nla_get_u64 - return payload of u64 attribute
 * @nla: u64 netlink attribute
 *
 * The payload is copied into a local u64 with nla_memcpy() instead of
 * being dereferenced in place.
 */
static inline u64 nla_get_u64(const struct nlattr *nla)
{
        u64 val;

        nla_memcpy(&val, nla, sizeof(val));

        return val;
}

/**
 * nla_get_u64_default - return payload of u64 attribute or default
 * @nla: u64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u64 nla_get_u64_default(const struct nlattr *nla, u64 defvalue)
{
        return nla ? nla_get_u64(nla) : defvalue;
}

/**
 * nla_get_uint - return payload of uint attribute
 * @nla: uint netlink attribute
 *
 * A payload of exactly sizeof(u32) bytes is read with nla_get_u32();
 * any other length is read with nla_get_u64().
 */
static inline u64 nla_get_uint(const struct nlattr *nla)
{
        if (nla_len(nla) != sizeof(u32))
                return nla_get_u64(nla);

        return nla_get_u32(nla);
}

/**
 * nla_get_uint_default - return payload of uint attribute or default
 * @nla: uint netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline u64 nla_get_uint_default(const struct nlattr *nla, u64 defvalue)
{
        return nla ? nla_get_uint(nla) : defvalue;
}

/**
 * nla_get_be64 - return payload of __be64 attribute
 * @nla: __be64 netlink attribute
 *
 * The payload is copied into a local __be64 with nla_memcpy() instead
 * of being dereferenced in place.
 */
static inline __be64 nla_get_be64(const struct nlattr *nla)
{
        __be64 val;

        nla_memcpy(&val, nla, sizeof(val));

        return val;
}

/**
 * nla_get_be64_default - return payload of be64 attribute or default
 * @nla: __be64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __be64 nla_get_be64_default(const struct nlattr *nla,
                                          __be64 defvalue)
{
        return nla ? nla_get_be64(nla) : defvalue;
}

/**
 * nla_get_le64 - return payload of __le64 attribute
 * @nla: __le64 netlink attribute
 *
 * The payload is copied out with nla_memcpy() rather than dereferenced
 * as a __le64, matching nla_get_u64() and nla_get_be64(), so the read
 * does not rely on the payload being 64-bit aligned.
 */
static inline __le64 nla_get_le64(const struct nlattr *nla)
{
        __le64 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));

        return tmp;
}

/**
 * nla_get_le64_default - return payload of le64 attribute or default
 * @nla: __le64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline __le64 nla_get_le64_default(const struct nlattr *nla,
                                          __le64 defvalue)
{
        return nla ? nla_get_le64(nla) : defvalue;
}

/**
 * nla_get_s32 - return payload of s32 attribute
 * @nla: s32 netlink attribute
 *
 * Reads the payload of @nla directly as an s32.
 */
static inline s32 nla_get_s32(const struct nlattr *nla)
{
        const s32 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s32_default - return payload of s32 attribute or default
 * @nla: s32 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s32 nla_get_s32_default(const struct nlattr *nla, s32 defvalue)
{
        return nla ? nla_get_s32(nla) : defvalue;
}

/**
 * nla_get_s16 - return payload of s16 attribute
 * @nla: s16 netlink attribute
 *
 * Reads the payload of @nla directly as an s16.
 */
static inline s16 nla_get_s16(const struct nlattr *nla)
{
        const s16 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s16_default - return payload of s16 attribute or default
 * @nla: s16 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s16 nla_get_s16_default(const struct nlattr *nla, s16 defvalue)
{
        return nla ? nla_get_s16(nla) : defvalue;
}

/**
 * nla_get_s8 - return payload of s8 attribute
 * @nla: s8 netlink attribute
 *
 * Reads the first payload byte of @nla as an s8.
 */
static inline s8 nla_get_s8(const struct nlattr *nla)
{
        const s8 *payload = nla_data(nla);

        return *payload;
}

/**
 * nla_get_s8_default - return payload of s8 attribute or default
 * @nla: s8 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s8 nla_get_s8_default(const struct nlattr *nla, s8 defvalue)
{
        return nla ? nla_get_s8(nla) : defvalue;
}

/**
 * nla_get_s64 - return payload of s64 attribute
 * @nla: s64 netlink attribute
 *
 * The payload is copied into a local s64 with nla_memcpy() instead of
 * being dereferenced in place.
 */
static inline s64 nla_get_s64(const struct nlattr *nla)
{
        s64 val;

        nla_memcpy(&val, nla, sizeof(val));

        return val;
}

/**
 * nla_get_s64_default - return payload of s64 attribute or default
 * @nla: s64 netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s64 nla_get_s64_default(const struct nlattr *nla, s64 defvalue)
{
        return nla ? nla_get_s64(nla) : defvalue;
}

/**
 * nla_get_sint - return payload of sint attribute
 * @nla: sint netlink attribute
 *
 * A payload of exactly sizeof(s32) bytes is read with nla_get_s32();
 * any other length is read with nla_get_s64().
 */
static inline s64 nla_get_sint(const struct nlattr *nla)
{
        if (nla_len(nla) != sizeof(s32))
                return nla_get_s64(nla);

        return nla_get_s32(nla);
}

/**
 * nla_get_sint_default - return payload of sint attribute or default
 * @nla: sint netlink attribute, may be %NULL
 * @defvalue: default value to use if @nla is %NULL
 *
 * Return: the value of the attribute, or the default value if not present
 */
static inline s64 nla_get_sint_default(const struct nlattr *nla, s64 defvalue)
{
        return nla ? nla_get_sint(nla) : defvalue;
}

/**
 * nla_get_flag - return payload of flag attribute
 * @nla: flag netlink attribute, may be NULL
 *
 * Returns: 1 if @nla is non-NULL, 0 otherwise. The attribute itself is
 * never read.
 */
static inline int nla_get_flag(const struct nlattr *nla)
{
        return nla ? 1 : 0;
}

/**
 * nla_get_msecs - return payload of msecs attribute
 * @nla: msecs netlink attribute
 *
 * The u64 payload is cast to unsigned long before it is handed to
 * msecs_to_jiffies().
 *
 * Returns: the number of milliseconds in jiffies.
 */
static inline unsigned long nla_get_msecs(const struct nlattr *nla)
{
        const unsigned long msecs = (unsigned long) nla_get_u64(nla);

        return msecs_to_jiffies(msecs);
}

/**
 * nla_get_msecs_default - return payload of msecs attribute or default
 * @nla: msecs netlink attribute, may be %NULL
 * @defvalue: value handed back when @nla is %NULL
 *
 * Note that @defvalue is returned unchanged, whereas a present attribute is
 * converted by nla_get_msecs().
 *
 * Return: the converted attribute value, or @defvalue if @nla is %NULL
 */
static inline unsigned long nla_get_msecs_default(const struct nlattr *nla,
                                                  unsigned long defvalue)
{
        return nla ? nla_get_msecs(nla) : defvalue;
}

/**
 * nla_get_in_addr - return payload of IPv4 address attribute
 * @nla: IPv4 address netlink attribute
 *
 * Return: the __be32 found at the start of the attribute payload
 */
static inline __be32 nla_get_in_addr(const struct nlattr *nla)
{
        const __be32 *addr = (const __be32 *) nla_data(nla);

        return *addr;
}

/**
 * nla_get_in_addr_default - return payload of be32 attribute or default
 * @nla: IPv4 address netlink attribute, may be %NULL
 * @defvalue: value handed back when @nla is %NULL
 *
 * Return: the attribute payload if @nla is present, otherwise @defvalue
 */
static inline __be32 nla_get_in_addr_default(const struct nlattr *nla,
                                             __be32 defvalue)
{
        return nla ? nla_get_in_addr(nla) : defvalue;
}

/**
 * nla_get_in6_addr - return payload of IPv6 address attribute
 * @nla: IPv6 address netlink attribute
 */
static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
        struct in6_addr tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_get_bitfield32 - return payload of 32 bitfield attribute
 * @nla: nla_bitfield32 attribute
 */
static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla)
{
        struct nla_bitfield32 tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

/**
 * nla_memdup - duplicate attribute memory (kmemdup)
 * @src: netlink attribute to duplicate from
 * @gfp: GFP mask
 *
 * Duplicates nla_len(@src) bytes starting at nla_data(@src).
 *
 * Return: whatever kmemdup_noprof() returns for that payload
 */
static inline void *nla_memdup_noprof(const struct nlattr *src, gfp_t gfp)
{
        const void *payload = nla_data(src);

        return kmemdup_noprof(payload, nla_len(src), gfp);
}
#define nla_memdup(...)        alloc_hooks(nla_memdup_noprof(__VA_ARGS__))

/**
 * nla_nest_start_noflag - Start a new level of nested attributes
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Kept for backward compatibility with APIs that never marked their nest
 * attributes with the NLA_F_NESTED flag. New APIs should use
 * nla_nest_start(), which sets the flag.
 *
 * Returns: the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb,
                                                   int attrtype)
{
        /* Record the tail first: that is where the container header goes. */
        struct nlattr *nest = (struct nlattr *)skb_tail_pointer(skb);

        return nla_put(skb, attrtype, 0, NULL) < 0 ? NULL : nest;
}

/**
 * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED
 * @skb: socket buffer to add attributes to
 * @attrtype: attribute type of container
 *
 * Same as nla_nest_start_noflag() except that NLA_F_NESTED is OR-ed into the
 * attribute type. This is the preferred function to use in new code.
 *
 * Returns: the container attribute or NULL on error
 */
static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype)
{
        int nested_type = attrtype | NLA_F_NESTED;

        return nla_nest_start_noflag(skb, nested_type);
}

/**
 * nla_nest_end - Finalize nesting of attributes
 * @skb: socket buffer the attributes are stored in
 * @start: container attribute
 *
 * Rewrites the container attribute's length so that it spans everything
 * from the container header up to the current tail of @skb.
 *
 * Returns: the total data length of the skb.
 */
static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start)
{
        unsigned char *nest_head = (unsigned char *)start;

        start->nla_len = skb_tail_pointer(skb) - nest_head;

        return skb->len;
}

/**
 * nla_nest_cancel - Cancel nesting of attributes
 * @skb: socket buffer the message is stored in
 * @start: container attribute
 *
 * Removes the container attribute, including all nested attributes, by
 * trimming the message back to @start with nlmsg_trim(). This function
 * returns nothing.
 */
static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start)
{
        nlmsg_trim(skb, start);
}

/**
 * nla_put_empty_nest - Create an empty nest
 * @skb: socket buffer the message is stored in
 * @attrtype: attribute type of the container
 *
 * Starts a nest with nla_nest_start() and adds nothing to it.
 *
 * Returns: 0 when successful or -EMSGSIZE on failure.
 */
static inline int nla_put_empty_nest(struct sk_buff *skb, int attrtype)
{
        if (!nla_nest_start(skb, attrtype))
                return -EMSGSIZE;

        return 0;
}

/**
 * __nla_validate_nested - Validate a stream of nested attributes
 * @start: container attribute
 * @maxtype: maximum attribute type to be expected
 * @policy: validation policy
 * @validate: validation strictness
 * @extack: extended ACK report struct
 *
 * Runs __nla_validate() over the payload of @start, checking every nested
 * attribute against @policy. Attributes with a type exceeding maxtype will
 * be ignored. See documentation of struct nla_policy for more details.
 *
 * Returns: 0 on success or a negative error code.
 */
static inline int __nla_validate_nested(const struct nlattr *start, int maxtype,
                                        const struct nla_policy *policy,
                                        unsigned int validate,
                                        struct netlink_ext_ack *extack)
{
        const struct nlattr *inner = nla_data(start);

        return __nla_validate(inner, nla_len(start), maxtype, policy,
                              validate, extack);
}

/*
 * Validate the attributes nested in @start against @policy by calling
 * __nla_validate_nested() with NL_VALIDATE_STRICT; the result of that call
 * is returned unchanged.
 */
static inline int
nla_validate_nested(const struct nlattr *start, int maxtype,
                    const struct nla_policy *policy,
                    struct netlink_ext_ack *extack)
{
        return __nla_validate_nested(start, maxtype, policy,
                                     NL_VALIDATE_STRICT, extack);
}

/*
 * Same as nla_validate_nested() except that __nla_validate_nested() is
 * called with NL_VALIDATE_LIBERAL instead of NL_VALIDATE_STRICT.
 */
static inline int
nla_validate_nested_deprecated(const struct nlattr *start, int maxtype,
                               const struct nla_policy *policy,
                               struct netlink_ext_ack *extack)
{
        return __nla_validate_nested(start, maxtype, policy,
                                     NL_VALIDATE_LIBERAL, extack);
}

/**
 * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute
 * @skb: socket buffer the message is stored in
 *
 * Always false when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined.
 *
 * Return: true if padding is needed to align the next attribute (nla_data()) to
 * a 64-bit aligned area.
 */
static inline bool nla_need_padding_for_64bit(struct sk_buff *skb)
{
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        /* The nlattr header occupies 4 bytes, so padding is required
         * exactly when the tail _is_ 8-byte aligned: a NOP attribute
         * followed by the next attribute's header then places that
         * attribute's nla_data() on an 8-byte boundary.
         */
        return IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8);
#else
        return false;
#endif
}

/**
 * nla_align_64bit - 64-bit align the nla_data() of next attribute
 * @skb: socket buffer the message is stored in
 * @padattr: attribute type for the padding
 *
 * Emits a zero-length padding attribute of type @padattr when
 * nla_need_padding_for_64bit() reports that the next attribute's
 * nla_data() would otherwise not be 64-bit aligned. This only happens on
 * architectures which do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
 * defined.
 *
 * Returns: zero on success or a negative error code.
 */
static inline int nla_align_64bit(struct sk_buff *skb, int padattr)
{
        if (!nla_need_padding_for_64bit(skb))
                return 0;

        return nla_reserve(skb, padattr, 0) ? 0 : -EMSGSIZE;
}

/**
 * nla_total_size_64bit - total length of attribute including padding
 * @payload: length of payload
 *
 * Return: the aligned attribute size, plus the aligned size of an empty
 * attribute when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is not defined
 */
static inline int nla_total_size_64bit(int payload)
{
        int total = NLA_ALIGN(nla_attr_size(payload));

#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        /* Room for the padding attribute nla_align_64bit() may emit. */
        total += NLA_ALIGN(nla_attr_size(0));
#endif
        return total;
}

/**
 * nla_for_each_attr - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to a for-loop header: @pos starts at @head and @rem at @len, the
 * loop runs while nla_ok(@pos, @rem) holds, and each step assigns the
 * result of nla_next(@pos, &@rem) to @pos. Both @pos and @rem are assigned
 * to, so they must be modifiable lvalues.
 */
#define nla_for_each_attr(pos, head, len, rem) \
        for (pos = head, rem = len; \
             nla_ok(pos, rem); \
             pos = nla_next(pos, &(rem)))

/**
 * nla_for_each_attr_type - iterate over a stream of attributes
 * @pos: loop counter, set to current attribute
 * @type: required attribute type for @pos
 * @head: head of attribute stream
 * @len: length of attribute stream
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_attr() followed by an unbraced
 * "if (nla_type(@pos) == @type)", so the statement written after the macro
 * becomes the body of that if and runs only for matching attributes.
 */
#define nla_for_each_attr_type(pos, type, head, len, rem) \
        nla_for_each_attr(pos, head, len, rem) \
                if (nla_type(pos) == type)

/**
 * nla_for_each_nested - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Shorthand for nla_for_each_attr() over the payload of @nla, i.e. starting
 * at nla_data(@nla) with a length of nla_len(@nla).
 */
#define nla_for_each_nested(pos, nla, rem) \
        nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem)

/**
 * nla_for_each_nested_type - iterate over nested attributes
 * @pos: loop counter, set to current attribute
 * @type: required attribute type for @pos
 * @nla: attribute containing the nested attributes
 * @rem: initialized to len, holds bytes currently remaining in stream
 *
 * Expands to nla_for_each_nested() followed by an unbraced
 * "if (nla_type(@pos) == @type)", so the statement written after the macro
 * becomes the body of that if and runs only for matching attributes.
 */
#define nla_for_each_nested_type(pos, type, nla, rem) \
        nla_for_each_nested(pos, nla, rem) \
                if (nla_type(pos) == type)

/**
 * nla_is_last - Test if attribute is last in stream
 * @nla: attribute to test
 * @rem: bytes remaining in stream
 *
 * Return: true if the attribute's nla_len field equals @rem
 */
static inline bool nla_is_last(const struct nlattr *nla, int rem)
{
        return nla->nla_len == rem;
}

/*
 * Range helpers for a single policy entry @pt; each takes a non-const
 * @range argument (presumably filled in by the callee - confirm against
 * the implementation, which is not part of this header).
 */
void nla_get_range_unsigned(const struct nla_policy *pt,
                            struct netlink_range_validation *range);
void nla_get_range_signed(const struct nla_policy *pt,
                          struct netlink_range_validation_signed *range);

/* Only forward-declared here; users of this header handle it by pointer. */
struct netlink_policy_dump_state;

/* Policy dump interface; only the prototypes are provided in this header. */
int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate,
                                   const struct nla_policy *policy,
                                   unsigned int maxtype);
int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state,
                                       const struct nla_policy *policy,
                                       unsigned int maxtype);
bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state);
int netlink_policy_dump_write(struct sk_buff *skb,
                              struct netlink_policy_dump_state *state);
int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt);
int netlink_policy_dump_write_attr(struct sk_buff *skb,
                                   const struct nla_policy *pt,
                                   int nestattr);
void netlink_policy_dump_free(struct netlink_policy_dump_state *state);

#endif



























































































































































































































































































































































   34 































   16 











   16 









  526 




    2 















   16 
  211 


















   34 



   34 










































   34 

































































   34 





























































   34 






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/pid.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

/*
 * Signal handler state: a spinlock (siglock), a reference count, a wait
 * queue head (signalfd_wqh) and a table of _NSIG k_sigaction entries.
 */
struct sighand_struct {
        spinlock_t                siglock;
        refcount_t                count;
        wait_queue_head_t        signalfd_wqh;
        struct k_sigaction        action[_NSIG];
};

/*
 * Per-process accounting stats.
 *
 * Embedded in struct signal_struct as the "pacct" member when
 * CONFIG_BSD_PROCESS_ACCT is enabled.
 */
struct pacct_struct {
        int                        ac_flag;
        long                        ac_exitcode;
        unsigned long                ac_mem;
        u64                        ac_utime, ac_stime;
        unsigned long                ac_minflt, ac_majflt;
};

/*
 * One CPU-time interval timer; struct signal_struct keeps two of these in
 * its it[] array for ITIMER_PROF and ITIMER_VIRTUAL.
 */
struct cpu_itimer {
        u64 expires;
        u64 incr;
};

/*
 * This is the atomic variant of task_cputime, which can be used for
 * storing and updating task_cputime statistics without locking.
 * All three counters are atomic64_t.
 */
struct task_cputime_atomic {
        atomic64_t utime;
        atomic64_t stime;
        atomic64_t sum_exec_runtime;
};

/*
 * Compound literal of type struct task_cputime_atomic with every counter
 * initialised to zero.
 */
#define INIT_CPUTIME_ATOMIC \
        (struct task_cputime_atomic) {                                \
                .utime = ATOMIC64_INIT(0),                        \
                .stime = ATOMIC64_INIT(0),                        \
                .sum_exec_runtime = ATOMIC64_INIT(0),                \
        }
/**
 * struct thread_group_cputimer - thread group interval timer counts
 * @cputime_atomic:        atomic thread group interval timers.
 *
 * This structure wraps a struct task_cputime_atomic, the atomic version of
 * task_cputime, which is used for thread group CPU timer calculations.
 */
struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
};

/*
 * A signal set plus an hlist node; presumably linked onto
 * signal_struct.multiprocess ("collecting multiprocess signals during
 * fork") - confirm against the users of this structure.
 */
struct multiprocess_signals {
        sigset_t signal;
        struct hlist_node node;
};

/* Node of a singly linked list of tasks (see core_state.dumper). */
struct core_thread {
        struct task_struct *task;
        struct core_thread *next;
};

/*
 * Coredump bookkeeping referenced from signal_struct.core_state: an atomic
 * thread count, an embedded core_thread entry ("dumper") and a completion
 * named "startup".
 */
struct core_state {
        atomic_t nr_threads;
        struct core_thread dumper;
        struct completion startup;
};

/*
 * NOTE! "signal_struct" does not have its own
 * locking, because a shared signal_struct always
 * implies a shared sighand_struct, so locking
 * sighand_struct is always a proper superset of
 * the locking of signal_struct.
 */
struct signal_struct {
        refcount_t                sigcnt;
        atomic_t                live;
        int                        nr_threads;
        int                        quick_threads;
        struct list_head        thread_head;

        wait_queue_head_t        wait_chldexit;        /* for wait4() */

        /* current thread group signal load-balancing target: */
        struct task_struct        *curr_target;

        /* shared signal handling: */
        struct sigpending        shared_pending;

        /* For collecting multiprocess signals during fork */
        struct hlist_head        multiprocess;

        /* thread group exit support */
        int                        group_exit_code;
        /* notify group_exec_task when notify_count is less or equal to 0 */
        int                        notify_count;
        struct task_struct        *group_exec_task;

        /* thread group stop support, overloads group_exit_code too */
        int                        group_stop_count;
        unsigned int                flags; /* see SIGNAL_* flags below */

        struct core_state *core_state; /* coredumping support */

        /*
         * PR_SET_CHILD_SUBREAPER marks a process, like a service
         * manager, to re-parent orphan (double-forking) child processes
         * to this process instead of 'init'. The service manager is
         * able to receive SIGCHLD signals and is able to investigate
         * the process until it calls wait(). All children of this
         * process will inherit a flag if they should look for a
         * child_subreaper process at exit.
         */
        unsigned int                is_child_subreaper:1;
        unsigned int                has_child_subreaper:1;

#ifdef CONFIG_POSIX_TIMERS

        /* POSIX.1b Interval Timers */
        unsigned int                timer_create_restore_ids:1;
        atomic_t                next_posix_timer_id;
        struct hlist_head        posix_timers;
        struct hlist_head        ignored_posix_timers;

        /* ITIMER_REAL timer for the process */
        struct hrtimer real_timer;
        ktime_t it_real_incr;

        /*
         * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
         * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
         * values are defined to 0 and 1 respectively
         */
        struct cpu_itimer it[2];

        /*
         * Thread group totals for process CPU timers.
         * See thread_group_cputimer(), et al, for details.
         */
        struct thread_group_cputimer cputimer;

#endif
        /* Empty if CONFIG_POSIX_TIMERS=n */
        struct posix_cputimers posix_cputimers;

        /* PID/PID hash table linkage. */
        struct pid *pids[PIDTYPE_MAX];

#ifdef CONFIG_NO_HZ_FULL
        atomic_t tick_dep_mask;
#endif

        struct pid *tty_old_pgrp;

        /* boolean value for session group leader */
        int leader;

        struct tty_struct *tty; /* NULL if no tty */

#ifdef CONFIG_SCHED_AUTOGROUP
        struct autogroup *autogroup;
#endif
        /*
         * Cumulative resource counters for dead threads in the group,
         * and for reaped dead child processes forked by this group.
         * Live threads maintain their own counters and add to these
         * in __exit_signal, except for the group leader.
         */
        seqlock_t stats_lock;
        u64 utime, stime, cutime, cstime;
        u64 gtime;
        u64 cgtime;
        struct prev_cputime prev_cputime;
        unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
        unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
        unsigned long inblock, oublock, cinblock, coublock;
        unsigned long maxrss, cmaxrss;
        struct task_io_accounting ioac;

        /*
         * Cumulative ns of scheduled CPU time for dead threads in the
         * group, not including a zombie group leader. (This only differs
         * from jiffies_to_ns(utime + stime) if sched_clock uses something
         * other than jiffies.)
         */
        unsigned long long sum_sched_runtime;

        /*
         * We don't bother to synchronize most readers of this at all,
         * because there is no reader checking a limit that actually needs
         * to get both rlim_cur and rlim_max atomically, and either one
         * alone is a single word that can safely be read normally.
         * getrlimit/setrlimit use task_lock(current->group_leader) to
         * protect this instead of the siglock, because they really
         * have no need to disable irqs.
         */
        struct rlimit rlim[RLIM_NLIMITS];

#ifdef CONFIG_BSD_PROCESS_ACCT
        struct pacct_struct pacct;        /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
        struct taskstats *stats;
#endif
#ifdef CONFIG_AUDIT
        unsigned audit_tty;
        struct tty_audit_buf *tty_audit_buf;
#endif

        /*
         * Thread is the potential origin of an oom condition; kill first on
         * oom
         */
        bool oom_flag_origin;
        short oom_score_adj;                /* OOM kill score adjustment */
        short oom_score_adj_min;        /* OOM kill score adjustment min value.
                                         * Only settable by CAP_SYS_RESOURCE. */
        struct mm_struct *oom_mm;        /* recorded mm when the thread group got
                                         * killed by the oom killer */

        struct mutex cred_guard_mutex;        /* guard against foreign influences on
                                         * credential calculations
                                         * (notably, ptrace).
                                         * Deprecated: do not use in new code.
                                         * Use exec_update_lock instead.
                                         */
        struct rw_semaphore exec_update_lock;        /* Held while task_struct is
                                                 * being updated during exec,
                                                 * and may have inconsistent
                                                 * permissions.
                                                 */
} __randomize_layout;

/*
 * Bits in flags field of signal_struct.
 */
#define SIGNAL_STOP_STOPPED        0x00000001 /* job control stop in effect */
#define SIGNAL_STOP_CONTINUED        0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT        0x00000004 /* group exit in progress */
/*
 * Pending notifications to parent.
 */
#define SIGNAL_CLD_STOPPED        0x00000010
#define SIGNAL_CLD_CONTINUED        0x00000020
#define SIGNAL_CLD_MASK                (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)

#define SIGNAL_UNKILLABLE        0x00000040 /* for init: ignore fatal signals */

/*
 * Every stop-related bit: both SIGNAL_CLD_* notifications plus the
 * STOPPED/CONTINUED state bits. signal_set_stop_flags() clears exactly
 * these bits before OR-ing in the new flags.
 */
#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \
                          SIGNAL_STOP_CONTINUED)

/*
 * Replace the stop-related bits (SIGNAL_STOP_MASK) of sig->flags with
 * @flags, leaving all other bits untouched. Warns if a group exit is
 * already flagged.
 */
static inline void signal_set_stop_flags(struct signal_struct *sig,
                                         unsigned int flags)
{
        unsigned int kept;

        WARN_ON(sig->flags & SIGNAL_GROUP_EXIT);

        kept = sig->flags & ~SIGNAL_STOP_MASK;
        sig->flags = kept | flags;
}

/* Declarations only; the implementations are not part of this header. */
extern void flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
/* kernel_dequeue_signal() below calls this with sighand->siglock held. */
extern int dequeue_signal(sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type);

static inline int kernel_dequeue_signal(void)
{
        struct task_struct *task = current;
        kernel_siginfo_t __info;
        enum pid_type __type;
        int ret;

        spin_lock_irq(&task->sighand->siglock);
        ret = dequeue_signal(&task->blocked, &__info, &__type);
        spin_unlock_irq(&task->sighand->siglock);

        return ret;
}

/*
 * If JOBCTL_STOP_DEQUEUED is set for the current task, mark it
 * JOBCTL_STOPPED and switch it to TASK_STOPPED while holding the siglock;
 * then call schedule() after the lock has been dropped. schedule() is
 * called unconditionally, whether or not the state was changed.
 */
static inline void kernel_signal_stop(void)
{
        spin_lock_irq(&current->sighand->siglock);
        if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
                current->jobctl |= JOBCTL_STOPPED;
                set_special_state(TASK_STOPPED);
        }
        spin_unlock_irq(&current->sighand->siglock);

        schedule();
}

/*
 * Signal generation interfaces. Only prototypes are provided here; the
 * implementations are not part of this header.
 */
int force_sig_fault_to_task(int sig, int code, void __user *addr,
                            struct task_struct *t);
int force_sig_fault(int sig, int code, void __user *addr);
int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t);

int force_sig_mceerr(int code, void __user *, short);
int send_sig_mceerr(int code, void __user *, short, struct task_struct *);

int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper);
int force_sig_pkuerr(void __user *addr, u32 pkey);
int send_sig_perf(void __user *addr, u32 type, u64 sig_data);

int force_sig_ptrace_errno_trap(int errno, void __user *addr);
int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno);
int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno,
                        struct task_struct *t);
int force_sig_seccomp(int syscall, int reason, bool force_coredump);

extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
extern void force_sigsegv(int sig);
extern int force_sig_info(struct kernel_siginfo *);
extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *,
                                const struct cred *);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
/* __must_check: callers are required to consume the boolean result. */
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int);
extern void force_fatal_sig(int);
extern void force_exit_sig(int);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);

/*
 * Clear TIF_NOTIFY_SIGNAL on the current thread, then issue
 * smp_mb__after_atomic() so the flag clear is followed by a memory barrier.
 */
static inline void clear_notify_signal(void)
{
        clear_thread_flag(TIF_NOTIFY_SIGNAL);
        smp_mb__after_atomic();
}

/*
 * Returns 'true' if kick_process() is needed to force a transition from
 * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work.
 */
static inline bool __set_notify_signal(struct task_struct *task)
{
        return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) &&
               !wake_up_state(task, TASK_INTERRUPTIBLE);
}

/*
 * Called to break out of interruptible wait loops, and enter the
 * exit_to_user_mode_loop(). Kicks @task only when __set_notify_signal()
 * says a kick is needed.
 */
static inline void set_notify_signal(struct task_struct *task)
{
        if (!__set_notify_signal(task))
                return;

        kick_process(task);
}

/*
 * Set TIF_SIGPENDING on the current task and return -ERESTARTNOINTR for
 * the caller to propagate.
 */
static inline int restart_syscall(void)
{
        set_tsk_thread_flag(current, TIF_SIGPENDING);
        return -ERESTARTNOINTR;
}

/*
 * Test TIF_SIGPENDING on @p. Unlike signal_pending(), this does not look
 * at TIF_NOTIFY_SIGNAL. The result is annotated as unlikely.
 */
static inline int task_sigpending(struct task_struct *p)
{
        return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}

/*
 * Report whether @p has either TIF_NOTIFY_SIGNAL or TIF_SIGPENDING set.
 */
static inline int signal_pending(struct task_struct *p)
{
        /*
         * TIF_NOTIFY_SIGNAL is not a real signal, yet it has to break us
         * out of wait loops in the same way so that notify signal
         * callbacks get a chance to run.
         */
        if (likely(!test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL)))
                return task_sigpending(p);

        return 1;
}

/*
 * Test whether SIGKILL is a member of @p's private pending set
 * (p->pending.signal). TIF_SIGPENDING is not consulted here; see
 * fatal_signal_pending() for the combined check.
 */
static inline int __fatal_signal_pending(struct task_struct *p)
{
        return unlikely(sigismember(&p->pending.signal, SIGKILL));
}

/*
 * Non-zero only when @p has TIF_SIGPENDING set and SIGKILL is in its
 * pending set. The SIGKILL test is skipped when TIF_SIGPENDING is clear.
 */
static inline int fatal_signal_pending(struct task_struct *p)
{
        if (!task_sigpending(p))
                return 0;

        return __fatal_signal_pending(p) != 0;
}

/*
 * Decide whether a task sleeping in @state should be interrupted by what is
 * pending on @p:
 *  - states with neither TASK_INTERRUPTIBLE nor TASK_WAKEKILL: never;
 *  - nothing pending per signal_pending(): no;
 *  - TASK_INTERRUPTIBLE: yes;
 *  - otherwise (TASK_WAKEKILL only): only if SIGKILL is pending.
 */
static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
        if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)) ||
            !signal_pending(p))
                return 0;

        if (state & TASK_INTERRUPTIBLE)
                return 1;

        return __fatal_signal_pending(p) != 0;
}

/*
 * Intended for fault handlers only: tells whether the current fault routine
 * should stop so that signals can be handled instead, in particular after
 * being interrupted with a VM_FAULT_RETRY.
 *
 * True only if @fault_flags has VM_FAULT_RETRY set and either a fatal
 * signal is pending for current, or the fault came from user mode and any
 * signal is pending.
 */
static inline bool fault_signal_pending(vm_fault_t fault_flags,
                                        struct pt_regs *regs)
{
        if (likely(!(fault_flags & VM_FAULT_RETRY)))
                return false;

        if (fatal_signal_pending(current))
                return true;

        return user_mode(regs) && signal_pending(current);
}

/*
 * Reevaluate whether the task has signals pending delivery.
 * Wake the task if so.
 * This is required every time the blocked sigset_t changes.
 * Callers must hold sighand->siglock.
 */
extern void recalc_sigpending(void);
extern void calculate_sigpending(void);

/* Used by signal_wake_up() and ptrace_signal_wake_up() below. */
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);

/*
 * Wake @t for signal delivery. For a fatal wakeup of a task that is not
 * JOBCTL_PTRACE_FROZEN, the STOPPED/TRACED jobctl bits are cleared and the
 * wakeup additionally covers TASK_WAKEKILL | __TASK_TRACED; in every other
 * case signal_wake_up_state() is called with a state of 0.
 */
static inline void signal_wake_up(struct task_struct *t, bool fatal)
{
        if (!fatal || (t->jobctl & JOBCTL_PTRACE_FROZEN)) {
                signal_wake_up_state(t, 0);
                return;
        }

        t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED);
        signal_wake_up_state(t, TASK_WAKEKILL | __TASK_TRACED);
}
/*
 * Wake @t on behalf of ptrace. With @resume set, JOBCTL_TRACED is cleared
 * and the wakeup covers __TASK_TRACED; otherwise signal_wake_up_state() is
 * called with a state of 0.
 */
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
{
        if (!resume) {
                signal_wake_up_state(t, 0);
                return;
        }

        t->jobctl &= ~JOBCTL_TRACED;
        signal_wake_up_state(t, __TASK_TRACED);
}

void task_join_group_stop(struct task_struct *task);

#ifdef TIF_RESTORE_SIGMASK
/*
 * Legacy restore_sigmask accessors.  These are inefficient on
 * SMP architectures because they require atomic operations.
 */

/**
 * set_restore_sigmask() - make sure saved_sigmask processing gets done
 *
 * This sets TIF_RESTORE_SIGMASK on the current thread so that the arch
 * signal code can process the flag before returning to user mode.
 * TIF_RESTORE_SIGMASK need not be in the set of bits that the arch code
 * will notice on return to user mode, in case those bits are scarce.
 *
 * NOTE(review): this helper sets only TIF_RESTORE_SIGMASK; it does not set
 * TIF_SIGPENDING itself. Presumably every caller already has
 * TIF_SIGPENDING set or otherwise ensures the arch signal code runs -
 * confirm against the callers.
 */
static inline void set_restore_sigmask(void)
{
        set_thread_flag(TIF_RESTORE_SIGMASK);
}

/* Clear the TIF_RESTORE_SIGMASK thread flag of the given @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}

/* Clear TIF_RESTORE_SIGMASK through clear_thread_flag() (no task argument). */
static inline void clear_restore_sigmask(void)
{
        clear_thread_flag(TIF_RESTORE_SIGMASK);
}
/* Return true if TIF_RESTORE_SIGMASK is set for the given @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK);
}
/* Return true if TIF_RESTORE_SIGMASK is set, as seen by test_thread_flag(). */
static inline bool test_restore_sigmask(void)
{
        return test_thread_flag(TIF_RESTORE_SIGMASK);
}
/*
 * Clear TIF_RESTORE_SIGMASK and report whether it was set beforehand, via
 * test_and_clear_thread_flag().
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK);
}

#else        /* TIF_RESTORE_SIGMASK */

/* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */
/* Mark the current task as needing its saved signal mask restored. */
static inline void set_restore_sigmask(void)
{
        current->restore_sigmask = true;
}
/* Reset the restore_sigmask field of the given @task. */
static inline void clear_tsk_restore_sigmask(struct task_struct *task)
{
        task->restore_sigmask = false;
}
/* Reset the restore_sigmask field of the current task. */
static inline void clear_restore_sigmask(void)
{
        current->restore_sigmask = false;
}
/* Return the current task's restore_sigmask field. */
static inline bool test_restore_sigmask(void)
{
        return current->restore_sigmask;
}
/* Return the restore_sigmask field of the given @task. */
static inline bool test_tsk_restore_sigmask(struct task_struct *task)
{
        return task->restore_sigmask;
}
/*
 * Report whether the current task had restore_sigmask set and, if it did,
 * reset it.  The field is only written when it was previously set.
 */
static inline bool test_and_clear_restore_sigmask(void)
{
        bool was_set = current->restore_sigmask;

        if (was_set)
                current->restore_sigmask = false;

        return was_set;
}
#endif

/*
 * If a signal-mask restore was requested, consume that request and install
 * current->saved_sigmask as the blocked set.  Does nothing otherwise.
 */
static inline void restore_saved_sigmask(void)
{
        if (!test_and_clear_restore_sigmask())
                return;

        __set_current_blocked(&current->saved_sigmask);
}

extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize);

/*
 * Restore the saved signal mask unless the caller was @interrupted.
 * In the interrupted case nothing is restored here; the function only
 * warns if no signal is actually pending for the current task.
 */
static inline void restore_saved_sigmask_unless(bool interrupted)
{
        if (!interrupted) {
                restore_saved_sigmask();
                return;
        }

        WARN_ON(!signal_pending(current));
}

/*
 * Pick the signal mask that should be saved for the current task:
 * saved_sigmask when a mask restore is pending, otherwise the live
 * blocked set.
 */
static inline sigset_t *sigmask_to_save(void)
{
        if (unlikely(test_restore_sigmask()))
                return &current->saved_sigmask;

        return &current->blocked;
}

/*
 * Send @sig to the pid held in cad_pid by calling kill_pid(), forwarding
 * @priv unchanged.  Returns kill_pid()'s result.
 */
static inline int kill_cad_pid(int sig, int priv)
{
        return kill_pid(cad_pid, sig, priv);
}

/* These can be the second arg to send_sig_info/send_group_sig_info.  */
#define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0)
#define SEND_SIG_PRIV        ((struct kernel_siginfo *) 1)

/*
 * Raw range check: is @sp inside the current task's alternate signal stack
 * (sas_ss_sp .. sas_ss_sp + sas_ss_size)?  The SS_AUTODISARM flag is not
 * considered here.
 *
 * The interval's open/closed ends depend on the stack direction:
 *  - CONFIG_STACK_GROWSUP: sas_ss_sp <= sp <  sas_ss_sp + sas_ss_size
 *  - otherwise:            sas_ss_sp <  sp <= sas_ss_sp + sas_ss_size
 */
static inline int __on_sig_stack(unsigned long sp)
{
#ifdef CONFIG_STACK_GROWSUP
        return sp >= current->sas_ss_sp &&
                sp - current->sas_ss_sp < current->sas_ss_size;
#else
        return sp > current->sas_ss_sp &&
                sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}

/*
 * True if we are on the alternate signal stack.
 */
static inline int on_sig_stack(unsigned long sp)
{
        /*
         * If the signal stack is SS_AUTODISARM then, by construction, we
         * can't be on the signal stack unless user code deliberately set
         * SS_AUTODISARM when we were already on it.
         *
         * This improves reliability: if user state gets corrupted such that
         * the stack pointer points very close to the end of the signal stack,
         * then this check will enable the signal to be handled anyway.
         */
        if (current->sas_ss_flags & SS_AUTODISARM)
                return 0;

        return __on_sig_stack(sp);
}

/*
 * Classify @sp relative to the current task's alternate signal stack:
 * SS_DISABLE when no stack is configured (size is zero), SS_ONSTACK when
 * on_sig_stack() says @sp is on it, and 0 otherwise.
 */
static inline int sas_ss_flags(unsigned long sp)
{
        if (current->sas_ss_size == 0)
                return SS_DISABLE;

        if (on_sig_stack(sp))
                return SS_ONSTACK;

        return 0;
}

/*
 * Reset the alternate signal stack settings of @p: zero base address and
 * size, and mark the stack SS_DISABLE.
 */
static inline void sas_ss_reset(struct task_struct *p)
{
        p->sas_ss_sp = 0;
        p->sas_ss_size = 0;
        p->sas_ss_flags = SS_DISABLE;
}

static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
{
        if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
#ifdef CONFIG_STACK_GROWSUP
                return current->sas_ss_sp;
#else
                return current->sas_ss_sp + current->sas_ss_size;
#endif
        return sp;
}

extern void __cleanup_sighand(struct sighand_struct *);
extern void flush_itimer_signals(void);

/* True when init_task's 'tasks' list contains no other entries. */
#define tasklist_empty() \
        list_empty(&init_task.tasks)

/* The task_struct linked after @p on the 'tasks' list (RCU list accessor). */
#define next_task(p) \
        list_entry_rcu((p)->tasks.next, struct task_struct, tasks)

/*
 * Iterate @p over the 'tasks' list, starting with the entry after init_task
 * and stopping when the walk returns to init_task.  init_task itself is
 * never visited by the loop body.
 */
#define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )

extern bool current_is_single_threaded(void);

/*
 * Loop header that advances @t with next_thread() and keeps going until the
 * walk arrives back at @g.
 *
 * Without tasklist/siglock it is only rcu-safe if g can't exit/exec,
 * otherwise next_thread(t) will never reach g after list_del_rcu(g).
 */
#define while_each_thread(g, t) \
        while ((t = next_thread(t)) != g)

/*
 * Iterate @t over the threads reached from @p via next_thread(), stopping
 * when the walk returns to @p.  @p itself is not visited.
 */
#define for_other_threads(p, t)        \
        for (t = p; (t = next_thread(t)) != p; )

/*
 * Iterate @t over the tasks linked on @signal->thread_head (through their
 * thread_node member) using the RCU list iterator.
 * lockdep_is_held(&tasklist_lock) is passed as the iterator's extra lockdep
 * condition.
 */
#define __for_each_thread(signal, t)        \
        list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \
                lockdep_is_held(&tasklist_lock))

/* Iterate @t over every thread on @p's signal->thread_head list. */
#define for_each_thread(p, t)                \
        __for_each_thread((p)->signal, t)

/*
 * Visit every thread @t of every process @p by nesting for_each_thread()
 * inside for_each_process().
 *
 * Careful: this is a double loop, 'break' won't work as expected.
 */
#define for_each_process_thread(p, t)        \
        for_each_process(p) for_each_thread(p, t)

typedef int (*proc_visitor)(struct task_struct *p, void *data);
void walk_process_tree(struct task_struct *top, proc_visitor, void *);

/*
 * Look up the struct pid of the requested @type for @task.
 * PIDTYPE_PID comes from task_pid(); every other type is read from the
 * pids[] array of the task's signal struct.
 */
static inline
struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
{
        if (type == PIDTYPE_PID)
                return task_pid(task);

        return task->signal->pids[type];
}

/* Return the PIDTYPE_TGID struct pid recorded in @task's signal struct. */
static inline struct pid *task_tgid(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_TGID];
}

/*
 * Return the PIDTYPE_PGID struct pid recorded in @task's signal struct.
 *
 * Without tasklist or RCU lock it is not safe to dereference
 * the result of task_pgrp/task_session even if task == current,
 * we can race with another thread doing sys_setsid/sys_setpgid.
 */
static inline struct pid *task_pgrp(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_PGID];
}

/* Return the PIDTYPE_SID struct pid recorded in @task's signal struct. */
static inline struct pid *task_session(struct task_struct *task)
{
        return task->signal->pids[PIDTYPE_SID];
}

/* Return the nr_threads counter of @task's signal struct (plain read). */
static inline int get_nr_threads(struct task_struct *task)
{
        return task->signal->nr_threads;
}

/* A task counts as thread-group leader when its exit_signal is >= 0. */
static inline bool thread_group_leader(struct task_struct *p)
{
        return p->exit_signal >= 0;
}

/* Two tasks are in the same thread group when they share one signal struct. */
static inline
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
        return p1->signal == p2->signal;
}

/*
 * Return the task linked after @p on its signal->thread_head list.
 *
 * returns NULL if p is the last thread in the thread group
 */
static inline struct task_struct *__next_thread(struct task_struct *p)
{
        return list_next_or_null_rcu(&p->signal->thread_head,
                                        &p->thread_node,
                                        struct task_struct,
                                        thread_node);
}

/*
 * Circular variant of __next_thread(): when @p is the last thread on the
 * list, wrap around to p->group_leader instead of returning NULL.
 */
static inline struct task_struct *next_thread(struct task_struct *p)
{
        struct task_struct *next = __next_thread(p);

        return next ? next : p->group_leader;
}

/*
 * Non-zero when @p is a thread-group leader and also the last entry on its
 * signal->thread_head list; zero otherwise (including for non-leaders).
 */
static inline int thread_group_empty(struct task_struct *p)
{
        if (!thread_group_leader(p))
                return 0;

        return list_is_last(&p->thread_node, &p->signal->thread_head) ? 1 : 0;
}

/* True when @p is a thread-group leader for which thread_group_empty() is false. */
#define delay_group_leader(p) \
                (thread_group_leader(p) && !thread_group_empty(p))

extern struct sighand_struct *__lock_task_sighand(struct task_struct *task,
                                                        unsigned long *flags);

/*
 * Inline wrapper around __lock_task_sighand(); returns its result
 * unchanged.  @flags is passed straight through.
 *
 * NOTE(review): the __cond_lock() expression has its value discarded and
 * looks like a static-analysis annotation tying the acquisition of
 * task->sighand->siglock to a non-NULL return -- confirm.
 */
static inline struct sighand_struct *lock_task_sighand(struct task_struct *task,
                                                       unsigned long *flags)
{
        struct sighand_struct *ret;

        ret = __lock_task_sighand(task, flags);
        (void)__cond_lock(&task->sighand->siglock, ret);
        return ret;
}

/*
 * Release task->sighand->siglock with spin_unlock_irqrestore(), using the
 * interrupt state stored in *@flags.
 */
static inline void unlock_task_sighand(struct task_struct *task,
                                                unsigned long *flags)
{
        spin_unlock_irqrestore(&task->sighand->siglock, *flags);
}

/*
 * lockdep_assert_task_sighand_held() is an out-of-line function when
 * CONFIG_LOCKDEP is enabled and an empty inline stub otherwise.
 */
#ifdef CONFIG_LOCKDEP
extern void lockdep_assert_task_sighand_held(struct task_struct *task);
#else
static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { }
#endif

/*
 * Read the rlim_cur value of resource @limit for @task.  READ_ONCE() is
 * used for the load; @limit is used as an index without a range check.
 */
static inline unsigned long task_rlimit(const struct task_struct *task,
                unsigned int limit)
{
        return READ_ONCE(task->signal->rlim[limit].rlim_cur);
}

/*
 * Read the rlim_max value of resource @limit for @task.  READ_ONCE() is
 * used for the load; @limit is used as an index without a range check.
 */
static inline unsigned long task_rlimit_max(const struct task_struct *task,
                unsigned int limit)
{
        return READ_ONCE(task->signal->rlim[limit].rlim_max);
}

/* task_rlimit() applied to the current task. */
static inline unsigned long rlimit(unsigned int limit)
{
        return task_rlimit(current, limit);
}

/* task_rlimit_max() applied to the current task. */
static inline unsigned long rlimit_max(unsigned int limit)
{
        return task_rlimit_max(current, limit);
}

#endif /* _LINUX_SCHED_SIGNAL_H */














































































































    7 



























































    8 










    8 




    8 














    6 







    6 
    3 

    4 




    3 
    4 
























    5 



    4 





    5 





    5 
    5 






    1 






    4 















    6 























    4 





    4 









    6 






    6 




    2 



    6 






    6 



























    8 


















    8 


    8 





































































    8 








    8 






    3 




    8 



    8 









    8 







    8 



    8 





    8 



    6 



    8 



    8 


    8 








    8 

    8 
















































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
// SPDX-License-Identifier: GPL-2.0-only
#undef DEBUG

/*
 * ARM performance counter support.
 *
 * Copyright (C) 2009 picoChip Designs, Ltd., Jamie Iles
 * Copyright (C) 2010 ARM Ltd., Will Deacon <will.deacon@arm.com>
 *
 * This code is based on the sparc64 perf event code, which is in turn based
 * on the x86 code.
 */
#define pr_fmt(fmt) "hw perfevents: " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/cpu_pm.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/perf/arm_pmu.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/spinlock.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>

#include <asm/irq_regs.h>

static int armpmu_count_irq_users(const int irq);

/*
 * Set of operations for one flavour of PMU interrupt.  The file defines one
 * table per combination of regular/percpu interrupt and IRQ/NMI delivery.
 */
struct pmu_irq_ops {
        /* Enable delivery of @irq. */
        void (*enable_pmuirq)(unsigned int irq);
        /* Disable delivery of @irq. */
        void (*disable_pmuirq)(unsigned int irq);
        /* Release @irq on behalf of @cpu; @devid is the per-CPU dev_id. */
        void (*free_pmuirq)(unsigned int irq, int cpu, void __percpu *devid);
};

/* Free a regular IRQ, using @cpu's instance of the per-CPU @devid as dev_id. */
static void armpmu_free_pmuirq(unsigned int irq, int cpu, void __percpu *devid)
{
        free_irq(irq, per_cpu_ptr(devid, cpu));
}

/* Operations for a regular (non-percpu) interrupt delivered as an IRQ. */
static const struct pmu_irq_ops pmuirq_ops = {
        .enable_pmuirq = enable_irq,
        .disable_pmuirq = disable_irq_nosync,
        .free_pmuirq = armpmu_free_pmuirq
};

/* Free a regular NMI, using @cpu's instance of the per-CPU @devid as dev_id. */
static void armpmu_free_pmunmi(unsigned int irq, int cpu, void __percpu *devid)
{
        free_nmi(irq, per_cpu_ptr(devid, cpu));
}

/* Operations for a regular (non-percpu) interrupt delivered as an NMI. */
static const struct pmu_irq_ops pmunmi_ops = {
        .enable_pmuirq = enable_nmi,
        .disable_pmuirq = disable_nmi_nosync,
        .free_pmuirq = armpmu_free_pmunmi
};

/*
 * Adapter giving enable_percpu_irq() the single-argument signature used by
 * struct pmu_irq_ops; the trigger type is always IRQ_TYPE_NONE.
 */
static void armpmu_enable_percpu_pmuirq(unsigned int irq)
{
        enable_percpu_irq(irq, IRQ_TYPE_NONE);
}

/*
 * Free a percpu IRQ, but only when exactly one CPU is still recorded as
 * using @irq (armpmu_count_irq_users() == 1).  @cpu is not used.
 */
static void armpmu_free_percpu_pmuirq(unsigned int irq, int cpu,
                                   void __percpu *devid)
{
        if (armpmu_count_irq_users(irq) == 1)
                free_percpu_irq(irq, devid);
}

/* Operations for a percpu interrupt delivered as an IRQ. */
static const struct pmu_irq_ops percpu_pmuirq_ops = {
        .enable_pmuirq = armpmu_enable_percpu_pmuirq,
        .disable_pmuirq = disable_percpu_irq,
        .free_pmuirq = armpmu_free_percpu_pmuirq
};

/*
 * Prepare and then enable a percpu NMI.  The NMI is only enabled when
 * prepare_percpu_nmi() returns 0; a failure there is silently ignored.
 */
static void armpmu_enable_percpu_pmunmi(unsigned int irq)
{
        if (!prepare_percpu_nmi(irq))
                enable_percpu_nmi(irq, IRQ_TYPE_NONE);
}

/* Disable a percpu NMI and then tear it down (reverse of the enable path). */
static void armpmu_disable_percpu_pmunmi(unsigned int irq)
{
        disable_percpu_nmi(irq);
        teardown_percpu_nmi(irq);
}

/*
 * Free a percpu NMI, but only when exactly one CPU is still recorded as
 * using @irq (armpmu_count_irq_users() == 1).  @cpu is not used.
 */
static void armpmu_free_percpu_pmunmi(unsigned int irq, int cpu,
                                      void __percpu *devid)
{
        if (armpmu_count_irq_users(irq) == 1)
                free_percpu_nmi(irq, devid);
}

/* Operations for a percpu interrupt delivered as an NMI. */
static const struct pmu_irq_ops percpu_pmunmi_ops = {
        .enable_pmuirq = armpmu_enable_percpu_pmunmi,
        .disable_pmuirq = armpmu_disable_percpu_pmunmi,
        .free_pmuirq = armpmu_free_percpu_pmunmi
};

/*
 * Per-CPU bookkeeping for the PMU interrupt:
 *  - cpu_armpmu:  PMU pointer; its per-CPU address is the dev_id handed to
 *                 the request/free calls and dereferenced by
 *                 armpmu_dispatch_irq().
 *  - cpu_irq:     interrupt number recorded for the CPU, 0 when none.
 *  - cpu_irq_ops: ops table matching that interrupt, NULL when none.
 */
static DEFINE_PER_CPU(struct arm_pmu *, cpu_armpmu);
static DEFINE_PER_CPU(int, cpu_irq);
static DEFINE_PER_CPU(const struct pmu_irq_ops *, cpu_irq_ops);

/* Set to true by armpmu_request_irq() once an NMI request has succeeded. */
static bool has_nmi;

/*
 * Return the all-ones mask matching the counter width of @event.
 *
 * The width is selected by the ARMPMU_EVT_* bits in event->hw.flags, with
 * the widest flag taking precedence: 64, 63 or 47 bits.  Events with none
 * of those flags use a 32-bit counter.
 */
static inline u64 arm_pmu_event_max_period(struct perf_event *event)
{
        if (event->hw.flags & ARMPMU_EVT_64BIT)
                return GENMASK_ULL(63, 0);

        if (event->hw.flags & ARMPMU_EVT_63BIT)
                return GENMASK_ULL(62, 0);

        if (event->hw.flags & ARMPMU_EVT_47BIT)
                return GENMASK_ULL(46, 0);

        return GENMASK_ULL(31, 0);
}

/*
 * Translate a PERF_TYPE_HW_CACHE config into a hardware event number.
 *
 * @config packs the cache type in bits [7:0], the operation in bits [15:8]
 * and the result in bits [23:16].
 *
 * Returns the entry from @cache_map, -EINVAL if any of the three fields is
 * out of range, or -ENOENT if there is no map or the entry is
 * CACHE_OP_UNSUPPORTED.
 */
static int
armpmu_map_cache_event(const unsigned (*cache_map)
                                      [PERF_COUNT_HW_CACHE_MAX]
                                      [PERF_COUNT_HW_CACHE_OP_MAX]
                                      [PERF_COUNT_HW_CACHE_RESULT_MAX],
                       u64 config)
{
        const unsigned int cache_type   = (config >>  0) & 0xff;
        const unsigned int cache_op     = (config >>  8) & 0xff;
        const unsigned int cache_result = (config >> 16) & 0xff;
        unsigned int hw_event;

        if (cache_type >= PERF_COUNT_HW_CACHE_MAX ||
            cache_op >= PERF_COUNT_HW_CACHE_OP_MAX ||
            cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
                return -EINVAL;

        if (!cache_map)
                return -ENOENT;

        hw_event = (int)(*cache_map)[cache_type][cache_op][cache_result];
        if (hw_event == CACHE_OP_UNSUPPORTED)
                return -ENOENT;

        return hw_event;
}

/*
 * Translate a PERF_TYPE_HARDWARE config into a hardware event number.
 *
 * Returns the entry from @event_map, -EINVAL if @config is not below
 * PERF_COUNT_HW_MAX, or -ENOENT if there is no map or the entry is
 * HW_OP_UNSUPPORTED.
 */
static int
armpmu_map_hw_event(const unsigned (*event_map)[PERF_COUNT_HW_MAX], u64 config)
{
        int mapping;

        if (config >= PERF_COUNT_HW_MAX)
                return -EINVAL;

        if (!event_map)
                return -ENOENT;

        mapping = (*event_map)[config];
        if (mapping == HW_OP_UNSUPPORTED)
                return -ENOENT;

        return mapping;
}

/*
 * Map a raw event: keep only the bits of @config selected by
 * @raw_event_mask and return them as an int.
 */
static int
armpmu_map_raw_event(u32 raw_event_mask, u64 config)
{
        return (int)(config & raw_event_mask);
}

/*
 * Map the attr.type/attr.config pair of @event to a hardware event number.
 *
 * Events whose type equals the PMU's own type, and PERF_TYPE_RAW events,
 * are treated as raw encodings masked with @raw_event_mask.
 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE are looked up in @event_map and
 * @cache_map respectively.  Any other type yields -ENOENT.
 *
 * Returns a non-negative event number, or a negative errno from the
 * selected mapping helper.
 */
int
armpmu_map_event(struct perf_event *event,
                 const unsigned (*event_map)[PERF_COUNT_HW_MAX],
                 const unsigned (*cache_map)
                                [PERF_COUNT_HW_CACHE_MAX]
                                [PERF_COUNT_HW_CACHE_OP_MAX]
                                [PERF_COUNT_HW_CACHE_RESULT_MAX],
                 u32 raw_event_mask)
{
        const u64 config = event->attr.config;
        const int type = event->attr.type;

        /* The PMU-specific type wins over the generic types below. */
        if (type == event->pmu->type || type == PERF_TYPE_RAW)
                return armpmu_map_raw_event(raw_event_mask, config);

        if (type == PERF_TYPE_HARDWARE)
                return armpmu_map_hw_event(event_map, config);

        if (type == PERF_TYPE_HW_CACHE)
                return armpmu_map_cache_event(cache_map, config);

        return -ENOENT;
}

/*
 * Program the hardware counter of @event so that it overflows after the
 * remaining part of the sample period.
 *
 * The counter is written with the negated number of events left, masked to
 * the counter width, and the same (unmasked) value is recorded in
 * hwc->prev_count.
 *
 * Returns 1 if period_left had run out and was reloaded from the sample
 * period, 0 otherwise.
 */
int armpmu_event_set_period(struct perf_event *event)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;
        s64 left = local64_read(&hwc->period_left);
        s64 period = hwc->sample_period;
        u64 max_period;
        int ret = 0;

        max_period = arm_pmu_event_max_period(event);
        /* More than a whole period overdue: start a fresh period. */
        if (unlikely(left <= -period)) {
                left = period;
                local64_set(&hwc->period_left, left);
                hwc->last_period = period;
                ret = 1;
        }

        /* Period expired: carry the overshoot into the next period. */
        if (unlikely(left <= 0)) {
                left += period;
                local64_set(&hwc->period_left, left);
                hwc->last_period = period;
                ret = 1;
        }

        /*
         * Limit the maximum period to prevent the counter value
         * from overtaking the one we are about to program. In
         * effect we are reducing max_period to account for
         * interrupt latency (and we are being very conservative).
         */
        if (left > (max_period >> 1))
                left = (max_period >> 1);

        local64_set(&hwc->prev_count, (u64)-left);

        armpmu->write_counter(event, (u64)(-left) & max_period);

        perf_event_update_userpage(event);

        return ret;
}

/*
 * Fold the events counted since the last update into @event.
 *
 * The hardware counter is read and swapped into hwc->prev_count with a
 * compare-and-exchange; the read is retried if prev_count changed in the
 * meantime.  The difference, masked to the counter width, is added to
 * event->count and subtracted from hwc->period_left.
 *
 * Returns the raw counter value that was read.
 */
u64 armpmu_event_update(struct perf_event *event)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;
        u64 width_mask = arm_pmu_event_max_period(event);
        u64 before, now, delta;

        do {
                before = local64_read(&hwc->prev_count);
                now = armpmu->read_counter(event);
        } while (local64_cmpxchg(&hwc->prev_count, before, now) != before);

        delta = (now - before) & width_mask;

        local64_add(delta, &event->count);
        local64_sub(delta, &hwc->period_left);

        return now;
}

/* Bring @event's count up to date; the raw counter value is discarded. */
static void
armpmu_read(struct perf_event *event)
{
        armpmu_event_update(event);
}

/*
 * Stop counting for @event.  An event already marked PERF_HES_STOPPED is
 * left alone; otherwise the counter is disabled, its value is folded into
 * the event, and the state gains PERF_HES_STOPPED | PERF_HES_UPTODATE.
 *
 * ARM pmu always has to update the counter, so @flags (PERF_EF_UPDATE) is
 * ignored, see comments in armpmu_start().
 */
static void
armpmu_stop(struct perf_event *event, int flags)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;

        if (hwc->state & PERF_HES_STOPPED)
                return;

        armpmu->disable(event);
        armpmu_event_update(event);
        hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
}

/*
 * (Re)start counting for @event: clear the state flags, reprogram the
 * period and enable the counter.
 */
static void armpmu_start(struct perf_event *event, int flags)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;

        /*
         * ARM pmu always has to reprogram the period, so PERF_EF_RELOAD is
         * otherwise ignored (see the comment below); it is only used to
         * sanity-check that the count was brought up to date first.
         */
        WARN_ON_ONCE((flags & PERF_EF_RELOAD) &&
                     !(hwc->state & PERF_HES_UPTODATE));

        hwc->state = 0;

        /*
         * Set the period again. Some counters can't be stopped, so when we
         * were stopped we simply disabled the IRQ source and the counter
         * may have been left counting. If we don't do this step then we may
         * get an interrupt too soon or *way* too late if the overflow has
         * happened since disabling.
         */
        armpmu_event_set_period(event);
        armpmu->enable(event);
}

/*
 * Remove @event from this CPU's PMU: stop it, drop it from the per-CPU
 * events[] slot it occupied, release the counter index through the
 * driver's clear_event_idx() hook and mark the event as unplaced
 * (idx == -1).  @flags is not consulted.
 */
static void
armpmu_del(struct perf_event *event, int flags)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
        struct hw_perf_event *hwc = &event->hw;
        /* Snapshot the slot now; hwc->idx is reset at the end. */
        int idx = hwc->idx;

        armpmu_stop(event, PERF_EF_UPDATE);
        hw_events->events[idx] = NULL;
        armpmu->clear_event_idx(hw_events, event);
        perf_event_update_userpage(event);
        /* Clear the allocated counter */
        hwc->idx = -1;
}

/*
 * Place @event on a hardware counter of the current CPU.
 *
 * Returns 0 on success, -ENOENT when the current CPU is not one this PMU
 * supports, or the negative value returned by the driver's get_event_idx()
 * hook when no counter is available.  With PERF_EF_START in @flags the
 * event is also started.
 */
static int
armpmu_add(struct perf_event *event, int flags)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct pmu_hw_events *cpu_events = this_cpu_ptr(armpmu->hw_events);
        struct hw_perf_event *hwc = &event->hw;
        int slot;

        /* An event following a process won't be stopped earlier */
        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
                return -ENOENT;

        /* No free counter: give up early and report the driver's error. */
        slot = armpmu->get_event_idx(cpu_events, event);
        if (slot < 0)
                return slot;

        /* A freshly allocated counter must not already host an event. */
        WARN_ON_ONCE(cpu_events->events[slot]);

        hwc->idx = slot;
        cpu_events->events[slot] = event;

        hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
        if (flags & PERF_EF_START)
                armpmu_start(event, PERF_EF_RELOAD);

        /* Propagate our changes to the userspace mapping. */
        perf_event_update_userpage(event);

        return 0;
}

/*
 * Check whether @event could be scheduled on @pmu together with whatever
 * is already accounted for in @hw_events.
 *
 * Returns non-zero when the event is acceptable, 0 when it is not.
 * Software events and events that are not going to be scheduled are always
 * acceptable; other events must belong to @pmu and obtain a counter index
 * from the driver's get_event_idx() hook.
 */
static int
validate_event(struct pmu *pmu, struct pmu_hw_events *hw_events,
                               struct perf_event *event)
{
        if (is_software_event(event))
                return 1;

        /*
         * Reject groups spanning multiple HW PMUs (e.g. CPU + CCI). The
         * core perf code won't check that the pmu->ctx == leader->ctx
         * until after pmu->event_init(event).
         */
        if (event->pmu != pmu)
                return 0;

        /* States below OFF, and OFF without enable_on_exec, need no counter. */
        if (event->state < PERF_EVENT_STATE_OFF ||
            (event->state == PERF_EVENT_STATE_OFF &&
             !event->attr.enable_on_exec))
                return 1;

        return to_arm_pmu(event->pmu)->get_event_idx(hw_events, event) >= 0;
}

static int
validate_group(struct perf_event *event)
{
        struct perf_event *sibling, *leader = event->group_leader;
        struct pmu_hw_events fake_pmu;

        /*
         * Initialise the fake PMU. We only need to populate the
         * used_mask for the purposes of validation.
         */
        memset(&fake_pmu.used_mask, 0, sizeof(fake_pmu.used_mask));

        if (!validate_event(event->pmu, &fake_pmu, leader))
                return -EINVAL;

        if (event == leader)
                return 0;

        for_each_sibling_event(sibling, leader) {
                if (!validate_event(event->pmu, &fake_pmu, sibling))
                        return -EINVAL;
        }

        if (!validate_event(event->pmu, &fake_pmu, event))
                return -EINVAL;

        return 0;
}

/*
 * Common interrupt entry point: forward to the PMU's handle_irq() hook and
 * report how long the handler ran to perf_sample_event_took().
 *
 * Returns IRQ_NONE (after a one-time warning) when no PMU is registered in
 * the slot @dev points at, otherwise the handler's return value.
 */
static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
{
        /*
         * we request the IRQ with a (possibly percpu) struct arm_pmu**, but
         * the handlers expect a struct arm_pmu*. The percpu_irq framework will
         * do any necessary shifting, we just need to perform the first
         * dereference.
         */
        struct arm_pmu *armpmu = *(void **)dev;
        u64 t_enter, t_exit;
        int handled;

        if (WARN_ON_ONCE(!armpmu))
                return IRQ_NONE;

        t_enter = sched_clock();
        handled = armpmu->handle_irq(armpmu);
        t_exit = sched_clock();

        perf_sample_event_took(t_exit - t_enter);
        return handled;
}

/*
 * Initialise the hardware state (event->hw) of a new event.
 *
 * Maps the event through the driver's map_event() hook, resets the hw
 * fields, applies the optional set_event_filter() hook, stores the mapped
 * encoding in config_base and, for non-sampling events, installs a default
 * period of half the counter range.
 *
 * Returns 0 on success, or a negative errno from map_event(),
 * set_event_filter() or validate_group().
 */
static int
__hw_perf_event_init(struct perf_event *event)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);
        struct hw_perf_event *hwc = &event->hw;
        int mapping, ret;

        /* Cleared before map_event() runs. */
        hwc->flags = 0;
        mapping = armpmu->map_event(event);

        if (mapping < 0) {
                pr_debug("event %x:%llx not supported\n", event->attr.type,
                         event->attr.config);
                return mapping;
        }

        /*
         * We don't assign an index until we actually place the event onto
         * hardware. Use -1 to signify that we haven't decided where to put it
         * yet. For SMP systems, each core has its own PMU so we can't do any
         * clever allocation or constraints checking at this point.
         */
        hwc->idx                = -1;
        hwc->config_base        = 0;
        hwc->config                = 0;
        hwc->event_base                = 0;

        /*
         * Check whether we need to exclude the counter from certain modes.
         */
        if (armpmu->set_event_filter) {
                ret = armpmu->set_event_filter(hwc, &event->attr);
                if (ret)
                        return ret;
        }

        /*
         * Store the event encoding into the config_base field.
         */
        hwc->config_base            |= (unsigned long)mapping;

        if (!is_sampling_event(event)) {
                /*
                 * For non-sampling runs, limit the sample_period to half
                 * of the counter width. That way, the new counter value
                 * is far less likely to overtake the previous one unless
                 * you have some serious IRQ latency issues.
                 */
                hwc->sample_period  = arm_pmu_event_max_period(event) >> 1;
                hwc->last_period    = hwc->sample_period;
                local64_set(&hwc->period_left, hwc->sample_period);
        }

        return validate_group(event);
}

static int armpmu_event_init(struct perf_event *event)
{
        struct arm_pmu *armpmu = to_arm_pmu(event->pmu);

        /*
         * Reject CPU-affine events for CPUs that are of a different class to
         * that which this PMU handles. Process-following events (where
         * event->cpu == -1) can be migrated between CPUs, and thus we have to
         * reject them later (in armpmu_add) if they're scheduled on a
         * different class of CPU.
         */
        if (event->cpu != -1 &&
                !cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
                return -ENOENT;

        /* does not support taken branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        return __hw_perf_event_init(event);
}

/*
 * Start the PMU on the current CPU, provided that CPU is supported by this
 * PMU and at least one counter is marked in use in its used_mask.
 */
static void armpmu_enable(struct pmu *pmu)
{
        struct arm_pmu *armpmu = to_arm_pmu(pmu);
        struct pmu_hw_events *cpu_events = this_cpu_ptr(armpmu->hw_events);
        bool idle = bitmap_empty(cpu_events->used_mask, ARMPMU_MAX_HWEVENTS);

        /* For task-bound events we may be called on other CPUs */
        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
                return;

        if (!idle)
                armpmu->start(armpmu);
}

/*
 * Stop the PMU on the current CPU.  Nothing is done when the current CPU is
 * not one this PMU supports, which can happen for task-bound events.
 */
static void armpmu_disable(struct pmu *pmu)
{
        struct arm_pmu *armpmu = to_arm_pmu(pmu);

        if (cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
                armpmu->stop(armpmu);
}

/*
 * In heterogeneous systems, events are specific to a particular
 * microarchitecture, and aren't suitable for another. Thus, only match CPUs of
 * the same microarchitecture.
 *
 * Returns true when @cpu is NOT in this PMU's supported_cpus mask, i.e.
 * when events of this PMU should be filtered out on @cpu.
 */
static bool armpmu_filter(struct pmu *pmu, int cpu)
{
        struct arm_pmu *armpmu = to_arm_pmu(pmu);
        return !cpumask_test_cpu(cpu, &armpmu->supported_cpus);
}

/*
 * Show handler for the read-only "cpus" device attribute: prints the PMU's
 * supported_cpus mask into @buf with cpumap_print_to_pagebuf() (list
 * argument set to true) and returns that call's result.
 */
static ssize_t cpus_show(struct device *dev,
                         struct device_attribute *attr, char *buf)
{
        struct arm_pmu *armpmu = to_arm_pmu(dev_get_drvdata(dev));
        return cpumap_print_to_pagebuf(true, buf, &armpmu->supported_cpus);
}

static DEVICE_ATTR_RO(cpus);

/* NULL-terminated attribute list; currently only the "cpus" attribute. */
static struct attribute *armpmu_common_attrs[] = {
        &dev_attr_cpus.attr,
        NULL,
};

/* Attribute group wrapping armpmu_common_attrs (no group name is set). */
static const struct attribute_group armpmu_common_attr_group = {
        .attrs = armpmu_common_attrs,
};

static int armpmu_count_irq_users(const int irq)
{
        int cpu, count = 0;

        for_each_possible_cpu(cpu) {
                if (per_cpu(cpu_irq, cpu) == irq)
                        count++;
        }

        return count;
}

/*
 * Find the ops table already associated with @irq: scan the possible CPUs
 * and return the first non-NULL cpu_irq_ops belonging to a CPU whose
 * cpu_irq equals @irq.  Returns NULL when there is none.
 */
static const struct pmu_irq_ops *armpmu_find_irq_ops(int irq)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                const struct pmu_irq_ops *ops;

                if (per_cpu(cpu_irq, cpu) != irq)
                        continue;

                ops = per_cpu(cpu_irq_ops, cpu);
                if (ops)
                        return ops;
        }

        return NULL;
}

void armpmu_free_irq(int irq, int cpu)
{
        if (per_cpu(cpu_irq, cpu) == 0)
                return;
        if (WARN_ON(irq != per_cpu(cpu_irq, cpu)))
                return;

        per_cpu(cpu_irq_ops, cpu)->free_pmuirq(irq, cpu, &cpu_armpmu);

        per_cpu(cpu_irq, cpu) = 0;
        per_cpu(cpu_irq_ops, cpu) = NULL;
}

/*
 * armpmu_request_irq() - claim PMU interrupt @irq on behalf of @cpu.
 * @irq: interrupt number; 0 means "no interrupt" and is accepted as a no-op.
 * @cpu: CPU the interrupt belongs to.
 *
 * Three cases are handled:
 *  - @irq is not a per-CPU devid interrupt: force its affinity to @cpu and
 *    request it, as an NMI first and as a normal interrupt if that fails.
 *  - @irq is a per-CPU devid interrupt nobody has recorded yet: request it
 *    once, again preferring an NMI.
 *  - @irq is a per-CPU devid interrupt some CPU already recorded: reuse the
 *    irq ops stored for that CPU.
 *
 * On success the irq and the matching ops are stored in @cpu's cpu_irq and
 * cpu_irq_ops slots.
 *
 * Return: 0 on success (or when @irq is 0), otherwise the error from the
 * failing affinity/request call, or -EINVAL when the ops of an already
 * requested per-CPU irq cannot be found.
 */
int armpmu_request_irq(int irq, int cpu)
{
        int err = 0;
        const irq_handler_t handler = armpmu_dispatch_irq;
        const struct pmu_irq_ops *irq_ops;

        if (!irq)
                return 0;

        if (!irq_is_percpu_devid(irq)) {
                unsigned long irq_flags;

                err = irq_force_affinity(irq, cpumask_of(cpu));

                /* An affinity failure is only treated as fatal with >1 CPU */
                if (err && num_possible_cpus() > 1) {
                        /* cpu is an int, so print it with %d */
                        pr_warn("unable to set irq affinity (irq=%d, cpu=%d)\n",
                                irq, cpu);
                        goto err_out;
                }

                irq_flags = IRQF_PERCPU |
                            IRQF_NOBALANCING | IRQF_NO_AUTOEN |
                            IRQF_NO_THREAD;

                err = request_nmi(irq, handler, irq_flags, "arm-pmu",
                                  per_cpu_ptr(&cpu_armpmu, cpu));

                /* If cannot get an NMI, get a normal interrupt */
                if (err) {
                        err = request_irq(irq, handler, irq_flags, "arm-pmu",
                                          per_cpu_ptr(&cpu_armpmu, cpu));
                        irq_ops = &pmuirq_ops;
                } else {
                        has_nmi = true;
                        irq_ops = &pmunmi_ops;
                }
        } else if (armpmu_count_irq_users(irq) == 0) {
                /* First CPU to ask for this per-CPU devid irq */
                err = request_percpu_nmi(irq, handler, "arm-pmu", &cpu_armpmu);

                /* If cannot get an NMI, get a normal interrupt */
                if (err) {
                        err = request_percpu_irq(irq, handler, "arm-pmu",
                                                 &cpu_armpmu);
                        irq_ops = &percpu_pmuirq_ops;
                } else {
                        has_nmi = true;
                        irq_ops = &percpu_pmunmi_ops;
                }
        } else {
                /* Per cpudevid irq was already requested by another CPU */
                irq_ops = armpmu_find_irq_ops(irq);

                if (WARN_ON(!irq_ops))
                        err = -EINVAL;
        }

        if (err)
                goto err_out;

        /* Record the irq and its ops so free/hotplug paths can find them */
        per_cpu(cpu_irq, cpu) = irq;
        per_cpu(cpu_irq_ops, cpu) = irq_ops;
        return 0;

err_out:
        pr_err("unable to request IRQ%d for ARM PMU counters\n", irq);
        return err;
}

/* Return the irq stored in @cpu's instance of @pmu's per-CPU hw_events. */
static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu)
{
        return per_cpu(pmu->hw_events->irq, cpu);
}

/*
 * Report whether any PMU interrupt was successfully requested as an NMI;
 * has_nmi is set by armpmu_request_irq() when an NMI request succeeds.
 */
bool arm_pmu_irq_is_nmi(void)
{
        return has_nmi;
}

/*
 * A CPU that goes offline loses all of its PMU hardware context, and some
 * PMU registers are UNKNOWN at reset. So when a CPU is hotplugged back in,
 * explicitly reset the PMU rather than risk reading junk out of it.
 *
 * CPUs outside the PMU's supported mask are ignored. Always returns 0.
 */
static int arm_perf_starting_cpu(unsigned int cpu, struct hlist_node *node)
{
        struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node);

        if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) {
                int irq;

                if (pmu->reset)
                        pmu->reset(pmu);

                per_cpu(cpu_armpmu, cpu) = pmu;

                irq = armpmu_get_cpu_irq(pmu, cpu);
                if (irq)
                        per_cpu(cpu_irq_ops, cpu)->enable_pmuirq(irq);
        }

        return 0;
}

/*
 * Hotplug teardown callback: for a CPU in the PMU's supported mask, disable
 * its PMU irq (if it has one) and clear its cpu_armpmu slot.
 * Always returns 0.
 */
static int arm_perf_teardown_cpu(unsigned int cpu, struct hlist_node *node)
{
        struct arm_pmu *pmu = hlist_entry_safe(node, struct arm_pmu, node);

        if (cpumask_test_cpu(cpu, &pmu->supported_cpus)) {
                int irq = armpmu_get_cpu_irq(pmu, cpu);

                if (irq)
                        per_cpu(cpu_irq_ops, cpu)->disable_pmuirq(irq);

                per_cpu(cpu_armpmu, cpu) = NULL;
        }

        return 0;
}

#ifdef CONFIG_CPU_PM
/*
 * Walk every counter index set in armpmu->cntr_mask and, for each one that
 * has an event on this CPU, stop it (CPU_PM_ENTER) or restart it
 * (CPU_PM_EXIT / CPU_PM_ENTER_FAILED). Any other @cmd leaves events alone.
 */
static void cpu_pm_pmu_setup(struct arm_pmu *armpmu, unsigned long cmd)
{
        struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
        int idx;

        for_each_set_bit(idx, armpmu->cntr_mask, ARMPMU_MAX_HWEVENTS) {
                struct perf_event *event = hw_events->events[idx];

                if (!event)
                        continue;

                if (cmd == CPU_PM_ENTER) {
                        /* Stop and update the counter */
                        armpmu_stop(event, PERF_EF_UPDATE);
                } else if (cmd == CPU_PM_EXIT || cmd == CPU_PM_ENTER_FAILED) {
                        /* Restore and enable the counter */
                        armpmu_start(event, PERF_EF_RELOAD);
                }
        }
}

/*
 * CPU PM notifier callback for one ARM PMU.
 *
 * Returns NOTIFY_DONE when the current CPU is not supported by the PMU or
 * when @cmd is not one of the handled commands (and counters are in use);
 * NOTIFY_OK otherwise.
 */
static int cpu_pm_pmu_notify(struct notifier_block *b, unsigned long cmd,
                             void *v)
{
        struct arm_pmu *armpmu = container_of(b, struct arm_pmu, cpu_pm_nb);
        struct pmu_hw_events *hw_events = this_cpu_ptr(armpmu->hw_events);
        bool in_use = !bitmap_empty(hw_events->used_mask, ARMPMU_MAX_HWEVENTS);

        if (!cpumask_test_cpu(smp_processor_id(), &armpmu->supported_cpus))
                return NOTIFY_DONE;

        /*
         * The PMU registers are reset on every power-up, whether or not
         * any events are running.
         */
        if (cmd == CPU_PM_EXIT && armpmu->reset)
                armpmu->reset(armpmu);

        if (!in_use)
                return NOTIFY_OK;

        if (cmd == CPU_PM_ENTER) {
                armpmu->stop(armpmu);
                cpu_pm_pmu_setup(armpmu, cmd);
        } else if (cmd == CPU_PM_EXIT || cmd == CPU_PM_ENTER_FAILED) {
                cpu_pm_pmu_setup(armpmu, cmd);
                armpmu->start(armpmu);
        } else {
                return NOTIFY_DONE;
        }

        return NOTIFY_OK;
}

static int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu)
{
        cpu_pmu->cpu_pm_nb.notifier_call = cpu_pm_pmu_notify;
        return cpu_pm_register_notifier(&cpu_pmu->cpu_pm_nb);
}

/* Remove the PMU's notifier block from the CPU PM notifier chain. */
static void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu)
{
        struct notifier_block *nb = &cpu_pmu->cpu_pm_nb;

        cpu_pm_unregister_notifier(nb);
}
#else
/* Without CONFIG_CPU_PM there is nothing to (un)register: no-op stubs. */
static inline int cpu_pm_pmu_register(struct arm_pmu *cpu_pmu) { return 0; }
static inline void cpu_pm_pmu_unregister(struct arm_pmu *cpu_pmu) { }
#endif

/*
 * Hook @cpu_pmu into CPU hotplug (CPUHP_AP_PERF_ARM_STARTING) and into the
 * CPU PM notifier chain. If the second step fails the hotplug instance is
 * removed again. Returns 0 on success or the error of the failing step.
 */
static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
{
        int err;

        err = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_STARTING,
                                       &cpu_pmu->node);
        if (err)
                return err;

        err = cpu_pm_pmu_register(cpu_pmu);
        if (err) {
                /* Undo the hotplug registration done above */
                cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING,
                                                    &cpu_pmu->node);
        }

        return err;
}

/*
 * Undo cpu_pmu_init(): drop the CPU PM notifier first, then the hotplug
 * instance.
 */
static void cpu_pmu_destroy(struct arm_pmu *cpu_pmu)
{
        struct hlist_node *hp_node = &cpu_pmu->node;

        cpu_pm_pmu_unregister(cpu_pmu);
        cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_STARTING,
                                            hp_node);
}

struct arm_pmu *armpmu_alloc(void)
{
        struct arm_pmu *pmu;
        int cpu;

        pmu = kzalloc(sizeof(*pmu), GFP_KERNEL);
        if (!pmu)
                goto out;

        pmu->hw_events = alloc_percpu_gfp(struct pmu_hw_events, GFP_KERNEL);
        if (!pmu->hw_events) {
                pr_info("failed to allocate per-cpu PMU data.\n");
                goto out_free_pmu;
        }

        pmu->pmu = (struct pmu) {
                .pmu_enable        = armpmu_enable,
                .pmu_disable        = armpmu_disable,
                .event_init        = armpmu_event_init,
                .add                = armpmu_add,
                .del                = armpmu_del,
                .start                = armpmu_start,
                .stop                = armpmu_stop,
                .read                = armpmu_read,
                .filter                = armpmu_filter,
                .attr_groups        = pmu->attr_groups,
                /*
                 * This is a CPU PMU potentially in a heterogeneous
                 * configuration (e.g. big.LITTLE) so
                 * PERF_PMU_CAP_EXTENDED_HW_TYPE is required to open
                 * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE events on a
                 * specific PMU.
                 */
                .capabilities        = PERF_PMU_CAP_EXTENDED_REGS |
                                  PERF_PMU_CAP_EXTENDED_HW_TYPE,
        };

        pmu->attr_groups[ARMPMU_ATTR_GROUP_COMMON] =
                &armpmu_common_attr_group;

        for_each_possible_cpu(cpu) {
                struct pmu_hw_events *events;

                events = per_cpu_ptr(pmu->hw_events, cpu);
                events->percpu_pmu = pmu;
        }

        return pmu;

out_free_pmu:
        kfree(pmu);
out:
        return NULL;
}

/*
 * Release a PMU obtained from armpmu_alloc(): free the per-CPU hw_events
 * first, then the arm_pmu itself. @pmu is dereferenced, so it must not be
 * NULL.
 */
void armpmu_free(struct arm_pmu *pmu)
{
        free_percpu(pmu->hw_events);
        kfree(pmu);
}

/*
 * Register @pmu with the hotplug/CPU PM infrastructure and with perf.
 *
 * PMUs without a set_event_filter callback are flagged
 * PERF_PMU_CAP_NO_EXCLUDE. If perf registration fails, the hotplug/CPU PM
 * setup is torn down again.
 *
 * Returns 0 on success or the error of the failing step.
 */
int armpmu_register(struct arm_pmu *pmu)
{
        int ret = cpu_pmu_init(pmu);

        if (ret)
                return ret;

        if (!pmu->set_event_filter)
                pmu->pmu.capabilities |= PERF_PMU_CAP_NO_EXCLUDE;

        ret = perf_pmu_register(&pmu->pmu, pmu->name, -1);
        if (ret) {
                cpu_pmu_destroy(pmu);
                return ret;
        }

        pr_info("enabled with %s PMU driver, %d (%*pb) counters available%s\n",
                pmu->name, bitmap_weight(pmu->cntr_mask, ARMPMU_MAX_HWEVENTS),
                ARMPMU_MAX_HWEVENTS, &pmu->cntr_mask,
                has_nmi ? ", using NMIs" : "");

        kvm_host_pmu_init(pmu);

        return 0;
}

/*
 * Set up the multi-instance CPU hotplug state that ARM PMU instances attach
 * to. Logs an error and returns it if the state cannot be set up.
 */
static int arm_pmu_hp_init(void)
{
        int ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_STARTING,
                                          "perf/arm/pmu:starting",
                                          arm_perf_starting_cpu,
                                          arm_perf_teardown_cpu);

        if (!ret)
                return 0;

        pr_err("CPU hotplug notifier for ARM PMU could not be registered: %d\n",
               ret);
        return ret;
}
subsys_initcall(arm_pmu_hp_init);
































































































































































































































































































































































































































    3 


    3 






















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
// SPDX-License-Identifier: GPL-2.0

#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
#include <net/busy_poll.h>
#include <net/net_debug.h>
#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/page_pool/types.h>
#include <net/page_pool/memory_provider.h>
#include <net/sock.h>

#include "page_pool_priv.h"
#include "netdev-genl-gen.h"

/*
 * Registry of page pools, indexed by pool->user.id; entries are added by
 * page_pool_list() and removed by page_pool_unlist().
 */
static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev,
 *        pool->user.
 * Ordering: inside rtnl_lock
 */
DEFINE_MUTEX(page_pools_lock);

/* Page pools are only reachable from user space (via netlink) if they are
 * linked to a netdev at creation time. Following page pool "visibility"
 * states are possible:
 *  - normal
 *    - user.list: linked to real netdev, netdev: real netdev
 *  - orphaned - real netdev has disappeared
 *    - user.list: linked to lo, netdev: lo
 *  - invisible - either (a) created without netdev linking, (b) unlisted due
 *      to error, or (c) the entire namespace which owned this pool disappeared
 *    - user.list: unhashed, netdev: unknown
 */

/*
 * Callback that writes one page pool's netlink message into @rsp.
 * The get/dump helpers below treat a non-zero return as failure.
 */
typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool,
                             const struct genl_info *info);

/*
 * Handle a "get" request for the page pool with the given @id.
 *
 * Under page_pools_lock, the pool is looked up and must be listed and
 * belong to the requester's netns; a reply message is then allocated and
 * filled via @fill. The reply is sent after the lock has been dropped.
 *
 * Returns -ENOENT if no visible pool matches, -ENOMEM if the reply cannot
 * be allocated, the error from @fill, or the result of genlmsg_reply().
 */
static int
netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill)
{
        struct page_pool *pool;
        struct sk_buff *rsp;
        int err = -ENOENT;

        mutex_lock(&page_pools_lock);

        pool = xa_load(&page_pools, id);
        if (!pool || hlist_unhashed(&pool->user.list) ||
            !net_eq(dev_net(pool->slow.netdev), genl_info_net(info)))
                goto out_unlock;

        err = -ENOMEM;
        rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!rsp)
                goto out_unlock;

        err = fill(rsp, pool, info);
        if (err) {
                nlmsg_free(rsp);
                goto out_unlock;
        }

        mutex_unlock(&page_pools_lock);

        return genlmsg_reply(rsp, info);

out_unlock:
        mutex_unlock(&page_pools_lock);
        return err;
}

/* Dump iteration state, stored in the netlink callback's ctx area. */
struct page_pool_dump_cb {
        /* netdev iteration cursor handed to for_each_netdev_dump() */
        unsigned long ifindex;
        /* id of the last pool passed to the fill callback; 0 means none */
        u32 pp_id;
};

/*
 * Common dump implementation: walk every netdev in the socket's netns and
 * call @fill for each page pool linked to it, holding rtnl_lock and
 * page_pools_lock for the whole walk.
 *
 * Progress is kept in cb->ctx (struct page_pool_dump_cb). Returns 0 when
 * the walk completes, or the first non-zero value returned by @fill.
 */
static int
netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb,
                             pp_nl_fill_cb fill)
{
        struct page_pool_dump_cb *state = (void *)cb->ctx;
        const struct genl_info *info = genl_info_dump(cb);
        struct net *net = sock_net(skb->sk);
        struct net_device *netdev;
        struct page_pool *pool;
        int err = 0;

        rtnl_lock();
        mutex_lock(&page_pools_lock);
        for_each_netdev_dump(net, netdev, state->ifindex) {
                hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
                        /*
                         * When resuming (pp_id != 0), skip pools whose id is
                         * larger than the recorded one.
                         * NOTE(review): presumably relies on the list being
                         * ordered by descending id - confirm.
                         */
                        if (state->pp_id && state->pp_id < pool->user.id)
                                continue;

                        state->pp_id = pool->user.id;
                        err = fill(skb, pool, info);
                        if (err)
                                goto out;
                }

                /* Done with this netdev; start the next one from scratch */
                state->pp_id = 0;
        }
out:
        mutex_unlock(&page_pools_lock);
        rtnl_unlock();

        return err;
}

/*
 * Fill @rsp with a page pool statistics message for @pool.
 *
 * With CONFIG_PAGE_POOL_STATS: returns 0 without emitting anything when
 * page_pool_get_stats() reports failure, 0 after a complete message has
 * been written, or -EMSGSIZE (with the partial message cancelled) when
 * something does not fit.
 * Without CONFIG_PAGE_POOL_STATS: sets an extack message and returns
 * -EOPNOTSUPP.
 */
static int
page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool,
                        const struct genl_info *info)
{
#ifdef CONFIG_PAGE_POOL_STATS
        struct page_pool_stats stats = {};
        struct nlattr *nest;
        void *hdr;

        if (!page_pool_get_stats(pool, &stats))
                return 0;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr)
                return -EMSGSIZE;

        /* Nested "info" attribute identifying the pool (id, maybe ifindex) */
        nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO);

        /* The ifindex is left out when the pool's netdev is loopback */
        if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) ||
            (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
             nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
                         pool->slow.netdev->ifindex)))
                goto err_cancel_nest;

        nla_nest_end(rsp, nest);

        /* Allocation and recycling counters, one attribute each */
        if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST,
                         stats.alloc_stats.fast) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW,
                         stats.alloc_stats.slow) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER,
                         stats.alloc_stats.slow_high_order) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY,
                         stats.alloc_stats.empty) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL,
                         stats.alloc_stats.refill) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE,
                         stats.alloc_stats.waive) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED,
                         stats.recycle_stats.cached) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL,
                         stats.recycle_stats.cache_full) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING,
                         stats.recycle_stats.ring) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL,
                         stats.recycle_stats.ring_full) ||
            nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT,
                         stats.recycle_stats.released_refcnt))
                goto err_cancel_msg;

        genlmsg_end(rsp, hdr);

        return 0;
err_cancel_nest:
        nla_nest_cancel(rsp, nest);
err_cancel_msg:
        genlmsg_cancel(rsp, hdr);
        return -EMSGSIZE;
#else
        GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS");
        return -EOPNOTSUPP;
#endif
}

/*
 * Netlink "do" handler for page pool statistics.
 *
 * The request must carry a nested NETDEV_A_PAGE_POOL_STATS_INFO attribute
 * that contains NETDEV_A_PAGE_POOL_ID; selecting by ifindex is rejected.
 * Returns -EINVAL for malformed requests, a parse error, or whatever
 * netdev_nl_page_pool_get_do() returns.
 */
int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
                                       struct genl_info *info)
{
        struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)];
        struct nlattr *ifindex_attr;
        struct nlattr *nest;
        int err;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO))
                return -EINVAL;

        nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO];
        err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest,
                               netdev_page_pool_info_nl_policy,
                               info->extack);
        if (err)
                return err;

        if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID))
                return -EINVAL;

        ifindex_attr = tb[NETDEV_A_PAGE_POOL_IFINDEX];
        if (ifindex_attr) {
                NL_SET_ERR_MSG_ATTR(info->extack, ifindex_attr,
                                    "selecting by ifindex not supported");
                return -EINVAL;
        }

        return netdev_nl_page_pool_get_do(info,
                                          nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]),
                                          page_pool_nl_stats_fill);
}

/*
 * Netlink "dump" handler for page pool statistics: runs the common dump
 * loop with page_pool_nl_stats_fill() as the per-pool fill callback.
 */
int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
                                         struct netlink_callback *cb)
{
        return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill);
}

/*
 * Fill @rsp with the description of @pool: id, ifindex (unless the pool's
 * netdev is loopback), NAPI id (if valid), inflight count and memory,
 * detach time (if set) and any memory-provider attributes.
 *
 * Returns 0 on success or -EMSGSIZE, with the partial message cancelled,
 * when something does not fit.
 */
static int
page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
                  const struct genl_info *info)
{
        unsigned int napi_id = 0;
        size_t inflight, refsz;
        void *hdr;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr)
                return -EMSGSIZE;

        if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id))
                goto err_cancel;

        if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX) {
                if (nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
                                pool->slow.netdev->ifindex))
                        goto err_cancel;
        }

        if (pool->p.napi)
                napi_id = READ_ONCE(pool->p.napi->napi_id);
        if (napi_id_valid(napi_id)) {
                if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id))
                        goto err_cancel;
        }

        inflight = page_pool_inflight(pool, false);
        refsz = PAGE_SIZE << pool->p.order;
        if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight))
                goto err_cancel;
        if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
                         inflight * refsz))
                goto err_cancel;

        if (pool->user.detach_time) {
                if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME,
                                 pool->user.detach_time))
                        goto err_cancel;
        }

        /* Let the memory provider, if any, append its own attributes */
        if (pool->mp_ops) {
                if (pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
                        goto err_cancel;
        }

        genlmsg_end(rsp, hdr);

        return 0;
err_cancel:
        genlmsg_cancel(rsp, hdr);
        return -EMSGSIZE;
}

/*
 * Multicast a page pool notification of type @cmd to the
 * NETDEV_NLGRP_PAGE_POOL group in the pool's netns.
 *
 * Must be called with page_pools_lock held. Nothing is sent for unlisted
 * pools, when nobody listens, or when building the message fails.
 */
static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd)
{
        struct genl_info info;
        struct sk_buff *ntf;
        struct net *net;

        lockdep_assert_held(&page_pools_lock);

        /* pools that are not listed are 'invisible' and never reported */
        if (hlist_unhashed(&pool->user.list))
                return;

        net = dev_net(pool->slow.netdev);
        if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL))
                return;

        ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!ntf)
                return;

        genl_info_init_ntf(&info, &netdev_nl_family, cmd);

        if (page_pool_nl_fill(ntf, pool, &info) != 0) {
                nlmsg_free(ntf);
                return;
        }

        genlmsg_multicast_netns(&netdev_nl_family, net, ntf,
                                0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL);
}

/*
 * Netlink "do" handler for a single page pool: requires
 * NETDEV_A_PAGE_POOL_ID (else -EINVAL) and replies via
 * netdev_nl_page_pool_get_do() using page_pool_nl_fill().
 */
int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info)
{
        const struct nlattr *id_attr;
        u32 id;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID))
                return -EINVAL;

        id_attr = info->attrs[NETDEV_A_PAGE_POOL_ID];
        id = nla_get_uint(id_attr);

        return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill);
}

/*
 * Netlink "dump" handler for page pools: runs the common dump loop with
 * page_pool_nl_fill() as the per-pool fill callback.
 */
int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill);
}

/*
 * Give @pool an id in the page_pools xarray and, when it has a netdev,
 * link it on that netdev's page_pools list and send an ADD notification.
 *
 * Returns 0 on success or the negative error from xa_alloc_cyclic().
 */
int page_pool_list(struct page_pool *pool)
{
        static u32 id_alloc_next;
        int err;

        mutex_lock(&page_pools_lock);

        err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b,
                              &id_alloc_next, GFP_KERNEL);
        if (err >= 0) {
                /* any non-negative result counts as success */
                err = 0;

                INIT_HLIST_NODE(&pool->user.list);
                if (pool->slow.netdev) {
                        hlist_add_head(&pool->user.list,
                                       &pool->slow.netdev->page_pools);
                        netdev_nl_page_pool_event(pool,
                                                  NETDEV_CMD_PAGE_POOL_ADD_NTF);
                }
        }

        mutex_unlock(&page_pools_lock);
        return err;
}

/*
 * Record the time (boot-time clock, in seconds) at which @pool was
 * detached and send a CHANGE notification, all under page_pools_lock.
 */
void page_pool_detached(struct page_pool *pool)
{
        mutex_lock(&page_pools_lock);
        pool->user.detach_time = ktime_get_boottime_seconds();
        netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
        mutex_unlock(&page_pools_lock);
}

/*
 * Undo page_pool_list(): send a DEL notification, drop the pool's id from
 * the page_pools xarray and unlink it from its netdev list if still linked.
 */
void page_pool_unlist(struct page_pool *pool)
{
        mutex_lock(&page_pools_lock);
        /* Notify first: the event helper ignores pools that are unlinked */
        netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF);
        xa_erase(&page_pools, pool->user.id);
        if (!hlist_unhashed(&pool->user.list))
                hlist_del(&pool->user.list);
        mutex_unlock(&page_pools_lock);
}

/*
 * Check that the memory provider bound to @rxq is actually used by one of
 * @dev's page pools on that same queue.
 *
 * Returns 0 when @rxq has no provider binding or when a pool with matching
 * mp_priv and queue index is found; -ENODATA otherwise.
 */
int page_pool_check_memory_provider(struct net_device *dev,
                                    struct netdev_rx_queue *rxq)
{
        void *binding = rxq->mp_params.mp_priv;
        struct page_pool *pool;
        struct hlist_node *n;
        int ret = -ENODATA;

        if (!binding)
                return 0;

        mutex_lock(&page_pools_lock);
        hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
                if (pool->mp_priv == binding &&
                    pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
                        ret = 0;
                        break;
                }
        }
        mutex_unlock(&page_pools_lock);

        return ret;
}

/*
 * Unlink every page pool from @netdev's list and poison its netdev
 * pointer. After this the pools are unhashed, i.e. in the 'invisible'
 * state described at the top of this file.
 */
static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
{
        struct page_pool *pool;
        struct hlist_node *n;

        mutex_lock(&page_pools_lock);
        /* _safe variant: entries are removed while walking the list */
        hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) {
                hlist_del_init(&pool->user.list);
                pool->slow.netdev = NET_PTR_POISON;
        }
        mutex_unlock(&page_pools_lock);
}

/*
 * Hand all page pools of @netdev over to the loopback device of its netns
 * (the 'orphaned' state described at the top of this file): repoint each
 * pool at lo, send a CHANGE notification for it, then move the whole list
 * onto lo's page_pools list in one splice.
 */
static void page_pool_unreg_netdev(struct net_device *netdev)
{
        struct page_pool *pool, *last;
        struct net_device *lo;

        lo = dev_net(netdev)->loopback_dev;

        mutex_lock(&page_pools_lock);
        last = NULL;
        hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
                pool->slow.netdev = lo;
                netdev_nl_page_pool_event(pool,
                                          NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
                /* remember the tail; the splice below needs the last node */
                last = pool;
        }
        /* last stays NULL when the list was empty: nothing to move */
        if (last)
                hlist_splice_init(&netdev->page_pools, &last->user.list,
                                  &lo->page_pools);
        mutex_unlock(&page_pools_lock);
}

/*
 * Netdevice notifier: on NETDEV_UNREGISTER, deal with the page pools still
 * linked to the device. Pools of the loopback device are wiped, pools of
 * any other device are handed over to loopback.
 *
 * Returns NOTIFY_DONE for all other events, NOTIFY_OK otherwise.
 */
static int
page_pool_netdevice_event(struct notifier_block *nb,
                          unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        if (!hlist_empty(&netdev->page_pools)) {
                if (netdev->ifindex == LOOPBACK_IFINDEX)
                        page_pool_unreg_netdev_wipe(netdev);
                else
                        page_pool_unreg_netdev(netdev);
        }

        return NOTIFY_OK;
}

/* Notifier block routing netdevice events to page_pool_netdevice_event(). */
static struct notifier_block page_pool_netdevice_nb = {
        .notifier_call = page_pool_netdevice_event,
};

/*
 * Initcall: register the netdevice notifier above. Returns the result of
 * register_netdevice_notifier().
 */
static int __init page_pool_user_init(void)
{
        return register_netdevice_notifier(&page_pool_netdevice_nb);
}

subsys_initcall(page_pool_user_init);


































































































































































































































































































































































































































































   58 







































































































































































































































































































































































































































































  165 



  165 










  165 







  166 


  165 















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
 *
 * This file contains the interrupt descriptor management code. Detailed
 * information is available in Documentation/core-api/genericirq.rst
 *
 */
#include <linux/irq.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <linux/maple_tree.h>
#include <linux/irqdomain.h>
#include <linux/sysfs.h>
#include <linux/string_choices.h>

#include "internals.h"

/*
 * lockdep: we want to handle all irq_desc locks as a single lock-class:
 */
static struct lock_class_key irq_desc_lock_class;

#if defined(CONFIG_SMP)
/*
 * Parse the "irqaffinity=" command line option into irq_default_affinity.
 * The CPU executing this code is always added to the resulting mask.
 */
static int __init irq_affinity_setup(char *str)
{
        unsigned int this_cpu;

        alloc_bootmem_cpumask_var(&irq_default_affinity);
        cpulist_parse(str, irq_default_affinity);

        /*
         * Whatever the command line said, keep at least the boot cpu in
         * the mask so a bogus list cannot produce an unusable default.
         */
        this_cpu = smp_processor_id();
        cpumask_set_cpu(this_cpu, irq_default_affinity);

        return 1;
}
__setup("irqaffinity=", irq_affinity_setup);

/*
 * Make sure irq_default_affinity exists and is not empty: allocate it if
 * it is not available yet and fall back to "all CPUs" for an empty mask.
 */
static void __init init_irq_default_affinity(void)
{
        if (!cpumask_available(irq_default_affinity))
                zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);

        if (!cpumask_empty(irq_default_affinity))
                return;

        cpumask_setall(irq_default_affinity);
}
#else
/* No default affinity mask to set up in the !CONFIG_SMP build. */
static void __init init_irq_default_affinity(void) { }
#endif

#ifdef CONFIG_SMP
static int alloc_masks(struct irq_desc *desc, int node)
{
        if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity,
                                     GFP_KERNEL, node))
                return -ENOMEM;

#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
        if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity,
                                     GFP_KERNEL, node)) {
                free_cpumask_var(desc->irq_common_data.affinity);
                return -ENOMEM;
        }
#endif

#ifdef CONFIG_GENERIC_PENDING_IRQ
        if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) {
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
                free_cpumask_var(desc->irq_common_data.effective_affinity);
#endif
                free_cpumask_var(desc->irq_common_data.affinity);
                return -ENOMEM;
        }
#endif
        return 0;
}

/*
 * Initialize the SMP related parts of @desc: copy @affinity (or the default
 * affinity when @affinity is NULL) into the descriptor, clear the pending
 * mask and record @node where the configuration provides those fields.
 */
static void desc_smp_init(struct irq_desc *desc, int node,
                          const struct cpumask *affinity)
{
        const struct cpumask *src = affinity ? affinity : irq_default_affinity;

        cpumask_copy(desc->irq_common_data.affinity, src);

#ifdef CONFIG_GENERIC_PENDING_IRQ
        cpumask_clear(desc->pending_mask);
#endif
#ifdef CONFIG_NUMA
        desc->irq_common_data.node = node;
#endif
}

static void free_masks(struct irq_desc *desc)
{
#ifdef CONFIG_GENERIC_PENDING_IRQ
        free_cpumask_var(desc->pending_mask);
#endif
        free_cpumask_var(desc->irq_common_data.affinity);
#ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK
        free_cpumask_var(desc->irq_common_data.effective_affinity);
#endif
}

#else
/* !CONFIG_SMP: there are no per descriptor cpumasks to manage. */
static inline int alloc_masks(struct irq_desc *desc, int node)
{
        return 0;
}

static inline void desc_smp_init(struct irq_desc *desc, int node,
                                 const struct cpumask *affinity)
{
}

static inline void free_masks(struct irq_desc *desc)
{
}
#endif

static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
                              const struct cpumask *affinity, struct module *owner)
{
        int cpu;

        desc->irq_common_data.handler_data = NULL;
        desc->irq_common_data.msi_desc = NULL;

        desc->irq_data.common = &desc->irq_common_data;
        desc->irq_data.irq = irq;
        desc->irq_data.chip = &no_irq_chip;
        desc->irq_data.chip_data = NULL;
        irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
        irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
        irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
        desc->handle_irq = handle_bad_irq;
        desc->depth = 1;
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
        desc->tot_count = 0;
        desc->name = NULL;
        desc->owner = owner;
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { };
        desc_smp_init(desc, node, affinity);
}

/* Current number of interrupts; starts at NR_IRQS and may be changed at runtime. */
static unsigned int nr_irqs = NR_IRQS;

/**
 * irq_get_nr_irqs() - Number of interrupts supported by the system.
 *
 * Return: The current value of the file local nr_irqs variable.
 */
unsigned int irq_get_nr_irqs(void)
{
        return nr_irqs;
}
EXPORT_SYMBOL_GPL(irq_get_nr_irqs);

/**
 * irq_set_nr_irqs() - Set the number of interrupts supported by the system.
 * @nr: New number of interrupts.
 *
 * The value is stored as is; no range check is performed here.
 *
 * Return: @nr.
 */
unsigned int irq_set_nr_irqs(unsigned int nr)
{
        nr_irqs = nr;

        return nr;
}
EXPORT_SYMBOL_GPL(irq_set_nr_irqs);

/* Taken around allocation and freeing of interrupt descriptors. */
static DEFINE_MUTEX(sparse_irq_lock);
/*
 * Maple tree which maps an interrupt number to its descriptor. It is set
 * up to use sparse_irq_lock as its external lock.
 */
static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs,
                                        MT_FLAGS_ALLOC_RANGE |
                                        MT_FLAGS_LOCK_EXTERN |
                                        MT_FLAGS_USE_RCU,
                                        sparse_irq_lock);

/*
 * Search the descriptor tree for @cnt consecutive unused interrupt numbers
 * in the range @from..MAX_SPARSE_IRQS.
 *
 * Returns the first number of the free area or -ENOSPC if the search fails.
 */
static int irq_find_free_area(unsigned int from, unsigned int cnt)
{
        MA_STATE(mas, &sparse_irqs, 0, 0);
        int err;

        err = mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt);
        if (!err)
                return mas.index;

        return -ENOSPC;
}

/*
 * Look up the first descriptor stored at or after @offset, searching up to
 * nr_irqs under RCU protection.
 *
 * Returns the interrupt number of that descriptor or nr_irqs if none exists.
 */
static unsigned int irq_find_at_or_after(unsigned int offset)
{
        unsigned long pos = offset;
        struct irq_desc *found;

        guard(rcu)();
        found = mt_find(&sparse_irqs, &pos, nr_irqs);
        if (!found)
                return nr_irqs;

        return irq_desc_get_irq(found);
}

/*
 * Store @desc in the descriptor tree at index @irq. A failing store only
 * triggers a warning; the caller is not informed.
 */
static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
{
        MA_STATE(mas, &sparse_irqs, irq, irq);
        int err;

        err = mas_store_gfp(&mas, desc, GFP_KERNEL);
        WARN_ON(err != 0);
}

/* Remove the entry stored at index @irq from the descriptor tree. */
static void delete_irq_desc(unsigned int irq)
{
        MA_STATE(mas, &sparse_irqs, irq, irq);
        mas_erase(&mas);
}

#ifdef CONFIG_SPARSE_IRQ
/* Forward declaration: init_desc() needs it before the definition below. */
static const struct kobj_type irq_kobj_type;
#endif

/*
 * Allocate the per CPU statistics and cpumasks of @desc and bring the
 * descriptor into its initial state for interrupt @irq. @flags are set in
 * the irq data state on top of the defaults.
 *
 * Returns 0 on success or -ENOMEM when one of the allocations fails; in
 * that case nothing stays allocated.
 */
static int init_desc(struct irq_desc *desc, int irq, int node,
                     unsigned int flags,
                     const struct cpumask *affinity,
                     struct module *owner)
{
        int err;

        desc->kstat_irqs = alloc_percpu(struct irqstat);
        if (!desc->kstat_irqs)
                return -ENOMEM;

        err = alloc_masks(desc, node);
        if (err) {
                free_percpu(desc->kstat_irqs);
                return -ENOMEM;
        }

        /* Locking and wait infrastructure */
        raw_spin_lock_init(&desc->lock);
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
        mutex_init(&desc->request_mutex);
        init_waitqueue_head(&desc->wait_for_threads);

        /* Default state first, then the caller supplied state flags */
        desc_set_defaults(irq, desc, node, affinity, owner);
        irqd_set(&desc->irq_data, flags);
        irq_resend_init(desc);

#ifdef CONFIG_SPARSE_IRQ
        kobject_init(&desc->kobj, &irq_kobj_type);
        init_rcu_head(&desc->rcu);
#endif

        return 0;
}

#ifdef CONFIG_SPARSE_IRQ

/* Defined further down; referenced by the irq_kobj_type initializers. */
static void irq_kobj_release(struct kobject *kobj);

#ifdef CONFIG_SYSFS
/*
 * Parent kobject of the per interrupt sysfs directories. It stays NULL
 * until irq_sysfs_init() has created it.
 */
static struct kobject *irq_kobj_base;

/* Define the read-only kobj_attribute <_name>_attr served by <_name>_show(). */
#define IRQ_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

/*
 * sysfs show callback: emit the interrupt count of every possible CPU as
 * one comma separated line. Output is bounded by PAGE_SIZE.
 */
static ssize_t per_cpu_count_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        const char *sep = "";
        ssize_t len = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                unsigned int count = irq_desc_kstat_cpu(desc, cpu);

                len += scnprintf(buf + len, PAGE_SIZE - len, "%s%u",
                                 sep, count);
                /* Every entry after the first is preceded by a comma */
                sep = ",";
        }

        len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
        return len;
}
IRQ_ATTR_RO(per_cpu_count);

static ssize_t chip_name_show(struct kobject *kobj,
                              struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        ssize_t ret = 0;

        raw_spin_lock_irq(&desc->lock);
        if (desc->irq_data.chip && desc->irq_data.chip->name) {
                ret = scnprintf(buf, PAGE_SIZE, "%s\n",
                                desc->irq_data.chip->name);
        }
        raw_spin_unlock_irq(&desc->lock);

        return ret;
}
IRQ_ATTR_RO(chip_name);

static ssize_t hwirq_show(struct kobject *kobj,
                          struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        ssize_t ret = 0;

        raw_spin_lock_irq(&desc->lock);
        if (desc->irq_data.domain)
                ret = sprintf(buf, "%lu\n", desc->irq_data.hwirq);
        raw_spin_unlock_irq(&desc->lock);

        return ret;
}
IRQ_ATTR_RO(hwirq);

static ssize_t type_show(struct kobject *kobj,
                         struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        ssize_t ret = 0;

        raw_spin_lock_irq(&desc->lock);
        ret = sprintf(buf, "%s\n",
                      irqd_is_level_type(&desc->irq_data) ? "level" : "edge");
        raw_spin_unlock_irq(&desc->lock);

        return ret;

}
IRQ_ATTR_RO(type);

static ssize_t wakeup_show(struct kobject *kobj,
                           struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        ssize_t ret = 0;

        raw_spin_lock_irq(&desc->lock);
        ret = sprintf(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data)));
        raw_spin_unlock_irq(&desc->lock);

        return ret;

}
IRQ_ATTR_RO(wakeup);

static ssize_t name_show(struct kobject *kobj,
                         struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        ssize_t ret = 0;

        raw_spin_lock_irq(&desc->lock);
        if (desc->name)
                ret = scnprintf(buf, PAGE_SIZE, "%s\n", desc->name);
        raw_spin_unlock_irq(&desc->lock);

        return ret;
}
IRQ_ATTR_RO(name);

/*
 * sysfs show callback: emit the names of all actions installed on the
 * interrupt as one comma separated line. Without any action the output is
 * empty (no newline either).
 */
static ssize_t actions_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
        struct irqaction *act;
        const char *sep = "";
        ssize_t len = 0;

        raw_spin_lock_irq(&desc->lock);
        for_each_action_of_desc(desc, act) {
                len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                 sep, act->name);
                sep = ",";
        }
        raw_spin_unlock_irq(&desc->lock);

        /* Terminate the line only if something was printed */
        if (!len)
                return 0;

        len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
        return len;
}
IRQ_ATTR_RO(actions);

/*
 * Attributes of every per interrupt sysfs directory. ATTRIBUTE_GROUPS()
 * derives irq_groups from this NULL terminated array.
 */
static struct attribute *irq_attrs[] = {
        &per_cpu_count_attr.attr,
        &chip_name_attr.attr,
        &hwirq_attr.attr,
        &type_attr.attr,
        &wakeup_attr.attr,
        &name_attr.attr,
        &actions_attr.attr,
        NULL
};
ATTRIBUTE_GROUPS(irq);

/*
 * kobject type of the kobject embedded in struct irq_desc: the release
 * callback frees the descriptor, the default groups provide the sysfs files.
 */
static const struct kobj_type irq_kobj_type = {
        .release        = irq_kobj_release,
        .sysfs_ops        = &kobj_sysfs_ops,
        .default_groups = irq_groups,
};

/*
 * Add the kobject of @desc below irq_kobj_base, named after @irq. This is
 * a no-op as long as irq_kobj_base does not exist. IRQS_SYSFS is set in
 * the descriptor state only after a successful kobject_add().
 */
static void irq_sysfs_add(int irq, struct irq_desc *desc)
{
        if (!irq_kobj_base)
                return;

        /*
         * A failure is only reported, not propagated: the sysfs entry is
         * nothing crucial and failures in the late irq_sysfs_init()
         * cannot be rolled back.
         */
        if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) {
                pr_warn("Failed to add kobject for irq %d\n", irq);
                return;
        }

        desc->istate |= IRQS_SYSFS;
}

/* Remove the sysfs entry of @desc if one was added. */
static void irq_sysfs_del(struct irq_desc *desc)
{
        /*
         * IRQS_SYSFS is only set after a successful kobject_add(). Without
         * it there is nothing to delete, which covers both early boot,
         * where sysfs is not initialized yet, and a failed kobject_add()
         * invocation.
         */
        if (!(desc->istate & IRQS_SYSFS))
                return;

        kobject_del(&desc->kobj);
}

/*
 * Create the "irq" directory below kernel_kobj and add the interrupts
 * which were allocated before this initcall ran.
 *
 * Returns 0 on success, -ENOMEM if the base kobject cannot be created.
 */
static int __init irq_sysfs_init(void)
{
        struct irq_desc *desc;
        int irq, ret = -ENOMEM;

        /* Hold off concurrent irq alloc/free while walking the descriptors */
        irq_lock_sparse();

        irq_kobj_base = kobject_create_and_add("irq", kernel_kobj);
        if (!irq_kobj_base)
                goto unlock;

        /* Catch up with everything allocated so far */
        for_each_irq_desc(irq, desc)
                irq_sysfs_add(irq, desc);
        ret = 0;

unlock:
        irq_unlock_sparse();
        return ret;
}
postcore_initcall(irq_sysfs_init);

#else /* !CONFIG_SYSFS */

/* !CONFIG_SYSFS: the kobject is only used to release the descriptor. */
static const struct kobj_type irq_kobj_type = {
        .release        = irq_kobj_release,
};

/* !CONFIG_SYSFS: there are no sysfs entries to add or remove. */
static void irq_sysfs_add(int irq, struct irq_desc *desc)
{
}

static void irq_sysfs_del(struct irq_desc *desc)
{
}

#endif /* CONFIG_SYSFS */

/*
 * Look up the descriptor stored for @irq in the descriptor tree.
 * Returns whatever the tree holds at that index, i.e. NULL when no
 * descriptor was inserted for @irq.
 */
struct irq_desc *irq_to_desc(unsigned int irq)
{
        return mtree_load(&sparse_irqs, irq);
}
#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE
EXPORT_SYMBOL_GPL(irq_to_desc);
#endif

/* Acquire sparse_irq_lock; pairs with irq_unlock_sparse(). */
void irq_lock_sparse(void)
{
        mutex_lock(&sparse_irq_lock);
}

/* Release sparse_irq_lock taken by irq_lock_sparse(). */
void irq_unlock_sparse(void)
{
        mutex_unlock(&sparse_irq_lock);
}

/*
 * Allocate a zeroed descriptor on @node and initialize it for @irq.
 *
 * Returns the descriptor, or NULL if either the allocation or the
 * initialization failed.
 */
static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
                                   const struct cpumask *affinity,
                                   struct module *owner)
{
        struct irq_desc *desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node);

        if (!desc)
                return NULL;

        if (unlikely(init_desc(desc, irq, node, flags, affinity, owner))) {
                kfree(desc);
                return NULL;
        }

        return desc;
}

/*
 * Release callback of irq_kobj_type: free the cpumasks, the per CPU
 * statistics and finally the descriptor which embeds @kobj.
 */
static void irq_kobj_release(struct kobject *kobj)
{
        struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);

        free_masks(desc);
        free_percpu(desc->kstat_irqs);
        kfree(desc);
}

/*
 * RCU callback queued by free_desc(): drop a reference on the kobject of
 * the descriptor which embeds @rhp.
 */
static void delayed_free_desc(struct rcu_head *rhp)
{
        struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);

        kobject_put(&desc->kobj);
}

/*
 * Tear down the descriptor of @irq: remove its debugfs, proc and sysfs
 * entries, delete it from the descriptor tree and queue the final
 * kobject_put() via RCU.
 *
 * The descriptor is used without a NULL check, so @irq must have a
 * descriptor in the tree.
 */
static void free_desc(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);

        irq_remove_debugfs_entry(desc);
        unregister_irq_proc(irq, desc);

        /*
         * sparse_irq_lock protects also show_interrupts() and
         * kstat_irq_usr(). Once we deleted the descriptor from the
         * sparse tree we can free it. Access in proc will fail to
         * lookup the descriptor.
         *
         * The sysfs entry must be serialized against a concurrent
         * irq_sysfs_init() as well.
         */
        irq_sysfs_del(desc);
        delete_irq_desc(irq);

        /*
         * We free the descriptor, masks and stat fields via RCU. That
         * allows demultiplex interrupts to do rcu based management of
         * the child interrupts.
         * This also allows us to use rcu in kstat_irqs_usr().
         */
        call_rcu(&desc->rcu, delayed_free_desc);
}

/*
 * Allocate, insert and publish (sysfs, debugfs) @cnt descriptors starting
 * at interrupt number @start.
 *
 * When @affinity is given it is treated as an array of @cnt entries: each
 * entry supplies the affinity mask of one descriptor, overrides @node with
 * the node of the first CPU in that mask and may mark the interrupt as
 * managed.
 *
 * Returns @start on success, -EINVAL if one of the supplied masks is empty
 * and -ENOMEM if a descriptor cannot be allocated. In the latter case the
 * descriptors set up so far are freed again.
 */
static int alloc_descs(unsigned int start, unsigned int cnt, int node,
                       const struct irq_affinity_desc *affinity,
                       struct module *owner)
{
        int i;

        /* Refuse the whole request if any of the masks is empty */
        for (i = 0; affinity && i < cnt; i++) {
                if (cpumask_empty(&affinity[i].mask))
                        return -EINVAL;
        }

        for (i = 0; i < cnt; i++) {
                const struct irq_affinity_desc *aff = affinity ? &affinity[i] : NULL;
                const struct cpumask *mask = NULL;
                unsigned int flags = 0;
                struct irq_desc *desc;

                if (aff) {
                        if (aff->is_managed)
                                flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN;
                        flags |= IRQD_AFFINITY_SET;
                        mask = &aff->mask;
                        node = cpu_to_node(cpumask_first(mask));
                }

                desc = alloc_desc(start + i, node, flags, mask, owner);
                if (!desc)
                        goto err;

                irq_insert_desc(start + i, desc);
                irq_sysfs_add(start + i, desc);
                irq_add_debugfs_entry(start + i, desc);
        }
        return start;

err:
        /* Undo the descriptors which were completed, last one first */
        while (--i >= 0)
                free_desc(start + i);
        return -ENOMEM;
}

/*
 * Set nr_irqs to @nr as long as that does not exceed MAX_SPARSE_IRQS.
 *
 * Returns 0 on success, -ENOMEM when @nr is too large.
 */
static int irq_expand_nr_irqs(unsigned int nr)
{
        if (nr <= MAX_SPARSE_IRQS) {
                nr_irqs = nr;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Early setup for the sparse descriptor case: set up the default affinity,
 * ask the architecture how many interrupts to preallocate, sanitize the
 * limits and create the preallocated descriptors.
 *
 * Returns the result of arch_early_irq_init().
 */
int __init early_irq_init(void)
{
        int i, initcnt, node = first_online_node;

        init_irq_default_affinity();

        /* Let arch update nr_irqs and return the nr of preallocated irqs */
        initcnt = arch_probe_nr_irqs();
        printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n",
               NR_IRQS, nr_irqs, initcnt);

        /* Neither value may exceed MAX_SPARSE_IRQS */
        if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS))
                nr_irqs = MAX_SPARSE_IRQS;

        if (WARN_ON(initcnt > MAX_SPARSE_IRQS))
                initcnt = MAX_SPARSE_IRQS;

        /* nr_irqs has to cover at least the preallocated range */
        if (initcnt > nr_irqs)
                nr_irqs = initcnt;

        for (i = 0; i < initcnt; i++) {
                /*
                 * NOTE(review): the result of alloc_desc() is not checked,
                 * so a failed allocation stores NULL for this interrupt.
                 */
                struct irq_desc *desc = alloc_desc(i, node, 0, NULL, NULL);

                irq_insert_desc(i, desc);
        }

        return arch_early_irq_init();
}

#else /* !CONFIG_SPARSE_IRQ */

/*
 * !CONFIG_SPARSE_IRQ: statically allocated descriptors, one per interrupt
 * number. Every entry starts with handle_bad_irq() as flow handler, a
 * depth of 1 and an unlocked descriptor lock.
 */
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
        [0 ... NR_IRQS-1] = {
                .handle_irq        = handle_bad_irq,
                .depth                = 1,
                .lock                = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
        }
};

/*
 * Early setup for the static descriptor array: set up the default affinity
 * and initialize every entry of irq_desc[].
 *
 * Returns the result of arch_early_irq_init(), or the init_desc() error
 * after releasing the resources of the entries initialized so far.
 */
int __init early_irq_init(void)
{
        const int total = ARRAY_SIZE(irq_desc);
        int i, node = first_online_node;
        int ret;

        init_irq_default_affinity();

        printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS);

        for (i = 0; i < total; i++) {
                ret = init_desc(&irq_desc[i], i, node, 0, NULL, NULL);
                if (unlikely(ret))
                        goto __free_desc_res;
        }

        return arch_early_irq_init();

__free_desc_res:
        /* Entry i failed and cleaned up after itself; undo 0..i-1 */
        for (i--; i >= 0; i--) {
                free_masks(&irq_desc[i]);
                free_percpu(irq_desc[i].kstat_irqs);
        }

        return ret;
}

/*
 * Map @irq to its entry in the static descriptor array.
 * Returns NULL when @irq is not below NR_IRQS.
 */
struct irq_desc *irq_to_desc(unsigned int irq)
{
        if (irq >= NR_IRQS)
                return NULL;

        return &irq_desc[irq];
}
EXPORT_SYMBOL(irq_to_desc);

/*
 * "Free" a static descriptor: reset it to the defaults under the
 * descriptor lock (keeping its node) and remove it from the tree of
 * allocated interrupts.
 */
static void free_desc(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned long flags;
        int node;

        raw_spin_lock_irqsave(&desc->lock, flags);
        node = irq_desc_get_node(desc);
        desc_set_defaults(irq, desc, node, NULL, NULL);
        raw_spin_unlock_irqrestore(&desc->lock, flags);

        delete_irq_desc(irq);
}

/*
 * Static descriptor variant: nothing is allocated. The @cnt descriptors
 * starting at @start get @owner assigned and are entered into the tree of
 * allocated interrupts. @node and @affinity are not used.
 *
 * Returns @start.
 */
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
                              const struct irq_affinity_desc *affinity,
                              struct module *owner)
{
        u32 idx;

        for (idx = 0; idx < cnt; idx++) {
                unsigned int irq = start + idx;
                struct irq_desc *desc = irq_to_desc(irq);

                desc->owner = owner;
                irq_insert_desc(irq, desc);
        }

        return start;
}

/* The static descriptor array cannot grow: always fails with -ENOMEM. */
static int irq_expand_nr_irqs(unsigned int nr)
{
        return -ENOMEM;
}

/*
 * Enter the static descriptor of @irq into the tree of allocated
 * interrupts. @irq is used as array index without a range check.
 */
void irq_mark_irq(unsigned int irq)
{
        struct irq_desc *desc = &irq_desc[irq];

        mutex_lock(&sparse_irq_lock);
        irq_insert_desc(irq, desc);
        mutex_unlock(&sparse_irq_lock);
}

#ifdef CONFIG_GENERIC_IRQ_LEGACY
/*
 * Legacy interface: "initializing" a static descriptor is the same
 * operation as freeing it, see free_desc().
 */
void irq_init_desc(unsigned int irq)
{
        free_desc(irq);
}
#endif

#endif /* !CONFIG_SPARSE_IRQ */

/*
 * Invoke the flow handler of @desc.
 *
 * Returns 0 after the handler was invoked, -EINVAL when @desc is NULL and
 * -EPERM (plus a one time warning) when the interrupt enforces hard
 * interrupt context but the caller is not in hard interrupt context.
 */
int handle_irq_desc(struct irq_desc *desc)
{
        if (!desc)
                return -EINVAL;

        if (WARN_ON_ONCE(!in_hardirq() &&
                         irqd_is_handle_enforce_irqctx(irq_desc_get_irq_data(desc))))
                return -EPERM;

        generic_handle_irq_desc(desc);
        return 0;
}

/**
 * generic_handle_irq - Invoke the handler for a particular irq
 * @irq:        The irq number to handle
 *
 * Returns:        0 on success, or -EINVAL if conversion has failed.
 *                 Other errors of handle_irq_desc() are passed through.
 *
 *                 This function must be called from an IRQ context with irq regs
 *                 initialized.
 */
int generic_handle_irq(unsigned int irq)
{
        return handle_irq_desc(irq_to_desc(irq));
}
EXPORT_SYMBOL_GPL(generic_handle_irq);

/**
 * generic_handle_irq_safe - Invoke the handler for a particular irq from any
 *                             context.
 * @irq:        The irq number to handle
 *
 * Returns:        0 on success, a negative value on error.
 *
 * This function can be called from any context (IRQ or process context). It
 * will report an error if not invoked from IRQ context and the irq has been
 * marked to enforce IRQ-context only.
 *
 * The handler is invoked with local interrupts disabled; the previous
 * interrupt state is restored afterwards.
 */
int generic_handle_irq_safe(unsigned int irq)
{
        unsigned long flags;
        int ret;

        local_irq_save(flags);
        ret = handle_irq_desc(irq_to_desc(irq));
        local_irq_restore(flags);
        return ret;
}
EXPORT_SYMBOL_GPL(generic_handle_irq_safe);

#ifdef CONFIG_IRQ_DOMAIN
/**
 * generic_handle_domain_irq - Invoke the handler for a HW irq belonging
 *                             to a domain.
 * @domain:        The domain where to perform the lookup
 * @hwirq:        The HW irq number to convert to a logical one
 *
 * Returns:        0 on success, or -EINVAL if conversion has failed.
 *                 Other errors of handle_irq_desc() are passed through.
 *
 *                 This function must be called from an IRQ context with irq regs
 *                 initialized.
 */
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
        return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);

 /**
 * generic_handle_domain_irq_safe - Invoke the handler for a HW irq belonging
 *                                  to a domain from any context.
 * @domain:        The domain where to perform the lookup
 * @hwirq:        The HW irq number to convert to a logical one
 *
 * Returns:        0 on success, a negative value on error.
 *
 * This function can be called from any context (IRQ or process
 * context). If the interrupt is marked as 'enforce IRQ-context only' then
 * the function must be invoked from hard interrupt context.
 *
 * The handler is invoked with local interrupts disabled; the previous
 * interrupt state is restored afterwards.
 */
int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq)
{
        unsigned long flags;
        int ret;

        local_irq_save(flags);
        ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq));
        local_irq_restore(flags);
        return ret;
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe);

/**
 * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
 *                             to a domain.
 * @domain:        The domain where to perform the lookup
 * @hwirq:        The HW irq number to convert to a logical one
 *
 * Returns:        0 on success, or -EINVAL if conversion has failed.
 *                 Other errors of handle_irq_desc() are passed through.
 *
 *                 This function must be called from an NMI context with irq regs
 *                 initialized. A call from any other context triggers a one
 *                 time warning, but the handler is invoked nevertheless.
 */
int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
{
        WARN_ON_ONCE(!in_nmi());
        return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
#endif

/* Dynamic interrupt handling */

/**
 * irq_free_descs - free irq descriptors
 * @from:        Start of descriptor range
 * @cnt:        Number of consecutive irqs to free
 *
 * A request which does not lie completely below nr_irqs is silently
 * ignored. The descriptors are freed with sparse_irq_lock held.
 */
void irq_free_descs(unsigned int from, unsigned int cnt)
{
        unsigned int i;

        /*
         * Compare against the remaining room instead of computing
         * @from + @cnt: the sum can wrap around for a large @cnt and would
         * then pass the range check.
         */
        if (from >= nr_irqs || cnt > nr_irqs - from)
                return;

        mutex_lock(&sparse_irq_lock);
        for (i = 0; i < cnt; i++)
                free_desc(from + i);

        mutex_unlock(&sparse_irq_lock);
}
EXPORT_SYMBOL_GPL(irq_free_descs);

/**
 * __irq_alloc_descs - allocate and initialize a range of irq descriptors
 * @irq:        Allocate for specific irq number if irq >= 0
 * @from:        Start the search from this irq number
 * @cnt:        Number of consecutive irqs to allocate.
 * @node:        Preferred node on which the irq descriptor should be allocated
 * @owner:        Owning module (can be NULL)
 * @affinity:        Optional pointer to an affinity mask array of size @cnt which
 *                hints where the irq descriptors should be allocated and which
 *                default affinities to use
 *
 * Returns the first irq number or error code: -EINVAL for @cnt == 0 or
 * @from > @irq, -EEXIST if the requested fixed range is not available,
 * -ENOMEM if no free range exists or nr_irqs cannot be expanded, or the
 * error returned by alloc_descs().
 */
int __ref
__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
                  struct module *owner, const struct irq_affinity_desc *affinity)
{
        int start, ret;

        if (!cnt)
                return -EINVAL;

        if (irq >= 0) {
                if (from > irq)
                        return -EINVAL;
                from = irq;
        } else {
                /*
                 * For interrupts which are freely allocated the
                 * architecture can force a lower bound to the @from
                 * argument. x86 uses this to exclude the GSI space.
                 */
                from = arch_dynirq_lower_bound(from);
        }

        mutex_lock(&sparse_irq_lock);

        start = irq_find_free_area(from, cnt);
        if (start < 0) {
                /*
                 * No free area. Bail out here: the negative value must not
                 * reach the unsigned 'start + cnt' arithmetic below, where
                 * it can wrap around to a small, valid looking number.
                 * The error codes match what callers got so far.
                 */
                ret = irq >= 0 ? -EEXIST : -ENOMEM;
                goto unlock;
        }

        ret = -EEXIST;
        if (irq >= 0 && start != irq)
                goto unlock;

        if (start + cnt > nr_irqs) {
                ret = irq_expand_nr_irqs(start + cnt);
                if (ret)
                        goto unlock;
        }
        ret = alloc_descs(start, cnt, node, affinity, owner);
unlock:
        mutex_unlock(&sparse_irq_lock);
        return ret;
}
EXPORT_SYMBOL_GPL(__irq_alloc_descs);

/**
 * irq_get_next_irq - get next allocated irq number
 * @offset:        where to start the search
 *
 * Returns the first allocated irq number at or after @offset, or nr_irqs
 * if none is found.
 */
unsigned int irq_get_next_irq(unsigned int offset)
{
        return irq_find_at_or_after(offset);
}

/*
 * Look up the descriptor of @irq and lock it.
 *
 * When @check contains _IRQ_DESC_CHECK, the per-CPU-devid property of the
 * interrupt has to match _IRQ_DESC_PERCPU in @check, otherwise the lookup
 * fails. With @bus set, the chip bus lock is taken before the descriptor
 * lock. The interrupt state is saved in @flags.
 *
 * Returns the locked descriptor, or NULL (nothing locked) when there is no
 * descriptor or the check failed.
 */
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
                    unsigned int check)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc)
                return NULL;

        if (check & _IRQ_DESC_CHECK) {
                bool want_percpu = check & _IRQ_DESC_PERCPU;

                /* Caller expectation and descriptor type have to agree */
                if (want_percpu != irq_settings_is_per_cpu_devid(desc))
                        return NULL;
        }

        if (bus)
                chip_bus_lock(desc);
        raw_spin_lock_irqsave(&desc->lock, *flags);

        return desc;
}

/*
 * Counterpart of __irq_get_desc_lock(): drop the descriptor lock, restore
 * the interrupt state from @flags and, with @bus set, release the chip bus
 * lock afterwards.
 */
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
        __releases(&desc->lock)
{
        raw_spin_unlock_irqrestore(&desc->lock, flags);
        if (bus)
                chip_bus_sync_unlock(desc);
}

/*
 * Turn @irq into a per-CPU-devid interrupt: allocate the percpu_enabled
 * mask, record @affinity (cpu_possible_mask when @affinity is NULL) as
 * per CPU affinity and set the per-CPU-devid flags.
 *
 * Returns 0 on success, -EINVAL if @irq has no descriptor or was already
 * set up this way, -ENOMEM if the mask cannot be allocated.
 */
int irq_set_percpu_devid_partition(unsigned int irq,
                                   const struct cpumask *affinity)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc)
                return -EINVAL;
        if (desc->percpu_enabled)
                return -EINVAL;

        desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
        if (!desc->percpu_enabled)
                return -ENOMEM;

        desc->percpu_affinity = affinity ? affinity : cpu_possible_mask;

        irq_set_percpu_devid_flags(irq);
        return 0;
}

/*
 * Same as irq_set_percpu_devid_partition() without an affinity mask, which
 * makes that function use cpu_possible_mask.
 */
int irq_set_percpu_devid(unsigned int irq)
{
        return irq_set_percpu_devid_partition(irq, NULL);
}

int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc || !desc->percpu_enabled)
                return -EINVAL;

        if (affinity)
                cpumask_copy(affinity, desc->percpu_affinity);

        return 0;
}
EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);

/*
 * Account one interrupt for @irq on the current CPU. The descriptor lookup
 * result is handed to kstat_incr_irqs_this_cpu() without a NULL check.
 */
void kstat_incr_irq_this_cpu(unsigned int irq)
{
        kstat_incr_irqs_this_cpu(irq_to_desc(irq));
}

/**
 * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu
 * @irq:        The interrupt number
 * @cpu:        The cpu number
 *
 * Returns the number of interrupts counted for @irq on @cpu since boot,
 * or 0 if @irq has no descriptor or no statistics. The caller must ensure
 * that the interrupt is not removed concurrently.
 */
unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc || !desc->kstat_irqs)
                return 0;

        return per_cpu(desc->kstat_irqs->cnt, cpu);
}

static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask)
{
        unsigned int sum = 0;
        int cpu;

        if (!irq_settings_is_per_cpu_devid(desc) &&
            !irq_settings_is_per_cpu(desc) &&
            !irq_is_nmi(desc))
                return data_race(desc->tot_count);

        for_each_cpu(cpu, cpumask)
                sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu));
        return sum;
}

/*
 * Interrupt count of @irq over all possible CPUs; 0 when @irq has no
 * descriptor or no statistics.
 */
static unsigned int kstat_irqs(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (desc && desc->kstat_irqs)
                return kstat_irqs_desc(desc, cpu_possible_mask);

        return 0;
}

#ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT

/*
 * For every descriptor with statistics, copy this CPU's current interrupt
 * count into this CPU's reference value.
 */
void kstat_snapshot_irqs(void)
{
        struct irq_desc *desc;
        unsigned int irq;

        for_each_irq_desc(irq, desc) {
                if (desc->kstat_irqs) {
                        this_cpu_write(desc->kstat_irqs->ref,
                                       this_cpu_read(desc->kstat_irqs->cnt));
                }
        }
}

/*
 * Number of interrupts counted for @irq on this CPU since the reference
 * value was last written by kstat_snapshot_irqs(); 0 when @irq has no
 * descriptor or no statistics.
 */
unsigned int kstat_get_irq_since_snapshot(unsigned int irq)
{
        struct irq_desc *desc = irq_to_desc(irq);
        unsigned int now, ref;

        if (!desc || !desc->kstat_irqs)
                return 0;

        now = this_cpu_read(desc->kstat_irqs->cnt);
        ref = this_cpu_read(desc->kstat_irqs->ref);

        return now - ref;
}

#endif

/**
 * kstat_irqs_usr - Get the statistics for an interrupt from thread context
 * @irq:        The interrupt number
 *
 * Returns the sum of interrupt counts on all cpus since boot for @irq.
 *
 * It uses rcu to protect the access since a concurrent removal of an
 * interrupt descriptor is observing an rcu grace period before
 * delayed_free_desc()/irq_kobj_release().
 */
unsigned int kstat_irqs_usr(unsigned int irq)
{
        unsigned int sum;

        rcu_read_lock();
        sum = kstat_irqs(irq);
        rcu_read_unlock();
        return sum;
}

#ifdef CONFIG_LOCKDEP
/*
 * Assign @lock_class to the descriptor lock and @request_class to the
 * request mutex of @irq. Nothing happens if @irq has no descriptor.
 */
void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class,
                             struct lock_class_key *request_class)
{
        struct irq_desc *desc = irq_to_desc(irq);

        if (!desc)
                return;

        lockdep_set_class(&desc->lock, lock_class);
        lockdep_set_class(&desc->request_mutex, request_class);
}
EXPORT_SYMBOL_GPL(__irq_set_lockdep_class);
#endif














































































  105 





























    9 



   97 

   97 



   97 





































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2023 ARM Ltd.
 */

#include <linux/mm.h>
#include <linux/efi.h>
#include <linux/export.h>
#include <asm/tlbflush.h>

/*
 * Report whether @mm is a user address space, i.e. it is neither the EFI mm
 * nor init_mm.
 */
static inline bool mm_is_user(struct mm_struct *mm)
{
        /*
         * The contig bit must not be applied to kernel mappings: adding or
         * removing it dynamically can cause page faults. For user space such
         * racing faults are harmless because they get serialized on the PTL,
         * but kernel mappings can't tolerate faults.
         */
        return !unlikely(mm_is_efi(mm)) && mm != &init_mm;
}

/*
 * Round @ptep down to a multiple of the size in bytes of one contpte block
 * (CONT_PTES entries), i.e. to the first entry of the block containing it.
 */
static inline pte_t *contpte_align_down(pte_t *ptep)
{
        const unsigned long block_bytes = sizeof(*ptep) * CONT_PTES;

        return PTR_ALIGN_DOWN(ptep, block_bytes);
}

/*
 * Unfold the contpte blocks that the range [@ptep, @ptep + @nr) covers only
 * partially, at its beginning and at its end. Blocks fully inside the range
 * are left alone.
 */
static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep, unsigned int nr)
{
        pte_t *end_ptep = ptep + nr;
        unsigned long tail_addr;
        pte_t *tail_ptep;

        /* Head: the range starts mid-block, or is shorter than one block. */
        if (nr < CONT_PTES || contpte_align_down(ptep) != ptep)
                contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep));

        /* Tail: nothing to do when the range ends on a block boundary. */
        if (contpte_align_down(end_ptep) == end_ptep)
                return;

        /* Unfold via the last pte that is actually inside the range. */
        tail_ptep = end_ptep - 1;
        tail_addr = addr + PAGE_SIZE * (nr - 1);
        contpte_try_unfold(mm, tail_addr, tail_ptep, __ptep_get(tail_ptep));
}

/*
 * Rewrite the whole contpte block containing @ptep/@addr using @pte as the
 * template. Callers in this file pass a template with the contiguous bit
 * already set (fold) or already cleared (unfold).
 *
 * The block is cleared entry by entry, the TLB is flushed for the block's
 * address range, and only then are the new entries installed. The access and
 * dirty bits found in the old entries are carried over into the new ones.
 */
static void contpte_convert(struct mm_struct *mm, unsigned long addr,
                            pte_t *ptep, pte_t pte)
{
        struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
        unsigned long start_addr;
        pte_t *start_ptep;
        int i;

        /* Operate on the whole block: align pointer, address and pfn down. */
        start_ptep = ptep = contpte_align_down(ptep);
        start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);
        pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte));

        /* Clear every entry, accumulating dirty/young into the template. */
        for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) {
                pte_t ptent = __ptep_get_and_clear(mm, addr, ptep);

                if (pte_dirty(ptent))
                        pte = pte_mkdirty(pte);

                if (pte_young(ptent))
                        pte = pte_mkyoung(pte);
        }

        /* After the loop, addr is the end of the block: flush [start, end). */
        __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3);

        /* Install the new entries for the whole block in one go. */
        __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES);
}

/*
 * Fold the contpte block containing @ptep into a contiguous mapping, if the
 * block qualifies. Returns silently without changing anything when it does
 * not (non-user mm, block not covered by a single folio, or entries that do
 * not form a uniform physically contiguous run).
 */
void __contpte_try_fold(struct mm_struct *mm, unsigned long addr,
                        pte_t *ptep, pte_t pte)
{
        /*
         * We have already checked that the virtual and physical addresses are
         * correctly aligned for a contpte mapping in contpte_try_fold() so the
         * remaining checks are to ensure that the contpte range is fully
         * covered by a single folio, and ensure that all the ptes are valid
         * with contiguous PFNs and matching prots. We ignore the state of the
         * access and dirty bits for the purpose of deciding if it's a contiguous
         * range; the folding process will generate a single contpte entry which
         * has a single access and dirty bit. Those 2 bits are the logical OR of
         * their respective bits in the constituent pte entries. In order to
         * ensure the contpte range is covered by a single folio, we must
         * recover the folio from the pfn, but special mappings don't have a
         * folio backing them. Fortunately contpte_try_fold() already checked
         * that the pte is not special - we never try to fold special mappings.
         * Note we can't use vm_normal_page() for this since we don't have the
         * vma.
         */

        unsigned long folio_start, folio_end;
        unsigned long cont_start, cont_end;
        pte_t expected_pte, subpte;
        struct folio *folio;
        struct page *page;
        unsigned long pfn;
        pte_t *orig_ptep;
        pgprot_t prot;

        int i;

        if (!mm_is_user(mm))
                return;

        /*
         * Compute the virtual address range the folio would occupy if it were
         * mapped contiguously around addr, and the range of the contpte block.
         */
        page = pte_page(pte);
        folio = page_folio(page);
        folio_start = addr - (page - &folio->page) * PAGE_SIZE;
        folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE;
        cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE);
        cont_end = cont_start + CONT_PTE_SIZE;

        /* The folio must cover the whole contpte block. */
        if (folio_start > cont_start || folio_end < cont_end)
                return;

        /*
         * Build the entry expected in the first slot of the block: block-aligned
         * pfn, and the prot of pte with access/dirty masked out.
         */
        pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES);
        prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
        expected_pte = pfn_pte(pfn, prot);
        orig_ptep = ptep;
        ptep = contpte_align_down(ptep);

        /* Every entry must match, ignoring access/dirty, with pfn advancing by 1. */
        for (i = 0; i < CONT_PTES; i++) {
                subpte = pte_mkold(pte_mkclean(__ptep_get(ptep)));
                if (!pte_same(subpte, expected_pte))
                        return;
                expected_pte = pte_advance_pfn(expected_pte, 1);
                ptep++;
        }

        /* All checks passed: rewrite the block with the contiguous bit set. */
        pte = pte_mkcont(pte);
        contpte_convert(mm, addr, orig_ptep, pte);
}
EXPORT_SYMBOL_GPL(__contpte_try_fold);

/*
 * Rewrite the contpte block containing @ptep without the contiguous bit.
 * Does nothing for non-user mms.
 */
void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr,
                        pte_t *ptep, pte_t pte)
{
        /*
         * contpte_try_unfold() has already established that the ptes are
         * contiguous, so the only check left is that this is a user mm.
         */
        if (mm_is_user(mm))
                contpte_convert(mm, addr, ptep, pte_mknoncont(pte));
}
EXPORT_SYMBOL_GPL(__contpte_try_unfold);

/*
 * Return @orig_pte with the dirty and young bits set if any pte in the contpte
 * block containing @ptep has them set.
 */
pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
{
        /*
         * Access/dirty may have been recorded in any pte of the contig range,
         * so all of them are inspected. The PTL is guaranteed to be held, which
         * means the range cannot be unfolded or otherwise modified while we
         * walk it.
         */
        pte_t *cur = contpte_align_down(ptep);
        pte_t *const end = cur + CONT_PTES;

        for (; cur != end; cur++) {
                pte_t sub = __ptep_get(cur);

                if (pte_dirty(sub))
                        orig_pte = pte_mkdirty(orig_pte);

                if (pte_young(sub))
                        orig_pte = pte_mkyoung(orig_pte);
        }

        return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get);

/*
 * Read *@orig_ptep without the PTL held. If the entry is a valid contiguous
 * pte, the returned value also carries the dirty/young bits gathered from the
 * whole contpte block; the read is restarted until a consistent block is seen.
 */
pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
{
        /*
         * The ptep_get_lockless() API requires us to read and return *orig_ptep
         * so that it is self-consistent, without the PTL held, so we may be
         * racing with other threads modifying the pte. Usually a READ_ONCE()
         * would suffice, but for the contpte case, we also need to gather the
         * access and dirty bits from across all ptes in the contiguous block,
         * and we can't read all of those neighbouring ptes atomically, so any
         * contiguous range may be unfolded/modified/refolded under our feet.
         * Therefore we ensure we read a _consistent_ contpte range by checking
         * that all ptes in the range are valid and have CONT_PTE set, that all
         * pfns are contiguous and that all pgprots are the same (ignoring
         * access/dirty). If we find a pte that is not consistent, then we must
         * be racing with an update so start again. If the target pte does not
         * have CONT_PTE set then that is considered consistent on its own
         * because it is not part of a contpte range.
         */

        pgprot_t orig_prot;
        unsigned long pfn;
        pte_t orig_pte;
        pgprot_t prot;
        pte_t *ptep;
        pte_t pte;
        int i;

retry:
        orig_pte = __ptep_get(orig_ptep);

        /* Not part of a contpte range: the single read is the answer. */
        if (!pte_valid_cont(orig_pte))
                return orig_pte;

        /*
         * Reference prot (access/dirty masked out) and the pfn expected in the
         * first entry of the block, derived from the target's position in it.
         */
        orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte)));
        ptep = contpte_align_down(orig_ptep);
        pfn = pte_pfn(orig_pte) - (orig_ptep - ptep);

        for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
                pte = __ptep_get(ptep);
                prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));

                /* Any mismatch means we raced with an update: start over. */
                if (!pte_valid_cont(pte) ||
                   pte_pfn(pte) != pfn ||
                   pgprot_val(prot) != pgprot_val(orig_prot))
                        goto retry;

                if (pte_dirty(pte))
                        orig_pte = pte_mkdirty(orig_pte);

                if (pte_young(pte))
                        orig_pte = pte_mkyoung(orig_pte);
        }

        return orig_pte;
}
EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless);

/*
 * Install @nr ptes starting at @ptep/@addr. The first entry is @pte and the
 * following ones use consecutive pfns with the same prot. The range is written
 * in chunks; a chunk gets the contiguous bit only when its start address, its
 * end address and its physical address are all CONT_PTE_SIZE aligned.
 *
 * For non-user mms the request is passed unchanged to __set_ptes().
 */
void contpte_set_ptes(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep, pte_t pte, unsigned int nr)
{
        unsigned long next;
        unsigned long end;
        unsigned long pfn;
        pgprot_t prot;

        /*
         * The set_ptes() spec guarantees that when nr > 1, the initial state of
         * all ptes is not-present. Therefore we never need to unfold or
         * otherwise invalidate a range before we set the new ptes.
         * contpte_set_ptes() should never be called for nr < 2.
         */
        VM_WARN_ON(nr == 1);

        if (!mm_is_user(mm))
                return __set_ptes(mm, addr, ptep, pte, nr);

        end = addr + (nr << PAGE_SHIFT);
        pfn = pte_pfn(pte);
        prot = pte_pgprot(pte);

        do {
                /*
                 * NOTE(review): pte_cont_addr_end() is defined elsewhere;
                 * presumably it returns the next contpte boundary after addr,
                 * capped at end - confirm. From here on nr is reused as the
                 * number of pages in the current chunk.
                 */
                next = pte_cont_addr_end(addr, end);
                nr = (next - addr) >> PAGE_SHIFT;
                pte = pfn_pte(pfn, prot);

                /* Any low bit set in start, end or phys addr rules out contig. */
                if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0)
                        pte = pte_mkcont(pte);
                else
                        pte = pte_mknoncont(pte);

                __set_ptes(mm, addr, ptep, pte, nr);

                addr = next;
                ptep += nr;
                pfn += nr;

        } while (addr != end);
}
EXPORT_SYMBOL_GPL(contpte_set_ptes);

/*
 * Unfold any contpte block that is only partially covered at either end of
 * the range, then hand the whole range to __clear_full_ptes() with the
 * caller's @full value passed through unchanged.
 */
void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                                pte_t *ptep, unsigned int nr, int full)
{
        contpte_try_unfold_partial(mm, addr, ptep, nr);
        __clear_full_ptes(mm, addr, ptep, nr, full);
}
EXPORT_SYMBOL_GPL(contpte_clear_full_ptes);

/*
 * Unfold any contpte block that is only partially covered at either end of
 * the range, then return whatever __get_and_clear_full_ptes() returns for the
 * whole range.
 */
pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm,
                                unsigned long addr, pte_t *ptep,
                                unsigned int nr, int full)
{
        contpte_try_unfold_partial(mm, addr, ptep, nr);
        return __get_and_clear_full_ptes(mm, addr, ptep, nr, full);
}
EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes);

/*
 * Test and clear the young bit on every pte of the contpte block containing
 * @ptep. Returns the OR of the per-pte results.
 */
int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep)
{
        /*
         * Strictly speaking, ptep_clear_flush_young() asks for the access flag
         * of a _single_ pte to be cleared. The core-mm, however, tracks
         * access/dirty per folio rather than per page, and a contig range is
         * only ever created when a single folio covers it. Clearing young for
         * the whole contig range is therefore acceptable and spares us an
         * unfold.
         */
        pte_t *first = contpte_align_down(ptep);
        unsigned long base = ALIGN_DOWN(addr, CONT_PTE_SIZE);
        int young = 0;
        int i;

        for (i = 0; i < CONT_PTES; i++)
                young |= __ptep_test_and_clear_young(vma, base + i * PAGE_SIZE,
                                                     first + i);

        return young;
}
EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young);

/*
 * Clear the young bit across the contpte block containing @ptep and, if any
 * pte was young, issue a TLB invalidation for the whole block. Returns the
 * value from contpte_ptep_test_and_clear_young().
 */
int contpte_ptep_clear_flush_young(struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep)
{
        int young = contpte_ptep_test_and_clear_young(vma, addr, ptep);
        unsigned long start;

        /* Nothing was young, so there is nothing to invalidate. */
        if (!young)
                return young;

        /*
         * The trailing DSB is elided for the same reason given in the comment
         * in __ptep_clear_flush_young().
         */
        start = ALIGN_DOWN(addr, CONT_PTE_SIZE);
        __flush_tlb_range_nosync(vma->vm_mm, start, start + CONT_PTE_SIZE,
                                 PAGE_SIZE, true, 3);

        return young;
}
EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young);

/*
 * Write-protect @nr ptes starting at @ptep/@addr. Contpte blocks only
 * partially covered by the range are unfolded first; fully covered blocks are
 * write-protected in place by __wrprotect_ptes().
 */
void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                                        pte_t *ptep, unsigned int nr)
{
        /*
         * If wrprotecting an entire contig range, we can avoid unfolding. Just
         * set wrprotect and wait for the later mmu_gather flush to invalidate
         * the tlb. Until the flush, the page may or may not be wrprotected.
         * After the flush, it is guaranteed wrprotected. If it's a partial
         * range though, we must unfold, because we can't have a case where
         * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this
         * would cause it to continue to be unpredictable after the flush.
         */

        contpte_try_unfold_partial(mm, addr, ptep, nr);
        __wrprotect_ptes(mm, addr, ptep, nr);
}
EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes);

/*
 * Clear young/dirty (as selected by @flags) for @nr ptes starting at
 * @ptep/@addr, widening the range to whole contpte blocks at either end when
 * the first or last pte is contiguous.
 */
void contpte_clear_young_dirty_ptes(struct vm_area_struct *vma,
                                    unsigned long addr, pte_t *ptep,
                                    unsigned int nr, cydp_t flags)
{
        /*
         * From the architecture's point of view access/dirty can be cleared
         * without unfolding, even with contpte set. When the range begins or
         * ends in the middle of a contpte block we simply grow it to take in
         * the full block. That is more than the core-mm requested, but the
         * core-mm tracks access/dirty per folio rather than per page, and a
         * contpte block is only created when a single folio covers it, so
         * clearing access/dirty for the whole block is acceptable.
         */
        const bool tail_is_cont = pte_cont(__ptep_get(ptep + nr - 1));
        const bool head_is_cont = pte_cont(__ptep_get(ptep));
        unsigned long end = addr + nr * PAGE_SIZE;
        unsigned long start = addr;

        /* Grow the end up to the block boundary. */
        if (tail_is_cont)
                end = ALIGN(end, CONT_PTE_SIZE);

        /* Pull the start (and the pte pointer) back to the block boundary. */
        if (head_is_cont) {
                start = ALIGN_DOWN(addr, CONT_PTE_SIZE);
                ptep = contpte_align_down(ptep);
        }

        __clear_young_dirty_ptes(vma, start, ptep, (end - start) / PAGE_SIZE, flags);
}
EXPORT_SYMBOL_GPL(contpte_clear_young_dirty_ptes);

/*
 * Update the access flags of the pte at @ptep/@addr to those in @entry, for a
 * pte that belongs to a contpte block.
 *
 * Returns 0 if the current entry (with the contiguous bit masked out) already
 * equals @entry, otherwise 1 after applying the update.
 */
int contpte_ptep_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep,
                                        pte_t entry, int dirty)
{
        unsigned long start_addr;
        pte_t orig_pte;
        int i;

        /*
         * Gather the access/dirty bits for the contiguous range. If nothing has
         * changed, it's a no-op.
         */
        orig_pte = pte_mknoncont(ptep_get(ptep));
        if (pte_val(orig_pte) == pte_val(entry))
                return 0;

        /*
         * We can fix up access/dirty bits without having to unfold the contig
         * range. But if the write bit is changing, we must unfold.
         */
        if (pte_write(orig_pte) == pte_write(entry)) {
                /*
                 * For HW access management, we technically only need to update
                 * the flag on a single pte in the range. But for SW access
                 * management, we need to update all the ptes to prevent extra
                 * faults. Avoid per-page tlb flush in __ptep_set_access_flags()
                 * and instead flush the whole range at the end.
                 */
                ptep = contpte_align_down(ptep);
                start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE);

                /*
                 * We are not advancing entry because __ptep_set_access_flags()
                 * only consumes access flags from entry. And since we have checked
                 * for the whole contpte block and returned early, pte_same()
                 * within __ptep_set_access_flags() is likely false.
                 */
                for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE)
                        __ptep_set_access_flags(vma, addr, ptep, entry, 0);

                /* addr now points just past the block: flush [start, end). */
                if (dirty)
                        __flush_tlb_range(vma, start_addr, addr,
                                                        PAGE_SIZE, true, 3);
        } else {
                /* Write permission differs: unfold, then update the one pte. */
                __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte);
                __ptep_set_access_flags(vma, addr, ptep, entry, dirty);
        }

        return 1;
}
EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags);





























































  265 



  265 



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
// SPDX-License-Identifier: GPL-2.0
/*
 * bus.c - bus driver management
 *
 * Copyright (c) 2002-3 Patrick Mochel
 * Copyright (c) 2002-3 Open Source Development Labs
 * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de>
 * Copyright (c) 2007 Novell Inc.
 * Copyright (c) 2023 Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 */

#include <linux/async.h>
#include <linux/device/bus.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/mutex.h>
#include <linux/sysfs.h>
#include "base.h"
#include "power/power.h"

/* /sys/devices/system */
static struct kset *system_kset;

/* /sys/bus */
static struct kset *bus_kset;

/* Recover the enclosing struct bus_attribute from its embedded struct attribute. */
#define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr)

/*
 * sysfs bindings for drivers
 */

/* Recover the enclosing struct driver_attribute from its embedded struct attribute. */
#define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr)

/*
 * Define a struct driver_attribute named driver_attr_<_name>, initialized
 * through __ATTR_IGNORE_LOCKDEP().
 */
#define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \
        struct driver_attribute driver_attr_##_name =                \
                __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store)

/* Forward declaration: used by drivers_probe_store() before its definition. */
static int __must_check bus_rescan_devices_helper(struct device *dev,
                                                void *data);

/**
 * bus_to_subsys - Turn a struct bus_type into a struct subsys_private
 *
 * @bus: pointer to the struct bus_type to look up
 *
 * The driver core internals need to work on the subsys_private structure, not
 * the external struct bus_type pointer.  This function walks the list of
 * registered buses in the system, finds the matching one and returns the
 * internal struct subsys_private that relates to that bus.
 *
 * Note, the reference count of the return value is INCREMENTED if it is not
 * NULL.  A call to subsys_put() must be done when finished with the pointer in
 * order for it to be properly freed.
 *
 * Return: the matching subsys_private, or NULL if @bus is NULL, the bus kset
 * has not been set up, or no registered bus matches.
 */
struct subsys_private *bus_to_subsys(const struct bus_type *bus)
{
        struct subsys_private *sp = NULL;
        struct kobject *kobj;

        if (!bus || !bus_kset)
                return NULL;

        /* The list walk and the reference grab both happen under list_lock. */
        spin_lock(&bus_kset->list_lock);

        if (list_empty(&bus_kset->list))
                goto done;

        list_for_each_entry(kobj, &bus_kset->list, entry) {
                struct kset *kset = container_of(kobj, struct kset, kobj);

                sp = container_of_const(kset, struct subsys_private, subsys);
                if (sp->bus == bus)
                        goto done;
        }
        /* Loop ran to completion: sp holds a non-matching entry, discard it. */
        sp = NULL;
done:
        /* sp may be NULL here (empty list or no match). */
        sp = subsys_get(sp);
        spin_unlock(&bus_kset->list_lock);
        return sp;
}

/*
 * Take a reference on the subsys_private behind @bus (via bus_to_subsys()).
 * Returns @bus if the lookup succeeded, NULL otherwise. Pair with bus_put().
 */
static const struct bus_type *bus_get(const struct bus_type *bus)
{
        return bus_to_subsys(bus) ? bus : NULL;
}

/* Drop the reference on @bus's subsys_private that bus_get() took. */
static void bus_put(const struct bus_type *bus)
{
        struct subsys_private *sp;

        sp = bus_to_subsys(bus);

        /* undo the extra reference the bus_to_subsys() lookup just took ... */
        subsys_put(sp);
        /* ... and then release the reference this call is meant to drop */
        subsys_put(sp);
}

/*
 * sysfs show() entry point for driver attributes: forward to the attribute's
 * own show() callback, or fail with -EIO if it has none.
 */
static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr,
                             char *buf)
{
        struct driver_private *priv = to_driver(kobj);
        struct driver_attribute *dattr = to_drv_attr(attr);

        if (!dattr->show)
                return -EIO;

        return dattr->show(priv->driver, buf);
}

/*
 * sysfs store() entry point for driver attributes: forward to the attribute's
 * own store() callback, or fail with -EIO if it has none.
 */
static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr,
                              const char *buf, size_t count)
{
        struct driver_private *priv = to_driver(kobj);
        struct driver_attribute *dattr = to_drv_attr(attr);

        if (!dattr->store)
                return -EIO;

        return dattr->store(priv->driver, buf, count);
}

/* sysfs read/write dispatch for driver kobjects (see driver_ktype). */
static const struct sysfs_ops driver_sysfs_ops = {
        .show        = drv_attr_show,
        .store        = drv_attr_store,
};

/* kobject release callback for drivers: frees the enclosing driver_private. */
static void driver_release(struct kobject *kobj)
{
        pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__);

        kfree(to_driver(kobj));
}

/* kobj_type tying driver kobjects to their sysfs ops and release callback. */
static const struct kobj_type driver_ktype = {
        .sysfs_ops        = &driver_sysfs_ops,
        .release        = driver_release,
};

/*
 * sysfs bindings for buses
 */
/*
 * sysfs show() entry point for bus attributes: forward to the attribute's own
 * show() callback with the bus that owns @kobj.
 */
static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr,
                             char *buf)
{
        struct subsys_private *sp = to_subsys_private(kobj);
        struct bus_attribute *battr = to_bus_attr(attr);

        /* reading a bus attribute that has no show() yields -EIO */
        if (!battr->show)
                return -EIO;

        return battr->show(sp->bus, buf);
}

/*
 * sysfs store() entry point for bus attributes: forward to the attribute's own
 * store() callback with the bus that owns @kobj.
 */
static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr,
                              const char *buf, size_t count)
{
        struct subsys_private *sp = to_subsys_private(kobj);
        struct bus_attribute *battr = to_bus_attr(attr);

        /* writing a bus attribute that has no store() yields -EIO */
        if (!battr->store)
                return -EIO;

        return battr->store(sp->bus, buf, count);
}

/* sysfs read/write dispatch for bus kobjects (see bus_ktype). */
static const struct sysfs_ops bus_sysfs_ops = {
        .show        = bus_attr_show,
        .store        = bus_attr_store,
};

int bus_create_file(const struct bus_type *bus, struct bus_attribute *attr)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        int error;

        if (!sp)
                return -EINVAL;

        error = sysfs_create_file(&sp->subsys.kobj, &attr->attr);

        subsys_put(sp);
        return error;
}
EXPORT_SYMBOL_GPL(bus_create_file);

/*
 * Remove the sysfs file for @attr from @bus's kobject. Silently does nothing
 * if @bus has no subsys_private.
 */
void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr)
{
        struct subsys_private *sp;

        sp = bus_to_subsys(bus);
        if (sp) {
                sysfs_remove_file(&sp->subsys.kobj, &attr->attr);
                /* release the reference taken by bus_to_subsys() */
                subsys_put(sp);
        }
}
EXPORT_SYMBOL_GPL(bus_remove_file);

/*
 * kobject release callback for buses: unregisters the lockdep key embedded in
 * the subsys_private and frees the structure.
 */
static void bus_release(struct kobject *kobj)
{
        struct subsys_private *sp;

        sp = to_subsys_private(kobj);
        lockdep_unregister_key(&sp->lock_key);
        kfree(sp);
}

/* kobj_type tying bus kobjects to their sysfs ops and release callback. */
static const struct kobj_type bus_ktype = {
        .sysfs_ops        = &bus_sysfs_ops,
        .release        = bus_release,
};

/* uevent filter: returns 1 only for kobjects whose ktype is bus_ktype, else 0. */
static int bus_uevent_filter(const struct kobject *kobj)
{
        return get_ktype(kobj) == &bus_ktype;
}

/* kset uevent operations that install bus_uevent_filter() as the filter. */
static const struct kset_uevent_ops bus_uevent_ops = {
        .filter = bus_uevent_filter,
};

/*
 * Manually detach a device from its associated driver.
 *
 * @buf names the device on @drv's bus. Returns @count on success, or -ENODEV
 * if no such device exists or it is not bound to @drv.
 */
static ssize_t unbind_store(struct device_driver *drv, const char *buf,
                            size_t count)
{
        const struct bus_type *bus = bus_get(drv->bus);
        struct device *dev = bus_find_device_by_name(bus, NULL, buf);
        int err;

        if (!dev || dev->driver != drv) {
                err = -ENODEV;
        } else {
                device_driver_detach(dev);
                err = count;
        }

        /* called unconditionally, so dev may be NULL here */
        put_device(dev);
        bus_put(bus);
        return err;
}
static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store);

/*
 * Manually attach a device to a driver.
 * Note: the driver must want to bind to the device,
 * it is not possible to override the driver's id table.
 *
 * @buf names the device on @drv's bus. Returns @count on success, -ENODEV if
 * the device is not found or does not match @drv, or the error returned by
 * device_driver_attach().
 */
static ssize_t bind_store(struct device_driver *drv, const char *buf,
                          size_t count)
{
        const struct bus_type *bus = bus_get(drv->bus);
        struct device *dev = bus_find_device_by_name(bus, NULL, buf);
        int err = -ENODEV;

        if (!dev || !driver_match_device(drv, dev))
                goto out;

        err = device_driver_attach(drv, dev);
        if (err == 0)
                err = count;        /* success */
out:
        /* called unconditionally, so dev may be NULL here */
        put_device(dev);
        bus_put(bus);
        return err;
}
static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store);

/*
 * Show the bus's drivers_autoprobe flag as a decimal number.
 * Returns -EINVAL if no subsys_private is found for @bus.
 */
static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        int len = -EINVAL;

        if (sp) {
                len = sysfs_emit(buf, "%d\n", sp->drivers_autoprobe);
                subsys_put(sp);
        }
        return len;
}

/*
 * Set the bus's drivers_autoprobe flag: a leading '0' in @buf clears it,
 * anything else sets it.  Returns @count, or -EINVAL if no subsys_private
 * is found for @bus.
 */
static ssize_t drivers_autoprobe_store(const struct bus_type *bus,
                                       const char *buf, size_t count)
{
        struct subsys_private *sp = bus_to_subsys(bus);

        if (!sp)
                return -EINVAL;

        sp->drivers_autoprobe = (buf[0] != '0');
        subsys_put(sp);

        return count;
}

/*
 * Look up the device named by @buf on @bus and run the rescan helper on it.
 * Returns @count when the helper returns 0, -EINVAL when it returns
 * non-zero, and -ENODEV when the device is not found.
 */
static ssize_t drivers_probe_store(const struct bus_type *bus,
                                   const char *buf, size_t count)
{
        struct device *dev = bus_find_device_by_name(bus, NULL, buf);
        int ret;

        if (!dev)
                return -ENODEV;

        if (bus_rescan_devices_helper(dev, NULL) != 0)
                ret = -EINVAL;
        else
                ret = count;

        put_device(dev);
        return ret;
}

/*
 * Advance the klist iterator and return the device owning the next node,
 * or NULL when the iteration is finished.
 */
static struct device *next_device(struct klist_iter *i)
{
        struct klist_node *node = klist_next(i);
        struct device_private *dev_prv;

        if (!node)
                return NULL;

        dev_prv = to_device_private_bus(node);
        return dev_prv->device;
}

/**
 * bus_for_each_dev - device iterator.
 * @bus: bus type.
 * @start: device to start iterating from.
 * @data: data for the callback.
 * @fn: function to be called for each device.
 *
 * Walk @bus's list of devices and call @fn on each one, handing it @data.
 * When @start is not NULL the walk begins from that device.
 *
 * The return value of @fn is checked after every call; the first non-zero
 * value stops the walk and is returned to the caller.
 *
 * NOTE: The device that returns a non-zero value is not retained
 * in any way, nor is its refcount incremented. If the caller needs
 * to retain this data, it should do so, and increment the reference
 * count in the supplied callback.
 *
 * Return: 0 if every callback returned 0, the first non-zero callback
 * result otherwise, or -EINVAL if no subsys_private is found for @bus.
 */
int bus_for_each_dev(const struct bus_type *bus, struct device *start,
                     void *data, device_iter_t fn)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        struct klist_node *start_node;
        struct klist_iter iter;
        struct device *dev;
        int error = 0;

        if (!sp)
                return -EINVAL;

        start_node = start ? &start->p->knode_bus : NULL;
        klist_iter_init_node(&sp->klist_devices, &iter, start_node);
        while ((dev = next_device(&iter)) != NULL) {
                error = fn(dev, data);
                if (error)
                        break;
        }
        klist_iter_exit(&iter);

        subsys_put(sp);
        return error;
}
EXPORT_SYMBOL_GPL(bus_for_each_dev);

/**
 * bus_find_device - device iterator for locating a particular device.
 * @bus: bus type
 * @start: Device to begin with
 * @data: Data to pass to match function
 * @match: Callback function to check device
 *
 * Works like bus_for_each_dev(), except that the device selected by the
 * @match callback is returned with a reference held so the caller can keep
 * using it.
 *
 * The callback should return 0 if the device doesn't match and non-zero
 * if it does.  The walk stops at the first device for which the callback
 * returns non-zero.
 *
 * Return: the matching device with its reference count incremented, or
 * NULL if nothing matched or no subsys_private is found for @bus.
 */
struct device *bus_find_device(const struct bus_type *bus,
                               struct device *start, const void *data,
                               device_match_t match)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        struct klist_node *start_node;
        struct klist_iter iter;
        struct device *dev;

        if (!sp)
                return NULL;

        start_node = start ? &start->p->knode_bus : NULL;
        klist_iter_init_node(&sp->klist_devices, &iter, start_node);
        while ((dev = next_device(&iter))) {
                if (!match(dev, data))
                        continue;
                /* Take the reference that is handed to the caller. */
                get_device(dev);
                break;
        }
        klist_iter_exit(&iter);

        subsys_put(sp);
        return dev;
}
EXPORT_SYMBOL_GPL(bus_find_device);

/*
 * Advance the klist iterator and return the driver owning the next node,
 * or NULL when the iteration is finished.
 */
static struct device_driver *next_driver(struct klist_iter *i)
{
        struct klist_node *node = klist_next(i);
        struct driver_private *drv_priv;

        if (!node)
                return NULL;

        drv_priv = container_of(node, struct driver_private, knode_bus);
        return drv_priv->driver;
}

/**
 * bus_for_each_drv - driver iterator
 * @bus: bus we're dealing with.
 * @start: driver to start iterating on.
 * @data: data to pass to the callback.
 * @fn: function to call for each driver.
 *
 * This is nearly identical to the device iterator above.
 * We iterate over each driver that belongs to @bus, and call
 * @fn for each. If @fn returns anything but 0, we break out
 * and return it. If @start is not NULL, we use it as the head
 * of the list.
 *
 * NOTE: we don't return the driver that returns a non-zero
 * value, nor do we leave the reference count incremented for that
 * driver. If the caller needs to know that info, it must set it
 * in the callback. It must also be sure to increment the refcount
 * so it doesn't disappear before returning to the caller.
 */
int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start,
                     void *data, int (*fn)(struct device_driver *, void *))
{
        struct subsys_private *sp = bus_to_subsys(bus);
        struct klist_iter i;
        struct device_driver *drv;
        int error = 0;

        if (!sp)
                return -EINVAL;

        klist_iter_init_node(&sp->klist_drivers, &i,
                             start ? &start->p->knode_bus : NULL);
        while ((drv = next_driver(&i)) && !error)
                error = fn(drv, data);
        klist_iter_exit(&i);
        subsys_put(sp);
        return error;
}
EXPORT_SYMBOL_GPL(bus_for_each_drv);

/**
 * bus_add_device - add device to bus
 * @dev: device being added
 *
 * - Add device's bus attributes.
 * - Create links to device's bus.
 * - Add the device to its bus's list of devices.
 */
int bus_add_device(struct device *dev)
{
        struct subsys_private *sp = bus_to_subsys(dev->bus);
        int error;

        if (!sp) {
                /*
                 * This is a normal operation for many devices that do not
                 * have a bus assigned to them, just say that all went
                 * well.
                 */
                return 0;
        }

        /*
         * Reference in sp is now incremented and will be dropped when
         * the device is removed from the bus
         */

        pr_debug("bus: '%s': add device %s\n", sp->bus->name, dev_name(dev));

        error = device_add_groups(dev, sp->bus->dev_groups);
        if (error)
                goto out_put;

        error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev));
        if (error)
                goto out_groups;

        error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem");
        if (error)
                goto out_subsys;

        klist_add_tail(&dev->p->knode_bus, &sp->klist_devices);
        return 0;

out_subsys:
        sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev));
out_groups:
        device_remove_groups(dev, sp->bus->dev_groups);
out_put:
        subsys_put(sp);
        return error;
}

/**
 * bus_probe_device - probe drivers for a new device
 * @dev: device to probe
 *
 * - Automatically probe for a driver if the bus allows it.
 */
void bus_probe_device(struct device *dev)
{
        struct subsys_private *sp = bus_to_subsys(dev->bus);
        struct subsys_interface *sif;

        if (!sp)
                return;

        if (sp->drivers_autoprobe)
                device_initial_probe(dev);

        mutex_lock(&sp->mutex);
        list_for_each_entry(sif, &sp->interfaces, node)
                if (sif->add_dev)
                        sif->add_dev(dev, sif);
        mutex_unlock(&sp->mutex);
        subsys_put(sp);
}

/**
 * bus_remove_device - remove device from bus
 * @dev: device to be removed
 *
 * - Remove device from all interfaces.
 * - Remove symlink from bus' directory.
 * - Delete device from bus's list.
 * - Detach from its driver.
 * - Drop reference taken in bus_add_device().
 *
 * Does nothing if no subsys_private is found for @dev's bus.
 */
void bus_remove_device(struct device *dev)
{
        struct subsys_private *sp = bus_to_subsys(dev->bus);
        struct subsys_interface *sif;

        if (!sp)
                return;

        /* Call each interface's remove_dev() hook with the subsys mutex held. */
        mutex_lock(&sp->mutex);
        list_for_each_entry(sif, &sp->interfaces, node)
                if (sif->remove_dev)
                        sif->remove_dev(dev, sif);
        mutex_unlock(&sp->mutex);

        /* Undo the links and attribute groups created by bus_add_device(). */
        sysfs_remove_link(&dev->kobj, "subsystem");
        sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev));
        device_remove_groups(dev, dev->bus->dev_groups);
        /* Only unlink the node if it is still on the bus's device klist. */
        if (klist_node_attached(&dev->p->knode_bus))
                klist_del(&dev->p->knode_bus);

        pr_debug("bus: '%s': remove device %s\n",
                 dev->bus->name, dev_name(dev));
        device_release_driver(dev);

        /*
         * Decrement the reference count twice, once for the bus_to_subsys()
         * call in the start of this function, and the second one from the
         * reference increment in bus_add_device()
         */
        subsys_put(sp);
        subsys_put(sp);
}

/*
 * Create the "unbind" and "bind" attribute files for @drv.  If creating
 * "bind" fails, the already created "unbind" file is removed again.
 * Returns 0 on success or the error from driver_create_file().
 */
static int __must_check add_bind_files(struct device_driver *drv)
{
        int ret = driver_create_file(drv, &driver_attr_unbind);

        if (ret)
                return ret;

        ret = driver_create_file(drv, &driver_attr_bind);
        if (ret)
                driver_remove_file(drv, &driver_attr_unbind);

        return ret;
}

/* Remove the "bind" and "unbind" attribute files created by add_bind_files(). */
static void remove_bind_files(struct device_driver *drv)
{
        driver_remove_file(drv, &driver_attr_bind);
        driver_remove_file(drv, &driver_attr_unbind);
}

/*
 * Bus attributes backed by drivers_probe_store() and
 * drivers_autoprobe_show()/drivers_autoprobe_store() above.
 */
static BUS_ATTR_WO(drivers_probe);
static BUS_ATTR_RW(drivers_autoprobe);

/*
 * Create the "drivers_probe" and "drivers_autoprobe" attribute files for
 * @bus.  If the second file cannot be created, the first one is removed
 * again.  Returns 0 on success or the error from bus_create_file().
 */
static int add_probe_files(const struct bus_type *bus)
{
        int ret = bus_create_file(bus, &bus_attr_drivers_probe);

        if (ret)
                return ret;

        ret = bus_create_file(bus, &bus_attr_drivers_autoprobe);
        if (ret)
                bus_remove_file(bus, &bus_attr_drivers_probe);

        return ret;
}

/* Remove the attribute files created by add_probe_files(). */
static void remove_probe_files(const struct bus_type *bus)
{
        bus_remove_file(bus, &bus_attr_drivers_autoprobe);
        bus_remove_file(bus, &bus_attr_drivers_probe);
}

/*
 * Driver "uevent" attribute: pass @buf to kobject_synth_uevent() for the
 * driver's kobject.  Returns @count on success or the error it reported.
 */
static ssize_t uevent_store(struct device_driver *drv, const char *buf,
                            size_t count)
{
        int rc = kobject_synth_uevent(&drv->p->kobj, buf, count);

        if (rc)
                return rc;
        return count;
}
static DRIVER_ATTR_WO(uevent);

/**
 * bus_add_driver - Add a driver to the bus.
 * @drv: driver.
 *
 * Allocate the driver's private data, register its kobject in the bus's
 * drivers kset, add it to the bus's driver list, attach it to devices when
 * autoprobe is enabled and create its module links and sysfs attributes.
 *
 * Failures while creating the uevent attribute, the driver groups or the
 * bind files are only logged; the function still returns 0 in that case.
 *
 * Return: 0 on success, -EINVAL if no subsys_private is found for the
 * driver's bus, -ENOMEM if the private data cannot be allocated, or the
 * error from kobject_init_and_add(), driver_attach() or module_add_driver().
 */
int bus_add_driver(struct device_driver *drv)
{
        struct subsys_private *sp = bus_to_subsys(drv->bus);
        struct driver_private *priv;
        int error = 0;

        if (!sp)
                return -EINVAL;

        /*
         * Reference in sp is now incremented and will be dropped when
         * the driver is removed from the bus
         */
        pr_debug("bus: '%s': add driver %s\n", sp->bus->name, drv->name);

        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv) {
                error = -ENOMEM;
                goto out_put_bus;
        }
        klist_init(&priv->klist_devices, NULL, NULL);
        priv->driver = drv;
        drv->p = priv;
        priv->kobj.kset = sp->drivers_kset;
        error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL,
                                     "%s", drv->name);
        if (error)
                goto out_unregister;

        klist_add_tail(&priv->knode_bus, &sp->klist_drivers);
        if (sp->drivers_autoprobe) {
                error = driver_attach(drv);
                if (error)
                        goto out_del_list;
        }
        error = module_add_driver(drv->owner, drv);
        if (error) {
                printk(KERN_ERR "%s: failed to create module links for %s\n",
                        __func__, drv->name);
                goto out_detach;
        }

        /* From here on, failures are logged but do not fail the registration. */
        error = driver_create_file(drv, &driver_attr_uevent);
        if (error) {
                printk(KERN_ERR "%s: uevent attr (%s) failed\n",
                        __func__, drv->name);
        }
        error = driver_add_groups(drv, sp->bus->drv_groups);
        if (error) {
                /* How the hell do we get out of this pickle? Give up */
                printk(KERN_ERR "%s: driver_add_groups(%s) failed\n",
                        __func__, drv->name);
        }

        if (!drv->suppress_bind_attrs) {
                error = add_bind_files(drv);
                if (error) {
                        /* Ditto */
                        printk(KERN_ERR "%s: add_bind_files(%s) failed\n",
                                __func__, drv->name);
                }
        }

        return 0;

        /* Error unwinding: each label undoes the steps completed before it. */
out_detach:
        driver_detach(drv);
out_del_list:
        klist_del(&priv->knode_bus);
out_unregister:
        kobject_put(&priv->kobj);
        /* drv->p is freed in driver_release() */
        drv->p = NULL;
out_put_bus:
        subsys_put(sp);
        return error;
}

/**
 * bus_remove_driver - delete driver from bus's knowledge.
 * @drv: driver.
 *
 * Detach the driver from the devices it controls, and remove
 * it from its bus's list of drivers. Finally, we drop the reference
 * to the bus we took in bus_add_driver().
 *
 * Does nothing if no subsys_private is found for the driver's bus.
 */
void bus_remove_driver(struct device_driver *drv)
{
        struct subsys_private *sp = bus_to_subsys(drv->bus);

        if (!sp)
                return;

        pr_debug("bus: '%s': remove driver %s\n", sp->bus->name, drv->name);

        /* Remove the sysfs files that bus_add_driver() created. */
        if (!drv->suppress_bind_attrs)
                remove_bind_files(drv);
        driver_remove_groups(drv, sp->bus->drv_groups);
        driver_remove_file(drv, &driver_attr_uevent);
        klist_remove(&drv->p->knode_bus);
        driver_detach(drv);
        module_remove_driver(drv);
        kobject_put(&drv->p->kobj);

        /*
         * Decrement the reference count twice, once for the bus_to_subsys()
         * call in the start of this function, and the second one from the
         * reference increment in bus_add_driver()
         */
        subsys_put(sp);
        subsys_put(sp);
}

/*
 * Helper for bus_rescan_devices's iter.
 *
 * Tries device_attach() on @dev when it has no driver, taking the parent's
 * device lock around the call if the bus asks for it.  Returns the negative
 * error from device_attach(), or 0 in every other case.
 */
static int __must_check bus_rescan_devices_helper(struct device *dev,
                                                  void *data)
{
        int ret;

        /* Devices that already have a driver are left alone. */
        if (dev->driver)
                return 0;

        if (dev->parent && dev->bus->need_parent_lock)
                device_lock(dev->parent);
        ret = device_attach(dev);
        if (dev->parent && dev->bus->need_parent_lock)
                device_unlock(dev->parent);

        /* Non-negative results are all reported as 0. */
        if (ret > 0)
                ret = 0;
        return ret;
}

/**
 * bus_rescan_devices - rescan devices on the bus for possible drivers
 * @bus: the bus to scan.
 *
 * This function will look for devices on the bus with no driver
 * attached and rescan them against existing drivers to see if any match,
 * by calling device_attach() for the unbound devices.
 *
 * Return: the value returned by bus_for_each_dev() with
 * bus_rescan_devices_helper() as the callback.
 */
int bus_rescan_devices(const struct bus_type *bus)
{
        return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper);
}
EXPORT_SYMBOL_GPL(bus_rescan_devices);

/**
 * device_reprobe - remove driver for a device and probe for a new driver
 * @dev: the device to reprobe
 *
 * This function detaches the attached driver (if any) for the given
 * device and restarts the driver probing process.  It is intended
 * for use when the probing criteria changed during a device's lifetime
 * and driver attachment should change accordingly.
 *
 * Return: the value returned by bus_rescan_devices_helper(), i.e. 0 or a
 * negative error from device_attach().
 */
int device_reprobe(struct device *dev)
{
        if (dev->driver)
                device_driver_detach(dev);
        return bus_rescan_devices_helper(dev, NULL);
}
EXPORT_SYMBOL_GPL(device_reprobe);

/* klist "get" callback for the bus device list: take a device reference. */
static void klist_devices_get(struct klist_node *n)
{
        struct device_private *dev_prv = to_device_private_bus(n);

        get_device(dev_prv->device);
}

/* klist "put" callback for the bus device list: drop a device reference. */
static void klist_devices_put(struct klist_node *n)
{
        struct device_private *dev_prv = to_device_private_bus(n);

        put_device(dev_prv->device);
}

/*
 * Bus "uevent" attribute: pass @buf to kobject_synth_uevent() for the bus
 * kobject.  Returns @count on success, the error it reported, or -EINVAL
 * if no subsys_private is found for @bus.
 */
static ssize_t bus_uevent_store(const struct bus_type *bus,
                                const char *buf, size_t count)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        int rc;

        if (!sp)
                return -EINVAL;

        rc = kobject_synth_uevent(&sp->subsys.kobj, buf, count);
        subsys_put(sp);

        if (!rc)
                return count;
        return rc;
}
/*
 * "open code" the old BUS_ATTR() macro here.  We want to use BUS_ATTR_WO()
 * here, but can not use it as earlier in the file we have
 * DRIVER_ATTR_WO(uevent), which would cause a clash with the store
 * function name.
 */
static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL,
                                                     bus_uevent_store);

/**
 * bus_register - register a driver-core subsystem
 * @bus: bus to register
 *
 * Allocate and initialize the private subsystem data for @bus, register
 * the bus with the kobject infrastructure, then register the children
 * subsystems it has: the devices and drivers that belong to the subsystem.
 *
 * The lock key, mutex, interface list and klists are set up before
 * kset_register().  That way bus_release() never unregisters a lock key
 * that was not registered, and the locks and lists are already initialized
 * when the bus kset gets registered.
 *
 * Return: 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by the failing registration step.
 */
int bus_register(const struct bus_type *bus)
{
        int retval;
        struct subsys_private *priv;
        struct kobject *bus_kobj;
        struct lock_class_key *key;

        priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL);
        if (!priv)
                return -ENOMEM;

        priv->bus = bus;

        BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier);
        INIT_LIST_HEAD(&priv->interfaces);
        key = &priv->lock_key;
        lockdep_register_key(key);
        __mutex_init(&priv->mutex, "subsys mutex", key);
        klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put);
        klist_init(&priv->klist_drivers, NULL, NULL);

        bus_kobj = &priv->subsys.kobj;
        retval = kobject_set_name(bus_kobj, "%s", bus->name);
        if (retval)
                goto out;

        bus_kobj->kset = bus_kset;
        bus_kobj->ktype = &bus_ktype;
        priv->drivers_autoprobe = 1;

        retval = kset_register(&priv->subsys);
        if (retval)
                goto out;

        retval = bus_create_file(bus, &bus_attr_uevent);
        if (retval)
                goto bus_uevent_fail;

        priv->devices_kset = kset_create_and_add("devices", NULL, bus_kobj);
        if (!priv->devices_kset) {
                retval = -ENOMEM;
                goto bus_devices_fail;
        }

        priv->drivers_kset = kset_create_and_add("drivers", NULL, bus_kobj);
        if (!priv->drivers_kset) {
                retval = -ENOMEM;
                goto bus_drivers_fail;
        }

        retval = add_probe_files(bus);
        if (retval)
                goto bus_probe_files_fail;

        retval = sysfs_create_groups(bus_kobj, bus->bus_groups);
        if (retval)
                goto bus_groups_fail;

        pr_debug("bus: '%s': registered\n", bus->name);
        return 0;

bus_groups_fail:
        remove_probe_files(bus);
bus_probe_files_fail:
        kset_unregister(priv->drivers_kset);
bus_drivers_fail:
        kset_unregister(priv->devices_kset);
bus_devices_fail:
        bus_remove_file(bus, &bus_attr_uevent);
bus_uevent_fail:
        kset_unregister(&priv->subsys);
        /*
         * Above kset_unregister() releases @priv through bus_release(),
         * which unregisters the lock key and frees @priv, so there is
         * nothing left to clean up here.
         */
        return retval;
out:
        /* The kset was never registered: undo the setup by hand. */
        lockdep_unregister_key(key);
        kfree(priv);
        return retval;
}
EXPORT_SYMBOL_GPL(bus_register);

/**
 * bus_unregister - remove a bus from the system
 * @bus: bus.
 *
 * Unregister the bus's root device (if any), remove its sysfs files,
 * then unregister the child subsystems and the bus itself.
 * Finally, drop the reference taken by bus_to_subsys() at the start of
 * this function.
 *
 * Does nothing if no subsys_private is found for @bus.
 */
void bus_unregister(const struct bus_type *bus)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        struct kobject *bus_kobj;

        if (!sp)
                return;

        pr_debug("bus: '%s': unregistering\n", bus->name);
        if (sp->dev_root)
                device_unregister(sp->dev_root);

        /* Remove the sysfs files in the reverse order of bus_register(). */
        bus_kobj = &sp->subsys.kobj;
        sysfs_remove_groups(bus_kobj, bus->bus_groups);
        remove_probe_files(bus);
        bus_remove_file(bus, &bus_attr_uevent);

        kset_unregister(sp->drivers_kset);
        kset_unregister(sp->devices_kset);
        kset_unregister(&sp->subsys);
        subsys_put(sp);
}
EXPORT_SYMBOL_GPL(bus_unregister);

/*
 * Add @nb to the blocking notifier chain of @bus.
 * Returns the result of blocking_notifier_chain_register(), or -EINVAL if
 * no subsys_private is found for @bus.
 */
int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        int retval = -EINVAL;

        if (sp) {
                retval = blocking_notifier_chain_register(&sp->bus_notifier, nb);
                subsys_put(sp);
        }
        return retval;
}
EXPORT_SYMBOL_GPL(bus_register_notifier);

/*
 * Remove @nb from the blocking notifier chain of @bus.
 * Returns the result of blocking_notifier_chain_unregister(), or -EINVAL
 * if no subsys_private is found for @bus.
 */
int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        int retval = -EINVAL;

        if (sp) {
                retval = blocking_notifier_chain_unregister(&sp->bus_notifier, nb);
                subsys_put(sp);
        }
        return retval;
}
EXPORT_SYMBOL_GPL(bus_unregister_notifier);

/*
 * Run the notifier chain of @dev's bus with event @value and @dev as the
 * payload.  Does nothing if no subsys_private is found for the bus.
 */
void bus_notify(struct device *dev, enum bus_notifier_event value)
{
        struct subsys_private *sp = bus_to_subsys(dev->bus);

        if (sp) {
                blocking_notifier_call_chain(&sp->bus_notifier, value, dev);
                subsys_put(sp);
        }
}

/*
 * Return the kset embedded in the subsys_private of @bus, or NULL if none
 * is found.  The subsys reference taken for the lookup is dropped before
 * returning.
 */
struct kset *bus_get_kset(const struct bus_type *bus)
{
        struct subsys_private *sp = bus_to_subsys(bus);
        struct kset *kset = NULL;

        if (sp) {
                kset = &sp->subsys;
                subsys_put(sp);
        }

        return kset;
}
EXPORT_SYMBOL_GPL(bus_get_kset);

/*
 * Yes, this forcibly breaks the klist abstraction temporarily.  It
 * just wants to sort the klist, not change reference counts and
 * take/drop locks rapidly in the process.  It does all this while
 * holding the lock for the list, so objects can't otherwise be
 * added/removed while we're swizzling.
 *
 * Moves @a's bus klist node into @list: in front of the first entry b for
 * which compare(a, b) <= 0, or to the tail if there is no such entry.
 */
static void device_insertion_sort_klist(struct device *a, struct list_head *list,
                                        int (*compare)(const struct device *a,
                                                        const struct device *b))
{
        struct klist_node *n;
        struct device_private *dev_prv;
        struct device *b;

        list_for_each_entry(n, list, n_node) {
                dev_prv = to_device_private_bus(n);
                b = dev_prv->device;
                if (compare(a, b) <= 0) {
                        /* list_move_tail() onto b's node places a right before b */
                        list_move_tail(&a->p->knode_bus.n_node,
                                       &b->p->knode_bus.n_node);
                        return;
                }
        }
        list_move_tail(&a->p->knode_bus.n_node, list);
}

/*
 * Re-order the device list of @bus according to @compare.
 *
 * With the klist's k_lock held, every node is moved into a temporary list
 * via device_insertion_sort_klist(), and the sorted list is then spliced
 * back into the klist.  Does nothing if no subsys_private is found for @bus.
 */
void bus_sort_breadthfirst(const struct bus_type *bus,
                           int (*compare)(const struct device *a,
                                          const struct device *b))
{
        struct subsys_private *sp = bus_to_subsys(bus);
        LIST_HEAD(sorted_devices);
        struct klist_node *n, *tmp;
        struct device_private *dev_prv;
        struct device *dev;
        struct klist *device_klist;

        if (!sp)
                return;
        device_klist = &sp->klist_devices;

        spin_lock(&device_klist->k_lock);
        /* _safe variant: each node is moved off k_list while iterating */
        list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) {
                dev_prv = to_device_private_bus(n);
                dev = dev_prv->device;
                device_insertion_sort_klist(dev, &sorted_devices, compare);
        }
        list_splice(&sorted_devices, &device_klist->k_list);
        spin_unlock(&device_klist->k_lock);
        subsys_put(sp);
}
EXPORT_SYMBOL_GPL(bus_sort_breadthfirst);

/* Iterator state for walking a subsystem's devices, optionally filtered by type. */
struct subsys_dev_iter {
        struct klist_iter                ki;        /* underlying klist iterator */
        const struct device_type        *type;        /* type filter, NULL matches all */
};

/**
 * subsys_dev_iter_init - initialize subsys device iterator
 * @iter: subsys iterator to initialize
 * @sp: the subsys private (i.e. bus) we wanna iterate over
 * @start: the device to start iterating from, if any
 * @type: device_type of the devices to iterate over, NULL for all
 *
 * Set up @iter so that it walks the devices of @sp.  When @start is given
 * the walk begins at that device; when it is NULL the walk begins at the
 * head of the list.
 */
static void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct subsys_private *sp,
                                 struct device *start, const struct device_type *type)
{
        klist_iter_init_node(&sp->klist_devices, &iter->ki,
                             start ? &start->p->knode_bus : NULL);
        iter->type = type;
}

/**
 * subsys_dev_iter_next - iterate to the next device
 * @iter: subsys iterator to proceed
 *
 * Advance @iter to the next device that passes the iterator's type filter
 * and return it.  Returns NULL once the iteration is complete.
 *
 * The returned device is referenced and won't be released until the
 * iterator proceeds to the next device or is exited.  The caller is
 * free to do whatever it wants to do with the device including
 * calling back into subsys code.
 */
static struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter)
{
        struct klist_node *knode;

        while ((knode = klist_next(&iter->ki)) != NULL) {
                struct device *dev = to_device_private_bus(knode)->device;

                /* A NULL filter accepts every device. */
                if (!iter->type || iter->type == dev->type)
                        return dev;
        }
        return NULL;
}

/**
 * subsys_dev_iter_exit - finish iteration
 * @iter: subsys iterator to finish
 *
 * Finish an iteration.  Always call this function after iteration is
 * complete, whether the iteration ran to the end or not.
 */
static void subsys_dev_iter_exit(struct subsys_dev_iter *iter)
{
        klist_iter_exit(&iter->ki);
}

/*
 * Register @sif on its subsystem: add it to the interface list and call
 * its add_dev() hook, if any, for every device already on the bus.
 *
 * Returns 0 on success, -ENODEV if @sif or its subsys is NULL, or -EINVAL
 * if no subsys_private is found for the subsystem.
 */
int subsys_interface_register(struct subsys_interface *sif)
{
        struct subsys_private *sp;
        struct subsys_dev_iter iter;
        struct device *dev;

        if (!(sif && sif->subsys))
                return -ENODEV;

        sp = bus_to_subsys(sif->subsys);
        if (!sp)
                return -EINVAL;

        /*
         * The reference on sp taken above is kept on purpose; it is dropped
         * when the interface is removed from the bus.
         */

        mutex_lock(&sp->mutex);
        list_add_tail(&sif->node, &sp->interfaces);
        if (sif->add_dev) {
                subsys_dev_iter_init(&iter, sp, NULL, NULL);
                for (dev = subsys_dev_iter_next(&iter); dev;
                     dev = subsys_dev_iter_next(&iter))
                        sif->add_dev(dev, sif);
                subsys_dev_iter_exit(&iter);
        }
        mutex_unlock(&sp->mutex);

        return 0;
}
EXPORT_SYMBOL_GPL(subsys_interface_register);

/*
 * Unregister @sif from its subsystem: take it off the interface list and
 * call its remove_dev() hook, if any, for every device on the bus.
 * Does nothing if @sif or its subsys is NULL, or if no subsys_private is
 * found for the subsystem.
 */
void subsys_interface_unregister(struct subsys_interface *sif)
{
        struct subsys_private *sp;
        struct subsys_dev_iter iter;
        struct device *dev;

        if (!(sif && sif->subsys))
                return;

        sp = bus_to_subsys(sif->subsys);
        if (!sp)
                return;

        mutex_lock(&sp->mutex);
        list_del_init(&sif->node);
        if (sif->remove_dev) {
                subsys_dev_iter_init(&iter, sp, NULL, NULL);
                for (dev = subsys_dev_iter_next(&iter); dev;
                     dev = subsys_dev_iter_next(&iter))
                        sif->remove_dev(dev, sif);
                subsys_dev_iter_exit(&iter);
        }
        mutex_unlock(&sp->mutex);

        /*
         * Two references are dropped: the one taken by bus_to_subsys() at
         * the top of this function, and the one that
         * subsys_interface_register() left held.
         */
        subsys_put(sp);
        subsys_put(sp);
}
EXPORT_SYMBOL_GPL(subsys_interface_unregister);

/* Release callback for the root device allocated in subsys_register(). */
static void system_root_device_release(struct device *dev)
{
        kfree(dev);
}

/*
 * Register @subsys as a bus and create its root device.
 *
 * The root device is named after the subsystem, parented at
 * @parent_of_root, carries @groups, and is stored in the subsystem's
 * dev_root on success.
 *
 * Returns 0 on success or a negative error; on failure the bus is
 * unregistered again.
 */
static int subsys_register(const struct bus_type *subsys,
                           const struct attribute_group **groups,
                           struct kobject *parent_of_root)
{
        struct subsys_private *sp;
        struct device *dev;
        int err;

        err = bus_register(subsys);
        if (err < 0)
                return err;

        sp = bus_to_subsys(subsys);
        if (!sp) {
                err = -EINVAL;
                goto err_sp;
        }

        dev = kzalloc(sizeof(struct device), GFP_KERNEL);
        if (!dev) {
                err = -ENOMEM;
                goto err_dev;
        }

        err = dev_set_name(dev, "%s", subsys->name);
        if (err < 0)
                goto err_name;

        dev->kobj.parent = parent_of_root;
        dev->groups = groups;
        dev->release = system_root_device_release;

        err = device_register(dev);
        if (err < 0)
                goto err_dev_reg;

        sp->dev_root = dev;
        subsys_put(sp);
        return 0;

err_dev_reg:
        put_device(dev);
        /* Clear dev so the kfree() below becomes a no-op on this path. */
        dev = NULL;
err_name:
        kfree(dev);
err_dev:
        subsys_put(sp);
err_sp:
        bus_unregister(subsys);
        return err;
}

/**
 * subsys_system_register - register a subsystem at /sys/devices/system/
 * @subsys: system subsystem
 * @groups: default attributes for the root device
 *
 * All 'system' subsystems have a /sys/devices/system/<name> root device
 * with the name of the subsystem. The root device can carry subsystem-
 * wide attributes. All registered devices are below this single root
 * device and are named after the subsystem with a simple enumeration
 * number appended. The registered devices are not explicitly named;
 * only 'id' in the device needs to be set.
 *
 * Do not use this interface for anything new, it exists for compatibility
 * with bad ideas only. New subsystems should use plain subsystems, and
 * the subsystem-wide attributes should be added to the subsystem
 * directory itself and not to some fake root device created in
 * /sys/devices/system/<name>.
 *
 * Return: the value returned by subsys_register().
 */
int subsys_system_register(const struct bus_type *subsys,
                           const struct attribute_group **groups)
{
        return subsys_register(subsys, groups, &system_kset->kobj);
}
EXPORT_SYMBOL_GPL(subsys_system_register);

/**
 * subsys_virtual_register - register a subsystem at /sys/devices/virtual/
 * @subsys: virtual subsystem
 * @groups: default attributes for the root device
 *
 * All 'virtual' subsystems have a /sys/devices/virtual/<name> root device
 * with the name of the subsystem.  The root device can carry subsystem-wide
 * attributes.  All registered devices are below this single root device.
 * There's no restriction on device naming.  This is for kernel software
 * constructs which need a sysfs interface.
 *
 * Return: -ENOMEM if the virtual device parent is not available, otherwise
 * the value returned by subsys_register().
 */
int subsys_virtual_register(const struct bus_type *subsys,
                            const struct attribute_group **groups)
{
        struct kobject *virtual_dir;

        virtual_dir = virtual_device_parent();
        if (!virtual_dir)
                return -ENOMEM;

        return subsys_register(subsys, groups, virtual_dir);
}
EXPORT_SYMBOL_GPL(subsys_virtual_register);

/**
 * driver_find - locate driver on a bus by its name.
 * @name: name of the driver.
 * @bus: bus to scan for the driver.
 *
 * Call kset_find_obj() to iterate over list of drivers on
 * a bus to find driver by name. Return driver if found.
 *
 * This routine provides no locking to prevent the driver it returns
 * from being unregistered or unloaded while the caller is using it.
 * The caller is responsible for preventing this.
 */
struct device_driver *driver_find(const char *name, const struct bus_type *bus)
{
        struct subsys_private *subsys = bus_to_subsys(bus);
        struct driver_private *drv_priv;
        struct kobject *drv_kobj;

        if (subsys == NULL)
                return NULL;

        /* The subsystem reference is only needed for the lookup itself. */
        drv_kobj = kset_find_obj(subsys->drivers_kset, name);
        subsys_put(subsys);
        if (drv_kobj == NULL)
                return NULL;

        drv_priv = to_driver(drv_kobj);

        /*
         * kset_find_obj() returned the kobject with an extra reference;
         * release it before handing the driver back to the caller.
         */
        kobject_put(drv_kobj);
        return drv_priv->driver;
}
EXPORT_SYMBOL_GPL(driver_find);

/*
 * Warning, the value could go to "removed" instantly after calling this function, so be very
 * careful when calling it...
 */
bool bus_is_registered(const struct bus_type *bus)
{
        struct subsys_private *subsys = bus_to_subsys(bus);

        if (!subsys)
                return false;

        /* Only the lookup result matters; give the reference straight back. */
        subsys_put(subsys);
        return true;
}

/**
 * bus_get_dev_root - return a pointer to the "device root" of a bus
 * @bus: bus to return the device root of.
 *
 * If a bus has a "device root" structure, return it, WITH THE REFERENCE
 * COUNT INCREMENTED.
 *
 * Note, when finished with the device, a call to put_device() is required.
 *
 * If the device root is not present (or bus is not a valid pointer), NULL
 * will be returned.
 */
struct device *bus_get_dev_root(const struct bus_type *bus)
{
        struct subsys_private *subsys = bus_to_subsys(bus);
        struct device *root = NULL;

        if (subsys) {
                /* Take the device reference before releasing the subsystem. */
                root = get_device(subsys->dev_root);
                subsys_put(subsys);
        }

        return root;
}
EXPORT_SYMBOL_GPL(bus_get_dev_root);

/**
 * buses_init - create the top-level "bus" and "system" ksets
 *
 * Creates the "bus" kset (using bus_uevent_ops) and then the "system" kset
 * parented to devices_kset.  If the second creation fails, the "bus" kset
 * is unregistered again and bus_kset is reset to NULL.
 *
 * Return: 0 on success, -ENOMEM if either kset cannot be created.
 */
int __init buses_init(void)
{
        bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL);
        if (!bus_kset)
                return -ENOMEM;

        system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj);
        if (!system_kset) {
                /* Do error handling here as devices_init() does */
                kset_unregister(bus_kset);
                bus_kset = NULL;
                /* It is the 'system' kset that failed, so name it correctly. */
                pr_err("%s: failed to create and add kset 'system'\n", __func__);
                return -ENOMEM;
        }

        return 0;
}




































































































































































































































































































































































    1 





    6 

    6 









  163 





















    1 





  163 









  163 


  163 










































    1 





    1 

    1 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/char_dev.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/kdev_t.h>
#include <linux/slab.h>
#include <linux/string.h>

#include <linux/major.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/seq_file.h>

#include <linux/kobject.h>
#include <linux/kobj_map.h>
#include <linux/cdev.h>
#include <linux/mutex.h>
#include <linux/backing-dev.h>
#include <linux/tty.h>

#include "internal.h"

/* Maps device-number ranges to cdev kobjects; created once in chrdev_init(). */
static struct kobj_map *cdev_map __ro_after_init;

/* Serialises access to chrdevs[]; also handed to kobj_map_init() for cdev_map. */
static DEFINE_MUTEX(chrdevs_lock);

/* Number of buckets in chrdevs[]; majors are folded into it by modulo. */
#define CHRDEV_MAJOR_HASH_SIZE 255

/*
 * One registered range of character device numbers.  Entries live in the
 * chrdevs[] table, chained per bucket; the bucket is major_to_index(major).
 */
static struct char_device_struct {
        struct char_device_struct *next;        /* next entry in the same bucket */
        unsigned int major;                     /* major number of the range */
        unsigned int baseminor;                 /* first minor of the range */
        int minorct;                            /* number of minors in the range */
        char name[64];                          /* owner name, shown by chrdev_show() */
        struct cdev *cdev;                /* will die */
} *chrdevs[CHRDEV_MAJOR_HASH_SIZE];

/* Fold a major number onto its bucket index in chrdevs[]. */
static inline int major_to_index(unsigned major)
{
        unsigned int bucket = major % CHRDEV_MAJOR_HASH_SIZE;

        return bucket;
}

#ifdef CONFIG_PROC_FS

/*
 * Print one "<major> <name>" line to @f for every registered region whose
 * major equals @offset.
 */
void chrdev_show(struct seq_file *f, off_t offset)
{
        struct char_device_struct *entry;

        mutex_lock(&chrdevs_lock);
        entry = chrdevs[major_to_index(offset)];
        while (entry) {
                /* Several majors share a bucket, so filter on the exact one. */
                if (entry->major == offset)
                        seq_printf(f, "%3d %s\n", entry->major, entry->name);
                entry = entry->next;
        }
        mutex_unlock(&chrdevs_lock);
}

#endif /* CONFIG_PROC_FS */

/*
 * Pick an unused major for dynamic allocation.
 *
 * Returns the chosen major, or -EBUSY when both dynamic ranges are full.
 */
static int find_dynamic_major(void)
{
        struct char_device_struct *cd;
        int major;

        /* First choice: a completely empty bucket at the top of the table. */
        major = ARRAY_SIZE(chrdevs) - 1;
        while (major >= CHRDEV_MAJOR_DYN_END) {
                if (!chrdevs[major])
                        return major;
                major--;
        }

        /* Otherwise scan the extended range, checking each bucket chain. */
        for (major = CHRDEV_MAJOR_DYN_EXT_START;
             major >= CHRDEV_MAJOR_DYN_EXT_END; major--) {
                cd = chrdevs[major_to_index(major)];
                while (cd && cd->major != major)
                        cd = cd->next;

                /* Ran off the end of the chain: nobody owns this major. */
                if (!cd)
                        return major;
        }

        return -EBUSY;
}

/*
 * Register a single major with a specified minor range.
 *
 * If major == 0 this function will dynamically allocate an unused major.
 * If major > 0 this function will attempt to reserve the range of minors
 * with given major.
 *
 * Returns the newly inserted chrdevs[] entry on success.  On failure an
 * ERR_PTR() is returned: -EINVAL for an out-of-range major or minor range,
 * -ENOMEM if the entry cannot be allocated, and -EBUSY if no dynamic major
 * is free or the requested minors overlap an existing entry.
 */
static struct char_device_struct *
__register_chrdev_region(unsigned int major, unsigned int baseminor,
                           int minorct, const char *name)
{
        struct char_device_struct *cd, *curr, *prev = NULL;
        int ret;
        int i;

        /* Validate the request before allocating or taking the lock. */
        if (major >= CHRDEV_MAJOR_MAX) {
                pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n",
                       name, major, CHRDEV_MAJOR_MAX-1);
                return ERR_PTR(-EINVAL);
        }

        /* The range baseminor..baseminor+minorct-1 must not exceed MINORMASK. */
        if (minorct > MINORMASK + 1 - baseminor) {
                pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n",
                        name, baseminor, baseminor + minorct - 1, 0, MINORMASK);
                return ERR_PTR(-EINVAL);
        }

        cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL);
        if (cd == NULL)
                return ERR_PTR(-ENOMEM);

        mutex_lock(&chrdevs_lock);

        if (major == 0) {
                /* Dynamic request: on failure ret carries -EBUSY to the caller. */
                ret = find_dynamic_major();
                if (ret < 0) {
                        pr_err("CHRDEV \"%s\" dynamic allocation region is full\n",
                               name);
                        goto out;
                }
                major = ret;
        }

        /*
         * Find the insertion point.  The walk relies on each bucket chain
         * being ordered by major and then by baseminor; the insertion below
         * keeps it that way.  Falling through to "goto out" means the
         * current entry has the same major and overlaps the requested minors.
         */
        ret = -EBUSY;
        i = major_to_index(major);
        for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) {
                if (curr->major < major)
                        continue;

                if (curr->major > major)
                        break;

                /* Same major: this entry ends at or before our first minor. */
                if (curr->baseminor + curr->minorct <= baseminor)
                        continue;

                /* Same major: this entry starts at or after our end. */
                if (curr->baseminor >= baseminor + minorct)
                        break;

                goto out;
        }

        cd->major = major;
        cd->baseminor = baseminor;
        cd->minorct = minorct;
        strscpy(cd->name, name, sizeof(cd->name));

        /* Link the new entry between prev and curr (or at the bucket head). */
        if (!prev) {
                cd->next = curr;
                chrdevs[i] = cd;
        } else {
                cd->next = prev->next;
                prev->next = cd;
        }

        mutex_unlock(&chrdevs_lock);
        return cd;
out:
        mutex_unlock(&chrdevs_lock);
        kfree(cd);
        return ERR_PTR(ret);
}

/*
 * Unlink the entry that exactly matches @major, @baseminor and @minorct.
 *
 * Returns the detached entry (ownership passes to the caller), or NULL if
 * no such entry is registered.
 */
static struct char_device_struct *
__unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct)
{
        struct char_device_struct **link;
        struct char_device_struct *found = NULL;
        int bucket = major_to_index(major);

        mutex_lock(&chrdevs_lock);
        link = &chrdevs[bucket];
        while (*link) {
                struct char_device_struct *entry = *link;

                if (entry->major == major &&
                    entry->baseminor == baseminor &&
                    entry->minorct == minorct) {
                        /* Exact match: splice it out of the chain. */
                        found = entry;
                        *link = entry->next;
                        break;
                }
                link = &entry->next;
        }
        mutex_unlock(&chrdevs_lock);

        return found;
}

/**
 * register_chrdev_region() - register a range of device numbers
 * @from: the first in the desired range of device numbers; must include
 *        the major number.
 * @count: the number of consecutive device numbers required
 * @name: the name of the device or driver.
 *
 * Return value is zero on success, a negative error code on failure.
 */
int register_chrdev_region(dev_t from, unsigned count, const char *name)
{
        struct char_device_struct *cd;
        dev_t end = from + count;
        dev_t cur = from;
        dev_t chunk_end, failed_at;

        /* Register the range one major at a time. */
        while (cur < end) {
                chunk_end = MKDEV(MAJOR(cur) + 1, 0);
                if (chunk_end > end)
                        chunk_end = end;

                cd = __register_chrdev_region(MAJOR(cur), MINOR(cur),
                                              chunk_end - cur, name);
                if (IS_ERR(cd))
                        goto fail;

                cur = chunk_end;
        }
        return 0;

fail:
        /* Roll back every chunk registered before the one that failed. */
        failed_at = cur;
        for (cur = from; cur < failed_at; cur = chunk_end) {
                chunk_end = MKDEV(MAJOR(cur) + 1, 0);
                kfree(__unregister_chrdev_region(MAJOR(cur), MINOR(cur),
                                                 chunk_end - cur));
        }
        return PTR_ERR(cd);
}

/**
 * alloc_chrdev_region() - register a range of char device numbers
 * @dev: output parameter for first assigned number
 * @baseminor: first of the requested range of minor numbers
 * @count: the number of minor numbers required
 * @name: the name of the associated device or driver
 *
 * Allocates a range of char device numbers.  The major number will be
 * chosen dynamically, and returned (along with the first minor number)
 * in @dev.  Returns zero or a negative error code.
 */
int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count,
                        const char *name)
{
        struct char_device_struct *region;

        /* Major 0 asks __register_chrdev_region() to pick a free major. */
        region = __register_chrdev_region(0, baseminor, count, name);
        if (IS_ERR(region))
                return PTR_ERR(region);

        *dev = MKDEV(region->major, region->baseminor);
        return 0;
}

/**
 * __register_chrdev() - create and register a cdev occupying a range of minors
 * @major: major device number or 0 for dynamic allocation
 * @baseminor: first of the requested range of minor numbers
 * @count: the number of minor numbers required
 * @name: name of this range of devices
 * @fops: file operations associated with these devices
 *
 * If @major == 0 this function will dynamically allocate a major and return
 * its number.
 *
 * If @major > 0 this function will attempt to reserve a device with the given
 * major number and will return zero on success.
 *
 * Returns a -ve errno on failure.
 *
 * The name of this device has nothing to do with the name of the device in
 * /dev. It only helps to keep track of the different owners of devices. If
 * your module name has only one type of devices it's ok to use e.g. the name
 * of the module here.
 */
int __register_chrdev(unsigned int major, unsigned int baseminor,
                      unsigned int count, const char *name,
                      const struct file_operations *fops)
{
        struct char_device_struct *region;
        struct cdev *cdev;
        int err;

        /* Reserve the device numbers first. */
        region = __register_chrdev_region(major, baseminor, count, name);
        if (IS_ERR(region))
                return PTR_ERR(region);

        cdev = cdev_alloc();
        if (!cdev) {
                err = -ENOMEM;
                goto release_region;
        }

        cdev->owner = fops->owner;
        cdev->ops = fops;
        kobject_set_name(&cdev->kobj, "%s", name);

        /* Use region->major: it holds the allocated number when @major is 0. */
        err = cdev_add(cdev, MKDEV(region->major, baseminor), count);
        if (err)
                goto drop_cdev;

        region->cdev = cdev;

        /* Dynamic callers get the chosen major back; fixed callers get 0. */
        if (major)
                return 0;
        return region->major;

drop_cdev:
        kobject_put(&cdev->kobj);
release_region:
        kfree(__unregister_chrdev_region(region->major, baseminor, count));
        return err;
}

/**
 * unregister_chrdev_region() - unregister a range of device numbers
 * @from: the first in the range of numbers to unregister
 * @count: the number of device numbers to unregister
 *
 * This function will unregister a range of @count device numbers,
 * starting with @from.  The caller should normally be the one who
 * allocated those numbers in the first place...
 */
void unregister_chrdev_region(dev_t from, unsigned count)
{
        dev_t end = from + count;
        dev_t cur = from;

        while (cur < end) {
                /* Each chunk stops at the next major boundary or at @end. */
                dev_t chunk_end = MKDEV(MAJOR(cur) + 1, 0);

                if (chunk_end > end)
                        chunk_end = end;

                kfree(__unregister_chrdev_region(MAJOR(cur), MINOR(cur),
                                                 chunk_end - cur));
                cur = chunk_end;
        }
}

/**
 * __unregister_chrdev - unregister and destroy a cdev
 * @major: major device number
 * @baseminor: first of the range of minor numbers
 * @count: the number of minor numbers this cdev is occupying
 * @name: name of this range of devices
 *
 * Unregister and destroy the cdev occupying the region described by
 * @major, @baseminor and @count.  This function undoes what
 * __register_chrdev() did.
 */
void __unregister_chrdev(unsigned int major, unsigned int baseminor,
                         unsigned int count, const char *name)
{
        struct char_device_struct *region;

        region = __unregister_chrdev_region(major, baseminor, count);
        if (region) {
                struct cdev *cdev = region->cdev;

                /* Tear down the cdev attached by __register_chrdev(), if any. */
                if (cdev)
                        cdev_del(cdev);
        }

        /* region may be NULL when nothing matched; kfree() is still called. */
        kfree(region);
}

/*
 * Held while reading or changing inode->i_cdev and the cdev->list /
 * inode->i_devices linkage (see chrdev_open(), cd_forget(), cdev_purge()).
 */
static DEFINE_SPINLOCK(cdev_lock);

/*
 * Take a reference on both the owning module and the cdev's kobject.
 *
 * Returns the kobject on success, or NULL (with no references held) if
 * either reference could not be obtained.
 */
static struct kobject *cdev_get(struct cdev *p)
{
        struct module *owner = p->owner;
        struct kobject *kobj;

        if (!try_module_get(owner))
                return NULL;

        kobj = kobject_get_unless_zero(&p->kobj);
        if (kobj)
                return kobj;

        /* The kobject reference was refused: undo the module reference. */
        module_put(owner);
        return NULL;
}

/* Drop the kobject and module references taken by cdev_get(); NULL is a no-op. */
void cdev_put(struct cdev *p)
{
        struct module *owner;

        if (!p)
                return;

        /* Fetch the owner first: @p is not touched after kobject_put(). */
        owner = p->owner;
        kobject_put(&p->kobj);
        module_put(owner);
}

/*
 * Called every time a character special file is opened.
 *
 * Finds the cdev for @inode (looking it up in cdev_map and caching it in
 * inode->i_cdev on first use), takes a reference on it, installs the
 * cdev's file_operations on @filp and calls their ->open() if present.
 *
 * Returns 0 on success.  -ENXIO is returned when no cdev is found, when a
 * reference on it cannot be taken, or when fops_get() yields nothing;
 * otherwise a non-zero result of the driver's ->open() is passed through.
 */
static int chrdev_open(struct inode *inode, struct file *filp)
{
        const struct file_operations *fops;
        struct cdev *p;
        struct cdev *new = NULL;
        int ret = 0;

        spin_lock(&cdev_lock);
        p = inode->i_cdev;
        if (!p) {
                struct kobject *kobj;
                int idx;
                /* The lookup is done with cdev_lock dropped. */
                spin_unlock(&cdev_lock);
                kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
                if (!kobj)
                        return -ENXIO;
                new = container_of(kobj, struct cdev, kobj);
                spin_lock(&cdev_lock);
                /* Check i_cdev again in case somebody beat us to it while
                   we dropped the lock. */
                p = inode->i_cdev;
                if (!p) {
                        /*
                         * Cache the looked-up cdev on the inode and keep it
                         * for this open; clearing 'new' stops the
                         * cdev_put() below from releasing it.
                         */
                        inode->i_cdev = p = new;
                        list_add(&inode->i_devices, &p->list);
                        new = NULL;
                } else if (!cdev_get(p))
                        ret = -ENXIO;
        } else if (!cdev_get(p))
                ret = -ENXIO;
        spin_unlock(&cdev_lock);
        /* Releases the looked-up cdev if it was not installed; NULL is a no-op. */
        cdev_put(new);
        if (ret)
                return ret;

        ret = -ENXIO;
        fops = fops_get(p->ops);
        if (!fops)
                goto out_cdev_put;

        /* From here on @filp uses the cdev's operations. */
        replace_fops(filp, fops);
        if (filp->f_op->open) {
                ret = filp->f_op->open(inode, filp);
                if (ret)
                        goto out_cdev_put;
        }

        return 0;

 out_cdev_put:
        /* Undo the reference held on the cdev for this failed open. */
        cdev_put(p);
        return ret;
}

/*
 * Detach @inode from its cdev: under cdev_lock, unlink it from the cdev's
 * inode list, clear inode->i_cdev and point i_mapping back at the inode's
 * own i_data.
 */
void cd_forget(struct inode *inode)
{
        spin_lock(&cdev_lock);
        list_del_init(&inode->i_devices);
        inode->i_cdev = NULL;
        inode->i_mapping = &inode->i_data;
        spin_unlock(&cdev_lock);
}

static void cdev_purge(struct cdev *cdev)
{
        spin_lock(&cdev_lock);
        while (!list_empty(&cdev->list)) {
                struct inode *inode;
                inode = container_of(cdev->list.next, struct inode, i_devices);
                list_del_init(&inode->i_devices);
                inode->i_cdev = NULL;
        }
        spin_unlock(&cdev_lock);
}

/*
 * Dummy default file-operations: the only thing this does
 * is contain the open that then fills in the correct operations
 * depending on the special file...
 *
 * chrdev_open() replaces these operations on the file with the ones
 * registered for the matching cdev.
 */
const struct file_operations def_chr_fops = {
        .open = chrdev_open,
        .llseek = noop_llseek,
};

/* cdev_map callback: @data is the cdev given to kobj_map(); return its kobject. */
static struct kobject *exact_match(dev_t dev, int *part, void *data)
{
        struct cdev *cdev = data;
        struct kobject *kobj = &cdev->kobj;

        return kobj;
}

/* cdev_map callback: take a reference on the cdev; 0 on success, -1 on failure. */
static int exact_lock(dev_t dev, void *data)
{
        struct cdev *cdev = data;

        if (!cdev_get(cdev))
                return -1;
        return 0;
}

/**
 * cdev_add() - add a char device to the system
 * @p: the cdev structure for the device
 * @dev: the first device number for which this device is responsible
 * @count: the number of consecutive minor numbers corresponding to this
 *         device
 *
 * cdev_add() adds the device represented by @p to the system, making it
 * live immediately.  A negative error code is returned on failure.
 */
int cdev_add(struct cdev *p, dev_t dev, unsigned count)
{
        int error;

        p->dev = dev;
        p->count = count;

        /* WHITEOUT_DEV must never be claimed by a driver. */
        if (WARN_ON(dev == WHITEOUT_DEV))
                error = -EBUSY;
        else
                error = kobj_map(cdev_map, dev, count, NULL,
                                 exact_match, exact_lock, p);

        if (!error) {
                /* Hold the parent for as long as the cdev is registered. */
                kobject_get(p->kobj.parent);
                return 0;
        }

        /* On failure the kobject name is released and cleared here. */
        kfree_const(p->kobj.name);
        p->kobj.name = NULL;
        return error;
}

/**
 * cdev_set_parent() - set the parent kobject for a char device
 * @p: the cdev structure
 * @kobj: the kobject to take a reference to
 *
 * cdev_set_parent() sets a parent kobject which will be referenced
 * appropriately so the parent is not freed before the cdev. This
 * should be called before cdev_add.
 */
void cdev_set_parent(struct cdev *p, struct kobject *kobj)
{
        /* Warn if the intended parent kobject has not been initialised. */
        WARN_ON(!kobj->state_initialized);
        /* Only the pointer is stored here; cdev_add() takes the reference. */
        p->kobj.parent = kobj;
}

/**
 * cdev_device_add() - add a char device and its corresponding
 *        struct device, linking them
 * @dev: the device structure
 * @cdev: the cdev structure
 *
 * cdev_device_add() adds the char device represented by @cdev to the system,
 * just as cdev_add does. It then adds @dev to the system using device_add
 * The dev_t for the char device will be taken from the struct device which
 * needs to be initialized first. This helper function correctly takes a
 * reference to the parent device so the parent will not get released until
 * all references to the cdev are released.
 *
 * This helper uses dev->devt for the device number. If it is not set
 * it will not add the cdev and it will be equivalent to device_add.
 *
 * This function should be used whenever the struct cdev and the
 * struct device are members of the same structure whose lifetime is
 * managed by the struct device.
 *
 * NOTE: Callers must assume that userspace was able to open the cdev and
 * can call cdev fops callbacks at any time, even if this function fails.
 */
int cdev_device_add(struct cdev *cdev, struct device *dev)
{
        int rc;

        /* Without a device number there is no cdev to register. */
        if (dev->devt) {
                cdev_set_parent(cdev, &dev->kobj);

                rc = cdev_add(cdev, dev->devt, 1);
                if (rc)
                        return rc;
        }

        rc = device_add(dev);
        if (!rc)
                return 0;

        /* device_add() failed: take the cdev back out if one was added. */
        if (dev->devt)
                cdev_del(cdev);

        return rc;
}

/**
 * cdev_device_del() - inverse of cdev_device_add
 * @cdev: the cdev structure
 * @dev: the device structure
 *
 * cdev_device_del() is a helper function to call cdev_del and device_del.
 * It should be used whenever cdev_device_add is used.
 *
 * If dev->devt is not set it will not remove the cdev and will be equivalent
 * to device_del.
 *
 * NOTE: This guarantees that associated sysfs callbacks are not running
 * or runnable, however any cdevs already open will remain and their fops
 * will still be callable even after this function returns.
 */
void cdev_device_del(struct cdev *cdev, struct device *dev)
{
        device_del(dev);

        /* No device number means cdev_device_add() never added the cdev. */
        if (!dev->devt)
                return;

        cdev_del(cdev);
}

/* Remove the @count device numbers starting at @dev from cdev_map. */
static void cdev_unmap(dev_t dev, unsigned count)
{
        kobj_unmap(cdev_map, dev, count);
}

/**
 * cdev_del() - remove a cdev from the system
 * @p: the cdev structure to be removed
 *
 * cdev_del() removes @p from the system, possibly freeing the structure
 * itself.
 *
 * NOTE: This guarantees that cdev device will no longer be able to be
 * opened, however any cdevs already open will remain and their fops will
 * still be callable even after cdev_del returns.
 */
void cdev_del(struct cdev *p)
{
        cdev_unmap(p->dev, p->count);
        kobject_put(&p->kobj);
}


/*
 * kobject release callback for cdevs set up with cdev_init(): detaches
 * every inode still linked to the cdev and drops a reference on the parent
 * kobject.  The cdev memory itself is not freed here.
 */
static void cdev_default_release(struct kobject *kobj)
{
        struct cdev *p = container_of(kobj, struct cdev, kobj);
        struct kobject *parent = kobj->parent;

        cdev_purge(p);
        kobject_put(parent);
}

/*
 * kobject release callback for cdevs obtained from cdev_alloc(): detaches
 * every inode still linked to the cdev, frees the cdev and drops a
 * reference on the parent kobject.
 */
static void cdev_dynamic_release(struct kobject *kobj)
{
        struct cdev *p = container_of(kobj, struct cdev, kobj);
        /* Read the parent now: @kobj lives inside @p, which is freed below. */
        struct kobject *parent = kobj->parent;

        cdev_purge(p);
        kfree(p);
        kobject_put(parent);
}

/* kobject type used by cdev_init(): release does not free the cdev. */
static struct kobj_type ktype_cdev_default = {
        .release        = cdev_default_release,
};

/* kobject type used by cdev_alloc(): release also kfree()s the cdev. */
static struct kobj_type ktype_cdev_dynamic = {
        .release        = cdev_dynamic_release,
};

/**
 * cdev_alloc() - allocate a cdev structure
 *
 * Allocates and returns a cdev structure, or NULL on failure.
 */
struct cdev *cdev_alloc(void)
{
        struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL);
        if (p) {
                INIT_LIST_HEAD(&p->list);
                kobject_init(&p->kobj, &ktype_cdev_dynamic);
        }
        return p;
}

/**
 * cdev_init() - initialize a cdev structure
 * @cdev: the structure to initialize
 * @fops: the file_operations for this device
 *
 * Initializes @cdev, remembering @fops, making it ready to add to the
 * system with cdev_add().
 */
void cdev_init(struct cdev *cdev, const struct file_operations *fops)
{
        /* Start from a fully zeroed structure; any previous contents are lost. */
        memset(cdev, 0, sizeof *cdev);
        INIT_LIST_HEAD(&cdev->list);
        /* Default ktype: releasing the kobject does not free @cdev itself. */
        kobject_init(&cdev->kobj, &ktype_cdev_default);
        cdev->ops = fops;
}

/*
 * Probe callback handed to kobj_map_init(): requests a module for @dev by
 * its char-major alias and always returns NULL.
 */
static struct kobject *base_probe(dev_t dev, int *part, void *data)
{
        int ret = request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev));

        /* Make old-style 2.4 aliases work */
        if (ret > 0)
                request_module("char-major-%d", MAJOR(dev));

        return NULL;
}

/* Create cdev_map, passing base_probe and chrdevs_lock to kobj_map_init(). */
void __init chrdev_init(void)
{
        cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
}


/*
 * Let modules do char dev stuff: export the device-number registration
 * helpers and the cdev lifecycle functions defined in this file.
 */
EXPORT_SYMBOL(register_chrdev_region);
EXPORT_SYMBOL(unregister_chrdev_region);
EXPORT_SYMBOL(alloc_chrdev_region);
EXPORT_SYMBOL(cdev_init);
EXPORT_SYMBOL(cdev_alloc);
EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(cdev_set_parent);
EXPORT_SYMBOL(cdev_device_add);
EXPORT_SYMBOL(cdev_device_del);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);

























































































  306 






















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
/*
 * include/linux/topology.h
 *
 * Written by: Matthew Dobson, IBM Corporation
 *
 * Copyright (C) 2002, IBM Corp.
 *
 * All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
 * NON INFRINGEMENT.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <colpatch@us.ibm.com>
 */
#ifndef _LINUX_TOPOLOGY_H
#define _LINUX_TOPOLOGY_H

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/bitops.h>
#include <linux/mmzone.h>
#include <linux/smp.h>
#include <linux/percpu.h>
#include <asm/topology.h>

/*
 * Default nr_cpus_node(): the weight of the node's cpumask.  Only used
 * when nr_cpus_node has not already been defined.
 */
#ifndef nr_cpus_node
#define nr_cpus_node(node) cpumask_weight(cpumask_of_node(node))
#endif

/*
 * Iterate over online nodes, running the loop body only for nodes where
 * nr_cpus_node() is non-zero.  The expansion ends in a bare "if", so the
 * statement that follows the macro becomes its body.
 */
#define for_each_node_with_cpus(node)                        \
        for_each_online_node(node)                        \
                if (nr_cpus_node(node))

int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */
#define LOCAL_DISTANCE                10
#define REMOTE_DISTANCE                20
/* NOTE(review): presumably the bit width used to store a distance - confirm. */
#define DISTANCE_BITS           8
/*
 * Default node_distance(), used only when none is already defined: a node
 * is LOCAL_DISTANCE from itself and REMOTE_DISTANCE from any other node.
 */
#ifndef node_distance
#define node_distance(from,to)        ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
#endif
#ifndef RECLAIM_DISTANCE
/*
 * If the distance between nodes in a system is larger than RECLAIM_DISTANCE
 * (in whatever arch specific measurement units returned by node_distance())
 * and node_reclaim_mode is enabled then the VM will only call node_reclaim()
 * on nodes within this distance.
 */
#define RECLAIM_DISTANCE 30
#endif

/*
 * The following tunable allows platforms to override the default node
 * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
 * sufficiently fast that the default value actually hurts
 * performance.
 *
 * AMD EPYC machines use this because even though the 2-hop distance
 * is 32 (3.2x slower than a local memory access) performance actually
 * *improves* if allowed to reclaim memory and load balance tasks
 * between NUMA nodes 2-hops apart.
 */
extern int __read_mostly node_reclaim_distance;

#ifndef PENALTY_FOR_NODE_WITH_CPUS
#define PENALTY_FOR_NODE_WITH_CPUS        (1)
#endif

#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DECLARE_PER_CPU(int, numa_node);

#ifndef numa_node_id
/* Node number of the current CPU, read from its numa_node per-CPU variable. */
static inline int numa_node_id(void)
{
        int nid = raw_cpu_read(numa_node);

        return nid;
}
#endif

#ifndef cpu_to_node
/* Node number recorded in @cpu's numa_node per-CPU variable. */
static inline int cpu_to_node(int cpu)
{
        int nid = per_cpu(numa_node, cpu);

        return nid;
}
#endif

#ifndef set_numa_node
/* Store @node in the current CPU's numa_node per-CPU variable. */
static inline void set_numa_node(int node)
{
        this_cpu_write(numa_node, node);
}
#endif

#ifndef set_cpu_numa_node
/* Store @node in the numa_node per-CPU variable belonging to @cpu. */
static inline void set_cpu_numa_node(int cpu, int node)
{
        per_cpu(numa_node, cpu) = node;
}
#endif

#else        /* !CONFIG_USE_PERCPU_NUMA_NODE_ID */

/* Node of the current CPU, resolved through cpu_to_node(). */
#ifndef numa_node_id
static inline int numa_node_id(void)
{
        int cpu = raw_smp_processor_id();

        return cpu_to_node(cpu);
}
#endif

#endif        /* [!]CONFIG_USE_PERCPU_NUMA_NODE_ID */

#ifdef CONFIG_HAVE_MEMORYLESS_NODES

/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem().
 */
DECLARE_PER_CPU(int, _numa_mem_);

#ifndef set_numa_mem
/* Store @node in the current CPU's _numa_mem_ per-CPU variable. */
static inline void set_numa_mem(int node)
{
        this_cpu_write(_numa_mem_, node);
}
#endif

#ifndef numa_mem_id
/* Nearest node with memory for the current CPU, as stored in _numa_mem_. */
static inline int numa_mem_id(void)
{
        int nid = raw_cpu_read(_numa_mem_);

        return nid;
}
#endif

#ifndef cpu_to_mem
/* Value of the _numa_mem_ per-CPU variable belonging to @cpu. */
static inline int cpu_to_mem(int cpu)
{
        int nid = per_cpu(_numa_mem_, cpu);

        return nid;
}
#endif

#ifndef set_cpu_numa_mem
/* Store @node in the _numa_mem_ per-CPU variable belonging to @cpu. */
static inline void set_cpu_numa_mem(int cpu, int node)
{
        per_cpu(_numa_mem_, cpu) = node;
}
#endif

#else        /* !CONFIG_HAVE_MEMORYLESS_NODES */

#ifndef numa_mem_id
/* Without CONFIG_HAVE_MEMORYLESS_NODES this is simply the current node. */
static inline int numa_mem_id(void)
{
        int nid = numa_node_id();

        return nid;
}
#endif

#ifndef cpu_to_mem
/* Without CONFIG_HAVE_MEMORYLESS_NODES, @cpu's memory node is its own node. */
static inline int cpu_to_mem(int cpu)
{
        int nid = cpu_to_node(cpu);

        return nid;
}
#endif

#endif        /* [!]CONFIG_HAVE_MEMORYLESS_NODES */

/*
 * Flag each optional topology level (die, cluster, book, drawer) for which
 * both the id accessor and the cpumask accessor are already defined at this
 * point.  These checks come before the generic fallback definitions further
 * down, so a fallback alone never sets one of the *_SYSFS flags.
 * NOTE(review): the flags are presumably consumed by the topology sysfs
 * code - confirm against the users.
 */
#if defined(topology_die_id) && defined(topology_die_cpumask)
#define TOPOLOGY_DIE_SYSFS
#endif
#if defined(topology_cluster_id) && defined(topology_cluster_cpumask)
#define TOPOLOGY_CLUSTER_SYSFS
#endif
#if defined(topology_book_id) && defined(topology_book_cpumask)
#define TOPOLOGY_BOOK_SYSFS
#endif
#if defined(topology_drawer_id) && defined(topology_drawer_cpumask)
#define TOPOLOGY_DRAWER_SYSFS
#endif

/*
 * Generic fallbacks for topology accessors that have not been defined yet.
 * Each id fallback still evaluates @cpu (cast to void) and then yields a
 * constant: -1 for the package, die, cluster, book and drawer ids, 0 for the
 * core id and 0ull for the PPIN.
 */
#ifndef topology_physical_package_id
#define topology_physical_package_id(cpu)        ((void)(cpu), -1)
#endif
#ifndef topology_die_id
#define topology_die_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_cluster_id
#define topology_cluster_id(cpu)                ((void)(cpu), -1)
#endif
#ifndef topology_core_id
#define topology_core_id(cpu)                        ((void)(cpu), 0)
#endif
#ifndef topology_book_id
#define topology_book_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_drawer_id
#define topology_drawer_id(cpu)                        ((void)(cpu), -1)
#endif
#ifndef topology_ppin
#define topology_ppin(cpu)                        ((void)(cpu), 0ull)
#endif
/* cpumask fallbacks: every level collapses to cpumask_of(@cpu). */
#ifndef topology_sibling_cpumask
#define topology_sibling_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_core_cpumask
#define topology_core_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_cluster_cpumask
#define topology_cluster_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_die_cpumask
#define topology_die_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_book_cpumask
#define topology_book_cpumask(cpu)                cpumask_of(cpu)
#endif
#ifndef topology_drawer_cpumask
#define topology_drawer_cpumask(cpu)                cpumask_of(cpu)
#endif

#if defined(CONFIG_SCHED_SMT) && !defined(cpu_smt_mask)
/* Default SMT mask of @cpu: its topology sibling cpumask. */
static inline const struct cpumask *cpu_smt_mask(int cpu)
{
        const struct cpumask *siblings = topology_sibling_cpumask(cpu);

        return siblings;
}
#endif

#ifndef topology_is_primary_thread

static inline bool topology_is_primary_thread(unsigned int cpu)
{
        /*
         * The primary thread of an SMT core stays enabled/active when SMT is
         * disabled.  Architectures with a special primary thread (e.g. x86)
         * must override this function; everywhere else the first thread of
         * the SMT core can serve as the primary one.
         *
         * Architectures that build their topology with
         * CONFIG_GENERIC_ARCH_TOPOLOGY always keep an offline CPU inside its
         * own sibling cpumask.  Architectures that do not use
         * CONFIG_GENERIC_ARCH_TOPOLOGY have to check whether this default
         * implementation suits them or needs to be overridden.
         */
        const struct cpumask *siblings = topology_sibling_cpumask(cpu);
        unsigned int first = cpumask_first(siblings);

        return first == cpu;
}
#define topology_is_primary_thread topology_is_primary_thread

#endif

/* Return the cpumask of the node that @cpu maps to. */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
        int nid = cpu_to_node(cpu);

        return cpumask_of_node(nid);
}

#ifdef CONFIG_NUMA
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
#else
/*
 * !CONFIG_NUMA variant: @node is ignored and the result is simply
 * cpumask_nth_and() of @cpus and cpu_online_mask for index @cpu.
 */
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
        int nth = cpumask_nth_and(cpu, cpus, cpu_online_mask);

        return nth;
}

/*
 * !CONFIG_NUMA variant: there are no hop masks, so every call returns the
 * error pointer for -EOPNOTSUPP regardless of @node and @hops.
 */
static inline const struct cpumask *
sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
        return ERR_PTR(-EOPNOTSUPP);
}
#endif        /* CONFIG_NUMA */

/**
 * for_each_node_numadist() - iterate over nodes in increasing distance
 *                              order, starting from a given node
 * @node: the iteration variable and the starting node.
 * @unvisited: a nodemask to keep track of the unvisited nodes.
 *
 * This macro iterates over NUMA node IDs in increasing distance from the
 * starting @node and yields MAX_NUMNODES when all the nodes have been
 * visited.
 *
 * Note that by the time the loop completes, the @unvisited nodemask will
 * be fully cleared, unless the loop exits early.
 *
 * The difference between for_each_node() and for_each_node_numadist() is
 * that the former allows to iterate over nodes in numerical order, whereas
 * the latter iterates over nodes in increasing order of distance.
 *
 * The complexity of this iterator is O(N^2), where N represents the
 * number of nodes, as each iteration involves scanning all nodes to
 * find the one with the shortest distance.
 *
 * Requires rcu_lock to be held.
 *
 * NOTE(review): the for-init clause is a declaration, so "(node) = ..."
 * declares a loop-scoped int named like @node whose initial value comes
 * from the caller's variable via __start.  As written, @node therefore has
 * to be a plain identifier, and the caller's own variable is not modified
 * by the loop - confirm that this is the intended contract.
 */
#define for_each_node_numadist(node, unvisited)                                        \
        for (int __start = (node),                                                \
             (node) = nearest_node_nodemask((__start), &(unvisited));                \
             (node) < MAX_NUMNODES;                                                \
             node_clear((node), (unvisited)),                                        \
             (node) = nearest_node_nodemask((__start), &(unvisited)))

/**
 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
 *                          from a given node.
 * @mask: the iteration variable.
 * @node: the NUMA node to start the search from.
 *
 * Requires rcu_lock to be held.
 *
 * Yields cpu_online_mask for @node == NUMA_NO_NODE.
 *
 * The hop count starts at 0 and grows by one per iteration; the loop ends at
 * the first hop for which the resulting mask is an error pointer or NULL.
 *
 * Both arguments are parenthesized in the expansion so that an expression
 * argument (e.g. "a | b") is compared against NUMA_NO_NODE as a whole.
 * @node is evaluated more than once per iteration, so it must not have side
 * effects.
 */
#define for_each_numa_hop_mask(mask, node)                                       \
        for (unsigned int __hops = 0;                                               \
             (mask) = ((node) != NUMA_NO_NODE || __hops) ?                       \
                     sched_numa_hop_mask((node), __hops) :                       \
                     cpu_online_mask,                                               \
             !IS_ERR_OR_NULL(mask);                                               \
             __hops++)

#endif /* _LINUX_TOPOLOGY_H */









































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TIMENS_H
#define _LINUX_TIMENS_H


#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/err.h>
#include <linux/time64.h>

struct user_namespace;
extern struct user_namespace init_user_ns;

struct vm_area_struct;

/*
 * Offsets of a time namespace.  The timens_add_*()/timens_sub_*() helpers in
 * this header add or subtract these values.
 */
struct timens_offsets {
        struct timespec64 monotonic;
        struct timespec64 boottime;
};

/*
 * A time namespace.  get_time_ns()/put_time_ns() count references through
 * ns.count, and the timens_*() helpers read the offsets member.
 * NOTE(review): the roles of user_ns, ucounts and vvar_page are not
 * established by this header - confirm against the implementation.
 */
struct time_namespace {
        struct user_namespace        *user_ns;
        struct ucounts                *ucounts;
        struct ns_common        ns;
        struct timens_offsets        offsets;
        struct page                *vvar_page;
        /* If set, prevents changing the offsets once any task has joined the namespace. */
        bool                        frozen_offsets;
} __randomize_layout;

extern struct time_namespace init_time_ns;

#ifdef CONFIG_TIME_NS
extern int vdso_join_timens(struct task_struct *task,
                            struct time_namespace *ns);
extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns);

/*
 * Take one more reference on @ns (increments ns->ns.count) and return @ns so
 * the call can be used inline.  @ns is dereferenced, so it must not be NULL.
 */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        refcount_inc(&ns->ns.count);
        return ns;
}

struct time_namespace *copy_time_ns(unsigned long flags,
                                    struct user_namespace *user_ns,
                                    struct time_namespace *old_ns);
void free_time_ns(struct time_namespace *ns);
void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk);
struct page *find_timens_vvar_page(struct vm_area_struct *vma);

/*
 * Drop one reference on @ns; the namespace is handed to free_time_ns() when
 * the count in ns->ns.count reaches zero.
 */
static inline void put_time_ns(struct time_namespace *ns)
{
        /* Other references remain: nothing to release yet. */
        if (!refcount_dec_and_test(&ns->ns.count))
                return;

        free_time_ns(ns);
}

void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m);

/*
 * One (clock id, offset value) pair, passed as an array to
 * proc_timens_set_offset().
 */
struct proc_timens_offset {
        int                        clockid;
        struct timespec64        val;
};

int proc_timens_set_offset(struct file *file, struct task_struct *p,
                           struct proc_timens_offset *offsets, int n);

static inline void timens_add_monotonic(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->monotonic);
}

static inline void timens_add_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_add(*ts, ns_offsets->boottime);
}

/*
 * Return @nsec plus the boottime offset of current's time namespace, with
 * the offset converted to nanoseconds.
 */
static inline u64 timens_add_boottime_ns(u64 nsec)
{
        struct time_namespace *tns = current->nsproxy->time_ns;

        return timespec64_to_ns(&tns->offsets.boottime) + nsec;
}

static inline void timens_sub_boottime(struct timespec64 *ts)
{
        struct timens_offsets *ns_offsets = &current->nsproxy->time_ns->offsets;

        *ts = timespec64_sub(*ts, ns_offsets->boottime);
}

ktime_t do_timens_ktime_to_host(clockid_t clockid, ktime_t tim,
                                struct timens_offsets *offsets);

/*
 * Convert @tim for clock @clockid through do_timens_ktime_to_host() using the
 * offsets of current's time namespace.  Tasks in init_time_ns (the expected
 * common case) get @tim back unchanged.
 */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        struct time_namespace *tns = current->nsproxy->time_ns;

        if (unlikely(tns != &init_time_ns))
                return do_timens_ktime_to_host(clockid, tim, &tns->offsets);

        return tim;
}

#else
/* CONFIG_TIME_NS disabled: nothing to join; always returns 0. */
static inline int vdso_join_timens(struct task_struct *task,
                                   struct time_namespace *ns)
{
        return 0;
}

/* CONFIG_TIME_NS disabled: no-op. */
static inline void timens_commit(struct task_struct *tsk,
                                 struct time_namespace *ns)
{
}

/*
 * CONFIG_TIME_NS disabled: no reference is taken and @ns is ignored.  Note
 * that this stub returns NULL, whereas the CONFIG_TIME_NS variant returns
 * @ns.
 */
static inline struct time_namespace *get_time_ns(struct time_namespace *ns)
{
        return NULL;
}

/* CONFIG_TIME_NS disabled: no reference counting, so this is a no-op. */
static inline void put_time_ns(struct time_namespace *ns)
{
}

/*
 * CONFIG_TIME_NS disabled: a request for a new time namespace
 * (CLONE_NEWTIME set in @flags) fails with -EINVAL; any other request gets
 * @old_ns back unchanged.  @user_ns is unused.
 */
static inline struct time_namespace *
copy_time_ns(unsigned long flags, struct user_namespace *user_ns,
             struct time_namespace *old_ns)
{
        return (flags & CLONE_NEWTIME) ? ERR_PTR(-EINVAL) : old_ns;
}

/* CONFIG_TIME_NS disabled: nothing to do at fork time. */
static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk)
{
}

/* CONFIG_TIME_NS disabled: there is no time-namespace vvar page; returns NULL. */
static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
        return NULL;
}

/* CONFIG_TIME_NS disabled: no offset to apply; *@ts is left untouched. */
static inline void timens_add_monotonic(struct timespec64 *ts) { }
/* CONFIG_TIME_NS disabled: no offset to apply; *@ts is left untouched. */
static inline void timens_add_boottime(struct timespec64 *ts) { }

/* CONFIG_TIME_NS disabled: no offset to apply; returns @nsec unchanged. */
static inline u64 timens_add_boottime_ns(u64 nsec)
{
        return nsec;
}

/* CONFIG_TIME_NS disabled: no offset to remove; *@ts is left untouched. */
static inline void timens_sub_boottime(struct timespec64 *ts) { }

/* CONFIG_TIME_NS disabled: no conversion needed; returns @tim unchanged. */
static inline ktime_t timens_ktime_to_host(clockid_t clockid, ktime_t tim)
{
        return tim;
}
#endif

#endif /* _LINUX_TIMENS_H */



































































  639 





































  611 


















   21 
































































































































  566 

















    1 


















  255 




































   34 









































































  755 


































 1481 

















  710 










































































































































  352 

















  165 






















































































































































































































































































































































   22 

















































































































































































































































  190 



















   35 































































  188 





















  156 












































  668 


















  261 

















  619 

















  240 


















  132 






















































   34 









































  204 


















    5 


















  151 



























































































































































































































































































































































































































































































  271 


































































































































































































































































































































































































































































































































































































































































































































































































































  159 






















  102 












































































































































































































































































































































































   26 





































  780 


















  969 


















































































































































  116 


















  961 

















































































































































  262 











































































































































  209 
















  145 

















 1396 


























































































































































































































































    1 













































































































































































































































































































































    5 



















































































   95 












































  142 





















   22 


















  186 






















































 1306 




























































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-instrumented.sh
// DO NOT MODIFY THIS FILE DIRECTLY

/*
 * This file provides atomic operations with explicit instrumentation (e.g.
 * KASAN, KCSAN), which should be used unless it is necessary to avoid
 * instrumentation. Where it is necessary to avoid instrumentation, the
 * raw_atomic*() operations should be used.
 */
#ifndef _LINUX_ATOMIC_INSTRUMENTED_H
#define _LINUX_ATOMIC_INSTRUMENTED_H

#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/instrumented.h>

/**
 * atomic_read() - atomic load with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int
atomic_read(const atomic_t *v)
{
        instrument_atomic_read(v, sizeof(*v));
        return raw_atomic_read(v);
}

/**
 * atomic_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline int
atomic_read_acquire(const atomic_t *v)
{
        instrument_atomic_read(v, sizeof(*v));
        return raw_atomic_read_acquire(v);
}

/**
 * atomic_set() - relaxed atomic store (instrumented)
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Calls instrument_atomic_write() on @v, then raw_atomic_set() to atomically
 * set @v to @i with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_set(atomic_t *v, int i)
{
        instrument_atomic_write(v, sizeof(atomic_t));
        raw_atomic_set(v, i);
}

/**
 * atomic_set_release() - release atomic store (instrumented)
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Calls kcsan_release(), then instrument_atomic_write() on @v, and finally
 * raw_atomic_set_release() to atomically set @v to @i with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_set_release(atomic_t *v, int i)
{
        kcsan_release();
        instrument_atomic_write(v, sizeof(atomic_t));
        raw_atomic_set_release(v, i);
}

/**
 * atomic_add() - relaxed atomic add (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then raw_atomic_add() to
 * atomically update @v to (@v + @i) with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_add(int i, atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(atomic_t));
        raw_atomic_add(i, v);
}

/**
 * atomic_add_return() - fully ordered atomic add-and-return (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_add_return() to atomically update @v to (@v + @i) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return(int i, atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return(i, v);

        return updated;
}

/**
 * atomic_add_return_acquire() - acquire atomic add-and-return (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_add_return_acquire() to atomically update @v to (@v + @i) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return_acquire(int i, atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic_add_return_release() - release atomic add-and-return (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_add_return_release() to atomically update @v to
 * (@v + @i) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return_release(int i, atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return_release(i, v);

        return updated;
}

/**
 * atomic_add_return_relaxed() - relaxed atomic add-and-return (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_add_return_relaxed() to atomically update @v to (@v + @i) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_fetch_add() - fully ordered atomic fetch-and-add (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_fetch_add() to atomically update @v to (@v + @i) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_add(i, v);

        return old;
}

/**
 * atomic_fetch_add_acquire() - acquire atomic fetch-and-add (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_add_acquire() to atomically update @v to (@v + @i) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_add_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_add_release() - release atomic fetch-and-add (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_fetch_add_release() to atomically update @v to
 * (@v + @i) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_add_release(i, v);

        return old;
}

/**
 * atomic_fetch_add_relaxed() - relaxed atomic fetch-and-add (instrumented)
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_add_relaxed() to atomically update @v to (@v + @i) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_add_relaxed(i, v);

        return old;
}

/**
 * atomic_sub() - relaxed atomic subtract (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then raw_atomic_sub() to
 * atomically update @v to (@v - @i) with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_sub(int i, atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(atomic_t));
        raw_atomic_sub(i, v);
}

/**
 * atomic_sub_return() - fully ordered atomic subtract-and-return (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_sub_return() to atomically update @v to (@v - @i) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return(int i, atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_sub_return(i, v);

        return updated;
}

/**
 * atomic_sub_return_acquire() - acquire atomic subtract-and-return (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_sub_return_acquire() to atomically update @v to (@v - @i) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic_sub_return_release() - release atomic subtract-and-return (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_sub_return_release() to atomically update @v to
 * (@v - @i) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return_release(int i, atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_sub_return_release(i, v);

        return updated;
}

/**
 * atomic_sub_return_relaxed() - relaxed atomic subtract-and-return (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_sub_return_relaxed() to atomically update @v to (@v - @i) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic_fetch_sub() - fully ordered atomic fetch-and-subtract (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_fetch_sub() to atomically update @v to (@v - @i) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_sub(i, v);

        return old;
}

/**
 * atomic_fetch_sub_acquire() - acquire atomic fetch-and-subtract (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_sub_acquire() to atomically update @v to (@v - @i) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_sub_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_sub_release() - release atomic fetch-and-subtract (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_fetch_sub_release() to atomically update @v to
 * (@v - @i) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_sub_release(i, v);

        return old;
}

/**
 * atomic_fetch_sub_relaxed() - relaxed atomic fetch-and-subtract (instrumented)
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_sub_relaxed() to atomically update @v to (@v - @i) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_sub_relaxed(i, v);

        return old;
}

/**
 * atomic_inc() - relaxed atomic increment (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then raw_atomic_inc() to
 * atomically update @v to (@v + 1) with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_inc(atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(atomic_t));
        raw_atomic_inc(v);
}

/**
 * atomic_inc_return() - fully ordered atomic increment-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_inc_return() to atomically update @v to (@v + 1) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return(atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_inc_return(v);

        return updated;
}

/**
 * atomic_inc_return_acquire() - acquire atomic increment-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_inc_return_acquire() to atomically update @v to (@v + 1) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return_acquire(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_inc_return_acquire(v);

        return updated;
}

/**
 * atomic_inc_return_release() - release atomic increment-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_inc_return_release() to atomically update @v to
 * (@v + 1) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return_release(atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_inc_return_release(v);

        return updated;
}

/**
 * atomic_inc_return_relaxed() - relaxed atomic increment-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_inc_return_relaxed() to atomically update @v to (@v + 1) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_inc_return_relaxed(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_inc_return_relaxed(v);

        return updated;
}

/**
 * atomic_fetch_inc() - fully ordered atomic fetch-and-increment (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_fetch_inc() to atomically update @v to (@v + 1) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc(atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_inc(v);

        return old;
}

/**
 * atomic_fetch_inc_acquire() - acquire atomic fetch-and-increment (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_inc_acquire() to atomically update @v to (@v + 1) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc_acquire(atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_inc_acquire(v);

        return old;
}

/**
 * atomic_fetch_inc_release() - release atomic fetch-and-increment (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_fetch_inc_release() to atomically update @v to
 * (@v + 1) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc_release(atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_inc_release(v);

        return old;
}

/**
 * atomic_fetch_inc_relaxed() - relaxed atomic fetch-and-increment (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_inc_relaxed() to atomically update @v to (@v + 1) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_inc_relaxed(v);

        return old;
}

/**
 * atomic_dec() - relaxed atomic decrement (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then raw_atomic_dec() to
 * atomically update @v to (@v - 1) with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_dec(atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(atomic_t));
        raw_atomic_dec(v);
}

/**
 * atomic_dec_return() - fully ordered atomic decrement-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_dec_return() to atomically update @v to (@v - 1) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return(atomic_t *v)
{
        int updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_dec_return(v);

        return updated;
}

/**
 * atomic_dec_return_acquire() - acquire atomic decrement-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_dec_return_acquire() to atomically update @v to (@v - 1) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return_acquire(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_dec_return_acquire(v);

        return updated;
}

/**
 * atomic_dec_return_release() - release atomic decrement-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_dec_return_release() to atomically update @v to
 * (@v - 1) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return_release(atomic_t *v)
{
        int updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_dec_return_release(v);

        return updated;
}

/**
 * atomic_dec_return_relaxed() - relaxed atomic decrement-and-return (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_dec_return_relaxed() to atomically update @v to (@v - 1) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline int atomic_dec_return_relaxed(atomic_t *v)
{
        int updated;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        updated = raw_atomic_dec_return_relaxed(v);

        return updated;
}

/**
 * atomic_fetch_dec() - fully ordered atomic fetch-and-decrement (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_fetch_dec() to atomically update @v to (@v - 1) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec(atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_dec(v);

        return old;
}

/**
 * atomic_fetch_dec_acquire() - acquire atomic fetch-and-decrement (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_dec_acquire() to atomically update @v to (@v - 1) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec_acquire(atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_dec_acquire(v);

        return old;
}

/**
 * atomic_fetch_dec_release() - release atomic fetch-and-decrement (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_fetch_dec_release() to atomically update @v to
 * (@v - 1) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec_release(atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_dec_release(v);

        return old;
}

/**
 * atomic_fetch_dec_relaxed() - relaxed atomic fetch-and-decrement (instrumented)
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_dec_relaxed() to atomically update @v to (@v - 1) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_dec_relaxed(v);

        return old;
}

/**
 * atomic_and() - relaxed atomic bitwise AND (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then raw_atomic_and() to
 * atomically update @v to (@v & @i) with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_and(int i, atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(atomic_t));
        raw_atomic_and(i, v);
}

/**
 * atomic_fetch_and() - fully ordered atomic fetch-and-AND (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_fetch_and() to atomically update @v to (@v & @i) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_and(i, v);

        return old;
}

/**
 * atomic_fetch_and_acquire() - acquire atomic fetch-and-AND (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_and_acquire() to atomically update @v to (@v & @i) with
 * acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_and_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_and_release() - release atomic fetch-and-AND (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() on @v, and
 * finally raw_atomic_fetch_and_release() to atomically update @v to
 * (@v & @i) with release ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_and_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_and_release(i, v);

        return old;
}

/**
 * atomic_fetch_and_relaxed() - relaxed atomic fetch-and-AND (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_and_relaxed() to atomically update @v to (@v & @i) with
 * relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_and_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_and_relaxed(i, v);

        return old;
}

/**
 * atomic_andnot() - relaxed atomic bitwise AND NOT (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then raw_atomic_andnot() to
 * atomically update @v to (@v & ~@i) with relaxed ordering.
 *
 * Not for use in noinstr code; use raw_atomic_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void atomic_andnot(int i, atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(atomic_t));
        raw_atomic_andnot(i, v);
}

/**
 * atomic_fetch_andnot() - fully ordered atomic fetch-and-andnot (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() on @v, and finally
 * raw_atomic_fetch_andnot() to atomically update @v to (@v & ~@i) with full
 * ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_andnot() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_andnot(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_andnot(i, v);

        return old;
}

/**
 * atomic_fetch_andnot_acquire() - acquire atomic fetch-and-andnot (instrumented)
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Calls instrument_atomic_read_write() on @v, then
 * raw_atomic_fetch_andnot_acquire() to atomically update @v to (@v & ~@i)
 * with acquire ordering.
 *
 * Not for use in noinstr code; use raw_atomic_fetch_andnot_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(atomic_t));
        old = raw_atomic_fetch_andnot_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_andnot_release() - instrumented atomic fetch-and-AND-NOT, release ordering
 * @i: int mask whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v & ~@i) with release ordering via
 * raw_atomic_fetch_andnot_release().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_andnot_release()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_andnot_release(i, v);

        return old;
}

/**
 * atomic_fetch_andnot_relaxed() - instrumented atomic fetch-and-AND-NOT, relaxed ordering
 * @i: int mask whose set bits are cleared in @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v & ~@i) with relaxed ordering via raw_atomic_fetch_andnot_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_andnot_relaxed()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_andnot_relaxed(i, v);

        return old;
}

/**
 * atomic_or() - instrumented atomic bitwise OR, relaxed ordering
 * @i: int mask to OR into @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v | @i) with relaxed ordering via raw_atomic_or().
 *
 * Must not be used in noinstr code; call raw_atomic_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_or(int i, atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_or(i, v);
}

/**
 * atomic_fetch_or() - instrumented atomic fetch-and-OR, full ordering
 * @i: int mask to OR into @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v | @i) with full ordering via raw_atomic_fetch_or().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_or() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_or(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_or(i, v);

        return old;
}

/**
 * atomic_fetch_or_acquire() - instrumented atomic fetch-and-OR, acquire ordering
 * @i: int mask to OR into @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v | @i) with acquire ordering via raw_atomic_fetch_or_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_or_acquire() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_or_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_or_release() - instrumented atomic fetch-and-OR, release ordering
 * @i: int mask to OR into @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v | @i) with release ordering via
 * raw_atomic_fetch_or_release().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_or_release() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_or_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_or_release(i, v);

        return old;
}

/**
 * atomic_fetch_or_relaxed() - instrumented atomic fetch-and-OR, relaxed ordering
 * @i: int mask to OR into @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v | @i) with relaxed ordering via raw_atomic_fetch_or_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_or_relaxed() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_or_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_or_relaxed(i, v);

        return old;
}

/**
 * atomic_xor() - instrumented atomic bitwise XOR, relaxed ordering
 * @i: int mask to XOR into @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v ^ @i) with relaxed ordering via raw_atomic_xor().
 *
 * Must not be used in noinstr code; call raw_atomic_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_xor(int i, atomic_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_xor(i, v);
}

/**
 * atomic_fetch_xor() - instrumented atomic fetch-and-XOR, full ordering
 * @i: int mask to XOR into @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v ^ @i) with full ordering via raw_atomic_fetch_xor().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_xor() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_xor(int i, atomic_t *v)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_xor(i, v);

        return old;
}

/**
 * atomic_fetch_xor_acquire() - instrumented atomic fetch-and-XOR, acquire ordering
 * @i: int mask to XOR into @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v ^ @i) with acquire ordering via raw_atomic_fetch_xor_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_xor_acquire() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_xor_acquire(i, v);

        return old;
}

/**
 * atomic_fetch_xor_release() - instrumented atomic fetch-and-XOR, release ordering
 * @i: int mask to XOR into @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v ^ @i) with release ordering via
 * raw_atomic_fetch_xor_release().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_xor_release() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_xor_release(int i, atomic_t *v)
{
        int old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_xor_release(i, v);

        return old;
}

/**
 * atomic_fetch_xor_relaxed() - instrumented atomic fetch-and-XOR, relaxed ordering
 * @i: int mask to XOR into @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v ^ @i) with relaxed ordering via raw_atomic_fetch_xor_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_xor_relaxed() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline int
atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
        int old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_xor_relaxed(i, v);

        return old;
}

/**
 * atomic_xchg() - instrumented atomic exchange, full ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to @new with full ordering via raw_atomic_xchg().
 *
 * Must not be used in noinstr code; call raw_atomic_xchg() there.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
atomic_xchg(atomic_t *v, int new)
{
        int prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg(v, new);

        return prev;
}

/**
 * atomic_xchg_acquire() - instrumented atomic exchange, acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * @new with acquire ordering via raw_atomic_xchg_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic_xchg_acquire() there.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
atomic_xchg_acquire(atomic_t *v, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_xchg_release() - instrumented atomic exchange, release ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to @new with release ordering via
 * raw_atomic_xchg_release().
 *
 * Must not be used in noinstr code; call raw_atomic_xchg_release() there.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
atomic_xchg_release(atomic_t *v, int new)
{
        int prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_release(v, new);

        return prev;
}

/**
 * atomic_xchg_relaxed() - instrumented atomic exchange, relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to store in @v
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * @new with relaxed ordering via raw_atomic_xchg_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic_xchg_relaxed() there.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline int
atomic_xchg_relaxed(atomic_t *v, int new)
{
        int prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_cmpxchg() - instrumented atomic compare-and-exchange, full ordering
 * @v: pointer to atomic_t
 * @old: int value expected in @v
 * @new: int value to store on a match
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_cmpxchg(). If (@v == @old), @v is atomically updated to @new
 * with full ordering. Otherwise @v is left unmodified and only relaxed
 * ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_cmpxchg() there.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline int
atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int seen;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_cmpxchg(v, old, new);

        return seen;
}

/**
 * atomic_cmpxchg_acquire() - instrumented atomic compare-and-exchange, acquire ordering
 * @v: pointer to atomic_t
 * @old: int value expected in @v
 * @new: int value to store on a match
 *
 * Passes @v to instrument_atomic_read_write(), then delegates to
 * raw_atomic_cmpxchg_acquire(). If (@v == @old), @v is atomically updated to
 * @new with acquire ordering. Otherwise @v is left unmodified and only relaxed
 * ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_cmpxchg_acquire() there.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline int
atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int seen;

        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_cmpxchg_acquire(v, old, new);

        return seen;
}

/**
 * atomic_cmpxchg_release() - instrumented atomic compare-and-exchange, release ordering
 * @v: pointer to atomic_t
 * @old: int value expected in @v
 * @new: int value to store on a match
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * delegates to raw_atomic_cmpxchg_release(). If (@v == @old), @v is atomically
 * updated to @new with release ordering. Otherwise @v is left unmodified and
 * only relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_cmpxchg_release() there.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline int
atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int seen;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_cmpxchg_release(v, old, new);

        return seen;
}

/**
 * atomic_cmpxchg_relaxed() - instrumented atomic compare-and-exchange, relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value expected in @v
 * @new: int value to store on a match
 *
 * Passes @v to instrument_atomic_read_write(), then delegates to
 * raw_atomic_cmpxchg_relaxed(). If (@v == @old), @v is atomically updated to
 * @new with relaxed ordering. Otherwise @v is left unmodified and relaxed
 * ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_cmpxchg_relaxed() there.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline int
atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
        int seen;

        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_cmpxchg_relaxed(v, old, new);

        return seen;
}

/**
 * atomic_try_cmpxchg() - instrumented atomic compare-and-exchange, full ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value expected in @v
 * @new: int value to store on a match
 *
 * Calls kcsan_mb(), then instrument_atomic_read_write() for both @v and @old,
 * before delegating to raw_atomic_try_cmpxchg(). If (@v == @old), @v is
 * atomically updated to @new with full ordering. Otherwise @v is left
 * unmodified, @old is updated to the current value of @v, and only relaxed
 * ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
        bool swapped;

        kcsan_mb();
        /* Both the atomic and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_acquire() - instrumented atomic compare-and-exchange, acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value expected in @v
 * @new: int value to store on a match
 *
 * Calls instrument_atomic_read_write() for both @v and @old before delegating
 * to raw_atomic_try_cmpxchg_acquire(). If (@v == @old), @v is atomically
 * updated to @new with acquire ordering. Otherwise @v is left unmodified, @old
 * is updated to the current value of @v, and only relaxed ordering is
 * provided.
 *
 * Must not be used in noinstr code; call raw_atomic_try_cmpxchg_acquire()
 * there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Both the atomic and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_release() - instrumented atomic compare-and-exchange, release ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value expected in @v
 * @new: int value to store on a match
 *
 * Calls kcsan_release(), then instrument_atomic_read_write() for both @v and
 * @old, before delegating to raw_atomic_try_cmpxchg_release(). If
 * (@v == @old), @v is atomically updated to @new with release ordering.
 * Otherwise @v is left unmodified, @old is updated to the current value of @v,
 * and only relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_try_cmpxchg_release()
 * there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
        bool swapped;

        kcsan_release();
        /* Both the atomic and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_try_cmpxchg_relaxed() - instrumented atomic compare-and-exchange, relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to the int value expected in @v
 * @new: int value to store on a match
 *
 * Calls instrument_atomic_read_write() for both @v and @old before delegating
 * to raw_atomic_try_cmpxchg_relaxed(). If (@v == @old), @v is atomically
 * updated to @new with relaxed ordering. Otherwise @v is left unmodified, @old
 * is updated to the current value of @v, and relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_try_cmpxchg_relaxed()
 * there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
        bool swapped;

        /* Both the atomic and the caller's expected-value slot are reported. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_sub_and_test() - instrumented atomic subtract and zero test, full ordering
 * @i: int value to subtract from @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v - @i) with full ordering via raw_atomic_sub_and_test().
 *
 * Must not be used in noinstr code; call raw_atomic_sub_and_test() there.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
atomic_sub_and_test(int i, atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic_dec_and_test() - instrumented atomic decrement and zero test, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v - 1) with full ordering via raw_atomic_dec_and_test().
 *
 * Must not be used in noinstr code; call raw_atomic_dec_and_test() there.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
atomic_dec_and_test(atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_dec_and_test(v);

        return is_zero;
}

/**
 * atomic_inc_and_test() - instrumented atomic increment and zero test, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v + 1) with full ordering via raw_atomic_inc_and_test().
 *
 * Must not be used in noinstr code; call raw_atomic_inc_and_test() there.
 *
 * Return: @true if @v is zero after the update, @false otherwise.
 */
static __always_inline bool
atomic_inc_and_test(atomic_t *v)
{
        bool is_zero;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic_inc_and_test(v);

        return is_zero;
}

/**
 * atomic_add_negative() - instrumented atomic add and sign test, full ordering
 * @i: int value to add to @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v + @i) with full ordering via raw_atomic_add_negative().
 *
 * Must not be used in noinstr code; call raw_atomic_add_negative() there.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic_add_negative(int i, atomic_t *v)
{
        bool is_negative;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_acquire() - instrumented atomic add and sign test, acquire ordering
 * @i: int value to add to @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with acquire ordering via raw_atomic_add_negative_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic_add_negative_acquire()
 * there.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_acquire(int i, atomic_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_acquire(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_release() - instrumented atomic add and sign test, release ordering
 * @i: int value to add to @v
 * @v: pointer to atomic_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v + @i) with release ordering via
 * raw_atomic_add_negative_release().
 *
 * Must not be used in noinstr code; call raw_atomic_add_negative_release()
 * there.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_release(int i, atomic_t *v)
{
        bool is_negative;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_release(i, v);

        return is_negative;
}

/**
 * atomic_add_negative_relaxed() - instrumented atomic add and sign test, relaxed ordering
 * @i: int value to add to @v
 * @v: pointer to atomic_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with relaxed ordering via raw_atomic_add_negative_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic_add_negative_relaxed()
 * there.
 *
 * Return: @true if @v is negative after the update, @false otherwise.
 */
static __always_inline bool
atomic_add_negative_relaxed(int i, atomic_t *v)
{
        bool is_negative;

        instrument_atomic_read_write(v, sizeof(*v));
        is_negative = raw_atomic_add_negative_relaxed(i, v);

        return is_negative;
}

/**
 * atomic_fetch_add_unless() - instrumented atomic conditional fetch-and-add, full ordering
 * @v: pointer to atomic_t
 * @a: int value to add to @v
 * @u: int value that blocks the update when @v equals it
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_fetch_add_unless(). If (@v != @u), @v is atomically updated to
 * (@v + @a) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_fetch_add_unless() there.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline int
atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
        int old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic_fetch_add_unless(v, a, u);

        return old;
}

/**
 * atomic_add_unless() - instrumented atomic conditional add, full ordering
 * @v: pointer to atomic_t
 * @a: int value to add to @v
 * @u: int value that blocks the update when @v equals it
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_add_unless(). If (@v != @u), @v is atomically updated to
 * (@v + @a) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_add_unless(atomic_t *v, int a, int u)
{
        bool changed;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        changed = raw_atomic_add_unless(v, a, u);

        return changed;
}

/**
 * atomic_inc_not_zero() - instrumented atomic increment unless zero, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_inc_not_zero(). If (@v != 0), @v is atomically updated to
 * (@v + 1) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_not_zero(atomic_t *v)
{
        bool changed;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        changed = raw_atomic_inc_not_zero(v);

        return changed;
}

/**
 * atomic_inc_unless_negative() - instrumented atomic increment unless negative, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_inc_unless_negative(). If (@v >= 0), @v is atomically updated
 * to (@v + 1) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_inc_unless_negative()
 * there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_inc_unless_negative(atomic_t *v)
{
        bool changed;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        changed = raw_atomic_inc_unless_negative(v);

        return changed;
}

/**
 * atomic_dec_unless_positive() - instrumented atomic decrement unless positive, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_dec_unless_positive(). If (@v <= 0), @v is atomically updated
 * to (@v - 1) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_dec_unless_positive()
 * there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_dec_unless_positive(atomic_t *v)
{
        bool changed;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        changed = raw_atomic_dec_unless_positive(v);

        return changed;
}

/**
 * atomic_dec_if_positive() - instrumented atomic decrement if positive, full ordering
 * @v: pointer to atomic_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then delegates
 * to raw_atomic_dec_if_positive(). If (@v > 0), @v is atomically updated to
 * (@v - 1) with full ordering. Otherwise @v is left unmodified and only
 * relaxed ordering is provided.
 *
 * Must not be used in noinstr code; call raw_atomic_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), whether or not @v was updated.
 */
static __always_inline int
atomic_dec_if_positive(atomic_t *v)
{
        int ret;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic_dec_if_positive(v);

        return ret;
}

/**
 * atomic64_read() - instrumented atomic load, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read(), then atomically loads the value of
 * @v with relaxed ordering via raw_atomic64_read().
 *
 * Must not be used in noinstr code; call raw_atomic64_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read(const atomic64_t *v)
{
        s64 val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read(v);

        return val;
}

/**
 * atomic64_read_acquire() - instrumented atomic load, acquire ordering
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read(), then atomically loads the value of
 * @v with acquire ordering via raw_atomic64_read_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic64_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
atomic64_read_acquire(const atomic64_t *v)
{
        s64 val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic64_read_acquire(v);

        return val;
}

/**
 * atomic64_set() - instrumented atomic store, relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to store in @v
 *
 * Passes @v to instrument_atomic_write(), then atomically sets @v to @i with
 * relaxed ordering via raw_atomic64_set().
 *
 * Must not be used in noinstr code; call raw_atomic64_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set(atomic64_t *v, s64 i)
{
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set(v, i);
}

/**
 * atomic64_set_release() - instrumented atomic store, release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to store in @v
 *
 * Calls kcsan_release() and instrument_atomic_write() for @v, then atomically
 * sets @v to @i with release ordering via raw_atomic64_set_release().
 *
 * Must not be used in noinstr code; call raw_atomic64_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_set_release(atomic64_t *v, s64 i)
{
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic64_set_release(v, i);
}

/**
 * atomic64_add() - instrumented atomic add, relaxed ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with relaxed ordering via raw_atomic64_add().
 *
 * Must not be used in noinstr code; call raw_atomic64_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_add(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_add(i, v);
}

/**
 * atomic64_add_return() - instrumented atomic add-and-return, full ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v + @i) with full ordering via raw_atomic64_add_return().
 *
 * Must not be used in noinstr code; call raw_atomic64_add_return() there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_add_return(s64 i, atomic64_t *v)
{
        s64 updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return(i, v);

        return updated;
}

/**
 * atomic64_add_return_acquire() - instrumented atomic add-and-return, acquire ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with acquire ordering via raw_atomic64_add_return_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic64_add_return_acquire()
 * there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_acquire(i, v);

        return updated;
}

/**
 * atomic64_add_return_release() - instrumented atomic add-and-return, release ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v + @i) with release ordering via
 * raw_atomic64_add_return_release().
 *
 * Must not be used in noinstr code; call raw_atomic64_add_return_release()
 * there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_add_return_release(s64 i, atomic64_t *v)
{
        s64 updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_release(i, v);

        return updated;
}

/**
 * atomic64_add_return_relaxed() - instrumented atomic add-and-return, relaxed ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with relaxed ordering via raw_atomic64_add_return_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic64_add_return_relaxed()
 * there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_return_relaxed(i, v);

        return updated;
}

/**
 * atomic64_fetch_add() - instrumented atomic fetch-and-add, full ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v + @i) with full ordering via raw_atomic64_fetch_add().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_add() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_add(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_add(i, v);

        return old;
}

/**
 * atomic64_fetch_add_acquire() - instrumented atomic fetch-and-add, acquire ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with acquire ordering via raw_atomic64_fetch_add_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_add_acquire()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_add_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_add_release() - instrumented atomic fetch-and-add, release ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v + @i) with release ordering via
 * raw_atomic64_fetch_add_release().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_add_release()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_add_release(i, v);

        return old;
}

/**
 * atomic64_fetch_add_relaxed() - instrumented atomic fetch-and-add, relaxed ordering
 * @i: s64 value to add to @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + @i) with relaxed ordering via raw_atomic64_fetch_add_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_add_relaxed()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_add_relaxed(i, v);

        return old;
}

/**
 * atomic64_sub() - instrumented atomic subtract, relaxed ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v - @i) with relaxed ordering via raw_atomic64_sub().
 *
 * Must not be used in noinstr code; call raw_atomic64_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_sub(s64 i, atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_sub(i, v);
}

/**
 * atomic64_sub_return() - instrumented atomic subtract-and-return, full ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v - @i) with full ordering via raw_atomic64_sub_return().
 *
 * Must not be used in noinstr code; call raw_atomic64_sub_return() there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_sub_return(s64 i, atomic64_t *v)
{
        s64 updated;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return(i, v);

        return updated;
}

/**
 * atomic64_sub_return_acquire() - instrumented atomic subtract-and-return, acquire ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v - @i) with acquire ordering via raw_atomic64_sub_return_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic64_sub_return_acquire()
 * there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_acquire(i, v);

        return updated;
}

/**
 * atomic64_sub_return_release() - instrumented atomic subtract-and-return, release ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v - @i) with release ordering via
 * raw_atomic64_sub_return_release().
 *
 * Must not be used in noinstr code; call raw_atomic64_sub_return_release()
 * there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_sub_return_release(s64 i, atomic64_t *v)
{
        s64 updated;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_release(i, v);

        return updated;
}

/**
 * atomic64_sub_return_relaxed() - instrumented atomic subtract-and-return, relaxed ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v - @i) with relaxed ordering via raw_atomic64_sub_return_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic64_sub_return_relaxed()
 * there.
 *
 * Return: The value of @v after the update.
 */
static __always_inline s64
atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
        s64 updated;

        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_sub_return_relaxed(i, v);

        return updated;
}

/**
 * atomic64_fetch_sub() - instrumented atomic fetch-and-subtract, full ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_mb() and instrument_atomic_read_write() for @v, then atomically
 * updates @v to (@v - @i) with full ordering via raw_atomic64_fetch_sub().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_sub() there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_sub(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_sub(i, v);

        return old;
}

/**
 * atomic64_fetch_sub_acquire() - instrumented atomic fetch-and-subtract, acquire ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v - @i) with acquire ordering via raw_atomic64_fetch_sub_acquire().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_sub_acquire()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_sub_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_sub_release() - instrumented atomic fetch-and-subtract, release ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Calls kcsan_release() and instrument_atomic_read_write() for @v, then
 * atomically updates @v to (@v - @i) with release ordering via
 * raw_atomic64_fetch_sub_release().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_sub_release()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
        s64 old;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_sub_release(i, v);

        return old;
}

/**
 * atomic64_fetch_sub_relaxed() - instrumented atomic fetch-and-subtract, relaxed ordering
 * @i: s64 value to subtract from @v
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v - @i) with relaxed ordering via raw_atomic64_fetch_sub_relaxed().
 *
 * Must not be used in noinstr code; call raw_atomic64_fetch_sub_relaxed()
 * there.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_sub_relaxed(i, v);

        return old;
}

/**
 * atomic64_inc() - instrumented atomic increment, relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Passes @v to instrument_atomic_read_write(), then atomically updates @v to
 * (@v + 1) with relaxed ordering via raw_atomic64_inc().
 *
 * Must not be used in noinstr code; call raw_atomic64_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_inc(atomic64_t *v)
{
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_inc(v);
}

/**
 * atomic64_inc_return() - add one to an atomic64_t, fully ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_return() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline s64
atomic64_inc_return(atomic64_t *v)
{
        s64 ret;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_inc_return(v);

        return ret;
}

/**
 * atomic64_inc_return_acquire() - add one to an atomic64_t, acquire ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_return_acquire() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline s64
atomic64_inc_return_acquire(atomic64_t *v)
{
        s64 ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_inc_return_acquire(v);

        return ret;
}

/**
 * atomic64_inc_return_release() - add one to an atomic64_t, release ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_return_release() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline s64
atomic64_inc_return_release(atomic64_t *v)
{
        s64 ret;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_inc_return_release(v);

        return ret;
}

/**
 * atomic64_inc_return_relaxed() - add one to an atomic64_t, relaxed ordering
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_return_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the increment.
 */
static __always_inline s64
atomic64_inc_return_relaxed(atomic64_t *v)
{
        s64 ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_inc_return_relaxed(v);

        return ret;
}

/**
 * atomic64_fetch_inc() - add one to an atomic64_t, fully ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_inc() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
atomic64_fetch_inc(atomic64_t *v)
{
        s64 old;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc(v);

        return old;
}

/**
 * atomic64_fetch_inc_acquire() - add one to an atomic64_t, acquire ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_inc_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
atomic64_fetch_inc_acquire(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc_acquire(v);

        return old;
}

/**
 * atomic64_fetch_inc_release() - add one to an atomic64_t, release ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_inc_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
atomic64_fetch_inc_release(atomic64_t *v)
{
        s64 old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc_release(v);

        return old;
}

/**
 * atomic64_fetch_inc_relaxed() - add one to an atomic64_t, relaxed ordering
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_inc_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the increment.
 */
static __always_inline s64
atomic64_fetch_inc_relaxed(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_inc_relaxed(v);

        return old;
}

/**
 * atomic64_dec() - subtract one from an atomic64_t, relaxed ordering
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_dec() is the uninstrumented equivalent.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_dec(atomic64_t *v)
{
        /* Instrument the read-modify-write of @v before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_dec(v);
}

/**
 * atomic64_dec_return() - subtract one from an atomic64_t, fully ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_dec_return() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
atomic64_dec_return(atomic64_t *v)
{
        s64 ret;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_dec_return(v);

        return ret;
}

/**
 * atomic64_dec_return_acquire() - subtract one from an atomic64_t, acquire ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_dec_return_acquire() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_dec_return_acquire(v);

        return ret;
}

/**
 * atomic64_dec_return_release() - subtract one from an atomic64_t, release ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_dec_return_release() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
atomic64_dec_return_release(atomic64_t *v)
{
        s64 ret;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_dec_return_release(v);

        return ret;
}

/**
 * atomic64_dec_return_relaxed() - subtract one from an atomic64_t, relaxed ordering
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_dec_return_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 ret;

        instrument_atomic_read_write(v, sizeof(*v));
        ret = raw_atomic64_dec_return_relaxed(v);

        return ret;
}

/**
 * atomic64_fetch_dec() - subtract one from an atomic64_t, fully ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_dec() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
atomic64_fetch_dec(atomic64_t *v)
{
        s64 old;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec(v);

        return old;
}

/**
 * atomic64_fetch_dec_acquire() - subtract one from an atomic64_t, acquire ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_dec_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec_acquire(v);

        return old;
}

/**
 * atomic64_fetch_dec_release() - subtract one from an atomic64_t, release ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_dec_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec_release(v);

        return old;
}

/**
 * atomic64_fetch_dec_relaxed() - subtract one from an atomic64_t, relaxed ordering
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_dec_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the decrement.
 */
static __always_inline s64
atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_dec_relaxed(v);

        return old;
}

/**
 * atomic64_and() - bitwise AND into an atomic64_t, relaxed ordering
 * @i: s64 mask to AND with
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_and() is the uninstrumented equivalent.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_and(s64 i, atomic64_t *v)
{
        /* Instrument the read-modify-write of @v before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_and(i, v);
}

/**
 * atomic64_fetch_and() - bitwise AND into an atomic64_t, fully ordered
 * @i: s64 mask to AND with
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & @i) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_and() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 old;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and(i, v);

        return old;
}

/**
 * atomic64_fetch_and_acquire() - bitwise AND into an atomic64_t, acquire ordered
 * @i: s64 mask to AND with
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & @i) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_and_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_and_release() - bitwise AND into an atomic64_t, release ordered
 * @i: s64 mask to AND with
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & @i) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_and_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and_release(i, v);

        return old;
}

/**
 * atomic64_fetch_and_relaxed() - bitwise AND into an atomic64_t, relaxed ordering
 * @i: s64 mask to AND with
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_and_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND.
 */
static __always_inline s64
atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_and_relaxed(i, v);

        return old;
}

/**
 * atomic64_andnot() - bitwise AND NOT into an atomic64_t, relaxed ordering
 * @i: s64 mask whose complement is ANDed in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & ~@i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_andnot() is the uninstrumented equivalent.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_andnot(s64 i, atomic64_t *v)
{
        /* Instrument the read-modify-write of @v before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_andnot(i, v);
}

/**
 * atomic64_fetch_andnot() - bitwise AND NOT into an atomic64_t, fully ordered
 * @i: s64 mask whose complement is ANDed in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & ~@i) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_andnot() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 old;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot(i, v);

        return old;
}

/**
 * atomic64_fetch_andnot_acquire() - bitwise AND NOT into an atomic64_t, acquire ordered
 * @i: s64 mask whose complement is ANDed in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & ~@i) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_andnot_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_andnot_release() - bitwise AND NOT into an atomic64_t, release ordered
 * @i: s64 mask whose complement is ANDed in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & ~@i) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_andnot_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot_release(i, v);

        return old;
}

/**
 * atomic64_fetch_andnot_relaxed() - bitwise AND NOT into an atomic64_t, relaxed ordering
 * @i: s64 mask whose complement is ANDed in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v & ~@i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_andnot_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the AND NOT.
 */
static __always_inline s64
atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_andnot_relaxed(i, v);

        return old;
}

/**
 * atomic64_or() - bitwise OR into an atomic64_t, relaxed ordering
 * @i: s64 bits to OR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v | @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_or() is the uninstrumented equivalent.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_or(s64 i, atomic64_t *v)
{
        /* Instrument the read-modify-write of @v before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_or(i, v);
}

/**
 * atomic64_fetch_or() - bitwise OR into an atomic64_t, fully ordered
 * @i: s64 bits to OR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v | @i) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_or() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 old;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or(i, v);

        return old;
}

/**
 * atomic64_fetch_or_acquire() - bitwise OR into an atomic64_t, acquire ordered
 * @i: s64 bits to OR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v | @i) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_or_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_or_release() - bitwise OR into an atomic64_t, release ordered
 * @i: s64 bits to OR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v | @i) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_or_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or_release(i, v);

        return old;
}

/**
 * atomic64_fetch_or_relaxed() - bitwise OR into an atomic64_t, relaxed ordering
 * @i: s64 bits to OR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v | @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_or_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the OR.
 */
static __always_inline s64
atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_or_relaxed(i, v);

        return old;
}

/**
 * atomic64_xor() - bitwise XOR into an atomic64_t, relaxed ordering
 * @i: s64 bits to XOR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v ^ @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_xor() is the uninstrumented equivalent.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic64_xor(s64 i, atomic64_t *v)
{
        /* Instrument the read-modify-write of @v before performing it. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic64_xor(i, v);
}

/**
 * atomic64_fetch_xor() - bitwise XOR into an atomic64_t, fully ordered
 * @i: s64 bits to XOR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v ^ @i) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_xor() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 old;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor(i, v);

        return old;
}

/**
 * atomic64_fetch_xor_acquire() - bitwise XOR into an atomic64_t, acquire ordered
 * @i: s64 bits to XOR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v ^ @i) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_xor_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor_acquire(i, v);

        return old;
}

/**
 * atomic64_fetch_xor_release() - bitwise XOR into an atomic64_t, release ordered
 * @i: s64 bits to XOR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v ^ @i) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_xor_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 old;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor_release(i, v);

        return old;
}

/**
 * atomic64_fetch_xor_relaxed() - bitwise XOR into an atomic64_t, relaxed ordering
 * @i: s64 bits to XOR in
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v ^ @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_xor_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the XOR.
 */
static __always_inline s64
atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
        s64 old;

        instrument_atomic_read_write(v, sizeof(*v));
        old = raw_atomic64_fetch_xor_relaxed(i, v);

        return old;
}

/**
 * atomic64_xchg() - swap in a new atomic64_t value, fully ordered
 * @v: atomic64_t to operate on
 * @new: s64 value to store
 *
 * Stores @new into @v as a single atomic step, providing full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_xchg() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 prev;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg(v, new);

        return prev;
}

/**
 * atomic64_xchg_acquire() - swap in a new atomic64_t value, acquire ordered
 * @v: atomic64_t to operate on
 * @new: s64 value to store
 *
 * Stores @new into @v as a single atomic step, providing acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_xchg_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic64_xchg_release() - swap in a new atomic64_t value, release ordered
 * @v: atomic64_t to operate on
 * @new: s64 value to store
 *
 * Stores @new into @v as a single atomic step, providing release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_xchg_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg_release(atomic64_t *v, s64 new)
{
        s64 prev;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg_release(v, new);

        return prev;
}

/**
 * atomic64_xchg_relaxed() - swap in a new atomic64_t value, relaxed ordering
 * @v: atomic64_t to operate on
 * @new: s64 value to store
 *
 * Stores @new into @v as a single atomic step, providing relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_xchg_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the exchange.
 */
static __always_inline s64
atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic64_cmpxchg() - conditionally swap an atomic64_t value, fully ordered
 * @v: atomic64_t to operate on
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * full ordering. When the values differ, @v is left untouched and only
 * relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_cmpxchg() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg(v, old, new);

        return prev;
}

/**
 * atomic64_cmpxchg_acquire() - conditionally swap an atomic64_t value, acquire ordered
 * @v: atomic64_t to operate on
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * acquire ordering. When the values differ, @v is left untouched and only
 * relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_cmpxchg_acquire() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg_acquire(v, old, new);

        return prev;
}

/**
 * atomic64_cmpxchg_release() - conditionally swap an atomic64_t value, release ordered
 * @v: atomic64_t to operate on
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * release ordering. When the values differ, @v is left untouched and only
 * relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_cmpxchg_release() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg_release(v, old, new);

        return prev;
}

/**
 * atomic64_cmpxchg_relaxed() - conditionally swap an atomic64_t value, relaxed ordering
 * @v: atomic64_t to operate on
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * relaxed ordering. When the values differ, @v is left untouched and relaxed
 * ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_cmpxchg_relaxed() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_cmpxchg_relaxed(v, old, new);

        return prev;
}

/**
 * atomic64_try_cmpxchg() - try to swap an atomic64_t value, fully ordered
 * @v: atomic64_t to operate on
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * full ordering. When the values differ, @v is left untouched, the current
 * value of @v is written to @old, and only relaxed ordering is provided.
 *
 * The accesses are instrumented, so this must not be called from noinstr
 * code; raw_atomic64_try_cmpxchg() is the uninstrumented equivalent.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        /* @old can be written back on failure, so it is instrumented too. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_acquire() - try to swap an atomic64_t value, acquire ordered
 * @v: atomic64_t to operate on
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * acquire ordering. When the values differ, @v is left untouched, the current
 * value of @v is written to @old, and only relaxed ordering is provided.
 *
 * The accesses are instrumented, so this must not be called from noinstr
 * code; raw_atomic64_try_cmpxchg_acquire() is the uninstrumented equivalent.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        instrument_atomic_read_write(v, sizeof(*v));
        /* @old can be written back on failure, so it is instrumented too. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_release() - try to swap an atomic64_t value, release ordered
 * @v: atomic64_t to operate on
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * release ordering. When the values differ, @v is left untouched, the current
 * value of @v is written to @old, and only relaxed ordering is provided.
 *
 * The accesses are instrumented, so this must not be called from noinstr
 * code; raw_atomic64_try_cmpxchg_release() is the uninstrumented equivalent.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        /* @old can be written back on failure, so it is instrumented too. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic64_try_cmpxchg_relaxed() - try to swap an atomic64_t value, relaxed ordering
 * @v: atomic64_t to operate on
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * When (@v == @old), stores @new into @v as a single atomic step, providing
 * relaxed ordering. When the values differ, @v is left untouched, the current
 * value of @v is written to @old, and relaxed ordering is provided.
 *
 * The accesses are instrumented, so this must not be called from noinstr
 * code; raw_atomic64_try_cmpxchg_relaxed() is the uninstrumented equivalent.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
        bool swapped;

        instrument_atomic_read_write(v, sizeof(*v));
        /* @old can be written back on failure, so it is instrumented too. */
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic64_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic64_sub_and_test() - subtract from an atomic64_t and check for zero, fully ordered
 * @i: s64 amount to subtract
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - @i) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_sub_and_test() is the uninstrumented equivalent.
 *
 * Return: @true if @v is zero after the subtraction, @false otherwise.
 */
static __always_inline bool
atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        bool is_zero;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_sub_and_test(i, v);

        return is_zero;
}

/**
 * atomic64_dec_and_test() - subtract one from an atomic64_t and check for zero, fully ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v - 1) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_dec_and_test() is the uninstrumented equivalent.
 *
 * Return: @true if @v is zero after the decrement, @false otherwise.
 */
static __always_inline bool
atomic64_dec_and_test(atomic64_t *v)
{
        bool is_zero;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_dec_and_test(v);

        return is_zero;
}

/**
 * atomic64_inc_and_test() - add one to an atomic64_t and check for zero, fully ordered
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + 1) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_and_test() is the uninstrumented equivalent.
 *
 * Return: @true if @v is zero after the increment, @false otherwise.
 */
static __always_inline bool
atomic64_inc_and_test(atomic64_t *v)
{
        bool is_zero;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        is_zero = raw_atomic64_inc_and_test(v);

        return is_zero;
}

/**
 * atomic64_add_negative() - add to an atomic64_t and check the sign, fully ordered
 * @i: s64 amount to add
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + @i) as a single atomic step, providing
 * full ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_add_negative() is the uninstrumented equivalent.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative(s64 i, atomic64_t *v)
{
        bool negative;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative(i, v);

        return negative;
}

/**
 * atomic64_add_negative_acquire() - add to an atomic64_t and check the sign, acquire ordered
 * @i: s64 amount to add
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + @i) as a single atomic step, providing
 * acquire ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_add_negative_acquire() is the uninstrumented equivalent.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
        bool negative;

        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_acquire(i, v);

        return negative;
}

/**
 * atomic64_add_negative_release() - add to an atomic64_t and check the sign, release ordered
 * @i: s64 amount to add
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + @i) as a single atomic step, providing
 * release ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_add_negative_release() is the uninstrumented equivalent.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_release(s64 i, atomic64_t *v)
{
        bool negative;

        /* Release variant: kcsan_release() precedes the instrumented access. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_release(i, v);

        return negative;
}

/**
 * atomic64_add_negative_relaxed() - add to an atomic64_t and check the sign, relaxed ordering
 * @i: s64 amount to add
 * @v: atomic64_t to operate on
 *
 * Replaces the value of @v with (@v + @i) as a single atomic step, providing
 * relaxed ordering.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_add_negative_relaxed() is the uninstrumented equivalent.
 *
 * Return: @true if @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
        bool negative;

        instrument_atomic_read_write(v, sizeof(*v));
        negative = raw_atomic64_add_negative_relaxed(i, v);

        return negative;
}

/**
 * atomic64_fetch_add_unless() - add to an atomic64_t unless it holds a given value, fully ordered
 * @v: atomic64_t to operate on
 * @a: s64 amount to add
 * @u: s64 value that suppresses the addition
 *
 * When (@v != @u), replaces the value of @v with (@v + @a) as a single atomic
 * step, providing full ordering. When @v equals @u, @v is left untouched and
 * only relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_fetch_add_unless() is the uninstrumented equivalent.
 *
 * Return: The value @v held before the operation.
 */
static __always_inline s64
atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
        s64 prev;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic64_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic64_add_unless() - add to an atomic64_t unless it holds a given value, fully ordered
 * @v: atomic64_t to operate on
 * @a: s64 amount to add
 * @u: s64 value that suppresses the addition
 *
 * When (@v != @u), replaces the value of @v with (@v + @a) as a single atomic
 * step, providing full ordering. When @v equals @u, @v is left untouched and
 * only relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_add_unless() is the uninstrumented equivalent.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_add_unless(v, a, u);

        return updated;
}

/**
 * atomic64_inc_not_zero() - add one to an atomic64_t unless it is zero, fully ordered
 * @v: atomic64_t to operate on
 *
 * When (@v != 0), replaces the value of @v with (@v + 1) as a single atomic
 * step, providing full ordering. When @v is zero, @v is left untouched and
 * only relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_not_zero() is the uninstrumented equivalent.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
atomic64_inc_not_zero(atomic64_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_not_zero(v);

        return updated;
}

/**
 * atomic64_inc_unless_negative() - add one to an atomic64_t unless it is negative, fully ordered
 * @v: atomic64_t to operate on
 *
 * When (@v >= 0), replaces the value of @v with (@v + 1) as a single atomic
 * step, providing full ordering. When @v is negative, @v is left untouched
 * and only relaxed ordering is provided.
 *
 * The access is instrumented, so this must not be called from noinstr code;
 * raw_atomic64_inc_unless_negative() is the uninstrumented equivalent.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
atomic64_inc_unless_negative(atomic64_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb() precedes the instrumented access. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic64_inc_unless_negative(v);

        return updated;
}

/**
 * atomic64_dec_unless_positive() - fully ordered atomic decrement, skipped when positive
 * @v: pointer to atomic64_t
 *
 * When (@v <= 0), @v is atomically replaced by (@v - 1) with full ordering.
 * When @v is positive, @v is left untouched and only relaxed ordering is
 * provided.
 *
 * Instrumented counterpart of raw_atomic64_dec_unless_positive(). Unsafe to
 * use in noinstr code; use raw_atomic64_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic64_dec_unless_positive(atomic64_t *v)
{
        bool changed;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        changed = raw_atomic64_dec_unless_positive(v);

        return changed;
}

/**
 * atomic64_dec_if_positive() - fully ordered atomic decrement, only when positive
 * @v: pointer to atomic64_t
 *
 * When (@v > 0), @v is atomically replaced by (@v - 1) with full ordering.
 * Otherwise, @v is left untouched and only relaxed ordering is provided.
 *
 * Instrumented counterpart of raw_atomic64_dec_if_positive(). Unsafe to use
 * in noinstr code; use raw_atomic64_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline s64
atomic64_dec_if_positive(atomic64_t *v)
{
        s64 result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic64_dec_if_positive(v);

        return result;
}

/**
 * atomic_long_read() - relaxed atomic load
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the current value of @v, with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_read(). Unsafe to use in
 * noinstr code; use raw_atomic_long_read() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read(const atomic_long_t *v)
{
        long val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read(v);

        return val;
}

/**
 * atomic_long_read_acquire() - atomic load with acquire semantics
 * @v: pointer to atomic_long_t
 *
 * Atomically loads the current value of @v, with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_read_acquire(). Unsafe to use
 * in noinstr code; use raw_atomic_long_read_acquire() there.
 *
 * Return: The value loaded from @v.
 */
static __always_inline long
atomic_long_read_acquire(const atomic_long_t *v)
{
        long val;

        instrument_atomic_read(v, sizeof(*v));
        val = raw_atomic_long_read_acquire(v);

        return val;
}

/**
 * atomic_long_set() - atomic set with relaxed ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_write() and then delegates the
 * store to raw_atomic_long_set().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_set() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set(atomic_long_t *v, long i)
{
        /* Instrument the write to @v before the raw store is performed. */
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set(v, i);
}

/**
 * atomic_long_set_release() - atomic set with release ordering
 * @v: pointer to atomic_long_t
 * @i: long value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * This wrapper calls kcsan_release(), passes @v to instrument_atomic_write()
 * and then delegates the store to raw_atomic_long_set_release().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_set_release() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_set_release(atomic_long_t *v, long i)
{
        /* Release variant: kcsan_release() comes ahead of the instrumented store. */
        kcsan_release();
        instrument_atomic_write(v, sizeof(*v));
        raw_atomic_long_set_release(v, i);
}

/**
 * atomic_long_add() - atomic add with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_add().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_add(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_add(i, v);
}

/**
 * atomic_long_add_return() - fully ordered atomic add returning the new value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_add_return(). Unsafe to use in
 * noinstr code; use raw_atomic_long_add_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return(long i, atomic_long_t *v)
{
        long result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return(i, v);

        return result;
}

/**
 * atomic_long_add_return_acquire() - acquire-ordered atomic add returning the new value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_add_return_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_add_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_acquire(long i, atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return_acquire(i, v);

        return result;
}

/**
 * atomic_long_add_return_release() - release-ordered atomic add returning the new value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_add_return_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_add_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_release(long i, atomic_long_t *v)
{
        long result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return_release(i, v);

        return result;
}

/**
 * atomic_long_add_return_relaxed() - relaxed atomic add returning the new value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_add_return_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_add_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_add_return_relaxed(long i, atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_add_return_relaxed(i, v);

        return result;
}

/**
 * atomic_long_fetch_add() - fully ordered atomic add returning the old value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_add(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_add() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add(i, v);

        return prev;
}

/**
 * atomic_long_fetch_add_acquire() - acquire-ordered atomic add returning the old value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_add_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_add_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_add_release() - release-ordered atomic add returning the old value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_add_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_add_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_add_relaxed() - relaxed atomic add returning the old value
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_add_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_add_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_sub() - atomic subtract with relaxed ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_sub().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_sub(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_sub(i, v);
}

/**
 * atomic_long_sub_return() - fully ordered atomic subtract returning the new value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_sub_return(). Unsafe to use in
 * noinstr code; use raw_atomic_long_sub_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return(long i, atomic_long_t *v)
{
        long result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return(i, v);

        return result;
}

/**
 * atomic_long_sub_return_acquire() - acquire-ordered atomic subtract returning the new value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_sub_return_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_sub_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_acquire(long i, atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return_acquire(i, v);

        return result;
}

/**
 * atomic_long_sub_return_release() - release-ordered atomic subtract returning the new value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_sub_return_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_sub_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_release(long i, atomic_long_t *v)
{
        long result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return_release(i, v);

        return result;
}

/**
 * atomic_long_sub_return_relaxed() - relaxed atomic subtract returning the new value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_sub_return_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_sub_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_sub_return_relaxed(long i, atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_sub_return_relaxed(i, v);

        return result;
}

/**
 * atomic_long_fetch_sub() - fully ordered atomic subtract returning the old value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_sub(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_sub() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_sub(i, v);

        return prev;
}

/**
 * atomic_long_fetch_sub_acquire() - acquire-ordered atomic subtract returning the old value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_sub_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_sub_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_sub_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_sub_release() - release-ordered atomic subtract returning the old value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_sub_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_sub_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_sub_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_sub_relaxed() - relaxed atomic subtract returning the old value
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_sub_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_sub_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_sub_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_inc().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_inc(atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_inc(v);
}

/**
 * atomic_long_inc_return() - fully ordered atomic increment returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_inc_return(). Unsafe to use in
 * noinstr code; use raw_atomic_long_inc_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return(atomic_long_t *v)
{
        long result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return(v);

        return result;
}

/**
 * atomic_long_inc_return_acquire() - acquire-ordered atomic increment returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_inc_return_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_inc_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_acquire(atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_acquire(v);

        return result;
}

/**
 * atomic_long_inc_return_release() - release-ordered atomic increment returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_inc_return_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_inc_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_release(atomic_long_t *v)
{
        long result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_release(v);

        return result;
}

/**
 * atomic_long_inc_return_relaxed() - relaxed atomic increment returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_inc_return_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_inc_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_inc_return_relaxed(atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_inc_return_relaxed(v);

        return result;
}

/**
 * atomic_long_fetch_inc() - fully ordered atomic increment returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_inc(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_inc() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc(atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc(v);

        return prev;
}

/**
 * atomic_long_fetch_inc_acquire() - acquire-ordered atomic increment returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_inc_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_inc_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_acquire(atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc_acquire(v);

        return prev;
}

/**
 * atomic_long_fetch_inc_release() - release-ordered atomic increment returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_inc_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_inc_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_release(atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc_release(v);

        return prev;
}

/**
 * atomic_long_fetch_inc_relaxed() - relaxed atomic increment returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v + 1), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_inc_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_inc_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_inc_relaxed(atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_inc_relaxed(v);

        return prev;
}

/**
 * atomic_long_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_dec().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_dec(atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_dec(v);
}

/**
 * atomic_long_dec_return() - fully ordered atomic decrement returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_dec_return(). Unsafe to use in
 * noinstr code; use raw_atomic_long_dec_return() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return(atomic_long_t *v)
{
        long result;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return(v);

        return result;
}

/**
 * atomic_long_dec_return_acquire() - acquire-ordered atomic decrement returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_dec_return_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_dec_return_acquire() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_acquire(atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_acquire(v);

        return result;
}

/**
 * atomic_long_dec_return_release() - release-ordered atomic decrement returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_dec_return_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_dec_return_release() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_release(atomic_long_t *v)
{
        long result;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_release(v);

        return result;
}

/**
 * atomic_long_dec_return_relaxed() - relaxed atomic decrement returning the new value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_dec_return_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_dec_return_relaxed() there.
 *
 * Return: The updated value of @v.
 */
static __always_inline long
atomic_long_dec_return_relaxed(atomic_long_t *v)
{
        long result;

        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_return_relaxed(v);

        return result;
}

/**
 * atomic_long_fetch_dec() - fully ordered atomic decrement returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_dec(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_dec() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec(atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec(v);

        return prev;
}

/**
 * atomic_long_fetch_dec_acquire() - acquire-ordered atomic decrement returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_dec_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_dec_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_acquire(atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec_acquire(v);

        return prev;
}

/**
 * atomic_long_fetch_dec_release() - release-ordered atomic decrement returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_dec_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_dec_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_release(atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec_release(v);

        return prev;
}

/**
 * atomic_long_fetch_dec_relaxed() - relaxed atomic decrement returning the old value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v - 1), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_dec_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_dec_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_dec_relaxed(atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_dec_relaxed(v);

        return prev;
}

/**
 * atomic_long_and() - atomic bitwise AND with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_and().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_and() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_and(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_and(i, v);
}

/**
 * atomic_long_fetch_and() - fully ordered atomic bitwise AND returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_and(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_and() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_acquire() - acquire-ordered atomic bitwise AND returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_and_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_and_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_release() - release-ordered atomic bitwise AND returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_and_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_and_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_and_relaxed() - relaxed atomic bitwise AND returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_and_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_and_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_and_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_and_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v & ~@i) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_andnot().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_andnot() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_andnot(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_andnot(i, v);
}

/**
 * atomic_long_fetch_andnot() - fully ordered atomic bitwise AND NOT returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_andnot(). Unsafe to use
 * in noinstr code; use raw_atomic_long_fetch_andnot() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_acquire() - acquire-ordered atomic bitwise AND NOT returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_andnot_acquire(). Unsafe
 * to use in noinstr code; use raw_atomic_long_fetch_andnot_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_release() - release-ordered atomic bitwise AND NOT returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_andnot_release(). Unsafe
 * to use in noinstr code; use raw_atomic_long_fetch_andnot_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_andnot_relaxed() - relaxed atomic bitwise AND NOT returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v & ~@i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_andnot_relaxed(). Unsafe
 * to use in noinstr code; use raw_atomic_long_fetch_andnot_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_andnot_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_or() - atomic bitwise OR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_or().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_or() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_or(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_or(i, v);
}

/**
 * atomic_long_fetch_or() - fully ordered atomic bitwise OR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_or(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_or() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_acquire() - acquire-ordered atomic bitwise OR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_or_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_or_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_release() - release-ordered atomic bitwise OR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_or_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_or_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_or_relaxed() - relaxed atomic bitwise OR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v | @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_or_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_or_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_or_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_or_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_xor() - atomic bitwise XOR with relaxed ordering
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * This wrapper passes @v to instrument_atomic_read_write() and then delegates
 * the operation to raw_atomic_long_xor().
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xor() there.
 *
 * Return: Nothing.
 */
static __always_inline void
atomic_long_xor(long i, atomic_long_t *v)
{
        /* Instrument the read-modify-write of @v before the raw operation. */
        instrument_atomic_read_write(v, sizeof(*v));
        raw_atomic_long_xor(i, v);
}

/**
 * atomic_long_fetch_xor() - fully ordered atomic bitwise XOR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with full ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_xor(). Unsafe to use in
 * noinstr code; use raw_atomic_long_fetch_xor() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor(long i, atomic_long_t *v)
{
        long prev;

        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_acquire() - acquire-ordered atomic bitwise XOR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with acquire ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_xor_acquire(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_xor_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_acquire(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_acquire(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_release() - release-ordered atomic bitwise XOR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with release ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_xor_release(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_xor_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_release(long i, atomic_long_t *v)
{
        long prev;

        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_release(i, v);

        return prev;
}

/**
 * atomic_long_fetch_xor_relaxed() - relaxed atomic bitwise XOR returning the old value
 * @i: long value
 * @v: pointer to atomic_long_t
 *
 * Atomically replaces @v with (@v ^ @i), with relaxed ordering.
 *
 * Instrumented counterpart of raw_atomic_long_fetch_xor_relaxed(). Unsafe to
 * use in noinstr code; use raw_atomic_long_fetch_xor_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v)
{
        long prev;

        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_xor_relaxed(i, v);

        return prev;
}

/**
 * atomic_long_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg(atomic_long_t *v, long new)
{
        long prev;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg(v, new);

        return prev;
}

/**
 * atomic_long_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg_acquire(atomic_long_t *v, long new)
{
        long prev;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_acquire(v, new);

        return prev;
}

/**
 * atomic_long_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg_release(atomic_long_t *v, long new)
{
        long prev;

        /* Release variant: kcsan_release() comes before the instrumentation. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_release(v, new);

        return prev;
}

/**
 * atomic_long_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @new: long value to assign
 *
 * Atomically updates @v to @new with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_xchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_xchg_relaxed(atomic_long_t *v, long new)
{
        long prev;

        /* Relaxed variant: only the access instrumentation precedes the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_xchg_relaxed(v, new);

        return prev;
}

/**
 * atomic_long_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg(atomic_long_t *v, long old, long new)
{
        long seen;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_long_cmpxchg(v, old, new);

        return seen;
}

/**
 * atomic_long_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_acquire() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new)
{
        long seen;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_long_cmpxchg_acquire(v, old, new);

        return seen;
}

/**
 * atomic_long_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_release() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new)
{
        long seen;

        /* Release variant: kcsan_release() comes before the instrumentation. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_long_cmpxchg_release(v, old, new);

        return seen;
}

/**
 * atomic_long_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_cmpxchg_relaxed() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new)
{
        long seen;

        /* Relaxed variant: only the access instrumentation precedes the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        seen = raw_atomic_long_cmpxchg_relaxed(v, old, new);

        return seen;
}

/**
 * atomic_long_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with full ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Fully ordered variant: the kcsan_mb() hook goes first. */
        kcsan_mb();
        /* Both the atomic variable and the caller's *old are instrumented. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with acquire ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_acquire() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Both the atomic variable and the caller's *old are instrumented. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_acquire(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with release ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_release() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Release variant: the kcsan_release() hook goes first. */
        kcsan_release();
        /* Both the atomic variable and the caller's *old are instrumented. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_release(v, old, new);

        return swapped;
}

/**
 * atomic_long_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_long_t
 * @old: pointer to long value to compare with
 * @new: long value to assign
 *
 * If (@v == @old), atomically updates @v to @new with relaxed ordering.
 * Otherwise, @v is not modified, @old is updated to the current value of @v,
 * and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_try_cmpxchg_relaxed() there.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new)
{
        bool swapped;

        /* Both the atomic variable and the caller's *old are instrumented. */
        instrument_atomic_read_write(v, sizeof(*v));
        instrument_atomic_read_write(old, sizeof(*old));
        swapped = raw_atomic_long_try_cmpxchg_relaxed(v, old, new);

        return swapped;
}

/**
 * atomic_long_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: long value to subtract
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_sub_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_long_sub_and_test(long i, atomic_long_t *v)
{
        bool hit_zero;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        hit_zero = raw_atomic_long_sub_and_test(i, v);

        return hit_zero;
}

/**
 * atomic_long_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v - 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_long_dec_and_test(atomic_long_t *v)
{
        bool hit_zero;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        hit_zero = raw_atomic_long_dec_and_test(v);

        return hit_zero;
}

/**
 * atomic_long_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + 1) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_and_test() there.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_and_test(atomic_long_t *v)
{
        bool hit_zero;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        hit_zero = raw_atomic_long_inc_and_test(v);

        return hit_zero;
}

/**
 * atomic_long_add_negative() - atomic add and test if negative with full ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with full ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative(long i, atomic_long_t *v)
{
        bool went_negative;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        went_negative = raw_atomic_long_add_negative(i, v);

        return went_negative;
}

/**
 * atomic_long_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with acquire ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_acquire() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_acquire(long i, atomic_long_t *v)
{
        bool went_negative;

        /* Instrument the read-modify-write of *v, then run the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        went_negative = raw_atomic_long_add_negative_acquire(i, v);

        return went_negative;
}

/**
 * atomic_long_add_negative_release() - atomic add and test if negative with release ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with release ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_release() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_release(long i, atomic_long_t *v)
{
        bool went_negative;

        /* Release variant: kcsan_release() comes before the instrumentation. */
        kcsan_release();
        instrument_atomic_read_write(v, sizeof(*v));
        went_negative = raw_atomic_long_add_negative_release(i, v);

        return went_negative;
}

/**
 * atomic_long_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: long value to add
 * @v: pointer to atomic_long_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_negative_relaxed() there.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
atomic_long_add_negative_relaxed(long i, atomic_long_t *v)
{
        bool went_negative;

        /* Relaxed variant: only the access instrumentation precedes the raw op. */
        instrument_atomic_read_write(v, sizeof(*v));
        went_negative = raw_atomic_long_add_negative_relaxed(i, v);

        return went_negative;
}

/**
 * atomic_long_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_fetch_add_unless() there.
 *
 * Return: The original value of @v.
 */
static __always_inline long
atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u)
{
        long prev;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        prev = raw_atomic_long_fetch_add_unless(v, a, u);

        return prev;
}

/**
 * atomic_long_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_long_t
 * @a: long value to add
 * @u: long value to compare with
 *
 * If (@v != @u), atomically updates @v to (@v + @a) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_add_unless() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_add_unless(atomic_long_t *v, long a, long u)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_add_unless(v, a, u);

        return updated;
}

/**
 * atomic_long_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v != 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_not_zero() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_not_zero(atomic_long_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_not_zero(v);

        return updated;
}

/**
 * atomic_long_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v >= 0), atomically updates @v to (@v + 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_inc_unless_negative() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_inc_unless_negative(atomic_long_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_inc_unless_negative(v);

        return updated;
}

/**
 * atomic_long_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v <= 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_unless_positive() there.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
atomic_long_dec_unless_positive(atomic_long_t *v)
{
        bool updated;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        updated = raw_atomic_long_dec_unless_positive(v);

        return updated;
}

/**
 * atomic_long_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_long_t
 *
 * If (@v > 0), atomically updates @v to (@v - 1) with full ordering.
 * Otherwise, @v is not modified and relaxed ordering is provided.
 *
 * Unsafe to use in noinstr code; use raw_atomic_long_dec_if_positive() there.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline long
atomic_long_dec_if_positive(atomic_long_t *v)
{
        long result;

        /* Fully ordered variant: kcsan_mb(), then instrument, then the raw op. */
        kcsan_mb();
        instrument_atomic_read_write(v, sizeof(*v));
        result = raw_atomic_long_dec_if_positive(v);

        return result;
}

/*
 * xchg() - instrumented form of raw_xchg().
 * Evaluates @ptr once, calls kcsan_mb() and instruments the access to
 * *@ptr, then yields the result of raw_xchg().
 */
#define xchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_acquire() - instrumented form of raw_xchg_acquire().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_xchg_acquire(); no kcsan barrier hook is called here.
 */
#define xchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_release() - instrumented form of raw_xchg_release().
 * Evaluates @ptr once, calls kcsan_release() and instruments the access to
 * *@ptr, then yields the result of raw_xchg_release().
 */
#define xchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * xchg_relaxed() - instrumented form of raw_xchg_relaxed().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_xchg_relaxed(); no kcsan barrier hook is called here.
 */
#define xchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_xchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg() - instrumented form of raw_cmpxchg().
 * Evaluates @ptr once, calls kcsan_mb() and instruments the access to
 * *@ptr, then yields the result of raw_cmpxchg().
 */
#define cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_acquire() - instrumented form of raw_cmpxchg_acquire().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg_acquire(); no kcsan barrier hook is called here.
 */
#define cmpxchg_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_release() - instrumented form of raw_cmpxchg_release().
 * Evaluates @ptr once, calls kcsan_release() and instruments the access to
 * *@ptr, then yields the result of raw_cmpxchg_release().
 */
#define cmpxchg_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg_relaxed() - instrumented form of raw_cmpxchg_relaxed().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg_relaxed(); no kcsan barrier hook is called here.
 */
#define cmpxchg_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64() - instrumented form of raw_cmpxchg64().
 * Evaluates @ptr once, calls kcsan_mb() and instruments the access to
 * *@ptr, then yields the result of raw_cmpxchg64().
 */
#define cmpxchg64(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_acquire() - instrumented form of raw_cmpxchg64_acquire().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg64_acquire(); no kcsan barrier hook is called.
 */
#define cmpxchg64_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_release() - instrumented form of raw_cmpxchg64_release().
 * Evaluates @ptr once, calls kcsan_release() and instruments the access to
 * *@ptr, then yields the result of raw_cmpxchg64_release().
 */
#define cmpxchg64_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_relaxed() - instrumented form of raw_cmpxchg64_relaxed().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg64_relaxed(); no kcsan barrier hook is called.
 */
#define cmpxchg64_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128() - instrumented form of raw_cmpxchg128().
 * Evaluates @ptr once, calls kcsan_mb() and instruments the access to
 * *@ptr, then yields the result of raw_cmpxchg128().
 */
#define cmpxchg128(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_acquire() - instrumented form of raw_cmpxchg128_acquire().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg128_acquire(); no kcsan barrier hook is called.
 */
#define cmpxchg128_acquire(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_acquire(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_release() - instrumented form of raw_cmpxchg128_release().
 * Evaluates @ptr once, calls kcsan_release() and instruments the access to
 * *@ptr, then yields the result of raw_cmpxchg128_release().
 */
#define cmpxchg128_release(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_release(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_relaxed() - instrumented form of raw_cmpxchg128_relaxed().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg128_relaxed(); no kcsan barrier hook is called.
 */
#define cmpxchg128_relaxed(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_relaxed(__ai_ptr, __VA_ARGS__); \
})

/*
 * try_cmpxchg() - instrumented form of raw_try_cmpxchg().
 * Evaluates @ptr and @oldp once each and calls kcsan_mb(). *@ptr is
 * instrumented with instrument_atomic_read_write() and *@oldp with
 * instrument_read_write() before raw_try_cmpxchg() supplies the result.
 */
#define try_cmpxchg(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_acquire() - instrumented form of raw_try_cmpxchg_acquire().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_release() - instrumented form of raw_try_cmpxchg_release().
 * Evaluates @ptr and @oldp once each and calls kcsan_release(). *@ptr is
 * instrumented with instrument_atomic_read_write() and *@oldp with
 * instrument_read_write() before the raw operation supplies the result.
 */
#define try_cmpxchg_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg_relaxed() - instrumented form of raw_try_cmpxchg_relaxed().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64() - instrumented form of raw_try_cmpxchg64().
 * Evaluates @ptr and @oldp once each and calls kcsan_mb(). *@ptr is
 * instrumented with instrument_atomic_read_write() and *@oldp with
 * instrument_read_write() before the raw operation supplies the result.
 */
#define try_cmpxchg64(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_acquire() - instrumented form of
 * raw_try_cmpxchg64_acquire().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg64_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_release() - instrumented form of
 * raw_try_cmpxchg64_release().
 * Evaluates @ptr and @oldp once each and calls kcsan_release(). *@ptr is
 * instrumented with instrument_atomic_read_write() and *@oldp with
 * instrument_read_write() before the raw operation supplies the result.
 */
#define try_cmpxchg64_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_relaxed() - instrumented form of
 * raw_try_cmpxchg64_relaxed().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg64_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128() - instrumented form of raw_try_cmpxchg128().
 * Evaluates @ptr and @oldp once each and calls kcsan_mb(). *@ptr is
 * instrumented with instrument_atomic_read_write() and *@oldp with
 * instrument_read_write() before the raw operation supplies the result.
 */
#define try_cmpxchg128(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_acquire() - instrumented form of
 * raw_try_cmpxchg128_acquire().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg128_acquire(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_release() - instrumented form of
 * raw_try_cmpxchg128_release().
 * Evaluates @ptr and @oldp once each and calls kcsan_release(). *@ptr is
 * instrumented with instrument_atomic_read_write() and *@oldp with
 * instrument_read_write() before the raw operation supplies the result.
 */
#define try_cmpxchg128_release(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        kcsan_release(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_relaxed() - instrumented form of
 * raw_try_cmpxchg128_relaxed().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg128_relaxed(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * cmpxchg_local() - instrumented form of raw_cmpxchg_local().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg_local(); no kcsan barrier hook is called here.
 */
#define cmpxchg_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg64_local() - instrumented form of raw_cmpxchg64_local().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg64_local(); no kcsan barrier hook is called.
 */
#define cmpxchg64_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * cmpxchg128_local() - instrumented form of raw_cmpxchg128_local().
 * Evaluates @ptr once and instruments the access to *@ptr before yielding
 * the result of raw_cmpxchg128_local(); no kcsan barrier hook is called.
 */
#define cmpxchg128_local(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
})

/*
 * sync_cmpxchg() - instrumented form of raw_sync_cmpxchg().
 * Evaluates @ptr once, calls kcsan_mb() and instruments the access to
 * *@ptr, then yields the result of raw_sync_cmpxchg().
 */
#define sync_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \
})

/*
 * try_cmpxchg_local() - instrumented form of raw_try_cmpxchg_local().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg64_local() - instrumented form of raw_try_cmpxchg64_local().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg64_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg64_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * try_cmpxchg128_local() - instrumented form of raw_try_cmpxchg128_local().
 * Evaluates @ptr and @oldp once each. *@ptr is instrumented with
 * instrument_atomic_read_write() and *@oldp with instrument_read_write()
 * before the raw operation supplies the result; no kcsan barrier hook.
 */
#define try_cmpxchg128_local(ptr, oldp, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        typeof(oldp) __ai_oldp = (oldp); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        instrument_read_write(__ai_oldp, sizeof(*__ai_oldp)); \
        raw_try_cmpxchg128_local(__ai_ptr, __ai_oldp, __VA_ARGS__); \
})

/*
 * sync_try_cmpxchg() - instrumented form of raw_sync_try_cmpxchg().
 * Evaluates @ptr once, calls kcsan_mb() and instruments the access to
 * *@ptr, then yields the result of raw_sync_try_cmpxchg().
 *
 * NOTE(review): unlike the try_cmpxchg*() macros, only @ptr is captured
 * and instrumented here; any old-value pointer travels in __VA_ARGS__
 * and is passed straight through to the raw operation.
 */
#define sync_try_cmpxchg(ptr, ...) \
({ \
        typeof(ptr) __ai_ptr = (ptr); \
        kcsan_mb(); \
        instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
        raw_sync_try_cmpxchg(__ai_ptr, __VA_ARGS__); \
})


#endif /* _LINUX_ATOMIC_INSTRUMENTED_H */
// 8829b337928e9508259079d32581775ececd415b






































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifndef LLC_H
#define LLC_H
/*
 * Copyright (c) 1997 by Procom Technology, Inc.
 *                  2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
 *
 * This program can be redistributed or modified under the terms of the
 * GNU General Public License as published by the Free Software Foundation.
 * This program is distributed without any warranty or implied warranty
 * of merchantability or fitness for a particular purpose.
 *
 * See the GNU General Public License for more details.
 */

#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rculist_nulls.h>
#include <linux/hash.h>
#include <linux/jhash.h>

#include <linux/atomic.h>

struct net_device;
struct packet_type;
struct sk_buff;

/*
 * struct llc_addr - an LSAP byte paired with a hardware address.
 * llc_sk_laddr_hashfn() hashes only the @mac bytes of this structure.
 */
struct llc_addr {
        unsigned char lsap;             /* SAP value */
        unsigned char mac[IFHWADDRLEN]; /* hardware address, IFHWADDRLEN bytes */
};

/* SAP state values (presumably stored in llc_sap.state - confirm against users) */
#define LLC_SAP_STATE_INACTIVE        1
#define LLC_SAP_STATE_ACTIVE        2

/*
 * Sizing of the two per-SAP hash tables in struct llc_sap: each table has
 * 2^BITS buckets, and the same BITS value is passed to hash_32() when a
 * bucket is selected.
 */
#define LLC_SK_DEV_HASH_BITS 6
#define LLC_SK_DEV_HASH_ENTRIES (1<<LLC_SK_DEV_HASH_BITS)

#define LLC_SK_LADDR_HASH_BITS 6
#define LLC_SK_LADDR_HASH_ENTRIES (1<<LLC_SK_LADDR_HASH_BITS)

/**
 * struct llc_sap - Defines the SAP component
 *
 * @state: sap state
 * @p_bit: only lowest-order bit used
 * @f_bit: only lowest-order bit used
 * @refcnt: reference count; raised by llc_sap_hold()/llc_sap_hold_safe()
 *          and dropped by llc_sap_put(), which calls llc_sap_close() when
 *          the count reaches zero
 * @rcv_func: receive callback; has the same signature as the @rcv argument
 *            of llc_sap_open() (presumably installed from there - confirm)
 * @laddr: SAP value in this 'lsap'
 * @node: entry in station sap_list
 * @sk_lock: spinlock (presumably protects the socket tables and @sk_count -
 *           confirm against the users)
 * @sk_count: socket counter (presumably the number of sockets attached to
 *            this SAP - confirm)
 * @sk_laddr_hash: buckets selected by llc_sk_laddr_hash()
 * @sk_dev_hash: buckets selected by llc_sk_dev_hash()
 * @rcu: RCU head (presumably used to defer freeing - confirm)
 */
struct llc_sap {
        unsigned char         state;
        unsigned char         p_bit;
        unsigned char         f_bit;
        refcount_t                 refcnt;
        int                 (*rcv_func)(struct sk_buff *skb,
                                     struct net_device *dev,
                                     struct packet_type *pt,
                                     struct net_device *orig_dev);
        struct llc_addr         laddr;
        struct list_head node;
        spinlock_t sk_lock;
        int sk_count;
        struct hlist_nulls_head sk_laddr_hash[LLC_SK_LADDR_HASH_ENTRIES];
        struct hlist_head sk_dev_hash[LLC_SK_DEV_HASH_ENTRIES];
        struct rcu_head rcu;
};

static inline
struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex)
{
        u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS);

        return &sap->sk_dev_hash[bucket];
}

/*
 * Compute the sk_laddr_hash bucket index for @laddr.  Only the MAC bytes
 * feed the hash; @sap and laddr->lsap are not consulted.
 */
static inline
u32 llc_sk_laddr_hashfn(struct llc_sap *sap, const struct llc_addr *laddr)
{
        const u32 mac_hash = jhash(laddr->mac, sizeof(laddr->mac), 0);

        return hash_32(mac_hash, LLC_SK_LADDR_HASH_BITS);
}

static inline
struct hlist_nulls_head *llc_sk_laddr_hash(struct llc_sap *sap,
                                           const struct llc_addr *laddr)
{
        return &sap->sk_laddr_hash[llc_sk_laddr_hashfn(sap, laddr)];
}

#define LLC_DEST_INVALID         0      /* Invalid LLC PDU type */
#define LLC_DEST_SAP             1      /* Type 1 goes here */
#define LLC_DEST_CONN            2      /* Type 2 goes here */

extern struct list_head llc_sap_list;

int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
            struct net_device *orig_dev);

int llc_mac_hdr_init(struct sk_buff *skb, const unsigned char *sa,
                     const unsigned char *da);

void llc_add_pack(int type,
                  void (*handler)(struct llc_sap *sap, struct sk_buff *skb));
void llc_remove_pack(int type);

void llc_set_station_handler(void (*handler)(struct sk_buff *skb));

struct llc_sap *llc_sap_open(unsigned char lsap,
                             int (*rcv)(struct sk_buff *skb,
                                        struct net_device *dev,
                                        struct packet_type *pt,
                                        struct net_device *orig_dev));
static inline void llc_sap_hold(struct llc_sap *sap)
{
        refcount_inc(&sap->refcnt);
}

/*
 * Try to take a reference on @sap via refcount_inc_not_zero() and hand its
 * verdict back to the caller.
 */
static inline bool llc_sap_hold_safe(struct llc_sap *sap)
{
        bool got_ref = refcount_inc_not_zero(&sap->refcnt);

        return got_ref;
}

void llc_sap_close(struct llc_sap *sap);

/* Drop a reference on @sap; the final put hands it to llc_sap_close(). */
static inline void llc_sap_put(struct llc_sap *sap)
{
        /* Other holders remain: nothing more to do. */
        if (!refcount_dec_and_test(&sap->refcnt))
                return;

        llc_sap_close(sap);
}

struct llc_sap *llc_sap_find(unsigned char sap_value);

int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
                              const unsigned char *dmac, unsigned char dsap);

void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb);
void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb);

void llc_station_init(void);
void llc_station_exit(void);

/*
 * Init/exit hooks that are real functions only with CONFIG_PROC_FS.
 * Otherwise llc_proc_init() expands to the constant 0 and llc_proc_exit()
 * to an empty statement, so callers need no #ifdef of their own.
 */
#ifdef CONFIG_PROC_FS
int llc_proc_init(void);
void llc_proc_exit(void);
#else
#define llc_proc_init()        (0)
#define llc_proc_exit()        do { } while(0)
#endif /* CONFIG_PROC_FS */
/*
 * Init/exit hooks and the llc2 timeout variables are declared only with
 * CONFIG_SYSCTL.  Otherwise llc_sysctl_init() expands to the constant 0,
 * llc_sysctl_exit() to an empty statement, and the sysctl_llc2_* variables
 * are not declared at all.
 */
#ifdef CONFIG_SYSCTL
int llc_sysctl_init(void);
void llc_sysctl_exit(void);

extern int sysctl_llc2_ack_timeout;
extern int sysctl_llc2_busy_timeout;
extern int sysctl_llc2_p_timeout;
extern int sysctl_llc2_rej_timeout;
#else
#define llc_sysctl_init() (0)
#define llc_sysctl_exit() do { } while(0)
#endif /* CONFIG_SYSCTL */
#endif /* LLC_H */



















































































































































































































 1248 


































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_FP_H
#define __ASM_FP_H

#include <asm/errno.h>
#include <asm/ptrace.h>
#include <asm/processor.h>
#include <asm/sigcontext.h>
#include <asm/sysreg.h>

#ifndef __ASSEMBLY__

#include <linux/bitmap.h>
#include <linux/build_bug.h>
#include <linux/bug.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/stddef.h>
#include <linux/types.h>

/*
 * Masks for extracting the FPSR and FPCR from the FPSCR.
 * The two masks share no bits (0xf800009f & 0x07f79f00 == 0); going by the
 * names, STAT selects the status (FPSR) bits and CTRL the control (FPCR)
 * bits.
 */
#define VFP_FPSCR_STAT_MASK        0xf800009f
#define VFP_FPSCR_CTRL_MASK        0x07f79f00
/*
 * The VFP state has 32x64-bit registers and a single 32-bit
 * control/status register, i.e. (32 * 8) + 4 = 260 bytes.
 */
#define VFP_STATE_SIZE                ((32 * 8) + 4)

/*
 * Read CPACR_EL1, write it back with the CPACR_EL1_FPEN_EL1EN and
 * CPACR_EL1_ZEN_EL1EN bits ORed in, then issue an isb().
 *
 * Returns the value read before the update, so the caller can pass it to
 * cpacr_restore() later.
 */
static inline unsigned long cpacr_save_enable_kernel_sve(void)
{
        const unsigned long enable = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_ZEN_EL1EN;
        unsigned long prev;

        prev = read_sysreg(cpacr_el1);
        write_sysreg(prev | enable, cpacr_el1);
        isb();

        return prev;
}

/*
 * Read CPACR_EL1, write it back with the CPACR_EL1_FPEN_EL1EN and
 * CPACR_EL1_SMEN_EL1EN bits ORed in, then issue an isb().
 *
 * Returns the value read before the update, so the caller can pass it to
 * cpacr_restore() later.
 */
static inline unsigned long cpacr_save_enable_kernel_sme(void)
{
        const unsigned long enable = CPACR_EL1_FPEN_EL1EN | CPACR_EL1_SMEN_EL1EN;
        unsigned long prev;

        prev = read_sysreg(cpacr_el1);
        write_sysreg(prev | enable, cpacr_el1);
        isb();

        return prev;
}

/*
 * Write @cpacr to CPACR_EL1 and follow the write with an isb().
 * Intended to take a value previously returned by one of the
 * cpacr_save_enable_kernel_*() helpers.
 */
static inline void cpacr_restore(unsigned long cpacr)
{
        write_sysreg(cpacr, cpacr_el1);
        isb();
}

/*
 * When we defined the maximum SVE vector length we defined the ABI so
 * that the maximum vector length included all the bits in ZCR that are
 * reserved for future expansion, rather than just those currently
 * defined by the architecture.  Using this length to allocate
 * worst-case sized buffers results in excessively large allocations,
 * and this effect is even more pronounced for SME due to ZA.  Define
 * more suitable VLs for these situations.
 *
 * Both limits are derived from the width of the LEN field of the
 * respective control register: (largest value LEN can hold) + 1.
 */
#define ARCH_SVE_VQ_MAX ((ZCR_ELx_LEN_MASK >> ZCR_ELx_LEN_SHIFT) + 1)
#define SME_VQ_MAX        ((SMCR_ELx_LEN_MASK >> SMCR_ELx_LEN_SHIFT) + 1)

/* The declarations below only need pointers to struct task_struct. */
struct task_struct;

/*
 * FPSIMD state entry points.  These are declarations only; the
 * definitions are not part of this header.
 */
extern void fpsimd_save_state(struct user_fpsimd_state *state);
extern void fpsimd_load_state(struct user_fpsimd_state *state);

extern void fpsimd_thread_switch(struct task_struct *next);
extern void fpsimd_flush_thread(void);

extern void fpsimd_signal_preserve_current_state(void);
extern void fpsimd_preserve_current_state(void);
extern void fpsimd_restore_current_state(void);
extern void fpsimd_update_current_state(struct user_fpsimd_state const *state);

/*
 * Bundle of state pointers and vector lengths passed to
 * fpsimd_bind_state_to_cpu().
 *
 * NOTE(review): the per-field notes are inferred from the field names and
 * types only; confirm against the code that fills in and consumes this
 * structure.
 */
struct cpu_fp_state {
        struct user_fpsimd_state *st;        /* FPSIMD register state */
        void *sve_state;                /* presumably the SVE register buffer */
        void *sme_state;                /* presumably the SME (ZA/ZT) buffer */
        u64 *svcr;                        /* pointer to a saved SVCR value */
        u64 *fpmr;                        /* pointer to a saved FPMR value */
        unsigned int sve_vl;                /* SVE vector length */
        unsigned int sme_vl;                /* SME vector length */
        enum fp_type *fp_type;                /* pointer to the recorded state type */
        enum fp_type to_save;                /* state type to save -- TODO confirm */
};

/* Takes a struct cpu_fp_state; defined outside this header. */
extern void fpsimd_bind_state_to_cpu(struct cpu_fp_state *fp_state);

/* Flush helpers; declarations only, defined outside this header. */
extern void fpsimd_flush_task_state(struct task_struct *target);
extern void fpsimd_save_and_flush_cpu_state(void);

/*
 * True when the system supports SME and the SVCR_SM_MASK bit is set in
 * @thread's saved SVCR value; false otherwise.
 */
static inline bool thread_sm_enabled(struct thread_struct *thread)
{
        if (!system_supports_sme())
                return false;

        return (thread->svcr & SVCR_SM_MASK) != 0;
}

/*
 * True when the system supports SME and the SVCR_ZA_MASK bit is set in
 * @thread's saved SVCR value; false otherwise.
 */
static inline bool thread_za_enabled(struct thread_struct *thread)
{
        if (!system_supports_sme())
                return false;

        return (thread->svcr & SVCR_ZA_MASK) != 0;
}

/*
 * Maximum VL that SVE/SME VL-agnostic software can transparently support.
 * 0x100 == 256; presumably expressed in bytes like the other vector
 * lengths in this header -- TODO confirm.
 */
#define VL_ARCH_MAX 0x100

/*
 * Offset of FFR in the SVE register dump.
 *
 * @vl is converted to a VQ with sve_vq_from_vl(); the result is
 * SVE_SIG_FFR_OFFSET() for that VQ, taken relative to SVE_SIG_REGS_OFFSET.
 */
static inline size_t sve_ffr_offset(int vl)
{
        return SVE_SIG_FFR_OFFSET(sve_vq_from_vl(vl)) - SVE_SIG_REGS_OFFSET;
}

/*
 * Address of FFR inside @thread's sve_state buffer.
 *
 * The offset is computed for the SME vector length when SME is supported
 * and the thread has SVCR.SM set, and for the SVE vector length otherwise.
 */
static inline void *sve_pffr(struct thread_struct *thread)
{
        bool streaming = system_supports_sme() && thread_sm_enabled(thread);
        unsigned int vl = streaming ? thread_get_sme_vl(thread)
                                    : thread_get_sve_vl(thread);
        char *base = (char *)thread->sve_state;

        return base + sve_ffr_offset(vl);
}

/*
 * Address of the ZT register state for @thread.
 *
 * The ZT register state is stored immediately after the ZA state, so the
 * result is thread->sme_state advanced by ZA_SIG_REGS_SIZE() for the
 * thread's SME vector length.
 */
static inline void *thread_zt_state(struct thread_struct *thread)
{
        unsigned int vq = sve_vq_from_vl(thread_get_sme_vl(thread));

        return thread->sme_state + ZA_SIG_REGS_SIZE(vq);
}

/*
 * Low-level SVE/SME register access routines.  Declarations only; the
 * definitions are not part of this header.  The parameters named
 * vq_minus_1 presumably take the VQ minus one -- TODO confirm.
 */
extern void sve_save_state(void *state, u32 *pfpsr, int save_ffr);
extern void sve_load_state(void const *state, u32 const *pfpsr,
                           int restore_ffr);
extern void sve_flush_live(bool flush_ffr, unsigned long vq_minus_1);
extern unsigned int sve_get_vl(void);
extern void sve_set_vq(unsigned long vq_minus_1);
extern void sme_set_vq(unsigned long vq_minus_1);
extern void sme_save_state(void *state, int zt);
extern void sme_load_state(void const *state, int zt);

/* Only used by pointer in the declarations below. */
struct arm64_cpu_capabilities;
/*
 * Per-feature enable hooks.  Every one takes a capability pointer whose
 * parameter is named __unused; defined outside this header.
 */
extern void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__unused);
extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused);
extern void cpu_enable_sme(const struct arm64_cpu_capabilities *__unused);
extern void cpu_enable_sme2(const struct arm64_cpu_capabilities *__unused);
extern void cpu_enable_fa64(const struct arm64_cpu_capabilities *__unused);
extern void cpu_enable_fpmr(const struct arm64_cpu_capabilities *__unused);

/*
 * Helpers to translate bit indices in sve_vq_map to VQ values (and
 * vice versa).  This allows find_next_bit() to be used to find the
 * _maximum_ VQ not exceeding a certain value.
 *
 * The mapping is reversed: a larger VQ gets a smaller bit index, and
 * VQ == SVE_VQ_MAX maps to bit 0.
 */
static inline unsigned int __vq_to_bit(unsigned int vq)
{
        unsigned int bit = SVE_VQ_MAX - vq;

        return bit;
}

/*
 * Inverse of __vq_to_bit(): bit index @bit corresponds to
 * VQ == SVE_VQ_MAX - @bit.
 */
static inline unsigned int __bit_to_vq(unsigned int bit)
{
        unsigned int vq = SVE_VQ_MAX - bit;

        return vq;
}


/*
 * Per-vector-type record of the supported vector lengths.  The vl_info[]
 * array declared in this header holds one entry per enum vec_type value.
 */
struct vl_info {
        enum vec_type type;
        const char *name;                /* For display purposes */

        /* Minimum supported vector length across all CPUs */
        int min_vl;

        /* Maximum supported vector length across all CPUs */
        int max_vl;
        /* presumably the largest VL usable by guests -- TODO confirm */
        int max_virtualisable_vl;

        /*
         * Set of available vector lengths,
         * where length vq is encoded as bit __vq_to_bit(vq):
         */
        DECLARE_BITMAP(vq_map, SVE_VQ_MAX);

        /* Set of vector lengths present on at least one cpu: */
        DECLARE_BITMAP(vq_partial_map, SVE_VQ_MAX);
};

#ifdef CONFIG_ARM64_SVE

/*
 * SVE task-state entry points (CONFIG_ARM64_SVE only).  Declarations
 * only; the definitions are not part of this header.
 *
 * NOTE(review): fpsimd_sync_to_sve(), fpsimd_force_sync_to_sve() and
 * vec_set_vector_length() have no stub in the !CONFIG_ARM64_SVE branch of
 * this header, so callers presumably must be SVE-only -- confirm.
 */
extern void sve_alloc(struct task_struct *task, bool flush);
extern void fpsimd_release_task(struct task_struct *task);
extern void fpsimd_sync_to_sve(struct task_struct *task);
extern void fpsimd_force_sync_to_sve(struct task_struct *task);
extern void sve_sync_to_fpsimd(struct task_struct *task);
extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);

extern int vec_set_vector_length(struct task_struct *task, enum vec_type type,
                                 unsigned long vl, unsigned long flags);

extern int sve_set_current_vl(unsigned long arg);
extern int sve_get_current_vl(void);

/* Clear the CPACR_EL1_ZEN_EL0EN bit in CPACR_EL1; no bits are set. */
static inline void sve_user_disable(void)
{
        sysreg_clear_set(cpacr_el1, CPACR_EL1_ZEN_EL0EN, 0);
}

/* Set the CPACR_EL1_ZEN_EL0EN bit in CPACR_EL1; no bits are cleared. */
static inline void sve_user_enable(void)
{
        sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN);
}

/*
 * Update the LEN field of the ZCR-type system register @reg to @val,
 * writing the register only when the resulting value differs from what
 * was read.  @val is masked with ZCR_ELx_LEN_MASK, and bits of the
 * register outside the LEN field are preserved.
 */
#define sve_cond_update_zcr_vq(val, reg)                \
        do {                                                \
                u64 __zcr = read_sysreg_s((reg));        \
                u64 __new = __zcr & ~ZCR_ELx_LEN_MASK;        \
                __new |= (val) & ZCR_ELx_LEN_MASK;        \
                if (__zcr != __new)                        \
                        write_sysreg_s(__new, (reg));        \
        } while (0)

/*
 * Probing and setup functions.
 * Calls to these functions must be serialised with one another.
 *
 * NOTE(review): struct vl_info earlier in this header already has an
 * enum vec_type member, so the complete enum must come from an included
 * header; the forward declaration below looks redundant -- confirm.
 */
enum vec_type;

extern void __init vec_init_vq_map(enum vec_type type);
extern void vec_update_vq_map(enum vec_type type);
extern int vec_verify_vq_map(enum vec_type type);
extern void __init sve_setup(void);

/* One struct vl_info per vector type, indexed by enum vec_type. */
extern __ro_after_init struct vl_info vl_info[ARM64_VEC_MAX];

/*
 * Program the LEN field of the vector control register selected by @type:
 * SYS_ZCR_EL1 for ARM64_VEC_SVE, SYS_SMCR_EL1 for ARM64_VEC_SME.
 *
 * The register's current LEN bits are cleared and @val is ORed in without
 * being masked.  Any other @type (or one compiled out) hits
 * WARN_ON_ONCE(1) and writes nothing.
 */
static inline void write_vl(enum vec_type type, u64 val)
{
        u64 kept;

        switch (type) {
#ifdef CONFIG_ARM64_SVE
        case ARM64_VEC_SVE:
                kept = read_sysreg_s(SYS_ZCR_EL1);
                kept &= ~ZCR_ELx_LEN_MASK;
                write_sysreg_s(kept | val, SYS_ZCR_EL1);
                break;
#endif
#ifdef CONFIG_ARM64_SME
        case ARM64_VEC_SME:
                kept = read_sysreg_s(SYS_SMCR_EL1);
                kept &= ~SMCR_ELx_LEN_MASK;
                write_sysreg_s(kept | val, SYS_SMCR_EL1);
                break;
#endif
        default:
                WARN_ON_ONCE(1);
                break;
        }
}

static inline int vec_max_vl(enum vec_type type)
{
        return vl_info[type].max_vl;
}

static inline int vec_max_virtualisable_vl(enum vec_type type)
{
        return vl_info[type].max_virtualisable_vl;
}

/* Shorthand for vec_max_vl(ARM64_VEC_SVE). */
static inline int sve_max_vl(void)
{
        return vec_max_vl(ARM64_VEC_SVE);
}

/* Shorthand for vec_max_virtualisable_vl(ARM64_VEC_SVE). */
static inline int sve_max_virtualisable_vl(void)
{
        return vec_max_virtualisable_vl(ARM64_VEC_SVE);
}

/*
 * Test whether @vq is marked in the vq_map bitmap of vector type @type.
 *
 * Ensure vq >= SVE_VQ_MIN && vq <= SVE_VQ_MAX before calling this function;
 * no range check is done here.
 */
static inline bool vq_available(enum vec_type type, unsigned int vq)
{
        unsigned int bit = __vq_to_bit(vq);

        return test_bit(bit, vl_info[type].vq_map);
}

/*
 * Shorthand for vq_available(ARM64_VEC_SVE, vq); the same range
 * precondition on @vq applies.
 */
static inline bool sve_vq_available(unsigned int vq)
{
        return vq_available(ARM64_VEC_SVE, vq);
}

/* Defined outside this header; the !CONFIG_ARM64_SVE stub returns 0. */
size_t sve_state_size(struct task_struct const *task);

#else /* ! CONFIG_ARM64_SVE */

/* Without CONFIG_ARM64_SVE these operations are empty stubs. */
static inline void sve_alloc(struct task_struct *task, bool flush) { }
static inline void fpsimd_release_task(struct task_struct *task) { }
static inline void sve_sync_to_fpsimd(struct task_struct *task) { }
static inline void sve_sync_from_fpsimd_zeropad(struct task_struct *task) { }

/* !CONFIG_ARM64_SVE stub: always reports a virtualisable VL of 0. */
static inline int sve_max_virtualisable_vl(void) { return 0; }

/* !CONFIG_ARM64_SVE stub: @arg is ignored and -EINVAL is always returned. */
static inline int sve_set_current_vl(unsigned long arg) { return -EINVAL; }

/* !CONFIG_ARM64_SVE stub: always returns -EINVAL. */
static inline int sve_get_current_vl(void) { return -EINVAL; }

/*
 * !CONFIG_ARM64_SVE stub: always returns -EINVAL.
 * NOTE(review): the !CONFIG_ARM64_SME counterpart sme_max_vl() returns 0
 * instead; confirm that the difference is intended.
 */
static inline int sve_max_vl(void) { return -EINVAL; }

/* !CONFIG_ARM64_SVE stub: no VQ is ever available. */
static inline bool sve_vq_available(unsigned int vq) { return false; }

/*
 * These bodies are BUILD_BUG(), so they are presumably never meant to be
 * reached in a !CONFIG_ARM64_SVE build -- confirm BUILD_BUG() semantics.
 */
static inline void sve_user_disable(void) { BUILD_BUG(); }
static inline void sve_user_enable(void) { BUILD_BUG(); }

/* !CONFIG_ARM64_SVE: expands to an empty statement; arguments are unused. */
#define sve_cond_update_zcr_vq(val, reg) do { } while (0)

/* Probing and setup stubs: nothing to do, verification reports 0. */
static inline void vec_init_vq_map(enum vec_type t) { }
static inline void vec_update_vq_map(enum vec_type t) { }
static inline int vec_verify_vq_map(enum vec_type t) { return 0; }
static inline void sve_setup(void) { }

/* !CONFIG_ARM64_SVE stub: @task is ignored and the size is always 0. */
static inline size_t sve_state_size(struct task_struct const *task) { return 0; }

#endif /* ! CONFIG_ARM64_SVE */

#ifdef CONFIG_ARM64_SME

/* Clear the CPACR_EL1_SMEN_EL0EN bit in CPACR_EL1; no bits are set. */
static inline void sme_user_disable(void)
{
        sysreg_clear_set(cpacr_el1, CPACR_EL1_SMEN_EL0EN, 0);
}

/* Set the CPACR_EL1_SMEN_EL0EN bit in CPACR_EL1; no bits are cleared. */
static inline void sme_user_enable(void)
{
        sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_SMEN_EL0EN);
}

/*
 * Emit the __msr_s() encoding for SYS_SVCR_SMSTART_SM_EL0 with xzr as the
 * source register.  Going by the name, this is SMSTART SM -- TODO confirm.
 */
static inline void sme_smstart_sm(void)
{
        asm volatile(__msr_s(SYS_SVCR_SMSTART_SM_EL0, "xzr"));
}

/*
 * Emit the __msr_s() encoding for SYS_SVCR_SMSTOP_SM_EL0 with xzr as the
 * source register.  Going by the name, this is SMSTOP SM -- TODO confirm.
 */
static inline void sme_smstop_sm(void)
{
        asm volatile(__msr_s(SYS_SVCR_SMSTOP_SM_EL0, "xzr"));
}

/*
 * Emit the __msr_s() encoding for SYS_SVCR_SMSTOP_SMZA_EL0 with xzr as the
 * source register.  Going by the name, this stops both SM and ZA --
 * TODO confirm.
 */
static inline void sme_smstop(void)
{
        asm volatile(__msr_s(SYS_SVCR_SMSTOP_SMZA_EL0, "xzr"));
}

/* Init-time SME setup; defined outside this header. */
extern void __init sme_setup(void);

/* Shorthand for vec_max_vl(ARM64_VEC_SME). */
static inline int sme_max_vl(void)
{
        return vec_max_vl(ARM64_VEC_SME);
}

/* Shorthand for vec_max_virtualisable_vl(ARM64_VEC_SME). */
static inline int sme_max_virtualisable_vl(void)
{
        return vec_max_virtualisable_vl(ARM64_VEC_SME);
}

/*
 * SME entry points (CONFIG_ARM64_SME only).  Declarations only; the
 * definitions are not part of this header.
 */
extern void sme_alloc(struct task_struct *task, bool flush);
extern unsigned int sme_get_vl(void);
extern int sme_set_current_vl(unsigned long arg);
extern int sme_get_current_vl(void);
extern void sme_suspend_exit(void);

/*
 * Number of bytes needed to hold the full SME-specific state of @task at
 * the task's currently configured SME vector length: the ZA register
 * area, plus ZT_SIG_REG_SIZE when the system supports SME2.
 */
static inline size_t sme_state_size(struct task_struct const *task)
{
        unsigned int vl = task_get_sme_vl(task);
        size_t bytes = ZA_SIG_REGS_SIZE(sve_vq_from_vl(vl));

        if (!system_supports_sme2())
                return bytes;

        return bytes + ZT_SIG_REG_SIZE;
}

#else /* ! CONFIG_ARM64_SME */

/*
 * These bodies are BUILD_BUG(), so they are presumably never meant to be
 * reached in a !CONFIG_ARM64_SME build -- confirm BUILD_BUG() semantics.
 */
static inline void sme_user_disable(void) { BUILD_BUG(); }
static inline void sme_user_enable(void) { BUILD_BUG(); }

/* Without SME the SMSTART/SMSTOP helpers are empty stubs. */
static inline void sme_smstart_sm(void) { }
static inline void sme_smstop_sm(void) { }
static inline void sme_smstop(void) { }

/*
 * Remaining stubs: empty bodies, 0 for the vector-length queries, and
 * -EINVAL for getting or setting the current VL.
 */
static inline void sme_alloc(struct task_struct *task, bool flush) { }
static inline void sme_setup(void) { }
static inline unsigned int sme_get_vl(void) { return 0; }
static inline int sme_max_vl(void) { return 0; }
static inline int sme_max_virtualisable_vl(void) { return 0; }
static inline int sme_set_current_vl(unsigned long arg) { return -EINVAL; }
static inline int sme_get_current_vl(void) { return -EINVAL; }
static inline void sme_suspend_exit(void) { }

/* !CONFIG_ARM64_SME stub: @task is ignored and the size is always 0. */
static inline size_t sme_state_size(struct task_struct const *task) { return 0; }

#endif /* ! CONFIG_ARM64_SME */

/*
 * For use by EFI runtime services calls only.
 * Declarations only; the pair is defined outside this header.
 */
extern void __efi_fpsimd_begin(void);
extern void __efi_fpsimd_end(void);

#endif /* ! __ASSEMBLY__ */

#endif /* __ASM_FP_H */














































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_COMPAT_H
#define __ASM_COMPAT_H

/*
 * Architecture overrides of compat types.  Each self-referential #define
 * marks the name as already provided before <asm-generic/compat.h> is
 * included below (presumably so the generic header skips its own
 * definition -- confirm).
 *
 * NOTE(review): u16 is used here before any #include in this header, so
 * the includer must already have made the type available.
 */
#define compat_mode_t compat_mode_t
typedef u16                compat_mode_t;

#define __compat_uid_t        __compat_uid_t
typedef u16                __compat_uid_t;
typedef u16                __compat_gid_t;

#define compat_ipc_pid_t compat_ipc_pid_t
typedef u16                compat_ipc_pid_t;

/* struct compat_statfs is defined further down, under CONFIG_COMPAT. */
#define compat_statfs        compat_statfs

#include <asm-generic/compat.h>

#ifdef CONFIG_COMPAT

/*
 * Architecture specific compatibility types
 */
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>

/*
 * Machine name for compat tasks: "armv8b" when __AARCH64EB__ is defined,
 * "armv8l" otherwise.  The two explicit NULs pad the literal to eight
 * bytes before the implicit terminator.
 */
#ifdef __AARCH64EB__
#define COMPAT_UTS_MACHINE        "armv8b\0\0"
#else
#define COMPAT_UTS_MACHINE        "armv8l\0\0"
#endif

/* 16-bit uid/gid types and a signed 32-bit link-count type. */
typedef u16                __compat_uid16_t;
typedef u16                __compat_gid16_t;
typedef s32                compat_nlink_t;

/*
 * stat structure as laid out for compat tasks.
 *
 * When __AARCH64EB__ is defined, st_dev and st_rdev are each declared as a
 * short followed by a short of padding instead of a single compat_dev_t.
 * The field order and types define the layout, so do not reorder them.
 */
struct compat_stat {
#ifdef __AARCH64EB__
        short                st_dev;
        short                __pad1;
#else
        compat_dev_t        st_dev;
#endif
        compat_ino_t        st_ino;
        compat_mode_t        st_mode;
        compat_ushort_t        st_nlink;
        __compat_uid16_t        st_uid;
        __compat_gid16_t        st_gid;
#ifdef __AARCH64EB__
        short                st_rdev;
        short                __pad2;
#else
        compat_dev_t        st_rdev;
#endif
        compat_off_t        st_size;
        compat_off_t        st_blksize;
        compat_off_t        st_blocks;
        /* Each timestamp is a seconds value followed by a nanoseconds field. */
        old_time32_t        st_atime;
        compat_ulong_t        st_atime_nsec;
        old_time32_t        st_mtime;
        compat_ulong_t        st_mtime_nsec;
        old_time32_t        st_ctime;
        compat_ulong_t        st_ctime_nsec;
        compat_ulong_t        __unused4[2];
};

/*
 * statfs structure as laid out for compat tasks.  Every field is an int
 * except f_fsid; the field order defines the layout, so do not reorder.
 */
struct compat_statfs {
        int                f_type;
        int                f_bsize;
        int                f_blocks;
        int                f_bfree;
        int                f_bavail;
        int                f_files;
        int                f_ffree;
        compat_fsid_t        f_fsid;
        int                f_namelen;        /* SunOS ignores this field. */
        int                f_frsize;
        int                f_flags;
        int                f_spare[4];
};

/* User stack pointer taken from the current task's saved pt_regs. */
#define compat_user_stack_pointer() (user_stack_pointer(task_pt_regs(current)))
/* Minimum signal stack size for compat tasks (2048; presumably bytes). */
#define COMPAT_MINSIGSTKSZ        2048

/* Result of test_thread_flag(TIF_32BIT): whether TIF_32BIT is set. */
static inline int is_compat_task(void)
{
        return test_thread_flag(TIF_32BIT);
}

/* Result of test_ti_thread_flag(thread, TIF_32BIT) for @thread. */
static inline int is_compat_thread(struct thread_info *thread)
{
        return test_ti_thread_flag(thread, TIF_32BIT);
}

/* Takes the register state and a syscall number; defined outside this header. */
long compat_arm_syscall(struct pt_regs *regs, int scno);

#else /* !CONFIG_COMPAT */

/* !CONFIG_COMPAT stub: @thread is ignored and the answer is always 0. */
static inline int is_compat_thread(struct thread_info *thread) { return 0; }

#endif /* CONFIG_COMPAT */
#endif /* __ASM_COMPAT_H */






































































































































   46 

   46 
   46 



   46 









   46 


   46 



























   46 




   46 




















   46 










   46 




   46 



   46 



   46 


   46 











   46 







   46 



























































































































































































































































































































































































































































































































































































































    4 
    4 


    4 



    4 

    4 

    4 

    4 


    4 








    4 















































    4 














    4 











    4 




    4 
















    4 
    4 









    4 













    4 
























































































































    1 
















    1 




























    1 




    1 




















    1 
    1 



























    2 










    2 









    2 

































    2 
    2 
    2 










    2 











    2 




    2 




















    2 


    2 











    2 















































































































































  275 




  275 
    1 


















    4 





    4 



    4 





































































































































    2 




























































































































































































  168 



  167 



  164 











   32 




  168 






















   47 






   47 


    4 




















    4 





    4 


   47 







































































    4 












   86 



  126 













































   56 



   56 
   56 





   56 










   56 


























   56 



   56 
   56 




   45 







   46 




   46 























   56 




















   85 

































































































   86 



   86 
   86 
























   86 











































































































































































































































































































































































































































































































































































































































































































































































































































































    4 






































    4 





















































    4 






































    4 


















    4 














































    4 














    4 






















    4 









    4 





    4 







    4 








    4 

































    4 













































    4 

    4 

    4 










   95 





    5 









   95 
    5 






  159 




  159 












    4 

   96 






   95 





    4 
    4 


























































































   96 







   88 






  160 











  160 

   82 


   96 























   96 



   96 







   96 
   87 

   96 
   96 

  159 




































































































































































































































































































































   27 





















   27 









   27 
   27 


   27 
    2 












   27 

























   23 




   26 


   26 

    1 

   22 






















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
#include <linux/sysctl.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem                (truncate_pagecache)
 *    ->private_lock                (__free_pte->block_dirty_folio)
 *      ->swap_lock                (exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock                (acquired by fs in truncate path)
 *      ->i_mmap_rwsem                (truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock        (various, mainly in memory.c)
 *        ->i_pages lock        (arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock                (filemap_fault)
 *      ->lock_page                (filemap_fault, access_process_vm)
 *
 *  ->i_rwsem                        (generic_perform_write)
 *    ->mmap_lock                (fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock                        (fs/fs-writeback.c)
 *    ->i_pages lock                (__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock                (vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock        (anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock                (try_to_unmap_one)
 *    ->private_lock                (try_to_unmap_one)
 *    ->i_pages lock                (try_to_unmap_one)
 *    ->lruvec->lru_lock        (follow_page_mask->mark_page_accessed)
 *    ->lruvec->lru_lock        (check_pte_range->folio_isolate_lru)
 *    ->private_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->i_pages lock                (folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    ->inode->i_lock                (folio_remove_rmap_pte->set_page_dirty)
 *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
 *    ->inode->i_lock                (zap_pte_range->set_page_dirty)
 *    ->private_lock                (zap_pte_range->block_dirty_folio)
 */

/*
 * Replace @folio's entry in @mapping->i_pages with @shadow and detach the
 * folio from the mapping.
 *
 * The store is set up with the folio's order, the marks on the entry are
 * re-initialised, folio->mapping is cleared and mapping->nrpages is reduced
 * by the number of pages in the folio.  The folio must be locked.
 */
static void page_cache_delete(struct address_space *mapping,
                                   struct folio *folio, void *shadow)
{
        XA_STATE(xas, &mapping->i_pages, folio->index);
        long nr_pages;

        mapping_set_update(&xas, mapping);

        /* Set up the xa_state with the folio's index and order. */
        xas_set_order(&xas, folio->index, folio_order(folio));
        nr_pages = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        xas_store(&xas, shadow);
        xas_init_marks(&xas);

        /* folio->index stays set: truncation lookup relies upon it */
        folio->mapping = NULL;
        mapping->nrpages -= nr_pages;
}

/*
 * filemap_unaccount_folio - undo the page cache statistics of a folio that
 * is being removed from @mapping.
 *
 * A folio that is still mapped is reported first (VM_BUG_ON_FOLIO, or an
 * alert, page dump, stack dump and taint when CONFIG_DEBUG_VM is disabled).
 * Then the folio's page count is subtracted from NR_FILE_PAGES and, where
 * applicable, from NR_SHMEM, NR_SHMEM_THPS or NR_FILE_THPS.  hugetlb folios
 * return before any statistic or dirty accounting is touched.  Finally, a
 * folio that is still dirty on a mapping that can do writeback triggers a
 * one-time warning and has its dirty accounting cleaned up.
 */
static void filemap_unaccount_folio(struct address_space *mapping,
                struct folio *folio)
{
        long nr;

        VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
        /* Without CONFIG_DEBUG_VM, report the still-mapped folio by hand. */
        if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
                pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
                         current->comm, folio_pfn(folio));
                dump_page(&folio->page, "still mapped when deleted");
                dump_stack();
                add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

                /* Recovery is only attempted for small folios of an exiting mapping. */
                if (mapping_exiting(mapping) && !folio_test_large(folio)) {
                        int mapcount = folio_mapcount(folio);

                        if (folio_ref_count(folio) >= mapcount + 2) {
                                /*
                                 * All vmas have already been torn down, so it's
                                 * a good bet that actually the page is unmapped
                                 * and we'd rather not leak it: if we're wrong,
                                 * another bad page check should catch it later.
                                 */
                                atomic_set(&folio->_mapcount, -1);
                                folio_ref_sub(folio, mapcount);
                        }
                }
        }

        /* hugetlb folios do not participate in page cache accounting. */
        if (folio_test_hugetlb(folio))
                return;

        nr = folio_nr_pages(folio);

        /* Every counter below is adjusted by the folio's full page count. */
        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
        if (folio_test_swapbacked(folio)) {
                __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
                if (folio_test_pmd_mappable(folio))
                        __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
        } else if (folio_test_pmd_mappable(folio)) {
                __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
                filemap_nr_thps_dec(mapping);
        }

        /*
         * At this point folio must be either written or cleaned by
         * truncate.  Dirty folio here signals a bug and loss of
         * unwritten data - on ordinary filesystems.
         *
         * But it's harmless on in-memory filesystems like tmpfs; and can
         * occur when a driver which did get_user_pages() sets page dirty
         * before putting it, while the inode is being finally evicted.
         *
         * Below fixes dirty accounting after removing the folio entirely
         * but leaves the dirty flag set: it has no effect for truncated
         * folio and anyway will be cleared before returning folio to
         * buddy allocator.
         */
        if (WARN_ON_ONCE(folio_test_dirty(folio) &&
                         mapping_can_writeback(mapping)))
                folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a folio from the page cache.  Caller has to make sure the folio
 * is locked and that nobody else uses it - or that usage is safe.  The
 * caller must hold the i_pages lock.
 *
 * @shadow is the entry stored in the page cache in place of the folio
 * (callers in this file pass NULL).
 *
 * Nothing is freed here: the callers in this file follow up with
 * filemap_free_folio() once the locks have been dropped.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
        /* Read the mapping up front: page_cache_delete() clears folio->mapping. */
        struct address_space *mapping = folio->mapping;

        trace_mm_filemap_delete_from_page_cache(folio);
        filemap_unaccount_folio(mapping, folio);
        page_cache_delete(mapping, folio, shadow);
}

/*
 * Finish the removal of @folio from @mapping: give the address space a
 * chance to clean up through its optional ->free_folio() hook, then drop
 * one reference per page of the folio.
 */
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
        void (*release)(struct folio *) = mapping->a_ops->free_folio;

        /* The hook is optional; not every address space provides one. */
        if (release)
                release(folio);

        folio_put_refs(folio, folio_nr_pages(folio));
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
        struct address_space *mapping = folio->mapping;

        BUG_ON(!folio_test_locked(folio));
        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        __filemap_remove_folio(folio, NULL);
        xa_unlock_irq(&mapping->i_pages);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping. The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
 *
 * @fbatch must not be empty: the walk starts at the index of its first
 * folio.  Removed folios are replaced by NULL (no shadow entry is stored),
 * their ->mapping is cleared, and mapping->nrpages is adjusted once, after
 * the walk.
 *
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
                             struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
        long total_pages = 0;
        int i = 0;      /* index of the next batch folio still to be removed */
        struct folio *folio;

        mapping_set_update(&xas, mapping);
        xas_for_each(&xas, folio, ULONG_MAX) {
                /* Every folio of the batch has been handled: stop walking. */
                if (i >= folio_batch_count(fbatch))
                        break;

                /* A swap/dax/shadow entry got inserted? Skip it. */
                if (xa_is_value(folio))
                        continue;
                /*
                 * A page got inserted in our range? Skip it. We have our
                 * pages locked so they are protected from being removed.
                 * If we see a page whose index is higher than ours, it
                 * means our page has been removed, which shouldn't be
                 * possible because we're holding the PageLock.
                 */
                if (folio != fbatch->folios[i]) {
                        VM_BUG_ON_FOLIO(folio->index >
                                        fbatch->folios[i]->index, folio);
                        continue;
                }

                WARN_ON_ONCE(!folio_test_locked(folio));

                folio->mapping = NULL;
                /* Leave folio->index set: truncation lookup relies on it */

                i++;
                xas_store(&xas, NULL);
                total_pages += folio_nr_pages(folio);
        }
        mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
                                  struct folio_batch *fbatch)
{
        int i;

        if (!folio_batch_count(fbatch))
                return;

        spin_lock(&mapping->host->i_lock);
        xa_lock_irq(&mapping->i_pages);
        for (i = 0; i < folio_batch_count(fbatch); i++) {
                struct folio *folio = fbatch->folios[i];

                trace_mm_filemap_delete_from_page_cache(folio);
                filemap_unaccount_folio(mapping, folio);
        }
        page_cache_delete_batch(mapping, fbatch);
        xa_unlock_irq(&mapping->i_pages);
        if (mapping_shrinkable(mapping))
                inode_add_lru(mapping->host);
        spin_unlock(&mapping->host->i_lock);

        for (i = 0; i < folio_batch_count(fbatch); i++)
                filemap_free_folio(mapping, fbatch->folios[i]);
}

/*
 * filemap_check_errors - report and clear outstanding write errors
 * @mapping: address space to check
 *
 * Both AS_ENOSPC and AS_EIO are cleared if they were set.
 *
 * Return: -EIO if AS_EIO was set, otherwise -ENOSPC if AS_ENOSPC was set,
 * otherwise 0.
 */
int filemap_check_errors(struct address_space *mapping)
{
        int err = 0;

        /*
         * Each flag is tested with a plain test_bit() before the
         * test_and_clear_bit(); presumably this avoids the atomic
         * read-modify-write in the common case where no error is pending.
         */
        if (test_bit(AS_ENOSPC, &mapping->flags)) {
                if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
                        err = -ENOSPC;
        }

        /* Checked last so that -EIO overrides -ENOSPC when both are set. */
        if (test_bit(AS_EIO, &mapping->flags)) {
                if (test_and_clear_bit(AS_EIO, &mapping->flags))
                        err = -EIO;
        }

        return err;
}
EXPORT_SYMBOL(filemap_check_errors);

/*
 * Report an outstanding write error on @mapping without clearing it.
 * -EIO takes precedence over -ENOSPC; 0 means neither flag is set.
 */
static int filemap_check_and_keep_errors(struct address_space *mapping)
{
        if (test_bit(AS_EIO, &mapping->flags))
                return -EIO;

        return test_bit(AS_ENOSPC, &mapping->flags) ? -ENOSPC : 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @wbc:        the writeback_control controlling the writeout
 *
 * Hand @mapping to do_writepages() with the caller-supplied @wbc.  Nothing
 * is done, and %0 is returned, when the mapping cannot do writeback or has
 * no entry tagged dirty.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
                           struct writeback_control *wbc)
{
        int err;

        if (!mapping_can_writeback(mapping))
                return 0;
        if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
                return 0;

        /* The wbc is attached to the inode only for the duration of the call. */
        wbc_attach_fdatawrite_inode(wbc, mapping->host);
        err = do_writepages(mapping, wbc);
        wbc_detach_inode(wbc);

        return err;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:        address space structure to write
 * @start:        offset in bytes where the range starts
 * @end:        offset in bytes where the range ends (inclusive)
 * @sync_mode:        enable synchronous operation
 *
 * Kick off writeback for every dirty page of @mapping that lies within the
 * inclusive byte range <start, end>.  No limit is placed on the number of
 * pages written (nr_to_write is LONG_MAX).
 *
 * With a @sync_mode of WB_SYNC_ALL this is a "data integrity" operation
 * rather than a regular memory cleansing writeback: a dirty page/buffer
 * that is encountered must be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end, int sync_mode)
{
        struct writeback_control ctl = {
                .range_start = start,
                .range_end = end,
                .nr_to_write = LONG_MAX,
                .sync_mode = sync_mode,
        };

        return filemap_fdatawrite_wbc(mapping, &ctl);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
        int sync_mode)
{
        return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
        return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

/*
 * filemap_fdatawrite_range - start data-integrity (WB_SYNC_ALL) writeback
 * on the inclusive byte range <start, end> of @mapping.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
                                loff_t end)
{
        const int sync_mode = WB_SYNC_ALL;

        return __filemap_fdatawrite_range(mapping, start, end, sync_mode);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);

/**
 * filemap_fdatawrite_range_kick - start writeback on a range
 * @mapping:        target address_space
 * @start:        offset in bytes where the range starts
 * @end:        offset in bytes where the range ends (inclusive)
 *
 * This is a non-integrity (WB_SYNC_NONE) writeback helper, to start writing
 * back folios for the indicated range.  @start and @end are passed straight
 * through to __filemap_fdatawrite_range().
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
                                  loff_t end)
{
        const int sync_mode = WB_SYNC_NONE;

        return __filemap_fdatawrite_range(mapping, start, end, sync_mode);
}
EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:        target address_space
 *
 * Starts WB_SYNC_NONE writeback on the whole mapping.  This is a mostly
 * non-blocking flush and must not be relied upon for data integrity: I/O
 * may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
        const int sync_mode = WB_SYNC_NONE;

        return __filemap_fdatawrite(mapping, sync_mode);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Look for at least one page in the supplied range.  Usually used to check
 * whether direct writing in this range will trigger a writeback.  An
 * inverted range (@end_byte < @start_byte) contains no pages.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
                           loff_t start_byte, loff_t end_byte)
{
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        pgoff_t last = end_byte >> PAGE_SHIFT;
        struct folio *found;

        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        /*
         * Keep searching while the entry is a retry entry or a value entry
         * (shadow entries don't count).  The page is deliberately not
         * pinned: the RCU lock is released right away, and it is enough
         * to know that there was a page here recently.
         */
        do {
                found = xas_find(&xas, last);
        } while (xas_retry(&xas, found) || xa_is_value(found));
        rcu_read_unlock();

        return found != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);

/*
 * Wait for writeback to finish on every folio of @mapping that is tagged
 * PAGECACHE_TAG_WRITEBACK within the inclusive byte range
 * <start_byte, end_byte>.  Folios are processed one batch at a time, with a
 * reschedule point between batches.  No error status is examined here.
 */
static void __filemap_fdatawait_range(struct address_space *mapping,
                                     loff_t start_byte, loff_t end_byte)
{
        pgoff_t next = start_byte >> PAGE_SHIFT;
        pgoff_t last = end_byte >> PAGE_SHIFT;
        struct folio_batch fbatch;

        folio_batch_init(&fbatch);

        while (next <= last) {
                unsigned found;
                unsigned n;

                /* The lookup advances @next past the folios it returned. */
                found = filemap_get_folios_tag(mapping, &next, last,
                                PAGECACHE_TAG_WRITEBACK, &fbatch);
                if (!found)
                        break;

                for (n = 0; n < found; n++)
                        folio_wait_writeback(fbatch.folios[n]);

                folio_batch_release(&fbatch);
                cond_resched();
        }
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Wait for every under-writeback page of the given address space that lies
 * in the given range, then check the error status of the address space and
 * return it.
 *
 * The error status of the address space is cleared by this function, so
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                            loff_t end_byte)
{
        int err;

        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        err = filemap_check_errors(mapping);

        return err;
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:                address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Wait for every under-writeback page of the given address space that lies
 * in the given range.  Unlike filemap_fdatawait_range(), the error status
 * of the address space is left in place rather than cleared.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
                loff_t start_byte, loff_t end_byte)
{
        int err;

        __filemap_fdatawait_range(mapping, start_byte, end_byte);
        err = filemap_check_and_keep_errors(mapping);

        return err;
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:                file pointing to address space structure to wait for
 * @start_byte:                offset in bytes where the range starts
 * @end_byte:                offset in bytes where the range ends (inclusive)
 *
 * Wait for every under-writeback page, in the given range, of the address
 * space that @file refers to.  Then check the error status of the address
 * space against the file->f_wb_err cursor and return it.
 *
 * The error status of the file is advanced by this function, so callers
 * are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
        __filemap_fdatawait_range(file->f_mapping, start_byte, end_byte);

        return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Wait for all under-writeback pages of @mapping, covering byte offsets
 * 0 through LLONG_MAX.  In contrast to filemap_fdatawait(), the error
 * status of the address space is not cleared.
 *
 * Intended for callers that do not handle errors themselves, i.e.
 * system-wide / filesystem-wide data flushers such as sync(2) and
 * fsfreeze(8).
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
        int err;

        __filemap_fdatawait_range(mapping, 0, LLONG_MAX);
        err = filemap_check_and_keep_errors(mapping);

        return err;
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/*
 * Returns true if writeback might be needed or already in progress.
 * The test is simply whether the mapping holds any pages at all.
 */
static bool mapping_needs_writeback(struct address_space *mapping)
{
        return mapping->nrpages != 0;
}

/*
 * Report whether any folio cached for the byte range
 * [@start_byte, @end_byte] (inclusive) is dirty, locked or under
 * writeback.  An inverted range (end before start) yields false.
 * The lookup runs under rcu_read_lock() only.
 */
bool filemap_range_has_writeback(struct address_space *mapping,
                                 loff_t start_byte, loff_t end_byte)
{
        XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
        pgoff_t last = end_byte >> PAGE_SHIFT;
        struct folio *folio;
        bool busy = false;

        if (end_byte < start_byte)
                return false;

        rcu_read_lock();
        xas_for_each(&xas, folio, last) {
                /* Skip retry entries and XArray value entries. */
                if (xas_retry(&xas, folio) || xa_is_value(folio))
                        continue;
                if (folio_test_dirty(folio) || folio_test_locked(folio) ||
                    folio_test_writeback(folio)) {
                        busy = true;
                        break;
                }
        }
        rcu_read_unlock();

        return busy;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:        the address_space for the pages
 * @lstart:         offset in bytes where the range starts
 * @lend:           offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Because @lend names the last byte to be written (it is inclusive), this
 * function can be used to write to the very end-of-file (end = -1).
 * A range with @lend < @lstart is treated as empty and returns 0.
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
                                 loff_t lstart, loff_t lend)
{
        int write_err = 0;
        int check_err;

        if (lend < lstart)
                return 0;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * A failed write-out may still have written some pages
                 * (e.g. -ENOSPC), so wait for them anyway.  -EIO is the
                 * exception: it may indicate the worst thing (e.g. a bug)
                 * happened, so do not wait in that case.
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /* Always consult the mapping; a write-out error takes precedence. */
        check_err = filemap_check_errors(mapping);

        return write_err ? write_err : check_err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);

/*
 * Record @err in @mapping->wb_err via errseq_set() and emit the
 * filemap_set_wb_err trace event with the value errseq_set() returned.
 */
void __filemap_set_wb_err(struct address_space *mapping, int err)
{
        errseq_t seq;

        seq = errseq_set(&mapping->wb_err, err);
        trace_filemap_set_wb_err(mapping, seq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report a previously recorded wb error
 *                                 (if any) and advance wb_err to the
 *                                 current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
        struct address_space *mapping = file->f_mapping;
        errseq_t cursor = READ_ONCE(file->f_wb_err);
        int ret = 0;

        /* Lockless fast path: nothing to do while the cursor is current. */
        if (errseq_check(&mapping->wb_err, cursor) != 0) {
                /* Slow path: re-read and advance the cursor under f_lock. */
                spin_lock(&file->f_lock);
                cursor = file->f_wb_err;
                ret = errseq_check_and_advance(&mapping->wb_err,
                                               &file->f_wb_err);
                trace_file_check_and_advance_wb_err(file, cursor);
                spin_unlock(&file->f_lock);
        }

        /*
         * This function mostly serves as a drop-in replacement for
         * filemap_check_errors, so clear AS_EIO/AS_ENOSPC to emulate the
         * effect the legacy code would have had on these flags.
         */
        clear_bit(AS_EIO, &mapping->flags);
        clear_bit(AS_ENOSPC, &mapping->flags);

        return ret;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:           file pointing to address_space with pages
 * @lstart:         offset in bytes where the range starts
 * @lend:           offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Because @lend names the last byte to be written (it is inclusive), this
 * function can be used to write to the very end-of-file (end = -1).
 * A range with @lend < @lstart is treated as empty and returns 0.
 *
 * Once the data has been written out and waited on, the f_wb_err cursor is
 * checked and advanced to the latest value, and any error detected there
 * is returned.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
        struct address_space *mapping = file->f_mapping;
        int write_err = 0;
        int wb_err;

        if (lend < lstart)
                return 0;

        if (mapping_needs_writeback(mapping)) {
                write_err = __filemap_fdatawrite_range(mapping, lstart, lend,
                                                       WB_SYNC_ALL);
                /*
                 * Skip the wait only for -EIO; see the comment in
                 * filemap_write_and_wait_range().
                 */
                if (write_err != -EIO)
                        __filemap_fdatawait_range(mapping, lstart, lend);
        }

        /* Always advance the cursor; a write-out error takes precedence. */
        wb_err = file_check_and_advance_wb_err(file);

        return write_err ? write_err : wb_err;
}
EXPORT_SYMBOL(file_write_and_wait_range);

/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:        folio to be replaced
 * @new:        folio to replace with
 *
 * Swap @new into the pagecache slot currently occupied by @old.  The
 * pagecache reference is taken on @new and dropped on @old.  Both folios
 * must be locked by the caller.  @new is not added to the LRU here; that
 * is left to the caller.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
        struct address_space *mapping = old->mapping;
        void (*release)(struct folio *) = mapping->a_ops->free_folio;
        pgoff_t idx = old->index;
        XA_STATE(xas, &mapping->i_pages, idx);

        VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
        VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
        VM_BUG_ON_FOLIO(new->mapping, new);

        /* Take the pagecache reference and give @new its identity. */
        folio_get(new);
        new->mapping = mapping;
        new->index = idx;

        mem_cgroup_replace_folio(old, new);

        xas_lock_irq(&xas);
        xas_store(&xas, new);

        old->mapping = NULL;

        /* hugetlb pages do not participate in page cache accounting. */
        if (!folio_test_hugetlb(old))
                __lruvec_stat_sub_folio(old, NR_FILE_PAGES);
        if (!folio_test_hugetlb(new))
                __lruvec_stat_add_folio(new, NR_FILE_PAGES);
        if (folio_test_swapbacked(old))
                __lruvec_stat_sub_folio(old, NR_SHMEM);
        if (folio_test_swapbacked(new))
                __lruvec_stat_add_folio(new, NR_SHMEM);
        xas_unlock_irq(&xas);

        /* Optional a_ops->free_folio hook, then drop @old's reference. */
        if (release)
                release(old);
        folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

/*
 * __filemap_add_folio - insert a locked folio into a mapping's i_pages
 * @mapping: address space to insert into
 * @folio:   folio to insert; must be locked, must not be swapbacked, and
 *           its order must be at least mapping_min_folio_order(@mapping)
 * @index:   page index for the folio; must be aligned to the folio's size
 *           in pages
 * @gfp:     allocation flags; masked with GFP_RECLAIM_MASK before use
 * @shadowp: if non-NULL, receives the value entry (if any) that previously
 *           occupied the slot
 *
 * Takes folio_nr_pages(@folio) references on the folio and sets
 * folio->mapping / folio->index before attempting the store.  On failure
 * those references are dropped again and folio->mapping is reset to NULL.
 *
 * Return: 0 on success, otherwise the error recorded in the xa_state
 * (-EEXIST when a non-value entry already occupies the range).
 */
noinline int __filemap_add_folio(struct address_space *mapping,
                struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
        XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
        bool huge;
        long nr;
        unsigned int forder = folio_order(folio);

        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
        VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping),
                        folio);
        mapping_set_update(&xas, mapping);

        /* The index must be naturally aligned for the folio's size. */
        VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
        huge = folio_test_hugetlb(folio);
        nr = folio_nr_pages(folio);

        gfp &= GFP_RECLAIM_MASK;
        /* One reference per page; undone at the error label below. */
        folio_ref_add(folio, nr);
        folio->mapping = mapping;
        folio->index = xas.xa_index;

        /*
         * Each pass takes the xarray lock, attempts the insert and drops
         * the lock.  The loop repeats for as long as xas_nomem() returns
         * true (presumably after allocating the memory the failed attempt
         * needed -- confirm against the XArray API).
         */
        for (;;) {
                int order = -1;
                void *entry, *old = NULL;

                xas_lock_irq(&xas);
                xas_for_each_conflict(&xas, entry) {
                        old = entry;
                        /* A non-value entry already occupies the range. */
                        if (!xa_is_value(entry)) {
                                xas_set_err(&xas, -EEXIST);
                                goto unlock;
                        }
                        /*
                         * If a larger entry exists,
                         * it will be the first and only entry iterated.
                         */
                        if (order == -1)
                                order = xas_get_order(&xas);
                }

                if (old) {
                        /*
                         * The existing value entry has a larger order than
                         * the folio: split it, step by step, until it is
                         * down to the folio's order.
                         */
                        if (order > 0 && order > forder) {
                                unsigned int split_order = max(forder,
                                                xas_try_split_min_order(order));

                                /* How to handle large swap entries? */
                                BUG_ON(shmem_mapping(mapping));

                                while (order > forder) {
                                        xas_set_order(&xas, index, split_order);
                                        xas_try_split(&xas, old, order);
                                        if (xas_error(&xas))
                                                goto unlock;
                                        order = split_order;
                                        split_order =
                                                max(xas_try_split_min_order(
                                                            split_order),
                                                    forder);
                                }
                                xas_reset(&xas);
                        }
                        /* Hand the replaced value entry back to the caller. */
                        if (shadowp)
                                *shadowp = old;
                }

                xas_store(&xas, folio);
                if (xas_error(&xas))
                        goto unlock;

                mapping->nrpages += nr;

                /* hugetlb pages do not participate in page cache accounting */
                if (!huge) {
                        __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
                        if (folio_test_pmd_mappable(folio))
                                __lruvec_stat_mod_folio(folio,
                                                NR_FILE_THPS, nr);
                }

unlock:
                xas_unlock_irq(&xas);

                if (!xas_nomem(&xas, gfp))
                        break;
        }

        if (xas_error(&xas))
                goto error;

        trace_mm_filemap_add_to_page_cache(folio);
        return 0;
error:
        folio->mapping = NULL;
        /* Leave page->index set: truncation relies upon it */
        folio_put_refs(folio, nr);
        return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

/*
 * Charge @folio to the memory cgroup, lock it and insert it into
 * @mapping at @index.  On success the folio is also put on the LRU; on
 * failure the charge and the lock bit are undone.
 *
 * Returns 0 on success or the error from mem_cgroup_charge() /
 * __filemap_add_folio().
 */
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
                                pgoff_t index, gfp_t gfp)
{
        void *shadow = NULL;
        int err;

        err = mem_cgroup_charge(folio, NULL, gfp);
        if (err)
                return err;

        __folio_set_locked(folio);
        err = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
        if (unlikely(err)) {
                /* Insertion failed: roll back the charge and the lock. */
                mem_cgroup_uncharge(folio);
                __folio_clear_locked(folio);
                return err;
        }

        /*
         * The folio might have been evicted from cache only recently, in
         * which case it should be activated like any other repeatedly
         * accessed folio.
         * Folios that are about to be rewritten are the exception:
         * evicting other data from the working set, only to cache data
         * that will get overwritten with something else, is a waste of
         * memory.
         */
        WARN_ON_ONCE(folio_test_active(folio));
        if (shadow && !(gfp & __GFP_WRITE))
                workingset_refault(folio, shadow);
        folio_add_lru(folio);

        return 0;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);

#ifdef CONFIG_NUMA
/*
 * Allocate a folio of the given order.  When cpuset page spreading is
 * enabled for the current context, the node is picked with
 * cpuset_mem_spread_node() and the allocation is retried for as long as
 * it fails and read_mems_allowed_retry() asks for another attempt.
 * Otherwise this is a plain folio_alloc_noprof().
 */
struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
{
        unsigned int cookie;
        struct folio *folio;
        int nid;

        if (!cpuset_do_page_mem_spread())
                return folio_alloc_noprof(gfp, order);

        do {
                cookie = read_mems_allowed_begin();
                nid = cpuset_mem_spread_node();
                folio = __folio_alloc_node_noprof(gfp, order, nid);
        } while (!folio && read_mems_allowed_retry(cookie));

        return folio;
}
EXPORT_SYMBOL(filemap_alloc_folio_noprof);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Take invalidate_lock exclusively on every passed mapping that is not
 * NULL.  The mappings are locked in ascending address order, and a mapping
 * passed twice is locked only once.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
                                 struct address_space *mapping2)
{
        struct address_space *first = mapping1;
        struct address_space *second = mapping2;

        /* Order by address so that the locking order is always the same. */
        if (first > second)
                swap(first, second);

        if (first)
                down_write(&first->invalidate_lock);
        if (second && second != first)
                down_write_nested(&second->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Release the exclusive invalidate_lock of every passed mapping that is
 * not NULL.  A mapping passed twice is unlocked only once.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
                                   struct address_space *mapping2)
{
        if (mapping1)
                up_write(&mapping1->invalidate_lock);

        /* Nothing more to do for a missing or duplicate second mapping. */
        if (!mapping2 || mapping2 == mapping1)
                return;

        up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 *
 * The table has 2^PAGE_WAIT_TABLE_BITS buckets; folio_waitqueue()
 * selects a bucket by hashing the folio pointer, and pagecache_init()
 * initialises every bucket.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

/* Map a folio to its wait-queue bucket by hashing the folio pointer. */
static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
        unsigned int bucket = hash_ptr(folio, PAGE_WAIT_TABLE_BITS);

        return &folio_wait_table[bucket];
}

/*
 * How many times do we accept lock stealing from under a waiter?
 *
 * folio_wait_bit_common() decrements a private copy of this value on each
 * exclusive wait attempt and requests lock handoff (WQ_FLAG_CUSTOM) once
 * it drops below zero.
 */
static int sysctl_page_lock_unfairness = 5;
/*
 * Exposes the value above as "page_lock_unfairness"; pagecache_init()
 * registers this table under "vm".
 */
static const struct ctl_table filemap_sysctl_table[] = {
        {
                .procname        = "page_lock_unfairness",
                .data                = &sysctl_page_lock_unfairness,
                .maxlen                = sizeof(sysctl_page_lock_unfairness),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                /* presumably the lower bound (0) enforced by the handler */
                .extra1                = SYSCTL_ZERO,
        }
};

/*
 * Boot-time setup: initialise every bucket of folio_wait_table, then call
 * page_writeback_init() and register the filemap sysctl table under "vm".
 */
void __init pagecache_init(void)
{
        int i = 0;

        while (i < PAGE_WAIT_TABLE_SIZE) {
                init_waitqueue_head(&folio_wait_table[i]);
                i++;
        }

        page_writeback_init();
        register_sysctl_init("vm", filemap_sysctl_table);
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *        We're just waiting for the bit to be released, and when a waker
 *        calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *        and remove it from the wait queue.
 *
 *        Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *        The waiter is waiting to get the lock, and only one waiter should
 *        be woken up to avoid any thundering herd behavior. We'll set the
 *        WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *        This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *        The waiter is waiting to get the bit, and additionally wants the
 *        lock to be transferred to it for fair lock behavior. If the lock
 *        cannot be taken, we stop walking the wait queue without waking
 *        the waiter.
 *
 *        This is the "fair lock handoff" case, and in addition to setting
 *        WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *        that it now has the lock.
 *
 * Return value of the wake function:
 *
 *   0         the entry does not match the key, or a non-exclusive waiter
 *             was woken;
 *   -1        an exclusive waiter found the bit still set (or lost the
 *             handoff race): stop walking the queue without waking it;
 *   non-zero  an exclusive waiter was woken.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
        unsigned int flags;
        struct wait_page_key *key = arg;
        struct wait_page_queue *wait_page
                = container_of(wait, struct wait_page_queue, wait);

        /* Entries hashed to this bucket for another folio/bit are skipped. */
        if (!wake_page_match(wait_page, key))
                return 0;

        /*
         * If it's a lock handoff wait, we get the bit for it, and
         * stop walking (and do not wake it up) if we can't.
         */
        flags = wait->flags;
        if (flags & WQ_FLAG_EXCLUSIVE) {
                /* Somebody else already holds the bit: nothing to hand out. */
                if (test_bit(key->bit_nr, &key->folio->flags))
                        return -1;
                if (flags & WQ_FLAG_CUSTOM) {
                        /* Take the bit on the waiter's behalf. */
                        if (test_and_set_bit(key->bit_nr, &key->folio->flags))
                                return -1;
                        flags |= WQ_FLAG_DONE;
                }
        }

        /*
         * We are holding the wait-queue lock, but the waiter that
         * is waiting for this will be checking the flags without
         * any locking.
         *
         * So update the flags atomically, and wake up the waiter
         * afterwards to avoid any races. This store-release pairs
         * with the load-acquire in folio_wait_bit_common().
         */
        smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
        wake_up_state(wait->private, mode);

        /*
         * Ok, we have successfully done what we're waiting for,
         * and we can unconditionally remove the wait entry.
         *
         * Note that this pairs with the "finish_wait()" in the
         * waiter, and has to be the absolute last thing we do.
         * After this list_del_init_careful(&wait->entry) the wait
         * entry might be de-allocated and the process might even
         * have exited.
         */
        list_del_init_careful(&wait->entry);
        return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
        wait_queue_head_t *q = folio_waitqueue(folio);
        struct wait_page_key key;
        unsigned long flags;

        key.folio = folio;
        key.bit_nr = bit_nr;
        key.page_match = 0;

        spin_lock_irqsave(&q->lock, flags);
        __wake_up_locked_key(q, TASK_NORMAL, &key);

        /*
         * It's possible to miss clearing waiters here, when we woke our page
         * waiters, but the hashed waitqueue has waiters for other pages on it.
         * That's okay, it's a rare case. The next waker will clear it.
         *
         * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
         * other), the flag may be cleared in the course of freeing the page;
         * but that is not required for correctness.
         */
        if (!waitqueue_active(q) || !key.page_match)
                folio_clear_waiters(folio);

        spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 *
 * The behavior selects whether the waiter is queued as an exclusive
 * waiter (EXCLUSIVE only), whether the folio reference is dropped before
 * sleeping (DROP only), and which wait flag decides the return value
 * (WQ_FLAG_DONE for EXCLUSIVE, WQ_FLAG_WOKEN otherwise).
 */
enum behavior {
        EXCLUSIVE,        /* Hold ref to page and take the bit when woken, like
                         * __folio_lock() waiting on then setting PG_locked.
                         */
        SHARED,                /* Hold ref to page and check the bit when woken, like
                         * folio_wait_writeback() waiting on PG_writeback.
                         */
        DROP,                /* Drop ref to page before wait, no check when woken,
                         * like folio_put_wait_locked() on PG_locked.
                         */
};

/*
 * Attempt to check (or, for an exclusive waiter, take) the folio flag.
 * On success the wait entry is marked WQ_FLAG_WOKEN | WQ_FLAG_DONE and
 * true is returned; if the bit is (still) set, the entry is left
 * untouched and false is returned.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
                                        struct wait_queue_entry *wait)
{
        bool busy;

        if (wait->flags & WQ_FLAG_EXCLUSIVE)
                busy = test_and_set_bit(bit_nr, &folio->flags);
        else
                busy = test_bit(bit_nr, &folio->flags);

        if (busy)
                return false;

        wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
        return true;
}

/*
 * folio_wait_bit_common - sleep until a folio flag bit is released
 * @folio:    folio whose flag is waited on
 * @bit_nr:   flag bit number (e.g. PG_locked)
 * @state:    task state to sleep in; also decides, through
 *            signal_pending_state(), whether a signal ends the wait
 * @behavior: EXCLUSIVE, SHARED or DROP, see enum behavior
 *
 * Return: 0 once the wait is satisfied (for EXCLUSIVE: the bit is now
 * owned by the caller), -EINTR if the wait ended without that.
 */
static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
                int state, enum behavior behavior)
{
        wait_queue_head_t *q = folio_waitqueue(folio);
        int unfairness = sysctl_page_lock_unfairness;
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        unsigned long pflags;
        bool in_thrashing;

        /*
         * Waiting on the lock of a workingset folio that is not uptodate
         * is accounted as thrashing (delayacct) and as a PSI memory stall.
         */
        if (bit_nr == PG_locked &&
            !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
                delayacct_thrashing_start(&in_thrashing);
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.folio = folio;
        wait_page.bit_nr = bit_nr;

repeat:
        wait->flags = 0;
        if (behavior == EXCLUSIVE) {
                wait->flags = WQ_FLAG_EXCLUSIVE;
                /*
                 * After sysctl_page_lock_unfairness failed attempts,
                 * ask the waker to hand the lock over directly.
                 */
                if (--unfairness < 0)
                        wait->flags |= WQ_FLAG_CUSTOM;
        }

        /*
         * Do one last check whether we can get the
         * page bit synchronously.
         *
         * Do the folio_set_waiters() marking before that
         * to let any waker we _just_ missed know they
         * need to wake us up (otherwise they'll never
         * even go to the slow case that looks at the
         * page queue), and add ourselves to the wait
         * queue if we need to sleep.
         *
         * This part needs to be done under the queue
         * lock to avoid races.
         */
        spin_lock_irq(&q->lock);
        folio_set_waiters(folio);
        if (!folio_trylock_flag(folio, bit_nr, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * From now on, all the logic will be based on
         * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
         * see whether the page bit testing has already
         * been done by the wake function.
         *
         * We can drop our reference to the folio.
         */
        if (behavior == DROP)
                folio_put(folio);

        /*
         * Note that until the "finish_wait()", or until
         * we see the WQ_FLAG_WOKEN flag, we need to
         * be very careful with the 'wait->flags', because
         * we may race with a waker that sets them.
         */
        for (;;) {
                unsigned int flags;

                set_current_state(state);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        if (signal_pending_state(state, current))
                                break;

                        io_schedule();
                        continue;
                }

                /* If we were non-exclusive, we're done */
                if (behavior != EXCLUSIVE)
                        break;

                /* If the waker got the lock for us, we're done */
                if (flags & WQ_FLAG_DONE)
                        break;

                /*
                 * Otherwise, if we're getting the lock, we need to
                 * try to get it ourselves.
                 *
                 * And if that fails, we'll have to retry this all.
                 */
                if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
                        goto repeat;

                wait->flags |= WQ_FLAG_DONE;
                break;
        }

        /*
         * If a signal happened, this 'finish_wait()' may remove the last
         * waiter from the wait-queues, but the folio waiters bit will remain
         * set. That's ok. The next wakeup will take care of it, and trying
         * to do it here would be difficult and prone to races.
         */
        finish_wait(q, wait);

        if (thrashing) {
                delayacct_thrashing_end(&in_thrashing);
                psi_memstall_leave(&pflags);
        }

        /*
         * NOTE! The wait->flags weren't stable until we've done the
         * 'finish_wait()', and we could have exited the loop above due
         * to a signal, and had a wakeup event happen after the signal
         * test but before the 'finish_wait()'.
         *
         * So only after the finish_wait() can we reliably determine
         * if we got woken up or not, so we can now figure out the final
         * return value based on that state without races.
         *
         * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
         * waiter, but an exclusive one requires WQ_FLAG_DONE.
         */
        if (behavior == EXCLUSIVE)
                return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

        return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
        __releases(ptl)
{
        struct wait_page_queue wait_page;
        wait_queue_entry_t *wait = &wait_page.wait;
        bool thrashing = false;
        unsigned long pflags;
        bool in_thrashing;
        wait_queue_head_t *q;
        struct folio *folio = pfn_swap_entry_folio(entry);

        q = folio_waitqueue(folio);
        /* Same thrashing / PSI accounting as folio_wait_bit_common(). */
        if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
                delayacct_thrashing_start(&in_thrashing);
                psi_memstall_enter(&pflags);
                thrashing = true;
        }

        /* Queue as a plain (non-exclusive) waiter on PG_locked. */
        init_wait(wait);
        wait->func = wake_page_function;
        wait_page.folio = folio;
        wait_page.bit_nr = PG_locked;
        wait->flags = 0;

        /* Check the bit and enqueue under the queue lock, as in the common path. */
        spin_lock_irq(&q->lock);
        folio_set_waiters(folio);
        if (!folio_trylock_flag(folio, PG_locked, wait))
                __add_wait_queue_entry_tail(q, wait);
        spin_unlock_irq(&q->lock);

        /*
         * If a migration entry exists for the page the migration path must hold
         * a valid reference to the page, and it must take the ptl to remove the
         * migration entry. So the page is valid until the ptl is dropped.
         */
        spin_unlock(ptl);

        for (;;) {
                unsigned int flags;

                set_current_state(TASK_UNINTERRUPTIBLE);

                /* Loop until we've been woken or interrupted */
                flags = smp_load_acquire(&wait->flags);
                if (!(flags & WQ_FLAG_WOKEN)) {
                        /*
                         * NOTE(review): with TASK_UNINTERRUPTIBLE this check
                         * is presumably never true; it appears to be kept
                         * only to mirror folio_wait_bit_common() -- confirm.
                         */
                        if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
                                break;

                        io_schedule();
                        continue;
                }
                break;
        }

        finish_wait(q, wait);

        if (thrashing) {
                delayacct_thrashing_end(&in_thrashing);
                psi_memstall_leave(&pflags);
        }
}
#endif

/*
 * Wait, in TASK_UNINTERRUPTIBLE state and as a SHARED waiter, on bit
 * @bit_nr of @folio.  The result of the common helper is not reported.
 */
void folio_wait_bit(struct folio *folio, int bit_nr)
{
        (void)folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE,
                                    SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

/*
 * Like folio_wait_bit(), but sleeps in TASK_KILLABLE state and passes the
 * result of the common helper (0 or -EINTR) back to the caller.
 */
int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
        int ret;

        ret = folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);

        return ret;
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  It expects the page to
 * become unlocked relatively soon, but does not want to hold up migration
 * (for example) by keeping that reference while it waits.  The reference
 * is dropped before sleeping, so once this function returns the caller
 * should not dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
        int ret;

        ret = folio_wait_bit_common(folio, PG_locked, state, DROP);

        return ret;
}

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Clears PG_locked on @folio and, if the flag update reports that there
 * are waiters, wakes the threads sleeping on the folio lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
        /* Bit 7 allows x86 to check the byte's sign bit */
        BUILD_BUG_ON(PG_waiters != 7);
        BUILD_BUG_ON(PG_locked > 7);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        /* Nobody is waiting: flipping the lock bit was all there was to do. */
        if (!folio_xor_flags_has_waiters(folio, 1 << PG_locked))
                return;

        folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);

/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * Filesystems call this once every read against @folio has completed so
 * that the pagecache knows no more reads are outstanding.  The folio is
 * unlocked, any thread sleeping on the lock is woken, and on success the
 * folio is additionally marked uptodate in the same flag update.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
        unsigned long mask;

        /* Must be in bottom byte for x86 to work */
        BUILD_BUG_ON(PG_uptodate > 7);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
        VM_BUG_ON_FOLIO(success && folio_test_uptodate(folio), folio);

        /* Always flip the lock bit; flip uptodate too when the read worked. */
        if (likely(success))
                mask = (1 << PG_locked) | (1 << PG_uptodate);
        else
                mask = 1 << PG_locked;

        if (!folio_xor_flags_has_waiters(folio, mask))
                return;

        folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
        /* Calling this on a folio without PG_private_2 set is a caller bug. */
        VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
        /* Clear the bit first so that woken waiters see it cleared. */
        clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
        folio_wake_bit(folio, PG_private_2);
        /* Drop the reference that was held while PG_private_2 was set. */
        folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Sleeps (via folio_wait_bit()) until PG_private_2 is observed clear on
 * @folio.  The flag is re-tested after every wakeup.
 */
void folio_wait_private_2(struct folio *folio)
{
        for (;;) {
                if (!folio_test_private_2(folio))
                        break;
                folio_wait_bit(folio, PG_private_2);
        }
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Like folio_wait_private_2(), but the wait is abandoned if a fatal signal
 * is received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
{
        int err = 0;

        do {
                /* Flag already clear: report the outcome of the last wait. */
                if (!folio_test_private_2(folio))
                        break;
                err = folio_wait_bit_killable(folio, PG_private_2);
        } while (err >= 0);

        return err;
}
EXPORT_SYMBOL(folio_wait_private_2_killable);

/*
 * Called once writeback has completed on a folio that was marked
 * dropbehind: make a best-effort attempt to invalidate it right away.
 *
 * The attempt is skipped when we are not in task context or when the folio
 * lock cannot be taken with a trylock, and the result of
 * folio_unmap_invalidate() is not checked, so the folio may remain in the
 * page cache after this returns.
 */
static void folio_end_dropbehind_write(struct folio *folio)
{
        /*
         * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
         * but can happen if normal writeback just happens to find dirty folios
         * that were created as part of uncached writeback, and that writeback
         * would otherwise not need non-IRQ handling. Just skip the
         * invalidation in that case.
         */
        if (!in_task())
                return;
        if (!folio_trylock(folio))
                return;

        /* A folio with no mapping has nothing left to invalidate. */
        if (folio->mapping)
                folio_unmap_invalidate(folio->mapping, folio, 0);
        folio_unlock(folio);
}

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Ends writeback on @folio, wakes anyone waiting on PG_writeback when
 * __folio_end_writeback() asks for it and, if the folio was marked
 * dropbehind and is not dirty, attempts to drop it from the page cache.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
        bool folio_dropbehind = false;

        VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

        /*
         * folio_test_clear_reclaim() could be used here but it is an
         * atomic operation and overkill in this particular case. Failing
         * to shuffle a folio marked for immediate reclaim is too mild
         * a gain to justify taking an atomic operation penalty at the
         * end of every folio writeback.
         */
        if (folio_test_reclaim(folio)) {
                folio_clear_reclaim(folio);
                folio_rotate_reclaimable(folio);
        }

        /*
         * Writeback does not hold a folio reference of its own, relying
         * on truncation to wait for the clearing of PG_writeback.
         * But here we must make sure that the folio is not freed and
         * reused before the folio_wake_bit().
         */
        folio_get(folio);
        /*
         * Only consume the dropbehind flag when the folio is not dirty;
         * a dirty folio keeps the flag and is not invalidated here.
         */
        if (!folio_test_dirty(folio))
                folio_dropbehind = folio_test_clear_dropbehind(folio);
        if (__folio_end_writeback(folio))
                folio_wake_bit(folio, PG_writeback);
        acct_reclaim_writeback(folio);

        /* Invalidate only after writeback has ended and waiters were woken. */
        if (folio_dropbehind)
                folio_end_dropbehind_write(folio);
        /* Pairs with the folio_get() above. */
        folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 *
 * Waits on PG_locked in TASK_UNINTERRUPTIBLE using the EXCLUSIVE wait
 * behaviour of folio_wait_bit_common().  The helper's return value is
 * ignored.
 */
void __folio_lock(struct folio *folio)
{
        folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
                                EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

/*
 * __folio_lock_killable - Like __folio_lock(), but sleeps in TASK_KILLABLE.
 * @folio: The folio to lock
 *
 * Return: The result of folio_wait_bit_common(); the caller in this file
 * treats any non-zero value as "folio not locked".
 */
int __folio_lock_killable(struct folio *folio)
{
        return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
                                        EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

/*
 * Try to lock @folio without sleeping, leaving @wait queued on the folio's
 * waitqueue if the lock could not be taken.
 *
 * @wait is filled in for PG_locked on @folio and appended to the folio's
 * waitqueue under the queue lock (IRQs disabled) before the trylock is
 * attempted.
 *
 * Return: 0 if the folio is now locked (and @wait has been removed from the
 * waitqueue again), -EIOCBQUEUED if the lock was not obtained and @wait
 * stays queued.
 */
static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
        struct wait_queue_head *q = folio_waitqueue(folio);
        int ret;

        wait->folio = folio;
        wait->bit_nr = PG_locked;

        spin_lock_irq(&q->lock);
        /* Queue first and flag waiters, then try the lock, all under q->lock. */
        __add_wait_queue_entry_tail(q, &wait->wait);
        folio_set_waiters(folio);
        ret = !folio_trylock(folio);
        /*
         * If we were successful now, we know we're still on the
         * waitqueue as we're still under the lock. This means it's
         * safe to remove and return success, we know the callback
         * isn't going to trigger.
         */
        if (!ret)
                __remove_wait_queue(q, &wait->wait);
        else
                ret = -EIOCBQUEUED;
        spin_unlock_irq(&q->lock);
        return ret;
}

/*
 * Lock @folio on behalf of a page fault, or tell the fault path to retry.
 *
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
        unsigned int flags = vmf->flags;

        if (!fault_flag_allow_retry_first(flags)) {
                /* Retrying is not an option: we have to take the lock here. */
                if (!(flags & FAULT_FLAG_KILLABLE)) {
                        __folio_lock(folio);
                        return 0;
                }
                if (!__folio_lock_killable(folio))
                        return 0;

                /* The killable lock attempt failed: drop the lock and bail. */
                release_fault_lock(vmf);
                return VM_FAULT_RETRY;
        }

        /*
         * CAUTION! In this case, mmap_lock/per-VMA lock is not
         * released even though returning VM_FAULT_RETRY.
         */
        if (flags & FAULT_FLAG_RETRY_NOWAIT)
                return VM_FAULT_RETRY;

        /* Wait for the folio to be unlocked without holding the fault lock. */
        release_fault_lock(vmf);
        if (flags & FAULT_FLAG_KILLABLE)
                folio_wait_locked_killable(folio);
        else
                folio_wait_locked(folio);

        return VM_FAULT_RETRY;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);
        /*
         * Count down on a copy: the post-decrement in the loop condition
         * leaves the counter at ULONG_MAX once the scan is exhausted, and
         * @max_scan itself is still needed for the "no gap" return value.
         */
        unsigned long nr = max_scan;

        while (nr--) {
                void *entry = xas_next(&xas);

                /* An empty slot or a value (shadow/swap) entry is a gap. */
                if (!entry || xa_is_value(entry))
                        return xas.xa_index;
                /* The index wrapped around to 0: report that. */
                if (xas.xa_index == 0)
                        return 0;
        }

        /* No gap in range: return the first index past the scanned range. */
        return index + max_scan;
}
EXPORT_SYMBOL(page_cache_next_miss);

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
                             pgoff_t index, unsigned long max_scan)
{
        XA_STATE(xas, &mapping->i_pages, index);
        unsigned long left;

        for (left = max_scan; left; left--) {
                void *entry = xas_prev(&xas);

                /*
                 * Stop at a gap (empty slot or value entry), or when the
                 * index has wrapped around to ULONG_MAX.
                 */
                if (!entry || xa_is_value(entry) ||
                    xas.xa_index == ULONG_MAX)
                        break;
        }

        return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or GUP-fast) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index under the RCU read
 * lock.  A folio is returned with its refcount raised; a shadow entry of a
 * previously evicted folio, or a swap entry from shmem/tmpfs, is returned
 * as-is without taking any reference.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        for (;;) {
                xas_reset(&xas);
                folio = xas_load(&xas);
                if (xas_retry(&xas, folio))
                        continue;
                /*
                 * A shadow entry of a recently evicted page, or a swap entry
                 * from shmem/tmpfs.  Return it without attempting to raise
                 * page count.
                 */
                if (!folio || xa_is_value(folio))
                        break;

                /* Could not take a reference: look the entry up again. */
                if (!folio_try_get(folio))
                        continue;

                /* The slot still holds the folio we pinned: we are done. */
                if (likely(folio == xas_reload(&xas)))
                        break;

                /* The slot changed under us: drop our reference and retry. */
                folio_put(folio);
        }
        rcu_read_unlock();

        return folio;
}

/**
 * __filemap_get_folio - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp)
{
        struct folio *folio;

repeat:
        folio = filemap_get_entry(mapping, index);
        /* Value (shadow/swap) entries count as "no folio present" here. */
        if (xa_is_value(folio))
                folio = NULL;
        if (!folio)
                goto no_page;

        if (fgp_flags & FGP_LOCK) {
                if (fgp_flags & FGP_NOWAIT) {
                        /* Must not sleep: give up if the lock is contended. */
                        if (!folio_trylock(folio)) {
                                folio_put(folio);
                                return ERR_PTR(-EAGAIN);
                        }
                } else {
                        folio_lock(folio);
                }

                /* Has the folio been truncated? */
                if (unlikely(folio->mapping != mapping)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto repeat;
                }
                VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
        }

        if (fgp_flags & FGP_ACCESSED)
                folio_mark_accessed(folio);
        else if (fgp_flags & FGP_WRITE) {
                /* Clear idle flag for buffer write */
                if (folio_test_idle(folio))
                        folio_clear_idle(folio);
        }

        if (fgp_flags & FGP_STABLE)
                folio_wait_stable(folio);
no_page:
        if (!folio && (fgp_flags & FGP_CREAT)) {
                unsigned int min_order = mapping_min_folio_order(mapping);
                unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags));
                int err;
                index = mapping_align_index(mapping, index);

                if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
                        gfp |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
                        gfp &= ~__GFP_FS;
                if (fgp_flags & FGP_NOWAIT) {
                        gfp &= ~GFP_KERNEL;
                        gfp |= GFP_NOWAIT | __GFP_NOWARN;
                }
                /* Creating requires FGP_LOCK or FGP_FOR_MMAP; warn and force LOCK. */
                if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
                        fgp_flags |= FGP_LOCK;

                if (order > mapping_max_folio_order(mapping))
                        order = mapping_max_folio_order(mapping);
                /* If we're not aligned, allocate a smaller folio */
                if (index & ((1UL << order) - 1))
                        order = __ffs(index);

                /* Try progressively smaller orders, down to min_order. */
                do {
                        gfp_t alloc_gfp = gfp;

                        err = -ENOMEM;
                        /* Above the minimum order, fail fast and quietly. */
                        if (order > min_order)
                                alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
                        folio = filemap_alloc_folio(alloc_gfp, order);
                        /* Allocation failed: move on to the next smaller order. */
                        if (!folio)
                                continue;

                        /* Init accessed so avoid atomic mark_page_accessed later */
                        if (fgp_flags & FGP_ACCESSED)
                                __folio_set_referenced(folio);
                        if (fgp_flags & FGP_DONTCACHE)
                                __folio_set_dropbehind(folio);

                        err = filemap_add_folio(mapping, folio, index, gfp);
                        if (!err)
                                break;
                        folio_put(folio);
                        folio = NULL;
                } while (order-- > min_order);

                /* -EEXIST from filemap_add_folio(): redo the lookup. */
                if (err == -EEXIST)
                        goto repeat;
                if (err) {
                        /*
                         * When NOWAIT I/O fails to allocate folios this could
                         * be due to a nonblocking memory allocation and not
                         * because the system actually is out of memory.
                         * Return -EAGAIN so that the caller retries in a
                         * blocking fashion instead of propagating -ENOMEM
                         * to the application.
                         */
                        if ((fgp_flags & FGP_NOWAIT) && err == -ENOMEM)
                                err = -EAGAIN;
                        return ERR_PTR(err);
                }
                /*
                 * filemap_add_folio locks the folio, and for mmap
                 * we expect an unlocked folio.
                 */
                if (folio && (fgp_flags & FGP_FOR_MMAP))
                        folio_unlock(folio);
        }

        if (!folio)
                return ERR_PTR(-ENOENT);
        /* not an uncached lookup, clear uncached if set */
        if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
                folio_clear_dropbehind(folio);
        return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);

/*
 * Advance @xas to the next entry up to @max (the next entry carrying @mark
 * unless @mark is XA_PRESENT) and return it.  A folio is returned with a
 * reference taken; %NULL and value entries are returned untouched.
 */
static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
                xa_mark_t mark)
{
        struct folio *folio;

        for (;;) {
                if (mark == XA_PRESENT)
                        folio = xas_find(xas, max);
                else
                        folio = xas_find_marked(xas, max, mark);

                if (xas_retry(xas, folio))
                        continue;
                /*
                 * A shadow entry of a recently evicted page, a swap
                 * entry from shmem/tmpfs or a DAX entry.  Return it
                 * without attempting to raise page count.
                 */
                if (!folio || xa_is_value(folio))
                        return folio;

                if (folio_try_get(folio)) {
                        /* The slot still holds the folio we pinned. */
                        if (likely(folio == xas_reload(xas)))
                                return folio;
                        folio_put(folio);
                }

                /* Lost a race with removal/replacement: reset and retry. */
                xas_reset(xas);
        }
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:        The address_space to search
 * @start:        The starting page cache index
 * @end:        The final page index (inclusive).
 * @fbatch:        Where the resulting entries are placed.
 * @indices:        The cache indices corresponding to the entries in @fbatch
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * If at least one entry is in the batch, @start is updated to the index
 * just past the last entry in the batch.
 *
 * Return: The number of entries which were found.
 */
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;

        rcu_read_lock();
        for (;;) {
                folio = find_get_entry(&xas, end, XA_PRESENT);
                if (!folio)
                        break;
                /* Record the index before the add bumps fbatch->nr. */
                indices[fbatch->nr] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;
        }

        if (folio_batch_count(fbatch)) {
                int last = folio_batch_count(fbatch) - 1;
                unsigned long span;

                /* Size of the last entry: from the XArray order for values. */
                folio = fbatch->folios[last];
                if (xa_is_value(folio))
                        span = 1 << xa_get_order(&mapping->i_pages,
                                                 indices[last]);
                else
                        span = folio_nr_pages(folio);
                *start = round_down(indices[last] + span, span);
        }
        rcu_read_unlock();

        return folio_batch_count(fbatch);
}

/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:        The address_space to search.
 * @start:        The starting page cache index.
 * @end:        The final page index (inclusive).
 * @fbatch:        Where the resulting entries are placed.
 * @indices:        The cache indices of the entries in @fbatch.
 *
 * find_lock_entries() will return a batch of entries from @mapping.
 * Swap, shadow and DAX entries are included.  Folios are returned
 * locked and with an incremented refcount.  Folios which are locked
 * by somebody else or under writeback are skipped.  Folios which are
 * partially outside the range are not returned.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries, large folios, folios which could not be
 * locked or folios under writeback.
 *
 * @start is advanced past each entry that is added to @fbatch.
 *
 * Return: The number of entries which were found.
 */
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;

        rcu_read_lock();
        while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
                unsigned long base;
                unsigned long nr;

                if (!xa_is_value(folio)) {
                        /* A real folio: find_get_entry() took a reference. */
                        nr = folio_nr_pages(folio);
                        base = folio->index;
                        /* Omit large folio which begins before the start */
                        if (base < *start)
                                goto put;
                        /* Omit large folio which extends beyond the end */
                        if (base + nr - 1 > end)
                                goto put;
                        /* Never sleep here: skip folios locked by others. */
                        if (!folio_trylock(folio))
                                goto put;
                        /* Skip folios that left the mapping or are under writeback. */
                        if (folio->mapping != mapping ||
                            folio_test_writeback(folio))
                                goto unlock;
                        VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
                                        folio);
                } else {
                        /* Value entry: no reference held, size from its order. */
                        nr = 1 << xas_get_order(&xas);
                        base = xas.xa_index & ~(nr - 1);
                        /* Omit order>0 value which begins before the start */
                        if (base < *start)
                                continue;
                        /* Omit order>0 value which extends beyond the end */
                        if (base + nr - 1 > end)
                                break;
                }

                /* Update start now so that last update is correct on return */
                *start = base + nr;
                indices[fbatch->nr] = xas.xa_index;
                if (!folio_batch_add(fbatch, folio))
                        break;
                continue;
unlock:
                folio_unlock(folio);
put:
                /* Drop the reference taken by find_get_entry(). */
                folio_put(folio);
        }
        rcu_read_unlock();

        return folio_batch_count(fbatch);
}

/**
 * filemap_get_folios - Get a batch of folios
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @fbatch:        The batch to fill.
 *
 * Search for and return a batch of folios in the mapping starting at
 * index @start and up to index @end (inclusive).  The folios are returned
 * in @fbatch with an elevated reference count.
 *
 * Return: The number of folios which were found.
 * We also update @start to index the next folio for the traversal.
 */
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
                pgoff_t end, struct folio_batch *fbatch)
{
        /* XA_PRESENT makes the tagged lookup walk every present entry. */
        return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
}
EXPORT_SYMBOL(filemap_get_folios);

/**
 * filemap_get_folios_contig - Get a batch of contiguous folios
 * @mapping:        The address_space to search
 * @start:        The starting page index
 * @end:        The final page index (inclusive)
 * @fbatch:        The batch to fill
 *
 * filemap_get_folios_contig() works exactly like filemap_get_folios(),
 * except the returned folios are guaranteed to be contiguous. This may
 * not return all contiguous folios if the batch gets filled up.
 *
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
 */

unsigned filemap_get_folios_contig(struct address_space *mapping,
                pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        unsigned long nr;
        struct folio *folio;

        rcu_read_lock();

        /* The walk ends at the first empty slot or once past @end. */
        for (folio = xas_load(&xas); folio && xas.xa_index <= end;
                        folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;
                /*
                 * If the entry has been swapped out, we can stop looking.
                 * No current caller is looking for DAX entries.
                 */
                if (xa_is_value(folio))
                        goto update_start;

                /* If we landed on a sibling entry, stop the walk here. */
                if (xa_is_sibling(folio))
                        goto update_start;

                if (!folio_try_get(folio))
                        goto retry;

                /* The slot changed after we took the reference: retry it. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                if (!folio_batch_add(fbatch, folio)) {
                        /* Batch is full: resume right after this folio. */
                        nr = folio_nr_pages(folio);
                        *start = folio->index + nr;
                        goto out;
                }
                /* Move to the folio's last index so xas_next() steps past it. */
                xas_advance(&xas, folio_next_index(folio) - 1);
                continue;
put_folio:
                folio_put(folio);

retry:
                xas_reset(&xas);
        }

update_start:
        nr = folio_batch_count(fbatch);

        /* @start is left untouched when nothing was collected. */
        if (nr) {
                folio = fbatch->folios[nr - 1];
                *start = folio_next_index(folio);
        }
out:
        rcu_read_unlock();
        return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_contig);

/**
 * filemap_get_folios_tag - Get a batch of folios matching @tag
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @end:        The final page index (inclusive)
 * @tag:        The tag index
 * @fbatch:     The batch to fill
 *
 * The first folio may start before @start; if it does, it will contain
 * @start.  The final folio may extend beyond @end; if it does, it will
 * contain @end.  The folios have ascending indices.  There may be gaps
 * between the folios if there are indices which have no folio in the
 * page cache.  If folios are added to or removed from the page cache
 * while this is running, they may or may not be found by this call.
 * Only returns folios that are tagged with @tag.
 *
 * Return: The number of folios found.
 * Also update @start to index the next folio for traversal.
 */
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
                        pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, *start);
        struct folio *folio;

        rcu_read_lock();
        for (;;) {
                folio = find_get_entry(&xas, end, tag);
                if (!folio) {
                        /*
                         * We come here when there is no page beyond @end. We
                         * take care to not overflow the index @start as it
                         * confuses some of the callers. This breaks the
                         * iteration when there is a page at index -1 but that
                         * is already broke anyway.
                         */
                        if (end == (pgoff_t)-1)
                                *start = (pgoff_t)-1;
                        else
                                *start = end + 1;
                        break;
                }

                /*
                 * Shadow entries should never be tagged, but this iteration
                 * is lockless so there is a window for page reclaim to evict
                 * a page we saw tagged. Skip over it.
                 */
                if (xa_is_value(folio))
                        continue;

                if (!folio_batch_add(fbatch, folio)) {
                        /* Batch is full: resume right after this folio. */
                        *start = folio->index + folio_nr_pages(folio);
                        break;
                }
        }
        rcu_read_unlock();

        return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_tag);

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * That never converges.  Avoid it by cutting the readahead window to a
 * quarter of its current size each time a read fails.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
        ra->ra_pages = ra->ra_pages / 4;
}

/*
 * filemap_get_read_batch - Get a batch of folios for read
 *
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
 *
 * Every folio placed in @fbatch has had a reference taken on it.
 */
static void filemap_get_read_batch(struct address_space *mapping,
                pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
{
        XA_STATE(xas, &mapping->i_pages, index);
        struct folio *folio;

        rcu_read_lock();
        /* The walk ends at the first empty slot. */
        for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
                if (xas_retry(&xas, folio))
                        continue;
                /* Stop once past @max or at a value entry. */
                if (xas.xa_index > max || xa_is_value(folio))
                        break;
                /* Stop at a sibling entry as well. */
                if (xa_is_sibling(folio))
                        break;
                if (!folio_try_get(folio))
                        goto retry;

                /* The slot changed after we took the reference: retry it. */
                if (unlikely(folio != xas_reload(&xas)))
                        goto put_folio;

                if (!folio_batch_add(fbatch, folio))
                        break;
                /*
                 * The folio is already in the batch at this point; ending
                 * the walk here makes it the last one, as described above.
                 */
                if (!folio_test_uptodate(folio))
                        break;
                if (folio_test_readahead(folio))
                        break;
                /* Move to the folio's last index so xas_next() steps past it. */
                xas_advance(&xas, folio_next_index(folio) - 1);
                continue;
put_folio:
                folio_put(folio);
retry:
                xas_reset(&xas);
        }
        rcu_read_unlock();
}

/*
 * Run @filler on @folio and wait for the read to finish.
 *
 * Returns 0 once the folio is uptodate, the filler's or the wait's error
 * code if either of them fails, or -EIO if the folio is still not uptodate
 * afterwards.  In the -EIO case the readahead window of @file is shrunk
 * first, provided @file is non-NULL.
 */
static int filemap_read_folio(struct file *file, filler_t filler,
                struct folio *folio)
{
        const bool in_workingset = folio_test_workingset(folio);
        unsigned long psi_flags;
        int ret;

        /* Start the actual read. The read will unlock the page. */
        if (unlikely(in_workingset)) {
                /* Bracket the read of a workingset folio as a memstall. */
                psi_memstall_enter(&psi_flags);
                ret = filler(file, folio);
                psi_memstall_leave(&psi_flags);
        } else {
                ret = filler(file, folio);
        }
        if (ret)
                return ret;

        ret = folio_wait_locked_killable(folio);
        if (ret)
                return ret;

        if (!folio_test_uptodate(folio)) {
                if (file)
                        shrink_readahead_size_eio(&file->f_ra);
                return -EIO;
        }

        return 0;
}

/*
 * Report whether the part of @folio covered by the read of @count bytes at
 * @pos can be used without reading it in.
 *
 * A fully uptodate folio always qualifies.  Otherwise the answer comes from
 * the mapping's ->is_partially_uptodate() operation, which is only
 * consulted when the caller does not insist on a fully uptodate folio
 * (@need_uptodate), the operation exists, and the inode's block size is
 * smaller than the folio.
 */
static bool filemap_range_uptodate(struct address_space *mapping,
                loff_t pos, size_t count, struct folio *folio,
                bool need_uptodate)
{
        loff_t folio_start;

        if (folio_test_uptodate(folio))
                return true;

        /* pipes can't handle partially uptodate pages */
        if (need_uptodate || !mapping->a_ops->is_partially_uptodate)
                return false;
        if (mapping->host->i_blkbits >= folio_shift(folio))
                return false;

        /* Convert the file range into an offset/length within the folio. */
        folio_start = folio_pos(folio);
        if (folio_start > pos) {
                count -= folio_start - pos;
                pos = 0;
        } else {
                pos -= folio_start;
        }

        return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}

/*
 * filemap_update_page - Try to make a not-uptodate folio usable for a read.
 * @iocb: The read request.  ki_flags selects the NOWAIT/NOIO/WAITQ
 *        behaviour and ki_pos is the position used for the uptodate check.
 * @mapping: The address_space whose invalidate_lock is taken (shared).
 * @count: Number of bytes requested, forwarded to filemap_range_uptodate().
 * @folio: The folio to lock, check and, if necessary, read.
 * @need_uptodate: Forwarded to filemap_range_uptodate(); when true a
 *        partially uptodate folio is not accepted.
 *
 * Return: 0 if the folio covers the requested range or was read in
 * successfully; -EAGAIN when the flags in @iocb forbid the waiting or the
 * I/O that would be needed; AOP_TRUNCATED_PAGE when the folio could not be
 * used as-is and the caller has to look it up again; otherwise the error
 * from __folio_lock_async() or filemap_read_folio().  When
 * AOP_TRUNCATED_PAGE is returned the folio reference has already been
 * dropped (by the folio_put() at the end of this function, or presumably by
 * folio_put_wait_locked()); for all other results the caller still owns it.
 */
static int filemap_update_page(struct kiocb *iocb,
                struct address_space *mapping, size_t count,
                struct folio *folio, bool need_uptodate)
{
        int error;

        /* Take invalidate_lock shared, without sleeping for IOCB_NOWAIT. */
        if (iocb->ki_flags & IOCB_NOWAIT) {
                if (!filemap_invalidate_trylock_shared(mapping))
                        return -EAGAIN;
        } else {
                filemap_invalidate_lock_shared(mapping);
        }

        if (!folio_trylock(folio)) {
                error = -EAGAIN;
                /* Not allowed to wait for the folio lock at all. */
                if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
                        goto unlock_mapping;
                if (!(iocb->ki_flags & IOCB_WAITQ)) {
                        filemap_invalidate_unlock_shared(mapping);
                        /*
                         * This is where we usually end up waiting for a
                         * previously submitted readahead to finish.
                         */
                        folio_put_wait_locked(folio, TASK_KILLABLE);
                        return AOP_TRUNCATED_PAGE;
                }
                /* IOCB_WAITQ: lock using the wait queue supplied by @iocb. */
                error = __folio_lock_async(folio, iocb->ki_waitq);
                if (error)
                        goto unlock_mapping;
        }

        /* The folio is locked from here on. */
        error = AOP_TRUNCATED_PAGE;
        /* No longer attached to a mapping: have the caller retry. */
        if (!folio->mapping)
                goto unlock;

        error = 0;
        if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
                                   need_uptodate))
                goto unlock;

        /* A read is required, but these flags do not permit starting one. */
        error = -EAGAIN;
        if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
                goto unlock;

        /*
         * filemap_read_folio() notes that the read unlocks the folio, so
         * the folio_unlock() under "unlock" is skipped on this path.
         */
        error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
                        folio);
        goto unlock_mapping;
unlock:
        folio_unlock(folio);
unlock_mapping:
        filemap_invalidate_unlock_shared(mapping);
        /* The caller retries the lookup, so release its reference here. */
        if (error == AOP_TRUNCATED_PAGE)
                folio_put(folio);
        return error;
}

/*
 * filemap_create_folio - Allocate, insert and read the folio at iocb->ki_pos.
 * @iocb: The read request; supplies the file, the position and the flags.
 * @fbatch: The batch the new folio is added to on success.
 *
 * Allocates a folio of the mapping's minimum folio order, adds it to the
 * page cache at the index containing iocb->ki_pos (rounded down to a
 * multiple of 1 << min_order pages) and reads it with the mapping's
 * ->read_folio() operation while holding invalidate_lock shared.
 *
 * Return: 0 with the folio added to @fbatch; -EAGAIN if @iocb has
 * IOCB_NOWAIT or IOCB_WAITQ set; -ENOMEM if the allocation fails;
 * AOP_TRUNCATED_PAGE if filemap_add_folio() reports -EEXIST; otherwise the
 * error from filemap_add_folio() or filemap_read_folio().  On every error
 * after the allocation the folio reference is dropped here.
 */
static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct folio *folio;
        int error;
        unsigned int min_order = mapping_min_folio_order(mapping);
        pgoff_t index;

        if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
                return -EAGAIN;

        folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
        if (!folio)
                return -ENOMEM;
        if (iocb->ki_flags & IOCB_DONTCACHE)
                __folio_set_dropbehind(folio);

        /*
         * Protect against truncate / hole punch. Grabbing invalidate_lock
         * here assures we cannot instantiate and bring uptodate new
         * pagecache folios after evicting page cache during truncate
         * and before actually freeing blocks. Note that we could
         * release invalidate_lock after inserting the folio into
         * the page cache as the locked folio would then be enough to
         * synchronize with hole punching. But there are code paths
         * such as filemap_update_page() filling in partially uptodate
         * pages or ->readahead() that need to hold invalidate_lock
         * while mapping blocks for IO so let's hold the lock here as
         * well to keep locking rules simple.
         */
        filemap_invalidate_lock_shared(mapping);
        /* Page index of ki_pos, rounded down to a min_order boundary. */
        index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order;
        error = filemap_add_folio(mapping, folio, index,
                        mapping_gfp_constraint(mapping, GFP_KERNEL));
        /* Something is already cached at this index: make the caller retry. */
        if (error == -EEXIST)
                error = AOP_TRUNCATED_PAGE;
        if (error)
                goto error;

        error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
                                        folio);
        if (error)
                goto error;

        filemap_invalidate_unlock_shared(mapping);
        /* Success: the reference obtained above is handed to the batch. */
        folio_batch_add(fbatch, folio);
        return 0;
error:
        filemap_invalidate_unlock_shared(mapping);
        folio_put(folio);
        return error;
}

/*
 * Start asynchronous readahead from @folio for a read that ends just before
 * page index @last_index.
 *
 * Returns -EAGAIN without doing anything when @iocb has IOCB_NOIO set, and
 * 0 otherwise.  IOCB_DONTCACHE is passed on to the readahead control as its
 * dropbehind setting.
 */
static int filemap_readahead(struct kiocb *iocb, struct file *file,
                struct address_space *mapping, struct folio *folio,
                pgoff_t last_index)
{
        DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);
        pgoff_t nr_wanted;

        if (iocb->ki_flags & IOCB_NOIO)
                return -EAGAIN;

        if (iocb->ki_flags & IOCB_DONTCACHE)
                ractl.dropbehind = 1;

        /* Number of pages from this folio to the end of the read. */
        nr_wanted = last_index - folio->index;
        page_cache_async_ra(&ractl, folio, nr_wanted);

        return 0;
}

/*
 * filemap_get_pages - Collect folios for a read starting at iocb->ki_pos.
 * @iocb: The read request; supplies the file, the position and the flags.
 * @count: Number of bytes the caller wants to read.
 * @fbatch: The batch that receives the folios (with references held).
 * @need_uptodate: Forwarded to filemap_update_page(); when true a partially
 *        uptodate folio is not accepted.
 *
 * Looks the folios up in the page cache.  If none are found it first tries
 * synchronous readahead and then falls back to creating and reading a
 * single folio.  The last folio found may additionally trigger asynchronous
 * readahead and, if it is not uptodate, is passed to filemap_update_page().
 *
 * Return: 0 with at least one folio in @fbatch; -EINTR if a fatal signal is
 * pending; -EAGAIN if IOCB_NOIO forbids the needed readahead; otherwise the
 * error from filemap_create_folio(), filemap_readahead() or
 * filemap_update_page() when no folio at all can be returned.
 */
static int filemap_get_pages(struct kiocb *iocb, size_t count,
                struct folio_batch *fbatch, bool need_uptodate)
{
        struct file *filp = iocb->ki_filp;
        struct address_space *mapping = filp->f_mapping;
        pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
        pgoff_t last_index;
        struct folio *folio;
        /* Only written and read when IOCB_NOWAIT is set. */
        unsigned int flags;
        int err = 0;

        /* "last_index" is the index of the page beyond the end of the read */
        last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
retry:
        if (fatal_signal_pending(current))
                return -EINTR;

        filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        if (!folio_batch_count(fbatch)) {
                /* Nothing cached at @index: try synchronous readahead. */
                DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);

                if (iocb->ki_flags & IOCB_NOIO)
                        return -EAGAIN;
                /* For IOCB_NOWAIT, run the readahead in a memalloc_noio scope. */
                if (iocb->ki_flags & IOCB_NOWAIT)
                        flags = memalloc_noio_save();
                if (iocb->ki_flags & IOCB_DONTCACHE)
                        ractl.dropbehind = 1;
                page_cache_sync_ra(&ractl, last_index - index);
                if (iocb->ki_flags & IOCB_NOWAIT)
                        memalloc_noio_restore(flags);
                filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
        }
        if (!folio_batch_count(fbatch)) {
                /* Still nothing: create and read one folio ourselves. */
                err = filemap_create_folio(iocb, fbatch);
                if (err == AOP_TRUNCATED_PAGE)
                        goto retry;
                return err;
        }

        /*
         * Only the last folio of the batch can carry the readahead flag or
         * be not uptodate (see filemap_get_read_batch()).
         */
        folio = fbatch->folios[folio_batch_count(fbatch) - 1];
        if (folio_test_readahead(folio)) {
                err = filemap_readahead(iocb, filp, mapping, folio, last_index);
                if (err)
                        goto err;
        }
        if (!folio_test_uptodate(folio)) {
                /*
                 * With earlier folios already in the batch, an IOCB_WAITQ
                 * read is switched to IOCB_NOWAIT before the last folio is
                 * updated.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) &&
                    folio_batch_count(fbatch) > 1)
                        iocb->ki_flags |= IOCB_NOWAIT;
                err = filemap_update_page(iocb, mapping, count, folio,
                                          need_uptodate);
                if (err)
                        goto err;
        }

        trace_mm_filemap_get_pages(mapping, index, last_index - 1);
        return 0;
err:
        /*
         * For a negative error we still hold the folio reference; for
         * AOP_TRUNCATED_PAGE filemap_update_page() has already dropped it.
         */
        if (err < 0)
                folio_put(folio);
        /* Remove the last folio; if others remain, return those. */
        if (likely(--fbatch->nr))
                return 0;
        if (err == AOP_TRUNCATED_PAGE)
                goto retry;
        return err;
}

/*
 * Return true when @pos1 and @pos2 fall into the same folio-sized,
 * folio-aligned chunk of the file, using the size of @folio.
 */
static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
{
        const unsigned int folio_bits = folio_shift(folio);
        const loff_t chunk1 = pos1 >> folio_bits;
        const loff_t chunk2 = pos2 >> folio_bits;

        return chunk1 == chunk2;
}

/*
 * After a read has finished with @folio, try to drop it from @mapping if
 * it is marked dropbehind.  Folios that are dirty or under writeback are
 * left alone, as are folios whose lock cannot be taken without waiting.
 */
static void filemap_end_dropbehind_read(struct address_space *mapping,
                                        struct folio *folio)
{
        if (!folio_test_dropbehind(folio) ||
            folio_test_writeback(folio) || folio_test_dirty(folio))
                return;

        if (!folio_trylock(folio))
                return;

        /* Only invalidate if the dropbehind flag was still set under the lock. */
        if (folio_test_clear_dropbehind(folio))
                folio_unmap_invalidate(mapping, folio, 0);
        folio_unlock(folio);
}

/**
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
 *
 * Copies data from the page cache.  If the data is not currently present,
 * uses the readahead and read_folio address_space operations to fetch it.
 *
 * iocb->ki_pos is advanced by the number of bytes copied, the final
 * position is stored in the file's readahead state (prev_pos) and
 * file_accessed() is called before returning.
 *
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
 */
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t already_read)
{
        struct file *filp = iocb->ki_filp;
        struct file_ra_state *ra = &filp->f_ra;
        struct address_space *mapping = filp->f_mapping;
        struct inode *inode = mapping->host;
        struct folio_batch fbatch;
        int i, error = 0;
        bool writably_mapped;
        loff_t isize, end_offset;
        loff_t last_pos = ra->prev_pos;

        if (unlikely(iocb->ki_pos < 0))
                return -EINVAL;
        if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
                return 0;
        if (unlikely(!iov_iter_count(iter)))
                return 0;

        /* Never read beyond the superblock's maximum file size. */
        iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos);
        folio_batch_init(&fbatch);

        do {
                cond_resched();

                /*
                 * If we've already successfully copied some data, then we
                 * can no longer safely return -EIOCBQUEUED. Hence mark
                 * an async read NOWAIT at that point.
                 */
                if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
                        iocb->ki_flags |= IOCB_NOWAIT;

                /* At or past EOF: leave the loop without evaluating its condition. */
                if (unlikely(iocb->ki_pos >= i_size_read(inode)))
                        break;

                error = filemap_get_pages(iocb, iter->count, &fbatch, false);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(inode);
                if (unlikely(iocb->ki_pos >= isize))
                        goto put_folios;
                /* Copy up to EOF or the end of the request, whichever is first. */
                end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(mapping);

                /*
                 * When a read accesses the same folio several times, only
                 * mark it as accessed the first time.
                 */
                if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
                                    fbatch.folios[0]))
                        folio_mark_accessed(fbatch.folios[0]);

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t fsize = folio_size(folio);
                        /* Offset of ki_pos inside this folio. */
                        size_t offset = iocb->ki_pos & (fsize - 1);
                        /* Bytes to copy: limited by end_offset and the folio end. */
                        size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
                                             fsize - offset);
                        size_t copied;

                        /* This folio starts beyond the range we may copy. */
                        if (end_offset < folio_pos(folio))
                                break;
                        /* The first folio was handled before the loop. */
                        if (i > 0)
                                folio_mark_accessed(folio);
                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        copied = copy_folio_to_iter(folio, offset, bytes, iter);

                        already_read += copied;
                        iocb->ki_pos += copied;
                        last_pos = iocb->ki_pos;

                        /* A short copy is reported as -EFAULT and ends the read. */
                        if (copied < bytes) {
                                error = -EFAULT;
                                break;
                        }
                }
put_folios:
                /* Drop the references taken by filemap_get_pages(). */
                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];

                        filemap_end_dropbehind_read(mapping, folio);
                        folio_put(folio);
                }
                folio_batch_init(&fbatch);
        } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

        file_accessed(filp);
        ra->prev_pos = last_pos;
        /* An error is only reported when nothing at all was read. */
        return already_read ? already_read : error;
}
EXPORT_SYMBOL_GPL(filemap_read);

/*
 * Make sure the @count bytes starting at iocb->ki_pos have no pending
 * writeback before the caller proceeds.
 *
 * For IOCB_NOWAIT requests nothing is written: -EAGAIN is returned if the
 * range needs writeback and 0 otherwise.  All other requests write the
 * range out, wait for it, and return the result of that operation.
 */
int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        const loff_t first = iocb->ki_pos;
        const loff_t last = first + count - 1;

        if (!(iocb->ki_flags & IOCB_NOWAIT))
                return filemap_write_and_wait_range(mapping, first, last);

        if (filemap_range_needs_writeback(mapping, first, last))
                return -EAGAIN;

        return 0;
}
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);

/*
 * Invalidate the page cache of @mapping for the byte range @pos..@end
 * (inclusive).
 *
 * With @nowait set, -EAGAIN is returned as soon as the range contains any
 * page.  Without it the range is first written back and waited upon, and a
 * failure of that step is returned.  Otherwise the result of
 * invalidate_inode_pages2_range() is returned.
 */
int filemap_invalidate_pages(struct address_space *mapping,
                             loff_t pos, loff_t end, bool nowait)
{
        int err;

        if (nowait) {
                /* we could block if there are any pages in the range */
                err = filemap_range_has_page(mapping, pos, end) ? -EAGAIN : 0;
        } else {
                err = filemap_write_and_wait_range(mapping, pos, end);
        }
        if (err)
                return err;

        /*
         * After a write we want buffered reads to be sure to go to disk to get
         * the new data.  We invalidate clean cached page from the region we're
         * about to write.  We do this *before* the write so that we can return
         * without clobbering -EIOCBQUEUED from ->direct_IO().
         */
        return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
                                             end >> PAGE_SHIFT);
}

/*
 * Invalidate the page cache for the @count bytes starting at iocb->ki_pos,
 * honouring IOCB_NOWAIT.  See filemap_invalidate_pages() for the return
 * value.
 */
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        bool nowait = iocb->ki_flags & IOCB_NOWAIT;

        return filemap_invalidate_pages(mapping, iocb->ki_pos,
                                        iocb->ki_pos + count - 1, nowait);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:        kernel I/O control block
 * @iter:        destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * Buffered requests go straight to filemap_read().  IOCB_DIRECT requests
 * first flush the affected range, then call the mapping's ->direct_IO()
 * operation; a short direct read that did not fail and did not reach EOF
 * is completed through filemap_read().
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
        size_t remaining = iov_iter_count(iter);
        struct address_space *mapping;
        struct inode *inode;
        struct file *file;
        ssize_t done;

        if (!remaining)
                return 0; /* skip atime */

        /* Plain buffered read: nothing has been read so far. */
        if (!(iocb->ki_flags & IOCB_DIRECT))
                return filemap_read(iocb, iter, 0);

        file = iocb->ki_filp;
        mapping = file->f_mapping;
        inode = mapping->host;

        done = kiocb_write_and_wait(iocb, remaining);
        if (done < 0)
                return done;
        file_accessed(file);

        done = mapping->a_ops->direct_IO(iocb, iter);
        if (done >= 0) {
                iocb->ki_pos += done;
                remaining -= done;
        }
        /* Put the iterator back to exactly the bytes still outstanding. */
        if (done != -EIOCBQUEUED)
                iov_iter_revert(iter, remaining - iov_iter_count(iter));

        /*
         * Btrfs can have a short DIO read if we encounter
         * compressed extents, so if there was an error, or if
         * we've already read everything we wanted to, or if
         * there was a short read because we hit EOF, go ahead
         * and return.  Otherwise fallthrough to buffered io for
         * the rest of the read.  Buffered reads will not work for
         * DAX files, so don't bother trying.
         */
        if (done < 0 || !remaining || IS_DAX(inode))
                return done;
        if (iocb->ki_pos >= i_size_read(inode))
                return done;

        return filemap_read(iocb, iter, done);
}
EXPORT_SYMBOL(generic_file_read_iter);

/*
 * Queue consecutive pages of @folio, starting at file position @fpos, as
 * buffers at the head of @pipe.  At most @size bytes are queued, never more
 * than remain in the folio, and queueing stops early once the pipe is full.
 * One folio reference is taken per buffer added.  Returns the number of
 * bytes queued.
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
                              struct folio *folio, loff_t fpos, size_t size)
{
        size_t folio_off = offset_in_folio(folio, fpos);
        size_t page_off = folio_off % PAGE_SIZE;
        struct page *cur = folio_page(folio, folio_off / PAGE_SIZE);
        size_t queued = 0;

        /* Do not run past the end of the folio. */
        size = min(size, folio_size(folio) - folio_off);

        while (queued < size && !pipe_is_full(pipe)) {
                struct pipe_buffer *slot = pipe_head_buf(pipe);
                size_t chunk = min_t(size_t, PAGE_SIZE - page_off,
                                     size - queued);

                *slot = (struct pipe_buffer) {
                        .ops        = &page_cache_pipe_buf_ops,
                        .page        = cur,
                        .offset        = page_off,
                        .len        = chunk,
                };
                folio_get(folio);
                pipe->head++;

                /* Only the first page can start at a non-zero offset. */
                queued += chunk;
                cur++;
                page_off = 0;
        }

        return queued;
}

/**
 * filemap_splice_read -  Splice data from a file's pagecache into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function gets folios from a file's pagecache and splices them into the
 * pipe.  Readahead will be called as necessary to fill more folios.  This may
 * be used for blockdevs also.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
 */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
                            struct pipe_inode_info *pipe,
                            size_t len, unsigned int flags)
{
        struct folio_batch fbatch;
        struct kiocb iocb;
        size_t total_spliced = 0, used, npages;
        loff_t isize, end_offset;
        bool writably_mapped;
        int i, error = 0;

        if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
                return 0;

        /* A synchronous kiocb is used only to drive filemap_get_pages(). */
        init_sync_kiocb(&iocb, in);
        iocb.ki_pos = *ppos;

        /* Work out how much data we can actually add into the pipe */
        used = pipe_buf_usage(pipe);
        npages = max_t(ssize_t, pipe->max_usage - used, 0);
        len = min_t(size_t, len, npages * PAGE_SIZE);

        folio_batch_init(&fbatch);

        do {
                cond_resched();

                if (*ppos >= i_size_read(in->f_mapping->host))
                        break;

                iocb.ki_pos = *ppos;
                /* need_uptodate is true: the folios handed to the pipe must be fully uptodate. */
                error = filemap_get_pages(&iocb, len, &fbatch, true);
                if (error < 0)
                        break;

                /*
                 * i_size must be checked after we know the pages are Uptodate.
                 *
                 * Checking i_size after the check allows us to calculate
                 * the correct value for "nr", which means the zero-filled
                 * part of the page is not copied back to userspace (unless
                 * another truncate extends the file - this is desired though).
                 */
                isize = i_size_read(in->f_mapping->host);
                if (unlikely(*ppos >= isize))
                        break;
                end_offset = min_t(loff_t, isize, *ppos + len);

                /*
                 * Once we start copying data, we don't want to be touching any
                 * cachelines that might be contended:
                 */
                writably_mapped = mapping_writably_mapped(in->f_mapping);

                for (i = 0; i < folio_batch_count(&fbatch); i++) {
                        struct folio *folio = fbatch.folios[i];
                        size_t n;

                        /* This folio starts at or beyond the end of the range. */
                        if (folio_pos(folio) >= end_offset)
                                goto out;
                        folio_mark_accessed(folio);

                        /*
                         * If users can be writing to this folio using arbitrary
                         * virtual addresses, take care of potential aliasing
                         * before reading the folio on the kernel side.
                         */
                        if (writably_mapped)
                                flush_dcache_folio(folio);

                        /* Splice no more than is left of the request or the file. */
                        n = min_t(loff_t, len, isize - *ppos);
                        n = splice_folio_into_pipe(pipe, folio, *ppos, n);
                        if (!n)
                                goto out;
                        len -= n;
                        total_spliced += n;
                        *ppos += n;
                        in->f_ra.prev_pos = *ppos;
                        if (pipe_is_full(pipe))
                                goto out;
                }

                folio_batch_release(&fbatch);
        } while (len);

out:
        /* Reached from every exit; releases whatever is still in the batch. */
        folio_batch_release(&fbatch);
        file_accessed(in);

        /* An error is only reported when nothing was spliced. */
        return total_spliced ? total_spliced : error;
}
EXPORT_SYMBOL(filemap_splice_read);

/*
 * folio_seek_hole_data - Search one page cache entry for data or a hole.
 * @xas: XArray state of the caller's walk; paused before RCU is dropped.
 * @mapping: The address_space being searched.
 * @folio: The entry found by the caller (a folio or a value entry).
 * @start: File position at which to start looking.
 * @end: File position just past this entry.
 * @seek_data: True to look for data, false to look for a hole.
 *
 * A value entry or a fully uptodate folio counts as data for its whole
 * range.  A folio that is not uptodate counts as a hole for its whole range
 * unless the mapping provides ->is_partially_uptodate(), in which case the
 * folio is locked and examined one filesystem block at a time.
 *
 * Must be called under rcu_read_lock(); the lock is dropped and retaken
 * around the folio lock on the block-by-block path.
 *
 * Return: The position at which the search was satisfied, or a position
 * that is at least @end if this entry contains nothing of the sought kind.
 * If the folio no longer belongs to @mapping once locked, @start is
 * returned unchanged.
 */
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
                struct address_space *mapping, struct folio *folio,
                loff_t start, loff_t end, bool seek_data)
{
        const struct address_space_operations *ops = mapping->a_ops;
        size_t offset, bsz = i_blocksize(mapping->host);

        /* All data: a data seek matches at @start, a hole seek skips to @end. */
        if (xa_is_value(folio) || folio_test_uptodate(folio))
                return seek_data ? start : end;
        /* No per-block information: treat the whole folio as a hole. */
        if (!ops->is_partially_uptodate)
                return seek_data ? end : start;

        /* The walk must be paused before leaving the RCU read section. */
        xas_pause(xas);
        rcu_read_unlock();
        folio_lock(folio);
        if (unlikely(folio->mapping != mapping))
                goto unlock;

        /* Offset within the folio of the block that contains @start. */
        offset = offset_in_folio(folio, start) & ~(bsz - 1);

        do {
                /* Stop at the first block whose state is the one sought. */
                if (ops->is_partially_uptodate(folio, offset, bsz) ==
                                                        seek_data)
                        break;
                /* Move @start to the beginning of the next block. */
                start = (start + bsz) & ~((u64)bsz - 1);
                offset += bsz;
        } while (offset < folio_size(folio));
unlock:
        folio_unlock(folio);
        rcu_read_lock();
        return start;
}

/*
 * Size in bytes of the page cache entry @folio at the current position of
 * @xas.  A value entry has no folio to ask, so its size is derived from
 * the order of the XArray entry instead.
 */
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
        if (!xa_is_value(folio))
                return folio_size(folio);

        return PAGE_SIZE << xas_get_order(xas);
}

/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
 * entirely memory-based such as tmpfs, and filesystems which support
 * unwritten extents.
 *
 * Return: The requested offset on success, or -ENXIO if @whence specifies
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.  -ENXIO is also returned when @end <= @start.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
                loff_t end, int whence)
{
        XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
        pgoff_t max = (end - 1) >> PAGE_SHIFT;
        bool seek_data = (whence == SEEK_DATA);
        struct folio *folio;

        if (end <= start)
                return -ENXIO;

        rcu_read_lock();
        /* Visit every present entry from @start up to page index @max. */
        while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
                /* Byte position of the entry the walk is currently at. */
                loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
                size_t seek_size;

                /*
                 * Nothing is cached between @start and this entry.  That
                 * gap is a hole: a hole seek ends at @start, a data seek
                 * moves on to the beginning of this entry.
                 */
                if (start < pos) {
                        if (!seek_data)
                                goto unlock;
                        start = pos;
                }

                seek_size = seek_folio_size(&xas, folio);
                /* @pos now becomes the position just past this entry. */
                pos = round_up((u64)pos + 1, seek_size);
                start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
                                seek_data);
                /* Found what we were looking for inside this entry. */
                if (start < pos)
                        goto unlock;
                /* Ran past the limit; the entry is released after "unlock". */
                if (start >= end)
                        break;
                /* Skip the remaining indices of a multi-page entry. */
                if (seek_size > PAGE_SIZE)
                        xas_set(&xas, pos >> PAGE_SHIFT);
                if (!xa_is_value(folio))
                        folio_put(folio);
        }
        /* No data was found; for a hole seek @start is the answer. */
        if (seek_data)
                start = -ENXIO;
unlock:
        rcu_read_unlock();
        /* Drop the reference on the entry we stopped at, if it is a folio. */
        if (folio && !xa_is_value(folio))
                folio_put(folio);
        /* Never report a position beyond the search limit. */
        if (start > end)
                return end;
        return start;
}

#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
/*
 * lock_folio_maybe_drop_mmap - lock the folio, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @folio - the folio to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * Similar to lock_folio_or_retry in that the mmap_lock may be dropped, but
 * the result is reported differently: 1 means the folio is returned locked,
 * 0 means it could not be locked.  If the mmap_lock had to be dropped,
 * *@fpin points to the pinned file, which must be fput()'ed later.
 */
static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
                                     struct file **fpin)
{
        /* Fast path: the lock is free. */
        if (folio_trylock(folio))
                return 1;

        /*
         * NOTE! This will make us return with VM_FAULT_RETRY, but with
         * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
         * is supposed to work. We have way too many special cases..
         */
        if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
                return 0;

        *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);

        if (!(vmf->flags & FAULT_FLAG_KILLABLE)) {
                __folio_lock(folio);
                return 1;
        }

        if (!__folio_lock_killable(folio))
                return 1;

        /*
         * We didn't have the right flags to drop the fault lock, but all
         * fault_handlers only check for fatal signals if we return
         * VM_FAULT_RETRY, so we need to drop the fault lock here and
         * return 0 if we don't have a fpin.
         */
        if (*fpin == NULL)
                release_fault_lock(vmf);
        return 0;
}

/*
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
 *
 * Which kind of readahead is issued depends on the VMA flags:
 * VM_HUGEPAGE (when PMD-order folios fit in the page cache) reads PMD-sized
 * folios, VM_RAND_READ or a zero ra_pages reads nothing, VM_SEQ_READ does
 * ordinary synchronous readahead, and everything else does read-around
 * centred on the faulting page unless the file has missed too often.
 */
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        struct address_space *mapping = file->f_mapping;
        DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
        struct file *fpin = NULL;
        unsigned long vm_flags = vmf->vma->vm_flags;
        unsigned int mmap_miss;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* Use the readahead code, even if readahead is disabled */
        if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                /* Align the starting index down to a PMD boundary. */
                ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
                ra->size = HPAGE_PMD_NR;
                /*
                 * Fetch two PMD folios, so we get the chance to actually
                 * readahead, unless we've been told not to.
                 */
                if (!(vm_flags & VM_RAND_READ))
                        ra->size *= 2;
                ra->async_size = HPAGE_PMD_NR;
                page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
                return fpin;
        }
#endif

        /* If we don't want any read-ahead, don't bother */
        if (vm_flags & VM_RAND_READ)
                return fpin;
        if (!ra->ra_pages)
                return fpin;

        if (vm_flags & VM_SEQ_READ) {
                fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                page_cache_sync_ra(&ractl, ra->ra_pages);
                return fpin;
        }

        /* Avoid banging the cache line if not needed */
        mmap_miss = READ_ONCE(ra->mmap_miss);
        /* Count this miss; the counter stops growing at MMAP_LOTSAMISS * 10. */
        if (mmap_miss < MMAP_LOTSAMISS * 10)
                WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

        /*
         * Do we miss much more than hit in this file? If so,
         * stop bothering with read-ahead. It will only hurt.
         */
        if (mmap_miss > MMAP_LOTSAMISS)
                return fpin;

        /*
         * mmap read-around
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        /*
         * A window of ra_pages pages that starts ra_pages / 2 before the
         * faulting page (clamped at index 0); a quarter of the window is
         * set as the async size.
         */
        ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
        ra->size = ra->ra_pages;
        ra->async_size = ra->ra_pages / 4;
        ractl._index = ra->start;
        page_cache_ra_order(&ractl, ra, 0);
        return fpin;
}

/*
 * Asynchronous readahead: the faulting folio was found in the page cache.
 * If it carries the readahead flag we try to extend the readahead window
 * further.  Returns the file pinned by maybe_unlock_mmap_for_io() when
 * readahead was started, NULL otherwise.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
                                            struct folio *folio)
{
        struct file *file = vmf->vma->vm_file;
        struct file_ra_state *ra = &file->f_ra;
        DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
        struct file *fpin = NULL;
        unsigned int misses;

        /* Random access hint or readahead disabled: nothing to do. */
        if ((vmf->vma->vm_flags & VM_RAND_READ) || !ra->ra_pages)
                return NULL;

        /* This fault was a hit: decay the miss counter, but not below 0. */
        misses = READ_ONCE(ra->mmap_miss);
        if (misses > 0) {
                misses--;
                WRITE_ONCE(ra->mmap_miss, misses);
        }

        if (!folio_test_readahead(folio))
                return NULL;

        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        page_cache_async_ra(&ractl, folio, ra->ra_pages);
        return fpin;
}

static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        vm_fault_t ret = 0;
        pte_t *ptep;

        /*
         * We might have COW'ed a pagecache folio and might now have an mlocked
         * anon folio mapped. The original pagecache folio is not mlocked and
         * might have been evicted. During a read+clear/modify/write update of
         * the PTE, such as done in do_numa_page()/change_pte_range(), we
         * temporarily clear the PTE under PT lock and might detect it here as
         * "none" when not holding the PT lock.
         *
         * Not rechecking the PTE under PT lock could result in an unexpected
         * major fault in an mlock'ed region. Recheck only for this special
         * scenario while holding the PT lock, to not degrade non-mlocked
         * scenarios. Recheck the PTE without PT lock firstly, thereby reducing
         * the number of times we hold PT lock.
         */
        if (!(vma->vm_flags & VM_LOCKED))
                return 0;

        if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID))
                return 0;

        ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address,
                                        &vmf->ptl);
        if (unlikely(!ptep))
                return VM_FAULT_NOPAGE;

        if (unlikely(!pte_none(ptep_get_lockless(ptep)))) {
                ret = VM_FAULT_NOPAGE;
        } else {
                spin_lock(vmf->ptl);
                if (unlikely(!pte_none(ptep_get(ptep))))
                        ret = VM_FAULT_NOPAGE;
                spin_unlock(vmf->ptl);
        }
        pte_unmap(ptep);
        return ret;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:        struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
        int error;
        struct file *file = vmf->vma->vm_file;
        struct file *fpin = NULL;
        struct address_space *mapping = file->f_mapping;
        struct inode *inode = mapping->host;
        pgoff_t max_idx, index = vmf->pgoff;
        struct folio *folio;
        vm_fault_t ret = 0;
        /* Tracks whether we currently hold mapping's invalidate lock (shared). */
        bool mapping_locked = false;

        /* A fault at or beyond i_size (rounded up to a page) is a SIGBUS. */
        max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(index >= max_idx))
                return VM_FAULT_SIGBUS;

        trace_mm_filemap_fault(mapping, index);

        /*
         * Do we have something in the page cache already?
         */
        folio = filemap_get_folio(mapping, index);
        if (likely(!IS_ERR(folio))) {
                /*
                 * We found the page, so try async readahead before waiting for
                 * the lock.
                 * Readahead is skipped when FAULT_FLAG_TRIED is set.
                 */
                if (!(vmf->flags & FAULT_FLAG_TRIED))
                        fpin = do_async_mmap_readahead(vmf, folio);
                /*
                 * The page_not_uptodate path below runs with the invalidate
                 * lock held, so take it now, before the folio lock.
                 */
                if (unlikely(!folio_test_uptodate(folio))) {
                        filemap_invalidate_lock_shared(mapping);
                        mapping_locked = true;
                }
        } else {
                /* May short-circuit the fault with VM_FAULT_NOPAGE. */
                ret = filemap_fault_recheck_pte_none(vmf);
                if (unlikely(ret))
                        return ret;

                /* No page in the page cache at all */
                count_vm_event(PGMAJFAULT);
                count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
                fpin = do_sync_mmap_readahead(vmf);
                /*
                 * retry_find is also reached by goto from the code after this
                 * if/else; every such jump has already dropped its folio
                 * reference.
                 */
retry_find:
                /*
                 * See comment in filemap_create_folio() why we need
                 * invalidate_lock
                 */
                if (!mapping_locked) {
                        filemap_invalidate_lock_shared(mapping);
                        mapping_locked = true;
                }
                folio = __filemap_get_folio(mapping, index,
                                          FGP_CREAT|FGP_FOR_MMAP,
                                          vmf->gfp_mask);
                if (IS_ERR(folio)) {
                        /*
                         * With a pinned file we report RETRY rather than
                         * OOM; out_retry copes with folio being an ERR_PTR.
                         */
                        if (fpin)
                                goto out_retry;
                        filemap_invalidate_unlock_shared(mapping);
                        return VM_FAULT_OOM;
                }
        }

        /* A zero return means the folio is NOT locked and we must retry. */
        if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
                goto out_retry;

        /* Did it get truncated? */
        if (unlikely(folio->mapping != mapping)) {
                folio_unlock(folio);
                folio_put(folio);
                goto retry_find;
        }
        VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);

        /*
         * We have a locked folio in the page cache, now we need to check
         * that it's up-to-date. If not, it is going to be due to an error,
         * or because readahead was otherwise unable to retrieve it.
         */
        if (unlikely(!folio_test_uptodate(folio))) {
                /*
                 * If the invalidate lock is not held, the folio was in cache
                 * and uptodate and now it is not. Strange but possible since we
                 * didn't hold the page lock all the time. Let's drop
                 * everything, get the invalidate lock and try again.
                 */
                if (!mapping_locked) {
                        folio_unlock(folio);
                        folio_put(folio);
                        goto retry_find;
                }

                /*
                 * OK, the folio is really not uptodate. This can be because the
                 * VMA has the VM_RAND_READ flag set, or because an error
                 * arose. Let's read it in directly.
                 */
                goto page_not_uptodate;
        }

        /*
         * We've made it this far and we had to drop our mmap_lock, now is the
         * time to return to the upper layer and have it re-find the vma and
         * redo the fault.
         */
        if (fpin) {
                /* out_retry drops the folio reference for us. */
                folio_unlock(folio);
                goto out_retry;
        }
        if (mapping_locked)
                filemap_invalidate_unlock_shared(mapping);

        /*
         * Found the page and have a reference on it.
         * We must recheck i_size under page lock.
         */
        max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
        if (unlikely(index >= max_idx)) {
                folio_unlock(folio);
                folio_put(folio);
                return VM_FAULT_SIGBUS;
        }

        /* Success: hand back the still-locked, still-referenced page. */
        vmf->page = folio_file_page(folio, index);
        return ret | VM_FAULT_LOCKED;

page_not_uptodate:
        /*
         * Umm, take care of errors if the page isn't up-to-date.
         * Try to re-read it _once_. We do this synchronously,
         * because there really aren't any performance issues here
         * and we need to check for errors.
         *
         * NOTE(review): the folio is locked when we get here and this path
         * never calls folio_unlock(); presumably filemap_read_folio() takes
         * care of unlocking - confirm.
         */
        fpin = maybe_unlock_mmap_for_io(vmf, fpin);
        error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
        if (fpin)
                goto out_retry;
        /* Drop our reference; retry_find looks the folio up again. */
        folio_put(folio);

        if (!error || error == AOP_TRUNCATED_PAGE)
                goto retry_find;
        /* mapping_locked is always true on this path (see the goto above). */
        filemap_invalidate_unlock_shared(mapping);

        return VM_FAULT_SIGBUS;

out_retry:
        /*
         * We dropped the mmap_lock, we need to return to the fault handler to
         * re-find the vma and come back and find our hopefully still populated
         * page.
         *
         * folio is an ERR_PTR when we arrive from the __filemap_get_folio()
         * failure path, hence the IS_ERR() check before dropping the
         * reference.
         */
        if (!IS_ERR(folio))
                folio_put(folio);
        if (mapping_locked)
                filemap_invalidate_unlock_shared(mapping);
        if (fpin)
                fput(fpin);
        return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);

/*
 * Try to satisfy the fault at PMD level.
 *
 * Returns true when nothing is left to do for the caller; in that case the
 * folio has been unlocked and the caller's reference is gone (dropped here,
 * or consumed by the new mapping).  Returns false when the caller should go
 * on mapping individual PTEs; the folio is then left locked and referenced.
 */
static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
                pgoff_t start)
{
        /* A huge page is already mapped here: release the folio and stop. */
        if (pmd_trans_huge(*vmf->pmd)) {
                folio_unlock(folio);
                folio_put(folio);
                return true;
        }

        /*
         * Empty PMD and a PMD-mappable folio: try to install a huge mapping.
         * On success the mapping has consumed our folio reference.
         */
        if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio) &&
            !do_set_pmd(vmf, folio_file_page(folio, start))) {
                folio_unlock(folio);
                return true;
        }

        /* Still empty: install the preallocated page table, if there is one. */
        if (pmd_none(*vmf->pmd) && vmf->prealloc_pte)
                pmd_install(vmf->vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);

        return false;
}

/*
 * Walk the xarray from the current @xas position up to @end_pgoff and return
 * the first folio that could be locked without blocking and that is still
 * attached to @mapping, uptodate, and at an index below i_size.  Folios that
 * are locked, not uptodate or flagged for readahead are skipped.
 *
 * The folio is returned locked and with a reference taken here; NULL is
 * returned when no suitable folio is found.
 */
static struct folio *next_uptodate_folio(struct xa_state *xas,
                struct address_space *mapping, pgoff_t end_pgoff)
{
        struct folio *folio = xas_next_entry(xas, end_pgoff);
        unsigned long max_idx;

        /*
         * Note that "continue" in a do/while jumps to the loop condition,
         * which is what fetches the next entry.
         */
        do {
                if (!folio)
                        return NULL;
                if (xas_retry(xas, folio))
                        continue;
                /* Value entries are not folios; skip them. */
                if (xa_is_value(folio))
                        continue;
                /* No reference obtained, so nothing to put: plain continue. */
                if (!folio_try_get(folio))
                        continue;
                /* From here on we hold a reference: bail out via skip/unlock. */
                if (folio_test_locked(folio))
                        goto skip;
                /* Has the page moved or been split? */
                if (unlikely(folio != xas_reload(xas)))
                        goto skip;
                if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
                        goto skip;
                if (!folio_trylock(folio))
                        goto skip;
                /* Recheck mapping and uptodate now that the folio is locked. */
                if (folio->mapping != mapping)
                        goto unlock;
                if (!folio_test_uptodate(folio))
                        goto unlock;
                max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
                if (xas->xa_index >= max_idx)
                        goto unlock;
                return folio;
unlock:
                folio_unlock(folio);
skip:
                folio_put(folio);
        } while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);

        return NULL;
}

/*
 * Map page range [start_page, start_page + nr_pages) of folio.
 * start_page is gotten from start by folio_page(folio, start)
 *
 * PTEs are installed in batches: consecutive pages that are neither
 * hwpoisoned nor already covered by a non-none PTE are collected into a run
 * and handed to set_pte_range() in one call.  For every page mapped, one
 * folio reference is taken and *@rss is incremented.  *@mmap_miss is
 * incremented for each non-poisoned page visited when the folio does not
 * have the workingset flag set.
 *
 * Returns VM_FAULT_NOPAGE if vmf->address lies inside a run that was mapped,
 * 0 otherwise.  vmf->pte is advanced while walking and restored on return.
 */
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
                        struct folio *folio, unsigned long start,
                        unsigned long addr, unsigned int nr_pages,
                        unsigned long *rss, unsigned int *mmap_miss)
{
        vm_fault_t ret = 0;
        /* page/vmf->pte/addr always describe the start of the current run. */
        struct page *page = folio_page(folio, start);
        /* Number of mappable pages collected in the current run so far. */
        unsigned int count = 0;
        pte_t *old_ptep = vmf->pte;

        do {
                if (PageHWPoison(page + count))
                        goto skip;

                /*
                 * If there are too many folios that are recently evicted
                 * in a file, they will probably continue to be evicted.
                 * In such situation, read-ahead is only a waste of IO.
                 * Don't decrease mmap_miss in this scenario to make sure
                 * we can stop read-ahead.
                 */
                if (!folio_test_workingset(folio))
                        (*mmap_miss)++;

                /*
                 * NOTE: If there're PTE markers, we'll leave them to be
                 * handled in the specific fault path, and it'll prohibit the
                 * fault-around logic.
                 */
                if (!pte_none(ptep_get(&vmf->pte[count])))
                        goto skip;

                /* This page is mappable: extend the run and move on. */
                count++;
                continue;
skip:
                /* Flush the run collected before the page being skipped. */
                if (count) {
                        set_pte_range(vmf, folio, page, count, addr);
                        *rss += count;
                        folio_ref_add(folio, count);
                        if (in_range(vmf->address, addr, count * PAGE_SIZE))
                                ret = VM_FAULT_NOPAGE;
                }

                /*
                 * Step over the flushed run plus the skipped page itself,
                 * then start a new, empty run.
                 */
                count++;
                page += count;
                vmf->pte += count;
                addr += count * PAGE_SIZE;
                count = 0;
        } while (--nr_pages > 0);

        /* Flush the final run, if the range did not end on a skipped page. */
        if (count) {
                set_pte_range(vmf, folio, page, count, addr);
                *rss += count;
                folio_ref_add(folio, count);
                if (in_range(vmf->address, addr, count * PAGE_SIZE))
                        ret = VM_FAULT_NOPAGE;
        }

        /* Undo our pointer bumps so the caller sees vmf->pte unchanged. */
        vmf->pte = old_ptep;

        return ret;
}

/*
 * Single-page counterpart of filemap_map_folio_range(): map the one page of
 * @folio at @addr through vmf->pte.
 *
 * Returns VM_FAULT_NOPAGE if the page was mapped at the faulting address,
 * 0 otherwise (including when nothing was mapped).
 */
static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
                struct folio *folio, unsigned long addr,
                unsigned long *rss, unsigned int *mmap_miss)
{
        struct page *page = &folio->page;
        vm_fault_t ret;

        /* Never map a hwpoisoned page. */
        if (PageHWPoison(page))
                return 0;

        /* See comment of filemap_map_folio_range() */
        if (!folio_test_workingset(folio))
                *mmap_miss += 1;

        /*
         * NOTE: a PTE that is not none (PTE markers included) is left for
         * the specific fault path to handle; fault-around does not touch it.
         */
        if (!pte_none(ptep_get(vmf->pte)))
                return 0;

        ret = (vmf->address == addr) ? VM_FAULT_NOPAGE : 0;

        /* Install the PTE, account it and take the mapping's reference. */
        set_pte_range(vmf, folio, page, 1, addr);
        *rss += 1;
        folio_ref_inc(folio);

        return ret;
}

/*
 * Map folios that are already in the page cache for the file range
 * [@start_pgoff, @end_pgoff] into the faulting VMA without doing any I/O.
 *
 * Folios are picked with next_uptodate_folio() under rcu_read_lock().  A PMD
 * level mapping is tried first via filemap_map_pmd(); otherwise the PTE
 * table is mapped and locked and each folio is mapped page by page.  The
 * number of PTEs installed is added to the mm counter afterwards, and the
 * miss count accumulated by the mapping helpers is subtracted from
 * file->f_ra.mmap_miss (clamped at 0).
 *
 * Returns the OR of the helpers' results: VM_FAULT_NOPAGE when the PMD path
 * handled the fault or a helper reported mapping the faulting address,
 * 0 otherwise.
 */
vm_fault_t filemap_map_pages(struct vm_fault *vmf,
                             pgoff_t start_pgoff, pgoff_t end_pgoff)
{
        struct vm_area_struct *vma = vmf->vma;
        struct file *file = vma->vm_file;
        struct address_space *mapping = file->f_mapping;
        pgoff_t file_end, last_pgoff = start_pgoff;
        unsigned long addr;
        XA_STATE(xas, &mapping->i_pages, start_pgoff);
        struct folio *folio;
        vm_fault_t ret = 0;
        unsigned long rss = 0;
        unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved, folio_type;

        rcu_read_lock();
        /* On success the folio comes back locked and referenced. */
        folio = next_uptodate_folio(&xas, mapping, end_pgoff);
        if (!folio)
                goto out;

        /* A true return means the folio has already been released for us. */
        if (filemap_map_pmd(vmf, folio, start_pgoff)) {
                ret = VM_FAULT_NOPAGE;
                goto out;
        }

        /* User address corresponding to start_pgoff within this VMA. */
        addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
        if (!vmf->pte) {
                folio_unlock(folio);
                folio_put(folio);
                goto out;
        }

        /* Never map beyond the last page covered by i_size. */
        file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
        if (end_pgoff > file_end)
                end_pgoff = file_end;

        /* The counter type is taken from the first folio only. */
        folio_type = mm_counter_file(folio);
        do {
                unsigned long end;

                /* Move addr and vmf->pte forward to the current xarray index. */
                addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                vmf->pte += xas.xa_index - last_pgoff;
                last_pgoff = xas.xa_index;
                /* Pages to map: up to the folio's end or end_pgoff. */
                end = folio_next_index(folio) - 1;
                nr_pages = min(end, end_pgoff) - xas.xa_index + 1;

                if (!folio_test_large(folio))
                        ret |= filemap_map_order0_folio(vmf,
                                        folio, addr, &rss, &mmap_miss);
                else
                        ret |= filemap_map_folio_range(vmf, folio,
                                        xas.xa_index - folio->index, addr,
                                        nr_pages, &rss, &mmap_miss);

                /* Release the lock and reference from next_uptodate_folio(). */
                folio_unlock(folio);
                folio_put(folio);
        } while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
        add_mm_counter(vma->vm_mm, folio_type, rss);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
out:
        rcu_read_unlock();

        /* Subtract this call's count from the file's counter, floor at 0. */
        mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
        if (mmap_miss >= mmap_miss_saved)
                WRITE_ONCE(file->f_ra.mmap_miss, 0);
        else
                WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

        return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        struct address_space *mapping = vmf->vma->vm_file->f_mapping;
        struct folio *folio = page_folio(vmf->page);
        vm_fault_t ret = VM_FAULT_LOCKED;

        sb_start_pagefault(mapping->host->i_sb);
        file_update_time(vmf->vma->vm_file);
        folio_lock(folio);
        if (folio->mapping != mapping) {
                folio_unlock(folio);
                ret = VM_FAULT_NOPAGE;
                goto out;
        }
        /*
         * We mark the folio dirty already here so that when freeze is in
         * progress, we are guaranteed that writeback during freezing will
         * see the dirty folio and writeprotect it again.
         */
        folio_mark_dirty(folio);
        folio_wait_stable(folio);
out:
        sb_end_pagefault(mapping->host->i_sb);
        return ret;
}

/*
 * VMA operations installed by generic_file_mmap(): all three handlers are
 * the generic page cache implementations defined in this file.
 */
const struct vm_operations_struct generic_file_vm_ops = {
        .fault                = filemap_fault,
        .map_pages        = filemap_map_pages,
        .page_mkwrite        = filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct address_space *mapping = file->f_mapping;

        if (!mapping->a_ops->read_folio)
                return -ENOEXEC;
        file_accessed(file);
        vma->vm_ops = &generic_file_vm_ops;
        return 0;
}

/*
 * mmap helper for filesystems which do not implement ->writepage.
 *
 * A VMA for which vma_is_shared_maywrite() is true is refused with -EINVAL;
 * anything else is handed on to generic_file_mmap().
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return vma_is_shared_maywrite(vma) ? -EINVAL :
                                             generic_file_mmap(file, vma);
}
#else
/*
 * Stub for the #else branch of the CONFIG_MMU conditional: write faults on
 * file mappings are not supported here and always yield VM_FAULT_SIGBUS.
 */
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
        return VM_FAULT_SIGBUS;
}
/* Stub for the #else branch of the CONFIG_MMU conditional: always -ENOSYS. */
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        return -ENOSYS;
}
/* Stub for the #else branch of the CONFIG_MMU conditional: always -ENOSYS. */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
        return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);

/*
 * Common helper behind read_cache_folio(), mapping_read_folio_gfp() and
 * do_read_cache_page(): look up the folio caching @index in @mapping,
 * creating it with @gfp if it is missing, and read it in through @filler
 * (mapping->a_ops->read_folio when @filler is NULL) if it is not uptodate.
 *
 * Returns the folio after marking it accessed, or an ERR_PTR() on failure.
 * The folio reference is only dropped on the retry and error paths, so the
 * folio handed back to the caller is still referenced.
 */
static struct folio *do_read_cache_folio(struct address_space *mapping,
                pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
        struct folio *folio;
        int err;

        if (!filler)
                filler = mapping->a_ops->read_folio;
repeat:
        folio = filemap_get_folio(mapping, index);
        if (IS_ERR(folio)) {
                /* Not cached: allocate a folio and insert it ourselves. */
                folio = filemap_alloc_folio(gfp,
                                            mapping_min_folio_order(mapping));
                if (!folio)
                        return ERR_PTR(-ENOMEM);
                index = mapping_align_index(mapping, index);
                err = filemap_add_folio(mapping, folio, index, gfp);
                if (unlikely(err)) {
                        folio_put(folio);
                        /* -EEXIST: redo the lookup to pick up the existing entry. */
                        if (err == -EEXIST)
                                goto repeat;
                        /* Presumably ENOMEM for xarray node */
                        return ERR_PTR(err);
                }

                /*
                 * "filler" here is the label below, not the parameter of the
                 * same name (labels live in their own namespace).
                 * NOTE(review): no explicit lock is taken on this path;
                 * presumably filemap_add_folio() leaves the new folio locked
                 * - confirm.
                 */
                goto filler;
        }
        /* Lockless fast path: already uptodate, nothing to read. */
        if (folio_test_uptodate(folio))
                goto out;

        /* Someone else holds the lock: drop our ref, wait, and start over. */
        if (!folio_trylock(folio)) {
                folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
                goto repeat;
        }

        /* Folio was truncated from mapping */
        if (!folio->mapping) {
                folio_unlock(folio);
                folio_put(folio);
                goto repeat;
        }

        /* Someone else locked and filled the page in a very small window */
        if (folio_test_uptodate(folio)) {
                folio_unlock(folio);
                goto out;
        }

filler:
        /*
         * NOTE(review): the folio is locked at this point and is not
         * unlocked afterwards in this function; presumably
         * filemap_read_folio() unlocks it - confirm.
         */
        err = filemap_read_folio(file, filler, folio);
        if (err) {
                folio_put(folio);
                if (err == AOP_TRUNCATED_PAGE)
                        goto repeat;
                return ERR_PTR(err);
        }

out:
        folio_mark_accessed(folio);
        return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Read one page into the page cache.  If it succeeds, the folio returned
 * will contain @index, but it may not be the first page of the folio.
 *
 * If the filler function returns an error, it will be returned to the
 * caller.
 *
 * Context: May sleep.  Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
                filler_t filler, struct file *file)
{
        /* New folios are allocated with the mapping's own gfp mask. */
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_folio(mapping, index, filler, file, gfp);
}
EXPORT_SYMBOL(read_cache_folio);

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping:        The address_space for the folio.
 * @index:        The index that the allocated folio will contain.
 * @gfp:        The page allocator flags to use if allocating.
 *
 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
 * any new memory allocations done using the specified allocation flags.
 *
 * The most likely error from this function is EIO, but ENOMEM is
 * possible and so is EINTR.  If ->read_folio returns another error,
 * that will be returned to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
        /*
         * A NULL filler makes do_read_cache_folio() fall back to
         * mapping->a_ops->read_folio; no struct file is passed along.
         */
        return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

/*
 * Page-returning wrapper around do_read_cache_folio(): on success, returns
 * the page of the folio that corresponds to @index; on failure, passes the
 * error pointer through.
 */
static struct page *do_read_cache_page(struct address_space *mapping,
                pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
        struct folio *folio = do_read_cache_folio(mapping, index, filler,
                                                  file, gfp);

        /*
         * NOTE(review): for an error pointer, &folio->page is only the same
         * pointer value if page sits at offset 0 of struct folio - confirm.
         */
        return IS_ERR(folio) ? &folio->page : folio_file_page(folio, index);
}

/*
 * Page-based variant of read_cache_folio(): same lookup and fill logic via
 * do_read_cache_page(), with new pages allocated using the mapping's own
 * gfp mask.
 */
struct page *read_cache_page(struct address_space *mapping,
                        pgoff_t index, filler_t *filler, struct file *file)
{
        gfp_t gfp = mapping_gfp_mask(mapping);

        return do_read_cache_page(mapping, index, filler, file, gfp);
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:        the page's address_space
 * @index:        the page index
 * @gfp:        the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
                                pgoff_t index,
                                gfp_t gfp)
{
        /*
         * NULL filler: do_read_cache_folio() then uses
         * mapping->a_ops->read_folio.  No struct file is passed along.
         */
        return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Report that the page cache could not be invalidated for a direct I/O
 * write.  -EIO is always recorded in the mapping's wb_err; the log message
 * itself is rate limited.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
        static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
        char buf[128];
        char *name;

        errseq_set(&filp->f_mapping->wb_err, -EIO);

        /* Record the error every time, but only log when the limiter allows. */
        if (!__ratelimit(&_rs))
                return;

        name = file_path(filp, buf, sizeof(buf));
        if (IS_ERR(name))
                name = "(unknown)";

        pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
        pr_crit("File: %s PID: %d Comm: %.20s\n", name, current->pid,
                current->comm);
}

/*
 * Invalidate the page cache pages covering the @count bytes starting at
 * iocb->ki_pos after a direct write.  Nothing is done when the mapping
 * holds no pages; a failed invalidation is reported through
 * dio_warn_stale_pagecache().
 */
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;

        if (!mapping->nrpages)
                return;

        /* Page indices of the first and last byte of the written range. */
        if (invalidate_inode_pages2_range(mapping,
                                          iocb->ki_pos >> PAGE_SHIFT,
                                          (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
                dio_warn_stale_pagecache(iocb->ki_filp);
}

/*
 * generic_file_direct_write - perform a direct I/O write via ->direct_IO().
 * @iocb:        kiocb describing the file and the write position
 * @from:        iov_iter holding the data to write
 *
 * The page cache over the target range is invalidated up front with
 * kiocb_invalidate_pages().  If that reports -EBUSY, 0 is returned so that
 * the caller can fall back to a buffered write; any other non-zero result
 * is returned unchanged.
 *
 * After a positive ->direct_IO() result the written range is invalidated
 * once more, i_size is extended when the write ended past it (never for
 * block devices) and iocb->ki_pos is advanced.
 *
 * Return: the value returned by ->direct_IO(), or one of the early-exit
 * values described above.
 */
ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        size_t write_len = iov_iter_count(from);
        ssize_t written;

        /*
         * A page that can not be invalidated (-EBUSY) means we return 0
         * to fall back to buffered write.
         */
        written = kiocb_invalidate_pages(iocb, write_len);
        if (written == -EBUSY)
                return 0;
        if (written)
                return written;

        written = mapping->a_ops->direct_IO(iocb, from);

        /*
         * Try once more to invalidate clean pages which might have been
         * cached by non-direct readahead, or faulted in by get_user_pages()
         * if the source of the write was an mmap'ed region of the file
         * we're writing.  Either one is a pretty crazy thing to do, so it
         * is not supported 100%.  If this invalidation fails, tough, the
         * write still worked...
         *
         * Most of the time this is not needed since dio_complete() will do
         * the invalidation for us. However there are some file systems that
         * do not end up with dio_complete() being called, so it is kept to
         * avoid breaking them.
         *
         * Noticeable example is a blkdev_direct_IO().
         *
         * Only a positive result gets here, so async writes are skipped;
         * the helper itself skips mappings that have no pages.
         */
        if (written > 0) {
                struct inode *inode = mapping->host;
                loff_t end = iocb->ki_pos + written;

                kiocb_invalidate_post_direct_write(iocb, written);
                write_len -= written;
                if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
                        i_size_write(inode, end);
                        mark_inode_dirty(inode);
                }
                iocb->ki_pos = end;
        }

        /*
         * Unless the I/O was queued, rewind the iterator by however far it
         * was advanced beyond the bytes accounted for as written.
         */
        if (written != -EIOCBQUEUED)
                iov_iter_revert(from, write_len - iov_iter_count(from));
        return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

/*
 * generic_perform_write - buffered write loop.
 * @iocb:        kiocb supplying the file and the starting position (ki_pos)
 * @i:                iov_iter holding the data to copy
 *
 * Repeats ->write_begin(), an atomic copy from @i into the returned folio,
 * and ->write_end() until @i is drained or an error stops the loop.  Each
 * pass handles at most 'chunk' bytes; 'chunk' starts at
 * mapping_max_folio_size() and is halved (while above PAGE_SIZE) whenever
 * ->write_end() accepts nothing.
 *
 * Return: the number of bytes written if any were written (iocb->ki_pos is
 * advanced by that amount), otherwise the last status (0 or a negative
 * error code).
 */
ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
        struct file *file = iocb->ki_filp;
        loff_t pos = iocb->ki_pos;
        struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        size_t chunk = mapping_max_folio_size(mapping);
        long status = 0;
        ssize_t written = 0;

        do {
                struct folio *folio;
                size_t offset;                /* Offset into folio */
                size_t bytes;                /* Bytes to write to folio */
                size_t copied;                /* Bytes copied from user */
                void *fsdata = NULL;

                bytes = iov_iter_count(i);
retry:
                /*
                 * Clamp this pass to the end of the current chunk.  The mask
                 * assumes 'chunk' is a power of two - TODO confirm against
                 * mapping_max_folio_size().
                 */
                offset = pos & (chunk - 1);
                bytes = min(chunk - offset, bytes);
                balance_dirty_pages_ratelimited(mapping);

                if (fatal_signal_pending(current)) {
                        status = -EINTR;
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes,
                                                &folio, &fsdata);
                if (unlikely(status < 0))
                        break;

                /*
                 * Recompute the offset against the folio actually returned
                 * and clamp 'bytes' so the copy stays inside that folio.
                 */
                offset = offset_in_folio(folio, pos);
                if (bytes > folio_size(folio) - offset)
                        bytes = folio_size(folio) - offset;

                if (mapping_writably_mapped(mapping))
                        flush_dcache_folio(folio);

                /*
                 * Faults here on mmap()s can recurse into arbitrary
                 * filesystem code. Lots of locks are held that can
                 * deadlock. Use an atomic copy to avoid deadlocking
                 * in page fault handling.
                 */
                copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
                flush_dcache_folio(folio);

                status = a_ops->write_end(file, mapping, pos, bytes, copied,
                                                folio, fsdata);
                if (unlikely(status != copied)) {
                        /*
                         * ->write_end() did not accept everything that was
                         * copied: rewind the iterator by the difference (by
                         * all of 'copied' when status is negative).
                         */
                        iov_iter_revert(i, copied - max(status, 0L));
                        if (unlikely(status < 0))
                                break;
                }
                cond_resched();

                if (unlikely(status == 0)) {
                        /*
                         * A short copy made ->write_end() reject the
                         * thing entirely.  Might be memory poisoning
                         * halfway through, might be a race with munmap,
                         * might be severe memory pressure.
                         */
                        if (chunk > PAGE_SIZE)
                                chunk /= 2;
                        if (copied) {
                                /* Retry with only what could be copied. */
                                bytes = copied;
                                goto retry;
                        }

                        /*
                         * 'folio' is now unlocked and faults on it can be
                         * handled. Ensure forward progress by trying to
                         * fault it in now.
                         *
                         * NOTE(review): the comparison with 'bytes'
                         * presumably means nothing could be faulted in -
                         * confirm against fault_in_iov_iter_readable().
                         */
                        if (fault_in_iov_iter_readable(i, bytes) == bytes) {
                                status = -EFAULT;
                                break;
                        }
                } else {
                        /* Progress was made: account for the accepted bytes. */
                        pos += status;
                        written += status;
                }
        } while (iov_iter_count(i));

        /* Nothing written at all: report the last status instead. */
        if (!written)
                return status;
        iocb->ki_pos += written;
        return written;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:        IO state structure (file, offset, etc.)
 * @from:        iov_iter with data to write
 *
 * Does the work of actually writing data to a file: SUID is removed from
 * the file, the modification times are updated, and the write is then
 * dispatched to either the direct IO path or the standard buffered path.
 *
 * i_rwsem is expected to be held by the caller, unless the target is a
 * block device or a similar object which does not need locking at all.
 *
 * Syncing data for an O_SYNC write is *not* handled here; the caller has to
 * take care of it.  This is mainly because syncing under i_rwsem is to be
 * avoided.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *host = file->f_mapping->host;
        ssize_t ret;

        /* Either preparation step failing aborts the write. */
        ret = file_remove_privs(file);
        if (!ret)
                ret = file_update_time(file);
        if (ret)
                return ret;

        /* Plain buffered write. */
        if (!(iocb->ki_flags & IOCB_DIRECT))
                return generic_perform_write(iocb, from);

        ret = generic_file_direct_write(iocb, from);

        /*
         * If the write stopped short of completing, fall back to
         * buffered writes.  Some filesystems do this for writes to
         * holes, for example.  For DAX files, a buffered write will
         * not succeed (even if it did, DAX does not handle dirty
         * page-cache pages correctly).
         */
        if (ret < 0 || !iov_iter_count(from) || IS_DAX(host))
                return ret;

        return direct_write_fallback(iocb, from, ret,
                                     generic_perform_write(iocb, from));
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:        IO state structure
 * @from:        iov_iter with data to write
 *
 * Wrapper around __generic_file_write_iter() meant to be used by most
 * filesystems.  It takes the inode lock around generic_write_checks() and
 * the write itself, and afterwards hands a positive result to
 * generic_write_sync() so that O_SYNC files get synced.
 *
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = iocb->ki_filp->f_mapping->host;
        ssize_t ret;

        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = __generic_file_write_iter(iocb, from);
        inode_unlock(inode);

        /* Nothing written (or an error): there is nothing to sync. */
        if (ret <= 0)
                return ret;

        /* The sync is issued only after the inode lock has been dropped. */
        return generic_write_sync(iocb, ret);
}
EXPORT_SYMBOL(generic_file_write_iter);

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is asked to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This is also called if the private_2 flag is set on a page, indicating
 * that the folio has other metadata associated with it.
 *
 * @gfp says whether I/O may be performed to release this page (__GFP_IO),
 * and whether the call may block (__GFP_RECLAIM & __GFP_FS).
 *
 * The folio must be locked.  A folio that needs no release succeeds
 * immediately, and a folio under writeback is never released.
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
        struct address_space * const mapping = folio->mapping;

        BUG_ON(!folio_test_locked(folio));

        if (!folio_needs_release(folio))
                return true;
        if (folio_test_writeback(folio))
                return false;

        /* No mapping, or no ->release_folio(): fall back to buffer freeing. */
        if (!mapping || !mapping->a_ops->release_folio)
                return try_to_free_buffers(folio);

        return mapping->a_ops->release_folio(folio, gfp);
}
EXPORT_SYMBOL(filemap_release_folio);

/**
 * filemap_invalidate_inode - Invalidate/forcibly write back a range of an inode's pagecache
 * @inode: The inode to flush
 * @flush: Set to write back rather than simply invalidate.
 * @start: First byte in range.
 * @end: Last byte in range (inclusive), or LLONG_MAX for everything from start
 *       onwards.
 *
 * Invalidate all the folios on an inode that contribute to the specified
 * range, possibly writing them back first.  The invalidate lock is held for
 * the duration of the operation so that no new folios can be installed.
 *
 * Return: the result of filemap_check_errors() on the inode's mapping.
 */
int filemap_invalidate_inode(struct inode *inode, bool flush,
                             loff_t start, loff_t end)
{
        struct address_space *mapping = inode->i_mapping;
        pgoff_t first = start >> PAGE_SHIFT;
        pgoff_t last = end >> PAGE_SHIFT;
        pgoff_t nr;

        /* LLONG_MAX means "to the end": unmap every page from @first on. */
        if (end == LLONG_MAX)
                nr = ULONG_MAX;
        else
                nr = last - first + 1;

        if (!mapping || !mapping->nrpages || end < start)
                goto out;

        /* Prevent new folios from being added to the inode. */
        filemap_invalidate_lock(mapping);

        /* Look again now that the lock is held. */
        if (mapping->nrpages) {
                unmap_mapping_pages(mapping, first, nr, false);

                /* Write back the data if we're asked to. */
                if (flush) {
                        struct writeback_control wbc = {
                                .sync_mode        = WB_SYNC_ALL,
                                .nr_to_write        = LONG_MAX,
                                .range_start        = start,
                                .range_end        = end,
                        };

                        filemap_fdatawrite_wbc(mapping, &wbc);
                }

                /* Wait for writeback to complete on all folios and discard. */
                invalidate_inode_pages2_range(mapping, start / PAGE_SIZE,
                                              end / PAGE_SIZE);
        }

        filemap_invalidate_unlock(mapping);
out:
        return filemap_check_errors(mapping);
}
EXPORT_SYMBOL_GPL(filemap_invalidate_inode);

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:        The mapping to compute the statistics for.
 * @first_index:        The starting page cache index.
 * @last_index:        The final page index (inclusive).
 * @cs:        the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 *
 * The counters in @cs are only ever incremented here, never reset, so the
 * caller has to zero @cs beforehand to get per-call totals.
 */
static void filemap_cachestat(struct address_space *mapping,
                pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
        XA_STATE(xas, &mapping->i_pages, first_index);
        struct folio *folio;

        /* Flush stats (and potentially sleep) outside the RCU read section. */
        mem_cgroup_flush_stats_ratelimited(NULL);

        rcu_read_lock();
        xas_for_each(&xas, folio, last_index) {
                int order;
                unsigned long nr_pages;
                pgoff_t folio_first_index, folio_last_index;

                /*
                 * Don't deref the folio. It is not pinned, and might
                 * get freed (and reused) underneath us.
                 *
                 * We *could* pin it, but that would be expensive for
                 * what should be a fast and lightweight syscall.
                 *
                 * Instead, derive all information of interest from
                 * the rcu-protected xarray.
                 */

                if (xas_retry(&xas, folio))
                        continue;

                /*
                 * Work out the index range covered by this entry from its
                 * order as recorded in the xarray.
                 */
                order = xas_get_order(&xas);
                nr_pages = 1 << order;
                folio_first_index = round_down(xas.xa_index, 1 << order);
                folio_last_index = folio_first_index + nr_pages - 1;

                /* Folios might straddle the range boundaries, only count covered pages */
                if (folio_first_index < first_index)
                        nr_pages -= first_index - folio_first_index;

                if (folio_last_index > last_index)
                        nr_pages -= folio_last_index - last_index;

                /* A value entry (not a folio pointer) is counted as evicted. */
                if (xa_is_value(folio)) {
                        /* page is evicted */
                        void *shadow = (void *)folio;
                        bool workingset; /* not used */

                        cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
                        if (shmem_mapping(mapping)) {
                                /* shmem file - in swap cache */
                                swp_entry_t swp = radix_to_swp_entry(folio);

                                /* swapin error results in poisoned entry */
                                if (non_swap_entry(swp))
                                        goto resched;

                                /*
                                 * Getting a swap entry from the shmem
                                 * inode means we beat
                                 * shmem_unuse(). rcu_read_lock()
                                 * ensures swapoff waits for us before
                                 * freeing the swapper space. However,
                                 * we can race with swapping and
                                 * invalidation, so there might not be
                                 * a shadow in the swapcache (yet).
                                 */
                                shadow = get_shadow_from_swap_cache(swp);
                                if (!shadow)
                                        goto resched;
                        }
#endif
                        if (workingset_test_recent(shadow, true, &workingset, false))
                                cs->nr_recently_evicted += nr_pages;

                        goto resched;
                }

                /* page is in cache */
                cs->nr_cache += nr_pages;

                /* Dirty and writeback state are read from the xarray marks. */
                if (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY))
                        cs->nr_dirty += nr_pages;

                if (xas_get_mark(&xas, PAGECACHE_TAG_WRITEBACK))
                        cs->nr_writeback += nr_pages;

resched:
                /* Pause the walk before yielding so it can resume afterwards. */
                if (need_resched()) {
                        xas_pause(&xas);
                        cond_resched_rcu();
                }
        }
        rcu_read_unlock();
}

/*
 * Same policy as mincore: pagecache information is revealed only for
 * files the calling process has write access to, or could (if it tried)
 * open for writing.
 *
 * The checks run in order and stop at the first one that grants access:
 * the file is open for writing, the caller owns the inode or is capable,
 * or a MAY_WRITE permission check succeeds.
 */
static inline bool can_do_cachestat(struct file *f)
{
        return (f->f_mode & FMODE_WRITE) ||
               inode_owner_or_capable(file_mnt_idmap(f), file_inode(f)) ||
               file_permission(f, MAY_WRITE) == 0;
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * bytes range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that is previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried byte range is [`off`, `off` + `len` - 1]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. User should pass 0 (i.e no flag specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 *  -EPERM      - caller is not allowed to query this file
 *                (see can_do_cachestat())
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
                struct cachestat_range __user *, cstat_range,
                struct cachestat __user *, cstat, unsigned int, flags)
{
        CLASS(fd, f)(fd);
        struct file *file;
        struct cachestat_range csr;
        struct cachestat cs;
        pgoff_t first_index, last_index;

        if (fd_empty(f))
                return -EBADF;
        file = fd_file(f);

        if (copy_from_user(&csr, cstat_range, sizeof(csr)))
                return -EFAULT;

        /* hugetlbfs is not supported */
        if (is_file_hugepages(file))
                return -EOPNOTSUPP;

        if (!can_do_cachestat(file))
                return -EPERM;

        if (flags != 0)
                return -EINVAL;

        /* Turn the byte range into an inclusive range of page indices. */
        first_index = csr.off >> PAGE_SHIFT;
        if (csr.len == 0)
                last_index = ULONG_MAX;
        else
                last_index = (csr.off + csr.len - 1) >> PAGE_SHIFT;

        /* filemap_cachestat() only adds to the counters, so start from zero. */
        memset(&cs, 0, sizeof(cs));
        filemap_cachestat(file->f_mapping, first_index, last_index, &cs);

        if (copy_to_user(cstat, &cs, sizeof(cs)))
                return -EFAULT;

        return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 


    3 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
// SPDX-License-Identifier: GPL-1.0+
/*
 * originally based on the dummy device.
 *
 * Copyright 1999, Thomas Davis, tadavis@lbl.gov.
 * Based on dummy.c, and eql.c devices.
 *
 * bonding.c: an Ethernet Bonding driver
 *
 * This is useful to talk to Cisco EtherChannel compatible equipment:
 *        Cisco 5500
 *        Sun Trunking (Solaris)
 *        Alteon AceDirector Trunks
 *        Linux Bonding
 *        and probably many L2 switches ...
 *
 * How it works:
 *    ifconfig bond0 ipaddress netmask up
 *      will set up a network device, with an ip address.  No mac address
 *        will be assigned at this time.  The hw mac address will come from
 *        the first slave bonded to the channel.  All slaves will then use
 *        this hw mac address.
 *
 *    ifconfig bond0 down
 *         will release all slaves, marking them as down.
 *
 *    ifenslave bond0 eth0
 *        will attach eth0 to bond0 as a slave.  eth0 hw mac address will either
 *        a: be used as initial mac address
 *        b: if a hw mac address already is there, eth0's hw mac address
 *           will then be set from bond0.
 *
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/filter.h>
#include <linux/interrupt.h>
#include <linux/ptrace.h>
#include <linux/ioport.h>
#include <linux/in.h>
#include <net/ip.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/socket.h>
#include <linux/ctype.h>
#include <linux/inet.h>
#include <linux/bitops.h>
#include <linux/io.h>
#include <asm/dma.h>
#include <linux/uaccess.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/smp.h>
#include <linux/if_ether.h>
#include <net/arp.h>
#include <linux/mii.h>
#include <linux/ethtool.h>
#include <linux/if_vlan.h>
#include <linux/if_bonding.h>
#include <linux/phy.h>
#include <linux/jiffies.h>
#include <linux/preempt.h>
#include <net/route.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/pkt_sched.h>
#include <linux/rculist.h>
#include <net/flow_dissector.h>
#include <net/xfrm.h>
#include <net/bonding.h>
#include <net/bond_3ad.h>
#include <net/bond_alb.h>
#if IS_ENABLED(CONFIG_TLS_DEVICE)
#include <net/tls.h>
#endif
#include <net/ip6_route.h>
#include <net/netdev_lock.h>
#include <net/xdp.h>

#include "bonding_priv.h"

/*---------------------------- Module parameters ----------------------------*/

/* Backing storage for the bonding module parameters.  The comments on the
 * individual variables paraphrase the MODULE_PARM_DESC text of the
 * corresponding module_param() registration.
 */

/* Max number of bonded devices. */
static int max_bonds        = BOND_DEFAULT_MAX_BONDS;
/* Max number of transmit queues. */
static int tx_queues        = BOND_DEFAULT_TX_QUEUES;
/* Number of peer notifications to send on a failover event.  Registered
 * under two parameter names: num_grat_arp and num_unsol_na.
 */
static int num_peer_notif = 1;
/* Link check interval: monitor all links that often (in milliseconds).
 * <=0 disables monitoring.
 */
static int miimon;
/* Delay, in milliseconds, before considering a link up. */
static int updelay;
/* Delay, in milliseconds, before considering a link down. */
static int downdelay;
/* Whether miimon uses netif_carrier_ok rather than MII ioctls;
 * 0 for off, 1 for on (the default).
 */
static int use_carrier        = 1;
/* Mode of operation, supplied as a string (charp parameter). */
static char *mode;
/* Primary network device to use. */
static char *primary;
/* Policy for reselecting the primary slave once it comes up. */
static char *primary_reselect;
/* NOTE(review): the variables from here on are presumably registered and
 * described by further module_param()/MODULE_PARM_DESC entries later in the
 * file - confirm their meaning there rather than inferring it from the name.
 */
static char *lacp_rate;
static int min_links;
static char *ad_select;
static char *xmit_hash_policy;
static int arp_interval;
/* Array of BOND_MAX_ARP_TARGETS string pointers. */
static char *arp_ip_target[BOND_MAX_ARP_TARGETS];
static char *arp_validate;
static char *arp_all_targets;
static char *fail_over_mac;
static int all_slaves_active;
/* NOTE(review): presumably the parsed/validated form of the parameters
 * above; it is only declared here, so verify against its users.
 */
static struct bond_params bonding_defaults;
static int resend_igmp = BOND_DEFAULT_RESEND_IGMP;
static int packets_per_slave = 1;
static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;

/* Registration of the module parameters together with their modinfo
 * descriptions.  Only the two names that alias num_peer_notif
 * (num_grat_arp / num_unsol_na) are registered with mode 0644; every
 * other parameter uses a permission value of 0.
 */
module_param(max_bonds, int, 0);
MODULE_PARM_DESC(max_bonds, "Max number of bonded devices");
module_param(tx_queues, int, 0);
MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)");
module_param_named(num_grat_arp, num_peer_notif, int, 0644);
MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on "
                               "failover event (alias of num_unsol_na)");
module_param_named(num_unsol_na, num_peer_notif, int, 0644);
MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on "
                               "failover event (alias of num_grat_arp)");
module_param(miimon, int, 0);
MODULE_PARM_DESC(miimon, "Link check interval in milliseconds");
module_param(updelay, int, 0);
MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds");
module_param(downdelay, int, 0);
MODULE_PARM_DESC(downdelay, "Delay before considering link down, "
                            "in milliseconds");
module_param(use_carrier, int, 0);
MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; "
                              "0 for off, 1 for on (default)");
module_param(mode, charp, 0);
MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, "
                       "1 for active-backup, 2 for balance-xor, "
                       "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, "
                       "6 for balance-alb");
module_param(primary, charp, 0);
MODULE_PARM_DESC(primary, "Primary network device to use");
module_param(primary_reselect, charp, 0);
MODULE_PARM_DESC(primary_reselect, "Reselect primary slave "
                                   "once it comes up; "
                                   "0 for always (default), "
                                   "1 for only if speed of primary is "
                                   "better, "
                                   "2 for only on active slave "
                                   "failure");
module_param(lacp_rate, charp, 0);
MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; "
                            "0 for slow, 1 for fast");
module_param(ad_select, charp, 0);
MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; "
                            "0 for stable (default), 1 for bandwidth, "
                            "2 for count");
module_param(min_links, int, 0);
MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier");

module_param(xmit_hash_policy, charp, 0);
MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; "
                                   "0 for layer 2 (default), 1 for layer 3+4, "
                                   "2 for layer 2+3, 3 for encap layer 2+3, "
                                   "4 for encap layer 3+4, 5 for vlan+srcmac");
module_param(arp_interval, int, 0);
MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
/* array parameter; the element count pointer is NULL, so the number of
 * entries supplied is not recorded here
 */
module_param_array(arp_ip_target, charp, NULL, 0);
MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");
module_param(arp_validate, charp, 0);
MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; "
                               "0 for none (default), 1 for active, "
                               "2 for backup, 3 for all");
module_param(arp_all_targets, charp, 0);
MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all");
module_param(fail_over_mac, charp, 0);
MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to "
                                "the same MAC; 0 for none (default), "
                                "1 for active, 2 for follow");
module_param(all_slaves_active, int, 0);
MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface "
                                     "by setting active flag for all slaves; "
                                     "0 for never (default), 1 for always.");
module_param(resend_igmp, int, 0);
MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on "
                              "link failure");
module_param(packets_per_slave, int, 0);
MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr "
                                    "mode; 0 for a random slave, 1 packet per "
                                    "slave (default), >1 packets per slave.");
/* NOTE(review): lp_interval is declared as "static int" but registered here
 * with the "uint" parameter type - the two should probably agree; confirm
 * against the users of lp_interval before changing either side.
 */
module_param(lp_interval, uint, 0);
MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where "
                              "the bonding driver sends learning packets to "
                              "each slaves peer switch. The default is 1.");

/*----------------------------- Global variables ----------------------------*/

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Atomic counter, initialised to 0; only built when netpoll support
 * (CONFIG_NET_POLL_CONTROLLER) is enabled.
 */
atomic_t netpoll_block_tx = ATOMIC_INIT(0);
#endif

/* NOTE(review): presumably the per-network-namespace id of the bonding
 * driver - the registration is not visible in this part of the file.
 */
unsigned int bond_net_id __read_mostly;

/* Flow dissector key table: each entry pairs a FLOW_DISSECTOR_KEY_* id
 * with the offset of the field inside struct flow_keys that belongs to
 * that key.
 */
static const struct flow_dissector_key flow_keys_bonding_keys[] = {
        {
                .key_id = FLOW_DISSECTOR_KEY_CONTROL,
                .offset = offsetof(struct flow_keys, control),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_BASIC,
                .offset = offsetof(struct flow_keys, basic),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v4addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
                .offset = offsetof(struct flow_keys, addrs.v6addrs),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_TIPC,
                .offset = offsetof(struct flow_keys, addrs.tipckey),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_PORTS,
                .offset = offsetof(struct flow_keys, ports),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_ICMP,
                .offset = offsetof(struct flow_keys, icmp),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_VLAN,
                .offset = offsetof(struct flow_keys, vlan),
        },
        {
                /* the flow label key is stored in the "tags" member */
                .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
                .offset = offsetof(struct flow_keys, tags),
        },
        {
                .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
                .offset = offsetof(struct flow_keys, keyid),
        },
};

/* NOTE(review): presumably initialised from flow_keys_bonding_keys[] at
 * module init - the initialisation is not visible in this part of the file.
 */
static struct flow_dissector flow_keys_bonding __read_mostly;

/*-------------------------- Forward declarations ---------------------------*/

static int bond_init(struct net_device *bond_dev);
static void bond_uninit(struct net_device *bond_dev);
static void bond_get_stats(struct net_device *bond_dev,
                           struct rtnl_link_stats64 *stats);
static void bond_slave_arr_handler(struct work_struct *work);
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
                                  int mod);
static void bond_netdev_notify_work(struct work_struct *work);

/*---------------------------- General routines -----------------------------*/

/**
 * bond_mode_name - human readable description of a bonding mode
 * @mode: one of the BOND_MODE_* values
 *
 * Return: pointer to a static string describing @mode, or "unknown" when
 * @mode lies outside the BOND_MODE_ROUNDROBIN..BOND_MODE_ALB range.  The
 * string must not be modified or freed by the caller.
 */
const char *bond_mode_name(int mode)
{
        /* Neither the strings nor the pointers to them ever change, so the
         * table is declared fully const and cannot be written by accident.
         */
        static const char * const names[] = {
                [BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)",
                [BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)",
                [BOND_MODE_XOR] = "load balancing (xor)",
                [BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)",
                [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation",
                [BOND_MODE_TLB] = "transmit load balancing",
                [BOND_MODE_ALB] = "adaptive load balancing",
        };

        /* reject anything that would index outside the table */
        if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB)
                return "unknown";

        return names[mode];
}

/**
 * bond_dev_queue_xmit - hand an skb over to a slave device for transmission
 *
 * @bond: bond device that got this skb for tx.
 * @skb: hw accel VLAN tagged skb to transmit
 * @slave_dev: slave that is supposed to xmit this skbuff
 *
 * Points @skb at @slave_dev, copies the slave_dev_queue_mapping value kept
 * in the qdisc control block into the skb queue mapping and queues the skb.
 * When netpoll tx is running on the bond device the skb is sent through
 * bond_netpoll_send_skb() instead.
 *
 * Return: whatever dev_queue_xmit() or bond_netpoll_send_skb() returned.
 */
netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
                        struct net_device *slave_dev)
{
        /* the two queue mapping fields must be of identical size */
        BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
                     sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));

        skb->dev = slave_dev;
        skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);

        /* regular case: no netpoll transmit in progress */
        if (!unlikely(netpoll_tx_running(bond->dev)))
                return dev_queue_xmit(skb);

        return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);
}

/* Tell whether the bond runs in 802.3ad or XOR mode with the
 * BOND_XMIT_POLICY_LAYER34 transmit hash policy.  Any other combination of
 * mode and policy yields false.
 */
static bool bond_sk_check(struct bonding *bond)
{
        int mode = BOND_MODE(bond);

        if (mode != BOND_MODE_8023AD && mode != BOND_MODE_XOR)
                return false;

        return bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
}

/* Decide whether XDP can be used with bonding mode @mode.
 *
 * Round-robin and active-backup always qualify.  802.3ad and XOR qualify
 * unless the bond's xmit policy is vlan+srcmac.  All other modes do not.
 */
bool bond_xdp_check(struct bonding *bond, int mode)
{
        if (mode == BOND_MODE_ROUNDROBIN || mode == BOND_MODE_ACTIVEBACKUP)
                return true;

        if (mode != BOND_MODE_8023AD && mode != BOND_MODE_XOR)
                return false;

        /* vlan+srcmac is not supported with XDP as in most cases the 802.1q
         * payload is not in the packet due to hardware offload.
         */
        return bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC;
}

/*---------------------------------- VLAN -----------------------------------*/

/* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid,
 * we don't protect the slave list iteration with a lock because:
 * a. This operation is performed in IOCTL context,
 * b. The operation is protected by the RTNL semaphore in the 8021q code,
 * c. Holding a lock with BH disabled while directly calling a base driver
 *    entry point is generally a BAD idea.
 *
 * The design of synchronization/protection for this operation in the 8021q
 * module is good for one or more VLAN devices over a single physical device
 * and cannot be extended for a teaming solution like bonding, so there is a
 * potential race condition here where a net device from the vlan group might
 * be referenced (either by a base driver or the 8021q code) while it is being
 * removed from the system. However, it turns out we're not making matters
 * worse, and if it works for regular VLAN usage it will work here too.
*/

/**
 * bond_vlan_rx_add_vid - Propagates adding an id to slaves
 * @bond_dev: bonding net device that got called
 * @proto: network protocol ID
 * @vid: vlan id being added
 *
 * The vid is added to each slave in turn.  As soon as one slave refuses it,
 * the vid is removed again from all slaves that precede the failing one.
 *
 * Return: 0 on success, otherwise the error reported by vlan_vid_add().
 */
static int bond_vlan_rx_add_vid(struct net_device *bond_dev,
                                __be16 proto, u16 vid)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter, *undo_iter;
        struct slave *slave, *done;
        int res;

        bond_for_each_slave(bond, slave, iter) {
                res = vlan_vid_add(slave->dev, proto, vid);
                if (!res)
                        continue;

                /* unwind to the slave that failed */
                bond_for_each_slave(bond, done, undo_iter) {
                        if (done == slave)
                                break;

                        vlan_vid_del(done->dev, proto, vid);
                }

                return res;
        }

        return 0;
}

/**
 * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves
 * @bond_dev: bonding net device that got called
 * @proto: network protocol ID
 * @vid: vlan id being removed
 *
 * Removes the vid from every slave and, when bond_is_lb() is true for the
 * bond, additionally calls bond_alb_clear_vlan() for it.
 *
 * Return: always 0.
 */
static int bond_vlan_rx_kill_vid(struct net_device *bond_dev,
                                 __be16 proto, u16 vid)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *pos;
        struct slave *member;

        bond_for_each_slave(bond, member, pos) {
                vlan_vid_del(member->dev, proto, vid);
        }

        if (bond_is_lb(bond)) {
                bond_alb_clear_vlan(bond, vid);
        }

        return 0;
}

/*---------------------------------- XFRM -----------------------------------*/

#ifdef CONFIG_XFRM_OFFLOAD
/**
 * bond_ipsec_dev - Get active device for IPsec offload
 * @xs: pointer to transformer state struct
 *
 * Only active-backup bonds are considered.  A ratelimited warning is logged
 * when the real device recorded in @xs differs from the device of the
 * current active slave; the active slave's device is returned regardless.
 *
 * Context: caller must hold rcu_read_lock.
 *
 * Return: the device for ipsec offload, or NULL if the bond is not in
 * active-backup mode, has no active slave, or @xs has no real device set.
 */
static struct net_device *bond_ipsec_dev(struct xfrm_state *xs)
{
        struct net_device *bond_dev = xs->xso.dev;
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *active;

        if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
                return NULL;

        active = rcu_dereference(bond->curr_active_slave);
        if (!active || !xs->xso.real_dev)
                return NULL;

        /* a mismatch is reported but does not change the result */
        if (xs->xso.real_dev != active->dev)
                pr_warn_ratelimited("%s: (slave %s): not same with IPsec offload real dev %s\n",
                                    bond_dev->name, active->dev->name, xs->xso.real_dev->name);

        return active->dev;
}

/**
 * bond_ipsec_add_sa - program device with a security association
 * @xs: pointer to transformer state struct
 * @extack: extack point to fill failure reason
 *
 * Forwards the request to the xdo_dev_state_add() op of the current active
 * slave's device and, on success, records @xs in the bond's ipsec_list.
 *
 * Return: 0 on success; -EINVAL if @xs has no device or the slave cannot
 * offload, -ENODEV without an active slave, -ENOMEM on allocation failure,
 * or the error returned by the slave's xdo_dev_state_add().
 */
static int bond_ipsec_add_sa(struct xfrm_state *xs,
                             struct netlink_ext_ack *extack)
{
        struct net_device *bond_dev = xs->xso.dev;
        struct net_device *real_dev;
        netdevice_tracker tracker;
        struct bond_ipsec *ipsec;
        struct bonding *bond;
        struct slave *slave;
        int err;

        if (!bond_dev)
                return -EINVAL;

        /* Look the active slave up under RCU and take a tracked reference
         * on its device before leaving the read-side critical section.
         */
        rcu_read_lock();
        bond = netdev_priv(bond_dev);
        slave = rcu_dereference(bond->curr_active_slave);
        if (slave)
                real_dev = slave->dev;
        else
                real_dev = NULL;
        netdev_hold(real_dev, &tracker, GFP_ATOMIC);
        rcu_read_unlock();

        if (!real_dev) {
                err = -ENODEV;
                goto out;
        }

        if (!real_dev->xfrmdev_ops ||
            !real_dev->xfrmdev_ops->xdo_dev_state_add ||
            netif_is_bond_master(real_dev)) {
                NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload");
                err = -EINVAL;
                goto out;
        }

        ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL);
        if (!ipsec) {
                err = -ENOMEM;
                goto out;
        }

        xs->xso.real_dev = real_dev;
        err = real_dev->xfrmdev_ops->xdo_dev_state_add(xs, extack);
        if (err) {
                /* the slave refused the SA, nothing to remember */
                kfree(ipsec);
                goto out;
        }

        ipsec->xs = xs;
        INIT_LIST_HEAD(&ipsec->list);
        mutex_lock(&bond->ipsec_lock);
        list_add(&ipsec->list, &bond->ipsec_list);
        mutex_unlock(&bond->ipsec_lock);
out:
        /* reached on every path after netdev_hold(); real_dev may be NULL */
        netdev_put(real_dev, &tracker);
        return err;
}

/* Offer every SA recorded in bond->ipsec_list to the device of the current
 * active slave.  Does nothing when there is no active slave.  Entries whose
 * real_dev already equals the active slave's device are skipped; entries
 * the slave fails to accept get their real_dev reset to NULL.
 *
 * The active slave is read with rtnl_dereference(), so the caller is
 * expected to hold RTNL.
 */
static void bond_ipsec_add_sa_all(struct bonding *bond)
{
        struct net_device *bond_dev = bond->dev;
        struct net_device *real_dev;
        struct bond_ipsec *ipsec;
        struct slave *slave;

        slave = rtnl_dereference(bond->curr_active_slave);
        real_dev = slave ? slave->dev : NULL;
        if (!real_dev)
                return;

        mutex_lock(&bond->ipsec_lock);
        if (!real_dev->xfrmdev_ops ||
            !real_dev->xfrmdev_ops->xdo_dev_state_add ||
            netif_is_bond_master(real_dev)) {
                /* only worth a warning if there is something to offload */
                if (!list_empty(&bond->ipsec_list))
                        slave_warn(bond_dev, real_dev,
                                   "%s: no slave xdo_dev_state_add\n",
                                   __func__);
                goto out;
        }

        list_for_each_entry(ipsec, &bond->ipsec_list, list) {
                /* If new state is added before ipsec_lock acquired */
                if (ipsec->xs->xso.real_dev == real_dev)
                        continue;

                /* real_dev is set before the slave op is called and cleared
                 * again if the slave rejects the SA
                 */
                ipsec->xs->xso.real_dev = real_dev;
                if (real_dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs, NULL)) {
                        slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__);
                        ipsec->xs->xso.real_dev = NULL;
                }
        }
out:
        mutex_unlock(&bond->ipsec_lock);
}

/**
 * bond_ipsec_del_sa - clear out this specific SA
 * @xs: pointer to transformer state struct
 *
 * Asks the device of the current active slave to delete the SA via its
 * xdo_dev_state_delete() op, when it has one, and then removes the matching
 * entry from the bond's ipsec_list.  The list entry is removed even when
 * the slave could not be told (no active slave, no real device recorded in
 * @xs, or no delete op).
 **/
static void bond_ipsec_del_sa(struct xfrm_state *xs)
{
        struct net_device *bond_dev = xs->xso.dev;
        struct net_device *real_dev;
        netdevice_tracker tracker;
        struct bond_ipsec *ipsec;
        struct bonding *bond;
        struct slave *slave;

        if (!bond_dev)
                return;

        /* Look the active slave up under RCU and take a tracked reference
         * on its device before leaving the read-side critical section.
         */
        rcu_read_lock();
        bond = netdev_priv(bond_dev);
        slave = rcu_dereference(bond->curr_active_slave);
        real_dev = slave ? slave->dev : NULL;
        netdev_hold(real_dev, &tracker, GFP_ATOMIC);
        rcu_read_unlock();

        if (!slave)
                goto out;

        /* the SA is not programmed into any device */
        if (!xs->xso.real_dev)
                goto out;

        /* the SA is expected to live on the current active slave's device */
        WARN_ON(xs->xso.real_dev != real_dev);

        if (!real_dev->xfrmdev_ops ||
            !real_dev->xfrmdev_ops->xdo_dev_state_delete ||
            netif_is_bond_master(real_dev)) {
                slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__);
                goto out;
        }

        real_dev->xfrmdev_ops->xdo_dev_state_delete(xs);
out:
        /* reached on every path after netdev_hold(); real_dev may be NULL */
        netdev_put(real_dev, &tracker);
        /* drop the bookkeeping entry for this SA, if one exists */
        mutex_lock(&bond->ipsec_lock);
        list_for_each_entry(ipsec, &bond->ipsec_list, list) {
                if (ipsec->xs == xs) {
                        list_del(&ipsec->list);
                        kfree(ipsec);
                        /* stop right away: the entry was just freed */
                        break;
                }
        }
        mutex_unlock(&bond->ipsec_lock);
}

/* Remove every SA recorded in bond->ipsec_list from the device of the
 * current active slave: each entry that has a real_dev set is passed to the
 * slave's xdo_dev_state_delete() and, if present, xdo_dev_state_free() op.
 * The list itself is left untouched.  Does nothing when there is no active
 * slave.
 *
 * The active slave is read with rtnl_dereference(), so the caller is
 * expected to hold RTNL.
 */
static void bond_ipsec_del_sa_all(struct bonding *bond)
{
        struct net_device *bond_dev = bond->dev;
        struct net_device *real_dev = NULL;
        struct bond_ipsec *entry;
        struct slave *active;

        active = rtnl_dereference(bond->curr_active_slave);
        if (active)
                real_dev = active->dev;
        if (!real_dev)
                return;

        mutex_lock(&bond->ipsec_lock);
        list_for_each_entry(entry, &bond->ipsec_list, list) {
                /* not programmed into any device */
                if (!entry->xs->xso.real_dev)
                        continue;

                if (real_dev->xfrmdev_ops &&
                    real_dev->xfrmdev_ops->xdo_dev_state_delete &&
                    !netif_is_bond_master(real_dev)) {
                        real_dev->xfrmdev_ops->xdo_dev_state_delete(entry->xs);
                        if (real_dev->xfrmdev_ops->xdo_dev_state_free)
                                real_dev->xfrmdev_ops->xdo_dev_state_free(entry->xs);
                        continue;
                }

                slave_warn(bond_dev, real_dev,
                           "%s: no slave xdo_dev_state_delete\n",
                           __func__);
        }
        mutex_unlock(&bond->ipsec_lock);
}

/* Forward the "free SA" request for @xs to the xdo_dev_state_free() op of
 * the current active slave's device, provided there is an active slave,
 * @xs has a real device recorded and the op exists.
 */
static void bond_ipsec_free_sa(struct xfrm_state *xs)
{
        struct net_device *bond_dev = xs->xso.dev;
        struct net_device *real_dev;
        netdevice_tracker tracker;
        struct bonding *bond;
        struct slave *slave;

        if (!bond_dev)
                return;

        /* Look the active slave up under RCU and take a tracked reference
         * on its device before leaving the read-side critical section.
         */
        rcu_read_lock();
        bond = netdev_priv(bond_dev);
        slave = rcu_dereference(bond->curr_active_slave);
        real_dev = slave ? slave->dev : NULL;
        netdev_hold(real_dev, &tracker, GFP_ATOMIC);
        rcu_read_unlock();

        if (slave && xs->xso.real_dev) {
                /* the SA is expected to live on the active slave's device */
                WARN_ON(xs->xso.real_dev != real_dev);

                if (real_dev && real_dev->xfrmdev_ops &&
                    real_dev->xfrmdev_ops->xdo_dev_state_free)
                        real_dev->xfrmdev_ops->xdo_dev_state_free(xs);
        }

        /* real_dev may be NULL here */
        netdev_put(real_dev, &tracker);
}

/**
 * bond_ipsec_offload_ok - can this packet use the xfrm hw offload
 * @skb: current data packet
 * @xs: pointer to transformer state struct
 *
 * Return: true when bond_ipsec_dev() yields a device for @xs and that
 * device is not itself a bond master, false otherwise.  @skb is not
 * inspected.
 */
static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
{
        struct net_device *real_dev;
        bool ok;

        rcu_read_lock();
        real_dev = bond_ipsec_dev(xs);
        ok = real_dev && !netif_is_bond_master(real_dev);
        rcu_read_unlock();

        return ok;
}

/**
 * bond_advance_esn_state - ESN support for IPSec HW offload
 * @xs: pointer to transformer state struct
 *
 * Calls xdo_dev_state_advance_esn() on the device returned by
 * bond_ipsec_dev().  Logs a ratelimited warning if that device lacks the
 * op, and does nothing if there is no device.
 */
static void bond_advance_esn_state(struct xfrm_state *xs)
{
        struct net_device *real_dev;

        rcu_read_lock();
        real_dev = bond_ipsec_dev(xs);
        if (real_dev) {
                if (real_dev->xfrmdev_ops &&
                    real_dev->xfrmdev_ops->xdo_dev_state_advance_esn)
                        real_dev->xfrmdev_ops->xdo_dev_state_advance_esn(xs);
                else
                        pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_advance_esn\n", __func__, real_dev->name);
        }
        rcu_read_unlock();
}

/**
 * bond_xfrm_update_stats - Update xfrm state
 * @xs: pointer to transformer state struct
 *
 * Calls xdo_dev_state_update_stats() on the device returned by
 * bond_ipsec_dev().  Logs a ratelimited warning if that device lacks the
 * op, and does nothing if there is no device.
 */
static void bond_xfrm_update_stats(struct xfrm_state *xs)
{
        struct net_device *real_dev;

        rcu_read_lock();
        real_dev = bond_ipsec_dev(xs);
        if (real_dev) {
                if (real_dev->xfrmdev_ops &&
                    real_dev->xfrmdev_ops->xdo_dev_state_update_stats)
                        real_dev->xfrmdev_ops->xdo_dev_state_update_stats(xs);
                else
                        pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_update_stats\n", __func__, real_dev->name);
        }
        rcu_read_unlock();
}

/* xfrm device offload operations of the bond device; every entry points at
 * one of the bond_* handlers defined above.
 */
static const struct xfrmdev_ops bond_xfrmdev_ops = {
        .xdo_dev_state_add = bond_ipsec_add_sa,
        .xdo_dev_state_delete = bond_ipsec_del_sa,
        .xdo_dev_state_free = bond_ipsec_free_sa,
        .xdo_dev_offload_ok = bond_ipsec_offload_ok,
        .xdo_dev_state_advance_esn = bond_advance_esn_state,
        .xdo_dev_state_update_stats = bond_xfrm_update_stats,
};
#endif /* CONFIG_XFRM_OFFLOAD */

/*------------------------------- Link status -------------------------------*/

/* Set the carrier state for the master according to the state of its
 * slaves.  If any slaves are up, the master is up.  In 802.3ad mode,
 * do special 802.3ad magic.
 *
 * Returns zero if carrier state does not change, nonzero if it does.
 */
int bond_set_carrier(struct bonding *bond)
{
        struct list_head *iter;
        struct slave *slave;

        if (!bond_has_slaves(bond))
                goto down;

        if (BOND_MODE(bond) == BOND_MODE_8023AD)
                return bond_3ad_set_carrier(bond);

        bond_for_each_slave(bond, slave, iter) {
                if (slave->link == BOND_LINK_UP) {
                        if (!netif_carrier_ok(bond->dev)) {
                                netif_carrier_on(bond->dev);
                                return 1;
                        }
                        return 0;
                }
        }

down:
        if (netif_carrier_ok(bond->dev)) {
                netif_carrier_off(bond->dev);
                return 1;
        }
        return 0;
}

/* Refresh slave->speed and slave->duplex from the slave's base driver
 * through ethtool.
 *
 * Both fields are first reset to SPEED_UNKNOWN / DUPLEX_UNKNOWN and only
 * overwritten when the ethtool query succeeds, the reported speed is
 * neither 0 nor (__u32)-1, and the duplex is DUPLEX_FULL or DUPLEX_HALF.
 *
 * Return 1 if speed or duplex settings are UNKNOWN; 0 otherwise.
 */
static int bond_update_speed_duplex(struct slave *slave)
{
        struct ethtool_link_ksettings ecmd;

        slave->speed = SPEED_UNKNOWN;
        slave->duplex = DUPLEX_UNKNOWN;

        if (__ethtool_get_link_ksettings(slave->dev, &ecmd) < 0)
                return 1;

        if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1))
                return 1;

        if (ecmd.base.duplex != DUPLEX_FULL &&
            ecmd.base.duplex != DUPLEX_HALF)
                return 1;

        slave->speed = ecmd.base.speed;
        slave->duplex = ecmd.base.duplex;

        return 0;
}

/* Map a BOND_LINK_* state to a short human readable string.  Values that
 * are none of UP, FAIL, DOWN or BACK are reported as "unknown".
 */
const char *bond_slave_link_status(s8 link)
{
        if (link == BOND_LINK_UP)
                return "up";

        if (link == BOND_LINK_FAIL)
                return "going down";

        if (link == BOND_LINK_DOWN)
                return "down";

        if (link == BOND_LINK_BACK)
                return "going back";

        return "unknown";
}

/* Determine the link status of <slave_dev>.
 *
 * Depending on the use_carrier setting of the bond this either consults
 * netif_carrier_ok(), or tries the ethtool get_link op first and the MII
 * ioctls second.
 *
 * Returns BMSR_LSTATUS when the link is up (or when it cannot be
 * determined and we just pretend it is), or 0 when the link is down.  A
 * device that is not running is reported as down unless reporting is set.
 *
 * If reporting is non-zero, instead of faking link up, -1 is returned when
 * neither ETHTOOL nor the MII ioctls produced an answer (meaning the device
 * does not support them).  If use_carrier is set, whatever it says is
 * returned.  It'd be nice if there was a good way to tell if a driver
 * supports netif_carrier, but there really isn't.
 */
static int bond_check_dev_link(struct bonding *bond,
                               struct net_device *slave_dev, int reporting)
{
        const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
        struct mii_ioctl_data *mii;
        struct ifreq ifr;
        int ret;

        if (!reporting && !netif_running(slave_dev))
                return 0;

        if (bond->params.use_carrier) {
                if (netif_carrier_ok(slave_dev))
                        return BMSR_LSTATUS;

                return 0;
        }

        /* Try to get link status using Ethtool first. */
        if (slave_dev->ethtool_ops->get_link) {
                netdev_lock_ops(slave_dev);
                ret = slave_dev->ethtool_ops->get_link(slave_dev);
                netdev_unlock_ops(slave_dev);

                if (ret)
                        return BMSR_LSTATUS;

                return 0;
        }

        /* Ethtool can't be used, fallback to MII ioctls. */
        if (slave_ops->ndo_eth_ioctl) {
                /* TODO: set pointer to correct ioctl on a per team member
                 *       bases to make this more efficient. that is, once
                 *       we determine the correct ioctl, we will always
                 *       call it and not the others for that team
                 *       member.
                 */

                /* We cannot assume that SIOCGMIIPHY will also read a
                 * register; not all network drivers (e.g., e100)
                 * support that.
                 */

                /* Yes, the mii is overlaid on the ifreq.ifr_ifru */
                strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
                mii = if_mii(&ifr);

                if (!dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIPHY)) {
                        mii->reg_num = MII_BMSR;
                        if (!dev_eth_ioctl(slave_dev, &ifr, SIOCGMIIREG))
                                return mii->val_out & BMSR_LSTATUS;
                }
        }

        /* If reporting, report that either there's no ndo_eth_ioctl,
         * or both SIOCGMIIREG and get_link failed (meaning that we
         * cannot report link status).  If not reporting, pretend
         * we're ok.
         */
        if (reporting)
                return -1;

        return BMSR_LSTATUS;
}

/*----------------------------- Multicast list ------------------------------*/

/* Push a promiscuity change of @inc down to the appropriate slaves: only
 * the current active slave in modes that use a primary, every slave
 * otherwise.  Stops at, and returns, the first error from
 * dev_set_promiscuity(); returns 0 when nothing failed or there was no
 * slave to update.
 */
static int bond_set_promiscuity(struct bonding *bond, int inc)
{
        struct list_head *iter;
        struct slave *slave;
        int err;

        if (bond_uses_primary(bond)) {
                slave = rtnl_dereference(bond->curr_active_slave);
                if (!slave)
                        return 0;

                return dev_set_promiscuity(slave->dev, inc);
        }

        bond_for_each_slave(bond, slave, iter) {
                err = dev_set_promiscuity(slave->dev, inc);
                if (err)
                        return err;
        }

        return 0;
}

/* Push an allmulti change of @inc down to the appropriate slaves: only the
 * current active slave in modes that use a primary, every slave otherwise.
 * Stops at, and returns, the first error from dev_set_allmulti(); returns 0
 * when nothing failed or there was no slave to update.
 */
static int bond_set_allmulti(struct bonding *bond, int inc)
{
        struct list_head *iter;
        struct slave *slave;
        int err;

        if (bond_uses_primary(bond)) {
                slave = rtnl_dereference(bond->curr_active_slave);
                if (!slave)
                        return 0;

                return dev_set_allmulti(slave->dev, inc);
        }

        bond_for_each_slave(bond, slave, iter) {
                err = dev_set_allmulti(slave->dev, inc);
                if (err)
                        return err;
        }

        return 0;
}

/* Delayed work handler behind bond->mcast_work.
 *
 * Raises the NETDEV_RESEND_IGMP notifier for the bonding device under RTNL
 * and, while igmp_retrans is still above 1, decrements it and re-arms
 * itself HZ/5 jiffies later.  If RTNL cannot be taken right away the work
 * is simply queued again with a delay of 1 jiffy.
 */
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
{
        struct bonding *bond = container_of(work, struct bonding,
                                            mcast_work.work);

        if (rtnl_trylock()) {
                call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);

                if (bond->igmp_retrans > 1) {
                        bond->igmp_retrans--;
                        queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
                }

                rtnl_unlock();
                return;
        }

        /* RTNL is busy, try again shortly */
        queue_delayed_work(bond->wq, &bond->mcast_work, 1);
}

/* Remove the bond's unicast and multicast hardware addresses from a slave.
 * In 802.3ad mode lacpdu_mcast_addr is deleted from the slave's multicast
 * list as well.
 */
static void bond_hw_addr_flush(struct net_device *bond_dev,
                               struct net_device *slave_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);

        dev_uc_unsync(slave_dev, bond_dev);
        dev_mc_unsync(slave_dev, bond_dev);

        if (BOND_MODE(bond) != BOND_MODE_8023AD)
                return;

        dev_mc_del(slave_dev, lacpdu_mcast_addr);
}

/*--------------------------- Active slave change ---------------------------*/

/* Update the hardware address list and promisc/allmulti for the new and
 * old active slaves (if any).  Modes that are not using primary keep all
 * slaves up date at all times; only the modes that use primary need to call
 * this function to swap these settings during a failover.
 */
static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
                              struct slave *old_active)
{
        if (old_active) {
                if (bond->dev->flags & IFF_PROMISC)
                        dev_set_promiscuity(old_active->dev, -1);

                if (bond->dev->flags & IFF_ALLMULTI)
                        dev_set_allmulti(old_active->dev, -1);

                if (bond->dev->flags & IFF_UP)
                        bond_hw_addr_flush(bond->dev, old_active->dev);

                bond_slave_ns_maddrs_add(bond, old_active);
        }

        if (new_active) {
                /* FIXME: Signal errors upstream. */
                if (bond->dev->flags & IFF_PROMISC)
                        dev_set_promiscuity(new_active->dev, 1);

                if (bond->dev->flags & IFF_ALLMULTI)
                        dev_set_allmulti(new_active->dev, 1);

                if (bond->dev->flags & IFF_UP) {
                        netif_addr_lock_bh(bond->dev);
                        dev_uc_sync(new_active->dev, bond->dev);
                        dev_mc_sync(new_active->dev, bond->dev);
                        netif_addr_unlock_bh(bond->dev);
                }

                bond_slave_ns_maddrs_del(bond, new_active);
        }
}

/**
 * bond_set_dev_addr - clone slave's address to bond
 * @bond_dev: bond net device
 * @slave_dev: slave net device
 *
 * Announces the upcoming change through dev_pre_changeaddr_notify() and,
 * unless that fails, copies the slave's address to the bond, marks the
 * address as NET_ADDR_STOLEN and raises NETDEV_CHANGEADDR.
 *
 * Should be called with RTNL held.
 *
 * Return: 0 on success, or the error from dev_pre_changeaddr_notify().
 */
static int bond_set_dev_addr(struct net_device *bond_dev,
                             struct net_device *slave_dev)
{
        int err;

        slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n",
                  bond_dev, slave_dev, slave_dev->addr_len);

        err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL);
        if (!err) {
                __dev_addr_set(bond_dev, slave_dev->dev_addr, slave_dev->addr_len);
                bond_dev->addr_assign_type = NET_ADDR_STOLEN;
                call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev);
        }

        return err;
}

/* Find a slave other than @new_active whose hardware address equals the
 * bond device's address.  Returns the first such slave in iteration order,
 * or NULL when there is none.
 */
static struct slave *bond_get_old_active(struct bonding *bond,
                                         struct slave *new_active)
{
        struct list_head *iter;
        struct slave *candidate;

        bond_for_each_slave(bond, candidate, iter) {
                if (candidate != new_active &&
                    ether_addr_equal(bond->dev->dev_addr,
                                     candidate->dev->dev_addr))
                        return candidate;
        }

        return NULL;
}

/* bond_do_fail_over_mac
 *
 * Perform special MAC address swapping for fail_over_mac settings.
 *
 * @bond: bond whose params.fail_over_mac policy selects the action
 * @new_active: slave that is becoming active (may be NULL)
 * @old_active: slave that was active (may be NULL)
 *
 * BOND_FOM_ACTIVE: the bond device takes over the new active slave's
 * address.
 * BOND_FOM_FOLLOW: the new active slave is given the old active slave's
 * address (or the bond's address when no old slave can be found), and the
 * old active slave is then given the address the new slave had before.
 *
 * Failures are only logged; nothing is returned to the caller.
 *
 * Called with RTNL
 */
static void bond_do_fail_over_mac(struct bonding *bond,
                                  struct slave *new_active,
                                  struct slave *old_active)
{
        /* new_active's address, saved before it gets overwritten */
        u8 tmp_mac[MAX_ADDR_LEN];
        /* address handed to dev_set_mac_address(); only partly filled in */
        struct sockaddr_storage ss;
        int rv;

        switch (bond->params.fail_over_mac) {
        case BOND_FOM_ACTIVE:
                if (new_active) {
                        rv = bond_set_dev_addr(bond->dev, new_active->dev);
                        if (rv)
                                slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n",
                                          -rv);
                }
                break;
        case BOND_FOM_FOLLOW:
                /* if new_active && old_active, swap them
                 * if just old_active, do nothing (going to no active slave)
                 * if just new_active, set new_active to bond's MAC
                 */
                if (!new_active)
                        return;

                /* No old slave supplied: look for another slave that
                 * currently carries the bond's address.
                 */
                if (!old_active)
                        old_active = bond_get_old_active(bond, new_active);

                if (old_active) {
                        bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr,
                                          new_active->dev->addr_len);
                        bond_hw_addr_copy(ss.__data,
                                          old_active->dev->dev_addr,
                                          old_active->dev->addr_len);
                        ss.ss_family = new_active->dev->type;
                } else {
                        bond_hw_addr_copy(ss.__data, bond->dev->dev_addr,
                                          bond->dev->addr_len);
                        ss.ss_family = bond->dev->type;
                }

                rv = dev_set_mac_address(new_active->dev,
                                         (struct sockaddr *)&ss, NULL);
                if (rv) {
                        slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n",
                                  -rv);
                        goto out;
                }

                /* Nothing to give the saved address to. */
                if (!old_active)
                        goto out;

                /* Second half of the swap: old slave gets the saved address. */
                bond_hw_addr_copy(ss.__data, tmp_mac,
                                  new_active->dev->addr_len);
                ss.ss_family = old_active->dev->type;

                rv = dev_set_mac_address(old_active->dev,
                                         (struct sockaddr *)&ss, NULL);
                if (rv)
                        slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n",
                                  -rv);
out:
                break;
        default:
                netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n",
                           bond->params.fail_over_mac);
                break;
        }

}

/**
 * bond_choose_primary_or_current - select the primary or high priority slave
 * @bond: our bonding struct
 *
 * - If a primary link is set and up, go on to link reselection (or return
 *   the primary right away when force_primary is set, clearing that flag).
 *
 * - If the primary link is not set or is down, find the highest priority
 *   link that is up.  If that link is not the current slave, treat it as
 *   the primary and do link reselection; otherwise keep the current slave
 *   if it is up.
 *
 * Return: the chosen slave, or NULL when neither a candidate nor an up
 * current slave exists.
 */
static struct slave *bond_choose_primary_or_current(struct bonding *bond)
{
        struct slave *prim = rtnl_dereference(bond->primary_slave);
        struct slave *curr = rtnl_dereference(bond->curr_active_slave);
        struct slave *pos, *top_prio = NULL;
        struct list_head *iter;

        if (prim && prim->link == BOND_LINK_UP) {
                if (bond->force_primary) {
                        bond->force_primary = false;
                        return prim;
                }
        } else {
                /* First up slave wins ties; only a strictly higher prio
                 * replaces the running best.
                 */
                bond_for_each_slave(bond, pos, iter) {
                        if (pos->link != BOND_LINK_UP)
                                continue;
                        if (!top_prio || pos->prio > top_prio->prio)
                                top_prio = pos;
                }

                if (!top_prio || top_prio == curr)
                        return (curr && curr->link == BOND_LINK_UP) ?
                               curr : NULL;

                prim = top_prio;
        }

        /* Link reselection between prim and curr */
        if (!curr || curr->link != BOND_LINK_UP)
                return prim;

        /* At this point, prim and curr are both up */
        switch (bond->params.primary_reselect) {
        case BOND_PRI_RESELECT_ALWAYS:
                return prim;
        case BOND_PRI_RESELECT_BETTER:
                /* prim wins only when strictly faster, or equally fast with
                 * a strictly better duplex.
                 */
                if (prim->speed > curr->speed)
                        return prim;
                if (prim->speed == curr->speed && prim->duplex > curr->duplex)
                        return prim;
                return curr;
        case BOND_PRI_RESELECT_FAILURE:
                return curr;
        default:
                netdev_err(bond->dev, "impossible primary_reselect %d\n",
                           bond->params.primary_reselect);
                return curr;
        }
}

/**
 * bond_find_best_slave - select the best available slave to be the active one
 * @bond: our bonding struct
 *
 * Prefers whatever bond_choose_primary_or_current() picks.  Failing that,
 * returns the first slave whose link is up, or else the slave in
 * BOND_LINK_BACK state that is up and has the smallest remaining delay
 * below params.updelay.
 *
 * Return: the selected slave, or NULL if none qualifies.
 */
static struct slave *bond_find_best_slave(struct bonding *bond)
{
        int shortest_delay = bond->params.updelay;
        struct slave *fallback = NULL;
        struct list_head *iter;
        struct slave *pos;

        pos = bond_choose_primary_or_current(bond);
        if (pos)
                return pos;

        bond_for_each_slave(bond, pos, iter) {
                if (pos->link == BOND_LINK_UP)
                        return pos;

                if (pos->link != BOND_LINK_BACK || !bond_slave_is_up(pos))
                        continue;

                if (pos->delay < shortest_delay) {
                        shortest_delay = pos->delay;
                        fallback = pos;
                }
        }

        return fallback;
}

/* Decide whether a peer notification should be sent now.
 *
 * Returns true only when all of the following hold: there is a current
 * active slave, bond->send_peer_notif is non-zero and a multiple of
 * params.peer_notif_delay (treated as at least 1), the bond device has
 * carrier, and the active slave has no linkwatch event pending.
 *
 * Must be called in RCU critical section or with RTNL held.
 */
static bool bond_should_notify_peers(struct bonding *bond)
{
        struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave);

        if (!slave || !bond->send_peer_notif)
                return false;

        if (bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0)
                return false;

        if (!netif_carrier_ok(bond->dev) ||
            test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state))
                return false;

        /* slave was checked above, so no "NULL" fallback is needed here */
        netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n",
                   slave->dev->name);

        return true;
}

/**
 * bond_change_active_slave - change the active slave into the specified one
 * @bond: our bonding struct
 * @new_active: the new slave to make the active one (may be NULL)
 *
 * Set the new slave to the bond's settings and unset them on the old
 * curr_active_slave.
 * Settings include flags, mc-list, promiscuity, allmulti, etc.
 *
 * If @new_active's link state is %BOND_LINK_BACK we'll set it to
 * %BOND_LINK_UP, because it is apparently the best available slave we have,
 * even though its updelay hasn't timed out yet.
 *
 * Does nothing when @new_active already is the current active slave.
 *
 * Caller must hold RTNL.
 */
void bond_change_active_slave(struct bonding *bond, struct slave *new_active)
{
        struct slave *old_active;

        ASSERT_RTNL();

        old_active = rtnl_dereference(bond->curr_active_slave);

        if (old_active == new_active)
                return;

#ifdef CONFIG_XFRM_OFFLOAD
        /* Paired with bond_ipsec_add_sa_all() once the switch is done. */
        bond_ipsec_del_sa_all(bond);
#endif /* CONFIG_XFRM_OFFLOAD */

        if (new_active) {
                new_active->last_link_up = jiffies;

                if (new_active->link == BOND_LINK_BACK) {
                        /* Promote the slave before its updelay has expired. */
                        if (bond_uses_primary(bond)) {
                                slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n",
                                           (bond->params.updelay - new_active->delay) * bond->params.miimon);
                        }

                        new_active->delay = 0;
                        bond_set_slave_link_state(new_active, BOND_LINK_UP,
                                                  BOND_SLAVE_NOTIFY_NOW);

                        if (BOND_MODE(bond) == BOND_MODE_8023AD)
                                bond_3ad_handle_link_change(new_active, BOND_LINK_UP);

                        if (bond_is_lb(bond))
                                bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP);
                } else {
                        if (bond_uses_primary(bond))
                                slave_info(bond->dev, new_active->dev, "making interface the new active one\n");
                }
        }

        /* Move address lists and promisc/allmulti from old to new slave. */
        if (bond_uses_primary(bond))
                bond_hw_addr_swap(bond, new_active, old_active);

        if (bond_is_lb(bond)) {
                /* NOTE(review): curr_active_slave is not assigned on this
                 * path; presumably bond_alb_handle_active_change() does
                 * it - confirm.
                 */
                bond_alb_handle_active_change(bond, new_active);
                if (old_active)
                        bond_set_slave_inactive_flags(old_active,
                                                      BOND_SLAVE_NOTIFY_NOW);
                if (new_active)
                        bond_set_slave_active_flags(new_active,
                                                    BOND_SLAVE_NOTIFY_NOW);
        } else {
                rcu_assign_pointer(bond->curr_active_slave, new_active);
        }

        if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
                if (old_active)
                        bond_set_slave_inactive_flags(old_active,
                                                      BOND_SLAVE_NOTIFY_NOW);

                if (new_active) {
                        bool should_notify_peers = false;

                        bond_set_slave_active_flags(new_active,
                                                    BOND_SLAVE_NOTIFY_NOW);

                        if (bond->params.fail_over_mac)
                                bond_do_fail_over_mac(bond, new_active,
                                                      old_active);

                        if (netif_running(bond->dev)) {
                                /* Re-arm the peer notification counter:
                                 * num_peer_notif scaled by the delay, which
                                 * is treated as at least 1.
                                 */
                                bond->send_peer_notif =
                                        bond->params.num_peer_notif *
                                        max(1, bond->params.peer_notif_delay);
                                should_notify_peers =
                                        bond_should_notify_peers(bond);
                        }

                        call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev);
                        if (should_notify_peers) {
                                bond->send_peer_notif--;
                                call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
                                                         bond->dev);
                        }
                }
        }

#ifdef CONFIG_XFRM_OFFLOAD
        bond_ipsec_add_sa_all(bond);
#endif /* CONFIG_XFRM_OFFLOAD */

        /* resend IGMP joins since active slave has changed or
         * all were sent on curr_active_slave.
         * resend only if bond is brought up with the affected
         * bonding modes and the retransmission is enabled
         */
        if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) &&
            ((bond_uses_primary(bond) && new_active) ||
             BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) {
                bond->igmp_retrans = bond->params.resend_igmp;
                queue_delayed_work(bond->wq, &bond->mcast_work, 1);
        }
}

/**
 * bond_select_active_slave - select a new active slave, if needed
 * @bond: our bonding struct
 *
 * This function should be called when one of the following occurs:
 * - The old curr_active_slave has been released or lost its link.
 * - The primary_slave has got its link back.
 * - A slave has got its link back and there's no old curr_active_slave.
 *
 * Caller must hold RTNL.
 */
void bond_select_active_slave(struct bonding *bond)
{
        struct slave *candidate;

        ASSERT_RTNL();

        candidate = bond_find_best_slave(bond);
        if (candidate == rtnl_dereference(bond->curr_active_slave))
                return;

        bond_change_active_slave(bond, candidate);

        /* A zero return from bond_set_carrier() means nothing to log. */
        if (!bond_set_carrier(bond))
                return;

        if (netif_carrier_ok(bond->dev))
                netdev_info(bond->dev, "active interface up!\n");
        else
                netdev_info(bond->dev, "now running without any active interface!\n");
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Allocate a netpoll instance for @slave's device and attach it to the
 * slave.  Returns 0 on success, -ENOMEM if the allocation fails, or the
 * error from __netpoll_setup() (in which case the allocation is freed).
 */
static inline int slave_enable_netpoll(struct slave *slave)
{
        struct netpoll *np = kzalloc(sizeof(*np), GFP_KERNEL);
        int err;

        if (!np)
                return -ENOMEM;

        err = __netpoll_setup(np, slave->dev);
        if (err) {
                kfree(np);
                return err;
        }

        slave->np = np;
        return 0;
}
/* Detach and free @slave's netpoll instance, if it has one.  The slave's
 * pointer is cleared before the instance is handed to __netpoll_free().
 */
static inline void slave_disable_netpoll(struct slave *slave)
{
        struct netpoll *np = slave->np;

        if (np) {
                slave->np = NULL;
                __netpoll_free(np);
        }
}

/* Poll every slave that is up.  In 802.3ad mode nothing is polled when the
 * active aggregator info cannot be obtained, and slaves attached to an
 * aggregator other than the active one are skipped.
 */
static void bond_poll_controller(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *slave = NULL;
        struct ad_info ad_info;
        struct list_head *iter;

        if (BOND_MODE(bond) == BOND_MODE_8023AD &&
            bond_3ad_get_active_agg_info(bond, &ad_info))
                return;

        bond_for_each_slave_rcu(bond, slave, iter) {
                struct aggregator *agg;

                if (!bond_slave_is_up(slave))
                        continue;

                if (BOND_MODE(bond) != BOND_MODE_8023AD) {
                        netpoll_poll_dev(slave->dev);
                        continue;
                }

                /* A slave without an aggregator is still polled. */
                agg = SLAVE_AD_INFO(slave)->port.aggregator;
                if (!agg ||
                    agg->aggregator_identifier == ad_info.aggregator_id)
                        netpoll_poll_dev(slave->dev);
        }
}

static void bond_netpoll_cleanup(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter;
        struct slave *slave;

        bond_for_each_slave(bond, slave, iter)
                if (bond_slave_is_up(slave))
                        slave_disable_netpoll(slave);
}

/* Enable netpoll on each slave of @dev in turn.  On the first failure the
 * cleanup routine is run and that error is returned; otherwise returns 0.
 */
static int bond_netpoll_setup(struct net_device *dev)
{
        struct bonding *bond = netdev_priv(dev);
        struct list_head *iter;
        struct slave *slave;
        int err;

        bond_for_each_slave(bond, slave, iter) {
                err = slave_enable_netpoll(slave);
                if (!err)
                        continue;

                bond_netpoll_cleanup(dev);
                return err;
        }

        return 0;
}
#else
/* Stub used when CONFIG_NET_POLL_CONTROLLER is not set: always succeeds. */
static inline int slave_enable_netpoll(struct slave *slave)
{
        return 0;
}
/* Stub used when CONFIG_NET_POLL_CONTROLLER is not set: does nothing. */
static inline void slave_disable_netpoll(struct slave *slave)
{
}
/* Stub used when CONFIG_NET_POLL_CONTROLLER is not set: does nothing. */
static void bond_netpoll_cleanup(struct net_device *bond_dev)
{
}
#endif

/*---------------------------------- IOCTL ----------------------------------*/

/* Compute the feature set for the bond device from the requested
 * @features: start from netdev_base_features(), fold in each slave's
 * features with netdev_increment_features() using the requested set as the
 * mask, then apply netdev_add_tso_features().
 */
static netdev_features_t bond_fix_features(struct net_device *dev,
                                           netdev_features_t features)
{
        const netdev_features_t requested = features;
        struct bonding *bond = netdev_priv(dev);
        netdev_features_t result;
        struct list_head *iter;
        struct slave *slave;

        result = netdev_base_features(requested);

        bond_for_each_slave(bond, slave, iter)
                result = netdev_increment_features(result,
                                                   slave->dev->features,
                                                   requested);

        return netdev_add_tso_features(result, requested);
}

/* Starting value and mask used by bond_compute_features() when combining
 * the slaves' vlan_features.
 */
#define BOND_VLAN_FEATURES        (NETIF_F_HW_CSUM | NETIF_F_SG | \
                                 NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \
                                 NETIF_F_GSO_ENCAP_ALL | \
                                 NETIF_F_HIGHDMA | NETIF_F_LRO)

/* Starting value and mask used by bond_compute_features() when combining
 * the slaves' hw_enc_features.
 */
#define BOND_ENC_FEATURES        (NETIF_F_HW_CSUM | NETIF_F_SG | \
                                 NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \
                                 NETIF_F_GSO_PARTIAL)

/* Starting value and mask used by bond_compute_features() when combining
 * the slaves' mpls_features.
 */
#define BOND_MPLS_FEATURES        (NETIF_F_HW_CSUM | NETIF_F_SG | \
                                 NETIF_F_GSO_SOFTWARE)

/* Starting value and mask used by bond_compute_features() when combining
 * the slaves' gso_partial_features.
 */
#define BOND_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP)


/* Recompute the bond device's derived feature sets and limits from its
 * slaves: vlan, encapsulation, (optionally) xfrm, GSO-partial and MPLS
 * features, hard_header_len, TSO limits and the IFF_XMIT_DST_RELEASE
 * private flag.  With no slaves the BOND_*_FEATURES defaults and the
 * TSO_MAX_* limits are applied.  Ends by calling netdev_change_features().
 */
static void bond_compute_features(struct bonding *bond)
{
        netdev_features_t gso_partial_features = BOND_GSO_PARTIAL_FEATURES;
        /* ANDed with every slave's priv_flags; survives only if all agree */
        unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
                                        IFF_XMIT_DST_RELEASE_PERM;
        netdev_features_t vlan_features = BOND_VLAN_FEATURES;
        netdev_features_t enc_features  = BOND_ENC_FEATURES;
#ifdef CONFIG_XFRM_OFFLOAD
        netdev_features_t xfrm_features  = BOND_XFRM_FEATURES;
#endif /* CONFIG_XFRM_OFFLOAD */
        netdev_features_t mpls_features  = BOND_MPLS_FEATURES;
        struct net_device *bond_dev = bond->dev;
        struct list_head *iter;
        struct slave *slave;
        unsigned short max_hard_header_len = ETH_HLEN;
        unsigned int tso_max_size = TSO_MAX_SIZE;
        u16 tso_max_segs = TSO_MAX_SEGS;

        /* No slaves: keep the defaults; hard_header_len is left untouched. */
        if (!bond_has_slaves(bond))
                goto done;

        vlan_features = netdev_base_features(vlan_features);
        mpls_features = netdev_base_features(mpls_features);

        bond_for_each_slave(bond, slave, iter) {
                vlan_features = netdev_increment_features(vlan_features,
                        slave->dev->vlan_features, BOND_VLAN_FEATURES);

                enc_features = netdev_increment_features(enc_features,
                                                         slave->dev->hw_enc_features,
                                                         BOND_ENC_FEATURES);

#ifdef CONFIG_XFRM_OFFLOAD
                xfrm_features = netdev_increment_features(xfrm_features,
                                                          slave->dev->hw_enc_features,
                                                          BOND_XFRM_FEATURES);
#endif /* CONFIG_XFRM_OFFLOAD */

                gso_partial_features = netdev_increment_features(gso_partial_features,
                                                                 slave->dev->gso_partial_features,
                                                                 BOND_GSO_PARTIAL_FEATURES);

                mpls_features = netdev_increment_features(mpls_features,
                                                          slave->dev->mpls_features,
                                                          BOND_MPLS_FEATURES);

                dst_release_flag &= slave->dev->priv_flags;
                /* Track the largest header length among the slaves. */
                if (slave->dev->hard_header_len > max_hard_header_len)
                        max_hard_header_len = slave->dev->hard_header_len;

                /* TSO limits are the minimum over all slaves. */
                tso_max_size = min(tso_max_size, slave->dev->tso_max_size);
                tso_max_segs = min(tso_max_segs, slave->dev->tso_max_segs);
        }
        bond_dev->hard_header_len = max_hard_header_len;

done:
        bond_dev->gso_partial_features = gso_partial_features;
        bond_dev->vlan_features = vlan_features;
        bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
                                    NETIF_F_HW_VLAN_CTAG_TX |
                                    NETIF_F_HW_VLAN_STAG_TX;
#ifdef CONFIG_XFRM_OFFLOAD
        bond_dev->hw_enc_features |= xfrm_features;
#endif /* CONFIG_XFRM_OFFLOAD */
        bond_dev->mpls_features = mpls_features;
        netif_set_tso_max_segs(bond_dev, tso_max_segs);
        netif_set_tso_max_size(bond_dev, tso_max_size);

        /* Re-enable IFF_XMIT_DST_RELEASE only when the bond has the PERM
         * flag and both flags survived the AND over all slaves.
         */
        bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
        if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
            dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
                bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;

        netdev_change_features(bond_dev);
}

/* Make the bond device take on @slave_dev's link-layer properties: header
 * ops, type, header length, needed headroom, address length and broadcast
 * address.  For a point-to-point slave the bond's broadcast/multicast flags
 * are cleared and POINTOPOINT/NOARP are set.  The bond is closed for the
 * duration and reopened afterwards if it was up on entry.
 */
static void bond_setup_by_slave(struct net_device *bond_dev,
                                struct net_device *slave_dev)
{
        const bool reopen = bond_dev->flags & IFF_UP;

        dev_close(bond_dev);

        bond_dev->type = slave_dev->type;
        bond_dev->header_ops = slave_dev->header_ops;
        bond_dev->addr_len = slave_dev->addr_len;
        bond_dev->hard_header_len = slave_dev->hard_header_len;
        bond_dev->needed_headroom = slave_dev->needed_headroom;
        memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len);

        if (slave_dev->flags & IFF_POINTOPOINT) {
                bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST);
                bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP);
        }

        if (reopen)
                dev_open(bond_dev, NULL);
}

/* On bonding slaves other than the currently active slave, suppress
 * duplicates except for alb non-mcast/bcast.
 *
 * Returns true when the frame arrived on an inactive slave and should only
 * be delivered to exact-match handlers; in ALB mode that applies to
 * broadcast and multicast frames only.
 */
static bool bond_should_deliver_exact_match(struct sk_buff *skb,
                                            struct slave *slave,
                                            struct bonding *bond)
{
        if (!bond_is_slave_inactive(slave))
                return false;

        if (BOND_MODE(bond) != BOND_MODE_ALB)
                return true;

        return skb->pkt_type == PACKET_BROADCAST ||
               skb->pkt_type == PACKET_MULTICAST;
}

/* Receive handler for frames arriving on a bond slave.
 *
 * Unshares the skb, runs the bond's recv_probe hook if one is set, and then
 * either leaves the frame on the slave (RX_HANDLER_EXACT, or
 * RX_HANDLER_PASS for link-local destinations) when it arrived on an
 * inactive slave, or redirects it to the bond device.
 *
 * Returns RX_HANDLER_CONSUMED when the skb was dropped or consumed here;
 * otherwise the recv_probe result, or RX_HANDLER_ANOTHER when no probe ran.
 */
static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct slave *slave;
        struct bonding *bond;
        int (*recv_probe)(const struct sk_buff *, struct bonding *,
                          struct slave *);
        int ret = RX_HANDLER_ANOTHER;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (unlikely(!skb))
                return RX_HANDLER_CONSUMED;

        /* Hand the (possibly new) skb back to the caller. */
        *pskb = skb;

        slave = bond_slave_get_rcu(skb->dev);
        bond = slave->bond;

        /* Read the hook once so the NULL check and the call agree. */
        recv_probe = READ_ONCE(bond->recv_probe);
        if (recv_probe) {
                ret = recv_probe(skb, bond, slave);
                if (ret == RX_HANDLER_CONSUMED) {
                        consume_skb(skb);
                        return ret;
                }
        }

        /*
         * For packets determined by bond_should_deliver_exact_match() call to
         * be suppressed we want to make an exception for link-local packets.
         * This is necessary for e.g. LLDP daemons to be able to monitor
         * inactive slave links without being forced to bind to them
         * explicitly.
         *
         * At the same time, packets that are passed to the bonding master
         * (including link-local ones) can have their originating interface
         * determined via PACKET_ORIGDEV socket option.
         */
        if (bond_should_deliver_exact_match(skb, slave, bond)) {
                if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
                        return RX_HANDLER_PASS;
                return RX_HANDLER_EXACT;
        }

        skb->dev = bond->dev;

        /* ALB bond used as a bridge port: rewrite the destination MAC of
         * host-addressed frames to the bond's own address.  The header is
         * made writable first; on failure the frame is dropped.
         */
        if (BOND_MODE(bond) == BOND_MODE_ALB &&
            netif_is_bridge_port(bond->dev) &&
            skb->pkt_type == PACKET_HOST) {

                if (unlikely(skb_cow_head(skb,
                                          skb->data - skb_mac_header(skb)))) {
                        kfree_skb(skb);
                        return RX_HANDLER_CONSUMED;
                }
                bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr,
                                  bond->dev->addr_len);
        }

        return ret;
}

static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond)
{
        switch (BOND_MODE(bond)) {
        case BOND_MODE_ROUNDROBIN:
                return NETDEV_LAG_TX_TYPE_ROUNDROBIN;
        case BOND_MODE_ACTIVEBACKUP:
                return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP;
        case BOND_MODE_BROADCAST:
                return NETDEV_LAG_TX_TYPE_BROADCAST;
        case BOND_MODE_XOR:
        case BOND_MODE_8023AD:
                return NETDEV_LAG_TX_TYPE_HASH;
        default:
                return NETDEV_LAG_TX_TYPE_UNKNOWN;
        }
}

static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond,
                                               enum netdev_lag_tx_type type)
{
        if (type != NETDEV_LAG_TX_TYPE_HASH)
                return NETDEV_LAG_HASH_NONE;

        switch (bond->params.xmit_policy) {
        case BOND_XMIT_POLICY_LAYER2:
                return NETDEV_LAG_HASH_L2;
        case BOND_XMIT_POLICY_LAYER34:
                return NETDEV_LAG_HASH_L34;
        case BOND_XMIT_POLICY_LAYER23:
                return NETDEV_LAG_HASH_L23;
        case BOND_XMIT_POLICY_ENCAP23:
                return NETDEV_LAG_HASH_E23;
        case BOND_XMIT_POLICY_ENCAP34:
                return NETDEV_LAG_HASH_E34;
        case BOND_XMIT_POLICY_VLAN_SRCMAC:
                return NETDEV_LAG_HASH_VLAN_SRCMAC;
        default:
                return NETDEV_LAG_HASH_UNKNOWN;
        }
}

/* Link the bond device as master upper device of @slave's device, passing
 * along the LAG tx/hash type derived from the bond's settings.  On success
 * the slave device is flagged IFF_SLAVE.
 *
 * Returns 0 on success or the error from netdev_master_upper_dev_link().
 */
static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave,
                                      struct netlink_ext_ack *extack)
{
        struct netdev_lag_upper_info lag_upper_info;
        int ret;

        lag_upper_info.tx_type = bond_lag_tx_type(bond);
        lag_upper_info.hash_type = bond_lag_hash_type(bond,
                                                      lag_upper_info.tx_type);

        ret = netdev_master_upper_dev_link(slave->dev, bond->dev, slave,
                                           &lag_upper_info, extack);
        if (!ret)
                slave->dev->flags |= IFF_SLAVE;

        return ret;
}

static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave)
{
        netdev_upper_dev_unlink(slave->dev, bond->dev);
        slave->dev->flags &= ~IFF_SLAVE;
}

/* kobject release callback for a slave: wait for any pending notify work,
 * free the 802.3ad per-slave info when the bond is in that mode, then free
 * the slave itself.
 */
static void slave_kobj_release(struct kobject *kobj)
{
        struct slave *victim = to_slave(kobj);
        struct bonding *owner = bond_get_bond_by_slave(victim);

        cancel_delayed_work_sync(&victim->notify_work);

        if (BOND_MODE(owner) == BOND_MODE_8023AD)
                kfree(SLAVE_AD_INFO(victim));

        kfree(victim);
}

/* kobject type for bonding slaves: slave_kobj_release() is the release
 * callback, and the slave sysfs ops are wired up only when CONFIG_SYSFS is
 * enabled.
 */
static struct kobj_type slave_ktype = {
        .release = slave_kobj_release,
#ifdef CONFIG_SYSFS
        .sysfs_ops = &slave_sysfs_ops,
#endif
};

/* Initialise @slave's kobject with slave_ktype and add it under the slave
 * device's kobject as "bonding_slave".  On failure the reference is dropped
 * with kobject_put().  Returns 0 or the error from kobject_init_and_add().
 */
static int bond_kobj_init(struct slave *slave)
{
        int ret = kobject_init_and_add(&slave->kobj, &slave_ktype,
                                       &(slave->dev->dev.kobj),
                                       "bonding_slave");

        if (ret)
                kobject_put(&slave->kobj);

        return ret;
}

/* Allocate and initialise a slave structure for @slave_dev on @bond,
 * including its kobject and, in 802.3ad mode, its per-slave AD info.
 *
 * Returns the new slave or NULL on failure.  Once the kobject has been set
 * up, cleanup goes through kobject_put() rather than a direct kfree().
 */
static struct slave *bond_alloc_slave(struct bonding *bond,
                                      struct net_device *slave_dev)
{
        struct slave *slave = kzalloc(sizeof(*slave), GFP_KERNEL);

        if (!slave)
                return NULL;

        slave->bond = bond;
        slave->dev = slave_dev;
        INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work);

        if (bond_kobj_init(slave))
                return NULL;

        if (BOND_MODE(bond) != BOND_MODE_8023AD)
                return slave;

        SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info),
                                       GFP_KERNEL);
        if (SLAVE_AD_INFO(slave))
                return slave;

        kobject_put(&slave->kobj);
        return NULL;
}

/* Fill @info with the bond's slave count, mode and miimon setting. */
static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info)
{
        info->num_slaves = bond->slave_cnt;
        info->bond_mode = BOND_MODE(bond);
        info->miimon = bond->params.miimon;
}

/* Fill @info with the slave's link state, bonding state, link failure
 * count and device name.
 */
static void bond_fill_ifslave(struct slave *slave, struct ifslave *info)
{
        info->link = slave->link;
        info->state = bond_slave_state(slave);
        info->link_failure_count = slave->link_failure_count;
        strcpy(info->slave_name, slave->dev->name);
}

/* Delayed work that reports a slave's bonding info through
 * netdev_bonding_info_change().  RTNL is only tried, never waited for: if
 * it cannot be taken the work re-queues itself with a delay of 1.
 */
static void bond_netdev_notify_work(struct work_struct *_work)
{
        struct slave *slave = container_of(_work, struct slave,
                                           notify_work.work);
        struct netdev_bonding_info binfo;

        if (!rtnl_trylock()) {
                queue_delayed_work(slave->bond->wq, &slave->notify_work, 1);
                return;
        }

        bond_fill_ifslave(slave, &binfo.slave);
        bond_fill_ifbond(slave->bond, &binfo.master);
        netdev_bonding_info_change(slave->dev, &binfo);
        rtnl_unlock();
}

void bond_queue_slave_event(struct slave *slave)
{
        queue_delayed_work(slave->bond->wq, &slave->notify_work, 0);
}

void bond_lower_state_changed(struct slave *slave)
{
        struct netdev_lag_lower_state_info info;

        info.link_up = slave->link == BOND_LINK_UP ||
                       slave->link == BOND_LINK_FAIL;
        info.tx_enabled = bond_is_active_slave(slave);
        netdev_lower_state_changed(slave->dev, &info);
}

/* Report @errmsg through the netlink extended ack when @extack is set;
 * otherwise log it against the bond device with netdev_err().
 */
#define BOND_NL_ERR(bond_dev, extack, errmsg) do {                \
        if (extack)                                                \
                NL_SET_ERR_MSG(extack, errmsg);                        \
        else                                                        \
                netdev_err(bond_dev, "Error: %s\n", errmsg);        \
} while (0)

/* Same as BOND_NL_ERR, but the fallback log goes through slave_err() so it
 * names both the bond and the slave device.
 */
#define SLAVE_NL_ERR(bond_dev, slave_dev, extack, errmsg) do {                \
        if (extack)                                                        \
                NL_SET_ERR_MSG(extack, errmsg);                                \
        else                                                                \
                slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg);        \
} while (0)

/* The bonding driver uses ether_setup() to convert a master bond device to
 * ARPHRD_ETHER.  That resets the target netdevice's flags, so IFF_MASTER
 * always has to be restored afterwards, while IFF_SLAVE and IFF_UP are
 * restored only if they were set beforehand.  IFF_TX_SKB_SHARING is cleared
 * from the private flags as well.
 */
static void bond_ether_setup(struct net_device *bond_dev)
{
        const unsigned int preserved = bond_dev->flags & (IFF_SLAVE | IFF_UP);

        ether_setup(bond_dev);

        bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        bond_dev->flags |= IFF_MASTER | preserved;
}

void bond_xdp_set_features(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        xdp_features_t val = NETDEV_XDP_ACT_MASK;
        struct list_head *iter;
        struct slave *slave;

        ASSERT_RTNL();

        if (!bond_xdp_check(bond, BOND_MODE(bond)) || !bond_has_slaves(bond)) {
                xdp_clear_features_flag(bond_dev);
                return;
        }

        bond_for_each_slave(bond, slave, iter)
                val &= slave->dev->xdp_features;

        val &= ~NETDEV_XDP_ACT_XSK_ZEROCOPY;

        xdp_set_features_flag(bond_dev, val);
}

/* Enslave device <slave_dev> to bond device <bond_dev>.
 *
 * @bond_dev:  the bond (master) device
 * @slave_dev: the device to attach; it is rejected if it is up (IFF_UP),
 *             if netdev_is_rx_handler_busy() reports it busy, or if it is
 *             a non-bond master device
 * @extack:    netlink extended ack for error messages; when NULL the
 *             BOND_NL_ERR()/SLAVE_NL_ERR() macros log the message instead
 *
 * Expected to run under RTNL: the vlan check below relies on it and the
 * bond_xdp_set_features() call on the success path asserts it.
 *
 * The function proceeds in stages (validation, device type setup, MTU and
 * MAC setup, open, link state initialisation, mode specific setup, rx
 * handler / upper link / sysfs registration, address sync, XDP).  A failing
 * stage jumps to the err_* label that unwinds the stages completed so far.
 *
 * Returns 0 on success, or a negative errno: -EPERM, -EBUSY, -EINVAL,
 * -EOPNOTSUPP, -ENOMEM, or the error returned by the failing callee.
 */
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
                 struct netlink_ext_ack *extack)
{
        struct bonding *bond = netdev_priv(bond_dev);
        const struct net_device_ops *slave_ops = slave_dev->netdev_ops;
        struct slave *new_slave = NULL, *prev_slave;
        struct sockaddr_storage ss;
        int link_reporting;
        int res = 0, i;

        /* a master device may only be enslaved if it is itself a bond */
        if (slave_dev->flags & IFF_MASTER &&
            !netif_is_bond_master(slave_dev)) {
                BOND_NL_ERR(bond_dev, extack,
                            "Device type (master device) cannot be enslaved");
                return -EPERM;
        }

        /* not fatal: only warn when use_carrier is off and the slave offers
         * neither an ethtool get_link nor an ndo_eth_ioctl callback
         */
        if (!bond->params.use_carrier &&
            slave_dev->ethtool_ops->get_link == NULL &&
            slave_ops->ndo_eth_ioctl == NULL) {
                slave_warn(bond_dev, slave_dev, "no link monitoring support\n");
        }

        /* already in-use? */
        if (netdev_is_rx_handler_busy(slave_dev)) {
                SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                             "Device is in use and cannot be enslaved");
                return -EBUSY;
        }

        if (bond_dev == slave_dev) {
                BOND_NL_ERR(bond_dev, extack, "Cannot enslave bond to itself.");
                return -EPERM;
        }

        /* vlan challenged mutual exclusion */
        /* no need to lock since we're protected by rtnl_lock */
        if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) {
                slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n");
                if (vlan_uses_dev(bond_dev)) {
                        SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                                     "Can not enslave VLAN challenged device to VLAN enabled bond");
                        return -EPERM;
                } else {
                        slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n");
                }
        } else {
                slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n");
        }

        if (slave_dev->features & NETIF_F_HW_ESP)
                slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n");

        /* Old ifenslave binaries are no longer supported.  These can
         * be identified with moderate accuracy by the state of the slave:
         * the current ifenslave will set the interface down prior to
         * enslaving it; the old ifenslave will not.
         */
        if (slave_dev->flags & IFF_UP) {
                SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                             "Device can not be enslaved while up");
                return -EPERM;
        }

        /* set bonding device ether type by slave - bonding netdevices are
         * created with ether_setup, so when the slave type is not ARPHRD_ETHER
         * there is a need to override some of the type dependent attribs/funcs.
         *
         * bond ether type mutual exclusion - don't allow slaves of dissimilar
         * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond
         */
        if (!bond_has_slaves(bond)) {
                if (bond_dev->type != slave_dev->type) {
                        slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n",
                                  bond_dev->type, slave_dev->type);

                        res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE,
                                                       bond_dev);
                        res = notifier_to_errno(res);
                        /* any notifier veto is reported as -EBUSY; the
                         * notifier's own errno is not propagated
                         */
                        if (res) {
                                slave_err(bond_dev, slave_dev, "refused to change device type\n");
                                return -EBUSY;
                        }

                        /* Flush unicast and multicast addresses */
                        dev_uc_flush(bond_dev);
                        dev_mc_flush(bond_dev);

                        if (slave_dev->type != ARPHRD_ETHER)
                                bond_setup_by_slave(bond_dev, slave_dev);
                        else
                                bond_ether_setup(bond_dev);

                        call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE,
                                                 bond_dev);
                }
        } else if (bond_dev->type != slave_dev->type) {
                SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                             "Device type is different from other slaves");
                return -EINVAL;
        }

        /* From here on failures go through err_undo_flags (or a later
         * label), which reverts the bond to ARPHRD_ETHER if it was retyped
         * above and still has no slaves.
         */
        if (slave_dev->type == ARPHRD_INFINIBAND &&
            BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                             "Only active-backup mode is supported for infiniband slaves");
                res = -EOPNOTSUPP;
                goto err_undo_flags;
        }

        /* A slave that cannot change its MAC is only acceptable in
         * active-backup mode with fail_over_mac=active; for the first slave
         * that setting is forced, for later slaves the enslave is refused.
         */
        if (!slave_ops->ndo_set_mac_address ||
            slave_dev->type == ARPHRD_INFINIBAND) {
                slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n");
                if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
                    bond->params.fail_over_mac != BOND_FOM_ACTIVE) {
                        if (!bond_has_slaves(bond)) {
                                bond->params.fail_over_mac = BOND_FOM_ACTIVE;
                                slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n");
                        } else {
                                SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                                             "Slave device does not support setting the MAC address, but fail_over_mac is not set to active");
                                res = -EOPNOTSUPP;
                                goto err_undo_flags;
                        }
                }
        }

        call_netdevice_notifiers(NETDEV_JOIN, slave_dev);

        /* If this is the first slave, then we need to set the master's hardware
         * address to be the same as the slave's.
         */
        if (!bond_has_slaves(bond) &&
            bond->dev->addr_assign_type == NET_ADDR_RANDOM) {
                res = bond_set_dev_addr(bond->dev, slave_dev);
                if (res)
                        goto err_undo_flags;
        }

        new_slave = bond_alloc_slave(bond, slave_dev);
        if (!new_slave) {
                res = -ENOMEM;
                goto err_undo_flags;
        }

        /* Set the new_slave's queue_id to be zero.  Queue ID mapping
         * is set via sysfs or module option if desired.
         */
        new_slave->queue_id = 0;

        /* Save slave's original mtu and then set it to match the bond */
        new_slave->original_mtu = slave_dev->mtu;
        res = dev_set_mtu(slave_dev, bond->dev->mtu);
        if (res) {
                slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res);
                goto err_free;
        }

        /* Save slave's original ("permanent") mac address for modes
         * that need it, and for restoring it upon release, and then
         * set it to the master's address
         */
        bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr,
                          slave_dev->addr_len);

        /* the slave keeps its own MAC only in active-backup mode with
         * fail_over_mac enabled
         */
        if (!bond->params.fail_over_mac ||
            BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                /* Set slave to master's mac address.  The application already
                 * set the master's mac address to that of the first slave
                 */
                memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
                ss.ss_family = slave_dev->type;
                res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss,
                                          extack);
                if (res) {
                        slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res);
                        goto err_restore_mtu;
                }
        }

        /* set no_addrconf flag before open to prevent IPv6 addrconf */
        slave_dev->priv_flags |= IFF_NO_ADDRCONF;

        /* open the slave since the application closed it */
        res = dev_open(slave_dev, extack);
        if (res) {
                slave_err(bond_dev, slave_dev, "Opening slave failed\n");
                goto err_restore_mac;
        }

        slave_dev->priv_flags |= IFF_BONDING;
        /* initialize slave stats */
        dev_get_stats(new_slave->dev, &new_slave->slave_stats);

        if (bond_is_lb(bond)) {
                /* bond_alb_init_slave() must be called before all other stages since
                 * it might fail and we do not want to have to undo everything
                 */
                res = bond_alb_init_slave(bond, new_slave);
                if (res)
                        goto err_close;
        }

        res = vlan_vids_add_by_dev(slave_dev, bond_dev);
        if (res) {
                slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n");
                goto err_close;
        }

        /* sampled before the new slave is linked; NULL is treated as
         * "this is the first slave" in the 802.3ad case below
         */
        prev_slave = bond_last_slave(bond);

        new_slave->delay = 0;
        new_slave->link_failure_count = 0;

        if (bond_update_speed_duplex(new_slave) &&
            bond_needs_speed_duplex(bond))
                new_slave->link = BOND_LINK_DOWN;

        /* back-date the rx/tx timestamps by one jiffy more than
         * arp_interval
         */
        new_slave->last_rx = jiffies -
                (msecs_to_jiffies(bond->params.arp_interval) + 1);
        for (i = 0; i < BOND_MAX_ARP_TARGETS; i++)
                new_slave->target_last_arp_rx[i] = new_slave->last_rx;

        new_slave->last_tx = new_slave->last_rx;

        if (bond->params.miimon && !bond->params.use_carrier) {
                link_reporting = bond_check_dev_link(bond, slave_dev, 1);

                if ((link_reporting == -1) && !bond->params.arp_interval) {
                        /* miimon is set but a bonded network driver
                         * does not support ETHTOOL/MII and
                         * arp_interval is not set.  Note: if
                         * use_carrier is enabled, we will never go
                         * here (because netif_carrier is always
                         * supported); thus, we don't need to change
                         * the messages for netif_carrier.
                         */
                        slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n");
                } else if (link_reporting == -1) {
                        /* unable get link status using mii/ethtool */
                        slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n");
                }
        }

        /* check for initial state */
        /* Initial link state: with miimon it follows bond_check_dev_link()
         * (BACK instead of UP when an updelay is configured); with only
         * arp_interval it follows the carrier; with neither it is UP.
         */
        new_slave->link = BOND_LINK_NOCHANGE;
        if (bond->params.miimon) {
                if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) {
                        if (bond->params.updelay) {
                                bond_set_slave_link_state(new_slave,
                                                          BOND_LINK_BACK,
                                                          BOND_SLAVE_NOTIFY_NOW);
                                new_slave->delay = bond->params.updelay;
                        } else {
                                bond_set_slave_link_state(new_slave,
                                                          BOND_LINK_UP,
                                                          BOND_SLAVE_NOTIFY_NOW);
                        }
                } else {
                        bond_set_slave_link_state(new_slave, BOND_LINK_DOWN,
                                                  BOND_SLAVE_NOTIFY_NOW);
                }
        } else if (bond->params.arp_interval) {
                bond_set_slave_link_state(new_slave,
                                          (netif_carrier_ok(slave_dev) ?
                                          BOND_LINK_UP : BOND_LINK_DOWN),
                                          BOND_SLAVE_NOTIFY_NOW);
        } else {
                bond_set_slave_link_state(new_slave, BOND_LINK_UP,
                                          BOND_SLAVE_NOTIFY_NOW);
        }

        if (new_slave->link != BOND_LINK_DOWN)
                new_slave->last_link_up = jiffies;
        slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n",
                  new_slave->link == BOND_LINK_DOWN ? "DOWN" :
                  (new_slave->link == BOND_LINK_UP ? "UP" : "BACK"));

        if (bond_uses_primary(bond) && bond->params.primary[0]) {
                /* if there is a primary slave, remember it */
                if (strcmp(bond->params.primary, new_slave->dev->name) == 0) {
                        rcu_assign_pointer(bond->primary_slave, new_slave);
                        bond->force_primary = true;
                }
        }

        /* mode specific initial active/inactive state of the new slave */
        switch (BOND_MODE(bond)) {
        case BOND_MODE_ACTIVEBACKUP:
                bond_set_slave_inactive_flags(new_slave,
                                              BOND_SLAVE_NOTIFY_NOW);
                break;
        case BOND_MODE_8023AD:
                /* in 802.3ad mode, the internal mechanism
                 * will activate the slaves in the selected
                 * aggregator
                 */
                bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
                /* if this is the first slave */
                if (!prev_slave) {
                        SLAVE_AD_INFO(new_slave)->id = 1;
                        /* Initialize AD with the number of times that the AD timer is called in 1 second
                         * can be called only after the mac address of the bond is set
                         */
                        bond_3ad_initialize(bond);
                } else {
                        SLAVE_AD_INFO(new_slave)->id =
                                SLAVE_AD_INFO(prev_slave)->id + 1;
                }

                bond_3ad_bind_slave(new_slave);
                break;
        case BOND_MODE_TLB:
        case BOND_MODE_ALB:
                bond_set_active_slave(new_slave);
                bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW);
                break;
        default:
                slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n");

                /* always active in trunk mode */
                bond_set_active_slave(new_slave);

                /* In trunking mode there is little meaning to curr_active_slave
                 * anyway (it holds no special properties of the bond device),
                 * so we can change it without calling change_active_interface()
                 */
                if (!rcu_access_pointer(bond->curr_active_slave) &&
                    new_slave->link == BOND_LINK_UP)
                        rcu_assign_pointer(bond->curr_active_slave, new_slave);

                break;
        } /* switch(bond_mode) */

#ifdef CONFIG_NET_POLL_CONTROLLER
        /* if the bond has netpoll state, the new slave must accept it too */
        if (bond->dev->npinfo) {
                if (slave_enable_netpoll(new_slave)) {
                        slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n");
                        res = -EBUSY;
                        goto err_detach;
                }
        }
#endif

        if (!(bond_dev->features & NETIF_F_LRO))
                dev_disable_lro(slave_dev);

        /* install bond_handle_frame as the slave's rx handler, with
         * new_slave as its private data
         */
        res = netdev_rx_handler_register(slave_dev, bond_handle_frame,
                                         new_slave);
        if (res) {
                slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res);
                goto err_detach;
        }

        res = bond_master_upper_dev_link(bond, new_slave, extack);
        if (res) {
                slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res);
                goto err_unregister;
        }

        bond_lower_state_changed(new_slave);

        res = bond_sysfs_slave_add(new_slave);
        if (res) {
                slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res);
                goto err_upper_unlink;
        }

        /* If the mode uses primary, then the following is handled by
         * bond_change_active_slave().
         */
        if (!bond_uses_primary(bond)) {
                /* set promiscuity level to new slave */
                if (bond_dev->flags & IFF_PROMISC) {
                        res = dev_set_promiscuity(slave_dev, 1);
                        if (res)
                                goto err_sysfs_del;
                }

                /* set allmulti level to new slave */
                if (bond_dev->flags & IFF_ALLMULTI) {
                        res = dev_set_allmulti(slave_dev, 1);
                        if (res) {
                                /* drop the promiscuity reference taken
                                 * just above before unwinding
                                 */
                                if (bond_dev->flags & IFF_PROMISC)
                                        dev_set_promiscuity(slave_dev, -1);
                                goto err_sysfs_del;
                        }
                }

                if (bond_dev->flags & IFF_UP) {
                        netif_addr_lock_bh(bond_dev);
                        dev_mc_sync_multiple(slave_dev, bond_dev);
                        dev_uc_sync_multiple(slave_dev, bond_dev);
                        netif_addr_unlock_bh(bond_dev);

                        if (BOND_MODE(bond) == BOND_MODE_8023AD)
                                dev_mc_add(slave_dev, lacpdu_mcast_addr);
                }
        }

        bond->slave_cnt++;
        bond_compute_features(bond);
        bond_set_carrier(bond);

        /* Needs to be called before bond_select_active_slave(), which will
         * remove the maddrs if the slave is selected as active slave.
         */
        bond_slave_ns_maddrs_add(bond, new_slave);

        if (bond_uses_primary(bond)) {
                block_netpoll_tx();
                bond_select_active_slave(bond);
                unblock_netpoll_tx();
        }

        if (bond_mode_can_use_xmit_hash(bond))
                bond_update_slave_arr(bond, NULL);

        /* XDP: a slave lacking ndo_bpf or ndo_xdp_xmit is refused when the
         * bond has an XDP program; otherwise the bond's program is
         * propagated to the slave, which must not have a program of its
         * own.
         *
         * NOTE(review): the failure paths below jump to err_sysfs_del after
         * slave_cnt was incremented and the promisc/allmulti/address sync
         * above was applied; the unwind labels do not visibly revert those
         * - confirm whether that is intended.
         */
        if (!slave_dev->netdev_ops->ndo_bpf ||
            !slave_dev->netdev_ops->ndo_xdp_xmit) {
                if (bond->xdp_prog) {
                        SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                                     "Slave does not support XDP");
                        res = -EOPNOTSUPP;
                        goto err_sysfs_del;
                }
        } else if (bond->xdp_prog) {
                struct netdev_bpf xdp = {
                        .command = XDP_SETUP_PROG,
                        .flags   = 0,
                        .prog    = bond->xdp_prog,
                        .extack  = extack,
                };

                if (dev_xdp_prog_count(slave_dev) > 0) {
                        SLAVE_NL_ERR(bond_dev, slave_dev, extack,
                                     "Slave has XDP program loaded, please unload before enslaving");
                        res = -EOPNOTSUPP;
                        goto err_sysfs_del;
                }

                res = dev_xdp_propagate(slave_dev, &xdp);
                if (res < 0) {
                        /* ndo_bpf() sets extack error message */
                        slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res);
                        goto err_sysfs_del;
                }
                /* take a program reference after a successful propagate */
                if (bond->xdp_prog)
                        bpf_prog_inc(bond->xdp_prog);
        }

        bond_xdp_set_features(bond_dev);

        slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n",
                   bond_is_active_slave(new_slave) ? "an active" : "a backup",
                   new_slave->link != BOND_LINK_DOWN ? "an up" : "a down");

        /* enslave is successful */
        bond_queue_slave_event(new_slave);
        return 0;

/* Undo stages on error */
/* Each label undoes one stage and falls through to the labels for the
 * earlier stages.
 */
err_sysfs_del:
        bond_sysfs_slave_del(new_slave);

err_upper_unlink:
        bond_upper_dev_unlink(bond, new_slave);

err_unregister:
        netdev_rx_handler_unregister(slave_dev);

err_detach:
        vlan_vids_del_by_dev(slave_dev, bond_dev);
        if (rcu_access_pointer(bond->primary_slave) == new_slave)
                RCU_INIT_POINTER(bond->primary_slave, NULL);
        if (rcu_access_pointer(bond->curr_active_slave) == new_slave) {
                block_netpoll_tx();
                bond_change_active_slave(bond, NULL);
                bond_select_active_slave(bond);
                unblock_netpoll_tx();
        }
        /* either primary_slave or curr_active_slave might've changed */
        synchronize_rcu();
        slave_disable_netpoll(new_slave);

err_close:
        /* a slave that is itself a bond master keeps IFF_BONDING */
        if (!netif_is_bond_master(slave_dev))
                slave_dev->priv_flags &= ~IFF_BONDING;
        dev_close(slave_dev);

err_restore_mac:
        slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;
        /* same condition as the one under which the MAC was changed above */
        if (!bond->params.fail_over_mac ||
            BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                /* XXX TODO - fom follow mode needs to change master's
                 * MAC if this slave's MAC is in use by the bond, or at
                 * least print a warning.
                 */
                bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr,
                                  new_slave->dev->addr_len);
                ss.ss_family = slave_dev->type;
                dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
        }

err_restore_mtu:
        dev_set_mtu(slave_dev, new_slave->original_mtu);

err_free:
        /* drop the reference on the slave's kobject */
        kobject_put(&new_slave->kobj);

err_undo_flags:
        /* Enslave of first slave has failed and we need to fix master's mac */
        if (!bond_has_slaves(bond)) {
                if (ether_addr_equal_64bits(bond_dev->dev_addr,
                                            slave_dev->dev_addr))
                        eth_hw_addr_random(bond_dev);
                /* revert a device type change made for this slave */
                if (bond_dev->type != ARPHRD_ETHER) {
                        dev_close(bond_dev);
                        bond_ether_setup(bond_dev);
                }
        }

        return res;
}

/* Try to release the slave device <slave_dev> from the bond device <bond_dev>.
 * It is legal to access curr_active_slave without a lock because the whole
 * function runs under RTNL. If "all" is true it means that the function is
 * being called while destroying a bond interface and all slaves are being
 * released.
 *
 * @bond_dev:   the bond (master) device
 * @slave_dev:  the device to detach; must have IFF_SLAVE set and have
 *              @bond_dev as an upper device
 * @all:        when true, the "permanent HWaddr still in use" warning is
 *              skipped and curr_active_slave is cleared instead of a new
 *              active slave being selected
 * @unregister: when true, the slave's MTU is restored with __dev_set_mtu()
 *              under netdev_lock_ops() instead of dev_set_mtu()
 *
 * Returns 0 on success, -EINVAL if <slave_dev> is not a slave of this bond.
 *
 * The rules for slave state should be:
 *   for Active/Backup:
 *     Active stays on all backups go down
 *   for Bonded connections:
 *     The first up interface should be left on and all others downed.
 */
static int __bond_release_one(struct net_device *bond_dev,
                              struct net_device *slave_dev,
                              bool all, bool unregister)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *slave, *oldcurrent;
        struct sockaddr_storage ss;
        /* snapshots taken before anything below can change them; see the
         * promiscuity note further down
         */
        int old_flags = bond_dev->flags;
        netdev_features_t old_features = bond_dev->features;

        /* slave is not a slave or master is not master of this slave */
        if (!(slave_dev->flags & IFF_SLAVE) ||
            !netdev_has_upper_dev(slave_dev, bond_dev)) {
                slave_dbg(bond_dev, slave_dev, "cannot release slave\n");
                return -EINVAL;
        }

        block_netpoll_tx();

        slave = bond_get_slave_by_dev(bond, slave_dev);
        if (!slave) {
                /* not a slave of this bond */
                slave_info(bond_dev, slave_dev, "interface not enslaved\n");
                unblock_netpoll_tx();
                return -EINVAL;
        }

        bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW);

        bond_sysfs_slave_del(slave);

        /* recompute stats just before removing the slave */
        bond_get_stats(bond->dev, &bond->bond_stats);

        /* detach the bond's XDP program from the slave (prog = NULL);
         * a failure is only logged
         */
        if (bond->xdp_prog) {
                struct netdev_bpf xdp = {
                        .command = XDP_SETUP_PROG,
                        .flags   = 0,
                        .prog         = NULL,
                        .extack  = NULL,
                };
                if (dev_xdp_propagate(slave_dev, &xdp))
                        slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n");
        }

        /* unregister rx_handler early so bond_handle_frame wouldn't be called
         * for this slave anymore.
         */
        netdev_rx_handler_unregister(slave_dev);

        if (BOND_MODE(bond) == BOND_MODE_8023AD)
                bond_3ad_unbind_slave(slave);

        bond_upper_dev_unlink(bond, slave);

        if (bond_mode_can_use_xmit_hash(bond))
                bond_update_slave_arr(bond, slave);

        slave_info(bond_dev, slave_dev, "Releasing %s interface\n",
                    bond_is_active_slave(slave) ? "active" : "backup");

        /* remember who was active before any reselection below */
        oldcurrent = rcu_access_pointer(bond->curr_active_slave);

        RCU_INIT_POINTER(bond->current_arp_slave, NULL);

        /* warn if the bond keeps using the MAC that is about to be restored
         * on the released slave while other slaves remain
         */
        if (!all && (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
                     BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) {
                if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) &&
                    bond_has_slaves(bond))
                        slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n",
                                   slave->perm_hwaddr);
        }

        if (rtnl_dereference(bond->primary_slave) == slave)
                RCU_INIT_POINTER(bond->primary_slave, NULL);

        if (oldcurrent == slave)
                bond_change_active_slave(bond, NULL);

        /* Must be called after bond_change_active_slave () as the slave
         * might change from an active slave to a backup slave. Then it is
         * necessary to clear the maddrs on the backup slave.
         */
        bond_slave_ns_maddrs_del(bond, slave);

        if (bond_is_lb(bond)) {
                /* Must be called only after the slave has been
                 * detached from the list and the curr_active_slave
                 * has been cleared (if our_slave == old_current),
                 * but before a new active slave is selected.
                 */
                bond_alb_deinit_slave(bond, slave);
        }

        if (all) {
                RCU_INIT_POINTER(bond->curr_active_slave, NULL);
        } else if (oldcurrent == slave) {
                /* Note that we hold RTNL over this sequence, so there
                 * is no concern that another slave add/remove event
                 * will interfere.
                 */
                bond_select_active_slave(bond);
        }

        bond_set_carrier(bond);
        /* last slave gone: give the bond a fresh random MAC */
        if (!bond_has_slaves(bond))
                eth_hw_addr_random(bond_dev);

        unblock_netpoll_tx();
        /* NOTE(review): presumably waits for RCU readers that may still
         * reference the slave before the remaining teardown - confirm
         */
        synchronize_rcu();
        bond->slave_cnt--;

        if (!bond_has_slaves(bond)) {
                call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev);
                call_netdevice_notifiers(NETDEV_RELEASE, bond->dev);
        }

        bond_compute_features(bond);
        if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) &&
            (old_features & NETIF_F_VLAN_CHALLENGED))
                slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n");

        vlan_vids_del_by_dev(slave_dev, bond_dev);

        /* If the mode uses primary, then this case was handled above by
         * bond_change_active_slave(..., NULL)
         */
        if (!bond_uses_primary(bond)) {
                /* unset promiscuity level from slave
                 * NOTE: The NETDEV_CHANGEADDR call above may change the value
                 * of the IFF_PROMISC flag in the bond_dev, but we need the
                 * value of that flag before that change, as that was the value
                 * when this slave was attached, so we cache at the start of the
                 * function and use it here. Same goes for ALLMULTI below
                 */
                if (old_flags & IFF_PROMISC)
                        dev_set_promiscuity(slave_dev, -1);

                /* unset allmulti level from slave */
                if (old_flags & IFF_ALLMULTI)
                        dev_set_allmulti(slave_dev, -1);

                if (old_flags & IFF_UP)
                        bond_hw_addr_flush(bond_dev, slave_dev);
        }

        slave_disable_netpoll(slave);

        /* close slave before restoring its mac address */
        dev_close(slave_dev);

        slave_dev->priv_flags &= ~IFF_NO_ADDRCONF;

        if (bond->params.fail_over_mac != BOND_FOM_ACTIVE ||
            BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                /* restore original ("permanent") mac address */
                bond_hw_addr_copy(ss.__data, slave->perm_hwaddr,
                                  slave->dev->addr_len);
                ss.ss_family = slave_dev->type;
                dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL);
        }

        /* restore the MTU saved at enslave time; the unregister path uses
         * the __dev_set_mtu() variant under the device's ops lock
         */
        if (unregister) {
                netdev_lock_ops(slave_dev);
                __dev_set_mtu(slave_dev, slave->original_mtu);
                netdev_unlock_ops(slave_dev);
        } else {
                dev_set_mtu(slave_dev, slave->original_mtu);
        }

        /* a slave that is itself a bond master keeps IFF_BONDING */
        if (!netif_is_bond_master(slave_dev))
                slave_dev->priv_flags &= ~IFF_BONDING;

        bond_xdp_set_features(bond_dev);
        /* drop the reference on the slave's kobject */
        kobject_put(&slave->kobj);

        return 0;
}

/* Release a single slave from the bond: a plain wrapper (used because of
 * ndo_del_link) around __bond_release_one() with all = false and
 * unregister = false.
 */
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev)
{
        int ret;

        ret = __bond_release_one(bond_dev, slave_dev, false, false);

        return ret;
}

/* Release <slave_dev> from the bond and, if that leaves the bond without
 * slaves and the bond is not already unregistering, destroy the bond.
 * Must be under rtnl_lock when this function is called.
 *
 * Returns the result of __bond_release_one().
 */
static int bond_release_and_destroy(struct net_device *bond_dev,
                                    struct net_device *slave_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        int ret;

        ret = __bond_release_one(bond_dev, slave_dev, false, true);
        if (ret != 0)
                return ret;

        /* other slaves remain: the bond stays */
        if (bond_has_slaves(bond))
                return 0;

        /* the bond is already on its way out */
        if (bond_dev->reg_state == NETREG_UNREGISTERING)
                return 0;

        bond_dev->priv_flags |= IFF_DISABLE_NETPOLL;
        netdev_info(bond_dev, "Destroying bond\n");
        bond_remove_proc_entry(bond);
        unregister_netdevice(bond_dev);

        return 0;
}

/* Fill @info with the ifbond description of the bond behind @bond_dev. */
static void bond_info_query(struct net_device *bond_dev, struct ifbond *info)
{
        bond_fill_ifbond(netdev_priv(bond_dev), info);
}

static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter;
        int i = 0, res = -ENODEV;
        struct slave *slave;

        bond_for_each_slave(bond, slave, iter) {
                if (i++ == (int)info->slave_id) {
                        res = 0;
                        bond_fill_ifslave(slave, info);
                        break;
                }
        }

        return res;
}

/*-------------------------------- Monitoring -------------------------------*/

/* called with rcu_read_lock()
 *
 * Inspection phase of the MII monitor. For every slave, the carrier state
 * reported by bond_check_dev_link() is compared with slave->link and a new
 * link state is proposed with bond_propose_link_state(); slave->link itself
 * is not modified here.
 *
 * slave->delay is loaded from params.downdelay / params.updelay when a
 * transition starts and is decremented once per call while the slave stays
 * in the intermediate FAIL / BACK state.
 *
 * Returns the number of non-NOCHANGE proposals made during this pass; zero
 * means there is nothing for the commit phase to do.
 */
static int bond_miimon_inspect(struct bonding *bond)
{
        bool ignore_updelay = false;
        int link_state, commit = 0;
        struct list_head *iter;
        struct slave *slave;

        /* updelay is skipped when nothing is currently carrying traffic:
         * no active slave in active-backup mode, or an existing but empty
         * usable_slaves array in the other modes.
         */
        if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) {
                ignore_updelay = !rcu_dereference(bond->curr_active_slave);
        } else {
                struct bond_up_slave *usable_slaves;

                usable_slaves = rcu_dereference(bond->usable_slaves);

                if (usable_slaves && usable_slaves->count == 0)
                        ignore_updelay = true;
        }

        bond_for_each_slave_rcu(bond, slave, iter) {
                /* start from "no change"; the cases below may override it */
                bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

                link_state = bond_check_dev_link(bond, slave->dev, 0);

                switch (slave->link) {
                case BOND_LINK_UP:
                        /* still up: nothing to do for this slave */
                        if (link_state)
                                continue;

                        /* carrier lost: start the downdelay countdown */
                        bond_propose_link_state(slave, BOND_LINK_FAIL);
                        commit++;
                        slave->delay = bond->params.downdelay;
                        if (slave->delay && net_ratelimit()) {
                                slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n",
                                           (BOND_MODE(bond) ==
                                            BOND_MODE_ACTIVEBACKUP) ?
                                            (bond_is_active_slave(slave) ?
                                             "active " : "backup ") : "",
                                           bond->params.downdelay * bond->params.miimon);
                        }
                        /* a zero downdelay is handled right away below */
                        fallthrough;
                case BOND_LINK_FAIL:
                        if (link_state) {
                                /* recovered before downdelay expired */
                                bond_propose_link_state(slave, BOND_LINK_UP);
                                slave->last_link_up = jiffies;
                                if (net_ratelimit())
                                        slave_info(bond->dev, slave->dev, "link status up again after %d ms\n",
                                                   (bond->params.downdelay - slave->delay) *
                                                   bond->params.miimon);
                                commit++;
                                continue;
                        }

                        /* countdown finished: propose the final DOWN state */
                        if (slave->delay <= 0) {
                                bond_propose_link_state(slave, BOND_LINK_DOWN);
                                commit++;
                                continue;
                        }

                        slave->delay--;
                        break;

                case BOND_LINK_DOWN:
                        /* still down: nothing to do for this slave */
                        if (!link_state)
                                continue;

                        /* carrier is back: start the updelay countdown */
                        bond_propose_link_state(slave, BOND_LINK_BACK);
                        commit++;
                        slave->delay = bond->params.updelay;

                        if (slave->delay && net_ratelimit()) {
                                slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n",
                                           ignore_updelay ? 0 :
                                           bond->params.updelay *
                                           bond->params.miimon);
                        }
                        /* a zero or ignored updelay is handled right away below */
                        fallthrough;
                case BOND_LINK_BACK:
                        if (!link_state) {
                                /* carrier dropped again before updelay expired */
                                bond_propose_link_state(slave, BOND_LINK_DOWN);
                                if (net_ratelimit())
                                        slave_info(bond->dev, slave->dev, "link status down again after %d ms\n",
                                                   (bond->params.updelay - slave->delay) *
                                                   bond->params.miimon);
                                commit++;
                                continue;
                        }

                        if (ignore_updelay)
                                slave->delay = 0;

                        if (slave->delay <= 0) {
                                bond_propose_link_state(slave, BOND_LINK_UP);
                                commit++;
                                /* only the first slave proposed UP in this
                                 * pass gets to skip updelay
                                 */
                                ignore_updelay = false;
                                continue;
                        }

                        slave->delay--;
                        break;
                }
        }

        return commit;
}

/* Forward a committed link change on @slave to the handler that matches the
 * bond's mode: 802.3ad, TLB/ALB, or (for XOR) a rebuild of the slave array.
 * Other modes need no extra work here.
 */
static void bond_miimon_link_change(struct bonding *bond,
                                    struct slave *slave,
                                    char link)
{
        const int mode = BOND_MODE(bond);

        if (mode == BOND_MODE_8023AD)
                bond_3ad_handle_link_change(slave, link);
        else if (mode == BOND_MODE_TLB || mode == BOND_MODE_ALB)
                bond_alb_handle_link_change(bond, slave, link);
        else if (mode == BOND_MODE_XOR)
                bond_update_slave_arr(bond, NULL);
}

/* Commit phase of the MII monitor.
 *
 * Walks all slaves and acts on slave->link_new_state: brings slaves
 * definitely up or down, informs the mode-specific code through
 * bond_miimon_link_change(), and records whether a new active slave must be
 * selected. After the walk the active slave is re-selected if needed and the
 * bond's carrier is updated.
 *
 * Must be called with RTNL held (enforced by ASSERT_RTNL()).
 */
static void bond_miimon_commit(struct bonding *bond)
{
        struct slave *slave, *primary, *active;
        bool do_failover = false;
        struct list_head *iter;

        ASSERT_RTNL();

        bond_for_each_slave(bond, slave, iter) {
                switch (slave->link_new_state) {
                case BOND_LINK_NOCHANGE:
                        /* For 802.3ad mode, check current slave speed and
                         * duplex again in case its port was disabled after
                         * invalid speed/duplex reporting but recovered before
                         * link monitoring could make a decision on the actual
                         * link status
                         */
                        if (BOND_MODE(bond) == BOND_MODE_8023AD &&
                            slave->link == BOND_LINK_UP)
                                bond_3ad_adapter_speed_duplex_changed(slave);
                        continue;

                case BOND_LINK_UP:
                        /* a slave whose speed/duplex cannot be read is kept
                         * down when the bond needs those values
                         */
                        if (bond_update_speed_duplex(slave) &&
                            bond_needs_speed_duplex(bond)) {
                                slave->link = BOND_LINK_DOWN;
                                if (net_ratelimit())
                                        slave_warn(bond->dev, slave->dev,
                                                   "failed to get link speed/duplex\n");
                                continue;
                        }
                        bond_set_slave_link_state(slave, BOND_LINK_UP,
                                                  BOND_SLAVE_NOTIFY_NOW);
                        slave->last_link_up = jiffies;

                        primary = rtnl_dereference(bond->primary_slave);
                        if (BOND_MODE(bond) == BOND_MODE_8023AD) {
                                /* prevent it from being the active one */
                                bond_set_backup_slave(slave);
                        } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                                /* make it immediately active */
                                bond_set_active_slave(slave);
                        }

                        slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n",
                                   slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
                                   slave->duplex ? "full" : "half");

                        bond_miimon_link_change(bond, slave, BOND_LINK_UP);

                        /* reselect when there is no active slave, when the
                         * primary came back, or when this slave has a higher
                         * prio value than the current active one
                         */
                        active = rtnl_dereference(bond->curr_active_slave);
                        if (!active || slave == primary || slave->prio > active->prio)
                                do_failover = true;

                        continue;

                case BOND_LINK_DOWN:
                        /* saturate instead of wrapping the failure counter */
                        if (slave->link_failure_count < UINT_MAX)
                                slave->link_failure_count++;

                        bond_set_slave_link_state(slave, BOND_LINK_DOWN,
                                                  BOND_SLAVE_NOTIFY_NOW);

                        if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
                            BOND_MODE(bond) == BOND_MODE_8023AD)
                                bond_set_slave_inactive_flags(slave,
                                                              BOND_SLAVE_NOTIFY_NOW);

                        slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");

                        bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);

                        /* losing the active slave forces a reselection */
                        if (slave == rcu_access_pointer(bond->curr_active_slave))
                                do_failover = true;

                        continue;

                default:
                        /* unexpected proposal: report it and reset it */
                        slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n",
                                  slave->link_new_state);
                        bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

                        continue;
                }
        }

        if (do_failover) {
                /* netpoll transmit is blocked around the reselection */
                block_netpoll_tx();
                bond_select_active_slave(bond);
                unblock_netpoll_tx();
        }

        bond_set_carrier(bond);
}

/* bond_mii_monitor
 *
 * Really a wrapper that splits the mii monitor into two phases: an
 * inspection, then (if inspection indicates something needs to be done)
 * an acquisition of appropriate locks followed by a commit phase to
 * implement whatever link state changes are indicated.
 *
 * Runs as the handler of bond->mii_work and re-queues itself on bond->wq
 * for as long as params.miimon is non-zero. RTNL is only ever taken with
 * rtnl_trylock(); when that fails the affected step is skipped or retried.
 */
static void bond_mii_monitor(struct work_struct *work)
{
        struct bonding *bond = container_of(work, struct bonding,
                                            mii_work.work);
        bool should_notify_peers = false;
        bool commit;
        unsigned long delay;
        struct slave *slave;
        struct list_head *iter;

        /* default re-arm period: one miimon interval */
        delay = msecs_to_jiffies(bond->params.miimon);

        if (!bond_has_slaves(bond))
                goto re_arm;

        /* inspection phase, done under RCU only */
        rcu_read_lock();
        should_notify_peers = bond_should_notify_peers(bond);
        commit = !!bond_miimon_inspect(bond);
        if (bond->send_peer_notif) {
                rcu_read_unlock();
                /* the counter is only decremented when RTNL is available */
                if (rtnl_trylock()) {
                        bond->send_peer_notif--;
                        rtnl_unlock();
                }
        } else {
                rcu_read_unlock();
        }

        if (commit) {
                /* Race avoidance with bond_close cancel of workqueue */
                if (!rtnl_trylock()) {
                        /* retry after one jiffy and skip the peer
                         * notification for this run
                         */
                        delay = 1;
                        should_notify_peers = false;
                        goto re_arm;
                }

                /* apply the proposed states, then act on them */
                bond_for_each_slave(bond, slave, iter) {
                        bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER);
                }
                bond_miimon_commit(bond);

                rtnl_unlock();        /* might sleep, hold no other locks */
        }

re_arm:
        if (bond->params.miimon)
                queue_delayed_work(bond->wq, &bond->mii_work, delay);

        if (should_notify_peers) {
                /* best effort: the notification is dropped if RTNL is busy */
                if (!rtnl_trylock())
                        return;
                call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev);
                rtnl_unlock();
        }
}

static int bond_upper_dev_walk(struct net_device *upper,
                               struct netdev_nested_priv *priv)
{
        __be32 ip = *(__be32 *)priv->data;

        return ip == bond_confirm_addr(upper, 0, ip);
}

/* Return true when @ip is confirmed for the bond device itself or for any
 * device stacked above it.
 */
static bool bond_has_this_ip(struct bonding *bond, __be32 ip)
{
        struct netdev_nested_priv priv = {
                .data = (void *)&ip,
        };
        bool found;

        /* cheap check first: the bond device itself */
        if (bond_confirm_addr(bond->dev, 0, ip) == ip)
                return true;

        /* otherwise look at every upper device under RCU */
        rcu_read_lock();
        found = netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk,
                                              &priv) != 0;
        rcu_read_unlock();

        return found;
}

/* Sentinel vlan_proto value: bond_verify_device_path() stores it in the last
 * element of the tag array it returns, and bond_handle_vlan() stops walking
 * the array when it sees it.
 */
#define BOND_VLAN_PROTO_NONE cpu_to_be16(0xffff)

/* Apply the VLAN tags collected for a monitoring probe to @skb.
 *
 * @slave: slave the probe is sent on (used here only for debug output)
 * @tags:  array terminated by an entry whose vlan_proto is
 *         BOND_VLAN_PROTO_NONE, or NULL when no tagging is wanted; tags[0]
 *         is treated as the outer tag, the entries after it as inner tags
 * @skb:   packet to tag
 *
 * Returns true when @skb may be transmitted, false when inserting an inner
 * tag failed.
 * NOTE(review): on failure vlan_insert_tag_set_proto() has returned NULL and
 * the callers in this file do not touch the skb again - confirm that the
 * helper releases it.
 */
static bool bond_handle_vlan(struct slave *slave, struct bond_vlan_tag *tags,
                             struct sk_buff *skb)
{
        struct net_device *bond_dev = slave->bond->dev;
        struct net_device *slave_dev = slave->dev;
        struct bond_vlan_tag *outer_tag = tags;

        /* nothing to do without tags or with an empty tag list */
        if (!tags || tags->vlan_proto == BOND_VLAN_PROTO_NONE)
                return true;

        tags++;

        /* Walk the entries after the outer tag, in array order, up to the
         * terminator and insert every one that carries a non-zero VLAN id
         * into the packet.
         */
        while (tags->vlan_proto != BOND_VLAN_PROTO_NONE) {
                if (!tags->vlan_id) {
                        tags++;
                        continue;
                }

                /* log the protocol of the tag being inserted, not the
                 * outer tag's protocol
                 */
                slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n",
                          ntohs(tags->vlan_proto), tags->vlan_id);
                skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto,
                                                tags->vlan_id);
                if (!skb) {
                        net_err_ratelimited("failed to insert inner VLAN tag\n");
                        return false;
                }

                tags++;
        }
        /* Set the outer tag */
        if (outer_tag->vlan_id) {
                slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n",
                          ntohs(outer_tag->vlan_proto), outer_tag->vlan_id);
                __vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto,
                                       outer_tag->vlan_id);
        }

        return true;
}

/* Build an ARP packet for @dest_ip / @src_ip and transmit it on @slave.
 *
 * The frame is VLAN tagged according to @tags: switches in VLAN mode
 * (especially if ports are configured as "native" to a VLAN) might not pass
 * non-tagged frames, so the (large) trouble of tagging is worth it.
 */
static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip,
                          __be32 src_ip, struct bond_vlan_tag *tags)
{
        struct net_device *out_dev = slave->dev;
        struct sk_buff *skb;

        slave_dbg(slave->bond->dev, out_dev, "arp %d on slave: dst %pI4 src %pI4\n",
                  arp_op, &dest_ip, &src_ip);

        skb = arp_create(arp_op, ETH_P_ARP, dest_ip, out_dev, src_ip,
                         NULL, out_dev->dev_addr, NULL);
        if (!skb) {
                net_err_ratelimited("ARP packet allocation failed\n");
                return;
        }

        /* tagging failed: there is nothing left to transmit */
        if (!bond_handle_vlan(slave, tags, skb))
                return;

        slave_update_last_tx(slave);
        arp_xmit(skb);
}

/* Validate the device path between the @start_dev and the @end_dev.
 * The path is valid if the @end_dev is reachable through device
 * stacking.
 * When the path is validated, collect any vlan information in the
 * path.
 *
 * @level is the recursion depth; external callers in this file pass 0.
 *
 * Returns a kcalloc()ed tag array terminated by BOND_VLAN_PROTO_NONE when a
 * path exists, NULL when @end_dev is not reachable, or ERR_PTR(-ENOMEM) when
 * the array could not be allocated.
 */
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
                                              struct net_device *end_dev,
                                              int level)
{
        struct bond_vlan_tag *path;
        struct net_device *above;
        struct list_head  *iter;

        if (start_dev == end_dev) {
                /* end of the path: one slot per level plus the terminator */
                path = kcalloc(level + 1, sizeof(*path), GFP_ATOMIC);
                if (!path)
                        return ERR_PTR(-ENOMEM);
                path[level].vlan_proto = BOND_VLAN_PROTO_NONE;
                return path;
        }

        netdev_for_each_upper_dev_rcu(start_dev, above, iter) {
                path = bond_verify_device_path(above, end_dev, level + 1);

                /* allocation failure deeper down: give up immediately */
                if (IS_ERR(path))
                        return path;

                /* no path through this upper device, try the next one */
                if (!path)
                        continue;

                if (is_vlan_dev(above)) {
                        path[level].vlan_proto = vlan_dev_vlan_proto(above);
                        path[level].vlan_id = vlan_dev_vlan_id(above);
                }

                return path;
        }

        return NULL;
}

/* Send one ARP request on @slave for each configured ARP target.
 *
 * bond->params.arp_targets is walked until BOND_MAX_ARP_TARGETS entries were
 * visited or a zero entry is found. For every target a route lookup selects
 * the output device:
 *  - no route: a request with source address 0 and no VLAN tags is sent;
 *  - route via the bond itself or via a device stacked on top of it: a
 *    request is sent with the source address from bond_confirm_addr() and
 *    the VLAN tags collected along the path;
 *  - route via an unrelated device: the target is skipped.
 */
static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
{
        struct rtable *rt;
        struct bond_vlan_tag *tags;
        __be32 *targets = bond->params.arp_targets, addr;
        int i;

        for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
                slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n",
                          __func__, &targets[i]);
                /* reset per target so the kfree() below never sees a stale
                 * or error pointer
                 */
                tags = NULL;

                /* Find out through which dev should the packet go */
                rt = ip_route_output(dev_net(bond->dev), targets[i], 0, 0, 0,
                                     RT_SCOPE_LINK);
                if (IS_ERR(rt)) {
                        /* there's no route to target - try to send arp
                         * probe to generate any traffic (arp_validate=0)
                         */
                        if (bond->params.arp_validate)
                                pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
                                             bond->dev->name,
                                             &targets[i]);
                        bond_arp_send(slave, ARPOP_REQUEST, targets[i],
                                      0, tags);
                        continue;
                }

                /* bond device itself */
                if (rt->dst.dev == bond->dev)
                        goto found;

                /* the upper-device walk needs RCU protection */
                rcu_read_lock();
                tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
                rcu_read_unlock();

                if (!IS_ERR_OR_NULL(tags))
                        goto found;

                /* Not our device - skip */
                slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
                           &targets[i], rt->dst.dev ? rt->dst.dev->name : "NULL");

                ip_rt_put(rt);
                continue;

found:
                /* the source address is read before the route is released */
                addr = bond_confirm_addr(rt->dst.dev, targets[i], 0);
                ip_rt_put(rt);
                bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags);
                /* tags is NULL here when the route goes via the bond itself */
                kfree(tags);
        }
}

/* Credit a received ARP to @slave when it looks like traffic from one of the
 * configured targets: @sip must be non-zero and listed in
 * bond->params.arp_targets, and @tip must be an address of the bond (or of a
 * device above it). On success both slave->last_rx and the per-target
 * receive timestamp are refreshed.
 */
static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip)
{
        int target_idx;

        if (!(sip && bond_has_this_ip(bond, tip))) {
                slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n",
                          __func__, &sip, &tip);
                return;
        }

        target_idx = bond_get_targets_ip(bond->params.arp_targets, sip);
        if (target_idx != -1) {
                slave->last_rx = jiffies;
                slave->target_last_arp_rx[target_idx] = jiffies;
                return;
        }

        slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n",
                  __func__, &sip);
}

/* Examine an ARP frame received on @slave and, when it can be trusted (see
 * the rules in the body), pass its addresses to bond_validate_arp() so the
 * slave's receive timestamps get refreshed.
 *
 * Always returns RX_HANDLER_ANOTHER, whether or not the frame was accepted.
 * NOTE(review): uses rcu_dereference(), so the caller presumably holds
 * rcu_read_lock() - confirm.
 */
static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond,
                        struct slave *slave)
{
        struct arphdr *arp = (struct arphdr *)skb->data;
        struct slave *curr_active_slave, *curr_arp_slave;
        unsigned char *arp_ptr;
        __be32 sip, tip;
        unsigned int alen;

        alen = arp_hdr_len(bond->dev);

        /* the ARP header is not fully in the linear area: work on a
         * temporary copy instead (freed at out_unlock)
         */
        if (alen > skb_headlen(skb)) {
                arp = kmalloc(alen, GFP_ATOMIC);
                if (!arp)
                        goto out_unlock;
                if (skb_copy_bits(skb, 0, arp, alen) < 0)
                        goto out_unlock;
        }

        /* only IPv4-over-Ethernet ARP with the bond's address length,
         * not addressed to another host and not looped back, is considered
         */
        if (arp->ar_hln != bond->dev->addr_len ||
            skb->pkt_type == PACKET_OTHERHOST ||
            skb->pkt_type == PACKET_LOOPBACK ||
            arp->ar_hrd != htons(ARPHRD_ETHER) ||
            arp->ar_pro != htons(ETH_P_IP) ||
            arp->ar_pln != 4)
                goto out_unlock;

        /* payload layout after the header: sender hw address, sender IP,
         * target hw address, target IP
         */
        arp_ptr = (unsigned char *)(arp + 1);
        arp_ptr += bond->dev->addr_len;
        memcpy(&sip, arp_ptr, 4);
        arp_ptr += 4 + bond->dev->addr_len;
        memcpy(&tip, arp_ptr, 4);

        slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n",
                  __func__, slave->dev->name, bond_slave_state(slave),
                  bond->params.arp_validate, slave_do_arp_validate(bond, slave),
                  &sip, &tip);

        curr_active_slave = rcu_dereference(bond->curr_active_slave);
        curr_arp_slave = rcu_dereference(bond->current_arp_slave);

        /* We 'trust' the received ARP enough to validate it if:
         *
         * (a) the slave receiving the ARP is active (which includes the
         * current ARP slave, if any), or
         *
         * (b) the receiving slave isn't active, but there is a currently
         * active slave and it received valid arp reply(s) after it became
         * the currently active slave, or
         *
         * (c) there is an ARP slave that sent an ARP during the prior ARP
         * interval, and we receive an ARP reply on any slave.  We accept
         * these because switch FDB update delays may deliver the ARP
         * reply to a slave other than the sender of the ARP request.
         *
         * Note: for (b), backup slaves are receiving the broadcast ARP
         * request, not a reply.  This request passes from the sending
         * slave through the L2 switch(es) to the receiving slave.  Since
         * this is checking the request, sip/tip are swapped for
         * validation.
         *
         * This is done to avoid endless looping when we can't reach the
         * arp_ip_target and fool ourselves with our own arp requests.
         */
        if (bond_is_active_slave(slave))
                bond_validate_arp(bond, slave, sip, tip);
        else if (curr_active_slave &&
                 time_after(slave_last_rx(bond, curr_active_slave),
                            curr_active_slave->last_link_up))
                bond_validate_arp(bond, slave, tip, sip);
        else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) &&
                 bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
                bond_validate_arp(bond, slave, sip, tip);

        /* despite its name, nothing is unlocked at this label; it only
         * releases the temporary header copy, if one was made
         */
out_unlock:
        if (arp != (struct arphdr *)skb->data)
                kfree(arp);
        return RX_HANDLER_ANOTHER;
}

#if IS_ENABLED(CONFIG_IPV6)
static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr,
                         const struct in6_addr *saddr, struct bond_vlan_tag *tags)
{
        struct net_device *bond_dev = slave->bond->dev;
        struct net_device *slave_dev = slave->dev;
        struct in6_addr mcaddr;
        struct sk_buff *skb;

        slave_dbg(bond_dev, slave_dev, "NS on slave: dst %pI6c src %pI6c\n",
                  daddr, saddr);

        skb = ndisc_ns_create(slave_dev, daddr, saddr, 0);
        if (!skb) {
                net_err_ratelimited("NS packet allocation failed\n");
                return;
        }

        addrconf_addr_solict_mult(daddr, &mcaddr);
        if (bond_handle_vlan(slave, tags, skb)) {
                slave_update_last_tx(slave);
                ndisc_send_skb(skb, &mcaddr, saddr);
        }
}

/* Send one neighbour solicitation on @slave for each configured NS target.
 *
 * bond->params.ns_targets is walked until BOND_MAX_NS_TARGETS entries were
 * visited or an unspecified (all-zero) address is found. For every target an
 * IPv6 route lookup bound to the bond's ifindex selects the output device:
 *  - lookup error: an NS with the unspecified source address and no VLAN
 *    tags is sent;
 *  - route via the bond itself or via a device stacked on top of it: an NS
 *    is sent with the VLAN tags collected along the path, using the source
 *    address from ipv6_dev_get_saddr() or the unspecified address when that
 *    lookup fails;
 *  - route via an unrelated device: the target is skipped.
 */
static void bond_ns_send_all(struct bonding *bond, struct slave *slave)
{
        struct in6_addr *targets = bond->params.ns_targets;
        struct bond_vlan_tag *tags;
        struct dst_entry *dst;
        struct in6_addr saddr;
        struct flowi6 fl6;
        int i;

        for (i = 0; i < BOND_MAX_NS_TARGETS && !ipv6_addr_any(&targets[i]); i++) {
                slave_dbg(bond->dev, slave->dev, "%s: target %pI6c\n",
                          __func__, &targets[i]);
                /* reset per target so the kfree() below never sees a stale
                 * or error pointer
                 */
                tags = NULL;

                /* Find out through which dev should the packet go */
                memset(&fl6, 0, sizeof(struct flowi6));
                fl6.daddr = targets[i];
                fl6.flowi6_oif = bond->dev->ifindex;

                dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6);
                if (dst->error) {
                        dst_release(dst);
                        /* there's no route to target - try to send an NS
                         * probe to generate any traffic (arp_validate=0)
                         */
                        if (bond->params.arp_validate)
                                pr_warn_once("%s: no route to ns_ip6_target %pI6c and arp_validate is set\n",
                                             bond->dev->name,
                                             &targets[i]);
                        bond_ns_send(slave, &targets[i], &in6addr_any, tags);
                        continue;
                }

                /* bond device itself */
                if (dst->dev == bond->dev)
                        goto found;

                /* the upper-device walk needs RCU protection */
                rcu_read_lock();
                tags = bond_verify_device_path(bond->dev, dst->dev, 0);
                rcu_read_unlock();

                if (!IS_ERR_OR_NULL(tags))
                        goto found;

                /* Not our device - skip */
                slave_dbg(bond->dev, slave->dev, "no path to ns_ip6_target %pI6c via dst->dev %s\n",
                          &targets[i], dst->dev ? dst->dev->name : "NULL");

                dst_release(dst);
                continue;

found:
                /* a zero return means a source address was stored in saddr */
                if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr))
                        bond_ns_send(slave, &targets[i], &saddr, tags);
                else
                        bond_ns_send(slave, &targets[i], &in6addr_any, tags);

                dst_release(dst);
                /* tags is NULL here when the route goes via the bond itself */
                kfree(tags);
        }
}

/* Check whether the IPv6 address that priv->data points at is configured on
 * @dev; returns the result of ipv6_chk_addr(). Also usable as a callback for
 * netdev_walk_all_upper_dev_rcu().
 */
static int bond_confirm_addr6(struct net_device *dev,
                              struct netdev_nested_priv *priv)
{
        const struct in6_addr *wanted = priv->data;
        struct net *net = dev_net(dev);

        return ipv6_chk_addr(net, wanted, dev, 0);
}

/* Return true when @addr is configured on the bond device itself or on any
 * device stacked above it (IPv6 counterpart of bond_has_this_ip()).
 */
static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr)
{
        struct netdev_nested_priv priv = {
                .data = addr,
        };
        /* bool, matching both the return type and bond_has_this_ip() */
        bool ret = false;

        /* cheap check first: the bond device itself */
        if (bond_confirm_addr6(bond->dev, &priv))
                return true;

        /* otherwise look at every upper device under RCU */
        rcu_read_lock();
        if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_confirm_addr6, &priv))
                ret = true;
        rcu_read_unlock();

        return ret;
}

/* Credit a received neighbour discovery message to @slave when @saddr is one
 * of the configured NS targets. On success both slave->last_rx and the
 * per-target receive timestamp are refreshed.
 */
static void bond_validate_na(struct bonding *bond, struct slave *slave,
                             struct in6_addr *saddr, struct in6_addr *daddr)
{
        int target_idx;

        /* Messages are ignored when:
         * 1. the source address is the unspecified address, or
         * 2. the destination is neither the all-nodes multicast address
         *    nor an address that exists on the bond interface.
         */
        if (ipv6_addr_any(saddr) ||
            !(ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) ||
              bond_has_this_ip6(bond, daddr))) {
                slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n",
                          __func__, saddr, daddr);
                return;
        }

        target_idx = bond_get_targets_ip6(bond->params.ns_targets, saddr);
        if (target_idx != -1) {
                slave->last_rx = jiffies;
                slave->target_last_arp_rx[target_idx] = jiffies;
                return;
        }

        slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c not found in targets\n",
                  __func__, saddr);
}

/* IPv6 counterpart of bond_arp_rcv(): examine a frame received on @slave
 * and, when it is an ICMPv6 neighbour solicitation or advertisement that can
 * be trusted, pass its addresses to bond_validate_na().
 *
 * Always returns RX_HANDLER_ANOTHER, whether or not the frame was accepted.
 * NOTE(review): uses rcu_dereference(), so the caller presumably holds
 * rcu_read_lock() - confirm.
 */
static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond,
                       struct slave *slave)
{
        struct slave *curr_active_slave, *curr_arp_slave;
        struct in6_addr *saddr, *daddr;
        /* IPv6 header immediately followed by the ICMPv6 header; _combined
         * is the buffer handed to skb_header_pointer()
         */
        struct {
                struct ipv6hdr ip6;
                struct icmp6hdr icmp6;
        } *combined, _combined;

        /* frames for other hosts and looped-back frames are not considered */
        if (skb->pkt_type == PACKET_OTHERHOST ||
            skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        /* only ICMPv6 directly after the IPv6 header, and only NS/NA */
        combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined);
        if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP ||
            (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION &&
             combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT))
                goto out;

        saddr = &combined->ip6.saddr;
        daddr = &combined->ip6.daddr;

        slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n",
                  __func__, slave->dev->name, bond_slave_state(slave),
                  bond->params.arp_validate, slave_do_arp_validate(bond, slave),
                  saddr, daddr);

        curr_active_slave = rcu_dereference(bond->curr_active_slave);
        curr_arp_slave = rcu_dereference(bond->current_arp_slave);

        /* We 'trust' the received NS/NA enough to validate it under the
         * same rules as for ARP, see bond_arp_rcv(). As there, source and
         * destination are swapped in the second case.
         */
        if (bond_is_active_slave(slave))
                bond_validate_na(bond, slave, saddr, daddr);
        else if (curr_active_slave &&
                 time_after(slave_last_rx(bond, curr_active_slave),
                            curr_active_slave->last_link_up))
                bond_validate_na(bond, slave, daddr, saddr);
        else if (curr_arp_slave &&
                 bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1))
                bond_validate_na(bond, slave, saddr, daddr);

out:
        return RX_HANDLER_ANOTHER;
}
#endif
#endif

/* Receive-side entry point of the ARP/NS monitor for a frame seen on @slave.
 *
 * When validation is not enabled for this slave, only slave->last_rx may be
 * refreshed; otherwise ARP frames go to bond_arp_rcv() and (with
 * CONFIG_IPV6) IPv6 frames go to bond_na_rcv().
 *
 * Returns RX_HANDLER_ANOTHER on every path visible here.
 */
int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond,
                      struct slave *slave)
{
#if IS_ENABLED(CONFIG_IPV6)
        bool is_ipv6 = skb->protocol == __cpu_to_be16(ETH_P_IPV6);
#endif
        bool is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP);

        slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n",
                  __func__, skb->dev->name);

        /* Use arp validate logic for both ARP and NS */
        if (!slave_do_arp_validate(bond, slave)) {
                /* No validation for this slave: any frame refreshes last_rx,
                 * unless slave_do_arp_validate_only() is set, in which case
                 * only ARP (and IPv6, when built in) frames count.
                 */
                if ((slave_do_arp_validate_only(bond) && is_arp) ||
#if IS_ENABLED(CONFIG_IPV6)
                    (slave_do_arp_validate_only(bond) && is_ipv6) ||
#endif
                    !slave_do_arp_validate_only(bond))
                        slave->last_rx = jiffies;
                return RX_HANDLER_ANOTHER;
        } else if (is_arp) {
                return bond_arp_rcv(skb, bond, slave);
#if IS_ENABLED(CONFIG_IPV6)
        } else if (is_ipv6) {
                return bond_na_rcv(skb, bond, slave);
#endif
        } else {
                /* neither ARP nor IPv6: nothing to validate */
                return RX_HANDLER_ANOTHER;
        }
}

/* Send the monitoring probes for @slave: ARP requests to all ARP targets
 * and, when IPv6 support is built in, neighbour solicitations to all NS
 * targets.
 */
static void bond_send_validate(struct bonding *bond, struct slave *slave)
{
        bond_arp_send_all(bond, slave);
#if IS_ENABLED(CONFIG_IPV6)
        bond_ns_send_all(bond, slave);
#endif
}

/* function to verify if we're in the arp_interval timeslice, returns true if
 * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval +
 * arp_interval/2) . the arp_interval/2 is needed for really fast networks.
 *
 * @bond:     bond whose params.arp_interval (in ms) defines the timeslice
 * @last_act: timestamp of the last activity, in jiffies
 * @mod:      number of whole intervals allowed after @last_act
 */
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
                                  int mod)
{
        /* Keep the tick count in the type msecs_to_jiffies() returns: an int
         * would truncate it, and "mod * delta" would then be signed int
         * arithmetic that can overflow for large intervals. In unsigned long
         * the sums simply wrap, which is what time_in_range() expects.
         */
        unsigned long delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

        return time_in_range(jiffies,
                             last_act - delta_in_ticks,
                             last_act + mod * delta_in_ticks + delta_in_ticks / 2);
}

/* This function is called regularly to monitor each slave's link
 * ensuring that traffic is being sent and received when arp monitoring
 * is used in load-balancing mode. If the adapter has been dormant, then an
 * arp is transmitted to generate traffic. See bond_activebackup_arp_mon for
 * arp monitoring in active backup mode.
 *
 * The work is done in two phases: first, under rcu_read_lock, a new link
 * state is proposed for every slave and probes are sent; second, if
 * anything changed, the proposals are committed under RTNL.  The function
 * always ends by re-queueing itself while arp_interval is non-zero.
 */
static void bond_loadbalance_arp_mon(struct bonding *bond)
{
        struct slave *slave, *oldcurrent;
        struct list_head *iter;
        int do_failover = 0, slave_state_changed = 0;

        if (!bond_has_slaves(bond))
                goto re_arm;

        rcu_read_lock();

        oldcurrent = rcu_dereference(bond->curr_active_slave);
        /* see if any of the previous devices are up now (i.e. they have
         * xmt and rcv traffic). the curr_active_slave does not come into
         * the picture unless it is null. also, slave->last_link_up is not
         * needed here because we send an arp on each slave and give a slave
         * as long as it needs to get the tx/rx within the delta.
         * TODO: what about up/down delay in arp mode? it wasn't here before
         *       so it can wait
         */
        bond_for_each_slave_rcu(bond, slave, iter) {
                unsigned long last_tx = slave_last_tx(slave);

                /* start every pass from "no change"; only the branches
                 * below may propose a different state
                 */
                bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

                if (slave->link != BOND_LINK_UP) {
                        /* a slave that is not up comes up once both its
                         * last tx and last rx fall inside one interval
                         */
                        if (bond_time_in_interval(bond, last_tx, 1) &&
                            bond_time_in_interval(bond, slave->last_rx, 1)) {

                                bond_propose_link_state(slave, BOND_LINK_UP);
                                slave_state_changed = 1;

                                /* primary_slave has no meaning in round-robin
                                 * mode. the window of a slave being up and
                                 * curr_active_slave being null after enslaving
                                 * is closed.
                                 */
                                if (!oldcurrent) {
                                        slave_info(bond->dev, slave->dev, "link status definitely up\n");
                                        do_failover = 1;
                                } else {
                                        slave_info(bond->dev, slave->dev, "interface is now up\n");
                                }
                        }
                } else {
                        /* slave->link == BOND_LINK_UP */

                        /* not all switches will respond to an arp request
                         * when the source ip is 0, so don't take the link down
                         * if we don't know our ip yet
                         */
                        if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) ||
                            !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) {

                                bond_propose_link_state(slave, BOND_LINK_DOWN);
                                slave_state_changed = 1;

                                /* saturate instead of wrapping the counter */
                                if (slave->link_failure_count < UINT_MAX)
                                        slave->link_failure_count++;

                                slave_info(bond->dev, slave->dev, "interface is now down\n");

                                if (slave == oldcurrent)
                                        do_failover = 1;
                        }
                }

                /* note: if switch is in round-robin mode, all links
                 * must tx arp to ensure all links rx an arp - otherwise
                 * links may oscillate or not come up at all; if switch is
                 * in something like xor mode, there is nothing we can
                 * do - all replies will be rx'ed on same link causing slaves
                 * to be unstable during low/no traffic periods
                 */
                if (bond_slave_is_up(slave))
                        bond_send_validate(bond, slave);
        }

        rcu_read_unlock();

        if (do_failover || slave_state_changed) {
                /* RTNL is only tried, never waited for: on contention the
                 * proposed states stay uncommitted and the work is re-armed
                 */
                if (!rtnl_trylock())
                        goto re_arm;

                /* commit the states proposed during the RCU pass */
                bond_for_each_slave(bond, slave, iter) {
                        if (slave->link_new_state != BOND_LINK_NOCHANGE)
                                slave->link = slave->link_new_state;
                }

                if (slave_state_changed) {
                        bond_slave_state_change(bond);
                        if (BOND_MODE(bond) == BOND_MODE_XOR)
                                bond_update_slave_arr(bond, NULL);
                }
                if (do_failover) {
                        block_netpoll_tx();
                        bond_select_active_slave(bond);
                        unblock_netpoll_tx();
                }
                rtnl_unlock();
        }

re_arm:
        /* only reschedule while ARP monitoring is still configured */
        if (bond->params.arp_interval)
                queue_delayed_work(bond->wq, &bond->arp_work,
                                   msecs_to_jiffies(bond->params.arp_interval));
}

/* Inspection pass of the active-backup mode ARP monitor.
 *
 * Every slave first has its proposed link state reset to
 * BOND_LINK_NOCHANGE; a different state is then proposed wherever the
 * slave's rx/tx timestamps call for one.  The return value counts the
 * proposals made: 0 means nothing has to be committed, >0 means link
 * state changes must be committed.
 *
 * Called with rcu_read_lock held.
 */
static int bond_ab_arp_inspect(struct bonding *bond)
{
        unsigned long rx_stamp, tx_stamp;
        struct list_head *iter;
        struct slave *slave;
        int proposals = 0;

        bond_for_each_slave_rcu(bond, slave, iter) {
                bond_propose_link_state(slave, BOND_LINK_NOCHANGE);
                rx_stamp = slave_last_rx(bond, slave);

                if (slave->link != BOND_LINK_UP) {
                        /* Not up yet: a receive within one interval brings
                         * the slave up, otherwise a slave in BACK state is
                         * proposed as failed.
                         */
                        if (bond_time_in_interval(bond, rx_stamp, 1)) {
                                bond_propose_link_state(slave, BOND_LINK_UP);
                                proposals++;
                                continue;
                        }
                        if (slave->link == BOND_LINK_BACK) {
                                bond_propose_link_state(slave, BOND_LINK_FAIL);
                                proposals++;
                        }
                        continue;
                }

                /* Slaves get 2*delta of grace after being enslaved or made
                 * active: the last receive times need a full ARP monitor
                 * cycle to be updated, so judging earlier causes bouncing.
                 */
                if (bond_time_in_interval(bond, slave->last_link_up, 2))
                        continue;

                /* A backup slave goes down when there is no
                 * current_arp_slave and nothing was received for more than
                 * (missed_max+1)*delta.
                 *
                 * A non-null current_arp_slave means the curr_active_slave
                 * went down and a replacement is being searched for; in
                 * that situation only the curr_active_slave is taken down
                 * so that every slave gets a chance to tx/rx traffic
                 * before being taken out.
                 */
                if (!bond_is_active_slave(slave)) {
                        if (!rcu_access_pointer(bond->current_arp_slave) &&
                            !bond_time_in_interval(bond, rx_stamp,
                                                   bond->params.missed_max + 1)) {
                                bond_propose_link_state(slave, BOND_LINK_DOWN);
                                proposals++;
                        }
                }

                /* The active slave goes down when more than
                 * missed_max*delta passed since it last transmitted or
                 * since it last received.
                 */
                tx_stamp = slave_last_tx(slave);
                if (bond_is_active_slave(slave)) {
                        if (!(bond_time_in_interval(bond, tx_stamp,
                                                    bond->params.missed_max) &&
                              bond_time_in_interval(bond, rx_stamp,
                                                    bond->params.missed_max))) {
                                bond_propose_link_state(slave, BOND_LINK_DOWN);
                                proposals++;
                        }
                }
        }

        return proposals;
}

/* Called to commit link state changes noted by inspection step of
 * active-backup mode ARP monitor.
 *
 * Walks all slaves and acts on each slave's link_new_state, clearing
 * bond->current_arp_slave where appropriate.  If any change requires it,
 * the active slave is re-selected afterwards, and bond_set_carrier() is
 * always called last.
 *
 * Called with RTNL held.
 */
static void bond_ab_arp_commit(struct bonding *bond)
{
        bool do_failover = false;
        struct list_head *iter;
        unsigned long last_tx;
        struct slave *slave;

        bond_for_each_slave(bond, slave, iter) {
                switch (slave->link_new_state) {
                case BOND_LINK_NOCHANGE:
                        continue;

                case BOND_LINK_UP:
                        last_tx = slave_last_tx(slave);
                        /* NOTE(review): when the first test is false,
                         * curr_active_slave equals slave and is therefore
                         * non-NULL, so the second operand looks unreachable
                         * - confirm the intended condition.
                         */
                        if (rtnl_dereference(bond->curr_active_slave) != slave ||
                            (!rtnl_dereference(bond->curr_active_slave) &&
                             bond_time_in_interval(bond, last_tx, 1))) {
                                struct slave *current_arp_slave;

                                current_arp_slave = rtnl_dereference(bond->current_arp_slave);
                                bond_set_slave_link_state(slave, BOND_LINK_UP,
                                                          BOND_SLAVE_NOTIFY_NOW);
                                /* a slave came up, so the slave that was
                                 * being probed is deactivated and forgotten
                                 */
                                if (current_arp_slave) {
                                        bond_set_slave_inactive_flags(
                                                current_arp_slave,
                                                BOND_SLAVE_NOTIFY_NOW);
                                        RCU_INIT_POINTER(bond->current_arp_slave, NULL);
                                }

                                slave_info(bond->dev, slave->dev, "link status definitely up\n");

                                /* fail over when there is no active slave,
                                 * when this slave is the primary, or when it
                                 * has a higher prio than the active one; the
                                 * ->prio dereference is only reached when
                                 * curr_active_slave is non-NULL because of
                                 * the short-circuiting ||
                                 */
                                if (!rtnl_dereference(bond->curr_active_slave) ||
                                    slave == rtnl_dereference(bond->primary_slave) ||
                                    slave->prio > rtnl_dereference(bond->curr_active_slave)->prio)
                                        do_failover = true;

                        }

                        continue;

                case BOND_LINK_DOWN:
                        /* saturate instead of wrapping the counter */
                        if (slave->link_failure_count < UINT_MAX)
                                slave->link_failure_count++;

                        bond_set_slave_link_state(slave, BOND_LINK_DOWN,
                                                  BOND_SLAVE_NOTIFY_NOW);
                        bond_set_slave_inactive_flags(slave,
                                                      BOND_SLAVE_NOTIFY_NOW);

                        slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n");

                        /* losing the active slave forces a new selection */
                        if (slave == rtnl_dereference(bond->curr_active_slave)) {
                                RCU_INIT_POINTER(bond->current_arp_slave, NULL);
                                do_failover = true;
                        }

                        continue;

                case BOND_LINK_FAIL:
                        bond_set_slave_link_state(slave, BOND_LINK_FAIL,
                                                  BOND_SLAVE_NOTIFY_NOW);
                        bond_set_slave_inactive_flags(slave,
                                                      BOND_SLAVE_NOTIFY_NOW);

                        /* A slave has just been enslaved and has become
                         * the current active slave.
                         */
                        if (rtnl_dereference(bond->curr_active_slave))
                                RCU_INIT_POINTER(bond->current_arp_slave, NULL);
                        continue;

                default:
                        slave_err(bond->dev, slave->dev,
                                  "impossible: link_new_state %d on slave\n",
                                  slave->link_new_state);
                        continue;
                }
        }

        if (do_failover) {
                block_netpoll_tx();
                bond_select_active_slave(bond);
                unblock_netpoll_tx();
        }

        bond_set_carrier(bond);
}

/* Send ARP probes for active-backup mode ARP monitor.
 *
 * If there is a curr_active_slave it is simply probed.  Otherwise the
 * slave list is scanned for the next slave that is up after the
 * current_arp_slave (wrapping around to the first such slave before it),
 * and that slave becomes the new current_arp_slave and is probed.
 *
 * Returns BOND_SLAVE_NOTIFY_NOW when, after the scan, at least one slave has
 * should_notify or should_notify_link set, BOND_SLAVE_NOTIFY_LATER
 * otherwise (including both early-return paths).
 *
 * Called with rcu_read_lock held.
 */
static bool bond_ab_arp_probe(struct bonding *bond)
{
        struct slave *slave, *before = NULL, *new_slave = NULL,
                     *curr_arp_slave = rcu_dereference(bond->current_arp_slave),
                     *curr_active_slave = rcu_dereference(bond->curr_active_slave);
        struct list_head *iter;
        bool found = false;
        bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER;

        /* both pointers being set at once is reported but not corrected here */
        if (curr_arp_slave && curr_active_slave)
                netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n",
                            curr_arp_slave->dev->name,
                            curr_active_slave->dev->name);

        if (curr_active_slave) {
                bond_send_validate(bond, curr_active_slave);
                return should_notify_rtnl;
        }

        /* if we don't have a curr_active_slave, search for the next available
         * backup slave from the current_arp_slave and make it the candidate
         * for becoming the curr_active_slave
         */

        if (!curr_arp_slave) {
                curr_arp_slave = bond_first_slave_rcu(bond);
                if (!curr_arp_slave)
                        return should_notify_rtnl;
        }

        bond_for_each_slave_rcu(bond, slave, iter) {
                /* first up slave seen before curr_arp_slave: wrap-around
                 * fallback
                 */
                if (!found && !before && bond_slave_is_up(slave))
                        before = slave;

                /* first up slave seen after curr_arp_slave: preferred */
                if (found && !new_slave && bond_slave_is_up(slave))
                        new_slave = slave;
                /* if the link state is up at this point, we
                 * mark it down - this can happen if we have
                 * simultaneous link failures and
                 * reselect_active_interface doesn't make this
                 * one the current slave so it is still marked
                 * up when it is actually down
                 */
                if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) {
                        bond_set_slave_link_state(slave, BOND_LINK_DOWN,
                                                  BOND_SLAVE_NOTIFY_LATER);
                        if (slave->link_failure_count < UINT_MAX)
                                slave->link_failure_count++;

                        bond_set_slave_inactive_flags(slave,
                                                      BOND_SLAVE_NOTIFY_LATER);

                        slave_info(bond->dev, slave->dev, "backup interface is now down\n");
                }
                /* set after the checks above so curr_arp_slave itself is
                 * never picked as its own successor in this iteration
                 */
                if (slave == curr_arp_slave)
                        found = true;
        }

        if (!new_slave && before)
                new_slave = before;

        if (!new_slave)
                goto check_state;

        /* promote the candidate to BACK state, probe it and remember it as
         * the slave currently being tried
         */
        bond_set_slave_link_state(new_slave, BOND_LINK_BACK,
                                  BOND_SLAVE_NOTIFY_LATER);
        bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER);
        bond_send_validate(bond, new_slave);
        new_slave->last_link_up = jiffies;
        rcu_assign_pointer(bond->current_arp_slave, new_slave);

check_state:
        /* state changes above were made with NOTIFY_LATER; tell the caller
         * whether any notification is now pending
         */
        bond_for_each_slave_rcu(bond, slave, iter) {
                if (slave->should_notify || slave->should_notify_link) {
                        should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW;
                        break;
                }
        }
        return should_notify_rtnl;
}

/* One run of the ARP monitor for active-backup mode.
 *
 * Inspects the slaves under rcu_read_lock, commits any proposed link state
 * changes under RTNL, sends probes, re-arms the delayed work while
 * arp_interval is non-zero and finally, if needed, delivers the deferred
 * peer/slave notifications under RTNL.
 */
static void bond_activebackup_arp_mon(struct bonding *bond)
{
        bool should_notify_peers = false;
        bool should_notify_rtnl = false;
        int delta_in_ticks;

        delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

        if (!bond_has_slaves(bond))
                goto re_arm;

        rcu_read_lock();

        should_notify_peers = bond_should_notify_peers(bond);

        if (bond_ab_arp_inspect(bond)) {
                /* the RCU read side is left before RTNL is taken and
                 * re-entered only after RTNL has been released
                 */
                rcu_read_unlock();

                /* Race avoidance with bond_close flush of workqueue */
                if (!rtnl_trylock()) {
                        /* retry on the very next tick instead of waiting
                         * a whole interval
                         */
                        delta_in_ticks = 1;
                        should_notify_peers = false;
                        goto re_arm;
                }

                bond_ab_arp_commit(bond);

                rtnl_unlock();
                rcu_read_lock();
        }

        should_notify_rtnl = bond_ab_arp_probe(bond);
        rcu_read_unlock();

re_arm:
        if (bond->params.arp_interval)
                queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks);

        if (should_notify_peers || should_notify_rtnl) {
                /* notifications are dropped for this run if RTNL is busy */
                if (!rtnl_trylock())
                        return;

                if (should_notify_peers) {
                        bond->send_peer_notif--;
                        call_netdevice_notifiers(NETDEV_NOTIFY_PEERS,
                                                 bond->dev);
                }
                if (should_notify_rtnl) {
                        bond_slave_state_notify(bond);
                        bond_slave_link_notify(bond);
                }

                rtnl_unlock();
        }
}

/* Delayed-work handler for bond->arp_work: recovers the owning bond from
 * @work and runs the ARP monitor variant that matches the bond mode.
 */
static void bond_arp_monitor(struct work_struct *work)
{
        struct bonding *bond;

        bond = container_of(work, struct bonding, arp_work.work);

        if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
                bond_loadbalance_arp_mon(bond);
                return;
        }

        bond_activebackup_arp_mon(bond);
}

/*-------------------------- netdev event handling --------------------------*/

/* Change device name: the bond's proc entry is removed and created again
 * and bond_debug_reregister() is called, so that both can pick up the new
 * name.  Always returns NOTIFY_DONE.
 */
static int bond_event_changename(struct bonding *bond)
{
        bond_remove_proc_entry(bond);
        bond_create_proc_entry(bond);

        bond_debug_reregister(bond);

        return NOTIFY_DONE;
}

/* Handle a netdev notifier @event raised for the bond master @bond_dev.
 *
 * NETDEV_CHANGENAME is delegated to bond_event_changename() and its result
 * is returned.  NETDEV_REGISTER creates the proc entry, NETDEV_UNREGISTER
 * removes it (and flushes xfrm device state when CONFIG_XFRM_OFFLOAD is
 * set).  Every other event is ignored.  Apart from the rename case the
 * function returns NOTIFY_DONE.
 */
static int bond_master_netdev_event(unsigned long event,
                                    struct net_device *bond_dev)
{
        struct bonding *event_bond = netdev_priv(bond_dev);

        netdev_dbg(bond_dev, "%s called\n", __func__);

        if (event == NETDEV_CHANGENAME)
                return bond_event_changename(event_bond);

        if (event == NETDEV_REGISTER) {
                bond_create_proc_entry(event_bond);
        } else if (event == NETDEV_UNREGISTER) {
                bond_remove_proc_entry(event_bond);
#ifdef CONFIG_XFRM_OFFLOAD
                xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true);
#endif /* CONFIG_XFRM_OFFLOAD */
        }

        return NOTIFY_DONE;
}

/* Handle a netdev notifier @event raised for the enslaved device
 * @slave_dev.
 *
 * The slave is looked up with bond_slave_get_rtnl(), so the caller is
 * expected to hold RTNL.  Events that need no action are ignored.
 * Always returns NOTIFY_DONE.
 */
static int bond_slave_netdev_event(unsigned long event,
                                   struct net_device *slave_dev)
{
        struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary;
        struct bonding *bond;
        struct net_device *bond_dev;

        /* A netdev event can be generated while enslaving a device
         * before netdev_rx_handler_register is called in which case
         * slave will be NULL
         */
        if (!slave) {
                netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__);
                return NOTIFY_DONE;
        }

        bond_dev = slave->bond->dev;
        bond = slave->bond;
        primary = rtnl_dereference(bond->primary_slave);

        slave_dbg(bond_dev, slave_dev, "%s called\n", __func__);

        switch (event) {
        case NETDEV_UNREGISTER:
                if (bond_dev->type != ARPHRD_ETHER)
                        bond_release_and_destroy(bond_dev, slave_dev);
                else
                        __bond_release_one(bond_dev, slave_dev, false, true);
                break;
        case NETDEV_UP:
        case NETDEV_CHANGE:
                /* For 802.3ad mode only:
                 * Getting invalid Speed/Duplex values here will put slave
                 * in weird state. Mark it as link-fail if the link was
                 * previously up or link-down if it hasn't yet come up, and
                 * let link-monitoring (miimon) set it right when correct
                 * speeds/duplex are available.
                 */
                if (bond_update_speed_duplex(slave) &&
                    BOND_MODE(bond) == BOND_MODE_8023AD) {
                        if (slave->last_link_up)
                                slave->link = BOND_LINK_FAIL;
                        else
                                slave->link = BOND_LINK_DOWN;
                }

                if (BOND_MODE(bond) == BOND_MODE_8023AD)
                        bond_3ad_adapter_speed_duplex_changed(slave);
                fallthrough;
        case NETDEV_DOWN:
                /* Refresh slave-array if applicable!
                 * If the setup does not use miimon or arpmon (mode-specific!),
                 * then these events will not cause the slave-array to be
                 * refreshed. This will cause xmit to use a slave that is not
                 * usable. Avoid such situation by refreshing the array at
                 * these events. If these (miimon/arpmon) parameters are
                 * configured then array gets refreshed twice and that should
                 * be fine!
                 */
                if (bond_mode_can_use_xmit_hash(bond))
                        bond_update_slave_arr(bond, NULL);
                break;
        case NETDEV_CHANGEMTU:
                /* TODO: Should slaves be allowed to
                 * independently alter their MTU?  For
                 * an active-backup bond, slaves need
                 * not be the same type of device, so
                 * MTUs may vary.  For other modes,
                 * slaves arguably should have the
                 * same MTUs. To do this, we'd need to
                 * take over the slave's change_mtu
                 * function for the duration of their
                 * servitude.
                 */
                break;
        case NETDEV_CHANGENAME:
                /* we don't care if we don't have primary set */
                if (!bond_uses_primary(bond) ||
                    !bond->params.primary[0])
                        break;

                /* Keep the local 'primary' in step with
                 * bond->primary_slave so that the message below reports
                 * the primary as it is after this update, not the one
                 * that was in place before it.
                 */
                if (slave == primary) {
                        /* slave's name changed - he's no longer primary */
                        RCU_INIT_POINTER(bond->primary_slave, NULL);
                        primary = NULL;
                } else if (!strcmp(slave_dev->name, bond->params.primary)) {
                        /* we have a new primary slave */
                        rcu_assign_pointer(bond->primary_slave, slave);
                        primary = slave;
                } else { /* we didn't change primary - exit */
                        break;
                }

                netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n",
                            primary ? slave_dev->name : "none");

                block_netpoll_tx();
                bond_select_active_slave(bond);
                unblock_netpoll_tx();
                break;
        case NETDEV_FEAT_CHANGE:
                /* notifier_ctx marks that the feature recomputation was
                 * started from here; while it is set further
                 * NETDEV_FEAT_CHANGE events are not acted upon
                 */
                if (!bond->notifier_ctx) {
                        bond->notifier_ctx = true;
                        bond_compute_features(bond);
                        bond->notifier_ctx = false;
                }
                break;
        case NETDEV_RESEND_IGMP:
                /* Propagate to master device */
                call_netdevice_notifiers(event, slave->bond->dev);
                break;
        case NETDEV_XDP_FEAT_CHANGE:
                bond_xdp_set_features(bond_dev);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* bond_netdev_event: entry point for netdev notifier chain events.
 *
 * Events for devices that do not carry IFF_BONDING are ignored.  A device
 * flagged IFF_MASTER is handed to bond_master_netdev_event(); if that
 * returns anything other than NOTIFY_DONE the result is returned as is.
 * A device flagged IFF_SLAVE is then handed to bond_slave_netdev_event().
 *
 * The caller (an ioctl handler calling blocking_notifier_call_chain) holds
 * the locks needed to safely manipulate the slave devices (RTNL lock,
 * dev_probe_lock).
 */
static int bond_netdev_event(struct notifier_block *this,
                             unsigned long event, void *ptr)
{
        struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
        int ret = NOTIFY_DONE;

        netdev_dbg(event_dev, "%s received %s\n",
                   __func__, netdev_cmd_to_name(event));

        if (!(event_dev->priv_flags & IFF_BONDING))
                return NOTIFY_DONE;

        if (event_dev->flags & IFF_MASTER)
                ret = bond_master_netdev_event(event, event_dev);
        if (ret != NOTIFY_DONE)
                return ret;

        if (!(event_dev->flags & IFF_SLAVE))
                return NOTIFY_DONE;

        return bond_slave_netdev_event(event, event_dev);
}

/* Notifier block whose callback is bond_netdev_event(); where it gets
 * registered is not visible in this part of the file.
 */
static struct notifier_block bond_netdev_notifier = {
        .notifier_call = bond_netdev_event,
};

/*---------------------------- Hashing Policies -----------------------------*/

/* Return a pointer through which the first @n bytes of a packet can be
 * read, with or without a backing skb.
 *
 * When @n fits into the @hlen bytes already available, @data is returned
 * unchanged.  Otherwise, if an skb is given, pskb_may_pull() is used to
 * linearize it and skb->data is returned on success.  NULL is returned
 * when there is no skb or the pull fails.
 */
static inline const void *bond_pull_data(struct sk_buff *skb,
                                         const void *data, int hlen, int n)
{
        if (likely(n <= hlen))
                return data;

        if (!skb)
                return NULL;

        return likely(pskb_may_pull(skb, n)) ? skb->data : NULL;
}

/* L2 hash helper: XORs the last byte of the destination MAC, the last byte
 * of the source MAC and the ethertype (in CPU byte order) of the Ethernet
 * header found @mhoff bytes into the packet.  Returns 0 when the header
 * cannot be accessed.
 */
static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
{
        const void *base;
        struct ethhdr *eth;
        u32 hash;

        base = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
        if (!base)
                return 0;

        eth = (struct ethhdr *)(base + mhoff);
        hash = eth->h_dest[5] ^ eth->h_source[5] ^ be16_to_cpu(eth->h_proto);

        return hash;
}

/* Fill the address part (and optionally the ports) of @fk from the IPv4 or
 * IPv6 header located *@nhoff bytes into the packet.
 *
 * On success *@nhoff is advanced past the IP header and *@ip_proto is set
 * to the next protocol (for IPv4 only when the packet is not a fragment).
 * When @l34 is set and *@ip_proto is not negative, the ports are extracted
 * as well.  Returns false when @l2_proto is neither IPv4 nor IPv6 or when
 * the header cannot be accessed.
 */
static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data,
                         int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34)
{
        switch (l2_proto) {
        case htons(ETH_P_IP): {
                const struct iphdr *ip4;

                data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*ip4));
                if (!data)
                        return false;

                ip4 = (const struct iphdr *)(data + *nhoff);
                iph_to_flow_copy_v4addrs(fk, ip4);
                *nhoff += ip4->ihl << 2;
                /* fragments keep the caller's ip_proto value */
                if (!ip_is_fragment(ip4))
                        *ip_proto = ip4->protocol;
                break;
        }
        case htons(ETH_P_IPV6): {
                const struct ipv6hdr *ip6;

                data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*ip6));
                if (!data)
                        return false;

                ip6 = (const struct ipv6hdr *)(data + *nhoff);
                iph_to_flow_copy_v6addrs(fk, ip6);
                *nhoff += sizeof(*ip6);
                *ip_proto = ip6->nexthdr;
                break;
        }
        default:
                return false;
        }

        if (l34 && *ip_proto >= 0)
                fk->ports.ports = skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen);

        return true;
}

/* Hash on the VLAN tag and the source MAC address.
 *
 * The first three source MAC bytes are folded into one value, the
 * remaining bytes into another, and both are XORed with the VLAN tag
 * taken from @skb (0 when there is no skb or no tag).  Returns 0 when the
 * Ethernet header cannot be accessed.
 */
static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen)
{
        u32 srcmac_vendor = 0, srcmac_dev = 0;
        struct ethhdr *mac_hdr;
        u16 vlan = 0;
        int i;

        data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr));
        if (!data)
                return 0;
        mac_hdr = (struct ethhdr *)(data + mhoff);

        /* bytes 0..2 accumulate into the vendor part, the rest into the
         * device part
         */
        for (i = 0; i < ETH_ALEN; i++) {
                u32 *acc = i < 3 ? &srcmac_vendor : &srcmac_dev;

                *acc = (*acc << 8) | mac_hdr->h_source[i];
        }

        if (skb && skb_vlan_tag_present(skb))
                vlan = skb_vlan_tag_get(skb);

        return vlan ^ srcmac_vendor ^ srcmac_dev;
}

/* Extract the flow keys required by the bond's xmit policy into @fk.
 *
 * The encap2+3 and encap3+4 policies defer to __skb_flow_dissect() with
 * the bonding dissector.  All other policies parse the IP header via
 * bond_flow_ip(); for ICMP/ICMPv6 error messages the embedded header of
 * the offending packet is parsed as well.  Returns false when the headers
 * cannot be dissected.
 */
static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data,
                              __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk)
{
        bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34;
        int ip_proto = -1;

        if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23 ||
            bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34) {
                memset(fk, 0, sizeof(*fk));
                return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
                                          fk, data, l2_proto, nhoff, hlen, 0);
        }

        fk->ports.ports = 0;
        memset(&fk->icmp, 0, sizeof(fk->icmp));
        if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34))
                return false;

        if (ip_proto != IPPROTO_ICMP && ip_proto != IPPROTO_ICMPV6)
                return true;

        /* ICMP error packets contain at least 8 bytes of the header of the
         * packet which generated the error.  Use that information to
         * correlate ICMP error packets with the flow that triggered them.
         */
        skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen);
        if (ip_proto == IPPROTO_ICMP) {
                if (!icmp_is_err(fk->icmp.type))
                        return true;

                nhoff += sizeof(struct icmphdr);
        } else {
                /* only IPPROTO_ICMPV6 can reach this branch */
                if (!icmpv6_is_err(fk->icmp.type))
                        return true;

                nhoff += sizeof(struct icmp6hdr);
        }

        return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34);
}

/* Mix the flow's source and destination addresses into @hash and fold the
 * upper bits down.  For the layer3+4 and encap3+4 policies the lowest bit
 * of the result is dropped.
 */
static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy)
{
        u32 addr_mix = (__force u32)flow_get_u32_dst(flow) ^
                       (__force u32)flow_get_u32_src(flow);

        hash ^= addr_mix;
        hash ^= (hash >> 16);
        hash ^= (hash >> 8);

        switch (xmit_policy) {
        case BOND_XMIT_POLICY_LAYER34:
        case BOND_XMIT_POLICY_ENCAP34:
                /* discard lowest hash bit to deal with the common even
                 * ports pattern
                 */
                return hash >> 1;
        default:
                return hash;
        }
}

/* Compute the transmit hash for a packet according to the bond's xmit
 * policy.  When @skb is given it is used to linearize the data as needed;
 * the function may also be called without one if the data is known to be
 * linear (e.g. with xdp_buff).
 *
 * @mhoff and @nhoff are the offsets of the MAC and network headers within
 * @data, @hlen the number of bytes directly readable there.
 */
static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data,
                            __be16 l2_proto, int mhoff, int nhoff, int hlen)
{
        struct flow_keys flow;
        u32 hash;

        if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC)
                return bond_vlan_srcmac_hash(skb, data, mhoff, hlen);

        if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2)
                return bond_eth_hash(skb, data, mhoff, hlen);

        /* fall back to the plain L2 hash when dissection fails */
        if (!bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow))
                return bond_eth_hash(skb, data, mhoff, hlen);

        switch (bond->params.xmit_policy) {
        case BOND_XMIT_POLICY_LAYER23:
        case BOND_XMIT_POLICY_ENCAP23:
                hash = bond_eth_hash(skb, data, mhoff, hlen);
                break;
        default:
                /* seed from the ICMP key when it carries an id, from the
                 * ports otherwise
                 */
                if (flow.icmp.id)
                        memcpy(&hash, &flow.icmp, sizeof(hash));
                else
                        memcpy(&hash, &flow.ports.ports, sizeof(hash));
                break;
        }

        return bond_ip_hash(hash, &flow, bond->params.xmit_policy);
}

/**
 * bond_xmit_hash - generate a hash value based on the xmit policy
 * @bond: bonding device
 * @skb: buffer to use for headers
 *
 * This function will extract the necessary headers from the skb buffer and use
 * them to generate a hash based on the xmit_policy set in the bonding device
 */
u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
{
        if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 &&
            skb->l4_hash)
                return skb->hash;

        return __bond_xmit_hash(bond, skb, skb->data, skb->protocol,
                                0, skb_network_offset(skb),
                                skb_headlen(skb));
}

/**
 * bond_xmit_hash_xdp - generate a hash value based on the xmit policy
 * @bond: bonding device
 * @xdp: buffer to use for headers
 *
 * The XDP variant of bond_xmit_hash.  No skb is available, so the frame
 * data is hashed in place.
 *
 * Return: the computed hash value, or 0 when the frame is too short to
 * hold an Ethernet header.
 */
static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp)
{
        struct ethhdr *eth = (struct ethhdr *)xdp->data;

        /* the whole Ethernet header must lie before data_end */
        if ((void *)(eth + 1) > xdp->data_end)
                return 0;

        return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0,
                                sizeof(*eth), xdp->data_end - xdp->data);
}

/*-------------------------- Device entry points ----------------------------*/

/* Bind every delayed work item embedded in @bond to its handler.  Nothing is
 * queued here.
 */
void bond_work_init_all(struct bonding *bond)
{
        /* link monitors */
        INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor);
        INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor);

        /* mode specific state machines */
        INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor);
        INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler);

        /* housekeeping */
        INIT_DELAYED_WORK(&bond->mcast_work,
                          bond_resend_igmp_join_requests_delayed);
        INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler);
}

/* Cancel every delayed work item of @bond, waiting for each one to finish. */
static void bond_work_cancel_all(struct bonding *bond)
{
        /* Cancellation order is kept exactly as listed. */
        struct delayed_work *const works[] = {
                &bond->mii_work,
                &bond->arp_work,
                &bond->alb_work,
                &bond->ad_work,
                &bond->mcast_work,
                &bond->slave_arr_work,
        };
        unsigned int i;

        for (i = 0; i < ARRAY_SIZE(works); i++)
                cancel_delayed_work_sync(works[i]);
}

/* Bring the bonding device up.
 *
 * Allocates the per-CPU round-robin transmit counter if the bond is in
 * round-robin mode and has none yet, resets the active/inactive flags of the
 * slaves, initializes the ALB/TLB state for load-balancing modes, queues the
 * monitoring work items that the current parameters call for and rebuilds the
 * slave array for modes that can use the transmit hash.
 *
 * Returns 0 on success, or -ENOMEM if the per-CPU counter cannot be allocated
 * or bond_alb_initialize() reports a failure.
 */
static int bond_open(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter;
        struct slave *slave;

        /* The counter is only allocated when missing, so an existing one is
         * reused.
         */
        if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) {
                bond->rr_tx_counter = alloc_percpu(u32);
                if (!bond->rr_tx_counter)
                        return -ENOMEM;
        }

        /* reset slave->backup and slave->inactive */
        if (bond_has_slaves(bond)) {
                bond_for_each_slave(bond, slave, iter) {
                        /* In modes that use a primary, every slave other than
                         * the current active one is flagged inactive.  In all
                         * other cases the slave is flagged active, except in
                         * 802.3ad mode where the flags are left untouched here.
                         */
                        if (bond_uses_primary(bond) &&
                            slave != rcu_access_pointer(bond->curr_active_slave)) {
                                bond_set_slave_inactive_flags(slave,
                                                              BOND_SLAVE_NOTIFY_NOW);
                        } else if (BOND_MODE(bond) != BOND_MODE_8023AD) {
                                bond_set_slave_active_flags(slave,
                                                            BOND_SLAVE_NOTIFY_NOW);
                        }
                }
        }

        if (bond_is_lb(bond)) {
                /* bond_alb_initialize must be called before the timer
                 * is started.
                 */
                if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB)))
                        return -ENOMEM;
                /* The ALB monitor is started for ALB mode, and for the other
                 * load-balancing mode only when tlb_dynamic_lb is set.
                 */
                if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB)
                        queue_delayed_work(bond->wq, &bond->alb_work, 0);
        }

        if (bond->params.miimon)  /* link check interval, in milliseconds. */
                queue_delayed_work(bond->wq, &bond->mii_work, 0);

        if (bond->params.arp_interval) {  /* arp interval, in milliseconds. */
                queue_delayed_work(bond->wq, &bond->arp_work, 0);
                bond->recv_probe = bond_rcv_validate;
        }

        if (BOND_MODE(bond) == BOND_MODE_8023AD) {
                queue_delayed_work(bond->wq, &bond->ad_work, 0);
                /* register to receive LACPDUs */
                /* This overrides any recv_probe set for ARP monitoring above. */
                bond->recv_probe = bond_3ad_lacpdu_recv;
                bond_3ad_initiate_agg_selection(bond, 1);

                /* Add the LACPDU multicast address to every slave. */
                bond_for_each_slave(bond, slave, iter)
                        dev_mc_add(slave->dev, lacpdu_mcast_addr);
        }

        if (bond_mode_can_use_xmit_hash(bond))
                bond_update_slave_arr(bond, NULL);

        return 0;
}

/* Take the bonding device down: stop all work items, tear down the
 * load-balancing state, stop receive probing and flush the bond's hardware
 * addresses from the slave(s) that carry them.  Always returns 0.
 */
static int bond_close(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter;
        struct slave *slave;

        bond_work_cancel_all(bond);
        bond->send_peer_notif = 0;
        if (bond_is_lb(bond))
                bond_alb_deinitialize(bond);
        bond->recv_probe = NULL;

        /* Without a primary, every slave gets flushed. */
        if (!bond_uses_primary(bond)) {
                bond_for_each_slave(bond, slave, iter)
                        bond_hw_addr_flush(bond_dev, slave->dev);
                return 0;
        }

        /* With a primary, only the current active slave (if any) is flushed. */
        rcu_read_lock();
        slave = rcu_dereference(bond->curr_active_slave);
        if (slave)
                bond_hw_addr_flush(bond_dev, slave->dev);
        rcu_read_unlock();

        return 0;
}

/* fold stats, assuming all rtnl_link_stats64 fields are u64, but
 * that some drivers can provide 32bit values only.
 */
static void bond_fold_stats(struct rtnl_link_stats64 *_res,
                            const struct rtnl_link_stats64 *_new,
                            const struct rtnl_link_stats64 *_old)
{
        const u64 *new = (const u64 *)_new;
        const u64 *old = (const u64 *)_old;
        u64 *res = (u64 *)_res;
        int i;

        for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) {
                u64 nv = new[i];
                u64 ov = old[i];
                s64 delta = nv - ov;

                /* detects if this particular field is 32bit only */
                if (((nv | ov) >> 32) == 0)
                        delta = (s64)(s32)((u32)nv - (u32)ov);

                /* filter anomalies, some drivers reset their stats
                 * at down/up events.
                 */
                if (delta > 0)
                        res[i] += delta;
        }
}

#ifdef CONFIG_LOCKDEP
/* Return the depth of the deepest chain of lower devices below @dev.
 *
 * The lower-device tree is walked depth first without recursion: dev_stack[]
 * and iter_stack[] remember, for each level already entered, the device being
 * iterated and the position in its lower list.  The return value is the
 * largest stack depth reached, i.e. 0 when @dev has no lower device.
 *
 * NOTE(review): the lists are walked with netdev_next_lower_dev_rcu(), so the
 * caller presumably has to be inside an RCU read-side section; the caller
 * visible in this file (bond_get_stats) is.
 */
static int bond_get_lowest_level_rcu(struct net_device *dev)
{
        struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
        struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
        int cur = 0, max = 0;

        now = dev;
        iter = &dev->adj_list.lower;

        while (1) {
                next = NULL;
                /* This inner loop runs at most once: it either finds no
                 * further lower device of 'now', or descends into the next one.
                 */
                while (1) {
                        ldev = netdev_next_lower_dev_rcu(now, &iter);
                        if (!ldev)
                                break;

                        /* Descend: save where we are in 'now' so the walk can
                         * resume there after the subtree has been visited.
                         */
                        next = ldev;
                        niter = &ldev->adj_list.lower;
                        dev_stack[cur] = now;
                        iter_stack[cur++] = iter;
                        if (max <= cur)
                                max = cur;
                        break;
                }

                if (!next) {
                        /* 'now' is exhausted; an empty stack means the whole
                         * tree has been visited.
                         */
                        if (!cur)
                                return max;
                        /* Otherwise pop back to the parent and continue. */
                        next = dev_stack[--cur];
                        niter = iter_stack[cur];
                }

                now = next;
                iter = niter;
        }

        /* Not reached: the loop above only exits through its return. */
        return max;
}
#endif

static void bond_get_stats(struct net_device *bond_dev,
                           struct rtnl_link_stats64 *stats)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct rtnl_link_stats64 temp;
        struct list_head *iter;
        struct slave *slave;
        int nest_level = 0;


        rcu_read_lock();
#ifdef CONFIG_LOCKDEP
        nest_level = bond_get_lowest_level_rcu(bond_dev);
#endif

        spin_lock_nested(&bond->stats_lock, nest_level);
        memcpy(stats, &bond->bond_stats, sizeof(*stats));

        bond_for_each_slave_rcu(bond, slave, iter) {
                const struct rtnl_link_stats64 *new =
                        dev_get_stats(slave->dev, &temp);

                bond_fold_stats(stats, new, &slave->slave_stats);

                /* save off the slave stats for the next run */
                memcpy(&slave->slave_stats, new, sizeof(*new));
        }

        memcpy(&bond->bond_stats, stats, sizeof(*stats));
        spin_unlock(&bond->stats_lock);
        rcu_read_unlock();
}

/* Handle the MII ioctls SIOCGMIIPHY and SIOCGMIIREG on the bond.
 *
 * SIOCGMIIPHY reports PHY id 0 and then behaves like SIOCGMIIREG.  A read of
 * register 1 yields BMSR_LSTATUS while the bond has carrier and 0 otherwise;
 * reads of other registers leave val_out untouched.
 *
 * Returns 0 on success, -EINVAL if no MII data can be obtained from @ifr and
 * -EOPNOTSUPP for any other command.
 */
static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct mii_ioctl_data *mii = NULL;

        netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd);

        if (cmd != SIOCGMIIPHY && cmd != SIOCGMIIREG)
                return -EOPNOTSUPP;

        mii = if_mii(ifr);
        if (!mii)
                return -EINVAL;

        if (cmd == SIOCGMIIPHY)
                mii->phy_id = 0;

        if (mii->reg_num == 1)
                mii->val_out = netif_carrier_ok(bond->dev) ? BMSR_LSTATUS : 0;

        return 0;
}

/* Handle the bonding specific ioctls.
 *
 * SIOCBONDINFOQUERY and SIOCBONDSLAVEINFOQUERY copy a request structure in
 * from the user buffer at ifr->ifr_data, fill it in and copy it back out.
 *
 * All other commands require CAP_NET_ADMIN in the user namespace of the
 * bond's network namespace and operate on the device named by ifr->ifr_slave:
 * SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR and
 * SIOCBONDCHANGEACTIVE.
 *
 * Returns 0 on success, -EFAULT if a user copy fails, -EPERM without the
 * capability, -ENODEV if the named device does not exist, -EOPNOTSUPP for an
 * unknown command, or the error returned by the operation itself.
 */
static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct net_device *slave_dev = NULL;
        struct ifbond k_binfo;
        struct ifbond __user *u_binfo = NULL;
        struct ifslave k_sinfo;
        struct ifslave __user *u_sinfo = NULL;
        struct bond_opt_value newval;
        struct net *net;
        int res = 0;

        netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd);

        /* The query commands need neither privileges nor a slave device. */
        switch (cmd) {
        case SIOCBONDINFOQUERY:
                u_binfo = (struct ifbond __user *)ifr->ifr_data;

                if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond)))
                        return -EFAULT;

                bond_info_query(bond_dev, &k_binfo);
                if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond)))
                        return -EFAULT;

                return 0;
        case SIOCBONDSLAVEINFOQUERY:
                u_sinfo = (struct ifslave __user *)ifr->ifr_data;

                if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave)))
                        return -EFAULT;

                res = bond_slave_info_query(bond_dev, &k_sinfo);
                /* Only copy the result back when the query succeeded. */
                if (res == 0 &&
                    copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave)))
                        return -EFAULT;

                return res;
        default:
                break;
        }

        net = dev_net(bond_dev);

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        slave_dev = __dev_get_by_name(net, ifr->ifr_slave);

        /* Validate the lookup before slave_dev is handed to slave_dbg(),
         * which takes the slave device as an argument; a failed lookup must
         * not pass NULL into it.
         */
        if (!slave_dev)
                return -ENODEV;

        slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev);

        switch (cmd) {
        case SIOCBONDENSLAVE:
                res = bond_enslave(bond_dev, slave_dev, NULL);
                break;
        case SIOCBONDRELEASE:
                res = bond_release(bond_dev, slave_dev);
                break;
        case SIOCBONDSETHWADDR:
                res = bond_set_dev_addr(bond_dev, slave_dev);
                break;
        case SIOCBONDCHANGEACTIVE:
                /* Routed through the option code, keyed by the slave name. */
                bond_opt_initstr(&newval, slave_dev->name);
                res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE,
                                            &newval);
                break;
        default:
                res = -EOPNOTSUPP;
        }

        return res;
}

/* Translate the legacy private bonding ioctls into their SIOCBOND*
 * equivalents and hand them to bond_do_ioctl().
 *
 * The two query commands are passed a local ifreq whose ifr_data is @data;
 * all other commands are passed @ifr itself.  Returns -EOPNOTSUPP for
 * commands that are not legacy bonding ioctls.
 */
static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr,
                               void __user *data, int cmd)
{
        struct ifreq ifrdata = { .ifr_data = data };
        struct ifreq *req = ifr;
        int new_cmd;

        switch (cmd) {
        case BOND_INFO_QUERY_OLD:
                new_cmd = SIOCBONDINFOQUERY;
                req = &ifrdata;
                break;
        case BOND_SLAVE_INFO_QUERY_OLD:
                new_cmd = SIOCBONDSLAVEINFOQUERY;
                req = &ifrdata;
                break;
        case BOND_ENSLAVE_OLD:
                new_cmd = SIOCBONDENSLAVE;
                break;
        case BOND_RELEASE_OLD:
                new_cmd = SIOCBONDRELEASE;
                break;
        case BOND_SETHWADDR_OLD:
                new_cmd = SIOCBONDSETHWADDR;
                break;
        case BOND_CHANGE_ACTIVE_OLD:
                new_cmd = SIOCBONDCHANGEACTIVE;
                break;
        default:
                return -EOPNOTSUPP;
        }

        return bond_do_ioctl(bond_dev, req, new_cmd);
}

/* Propagate a change of the bond's IFF_PROMISC / IFF_ALLMULTI flags.
 *
 * @change holds the flag bits that changed.  For each of the two flags that
 * changed, +1 is passed down if the flag is now set on the bond and -1 if it
 * is now clear.
 */
static void bond_change_rx_flags(struct net_device *bond_dev, int change)
{
        struct bonding *bond = netdev_priv(bond_dev);

        if (change & IFF_PROMISC) {
                int inc = (bond_dev->flags & IFF_PROMISC) ? 1 : -1;

                bond_set_promiscuity(bond, inc);
        }

        if (change & IFF_ALLMULTI) {
                int inc = (bond_dev->flags & IFF_ALLMULTI) ? 1 : -1;

                bond_set_allmulti(bond, inc);
        }
}

/* Sync the bond's unicast and multicast address lists down to its slaves.
 *
 * Modes that use a primary sync only to the current active slave, if there is
 * one; all other modes sync to every slave using the *_sync_multiple variants.
 */
static void bond_set_rx_mode(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter;
        struct slave *slave;

        rcu_read_lock();
        if (!bond_uses_primary(bond)) {
                bond_for_each_slave_rcu(bond, slave, iter) {
                        dev_uc_sync_multiple(slave->dev, bond_dev);
                        dev_mc_sync_multiple(slave->dev, bond_dev);
                }
        } else {
                slave = rcu_dereference(bond->curr_active_slave);
                if (slave) {
                        dev_uc_sync(slave->dev, bond_dev);
                        dev_mc_sync(slave->dev, bond_dev);
                }
        }
        rcu_read_unlock();
}

/* Run the neighbour setup of the bond's first slave on neighbour @n.
 *
 * Returns 0 when there is no slave, when the slave has no ndo_neigh_setup or
 * when that callback installs no neigh_setup hook; otherwise returns the
 * result of the slave's ndo_neigh_setup or of the hook it installed.
 */
static int bond_neigh_init(struct neighbour *n)
{
        struct bonding *bond = netdev_priv(n->dev);
        const struct net_device_ops *ops;
        struct neigh_parms parms;
        struct slave *slave;
        int ret = 0;

        rcu_read_lock();

        slave = bond_first_slave_rcu(bond);
        if (!slave)
                goto unlock;

        ops = slave->dev->netdev_ops;
        if (!ops->ndo_neigh_setup)
                goto unlock;

        /* TODO: find another way [1] to implement this.
         * Handing over a zeroed structure is fragile, but at least it is
         * not garbage.
         *
         * [1] One way would be that ndo_neigh_setup() never touches
         *     struct neigh_parms, but propagates the new neigh_setup()
         *     back to ___neigh_create() / neigh_parms_alloc()
         */
        memset(&parms, 0, sizeof(parms));
        ret = ops->ndo_neigh_setup(slave->dev, &parms);

        /* Only chain to the hook if the slave set one up successfully. */
        if (!ret && parms.neigh_setup)
                ret = parms.neigh_setup(n);

unlock:
        rcu_read_unlock();
        return ret;
}

/* The bonding ndo_neigh_setup is called at init time, before any slave
 * exists.  So a proxy setup function is installed instead, which resolves
 * the actual slave neigh param setup at run time.
 *
 * It is also called by master devices (such as vlans) to set up their
 * underlying devices.  In that case nothing is done: the bond was already
 * set up from its own init.
 *
 * Always returns 0.
 */
static int bond_neigh_setup(struct net_device *dev,
                            struct neigh_parms *parms)
{
        /* Leave neigh_parms that belong to another device alone. */
        if (parms->dev != dev)
                return 0;

        parms->neigh_setup = bond_neigh_init;
        return 0;
}

/* Set the MTU of every slave to @new_mtu and, if all of them accept it,
 * record it on the bond.
 *
 * If any slave refuses, the slaves that were already changed are set back to
 * the bond's current MTU and the slave's error is returned.  Returns 0 on
 * success.
 */
static int bond_change_mtu(struct net_device *bond_dev, int new_mtu)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *slave, *undo;
        struct list_head *iter;
        int res = 0;

        netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu);

        bond_for_each_slave(bond, slave, iter) {
                slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n",
                          slave, slave->dev->netdev_ops->ndo_change_mtu);

                res = dev_set_mtu(slave->dev, new_mtu);
                if (!res)
                        continue;

                /* A failure aborts the operation even in ACTIVE_BACKUP mode.
                 * If backup slaves were allowed to keep a different mtu than
                 * the active slave, their mtu would have to be changed on
                 * failover, i.e. from timer context, which is probably not a
                 * good idea.
                 */
                slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n",
                          res, new_mtu);
                break;
        }

        if (!res) {
                WRITE_ONCE(bond_dev->mtu, new_mtu);
                return 0;
        }

        /* Roll back, from the head of the list up to the slave that failed. */
        bond_for_each_slave(bond, undo, iter) {
                int tmp_res;

                if (undo == slave)
                        break;

                tmp_res = dev_set_mtu(undo->dev, bond_dev->mtu);
                if (tmp_res)
                        slave_dbg(bond_dev, undo->dev, "unwind err %d\n",
                                  tmp_res);
        }

        return res;
}

/* Change the hardware address of the bond and of all its slaves.
 *
 * ALB mode is delegated to bond_alb_set_mac_address().  In active-backup mode
 * with fail_over_mac enabled nothing is changed and 0 is returned.  Otherwise
 * the address is validated, applied to every slave and finally to the bond;
 * if a slave refuses, the slaves already changed are set back to the bond's
 * current address and the slave's error is returned.
 *
 * Note that many devices must be down to change the HW address, and
 * downing the master releases all slaves.  We can make bonds full of
 * bonding devices to test this, however.
 */
static int bond_set_mac_address(struct net_device *bond_dev, void *addr)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct sockaddr_storage *ss = addr, tmp_ss;
        struct slave *slave, *undo;
        struct list_head *iter;
        int res = 0;

        if (BOND_MODE(bond) == BOND_MODE_ALB)
                return bond_alb_set_mac_address(bond_dev, addr);

        netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond);

        /* With fail_over_mac enabled, report success without doing anything:
         * returning an error causes ifenslave to fail.
         */
        if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP &&
            bond->params.fail_over_mac)
                return 0;

        if (!is_valid_ether_addr(ss->__data))
                return -EADDRNOTAVAIL;

        bond_for_each_slave(bond, slave, iter) {
                slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n",
                          __func__, slave);

                res = dev_set_mac_address(slave->dev, addr, NULL);
                if (!res)
                        continue;

                /* TODO: consider downing the slave and retrying?
                 * The user should expect communications breakage anyway
                 * until ARP finishes updating, so...
                 */
                slave_dbg(bond_dev, slave->dev, "%s: err %d\n",
                          __func__, res);
                break;
        }

        if (!res) {
                dev_addr_set(bond_dev, ss->__data);
                return 0;
        }

        /* Build a sockaddr carrying the bond's current address. */
        memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len);
        tmp_ss.ss_family = bond_dev->type;

        /* Roll back, from the head of the list up to the slave that failed. */
        bond_for_each_slave(bond, undo, iter) {
                int tmp_res;

                if (undo == slave)
                        break;

                tmp_res = dev_set_mac_address(undo->dev,
                                              (struct sockaddr *)&tmp_ss, NULL);
                if (tmp_res)
                        slave_dbg(bond_dev, undo->dev, "%s: unwind err %d\n",
                                  __func__, tmp_res);
        }

        return res;
}

/**
 * bond_get_slave_by_id - get xmit slave with slave_id
 * @bond: bonding device that is transmitting
 * @slave_id: slave id up to slave_cnt-1 through which to transmit
 *
 * Returns the first slave able to transmit, searching from position
 * @slave_id to the end of the slave list and then from the head of the list
 * up to (but excluding) position @slave_id.  Returns NULL if no slave can
 * transmit.
 */
static struct slave *bond_get_slave_by_id(struct bonding *bond,
                                          int slave_id)
{
        struct list_head *iter;
        struct slave *slave;
        int pos;

        /* First pass: the slave at slave_id and everything after it. */
        pos = 0;
        bond_for_each_slave_rcu(bond, slave, iter) {
                if (pos++ < slave_id)
                        continue;
                if (bond_slave_can_tx(slave))
                        return slave;
        }

        /* Second pass: the slaves in front of slave_id. */
        pos = 0;
        bond_for_each_slave_rcu(bond, slave, iter) {
                if (pos++ >= slave_id)
                        break;
                if (bond_slave_can_tx(slave))
                        return slave;
        }

        /* no slave that can tx has been found */
        return NULL;
}

/**
 * bond_rr_gen_slave_id - generate slave id based on packets_per_slave
 * @bond: bonding device to use
 *
 * With packets_per_slave == 0 the id is random.  With packets_per_slave == 1
 * it is the incremented per-CPU transmit counter.  For any other value it is
 * that counter divided by packets_per_slave (via the precomputed reciprocal).
 * The result is not reduced to the number of slaves; callers do that.
 */
static u32 bond_rr_gen_slave_id(struct bonding *bond)
{
        int packets_per_slave = bond->params.packets_per_slave;
        u32 counter;

        if (packets_per_slave == 0)
                return get_random_u32();

        counter = this_cpu_inc_return(*bond->rr_tx_counter);
        if (packets_per_slave == 1)
                return counter;

        return reciprocal_divide(counter,
                                 bond->params.reciprocal_packets_per_slave);
}

static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond,
                                                    struct sk_buff *skb)
{
        struct slave *slave;
        int slave_cnt;
        u32 slave_id;

        /* Start with the curr_active_slave that joined the bond as the
         * default for sending IGMP traffic.  For failover purposes one
         * needs to maintain some consistency for the interface that will
         * send the join/membership reports.  The curr_active_slave found
         * will send all of this type of traffic.
         */
        if (skb->protocol == htons(ETH_P_IP)) {
                int noff = skb_network_offset(skb);
                struct iphdr *iph;

                if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
                        goto non_igmp;

                iph = ip_hdr(skb);
                if (iph->protocol == IPPROTO_IGMP) {
                        slave = rcu_dereference(bond->curr_active_slave);
                        if (slave)
                                return slave;
                        return bond_get_slave_by_id(bond, 0);
                }
        }

non_igmp:
        slave_cnt = READ_ONCE(bond->slave_cnt);
        if (likely(slave_cnt)) {
                slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
                return bond_get_slave_by_id(bond, slave_id);
        }
        return NULL;
}

/* XDP counterpart of bond_xmit_roundrobin_slave_get().
 *
 * The Ethernet and IPv4 headers are only inspected when they lie completely
 * inside the buffer; a buffer too short for either is treated as non-IGMP.
 */
static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond,
                                                        struct xdp_buff *xdp)
{
        void *l2 = xdp->data;
        void *l3 = l2 + sizeof(struct ethhdr);
        const struct ethhdr *eth = l2;
        const struct iphdr *iph = l3;
        struct slave *slave;
        int slave_cnt;
        u32 slave_id;

        /* See comment on IGMP in bond_xmit_roundrobin_slave_get() */
        if (l3 <= xdp->data_end &&
            eth->h_proto == htons(ETH_P_IP) &&
            l3 + sizeof(struct iphdr) <= xdp->data_end &&
            iph->protocol == IPPROTO_IGMP) {
                slave = rcu_dereference(bond->curr_active_slave);
                return slave ? slave : bond_get_slave_by_id(bond, 0);
        }

        slave_cnt = READ_ONCE(bond->slave_cnt);
        if (unlikely(!slave_cnt))
                return NULL;

        slave_id = bond_rr_gen_slave_id(bond) % slave_cnt;
        return bond_get_slave_by_id(bond, slave_id);
}

/* Round-robin transmit: queue @skb on the selected slave, or drop it when no
 * slave is available.
 */
static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
                                        struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *target = bond_xmit_roundrobin_slave_get(bond, skb);

        if (unlikely(!target))
                return bond_tx_drop(bond_dev, skb);

        return bond_dev_queue_xmit(bond, skb, target->dev);
}

/* Return the slave an active-backup bond transmits through, which is simply
 * bond->curr_active_slave (may be NULL).
 *
 * NOTE(review): uses rcu_dereference(), so the caller presumably has to be in
 * an RCU read-side section - confirm against the callers.
 */
static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond)
{
        return rcu_dereference(bond->curr_active_slave);
}

/* Active-backup transmit: queue @skb on the current active slave, or drop it
 * when there is none.  In this mode bond->curr_active_slave is always valid
 * if the bond has a usable interface.
 */
static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
                                          struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *active = bond_xmit_activebackup_slave_get(bond);

        if (!active)
                return bond_tx_drop(bond_dev, skb);

        return bond_dev_queue_xmit(bond, skb, active->dev);
}

/* Queue the slave array work item on the bond's workqueue after @delay.
 *
 * Use this to update slave_array when (a) it's not appropriate to update
 * slave_array right away (note that update_slave_array() may sleep)
 * and / or (b) RTNL is not held.
 */
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay)
{
        queue_delayed_work(bond->wq, &bond->slave_arr_work, delay);
}

/* Slave array work handler. Holds only RTNL.
 *
 * Rebuilds the slave array under RTNL.  If RTNL cannot be taken without
 * blocking, or the rebuild fails, the work is re-armed with a delay of 1.
 */
static void bond_slave_arr_handler(struct work_struct *work)
{
        struct bonding *bond = container_of(work, struct bonding,
                                            slave_arr_work.work);
        int ret;

        if (rtnl_trylock()) {
                ret = bond_update_slave_arr(bond, NULL);
                rtnl_unlock();
                if (!ret)
                        return;

                pr_warn_ratelimited("Failed to update slave array from WT\n");
        }

        /* Either RTNL was busy or the update failed: try again later. */
        bond_slave_arr_work_rearm(bond, 1);
}

/* Remove @skipslave from the array @slaves in place, if it is present.
 *
 * The matching entry is overwritten with the last entry and the count is
 * decremented, so the order of the remaining entries may change.  A NULL
 * @slaves is accepted and ignored.
 */
static void bond_skip_slave(struct bond_up_slave *slaves,
                            struct slave *skipslave)
{
        int idx;

        if (!slaves)
                return;

        /* This is the rare situation where the caller asked to skip a
         * specific slave but building a new array failed (most likely an
         * allocation failure).  BTW this is only possible when the call is
         * initiated from __bond_release_one().  Patching the existing array
         * keeps the xmit path from choosing the to-be-skipped slave to send
         * a packet out.
         */
        for (idx = 0; idx < slaves->count; idx++) {
                if (slaves->arr[idx] != skipslave)
                        continue;

                slaves->arr[idx] = slaves->arr[slaves->count - 1];
                slaves->count--;
                break;
        }
}

/* Publish new usable/all slave arrays on @bond and free the previous ones
 * after an RCU grace period.  Either new array may be NULL.
 */
static void bond_set_slave_arr(struct bonding *bond,
                               struct bond_up_slave *usable_slaves,
                               struct bond_up_slave *all_slaves)
{
        struct bond_up_slave *old_usable;
        struct bond_up_slave *old_all;

        old_usable = rtnl_dereference(bond->usable_slaves);
        old_all = rtnl_dereference(bond->all_slaves);

        rcu_assign_pointer(bond->usable_slaves, usable_slaves);
        kfree_rcu(old_usable, rcu);

        rcu_assign_pointer(bond->all_slaves, all_slaves);
        kfree_rcu(old_all, rcu);
}

/* Drop both slave arrays of @bond by replacing them with NULL through
 * bond_set_slave_arr().
 */
static void bond_reset_slave_arr(struct bonding *bond)
{
        bond_set_slave_arr(bond, NULL, NULL);
}

/* Build the usable slaves array in control path for modes that use xmit-hash
 * to determine the slave interface -
 * (a) BOND_MODE_8023AD
 * (b) BOND_MODE_XOR
 * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0
 *
 * Two arrays are built: one with every slave and one with the slaves that
 * can transmit (in 802.3ad mode additionally restricted to the active
 * aggregator). @skipslave, if not NULL, is left out of both.
 *
 * Returns 0 on success and -ENOMEM when an array cannot be allocated. When
 * 802.3ad mode has no active aggregator the existing arrays are dropped and
 * 0 is returned.
 *
 * The caller is expected to hold RTNL only and NO other lock!
 */
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
{
        struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL;
        struct slave *slave;
        struct list_head *iter;
        int agg_id = 0;
        int ret = 0;

        might_sleep();

        /* Both arrays are sized for the full slave count. */
        usable_slaves = kzalloc(struct_size(usable_slaves, arr,
                                            bond->slave_cnt), GFP_KERNEL);
        all_slaves = kzalloc(struct_size(all_slaves, arr,
                                         bond->slave_cnt), GFP_KERNEL);
        if (!usable_slaves || !all_slaves) {
                ret = -ENOMEM;
                goto out;
        }
        if (BOND_MODE(bond) == BOND_MODE_8023AD) {
                struct ad_info ad_info;

                /* Look up the active aggregator under mode_lock. */
                spin_lock_bh(&bond->mode_lock);
                if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
                        spin_unlock_bh(&bond->mode_lock);
                        pr_debug("bond_3ad_get_active_agg_info failed\n");
                        /* No active aggregator means it's not safe to use
                         * the previous array.
                         */
                        bond_reset_slave_arr(bond);
                        /* ret is still 0 here, so this is not reported as an
                         * error; the freshly allocated arrays are freed below.
                         */
                        goto out;
                }
                spin_unlock_bh(&bond->mode_lock);
                agg_id = ad_info.aggregator_id;
        }
        bond_for_each_slave(bond, slave, iter) {
                if (skipslave == slave)
                        continue;

                /* Every non-skipped slave goes into the "all" array. */
                all_slaves->arr[all_slaves->count++] = slave;
                if (BOND_MODE(bond) == BOND_MODE_8023AD) {
                        struct aggregator *agg;

                        /* Only ports of the active aggregator are usable. */
                        agg = SLAVE_AD_INFO(slave)->port.aggregator;
                        if (!agg || agg->aggregator_identifier != agg_id)
                                continue;
                }
                if (!bond_slave_can_tx(slave))
                        continue;

                slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n",
                          usable_slaves->count);

                usable_slaves->arr[usable_slaves->count++] = slave;
        }

        /* Publish the new arrays; the previous ones are freed via RCU. */
        bond_set_slave_arr(bond, usable_slaves, all_slaves);
        return ret;
out:
        /* On failure the existing arrays stay in place, so the slave to be
         * skipped is removed from them directly.
         */
        if (ret != 0 && skipslave) {
                bond_skip_slave(rtnl_dereference(bond->all_slaves),
                                skipslave);
                bond_skip_slave(rtnl_dereference(bond->usable_slaves),
                                skipslave);
        }
        /* Release the new arrays, which were never published. */
        kfree_rcu(all_slaves, rcu);
        kfree_rcu(usable_slaves, rcu);

        return ret;
}

/* Pick the transmit slave for @skb out of @slaves by transmit hash.
 *
 * Returns the entry at index (hash % count), or NULL when @slaves is NULL or
 * empty.
 */
static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond,
                                                 struct sk_buff *skb,
                                                 struct bond_up_slave *slaves)
{
        const u32 hash = bond_xmit_hash(bond, skb);
        unsigned int count = 0;

        if (slaves)
                count = READ_ONCE(slaves->count);

        if (unlikely(!count))
                return NULL;

        return slaves->arr[hash % count];
}

/* XDP variant of the 3AD/XOR slave lookup: hash the xdp_buff and use the
 * result to index the bond's usable slave array. Returns NULL when the
 * array is absent or empty.
 */
static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond,
                                                     struct xdp_buff *xdp)
{
        struct bond_up_slave *usable;
        unsigned int nr_slaves = 0;
        u32 hash;

        hash = bond_xmit_hash_xdp(bond, xdp);
        usable = rcu_dereference(bond->usable_slaves);
        if (usable)
                nr_slaves = READ_ONCE(usable->count);
        if (unlikely(nr_slaves == 0))
                return NULL;

        return usable->arr[hash % nr_slaves];
}

/* Transmit handler shared by the 3AD and XOR modes. The usable slave
 * array is built in the control path; here we only hash the packet,
 * pick the matching slave and queue the skb on it. The skb is dropped
 * when no slave can be selected.
 */
static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct bonding *bond = netdev_priv(dev);
        struct bond_up_slave *usable;
        struct slave *tx_slave;

        usable = rcu_dereference(bond->usable_slaves);
        tx_slave = bond_xmit_3ad_xor_slave_get(bond, skb, usable);
        if (unlikely(!tx_slave))
                return bond_tx_drop(dev, skb);

        return bond_dev_queue_xmit(bond, skb, tx_slave->dev);
}

/* In broadcast mode every slave that is up and has link gets a copy of
 * the frame. The last slave in the list is handed the original skb; all
 * other eligible slaves receive a clone. If the original was never handed
 * to a slave it is freed here. The call succeeds if at least one slave
 * accepted its copy; otherwise the bond's tx_dropped counter is bumped.
 */
static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
                                       struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        bool orig_consumed = false;
        bool any_sent = false;
        struct slave *slave = NULL;
        struct list_head *iter;

        bond_for_each_slave_rcu(bond, slave, iter) {
                struct sk_buff *tx_skb;

                if (!bond_slave_is_up(slave) || slave->link != BOND_LINK_UP)
                        continue;

                if (!bond_is_last_slave(bond, slave)) {
                        tx_skb = skb_clone(skb, GFP_ATOMIC);
                        if (!tx_skb) {
                                net_err_ratelimited("%s: Error: %s: skb_clone() failed\n",
                                                    bond_dev->name, __func__);
                                continue;
                        }
                } else {
                        /* Ownership of the original moves to this slave. */
                        tx_skb = skb;
                        orig_consumed = true;
                }

                if (bond_dev_queue_xmit(bond, tx_skb, slave->dev) == NETDEV_TX_OK)
                        any_sent = true;
        }

        if (!orig_consumed)
                dev_kfree_skb_any(skb);

        if (!any_sent) {
                dev_core_stats_tx_dropped_inc(bond_dev);
                return NET_XMIT_DROP;
        }

        return NETDEV_TX_OK;
}

/*------------------------- Device initialization ---------------------------*/

/* Look up the slave whose queue_id matches the skb's queue mapping.
 *
 * Returns 0 when the skb was queued on the matching slave, and 1 when the
 * caller should fall back to the default transmit policy (no rx queue
 * recorded, no matching slave, or the matching slave is not up).
 */
static inline int bond_slave_override(struct bonding *bond,
                                      struct sk_buff *skb)
{
        struct list_head *iter;
        struct slave *slave = NULL;

        if (!skb_rx_queue_recorded(skb))
                return 1;

        /* Find out if any slaves have the same mapping as this skb. */
        bond_for_each_slave_rcu(bond, slave, iter) {
                if (READ_ONCE(slave->queue_id) != skb_get_queue_mapping(skb))
                        continue;

                /* If the slave isn't UP, use default transmit policy. */
                if (!bond_slave_is_up(slave) || slave->link != BOND_LINK_UP)
                        break;

                bond_dev_queue_xmit(bond, skb, slave->dev);
                return 0;
        }

        return 1;
}


/* This helper function exists to help dev_pick_tx get the correct
 * destination queue.  Using a helper function skips a call to
 * skb_tx_hash and will put the skbs in the queue we expect on their
 * way down to the bonding driver.
 *
 * The returned queue is the recorded rx queue (0 if none was recorded),
 * reduced by repeated subtraction until it is below real_num_tx_queues.
 */
static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
                             struct net_device *sb_dev)
{
        u16 txq = 0;

        if (skb_rx_queue_recorded(skb))
                txq = skb_get_rx_queue(skb);

        /* Save the original txq to restore before passing to the driver */
        qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);

        while (unlikely(txq >= dev->real_num_tx_queues))
                txq -= dev->real_num_tx_queues;

        return txq;
}

/* Report which slave device the current bonding mode would pick for @skb.
 *
 * For the 3AD and XOR modes @all_slaves selects between the bond's
 * all_slaves array and its usable_slaves array. Broadcast mode never
 * selects a slave here. Returns NULL when no slave was chosen.
 */
static struct net_device *bond_xmit_get_slave(struct net_device *master_dev,
                                              struct sk_buff *skb,
                                              bool all_slaves)
{
        struct bonding *bond = netdev_priv(master_dev);
        struct bond_up_slave *candidates;
        struct slave *chosen = NULL;

        switch (BOND_MODE(bond)) {
        case BOND_MODE_ROUNDROBIN:
                chosen = bond_xmit_roundrobin_slave_get(bond, skb);
                break;
        case BOND_MODE_ACTIVEBACKUP:
                chosen = bond_xmit_activebackup_slave_get(bond);
                break;
        case BOND_MODE_8023AD:
        case BOND_MODE_XOR:
                candidates = all_slaves ?
                             rcu_dereference(bond->all_slaves) :
                             rcu_dereference(bond->usable_slaves);
                chosen = bond_xmit_3ad_xor_slave_get(bond, skb, candidates);
                break;
        case BOND_MODE_BROADCAST:
                /* Nothing to select in this mode. */
                break;
        case BOND_MODE_ALB:
                chosen = bond_xmit_alb_slave_get(bond, skb);
                break;
        case BOND_MODE_TLB:
                chosen = bond_xmit_tlb_slave_get(bond, skb);
                break;
        default:
                /* Should never happen, mode already checked */
                WARN_ONCE(true, "Unknown bonding mode");
                break;
        }

        return chosen ? chosen->dev : NULL;
}

/* Fill @flow with the address type, addresses and ports taken from @sk.
 *
 * With IPv6 enabled, an AF_INET6 socket that is either IPv6-only or whose
 * destination is not a v4-mapped address is described with IPv6 keys;
 * every other socket is described with IPv4 keys.
 */
static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow)
{
        /* The L4 ports are taken from the inet socket in every case. */
        flow->ports.src = inet_sk(sk)->inet_sport;
        flow->ports.dst = inet_sk(sk)->inet_dport;

        switch (sk->sk_family) {
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                if (ipv6_only_sock(sk) ||
                    ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) {
                        flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
                        flow->addrs.v6addrs.dst = sk->sk_v6_daddr;
                        flow->addrs.v6addrs.src = inet6_sk(sk)->saddr;
                        break;
                }
                /* v4-mapped destination: describe it with the IPv4 keys */
                fallthrough;
#endif
        default: /* AF_INET */
                flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
                flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr;
                flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr;
                break;
        }
}

/**
 * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields
 * @sk: socket to use for headers
 *
 * This function will extract the necessary field from the socket and use
 * them to generate a hash based on the LAYER34 xmit_policy.
 * Assumes that sk is a TCP or UDP socket.
 */
static u32 bond_sk_hash_l34(struct sock *sk)
{
        struct flow_keys flow;
        u32 hash;

        bond_sk_to_flow(sk, &flow);

        /* L4 */
        memcpy(&hash, &flow.ports.ports, sizeof(hash));
        /* L3 */
        return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34);
}

/* Map @sk to a slave device by indexing the usable slave array with the
 * socket's layer3+4 hash. Returns NULL when the array is absent or empty.
 */
static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond,
                                                  struct sock *sk)
{
        struct bond_up_slave *usable = rcu_dereference(bond->usable_slaves);
        unsigned int nr_slaves = 0;
        u32 hash;

        if (usable)
                nr_slaves = READ_ONCE(usable->count);
        if (unlikely(nr_slaves == 0))
                return NULL;

        hash = bond_sk_hash_l34(sk);

        return usable->arr[hash % nr_slaves]->dev;
}

/* ndo_sk_get_lower_dev handler: under the RCU read lock, resolve the slave
 * device for @sk when bond_sk_check() allows it; otherwise return NULL.
 */
static struct net_device *bond_sk_get_lower_dev(struct net_device *dev,
                                                struct sock *sk)
{
        struct bonding *bond = netdev_priv(dev);
        struct net_device *lower_dev;

        rcu_read_lock();
        lower_dev = bond_sk_check(bond) ?
                    __bond_sk_get_lower_dev(bond, sk) : NULL;
        rcu_read_unlock();

        return lower_dev;
}

#if IS_ENABLED(CONFIG_TLS_DEVICE)
/* Transmit a TLS device-offloaded skb on the netdev recorded in the
 * socket's TLS context, provided that netdev is one of our slaves;
 * otherwise drop the skb.
 */
static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb,
                                        struct net_device *dev)
{
        struct net_device *tls_netdev;

        tls_netdev = rcu_dereference(tls_get_ctx(skb->sk)->netdev);

        /* tls_netdev might become NULL, even if tls_is_skb_tx_device_offloaded
         * was true, if tls_device_down is running in parallel, but it's OK,
         * because bond_get_slave_by_dev has a NULL check.
         */
        if (unlikely(!bond_get_slave_by_dev(bond, tls_netdev)))
                return bond_tx_drop(dev, skb);

        return bond_dev_queue_xmit(bond, skb, tls_netdev);
}
#endif

/* Core transmit path: try the queue_id override first, then the TLS
 * device-offload path (when configured), and finally dispatch to the
 * transmit routine of the current bonding mode.
 */
static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct bonding *bond = netdev_priv(dev);

        if (bond_should_override_tx_queue(bond)) {
                /* A zero return means the override path queued the skb. */
                if (!bond_slave_override(bond, skb))
                        return NETDEV_TX_OK;
        }

#if IS_ENABLED(CONFIG_TLS_DEVICE)
        if (tls_is_skb_tx_device_offloaded(skb))
                return bond_tls_device_xmit(bond, skb, dev);
#endif

        switch (BOND_MODE(bond)) {
        case BOND_MODE_ROUNDROBIN:
                return bond_xmit_roundrobin(skb, dev);
        case BOND_MODE_ACTIVEBACKUP:
                return bond_xmit_activebackup(skb, dev);
        case BOND_MODE_8023AD:
        case BOND_MODE_XOR:
                return bond_3ad_xor_xmit(skb, dev);
        case BOND_MODE_BROADCAST:
                return bond_xmit_broadcast(skb, dev);
        case BOND_MODE_ALB:
                return bond_alb_xmit(skb, dev);
        case BOND_MODE_TLB:
                return bond_tlb_xmit(skb, dev);
        default:
                break;
        }

        /* Should never happen, mode already checked */
        netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond));
        WARN_ON_ONCE(1);
        return bond_tx_drop(dev, skb);
}

/* ndo_start_xmit handler: transmit @skb through the bond under the RCU
 * read lock, or drop it when the bond has no slaves.
 */
static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct bonding *bond = netdev_priv(dev);
        netdev_tx_t rc;

        /* If we risk deadlock from transmitting this in the
         * netpoll path, tell netpoll to queue the frame for later tx
         */
        if (unlikely(is_netpoll_tx_blocked(dev)))
                return NETDEV_TX_BUSY;

        rcu_read_lock();
        if (!bond_has_slaves(bond))
                rc = bond_tx_drop(dev, skb);
        else
                rc = __bond_start_xmit(skb, dev);
        rcu_read_unlock();

        return rc;
}

/* Select the slave device an XDP buffer should be sent on. Only the
 * round-robin, active-backup, 3AD and XOR modes are handled; any other
 * mode logs a rate-limited error and yields NULL.
 *
 * Caller needs to hold rcu_read_lock().
 */
static struct net_device *
bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *chosen;

        switch (BOND_MODE(bond)) {
        case BOND_MODE_ROUNDROBIN:
                chosen = bond_xdp_xmit_roundrobin_slave_get(bond, xdp);
                break;
        case BOND_MODE_ACTIVEBACKUP:
                chosen = bond_xmit_activebackup_slave_get(bond);
                break;
        case BOND_MODE_8023AD:
        case BOND_MODE_XOR:
                chosen = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp);
                break;
        default:
                if (net_ratelimit())
                        netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n",
                                   BOND_MODE(bond));
                return NULL;
        }

        return chosen ? chosen->dev : NULL;
}

/* ndo_xdp_xmit handler: forward each frame, one at a time, to the slave
 * selected for it. Processing stops at the first frame that cannot be
 * sent.
 *
 * Returns the number of frames sent, or a negative error when the very
 * first frame failed (also -ENXIO when @n is not positive).
 */
static int bond_xdp_xmit(struct net_device *bond_dev,
                         int n, struct xdp_frame **frames, u32 flags)
{
        int sent = 0;
        int err = -ENXIO;

        rcu_read_lock();

        while (sent < n) {
                struct xdp_frame *frame = frames[sent];
                struct xdp_frame *single[] = {frame};
                struct net_device *slave_dev;
                struct xdp_buff xdp;

                xdp_convert_frame_to_buff(frame, &xdp);

                slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp);
                if (!slave_dev) {
                        err = -ENXIO;
                        break;
                }

                err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, single, flags);
                if (err < 1)
                        break;

                sent++;
        }

        rcu_read_unlock();

        /* If error happened on the first frame then we can pass the error up, otherwise
         * report the number of frames that were xmitted.
         */
        if (err < 0 && sent == 0)
                return err;

        return sent;
}

/* Install (or remove, when @prog is NULL) an XDP program on the bond and
 * propagate the change to every slave.
 *
 * Must be called with RTNL held. Returns 0 on success. On failure the
 * slaves that were already updated are switched back to the previous
 * program and a negative error is returned: -EOPNOTSUPP when the bonding
 * mode or a slave cannot do XDP (or a slave already has a program), or
 * the error reported by dev_xdp_propagate().
 */
static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                        struct netlink_ext_ack *extack)
{
        struct bonding *bond = netdev_priv(dev);
        struct list_head *iter;
        struct slave *slave, *rollback_slave;
        struct bpf_prog *old_prog;
        struct netdev_bpf xdp = {
                .command = XDP_SETUP_PROG,
                .flags   = 0,
                .prog    = prog,
                .extack  = extack,
        };
        int err;

        ASSERT_RTNL();

        if (!bond_xdp_check(bond, BOND_MODE(bond))) {
                BOND_NL_ERR(dev, extack,
                            "No native XDP support for the current bonding mode");
                return -EOPNOTSUPP;
        }

        /* Remember the previous program so the error path can restore it. */
        old_prog = bond->xdp_prog;
        bond->xdp_prog = prog;

        bond_for_each_slave(bond, slave, iter) {
                struct net_device *slave_dev = slave->dev;

                if (!slave_dev->netdev_ops->ndo_bpf ||
                    !slave_dev->netdev_ops->ndo_xdp_xmit) {
                        SLAVE_NL_ERR(dev, slave_dev, extack,
                                     "Slave device does not support XDP");
                        err = -EOPNOTSUPP;
                        goto err;
                }

                if (dev_xdp_prog_count(slave_dev) > 0) {
                        SLAVE_NL_ERR(dev, slave_dev, extack,
                                     "Slave has XDP program loaded, please unload before enslaving");
                        err = -EOPNOTSUPP;
                        goto err;
                }

                err = dev_xdp_propagate(slave_dev, &xdp);
                if (err < 0) {
                        /* ndo_bpf() sets extack error message */
                        slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err);
                        goto err;
                }
                /* One extra program reference per slave it was propagated to. */
                if (prog)
                        bpf_prog_inc(prog);
        }

        /* NOTE(review): when a non-NULL prog replaces a non-NULL old_prog,
         * this path neither puts old_prog nor decrements the static key;
         * confirm that this is intended.
         */
        if (prog) {
                static_branch_inc(&bpf_master_redirect_enabled_key);
        } else if (old_prog) {
                bpf_prog_put(old_prog);
                static_branch_dec(&bpf_master_redirect_enabled_key);
        }

        return 0;

err:
        /* unwind the program changes */
        bond->xdp_prog = old_prog;
        xdp.prog = old_prog;
        xdp.extack = NULL; /* do not overwrite original error */

        bond_for_each_slave(bond, rollback_slave, iter) {
                struct net_device *slave_dev = rollback_slave->dev;
                int err_unwind;

                /* Stop at the slave on which the forward pass failed. */
                if (slave == rollback_slave)
                        break;

                err_unwind = dev_xdp_propagate(slave_dev, &xdp);
                if (err_unwind < 0)
                        slave_err(dev, slave_dev,
                                  "Error %d when unwinding XDP program change\n", err_unwind);
                else if (xdp.prog)
                        bpf_prog_inc(xdp.prog);
        }
        return err;
}

/* ndo_bpf handler: only XDP_SETUP_PROG is supported; any other command
 * is rejected with -EINVAL.
 */
static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp)
{
        if (xdp->command != XDP_SETUP_PROG)
                return -EINVAL;

        return bond_xdp_set(dev, xdp->prog, xdp->extack);
}

/* Fold @slave's speed into the running broadcast-mode speed: the first
 * usable value (running speed still 0 or SPEED_UNKNOWN) is taken as is,
 * afterwards the minimum of the two is kept.
 */
static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
{
        if (speed == 0 || speed == SPEED_UNKNOWN)
                return slave->speed;

        return min(speed, slave->speed);
}

/* Set the BOND_PHC_INDEX flag to notify user space */
static int bond_set_phc_index_flag(struct kernel_hwtstamp_config *kernel_cfg)
{
        struct ifreq *ifr = kernel_cfg->ifr;
        struct hwtstamp_config cfg;

        if (kernel_cfg->copied_to_user) {
                /* Lower device has a legacy implementation */
                if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
                        return -EFAULT;

                cfg.flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX;
                if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
                        return -EFAULT;
        } else {
                kernel_cfg->flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX;
        }

        return 0;
}

/* ndo_hwtstamp_get handler: read the hardware timestamping configuration
 * from the active slave and tag it with the bonded-PHC-index flag.
 * Returns -EOPNOTSUPP when there is no active slave.
 */
static int bond_hwtstamp_get(struct net_device *dev,
                             struct kernel_hwtstamp_config *cfg)
{
        struct bonding *bond = netdev_priv(dev);
        struct net_device *active_dev;
        int rc;

        active_dev = bond_option_active_slave_get_rcu(bond);
        if (!active_dev)
                return -EOPNOTSUPP;

        rc = generic_hwtstamp_get_lower(active_dev, cfg);
        if (rc != 0)
                return rc;

        return bond_set_phc_index_flag(cfg);
}

/* ndo_hwtstamp_set handler: apply the hardware timestamping configuration
 * to the active slave. The request must carry
 * HWTSTAMP_FLAG_BONDED_PHC_INDEX; without it, or without an active slave,
 * -EOPNOTSUPP is returned.
 */
static int bond_hwtstamp_set(struct net_device *dev,
                             struct kernel_hwtstamp_config *cfg,
                             struct netlink_ext_ack *extack)
{
        struct bonding *bond = netdev_priv(dev);
        struct net_device *active_dev;
        int rc;

        if ((cfg->flags & HWTSTAMP_FLAG_BONDED_PHC_INDEX) == 0)
                return -EOPNOTSUPP;

        active_dev = bond_option_active_slave_get_rcu(bond);
        if (!active_dev)
                return -EOPNOTSUPP;

        rc = generic_hwtstamp_set_lower(active_dev, cfg, extack);
        if (rc != 0)
                return rc;

        return bond_set_phc_index_flag(cfg);
}

/* ethtool get_link_ksettings handler: report an aggregate speed and a
 * duplex value derived from the slaves that can currently transmit.
 *
 * In broadcast mode the speed is the minimum across those slaves; in all
 * other modes it is their sum. Duplex is taken from the first such slave
 * with a known duplex. Always returns 0.
 */
static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
                                           struct ethtool_link_ksettings *cmd)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct slave *slave;
        struct list_head *iter;
        u32 total = 0;

        cmd->base.port = PORT_OTHER;
        cmd->base.duplex = DUPLEX_UNKNOWN;

        /* Since bond_slave_can_tx returns false for all inactive or down slaves, we
         * do not need to check mode.  Though link speed might not represent
         * the true receive or transmit bandwidth (not all modes are symmetric)
         * this is an accurate maximum.
         */
        bond_for_each_slave(bond, slave, iter) {
                if (!bond_slave_can_tx(slave))
                        continue;

                bond_update_speed_duplex(slave);

                if (slave->speed != SPEED_UNKNOWN) {
                        if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
                                total = bond_mode_bcast_speed(slave, total);
                        else
                                total += slave->speed;
                }

                if (cmd->base.duplex == DUPLEX_UNKNOWN &&
                    slave->duplex != DUPLEX_UNKNOWN)
                        cmd->base.duplex = slave->duplex;
        }

        /* A zero total means no usable speed was found. */
        cmd->base.speed = total ? total : SPEED_UNKNOWN;

        return 0;
}

/* ethtool get_drvinfo handler: report the driver name and expose the
 * bonding ABI version through the fw_version field.
 */
static void bond_ethtool_get_drvinfo(struct net_device *bond_dev,
                                     struct ethtool_drvinfo *drvinfo)
{
        snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d",
                 BOND_ABI_VERSION);
        strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
}

static int bond_ethtool_get_ts_info(struct net_device *bond_dev,
                                    struct kernel_ethtool_ts_info *info)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct kernel_ethtool_ts_info ts_info;
        struct net_device *real_dev;
        bool sw_tx_support = false;
        struct list_head *iter;
        struct slave *slave;
        int ret = 0;

        rcu_read_lock();
        real_dev = bond_option_active_slave_get_rcu(bond);
        dev_hold(real_dev);
        rcu_read_unlock();

        if (real_dev) {
                ret = ethtool_get_ts_info_by_layer(real_dev, info);
        } else {
                /* Check if all slaves support software tx timestamping */
                rcu_read_lock();
                bond_for_each_slave_rcu(bond, slave, iter) {
                        ret = ethtool_get_ts_info_by_layer(slave->dev, &ts_info);
                        if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) {
                                sw_tx_support = true;
                                continue;
                        }

                        sw_tx_support = false;
                        break;
                }
                rcu_read_unlock();
        }

        if (sw_tx_support)
                info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;

        dev_put(real_dev);
        return ret;
}

/* ethtool operations exposed by a bond device. */
static const struct ethtool_ops bond_ethtool_ops = {
        .get_drvinfo            = bond_ethtool_get_drvinfo,
        .get_link               = ethtool_op_get_link,
        .get_link_ksettings     = bond_ethtool_get_link_ksettings,
        .get_ts_info            = bond_ethtool_get_ts_info,
};

static const struct net_device_ops bond_netdev_ops = {
        .ndo_init                = bond_init,
        .ndo_uninit                = bond_uninit,
        .ndo_open                = bond_open,
        .ndo_stop                = bond_close,
        .ndo_start_xmit                = bond_start_xmit,
        .ndo_select_queue        = bond_select_queue,
        .ndo_get_stats64        = bond_get_stats,
        .ndo_eth_ioctl                = bond_eth_ioctl,
        .ndo_siocbond                = bond_do_ioctl,
        .ndo_siocdevprivate        = bond_siocdevprivate,
        .ndo_change_rx_flags        = bond_change_rx_flags,
        .ndo_set_rx_mode        = bond_set_rx_mode,
        .ndo_change_mtu                = bond_change_mtu,
        .ndo_set_mac_address        = bond_set_mac_address,
        .ndo_neigh_setup        = bond_neigh_setup,
        .ndo_vlan_rx_add_vid        = bond_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid        = bond_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_netpoll_setup        = bond_netpoll_setup,
        .ndo_netpoll_cleanup        = bond_netpoll_cleanup,
        .ndo_poll_controller        = bond_poll_controller,
#endif
        .ndo_add_slave                = bond_enslave,
        .ndo_del_slave                = bond_release,
        .ndo_fix_features        = bond_fix_features,
        .ndo_features_check        = passthru_features_check,
        .ndo_get_xmit_slave        = bond_xmit_get_slave,
        .ndo_sk_get_lower_dev        = bond_sk_get_lower_dev,
        .ndo_bpf                = bond_xdp,
        .ndo_xdp_xmit           = bond_xdp_xmit,
        .ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave,
        .ndo_hwtstamp_get        = bond_hwtstamp_get,
        .ndo_hwtstamp_set        = bond_hwtstamp_set,
};

/* Device type attached to bond net_devices via SET_NETDEV_DEVTYPE(). */
static const struct device_type bond_type = { .name = "bond" };

static void bond_destructor(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);

        if (bond->wq)
                destroy_workqueue(bond->wq);

        free_percpu(bond->rr_tx_counter);
}

/* Initialise a freshly allocated bond net_device: private state, the
 * netdev/ethtool operation tables, destructor hooks, device flags and the
 * advertised feature sets.
 */
void bond_setup(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);

        spin_lock_init(&bond->mode_lock);
        /* Start from the module-wide default parameters. */
        bond->params = bonding_defaults;

        /* Initialize pointers */
        bond->dev = bond_dev;

        /* Initialize the device entry points */
        ether_setup(bond_dev);
        bond_dev->max_mtu = ETH_MAX_MTU;
        bond_dev->netdev_ops = &bond_netdev_ops;
        bond_dev->ethtool_ops = &bond_ethtool_ops;

        /* Run bond_destructor() and free the netdev when it is released. */
        bond_dev->needs_free_netdev = true;
        bond_dev->priv_destructor = bond_destructor;

        SET_NETDEV_DEVTYPE(bond_dev, &bond_type);

        /* Initialize the device options */
        bond_dev->flags |= IFF_MASTER;
        bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE;
        bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);

#ifdef CONFIG_XFRM_OFFLOAD
        /* set up xfrm device ops (only supported in active-backup right now) */
        bond_dev->xfrmdev_ops = &bond_xfrmdev_ops;
        INIT_LIST_HEAD(&bond->ipsec_list);
        mutex_init(&bond->ipsec_lock);
#endif /* CONFIG_XFRM_OFFLOAD */

        /* don't acquire bond device's netif_tx_lock when transmitting */
        bond_dev->lltx = true;

        /* Don't allow bond devices to change network namespaces. */
        bond_dev->netns_immutable = true;

        /* By default, we declare the bond to be fully
         * VLAN hardware accelerated capable. Special
         * care is taken in the various xmit functions
         * when there are slaves that are not hw accel
         * capable
         */

        bond_dev->hw_features = BOND_VLAN_FEATURES |
                                NETIF_F_HW_VLAN_CTAG_RX |
                                NETIF_F_HW_VLAN_CTAG_FILTER |
                                NETIF_F_HW_VLAN_STAG_RX |
                                NETIF_F_HW_VLAN_STAG_FILTER;

        bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL;
        /* Everything toggleable so far is also enabled by default. */
        bond_dev->features |= bond_dev->hw_features;
        bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
        bond_dev->features |= NETIF_F_GSO_PARTIAL;
#ifdef CONFIG_XFRM_OFFLOAD
        bond_dev->hw_features |= BOND_XFRM_FEATURES;
        /* Only enable XFRM features if this is an active-backup config.
         * The mode tested here is the one copied from bonding_defaults above.
         */
        if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP)
                bond_dev->features |= BOND_XFRM_FEATURES;
#endif /* CONFIG_XFRM_OFFLOAD */
}

/* Destroy a bonding device.
 * Must be under rtnl_lock when this function is called.
 *
 * Cleans up netpoll, releases every slave, clears the slave arrays,
 * unlinks the bond from the bond list and unregisters its debug entry.
 */
static void bond_uninit(struct net_device *bond_dev)
{
        struct bonding *bond = netdev_priv(bond_dev);
        struct list_head *iter;
        struct slave *slave;

        bond_netpoll_cleanup(bond_dev);

        /* Release the bonded slaves */
        bond_for_each_slave(bond, slave, iter)
                __bond_release_one(bond_dev, slave->dev, true, true);
        netdev_info(bond_dev, "Released all slaves\n");

#ifdef CONFIG_XFRM_OFFLOAD
        mutex_destroy(&bond->ipsec_lock);
#endif /* CONFIG_XFRM_OFFLOAD */

        /* Drop both the usable and the all-slaves arrays. */
        bond_set_slave_arr(bond, NULL, NULL);

        list_del_rcu(&bond->bond_list);

        bond_debug_unregister(bond);
}

/*------------------------- Module initialization ---------------------------*/

/* Validate the bonding module parameters and fill in @params.
 *
 * String parameters (mode, xmit_hash_policy, lacp_rate, ad_select,
 * arp_validate, arp_all_targets, primary_reselect, fail_over_mac) are
 * translated through the bond option tables; numeric parameters are
 * range-checked.  An out-of-range numeric value is reset to a default with
 * a warning.  An unparsable string aborts with -EINVAL, except for
 * arp_all_targets which only logs an error and falls back to 0.
 *
 * Several file-scope parameter variables (max_bonds, miimon, updelay,
 * downdelay, use_carrier, tx_queues, ...) are rewritten in place with their
 * sanitized values.  When miimon is non-zero, updelay and downdelay are
 * converted from milliseconds into a count of miimon intervals.
 *
 * @params: structure that receives the validated defaults.
 *
 * Returns 0 on success, -EINVAL on failure.
 */
static int __init bond_check_params(struct bond_params *params)
{
        int arp_validate_value, fail_over_mac_value, primary_reselect_value, i;
        struct bond_opt_value newval;
        const struct bond_opt_value *valptr;
        int arp_all_targets_value = 0;
        u16 ad_actor_sys_prio = 0;
        u16 ad_user_port_key = 0;
        __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 };
        int arp_ip_count;
        int bond_mode        = BOND_MODE_ROUNDROBIN;
        int xmit_hashtype = BOND_XMIT_POLICY_LAYER2;
        int lacp_fast = 0;
        int tlb_dynamic_lb;

        /* Convert string parameters. */
        if (mode) {
                bond_opt_initstr(&newval, mode);
                valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval);
                if (!valptr) {
                        pr_err("Error: Invalid bonding mode \"%s\"\n", mode);
                        return -EINVAL;
                }
                bond_mode = valptr->value;
        }

        if (xmit_hash_policy) {
                if (bond_mode == BOND_MODE_ROUNDROBIN ||
                    bond_mode == BOND_MODE_ACTIVEBACKUP ||
                    bond_mode == BOND_MODE_BROADCAST) {
                        pr_info("xmit_hash_policy param is irrelevant in mode %s\n",
                                bond_mode_name(bond_mode));
                } else {
                        bond_opt_initstr(&newval, xmit_hash_policy);
                        valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH),
                                                &newval);
                        if (!valptr) {
                                pr_err("Error: Invalid xmit_hash_policy \"%s\"\n",
                                       xmit_hash_policy);
                                return -EINVAL;
                        }
                        xmit_hashtype = valptr->value;
                }
        }

        if (lacp_rate) {
                if (bond_mode != BOND_MODE_8023AD) {
                        pr_info("lacp_rate param is irrelevant in mode %s\n",
                                bond_mode_name(bond_mode));
                } else {
                        bond_opt_initstr(&newval, lacp_rate);
                        valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE),
                                                &newval);
                        if (!valptr) {
                                pr_err("Error: Invalid lacp rate \"%s\"\n",
                                       lacp_rate);
                                return -EINVAL;
                        }
                        lacp_fast = valptr->value;
                }
        }

        if (ad_select) {
                bond_opt_initstr(&newval, ad_select);
                valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT),
                                        &newval);
                if (!valptr) {
                        pr_err("Error: Invalid ad_select \"%s\"\n", ad_select);
                        return -EINVAL;
                }
                params->ad_select = valptr->value;
                if (bond_mode != BOND_MODE_8023AD)
                        pr_warn("ad_select param only affects 802.3ad mode\n");
        } else {
                params->ad_select = BOND_AD_STABLE;
        }

        /* Range-check the numeric parameters, resetting bad values. */
        if (max_bonds < 0) {
                pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n",
                        max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS);
                max_bonds = BOND_DEFAULT_MAX_BONDS;
        }

        if (miimon < 0) {
                pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n",
                        miimon, INT_MAX);
                miimon = 0;
        }

        if (updelay < 0) {
                pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
                        updelay, INT_MAX);
                updelay = 0;
        }

        if (downdelay < 0) {
                pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n",
                        downdelay, INT_MAX);
                downdelay = 0;
        }

        if ((use_carrier != 0) && (use_carrier != 1)) {
                pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n",
                        use_carrier);
                use_carrier = 1;
        }

        if (num_peer_notif < 0 || num_peer_notif > 255) {
                pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n",
                        num_peer_notif);
                num_peer_notif = 1;
        }

        /* reset values for 802.3ad/TLB/ALB */
        if (!bond_mode_uses_arp(bond_mode)) {
                if (!miimon) {
                        pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n");
                        pr_warn("Forcing miimon to 100msec\n");
                        miimon = BOND_DEFAULT_MIIMON;
                }
        }

        if (tx_queues < 1 || tx_queues > 255) {
                pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n",
                        tx_queues, BOND_DEFAULT_TX_QUEUES);
                tx_queues = BOND_DEFAULT_TX_QUEUES;
        }

        if ((all_slaves_active != 0) && (all_slaves_active != 1)) {
                pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n",
                        all_slaves_active);
                all_slaves_active = 0;
        }

        if (resend_igmp < 0 || resend_igmp > 255) {
                pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n",
                        resend_igmp, BOND_DEFAULT_RESEND_IGMP);
                resend_igmp = BOND_DEFAULT_RESEND_IGMP;
        }

        bond_opt_initval(&newval, packets_per_slave);
        if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) {
                pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n",
                        packets_per_slave, USHRT_MAX);
                packets_per_slave = 1;
        }

        if (bond_mode == BOND_MODE_ALB) {
                pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n",
                          updelay);
        }

        if (!miimon) {
                if (updelay || downdelay) {
                        /* just warn the user the up/down delay will have
                         * no effect since miimon is zero...
                         */
                        pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n",
                                updelay, downdelay);
                }
        } else {
                /* don't allow arp monitoring */
                if (arp_interval) {
                        pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n",
                                miimon, arp_interval);
                        arp_interval = 0;
                }

                if ((updelay % miimon) != 0) {
                        pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n",
                                updelay, miimon, (updelay / miimon) * miimon);
                }

                /* From here on updelay counts miimon intervals, not ms */
                updelay /= miimon;

                if ((downdelay % miimon) != 0) {
                        pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n",
                                downdelay, miimon,
                                (downdelay / miimon) * miimon);
                }

                /* Likewise for downdelay */
                downdelay /= miimon;
        }

        if (arp_interval < 0) {
                pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n",
                        arp_interval, INT_MAX);
                arp_interval = 0;
        }

        /* Collect the distinct, valid ARP targets.  The index i is bounded
         * separately from arp_ip_count: skipped duplicates keep
         * arp_ip_count below the limit, so that test alone would let i run
         * past the end of a fully populated arp_ip_target[].
         * NOTE(review): assumes arp_ip_target has BOND_MAX_ARP_TARGETS
         * entries; its declaration is outside this function - confirm.
         */
        for (arp_ip_count = 0, i = 0;
             i < BOND_MAX_ARP_TARGETS &&
             arp_ip_count < BOND_MAX_ARP_TARGETS && arp_ip_target[i]; i++) {
                __be32 ip;

                /* not a complete check, but good enough to catch mistakes */
                if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) ||
                    !bond_is_ip_target_ok(ip)) {
                        pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n",
                                arp_ip_target[i]);
                        arp_interval = 0;
                } else {
                        if (bond_get_targets_ip(arp_target, ip) == -1)
                                arp_target[arp_ip_count++] = ip;
                        else
                                pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n",
                                        &ip);
                }
        }

        if (arp_interval && !arp_ip_count) {
                /* don't allow arping if no arp_ip_target given... */
                pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n",
                        arp_interval);
                arp_interval = 0;
        }

        if (arp_validate) {
                if (!arp_interval) {
                        pr_err("arp_validate requires arp_interval\n");
                        return -EINVAL;
                }

                bond_opt_initstr(&newval, arp_validate);
                valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE),
                                        &newval);
                if (!valptr) {
                        pr_err("Error: invalid arp_validate \"%s\"\n",
                               arp_validate);
                        return -EINVAL;
                }
                arp_validate_value = valptr->value;
        } else {
                arp_validate_value = 0;
        }

        if (arp_all_targets) {
                bond_opt_initstr(&newval, arp_all_targets);
                valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS),
                                        &newval);
                if (!valptr) {
                        /* Unlike the other string options this one is not
                         * fatal: log it and carry on with 0.
                         */
                        pr_err("Error: invalid arp_all_targets_value \"%s\"\n",
                               arp_all_targets);
                        arp_all_targets_value = 0;
                } else {
                        arp_all_targets_value = valptr->value;
                }
        }

        if (miimon) {
                pr_info("MII link monitoring set to %d ms\n", miimon);
        } else if (arp_interval) {
                valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE,
                                          arp_validate_value);
                pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):",
                        arp_interval, valptr->string, arp_ip_count);

                /* List the targets that were actually accepted; the raw
                 * arp_ip_target[] strings may contain skipped duplicates
                 * and so do not line up with arp_ip_count.
                 */
                for (i = 0; i < arp_ip_count; i++)
                        pr_cont(" %pI4", &arp_target[i]);

                pr_cont("\n");

        } else if (max_bonds) {
                /* miimon and arp_interval not set, we need one so things
                 * work as expected, see bonding.txt for details
                 */
                pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n");
        }

        if (primary && !bond_mode_uses_primary(bond_mode)) {
                /* currently, using a primary only makes sense
                 * in active backup, TLB or ALB modes
                 */
                pr_warn("Warning: %s primary device specified but has no effect in %s mode\n",
                        primary, bond_mode_name(bond_mode));
                primary = NULL;
        }

        if (primary && primary_reselect) {
                bond_opt_initstr(&newval, primary_reselect);
                valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT),
                                        &newval);
                if (!valptr) {
                        pr_err("Error: Invalid primary_reselect \"%s\"\n",
                               primary_reselect);
                        return -EINVAL;
                }
                primary_reselect_value = valptr->value;
        } else {
                primary_reselect_value = BOND_PRI_RESELECT_ALWAYS;
        }

        if (fail_over_mac) {
                bond_opt_initstr(&newval, fail_over_mac);
                valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC),
                                        &newval);
                if (!valptr) {
                        pr_err("Error: invalid fail_over_mac \"%s\"\n",
                               fail_over_mac);
                        return -EINVAL;
                }
                fail_over_mac_value = valptr->value;
                if (bond_mode != BOND_MODE_ACTIVEBACKUP)
                        pr_warn("Warning: fail_over_mac only affects active-backup mode\n");
        } else {
                fail_over_mac_value = BOND_FOM_NONE;
        }

        /* Options without a module parameter: look up their "default"
         * entry in the option tables.
         */
        bond_opt_initstr(&newval, "default");
        valptr = bond_opt_parse(
                        bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO),
                                     &newval);
        if (!valptr) {
                pr_err("Error: No ad_actor_sys_prio default value\n");
                return -EINVAL;
        }
        ad_actor_sys_prio = valptr->value;

        valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY),
                                &newval);
        if (!valptr) {
                pr_err("Error: No ad_user_port_key default value\n");
                return -EINVAL;
        }
        ad_user_port_key = valptr->value;

        bond_opt_initstr(&newval, "default");
        valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval);
        if (!valptr) {
                pr_err("Error: No tlb_dynamic_lb default value\n");
                return -EINVAL;
        }
        tlb_dynamic_lb = valptr->value;

        if (lp_interval == 0) {
                pr_warn("Warning: lp_interval must be between 1 and %d, so it was reset to %d\n",
                        INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL);
                lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL;
        }

        /* fill params struct with the proper values */
        params->mode = bond_mode;
        params->xmit_policy = xmit_hashtype;
        params->miimon = miimon;
        params->num_peer_notif = num_peer_notif;
        params->arp_interval = arp_interval;
        params->arp_validate = arp_validate_value;
        params->arp_all_targets = arp_all_targets_value;
        params->missed_max = 2;
        params->updelay = updelay;
        params->downdelay = downdelay;
        params->peer_notif_delay = 0;
        params->use_carrier = use_carrier;
        params->lacp_active = 1;
        params->lacp_fast = lacp_fast;
        params->primary[0] = 0;
        params->primary_reselect = primary_reselect_value;
        params->fail_over_mac = fail_over_mac_value;
        params->tx_queues = tx_queues;
        params->all_slaves_active = all_slaves_active;
        params->resend_igmp = resend_igmp;
        params->min_links = min_links;
        params->lp_interval = lp_interval;
        params->packets_per_slave = packets_per_slave;
        params->tlb_dynamic_lb = tlb_dynamic_lb;
        params->ad_actor_sys_prio = ad_actor_sys_prio;
        eth_zero_addr(params->ad_actor_system);
        params->ad_user_port_key = ad_user_port_key;
        params->coupled_control = 1;
        if (packets_per_slave > 0) {
                params->reciprocal_packets_per_slave =
                        reciprocal_value(packets_per_slave);
        } else {
                /* reciprocal_packets_per_slave is unused if
                 * packets_per_slave is 0 or 1, just initialize it
                 */
                params->reciprocal_packets_per_slave =
                        (struct reciprocal_value) { 0 };
        }

        if (primary)
                strscpy_pad(params->primary, primary, sizeof(params->primary));

        memcpy(params->arp_targets, arp_target, sizeof(arp_target));
#if IS_ENABLED(CONFIG_IPV6)
        memset(params->ns_targets, 0, sizeof(struct in6_addr) * BOND_MAX_NS_TARGETS);
#endif

        return 0;
}

/* Called from registration process.
 *
 * Allocates the bond's ordered workqueue, links the bond into the device
 * list of its network namespace and makes sure it has a usable MAC address.
 * Returns 0 on success or -ENOMEM if the workqueue cannot be allocated.
 */
static int bond_init(struct net_device *bond_dev)
{
        struct bond_net *bnet = net_generic(dev_net(bond_dev), bond_net_id);
        struct bonding *bond = netdev_priv(bond_dev);

        netdev_dbg(bond_dev, "Begin bond_init\n");

        /* The workqueue is named after the bond device itself */
        bond->wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM,
                                           bond_dev->name);
        if (!bond->wq)
                return -ENOMEM;

        bond->notifier_ctx = false;

        spin_lock_init(&bond->stats_lock);
        netdev_lockdep_set_classes(bond_dev);

        list_add_tail_rcu(&bond->bond_list, &bnet->dev_list);

        bond_prepare_sysfs_group(bond);

        bond_debug_register(bond);

        /* An all-zero address marked as permanent is not usable:
         * replace it with a random one.
         */
        if (bond_dev->addr_assign_type == NET_ADDR_PERM &&
            is_zero_ether_addr(bond_dev->dev_addr))
                eth_hw_addr_random(bond_dev);

        return 0;
}

/* Report the tx_queues module parameter (bond_check_params() resets it to
 * BOND_DEFAULT_TX_QUEUES when it is outside 1-255).
 */
unsigned int bond_get_num_tx_queues(void)
{
        return (unsigned int)tx_queues;
}

/* Allocate and register a new bond device in @net.
 * @name is the interface name to use; when it is NULL the "bond%d"
 * template is used so that a free name is picked for us.
 * Caller must NOT hold rtnl_lock: it is taken and released in here.
 * Returns 0 on success, -ENOMEM if the device cannot be allocated, or the
 * negative error reported by register_netdevice().
 */
int bond_create(struct net *net, const char *name)
{
        const char *ifname = name ? name : "bond%d";
        struct net_device *bond_dev;
        struct bonding *bond;
        int res;

        rtnl_lock();

        bond_dev = alloc_netdev_mq(sizeof(struct bonding), ifname,
                                   NET_NAME_UNKNOWN, bond_setup, tx_queues);
        if (!bond_dev) {
                res = -ENOMEM;
                goto out;
        }

        bond = netdev_priv(bond_dev);
        dev_net_set(bond_dev, net);
        bond_dev->rtnl_link_ops = &bond_link_ops;

        res = register_netdevice(bond_dev);
        if (res < 0) {
                /* Registration failed, so the device is still ours to free */
                free_netdev(bond_dev);
                goto out;
        }

        netif_carrier_off(bond_dev);
        bond_work_init_all(bond);

out:
        rtnl_unlock();
        return res;
}

/* Per-namespace init: set up the bond_net bookkeeping for @net and create
 * its proc directory and sysfs entries.  Always returns 0.
 */
static int __net_init bond_net_init(struct net *net)
{
        struct bond_net *bnet = net_generic(net, bond_net_id);

        INIT_LIST_HEAD(&bnet->dev_list);
        bnet->net = net;

        bond_create_proc_dir(bnet);
        bond_create_sysfs(bnet);

        return 0;
}

/* Per-namespace pre-exit hook.
 * Commit 69b0216ac255 ("bonding: fix bonding_masters race condition in
 * bond unloading") requires the sysfs files to go away before the devices
 * themselves, which are removed later in bond_net_exit_batch_rtnl().
 */
static void __net_exit bond_net_pre_exit(struct net *net)
{
        bond_destroy_sysfs(net_generic(net, bond_net_id));
}

/* For every namespace on @net_list, queue each bond still present on that
 * namespace's device list onto @dev_kill_list for unregistration.
 */
static void __net_exit bond_net_exit_batch_rtnl(struct list_head *net_list,
                                                struct list_head *dev_kill_list)
{
        struct net *net;

        /* Kill off any bonds created after unregistering bond rtnl ops */
        list_for_each_entry(net, net_list, exit_list) {
                struct bond_net *bnet = net_generic(net, bond_net_id);
                struct bonding *cur, *next;

                list_for_each_entry_safe(cur, next, &bnet->dev_list, bond_list)
                        unregister_netdevice_queue(cur->dev, dev_kill_list);
        }
}

/* Remove the bonding proc directory of every namespace on @net_list.
 * Per commit 23fa5c2caae0 ("bonding: destroy proc directory only after
 * all bonds are gone") this runs after bond_net_exit_batch_rtnl() has
 * completed.
 */
static void __net_exit bond_net_exit_batch(struct list_head *net_list)
{
        struct net *net;

        list_for_each_entry(net, net_list, exit_list)
                bond_destroy_proc_dir(net_generic(net, bond_net_id));
}

/* Per-network-namespace hooks of the bonding driver */
static struct pernet_operations bond_net_ops = {
        /* per-namespace private data: a struct bond_net found via bond_net_id */
        .id              = &bond_net_id,
        .size            = sizeof(struct bond_net),
        /* setup */
        .init            = bond_net_init,
        /* teardown hooks */
        .pre_exit        = bond_net_pre_exit,
        .exit_batch_rtnl = bond_net_exit_batch_rtnl,
        .exit_batch      = bond_net_exit_batch,
};

/* Module entry point.
 *
 * Validates the module parameters into bonding_defaults, brings up debugfs,
 * the per-namespace operations and the netlink interface, creates max_bonds
 * bond devices in init_net and finally registers the netdevice notifier.
 * Any failure unwinds the steps already taken, in reverse order.
 *
 * Returns 0 on success or a negative errno.
 */
static int __init bonding_init(void)
{
        int i;
        int res;

        res = bond_check_params(&bonding_defaults);
        if (res)
                return res;

        bond_create_debugfs();

        res = register_pernet_subsys(&bond_net_ops);
        if (res)
                goto err_net_ops;

        res = bond_netlink_init();
        if (res)
                goto err_link;

        for (i = 0; i < max_bonds; i++) {
                res = bond_create(&init_net, NULL);
                if (res)
                        goto err;
        }

        skb_flow_dissector_init(&flow_keys_bonding,
                                flow_keys_bonding_keys,
                                ARRAY_SIZE(flow_keys_bonding_keys));

        /* Do not report a successful load if the notifier could not be
         * registered; unwind exactly like a bond_create() failure.
         */
        res = register_netdevice_notifier(&bond_netdev_notifier);
        if (res)
                goto err;

        return 0;

err:
        bond_netlink_fini();
err_link:
        unregister_pernet_subsys(&bond_net_ops);
err_net_ops:
        bond_destroy_debugfs();
        return res;
}

/* Module exit point: undo bonding_init() in reverse order - netdevice
 * notifier, netlink interface, per-namespace operations, then debugfs.
 */
static void __exit bonding_exit(void)
{
        unregister_netdevice_notifier(&bond_netdev_notifier);

        bond_netlink_fini();
        unregister_pernet_subsys(&bond_net_ops);

        bond_destroy_debugfs();

#ifdef CONFIG_NET_POLL_CONTROLLER
        /* Make sure we don't have an imbalance on our netpoll blocking */
        WARN_ON(atomic_read(&netpoll_block_tx));
#endif
}

/* Module entry/exit hooks and module metadata */
module_init(bonding_init);
module_exit(bonding_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION(DRV_DESCRIPTION);
MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");





















































































































  709 














  503 


   11 
  505 









  128 


























































































































































































































































































































































































































































































































































   16 








































































































































































































































































































































































































































































































































































  348 

  348 



















   89 










  118 



































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGTABLE_H
#define _LINUX_PGTABLE_H

#include <linux/pfn.h>
#include <asm/pgtable.h>

/*
 * Difference between the PMD (respectively PUD) shift and the base page
 * shift, i.e. log2 of the number of PAGE_SIZE pages spanned by one entry of
 * that level (assuming PMD_SHIFT/PUD_SHIFT are the address shifts of those
 * levels, as their names suggest).
 */
#define PMD_ORDER        (PMD_SHIFT - PAGE_SHIFT)
#define PUD_ORDER        (PUD_SHIFT - PAGE_SHIFT)

#ifndef __ASSEMBLY__
#ifdef CONFIG_MMU

#include <linux/mm_types.h>
#include <linux/bug.h>
#include <linux/errno.h>
#include <asm-generic/pgtable_uffd.h>
#include <linux/page_table_check.h>

#if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \
        defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS
#error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED
#endif

/*
 * On almost all architectures and configurations, 0 can be used as the
 * upper ceiling to free_pgtables(): on many architectures it has the same
 * effect as using TASK_SIZE.  However, there is one configuration which
 * must impose a more careful limit, to avoid freeing kernel pgtables.
 */
#ifndef USER_PGTABLES_CEILING
#define USER_PGTABLES_CEILING        0UL
#endif

/*
 * This defines the first usable user address. Platforms
 * can override its value with custom FIRST_USER_ADDRESS
 * defined in their respective <asm/pgtable.h>.
 */
#ifndef FIRST_USER_ADDRESS
#define FIRST_USER_ADDRESS        0UL
#endif

/*
 * This defines the generic helper for accessing PMD page
 * table page. Although platforms can still override this
 * via their respective <asm/pgtable.h>.
 */
#ifndef pmd_pgtable
#define pmd_pgtable(pmd) pmd_page(pmd)
#endif

#define pmd_folio(pmd) page_folio(pmd_page(pmd))

/*
 * A page table page can be thought of as an array like this: pXd_t[PTRS_PER_PxD]
 *
 * The pXx_index() functions return the index of the entry in the page
 * table page which would control the given virtual address
 *
 * As these functions may be used by the same code for different levels of
 * the page table folding, they are always available, regardless of
 * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0
 * because in such cases PTRS_PER_PxD equals 1.
 */

/* Index of the PTE slot that controls @address within its page table page. */
static inline unsigned long pte_index(unsigned long address)
{
        unsigned long page_nr = address >> PAGE_SHIFT;

        return page_nr & (PTRS_PER_PTE - 1);
}

#ifndef pmd_index
/* Index of the PMD slot that controls @address within its page table page. */
static inline unsigned long pmd_index(unsigned long address)
{
        unsigned long pmd_nr = address >> PMD_SHIFT;

        return pmd_nr & (PTRS_PER_PMD - 1);
}
#define pmd_index pmd_index
#endif

#ifndef pud_index
/* Index of the PUD slot that controls @address within its page table page. */
static inline unsigned long pud_index(unsigned long address)
{
        unsigned long pud_nr = address >> PUD_SHIFT;

        return pud_nr & (PTRS_PER_PUD - 1);
}
#define pud_index pud_index
#endif

#ifndef pgd_index
/* Must be a compile-time constant, so implement it as a macro */
#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
#endif

#ifndef kernel_pte_init
/*
 * Generic fallback, compiled only when kernel_pte_init is not already
 * defined: does nothing and ignores @addr.
 */
static inline void kernel_pte_init(void *addr)
{
}
#define kernel_pte_init kernel_pte_init
#endif

#ifndef pmd_init
/*
 * Generic fallback, compiled only when pmd_init is not already defined:
 * does nothing and ignores @addr.
 */
static inline void pmd_init(void *addr)
{
}
#define pmd_init pmd_init
#endif

#ifndef pud_init
/*
 * Generic fallback, compiled only when pud_init is not already defined:
 * does nothing and ignores @addr.
 */
static inline void pud_init(void *addr)
{
}
#define pud_init pud_init
#endif

#ifndef pte_offset_kernel
/*
 * Return a pointer to the PTE for @address inside the page table that
 * *@pmd refers to, using the table's address from pmd_page_vaddr().
 */
static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
        pte_t *table = (pte_t *)pmd_page_vaddr(*pmd);

        return table + pte_index(address);
}
#define pte_offset_kernel pte_offset_kernel
#endif

/*
 * __pte_map() yields a pointer to the PTE for @address inside the page table
 * that @pmd points to; pte_unmap() undoes it.  With CONFIG_HIGHPTE the page
 * table page is mapped with kmap_local_page() and released again with
 * kunmap_local(); otherwise the lookup is simply pte_offset_kernel() and
 * there is nothing to unmap.
 *
 * Both pte_unmap() variants end with rcu_read_unlock().  The matching
 * rcu_read_lock() is not taken in this header -- presumably by whoever calls
 * __pte_map(); verify against the users.
 */
#ifdef CONFIG_HIGHPTE
#define __pte_map(pmd, address) \
        ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address)))
#define pte_unmap(pte)        do {        \
        kunmap_local((pte));        \
        rcu_read_unlock();        \
} while (0)
#else
static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address)
{
        return pte_offset_kernel(pmd, address);
}
/* @pte is unused here: no temporary mapping was created by __pte_map(). */
static inline void pte_unmap(pte_t *pte)
{
        rcu_read_unlock();
}
#endif

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable);

/* Find an entry in the second-level page table. */
#ifndef pmd_offset
/*
 * Return a pointer to the PMD entry for @address inside the PMD table that
 * *@pud refers to.
 */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
        pmd_t *table = pud_pgtable(*pud);

        return table + pmd_index(address);
}
#define pmd_offset pmd_offset
#endif

#ifndef pud_offset
/*
 * Return a pointer to the PUD entry for @address inside the PUD table that
 * *@p4d refers to.
 */
static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
        pud_t *table = p4d_pgtable(*p4d);

        return table + pud_index(address);
}
#define pud_offset pud_offset
#endif

/*
 * Return a pointer to the entry of the top-level table @pgd that covers
 * @address, i.e. @pgd advanced by pgd_index(@address) entries.
 */
static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
        return pgd + pgd_index(address);
}

/*
 * a shortcut to get a pgd_t in a given mm
 */
#ifndef pgd_offset
#define pgd_offset(mm, address)                pgd_offset_pgd((mm)->pgd, (address))
#endif

/*
 * a shortcut which implies the use of the kernel's pgd, instead
 * of a process's
 */
#define pgd_offset_k(address)                pgd_offset(&init_mm, (address))

/*
 * In many cases it is known that a virtual address is mapped at PMD or PTE
 * level, so instead of traversing all the page table levels, we can get a
 * pointer to the PMD entry in user or kernel page table or translate a virtual
 * address to the pointer in the PTE in the kernel page tables with simple
 * helpers.
 */
static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va);
}

static inline pmd_t *pmd_off_k(unsigned long va)
{
        return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va);
}

/*
 * Translate kernel virtual address @vaddr to a pointer to its PTE in the
 * kernel page tables, or NULL when the covering PMD entry is none.
 */
static inline pte_t *virt_to_kpte(unsigned long vaddr)
{
        pmd_t *pmd = pmd_off_k(vaddr);

        if (pmd_none(*pmd))
                return NULL;

        return pte_offset_kernel(pmd, vaddr);
}

#ifndef pmd_young
/*
 * Fallback compiled only when pmd_young is not already defined: @pmd is
 * ignored and the entry is always reported as not young (0).
 */
static inline int pmd_young(pmd_t pmd)
{
        return 0;
}
#endif

#ifndef pmd_dirty
/*
 * Fallback compiled only when pmd_dirty is not already defined: @pmd is
 * ignored and the entry is always reported as not dirty (0).
 */
static inline int pmd_dirty(pmd_t pmd)
{
        return 0;
}
#endif

/*
 * A facility to provide lazy MMU batching.  This allows PTE updates and
 * page invalidations to be delayed until a call to leave lazy MMU mode
 * is issued.  Some architectures may benefit from doing this, and it is
 * beneficial for both shadow and direct mode hypervisors, which may batch
 * the PTE updates which happen during this window.  Note that using this
 * interface requires that read hazards be removed from the code.  A read
 * hazard could result in the direct mode hypervisor case, since the actual
 * write to the page tables may not yet have taken place, so reads through
 * a raw PTE pointer after it has been modified are not guaranteed to be
 * up to date.
 *
 * In the general case, no lock is guaranteed to be held between entry and exit
 * of the lazy mode. So the implementation must assume preemption may be enabled
 * and cpu migration is possible; it must take steps to be robust against this.
 * (In practice, for user PTE updates, the appropriate page table lock(s) are
 * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
 * and the mode cannot be used in interrupt context.
 */
#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
#define arch_enter_lazy_mmu_mode()        do {} while (0)
#define arch_leave_lazy_mmu_mode()        do {} while (0)
#define arch_flush_lazy_mmu_mode()        do {} while (0)
#endif

#ifndef pte_batch_hint
/**
 * pte_batch_hint - Number of pages that can be added to batch without scanning.
 * @ptep: Page table pointer for the entry.
 * @pte: Page table entry.
 *
 * Some architectures know that a set of contiguous ptes all map the same
 * contiguous memory with the same permissions. In this case, it can provide a
 * hint to aid pte batching without the core code needing to scan every pte.
 *
 * An architecture implementation may ignore the PTE accessed state. Further,
 * the dirty state must apply atomically to all the PTEs described by the hint.
 *
 * May be overridden by the architecture, else pte_batch_hint is always 1.
 */
static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte)
{
        /* Generic version has no batching knowledge: both arguments are ignored. */
        return 1;
}
#endif

#ifndef pte_advance_pfn
/*
 * Return @pte with @nr << PFN_PTE_SHIFT added to its raw value, i.e. with
 * the pfn field advanced by @nr and everything else left as it was
 * (assuming the pfn occupies the bits from PFN_PTE_SHIFT upwards).
 */
static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr)
{
        unsigned long pfn_delta = nr << PFN_PTE_SHIFT;

        return __pte(pte_val(pte) + pfn_delta);
}
#endif

#define pte_next_pfn(pte) pte_advance_pfn(pte, 1)

#ifndef set_ptes
/**
 * set_ptes - Install PTEs for a run of consecutive pages.
 * @mm: Address space receiving the mappings.
 * @addr: Virtual address at which the first page is mapped.
 * @ptep: Pointer to the page table slot of the first page.
 * @pte: Entry to install for the first page.
 * @nr: How many consecutive pages to map.
 *
 * With nr==1 the slot may start out present or not present, and may end up
 * present or not present.  With nr>1 every slot must start out not present
 * and must end up present.
 *
 * An architecture may override this, or simply provide set_pte() and
 * PFN_PTE_SHIFT and rely on this generic version.
 *
 * Context: The caller holds the page table lock.  The pages all belong
 * to the same folio.  The PTEs are all in the same PMD.
 */
static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, pte_t pte, unsigned int nr)
{
        page_table_check_ptes_set(mm, ptep, pte, nr);

        /* First slot gets @pte as given; each later slot maps the next pfn. */
        set_pte(ptep, pte);
        while (--nr) {
                ptep++;
                pte = pte_next_pfn(pte);
                set_pte(ptep, pte);
        }
}
#endif
#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pte_t *ptep,
                                 pte_t entry, int dirty);
#endif

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp,
                                 pmd_t entry, int dirty);
extern int pudp_set_access_flags(struct vm_area_struct *vma,
                                 unsigned long address, pud_t *pudp,
                                 pud_t entry, int dirty);
#else
/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE.  BUILD_BUG() is meant
 * to break the build if a call to this stub is not optimised away (confirm
 * against the BUILD_BUG() definition); the return value only satisfies the
 * function's type.
 */
static inline int pmdp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pmd_t *pmdp,
                                        pmd_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE.  BUILD_BUG() is meant
 * to break the build if a call to this stub is not optimised away (confirm
 * against the BUILD_BUG() definition); the return value only satisfies the
 * function's type.
 */
static inline int pudp_set_access_flags(struct vm_area_struct *vma,
                                        unsigned long address, pud_t *pudp,
                                        pud_t entry, int dirty)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef ptep_get
/* Load the PTE stored at @ptep with a single READ_ONCE() access. */
static inline pte_t ptep_get(pte_t *ptep)
{
        pte_t pte = READ_ONCE(*ptep);

        return pte;
}
#endif

#ifndef pmdp_get
/* Load the PMD entry stored at @pmdp with a single READ_ONCE() access. */
static inline pmd_t pmdp_get(pmd_t *pmdp)
{
        pmd_t pmd = READ_ONCE(*pmdp);

        return pmd;
}
#endif

#ifndef pudp_get
/* Load the PUD entry stored at @pudp with a single READ_ONCE() access. */
static inline pud_t pudp_get(pud_t *pudp)
{
        pud_t pud = READ_ONCE(*pudp);

        return pud;
}
#endif

#ifndef p4dp_get
/* Load the P4D entry stored at @p4dp with a single READ_ONCE() access. */
static inline p4d_t p4dp_get(p4d_t *p4dp)
{
        p4d_t p4d = READ_ONCE(*p4dp);

        return p4d;
}
#endif

#ifndef pgdp_get
/* Load the PGD entry stored at @pgdp with a single READ_ONCE() access. */
static inline pgd_t pgdp_get(pgd_t *pgdp)
{
        pgd_t pgd = READ_ONCE(*pgdp);

        return pgd;
}
#endif

#ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
/*
 * Generic test-and-clear of the young state of the PTE at @ptep.
 *
 * Returns 0 and leaves the entry alone if it was not young; otherwise
 * rewrites it via set_pte_at() with pte_mkold() applied and returns 1.
 * The read and the write are separate steps, not one atomic operation.
 */
static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pte_t *ptep)
{
        pte_t pte = ptep_get(ptep);

        if (!pte_young(pte))
                return 0;

        set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte));
        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
/*
 * Generic test-and-clear of the young state of the PMD entry at @pmdp.
 *
 * Returns 0 and leaves the entry alone if it was not young; otherwise
 * rewrites it via set_pmd_at() with pmd_mkold() applied and returns 1.
 * The read and the write are separate steps, not one atomic operation.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        /*
         * Use the pmdp_get() accessor (a single READ_ONCE() load in the
         * generic case) rather than a plain dereference, mirroring the
         * ptep_get() used by ptep_test_and_clear_young().
         */
        pmd_t pmd = pmdp_get(pmdp);

        if (!pmd_young(pmd))
                return 0;

        set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd));
        return 1;
}
#else
/*
 * Stub used when neither CONFIG_TRANSPARENT_HUGEPAGE nor
 * CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG is set.  BUILD_BUG() is meant to break
 * the build if a call to this stub is not optimised away (confirm against
 * the BUILD_BUG() definition); the return value only satisfies the type.
 */
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                  unsigned long address, pmd_t *pmdp);
#else
/*
 * Although relevant to THP only, this API is called from generic rmap code
 * under PageTransHuge(), hence a dummy implementation is needed for !THP.
 * BUILD_BUG() is meant to break the build if such a call is not optimised
 * away (confirm against the BUILD_BUG() definition).
 */
static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef arch_has_hw_nonleaf_pmd_young
/*
 * Return whether the accessed bit in non-leaf PMD entries is supported on the
 * local CPU.
 *
 * This generic version answers purely from the build configuration: it is
 * true exactly when CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG is enabled.
 */
static inline bool arch_has_hw_nonleaf_pmd_young(void)
{
        return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
}
#endif

#ifndef arch_has_hw_pte_young
/*
 * Return whether the accessed bit is supported on the local CPU.
 *
 * This stub assumes accessing through an old PTE triggers a page fault.
 * Architectures that automatically set the access bit should override it.
 * The generic answer comes purely from the build configuration: it is true
 * exactly when CONFIG_ARCH_HAS_HW_PTE_YOUNG is enabled.
 */
static inline bool arch_has_hw_pte_young(void)
{
        return IS_ENABLED(CONFIG_ARCH_HAS_HW_PTE_YOUNG);
}
#endif

#ifndef arch_check_zapped_pte
/*
 * Generic no-op, compiled only when arch_check_zapped_pte is not already
 * defined; both arguments are ignored.
 */
static inline void arch_check_zapped_pte(struct vm_area_struct *vma,
                                         pte_t pte)
{
}
#endif

#ifndef arch_check_zapped_pmd
/*
 * Generic no-op, compiled only when arch_check_zapped_pmd is not already
 * defined; both arguments are ignored.
 */
static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
                                         pmd_t pmd)
{
}
#endif

#ifndef arch_check_zapped_pud
/*
 * Generic no-op, compiled only when arch_check_zapped_pud is not already
 * defined; both arguments are ignored.
 */
static inline void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud)
{
}
#endif

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
/*
 * Generic get-and-clear: read the PTE at @ptep, clear the slot with
 * pte_clear(), report the removed value to page_table_check, and return it.
 * The read and the clear are separate steps, not one atomic operation.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
                                       unsigned long address,
                                       pte_t *ptep)
{
        pte_t old_pte;

        old_pte = ptep_get(ptep);
        pte_clear(mm, address, ptep);
        page_table_check_pte_clear(mm, old_pte);

        return old_pte;
}
#endif

#ifndef clear_young_dirty_ptes
/**
 * clear_young_dirty_ptes - Mark PTEs that map consecutive pages of the
 *                same folio as old/clean.
 * @vma: VMA the pages are mapped into; its vm_mm is the address space used.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to mark old/clean.
 * @flags: Flags to modify the PTE batch semantics (CYDP_CLEAR_YOUNG and/or
 *         CYDP_CLEAR_DIRTY).
 *
 * May be overridden by the architecture; otherwise, implemented by
 * get_and_clear/modify/set for each pte in the range.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_young_dirty_ptes(struct vm_area_struct *vma,
                                          unsigned long addr, pte_t *ptep,
                                          unsigned int nr, cydp_t flags)
{
        pte_t pte;

        for (;;) {
                /* Only "young" requested: ptep_test_and_clear_young() suffices. */
                if (flags == CYDP_CLEAR_YOUNG)
                        ptep_test_and_clear_young(vma, addr, ptep);
                else {
                        /* Otherwise take the entry out, adjust it, and put it back. */
                        pte = ptep_get_and_clear(vma->vm_mm, addr, ptep);
                        if (flags & CYDP_CLEAR_YOUNG)
                                pte = pte_mkold(pte);
                        if (flags & CYDP_CLEAR_DIRTY)
                                pte = pte_mkclean(pte);
                        set_pte_at(vma->vm_mm, addr, ptep, pte);
                }
                /* @nr is decremented before the test, so it must be >= 1 on entry. */
                if (--nr == 0)
                        break;
                ptep++;
                addr += PAGE_SIZE;
        }
}
#endif

/*
 * Clear the PTE at @ptep and report the value that was there to
 * page_table_check.  Unlike ptep_get_and_clear(), nothing is returned.
 */
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep)
{
        pte_t old_pte;

        old_pte = ptep_get(ptep);
        pte_clear(mm, addr, ptep);

        /*
         * ptep_get_and_clear() is not needed here: page table check does not
         * care about any bits that HW could have set concurrently.
         */
        page_table_check_pte_clear(mm, old_pte);
}

#ifdef CONFIG_GUP_GET_PXX_LOW_HIGH
/*
 * For walking the pagetables without holding any locks.  Some architectures
 * (eg x86-32 PAE) cannot load the entries atomically without using expensive
 * instructions.  We are guaranteed that a PTE will only either go from not
 * present to present, or present to not present -- it will not switch to a
 * completely different present page without a TLB flush in between; which we
 * are blocking by holding interrupts off.
 *
 * Setting ptes from not present to present goes:
 *
 *   ptep->pte_high = h;
 *   smp_wmb();
 *   ptep->pte_low = l;
 *
 * And present to not present goes:
 *
 *   ptep->pte_low = 0;
 *   smp_wmb();
 *   ptep->pte_high = 0;
 *
 * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
 * We load pte_high *after* loading pte_low, which ensures we don't see an older
 * value of pte_high.  *Then* we recheck pte_low, which ensures that we haven't
 * picked up a changed pte high. We might have gotten rubbish values from
 * pte_low and pte_high, but we are guaranteed that pte_low will not have the
 * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
 * operates on present ptes we're safe.
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        pte_t pte;

        do {
                pte.pte_low = ptep->pte_low;
                /* Keep the pte_low load ahead of the pte_high load. */
                smp_rmb();
                pte.pte_high = ptep->pte_high;
                /* Keep the pte_high load ahead of the pte_low recheck below. */
                smp_rmb();
                /* Retry for as long as pte_low changed while we were reading. */
        } while (unlikely(pte.pte_low != ptep->pte_low));

        return pte;
}
#define ptep_get_lockless ptep_get_lockless

#if CONFIG_PGTABLE_LEVELS > 2
/*
 * PMD counterpart of the split low/high ptep_get_lockless(): read pmd_low,
 * then pmd_high, then recheck pmd_low, retrying until pmd_low is unchanged
 * across the read.
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        pmd_t pmd;

        do {
                pmd.pmd_low = pmdp->pmd_low;
                /* Keep the pmd_low load ahead of the pmd_high load. */
                smp_rmb();
                pmd.pmd_high = pmdp->pmd_high;
                /* Keep the pmd_high load ahead of the pmd_low recheck below. */
                smp_rmb();
        } while (unlikely(pmd.pmd_low != pmdp->pmd_low));

        return pmd;
}
#define pmdp_get_lockless pmdp_get_lockless
#define pmdp_get_lockless_sync() tlb_remove_table_sync_one()
#endif /* CONFIG_PGTABLE_LEVELS > 2 */
#endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */

/*
 * We require that the PTE can be read atomically.
 */
#ifndef ptep_get_lockless
/*
 * Default used when no split low/high variant was defined: the lockless read
 * is simply ptep_get().
 */
static inline pte_t ptep_get_lockless(pte_t *ptep)
{
        return ptep_get(ptep);
}
#endif

#ifndef pmdp_get_lockless
/*
 * Default used when no split low/high variant was defined: the lockless read
 * is simply pmdp_get().
 */
static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
{
        return pmdp_get(pmdp);
}
/*
 * Default no-op.  Only the split low/high configuration maps this to
 * tlb_remove_table_sync_one().
 */
static inline void pmdp_get_lockless_sync(void)
{
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
/*
 * Generic get-and-clear for a PMD entry: read the entry at @pmdp, clear the
 * slot with pmd_clear(), report the removed value to page_table_check, and
 * return it.  @address is unused here.  The read and the clear are separate
 * steps, not one atomic operation.
 */
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pmd_t *pmdp)
{
        /*
         * Read through the pmdp_get() accessor rather than a plain
         * dereference, mirroring ptep_get_and_clear()'s use of ptep_get().
         */
        pmd_t pmd = pmdp_get(pmdp);

        pmd_clear(pmdp);
        page_table_check_pmd_clear(mm, pmd);

        return pmd;
}
#endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
/*
 * Generic get-and-clear for a PUD entry: read the entry at @pudp, clear the
 * slot with pud_clear(), report the removed value to page_table_check, and
 * return it.  @address is unused here.  The read and the clear are separate
 * steps, not one atomic operation.
 */
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
                                            unsigned long address,
                                            pud_t *pudp)
{
        /*
         * Read through the pudp_get() accessor rather than a plain
         * dereference, mirroring ptep_get_and_clear()'s use of ptep_get().
         */
        pud_t pud = pudp_get(pudp);

        pud_clear(pudp);
        page_table_check_pud_clear(mm, pud);

        return pud;
}
#endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the call is forwarded to
 * pmdp_huge_get_and_clear() on @vma->vm_mm.
 */
static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pmd_t *pmdp,
                                            int full)
{
        return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
}
#endif

#ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the call is forwarded to
 * pudp_huge_get_and_clear() on @vma->vm_mm.
 */
static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
                                            unsigned long address, pud_t *pudp,
                                            int full)
{
        return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
}
#endif
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
/*
 * Generic version: @full is ignored and the call is forwarded to
 * ptep_get_and_clear().
 */
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
                                            unsigned long address, pte_t *ptep,
                                            int full)
{
        return ptep_get_and_clear(mm, address, ptep);
}
#endif

#ifndef get_and_clear_full_ptes
/**
 * get_and_clear_full_ptes - Clear a run of present PTEs mapping consecutive
 *                             pages of one folio, accumulating dirty/accessed.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * Architectures may override this.  The generic version calls
 * ptep_get_and_clear_full() on every entry and returns the first entry's
 * value, marked dirty and/or young if any entry in the run was.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        pte_t merged = ptep_get_and_clear_full(mm, addr, ptep, full);

        for (--nr; nr != 0; --nr) {
                pte_t cur;

                ptep++;
                addr += PAGE_SIZE;
                cur = ptep_get_and_clear_full(mm, addr, ptep, full);

                /* Fold this entry's dirty/young state into the result. */
                if (pte_dirty(cur))
                        merged = pte_mkdirty(merged);
                if (pte_young(cur))
                        merged = pte_mkyoung(merged);
        }

        return merged;
}
#endif

#ifndef clear_full_ptes
/**
 * clear_full_ptes - Clear a run of present PTEs mapping consecutive pages of
 *                     one folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * Architectures may override this.  The generic version calls
 * ptep_get_and_clear_full() on every entry and discards the old values.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr, int full)
{
        ptep_get_and_clear_full(mm, addr, ptep, full);
        while (--nr) {
                ptep++;
                addr += PAGE_SIZE;
                ptep_get_and_clear_full(mm, addr, ptep, full);
        }
}
#endif

/*
 * If two threads concurrently fault at the same page, the thread that
 * won the race updates the PTE and its local TLB/Cache. The other thread
 * gives up, simply does nothing, and continues; on architectures where
 * software can update the TLB, the local TLB can be updated here to avoid
 * the next page fault. This function updates the TLB only and does nothing
 * with the cache or anything else; that is how it differs from
 * update_mmu_cache().
 */
#ifndef update_mmu_tlb_range
/*
 * Generic no-op, compiled only when update_mmu_tlb_range is not already
 * defined; all arguments are ignored.
 */
static inline void update_mmu_tlb_range(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep, unsigned int nr)
{
}
#endif

/* Single-entry convenience wrapper: update_mmu_tlb_range() with nr == 1. */
static inline void update_mmu_tlb(struct vm_area_struct *vma,
                                unsigned long address, pte_t *ptep)
{
        update_mmu_tlb_range(vma, address, ptep, 1);
}

/*
 * Some architectures may be able to avoid expensive synchronization
 * primitives when modifications are made to PTE's which are already
 * not present, or in the process of an address space destruction.
 */
#ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL
/*
 * Generic version: @full is ignored and the entry is cleared with a plain
 * pte_clear().
 */
static inline void pte_clear_not_present_full(struct mm_struct *mm,
                                              unsigned long address,
                                              pte_t *ptep,
                                              int full)
{
        pte_clear(mm, address, ptep);
}
#endif

#ifndef clear_not_present_full_ptes
/**
 * clear_not_present_full_ptes - Clear a run of not present PTEs that sit
 *                                 next to each other in the pgtable.
 * @mm: Address space the ptes represent.
 * @addr: Address of the first pte.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to clear.
 * @full: Whether we are clearing a full mm.
 *
 * Architectures may override this.  The generic version calls
 * pte_clear_not_present_full() on every entry.
 *
 * Context: The caller holds the page table lock.  The PTEs are all not present.
 * The PTEs are all in the same PMD.
 */
static inline void clear_not_present_full_ptes(struct mm_struct *mm,
                unsigned long addr, pte_t *ptep, unsigned int nr, int full)
{
        pte_clear_not_present_full(mm, addr, ptep, full);
        while (--nr) {
                ptep++;
                addr += PAGE_SIZE;
                pte_clear_not_present_full(mm, addr, ptep, full);
        }
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
extern pte_t ptep_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pte_t *ptep);
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pmd_t *pmdp);
extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma,
                              unsigned long address,
                              pud_t *pudp);
#endif

#ifndef pte_mkwrite
/*
 * Generic version, compiled only when pte_mkwrite is not already defined:
 * @vma is ignored and the result is pte_mkwrite_novma(@pte).
 */
static inline pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma)
{
        return pte_mkwrite_novma(pte);
}
#endif

#if defined(CONFIG_ARCH_WANT_PMD_MKWRITE) && !defined(pmd_mkwrite)
/*
 * Generic version, provided when CONFIG_ARCH_WANT_PMD_MKWRITE is set and
 * pmd_mkwrite is not already defined: @vma is ignored and the result is
 * pmd_mkwrite_novma(@pmd).
 */
static inline pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
        return pmd_mkwrite_novma(pmd);
}
#endif

#ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
struct mm_struct;
/*
 * Generic write-protect of the PTE at @ptep: read the entry, apply
 * pte_wrprotect() and store the result with set_pte_at().  The read and the
 * write are separate steps, not one atomic operation.
 */
static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
{
        pte_t cur = ptep_get(ptep);
        pte_t wrprotected = pte_wrprotect(cur);

        set_pte_at(mm, address, ptep, wrprotected);
}
#endif

#ifndef wrprotect_ptes
/**
 * wrprotect_ptes - Write-protect a run of PTEs mapping consecutive pages of
 *                    one folio.
 * @mm: Address space the pages are mapped into.
 * @addr: Address the first page is mapped at.
 * @ptep: Page table pointer for the first entry.
 * @nr: Number of entries to write-protect.
 *
 * Architectures may override this.  The generic version calls
 * ptep_set_wrprotect() on every entry.
 *
 * Note that PTE bits in the PTE range besides the PFN can differ. For example,
 * some PTEs might be write-protected.
 *
 * Context: The caller holds the page table lock.  The PTEs map consecutive
 * pages that belong to the same folio.  The PTEs are all in the same PMD.
 */
static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr,
                pte_t *ptep, unsigned int nr)
{
        ptep_set_wrprotect(mm, addr, ptep);
        while (--nr) {
                ptep++;
                addr += PAGE_SIZE;
                ptep_set_wrprotect(mm, addr, ptep);
        }
}
#endif

/*
 * On some architectures the hardware does not set the page access bit when a
 * memory page is accessed; setting this bit is the responsibility of software.
 * Tracking the access bit this way costs an extra page fault. As an
 * optimization, the access bit can be set during every page fault flow on
 * these arches. To differentiate it from pte_mkyoung(), this helper is used on
 * platforms where software maintains the page access bit.
 */
#ifndef pte_sw_mkyoung
/*
 * Generic version, compiled only when pte_sw_mkyoung is not already defined:
 * returns @pte unchanged.
 */
static inline pte_t pte_sw_mkyoung(pte_t pte)
{
        return pte;
}
#define pte_sw_mkyoung        pte_sw_mkyoung
#endif

#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic write-protect of the PMD entry at @pmdp: read the entry, apply
 * pmd_wrprotect() and store the result with set_pmd_at().  The read and the
 * write are separate steps, not one atomic operation.
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        /*
         * Read through the pmdp_get() accessor rather than a plain
         * dereference, mirroring ptep_set_wrprotect()'s use of ptep_get().
         */
        pmd_t old_pmd = pmdp_get(pmdp);

        set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd));
}
#else
/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE.  BUILD_BUG() is meant
 * to break the build if a call to this stub is not optimised away (confirm
 * against the BUILD_BUG() definition).
 */
static inline void pmdp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pmd_t *pmdp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
#ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Generic write-protect of the PUD entry at @pudp: read the entry, apply
 * pud_wrprotect() and store the result with set_pud_at().  The read and the
 * write are separate steps, not one atomic operation.
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        /*
         * Read through the pudp_get() accessor rather than a plain
         * dereference, mirroring ptep_set_wrprotect()'s use of ptep_get().
         */
        pud_t old_pud = pudp_get(pudp);

        set_pud_at(mm, address, pudp, pud_wrprotect(old_pud));
}
#else
/*
 * Stub for builds with CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD but without
 * CONFIG_TRANSPARENT_HUGEPAGE.  BUILD_BUG() is meant to break the build if a
 * call to this stub is not optimised away (confirm against the BUILD_BUG()
 * definition).
 */
static inline void pudp_set_wrprotect(struct mm_struct *mm,
                                      unsigned long address, pud_t *pudp)
{
        BUILD_BUG();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
#endif

#ifndef pmdp_collapse_flush
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                 unsigned long address, pmd_t *pmdp);
#else
/*
 * Stub for builds without CONFIG_TRANSPARENT_HUGEPAGE.  BUILD_BUG() is meant
 * to break the build if a call to this stub is not optimised away (confirm
 * against the BUILD_BUG() definition); the return statement only satisfies
 * the function's type.
 */
static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
                                        unsigned long address,
                                        pmd_t *pmdp)
{
        BUILD_BUG();
        return *pmdp;
}
#define pmdp_collapse_flush pmdp_collapse_flush
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                       pgtable_t pgtable);
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
#endif

#ifndef arch_needs_pgtable_deposit
#define arch_needs_pgtable_deposit() (false)
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * A pmdp_establish() implementation for architectures without hardware
 * dirty/accessed bits only.  On those, no CPU can race with us to set such
 * bits, so the non-atomic read-then-set below is fine.
 *
 * Installs @pmd at @pmdp with set_pmd_at() and returns the entry that was
 * there before.
 */
static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma,
                unsigned long address, pmd_t *pmdp, pmd_t pmd)
{
        pmd_t prev;

        prev = *pmdp;
        set_pmd_at(vma->vm_mm, address, pmdp, pmd);

        return prev;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD

/*
 * pmdp_invalidate_ad() invalidates the PMD while changing a transparent
 * hugepage mapping in the page tables. This function is similar to
 * pmdp_invalidate(), but should only be used if the access and dirty bits would
 * not be cleared by the software in the new PMD value. The function ensures
 * that hardware updates of the access and dirty bits are not lost.
 *
 * Doing so can allow certain architectures to avoid a TLB flush in most
 * cases. However, another TLB flush might be necessary later if the PMD update
 * itself requires such a flush (e.g., if protection was set to be stricter).
 * Even when a TLB flush is needed because of the update, the caller may be able
 * to batch these TLB flushing operations, so fewer TLB flush operations are
 * needed.
 */
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
                                unsigned long address, pmd_t *pmdp);
#endif

#ifndef __HAVE_ARCH_PTE_SAME
/* Returns 1 when both PTEs carry the same raw value, 0 otherwise. */
static inline int pte_same(pte_t pte_a, pte_t pte_b)
{
        if (pte_val(pte_a) != pte_val(pte_b))
                return 0;

        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTE_UNUSED
/*
 * Some architectures provide facilities to virtualization guests
 * so that they can flag allocated pages as unused. This allows the
 * host to transparently reclaim unused pages. This function returns
 * whether the pte's page is unused.
 *
 * This generic version is only compiled when the architecture does not
 * define __HAVE_ARCH_PTE_UNUSED; it ignores @pte and always returns 0.
 */
static inline int pte_unused(pte_t pte)
{
        return 0;
}
#endif

/*
 * Default pXd_access_permitted() helpers, one per page-table level.
 *
 * The access is refused when the entry is not present, or when a write is
 * requested (@write non-zero) and the entry is not writable. The write
 * check is only evaluated for present entries and only when @write is set.
 * An architecture overrides a level by defining the macro beforehand.
 */
#ifndef pte_access_permitted
#define pte_access_permitted(pte, write) \
        (pte_present(pte) && !((write) && !pte_write(pte)))
#endif

#ifndef pmd_access_permitted
#define pmd_access_permitted(pmd, write) \
        (pmd_present(pmd) && !((write) && !pmd_write(pmd)))
#endif

#ifndef pud_access_permitted
#define pud_access_permitted(pud, write) \
        (pud_present(pud) && !((write) && !pud_write(pud)))
#endif

#ifndef p4d_access_permitted
#define p4d_access_permitted(p4d, write) \
        (p4d_present(p4d) && !((write) && !p4d_write(p4d)))
#endif

#ifndef pgd_access_permitted
#define pgd_access_permitted(pgd, write) \
        (pgd_present(pgd) && !((write) && !pgd_write(pgd)))
#endif

/*
 * Generic pXd_same() helpers: each returns 1 when the two entries have the
 * same raw value and 0 otherwise.
 */
#ifndef __HAVE_ARCH_PMD_SAME
static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
{
        if (pmd_val(pmd_a) != pmd_val(pmd_b))
                return 0;

        return 1;
}
#endif

#ifndef pud_same
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
        if (pud_val(pud_a) != pud_val(pud_b))
                return 0;

        return 1;
}
/* Mark the generic version as present so the #ifndef above sees it. */
#define pud_same pud_same
#endif

#ifndef __HAVE_ARCH_P4D_SAME
static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b)
{
        if (p4d_val(p4d_a) != p4d_val(p4d_b))
                return 0;

        return 1;
}
#endif

#ifndef __HAVE_ARCH_PGD_SAME
static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b)
{
        if (pgd_val(pgd_a) != pgd_val(pgd_b))
                return 0;

        return 1;
}
#endif

#ifndef __HAVE_ARCH_DO_SWAP_PAGE
/*
 * Fallback used when the architecture does not define
 * __HAVE_ARCH_DO_SWAP_PAGE: the body is empty and every argument is
 * ignored.
 */
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
                                     struct vm_area_struct *vma,
                                     unsigned long addr,
                                     pte_t pte, pte_t oldpte,
                                     int nr)
{

}
#else
/*
 * Some architectures keep metadata alongside a page that has to be saved
 * at swap-out and restored at swap-in. SPARC M7 and newer processors, for
 * instance, carry an ADI (Application Data Integrity) tag per page;
 * arch_do_swap_page() can restore that metadata at swap-in.
 *
 * This wrapper calls arch_do_swap_page() once for each of @nr consecutive
 * pages: the address advances by PAGE_SIZE per step and both PTE values are
 * advanced with pte_advance_pfn(). Note that @mm is not used; the mm passed
 * down is vma->vm_mm.
 */
static inline void arch_do_swap_page_nr(struct mm_struct *mm,
                                        struct vm_area_struct *vma,
                                        unsigned long addr,
                                        pte_t pte, pte_t oldpte,
                                        int nr)
{
        int idx = 0;

        while (idx < nr) {
                arch_do_swap_page(vma->vm_mm, vma, addr + idx * PAGE_SIZE,
                                  pte_advance_pfn(pte, idx),
                                  pte_advance_pfn(oldpte, idx));
                idx++;
        }
}
#endif

#ifndef __HAVE_ARCH_UNMAP_ONE
/*
 * Some architectures support metadata associated with a page. When a
 * page is being swapped out, this metadata must be saved so it can be
 * restored when the page is swapped back in. SPARC M7 and newer
 * processors support an ADI (Application Data Integrity) tag for the
 * page as metadata for the page. arch_unmap_one() can save this
 * metadata on a swap-out of a page.
 *
 * This generic version (no __HAVE_ARCH_UNMAP_ONE) saves nothing: it
 * ignores all arguments and always returns 0.
 */
static inline int arch_unmap_one(struct mm_struct *mm,
                                  struct vm_area_struct *vma,
                                  unsigned long addr,
                                  pte_t orig_pte)
{
        return 0;
}
#endif

/*
 * Allow architectures to preserve additional metadata associated with
 * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function
 * prototypes must be defined in the arch-specific asm/pgtable.h file.
 *
 * The definitions below are the no-op fallbacks used when the matching
 * __HAVE_ARCH_* macro is not defined.
 */
#ifndef __HAVE_ARCH_PREPARE_TO_SWAP
/* Fallback: nothing to prepare; @folio is ignored and 0 is returned. */
static inline int arch_prepare_to_swap(struct folio *folio)
{
        return 0;
}
#endif

#ifndef __HAVE_ARCH_SWAP_INVALIDATE
/* Fallback: empty body; @type and @offset are ignored. */
static inline void arch_swap_invalidate_page(int type, pgoff_t offset)
{
}

/* Fallback: empty body; @type is ignored. */
static inline void arch_swap_invalidate_area(int type)
{
}
#endif

#ifndef __HAVE_ARCH_SWAP_RESTORE
/* Fallback: empty body; @entry and @folio are ignored. */
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
{
}
#endif

#ifndef __HAVE_ARCH_PGD_OFFSET_GATE
/* Default: pgd_offset_gate() is simply pgd_offset(). */
#define pgd_offset_gate(mm, addr)        pgd_offset(mm, addr)
#endif

#ifndef __HAVE_ARCH_MOVE_PTE
/* Default: the pte is returned unchanged; both addresses are ignored. */
#define move_pte(pte, old_addr, new_addr)        (pte)
#endif

#ifndef pte_accessible
/* Default: evaluates @pte (result discarded) and yields 1; @mm is unused. */
# define pte_accessible(mm, pte)        ((void)(pte), 1)
#endif

#ifndef flush_tlb_fix_spurious_fault
/* Default: flush the single page via flush_tlb_page(); @ptep is unused. */
#define flush_tlb_fix_spurious_fault(vma, address, ptep) flush_tlb_page(vma, address)
#endif

/*
 * pXd_addr_end(addr, end): while walking page tables, yield the address of
 * the next boundary at the given level, or @end if that comes first.
 *
 * No vma end wraps to 0, but the rounded-up __boundary may wrap to 0. Both
 * sides of the comparison are therefore decremented first, which turns a
 * wrapped boundary of 0 into the largest value and makes @end win.
 *
 * Note that @end is evaluated more than once.
 */

#define pgd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = PGDIR_MASK & ((addr) + PGDIR_SIZE);  \
        ((end) - 1 > __boundary - 1) ? __boundary : (end);              \
})

#ifndef p4d_addr_end
#define p4d_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = P4D_MASK & ((addr) + P4D_SIZE);      \
        ((end) - 1 > __boundary - 1) ? __boundary : (end);              \
})
#endif

#ifndef pud_addr_end
#define pud_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = PUD_MASK & ((addr) + PUD_SIZE);      \
        ((end) - 1 > __boundary - 1) ? __boundary : (end);              \
})
#endif

#ifndef pmd_addr_end
#define pmd_addr_end(addr, end)                                         \
({                                                                      \
        unsigned long __boundary = PMD_MASK & ((addr) + PMD_SIZE);      \
        ((end) - 1 > __boundary - 1) ? __boundary : (end);              \
})
#endif

/*
 * When walking page tables, we usually want to skip any p?d_none entries;
 * and any p?d_bad entries - reporting the error before resetting to none.
 * Do the tests inline, but report and clear the bad entry in mm/memory.c.
 */
/* Out-of-line handlers for bad entries; only declared in this header. */
void pgd_clear_bad(pgd_t *);

/*
 * When a level is folded (__PAGETABLE_P4D_FOLDED / __PAGETABLE_PUD_FOLDED)
 * the matching pXd_clear_bad() becomes an empty statement that does not
 * evaluate its argument.
 */
#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *);
#else
#define p4d_clear_bad(p4d)        do { } while (0)
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *);
#else
/* Parameter is named after the level it belongs to (was "p4d"). */
#define pud_clear_bad(pud)        do { } while (0)
#endif

void pmd_clear_bad(pmd_t *);

/*
 * pXd_none_or_clear_bad(): filters for page-table walkers.
 *
 * Each helper returns 1 when the entry must be skipped: either it is none,
 * or it is bad, in which case pXd_clear_bad() is called on it first. It
 * returns 0 when the entry is neither none nor bad.
 */
static inline int pgd_none_or_clear_bad(pgd_t *pgd)
{
        int skip = 1;

        if (!pgd_none(*pgd)) {
                if (unlikely(pgd_bad(*pgd)))
                        pgd_clear_bad(pgd);
                else
                        skip = 0;
        }
        return skip;
}

static inline int p4d_none_or_clear_bad(p4d_t *p4d)
{
        int skip = 1;

        if (!p4d_none(*p4d)) {
                if (unlikely(p4d_bad(*p4d)))
                        p4d_clear_bad(p4d);
                else
                        skip = 0;
        }
        return skip;
}

static inline int pud_none_or_clear_bad(pud_t *pud)
{
        int skip = 1;

        if (!pud_none(*pud)) {
                if (unlikely(pud_bad(*pud)))
                        pud_clear_bad(pud);
                else
                        skip = 0;
        }
        return skip;
}

static inline int pmd_none_or_clear_bad(pmd_t *pmd)
{
        int skip = 1;

        if (!pmd_none(*pmd)) {
                if (unlikely(pmd_bad(*pmd)))
                        pmd_clear_bad(pmd);
                else
                        skip = 0;
        }
        return skip;
}

/*
 * First half of a protection-change transaction: fetch the current pte and
 * clear it in one step via ptep_get_and_clear(). The cleared entry is
 * non-present, which keeps the hardware from asynchronously updating it.
 *
 * Returns the pte value that was in place before clearing.
 */
static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;

        return ptep_get_and_clear(mm, addr, ptep);
}

/*
 * Second half of the transaction: install @pte with set_pte_at(). The entry
 * is non-present at this point, so there is no hardware state to preserve.
 */
static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma,
                                             unsigned long addr,
                                             pte_t *ptep, pte_t pte)
{
        struct mm_struct *mm = vma->vm_mm;

        set_pte_at(mm, addr, ptep, pte);
}

#ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
/*
 * Start a pte protection read-modify-write transaction, which
 * protects against asynchronous hardware modifications to the pte.
 * The intention is not to prevent the hardware from making pte
 * updates, but to prevent any updates it may make from being lost.
 *
 * This does not protect against other software modifications of the
 * pte; the appropriate pte lock must be held over the transaction.
 *
 * Note that this interface is intended to be batchable, meaning that
 * ptep_modify_prot_commit may not actually update the pte, but merely
 * queue the update to be done at some later time.  The update must be
 * actually committed before the pte lock is released, however.
 *
 * This generic version forwards directly to __ptep_modify_prot_start()
 * and returns its result.
 */
static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep)
{
        return __ptep_modify_prot_start(vma, addr, ptep);
}

/*
 * Commit an update to a pte, leaving any hardware-controlled bits in
 * the PTE unmodified.
 *
 * This generic version forwards to __ptep_modify_prot_commit(); @old_pte
 * is accepted but not used here.
 */
static inline void ptep_modify_prot_commit(struct vm_area_struct *vma,
                                           unsigned long addr,
                                           pte_t *ptep, pte_t old_pte, pte_t pte)
{
        __ptep_modify_prot_commit(vma, addr, ptep, pte);
}
#endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */
#endif /* CONFIG_MMU */

/*
 * No-op macros that just return the current protection value. Defined here
 * because these macros can be used even if CONFIG_MMU is not defined.
 *
 * pgprot_nx, pgprot_noncached and pgprot_mhp default to the identity;
 * pgprot_writecombine, pgprot_writethrough and pgprot_device default to
 * whatever pgprot_noncached is (the arch version, if one is defined).
 */

#ifndef pgprot_nx
#define pgprot_nx(prot)        (prot)
#endif

#ifndef pgprot_noncached
#define pgprot_noncached(prot)        (prot)
#endif

#ifndef pgprot_writecombine
#define pgprot_writecombine pgprot_noncached
#endif

#ifndef pgprot_writethrough
#define pgprot_writethrough pgprot_noncached
#endif

#ifndef pgprot_device
#define pgprot_device pgprot_noncached
#endif

#ifndef pgprot_mhp
#define pgprot_mhp(prot)        (prot)
#endif

#ifdef CONFIG_MMU
#ifndef pgprot_modify
#define pgprot_modify pgprot_modify
/*
 * Carry the memory-type attributes of @oldprot over to @newprot.
 *
 * For each of pgprot_noncached(), pgprot_writecombine() and
 * pgprot_device(): if applying the helper to @oldprot leaves its value
 * unchanged, the same helper is applied to @newprot. The three checks are
 * independent and run in that order, so more than one may take effect.
 *
 * Returns the resulting protection value.
 */
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
{
        if (pgprot_val(pgprot_noncached(oldprot)) == pgprot_val(oldprot))
                newprot = pgprot_noncached(newprot);

        if (pgprot_val(pgprot_writecombine(oldprot)) == pgprot_val(oldprot))
                newprot = pgprot_writecombine(newprot);

        if (pgprot_val(pgprot_device(oldprot)) == pgprot_val(oldprot))
                newprot = pgprot_device(newprot);

        return newprot;
}
#endif
#endif /* CONFIG_MMU */

/* Defaults: both helpers return the protection value unchanged. */
#ifndef pgprot_encrypted
#define pgprot_encrypted(prot)        (prot)
#endif

#ifndef pgprot_decrypted
#define pgprot_decrypted(prot)        (prot)
#endif

/*
 * A facility to provide batching of the reload of page tables and
 * other process state with the actual context switch code for
 * paravirtualized guests.  By convention, only one of the batched
 * update (lazy) modes (CPU, MMU) should be active at any given time,
 * entry should never be nested, and entry and exits should always be
 * paired.  This is for sanity of maintaining and reasoning about the
 * kernel code.  In this case, the exit (end of the context switch) is
 * in architecture-specific code, and so doesn't need a generic
 * definition.
 */
#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
/* Generic fallback: an empty statement; @prev is not evaluated. */
#define arch_start_context_switch(prev)        do {} while (0)
#endif

/*
 * Soft-dirty fallbacks.
 *
 * With CONFIG_HAVE_ARCH_SOFT_DIRTY but without
 * CONFIG_ARCH_ENABLE_THP_MIGRATION only the pmd_swp_* helpers are stubbed.
 * Without CONFIG_HAVE_ARCH_SOFT_DIRTY every helper is stubbed. In both
 * cases the query helpers return 0 and the mk/clear helpers return their
 * argument unchanged.
 */
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
#ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION
static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif
#else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */
/* Query stubs: always 0. */
static inline int pte_soft_dirty(pte_t pte)
{
        return 0;
}

static inline int pmd_soft_dirty(pmd_t pmd)
{
        return 0;
}

/* Set/clear stubs: the entry is passed through untouched. */
static inline pte_t pte_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline pte_t pte_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}

/* Same stubs for the pte_swp_* and pmd_swp_* variants. */
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
{
        return pte;
}

static inline int pte_swp_soft_dirty(pte_t pte)
{
        return 0;
}

static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
{
        return pte;
}

static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd)
{
        return pmd;
}

static inline int pmd_swp_soft_dirty(pmd_t pmd)
{
        return 0;
}

static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd)
{
        return pmd;
}
#endif

#ifndef __HAVE_PFNMAP_TRACKING
/*
 * Interfaces that can be used by architecture code to keep track of
 * memory type of pfn mappings specified by the remap_pfn_range,
 * vmf_insert_pfn.
 *
 * Without __HAVE_PFNMAP_TRACKING the functions below are no-op stubs:
 * the int-returning ones always return 0 and none of them touches its
 * arguments. With it, the same functions are only declared here (see the
 * #else branch).
 */

/*
 * track_pfn_remap is called when a _new_ pfn mapping is being established
 * by remap_pfn_range() for physical range indicated by pfn and size.
 */
static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                                  unsigned long pfn, unsigned long addr,
                                  unsigned long size)
{
        return 0;
}

/*
 * track_pfn_insert is called when a _new_ single pfn is established
 * by vmf_insert_pfn().
 */
static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                                    pfn_t pfn)
{
}

/*
 * track_pfn_copy is called when a VM_PFNMAP VMA is about to get the page
 * tables copied during copy_page_range(). Will store the pfn to be
 * passed to untrack_pfn_copy() only if there is something to be untracked.
 * Callers should initialize the pfn to 0.
 *
 * This stub never writes to *pfn.
 */
static inline int track_pfn_copy(struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long *pfn)
{
        return 0;
}

/*
 * untrack_pfn_copy is called when a VM_PFNMAP VMA failed to copy during
 * copy_page_range(), but after track_pfn_copy() was already called. Can
 * be called even if track_pfn_copy() did not actually track anything:
 * handled internally.
 */
static inline void untrack_pfn_copy(struct vm_area_struct *dst_vma,
                unsigned long pfn)
{
}

/*
 * untrack_pfn is called while unmapping a pfnmap for a region.
 * untrack can be called for a specific region indicated by pfn and size or
 * can be for the entire vma (in which case pfn, size are zero).
 */
static inline void untrack_pfn(struct vm_area_struct *vma,
                               unsigned long pfn, unsigned long size,
                               bool mm_wr_locked)
{
}

/*
 * untrack_pfn_clear is called in the following cases on a VM_PFNMAP VMA:
 *
 * 1) During mremap() on the src VMA after the page tables were moved.
 * 2) During fork() on the dst VMA, immediately after duplicating the src VMA.
 */
static inline void untrack_pfn_clear(struct vm_area_struct *vma)
{
}
#else
/* Tracking is provided elsewhere; only the prototypes are given here. */
extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
                           unsigned long pfn, unsigned long addr,
                           unsigned long size);
extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
                             pfn_t pfn);
extern int track_pfn_copy(struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, unsigned long *pfn);
extern void untrack_pfn_copy(struct vm_area_struct *dst_vma,
                unsigned long pfn);
extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                        unsigned long size, bool mm_wr_locked);
extern void untrack_pfn_clear(struct vm_area_struct *vma);
#endif

/*
 * is_zero_pfn() / my_zero_pfn(): three build variants.
 *
 *  - CONFIG_MMU with __HAVE_COLOR_ZERO_PAGE: a pfn matches when its
 *    unsigned distance from zero_pfn is at most
 *    zero_page_mask >> PAGE_SHIFT; my_zero_pfn() is the pfn of
 *    ZERO_PAGE(addr).
 *  - CONFIG_MMU otherwise: a single zero_pfn is compared / returned.
 *  - !CONFIG_MMU: both helpers return 0.
 */
#ifdef CONFIG_MMU
#ifdef __HAVE_COLOR_ZERO_PAGE
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;
        /* Unsigned subtraction: pfns below zero_pfn wrap to huge values. */
        const unsigned long delta = pfn - zero_pfn;

        return delta <= (zero_page_mask >> PAGE_SHIFT);
}

#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))

#else
static inline int is_zero_pfn(unsigned long pfn)
{
        extern unsigned long zero_pfn;

        return zero_pfn == pfn;
}

static inline unsigned long my_zero_pfn(unsigned long addr)
{
        extern unsigned long zero_pfn;

        /* @addr is irrelevant when there is only one zero page pfn. */
        return zero_pfn;
}
#endif
#else
/* No MMU: no pfn is ever reported as the zero pfn. */
static inline int is_zero_pfn(unsigned long pfn)
{
        return 0;
}

/* No MMU: always 0, whatever @addr is. */
static inline unsigned long my_zero_pfn(unsigned long addr)
{
        return 0;
}
#endif /* CONFIG_MMU */

#ifdef CONFIG_MMU

#ifndef CONFIG_TRANSPARENT_HUGEPAGE
/* Without THP no pmd is ever reported as a transparent huge page. */
static inline int pmd_trans_huge(pmd_t pmd)
{
        return 0;
}
#ifndef pmd_write
/* Fallback hits BUG(); the return only satisfies the signature. */
static inline int pmd_write(pmd_t pmd)
{
        BUG();
        return 0;
}
#endif /* pmd_write */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#ifndef pud_write
/* Fallback hits BUG(); the return only satisfies the signature. */
static inline int pud_write(pud_t pud)
{
        BUG();
        return 0;
}
#endif /* pud_write */

#if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
/* Stubs used unless both config options are set: never devmap. */
static inline int pmd_devmap(pmd_t pmd)
{
        return 0;
}
static inline int pud_devmap(pud_t pud)
{
        return 0;
}
static inline int pgd_devmap(pgd_t pgd)
{
        return 0;
}
#endif

#if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
        !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* Stub used unless both config options are set: never a huge pud. */
static inline int pud_trans_huge(pud_t pud)
{
        return 0;
}
#endif

/*
 * With both CONFIG_TRANSPARENT_HUGEPAGE and
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD: returns 1 when the entry, read
 * once, is none, a transparent huge pud or a devmap pud, or when it is bad
 * (pud_clear_bad() is called in that case). In every other case, and in
 * all other configurations, returns 0.
 */
static inline int pud_trans_unstable(pud_t *pud)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
        defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
        /* One snapshot, so every check below sees the same value. */
        pud_t snapshot = READ_ONCE(*pud);

        if (pud_none(snapshot))
                return 1;
        if (pud_trans_huge(snapshot))
                return 1;
        if (pud_devmap(snapshot))
                return 1;

        if (unlikely(pud_bad(snapshot))) {
                pud_clear_bad(pud);
                return 1;
        }
#endif
        return 0;
}

#ifndef CONFIG_NUMA_BALANCING
/*
 * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is
 * perfectly valid to indicate "no" in that case, which is why our default
 * implementation defaults to "always no".
 *
 * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE
 * page protection due to NUMA hinting. NUMA hinting faults only apply in
 * accessible VMAs.
 *
 * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault,
 * looking at the VMA accessibility is sufficient.
 *
 * Both stubs below ignore their argument and return 0.
 */
static inline int pte_protnone(pte_t pte)
{
        return 0;
}

static inline int pmd_protnone(pmd_t pmd)
{
        return 0;
}
#endif /* CONFIG_NUMA_BALANCING */

#endif /* CONFIG_MMU */

/*
 * Huge vmap helpers.
 *
 * With CONFIG_HAVE_ARCH_HUGE_VMAP the pud/pmd helpers are only declared
 * here; the p4d helpers are declared as well unless the P4D level is folded,
 * in which case they are stubs. Without the option every helper is a stub:
 * the int-returning ones return 0 and p4d_clear_huge() does nothing.
 */
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP

#ifndef __PAGETABLE_P4D_FOLDED
int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot);
void p4d_clear_huge(p4d_t *p4d);
#else
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline void p4d_clear_huge(p4d_t *p4d) { }
#endif /* !__PAGETABLE_P4D_FOLDED */

int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
int pud_clear_huge(pud_t *pud);
int pmd_clear_huge(pmd_t *pmd);
int p4d_free_pud_page(p4d_t *p4d, unsigned long addr);
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
#else        /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
        return 0;
}
/* Note: p4d_clear_huge() is void, unlike the pud/pmd variants below. */
static inline void p4d_clear_huge(p4d_t *p4d) { }
static inline int pud_clear_huge(pud_t *pud)
{
        return 0;
}
static inline int pmd_clear_huge(pmd_t *pmd)
{
        return 0;
}
static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
{
        return 0;
}
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
        return 0;
}
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
{
        return 0;
}
#endif        /* CONFIG_HAVE_ARCH_HUGE_VMAP */

#ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/*
 * Architectures with special requirements for evicting THP-backing TLB
 * entries can implement this. It can also help optimize a normal TLB flush
 * in the THP regime: the stock flush_tlb_range() typically has an
 * optimization to nuke the entire TLB if the flush span is greater than a
 * threshold, which will likely be true for a single huge page. Thus a single
 * THP flush would invalidate the entire TLB, which is not desirable.
 * e.g. see arch/arc: flush_pmd_tlb_range
 *
 * The defaults below simply forward to flush_tlb_range().
 */
#define flush_pmd_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#define flush_pud_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
#else
/* Without THP any use of these macros expands to BUILD_BUG(). */
#define flush_pmd_tlb_range(vma, addr, end)        BUILD_BUG()
#define flush_pud_tlb_range(vma, addr, end)        BUILD_BUG()
#endif
#endif

struct file;
int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
                        unsigned long size, pgprot_t *vma_prot);

#ifndef CONFIG_X86_ESPFIX64
/* Empty stub so callers need no #ifdef when CONFIG_X86_ESPFIX64 is off. */
static inline void init_espfix_bsp(void) { }
#endif

extern void __init pgtable_cache_init(void);

#ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED
/* Generic fallback: every (pfn, prot) combination is allowed. */
static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
{
        return true;
}

/* Generic fallback: reports that no arch-specific check exists. */
static inline bool arch_has_pfn_modify_check(void)
{
        return false;
}
#endif /* !__HAVE_ARCH_PFN_MODIFY_ALLOWED */

/*
 * Architecture PAGE_KERNEL_* fallbacks
 *
 * Some architectures don't define certain PAGE_KERNEL_* flags. This is either
 * because they really don't support them, or the port needs to be updated to
 * reflect the required functionality. Below are a set of relatively safe
 * fallbacks, as best effort, which we can count on in lieu of the architectures
 * not defining them on their own yet.
 */

/* Both fallbacks alias plain PAGE_KERNEL. */
#ifndef PAGE_KERNEL_RO
# define PAGE_KERNEL_RO PAGE_KERNEL
#endif

#ifndef PAGE_KERNEL_EXEC
# define PAGE_KERNEL_EXEC PAGE_KERNEL
#endif

/*
 * Page Table Modification bits for pgtbl_mod_mask.
 *
 * These are used by the p?d_alloc_track*() set of functions and in the generic
 * vmalloc/ioremap code to track at which page-table levels entries have been
 * modified. Based on that the code can better decide when vmalloc and ioremap
 * mapping changes need to be synchronized to other page-tables in the system.
 */
/* Bit positions, one per page-table level. */
#define                __PGTBL_PGD_MODIFIED        0
#define                __PGTBL_P4D_MODIFIED        1
#define                __PGTBL_PUD_MODIFIED        2
#define                __PGTBL_PMD_MODIFIED        3
#define                __PGTBL_PTE_MODIFIED        4

/* Corresponding single-bit masks, built from the positions above. */
#define                PGTBL_PGD_MODIFIED        BIT(__PGTBL_PGD_MODIFIED)
#define                PGTBL_P4D_MODIFIED        BIT(__PGTBL_P4D_MODIFIED)
#define                PGTBL_PUD_MODIFIED        BIT(__PGTBL_PUD_MODIFIED)
#define                PGTBL_PMD_MODIFIED        BIT(__PGTBL_PMD_MODIFIED)
#define                PGTBL_PTE_MODIFIED        BIT(__PGTBL_PTE_MODIFIED)

/* Page-Table Modification Mask */
typedef unsigned int pgtbl_mod_mask;

#endif /* !__ASSEMBLY__ */

/*
 * Provide MAX_POSSIBLE_PHYSMEM_BITS on 32-bit builds that did not define
 * it themselves.
 *
 * ZSMALLOC needs to know the highest PFN on 32-bit architectures with
 * physical address space extension (CONFIG_PHYS_ADDR_T_64BIT); such an
 * architecture must supply the value itself, so a missing definition is a
 * build error. Otherwise the value defaults to 32.
 */
#ifndef MAX_POSSIBLE_PHYSMEM_BITS
#ifndef CONFIG_64BIT
#ifdef CONFIG_PHYS_ADDR_T_64BIT
#error Missing MAX_POSSIBLE_PHYSMEM_BITS definition
#else
#define MAX_POSSIBLE_PHYSMEM_BITS 32
#endif
#endif /* !CONFIG_64BIT */
#endif /* !MAX_POSSIBLE_PHYSMEM_BITS */

/* Defaults derived from the build configuration via IS_BUILTIN(). */
#ifndef has_transparent_hugepage
#define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
#endif

#ifndef has_transparent_pud_hugepage
#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
#endif
/*
 * On some architectures it depends on the mm if the p4d/pud or pmd
 * layer of the page table hierarchy is folded or not.
 *
 * The defaults below ignore @mm and depend only on whether the matching
 * __PAGETABLE_*_FOLDED macro is defined.
 */
#ifndef mm_p4d_folded
#define mm_p4d_folded(mm)        __is_defined(__PAGETABLE_P4D_FOLDED)
#endif

#ifndef mm_pud_folded
#define mm_pud_folded(mm)        __is_defined(__PAGETABLE_PUD_FOLDED)
#endif

#ifndef mm_pmd_folded
#define mm_pmd_folded(mm)        __is_defined(__PAGETABLE_PMD_FOLDED)
#endif

/*
 * Default pXd_offset_lockless(): the pointer argument (first parameter) is
 * unused; the lookup is done on the address of the entry value passed as the
 * second parameter, which therefore has to be an lvalue.
 */
#ifndef p4d_offset_lockless
#define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address)
#endif
#ifndef pud_offset_lockless
#define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address)
#endif
#ifndef pmd_offset_lockless
#define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address)
#endif

/*
 * pXd_leaf() is the API to check whether a pgtable entry is a huge page
 * mapping.  It should work globally across all archs, without any
 * dependency on CONFIG_* options.  For architectures that do not support
 * huge mappings on specific levels, below fallbacks will be used.
 *
 * A leaf pgtable entry should always imply the following:
 *
 * - It is a "present" entry.  IOW, before using this API, please check it
 *   with pXd_present() first. NOTE: it may not always mean the "present
 *   bit" is set.  For example, PROT_NONE entries are always "present".
 *
 * - It should _never_ be a swap entry of any type.  Above "present" check
 *   should have guarded this, but let's be crystal clear on this.
 *
 * - It should contain a huge PFN, which points to a huge page larger than
 *   PAGE_SIZE of the platform.  The PFN format isn't important here.
 *
 * - It should cover all kinds of huge mappings (e.g., pXd_trans_huge(),
 *   pXd_devmap(), or hugetlb mappings).
 */
/* Fallbacks: a level without an arch definition never reports a leaf. */
#ifndef pgd_leaf
#define pgd_leaf(x)        false
#endif
#ifndef p4d_leaf
#define p4d_leaf(x)        false
#endif
#ifndef pud_leaf
#define pud_leaf(x)        false
#endif
#ifndef pmd_leaf
#define pmd_leaf(x)        false
#endif

/*
 * Fallback leaf sizes: each ignores the entry and yields the size covered
 * by one entry at that level (PAGE_SIZE for a pte).
 */
#ifndef pgd_leaf_size
#define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT)
#endif
#ifndef p4d_leaf_size
#define p4d_leaf_size(x) P4D_SIZE
#endif
#ifndef pud_leaf_size
#define pud_leaf_size(x) PUD_SIZE
#endif
#ifndef pmd_leaf_size
#define pmd_leaf_size(x) PMD_SIZE
#endif
/*
 * Two-argument form: unless the architecture defines __pte_leaf_size, the
 * first argument is dropped and the second is passed to pte_leaf_size().
 */
#ifndef __pte_leaf_size
#ifndef pte_leaf_size
#define pte_leaf_size(x) PAGE_SIZE
#endif
#define __pte_leaf_size(x,y) pte_leaf_size(y)
#endif

/*
 * pmd_pfn is always defined by every arch because it is used in lots of
 * generic code.  pud_pfn is now used by generic code as well (and larger
 * mappings may follow in the future; we're not there yet).  Instead of
 * requiring every arch to define it (like pmd_pfn), provide a fallback.
 *
 * Note that returning 0 here means any arch that didn't define this can
 * go severely wrong when it hits a real pud leaf.  It is the arch's
 * responsibility to properly define it when a huge pud is possible.
 */
#ifndef pud_pfn
#define pud_pfn(x) 0
#endif

/*
 * Some architectures have MMUs that are configurable or selectable at boot
 * time. These lead to variable PTRS_PER_x. For statically allocated arrays it
 * helps to have a static maximum value.
 */

/* Default: the maximum equals the regular PTRS_PER_x value. */
#ifndef MAX_PTRS_PER_PTE
#define MAX_PTRS_PER_PTE PTRS_PER_PTE
#endif

#ifndef MAX_PTRS_PER_PMD
#define MAX_PTRS_PER_PMD PTRS_PER_PMD
#endif

#ifndef MAX_PTRS_PER_PUD
#define MAX_PTRS_PER_PUD PTRS_PER_PUD
#endif

#ifndef MAX_PTRS_PER_P4D
#define MAX_PTRS_PER_P4D PTRS_PER_P4D
#endif

/*
 * Fallback pXd_pgprot(): the entry is ignored and a pgprot_t initialized
 * with {0} is produced.
 */
#ifndef pte_pgprot
#define pte_pgprot(x) ((pgprot_t) {0})
#endif

#ifndef pmd_pgprot
#define pmd_pgprot(x) ((pgprot_t) {0})
#endif

#ifndef pud_pgprot
#define pud_pgprot(x) ((pgprot_t) {0})
#endif

/* description of effects of mapping type and prot in current implementation.
 * this is due to the limited x86 page protection hardware.  The expected
 * behavior is in parens:
 *
 * map_type        prot
 *                PROT_NONE        PROT_READ        PROT_WRITE        PROT_EXEC
 * MAP_SHARED        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (yes) yes        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * MAP_PRIVATE        r: (no) no        r: (yes) yes        r: (no) yes        r: (no) yes
 *                w: (no) no        w: (no) no        w: (copy) copy        w: (no) no
 *                x: (no) no        x: (no) yes        x: (no) yes        x: (yes) yes
 *
 * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
 * MAP_PRIVATE (with Enhanced PAN supported):
 *                                                                r: (no) no
 *                                                                w: (no) no
 *                                                                x: (yes) yes
 */
/*
 * Expands to the definition of vm_get_page_prot() followed by its
 * EXPORT_SYMBOL(). The function masks @vm_flags down to the
 * VM_READ/VM_WRITE/VM_EXEC/VM_SHARED bits and uses the result as the index
 * into protection_map[].
 */
#define DECLARE_VM_GET_PAGE_PROT                                        \
pgprot_t vm_get_page_prot(unsigned long vm_flags)                       \
{                                                                       \
        const unsigned long idx = vm_flags &                            \
                (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED);             \
                                                                        \
        return protection_map[idx];                                     \
}                                                                       \
EXPORT_SYMBOL(vm_get_page_prot);

#endif /* _LINUX_PGTABLE_H */



































































































































































































































   21 
























































































   23 
















    8 
   22 























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Berkeley style UIO structures        -        Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/ucopysize.h>
#include <uapi/linux/uio.h>

struct page;
struct folio_queue;

typedef unsigned int __bitwise iov_iter_extraction_t;

/* A (base pointer, length) pair describing one buffer segment. */
struct kvec {
        void *iov_base; /* and that should *never* hold a userland pointer */
        size_t iov_len; /* NOTE(review): presumably the segment length in bytes - confirm */
};

/*
 * Type tag stored in iov_iter::iter_type.  It selects which member of the
 * pointer union inside struct iov_iter is the live one.
 */
enum iter_type {
        /* iter types */
        ITER_UBUF,      /* single user buffer, see iov_iter_ubuf() */
        ITER_IOVEC,     /* array of struct iovec, see iov_iter_init() */
        ITER_BVEC,      /* array of struct bio_vec, see iov_iter_bvec() */
        ITER_KVEC,      /* array of struct kvec, see iov_iter_kvec() */
        ITER_FOLIOQ,    /* struct folio_queue, see iov_iter_folio_queue() */
        ITER_XARRAY,    /* struct xarray, see iov_iter_xarray() */
        ITER_DISCARD,   /* set up by iov_iter_discard(), which takes no buffer */
};

/*
 * Direction values for an iterator; they are numerically identical to
 * WRITE and READ respectively.
 */
#define ITER_SOURCE        1        // == WRITE
#define ITER_DEST        0        // == READ

/*
 * Snapshot of the iterator fields copied out by iov_iter_save_state().
 * NOTE(review): presumably consumed by iov_iter_restore() - its body is not
 * in this header.
 */
struct iov_iter_state {
        size_t iov_offset;      /* copy of iov_iter::iov_offset */
        size_t count;           /* copy of iov_iter::count */
        unsigned long nr_segs;  /* copy of iov_iter::nr_segs */
};

/*
 * Iterator over a buffer described in one of several ways; iter_type says
 * which member of the pointer union below is in use.
 */
struct iov_iter {
        u8 iter_type;           /* holds an enum iter_type value */
        bool nofault;           /* NOTE(review): semantics not visible in this header */
        bool data_source;       /* true maps to WRITE, false to READ (see iov_iter_rw()) */
        size_t iov_offset;      /* offset into the current segment (see iter_iov_addr()) */
        /*
         * Hack alert: overlay ubuf_iovec with iovec + count, so
         * that the members resolve correctly regardless of the type
         * of iterator used. This means that you can use:
         *
         * &iter->__ubuf_iovec or iter->__iov
         *
         * interchangeably for the user_backed cases, hence simplifying
         * some of the cases that need to deal with both.
         */
        union {
                /*
                 * This really should be a const, but we cannot do that without
                 * also modifying any of the zero-filling iter init functions.
                 * Leave it non-const for now, but it should be treated as such.
                 */
                struct iovec __ubuf_iovec;
                struct {
                        union {
                                /* use iter_iov() to get the current vec */
                                const struct iovec *__iov;
                                const struct kvec *kvec;
                                const struct bio_vec *bvec;
                                const struct folio_queue *folioq;
                                struct xarray *xarray;
                                void __user *ubuf;
                        };
                        /* value reported by iov_iter_count() */
                        size_t count;
                };
        };
        /* Per-type bookkeeping; the three members share storage. */
        union {
                unsigned long nr_segs;
                u8 folioq_slot;
                loff_t xarray_start;
        };
};

typedef __u16 uio_meta_flags_t;

/*
 * Metadata descriptor bundling flags, two scalar values and an iterator.
 * NOTE(review): the meaning of flags, app_tag and seed is defined by the
 * users of this structure and is not visible in this header - confirm there.
 */
struct uio_meta {
        uio_meta_flags_t        flags;
        u16                        app_tag;
        u64                        seed;
        struct iov_iter                iter;
};

/*
 * Return the current iovec of @iter.  An ITER_UBUF iterator yields the
 * address of its embedded __ubuf_iovec; every other type yields __iov.
 */
static inline const struct iovec *iter_iov(const struct iov_iter *iter)
{
        return iter->iter_type == ITER_UBUF ?
                (const struct iovec *) &iter->__ubuf_iovec : iter->__iov;
}

/*
 * Address of, and number of bytes left in, the current iovec once the
 * iterator's iov_offset is taken into account.  Both macros expand @iter
 * twice, so do not pass an expression with side effects.
 */
#define iter_iov_addr(iter)        (iter_iov(iter)->iov_base + (iter)->iov_offset)
#define iter_iov_len(iter)        (iter_iov(iter)->iov_len - (iter)->iov_offset)

/* Report the type tag stored in @i as an enum iter_type value. */
static inline enum iter_type iov_iter_type(const struct iov_iter *i)
{
        return (enum iter_type)i->iter_type;
}

static inline void iov_iter_save_state(struct iov_iter *iter,
                                       struct iov_iter_state *state)
{
        state->iov_offset = iter->iov_offset;
        state->count = iter->count;
        state->nr_segs = iter->nr_segs;
}

/* True when @i is an ITER_UBUF iterator. */
static inline bool iter_is_ubuf(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_UBUF:
                return true;
        default:
                return false;
        }
}

/* True when @i is an ITER_IOVEC iterator. */
static inline bool iter_is_iovec(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_IOVEC:
                return true;
        default:
                return false;
        }
}

/* True when @i is an ITER_KVEC iterator. */
static inline bool iov_iter_is_kvec(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_KVEC:
                return true;
        default:
                return false;
        }
}

/* True when @i is an ITER_BVEC iterator. */
static inline bool iov_iter_is_bvec(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_BVEC:
                return true;
        default:
                return false;
        }
}

/* True when @i is an ITER_DISCARD iterator. */
static inline bool iov_iter_is_discard(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_DISCARD:
                return true;
        default:
                return false;
        }
}

/* True when @i is an ITER_FOLIOQ iterator. */
static inline bool iov_iter_is_folioq(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_FOLIOQ:
                return true;
        default:
                return false;
        }
}

/* True when @i is an ITER_XARRAY iterator. */
static inline bool iov_iter_is_xarray(const struct iov_iter *i)
{
        switch (iov_iter_type(i)) {
        case ITER_XARRAY:
                return true;
        default:
                return false;
        }
}

/* Translate the iterator's data_source flag into WRITE (set) or READ (clear). */
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
        if (i->data_source)
                return WRITE;
        return READ;
}

/* True for the two user-backed iterator types: ITER_UBUF and ITER_IOVEC. */
static inline bool user_backed_iter(const struct iov_iter *i)
{
        if (iter_is_ubuf(i))
                return true;
        return iter_is_iovec(i);
}

/*
 * Total number of bytes covered by an iovec.
 *
 * NOTE that it is not safe to use this function until all the iovec's
 * segment lengths have been validated, because the individual lengths can
 * overflow a size_t when added together.
 */
static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs)
{
        size_t total = 0;

        /* Walk the array front to back, adding up each segment's length. */
        while (nr_segs--)
                total += (iov++)->iov_len;
        return total;
}

size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
                                  size_t bytes, struct iov_iter *i);
void iov_iter_advance(struct iov_iter *i, size_t bytes);
void iov_iter_revert(struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
size_t iov_iter_single_seg_count(const struct iov_iter *i);
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i);

size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i);
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i);

/* Folio wrapper around copy_page_to_iter(): passes the folio's page member. */
static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_to_iter(page, offset, bytes, i);
}

/* Folio wrapper around copy_page_from_iter(): passes the folio's page member. */
static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset,
                                          size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_from_iter(page, offset, bytes, i);
}

/*
 * Folio wrapper around copy_page_from_iter_atomic(): passes the folio's
 * page member.
 */
static inline size_t copy_folio_from_iter_atomic(struct folio *folio,
                size_t offset, size_t bytes, struct iov_iter *i)
{
        struct page *page = &folio->page;

        return copy_page_from_iter_atomic(page, offset, bytes, i);
}

size_t copy_page_to_iter_nofault(struct page *page, unsigned offset,
                                 size_t bytes, struct iov_iter *i);

/*
 * Copy @bytes from @addr into @i after check_copy_size() has accepted the
 * source buffer; returns 0 without copying when that check fails.
 */
static __always_inline __must_check
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, true))
                return 0;
        return _copy_to_iter(addr, bytes, i);
}

/*
 * Copy @bytes from @i into @addr after check_copy_size() has accepted the
 * destination buffer; returns 0 without copying when that check fails.
 */
static __always_inline __must_check
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;
        return _copy_from_iter(addr, bytes, i);
}

/*
 * All-or-nothing variant of copy_to_iter(): on a short copy the iterator is
 * reverted by the amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_to_iter(addr, bytes, i);
        const bool complete = likely(done == bytes);

        if (!complete)
                iov_iter_revert(i, done);
        return complete;
}

/*
 * All-or-nothing variant of copy_from_iter(): on a short copy the iterator
 * is reverted by the amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter(addr, bytes, i);
        const bool complete = likely(done == bytes);

        if (!complete)
                iov_iter_revert(i, done);
        return complete;
}

/*
 * Same shape as copy_from_iter(), but the copy itself is delegated to
 * _copy_from_iter_nocache(); returns 0 when check_copy_size() rejects @addr.
 */
static __always_inline __must_check
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        if (!check_copy_size(addr, bytes, false))
                return 0;
        return _copy_from_iter_nocache(addr, bytes, i);
}

/*
 * All-or-nothing variant of copy_from_iter_nocache(): on a short copy the
 * iterator is reverted by the amount that was copied and false is returned.
 */
static __always_inline __must_check
bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        const size_t done = copy_from_iter_nocache(addr, bytes, i);
        const bool complete = likely(done == bytes);

        if (!complete)
                iov_iter_revert(i, done);
        return complete;
}

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Note, users like pmem that depend on the stricter semantics of
 * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for
 * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the
 * destination is flushed from the cache on return.
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_from_iter_flushcache _copy_from_iter_nocache
#endif

#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
#else
#define _copy_mc_to_iter _copy_to_iter
#endif

size_t iov_iter_zero(size_t bytes, struct iov_iter *);
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                        unsigned len_mask);
unsigned long iov_iter_alignment(const struct iov_iter *i);
unsigned long iov_iter_gap_alignment(const struct iov_iter *i);
void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov,
                        unsigned long nr_segs, size_t count);
void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
                        unsigned long nr_segs, size_t count);
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
                          const struct folio_queue *folioq,
                          unsigned int first_slot, unsigned int offset, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
                     loff_t start, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                        size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
                        size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state);

const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);

/* Return the iterator's count field. */
static inline size_t iov_iter_count(const struct iov_iter *i)
{
        const size_t remaining = i->count;

        return remaining;
}

/*
 * Cap the iov_iter by given limit; note that the second argument is
 * *not* the new size - it's upper limit for such.  Passing it a value
 * greater than the amount of data in iov_iter is fine - it'll just do
 * nothing in that case.
 */
static inline void iov_iter_truncate(struct iov_iter *i, u64 count)
{
        /*
         * The limit need not fit in a size_t: the comparison is carried out
         * in u64, so any limit that the assignment would truncate is by
         * definition larger than every size_t value, the current i->count
         * included, and therefore never reaches the store.
         */
        if (count < i->count)
                i->count = count;
}

/*
 * reexpand a previously truncated iterator; count must be no more than how much
 * we had shrunk it.
 */
static inline void iov_iter_reexpand(struct iov_iter *i, size_t count)
{
        /* Plain store: the new count is not validated here. */
        i->count = count;
}

/*
 * Call iov_iter_npages() with the iterator temporarily limited to at most
 * @max_bytes.  If the iterator had to be truncated, the bytes cut off are
 * added back to its count before returning.
 */
static inline int
iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes)
{
        const size_t avail = iov_iter_count(i);
        const size_t excess = avail > max_bytes ? avail - max_bytes : 0;
        int npages;

        if (excess)
                iov_iter_truncate(i, max_bytes);
        npages = iov_iter_npages(i, maxpages);
        if (excess)
                iov_iter_reexpand(i, iov_iter_count(i) + excess);

        return npages;
}

struct iovec *iovec_from_user(const struct iovec __user *uvector,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat);
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i);
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat);
int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i);

/*
 * Initialise @i as an ITER_UBUF iterator over the user buffer @buf of @count
 * bytes.  Members not named in the initializer are zeroed.
 */
static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
                        void __user *buf, size_t count)
{
        const struct iov_iter fresh = {
                .iter_type = ITER_UBUF,
                .data_source = direction,
                .ubuf = buf,
                .count = count,
                .nr_segs = 1
        };

        /* Warn about any direction bit other than READ/WRITE. */
        WARN_ON(direction & ~(READ | WRITE));
        *i = fresh;
}
/* Flags for iov_iter_get/extract_pages*() */
/* Allow P2PDMA on the extracted pages */
#define ITER_ALLOW_P2PDMA        ((__force iov_iter_extraction_t)0x01)

ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages,
                               size_t maxsize, unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0);

/**
 * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained
 * @iter: The iterator
 *
 * Examine the iterator and indicate by returning true or false as to how, if
 * at all, pages extracted from the iterator will be retained by the extraction
 * function.
 *
 * %true indicates that the pages will have a pin placed in them that the
 * caller must unpin.  This must be done for DMA/async DIO to force fork()
 * to forcibly copy a page for the child (the parent must retain the original
 * page).
 *
 * %false indicates that no measures are taken and that it's up to the caller
 * to retain the pages.
 */
static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter)
{
        /* The answer is simply whether the iterator is user-backed. */
        const bool user_backed = user_backed_iter(iter);

        return user_backed;
}

struct sg_table;
ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len,
                           struct sg_table *sgtable, unsigned int sg_max,
                           iov_iter_extraction_t extraction_flags);

#endif














    4 











    4 



    3 

    4 



    1 



    1 











    2 




    2 













    1 





    7 










    7 

    1 








    4 

    4 
    1 






    3 



    1 





    2 






    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
// SPDX-License-Identifier: GPL-2.0
// Copyright (C) 2019 Arm Ltd.

#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>

#include <asm/kvm_mmu.h>
#include <asm/pvclock-abi.h>

#include <kvm/arm_hypercalls.h>

/*
 * Add the run_delay accumulated since the previous update to the guest's
 * stolen_time field.  Does nothing when the vCPU has no stolen-time base
 * address (INVALID_GPA).
 */
void kvm_update_stolen_time(struct kvm_vcpu *vcpu)
{
        const u64 field = offsetof(struct pvclock_vcpu_stolen_time, stolen_time);
        const u64 prev_delay = vcpu->arch.steal.last_steal;
        const u64 base = vcpu->arch.steal.base;
        struct kvm *kvm = vcpu->kvm;
        u64 steal = 0;
        int srcu_idx;

        if (base == INVALID_GPA)
                return;

        srcu_idx = srcu_read_lock(&kvm->srcu);
        /* Write back only if the current guest value could be read. */
        if (kvm_get_guest(kvm, base + field, steal) == 0) {
                const u64 cur_delay = READ_ONCE(current->sched_info.run_delay);

                vcpu->arch.steal.last_steal = cur_delay;
                /* The guest field is little-endian; convert around the add. */
                steal = le64_to_cpu(steal) + (cur_delay - prev_delay);
                kvm_put_guest(kvm, base + field, cpu_to_le64(steal));
        }
        srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/*
 * Answer a PV feature query whose feature ID is SMCCC argument 1.  The two
 * PV-time IDs report success only when the vCPU has a stolen-time base
 * address; everything else is SMCCC_RET_NOT_SUPPORTED.
 */
long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu)
{
        const u32 feature = smccc_get_arg1(vcpu);

        if (feature != ARM_SMCCC_HV_PV_TIME_FEATURES &&
            feature != ARM_SMCCC_HV_PV_TIME_ST)
                return SMCCC_RET_NOT_SUPPORTED;

        if (vcpu->arch.steal.base == INVALID_GPA)
                return SMCCC_RET_NOT_SUPPORTED;

        return SMCCC_RET_SUCCESS;
}

gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu)
{
        struct pvclock_vcpu_stolen_time init_values = {};
        struct kvm *kvm = vcpu->kvm;
        u64 base = vcpu->arch.steal.base;

        if (base == INVALID_GPA)
                return base;

        /*
         * Start counting stolen time from the time the guest requests
         * the feature enabled.
         */
        vcpu->arch.steal.last_steal = current->sched_info.run_delay;
        kvm_write_guest_lock(kvm, base, &init_values, sizeof(init_values));

        return base;
}

/* PV time is reported as supported exactly when sched_info_on() is non-zero. */
bool kvm_arm_pvtime_supported(void)
{
        return sched_info_on() != 0;
}

int kvm_arm_pvtime_set_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr)
{
        u64 __user *user = (u64 __user *)attr->addr;
        struct kvm *kvm = vcpu->kvm;
        u64 ipa;
        int ret = 0;
        int idx;

        if (!kvm_arm_pvtime_supported() ||
            attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
                return -ENXIO;

        if (get_user(ipa, user))
                return -EFAULT;
        if (!IS_ALIGNED(ipa, 64))
                return -EINVAL;
        if (vcpu->arch.steal.base != INVALID_GPA)
                return -EEXIST;

        /* Check the address is in a valid memslot */
        idx = srcu_read_lock(&kvm->srcu);
        if (kvm_is_error_hva(gfn_to_hva(kvm, ipa >> PAGE_SHIFT)))
                ret = -EINVAL;
        srcu_read_unlock(&kvm->srcu, idx);

        if (!ret)
                vcpu->arch.steal.base = ipa;

        return ret;
}

int kvm_arm_pvtime_get_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr)
{
        u64 __user *user = (u64 __user *)attr->addr;
        u64 ipa;

        if (!kvm_arm_pvtime_supported() ||
            attr->attr != KVM_ARM_VCPU_PVTIME_IPA)
                return -ENXIO;

        ipa = vcpu->arch.steal.base;

        if (put_user(ipa, user))
                return -EFAULT;
        return 0;
}

/*
 * Return 0 when @attr names KVM_ARM_VCPU_PVTIME_IPA and PV time is
 * supported, -ENXIO otherwise.
 */
int kvm_arm_pvtime_has_attr(struct kvm_vcpu *vcpu,
                            struct kvm_device_attr *attr)
{
        if (attr->attr == KVM_ARM_VCPU_PVTIME_IPA &&
            kvm_arm_pvtime_supported())
                return 0;

        return -ENXIO;
}






















































































































    3 











    2 
    2 

    2 





































































































































































































































































































    3 





    5 






















    1 





















    4 



    1 

    1 










    1 





    4 












    3 
    4 






    4 
    2 













































    3 
    3 

    3 





















    3 


    1 
    2 




















    6 



    1 






    3 





    3 




    1 







    1 
































































































































































































































    9 




































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
static unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
static unsigned long pipe_user_pages_hard;
static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally.  This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

/*
 * Three-way compare: evaluates to -1, 0 or 1 when l is less than, equal to
 * or greater than r.  Both arguments are parenthesized so that an argument
 * containing a lower-precedence operator is compared as a whole.  Each
 * argument is still evaluated twice, so avoid side effects.
 */
#define cmp_int(l, r)                (((l) > (r)) - ((l) < (r)))

#ifdef CONFIG_PROVE_LOCKING
/* Order two lockdep maps by their addresses, returning -1, 0 or 1. */
static int pipe_lock_cmp_fn(const struct lockdep_map *a,
                            const struct lockdep_map *b)
{
        const unsigned long addr_a = (unsigned long) a;
        const unsigned long addr_b = (unsigned long) b;

        return cmp_int(addr_a, addr_b);
}
#endif

/* Take the pipe's mutex, but only when pipe->files is non-zero. */
void pipe_lock(struct pipe_inode_info *pipe)
{
        if (!pipe->files)
                return;

        mutex_lock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_lock);

/* Release the pipe's mutex, but only when pipe->files is non-zero. */
void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (!pipe->files)
                return;

        mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

/*
 * Lock two distinct pipes.  The pipe at the lower address is always locked
 * first, so the order does not depend on which argument is which.
 */
void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        BUG_ON(pipe1 == pipe2);

        if (pipe1 < pipe2) {
                pipe_lock(pipe1);
                pipe_lock(pipe2);
        } else {
                pipe_lock(pipe2);
                pipe_lock(pipe1);
        }
}

/*
 * Hand out a page for a pipe buffer: the first page cached in pipe->tmp_page
 * if there is one, otherwise a newly allocated page.
 */
static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe)
{
        struct page **slot = pipe->tmp_page;
        struct page **const end = slot + ARRAY_SIZE(pipe->tmp_page);

        for (; slot < end; slot++) {
                struct page *cached = *slot;

                if (!cached)
                        continue;
                /* Empty the slot so the page is handed out only once. */
                *slot = NULL;
                return cached;
        }

        return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
}

/*
 * Give a page back: if its page count is exactly 1 and a tmp_page slot is
 * free, cache it there; otherwise drop the reference with put_page().
 */
static void anon_pipe_put_page(struct pipe_inode_info *pipe,
                               struct page *page)
{
        if (page_count(page) == 1) {
                struct page **slot = pipe->tmp_page;
                struct page **const end = slot + ARRAY_SIZE(pipe->tmp_page);

                for (; slot < end; slot++) {
                        if (*slot)
                                continue;
                        *slot = page;
                        return;
                }
        }

        put_page(page);
}

/* Release callback for anonymous pipe buffers: hands buf->page back. */
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        anon_pipe_put_page(pipe, buf->page);
}

/*
 * Try to take ownership of an anonymous pipe buffer's page.  This succeeds
 * only when the page count is exactly 1; the page is then uncharged and
 * marked locked before true is returned.
 */
static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        if (page_count(page) == 1) {
                memcg_kmem_uncharge_page(page, 0);
                __SetPageLocked(page);
                return true;
        }

        return false;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *        This function attempts to steal the &struct page attached to
 *        @buf. If successful, this function returns true and returns with
 *        the page locked. The caller may then reuse the page for whatever
 *        he wishes; the typical use is insertion into a different file
 *        page cache.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
                struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * A reference of one is golden, that means that the owner of this
         * page is the only one holding a reference to it. lock the page
         * and return OK.
         */
        if (page_count(page) == 1) {
                lock_page(page);
                return true;
        }
        return false;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *        This function tries to grab an extra reference to the page
 *        backing @buf and returns the result of try_get_page(). It's
 *        used in the tee() system call, when we duplicate the buffers
 *        in one pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        return try_get_page(page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:        the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *        This function drops one reference on the page backing @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        put_page(page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

/*
 * Buffer operations installed on buffers created by anon_pipe_write():
 * taking a reference uses the generic helper, while stealing and
 * releasing use the anon-pipe specific variants.
 */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .get                = generic_pipe_buf_get,
        .try_steal        = anon_pipe_buf_try_steal,
        .release        = anon_pipe_buf_release,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
        union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
        unsigned int writers = READ_ONCE(pipe->writers);

        return !pipe_empty(idx.head, idx.tail) || !writers;
}

/*
 * Release the buffer at the tail of the ring and advance pipe->tail by one.
 *
 * @pipe: the pipe being consumed from
 * @buf:  the buffer to release (passed to pipe_buf_release())
 * @tail: the tail index the caller is currently working with
 *
 * Returns the incremented tail, which is also what gets stored in
 * pipe->tail.
 */
static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe,
                                            struct pipe_buffer *buf,
                                            unsigned int tail)
{
        pipe_buf_release(pipe, buf);

        /*
         * If the pipe has a watch_queue, we need additional protection
         * by the spinlock because notifications get posted with only
         * this spinlock, no mutex
         */
        if (pipe_has_watch_queue(pipe)) {
                spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
                /* Remember that a loss was flagged on the consumed buffer */
                if (buf->flags & PIPE_BUF_FLAG_LOSS)
                        pipe->note_loss = true;
#endif
                pipe->tail = ++tail;
                spin_unlock_irq(&pipe->rd_wait.lock);
                return tail;
        }

        /*
         * Without a watch_queue, we can simply increment the tail
         * without the spinlock - the mutex is enough.
         */
        pipe->tail = ++tail;
        return tail;
}

/*
 * Read data from a pipe into the iterator @to.
 *
 * @iocb: supplies the file (iocb->ki_filp) and IOCB_NOWAIT via ki_flags
 * @to:   destination iterator; its count is the requested length
 *
 * Returns the number of bytes copied, or 0 for a zero-length request or
 * when the pipe is empty and has no writers.  If nothing was copied, an
 * error is returned instead:
 *   -EAGAIN       pipe empty and O_NONBLOCK / IOCB_NOWAIT is set
 *   -ERESTARTSYS  interrupted while sleeping for data
 *   -EFAULT       copying to @to failed
 *   -ENOBUFS      a PIPE_BUF_FLAG_WHOLE buffer (or a pending loss
 *                 notification) does not fit in the remaining space
 *   or whatever pipe_buf_confirm() reported.
 */
static ssize_t
anon_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
        size_t total_len = iov_iter_count(to);
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        bool wake_writer = false, wake_next_reader = false;
        ssize_t ret;

        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        ret = 0;
        mutex_lock(&pipe->mutex);

        /*
         * We only wake up writers if the pipe was full when we started reading
         * and it is no longer full after reading to avoid unnecessary wakeups.
         *
         * But when we do wake up writers, we do so using a sync wakeup
         * (WF_SYNC), because we want them to get going and generate more
         * data for us.
         */
        for (;;) {
                /* Read ->head with a barrier vs post_one_notification() */
                unsigned int head = smp_load_acquire(&pipe->head);
                unsigned int tail = pipe->tail;

#ifdef CONFIG_WATCH_QUEUE
                /* Deliver a pending "notifications were lost" record first */
                if (pipe->note_loss) {
                        struct watch_notification n;

                        /* NOTE(review): 8 presumably equals sizeof(n) - confirm */
                        if (total_len < 8) {
                                if (ret == 0)
                                        ret = -ENOBUFS;
                                break;
                        }

                        n.type = WATCH_TYPE_META;
                        n.subtype = WATCH_META_LOSS_NOTIFICATION;
                        n.info = watch_sizeof(n);
                        if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
                                if (ret == 0)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += sizeof(n);
                        total_len -= sizeof(n);
                        pipe->note_loss = false;
                }
#endif

                if (!pipe_empty(head, tail)) {
                        struct pipe_buffer *buf = pipe_buf(pipe, tail);
                        size_t chars = buf->len;
                        size_t written;
                        int error;

                        /*
                         * Buffer holds more than was asked for: a WHOLE
                         * buffer may not be split, anything else is
                         * truncated to the remaining request.
                         */
                        if (chars > total_len) {
                                if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
                                        if (ret == 0)
                                                ret = -ENOBUFS;
                                        break;
                                }
                                chars = total_len;
                        }

                        error = pipe_buf_confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        written = copy_page_to_iter(buf->page, buf->offset, chars, to);
                        if (unlikely(written < chars)) {
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;

                        /* Was it a packet buffer? Clean up and exit */
                        if (buf->flags & PIPE_BUF_FLAG_PACKET) {
                                /* Makes total_len hit zero below, ending the read */
                                total_len = chars;
                                buf->len = 0;
                        }

                        /* Buffer fully consumed: release it and advance the tail */
                        if (!buf->len) {
                                wake_writer |= pipe_full(head, tail, pipe->max_usage);
                                tail = pipe_update_tail(pipe, buf, tail);
                        }
                        total_len -= chars;
                        if (!total_len)
                                break;        /* common path: read succeeded */
                        if (!pipe_empty(head, tail))        /* More to do? */
                                continue;
                }

                /* Pipe is empty here: decide between returning and sleeping */
                if (!pipe->writers)
                        break;
                if (ret)
                        break;
                if ((filp->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        ret = -EAGAIN;
                        break;
                }
                mutex_unlock(&pipe->mutex);
                /*
                 * We only get here if we didn't actually read anything.
                 *
                 * But because we didn't read anything, at this point we can
                 * just return directly with -ERESTARTSYS if we're interrupted,
                 * since we've done any required wakeups and there's no need
                 * to mark anything accessed. And we've dropped the lock.
                 */
                if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
                        return -ERESTARTSYS;

                /*
                 * We slept as an exclusive waiter, so pass the wakeup on
                 * unless the pipe turns out to be empty when we finish.
                 */
                wake_next_reader = true;
                mutex_lock(&pipe->mutex);
        }
        if (pipe_is_empty(pipe))
                wake_next_reader = false;
        mutex_unlock(&pipe->mutex);

        if (wake_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        if (wake_next_reader)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        return ret;
}

/*
 * Read handler for FIFOs: performs the common anon_pipe_read() and, when
 * at least one byte was transferred, marks the file as accessed.
 *
 * Returns whatever anon_pipe_read() returned.
 */
static ssize_t
fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
        /*
         * Keep the result in ssize_t: both anon_pipe_read() and this
         * function return ssize_t, so an int local would narrow the value
         * only to widen it again on return.
         */
        ssize_t ret = anon_pipe_read(iocb, to);

        if (ret > 0)
                file_accessed(iocb->ki_filp);
        return ret;
}

/* Returns 1 when @file was opened with O_DIRECT (packet mode), else 0. */
static inline int is_packetized(struct file *file)
{
        return !!(file->f_flags & O_DIRECT);
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
        union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) };
        unsigned int max_usage = READ_ONCE(pipe->max_usage);

        return !pipe_full(idx.head, idx.tail, max_usage) ||
                !READ_ONCE(pipe->readers);
}

/*
 * Write the contents of the iterator @from into a pipe.
 *
 * @iocb: supplies the file (iocb->ki_filp) and IOCB_NOWAIT via ki_flags
 * @from: source iterator; its count is the requested length
 *
 * Returns the number of bytes queued, or 0 for a zero-length request.
 * Error returns visible here:
 *   -EXDEV        the pipe has a watch queue
 *   -EPIPE        no readers (SIGPIPE is also sent to current)
 *   -EAGAIN       pipe full and O_NONBLOCK / IOCB_NOWAIT is set
 *   -ERESTARTSYS  a signal is pending when the writer would have to wait
 *   -ENOMEM       no page could be obtained
 *   -EFAULT       copying from @from failed
 *   or whatever pipe_buf_confirm() reported for the merge attempt.
 * Inside the main loop an error is only reported when nothing has been
 * written yet; otherwise the byte count so far is returned.
 */
static ssize_t
anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *filp = iocb->ki_filp;
        struct pipe_inode_info *pipe = filp->private_data;
        unsigned int head;
        ssize_t ret = 0;
        size_t total_len = iov_iter_count(from);
        ssize_t chars;
        bool was_empty = false;
        bool wake_next_writer = false;

        /*
         * Reject writing to watch queue pipes before the point where we lock
         * the pipe.
         * Otherwise, lockdep would be unhappy if the caller already has another
         * pipe locked.
         * If we had to support locking a normal pipe and a notification pipe at
         * the same time, we could set up lockdep annotations for that, but
         * since we don't actually need that, it's simpler to just bail here.
         */
        if (pipe_has_watch_queue(pipe))
                return -EXDEV;

        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        mutex_lock(&pipe->mutex);

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        /*
         * If it wasn't empty we try to merge new data into
         * the last buffer.
         *
         * That naturally merges small writes, but it also
         * page-aligns the rest of the writes for large writes
         * spanning multiple pages.
         */
        head = pipe->head;
        was_empty = pipe_empty(head, pipe->tail);
        /* Sub-page remainder of the request: the part eligible for merging */
        chars = total_len & (PAGE_SIZE-1);
        if (chars && !was_empty) {
                struct pipe_buffer *buf = pipe_buf(pipe, head - 1);
                int offset = buf->offset + buf->len;

                if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
                    offset + chars <= PAGE_SIZE) {
                        ret = pipe_buf_confirm(pipe, buf);
                        if (ret)
                                goto out;

                        ret = copy_page_from_iter(buf->page, offset, chars, from);
                        if (unlikely(ret < chars)) {
                                ret = -EFAULT;
                                goto out;
                        }

                        buf->len += ret;
                        if (!iov_iter_count(from))
                                goto out;
                }
        }

        for (;;) {
                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }

                head = pipe->head;
                if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
                        struct pipe_buffer *buf;
                        struct page *page;
                        int copied;

                        page = anon_pipe_get_page(pipe);
                        if (unlikely(!page)) {
                                if (!ret)
                                        ret = -ENOMEM;
                                break;
                        }

                        copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
                        /* A short copy with input still pending means the copy faulted */
                        if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
                                anon_pipe_put_page(pipe, page);
                                if (!ret)
                                        ret = -EFAULT;
                                break;
                        }

                        pipe->head = head + 1;
                        /* Insert it into the buffer array */
                        buf = pipe_buf(pipe, head);
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        if (is_packetized(filp))
                                buf->flags = PIPE_BUF_FLAG_PACKET;
                        else
                                buf->flags = PIPE_BUF_FLAG_CAN_MERGE;

                        buf->len = copied;
                        ret += copied;

                        if (!iov_iter_count(from))
                                break;

                        continue;
                }

                /* Wait for buffer space to become available. */
                if ((filp->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }

                /*
                 * We're going to release the pipe lock and wait for more
                 * space. We wake up any readers if necessary, and then
                 * after waiting we need to re-check whether the pipe
                 * became empty while we dropped the lock.
                 */
                mutex_unlock(&pipe->mutex);
                if (was_empty)
                        wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
                mutex_lock(&pipe->mutex);
                was_empty = pipe_is_empty(pipe);
                /*
                 * We slept as an exclusive waiter, so pass the wakeup on
                 * unless the pipe is full again when we finish.
                 */
                wake_next_writer = true;
        }
out:
        if (pipe_is_full(pipe))
                wake_next_writer = false;
        mutex_unlock(&pipe->mutex);

        /*
         * If we do do a wakeup event, we do a 'sync' wakeup, because we
         * want the reader to start processing things asap, rather than
         * leave the data pending.
         *
         * This is particularly important for small writes, because of
         * how (for example) the GNU make jobserver uses small writes to
         * wake up pending jobs
         *
         * Epoll nonsensically wants a wakeup whether the pipe
         * was already empty or not.
         */
        if (was_empty || pipe->poll_usage)
                wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        if (wake_next_writer)
                wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
        return ret;
}

/*
 * Write handler for FIFOs: performs the common anon_pipe_write() and, when
 * at least one byte was written, tries to update the file times.
 *
 * The time update is skipped when sb_start_write_trylock() fails.  If
 * file_update_time() reports an error, that error replaces the byte count
 * as the return value.
 */
static ssize_t
fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
        /*
         * Keep the result in ssize_t: both anon_pipe_write() and this
         * function return ssize_t, so an int local would narrow the value
         * only to widen it again on return.
         */
        ssize_t ret = anon_pipe_write(iocb, from);

        if (ret > 0) {
                struct file *filp = iocb->ki_filp;

                if (sb_start_write_trylock(file_inode(filp)->i_sb)) {
                        int err = file_update_time(filp);

                        if (err)
                                ret = err;
                        sb_end_write(file_inode(filp)->i_sb);
                }
        }
        return ret;
}

/*
 * ioctl handler for pipes.
 *
 * FIONREAD sums the lengths of all occupied buffers under the pipe mutex
 * and stores the total to the user int at @arg.  With CONFIG_WATCH_QUEUE
 * the two watch-queue commands are forwarded to their helpers.  Any other
 * command yields -ENOIOCTLCMD.
 */
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe = filp->private_data;

        if (cmd == FIONREAD) {
                unsigned int nbytes = 0;
                unsigned int head, slot;

                mutex_lock(&pipe->mutex);
                head = pipe->head;
                for (slot = pipe->tail; !pipe_empty(head, slot); slot++)
                        nbytes += pipe_buf(pipe, slot)->len;
                mutex_unlock(&pipe->mutex);

                return put_user(nbytes, (int __user *)arg);
        }

#ifdef CONFIG_WATCH_QUEUE
        if (cmd == IOC_WATCH_QUEUE_SET_SIZE) {
                int ret;

                mutex_lock(&pipe->mutex);
                ret = watch_queue_set_size(pipe, arg);
                mutex_unlock(&pipe->mutex);
                return ret;
        }

        if (cmd == IOC_WATCH_QUEUE_SET_FILTER)
                return watch_queue_set_filter(
                        pipe, (struct watch_notification_filter __user *)arg);
#endif

        return -ENOIOCTLCMD;
}

/*
 * poll() handler for pipes.  No kernel lock held - fine.
 *
 * Registers on rd_wait and/or wr_wait according to the file's open mode,
 * then computes the event mask from a lockless snapshot of head/tail:
 *   EPOLLIN | EPOLLRDNORM   readable end, pipe not empty
 *   EPOLLHUP                readable end, no writers and f_pipe differs
 *                           from pipe->w_counter
 *   EPOLLOUT | EPOLLWRNORM  writable end, pipe not full
 *   EPOLLERR                writable end, no readers
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
        __poll_t mask;
        struct pipe_inode_info *pipe = filp->private_data;
        union pipe_index idx;

        /* Epoll has some historical nasty semantics, this enables them */
        WRITE_ONCE(pipe->poll_usage, true);

        /*
         * Reading pipe state only -- no need for acquiring the pipe mutex.
         *
         * But because this is racy, the code has to add the
         * entry to the poll table _first_ ..
         */
        if (filp->f_mode & FMODE_READ)
                poll_wait(filp, &pipe->rd_wait, wait);
        if (filp->f_mode & FMODE_WRITE)
                poll_wait(filp, &pipe->wr_wait, wait);

        /*
         * .. and only then can you do the racy tests. That way,
         * if something changes and you got it wrong, the poll
         * table entry will wake you up and fix it.
         */
        idx.head_tail = READ_ONCE(pipe->head_tail);

        mask = 0;
        if (filp->f_mode & FMODE_READ) {
                if (!pipe_empty(idx.head, idx.tail))
                        mask |= EPOLLIN | EPOLLRDNORM;
                /*
                 * f_pipe == w_counter is how fifo_open() suppresses
                 * EPOLLHUP until a writer has been seen.
                 */
                if (!pipe->writers && filp->f_pipe != pipe->w_counter)
                        mask |= EPOLLHUP;
        }

        if (filp->f_mode & FMODE_WRITE) {
                if (!pipe_full(idx.head, idx.tail, pipe->max_usage))
                        mask |= EPOLLOUT | EPOLLWRNORM;
                /*
                 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        mask |= EPOLLERR;
        }

        return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
        int kill = 0;

        spin_lock(&inode->i_lock);
        if (!--pipe->files) {
                inode->i_pipe = NULL;
                kill = 1;
        }
        spin_unlock(&inode->i_lock);

        if (kill)
                free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
        struct pipe_inode_info *pipe = file->private_data;

        mutex_lock(&pipe->mutex);
        if (file->f_mode & FMODE_READ)
                pipe->readers--;
        if (file->f_mode & FMODE_WRITE)
                pipe->writers--;

        /* Was that the last reader or writer, but not the other side? */
        if (!pipe->readers != !pipe->writers) {
                wake_up_interruptible_all(&pipe->rd_wait);
                wake_up_interruptible_all(&pipe->wr_wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        mutex_unlock(&pipe->mutex);

        put_pipe_info(inode, pipe);
        return 0;
}

/*
 * ->fasync() for pipe files.
 *
 * Registers/unregisters @filp on the reader and/or writer fasync lists,
 * depending on the file's open mode.  If the writer-side registration
 * fails after the reader side succeeded, the reader-side entry is removed
 * again.  Returns the result of the last fasync_helper() call that
 * determined the outcome (0 if neither mode bit is set).
 */
static int
pipe_fasync(int fd, struct file *filp, int on)
{
        struct pipe_inode_info *pipe = filp->private_data;
        int retval = 0;

        mutex_lock(&pipe->mutex);

        if (filp->f_mode & FMODE_READ) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
                if (retval < 0)
                        goto unlock;
        }

        if (!(filp->f_mode & FMODE_WRITE))
                goto unlock;

        retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
        /* this can happen only if on == T */
        if (retval < 0 && (filp->f_mode & FMODE_READ))
                fasync_helper(-1, filp, 0, &pipe->fasync_readers);

unlock:
        mutex_unlock(&pipe->mutex);
        return retval;
}

/*
 * Adjust @user's pipe buffer count from @old to @new buffers and return
 * the resulting total as reported by atomic_long_add_return().
 */
unsigned long account_pipe_buffers(struct user_struct *user,
                                   unsigned long old, unsigned long new)
{
        unsigned long delta = new - old;

        return atomic_long_add_return(delta, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
        unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

        return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
        unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

        return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
        return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
        struct pipe_inode_info *pipe;
        unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
        struct user_struct *user = get_current_user();
        unsigned long user_bufs;
        unsigned int max_size = READ_ONCE(pipe_max_size);

        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
        if (pipe == NULL)
                goto out_free_uid;

        if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
                pipe_bufs = max_size >> PAGE_SHIFT;

        user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

        if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
                user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
                pipe_bufs = PIPE_MIN_DEF_BUFFERS;
        }

        if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
                goto out_revert_acct;

        pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
                             GFP_KERNEL_ACCOUNT);

        if (pipe->bufs) {
                init_waitqueue_head(&pipe->rd_wait);
                init_waitqueue_head(&pipe->wr_wait);
                pipe->r_counter = pipe->w_counter = 1;
                pipe->max_usage = pipe_bufs;
                pipe->ring_size = pipe_bufs;
                pipe->nr_accounted = pipe_bufs;
                pipe->user = user;
                mutex_init(&pipe->mutex);
                lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
                return pipe;
        }

out_revert_acct:
        (void) account_pipe_buffers(user, pipe_bufs, 0);
        kfree(pipe);
out_free_uid:
        free_uid(user);
        return NULL;
}

/*
 * Tear down a pipe: clear its watch queue (if any), undo the buffer
 * accounting, drop the user reference, release every buffer that still has
 * ops set, put the watch queue, free the cached tmp pages and finally free
 * the ring and the pipe itself.
 */
void free_pipe_info(struct pipe_inode_info *pipe)
{
        unsigned int slot;

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                watch_queue_clear(pipe->watch_queue);
#endif

        (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
        free_uid(pipe->user);

        for (slot = 0; slot < pipe->ring_size; slot++) {
                struct pipe_buffer *buf = &pipe->bufs[slot];

                if (!buf->ops)
                        continue;
                pipe_buf_release(pipe, buf);
        }

#ifdef CONFIG_WATCH_QUEUE
        if (pipe->watch_queue)
                put_watch_queue(pipe->watch_queue);
#endif

        for (slot = 0; slot < ARRAY_SIZE(pipe->tmp_page); slot++) {
                struct page *spare = pipe->tmp_page[slot];

                if (spare)
                        __free_page(spare);
        }

        kfree(pipe->bufs);
        kfree(pipe);
}

/*
 * vfsmount used for anonymous pipes: get_pipe_inode() allocates inodes
 * from its superblock and create_pipe_files() passes it to
 * alloc_file_pseudo().  NOTE(review): the initialisation is not visible in
 * this part of the file; __ro_after_init suggests it is set once during
 * boot - confirm.
 */
static struct vfsmount *pipe_mnt __ro_after_init;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(buffer, buflen, "pipe:[%lu]",
                                d_inode(dentry)->i_ino);
}

/* Dentry operations for pipe dentries: only the name generator is set. */
static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

/*
 * Declared ahead of use so get_pipe_inode() and create_pipe_files() can
 * take its address.  NOTE(review): the initialised definition is
 * presumably further down the file - confirm.
 */
static const struct file_operations pipeanon_fops;

/*
 * Create an inode on the pipe mount with a freshly allocated pipe attached.
 *
 * The pipe starts with two file references and one reader plus one writer.
 * Returns the inode, or NULL if either the inode or the pipe could not be
 * allocated.
 */
static struct inode * get_pipe_inode(void)
{
        struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
        struct pipe_inode_info *pipe;

        if (!inode)
                return NULL;

        inode->i_ino = get_next_ino();

        pipe = alloc_pipe_info();
        if (!pipe) {
                iput(inode);
                return NULL;
        }

        pipe->files = 2;
        pipe->readers = pipe->writers = 1;
        inode->i_pipe = pipe;
        inode->i_fop = &pipeanon_fops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because "mark_inode_dirty()" will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        simple_inode_init_ts(inode);

        return inode;
}

int create_pipe_files(struct file **res, int flags)
{
        struct inode *inode = get_pipe_inode();
        struct file *f;
        int error;

        if (!inode)
                return -ENFILE;

        if (flags & O_NOTIFICATION_PIPE) {
                error = watch_queue_init(inode->i_pipe);
                if (error) {
                        free_pipe_info(inode->i_pipe);
                        iput(inode);
                        return error;
                }
        }

        f = alloc_file_pseudo(inode, pipe_mnt, "",
                                O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
                                &pipeanon_fops);
        if (IS_ERR(f)) {
                free_pipe_info(inode->i_pipe);
                iput(inode);
                return PTR_ERR(f);
        }

        f->private_data = inode->i_pipe;
        f->f_pipe = 0;

        res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
                                  &pipeanon_fops);
        if (IS_ERR(res[0])) {
                put_pipe_info(inode, inode->i_pipe);
                fput(f);
                return PTR_ERR(res[0]);
        }
        res[0]->private_data = inode->i_pipe;
        res[0]->f_pipe = 0;
        res[1] = f;
        stream_open(inode, res[0]);
        stream_open(inode, res[1]);
        /*
         * Disable permission and pre-content events, but enable legacy
         * inotify events for legacy users.
         */
        file_set_fsnotify_mode(res[0], FMODE_NONOTIFY_PERM);
        file_set_fsnotify_mode(res[1], FMODE_NONOTIFY_PERM);
        return 0;
}

/*
 * Create a pipe's two files and reserve (but do not install) a descriptor
 * for each.
 *
 * @fd:    receives the reserved read (fd[0]) and write (fd[1]) descriptors
 * @files: receives the read (files[0]) and write (files[1]) files
 * @flags: only O_CLOEXEC, O_NONBLOCK, O_DIRECT and O_NOTIFICATION_PIPE
 *         are accepted
 *
 * Returns 0 on success, -EINVAL for unsupported flags, or the error from
 * file creation / descriptor reservation.  On failure nothing is left
 * reserved and both files are put.
 */
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
        int rd_fd, wr_fd;
        int error;

        if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
                return -EINVAL;

        error = create_pipe_files(files, flags);
        if (error)
                return error;

        rd_fd = get_unused_fd_flags(flags);
        if (rd_fd < 0) {
                error = rd_fd;
                goto err_put_files;
        }

        wr_fd = get_unused_fd_flags(flags);
        if (wr_fd < 0) {
                error = wr_fd;
                goto err_put_rd_fd;
        }

        audit_fd_pair(rd_fd, wr_fd);
        fd[0] = rd_fd;
        fd[1] = wr_fd;
        /* pipe groks IOCB_NOWAIT */
        files[0]->f_mode |= FMODE_NOWAIT;
        files[1]->f_mode |= FMODE_NOWAIT;
        return 0;

err_put_rd_fd:
        put_unused_fd(rd_fd);
err_put_files:
        fput(files[0]);
        fput(files[1]);
        return error;
}

/*
 * Create a pipe and install both ends into the current file table.
 *
 * @fd receives the read (fd[0]) and write (fd[1]) descriptors.  Returns 0
 * on success or the error from __do_pipe_flags().
 */
int do_pipe_flags(int *fd, int flags)
{
        struct file *files[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 *
 * Common implementation of pipe() and pipe2(): create the pipe, copy the
 * two descriptor numbers to @fildes and only then install the files.  If
 * the copy faults, the files and reserved descriptors are released and
 * -EFAULT is returned.
 */
static int do_pipe2(int __user *fildes, int flags)
{
        struct file *files[2];
        int fd[2];
        int error;

        error = __do_pipe_flags(fd, files, flags);
        if (error)
                return error;

        if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
                fput(files[0]);
                fput(files[1]);
                put_unused_fd(fd[0]);
                put_unused_fd(fd[1]);
                return -EFAULT;
        }

        fd_install(fd[0], files[0]);
        fd_install(fd[1], files[1]);
        return 0;
}

/* pipe2() system call: forwards @fildes and @flags to do_pipe2(). */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        return do_pipe2(fildes, flags);
}

/* pipe() system call: same as pipe2() with no flags. */
SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return do_pipe2(fildes, 0);
}

/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 *
 * Called with the pipe locked: the lock is dropped while sleeping
 * (interruptibly, non-exclusively) on rd_wait until pipe_readable(), and
 * re-taken before returning.  The result of the wait is not reported, so
 * the caller has to re-check the pipe state and pending signals itself.
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
        pipe_lock(pipe);
}

/*
 * Called with the pipe locked: the lock is dropped while sleeping
 * (interruptibly, non-exclusively) on wr_wait until pipe_writable(), and
 * re-taken before returning.  The result of the wait is not reported, so
 * the caller has to re-check the pipe state and pending signals itself.
 */
void pipe_wait_writable(struct pipe_inode_info *pipe)
{
        pipe_unlock(pipe);
        wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
        pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 *
 * @pipe: the pipe, locked by the caller; the lock is dropped around each
 *        sleep and held again on return
 * @cnt:  counter that is expected to change once the partner shows up
 *
 * Returns 0 once *cnt differs from its value on entry, or -ERESTARTSYS if
 * a signal became pending while *cnt was still unchanged.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
        DEFINE_WAIT(rdwait);
        int cur = *cnt;

        while (cur == *cnt) {
                prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
                pipe_unlock(pipe);
                schedule();
                finish_wait(&pipe->rd_wait, &rdwait);
                pipe_lock(pipe);
                if (signal_pending(current))
                        break;
        }
        /* A counter change wins over a pending signal */
        return cur == *cnt ? -ERESTARTSYS : 0;
}

/*
 * Wake all interruptible sleepers on rd_wait, which is the queue
 * wait_for_partner() sleeps on.
 */
static void wake_up_partner(struct pipe_inode_info *pipe)
{
        wake_up_interruptible_all(&pipe->rd_wait);
}

/*
 * ->open() handler used by both pipefifo_fops and pipeanon_fops.
 *
 * Attaches @filp to the inode's pipe_inode_info (allocating one if the inode
 * has none yet) and then updates the reader/writer counts according to the
 * open mode.  When the inode does not use pipeanon_fops and the other side is
 * missing, a blocking open sleeps in wait_for_partner() until it shows up.
 *
 * Returns 0 on success, -ENOMEM if a pipe cannot be allocated, -ENXIO for a
 * non-blocking write-only open with no readers, -ERESTARTSYS if
 * wait_for_partner() fails, or -EINVAL if neither FMODE_READ nor FMODE_WRITE
 * is set in @filp->f_mode.
 */
static int fifo_open(struct inode *inode, struct file *filp)
{
        /* True when the inode uses the anonymous-pipe file operations. */
        bool is_pipe = inode->i_fop == &pipeanon_fops;
        struct pipe_inode_info *pipe;
        int ret;

        filp->f_pipe = 0;

        spin_lock(&inode->i_lock);
        if (inode->i_pipe) {
                /* Pipe already exists: just take another ->files reference. */
                pipe = inode->i_pipe;
                pipe->files++;
                spin_unlock(&inode->i_lock);
        } else {
                /*
                 * Allocate with i_lock dropped, then recheck under the lock:
                 * another opener may have installed a pipe in the meantime.
                 */
                spin_unlock(&inode->i_lock);
                pipe = alloc_pipe_info();
                if (!pipe)
                        return -ENOMEM;
                pipe->files = 1;
                spin_lock(&inode->i_lock);
                if (unlikely(inode->i_pipe)) {
                        /* Lost the race: use the installed pipe, free ours. */
                        inode->i_pipe->files++;
                        spin_unlock(&inode->i_lock);
                        free_pipe_info(pipe);
                        pipe = inode->i_pipe;
                } else {
                        inode->i_pipe = pipe;
                        spin_unlock(&inode->i_lock);
                }
        }
        filp->private_data = pipe;
        /* OK, we have a pipe and it's pinned down */

        mutex_lock(&pipe->mutex);

        /* We can only do regular read/write on fifos */
        stream_open(inode, filp);

        switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
        case FMODE_READ:
        /*
         *  O_RDONLY
         *  POSIX.1 says that O_NONBLOCK means return with the FIFO
         *  opened, even when there is no process writing the FIFO.
         */
                pipe->r_counter++;
                /* First reader: wake anyone waiting for a partner. */
                if (pipe->readers++ == 0)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->writers) {
                        if ((filp->f_flags & O_NONBLOCK)) {
                                /* suppress EPOLLHUP until we have
                                 * seen a writer */
                                filp->f_pipe = pipe->w_counter;
                        } else {
                                if (wait_for_partner(pipe, &pipe->w_counter))
                                        goto err_rd;
                        }
                }
                break;

        case FMODE_WRITE:
        /*
         *  O_WRONLY
         *  POSIX.1 says that O_NONBLOCK means return -1 with
         *  errno=ENXIO when there is no process reading the FIFO.
         */
                ret = -ENXIO;
                if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
                        goto err;

                pipe->w_counter++;
                /* First writer: wake anyone waiting for a partner. */
                if (!pipe->writers++)
                        wake_up_partner(pipe);

                if (!is_pipe && !pipe->readers) {
                        if (wait_for_partner(pipe, &pipe->r_counter))
                                goto err_wr;
                }
                break;

        case FMODE_READ | FMODE_WRITE:
        /*
         *  O_RDWR
         *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
         *  This implementation will NEVER block on a O_RDWR open, since
         *  the process can at least talk to itself.
         */

                pipe->readers++;
                pipe->writers++;
                pipe->r_counter++;
                pipe->w_counter++;
                if (pipe->readers == 1 || pipe->writers == 1)
                        wake_up_partner(pipe);
                break;

        default:
                ret = -EINVAL;
                goto err;
        }

        /* Ok! */
        mutex_unlock(&pipe->mutex);
        return 0;

        /* Undo the reader count taken above; wake wr_wait if it hit zero. */
err_rd:
        if (!--pipe->readers)
                wake_up_interruptible(&pipe->wr_wait);
        ret = -ERESTARTSYS;
        goto err;

        /* Undo the writer count taken above; wake rd_wait if it hit zero. */
err_wr:
        if (!--pipe->writers)
                wake_up_interruptible_all(&pipe->rd_wait);
        ret = -ERESTARTSYS;
        goto err;

err:
        mutex_unlock(&pipe->mutex);

        /* Presumably drops the ->files reference taken at the top. */
        put_pipe_info(inode, pipe);
        return ret;
}

/*
 * File operations table whose read/write paths are fifo_pipe_read() and
 * fifo_pipe_write().  All other handlers are the same ones used by
 * pipeanon_fops.  get_pipe_info() accepts files using either table.
 */
const struct file_operations pipefifo_fops = {
        .open                = fifo_open,
        .read_iter        = fifo_pipe_read,
        .write_iter        = fifo_pipe_write,
        .poll                = pipe_poll,
        .unlocked_ioctl        = pipe_ioctl,
        .release        = pipe_release,
        .fasync                = pipe_fasync,
        .splice_write        = iter_file_splice_write,
};

/*
 * File operations table whose read/write paths are anon_pipe_read() and
 * anon_pipe_write().  fifo_open() compares inode->i_fop against this table
 * to decide whether the partner-waiting FIFO semantics are skipped.
 */
static const struct file_operations pipeanon_fops = {
        .open                = fifo_open,
        .read_iter        = anon_pipe_read,
        .write_iter        = anon_pipe_write,
        .poll                = pipe_poll,
        .unlocked_ioctl        = pipe_ioctl,
        .release        = pipe_release,
        .fasync                = pipe_fasync,
        .splice_write        = iter_file_splice_write,
};

/*
 * Round a requested pipe size in bytes to the size that will be used.
 * The pipe array is relied upon to hold a power-of-2 number of pages.
 *
 * Returns 0 (error) if @size exceeds 2^31, PAGE_SIZE if @size is below one
 * page, and otherwise @size rounded up to a power of two.
 */
unsigned int round_pipe_size(unsigned int size)
{
        const unsigned int max_size = 1U << 31;

        if (size > max_size)
                return 0;

        /* POSIX requires a minimum pipe size, so never go below one page */
        if (size >= PAGE_SIZE)
                return roundup_pow_of_two(size);

        return PAGE_SIZE;
}

/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 *
 * Returns 0 on success, -EINVAL if @nr_slots does not fit in pipe_index_t,
 * -ENOMEM if the new buffer array cannot be allocated, or -EBUSY as
 * described above.  On success the old buffer array is freed and waiters on
 * wr_wait are woken.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
        struct pipe_buffer *bufs;
        unsigned int head, tail, mask, n;

        /* nr_slots larger than limits of pipe->{head,tail} */
        if (unlikely(nr_slots > (pipe_index_t)-1u))
                return -EINVAL;

        /* Allocate the new, zeroed array before taking the spinlock. */
        bufs = kcalloc(nr_slots, sizeof(*bufs),
                       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
        if (unlikely(!bufs))
                return -ENOMEM;

        spin_lock_irq(&pipe->rd_wait.lock);
        /* Index mask for the old ring; assumes ring_size is a power of two. */
        mask = pipe->ring_size - 1;
        head = pipe->head;
        tail = pipe->tail;

        n = pipe_occupancy(head, tail);
        if (nr_slots < n) {
                /* Occupied slots would not fit: leave the pipe untouched. */
                spin_unlock_irq(&pipe->rd_wait.lock);
                kfree(bufs);
                return -EBUSY;
        }

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indices.
         */
        if (n > 0) {
                unsigned int h = head & mask;
                unsigned int t = tail & mask;
                if (h > t) {
                        /* Occupied slots are one contiguous run from t. */
                        memcpy(bufs, pipe->bufs + t,
                               n * sizeof(struct pipe_buffer));
                } else {
                        /*
                         * Occupied slots wrap past the end of the old array:
                         * the run [t, ring_size) goes first in the new array,
                         * followed by the run [0, h).
                         */
                        unsigned int tsize = pipe->ring_size - t;
                        if (h > 0)
                                memcpy(bufs + tsize, pipe->bufs,
                                       h * sizeof(struct pipe_buffer));
                        memcpy(bufs, pipe->bufs + t,
                               tsize * sizeof(struct pipe_buffer));
                }
        }

        /* The copied entries now occupy slots [0, n) of the new array. */
        head = n;
        tail = 0;

        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->ring_size = nr_slots;
        /* Never leave max_usage above the new ring size. */
        if (pipe->max_usage > nr_slots)
                pipe->max_usage = nr_slots;
        pipe->tail = tail;
        pipe->head = head;

        /* Without a watch queue, usage limit and accounting track the ring. */
        if (!pipe_has_watch_queue(pipe)) {
                pipe->max_usage = nr_slots;
                pipe->nr_accounted = nr_slots;
        }

        spin_unlock_irq(&pipe->rd_wait.lock);

        /* This might have made more room for writers */
        wake_up_interruptible(&pipe->wr_wait);
        return 0;
}

/*
 * Change the capacity of @pipe to @arg bytes, as rounded by
 * round_pipe_size(); a new array of pipe buffers is allocated and the
 * contents copied over by pipe_resize_ring().
 *
 * Returns the resulting pipe size in bytes on success, or a negative errno:
 * -EBUSY for a pipe with a watch queue, -EINVAL if round_pipe_size() rejects
 * @arg, -EPERM if a limit check fails, or the error from pipe_resize_ring().
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
        unsigned int size, nr_slots;
        unsigned long user_bufs;
        long ret;

        if (pipe_has_watch_queue(pipe))
                return -EBUSY;

        size = round_pipe_size(arg);
        nr_slots = size >> PAGE_SHIFT;
        if (nr_slots == 0)
                return -EINVAL;

        /*
         * Only an increase in capacity is subject to limits; decreasing is
         * always permitted, even if the user is currently over a limit.
         * First check: going beyond pipe_max_size needs CAP_SYS_RESOURCE.
         */
        if (nr_slots > pipe->max_usage) {
                if (size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
                        return -EPERM;
        }

        /* Replace the old accounted slot count with the new one. */
        user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

        /* Second check: per-user hard/soft limits for unprivileged users. */
        ret = -EPERM;
        if (nr_slots > pipe->max_usage &&
            (too_many_pipe_buffers_hard(user_bufs) ||
             too_many_pipe_buffers_soft(user_bufs)) &&
            pipe_is_unprivileged_user())
                goto revert_accounting;

        ret = pipe_resize_ring(pipe, nr_slots);
        if (ret >= 0)
                return pipe->max_usage * PAGE_SIZE;

revert_accounting:
        /* Swap the accounting back to what it was before the attempt. */
        (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
        return ret;
}

/*
 * Return the pipe behind @file, or NULL if @file is not a pipe.
 *
 * i_pipe and i_cdev share the same location, so a non-NULL ->i_pipe alone
 * does not prove this is a pipe; the file operations table is checked as
 * well.  With @for_splice set, a pipe that has a watch queue is also
 * rejected.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
        struct pipe_inode_info *pipe = file->private_data;
        bool uses_pipe_fops = file->f_op == &pipefifo_fops ||
                              file->f_op == &pipeanon_fops;

        if (!pipe || !uses_pipe_fops)
                return NULL;

        if (for_splice && pipe_has_watch_queue(pipe))
                return NULL;

        return pipe;
}

/*
 * Handle the pipe-size fcntl commands on @file.
 *
 * F_SETPIPE_SZ resizes the pipe via pipe_set_size(); F_GETPIPE_SZ reports
 * max_usage in bytes.  Both run under the pipe mutex.  Returns -EBADF if
 * @file is not a pipe and -EINVAL for any other command.
 */
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
        struct pipe_inode_info *pipe = get_pipe_info(file, false);
        long ret = -EINVAL;

        if (!pipe)
                return -EBADF;

        mutex_lock(&pipe->mutex);

        if (cmd == F_SETPIPE_SZ)
                ret = pipe_set_size(pipe, arg);
        else if (cmd == F_GETPIPE_SZ)
                ret = pipe->max_usage * PAGE_SIZE;

        mutex_unlock(&pipe->mutex);

        return ret;
}

/*
 * Superblock operations for pipefs, installed on the pseudo filesystem
 * context by pipefs_init_fs_context().
 */
static const struct super_operations pipefs_ops = {
        .destroy_inode = free_inode_nonrcu,
        .statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole file system mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
        struct pseudo_fs_context *pseudo;

        /* Set up a pseudo filesystem context tagged with PIPEFS_MAGIC. */
        pseudo = init_pseudo(fc, PIPEFS_MAGIC);
        if (pseudo) {
                pseudo->ops = &pipefs_ops;
                pseudo->dops = &pipefs_dentry_operations;
                return 0;
        }

        return -ENOMEM;
}

/*
 * Filesystem type for "pipefs".  It is registered and kern-mounted by
 * init_pipe_fs(); its context is set up by pipefs_init_fs_context().
 */
static struct file_system_type pipe_fs_type = {
        .name                = "pipefs",
        .init_fs_context = pipefs_init_fs_context,
        .kill_sb        = kill_anon_super,
};

#ifdef CONFIG_SYSCTL
/*
 * Conversion callback handed to do_proc_douintvec() by
 * proc_dopipe_max_size().
 *
 * On write, *@lvalp holds the value being written; it is rounded with
 * round_pipe_size() and stored in *@valp.  On read, *@valp is copied to
 * *@lvalp unchanged.  @data is unused.
 *
 * Returns 0 on success, or -EINVAL if the written value does not fit in an
 * unsigned int or is rejected by round_pipe_size().
 */
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
                                        unsigned int *valp,
                                        int write, void *data)
{
        if (write) {
                unsigned int val;

                /*
                 * round_pipe_size() takes an unsigned int.  Where unsigned
                 * long is wider, passing *lvalp directly would silently
                 * truncate a huge value into a small, valid-looking size,
                 * so reject anything that does not survive the conversion.
                 */
                if (*lvalp != (unsigned int)*lvalp)
                        return -EINVAL;

                val = round_pipe_size(*lvalp);
                if (val == 0)
                        return -EINVAL;

                *valp = val;
        } else {
                unsigned int val = *valp;
                *lvalp = (unsigned long) val;
        }

        return 0;
}

/*
 * proc handler for the "pipe-max-size" sysctl entry: defers to
 * do_proc_douintvec() with do_proc_dopipe_max_size_conv() as the conversion
 * callback and no private data.
 */
static int proc_dopipe_max_size(const struct ctl_table *table, int write,
                                void *buffer, size_t *lenp, loff_t *ppos)
{
        return do_proc_douintvec(table, write, buffer, lenp, ppos,
                                 do_proc_dopipe_max_size_conv, NULL);
}

/*
 * Pipe-related sysctl entries, registered under "fs" by init_pipe_fs().
 * "pipe-max-size" goes through proc_dopipe_max_size() so written values are
 * rounded; the two per-user page limits use the generic unsigned long
 * handler.
 */
static const struct ctl_table fs_pipe_sysctls[] = {
        {
                .procname        = "pipe-max-size",
                .data                = &pipe_max_size,
                .maxlen                = sizeof(pipe_max_size),
                .mode                = 0644,
                .proc_handler        = proc_dopipe_max_size,
        },
        {
                .procname        = "pipe-user-pages-hard",
                .data                = &pipe_user_pages_hard,
                .maxlen                = sizeof(pipe_user_pages_hard),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
        {
                .procname        = "pipe-user-pages-soft",
                .data                = &pipe_user_pages_soft,
                .maxlen                = sizeof(pipe_user_pages_soft),
                .mode                = 0644,
                .proc_handler        = proc_doulongvec_minmax,
        },
};
#endif

/*
 * Boot-time setup for pipefs: register the filesystem type and create the
 * kernel-internal mount stored in pipe_mnt.  If mounting fails, the
 * registration is undone.  With CONFIG_SYSCTL the pipe sysctls are
 * registered under "fs" regardless of the outcome above.
 *
 * Returns 0 on success or the error from registration/mounting.
 */
static int __init init_pipe_fs(void)
{
        int err;

        err = register_filesystem(&pipe_fs_type);
        if (err == 0) {
                pipe_mnt = kern_mount(&pipe_fs_type);
                if (IS_ERR(pipe_mnt)) {
                        /* Mount failed: back out the registration. */
                        unregister_filesystem(&pipe_fs_type);
                        err = PTR_ERR(pipe_mnt);
                }
        }

#ifdef CONFIG_SYSCTL
        register_sysctl_init("fs", fs_pipe_sysctls);
#endif

        return err;
}

fs_initcall(init_pipe_fs);








































































































































































































































































































































































































































































































































   22 



























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
/*
 *  linux/include/linux/console.h
 *
 *  Copyright (C) 1993        Hamish Macdonald
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of this archive
 * for more details.
 *
 * Changed:
 * 10-Mar-94: Arno Griffioen: Conversion for vt100 emulator port from PC LINUX
 */

#ifndef _LINUX_CONSOLE_H_
#define _LINUX_CONSOLE_H_ 1

#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/irq_work.h>
#include <linux/rculist.h>
#include <linux/rcuwait.h>
#include <linux/types.h>
#include <linux/vesa.h>

struct vc_data;
struct console_font_op;
struct console_font;
struct module;
struct tty_struct;
struct notifier_block;

/* Scroll direction; the type of the @dir argument of consw::con_scroll. */
enum con_scroll {
        SM_UP,
        SM_DOWN,
};

enum vc_intensity;

/**
 * struct consw - callbacks for consoles
 *
 * @owner:      the module to get references of when this console is used
 * @con_startup: set up the console and return its name (like VGA, EGA, ...)
 * @con_init:   initialize the console on @vc. @init is true for the very first
 *                call on this @vc.
 * @con_deinit: deinitialize the console from @vc.
 * @con_clear:  erase @count characters at [@x, @y] on @vc. @count >= 1.
 * @con_putc:   emit one character with attributes @ca to [@x, @y] on @vc.
 *                (optional -- @con_putcs would be called instead)
 * @con_putcs:  emit @count characters with attributes @s to [@xpos, @ypos] on
 *                @vc.
 * @con_cursor: enable/disable cursor depending on @enable
 * @con_scroll: move lines from @top to @bottom in direction @dir by @lines.
 *                Return true if no generic handling should be done.
 *                Invoked by csi_M and printing to the console.
 * @con_switch: notifier about the console switch; it is supposed to return
 *                true if a redraw is needed.
 * @con_blank:  blank/unblank the console. The target mode is passed in @blank.
 *                @mode_switch is set if changing from/to text/graphics. The hook
 *                is supposed to return true if a redraw is needed.
 * @con_font_set: set console @vc font to @font with height @vpitch. @flags can
 *                be %KD_FONT_FLAG_DONT_RECALC. (optional)
 * @con_font_get: fetch the current font on @vc of height @vpitch into @font.
 *                (optional)
 * @con_font_default: set default font on @vc. @name can be %NULL or font name
 *                to search for. @font can be filled back. (optional)
 * @con_resize:        resize the @vc console to @width x @height. @from_user is true
 *                when this change comes from the user space.
 * @con_set_palette: sets the palette of the console @vc to @table (optional)
 * @con_scrolldelta: the contents of the console should be scrolled by @lines.
 *                     Invoked by user. (optional)
 * @con_set_origin: set origin (see &vc_data::vc_origin) of the @vc. If not
 *                provided or returns false, the origin is set to
 *                @vc->vc_screenbuf. (optional)
 * @con_save_screen: save screen content into @vc->vc_screenbuf. Called e.g.
 *                upon entering graphics. (optional)
 * @con_build_attr: build attributes based on @color, @intensity and other
 *                parameters. The result is used for both normal and erase
 *                characters. (optional)
 * @con_invert_region: invert a region of length @count on @vc starting at @p.
 *                (optional)
 * @con_debug_enter: prepare the console for the debugger. This includes, but
 *                is not limited to, unblanking the console, loading an
 *                appropriate palette, and allowing debugger generated output.
 *                (optional)
 * @con_debug_leave: restore the console to its pre-debug state as closely as
 *                possible. (optional)
 *
 * Note that the callbacks taking a position (@con_clear, @con_putc and
 * @con_putcs) receive the y coordinate before the x coordinate.
 */
struct consw {
        struct module *owner;
        const char *(*con_startup)(void);
        void        (*con_init)(struct vc_data *vc, bool init);
        void        (*con_deinit)(struct vc_data *vc);
        void        (*con_clear)(struct vc_data *vc, unsigned int y,
                             unsigned int x, unsigned int count);
        void        (*con_putc)(struct vc_data *vc, u16 ca, unsigned int y,
                            unsigned int x);
        void        (*con_putcs)(struct vc_data *vc, const u16 *s,
                             unsigned int count, unsigned int ypos,
                             unsigned int xpos);
        void        (*con_cursor)(struct vc_data *vc, bool enable);
        bool        (*con_scroll)(struct vc_data *vc, unsigned int top,
                        unsigned int bottom, enum con_scroll dir,
                        unsigned int lines);
        bool        (*con_switch)(struct vc_data *vc);
        bool        (*con_blank)(struct vc_data *vc, enum vesa_blank_mode blank,
                             bool mode_switch);
        int        (*con_font_set)(struct vc_data *vc,
                                const struct console_font *font,
                                unsigned int vpitch, unsigned int flags);
        int        (*con_font_get)(struct vc_data *vc, struct console_font *font,
                        unsigned int vpitch);
        int        (*con_font_default)(struct vc_data *vc,
                        struct console_font *font, const char *name);
        int     (*con_resize)(struct vc_data *vc, unsigned int width,
                              unsigned int height, bool from_user);
        void        (*con_set_palette)(struct vc_data *vc,
                        const unsigned char *table);
        void        (*con_scrolldelta)(struct vc_data *vc, int lines);
        bool        (*con_set_origin)(struct vc_data *vc);
        void        (*con_save_screen)(struct vc_data *vc);
        u8        (*con_build_attr)(struct vc_data *vc, u8 color,
                        enum vc_intensity intensity,
                        bool blink, bool underline, bool reverse, bool italic);
        void        (*con_invert_region)(struct vc_data *vc, u16 *p, int count);
        void        (*con_debug_enter)(struct vc_data *vc);
        void        (*con_debug_leave)(struct vc_data *vc);
};

extern const struct consw *conswitchp;

extern const struct consw dummy_con;        /* dummy console buffer */
extern const struct consw vga_con;        /* VGA text console */
extern const struct consw newport_con;        /* SGI Newport console  */

struct screen_info;
/*
 * vgacon_register_screen() is only a real function when CONFIG_VGA_CONSOLE
 * is enabled; otherwise it is an empty inline stub, so callers need no
 * #ifdef of their own.
 */
#ifdef CONFIG_VGA_CONSOLE
void vgacon_register_screen(struct screen_info *si);
#else
static inline void vgacon_register_screen(struct screen_info *si) { }
#endif

int con_is_bound(const struct consw *csw);
int do_unregister_con_driver(const struct consw *csw);
int do_take_over_console(const struct consw *sw, int first, int last, int deflt);
void give_up_console(const struct consw *sw);
/*
 * con_debug_enter()/con_debug_leave() are only real functions when CONFIG_VT
 * is enabled; otherwise they are empty inline stubs.  Note that
 * con_debug_leave() takes no argument.
 */
#ifdef CONFIG_VT
void con_debug_enter(struct vc_data *vc);
void con_debug_leave(void);
#else
static inline void con_debug_enter(struct vc_data *vc) { }
static inline void con_debug_leave(void) { }
#endif

/*
 * The interface for a console, or any other device that wants to capture
 * console messages (printer driver?)
 */

/**
 * enum cons_flags - General console flags
 * @CON_PRINTBUFFER:        Used by newly registered consoles to avoid duplicate
 *                        output of messages that were already shown by boot
 *                        consoles or read by userspace via syslog() syscall.
 * @CON_CONSDEV:        Indicates that the console driver is backing
 *                        /dev/console.
 * @CON_ENABLED:        Indicates if a console is allowed to print records. If
 *                        false, the console also will not advance to later
 *                        records.
 * @CON_BOOT:                Marks the console driver as early console driver which
 *                        is used during boot before the real driver becomes
 *                        available. It will be automatically unregistered
 *                        when the real console driver is registered unless
 *                        "keep_bootcon" parameter is used.
 * @CON_ANYTIME:        A misnamed historical flag which tells the core code
 *                        that the legacy @console::write callback can be invoked
 *                        on a CPU which is marked OFFLINE. That is misleading as
 *                        it suggests that there is no contextual limit for
 *                        invoking the callback. The original motivation was
 *                        readiness of the per-CPU areas.
 * @CON_BRL:                Indicates a braille device which is exempt from
 *                        receiving the printk spam for obvious reasons.
 * @CON_EXTENDED:        The console supports the extended output format of
 *                        /dev/kmesg which requires a larger output buffer.
 * @CON_SUSPENDED:        Indicates if a console is suspended. If true, the
 *                        printing callbacks must not be called.
 * @CON_NBCON:                Console can operate outside of the legacy style console_lock
 *                        constraints.
 *
 * Each flag is a distinct single bit (bits 0 to 8), so flags can be combined
 * in one value.
 */
enum cons_flags {
        CON_PRINTBUFFER                = BIT(0),
        CON_CONSDEV                = BIT(1),
        CON_ENABLED                = BIT(2),
        CON_BOOT                = BIT(3),
        CON_ANYTIME                = BIT(4),
        CON_BRL                        = BIT(5),
        CON_EXTENDED                = BIT(6),
        CON_SUSPENDED                = BIT(7),
        CON_NBCON                = BIT(8),
};

/**
 * struct nbcon_state - console state for nbcon consoles
 * @atom:        Compound of the state fields for atomic operations
 *
 * @req_prio:                The priority of a handover request
 * @prio:                The priority of the current owner
 * @unsafe:                Console is busy in a non takeover region
 * @unsafe_takeover:        A hostile takeover in an unsafe state happened in the
 *                        past. The console cannot be safe until re-initialized.
 * @cpu:                The CPU on which the owner runs
 *
 * To be used for reading and preparing of the value stored in the nbcon
 * state variable @console::nbcon_state.
 *
 * The @prio and @req_prio fields are particularly important to allow
 * spin-waiting to timeout and give up without the risk of a waiter being
 * assigned the lock after giving up.
 *
 * The bit-fields overlay @atom through the anonymous union and add up to
 * 30 bits (2 + 2 + 1 + 1 + 24).
 */
struct nbcon_state {
        union {
                unsigned int        atom;
                struct {
                        unsigned int prio                :  2;
                        unsigned int req_prio                :  2;
                        unsigned int unsafe                :  1;
                        unsigned int unsafe_takeover        :  1;
                        unsigned int cpu                : 24;
                };
        };
};

/*
 * The nbcon_state struct is used to easily create and interpret values that
 * are stored in the @console::nbcon_state variable. Ensure this struct stays
 * within the size boundaries of the atomic variable's underlying type in
 * order to avoid any accidental truncation.
 */
static_assert(sizeof(struct nbcon_state) <= sizeof(int));

/**
 * enum nbcon_prio - console owner priority for nbcon consoles
 * @NBCON_PRIO_NONE:                Unused
 * @NBCON_PRIO_NORMAL:                Normal (non-emergency) usage
 * @NBCON_PRIO_EMERGENCY:        Emergency output (WARN/OOPS...)
 * @NBCON_PRIO_PANIC:                Panic output
 * @NBCON_PRIO_MAX:                The number of priority levels
 *
 * A higher priority context can takeover the console when it is
 * in the safe state. The final attempt to flush consoles in panic()
 * can be allowed to do so even in an unsafe state (Hope and pray).
 *
 * The values run from 0 (%NBCON_PRIO_NONE) to 3 (%NBCON_PRIO_PANIC), which
 * is what the 2-bit @prio and @req_prio fields of struct nbcon_state can
 * hold; %NBCON_PRIO_MAX (4) is a count, not a storable priority.
 */
enum nbcon_prio {
        NBCON_PRIO_NONE = 0,
        NBCON_PRIO_NORMAL,
        NBCON_PRIO_EMERGENCY,
        NBCON_PRIO_PANIC,
        NBCON_PRIO_MAX,
};

struct console;
struct printk_buffers;

/**
 * struct nbcon_context - Context for console acquire/release
 * @console:                        The associated console
 * @spinwait_max_us:                Limit for spin-wait acquire
 * @prio:                        Priority of the context
 * @allow_unsafe_takeover:        Allow performing takeover even if unsafe. Can
 *                                be used only with NBCON_PRIO_PANIC @prio. It
 *                                might cause a system freeze when the console
 *                                is used later.
 * @backlog:                        Ringbuffer has pending records
 * @pbufs:                        Pointer to the text buffer for this context
 * @seq:                        The sequence number to print for this context
 *
 * @allow_unsafe_takeover and @backlog are single-bit flags. The members are
 * grouped below by who sets them: the caller, emit, or acquire.
 */
struct nbcon_context {
        /* members set by caller */
        struct console                *console;
        unsigned int                spinwait_max_us;
        enum nbcon_prio                prio;
        unsigned int                allow_unsafe_takeover        : 1;

        /* members set by emit */
        unsigned int                backlog                        : 1;

        /* members set by acquire */
        struct printk_buffers        *pbufs;
        u64                        seq;
};

/**
 * struct nbcon_write_context - Context handed to the nbcon write callbacks
 * @ctxt:                The core console context
 * @outbuf:                Pointer to the text buffer for output
 * @len:                Length to write
 * @unsafe_takeover:        If a hostile takeover in an unsafe state has occurred
 *
 * @ctxt is embedded by value and annotated __private.
 */
struct nbcon_write_context {
        struct nbcon_context        __private ctxt;
        char                        *outbuf;
        unsigned int                len;
        bool                        unsafe_takeover;
};

/**
 * struct console - The console descriptor structure
 * @name:                The name of the console driver
 * @write:                Legacy write callback to output messages (Optional)
 * @read:                Read callback for console input (Optional)
 * @device:                The underlying TTY device driver (Optional)
 * @unblank:                Callback to unblank the console (Optional)
 * @setup:                Callback for initializing the console (Optional)
 * @exit:                Callback for teardown of the console (Optional)
 * @match:                Callback for matching a console (Optional)
 * @flags:                Console flags. See enum cons_flags
 * @index:                Console index, e.g. port number
 * @cflag:                TTY control mode flags
 * @ispeed:                TTY input speed
 * @ospeed:                TTY output speed
 * @seq:                Sequence number of the next ringbuffer record to print
 * @dropped:                Number of unreported dropped ringbuffer records
 * @data:                Driver private data
 * @node:                hlist node for the console list
 *
 * @nbcon_state:        State for nbcon consoles
 * @nbcon_seq:                Sequence number of the next record for nbcon to print
 * @nbcon_device_ctxt:        Context available for non-printing operations
 * @nbcon_prev_seq:        Seq num the previous nbcon owner was assigned to print
 * @pbufs:                Pointer to nbcon private buffer
 * @kthread:                Printer kthread for this console
 * @rcuwait:                RCU-safe wait object for @kthread waking
 * @irq_work:                Defer @kthread waking to IRQ work context
 */
struct console {
        /* Generic members; each is described in the kernel-doc block above */
        char                        name[16];
        void                        (*write)(struct console *co, const char *s, unsigned int count);
        int                        (*read)(struct console *co, char *s, unsigned int count);
        struct tty_driver        *(*device)(struct console *co, int *index);
        void                        (*unblank)(void);
        int                        (*setup)(struct console *co, char *options);
        int                        (*exit)(struct console *co);
        int                        (*match)(struct console *co, char *name, int idx, char *options);
        short                        flags;
        short                        index;
        int                        cflag;
        uint                        ispeed;
        uint                        ospeed;
        u64                        seq;
        unsigned long                dropped;
        void                        *data;
        struct hlist_node        node;

        /* nbcon console specific members */

        /**
         * @write_atomic:
         *
         * NBCON callback to write out text in any context. (Optional)
         *
         * This callback is called with the console already acquired. However,
         * a higher priority context is allowed to take it over by default.
         *
         * The callback must call nbcon_enter_unsafe() and nbcon_exit_unsafe()
         * around any code where the takeover is not safe, for example, when
         * manipulating the serial port registers.
         *
         * nbcon_enter_unsafe() will fail if the context has lost the console
         * ownership in the meantime. In this case, the callback is no longer
         * allowed to go forward. It must back out immediately and carefully.
         * The buffer content is also no longer trusted since it no longer
         * belongs to the context.
         *
         * The callback should allow the takeover whenever it is safe. It
         * increases the chance to see messages when the system is in trouble.
         * If the driver must reacquire ownership in order to finalize or
         * revert hardware changes, nbcon_reacquire_nobuf() can be used.
         * However, on reacquire the buffer content is no longer available. A
         * reacquire cannot be used to resume printing.
         *
         * The callback can be called from any context (including NMI).
         * Therefore it must avoid usage of any locking and instead rely
         * on the console ownership for synchronization.
         */
        void (*write_atomic)(struct console *con, struct nbcon_write_context *wctxt);

        /**
         * @write_thread:
         *
         * NBCON callback to write out text in task context.
         *
         * This callback must be called only in task context with both
         * device_lock() and the nbcon console acquired with
         * NBCON_PRIO_NORMAL.
         *
         * The same rules for console ownership verification and unsafe
         * section handling apply as with write_atomic().
         *
         * The console ownership handling is necessary for synchronization
         * against write_atomic() which is synchronized only via the context.
         *
         * The device_lock() provides the primary serialization for operations
         * on the device. It might be as relaxed (mutex)[*] or as tight
         * (disabled preemption and interrupts) as needed. It allows
         * the kthread to operate in the least restrictive mode[**].
         *
         * [*] Standalone nbcon_context_try_acquire() is not safe with
         *     the preemption enabled, see nbcon_owner_matches(). But it
         *     can be safe when always called in the preemptive context
         *     under the device_lock().
         *
         * [**] The device_lock() makes sure that nbcon_context_try_acquire()
         *      would never need to spin which is important especially with
         *      PREEMPT_RT.
         */
        void (*write_thread)(struct console *con, struct nbcon_write_context *wctxt);

        /**
         * @device_lock:
         *
         * NBCON callback to begin synchronization with driver code.
         *
         * Console drivers typically must deal with access to the hardware
         * via user input/output (such as an interactive login shell) and
         * output of kernel messages via printk() calls. This callback is
         * called by the printk-subsystem whenever it needs to synchronize
         * with hardware access by the driver. It should be implemented to
         * use whatever synchronization mechanism the driver is using for
         * itself (for example, the port lock for uart serial consoles).
         *
         * The callback is always called from task context. It may use any
         * synchronization method required by the driver.
         *
         * IMPORTANT: The callback MUST disable migration. The console driver
         *        may be using a synchronization mechanism that already takes
         *        care of this (such as spinlocks). Otherwise this function must
         *        explicitly call migrate_disable().
         *
         * The flags argument is provided as a convenience to the driver. It
         * will be passed again to device_unlock(). It can be ignored if the
         * driver does not need it.
         */
        void (*device_lock)(struct console *con, unsigned long *flags);

        /**
         * @device_unlock:
         *
         * NBCON callback to finish synchronization with driver code.
         *
         * It is the counterpart to device_lock().
         *
         * This callback is always called from task context. It must
         * appropriately re-enable migration (depending on how device_lock()
         * disabled migration).
         *
         * The flags argument is the value of the same variable that was
         * passed to device_lock().
         */
        void (*device_unlock)(struct console *con, unsigned long flags);

        /* nbcon bookkeeping; see the @nbcon_* entries in the kernel-doc block above */
        atomic_t                __private nbcon_state;
        atomic_long_t                __private nbcon_seq;
        struct nbcon_context        __private nbcon_device_ctxt;
        atomic_long_t           __private nbcon_prev_seq;

        /* Buffer and printer-thread members; see @pbufs, @kthread, @rcuwait, @irq_work above */
        struct printk_buffers        *pbufs;
        struct task_struct        *kthread;
        struct rcuwait                rcuwait;
        struct irq_work                irq_work;
};

#ifdef CONFIG_LOCKDEP
/* With CONFIG_LOCKDEP the assertion is implemented out of line. */
extern void lockdep_assert_console_list_lock_held(void);
#else
/* Without CONFIG_LOCKDEP the assertion is an empty inline stub. */
static inline void lockdep_assert_console_list_lock_held(void)
{
}
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
extern bool console_srcu_read_lock_is_held(void);
#else
/*
 * Stub used when CONFIG_DEBUG_LOCK_ALLOC is not set: it unconditionally
 * reports the console SRCU read lock as held, so checks built on it pass.
 */
static inline bool console_srcu_read_lock_is_held(void)
{
        return true;
}
#endif

/*
 * NOTE(review): presumably the int returned by console_srcu_read_lock() is
 * the cookie that must be handed back to console_srcu_read_unlock() - confirm.
 */
extern int console_srcu_read_lock(void);
extern void console_srcu_read_unlock(int cookie);

/* Annotated as acquiring/releasing console_mutex. */
extern void console_list_lock(void) __acquires(console_mutex);
extern void console_list_unlock(void) __releases(console_mutex);

/* List head walked by for_each_console() and for_each_console_srcu(). */
extern struct hlist_head console_list;

/**
 * console_srcu_read_flags - Locklessly read flags of a possibly registered
 *                                console
 * @con:        struct console pointer of console to read flags from
 *
 * Locklessly reading @con->flags provides a consistent read value because
 * there is at most one CPU modifying @con->flags and that CPU is using only
 * read-modify-write operations to do so.
 *
 * Requires console_srcu_read_lock to be held, which implies that @con might
 * be a registered console. The purpose of holding console_srcu_read_lock is
 * to guarantee that the console state is valid (CON_SUSPENDED/CON_ENABLED)
 * and that no exit/cleanup routines will run if the console is currently
 * undergoing unregistration.
 *
 * If the caller is holding the console_list_lock or it is _certain_ that
 * @con is not and will not become registered, the caller may read
 * @con->flags directly instead.
 *
 * Context: Any context.
 * Return: The current value of the @con->flags field.
 */
static inline short console_srcu_read_flags(const struct console *con)
{
        short cur_flags;

        /* Warn (once) about callers outside a console SRCU read section. */
        WARN_ON_ONCE(!console_srcu_read_lock_is_held());

        /*
         * Pairs with the WRITE_ONCE() in console_srcu_write_flags(), which
         * is how @flags of a registered console get modified.
         */
        cur_flags = data_race(READ_ONCE(con->flags));

        return cur_flags;
}

/**
 * console_srcu_write_flags - Write flags for a registered console
 * @con:        struct console pointer of console to write flags to
 * @flags:        new flags value to write
 *
 * Only use this function to write flags for registered consoles. It
 * requires holding the console_list_lock.
 *
 * Context: Any context.
 */
static inline void console_srcu_write_flags(struct console *con, short flags)
{
        /* Writers must hold the console_list_lock; checked via lockdep. */
        lockdep_assert_console_list_lock_held();

        /* This matches the READ_ONCE() in console_srcu_read_flags(). */
        WRITE_ONCE(con->flags, flags);
}

/*
 * Flavour of console_is_registered() for callers that already hold the
 * console_list_lock.
 */
static inline bool console_is_registered_locked(const struct console *con)
{
        bool on_list;

        lockdep_assert_console_list_lock_held();

        /* Registered means the console's list node is currently hashed. */
        on_list = !hlist_unhashed(&con->node);
        return on_list;
}

/**
 * console_is_registered - Check if the console is registered
 * @con:        struct console pointer of console to check
 *
 * Context: Process context. May sleep while acquiring console list lock.
 * Return: true if the console is in the console list, otherwise false.
 *
 * If false is returned for a console that was previously registered, it
 * can be assumed that the console's unregistration is fully completed,
 * including the exit() callback after console list removal.
 */
static inline bool console_is_registered(const struct console *con)
{
        bool registered;

        /* Sample the registration state under the console list lock. */
        console_list_lock();
        registered = console_is_registered_locked(con);
        console_list_unlock();

        return registered;
}

/**
 * for_each_console_srcu() - Iterator over registered consoles
 * @con:        struct console pointer used as loop cursor
 *
 * Although SRCU guarantees the console list will be consistent, the
 * struct console fields may be updated by other CPUs while iterating.
 *
 * Requires console_srcu_read_lock to be held. Can be invoked from
 * any context.
 *
 * The walk goes over console_list via each console's @node member and
 * hands console_srcu_read_lock_is_held() to hlist_for_each_entry_srcu()
 * as its lock-held condition.
 */
#define for_each_console_srcu(con)                                        \
        hlist_for_each_entry_srcu(con, &console_list, node,                \
                                  console_srcu_read_lock_is_held())

/**
 * for_each_console() - Iterator over registered consoles
 * @con:        struct console pointer used as loop cursor
 *
 * The console list and the &console.flags are immutable while iterating.
 *
 * Requires console_list_lock to be held.
 *
 * Note: this macro expands to two statements (the lockdep assertion
 * followed by the loop). Do not use it as the unbraced body of an
 * if/else or another loop: only the assertion would be governed by the
 * condition. Wrap it in braces in such places.
 */
#define for_each_console(con)                                                \
        lockdep_assert_console_list_lock_held();                        \
        hlist_for_each_entry(con, &console_list, node)

#ifdef CONFIG_PRINTK
/* nbcon helpers, implemented out of line when CONFIG_PRINTK is set. */
extern void nbcon_cpu_emergency_enter(void);
extern void nbcon_cpu_emergency_exit(void);
extern bool nbcon_can_proceed(struct nbcon_write_context *wctxt);
extern bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt);
extern bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt);
extern void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt);
#else
/*
 * !CONFIG_PRINTK stubs: the void helpers do nothing and the bool helpers
 * always return false.
 */
static inline void nbcon_cpu_emergency_enter(void) { }
static inline void nbcon_cpu_emergency_exit(void) { }
static inline bool nbcon_can_proceed(struct nbcon_write_context *wctxt) { return false; }
static inline bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt) { return false; }
static inline bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt) { return false; }
static inline void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt) { }
#endif

/* NOTE(review): presumably non-zero once a console was chosen on the kernel command line - confirm */
extern int console_set_on_cmdline;
/* NOTE(review): presumably the early boot console, if one is set up - confirm */
extern struct console *early_console;

/* Mode argument accepted by console_flush_on_panic(). */
enum con_flush_mode {
        /* NOTE(review): presumably flush only records still pending - confirm */
        CONSOLE_FLUSH_PENDING,
        /* NOTE(review): presumably replay every available record - confirm */
        CONSOLE_REPLAY_ALL,
};

/* Preferred-console selection and console (un)registration */
extern int add_preferred_console(const char *name, const short idx, char *options);
extern void console_force_preferred_locked(struct console *con);
extern void register_console(struct console *);
extern int unregister_console(struct console *);
/* Console lock and related helpers */
extern void console_lock(void);
extern int console_trylock(void);
extern void console_unlock(void);
extern void console_conditional_schedule(void);
extern void console_unblank(void);
extern void console_flush_on_panic(enum con_flush_mode mode);
extern struct tty_driver *console_device(int *);
extern void console_suspend(struct console *);
extern void console_resume(struct console *);
extern int is_console_locked(void);
/* Braille console (un)registration */
extern int braille_register_console(struct console *, int index,
                char *console_options, char *braille_options);
extern int braille_unregister_console(struct console *);
/* Without CONFIG_TTY, console_sysfs_notify() is an empty inline stub. */
#ifdef CONFIG_TTY
extern void console_sysfs_notify(void);
#else
static inline void console_sysfs_notify(void)
{ }
#endif
extern bool console_suspend_enabled;

/* Suspend and resume console messages over PM events */
extern void console_suspend_all(void);
extern void console_resume_all(void);

int mda_console_init(void);

/* NOTE(review): presumably create/remove the sysfs entries for vcs device @index - confirm */
void vcs_make_sysfs(int index);
void vcs_remove_sysfs(int index);

/*
 * Debug stub to catch some of the obvious races in the VT code.
 *
 * Warns when the console is not locked, unless ignore_console_lock_warning
 * is non-zero or oops_in_progress is set.
 */
#define WARN_CONSOLE_UNLOCKED()                                                \
        WARN_ON(!atomic_read(&ignore_console_lock_warning) &&                \
                !is_console_locked() && !oops_in_progress)
/*
 * Increment ignore_console_lock_warning if you need to quiet
 * WARN_CONSOLE_UNLOCKED() for debugging purposes.
 */
extern atomic_t ignore_console_lock_warning;

extern void console_init(void);

/*
 * For deferred console takeover.
 * Both helpers take the notifier_block to add to / remove from dummycon's
 * output notifier.
 */
void dummycon_register_output_notifier(struct notifier_block *nb);
void dummycon_unregister_output_notifier(struct notifier_block *nb);

#endif /* _LINUX_CONSOLE_H */
































































































   39 


   39 
   39 
   39 





















































































































































































  502 



  505 




  127 
  494 


  504 





  503 




















  390 


  390 


  391 































































  495 










  497 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
// SPDX-License-Identifier: GPL-2.0
/*
 *  mm/pgtable-generic.c
 *
 *  Generic pgtable methods declared in linux/pgtable.h
 *
 *  Copyright (C) 2010  Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none.  Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
        /* Report the bad entry first, while its value is still in place... */
        pgd_ERROR(*pgd);
        /* ...then clear it. */
        pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
        /* Report the bad entry first, while its value is still in place... */
        p4d_ERROR(*p4d);
        /* ...then clear it. */
        p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
        /* Report the bad entry first, while its value is still in place... */
        pud_ERROR(*pud);
        /* ...then clear it. */
        pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stubbed out in the same way as
 * p4d/pud above. pmd folding is special and typically pmd_* macros refer to
 * the upper level even when folded.
 */
void pmd_clear_bad(pmd_t *pmd)
{
        /* Report the bad entry first, while its value is still in place... */
        pmd_ERROR(*pmd);
        /* ...then clear it. */
        pmd_clear(pmd);
}

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache.  This
 * used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c so we changed this macro slightly
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pte_t *ptep,
                          pte_t entry, int dirty)
{
        /* Nothing to do when the PTE already holds the requested value. */
        if (pte_same(ptep_get(ptep), entry))
                return 0;

        /* Install the new entry, then run the spurious-fault TLB fixup. */
        set_pte_at(vma->vm_mm, address, ptep, entry);
        flush_tlb_fix_spurious_fault(vma, address, ptep);

        return 1;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep)
{
        int was_young = ptep_test_and_clear_young(vma, address, ptep);

        /* The TLB flush is only issued when the young bit had been set. */
        if (!was_young)
                return was_young;

        flush_tlb_page(vma, address);
        return was_young;
}
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
                       pte_t *ptep)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t old_pte = ptep_get_and_clear(mm, address, ptep);

        /* Flush only if the entry just removed was accessible. */
        if (pte_accessible(mm, old_pte))
                flush_tlb_page(vma, address);

        /* The caller receives the entry that was in place before the clear. */
        return old_pte;
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        int differs = !pmd_same(*pmdp, entry);

        /* The address must be aligned to a huge-PMD boundary. */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (!differs)
                return 0;

        /* Install the new entry and flush the whole huge-PMD range. */
        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return 1;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
int pmdp_clear_flush_young(struct vm_area_struct *vma,
                           unsigned long address, pmd_t *pmdp)
{
        int was_young;

        /* The address must be aligned to a huge-PMD boundary. */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        was_young = pmdp_test_and_clear_young(vma, address, pmdp);
        if (!was_young)
                return was_young;

        /* The young bit was set: flush the whole huge-PMD range. */
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
        return was_young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pmd_t *pmdp)
{
        pmd_t old_pmd;

        /* The address must be aligned to a huge-PMD boundary. */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        /* A present entry must be either a huge or a devmap pmd. */
        VM_BUG_ON(pmd_present(*pmdp) &&
                  !(pmd_trans_huge(*pmdp) || pmd_devmap(*pmdp)));

        /* Clear the entry, flush the range it covered, return the old value. */
        old_pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old_pmd;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
                            pud_t *pudp)
{
        pud_t old_pud;

        /* The address must be aligned to a huge-PUD boundary. */
        VM_BUG_ON(address & ~HPAGE_PUD_MASK);
        /* The entry must be either a huge or a devmap pud. */
        VM_BUG_ON(!(pud_trans_huge(*pudp) || pud_devmap(*pudp)));

        /* Clear the entry, flush the range it covered, return the old value. */
        old_pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
        flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);

        return old_pud;
}
#endif
#endif

#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        /* The caller must hold the pmd lock for this pmd. */
        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        if (pmd_huge_pte(mm, pmdp)) {
                /* A table is already deposited: link the new one to it. */
                list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
        } else {
                /* First deposit: start a fresh, empty list on the new table. */
                INIT_LIST_HEAD(&pgtable->lru);
        }

        /* The newly deposited table becomes the stored one. */
        pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/* no "address" argument so destroys page coloring of some arch */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        pgtable_t pgtable;

        /* The caller must hold the pmd lock for this pmd. */
        assert_spin_locked(pmd_lockptr(mm, pmdp));

        /* FIFO */
        /* Take the table currently stored for this pmd. */
        pgtable = pmd_huge_pte(mm, pmdp);
        /*
         * The first entry on the withdrawn table's list becomes the new
         * stored table; NULL when that list is empty.
         */
        pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
                                                          struct page, lru);
        /* Unlink the withdrawn table only when another table remains. */
        if (pmd_huge_pte(mm, pmdp))
                list_del(&pgtable->lru);
        return pgtable;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
                     pmd_t *pmdp)
{
        pmd_t prev;

        /* Invalidating a non-present entry is unexpected; warn once. */
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));

        /* Establish pmd_mkinvalid() of the current entry; keep the result. */
        prev = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
        flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return prev;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
                         pmd_t *pmdp)
{
        /* Invalidating a non-present entry is unexpected; warn once. */
        VM_WARN_ON_ONCE(!pmd_present(*pmdp));
        /* The generic version simply forwards to pmdp_invalidate(). */
        return pmdp_invalidate(vma, address, pmdp);
}
#endif

#ifndef pmdp_collapse_flush
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        pmd_t old_pmd;

        /* The address must be huge-PMD aligned and the entry not yet huge. */
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        VM_BUG_ON(pmd_trans_huge(*pmdp));

        /*
         * The pmd and hugepage pte formats are the same, so the huge
         * get-and-clear helper can be used on this entry.
         */
        old_pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

        /* Collapse shoots down ptes rather than a pmd: plain range flush. */
        flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);

        return old_pmd;
}
#endif

/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
/* RCU callback: free the page table whose struct page embeds @head. */
static void pte_free_now(struct rcu_head *head)
{
        struct page *page = container_of(head, struct page, rcu_head);

        pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

/* Queue @pgtable to be freed by pte_free_now() through call_rcu(). */
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
        struct page *page = pgtable;

        call_rcu(&page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
        (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful __pte_offset_map() points to a page from matched halves.
 */
static unsigned long pmdp_get_lockless_start(void)
{
        unsigned long irqflags;

        /* Disable local interrupts and hand the saved state to the caller. */
        local_irq_save(irqflags);
        return irqflags;
}
static void pmdp_get_lockless_end(unsigned long irqflags)
{
        /* Restore the interrupt state saved by pmdp_get_lockless_start(). */
        local_irq_restore(irqflags);
}
#else
/* In this configuration neither helper touches the interrupt state. */
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif

/*
 * Map the pte for @addr from a lockless snapshot of *@pmd.
 *
 * On success the mapped pte pointer is returned and the RCU read lock taken
 * here is still held. On failure NULL is returned and the RCU read lock has
 * been dropped. When @pmdvalp is non-NULL it receives the pmd snapshot in
 * both cases.
 */
pte_t *___pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
        unsigned long irqflags;
        pmd_t pmdval;

        rcu_read_lock();
        /* Read the pmd between the lockless start/end helpers. */
        irqflags = pmdp_get_lockless_start();
        pmdval = pmdp_get_lockless(pmd);
        pmdp_get_lockless_end(irqflags);

        /* Report the snapshot even if the mapping fails below. */
        if (pmdvalp)
                *pmdvalp = pmdval;
        /* Empty or migration entry: no page table to map. */
        if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
                goto nomap;
        /* Huge or devmap pmd: no page table to map. */
        if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
                goto nomap;
        /* Bad entry: report and clear it, then fail. */
        if (unlikely(pmd_bad(pmdval))) {
                pmd_clear_bad(pmd);
                goto nomap;
        }
        /* Map from the snapshot, not from a second read of *pmd. */
        return __pte_map(&pmdval, addr);
nomap:
        rcu_read_unlock();
        return NULL;
}

pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, spinlock_t **ptlp)
{
        pmd_t pmd_snapshot;
        pte_t *ptep = __pte_offset_map(pmd, addr, &pmd_snapshot);

        /* No page table to map: fail and leave *ptlp untouched. */
        if (unlikely(!ptep))
                return NULL;

        /* Derive the lock pointer from the same pmd value that was mapped. */
        *ptlp = pte_lockptr(mm, &pmd_snapshot);
        return ptep;
}

pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd,
                                unsigned long addr, pmd_t *pmdvalp,
                                spinlock_t **ptlp)
{
        pte_t *ptep;

        /* Callers are expected to pass somewhere to store the pmd value. */
        VM_WARN_ON_ONCE(!pmdvalp);

        ptep = __pte_offset_map(pmd, addr, pmdvalp);

        /* No page table to map: fail and leave *ptlp untouched. */
        if (unlikely(!ptep))
                return NULL;

        /* Derive the lock pointer from the pmd value reported to the caller. */
        *ptlp = pte_lockptr(mm, pmdvalp);
        return ptep;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
 * __pte_offset_map_lock() below, is usually called with the pmd pointer for
 * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
 * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
 * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
 * write). In a few cases, it may be used with pmd pointing to a pmd_t already
 * copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page).  pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP.  (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd.  It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable.  But it does take rcu_read_lock(): so
 * that even when page table is racily removed, it remains a valid though empty
 * and disconnected table.  Until pte_unmap(pte) unmaps and rcu_read_unlock()s
 * afterwards.
 *
 * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it.  This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock
 * pointer for the page table that it returns. Even after grabbing the spinlock,
 * we might be looking either at a page table that is still mapped or one that
 * was unmapped and is about to get freed. But for R/O access this is sufficient.
 * So it is only applicable for read-only cases where any modification operations
 * to the page table are not allowed even if the corresponding spinlock is held
 * afterwards.
 *
 * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like
 * pte_offset_map_ro_nolock(); but when successful, it also outputs the pmdval.
 * It is applicable for may-write cases where any modification operations to the
 * page table may happen after the corresponding spinlock is held afterwards.
 * But the users should make sure the page table is stable like checking pte_same()
 * or checking pmd_same() by using the output pmdval before performing the write
 * operations.
 *
 * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will
 * be read-only/read-write protected.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
 */
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long addr, spinlock_t **ptlp)
{
        spinlock_t *lock;
        pmd_t snapshot;
        pte_t *ptep;

        /* Retry until the pmd is unchanged after the lock has been taken. */
        for (;;) {
                ptep = __pte_offset_map(pmd, addr, &snapshot);
                if (unlikely(!ptep))
                        return ptep;        /* no page table: *ptlp untouched */

                lock = pte_lockptr(mm, &snapshot);
                spin_lock(lock);

                /* Recheck *pmd under the lock against the value mapped from. */
                if (likely(pmd_same(snapshot, pmdp_get_lockless(pmd)))) {
                        *ptlp = lock;
                        return ptep;
                }

                /* The pmd changed meanwhile: undo and start over. */
                pte_unmap_unlock(ptep, lock);
        }
}

















































































    3 






    3 









    3 


    3 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 
    3 




    3 

    3 





    3 
    3 






































































































































































































































































































































































































































































































































































































































































































































    3 



    3 



    3 


    3 






    3 





    3 



    3 




    3 



    3 



    3 











    3 






    3 













    3 



    3 











    3 





    3 


    3 




    3 








    3 









    3 

    3 

























































    3 




    3 
    3 

    3 






    3 




    3 















































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux INET6 implementation
 *        Forwarding Information Database
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *
 *        Changes:
 *        Yuji SEKIYA @USAGI:        Support default route on router node;
 *                                remove ip6_null_entry from the top of
 *                                routing table.
 *        Ville Nuorvala:                Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>

#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>

/* Slab cache for struct fib6_node; see node_alloc()/node_free_immediate(). */
static struct kmem_cache *fib6_node_kmem __read_mostly;

/*
 * A fib6_walker bundled with a per-route callback and an opaque argument.
 * NOTE(review): the users of this struct are outside this excerpt; the
 * roles noted below for sernum and skip_notify are assumptions to confirm
 * against the code that fills them in.
 */
struct fib6_cleaner {
        struct fib6_walker w;           /* embedded generic walker */
        struct net *net;
        int (*func)(struct fib6_info *, void *arg); /* per-route callback */
        int sernum;                     /* presumably a serial number to apply - TODO confirm */
        void *arg;                      /* presumably handed to func - TODO confirm */
        bool skip_notify;               /* presumably suppresses notifications - TODO confirm */
};

/*
 * Walker state used when a walk is (re)started from the root:
 * FWS_S when subtree support is compiled in, FWS_L otherwise.
 */
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
#define FWS_INIT FWS_L
#endif

/* Forward declarations of file-local helpers that are used before their definition. */
static struct fib6_info *fib6_find_prefix(struct net *net,
                                         struct fib6_table *table,
                                         struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net,
                                          struct fib6_table *table,
                                          struct fib6_node *fn);
static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);

/*
 *        A routing update causes an increase of the serial number on the
 *        affected subtree. This allows for cached routes to be asynchronously
 *        tested when modifications are made to the destination cache as a
 *        result of redirects, path MTU changes, etc.
 */

/* Forward declaration of the file-local timer callback. */
static void fib6_gc_timer_cb(struct timer_list *t);

/* Iterate @w over every walker linked on @net's fib6_walkers list. */
#define FOR_WALKERS(net, w) \
        list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)

static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
        write_lock_bh(&net->ipv6.fib6_walker_lock);
        list_add(&w->lh, &net->ipv6.fib6_walkers);
        write_unlock_bh(&net->ipv6.fib6_walker_lock);
}

static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
        write_lock_bh(&net->ipv6.fib6_walker_lock);
        list_del(&w->lh);
        write_unlock_bh(&net->ipv6.fib6_walker_lock);
}

static int fib6_new_sernum(struct net *net)
{
        int new, old = atomic_read(&net->ipv6.fib6_sernum);

        do {
                new = old < INT_MAX ? old + 1 : 1;
        } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));

        return new;
}

/*
 * Sentinel serial-number value. fib6_new_sernum() wraps from INT_MAX to 1
 * rather than 0, so 0 stays free for this sentinel as long as the counter
 * starts out positive.
 */
enum {
        FIB6_NO_SERNUM_CHANGE = 0,
};

/*
 * Stamp the tree node that holds @f6i with a fresh serial number.
 * Does nothing when @f6i is not attached to a node. The lockdep expression
 * documents that the caller holds the owning table's tb6_lock.
 */
void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
{
        struct fib6_node *node = rcu_dereference_protected(f6i->fib6_node,
                        lockdep_is_held(&f6i->fib6_table->tb6_lock));

        if (!node)
                return;

        WRITE_ONCE(node->fn_sernum, fib6_new_sernum(net));
}

/*
 *        Auxiliary address test functions for the radix tree.
 *
 *        These assume a 32bit processor (although it will work on
 *        64bit processors)
 */

/*
 *        test bit
 */
/*
 * XOR mask applied to a bit index inside a 32-bit word: 0x18 on
 * little-endian hosts (changes only bits 3-4 of the index, i.e. which
 * byte of the word is addressed), 0 otherwise.
 */
#if defined(__LITTLE_ENDIAN)
# define BITOP_BE32_SWIZZLE        (0x1F & ~7)
#else
# define BITOP_BE32_SWIZZLE        0
#endif

/*
 * Test bit @fn_bit of the address at @token, viewed as an array of
 * big-endian 32-bit words. Returns non-zero if the bit is set.
 */
static __be32 addr_bit_set(const void *token, int fn_bit)
{
        const __be32 *words = token;
        /*
         * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) is the optimized
         * form of htonl(1 << ((~fn_bit) & 0x1F)).
         * See include/asm-generic/bitops/le.h.
         */
        int shift = (~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f;
        __be32 mask = (__force __be32)(1 << shift);

        return mask & words[fn_bit >> 5];
}

/*
 * Allocate a zeroed fib6_info with one reference held.
 * @gfp_flags:    allocation flags passed to kzalloc()
 * @with_fib6_nh: when true, reserve extra space for one struct fib6_nh
 *
 * Returns the new entry, or NULL if the allocation fails.
 */
struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
        struct fib6_info *info;
        size_t size;

        size = sizeof(*info) + (with_fib6_nh ? sizeof(struct fib6_nh) : 0);

        info = kzalloc(size, gfp_flags);
        if (info) {
                /* fib6_siblings shares a union with nh_list: one init covers both */
                INIT_LIST_HEAD(&info->fib6_siblings);
                refcount_set(&info->fib6_ref, 1);

                INIT_HLIST_NODE(&info->gc_link);
        }

        return info;
}

/*
 * RCU callback that tears down the fib6_info embedding @head.
 * Warns if the entry is still attached to a tree node, releases either the
 * nexthop object or the embedded fib6_nh, drops the metrics reference and
 * frees the entry.
 */
void fib6_info_destroy_rcu(struct rcu_head *head)
{
        struct fib6_info *info = container_of(head, struct fib6_info, rcu);

        WARN_ON(info->fib6_node);

        if (!info->nh)
                fib6_nh_release(info->fib6_nh);
        else
                nexthop_put(info->nh);

        ip_fib_metrics_put(info->fib6_metrics);
        kfree(info);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);

static struct fib6_node *node_alloc(struct net *net)
{
        struct fib6_node *fn;

        fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
        if (fn)
                net->ipv6.rt6_stats->fib_nodes++;

        return fn;
}

static void node_free_immediate(struct net *net, struct fib6_node *fn)
{
        kmem_cache_free(fib6_node_kmem, fn);
        net->ipv6.rt6_stats->fib_nodes--;
}

static void node_free(struct net *net, struct fib6_node *fn)
{
        kfree_rcu(fn, rcu);
        net->ipv6.rt6_stats->fib_nodes--;
}

/* Invalidate the table's inet_peer tree, then free the table itself. */
static void fib6_free_table(struct fib6_table *table)
{
        struct inet_peer_base *peers = &table->tb6_peers;

        inetpeer_invalidate_tree(peers);
        kfree(table);
}

/* Initialise @tb's lock and publish the table in @net's table hash. */
static void fib6_link_table(struct net *net, struct fib6_table *tb)
{
        struct hlist_head *bucket;

        /*
         * The table lock is initialised here, in one single place, so that
         * lockdep gets one key for it; a table is not visible to anyone
         * until it has been linked below.
         */
        spin_lock_init(&tb->tb6_lock);

        bucket = &net->ipv6.fib_table_hash[tb->tb6_id & (FIB6_TABLE_HASHSZ - 1)];

        /*
         * No protection necessary: this is the only list mutation
         * operation, and tables never disappear once they exist.
         */
        hlist_add_head_rcu(&tb->tb6_hlist, bucket);
}

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Allocate (GFP_ATOMIC) and initialise a table with id @id. The root node
 * starts out pointing at @net's fib6_null_entry. The table is not linked
 * into the hash here. Returns NULL on allocation failure.
 */
static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
{
        struct fib6_table *tbl = kzalloc(sizeof(*tbl), GFP_ATOMIC);

        if (!tbl)
                return NULL;

        tbl->tb6_id = id;
        rcu_assign_pointer(tbl->tb6_root.leaf, net->ipv6.fib6_null_entry);
        tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&tbl->tb6_peers);
        INIT_HLIST_HEAD(&tbl->tb6_gc_hlist);

        return tbl;
}

/*
 * Return the table with id @id, creating and linking it if it does not
 * exist yet. An id of 0 selects RT6_TABLE_MAIN. Returns NULL only when a
 * new table could not be allocated.
 */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
        struct fib6_table *tbl;

        if (!id)
                id = RT6_TABLE_MAIN;

        tbl = fib6_get_table(net, id);
        if (!tbl) {
                tbl = fib6_alloc_table(net, id);
                if (tbl)
                        fib6_link_table(net, tbl);
        }

        return tbl;
}
EXPORT_SYMBOL_GPL(fib6_new_table);

/*
 * Look up the table with id @id in @net's table hash under
 * rcu_read_lock(). An id of 0 selects RT6_TABLE_MAIN.
 * Returns the table, or NULL if no such table is linked.
 */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
        struct fib6_table *found = NULL;
        struct hlist_head *bucket;
        struct fib6_table *tbl;

        if (!id)
                id = RT6_TABLE_MAIN;

        rcu_read_lock();
        bucket = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
        hlist_for_each_entry_rcu(tbl, bucket, tb6_hlist) {
                if (tbl->tb6_id == id) {
                        found = tbl;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}
EXPORT_SYMBOL_GPL(fib6_get_table);

/* Link the pre-allocated main and local tables into @net's table hash. */
static void __net_init fib6_tables_init(struct net *net)
{
        struct netns_ipv6 *ns6 = &net->ipv6;

        fib6_link_table(net, ns6->fib6_main_tbl);
        fib6_link_table(net, ns6->fib6_local_tbl);
}
#else

/* Without multiple-table support nothing is created: defer to the lookup. */
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
        struct fib6_table *tbl = fib6_get_table(net, id);

        return tbl;
}

/* Without multiple-table support @id is ignored: always the main table. */
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;

        return main_tbl;
}

/*
 * Single-table policy lookup: run @lookup against the main table.
 * If the result carries dst.error == -EAGAIN it is released and replaced
 * by @net's ip6_null_entry, taking a reference on it unless
 * RT6_LOOKUP_F_DST_NOREF is set in @flags.
 */
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
                                   const struct sk_buff *skb,
                                   int flags, pol_lookup_t lookup)
{
        struct rt6_info *rt = pol_lookup_func(lookup,
                        net, net->ipv6.fib6_main_tbl, fl6, skb, flags);

        if (rt->dst.error != -EAGAIN)
                return &rt->dst;

        ip6_rt_put_flags(rt, flags);
        rt = net->ipv6.ip6_null_entry;
        if (!(flags & RT6_LOOKUP_F_DST_NOREF))
                dst_hold(&rt->dst);

        return &rt->dst;
}

/*
 * Single-table variant: look @fl6 up in the main table.
 * Called with the rcu lock held; no reference is taken on the fib6_info.
 */
int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
                struct fib6_result *res, int flags)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;

        return fib6_table_lookup(net, main_tbl, oif, fl6, res, flags);
}

/* Single-table variant: only the main table is linked into the hash. */
static void __net_init fib6_tables_init(struct net *net)
{
        struct fib6_table *main_tbl = net->ipv6.fib6_main_tbl;

        fib6_link_table(net, main_tbl);
}

#endif

unsigned int fib6_tables_seq_read(const struct net *net)
{
        unsigned int h, fib_seq = 0;

        rcu_read_lock();
        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                const struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                const struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, head, tb6_hlist)
                        fib_seq += READ_ONCE(tb->fib_seq);
        }
        rcu_read_unlock();

        return fib_seq;
}

static int call_fib6_entry_notifier(struct notifier_block *nb,
                                    enum fib_event_type event_type,
                                    struct fib6_info *rt,
                                    struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
        };

        return call_fib6_notifier(nb, event_type, &info.info);
}

static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
                                              enum fib_event_type event_type,
                                              struct fib6_info *rt,
                                              unsigned int nsiblings,
                                              struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
                .nsiblings = nsiblings,
        };

        return call_fib6_notifier(nb, event_type, &info.info);
}

int call_fib6_entry_notifiers(struct net *net,
                              enum fib_event_type event_type,
                              struct fib6_info *rt,
                              struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
        };

        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, event_type, &info.info);
}

int call_fib6_multipath_entry_notifiers(struct net *net,
                                        enum fib_event_type event_type,
                                        struct fib6_info *rt,
                                        unsigned int nsiblings,
                                        struct netlink_ext_ack *extack)
{
        struct fib6_entry_notifier_info info = {
                .info.extack = extack,
                .rt = rt,
                .nsiblings = nsiblings,
        };

        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, event_type, &info.info);
}

int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
{
        struct fib6_entry_notifier_info info = {
                .rt = rt,
                .nsiblings = rt->fib6_nsiblings,
        };

        WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
        return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
}

/*
 * Per-dump context filled in by fib6_tables_dump() and handed to
 * fib6_rt_dump() through the walker's args pointer.
 */
struct fib6_dump_arg {
        struct net *net;                /* namespace being dumped */
        struct notifier_block *nb;      /* notifier receiving the dump */
        struct netlink_ext_ack *extack; /* forwarded to the notifier */
};

static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
        enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
        int err;

        if (!rt || rt == arg->net->ipv6.fib6_null_entry)
                return 0;

        if (rt->fib6_nsiblings)
                err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
                                                         rt,
                                                         rt->fib6_nsiblings,
                                                         arg->extack);
        else
                err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
                                               arg->extack);

        return err;
}

/*
 * Walker callback used by fib6_tables_dump(): dump the walker's current
 * leaf, then clear w->leaf before returning the dump result.
 */
static int fib6_node_dump(struct fib6_walker *w)
{
        struct fib6_dump_arg *dump_arg = w->args;
        int ret = fib6_rt_dump(w->leaf, dump_arg);

        w->leaf = NULL;
        return ret;
}

static int fib6_table_dump(struct net *net, struct fib6_table *tb,
                           struct fib6_walker *w)
{
        int err;

        w->root = &tb->tb6_root;
        spin_lock_bh(&tb->tb6_lock);
        err = fib6_walk(net, w);
        spin_unlock_bh(&tb->tb6_lock);
        return err;
}

/* Called with rcu_read_lock() */
int fib6_tables_dump(struct net *net, struct notifier_block *nb,
                     struct netlink_ext_ack *extack)
{
        struct fib6_dump_arg arg;
        struct fib6_walker *w;
        unsigned int h;
        int err = 0;

        w = kzalloc(sizeof(*w), GFP_ATOMIC);
        if (!w)
                return -ENOMEM;

        w->func = fib6_node_dump;
        arg.net = net;
        arg.nb = nb;
        arg.extack = extack;
        w->args = &arg;

        for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
                struct hlist_head *head = &net->ipv6.fib_table_hash[h];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
                        err = fib6_table_dump(net, tb, w);
                        if (err)
                                goto out;
                }
        }

out:
        kfree(w);

        /* The tree traversal function should never return a positive value. */
        return err > 0 ? -EINVAL : err;
}

/*
 * Walker callback for netlink route dumps (installed by inet6_dump_fib()).
 *
 * Passes each route of the walker's current node to rt6_dump_route().
 * Returns 1 with w->leaf set to the current route when the walk must be
 * suspended, or 0 with w->leaf cleared once the node has been fully
 * processed.
 */
static int fib6_dump_node(struct fib6_walker *w)
{
        int res;
        struct fib6_info *rt;

        /* Iteration macro defined outside this file; it uses the local 'rt'. */
        for_each_fib6_walker_rt(w) {
                res = rt6_dump_route(rt, w->args, w->skip_in_node);
                if (res >= 0) {
                        /* Frame is full, suspend walking */
                        w->leaf = rt;

                        /* We'll restart from this node, so if some routes were
                         * already dumped, skip them next time.
                         */
                        w->skip_in_node += res;

                        return 1;
                }
                /* Route handled: reset the in-node skip count for the next one. */
                w->skip_in_node = 0;

                /* Multipath routes are dumped in one route with the
                 * RTA_MULTIPATH attribute. Jump 'rt' to point to the
                 * last sibling of this route (no need to dump the
                 * sibling routes again)
                 */
                if (rt->fib6_nsiblings)
                        rt = list_last_entry(&rt->fib6_siblings,
                                             struct fib6_info,
                                             fib6_siblings);
        }
        w->leaf = NULL;
        return 0;
}

/*
 * Release the dump walker kept in cb->args[2], if any: unlink it first
 * when cb->args[4] is set, then free it. Afterwards restore the original
 * done callback from cb->args[3] and set cb->args[1] to 3.
 */
static void fib6_dump_end(struct netlink_callback *cb)
{
        struct fib6_walker *walker = (void *)cb->args[2];
        struct net *net = sock_net(cb->skb->sk);

        if (walker) {
                if (cb->args[4]) {
                        cb->args[4] = 0;
                        fib6_walker_unlink(net, walker);
                }
                cb->args[2] = 0;
                kfree(walker);
        }

        cb->done = (void *)cb->args[3];
        cb->args[1] = 3;
}

/*
 * Netlink done hook: clean up the dump state, then chain to the original
 * done callback that fib6_dump_end() restored, if there is one.
 */
static int fib6_dump_done(struct netlink_callback *cb)
{
        fib6_dump_end(cb);

        if (!cb->done)
                return 0;

        return cb->done(cb);
}

/*
 * Walk @table with the dump walker stored in cb->args[2], either starting
 * a new walk or resuming a suspended one.
 *
 * State kept in the netlink callback:
 *   cb->args[2]  pointer to the fib6_walker for this dump
 *   cb->args[4]  non-zero while a suspended walk is pending
 *                (fib6_dump_end() unlinks the walker only when it is set)
 *   cb->args[5]  root fn_sernum sampled when the walk was last suspended
 *
 * Returns the result of fib6_walk() or fib6_walk_continue().
 */
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
                           struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct fib6_walker *w;
        int res;

        w = (void *)cb->args[2];
        w->root = &table->tb6_root;

        if (cb->args[4] == 0) {
                /* No walk pending: reset the counters and start from scratch. */
                w->count = 0;
                w->skip = 0;
                w->skip_in_node = 0;

                spin_lock_bh(&table->tb6_lock);
                res = fib6_walk(net, w);
                spin_unlock_bh(&table->tb6_lock);
                if (res > 0) {
                        /* Walk suspended: remember it and the tree version. */
                        cb->args[4] = 1;
                        cb->args[5] = READ_ONCE(w->root->fn_sernum);
                }
        } else {
                int sernum = READ_ONCE(w->root->fn_sernum);
                if (cb->args[5] != sernum) {
                        /* Begin at the root if the tree changed */
                        cb->args[5] = sernum;
                        w->state = FWS_INIT;
                        w->node = w->root;
                        /* Skip as many entries as were already counted. */
                        w->skip = w->count;
                        w->skip_in_node = 0;
                } else
                        w->skip = 0;

                spin_lock_bh(&table->tb6_lock);
                res = fib6_walk_continue(w);
                spin_unlock_bh(&table->tb6_lock);
                if (res <= 0) {
                        /* Walk is over (done or error): drop the walker from the list. */
                        fib6_walker_unlink(net, w);
                        cb->args[4] = 0;
                }
        }

        return res;
}

/* Netlink dump callback that emits IPv6 FIB routes into @skb.
 *
 * Resume state is carried in cb->args[] between invocations:
 *   args[0] - hash bucket to resume from; in the single-table (filtered)
 *             case it is instead set to 1 once that table has been dumped
 *   args[1] - index of the table inside that bucket to resume from
 *   args[2] - the struct fib6_walker allocated for this dump
 *   args[3] - the cb->done value that was in place before fib6_dump_done
 *             was hooked in
 *   args[4], args[5] - per-table walk state owned by fib6_dump_table()
 *
 * Returns the last error/result code; whenever that value is <= 0,
 * fib6_dump_end() is called on @cb before returning.
 */
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rt6_rtnl_dump_arg arg = {
                .filter.dump_exceptions = true,
                .filter.dump_routes = true,
                .filter.rtnl_held = false,
        };
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        unsigned int e = 0, s_e;
        struct hlist_head *head;
        struct fib6_walker *w;
        struct fib6_table *tb;
        unsigned int h, s_h;
        int err = 0;

        rcu_read_lock();
        if (cb->strict_check) {
                /* Strict mode: let the common validator parse the request
                 * and fill in the filter.
                 */
                err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
                if (err < 0)
                        goto unlock;
        } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
                /* Non-strict mode: only RTM_F_PREFIX is picked up from the
                 * request header, and only if a full rtmsg is present.
                 */
                struct rtmsg *rtm = nlmsg_data(nlh);

                if (rtm->rtm_flags & RTM_F_PREFIX)
                        arg.filter.flags = RTM_F_PREFIX;
        }

        w = (void *)cb->args[2];
        if (!w) {
                /* New dump:
                 *
                 * 1. allocate and initialize walker.
                 */
                w = kzalloc(sizeof(*w), GFP_ATOMIC);
                if (!w) {
                        err = -ENOMEM;
                        goto unlock;
                }
                w->func = fib6_dump_node;
                cb->args[2] = (long)w;

                /* 2. hook callback destructor.
                 */
                cb->args[3] = (long)cb->done;
                cb->done = fib6_dump_done;

        }

        /* arg lives on this function's stack, so the walker's pointer to it
         * has to be refreshed on every invocation.
         */
        arg.skb = skb;
        arg.cb = cb;
        arg.net = net;
        w->args = &arg;

        if (arg.filter.table_id) {
                /* A specific table was requested: dump only that one. */
                tb = fib6_get_table(net, arg.filter.table_id);
                if (!tb) {
                        /* A missing table is only reported as -ENOENT when
                         * the request's family is PF_INET6; otherwise bail
                         * out without setting an error here.
                         */
                        if (rtnl_msg_family(cb->nlh) != PF_INET6)
                                goto unlock;

                        NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
                        err = -ENOENT;
                        goto unlock;
                }

                /* args[0] doubles as the "table fully dumped" flag here. */
                if (!cb->args[0]) {
                        err = fib6_dump_table(tb, skb, cb);
                        if (!err)
                                cb->args[0] = 1;
                }
                goto unlock;
        }

        /* Full dump: resume from the saved bucket / in-bucket position. */
        s_h = cb->args[0];
        s_e = cb->args[1];

        /* s_e only applies to the first bucket visited; it is reset to 0
         * for every subsequent bucket.
         */
        for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
                e = 0;
                head = &net->ipv6.fib_table_hash[h];
                hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
                        /* Skip tables already handled by earlier calls. */
                        if (e < s_e)
                                goto next;
                        err = fib6_dump_table(tb, skb, cb);
                        /* Any non-zero result (error or "more to come")
                         * stops the scan; position is saved below.
                         */
                        if (err != 0)
                                goto out;
next:
                        e++;
                }
        }
out:
        /* Record where to pick up on the next invocation. */
        cb->args[1] = e;
        cb->args[0] = h;

unlock:
        rcu_read_unlock();
        /* Zero or negative result: tear the dump state down now. */
        if (err <= 0)
                fib6_dump_end(cb);
        return err;
}

/* Store @val as metric number @metric on @f6i; the slot written is
 * metrics[@metric - 1].
 *
 * If the entry still points at the global dst_default_metrics object, a
 * private zeroed dst_metrics (refcount 1) is allocated and attached first;
 * when that allocation fails the update is silently dropped.
 * A NULL @f6i is ignored.
 */
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
{
        struct dst_metrics *own;

        if (!f6i)
                return;

        if (f6i->fib6_metrics == &dst_default_metrics) {
                own = kzalloc(sizeof(*own), GFP_ATOMIC);
                if (!own)
                        return;

                refcount_set(&own->refcnt, 1);
                f6i->fib6_metrics = own;
        }

        f6i->fib6_metrics->metrics[metric - 1] = val;
}

/*
 *        Routing Table
 *
 *        return the appropriate node for a routing tree "add" operation
 *        by either creating and inserting or by returning an existing
 *        node.
 *
 *        @net:              namespace passed to the node allocator and used
 *                           to recognise the null entry
 *        @table:            table whose tb6_lock protects the tree (used in
 *                           the lockdep annotations)
 *        @root:             node at which the descent starts
 *        @addr, @plen:      prefix to look up / insert
 *        @offset:           byte offset inside struct fib6_info of the
 *                           rt6key compared against @addr
 *        @allow_create:     zero when the caller did not ask for creation
 *        @replace_required: non-zero when the caller demands that a
 *                           matching node already exists
 *        @extack:           receives the "no match found" message
 *
 *        Returns the node to insert into, ERR_PTR(-ENOENT) when a replace
 *        was required but no node matched, or ERR_PTR(-ENOMEM) when a node
 *        allocation failed.
 */

static struct fib6_node *fib6_add_1(struct net *net,
                                    struct fib6_table *table,
                                    struct fib6_node *root,
                                    struct in6_addr *addr, int plen,
                                    int offset, int allow_create,
                                    int replace_required,
                                    struct netlink_ext_ack *extack)
{
        struct fib6_node *fn, *in, *ln;
        struct fib6_node *pn = NULL;
        struct rt6key *key;
        int        bit;
        __be32        dir = 0;

        /* insert node in tree */

        fn = root;

        do {
                struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                /* key: the rt6key found @offset bytes into this node's leaf */
                key = (struct rt6key *)((u8 *)leaf + offset);

                /*
                 *        Prefix match
                 */
                if (plen < fn->fn_bit ||
                    !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
                        /* No creation requested: fail only if a replace was
                         * demanded, otherwise just warn and insert anyway.
                         */
                        if (!allow_create) {
                                if (replace_required) {
                                        NL_SET_ERR_MSG(extack,
                                                       "Can not replace route - no match found");
                                        pr_warn("Can't replace route, no match found\n");
                                        return ERR_PTR(-ENOENT);
                                }
                                pr_warn("NLM_F_CREATE should be set when creating new route\n");
                        }
                        goto insert_above;
                }

                /*
                 *        Exact match ?
                 */

                if (plen == fn->fn_bit) {
                        /* clean up an intermediate node */
                        if (!(fn->fn_flags & RTN_RTINFO)) {
                                RCU_INIT_POINTER(fn->leaf, NULL);
                                fib6_info_release(leaf);
                        /* remove null_entry in the root node */
                        } else if (fn->fn_flags & RTN_TL_ROOT &&
                                   rcu_access_pointer(fn->leaf) ==
                                   net->ipv6.fib6_null_entry) {
                                RCU_INIT_POINTER(fn->leaf, NULL);
                        }

                        return fn;
                }

                /*
                 *        We have more bits to go
                 */

                /* Try to walk down on tree. */
                /* dir remembers which child of pn was taken; the insertion
                 * code below uses it to know which link of pn to update.
                 */
                dir = addr_bit_set(addr, fn->fn_bit);
                pn = fn;
                fn = dir ?
                     rcu_dereference_protected(fn->right,
                                        lockdep_is_held(&table->tb6_lock)) :
                     rcu_dereference_protected(fn->left,
                                        lockdep_is_held(&table->tb6_lock));
        } while (fn);

        if (!allow_create) {
                /* We should not create new node because
                 * NLM_F_REPLACE was specified without NLM_F_CREATE
                 * I assume it is safe to require NLM_F_CREATE when
                 * REPLACE flag is used! Later we may want to remove the
                 * check for replace_required, because according
                 * to netlink specification, NLM_F_CREATE
                 * MUST be specified if new route is created.
                 * That would keep IPv6 consistent with IPv4
                 */
                if (replace_required) {
                        NL_SET_ERR_MSG(extack,
                                       "Can not replace route - no match found");
                        pr_warn("Can't replace route, no match found\n");
                        return ERR_PTR(-ENOENT);
                }
                pr_warn("NLM_F_CREATE should be set when creating new route\n");
        }
        /*
         *        We walked to the bottom of tree.
         *        Create new leaf node without children.
         */

        ln = node_alloc(net);

        if (!ln)
                return ERR_PTR(-ENOMEM);
        ln->fn_bit = plen;
        /* Fill in the new node before publishing it through pn's child link. */
        RCU_INIT_POINTER(ln->parent, pn);

        if (dir)
                rcu_assign_pointer(pn->right, ln);
        else
                rcu_assign_pointer(pn->left, ln);

        return ln;


insert_above:
        /*
         * split since we don't have a common prefix anymore or
         * we have a less significant route.
         * we've to insert an intermediate node on the list
         * this new node will point to the one we need to create
         * and the current
         */

        pn = rcu_dereference_protected(fn->parent,
                                       lockdep_is_held(&table->tb6_lock));

        /* find 1st bit in difference between the 2 addrs.

           See comment in __ipv6_addr_diff: bit may be an invalid value,
           but if it is >= plen, the value is ignored in any case.
         */

        bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));

        /*
         *                (intermediate)[in]
         *                  /           \
         *        (new leaf node)[ln] (old node)[fn]
         */
        if (plen > bit) {
                in = node_alloc(net);
                ln = node_alloc(net);

                /* Both nodes are needed; free whichever one was obtained. */
                if (!in || !ln) {
                        if (in)
                                node_free_immediate(net, in);
                        if (ln)
                                node_free_immediate(net, ln);
                        return ERR_PTR(-ENOMEM);
                }

                /*
                 * new intermediate node.
                 * RTN_RTINFO will
                 * be off since that an address that chooses one of
                 * the branches would not match less specific routes
                 * in the other branch
                 */

                in->fn_bit = bit;

                RCU_INIT_POINTER(in->parent, pn);
                /* The intermediate node borrows fn's leaf and takes its own
                 * reference on it.
                 */
                in->leaf = fn->leaf;
                fib6_info_hold(rcu_dereference_protected(in->leaf,
                                lockdep_is_held(&table->tb6_lock)));

                /* update parent pointer */
                if (dir)
                        rcu_assign_pointer(pn->right, in);
                else
                        rcu_assign_pointer(pn->left, in);

                ln->fn_bit = plen;

                RCU_INIT_POINTER(ln->parent, in);
                rcu_assign_pointer(fn->parent, in);

                /* The new leaf goes on the side selected by @addr's bit at
                 * the split position; the old node takes the other side.
                 */
                if (addr_bit_set(addr, bit)) {
                        rcu_assign_pointer(in->right, ln);
                        rcu_assign_pointer(in->left, fn);
                } else {
                        rcu_assign_pointer(in->left, ln);
                        rcu_assign_pointer(in->right, fn);
                }
        } else { /* plen <= bit */

                /*
                 *                (new leaf node)[ln]
                 *                  /           \
                 *             (old node)[fn] NULL
                 */

                ln = node_alloc(net);

                if (!ln)
                        return ERR_PTR(-ENOMEM);

                ln->fn_bit = plen;

                RCU_INIT_POINTER(ln->parent, pn);

                /* The old node hangs below ln on the side given by the old
                 * key's bit at position plen.
                 */
                if (addr_bit_set(&key->addr, plen))
                        RCU_INIT_POINTER(ln->right, fn);
                else
                        RCU_INIT_POINTER(ln->left, fn);

                rcu_assign_pointer(fn->parent, ln);

                /* Finally publish ln in place of fn under pn. */
                if (dir)
                        rcu_assign_pointer(pn->right, ln);
                else
                        rcu_assign_pointer(pn->left, ln);
        }
        return ln;
}

/* For every possible CPU, look at the per-cpu cached route of @fib6_nh and,
 * if its ->from still points at @match, clear ->from and drop the fib6_info
 * reference it held. Nothing is done when the nexthop has no per-cpu
 * storage. @table is not used by this function.
 */
static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
                                  const struct fib6_info *match,
                                  const struct fib6_table *table)
{
        int cpu;

        if (!fib6_nh->rt6i_pcpu)
                return;

        rcu_read_lock();
        for_each_possible_cpu(cpu) {
                struct fib6_info *old_from;
                struct rt6_info *cached;
                struct rt6_info **slot;

                slot = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);

                /* Paired with xchg() in rt6_get_pcpu_route() */
                cached = READ_ONCE(*slot);
                if (!cached)
                        continue;

                /* Only a cached route that is using 'match' loses its
                 * 'from' reference. The cached ->from only ever changes
                 * from a fib6_info to NULL (ip6_dst_destroy); it can never
                 * change from one fib6_info reference to another.
                 */
                if (rcu_access_pointer(cached->from) != match)
                        continue;

                old_from = unrcu_pointer(xchg(&cached->from, NULL));
                fib6_info_release(old_from);
        }
        rcu_read_unlock();
}

/* Argument bundle handed to fib6_nh_drop_pcpu_from() for each nexthop. */
struct fib6_nh_pcpu_arg {
        /* fib entry whose per-cpu 'from' references are to be dropped */
        struct fib6_info        *from;
        /* table forwarded unchanged to __fib6_drop_pcpu_from() */
        const struct fib6_table *table;
};

/* Per-nexthop callback: unpack the struct fib6_nh_pcpu_arg in @_arg and
 * run __fib6_drop_pcpu_from() on @nh. Always returns 0.
 */
static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
{
        struct fib6_nh_pcpu_arg *ctx = _arg;
        struct fib6_info *from = ctx->from;
        const struct fib6_table *table = ctx->table;

        __fib6_drop_pcpu_from(nh, from, table);

        return 0;
}

/* Mark @f6i as being destroyed and drop the 'from' references held on it
 * by the per-cpu cached routes of its nexthop(s): every fib6_nh of the
 * nexthop object when @f6i->nh is set, otherwise the entry's own fib6_nh.
 */
static void fib6_drop_pcpu_from(struct fib6_info *f6i,
                                const struct fib6_table *table)
{
        struct fib6_nh_pcpu_arg arg;

        /* Make sure rt6_make_pcpu_route() wont add other percpu routes
         * while we are cleaning them here.
         */
        f6i->fib6_destroying = 1;
        mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */

        if (!f6i->nh) {
                __fib6_drop_pcpu_from(f6i->fib6_nh, f6i, table);
                return;
        }

        arg.from = f6i;
        arg.table = table;
        nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg);
}

/* Release the resources that still hang off @rt: flush its exception
 * table, drop per-cpu 'from' references, unlink it from its nexthop's
 * list, replace it wherever it serves as a placeholder leaf on the path
 * from @fn up to the root, and finally clear its expiry and gc-list
 * membership.
 */
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
                          struct net *net)
{
        struct fib6_table *table = rt->fib6_table;
        struct fib6_node *cur;

        /* Flush all cached dst in exception table */
        rt6_flush_exceptions(rt);
        fib6_drop_pcpu_from(rt, table);

        if (rt->nh && !list_empty(&rt->nh_list))
                list_del_init(&rt->nh_list);

        /* A reference count other than one means this route is used as a
         * dummy address holder in some split nodes. It is not leaked, but
         * it still holds other resources, which must be released in time.
         * So, scan ascendant nodes and replace dummy references to this
         * route with references to still alive ones.
         */
        if (refcount_read(&rt->fib6_ref) != 1) {
                for (cur = fn; cur;
                     cur = rcu_dereference_protected(cur->parent,
                                    lockdep_is_held(&table->tb6_lock))) {
                        struct fib6_info *cur_leaf;
                        struct fib6_info *replacement;

                        cur_leaf = rcu_dereference_protected(cur->leaf,
                                    lockdep_is_held(&table->tb6_lock));

                        /* Only nodes without RTN_RTINFO that use rt as
                         * their leaf need fixing.
                         */
                        if ((cur->fn_flags & RTN_RTINFO) || cur_leaf != rt)
                                continue;

                        replacement = fib6_find_prefix(net, table, cur);
                        fib6_info_hold(replacement);

                        rcu_assign_pointer(cur->leaf, replacement);
                        fib6_info_release(rt);
                }
        }

        fib6_clean_expires(rt);
        fib6_remove_gc_list(rt);
}

/*
 *        Insert routing information in a node.
 *
 *        @fn:     node whose route list (starting at fn->leaf) receives @rt
 *        @rt:     route to insert
 *        @info:   netlink context; info->nlh (may be NULL) supplies the
 *                 NLM_F_* flags, info->nl_net the namespace, and the
 *                 skip_notify* fields suppress notifications
 *        @extack: passed on to the fib6 entry notifiers
 *
 *        Returns 0 on success, -EEXIST when NLM_F_EXCL is set and a route
 *        with the same metric exists or when a duplicate nexthop is found,
 *        -ENOENT when NLM_F_REPLACE is set without NLM_F_CREATE and nothing
 *        matched, or the error returned by call_fib6_entry_notifiers().
 */

static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
                            struct nl_info *info,
                            struct netlink_ext_ack *extack)
{
        struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
        struct fib6_info *iter = NULL;
        /* ins: the link (fn->leaf or some ->fib6_next) rt will be stored in */
        struct fib6_info __rcu **ins;
        /* fallback_ins: first same-metric position skipped during a replace
         * because its ECMP eligibility differed from rt's
         */
        struct fib6_info __rcu **fallback_ins = NULL;
        int replace = (info->nlh &&
                       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
        /* A request without a netlink header counts as "create". */
        int add = (!info->nlh ||
                   (info->nlh->nlmsg_flags & NLM_F_CREATE));
        int found = 0;
        bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
        bool notify_sibling_rt = false;
        u16 nlflags = NLM_F_EXCL;
        int err;

        if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
                nlflags |= NLM_F_APPEND;

        ins = &fn->leaf;

        for (iter = leaf; iter;
             iter = rcu_dereference_protected(iter->fib6_next,
                                lockdep_is_held(&rt->fib6_table->tb6_lock))) {
                /*
                 *        Search for duplicates
                 */

                if (iter->fib6_metric == rt->fib6_metric) {
                        /*
                         *        Same priority level
                         */
                        if (info->nlh &&
                            (info->nlh->nlmsg_flags & NLM_F_EXCL))
                                return -EEXIST;

                        /* A same-metric route exists, so the notification
                         * sent later must not carry NLM_F_EXCL.
                         */
                        nlflags &= ~NLM_F_EXCL;
                        if (replace) {
                                /* Replace target: a same-metric route with
                                 * the same ECMP eligibility as rt.
                                 */
                                if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
                                        found++;
                                        break;
                                }
                                fallback_ins = fallback_ins ?: ins;
                                goto next_iter;
                        }

                        if (rt6_duplicate_nexthop(iter, rt)) {
                                if (rt->fib6_nsiblings)
                                        rt->fib6_nsiblings = 0;
                                if (!(iter->fib6_flags & RTF_EXPIRES))
                                        return -EEXIST;
                                /* The existing duplicate has RTF_EXPIRES:
                                 * align its expiry state with rt's before
                                 * reporting -EEXIST.
                                 */
                                if (!(rt->fib6_flags & RTF_EXPIRES)) {
                                        fib6_clean_expires(iter);
                                        fib6_remove_gc_list(iter);
                                } else {
                                        fib6_set_expires(iter, rt->expires);
                                        fib6_add_gc_list(iter);
                                }

                                if (rt->fib6_pmtu)
                                        fib6_metric_set(iter, RTAX_MTU,
                                                        rt->fib6_pmtu);
                                return -EEXIST;
                        }
                        /* If we have the same destination and the same metric,
                         * but not the same gateway, then the route we try to
                         * add is sibling to this route, increment our counter
                         * of siblings, and later we will add our route to the
                         * list.
                         * Only static routes (which don't have flag
                         * RTF_EXPIRES) are used for ECMPv6.
                         *
                         * To avoid long list, we only had siblings if the
                         * route have a gateway.
                         */
                        if (rt_can_ecmp &&
                            rt6_qualify_for_ecmp(iter))
                                rt->fib6_nsiblings++;
                }

                /* The scan stops at the first route with a larger metric;
                 * rt is inserted in front of it.
                 */
                if (iter->fib6_metric > rt->fib6_metric)
                        break;

next_iter:
                ins = &iter->fib6_next;
        }

        if (fallback_ins && !found) {
                /* No matching route with same ecmp-able-ness found, replace
                 * first matching route
                 */
                ins = fallback_ins;
                iter = rcu_dereference_protected(*ins,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                found++;
        }

        /* Reset round-robin state, if necessary */
        if (ins == &fn->leaf)
                fn->rr_ptr = NULL;

        /* Link this route to others same route. */
        if (rt->fib6_nsiblings) {
                unsigned int fib6_nsiblings;
                struct fib6_info *sibling, *temp_sibling;

                /* Find the first route that have the same metric */
                sibling = leaf;
                /* notify_sibling_rt stays true only if that route is the
                 * very first one in the node; it is cleared as soon as the
                 * search moves past the first entry.
                 */
                notify_sibling_rt = true;
                while (sibling) {
                        if (sibling->fib6_metric == rt->fib6_metric &&
                            rt6_qualify_for_ecmp(sibling)) {
                                list_add_tail_rcu(&rt->fib6_siblings,
                                                  &sibling->fib6_siblings);
                                break;
                        }
                        sibling = rcu_dereference_protected(sibling->fib6_next,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                        notify_sibling_rt = false;
                }
                /* For each sibling in the list, increment the counter of
                 * siblings. BUG() if counters does not match, list of siblings
                 * is broken!
                 */
                fib6_nsiblings = 0;
                list_for_each_entry_safe(sibling, temp_sibling,
                                         &rt->fib6_siblings, fib6_siblings) {
                        sibling->fib6_nsiblings++;
                        BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
                        fib6_nsiblings++;
                }
                BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
                rt6_multipath_rebalance(temp_sibling);
        }

        /*
         *        insert node
         */
        if (!replace) {
                if (!add)
                        pr_warn("NLM_F_CREATE should be set when creating new route\n");

                /* Also reached via goto from the replace branch below when
                 * nothing matched but creation is allowed.
                 */
add:
                nlflags |= NLM_F_CREATE;

                /* The route should only be notified if it is the first
                 * route in the node or if it is added as a sibling
                 * route to the first route in the node.
                 */
                if (!info->skip_notify_kernel &&
                    (notify_sibling_rt || ins == &fn->leaf)) {
                        enum fib_event_type fib_event;

                        if (notify_sibling_rt)
                                fib_event = FIB_EVENT_ENTRY_APPEND;
                        else
                                fib_event = FIB_EVENT_ENTRY_REPLACE;
                        err = call_fib6_entry_notifiers(info->nl_net,
                                                        fib_event, rt,
                                                        extack);
                        if (err) {
                                struct fib6_info *sibling, *next_sibling;

                                /* If the route has siblings, then it first
                                 * needs to be unlinked from them.
                                 */
                                if (!rt->fib6_nsiblings)
                                        return err;

                                /* Undo the sibling accounting done above. */
                                list_for_each_entry_safe(sibling, next_sibling,
                                                         &rt->fib6_siblings,
                                                         fib6_siblings)
                                        sibling->fib6_nsiblings--;
                                rt->fib6_nsiblings = 0;
                                list_del_rcu(&rt->fib6_siblings);
                                rt6_multipath_rebalance(next_sibling);
                                return err;
                        }
                }

                /* rt is fully set up (next pointer, reference, owning node)
                 * before it is published through *ins.
                 */
                rcu_assign_pointer(rt->fib6_next, iter);
                fib6_info_hold(rt);
                rcu_assign_pointer(rt->fib6_node, fn);
                rcu_assign_pointer(*ins, rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
                info->nl_net->ipv6.rt6_stats->fib_rt_entries++;

                /* First real route in this node: flag it and count it. */
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }

        } else {
                int nsiblings;

                if (!found) {
                        if (add)
                                goto add;
                        pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
                        return -ENOENT;
                }

                /* Kernel notifiers are only told when the first route of
                 * the node is the one being replaced.
                 */
                if (!info->skip_notify_kernel && ins == &fn->leaf) {
                        err = call_fib6_entry_notifiers(info->nl_net,
                                                        FIB_EVENT_ENTRY_REPLACE,
                                                        rt, extack);
                        if (err)
                                return err;
                }

                /* rt takes over iter's place in the list. */
                fib6_info_hold(rt);
                rcu_assign_pointer(rt->fib6_node, fn);
                rt->fib6_next = iter->fib6_next;
                rcu_assign_pointer(*ins, rt);
                if (!info->skip_notify)
                        inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
                if (!(fn->fn_flags & RTN_RTINFO)) {
                        info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
                        fn->fn_flags |= RTN_RTINFO;
                }
                /* Remember the sibling count before the old route is torn
                 * down and its reference dropped.
                 */
                nsiblings = iter->fib6_nsiblings;
                iter->fib6_node = NULL;
                fib6_purge_rt(iter, fn, info->nl_net);
                if (rcu_access_pointer(fn->rr_ptr) == iter)
                        fn->rr_ptr = NULL;
                fib6_info_release(iter);

                if (nsiblings) {
                        /* Replacing an ECMP route, remove all siblings */
                        ins = &rt->fib6_next;
                        iter = rcu_dereference_protected(*ins,
                                    lockdep_is_held(&rt->fib6_table->tb6_lock));
                        while (iter) {
                                if (iter->fib6_metric > rt->fib6_metric)
                                        break;
                                if (rt6_qualify_for_ecmp(iter)) {
                                        /* Unlink iter; ins stays put so the
                                         * next read of *ins yields iter's
                                         * successor.
                                         */
                                        *ins = iter->fib6_next;
                                        iter->fib6_node = NULL;
                                        fib6_purge_rt(iter, fn, info->nl_net);
                                        if (rcu_access_pointer(fn->rr_ptr) == iter)
                                                fn->rr_ptr = NULL;
                                        fib6_info_release(iter);
                                        nsiblings--;
                                        info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
                                } else {
                                        ins = &iter->fib6_next;
                                }
                                iter = rcu_dereference_protected(*ins,
                                        lockdep_is_held(&rt->fib6_table->tb6_lock));
                        }
                        /* Every sibling counted on the old route should
                         * have been found and removed.
                         */
                        WARN_ON(nsiblings != 0);
                }
        }

        return 0;
}

/* Arm the per-namespace FIB timer one ip6_rt_gc_interval from now, but only
 * if it is not already pending and @rt carries RTF_EXPIRES.
 */
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
        if (timer_pending(&net->ipv6.ip6_fib_timer))
                return;

        if (!(rt->fib6_flags & RTF_EXPIRES))
                return;

        mod_timer(&net->ipv6.ip6_fib_timer,
                  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

/* Arm the per-namespace FIB timer one ip6_rt_gc_interval from now unless it
 * is already pending.
 */
void fib6_force_start_gc(struct net *net)
{
        if (timer_pending(&net->ipv6.ip6_fib_timer))
                return;

        mod_timer(&net->ipv6.ip6_fib_timer,
                  jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
}

/* Write @sernum into fn_sernum of @rt's node and of every ancestor reached
 * by following ->parent until it is NULL. The lockdep annotations expect
 * @rt's table tb6_lock to be held.
 */
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
                                           int sernum)
{
        struct fib6_node *node;

        node = rcu_dereference_protected(rt->fib6_node,
                                lockdep_is_held(&rt->fib6_table->tb6_lock));

        /* paired with smp_rmb() in fib6_get_cookie_safe() */
        smp_wmb();

        for (; node;
             node = rcu_dereference_protected(node->parent,
                                lockdep_is_held(&rt->fib6_table->tb6_lock)))
                WRITE_ONCE(node->fn_sernum, sernum);
}

/* Obtain a new serial number for @net and stamp it on @rt's node and all
 * of that node's ancestors.
 */
void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
{
        int sernum = fib6_new_sernum(net);

        __fib6_update_sernum_upto_root(rt, sernum);
}

/* allow ipv4 to update sernum via ipv6_stub */
void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
{
        spin_lock_bh(&f6i->fib6_table->tb6_lock);
        fib6_update_sernum_upto_root(net, f6i);
        spin_unlock_bh(&f6i->fib6_table->tb6_lock);
}

/*
 *        Add routing information to the routing tree.
 *        <destination addr>/<source addr>
 *        with source addr info in sub-trees
 *        Need to own table->tb6_lock
 */

int fib6_add(struct fib6_node *root, struct fib6_info *rt,
             struct nl_info *info, struct netlink_ext_ack *extack)
{
        struct fib6_table *table = rt->fib6_table;
        struct fib6_node *fn;
#ifdef CONFIG_IPV6_SUBTREES
        struct fib6_node *pn = NULL;
#endif
        int err = -ENOMEM;
        int allow_create = 1;
        int replace_required = 0;

        if (info->nlh) {
                if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
                        allow_create = 0;
                if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
                        replace_required = 1;
        }
        if (!allow_create && !replace_required)
                pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");

        fn = fib6_add_1(info->nl_net, table, root,
                        &rt->fib6_dst.addr, rt->fib6_dst.plen,
                        offsetof(struct fib6_info, fib6_dst), allow_create,
                        replace_required, extack);
        if (IS_ERR(fn)) {
                err = PTR_ERR(fn);
                fn = NULL;
                goto out;
        }

#ifdef CONFIG_IPV6_SUBTREES
        pn = fn;

        if (rt->fib6_src.plen) {
                struct fib6_node *sn;

                if (!rcu_access_pointer(fn->subtree)) {
                        struct fib6_node *sfn;

                        /*
                         * Create subtree.
                         *
                         *                fn[main tree]
                         *                |
                         *                sfn[subtree root]
                         *                   \
                         *                    sn[new leaf node]
                         */

                        /* Create subtree root node */
                        sfn = node_alloc(info->nl_net);
                        if (!sfn)
                                goto failure;

                        fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
                        rcu_assign_pointer(sfn->leaf,
                                           info->nl_net->ipv6.fib6_null_entry);
                        sfn->fn_flags = RTN_ROOT;

                        /* Now add the first leaf node to new subtree */

                        sn = fib6_add_1(info->nl_net, table, sfn,
                                        &rt->fib6_src.addr, rt->fib6_src.plen,
                                        offsetof(struct fib6_info, fib6_src),
                                        allow_create, replace_required, extack);

                        if (IS_ERR(sn)) {
                                /* If it is failed, discard just allocated
                                   root, and then (in failure) stale node
                                   in main tree.
                                 */
                                node_free_immediate(info->nl_net, sfn);
                                err = PTR_ERR(sn);
                                goto failure;
                        }

                        /* Now link new subtree to main tree */
                        rcu_assign_pointer(sfn->parent, fn);
                        rcu_assign_pointer(fn->subtree, sfn);
                } else {
                        sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
                                        &rt->fib6_src.addr, rt->fib6_src.plen,
                                        offsetof(struct fib6_info, fib6_src),
                                        allow_create, replace_required, extack);

                        if (IS_ERR(sn)) {
                                err = PTR_ERR(sn);
                                goto failure;
                        }
                }

                if (!rcu_access_pointer(fn->leaf)) {
                        if (fn->fn_flags & RTN_TL_ROOT) {
                                /* put back null_entry for root node */
                                rcu_assign_pointer(fn->leaf,
                                            info->nl_net->ipv6.fib6_null_entry);
                        } else {
                                fib6_info_hold(rt);
                                rcu_assign_pointer(fn->leaf, rt);
                        }
                }
                fn = sn;
        }
#endif

        err = fib6_add_rt2node(fn, rt, info, extack);
        if (!err) {
                if (rt->nh)
                        list_add(&rt->nh_list, &rt->nh->f6i_list);
                __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));

                if (rt->fib6_flags & RTF_EXPIRES)
                        fib6_add_gc_list(rt);

                fib6_start_gc(info->nl_net, rt);
        }

out:
        if (err) {
#ifdef CONFIG_IPV6_SUBTREES
                /*
                 * If fib6_add_1 has cleared the old leaf pointer in the
                 * super-tree leaf node we have to find a new one for it.
                 */
                if (pn != fn) {
                        struct fib6_info *pn_leaf =
                                rcu_dereference_protected(pn->leaf,
                                    lockdep_is_held(&table->tb6_lock));
                        if (pn_leaf == rt) {
                                pn_leaf = NULL;
                                RCU_INIT_POINTER(pn->leaf, NULL);
                                fib6_info_release(rt);
                        }
                        if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
                                pn_leaf = fib6_find_prefix(info->nl_net, table,
                                                           pn);
                                if (!pn_leaf)
                                        pn_leaf =
                                            info->nl_net->ipv6.fib6_null_entry;
                                fib6_info_hold(pn_leaf);
                                rcu_assign_pointer(pn->leaf, pn_leaf);
                        }
                }
#endif
                goto failure;
        } else if (fib6_requires_src(rt)) {
                fib6_routes_require_src_inc(info->nl_net);
        }
        return err;

failure:
        /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
         * 1. fn is an intermediate node and we failed to add the new
         * route to it in both subtree creation failure and fib6_add_rt2node()
         * failure case.
         * 2. fn is the root node in the table and we fail to add the first
         * default route to it.
         */
        if (fn &&
            (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
             (fn->fn_flags & RTN_TL_ROOT &&
              !rcu_access_pointer(fn->leaf))))
                fib6_repair_tree(info->nl_net, table, fn);
        return err;
}

/*
 *        Routing tree lookup
 *
 */

/*
 * One step of a tree lookup: which key inside a fib6_info to compare and
 * the address to compare it against.  fib6_node_lookup() passes an array of
 * these to fib6_node_lookup_1(); an entry with offset == 0 ends the array.
 */
struct lookup_args {
        int                        offset;                /* key offset on fib6_info */
        const struct in6_addr        *addr;                /* search key                        */
};

/*
 * Look up args->addr in the tree rooted at @root.
 *
 * The tree is first descended as far as the address bits lead, then walked
 * back up through the parent pointers.  On the way up, the first node that
 * carries route info (RTN_RTINFO) and whose leaf key matches the address
 * for key->plen bits is returned.  With CONFIG_IPV6_SUBTREES, a matching
 * node that has a subtree is resolved by recursing into that subtree with
 * the next entry of @args; if the recursion fails the walk keeps
 * backtracking.
 *
 * Returns NULL when @args points at the sentinel entry (offset == 0) or
 * when nothing matched before a node flagged RTN_ROOT was reached.
 *
 * Pointers are read with rcu_dereference(), so the caller has to be inside
 * an RCU read-side critical section.
 */
static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
                                            struct lookup_args *args)
{
        struct fib6_node *fn;
        __be32 dir;

        /* Sentinel entry: there is no key left to search with. */
        if (unlikely(args->offset == 0))
                return NULL;

        /*
         *        Descend on a tree
         */

        fn = root;

        for (;;) {
                struct fib6_node *next;

                /* The bit at position fn->fn_bit selects right (set) or left. */
                dir = addr_bit_set(args->addr, fn->fn_bit);

                next = dir ? rcu_dereference(fn->right) :
                             rcu_dereference(fn->left);

                if (next) {
                        fn = next;
                        continue;
                }
                break;
        }

        /* Walk back towards the root looking for a node that really matches. */
        while (fn) {
                struct fib6_node *subtree = FIB6_SUBTREE(fn);

                if (subtree || fn->fn_flags & RTN_RTINFO) {
                        struct fib6_info *leaf = rcu_dereference(fn->leaf);
                        struct rt6key *key;

                        /* No leaf to compare against on this node. */
                        if (!leaf)
                                goto backtrack;

                        /* Key selected by args->offset (e.g. fib6_dst or fib6_src). */
                        key = (struct rt6key *) ((u8 *)leaf + args->offset);

                        if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
#ifdef CONFIG_IPV6_SUBTREES
                                if (subtree) {
                                        struct fib6_node *sfn;
                                        /* Continue with the next key in the subtree. */
                                        sfn = fib6_node_lookup_1(subtree,
                                                                 args + 1);
                                        if (!sfn)
                                                goto backtrack;
                                        fn = sfn;
                                }
#endif
                                if (fn->fn_flags & RTN_RTINFO)
                                        return fn;
                        }
                }
backtrack:
                /* Never climb above the root of this (sub)tree. */
                if (fn->fn_flags & RTN_ROOT)
                        break;

                fn = rcu_dereference(fn->parent);
        }

        return NULL;
}

/*
 * Find the tree node to use for a (daddr, saddr) pair.
 *
 * Must be called with rcu_read_lock() held.
 *
 * A key list is built for fib6_node_lookup_1(): destination first, then
 * (only with CONFIG_IPV6_SUBTREES) source, then a zero-offset terminator.
 * When @daddr is NULL the destination entry is skipped.  If the lookup
 * finds nothing, or ends on a node flagged RTN_TL_ROOT, @root is returned,
 * so the result is never NULL.
 */
struct fib6_node *fib6_node_lookup(struct fib6_node *root,
                                   const struct in6_addr *daddr,
                                   const struct in6_addr *saddr)
{
        struct lookup_args keys[] = {
                {
                        .offset = offsetof(struct fib6_info, fib6_dst),
                        .addr = daddr,
                },
#ifdef CONFIG_IPV6_SUBTREES
                {
                        .offset = offsetof(struct fib6_info, fib6_src),
                        .addr = saddr,
                },
#endif
                {
                        .offset = 0,        /* sentinel */
                }
        };
        struct fib6_node *found;

        found = fib6_node_lookup_1(root, &keys[daddr ? 0 : 1]);
        if (found && !(found->fn_flags & RTN_TL_ROOT))
                return found;

        return root;
}

/*
 *        Get node with specified destination prefix (and source prefix,
 *        if subtrees are used)
 *        exact_match == true means we try to find fn with exact match of
 *        the passed in prefix addr
 *        exact_match == false means we try to find fn with longest prefix
 *        match of the passed in prefix addr. This is useful for finding fn
 *        for cached route as it will be stored in the exception table under
 *        the node with longest prefix length.
 */


/*
 * Walk down from @root along the bits of @addr looking for the node whose
 * fn_bit equals @plen.  @offset selects which rt6key inside the leaf
 * fib6_info is compared.
 *
 * Returns the node with fn_bit == plen when one is reached.  Otherwise
 * returns NULL if @exact_match is set, or else the last node flagged
 * RTN_RTINFO that was passed on the way down (which may also be NULL).
 */
static struct fib6_node *fib6_locate_1(struct fib6_node *root,
                                       const struct in6_addr *addr,
                                       int plen, int offset,
                                       bool exact_match)
{
        struct fib6_node *node = root;
        struct fib6_node *best = NULL;

        while (node) {
                struct fib6_info *leaf = rcu_dereference(node->leaf);

                if (leaf) {
                        struct rt6key *key;

                        key = (struct rt6key *)((u8 *)leaf + offset);

                        /* Stop once the prefix no longer covers this node. */
                        if (plen < node->fn_bit ||
                            !ipv6_prefix_equal(&key->addr, addr,
                                               node->fn_bit))
                                break;

                        if (plen == node->fn_bit)
                                return node;

                        /* Remember the deepest node seen that holds routes. */
                        if (node->fn_flags & RTN_RTINFO)
                                best = node;
                } else if (plen <= node->fn_bit) {
                        /* Node is being deleted and we cannot go deeper. */
                        break;
                }

                /* More bits to go: pick the child selected by the next bit. */
                node = addr_bit_set(addr, node->fn_bit) ?
                                rcu_dereference(node->right) :
                                rcu_dereference(node->left);
        }

        return exact_match ? NULL : best;
}

/*
 * Locate the node for destination prefix daddr/dst_len and, when src_len is
 * non-zero and CONFIG_IPV6_SUBTREES is enabled, for source prefix
 * saddr/src_len inside that node's subtree.  @exact_match is passed through
 * to fib6_locate_1().
 *
 * Returns the node only if it is flagged RTN_RTINFO, NULL otherwise.
 */
struct fib6_node *fib6_locate(struct fib6_node *root,
                              const struct in6_addr *daddr, int dst_len,
                              const struct in6_addr *saddr, int src_len,
                              bool exact_match)
{
        struct fib6_node *node;

        node = fib6_locate_1(root, daddr, dst_len,
                             offsetof(struct fib6_info, fib6_dst),
                             exact_match);

#ifdef CONFIG_IPV6_SUBTREES
        if (src_len) {
                struct fib6_node *sub;

                WARN_ON(saddr == NULL);

                /* Only descend when a destination node with a subtree exists. */
                sub = node ? FIB6_SUBTREE(node) : NULL;
                if (sub)
                        node = fib6_locate_1(sub, saddr, src_len,
                                             offsetof(struct fib6_info,
                                                      fib6_src),
                                             exact_match);
        }
#endif

        if (!node || !(node->fn_flags & RTN_RTINFO))
                return NULL;

        return node;
}


/*
 *        Deletion
 *
 */

/*
 * Pick a leaf for a node that has none of its own.
 *
 * A node flagged RTN_ROOT gets the per-netns null entry.  Otherwise the
 * leaf of the left child is used, or of the right child when there is no
 * left one; if the node has no children the search moves on to its subtree
 * root.  Returns NULL when nothing is found.
 *
 * Pointers are read under table->tb6_lock.
 */
static struct fib6_info *fib6_find_prefix(struct net *net,
                                         struct fib6_table *table,
                                         struct fib6_node *fn)
{
        struct fib6_node *node;

        if (fn->fn_flags & RTN_ROOT)
                return net->ipv6.fib6_null_entry;

        for (node = fn; node; node = FIB6_SUBTREE(node)) {
                struct fib6_node *child;

                child = rcu_dereference_protected(node->left,
                                    lockdep_is_held(&table->tb6_lock));
                if (!child)
                        child = rcu_dereference_protected(node->right,
                                    lockdep_is_held(&table->tb6_lock));
                if (child)
                        return rcu_dereference_protected(child->leaf,
                                        lockdep_is_held(&table->tb6_lock));
        }

        return NULL;
}

/*
 *        Called to trim the tree of intermediate nodes when possible. "fn"
 *        is the node we want to try and remove.
 *        Need to own table->tb6_lock
 *
 *        For the top-level root (RTN_TL_ROOT) the leaf is simply reset to
 *        the null entry and fn is returned.  For any other node the loop
 *        below either keeps the node and gives it a borrowed leaf, or
 *        unlinks and frees it and then retries with its parent if the
 *        parent has neither route info nor a subtree.
 *
 *        Returns the node at which the repair stopped: fn itself for the
 *        top-level root, otherwise the parent of the last node examined.
 */

static struct fib6_node *fib6_repair_tree(struct net *net,
                                          struct fib6_table *table,
                                          struct fib6_node *fn)
{
        int children;
        int nstate;
        struct fib6_node *child;
        struct fib6_walker *w;
        int iter = 0;

        /* Set fn->leaf to null_entry for root node. */
        if (fn->fn_flags & RTN_TL_ROOT) {
                rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
                return fn;
        }

        for (;;) {
                /* Snapshot fn, its parent and their links under tb6_lock. */
                struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn = rcu_dereference_protected(fn->parent,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
                                            lockdep_is_held(&table->tb6_lock));
                struct fib6_info *new_fn_leaf;

                pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
                iter++;

                /* fn is expected to be a leafless, non-root intermediate node. */
                WARN_ON(fn->fn_flags & RTN_RTINFO);
                WARN_ON(fn->fn_flags & RTN_TL_ROOT);
                WARN_ON(fn_leaf);

                /*
                 * children is a bitmask: bit 0 (1) = right child present,
                 * bit 1 (2) = left child present.  child ends up as the
                 * left child if there is one, else the right child.
                 */
                children = 0;
                child = NULL;
                if (fn_r) {
                        child = fn_r;
                        children |= 1;
                }
                if (fn_l) {
                        child = fn_l;
                        children |= 2;
                }

                /*
                 * The node has to stay (two children, a subtree, or a
                 * subtree root with a child): give it a leaf found by
                 * fib6_find_prefix() and stop.
                 */
                if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
                    /* Subtree root (i.e. fn) may have one child */
                    || (children && fn->fn_flags & RTN_ROOT)
#endif
                    ) {
                        new_fn_leaf = fib6_find_prefix(net, table, fn);
#if RT6_DEBUG >= 2
                        if (!new_fn_leaf) {
                                WARN_ON(!new_fn_leaf);
                                new_fn_leaf = net->ipv6.fib6_null_entry;
                        }
#endif
                        fib6_info_hold(new_fn_leaf);
                        rcu_assign_pointer(fn->leaf, new_fn_leaf);
                        return pn;
                }

                /*
                 * Unlink fn from its parent.  nstate is the state a walker
                 * currently parked on fn gets when it is moved to pn.
                 */
#ifdef CONFIG_IPV6_SUBTREES
                if (FIB6_SUBTREE(pn) == fn) {
                        /* fn was the root of pn's subtree: drop the subtree. */
                        WARN_ON(!(fn->fn_flags & RTN_ROOT));
                        RCU_INIT_POINTER(pn->subtree, NULL);
                        nstate = FWS_L;
                } else {
                        WARN_ON(fn->fn_flags & RTN_ROOT);
#endif
                        /* Splice fn's only child (or NULL) into fn's slot. */
                        if (pn_r == fn)
                                rcu_assign_pointer(pn->right, child);
                        else if (pn_l == fn)
                                rcu_assign_pointer(pn->left, child);
#if RT6_DEBUG >= 2
                        else
                                WARN_ON(1);
#endif
                        if (child)
                                rcu_assign_pointer(child->parent, pn);
                        nstate = FWS_R;
#ifdef CONFIG_IPV6_SUBTREES
                }
#endif

                /* Move any walker standing on fn off the node being freed. */
                read_lock(&net->ipv6.fib6_walker_lock);
                FOR_WALKERS(net, w) {
                        if (!child) {
                                /* fn had no children: park the walker on pn. */
                                if (w->node == fn) {
                                        pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
                                                 w, w->state, nstate);
                                        w->node = pn;
                                        w->state = nstate;
                                }
                        } else {
                                /* fn had one child: the walker moves to it. */
                                if (w->node == fn) {
                                        w->node = child;
                                        if (children&2) {
                                                /* The remaining child is the left one. */
                                                pr_debug("W %p adjusted by delnode 2, s=%d\n",
                                                         w, w->state);
                                                w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
                                        } else {
                                                /* The remaining child is the right one. */
                                                pr_debug("W %p adjusted by delnode 2, s=%d\n",
                                                         w, w->state);
                                                w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
                                        }
                                }
                        }
                }
                read_unlock(&net->ipv6.fib6_walker_lock);

                node_free(net, fn);
                /* The parent is still needed: stop here. */
                if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
                        return pn;

                /*
                 * The parent is a pure intermediate node too: drop its leaf
                 * and try to trim it on the next iteration.
                 */
                RCU_INIT_POINTER(pn->leaf, NULL);
                fib6_info_release(pn_leaf);
                fn = pn;
        }
}

/*
 * Unlink the route that *rtp points to from node fn.
 *
 * @table: table owning fn; the rcu_dereference_protected() calls below
 *         assert that table->tb6_lock is held
 * @fn:    node whose route list contains the route
 * @rtp:   link (fn->leaf or a previous route's fib6_next) pointing to the
 *         route being removed
 * @info:  netlink info; supplies the netns and the skip_notify /
 *         skip_notify_kernel flags
 *
 * Besides unlinking, this updates the rt6 statistics, the node's
 * round-robin pointer, the multipath sibling list and any tree walker
 * positioned on the route, repairs the tree if the node became empty,
 * sends notifications and finally drops a reference on the route with
 * fib6_info_release().
 */
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
                           struct fib6_info __rcu **rtp, struct nl_info *info)
{
        struct fib6_info *leaf, *replace_rt = NULL;
        struct fib6_walker *w;
        struct fib6_info *rt = rcu_dereference_protected(*rtp,
                                    lockdep_is_held(&table->tb6_lock));
        struct net *net = info->nl_net;
        bool notify_del = false;

        /* If the deleted route is the first in the node and it is not part of
         * a multipath route, then we need to replace it with the next route
         * in the node, if exists.
         */
        leaf = rcu_dereference_protected(fn->leaf,
                                         lockdep_is_held(&table->tb6_lock));
        if (leaf == rt && !rt->fib6_nsiblings) {
                if (rcu_access_pointer(rt->fib6_next))
                        replace_rt = rcu_dereference_protected(rt->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                else
                        notify_del = true;
        }

        /* Unlink it */
        *rtp = rt->fib6_next;
        rt->fib6_node = NULL;
        net->ipv6.rt6_stats->fib_rt_entries--;
        net->ipv6.rt6_stats->fib_discarded_routes++;

        /* Reset round-robin state, if necessary */
        if (rcu_access_pointer(fn->rr_ptr) == rt)
                fn->rr_ptr = NULL;

        /* Remove this entry from other siblings */
        if (rt->fib6_nsiblings) {
                struct fib6_info *sibling, *next_sibling;

                /* The route is deleted from a multipath route. If this
                 * multipath route is the first route in the node, then we need
                 * to emit a delete notification. Otherwise, we need to skip
                 * the notification.
                 */
                if (rt->fib6_metric == leaf->fib6_metric &&
                    rt6_qualify_for_ecmp(leaf))
                        notify_del = true;
                /* Every remaining sibling now has one sibling fewer. */
                list_for_each_entry_safe(sibling, next_sibling,
                                         &rt->fib6_siblings, fib6_siblings)
                        sibling->fib6_nsiblings--;
                rt->fib6_nsiblings = 0;
                list_del_rcu(&rt->fib6_siblings);
                rt6_multipath_rebalance(next_sibling);
        }

        /* Adjust walkers */
        read_lock(&net->ipv6.fib6_walker_lock);
        FOR_WALKERS(net, w) {
                /* A walker sitting on this route moves on to the next one. */
                if (w->state == FWS_C && w->leaf == rt) {
                        pr_debug("walker %p adjusted by delroute\n", w);
                        w->leaf = rcu_dereference_protected(rt->fib6_next,
                                            lockdep_is_held(&table->tb6_lock));
                        if (!w->leaf)
                                w->state = FWS_U;
                }
        }
        read_unlock(&net->ipv6.fib6_walker_lock);

        /* If it was last route, call fib6_repair_tree() to:
         * 1. For root node, put back null_entry as how the table was created.
         * 2. For other nodes, expunge its radix tree node.
         */
        if (!rcu_access_pointer(fn->leaf)) {
                if (!(fn->fn_flags & RTN_TL_ROOT)) {
                        fn->fn_flags &= ~RTN_RTINFO;
                        net->ipv6.rt6_stats->fib_route_nodes--;
                }
                /* From here on fn is whatever node the repair stopped at. */
                fn = fib6_repair_tree(net, table, fn);
        }

        fib6_purge_rt(rt, fn, net);

        /* In-kernel FIB notifier chain: a delete or a replace event, never both. */
        if (!info->skip_notify_kernel) {
                if (notify_del)
                        call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
                                                  rt, NULL);
                else if (replace_rt)
                        call_fib6_entry_notifiers_replace(net, replace_rt);
        }
        /* RTM_DELROUTE notification via inet6_rt_notify(). */
        if (!info->skip_notify)
                inet6_rt_notify(RTM_DELROUTE, rt, info, 0);

        fib6_info_release(rt);
}

/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
        struct net *net = info->nl_net;
        struct fib6_info __rcu **rtp;
        struct fib6_info __rcu **rtp_next;
        struct fib6_table *table;
        struct fib6_node *fn;

        if (rt == net->ipv6.fib6_null_entry)
                return -ENOENT;

        table = rt->fib6_table;
        fn = rcu_dereference_protected(rt->fib6_node,
                                       lockdep_is_held(&table->tb6_lock));
        if (!fn)
                return -ENOENT;

        WARN_ON(!(fn->fn_flags & RTN_RTINFO));

        /*
         *        Walk the leaf entries looking for ourself
         */

        for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
                struct fib6_info *cur = rcu_dereference_protected(*rtp,
                                        lockdep_is_held(&table->tb6_lock));
                if (rt == cur) {
                        if (fib6_requires_src(cur))
                                fib6_routes_require_src_dec(info->nl_net);
                        fib6_del_route(table, fn, rtp, info);
                        return 0;
                }
                rtp_next = &cur->fib6_next;
        }
        return -ENOENT;
}

/*
 *        Tree traversal function.
 *
 *        Certainly, it is not interrupt safe.
 *        However, it is internally reenterable wrt itself and fib6_add/fib6_del.
 *        It means, that we can modify tree during walking
 *        and use this function for garbage collection, clone pruning,
 *        cleaning tree when a device goes down etc. etc.
 *
 *        It guarantees that every node will be traversed,
 *        and that it will be traversed only once.
 *
 *        Callback function w->func may return:
 *        0 -> continue walking.
 *        positive value -> walking is suspended (used by tree dumps,
 *        and probably by gc, if it will be split to several slices)
 *        negative value -> terminate walking.
 *
 *        The function itself returns:
 *        0   -> walk is complete.
 *        >0  -> walk is incomplete (i.e. suspended)
 *        <0  -> walk is terminated by an error.
 *
 *        This function is called with tb6_lock held.
 *
 *        The position is kept in w->node / w->state / w->leaf.  For each
 *        node the states run in the order: subtree (FWS_S, only with
 *        CONFIG_IPV6_SUBTREES), left child (FWS_L), right child (FWS_R),
 *        the node's own routes (FWS_C), then up to the parent (FWS_U).
 */

static int fib6_walk_continue(struct fib6_walker *w)
{
        struct fib6_node *fn, *pn, *left, *right;

        /* w->root should always be table->tb6_root */
        WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));

        for (;;) {
                fn = w->node;
                if (!fn)
                        return 0;

                switch (w->state) {
#ifdef CONFIG_IPV6_SUBTREES
                case FWS_S:
                        /* Enter the subtree first; w->state stays FWS_S. */
                        if (FIB6_SUBTREE(fn)) {
                                w->node = FIB6_SUBTREE(fn);
                                continue;
                        }
                        w->state = FWS_L;
                        fallthrough;
#endif
                case FWS_L:
                        /* Descend into the left child, restarting its state. */
                        left = rcu_dereference_protected(fn->left, 1);
                        if (left) {
                                w->node = left;
                                w->state = FWS_INIT;
                                continue;
                        }
                        w->state = FWS_R;
                        fallthrough;
                case FWS_R:
                        /* Then the right child. */
                        right = rcu_dereference_protected(fn->right, 1);
                        if (right) {
                                w->node = right;
                                w->state = FWS_INIT;
                                continue;
                        }
                        /* No more children: visit this node's own routes. */
                        w->state = FWS_C;
                        w->leaf = rcu_dereference_protected(fn->leaf, 1);
                        fallthrough;
                case FWS_C:
                        if (w->leaf && fn->fn_flags & RTN_RTINFO) {
                                int err;

                                /* Consume one unit of w->skip instead of calling back. */
                                if (w->skip) {
                                        w->skip--;
                                        goto skip;
                                }

                                err = w->func(w);
                                if (err)
                                        return err;

                                /*
                                 * Stay in FWS_C: the callback runs again
                                 * for as long as w->leaf is non-NULL.
                                 */
                                w->count++;
                                continue;
                        }
skip:
                        w->state = FWS_U;
                        fallthrough;
                case FWS_U:
                        /* Back at the starting node: the walk is complete. */
                        if (fn == w->root)
                                return 0;
                        pn = rcu_dereference_protected(fn->parent, 1);
                        left = rcu_dereference_protected(pn->left, 1);
                        right = rcu_dereference_protected(pn->right, 1);
                        w->node = pn;
                        /* The next state depends on which link of pn led to fn. */
#ifdef CONFIG_IPV6_SUBTREES
                        if (FIB6_SUBTREE(pn) == fn) {
                                WARN_ON(!(fn->fn_flags & RTN_ROOT));
                                w->state = FWS_L;
                                continue;
                        }
#endif
                        if (left == fn) {
                                w->state = FWS_R;
                                continue;
                        }
                        if (right == fn) {
                                w->state = FWS_C;
                                w->leaf = rcu_dereference_protected(w->node->leaf, 1);
                                continue;
                        }
#if RT6_DEBUG >= 2
                        WARN_ON(1);
#endif
                }
        }
}

/*
 * Start a walk at w->root.
 *
 * The walker is linked into the netns walker list for the duration of the
 * walk.  It is unlinked again here unless fib6_walk_continue() returned a
 * positive value (walk suspended), in which case it stays linked.
 *
 * Returns whatever fib6_walk_continue() returned.
 */
static int fib6_walk(struct net *net, struct fib6_walker *w)
{
        int ret;

        w->node = w->root;
        w->state = FWS_INIT;

        fib6_walker_link(net, w);

        ret = fib6_walk_continue(w);
        if (ret > 0)
                return ret;

        fib6_walker_unlink(net, w);
        return ret;
}

/*
 * Walker callback used by fib6_clean_tree(): applies the cleaner's func to
 * the routes of the node the walker is standing on.
 *
 * The enclosing struct fib6_cleaner is recovered from @w.  If the cleaner
 * carries a serial number other than FIB6_NO_SERNUM_CHANGE, the node's
 * fn_sernum is updated to it first.
 *
 * c->func return values handled here:
 *   -1 -> delete the route via fib6_del()
 *   -2 -> skip ahead to the last sibling of a multipath route
 *    0 -> keep the route
 *
 * Always returns 0, so the walk itself is never suspended or aborted from
 * here.
 */
static int fib6_clean_node(struct fib6_walker *w)
{
        int res;
        struct fib6_info *rt;
        struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
        struct nl_info info = {
                .nl_net = c->net,
                .skip_notify = c->skip_notify,
        };

        /* Only write when the value actually changes. */
        if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
            READ_ONCE(w->node->fn_sernum) != c->sernum)
                WRITE_ONCE(w->node->fn_sernum, c->sernum);

        /* Sernum-only pass: no per-route work, mark the node as done. */
        if (!c->func) {
                WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
                w->leaf = NULL;
                return 0;
        }

        /* NOTE(review): for_each_fib6_walker_rt() is defined outside this
         * chunk; presumably it iterates rt over the routes starting at
         * w->leaf -- confirm against the macro.
         */
        for_each_fib6_walker_rt(w) {
                res = c->func(rt, c->arg);
                if (res == -1) {
                        /*
                         * Record the position before deleting:
                         * fib6_del_route() advances any walker whose leaf
                         * is the route being removed.
                         */
                        w->leaf = rt;
                        res = fib6_del(rt, &info);
                        if (res) {
#if RT6_DEBUG >= 2
                                pr_debug("%s: del failed: rt=%p@%p err=%d\n",
                                         __func__, rt,
                                         rcu_access_pointer(rt->fib6_node),
                                         res);
#endif
                                continue;
                        }
                        /* Deleted: return and let the walker call us again. */
                        return 0;
                } else if (res == -2) {
                        if (WARN_ON(!rt->fib6_nsiblings))
                                continue;
                        /* Jump to the last sibling so the loop steps past the group. */
                        rt = list_last_entry(&rt->fib6_siblings,
                                             struct fib6_info, fib6_siblings);
                        continue;
                }
                WARN_ON(res != 0);
        }
        /* Store where the loop stopped back into the walker. */
        w->leaf = rt;
        return 0;
}

/*
 *        Convenient frontend to tree walker.
 *
 *        func is called on each route.
 *                It may return -2 -> skip multipath route.
 *                              -1 -> delete this route.
 *                              0  -> continue walking
 */

static void fib6_clean_tree(struct net *net, struct fib6_node *root,
                            int (*func)(struct fib6_info *, void *arg),
                            int sernum, void *arg, bool skip_notify)
{
        struct fib6_cleaner c;

        c.w.root = root;
        c.w.func = fib6_clean_node;
        c.w.count = 0;
        c.w.skip = 0;
        c.w.skip_in_node = 0;
        c.func = func;
        c.sernum = sernum;
        c.arg = arg;
        c.net = net;
        c.skip_notify = skip_notify;

        fib6_walk(net, &c.w);
}

/*
 * Run fib6_clean_tree() over every table in the netns table hash.
 *
 * The hash chains are traversed under rcu_read_lock(); each table is
 * cleaned with its tb6_lock held (BH disabled).  func, sernum, arg and
 * skip_notify are forwarded unchanged.
 */
static void __fib6_clean_all(struct net *net,
                             int (*func)(struct fib6_info *, void *),
                             int sernum, void *arg, bool skip_notify)
{
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, chain, tb6_hlist) {
                        spin_lock_bh(&tb->tb6_lock);
                        fib6_clean_tree(net, &tb->tb6_root,
                                        func, sernum, arg, skip_notify);
                        spin_unlock_bh(&tb->tb6_lock);
                }
        }
        rcu_read_unlock();
}

/*
 * Apply func to every route of every table in @net, leaving node serial
 * numbers untouched and with notifications enabled (skip_notify = false).
 */
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
                    void *arg)
{
        int sernum = FIB6_NO_SERNUM_CHANGE;

        __fib6_clean_all(net, func, sernum, arg, false);
}

/*
 * Same as fib6_clean_all(), but with skip_notify = true passed down to the
 * deletion path.
 */
void fib6_clean_all_skip_notify(struct net *net,
                                int (*func)(struct fib6_info *, void *),
                                void *arg)
{
        int sernum = FIB6_NO_SERNUM_CHANGE;

        __fib6_clean_all(net, func, sernum, arg, true);
}

/*
 * Stamp every node of every table in @net with a freshly allocated serial
 * number.  No per-route callback is passed (func == NULL), so routes
 * themselves are not examined.
 */
static void fib6_flush_trees(struct net *net)
{
        __fib6_clean_all(net, NULL, fib6_new_sernum(net), NULL, false);
}

/*
 *        Garbage collection
 */

/*
 * Decide whether @rt should be garbage-collected now.
 *
 * Returns -1 if the route carries RTF_EXPIRES with a non-zero expiry time
 * that has already passed (the caller is expected to delete it), 0
 * otherwise.  A route with a pending expiry bumps gc_args->more.  For
 * routes that are kept, the exception table is aged as well.
 */
static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
{
        unsigned long now = jiffies;

        /*
         *        Addrconf expiration check: routes expire even while they
         *        are in use.
         */
        if ((rt->fib6_flags & RTF_EXPIRES) && rt->expires) {
                if (!time_after(now, rt->expires)) {
                        /* Not due yet. */
                        gc_args->more++;
                } else {
                        pr_debug("expiring %p\n", rt);
                        return -1;
                }
        }

        /*
         *        Clones in the exception table are aged too; those are only
         *        aged out when they are not in use now.
         */
        rt6_age_exceptions(rt, gc_args, now);

        return 0;
}

static void fib6_gc_table(struct net *net,
                          struct fib6_table *tb6,
                          struct fib6_gc_args *gc_args)
{
        struct fib6_info *rt;
        struct hlist_node *n;
        struct nl_info info = {
                .nl_net = net,
                .skip_notify = false,
        };

        hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
                if (fib6_age(rt, gc_args) == -1)
                        fib6_del(rt, &info);
}

/*
 * Run fib6_gc_table() on every table in the netns table hash.  Hash chains
 * are traversed under rcu_read_lock(); each table is processed with its
 * tb6_lock held (BH disabled).
 */
static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
{
        unsigned int bucket;

        rcu_read_lock();
        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *tb;

                hlist_for_each_entry_rcu(tb, chain, tb6_hlist) {
                        spin_lock_bh(&tb->tb6_lock);
                        fib6_gc_table(net, tb, gc_args);
                        spin_unlock_bh(&tb->tb6_lock);
                }
        }
        rcu_read_unlock();
}

/*
 * Run one garbage-collection pass over all tables of @net.
 *
 * @expires: gc timeout to use; 0 selects the ip6_rt_gc_interval sysctl
 * @net:     network namespace to collect
 * @force:   if true, wait for fib6_gc_lock; if false, only try the lock
 *           and, when it is busy, re-arm the gc timer HZ jiffies from now
 *           and return without collecting
 *
 * After the pass the time is recorded in ip6_rt_last_gc.  The gc timer is
 * re-armed one gc interval ahead when fib6_gc_all() reported more pending
 * work (gc_args.more != 0), otherwise it is deleted.
 */
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
        struct timer_list *gc_timer = &net->ipv6.ip6_fib_timer;
        struct fib6_gc_args gc_args;
        unsigned long stamp;

        if (!force) {
                if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
                        mod_timer(gc_timer, jiffies + HZ);
                        return;
                }
        } else {
                spin_lock_bh(&net->ipv6.fib6_gc_lock);
        }

        if (expires)
                gc_args.timeout = (int)expires;
        else
                gc_args.timeout = net->ipv6.sysctl.ip6_rt_gc_interval;
        gc_args.more = 0;

        fib6_gc_all(net, &gc_args);

        stamp = jiffies;
        net->ipv6.ip6_rt_last_gc = stamp;

        if (!gc_args.more)
                timer_delete(gc_timer);
        else
                mod_timer(gc_timer,
                          round_jiffies(stamp +
                                        net->ipv6.sysctl.ip6_rt_gc_interval));

        spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}

/*
 * Timer callback for ipv6.ip6_fib_timer: recover the owning struct net
 * from the embedded timer and run a forced GC pass with the default
 * timeout (expires == 0).
 */
static void fib6_gc_timer_cb(struct timer_list *t)
{
        struct net *net = from_timer(net, t, ipv6.ip6_fib_timer);

        fib6_run_gc(0, net, true);
}

/*
 * Per-network-namespace setup of the IPv6 FIB.
 *
 * Initialises the FIB notifier, the GC lock, walker lock/list and GC timer,
 * then allocates the route statistics, the table hash and the main table
 * (plus the local table when CONFIG_IPV6_MULTIPLE_TABLES is set) and finally
 * calls fib6_tables_init().
 *
 * Returns 0 on success, the error from fib6_notifier_init() if that fails,
 * or -ENOMEM if any allocation fails (everything set up so far is undone).
 */
static int __net_init fib6_net_init(struct net *net)
{
        size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
        int err;

        err = fib6_notifier_init(net);
        if (err)
                return err;

        /* Default to 3-tuple */
        net->ipv6.sysctl.multipath_hash_fields =
                FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;

        spin_lock_init(&net->ipv6.fib6_gc_lock);
        rwlock_init(&net->ipv6.fib6_walker_lock);
        INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
        timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);

        net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
        if (!net->ipv6.rt6_stats)
                goto out_notifier;

        /* Avoid false sharing : Use at least a full cache line */
        size = max_t(size_t, size, L1_CACHE_BYTES);

        net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
        if (!net->ipv6.fib_table_hash)
                goto out_rt6_stats;

        net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
                                          GFP_KERNEL);
        if (!net->ipv6.fib6_main_tbl)
                goto out_fib_table_hash;

        /* Main table: root node initially points at the null entry. */
        net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
        rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
                           net->ipv6.fib6_null_entry);
        net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
        INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
        /* Local table: set up exactly like the main table above. */
        net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
                                           GFP_KERNEL);
        if (!net->ipv6.fib6_local_tbl)
                goto out_fib6_main_tbl;
        net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
        rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
                           net->ipv6.fib6_null_entry);
        net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
                RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
        inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
        INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
#endif
        fib6_tables_init(net);

        return 0;

        /*
         * Error unwind, in reverse order of acquisition.  The first label is
         * only needed when the local-table allocation exists, since that is
         * the only failure point after the main table has been allocated.
         */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_fib6_main_tbl:
        kfree(net->ipv6.fib6_main_tbl);
#endif
out_fib_table_hash:
        kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
        kfree(net->ipv6.rt6_stats);
out_notifier:
        fib6_notifier_exit(net);
        return -ENOMEM;
}

/*
 * Per-network-namespace teardown of the IPv6 FIB: stop the GC timer
 * (waiting for a running callback), unhash and free every table in the
 * table hash, then release the hash, the statistics and the notifier.
 */
static void fib6_net_exit(struct net *net)
{
        unsigned int bucket;

        timer_delete_sync(&net->ipv6.ip6_fib_timer);

        for (bucket = 0; bucket < FIB6_TABLE_HASHSZ; bucket++) {
                struct hlist_head *chain = &net->ipv6.fib_table_hash[bucket];
                struct fib6_table *table;
                struct hlist_node *next;

                /* _safe iteration: each table is unlinked and freed here. */
                hlist_for_each_entry_safe(table, next, chain, tb6_hlist) {
                        hlist_del(&table->tb6_hlist);
                        fib6_free_table(table);
                }
        }

        kfree(net->ipv6.fib_table_hash);
        kfree(net->ipv6.rt6_stats);
        fib6_notifier_exit(net);
}

/* Per-network-namespace init/exit hooks for the IPv6 FIB. */
static struct pernet_operations fib6_net_ops = {
        .init = fib6_net_init,
        .exit = fib6_net_exit,
};

/*
 * rtnetlink handlers registered by fib6_init(): a single PF_INET6
 * RTM_GETROUTE entry whose dump callback is inet6_dump_fib.
 */
static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = {
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
         .dumpit = inet6_dump_fib,
         .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};

/*
 * Boot-time initialisation of the IPv6 FIB: create the fib6_node slab
 * cache, register the per-netns operations and the rtnetlink handlers,
 * and publish fib6_flush_trees through __fib6_flush_trees.
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the
 * error returned by the failing registration (earlier steps are undone).
 */
int __init fib6_init(void)
{
        int ret;

        fib6_node_kmem = KMEM_CACHE(fib6_node,
                                    SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
        if (!fib6_node_kmem)
                return -ENOMEM;

        ret = register_pernet_subsys(&fib6_net_ops);
        if (ret)
                goto err_destroy_cache;

        ret = rtnl_register_many(fib6_rtnl_msg_handlers);
        if (ret)
                goto err_unregister_pernet;

        __fib6_flush_trees = fib6_flush_trees;
        return 0;

err_unregister_pernet:
        unregister_pernet_subsys(&fib6_net_ops);
err_destroy_cache:
        kmem_cache_destroy(fib6_node_kmem);
        return ret;
}

/*
 * Undo fib6_init()'s pernet registration and destroy the fib6_node slab
 * cache.  The pernet subsystem is unregistered first, before the cache is
 * destroyed.
 */
void fib6_gc_cleanup(void)
{
        unregister_pernet_subsys(&fib6_net_ops);
        kmem_cache_destroy(fib6_node_kmem);
}

#ifdef CONFIG_PROC_FS
/*
 * Emit one text line for the route @v into @seq: destination address and
 * prefix length, source address and prefix length (all zeros without
 * CONFIG_IPV6_SUBTREES), gateway address (all zeros when the nexthop has
 * no gateway family), metric, reference count, a literal 0, flags and the
 * device name (empty if there is no device).
 *
 * If the route uses a nexthop object (rt->nh), the fib6_nh is taken from
 * it instead of the route's own fib6_nh.  The walker's current leaf is
 * cleared afterwards.  Always returns 0.
 */
static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
        struct ipv6_route_iter *iter = seq->private;
        struct fib6_info *rt = v;
        unsigned int flags = rt->fib6_flags;
        const struct net_device *dev;
        struct fib6_nh *nh;

        nh = rt->nh ? nexthop_fib6_nh(rt->nh) : rt->fib6_nh;

        seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);

#ifdef CONFIG_IPV6_SUBTREES
        seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
#else
        seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
        if (!nh->fib_nh_gw_family) {
                seq_puts(seq, "00000000000000000000000000000000");
        } else {
                /* A gateway is present: report it and flag the route. */
                flags |= RTF_GATEWAY;
                seq_printf(seq, "%pi6", &nh->fib_nh_gw6);
        }

        dev = nh->fib_nh_dev;
        seq_printf(seq, " %08x %08x %08x %08x %8s\n",
                   rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
                   flags, dev ? dev->name : "");

        iter->w.leaf = NULL;
        return 0;
}

/*
 * Walker callback installed as w->func by ipv6_route_seq_setup_walk().
 *
 * With no entries left to skip, returns 1 immediately.  Otherwise steps
 * along the current leaf's fib6_next chain, decrementing iter->skip per
 * step: returns 1 once skip reaches zero with a leaf still available, or
 * 0 if the chain runs out first.
 */
static int ipv6_route_yield(struct fib6_walker *w)
{
        struct ipv6_route_iter *iter = w->args;

        if (iter->skip == 0)
                return 1;

        for (;;) {
                iter->w.leaf = rcu_dereference_protected(
                                iter->w.leaf->fib6_next,
                                lockdep_is_held(&iter->tbl->tb6_lock));
                iter->skip--;

                /* Ran off the end of this node's sibling chain. */
                if (!iter->w.leaf)
                        return 0;
                /* Skipped enough: the current leaf is the one to show. */
                if (!iter->skip)
                        return 1;
        }
}

static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
                                      struct net *net)
{
        memset(&iter->w, 0, sizeof(iter->w));
        iter->w.func = ipv6_route_yield;
        iter->w.root = &iter->tbl->tb6_root;
        iter->w.state = FWS_INIT;
        iter->w.node = iter->w.root;
        iter->w.args = iter;
        iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
        INIT_LIST_HEAD(&iter->w.lh);
        fib6_walker_link(net, &iter->w);
}

/*
 * Return the table that follows @tbl in @net's table hash, or the first
 * table when @tbl is NULL.  The search continues along @tbl's own hash
 * chain first and then through the subsequent buckets.  Returns NULL when
 * no further table exists.
 */
static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
                                                    struct net *net)
{
        struct hlist_node *node = NULL;
        unsigned int h = 0;

        if (tbl) {
                /* Resume after @tbl: rest of its chain, then next bucket. */
                node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
                h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
        }

        for (; !node && h < FIB6_TABLE_HASHSZ; h++) {
                node = rcu_dereference(
                        hlist_first_rcu(&net->ipv6.fib_table_hash[h]));
        }

        return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
}

static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
{
        int sernum = READ_ONCE(iter->w.root->fn_sernum);

        if (iter->sernum != sernum) {
                iter->sernum = sernum;
                iter->w.state = FWS_INIT;
                iter->w.node = iter->w.root;
                WARN_ON(iter->w.skip);
                iter->w.skip = iter->w.count;
        }
}

/*
 * seq_file ->next: advance *pos and return the next route to show.
 *
 * If @v is a route with a sibling on its fib6_next chain, that sibling is
 * returned directly.  Otherwise the table walker is resumed under the
 * table's tb6_lock; when the current table is finished the walker is
 * unlinked and the walk restarts on the next table.  Returns NULL when the
 * walker reports an error or no tables remain.
 */
static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int r;
        struct fib6_info *n;
        struct net *net = seq_file_net(seq);
        struct ipv6_route_iter *iter = seq->private;

        ++(*pos);
        /* No current route (called from ->start): go straight to the walker. */
        if (!v)
                goto iter_table;

        n = rcu_dereference(((struct fib6_info *)v)->fib6_next);
        if (n)
                return n;

iter_table:
        /* Restart the walk if the tree's serial number has changed. */
        ipv6_route_check_sernum(iter);
        spin_lock_bh(&iter->tbl->tb6_lock);
        r = fib6_walk_continue(&iter->w);
        spin_unlock_bh(&iter->tbl->tb6_lock);
        if (r > 0) {
                /* The callback yielded: w.leaf is the route to show. */
                return iter->w.leaf;
        } else if (r < 0) {
                fib6_walker_unlink(net, &iter->w);
                return NULL;
        }
        /* r == 0: this table is exhausted; move on to the next one. */
        fib6_walker_unlink(net, &iter->w);

        iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
        if (!iter->tbl)
                return NULL;

        ipv6_route_seq_setup_walk(iter, net);
        goto iter_table;
}

/*
 * seq_file ->start: take the RCU read lock, select the first table and
 * ask the walker to skip *pos entries before yielding.  Returns the first
 * route to show, or NULL if there is no table.  The RCU read lock stays
 * held on return in either case.
 */
static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);
        /* Scratch position: ->next increments it, the caller's *pos is kept. */
        loff_t scratch = 0;

        rcu_read_lock();
        iter->tbl = ipv6_route_seq_next_table(NULL, net);
        iter->skip = *pos;

        if (!iter->tbl)
                return NULL;

        ipv6_route_seq_setup_walk(iter, net);
        return ipv6_route_seq_next(seq, NULL, &scratch);
}

static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
{
        struct fib6_walker *w = &iter->w;
        return w->node && !(w->state == FWS_U && w->node == w->root);
}

/*
 * seq_file ->stop helper: unlink the walker if it is still active and
 * drop the RCU read lock taken by ipv6_route_seq_start().
 */
static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        struct ipv6_route_iter *iter = seq->private;
        struct net *net = seq_file_net(seq);

        if (ipv6_route_iter_active(iter)) {
                fib6_walker_unlink(net, &iter->w);
        }

        rcu_read_unlock();
}

#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
/*
 * Run the BPF iterator program @prog for one element: build the
 * bpf_iter__ipv6_route context from @meta and the route @v (which may be
 * NULL, see ipv6_route_seq_stop()) and return the program's result.
 */
static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
                                    struct bpf_iter_meta *meta,
                                    void *v)
{
        struct bpf_iter__ipv6_route ctx;

        ctx.meta = meta;
        ctx.rt = v;
        return bpf_iter_run_prog(prog, &ctx);
}

/*
 * seq_file ->show (BPF-capable build): if a BPF iterator program is
 * attached, run it on @v and clear the walker's current leaf; otherwise
 * fall back to the native text output.
 */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
        struct ipv6_route_iter *iter = seq->private;
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;
        int ret;

        meta.seq = seq;
        prog = bpf_iter_get_info(&meta, false);
        if (prog) {
                ret = ipv6_route_prog_seq_show(prog, &meta, v);
                iter->w.leaf = NULL;
                return ret;
        }

        return ipv6_route_native_seq_show(seq, v);
}

static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
        struct bpf_iter_meta meta;
        struct bpf_prog *prog;

        if (!v) {
                meta.seq = seq;
                prog = bpf_iter_get_info(&meta, true);
                if (prog)
                        (void)ipv6_route_prog_seq_show(prog, &meta, v);
        }

        ipv6_route_native_seq_stop(seq, v);
}
#else
/* seq_file ->show (build without BPF iterator support): native output only. */
static int ipv6_route_seq_show(struct seq_file *seq, void *v)
{
        return ipv6_route_native_seq_show(seq, v);
}

/* seq_file ->stop (build without BPF iterator support): native stop only. */
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
{
        ipv6_route_native_seq_stop(seq, v);
}
#endif

/*
 * seq_file operations for iterating over IPv6 routes.  ->show and ->stop
 * resolve to the BPF-capable or the plain variants depending on the build
 * configuration selected above.
 */
const struct seq_operations ipv6_route_seq_ops = {
        .start        = ipv6_route_seq_start,
        .next        = ipv6_route_seq_next,
        .stop        = ipv6_route_seq_stop,
        .show        = ipv6_route_seq_show
};
#endif /* CONFIG_PROC_FS */
























































































































































   49 








   49 



    1 
























































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Directory notifications for Linux.
 *
 * Copyright (C) 2000,2001,2002 Stephen Rothwell
 *
 * Copyright (C) 2009 Eric Paris <Red Hat Inc>
 * dnotify was largely rewritten to use the new fsnotify infrastructure
 */
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/dnotify.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/fsnotify_backend.h>

/* Runtime switch: when zero, fcntl_dirnotify() fails with -EINVAL. */
static int dir_notify_enable __read_mostly = 1;
#ifdef CONFIG_SYSCTL
/* Exposes dir_notify_enable as the integer sysctl "dir-notify-enable". */
static const struct ctl_table dnotify_sysctls[] = {
        {
                .procname        = "dir-notify-enable",
                .data                = &dir_notify_enable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
};
/* Register the dnotify sysctl table under the "fs" directory. */
static void __init dnotify_sysctl_init(void)
{
        register_sysctl_init("fs", dnotify_sysctls);
}
#else
/* Without CONFIG_SYSCTL there is nothing to register. */
#define dnotify_sysctl_init() do { } while (0)
#endif

/* Slab caches and the single fsnotify group, all set up once in dnotify_init(). */
static struct kmem_cache *dnotify_struct_cache __ro_after_init; /* struct dnotify_struct */
static struct kmem_cache *dnotify_mark_cache __ro_after_init;   /* struct dnotify_mark */
static struct fsnotify_group *dnotify_group __ro_after_init;

/*
 * dnotify will attach one of these to each inode (i_fsnotify_marks) which
 * is being watched by dnotify.  If multiple userspace applications are watching
 * the same directory with dnotify their information is chained in dn
 */
struct dnotify_mark {
        /* embedded fsnotify mark; container_of() maps it back to this struct */
        struct fsnotify_mark fsn_mark;
        /* head of the singly linked list of watchers, chained via dn_next */
        struct dnotify_struct *dn;
};

/*
 * The set of events dnotify cares about for an inode may change whenever a
 * process starts or stops watching it.  Recompute that set as the union of
 * the masks of every dnotify_struct chained on the mark (FS_DN_MULTISHOT is
 * excluded from the union) and, if it differs from the mark's current mask,
 * store it and ask fsnotify to recalculate the mask for the mark's
 * connector.
 *
 * The caller must hold fsn_mark->lock.
 */
static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *watch;
        __u32 wanted = 0;

        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        assert_spin_locked(&fsn_mark->lock);

        watch = dn_mark->dn;
        while (watch) {
                wanted |= watch->dn_mask & ~FS_DN_MULTISHOT;
                watch = watch->dn_next;
        }

        if (fsn_mark->mask != wanted) {
                fsn_mark->mask = wanted;
                fsnotify_recalc_mask(fsn_mark->connector);
        }
}

/*
 * Main fsnotify callback where events are delivered to dnotify.
 * Find the dnotify mark on the relevant inode, run the list of dnotify structs
 * on that mark and determine which of them have expressed interest in
 * receiving events of this type.  For each match, send SIGIO to the owner of
 * the watching file and destroy the dnotify struct if it was not registered
 * to receive multiple events (FS_DN_MULTISHOT).
 *
 * Always returns 0.
 */
static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
                                struct inode *inode, struct inode *dir,
                                const struct qstr *name, u32 cookie)
{
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *dn;
        struct dnotify_struct **prev;
        struct fown_struct *fown;
        /*
         * Every dn_mask carries FS_EVENT_ON_CHILD (see convert_arg()), so
         * strip it from the event to avoid matching on that bit alone.
         */
        __u32 test_mask = mask & ~FS_EVENT_ON_CHILD;

        /* not a dir, dnotify doesn't care */
        if (!dir && !(mask & FS_ISDIR))
                return 0;

        dn_mark = container_of(inode_mark, struct dnotify_mark, fsn_mark);

        spin_lock(&inode_mark->lock);
        /* prev points at the link to the current entry so it can be unlinked */
        prev = &dn_mark->dn;
        while ((dn = *prev) != NULL) {
                if ((dn->dn_mask & test_mask) == 0) {
                        prev = &dn->dn_next;
                        continue;
                }
                fown = file_f_owner(dn->dn_filp);
                send_sigio(fown, dn->dn_fd, POLL_MSG);
                if (dn->dn_mask & FS_DN_MULTISHOT)
                        prev = &dn->dn_next;
                else {
                        /* one-shot watch: unlink, free and shrink the mask */
                        *prev = dn->dn_next;
                        kmem_cache_free(dnotify_struct_cache, dn);
                        dnotify_recalc_inode_mask(inode_mark);
                }
        }

        spin_unlock(&inode_mark->lock);

        return 0;
}

static void dnotify_free_mark(struct fsnotify_mark *fsn_mark)
{
        struct dnotify_mark *dn_mark = container_of(fsn_mark,
                                                    struct dnotify_mark,
                                                    fsn_mark);

        BUG_ON(dn_mark->dn);

        kmem_cache_free(dnotify_mark_cache, dn_mark);
}

/* fsnotify callbacks for the dnotify group (passed to fsnotify_alloc_group()). */
static const struct fsnotify_ops dnotify_fsnotify_ops = {
        .handle_inode_event = dnotify_handle_event,
        .free_mark = dnotify_free_mark,
};

/*
 * Called every time a file is closed (and from fcntl_dirnotify() when a
 * watch is explicitly removed).  Looks first for a dnotify mark on the
 * inode.  If one is found, run all of the ->dn structures attached to that
 * mark looking for the one that belongs to this owner (@id) and file
 * (@filp), and remove that dnotify_struct.  If that was the last
 * dnotify_struct, also remove the fsnotify_mark.
 *
 * Does nothing if the file is not a directory or has no dnotify mark.
 */
void dnotify_flush(struct file *filp, fl_owner_t id)
{
        struct fsnotify_mark *fsn_mark;
        struct dnotify_mark *dn_mark;
        struct dnotify_struct *dn;
        struct dnotify_struct **prev;
        struct inode *inode;
        bool free = false;

        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode))
                return;

        /* the reference obtained here is dropped by the final put below */
        fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
        if (!fsn_mark)
                return;
        dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);

        fsnotify_group_lock(dnotify_group);

        spin_lock(&fsn_mark->lock);
        /* prev points at the link to the current entry so it can be unlinked */
        prev = &dn_mark->dn;
        while ((dn = *prev) != NULL) {
                if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
                        *prev = dn->dn_next;
                        kmem_cache_free(dnotify_struct_cache, dn);
                        dnotify_recalc_inode_mask(fsn_mark);
                        break;
                }
                prev = &dn->dn_next;
        }

        spin_unlock(&fsn_mark->lock);

        /* nothing else could have found us thanks to the dnotify_groups
           mark_mutex */
        if (dn_mark->dn == NULL) {
                fsnotify_detach_mark(fsn_mark);
                free = true;
        }

        fsnotify_group_unlock(dnotify_group);

        /* freeing is deferred until the group lock has been dropped */
        if (free)
                fsnotify_free_mark(fsn_mark);
        fsnotify_put_mark(fsn_mark);
}

/*
 * Translate the userspace DN_* bits in @arg into the internal FS_* event
 * mask.  FS_EVENT_ON_CHILD is always set in the result.  This conversion is
 * done only at watch creation.
 */
static __u32 convert_arg(unsigned int arg)
{
        __u32 new_mask = FS_EVENT_ON_CHILD;

        new_mask |= (arg & DN_MULTISHOT) ? FS_DN_MULTISHOT : 0;
        new_mask |= (arg & DN_DELETE) ? (FS_DELETE | FS_MOVED_FROM) : 0;
        new_mask |= (arg & DN_MODIFY) ? FS_MODIFY : 0;
        new_mask |= (arg & DN_ACCESS) ? FS_ACCESS : 0;
        new_mask |= (arg & DN_ATTRIB) ? FS_ATTRIB : 0;
        new_mask |= (arg & DN_RENAME) ? FS_RENAME : 0;
        new_mask |= (arg & DN_CREATE) ? (FS_CREATE | FS_MOVED_TO) : 0;

        return new_mask;
}

/*
 * If multiple processes watch the same inode with dnotify there is only one
 * dnotify mark in inode->i_fsnotify_marks, with a dnotify_struct chained
 * onto that mark per watcher.  This function either ORs @mask into (and
 * updates the fd of) an existing dnotify_struct for the same owner and
 * file, returning -EEXIST, or initialises @dn and pushes it onto the front
 * of the list, returning 0.
 */
static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark,
                     fl_owner_t id, int fd, struct file *filp, __u32 mask)
{
        struct dnotify_struct *cur;

        for (cur = dn_mark->dn; cur; cur = cur->dn_next) {
                if (cur->dn_owner != id || cur->dn_filp != filp)
                        continue;

                /* adding more events to an existing dnotify_struct */
                cur->dn_fd = fd;
                cur->dn_mask |= mask;
                return -EEXIST;
        }

        /* no existing watcher matched: link the new one at the head */
        dn->dn_mask = mask;
        dn->dn_fd = fd;
        dn->dn_filp = filp;
        dn->dn_owner = id;
        dn->dn_next = dn_mark->dn;
        dn_mark->dn = dn;

        return 0;
}

/*
 * When a process calls fcntl to attach a dnotify watch to a directory it ends
 * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
 * attached to the fsnotify_mark.
 *
 * @fd:   descriptor the watch is being set on.
 * @filp: the file @fd referred to when the call was made.
 * @arg:  DN_* flags; no event bits (ignoring DN_MULTISHOT) removes the watch.
 *
 * Returns 0 on success (including when the fd was closed concurrently and
 * the request is silently dropped), -EINVAL if dnotify is disabled,
 * -ENOTDIR if @filp is not a directory, -ENOMEM on allocation failure, or
 * the error returned by security_path_notify(), file_f_owner_allocate() or
 * fsnotify_add_inode_mark_locked().
 */
int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
{
        struct dnotify_mark *new_dn_mark, *dn_mark;
        struct fsnotify_mark *new_fsn_mark, *fsn_mark;
        struct dnotify_struct *dn;
        struct inode *inode;
        fl_owner_t id = current->files;
        struct file *f = NULL;
        int destroy = 0, error = 0;
        __u32 mask;

        /* we use these to tell if we need to kfree */
        new_fsn_mark = NULL;
        dn = NULL;

        if (!dir_notify_enable) {
                error = -EINVAL;
                goto out_err;
        }

        /* a 0 mask means we are explicitly removing the watch */
        if ((arg & ~DN_MULTISHOT) == 0) {
                dnotify_flush(filp, id);
                error = 0;
                goto out_err;
        }

        /* dnotify only works on directories */
        inode = file_inode(filp);
        if (!S_ISDIR(inode->i_mode)) {
                error = -ENOTDIR;
                goto out_err;
        }

        /*
         * convert the userspace DN_* "arg" to the internal FS_*
         * defined in fsnotify
         */
        mask = convert_arg(arg);

        error = security_path_notify(&filp->f_path, mask,
                        FSNOTIFY_OBJ_TYPE_INODE);
        if (error)
                goto out_err;

        /* expect most fcntl to add new rather than augment old */
        dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
        if (!dn) {
                error = -ENOMEM;
                goto out_err;
        }

        /*
         * Allocate the file's f_owner before the mark.  Until
         * fsnotify_init_mark() has run, out_err cannot release the mark
         * (new_fsn_mark is still NULL), so a failure here must happen while
         * only "dn" is outstanding or the mark would be leaked.
         */
        error = file_f_owner_allocate(filp);
        if (error)
                goto out_err;

        /* new fsnotify mark, we expect most fcntl calls to add a new mark */
        new_dn_mark = kmem_cache_alloc(dnotify_mark_cache, GFP_KERNEL);
        if (!new_dn_mark) {
                error = -ENOMEM;
                goto out_err;
        }

        /* set up the new_fsn_mark and new_dn_mark */
        new_fsn_mark = &new_dn_mark->fsn_mark;
        fsnotify_init_mark(new_fsn_mark, dnotify_group);
        new_fsn_mark->mask = mask;
        new_dn_mark->dn = NULL;

        /* this is needed to prevent the fcntl/close race described below */
        fsnotify_group_lock(dnotify_group);

        /* add the new_fsn_mark or find an old one. */
        fsn_mark = fsnotify_find_inode_mark(inode, dnotify_group);
        if (fsn_mark) {
                dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
                spin_lock(&fsn_mark->lock);
        } else {
                error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
                if (error) {
                        fsnotify_group_unlock(dnotify_group);
                        goto out_err;
                }
                spin_lock(&new_fsn_mark->lock);
                fsn_mark = new_fsn_mark;
                dn_mark = new_dn_mark;
                /* we used new_fsn_mark, so don't free it */
                new_fsn_mark = NULL;
        }

        f = fget_raw(fd);

        /* if (f != filp) means that we lost a race and another task/thread
         * actually closed the fd we are still playing with before we grabbed
         * the dnotify_groups mark_mutex and fsn_mark->lock.  Since closing the
         * fd is the only time we clean up the marks we need to get our mark
         * off the list. */
        if (f != filp) {
                /* if we added ourselves, shoot ourselves, it's possible that
                 * the flush actually did shoot this fsn_mark.  That's fine too
                 * since multiple calls to destroy_mark is perfectly safe, if
                 * we found a dn_mark already attached to the inode, just sod
                 * off silently as the flush at close time dealt with it.
                 */
                if (dn_mark == new_dn_mark)
                        destroy = 1;
                error = 0;
                goto out;
        }

        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);

        error = attach_dn(dn, dn_mark, id, fd, filp, mask);
        /* !error means that we attached the dn to the dn_mark, so don't free it */
        if (!error)
                dn = NULL;
        /* -EEXIST means that we didn't add this new dn and used an old one.
         * that isn't an error (and the unused dn should be freed) */
        else if (error == -EEXIST)
                error = 0;

        dnotify_recalc_inode_mask(fsn_mark);
out:
        spin_unlock(&fsn_mark->lock);

        if (destroy)
                fsnotify_detach_mark(fsn_mark);
        fsnotify_group_unlock(dnotify_group);
        if (destroy)
                fsnotify_free_mark(fsn_mark);
        fsnotify_put_mark(fsn_mark);
out_err:
        /* drop whatever was allocated but not consumed above */
        if (new_fsn_mark)
                fsnotify_put_mark(new_fsn_mark);
        if (dn)
                kmem_cache_free(dnotify_struct_cache, dn);
        if (f)
                fput(f);
        return error;
}

/*
 * Boot-time setup: create the two slab caches (SLAB_PANIC, so creation
 * failure is fatal), allocate the single dnotify fsnotify group (panicking
 * if that fails) and register the sysctl.  Always returns 0.
 */
static int __init dnotify_init(void)
{
        dnotify_struct_cache = KMEM_CACHE(dnotify_struct,
                                          SLAB_PANIC|SLAB_ACCOUNT);
        dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);

        dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops, 0);
        if (IS_ERR(dnotify_group))
                panic("unable to allocate fsnotify group for dnotify\n");
        dnotify_sysctl_init();
        return 0;
}

module_init(dnotify_init)
















































































































































































































































































    8 







    8 
















































    8 




















    8 































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MIN_HEAP_H
#define _LINUX_MIN_HEAP_H

#include <linux/bug.h>
#include <linux/string.h>
#include <linux/types.h>

/*
 * The Min Heap API provides utilities for managing min-heaps, a binary tree
 * structure where each node's value is less than or equal to its children's
 * values, ensuring the smallest element is at the root.
 *
 * Users should avoid directly calling functions prefixed with __min_heap_*().
 * Instead, use the provided macro wrappers.
 *
 * For further details and examples, refer to Documentation/core-api/min_heap.rst.
 */

/**
 * MIN_HEAP_PREALLOCATED - declare a struct type that holds a min-heap.
 * @_type: Type of the heap elements.
 * @_name: Tag of the struct being declared.
 * @_nr: Number of elements in the embedded preallocated array.
 *
 * Fields of the resulting struct:
 * @nr: Number of elements currently in the heap.
 * @size: Maximum number of elements that can be held in current storage.
 * @data: Pointer to the start of array holding the heap elements.
 * @preallocated: Start of the static preallocated array holding the heap elements.
 */
#define MIN_HEAP_PREALLOCATED(_type, _name, _nr)        \
struct _name {        \
        size_t nr;        \
        size_t size;        \
        _type *data;        \
        _type preallocated[_nr];        \
}

/* Declare a min-heap struct with no preallocated elements. */
#define DEFINE_MIN_HEAP(_type, _name) MIN_HEAP_PREALLOCATED(_type, _name, 0)

/*
 * Heap of plain bytes.
 * NOTE(review): presumably the type-erased view used by the __min_heap_*()
 * helpers; confirm against their definitions.
 */
typedef DEFINE_MIN_HEAP(char, min_heap_char) min_heap_char;

/* Cast prefix: "(pointer to the element type of _heap)". */
#define __minheap_cast(_heap)                (typeof((_heap)->data[0]) *)
/* Size in bytes of one element of _heap. */
#define __minheap_obj_size(_heap)        sizeof((_heap)->data[0])

/**
 * struct min_heap_callbacks - Data/functions to customise the min_heap.
 * @less: Partial order function for this heap; takes two element pointers
 *        and an opaque @args pointer.
 * @swp: Swap elements function; takes two element pointers and an opaque
 *       @args pointer.
 */
struct min_heap_callbacks {
        bool (*less)(const void *lhs, const void *rhs, void *args);
        void (*swp)(void *lhs, void *rhs, void *args);
};

/**
 * is_aligned - is this pointer & size okay for word-wide copying?
 * @base: pointer to data
 * @size: size of each element
 * @align: required alignment (typically 4 or 8)
 *
 * Returns true if elements can be copied using word loads and stores.
 * The size must be a multiple of the alignment; the base address must be
 * too unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is set, in which case
 * @base is ignored.
 *
 * Only the low byte of @size and @base is examined.  The two are OR-ed
 * together and tested once because gcc does not turn
 * "if (a & mask || b & mask)" into "if ((a | b) & mask)" by itself.
 */
__attribute_const__ __always_inline
static bool is_aligned(const void *base, size_t size, unsigned char align)
{
        unsigned char low_bits = (unsigned char)size;

        (void)base;
#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
        low_bits |= (unsigned char)(uintptr_t)base;
#endif
        return !(low_bits & (align - 1));
}

/**
 * swap_words_32 - exchange two elements, 32 bits at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 4)
 *
 * Walks both objects from the highest offset down to offset 0, swapping one
 * 32-bit word per iteration.  Accesses are written as base + index so the
 * loop needs only the single counter @n.
 *
 * The body runs before @n is tested, so @n must be non-zero on entry.
 *
 * Historical note: x86 gcc 7.3.0 emits a redundant test of n at the bottom
 * of the loop even though the zero flag from the subtract is still valid;
 * gcc 8.1.0 does not.
 */
static __always_inline
void swap_words_32(void *a, void *b, size_t n)
{
        do {
                u32 tmp;

                n -= 4;
                tmp = *(u32 *)(a + n);
                *(u32 *)(a + n) = *(u32 *)(b + n);
                *(u32 *)(b + n) = tmp;
        } while (n);
}

/**
 * swap_words_64 - exchange two elements, 64 bits at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size (must be a multiple of 8)
 *
 * Walks both objects from the highest offset down to offset 0 using
 * base + index addressing, so the loop needs only the counter @n.
 *
 * With CONFIG_64BIT each iteration swaps one 64-bit word.  Otherwise each
 * iteration swaps two consecutive 32-bit words, because emulating a 64-bit
 * load would need base+index+4 addressing, which x86 has but most other
 * processors do not.  (64-bit loads may exist without 64-bit pointers, e.g.
 * the x32 ABI; whether the kernel needs to care is an open question.)
 *
 * The body runs before @n is tested, so @n must be non-zero on entry.
 */
static __always_inline
void swap_words_64(void *a, void *b, size_t n)
{
        do {
#ifdef CONFIG_64BIT
                u64 tmp;

                n -= 8;
                tmp = *(u64 *)(a + n);
                *(u64 *)(a + n) = *(u64 *)(b + n);
                *(u64 *)(b + n) = tmp;
#else
                /* Use two 32-bit transfers to avoid base+index+4 addressing */
                u32 tmp;

                n -= 4;
                tmp = *(u32 *)(a + n);
                *(u32 *)(a + n) = *(u32 *)(b + n);
                *(u32 *)(b + n) = tmp;

                n -= 4;
                tmp = *(u32 *)(a + n);
                *(u32 *)(a + n) = *(u32 *)(b + n);
                *(u32 *)(b + n) = tmp;
#endif
        } while (n);
}

/**
 * swap_bytes - exchange two elements one byte at a time
 * @a: pointer to the first element to swap
 * @b: pointer to the second element to swap
 * @n: element size
 *
 * Fallback used when size or alignment rules out the word-wide variants.
 * Bytes are exchanged from offset @n - 1 down to offset 0; the body runs
 * before @n is tested, so @n must be non-zero on entry.
 */
static __always_inline
void swap_bytes(void *a, void *b, size_t n)
{
        char *lhs = a;
        char *rhs = b;

        do {
                char tmp;

                n--;
                tmp = lhs[n];
                lhs[n] = rhs[n];
                rhs[n] = tmp;
        } while (n);
}

/*
 * Sentinel "function pointers" naming the built-in swap routines.  They are
 * returned by select_swap_func() and recognised by do_swap(), which calls
 * swap_words_64(), swap_words_32() or swap_bytes() instead of jumping
 * through the pointer.
 *
 * The values are arbitrary as long as they can't be confused with
 * a pointer, but small integers make for the smallest compare
 * instructions.
 */
#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
#define SWAP_BYTES    ((void (*)(void *, void *, void *))2)

/*
 * Pick the widest built-in swap that the element size (and, where it
 * matters, the alignment of @base) allows: 64-bit words first, then 32-bit
 * words, otherwise single bytes.  The result is one of the SWAP_* sentinels.
 */
static __always_inline
void *select_swap_func(const void *base, size_t size)
{
        if (is_aligned(base, size, 8))
                return SWAP_WORDS_64;

        if (is_aligned(base, size, 4))
                return SWAP_WORDS_32;

        return SWAP_BYTES;
}

/*
 * Exchange the two elements at @a and @b, each @size bytes long.
 *
 * If @swap_func is one of the SWAP_* sentinels the matching built-in routine
 * is called directly with @size; any other value is treated as a real
 * callback and invoked as swap_func(a, b, priv).
 */
static __always_inline
void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
             void *priv)
{
        if (swap_func == SWAP_WORDS_64) {
                swap_words_64(a, b, size);
                return;
        }

        if (swap_func == SWAP_WORDS_32) {
                swap_words_32(a, b, size);
                return;
        }

        if (swap_func == SWAP_BYTES) {
                swap_bytes(a, b, size);
                return;
        }

        swap_func(a, b, priv);
}

/**
 * parent - map the byte offset of a child to the byte offset of its parent.
 * @i: the offset of the heap element whose parent is sought.  Non-zero.
 * @lsbit: a precomputed 1-bit mask, equal to "size & -size"
 * @size: size of each element
 *
 * For array index j = @i/@size the parent index is (j-1)/2, but with byte
 * offsets the truncation of the integer divide cannot be relied on.
 *
 * Only one bit of the quotient is needed.  After subtracting @size once,
 * the bit of the offset selected by @lsbit is clear for an even multiple of
 * @size and set for an odd multiple; in the odd case @size is subtracted a
 * second time before halving.
 *
 * The conditional subtract is written as a mask operation rather than an
 * "if" because the branch would be unpredictable.
 */
__attribute_const__ __always_inline
static size_t parent(size_t i, unsigned int lsbit, size_t size)
{
        size_t below = i - size;
        size_t odd = below & lsbit;

        /* (0 - odd) has every bit from lsbit upwards set when odd is non-zero, else is 0 */
        below -= size & ((size_t)0 - odd);
        return below >> 1;
}

/*
 * Initialize a min-heap: mark it empty, record its capacity and bind its
 * element storage.
 *
 * @heap: heap to set up.
 * @data: caller-provided storage, or NULL to use heap->preallocated.
 * @size: capacity in elements, stored in heap->size.
 */
static __always_inline
void __min_heap_init_inline(min_heap_char *heap, void *data, size_t size)
{
        heap->data = data ? data : heap->preallocated;
        heap->size = size;
        heap->nr = 0;
}

/* Typed front end for __min_heap_init_inline(). */
#define min_heap_init_inline(_heap, _data, _size)        \
        __min_heap_init_inline(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)

/*
 * Get the minimum element from the heap without removing it.
 *
 * Returns a pointer to the first element of the storage (heap->data), or
 * NULL when the heap holds no elements.
 */
static __always_inline
void *__min_heap_peek_inline(struct min_heap_char *heap)
{
        if (!heap->nr)
                return NULL;

        return heap->data;
}

/* Typed front end: the result is cast to a pointer to the heap's element type. */
#define min_heap_peek_inline(_heap)        \
        (__minheap_cast(_heap)        \
         __min_heap_peek_inline(container_of(&(_heap)->nr, min_heap_char, nr)))

/* Check if the heap is full, i.e. the element count equals the capacity. */
static __always_inline
bool __min_heap_full_inline(min_heap_char *heap)
{
        const size_t used = heap->nr;
        const size_t capacity = heap->size;

        return used == capacity;
}

/* Typed front end for __min_heap_full_inline(). */
#define min_heap_full_inline(_heap)        \
        __min_heap_full_inline(container_of(&(_heap)->nr, min_heap_char, nr))

/*
 * Sift the element at pos down the heap.
 *
 * @heap: heap to operate on; heap->nr bounds the region that is examined.
 * @pos: index (in elements) of the element to sift down.
 * @elem_size: size of one element in bytes.
 * @func: callbacks; func->less orders elements, func->swp may be NULL to
 *        use a built-in swap chosen by select_swap_func().
 * @args: opaque pointer handed to the callbacks.
 *
 * All positions below (a, b, c, d, n) are byte offsets into heap->data,
 * not element indexes.
 */
static __always_inline
void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size,
                                 const struct min_heap_callbacks *func, void *args)
{
        /* lowest set bit of elem_size, needed by parent() */
        const unsigned long lsbit = elem_size & -elem_size;
        void *data = heap->data;
        void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
        /* pre-scale counters for performance */
        size_t a = pos * elem_size;
        size_t b, c, d;
        size_t n = heap->nr * elem_size;

        if (!swp)
                swp = select_swap_func(data, elem_size);

        /*
         * Find the sift-down path all the way to the leaves.
         * c and d are the two children of b; while both lie inside the
         * heap, descend into c if less(c, d) holds, otherwise into d.
         */
        for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
                b = func->less(data + c, data + d, args) ? c : d;

        /* Special case for the last leaf with no sibling. */
        if (d == n)
                b = c;

        /*
         * Backtrack to the correct location: climb back towards a for as
         * long as the element at a compares less than the element at b.
         */
        while (b != a && func->less(data + a, data + b, args))
                b = parent(b, lsbit, elem_size);

        /*
         * Shift the element into its correct place: c stays at the final
         * slot while b walks up to a, swapping each ancestor with slot c.
         */
        c = b;
        while (b != a) {
                b = parent(b, lsbit, elem_size);
                do_swap(data + b, data + c, elem_size, swp, args);
        }
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heap_sift_down_inline(_heap, _pos, _func, _args)        \
        __min_heap_sift_down_inline(container_of(&(_heap)->nr, min_heap_char, nr), _pos,        \
                                    __minheap_obj_size(_heap), _func, _args)

/*
 * Sift up ith element from the heap, O(log2(nr)).
 *
 * @heap: heap to operate on.
 * @elem_size: size of one element in bytes.
 * @idx: index (in elements) of the element to move towards the root.
 * @func: callbacks; func->swp may be NULL to use a built-in swap.
 * @args: opaque pointer handed to the callbacks.
 *
 * The element is swapped with its parent until it reaches offset 0 or
 * func->less(parent, element) holds.
 */
static __always_inline
void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
                               const struct min_heap_callbacks *func, void *args)
{
        const unsigned long lsbit = elem_size & -elem_size;
        void *data = heap->data;
        void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
        /* byte offsets of the element being moved and of its parent */
        size_t child = idx * elem_size;
        size_t up;

        if (!swp)
                swp = select_swap_func(data, elem_size);

        for (; child; child = up) {
                up = parent(child, lsbit, elem_size);
                if (func->less(data + up, data + child, args))
                        break;
                do_swap(data + child, data + up, elem_size, swp, args);
        }
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heap_sift_up_inline(_heap, _idx, _func, _args)        \
        __min_heap_sift_up_inline(container_of(&(_heap)->nr, min_heap_char, nr),        \
                                  __minheap_obj_size(_heap), _idx, _func, _args)

/*
 * Floyd's approach to heapification that is O(nr).
 *
 * Sifts down every element from index nr / 2 - 1 back to index 0.  The
 * counter is signed so that the loop terminates after index 0 and does not
 * run at all when nr / 2 is 0.
 */
static __always_inline
void __min_heapify_all_inline(min_heap_char *heap, size_t elem_size,
                              const struct min_heap_callbacks *func, void *args)
{
        ssize_t pos = heap->nr / 2 - 1;

        while (pos >= 0) {
                __min_heap_sift_down_inline(heap, pos, elem_size, func, args);
                pos--;
        }
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heapify_all_inline(_heap, _func, _args)        \
        __min_heapify_all_inline(container_of(&(_heap)->nr, min_heap_char, nr),        \
                                 __minheap_obj_size(_heap), _func, _args)

/*
 * Remove minimum element from the heap, O(log2(nr)).
 *
 * Returns false (after a one-time warning) if the heap is empty, true
 * otherwise.  The removed element is not handed back; use the peek helper
 * first if its value is needed.
 */
static __always_inline
bool __min_heap_pop_inline(min_heap_char *heap, size_t elem_size,
                           const struct min_heap_callbacks *func, void *args)
{
        void *base = heap->data;
        size_t last;

        if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
                return false;

        /* Shrink the heap, move the former last element to the root, re-sift. */
        last = --heap->nr;
        memcpy(base, base + (last * elem_size), elem_size);
        __min_heap_sift_down_inline(heap, 0, elem_size, func, args);

        return true;
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heap_pop_inline(_heap, _func, _args)        \
        __min_heap_pop_inline(container_of(&(_heap)->nr, min_heap_char, nr),        \
                              __minheap_obj_size(_heap), _func, _args)

/*
 * Remove the minimum element and then push the given element. The
 * implementation performs 1 sift (O(log2(nr))) and is therefore more
 * efficient than a pop followed by a push that does 2.
 *
 * The new element overwrites the root slot and is then sifted down;
 * heap->nr is left unchanged.
 */
static __always_inline
void __min_heap_pop_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
                                const struct min_heap_callbacks *func, void *args)
{
        void *root = heap->data;

        memcpy(root, element, elem_size);
        __min_heap_sift_down_inline(heap, 0, elem_size, func, args);
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heap_pop_push_inline(_heap, _element, _func, _args)        \
        __min_heap_pop_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element,        \
                                   __minheap_obj_size(_heap), _func, _args)

/*
 * Push an element on to the heap, O(log2(nr)).
 *
 * Returns false (after a one-time warning) if the heap already holds
 * heap->size elements, true once @element has been copied in and sifted up.
 */
static __always_inline
bool __min_heap_push_inline(min_heap_char *heap, const void *element, size_t elem_size,
                            const struct min_heap_callbacks *func, void *args)
{
        void *base = heap->data;
        size_t slot;

        if (WARN_ONCE(heap->nr >= heap->size, "Pushing on a full heap"))
                return false;

        /* Append at the first free slot, then account for it. */
        slot = heap->nr;
        memcpy(base + (slot * elem_size), element, elem_size);
        heap->nr = slot + 1;

        /* Restore heap order starting from the new element. */
        __min_heap_sift_up_inline(heap, elem_size, slot, func, args);

        return true;
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heap_push_inline(_heap, _element, _func, _args)        \
        __min_heap_push_inline(container_of(&(_heap)->nr, min_heap_char, nr), _element,        \
                               __minheap_obj_size(_heap), _func, _args)

/*
 * Remove ith element from the heap, O(log2(nr)).
 *
 * Returns false (after a one-time warning) if the heap is empty, true
 * otherwise.  @idx is not range-checked here.
 */
static __always_inline
bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
                           const struct min_heap_callbacks *func, void *args)
{
        void *base = heap->data;
        void (*swap_fn)(void *lhs, void *rhs, void *args) = func->swp;
        size_t last;

        if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
                return false;

        if (!swap_fn)
                swap_fn = select_swap_func(base, elem_size);

        /* Shrink the heap; deleting the last element needs nothing more. */
        last = --heap->nr;
        if (idx != last) {
                /*
                 * Exchange the victim with the former last element, then
                 * sift that element up and down from idx to restore order.
                 */
                do_swap(base + (idx * elem_size), base + (last * elem_size),
                        elem_size, swap_fn, args);
                __min_heap_sift_up_inline(heap, elem_size, idx, func, args);
                __min_heap_sift_down_inline(heap, idx, elem_size, func, args);
        }

        return true;
}

/* Typed front end: derives elem_size from the heap's element type. */
#define min_heap_del_inline(_heap, _idx, _func, _args)        \
        __min_heap_del_inline(container_of(&(_heap)->nr, min_heap_char, nr),        \
                              __minheap_obj_size(_heap), _idx, _func, _args)

/*
 * Out-of-line counterparts of the __min_heap_*_inline() helpers above, with
 * matching parameter lists.  Only the declarations live in this header.
 * NOTE(review): presumably defined in lib/min_heap.c as thin calls to the
 * _inline versions - confirm against that file.
 */
void __min_heap_init(min_heap_char *heap, void *data, size_t size);
void *__min_heap_peek(struct min_heap_char *heap);
bool __min_heap_full(min_heap_char *heap);
void __min_heap_sift_down(min_heap_char *heap, size_t pos, size_t elem_size,
                          const struct min_heap_callbacks *func, void *args);
void __min_heap_sift_up(min_heap_char *heap, size_t elem_size, size_t idx,
                        const struct min_heap_callbacks *func, void *args);
void __min_heapify_all(min_heap_char *heap, size_t elem_size,
                       const struct min_heap_callbacks *func, void *args);
bool __min_heap_pop(min_heap_char *heap, size_t elem_size,
                    const struct min_heap_callbacks *func, void *args);
void __min_heap_pop_push(min_heap_char *heap, const void *element, size_t elem_size,
                         const struct min_heap_callbacks *func, void *args);
bool __min_heap_push(min_heap_char *heap, const void *element, size_t elem_size,
                     const struct min_heap_callbacks *func, void *args);
bool __min_heap_del(min_heap_char *heap, size_t elem_size, size_t idx,
                    const struct min_heap_callbacks *func, void *args);

/*
 * Typed wrappers around the out-of-line __min_heap_*() functions.
 *
 * Each wrapper converts the caller's typed heap pointer to min_heap_char
 * with container_of() on the nr member and, where the callee needs it,
 * passes the element size via __minheap_obj_size().  min_heap_peek()
 * additionally casts the result back to a pointer to the element type.
 */
#define min_heap_init(_heap, _data, _size)        \
        __min_heap_init(container_of(&(_heap)->nr, min_heap_char, nr), _data, _size)
#define min_heap_peek(_heap)        \
        (__minheap_cast(_heap) __min_heap_peek(container_of(&(_heap)->nr, min_heap_char, nr)))
#define min_heap_full(_heap)        \
        __min_heap_full(container_of(&(_heap)->nr, min_heap_char, nr))
#define min_heap_sift_down(_heap, _pos, _func, _args)        \
        __min_heap_sift_down(container_of(&(_heap)->nr, min_heap_char, nr), _pos,        \
                             __minheap_obj_size(_heap), _func, _args)
#define min_heap_sift_up(_heap, _idx, _func, _args)        \
        __min_heap_sift_up(container_of(&(_heap)->nr, min_heap_char, nr),        \
                           __minheap_obj_size(_heap), _idx, _func, _args)
#define min_heapify_all(_heap, _func, _args)        \
        __min_heapify_all(container_of(&(_heap)->nr, min_heap_char, nr),        \
                          __minheap_obj_size(_heap), _func, _args)
#define min_heap_pop(_heap, _func, _args)        \
        __min_heap_pop(container_of(&(_heap)->nr, min_heap_char, nr),        \
                       __minheap_obj_size(_heap), _func, _args)
#define min_heap_pop_push(_heap, _element, _func, _args)        \
        __min_heap_pop_push(container_of(&(_heap)->nr, min_heap_char, nr), _element,        \
                            __minheap_obj_size(_heap), _func, _args)
#define min_heap_push(_heap, _element, _func, _args)        \
        __min_heap_push(container_of(&(_heap)->nr, min_heap_char, nr), _element,        \
                        __minheap_obj_size(_heap), _func, _args)
#define min_heap_del(_heap, _idx, _func, _args)        \
        __min_heap_del(container_of(&(_heap)->nr, min_heap_char, nr),        \
                       __minheap_obj_size(_heap), _idx, _func, _args)

#endif /* _LINUX_MIN_HEAP_H */



















    1 

   84 
































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Guest PC manipulation helpers
 *
 * Copyright (C) 2012,2013 - ARM Ltd
 * Copyright (C) 2020 - Google LLC
 * Author: Marc Zyngier <maz@kernel.org>
 */

#ifndef __ARM64_KVM_HYP_ADJUST_PC_H__
#define __ARM64_KVM_HYP_ADJUST_PC_H__

#include <asm/kvm_emulate.h>
#include <asm/kvm_host.h>

/*
 * Step the vcpu's saved PC past the current instruction.
 *
 * A vcpu that is not in 32-bit mode has its PC advanced by 4 and the
 * PSR_BTYPE_MASK bits cleared in its CPSR; a 32-bit vcpu is handled by
 * kvm_skip_instr32().  In both cases DBG_SPSR_SS is then cleared in the
 * CPSR.
 */
static inline void kvm_skip_instr(struct kvm_vcpu *vcpu)
{
        if (!vcpu_mode_is_32bit(vcpu)) {
                *vcpu_pc(vcpu) += 4;
                *vcpu_cpsr(vcpu) &= ~PSR_BTYPE_MASK;
        } else {
                kvm_skip_instr32(vcpu);
        }

        /* advance the singlestep state machine */
        *vcpu_cpsr(vcpu) &= ~DBG_SPSR_SS;
}

/*
 * Skip an instruction which has been emulated at hyp while most guest sysregs
 * are live.
 *
 * The vcpu's saved PC and pstate are first refreshed from SYS_ELR and
 * SYS_SPSR, kvm_skip_instr() then updates that saved copy, and finally the
 * updated pstate and PC are written back to SYS_SPSR and SYS_ELR.
 */
static inline void __kvm_skip_instr(struct kvm_vcpu *vcpu)
{
        /* pull the live values into the vcpu context */
        *vcpu_pc(vcpu) = read_sysreg_el2(SYS_ELR);
        vcpu_gp_regs(vcpu)->pstate = read_sysreg_el2(SYS_SPSR);

        kvm_skip_instr(vcpu);

        /* push the adjusted values back to the system registers */
        write_sysreg_el2(vcpu_gp_regs(vcpu)->pstate, SYS_SPSR);
        write_sysreg_el2(*vcpu_pc(vcpu), SYS_ELR);
}

/*
 * Skip an instruction while host sysregs are live.
 * Assumes host is always 64-bit.
 *
 * Reads SYS_ELR, adds 4 and writes the result back; no other state is
 * touched.
 */
static inline void kvm_skip_host_instr(void)
{
        write_sysreg_el2(read_sysreg_el2(SYS_ELR) + 4, SYS_ELR);
}

#endif




























 1481 

  353 
















   22 

  566 

  755 
  740 
  708 










  251 
  681 

  790 

  168 















 1259 
  657 
  275 
  209 


 1500 
 1496 










































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/atomic.h
 *
 * Copyright (C) 1996 Russell King.
 * Copyright (C) 2002 Deep Blue Solutions Ltd.
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_ATOMIC_H
#define __ASM_ATOMIC_H

#include <linux/compiler.h>
#include <linux/types.h>

#include <asm/barrier.h>
#include <asm/cmpxchg.h>
#include <asm/lse.h>

/*
 * ATOMIC_OP(op) - generate "void arch_<op>(int i, atomic_t *v)".
 *
 * The generated body forwards to __lse_ll_sc_body(op, i, v) and returns
 * nothing.
 * NOTE(review): __lse_ll_sc_body() comes from <asm/lse.h>, not visible
 * here; presumably it selects between the LSE and LL/SC implementations of
 * <op> - confirm there.
 */
#define ATOMIC_OP(op)                                                        \
static __always_inline void arch_##op(int i, atomic_t *v)                \
{                                                                        \
        __lse_ll_sc_body(op, i, v);                                        \
}

ATOMIC_OP(atomic_andnot)
ATOMIC_OP(atomic_or)
ATOMIC_OP(atomic_xor)
ATOMIC_OP(atomic_add)
ATOMIC_OP(atomic_and)
ATOMIC_OP(atomic_sub)

#undef ATOMIC_OP

/*
 * ATOMIC_FETCH_OP(name, op) - generate "int arch_<op><name>(int i, atomic_t *v)",
 * which returns the value of __lse_ll_sc_body(op##name, i, v).
 */
#define ATOMIC_FETCH_OP(name, op)                                        \
static __always_inline int arch_##op##name(int i, atomic_t *v)                \
{                                                                        \
        return __lse_ll_sc_body(op##name, i, v);                        \
}

/*
 * ATOMIC_FETCH_OPS(op) - emit the four variants of a value-returning op:
 * <op>_relaxed, <op>_acquire, <op>_release and the unsuffixed <op>.
 */
#define ATOMIC_FETCH_OPS(op)                                                \
        ATOMIC_FETCH_OP(_relaxed, op)                                        \
        ATOMIC_FETCH_OP(_acquire, op)                                        \
        ATOMIC_FETCH_OP(_release, op)                                        \
        ATOMIC_FETCH_OP(        , op)

/* Besides the fetch_* ops, the *_return ops are generated here as well. */
ATOMIC_FETCH_OPS(atomic_fetch_andnot)
ATOMIC_FETCH_OPS(atomic_fetch_or)
ATOMIC_FETCH_OPS(atomic_fetch_xor)
ATOMIC_FETCH_OPS(atomic_fetch_add)
ATOMIC_FETCH_OPS(atomic_fetch_and)
ATOMIC_FETCH_OPS(atomic_fetch_sub)
ATOMIC_FETCH_OPS(atomic_add_return)
ATOMIC_FETCH_OPS(atomic_sub_return)

#undef ATOMIC_FETCH_OP
#undef ATOMIC_FETCH_OPS

/*
 * ATOMIC64_OP(op) - generate "void arch_<op>(long i, atomic64_t *v)".
 *
 * 64-bit twin of ATOMIC_OP(): the generated body forwards to
 * __lse_ll_sc_body(op, i, v) and returns nothing.
 */
#define ATOMIC64_OP(op)                                                        \
static __always_inline void arch_##op(long i, atomic64_t *v)                \
{                                                                        \
        __lse_ll_sc_body(op, i, v);                                        \
}

ATOMIC64_OP(atomic64_andnot)
ATOMIC64_OP(atomic64_or)
ATOMIC64_OP(atomic64_xor)
ATOMIC64_OP(atomic64_add)
ATOMIC64_OP(atomic64_and)
ATOMIC64_OP(atomic64_sub)

#undef ATOMIC64_OP

/*
 * ATOMIC64_FETCH_OP(name, op) - generate
 * "long arch_<op><name>(long i, atomic64_t *v)", which returns the value of
 * __lse_ll_sc_body(op##name, i, v).
 */
#define ATOMIC64_FETCH_OP(name, op)                                        \
static __always_inline long arch_##op##name(long i, atomic64_t *v)        \
{                                                                        \
        return __lse_ll_sc_body(op##name, i, v);                        \
}

/*
 * ATOMIC64_FETCH_OPS(op) - emit the four variants of a value-returning op:
 * <op>_relaxed, <op>_acquire, <op>_release and the unsuffixed <op>.
 */
#define ATOMIC64_FETCH_OPS(op)                                                \
        ATOMIC64_FETCH_OP(_relaxed, op)                                        \
        ATOMIC64_FETCH_OP(_acquire, op)                                        \
        ATOMIC64_FETCH_OP(_release, op)                                        \
        ATOMIC64_FETCH_OP(        , op)

/* Besides the fetch_* ops, the *_return ops are generated here as well. */
ATOMIC64_FETCH_OPS(atomic64_fetch_andnot)
ATOMIC64_FETCH_OPS(atomic64_fetch_or)
ATOMIC64_FETCH_OPS(atomic64_fetch_xor)
ATOMIC64_FETCH_OPS(atomic64_fetch_add)
ATOMIC64_FETCH_OPS(atomic64_fetch_and)
ATOMIC64_FETCH_OPS(atomic64_fetch_sub)
ATOMIC64_FETCH_OPS(atomic64_add_return)
ATOMIC64_FETCH_OPS(atomic64_sub_return)

#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_FETCH_OPS

/*
 * Forward to the atomic64_dec_if_positive implementation selected by
 * __lse_ll_sc_body() and hand its result back to the caller.
 */
static __always_inline long arch_atomic64_dec_if_positive(atomic64_t *v)
{
        long ret = __lse_ll_sc_body(atomic64_dec_if_positive, v);

        return ret;
}

/* Plain accessors for the counter member, built on __READ_ONCE()/__WRITE_ONCE(). */
#define arch_atomic_read(v)                        __READ_ONCE((v)->counter)
#define arch_atomic_set(v, i)                        __WRITE_ONCE(((v)->counter), (i))

/*
 * Each name below is defined to itself, so it is visible to the
 * preprocessor as well as being a function.
 * NOTE(review): presumably this lets the generic atomic headers detect with
 * #ifdef which operations and ordering variants this architecture provides -
 * confirm against the generic fallback headers.
 */
#define arch_atomic_add_return_relaxed                arch_atomic_add_return_relaxed
#define arch_atomic_add_return_acquire                arch_atomic_add_return_acquire
#define arch_atomic_add_return_release                arch_atomic_add_return_release
#define arch_atomic_add_return                        arch_atomic_add_return

#define arch_atomic_sub_return_relaxed                arch_atomic_sub_return_relaxed
#define arch_atomic_sub_return_acquire                arch_atomic_sub_return_acquire
#define arch_atomic_sub_return_release                arch_atomic_sub_return_release
#define arch_atomic_sub_return                        arch_atomic_sub_return

#define arch_atomic_fetch_add_relaxed                arch_atomic_fetch_add_relaxed
#define arch_atomic_fetch_add_acquire                arch_atomic_fetch_add_acquire
#define arch_atomic_fetch_add_release                arch_atomic_fetch_add_release
#define arch_atomic_fetch_add                        arch_atomic_fetch_add

#define arch_atomic_fetch_sub_relaxed                arch_atomic_fetch_sub_relaxed
#define arch_atomic_fetch_sub_acquire                arch_atomic_fetch_sub_acquire
#define arch_atomic_fetch_sub_release                arch_atomic_fetch_sub_release
#define arch_atomic_fetch_sub                        arch_atomic_fetch_sub

#define arch_atomic_fetch_and_relaxed                arch_atomic_fetch_and_relaxed
#define arch_atomic_fetch_and_acquire                arch_atomic_fetch_and_acquire
#define arch_atomic_fetch_and_release                arch_atomic_fetch_and_release
#define arch_atomic_fetch_and                        arch_atomic_fetch_and

#define arch_atomic_fetch_andnot_relaxed        arch_atomic_fetch_andnot_relaxed
#define arch_atomic_fetch_andnot_acquire        arch_atomic_fetch_andnot_acquire
#define arch_atomic_fetch_andnot_release        arch_atomic_fetch_andnot_release
#define arch_atomic_fetch_andnot                arch_atomic_fetch_andnot

#define arch_atomic_fetch_or_relaxed                arch_atomic_fetch_or_relaxed
#define arch_atomic_fetch_or_acquire                arch_atomic_fetch_or_acquire
#define arch_atomic_fetch_or_release                arch_atomic_fetch_or_release
#define arch_atomic_fetch_or                        arch_atomic_fetch_or

#define arch_atomic_fetch_xor_relaxed                arch_atomic_fetch_xor_relaxed
#define arch_atomic_fetch_xor_acquire                arch_atomic_fetch_xor_acquire
#define arch_atomic_fetch_xor_release                arch_atomic_fetch_xor_release
#define arch_atomic_fetch_xor                        arch_atomic_fetch_xor

#define arch_atomic_andnot                        arch_atomic_andnot

/*
 * 64-bit arch_atomic operations.
 *
 * Initialiser, read and set reuse the 32-bit macros unchanged; they only
 * touch the counter member.  The remaining names are defined to themselves,
 * mirroring the 32-bit list.
 */
#define ATOMIC64_INIT                                ATOMIC_INIT
#define arch_atomic64_read                        arch_atomic_read
#define arch_atomic64_set                        arch_atomic_set

#define arch_atomic64_add_return_relaxed        arch_atomic64_add_return_relaxed
#define arch_atomic64_add_return_acquire        arch_atomic64_add_return_acquire
#define arch_atomic64_add_return_release        arch_atomic64_add_return_release
#define arch_atomic64_add_return                arch_atomic64_add_return

#define arch_atomic64_sub_return_relaxed        arch_atomic64_sub_return_relaxed
#define arch_atomic64_sub_return_acquire        arch_atomic64_sub_return_acquire
#define arch_atomic64_sub_return_release        arch_atomic64_sub_return_release
#define arch_atomic64_sub_return                arch_atomic64_sub_return

#define arch_atomic64_fetch_add_relaxed                arch_atomic64_fetch_add_relaxed
#define arch_atomic64_fetch_add_acquire                arch_atomic64_fetch_add_acquire
#define arch_atomic64_fetch_add_release                arch_atomic64_fetch_add_release
#define arch_atomic64_fetch_add                        arch_atomic64_fetch_add

#define arch_atomic64_fetch_sub_relaxed                arch_atomic64_fetch_sub_relaxed
#define arch_atomic64_fetch_sub_acquire                arch_atomic64_fetch_sub_acquire
#define arch_atomic64_fetch_sub_release                arch_atomic64_fetch_sub_release
#define arch_atomic64_fetch_sub                        arch_atomic64_fetch_sub

#define arch_atomic64_fetch_and_relaxed                arch_atomic64_fetch_and_relaxed
#define arch_atomic64_fetch_and_acquire                arch_atomic64_fetch_and_acquire
#define arch_atomic64_fetch_and_release                arch_atomic64_fetch_and_release
#define arch_atomic64_fetch_and                        arch_atomic64_fetch_and

#define arch_atomic64_fetch_andnot_relaxed        arch_atomic64_fetch_andnot_relaxed
#define arch_atomic64_fetch_andnot_acquire        arch_atomic64_fetch_andnot_acquire
#define arch_atomic64_fetch_andnot_release        arch_atomic64_fetch_andnot_release
#define arch_atomic64_fetch_andnot                arch_atomic64_fetch_andnot

#define arch_atomic64_fetch_or_relaxed                arch_atomic64_fetch_or_relaxed
#define arch_atomic64_fetch_or_acquire                arch_atomic64_fetch_or_acquire
#define arch_atomic64_fetch_or_release                arch_atomic64_fetch_or_release
#define arch_atomic64_fetch_or                        arch_atomic64_fetch_or

#define arch_atomic64_fetch_xor_relaxed                arch_atomic64_fetch_xor_relaxed
#define arch_atomic64_fetch_xor_acquire                arch_atomic64_fetch_xor_acquire
#define arch_atomic64_fetch_xor_release                arch_atomic64_fetch_xor_release
#define arch_atomic64_fetch_xor                        arch_atomic64_fetch_xor

#define arch_atomic64_andnot                        arch_atomic64_andnot

#define arch_atomic64_dec_if_positive                arch_atomic64_dec_if_positive

#endif /* __ASM_ATOMIC_H */


















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    3 

    3 











































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
// SPDX-License-Identifier: GPL-2.0-only
/*
 * net/core/fib_rules.c                Generic Routing Rules
 *
 * Authors:        Thomas Graf <tgraf@suug.ch>
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/inet_dscp.h>
#include <net/sock.h>
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
#include <linux/indirect_call_wrapper.h>

/*
 * INDIRECT_CALL_MT(f, f2, f1, ...) calls the function pointer @f with the
 * trailing arguments.  Which expansion is used depends on the multiple-table
 * configuration:
 *
 *  - IPv6 and IPv4 multiple tables: INDIRECT_CALL_INET() with @f2 and @f1
 *  - IPv6 multiple tables only:     INDIRECT_CALL_1() with @f2
 *  - IPv4 multiple tables only:     INDIRECT_CALL_1() with @f1
 *  - neither:                       plain call through @f
 *
 * The callers in this file pass the fib6_* handler as @f2 and the fib4_*
 * handler as @f1.
 */
#if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES)
#ifdef CONFIG_IP_MULTIPLE_TABLES
#define INDIRECT_CALL_MT(f, f2, f1, ...) \
        INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
#endif
#elif defined(CONFIG_IP_MULTIPLE_TABLES)
#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
#else
#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__)
#endif

static const struct fib_kuid_range fib_kuid_range_unset = {
        KUIDT_INIT(0),
        KUIDT_INIT(~0),
};

/*
 * fib_rule_matchall - tell whether @rule carries none of the generic
 * selectors tested here.
 *
 * Returns false as soon as one of these is set: input or output interface
 * index, fwmark, tunnel id, flags, either suppress_* field (any value other
 * than -1), a UID range that differs from fib_kuid_range_unset, or a source
 * or destination port range.  Returns true otherwise.
 */
bool fib_rule_matchall(const struct fib_rule *rule)
{
        if (READ_ONCE(rule->iifindex))
                return false;
        if (READ_ONCE(rule->oifindex))
                return false;
        if (rule->mark || rule->tun_id || rule->flags)
                return false;

        if (rule->suppress_ifgroup != -1)
                return false;
        if (rule->suppress_prefixlen != -1)
                return false;

        if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start))
                return false;
        if (!uid_eq(rule->uid_range.end, fib_kuid_range_unset.end))
                return false;

        if (fib_rule_port_range_set(&rule->sport_range) ||
            fib_rule_port_range_set(&rule->dport_range))
                return false;

        return true;
}
EXPORT_SYMBOL_GPL(fib_rule_matchall);

/*
 * fib_default_rule_add - allocate a kernel-owned FR_ACT_TO_TBL rule and
 * append it to @ops->rules_list.
 * @ops:   rules operations; supplies the allocation size (rule_size), the
 *         owning netns (fro_net) and the list the rule is appended to
 * @pref:  preference stored in the new rule
 * @table: table id stored in the new rule
 *
 * Returns 0 on success, -ENOMEM when the allocation fails.
 */
int fib_default_rule_add(struct fib_rules_ops *ops,
                         u32 pref, u32 table)
{
        struct fib_rule *rule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);

        if (!rule)
                return -ENOMEM;

        refcount_set(&rule->refcnt, 1);
        rule->fr_net = ops->fro_net;
        rule->proto = RTPROT_KERNEL;
        rule->action = FR_ACT_TO_TBL;
        rule->table = table;
        rule->pref = pref;

        /* No UID selector, and -1 marks both suppress fields as unset. */
        rule->uid_range = fib_kuid_range_unset;
        rule->suppress_ifgroup = -1;
        rule->suppress_prefixlen = -1;

        /* The lock is not required here: the list is unreachable
         * at the moment this function is called.
         */
        list_add_tail(&rule->list, &ops->rules_list);

        return 0;
}
EXPORT_SYMBOL(fib_default_rule_add);

/*
 * Choose a preference for a rule that was added without an explicit one:
 * one less than the preference of the second rule on @ops->rules_list.
 * Returns 0 when the list holds fewer than two rules or when that second
 * rule's preference is already 0.
 */
static u32 fib_default_rule_pref(struct fib_rules_ops *ops)
{
        struct list_head *head = &ops->rules_list;
        struct list_head *first;
        struct fib_rule *second;

        if (list_empty(head))
                return 0;

        first = head->next;
        if (first->next == head)
                return 0;

        second = list_entry(first->next, struct fib_rule, list);

        return second->pref ? second->pref - 1 : 0;
}

/* Forward declaration of a file-local helper; it is defined further down in
 * this file.
 */
static void notify_rule_change(int event, struct fib_rule *rule,
                               struct fib_rules_ops *ops, struct nlmsghdr *nlh,
                               u32 pid);

/*
 * Look up the rules operations registered for @family in @net and take a
 * reference on the owning module.
 *
 * Returns the ops with the module reference held, or NULL when no ops are
 * registered for @family or the module reference could not be taken.  The
 * reference is dropped with rules_ops_put().
 */
static struct fib_rules_ops *lookup_rules_ops(const struct net *net,
                                              int family)
{
        struct fib_rules_ops *found = NULL;
        struct fib_rules_ops *iter;

        rcu_read_lock();
        list_for_each_entry_rcu(iter, &net->rules_ops, list) {
                if (iter->family != family)
                        continue;

                /* Only the first entry of this family is considered. */
                if (try_module_get(iter->owner))
                        found = iter;
                break;
        }
        rcu_read_unlock();

        return found;
}

/* Drop the module reference held on @ops; a NULL @ops is ignored. */
static void rules_ops_put(struct fib_rules_ops *ops)
{
        if (!ops)
                return;

        module_put(ops->owner);
}

/* Invoke the optional flush_cache() callback of @ops, if one is provided. */
static void flush_route_cache(struct fib_rules_ops *ops)
{
        if (!ops->flush_cache)
                return;

        ops->flush_cache(ops);
}

static int __fib_rules_register(struct fib_rules_ops *ops)
{
        int err = -EEXIST;
        struct fib_rules_ops *o;
        struct net *net;

        net = ops->fro_net;

        if (ops->rule_size < sizeof(struct fib_rule))
                return -EINVAL;

        if (ops->match == NULL || ops->configure == NULL ||
            ops->compare == NULL || ops->fill == NULL ||
            ops->action == NULL)
                return -EINVAL;

        spin_lock(&net->rules_mod_lock);
        list_for_each_entry(o, &net->rules_ops, list)
                if (ops->family == o->family)
                        goto errout;

        list_add_tail_rcu(&ops->list, &net->rules_ops);
        err = 0;
errout:
        spin_unlock(&net->rules_mod_lock);

        return err;
}

/*
 * fib_rules_register - duplicate @tmpl, bind the copy to @net and register
 * it.
 *
 * Returns the newly allocated ops on success.  On failure returns
 * ERR_PTR(-ENOMEM) if the copy could not be allocated, or ERR_PTR() of the
 * error reported by __fib_rules_register(), in which case the copy is freed.
 */
struct fib_rules_ops *
fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net)
{
        struct fib_rules_ops *ops = kmemdup(tmpl, sizeof(*ops), GFP_KERNEL);
        int err;

        if (!ops)
                return ERR_PTR(-ENOMEM);

        ops->fro_net = net;
        INIT_LIST_HEAD(&ops->rules_list);

        err = __fib_rules_register(ops);
        if (!err)
                return ops;

        kfree(ops);
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(fib_rules_register);

/*
 * Empty @ops->rules_list: each rule is unlinked, handed to the optional
 * delete() callback and then released with fib_rule_put().
 */
static void fib_rules_cleanup_ops(struct fib_rules_ops *ops)
{
        struct fib_rule *cur, *next;

        list_for_each_entry_safe(cur, next, &ops->rules_list, list) {
                list_del_rcu(&cur->list);

                if (ops->delete)
                        ops->delete(cur);

                fib_rule_put(cur);
        }
}

/*
 * fib_rules_unregister - unlink @ops from its netns, release all of its
 * rules and free @ops via kfree_rcu().
 */
void fib_rules_unregister(struct fib_rules_ops *ops)
{
        /* Unlink under the same lock that registration takes. */
        spin_lock(&ops->fro_net->rules_mod_lock);
        list_del_rcu(&ops->list);
        spin_unlock(&ops->fro_net->rules_mod_lock);

        fib_rules_cleanup_ops(ops);
        kfree_rcu(ops, rcu);
}
EXPORT_SYMBOL_GPL(fib_rules_unregister);

/* Returns 1 when both ends of @range are valid kuids, 0 otherwise. */
static int uid_range_set(struct fib_kuid_range *range)
{
        if (!uid_valid(range->start))
                return 0;

        return uid_valid(range->end) ? 1 : 0;
}

static struct fib_kuid_range nla_get_kuid_range(struct nlattr **tb)
{
        struct fib_rule_uid_range *in;
        struct fib_kuid_range out;

        in = (struct fib_rule_uid_range *)nla_data(tb[FRA_UID_RANGE]);

        out.start = make_kuid(current_user_ns(), in->start);
        out.end = make_kuid(current_user_ns(), in->end);

        return out;
}

static int nla_put_uid_range(struct sk_buff *skb, struct fib_kuid_range *range)
{
        struct fib_rule_uid_range out = {
                from_kuid_munged(current_user_ns(), range->start),
                from_kuid_munged(current_user_ns(), range->end)
        };

        return nla_put(skb, FRA_UID_RANGE, sizeof(out), &out);
}

/*
 * Copy the port range carried by @pattr into @port_range.
 * Returns 0 on success, or -EINVAL (leaving @port_range untouched) when
 * fib_rule_port_range_valid() rejects the attribute payload.
 */
static int nla_get_port_range(struct nlattr *pattr,
                              struct fib_rule_port_range *port_range)
{
        const struct fib_rule_port_range *src = nla_data(pattr);

        if (fib_rule_port_range_valid(src)) {
                port_range->start = src->start;
                port_range->end = src->end;
                return 0;
        }

        return -EINVAL;
}

/* Append @range to @skb as an attribute of type @attrtype. */
static int nla_put_port_range(struct sk_buff *skb, int attrtype,
                              struct fib_rule_port_range *range)
{
        return nla_put(skb, attrtype, sizeof(struct fib_rule_port_range),
                       range);
}

/*
 * Check the input-interface selector of @rule against @fl.  When the rule's
 * iif is flagged as an l3 master device the decision is delegated to
 * l3mdev_fib_rule_iif_match(); otherwise flowi_iif must equal @iifindex.
 */
static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex,
                               const struct flowi *fl)
{
        if (READ_ONCE(rule->iif_is_l3_master))
                return l3mdev_fib_rule_iif_match(fl, iifindex);

        return fl->flowi_iif == iifindex;
}

/*
 * Check the output-interface selector of @rule against @fl.  When the rule's
 * oif is flagged as an l3 master device the decision is delegated to
 * l3mdev_fib_rule_oif_match(); otherwise flowi_oif must equal @oifindex.
 */
static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex,
                               const struct flowi *fl)
{
        if (READ_ONCE(rule->oif_is_l3_master))
                return l3mdev_fib_rule_oif_match(fl, oifindex);

        return fl->flowi_oif == oifindex;
}

static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
                          struct flowi *fl, int flags,
                          struct fib_lookup_arg *arg)
{
        int iifindex, oifindex, ret = 0;

        iifindex = READ_ONCE(rule->iifindex);
        if (iifindex && !fib_rule_iif_match(rule, iifindex, fl))
                goto out;

        oifindex = READ_ONCE(rule->oifindex);
        if (oifindex && !fib_rule_oif_match(rule, oifindex, fl))
                goto out;

        if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
                goto out;

        if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id))
                goto out;

        if (rule->l3mdev && !l3mdev_fib_rule_match(rule->fr_net, fl, arg))
                goto out;

        if (uid_lt(fl->flowi_uid, rule->uid_range.start) ||
            uid_gt(fl->flowi_uid, rule->uid_range.end))
                goto out;

        ret = INDIRECT_CALL_MT(ops->match,
                               fib6_rule_match,
                               fib4_rule_match,
                               rule, fl, flags);
out:
        return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}

/*
 * fib_rules_lookup - walk the rules of @ops and run the action of the first
 * rule that applies to @fl.
 * @ops:   rules operations whose rules_list is walked
 * @fl:    flow being looked up
 * @flags: passed through to the match/action/suppress callbacks
 * @arg:   lookup argument; on a hit arg->rule is set to the selected rule
 *
 * The walk runs under rcu_read_lock().  Returns the value produced by the
 * rule's action callback when a rule is selected, or -ESRCH when the walk
 * ends without selecting one.
 */
int fib_rules_lookup(struct fib_rules_ops *ops, struct flowi *fl,
                     int flags, struct fib_lookup_arg *arg)
{
        struct fib_rule *rule;
        int err;

        rcu_read_lock();

        list_for_each_entry_rcu(rule, &ops->rules_list, list) {
                /* Re-entry point after following a goto rule: @rule has been
                 * replaced by the jump target and is evaluated like any
                 * other rule; the walk then continues from that rule.
                 */
jumped:
                if (!fib_rule_match(rule, ops, fl, flags, arg))
                        continue;

                if (rule->action == FR_ACT_GOTO) {
                        struct fib_rule *target;

                        target = rcu_dereference(rule->ctarget);
                        /* A goto rule without a target pointer is skipped. */
                        if (target == NULL) {
                                continue;
                        } else {
                                rule = target;
                                goto jumped;
                        }
                } else if (rule->action == FR_ACT_NOP)
                        continue;
                else
                        err = INDIRECT_CALL_MT(ops->action,
                                               fib6_rule_action,
                                               fib4_rule_action,
                                               rule, fl, flags, arg);

                /* The action succeeded, but the optional suppress callback
                 * may still reject the result; move on to the next rule then.
                 */
                if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress,
                                                              fib6_rule_suppress,
                                                              fib4_rule_suppress,
                                                              rule, flags, arg))
                        continue;

                /* -EAGAIN from the action means "try the next rule". */
                if (err != -EAGAIN) {
                        /* Take a reference on the rule unless the caller
                         * asked for none.  If the refcount already dropped
                         * to zero the walk is abandoned with -ESRCH.
                         */
                        if ((arg->flags & FIB_LOOKUP_NOREF) ||
                            likely(refcount_inc_not_zero(&rule->refcnt))) {
                                arg->rule = rule;
                                goto out;
                        }
                        break;
                }
        }

        err = -ESRCH;
out:
        rcu_read_unlock();

        return err;
}
EXPORT_SYMBOL_GPL(fib_rules_lookup);

static int call_fib_rule_notifier(struct notifier_block *nb,
                                  enum fib_event_type event_type,
                                  struct fib_rule *rule, int family,
                                  struct netlink_ext_ack *extack)
{
        struct fib_rule_notifier_info info = {
                .info.family = family,
                .info.extack = extack,
                .rule = rule,
        };

        return call_fib_notifier(nb, event_type, &info.info);
}

static int call_fib_rule_notifiers(struct net *net,
                                   enum fib_event_type event_type,
                                   struct fib_rule *rule,
                                   struct fib_rules_ops *ops,
                                   struct netlink_ext_ack *extack)
{
        struct fib_rule_notifier_info info = {
                .info.family = ops->family,
                .info.extack = extack,
                .rule = rule,
        };

        ASSERT_RTNL_NET(net);

        /* Paired with READ_ONCE() in fib_rules_seq() */
        WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1);
        return call_fib_notifiers(net, event_type, &info.info);
}

/*
 * fib_rules_dump - replay every rule of @family in @net to @nb as a
 * FIB_EVENT_RULE_ADD event.
 *
 * Called with rcu_read_lock().
 *
 * Returns 0 on success, -EAFNOSUPPORT when no rules ops can be obtained for
 * @family, or the first non-zero value returned by the notifier, at which
 * point the replay stops.
 */
int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
                   struct netlink_ext_ack *extack)
{
        struct fib_rules_ops *ops = lookup_rules_ops(net, family);
        struct fib_rule *cur;
        int rc = 0;

        if (!ops)
                return -EAFNOSUPPORT;

        list_for_each_entry_rcu(cur, &ops->rules_list, list) {
                rc = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD,
                                            cur, family, extack);
                if (rc)
                        break;
        }

        rules_ops_put(ops);

        return rc;
}
EXPORT_SYMBOL_GPL(fib_rules_dump);

/*
 * fib_rules_seq_read - read the rules sequence counter of @family in @net.
 * Returns 0 when no rules ops can be obtained for @family.
 */
unsigned int fib_rules_seq_read(const struct net *net, int family)
{
        struct fib_rules_ops *ops = lookup_rules_ops(net, family);
        unsigned int seq = 0;

        if (ops) {
                /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */
                seq = READ_ONCE(ops->fib_rules_seq);
                rules_ops_put(ops);
        }

        return seq;
}
EXPORT_SYMBOL_GPL(fib_rules_seq_read);

/*
 * Find the first rule on @ops->rules_list that agrees with @rule on every
 * selector that @rule actually specifies.  A selector left at its "unset"
 * value in @rule (0, an empty interface name, -1 for the suppress fields, an
 * invalid UID range, an unset port range) acts as a wildcard.  The
 * preference is only compared when @user_priority is true.  The family
 * specific ops->compare() callback is consulted last.
 *
 * Returns the matching rule or NULL.
 */
static struct fib_rule *rule_find(struct fib_rules_ops *ops,
                                  struct fib_rule_hdr *frh,
                                  struct nlattr **tb,
                                  struct fib_rule *rule,
                                  bool user_priority)
{
        struct fib_rule *cur;

        list_for_each_entry(cur, &ops->rules_list, list) {
                /* action, table, preference, protocols */
                if ((rule->action && cur->action != rule->action) ||
                    (rule->table && cur->table != rule->table) ||
                    (user_priority && cur->pref != rule->pref))
                        continue;

                if ((rule->ip_proto && cur->ip_proto != rule->ip_proto) ||
                    (rule->proto && cur->proto != rule->proto))
                        continue;

                /* interface names */
                if (rule->iifname[0] &&
                    memcmp(cur->iifname, rule->iifname, IFNAMSIZ))
                        continue;
                if (rule->oifname[0] &&
                    memcmp(cur->oifname, rule->oifname, IFNAMSIZ))
                        continue;

                /* fwmark and its mask */
                if ((rule->mark && cur->mark != rule->mark) ||
                    (rule->mark_mask && cur->mark_mask != rule->mark_mask))
                        continue;

                /* suppressors */
                if (rule->suppress_ifgroup != -1 &&
                    cur->suppress_ifgroup != rule->suppress_ifgroup)
                        continue;
                if (rule->suppress_prefixlen != -1 &&
                    cur->suppress_prefixlen != rule->suppress_prefixlen)
                        continue;

                /* tunnel id and l3mdev */
                if ((rule->tun_id && cur->tun_id != rule->tun_id) ||
                    (rule->l3mdev && cur->l3mdev != rule->l3mdev))
                        continue;

                /* UID range */
                if (uid_range_set(&rule->uid_range) &&
                    !(uid_eq(cur->uid_range.start, rule->uid_range.start) &&
                      uid_eq(cur->uid_range.end, rule->uid_range.end)))
                        continue;

                /* source port range and mask */
                if (fib_rule_port_range_set(&rule->sport_range) &&
                    !fib_rule_port_range_compare(&cur->sport_range,
                                                 &rule->sport_range))
                        continue;
                if (rule->sport_mask && cur->sport_mask != rule->sport_mask)
                        continue;

                /* destination port range and mask */
                if (fib_rule_port_range_set(&rule->dport_range) &&
                    !fib_rule_port_range_compare(&cur->dport_range,
                                                 &rule->dport_range))
                        continue;
                if (rule->dport_mask && cur->dport_mask != rule->dport_mask)
                        continue;

                /* family specific selectors */
                if (ops->compare(cur, frh, tb))
                        return cur;
        }

        return NULL;
}

#ifdef CONFIG_NET_L3_MASTER_DEV
static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
                              struct netlink_ext_ack *extack)
{
        nlrule->l3mdev = nla_get_u8(nla);
        if (nlrule->l3mdev != 1) {
                NL_SET_ERR_MSG(extack, "Invalid l3mdev attribute");
                return -1;
        }

        return 0;
}
#else
static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
                              struct netlink_ext_ack *extack)
{
        NL_SET_ERR_MSG(extack, "l3mdev support is not enabled in kernel");
        return -1;
}
#endif

/*
 * Validate the port mask attribute @mask_attr against the already parsed
 * port @range and, if acceptable, store it in @port_mask.
 *
 * The mask is rejected with -EINVAL (and an extack message attached to
 * @mask_attr) when @range is not valid, when @range is an actual range
 * rather than a single port, or when the port has bits set that the mask
 * does not cover.  Returns 0 on success.
 */
static int fib_nl2rule_port_mask(const struct nlattr *mask_attr,
                                 const struct fib_rule_port_range *range,
                                 u16 *port_mask,
                                 struct netlink_ext_ack *extack)
{
        u16 mask;

        if (!fib_rule_port_range_valid(range)) {
                NL_SET_ERR_MSG_ATTR(extack, mask_attr,
                                    "Cannot specify port mask without port value");
                return -EINVAL;
        }

        if (fib_rule_port_is_range(range)) {
                NL_SET_ERR_MSG_ATTR(extack, mask_attr,
                                    "Cannot specify port mask for port range");
                return -EINVAL;
        }

        mask = nla_get_u16(mask_attr);
        if (range->start & ~mask) {
                NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask");
                return -EINVAL;
        }

        *port_mask = mask;

        return 0;
}

/*
 * Build a struct fib_rule from a netlink rule request.
 * @net:           namespace recorded in the new rule (fr_net) and whose
 *                 user namespace gates FRA_UID_RANGE
 * @nlh:           request header; its payload is the struct fib_rule_hdr
 * @extack:        receives a message describing the first validation error
 * @ops:           family ops; supplies addr_size and rule_size
 * @tb:            parsed FRA_* attributes
 * @rule:          on success, set to the newly allocated rule
 * @user_priority: set to true when FRA_PRIORITY was supplied; never cleared
 *
 * Returns 0 on success.  On failure the partially built rule is freed, @rule
 * is left untouched and a negative errno is returned: -ENOMEM when the
 * allocation fails, -EPERM when FRA_UID_RANGE is used from a foreign user
 * namespace, -EINVAL for the other validation failures in this function, or
 * whatever the port range/mask helpers return.
 */
static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh,
                       struct netlink_ext_ack *extack,
                       struct fib_rules_ops *ops,
                       struct nlattr *tb[],
                       struct fib_rule **rule,
                       bool *user_priority)
{
        struct fib_rule_hdr *frh = nlmsg_data(nlh);
        struct fib_rule *nlrule;
        int err = -EINVAL;

        /* A non-zero prefix length needs a matching, correctly sized address. */
        if (frh->src_len &&
            (!tb[FRA_SRC] ||
             frh->src_len > (ops->addr_size * 8) ||
             nla_len(tb[FRA_SRC]) != ops->addr_size)) {
                NL_SET_ERR_MSG(extack, "Invalid source address");
                return -EINVAL;
        }

        if (frh->dst_len &&
            (!tb[FRA_DST] ||
             frh->dst_len > (ops->addr_size * 8) ||
             nla_len(tb[FRA_DST]) != ops->addr_size)) {
                NL_SET_ERR_MSG(extack, "Invalid dst address");
                return -EINVAL;
        }

        nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
        if (!nlrule)
                return -ENOMEM;

        refcount_set(&nlrule->refcnt, 1);
        nlrule->fr_net = net;

        if (tb[FRA_PRIORITY]) {
                *user_priority = true;
                nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
        }

        nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC);

        /* Interface indexes start out as -1 when a name is given. */
        if (tb[FRA_IIFNAME]) {
                nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
                nlrule->iifindex = -1;
        }

        if (tb[FRA_OIFNAME]) {
                nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
                nlrule->oifindex = -1;
        }

        if (tb[FRA_FWMARK]) {
                nlrule->mark = nla_get_u32(tb[FRA_FWMARK]);
                /* compatibility: if the mark value is non-zero all bits
                 * are compared unless a mask is explicitly specified.
                 */
                if (nlrule->mark)
                        nlrule->mark_mask = 0xFFFFFFFF;
        }

        if (tb[FRA_FWMASK])
                nlrule->mark_mask = nla_get_u32(tb[FRA_FWMASK]);

        if (tb[FRA_TUN_ID])
                nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);

        if (tb[FRA_L3MDEV] &&
            fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
                goto free_rule;

        nlrule->action = frh->action;
        nlrule->flags = frh->flags;
        nlrule->table = frh_get_table(frh, tb);

        /* -1 means "not set" for both suppressors. */
        nlrule->suppress_prefixlen = -1;
        if (tb[FRA_SUPPRESS_PREFIXLEN])
                nlrule->suppress_prefixlen = nla_get_u32(tb[FRA_SUPPRESS_PREFIXLEN]);

        nlrule->suppress_ifgroup = -1;
        if (tb[FRA_SUPPRESS_IFGROUP])
                nlrule->suppress_ifgroup = nla_get_u32(tb[FRA_SUPPRESS_IFGROUP]);

        /* FRA_GOTO and the goto action must come together. */
        if (nlrule->action == FR_ACT_GOTO) {
                if (!tb[FRA_GOTO]) {
                        NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
                        goto free_rule;
                }

                nlrule->target = nla_get_u32(tb[FRA_GOTO]);
        } else if (tb[FRA_GOTO]) {
                NL_SET_ERR_MSG(extack, "Unexpected goto");
                goto free_rule;
        }

        if (nlrule->l3mdev && nlrule->table) {
                NL_SET_ERR_MSG(extack, "l3mdev and table are mutually exclusive");
                goto free_rule;
        }

        nlrule->uid_range = fib_kuid_range_unset;
        if (tb[FRA_UID_RANGE]) {
                if (current_user_ns() != net->user_ns) {
                        err = -EPERM;
                        NL_SET_ERR_MSG(extack, "No permission to set uid");
                        goto free_rule;
                }

                nlrule->uid_range = nla_get_kuid_range(tb);

                if (!uid_range_set(&nlrule->uid_range) ||
                    !uid_lte(nlrule->uid_range.start, nlrule->uid_range.end)) {
                        NL_SET_ERR_MSG(extack, "Invalid uid range");
                        goto free_rule;
                }
        }

        if (tb[FRA_IP_PROTO])
                nlrule->ip_proto = nla_get_u8(tb[FRA_IP_PROTO]);

        /* A single port gets an all-ones mask unless one is supplied. */
        if (tb[FRA_SPORT_RANGE]) {
                err = nla_get_port_range(tb[FRA_SPORT_RANGE],
                                         &nlrule->sport_range);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Invalid sport range");
                        goto free_rule;
                }
                if (!fib_rule_port_is_range(&nlrule->sport_range))
                        nlrule->sport_mask = U16_MAX;
        }

        if (tb[FRA_SPORT_MASK]) {
                err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK],
                                            &nlrule->sport_range,
                                            &nlrule->sport_mask, extack);
                if (err)
                        goto free_rule;
        }

        if (tb[FRA_DPORT_RANGE]) {
                err = nla_get_port_range(tb[FRA_DPORT_RANGE],
                                         &nlrule->dport_range);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Invalid dport range");
                        goto free_rule;
                }
                if (!fib_rule_port_is_range(&nlrule->dport_range))
                        nlrule->dport_mask = U16_MAX;
        }

        if (tb[FRA_DPORT_MASK]) {
                err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK],
                                            &nlrule->dport_range,
                                            &nlrule->dport_mask, extack);
                if (err)
                        goto free_rule;
        }

        *rule = nlrule;

        return 0;

free_rule:
        kfree(nlrule);
        return err;
}

static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
                            struct fib_rules_ops *ops,
                            struct nlattr *tb[],
                            struct netlink_ext_ack *extack)
{
        if (!tb[FRA_PRIORITY])
                nlrule->pref = fib_default_rule_pref(ops);

        /* Backward jumps are prohibited to avoid endless loops */
        if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) {
                NL_SET_ERR_MSG(extack, "Backward goto not supported");
                return -EINVAL;
        }

        if (tb[FRA_IIFNAME]) {
                struct net_device *dev;

                dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname);
                if (dev) {
                        nlrule->iifindex = dev->ifindex;
                        nlrule->iif_is_l3_master = netif_is_l3_master(dev);
                }
        }

        if (tb[FRA_OIFNAME]) {
                struct net_device *dev;

                dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname);
                if (dev) {
                        nlrule->oifindex = dev->ifindex;
                        nlrule->oif_is_l3_master = netif_is_l3_master(dev);
                }
        }

        return 0;
}

/*
 * Check whether @ops->rules_list already holds a rule identical to @rule.
 * Unlike rule_find() every generic field must be equal, with no wildcards;
 * the family specific ops->compare() callback is consulted last.
 *
 * Returns 1 when such a rule exists, 0 otherwise.
 */
static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
                       struct nlattr **tb, struct fib_rule *rule)
{
        struct fib_rule *cur;

        list_for_each_entry(cur, &ops->rules_list, list) {
                /* action, table, preference, protocols */
                if (cur->action != rule->action ||
                    cur->table != rule->table ||
                    cur->pref != rule->pref)
                        continue;

                if (cur->ip_proto != rule->ip_proto ||
                    cur->proto != rule->proto)
                        continue;

                /* interface names */
                if (memcmp(cur->iifname, rule->iifname, IFNAMSIZ) ||
                    memcmp(cur->oifname, rule->oifname, IFNAMSIZ))
                        continue;

                /* fwmark and its mask */
                if (cur->mark != rule->mark ||
                    cur->mark_mask != rule->mark_mask)
                        continue;

                /* suppressors */
                if (cur->suppress_ifgroup != rule->suppress_ifgroup ||
                    cur->suppress_prefixlen != rule->suppress_prefixlen)
                        continue;

                /* tunnel id and l3mdev */
                if (cur->tun_id != rule->tun_id ||
                    cur->l3mdev != rule->l3mdev)
                        continue;

                /* UID range */
                if (!(uid_eq(cur->uid_range.start, rule->uid_range.start) &&
                      uid_eq(cur->uid_range.end, rule->uid_range.end)))
                        continue;

                /* source port range and mask */
                if (!fib_rule_port_range_compare(&cur->sport_range,
                                                 &rule->sport_range))
                        continue;
                if (cur->sport_mask != rule->sport_mask)
                        continue;

                /* destination port range and mask */
                if (!fib_rule_port_range_compare(&cur->dport_range,
                                                 &rule->dport_range))
                        continue;
                if (cur->dport_mask != rule->dport_mask)
                        continue;

                /* family specific selectors */
                if (ops->compare(cur, frh, tb))
                        return 1;
        }

        return 0;
}

/*
 * Netlink attribute validation policy for fib rule messages, indexed by
 * FRA_* attribute type. It is handed to nlmsg_parse_deprecated() by both
 * fib_newrule() and fib_delrule().
 *
 * NOTE(review): the FRA_UNSPEC entry's strict_start_type presumably makes
 * attributes added after FRA_DPORT_RANGE subject to strict validation --
 * confirm against the nla_policy documentation.
 */
static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
        [FRA_UNSPEC]        = { .strict_start_type = FRA_DPORT_RANGE + 1 },
        [FRA_IIFNAME]        = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
        [FRA_OIFNAME]        = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
        [FRA_PRIORITY]        = { .type = NLA_U32 },
        [FRA_FWMARK]        = { .type = NLA_U32 },
        [FRA_FLOW]        = { .type = NLA_U32 },
        [FRA_TUN_ID]        = { .type = NLA_U64 },
        [FRA_FWMASK]        = { .type = NLA_U32 },
        [FRA_TABLE]     = { .type = NLA_U32 },
        [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 },
        [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 },
        [FRA_GOTO]        = { .type = NLA_U32 },
        [FRA_L3MDEV]        = { .type = NLA_U8 },
        [FRA_UID_RANGE]        = { .len = sizeof(struct fib_rule_uid_range) },
        [FRA_PROTOCOL]  = { .type = NLA_U8 },
        [FRA_IP_PROTO]  = { .type = NLA_U8 },
        [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
        [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
        [FRA_DSCP]        = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
        [FRA_FLOWLABEL] = { .type = NLA_BE32 },
        [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
        [FRA_SPORT_MASK] = { .type = NLA_U16 },
        [FRA_DPORT_MASK] = { .type = NLA_U16 },
        [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2),
};

/*
 * fib_newrule - build a FIB rule from a netlink request and insert it.
 *
 * @net:       namespace used to look up the rule ops and to take the lock
 * @skb:       request skb; handed to ops->configure() and used for the
 *             sender portid of the resulting notification
 * @nlh:       netlink header; the payload must hold a struct fib_rule_hdr
 * @extack:    extended ack used to report error strings
 * @rtnl_held: when true the caller already holds the lock, so
 *             rtnl_net_lock()/rtnl_net_unlock() are not called here
 *
 * The new rule is linked into ops->rules_list after the last entry whose
 * pref is not greater than the new rule's pref, goto bookkeeping is
 * updated, and an RTM_NEWRULE notification is sent after the lock is
 * released.
 *
 * Returns 0 on success. On failure returns -EINVAL for a short message,
 * -EAFNOSUPPORT for an unknown family, -EEXIST when NLM_F_EXCL is set and
 * an equal rule is already present, or the error reported by parsing,
 * fib_nl2rule(), fib_nl2rule_rtnl(), ops->configure() or the notifier
 * chain.
 */
int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
                struct netlink_ext_ack *extack, bool rtnl_held)
{
        struct fib_rule *rule = NULL, *r, *last = NULL;
        struct fib_rule_hdr *frh = nlmsg_data(nlh);
        int err = -EINVAL, unresolved = 0;
        struct fib_rules_ops *ops = NULL;
        struct nlattr *tb[FRA_MAX + 1];
        bool user_priority = false;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
                NL_SET_ERR_MSG(extack, "Invalid msg length");
                goto errout;
        }

        ops = lookup_rules_ops(net, frh->family);
        if (!ops) {
                err = -EAFNOSUPPORT;
                NL_SET_ERR_MSG(extack, "Rule family not supported");
                goto errout;
        }

        err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
                                     fib_rule_policy, extack);
        if (err < 0) {
                NL_SET_ERR_MSG(extack, "Error parsing msg");
                goto errout;
        }

        /* First conversion stage runs without the lock held. */
        err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority);
        if (err)
                goto errout;

        if (!rtnl_held)
                rtnl_net_lock(net);

        /* Second conversion stage runs with the lock held. */
        err = fib_nl2rule_rtnl(rule, ops, tb, extack);
        if (err)
                goto errout_free;

        if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
            rule_exists(ops, frh, tb, rule)) {
                err = -EEXIST;
                goto errout_free;
        }

        err = ops->configure(rule, skb, frh, tb, extack);
        if (err < 0)
                goto errout_free;

        err = call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule, ops,
                                      extack);
        if (err < 0)
                goto errout_free;

        /*
         * Only goto rules have a jump target to resolve. Running this for
         * other actions would count them in ops->unresolved_rules, but
         * fib_delrule() only ever decrements that counter for FR_ACT_GOTO
         * rules, so the counter would stay inflated forever.
         */
        if (rule->action == FR_ACT_GOTO) {
                list_for_each_entry(r, &ops->rules_list, list) {
                        if (r->pref == rule->target) {
                                RCU_INIT_POINTER(rule->ctarget, r);
                                break;
                        }
                }

                if (rcu_dereference_protected(rule->ctarget, 1) == NULL)
                        unresolved = 1;
        }

        /* Find the last existing rule whose pref is <= the new pref. */
        list_for_each_entry(r, &ops->rules_list, list) {
                if (r->pref > rule->pref)
                        break;
                last = r;
        }

        if (last)
                list_add_rcu(&rule->list, &last->list);
        else
                list_add_rcu(&rule->list, &ops->rules_list);

        if (ops->unresolved_rules) {
                /*
                 * There are unresolved goto rules in the list, check if
                 * any of them are pointing to this new rule.
                 */
                list_for_each_entry(r, &ops->rules_list, list) {
                        if (r->action == FR_ACT_GOTO &&
                            r->target == rule->pref &&
                            rtnl_dereference(r->ctarget) == NULL) {
                                rcu_assign_pointer(r->ctarget, rule);
                                if (--ops->unresolved_rules == 0)
                                        break;
                        }
                }
        }

        if (rule->action == FR_ACT_GOTO)
                ops->nr_goto_rules++;

        if (unresolved)
                ops->unresolved_rules++;

        if (rule->tun_id)
                ip_tunnel_need_metadata();

        /*
         * Extra reference, dropped after the notification below.
         * NOTE(review): presumably keeps the rule alive once the lock is
         * released and a concurrent delete becomes possible -- confirm.
         */
        fib_rule_get(rule);

        if (!rtnl_held)
                rtnl_net_unlock(net);

        notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
        fib_rule_put(rule);
        flush_route_cache(ops);
        rules_ops_put(ops);
        return 0;

errout_free:
        if (!rtnl_held)
                rtnl_net_unlock(net);
        kfree(rule);
errout:
        rules_ops_put(ops);
        return err;
}
EXPORT_SYMBOL_GPL(fib_newrule);

/*
 * RTM_NEWRULE doit handler: forwards the request to fib_newrule() for the
 * namespace of the sending socket, with rtnl_held == false so that
 * fib_newrule() takes the lock itself.
 */
static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);

        return fib_newrule(net, skb, nlh, extack, false);
}

/*
 * fib_delrule - remove the FIB rule described by a netlink request.
 *
 * @net:       namespace used to look up the rule ops and to take the lock
 * @skb:       request skb; used for the sender portid of the notification
 * @nlh:       netlink header; the payload must hold a struct fib_rule_hdr
 * @extack:    extended ack used to report error strings
 * @rtnl_held: when true the caller already holds the lock, so
 *             rtnl_net_lock()/rtnl_net_unlock() are not called here
 *
 * A temporary rule (nlrule) is built from the message and used only as a
 * search key for rule_find(); it is freed on every path once it exists.
 *
 * Returns 0 on success. On failure returns -EINVAL for a short message,
 * -EAFNOSUPPORT for an unknown family, -ENOENT when no matching rule is
 * found, -EPERM for a FIB_RULE_PERMANENT rule, or the error reported by
 * parsing, fib_nl2rule(), fib_nl2rule_rtnl() or ops->delete().
 */
int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
                struct netlink_ext_ack *extack, bool rtnl_held)
{
        struct fib_rule *rule = NULL, *nlrule = NULL;
        struct fib_rule_hdr *frh = nlmsg_data(nlh);
        struct fib_rules_ops *ops = NULL;
        struct nlattr *tb[FRA_MAX+1];
        bool user_priority = false;
        int err = -EINVAL;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
                NL_SET_ERR_MSG(extack, "Invalid msg length");
                goto errout;
        }

        ops = lookup_rules_ops(net, frh->family);
        if (ops == NULL) {
                err = -EAFNOSUPPORT;
                NL_SET_ERR_MSG(extack, "Rule family not supported");
                goto errout;
        }

        err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
                                     fib_rule_policy, extack);
        if (err < 0) {
                NL_SET_ERR_MSG(extack, "Error parsing msg");
                goto errout;
        }

        /* First conversion stage runs without the lock held. */
        err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority);
        if (err)
                goto errout;

        if (!rtnl_held)
                rtnl_net_lock(net);

        /* Second conversion stage runs with the lock held. */
        err = fib_nl2rule_rtnl(nlrule, ops, tb, extack);
        if (err)
                goto errout_free;

        /* Locate the installed rule that matches the request. */
        rule = rule_find(ops, frh, tb, nlrule, user_priority);
        if (!rule) {
                err = -ENOENT;
                goto errout_free;
        }

        if (rule->flags & FIB_RULE_PERMANENT) {
                err = -EPERM;
                goto errout_free;
        }

        /* The family hook is optional and may veto the deletion. */
        if (ops->delete) {
                err = ops->delete(rule);
                if (err)
                        goto errout_free;
        }

        /* Mirrors the ip_tunnel_need_metadata() call made in fib_newrule(). */
        if (rule->tun_id)
                ip_tunnel_unneed_metadata();

        list_del_rcu(&rule->list);

        /* Undo the goto accounting done when the rule was added. */
        if (rule->action == FR_ACT_GOTO) {
                ops->nr_goto_rules--;
                if (rtnl_dereference(rule->ctarget) == NULL)
                        ops->unresolved_rules--;
        }

        /*
         * Check whether the removed rule is the target of any remaining
         * rule. If so, retarget those rules to the next rule with the same
         * preference, or mark them unresolved when there is none. Because
         * this walk can be expensive, it is only done while goto rules
         * (not counting the one just removed) are still installed.
         */
        if (ops->nr_goto_rules > 0) {
                struct fib_rule *n, *r;

                /*
                 * n is the entry that followed the removed rule; it is only
                 * a valid replacement if it is a real entry (not the list
                 * head) with the same pref.
                 */
                n = list_next_entry(rule, list);
                if (&n->list == &ops->rules_list || n->pref != rule->pref)
                        n = NULL;
                list_for_each_entry(r, &ops->rules_list, list) {
                        if (rtnl_dereference(r->ctarget) != rule)
                                continue;
                        rcu_assign_pointer(r->ctarget, n);
                        if (!n)
                                ops->unresolved_rules++;
                }
        }

        /* The notifier result is ignored: the rule is already unlinked. */
        call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL);

        if (!rtnl_held)
                rtnl_net_unlock(net);

        notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
        /* NOTE(review): presumably drops the reference the list held -- confirm. */
        fib_rule_put(rule);
        flush_route_cache(ops);
        rules_ops_put(ops);
        kfree(nlrule);
        return 0;

errout_free:
        if (!rtnl_held)
                rtnl_net_unlock(net);
        kfree(nlrule);
errout:
        rules_ops_put(ops);
        return err;
}
EXPORT_SYMBOL_GPL(fib_delrule);

/*
 * RTM_DELRULE doit handler: forwards the request to fib_delrule() for the
 * namespace of the sending socket, with rtnl_held == false so that
 * fib_delrule() takes the lock itself.
 */
static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);

        return fib_delrule(net, skb, nlh, extack, false);
}

/*
 * Upper bound, in bytes, of the netlink payload that fib_nl_fill_rule()
 * may produce for @rule: the aligned fib_rule_hdr, one slot for every
 * generic attribute that function can emit, plus whatever the family
 * reports through the optional ops->nlmsg_payload() hook.
 *
 * Every attribute fib_nl_fill_rule() can put must be accounted for here,
 * including FRA_GOTO and FRA_L3MDEV; otherwise notify_rule_change() can
 * run out of room and trip its -EMSGSIZE warning.
 */
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
                                         struct fib_rule *rule)
{
        size_t payload = NLMSG_ALIGN(sizeof(struct fib_rule_hdr))
                         + nla_total_size(IFNAMSIZ) /* FRA_IIFNAME */
                         + nla_total_size(IFNAMSIZ) /* FRA_OIFNAME */
                         + nla_total_size(4) /* FRA_PRIORITY */
                         + nla_total_size(4) /* FRA_TABLE */
                         + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */
                         + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */
                         + nla_total_size(4) /* FRA_FWMARK */
                         + nla_total_size(4) /* FRA_FWMASK */
                         + nla_total_size(4) /* FRA_GOTO */
                         + nla_total_size_64bit(8) /* FRA_TUN_ID */
                         + nla_total_size(1) /* FRA_L3MDEV */
                         + nla_total_size(sizeof(struct fib_kuid_range)) /* uid range */
                         + nla_total_size(1) /* FRA_PROTOCOL */
                         + nla_total_size(1) /* FRA_IP_PROTO */
                         + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
                         + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */
                         + nla_total_size(2) /* FRA_SPORT_MASK */
                         + nla_total_size(2); /* FRA_DPORT_MASK */

        /* Family-specific attributes are sized by the optional hook. */
        if (ops->nlmsg_payload)
                payload += ops->nlmsg_payload(rule);

        return payload;
}

/*
 * Serialise @rule into @skb as a single netlink message of type @type.
 *
 * The generic header fields and FRA_* attributes are written here; the
 * family-specific part is appended by ops->fill(). Optional attributes
 * are emitted only when the corresponding rule field is set.
 *
 * Returns 0 on success. If the header or any attribute does not fit, or
 * ops->fill() fails, the partially built message is cancelled and
 * -EMSGSIZE is returned.
 */
static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
                            u32 pid, u32 seq, int type, int flags,
                            struct fib_rules_ops *ops)
{
        struct fib_rule_hdr *hdr;
        struct nlmsghdr *msg;

        msg = nlmsg_put(skb, pid, seq, type, sizeof(*hdr), flags);
        if (!msg)
                return -EMSGSIZE;

        /* Fixed header; tables that do not fit in 8 bits use the compat id. */
        hdr = nlmsg_data(msg);
        hdr->family = ops->family;
        if (rule->table < 256)
                hdr->table = rule->table;
        else
                hdr->table = RT_TABLE_COMPAT;
        hdr->res1 = 0;
        hdr->res2 = 0;
        hdr->action = rule->action;
        hdr->flags = rule->flags;

        /* Attributes that are always present. */
        if (nla_put_u32(skb, FRA_TABLE, rule->table))
                goto cancel;
        if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
                goto cancel;
        if (nla_put_u8(skb, FRA_PROTOCOL, rule->proto))
                goto cancel;

        /* A goto rule without a resolved target is reported as such. */
        if (rule->action == FR_ACT_GOTO &&
            rcu_access_pointer(rule->ctarget) == NULL)
                hdr->flags |= FIB_RULE_UNRESOLVED;

        /* Interface names; an ifindex of -1 means no device is attached. */
        if (rule->iifname[0]) {
                if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
                        goto cancel;
                if (READ_ONCE(rule->iifindex) == -1)
                        hdr->flags |= FIB_RULE_IIF_DETACHED;
        }

        if (rule->oifname[0]) {
                if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
                        goto cancel;
                if (READ_ONCE(rule->oifindex) == -1)
                        hdr->flags |= FIB_RULE_OIF_DETACHED;
        }

        /* Optional attributes, emitted in a fixed order. */
        if (rule->pref && nla_put_u32(skb, FRA_PRIORITY, rule->pref))
                goto cancel;
        if (rule->mark && nla_put_u32(skb, FRA_FWMARK, rule->mark))
                goto cancel;
        if ((rule->mark_mask || rule->mark) &&
            nla_put_u32(skb, FRA_FWMASK, rule->mark_mask))
                goto cancel;
        if (rule->target && nla_put_u32(skb, FRA_GOTO, rule->target))
                goto cancel;
        if (rule->tun_id &&
            nla_put_be64(skb, FRA_TUN_ID, rule->tun_id, FRA_PAD))
                goto cancel;
        if (rule->l3mdev && nla_put_u8(skb, FRA_L3MDEV, rule->l3mdev))
                goto cancel;
        if (uid_range_set(&rule->uid_range) &&
            nla_put_uid_range(skb, &rule->uid_range))
                goto cancel;
        if (fib_rule_port_range_set(&rule->sport_range) &&
            nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range))
                goto cancel;
        if (rule->sport_mask &&
            nla_put_u16(skb, FRA_SPORT_MASK, rule->sport_mask))
                goto cancel;
        if (fib_rule_port_range_set(&rule->dport_range) &&
            nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range))
                goto cancel;
        if (rule->dport_mask &&
            nla_put_u16(skb, FRA_DPORT_MASK, rule->dport_mask))
                goto cancel;
        if (rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto))
                goto cancel;

        /* -1 means no interface-group suppression is configured. */
        if (rule->suppress_ifgroup != -1 &&
            nla_put_u32(skb, FRA_SUPPRESS_IFGROUP, rule->suppress_ifgroup))
                goto cancel;

        /* Let the address family append its own header fields/attributes. */
        if (ops->fill(rule, skb, hdr) < 0)
                goto cancel;

        nlmsg_end(skb, msg);
        return 0;

cancel:
        nlmsg_cancel(skb, msg);
        return -EMSGSIZE;
}

/*
 * Dump the rules of one family into @skb, resuming at the rule index
 * stored in cb->args[1] and writing the index reached back to it.
 *
 * The walk stops at the first rule fib_nl_fill_rule() fails on; that
 * error is returned (0 when every remaining rule was written). The
 * reference on @ops is always dropped via rules_ops_put() before
 * returning, so the caller must have taken one.
 */
static int dump_rules(struct sk_buff *skb, struct netlink_callback *cb,
                      struct fib_rules_ops *ops)
{
        struct fib_rule *cur;
        int pos = 0;
        int rc = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(cur, &ops->rules_list, list) {
                /* Entries before the resume point were sent earlier. */
                if (pos >= cb->args[1]) {
                        rc = fib_nl_fill_rule(skb, cur,
                                              NETLINK_CB(cb->skb).portid,
                                              cb->nlh->nlmsg_seq, RTM_NEWRULE,
                                              NLM_F_MULTI, ops);
                        if (rc)
                                break;
                }
                pos++;
        }
        rcu_read_unlock();

        cb->args[1] = pos;
        rules_ops_put(ops);

        return rc;
}

static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
                                   struct netlink_ext_ack *extack)
{
        struct fib_rule_hdr *frh;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
                NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
                return -EINVAL;
        }

        frh = nlmsg_data(nlh);
        if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
            frh->res1 || frh->res2 || frh->action || frh->flags) {
                NL_SET_ERR_MSG(extack,
                               "Invalid values in header for fib rule dump request");
                return -EINVAL;
        }

        if (nlmsg_attrlen(nlh, sizeof(*frh))) {
                NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request");
                return -EINVAL;
        }

        return 0;
}

/*
 * RTM_GETRULE dump handler.
 *
 * With a specific family in the request only that family's rules are
 * dumped. With AF_UNSPEC every registered family is walked; cb->args[0]
 * records the family index to resume from and cb->args[1] (maintained by
 * dump_rules()) the rule index within that family.
 *
 * Returns the result of the last dump_rules() call, -EAFNOSUPPORT for an
 * unknown family, or the strict-check validation error.
 */
static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        struct fib_rules_ops *ops;
        int family, pos = 0, rc;

        if (cb->strict_check) {
                rc = fib_valid_dumprule_req(nlh, cb->extack);
                if (rc < 0)
                        return rc;
        }

        family = rtnl_msg_family(nlh);
        if (family != AF_UNSPEC) {
                /* Protocol specific dump request */
                ops = lookup_rules_ops(net, family);
                if (!ops)
                        return -EAFNOSUPPORT;

                /* dump_rules() drops the ops reference. */
                return dump_rules(skb, cb, ops);
        }

        rc = 0;
        rcu_read_lock();
        list_for_each_entry_rcu(ops, &net->rules_ops, list) {
                /*
                 * Families before the resume point, or whose module
                 * reference cannot be taken, are only counted.
                 */
                if (pos >= cb->args[0] && try_module_get(ops->owner)) {
                        rc = dump_rules(skb, cb, ops);
                        if (rc < 0)
                                break;

                        /* Family finished: next one starts at rule 0. */
                        cb->args[1] = 0;
                }
                pos++;
        }
        rcu_read_unlock();
        cb->args[0] = pos;

        return rc;
}

/*
 * Send a netlink notification of type @event describing @rule to the
 * family's multicast group (ops->nlgroup).
 *
 * If the message cannot be allocated or filled, the error is recorded
 * with rtnl_set_sk_err() instead (-ENOMEM for an allocation failure).
 */
static void notify_rule_change(int event, struct fib_rule *rule,
                               struct fib_rules_ops *ops, struct nlmsghdr *nlh,
                               u32 pid)
{
        struct net *net = ops->fro_net;
        struct sk_buff *msg;
        int rc;

        msg = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
        if (!msg) {
                rtnl_set_sk_err(net, ops->nlgroup, -ENOMEM);
                return;
        }

        rc = fib_nl_fill_rule(msg, rule, pid, nlh->nlmsg_seq, event, 0, ops);
        if (rc < 0) {
                /* -EMSGSIZE implies BUG in fib_rule_nlmsg_size() */
                WARN_ON(rc == -EMSGSIZE);
                kfree_skb(msg);
                rtnl_set_sk_err(net, ops->nlgroup, rc);
                return;
        }

        rtnl_notify(msg, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
}

/*
 * Bind detached rules to @dev: every rule in @rules whose input or output
 * interface is currently unbound (index -1) and whose stored interface
 * name equals dev->name gets the device's ifindex and l3-master status.
 */
static void attach_rules(struct list_head *rules, struct net_device *dev)
{
        struct fib_rule *r;

        list_for_each_entry(r, rules, list) {
                if (r->iifindex == -1 && !strcmp(dev->name, r->iifname)) {
                        WRITE_ONCE(r->iifindex, dev->ifindex);
                        WRITE_ONCE(r->iif_is_l3_master,
                                   netif_is_l3_master(dev));
                }

                if (r->oifindex == -1 && !strcmp(dev->name, r->oifname)) {
                        WRITE_ONCE(r->oifindex, dev->ifindex);
                        WRITE_ONCE(r->oif_is_l3_master,
                                   netif_is_l3_master(dev));
                }
        }
}

/*
 * Unbind rules from @dev: every rule in @rules whose input or output
 * ifindex equals the device's ifindex is reset to -1 (unbound) and its
 * l3-master flag is cleared.
 */
static void detach_rules(struct list_head *rules, struct net_device *dev)
{
        struct fib_rule *r;

        list_for_each_entry(r, rules, list) {
                if (r->iifindex == dev->ifindex) {
                        WRITE_ONCE(r->iifindex, -1);
                        WRITE_ONCE(r->iif_is_l3_master, false);
                }

                if (r->oifindex == dev->ifindex) {
                        WRITE_ONCE(r->oifindex, -1);
                        WRITE_ONCE(r->oif_is_l3_master, false);
                }
        }
}


static int fib_rules_event(struct notifier_block *this, unsigned long event,
                           void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct fib_rules_ops *ops;

        ASSERT_RTNL();

        switch (event) {
        case NETDEV_REGISTER:
                list_for_each_entry(ops, &net->rules_ops, list)
                        attach_rules(&ops->rules_list, dev);
                break;

        case NETDEV_CHANGENAME:
                list_for_each_entry(ops, &net->rules_ops, list) {
                        detach_rules(&ops->rules_list, dev);
                        attach_rules(&ops->rules_list, dev);
                }
                break;

        case NETDEV_UNREGISTER:
                list_for_each_entry(ops, &net->rules_ops, list)
                        detach_rules(&ops->rules_list, dev);
                break;
        }

        return NOTIFY_DONE;
}

/*
 * Netdevice notifier block that routes device events to fib_rules_event();
 * registered by fib_rules_init().
 */
static struct notifier_block fib_rules_notifier = {
        .notifier_call = fib_rules_event,
};

/*
 * Per-namespace init: set up the lock and the (initially empty) list of
 * registered rule ops for @net. Always returns 0.
 */
static int __net_init fib_rules_net_init(struct net *net)
{
        spin_lock_init(&net->rules_mod_lock);
        INIT_LIST_HEAD(&net->rules_ops);

        return 0;
}

/*
 * Per-namespace exit: warn (once) if any rule ops are still registered
 * on @net when the namespace goes away.
 */
static void __net_exit fib_rules_net_exit(struct net *net)
{
        bool ops_left = !list_empty(&net->rules_ops);

        WARN_ON_ONCE(ops_left);
}

/*
 * Per-namespace hooks for the rules core: fib_rules_net_init() on
 * namespace creation, fib_rules_net_exit() on teardown. Registered by
 * fib_rules_init().
 */
static struct pernet_operations fib_rules_net_ops = {
        .init = fib_rules_net_init,
        .exit = fib_rules_net_exit,
};

/*
 * rtnetlink handlers for rule messages: RTM_NEWRULE and RTM_DELRULE are
 * served by doit callbacks, RTM_GETRULE by a dump callback.
 *
 * NOTE(review): RTNL_FLAG_DOIT_PERNET presumably means the core does not
 * take the lock for these doit handlers, matching fib_newrule() and
 * fib_delrule() calling rtnl_net_lock() themselves; RTNL_FLAG_DUMP_UNLOCKED
 * presumably means the dump runs without the lock, matching the RCU
 * walk in fib_nl_dumprule() -- confirm against the rtnetlink core.
 */
static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = {
        {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule,
         .flags = RTNL_FLAG_DOIT_PERNET},
        {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule,
         .flags = RTNL_FLAG_DOIT_PERNET},
        {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule,
         .flags = RTNL_FLAG_DUMP_UNLOCKED},
};

/*
 * Boot-time initialisation of the rules core: register the rtnetlink
 * handlers, the per-namespace operations and the netdevice notifier, in
 * that order. If a later step fails, the earlier registrations are
 * undone in reverse order and the error is returned; 0 on success.
 */
static int __init fib_rules_init(void)
{
        int rc;

        rtnl_register_many(fib_rules_rtnl_msg_handlers);

        rc = register_pernet_subsys(&fib_rules_net_ops);
        if (rc >= 0) {
                rc = register_netdevice_notifier(&fib_rules_notifier);
                if (rc >= 0)
                        return 0;

                /* Notifier registration failed: undo the pernet step. */
                unregister_pernet_subsys(&fib_rules_net_ops);
        }

        rtnl_unregister_many(fib_rules_rtnl_msg_handlers);
        return rc;
}

subsys_initcall(fib_rules_init);



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 









    3 






































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

/* Timer defaults, expressed in jiffies (120 * HZ is 120 seconds).
 * NOTE(review): presumably the defaults for resilient nexthop groups; the
 * code consuming them is further down the file - confirm there.
 */
#define NH_RES_DEFAULT_IDLE_TIMER        (120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER        0        /* No forced rebalancing. */

/* Forward declaration; the definition appears later in this file. */
static void remove_nexthop(struct net *net, struct nexthop *nh,
                           struct nl_info *nlinfo);

/* Geometry of the per-netns device hash: nh_dev_hashfn() folds a value into
 * NH_DEV_HASHBITS bits, giving an index below NH_DEV_HASHSIZE.
 */
#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)

/* Every NHA_OP_FLAGS bit permitted by the get and dump policies below. */
#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS |                \
                               NHA_OP_FLAG_DUMP_HW_STATS)

/* Per-attribute validation rules: each listed NHA_* attribute must carry the
 * given netlink type. NHA_HW_STATS_ENABLE is a u32 bounded above by 'true'.
 * NOTE(review): the name suggests this validates new-nexthop requests; the
 * parse call is not in this part of the file - confirm there.
 */
static const struct nla_policy rtm_nh_policy_new[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_GROUP]                = { .type = NLA_BINARY },
        [NHA_GROUP_TYPE]        = { .type = NLA_U16 },
        [NHA_BLACKHOLE]                = { .type = NLA_FLAG },
        [NHA_OIF]                = { .type = NLA_U32 },
        [NHA_GATEWAY]                = { .type = NLA_BINARY },
        [NHA_ENCAP_TYPE]        = { .type = NLA_U16 },
        [NHA_ENCAP]                = { .type = NLA_NESTED },
        [NHA_FDB]                = { .type = NLA_FLAG },
        [NHA_RES_GROUP]                = { .type = NLA_NESTED },
        [NHA_HW_STATS_ENABLE]        = NLA_POLICY_MAX(NLA_U32, true),
};

/* Per-attribute validation rules: a u32 NHA_ID, plus a u32 NHA_OP_FLAGS
 * restricted to the bits in NHA_OP_FLAGS_DUMP_ALL.
 * NOTE(review): presumably applied to get-nexthop requests - confirm at the
 * parse call site.
 */
static const struct nla_policy rtm_nh_policy_get[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_OP_FLAGS]                = NLA_POLICY_MASK(NLA_U32,
                                                  NHA_OP_FLAGS_DUMP_ALL),
};

/* Per-attribute validation rules: only a u32 NHA_ID is described.
 * NOTE(review): presumably applied to delete-nexthop requests - confirm at
 * the parse call site.
 */
static const struct nla_policy rtm_nh_policy_del[] = {
        [NHA_ID]                = { .type = NLA_U32 },
};

/* Per-attribute validation rules: u32 NHA_OIF and NHA_MASTER, flag-typed
 * NHA_GROUPS and NHA_FDB, and a u32 NHA_OP_FLAGS restricted to the bits in
 * NHA_OP_FLAGS_DUMP_ALL.
 * NOTE(review): presumably these are the filters of a nexthop dump request -
 * confirm at the parse call site.
 */
static const struct nla_policy rtm_nh_policy_dump[] = {
        [NHA_OIF]                = { .type = NLA_U32 },
        [NHA_GROUPS]                = { .type = NLA_FLAG },
        [NHA_MASTER]                = { .type = NLA_U32 },
        [NHA_FDB]                = { .type = NLA_FLAG },
        [NHA_OP_FLAGS]                = NLA_POLICY_MASK(NLA_U32,
                                                  NHA_OP_FLAGS_DUMP_ALL),
};

/* Per-attribute validation rules for NHA_RES_GROUP_* attributes: a u16 bucket
 * count and two u32 timers.
 * NOTE(review): presumably applied to the nested NHA_RES_GROUP attribute of a
 * new-nexthop request - confirm at the parse call site.
 */
static const struct nla_policy rtm_nh_res_policy_new[] = {
        [NHA_RES_GROUP_BUCKETS]                        = { .type = NLA_U16 },
        [NHA_RES_GROUP_IDLE_TIMER]                = { .type = NLA_U32 },
        [NHA_RES_GROUP_UNBALANCED_TIMER]        = { .type = NLA_U32 },
};

/* Per-attribute validation rules: u32 NHA_ID, NHA_OIF and NHA_MASTER, plus a
 * nested NHA_RES_BUCKET.
 * NOTE(review): presumably applied to bucket dump requests - confirm at the
 * parse call site.
 */
static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_OIF]                = { .type = NLA_U32 },
        [NHA_MASTER]                = { .type = NLA_U32 },
        [NHA_RES_BUCKET]        = { .type = NLA_NESTED },
};

/* Per-attribute validation rules: a u32 NHA_RES_BUCKET_NH_ID.
 * NOTE(review): presumably applied to the nested NHA_RES_BUCKET attribute of
 * a bucket dump request - confirm at the parse call site.
 */
static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
        [NHA_RES_BUCKET_NH_ID]        = { .type = NLA_U32 },
};

/* Per-attribute validation rules: a u32 NHA_ID and a nested NHA_RES_BUCKET.
 * NOTE(review): presumably applied to get-bucket requests - confirm at the
 * parse call site.
 */
static const struct nla_policy rtm_nh_policy_get_bucket[] = {
        [NHA_ID]                = { .type = NLA_U32 },
        [NHA_RES_BUCKET]        = { .type = NLA_NESTED },
};

/* Per-attribute validation rules: a u16 NHA_RES_BUCKET_INDEX.
 * NOTE(review): presumably applied to the nested NHA_RES_BUCKET attribute of
 * a get-bucket request - confirm at the parse call site.
 */
static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
        [NHA_RES_BUCKET_INDEX]        = { .type = NLA_U16 },
};

static bool nexthop_notifiers_is_empty(struct net *net)
{
        return !net->nexthop.notifier_chain.head;
}

/* Copy the notifier-visible properties of @nhi into @nh_info: parent nexthop
 * id, reject/fdb/encap flags, device, and the gateway address for the
 * gateway's address family (nothing is copied for other families).
 */
static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
                               const struct nh_info *nhi)
{
        nh_info->id = nhi->nh_parent->id;
        nh_info->is_reject = nhi->reject_nh;
        nh_info->is_fdb = nhi->fdb_nh;
        nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;

        nh_info->dev = nhi->fib_nhc.nhc_dev;
        nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;

        switch (nh_info->gw_family) {
        case AF_INET:
                nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
                break;
        case AF_INET6:
                nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
                break;
        default:
                /* No gateway address to report. */
                break;
        }
}

/* Allocate info->nh and fill it with the description of the single
 * nexthop @nh. nh->nh_info is read with rtnl_dereference().
 *
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int nh_notifier_single_info_init(struct nh_notifier_info *info,
                                        const struct nexthop *nh)
{
        struct nh_info *src = rtnl_dereference(nh->nh_info);

        info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;

        info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
        if (info->nh) {
                __nh_notifier_single_info_init(info->nh, src);
                return 0;
        }

        return -ENOMEM;
}

/* Free info->nh, the buffer allocated by nh_notifier_single_info_init(). */
static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
        kfree(info->nh);
}

/* Build a NH_NOTIFIER_INFO_TYPE_GRP description of @nhg in info->nh_grp,
 * with one entry (weight plus single-nexthop details) per group member.
 * Member nh_info pointers are read with rtnl_dereference().
 *
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
                                       struct nh_group *nhg)
{
        u16 count = nhg->num_nh;
        int i = 0;

        info->type = NH_NOTIFIER_INFO_TYPE_GRP;
        info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, count),
                               GFP_KERNEL);
        if (!info->nh_grp)
                return -ENOMEM;

        info->nh_grp->num_nh = count;
        info->nh_grp->hw_stats = nhg->hw_stats;
        info->nh_grp->is_fdb = nhg->fdb_nh;

        while (i < count) {
                struct nh_grp_entry *member = &nhg->nh_entries[i];
                struct nh_info *nhi = rtnl_dereference(member->nh->nh_info);

                info->nh_grp->nh_entries[i].weight = member->weight;
                __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
                                               nhi);
                i++;
        }

        return 0;
}

/* Build a NH_NOTIFIER_INFO_TYPE_RES_TABLE description of @nhg's resilient
 * table in info->nh_res_table, with one single-nexthop entry per bucket.
 * The table, bucket entries and nh_info are read with rtnl_dereference().
 *
 * Returns 0 on success, -ENOMEM if the (vmalloc'ed) buffer cannot be
 * allocated.
 */
static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
                                           struct nh_group *nhg)
{
        struct nh_res_table *table = rtnl_dereference(nhg->res_table);
        u16 nbuckets = table->num_nh_buckets;
        unsigned long bytes;
        u16 idx;

        info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;

        bytes = struct_size(info->nh_res_table, nhs, nbuckets);
        info->nh_res_table = __vmalloc(bytes,
                                       GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
        if (!info->nh_res_table)
                return -ENOMEM;

        info->nh_res_table->hw_stats = nhg->hw_stats;
        info->nh_res_table->num_nh_buckets = nbuckets;

        for (idx = 0; idx < nbuckets; idx++) {
                struct nh_grp_entry *nhge;
                struct nh_info *nhi;

                nhge = rtnl_dereference(table->nh_buckets[idx].nh_entry);
                nhi = rtnl_dereference(nhge->nh->nh_info);
                __nh_notifier_single_info_init(&info->nh_res_table->nhs[idx],
                                               nhi);
        }

        return 0;
}

/* Describe the group nexthop @nh in @info using the format that matches the
 * group's kind: the multipath format when hash_threshold is set, otherwise
 * the resilient-table format when resilient is set.
 *
 * Returns the helper's result, or -EINVAL if the group is of neither kind.
 */
static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
                                     const struct nexthop *nh)
{
        struct nh_group *grp = rtnl_dereference(nh->nh_grp);
        int err = -EINVAL;

        if (grp->hash_threshold)
                err = nh_notifier_mpath_info_init(info, grp);
        else if (grp->resilient)
                err = nh_notifier_res_table_info_init(info, grp);

        return err;
}

/* Release the group description built by nh_notifier_grp_info_init(). The
 * multipath buffer came from kzalloc() and the resilient-table buffer from
 * __vmalloc(), so each is released with the matching free routine.
 */
static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
                                      const struct nexthop *nh)
{
        struct nh_group *grp = rtnl_dereference(nh->nh_grp);

        if (grp->hash_threshold) {
                kfree(info->nh_grp);
                return;
        }

        if (grp->resilient)
                vfree(info->nh_res_table);
}

/* Record @nh's id in @info and fill in the type-specific description,
 * choosing the group or single-nexthop initializer based on nh->is_group.
 *
 * Returns whatever the chosen initializer returns.
 */
static int nh_notifier_info_init(struct nh_notifier_info *info,
                                 const struct nexthop *nh)
{
        info->id = nh->id;

        if (!nh->is_group)
                return nh_notifier_single_info_init(info, nh);

        return nh_notifier_grp_info_init(info, nh);
}

/* Release what nh_notifier_info_init() allocated for @nh, dispatching on
 * nh->is_group the same way the initializer does.
 */
static void nh_notifier_info_fini(struct nh_notifier_info *info,
                                  const struct nexthop *nh)
{
        if (!nh->is_group) {
                nh_notifier_single_info_fini(info);
                return;
        }

        nh_notifier_grp_info_fini(info, nh);
}

/* Notify every listener on @net's nexthop notifier chain of @event_type
 * for @nh. Asserts that RTNL is held.
 *
 * Returns 0 when the chain is empty; otherwise the info-initialization
 * error (also reported through @extack) or the chain's result converted
 * with notifier_to_errno().
 */
static int call_nexthop_notifiers(struct net *net,
                                  enum nexthop_event_type event_type,
                                  struct nexthop *nh,
                                  struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .extack = extack,
                .net = net,
        };
        int rc;

        ASSERT_RTNL();

        if (nexthop_notifiers_is_empty(net))
                return 0;

        rc = nh_notifier_info_init(&info, nh);
        if (rc) {
                NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
                return rc;
        }

        rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                          event_type, &info);
        rc = notifier_to_errno(rc);

        /* The description is only needed for the duration of the call. */
        nh_notifier_info_fini(&info, nh);

        return rc;
}

/* Compute the idle timer value, in milliseconds, to hand to listeners of a
 * bucket replacement for the group identified by info->id.
 *
 * Stores 0 in *p_idle_timer_ms when @force is set; otherwise looks the group
 * up under rcu_read_lock() and stores its resilient table's idle timer.
 *
 * Returns 0 on success, -EINVAL if no nexthop with that id exists.
 */
static int
nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
                                      bool force, unsigned int *p_idle_timer_ms)
{
        struct nh_res_table *table;
        struct nh_group *grp;
        struct nexthop *nh;
        int err = 0;

        /* When 'force' is false, nexthop bucket replacement is performed
         * because the bucket was deemed to be idle. In this case, capable
         * listeners can choose to perform an atomic replacement: The bucket is
         * only replaced if it is inactive. However, if the idle timer interval
         * is smaller than the interval in which a listener is querying
         * buckets' activity from the device, then atomic replacement should
         * not be tried. Pass the idle timer value to listeners, so that they
         * could determine which type of replacement to perform.
         */
        if (force) {
                *p_idle_timer_ms = 0;
                return 0;
        }

        rcu_read_lock();

        nh = nexthop_find_by_id(info->net, info->id);
        if (nh) {
                grp = rcu_dereference(nh->nh_grp);
                table = rcu_dereference(grp->res_table);
                *p_idle_timer_ms = jiffies_to_msecs(table->idle_timer);
        } else {
                err = -EINVAL;
        }

        rcu_read_unlock();

        return err;
}

/* Build a NH_NOTIFIER_INFO_TYPE_RES_BUCKET description in
 * info->nh_res_bucket: bucket index, force flag, idle timer in milliseconds,
 * and the single-nexthop details of the old (@oldi) and new (@newi) nexthop.
 *
 * Returns 0 on success, the idle-timer lookup error, or -ENOMEM if the
 * allocation fails.
 */
static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
                                            u16 bucket_index, bool force,
                                            struct nh_info *oldi,
                                            struct nh_info *newi)
{
        unsigned int timer_ms;
        int rc;

        rc = nh_notifier_res_bucket_idle_timer_get(info, force, &timer_ms);
        if (rc)
                return rc;

        info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
        info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
                                      GFP_KERNEL);
        if (!info->nh_res_bucket)
                return -ENOMEM;

        __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
        __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
        info->nh_res_bucket->force = force;
        info->nh_res_bucket->idle_timer_ms = timer_ms;
        info->nh_res_bucket->bucket_index = bucket_index;

        return 0;
}

/* Free info->nh_res_bucket, the buffer allocated by
 * nh_notifier_res_bucket_info_init().
 */
static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
{
        kfree(info->nh_res_bucket);
}

/* Send NEXTHOP_EVENT_BUCKET_REPLACE for bucket @bucket_index of group
 * @nhg_id to every listener on @net's nexthop notifier chain, describing the
 * old (@oldi) and new (@newi) nexthop of the bucket.
 *
 * Returns 0 when the chain is empty; otherwise the info-initialization
 * error or the chain's result converted with notifier_to_errno().
 */
static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
                                               u16 bucket_index, bool force,
                                               struct nh_info *oldi,
                                               struct nh_info *newi,
                                               struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .id = nhg_id,
                .extack = extack,
                .net = net,
        };
        int rc;

        if (nexthop_notifiers_is_empty(net))
                return 0;

        rc = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
                                              oldi, newi);
        if (rc)
                return rc;

        rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                          NEXTHOP_EVENT_BUCKET_REPLACE, &info);
        rc = notifier_to_errno(rc);

        nh_notifier_res_bucket_info_fini(&info);

        return rc;
}

/* There are three users of RES_TABLE, and NHs etc. referenced from there:
 *
 * 1) a collection of callbacks for NH maintenance. This operates under
 *    RTNL,
 * 2) the delayed work that gradually balances the resilient table,
 * 3) and nexthop_select_path(), operating under RCU.
 *
 * Both the delayed work and the RTNL block are writers, and need to
 * maintain mutual exclusion. Since there are only two and well-known
 * writers for each table, the RTNL code can make sure it has exclusive
 * access thus:
 *
 * - Have the DW operate without locking;
 * - synchronously cancel the DW;
 * - do the writing;
 * - if the write was not actually a delete, call upkeep, which schedules
 *   DW again if necessary.
 *
 * The functions that are always called from the RTNL context use
 * rtnl_dereference(). The functions that can also be called from the DW do
 * a raw dereference and rely on the above mutual exclusion scheme.
 *
 * nh_res_dereference() is that raw dereference: it expands to
 * rcu_dereference_raw().
 */
#define nh_res_dereference(p) (rcu_dereference_raw(p))

/* Wrapper around __call_nexthop_res_bucket_notifiers() that takes the old
 * and new nexthops themselves. Their nh_info is fetched with
 * nh_res_dereference() and passed on.
 */
static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
                                             u16 bucket_index, bool force,
                                             struct nexthop *old_nh,
                                             struct nexthop *new_nh,
                                             struct netlink_ext_ack *extack)
{
        struct nh_info *from, *to;

        from = nh_res_dereference(old_nh->nh_info);
        to = nh_res_dereference(new_nh->nh_info);

        return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
                                                   force, from, to, extack);
}

/* Send NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE for the group @nh to every
 * listener on @net's nexthop notifier chain. Asserts that RTNL is held.
 *
 * Returns 0 when the chain is empty; otherwise the info-initialization
 * error (also reported through @extack) or the chain's result converted
 * with notifier_to_errno().
 */
static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
                                            struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .id = nh->id,
                .extack = extack,
                .net = net,
        };
        int rc;

        ASSERT_RTNL();

        if (nexthop_notifiers_is_empty(net))
                return 0;

        /* At this point, the nexthop buckets are still not populated. Only
         * emit a notification with the logical nexthops, so that a listener
         * could potentially veto it in case of unsupported configuration.
         */
        rc = nh_notifier_mpath_info_init(&info, rtnl_dereference(nh->nh_grp));
        if (rc) {
                NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
                return rc;
        }

        rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                          NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
                                          &info);
        rc = notifier_to_errno(rc);

        kfree(info.nh_grp);

        return rc;
}

/* Deliver @event_type for @nh to the single notifier block @nb, instead of
 * walking the whole chain.
 *
 * Returns the info-initialization error, or the callback's result converted
 * with notifier_to_errno().
 */
static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
                                 enum nexthop_event_type event_type,
                                 struct nexthop *nh,
                                 struct netlink_ext_ack *extack)
{
        struct nh_notifier_info info = {
                .extack = extack,
                .net = net,
        };
        int rc;

        rc = nh_notifier_info_init(&info, nh);
        if (rc)
                return rc;

        rc = notifier_to_errno(nb->notifier_call(nb, event_type, &info));

        nh_notifier_info_fini(&info, nh);

        return rc;
}

/* Fold @val into an index below NH_DEV_HASHSIZE by XOR-ing @val with itself
 * shifted right by one and by two hash widths, then masking.
 */
static unsigned int nh_dev_hashfn(unsigned int val)
{
        unsigned int folded = val;

        folded ^= val >> NH_DEV_HASHBITS;
        folded ^= val >> (NH_DEV_HASHBITS * 2);

        return folded & (NH_DEV_HASHSIZE - 1);
}

/* Link @nhi into @net's device hash, in the bucket selected by hashing the
 * ifindex of its device (fib_nhc.nhc_dev).
 *
 * A missing device triggers a warning and leaves @nhi unhashed: the old code
 * warned and then dereferenced the NULL pointer anyway.
 */
static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
{
        struct net_device *dev = nhi->fib_nhc.nhc_dev;
        struct hlist_head *head;
        unsigned int hash;

        if (WARN_ON(!dev))
                return;

        hash = nh_dev_hashfn(dev->ifindex);
        head = &net->nexthop.devhash[hash];
        hlist_add_head(&nhi->dev_hash, head);
}

/* Free the group data hanging off @nh: for every member, free its per-CPU
 * stats and drop the reference on the member nexthop; then free the
 * resilient table (if the group is resilient), the spare group and the
 * group itself. Warns if a member is still on a list or if the group is its
 * own spare.
 */
static void nexthop_free_group(struct nexthop *nh)
{
        struct nh_group *nhg = rcu_dereference_raw(nh->nh_grp);
        int idx = 0;

        while (idx < nhg->num_nh) {
                struct nh_grp_entry *member = &nhg->nh_entries[idx++];

                WARN_ON(!list_empty(&member->nh_list));
                free_percpu(member->stats);
                nexthop_put(member->nh);
        }

        WARN_ON(nhg->spare == nhg);

        if (nhg->resilient)
                vfree(rcu_dereference_raw(nhg->res_table));

        kfree(nhg->spare);
        kfree(nhg);
}

/* Free the nh_info of the single nexthop @nh, first releasing the embedded
 * IPv4 or IPv6 nexthop according to the info's address family. Other
 * families have nothing extra to release.
 */
static void nexthop_free_single(struct nexthop *nh)
{
        struct nh_info *nhi = rcu_dereference_raw(nh->nh_info);

        if (nhi->family == AF_INET)
                fib_nh_release(nh->net, &nhi->fib_nh);
        else if (nhi->family == AF_INET6)
                ipv6_stub->fib6_nh_release(&nhi->fib6_nh);

        kfree(nhi);
}

/* RCU callback: recover the nexthop that embeds @head, free its group or
 * single-nexthop payload, then free the nexthop itself.
 */
void nexthop_free_rcu(struct rcu_head *head)
{
        struct nexthop *nh = container_of(head, struct nexthop, rcu);

        if (!nh->is_group)
                nexthop_free_single(nh);
        else
                nexthop_free_group(nh);

        kfree(nh);
}
EXPORT_SYMBOL_GPL(nexthop_free_rcu);

static struct nexthop *nexthop_alloc(void)
{
        struct nexthop *nh;

        nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
        if (nh) {
                INIT_LIST_HEAD(&nh->fi_list);
                INIT_LIST_HEAD(&nh->f6i_list);
                INIT_LIST_HEAD(&nh->grp_list);
                INIT_LIST_HEAD(&nh->fdb_list);
        }
        return nh;
}

/* Allocate a zeroed nexthop group with room for @num_nh entries and record
 * that count in the group.
 *
 * Returns the new group, or NULL if the allocation fails.
 */
static struct nh_group *nexthop_grp_alloc(u16 num_nh)
{
        struct nh_group *nhg;

        nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
        if (!nhg)
                return NULL;

        nhg->num_nh = num_nh;

        return nhg;
}

static void nh_res_table_upkeep_dw(struct work_struct *work);

/* Allocate a zeroed resilient table sized for the bucket count requested in
 * @cfg, and seed it with the owning netns, the group id and the idle and
 * unbalanced timers from @cfg. Returns NULL if the allocation fails.
 */
static struct nh_res_table *
nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
{
        const u16 nbuckets = cfg->nh_grp_res_num_buckets;
        struct nh_res_table *tbl;

        tbl = __vmalloc(struct_size(tbl, nh_buckets, nbuckets),
                        GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
        if (!tbl)
                return NULL;

        tbl->net = net;
        tbl->nhg_id = nhg_id;
        tbl->num_nh_buckets = nbuckets;
        tbl->idle_timer = cfg->nh_grp_res_idle_timer;
        tbl->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
        INIT_LIST_HEAD(&tbl->uw_nh_entries);
        INIT_DELAYED_WORK(&tbl->upkeep_dw, &nh_res_table_upkeep_dw);

        return tbl;
}

/* Advance the per-netns nexthop sequence counter, never leaving it at 0. */
static void nh_base_seq_inc(struct net *net)
{
        do {
                net->nexthop.seq++;
        } while (net->nexthop.seq == 0);
}

/* Look up the nexthop with the given @id in the per-netns rb-tree.
 * Returns NULL if there is none. No reference is taken on the result, so
 * the caller must hold the rcu read lock or rtnl.
 */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
{
        struct rb_node **link = &net->nexthop.rb_root.rb_node;

        for (;;) {
                struct rb_node *node = rcu_dereference_raw(*link);
                struct nexthop *cur;

                if (!node)
                        return NULL;

                cur = rb_entry(node, struct nexthop, rb_node);
                if (id == cur->id)
                        return cur;

                /* Smaller ids live in the left subtree, larger in the right. */
                if (id < cur->id)
                        link = &node->rb_left;
                else
                        link = &node->rb_right;
        }
}
EXPORT_SYMBOL_GPL(nexthop_find_by_id);

/* Automatic id allocation; called with rtnl held.
 *
 * Walks forward from the last id handed out (wrapping around) until an id
 * with no nexthop attached is found. Returns 0 once the counter has come
 * all the way back to where it started.
 */
static u32 nh_find_unused_id(struct net *net)
{
        const u32 first = net->nexthop.last_id_allocated;

        while (++net->nexthop.last_id_allocated != first) {
                if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
                        return net->nexthop.last_id_allocated;
        }

        return 0;
}

/* Pull *@deadline in to @next_time if @next_time comes earlier. */
static void nh_res_time_set_deadline(unsigned long next_time,
                                     unsigned long *deadline)
{
        if (!time_before(next_time, *deadline))
                return;

        *deadline = next_time;
}

/* Time elapsed since @res_table->unbalanced_since, in clock_t units, or 0
 * when the uw_nh_entries list is empty.
 */
static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
{
        unsigned long elapsed;

        if (list_empty(&res_table->uw_nh_entries))
                return 0;

        elapsed = jiffies - res_table->unbalanced_since;
        return jiffies_delta_to_clock_t(elapsed);
}

/* Emit the NHA_RES_GROUP nest for a resilient group: bucket count, idle
 * timer, unbalanced timer and current unbalanced time.
 *
 * Returns 0 on success. On failure the nest (if it was started) is
 * cancelled and -EMSGSIZE is returned.
 */
static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
{
        struct nh_res_table *tbl = rtnl_dereference(nhg->res_table);
        struct nlattr *res_nest;

        res_nest = nla_nest_start(skb, NHA_RES_GROUP);
        if (!res_nest)
                return -EMSGSIZE;

        if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, tbl->num_nh_buckets))
                goto cancel;

        if (nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
                        jiffies_to_clock_t(tbl->idle_timer)))
                goto cancel;

        if (nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
                        jiffies_to_clock_t(tbl->unbalanced_timer)))
                goto cancel;

        if (nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
                              nh_res_table_unbalanced_time(tbl),
                              NHA_RES_GROUP_PAD))
                goto cancel;

        nla_nest_end(skb, res_nest);
        return 0;

cancel:
        nla_nest_cancel(skb, res_nest);
        return -EMSGSIZE;
}

/* Bump the packet counter of @nhge in this CPU's slot of the per-CPU stats,
 * inside a u64_stats update section.
 */
static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
{
        struct nh_grp_entry_stats *stats = get_cpu_ptr(nhge->stats);

        u64_stats_update_begin(&stats->syncp);
        u64_stats_inc(&stats->packets);
        u64_stats_update_end(&stats->syncp);

        put_cpu_ptr(stats);
}

/* Sum the per-CPU packet counters of @nhge over all possible CPUs and
 * store the total in *@ret_packets.
 */
static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
                                    u64 *ret_packets)
{
        u64 total = 0;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct nh_grp_entry_stats *stats;
                unsigned int seq;
                u64 snapshot;

                stats = per_cpu_ptr(nhge->stats, cpu);

                /* Retry until the read did not race with a writer. */
                do {
                        seq = u64_stats_fetch_begin(&stats->syncp);
                        snapshot = u64_stats_read(&stats->packets);
                } while (u64_stats_fetch_retry(&stats->syncp, seq));

                total += snapshot;
        }

        *ret_packets = total;
}

static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info,
                                         const struct nexthop *nh)
{
        struct nh_group *nhg;
        int i;

        ASSERT_RTNL();
        nhg = rtnl_dereference(nh->nh_grp);

        info->id = nh->id;
        info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS;
        info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats,
                                                    stats, nhg->num_nh),
                                        GFP_KERNEL);
        if (!info->nh_grp_hw_stats)
                return -ENOMEM;

        info->nh_grp_hw_stats->num_nh = nhg->num_nh;
        for (i = 0; i < nhg->num_nh; i++) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];

                info->nh_grp_hw_stats->stats[i].id = nhge->nh->id;
        }

        return 0;
}

/* Free the stats array that nh_notifier_grp_hw_stats_init() attached to
 * @info.
 */
static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info)
{
        kfree(info->nh_grp_hw_stats);
}

/* Add @delta_packets to the packet count of slot @nh_idx in @info and mark
 * the notification as having had HW stats reported.
 *
 * @nh_idx is used as an index into @info->stats without a bounds check
 * here.
 */
void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
                                  unsigned int nh_idx,
                                  u64 delta_packets)
{
        info->hw_stats_used = true;
        info->stats[nh_idx].packets += delta_packets;
}
EXPORT_SYMBOL(nh_grp_hw_stats_report_delta);

static void nh_grp_hw_stats_apply_update(struct nexthop *nh,
                                         struct nh_notifier_info *info)
{
        struct nh_group *nhg;
        int i;

        ASSERT_RTNL();
        nhg = rtnl_dereference(nh->nh_grp);

        for (i = 0; i < nhg->num_nh; i++) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];

                nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets;
        }
}

/* Ask the nexthop notifier chain for HW stats deltas of group @nh and fold
 * them into the group entries.
 *
 * *@hw_stats_used is set to false when no notifier is registered,
 * otherwise to the hw_stats_used value left in the notification. It is
 * not written if preparing the notification fails.
 *
 * Returns 0, the error from preparing the notification, or the errno
 * translated from the notifier chain result.
 */
static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used)
{
        struct net *net = nh->net;
        struct nh_notifier_info info = {
                .net = net,
        };
        int rc;

        if (nexthop_notifiers_is_empty(net)) {
                *hw_stats_used = false;
                return 0;
        }

        rc = nh_notifier_grp_hw_stats_init(&info, nh);
        if (rc)
                return rc;

        rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
                                          NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
                                          &info);

        /* Apply the deltas even when the chain reported an error, so that
         * whatever was retrieved successfully is not lost.
         */
        nh_grp_hw_stats_apply_update(nh, &info);
        *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;

        nh_notifier_grp_hw_stats_fini(&info);

        return notifier_to_errno(rc);
}

/* Emit one NHA_GROUP_STATS_ENTRY nest for @nhge: the member nexthop id and
 * the total packet count (per-CPU counters plus packets_hw). The HW-only
 * count is added when NHA_OP_FLAG_DUMP_HW_STATS is set in @op_flags.
 *
 * Returns 0, or -EMSGSIZE after cancelling the nest.
 */
static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
                                        struct nh_grp_entry *nhge,
                                        u32 op_flags)
{
        struct nlattr *entry_nest;
        u64 sw_packets;

        nh_grp_entry_stats_read(nhge, &sw_packets);

        entry_nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY);
        if (!entry_nest)
                return -EMSGSIZE;

        if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id))
                goto cancel;

        if (nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS,
                         sw_packets + nhge->packets_hw))
                goto cancel;

        if ((op_flags & NHA_OP_FLAG_DUMP_HW_STATS) &&
            nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW,
                         nhge->packets_hw))
                goto cancel;

        nla_nest_end(skb, entry_nest);
        return 0;

cancel:
        nla_nest_cancel(skb, entry_nest);
        return -EMSGSIZE;
}

/* Emit the statistics attributes of group @nh: NHA_HW_STATS_ENABLE, then,
 * if HW stats were requested in @op_flags and are enabled on the group, a
 * refreshed NHA_HW_STATS_USED, and finally the NHA_GROUP_STATS nest with
 * one entry per group member.
 *
 * Returns 0 on success, the error from the HW stats refresh, or -EMSGSIZE
 * when an attribute could not be added.
 */
static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
                                  u32 op_flags)
{
        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
        struct nlattr *stats_nest;
        bool hw_stats_used;
        int err;
        int i;

        if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats))
                return -EMSGSIZE;

        if ((op_flags & NHA_OP_FLAG_DUMP_HW_STATS) && nhg->hw_stats) {
                err = nh_grp_hw_stats_update(nh, &hw_stats_used);
                if (err)
                        return err;

                if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used))
                        return -EMSGSIZE;
        }

        stats_nest = nla_nest_start(skb, NHA_GROUP_STATS);
        if (!stats_nest)
                return -EMSGSIZE;

        for (i = 0; i < nhg->num_nh; i++) {
                if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i],
                                                 op_flags)) {
                        nla_nest_cancel(skb, stats_nest);
                        return -EMSGSIZE;
                }
        }

        nla_nest_end(skb, stats_nest);
        return 0;
}

/* Emit the attributes describing group @nh: NHA_GROUP_TYPE, the NHA_GROUP
 * array of struct nexthop_grp, the resilient nest for resilient groups
 * and, when NHA_OP_FLAG_DUMP_STATS is requested, the statistics.
 *
 * NHA_OP_FLAG_RESP_GRP_RESVD_0 is always OR-ed into *@resp_op_flags.
 * Returns 0 on success; every failure is reported as -EMSGSIZE.
 */
static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
                            u32 op_flags, u32 *resp_op_flags)
{
        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
        struct nexthop_grp *grp_attr;
        struct nlattr *nla;
        u16 group_type;
        int i;

        *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;

        if (nhg->hash_threshold)
                group_type = NEXTHOP_GRP_TYPE_MPATH;
        else if (nhg->resilient)
                group_type = NEXTHOP_GRP_TYPE_RES;
        else
                group_type = 0;

        if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
                return -EMSGSIZE;

        nla = nla_reserve(skb, NHA_GROUP, nhg->num_nh * sizeof(*grp_attr));
        if (!nla)
                return -EMSGSIZE;

        grp_attr = nla_data(nla);
        for (i = 0; i < nhg->num_nh; i++) {
                const struct nh_grp_entry *nhge = &nhg->nh_entries[i];
                /* The stored weight is one more than the reported value. */
                u16 wire_weight = nhge->weight - 1;

                /* The compound literal zeroes all fields not named here. */
                grp_attr[i] = (struct nexthop_grp) {
                        .id = nhge->nh->id,
                        .weight = wire_weight,
                        .weight_high = wire_weight >> 8,
                };
        }

        if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
                return -EMSGSIZE;

        if (!(op_flags & NHA_OP_FLAG_DUMP_STATS))
                return 0;

        if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
            nla_put_nh_group_stats(skb, nh, op_flags))
                return -EMSGSIZE;

        return 0;
}

/* Fill @skb with a complete netlink message of type @event describing @nh.
 *
 * The message starts with a struct nhmsg header followed by NHA_ID. Group
 * nexthops then carry NHA_FDB (fdb groups only), the group attributes and
 * NHA_OP_FLAGS. Single nexthops carry either NHA_BLACKHOLE alone, or
 * NHA_FDB / NHA_OIF followed by the gateway and any lwtunnel encapsulation.
 *
 * Returns 0 on success. On any failure the partially built message is
 * cancelled and -EMSGSIZE is returned.
 */
static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
                        int event, u32 portid, u32 seq, unsigned int nlflags,
                        u32 op_flags)
{
        struct fib6_nh *fib6_nh;
        struct fib_nh *fib_nh;
        struct nlmsghdr *nlh;
        struct nh_info *nhi;
        struct nhmsg *nhm;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
        if (!nlh)
                return -EMSGSIZE;

        /* Family and scope start out neutral; they are only overwritten
         * further down for non-group nexthops.
         */
        nhm = nlmsg_data(nlh);
        nhm->nh_family = AF_UNSPEC;
        nhm->nh_flags = nh->nh_flags;
        nhm->nh_protocol = nh->protocol;
        nhm->nh_scope = 0;
        nhm->resvd = 0;

        if (nla_put_u32(skb, NHA_ID, nh->id))
                goto nla_put_failure;

        if (nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
                u32 resp_op_flags = 0;

                if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
                        goto nla_put_failure;
                /* resp_op_flags is filled in by nla_put_nh_group() and
                 * reported back right after the group attributes.
                 */
                if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
                    nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
                        goto nla_put_failure;
                goto out;
        }

        nhi = rtnl_dereference(nh->nh_info);
        nhm->nh_family = nhi->family;
        if (nhi->reject_nh) {
                /* A blackhole nexthop carries no further attributes. */
                if (nla_put_flag(skb, NHA_BLACKHOLE))
                        goto nla_put_failure;
                goto out;
        } else if (nhi->fdb_nh) {
                if (nla_put_flag(skb, NHA_FDB))
                        goto nla_put_failure;
        } else {
                const struct net_device *dev;

                /* NHA_OIF is only emitted when a device is attached. */
                dev = nhi->fib_nhc.nhc_dev;
                if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
                        goto nla_put_failure;
        }

        nhm->nh_scope = nhi->fib_nhc.nhc_scope;
        /* The gateway is only emitted when a gateway family is set. */
        switch (nhi->family) {
        case AF_INET:
                fib_nh = &nhi->fib_nh;
                if (fib_nh->fib_nh_gw_family &&
                    nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
                        goto nla_put_failure;
                break;

        case AF_INET6:
                fib6_nh = &nhi->fib6_nh;
                if (fib6_nh->fib_nh_gw_family &&
                    nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
                        goto nla_put_failure;
                break;
        }

        if (nhi->fib_nhc.nhc_lwtstate &&
            lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
                                NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
                goto nla_put_failure;

out:
        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Space needed for the NHA_RES_GROUP nest and the attributes inside it.
 * @nhg is not consulted; the size is the same for every group.
 */
static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
{
        size_t sz = nla_total_size(0);  /* NHA_RES_GROUP */

        sz += nla_total_size(2);        /* NHA_RES_GROUP_BUCKETS */
        sz += nla_total_size(4);        /* NHA_RES_GROUP_IDLE_TIMER */
        sz += nla_total_size(4);        /* NHA_RES_GROUP_UNBALANCED_TIMER */
        sz += nla_total_size_64bit(8);  /* NHA_RES_GROUP_UNBALANCED_TIME */

        return sz;
}

/* Space needed for the group attributes of @nh: the NHA_GROUP array, the
 * group type and, for resilient groups, the resilient nest.
 */
static size_t nh_nlmsg_size_grp(struct nexthop *nh)
{
        struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
        size_t grp_bytes = sizeof(struct nexthop_grp) * nhg->num_nh;
        size_t total;

        total = nla_total_size(grp_bytes) +     /* NHA_GROUP */
                nla_total_size(2);              /* NHA_GROUP_TYPE */

        if (!nhg->resilient)
                return total;

        return total + nh_nlmsg_size_grp_res(nhg);
}

/* Space needed for the attributes of a non-group nexthop: the OIF slot,
 * the gateway when one is configured, and the lwtunnel encapsulation when
 * present.
 */
static size_t nh_nlmsg_size_single(struct nexthop *nh)
{
        struct nh_info *nhi = rtnl_dereference(nh->nh_info);
        size_t total;

        /* NHA_OIF; this also covers NHA_BLACKHOLE because the two are
         * mutually exclusive.
         */
        total = nla_total_size(4);

        /* NHA_GATEWAY */
        if (nhi->family == AF_INET) {
                if (nhi->fib_nh.fib_nh_gw_family)
                        total += nla_total_size(4);
        } else if (nhi->family == AF_INET6) {
                if (nhi->fib6_nh.fib_nh_gw_family)
                        total += nla_total_size(sizeof(const struct in6_addr));
        }

        if (nhi->fib_nhc.nhc_lwtstate) {
                total += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
                total += nla_total_size(2);  /* NHA_ENCAP_TYPE */
        }

        return total;
}

/* Size of the netlink message needed to describe @nh: the nhmsg header,
 * NHA_ID and the group or single-nexthop attributes.
 */
static size_t nh_nlmsg_size(struct nexthop *nh)
{
        size_t total = NLMSG_ALIGN(sizeof(struct nhmsg));

        total += nla_total_size(4);     /* NHA_ID */

        if (!nh->is_group)
                return total + nh_nlmsg_size_single(nh);

        total += nh_nlmsg_size_grp(nh);
        total += nla_total_size(4);     /* NHA_OP_FLAGS */

        return total;
}

/* Build a message of type @event describing @nh and send it to the
 * RTNLGRP_NEXTHOP group. Sequence number and flags are copied from the
 * request in @info when there is one. If the message cannot be allocated
 * or filled, the error is recorded on the group via rtnl_set_sk_err().
 */
static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
{
        unsigned int nlflags = 0;
        struct sk_buff *skb;
        u32 seq = 0;
        int err;

        if (info->nlh) {
                nlflags = info->nlh->nlmsg_flags;
                seq = info->nlh->nlmsg_seq;
        }

        skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
        if (!skb) {
                err = -ENOBUFS;
                goto report_err;
        }

        err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0);
        if (err >= 0) {
                rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
                            info->nlh, gfp_any());
                return;
        }

        /* -EMSGSIZE here means nh_nlmsg_size() computed too small a size. */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);

report_err:
        rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
}

/* Atomically read the bucket's used_time and return it as an unsigned
 * long.
 */
static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
{
        return (unsigned long)atomic_long_read(&bucket->used_time);
}

/* Point in time at which @bucket counts as idle: its last-used time plus
 * the table's idle timer, or @now if the bucket was not used since it was
 * migrated (used_time still equals migrated_time).
 */
static unsigned long
nh_res_bucket_idle_point(const struct nh_res_table *res_table,
                         const struct nh_res_bucket *bucket,
                         unsigned long now)
{
        unsigned long last_used = nh_res_bucket_used_time(bucket);

        if (last_used != bucket->migrated_time)
                return last_used + res_table->idle_timer;

        /* Untouched since migration: idle right away. */
        return now;
}

/* Return @res_table->unbalanced_since plus @res_table->unbalanced_timer,
 * i.e. the time at which the unbalanced timer runs out.
 */
static unsigned long
nh_res_table_unb_point(const struct nh_res_table *res_table)
{
        return res_table->unbalanced_since + res_table->unbalanced_timer;
}

/* Stamp both used_time and migrated_time of @bucket with the same jiffies
 * snapshot. nh_res_bucket_idle_point() treats equal values as "not used
 * since migration". @res_table is not used here.
 */
static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
                                   struct nh_res_bucket *bucket)
{
        /* Read jiffies once so that both fields get the identical value. */
        unsigned long now = jiffies;

        atomic_long_set(&bucket->used_time, (long)now);
        bucket->migrated_time = now;
}

/* Record the current jiffies as the last-used time of @bucket. */
static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
{
        atomic_long_set(&bucket->used_time, (long)jiffies);
}

/* Time elapsed since @bucket was last used, in clock_t units. */
static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
{
        unsigned long last_used = nh_res_bucket_used_time(bucket);
        unsigned long idle = jiffies - last_used;

        return jiffies_delta_to_clock_t(idle);
}

/* Fill @skb with a message of type @event describing one bucket of the
 * resilient group @nh: the nhmsg header, NHA_ID and an NHA_RES_BUCKET nest
 * holding the bucket index, the id of the nexthop occupying the bucket and
 * the bucket's idle time. @extack is not used.
 *
 * Returns 0, or -EMSGSIZE after cancelling whatever was already added.
 */
static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
                              struct nh_res_bucket *bucket, u16 bucket_index,
                              int event, u32 portid, u32 seq,
                              unsigned int nlflags,
                              struct netlink_ext_ack *extack)
{
        struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
        struct nlattr *bucket_nest;
        struct nlmsghdr *nlh;
        struct nhmsg *nhm;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
        if (!nlh)
                return -EMSGSIZE;

        nhm = nlmsg_data(nlh);
        nhm->nh_family = AF_UNSPEC;
        nhm->nh_flags = bucket->nh_flags;
        nhm->nh_protocol = nh->protocol;
        nhm->nh_scope = 0;
        nhm->resvd = 0;

        if (nla_put_u32(skb, NHA_ID, nh->id))
                goto cancel_msg;

        bucket_nest = nla_nest_start(skb, NHA_RES_BUCKET);
        if (!bucket_nest)
                goto cancel_msg;

        if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index))
                goto cancel_nest;

        if (nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id))
                goto cancel_nest;

        if (nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
                              nh_res_bucket_idle_time(bucket),
                              NHA_RES_BUCKET_PAD))
                goto cancel_nest;

        nla_nest_end(skb, bucket_nest);
        nlmsg_end(skb, nlh);
        return 0;

cancel_nest:
        nla_nest_cancel(skb, bucket_nest);
cancel_msg:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Send an RTM_NEWNEXTHOPBUCKET notification (NLM_F_REPLACE) for bucket
 * @bucket_index of @res_table to the RTNLGRP_NEXTHOP group. If the message
 * cannot be allocated or filled, the error is recorded on the group via
 * rtnl_set_sk_err().
 */
static void nexthop_bucket_notify(struct nh_res_table *res_table,
                                  u16 bucket_index)
{
        struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
        struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
        struct nexthop *nh = nhge->nh_parent;
        struct sk_buff *skb;
        int err;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto report_err;
        }

        err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
                                 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
                                 NULL);
        if (err >= 0) {
                rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL,
                            GFP_KERNEL);
                return;
        }

        kfree_skb(skb);

report_err:
        rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err);
}

/* Check whether @nh may become a member of a group with @npaths paths.
 *
 * Hash-threshold and resilient groups are rejected as members, and a
 * blackhole nexthop is only accepted when it is the only path. On success
 * *@is_fdb reports whether @nh is an fdb nexthop. On failure an extack
 * message is set and *@is_fdb is left untouched.
 */
static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
                           bool *is_fdb, struct netlink_ext_ack *extack)
{
        struct nh_info *nhi;

        if (nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

                /* Groups can not be nested inside other groups. */
                if (nhg->hash_threshold) {
                        NL_SET_ERR_MSG(extack,
                                       "Hash-threshold group can not be a nexthop within a group");
                        return false;
                }
                if (nhg->resilient) {
                        NL_SET_ERR_MSG(extack,
                                       "Resilient group can not be a nexthop within a group");
                        return false;
                }

                *is_fdb = nhg->fdb_nh;
                return true;
        }

        nhi = rtnl_dereference(nh->nh_info);
        if (nhi->reject_nh && npaths > 1) {
                NL_SET_ERR_MSG(extack,
                               "Blackhole nexthop can not be used in a group with more than 1 path");
                return false;
        }

        *is_fdb = nhi->fdb_nh;
        return true;
}

/* Validate @nh as a member of an fdb group: it must be an fdb nexthop and
 * share its address family with the members checked before it.
 *
 * *@nh_family carries the family across calls; AF_UNSPEC means no member
 * has been seen yet and is replaced by the family of @nh.
 * Returns 0 or -EINVAL with an extack message.
 */
static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
                                   struct netlink_ext_ack *extack)
{
        struct nh_info *nhi = rtnl_dereference(nh->nh_info);

        if (!nhi->fdb_nh) {
                NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
                return -EINVAL;
        }

        /* The first member decides the family of the whole group. */
        if (*nh_family == AF_UNSPEC) {
                *nh_family = nhi->family;
                return 0;
        }

        if (*nh_family != nhi->family) {
                NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
                return -EINVAL;
        }

        return 0;
}

/* Validate the NHA_GROUP attribute and its companions in @tb.
 *
 * The attribute length must be a non-zero multiple of
 * sizeof(struct nexthop_grp). Each entry must have a zero resvd2 field and
 * a non-zero effective weight, and no nexthop id may appear twice. Of the
 * attributes above NHA_GROUP_TYPE only NHA_HW_STATS_ENABLE and NHA_FDB may
 * accompany a group, plus NHA_RES_GROUP when @nh_grp_type is
 * NEXTHOP_GRP_TYPE_RES.
 *
 * @net is not used here. @tb_size is the number of slots in @tb.
 * Returns 0 or -EINVAL with an extack message.
 */
static int nh_check_attr_group(struct net *net,
                               struct nlattr *tb[], size_t tb_size,
                               u16 nh_grp_type, struct netlink_ext_ack *extack)
{
        unsigned int len = nla_len(tb[NHA_GROUP]);
        struct nexthop_grp *nhg;
        unsigned int i, j;

        /* Use a modulo rather than a mask so the check does not depend on
         * the entry size being a power of two.
         */
        if (!len || len % sizeof(*nhg)) {
                NL_SET_ERR_MSG(extack,
                               "Invalid length for nexthop group attribute");
                return -EINVAL;
        }

        /* convert len to number of nexthop ids */
        len /= sizeof(*nhg);

        nhg = nla_data(tb[NHA_GROUP]);
        for (i = 0; i < len; ++i) {
                if (nhg[i].resvd2) {
                        NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
                        return -EINVAL;
                }
                if (nexthop_grp_weight(&nhg[i]) == 0) {
                        /* 0xffff got passed in, representing weight of 0x10000,
                         * which is too heavy.
                         */
                        NL_SET_ERR_MSG(extack, "Invalid value for weight");
                        return -EINVAL;
                }
                /* Only later entries need comparing; earlier pairs were
                 * covered by previous iterations.
                 */
                for (j = i + 1; j < len; ++j) {
                        if (nhg[i].id == nhg[j].id) {
                                NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
                                return -EINVAL;
                        }
                }
        }

        for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
                if (!tb[i])
                        continue;
                switch (i) {
                case NHA_HW_STATS_ENABLE:
                case NHA_FDB:
                        continue;
                case NHA_RES_GROUP:
                        if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
                                continue;
                        break;
                }
                NL_SET_ERR_MSG(extack,
                               "No other attributes can be set in nexthop groups");
                return -EINVAL;
        }

        return 0;
}

/* Second stage of NHA_GROUP validation, looking up every member id.
 *
 * Each id must name an existing nexthop that valid_group_nh() accepts. For
 * an fdb group (NHA_FDB present) every member must be an fdb nexthop of
 * the same family; for any other group no member may be an fdb nexthop.
 * Returns 0 or -EINVAL with an extack message.
 */
static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[],
                                    struct netlink_ext_ack *extack)
{
        struct nexthop_grp *entries = nla_data(tb[NHA_GROUP]);
        unsigned int count = nla_len(tb[NHA_GROUP]) / sizeof(*entries);
        bool fdb_group = !!tb[NHA_FDB];
        u8 nh_family = AF_UNSPEC;
        unsigned int idx;

        for (idx = 0; idx < count; idx++) {
                struct nexthop *member;
                bool member_is_fdb;

                member = nexthop_find_by_id(net, entries[idx].id);
                if (!member) {
                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
                        return -EINVAL;
                }

                if (!valid_group_nh(member, count, &member_is_fdb, extack))
                        return -EINVAL;

                if (fdb_group) {
                        if (nh_check_attr_fdb_group(member, &nh_family, extack))
                                return -EINVAL;
                } else if (member_is_fdb) {
                        NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
                        return -EINVAL;
                }
        }

        return 0;
}

/* Report whether the IPv6 gateway of @nh looks usable: the neighbour entry
 * for the gateway on the nexthop device has a state within NUD_VALID. If
 * no neighbour entry exists, NUD_REACHABLE is assumed.
 */
static bool ipv6_good_nh(const struct fib6_nh *nh)
{
        struct neighbour *neigh;
        int nud = NUD_REACHABLE;

        rcu_read_lock();
        neigh = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
                                               &nh->fib_nh_gw6);
        if (neigh)
                nud = READ_ONCE(neigh->nud_state);
        rcu_read_unlock();

        return (nud & NUD_VALID) != 0;
}

/* Report whether the IPv4 gateway of @nh looks usable: the neighbour entry
 * for the gateway on the nexthop device has a state within NUD_VALID. If
 * no neighbour entry exists, NUD_REACHABLE is assumed.
 */
static bool ipv4_good_nh(const struct fib_nh *nh)
{
        struct neighbour *neigh;
        int nud = NUD_REACHABLE;

        rcu_read_lock();
        neigh = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
                                          (__force u32)nh->fib_nh_gw4);
        if (neigh)
                nud = READ_ONCE(neigh->nud_state);
        rcu_read_unlock();

        return (nud & NUD_VALID) != 0;
}

/* Dispatch the neighbour check for a single nexthop by address family.
 * Families other than AF_INET and AF_INET6 are reported as not good.
 */
static bool nexthop_is_good_nh(const struct nexthop *nh)
{
        struct nh_info *nhi = rcu_dereference(nh->nh_info);

        if (nhi->family == AF_INET)
                return ipv4_good_nh(&nhi->fib_nh);

        if (nhi->family == AF_INET6)
                return ipv6_good_nh(&nhi->fib6_nh);

        return false;
}

/* Pick the first entry of @nhg whose upper bound is not below @hash, bump
 * its stats and return its nexthop. No neighbour check is made. Warns
 * once and returns NULL if no entry matches.
 */
static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
{
        int idx;

        for (idx = 0; idx < nhg->num_nh; idx++) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[idx];

                if (hash <= atomic_read(&nhge->hthr.upper_bound)) {
                        nh_grp_entry_stats_inc(nhge);
                        return nhge->nh;
                }
        }

        WARN_ON_ONCE(1);
        return NULL;
}

/* Select a path from a hash-threshold group.
 *
 * fdb groups are handed to nexthop_select_path_fdb(). Otherwise the first
 * good entry whose upper bound is not below @hash wins. If none matches,
 * the first good entry is used, and if no entry is good at all, entry 0.
 * The stats of the chosen entry are bumped.
 */
static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
{
        struct nh_grp_entry *first_good = NULL;
        struct nh_grp_entry *nhge;
        int idx;

        if (nhg->fdb_nh)
                return nexthop_select_path_fdb(nhg, hash);

        for (idx = 0; idx < nhg->num_nh; idx++) {
                nhge = &nhg->nh_entries[idx];

                /* Nexthop objects always check whether the entry is good;
                 * this does not depend on a sysctl.
                 */
                if (!nexthop_is_good_nh(nhge->nh))
                        continue;

                if (!first_good)
                        first_good = nhge;

                if (hash <= atomic_read(&nhge->hthr.upper_bound))
                        goto found;
        }

        nhge = first_good ? first_good : &nhg->nh_entries[0];

found:
        nh_grp_entry_stats_inc(nhge);
        return nhge->nh;
}

/* Select a path from a resilient group: @hash modulo the bucket count
 * picks the bucket, the bucket is marked as just used, and the nexthop of
 * the entry occupying it is returned after bumping the entry's stats.
 */
static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
{
        struct nh_res_table *tbl = rcu_dereference(nhg->res_table);
        struct nh_res_bucket *bucket;
        struct nh_grp_entry *picked;
        u16 slot;

        slot = hash % tbl->num_nh_buckets;
        bucket = &tbl->nh_buckets[slot];

        /* nexthop_select_path() is expected to return a non-NULL value, so
         * no protocol validation is done; whatever occupies the bucket is
         * handed out.
         */
        nh_res_bucket_set_busy(bucket);
        picked = rcu_dereference(bucket->nh_entry);
        nh_grp_entry_stats_inc(picked);

        return picked->nh;
}

/* Resolve @nh to the nexthop to use for @hash. A non-group nexthop is
 * returned as is; a group is resolved by its type specific selection.
 */
struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
{
        struct nh_group *nhg;

        if (!nh->is_group)
                return nh;

        nhg = rcu_dereference(nh->nh_grp);

        if (nhg->hash_threshold)
                return nexthop_select_path_hthr(nhg, hash);

        if (nhg->resilient)
                return nexthop_select_path_res(nhg, hash);

        /* Unreachable. */
        return NULL;
}
EXPORT_SYMBOL_GPL(nexthop_select_path);

/* Invoke @cb with @arg on the fib6_nh of @nh, or on the fib6_nh of every
 * member when @nh is a group. Iteration stops at the first non-zero
 * return from @cb, which is passed back to the caller. Returns 0 when all
 * callbacks returned 0.
 */
int nexthop_for_each_fib6_nh(struct nexthop *nh,
                             int (*cb)(struct fib6_nh *nh, void *arg),
                             void *arg)
{
        struct nh_group *nhg;
        struct nh_info *nhi;
        int idx;

        if (!nh->is_group) {
                nhi = rcu_dereference_rtnl(nh->nh_info);
                return cb(&nhi->fib6_nh, arg);
        }

        nhg = rcu_dereference_rtnl(nh->nh_grp);
        for (idx = 0; idx < nhg->num_nh; idx++) {
                int err;

                nhi = rcu_dereference_rtnl(nhg->nh_entries[idx].nh->nh_info);
                err = cb(&nhi->fib6_nh, arg);
                if (err)
                        return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);

/* Reject a non-zero IPv6 source address: returns 0 when @saddr is the any
 * address, otherwise sets an extack message and returns -EINVAL.
 */
static int check_src_addr(const struct in6_addr *saddr,
                          struct netlink_ext_ack *extack)
{
        if (ipv6_addr_any(saddr))
                return 0;

        NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
        return -EINVAL;
}

/* Check whether an IPv6 route may use @nh.
 *
 * A config with a source address, a nexthop (or group) that is or
 * contains IPv4, and fdb nexthops are all rejected. @cfg may be NULL, in
 * which case the source address check is skipped.
 * Returns 0 or -EINVAL with an extack message.
 */
int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack)
{
        bool uses_v4, is_fdb_nh;

        /* fib6_src is unique to a fib6_info, which limits the ability to
         * cache routes in the fib6_nh of a nexthop that may be shared by
         * several fib entries. A config that wants source routing therefore
         * can not use nexthop objects. mlxsw does not allow fib6_src on
         * routes either.
         */
        if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
                return -EINVAL;

        if (nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(nh->nh_grp);

                uses_v4 = nhg->has_v4;
                is_fdb_nh = nhg->fdb_nh;
        } else {
                struct nh_info *nhi = rtnl_dereference(nh->nh_info);

                uses_v4 = nhi->family == AF_INET;
                is_fdb_nh = nhi->fdb_nh;
        }

        /* The IPv4 check takes precedence over the fdb check. */
        if (uses_v4) {
                NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
                return -EINVAL;
        }

        if (is_fdb_nh) {
                NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
                return -EINVAL;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(fib6_check_nexthop);

/* If the existing nexthop @old has IPv6 routes linked to it, verify that
 * the replacement spec @new works with IPv6: none of the linked routes may
 * use a source address, and @new must pass fib6_check_nexthop().
 * Returns 0 right away when no IPv6 route is linked to @old.
 */
static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
                              struct netlink_ext_ack *extack)
{
        struct fib6_info *rt;

        if (!list_empty(&old->f6i_list)) {
                list_for_each_entry(rt, &old->f6i_list, nh_list) {
                        if (check_src_addr(&rt->fib6_src.addr, extack) < 0)
                                return -EINVAL;
                }

                return fib6_check_nexthop(new, NULL, extack);
        }

        return 0;
}

/* Validate route @scope against the nexthop described by @nhi.
 *
 * Two combinations are refused with -EINVAL and an @extack message:
 * a host-scope route whose nexthop has a gateway family set, and an
 * RTNH_F_ONLINK nexthop used with a scope of RT_SCOPE_LINK or above.
 * Returns 0 otherwise.
 */
static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
                               struct netlink_ext_ack *extack)
{
        if (nhi->fib_nhc.nhc_gw_family && scope == RT_SCOPE_HOST) {
                NL_SET_ERR_MSG(extack,
                               "Route with host scope can not have a gateway");
                return -EINVAL;
        }

        if (scope >= RT_SCOPE_LINK &&
            (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK)) {
                NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
                return -EINVAL;
        }

        return 0;
}

/* Called by the fib add path to verify that the nexthop referenced by id
 * is compatible with the route being configured. It covers the parts of
 * fib_check_nh that are not done when a nexthop object is used.
 *
 * fdb nexthops are always refused. A group is additionally refused for
 * host-scope routes, and its scope check is performed on the first
 * entry only. Returns 0 on success or -EINVAL with @extack set.
 */
int fib_check_nexthop(struct nexthop *nh, u8 scope,
                      struct netlink_ext_ack *extack)
{
        struct nh_group *nhg;
        struct nh_info *nhi;

        if (!nh->is_group) {
                nhi = rtnl_dereference(nh->nh_info);
                if (nhi->fdb_nh) {
                        NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
                        return -EINVAL;
                }

                return nexthop_check_scope(nhi, scope, extack);
        }

        nhg = rtnl_dereference(nh->nh_grp);
        if (nhg->fdb_nh) {
                NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
                return -EINVAL;
        }

        if (scope == RT_SCOPE_HOST) {
                NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
                return -EINVAL;
        }

        /* all nexthops in a group have the same scope */
        nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);

        return nexthop_check_scope(nhi, scope, extack);
}

/* Run fib_check_nexthop() on @new once for every fib_info linked to
 * @old, using that fib_info's scope. Stops at, and returns, the first
 * error; returns 0 when all pass or when the list is empty.
 */
static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
                             struct netlink_ext_ack *extack)
{
        struct fib_info *fi;
        int rc = 0;

        list_for_each_entry(fi, &old->fi_list, nh_list) {
                rc = fib_check_nexthop(new, fi->fib_scope, extack);
                if (rc)
                        break;
        }

        return rc;
}

/* True when the entry holds exactly as many buckets as it wants. */
static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
{
        return nhge->res.wants_buckets == nhge->res.count_buckets;
}

/* True when the entry is overweight: it holds more buckets than it wants. */
static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
{
        return nhge->res.wants_buckets < nhge->res.count_buckets;
}

/* True when the entry is underweight: it holds fewer buckets than it wants. */
static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
{
        return nhge->res.wants_buckets > nhge->res.count_buckets;
}

static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
{
        return list_empty(&res_table->uw_nh_entries);
}

/* Release @bucket from the group entry it currently counts towards.
 *
 * Does nothing for a bucket that is not occupied. Otherwise the owning
 * entry's bucket count is decremented and the bucket is marked as not
 * occupied. The bucket's nh_entry pointer itself is left untouched.
 */
static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
{
        struct nh_grp_entry *owner;

        if (!bucket->occupied)
                return;

        owner = nh_res_dereference(bucket->nh_entry);
        owner->res.count_buckets--;
        bucket->occupied = false;
}

/* Point @bucket at group entry @nhge and account for it.
 *
 * Any previous occupant is first released through
 * nh_res_bucket_unset_nh(), which decrements that entry's bucket count
 * if the bucket was occupied. The bucket is then marked occupied, the
 * new entry pointer is stored with rcu_assign_pointer(), and @nhge's
 * bucket count is incremented.
 */
static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
                                 struct nh_grp_entry *nhge)
{
        nh_res_bucket_unset_nh(bucket);

        bucket->occupied = true;
        rcu_assign_pointer(bucket->nh_entry, nhge);
        nhge->res.count_buckets++;
}

/* Decide whether @bucket of @res_table should be moved to another
 * nexthop right now.
 *
 * @deadline: in/out. When the bucket is held by an overweight nexthop
 *            but cannot be migrated yet, the time points at which it
 *            could become migratable are fed to
 *            nh_res_time_set_deadline().
 * @force:    out. Written only on the paths that return true; left
 *            untouched when the function returns false.
 *
 * Returns true if the bucket should be migrated, false otherwise.
 */
static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
                                         struct nh_res_bucket *bucket,
                                         unsigned long *deadline, bool *force)
{
        unsigned long now = jiffies;
        struct nh_grp_entry *nhge;
        unsigned long idle_point;

        if (!bucket->occupied) {
                /* The bucket is not occupied, its NHGE pointer is either
                 * NULL or obsolete. We _have to_ migrate: set force.
                 */
                *force = true;
                return true;
        }

        nhge = nh_res_dereference(bucket->nh_entry);

        /* If the bucket is populated by an underweight or balanced
         * nexthop, do not migrate.
         */
        if (!nh_res_nhge_is_ow(nhge))
                return false;

        /* At this point we know that the bucket is populated with an
         * overweight nexthop. It needs to be migrated to a new nexthop if
         * the idle timer or unbalanced timer expired.
         */

        idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
        if (time_after_eq(now, idle_point)) {
                /* The bucket is idle. We _can_ migrate: unset force. */
                *force = false;
                return true;
        }

        /* Unbalanced timer of 0 means "never force". */
        if (res_table->unbalanced_timer) {
                unsigned long unb_point;

                unb_point = nh_res_table_unb_point(res_table);
                if (time_after(now, unb_point)) {
                        /* The bucket is not idle, but the unbalanced timer
                         * expired. We _can_ migrate, but set force anyway,
                         * so that drivers know to ignore activity reports
                         * from the HW.
                         */
                        *force = true;
                        return true;
                }

                /* Not expired yet: the unbalanced point is a candidate
                 * for the next upkeep run.
                 */
                nh_res_time_set_deadline(unb_point, deadline);
        }

        /* The bucket's idle point is a candidate deadline as well. */
        nh_res_time_set_deadline(idle_point, deadline);
        return false;
}

/* Move the bucket at @bucket_index of @res_table to the first entry on
 * the table's underweight list.
 *
 * @notify:    when true, call the bucket notifiers before the move.
 * @notify_nl: when true, call nexthop_bucket_notify() after the move.
 * @force:     passed on to the notifiers. A notifier error vetoes the
 *             move only when @force is false.
 *
 * Returns true if the bucket was migrated. Returns false if there was
 * no underweight entry to migrate to, or if a non-forced migration was
 * vetoed by a notifier.
 */
static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
                                  u16 bucket_index, bool notify,
                                  bool notify_nl, bool force)
{
        struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
        struct nh_grp_entry *new_nhge;
        /* Zero-initialized so that _msg is NULL rather than stack garbage
         * if a notifier fails without setting an error message; the
         * message is printed with "%s" below.
         */
        struct netlink_ext_ack extack = {};
        int err;

        new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
                                            struct nh_grp_entry,
                                            res.uw_nh_entry);
        if (WARN_ON_ONCE(!new_nhge))
                /* If this function is called, "bucket" is either not
                 * occupied, or it belongs to a next hop that is
                 * overweight. In either case, there ought to be a
                 * corresponding underweight next hop.
                 */
                return false;

        if (notify) {
                struct nh_grp_entry *old_nhge;

                old_nhge = nh_res_dereference(bucket->nh_entry);
                err = call_nexthop_res_bucket_notifiers(res_table->net,
                                                        res_table->nhg_id,
                                                        bucket_index, force,
                                                        old_nhge->nh,
                                                        new_nhge->nh, &extack);
                if (err) {
                        pr_err_ratelimited("%s\n", extack._msg);
                        if (!force)
                                return false;
                        /* It is not possible to veto a forced replacement, so
                         * just clear the hardware flags from the nexthop
                         * bucket to indicate to user space that this bucket is
                         * not correctly populated in hardware.
                         */
                        bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
                }
        }

        nh_res_bucket_set_nh(bucket, new_nhge);
        nh_res_bucket_set_idle(res_table, bucket);

        if (notify_nl)
                nexthop_bucket_notify(res_table, bucket_index);

        /* Once the new owner has all the buckets it wants, it no longer
         * belongs on the underweight list.
         */
        if (nh_res_nhge_is_balanced(new_nhge))
                list_del(&new_nhge->res.uw_nh_entry);
        return true;
}

#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2)

/* Walk every bucket of @res_table, migrate those that should move, and
 * re-arm the upkeep delayed work if the table is still unbalanced
 * afterwards. @notify and @notify_nl are handed through to
 * nh_res_bucket_migrate().
 */
static void nh_res_table_upkeep(struct nh_res_table *res_table,
                                bool notify, bool notify_nl)
{
        unsigned long now = jiffies;
        unsigned long min_deadline;
        unsigned long resched_base;
        unsigned long deadline;
        u16 i;

        /* Deadline is the next time that upkeep should be run. It is the
         * earliest time at which one of the buckets might be migrated.
         * Start at the most pessimistic estimate: either unbalanced_timer
         * from now, or if there is none, idle_timer from now. For each
         * encountered time point, call nh_res_time_set_deadline() to
         * refine the estimate.
         */
        deadline = now;
        if (res_table->unbalanced_timer)
                deadline += res_table->unbalanced_timer;
        else
                deadline += res_table->idle_timer;

        for (i = 0; i < res_table->num_nh_buckets; i++) {
                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
                unsigned long idle_point;
                bool force;

                if (!nh_res_bucket_should_migrate(res_table, bucket,
                                                  &deadline, &force))
                        continue;

                if (nh_res_bucket_migrate(res_table, i, notify, notify_nl,
                                          force))
                        continue;

                /* A driver can override the migration decision if the HW
                 * reports that the bucket is actually not idle. Therefore
                 * mark the bucket as busy again and update the deadline.
                 */
                nh_res_bucket_set_busy(bucket);
                idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
                nh_res_time_set_deadline(idle_point, &deadline);
        }

        if (nh_res_table_is_balanced(res_table))
                return;

        /* The group is still unbalanced: schedule the next upkeep to
         * either the deadline computed above, or the minimum deadline,
         * whichever comes later. jiffies is sampled again here so that
         * the delay is relative to the current time.
         */
        resched_base = jiffies;
        min_deadline = resched_base + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
        if (time_before(deadline, min_deadline))
                deadline = min_deadline;

        queue_delayed_work(system_power_efficient_wq,
                           &res_table->upkeep_dw, deadline - resched_base);
}

/* Delayed-work handler: recover the owning table from @work and run
 * upkeep on it with both kinds of notification enabled.
 */
static void nh_res_table_upkeep_dw(struct work_struct *work)
{
        struct nh_res_table *res_table =
                container_of(to_delayed_work(work), struct nh_res_table,
                             upkeep_dw);

        nh_res_table_upkeep(res_table, true, true);
}

/* Cancel the table's upkeep delayed work using the _sync variant of the
 * cancel call.
 */
static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
{
        cancel_delayed_work_sync(&res_table->upkeep_dw);
}

/* Recompute how many buckets each entry of @nhg wants in @res_table and
 * rebuild the table's list of underweight entries from scratch.
 *
 * The wanted count of each entry is the difference between consecutive
 * rounded cumulative shares (num_nh_buckets * cumulative weight / total
 * weight). When the first underweight entry is found, unbalanced_since
 * is reset to the current jiffies.
 */
static void nh_res_group_rebalance(struct nh_group *nhg,
                                   struct nh_res_table *res_table)
{
        u16 last_bound = 0;
        u32 weight_sum = 0;
        u32 cum_weight = 0;
        int i;

        INIT_LIST_HEAD(&res_table->uw_nh_entries);

        for (i = 0; i < nhg->num_nh; ++i)
                weight_sum += nhg->nh_entries[i].weight;

        for (i = 0; i < nhg->num_nh; ++i) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
                u64 scaled;
                u16 bound;

                cum_weight += nhge->weight;
                /* Widen before multiplying so the product is computed
                 * in 64 bits.
                 */
                scaled = ((u64)res_table->num_nh_buckets) * cum_weight;
                bound = DIV_ROUND_CLOSEST_ULL(scaled, weight_sum);
                nhge->res.wants_buckets = bound - last_bound;
                last_bound = bound;

                if (!nh_res_nhge_is_uw(nhge))
                        continue;

                if (list_empty(&res_table->uw_nh_entries))
                        res_table->unbalanced_since = jiffies;
                list_add(&nhge->res.uw_nh_entry,
                         &res_table->uw_nh_entries);
        }
}

/* Migrate buckets in res_table so that they reference NHGE's from NHG with
 * the right NH ID. Set those buckets that do not have a corresponding NHGE
 * entry in NHG as not occupied.
 */
static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
                                         struct nh_group *nhg)
{
        u16 i;

        for (i = 0; i < res_table->num_nh_buckets; i++) {
                struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
                u32 id = rtnl_dereference(bucket->nh_entry)->nh->id;
                bool found = false;
                int j;

                for (j = 0; j < nhg->num_nh; j++) {
                        struct nh_grp_entry *nhge = &nhg->nh_entries[j];

                        if (nhge->nh->id == id) {
                                nh_res_bucket_set_nh(bucket, nhge);
                                found = true;
                                break;
                        }
                }

                if (!found)
                        nh_res_bucket_unset_nh(bucket);
        }
}

/* Adapt the resilient table of @oldg to the entries of @newg.
 *
 * The pending upkeep work is cancelled, the buckets are re-pointed at
 * @newg's entries, the wanted bucket counts are recomputed, and upkeep
 * is then run with notifier calls enabled and netlink bucket
 * notifications disabled.
 */
static void replace_nexthop_grp_res(struct nh_group *oldg,
                                    struct nh_group *newg)
{
        /* For NH group replacement, the new NHG might only have a stub
         * hash table with 0 buckets, because the number of buckets was not
         * specified. For NH removal, oldg and newg both reference the same
         * res_table. So in any case, in the following, we want to work
         * with oldg->res_table.
         */
        struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
        /* Snapshot the unbalanced state: nh_res_group_rebalance() resets
         * unbalanced_since when it finds the first underweight entry.
         */
        unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
        bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);

        nh_res_table_cancel_upkeep(old_res_table);
        nh_res_table_migrate_buckets(old_res_table, newg);
        nh_res_group_rebalance(newg, old_res_table);
        /* If the table was unbalanced before and still is, keep the
         * original timestamp instead of the one set by the rebalance.
         */
        if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
                old_res_table->unbalanced_since = prev_unbalanced_since;
        nh_res_table_upkeep(old_res_table, true, false);
}

static void nh_hthr_group_rebalance(struct nh_group *nhg)
{
        u32 total = 0;
        u32 w = 0;
        int i;

        for (i = 0; i < nhg->num_nh; ++i)
                total += nhg->nh_entries[i].weight;

        for (i = 0; i < nhg->num_nh; ++i) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
                u32 upper_bound;

                w += nhge->weight;
                upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1;
                atomic_set(&nhge->hthr.upper_bound, upper_bound);
        }
}

static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
                                struct nl_info *nlinfo)
{
        struct nh_grp_entry *nhges, *new_nhges;
        struct nexthop *nhp = nhge->nh_parent;
        struct netlink_ext_ack extack;
        struct nexthop *nh = nhge->nh;
        struct nh_group *nhg, *newg;
        int i, j, err;

        WARN_ON(!nh);

        nhg = rtnl_dereference(nhp->nh_grp);
        newg = nhg->spare;

        /* last entry, keep it visible and remove the parent */
        if (nhg->num_nh == 1) {
                remove_nexthop(net, nhp, nlinfo);
                return;
        }

        newg->has_v4 = false;
        newg->is_multipath = nhg->is_multipath;
        newg->hash_threshold = nhg->hash_threshold;
        newg->resilient = nhg->resilient;
        newg->fdb_nh = nhg->fdb_nh;
        newg->num_nh = nhg->num_nh;

        /* copy old entries to new except the one getting removed */
        nhges = nhg->nh_entries;
        new_nhges = newg->nh_entries;
        for (i = 0, j = 0; i < nhg->num_nh; ++i) {
                struct nh_info *nhi;

                /* current nexthop getting removed */
                if (nhg->nh_entries[i].nh == nh) {
                        newg->num_nh--;
                        continue;
                }

                nhi = rtnl_dereference(nhges[i].nh->nh_info);
                if (nhi->family == AF_INET)
                        newg->has_v4 = true;

                list_del(&nhges[i].nh_list);
                new_nhges[j].stats = nhges[i].stats;
                new_nhges[j].nh_parent = nhges[i].nh_parent;
                new_nhges[j].nh = nhges[i].nh;
                new_nhges[j].weight = nhges[i].weight;
                list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
                j++;
        }

        if (newg->hash_threshold)
                nh_hthr_group_rebalance(newg);
        else if (newg->resilient)
                replace_nexthop_grp_res(nhg, newg);

        rcu_assign_pointer(nhp->nh_grp, newg);

        list_del(&nhge->nh_list);
        free_percpu(nhge->stats);
        nexthop_put(nhge->nh);

        /* Removal of a NH from a resilient group is notified through
         * bucket notifications.
         */
        if (newg->hash_threshold) {
                err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
                                             &extack);
                if (err)
                        pr_err("%s\n", extack._msg);
        }

        if (nlinfo)
                nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
}

static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
                                       struct nl_info *nlinfo)
{
        struct nh_grp_entry *nhge, *tmp;

        list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
                remove_nh_grp_entry(net, nhge, nlinfo);

        /* make sure all see the newly published array before releasing rtnl */
        synchronize_net();
}

/* Tear down the membership links of group nexthop @nh.
 *
 * Every entry is unlinked from its member's group list (entries without
 * a nexthop trigger a warning and are skipped). For a resilient group
 * the table's pending upkeep work is cancelled as well. @nlinfo is not
 * used here.
 */
static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
{
        struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
        int i;

        for (i = 0; i < nhg->num_nh; ++i) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];

                if (!WARN_ON(!nhge->nh))
                        list_del_init(&nhge->nh_list);
        }

        if (!nhg->resilient)
                return;

        nh_res_table_cancel_upkeep(rtnl_dereference(nhg->res_table));
}

/* Detach all FIB entries from @nh. Not called for nexthop replace.
 *
 * IPv4 fib_infos linked to @nh are flagged RTNH_F_DEAD and flushed in
 * one pass; IPv6 routes linked to @nh are deleted one by one.
 */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
        struct fib6_info *rt6, *next;
        bool flush_needed = false;
        struct fib_info *fi;

        list_for_each_entry(fi, &nh->fi_list, nh_list) {
                fi->fib_flags |= RTNH_F_DEAD;
                flush_needed = true;
        }

        if (flush_needed)
                fib_flush(net);

        /* ip6_del_rt removes the entry from this list hence the _safe */
        list_for_each_entry_safe(rt6, next, &nh->f6i_list, nh_list) {
                /* __ip6_del_rt does a release, so do a hold here */
                fib6_info_hold(rt6);
                ipv6_stub->ip6_del_rt(net, rt6,
                                      !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
        }
}

/* Common removal work for @nh: detach its FIB entries, then either tear
 * down the group (for a group nexthop) or, for a single nexthop, unhash
 * it from its device list when it has a device and remove it from every
 * group that uses it.
 */
static void __remove_nexthop(struct net *net, struct nexthop *nh,
                             struct nl_info *nlinfo)
{
        struct nh_info *nhi;

        __remove_nexthop_fib(net, nh);

        if (nh->is_group) {
                remove_nexthop_group(nh, nlinfo);
                return;
        }

        nhi = rtnl_dereference(nh->nh_info);
        if (nhi->fib_nhc.nhc_dev)
                hlist_del(&nhi->dev_hash);

        remove_nexthop_from_groups(net, nh, nlinfo);
}

/* Fully remove nexthop @nh from @net.
 *
 * Listeners are told about the deletion first (no extack is supplied
 * and the notifier return value is ignored), the nexthop is erased from
 * the per-netns rb-tree, user space is notified if @nlinfo is given,
 * and the remaining teardown is done by __remove_nexthop(). Finally a
 * reference on @nh is dropped with nexthop_put().
 */
static void remove_nexthop(struct net *net, struct nexthop *nh,
                           struct nl_info *nlinfo)
{
        call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);

        /* remove from the tree */
        rb_erase(&nh->rb_node, &net->nexthop.rb_root);

        if (nlinfo)
                nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);

        __remove_nexthop(net, nh, nlinfo);
        nh_base_seq_inc(net);

        nexthop_put(nh);
}

/* if any FIB entries reference this nexthop, any dst entries
 * need to be regenerated
 */
static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
                              struct nexthop *replaced_nh)
{
        struct fib6_info *f6i;
        struct nh_group *nhg;
        int i;

        if (!list_empty(&nh->fi_list))
                rt_cache_flush(net);

        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
                ipv6_stub->fib6_update_sernum(net, f6i);

        /* if an IPv6 group was replaced, we have to release all old
         * dsts to make sure all refcounts are released
         */
        if (!replaced_nh->is_group)
                return;

        nhg = rtnl_dereference(replaced_nh->nh_grp);
        for (i = 0; i < nhg->num_nh; i++) {
                struct nh_grp_entry *nhge = &nhg->nh_entries[i];
                struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info);

                if (nhi->family == AF_INET6)
                        ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh);
        }
}

/* Replace the contents of group nexthop @old with those of @new.
 *
 * The two nexthops exchange their nh_group pointers: after a successful
 * return @old references the new group and @new references the old
 * one. For resilient groups the existing resilient table stays with
 * @old and is adapted to the new entries, while the table that came
 * with @new is handed to the old group.
 *
 * Returns 0 on success. Returns -EINVAL (with @extack set) if @new is
 * not a group, if the hash_threshold setting of the two groups differs,
 * or if @cfg asks for a different number of buckets on a resilient
 * group. Errors from the notifier chains are returned as-is.
 */
static int replace_nexthop_grp(struct net *net, struct nexthop *old,
                               struct nexthop *new, const struct nh_config *cfg,
                               struct netlink_ext_ack *extack)
{
        struct nh_res_table *tmp_table = NULL;
        struct nh_res_table *new_res_table;
        struct nh_res_table *old_res_table;
        struct nh_group *oldg, *newg;
        int i, err;

        if (!new->is_group) {
                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
                return -EINVAL;
        }

        oldg = rtnl_dereference(old->nh_grp);
        newg = rtnl_dereference(new->nh_grp);

        if (newg->hash_threshold != oldg->hash_threshold) {
                NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
                return -EINVAL;
        }

        if (newg->hash_threshold) {
                err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
                                             extack);
                if (err)
                        return err;
        } else if (newg->resilient) {
                new_res_table = rtnl_dereference(newg->res_table);
                old_res_table = rtnl_dereference(oldg->res_table);

                /* Accept if num_nh_buckets was not given, but if it was
                 * given, demand that the value be correct.
                 */
                if (cfg->nh_grp_res_has_num_buckets &&
                    cfg->nh_grp_res_num_buckets !=
                    old_res_table->num_nh_buckets) {
                        NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
                        return -EINVAL;
                }

                /* Emit a pre-replace notification so that listeners could veto
                 * a potentially unsupported configuration. Otherwise,
                 * individual bucket replacement notifications would need to be
                 * vetoed, which is something that should only happen if the
                 * bucket is currently active.
                 */
                err = call_nexthop_res_table_notifiers(net, new, extack);
                if (err)
                        return err;

                /* Timers are only overwritten when given in the config. */
                if (cfg->nh_grp_res_has_idle_timer)
                        old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
                if (cfg->nh_grp_res_has_unbalanced_timer)
                        old_res_table->unbalanced_timer =
                                cfg->nh_grp_res_unbalanced_timer;

                replace_nexthop_grp_res(oldg, newg);

                /* Remember the table that came with the new group; it is
                 * handed to the old group further down.
                 */
                tmp_table = new_res_table;
                rcu_assign_pointer(newg->res_table, old_res_table);
                rcu_assign_pointer(newg->spare->res_table, old_res_table);
        }

        /* update parents - used by nexthop code for cleanup */
        for (i = 0; i < newg->num_nh; i++)
                newg->nh_entries[i].nh_parent = old;

        rcu_assign_pointer(old->nh_grp, newg);

        /* Make sure concurrent readers are not using 'oldg' anymore. */
        synchronize_net();

        if (newg->resilient) {
                rcu_assign_pointer(oldg->res_table, tmp_table);
                rcu_assign_pointer(oldg->spare->res_table, tmp_table);
        }

        for (i = 0; i < oldg->num_nh; i++)
                oldg->nh_entries[i].nh_parent = new;

        rcu_assign_pointer(new->nh_grp, oldg);

        return 0;
}

/* Recompute @nhg->has_v4: true if at least one entry of the group
 * currently refers to an AF_INET nexthop, false otherwise.
 */
static void nh_group_v4_update(struct nh_group *nhg)
{
        bool any_v4 = false;
        int i;

        for (i = 0; i < nhg->num_nh; i++) {
                struct nh_info *nhi;

                nhi = rtnl_dereference(nhg->nh_entries[i].nh->nh_info);
                any_v4 |= (nhi->family == AF_INET);
        }

        nhg->has_v4 = any_v4;
}

/* Notify listeners, bucket by bucket, that nexthop @old used in a
 * resilient table changes from @oldi to @newi.
 *
 * Only buckets whose entry refers to @old are notified. If one
 * notification fails, the buckets already notified are notified again
 * with @newi and @oldi swapped, and the error is returned. Returns 0
 * when every notification succeeds.
 */
static int replace_nexthop_single_notify_res(struct net *net,
                                             struct nh_res_table *res_table,
                                             struct nexthop *old,
                                             struct nh_info *oldi,
                                             struct nh_info *newi,
                                             struct netlink_ext_ack *extack)
{
        u32 nhg_id = res_table->nhg_id;
        struct nh_grp_entry *nhge;
        int err;
        u16 i;

        for (i = 0; i < res_table->num_nh_buckets; i++) {
                nhge = rtnl_dereference(res_table->nh_buckets[i].nh_entry);
                if (nhge->nh != old)
                        continue;

                err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i,
                                                          true, oldi, newi,
                                                          extack);
                if (err)
                        goto err_notify;
        }

        return 0;

err_notify:
        /* Walk back over the buckets below the one that failed. */
        while (i-- > 0) {
                nhge = rtnl_dereference(res_table->nh_buckets[i].nh_entry);
                if (nhge->nh != old)
                        continue;

                __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true,
                                                    newi, oldi, extack);
        }
        return err;
}

/* Tell listeners that member @old of group @group_nh is being replaced.
 *
 * Hash-threshold groups get a single NEXTHOP_EVENT_REPLACE for the
 * whole group; resilient groups get per-bucket notifications. Any other
 * kind of group yields -EINVAL.
 */
static int replace_nexthop_single_notify(struct net *net,
                                         struct nexthop *group_nh,
                                         struct nexthop *old,
                                         struct nh_info *oldi,
                                         struct nh_info *newi,
                                         struct netlink_ext_ack *extack)
{
        struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp);

        if (nhg->hash_threshold)
                return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
                                              group_nh, extack);

        if (!nhg->resilient)
                return -EINVAL;

        return replace_nexthop_single_notify_res(net,
                                                 rtnl_dereference(nhg->res_table),
                                                 old, oldi, newi, extack);
}

/* Replace the single (non-group) nexthop @old with the spec in @new.
 *
 * The two nexthops exchange their nh_info pointers, and @old takes over
 * @new's protocol and flags. Every group that uses @old is then
 * notified of the replacement. If one of those notifications fails, the
 * exchange is undone, the groups already notified are notified again in
 * the reverse direction, and the error is returned.
 *
 * Returns 0 on success, -EINVAL if @new is a group, or the error
 * returned by a notifier.
 */
static int replace_nexthop_single(struct net *net, struct nexthop *old,
                                  struct nexthop *new,
                                  struct netlink_ext_ack *extack)
{
        u8 old_protocol, old_nh_flags;
        struct nh_info *oldi, *newi;
        struct nh_grp_entry *nhge;
        int err;

        if (new->is_group) {
                NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
                return -EINVAL;
        }

        err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
        if (err)
                return err;

        /* Hardware flags were set on 'old' as 'new' is not in the red-black
         * tree. Therefore, inherit the flags from 'old' to 'new'.
         */
        new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);

        oldi = rtnl_dereference(old->nh_info);
        newi = rtnl_dereference(new->nh_info);

        newi->nh_parent = old;
        oldi->nh_parent = new;

        /* Saved so that the error path can restore them. */
        old_protocol = old->protocol;
        old_nh_flags = old->nh_flags;

        old->protocol = new->protocol;
        old->nh_flags = new->nh_flags;

        rcu_assign_pointer(old->nh_info, newi);
        rcu_assign_pointer(new->nh_info, oldi);

        /* Send a replace notification for all the groups using the nexthop. */
        list_for_each_entry(nhge, &old->grp_list, nh_list) {
                struct nexthop *nhp = nhge->nh_parent;

                err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
                                                    extack);
                if (err)
                        goto err_notify;
        }

        /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
         * update IPv4 indication in all the groups using the nexthop.
         */
        if (oldi->family == AF_INET && newi->family == AF_INET6) {
                list_for_each_entry(nhge, &old->grp_list, nh_list) {
                        struct nexthop *nhp = nhge->nh_parent;
                        struct nh_group *nhg;

                        nhg = rtnl_dereference(nhp->nh_grp);
                        nh_group_v4_update(nhg);
                }
        }

        return 0;

err_notify:
        /* Undo the exchange done above. */
        rcu_assign_pointer(new->nh_info, newi);
        rcu_assign_pointer(old->nh_info, oldi);
        old->nh_flags = old_nh_flags;
        old->protocol = old_protocol;
        oldi->nh_parent = old;
        newi->nh_parent = new;
        /* Walk back over the groups that were notified before the
         * failure; no extack is passed and the result is ignored.
         */
        list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) {
                struct nexthop *nhp = nhge->nh_parent;

                replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL);
        }
        call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
        return err;
}

static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
                                     struct nl_info *info)
{
        struct fib6_info *f6i;

        if (!list_empty(&nh->fi_list)) {
                struct fib_info *fi;

                /* expectation is a few fib_info per nexthop and then
                 * a lot of routes per fib_info. So mark the fib_info
                 * and then walk the fib tables once
                 */
                list_for_each_entry(fi, &nh->fi_list, nh_list)
                        fi->nh_updated = true;

                fib_info_notify_update(net, info);

                list_for_each_entry(fi, &nh->fi_list, nh_list)
                        fi->nh_updated = false;
        }

        list_for_each_entry(f6i, &nh->f6i_list, nh_list)
                ipv6_stub->fib6_rt_update(net, f6i, info);
}

/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
 * linked to this nexthop and for all groups that the nexthop
 * is a member of
 */
static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
                                   struct nl_info *info)
{
        struct nh_grp_entry *nhge;

        __nexthop_replace_notify(net, nh, info);

        list_for_each_entry(nhge, &nh->grp_list, nh_list)
                __nexthop_replace_notify(net, nhge->nh_parent, info);
}

/* Replace the definition of the existing nexthop @old with @new.
 *
 * Before anything is modified, the FIB entries using @old - and those
 * using any group @old is a member of - are checked against @new, and
 * a blackhole @new is refused if @old sits in a group with more than
 * one path.
 *
 * On success the route cache is flushed for the pair, and @new is
 * removed and its reference dropped. On failure @new is left for the
 * caller to dispose of.
 *
 * Returns 0 or a negative errno.
 */
static int replace_nexthop(struct net *net, struct nexthop *old,
                           struct nexthop *new, const struct nh_config *cfg,
                           struct netlink_ext_ack *extack)
{
        struct nh_grp_entry *member;
        bool blackhole = false;
        int rc;

        /* routes already using @old must accept the new definition */
        rc = fib_check_nh_list(old, new, extack);
        if (rc)
                return rc;

        rc = fib6_check_nh_list(old, new, extack);
        if (rc)
                return rc;

        if (!new->is_group) {
                struct nh_info *new_info = rtnl_dereference(new->nh_info);

                blackhole = new_info->reject_nh;
        }

        list_for_each_entry(member, &old->grp_list, nh_list) {
                struct nexthop *group = member->nh_parent;

                /* a blackhole may only live in a single-path group */
                if (blackhole && nexthop_num_path(group) > 1) {
                        NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
                        return -EINVAL;
                }

                rc = fib_check_nh_list(group, new, extack);
                if (rc)
                        return rc;

                rc = fib6_check_nh_list(group, new, extack);
                if (rc)
                        return rc;
        }

        if (old->is_group)
                rc = replace_nexthop_grp(net, old, new, cfg, extack);
        else
                rc = replace_nexthop_single(net, old, new, extack);

        if (rc)
                return rc;

        nh_rt_cache_flush(net, old, new);

        __remove_nexthop(net, new, NULL);
        nexthop_put(new);

        return 0;
}

/* Insert @new_nh into the per-netns nexthop rb-tree, which is ordered by
 * nexthop id, or replace the existing entry that carries the same id.
 *
 * Returns 0 on success, otherwise:
 *   -EEXIST  the id already exists and NLM_F_REPLACE was not given;
 *   -ENOENT  NLM_F_REPLACE without NLM_F_CREATE and no such id exists;
 *   -EINVAL  a new resilient group was given without a bucket count;
 *   or the error returned by replace_nexthop() / the nexthop notifiers.
 *
 * On success the base sequence is bumped and RTM_NEWNEXTHOP is sent; after
 * a replace, route replace notifications are additionally sent when the
 * nexthop_compat_mode sysctl is set.
 *
 * called with rtnl_lock held
 */
static int insert_nexthop(struct net *net, struct nexthop *new_nh,
                          struct nh_config *cfg, struct netlink_ext_ack *extack)
{
        struct rb_node **pp, *parent = NULL, *next;
        struct rb_root *root = &net->nexthop.rb_root;
        bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
        bool create = !!(cfg->nlflags & NLM_F_CREATE);
        u32 new_id = new_nh->id;
        int replace_notify = 0;
        int rc = -EEXIST;

        /* Descend the tree looking for new_id; on exit from the loop pp
         * and parent identify the empty slot where a new node is linked.
         */
        pp = &root->rb_node;
        while (1) {
                struct nexthop *nh;

                next = *pp;
                if (!next)
                        break;

                parent = next;

                nh = rb_entry(parent, struct nexthop, rb_node);
                if (new_id < nh->id) {
                        pp = &next->rb_left;
                } else if (new_id > nh->id) {
                        pp = &next->rb_right;
                } else if (replace) {
                        rc = replace_nexthop(net, nh, new_nh, cfg, extack);
                        if (!rc) {
                                new_nh = nh; /* send notification with old nh */
                                replace_notify = 1;
                        }
                        goto out;
                } else {
                        /* id already exists and not a replace */
                        goto out;
                }
        }

        /* No entry with this id: a pure replace request cannot proceed. */
        if (replace && !create) {
                NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
                rc = -ENOENT;
                goto out;
        }

        if (new_nh->is_group) {
                struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
                struct nh_res_table *res_table;

                if (nhg->resilient) {
                        res_table = rtnl_dereference(nhg->res_table);

                        /* Not passing the number of buckets is OK when
                         * replacing, but not when creating a new group.
                         */
                        if (!cfg->nh_grp_res_has_num_buckets) {
                                NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
                                rc = -EINVAL;
                                goto out;
                        }

                        nh_res_group_rebalance(nhg, res_table);

                        /* Do not send bucket notifications, we do full
                         * notification below.
                         */
                        nh_res_table_upkeep(res_table, false, false);
                }
        }

        rb_link_node_rcu(&new_nh->rb_node, parent, pp);
        rb_insert_color(&new_nh->rb_node, root);

        /* The initial insertion is a full notification for hash-threshold as
         * well as resilient groups.
         */
        rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
        /* A notifier vetoed the nexthop: unlink it from the tree again. rc
         * stays non-zero, so nothing is announced to user space below.
         */
        if (rc)
                rb_erase(&new_nh->rb_node, &net->nexthop.rb_root);

out:
        if (!rc) {
                nh_base_seq_inc(net);
                nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
                /* After a replace new_nh points at the pre-existing nexthop
                 * object, so the route notifications are sent for it.
                 */
                if (replace_notify &&
                    READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
                        nexthop_replace_notify(net, new_nh, &cfg->nlinfo);
        }

        return rc;
}

/* rtnl */
/* Remove the nexthops bound to @dev in response to netdev @event.
 *
 * Blackhole (reject) nexthops are kept on NETDEV_DOWN and NETDEV_CHANGE;
 * any other event removes them together with the regular nexthops.
 */
static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
{
        struct net *net = dev_net(dev);
        struct hlist_head *bucket;
        struct hlist_node *tmp;
        struct nh_info *nhi;

        bucket = &net->nexthop.devhash[nh_dev_hashfn(dev->ifindex)];

        /* _safe iteration: the current entry may be removed in the body */
        hlist_for_each_entry_safe(nhi, tmp, bucket, dev_hash) {
                /* the hash bucket can hold entries of other devices */
                if (nhi->fib_nhc.nhc_dev == dev &&
                    (!nhi->reject_nh ||
                     (event != NETDEV_DOWN && event != NETDEV_CHANGE)))
                        remove_nexthop(net, nhi->nh_parent, NULL);
        }
}

/* rtnl; called when net namespace is deleted
 *
 * Repeatedly take the leftmost node of the nexthop tree and remove it
 * until the tree is empty, yielding the CPU between removals.
 */
static void flush_all_nexthops(struct net *net)
{
        struct rb_root *tree = &net->nexthop.rb_root;
        struct rb_node *first;

        for (first = rb_first(tree); first; first = rb_first(tree)) {
                remove_nexthop(net, rb_entry(first, struct nexthop, rb_node),
                               NULL);
                cond_resched();
        }
}

/* Build a group nexthop from the group attribute stored in cfg->nh_grp.
 *
 * For each member listed in the attribute this takes a reference on the
 * member nexthop, allocates its per-CPU stats and links the group entry
 * on the member's grp_list. The group type (multipath or resilient) is
 * taken from cfg->nh_grp_type.
 *
 * Returns the new nexthop, or ERR_PTR(-ENOMEM) on allocation failure or
 * ERR_PTR(-ENOENT) when a member id cannot be looked up and referenced.
 * On failure everything set up so far is undone.
 */
static struct nexthop *nexthop_create_group(struct net *net,
                                            struct nh_config *cfg)
{
        struct nlattr *grps_attr = cfg->nh_grp;
        struct nexthop_grp *entry = nla_data(grps_attr);
        /* NOTE(review): assumes the attribute length was validated as a
         * non-zero multiple of sizeof(*entry) before this point - confirm
         * against the attribute checks run by the caller.
         */
        u16 num_nh = nla_len(grps_attr) / sizeof(*entry);
        struct nh_group *nhg;
        struct nexthop *nh;
        int err;
        int i;

        nh = nexthop_alloc();
        if (!nh)
                return ERR_PTR(-ENOMEM);

        nh->is_group = 1;

        nhg = nexthop_grp_alloc(num_nh);
        if (!nhg) {
                kfree(nh);
                return ERR_PTR(-ENOMEM);
        }

        /* spare group used for removals */
        nhg->spare = nexthop_grp_alloc(num_nh);
        if (!nhg->spare) {
                kfree(nhg);
                kfree(nh);
                return ERR_PTR(-ENOMEM);
        }
        /* the two groups point at each other */
        nhg->spare->spare = nhg;

        for (i = 0; i < nhg->num_nh; ++i) {
                struct nexthop *nhe;
                struct nh_info *nhi;

                nhe = nexthop_find_by_id(net, entry[i].id);
                if (!nexthop_get(nhe)) {
                        err = -ENOENT;
                        goto out_no_nh;
                }

                nhi = rtnl_dereference(nhe->nh_info);
                if (nhi->family == AF_INET)
                        nhg->has_v4 = true;

                nhg->nh_entries[i].stats =
                        netdev_alloc_pcpu_stats(struct nh_grp_entry_stats);
                if (!nhg->nh_entries[i].stats) {
                        err = -ENOMEM;
                        /* entry i is not unwound below, so drop the
                         * reference taken on it here
                         */
                        nexthop_put(nhe);
                        goto out_no_nh;
                }
                nhg->nh_entries[i].nh = nhe;
                nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);

                list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
                nhg->nh_entries[i].nh_parent = nh;
        }

        if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
                nhg->hash_threshold = 1;
                nhg->is_multipath = true;
        } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
                struct nh_res_table *res_table;

                res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
                if (!res_table) {
                        err = -ENOMEM;
                        /* i == nhg->num_nh here, so every entry is unwound */
                        goto out_no_nh;
                }

                /* the group and its spare share one resilient table */
                rcu_assign_pointer(nhg->spare->res_table, res_table);
                rcu_assign_pointer(nhg->res_table, res_table);
                nhg->resilient = true;
                nhg->is_multipath = true;
        }

        /* exactly one of the two group types must have been selected */
        WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1);

        if (nhg->hash_threshold)
                nh_hthr_group_rebalance(nhg);

        if (cfg->nh_fdb)
                nhg->fdb_nh = 1;

        if (cfg->nh_hw_stats)
                nhg->hw_stats = true;

        rcu_assign_pointer(nh->nh_grp, nhg);

        return nh;

out_no_nh:
        /* undo entries [0, i): unlink, free stats, drop the reference */
        for (i--; i >= 0; --i) {
                list_del(&nhg->nh_entries[i].nh_list);
                free_percpu(nhg->nh_entries[i].stats);
                nexthop_put(nhg->nh_entries[i].nh);
        }

        kfree(nhg->spare);
        kfree(nhg);
        kfree(nh);

        return ERR_PTR(err);
}

static int nh_create_ipv4(struct net *net, struct nexthop *nh,
                          struct nh_info *nhi, struct nh_config *cfg,
                          struct netlink_ext_ack *extack)
{
        struct fib_nh *fib_nh = &nhi->fib_nh;
        struct fib_config fib_cfg = {
                .fc_oif   = cfg->nh_ifindex,
                .fc_gw4   = cfg->gw.ipv4,
                .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0,
                .fc_flags = cfg->nh_flags,
                .fc_nlinfo = cfg->nlinfo,
                .fc_encap = cfg->nh_encap,
                .fc_encap_type = cfg->nh_encap_type,
        };
        u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
        int err;

        err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
        if (err) {
                fib_nh_release(net, fib_nh);
                goto out;
        }

        if (nhi->fdb_nh)
                goto out;

        /* sets nh_dev if successful */
        err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
        if (!err) {
                nh->nh_flags = fib_nh->fib_nh_flags;
                fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
                                          !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1);
        } else {
                fib_nh_release(net, fib_nh);
        }
out:
        return err;
}

static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
                          struct nh_info *nhi, struct nh_config *cfg,
                          struct netlink_ext_ack *extack)
{
        struct fib6_nh *fib6_nh = &nhi->fib6_nh;
        struct fib6_config fib6_cfg = {
                .fc_table = l3mdev_fib_table(cfg->dev),
                .fc_ifindex = cfg->nh_ifindex,
                .fc_gateway = cfg->gw.ipv6,
                .fc_flags = cfg->nh_flags,
                .fc_nlinfo = cfg->nlinfo,
                .fc_encap = cfg->nh_encap,
                .fc_encap_type = cfg->nh_encap_type,
                .fc_is_fdb = cfg->nh_fdb,
        };
        int err;

        if (!ipv6_addr_any(&cfg->gw.ipv6))
                fib6_cfg.fc_flags |= RTF_GATEWAY;

        /* sets nh_dev if successful */
        err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
                                      extack);
        if (err) {
                /* IPv6 is not enabled, don't call fib6_nh_release */
                if (err == -EAFNOSUPPORT)
                        goto out;
                ipv6_stub->fib6_nh_release(fib6_nh);
        } else {
                nh->nh_flags = fib6_nh->fib_nh_flags;
        }
out:
        return err;
}

/* Allocate a single (non-group) nexthop and its nh_info from @cfg.
 *
 * A blackhole request marks the nexthop as reject and rewrites
 * cfg->nh_ifindex to the namespace's loopback device. The family
 * specific part is set up by nh_create_ipv4() / nh_create_ipv6().
 * Non-fdb nexthops are added to the device hash.
 *
 * Returns the nexthop or an ERR_PTR.
 */
static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
                                      struct netlink_ext_ack *extack)
{
        struct nh_info *info;
        struct nexthop *nh;
        int rc = 0;

        nh = nexthop_alloc();
        if (!nh)
                return ERR_PTR(-ENOMEM);

        info = kzalloc(sizeof(*info), GFP_KERNEL);
        if (!info) {
                rc = -ENOMEM;
                goto free_nh;
        }

        nh->nh_flags = cfg->nh_flags;
        nh->net = net;

        info->nh_parent = nh;
        info->family = cfg->nh_family;
        info->fib_nhc.nhc_scope = RT_SCOPE_LINK;

        if (cfg->nh_fdb)
                info->fdb_nh = 1;

        if (cfg->nh_blackhole) {
                info->reject_nh = 1;
                cfg->nh_ifindex = net->loopback_dev->ifindex;
        }

        /* any other family leaves rc at 0 and skips family setup */
        if (cfg->nh_family == AF_INET)
                rc = nh_create_ipv4(net, nh, info, cfg, extack);
        else if (cfg->nh_family == AF_INET6)
                rc = nh_create_ipv6(net, nh, info, cfg, extack);

        if (rc)
                goto free_info;

        /* add the entry to the device based hash */
        if (!info->fdb_nh)
                nexthop_devhash_add(net, info);

        rcu_assign_pointer(nh->nh_info, info);

        return nh;

free_info:
        kfree(info);
free_nh:
        kfree(nh);
        return ERR_PTR(rc);
}

/* Create a nexthop (single or group) described by @cfg and insert it.
 *
 * When no id was requested an unused one is picked and stored back in
 * cfg->nh_id. Returns the nexthop passed to insert_nexthop(), or an
 * ERR_PTR on failure.
 *
 * called with rtnl lock held
 */
static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
                                   struct netlink_ext_ack *extack)
{
        struct nexthop *nh;
        int rc;

        if (!cfg->nh_id) {
                cfg->nh_id = nh_find_unused_id(net);
                if (!cfg->nh_id) {
                        NL_SET_ERR_MSG(extack, "No unused id");
                        return ERR_PTR(-EINVAL);
                }
        }

        nh = cfg->nh_grp ? nexthop_create_group(net, cfg)
                         : nexthop_create(net, cfg, extack);
        if (IS_ERR(nh))
                return nh;

        refcount_set(&nh->refcnt, 1);
        nh->id = cfg->nh_id;
        nh->protocol = cfg->nh_protocol;
        nh->net = net;

        rc = insert_nexthop(net, nh, cfg, extack);
        if (!rc)
                return nh;

        /* insertion (or replace) failed: tear the new nexthop down */
        __remove_nexthop(net, nh, NULL);
        nexthop_put(nh);

        return ERR_PTR(rc);
}

/* Read a timer attribute given in clock_t units and convert it to jiffies.
 *
 * Without @attr, *@timer_p is set to @fallback and *@has_p to false.
 * With @attr, *@timer_p gets the converted value and *@has_p is true.
 * A conversion result of ~0UL is rejected with -EINVAL and leaves both
 * outputs untouched.
 */
static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
                            unsigned long *timer_p, bool *has_p,
                            struct netlink_ext_ack *extack)
{
        unsigned long jiffies_val;

        if (attr) {
                jiffies_val = clock_t_to_jiffies(nla_get_u32(attr));
                if (jiffies_val == ~0UL) {
                        NL_SET_ERR_MSG(extack, "Timer value too large");
                        return -EINVAL;
                }

                *timer_p = jiffies_val;
                *has_p = true;
        } else {
                *timer_p = fallback;
                *has_p = false;
        }

        return 0;
}

/* Fill the resilient-group part of @cfg from the nested attribute @res.
 *
 * @res may be NULL, in which case no bucket count is recorded and both
 * timers fall back to their defaults. A bucket count of 0 is rejected.
 *
 * Returns 0 or a negative errno.
 */
static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
                                    struct netlink_ext_ack *extack)
{
        struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
        struct nlattr *buckets;
        int rc;

        if (res) {
                rc = nla_parse_nested(tb,
                                      ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
                                      res, rtm_nh_res_policy_new, extack);
                if (rc < 0)
                        return rc;
        }

        buckets = tb[NHA_RES_GROUP_BUCKETS];
        if (buckets) {
                cfg->nh_grp_res_num_buckets = nla_get_u16(buckets);
                cfg->nh_grp_res_has_num_buckets = true;
                if (cfg->nh_grp_res_num_buckets == 0) {
                        NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
                        return -EINVAL;
                }
        }

        rc = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
                              NH_RES_DEFAULT_IDLE_TIMER,
                              &cfg->nh_grp_res_idle_timer,
                              &cfg->nh_grp_res_has_idle_timer,
                              extack);
        if (!rc)
                rc = rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
                                      NH_RES_DEFAULT_UNBALANCED_TIMER,
                                      &cfg->nh_grp_res_unbalanced_timer,
                                      &cfg->nh_grp_res_has_unbalanced_timer,
                                      extack);

        return rc;
}

/* Translate a parsed RTM_NEWNEXTHOP request into @cfg.
 *
 * Validates the nhmsg header and the combination of attributes in @tb,
 * then fills @cfg. The output device is not resolved here; that is done
 * by rtm_to_nh_config_rtnl(). @cfg is zeroed only after the header checks
 * have passed, so it is left untouched when those fail.
 *
 * Three kinds of request are recognised, in this order:
 *   - group (NHA_GROUP): family must be AF_UNSPEC; only the group
 *     attributes are consumed;
 *   - blackhole (NHA_BLACKHOLE): no gateway/oif/encap/fdb allowed;
 *   - regular or fdb nexthop: optional gateway and encap; NHA_OIF is
 *     mandatory unless NHA_FDB is set.
 *
 * Returns 0 on success, or a negative errno (-EINVAL for an invalid
 * request, or the error of the group / encap validation helpers) with a
 * message in @extack.
 */
static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
                            struct nlmsghdr *nlh, struct nlattr **tb,
                            struct nh_config *cfg,
                            struct netlink_ext_ack *extack)
{
        struct nhmsg *nhm = nlmsg_data(nlh);
        int err;

        /* default for all the "goto out" validation failures below */
        err = -EINVAL;
        if (nhm->resvd || nhm->nh_scope) {
                NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
                goto out;
        }
        if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
                NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
                goto out;
        }

        switch (nhm->nh_family) {
        case AF_INET:
        case AF_INET6:
                break;
        case AF_UNSPEC:
                /* AF_UNSPEC is only acceptable for groups */
                if (tb[NHA_GROUP])
                        break;
                fallthrough;
        default:
                NL_SET_ERR_MSG(extack, "Invalid address family");
                goto out;
        }

        memset(cfg, 0, sizeof(*cfg));
        cfg->nlflags = nlh->nlmsg_flags;
        cfg->nlinfo.portid = NETLINK_CB(skb).portid;
        cfg->nlinfo.nlh = nlh;
        cfg->nlinfo.nl_net = net;

        cfg->nh_family = nhm->nh_family;
        cfg->nh_protocol = nhm->nh_protocol;
        cfg->nh_flags = nhm->nh_flags;

        if (tb[NHA_ID])
                cfg->nh_id = nla_get_u32(tb[NHA_ID]);

        if (tb[NHA_FDB]) {
                if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE]) {
                        NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
                        goto out;
                }
                if (nhm->nh_flags) {
                        NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
                        goto out;
                }
                cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
        }

        if (tb[NHA_GROUP]) {
                if (nhm->nh_family != AF_UNSPEC) {
                        NL_SET_ERR_MSG(extack, "Invalid family for group");
                        goto out;
                }
                cfg->nh_grp = tb[NHA_GROUP];

                /* multipath unless a group type is given explicitly */
                cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
                if (tb[NHA_GROUP_TYPE])
                        cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);

                if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
                        NL_SET_ERR_MSG(extack, "Invalid group type");
                        goto out;
                }

                err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new),
                                          cfg->nh_grp_type, extack);
                if (err)
                        goto out;

                if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
                        err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
                                                       cfg, extack);

                if (tb[NHA_HW_STATS_ENABLE])
                        cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]);

                /* no other attributes should be set */
                goto out;
        }

        if (tb[NHA_BLACKHOLE]) {
                if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
                    tb[NHA_ENCAP]   || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
                        NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
                        goto out;
                }

                cfg->nh_blackhole = 1;
                err = 0;
                goto out;
        }

        if (!cfg->nh_fdb && !tb[NHA_OIF]) {
                NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
                goto out;
        }

        err = -EINVAL;
        if (tb[NHA_GATEWAY]) {
                struct nlattr *gwa = tb[NHA_GATEWAY];

                /* the gateway payload size must match the address family */
                switch (cfg->nh_family) {
                case AF_INET:
                        if (nla_len(gwa) != sizeof(u32)) {
                                NL_SET_ERR_MSG(extack, "Invalid gateway");
                                goto out;
                        }
                        cfg->gw.ipv4 = nla_get_be32(gwa);
                        break;
                case AF_INET6:
                        if (nla_len(gwa) != sizeof(struct in6_addr)) {
                                NL_SET_ERR_MSG(extack, "Invalid gateway");
                                goto out;
                        }
                        cfg->gw.ipv6 = nla_get_in6_addr(gwa);
                        break;
                default:
                        NL_SET_ERR_MSG(extack,
                                       "Unknown address family for gateway");
                        goto out;
                }
        } else {
                /* device only nexthop (no gateway) */
                if (cfg->nh_flags & RTNH_F_ONLINK) {
                        NL_SET_ERR_MSG(extack,
                                       "ONLINK flag can not be set for nexthop without a gateway");
                        goto out;
                }
        }

        if (tb[NHA_ENCAP]) {
                cfg->nh_encap = tb[NHA_ENCAP];

                if (!tb[NHA_ENCAP_TYPE]) {
                        NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
                        goto out;
                }

                cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
                /* on success this leaves err == 0 */
                err = lwtunnel_valid_encap_type(cfg->nh_encap_type,
                                                extack, false);
                if (err < 0)
                        goto out;

        } else if (tb[NHA_ENCAP_TYPE]) {
                NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
                goto out;
        }

        if (tb[NHA_HW_STATS_ENABLE]) {
                NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops");
                /* err may have been reset to 0 by the encap validation
                 * above, so set the failure code explicitly.
                 */
                err = -EINVAL;
                goto out;
        }

        err = 0;
out:
        return err;
}

/* Complete @cfg with the checks that look up live objects.
 *
 * Group requests are handed to nh_check_attr_group_rtnl(). Otherwise,
 * when NHA_OIF is present, the device is looked up by index and must
 * exist, be IFF_UP and have carrier.
 *
 * Returns 0, -EINVAL for an unknown device index, -ENETDOWN when the
 * device is down or has no carrier, or the group check's result.
 */
static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb,
                                 struct nh_config *cfg,
                                 struct netlink_ext_ack *extack)
{
        if (tb[NHA_GROUP])
                return nh_check_attr_group_rtnl(net, tb, extack);

        /* nothing to resolve without an output device */
        if (!tb[NHA_OIF])
                return 0;

        cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
        if (cfg->nh_ifindex)
                cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);

        if (!cfg->dev) {
                NL_SET_ERR_MSG(extack, "Invalid device index");
                return -EINVAL;
        }

        if (!(cfg->dev->flags & IFF_UP)) {
                NL_SET_ERR_MSG(extack, "Nexthop device is not up");
                return -ENETDOWN;
        }

        if (!netif_carrier_ok(cfg->dev)) {
                NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
                return -ENETDOWN;
        }

        return 0;
}

/* rtnl */
/* RTM_NEWNEXTHOP handler: parse and validate the request, then create
 * or replace the nexthop under the per-netns RTNL lock.
 *
 * Returns 0 or a negative errno.
 */
static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct netlink_ext_ack *extack)
{
        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
        struct net *net = sock_net(skb->sk);
        struct nh_config cfg;
        struct nexthop *nh;
        int rc;

        rc = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
                         ARRAY_SIZE(rtm_nh_policy_new) - 1,
                         rtm_nh_policy_new, extack);
        if (rc < 0)
                return rc;

        rc = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack);
        if (rc)
                return rc;

        if ((cfg.nlflags & NLM_F_REPLACE) && !cfg.nh_id) {
                NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
                return -EINVAL;
        }

        rtnl_net_lock(net);

        rc = rtm_to_nh_config_rtnl(net, tb, &cfg, extack);
        if (!rc) {
                nh = nexthop_add(net, &cfg, extack);
                if (IS_ERR(nh))
                        rc = PTR_ERR(nh);
        }

        rtnl_net_unlock(net);

        return rc;
}

/* Validate a get/delete request and extract its nexthop id.
 *
 * The header must carry no protocol, reserved, scope or flag bits, and
 * NHA_ID must be present and non-zero. *@id is written as soon as NHA_ID
 * is read, i.e. also when the id turns out to be 0. When @op_flags is
 * non-NULL it receives NHA_OP_FLAGS, or 0 if the attribute is absent.
 *
 * Returns 0 or -EINVAL.
 */
static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
                                struct nlattr **tb, u32 *id, u32 *op_flags,
                                struct netlink_ext_ack *extack)
{
        const struct nhmsg *hdr = nlmsg_data(nlh);
        struct nlattr *id_attr = tb[NHA_ID];

        if (hdr->nh_protocol || hdr->resvd || hdr->nh_scope || hdr->nh_flags) {
                NL_SET_ERR_MSG(extack, "Invalid values in header");
                return -EINVAL;
        }

        if (!id_attr) {
                NL_SET_ERR_MSG(extack, "Nexthop id is missing");
                return -EINVAL;
        }

        *id = nla_get_u32(id_attr);
        if (*id == 0) {
                NL_SET_ERR_MSG(extack, "Invalid nexthop id");
                return -EINVAL;
        }

        if (op_flags)
                *op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);

        return 0;
}

/* rtnl */
/* RTM_DELNEXTHOP handler: look the nexthop up by id under the per-netns
 * RTNL lock and remove it.
 *
 * Returns 0, -ENOENT when the id is unknown, or the parse/validation
 * error.
 */
static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
                           struct netlink_ext_ack *extack)
{
        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)];
        struct net *net = sock_net(skb->sk);
        struct nl_info nlinfo = {
                .portid = NETLINK_CB(skb).portid,
                .nl_net = net,
                .nlh = nlh,
        };
        struct nexthop *victim;
        u32 id;
        int rc;

        rc = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
                         ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del,
                         extack);
        if (rc < 0)
                return rc;

        rc = nh_valid_get_del_req(nlh, tb, &id, NULL, extack);
        if (rc)
                return rc;

        rtnl_net_lock(net);

        /* rc is 0 at this point and stays 0 when the nexthop is found */
        victim = nexthop_find_by_id(net, id);
        if (!victim)
                rc = -ENOENT;
        else
                remove_nexthop(net, victim, &nlinfo);

        rtnl_net_unlock(net);

        return rc;
}

/* rtnl */
/* RTM_GETNEXTHOP handler: build an RTM_NEWNEXTHOP message describing the
 * requested nexthop and unicast it back to the sender.
 *
 * Returns the result of rtnl_unicast(), -ENOBUFS when no skb could be
 * allocated, -ENOENT for an unknown id, or the parse/validation/fill
 * error.
 */
static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                           struct netlink_ext_ack *extack)
{
        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
        struct net *net = sock_net(in_skb->sk);
        struct sk_buff *reply;
        struct nexthop *nh;
        u32 op_flags;
        u32 id;
        int rc;

        rc = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
                         ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get,
                         extack);
        if (rc < 0)
                return rc;

        rc = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
        if (rc)
                return rc;

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!reply)
                return -ENOBUFS;

        nh = nexthop_find_by_id(net, id);
        if (!nh) {
                rc = -ENOENT;
                goto free_reply;
        }

        rc = nh_fill_node(reply, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
                          nlh->nlmsg_seq, 0, op_flags);
        if (rc < 0) {
                /* the reply buffer is expected to be large enough */
                WARN_ON(rc == -EMSGSIZE);
                goto free_reply;
        }

        return rtnl_unicast(reply, net, NETLINK_CB(in_skb).portid);

free_reply:
        kfree_skb(reply);
        return rc;
}

/* Selection criteria for a nexthop dump request. */
struct nh_dump_filter {
        /* NOTE(review): not written by the dump-request parsers visible
         * in this part of the file - confirm where it is set and used.
         */
        u32 nh_id;
        /* from NHA_OIF: only nexthops whose device has this ifindex */
        int dev_idx;
        /* from NHA_MASTER: only nexthops whose device's master upper
         * device has this ifindex
         */
        int master_idx;
        /* NHA_GROUPS flag: only group nexthops */
        bool group_filter;
        /* NHA_FDB flag; NOTE(review): stored by __nh_valid_dump_req() but
         * not consulted by nh_dump_filtered() - confirm intended use.
         */
        bool fdb_filter;
        /* NOTE(review): not written by the dump-request parsers visible
         * in this part of the file - confirm where it is set and used.
         */
        u32 res_bucket_nh_id;
        /* value of NHA_OP_FLAGS, 0 when the attribute is absent */
        u32 op_flags;
};

/* Decide whether @nh must be left out of a dump.
 *
 * Returns true when the nexthop is filtered out, false when it should be
 * dumped. Groups never match a device, master or family filter; with
 * none of those set, only the group filter applies.
 */
static bool nh_dump_filtered(struct nexthop *nh,
                             struct nh_dump_filter *filter, u8 family)
{
        const struct net_device *nh_dev;
        const struct nh_info *info;
        struct net_device *upper;

        if (filter->group_filter && !nh->is_group)
                return true;

        /* no per-nexthop criteria requested: everything else passes */
        if (!(filter->dev_idx || filter->master_idx || family))
                return false;

        if (nh->is_group)
                return true;

        info = rtnl_dereference(nh->nh_info);
        if (family && info->family != family)
                return true;

        nh_dev = info->fib_nhc.nhc_dev;
        if (filter->dev_idx) {
                if (!nh_dev || nh_dev->ifindex != filter->dev_idx)
                        return true;
        }

        if (!filter->master_idx)
                return false;

        if (!nh_dev)
                return true;

        upper = netdev_master_upper_dev_get((struct net_device *)nh_dev);

        return !upper || upper->ifindex != filter->master_idx;
}

static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
                               struct nh_dump_filter *filter,
                               struct netlink_ext_ack *extack)
{
        struct nhmsg *nhm;
        u32 idx;

        if (tb[NHA_OIF]) {
                idx = nla_get_u32(tb[NHA_OIF]);
                if (idx > INT_MAX) {
                        NL_SET_ERR_MSG(extack, "Invalid device index");
                        return -EINVAL;
                }
                filter->dev_idx = idx;
        }
        if (tb[NHA_MASTER]) {
                idx = nla_get_u32(tb[NHA_MASTER]);
                if (idx > INT_MAX) {
                        NL_SET_ERR_MSG(extack, "Invalid master device index");
                        return -EINVAL;
                }
                filter->master_idx = idx;
        }
        filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
        filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);

        nhm = nlmsg_data(nlh);
        if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
                NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
                return -EINVAL;
        }

        return 0;
}

/* Parse and validate a nexthop dump request.
 *
 * The attributes are parsed against rtm_nh_policy_dump, NHA_OP_FLAGS is
 * copied into @filter (0 when absent) and the remaining checks are left to
 * __nh_valid_dump_req().
 *
 * Returns 0 on success or a negative errno.
 */
static int nh_valid_dump_req(const struct nlmsghdr *nlh,
                             struct nh_dump_filter *filter,
                             struct netlink_callback *cb)
{
        struct nlattr *attrs[ARRAY_SIZE(rtm_nh_policy_dump)];
        int rc;

        rc = nlmsg_parse(nlh, sizeof(struct nhmsg), attrs,
                         ARRAY_SIZE(rtm_nh_policy_dump) - 1,
                         rtm_nh_policy_dump, cb->extack);
        if (rc < 0)
                return rc;

        filter->op_flags = nla_get_u32_default(attrs[NHA_OP_FLAGS], 0);

        return __nh_valid_dump_req(nlh, attrs, filter, cb->extack);
}

/* Nexthop dump state, overlaid on the netlink callback's ctx area by
 * rtm_dump_nh_ctx().
 */
struct rtm_dump_nh_ctx {
        /* ID of the nexthop most recently handed to the walk callback;
         * rtm_dump_walk_nexthops() skips nexthops with a smaller ID.
         */
        u32 idx;
};

/* Return the nexthop dump state stored in @cb's ctx area. The build fails
 * if the state structure does not fit into that area.
 */
static struct rtm_dump_nh_ctx *
rtm_dump_nh_ctx(struct netlink_callback *cb)
{
        BUILD_BUG_ON(sizeof(struct rtm_dump_nh_ctx) > sizeof(cb->ctx));

        return (void *)cb->ctx;
}

static int rtm_dump_walk_nexthops(struct sk_buff *skb,
                                  struct netlink_callback *cb,
                                  struct rb_root *root,
                                  struct rtm_dump_nh_ctx *ctx,
                                  int (*nh_cb)(struct sk_buff *skb,
                                               struct netlink_callback *cb,
                                               struct nexthop *nh, void *data),
                                  void *data)
{
        struct rb_node *node;
        int s_idx;
        int err;

        s_idx = ctx->idx;
        for (node = rb_first(root); node; node = rb_next(node)) {
                struct nexthop *nh;

                nh = rb_entry(node, struct nexthop, rb_node);
                if (nh->id < s_idx)
                        continue;

                ctx->idx = nh->id;
                err = nh_cb(skb, cb, nh, data);
                if (err)
                        return err;
        }

        return 0;
}

static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
                               struct nexthop *nh, void *data)
{
        struct nhmsg *nhm = nlmsg_data(cb->nlh);
        struct nh_dump_filter *filter = data;

        if (nh_dump_filtered(nh, filter, nhm->nh_family))
                return 0;

        return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
                            NETLINK_CB(cb->skb).portid,
                            cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags);
}

/* rtnl
 *
 * Dump handler for nexthops: validate the request, walk the namespace's
 * nexthop tree and record the nexthop sequence number for the dump
 * consistency check.
 */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        struct nh_dump_filter filter = {};
        int err;

        err = nh_valid_dump_req(cb->nlh, &filter, cb);
        if (err < 0)
                return err;

        err = rtm_dump_walk_nexthops(skb, cb, &net->nexthop.rb_root,
                                     rtm_dump_nh_ctx(cb),
                                     &rtm_dump_nexthop_cb, &filter);

        /* Done regardless of how the walk ended. */
        cb->seq = net->nexthop.seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));

        return err;
}

/* Look up nexthop @id in @net and make sure it is a resilient group.
 *
 * Returns the nexthop, ERR_PTR(-ENOENT) when no such nexthop exists, or
 * ERR_PTR(-EINVAL) with a message in @extack when it is not a group or the
 * group is not resilient.
 */
static struct nexthop *
nexthop_find_group_resilient(struct net *net, u32 id,
                             struct netlink_ext_ack *extack)
{
        struct nexthop *nh = nexthop_find_by_id(net, id);
        struct nh_group *nhg;

        if (!nh)
                return ERR_PTR(-ENOENT);

        if (nh->is_group) {
                nhg = rtnl_dereference(nh->nh_grp);
                if (nhg->resilient)
                        return nh;

                NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
                return ERR_PTR(-EINVAL);
        }

        NL_SET_ERR_MSG(extack, "Not a nexthop group");
        return ERR_PTR(-EINVAL);
}

/* Read an optional nexthop ID attribute.
 *
 * A missing @attr yields 0 in @nh_id_p. A present attribute must carry a
 * non-zero ID; otherwise -EINVAL is returned, a message is set in @extack
 * and @nh_id_p is left untouched.
 */
static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
                              struct netlink_ext_ack *extack)
{
        u32 nh_id = 0;

        if (attr) {
                nh_id = nla_get_u32(attr);
                if (!nh_id) {
                        NL_SET_ERR_MSG(extack, "Invalid nexthop id");
                        return -EINVAL;
                }
        }

        *nh_id_p = nh_id;

        return 0;
}

/* Parse and validate a nexthop bucket dump request.
 *
 * Fills @filter with the optional group ID (NHA_ID), the optional nexthop
 * ID nested in NHA_RES_BUCKET, and the fields handled by
 * __nh_valid_dump_req().
 *
 * Returns 0 on success or a negative errno.
 */
static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
                                    struct nh_dump_filter *filter,
                                    struct netlink_callback *cb)
{
        struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
        struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
        int err;

        /* Hand the extack to the parser so that attribute validation
         * failures are reported, as the nested parse below already does.
         */
        err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
                          ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
                          rtm_nh_policy_dump_bucket, cb->extack);
        if (err < 0)
                return err;

        err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
        if (err)
                return err;

        if (tb[NHA_RES_BUCKET]) {
                size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;

                err = nla_parse_nested(res_tb, max,
                                       tb[NHA_RES_BUCKET],
                                       rtm_nh_res_bucket_policy_dump,
                                       cb->extack);
                if (err < 0)
                        return err;

                err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
                                         &filter->res_bucket_nh_id,
                                         cb->extack);
                if (err)
                        return err;
        }

        return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
}

/* Nexthop bucket dump state, overlaid on the netlink callback's ctx area by
 * rtm_dump_res_bucket_ctx().
 */
struct rtm_dump_res_bucket_ctx {
        /* State of the walk over nexthops; handed to
         * rtm_dump_walk_nexthops() when no group ID filter is set.
         */
        struct rtm_dump_nh_ctx nh;
        /* Bucket at which rtm_dump_nexthop_bucket_nh() starts; reset to 0
         * once all buckets of a group were walked.
         */
        u16 bucket_index;
};

/* Return the bucket dump state stored in @cb's ctx area. The build fails if
 * the state structure does not fit into that area.
 */
static struct rtm_dump_res_bucket_ctx *
rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
{
        BUILD_BUG_ON(sizeof(struct rtm_dump_res_bucket_ctx) > sizeof(cb->ctx));

        return (void *)cb->ctx;
}

/* Per-invocation data passed to the bucket dump helpers. */
struct rtm_dump_nexthop_bucket_data {
        /* Dump state living in the netlink callback's ctx area. */
        struct rtm_dump_res_bucket_ctx *ctx;
        /* Filter filled in by nh_valid_dump_bucket_req(). */
        struct nh_dump_filter filter;
};

static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
                                      struct netlink_callback *cb,
                                      struct nexthop *nh,
                                      struct rtm_dump_nexthop_bucket_data *dd)
{
        u32 portid = NETLINK_CB(cb->skb).portid;
        struct nhmsg *nhm = nlmsg_data(cb->nlh);
        struct nh_res_table *res_table;
        struct nh_group *nhg;
        u16 bucket_index;
        int err;

        nhg = rtnl_dereference(nh->nh_grp);
        res_table = rtnl_dereference(nhg->res_table);
        for (bucket_index = dd->ctx->bucket_index;
             bucket_index < res_table->num_nh_buckets;
             bucket_index++) {
                struct nh_res_bucket *bucket;
                struct nh_grp_entry *nhge;

                bucket = &res_table->nh_buckets[bucket_index];
                nhge = rtnl_dereference(bucket->nh_entry);
                if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family))
                        continue;

                if (dd->filter.res_bucket_nh_id &&
                    dd->filter.res_bucket_nh_id != nhge->nh->id)
                        continue;

                dd->ctx->bucket_index = bucket_index;
                err = nh_fill_res_bucket(skb, nh, bucket, bucket_index,
                                         RTM_NEWNEXTHOPBUCKET, portid,
                                         cb->nlh->nlmsg_seq, NLM_F_MULTI,
                                         cb->extack);
                if (err)
                        return err;
        }

        dd->ctx->bucket_index = 0;

        return 0;
}

/* Walk callback for bucket dumps: dump the buckets of @nh when it is a
 * resilient group; any other nexthop is skipped with a return value of 0.
 */
static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
                                      struct netlink_callback *cb,
                                      struct nexthop *nh, void *data)
{
        struct nh_group *nhg;

        if (nh->is_group) {
                nhg = rtnl_dereference(nh->nh_grp);
                if (nhg->resilient)
                        return rtm_dump_nexthop_bucket_nh(skb, cb, nh, data);
        }

        return 0;
}

/* rtnl
 *
 * Dump handler for nexthop buckets. With a group ID in the request only
 * that resilient group is dumped; otherwise every nexthop in the namespace
 * is walked and the resilient groups among them are dumped.
 */
static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
                                   struct netlink_callback *cb)
{
        struct rtm_dump_nexthop_bucket_data dd = {
                .ctx = rtm_dump_res_bucket_ctx(cb),
        };
        struct net *net = sock_net(skb->sk);
        int err;

        err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
        if (err)
                return err;

        if (!dd.filter.nh_id) {
                err = rtm_dump_walk_nexthops(skb, cb, &net->nexthop.rb_root,
                                             &dd.ctx->nh,
                                             &rtm_dump_nexthop_bucket_cb, &dd);
        } else {
                struct nexthop *nh;

                nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
                                                  cb->extack);
                if (IS_ERR(nh))
                        return PTR_ERR(nh);

                err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
        }

        /* Done regardless of how the dump ended. */
        cb->seq = net->nexthop.seq;
        nl_dump_check_consistent(cb, nlmsg_hdr(skb));

        return err;
}

/* Parse the nested NHA_RES_BUCKET attribute of a bucket get request and
 * extract the mandatory bucket index into @bucket_index.
 *
 * Returns 0 on success, the parser's error, or -EINVAL with a message in
 * @extack when the index is missing.
 */
static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
                                              u16 *bucket_index,
                                              struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
        int rc;

        rc = nla_parse_nested(attrs,
                              ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
                              res, rtm_nh_res_bucket_policy_get, extack);
        if (rc < 0)
                return rc;

        if (attrs[NHA_RES_BUCKET_INDEX]) {
                *bucket_index = nla_get_u16(attrs[NHA_RES_BUCKET_INDEX]);
                return 0;
        }

        NL_SET_ERR_MSG(extack, "Bucket index is missing");
        return -EINVAL;
}

/* Parse and validate a nexthop bucket get request.
 *
 * On success @id holds the nexthop ID extracted by nh_valid_get_del_req()
 * and @bucket_index the index found inside NHA_RES_BUCKET.
 *
 * Returns 0 on success or a negative errno.
 */
static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
                                   u32 *id, u16 *bucket_index,
                                   struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
        int rc;

        rc = nlmsg_parse(nlh, sizeof(struct nhmsg), attrs,
                         ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
                         rtm_nh_policy_get_bucket, extack);
        if (rc < 0)
                return rc;

        rc = nh_valid_get_del_req(nlh, attrs, id, NULL, extack);
        if (rc)
                return rc;

        if (!attrs[NHA_RES_BUCKET]) {
                NL_SET_ERR_MSG(extack, "Bucket information is missing");
                return -EINVAL;
        }

        return nh_valid_get_bucket_req_res_bucket(attrs[NHA_RES_BUCKET],
                                                  bucket_index, extack);
}

/* rtnl
 *
 * Handler for a single nexthop bucket get request: look up the resilient
 * group and the requested bucket, build an RTM_NEWNEXTHOPBUCKET reply and
 * unicast it to the requester.
 *
 * Returns the result of rtnl_unicast() on success, -ENOENT when the group
 * is missing or the bucket index is out of range, -ENOBUFS when the reply
 * cannot be allocated, or another negative errno from validation or
 * message construction.
 */
static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                                  struct netlink_ext_ack *extack)
{
        u32 portid = NETLINK_CB(in_skb).portid;
        struct net *net = sock_net(in_skb->sk);
        struct nh_res_table *res_table;
        struct nh_group *nhg;
        struct sk_buff *reply;
        struct nexthop *nh;
        u16 bucket_index;
        int err;
        u32 id;

        err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
        if (err)
                return err;

        nh = nexthop_find_group_resilient(net, id, extack);
        if (IS_ERR(nh))
                return PTR_ERR(nh);

        nhg = rtnl_dereference(nh->nh_grp);
        res_table = rtnl_dereference(nhg->res_table);
        if (bucket_index >= res_table->num_nh_buckets) {
                NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
                return -ENOENT;
        }

        reply = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!reply)
                return -ENOBUFS;

        err = nh_fill_res_bucket(reply, nh,
                                 &res_table->nh_buckets[bucket_index],
                                 bucket_index, RTM_NEWNEXTHOPBUCKET,
                                 portid, nlh->nlmsg_seq, 0, extack);
        if (err >= 0)
                return rtnl_unicast(reply, net, portid);

        /* A single bucket is not expected to overflow the reply buffer. */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(reply);

        return err;
}

/* Propagate an MTU change of @dev to the nexthops using it: every AF_INET
 * nexthop found in @dev's hash chain whose device is @dev is passed to
 * fib_nhc_update_mtu() together with the device's current MTU and
 * @orig_mtu.
 */
static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
{
        struct net *net = dev_net(dev);
        struct hlist_head *chain;
        struct hlist_node *tmp;
        struct nh_info *nhi;

        chain = &net->nexthop.devhash[nh_dev_hashfn(dev->ifindex)];

        hlist_for_each_entry_safe(nhi, tmp, chain, dev_hash) {
                /* The chain may also hold nexthops of other devices. */
                if (nhi->fib_nhc.nhc_dev != dev)
                        continue;

                if (nhi->family != AF_INET)
                        continue;

                fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu);
        }
}

/* rtnl
 *
 * Netdevice notifier callback. Nexthops using the device are flushed when
 * it goes down, is unregistered, or changes state while neither
 * IFF_RUNNING nor IFF_LOWER_UP is set. An MTU change is synced to the
 * nexthops and followed by a route cache flush. Always returns NOTIFY_DONE.
 */
static int nh_netdev_event(struct notifier_block *this,
                           unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_DOWN || event == NETDEV_UNREGISTER) {
                nexthop_flush_dev(dev, event);
        } else if (event == NETDEV_CHANGE) {
                if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP)))
                        nexthop_flush_dev(dev, event);
        } else if (event == NETDEV_CHANGEMTU) {
                struct netdev_notifier_info_ext *info_ext = ptr;

                nexthop_sync_mtu(dev, info_ext->ext.mtu);
                rt_cache_flush(dev_net(dev));
        }

        return NOTIFY_DONE;
}

/* Netdevice notifier block dispatching to nh_netdev_event(); registered by
 * nexthop_init().
 */
static struct notifier_block nh_netdev_notifier = {
        .notifier_call = nh_netdev_event,
};

/* Replay every nexthop of @net to the single notifier @nb as an event of
 * type @event_type, in rb_first()/rb_next() order.
 *
 * Returns 0 when all nexthops were reported, or the first non-zero result
 * of call_nexthop_notifier(), in which case the replay stops.
 */
static int nexthops_dump(struct net *net, struct notifier_block *nb,
                         enum nexthop_event_type event_type,
                         struct netlink_ext_ack *extack)
{
        struct rb_node *node;

        for (node = rb_first(&net->nexthop.rb_root); node;
             node = rb_next(node)) {
                struct nexthop *nh = rb_entry(node, struct nexthop, rb_node);
                int err;

                err = call_nexthop_notifier(nb, net, event_type, nh, extack);
                if (err)
                        return err;
        }

        return 0;
}

/* Register @nb on @net's nexthop notifier chain.
 *
 * Under rtnl_lock() the existing nexthops are first replayed to @nb as
 * NEXTHOP_EVENT_REPLACE events; @nb is only added to the chain when that
 * replay succeeds.
 *
 * Returns 0 on success, or the error from the replay or the registration.
 */
int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
                              struct netlink_ext_ack *extack)
{
        int err;

        rtnl_lock();

        err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
        if (!err)
                err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
                                                       nb);

        rtnl_unlock();

        return err;
}
EXPORT_SYMBOL(register_nexthop_notifier);

/* Remove @nb from @net's nexthop notifier chain without taking rtnl_lock()
 * here; unregister_nexthop_notifier() is the variant that takes it.
 *
 * After a successful removal the existing nexthops are replayed to @nb as
 * NEXTHOP_EVENT_DEL events; the outcome of that replay is not reported.
 *
 * Returns the result of blocking_notifier_chain_unregister().
 */
int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
        int err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
                                                     nb);

        if (err)
                return err;

        nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);

        return 0;
}
EXPORT_SYMBOL(__unregister_nexthop_notifier);

/* Locked wrapper: run __unregister_nexthop_notifier() between rtnl_lock()
 * and rtnl_unlock() and return its result.
 */
int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
{
        int rc;

        rtnl_lock();
        rc = __unregister_nexthop_notifier(net, nb);
        rtnl_unlock();

        return rc;
}
EXPORT_SYMBOL(unregister_nexthop_notifier);

/* Set the RTNH_F_OFFLOAD and RTNH_F_TRAP flags of nexthop @id in @net
 * according to @offload and @trap; both flags are cleared first. Nothing
 * happens when the nexthop does not exist. The lookup and update run inside
 * an RCU read-side critical section.
 */
void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
{
        struct nexthop *nh;

        rcu_read_lock();

        nh = nexthop_find_by_id(net, id);
        if (nh) {
                nh->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
                if (offload)
                        nh->nh_flags |= RTNH_F_OFFLOAD;
                if (trap)
                        nh->nh_flags |= RTNH_F_TRAP;
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_set_hw_flags);

/* Set the RTNH_F_OFFLOAD and RTNH_F_TRAP flags of bucket @bucket_index of
 * the resilient nexthop group @id in @net according to @offload and @trap;
 * both flags are cleared first.
 *
 * Nothing happens when the nexthop does not exist, is not a resilient
 * group, or @bucket_index is outside its bucket table. The lookup and
 * update run inside an RCU read-side critical section.
 */
void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
                                 bool offload, bool trap)
{
        struct nh_res_table *res_table;
        struct nh_res_bucket *bucket;
        struct nexthop *nexthop;
        struct nh_group *nhg;

        rcu_read_lock();

        nexthop = nexthop_find_by_id(net, id);
        if (!nexthop || !nexthop->is_group)
                goto out;

        nhg = rcu_dereference(nexthop->nh_grp);
        if (!nhg->resilient)
                goto out;

        /* Fetch the table once through rcu_dereference() so that the bounds
         * check and the bucket access below use the same table.
         */
        res_table = rcu_dereference(nhg->res_table);
        if (bucket_index >= res_table->num_nh_buckets)
                goto out;

        bucket = &res_table->nh_buckets[bucket_index];
        bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
        if (offload)
                bucket->nh_flags |= RTNH_F_OFFLOAD;
        if (trap)
                bucket->nh_flags |= RTNH_F_TRAP;

out:
        rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);

/* Mark buckets of the resilient nexthop group @id in @net as busy.
 *
 * @activity is a bitmap with one bit per bucket; each bucket whose bit is
 * set is passed to nh_res_bucket_set_busy(). Nothing happens when the
 * nexthop does not exist, is not a resilient group, or @num_buckets differs
 * from the size of the group's bucket table.
 */
void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
                                     unsigned long *activity)
{
        struct nh_res_table *res_table;
        struct nexthop *nh;
        struct nh_group *nhg;
        u16 bit;

        rcu_read_lock();

        nh = nexthop_find_by_id(net, id);
        if (!nh || !nh->is_group)
                goto unlock;

        nhg = rcu_dereference(nh->nh_grp);
        if (!nhg->resilient)
                goto unlock;

        /* A size mismatch is rejected outright rather than updating only
         * some of the buckets.
         */
        res_table = rcu_dereference(nhg->res_table);
        if (res_table->num_nh_buckets != num_buckets)
                goto unlock;

        for (bit = 0; bit < num_buckets; bit++) {
                if (!test_bit(bit, activity))
                        continue;

                nh_res_bucket_set_busy(&res_table->nh_buckets[bit]);
        }

unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL(nexthop_res_grp_activity_update);

/* Batched namespace exit hook: flush all nexthops of every namespace on
 * @net_list. RTNL must be held (asserted below). @dev_to_kill is not used
 * by this function.
 */
static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list,
                                                   struct list_head *dev_to_kill)
{
        struct net *net;

        ASSERT_RTNL();
        list_for_each_entry(net, net_list, exit_list)
                flush_all_nexthops(net);
}

/* Namespace exit hook: free the device hash table allocated by
 * nexthop_net_init() and clear the pointer to it.
 */
static void __net_exit nexthop_net_exit(struct net *net)
{
        kfree(net->nexthop.devhash);
        net->nexthop.devhash = NULL;
}

/* Namespace init hook: start with an empty nexthop tree, allocate the
 * zeroed device hash table of NH_DEV_HASHSIZE heads and initialize the
 * nexthop notifier chain.
 *
 * Returns 0 on success or -ENOMEM when the hash table cannot be allocated;
 * in that case the notifier chain is left uninitialized.
 */
static int __net_init nexthop_net_init(struct net *net)
{
        net->nexthop.rb_root = RB_ROOT;

        /* kcalloc() performs the count * size multiplication itself instead
         * of relying on an open-coded product, and returns zeroed memory.
         */
        net->nexthop.devhash = kcalloc(NH_DEV_HASHSIZE,
                                       sizeof(struct hlist_head),
                                       GFP_KERNEL);
        if (!net->nexthop.devhash)
                return -ENOMEM;

        BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);

        return 0;
}

/* Per-namespace operations: init sets up the nexthop state, exit frees the
 * device hash table, and exit_batch_rtnl flushes the nexthops of exiting
 * namespaces. Registered by nexthop_init().
 */
static struct pernet_operations nexthop_net_ops = {
        .init = nexthop_net_init,
        .exit = nexthop_net_exit,
        .exit_batch_rtnl = nexthop_net_exit_batch_rtnl,
};

/* rtnetlink handlers registered by nexthop_init().
 *
 * The first four entries leave .protocol unset and cover new/del/get for
 * nexthops plus get for nexthop buckets. The remaining entries repeat the
 * RTM_NEWNEXTHOP doit and the RTM_GETNEXTHOP dumpit handlers for PF_INET
 * and PF_INET6.
 */
static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
        {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop,
         .flags = RTNL_FLAG_DOIT_PERNET},
        {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop,
         .flags = RTNL_FLAG_DOIT_PERNET},
        {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop,
         .dumpit = rtm_dump_nexthop},
        {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket,
         .dumpit = rtm_dump_nexthop_bucket},
        {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP,
         .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
        {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP,
         .dumpit = rtm_dump_nexthop},
        {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP,
         .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
        {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP,
         .dumpit = rtm_dump_nexthop},
};

/* Subsystem init: register the per-namespace operations, the netdevice
 * notifier and the rtnetlink message handlers, in that order. Always
 * returns 0.
 *
 * NOTE(review): the results of the three registration calls are not
 * checked here; confirm that their failures can safely be ignored.
 */
static int __init nexthop_init(void)
{
        register_pernet_subsys(&nexthop_net_ops);

        register_netdevice_notifier(&nh_netdev_notifier);

        rtnl_register_many(nexthop_rtnl_msg_handlers);

        return 0;
}
subsys_initcall(nexthop_init);






























































































































































































































































  147 


  147 











































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   49 











   48 




































































  170 



  171 





















































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/locks.c
 *
 * We implement four types of file locks: BSD locks, posix locks, open
 * file description locks, and leases.  For details about BSD locks,
 * see the flock(2) man page; for details about the other three, see
 * fcntl(2).
 *
 *
 * Locking conflicts and dependencies:
 * If multiple threads attempt to lock the same byte (or flock the same file)
 * only one can be granted the lock, and the others must wait their turn.
 * The first lock has been "applied" or "granted", the others are "waiting"
 * and are "blocked" by the "applied" lock.
 *
 * Waiting and applied locks are all kept in trees whose properties are:
 *
 *        - the root of a tree may be an applied or waiting lock.
 *        - every other node in the tree is a waiting lock that
 *          conflicts with every ancestor of that node.
 *
 * Every such tree begins life as a waiting singleton which obviously
 * satisfies the above properties.
 *
 * The only ways we modify trees preserve these properties:
 *
 *        1. We may add a new leaf node, but only after first verifying that it
 *           conflicts with all of its ancestors.
 *        2. We may remove the root of a tree, creating a new singleton
 *           tree from the root and N new trees rooted in the immediate
 *           children.
 *        3. If the root of a tree is not currently an applied lock, we may
 *           apply it (if possible).
 *        4. We may upgrade the root of the tree (either extend its range,
 *           or upgrade its entire range from read to write).
 *
 * When an applied lock is modified in a way that reduces or downgrades any
 * part of its range, we remove all its children (2 above).  This particularly
 * happens when a lock is unlocked.
 *
 * For each of those child trees we "wake up" the thread which is
 * waiting for the lock so it can continue handling as follows: if the
 * root of the tree applies, we do so (3).  If it doesn't, it must
 * conflict with some applied lock.  We remove (wake up) all of its children
 * (2), and add it as a new leaf to the tree rooted in the applied
 * lock (1).  We then repeat the process recursively with those
 * children.
 *
 */
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/filelock.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/time.h>
#include <linux/rcupdate.h>
#include <linux/pid_namespace.h>
#include <linux/hashtable.h>
#include <linux/percpu.h>
#include <linux/sysctl.h>

#define CREATE_TRACE_POINTS
#include <trace/events/filelock.h>

#include <linux/uaccess.h>

/* Convert a pointer to the embedded core ("c" member) into its file_lock. */
static struct file_lock *file_lock(struct file_lock_core *flc)
{
        struct file_lock *fl = container_of(flc, struct file_lock, c);

        return fl;
}

/* Convert a pointer to the embedded core ("c" member) into its file_lease. */
static struct file_lease *file_lease(struct file_lock_core *flc)
{
        struct file_lease *lease = container_of(flc, struct file_lease, c);

        return lease;
}

/* Report whether an unlock or a downgrade is still pending on this lease. */
static bool lease_breaking(struct file_lease *fl)
{
        return (fl->c.flc_flags &
                (FL_DOWNGRADE_PENDING | FL_UNLOCK_PENDING)) != 0;
}

static int target_leasetype(struct file_lease *fl)
{
        if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                return F_UNLCK;
        if (fl->c.flc_flags & FL_DOWNGRADE_PENDING)
                return F_RDLCK;
        return fl->c.flc_type;
}

static int leases_enable = 1;
static int lease_break_time = 45;

#ifdef CONFIG_SYSCTL
static const struct ctl_table locks_sysctls[] = {
        {
                .procname        = "leases-enable",
                .data                = &leases_enable,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_MMU
        {
                .procname        = "lease-break-time",
                .data                = &lease_break_time,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif /* CONFIG_MMU */
};

static int __init init_fs_locks_sysctls(void)
{
        register_sysctl_init("fs", locks_sysctls);
        return 0;
}
early_initcall(init_fs_locks_sysctls);
#endif /* CONFIG_SYSCTL */

/*
 * The global file_lock_list is only used for displaying /proc/locks, so we
 * keep a list on each CPU, with each list protected by its own spinlock.
 * Global serialization is done using file_rwsem.
 *
 * Note that alterations to the list also require that the relevant flc_lock is
 * held.
 */
/* One instance per CPU: a list of locks plus the spinlock guarding it. */
struct file_lock_list_struct {
        spinlock_t                lock;        /* taken around every hlist update */
        struct hlist_head        hlist;        /* locks linked via flc_link */
};
static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list);
/* Asserted held by the helpers that add to / remove from the per-CPU lists. */
DEFINE_STATIC_PERCPU_RWSEM(file_rwsem);


/*
 * The blocked_hash is used to find POSIX lock loops for deadlock detection.
 * It is protected by blocked_lock_lock.
 *
 * We hash locks by lockowner in order to optimize searching for the lock a
 * particular lockowner is waiting on.
 *
 * FIXME: make this value scale via some heuristic? We generally will want more
 * buckets when we have more lockowners holding locks, but that's a little
 * difficult to determine without knowing what the workload will look like.
 */
/* Number of hash bits handed to DEFINE_HASHTABLE() for blocked_hash. */
#define BLOCKED_HASH_BITS        7
/* Blocked requests, keyed by posix_owner_key(); see locks_insert_global_blocked(). */
static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);

/*
 * This lock protects the blocked_hash. Generally, if you're accessing it, you
 * want to be holding this lock.
 *
 * In addition, it also protects the flc_blocked_requests list, and the
 * flc_blocker pointer for lock structures that are acting as lock requests
 * (in contrast to those that are acting as records of acquired locks).
 *
 * Note that when we acquire this lock in order to change the above fields,
 * we often hold the flc_lock as well. In certain cases, when reading the fields
 * protected by this lock, we can skip acquiring it iff we already hold the
 * flc_lock.
 */
static DEFINE_SPINLOCK(blocked_lock_lock);

/* Slab cache for struct file_lock_context (see locks_get_lock_context()). */
static struct kmem_cache *flctx_cache __ro_after_init;
/* Slab cache for struct file_lock (see locks_alloc_lock()). */
static struct kmem_cache *filelock_cache __ro_after_init;
/* Slab cache for struct file_lease (see locks_alloc_lease()). */
static struct kmem_cache *filelease_cache __ro_after_init;

/*
 * Return the file_lock_context attached to @inode, allocating and installing
 * one first if the inode has none yet.
 *
 * No allocation is attempted when @type is F_UNLCK; in that case the current
 * (possibly NULL) context is returned as-is. NULL is also returned when the
 * allocation fails. If another task installs a context concurrently, the
 * freshly allocated one is freed and the installed one is returned instead.
 */
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
        struct file_lock_context *ctx;

        /* paired with cmpxchg() below */
        ctx = locks_inode_context(inode);
        if (likely(ctx) || type == F_UNLCK)
                goto out;

        ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
        if (!ctx)
                goto out;

        /* Fully initialise the context before it can become visible. */
        spin_lock_init(&ctx->flc_lock);
        INIT_LIST_HEAD(&ctx->flc_flock);
        INIT_LIST_HEAD(&ctx->flc_posix);
        INIT_LIST_HEAD(&ctx->flc_lease);

        /*
         * Assign the pointer if it's not already assigned. If it is, then
         * free the context we just allocated.
         */
        if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
                kmem_cache_free(flctx_cache, ctx);
                ctx = locks_inode_context(inode);
        }
out:
        /* Traced on every path, including the ones that return NULL. */
        trace_locks_get_lock_context(inode, type, ctx);
        return ctx;
}

static void
locks_dump_ctx_list(struct list_head *list, char *list_type)
{
        struct file_lock_core *flc;

        list_for_each_entry(flc, list, flc_list)
                pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                        list_type, flc->flc_owner, flc->flc_flags,
                        flc->flc_type, flc->flc_pid);
}

/*
 * Warn about, and dump, any locks still present on the three lists of the
 * inode's lock context. Does nothing when all lists are empty.
 */
static void
locks_check_ctx_lists(struct inode *inode)
{
        struct file_lock_context *ctx = inode->i_flctx;

        /* Expected case: nothing is left on any list. */
        if (likely(list_empty(&ctx->flc_flock) &&
                   list_empty(&ctx->flc_posix) &&
                   list_empty(&ctx->flc_lease)))
                return;

        pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
                MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
                inode->i_ino);

        locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
        locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
        locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
}

static void
locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type)
{
        struct file_lock_core *flc;
        struct inode *inode = file_inode(filp);

        list_for_each_entry(flc, list, flc_list)
                if (flc->flc_file == filp)
                        pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
                                " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
                                list_type, MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino,
                                flc->flc_owner, flc->flc_flags,
                                flc->flc_type, flc->flc_pid);
}

/*
 * Free the lock context attached to @inode, if there is one, after warning
 * about any locks that are still linked into it.
 */
void
locks_free_lock_context(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);

        /* Most inodes never had a context allocated. */
        if (likely(!ctx))
                return;

        locks_check_ctx_lists(inode);
        kmem_cache_free(flctx_cache, ctx);
}

/* Initialise the list linkage and the wait queue embedded in @flc. */
static void locks_init_lock_heads(struct file_lock_core *flc)
{
        init_waitqueue_head(&flc->flc_wait);

        INIT_LIST_HEAD(&flc->flc_list);
        INIT_LIST_HEAD(&flc->flc_blocked_requests);
        INIT_LIST_HEAD(&flc->flc_blocked_member);
        INIT_HLIST_NODE(&flc->flc_link);
}

/* Allocate an empty lock structure. */
struct file_lock *locks_alloc_lock(void)
{
        struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(&fl->c);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lock);

/* Allocate an empty lock structure. */
struct file_lease *locks_alloc_lease(void)
{
        struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL);

        if (fl)
                locks_init_lock_heads(&fl->c);

        return fl;
}
EXPORT_SYMBOL_GPL(locks_alloc_lease);

void locks_release_private(struct file_lock *fl)
{
        struct file_lock_core *flc = &fl->c;

        BUG_ON(waitqueue_active(&flc->flc_wait));
        BUG_ON(!list_empty(&flc->flc_list));
        BUG_ON(!list_empty(&flc->flc_blocked_requests));
        BUG_ON(!list_empty(&flc->flc_blocked_member));
        BUG_ON(!hlist_unhashed(&flc->flc_link));

        if (fl->fl_ops) {
                if (fl->fl_ops->fl_release_private)
                        fl->fl_ops->fl_release_private(fl);
                fl->fl_ops = NULL;
        }

        if (fl->fl_lmops) {
                if (fl->fl_lmops->lm_put_owner) {
                        fl->fl_lmops->lm_put_owner(flc->flc_owner);
                        flc->flc_owner = NULL;
                }
                fl->fl_lmops = NULL;
        }
}
EXPORT_SYMBOL_GPL(locks_release_private);

/**
 * locks_owner_has_blockers - Check for blocking lock requests
 * @flctx: file lock context
 * @owner: lock owner
 *
 * Walks the POSIX lock list of @flctx under flc_lock and looks for a lock
 * owned by @owner that has a non-empty list of blocked requests.
 *
 * Return values:
 *   %true: at least one lock held by @owner has requests blocked on it
 *   %false: no lock held by @owner has requests blocked on it
 */
bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
{
        struct file_lock_core *flc;
        bool blocked = false;

        spin_lock(&flctx->flc_lock);
        list_for_each_entry(flc, &flctx->flc_posix, flc_list) {
                if (flc->flc_owner == owner &&
                    !list_empty(&flc->flc_blocked_requests)) {
                        blocked = true;
                        break;
                }
        }
        spin_unlock(&flctx->flc_lock);

        return blocked;
}
EXPORT_SYMBOL_GPL(locks_owner_has_blockers);

/*
 * Free a lock which is not in use.
 *
 * Private state is released first via locks_release_private(), which BUGs if
 * the lock is still linked on any list or has waiters; the structure is then
 * returned to filelock_cache.
 */
void locks_free_lock(struct file_lock *fl)
{
        locks_release_private(fl);
        kmem_cache_free(filelock_cache, fl);
}
EXPORT_SYMBOL(locks_free_lock);

/*
 * Free a lease which is not in use.
 *
 * The structure is returned to filelease_cache as-is; no release hooks are
 * invoked here.
 */
void locks_free_lease(struct file_lease *fl)
{
        kmem_cache_free(filelease_cache, fl);
}
EXPORT_SYMBOL(locks_free_lease);

/*
 * Empty @dispose, freeing each entry with the routine that matches its kind:
 * entries flagged FL_LEASE, FL_DELEG or FL_LAYOUT are freed as leases, all
 * others as file locks.
 */
static void
locks_dispose_list(struct list_head *dispose)
{
        struct file_lock_core *victim;

        while (!list_empty(dispose)) {
                victim = list_first_entry(dispose, struct file_lock_core, flc_list);
                list_del_init(&victim->flc_list);

                if (!(victim->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))) {
                        locks_free_lock(file_lock(victim));
                        continue;
                }
                locks_free_lease(file_lease(victim));
        }
}

/* Zero a caller-provided file_lock and initialise its list heads. */
void locks_init_lock(struct file_lock *fl)
{
        memset(fl, 0, sizeof(*fl));
        locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lock);

/* Zero a caller-provided file_lease and initialise its list heads. */
void locks_init_lease(struct file_lease *fl)
{
        memset(fl, 0, sizeof(struct file_lease));
        locks_init_lock_heads(&fl->c);
}
EXPORT_SYMBOL(locks_init_lease);

/*
 * Initialize a new lock from an existing file_lock structure.
 */
void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
        new->c.flc_owner = fl->c.flc_owner;
        new->c.flc_pid = fl->c.flc_pid;
        new->c.flc_file = NULL;
        new->c.flc_flags = fl->c.flc_flags;
        new->c.flc_type = fl->c.flc_type;
        new->fl_start = fl->fl_start;
        new->fl_end = fl->fl_end;
        new->fl_lmops = fl->fl_lmops;
        new->fl_ops = NULL;

        if (fl->fl_lmops) {
                if (fl->fl_lmops->lm_get_owner)
                        fl->fl_lmops->lm_get_owner(fl->c.flc_owner);
        }
}
EXPORT_SYMBOL(locks_copy_conflock);

/*
 * Make @new a full copy of @fl: everything locks_copy_conflock() copies, plus
 * the file pointer and fl_ops. The filesystem's fl_copy_lock hook, if any, is
 * then given the chance to copy its private state.
 */
void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
        /* "new" must be a freshly-initialized lock */
        WARN_ON_ONCE(new->fl_ops);

        locks_copy_conflock(new, fl);

        /* The conflock copy cleared these two; restore them from the source. */
        new->c.flc_file = fl->c.flc_file;
        new->fl_ops = fl->fl_ops;

        if (fl->fl_ops && fl->fl_ops->fl_copy_lock)
                fl->fl_ops->fl_copy_lock(new, fl);
}
EXPORT_SYMBOL(locks_copy_lock);

static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
{
        struct file_lock *f;

        /*
         * As ctx->flc_lock is held, new requests cannot be added to
         * ->flc_blocked_requests, so we don't need a lock to check if it
         * is empty.
         */
        if (list_empty(&fl->c.flc_blocked_requests))
                return;
        spin_lock(&blocked_lock_lock);
        list_splice_init(&fl->c.flc_blocked_requests,
                         &new->c.flc_blocked_requests);
        list_for_each_entry(f, &new->c.flc_blocked_requests,
                            c.flc_blocked_member)
                f->c.flc_blocker = &new->c;
        spin_unlock(&blocked_lock_lock);
}

/*
 * Map a flock() operation (LOCK_SH, LOCK_EX or LOCK_UN) onto the matching
 * F_RDLCK / F_WRLCK / F_UNLCK lock type. Any other value yields -EINVAL.
 */
static inline int flock_translate_cmd(int cmd)
{
        if (cmd == LOCK_SH)
                return F_RDLCK;
        if (cmd == LOCK_EX)
                return F_WRLCK;
        if (cmd == LOCK_UN)
                return F_UNLCK;

        return -EINVAL;
}

/* Fill in a file_lock structure with an appropriate FLOCK lock. */
static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
{
        locks_init_lock(fl);

        fl->c.flc_file = filp;
        fl->c.flc_owner = filp;
        fl->c.flc_pid = current->tgid;
        fl->c.flc_flags = FL_FLOCK;
        fl->c.flc_type = type;
        fl->fl_end = OFFSET_MAX;
}

/*
 * Store @type in @flc if it is one of F_RDLCK, F_WRLCK or F_UNLCK.
 * Returns 0 on success; -EINVAL (leaving @flc untouched) otherwise.
 */
static int assign_type(struct file_lock_core *flc, int type)
{
        if (type != F_RDLCK && type != F_WRLCK && type != F_UNLCK)
                return -EINVAL;

        flc->flc_type = type;
        return 0;
}

/*
 * Verify a "struct flock64" and translate it into a POSIX-style lock
 * description in @fl.
 *
 * The start offset is resolved relative to l_whence (file start, current file
 * position, or current file size), and l_len selects the range:
 *   l_len > 0:  [start, start + l_len - 1]
 *   l_len < 0:  [start + l_len, start - 1]
 *   l_len == 0: [start, OFFSET_MAX]
 *
 * Returns 0 on success, -EINVAL for an unknown l_whence, a negative resulting
 * offset or an unknown l_type, and -EOVERFLOW when the range does not fit
 * below OFFSET_MAX. Note that @fl may have been partially written even when
 * an error is returned.
 */
static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
                                 struct flock64 *l)
{
        /* Establish the base offset that l_start is relative to. */
        switch (l->l_whence) {
        case SEEK_SET:
                fl->fl_start = 0;
                break;
        case SEEK_CUR:
                fl->fl_start = filp->f_pos;
                break;
        case SEEK_END:
                fl->fl_start = i_size_read(file_inode(filp));
                break;
        default:
                return -EINVAL;
        }
        /* Check before adding so the sum cannot exceed OFFSET_MAX. */
        if (l->l_start > OFFSET_MAX - fl->fl_start)
                return -EOVERFLOW;
        fl->fl_start += l->l_start;
        if (fl->fl_start < 0)
                return -EINVAL;

        /* POSIX-1996 leaves the case l->l_len < 0 undefined;
           POSIX-2001 defines it. */
        if (l->l_len > 0) {
                /* Same pre-check as above, this time for the last byte. */
                if (l->l_len - 1 > OFFSET_MAX - fl->fl_start)
                        return -EOVERFLOW;
                fl->fl_end = fl->fl_start + (l->l_len - 1);

        } else if (l->l_len < 0) {
                /* Negative length: the range ends just before the start. */
                if (fl->fl_start + l->l_len < 0)
                        return -EINVAL;
                fl->fl_end = fl->fl_start - 1;
                fl->fl_start += l->l_len;
        } else
                fl->fl_end = OFFSET_MAX;

        /* The owner is the caller's files_struct; the pid is its tgid. */
        fl->c.flc_owner = current->files;
        fl->c.flc_pid = current->tgid;
        fl->c.flc_file = filp;
        fl->c.flc_flags = FL_POSIX;
        fl->fl_ops = NULL;
        fl->fl_lmops = NULL;

        /* Validates l_type last; -EINVAL if it is not RDLCK/WRLCK/UNLCK. */
        return assign_type(&fl->c, l->l_type);
}

/* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
 * style lock.
 */
static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
                               struct flock *l)
{
        struct flock64 ll = {
                .l_type = l->l_type,
                .l_whence = l->l_whence,
                .l_start = l->l_start,
                .l_len = l->l_len,
        };

        return flock64_to_posix_lock(filp, fl, &ll);
}

/* default lease lock manager operations */
/*
 * Default lm_break handler: signal the lease's fasync list with
 * SIGIO/POLL_MSG. Always returns false.
 * NOTE(review): the meaning of the return value is defined by the caller of
 * lm_break, which is outside this chunk -- confirm there.
 */
static bool
lease_break_callback(struct file_lease *fl)
{
        kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
        return false;
}

/*
 * Default lm_setup handler: hook the fasync entry passed via @priv into the
 * lease's fasync list and make the calling thread group the owner of the
 * file for signal delivery.
 */
static void
lease_setup(struct file_lease *fl, void **priv)
{
        struct fasync_struct *fa = *priv;
        struct file *filp = fl->c.flc_file;
        struct fasync_struct *old;

        /*
         * fasync_insert_entry() hands back the previous entry when there was
         * one. A NULL return means "fa" itself went onto the list, so the
         * caller must not free it: signal that by clearing *priv.
         */
        old = fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa);
        if (old == NULL)
                *priv = NULL;

        __f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
}

/*
 * Lease manager callbacks installed by lease_init(). lease_modify is defined
 * outside this part of the file.
 */
static const struct lease_manager_operations lease_manager_ops = {
        .lm_break = lease_break_callback,
        .lm_change = lease_modify,
        .lm_setup = lease_setup,
};

/*
 * Initialize a lease, use the default lock manager operations
 */
static int lease_init(struct file *filp, int type, struct file_lease *fl)
{
        if (assign_type(&fl->c, type) != 0)
                return -EINVAL;

        fl->c.flc_owner = filp;
        fl->c.flc_pid = current->tgid;

        fl->c.flc_file = filp;
        fl->c.flc_flags = FL_LEASE;
        fl->fl_lmops = &lease_manager_ops;
        return 0;
}

/*
 * Allocate a file_lease initialised to this type of lease. Returns the new
 * lease, ERR_PTR(-ENOMEM) when the allocation fails, or the lease_init()
 * error wrapped in ERR_PTR() (the lease is freed in that case).
 */
static struct file_lease *lease_alloc(struct file *filp, int type)
{
        struct file_lease *lease;
        int err;

        lease = locks_alloc_lease();
        if (!lease)
                return ERR_PTR(-ENOMEM);

        err = lease_init(filp, type, lease);
        if (!err)
                return lease;

        locks_free_lease(lease);
        return ERR_PTR(err);
}

/*
 * Check if the byte ranges of two locks overlap each other. Both ends of each
 * range are inclusive. Returns 1 on overlap, 0 otherwise.
 */
static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
{
        /* fl1 ends before fl2 begins. */
        if (fl1->fl_end < fl2->fl_start)
                return 0;

        return fl2->fl_end >= fl1->fl_start;
}

/*
 * Check whether two locks have the same owner.
 *
 * Ownership is compared by flc_owner pointer identity only. Returns non-zero
 * when the owners match, 0 otherwise.
 */
static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2)
{
        return fl1->flc_owner == fl2->flc_owner;
}

/*
 * Link @flc onto the current CPU's file_lock_list and remember that CPU in
 * flc_link_cpu so the entry can later be removed from the right list.
 *
 * Must be called with the flc_lock held!
 */
static void locks_insert_global_locks(struct file_lock_core *flc)
{
        struct file_lock_list_struct *fll;

        percpu_rwsem_assert_held(&file_rwsem);

        fll = this_cpu_ptr(&file_lock_list);

        spin_lock(&fll->lock);
        flc->flc_link_cpu = smp_processor_id();
        hlist_add_head(&flc->flc_link, &fll->hlist);
        spin_unlock(&fll->lock);
}

/*
 * Unlink @flc from the per-CPU file_lock_list it was inserted on, if it is
 * on one at all.
 *
 * Must be called with the flc_lock held!
 */
static void locks_delete_global_locks(struct file_lock_core *flc)
{
        struct file_lock_list_struct *fll;

        percpu_rwsem_assert_held(&file_rwsem);

        /*
         * Only take the per-CPU list lock when the entry is actually hashed.
         * Testing without that lock is safe: the check runs under the
         * flc_lock, and new insertions into the list also require that it be
         * held.
         */
        if (!hlist_unhashed(&flc->flc_link)) {
                fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);

                spin_lock(&fll->lock);
                hlist_del_init(&flc->flc_link);
                spin_unlock(&fll->lock);
        }
}

/* Hash key for blocked_hash: the owner pointer reinterpreted as an integer. */
static unsigned long
posix_owner_key(struct file_lock_core *flc)
{
        return (unsigned long) flc->flc_owner;
}

/*
 * Add @waiter to blocked_hash, keyed by its owner.
 * Caller must hold blocked_lock_lock (checked by lockdep).
 */
static void locks_insert_global_blocked(struct file_lock_core *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);

        hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter));
}

/*
 * Remove @waiter from blocked_hash.
 * Caller must hold blocked_lock_lock (checked by lockdep).
 */
static void locks_delete_global_blocked(struct file_lock_core *waiter)
{
        lockdep_assert_held(&blocked_lock_lock);

        hash_del(&waiter->flc_link);
}

/* Remove waiter from blocker's block list.
 * When blocker ends up pointing to itself then the list is empty.
 *
 * The waiter is also dropped from blocked_hash. Its flc_blocker pointer is
 * NOT cleared here; callers do that themselves once they are done with it.
 *
 * Must be called with blocked_lock_lock held.
 */
static void __locks_unlink_block(struct file_lock_core *waiter)
{
        locks_delete_global_blocked(waiter);
        list_del_init(&waiter->flc_blocked_member);
}

/*
 * Detach every request queued on @blocker's flc_blocked_requests list and
 * notify it: POSIX/FLOCK requests whose lock manager supplies lm_notify get
 * that callback, everything else goes through locks_wake_up().
 *
 * The callers in this file all hold blocked_lock_lock.
 */
static void __locks_wake_up_blocks(struct file_lock_core *blocker)
{
        while (!list_empty(&blocker->flc_blocked_requests)) {
                struct file_lock_core *waiter;
                struct file_lock *fl;

                waiter = list_first_entry(&blocker->flc_blocked_requests,
                                          struct file_lock_core, flc_blocked_member);

                fl = file_lock(waiter);
                /* Unlink first so the loop makes progress on the next pass. */
                __locks_unlink_block(waiter);
                if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) &&
                    fl->fl_lmops && fl->fl_lmops->lm_notify)
                        fl->fl_lmops->lm_notify(fl);
                else
                        locks_wake_up(fl);

                /*
                 * The setting of flc_blocker to NULL marks the "done"
                 * point in deleting a block. Paired with acquire at the top
                 * of locks_delete_block().
                 */
                smp_store_release(&waiter->flc_blocker, NULL);
        }
}

/*
 * Stop @waiter from waiting: wake everything blocked on it, unlink it from
 * its blocker and clear its flc_blocker pointer.
 *
 * Returns 0 if @waiter was still blocked (flc_blocker was set) when the
 * blocked_lock_lock was taken, -ENOENT otherwise.
 */
static int __locks_delete_block(struct file_lock_core *waiter)
{
        int status = -ENOENT;

        /*
         * If flc_blocker is NULL, it won't be set again as this thread "owns"
         * the lock and is the only one that might try to claim the lock.
         *
         * We use acquire/release to manage flc_blocker so that we can
         * optimize away taking the blocked_lock_lock in many cases.
         *
         * The smp_load_acquire guarantees two things:
         *
         * 1/ that flc_blocked_requests can be tested locklessly. If something
         * was recently added to that list it must have been in a locked region
         * *before* the locked region when flc_blocker was set to NULL.
         *
         * 2/ that no other thread is accessing 'waiter', so it is safe to free
         * it.  __locks_wake_up_blocks is careful not to touch waiter after
         * flc_blocker is released.
         *
         * If a lockless check of flc_blocker shows it to be NULL, we know that
         * no new locks can be inserted into its flc_blocked_requests list, and
         * can avoid doing anything further if the list is empty.
         */
        if (!smp_load_acquire(&waiter->flc_blocker) &&
            list_empty(&waiter->flc_blocked_requests))
                return status;

        spin_lock(&blocked_lock_lock);
        /* Re-check under the lock: only now is the answer stable. */
        if (waiter->flc_blocker)
                status = 0;
        __locks_wake_up_blocks(waiter);
        __locks_unlink_block(waiter);

        /*
         * The setting of flc_blocker to NULL marks the "done" point in deleting
         * a block. Paired with acquire at the top of this function.
         */
        smp_store_release(&waiter->flc_blocker, NULL);
        spin_unlock(&blocked_lock_lock);
        return status;
}

/**
 *        locks_delete_block - stop waiting for a file lock
 *        @waiter: the lock which was waiting
 *
 *        lockd/nfsd need to disconnect the lock while working on it.
 *
 *        Return: 0 if @waiter was still blocked on another lock, -ENOENT if
 *        it was not.
 */
int locks_delete_block(struct file_lock *waiter)
{
        return __locks_delete_block(&waiter->c);
}
EXPORT_SYMBOL(locks_delete_block);

/* Insert waiter into blocker's block list.
 * We use a circular list so that processes can be easily woken up in
 * the order they blocked. The documentation doesn't require this but
 * it seems like the reasonable thing to do.
 *
 * Must be called with both the flc_lock and blocked_lock_lock held. The
 * flc_blocked_requests list itself is protected by the blocked_lock_lock,
 * but by ensuring that the flc_lock is also held on insertions we can avoid
 * taking the blocked_lock_lock in some cases when we see that the
 * flc_blocked_requests list is empty.
 *
 * Rather than just adding to the list, we check for conflicts with any existing
 * waiters, and add beneath any waiter that blocks the new waiter.
 * Thus wakeups don't happen until needed.
 *
 * @conflict is called as conflict(existing_waiter, waiter) and returns true
 * when the existing waiter blocks the new one.
 */
static void __locks_insert_block(struct file_lock_core *blocker,
                                 struct file_lock_core *waiter,
                                 bool conflict(struct file_lock_core *,
                                               struct file_lock_core *))
{
        struct file_lock_core *flc;

        BUG_ON(!list_empty(&waiter->flc_blocked_member));
new_blocker:
        /*
         * Descend: as soon as one of blocker's waiters conflicts with the new
         * waiter, restart the scan with that waiter as the blocker.
         */
        list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member)
                if (conflict(flc, waiter)) {
                        blocker =  flc;
                        goto new_blocker;
                }
        waiter->flc_blocker = blocker;
        list_add_tail(&waiter->flc_blocked_member,
                      &blocker->flc_blocked_requests);

        /* Only FL_POSIX blockers without FL_OFDLCK go into blocked_hash. */
        if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX)
                locks_insert_global_blocked(waiter);

        /* The requests in waiter->flc_blocked_requests are known to conflict
         * with waiter, but might not conflict with blocker, or the requests
         * and lock which block it.  So they all need to be woken.
         */
        __locks_wake_up_blocks(waiter);
}

/*
 * Locked wrapper around __locks_insert_block(): takes blocked_lock_lock for
 * the duration of the insertion.
 *
 * Must be called with flc_lock held.
 */
static void locks_insert_block(struct file_lock_core *blocker,
                               struct file_lock_core *waiter,
                               bool conflict(struct file_lock_core *,
                                             struct file_lock_core *))
{
        spin_lock(&blocked_lock_lock);
        __locks_insert_block(blocker, waiter, conflict);
        spin_unlock(&blocked_lock_lock);
}

/*
 * Wake up processes blocked waiting for blocker.
 *
 * Must be called with the inode->flc_lock held!
 */
static void locks_wake_up_blocks(struct file_lock_core *blocker)
{
        /*
         * The global lock is only worth taking when there is something
         * queued. Peeking without it is safe: insertions into the
         * fl_blocked_requests list happen only under the flc_lock, which
         * the caller holds. Removals do not need the flc_lock, so the
         * list has to be looked at again once blocked_lock_lock is held.
         */
        if (!list_empty(&blocker->flc_blocked_requests)) {
                spin_lock(&blocked_lock_lock);
                __locks_wake_up_blocks(blocker);
                spin_unlock(&blocked_lock_lock);
        }
}

/*
 * Link @fl into a lock list immediately ahead of the @before entry and
 * register it through locks_insert_global_locks().
 */
static void locks_insert_lock_ctx(struct file_lock_core *fl,
                                  struct list_head *before)
{
        list_add_tail(&fl->flc_list, before);

        locks_insert_global_locks(fl);
}

/*
 * Undo locks_insert_lock_ctx(): drop @fl from the global bookkeeping,
 * unlink it from its list (leaving the list node reinitialised) and
 * wake up whatever was blocked on it.
 */
static void locks_unlink_lock_ctx(struct file_lock_core *fl)
{
        locks_delete_global_locks(fl);

        list_del_init(&fl->flc_list);

        locks_wake_up_blocks(fl);
}

/*
 * Unlink @fl and get rid of it: with a @dispose list the lock is parked
 * there for the caller to free later, without one it is freed right away.
 */
static void locks_delete_lock_ctx(struct file_lock_core *fl,
                                  struct list_head *dispose)
{
        locks_unlink_lock_ctx(fl);

        if (!dispose) {
                locks_free_lock(file_lock(fl));
                return;
        }

        list_add(&fl->flc_list, dispose);
}

/* Type-only conflict check shared by the lock flavours: two locks
 * conflict as soon as either of them is a write lock. Ownership and
 * range overlap are the caller's business.
 */
static bool locks_conflict(struct file_lock_core *caller_flc,
                           struct file_lock_core *sys_flc)
{
        return sys_flc->flc_type == F_WRLCK ||
               caller_flc->flc_type == F_WRLCK;
}

/* POSIX flavour of the conflict test: does sys_flc block caller_flc?
 * Locks of the same owner never conflict, and neither do locks whose
 * ranges do not overlap; whatever remains is decided by
 * locks_conflict().
 */
static bool posix_locks_conflict(struct file_lock_core *caller_flc,
                                 struct file_lock_core *sys_flc)
{
        struct file_lock *caller_fl = file_lock(caller_flc);
        struct file_lock *sys_fl = file_lock(sys_flc);

        return !posix_same_owner(caller_flc, sys_flc) &&
               locks_overlap(caller_fl, sys_fl) &&
               locks_conflict(caller_flc, sys_flc);
}

/* Conflict test for the xx_GETLK path: does sys_fl block caller_fl?
 * Anything but an F_UNLCK query is handed to posix_locks_conflict().
 * An F_UNLCK query instead matches overlapping locks that belong to
 * the same owner.
 */
static bool posix_test_locks_conflict(struct file_lock *caller_fl,
                                      struct file_lock *sys_fl)
{
        struct file_lock_core *caller = &caller_fl->c;
        struct file_lock_core *sys = &sys_fl->c;

        if (!lock_is_unlock(caller_fl))
                return posix_locks_conflict(caller, sys);

        /* F_UNLCK checks any locks on the same fd. */
        return posix_same_owner(caller, sys) &&
               locks_overlap(caller_fl, sys_fl);
}

/* FLOCK flavour of the conflict test: does sys_flc block caller_flc?
 * Two FLOCK locks that refer to the same filp never conflict; for
 * different filps the answer comes from locks_conflict().
 */
static bool flock_locks_conflict(struct file_lock_core *caller_flc,
                                 struct file_lock_core *sys_flc)
{
        return caller_flc->flc_file != sys_flc->flc_file &&
               locks_conflict(caller_flc, sys_flc);
}

/*
 * Look for a POSIX lock on @filp's inode that conflicts with @fl.
 *
 * When one is found, its description is copied into @fl; otherwise
 * @fl's type is set to F_UNLCK. A conflicting lock that its lock
 * manager reports as expirable is not returned: the manager's
 * lm_expire_lock() is called with the flc_lock dropped and the scan
 * starts over.
 */
void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        struct file_lock *cfl;
        void (*expire)(void);
        void *mod;

        ctx = locks_inode_context(inode);
        if (!ctx || list_empty_careful(&ctx->flc_posix)) {
                fl->c.flc_type = F_UNLCK;
                return;
        }

rescan:
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) {
                if (!posix_test_locks_conflict(fl, cfl))
                        continue;

                if (!cfl->fl_lmops || !cfl->fl_lmops->lm_lock_expirable ||
                    !(*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
                        /* A genuine conflict: report it to the caller. */
                        locks_copy_conflock(fl, cfl);
                        spin_unlock(&ctx->flc_lock);
                        return;
                }

                /*
                 * Grab what is needed from the lock while it is still
                 * protected, hold a module reference across the
                 * callback, and rescan after the lock was dropped.
                 */
                mod = cfl->fl_lmops->lm_mod_owner;
                expire = cfl->fl_lmops->lm_expire_lock;
                __module_get(mod);
                spin_unlock(&ctx->flc_lock);
                (*expire)();
                module_put(mod);
                goto rescan;
        }

        /* Nothing conflicts. */
        fl->c.flc_type = F_UNLCK;
        spin_unlock(&ctx->flc_lock);
}
EXPORT_SYMBOL(posix_test_lock);

/*
 * Deadlock detection:
 *
 * We attempt to detect deadlocks that are due purely to posix file
 * locks.
 *
 * We assume that a task can be waiting for at most one lock at a time.
 * So for any acquired lock, the process holding that lock may be
 * waiting on at most one other lock.  That lock in turn may be held by
 * someone waiting for at most one other lock.  Given a requested lock
 * caller_fl which is about to wait for a conflicting lock block_fl, we
 * follow this chain of waiters to ensure we are not about to create a
 * cycle.
 *
 * Since we do this before we ever put a process to sleep on a lock, we
 * are ensured that there is never a cycle; that is what guarantees that
 * the while() loop in posix_locks_deadlock() eventually completes.
 *
 * Note: the above assumption may not be true when handling lock
 * requests from a broken NFS client. It may also fail in the presence
 * of tasks (such as posix threads) sharing the same open file table.
 * To handle those cases, we just bail out after a few iterations.
 *
 * For FL_OFDLCK locks, the owner is the filp, not the files_struct.
 * Because the owner is not even nominally tied to a thread of
 * execution, the deadlock detection below can't reasonably work well. Just
 * skip it for those.
 *
 * In principle, we could do a more limited deadlock detection on FL_OFDLCK
 * locks that just checks for the case where two tasks are attempting to
 * upgrade from read to write locks on the same inode.
 */

/* Bound on the chain steps posix_locks_deadlock() follows before it gives up. */
#define MAX_DEADLK_ITERATIONS 10

/*
 * Look through blocked_hash for a blocked request that has the same
 * owner as @blocker. If there is one, follow its flc_blocker links to
 * the end and return the lock found there; return NULL when that owner
 * has no blocked request.
 */
static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker)
{
        struct file_lock_core *waiter;

        hash_for_each_possible(blocked_hash, waiter, flc_link, posix_owner_key(blocker)) {
                if (!posix_same_owner(waiter, blocker))
                        continue;

                /* Walk up to the top of the chain of blockers. */
                while (waiter->flc_blocker)
                        waiter = waiter->flc_blocker;
                return waiter;
        }

        return NULL;
}

/*
 * Would letting @caller_fl wait on @block_fl close a cycle of waiting
 * owners? Returns true if the chain of "what is this owner waiting
 * for" leads back to the caller's owner, false if it ends or grows
 * past MAX_DEADLK_ITERATIONS.
 *
 * Must be called with the blocked_lock_lock held!
 */
static bool posix_locks_deadlock(struct file_lock *caller_fl,
                                 struct file_lock *block_fl)
{
        struct file_lock_core *caller = &caller_fl->c;
        struct file_lock_core *blocker = &block_fl->c;
        int hops;

        lockdep_assert_held(&blocked_lock_lock);

        /*
         * FL_OFDLCK locks aren't owned by a process, per-se, so this
         * detector has nothing sensible to say about them.
         */
        if (caller->flc_flags & FL_OFDLCK)
                return false;

        for (hops = 0; ; hops++) {
                blocker = what_owner_is_waiting_for(blocker);
                if (!blocker)
                        return false;
                /* Give up on chains that are suspiciously long. */
                if (hops > MAX_DEADLK_ITERATIONS)
                        return false;
                if (posix_same_owner(caller, blocker))
                        return true;
        }
}

/* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
 * after any leases, but before any posix locks.
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 *
 * Return values visible in this function:
 *   0                   - the request was applied (or nothing needed doing)
 *   -ENOMEM             - no lock context or no new lock could be allocated
 *   -ENOENT             - FL_EXISTS unlock request, but no lock was found
 *   -EAGAIN             - a conflicting lock exists and FL_SLEEP is not set
 *   FILE_LOCK_DEFERRED  - a conflicting lock exists and the request has been
 *                         queued on it
 *
 * With FL_ACCESS set only the conflict check is performed; no lock is
 * removed or inserted.
 */
static int flock_lock_inode(struct inode *inode, struct file_lock *request)
{
        struct file_lock *new_fl = NULL;
        struct file_lock *fl;
        struct file_lock_context *ctx;
        int error = 0;
        bool found = false;
        LIST_HEAD(dispose);

        ctx = locks_get_lock_context(inode, request->c.flc_type);
        if (!ctx) {
                /* Without a context an unlock has nothing to remove. */
                if (request->c.flc_type != F_UNLCK)
                        return -ENOMEM;
                return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0;
        }

        /* Allocate the lock to insert before any lock is taken. */
        if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) {
                new_fl = locks_alloc_lock();
                if (!new_fl)
                        return -ENOMEM;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        if (request->c.flc_flags & FL_ACCESS)
                goto find_conflict;

        /*
         * Look for a lock already held through the same struct file. One of
         * the requested type means there is nothing to do; one of a
         * different type is removed here.
         */
        list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
                if (request->c.flc_file != fl->c.flc_file)
                        continue;
                if (request->c.flc_type == fl->c.flc_type)
                        goto out;
                found = true;
                locks_delete_lock_ctx(&fl->c, &dispose);
                break;
        }

        if (lock_is_unlock(request)) {
                if ((request->c.flc_flags & FL_EXISTS) && !found)
                        error = -ENOENT;
                goto out;
        }

find_conflict:
        list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
                if (!flock_locks_conflict(&request->c, &fl->c))
                        continue;
                error = -EAGAIN;
                if (!(request->c.flc_flags & FL_SLEEP))
                        goto out;
                /* The caller may sleep: queue the request on the conflicting lock. */
                error = FILE_LOCK_DEFERRED;
                locks_insert_block(&fl->c, &request->c, flock_locks_conflict);
                goto out;
        }
        if (request->c.flc_flags & FL_ACCESS)
                goto out;
        /* No conflict: turn new_fl into the lock and hand it to the context. */
        locks_copy_lock(new_fl, request);
        locks_move_blocks(new_fl, request);
        locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock);
        new_fl = NULL;
        error = 0;

out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        /* new_fl is still set only if it was never inserted. */
        if (new_fl)
                locks_free_lock(new_fl);
        locks_dispose_list(&dispose);
        trace_flock_lock_inode(inode, request, error);
        return error;
}

/*
 * Apply the POSIX lock request @request to @inode.
 *
 * @conflock, when non-NULL, receives a copy of the conflicting lock if the
 * request runs into one.
 *
 * Return values visible in this function:
 *   0                   - the request was applied, or (FL_ACCESS) no conflict
 *                         was found
 *   -ENOMEM             - no lock context could be obtained for a non-unlock
 *                         request
 *   -EAGAIN             - a conflicting lock exists and FL_SLEEP is not set
 *   -EDEADLK            - waiting on the conflicting lock would deadlock
 *   FILE_LOCK_DEFERRED  - the request has been queued on the conflicting lock
 *   -ENOLCK             - a preallocated file_lock was needed but missing
 *   -ENOENT             - FL_EXISTS unlock request that matched no lock
 */
static int posix_lock_inode(struct inode *inode, struct file_lock *request,
                            struct file_lock *conflock)
{
        struct file_lock *fl, *tmp;
        struct file_lock *new_fl = NULL;
        struct file_lock *new_fl2 = NULL;
        struct file_lock *left = NULL;
        struct file_lock *right = NULL;
        struct file_lock_context *ctx;
        int error;
        bool added = false;
        LIST_HEAD(dispose);
        void *owner;
        void (*func)(void);

        ctx = locks_get_lock_context(inode, request->c.flc_type);
        if (!ctx)
                return lock_is_unlock(request) ? 0 : -ENOMEM;

        /*
         * We may need two file_lock structures for this operation,
         * so we get them in advance to avoid races.
         *
         * In some cases we can be sure, that no new locks will be needed
         *
         * Allocation failures are not checked here; the code below bails
         * out with -ENOLCK at the points where a missing structure would
         * actually be needed.
         */
        if (!(request->c.flc_flags & FL_ACCESS) &&
            (request->c.flc_type != F_UNLCK ||
             request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
                new_fl = locks_alloc_lock();
                new_fl2 = locks_alloc_lock();
        }

retry:
        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /*
         * New lock request. Walk all POSIX locks and look for conflicts. If
         * there are any, either return error or put the request on the
         * blocker's list of waiters and the global blocked_hash.
         */
        if (request->c.flc_type != F_UNLCK) {
                list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
                        if (!posix_locks_conflict(&request->c, &fl->c))
                                continue;
                        /*
                         * An expirable conflicting lock is handed to its lock
                         * manager with both locks dropped, then everything
                         * is retried from the start.
                         */
                        if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
                                && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
                                owner = fl->fl_lmops->lm_mod_owner;
                                func = fl->fl_lmops->lm_expire_lock;
                                __module_get(owner);
                                spin_unlock(&ctx->flc_lock);
                                percpu_up_read(&file_rwsem);
                                (*func)();
                                module_put(owner);
                                goto retry;
                        }
                        if (conflock)
                                locks_copy_conflock(conflock, fl);
                        error = -EAGAIN;
                        if (!(request->c.flc_flags & FL_SLEEP))
                                goto out;
                        /*
                         * Deadlock detection and insertion into the blocked
                         * locks list must be done while holding the same lock!
                         */
                        error = -EDEADLK;
                        spin_lock(&blocked_lock_lock);
                        /*
                         * Ensure that we don't find any locks blocked on this
                         * request during deadlock detection.
                         */
                        __locks_wake_up_blocks(&request->c);
                        if (likely(!posix_locks_deadlock(request, fl))) {
                                error = FILE_LOCK_DEFERRED;
                                __locks_insert_block(&fl->c, &request->c,
                                                     posix_locks_conflict);
                        }
                        spin_unlock(&blocked_lock_lock);
                        goto out;
                }
        }

        /* If we're just looking for a conflict, we're done. */
        error = 0;
        if (request->c.flc_flags & FL_ACCESS)
                goto out;

        /* Find the first old lock with the same owner as the new lock */
        list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
                if (posix_same_owner(&request->c, &fl->c))
                        break;
        }

        /* Process locks with this owner. */
        list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) {
                if (!posix_same_owner(&request->c, &fl->c))
                        break;

                /* Detect adjacent or overlapping regions (if same lock type) */
                if (request->c.flc_type == fl->c.flc_type) {
                        /* In all comparisons of start vs end, use
                         * "start - 1" rather than "end + 1". If end
                         * is OFFSET_MAX, end + 1 will become negative.
                         */
                        if (fl->fl_end < request->fl_start - 1)
                                continue;
                        /* If the next lock in the list has entirely bigger
                         * addresses than the new one, insert the lock here.
                         */
                        if (fl->fl_start - 1 > request->fl_end)
                                break;

                        /* If we come here, the new and old lock are of the
                         * same type and adjacent or overlapping. Make one
                         * lock yielding from the lower start address of both
                         * locks to the higher end address.
                         */
                        if (fl->fl_start > request->fl_start)
                                fl->fl_start = request->fl_start;
                        else
                                request->fl_start = fl->fl_start;
                        if (fl->fl_end < request->fl_end)
                                fl->fl_end = request->fl_end;
                        else
                                request->fl_end = fl->fl_end;
                        /*
                         * Once a lock covering the merged range is in the
                         * list, further mergeable locks are simply dropped.
                         */
                        if (added) {
                                locks_delete_lock_ctx(&fl->c, &dispose);
                                continue;
                        }
                        /* From here on the existing lock stands in for the request. */
                        request = fl;
                        added = true;
                } else {
                        /* Processing for different lock types is a bit
                         * more complex.
                         */
                        if (fl->fl_end < request->fl_start)
                                continue;
                        if (fl->fl_start > request->fl_end)
                                break;
                        /* An unlock that overlaps something needs no lock of its own. */
                        if (lock_is_unlock(request))
                                added = true;
                        /* The old lock sticks out below the request's start. */
                        if (fl->fl_start < request->fl_start)
                                left = fl;
                        /* If the next lock in the list has a higher end
                         * address than the new one, insert the new one here.
                         */
                        if (fl->fl_end > request->fl_end) {
                                right = fl;
                                break;
                        }
                        if (fl->fl_start >= request->fl_start) {
                                /* The new lock completely replaces an old
                                 * one (This may happen several times).
                                 */
                                if (added) {
                                        locks_delete_lock_ctx(&fl->c, &dispose);
                                        continue;
                                }
                                /*
                                 * Replace the old lock with new_fl, and
                                 * remove the old one. It's safe to do the
                                 * insert here since we know that we won't be
                                 * using new_fl later, and that the lock is
                                 * just replacing an existing lock.
                                 */
                                error = -ENOLCK;
                                if (!new_fl)
                                        goto out;
                                locks_copy_lock(new_fl, request);
                                locks_move_blocks(new_fl, request);
                                request = new_fl;
                                new_fl = NULL;
                                locks_insert_lock_ctx(&request->c,
                                                      &fl->c.flc_list);
                                locks_delete_lock_ctx(&fl->c, &dispose);
                                added = true;
                        }
                }
        }

        /*
         * The above code only modifies existing locks in case of merging or
         * replacing. If new lock(s) need to be inserted all modifications are
         * done below this, so it's still safe to bail out.
         */
        error = -ENOLCK; /* "no luck" */
        /* Splitting one lock in two needs the second preallocated lock. */
        if (right && left == right && !new_fl2)
                goto out;

        error = 0;
        if (!added) {
                if (lock_is_unlock(request)) {
                        if (request->c.flc_flags & FL_EXISTS)
                                error = -ENOENT;
                        goto out;
                }

                if (!new_fl) {
                        error = -ENOLCK;
                        goto out;
                }
                /* Insert the new lock ahead of the entry the scan stopped at. */
                locks_copy_lock(new_fl, request);
                locks_move_blocks(new_fl, request);
                locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list);
                fl = new_fl;
                new_fl = NULL;
        }
        if (right) {
                if (left == right) {
                        /* The new lock breaks the old one in two pieces,
                         * so we have to use the second new lock.
                         */
                        left = new_fl2;
                        new_fl2 = NULL;
                        locks_copy_lock(left, right);
                        locks_insert_lock_ctx(&left->c, &fl->c.flc_list);
                }
                /* Trim the right-hand lock so it starts just past the request. */
                right->fl_start = request->fl_end + 1;
                locks_wake_up_blocks(&right->c);
        }
        if (left) {
                /* Trim the left-hand lock so it ends just before the request. */
                left->fl_end = request->fl_start - 1;
                locks_wake_up_blocks(&left->c);
        }
 out:
        /* The tracepoint runs while the flc_lock is still held. */
        trace_posix_lock_inode(inode, request, error);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        /*
         * Free any unused locks.
         */
        if (new_fl)
                locks_free_lock(new_fl);
        if (new_fl2)
                locks_free_lock(new_fl2);
        locks_dispose_list(&dispose);

        return error;
}

/**
 * posix_lock_file - Apply a POSIX-style lock to a file
 * @filp: The file to apply the lock to
 * @fl: The lock to be applied
 * @conflock: Place to return a copy of the conflicting lock, if found.
 *
 * Add a POSIX style lock to a file.
 * We merge adjacent & overlapping locks whenever possible.
 * POSIX locks are sorted by owner task, then by starting address
 *
 * Note that if called with an FL_EXISTS argument, the caller may determine
 * whether or not a lock was successfully freed by testing the return
 * value for -ENOENT.
 */
int posix_lock_file(struct file *filp, struct file_lock *fl,
                        struct file_lock *conflock)
{
        /* All the work happens on the inode behind the file. */
        struct inode *inode = file_inode(filp);

        return posix_lock_inode(inode, fl, conflock);
}
EXPORT_SYMBOL(posix_lock_file);

/**
 * posix_lock_inode_wait - Apply a POSIX-style lock to a file
 * @inode: inode of file to which lock request should be applied
 * @fl: The lock to be applied
 *
 * Apply a POSIX style lock request to an inode.
 */
static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int error;
        might_sleep ();
        for (;;) {
                error = posix_lock_inode(inode, fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                error = wait_event_interruptible(fl->c.flc_wait,
                                                 list_empty(&fl->c.flc_blocked_member));
                if (error)
                        break;
        }
        locks_delete_block(fl);
        return error;
}

/*
 * Clear the pending-break flags that a change to lease type @arg
 * satisfies: F_UNLCK clears both the unlock and the downgrade flag,
 * F_RDLCK clears only the downgrade flag, anything else clears nothing.
 */
static void lease_clear_pending(struct file_lease *fl, int arg)
{
        if (arg == F_UNLCK)
                fl->c.flc_flags &= ~FL_UNLOCK_PENDING;

        if (arg == F_UNLCK || arg == F_RDLCK)
                fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING;
}

/* We already had a lease on this file; just change its type */
int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
{
        int error = assign_type(&fl->c, arg);

        if (error)
                return error;
        lease_clear_pending(fl, arg);
        locks_wake_up_blocks(&fl->c);
        if (arg == F_UNLCK) {
                struct file *filp = fl->c.flc_file;

                f_delown(filp);
                file_f_owner(filp)->signum = 0;
                fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
                if (fl->fl_fasync != NULL) {
                        printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
                        fl->fl_fasync = NULL;
                }
                locks_delete_lock_ctx(&fl->c, dispose);
        }
        return 0;
}
EXPORT_SYMBOL(lease_modify);

/*
 * Has the jiffies timestamp @then already passed?
 * 0 is a special value meaning "this never expires".
 */
static bool past_time(unsigned long then)
{
        return then && time_after(jiffies, then);
}

/*
 * Walk the leases on @inode and carry out the breaks whose deadline has
 * passed: a lease past its downgrade time is changed to F_RDLCK, a
 * lease past its break time is changed to F_UNLCK. @dispose is handed
 * on to lease_modify().
 *
 * The context's flc_lock must be held.
 */
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lease *lease, *next;

        lockdep_assert_held(&ctx->flc_lock);

        /* The _safe walk is needed: lease_modify() may unlink the lease. */
        list_for_each_entry_safe(lease, next, &ctx->flc_lease, c.flc_list) {
                trace_time_out_leases(inode, lease);

                if (past_time(lease->fl_downgrade_time)) {
                        lease_modify(lease, F_RDLCK, dispose);
                }
                if (past_time(lease->fl_break_time)) {
                        lease_modify(lease, F_UNLCK, dispose);
                }
        }
}

/*
 * Does the existing lease @lc conflict with the breaker @bc?
 *
 * There is no conflict when the lease's manager says the breaker owns
 * the lease (this case is not traced), when only one of the two has
 * FL_LAYOUT set, or when an FL_DELEG breaker meets an FL_LEASE lease.
 * Everything else is decided by locks_conflict().
 */
static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc)
{
        struct file_lease *lease = file_lease(lc);
        struct file_lease *breaker = file_lease(bc);
        bool rc;

        if (lease->fl_lmops->lm_breaker_owns_lease
                        && lease->fl_lmops->lm_breaker_owns_lease(lease))
                return false;

        if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT))
                rc = false;
        else if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE))
                rc = false;
        else
                rc = locks_conflict(bc, lc);

        trace_leases_conflict(rc, lease, breaker);
        return rc;
}

/*
 * Report whether at least one lease on @inode conflicts with @breaker.
 *
 * The context's flc_lock must be held.
 */
static bool
any_leases_conflict(struct inode *inode, struct file_lease *breaker)
{
        struct file_lock_context *ctx = inode->i_flctx;
        struct file_lock_core *lease;

        lockdep_assert_held(&ctx->flc_lock);

        list_for_each_entry(lease, &ctx->flc_lease, flc_list) {
                if (!leases_conflict(lease, &breaker->c))
                        continue;
                return true;
        }

        return false;
}

/**
 *        __break_lease        -        revoke all outstanding leases on file
 *        @inode: the inode of the file to return
 *        @mode: O_RDONLY: break only write leases; O_WRONLY or O_RDWR:
 *            break all leases
 *        @type: FL_LEASE: break leases and delegations; FL_DELEG: break
 *            only delegations
 *
 *        break_lease (inlined for speed) has checked there already is at least
 *        some kind of lock (maybe a lease) on this file.  Leases are broken on
 *        a call to open() or truncate().  This function can sleep unless you
 *        specified %O_NONBLOCK to your open().
 */
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        int error = 0;
        struct file_lock_context *ctx;
        struct file_lease *new_fl, *fl, *tmp;
        unsigned long break_time;
        /* Any access mode other than O_RDONLY counts as wanting to write. */
        int want_write = (mode & O_ACCMODE) != O_RDONLY;
        LIST_HEAD(dispose);

        /*
         * new_fl is never inserted into the lease list: it stands in for
         * the breaker in the conflict checks and is what this task blocks
         * on. It is freed on every path out of this function.
         */
        new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
        if (IS_ERR(new_fl))
                return PTR_ERR(new_fl);
        new_fl->c.flc_flags = type;

        /* typically we will check that ctx is non-NULL before calling */
        ctx = locks_inode_context(inode);
        if (!ctx) {
                WARN_ON_ONCE(1);
                goto free_lock;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);

        /* Deal with breaks whose deadline has already passed first. */
        time_out_leases(inode, &dispose);

        if (!any_leases_conflict(inode, new_fl))
                goto out;

        /* Absolute deadline for the breaks started below; 0 = no deadline. */
        break_time = 0;
        if (lease_break_time > 0) {
                break_time = jiffies + lease_break_time * HZ;
                if (break_time == 0)
                        break_time++;        /* so that 0 means no break time */
        }

        /*
         * Mark every conflicting lease as being broken and notify its
         * manager. Leases that already have the relevant break pending
         * are left alone.
         */
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
                if (!leases_conflict(&fl->c, &new_fl->c))
                        continue;
                if (want_write) {
                        if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                                continue;
                        fl->c.flc_flags |= FL_UNLOCK_PENDING;
                        fl->fl_break_time = break_time;
                } else {
                        if (lease_breaking(fl))
                                continue;
                        fl->c.flc_flags |= FL_DOWNGRADE_PENDING;
                        fl->fl_downgrade_time = break_time;
                }
                /* A non-zero return from lm_break removes the lease right away. */
                if (fl->fl_lmops->lm_break(fl))
                        locks_delete_lock_ctx(&fl->c, &dispose);
        }

        if (list_empty(&ctx->flc_lease))
                goto out;

        /* Leases remain; a non-blocking caller does not wait for them. */
        if (mode & O_NONBLOCK) {
                trace_break_lease_noblock(inode, new_fl);
                error = -EWOULDBLOCK;
                goto out;
        }

restart:
        /*
         * Block on the first lease in the list. Its absolute break time is
         * turned into a relative timeout in jiffies, never less than 1.
         */
        fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
        break_time = fl->fl_break_time;
        if (break_time != 0)
                break_time -= jiffies;
        if (break_time == 0)
                break_time++;
        locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
        trace_break_lease_block(inode, new_fl);
        /* Both locks are dropped for the duration of the sleep. */
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
        error = wait_event_interruptible_timeout(new_fl->c.flc_wait,
                                                 list_empty(&new_fl->c.flc_blocked_member),
                                                 break_time);

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        trace_break_lease_unblock(inode, new_fl);
        __locks_delete_block(&new_fl->c);
        /* A negative value is returned to the caller as the error. */
        if (error >= 0) {
                /*
                 * Wait for the next conflicting lease that has not been
                 * broken yet
                 */
                /* NOTE(review): error == 0 is presumably the timeout case - confirm. */
                if (error == 0)
                        time_out_leases(inode, &dispose);
                if (any_leases_conflict(inode, new_fl))
                        goto restart;
                error = 0;
        }
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
free_lock:
        locks_free_lease(new_fl);
        return error;
}
EXPORT_SYMBOL(__break_lease);

/**
 *        lease_get_mtime - update modified time of an inode with exclusive lease
 *        @inode: the inode
 *      @time:  pointer to a timespec which contains the last modified time
 *
 * This is to force NFS clients to flush their caches for files with
 * exclusive leases.  The justification is that if someone has an
 * exclusive lease, then they could be modifying it.
 */
void lease_get_mtime(struct inode *inode, struct timespec64 *time)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock_core *first;
        bool write_leased;

        /* No context or no leases at all: leave *time untouched. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return;

        /* Only the first lease in the list is looked at. */
        spin_lock(&ctx->flc_lock);
        first = list_first_entry_or_null(&ctx->flc_lease,
                                         struct file_lock_core, flc_list);
        write_leased = first && first->flc_type == F_WRLCK;
        spin_unlock(&ctx->flc_lock);

        if (write_leased)
                *time = current_time(inode);
}
EXPORT_SYMBOL(lease_get_mtime);

/**
 * fcntl_getlease - Enquire what lease is currently active
 * @filp: the file
 *
 * Expired leases are timed out first; the answer is then taken from the
 * first lease on the inode that belongs to @filp.
 *
 * The value returned by this function will be one of
 * (if no lease break is pending):
 *
 * %F_RDLCK to indicate a shared lease is held.
 *
 * %F_WRLCK to indicate an exclusive lease is held.
 *
 * %F_UNLCK to indicate no lease is held.
 *
 * (if a lease break is pending):
 *
 * %F_RDLCK to indicate an exclusive lease needs to be
 *         changed to a shared lease (or removed).
 *
 * %F_UNLCK to indicate the lease needs to be removed.
 *
 * XXX: sfr & willy disagree over whether F_INPROGRESS
 * should be returned to userspace.
 */
int fcntl_getlease(struct file *filp)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lease *lease;
        int type = F_UNLCK;
        LIST_HEAD(dispose);

        /* Lockless fast path: no lease on the inode at all. */
        if (!ctx || list_empty_careful(&ctx->flc_lease))
                return type;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        time_out_leases(inode, &dispose);
        list_for_each_entry(lease, &ctx->flc_lease, c.flc_list) {
                if (lease->c.flc_file == filp) {
                        type = target_leasetype(lease);
                        break;
                }
        }
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        /* Leases collected by time_out_leases() are released unlocked. */
        locks_dispose_list(&dispose);
        return type;
}

/**
 * check_conflicting_open - see if the given file points to an inode that has
 *                          an existing open that would conflict with the
 *                          desired lease.
 * @filp:  file to check
 * @arg:   type of lease that we're trying to acquire
 * @flags: current lock flags
 *
 * Check to see if there's an existing open fd on this file that would
 * conflict with the lease we're trying to set.
 *
 * Return: 0 when no conflict is detected (or no check applies), -EAGAIN
 * when a conflicting open exists.
 */
static int
check_conflicting_open(struct file *filp, const int arg, int flags)
{
        struct inode *inode = file_inode(filp);
        int own_writers, own_readers;

        /*
         * FL_LAYOUT requests are never refused here, and for FL_DELEG
         * we leave these checks to the caller.
         */
        if (flags & (FL_LAYOUT | FL_DELEG))
                return 0;

        switch (arg) {
        case F_RDLCK:
                return inode_is_open_for_write(inode) ? -EAGAIN : 0;
        case F_WRLCK:
                break;
        default:
                return 0;
        }

        /*
         * Make sure that only read/write count is from lease requestor.
         * Note that this will result in denying write leases when i_writecount
         * is negative, which is what we want.  (We shouldn't grant write leases
         * on files open for execution.)
         */
        own_writers = (filp->f_mode & FMODE_WRITE) ? 1 : 0;
        own_readers = (!own_writers && (filp->f_mode & FMODE_READ)) ? 1 : 0;

        if (atomic_read(&inode->i_writecount) == own_writers &&
            atomic_read(&inode->i_readcount) == own_readers)
                return 0;

        return -EAGAIN;
}

/*
 * generic_add_lease - install a new lease on @filp or modify an existing one
 * @filp: file the lease applies to
 * @arg:  lease type being requested (never F_UNLCK on this path)
 * @flp:  in: lease to insert; out: set to NULL when the passed-in lease
 *        was inserted into the inode's lease list, left untouched when an
 *        existing lease was modified instead or on error
 * @priv: opaque pointer handed to the lease manager's lm_setup, if any
 *
 * Returns 0 on success, the error from file_f_owner_allocate(), -ENOMEM
 * when no lock context can be obtained, -EAGAIN on a conflict (or when
 * the inode lock cannot be taken for a delegation), -EINVAL when leases
 * are disabled, or the error returned by lm_change.
 */
static int
generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv)
{
        struct file_lease *fl, *my_fl = NULL, *lease;
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        bool is_deleg = (*flp)->c.flc_flags & FL_DELEG;
        int error;
        LIST_HEAD(dispose);

        lease = *flp;
        trace_generic_add_lease(inode, lease);

        error = file_f_owner_allocate(filp);
        if (error)
                return error;

        /* Note that arg is never F_UNLCK here */
        ctx = locks_get_lock_context(inode, arg);
        if (!ctx)
                return -ENOMEM;

        /*
         * In the delegation case we need mutual exclusion with
         * a number of operations that take the i_mutex.  We trylock
         * because delegations are an optional optimization, and if
         * there's some chance of a conflict--we'd rather not
         * bother, maybe that's a sign this just isn't a good file to
         * hand out a delegation on.
         */
        if (is_deleg && !inode_trylock(inode))
                return -EAGAIN;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        /* Expire stale leases first; they are freed after unlocking. */
        time_out_leases(inode, &dispose);
        error = check_conflicting_open(filp, arg, lease->c.flc_flags);
        if (error)
                goto out;

        /*
         * At this point, we know that if there is an exclusive
         * lease on this file, then we hold it on this filp
         * (otherwise our open of this file would have blocked).
         * And if we are trying to acquire an exclusive lease,
         * then the file is not open by anyone (including us)
         * except for this filp.
         */
        error = -EAGAIN;
        list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
                /* Remember our own lease (same file and owner), if any. */
                if (fl->c.flc_file == filp &&
                    fl->c.flc_owner == lease->c.flc_owner) {
                        my_fl = fl;
                        continue;
                }

                /*
                 * No exclusive leases if someone else has a lease on
                 * this file:
                 */
                if (arg == F_WRLCK)
                        goto out;
                /*
                 * Modifying our existing lease is OK, but no getting a
                 * new lease if someone else is opening for write:
                 */
                if (fl->c.flc_flags & FL_UNLOCK_PENDING)
                        goto out;
        }

        /* An existing lease is changed in place; *flp is not inserted. */
        if (my_fl != NULL) {
                lease = my_fl;
                error = lease->fl_lmops->lm_change(lease, arg, &dispose);
                if (error)
                        goto out;
                goto out_setup;
        }

        error = -EINVAL;
        if (!leases_enable)
                goto out;

        locks_insert_lock_ctx(&lease->c, &ctx->flc_lease);
        /*
         * The check in break_lease() is lockless. It's possible for another
         * open to race in after we did the earlier check for a conflicting
         * open but before the lease was inserted. Check again for a
         * conflicting open and cancel the lease if there is one.
         *
         * We also add a barrier here to ensure that the insertion of the lock
         * precedes these checks.
         */
        smp_mb();
        error = check_conflicting_open(filp, arg, lease->c.flc_flags);
        if (error) {
                locks_unlink_lock_ctx(&lease->c);
                goto out;
        }

out_setup:
        /* lm_setup is optional; it runs for new and modified leases alike. */
        if (lease->fl_lmops->lm_setup)
                lease->fl_lmops->lm_setup(lease, priv);
out:
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        if (is_deleg)
                inode_unlock(inode);
        /* Tell the caller that its lease was inserted into the list. */
        if (!error && !my_fl)
                *flp = NULL;
        return error;
}

static int generic_delete_lease(struct file *filp, void *owner)
{
        int error = -EAGAIN;
        struct file_lease *fl, *victim = NULL;
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        LIST_HEAD(dispose);

        ctx = locks_inode_context(inode);
        if (!ctx) {
                trace_generic_delete_lease(inode, NULL);
                return error;
        }

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
                if (fl->c.flc_file == filp &&
                    fl->c.flc_owner == owner) {
                        victim = fl;
                        break;
                }
        }
        trace_generic_delete_lease(inode, victim);
        if (victim)
                error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);
        locks_dispose_list(&dispose);
        return error;
}

/**
 * generic_setlease - sets a lease on an open file
 * @filp: file pointer
 * @arg:  type of lease to obtain
 * @flp:  input - file_lock to use, output - file_lock inserted
 * @priv: private data for lm_setup (may be NULL if lm_setup
 *        doesn't require it); for %F_UNLCK it is dereferenced and
 *        ``*priv`` is used as the owner of the lease to delete
 *
 * The (input) flp->fl_lmops->lm_break function is required
 * by break_lease().
 *
 * Return: result of deleting or adding the lease, -ENOLCK (with a
 * one-time warning) when lm_break is missing, -EINVAL for any other @arg.
 */
int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
                        void **priv)
{
        if (arg == F_UNLCK)
                return generic_delete_lease(filp, *priv);

        if (arg != F_RDLCK && arg != F_WRLCK)
                return -EINVAL;

        if (!(*flp)->fl_lmops->lm_break) {
                WARN_ON_ONCE(1);
                return -ENOLCK;
        }

        return generic_add_lease(filp, arg, flp, priv);
}
EXPORT_SYMBOL(generic_setlease);

/*
 * Kernel subsystems can register to be notified on any attempt to set
 * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd
 * to close files that it may have cached when there is an attempt to set a
 * conflicting lease.
 *
 * Callbacks are invoked from setlease_notifier() with the lease type as
 * the action and the candidate lease as the data; F_UNLCK requests do not
 * trigger a notification.
 */
static struct srcu_notifier_head lease_notifier_chain;

/* Initialise the SRCU notifier head that backs lease_notifier_chain. */
static inline void
lease_notifier_chain_init(void)
{
        srcu_init_notifier_head(&lease_notifier_chain);
}

/*
 * Run the lease notifier chain for an attempt to set a lease of type @arg.
 * Lease removals (F_UNLCK) are not announced.
 */
static inline void
setlease_notifier(int arg, struct file_lease *lease)
{
        if (arg == F_UNLCK)
                return;

        srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
}

/*
 * Add @nb to lease_notifier_chain. Returns whatever
 * srcu_notifier_chain_register() returns.
 */
int lease_register_notifier(struct notifier_block *nb)
{
        return srcu_notifier_chain_register(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_register_notifier);

/*
 * Remove @nb from lease_notifier_chain. The result of the underlying
 * unregister call is not propagated to the caller.
 */
void lease_unregister_notifier(struct notifier_block *nb)
{
        srcu_notifier_chain_unregister(&lease_notifier_chain, nb);
}
EXPORT_SYMBOL_GPL(lease_unregister_notifier);


/*
 * kernel_setlease - set, change or remove a lease without permission checks
 *
 * When @lease is non-NULL the lease notifier chain is run first (it ignores
 * F_UNLCK). The request is then handed to the file's ->setlease operation
 * if it has one, and to generic_setlease() otherwise.
 */
int
kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
        if (lease)
                setlease_notifier(arg, *lease);

        if (!filp->f_op->setlease)
                return generic_setlease(filp, arg, lease, priv);

        return filp->f_op->setlease(filp, arg, lease, priv);
}
EXPORT_SYMBOL_GPL(kernel_setlease);

/**
 * vfs_setlease - sets a lease on an open file
 * @filp:  file pointer
 * @arg:   type of lease to obtain
 * @lease: file_lock to use when adding a lease
 * @priv:  private info for lm_setup when adding a lease (may be
 *         NULL if lm_setup doesn't require it)
 *
 * Call this to establish a lease on the file. The "lease" argument is not
 * used for F_UNLCK requests and may be NULL. For commands that set or alter
 * an existing lease, the ``(*lease)->fl_lmops->lm_break`` operation must be
 * set; if not, this function will return -ENOLCK (and generate a scary-looking
 * stack trace).
 *
 * The "priv" pointer is passed directly to the lm_setup function as-is. It
 * may be NULL if the lm_setup operation doesn't require it.
 *
 * Return: -EACCES when the caller neither owns the file nor has CAP_LEASE,
 * -EINVAL when @filp is not a regular file, the error reported by the
 * security hook, or otherwise the result of kernel_setlease().
 */
int
vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
{
        struct inode *inode = file_inode(filp);
        vfsuid_t owner = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
        int err;

        /* capable() is only consulted when the fsuid does not match. */
        if (!vfsuid_eq_kuid(owner, current_fsuid()) && !capable(CAP_LEASE))
                return -EACCES;

        if (!S_ISREG(inode->i_mode))
                return -EINVAL;

        err = security_file_lock(filp, arg);
        if (!err)
                err = kernel_setlease(filp, arg, lease, priv);

        return err;
}
EXPORT_SYMBOL_GPL(vfs_setlease);

/*
 * Allocate a lease and a fasync entry for @fd and try to install them via
 * vfs_setlease(). Both pointers are passed by reference; whatever is still
 * non-NULL after the call is freed here.
 *
 * Returns the lease allocation error, -ENOMEM when the fasync entry cannot
 * be allocated, or the result of vfs_setlease().
 */
static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
{
        struct file_lease *lease;
        struct fasync_struct *fa;
        int ret;

        lease = lease_alloc(filp, arg);
        if (IS_ERR(lease))
                return PTR_ERR(lease);

        fa = fasync_alloc();
        if (!fa) {
                locks_free_lease(lease);
                return -ENOMEM;
        }
        fa->fa_fd = fd;

        ret = vfs_setlease(filp, arg, &lease, (void **)&fa);

        /* Release whatever the call above left behind in our pointers. */
        if (lease)
                locks_free_lease(lease);
        if (fa)
                fasync_free(fa);

        return ret;
}

/**
 * fcntl_setlease - sets a lease on an open file
 * @fd:   open file descriptor
 * @filp: file pointer
 * @arg:  type of lease to obtain
 *
 * Call this fcntl to establish a lease on the file.
 * Note that you also need to call %F_SETSIG to
 * receive a signal when the lease is broken.
 *
 * An @arg of %F_UNLCK removes the lease instead; @filp itself is then
 * passed as the private owner cookie.
 */
int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
        if (arg != F_UNLCK)
                return do_fcntl_add_lease(fd, filp, arg);

        return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
}

/**
 * flock_lock_inode_wait - Apply a FLOCK-style lock to a file
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a FLOCK style lock request to an inode, retrying for as long as
 * the request is deferred and the wait is not interrupted.
 *
 * Return: the final flock_lock_inode() result, or the error from the
 * interrupted wait.
 */
static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        int error;

        might_sleep();
        do {
                error = flock_lock_inode(inode, fl);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                /* Sleep until this request leaves its blocker's list. */
                error = wait_event_interruptible(fl->c.flc_wait,
                                                 list_empty(&fl->c.flc_blocked_member));
        } while (!error);

        locks_delete_block(fl);
        return error;
}

/**
 * locks_lock_inode_wait - Apply a lock to an inode
 * @inode: inode of the file to apply to
 * @fl: The lock to be applied
 *
 * Apply a POSIX or FLOCK style lock request to an inode. Exactly one of
 * FL_POSIX and FL_FLOCK must be set in the lock's flags; anything else
 * is a BUG().
 */
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        unsigned int style = fl->c.flc_flags & (FL_POSIX|FL_FLOCK);
        int res = 0;

        if (style == FL_POSIX)
                res = posix_lock_inode_wait(inode, fl);
        else if (style == FL_FLOCK)
                res = flock_lock_inode_wait(inode, fl);
        else
                BUG();

        return res;
}
EXPORT_SYMBOL(locks_lock_inode_wait);

/**
 *        sys_flock: - flock() system call.
 *        @fd: the file descriptor to lock.
 *        @cmd: the type of lock to apply.
 *
 *        Apply a %FL_FLOCK style lock to an open file descriptor.
 *        The @cmd can be one of:
 *
 *        - %LOCK_SH -- a shared lock.
 *        - %LOCK_EX -- an exclusive lock.
 *        - %LOCK_UN -- remove an existing lock.
 *        - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED)
 *
 *        %LOCK_MAND support has been removed from the kernel.
 *
 *        %LOCK_NB may be or'ed into @cmd to make the request non-blocking.
 *
 *        Returns 0 for (ignored) %LOCK_MAND requests, a negative value from
 *        the command translation, -EBADF for a bad descriptor or one opened
 *        for neither reading nor writing (unless unlocking), the error from
 *        the security hook, or the result of the lock operation.
 */
SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
        int can_sleep, error, type;
        struct file_lock fl;

        /*
         * LOCK_MAND locks were broken for a long time in that they never
         * conflicted with one another and didn't prevent any sort of open,
         * read or write activity.
         *
         * Just ignore these requests now, to preserve legacy behavior, but
         * throw a warning to let people know that they don't actually work.
         */
        if (cmd & LOCK_MAND) {
                pr_warn_once("%s(%d): Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n", current->comm, current->pid);
                return 0;
        }

        /* LOCK_NB is a modifier; strip it before translating the command. */
        type = flock_translate_cmd(cmd & ~LOCK_NB);
        if (type < 0)
                return type;

        /*
         * Scoped fd lookup; there is no explicit release in this function.
         * NOTE(review): presumably the reference is dropped automatically
         * when @f goes out of scope -- confirm against the CLASS() helper.
         */
        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return -EBADF;

        /* Locking (as opposed to unlocking) needs read or write access. */
        if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
                return -EBADF;

        flock_make_lock(fd_file(f), &fl, type);

        error = security_file_lock(fd_file(f), fl.c.flc_type);
        if (error)
                return error;

        can_sleep = !(cmd & LOCK_NB);
        if (can_sleep)
                fl.c.flc_flags |= FL_SLEEP;

        /* Prefer the filesystem's ->flock method when it provides one. */
        if (fd_file(f)->f_op->flock)
                error = fd_file(f)->f_op->flock(fd_file(f),
                                            (can_sleep) ? F_SETLKW : F_SETLK,
                                            &fl);
        else
                error = locks_lock_file_wait(fd_file(f), &fl);

        locks_release_private(&fl);
        return error;
}

/**
 * vfs_test_lock - test file byte range lock
 * @filp: The file to test lock for
 * @fl: The lock to test; also used to hold result
 *
 * Returns -ERRNO on failure.  Indicates presence of conflicting lock by
 * setting fl->c.flc_type to something other than F_UNLCK.
 *
 * The filesystem's ->lock method is used with F_GETLK when it exists;
 * otherwise posix_test_lock() does the check and 0 is returned.
 */
int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        if (!filp->f_op->lock) {
                posix_test_lock(filp, fl);
                return 0;
        }

        return filp->f_op->lock(filp, F_GETLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_test_lock);

/**
 * locks_translate_pid - translate a lock's flc_pid number into a namespace
 * @fl: The lock core whose flc_pid should be translated
 * @ns: The namespace into which the pid should be translated
 *
 * Used to translate a flc_pid into a namespace virtual pid number.
 *
 * Return: -1 for OFD locks, the raw flc_pid when it is not positive or
 * when @ns is the init pid namespace, otherwise the result of looking the
 * pid up in the init namespace and translating it into @ns.
 */
static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns)
{
        struct pid *owner;
        pid_t nr;

        if (fl->flc_flags & FL_OFDLCK)
                return -1;

        /*
         * Remote locks report a negative pid value, which is passed
         * through unchanged.
         *
         * If the flock owner process is dead and its pid has been already
         * freed, the translation below won't work, but we still want to show
         * flock owner pid number in init pidns.
         */
        if (fl->flc_pid <= 0 || ns == &init_pid_ns)
                return fl->flc_pid;

        rcu_read_lock();
        owner = find_pid_ns(fl->flc_pid, &init_pid_ns);
        nr = pid_nr_ns(owner, ns);
        rcu_read_unlock();

        return nr;
}

/*
 * Convert the kernel lock @fl into the userspace struct flock @flock.
 * A lock that extends to OFFSET_MAX is reported with l_len == 0.
 *
 * Returns 0 on success. On 32-bit builds -EOVERFLOW is returned when the
 * range cannot be represented; l_pid has already been written by then.
 */
static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
#if BITS_PER_LONG == 32
        /*
         * Make sure we can represent the posix lock via
         * legacy 32bit flock.
         */
        if (fl->fl_start > OFFT_OFFSET_MAX ||
            (fl->fl_end != OFFSET_MAX && fl->fl_end > OFFT_OFFSET_MAX))
                return -EOVERFLOW;
#endif
        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;
        flock->l_whence = 0;
        flock->l_type = fl->c.flc_type;

        return 0;
}

#if BITS_PER_LONG == 32
/*
 * Convert the kernel lock @fl into the userspace struct flock64 @flock.
 * A lock that extends to OFFSET_MAX is reported with l_len == 0.
 */
static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
{
        flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
        flock->l_start = fl->fl_start;
        if (fl->fl_end == OFFSET_MAX)
                flock->l_len = 0;
        else
                flock->l_len = fl->fl_end - fl->fl_start + 1;
        flock->l_whence = 0;
        flock->l_type = fl->c.flc_type;
}
#endif

/* Report the first existing lock that would conflict with l.
 * This implements the F_GETLK command of fcntl().
 */
int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
{
        struct file_lock *fl;
        int error;

        fl = locks_alloc_lock();
        if (fl == NULL)
                return -ENOMEM;
        error = -EINVAL;
        if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
                        && flock->l_type != F_WRLCK)
                goto out;

        error = flock_to_posix_lock(filp, fl, flock);
        if (error)
                goto out;

        if (cmd == F_OFD_GETLK) {
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                fl->c.flc_flags |= FL_OFDLCK;
                fl->c.flc_owner = filp;
        }

        error = vfs_test_lock(filp, fl);
        if (error)
                goto out;

        flock->l_type = fl->c.flc_type;
        if (fl->c.flc_type != F_UNLCK) {
                error = posix_lock_to_flock(flock, fl);
                if (error)
                        goto out;
        }
out:
        locks_free_lock(fl);
        return error;
}

/**
 * vfs_lock_file - file byte range lock
 * @filp: The file to apply the lock to
 * @cmd: type of locking operation (F_SETLK, F_GETLK, etc.)
 * @fl: The lock to be applied
 * @conf: Place to return a copy of the conflicting lock, if found.
 *
 * A caller that doesn't care about the conflicting lock may pass NULL
 * as the final argument.
 *
 * If the filesystem defines a private ->lock() method, then @conf will
 * be left unchanged; so a caller that cares should initialize it to
 * some acceptable default.
 *
 * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
 * locks, the ->lock() interface may return asynchronously, before the lock has
 * been granted or denied by the underlying filesystem, if (and only if)
 * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
 * flags need to be set.
 *
 * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
 * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
 * blocking lock. When ->lock() does return asynchronously, it must return
 * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
 * If the request is for non-blocking lock the file system should return
 * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
 * with the result. If the request timed out the callback routine will return a
 * nonzero return code and the file system should release the lock. The file
 * system is also responsible to keep a corresponding posix lock when it
 * grants a lock so the VFS can find out which locks are locally held and do
 * the correct lock cleanup when required.
 * The underlying filesystem must not drop the kernel lock or call
 * ->lm_grant() before returning to the caller with a FILE_LOCK_DEFERRED
 * return code.
 */
int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        /* Without a private ->lock() method, use the generic POSIX code. */
        if (!filp->f_op->lock)
                return posix_lock_file(filp, fl, conf);

        /* @conf is not passed on to the filesystem's method. */
        return filp->f_op->lock(filp, cmd, fl);
}
EXPORT_SYMBOL_GPL(vfs_lock_file);

static int do_lock_file_wait(struct file *filp, unsigned int cmd,
                             struct file_lock *fl)
{
        int error;

        error = security_file_lock(filp, fl->c.flc_type);
        if (error)
                return error;

        for (;;) {
                error = vfs_lock_file(filp, cmd, fl, NULL);
                if (error != FILE_LOCK_DEFERRED)
                        break;
                error = wait_event_interruptible(fl->c.flc_wait,
                                                 list_empty(&fl->c.flc_blocked_member));
                if (error)
                        break;
        }
        locks_delete_block(fl);

        return error;
}

/*
 * Ensure that fl->c.flc_file has compatible f_mode for F_SETLK calls:
 * a read lock needs FMODE_READ and a write lock needs FMODE_WRITE.
 * Other lock types are not checked. Returns 0 or -EBADF.
 */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
        if (fl->c.flc_type == F_RDLCK) {
                if (!(fl->c.flc_file->f_mode & FMODE_READ))
                        return -EBADF;
        } else if (fl->c.flc_type == F_WRLCK) {
                if (!(fl->c.flc_file->f_mode & FMODE_WRITE))
                        return -EBADF;
        }

        return 0;
}

/* Apply the lock described by @flock to an open file descriptor.
 * This implements both the F_SETLK and F_SETLKW commands of fcntl(),
 * as well as their F_OFD_* counterparts.
 *
 * Returns -ENOLCK when no lock structure can be allocated, -EINVAL for an
 * OFD request with a non-zero l_pid, -EBADF when @fd no longer refers to
 * @filp once the lock has been applied, or an error from converting,
 * validating or applying the lock.
 */
int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct inode *inode = file_inode(filp);
        struct file *f;
        int error;

        if (file_lock == NULL)
                return -ENOLCK;

        error = flock_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                /* From here on the request is handled as a plain F_SETLK. */
                cmd = F_SETLK;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                fallthrough;
        case F_SETLKW:
                /* Both waiting variants may sleep. */
                file_lock->c.flc_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Detect close/fcntl races and recover by zapping all POSIX locks
         * associated with this file and our files_struct, just like on
         * filp_flush(). There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->c.flc_type != F_UNLCK &&
            !(file_lock->c.flc_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                /* @fd was closed or reused while we were taking the lock. */
                if (f != filp) {
                        locks_remove_posix(filp, files);
                        error = -EBADF;
                }
        }
out:
        trace_fcntl_setlk(inode, file_lock, error);
        locks_free_lock(file_lock);
        return error;
}

#if BITS_PER_LONG == 32
/* Report the first existing lock that would conflict with l.
 * This implements the F_GETLK command of fcntl().
 */
int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
{
        struct file_lock *fl;
        int error;

        fl = locks_alloc_lock();
        if (fl == NULL)
                return -ENOMEM;

        error = -EINVAL;
        if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK
                        && flock->l_type != F_WRLCK)
                goto out;

        error = flock64_to_posix_lock(filp, fl, flock);
        if (error)
                goto out;

        if (cmd == F_OFD_GETLK) {
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                fl->c.flc_flags |= FL_OFDLCK;
                fl->c.flc_owner = filp;
        }

        error = vfs_test_lock(filp, fl);
        if (error)
                goto out;

        flock->l_type = fl->c.flc_type;
        if (fl->c.flc_type != F_UNLCK)
                posix_lock_to_flock64(flock, fl);

out:
        locks_free_lock(fl);
        return error;
}

/* Apply the lock described by @flock to an open file descriptor.
 * This is the struct flock64 flavour of the F_SETLK and F_SETLKW commands
 * of fcntl(), and also handles their F_OFD_* counterparts.
 *
 * Returns -ENOLCK when no lock structure can be allocated, -EINVAL for an
 * OFD request with a non-zero l_pid, -EBADF when @fd no longer refers to
 * @filp once the lock has been applied, or an error from converting,
 * validating or applying the lock.
 */
int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
                struct flock64 *flock)
{
        struct file_lock *file_lock = locks_alloc_lock();
        struct file *f;
        int error;

        if (file_lock == NULL)
                return -ENOLCK;

        error = flock64_to_posix_lock(filp, file_lock, flock);
        if (error)
                goto out;

        error = check_fmode_for_setlk(file_lock);
        if (error)
                goto out;

        /*
         * If the cmd is requesting file-private locks, then set the
         * FL_OFDLCK flag and override the owner.
         */
        switch (cmd) {
        case F_OFD_SETLK:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                /* From here on the request is handled as F_SETLK64. */
                cmd = F_SETLK64;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                break;
        case F_OFD_SETLKW:
                error = -EINVAL;
                if (flock->l_pid != 0)
                        goto out;

                cmd = F_SETLKW64;
                file_lock->c.flc_flags |= FL_OFDLCK;
                file_lock->c.flc_owner = filp;
                fallthrough;
        case F_SETLKW64:
                /* Both waiting variants may sleep. */
                file_lock->c.flc_flags |= FL_SLEEP;
        }

        error = do_lock_file_wait(filp, cmd, file_lock);

        /*
         * Detect close/fcntl races and recover by zapping all POSIX locks
         * associated with this file and our files_struct, just like on
         * filp_flush(). There is no need to do that when we're
         * unlocking though, or for OFD locks.
         */
        if (!error && file_lock->c.flc_type != F_UNLCK &&
            !(file_lock->c.flc_flags & FL_OFDLCK)) {
                struct files_struct *files = current->files;
                /*
                 * We need that spin_lock here - it prevents reordering between
                 * update of i_flctx->flc_posix and check for it done in
                 * close(). rcu_read_lock() wouldn't do.
                 */
                spin_lock(&files->file_lock);
                f = files_lookup_fd_locked(files, fd);
                spin_unlock(&files->file_lock);
                /* @fd was closed or reused while we were taking the lock. */
                if (f != filp) {
                        locks_remove_posix(filp, files);
                        error = -EBADF;
                }
        }
out:
        locks_free_lock(file_lock);
        return error;
}
#endif /* BITS_PER_LONG == 32 */

/*
 * This function is called when the file is being removed
 * from the task's fd array.  POSIX locks belonging to this task
 * are deleted at this time.
 *
 * The removal is done by applying a whole-file F_UNLCK request on behalf
 * of @owner, flagged FL_POSIX | FL_CLOSE. The result of that request is
 * only handed to the tracepoint.
 */
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx = locks_inode_context(inode);
        struct file_lock unlock;
        int ret;

        /*
         * If there are no locks held on this file, we don't need to call
         * posix_lock_file().  Another process could be setting a lock on this
         * file at the same time, but we wouldn't remove that lock anyway.
         */
        if (!ctx || list_empty(&ctx->flc_posix))
                return;

        locks_init_lock(&unlock);
        unlock.c.flc_type = F_UNLCK;
        unlock.c.flc_flags = FL_POSIX | FL_CLOSE;
        unlock.c.flc_owner = owner;
        unlock.c.flc_pid = current->tgid;
        unlock.c.flc_file = filp;
        unlock.fl_start = 0;
        unlock.fl_end = OFFSET_MAX;
        unlock.fl_ops = NULL;
        unlock.fl_lmops = NULL;

        ret = vfs_lock_file(filp, F_SETLK, &unlock, NULL);

        /* The filesystem may have attached private state to the request. */
        if (unlock.fl_ops && unlock.fl_ops->fl_release_private)
                unlock.fl_ops->fl_release_private(&unlock);
        trace_locks_remove_posix(inode, &unlock, ret);
}
EXPORT_SYMBOL(locks_remove_posix);

/* The i_flctx must be valid when calling into here */
static void
locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
        struct file_lock fl;
        struct inode *inode = file_inode(filp);

        if (list_empty(&flctx->flc_flock))
                return;

        flock_make_lock(filp, &fl, F_UNLCK);
        fl.c.flc_flags |= FL_CLOSE;

        if (filp->f_op->flock)
                filp->f_op->flock(filp, F_SETLKW, &fl);
        else
                flock_lock_inode(inode, &fl);

        if (fl.fl_ops && fl.fl_ops->fl_release_private)
                fl.fl_ops->fl_release_private(&fl);
}

/* The i_flctx must be valid when calling into here */
static void
locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
        struct file_lease *fl, *tmp;
        LIST_HEAD(dispose);

        if (list_empty(&ctx->flc_lease))
                return;

        percpu_down_read(&file_rwsem);
        spin_lock(&ctx->flc_lock);
        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list)
                if (filp == fl->c.flc_file)
                        lease_modify(fl, F_UNLCK, &dispose);
        spin_unlock(&ctx->flc_lock);
        percpu_up_read(&file_rwsem);

        locks_dispose_list(&dispose);
}

/*
 * This function is called on the last close of an open file.
 *
 * It drops OFD locks, flock locks and leases that belong to @filp, and
 * then checks each of the inode's lock lists for leftovers of this file.
 */
void locks_remove_file(struct file *filp)
{
        struct file_lock_context *flctx = locks_inode_context(file_inode(filp));

        if (!flctx)
                return;

        /* remove any OFD locks (they are owned by the file itself) */
        locks_remove_posix(filp, filp);

        /* remove flock locks */
        locks_remove_flock(filp, flctx);

        /* remove any leases */
        locks_remove_lease(filp, flctx);

        /* Nothing belonging to @filp should be left on any list now. */
        spin_lock(&flctx->flc_lock);
        locks_check_ctx_file_list(filp, &flctx->flc_posix, "POSIX");
        locks_check_ctx_file_list(filp, &flctx->flc_flock, "FLOCK");
        locks_check_ctx_file_list(filp, &flctx->flc_lease, "LEASE");
        spin_unlock(&flctx->flc_lock);
}

/**
 * vfs_cancel_lock - file byte range unblock lock
 * @filp: The file to apply the unblock to
 * @fl: The lock to be unblocked
 *
 * Used by lock managers to cancel blocked requests. Warns once if @fl was
 * not taken through @filp.
 *
 * Return: the result of the file's ->lock operation invoked with
 * F_CANCELLK, or 0 when the file has no ->lock operation.
 */
int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        WARN_ON_ONCE(filp != fl->c.flc_file);

        /* Without a ->lock operation there is nothing to forward to. */
        if (!filp->f_op->lock)
                return 0;

        return filp->f_op->lock(filp, F_CANCELLK, fl);
}
EXPORT_SYMBOL_GPL(vfs_cancel_lock);

/**
 * vfs_inode_has_locks - are any file locks held on @inode?
 * @inode: inode to check for locks
 *
 * Return: true if the inode's FL_POSIX or FL_FLOCK list is non-empty,
 * false otherwise (including when the inode has no lock context).
 */
bool vfs_inode_has_locks(struct inode *inode)
{
        struct file_lock_context *ctx = locks_inode_context(inode);
        bool has_locks = false;

        if (ctx) {
                /* Both lists are sampled under the context spinlock. */
                spin_lock(&ctx->flc_lock);
                has_locks = !(list_empty(&ctx->flc_posix) &&
                              list_empty(&ctx->flc_flock));
                spin_unlock(&ctx->flc_lock);
        }

        return has_locks;
}
EXPORT_SYMBOL_GPL(vfs_inode_has_locks);

#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

/*
 * Per-open iteration state for the "locks" proc file (allocated as the
 * seq_file private data by proc_locks_init()).
 */
struct locks_iterator {
        /* CPU cursor handed to seq_hlist_start_percpu()/seq_hlist_next_percpu() */
        int        li_cpu;
        /* Set to *pos + 1 in locks_start(), bumped in locks_next(); printed as the lock id */
        loff_t        li_pos;
};

/*
 * Emit one line describing the lock (or lease) @flc into the seq_file @f.
 *
 * @f:      seq_file being filled
 * @flc:    core of the lock or lease to describe
 * @id:     number printed at the start of the line
 * @pfx:    prefix printed after the id when @repeat is non-zero
 * @repeat: nesting level; when non-zero, @pfx is right-aligned in a field
 *          of (repeat - 1 + strlen(pfx)) characters
 *
 * The byte range is only printed for FL_POSIX locks; everything else is
 * reported as "0 EOF".
 */
static void lock_get_status(struct seq_file *f, struct file_lock_core *flc,
                            loff_t id, char *pfx, int repeat)
{
        struct inode *inode = NULL;
        unsigned int pid;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        int type = flc->flc_type;
        struct file_lock *fl = file_lock(flc);

        /*
         * If lock owner is dead (and pid is freed) or not visible in current
         * pidns, zero is shown as a pid value. Check lock info from
         * init_pid_ns to get saved lock pid value.
         */
        pid = locks_translate_pid(flc, proc_pidns);

        if (flc->flc_file != NULL)
                inode = file_inode(flc->flc_file);

        seq_printf(f, "%lld: ", id);

        if (repeat)
                seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);

        if (flc->flc_flags & FL_POSIX) {
                if (flc->flc_flags & FL_ACCESS)
                        seq_puts(f, "ACCESS");
                else if (flc->flc_flags & FL_OFDLCK)
                        seq_puts(f, "OFDLCK");
                else
                        seq_puts(f, "POSIX ");

                seq_printf(f, " %s ",
                             (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
        } else if (flc->flc_flags & FL_FLOCK) {
                seq_puts(f, "FLOCK  ADVISORY  ");
        } else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) {
                struct file_lease *lease = file_lease(flc);

                /* For leases the reported type comes from target_leasetype(). */
                type = target_leasetype(lease);

                if (flc->flc_flags & FL_DELEG)
                        seq_puts(f, "DELEG  ");
                else
                        seq_puts(f, "LEASE  ");

                if (lease_breaking(lease))
                        seq_puts(f, "BREAKING  ");
                else if (flc->flc_file)
                        seq_puts(f, "ACTIVE    ");
                else
                        seq_puts(f, "BREAKER   ");
        } else {
                seq_puts(f, "UNKNOWN UNKNOWN  ");
        }

        seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
                             (type == F_RDLCK) ? "READ" : "UNLCK");
        if (inode) {
                /* userspace relies on this representation of dev_t */
                seq_printf(f, "%d %02x:%02x:%lu ", pid,
                                MAJOR(inode->i_sb->s_dev),
                                MINOR(inode->i_sb->s_dev), inode->i_ino);
        } else {
                seq_printf(f, "%d <none>:0 ", pid);
        }
        if (flc->flc_flags & FL_POSIX) {
                /*
                 * Use the standard "ll" length modifier for the offsets;
                 * "L" is only defined for long double conversions in ISO C.
                 */
                if (fl->fl_end == OFFSET_MAX)
                        seq_printf(f, "%lld EOF\n", fl->fl_start);
                else
                        seq_printf(f, "%lld %lld\n", fl->fl_start, fl->fl_end);
        } else {
                seq_puts(f, "0 EOF\n");
        }
}

/*
 * Return the entry following @node on its blocker's flc_blocked_requests
 * list, or NULL when @node is NULL, has no blocker, or is the last entry.
 */
static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node)
{
        struct file_lock_core *next;

        /* A NULL node or a root node (no blocker) has no sibling. */
        if (!node || !node->flc_blocker)
                return NULL;

        next = list_next_entry(node, flc_blocked_member);

        /* The next member in the linked list could be the node itself. */
        if (next == node)
                return NULL;

        /* Reaching the list head means @node was the last blocked member. */
        if (list_entry_is_head(next, &node->flc_blocker->flc_blocked_requests,
                               flc_blocked_member))
                return NULL;

        return next;
}

/*
 * seq_file ->show callback: print the lock at @v followed by every request
 * blocked on it (directly or transitively), each prefixed with "-> " and
 * indented by its nesting level. Nothing is printed when
 * locks_translate_pid() yields 0 for the proc mount's pid namespace.
 * Always returns 0.
 */
static int locks_show(struct seq_file *f, void *v)
{
        struct locks_iterator *iter = f->private;
        struct file_lock_core *cur, *tmp;
        struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
        int level = 0;

        cur = hlist_entry(v, struct file_lock_core, flc_link);

        if (locks_translate_pid(cur, proc_pidns) == 0)
                return 0;

        /*
         * View this crossed linked list as a binary tree: the first member
         * of flc_blocked_requests is the left child of the current node, the
         * next sibling in flc_blocked_member is the right child, and the
         * parent of the current node is reachable through flc_blocker, so
         * the problem becomes the traversal of a binary tree.
         */
        while (cur != NULL) {
                if (level)
                        lock_get_status(f, cur, iter->li_pos, "-> ", level);
                else
                        lock_get_status(f, cur, iter->li_pos, "", level);

                if (!list_empty(&cur->flc_blocked_requests)) {
                        /* Turn left */
                        cur = list_first_entry_or_null(&cur->flc_blocked_requests,
                                                       struct file_lock_core,
                                                       flc_blocked_member);
                        level++;
                } else {
                        /* Turn right */
                        tmp = get_next_blocked_member(cur);
                        /* Fall back to parent node */
                        while (tmp == NULL && cur->flc_blocker != NULL) {
                                cur = cur->flc_blocker;
                                level--;
                                tmp = get_next_blocked_member(cur);
                        }
                        /* tmp == NULL here ends the outer loop */
                        cur = tmp;
                }
        }

        return 0;
}

/*
 * Print a "lock:" line for every entry on @head that was taken through
 * @filp and is owned by either @files or @filp. @id is incremented before
 * each printed entry and used as that entry's number.
 */
static void __show_fd_locks(struct seq_file *f,
                        struct list_head *head, int *id,
                        struct file *filp, struct files_struct *files)
{
        struct file_lock_core *flc;

        list_for_each_entry(flc, head, flc_list) {
                if (flc->flc_file == filp &&
                    (flc->flc_owner == files || flc->flc_owner == filp)) {
                        ++*id;
                        seq_puts(f, "lock:\t");
                        lock_get_status(f, flc, *id, "", 0);
                }
        }
}

void show_fd_locks(struct seq_file *f,
                  struct file *filp, struct files_struct *files)
{
        struct inode *inode = file_inode(filp);
        struct file_lock_context *ctx;
        int id = 0;

        ctx = locks_inode_context(inode);
        if (!ctx)
                return;

        spin_lock(&ctx->flc_lock);
        __show_fd_locks(f, &ctx->flc_flock, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_posix, &id, filp, files);
        __show_fd_locks(f, &ctx->flc_lease, &id, filp, files);
        spin_unlock(&ctx->flc_lock);
}

/*
 * seq_file ->start callback: record the 1-based position, take file_rwsem
 * for writing plus blocked_lock_lock, and position the iterator at *pos in
 * the per-cpu file_lock_list. Both locks are dropped in locks_stop().
 */
static void *locks_start(struct seq_file *f, loff_t *pos)
        __acquires(&blocked_lock_lock)
{
        struct locks_iterator *iter = f->private;
        loff_t offset = *pos;

        iter->li_pos = offset + 1;

        percpu_down_write(&file_rwsem);
        spin_lock(&blocked_lock_lock);

        return seq_hlist_start_percpu(&file_lock_list.hlist, &iter->li_cpu,
                                      offset);
}

/*
 * seq_file ->next callback: bump the printed position counter and advance
 * to the next entry of the per-cpu file_lock_list.
 */
static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
{
        struct locks_iterator *iter = f->private;

        ++iter->li_pos;
        return seq_hlist_next_percpu(v, &file_lock_list.hlist, &iter->li_cpu, pos);
}

/*
 * seq_file ->stop callback: release the locks taken in locks_start(), in
 * the reverse order of acquisition.
 */
static void locks_stop(struct seq_file *f, void *v)
        __releases(&blocked_lock_lock)
{
        spin_unlock(&blocked_lock_lock);
        percpu_up_write(&file_rwsem);
}

/* seq_file callbacks registered for the "locks" proc entry by proc_locks_init() */
static const struct seq_operations locks_seq_operations = {
        .start        = locks_start,
        .next        = locks_next,
        .stop        = locks_stop,
        .show        = locks_show,
};

/*
 * Register the "locks" proc entry (mode 0, NULL parent) backed by
 * locks_seq_operations, with a struct locks_iterator as per-open private
 * data. The return value of proc_create_seq_private() is not checked;
 * this initcall always returns 0.
 */
static int __init proc_locks_init(void)
{
        proc_create_seq_private("locks", 0, NULL, &locks_seq_operations,
                        sizeof(struct locks_iterator), NULL);
        return 0;
}
fs_initcall(proc_locks_init);
#endif

/*
 * Boot-time setup for file locking: create the three slab caches (all with
 * SLAB_PANIC), initialise every possible CPU's file_lock_list entry and
 * set up the lease notifier chain. Always returns 0.
 */
static int __init filelock_init(void)
{
        int cpu;

        flctx_cache = kmem_cache_create("file_lock_ctx",
                        sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
        filelock_cache = kmem_cache_create("file_lock_cache",
                        sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
        filelease_cache = kmem_cache_create("file_lease_cache",
                        sizeof(struct file_lease), 0, SLAB_PANIC, NULL);

        /* Each per-cpu list gets its own spinlock and an empty hlist head. */
        for_each_possible_cpu(cpu) {
                struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, cpu);

                spin_lock_init(&fll->lock);
                INIT_HLIST_HEAD(&fll->hlist);
        }

        lease_notifier_chain_init();

        return 0;
}
core_initcall(filelock_init);


























































  238 




































































































































   16 








  146 

  123 

  249 





































  301 

  168 


  235 





   22 

















  137 





















   17 






   59 








  167 


   16 


  161 





  328 







  329 
  309 
   79 






  151 
  249 
  269 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
  Red Black Trees
  (C) 1999  Andrea Arcangeli <andrea@suse.de>
  (C) 2002  David Woodhouse <dwmw2@infradead.org>
  (C) 2012  Michel Lespinasse <walken@google.com>


  linux/include/linux/rbtree_augmented.h
*/

#ifndef _LINUX_RBTREE_AUGMENTED_H
#define _LINUX_RBTREE_AUGMENTED_H

#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/rcupdate.h>

/*
 * Please note - only struct rb_augment_callbacks and the prototypes for
 * rb_insert_augmented() and rb_erase_augmented() are intended to be public.
 * The rest are implementation details you are not expected to depend on.
 *
 * See Documentation/core-api/rbtree.rst for documentation and samples.
 */

/*
 * Callbacks through which the rbtree code keeps user-defined augmented
 * data consistent. RB_DECLARE_CALLBACKS() generates an instance.
 */
struct rb_augment_callbacks {
        /* Recompute augmented data from @node upwards until @stop is reached */
        void (*propagate)(struct rb_node *node, struct rb_node *stop);
        /* Copy the augmented data of @old into @new */
        void (*copy)(struct rb_node *old, struct rb_node *new);
        /* Passed to __rb_insert_augmented()/__rb_erase_color() as augment_rotate */
        void (*rotate)(struct rb_node *old, struct rb_node *new);
};

/*
 * Out-of-line insertion fixup, defined outside this header. Called by
 * rb_insert_augmented() with the user's ->rotate callback.
 */
extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Rebalance the rbtree after an insertion while keeping the augmented
 * information up to date.
 *
 * The caller first updates the augmented information along the path to the
 * new node and links it with rb_link_node() as usual, then calls this
 * function in place of rb_insert_color(). Whenever the rebalancing touches
 * the tree, the user-supplied ->rotate callback is invoked so that the
 * augmented information of the affected subtrees can be refreshed.
 */
static inline void
rb_insert_augmented(struct rb_node *node, struct rb_root *root,
                    const struct rb_augment_callbacks *augment)
{
        void (*rotate_cb)(struct rb_node *old, struct rb_node *new);

        rotate_cb = augment->rotate;
        __rb_insert_augmented(node, root, rotate_cb);
}

/*
 * Cached-root variant of rb_insert_augmented(): when @newleft is true the
 * cached leftmost pointer is set to @node before the tree is rebalanced.
 */
static inline void
rb_insert_augmented_cached(struct rb_node *node,
                           struct rb_root_cached *root, bool newleft,
                           const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (newleft)
                root->rb_leftmost = node;

        rb_insert_augmented(node, plain_root, augment);
}

/*
 * Insert @node into the cached augmented tree @tree, ordered by @less.
 *
 * Descends left whenever less(@node, parent) is true and right otherwise,
 * links the node, propagates augmented data from its parent to the root
 * and rebalances.
 *
 * Returns @node when it became the new leftmost node, NULL otherwise.
 */
static __always_inline struct rb_node *
rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree,
                        bool (*less)(struct rb_node *, const struct rb_node *),
                        const struct rb_augment_callbacks *augment)
{
        struct rb_node **slot = &tree->rb_root.rb_node;
        struct rb_node *parent = NULL;
        bool leftmost = true;

        for (; *slot; ) {
                parent = *slot;
                if (!less(node, parent)) {
                        /* Any step to the right rules out being leftmost. */
                        slot = &parent->rb_right;
                        leftmost = false;
                } else {
                        slot = &parent->rb_left;
                }
        }

        rb_link_node(node, parent, slot);
        augment->propagate(parent, NULL); /* suboptimal */
        rb_insert_augmented_cached(node, tree, leftmost, augment);

        if (!leftmost)
                return NULL;
        return node;
}

/*
 * Template for declaring augmented rbtree callbacks (generic case)
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that recomputes the RBAUGMENTED data
 *
 * RBCOMPUTE is invoked as RBCOMPUTE(node, exit) with an RBSTRUCT pointer
 * and a bool, and its result is used as a boolean.
 *
 * The macro expands to three functions and the callbacks structure:
 *   RBNAME_propagate: walks from @rb towards the root via rb_parent(),
 *                     calling RBCOMPUTE(node, true) on each node; stops at
 *                     @stop or as soon as RBCOMPUTE returns true.
 *   RBNAME_copy:      copies RBAUGMENTED from the old node to the new one.
 *   RBNAME_rotate:    copies RBAUGMENTED from old to new, then recomputes
 *                     the old node with RBCOMPUTE(old, false).
 */

#define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                \
                             RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE)        \
static inline void                                                        \
RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop)                \
{                                                                        \
        while (rb != stop) {                                                \
                RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD);        \
                if (RBCOMPUTE(node, true))                                \
                        break;                                                \
                rb = rb_parent(&node->RBFIELD);                                \
        }                                                                \
}                                                                        \
static inline void                                                        \
RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new)                \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
}                                                                        \
static void                                                                \
RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new)        \
{                                                                        \
        RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD);                \
        RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD);                \
        new->RBAUGMENTED = old->RBAUGMENTED;                                \
        RBCOMPUTE(old, false);                                                \
}                                                                        \
RBSTATIC const struct rb_augment_callbacks RBNAME = {                        \
        .propagate = RBNAME ## _propagate,                                \
        .copy = RBNAME ## _copy,                                        \
        .rotate = RBNAME ## _rotate                                        \
};

/*
 * Template for declaring augmented rbtree callbacks,
 * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes.
 *
 * RBSTATIC:    'static' or empty
 * RBNAME:      name of the rb_augment_callbacks structure
 * RBSTRUCT:    struct type of the tree nodes
 * RBFIELD:     name of struct rb_node field within RBSTRUCT
 * RBTYPE:      type of the RBAUGMENTED field
 * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree
 * RBCOMPUTE:   name of function that returns the per-node RBTYPE scalar
 *
 * The generated RBNAME_compute_max(node, exit) takes the maximum of
 * RBCOMPUTE(node) and the RBAUGMENTED values of the children that exist.
 * If @exit is true and the node already holds that maximum it returns true
 * without writing; otherwise it stores the maximum and returns false.
 * That helper is then passed to RB_DECLARE_CALLBACKS() as its RBCOMPUTE
 * argument.
 */

#define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD,              \
                                 RBTYPE, RBAUGMENTED, RBCOMPUTE)              \
static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit)              \
{                                                                              \
        RBSTRUCT *child;                                                      \
        RBTYPE max = RBCOMPUTE(node);                                              \
        if (node->RBFIELD.rb_left) {                                              \
                child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD);   \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (node->RBFIELD.rb_right) {                                              \
                child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD);  \
                if (child->RBAUGMENTED > max)                                      \
                        max = child->RBAUGMENTED;                              \
        }                                                                      \
        if (exit && node->RBAUGMENTED == max)                                      \
                return true;                                                      \
        node->RBAUGMENTED = max;                                              \
        return false;                                                              \
}                                                                              \
RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME,                                              \
                     RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max)


/* Node colours; the colour lives in bit 0 of rb_node.__rb_parent_color. */
#define        RB_RED                0
#define        RB_BLACK        1

/*
 * Extract the parent pointer from a packed parent/colour word by clearing
 * its two low bits. The argument is parenthesized so that an expression
 * argument (e.g. "a | b") is masked as a whole rather than only its last
 * operand.
 */
#define __rb_parent(pc)    ((struct rb_node *)((pc) & ~3))

/* Colour helpers operating on a packed parent/colour word */
#define __rb_color(pc)     ((pc) & 1)
#define __rb_is_black(pc)  __rb_color(pc)
#define __rb_is_red(pc)    (!__rb_color(pc))

/* Colour helpers operating on a node pointer */
#define rb_color(rb)       __rb_color((rb)->__rb_parent_color)
#define rb_is_red(rb)      __rb_is_red((rb)->__rb_parent_color)
#define rb_is_black(rb)    __rb_is_black((rb)->__rb_parent_color)

/* Point @rb at parent @p while preserving @rb's current colour bit. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
        unsigned long color = rb_color(rb);

        rb->__rb_parent_color = color + (unsigned long)p;
}

/* Set both the parent pointer and the colour of @rb in one store. */
static inline void rb_set_parent_color(struct rb_node *rb,
                                       struct rb_node *p, int color)
{
        unsigned long packed = (unsigned long)p;

        packed += color;
        rb->__rb_parent_color = packed;
}

/*
 * Replace @old with @new in the child slot that refers to it: the left or
 * right link of @parent, or the root pointer when @parent is NULL.
 * The store is done with WRITE_ONCE().
 */
static inline void
__rb_change_child(struct rb_node *old, struct rb_node *new,
                  struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                WRITE_ONCE(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                WRITE_ONCE(parent->rb_left, new);
        else
                WRITE_ONCE(parent->rb_right, new);
}

/*
 * Same as __rb_change_child(), but the slot is published with
 * rcu_assign_pointer() instead of WRITE_ONCE().
 */
static inline void
__rb_change_child_rcu(struct rb_node *old, struct rb_node *new,
                      struct rb_node *parent, struct rb_root *root)
{
        if (!parent) {
                rcu_assign_pointer(root->rb_node, new);
                return;
        }

        if (parent->rb_left == old)
                rcu_assign_pointer(parent->rb_left, new);
        else
                rcu_assign_pointer(parent->rb_right, new);
}

/*
 * Out-of-line erase fixup, defined outside this header. Called by
 * rb_erase_augmented() with the node returned by __rb_erase_augmented()
 * and the user's ->rotate callback.
 */
extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
        void (*augment_rotate)(struct rb_node *old, struct rb_node *new));

/*
 * Unlink @node from the tree rooted at @root, keeping the augmented data
 * up to date through @augment->copy and @augment->propagate.
 *
 * Returns the node from which rebalancing must start (rb_erase_augmented()
 * hands it to __rb_erase_color()), or NULL when the colours could be fixed
 * up locally and no rebalancing pass is needed.
 */
static __always_inline struct rb_node *
__rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                     const struct rb_augment_callbacks *augment)
{
        struct rb_node *child = node->rb_right;
        struct rb_node *tmp = node->rb_left;
        struct rb_node *parent, *rebalance;
        unsigned long pc;

        if (!tmp) {
                /*
                 * Case 1: node to erase has no more than 1 child (easy!)
                 *
                 * Note that if there is one child it must be red due to 5)
                 * and node must be black due to 4). We adjust colors locally
                 * so as to bypass __rb_erase_color() later on.
                 */
                pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, child, parent, root);
                if (child) {
                        /* The child inherits node's parent and colour */
                        child->__rb_parent_color = pc;
                        rebalance = NULL;
                } else
                        rebalance = __rb_is_black(pc) ? parent : NULL;
                /* Augmented data is re-propagated from the old parent */
                tmp = parent;
        } else if (!child) {
                /* Still case 1, but this time the child is node->rb_left */
                tmp->__rb_parent_color = pc = node->__rb_parent_color;
                parent = __rb_parent(pc);
                __rb_change_child(node, tmp, parent, root);
                rebalance = NULL;
                tmp = parent;
        } else {
                struct rb_node *successor = child, *child2;

                tmp = child->rb_left;
                if (!tmp) {
                        /*
                         * Case 2: node's successor is its right child
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (s)  ->  (x) (c)
                         *        \
                         *        (c)
                         */
                        parent = successor;
                        child2 = successor->rb_right;

                        augment->copy(node, successor);
                } else {
                        /*
                         * Case 3: node's successor is leftmost under
                         * node's right child subtree
                         *
                         *    (n)          (s)
                         *    / \          / \
                         *  (x) (y)  ->  (x) (y)
                         *      /            /
                         *    (p)          (p)
                         *    /            /
                         *  (s)          (c)
                         *    \
                         *    (c)
                         */
                        do {
                                parent = successor;
                                successor = tmp;
                                tmp = tmp->rb_left;
                        } while (tmp);
                        child2 = successor->rb_right;
                        WRITE_ONCE(parent->rb_left, child2);
                        WRITE_ONCE(successor->rb_right, child);
                        rb_set_parent(child, successor);

                        augment->copy(node, successor);
                        augment->propagate(parent, successor);
                }

                /* The successor adopts node's left subtree */
                tmp = node->rb_left;
                WRITE_ONCE(successor->rb_left, tmp);
                rb_set_parent(tmp, successor);

                /* ... and takes node's place under node's parent */
                pc = node->__rb_parent_color;
                tmp = __rb_parent(pc);
                __rb_change_child(node, successor, tmp, root);

                if (child2) {
                        rb_set_parent_color(child2, parent, RB_BLACK);
                        rebalance = NULL;
                } else {
                        /* Successor's own colour is read before it is overwritten below */
                        rebalance = rb_is_black(successor) ? parent : NULL;
                }
                successor->__rb_parent_color = pc;
                tmp = successor;
        }

        augment->propagate(tmp, NULL);
        return rebalance;
}

/*
 * Remove @node from the augmented tree @root and, when the unlink step
 * reports a node to rebalance from, run the colour fixup with the user's
 * ->rotate callback.
 */
static __always_inline void
rb_erase_augmented(struct rb_node *node, struct rb_root *root,
                   const struct rb_augment_callbacks *augment)
{
        struct rb_node *fixup_from;

        fixup_from = __rb_erase_augmented(node, root, augment);
        if (!fixup_from)
                return;

        __rb_erase_color(fixup_from, root, augment->rotate);
}

/*
 * Cached-root variant of rb_erase_augmented(): if @node is the cached
 * leftmost node, the cache is advanced to rb_next(@node) before the node
 * is erased.
 */
static __always_inline void
rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root,
                          const struct rb_augment_callbacks *augment)
{
        struct rb_root *plain_root = &root->rb_root;

        if (node == root->rb_leftmost)
                root->rb_leftmost = rb_next(node);

        rb_erase_augmented(node, plain_root, augment);
}

#endif        /* _LINUX_RBTREE_AUGMENTED_H */













































































































































   34 
































    1 









    1 
































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_USER_NAMESPACE_H
#define _LINUX_USER_NAMESPACE_H

#include <linux/kref.h>
#include <linux/nsproxy.h>
#include <linux/ns_common.h>
#include <linux/rculist_nulls.h>
#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/rcuref.h>
#include <linux/rwsem.h>
#include <linux/sysctl.h>
#include <linux/err.h>

/* Number of extents stored inline in struct uid_gid_map (size of extent[]) */
#define UID_GID_MAP_MAX_BASE_EXTENTS 5
/* NOTE(review): presumably the overall cap on extents per map -- not used in this header, confirm */
#define UID_GID_MAP_MAX_EXTENTS 340

/*
 * One extent of an ID map: three u32 values (12 bytes).
 * NOTE(review): field meanings below are inferred from the names only;
 * confirm against the mapping code.
 */
struct uid_gid_extent {
        u32 first;        /* presumably first ID of the range inside the namespace */
        u32 lower_first;        /* presumably first ID of the range in the parent namespace */
        u32 count;        /* presumably number of consecutive IDs covered */
};

/*
 * ID map storage. The anonymous union overlays two layouts: an inline
 * array of UID_GID_MAP_MAX_BASE_EXTENTS extents followed by nr_extents,
 * or a pair of pointers to extent arrays. The pointers share storage with
 * the start of extent[], not with nr_extents.
 */
struct uid_gid_map { /* 64 bytes -- 1 cache line */
        union {
                struct {
                        /* inline extents: 5 * 12 bytes, then the 4-byte count */
                        struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS];
                        u32 nr_extents;
                };
                struct {
                        /* NOTE(review): presumably externally allocated arrays used for larger maps -- confirm */
                        struct uid_gid_extent *forward;
                        struct uid_gid_extent *reverse;
                };
        };
};

/* Flag bit; NOTE(review): presumably stored in user_namespace.flags -- confirm */
#define USERNS_SETGROUPS_ALLOWED 1UL

/* Initial flag set: just USERNS_SETGROUPS_ALLOWED */
#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED

/* Forward declaration; the full definition follows struct user_namespace */
struct ucounts;

/*
 * Index into user_namespace.ucount_max[] and ucounts.ucount[]. The
 * inotify and fanotify entries exist only when the corresponding config
 * option is enabled, so the numeric values after UCOUNT_TIME_NAMESPACES
 * depend on the kernel configuration.
 */
enum ucount_type {
        UCOUNT_USER_NAMESPACES,
        UCOUNT_PID_NAMESPACES,
        UCOUNT_UTS_NAMESPACES,
        UCOUNT_IPC_NAMESPACES,
        UCOUNT_NET_NAMESPACES,
        UCOUNT_MNT_NAMESPACES,
        UCOUNT_CGROUP_NAMESPACES,
        UCOUNT_TIME_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_INOTIFY_INSTANCES,
        UCOUNT_INOTIFY_WATCHES,
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_FANOTIFY_GROUPS,
        UCOUNT_FANOTIFY_MARKS,
#endif
        /* Number of entries above; used as the array size */
        UCOUNT_COUNTS,
};

/* Index into user_namespace.rlimit_max[] and ucounts.rlimit[] */
enum rlimit_type {
        UCOUNT_RLIMIT_NPROC,
        UCOUNT_RLIMIT_MSGQUEUE,
        UCOUNT_RLIMIT_SIGPENDING,
        UCOUNT_RLIMIT_MEMLOCK,
        /* Number of entries above; used as the array size */
        UCOUNT_RLIMIT_COUNTS,
};

#if IS_ENABLED(CONFIG_BINFMT_MISC)
/* Forward declaration for the pointer member of struct user_namespace */
struct binfmt_misc;
#endif

/*
 * A user namespace. The layout is randomized (__randomize_layout), so
 * code must not depend on member order.
 */
struct user_namespace {
        /* ID maps for uids, gids and project ids */
        struct uid_gid_map        uid_map;
        struct uid_gid_map        gid_map;
        struct uid_gid_map        projid_map;
        struct user_namespace        *parent;
        int                        level;
        kuid_t                        owner;
        kgid_t                        group;
        /* ns.count is the reference count used by get_user_ns()/put_user_ns() */
        struct ns_common        ns;
        unsigned long                flags;
        /* parent_could_setfcap: true if the creator of this ns had CAP_SETFCAP
         * in its effective capability set at the child ns creation time. */
        bool                        parent_could_setfcap;

#ifdef CONFIG_KEYS
        /* List of joinable keyrings in this namespace.  Modification access of
         * these pointers is controlled by keyring_sem.  Once
         * user_keyring_register is set, it won't be changed, so it can be
         * accessed directly with READ_ONCE().
         */
        struct list_head        keyring_name_list;
        struct key                *user_keyring_register;
        struct rw_semaphore        keyring_sem;
#endif

        /* Register of per-UID persistent keyrings for this namespace */
#ifdef CONFIG_PERSISTENT_KEYRINGS
        struct key                *persistent_keyring_register;
#endif
        struct work_struct        work;
#ifdef CONFIG_SYSCTL
        struct ctl_table_set        set;
        struct ctl_table_header *sysctls;
#endif
        struct ucounts                *ucounts;
        /* Indexed by enum ucount_type */
        long ucount_max[UCOUNT_COUNTS];
        /* Indexed by enum rlimit_type; see get/set_userns_rlimit_max() */
        long rlimit_max[UCOUNT_RLIMIT_COUNTS];

#if IS_ENABLED(CONFIG_BINFMT_MISC)
        struct binfmt_misc *binfmt_misc;
#endif
} __randomize_layout;

/*
 * Counter set identified by a (namespace, uid) pair.
 * NOTE(review): presumably looked up via @node in a hash table -- the
 * table itself is not visible in this header, confirm.
 */
struct ucounts {
        struct hlist_nulls_node node;
        struct user_namespace *ns;
        kuid_t uid;
        struct rcu_head rcu;
        /* Reference count; get_ucounts() takes a reference with rcuref_get() */
        rcuref_t count;
        /* Indexed by enum ucount_type */
        atomic_long_t ucount[UCOUNT_COUNTS];
        /* Indexed by enum rlimit_type; read by get_rlimit_value() */
        atomic_long_t rlimit[UCOUNT_RLIMIT_COUNTS];
};

/* Objects and functions defined outside this header */
extern struct user_namespace init_user_ns;
extern struct ucounts init_ucounts;

/* sysctl setup/teardown for a namespace */
bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns);
/* Per-type counters on a (namespace, uid) pair */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
/* Allocation / release of struct ucounts */
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
void put_ucounts(struct ucounts *ucounts);

/*
 * Try to take an additional reference on @ucounts.
 *
 * Returns @ucounts when rcuref_get() on its count succeeds and NULL
 * otherwise; the result must be checked (__must_check).
 */
static inline struct ucounts * __must_check get_ucounts(struct ucounts *ucounts)
{
        return rcuref_get(&ucounts->count) ? ucounts : NULL;
}

/* Atomically read the current value of the @type rlimit counter of @ucounts. */
static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type type)
{
        long value = atomic_long_read(&ucounts->rlimit[type]);

        return value;
}

long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v);
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
                            bool override_rlimit);
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type);
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max);

/* Read the @type entry of @ns->rlimit_max[] with READ_ONCE(). */
static inline long get_userns_rlimit_max(struct user_namespace *ns, enum rlimit_type type)
{
        long limit = READ_ONCE(ns->rlimit_max[type]);

        return limit;
}

/*
 * Store @max as the @type entry of @ns->rlimit_max[].
 *
 * The array holds signed longs, so values above LONG_MAX are clamped
 * to LONG_MAX.
 */
static inline void set_userns_rlimit_max(struct user_namespace *ns,
                enum rlimit_type type, unsigned long max)
{
        long clamped = LONG_MAX;

        if (max <= LONG_MAX)
                clamped = max;

        ns->rlimit_max[type] = clamped;
}

#ifdef CONFIG_USER_NS

/*
 * Take a reference on @ns (refcount_inc() on ns->ns.count) and return it.
 * A NULL @ns is passed through unchanged.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return ns;

        refcount_inc(&ns->ns.count);
        return ns;
}

extern int create_user_ns(struct cred *new);
extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred);
extern void __put_user_ns(struct user_namespace *ns);

/*
 * Drop a reference on @ns; when the count reaches zero hand the
 * namespace to __put_user_ns(). A NULL @ns is ignored.
 */
static inline void put_user_ns(struct user_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->ns.count))
                __put_user_ns(ns);
}

struct seq_operations;
extern const struct seq_operations proc_uid_seq_operations;
extern const struct seq_operations proc_gid_seq_operations;
extern const struct seq_operations proc_projid_seq_operations;
extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *);
extern int proc_setgroups_show(struct seq_file *m, void *v);
extern bool userns_may_setgroups(const struct user_namespace *ns);
extern bool in_userns(const struct user_namespace *ancestor,
                       const struct user_namespace *child);
extern bool current_in_userns(const struct user_namespace *target_ns);
struct ns_common *ns_get_owner(struct ns_common *ns);
#else

/*
 * !CONFIG_USER_NS stub: no reference is taken and &init_user_ns is
 * returned regardless of @ns.
 */
static inline struct user_namespace *get_user_ns(struct user_namespace *ns)
{
        return &init_user_ns;
}

/* !CONFIG_USER_NS stub: always fails with -EINVAL. */
static inline int create_user_ns(struct cred *new)
{
        return -EINVAL;
}

/*
 * !CONFIG_USER_NS stub: a request containing CLONE_NEWUSER is rejected
 * with -EINVAL; anything else succeeds with 0. @new_cred is not touched.
 */
static inline int unshare_userns(unsigned long unshare_flags,
                                 struct cred **new_cred)
{
        return (unshare_flags & CLONE_NEWUSER) ? -EINVAL : 0;
}

/* !CONFIG_USER_NS stub: nothing to release, so this is a no-op. */
static inline void put_user_ns(struct user_namespace *ns)
{
}

/* !CONFIG_USER_NS stub: always returns true. */
static inline bool userns_may_setgroups(const struct user_namespace *ns)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns true, whatever @ancestor and @child are. */
static inline bool in_userns(const struct user_namespace *ancestor,
                             const struct user_namespace *child)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns true. */
static inline bool current_in_userns(const struct user_namespace *target_ns)
{
        return true;
}

/* !CONFIG_USER_NS stub: always returns ERR_PTR(-EPERM). */
static inline struct ns_common *ns_get_owner(struct ns_common *ns)
{
        return ERR_PTR(-EPERM);
}
#endif

#endif /* _LINUX_USER_H */




















































































































































































































































































































































    3 
    3 

    3 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/netdevice.h>
#include <net/netdev_lock.h>

#include "dev.h"

/**
 * dev_change_name() - change name of a device
 * @dev: device
 * @newname: name (or format string), must be at least IFNAMSIZ
 *
 * Change the name of a device. A format string such as "eth%d" can be
 * passed for wildcarding.
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
        int err;

        /* netif_change_name() is called between netdev_lock_ops()/unlock. */
        netdev_lock_ops(dev);
        err = netif_change_name(dev, newname);
        netdev_unlock_ops(dev);

        return err;
}

/**
 * dev_set_alias() - change ifalias of a device
 * @dev: device
 * @alias: name up to IFALIASZ
 * @len: limit of bytes to copy
 *
 * Set the ifalias of a device by calling netif_set_alias() between
 * netdev_lock_ops() and netdev_unlock_ops().
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_set_alias(dev, alias, len);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_set_alias);

/**
 * dev_change_flags() - change device settings
 * @dev: device
 * @flags: device state flags
 * @extack: netlink extended ack
 *
 * Change settings on the device based on state flags. The flags are
 * in the userspace exported format.
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags,
                     struct netlink_ext_ack *extack)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_change_flags(dev, flags, extack);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_change_flags);

/**
 * dev_set_group() - change group this device belongs to
 * @dev: device
 * @new_group: group this device should belong to
 *
 * Calls netif_set_group() between netdev_lock_ops() and
 * netdev_unlock_ops(). Nothing is returned to the caller.
 */
void dev_set_group(struct net_device *dev, int new_group)
{
        netdev_lock_ops(dev);
        netif_set_group(dev, new_group);
        netdev_unlock_ops(dev);
}

/**
 * dev_set_mac_address_user() - change MAC address with dev_addr_sem held
 * @dev: device
 * @sa: new address
 * @extack: netlink extended ack
 *
 * Same call into netif_set_mac_address() as dev_set_mac_address(), but
 * additionally holds dev_addr_sem for writing around the whole operation.
 * dev_addr_sem is taken before netdev_lock_ops() and released after
 * netdev_unlock_ops().
 *
 * Return: the result of netif_set_mac_address().
 */
int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
                             struct netlink_ext_ack *extack)
{
        int err;

        down_write(&dev_addr_sem);
        netdev_lock_ops(dev);

        err = netif_set_mac_address(dev, sa, extack);

        netdev_unlock_ops(dev);
        up_write(&dev_addr_sem);

        return err;
}
EXPORT_SYMBOL(dev_set_mac_address_user);

/**
 * dev_change_net_namespace() - move device to different network namespace
 * @dev: device
 * @net: network namespace
 * @pat: If not NULL name pattern to try if the current device name
 *       is already taken in the destination network namespace.
 *
 * This function shuts down a device interface and moves it
 * to a new network namespace. On success 0 is returned, on
 * a failure a negative errno code is returned.
 *
 * Callers must hold the rtnl semaphore.
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net,
                             const char *pat)
{
        int err;

        /* The helper's two trailing arguments are passed as 0 and NULL. */
        err = __dev_change_net_namespace(dev, net, pat, 0, NULL);

        return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

/**
 * dev_change_carrier() - change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change the device carrier by calling netif_change_carrier() between
 * netdev_lock_ops() and netdev_unlock_ops().
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_change_carrier(dev, new_carrier);
        netdev_unlock_ops(dev);

        return err;
}

/**
 * dev_change_tx_queue_len() - change TX queue length of a netdevice
 * @dev: device
 * @new_len: new tx queue length
 *
 * Calls netif_change_tx_queue_len() between netdev_lock_ops() and
 * netdev_unlock_ops().
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_change_tx_queue_len(dev, new_len);
        netdev_unlock_ops(dev);

        return err;
}

/**
 * dev_change_proto_down() - set carrier according to proto_down
 * @dev: device
 * @proto_down: new value
 *
 * Calls netif_change_proto_down() between netdev_lock_ops() and
 * netdev_unlock_ops().
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_change_proto_down(dev, proto_down);
        netdev_unlock_ops(dev);

        return err;
}

/**
 * dev_open() - prepare an interface for use
 * @dev: device to open
 * @extack: netlink extended ack
 *
 * Takes a device from down to up state. The device's private open
 * function is invoked and then the multicast lists are loaded. Finally
 * the device is moved into the up state and a %NETDEV_UP message is
 * sent to the netdev notifier chain.
 *
 * Calling this function on an active interface is a nop. On a failure
 * a negative errno code is returned.
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
        int err;

        /* The actual work is done by netif_open() under netdev_lock_ops(). */
        netdev_lock_ops(dev);
        err = netif_open(dev, extack);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_open);

/**
 * dev_close() - shutdown an interface
 * @dev: device to shutdown
 *
 * This function moves an active device into down state. A
 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 * chain.
 *
 * The work is delegated to netif_close(), called between
 * netdev_lock_ops() and netdev_unlock_ops().
 */
void dev_close(struct net_device *dev)
{
        netdev_lock_ops(dev);
        netif_close(dev);
        netdev_unlock_ops(dev);
}
EXPORT_SYMBOL(dev_close);

/**
 * dev_eth_ioctl() - forward an ioctl to the driver's ndo_eth_ioctl
 * @dev: device
 * @ifr: ioctl request, handed to the driver unchanged
 * @cmd: ioctl command, handed to the driver unchanged
 *
 * Return: -EOPNOTSUPP if the driver provides no ndo_eth_ioctl callback,
 * -ENODEV if netif_device_present() reports the device as absent,
 * otherwise the value returned by the driver callback.
 */
int dev_eth_ioctl(struct net_device *dev,
                  struct ifreq *ifr, unsigned int cmd)
{
        const struct net_device_ops *ops = dev->netdev_ops;
        int err;

        /* Checked before taking the lock: nothing to call, nothing to lock. */
        if (!ops->ndo_eth_ioctl)
                return -EOPNOTSUPP;

        netdev_lock_ops(dev);
        if (netif_device_present(dev))
                err = ops->ndo_eth_ioctl(dev, ifr, cmd);
        else
                err = -ENODEV;
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_eth_ioctl);

/**
 * dev_set_mtu() - change the MTU of a device
 * @dev: device
 * @new_mtu: new MTU value, passed on to netif_set_mtu()
 *
 * Calls netif_set_mtu() between netdev_lock_ops() and netdev_unlock_ops().
 *
 * Return: the result of netif_set_mtu().
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_set_mtu(dev, new_mtu);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_set_mtu);

/**
 * dev_disable_lro() - disable Large Receive Offload on a device
 * @dev: device
 *
 * Disable Large Receive Offload (LRO) on a net device.  Must be
 * called under RTNL.  This is needed if received packets may be
 * forwarded to another interface.
 *
 * The work is delegated to netif_disable_lro(), called between
 * netdev_lock_ops() and netdev_unlock_ops().
 */
void dev_disable_lro(struct net_device *dev)
{
        netdev_lock_ops(dev);
        netif_disable_lro(dev);
        netdev_unlock_ops(dev);
}
EXPORT_SYMBOL(dev_disable_lro);

/**
 * dev_set_allmulti() - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface keeps receiving
 * all multicast frames. Once it hits zero the device reverts back to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_set_allmulti(struct net_device *dev, int inc)
{
        int err;

        netdev_lock_ops(dev);
        /* The third argument of netif_set_allmulti() is always true here. */
        err = netif_set_allmulti(dev, inc, true);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_set_allmulti);

/**
 * dev_set_mac_address() - change Media Access Control Address
 * @dev: device
 * @sa: new address
 * @extack: netlink extended ack
 *
 * Change the hardware (MAC) address of the device by calling
 * netif_set_mac_address() between netdev_lock_ops() and
 * netdev_unlock_ops().
 *
 * Return: 0 on success, -errno on failure.
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
                        struct netlink_ext_ack *extack)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_set_mac_address(dev, sa, extack);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL(dev_set_mac_address);

/**
 * dev_xdp_propagate() - locked wrapper around netif_xdp_propagate()
 * @dev: device
 * @bpf: request passed through to netif_xdp_propagate()
 *
 * Calls netif_xdp_propagate() between netdev_lock_ops() and
 * netdev_unlock_ops().
 *
 * Return: the result of netif_xdp_propagate().
 */
int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
{
        int err;

        netdev_lock_ops(dev);
        err = netif_xdp_propagate(dev, bpf);
        netdev_unlock_ops(dev);

        return err;
}
EXPORT_SYMBOL_GPL(dev_xdp_propagate);

/**
 * netdev_state_change() - device changes state
 * @dev: device to cause notification
 *
 * Called to indicate a device has changed state. This function calls
 * the notifier chains for netdev_chain and sends a NEWLINK message
 * to the routing socket.
 *
 * The work is delegated to netif_state_change(), called between
 * netdev_lock_ops() and netdev_unlock_ops().
 */
void netdev_state_change(struct net_device *dev)
{
        netdev_lock_ops(dev);
        netif_state_change(dev);
        netdev_unlock_ops(dev);
}
EXPORT_SYMBOL(netdev_state_change);





























  248 
  247 


  248 






























   25 








































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/cpumask.h>
#include <linux/export.h>
#include <linux/memblock.h>
#include <linux/numa.h>

/* These are not inline because of header tangles. */
#ifdef CONFIG_CPUMASK_OFFSTACK
/**
 * alloc_cpumask_var_node - allocate a struct cpumask on a given node
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 * @flags: GFP_ flags
 * @node: memory node from which to allocate or %NUMA_NO_NODE
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop returning a constant 1 (in <linux/cpumask.h>).
 *
 * The allocation is cpumask_size() bytes from kmalloc_node(). With
 * CONFIG_DEBUG_PER_CPU_MAPS a failure is also logged together with a
 * stack dump.
 *
 * Return: TRUE if memory allocation succeeded, FALSE otherwise.
 *
 * In addition, mask will be NULL if this fails.  Note that gcc is
 * usually smart enough to know that mask can never be NULL if
 * CONFIG_CPUMASK_OFFSTACK=n, so does code elimination in that case
 * too.
 */
bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
{
        cpumask_var_t allocated = kmalloc_node(cpumask_size(), flags, node);

        /* Store the result unconditionally so *mask is NULL on failure. */
        *mask = allocated;

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
        if (allocated == NULL) {
                printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
                dump_stack();
        }
#endif

        return allocated != NULL;
}
EXPORT_SYMBOL(alloc_cpumask_var_node);

/**
 * alloc_bootmem_cpumask_var - allocate a struct cpumask from the bootmem arena.
 * @mask: pointer to cpumask_var_t where the cpumask is returned
 *
 * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is
 * a nop (in <linux/cpumask.h>).
 * Either returns an allocated (zero-filled) cpumask, or causes the
 * system to panic.
 *
 * The allocation is cpumask_size() bytes, aligned to SMP_CACHE_BYTES.
 */
void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask)
{
        cpumask_var_t boot_mask;

        boot_mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES);
        *mask = boot_mask;
}

/**
 * free_cpumask_var - frees memory allocated for a struct cpumask.
 * @mask: cpumask to free
 *
 * This is safe on a NULL mask. The mask is released with kfree(), so it
 * pairs with alloc_cpumask_var_node(), which allocates with kmalloc_node().
 */
void free_cpumask_var(cpumask_var_t mask)
{
        kfree(mask);
}
EXPORT_SYMBOL(free_cpumask_var);

/**
 * free_bootmem_cpumask_var - frees result of alloc_bootmem_cpumask_var
 * @mask: cpumask to free
 *
 * Returns cpumask_size() bytes at @mask via memblock_free(), matching the
 * size used by alloc_bootmem_cpumask_var().
 */
void __init free_bootmem_cpumask_var(cpumask_var_t mask)
{
        memblock_free(mask, cpumask_size());
}
#endif

/**
 * cpumask_local_spread - select the i'th cpu based on NUMA distances
 * @i: index number
 * @node: local numa_node
 *
 * Return: online CPU according to a numa aware policy; local cpus are returned
 * first, followed by non-local ones, then it wraps around.
 *
 * For those who want to enumerate all CPUs based on their NUMA distances,
 * i.e. call this function in a loop, like:
 *
 * for (i = 0; i < num_online_cpus(); i++) {
 *        cpu = cpumask_local_spread(i, node);
 *        do_something(cpu);
 * }
 *
 * There's a better alternative based on for_each()-like iterators:
 *
 *        for_each_numa_hop_mask(mask, node) {
 *                for_each_cpu_andnot(cpu, mask, prev)
 *                        do_something(cpu);
 *                prev = mask;
 *        }
 *
 * It's simpler and more verbose than above. Complexity of iterator-based
 * enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while
 * cpumask_local_spread() when called for each cpu is
 * O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)).
 */
unsigned int cpumask_local_spread(unsigned int i, int node)
{
        /* Wrap the index: we always want a cpu. */
        unsigned int nth = i % num_online_cpus();
        unsigned int cpu = sched_numa_find_nth_cpu(cpu_online_mask, nth, node);

        WARN_ON(cpu >= nr_cpu_ids);
        return cpu;
}
EXPORT_SYMBOL(cpumask_local_spread);

/*
 * Per-CPU record of the CPU most recently returned by
 * cpumask_any_and_distribute() / cpumask_any_distribute(); both functions
 * share this one variable.
 */
static DEFINE_PER_CPU(int, distribute_cpu_mask_prev);

/**
 * cpumask_any_and_distribute - Return an arbitrary cpu within src1p & src2p.
 * @src1p: first &cpumask for intersection
 * @src2p: second &cpumask for intersection
 *
 * Iterated calls using the same @src1p and @src2p will be distributed within
 * their intersection.
 *
 * Return: >= nr_cpu_ids if the intersection is empty.
 */
unsigned int cpumask_any_and_distribute(const struct cpumask *src1p,
                               const struct cpumask *src2p)
{
        /* NOTE: our first selection will skip 0. */
        unsigned int last = __this_cpu_read(distribute_cpu_mask_prev);
        unsigned int pick = cpumask_next_and_wrap(last, src1p, src2p);

        /* Remember the pick only when a valid CPU was found. */
        if (pick < nr_cpu_ids)
                __this_cpu_write(distribute_cpu_mask_prev, pick);

        return pick;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);

/**
 * cpumask_any_distribute - Return an arbitrary cpu from srcp
 * @srcp: &cpumask for selection
 *
 * Iterated calls continue from the CPU remembered in the per-CPU
 * distribute_cpu_mask_prev variable.
 *
 * Return: >= nr_cpu_ids if no cpu could be selected from @srcp.
 */
unsigned int cpumask_any_distribute(const struct cpumask *srcp)
{
        /* NOTE: our first selection will skip 0. */
        unsigned int last = __this_cpu_read(distribute_cpu_mask_prev);
        unsigned int pick = cpumask_next_wrap(last, srcp);

        /* Remember the pick only when a valid CPU was found. */
        if (pick < nr_cpu_ids)
                __this_cpu_write(distribute_cpu_mask_prev, pick);

        return pick;
}
EXPORT_SYMBOL(cpumask_any_distribute);





































   44 



















































































































  789 















  696 













  269 


































  592 










  319 

























































  252 










   68 
   66 

























































































  236 





























































   13 

















































































































































































































































































































































































































































































































































  171 
  288 












   14 












  181 
















  570 
  397 



































































    1 















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

#include <linux/container_of.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/poison.h>
#include <linux/const.h>

#include <asm/barrier.h>

/*
 * Circular doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

/*
 * Static initializer for a list_head called @name: both members are set
 * to the address of @name itself.
 */
#define LIST_HEAD_INIT(name) { &(name), &(name) }

/* Define a struct list_head variable @name initialized with LIST_HEAD_INIT(). */
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)

/**
 * INIT_LIST_HEAD - Initialize a list_head structure
 * @list: list_head structure to be initialized.
 *
 * Initializes the list_head to point to itself.  If it is a list header,
 * the result is an empty list.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        /*
         * Both stores use WRITE_ONCE() - presumably for lockless readers
         * of these pointers; TODO confirm against list_empty() users.
         */
        WRITE_ONCE(list->next, list);
        WRITE_ONCE(list->prev, list);
}

#ifdef CONFIG_LIST_HARDENED

/*
 * Attributes applied to the out-of-line *_valid_or_report() functions:
 * none with CONFIG_DEBUG_LIST, __cold __preserve_most otherwise.
 */
#ifdef CONFIG_DEBUG_LIST
# define __list_valid_slowpath
#else
# define __list_valid_slowpath __cold __preserve_most
#endif

/*
 * Performs the full set of list corruption checks before __list_add().
 * On list corruption reports a warning, and returns false.
 */
extern bool __list_valid_slowpath __list_add_valid_or_report(struct list_head *new,
                                                             struct list_head *prev,
                                                             struct list_head *next);

/*
 * Performs list corruption checks before __list_add(). Returns false if a
 * corruption is detected, true otherwise.
 *
 * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
 * inline to catch non-faulting corruptions, and only if a corruption is
 * detected calls the reporting function __list_add_valid_or_report().
 *
 * With CONFIG_DEBUG_LIST the inline checks are skipped and the reporting
 * function is always called.
 */
static __always_inline bool __list_add_valid(struct list_head *new,
                                             struct list_head *prev,
                                             struct list_head *next)
{
        bool ret = true;

        if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
                /*
                 * With the hardening version, elide checking if next and prev
                 * are NULL, since the immediate dereference of them below would
                 * result in a fault if NULL.
                 *
                 * With the reduced set of checks, we can afford to inline the
                 * checks, which also gives the compiler a chance to elide some
                 * of them completely if they can be proven at compile-time. If
                 * one of the pre-conditions does not hold, the slow-path will
                 * show a report which pre-condition failed.
                 */
                if (likely(next->prev == prev && prev->next == next && new != prev && new != next))
                        return true;
                ret = false;
        }

        /*
         * The slow path is always called from here; AND-ing keeps the
         * result false when the inline checks above already failed.
         */
        ret &= __list_add_valid_or_report(new, prev, next);
        return ret;
}

/*
 * Performs the full set of list corruption checks before __list_del_entry().
 * On list corruption reports a warning, and returns false.
 */
extern bool __list_valid_slowpath __list_del_entry_valid_or_report(struct list_head *entry);

/*
 * Performs list corruption checks before __list_del_entry(). Returns false if a
 * corruption is detected, true otherwise.
 *
 * With CONFIG_LIST_HARDENED only, performs minimal list integrity checking
 * inline to catch non-faulting corruptions, and only if a corruption is
 * detected calls the reporting function __list_del_entry_valid_or_report().
 *
 * With CONFIG_DEBUG_LIST the inline checks are skipped and the reporting
 * function is always called.
 */
static __always_inline bool __list_del_entry_valid(struct list_head *entry)
{
        bool ret = true;

        if (!IS_ENABLED(CONFIG_DEBUG_LIST)) {
                struct list_head *prev = entry->prev;
                struct list_head *next = entry->next;

                /*
                 * With the hardening version, elide checking if next and prev
                 * are NULL, LIST_POISON1 or LIST_POISON2, since the immediate
                 * dereference of them below would result in a fault.
                 */
                if (likely(prev->next == entry && next->prev == entry))
                        return true;
                ret = false;
        }

        /*
         * The slow path is always called from here; AND-ing keeps the
         * result false when the inline checks above already failed.
         */
        ret &= __list_del_entry_valid_or_report(entry);
        return ret;
}
#else
/* !CONFIG_LIST_HARDENED stub: no checks are performed, always returns true. */
static inline bool __list_add_valid(struct list_head *new,
                                struct list_head *prev,
                                struct list_head *next)
{
        return true;
}
/* !CONFIG_LIST_HARDENED stub: no checks are performed, always returns true. */
static inline bool __list_del_entry_valid(struct list_head *entry)
{
        return true;
}
#endif

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * If __list_add_valid() rejects the arguments, nothing is modified.
 */
static inline void __list_add(struct list_head *new,
                              struct list_head *prev,
                              struct list_head *next)
{
        if (!__list_add_valid(new, prev, next))
                return;

        next->prev = new;
        new->next = next;
        new->prev = prev;
        /* prev->next is the last store, and the only one using WRITE_ONCE(). */
        WRITE_ONCE(prev->next, new);
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head, i.e. between @head and
 * the entry that currently follows it.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
        struct list_head *first = head->next;

        __list_add(new, head, first);
}


/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head, i.e. between the entry
 * that currently precedes @head and @head itself.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
        struct list_head *last = head->prev;

        __list_add(new, last, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 *
 * The removed entry itself is not passed in and is not modified here.
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
        /* Backward link is a plain store; the forward link uses WRITE_ONCE(). */
        next->prev = prev;
        WRITE_ONCE(prev->next, next);
}

/*
 * Delete a list entry and clear the 'prev' pointer.
 *
 * This is a special-purpose list clearing method used in the networking code
 * for lists allocated as per-cpu, where we don't want to incur the extra
 * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this
 * needs to check the node 'prev' pointer instead of calling list_empty().
 *
 * Unlike __list_del_entry(), this calls __list_del() directly and does not
 * go through __list_del_entry_valid().  entry->next is left pointing at the
 * old successor.
 */
static inline void __list_del_clearprev(struct list_head *entry)
{
        __list_del(entry->prev, entry->next);
        entry->prev = NULL;
}

/*
 * Unlink @entry from its neighbours, provided __list_del_entry_valid()
 * accepts it.  When validation fails nothing is modified.  @entry's own
 * next/prev pointers are left as they were in either case.
 */
static inline void __list_del_entry(struct list_head *entry)
{
        if (__list_del_entry_valid(entry))
                __list_del(entry->prev, entry->next);
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 *
 * After unlinking, @entry's next and prev are overwritten with LIST_POISON1
 * and LIST_POISON2 respectively.  The poisoning happens even when
 * __list_del_entry() declined to unlink the entry.
 */
static inline void list_del(struct list_head *entry)
{
        __list_del_entry(entry);
        entry->next = LIST_POISON1;
        entry->prev = LIST_POISON2;
}

/**
 * list_replace - replace old entry by new one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * @old's own next/prev pointers are not reset by this function; use
 * list_replace_init() if @old must be left as an empty list.
 */
static inline void list_replace(struct list_head *old,
                                struct list_head *new)
{
        new->next = old->next;
        new->next->prev = new;
        /*
         * old->prev is read only now, after the store above.  When @old is
         * an empty (self-pointing) head that store hit old->prev, so @new
         * ends up pointing at itself.  Do not hoist this load.
         */
        new->prev = old->prev;
        new->prev->next = new;
}

/**
 * list_replace_init - replace old entry by new one and initialize the old one
 * @old : the element to be replaced
 * @new : the new element to insert
 *
 * If @old was empty, it will be overwritten.
 *
 * Same as list_replace(), followed by INIT_LIST_HEAD(@old) so that @old no
 * longer references its former neighbours.
 */
static inline void list_replace_init(struct list_head *old,
                                     struct list_head *new)
{
        list_replace(old, new);
        INIT_LIST_HEAD(old);
}

/**
 * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position
 * @entry1: the location to place entry2
 * @entry2: the location to place entry1
 */
static inline void list_swap(struct list_head *entry1,
                             struct list_head *entry2)
{
        /* Remember where entry2 used to sit: right after this node. */
        struct list_head *pos = entry2->prev;

        list_del(entry2);
        list_replace(entry1, entry2);
        /*
         * If entry2 directly followed entry1, the remembered predecessor is
         * entry1 itself, which has just been taken out; anchor on entry2,
         * which now occupies entry1's old slot.
         */
        if (pos == entry1)
                pos = entry2;
        list_add(entry1, pos);
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * Unlinks @entry via __list_del_entry() and then runs INIT_LIST_HEAD() on
 * it, instead of poisoning its pointers as list_del() does.
 */
static inline void list_del_init(struct list_head *entry)
{
        __list_del_entry(entry);
        INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 *
 * Unlinks @list with __list_del_entry() and re-inserts it right after @head
 * with list_add().
 */
static inline void list_move(struct list_head *list, struct list_head *head)
{
        __list_del_entry(list);
        list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 *
 * Unlinks @list with __list_del_entry() and re-inserts it right before
 * @head with list_add_tail().
 */
static inline void list_move_tail(struct list_head *list,
                                  struct list_head *head)
{
        __list_del_entry(list);
        list_add_tail(list, head);
}

/**
 * list_bulk_move_tail - move a subsection of a list to its tail
 * @head: the head that will follow our entry
 * @first: first entry to move
 * @last: last entry to move, can be the same as first
 *
 * Move all entries between @first and including @last before @head.
 * All three entries must belong to the same linked list.
 *
 * No validity checks are performed on any of the three entries.
 */
static inline void list_bulk_move_tail(struct list_head *head,
                                       struct list_head *first,
                                       struct list_head *last)
{
        /* Close the gap left behind by the [first, last] run. */
        first->prev->next = last->next;
        last->next->prev = first->prev;

        /* Hook the start of the run behind the current tail. */
        head->prev->next = first;
        first->prev = head->prev;

        /* The end of the run becomes the new tail. */
        last->next = head;
        head->prev = last;
}

/**
 * list_is_first - tests whether @list is the first entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero when @head is the immediate predecessor of @list.
 */
static inline int list_is_first(const struct list_head *list, const struct list_head *head)
{
        const struct list_head *before = list->prev;

        return before == head;
}

/**
 * list_is_last - tests whether @list is the last entry in list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero when @head is the immediate successor of @list.
 */
static inline int list_is_last(const struct list_head *list, const struct list_head *head)
{
        const struct list_head *after = list->next;

        return after == head;
}

/**
 * list_is_head - tests whether @list is the list @head
 * @list: the entry to test
 * @head: the head of the list
 *
 * Return: non-zero when both pointers refer to the same node.  Neither
 * pointer is dereferenced.
 */
static inline int list_is_head(const struct list_head *list, const struct list_head *head)
{
        return head == list;
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 *
 * Return: non-zero when @head's next pointer (read with READ_ONCE()) points
 * back at @head.
 */
static inline int list_empty(const struct list_head *head)
{
        const struct list_head *first = READ_ONCE(head->next);

        return first == head;
}

/**
 * list_del_init_careful - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 *
 * This is the same as list_del_init(), except designed to be used
 * together with list_empty_careful() in a way to guarantee ordering
 * of other memory operations.
 *
 * Any memory operations done before a list_del_init_careful() are
 * guaranteed to be visible after a list_empty_careful() test.
 */
static inline void list_del_init_careful(struct list_head *entry)
{
        __list_del_entry(entry);
        /*
         * prev is reset first; next is written last with release semantics,
         * matching the acquire load of ->next in list_empty_careful().
         */
        WRITE_ONCE(entry->prev, entry);
        smp_store_release(&entry->next, entry);
}

/**
 * list_empty_careful - tests whether a list is empty and not being modified
 * @head: the list to test
 *
 * Description:
 * tests whether a list is empty _and_ checks that no other CPU might be
 * in the process of modifying either member (next or prev)
 *
 * NOTE: using list_empty_careful() without synchronization
 * can only be safe if the only activity that can happen
 * to the list entry is list_del_init(). Eg. it cannot be used
 * if another CPU could re-list_add() it.
 *
 * Return: non-zero only when both head->next and head->prev point at @head.
 */
static inline int list_empty_careful(const struct list_head *head)
{
        /* Acquire load of ->next; see list_del_init_careful() for the release. */
        struct list_head *next = smp_load_acquire(&head->next);
        return list_is_head(next, head) && (next == READ_ONCE(head->prev));
}

/**
 * list_rotate_left - rotate the list to the left
 * @head: the head of the list
 *
 * Moves the current first entry to the tail.  An empty list is left
 * untouched.
 */
static inline void list_rotate_left(struct list_head *head)
{
        if (list_empty(head))
                return;

        list_move_tail(head->next, head);
}

/**
 * list_rotate_to_front() - Rotate list to specific item.
 * @list: The desired new front of the list.
 * @head: The head of the list.
 *
 * Rotates list so that @list becomes the new front of the list.
 *
 * Note the argument order of the list_move_tail() call below: it is @head
 * that gets moved, not @list.
 */
static inline void list_rotate_to_front(struct list_head *list,
                                        struct list_head *head)
{
        /*
         * Deletes the list head from the list denoted by @head and
         * places it as the tail of @list, this effectively rotates the
         * list so that @list is at the front.
         */
        list_move_tail(head, list);
}

/**
 * list_is_singular - tests whether a list has just one entry.
 * @head: the list to test.
 *
 * Return: non-zero when the list is not empty and its first and last
 * entries are the same node.
 */
static inline int list_is_singular(const struct list_head *head)
{
        if (list_empty(head))
                return 0;

        return head->next == head->prev;
}

/*
 * Worker for list_cut_position(): moves the run from head->next up to and
 * including @entry onto @list, and makes @entry's old successor the new
 * first element of @head.  Performs no checks of its own; whatever @list
 * previously pointed to is overwritten.
 */
static inline void __list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        /* Saved before entry->next is redirected to @list below. */
        struct list_head *new_first = entry->next;
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry;
        entry->next = list;
        head->next = new_first;
        new_first->prev = head;
}

/**
 * list_cut_position - cut a list into two
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *        and if so we won't cut the list
 *
 * Moves the leading part of @head, from its first element up to and
 * including @entry, onto @list.  @entry should be an element known to be
 * on @head.  @list should be empty, or a list whose current contents may
 * be lost.
 *
 * Nothing happens when @head is empty, or when @head holds a single element
 * and @entry is neither that element nor @head.  When @entry is @head, no
 * entries are moved and @list is simply initialised as empty.
 */
static inline void list_cut_position(struct list_head *list,
                struct list_head *head, struct list_head *entry)
{
        if (list_empty(head))
                return;

        /* A one-element list can only be cut at that element (or at head). */
        if (list_is_singular(head) && !list_is_head(entry, head) &&
            (head->next != entry))
                return;

        if (!list_is_head(entry, head)) {
                __list_cut_position(list, head, entry);
                return;
        }

        INIT_LIST_HEAD(list);
}

/**
 * list_cut_before - cut a list into two, before given entry
 * @list: a new list to add all removed entries
 * @head: a list with entries
 * @entry: an entry within head, could be the head itself
 *
 * This helper moves the initial part of @head, up to but
 * excluding @entry, from @head to @list.  You should pass
 * in @entry an element you know is on @head.  @list should
 * be an empty list or a list you do not care about losing
 * its data.
 * If @entry == @head, all entries on @head are moved to
 * @list.
 */
static inline void list_cut_before(struct list_head *list,
                                   struct list_head *head,
                                   struct list_head *entry)
{
        /* Nothing precedes @entry (this also covers an empty @head with
         * @entry == @head): hand back an empty @list. */
        if (head->next == entry) {
                INIT_LIST_HEAD(list);
                return;
        }
        /* @list takes over head->next .. entry->prev. */
        list->next = head->next;
        list->next->prev = list;
        list->prev = entry->prev;
        list->prev->next = list;
        /* @entry becomes the first element of @head. */
        head->next = entry;
        entry->prev = head;
}

/*
 * Link all entries of @list between @prev and @next.
 *
 * @list itself is not written to, so afterwards it still points at entries
 * that now belong to the other list.  No emptiness check is done here; the
 * list_splice*() wrappers test list_empty(@list) before calling.
 */
static inline void __list_splice(const struct list_head *list,
                                 struct list_head *prev,
                                 struct list_head *next)
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;

        first->prev = prev;
        prev->next = first;

        last->next = next;
        next->prev = last;
}

/**
 * list_splice - join two lists, this is designed for stacks
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head.  An empty @list is
 * a no-op.  @list itself is not reinitialised; see list_splice_init().
 */
static inline void list_splice(const struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
}

/**
 * list_splice_tail - join two lists, each list being a queue
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right before @head.  An empty @list is
 * a no-op.  @list itself is not reinitialised; see list_splice_tail_init().
 */
static inline void list_splice_tail(struct list_head *list,
                                struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The entries of @list are inserted right after @head and @list is then
 * reinitialised as an empty list.  An empty @list is left untouched.
 */
static inline void list_splice_init(struct list_head *list,
                                    struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head, head->next);
        INIT_LIST_HEAD(list);
}

/**
 * list_splice_tail_init - join two lists and reinitialise the emptied list
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * Each of the lists is a queue.
 * The entries of @list are inserted right before @head and @list is then
 * reinitialised as an empty list.  An empty @list is left untouched.
 */
static inline void list_splice_tail_init(struct list_head *list,
                                         struct list_head *head)
{
        if (list_empty(list))
                return;

        __list_splice(list, head->prev, head);
        INIT_LIST_HEAD(list);
}

/**
 * list_entry - get the struct for this entry
 * @ptr:        the &struct list_head pointer.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Thin alias for container_of(); all three arguments are passed through
 * unchanged.
 */
#define list_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * list_first_entry - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * No emptiness check is made: on an empty list (ptr)->next is the head
 * itself, and the result is list_entry() applied to the head.
 */
#define list_first_entry(ptr, type, member) \
        list_entry((ptr)->next, type, member)

/**
 * list_last_entry - get the last element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note, that list is expected to be not empty.
 * No emptiness check is made: on an empty list (ptr)->prev is the head
 * itself, and the result is list_entry() applied to the head.
 */
#define list_last_entry(ptr, type, member) \
        list_entry((ptr)->prev, type, member)

/**
 * list_first_entry_or_null - get the first element from a list
 * @ptr:        the list head to take the element from.
 * @type:        the type of the struct this is embedded in.
 * @member:        the name of the list_head within the struct.
 *
 * Note that if the list is empty, it returns NULL.
 *
 * @ptr is evaluated exactly once, and head->next is read exactly once with
 * READ_ONCE(); the emptiness test and the returned entry use that same
 * snapshot.
 */
#define list_first_entry_or_null(ptr, type, member) ({ \
        struct list_head *head__ = (ptr); \
        struct list_head *pos__ = READ_ONCE(head__->next); \
        pos__ != head__ ? list_entry(pos__, type, member) : NULL; \
})

/**
 * list_next_entry - get the next element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * No check against the list head is made: if @pos is the last element, the
 * result is list_entry() applied to the head.
 */
#define list_next_entry(pos, member) \
        list_entry((pos)->member.next, typeof(*(pos)), member)

/**
 * list_next_entry_circular - get the next element in list
 * @pos:        the type * to cursor.
 * @head:        the list head to take the element from.
 * @member:        the name of the list_head within the struct.
 *
 * Wraparound if pos is the last element (return the first element).
 * Note, that list is expected to be not empty.
 *
 * @pos and @head each appear more than once in the expansion, so avoid
 * arguments with side effects.
 */
#define list_next_entry_circular(pos, head, member) \
        (list_is_last(&(pos)->member, head) ? \
        list_first_entry(head, typeof(*(pos)), member) : list_next_entry(pos, member))

/**
 * list_prev_entry - get the prev element in list
 * @pos:        the type * to cursor
 * @member:        the name of the list_head within the struct.
 *
 * No check against the list head is made: if @pos is the first element, the
 * result is list_entry() applied to the head.
 */
#define list_prev_entry(pos, member) \
        list_entry((pos)->member.prev, typeof(*(pos)), member)

/**
 * list_prev_entry_circular - get the prev element in list
 * @pos:        the type * to cursor.
 * @head:        the list head to take the element from.
 * @member:        the name of the list_head within the struct.
 *
 * Wraparound if pos is the first element (return the last element).
 * Note, that list is expected to be not empty.
 *
 * @pos and @head each appear more than once in the expansion, so avoid
 * arguments with side effects.
 */
#define list_prev_entry_circular(pos, head, member) \
        (list_is_first(&(pos)->member, head) ? \
        list_last_entry(head, typeof(*(pos)), member) : list_prev_entry(pos, member))

/**
 * list_for_each        -        iterate over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos is assigned to and must be an lvalue.  It is parenthesised before
 * being dereferenced so that cursors such as ``*pp`` expand correctly.
 * @head is re-evaluated on every iteration.
 */
#define list_for_each(pos, head) \
        for (pos = (head)->next; !list_is_head(pos, (head)); pos = (pos)->next)

/**
 * list_for_each_rcu - Iterate over a list in an RCU-safe fashion
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Every ->next pointer is fetched through rcu_dereference().  @pos is
 * assigned to and must be an lvalue; it is parenthesised before being
 * dereferenced so that cursors such as ``*pp`` expand correctly.
 */
#define list_for_each_rcu(pos, head)                  \
        for (pos = rcu_dereference((head)->next); \
             !list_is_head(pos, (head)); \
             pos = rcu_dereference((pos)->next))

/**
 * list_for_each_continue - continue iteration over a list
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * Continue to iterate over a list, continuing after the current position.
 * The element @pos points at on entry is itself not visited.  @pos is
 * parenthesised before being dereferenced so that cursors such as ``*pp``
 * expand correctly.
 */
#define list_for_each_continue(pos, head) \
        for (pos = (pos)->next; !list_is_head(pos, (head)); pos = (pos)->next)

/**
 * list_for_each_prev        -        iterate over a list backwards
 * @pos:        the &struct list_head to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos is assigned to and must be an lvalue.  It is parenthesised before
 * being dereferenced so that cursors such as ``*pp`` expand correctly.
 */
#define list_for_each_prev(pos, head) \
        for (pos = (head)->prev; !list_is_head(pos, (head)); pos = (pos)->prev)

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * The successor of @pos is saved in @n before the loop body runs, so the
 * body may unlink @pos.  @pos is parenthesised before being dereferenced so
 * that cursors such as ``*pp`` expand correctly.
 */
#define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = (pos)->next; \
             !list_is_head(pos, (head)); \
             pos = n, n = (pos)->next)

/**
 * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
 * @pos:        the &struct list_head to use as a loop cursor.
 * @n:                another &struct list_head to use as temporary storage
 * @head:        the head for your list.
 *
 * The predecessor of @pos is saved in @n before the loop body runs, so the
 * body may unlink @pos.  @pos is parenthesised before being dereferenced so
 * that cursors such as ``*pp`` expand correctly.
 */
#define list_for_each_prev_safe(pos, n, head) \
        for (pos = (head)->prev, n = (pos)->prev; \
             !list_is_head(pos, (head)); \
             pos = n, n = (pos)->prev)

/**
 * list_count_nodes - count nodes in the list
 * @head:        the head for your list.
 */
static inline size_t list_count_nodes(struct list_head *head)
{
        struct list_head *pos;
        size_t count = 0;

        list_for_each(pos, head)
                count++;

        return count;
}

/**
 * list_entry_is_head - test if the entry points to the head of the list
 * @pos:        the type * to cursor
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * True when the list_head embedded in @pos is @head itself, which is the
 * termination condition of the list_for_each_entry*() loops.  @pos is
 * parenthesised before ->member is applied so that cursors such as ``*pp``
 * expand correctly.
 */
#define list_entry_is_head(pos, head, member)                                \
        list_is_head(&(pos)->member, (head))

/**
 * list_for_each_entry        -        iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * When the loop runs to completion, @pos is left as list_entry() applied
 * to @head, which is not a real element; do not dereference it afterwards.
 */
#define list_for_each_entry(pos, head, member)                                \
        for (pos = list_first_entry(head, typeof(*pos), member);        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_reverse - iterate backwards over list of given type.
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * When the loop runs to completion, @pos is left as list_entry() applied
 * to @head, which is not a real element; do not dereference it afterwards.
 */
#define list_for_each_entry_reverse(pos, head, member)                        \
        for (pos = list_last_entry(head, typeof(*pos), member);                \
             !list_entry_is_head(pos, head, member);                         \
             pos = list_prev_entry(pos, member))

/**
 * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue()
 * @pos:        the type * to use as a start point
 * @head:        the head of the list
 * @member:        the name of the list_head within the struct.
 *
 * Prepares a pos entry for use as a start point in list_for_each_entry_continue().
 *
 * Yields @pos unchanged when it is non-NULL; otherwise yields list_entry()
 * applied to @head, so that a following "continue" loop starts at the first
 * element.  @pos need not be an lvalue, so it is parenthesised inside
 * typeof() to keep expression arguments (e.g. a conditional) intact.
 */
#define list_prepare_entry(pos, head, member) \
        ((pos) ? : list_entry(head, typeof(*(pos)), member))

/**
 * list_for_each_entry_continue - continue iteration over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Continue to iterate over list of given type, continuing after
 * the current position.
 * The element @pos points at on entry is itself not visited.
 */
#define list_for_each_entry_continue(pos, head, member)                 \
        for (pos = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_continue_reverse - iterate backwards from the given point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Start to iterate over list of given type backwards, continuing after
 * the current position.
 * The element @pos points at on entry is itself not visited.
 */
#define list_for_each_entry_continue_reverse(pos, head, member)                \
        for (pos = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_from - iterate over list of given type from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing from current position.
 * The loop has no initialiser, so the element @pos points at on entry is
 * the first one visited.
 */
#define list_for_each_entry_from(pos, head, member)                         \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_next_entry(pos, member))

/**
 * list_for_each_entry_from_reverse - iterate backwards over list of given type
 *                                    from the current point
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, continuing from current position.
 * The loop has no initialiser, so the element @pos points at on entry is
 * the first one visited.
 */
#define list_for_each_entry_from_reverse(pos, head, member)                \
        for (; !list_entry_is_head(pos, head, member);                        \
             pos = list_prev_entry(pos, member))

/**
 * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * @n always holds the entry following @pos, computed before the loop body
 * runs, so the body may remove @pos from the list.
 */
#define list_for_each_entry_safe(pos, n, head, member)                        \
        for (pos = list_first_entry(head, typeof(*pos), member),        \
                n = list_next_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_continue - continue list iteration safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type, continuing after current point,
 * safe against removal of list entry.
 * The element @pos points at on entry is itself not visited.
 */
#define list_for_each_entry_safe_continue(pos, n, head, member)                 \
        for (pos = list_next_entry(pos, member),                                 \
                n = list_next_entry(pos, member);                                \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_from - iterate over list from current point safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate over list of given type from current point, safe against
 * removal of list entry.
 * @pos is not reassigned by the initialiser, so the element it points at on
 * entry is the first one visited.
 */
#define list_for_each_entry_safe_from(pos, n, head, member)                         \
        for (n = list_next_entry(pos, member);                                        \
             !list_entry_is_head(pos, head, member);                                \
             pos = n, n = list_next_entry(n, member))

/**
 * list_for_each_entry_safe_reverse - iterate backwards over list safe against removal
 * @pos:        the type * to use as a loop cursor.
 * @n:                another type * to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the list_head within the struct.
 *
 * Iterate backwards over list of given type, safe against removal
 * of list entry.
 * @n always holds the entry preceding @pos, computed before the loop body
 * runs.
 */
#define list_for_each_entry_safe_reverse(pos, n, head, member)                \
        for (pos = list_last_entry(head, typeof(*pos), member),                \
                n = list_prev_entry(pos, member);                        \
             !list_entry_is_head(pos, head, member);                         \
             pos = n, n = list_prev_entry(n, member))

/**
 * list_safe_reset_next - reset a stale list_for_each_entry_safe loop
 * @pos:        the loop cursor used in the list_for_each_entry_safe loop
 * @n:                temporary storage used in list_for_each_entry_safe
 * @member:        the name of the list_head within the struct.
 *
 * list_safe_reset_next is not safe to use in general if the list may be
 * modified concurrently (eg. the lock is dropped in the loop body). An
 * exception to this is if the cursor element (pos) is pinned in the list,
 * and list_safe_reset_next is called after re-taking the lock and before
 * completing the current iteration of the loop body.
 *
 * Expands to a bare assignment that recomputes @n from @pos; use it as a
 * statement of its own.
 */
#define list_safe_reset_next(pos, n, member)                                \
        n = list_next_entry(pos, member)

/*
 * Doubly linked lists with a single-pointer list head.
 * Mostly useful for hash tables, where a two-pointer list head is
 * too wasteful.
 * You lose the ability to access the tail in O(1).
 */

/* Static initializer for an empty hlist_head. */
#define HLIST_HEAD_INIT { .first = NULL }
/* Define a struct hlist_head variable called @name, initialised empty. */
#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
/* Run-time initialisation: make the hlist_head at @ptr empty. */
#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
/*
 * Put @h into the unhashed state: both next and pprev are set to NULL,
 * which is what hlist_unhashed() tests for (via pprev).
 */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
        h->next = NULL;
        h->pprev = NULL;
}

/**
 * hlist_unhashed - Has node been removed from list and reinitialized?
 * @h: Node to be checked
 *
 * Note that not all removal functions will leave a node in unhashed
 * state.  For example, hlist_nulls_del_init_rcu() does leave the
 * node in unhashed state, but hlist_nulls_del() does not.
 *
 * Return: non-zero when @h's pprev pointer is NULL.
 */
static inline int hlist_unhashed(const struct hlist_node *h)
{
        return h->pprev == NULL;
}

/**
 * hlist_unhashed_lockless - Version of hlist_unhashed for lockless use
 * @h: Node to be checked
 *
 * Use this variant of hlist_unhashed() in lockless contexts, where a plain
 * load of pprev could be torn.  The READ_ONCE() here pairs with the
 * WRITE_ONCE() stores done by the hlist helpers defined below.
 *
 * Return: non-zero when @h's pprev pointer is NULL.
 */
static inline int hlist_unhashed_lockless(const struct hlist_node *h)
{
        return READ_ONCE(h->pprev) == NULL;
}

/**
 * hlist_empty - Is the specified hlist_head structure an empty hlist?
 * @h: Structure to check.
 *
 * Return: non-zero when @h's first pointer (read with READ_ONCE()) is NULL.
 */
static inline int hlist_empty(const struct hlist_head *h)
{
        return READ_ONCE(h->first) == NULL;
}

/*
 * Unlink @n from its hlist without touching @n's own fields.
 *
 * n->pprev is dereferenced unconditionally, so @n must be hashed (or a
 * fake node set up with hlist_add_fake()).
 */
static inline void __hlist_del(struct hlist_node *n)
{
        struct hlist_node *next = n->next;
        struct hlist_node **pprev = n->pprev;

        /* Whatever pointed at @n (head->first or a ->next) now skips it. */
        WRITE_ONCE(*pprev, next);
        if (next)
                WRITE_ONCE(next->pprev, pprev);
}

/**
 * hlist_del - Delete the specified hlist_node from its list
 * @n: Node to delete.
 *
 * Note that this function leaves the node in hashed state.  Use
 * hlist_del_init() or similar instead to unhash @n.
 *
 * After unlinking, @n's next and pprev are overwritten with LIST_POISON1
 * and LIST_POISON2 respectively.
 */
static inline void hlist_del(struct hlist_node *n)
{
        __hlist_del(n);
        n->next = LIST_POISON1;
        n->pprev = LIST_POISON2;
}

/**
 * hlist_del_init - Delete the specified hlist_node from its list and initialize
 * @n: Node to delete.
 *
 * Note that this function leaves the node in unhashed state.
 * A node that is already unhashed is left untouched.
 */
static inline void hlist_del_init(struct hlist_node *n)
{
        if (hlist_unhashed(n))
                return;

        __hlist_del(n);
        INIT_HLIST_NODE(n);
}

/**
 * hlist_add_head - add a new entry at the beginning of the hlist
 * @n: new entry to be added
 * @h: hlist head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
        struct hlist_node *first = h->first;
        WRITE_ONCE(n->next, first);
        /* The old first node, if any, is now reached through n->next. */
        if (first)
                WRITE_ONCE(first->pprev, &n->next);
        WRITE_ONCE(h->first, n);
        WRITE_ONCE(n->pprev, &h->first);
}

/**
 * hlist_add_before - add a new entry before the one specified
 * @n: new entry to be added
 * @next: hlist node to add it before, which must be non-NULL
 */
static inline void hlist_add_before(struct hlist_node *n,
                                    struct hlist_node *next)
{
        /* @n inherits @next's back-link, then @next is re-pointed at @n. */
        WRITE_ONCE(n->pprev, next->pprev);
        WRITE_ONCE(n->next, next);
        WRITE_ONCE(next->pprev, &n->next);
        /* Finally make the predecessor's forward pointer reach @n. */
        WRITE_ONCE(*(n->pprev), n);
}

/**
 * hlist_add_behind - add a new entry after the one specified
 * @n: new entry to be added
 * @prev: hlist node to add it after, which must be non-NULL
 */
static inline void hlist_add_behind(struct hlist_node *n,
                                    struct hlist_node *prev)
{
        WRITE_ONCE(n->next, prev->next);
        WRITE_ONCE(prev->next, n);
        WRITE_ONCE(n->pprev, &prev->next);

        /* Fix up the back-link of the old successor, if there was one. */
        if (n->next)
                WRITE_ONCE(n->next->pprev, &n->next);
}

/**
 * hlist_add_fake - create a fake hlist consisting of a single headless node
 * @n: Node to make a fake list out of
 *
 * This makes @n appear to be its own predecessor on a headless hlist.
 * The point of this is to allow things like hlist_del() to work correctly
 * in cases where there is no list.
 *
 * Only pprev is written; n->next is left as it was.
 */
static inline void hlist_add_fake(struct hlist_node *n)
{
        n->pprev = &n->next;
}

/**
 * hlist_fake: Is this node a fake hlist?
 * @h: Node to check for being a self-referential fake hlist.
 *
 * Return: true when @h's pprev points at @h's own next field, which is the
 * state hlist_add_fake() sets up.
 */
static inline bool hlist_fake(struct hlist_node *h)
{
        return &h->next == h->pprev;
}

/**
 * hlist_is_singular_node - is node the only element of the specified hlist?
 * @n: Node to check for singularity.
 * @h: Header for potentially singular list.
 *
 * Decides whether @n is the sole node on @h using only @n's own fields and
 * the address of @h->first; @h is never dereferenced, which avoids an
 * unnecessary cache miss on the head.
 *
 * Return: true when @n has no successor and is linked directly to @h.
 */
static inline bool
hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h)
{
        if (n->next)
                return false;

        return n->pprev == &h->first;
}

/**
 * hlist_move_list - Move an hlist
 * @old: hlist_head for old list.
 * @new: hlist_head for new list.
 *
 * Move a list from one list head to another. Fixup the pprev
 * reference of the first entry if it exists.
 *
 * Whatever @new pointed at before is overwritten; @old is left empty.
 */
static inline void hlist_move_list(struct hlist_head *old,
                                   struct hlist_head *new)
{
        new->first = old->first;
        /* The first node's back-link must now refer to the new head. */
        if (new->first)
                new->first->pprev = &new->first;
        old->first = NULL;
}

/**
 * hlist_splice_init() - move all entries from one list to another
 * @from: hlist_head from which entries will be moved
 * @last: last entry on the @from list
 * @to:   hlist_head to which entries will be moved
 *
 * @to can be empty, @from must contain at least @last.
 *
 * The entries of @from are placed in front of the existing entries of @to,
 * and @from is left empty.  @from->first is dereferenced without a NULL
 * check, hence the non-empty requirement.
 */
static inline void hlist_splice_init(struct hlist_head *from,
                                     struct hlist_node *last,
                                     struct hlist_head *to)
{
        /* Chain the old contents of @to behind @last. */
        if (to->first)
                to->first->pprev = &last->next;
        last->next = to->first;
        /* Hand the head of @from over to @to. */
        to->first = from->first;
        from->first->pprev = &to->first;
        from->first = NULL;
}

/* hlist_entry - thin alias for container_of() used with hlist nodes. */
#define hlist_entry(ptr, type, member) \
        container_of(ptr, type, member)

/**
 * hlist_for_each - iterate over the nodes of a hlist
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @head:        the head for your list.
 *
 * @pos is parenthesized where it is dereferenced so that any lvalue
 * expression (for example "*cursor") may be passed as the cursor.
 */
#define hlist_for_each(pos, head) \
        for (pos = (head)->first; pos ; pos = (pos)->next)

/**
 * hlist_for_each_safe - iterate over a hlist safe against removal of the
 *                       current node
 * @pos:        the &struct hlist_node to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage.
 * @head:        the head for your list.
 *
 * The successor of @pos is saved in @n before the loop body runs, so the
 * body may unlink or clobber @pos. @pos and @n are parenthesized where
 * they are read so that arbitrary lvalue expressions can be passed.
 */
#define hlist_for_each_safe(pos, n, head) \
        for (pos = (head)->first; pos && ({ n = (pos)->next; 1; }); \
             pos = (n))

/*
 * hlist_entry_safe - NULL-tolerant form of hlist_entry()
 *
 * @ptr is evaluated exactly once into a local temporary. The expression
 * yields NULL when @ptr is NULL and hlist_entry(@ptr, @type, @member)
 * otherwise.
 */
#define hlist_entry_safe(ptr, type, member) \
        ({ typeof(ptr) ____ptr = (ptr); \
           ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
        })

/**
 * hlist_for_each_entry        - iterate over list of given type
 * @pos:        the type * to use as a loop cursor.
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * Iteration starts at (head)->first and follows each entry's @member.next.
 * The loop stops when @pos becomes NULL, so @pos is NULL after a loop
 * that was not left early.
 */
#define hlist_for_each_entry(pos, head, member)                                \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * @pos is dereferenced before the first iteration, so it must point to a
 * valid entry on entry to the loop. The first entry visited is the one
 * following @pos.
 */
#define hlist_for_each_entry_continue(pos, member)                        \
        for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
             pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_from - iterate over a hlist continuing from current point
 * @pos:        the type * to use as a loop cursor.
 * @member:        the name of the hlist_node within the struct.
 *
 * The first entry visited is @pos itself. The loop body does not run at
 * all when @pos is NULL on entry.
 */
#define hlist_for_each_entry_from(pos, member)                                \
        for (; pos;                                                        \
             pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))

/**
 * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
 * @pos:        the type * to use as a loop cursor.
 * @n:                a &struct hlist_node to use as temporary storage
 * @head:        the head for your list.
 * @member:        the name of the hlist_node within the struct.
 *
 * The successor node is saved in @n before the loop body runs, so the
 * body may unlink or free @pos. @pos is parenthesized wherever it is
 * dereferenced, matching hlist_for_each_entry(), so that any lvalue
 * expression can serve as the cursor.
 */
#define hlist_for_each_entry_safe(pos, n, head, member)                 \
        for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
             pos && ({ n = (pos)->member.next; 1; });                        \
             pos = hlist_entry_safe(n, typeof(*(pos)), member))

/**
 * hlist_count_nodes - count nodes in the hlist
 * @head:        the head for your hlist.
 */
static inline size_t hlist_count_nodes(struct hlist_head *head)
{
        struct hlist_node *pos;
        size_t count = 0;

        hlist_for_each(pos, head)
                count++;

        return count;
}

#endif





























































  161 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_MMU_H
#define __ASM_MMU_H

#include <asm/cputype.h>

/*
 * USER_ASID_FLAG is the single bit at position USER_ASID_BIT.
 * TTBR_ASID_MASK covers the 16 bits starting at bit 48.
 */
#define MMCF_AARCH32        0x1        /* mm context flag for AArch32 executables */
#define USER_ASID_BIT        48
#define USER_ASID_FLAG        (UL(1) << USER_ASID_BIT)
#define TTBR_ASID_MASK        (UL(0xffff) << 48)

#ifndef __ASSEMBLY__

#include <linux/refcount.h>
#include <asm/cpufeature.h>

/*
 * Architecture-specific per-mm context.
 *
 * @id:      64-bit atomic value; ASID() extracts its low 16 bits.
 * @sigpage: only present when CONFIG_COMPAT is enabled.
 * @pinned:  reference count (NOTE(review): presumably tracks ASID
 *           pinning - confirm against the users of this field).
 * @vdso:    NOTE(review): presumably the vDSO mapping address - confirm.
 * @flags:   NOTE(review): presumably holds MMCF_* bits - confirm.
 * @pkey_allocation_map: 8-bit map (NOTE(review): presumably one bit per
 *           protection key - confirm).
 */
typedef struct {
        atomic64_t        id;
#ifdef CONFIG_COMPAT
        void                *sigpage;
#endif
        refcount_t        pinned;
        void                *vdso;
        unsigned long        flags;
        u8                pkey_allocation_map;
} mm_context_t;

/*
 * ASID(mm) evaluates to the low 16 bits of mm->context.id.
 *
 * We use atomic64_read() here because the ASID for an 'mm_struct' can
 * be reallocated when scheduling one of its threads following a
 * rollover event (see new_context() and flush_context()). In this case,
 * a concurrent TLBI (e.g. via try_to_unmap_one() and ptep_clear_flush())
 * may use a stale ASID. This is fine in principle as the new ASID is
 * guaranteed to be clean in the TLB, but the TLBI routines have to take
 * care to handle the following race:
 *
 *    CPU 0                    CPU 1                          CPU 2
 *
 *    // ptep_clear_flush(mm)
 *    xchg_relaxed(pte, 0)
 *    DSB ISHST
 *    old = ASID(mm)
 *         |                                                  <rollover>
 *         |                   new = new_context(mm)
 *         \-----------------> atomic64_set(&mm->context.id, new)
 *                             cpu_switch_mm(mm)
 *                             // Hardware walk of pte using new ASID
 *    TLBI(old)
 *
 * In this scenario, the barrier on CPU 0 and the dependency on CPU 1
 * ensure that the page-table walker on CPU 1 *must* see the invalid PTE
 * written by CPU 0.
 */
#define ASID(mm)        (atomic64_read(&(mm)->context.id) & 0xffff)

/* Report whether the ARM64_UNMAP_KERNEL_AT_EL0 capability is set. */
static inline bool arm64_kernel_unmapped_at_el0(void)
{
        const bool unmapped =
                alternative_has_cap_unlikely(ARM64_UNMAP_KERNEL_AT_EL0);

        return unmapped;
}

/*
 * Prototypes only; the definitions live outside this header.
 * NOTE(review): these look like the early memory setup and page-table
 * mapping entry points - confirm against the implementing files.
 */
extern void arm64_memblock_init(void);
extern void paging_init(void);
extern void bootmem_init(void);
extern void create_mapping_noalloc(phys_addr_t phys, unsigned long virt,
                                   phys_addr_t size, pgprot_t prot);
extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
                               unsigned long virt, phys_addr_t size,
                               pgprot_t prot, bool page_mappings_only);
extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
extern void mark_linear_text_alias_ro(void);

/*
 * Decide whether KASLR needs KPTI on the local CPU.
 *
 * This runs during early boot, before the cpufeature framework is
 * initialised, so the ID register of the local CPU is read directly.
 * That lets the boot CPU detect the need for non-global mappings and
 * avoid a pagetable re-write after all the CPUs are booted. The check is
 * run on individual CPUs anyway, which yields a consistent state once
 * the SMP CPUs are up and allows the switch to non-global mappings if
 * required.
 *
 * Returns false only when CONFIG_ARM64_E0PD is enabled and the E0PD
 * field of ID_AA64MMFR2_EL1 is non-zero; true otherwise.
 */
static inline bool kaslr_requires_kpti(void)
{
        u64 mmfr2;

        if (!IS_ENABLED(CONFIG_ARM64_E0PD))
                return true;

        /*
         * E0PD does a similar job to KPTI so can be used instead
         * where available.
         */
        mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);

        return !cpuid_feature_extract_unsigned_field(mmfr2,
                                                ID_AA64MMFR2_EL1_E0PD_SHIFT);
}

#endif        /* !__ASSEMBLY__ */
#endif







































































































































  177 






  177 






  177 











































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM writeback

#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_WRITEBACK_H

#include <linux/tracepoint.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>

/*
 * show_inode_state - render an inode state/flag bitmask for trace output.
 *
 * Expands to __print_flags() with "|" as the separator and one
 * {flag, name} pair per I_* bit listed below.
 */
#define show_inode_state(state)                                        \
        __print_flags(state, "|",                                \
                {I_DIRTY_SYNC,                "I_DIRTY_SYNC"},        \
                {I_DIRTY_DATASYNC,        "I_DIRTY_DATASYNC"},        \
                {I_DIRTY_PAGES,                "I_DIRTY_PAGES"},        \
                {I_NEW,                        "I_NEW"},                \
                {I_WILL_FREE,                "I_WILL_FREE"},                \
                {I_FREEING,                "I_FREEING"},                \
                {I_CLEAR,                "I_CLEAR"},                \
                {I_SYNC,                "I_SYNC"},                \
                {I_DIRTY_TIME,                "I_DIRTY_TIME"},        \
                {I_REFERENCED,                "I_REFERENCED"},        \
                {I_LINKABLE,                "I_LINKABLE"},                \
                {I_WB_SWITCH,                "I_WB_SWITCH"},                \
                {I_OVL_INUSE,                "I_OVL_INUSE"},                \
                {I_CREATING,                "I_CREATING"},                \
                {I_DONTCACHE,                "I_DONTCACHE"},                \
                {I_SYNC_QUEUED,                "I_SYNC_QUEUED"},        \
                {I_PINNING_NETFS_WB,        "I_PINNING_NETFS_WB"},        \
                {I_LRU_ISOLATING,        "I_LRU_ISOLATING"}        \
        )

/*
 * enums need to be exported to user space.
 *
 * First pass: EM() and EMe() both expand to TRACE_DEFINE_ENUM(), and the
 * WB_WORK_REASON list is instantiated once right after its definition.
 */
#undef EM
#undef EMe
#define EM(a,b)         TRACE_DEFINE_ENUM(a);
#define EMe(a,b)        TRACE_DEFINE_ENUM(a);

/* Writeback reasons paired with the strings shown in trace output. */
#define WB_WORK_REASON                                                        \
        EM( WB_REASON_BACKGROUND,                "background")                \
        EM( WB_REASON_VMSCAN,                        "vmscan")                \
        EM( WB_REASON_SYNC,                        "sync")                        \
        EM( WB_REASON_PERIODIC,                        "periodic")                \
        EM( WB_REASON_LAPTOP_TIMER,                "laptop_timer")                \
        EM( WB_REASON_FS_FREE_SPACE,                "fs_free_space")        \
        EM( WB_REASON_FORKER_THREAD,                "forker_thread")        \
        EMe(WB_REASON_FOREIGN_FLUSH,                "foreign_flush")

WB_WORK_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output. EMe() marks the last entry and
 * therefore expands without a trailing comma.
 */
#undef EM
#undef EMe
#define EM(a,b)                { a, b },
#define EMe(a,b)        { a, b }

/* Forward declaration for the tracepoint prototypes below. */
struct wb_writeback_work;

/*
 * Event class for folio-level writeback events.
 *
 * Records the bdi device name (32 bytes, padded), the inode number of
 * mapping->host (0 when @mapping or its host is NULL) and folio->index.
 * A NULL @mapping results in bdi_dev_name(NULL) being recorded as the name.
 */
DECLARE_EVENT_CLASS(writeback_folio_template,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(pgoff_t, index)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
                                         NULL), 32);
                __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
                __entry->index = folio->index;
        ),

        TP_printk("bdi %s: ino=%lu index=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->index
        )
);

/* writeback_dirty_folio: instance of writeback_folio_template. */
DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/* folio_wait_writeback: instance of writeback_folio_template. */
DEFINE_EVENT(writeback_folio_template, folio_wait_writeback,

        TP_PROTO(struct folio *folio, struct address_space *mapping),

        TP_ARGS(folio, mapping)
);

/*
 * Event class for inode dirtying events.
 *
 * Records the bdi device name, the inode number, the inode's current
 * i_state and the @flags argument. Both state and flags are printed
 * symbolically through show_inode_state().
 */
DECLARE_EVENT_CLASS(writeback_dirty_inode_template,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, flags)
        ),

        TP_fast_assign(
                struct backing_dev_info *bdi = inode_to_bdi(inode);

                /* may be called for files on pseudo FSes w/ unregistered bdi */
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->flags                = flags;
        ),

        TP_printk("bdi %s: ino=%lu state=%s flags=%s",
                __entry->name,
                (unsigned long)__entry->ino,
                show_inode_state(__entry->state),
                show_inode_state(__entry->flags)
        )
);

/* writeback_mark_inode_dirty: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode_start: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

/* writeback_dirty_inode: instance of writeback_dirty_inode_template. */
DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,

        TP_PROTO(struct inode *inode, int flags),

        TP_ARGS(inode, flags)
);

#ifdef CREATE_TRACE_POINTS
#ifdef CONFIG_CGROUP_WRITEBACK

static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        return cgroup_ino(wb->memcg_css->cgroup);
}

/*
 * Cgroup inode number for a writeback_control: taken from wbc->wb when
 * one is attached, otherwise the fixed value 1.
 */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        if (!wbc->wb)
                return 1;

        return __trace_wb_assign_cgroup(wbc->wb);
}
#else        /* CONFIG_CGROUP_WRITEBACK */

/* Without CONFIG_CGROUP_WRITEBACK every wb reports the fixed value 1. */
static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb)
{
        const ino_t fixed_ino = 1;

        return fixed_ino;
}

/* Without CONFIG_CGROUP_WRITEBACK every wbc reports the fixed value 1. */
static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc)
{
        const ino_t fixed_ino = 1;

        return fixed_ino;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */
#endif        /* CREATE_TRACE_POINTS */

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * inode_foreign_history: records the bdi name and inode number of @inode,
 * the cgroup inode number derived from @wbc, and the @history value
 * (printed in hex).
 */
TRACE_EVENT(inode_foreign_history,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc,
                 unsigned int history),

        TP_ARGS(inode, wbc, history),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        history)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
                __entry->history        = history;
        ),

        TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->cgroup_ino,
                __entry->history
        )
);

/*
 * inode_switch_wbs: records the inode number of @inode together with the
 * cgroup inode numbers of @old_wb and @new_wb. The bdi name is taken
 * from @old_wb->bdi.
 */
TRACE_EVENT(inode_switch_wbs,

        TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb,
                 struct bdi_writeback *new_wb),

        TP_ARGS(inode, old_wb, new_wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                ino)
                __field(ino_t,                old_cgroup_ino)
                __field(ino_t,                new_cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32);
                __entry->ino                = inode->i_ino;
                __entry->old_cgroup_ino        = __trace_wb_assign_cgroup(old_wb);
                __entry->new_cgroup_ino        = __trace_wb_assign_cgroup(new_wb);
        ),

        TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                (unsigned long)__entry->old_cgroup_ino,
                (unsigned long)__entry->new_cgroup_ino
        )
);

/*
 * track_foreign_dirty: records the bdi name and id of @wb, the inode
 * number behind @folio's mapping (0 when the folio has no mapping), the
 * memcg css id and cgroup inode number of @wb, and the cgroup inode
 * number of the folio's memcg.
 *
 * folio_memcg(@folio) is dereferenced without a NULL check here.
 */
TRACE_EVENT(track_foreign_dirty,

        TP_PROTO(struct folio *folio, struct bdi_writeback *wb),

        TP_ARGS(folio, wb),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(u64,                bdi_id)
                __field(ino_t,                ino)
                __field(unsigned int,        memcg_id)
                __field(ino_t,                cgroup_ino)
                __field(ino_t,                page_cgroup_ino)
        ),

        TP_fast_assign(
                struct address_space *mapping = folio_mapping(folio);
                struct inode *inode = mapping ? mapping->host : NULL;

                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->bdi_id                = wb->bdi->id;
                __entry->ino                = inode ? inode->i_ino : 0;
                __entry->memcg_id        = wb->memcg_css->id;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup);
        ),

        TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu",
                __entry->name,
                __entry->bdi_id,
                (unsigned long)__entry->ino,
                __entry->memcg_id,
                (unsigned long)__entry->cgroup_ino,
                (unsigned long)__entry->page_cgroup_ino
        )
);

/*
 * flush_foreign: records the bdi name and cgroup inode number of @wb
 * together with the caller-supplied foreign bdi id and foreign memcg id.
 */
TRACE_EVENT(flush_foreign,

        TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id,
                 unsigned int frn_memcg_id),

        TP_ARGS(wb, frn_bdi_id, frn_memcg_id),

        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(ino_t,                cgroup_ino)
                __field(unsigned int,        frn_bdi_id)
                __field(unsigned int,        frn_memcg_id)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
                __entry->frn_bdi_id        = frn_bdi_id;
                __entry->frn_memcg_id        = frn_memcg_id;
        ),

        TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u",
                __entry->name,
                (unsigned long)__entry->cgroup_ino,
                __entry->frn_bdi_id,
                __entry->frn_memcg_id
        )
);
#endif

/*
 * Event class for inode write events.
 *
 * Records the bdi name and inode number of @inode, plus the sync_mode
 * and cgroup inode number taken from @wbc.
 */
DECLARE_EVENT_CLASS(writeback_write_inode_template,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc),

        TP_STRUCT__entry (
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(int, sync_mode)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu",
                __entry->name,
                (unsigned long)__entry->ino,
                __entry->sync_mode,
                (unsigned long)__entry->cgroup_ino
        )
);

/* writeback_write_inode_start: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/* writeback_write_inode: instance of writeback_write_inode_template. */
DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,

        TP_PROTO(struct inode *inode, struct writeback_control *wbc),

        TP_ARGS(inode, wbc)
);

/*
 * Event class describing one wb_writeback_work item on a bdi_writeback.
 *
 * Copies the work item's nr_pages, sync_mode, for_kupdate, range_cyclic,
 * for_background and reason, the superblock device number (0 when
 * work->sb is NULL), and the cgroup inode number of @wb. The reason is
 * printed symbolically using the WB_WORK_REASON table.
 */
DECLARE_EVENT_CLASS(writeback_work_class,
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
        TP_ARGS(wb, work),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_pages)
                __field(dev_t, sb_dev)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, range_cyclic)
                __field(int, for_background)
                __field(int, reason)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->nr_pages = work->nr_pages;
                __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
                __entry->sync_mode = work->sync_mode;
                __entry->for_kupdate = work->for_kupdate;
                __entry->range_cyclic = work->range_cyclic;
                __entry->for_background        = work->for_background;
                __entry->reason = work->reason;
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
                  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu",
                  __entry->name,
                  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
                  __entry->nr_pages,
                  __entry->sync_mode,
                  __entry->for_kupdate,
                  __entry->range_cyclic,
                  __entry->for_background,
                  __print_symbolic(__entry->reason, WB_WORK_REASON),
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * DEFINE_WRITEBACK_WORK_EVENT - declare event @name as an instance of
 * writeback_work_class with the class's (wb, work) prototype.
 */
#define DEFINE_WRITEBACK_WORK_EVENT(name) \
DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
        TP_ARGS(wb, work))
DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);

/* writeback_pages_written: records and prints the @pages_written count. */
TRACE_EVENT(writeback_pages_written,
        TP_PROTO(long pages_written),
        TP_ARGS(pages_written),
        TP_STRUCT__entry(
                __field(long,                pages)
        ),
        TP_fast_assign(
                __entry->pages                = pages_written;
        ),
        TP_printk("%ld", __entry->pages)
);

/*
 * Event class carrying only the identity of a bdi_writeback: the bdi
 * device name and the cgroup inode number of @wb.
 */
DECLARE_EVENT_CLASS(writeback_class,
        TP_PROTO(struct bdi_writeback *wb),
        TP_ARGS(wb),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->cgroup_ino
        )
);
/*
 * DEFINE_WRITEBACK_EVENT - declare event @name as an instance of
 * writeback_class with the class's (wb) prototype.
 */
#define DEFINE_WRITEBACK_EVENT(name) \
DEFINE_EVENT(writeback_class, name, \
        TP_PROTO(struct bdi_writeback *wb), \
        TP_ARGS(wb))

DEFINE_WRITEBACK_EVENT(writeback_wake_background);

/* writeback_bdi_register: records the device name of @bdi. */
TRACE_EVENT(writeback_bdi_register,
        TP_PROTO(struct backing_dev_info *bdi),
        TP_ARGS(bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
        ),
        TP_printk("bdi %s",
                __entry->name
        )
);

/*
 * Event class that snapshots a writeback_control.
 *
 * Copies nr_to_write, pages_skipped, sync_mode, the for_kupdate /
 * for_background / for_reclaim / range_cyclic flags and the range
 * bounds from @wbc, along with the name of @bdi and the cgroup inode
 * number derived from @wbc. range_start and range_end are cast to long
 * for storage and printed in hex.
 */
DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
        TP_ARGS(wbc, bdi),
        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(long, nr_to_write)
                __field(long, pages_skipped)
                __field(int, sync_mode)
                __field(int, for_kupdate)
                __field(int, for_background)
                __field(int, for_reclaim)
                __field(int, range_cyclic)
                __field(long, range_start)
                __field(long, range_end)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(bdi), 32);
                __entry->nr_to_write        = wbc->nr_to_write;
                __entry->pages_skipped        = wbc->pages_skipped;
                __entry->sync_mode        = wbc->sync_mode;
                __entry->for_kupdate        = wbc->for_kupdate;
                __entry->for_background        = wbc->for_background;
                __entry->for_reclaim        = wbc->for_reclaim;
                __entry->range_cyclic        = wbc->range_cyclic;
                __entry->range_start        = (long)wbc->range_start;
                __entry->range_end        = (long)wbc->range_end;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
                "bgrd=%d reclm=%d cyclic=%d "
                "start=0x%lx end=0x%lx cgroup_ino=%lu",
                __entry->name,
                __entry->nr_to_write,
                __entry->pages_skipped,
                __entry->sync_mode,
                __entry->for_kupdate,
                __entry->for_background,
                __entry->for_reclaim,
                __entry->range_cyclic,
                __entry->range_start,
                __entry->range_end,
                (unsigned long)__entry->cgroup_ino
        )
)

/*
 * DEFINE_WBC_EVENT - declare event @name as an instance of wbc_class
 * with the class's (wbc, bdi) prototype.
 */
#define DEFINE_WBC_EVENT(name) \
DEFINE_EVENT(wbc_class, name, \
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
        TP_ARGS(wbc, bdi))
DEFINE_WBC_EVENT(wbc_writepage);

/*
 * writeback_queue_io: records the @dirtied_before cutoff both as a raw
 * jiffies value ("older") and as its distance from the current jiffies
 * converted to milliseconds ("age"), the @moved count (printed as
 * "enqueue"), the work reason and the cgroup inode number of @wb.
 */
TRACE_EVENT(writeback_queue_io,
        TP_PROTO(struct bdi_writeback *wb,
                 struct wb_writeback_work *work,
                 unsigned long dirtied_before,
                 int moved),
        TP_ARGS(wb, work, dirtied_before, moved),
        TP_STRUCT__entry(
                __array(char,                name, 32)
                __field(unsigned long,        older)
                __field(long,                age)
                __field(int,                moved)
                __field(int,                reason)
                __field(ino_t,                cgroup_ino)
        ),
        TP_fast_assign(
                strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32);
                __entry->older        = dirtied_before;
                __entry->age        = (jiffies - dirtied_before) * 1000 / HZ;
                __entry->moved        = moved;
                __entry->reason        = work->reason;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),
        TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu",
                __entry->name,
                __entry->older,        /* dirtied_before in jiffies */
                __entry->age,        /* dirtied_before in relative milliseconds */
                __entry->moved,
                __print_symbolic(__entry->reason, WB_WORK_REASON),
                (unsigned long)__entry->cgroup_ino
        )
);

/*
 * global_dirty_state: snapshots the global dirty accounting.
 *
 * The NR_FILE_DIRTY, NR_WRITEBACK, NR_DIRTIED and NR_WRITTEN counters are
 * sampled with global_node_page_state() when the event fires. The two
 * thresholds come from the caller and the limit is read from
 * global_wb_domain.dirty_limit.
 */
TRACE_EVENT(global_dirty_state,

        TP_PROTO(unsigned long background_thresh,
                 unsigned long dirty_thresh
        ),

        TP_ARGS(background_thresh,
                dirty_thresh
        ),

        TP_STRUCT__entry(
                __field(unsigned long,        nr_dirty)
                __field(unsigned long,        nr_writeback)
                __field(unsigned long,        background_thresh)
                __field(unsigned long,        dirty_thresh)
                __field(unsigned long,        dirty_limit)
                __field(unsigned long,        nr_dirtied)
                __field(unsigned long,        nr_written)
        ),

        TP_fast_assign(
                __entry->nr_dirty        = global_node_page_state(NR_FILE_DIRTY);
                __entry->nr_writeback        = global_node_page_state(NR_WRITEBACK);
                __entry->nr_dirtied        = global_node_page_state(NR_DIRTIED);
                __entry->nr_written        = global_node_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh        = dirty_thresh;
                __entry->dirty_limit        = global_wb_domain.dirty_limit;
        ),

        TP_printk("dirty=%lu writeback=%lu "
                  "bg_thresh=%lu thresh=%lu limit=%lu "
                  "dirtied=%lu written=%lu",
                  __entry->nr_dirty,
                  __entry->nr_writeback,
                  __entry->background_thresh,
                  __entry->dirty_thresh,
                  __entry->dirty_limit,
                  __entry->nr_dirtied,
                  __entry->nr_written
        )
);

/*
 * Scale x by 2^(PAGE_SHIFT - 10).  Used below on bandwidth and ratelimit
 * values before they are stored in trace records.
 * NOTE(review): assuming x is in pages (or pages/s) and PAGE_SHIFT is log2 of
 * the page size in bytes, the result is in KiB (or KiB/s) -- confirm units
 * against the callers.
 */
#define KBps(x)                        ((x) << (PAGE_SHIFT - 10))

/*
 * bdi_dirty_ratelimit - bandwidth and ratelimit figures for one bdi_writeback.
 *
 * @wb:             writeback context; the name returned by
 *                  bdi_dev_name(wb->bdi) is copied with strscpy_pad() into a
 *                  32-byte array, and wb's write_bandwidth,
 *                  avg_write_bandwidth, dirty_ratelimit and
 *                  balanced_dirty_ratelimit are recorded
 * @dirty_rate:     recorded as dirty_rate
 * @task_ratelimit: recorded as task_ratelimit
 *
 * Every rate is passed through KBps() before it is stored.  cgroup_ino is
 * whatever __trace_wb_assign_cgroup(wb) returns.
 */
TRACE_EVENT(bdi_dirty_ratelimit,

        TP_PROTO(struct bdi_writeback *wb,
                 unsigned long dirty_rate,
                 unsigned long task_ratelimit),

        TP_ARGS(wb, dirty_rate, task_ratelimit),

        TP_STRUCT__entry(
                __array(char,                bdi, 32)
                __field(unsigned long,        write_bw)
                __field(unsigned long,        avg_write_bw)
                __field(unsigned long,        dirty_rate)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned long,        balanced_dirty_ratelimit)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);
                __entry->write_bw        = KBps(wb->write_bandwidth);
                __entry->avg_write_bw        = KBps(wb->avg_write_bandwidth);
                __entry->dirty_rate        = KBps(dirty_rate);
                __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
                                        KBps(wb->balanced_dirty_ratelimit);
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),

        TP_printk("bdi %s: "
                  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "balanced_dirty_ratelimit=%lu cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->write_bw,                /* write bandwidth */
                  __entry->avg_write_bw,        /* avg write bandwidth */
                  __entry->dirty_rate,                /* bdi dirty rate */
                  __entry->dirty_ratelimit,        /* base ratelimit */
                  __entry->task_ratelimit, /* ratelimit with position control */
                  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * balance_dirty_pages - throttling state for the current task on one wb.
 *
 * @wb:              writeback context; supplies the bdi name and cgroup_ino
 * @dtc:             throttle control; limit, dirty, wb_dirty are copied, and
 *                   thresh, bg_thresh, wb_thresh feed the derived setpoints
 * @dirty_ratelimit: stored after KBps() scaling
 * @task_ratelimit:  stored after KBps() scaling
 * @dirtied:         stored in an unsigned int field, so an unsigned long
 *                   value wider than that is narrowed
 * @period:          jiffies, stored as period * 1000 / HZ (ms)
 * @pause:           jiffies (signed), stored as pause * 1000 / HZ (ms)
 * @start_time:      jiffies timestamp; "paused" is the time elapsed since it,
 *                   in ms
 *
 * Derived values, computed in TP_fast_assign:
 *   freerun     = (thresh + bg_thresh) / 2          (local only, not stored)
 *   setpoint    = (limit + freerun) / 2
 *   wb_setpoint = setpoint * wb_thresh / (thresh + 1)
 *                 (the "+ 1" presumably guards against dividing by zero)
 *   think       = 0 when current->dirty_paused_when is 0, otherwise the
 *                 time since current->dirty_paused_when, in ms
 *
 * dirtied_pause is read from current->nr_dirtied_pause.  The assignment
 * order matters: wb_setpoint reads the already-stored __entry->setpoint.
 */
TRACE_EVENT(balance_dirty_pages,

        TP_PROTO(struct bdi_writeback *wb,
                 struct dirty_throttle_control *dtc,
                 unsigned long dirty_ratelimit,
                 unsigned long task_ratelimit,
                 unsigned long dirtied,
                 unsigned long period,
                 long pause,
                 unsigned long start_time),

        TP_ARGS(wb, dtc,
                dirty_ratelimit, task_ratelimit,
                dirtied, period, pause, start_time),

        TP_STRUCT__entry(
                __array(         char,        bdi, 32)
                __field(unsigned long,        limit)
                __field(unsigned long,        setpoint)
                __field(unsigned long,        dirty)
                __field(unsigned long,        wb_setpoint)
                __field(unsigned long,        wb_dirty)
                __field(unsigned long,        dirty_ratelimit)
                __field(unsigned long,        task_ratelimit)
                __field(unsigned int,        dirtied)
                __field(unsigned int,        dirtied_pause)
                __field(unsigned long,        paused)
                __field(         long,        pause)
                __field(unsigned long,        period)
                __field(         long,        think)
                __field(ino_t,                cgroup_ino)
        ),

        TP_fast_assign(
                unsigned long freerun = (dtc->thresh + dtc->bg_thresh) / 2;
                strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32);

                __entry->limit                = dtc->limit;
                __entry->setpoint        = (dtc->limit + freerun) / 2;
                __entry->dirty                = dtc->dirty;
                __entry->wb_setpoint        = __entry->setpoint *
                                                dtc->wb_thresh / (dtc->thresh + 1);
                __entry->wb_dirty        = dtc->wb_dirty;
                __entry->dirty_ratelimit = KBps(dirty_ratelimit);
                __entry->task_ratelimit        = KBps(task_ratelimit);
                __entry->dirtied        = dirtied;
                __entry->dirtied_pause        = current->nr_dirtied_pause;
                __entry->think                = current->dirty_paused_when == 0 ? 0 :
                         (long)(jiffies - current->dirty_paused_when) * 1000/HZ;
                __entry->period                = period * 1000 / HZ;
                __entry->pause                = pause * 1000 / HZ;
                __entry->paused                = (jiffies - start_time) * 1000 / HZ;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(wb);
        ),


        TP_printk("bdi %s: "
                  "limit=%lu setpoint=%lu dirty=%lu "
                  "wb_setpoint=%lu wb_dirty=%lu "
                  "dirty_ratelimit=%lu task_ratelimit=%lu "
                  "dirtied=%u dirtied_pause=%u "
                  "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu",
                  __entry->bdi,
                  __entry->limit,
                  __entry->setpoint,
                  __entry->dirty,
                  __entry->wb_setpoint,
                  __entry->wb_dirty,
                  __entry->dirty_ratelimit,
                  __entry->task_ratelimit,
                  __entry->dirtied,
                  __entry->dirtied_pause,
                  __entry->paused,        /* ms */
                  __entry->pause,        /* ms */
                  __entry->period,        /* ms */
                  __entry->think,        /* ms */
                  (unsigned long)__entry->cgroup_ino
          )
);

/*
 * writeback_sb_inodes_requeue - records one inode's writeback-related state.
 *
 * @inode: inode being reported; its bdi name (via inode_to_bdi()), i_ino,
 *         i_state and dirtied_when are stored, and cgroup_ino is taken from
 *         __trace_wb_assign_cgroup(inode_to_wb(inode)).
 *
 * The printed "age" is not a stored field: TP_printk computes
 * (jiffies - dirtied_when) / HZ, i.e. whole seconds.
 * NOTE(review): that expression reads jiffies whenever the print expression
 * is evaluated, which looks like output time rather than event time --
 * confirm before relying on the printed age.
 */
TRACE_EVENT(writeback_sb_inodes_requeue,

        TP_PROTO(struct inode *inode),
        TP_ARGS(inode),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->cgroup_ino        = __trace_wb_assign_cgroup(inode_to_wb(inode));
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_template - event class shared by the
 * writeback_single_inode* events defined below.
 *
 * @inode:       inode reported; bdi name, i_ino, i_state, dirtied_when and
 *               i_mapping->writeback_index are stored
 * @wbc:         writeback control; wbc->nr_to_write is read to derive
 *               "wrote", and cgroup_ino comes from
 *               __trace_wbc_assign_cgroup(wbc)
 * @nr_to_write: caller-supplied page budget; stored in a signed long field
 *               and printed as to_write
 *
 * wrote = nr_to_write - wbc->nr_to_write.
 * NOTE(review): presumably wbc->nr_to_write is the remaining budget, making
 * "wrote" the amount consumed -- confirm against the callers.
 *
 * The printed "age" is (jiffies - dirtied_when) / HZ, computed inside
 * TP_printk from the jiffies value at the time that expression runs.
 */
DECLARE_EVENT_CLASS(writeback_single_inode_template,

        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write
        ),

        TP_ARGS(inode, wbc, nr_to_write),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __field(ino_t, ino)
                __field(unsigned long, state)
                __field(unsigned long, dirtied_when)
                __field(unsigned long, writeback_index)
                __field(long, nr_to_write)
                __field(unsigned long, wrote)
                __field(ino_t, cgroup_ino)
        ),

        TP_fast_assign(
                strscpy_pad(__entry->name,
                            bdi_dev_name(inode_to_bdi(inode)), 32);
                __entry->ino                = inode->i_ino;
                __entry->state                = inode->i_state;
                __entry->dirtied_when        = inode->dirtied_when;
                __entry->writeback_index = inode->i_mapping->writeback_index;
                __entry->nr_to_write        = nr_to_write;
                __entry->wrote                = nr_to_write - wbc->nr_to_write;
                __entry->cgroup_ino        = __trace_wbc_assign_cgroup(wbc);
        ),

        TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
                  "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu",
                  __entry->name,
                  (unsigned long)__entry->ino,
                  show_inode_state(__entry->state),
                  __entry->dirtied_when,
                  (jiffies - __entry->dirtied_when) / HZ,
                  __entry->writeback_index,
                  __entry->nr_to_write,
                  __entry->wrote,
                  (unsigned long)__entry->cgroup_ino
        )
);

/*
 * writeback_single_inode_start - instance of writeback_single_inode_template;
 * takes the same (inode, wbc, nr_to_write) arguments as the class.
 * NOTE(review): by its name, presumably emitted before writeback of a single
 * inode begins -- confirm against the call site.
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_single_inode - instance of writeback_single_inode_template;
 * takes the same (inode, wbc, nr_to_write) arguments as the class.
 * NOTE(review): presumably the completion counterpart of
 * writeback_single_inode_start -- confirm against the call site.
 */
DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
        TP_PROTO(struct inode *inode,
                 struct writeback_control *wbc,
                 unsigned long nr_to_write),
        TP_ARGS(inode, wbc, nr_to_write)
);

/*
 * writeback_inode_template - event class for events that take only an inode.
 *
 * @inode: inode reported.  The record stores inode->i_sb->s_dev, i_ino,
 *         i_state, i_mode (in a __u16 field) and dirtied_when.
 *
 * Output prints the device as MAJOR,MINOR, the raw dirtied_when value, the
 * state decoded by show_inode_state(), and the mode in octal.
 */
DECLARE_EVENT_CLASS(writeback_inode_template,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode),

        TP_STRUCT__entry(
                __field(        dev_t,        dev                        )
                __field(        ino_t,        ino                        )
                __field(unsigned long,        state                        )
                __field(        __u16, mode                        )
                __field(unsigned long, dirtied_when                )
        ),

        TP_fast_assign(
                __entry->dev        = inode->i_sb->s_dev;
                __entry->ino        = inode->i_ino;
                __entry->state        = inode->i_state;
                __entry->mode        = inode->i_mode;
                __entry->dirtied_when = inode->dirtied_when;
        ),

        TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  (unsigned long)__entry->ino, __entry->dirtied_when,
                  show_inode_state(__entry->state), __entry->mode)
);

/*
 * writeback_lazytime - instance of writeback_inode_template (inode only).
 * NOTE(review): the name suggests lazytime timestamp writeback; confirm
 * against the call site.
 */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * writeback_lazytime_iput - instance of writeback_inode_template (inode only).
 * NOTE(review): the name suggests the lazytime case hit on the iput path;
 * confirm against the call site.
 */
DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput,
        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * writeback_dirty_inode_enqueue - instance of writeback_inode_template
 * (inode only).
 * NOTE(review): the name suggests it fires when a dirty inode is queued for
 * writeback; confirm against the call site.
 */
DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue,

        TP_PROTO(struct inode *inode),

        TP_ARGS(inode)
);

/*
 * Inode writeback list tracking.
 */

/*
 * sb_mark_inode_writeback - instance of writeback_inode_template (inode only).
 * NOTE(review): presumably fired when an inode is marked as under writeback;
 * confirm against the call site.
 */
DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

/*
 * sb_clear_inode_writeback - instance of writeback_inode_template (inode only).
 * NOTE(review): presumably fired when an inode's writeback marking is
 * cleared; confirm against the call site.
 */
DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback,
        TP_PROTO(struct inode *inode),
        TP_ARGS(inode)
);

#endif /* _TRACE_WRITEBACK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
















  208 



   13 

  198 










  209 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2013 Huawei Ltd.
 * Author: Jiang Liu <liuj97@gmail.com>
 *
 * Based on arch/arm/kernel/jump_label.c
 */
#include <linux/kernel.h>
#include <linux/jump_label.h>
#include <linux/smp.h>
#include <asm/insn.h>
#include <asm/text-patching.h>

/*
 * Patch a single jump label site.
 *
 * @entry: jump table entry giving the code address to patch and the target
 * @type:  JUMP_LABEL_JMP installs an unconditional, non-linking branch from
 *         the entry's code address to its target; any other value installs
 *         a NOP
 *
 * The instruction is written with aarch64_insn_patch_text_nosync(), whose
 * result is not examined here.  Always returns true.
 */
bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                     enum jump_label_type type)
{
        void *site = (void *)jump_entry_code(entry);
        u32 opcode = (type == JUMP_LABEL_JMP)
                ? aarch64_insn_gen_branch_imm(jump_entry_code(entry),
                                              jump_entry_target(entry),
                                              AARCH64_INSN_BRANCH_NOLINK)
                : aarch64_insn_gen_nop();

        aarch64_insn_patch_text_nosync(site, opcode);

        return true;
}

/*
 * Finish a batch of jump label updates by calling kick_all_cpus_sync().
 * NOTE(review): presumably this is what makes every CPU observe the
 * instructions written earlier through the "_nosync" patching helper --
 * confirm against the arm64 text-patching documentation.
 */
void arch_jump_label_transform_apply(void)
{
        kick_all_cpus_sync();
}










































































    8 














    8 


































    8 



















































    2 





    2 

    2 








    8 
    4 











    8 
    8 












    6 
    6 
































    2 







    2 

    2 

















    2 




    2 






    2 





    2 















    2 











    2 















































































































































































































































































































































































    5 


    7 





    1 













    7 

    1 





    2 


    2 









    6 




    6 
    2 
























































    6 































































    4 





















    1 












    8 










    8 








































































































































    6 





















    2 




























































































































































    6 
    8 





    6 
    8 





    8 










    8 











    6 
    6 













































































    6 



    6 

    6 



    6 








    6 






















    5 


    5 

    5 























































    8 












    8 
    8 
    8 
    8 











    5 







    5 



    5 


    8 
    4 
    8 























    8 











    8 


    9 

    8 

















    5 












    9 
























    8 



















































    8 



















    8 



    4 

    8 





    8 







































    5 















    8 





















    8 


    8 






















































    8 












    8 



    8 
























    8 









    8 

    8 


    8 







    8 



    8 

    8 

    8 

    8 


    5 
    6 










































































































































































    8 










    8 














































































































































































































































































    6 
    6 




    4 






    4 
    4 









    4 

    4 




    4 





    4 


    4 
    4 



    4 


    4 




    4 





    4 






    4 





    4 
    8 


    4 
    4 






    8 








    8 
    8 
    3 








    1 
    1 
















































































































































    3 


















    3 



    3 

















    6 




    6 

    6 











    6 






    6 




    6 

    6 


    2 






    6 
    6 



    6 



    6 















    2 





    6 






    5 























































    8 
    8 
    8 








    8 





    4 








    8 























    8 












    8 



    8 
    8 

    8 











    8 


    8 

    8 



    8 
    8 




















    8 


























    8 





    8 










    8 
    8 


    8 





    8 
















    8 


    7 

    8 

    8 
















    8 
    5 





    4 















































    8 
































    2 







    2 

    2 
    2 




    2 






    2 



    2 


    2 
















    3 

















    2 














    3 



    3 
















































































































































































































    4 







    4 






    4 

    4 






    4 










    4 













    8 







    8 





    8 
    8 



































    4 

    4 




    4 
















































































































































































































































































































































    8 
    8 












    8 








    7 










    8 























    8 



    8 


    8 




    8 
    8 


    8 

    8 




    8 

    4 

    8 



































    8 



    5 


    6 


    6 
    6 
    6 


    6 


    2 
    2 

    2 


























    8 



    8 

    8 




    8 







    8 

    8 


    8 

    8 











    8 

    8 








    8 
    8 




    8 
    8 





























































































































































































































































































































































































































































































































































    1 






    1 






    1 














    1 













    1 



    1 

    1 
    1 






    1 
























































































































    3 








    2 


































    1 






















































    8 








    8 





























    8 






























    4 


















    8 























    7 













































    8 

    5 
    5 











    8 

































































































    8 





































































































































































































































































































































    8 


    8 





















































    8 



    8 

    8 























































































































































































































































































































    3 
















    3 











    3 










































































































































































































































































































































































































































































































































































































































    6 




    6 
    6 
    6 






































    6 







































































    2 
    2 
    2 



    2 

























































































































































































































































































































































































































































































































    2 
    2 

    2 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































  269 


  271 











































































































































































































































































































































































































































































































































    6 






































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    6 





    6 



    6 







    6 




    2 







    2 































































































































































































































































































































































































    8 












    8 












    8 





    8 





    8 





    8 















































    8 











    7 










    8 


    8 




    8 




















    8 



















    8 

























    8 







    8 

    8 

    8 

    7 

    8 

    8 

    8 

    8 







    8 

    8 

    8 








    5 




    8 











    8 




    8 












    8 









    8 















































    8 
    8 



    8 









    8 






    8 









































    8 



    8 

    8 









    8 













    8 







    8 





    8 









    8 

























    8 












    8 



































































































































































































































































































































































































































































































































































































































































































































    8 
























    8 











    8 













    8 



















    8 
























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
7912
7913
7914
7915
7916
7917
7918
7919
7920
7921
7922
7923
7924
7925
7926
7927
7928
7929
7930
7931
7932
7933
7934
7935
7936
7937
7938
7939
7940
7941
7942
7943
7944
7945
7946
7947
7948
7949
7950
7951
7952
7953
7954
7955
7956
7957
7958
7959
7960
7961
7962
7963
7964
7965
7966
7967
7968
7969
7970
7971
7972
7973
7974
7975
7976
7977
7978
7979
7980
7981
7982
7983
7984
7985
7986
7987
7988
7989
7990
7991
7992
7993
7994
7995
7996
7997
7998
7999
8000
8001
8002
8003
8004
8005
8006
8007
8008
8009
8010
8011
8012
8013
8014
8015
8016
8017
8018
8019
8020
8021
8022
8023
8024
8025
8026
8027
8028
8029
8030
8031
8032
8033
8034
8035
8036
8037
8038
8039
8040
8041
8042
8043
8044
8045
8046
8047
8048
8049
8050
8051
8052
8053
8054
8055
8056
8057
8058
8059
8060
8061
8062
8063
8064
8065
8066
8067
8068
8069
8070
8071
8072
8073
8074
8075
8076
8077
8078
8079
8080
8081
8082
8083
8084
8085
8086
8087
8088
8089
8090
8091
8092
8093
8094
8095
8096
8097
8098
8099
8100
8101
8102
8103
8104
8105
8106
8107
8108
8109
8110
8111
8112
8113
8114
8115
8116
8117
8118
8119
8120
8121
8122
8123
8124
8125
8126
8127
8128
8129
8130
8131
8132
8133
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144
8145
8146
8147
8148
8149
8150
8151
8152
8153
8154
8155
8156
8157
8158
8159
8160
8161
8162
8163
8164
8165
8166
8167
8168
8169
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182
8183
8184
8185
8186
8187
8188
8189
8190
8191
8192
8193
8194
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206
8207
8208
8209
8210
8211
8212
8213
8214
8215
8216
8217
8218
8219
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
8237
8238
8239
8240
8241
8242
8243
8244
8245
8246
8247
8248
8249
8250
8251
8252
8253
8254
8255
8256
8257
8258
8259
8260
8261
8262
8263
8264
8265
8266
8267
8268
8269
8270
8271
8272
8273
8274
8275
8276
8277
8278
8279
8280
8281
8282
8283
8284
8285
8286
8287
8288
8289
8290
8291
8292
8293
8294
8295
8296
8297
8298
8299
8300
8301
8302
8303
8304
8305
8306
8307
8308
8309
8310
8311
8312
8313
8314
8315
8316
8317
8318
8319
8320
8321
8322
8323
8324
8325
8326
8327
8328
8329
8330
8331
8332
8333
8334
8335
8336
8337
8338
8339
8340
8341
8342
8343
8344
8345
8346
8347
8348
8349
8350
8351
8352
8353
8354
8355
8356
8357
8358
8359
8360
8361
8362
8363
8364
8365
8366
8367
8368
8369
8370
8371
8372
8373
8374
8375
8376
8377
8378
8379
8380
8381
8382
8383
8384
8385
8386
8387
8388
8389
8390
8391
8392
8393
8394
8395
8396
8397
8398
8399
8400
8401
8402
8403
8404
8405
8406
8407
8408
8409
8410
8411
8412
8413
8414
8415
8416
8417
8418
8419
8420
8421
8422
8423
8424
8425
8426
8427
8428
8429
8430
8431
8432
8433
8434
8435
8436
8437
8438
8439
8440
8441
8442
8443
8444
8445
8446
8447
8448
8449
8450
8451
8452
8453
8454
8455
8456
8457
8458
8459
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470
8471
8472
8473
8474
8475
8476
8477
8478
8479
8480
8481
8482
8483
8484
8485
8486
8487
8488
8489
8490
8491
8492
8493
8494
8495
8496
8497
8498
8499
8500
8501
8502
8503
8504
8505
8506
8507
8508
8509
8510
8511
8512
8513
8514
8515
8516
8517
8518
8519
8520
8521
8522
8523
8524
8525
8526
8527
8528
8529
8530
8531
8532
8533
8534
8535
8536
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547
8548
8549
8550
8551
8552
8553
8554
8555
8556
8557
8558
8559
8560
8561
8562
8563
8564
8565
8566
8567
8568
8569
8570
8571
8572
8573
8574
8575
8576
8577
8578
8579
8580
8581
8582
8583
8584
8585
8586
8587
8588
8589
8590
8591
8592
8593
8594
8595
8596
8597
8598
8599
8600
8601
8602
8603
8604
8605
8606
8607
8608
8609
8610
8611
8612
8613
8614
8615
8616
8617
8618
8619
8620
8621
8622
8623
8624
8625
8626
8627
8628
8629
8630
8631
8632
8633
8634
8635
8636
8637
8638
8639
8640
8641
8642
8643
8644
8645
8646
8647
8648
8649
8650
8651
8652
8653
8654
8655
8656
8657
8658
8659
8660
8661
8662
8663
8664
8665
8666
8667
8668
8669
8670
8671
8672
8673
8674
8675
8676
8677
8678
8679
8680
8681
8682
8683
8684
8685
8686
8687
8688
8689
8690
8691
8692
8693
8694
8695
8696
8697
8698
8699
8700
8701
8702
8703
8704
8705
8706
8707
8708
8709
8710
8711
8712
8713
8714
8715
8716
8717
8718
8719
8720
8721
8722
8723
8724
8725
8726
8727
8728
8729
8730
8731
8732
8733
8734
8735
8736
8737
8738
8739
8740
8741
8742
8743
8744
8745
8746
8747
8748
8749
8750
8751
8752
8753
8754
8755
8756
8757
8758
8759
8760
8761
8762
8763
8764
8765
8766
8767
8768
8769
8770
8771
8772
8773
8774
8775
8776
8777
8778
8779
8780
8781
8782
8783
8784
8785
8786
8787
8788
8789
8790
8791
8792
8793
8794
8795
8796
8797
8798
8799
8800
8801
8802
8803
8804
8805
8806
8807
8808
8809
8810
8811
8812
8813
8814
8815
8816
8817
8818
8819
8820
8821
8822
8823
8824
8825
8826
8827
8828
8829
8830
8831
8832
8833
8834
8835
8836
8837
8838
8839
8840
8841
8842
8843
8844
8845
8846
8847
8848
8849
8850
8851
8852
8853
8854
8855
8856
8857
8858
8859
8860
8861
8862
8863
8864
8865
8866
8867
8868
8869
8870
8871
8872
8873
8874
8875
8876
8877
8878
8879
8880
8881
8882
8883
8884
8885
8886
8887
8888
8889
8890
8891
8892
8893
8894
8895
8896
8897
8898
8899
8900
8901
8902
8903
8904
8905
8906
8907
8908
8909
8910
8911
8912
8913
8914
8915
8916
8917
8918
8919
8920
8921
8922
8923
8924
8925
8926
8927
8928
8929
8930
8931
8932
8933
8934
8935
8936
8937
8938
8939
8940
8941
8942
8943
8944
8945
8946
8947
8948
8949
8950
8951
8952
8953
8954
8955
8956
8957
8958
8959
8960
8961
8962
8963
8964
8965
8966
8967
8968
8969
8970
8971
8972
8973
8974
8975
8976
8977
8978
8979
8980
8981
8982
8983
8984
8985
8986
8987
8988
8989
8990
8991
8992
8993
8994
8995
8996
8997
8998
8999
9000
9001
9002
9003
9004
9005
9006
9007
9008
9009
9010
9011
9012
9013
9014
9015
9016
9017
9018
9019
9020
9021
9022
9023
9024
9025
9026
9027
9028
9029
9030
9031
9032
9033
9034
9035
9036
9037
9038
9039
9040
9041
9042
9043
9044
9045
9046
9047
9048
9049
9050
9051
9052
9053
9054
9055
9056
9057
9058
9059
9060
9061
9062
9063
9064
9065
9066
9067
9068
9069
9070
9071
9072
9073
9074
9075
9076
9077
9078
9079
9080
9081
9082
9083
9084
9085
9086
9087
9088
9089
9090
9091
9092
9093
9094
9095
9096
9097
9098
9099
9100
9101
9102
9103
9104
9105
9106
9107
9108
9109
9110
9111
9112
9113
9114
9115
9116
9117
9118
9119
9120
9121
9122
9123
9124
9125
9126
9127
9128
9129
9130
9131
9132
9133
9134
9135
9136
9137
9138
9139
9140
9141
9142
9143
9144
9145
9146
9147
9148
9149
9150
9151
9152
9153
9154
9155
9156
9157
9158
9159
9160
9161
9162
9163
9164
9165
9166
9167
9168
9169
9170
9171
9172
9173
9174
9175
9176
9177
9178
9179
9180
9181
9182
9183
9184
9185
9186
9187
9188
9189
9190
9191
9192
9193
9194
9195
9196
9197
9198
9199
9200
9201
9202
9203
9204
9205
9206
9207
9208
9209
9210
9211
9212
9213
9214
9215
9216
9217
9218
9219
9220
9221
9222
9223
9224
9225
9226
9227
9228
9229
9230
9231
9232
9233
9234
9235
9236
9237
9238
9239
9240
9241
9242
9243
9244
9245
9246
9247
9248
9249
9250
9251
9252
9253
9254
9255
9256
9257
9258
9259
9260
9261
9262
9263
9264
9265
9266
9267
9268
9269
9270
9271
9272
9273
9274
9275
9276
9277
9278
9279
9280
9281
9282
9283
9284
9285
9286
9287
9288
9289
9290
9291
9292
9293
9294
9295
9296
9297
9298
9299
9300
9301
9302
9303
9304
9305
9306
9307
9308
9309
9310
9311
9312
9313
9314
9315
9316
9317
9318
9319
9320
9321
9322
9323
9324
9325
9326
9327
9328
9329
9330
9331
9332
9333
9334
9335
9336
9337
9338
9339
9340
9341
9342
9343
9344
9345
9346
9347
9348
9349
9350
9351
9352
9353
9354
9355
9356
9357
9358
9359
9360
9361
9362
9363
9364
9365
9366
9367
9368
9369
9370
9371
9372
9373
9374
9375
9376
9377
9378
9379
9380
9381
9382
9383
9384
9385
9386
9387
9388
9389
9390
9391
9392
9393
9394
9395
9396
9397
9398
9399
9400
9401
9402
9403
9404
9405
9406
9407
9408
9409
9410
9411
9412
9413
9414
9415
9416
9417
9418
9419
9420
9421
9422
9423
9424
9425
9426
9427
9428
9429
9430
9431
9432
9433
9434
9435
9436
9437
9438
9439
9440
9441
9442
9443
9444
9445
9446
9447
9448
9449
9450
9451
9452
9453
9454
9455
9456
9457
9458
9459
9460
9461
9462
9463
9464
9465
9466
9467
9468
9469
9470
9471
9472
9473
9474
9475
9476
9477
9478
9479
9480
9481
9482
9483
9484
9485
9486
9487
9488
9489
9490
9491
9492
9493
9494
9495
9496
9497
9498
9499
9500
9501
9502
9503
9504
9505
9506
9507
9508
9509
9510
9511
9512
9513
9514
9515
9516
9517
9518
9519
9520
9521
9522
9523
9524
9525
9526
9527
9528
9529
9530
9531
9532
9533
9534
9535
9536
9537
9538
9539
9540
9541
9542
9543
9544
9545
9546
9547
9548
9549
9550
9551
9552
9553
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
9566
9567
9568
9569
9570
9571
9572
9573
9574
9575
9576
9577
9578
9579
9580
9581
9582
9583
9584
9585
9586
9587
9588
9589
9590
9591
9592
9593
9594
9595
9596
9597
9598
9599
9600
9601
9602
9603
9604
9605
9606
9607
9608
9609
9610
9611
9612
9613
9614
9615
9616
9617
9618
9619
9620
9621
9622
9623
9624
9625
9626
9627
9628
9629
9630
9631
9632
9633
9634
9635
9636
9637
9638
9639
9640
9641
9642
9643
9644
9645
9646
9647
9648
9649
9650
9651
9652
9653
9654
9655
9656
9657
9658
9659
9660
9661
9662
9663
9664
9665
9666
9667
9668
9669
9670
9671
9672
9673
9674
9675
9676
9677
9678
9679
9680
9681
9682
9683
9684
9685
9686
9687
9688
9689
9690
9691
9692
9693
9694
9695
9696
9697
9698
9699
9700
9701
9702
9703
9704
9705
9706
9707
9708
9709
9710
9711
9712
9713
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726
9727
9728
9729
9730
9731
9732
9733
9734
9735
9736
9737
9738
9739
9740
9741
9742
9743
9744
9745
9746
9747
9748
9749
9750
9751
9752
9753
9754
9755
9756
9757
9758
9759
9760
9761
9762
9763
9764
9765
9766
9767
9768
9769
9770
9771
9772
9773
9774
9775
9776
9777
9778
9779
9780
9781
9782
9783
9784
9785
9786
9787
9788
9789
9790
9791
9792
9793
9794
9795
9796
9797
9798
9799
9800
9801
9802
9803
9804
9805
9806
9807
9808
9809
9810
9811
9812
9813
9814
9815
9816
9817
9818
9819
9820
9821
9822
9823
9824
9825
9826
9827
9828
9829
9830
9831
9832
9833
9834
9835
9836
9837
9838
9839
9840
9841
9842
9843
9844
9845
9846
9847
9848
9849
9850
9851
9852
9853
9854
9855
9856
9857
9858
9859
9860
9861
9862
9863
9864
9865
9866
9867
9868
9869
9870
9871
9872
9873
9874
9875
9876
9877
9878
9879
9880
9881
9882
9883
9884
9885
9886
9887
9888
9889
9890
9891
9892
9893
9894
9895
9896
9897
9898
9899
9900
9901
9902
9903
9904
9905
9906
9907
9908
9909
9910
9911
9912
9913
9914
9915
9916
9917
9918
9919
9920
9921
9922
9923
9924
9925
9926
9927
9928
9929
9930
9931
9932
9933
9934
9935
9936
9937
9938
9939
9940
9941
9942
9943
9944
9945
9946
9947
9948
9949
9950
9951
9952
9953
9954
9955
9956
9957
9958
9959
9960
9961
9962
9963
9964
9965
9966
9967
9968
9969
9970
9971
9972
9973
9974
9975
9976
9977
9978
9979
9980
9981
9982
9983
9984
9985
9986
9987
9988
9989
9990
9991
9992
9993
9994
9995
9996
9997
9998
9999
10000
10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
10167
10168
10169
10170
10171
10172
10173
10174
10175
10176
10177
10178
10179
10180
10181
10182
10183
10184
10185
10186
10187
10188
10189
10190
10191
10192
10193
10194
10195
10196
10197
10198
10199
10200
10201
10202
10203
10204
10205
10206
10207
10208
10209
10210
10211
10212
10213
10214
10215
10216
10217
10218
10219
10220
10221
10222
10223
10224
10225
10226
10227
10228
10229
10230
10231
10232
10233
10234
10235
10236
10237
10238
10239
10240
10241
10242
10243
10244
10245
10246
10247
10248
10249
10250
10251
10252
10253
10254
10255
10256
10257
10258
10259
10260
10261
10262
10263
10264
10265
10266
10267
10268
10269
10270
10271
10272
10273
10274
10275
10276
10277
10278
10279
10280
10281
10282
10283
10284
10285
10286
10287
10288
10289
10290
10291
10292
10293
10294
10295
10296
10297
10298
10299
10300
10301
10302
10303
10304
10305
10306
10307
10308
10309
10310
10311
10312
10313
10314
10315
10316
10317
10318
10319
10320
10321
10322
10323
10324
10325
10326
10327
10328
10329
10330
10331
10332
10333
10334
10335
10336
10337
10338
10339
10340
10341
10342
10343
10344
10345
10346
10347
10348
10349
10350
10351
10352
10353
10354
10355
10356
10357
10358
10359
10360
10361
10362
10363
10364
10365
10366
10367
10368
10369
10370
10371
10372
10373
10374
10375
10376
10377
10378
10379
10380
10381
10382
10383
10384
10385
10386
10387
10388
10389
10390
10391
10392
10393
10394
10395
10396
10397
10398
10399
10400
10401
10402
10403
10404
10405
10406
10407
10408
10409
10410
10411
10412
10413
10414
10415
10416
10417
10418
10419
10420
10421
10422
10423
10424
10425
10426
10427
10428
10429
10430
10431
10432
10433
10434
10435
10436
10437
10438
10439
10440
10441
10442
10443
10444
10445
10446
10447
10448
10449
10450
10451
10452
10453
10454
10455
10456
10457
10458
10459
10460
10461
10462
10463
10464
10465
10466
10467
10468
10469
10470
10471
10472
10473
10474
10475
10476
10477
10478
10479
10480
10481
10482
10483
10484
10485
10486
10487
10488
10489
10490
10491
10492
10493
10494
10495
10496
10497
10498
10499
10500
10501
10502
10503
10504
10505
10506
10507
10508
10509
10510
10511
10512
10513
10514
10515
10516
10517
10518
10519
10520
10521
10522
10523
10524
10525
10526
10527
10528
10529
10530
10531
10532
10533
10534
10535
10536
10537
10538
10539
10540
10541
10542
10543
10544
10545
10546
10547
10548
10549
10550
10551
10552
10553
10554
10555
10556
10557
10558
10559
10560
10561
10562
10563
10564
10565
10566
10567
10568
10569
10570
10571
10572
10573
10574
10575
10576
10577
10578
10579
10580
10581
10582
10583
10584
10585
10586
10587
10588
10589
10590
10591
10592
10593
10594
10595
10596
10597
10598
10599
10600
10601
10602
10603
10604
10605
10606
10607
10608
10609
10610
10611
10612
10613
10614
10615
10616
10617
10618
10619
10620
10621
10622
10623
10624
10625
10626
10627
10628
10629
10630
10631
10632
10633
10634
10635
10636
10637
10638
10639
10640
10641
10642
10643
10644
10645
10646
10647
10648
10649
10650
10651
10652
10653
10654
10655
10656
10657
10658
10659
10660
10661
10662
10663
10664
10665
10666
10667
10668
10669
10670
10671
10672
10673
10674
10675
10676
10677
10678
10679
10680
10681
10682
10683
10684
10685
10686
10687
10688
10689
10690
10691
10692
10693
10694
10695
10696
10697
10698
10699
10700
10701
10702
10703
10704
10705
10706
10707
10708
10709
10710
10711
10712
10713
10714
10715
10716
10717
10718
10719
10720
10721
10722
10723
10724
10725
10726
10727
10728
10729
10730
10731
10732
10733
10734
10735
10736
10737
10738
10739
10740
10741
10742
10743
10744
10745
10746
10747
10748
10749
10750
10751
10752
10753
10754
10755
10756
10757
10758
10759
10760
10761
10762
10763
10764
10765
10766
10767
10768
10769
10770
10771
10772
10773
10774
10775
10776
10777
10778
10779
10780
10781
10782
10783
10784
10785
10786
10787
10788
10789
10790
10791
10792
10793
10794
10795
10796
10797
10798
10799
10800
10801
10802
10803
10804
10805
10806
10807
10808
10809
10810
10811
10812
10813
10814
10815
10816
10817
10818
10819
10820
10821
10822
10823
10824
10825
10826
10827
10828
10829
10830
10831
10832
10833
10834
10835
10836
10837
10838
10839
10840
10841
10842
10843
10844
10845
10846
10847
10848
10849
10850
10851
10852
10853
10854
10855
10856
10857
10858
10859
10860
10861
10862
10863
10864
10865
10866
10867
10868
10869
10870
10871
10872
10873
10874
10875
10876
10877
10878
10879
10880
10881
10882
10883
10884
10885
10886
10887
10888
10889
10890
10891
10892
10893
10894
10895
10896
10897
10898
10899
10900
10901
10902
10903
10904
10905
10906
10907
10908
10909
10910
10911
10912
10913
10914
10915
10916
10917
10918
10919
10920
10921
10922
10923
10924
10925
10926
10927
10928
10929
10930
10931
10932
10933
10934
10935
10936
10937
10938
10939
10940
10941
10942
10943
10944
10945
10946
10947
10948
10949
10950
10951
10952
10953
10954
10955
10956
10957
10958
10959
10960
10961
10962
10963
10964
10965
10966
10967
10968
10969
10970
10971
10972
10973
10974
10975
10976
10977
10978
10979
10980
10981
10982
10983
10984
10985
10986
10987
10988
10989
10990
10991
10992
10993
10994
10995
10996
10997
10998
10999
11000
11001
11002
11003
11004
11005
11006
11007
11008
11009
11010
11011
11012
11013
11014
11015
11016
11017
11018
11019
11020
11021
11022
11023
11024
11025
11026
11027
11028
11029
11030
11031
11032
11033
11034
11035
11036
11037
11038
11039
11040
11041
11042
11043
11044
11045
11046
11047
11048
11049
11050
11051
11052
11053
11054
11055
11056
11057
11058
11059
11060
11061
11062
11063
11064
11065
11066
11067
11068
11069
11070
11071
11072
11073
11074
11075
11076
11077
11078
11079
11080
11081
11082
11083
11084
11085
11086
11087
11088
11089
11090
11091
11092
11093
11094
11095
11096
11097
11098
11099
11100
11101
11102
11103
11104
11105
11106
11107
11108
11109
11110
11111
11112
11113
11114
11115
11116
11117
11118
11119
11120
11121
11122
11123
11124
11125
11126
11127
11128
11129
11130
11131
11132
11133
11134
11135
11136
11137
11138
11139
11140
11141
11142
11143
11144
11145
11146
11147
11148
11149
11150
11151
11152
11153
11154
11155
11156
11157
11158
11159
11160
11161
11162
11163
11164
11165
11166
11167
11168
11169
11170
11171
11172
11173
11174
11175
11176
11177
11178
11179
11180
11181
11182
11183
11184
11185
11186
11187
11188
11189
11190
11191
11192
11193
11194
11195
11196
11197
11198
11199
11200
11201
11202
11203
11204
11205
11206
11207
11208
11209
11210
11211
11212
11213
11214
11215
11216
11217
11218
11219
11220
11221
11222
11223
11224
11225
11226
11227
11228
11229
11230
11231
11232
11233
11234
11235
11236
11237
11238
11239
11240
11241
11242
11243
11244
11245
11246
11247
11248
11249
11250
11251
11252
11253
11254
11255
11256
11257
11258
11259
11260
11261
11262
11263
11264
11265
11266
11267
11268
11269
11270
11271
11272
11273
11274
11275
11276
11277
11278
11279
11280
11281
11282
11283
11284
11285
11286
11287
11288
11289
11290
11291
11292
11293
11294
11295
11296
11297
11298
11299
11300
11301
11302
11303
11304
11305
11306
11307
11308
11309
11310
11311
11312
11313
11314
11315
11316
11317
11318
11319
11320
11321
11322
11323
11324
11325
11326
11327
11328
11329
11330
11331
11332
11333
11334
11335
11336
11337
11338
11339
11340
11341
11342
11343
11344
11345
11346
11347
11348
11349
11350
11351
11352
11353
11354
11355
11356
11357
11358
11359
11360
11361
11362
11363
11364
11365
11366
11367
11368
11369
11370
11371
11372
11373
11374
11375
11376
11377
11378
11379
11380
11381
11382
11383
11384
11385
11386
11387
11388
11389
11390
11391
11392
11393
11394
11395
11396
11397
11398
11399
11400
11401
11402
11403
11404
11405
11406
11407
11408
11409
11410
11411
11412
11413
11414
11415
11416
11417
11418
11419
11420
11421
11422
11423
11424
11425
11426
11427
11428
11429
11430
11431
11432
11433
11434
11435
11436
11437
11438
11439
11440
11441
11442
11443
11444
11445
11446
11447
11448
11449
11450
11451
11452
11453
11454
11455
11456
11457
11458
11459
11460
11461
11462
11463
11464
11465
11466
11467
11468
11469
11470
11471
11472
11473
11474
11475
11476
11477
11478
11479
11480
11481
11482
11483
11484
11485
11486
11487
11488
11489
11490
11491
11492
11493
11494
11495
11496
11497
11498
11499
11500
11501
11502
11503
11504
11505
11506
11507
11508
11509
11510
11511
11512
11513
11514
11515
11516
11517
11518
11519
11520
11521
11522
11523
11524
11525
11526
11527
11528
11529
11530
11531
11532
11533
11534
11535
11536
11537
11538
11539
11540
11541
11542
11543
11544
11545
11546
11547
11548
11549
11550
11551
11552
11553
11554
11555
11556
11557
11558
11559
11560
11561
11562
11563
11564
11565
11566
11567
11568
11569
11570
11571
11572
11573
11574
11575
11576
11577
11578
11579
11580
11581
11582
11583
11584
11585
11586
11587
11588
11589
11590
11591
11592
11593
11594
11595
11596
11597
11598
11599
11600
11601
11602
11603
11604
11605
11606
11607
11608
11609
11610
11611
11612
11613
11614
11615
11616
11617
11618
11619
11620
11621
11622
11623
11624
11625
11626
11627
11628
11629
11630
11631
11632
11633
11634
11635
11636
11637
11638
11639
11640
11641
11642
11643
11644
11645
11646
11647
11648
11649
11650
11651
11652
11653
11654
11655
11656
11657
11658
11659
11660
11661
11662
11663
11664
11665
11666
11667
11668
11669
11670
11671
11672
11673
11674
11675
11676
11677
11678
11679
11680
11681
11682
11683
11684
11685
11686
11687
11688
11689
11690
11691
11692
11693
11694
11695
11696
11697
11698
11699
11700
11701
11702
11703
11704
11705
11706
11707
11708
11709
11710
11711
11712
11713
11714
11715
11716
11717
11718
11719
11720
11721
11722
11723
11724
11725
11726
11727
11728
11729
11730
11731
11732
11733
11734
11735
11736
11737
11738
11739
11740
11741
11742
11743
11744
11745
11746
11747
11748
11749
11750
11751
11752
11753
11754
11755
11756
11757
11758
11759
11760
11761
11762
11763
11764
11765
11766
11767
11768
11769
11770
11771
11772
11773
11774
11775
11776
11777
11778
11779
11780
11781
11782
11783
11784
11785
11786
11787
11788
11789
11790
11791
11792
11793
11794
11795
11796
11797
11798
11799
11800
11801
11802
11803
11804
11805
11806
11807
11808
11809
11810
11811
11812
11813
11814
11815
11816
11817
11818
11819
11820
11821
11822
11823
11824
11825
11826
11827
11828
11829
11830
11831
11832
11833
11834
11835
11836
11837
11838
11839
11840
11841
11842
11843
11844
11845
11846
11847
11848
11849
11850
11851
11852
11853
11854
11855
11856
11857
11858
11859
11860
11861
11862
11863
11864
11865
11866
11867
11868
11869
11870
11871
11872
11873
11874
11875
11876
11877
11878
11879
11880
11881
11882
11883
11884
11885
11886
11887
11888
11889
11890
11891
11892
11893
11894
11895
11896
11897
11898
11899
11900
11901
11902
11903
11904
11905
11906
11907
11908
11909
11910
11911
11912
11913
11914
11915
11916
11917
11918
11919
11920
11921
11922
11923
11924
11925
11926
11927
11928
11929
11930
11931
11932
11933
11934
11935
11936
11937
11938
11939
11940
11941
11942
11943
11944
11945
11946
11947
11948
11949
11950
11951
11952
11953
11954
11955
11956
11957
11958
11959
11960
11961
11962
11963
11964
11965
11966
11967
11968
11969
11970
11971
11972
11973
11974
11975
11976
11977
11978
11979
11980
11981
11982
11983
11984
11985
11986
11987
11988
11989
11990
11991
11992
11993
11994
11995
11996
11997
11998
11999
12000
12001
12002
12003
12004
12005
12006
12007
12008
12009
12010
12011
12012
12013
12014
12015
12016
12017
12018
12019
12020
12021
12022
12023
12024
12025
12026
12027
12028
12029
12030
12031
12032
12033
12034
12035
12036
12037
12038
12039
12040
12041
12042
12043
12044
12045
12046
12047
12048
12049
12050
12051
12052
12053
12054
12055
12056
12057
12058
12059
12060
12061
12062
12063
12064
12065
12066
12067
12068
12069
12070
12071
12072
12073
12074
12075
12076
12077
12078
12079
12080
12081
12082
12083
12084
12085
12086
12087
12088
12089
12090
12091
12092
12093
12094
12095
12096
12097
12098
12099
12100
12101
12102
12103
12104
12105
12106
12107
12108
12109
12110
12111
12112
12113
12114
12115
12116
12117
12118
12119
12120
12121
12122
12123
12124
12125
12126
12127
12128
12129
12130
12131
12132
12133
12134
12135
12136
12137
12138
12139
12140
12141
12142
12143
12144
12145
12146
12147
12148
12149
12150
12151
12152
12153
12154
12155
12156
12157
12158
12159
12160
12161
12162
12163
12164
12165
12166
12167
12168
12169
12170
12171
12172
12173
12174
12175
12176
12177
12178
12179
12180
12181
12182
12183
12184
12185
12186
12187
12188
12189
12190
12191
12192
12193
12194
12195
12196
12197
12198
12199
12200
12201
12202
12203
12204
12205
12206
12207
12208
12209
12210
12211
12212
12213
12214
12215
12216
12217
12218
12219
12220
12221
12222
12223
12224
12225
12226
12227
12228
12229
12230
12231
12232
12233
12234
12235
12236
12237
12238
12239
12240
12241
12242
12243
12244
12245
12246
12247
12248
12249
12250
12251
12252
12253
12254
12255
12256
12257
12258
12259
12260
12261
12262
12263
12264
12265
12266
12267
12268
12269
12270
12271
12272
12273
12274
12275
12276
12277
12278
12279
12280
12281
12282
12283
12284
12285
12286
12287
12288
12289
12290
12291
12292
12293
12294
12295
12296
12297
12298
12299
12300
12301
12302
12303
12304
12305
12306
12307
12308
12309
12310
12311
12312
12313
12314
12315
12316
12317
12318
12319
12320
12321
12322
12323
12324
12325
12326
12327
12328
12329
12330
12331
12332
12333
12334
12335
12336
12337
12338
12339
12340
12341
12342
12343
12344
12345
12346
12347
12348
12349
12350
12351
12352
12353
12354
12355
12356
12357
12358
12359
12360
12361
12362
12363
12364
12365
12366
12367
12368
12369
12370
12371
12372
12373
12374
12375
12376
12377
12378
12379
12380
12381
12382
12383
12384
12385
12386
12387
12388
12389
12390
12391
12392
12393
12394
12395
12396
12397
12398
12399
12400
12401
12402
12403
12404
12405
12406
12407
12408
12409
12410
12411
12412
12413
12414
12415
12416
12417
12418
12419
12420
12421
12422
12423
12424
12425
12426
12427
12428
12429
12430
12431
12432
12433
12434
12435
12436
12437
12438
12439
12440
12441
12442
12443
12444
12445
12446
12447
12448
12449
12450
12451
12452
12453
12454
12455
12456
12457
12458
12459
12460
12461
12462
12463
12464
12465
12466
12467
12468
12469
12470
12471
12472
12473
12474
12475
12476
12477
12478
12479
12480
12481
12482
12483
12484
12485
12486
12487
12488
12489
12490
12491
12492
12493
12494
12495
12496
12497
12498
12499
12500
12501
12502
12503
12504
12505
12506
12507
12508
12509
12510
12511
12512
12513
12514
12515
12516
12517
12518
12519
12520
12521
12522
12523
12524
12525
12526
12527
12528
12529
12530
12531
12532
12533
12534
12535
12536
12537
12538
12539
12540
12541
12542
12543
12544
12545
12546
12547
12548
12549
12550
12551
12552
12553
12554
12555
12556
12557
12558
12559
12560
12561
12562
12563
12564
12565
12566
12567
12568
12569
12570
12571
12572
12573
12574
12575
12576
12577
12578
12579
12580
12581
12582
12583
12584
12585
12586
12587
12588
12589
12590
12591
12592
12593
12594
12595
12596
12597
12598
12599
12600
12601
12602
12603
12604
12605
12606
12607
12608
12609
12610
12611
12612
12613
12614
12615
12616
12617
12618
12619
12620
12621
12622
12623
12624
12625
12626
12627
12628
12629
12630
12631
12632
12633
12634
12635
12636
12637
12638
12639
12640
12641
12642
12643
12644
12645
12646
12647
12648
12649
12650
12651
12652
12653
12654
12655
12656
12657
12658
12659
12660
12661
12662
12663
12664
12665
12666
12667
12668
12669
12670
12671
12672
12673
12674
12675
12676
12677
12678
12679
12680
12681
12682
12683
12684
12685
12686
12687
12688
12689
12690
12691
12692
12693
12694
12695
12696
12697
12698
12699
12700
12701
12702
12703
12704
12705
12706
12707
12708
12709
12710
12711
12712
12713
12714
12715
12716
12717
12718
12719
12720
12721
12722
12723
12724
12725
12726
12727
12728
12729
12730
12731
12732
12733
12734
12735
12736
12737
12738
12739
12740
12741
12742
12743
12744
12745
12746
12747
12748
12749
12750
12751
12752
12753
12754
12755
12756
12757
12758
12759
12760
12761
12762
12763
12764
12765
12766
12767
12768
12769
12770
12771
12772
12773
12774
12775
12776
12777
12778
12779
12780
12781
12782
12783
12784
12785
12786
12787
12788
12789
12790
12791
12792
12793
12794
12795
12796
12797
12798
12799
12800
12801
12802
12803
12804
12805
12806
12807
12808
12809
12810
12811
12812
12813
12814
12815
12816
12817
12818
12819
12820
12821
12822
12823
12824
12825
12826
12827
12828
12829
12830
12831
12832
12833
12834
12835
12836
12837
12838
12839
12840
12841
12842
12843
12844
12845
12846
12847
12848
12849
12850
12851
12852
12853
12854
12855
12856
12857
12858
12859
12860
12861
12862
12863
12864
12865
12866
12867
12868
12869
12870
12871
12872
12873
12874
12875
12876
12877
12878
12879
12880
12881
12882
12883
12884
12885
12886
12887
12888
12889
12890
12891
12892
12893
12894
12895
12896
12897
12898
12899
12900
12901
12902
12903
12904
12905
12906
12907
12908
12909
12910
12911
12912
12913
12914
12915
12916
12917
12918
12919
12920
12921
12922
12923
12924
12925
12926
12927
12928
12929
12930
12931
12932
12933
12934
12935
12936
12937
12938
12939
12940
12941
12942
12943
12944
12945
12946
12947
12948
12949
12950
12951
12952
12953
12954
12955
12956
12957
12958
12959
12960
12961
12962
12963
12964
12965
12966
12967
12968
12969
12970
12971
12972
12973
12974
12975
12976
12977
12978
12979
12980
12981
12982
12983
12984
12985
12986
12987
12988
12989
12990
12991
12992
12993
12994
12995
12996
12997
12998
12999
13000
13001
13002
13003
13004
13005
13006
13007
13008
13009
13010
13011
13012
13013
13014
13015
13016
13017
13018
13019
13020
13021
13022
13023
13024
13025
13026
13027
13028
13029
13030
13031
13032
13033
13034
13035
13036
13037
13038
13039
13040
13041
13042
13043
13044
13045
13046
13047
13048
13049
13050
13051
13052
13053
13054
13055
13056
13057
13058
13059
13060
13061
13062
13063
13064
13065
13066
13067
13068
13069
13070
13071
13072
13073
13074
13075
13076
13077
13078
13079
13080
13081
13082
13083
13084
13085
13086
13087
13088
13089
13090
13091
13092
13093
13094
13095
13096
13097
13098
13099
13100
13101
13102
13103
13104
13105
13106
13107
13108
13109
13110
13111
13112
13113
13114
13115
13116
13117
13118
13119
13120
13121
13122
13123
13124
13125
13126
13127
13128
13129
13130
13131
13132
13133
13134
13135
13136
13137
13138
13139
13140
13141
13142
13143
13144
13145
13146
13147
13148
13149
13150
13151
13152
13153
13154
13155
13156
13157
13158
13159
13160
13161
13162
13163
13164
13165
13166
13167
13168
13169
13170
13171
13172
13173
13174
13175
13176
13177
13178
13179
13180
13181
13182
13183
13184
13185
13186
13187
13188
13189
13190
13191
13192
13193
13194
13195
13196
13197
13198
13199
13200
13201
13202
13203
13204
13205
13206
13207
13208
13209
13210
13211
13212
13213
13214
13215
13216
13217
13218
13219
13220
13221
13222
13223
13224
13225
13226
13227
13228
13229
13230
13231
13232
13233
13234
13235
13236
13237
13238
13239
13240
13241
13242
13243
13244
13245
13246
13247
13248
13249
13250
13251
13252
13253
13254
13255
13256
13257
13258
13259
13260
13261
13262
13263
13264
13265
13266
13267
13268
13269
13270
13271
13272
13273
13274
13275
13276
13277
13278
13279
13280
13281
13282
13283
13284
13285
13286
13287
13288
13289
13290
13291
13292
13293
13294
13295
13296
13297
13298
13299
13300
13301
13302
13303
13304
13305
13306
13307
13308
13309
13310
13311
13312
13313
13314
13315
13316
13317
13318
13319
13320
13321
13322
13323
13324
13325
13326
13327
13328
13329
13330
13331
13332
13333
13334
13335
13336
13337
13338
13339
13340
13341
13342
13343
13344
13345
13346
13347
13348
13349
13350
13351
13352
13353
13354
13355
13356
13357
13358
13359
13360
13361
13362
13363
13364
13365
13366
13367
13368
13369
13370
13371
13372
13373
13374
13375
13376
13377
13378
13379
13380
13381
13382
13383
13384
13385
13386
13387
13388
13389
13390
13391
13392
13393
13394
13395
13396
13397
13398
13399
13400
13401
13402
13403
13404
13405
13406
13407
13408
13409
13410
13411
13412
13413
13414
13415
13416
13417
13418
13419
13420
13421
13422
13423
13424
13425
13426
13427
13428
13429
13430
13431
13432
13433
13434
13435
13436
13437
13438
13439
13440
13441
13442
13443
13444
13445
13446
13447
13448
13449
13450
13451
13452
13453
13454
13455
13456
13457
13458
13459
13460
13461
13462
13463
13464
13465
13466
13467
13468
13469
13470
13471
13472
13473
13474
13475
13476
13477
13478
13479
13480
13481
13482
13483
13484
13485
13486
13487
13488
13489
13490
13491
13492
13493
13494
13495
13496
13497
13498
13499
13500
13501
13502
13503
13504
13505
13506
13507
13508
13509
13510
13511
13512
13513
13514
13515
13516
13517
13518
13519
13520
13521
13522
13523
13524
13525
13526
13527
13528
13529
13530
13531
13532
13533
13534
13535
13536
13537
13538
13539
13540
13541
13542
13543
13544
13545
13546
13547
13548
13549
13550
13551
13552
13553
13554
13555
13556
13557
13558
13559
13560
13561
13562
13563
13564
13565
13566
13567
13568
13569
13570
13571
13572
13573
13574
13575
13576
13577
13578
13579
13580
13581
13582
13583
13584
13585
13586
13587
13588
13589
13590
13591
13592
13593
13594
13595
13596
13597
13598
13599
13600
13601
13602
13603
13604
13605
13606
13607
13608
13609
13610
13611
13612
13613
13614
13615
13616
13617
13618
13619
13620
13621
13622
13623
13624
13625
13626
13627
13628
13629
13630
13631
13632
13633
13634
13635
13636
13637
13638
13639
13640
13641
13642
13643
13644
13645
13646
13647
13648
13649
13650
13651
13652
13653
13654
13655
13656
13657
13658
13659
13660
13661
13662
13663
13664
13665
13666
13667
13668
13669
13670
13671
13672
13673
13674
13675
13676
13677
13678
13679
13680
13681
13682
13683
13684
13685
13686
13687
13688
13689
13690
13691
13692
13693
13694
13695
13696
13697
13698
13699
13700
13701
13702
13703
13704
13705
13706
13707
13708
13709
13710
13711
13712
13713
13714
13715
13716
13717
13718
13719
13720
13721
13722
13723
13724
13725
13726
13727
13728
13729
13730
13731
13732
13733
13734
13735
13736
13737
13738
13739
13740
13741
13742
13743
13744
13745
13746
13747
13748
13749
13750
13751
13752
13753
13754
13755
13756
13757
13758
13759
13760
13761
13762
13763
13764
13765
13766
13767
13768
13769
13770
13771
13772
13773
13774
13775
13776
13777
13778
13779
13780
13781
13782
13783
13784
13785
13786
13787
13788
13789
13790
13791
13792
13793
13794
13795
13796
13797
13798
13799
13800
13801
13802
13803
13804
13805
13806
13807
13808
13809
13810
13811
13812
13813
13814
13815
13816
13817
13818
13819
13820
13821
13822
13823
13824
13825
13826
13827
13828
13829
13830
13831
13832
13833
13834
13835
13836
13837
13838
13839
13840
13841
13842
13843
13844
13845
13846
13847
13848
13849
13850
13851
13852
13853
13854
13855
13856
13857
13858
13859
13860
13861
13862
13863
13864
13865
13866
13867
13868
13869
13870
13871
13872
13873
13874
13875
13876
13877
13878
13879
13880
13881
13882
13883
13884
13885
13886
13887
13888
13889
13890
13891
13892
13893
13894
13895
13896
13897
13898
13899
13900
13901
13902
13903
13904
13905
13906
13907
13908
13909
13910
13911
13912
13913
13914
13915
13916
13917
13918
13919
13920
13921
13922
13923
13924
13925
13926
13927
13928
13929
13930
13931
13932
13933
13934
13935
13936
13937
13938
13939
13940
13941
13942
13943
13944
13945
13946
13947
13948
13949
13950
13951
13952
13953
13954
13955
13956
13957
13958
13959
13960
13961
13962
13963
13964
13965
13966
13967
13968
13969
13970
13971
13972
13973
13974
13975
13976
13977
13978
13979
13980
13981
13982
13983
13984
13985
13986
13987
13988
13989
13990
13991
13992
13993
13994
13995
13996
13997
13998
13999
14000
14001
14002
14003
14004
14005
14006
14007
14008
14009
14010
14011
14012
14013
14014
14015
14016
14017
14018
14019
14020
14021
14022
14023
14024
14025
14026
14027
14028
14029
14030
14031
14032
14033
14034
14035
14036
14037
14038
14039
14040
14041
14042
14043
14044
14045
14046
14047
14048
14049
14050
14051
14052
14053
14054
14055
14056
14057
14058
14059
14060
14061
14062
14063
14064
14065
14066
14067
14068
14069
14070
14071
14072
14073
14074
14075
14076
14077
14078
14079
14080
14081
14082
14083
14084
14085
14086
14087
14088
14089
14090
14091
14092
14093
14094
14095
14096
14097
14098
14099
14100
14101
14102
14103
14104
14105
14106
14107
14108
14109
14110
14111
14112
14113
14114
14115
14116
14117
14118
14119
14120
14121
14122
14123
14124
14125
14126
14127
14128
14129
14130
14131
14132
14133
14134
14135
14136
14137
14138
14139
14140
14141
14142
14143
14144
14145
14146
14147
14148
14149
14150
14151
14152
14153
14154
14155
14156
14157
14158
14159
14160
14161
14162
14163
14164
14165
14166
14167
14168
14169
14170
14171
14172
14173
14174
14175
14176
14177
14178
14179
14180
14181
14182
14183
14184
14185
14186
14187
14188
14189
14190
14191
14192
14193
14194
14195
14196
14197
14198
14199
14200
14201
14202
14203
14204
14205
14206
14207
14208
14209
14210
14211
14212
14213
14214
14215
14216
14217
14218
14219
14220
14221
14222
14223
14224
14225
14226
14227
14228
14229
14230
14231
14232
14233
14234
14235
14236
14237
14238
14239
14240
14241
14242
14243
14244
14245
14246
14247
14248
14249
14250
14251
14252
14253
14254
14255
14256
14257
14258
14259
14260
14261
14262
14263
14264
14265
14266
14267
14268
14269
14270
14271
14272
14273
14274
14275
14276
14277
14278
14279
14280
14281
14282
14283
14284
14285
14286
14287
14288
14289
14290
14291
14292
14293
14294
14295
14296
14297
14298
14299
14300
14301
14302
14303
14304
14305
14306
14307
14308
14309
14310
14311
14312
14313
14314
14315
14316
14317
14318
14319
14320
14321
14322
14323
14324
14325
14326
14327
14328
14329
14330
14331
14332
14333
14334
14335
14336
14337
14338
14339
14340
14341
14342
14343
14344
14345
14346
14347
14348
14349
14350
14351
14352
14353
14354
14355
14356
14357
14358
14359
14360
14361
14362
14363
14364
14365
14366
14367
14368
14369
14370
14371
14372
14373
14374
14375
14376
14377
14378
14379
14380
14381
14382
14383
14384
14385
14386
14387
14388
14389
14390
14391
14392
14393
14394
14395
14396
14397
14398
14399
14400
14401
14402
14403
14404
14405
14406
14407
14408
14409
14410
14411
14412
14413
14414
14415
14416
14417
14418
14419
14420
14421
14422
14423
14424
14425
14426
14427
14428
14429
14430
14431
14432
14433
14434
14435
14436
14437
14438
14439
14440
14441
14442
14443
14444
14445
14446
14447
14448
14449
14450
14451
14452
14453
14454
14455
14456
14457
14458
14459
14460
14461
14462
14463
14464
14465
14466
14467
14468
14469
14470
14471
14472
14473
14474
14475
14476
14477
14478
14479
14480
14481
14482
14483
14484
14485
14486
14487
14488
14489
14490
14491
14492
14493
14494
14495
14496
14497
14498
14499
14500
14501
14502
14503
14504
14505
14506
14507
14508
14509
14510
14511
14512
14513
14514
14515
14516
14517
14518
14519
14520
14521
14522
14523
14524
14525
14526
14527
14528
14529
14530
14531
14532
14533
14534
14535
14536
14537
14538
14539
14540
14541
14542
14543
14544
14545
14546
14547
14548
14549
14550
14551
14552
14553
14554
14555
14556
14557
14558
14559
14560
14561
14562
14563
14564
14565
14566
14567
14568
14569
14570
14571
14572
14573
14574
14575
14576
14577
14578
14579
14580
14581
14582
14583
14584
14585
14586
14587
14588
14589
14590
14591
14592
14593
14594
14595
14596
14597
14598
14599
14600
14601
14602
14603
14604
14605
14606
14607
14608
14609
14610
14611
14612
14613
14614
14615
14616
14617
14618
// SPDX-License-Identifier: GPL-2.0
/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/idr.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/reboot.h>
#include <linux/vmstat.h>
#include <linux/device.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/hugetlb.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
#include <linux/trace_events.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/compat.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/namei.h>
#include <linux/parser.h>
#include <linux/sched/clock.h>
#include <linux/sched/mm.h>
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
#include <linux/highmem.h>
#include <linux/pgtable.h>
#include <linux/buildid.h>
#include <linux/task_work.h>
#include <linux/percpu-rwsem.h>

#include "internal.h"

#include <asm/irq_regs.h>

/* Callback type for functions executed through remote_function(). */
typedef int (*remote_function_f)(void *);

/*
 * Argument bundle passed to remote_function() through
 * smp_call_function_single().
 */
struct remote_function_call {
        /* Task that must be current on the target CPU; NULL skips the task checks. */
        struct task_struct        *p;
        /* Callback to invoke on the target CPU. */
        remote_function_f        func;
        /* Opaque argument handed to @func. */
        void                        *info;
        /* Result slot: preset by the caller, overwritten by remote_function(). */
        int                        ret;
};

/*
 * Cross-call trampoline: invoke call->func(call->info) and store its result
 * in call->ret.
 *
 * When call->p is set, the callback only runs if that task is the one
 * currently executing on this CPU:
 *  - task_cpu(p) is not this CPU: return with call->ret untouched (the
 *    caller's preset, -EAGAIN in task_function_call(), stays in place);
 *  - p is not current: call->ret is -ESRCH.
 */
static void remote_function(void *data)
{
        struct remote_function_call *call = data;
        struct task_struct *target = call->p;

        /* No task constraint: just run the callback on this CPU. */
        if (!target) {
                call->ret = call->func(call->info);
                return;
        }

        /* The task is associated with another CPU; leave the preset result. */
        if (task_cpu(target) != smp_processor_id())
                return;

        /*
         * We are on the task's CPU inside the cross-call, so comparing
         * against current tells us whether the task is the one running here.
         */
        call->ret = -ESRCH; /* No such (running) process */
        if (target == current)
                call->ret = call->func(call->info);
}

/**
 * task_function_call - run a function on the CPU where a task is executing
 * @p:                task that must be running for @func to be invoked
 * @func:        callback to invoke
 * @info:        argument handed to @func
 *
 * Cross-calls to task_cpu(@p) via smp_call_function_single() and, through
 * remote_function(), invokes @func there if @p is the task currently running
 * on that CPU. This may be the local CPU. The attempt is repeated, with a
 * cond_resched() in between, for as long as the outcome is -EAGAIN (the
 * preset result, left in place when task_cpu(@p) turned out not to be the
 * CPU the call landed on).
 *
 * Return: the value returned by @func, -ESRCH when @p was not the running
 * task on its CPU, or any other non-zero, non-EAGAIN error reported by
 * smp_call_function_single() (the original note mentions -ENXIO).
 */
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .ret        = -EAGAIN,
                .info        = info,
                .func        = func,
                .p        = p,
        };
        int rc;

        for (;;) {
                rc = smp_call_function_single(task_cpu(p), remote_function,
                                              &call, 1);
                /* The cross-call itself worked; use what the callback side reported. */
                if (rc == 0)
                        rc = call.ret;

                if (rc == -EAGAIN) {
                        cond_resched();
                        continue;
                }

                return rc;
        }
}

/**
 * cpu_function_call - run a function on a given CPU
 * @cpu:        CPU on which @func should run
 * @func:        callback to invoke
 * @info:        argument handed to @func
 *
 * Issues a synchronous smp_call_function_single() to @cpu. The return value
 * of that call is not inspected; instead the result slot is preset to -ENXIO
 * and only overwritten if remote_function() actually ran @func.
 *
 * Return: the value returned by @func, or -ENXIO if @func was never run
 * (documented by the original as "the cpu is offline").
 */
static int cpu_function_call(int cpu, remote_function_f func, void *info)
{
        struct remote_function_call call = {
                .ret        = -ENXIO, /* No such CPU */
                .info        = info,
                .func        = func,
                .p        = NULL,
        };

        smp_call_function_single(cpu, remote_function, &call, 1);
        return call.ret;
}

/*
 * Bit flags describing classes of context state. In this file they are
 * tested against and stored in perf_event_context::is_active (see
 * __perf_ctx_lock() and __perf_ctx_unlock()).
 */
enum event_type_t {
        EVENT_FLEXIBLE        = 0x01,
        EVENT_PINNED        = 0x02,
        EVENT_TIME        = 0x04,
        /*
         * Must not be set when a context lock is taken; cleared again (possibly
         * together with the rest of is_active) when the lock is dropped.
         */
        EVENT_FROZEN        = 0x08,
        /* see ctx_resched() for details */
        EVENT_CPU        = 0x10,
        EVENT_CGROUP        = 0x20,

        /* compound helpers */
        EVENT_ALL         = EVENT_FLEXIBLE | EVENT_PINNED,
        EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN,
};

/*
 * Take ctx->lock. EVENT_FROZEN is expected to be clear in ctx->is_active at
 * this point; a violation triggers a one-time warning.
 */
static inline void __perf_ctx_lock(struct perf_event_context *ctx)
{
        raw_spin_lock(&ctx->lock);
        WARN_ON_ONCE(ctx->is_active & EVENT_FROZEN);
}

/*
 * Lock the CPU context embedded in @cpuctx and then, if @ctx is non-NULL,
 * @ctx as well. perf_ctx_unlock() releases them in the reverse order.
 */
static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
                          struct perf_event_context *ctx)
{
        __perf_ctx_lock(&cpuctx->ctx);
        if (ctx)
                __perf_ctx_lock(ctx);
}

/*
 * Drop ctx->lock, first tidying up a pending EVENT_FROZEN marker in
 * ctx->is_active.
 */
static inline void __perf_ctx_unlock(struct perf_event_context *ctx)
{
        if (ctx->is_active & EVENT_FROZEN) {
                /*
                 * Some EVENT_ALL bit is set again (ctx_sched_in() ran after
                 * ctx_sched_out()): only remove the FROZEN marker. Otherwise
                 * nothing was scheduled back in, so finish the sched-out by
                 * clearing is_active completely.
                 */
                if (ctx->is_active & EVENT_ALL)
                        ctx->is_active &= ~EVENT_FROZEN;
                else
                        ctx->is_active = 0;
        }

        raw_spin_unlock(&ctx->lock);
}

/*
 * Counterpart of perf_ctx_lock(): unlock @ctx first (when non-NULL), then
 * the CPU context embedded in @cpuctx.
 */
static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
                            struct perf_event_context *ctx)
{
        if (ctx)
                __perf_ctx_unlock(ctx);
        __perf_ctx_unlock(&cpuctx->ctx);
}

/*
 * Sentinel pointer value. In this file it is compared against ctx->task and
 * event->owner.
 */
#define TASK_TOMBSTONE ((void *)-1L)

/* Return true when @event's owner field holds the TASK_TOMBSTONE sentinel. */
static bool is_kernel_event(struct perf_event *event)
{
        void *owner = READ_ONCE(event->owner);

        return owner == TASK_TOMBSTONE;
}

static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

struct perf_event_context *perf_cpu_task_ctx(void)
{
        lockdep_assert_irqs_disabled();
        return this_cpu_ptr(&perf_cpu_context)->task_ctx;
}

/*
 * On task ctx scheduling...
 *
 * When !ctx->nr_events a task context will not be scheduled. This means
 * we can disable the scheduler hooks (for performance) without leaving
 * pending task ctx state.
 *
 * This however results in two special cases:
 *
 *  - removing the last event from a task ctx; this is relatively
 *    straightforward and is done in __perf_remove_from_context.
 *
 *  - adding the first event to a task ctx; this is tricky because we cannot
 *    rely on ctx->is_active and therefore cannot use event_function_call().
 *    See perf_install_in_context().
 *
 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 */

/*
 * Operation applied to an event. In this file it is called as
 * func(event, cpuctx, ctx, data); cpuctx is NULL on the inactive-context
 * path of event_function_call().
 */
typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
                        struct perf_event_context *, void *);

/* Argument bundle handed to event_function() through the cross-call helpers. */
struct event_function_struct {
        struct perf_event *event;        /* event to operate on */
        event_f func;                        /* operation to run */
        void *data;                        /* opaque argument passed to @func */
};

/*
 * Cross-call handler used by event_function_call(): runs efs->func for
 * efs->event with this CPU's context and its current task context locked.
 *
 * @info: pointer to a struct event_function_struct.
 *
 * Must run with interrupts disabled.
 *
 * Returns 0 after calling efs->func, or -ESRCH when the event's context
 * belongs to a task other than current (efs->func is not called then).
 */
static int event_function(void *info)
{
        struct event_function_struct *efs = info;
        struct perf_event *event = efs->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        int ret = 0;

        lockdep_assert_irqs_disabled();

        /* Locks the CPU context plus whatever task context is installed here (may be NULL). */
        perf_ctx_lock(cpuctx, task_ctx);
        /*
         * Since we do the IPI call without holding ctx->lock things can have
         * changed, double check we hit the task we set out to hit.
         */
        if (ctx->task) {
                if (ctx->task != current) {
                        ret = -ESRCH;
                        goto unlock;
                }

                /*
                 * We only use event_function_call() on established contexts,
                 * and event_function() is only ever called when active (or
                 * rather, we'll have bailed in task_function_call() or the
                 * above ctx->task != current test), therefore we must have
                 * ctx->is_active here.
                 */
                WARN_ON_ONCE(!ctx->is_active);
                /*
                 * And since we have ctx->is_active, cpuctx->task_ctx must
                 * match.
                 */
                WARN_ON_ONCE(task_ctx != ctx);
        } else {
                /* CPU context: it must be the one embedded in this CPU's cpuctx. */
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        efs->func(event, cpuctx, ctx, efs->data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/*
 * Run @func on @event in the place appropriate for the event's context:
 *
 *  - ctx->task == NULL (CPU context): cross-call event_function() on
 *    event->cpu;
 *  - ctx->task == TASK_TOMBSTONE: nothing is done;
 *  - otherwise (task context): try task_function_call(). If that does not
 *    return 0, take the context locks locally; if the context is not active,
 *    call @func directly with a NULL cpuctx, and if it has become active in
 *    the meantime, drop the locks and retry the cross-call.
 *
 * For events without a parent, the caller must hold ctx->mutex.
 */
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
        struct perf_cpu_context *cpuctx;
        struct event_function_struct efs = {
                .event = event,
                .func = func,
                .data = data,
        };

        if (!event->parent) {
                /*
                 * If this is a !child event, we must hold ctx::mutex to
                 * stabilize the event->ctx relation. See
                 * perf_event_ctx_lock().
                 */
                lockdep_assert_held(&ctx->mutex);
        }

        if (!task) {
                /* The return value is deliberately not checked here. */
                cpu_function_call(event->cpu, event_function, &efs);
                return;
        }

        if (task == TASK_TOMBSTONE)
                return;

again:
        /* 0 means event_function() ran @func on the task's CPU; we are done. */
        if (!task_function_call(task, event_function, &efs))
                return;

        local_irq_disable();
        cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
        /*
         * Reload the task pointer, it might have been changed by
         * a concurrent perf_event_context_sched_out().
         */
        task = ctx->task;
        if (task == TASK_TOMBSTONE)
                goto unlock;
        if (ctx->is_active) {
                /* The context got scheduled in meanwhile; go back to the cross-call. */
                perf_ctx_unlock(cpuctx, ctx);
                local_irq_enable();
                goto again;
        }
        /* Context is inactive and locked: operate on it directly, without a cpuctx. */
        func(event, NULL, ctx, data);
unlock:
        perf_ctx_unlock(cpuctx, ctx);
        local_irq_enable();
}

/*
 * Similar to event_function_call() + event_function(), but hard assumes IRQs
 * are already disabled and we're on the right CPU: no cross-call is made.
 *
 * Nothing is done when ctx->task is TASK_TOMBSTONE. For an active task
 * context, @func is skipped (with a one-time warning) unless the context
 * belongs to current and is the task context installed on this CPU.
 */
static void event_function_local(struct perf_event *event, event_f func, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *task = READ_ONCE(ctx->task);
        struct perf_event_context *task_ctx = NULL;

        lockdep_assert_irqs_disabled();

        if (task) {
                if (task == TASK_TOMBSTONE)
                        return;

                /* Task context: it has to be locked in addition to the CPU context. */
                task_ctx = ctx;
        }

        perf_ctx_lock(cpuctx, task_ctx);

        /* Re-read under the lock; the unlocked read above may be stale. */
        task = ctx->task;
        if (task == TASK_TOMBSTONE)
                goto unlock;

        if (task) {
                /*
                 * We must be either inactive or active and the right task,
                 * otherwise we're screwed, since we cannot IPI to somewhere
                 * else.
                 */
                if (ctx->is_active) {
                        if (WARN_ON_ONCE(task != current))
                                goto unlock;

                        if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
                                goto unlock;
                }
        } else {
                /* CPU context: it must be the one embedded in this CPU's cpuctx. */
                WARN_ON_ONCE(&cpuctx->ctx != ctx);
        }

        func(event, cpuctx, ctx, data);
unlock:
        perf_ctx_unlock(cpuctx, task_ctx);
}

/*
 * Union of the four PERF_FLAG_* bits listed below.
 * NOTE(review): presumably used to reject unknown flag bits from user space;
 * confirm against the users of this mask.
 */
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
                       PERF_FLAG_FD_OUTPUT  |\
                       PERF_FLAG_PID_CGROUP |\
                       PERF_FLAG_FD_CLOEXEC)

/*
 * Branch-sampling privilege levels (kernel and hypervisor) that need
 * permission checks.
 */
#define PERF_SAMPLE_BRANCH_PERM_PLM \
        (PERF_SAMPLE_BRANCH_KERNEL |\
         PERF_SAMPLE_BRANCH_HV)

/*
 * perf_sched_events : >0 events exist
 */

/* Handler of the perf_sched_work delayed work item declared below. */
static void perf_sched_delayed(struct work_struct *work);
/* Static key, initially false. */
DEFINE_STATIC_KEY_FALSE(perf_sched_events);
static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
/*
 * NOTE(review): presumably the mutex and counter that guard switching
 * perf_sched_events on and off; confirm against their users.
 */
static DEFINE_MUTEX(perf_sched_mutex);
static atomic_t perf_sched_count;

/* One pmu_event_list per CPU. */
static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);

/*
 * Counters, one per kind of record.
 * NOTE(review): presumably the number of existing events that requested
 * each record type; confirm where they are incremented.
 */
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_namespaces_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
static atomic_t nr_freq_events __read_mostly;
static atomic_t nr_switch_events __read_mostly;
static atomic_t nr_ksymbol_events __read_mostly;
static atomic_t nr_bpf_events __read_mostly;
static atomic_t nr_cgroup_events __read_mostly;
static atomic_t nr_text_poke_events __read_mostly;
static atomic_t nr_build_id_events __read_mostly;

/*
 * NOTE(review): by their names, the list of PMUs with its mutex and SRCU
 * domain; confirm against the registration code.
 */
static LIST_HEAD(pmus);
static DEFINE_MUTEX(pmus_lock);
static struct srcu_struct pmus_srcu;
/*
 * CPU masks, one per scope (all, core, die, cluster, package, system).
 * NOTE(review): presumably maintained by the CPU hotplug callbacks; confirm.
 */
static cpumask_var_t perf_online_mask;
static cpumask_var_t perf_online_core_mask;
static cpumask_var_t perf_online_die_mask;
static cpumask_var_t perf_online_cluster_mask;
static cpumask_var_t perf_online_pkg_mask;
static cpumask_var_t perf_online_sys_mask;
/* NOTE(review): presumably the slab cache that perf_event objects come from; confirm. */
static struct kmem_cache *perf_event_cache;

/*
 * perf event paranoia level (exposed as kernel.perf_event_paranoid):
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 2;

/*
 * Exposed as kernel.perf_event_mlock_kb. Default: 512 kiB plus one page
 * (PAGE_SIZE / 1024 kiB) for the user control page. This amount is 'free'
 * per user.
 */
static int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024);

/*
 * Defaults: maximum perf event sample rate, the sample period in ns derived
 * from it, and the maximum share of CPU time (percent).
 */
#define DEFAULT_MAX_SAMPLE_RATE                100000
#define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT        25

/* Backing variables for kernel.perf_event_max_sample_rate and kernel.perf_cpu_time_max_percent. */
int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;
static int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;

/* Derived from the sample rate; recomputed whenever the rate changes. */
static int max_samples_per_tick __read_mostly        = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
static int perf_sample_period_ns __read_mostly        = DEFAULT_SAMPLE_PERIOD_NS;

/*
 * Allowed average sample duration in ns: perf_sample_period_ns scaled by the
 * CPU time percentage. perf_sample_event_took() treats 0 as "checking off".
 */
static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;

/*
 * Recompute perf_sample_allowed_ns as
 * perf_sample_period_ns * sysctl_perf_cpu_time_max_percent / 100,
 * never storing less than 1.
 */
static void update_perf_cpu_limits(void)
{
        u64 allowed_ns = perf_sample_period_ns;

        allowed_ns = div_u64(allowed_ns * sysctl_perf_cpu_time_max_percent, 100);

        /* 0 is reserved to mean "duration checking disabled"; clamp to 1. */
        if (allowed_ns == 0)
                allowed_ns = 1;

        WRITE_ONCE(perf_sample_allowed_ns, allowed_ns);
}

static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc);

/*
 * sysctl handler for kernel.perf_event_max_sample_rate.
 *
 * Writes are refused with -EINVAL while perf_cpu_time_max_percent is 0 or
 * 100 (throttling disabled). After a successful write the derived values
 * (samples per tick, sample period, allowed sample duration) are refreshed.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
static int perf_event_max_sample_rate_handler(const struct ctl_table *table, int write,
                                       void *buffer, size_t *lenp, loff_t *ppos)
{
        int percent = sysctl_perf_cpu_time_max_percent;
        int err;

        /* With throttling disabled the rate must not be changed. */
        if (write) {
                if (percent == 0 || percent == 100)
                        return -EINVAL;
        }

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;

        /* Plain reads have nothing to recompute. */
        if (!write)
                return 0;

        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();

        return 0;
}

/*
 * sysctl handler for kernel.perf_cpu_time_max_percent.
 *
 * After a successful write, a value of 0 or 100 switches the sample
 * duration check off (perf_sample_allowed_ns = 0) and logs a warning; any
 * other value recomputes the allowed sample duration.
 *
 * Returns 0 on success or the error from proc_dointvec_minmax().
 */
static int perf_cpu_time_max_percent_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err;

        err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (err)
                return err;
        if (!write)
                return 0;

        if (sysctl_perf_cpu_time_max_percent != 100 &&
            sysctl_perf_cpu_time_max_percent != 0) {
                update_perf_cpu_limits();
                return 0;
        }

        printk(KERN_WARNING
               "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
        WRITE_ONCE(perf_sample_allowed_ns, 0);

        return 0;
}

/* sysctl entries registered under "kernel" by init_events_core_sysctls(). */
static const struct ctl_table events_core_sysctl_table[] = {
        /*
         * User-space relies on this file as a feature check for
         * perf_events being enabled. It's an ABI, do not remove!
         */
        {
                .procname        = "perf_event_paranoid",
                .data                = &sysctl_perf_event_paranoid,
                .maxlen                = sizeof(sysctl_perf_event_paranoid),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "perf_event_mlock_kb",
                .data                = &sysctl_perf_event_mlock,
                .maxlen                = sizeof(sysctl_perf_event_mlock),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        /* Lower bound of 1: the handler divides by the stored rate. */
        {
                .procname        = "perf_event_max_sample_rate",
                .data                = &sysctl_perf_event_sample_rate,
                .maxlen                = sizeof(sysctl_perf_event_sample_rate),
                .mode                = 0644,
                .proc_handler        = perf_event_max_sample_rate_handler,
                .extra1                = SYSCTL_ONE,
        },
        /* Percentage, bounded to 0..100; the handler treats 0 and 100 as "throttling disabled". */
        {
                .procname        = "perf_cpu_time_max_percent",
                .data                = &sysctl_perf_cpu_time_max_percent,
                .maxlen                = sizeof(sysctl_perf_cpu_time_max_percent),
                .mode                = 0644,
                .proc_handler        = perf_cpu_time_max_percent_handler,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE_HUNDRED,
        },
};

/*
 * Register events_core_sysctl_table under the "kernel" sysctl directory.
 * Runs as a core_initcall and always returns 0.
 */
static int __init init_events_core_sysctls(void)
{
        register_sysctl_init("kernel", events_core_sysctl_table);
        return 0;
}
core_initcall(init_events_core_sysctls);


/*
 * perf samples are done in some very critical code paths (NMIs).
 * If they take too much CPU time, the system can lock up and not
 * get any real work done.  This will drop the sample rate when
 * we detect that events are taking too long.
 */
/* Divisor used to decay and average the per-CPU running sample length. */
#define NR_ACCUMULATED_SAMPLES 128
/* Per-CPU decaying sum of sample durations, maintained by perf_sample_event_took(). */
static DEFINE_PER_CPU(u64, running_sample_length);

/*
 * Values captured by perf_sample_event_took() for the warning message below:
 * the measured average duration and the limit it exceeded.
 */
static u64 __report_avg;
static u64 __report_allowed;

/*
 * irq_work callback: print a rate-limited notice that the sample rate was
 * lowered. @w is unused.
 */
static void perf_duration_warn(struct irq_work *w)
{
        printk_ratelimited(KERN_INFO
                "perf: interrupt took too long (%lld > %lld), lowering "
                "kernel.perf_event_max_sample_rate to %d\n",
                __report_avg, __report_allowed,
                sysctl_perf_event_sample_rate);
}

/* irq_work item queued by perf_sample_event_took() to emit the notice above. */
static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);

/*
 * Account the duration of one sample and lower the maximum sample rate when
 * the running average exceeds the allowed duration.
 *
 * @sample_len_ns: time the sample took, in nanoseconds.
 *
 * Does nothing when perf_sample_allowed_ns is 0 (checking disabled). When
 * the per-CPU average exceeds the limit, the limit is raised to 125% of that
 * average, max_samples_per_tick, sysctl_perf_event_sample_rate and
 * perf_sample_period_ns are lowered to match, and a notice is queued.
 */
void perf_sample_event_took(u64 sample_len_ns)
{
        u64 max_len = READ_ONCE(perf_sample_allowed_ns);
        u64 running_len;
        u64 avg_len;
        u32 max;

        if (max_len == 0)
                return;

        /* Decay the counter by 1 average sample. */
        running_len = __this_cpu_read(running_sample_length);
        running_len -= running_len/NR_ACCUMULATED_SAMPLES;
        running_len += sample_len_ns;
        __this_cpu_write(running_sample_length, running_len);

        /*
         * Note: this will be biased artificially low until we have
         * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
         * from having to maintain a count.
         */
        avg_len = running_len/NR_ACCUMULATED_SAMPLES;
        if (avg_len <= max_len)
                return;

        /* Remember the offending values for the log message. */
        __report_avg = avg_len;
        __report_allowed = max_len;

        /*
         * Use 125% of the current average duration as the new allowed
         * duration, and derive the per-tick sample budget from it: the
         * number of such samples that fit into the permitted share of a
         * tick, but never less than one.
         */
        avg_len += avg_len / 4;
        max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
        if (avg_len < max)
                max /= (u32)avg_len;
        else
                max = 1;

        WRITE_ONCE(perf_sample_allowed_ns, avg_len);
        WRITE_ONCE(max_samples_per_tick, max);

        /* max >= 1 here, so the division below cannot be by zero. */
        sysctl_perf_event_sample_rate = max * HZ;
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;

        /* If the irq_work could not be queued, print the notice directly. */
        if (!irq_work_queue(&perf_duration_work)) {
                early_printk("perf: interrupt took too long (%lld > %lld), lowering "
                             "kernel.perf_event_max_sample_rate to %d\n",
                             __report_avg, __report_allowed,
                             sysctl_perf_event_sample_rate);
        }
}

static atomic64_t perf_event_id;

static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);

/* Weak default that does nothing; a non-weak definition elsewhere replaces it. */
void __weak perf_event_print_debug(void)        { }

/* Time source used by perf core: the value of local_clock(). */
static inline u64 perf_clock(void)
{
        return local_clock();
}

/* Read the clock selected for @event through its ->clock() callback. */
static inline u64 perf_event_clock(struct perf_event *event)
{
        return event->clock();
}

/*
 * State based event timekeeping...
 *
 * The basic idea is to use event->state to determine which (if any) time
 * fields to increment with the current delta. This means we only need to
 * update timestamps when we change state or when they are explicitly requested
 * (read).
 *
 * Event groups make things a little more complicated, but not terribly so. The
 * rules for a group are that if the group leader is OFF the entire group is
 * OFF, irrespective of what the group member states are. This results in
 * __perf_effective_state().
 *
 * A further ramification is that when a group leader flips between OFF and
 * !OFF, we need to update all group member times.
 *
 *
 * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 * need to make sure the relevant context time is updated before we try and
 * update our timestamps.
 */

/*
 * State that counts for time accounting: when the group leader is at or
 * below PERF_EVENT_STATE_OFF the leader's state applies to every member,
 * otherwise the event's own state is used.
 */
static __always_inline enum perf_event_state
__perf_effective_state(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;

        return (leader->state <= PERF_EVENT_STATE_OFF) ? leader->state
                                                       : event->state;
}

/*
 * Compute the enabled/running totals of @event as of @now without touching
 * event->tstamp.
 *
 * The time elapsed since event->tstamp is added to the enabled total when the
 * effective state is at least INACTIVE, and to the running total when it is
 * at least ACTIVE. Results are stored through @enabled and @running.
 */
static __always_inline void
__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
{
        const enum perf_event_state eff = __perf_effective_state(event);
        const u64 elapsed = now - event->tstamp;
        u64 total;

        total = event->total_time_enabled;
        if (eff >= PERF_EVENT_STATE_INACTIVE)
                total += elapsed;
        *enabled = total;

        total = event->total_time_running;
        if (eff >= PERF_EVENT_STATE_ACTIVE)
                total += elapsed;
        *running = total;
}

/*
 * Fold the time elapsed since event->tstamp into the event's own
 * total_time_enabled / total_time_running, then restart the interval by
 * setting event->tstamp to the time just sampled.
 */
static void perf_event_update_time(struct perf_event *event)
{
        u64 *enabled = &event->total_time_enabled;
        u64 *running = &event->total_time_running;
        u64 stamp = perf_event_time(event);

        __perf_update_times(event, stamp, enabled, running);
        event->tstamp = stamp;
}

static void perf_event_update_sibling_time(struct perf_event *leader)
{
        struct perf_event *sibling;

        for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
}

static void
perf_event_set_state(struct perf_event *event, enum perf_event_state state)
{
        if (event->state == state)
                return;

        perf_event_update_time(event);
        /*
         * If a group leader gets enabled/disabled all its siblings
         * are affected too.
         */
        if ((event->state < 0) ^ (state < 0))
                perf_event_update_sibling_time(event);

        WRITE_ONCE(event->state, state);
}

/*
 * UP store-release, load-acquire
 *
 * __store_release(ptr, val): a compiler barrier() followed by
 * WRITE_ONCE(*ptr, val). Only the compiler is constrained here; no CPU
 * memory-barrier instruction is issued by this macro.
 */

#define __store_release(ptr, val)                                        \
do {                                                                        \
        barrier();                                                        \
        WRITE_ONCE(*(ptr), (val));                                        \
} while (0)

/*
 * __load_acquire(ptr): READ_ONCE(*ptr) followed by a compiler barrier();
 * evaluates to the value that was read. As with __store_release(), only
 * the compiler is constrained.
 */
#define __load_acquire(ptr)                                                \
({                                                                        \
        __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr));        \
        barrier();                                                        \
        ___p;                                                                \
})

/*
 * Iterate @_epc over the entries of @_ctx's pmu_ctx_list, skipping:
 *   - entries with nr_cgroups == 0 when @_cgroup is true, and
 *   - entries whose ->pmu differs from @_pmu when @_pmu is non-NULL.
 *
 * The expansion ends in a dangling 'else', so the statement that follows the
 * macro invocation is the loop body. Every argument is parenthesized so that
 * compound expressions (e.g. 'a || b' for @_cgroup) keep their meaning.
 */
#define for_each_epc(_epc, _ctx, _pmu, _cgroup)                                \
        list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \
                if ((_cgroup) && !(_epc)->nr_cgroups)                        \
                        continue;                                        \
                else if ((_pmu) && (_epc)->pmu != (_pmu))                \
                        continue;                                        \
                else

/*
 * Call perf_pmu_disable() for the PMU of every pmu context attached to @ctx.
 * With @cgroup set, only pmu contexts that have cgroup events are visited.
 */
static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup)
{
        struct perf_event_pmu_context *epc;

        for_each_epc(epc, ctx, NULL, cgroup) {
                perf_pmu_disable(epc->pmu);
        }
}

/*
 * Call perf_pmu_enable() for the PMU of every pmu context attached to @ctx.
 * With @cgroup set, only pmu contexts that have cgroup events are visited.
 */
static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup)
{
        struct perf_event_pmu_context *epc;

        for_each_epc(epc, ctx, NULL, cgroup) {
                perf_pmu_enable(epc->pmu);
        }
}

/* Forward declarations, needed by perf_cgroup_switch() below. */
static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);
static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type);

#ifdef CONFIG_CGROUP_PERF

static inline bool
perf_cgroup_match(struct perf_event *event)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

        /* @event doesn't care about cgroup */
        if (!event->cgrp)
                return true;

        /* wants specific cgroup scope but @cpuctx isn't associated with any */
        if (!cpuctx->cgrp)
                return false;

        /*
         * Cgroup scoping is recursive.  An event enabled for a cgroup is
         * also enabled for all its descendant cgroups.  If @cpuctx's
         * cgroup is a descendant of @event's (the test covers identity
         * case), it's a match.
         */
        return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
                                    event->cgrp->css.cgroup);
}

static inline void perf_detach_cgroup(struct perf_event *event)
{
        css_put(&event->cgrp->css);
        event->cgrp = NULL;
}

/* Returns 1 when @event is bound to a cgroup (event->cgrp set), else 0. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return event->cgrp ? 1 : 0;
}

/*
 * Last recorded cgroup time for @event: the ->time field of its cgroup's
 * per-CPU info on event->cpu.
 */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return per_cpu_ptr(event->cgrp->info, event->cpu)->time;
}

/*
 * Cgroup time for @event as of clock value @now, computed from the per-CPU
 * info of event->cpu.
 *
 * While the info is not marked active the stored ->time is returned as is;
 * otherwise the result is @now + ->timeoffset (the offset is published by
 * __update_cgrp_time()).
 */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_cgroup_info *t;

        t = per_cpu_ptr(event->cgrp->info, event->cpu);
        /* Pairs with the __store_release() writes of ->active in this file. */
        if (!__load_acquire(&t->active))
                return t->time;
        now += READ_ONCE(t->timeoffset);
        return now;
}

/*
 * Restart @info's accounting interval at @now.
 *
 * With @adv set, the time elapsed since the previous timestamp is first
 * added to info->time. info->timeoffset is then republished as
 * (time - timestamp); see update_context_time() for the reasoning.
 */
static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv)
{
        if (adv) {
                u64 elapsed = now - info->timestamp;

                info->time += elapsed;
        }

        info->timestamp = now;

        /* timestamp == now at this point */
        WRITE_ONCE(info->timeoffset, info->time - now);
}

/*
 * Advance the cgroup time of @cpuctx's current cgroup and of each of its
 * ancestors (following css->parent) on this CPU, all with the same clock
 * sample. With @final set, each visited info is also marked inactive.
 * Does nothing when @cpuctx has no cgroup.
 */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final)
{
        struct perf_cgroup *start = cpuctx->cgrp;
        struct cgroup_subsys_state *css;
        u64 now;

        if (!start)
                return;

        now = perf_clock();

        for (css = &start->css; css; css = css->parent) {
                struct perf_cgroup *cgrp = container_of(css, struct perf_cgroup, css);
                struct perf_cgroup_info *info = this_cpu_ptr(cgrp->info);

                __update_cgrp_time(info, now, true);
                if (final)
                        __store_release(&info->active, 0);
        }
}

/*
 * Advance the cgroup time of @event's cgroup on this CPU, but only for
 * cgroup events and only while that cgroup's per-CPU info is active.
 */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
        struct perf_cgroup_info *info;

        /*
         * Touch cgroup data only when needed and when the cgroup is
         * known to be pinned (css_get).
         */
        if (!is_cgroup_event(event))
                return;

        info = this_cpu_ptr(event->cgrp->info);

        /* An inactive cgroup does not accumulate time. */
        if (!info->active)
                return;

        __update_cgrp_time(info, perf_clock(), true);
}

/*
 * Restart cgroup time accounting for @cpuctx's cgroup and all of its
 * ancestors on this CPU: each info gets its timestamp set to the CPU
 * context's timestamp (without advancing ->time) and is marked active.
 * Does nothing when @cpuctx has no cgroup.
 */
static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
        struct perf_event_context *ctx = &cpuctx->ctx;
        struct perf_cgroup *cgrp = cpuctx->cgrp;
        struct perf_cgroup_info *info;
        struct cgroup_subsys_state *css;

        /*
         * ctx->lock held by caller
         * ensure we do not access cgroup data
         * unless we have the cgroup pinned (css_get)
         */
        if (!cgrp)
                return;

        /* A cgroup set on the cpuctx implies at least one cgroup event. */
        WARN_ON_ONCE(!ctx->nr_cgroups);

        /* Walk from the current cgroup up through css->parent. */
        for (css = &cgrp->css; css; css = css->parent) {
                cgrp = container_of(css, struct perf_cgroup, css);
                info = this_cpu_ptr(cgrp->info);
                /* adv == false: only reset the timestamp, do not add time */
                __update_cgrp_time(info, ctx->timestamp, false);
                __store_release(&info->active, 1);
        }
}

/*
 * Reschedule this CPU's cgroup events according to the perf cgroup of @task.
 *
 * Returns early when this CPU has no cgroup set on its perf_cpu_context, or
 * when @task's perf cgroup is already the one recorded there. Otherwise,
 * under the context lock and with the affected PMUs disabled, the CPU
 * context is scheduled out, cpuctx->cgrp is switched, and the context is
 * scheduled back in.
 */
static void perf_cgroup_switch(struct task_struct *task)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_cgroup *cgrp;

        /*
         * cpuctx->cgrp is set when the first cgroup event enabled,
         * and is cleared when the last cgroup event disabled.
         */
        if (READ_ONCE(cpuctx->cgrp) == NULL)
                return;

        WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);

        cgrp = perf_cgroup_from_task(task, NULL);
        /* Same cgroup as before: nothing to reschedule. */
        if (READ_ONCE(cpuctx->cgrp) == cgrp)
                return;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        /* cgroup == true: only PMU contexts that carry cgroup events */
        perf_ctx_disable(&cpuctx->ctx, true);

        ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);
        /*
         * must not be done before ctxswout due
         * to update_cgrp_time_from_cpuctx() in
         * ctx_sched_out()
         */
        cpuctx->cgrp = cgrp;
        /*
         * set cgrp before ctxsw in to allow
         * perf_cgroup_set_timestamp() in ctx_sched_in()
         * to not have to pass task around
         */
        ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP);

        perf_ctx_enable(&cpuctx->ctx, true);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Make sure every possible CPU's perf_cpu_context has a heap large enough
 * for one iterator per cgroup nesting level of @css plus one for events
 * without a cgroup.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails; CPUs handled
 * before the failure keep their enlarged heap. @event is not used.
 */
static int perf_cgroup_ensure_storage(struct perf_event *event,
                                struct cgroup_subsys_state *css)
{
        struct cgroup_subsys_state *level;
        int needed = 1;
        int cpu;

        for (level = css; level; level = level->parent)
                needed++;

        for_each_possible_cpu(cpu) {
                struct perf_cpu_context *cpuctx;
                struct perf_event **spare;

                cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                if (cpuctx->heap_size >= needed)
                        continue;

                spare = kmalloc_node(needed * sizeof(struct perf_event *),
                                     GFP_KERNEL, cpu_to_node(cpu));
                if (!spare)
                        return -ENOMEM;

                raw_spin_lock_irq(&cpuctx->ctx.lock);
                /* Re-check under the lock; the heap may have grown meanwhile. */
                if (cpuctx->heap_size < needed) {
                        swap(cpuctx->heap, spare);
                        /* The default heap must never be handed to kfree(). */
                        if (spare == cpuctx->heap_default)
                                spare = NULL;
                        cpuctx->heap_size = needed;
                }
                raw_spin_unlock_irq(&cpuctx->ctx.lock);

                /* Frees the old heap, or the unused new one, or nothing. */
                kfree(spare);
        }

        return 0;
}

/*
 * Bind @event to the perf cgroup whose cgroup directory is open as @fd.
 *
 * On success returns 0 with event->cgrp set and a css reference held on its
 * behalf (perf_detach_cgroup() is what drops it).
 *
 * Errors, none of which leave a css reference behind:
 *   -EBADF   @fd does not refer to an open file
 *   (other)  error returned by css_tryget_online_from_dir()
 *   (other)  error returned by perf_cgroup_ensure_storage()
 *   -EINVAL  @group_leader is set and monitors a different cgroup
 *
 * @attr is not used here.
 */
static inline int perf_cgroup_connect(int fd, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        struct perf_cgroup *cgrp;
        struct cgroup_subsys_state *css;
        CLASS(fd, f)(fd);
        int ret = 0;

        if (fd_empty(f))
                return -EBADF;

        css = css_tryget_online_from_dir(fd_file(f)->f_path.dentry,
                                         &perf_event_cgrp_subsys);
        if (IS_ERR(css))
                return PTR_ERR(css);

        ret = perf_cgroup_ensure_storage(event, css);
        if (ret) {
                /*
                 * event->cgrp is not set yet, so nobody else would drop
                 * the reference taken above; release it here.
                 */
                css_put(css);
                return ret;
        }

        cgrp = container_of(css, struct perf_cgroup, css);
        event->cgrp = cgrp;

        /*
         * all events in a group must monitor
         * the same cgroup because a task belongs
         * to only one perf cgroup at a time
         */
        if (group_leader && group_leader->cgrp != cgrp) {
                /* drops the css reference and clears event->cgrp */
                perf_detach_cgroup(event);
                ret = -EINVAL;
        }
        return ret;
}

static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        event->pmu_ctx->nr_cgroups++;

        /*
         * Because cgroup events are always per-cpu events,
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        if (ctx->nr_cgroups++)
                return;

        cpuctx->cgrp = perf_cgroup_from_task(current, ctx);
}

/*
 * Account a cgroup event being removed from @ctx: drop the cgroup counters
 * of its pmu context and of @ctx, and clear the CPU context's cgroup once
 * the last cgroup event is gone. Non-cgroup events are ignored.
 */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_cpu_context *cpuctx;

        if (!is_cgroup_event(event))
                return;

        event->pmu_ctx->nr_cgroups--;

        /*
         * Cgroup events are always per-cpu events, hence
         * @ctx == &cpuctx->ctx.
         */
        cpuctx = container_of(ctx, struct perf_cpu_context, ctx);

        ctx->nr_cgroups--;
        if (ctx->nr_cgroups == 0)
                cpuctx->cgrp = NULL;
}

#else /* !CONFIG_CGROUP_PERF */

/* !CONFIG_CGROUP_PERF stub: there is no cgroup filtering, every event matches. */
static inline bool
perf_cgroup_match(struct perf_event *event)
{
        return true;
}

/* !CONFIG_CGROUP_PERF stub: nothing to detach. */
static inline void perf_detach_cgroup(struct perf_event *event)
{}

/* !CONFIG_CGROUP_PERF stub: no event is ever a cgroup event. */
static inline int is_cgroup_event(struct perf_event *event)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup time to update. */
static inline void update_cgrp_time_from_event(struct perf_event *event)
{
}

/* !CONFIG_CGROUP_PERF stub: no cgroup time to update. */
static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx,
                                                bool final)
{
}

/*
 * !CONFIG_CGROUP_PERF stub: connecting an event to a cgroup always fails
 * with -EINVAL.
 * NOTE(review): the first parameter is declared 'pid_t pid' here but
 * 'int fd' in the CONFIG_CGROUP_PERF variant.
 */
static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
                                      struct perf_event_attr *attr,
                                      struct perf_event *group_leader)
{
        return -EINVAL;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup timestamps to set. */
static inline void
perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx)
{
}

/* !CONFIG_CGROUP_PERF stub: always reports a cgroup time of 0. */
static inline u64 perf_cgroup_event_time(struct perf_event *event)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: always reports a cgroup time of 0; @now is ignored. */
static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now)
{
        return 0;
}

/* !CONFIG_CGROUP_PERF stub: no cgroup accounting on enable. */
static inline void
perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx)
{
}

/* !CONFIG_CGROUP_PERF stub: no cgroup accounting on disable. */
static inline void
perf_cgroup_event_disable(struct perf_event *event, struct perf_event_context *ctx)
{
}

/* !CONFIG_CGROUP_PERF stub: no cgroup events to reschedule. */
static void perf_cgroup_switch(struct task_struct *task)
{
}
#endif

/*
 * Default multiplexing interval: 1000 / HZ, i.e. the length of one timer
 * tick expressed in milliseconds (integer division), matching the original
 * tick-driven behaviour. __perf_mux_hrtimer_init() falls back to this value
 * when a PMU's hrtimer_interval_ms is less than 1.
 */
#define PERF_CPU_HRTIMER (1000 / HZ)
/*
 * function must be called with interrupts disabled
 */
static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
{
        struct perf_cpu_pmu_context *cpc;
        bool rotations;

        lockdep_assert_irqs_disabled();

        cpc = container_of(hr, struct perf_cpu_pmu_context, hrtimer);
        rotations = perf_rotate_context(cpc);

        raw_spin_lock(&cpc->hrtimer_lock);
        if (rotations)
                hrtimer_forward_now(hr, cpc->hrtimer_interval);
        else
                cpc->hrtimer_active = 0;
        raw_spin_unlock(&cpc->hrtimer_lock);

        return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
}

/*
 * Set up the multiplexing hrtimer of @cpc.
 *
 * The interval comes from the PMU's hrtimer_interval_ms; a value below 1 is
 * replaced (in the PMU as well) by the PERF_CPU_HRTIMER default. @cpu is not
 * used here.
 */
static void __perf_mux_hrtimer_init(struct perf_cpu_pmu_context *cpc, int cpu)
{
        struct pmu *pmu = cpc->epc.pmu;
        u64 interval_ms = pmu->hrtimer_interval_ms;

        /* Unset or bogus interval: force the default (one tick). */
        if (interval_ms < 1) {
                pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
                interval_ms = pmu->hrtimer_interval_ms;
        }

        cpc->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval_ms);

        raw_spin_lock_init(&cpc->hrtimer_lock);
        hrtimer_setup(&cpc->hrtimer, perf_mux_hrtimer_handler, CLOCK_MONOTONIC,
                      HRTIMER_MODE_ABS_PINNED_HARD);
}

/*
 * Arm the multiplexing hrtimer of @cpc unless it is already marked active.
 * Always returns 0.
 */
static int perf_mux_hrtimer_restart(struct perf_cpu_pmu_context *cpc)
{
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&cpc->hrtimer_lock, irq_flags);
        if (cpc->hrtimer_active)
                goto unlock;

        cpc->hrtimer_active = 1;
        hrtimer_forward_now(&cpc->hrtimer, cpc->hrtimer_interval);
        hrtimer_start_expires(&cpc->hrtimer, HRTIMER_MODE_ABS_PINNED_HARD);

unlock:
        raw_spin_unlock_irqrestore(&cpc->hrtimer_lock, irq_flags);

        return 0;
}

/*
 * void-pointer flavoured wrapper around perf_mux_hrtimer_restart(); @arg is
 * the struct perf_cpu_pmu_context to restart.
 */
static int perf_mux_hrtimer_restart_ipi(void *arg)
{
        struct perf_cpu_pmu_context *cpc = arg;

        return perf_mux_hrtimer_restart(cpc);
}

/* This CPU's perf_cpu_pmu_context for @pmu. */
static __always_inline struct perf_cpu_pmu_context *this_cpc(struct pmu *pmu)
{
        struct perf_cpu_pmu_context **slot = this_cpu_ptr(pmu->cpu_pmu_context);

        return *slot;
}

void perf_pmu_disable(struct pmu *pmu)
{
        int *count = &this_cpc(pmu)->pmu_disable_count;
        if (!(*count)++)
                pmu->pmu_disable(pmu);
}

void perf_pmu_enable(struct pmu *pmu)
{
        int *count = &this_cpc(pmu)->pmu_disable_count;
        if (!--(*count))
                pmu->pmu_enable(pmu);
}

/* Warn (once) if @pmu's disable count on this CPU is zero, i.e. it is not disabled. */
static void perf_assert_pmu_disabled(struct pmu *pmu)
{
        int *count = &this_cpc(pmu)->pmu_disable_count;
        WARN_ON_ONCE(*count == 0);
}

/* Invoke the PMU's ->read() for @event, but only while the event is ACTIVE. */
static inline void perf_pmu_read(struct perf_event *event)
{
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        event->pmu->read(event);
}

/* Take an additional reference on @ctx. */
static void get_ctx(struct perf_event_context *ctx)
{
        refcount_inc(&ctx->refcount);
}

/* RCU callback (see put_ctx()): free the context that embeds @head. */
static void free_ctx(struct rcu_head *head)
{
        kfree(container_of(head, struct perf_event_context, rcu_head));
}

/*
 * Drop a reference on @ctx. When the last reference goes away, the
 * references held on the parent context and on the task (unless it is NULL
 * or TASK_TOMBSTONE) are dropped too, and the context is freed via RCU.
 */
static void put_ctx(struct perf_event_context *ctx)
{
        if (!refcount_dec_and_test(&ctx->refcount))
                return;

        if (ctx->parent_ctx)
                put_ctx(ctx->parent_ctx);

        if (ctx->task && ctx->task != TASK_TOMBSTONE)
                put_task_struct(ctx->task);

        call_rcu(&ctx->rcu_head, free_ctx);
}

/*
 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
 * perf_pmu_migrate_context() we need some magic.
 *
 * Those places that change perf_event::ctx will hold both
 * perf_event_context::mutex of the 'old' and 'new' ctx value.
 *
 * Lock ordering is by mutex address. There are two other sites where
 * perf_event_context::mutex nests and those are:
 *
 *  - perf_event_exit_task_context()        [ child , 0 ]
 *      perf_event_exit_event()
 *        put_event()                        [ parent, 1 ]
 *
 *  - perf_event_init_context()                [ parent, 0 ]
 *      inherit_task_group()
 *        inherit_group()
 *          inherit_event()
 *            perf_event_alloc()
 *              perf_init_event()
 *                perf_try_init_event()        [ child , 1 ]
 *
 * While it appears there is an obvious deadlock here -- the parent and child
 * nesting levels are inverted between the two. This is in fact safe because
 * life-time rules separate them. That is an exiting task cannot fork, and a
 * spawning task cannot (yet) exit.
 *
 * But remember that these are parent<->child context relations, and
 * migration does not affect children, therefore these two orderings should not
 * interact.
 *
 * The change in perf_event::ctx does not affect children (as claimed above)
 * because the sys_perf_event_open() case will install a new event and break
 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
 * concerned with cpuctx and that doesn't have children.
 *
 * The places that change perf_event::ctx will issue:
 *
 *   perf_remove_from_context();
 *   synchronize_rcu();
 *   perf_install_in_context();
 *
 * to effect the change. The remove_from_context() + synchronize_rcu() should
 * quiesce the event, after which we can install it in the new location. This
 * means that only external vectors (perf_fops, prctl) can perturb the event
 * while in transit. Therefore all such accessors should also acquire
 * perf_event_context::mutex to serialize against this.
 *
 * However; because event->ctx can change while we're waiting to acquire
 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
 * function.
 *
 * Lock order:
 *    exec_update_lock
 *        task_struct::perf_event_mutex
 *          perf_event_context::mutex
 *            perf_event::child_mutex;
 *              perf_event_context::lock
 *            mmap_lock
 *              perf_event::mmap_mutex
 *                perf_buffer::aux_mutex
 *              perf_addr_filters_head::lock
 *
 *    cpu_hotplug_lock
 *      pmus_lock
 *          cpuctx->mutex / perf_event_context::mutex
 */
/*
 * Lock the context @event currently belongs to.
 *
 * Returns that context with a reference taken and ctx->mutex held (acquired
 * with lockdep subclass @nesting). Loops until the context that got locked
 * is still event->ctx. Release with perf_event_ctx_unlock().
 */
static struct perf_event_context *
perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
{
        struct perf_event_context *ctx;

again:
        rcu_read_lock();
        ctx = READ_ONCE(event->ctx);
        /* Refcount already dropped to zero: re-read event->ctx and retry. */
        if (!refcount_inc_not_zero(&ctx->refcount)) {
                rcu_read_unlock();
                goto again;
        }
        rcu_read_unlock();

        mutex_lock_nested(&ctx->mutex, nesting);
        /* event->ctx changed while we waited for the mutex: undo and retry. */
        if (event->ctx != ctx) {
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);
                goto again;
        }

        return ctx;
}

/* perf_event_ctx_lock_nested() with nesting level 0. */
static inline struct perf_event_context *
perf_event_ctx_lock(struct perf_event *event)
{
        return perf_event_ctx_lock_nested(event, 0);
}

/*
 * Undo perf_event_ctx_lock{,_nested}(): release @ctx's mutex and drop the
 * reference that was taken on it. @event is not used.
 */
static void perf_event_ctx_unlock(struct perf_event *event,
                                  struct perf_event_context *ctx)
{
        mutex_unlock(&ctx->mutex);
        put_ctx(ctx);
}

/*
 * Detach @ctx from its parent context and bump its generation.
 *
 * This must be done under the ctx->lock, such as to serialize against
 * context_equiv(), therefore we cannot call put_ctx() since that might end up
 * calling scheduler related locks and ctx->lock nests inside those.
 *
 * Returns the former parent context (NULL if there was none). The reference
 * on it is NOT dropped here, hence __must_check: the caller is expected to
 * put_ctx() the returned context once ctx->lock has been released.
 */
static __must_check struct perf_event_context *
unclone_ctx(struct perf_event_context *ctx)
{
        struct perf_event_context *parent_ctx = ctx->parent_ctx;

        lockdep_assert_held(&ctx->lock);

        if (parent_ctx)
                ctx->parent_ctx = NULL;
        /* The generation is bumped whether or not there was a parent. */
        ctx->generation++;

        return parent_ctx;
}

/*
 * Id of kind @type for task @p, as seen from the pid namespace @event was
 * created in.
 *
 * Only top-level events carry that namespace, so an inherited event uses its
 * parent's. A zero result is turned into -1 only when @p is no longer alive;
 * a live task that yields 0 (idle thread, or a task running in another
 * namespace) keeps 0.
 */
static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
                                enum pid_type type)
{
        struct perf_event *top = event->parent ? event->parent : event;
        u32 nr = __task_pid_nr_ns(p, type, top->ns);

        if (nr == 0 && !pid_alive(p))
                return -1;

        return nr;
}

/* Thread-group id (PIDTYPE_TGID) of @p as reported for @event. */
static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_TGID);
}

/* Thread id (PIDTYPE_PID) of @p as reported for @event. */
static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
{
        return perf_event_pid_type(event, p, PIDTYPE_PID);
}

/*
 * Id to report to userspace for @event: an inherited event reports the id
 * of its parent, any other event reports its own.
 */
static u64 primary_event_id(struct perf_event *event)
{
        return event->parent ? event->parent->id : event->id;
}

/*
 * Get the perf_event_context for a task and lock it.
 *
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 *
 * On success the context is returned with ctx->lock held, a reference
 * taken, and interrupts disabled (previous state saved in *@flags).
 * Returns NULL, with interrupts restored, when the task has no context,
 * the context's task is TASK_TOMBSTONE, or its refcount already hit zero.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
        struct perf_event_context *ctx;

retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
         * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
         * side critical section has interrupts disabled.
         */
        local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (ctx) {
                /*
                 * If this context is a clone of another, it might
                 * get swapped for another underneath us by
                 * perf_event_task_sched_out, though the
                 * rcu_read_lock() protects us from any context
                 * getting freed.  Lock the context and check if it
                 * got swapped before we could get the lock, and retry
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
                raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp)) {
                        raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
                        local_irq_restore(*flags);
                        goto retry;
                }

                /* Context is dead or already on its way out: report none. */
                if (ctx->task == TASK_TOMBSTONE ||
                    !refcount_inc_not_zero(&ctx->refcount)) {
                        raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                } else {
                        WARN_ON_ONCE(ctx->task != task);
                }
        }
        rcu_read_unlock();
        /* On success interrupts stay disabled, since ctx->lock is still held. */
        if (!ctx)
                local_irq_restore(*flags);
        return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *
perf_pin_task_context(struct task_struct *task)
{
        struct perf_event_context *ctx;
        unsigned long flags;

        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
        return ctx;
}

/* Lower @ctx's pin_count under ctx->lock. The reference is left untouched. */
static void perf_unpin_context(struct perf_event_context *ctx)
{
        unsigned long irq_flags;

        raw_spin_lock_irqsave(&ctx->lock, irq_flags);
        ctx->pin_count--;
        raw_spin_unlock_irqrestore(&ctx->lock, irq_flags);
}

/*
 * Update the record of the current time in a context.
 *
 * Caller must hold ctx->lock. With @adv set, the time elapsed since the
 * previous timestamp is added to ctx->time; with @adv clear only the
 * timestamp (and the derived offset) is refreshed.
 */
static void __update_context_time(struct perf_event_context *ctx, bool adv)
{
        u64 now = perf_clock();

        lockdep_assert_held(&ctx->lock);

        if (adv)
                ctx->time += now - ctx->timestamp;
        ctx->timestamp = now;

        /*
         * The above: time' = time + (now - timestamp), can be re-arranged
         * into: time` = now + (time - timestamp), which gives a single value
         * offset to compute future time without locks on.
         *
         * See perf_event_time_now(), which can be used from NMI context where
         * it's (obviously) not possible to acquire ctx->lock in order to read
         * both the above values in a consistent manner.
         */
        WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
}

/* Advance @ctx's time to now: __update_context_time() with adv == true. */
static void update_context_time(struct perf_event_context *ctx)
{
        __update_context_time(ctx, true);
}

/*
 * Last recorded time for @event: the cgroup time for cgroup events, the
 * context time otherwise, and 0 when the event has no context.
 */
static u64 perf_event_time(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        return is_cgroup_event(event) ? perf_cgroup_event_time(event)
                                      : ctx->time;
}

/*
 * Time for @event as of clock value @now, computed without taking ctx->lock.
 *
 * Returns 0 when the event has no context. Cgroup events defer to
 * perf_cgroup_event_time_now(). Otherwise, while the context's EVENT_TIME
 * bit is clear the stored ctx->time is returned; when it is set the result
 * is @now + ctx->timeoffset (see __update_context_time()).
 */
static u64 perf_event_time_now(struct perf_event *event, u64 now)
{
        struct perf_event_context *ctx = event->ctx;

        if (unlikely(!ctx))
                return 0;

        if (is_cgroup_event(event))
                return perf_cgroup_event_time_now(event, now);

        /* Context time is not running: report the stored value. */
        if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
                return ctx->time;

        now += READ_ONCE(ctx->timeoffset);
        return now;
}

/*
 * Scheduling class of @event: EVENT_PINNED or EVENT_FLEXIBLE, plus EVENT_CPU
 * when the event's context has no task. Caller must hold ctx->lock.
 */
static enum event_type_t get_event_type(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *leader;
        enum event_type_t event_type;

        lockdep_assert_held(&ctx->lock);

        /*
         * This is really the 'group type': a member of a pinned group is
         * pinned as well, so the leader's attribute decides. (A leader is
         * its own group_leader.)
         */
        leader = event->group_leader;

        if (leader->attr.pinned)
                event_type = EVENT_PINNED;
        else
                event_type = EVENT_FLEXIBLE;

        if (!ctx->task)
                event_type |= EVENT_CPU;

        return event_type;
}

/*
 * Reset @event's group-tree bookkeeping: group index back to 0 and the
 * rb-tree node marked as not linked into any tree.
 */
static void init_event_group(struct perf_event *event)
{
        event->group_index = 0;
        RB_CLEAR_NODE(&event->group_node);
}

/*
 * Pick the group tree of @ctx that @event belongs in, based on the event's
 * attr.pinned bit: the pinned tree when set, the flexible tree otherwise.
 */
static struct perf_event_groups *
get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        return event->attr.pinned ? &ctx->pinned_groups
                                  : &ctx->flexible_groups;
}

/*
 * Helper function to initialize a perf_event_groups tree: empty rb-tree,
 * index counter at 0.
 */
static void perf_event_groups_init(struct perf_event_groups *groups)
{
        groups->index = 0;
        groups->tree = RB_ROOT;
}

/*
 * The cgroup @event is bound to, or NULL when it has none (always NULL
 * without CONFIG_CGROUP_PERF).
 */
static inline struct cgroup *event_cgroup(const struct perf_event *event)
{
#ifdef CONFIG_CGROUP_PERF
        if (event->cgrp)
                return event->cgrp->css.cgroup;
#endif

        return NULL;
}

/*
 * Compare function for event groups.
 *
 * Orders the key {@left_cpu, @left_pmu, @left_cgroup, @left_group_index}
 * against @right, field by field:
 *   1. CPU number;
 *   2. PMU pointer -- skipped entirely when @left_pmu is NULL;
 *   3. cgroup (CONFIG_CGROUP_PERF only) -- no cgroup sorts first, two
 *      different cgroups are ordered by cgroup id;
 *   4. group index, which provides the ordering used when rotating groups
 *      within the same subtree.
 *
 * Returns -1, 0 or 1 when the left key sorts before, equal to or after
 * @right.
 */
static __always_inline int
perf_event_groups_cmp(const int left_cpu, const struct pmu *left_pmu,
                      const struct cgroup *left_cgroup, const u64 left_group_index,
                      const struct perf_event *right)
{
        if (left_cpu != right->cpu)
                return left_cpu < right->cpu ? -1 : 1;

        if (left_pmu && left_pmu != right->pmu_ctx->pmu)
                return left_pmu < right->pmu_ctx->pmu ? -1 : 1;

#ifdef CONFIG_CGROUP_PERF
        {
                const struct cgroup *right_cgroup = event_cgroup(right);

                if (left_cgroup != right_cgroup) {
                        /* Exactly one side lacks a cgroup: it comes first. */
                        if (!left_cgroup)
                                return -1;
                        if (!right_cgroup)
                                return 1;

                        /* Two dissimilar cgroups, order by id. */
                        return cgroup_id(left_cgroup) < cgroup_id(right_cgroup) ? -1 : 1;
                }
        }
#endif

        if (left_group_index != right->group_index)
                return left_group_index < right->group_index ? -1 : 1;

        return 0;
}

/* Map a group-tree rb_node back to the perf_event embedding it as ->group_node. */
#define __node_2_pe(node) \
        rb_entry((node), struct perf_event, group_node)

static inline bool __group_less(struct rb_node *a, const struct rb_node *b)
{
        struct perf_event *e = __node_2_pe(a);
        return perf_event_groups_cmp(e->cpu, e->pmu_ctx->pmu, event_cgroup(e),
                                     e->group_index, __node_2_pe(b)) < 0;
}

/*
 * Partial lookup key for the group trees, consumed by __group_cmp() and
 * __group_cmp_ignore_cgroup(); the group index is deliberately not part of it.
 */
struct __group_key {
        int cpu;
        struct pmu *pmu;
        struct cgroup *cgroup;
};

/*
 * Key-vs-node comparison for a partial (subtree) match on {cpu, pmu, cgroup}.
 * The node's own group_index is fed in as the left index, so that field
 * always compares equal and is effectively ignored.
 */
static inline int __group_cmp(const void *key, const struct rb_node *node)
{
        const struct __group_key *want = key;
        const struct perf_event *have = __node_2_pe(node);

        return perf_event_groups_cmp(want->cpu, want->pmu, want->cgroup,
                                     have->group_index, have);
}

/*
 * Key-vs-node comparison for a partial (subtree) match on {cpu, pmu} only.
 * Both the node's cgroup and its group_index are fed in on the left side,
 * so those fields always compare equal and are effectively ignored.
 */
static inline int
__group_cmp_ignore_cgroup(const void *key, const struct rb_node *node)
{
        const struct __group_key *want = key;
        const struct perf_event *have = __node_2_pe(node);
        const struct cgroup *same_cgroup = event_cgroup(have);

        return perf_event_groups_cmp(want->cpu, want->pmu, same_cgroup,
                                     have->group_index, have);
}

/*
 * Insert @event into @groups' tree, keyed by
 *   {@event->cpu, @event->pmu_ctx->pmu, event_cgroup(@event), ++@groups->index}.
 * The freshly incremented index places it last inside its {cpu,pmu,cgroup}
 * subtree.
 */
static void
perf_event_groups_insert(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        groups->index++;
        event->group_index = groups->index;

        rb_add(&event->group_node, &groups->tree, __group_less);
}

/*
 * Insert @event into whichever of @ctx's group trees (pinned or flexible)
 * matches the event.
 */
static void
add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_insert(get_event_groups(event, ctx), event);
}

/*
 * Delete a group from a tree.
 *
 * Warns (once) if @event's node is not linked or if @groups' tree is empty;
 * the node is erased regardless. Afterwards init_event_group() is called on
 * @event.
 */
static void
perf_event_groups_delete(struct perf_event_groups *groups,
                         struct perf_event *event)
{
        WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
                     RB_EMPTY_ROOT(&groups->tree));

        rb_erase(&event->group_node, &groups->tree);
        init_event_group(event);
}

/*
 * Helper: remove @event from the groups tree of @ctx that
 * get_event_groups() selects for it.
 */
static void
del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
{
        perf_event_groups_delete(get_event_groups(event, ctx), event);
}

/*
 * Return the leftmost event of the {cpu,pmu,cgroup} subtree in @groups, or
 * NULL when the subtree is empty.
 */
static struct perf_event *
perf_event_groups_first(struct perf_event_groups *groups, int cpu,
                        struct pmu *pmu, struct cgroup *cgrp)
{
        struct __group_key key = {
                .cpu = cpu,
                .pmu = pmu,
                .cgroup = cgrp,
        };
        struct rb_node *found;

        found = rb_find_first(&key, &groups->tree, __group_cmp);

        return found ? __node_2_pe(found) : NULL;
}

/*
 * Return the event following @event that still matches the key
 * {@event->cpu, @pmu, event_cgroup(@event)}, or NULL when there is none.
 */
static struct perf_event *
perf_event_groups_next(struct perf_event *event, struct pmu *pmu)
{
        struct __group_key key = {
                .cpu = event->cpu,
                .pmu = pmu,
                .cgroup = event_cgroup(event),
        };
        struct rb_node *succ;

        succ = rb_next_match(&key, &event->group_node, __group_cmp);

        return succ ? __node_2_pe(succ) : NULL;
}

/*
 * Iterate @event over @groups, starting at perf_event_groups_first() for
 * {@cpu, @pmu} with a NULL cgroup and advancing with
 * perf_event_groups_next() until it returns NULL.
 */
#define perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu)                \
        for (event = perf_event_groups_first(groups, cpu, pmu, NULL);        \
             event; event = perf_event_groups_next(event, pmu))

/*
 * Iterate through the whole groups tree.
 *
 * Walks from rb_first() via rb_next(); @event becomes NULL (ending the loop)
 * once rb_entry_safe() is handed a NULL node.
 */
#define perf_event_groups_for_each(event, groups)                        \
        for (event = rb_entry_safe(rb_first(&((groups)->tree)),                \
                                typeof(*event), group_node); event;        \
                event = rb_entry_safe(rb_next(&event->group_node),        \
                                typeof(*event), group_node))

/*
 * Does the event attribute request inherit with PERF_SAMPLE_READ
 */
static inline bool has_inherit_and_sample_read(struct perf_event_attr *attr)
{
        return attr->inherit && (attr->sample_type & PERF_SAMPLE_READ);
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Marks the event PERF_ATTACH_CONTEXT (warning once if it already was),
 * stamps it with the current event time and updates the per-context and
 * per-PMU-context event counters.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
        lockdep_assert_held(&ctx->lock);

        WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
        event->attach_state |= PERF_ATTACH_CONTEXT;

        event->tstamp = perf_event_time(event);

        /*
         * If we're a stand alone event or group leader, we go to the context
         * list, group events are kept attached to the group so that
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
                /* A fresh leader starts with just its own capabilities. */
                event->group_caps = event->event_caps;
                add_event_to_groups(event, ctx);
        }

        list_add_rcu(&event->event_entry, &ctx->event_list);
        ctx->nr_events++;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user++;
        if (event->attr.inherit_stat)
                ctx->nr_stat++;
        if (has_inherit_and_sample_read(&event->attr))
                local_inc(&ctx->nr_no_switch_fast);

        /* Only events above OFF are reported to the cgroup accounting. */
        if (event->state > PERF_EVENT_STATE_OFF)
                perf_cgroup_event_enable(event, ctx);

        ctx->generation++;
        event->pmu_ctx->nr_events++;
}

/*
 * Pick the initial event state from perf_event_attr::disabled:
 * OFF when set, INACTIVE otherwise.
 */
static inline void perf_event__state_init(struct perf_event *event)
{
        if (event->attr.disabled)
                event->state = PERF_EVENT_STATE_OFF;
        else
                event->state = PERF_EVENT_STATE_INACTIVE;
}

/*
 * Number of bytes a read of an event with @read_format produces when the
 * group has @nr_siblings siblings.
 *
 * The result is a fixed part (optional time fields and, for group reads,
 * one extra u64) plus one entry per reported event; an entry holds the
 * value and, optionally, the id and lost fields.
 */
static int __perf_event_read_size(u64 read_format, int nr_siblings)
{
        int per_event = sizeof(u64);        /* value */
        int fixed = 0;
        int nr_events = 1;

        /* Fields that appear once per read. */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                fixed += sizeof(u64);

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                fixed += sizeof(u64);

        if (read_format & PERF_FORMAT_GROUP) {
                fixed += sizeof(u64);
                nr_events += nr_siblings;
        }

        /* Fields that are repeated for every reported event. */
        if (read_format & PERF_FORMAT_ID)
                per_event += sizeof(u64);

        if (read_format & PERF_FORMAT_LOST)
                per_event += sizeof(u64);

        /*
         * Since perf_event_validate_size() limits this to 16k and inhibits
         * adding more siblings, this will never overflow.
         */
        return fixed + nr_events * per_event;
}

/*
 * Sum the sizes of the sample fields selected by @sample_type and store
 * the total in event->header_size.
 *
 * @data is never dereferenced; it only provides the field types for
 * sizeof(). PERF_SAMPLE_READ contributes event->read_size, so that field
 * must already be up to date when this is called.
 */
static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
{
        struct perf_sample_data *data;
        u16 size = 0;

        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);

        if (sample_type & PERF_SAMPLE_ADDR)
                size += sizeof(data->addr);

        if (sample_type & PERF_SAMPLE_PERIOD)
                size += sizeof(data->period);

        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                size += sizeof(data->weight.full);

        if (sample_type & PERF_SAMPLE_READ)
                size += event->read_size;

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                size += sizeof(data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                size += sizeof(data->txn);

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                size += sizeof(data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                size += sizeof(data->cgroup);

        if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
                size += sizeof(data->data_page_size);

        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                size += sizeof(data->code_page_size);

        event->header_size = size;
}

/*
 * Recompute @event's read_size and header_size. Called at perf_event
 * creation and whenever events are attached to or detached from a group.
 */
static void perf_event__header_size(struct perf_event *event)
{
        int nr_siblings = event->group_leader->nr_siblings;

        /* read_size first: the header size computation consumes it. */
        event->read_size = __perf_event_read_size(event->attr.read_format,
                                                  nr_siblings);
        __perf_event_header_size(event, event->attr.sample_type);
}

/*
 * Sum the sizes of the identification fields selected by the event's
 * sample_type (tid, time, identifier, id, stream id, cpu) and store the
 * total in event->id_header_size.
 *
 * @data is never dereferenced; it only provides the field types for
 * sizeof().
 */
static void perf_event__id_header_size(struct perf_event *event)
{
        struct perf_sample_data *data;
        u64 sample_type = event->attr.sample_type;
        u16 size = 0;

        if (sample_type & PERF_SAMPLE_TID)
                size += sizeof(data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                size += sizeof(data->time);

        /* IDENTIFIER and ID are both sized as data->id and both may count. */
        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                size += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_ID)
                size += sizeof(data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                size += sizeof(data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                size += sizeof(data->cpu_entry);

        event->id_header_size = size;
}

/*
 * Check that adding an event to the group does not result in anybody
 * overflowing the 64k event limit imposed by the output buffer.
 *
 * Specifically, check that the read_size for the event does not exceed 16k,
 * read_size being the one term that grows with groups size. Since read_size
 * depends on per-event read_format, also (re)check the existing events.
 *
 * This leaves 48k for the constant size fields and things like callchains,
 * branch stacks and register sets.
 *
 * Returns true when the group may grow by one event, false otherwise.
 */
static bool perf_event_validate_size(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader;
        struct perf_event *sibling;
        /* Sibling count as it would be after the addition. */
        int nr = group_leader->nr_siblings + 1;

        if (__perf_event_read_size(event->attr.read_format, nr) > 16*1024)
                return false;

        if (__perf_event_read_size(group_leader->attr.read_format, nr) > 16*1024)
                return false;

        /*
         * When creating a new group leader, group_leader->ctx is initialized
         * after the size has been validated, but we cannot safely use
         * for_each_sibling_event() until group_leader->ctx is set. A new group
         * leader cannot have any siblings yet, so we can safely skip checking
         * the non-existent siblings.
         */
        if (event == group_leader)
                return true;

        for_each_sibling_event(sibling, group_leader) {
                u64 format = sibling->attr.read_format;

                if (__perf_event_read_size(format, nr) > 16*1024)
                        return false;
        }

        return true;
}

/*
 * Attach @event to its group (event->group_leader).
 * Must be called with event->ctx->lock held.
 *
 * Sets PERF_ATTACH_GROUP; for a non-leader it additionally appends the
 * event to the leader's sibling list, narrows the leader's group_caps and
 * recomputes the header sizes of the leader and of every sibling.
 */
static void perf_group_attach(struct perf_event *event)
{
        struct perf_event *group_leader = event->group_leader, *pos;

        lockdep_assert_held(&event->ctx->lock);

        /*
         * We can have double attach due to group movement (move_group) in
         * perf_event_open().
         */
        if (event->attach_state & PERF_ATTACH_GROUP)
                return;

        event->attach_state |= PERF_ATTACH_GROUP;

        /* A leader has no sibling list to join. */
        if (group_leader == event)
                return;

        WARN_ON_ONCE(group_leader->ctx != event->ctx);

        /* The group keeps only the capabilities common to all members. */
        group_leader->group_caps &= event->event_caps;

        list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
        group_leader->group_generation++;

        /* nr_siblings changed, so every member's read/header size did too. */
        perf_event__header_size(group_leader);

        for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 *
 * Mirrors list_add_event(): clears PERF_ATTACH_CONTEXT, undoes the
 * per-context counters, unlinks the event and, for a group leader, removes
 * it from the groups tree.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_CONTEXT))
                return;

        event->attach_state &= ~PERF_ATTACH_CONTEXT;

        ctx->nr_events--;
        if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
                ctx->nr_user--;
        if (event->attr.inherit_stat)
                ctx->nr_stat--;
        if (has_inherit_and_sample_read(&event->attr))
                local_dec(&ctx->nr_no_switch_fast);

        list_del_rcu(&event->event_entry);

        /* Only group leaders live in the groups tree. */
        if (event->group_leader == event)
                del_event_from_groups(event, ctx);

        /*
         * If event was in error state, then keep it
         * that way, otherwise bogus counts will be
         * returned on read(). The only way to get out
         * of error state is by explicit re-enabling
         * of the event
         */
        if (event->state > PERF_EVENT_STATE_OFF) {
                perf_cgroup_event_disable(event, ctx);
                perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        }

        ctx->generation++;
        event->pmu_ctx->nr_events--;
}

/*
 * Ask @event's PMU whether @aux_event can serve as its AUX output target.
 * Returns 0 when @aux_event has no AUX capability or the PMU provides no
 * aux_output_match() callback; otherwise returns the callback's verdict.
 */
static int
perf_aux_output_match(struct perf_event *event, struct perf_event *aux_event)
{
        if (has_aux(aux_event) && event->pmu->aux_output_match)
                return event->pmu->aux_output_match(aux_event);

        return 0;
}

static void put_event(struct perf_event *event);
static void event_sched_out(struct perf_event *event,
                            struct perf_event_context *ctx);

/*
 * Undo the aux_event linkage of @event as it leaves its group.
 *
 * Two cases:
 *  - @event points at an aux event: clear the link and put_event() the
 *    aux event.
 *  - otherwise @event may itself be the aux event of some siblings: for
 *    each such sibling, clear its link, put_event() @event once, schedule
 *    the sibling out and move the sibling into ERROR state.
 */
static void perf_put_aux_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *iter;

        /*
         * If event uses aux_event tear down the link
         */
        if (event->aux_event) {
                iter = event->aux_event;
                event->aux_event = NULL;
                put_event(iter);
                return;
        }

        /*
         * If the event is an aux_event, tear down all links to
         * it from other events.
         */
        for_each_sibling_event(iter, event->group_leader) {
                if (iter->aux_event != event)
                        continue;

                iter->aux_event = NULL;
                put_event(event);

                /*
                 * If it's ACTIVE, schedule it out and put it into ERROR
                 * state so that we don't try to schedule it again. Note
                 * that perf_event_enable() will clear the ERROR status.
                 *
                 * The state change must land on @iter, the event that just
                 * lost its aux_event, not on @event, which is the aux event
                 * being torn down.
                 */
                event_sched_out(iter, ctx);
                perf_event_set_state(iter, PERF_EVENT_STATE_ERROR);
        }
}

static bool perf_need_aux_event(struct perf_event *event)
{
        return event->attr.aux_output || has_aux_action(event);
}

/*
 * Try to link @event to @group_leader as its aux event.
 *
 * Returns 1 when the link was established (a reference on @group_leader
 * is taken and event->aux_event is set), 0 when any precondition fails.
 */
static int perf_get_aux_event(struct perf_event *event,
                              struct perf_event *group_leader)
{
        /*
         * Our group leader must be an aux event if we want to be
         * an aux_output. This way, the aux event will precede its
         * aux_output events in the group, and therefore will always
         * schedule first.
         */
        if (!group_leader)
                return 0;

        /*
         * aux_output and aux_sample_size are mutually exclusive.
         */
        if (event->attr.aux_output && event->attr.aux_sample_size)
                return 0;

        if (event->attr.aux_output &&
            !perf_aux_output_match(event, group_leader))
                return 0;

        /* Pause/resume needs a leader PMU with PERF_PMU_CAP_AUX_PAUSE. */
        if ((event->attr.aux_pause || event->attr.aux_resume) &&
            !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
                return 0;

        /* AUX sampling needs a leader PMU that implements snapshot_aux. */
        if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux)
                return 0;

        /* Fails if the leader's refcount has already dropped to zero. */
        if (!atomic_long_inc_not_zero(&group_leader->refcount))
                return 0;

        /*
         * Link aux_outputs to their aux event; this is undone in
         * perf_group_detach() by perf_put_aux_event(). When the
         * group is torn down, the aux_output events lose their
         * link to the aux_event and can't schedule any more.
         */
        event->aux_event = group_leader;

        return 1;
}

static inline struct list_head *get_event_list(struct perf_event *event)
{
        return event->attr.pinned ? &event->pmu_ctx->pinned_active :
                                    &event->pmu_ctx->flexible_active;
}

/*
 * Events that have PERF_EV_CAP_SIBLING require being part of a group and
 * cannot exist on their own: schedule such an event out and park it in the
 * ERROR state. Also see _perf_event_enable(), it will not be able to
 * recover this ERROR state.
 */
static inline void perf_remove_sibling_event(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        event_sched_out(event, ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}

/*
 * Detach @event from its group. Must be called with event->ctx->lock held.
 *
 * A sibling is simply unlinked from its leader. A leader is dissolved:
 * each sibling becomes its own group leader and, if it is still attached
 * to the context, is inserted into the groups tree. In both cases the
 * header sizes of the (former) leader and its remaining siblings are
 * recomputed.
 */
static void perf_group_detach(struct perf_event *event)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling, *tmp;
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->lock);

        /*
         * We can have double detach due to exit/hot-unplug + close.
         */
        if (!(event->attach_state & PERF_ATTACH_GROUP))
                return;

        event->attach_state &= ~PERF_ATTACH_GROUP;

        perf_put_aux_event(event);

        /*
         * If this is a sibling, remove it from its group.
         */
        if (leader != event) {
                list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                event->group_leader->group_generation++;
                goto out;
        }

        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {

                if (sibling->event_caps & PERF_EV_CAP_SIBLING)
                        perf_remove_sibling_event(sibling);

                sibling->group_leader = sibling;
                list_del_init(&sibling->sibling_list);

                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;

                if (sibling->attach_state & PERF_ATTACH_CONTEXT) {
                        add_event_to_groups(sibling, event->ctx);

                        /* A running sibling now needs its own active_list entry. */
                        if (sibling->state == PERF_EVENT_STATE_ACTIVE)
                                list_add_tail(&sibling->active_list, get_event_list(sibling));
                }

                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }

out:
        /* @tmp is reused here as a plain iteration cursor. */
        for_each_sibling_event(tmp, leader)
                perf_event__header_size(tmp);

        perf_event__header_size(leader);
}

static void sync_child_event(struct perf_event *child_event);

/*
 * Detach a child @event from its parent.
 *
 * No-op unless PERF_ATTACH_CHILD is set. The flag is cleared first; then,
 * with the parent's child_mutex expected to be held, the child is synced
 * via sync_child_event() and unlinked from its child_list.
 */
static void perf_child_detach(struct perf_event *event)
{
        struct perf_event *parent_event = event->parent;

        if (!(event->attach_state & PERF_ATTACH_CHILD))
                return;

        event->attach_state &= ~PERF_ATTACH_CHILD;

        /* A child-attached event without a parent is unexpected. */
        if (WARN_ON_ONCE(!parent_event))
                return;

        lockdep_assert_held(&parent_event->child_mutex);

        sync_child_event(event);
        list_del_init(&event->child_list);
}

static bool is_orphaned_event(struct perf_event *event)
{
        return event->state == PERF_EVENT_STATE_DEAD;
}

/*
 * Non-zero when @event may run on the current CPU: its cpu is either -1
 * or the executing CPU, and perf_cgroup_match() accepts it.
 */
static inline int
event_filter_match(struct perf_event *event)
{
        /* Bound to some other CPU: no match, the cgroup is not consulted. */
        if (event->cpu != -1 && event->cpu != smp_processor_id())
                return 0;

        return perf_cgroup_match(event) ? 1 : 0;
}

/*
 * Take a single ACTIVE event off its PMU. Must be called with ctx->lock
 * held; does nothing unless the event is ACTIVE.
 *
 * The event ends up INACTIVE, or OFF when a pending disable was recorded
 * on it. Per-CPU and per-context bookkeeping (active_oncpu, nr_freq,
 * exclusive) is updated to match.
 */
static void
event_sched_out(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
        enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;

        // XXX cpc serialization, probably per-cpu IRQ disabled

        WARN_ON_ONCE(event->ctx != ctx);
        lockdep_assert_held(&ctx->lock);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        /*
         * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
         * we can schedule events _OUT_ individually through things like
         * __perf_remove_from_context().
         */
        list_del_init(&event->active_list);

        perf_pmu_disable(event->pmu);

        event->pmu->del(event, 0);
        event->oncpu = -1;

        /* Consume a pending disable request: the event goes OFF instead. */
        if (event->pending_disable) {
                event->pending_disable = 0;
                perf_cgroup_event_disable(event, ctx);
                state = PERF_EVENT_STATE_OFF;
        }

        perf_event_set_state(event, state);

        if (!is_software_event(event))
                cpc->active_oncpu--;
        if (event->attr.freq && event->attr.sample_freq) {
                ctx->nr_freq--;
                epc->nr_freq--;
        }
        /* Exclusivity ends with this event or when nothing is left active. */
        if (event->attr.exclusive || !cpc->active_oncpu)
                cpc->exclusive = 0;

        perf_pmu_enable(event->pmu);
}

/*
 * Schedule out a whole group: the leader @group_event and then each of its
 * siblings. Does nothing unless the leader is ACTIVE. The group's PMU is
 * asserted to be disabled already.
 */
static void
group_sched_out(struct perf_event *group_event, struct perf_event_context *ctx)
{
        struct perf_event *sibling;

        if (group_event->state != PERF_EVENT_STATE_ACTIVE)
                return;

        perf_assert_pmu_disabled(group_event->pmu_ctx->pmu);

        /* Leader first, then the siblings (if any). */
        event_sched_out(group_event, ctx);

        for_each_sibling_event(sibling, group_event)
                event_sched_out(sibling, ctx);
}

/*
 * Refresh the context time and the cgroup time of @cpuctx, unless time is
 * not being tracked for @ctx (EVENT_TIME clear) or is currently frozen
 * (EVENT_FROZEN set). @final is forwarded to
 * update_cgrp_time_from_cpuctx().
 */
static inline void
__ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final)
{
        if (!(ctx->is_active & EVENT_TIME))
                return;

        if (ctx->is_active & EVENT_FROZEN)
                return;

        update_context_time(ctx);
        update_cgrp_time_from_cpuctx(cpuctx, final);
}

/* Non-final variant of __ctx_time_update(): passes final == false. */
static inline void
ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
        __ctx_time_update(cpuctx, ctx, false);
}

/*
 * Bring the context time up to date once and then freeze it by setting
 * EVENT_FROZEN (only if EVENT_TIME is set).
 *
 * To be used inside perf_ctx_lock() / perf_ctx_unlock(). Lasts until perf_ctx_unlock().
 */
static inline void
ctx_time_freeze(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx)
{
        ctx_time_update(cpuctx, ctx);

        if (!(ctx->is_active & EVENT_TIME))
                return;

        ctx->is_active |= EVENT_FROZEN;
}

/*
 * Per-event flavour of the context time update: refresh the context time
 * and the cgroup time derived from @event, unless time is not tracked for
 * @ctx (EVENT_TIME clear) or is frozen (EVENT_FROZEN set).
 */
static inline void
ctx_time_update_event(struct perf_event_context *ctx, struct perf_event *event)
{
        if (!(ctx->is_active & EVENT_TIME))
                return;

        if (ctx->is_active & EVENT_FROZEN)
                return;

        update_context_time(ctx);
        update_cgrp_time_from_event(event);
}

/*
 * Flags for perf_remove_from_context(), interpreted by
 * __perf_remove_from_context():
 *   DETACH_GROUP - also call perf_group_detach()
 *   DETACH_CHILD - also call perf_child_detach()
 *   DETACH_DEAD  - set pending_disable and cap the state at DEAD
 *   DETACH_EXIT  - cap the state at EXIT
 */
#define DETACH_GROUP        0x01UL
#define DETACH_CHILD        0x02UL
#define DETACH_DEAD        0x04UL
#define DETACH_EXIT        0x08UL

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 *
 * @info carries the DETACH_* flags cast to a pointer.
 */
static void
__perf_remove_from_context(struct perf_event *event,
                           struct perf_cpu_context *cpuctx,
                           struct perf_event_context *ctx,
                           void *info)
{
        struct perf_event_pmu_context *pmu_ctx = event->pmu_ctx;
        enum perf_event_state state = PERF_EVENT_STATE_OFF;
        unsigned long flags = (unsigned long)info;

        ctx_time_update(cpuctx, ctx);

        /*
         * Ensure event_sched_out() switches to OFF, at the very least
         * this avoids raising perf_pending_task() at this time.
         */
        if (flags & DETACH_EXIT)
                state = PERF_EVENT_STATE_EXIT;
        if (flags & DETACH_DEAD) {
                event->pending_disable = 1;
                state = PERF_EVENT_STATE_DEAD;
        }
        event_sched_out(event, ctx);
        /* Never raise the state: keep whichever of the two is lower. */
        perf_event_set_state(event, min(event->state, state));
        if (flags & DETACH_GROUP)
                perf_group_detach(event);
        if (flags & DETACH_CHILD)
                perf_child_detach(event);
        list_del_event(event, ctx);

        /* Last event of this PMU context is gone. */
        if (!pmu_ctx->nr_events) {
                pmu_ctx->rotate_necessary = 0;

                if (ctx->task && ctx->is_active) {
                        struct perf_cpu_pmu_context *cpc = this_cpc(pmu_ctx->pmu);

                        WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                        cpc->task_epc = NULL;
                }
        }

        /* Last event of the whole context is gone: mark it inactive. */
        if (!ctx->nr_events && ctx->is_active) {
                if (ctx == &cpuctx->ctx)
                        update_cgrp_time_from_cpuctx(cpuctx, true);

                ctx->is_active = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
                }
        }
}

/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 *
 * @flags is a mask of DETACH_* bits. Must be called with ctx->mutex held.
 */
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
        struct perf_event_context *ctx = event->ctx;

        lockdep_assert_held(&ctx->mutex);

        /*
         * Because of perf_event_exit_task(), perf_remove_from_context() ought
         * to work in the face of TASK_TOMBSTONE, unlike every other
         * event_function_call() user.
         */
        raw_spin_lock_irq(&ctx->lock);
        if (!ctx->is_active) {
                /* Inactive context: do the removal right here under ctx->lock. */
                __perf_remove_from_context(event, this_cpu_ptr(&perf_cpu_context),
                                           ctx, (void *)flags);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        /* Active context: hand the removal to event_function_call(). */
        event_function_call(event, __perf_remove_from_context, (void *)flags);
}

/*
 * Cross CPU call to disable a performance event
 *
 * Events already below INACTIVE are left alone. Otherwise the event (or,
 * for a group leader, the whole group) is scheduled out with the PMU
 * disabled, and the event itself is moved to OFF. @cpuctx and @info are
 * unused.
 */
static void __perf_event_disable(struct perf_event *event,
                                 struct perf_cpu_context *cpuctx,
                                 struct perf_event_context *ctx,
                                 void *info)
{
        if (event->state < PERF_EVENT_STATE_INACTIVE)
                return;

        perf_pmu_disable(event->pmu_ctx->pmu);
        ctx_time_update_event(ctx, event);

        if (event == event->group_leader)
                group_sched_out(event, ctx);
        else
                event_sched_out(event, ctx);

        perf_event_set_state(event, PERF_EVENT_STATE_OFF);
        perf_cgroup_event_disable(event, ctx);

        perf_pmu_enable(event->pmu_ctx->pmu);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in perf_event_exit_event().
 *
 * When called from perf_pending_disable it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
static void _perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);
        if (event->state <= PERF_EVENT_STATE_OFF) {
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_disable, NULL);
}

/*
 * Disable @event by running __perf_event_disable() through
 * event_function_local() rather than event_function_call().
 */
void perf_event_disable_local(struct perf_event *event)
{
        event_function_local(event, __perf_event_disable, NULL);
}

/*
 * Exported entry point to disable an event: takes the event's context via
 * perf_event_ctx_lock(), calls _perf_event_disable() and releases it again.
 *
 * Strictly speaking kernel users cannot create groups and therefore this
 * interface does not need the perf_event_ctx_lock() magic.
 */
void perf_event_disable(struct perf_event *event)
{
        struct perf_event_context *ctx;

        ctx = perf_event_ctx_lock(event);
        _perf_event_disable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_disable);

/*
 * Request a deferred disable of @event: flag it with pending_disable and
 * queue its pending_disable_irq irq_work. Nothing is disabled here
 * directly.
 */
void perf_event_disable_inatomic(struct perf_event *event)
{
        event->pending_disable = 1;
        irq_work_queue(&event->pending_disable_irq);
}

/*
 * Sentinel for hw.interrupts: event_sched_in() treats this value as
 * "throttled", logs an unthrottle and resets the counter to 0.
 */
#define MAX_INTERRUPTS (~0ULL)

static void perf_log_throttle(struct perf_event *event, int enable);
static void perf_log_itrace_start(struct perf_event *event);

/*
 * Put a single event onto its PMU. Must be called with ctx->lock held.
 *
 * Returns 0 on success, and also when the event is OFF or lower (nothing
 * is done in that case). Returns -EAGAIN when the PMU's add() callback
 * fails; the event is then put back to INACTIVE with oncpu == -1.
 */
static int
event_sched_in(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event_pmu_context *epc = event->pmu_ctx;
        struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);
        int ret = 0;

        WARN_ON_ONCE(event->ctx != ctx);

        lockdep_assert_held(&ctx->lock);

        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;

        WRITE_ONCE(event->oncpu, smp_processor_id());
        /*
         * Order event::oncpu write to happen before the ACTIVE state is
         * visible. This allows perf_event_{stop,read}() to observe the correct
         * ->oncpu if it sees ACTIVE.
         */
        smp_wmb();
        perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);

        /*
         * Unthrottle events, since we scheduled we might have missed several
         * ticks already, also for a heavily scheduling task there is little
         * guarantee it'll get a tick in a timely manner.
         */
        if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
                perf_log_throttle(event, 1);
                event->hw.interrupts = 0;
        }

        perf_pmu_disable(event->pmu);

        perf_log_itrace_start(event);

        /* On failure, undo the optimistic ACTIVE/oncpu update made above. */
        if (event->pmu->add(event, PERF_EF_START)) {
                perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                event->oncpu = -1;
                ret = -EAGAIN;
                goto out;
        }

        /* Bookkeeping mirrored by event_sched_out(). */
        if (!is_software_event(event))
                cpc->active_oncpu++;
        if (event->attr.freq && event->attr.sample_freq) {
                ctx->nr_freq++;
                epc->nr_freq++;
        }
        if (event->attr.exclusive)
                cpc->exclusive = 1;

out:
        perf_pmu_enable(event->pmu);

        return ret;
}

/*
 * Schedule a whole group (leader plus siblings) onto the PMU as one unit,
 * wrapped in a PERF_PMU_TXN_ADD transaction.
 *
 * Returns 0 when the leader is OFF (nothing to do) or when the transaction
 * commits. Returns -EAGAIN otherwise, after scheduling out whatever part
 * of the group had already been scheduled in and cancelling the
 * transaction.
 */
static int
group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx)
{
        struct perf_event *event, *partial_group = NULL;
        struct pmu *pmu = group_event->pmu_ctx->pmu;

        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;

        pmu->start_txn(pmu, PERF_PMU_TXN_ADD);

        /* Leader failed: nothing has been scheduled in, just cancel. */
        if (event_sched_in(group_event, ctx))
                goto error;

        /*
         * Schedule in siblings as one group (if any):
         */
        for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, ctx)) {
                        partial_group = event;
                        goto group_error;
                }
        }

        /*
         * A commit failure falls through with partial_group == NULL, so
         * the rollback below covers every sibling.
         */
        if (!pmu->commit_txn(pmu))
                return 0;

group_error:
        /*
         * Groups can be scheduled in as one unit only, so undo any
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
        for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;

                event_sched_out(event, ctx);
        }
        event_sched_out(group_event, ctx);

error:
        pmu->cancel_txn(pmu);
        return -EAGAIN;
}

/*
 * Decide whether the group led by @event may be put on the CPU right now.
 * @can_add_hw says whether all previously considered groups managed to go
 * on. Returns 1 if the group may be tried, 0 if not.
 */
static int group_can_go_on(struct perf_event *event, int can_add_hw)
{
        struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);

        /*
         * A group made up solely of software events is never refused.
         */
        if (event->group_caps & PERF_EV_CAP_SOFTWARE)
                return 1;

        /*
         * Once an exclusive group is on, no further hardware events
         * are admitted.
         */
        if (cpc->exclusive)
                return 0;

        /*
         * An exclusive group cannot join while other events are
         * already on the CPU.
         */
        if (event->attr.exclusive) {
                struct list_head *active = get_event_list(event);

                if (!list_empty(active))
                        return 0;
        }

        /*
         * Otherwise it may go on only if every earlier group could.
         */
        return can_add_hw;
}

/*
 * Attach @event to @ctx: first add it to the context lists, then attach
 * it to its group.
 */
static void add_event_to_ctx(struct perf_event *event,
                               struct perf_event_context *ctx)
{
        list_add_event(event, ctx);
        perf_group_attach(event);
}

/*
 * Schedule out the events of task context @ctx selected by @pmu and
 * @event_type. Does nothing when this CPU has no task context installed;
 * warns once and bails out if the installed one is not @ctx.
 */
static void task_ctx_sched_out(struct perf_event_context *ctx,
                               struct pmu *pmu,
                               enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);

        if (!cpuctx->task_ctx)
                return;

        if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
                return;

        ctx_sched_out(ctx, pmu, event_type);
}

static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                struct pmu *pmu)
{
        ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED);
        if (ctx)
                 ctx_sched_in(ctx, pmu, EVENT_PINNED);
        ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);
        if (ctx)
                 ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE);
}

/*
 * We want to maintain the following priority of scheduling:
 *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
 *  - task pinned (EVENT_PINNED)
 *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
 *  - task flexible (EVENT_FLEXIBLE).
 *
 * In order to avoid unscheduling and scheduling back in everything every
 * time an event is added, only do it for the groups of equal priority and
 * below.
 *
 * This can be called after a batch operation on task events, in which case
 * event_type is a bit mask of the types of events involved. For CPU events,
 * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
 *
 * @cpuctx:     CPU context to reschedule
 * @task_ctx:   task context to reschedule together with it, or NULL
 * @pmu:        handed on to for_each_epc() and the sched in/out helpers
 * @event_type: EVENT_* bits describing what changed; EVENT_CPU set means the
 *              change concerns a CPU event
 */
static void ctx_resched(struct perf_cpu_context *cpuctx,
                        struct perf_event_context *task_ctx,
                        struct pmu *pmu, enum event_type_t event_type)
{
        /* Sample EVENT_CPU now; event_type is masked with EVENT_ALL below. */
        bool cpu_event = !!(event_type & EVENT_CPU);
        struct perf_event_pmu_context *epc;

        /*
         * If pinned groups are involved, flexible groups also need to be
         * scheduled out.
         */
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;

        event_type &= EVENT_ALL;

        /*
         * Disable the PMUs of both contexts for the whole operation; the
         * matching perf_pmu_enable() calls are at the end of the function.
         */
        for_each_epc(epc, &cpuctx->ctx, pmu, false)
                perf_pmu_disable(epc->pmu);

        if (task_ctx) {
                for_each_epc(epc, task_ctx, pmu, false)
                        perf_pmu_disable(epc->pmu);

                task_ctx_sched_out(task_ctx, pmu, event_type);
        }

        /*
         * Decide which cpu ctx groups to schedule out based on the types
         * of events that caused rescheduling:
         *  - EVENT_CPU: schedule out corresponding groups;
         *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
         *  - otherwise, do nothing more.
         */
        if (cpu_event)
                ctx_sched_out(&cpuctx->ctx, pmu, event_type);
        else if (event_type & EVENT_PINNED)
                ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE);

        /* Schedule everything back in, in priority order. */
        perf_event_sched_in(cpuctx, task_ctx, pmu);

        for_each_epc(epc, &cpuctx->ctx, pmu, false)
                perf_pmu_enable(epc->pmu);

        if (task_ctx) {
                for_each_epc(epc, task_ctx, pmu, false)
                        perf_pmu_enable(epc->pmu);
        }
}

/*
 * Reschedule all event types (EVENT_ALL | EVENT_CPU) of @pmu on the current
 * CPU, covering the CPU context and the task context installed on it, if
 * any. The contexts are locked around the rescheduling.
 */
void perf_pmu_resched(struct pmu *pmu)
{
        struct perf_cpu_context *cpuctx;
        struct perf_event_context *curr_task_ctx;

        cpuctx = this_cpu_ptr(&perf_cpu_context);
        curr_task_ctx = cpuctx->task_ctx;

        perf_ctx_lock(cpuctx, curr_task_ctx);
        ctx_resched(cpuctx, curr_task_ctx, pmu, EVENT_ALL|EVENT_CPU);
        perf_ctx_unlock(cpuctx, curr_task_ctx);
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Very similar to remote_function() + event_function() but cannot assume that
 * things like ctx->is_active and cpuctx->task_ctx are set.
 *
 * @info is the struct perf_event to install; the target context is taken
 * from event->ctx.
 *
 * Returns 0 on success, or -ESRCH when the context's task is currently
 * running but is not 'current' on this CPU; nothing is installed then.
 */
static int  __perf_install_in_context(void *info)
{
        struct perf_event *event = info;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *task_ctx = cpuctx->task_ctx;
        bool reprogram = true;
        int ret = 0;

        /* Lock order: the CPU context first, then the task context, if any. */
        raw_spin_lock(&cpuctx->ctx.lock);
        if (ctx->task) {
                raw_spin_lock(&ctx->lock);
                /* From here on the event's own context is the task context. */
                task_ctx = ctx;

                reprogram = (ctx->task == current);

                /*
                 * If the task is running, it must be running on this CPU,
                 * otherwise we cannot reprogram things.
                 *
                 * If it's not running, we don't care, ctx->lock will
                 * serialize against it becoming runnable.
                 */
                if (task_curr(ctx->task) && !reprogram) {
                        ret = -ESRCH;
                        goto unlock;
                }

                WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
        } else if (task_ctx) {
                /*
                 * CPU event: the task context installed on this CPU is
                 * still locked, as ctx_resched() below is handed it too.
                 */
                raw_spin_lock(&task_ctx->lock);
        }

#ifdef CONFIG_CGROUP_PERF
        if (event->state > PERF_EVENT_STATE_OFF && is_cgroup_event(event)) {
                /*
                 * If the current cgroup doesn't match the event's
                 * cgroup, we should not try to schedule it.
                 */
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
                reprogram = cgroup_is_descendant(cgrp->css.cgroup,
                                        event->cgrp->css.cgroup);
        }
#endif

        if (reprogram) {
                ctx_time_freeze(cpuctx, ctx);
                add_event_to_ctx(event, ctx);
                ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
                            get_event_type(event));
        } else {
                /* Only link the event into the context; no rescheduling. */
                add_event_to_ctx(event, ctx);
        }

unlock:
        perf_ctx_unlock(cpuctx, task_ctx);

        return ret;
}

/*
 * Forward declaration: perf_install_in_context() below uses this in a sanity
 * check ahead of the function's definition.
 */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx);

/*
 * Attach a performance event to a context.
 *
 * Very similar to event_function_call, see comment there.
 *
 * @ctx:   context to attach to; ctx->mutex must be held
 * @event: event to attach; event->ctx is set to @ctx here
 * @cpu:   CPU used for the cross call when @ctx has no task; must equal
 *         event->cpu unless that is -1
 */
static void
perf_install_in_context(struct perf_event_context *ctx,
                        struct perf_event *event,
                        int cpu)
{
        struct task_struct *task = READ_ONCE(ctx->task);

        lockdep_assert_held(&ctx->mutex);

        WARN_ON_ONCE(!exclusive_event_installable(event, ctx));

        if (event->cpu != -1)
                WARN_ON_ONCE(event->cpu != cpu);

        /*
         * Ensures that if we can observe event->ctx, both the event and ctx
         * will be 'complete'. See perf_iterate_sb_cpu().
         */
        smp_store_release(&event->ctx, ctx);

        /*
         * perf_event_attr::disabled events will not run and can be initialized
         * without IPI. Except when this is the first event for the context, in
         * that case we need the magic of the IPI to set ctx->is_active.
         *
         * The IOC_ENABLE that is sure to follow the creation of a disabled
         * event will issue the IPI and reprogram the hardware.
         */
        if (__perf_effective_state(event) == PERF_EVENT_STATE_OFF &&
            ctx->nr_events && !is_cgroup_event(event)) {
                raw_spin_lock_irq(&ctx->lock);
                /* The context is dead; do not add the event to it. */
                if (ctx->task == TASK_TOMBSTONE) {
                        raw_spin_unlock_irq(&ctx->lock);
                        return;
                }
                add_event_to_ctx(event, ctx);
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }

        /* No task: a CPU context, so install via a cross call to @cpu. */
        if (!task) {
                cpu_function_call(cpu, __perf_install_in_context, event);
                return;
        }

        /*
         * Should not happen, we validate the ctx is still alive before calling.
         */
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
                return;

        /*
         * Installing events is tricky because we cannot rely on ctx->is_active
         * to be set in case this is the nr_events 0 -> 1 transition.
         *
         * Instead we use task_curr(), which tells us if the task is running.
         * However, since we use task_curr() outside of rq::lock, we can race
         * against the actual state. This means the result can be wrong.
         *
         * If we get a false positive, we retry, this is harmless.
         *
         * If we get a false negative, things are complicated. If we are after
         * perf_event_context_sched_in() ctx::lock will serialize us, and the
         * value must be correct. If we're before, it doesn't matter since
         * perf_event_context_sched_in() will program the counter.
         *
         * However, this hinges on the remote context switch having observed
         * our task->perf_event_ctxp[] store, such that it will in fact take
         * ctx::lock in perf_event_context_sched_in().
         *
         * We do this by task_function_call(), if the IPI fails to hit the task
         * we know any future context switch of task must see the
         * perf_event_ctxp[] store.
         */

        /*
         * This smp_mb() orders the task->perf_event_ctxp[] store with the
         * task_cpu() load, such that if the IPI then does not find the task
         * running, a future context switch of that task must observe the
         * store.
         */
        smp_mb();
again:
        /* A zero return means the cross call did the installation. */
        if (!task_function_call(task, __perf_install_in_context, event))
                return;

        raw_spin_lock_irq(&ctx->lock);
        task = ctx->task;
        if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
                /*
                 * Cannot happen because we already checked above (which also
                 * cannot happen), and we hold ctx->mutex, which serializes us
                 * against perf_event_exit_task_context().
                 */
                raw_spin_unlock_irq(&ctx->lock);
                return;
        }
        /*
         * If the task is not running, ctx->lock will avoid it becoming so,
         * thus we can safely install the event.
         */
        if (task_curr(task)) {
                /* It is running after all: retry the cross call. */
                raw_spin_unlock_irq(&ctx->lock);
                goto again;
        }
        add_event_to_ctx(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Cross CPU call to enable a performance event.
 *
 * Puts @event into the INACTIVE state and, when @ctx is active and the event
 * is eligible here, reschedules so the event can go on. @info is unused.
 */
static void __perf_event_enable(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *task_ctx;

        /* Only states strictly between ERROR and INACTIVE get enabled. */
        if (event->state <= PERF_EVENT_STATE_ERROR ||
            event->state >= PERF_EVENT_STATE_INACTIVE)
                return;

        ctx_time_freeze(cpuctx, ctx);

        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
        perf_cgroup_event_enable(event, ctx);

        /* Rescheduling needs an active context and a matching event. */
        if (!ctx->is_active || !event_filter_match(event))
                return;

        /*
         * A group member that is not the leader must not be put on
         * unless the group itself is on.
         */
        if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
                return;

        task_ctx = cpuctx->task_ctx;
        WARN_ON_ONCE(ctx->task && task_ctx != ctx);

        ctx_resched(cpuctx, task_ctx, event->pmu_ctx->pmu,
                    get_event_type(event));
}

/*
 * Enable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each as described
 * for perf_event_disable.
 */
static void _perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;

        raw_spin_lock_irq(&ctx->lock);

        /* Already INACTIVE or beyond, or below ERROR: nothing to enable. */
        if (event->state >= PERF_EVENT_STATE_INACTIVE ||
            event->state <  PERF_EVENT_STATE_ERROR)
                goto out_unlock;

        /*
         * If the event is in error state, clear that first.
         *
         * That way, if we see the event in error state below, we know that it
         * has gone back into error state, as distinct from the task having
         * been scheduled away before the cross-call arrived.
         */
        if (event->state == PERF_EVENT_STATE_ERROR) {
                /*
                 * Detached SIBLING events cannot leave ERROR state.
                 */
                if (event->event_caps & PERF_EV_CAP_SIBLING &&
                    event->group_leader == event)
                        goto out_unlock;

                event->state = PERF_EVENT_STATE_OFF;
        }
        raw_spin_unlock_irq(&ctx->lock);

        event_function_call(event, __perf_event_enable, NULL);
        return;

out_unlock:
        raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Enable @event with its context locked around the operation.
 *
 * See perf_event_disable();
 */
void perf_event_enable(struct perf_event *event)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);

        _perf_event_enable(event);
        perf_event_ctx_unlock(event, ctx);
}
EXPORT_SYMBOL_GPL(perf_event_enable);

/*
 * Argument bundle that perf_event_stop() hands to __perf_event_stop()
 * through cpu_function_call().
 */
struct stop_event_data {
        struct perf_event        *event;        /* event to stop */
        unsigned int                restart;        /* non-zero: start the event again after stopping it */
};

/*
 * Cross-CPU callback that stops an event through its PMU and, if requested,
 * starts it again.
 *
 * @info points to a struct stop_event_data.
 *
 * Returns 0 when the event was not ACTIVE or was handled here, and -EAGAIN
 * when the event is ACTIVE but its ->oncpu is not the executing CPU.
 */
static int __perf_event_stop(void *info)
{
        struct stop_event_data *sd = info;
        struct perf_event *event = sd->event;

        /* if it's not ACTIVE, do nothing */
        if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                return 0;

        /* matches smp_wmb() in event_sched_in() */
        smp_rmb();

        /*
         * There is a window with interrupts enabled before we get here,
         * so we need to check again lest we try to stop another CPU's event.
         */
        if (READ_ONCE(event->oncpu) != smp_processor_id())
                return -EAGAIN;

        event->pmu->stop(event, PERF_EF_UPDATE);

        /*
         * May race with the actual stop (through perf_pmu_output_stop()),
         * but it is only used for events with AUX ring buffer, and such
         * events will refuse to restart because of rb::aux_mmap_count==0,
         * see comments in perf_aux_output_begin().
         *
         * Since this is happening on an event-local CPU, no trace is lost
         * while restarting.
         */
        if (sd->restart)
                event->pmu->start(event, 0);

        return 0;
}

/*
 * Stop @event on the CPU it is running on, optionally restarting it
 * (@restart non-zero).
 *
 * Returns 0 when the event is not ACTIVE, otherwise the result of the cross
 * call. The cross call is repeated for as long as it reports -EAGAIN.
 */
static int perf_event_stop(struct perf_event *event, int restart)
{
        struct stop_event_data sd = {
                .event   = event,
                .restart = restart,
        };
        int ret;

        for (;;) {
                if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
                        return 0;

                /* matches smp_wmb() in event_sched_in() */
                smp_rmb();

                /*
                 * We only want to restart ACTIVE events, so if the event goes
                 * inactive here (event->oncpu==-1), there's nothing more to do;
                 * the call then returns -ENXIO, which is passed on.
                 */
                ret = cpu_function_call(READ_ONCE(event->oncpu),
                                        __perf_event_stop, &sd);
                if (ret != -EAGAIN)
                        return ret;
        }
}

/*
 * In order to contain the amount of racy and tricky code in the address filter
 * configuration management, it is a two-part process:
 *
 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
 *      we update the addresses of corresponding vmas in
 *        event::addr_filter_ranges array and bump the event::addr_filters_gen;
 * (p2) when an event is scheduled in (pmu::add), it calls
 *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
 *      if the generation has changed since the previous call.
 *
 * If (p1) happens while the event is active, we restart it to force (p2).
 *
 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
 *     pre-existing mappings, called once when new filters arrive via SET_FILTER
 *     ioctl;
 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
 *     registered mapping, called for every new mmap(), with mm::mmap_lock down
 *     for reading;
 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
 *     of exec.
 */
void perf_event_addr_filters_sync(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

        if (has_addr_filter(event)) {
                raw_spin_lock(&ifh->lock);
                /* Call into the PMU only when the generation has moved on. */
                if (event->hw.addr_filters_gen != event->addr_filters_gen) {
                        event->pmu->addr_filters_sync(event);
                        event->hw.addr_filters_gen = event->addr_filters_gen;
                }
                raw_spin_unlock(&ifh->lock);
        }
}
EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);

/*
 * Add @refresh to the event's event_limit and enable the event.
 *
 * Returns 0 on success, -EINVAL for inherited events (not supported) and
 * for events that are not sampling events.
 */
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
        if (!event->attr.inherit && is_sampling_event(event)) {
                atomic_add(refresh, &event->event_limit);
                _perf_event_enable(event);
                return 0;
        }

        return -EINVAL;
}

/*
 * Locked wrapper around _perf_event_refresh(); returns its result.
 *
 * See perf_event_disable()
 */
int perf_event_refresh(struct perf_event *event, int refresh)
{
        struct perf_event_context *ctx = perf_event_ctx_lock(event);
        int err = _perf_event_refresh(event, refresh);

        perf_event_ctx_unlock(event, ctx);

        return err;
}
EXPORT_SYMBOL_GPL(perf_event_refresh);

/*
 * Apply @attr to breakpoint event @bp: disable it, modify it, and enable it
 * again unless bp->attr.disabled is set afterwards.
 *
 * Returns the result of modify_user_hw_breakpoint_check(). The re-enable
 * decision does not depend on that result.
 */
static int perf_event_modify_breakpoint(struct perf_event *bp,
                                         struct perf_event_attr *attr)
{
        int ret;

        _perf_event_disable(bp);

        ret = modify_user_hw_breakpoint_check(bp, attr, true);

        if (bp->attr.disabled)
                return ret;

        _perf_event_enable(bp);
        return ret;
}

/*
 * Copy event-type-independent attributes that may be modified.
 *
 * Only sig_data is copied from @from to @to.
 */
static void perf_event_modify_copy_attr(struct perf_event_attr *to,
                                        const struct perf_event_attr *from)
{
        to->sig_data = from->sig_data;
}

/*
 * Modify the attributes of @event and of every event on its child_list
 * according to @attr.
 *
 * Returns -EINVAL when the types of @event and @attr differ, -EOPNOTSUPP
 * for event types without a modify handler, otherwise 0 or the first error
 * a handler returns (processing stops at that point).
 */
static int perf_event_modify_attr(struct perf_event *event,
                                  struct perf_event_attr *attr)
{
        int (*modify)(struct perf_event *, struct perf_event_attr *);
        struct perf_event *child;
        int err;

        if (event->attr.type != attr->type)
                return -EINVAL;

        switch (event->attr.type) {
        case PERF_TYPE_BREAKPOINT:
                modify = perf_event_modify_breakpoint;
                break;
        default:
                /* Place holder for future additions. */
                return -EOPNOTSUPP;
        }

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);

        /*
         * Event-type-independent attributes must be copied before event-type
         * modification, which will validate that final attributes match the
         * source attributes after all relevant attributes have been copied.
         */
        perf_event_modify_copy_attr(&event->attr, attr);
        err = modify(event, attr);
        if (!err) {
                list_for_each_entry(child, &event->child_list, child_list) {
                        perf_event_modify_copy_attr(&child->attr, attr);
                        err = modify(child, attr);
                        if (err)
                                break;
                }
        }

        mutex_unlock(&event->child_mutex);
        return err;
}

/*
 * Schedule out the active groups of one PMU context.
 *
 * @event_type selects which active lists are emptied: EVENT_PINNED for
 * pinned_active, EVENT_FLEXIBLE for flexible_active. For a task context
 * with no EVENT_ALL bit left in ctx->is_active, this CPU's task_epc pointer
 * for the PMU is cleared regardless of @event_type.
 */
static void __pmu_ctx_sched_out(struct perf_event_pmu_context *pmu_ctx,
                                enum event_type_t event_type)
{
        struct perf_event_context *ctx = pmu_ctx->ctx;
        struct pmu *pmu = pmu_ctx->pmu;
        struct perf_event *group, *next;

        if (ctx->task && !(ctx->is_active & EVENT_ALL)) {
                struct perf_cpu_pmu_context *cpc = this_cpc(pmu);

                WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                cpc->task_epc = NULL;
        }

        if (event_type & EVENT_ALL) {
                perf_pmu_disable(pmu);

                if (event_type & EVENT_PINNED) {
                        list_for_each_entry_safe(group, next,
                                                 &pmu_ctx->pinned_active,
                                                 active_list)
                                group_sched_out(group, ctx);
                }

                if (event_type & EVENT_FLEXIBLE) {
                        list_for_each_entry_safe(group, next,
                                                 &pmu_ctx->flexible_active,
                                                 active_list)
                                group_sched_out(group, ctx);
                        /*
                         * Since we cleared EVENT_FLEXIBLE, also clear
                         * rotate_necessary, it will be reset by
                         * ctx_flexible_sched_in() when needed.
                         */
                        pmu_ctx->rotate_necessary = 0;
                }

                perf_pmu_enable(pmu);
        }
}

/*
 * Schedule the @event_type events of @ctx out and update ctx->is_active
 * accordingly. The caller must hold ctx->lock.
 *
 * Be very careful with the @pmu argument since this will change ctx state.
 * The @pmu argument works for ctx_resched(), because that is symmetric in
 * ctx_sched_out() / ctx_sched_in() usage and the ctx state ends up invariant.
 *
 * However, if you were to be asymmetrical, you could end up with messed up
 * state, eg. ctx->is_active cleared even though most EPCs would still actually
 * be active.
 */
static void
ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *pmu_ctx;
        /* Snapshot, used below to work out which bits this call cleared. */
        int is_active = ctx->is_active;
        /*
         * EVENT_CGROUP is only passed on to for_each_epc(); it is stripped
         * from event_type before ->is_active is modified.
         */
        bool cgroup = event_type & EVENT_CGROUP;

        event_type &= ~EVENT_CGROUP;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events)) {
                /*
                 * See __perf_remove_from_context().
                 */
                WARN_ON_ONCE(ctx->is_active);
                if (ctx->task)
                        WARN_ON_ONCE(cpuctx->task_ctx);
                return;
        }

        /*
         * Always update time if it was set; not only when it changes.
         * Otherwise we can 'forget' to update time for any but the last
         * context we sched out. For example:
         *
         *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
         *   ctx_sched_out(.event_type = EVENT_PINNED)
         *
         * would only update time for the pinned events.
         */
        __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx);

        /*
         * CPU-release for the below ->is_active store,
         * see __load_acquire() in perf_event_time_now()
         */
        barrier();
        ctx->is_active &= ~event_type;

        if (!(ctx->is_active & EVENT_ALL)) {
                /*
                 * For FROZEN, preserve TIME|FROZEN such that perf_event_time_now()
                 * does not observe a hole. perf_ctx_unlock() will clean up.
                 */
                if (ctx->is_active & EVENT_FROZEN)
                        ctx->is_active &= EVENT_TIME_FROZEN;
                else
                        ctx->is_active = 0;
        }

        if (ctx->task) {
                WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                /* No EVENT_ALL bit left: detach the task context from this CPU. */
                if (!(ctx->is_active & EVENT_ALL))
                        cpuctx->task_ctx = NULL;
        }

        is_active ^= ctx->is_active; /* changed bits */

        /* Only the bits that actually changed are passed down. */
        for_each_epc(pmu_ctx, ctx, pmu, cgroup)
                __pmu_ctx_sched_out(pmu_ctx, is_active);
}

/*
 * Test whether two contexts are equivalent, i.e. whether they have both been
 * cloned from the same version of the same context.
 *
 * Equivalence is measured using a generation number in the context that is
 * incremented on each modification to it; see unclone_ctx(), list_add_event()
 * and list_del_event().
 *
 * Both context locks must be held. Returns 1 when equivalent, 0 otherwise.
 */
static int context_equiv(struct perf_event_context *ctx1,
                         struct perf_event_context *ctx2)
{
        int ctx1_is_parent, ctx2_is_parent, same_parent;

        lockdep_assert_held(&ctx1->lock);
        lockdep_assert_held(&ctx2->lock);

        /* Pinning disables the swap optimization */
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;

        /* ctx1 is the parent of ctx2, at the generation ctx2 was cloned from */
        ctx1_is_parent = ctx2->parent_ctx == ctx1 &&
                         ctx2->parent_gen == ctx1->generation;

        /* ctx2 is the parent of ctx1, at the generation ctx1 was cloned from */
        ctx2_is_parent = ctx1->parent_ctx == ctx2 &&
                         ctx1->parent_gen == ctx2->generation;

        /*
         * ctx1 and ctx2 share a parent; the parent hierarchy is flattened,
         * see perf_event_init_context().
         */
        same_parent = ctx1->parent_ctx &&
                      ctx1->parent_ctx == ctx2->parent_ctx &&
                      ctx1->parent_gen == ctx2->parent_gen;

        return ctx1_is_parent || ctx2_is_parent || same_parent;
}

/*
 * Exchange the count and the enabled/running times between @event and
 * @next_event. Does nothing unless @event has attr.inherit_stat set.
 */
static void __perf_event_sync_stat(struct perf_event *event,
                                     struct perf_event *next_event)
{
        u64 next_count, prev_count;

        if (!event->attr.inherit_stat)
                return;

        /*
         * Bring the event value up to date. perf_event_read() cannot be
         * used here: we are in the middle of a context switch with IRQs
         * disabled, which upsets smp_call_function_single(). It is not
         * needed either, since the event must be on the current CPU.
         */
        perf_pmu_read(event);

        perf_event_update_time(event);

        /*
         * To keep per-task stats reliable, the event values have to be
         * flipped whenever the contexts are flipped.
         */
        next_count = local64_read(&next_event->count);
        prev_count = local64_xchg(&event->count, next_count);
        local64_set(&next_event->count, prev_count);

        swap(event->total_time_enabled, next_event->total_time_enabled);
        swap(event->total_time_running, next_event->total_time_running);

        /* The values were swizzled, so refresh the user visible data too. */
        perf_event_update_userpage(event);
        perf_event_update_userpage(next_event);
}

/*
 * Walk the event lists of @ctx and @next_ctx in lockstep and sync the stats
 * of each pair of events. Stops as soon as either list is exhausted. Does
 * nothing when @ctx has no nr_stat.
 */
static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
{
        struct perf_event *event, *next_event;

        if (!ctx->nr_stat)
                return;

        update_context_time(ctx);

        for (event = list_first_entry(&ctx->event_list,
                                      struct perf_event, event_entry),
             next_event = list_first_entry(&next_ctx->event_list,
                                           struct perf_event, event_entry);
             &event->event_entry != &ctx->event_list &&
             &next_event->event_entry != &next_ctx->event_list;
             event = list_next_entry(event, event_entry),
             next_event = list_next_entry(next_event, event_entry))
                __perf_event_sync_stat(event, next_event);
}

/*
 * Invoke the sched_task() callback of every PMU in @ctx that provides one
 * and has sched_cb_usage set on this CPU.
 */
static void perf_ctx_sched_task_cb(struct perf_event_context *ctx,
                                   struct task_struct *task, bool sched_in)
{
        struct perf_event_pmu_context *epc;

        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                struct perf_cpu_pmu_context *cpc = this_cpc(epc->pmu);

                if (!cpc->sched_cb_usage || !epc->pmu->sched_task)
                        continue;

                epc->pmu->sched_task(epc, task, sched_in);
        }
}

/*
 * Schedule the perf context of @task out as @task is switched away in
 * favour of @next.
 *
 * Fast path: when the contexts of both tasks are equivalent clones (see
 * context_equiv()) and neither has nr_no_switch_fast set, the two contexts
 * are exchanged between the tasks and nothing is scheduled out. In every
 * other case the events are scheduled out with task_ctx_sched_out().
 */
static void
perf_event_context_sched_out(struct task_struct *task, struct task_struct *next)
{
        struct perf_event_context *ctx = task->perf_event_ctxp;
        struct perf_event_context *next_ctx;
        struct perf_event_context *parent, *next_parent;
        int do_switch = 1;

        if (likely(!ctx))
                return;

        rcu_read_lock();
        next_ctx = rcu_dereference(next->perf_event_ctxp);
        if (!next_ctx)
                goto unlock;

        parent = rcu_dereference(ctx->parent_ctx);
        next_parent = rcu_dereference(next_ctx->parent_ctx);

        /* If neither context have a parent context; they cannot be clones. */
        if (!parent && !next_parent)
                goto unlock;

        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                /*
                 * Looks like the two contexts are clones, so we might be
                 * able to optimize the context switch.  We lock both
                 * contexts and check that they are clones under the
                 * lock (including re-checking that neither has been
                 * uncloned in the meantime).  It doesn't matter which
                 * order we take the locks because no other cpu could
                 * be trying to lock both of these tasks.
                 */
                raw_spin_lock(&ctx->lock);
                raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
                if (context_equiv(ctx, next_ctx)) {

                        perf_ctx_disable(ctx, false);

                        /* PMIs are disabled; ctx->nr_no_switch_fast is stable. */
                        if (local_read(&ctx->nr_no_switch_fast) ||
                            local_read(&next_ctx->nr_no_switch_fast)) {
                                /*
                                 * Must not swap out ctx when there's pending
                                 * events that rely on the ctx->task relation.
                                 *
                                 * Likewise, when a context contains inherit +
                                 * SAMPLE_READ events they should be switched
                                 * out using the slow path so that they are
                                 * treated as if they were distinct contexts.
                                 */
                                /*
                                 * Jump into the slow path with ctx->lock
                                 * still held and ctx still disabled; only
                                 * next_ctx->lock and the RCU read lock are
                                 * dropped here.
                                 */
                                raw_spin_unlock(&next_ctx->lock);
                                rcu_read_unlock();
                                goto inside_switch;
                        }

                        WRITE_ONCE(ctx->task, next);
                        WRITE_ONCE(next_ctx->task, task);

                        perf_ctx_sched_task_cb(ctx, task, false);

                        perf_ctx_enable(ctx, false);

                        /*
                         * RCU_INIT_POINTER here is safe because we've not
                         * modified the ctx and the above modification of
                         * ctx->task is immaterial since this value is
                         * always verified under ctx->lock which we're now
                         * holding.
                         */
                        RCU_INIT_POINTER(task->perf_event_ctxp, next_ctx);
                        RCU_INIT_POINTER(next->perf_event_ctxp, ctx);

                        /* The contexts were swapped; skip the slow path. */
                        do_switch = 0;

                        perf_event_sync_stat(ctx, next_ctx);
                }
                raw_spin_unlock(&next_ctx->lock);
                raw_spin_unlock(&ctx->lock);
        }
unlock:
        rcu_read_unlock();

        if (do_switch) {
                raw_spin_lock(&ctx->lock);
                perf_ctx_disable(ctx, false);

inside_switch:
                perf_ctx_sched_task_cb(ctx, task, false);
                task_ctx_sched_out(ctx, NULL, EVENT_ALL);

                perf_ctx_enable(ctx, false);
                raw_spin_unlock(&ctx->lock);
        }
}

/*
 * Per-CPU list of the perf_cpu_pmu_context entries that have sched_task
 * callback users on that CPU; linked through ->sched_cb_entry by
 * perf_sched_cb_inc() and perf_sched_cb_dec().
 */
static DEFINE_PER_CPU(struct list_head, sched_cb_list);
/*
 * Per-CPU count of perf_sched_cb_inc() calls that have not yet been undone
 * by perf_sched_cb_dec().
 */
static DEFINE_PER_CPU(int, perf_sched_cb_usages);

/*
 * Drop one sched_task callback user of @pmu on this CPU. When the last user
 * goes away, the PMU's per-CPU context is removed from sched_cb_list.
 */
void perf_sched_cb_dec(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpc = this_cpc(pmu);

        /* The per-CPU total is lowered first, before the list is touched. */
        this_cpu_dec(perf_sched_cb_usages);
        barrier();

        cpc->sched_cb_usage--;
        if (cpc->sched_cb_usage == 0)
                list_del(&cpc->sched_cb_entry);
}


/*
 * Register one more sched_task callback user of @pmu on this CPU. The first
 * user puts the PMU's per-CPU context on sched_cb_list.
 */
void perf_sched_cb_inc(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpc = this_cpc(pmu);
        int first_user = !cpc->sched_cb_usage;

        cpc->sched_cb_usage++;
        if (first_user)
                list_add(&cpc->sched_cb_entry, this_cpu_ptr(&sched_cb_list));

        /* The per-CPU total is raised last, after the list is updated. */
        barrier();
        this_cpu_inc(perf_sched_cb_usages);
}

/*
 * This function provides the context switch callback to the lower code
 * layer. It is invoked ONLY when the context switch callback is enabled.
 *
 * This callback is relevant even to per-cpu events; for example multi event
 * PEBS requires this to provide PID/TID information. This requires we flush
 * all queued PEBS records before we context switch to a new task.
 *
 * The callback runs with the contexts locked and the PMU disabled.
 */
static void __perf_pmu_sched_task(struct perf_cpu_pmu_context *cpc,
                                  struct task_struct *task, bool sched_in)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = cpc->epc.pmu;

        /* software PMUs will not have sched_task */
        if (WARN_ON_ONCE(!pmu->sched_task))
                return;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        pmu->sched_task(cpc->task_epc, task, sched_in);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}

/*
 * Run the sched_task callback of every PMU context on this CPU's
 * sched_cb_list: for @next when scheduling in, for @prev when scheduling out.
 */
static void perf_pmu_sched_task(struct task_struct *prev,
                                struct task_struct *next,
                                bool sched_in)
{
        struct perf_cpu_context *cpu_ctx = this_cpu_ptr(&perf_cpu_context);
        struct task_struct *target = sched_in ? next : prev;
        struct perf_cpu_pmu_context *pos;

        /* Not a real task switch, nothing to tell the PMUs. */
        if (prev == next)
                return;

        /* cpuctx->task_ctx will be handled in perf_event_context_sched_in/out */
        if (cpu_ctx->task_ctx)
                return;

        list_for_each_entry(pos, this_cpu_ptr(&sched_cb_list), sched_cb_entry)
                __perf_pmu_sched_task(pos, target, sched_in);
}

static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in);

/*
 * Scheduler hook that removes the events of the current task; runs with
 * interrupts disabled.
 *
 * Each event is stopped and its value is folded into event->count.
 *
 * NMIs are not excluded here. disable() sets the disabled bit in the control
 * field of the event _before_ it accesses the event control register, so an
 * NMI that hits in between will not restart the event.
 */
void __perf_event_task_sched_out(struct task_struct *task,
                                 struct task_struct *next)
{
        /* PMU sched_task callbacks are only run while this CPU has users. */
        if (__this_cpu_read(perf_sched_cb_usages) != 0)
                perf_pmu_sched_task(task, next, false);

        /* perf_event_switch() is only needed while switch events exist. */
        if (atomic_read(&nr_switch_events) != 0)
                perf_event_switch(task, next, false);

        perf_event_context_sched_out(task, next);

        /*
         * Cgroup events are system-wide mode only. If any exist on this CPU
         * we have to check whether the PMU state must be switched out.
         */
        perf_cgroup_switch(next);
}

/*
 * Min-heap ordering callback: @l and @r each point at a heap slot holding a
 * perf_event pointer. Returns true when the left event has the smaller
 * group_index. @args is unused.
 */
static bool perf_less_group_idx(const void *l, const void *r, void __always_unused *args)
{
        const struct perf_event * const *lhs = l;
        const struct perf_event * const *rhs = r;

        return (*lhs)->group_index < (*rhs)->group_index;
}

/* Heap type whose elements are struct perf_event pointers. */
DEFINE_MIN_HEAP(struct perf_event *, perf_event_min_heap);

/*
 * Heap callbacks used by visit_groups_merge(): elements are ordered by
 * group_index through perf_less_group_idx(). No swap callback is supplied;
 * presumably the min_heap code then uses its built-in swap -- confirm against
 * the min_heap implementation.
 */
static const struct min_heap_callbacks perf_min_heap = {
        .less = perf_less_group_idx,
        .swp = NULL,
};

/*
 * Append @event to the backing array of @heap and bump the element count.
 * A NULL @event is ignored. No heap ordering is established here and no
 * capacity check is made.
 */
static void __heap_add(struct perf_event_min_heap *heap, struct perf_event *event)
{
        struct perf_event **slots = heap->data;

        if (!event)
                return;

        slots[heap->nr++] = event;
}

/*
 * Record @pmu_ctx as the task PMU context (->task_epc) of this CPU's PMU
 * context. Does nothing for a context that has no task. Warns once if a
 * different task PMU context is already linked.
 */
static void __link_epc(struct perf_event_pmu_context *pmu_ctx)
{
        struct perf_cpu_pmu_context *cpc;

        if (pmu_ctx->ctx->task) {
                cpc = this_cpc(pmu_ctx->pmu);
                WARN_ON_ONCE(cpc->task_epc && cpc->task_epc != pmu_ctx);
                cpc->task_epc = pmu_ctx;
        }
}

/*
 * Call @func(event, @data) on the events of @groups that belong to @pmu and
 * match @cpu, merging several sub-lists by group_index.
 *
 * One iterator is seeded per sub-list that has a first event:
 *   - for a task context, the events looked up with cpu == -1;
 *   - the events looked up with @cpu;
 *   - with CONFIG_CGROUP_PERF and a CPU context, one per cgroup, starting at
 *     cpuctx->cgrp and following ->parent.
 * The iterators live in a min-heap ordered by group_index (perf_min_heap), so
 * the heap top is always the next event to visit.
 *
 * A CPU context uses cpuctx->heap as storage and requires cpuctx->ctx.lock to
 * be held; a task context uses the two-entry on-stack array.
 *
 * Returns 0 if @pmu's filter rejects @cpu or once every event was visited;
 * otherwise the first non-zero value returned by @func, which ends the walk.
 */
static noinline int visit_groups_merge(struct perf_event_context *ctx,
                                struct perf_event_groups *groups, int cpu,
                                struct pmu *pmu,
                                int (*func)(struct perf_event *, void *),
                                void *data)
{
#ifdef CONFIG_CGROUP_PERF
        struct cgroup_subsys_state *css = NULL;
#endif
        struct perf_cpu_context *cpuctx = NULL;
        /* Space for per CPU and/or any CPU event iterators. */
        struct perf_event *itrs[2];
        struct perf_event_min_heap event_heap;
        struct perf_event **evt;
        int ret;

        /* A non-zero return from the PMU's filter means: skip this CPU. */
        if (pmu->filter && pmu->filter(pmu, cpu))
                return 0;

        if (!ctx->task) {
                /* CPU context: use the heap storage owned by the cpuctx. */
                cpuctx = this_cpu_ptr(&perf_cpu_context);
                event_heap = (struct perf_event_min_heap){
                        .data = cpuctx->heap,
                        .nr = 0,
                        .size = cpuctx->heap_size,
                };

                lockdep_assert_held(&cpuctx->ctx.lock);

#ifdef CONFIG_CGROUP_PERF
                if (cpuctx->cgrp)
                        css = &cpuctx->cgrp->css;
#endif
        } else {
                /* Task context: at most two iterators (any-CPU and @cpu). */
                event_heap = (struct perf_event_min_heap){
                        .data = itrs,
                        .nr = 0,
                        .size = ARRAY_SIZE(itrs),
                };
                /* Events not within a CPU context may be on any CPU. */
                __heap_add(&event_heap, perf_event_groups_first(groups, -1, pmu, NULL));
        }
        /* evt aliases slot 0 of the heap storage, i.e. the heap top. */
        evt = event_heap.data;

        __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, NULL));

#ifdef CONFIG_CGROUP_PERF
        /* css stays NULL for task contexts, so this only runs for CPU contexts. */
        for (; css; css = css->parent)
                __heap_add(&event_heap, perf_event_groups_first(groups, cpu, pmu, css->cgroup));
#endif

        if (event_heap.nr) {
                __link_epc((*evt)->pmu_ctx);
                perf_assert_pmu_disabled((*evt)->pmu_ctx->pmu);
        }

        /* The iterators were appended unordered; establish the heap property. */
        min_heapify_all_inline(&event_heap, &perf_min_heap, NULL);

        while (event_heap.nr) {
                ret = func(*evt, data);
                if (ret)
                        return ret;

                /*
                 * Advance the iterator at the top in place, then either
                 * re-sort it or, when its sub-list is exhausted, drop it.
                 */
                *evt = perf_event_groups_next(*evt, pmu);
                if (*evt)
                        min_heap_sift_down_inline(&event_heap, 0, &perf_min_heap, NULL);
                else
                        min_heap_pop_inline(&event_heap, &perf_min_heap, NULL);
        }

        return 0;
}

/*
 * The userpage is strictly per-event: there is no concept of context, so
 * there cannot be a context indirection. Every userpage therefore has to be
 * updated when context time starts :-(
 *
 * IOW, we must not miss EVENT_TIME edges.
 *
 * Returns false without touching anything when mmap_count is zero, otherwise
 * refreshes the event's time and userpage and returns true.
 */
static inline bool event_update_userpage(struct perf_event *event)
{
        if (unlikely(atomic_read(&event->mmap_count))) {
                perf_event_update_time(event);
                perf_event_update_userpage(event);
                return true;
        }

        return false;
}

/*
 * Refresh the userpage of @group_event and, only if that update happened,
 * of each of its siblings as well.
 */
static inline void group_update_userpage(struct perf_event *group_event)
{
        struct perf_event *sibling;

        if (event_update_userpage(group_event)) {
                for_each_sibling_event(sibling, group_event)
                        event_update_userpage(sibling);
        }
}

/*
 * visit_groups_merge() callback used by pmu_groups_sched_in(): try to
 * schedule in @event's group. @data points at the caller's can_add_hw flag.
 *
 * Always returns 0, so the walk over the groups is never cut short.
 */
static int merge_sched_in(struct perf_event *event, void *data)
{
        struct perf_event_context *ctx = event->ctx;
        int *can_add_hw = data;

        if (event->state <= PERF_EVENT_STATE_OFF || !event_filter_match(event))
                return 0;

        if (group_can_go_on(event, *can_add_hw) && !group_sched_in(event, ctx))
                list_add_tail(&event->active_list, get_event_list(event));

        /* Only an event that is still INACTIVE needs the follow-up below. */
        if (event->state != PERF_EVENT_STATE_INACTIVE)
                return 0;

        *can_add_hw = 0;

        if (!event->attr.pinned) {
                struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu);

                /* Flag the PMU context for rotation and restart the mux timer. */
                event->pmu_ctx->rotate_necessary = 1;
                perf_mux_hrtimer_restart(cpc);
                group_update_userpage(event);
                return 0;
        }

        /* A pinned event that is still INACTIVE is put into ERROR state. */
        perf_cgroup_event_disable(event, ctx);
        perf_event_set_state(event, PERF_EVENT_STATE_ERROR);

        if (*perf_event_fasync(event))
                event->pending_kill = POLL_ERR;

        perf_event_wakeup(event);

        return 0;
}

/*
 * Run merge_sched_in() over the events of @groups that belong to @pmu and
 * match the current CPU. The walk starts with can_add_hw set.
 */
static void pmu_groups_sched_in(struct perf_event_context *ctx,
                                struct perf_event_groups *groups,
                                struct pmu *pmu)
{
        int can_add_hw = 1;
        int this_cpu = smp_processor_id();

        visit_groups_merge(ctx, groups, this_cpu, pmu, merge_sched_in, &can_add_hw);
}

/*
 * Schedule in the groups of @pmu_ctx's PMU selected by @event_type: pinned
 * groups first when EVENT_PINNED is set, then flexible groups when
 * EVENT_FLEXIBLE is set.
 */
static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx,
                               enum event_type_t event_type)
{
        struct perf_event_context *ctx = pmu_ctx->ctx;
        struct pmu *pmu = pmu_ctx->pmu;

        if (event_type & EVENT_PINNED)
                pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu);

        if (event_type & EVENT_FLEXIBLE)
                pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu);
}

/*
 * Schedule in the events of @ctx selected by @event_type.
 *
 * EVENT_CGROUP is stripped from @event_type and handed, together with @pmu,
 * to for_each_epc(), which picks the PMU contexts to visit. Only event types
 * that were not active before this call are scheduled in; pinned groups go
 * before flexible ones.
 *
 * The caller must hold ctx->lock. Nothing is done for a context without
 * events.
 */
static void
ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *pmu_ctx;
        /* Snapshot of the active bits before this call changes them. */
        int is_active = ctx->is_active;
        bool cgroup = event_type & EVENT_CGROUP;

        event_type &= ~EVENT_CGROUP;

        lockdep_assert_held(&ctx->lock);

        if (likely(!ctx->nr_events))
                return;

        if (!(is_active & EVENT_TIME)) {
                /* start ctx time */
                __update_context_time(ctx, false);
                perf_cgroup_set_timestamp(cpuctx);
                /*
                 * CPU-release for the below ->is_active store,
                 * see __load_acquire() in perf_event_time_now()
                 */
                barrier();
        }

        ctx->is_active |= (event_type | EVENT_TIME);
        if (ctx->task) {
                /*
                 * First activation of a task context installs it as this
                 * CPU's task_ctx; otherwise it must already be installed.
                 */
                if (!(is_active & EVENT_ALL))
                        cpuctx->task_ctx = ctx;
                else
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
        }

        is_active ^= ctx->is_active; /* changed bits */

        /*
         * First go through the list and put on any pinned groups
         * in order to give them the best chance of going on.
         */
        if (is_active & EVENT_PINNED) {
                for_each_epc(pmu_ctx, ctx, pmu, cgroup)
                        __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED);
        }

        /* Then walk through the lower prio flexible groups */
        if (is_active & EVENT_FLEXIBLE) {
                for_each_epc(pmu_ctx, ctx, pmu, cgroup)
                        __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE);
        }
}

/*
 * Schedule in the perf event context of @task on this CPU, if it has one.
 *
 * When that context is already installed as cpuctx->task_ctx, only the
 * sched_task callbacks are run. Otherwise the context's events are scheduled
 * in, after first scheduling out the CPU context's flexible events if the
 * task context carries pinned events.
 */
static void perf_event_context_sched_in(struct task_struct *task)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx;

        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp);
        if (!ctx)
                goto rcu_unlock;

        /* Context is already the active task context: callbacks only. */
        if (cpuctx->task_ctx == ctx) {
                perf_ctx_lock(cpuctx, ctx);
                perf_ctx_disable(ctx, false);

                perf_ctx_sched_task_cb(ctx, task, true);

                perf_ctx_enable(ctx, false);
                perf_ctx_unlock(cpuctx, ctx);
                goto rcu_unlock;
        }

        perf_ctx_lock(cpuctx, ctx);
        /*
         * We must check ctx->nr_events while holding ctx->lock, such
         * that we serialize against perf_install_in_context().
         */
        if (!ctx->nr_events)
                goto unlock;

        perf_ctx_disable(ctx, false);
        /*
         * We want to keep the following priority order:
         * cpu pinned (that don't need to move), task pinned,
         * cpu flexible, task flexible.
         *
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) {
                perf_ctx_disable(&cpuctx->ctx, false);
                ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE);
        }

        perf_event_sched_in(cpuctx, ctx, NULL);

        perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true);

        /* Pairs with the conditional perf_ctx_disable(&cpuctx->ctx) above. */
        if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                perf_ctx_enable(&cpuctx->ctx, false);

        perf_ctx_enable(ctx, false);

unlock:
        perf_ctx_unlock(cpuctx, ctx);
rcu_unlock:
        rcu_read_unlock();
}

/*
 * Scheduler hook that adds the events of the current task; runs with
 * interrupts disabled.
 *
 * The event value is restored and the event is then enabled.
 *
 * NMIs are not excluded here. enable() sets the enabled bit in the control
 * field of the event _before_ it accesses the event control register, so an
 * NMI that hits in between will keep the event running.
 */
void __perf_event_task_sched_in(struct task_struct *prev,
                                struct task_struct *task)
{
        perf_event_context_sched_in(task);

        /* perf_event_switch() is only needed while switch events exist. */
        if (atomic_read(&nr_switch_events) != 0)
                perf_event_switch(task, prev, true);

        /* PMU sched_task callbacks are only run while this CPU has users. */
        if (__this_cpu_read(perf_sched_cb_usages) != 0)
                perf_pmu_sched_task(prev, task, true);
}

/*
 * Compute the sample period that matches event->attr.sample_freq, given that
 * @count events were observed in @nsec nanoseconds.
 *
 * Both products in the formula below can exceed 64 bits, so low-order bits
 * are shifted out of the operands (tracked through their fls values) until a
 * plain u64/u64 division is possible.
 *
 * Returns the quotient, or the dividend itself when the divisor is zero.
 */
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
{
        u64 frequency = event->attr.sample_freq;
        u64 sec = NSEC_PER_SEC;
        u64 divisor, dividend;

        /* Position of the highest set bit of each operand (0 if the value is 0). */
        int count_fls, nsec_fls, frequency_fls, sec_fls;

        count_fls = fls64(count);
        nsec_fls = fls64(nsec);
        frequency_fls = fls64(frequency);
        /* 10^9 needs 30 bits (2^29 < 10^9 < 2^30). */
        sec_fls = 30;

        /*
         * We got @count in @nsec, with a target of sample_freq HZ
         * the target period becomes:
         *
         *             @count * 10^9
         * period = -------------------
         *          @nsec * sample_freq
         *
         */

        /*
         * Reduce accuracy by one bit such that @a and @b converge
         * to a similar magnitude.
         */
#define REDUCE_FLS(a, b)                \
do {                                        \
        if (a##_fls > b##_fls) {        \
                a >>= 1;                \
                a##_fls--;                \
        } else {                        \
                b >>= 1;                \
                b##_fls--;                \
        }                                \
} while (0)

        /*
         * Reduce accuracy until either term fits in a u64, then proceed with
         * the other, so that finally we can do a u64/u64 division.
         */
        while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
                REDUCE_FLS(nsec, frequency);
                REDUCE_FLS(sec, count);
        }

        if (count_fls + sec_fls > 64) {
                /* The divisor already fits; keep shrinking the dividend's terms. */
                divisor = nsec * frequency;

                while (count_fls + sec_fls > 64) {
                        REDUCE_FLS(count, sec);
                        /* Drop the same bit from the other side of the fraction. */
                        divisor >>= 1;
                }

                dividend = count * sec;
        } else {
                /* The dividend already fits; keep shrinking the divisor's terms. */
                dividend = count * sec;

                while (nsec_fls + frequency_fls > 64) {
                        REDUCE_FLS(nsec, frequency);
                        /* Drop the same bit from the other side of the fraction. */
                        dividend >>= 1;
                }

                divisor = nsec * frequency;
        }

        /* Avoid dividing by zero. */
        if (!divisor)
                return dividend;

        return div64_u64(dividend, divisor);
}

/*
 * Per-CPU throttle count; perf_event_task_tick() reads and resets it to zero
 * on every tick and unthrottles events if it was non-zero.
 */
static DEFINE_PER_CPU(int, perf_throttled_count);
/* Per-CPU sequence number, incremented once per perf_event_task_tick(). */
static DEFINE_PER_CPU(u64, perf_throttled_seq);

/*
 * Move hwc->sample_period of @event one step towards the period that
 * perf_calculate_period() derives from @count events in @nsec nanoseconds.
 *
 * If the remaining period is more than eight times the new sample period it
 * is reset to zero; with @disable set the event is stopped and restarted
 * around that reset.
 */
static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 target, new_period;
        s64 step;

        target = perf_calculate_period(event, nsec, count);

        /* Take 1/8 of the difference, rounded away from zero. */
        step = (s64)(target - hwc->sample_period);
        step += (step >= 0) ? 7 : -7;
        step /= 8; /* low pass filter */

        new_period = hwc->sample_period + step;

        /* Never let the period become zero. */
        if (!new_period)
                new_period = 1;

        hwc->sample_period = new_period;

        if (local64_read(&hwc->period_left) <= 8*new_period)
                return;

        if (disable)
                event->pmu->stop(event, PERF_EF_UPDATE);

        local64_set(&hwc->period_left, 0);

        if (disable)
                event->pmu->start(event, PERF_EF_RELOAD);
}

/*
 * Walk the active events on @event_list. Events that hit MAX_INTERRUPTS are
 * unthrottled, and events in frequency mode get their sample period
 * re-evaluated over one tick (TICK_NSEC).
 */
static void perf_adjust_freq_unthr_events(struct list_head *event_list)
{
        const u64 period = TICK_NSEC;
        struct perf_event *event;

        list_for_each_entry(event, event_list, active_list) {
                struct hw_perf_event *hwc;
                bool freq_mode;
                u64 now;
                s64 delta;

                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;

                // XXX use visit thingy to avoid the -1,cpu match
                if (!event_filter_match(event))
                        continue;

                hwc = &event->hw;
                freq_mode = event->attr.freq && event->attr.sample_freq;

                if (hwc->interrupts == MAX_INTERRUPTS) {
                        hwc->interrupts = 0;
                        perf_log_throttle(event, 1);
                        /* Frequency-mode events are restarted further down. */
                        if (!freq_mode)
                                event->pmu->start(event, 0);
                }

                if (!freq_mode)
                        continue;

                /*
                 * stop the event and update event->count
                 */
                event->pmu->stop(event, PERF_EF_UPDATE);

                now = local64_read(&event->count);
                delta = now - hwc->freq_count_stamp;
                hwc->freq_count_stamp = now;

                /*
                 * Restart the event, reloading only if the count changed.
                 * The event is already stopped here, so tell
                 * perf_adjust_period() not to stop it a second time.
                 */
                if (delta > 0) {
                        perf_adjust_period(event, period, delta, false);
                        event->pmu->start(event, PERF_EF_RELOAD);
                } else {
                        event->pmu->start(event, 0);
                }
        }
}

/*
 * Frequency adjustment and unthrottling share one pass over the events so
 * that they are not walked twice. Having freq events must not change the
 * rate of unthrottling, as that would introduce bias.
 */
static void
perf_adjust_freq_unthr_context(struct perf_event_context *ctx, bool unthrottle)
{
        struct perf_event_pmu_context *pmu_ctx;

        /*
         * The events only need to be walked when:
         * - the context has events in frequency mode (needs freq adjust), or
         * - there are events to unthrottle on this cpu
         */
        if (!ctx->nr_freq && !unthrottle)
                return;

        raw_spin_lock(&ctx->lock);

        list_for_each_entry(pmu_ctx, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                struct pmu *pmu;

                /* Skip PMU contexts with nothing to adjust or that cannot be. */
                if ((!pmu_ctx->nr_freq && !unthrottle) ||
                    !perf_pmu_ctx_is_active(pmu_ctx) ||
                    (pmu_ctx->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT))
                        continue;

                pmu = pmu_ctx->pmu;

                perf_pmu_disable(pmu);
                perf_adjust_freq_unthr_events(&pmu_ctx->pinned_active);
                perf_adjust_freq_unthr_events(&pmu_ctx->flexible_active);
                perf_pmu_enable(pmu);
        }

        raw_spin_unlock(&ctx->lock);
}

/*
 * Move @event to the tail of the @ctx's eligible events.
 */
static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
{
        /* Rotation might be disabled by the inheritance code. */
        if (!ctx->rotate_disable) {
                /*
                 * Deleting and re-inserting rotates the entry to the last
                 * position of the non-pinned groups.
                 */
                perf_event_groups_delete(&ctx->flexible_groups, event);
                perf_event_groups_insert(&ctx->flexible_groups, event);
        }
}

/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
ctx_event_to_rotate(struct perf_event_pmu_context *pmu_ctx)
{
        struct perf_event *event;
        struct rb_node *node;
        struct rb_root *tree;
        struct __group_key key = {
                .pmu = pmu_ctx->pmu,
        };

        /* pick the first active flexible event */
        event = list_first_entry_or_null(&pmu_ctx->flexible_active,
                                         struct perf_event, active_list);
        if (event)
                goto out;

        /* if no active flexible event, pick the first event */
        tree = &pmu_ctx->ctx->flexible_groups.tree;

        if (!pmu_ctx->ctx->task) {
                key.cpu = smp_processor_id();

                node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
                if (node)
                        event = __node_2_pe(node);
                goto out;
        }

        key.cpu = -1;
        node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
        if (node) {
                event = __node_2_pe(node);
                goto out;
        }

        key.cpu = smp_processor_id();
        node = rb_find_first(&key, tree, __group_cmp_ignore_cgroup);
        if (node)
                event = __node_2_pe(node);

out:
        /*
         * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
         * finds there are unschedulable events, it will set it again.
         */
        pmu_ctx->rotate_necessary = 0;

        return event;
}

/*
 * Rotate the flexible events of @cpc's CPU PMU context and of the task PMU
 * context linked to it (cpc->task_epc, may be NULL), for whichever of the two
 * has rotate_necessary set.
 *
 * Returns false when neither context needed rotation (nothing is locked or
 * touched in that case), true otherwise.
 */
static bool perf_rotate_context(struct perf_cpu_pmu_context *cpc)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_pmu_context *cpu_epc, *task_epc = NULL;
        struct perf_event *cpu_event = NULL, *task_event = NULL;
        int cpu_rotate, task_rotate;
        struct pmu *pmu;

        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */

        cpu_epc = &cpc->epc;
        pmu = cpu_epc->pmu;
        task_epc = cpc->task_epc;

        cpu_rotate = cpu_epc->rotate_necessary;
        task_rotate = task_epc ? task_epc->rotate_necessary : 0;

        if (!(cpu_rotate || task_rotate))
                return false;

        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(pmu);

        /* Picking an event also clears the context's rotate_necessary flag. */
        if (task_rotate)
                task_event = ctx_event_to_rotate(task_epc);
        if (cpu_rotate)
                cpu_event = ctx_event_to_rotate(cpu_epc);

        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
        if (task_event || (task_epc && cpu_event)) {
                update_context_time(task_epc->ctx);
                __pmu_ctx_sched_out(task_epc, EVENT_FLEXIBLE);
        }

        if (cpu_event) {
                update_context_time(&cpuctx->ctx);
                __pmu_ctx_sched_out(cpu_epc, EVENT_FLEXIBLE);
                rotate_ctx(&cpuctx->ctx, cpu_event);
                __pmu_ctx_sched_in(cpu_epc, EVENT_FLEXIBLE);
        }

        if (task_event)
                rotate_ctx(task_epc->ctx, task_event);

        /* Same condition as the task sched-out above: put task flexible back. */
        if (task_event || (task_epc && cpu_event))
                __pmu_ctx_sched_in(task_epc, EVENT_FLEXIBLE);

        perf_pmu_enable(pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);

        return true;
}

void perf_event_task_tick(void)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx;
        int throttled;

        lockdep_assert_irqs_disabled();

        __this_cpu_inc(perf_throttled_seq);
        throttled = __this_cpu_xchg(perf_throttled_count, 0);
        tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);

        perf_adjust_freq_unthr_context(&cpuctx->ctx, !!throttled);

        rcu_read_lock();
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_adjust_freq_unthr_context(ctx, !!throttled);
        rcu_read_unlock();
}

/*
 * Consume the enable_on_exec attribute of @event. The attribute is cleared
 * in any case; the event is moved to INACTIVE only if its state was below
 * that.
 *
 * Returns 1 if the state was changed, 0 otherwise. @ctx is not used.
 */
static int event_enable_on_exec(struct perf_event *event,
                                struct perf_event_context *ctx)
{
        int enabled = 0;

        if (event->attr.enable_on_exec) {
                event->attr.enable_on_exec = 0;

                if (event->state < PERF_EVENT_STATE_INACTIVE) {
                        perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
                        enabled = 1;
                }
        }

        return enabled;
}

/*
 * Enable all of a task's events that have been marked enable-on-exec.
 * This expects task == current.
 *
 * Runs with interrupts disabled and the context locked. If at least one
 * event was enabled, the context is uncloned and rescheduled; the reference
 * on the former parent context is dropped after everything is unlocked.
 */
static void perf_event_enable_on_exec(struct perf_event_context *ctx)
{
        struct perf_event_context *clone_ctx = NULL;
        enum event_type_t event_type = 0;
        struct perf_cpu_context *cpuctx;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;

        local_irq_save(flags);
        /* @ctx must be the context of the running task. */
        if (WARN_ON_ONCE(current->perf_event_ctxp != ctx))
                goto out;

        if (!ctx->nr_events)
                goto out;

        cpuctx = this_cpu_ptr(&perf_cpu_context);
        perf_ctx_lock(cpuctx, ctx);
        ctx_time_freeze(cpuctx, ctx);

        /*
         * Collect the type bits of every event on the list, not only of the
         * ones that get enabled.
         */
        list_for_each_entry(event, &ctx->event_list, event_entry) {
                enabled |= event_enable_on_exec(event, ctx);
                event_type |= get_event_type(event);
        }

        /*
         * Unclone and reschedule this context if we enabled any event.
         */
        if (enabled) {
                clone_ctx = unclone_ctx(ctx);
                ctx_resched(cpuctx, ctx, NULL, event_type);
        }
        perf_ctx_unlock(cpuctx, ctx);

out:
        local_irq_restore(flags);

        /* Drop the reference only after locks and IRQ state are released. */
        if (clone_ctx)
                put_ctx(clone_ctx);
}

static void perf_remove_from_owner(struct perf_event *event);
static void perf_event_exit_event(struct perf_event *event,
                                  struct perf_event_context *ctx);

/*
 * Removes all events from the current task that have been marked
 * remove-on-exec, and feeds their values back to parent events.
 *
 * Runs under ctx->mutex. If any event was removed, the context is uncloned
 * under ctx->lock; the reference on the former parent context is dropped
 * after the mutex is released.
 */
static void perf_event_remove_on_exec(struct perf_event_context *ctx)
{
        struct perf_event_context *clone_ctx = NULL;
        struct perf_event *event, *next;
        unsigned long flags;
        bool modified = false;

        mutex_lock(&ctx->mutex);

        /* @ctx must belong to the running task. */
        if (WARN_ON_ONCE(ctx->task != current))
                goto unlock;

        /* _safe iteration: perf_event_exit_event() is called on the entry. */
        list_for_each_entry_safe(event, next, &ctx->event_list, event_entry) {
                if (!event->attr.remove_on_exec)
                        continue;

                if (!is_kernel_event(event))
                        perf_remove_from_owner(event);

                modified = true;

                perf_event_exit_event(event, ctx);
        }

        raw_spin_lock_irqsave(&ctx->lock, flags);
        if (modified)
                clone_ctx = unclone_ctx(ctx);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);

unlock:
        mutex_unlock(&ctx->mutex);

        if (clone_ctx)
                put_ctx(clone_ctx);
}

/* Argument block handed to __perf_event_read() through the cross-CPU call. */
struct perf_read_data {
        /* Event to read. */
        struct perf_event *event;
        /* Also read the event's siblings, inside a PMU read transaction. */
        bool group;
        /* Result: 0 for a single read, pmu->commit_txn() value for a group read. */
        int ret;
};

static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu);

/*
 * Choose the CPU from which @event should be read.
 *
 * @event_cpu is returned unchanged when it is not a valid CPU number. It is
 * replaced by the local CPU when the event's capabilities allow a local read:
 * either the local CPU is in the scope cpumask of @event_cpu
 * (PERF_EV_CAP_READ_SCOPE), or both CPUs share a physical package
 * (PERF_EV_CAP_READ_ACTIVE_PKG).
 */
static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
{
        const int this_cpu = smp_processor_id();

        if ((unsigned)event_cpu >= nr_cpu_ids)
                return event_cpu;

        if (event->group_caps & PERF_EV_CAP_READ_SCOPE) {
                const struct cpumask *scope_mask;

                scope_mask = perf_scope_cpu_topology_cpumask(event->pmu->scope, event_cpu);
                if (scope_mask && cpumask_test_cpu(this_cpu, scope_mask))
                        return this_cpu;
        }

        if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
                u16 event_pkg = topology_physical_package_id(event_cpu);
                u16 local_pkg = topology_physical_package_id(this_cpu);

                if (event_pkg == local_pkg)
                        return this_cpu;
        }

        return event_cpu;
}

/*
 * Cross CPU call to read the hardware event
 *
 * @info is a struct perf_read_data. The event's times are always refreshed
 * under ctx->lock. The counter itself is read only while the event is ACTIVE,
 * and data->ret is written only in that case.
 */
static void __perf_event_read(void *info)
{
        struct perf_read_data *data = info;
        struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct pmu *pmu = event->pmu;

        /*
         * If this is a task context, we need to check whether it is
         * the current task context of this cpu.  If not it has been
         * scheduled out before the smp call arrived.  In that case
         * event->count would have been updated to a recent sample
         * when the event was scheduled out.
         */
        if (ctx->task && cpuctx->task_ctx != ctx)
                return;

        raw_spin_lock(&ctx->lock);
        ctx_time_update_event(ctx, event);

        perf_event_update_time(event);
        if (data->group)
                perf_event_update_sibling_time(event);

        /* Not on the PMU: only the times above needed updating. */
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                goto unlock;

        /* Single event: plain read, which is reported as success. */
        if (!data->group) {
                pmu->read(event);
                data->ret = 0;
                goto unlock;
        }

        /* Group: read the leader and all siblings within one read transaction. */
        pmu->start_txn(pmu, PERF_PMU_TXN_READ);

        pmu->read(event);

        for_each_sibling_event(sub, event)
                perf_pmu_read(sub);

        data->ret = pmu->commit_txn(pmu);

unlock:
        raw_spin_unlock(&ctx->lock);
}

/*
 * Return the count of @event. With @self set only the event's own count is
 * returned; otherwise child_count is added to it.
 */
static inline u64 perf_event_count(struct perf_event *event, bool self)
{
        u64 total = local64_read(&event->count);

        if (!self)
                total += atomic64_read(&event->child_count);

        return total;
}

/*
 * Sample the perf clock into *@now and derive the enabled and running times
 * of @event for that instant into *@enabled and *@running.
 */
static void calc_timer_values(struct perf_event *event,
                                u64 *now,
                                u64 *enabled,
                                u64 *running)
{
        u64 timestamp = perf_clock();
        u64 ctx_time;

        *now = timestamp;
        ctx_time = perf_event_time_now(event, timestamp);
        __perf_update_times(event, ctx_time, enabled, running);
}

/*
 * NMI-safe method to read a local event, that is an event that
 * is:
 *   - either for the current task, or for this CPU
 *   - does not have inherit set, for inherited task events
 *     will not be local and we cannot read them atomically
 *   - must not have a pmu::count method
 *
 * @value receives the event count. @enabled and @running are optional (may
 * be NULL) and receive the event's enabled and running times.
 *
 * Returns 0 on success, -EOPNOTSUPP for an event with inherit set, -EINVAL
 * for a task event that is not for current or a CPU event that is not for
 * this CPU, and -EBUSY for a pinned event that is not running on this CPU.
 * On error none of the output parameters are written.
 */
int perf_event_read_local(struct perf_event *event, u64 *value,
                          u64 *enabled, u64 *running)
{
        unsigned long flags;
        int event_oncpu;
        int event_cpu;
        int ret = 0;

        /*
         * Disabling interrupts avoids all counter scheduling (context
         * switches, timer based rotation and IPIs).
         */
        local_irq_save(flags);

        /*
         * It must not be an event with inherit set, we cannot read
         * all child counters from atomic context.
         */
        if (event->attr.inherit) {
                ret = -EOPNOTSUPP;
                goto out;
        }

        /* If this is a per-task event, it must be for current */
        if ((event->attach_state & PERF_ATTACH_TASK) &&
            event->hw.target != current) {
                ret = -EINVAL;
                goto out;
        }

        /*
         * Get the event CPU numbers, and adjust them to local if the event is
         * a per-package event that can be read locally
         */
        event_oncpu = __perf_event_read_cpu(event, event->oncpu);
        event_cpu = __perf_event_read_cpu(event, event->cpu);

        /* If this is a per-CPU event, it must be for this CPU */
        if (!(event->attach_state & PERF_ATTACH_TASK) &&
            event_cpu != smp_processor_id()) {
                ret = -EINVAL;
                goto out;
        }

        /* If this is a pinned event it must be running on this CPU */
        if (event->attr.pinned && event_oncpu != smp_processor_id()) {
                ret = -EBUSY;
                goto out;
        }

        /*
         * If the event is currently on this CPU, it's either a per-task event,
         * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
         * oncpu == -1).
         */
        if (event_oncpu == smp_processor_id())
                event->pmu->read(event);

        *value = local64_read(&event->count);
        /* The timer values are only computed when the caller wants one of them. */
        if (enabled || running) {
                u64 __enabled, __running, __now;

                calc_timer_values(event, &__now, &__enabled, &__running);
                if (enabled)
                        *enabled = __enabled;
                if (running)
                        *running = __running;
        }
out:
        local_irq_restore(flags);

        return ret;
}

/*
 * Bring @event's count and time fields up to date.
 *
 * @event: event to refresh
 * @group: forwarded to __perf_event_read() for an ACTIVE event; for an
 *         INACTIVE event it additionally selects updating the sibling times
 *
 * An ACTIVE event is refreshed by a cross-call of __perf_event_read() on the
 * CPU derived from event->oncpu.  An INACTIVE event only has its times
 * updated, under ctx->lock.  Events in any other state are left untouched.
 *
 * Returns the ->ret value filled in by the cross-call for an ACTIVE event,
 * 0 otherwise.
 */
static int perf_event_read(struct perf_event *event, bool group)
{
        enum perf_event_state state = READ_ONCE(event->state);
        int event_cpu, ret = 0;

        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
again:
        if (state == PERF_EVENT_STATE_ACTIVE) {
                struct perf_read_data data;

                /*
                 * Orders the ->state and ->oncpu loads such that if we see
                 * ACTIVE we must also see the right ->oncpu.
                 *
                 * Matches the smp_wmb() from event_sched_in().
                 */
                smp_rmb();

                event_cpu = READ_ONCE(event->oncpu);
                /* The unsigned cast also rejects negative values such as -1. */
                if ((unsigned)event_cpu >= nr_cpu_ids)
                        return 0;

                data = (struct perf_read_data){
                        .event = event,
                        .group = group,
                        .ret = 0,
                };

                /* Preemption stays off from CPU selection until the call is done. */
                preempt_disable();
                event_cpu = __perf_event_read_cpu(event, event_cpu);

                /*
                 * Purposely ignore the smp_call_function_single() return
                 * value.
                 *
                 * If event_cpu isn't a valid CPU it means the event got
                 * scheduled out and that will have updated the event count.
                 *
                 * Therefore, either way, we'll have an up-to-date event count
                 * after this.
                 */
                (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
                preempt_enable();
                ret = data.ret;

        } else if (state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;

                raw_spin_lock_irqsave(&ctx->lock, flags);
                /*
                 * Re-check the state under ctx->lock; if it changed since the
                 * unlocked read, start over with the fresh value.
                 */
                state = event->state;
                if (state != PERF_EVENT_STATE_INACTIVE) {
                        raw_spin_unlock_irqrestore(&ctx->lock, flags);
                        goto again;
                }

                /*
                 * May read while context is not active (e.g., thread is
                 * blocked), in that case we cannot update context time
                 */
                ctx_time_update_event(ctx, event);

                perf_event_update_time(event);
                if (group)
                        perf_event_update_sibling_time(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }

        return ret;
}

/*
 * Initialize a perf_event context: its two locks, its list heads, its
 * pinned/flexible event groups, and an initial reference count of one.
 */
static void __perf_event_init_context(struct perf_event_context *ctx)
{
        /* Lifetime: start with a single reference. */
        refcount_set(&ctx->refcount, 1);

        /* Locks. */
        mutex_init(&ctx->mutex);
        raw_spin_lock_init(&ctx->lock);

        /* Containers. */
        INIT_LIST_HEAD(&ctx->event_list);
        INIT_LIST_HEAD(&ctx->pmu_ctx_list);
        perf_event_groups_init(&ctx->flexible_groups);
        perf_event_groups_init(&ctx->pinned_groups);
}

/*
 * Initialize a PMU context for @pmu: bind the pmu pointer, empty all three
 * list heads and set the reference count to one.
 */
static void
__perf_init_event_pmu_context(struct perf_event_pmu_context *epc, struct pmu *pmu)
{
        atomic_set(&epc->refcount, 1);

        INIT_LIST_HEAD(&epc->flexible_active);
        INIT_LIST_HEAD(&epc->pinned_active);
        INIT_LIST_HEAD(&epc->pmu_ctx_entry);

        epc->pmu = pmu;
}

/*
 * Allocate and initialize a zeroed perf_event_context.
 *
 * When @task is non-NULL a task reference is taken and stored in ctx->task.
 * Returns the new context, or NULL when the allocation fails.
 */
static struct perf_event_context *
alloc_perf_context(struct task_struct *task)
{
        struct perf_event_context *new_ctx;

        new_ctx = kzalloc(sizeof(*new_ctx), GFP_KERNEL);
        if (new_ctx) {
                __perf_event_init_context(new_ctx);
                if (task)
                        new_ctx->task = get_task_struct(task);
        }

        return new_ctx;
}

/*
 * Look up the task for @vpid (0 means current) and take a reference on it.
 *
 * The lookup and the reference grab both happen inside one RCU read-side
 * section.  Returns the referenced task, or ERR_PTR(-ESRCH) when no task
 * was found.
 */
static struct task_struct *
find_lively_task_by_vpid(pid_t vpid)
{
        struct task_struct *found;

        rcu_read_lock();
        found = vpid ? find_task_by_vpid(vpid) : current;
        if (found)
                get_task_struct(found);
        rcu_read_unlock();

        if (found)
                return found;

        return ERR_PTR(-ESRCH);
}

/*
 * Returns a matching context with refcount and pincount.
 *
 * @task:  target task, or NULL for a CPU-wide event
 * @event: event being attached; only event->cpu is read here, to pick the
 *         per-CPU context when @task is NULL
 *
 * With a NULL @task the per-CPU context is returned after perf_allow_cpu()
 * succeeds.  Otherwise the task's existing context is pinned, or a new one
 * is allocated and published in task->perf_event_ctxp under
 * task->perf_event_mutex; losing the publish race (-EAGAIN) restarts the
 * lookup.
 *
 * Returns the context or an ERR_PTR(): the perf_allow_cpu() error, -ENOMEM
 * when the allocation fails, or -ESRCH when the task has PF_EXITING set.
 */
static struct perf_event_context *
find_get_context(struct task_struct *task, struct perf_event *event)
{
        struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_cpu_context *cpuctx;
        unsigned long flags;
        int err;

        if (!task) {
                /* Must be root to operate on a CPU event: */
                err = perf_allow_cpu();
                if (err)
                        return ERR_PTR(err);

                cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
                /* pin_count is modified under ctx->lock. */
                raw_spin_lock_irqsave(&ctx->lock, flags);
                ++ctx->pin_count;
                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                return ctx;
        }

        /*
         * NOTE(review): this initial value looks unused; every path that
         * reaches errout assigns err first.
         */
        err = -EINVAL;
retry:
        ctx = perf_lock_task_context(task, &flags);
        if (ctx) {
                /* Existing context: it comes back with ctx->lock held. */
                clone_ctx = unclone_ctx(ctx);
                ++ctx->pin_count;

                raw_spin_unlock_irqrestore(&ctx->lock, flags);

                /* Drop the reference returned by unclone_ctx() outside the lock. */
                if (clone_ctx)
                        put_ctx(clone_ctx);
        } else {
                ctx = alloc_perf_context(task);
                err = -ENOMEM;
                if (!ctx)
                        goto errout;

                err = 0;
                mutex_lock(&task->perf_event_mutex);
                /*
                 * If it has already passed perf_event_exit_task().
                 * we must see PF_EXITING, it takes this mutex too.
                 */
                if (task->flags & PF_EXITING)
                        err = -ESRCH;
                else if (task->perf_event_ctxp)
                        err = -EAGAIN;
                else {
                        /* Extra reference and pin for the caller, then publish. */
                        get_ctx(ctx);
                        ++ctx->pin_count;
                        rcu_assign_pointer(task->perf_event_ctxp, ctx);
                }
                mutex_unlock(&task->perf_event_mutex);

                if (unlikely(err)) {
                        /* Our freshly allocated context was not installed. */
                        put_ctx(ctx);

                        if (err == -EAGAIN)
                                goto retry;
                        goto errout;
                }
        }

        return ctx;

errout:
        return ERR_PTR(err);
}

/*
 * Find or create the PMU context for @pmu within @ctx and take a reference.
 *
 * @pmu:   PMU the event belongs to
 * @ctx:   event context to search
 * @event: only event->cpu is read, to select the per-CPU PMU context when
 *         @ctx is a CPU context (ctx->task == NULL)
 *
 * CPU contexts use the PMU context embedded in the per-CPU
 * perf_cpu_pmu_context and never fail.  Task contexts search
 * ctx->pmu_ctx_list; if no entry matches, a newly allocated one is inserted.
 *
 * Returns the PMU context, or ERR_PTR(-ENOMEM) when the allocation for a
 * task context fails.
 */
static struct perf_event_pmu_context *
find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
                     struct perf_event *event)
{
        struct perf_event_pmu_context *new = NULL, *pos = NULL, *epc;

        if (!ctx->task) {
                /*
                 * perf_pmu_migrate_context() / __perf_pmu_install_event()
                 * relies on the fact that find_get_pmu_context() cannot fail
                 * for CPU contexts.
                 */
                struct perf_cpu_pmu_context *cpc;

                cpc = *per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
                epc = &cpc->epc;
                raw_spin_lock_irq(&ctx->lock);
                if (!epc->ctx) {
                        /*
                         * One extra reference for the pmu; see perf_pmu_free().
                         */
                        atomic_set(&epc->refcount, 2);
                        epc->embedded = 1;
                        list_add(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
                        epc->ctx = ctx;
                } else {
                        WARN_ON_ONCE(epc->ctx != ctx);
                        atomic_inc(&epc->refcount);
                }
                raw_spin_unlock_irq(&ctx->lock);
                return epc;
        }

        /*
         * Allocate before taking ctx->lock; the allocation is freed again
         * below if an existing entry is found.
         */
        new = kzalloc(sizeof(*epc), GFP_KERNEL);
        if (!new)
                return ERR_PTR(-ENOMEM);

        __perf_init_event_pmu_context(new, pmu);

        /*
         * XXX
         *
         * lockdep_assert_held(&ctx->mutex);
         *
         * can't because perf_event_init_task() doesn't actually hold the
         * child_ctx->mutex.
         */

        raw_spin_lock_irq(&ctx->lock);
        list_for_each_entry(epc, &ctx->pmu_ctx_list, pmu_ctx_entry) {
                if (epc->pmu == pmu) {
                        WARN_ON_ONCE(epc->ctx != ctx);
                        atomic_inc(&epc->refcount);
                        goto found_epc;
                }
                /* Make sure the pmu_ctx_list is sorted by PMU type: */
                if (!pos && epc->pmu->type > pmu->type)
                        pos = epc;
        }

        /* No match: hand ownership of the new allocation to the list. */
        epc = new;
        new = NULL;

        /* Insert before the first entry with a larger type, else at the tail. */
        if (!pos)
                list_add_tail(&epc->pmu_ctx_entry, &ctx->pmu_ctx_list);
        else
                list_add(&epc->pmu_ctx_entry, pos->pmu_ctx_entry.prev);

        epc->ctx = ctx;

found_epc:
        raw_spin_unlock_irq(&ctx->lock);
        /* NULL when the allocation was consumed above. */
        kfree(new);

        return epc;
}

static void get_pmu_ctx(struct perf_event_pmu_context *epc)
{
        WARN_ON_ONCE(!atomic_inc_not_zero(&epc->refcount));
}

static void free_cpc_rcu(struct rcu_head *head)
{
        struct perf_cpu_pmu_context *cpc =
                container_of(head, typeof(*cpc), epc.rcu_head);

        kfree(cpc);
}

static void free_epc_rcu(struct rcu_head *head)
{
        struct perf_event_pmu_context *epc = container_of(head, typeof(*epc), rcu_head);

        kfree(epc);
}

/*
 * Drop one reference on @epc.
 *
 * When the last reference goes away the PMU context is unlinked from its
 * event context under ctx->lock and freed after an RCU grace period; an
 * embedded epc frees its enclosing perf_cpu_pmu_context instead.
 */
static void put_pmu_ctx(struct perf_event_pmu_context *epc)
{
        struct perf_event_context *owner = epc->ctx;
        unsigned long irqflags;

        /*
         * XXX
         *
         * lockdep_assert_held(&ctx->mutex);
         *
         * is not possible here: the _free_event()/put_event() call-site
         * does not always run under ctx->mutex.
         */
        if (!atomic_dec_and_raw_lock_irqsave(&epc->refcount, &owner->lock, irqflags))
                return;

        /* Last reference: owner->lock is held from here on. */
        WARN_ON_ONCE(list_empty(&epc->pmu_ctx_entry));

        list_del_init(&epc->pmu_ctx_entry);
        epc->ctx = NULL;

        WARN_ON_ONCE(!list_empty(&epc->pinned_active));
        WARN_ON_ONCE(!list_empty(&epc->flexible_active));

        raw_spin_unlock_irqrestore(&owner->lock, irqflags);

        call_rcu(&epc->rcu_head, epc->embedded ? free_cpc_rcu : free_epc_rcu);
}

/* Forward declaration: free_event_rcu() below calls this helper. */
static void perf_event_free_filter(struct perf_event *event);

static void free_event_rcu(struct rcu_head *head)
{
        struct perf_event *event = container_of(head, typeof(*event), rcu_head);

        if (event->ns)
                put_pid_ns(event->ns);
        perf_event_free_filter(event);
        kmem_cache_free(perf_event_cache, event);
}

/* Forward declaration: _free_event() calls this with a NULL @rb. */
static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb);

/*
 * Unlink @event from the pmu_sb_events list of event->cpu, holding that
 * list's lock across the RCU list deletion.
 */
static void detach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *cpu_list;

        cpu_list = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&cpu_list->lock);
        list_del_rcu(&event->sb_list);
        raw_spin_unlock(&cpu_list->lock);
}

static bool is_sb_event(struct perf_event *event)
{
        struct perf_event_attr *attr = &event->attr;

        if (event->parent)
                return false;

        if (event->attach_state & PERF_ATTACH_TASK)
                return false;

        if (attr->mmap || attr->mmap_data || attr->mmap2 ||
            attr->comm || attr->comm_exec ||
            attr->task || attr->ksymbol ||
            attr->context_switch || attr->text_poke ||
            attr->bpf_event)
                return true;
        return false;
}

/* Remove @event from the per-CPU sb list if is_sb_event() says it is on one. */
static void unaccount_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        detach_sb_event(event);
}

#ifdef CONFIG_NO_HZ_FULL
/*
 * Held by unaccount_freq_event_nohz() around the nr_freq_events decrement
 * and the matching tick dependency clear.
 */
static DEFINE_SPINLOCK(nr_freq_lock);
#endif

/*
 * Drop one freq event; when the count reaches zero also clear the perf
 * events tick dependency.  Both steps happen under nr_freq_lock.
 * Compiles to an empty function without CONFIG_NO_HZ_FULL.
 */
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        bool was_last;

        spin_lock(&nr_freq_lock);
        was_last = atomic_dec_and_test(&nr_freq_events);
        if (was_last)
                tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Undo the accounting of one freq event.  A plain decrement suffices unless
 * tick_nohz_full_enabled(), in which case the nohz variant is used.
 */
static void unaccount_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_dec(&nr_freq_events);
                return;
        }

        unaccount_freq_event_nohz();
}


/*
 * Allocate a perf_ctx_data wrapper plus its zeroed payload from @ctx_cache.
 *
 * @ctx_cache: slab cache the payload comes from (remembered for freeing)
 * @global:    stored in the ->global field
 *
 * Returns the new object with a reference count of one, or NULL if either
 * allocation fails.
 */
static struct perf_ctx_data *
alloc_perf_ctx_data(struct kmem_cache *ctx_cache, bool global)
{
        struct perf_ctx_data *cd = kzalloc(sizeof(*cd), GFP_KERNEL);

        if (!cd)
                return NULL;

        cd->data = kmem_cache_zalloc(ctx_cache, GFP_KERNEL);
        if (!cd->data)
                goto err_free_cd;

        refcount_set(&cd->refcount, 1);
        cd->ctx_cache = ctx_cache;
        cd->global = global;

        return cd;

err_free_cd:
        kfree(cd);
        return NULL;
}

/*
 * Immediately free @cd: the payload goes back to the cache it was allocated
 * from, then the wrapper itself is released.
 */
static void free_perf_ctx_data(struct perf_ctx_data *cd)
{
        struct kmem_cache *cache = cd->ctx_cache;

        kmem_cache_free(cache, cd->data);
        kfree(cd);
}

/* RCU callback: free the perf_ctx_data that embeds @rcu_head. */
static void __free_perf_ctx_data_rcu(struct rcu_head *rcu_head)
{
        free_perf_ctx_data(container_of(rcu_head, struct perf_ctx_data, rcu_head));
}

/* Queue @cd to be freed by __free_perf_ctx_data_rcu() via call_rcu(). */
static inline void perf_free_ctx_data_rcu(struct perf_ctx_data *cd)
{
        struct rcu_head *head = &cd->rcu_head;

        call_rcu(head, __free_perf_ctx_data_rcu);
}

/*
 * Make sure @task has a live perf_ctx_data attached and hold a reference.
 *
 * @task:      task whose ->perf_ctx_data pointer is updated
 * @ctx_cache: cache used for the payload of a newly allocated object
 * @global:    stored in the ->global field of a newly allocated object
 *
 * A candidate object is allocated up front and installed with cmpxchg.  If
 * a live object is already attached, a reference is taken on it and the
 * candidate is freed; if the attached object is dead (refcount 0) it is
 * replaced by the candidate and freed after an RCU grace period.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int
attach_task_ctx_data(struct task_struct *task, struct kmem_cache *ctx_cache,
                     bool global)
{
        struct perf_ctx_data *cd, *old = NULL;

        cd = alloc_perf_ctx_data(ctx_cache, global);
        if (!cd)
                return -ENOMEM;

        for (;;) {
                /* On failure try_cmpxchg() updates @old to the current value. */
                if (try_cmpxchg((struct perf_ctx_data **)&task->perf_ctx_data, &old, cd)) {
                        if (old)
                                perf_free_ctx_data_rcu(old);
                        return 0;
                }

                if (!old) {
                        /*
                         * After seeing a dead @old, we raced with
                         * removal and lost, try again to install @cd.
                         */
                        continue;
                }

                if (refcount_inc_not_zero(&old->refcount)) {
                        free_perf_ctx_data(cd); /* unused */
                        return 0;
                }

                /*
                 * @old is a dead object, refcount==0 is stable, try and
                 * replace it with @cd.
                 */
        }
        /* Not reached: the loop above only exits through its returns. */
        return 0;
}

/* Forward declaration: used by the error path of attach_global_ctx_data(). */
static void __detach_global_ctx_data(void);
/*
 * global_ctx_data_rwsem is taken for write around the slow paths that attach
 * or detach ctx data on every task; global_ctx_data_ref counts the users of
 * that global attachment.
 */
DEFINE_STATIC_PERCPU_RWSEM(global_ctx_data_rwsem);
static refcount_t global_ctx_data_ref;

/*
 * Take a reference on the global ctx-data attachment, creating it if needed.
 *
 * @ctx_cache: cache used when a task still needs a perf_ctx_data allocated
 *
 * Fast path: global_ctx_data_ref is already non-zero and is simply bumped.
 * Slow path (under the write side of global_ctx_data_rwsem): walk every
 * thread, marking existing ctx data as global and allocating ctx data for
 * threads that have none, then set the reference count to one.
 *
 * Returns 0 on success, or the attach_task_ctx_data() error after undoing
 * the partial work with __detach_global_ctx_data().
 */
static int
attach_global_ctx_data(struct kmem_cache *ctx_cache)
{
        struct task_struct *g, *p;
        struct perf_ctx_data *cd;
        int ret;

        if (refcount_inc_not_zero(&global_ctx_data_ref))
                return 0;

        guard(percpu_write)(&global_ctx_data_rwsem);
        /* Re-check now that the rwsem is held for write. */
        if (refcount_inc_not_zero(&global_ctx_data_ref))
                return 0;
again:
        /* Allocate everything */
        scoped_guard (rcu) {
                for_each_process_thread(g, p) {
                        cd = rcu_dereference(p->perf_ctx_data);
                        if (cd && !cd->global) {
                                cd->global = 1;
                                /* A dead object is treated like a missing one. */
                                if (!refcount_inc_not_zero(&cd->refcount))
                                        cd = NULL;
                        }
                        if (!cd) {
                                /* Pin @p with a reference before leaving the RCU scope. */
                                get_task_struct(p);
                                goto alloc;
                        }
                }
        }

        refcount_set(&global_ctx_data_ref, 1);

        return 0;
alloc:
        ret = attach_task_ctx_data(p, ctx_cache, true);
        put_task_struct(p);
        if (ret) {
                __detach_global_ctx_data();
                return ret;
        }
        /* Restart the walk from the beginning after every allocation. */
        goto again;
}

/*
 * Attach PMU task-context data for @event.
 *
 * An event with a target task attaches data to that task only; any other
 * event attaches data globally and, on success, is flagged with
 * PERF_ATTACH_GLOBAL_DATA.
 *
 * Returns 0 on success, -ENOMEM when the PMU has no task_ctx_cache, or the
 * error reported by the attach helper.
 */
static int
attach_perf_ctx_data(struct perf_event *event)
{
        struct kmem_cache *cache = event->pmu->task_ctx_cache;
        struct task_struct *target = event->hw.target;
        int err;

        if (!cache)
                return -ENOMEM;

        if (target)
                return attach_task_ctx_data(target, cache, false);

        err = attach_global_ctx_data(cache);
        if (!err)
                event->attach_state |= PERF_ATTACH_GLOBAL_DATA;

        return err;
}

/*
 * Drop one reference on the perf_ctx_data attached to task @p.
 *
 * When the last reference is dropped, the pointer in @p is cleared with
 * cmpxchg and the object is freed after an RCU grace period.  Nothing is
 * done if @p has no ctx data attached.
 */
static void
detach_task_ctx_data(struct task_struct *p)
{
        struct perf_ctx_data *cd;

        scoped_guard (rcu) {
                cd = rcu_dereference(p->perf_ctx_data);
                /* Only the path that drops the final reference continues. */
                if (!cd || !refcount_dec_and_test(&cd->refcount))
                        return;
        }

        /*
         * The old ctx_data may be lost because of the race.
         * Nothing is required to do for the case.
         * See attach_task_ctx_data().
         */
        if (try_cmpxchg((struct perf_ctx_data **)&p->perf_ctx_data, &cd, NULL))
                perf_free_ctx_data_rcu(cd);
}

/*
 * Undo the global attachment: for every thread whose ctx data is marked
 * global, clear the mark and drop one reference via detach_task_ctx_data().
 *
 * The thread walk runs under RCU; each hit leaves the RCU scope holding a
 * task reference, detaches, and then restarts the walk from the beginning.
 */
static void __detach_global_ctx_data(void)
{
        struct task_struct *g, *p;
        struct perf_ctx_data *cd;

again:
        scoped_guard (rcu) {
                for_each_process_thread(g, p) {
                        cd = rcu_dereference(p->perf_ctx_data);
                        if (!cd || !cd->global)
                                continue;
                        /* Clearing the mark ensures this entry is skipped next pass. */
                        cd->global = 0;
                        get_task_struct(p);
                        goto detach;
                }
        }
        return;
detach:
        detach_task_ctx_data(p);
        put_task_struct(p);
        goto again;
}

/*
 * Drop one reference on the global ctx-data attachment.
 *
 * Any reference but the last is dropped without taking a lock.  The final
 * drop is performed under the write side of global_ctx_data_rwsem and, if
 * the count really reaches zero, tears down the per-task data.
 */
static void detach_global_ctx_data(void)
{
        /* Fast path: succeeds unless the count is exactly one. */
        if (refcount_dec_not_one(&global_ctx_data_ref))
                return;

        guard(percpu_write)(&global_ctx_data_rwsem);

        /* Remove everything once the last reference is gone. */
        if (refcount_dec_and_test(&global_ctx_data_ref))
                __detach_global_ctx_data();
}

/*
 * Release the PMU task-context data held on behalf of @event.
 *
 * PERF_ATTACH_TASK_DATA is always cleared.  An event with a target task
 * drops that task's data; otherwise the global attachment is dropped, but
 * only if the event is flagged PERF_ATTACH_GLOBAL_DATA.
 */
static void detach_perf_ctx_data(struct perf_event *event)
{
        struct task_struct *target = event->hw.target;

        event->attach_state &= ~PERF_ATTACH_TASK_DATA;

        if (target) {
                detach_task_ctx_data(target);
                return;
        }

        if (!(event->attach_state & PERF_ATTACH_GLOBAL_DATA))
                return;

        detach_global_ctx_data();
        event->attach_state &= ~PERF_ATTACH_GLOBAL_DATA;
}

static void unaccount_event(struct perf_event *event)
{
        bool dec = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                dec = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_dec(&nr_mmap_events);
        if (event->attr.build_id)
                atomic_dec(&nr_build_id_events);
        if (event->attr.comm)
                atomic_dec(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_dec(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_dec(&nr_cgroup_events);
        if (event->attr.task)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
                unaccount_freq_event();
        if (event->attr.context_switch) {
                dec = true;
                atomic_dec(&nr_switch_events);
        }
        if (is_cgroup_event(event))
                dec = true;
        if (has_branch_stack(event))
                dec = true;
        if (event->attr.ksymbol)
                atomic_dec(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_dec(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_dec(&nr_text_poke_events);

        if (dec) {
                if (!atomic_add_unless(&perf_sched_count, -1, 1))
                        schedule_delayed_work(&perf_sched_work, HZ);
        }

        unaccount_pmu_sb_event(event);
}

static void perf_sched_delayed(struct work_struct *work)
{
        mutex_lock(&perf_sched_mutex);
        if (atomic_dec_and_test(&perf_sched_count))
                static_branch_disable(&perf_sched_events);
        mutex_unlock(&perf_sched_mutex);
}

/*
 * The following functions implement mutual exclusion of events on
 * "exclusive" pmus (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one
 * event scheduled at a time, so we disallow creating events that might
 * conflict, namely:
 *
 *  1) cpu-wide events in the presence of per-task events,
 *  2) per-task events in the presence of cpu-wide events,
 *  3) two matching events on the same perf_event_context.
 *
 * The first two cases are handled in the allocation path (perf_event_alloc(),
 * _free_event()), the third -- before the first perf_install_in_context().
 */
static int exclusive_event_init(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        if (!is_exclusive_pmu(pmu))
                return 0;

        /*
         * Prevent co-existence of per-task and cpu-wide events on the
         * same exclusive pmu.
         *
         * Negative pmu::exclusive_cnt means there are cpu-wide
         * events on this "exclusive" pmu, positive means there are
         * per-task events.
         *
         * Since this is called in perf_event_alloc() path, event::ctx
         * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
         * to mean "per-task event", because unlike other attach states it
         * never gets cleared.
         */
        if (event->attach_state & PERF_ATTACH_TASK) {
                if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
                        return -EBUSY;
        } else {
                if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
                        return -EBUSY;
        }

        event->attach_state |= PERF_ATTACH_EXCLUSIVE;

        return 0;
}

static void exclusive_event_destroy(struct perf_event *event)
{
        struct pmu *pmu = event->pmu;

        /* see comment in exclusive_event_init() */
        if (event->attach_state & PERF_ATTACH_TASK)
                atomic_dec(&pmu->exclusive_cnt);
        else
                atomic_inc(&pmu->exclusive_cnt);

        event->attach_state &= ~PERF_ATTACH_EXCLUSIVE;
}

/*
 * Two events match when they share a PMU and their CPUs are equal or at
 * least one of them has cpu == -1.
 */
static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
{
        if (e1->pmu != e2->pmu)
                return false;

        return e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1;
}

/*
 * Check whether @event may be installed into @ctx.
 *
 * Always true for non-exclusive PMUs.  For an exclusive PMU, false as soon
 * as any event already on ctx->event_list matches @event.  The caller must
 * hold ctx->mutex.
 */
static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
{
        struct perf_event *other;

        lockdep_assert_held(&ctx->mutex);

        if (is_exclusive_pmu(event->pmu)) {
                list_for_each_entry(other, &ctx->event_list, event_entry) {
                        if (exclusive_event_match(other, event))
                                return false;
                }
        }

        return true;
}

/* Forward declaration: _free_event() below calls this helper. */
static void perf_free_addr_filters(struct perf_event *event);

/*
 * vs perf_event_alloc() error
 *
 * Release everything @event may hold, checking each resource individually
 * so that a partially constructed event can be torn down too, then queue
 * the object itself for freeing after an RCU grace period.
 *
 * The order of the steps below matters; see the comments on the individual
 * steps.
 */
static void __free_event(struct perf_event *event)
{
        if (event->attach_state & PERF_ATTACH_CALLCHAIN)
                put_callchain_buffers();

        kfree(event->addr_filter_ranges);

        if (event->attach_state & PERF_ATTACH_EXCLUSIVE)
                exclusive_event_destroy(event);

        if (is_cgroup_event(event))
                perf_detach_cgroup(event);

        if (event->attach_state & PERF_ATTACH_TASK_DATA)
                detach_perf_ctx_data(event);

        /* PMU-specific teardown hook, if one was installed. */
        if (event->destroy)
                event->destroy(event);

        /*
         * Must be after ->destroy(), due to uprobe_perf_close() using
         * hw.target.
         */
        if (event->hw.target)
                put_task_struct(event->hw.target);

        if (event->pmu_ctx) {
                /*
                 * put_pmu_ctx() needs an event->ctx reference, because of
                 * epc->ctx.
                 */
                WARN_ON_ONCE(!event->ctx);
                WARN_ON_ONCE(event->pmu_ctx->ctx != event->ctx);
                put_pmu_ctx(event->pmu_ctx);
        }

        /*
         * perf_event_free_task() relies on put_ctx() being 'last', in
         * particular all task references must be cleaned up.
         */
        if (event->ctx)
                put_ctx(event->ctx);

        if (event->pmu)
                module_put(event->pmu->module);

        call_rcu(&event->rcu_head, free_event_rcu);
}

/*
 * Scope-based cleanup helper named __free_event: runs __free_event() on a
 * non-NULL struct perf_event pointer.
 */
DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T))

/*
 * vs perf_event_alloc() success
 *
 * Tear down a fully constructed event: wait for its pending irq_work, undo
 * the global accounting, release the security state, detach the ring
 * buffer, free the BPF program and address filters, and finally hand the
 * event to __free_event() for the common release path.
 */
static void _free_event(struct perf_event *event)
{
        /* Wait for both irq_work items before tearing anything down. */
        irq_work_sync(&event->pending_irq);
        irq_work_sync(&event->pending_disable_irq);

        unaccount_event(event);

        security_perf_event_free(event);

        if (event->rb) {
                /*
                 * Can happen when we close an event with re-directed output.
                 *
                 * Since we have a 0 refcount, perf_mmap_close() will skip
                 * over us; possibly making our ring_buffer_put() the last.
                 */
                mutex_lock(&event->mmap_mutex);
                ring_buffer_attach(event, NULL);
                mutex_unlock(&event->mmap_mutex);
        }

        perf_event_free_bpf_prog(event);
        perf_free_addr_filters(event);

        __free_event(event);
}

/*
 * Used to free events which have a known refcount of 1, such as in error paths
 * where the event isn't exposed yet and inherited events.
 *
 * The reference count is moved from 1 to 0 with a cmpxchg.  If it was not 1,
 * a warning is emitted and the event is deliberately leaked instead of
 * being freed.
 */
static void free_event(struct perf_event *event)
{
        if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
                                "unexpected event refcount: %ld; ptr=%p\n",
                                atomic_long_read(&event->refcount), event)) {
                /* leak to avoid use-after-free */
                return;
        }

        _free_event(event);
}

/*
 * Remove user event from the owner task.
 *
 * If @event still has an owner, the event is unlinked from the owner's list
 * under owner->perf_event_mutex and event->owner is cleared.  Nothing is
 * done when no owner is observed.
 */
static void perf_remove_from_owner(struct perf_event *event)
{
        struct task_struct *owner;

        rcu_read_lock();
        /*
         * Matches the smp_store_release() in perf_event_exit_task(). If we
         * observe !owner it means the list deletion is complete and we can
         * indeed free this event, otherwise we need to serialize on
         * owner->perf_event_mutex.
         */
        owner = READ_ONCE(event->owner);
        if (owner) {
                /*
                 * Since delayed_put_task_struct() also drops the last
                 * task reference we can safely take a new reference
                 * while holding the rcu_read_lock().
                 */
                get_task_struct(owner);
        }
        rcu_read_unlock();

        if (owner) {
                /*
                 * If we're here through perf_event_exit_task() we're already
                 * holding ctx->mutex which would be an inversion wrt. the
                 * normal lock order.
                 *
                 * However we can safely take this lock because it's the child
                 * ctx->mutex.
                 */
                mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);

                /*
                 * We have to re-check the event->owner field, if it is cleared
                 * we raced with perf_event_exit_task(), acquiring the mutex
                 * ensured they're done, and we can proceed with freeing the
                 * event.
                 */
                if (event->owner) {
                        list_del_init(&event->owner_entry);
                        smp_store_release(&event->owner, NULL);
                }
                mutex_unlock(&owner->perf_event_mutex);
                /* Drop the reference taken under rcu_read_lock() above. */
                put_task_struct(owner);
        }
}

/*
 * Drop one reference on @event.
 *
 * If that was the last reference the event is freed and the reference it
 * held on its parent is dropped in turn, walking up the parent chain until
 * an event survives or an event without a parent has been freed.
 */
static void put_event(struct perf_event *event)
{
        do {
                struct perf_event *parent;

                if (!atomic_long_dec_and_test(&event->refcount))
                        return;

                /* Read ->parent before the event is freed. */
                parent = event->parent;
                _free_event(event);

                /* Matches the refcount bump in inherit_event() */
                event = parent;
        } while (event);
}

/*
 * Kill an event dead; while event:refcount will preserve the event
 * object, it will not preserve its functionality. Once the last 'user'
 * gives up the object, we'll destroy the thing.
 *
 * @event: event to release; one reference is dropped via put_event()
 *
 * The event is removed from its owner (unless it is a kernel event) and
 * from its context.  Every child on event->child_list is then removed from
 * the child's own context, collected on a local list and released.
 *
 * Always returns 0.
 */
int perf_event_release_kernel(struct perf_event *event)
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *child, *tmp;
        LIST_HEAD(free_list);

        /*
         * If we got here through err_alloc: free_event(event); we will not
         * have attached to a context yet.
         */
        if (!ctx) {
                WARN_ON_ONCE(event->attach_state &
                                (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
                goto no_ctx;
        }

        if (!is_kernel_event(event))
                perf_remove_from_owner(event);

        ctx = perf_event_ctx_lock(event);
        WARN_ON_ONCE(ctx->parent_ctx);

        /*
         * Mark this event as STATE_DEAD, there is no external reference to it
         * anymore.
         *
         * Anybody acquiring event->child_mutex after the below loop _must_
         * also see this, most importantly inherit_event() which will avoid
         * placing more children on the list.
         *
         * Thus this guarantees that we will in fact observe and kill _ALL_
         * child events.
         */
        perf_remove_from_context(event, DETACH_GROUP|DETACH_DEAD);

        perf_event_ctx_unlock(event, ctx);

again:
        mutex_lock(&event->child_mutex);
        /*
         * Each pass handles at most the first child and then jumps back to
         * 'again'; the loop only falls through once the list is empty.
         */
        list_for_each_entry(child, &event->child_list, child_list) {
                void *var = NULL;

                /*
                 * Cannot change, child events are not migrated, see the
                 * comment with perf_event_ctx_lock_nested().
                 */
                ctx = READ_ONCE(child->ctx);
                /*
                 * Since child_mutex nests inside ctx::mutex, we must jump
                 * through hoops. We start by grabbing a reference on the ctx.
                 *
                 * Since the event cannot get freed while we hold the
                 * child_mutex, the context must also exist and have a !0
                 * reference count.
                 */
                get_ctx(ctx);

                /*
                 * Now that we have a ctx ref, we can drop child_mutex, and
                 * acquire ctx::mutex without fear of it going away. Then we
                 * can re-acquire child_mutex.
                 */
                mutex_unlock(&event->child_mutex);
                mutex_lock(&ctx->mutex);
                mutex_lock(&event->child_mutex);

                /*
                 * Now that we hold ctx::mutex and child_mutex, revalidate our
                 * state, if child is still the first entry, it didn't get freed
                 * and we can continue doing so.
                 */
                tmp = list_first_entry_or_null(&event->child_list,
                                               struct perf_event, child_list);
                if (tmp == child) {
                        perf_remove_from_context(child, DETACH_GROUP);
                        list_move(&child->child_list, &free_list);
                } else {
                        /* Child changed underneath us; remember to wake waiters. */
                        var = &ctx->refcount;
                }

                mutex_unlock(&event->child_mutex);
                mutex_unlock(&ctx->mutex);
                put_ctx(ctx);

                if (var) {
                        /*
                         * If perf_event_free_task() has deleted all events from the
                         * ctx while the child_mutex got released above, make sure to
                         * notify about the preceding put_ctx().
                         */
                        smp_mb(); /* pairs with wait_var_event() */
                        wake_up_var(var);
                }
                goto again;
        }
        mutex_unlock(&event->child_mutex);

        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
                /* Take the address before put_event() may free the child. */
                void *var = &child->ctx->refcount;

                list_del(&child->child_list);
                /* Last reference unless ->pending_task work is pending */
                put_event(child);

                /*
                 * Wake any perf_event_free_task() waiting for this event to be
                 * freed.
                 */
                smp_mb(); /* pairs with wait_var_event() */
                wake_up_var(var);
        }

no_ctx:
        /*
         * Last reference unless ->pending_task work is pending on this event
         * or any of its children.
         */
        put_event(event);
        return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);

/*
 * File ->release handler, called when the last reference to the file is
 * gone: releases the perf_event stored in file->private_data.
 * Always returns 0.
 */
static int perf_release(struct inode *inode, struct file *file)
{
        struct perf_event *event = file->private_data;

        (void)perf_event_release_kernel(event);

        return 0;
}

/*
 * Sum the count of @event and of every event on its child_list, calling
 * perf_event_read() on each one before its count is taken.
 *
 * @enabled and @running are zeroed first and then accumulate the
 * total_time_{enabled,running} of the event, its child_total_time_*
 * atomics and each listed child. event->child_mutex is held across the
 * whole accumulation.
 *
 * Returns the summed count.
 */
static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event *kid;
        u64 sum;

        *enabled = 0;
        *running = 0;

        mutex_lock(&event->child_mutex);

        /* The event itself, including its child_total_time_* atomics. */
        (void)perf_event_read(event, false);
        sum = perf_event_count(event, false);

        *enabled += event->total_time_enabled +
                    atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                    atomic64_read(&event->child_total_time_running);

        /* Every event currently on the child list. */
        list_for_each_entry(kid, &event->child_list, child_list) {
                (void)perf_event_read(kid, false);
                sum += perf_event_count(kid, false);
                *enabled += kid->total_time_enabled;
                *running += kid->total_time_running;
        }

        mutex_unlock(&event->child_mutex);

        return sum;
}

/*
 * Locked wrapper around __perf_event_read_value(): the event's context is
 * taken with perf_event_ctx_lock() for the duration of the read.
 *
 * Returns the count; @enabled and @running receive the accumulated times.
 */
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
        struct perf_event_context *locked_ctx;
        u64 value;

        locked_ctx = perf_event_ctx_lock(event);
        value = __perf_event_read_value(event, enabled, running);
        perf_event_ctx_unlock(event, locked_ctx);

        return value;
}
EXPORT_SYMBOL_GPL(perf_event_read_value);

/*
 * Accumulate the read values of one group into @values.
 *
 * @leader:      leader of the group (leader plus siblings) to read
 * @read_format: PERF_FORMAT_* bits selecting which fields are emitted
 * @values:      output array; slot 0 (@nr) is skipped here, times and counts
 *               are added (+=) while ID and LOST slots are overwritten (=)
 *
 * Returns 0 on success, the perf_event_read() error if reading the leader
 * fails, or -ECHILD when the leader's grouping no longer matches its
 * parent's.
 */
static int __perf_read_group_add(struct perf_event *leader,
                                        u64 read_format, u64 *values)
{
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *sub, *parent;
        unsigned long flags;
        int n = 1; /* skip @nr */
        int ret;

        ret = perf_event_read(leader, true);
        if (ret)
                return ret;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /*
         * Verify the grouping between the parent and child (inherited)
         * events is still intact.
         *
         * Specifically:
         *  - leader->ctx->lock pins leader->sibling_list
         *  - parent->child_mutex pins parent->child_list
         *  - parent->ctx->mutex pins parent->sibling_list
         *
         * Because parent->ctx != leader->ctx (and child_list nests inside
         * ctx->mutex), group destruction is not atomic between children, also
         * see perf_event_release_kernel(). Additionally, parent can grow the
         * group.
         *
         * Therefore it is possible to have parent and child groups in a
         * different configuration and summing over such a beast makes no sense
         * whatsoever.
         *
         * Reject this.
         */
        parent = leader->parent;
        if (parent &&
            (parent->group_generation != leader->group_generation ||
             parent->nr_siblings != leader->nr_siblings)) {
                ret = -ECHILD;
                goto unlock;
        }

        /*
         * Since we co-schedule groups, {enabled,running} times of siblings
         * will be identical to those of the leader, so we only publish one
         * set.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
                values[n++] += leader->total_time_enabled +
                        atomic64_read(&leader->child_total_time_enabled);
        }

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
                values[n++] += leader->total_time_running +
                        atomic64_read(&leader->child_total_time_running);
        }

        /*
         * Write {count,id} tuples for every sibling.
         */
        /* The leader's own tuple comes first. */
        values[n++] += perf_event_count(leader, false);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
        if (read_format & PERF_FORMAT_LOST)
                values[n++] = atomic64_read(&leader->lost_samples);

        for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub, false);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
                if (read_format & PERF_FORMAT_LOST)
                        values[n++] = atomic64_read(&sub->lost_samples);
        }

unlock:
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
        return ret;
}

/*
 * Read a whole group into a temporary buffer and copy it to user space.
 *
 * The buffer is event->read_size bytes and slot 0 holds the member count
 * (leader plus siblings). The leader's group is accumulated first, then the
 * group of every event on leader->child_list, all under
 * leader->child_mutex. The caller must hold the leader's ctx->mutex.
 *
 * Returns event->read_size on success, -ENOMEM if the buffer cannot be
 * allocated, -EFAULT if the copy to @buf fails, or the first error reported
 * by __perf_read_group_add().
 */
static int perf_read_group(struct perf_event *event,
                                   u64 read_format, char __user *buf)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event_context *ctx = leader->ctx;
        struct perf_event *kid;
        u64 *vals;
        int err;

        lockdep_assert_held(&ctx->mutex);

        vals = kzalloc(event->read_size, GFP_KERNEL);
        if (!vals)
                return -ENOMEM;

        vals[0] = 1 + leader->nr_siblings;

        mutex_lock(&leader->child_mutex);
        err = __perf_read_group_add(leader, read_format, vals);
        if (!err) {
                list_for_each_entry(kid, &leader->child_list, child_list) {
                        err = __perf_read_group_add(kid, read_format, vals);
                        if (err)
                                break;
                }
        }
        mutex_unlock(&leader->child_mutex);

        /* Only a fully successful accumulation is handed to user space. */
        if (!err) {
                err = event->read_size;
                if (copy_to_user(buf, vals, event->read_size))
                        err = -EFAULT;
        }

        kfree(vals);
        return err;
}

/*
 * Read a single (non-group) event and copy the record to user space.
 *
 * The record is the count followed by the optional fields selected by
 * @read_format, in this order: time enabled, time running, ID, lost samples.
 *
 * Returns the number of bytes copied, or -EFAULT if the copy to @buf fails.
 */
static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
{
        u64 time_enabled, time_running;
        u64 out[5];
        size_t bytes;
        int nr = 0;

        out[nr++] = __perf_event_read_value(event, &time_enabled, &time_running);

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                out[nr++] = time_enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                out[nr++] = time_running;

        if (read_format & PERF_FORMAT_ID)
                out[nr++] = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                out[nr++] = atomic64_read(&event->lost_samples);

        /* Only the slots actually filled in are copied out. */
        bytes = nr * sizeof(u64);
        if (copy_to_user(buf, out, bytes))
                return -EFAULT;

        return bytes;
}

/*
 * Report whether @event counts as hung up: its state is at or below
 * PERF_EVENT_STATE_EXIT and its child_list is empty (checked under
 * child_mutex).
 */
static bool is_event_hup(struct perf_event *event)
{
        bool childless = false;

        if (event->state <= PERF_EVENT_STATE_EXIT) {
                mutex_lock(&event->child_mutex);
                childless = list_empty(&event->child_list);
                mutex_unlock(&event->child_mutex);
        }

        return childless;
}

/*
 * Read the performance event - simple non blocking version for now.
 *
 * Returns 0 (end-of-file) for an event in PERF_EVENT_STATE_ERROR, -ENOSPC
 * when @count is smaller than event->read_size, and otherwise the result of
 * perf_read_group() or perf_read_one(), selected by PERF_FORMAT_GROUP.
 */
static ssize_t
__perf_read(struct perf_event *event, char __user *buf, size_t count)
{
        const u64 format = event->attr.read_format;
        int nread;

        /*
         * An event in error state (i.e. it was pinned but could not be
         * scheduled on to the CPU at some point) reads as end-of-file.
         */
        if (event->state == PERF_EVENT_STATE_ERROR)
                return 0;

        if (count < event->read_size)
                return -ENOSPC;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        nread = (format & PERF_FORMAT_GROUP) ?
                perf_read_group(event, format, buf) :
                perf_read_one(event, format, buf);

        return nread;
}

/*
 * File read entry point: after security_perf_event_read() approves, the
 * read is performed by __perf_read() with the event's context locked.
 *
 * Returns the security hook's error, or whatever __perf_read() returns.
 * @ppos is not used.
 */
static ssize_t
perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *locked_ctx;
        int rc;

        rc = security_perf_event_read(event);
        if (rc != 0)
                return rc;

        locked_ctx = perf_event_ctx_lock(event);
        rc = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, locked_ctx);

        return rc;
}

/*
 * Poll entry point for a perf event file.
 *
 * Returns EPOLLHUP when is_event_hup() says so, EPOLLERR for a pinned event
 * in PERF_EVENT_STATE_ERROR, and otherwise the pending poll bits taken (and
 * cleared) from the attached buffer. When no buffer is attached the result
 * stays EPOLLHUP.
 */
static __poll_t perf_poll(struct file *file, poll_table *wait)
{
        struct perf_event *event = file->private_data;
        __poll_t mask = EPOLLHUP;
        struct perf_buffer *buffer;

        poll_wait(file, &event->waitq, wait);

        if (is_event_hup(event))
                return mask;

        if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR &&
                     event->attr.pinned))
                return EPOLLERR;

        /*
         * Pin the event->rb by taking event->mmap_mutex; otherwise
         * perf_event_set_output() can swizzle our rb and make us miss wakeups.
         */
        mutex_lock(&event->mmap_mutex);
        buffer = event->rb;
        if (buffer)
                mask = atomic_xchg(&buffer->poll, 0);
        mutex_unlock(&event->mmap_mutex);

        return mask;
}

/*
 * Reset @event's counter: call perf_event_read() on it, set event->count
 * to zero, then refresh the user page via perf_event_update_userpage().
 */
static void _perf_event_reset(struct perf_event *event)
{
        /* The read's return value is deliberately ignored. */
        (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
}

/*
 * Disable @event and return the value event->count held right after the
 * disable; when @reset is true the counter is then set to zero. Everything
 * happens with the event's context locked.
 *
 * Assume it's not an event with inherit set (a one-time warning fires
 * otherwise).
 */
u64 perf_event_pause(struct perf_event *event, bool reset)
{
        struct perf_event_context *locked_ctx;
        u64 snapshot;

        locked_ctx = perf_event_ctx_lock(event);

        WARN_ON_ONCE(event->attr.inherit);

        _perf_event_disable(event);
        snapshot = local64_read(&event->count);
        if (reset)
                local64_set(&event->count, 0);

        perf_event_ctx_unlock(event, locked_ctx);

        return snapshot;
}
EXPORT_SYMBOL_GPL(perf_event_pause);

/*
 * Holding the top-level event's child_mutex means that any
 * descendant process that has inherited this event will block
 * in perf_event_exit_event() if it goes to exit, thus satisfying the
 * task existence requirements of perf_event_enable/disable.
 */
static void perf_event_for_each_child(struct perf_event *event,
                                        void (*func)(struct perf_event *))
{
        struct perf_event *child;

        WARN_ON_ONCE(event->ctx->parent_ctx);

        mutex_lock(&event->child_mutex);
        func(event);
        list_for_each_entry(child, &event->child_list, child_list)
                func(child);
        mutex_unlock(&event->child_mutex);
}

static void perf_event_for_each(struct perf_event *event,
                                  void (*func)(struct perf_event *))
{
        struct perf_event_context *ctx = event->ctx;
        struct perf_event *sibling;

        lockdep_assert_held(&ctx->mutex);

        event = event->group_leader;

        perf_event_for_each_child(event, func);
        for_each_sibling_event(sibling, event)
                perf_event_for_each_child(sibling, func);
}

/*
 * Install a new sample period or frequency on @event.
 *
 * @info points to the u64 value. With attr.freq set it is stored in
 * attr.sample_freq; otherwise it is stored in both attr.sample_period and
 * hw.sample_period. An event in PERF_EVENT_STATE_ACTIVE is stopped before
 * hw.period_left is cleared and started again afterwards, with its PMU
 * disabled for that whole window. @cpuctx and @ctx are not used here.
 */
static void __perf_event_period(struct perf_event *event,
                                struct perf_cpu_context *cpuctx,
                                struct perf_event_context *ctx,
                                void *info)
{
        u64 value = *((u64 *)info);
        bool active;

        if (event->attr.freq) {
                event->attr.sample_freq = value;
        } else {
                event->attr.sample_period = value;
                event->hw.sample_period = value;
        }

        active = (event->state == PERF_EVENT_STATE_ACTIVE);
        if (active) {
                perf_pmu_disable(event->pmu);
                /*
                 * We could be throttled; unthrottle now to avoid the tick
                 * trying to unthrottle while we already re-started the event.
                 */
                if (event->hw.interrupts == MAX_INTERRUPTS) {
                        event->hw.interrupts = 0;
                        perf_log_throttle(event, 1);
                }
                event->pmu->stop(event, PERF_EF_UPDATE);
        }

        /* Cleared for active and inactive events alike. */
        local64_set(&event->hw.period_left, 0);

        if (active) {
                event->pmu->start(event, PERF_EF_RELOAD);
                perf_pmu_enable(event->pmu);
        }
}

/*
 * Ask the event's PMU, through its check_period() callback, about @value
 * as a sample period. The callback's result is returned unchanged.
 */
static int perf_event_check_period(struct perf_event *event, u64 value)
{
        int verdict = event->pmu->check_period(event, value);

        return verdict;
}

/*
 * Validate and apply a new sample period (or frequency) for @event.
 *
 * Returns -EINVAL when the event is not a sampling event, when @value is 0,
 * when a frequency exceeds sysctl_perf_event_sample_rate, or when a period
 * is rejected by perf_event_check_period() or has bit 63 set. Otherwise the
 * change is applied through event_function_call() and 0 is returned.
 */
static int _perf_event_period(struct perf_event *event, u64 value)
{
        if (!is_sampling_event(event) || !value)
                return -EINVAL;

        if (event->attr.freq) {
                if (value > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else if (perf_event_check_period(event, value) ||
                   (value & (1ULL << 63))) {
                return -EINVAL;
        }

        event_function_call(event, __perf_event_period, &value);

        return 0;
}

/*
 * Locked wrapper around _perf_event_period(): the event's context is held
 * via perf_event_ctx_lock() while the new period is validated and applied.
 *
 * Returns the result of _perf_event_period().
 */
int perf_event_period(struct perf_event *event, u64 value)
{
        struct perf_event_context *locked_ctx;
        int rc;

        locked_ctx = perf_event_ctx_lock(event);
        rc = _perf_event_period(event, value);
        perf_event_ctx_unlock(event, locked_ctx);

        return rc;
}
EXPORT_SYMBOL_GPL(perf_event_period);

static const struct file_operations perf_fops;

static inline bool is_perf_file(struct fd f)
{
        return !fd_empty(f) && fd_file(f)->f_op == &perf_fops;
}

static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr);

/*
 * Dispatch one perf ioctl @cmd on @event.
 *
 * ENABLE, DISABLE and RESET only select a per-event function inside the
 * switch; it is applied afterwards, either to the whole group (when @arg
 * has PERF_IOC_FLAG_GROUP set) or to the event and its children. Every
 * other command is fully handled, and returned from, inside the switch.
 *
 * Returns 0 or the handler's own result, -EFAULT on a failed user copy,
 * -EBADF when SET_OUTPUT is given a file that is not a perf file, -EINVAL
 * from PAUSE_OUTPUT when there is no buffer with pages, and -ENOTTY for an
 * unknown command.
 */
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
        void (*func)(struct perf_event *);
        /* Only the low 32 bits of @arg are looked at as flags. */
        u32 flags = arg;

        switch (cmd) {
        case PERF_EVENT_IOC_ENABLE:
                func = _perf_event_enable;
                break;
        case PERF_EVENT_IOC_DISABLE:
                func = _perf_event_disable;
                break;
        case PERF_EVENT_IOC_RESET:
                func = _perf_event_reset;
                break;

        case PERF_EVENT_IOC_REFRESH:
                return _perf_event_refresh(event, arg);

        case PERF_EVENT_IOC_PERIOD:
        {
                u64 value;

                /* @arg is a user pointer to the new u64 period/frequency. */
                if (copy_from_user(&value, (u64 __user *)arg, sizeof(value)))
                        return -EFAULT;

                return _perf_event_period(event, value);
        }
        case PERF_EVENT_IOC_ID:
        {
                u64 id = primary_event_id(event);

                if (copy_to_user((void __user *)arg, &id, sizeof(id)))
                        return -EFAULT;
                return 0;
        }

        case PERF_EVENT_IOC_SET_OUTPUT:
        {
                CLASS(fd, output)(arg);             // arg == -1 => empty
                struct perf_event *output_event = NULL;
                /* -1 leaves output_event NULL; any other value must name a perf file. */
                if (arg != -1) {
                        if (!is_perf_file(output))
                                return -EBADF;
                        output_event = fd_file(output)->private_data;
                }
                return perf_event_set_output(event, output_event);
        }

        case PERF_EVENT_IOC_SET_FILTER:
                return perf_event_set_filter(event, (void __user *)arg);

        case PERF_EVENT_IOC_SET_BPF:
        {
                struct bpf_prog *prog;
                int err;

                prog = bpf_prog_get(arg);
                if (IS_ERR(prog))
                        return PTR_ERR(prog);

                err = perf_event_set_bpf_prog(event, prog, 0);
                if (err) {
                        /* Attaching failed: drop the program obtained above. */
                        bpf_prog_put(prog);
                        return err;
                }

                return 0;
        }

        case PERF_EVENT_IOC_PAUSE_OUTPUT: {
                struct perf_buffer *rb;

                rcu_read_lock();
                rb = rcu_dereference(event->rb);
                if (!rb || !rb->nr_pages) {
                        rcu_read_unlock();
                        return -EINVAL;
                }
                /* Any non-zero @arg pauses, zero unpauses. */
                rb_toggle_paused(rb, !!arg);
                rcu_read_unlock();
                return 0;
        }

        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);

        case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
                struct perf_event_attr new_attr;
                int err = perf_copy_attr((struct perf_event_attr __user *)arg,
                                         &new_attr);

                if (err)
                        return err;

                return perf_event_modify_attr(event,  &new_attr);
        }
        default:
                return -ENOTTY;
        }

        /* Only ENABLE, DISABLE and RESET reach this point, with func set. */
        if (flags & PERF_IOC_FLAG_GROUP)
                perf_event_for_each(event, func);
        else
                perf_event_for_each_child(event, func);

        return 0;
}

/*
 * ioctl entry point for a perf event file: checks
 * security_perf_event_write(), then runs _perf_ioctl() with the event's
 * context locked.
 *
 * Returns the security hook's error, or the result of _perf_ioctl().
 */
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct perf_event *event = file->private_data;
        struct perf_event_context *locked_ctx;
        long rc;

        /* Treat ioctl like writes as it is likely a mutating operation. */
        rc = security_perf_event_write(event);
        if (rc != 0)
                return rc;

        locked_ctx = perf_event_ctx_lock(event);
        rc = _perf_ioctl(event, cmd, arg);
        perf_event_ctx_unlock(event, locked_ctx);

        return rc;
}

#ifdef CONFIG_COMPAT
static long perf_compat_ioctl(struct file *file, unsigned int cmd,
                                unsigned long arg)
{
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
        case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
        case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
                /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                        cmd &= ~IOCSIZE_MASK;
                        cmd |= sizeof(void *) << IOCSIZE_SHIFT;
                }
                break;
        }
        return perf_ioctl(file, cmd, arg);
}
#else
# define perf_compat_ioctl NULL
#endif

/*
 * Enable every event on current->perf_event_list, together with each
 * event's children. The list is walked under current->perf_event_mutex and
 * each event is handled with its context locked. Always returns 0.
 */
int perf_event_task_enable(void)
{
        struct perf_event *owned;

        mutex_lock(&current->perf_event_mutex);

        list_for_each_entry(owned, &current->perf_event_list, owner_entry) {
                struct perf_event_context *locked_ctx;

                locked_ctx = perf_event_ctx_lock(owned);
                perf_event_for_each_child(owned, _perf_event_enable);
                perf_event_ctx_unlock(owned, locked_ctx);
        }

        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Disable every event on current->perf_event_list, together with each
 * event's children. The list is walked under current->perf_event_mutex and
 * each event is handled with its context locked. Always returns 0.
 */
int perf_event_task_disable(void)
{
        struct perf_event *owned;

        mutex_lock(&current->perf_event_mutex);

        list_for_each_entry(owned, &current->perf_event_list, owner_entry) {
                struct perf_event_context *locked_ctx;

                locked_ctx = perf_event_ctx_lock(owned);
                perf_event_for_each_child(owned, _perf_event_disable);
                perf_event_ctx_unlock(owned, locked_ctx);
        }

        mutex_unlock(&current->perf_event_mutex);

        return 0;
}

/*
 * Return the PMU's event_idx() for @event, or 0 when the event is marked
 * PERF_HES_STOPPED or is not in PERF_EVENT_STATE_ACTIVE.
 */
static int perf_event_index(struct perf_event *event)
{
        if ((event->hw.state & PERF_HES_STOPPED) ||
            event->state != PERF_EVENT_STATE_ACTIVE)
                return 0;

        return event->pmu->event_idx(event);
}

/*
 * Fill in the static fields of the user page of @event's buffer: the
 * cap_bit0_is_deprecated flag, the header size, and the data area's offset
 * and size. Does nothing when the event has no buffer. The buffer pointer
 * is looked up under rcu_read_lock().
 */
static void perf_event_init_userpage(struct perf_event *event)
{
        struct perf_buffer *buffer;

        rcu_read_lock();

        buffer = rcu_dereference(event->rb);
        if (buffer) {
                struct perf_event_mmap_page *page = buffer->user_page;

                /* Allow new userspace to detect that bit 0 is deprecated */
                page->cap_bit0_is_deprecated = 1;
                page->size = offsetof(struct perf_event_mmap_page, __reserved);
                page->data_offset = PAGE_SIZE;
                page->data_size = perf_data_size(buffer);
        }

        rcu_read_unlock();
}

/*
 * Weak default that does nothing; being __weak, a non-weak definition of
 * the same name elsewhere replaces it at link time.
 */
void __weak arch_perf_update_userpage(
        struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
{
}

/*
 * Publish @event's current index, count offset and enabled/running times
 * to the user page of its buffer. Does nothing when the event has no
 * buffer.
 *
 * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
 * code calls this from NMI context.
 */
void perf_event_update_userpage(struct perf_event *event)
{
        struct perf_event_mmap_page *userpg;
        struct perf_buffer *rb;
        u64 enabled, running, now;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (!rb)
                goto unlock;

        /*
         * compute total_time_enabled, total_time_running
         * based on snapshot values taken when the event
         * was last scheduled in.
         *
         * we cannot simply call update_context_time()
         * because of locking issues, as we can be called in
         * NMI context
         */
        calc_timer_values(event, &now, &enabled, &running);

        userpg = rb->user_page;
        /*
         * Disable preemption to guarantee consistent time stamps are stored to
         * the user page.
         */
        preempt_disable();
        /*
         * userpg->lock is incremented once before and once after the update,
         * with compiler barriers separating the increments from the stores.
         */
        ++userpg->lock;
        barrier();
        userpg->index = perf_event_index(event);
        userpg->offset = perf_event_count(event, false);
        /* With a non-zero index, hw.prev_count is subtracted from the offset. */
        if (userpg->index)
                userpg->offset -= local64_read(&event->hw.prev_count);

        userpg->time_enabled = enabled +
                        atomic64_read(&event->child_total_time_enabled);

        userpg->time_running = running +
                        atomic64_read(&event->child_total_time_running);

        arch_perf_update_userpage(event, userpg, now);

        barrier();
        ++userpg->lock;
        preempt_enable();
unlock:
        rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);

/*
 * Switch @event from its current buffer (if any) to @rb, which may be NULL
 * to just detach.
 *
 * Sequence: unlink the event from the old buffer's event_list and record
 * the RCU state; if a new buffer is given, wait for that recorded state
 * (when one is pending) and link the event into the new buffer's
 * event_list; stop the event if it has AUX; publish the new pointer; and
 * finally drop the old buffer reference and wake the event's waiters.
 *
 * Warns once if called on an event that has a parent.
 */
static void ring_buffer_attach(struct perf_event *event,
                               struct perf_buffer *rb)
{
        struct perf_buffer *old_rb = NULL;
        unsigned long flags;

        WARN_ON_ONCE(event->parent);

        if (event->rb) {
                /*
                 * Should be impossible, we set this when removing
                 * event->rb_entry and wait/clear when adding event->rb_entry.
                 */
                WARN_ON_ONCE(event->rcu_pending);

                old_rb = event->rb;
                spin_lock_irqsave(&old_rb->event_lock, flags);
                list_del_rcu(&event->rb_entry);
                spin_unlock_irqrestore(&old_rb->event_lock, flags);

                /* Remember the RCU state so a later attach can wait for it. */
                event->rcu_batches = get_state_synchronize_rcu();
                event->rcu_pending = 1;
        }

        if (rb) {
                /* rb_entry was removed above or earlier; wait before reusing it. */
                if (event->rcu_pending) {
                        cond_synchronize_rcu(event->rcu_batches);
                        event->rcu_pending = 0;
                }

                spin_lock_irqsave(&rb->event_lock, flags);
                list_add_rcu(&event->rb_entry, &rb->event_list);
                spin_unlock_irqrestore(&rb->event_lock, flags);
        }

        /*
         * Avoid racing with perf_mmap_close(AUX): stop the event
         * before swizzling the event::rb pointer; if it's getting
         * unmapped, its aux_mmap_count will be 0 and it won't
         * restart. See the comment in __perf_pmu_output_stop().
         *
         * Data will inevitably be lost when set_output is done in
         * mid-air, but then again, whoever does it like this is
         * not in for the data anyway.
         */
        if (has_aux(event))
                perf_event_stop(event, 0);

        rcu_assign_pointer(event->rb, rb);

        if (old_rb) {
                ring_buffer_put(old_rb);
                /*
                 * Since we detached before setting the new rb, so that we
                 * could attach the new rb, we could have missed a wakeup.
                 * Provide it now.
                 */
                wake_up_all(&event->waitq);
        }
}

static void ring_buffer_wakeup(struct perf_event *event)
{
        struct perf_buffer *rb;

        if (event->parent)
                event = event->parent;

        rcu_read_lock();
        rb = rcu_dereference(event->rb);
        if (rb) {
                list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
                        wake_up_all(&event->waitq);
        }
        rcu_read_unlock();
}

/*
 * Take a reference on the buffer used by @event (the parent's buffer when
 * the event has a parent).
 *
 * Returns the buffer with its refcount raised, or NULL when there is no
 * buffer or its refcount had already dropped to zero. Pair with
 * ring_buffer_put().
 */
struct perf_buffer *ring_buffer_get(struct perf_event *event)
{
        struct perf_buffer *buffer;

        if (event->parent)
                event = event->parent;

        rcu_read_lock();

        buffer = rcu_dereference(event->rb);
        if (buffer && !refcount_inc_not_zero(&buffer->refcount))
                buffer = NULL;

        rcu_read_unlock();

        return buffer;
}

/*
 * Drop one reference on @rb. When the last reference goes away, warn once
 * if events are still on rb->event_list and queue rb_free_rcu() through
 * call_rcu().
 */
void ring_buffer_put(struct perf_buffer *rb)
{
        if (refcount_dec_and_test(&rb->refcount)) {
                WARN_ON_ONCE(!list_empty(&rb->event_list));

                call_rcu(&rb->rcu_head, rb_free_rcu);
        }
}

static void perf_mmap_open(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;

        atomic_inc(&event->mmap_count);
        atomic_inc(&event->rb->mmap_count);

        if (vma->vm_pgoff)
                atomic_inc(&event->rb->aux_mmap_count);

        if (event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);
}

static void perf_pmu_output_stop(struct perf_event *event);

/*
 * A buffer can be mmap()ed multiple times; either directly through the same
 * event, or through other events by use of perf_event_set_output().
 *
 * In order to undo the VM accounting done by perf_mmap() we need to destroy
 * the buffer here, where we still have a VM context. This means we need
 * to detach all events redirecting to us.
 *
 * In order, this function: notifies the PMU via event_unmapped(), tears
 * down the AUX area when this was its last mapping, drops the buffer and
 * event mmap counts, detaches this event from the buffer, and - only when
 * no other mapping of the buffer remains - detaches every other event on
 * the buffer's event_list and undoes the locked/pinned accounting.
 */
static void perf_mmap_close(struct vm_area_struct *vma)
{
        struct perf_event *event = vma->vm_file->private_data;
        /*
         * NOTE(review): ring_buffer_get() can return NULL, and rb is
         * dereferenced right below without a check - presumably the mapping
         * being closed guarantees a live buffer here; confirm.
         */
        struct perf_buffer *rb = ring_buffer_get(event);
        /* Sampled up front; used for the accounting undo at the end. */
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
        bool detach_rest = false;

        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);

        /*
         * The AUX buffer is strictly a sub-buffer, serialize using aux_mutex
         * to avoid complications.
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
                /*
                 * Stop all AUX events that are writing to this buffer,
                 * so that we can free its AUX pages and corresponding PMU
                 * data. Note that after rb::aux_mmap_count dropped to zero,
                 * they won't start any more (see perf_aux_output_begin()).
                 */
                perf_pmu_output_stop(event);

                /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages - rb->aux_mmap_locked, &mmap_user->locked_vm);
                atomic64_sub(rb->aux_mmap_locked, &vma->vm_mm->pinned_vm);

                /* this has to be the last one */
                rb_free_aux(rb);
                WARN_ON_ONCE(refcount_read(&rb->aux_refcount));

                mutex_unlock(&rb->aux_mutex);
        }

        /* Last mapping of the buffer itself: remember to detach everyone. */
        if (atomic_dec_and_test(&rb->mmap_count))
                detach_rest = true;

        /* Other mappings through this event remain: only drop our rb reference. */
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
                goto out_put;

        ring_buffer_attach(event, NULL);
        mutex_unlock(&event->mmap_mutex);

        /* If there's still other mmap()s of this buffer, we're done. */
        if (!detach_rest)
                goto out_put;

        /*
         * No other mmap()s, detach from all other events that might redirect
         * into the now unreachable buffer. Somewhat complicated by the
         * fact that rb::event_lock otherwise nests inside mmap_mutex.
         */
again:
        rcu_read_lock();
        list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
                if (!atomic_long_inc_not_zero(&event->refcount)) {
                        /*
                         * This event is en-route to free_event() which will
                         * detach it and remove it from the list.
                         */
                        continue;
                }
                rcu_read_unlock();

                mutex_lock(&event->mmap_mutex);
                /*
                 * Check we didn't race with perf_event_set_output() which can
                 * swizzle the rb from under us while we were waiting to
                 * acquire mmap_mutex.
                 *
                 * If we find a different rb; ignore this event, a next
                 * iteration will no longer find it on the list. We have to
                 * still restart the iteration to make sure we're not now
                 * iterating the wrong list.
                 */
                if (event->rb == rb)
                        ring_buffer_attach(event, NULL);

                mutex_unlock(&event->mmap_mutex);
                put_event(event);

                /*
                 * Restart the iteration; either we're on the wrong list or
                 * destroyed its integrity by doing a deletion.
                 */
                goto again;
        }
        rcu_read_unlock();

        /*
         * It could be there's still a few 0-ref events on the list; they'll
         * get cleaned up by free_event() -- they'll also still have their
         * ref on the rb and will free it whenever they are done with it.
         *
         * Aside from that, this buffer is 'fully' detached and unmapped,
         * undo the VM accounting.
         */

        atomic_long_sub((size >> PAGE_SHIFT) + 1 - mmap_locked,
                        &mmap_user->locked_vm);
        atomic64_sub(mmap_locked, &vma->vm_mm->pinned_vm);
        free_uid(mmap_user);

out_put:
        ring_buffer_put(rb); /* could be last */
}

/*
 * Write-fault hook for the perf mapping: a write to page offset 0 is
 * allowed (returns 0), a write to any other page gets VM_FAULT_SIGBUS.
 */
static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
{
        /* The first page is the user control page, others are read-only. */
        if (vmf->pgoff != 0)
                return VM_FAULT_SIGBUS;

        return 0;
}

/*
 * VMA operations for perf mappings: open/close maintain the mapping
 * counts, pfn_mkwrite decides which pages may be written.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
        .open                = perf_mmap_open,
        .close                = perf_mmap_close, /* non mergeable */
        .pfn_mkwrite        = perf_mmap_pfn_mkwrite,
};

/*
 * Map the pages of @rb that back @vma into the VMA, one page at a time,
 * using remap_pfn_range().
 *
 * Returns 0 on success, -EINVAL when perf_mmap_to_page() has no page for
 * an offset, or the error from remap_pfn_range(). With CONFIG_MMU, any
 * partially established mapping is zapped again on error.
 */
static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
{
        unsigned long nr_pages = vma_pages(vma);
        int err = 0;
        unsigned long pagenum;

        /*
         * We map this as a VM_PFNMAP VMA.
         *
         * This is not ideal as this is designed broadly for mappings of PFNs
         * referencing memory-mapped I/O ranges or non-system RAM i.e. for which
         * !pfn_valid(pfn).
         *
         * We are mapping kernel-allocated memory (memory we manage ourselves)
         * which would more ideally be mapped using vm_insert_page() or a
         * similar mechanism, that is as a VM_MIXEDMAP mapping.
         *
         * However this won't work here, because:
         *
         * 1. It uses vma->vm_page_prot, but this field has not been completely
         *    setup at the point of the f_op->mmap() hook, so we are unable to
         *    indicate that this should be mapped CoW in order that the
         *    mkwrite() hook can be invoked to make the first page R/W and the
         *    rest R/O as desired.
         *
         * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
         *    vm_normal_page() returning a struct page * pointer, which means
         *    vm_ops->page_mkwrite() will be invoked rather than
         *    vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
         *    to work around retry logic in the fault handler, however this
         *    field is no longer allowed to be used within struct page.
         *
         * 3. Having a struct page * made available in the fault logic also
         *    means that the page gets put on the rmap and becomes
         *    inappropriately accessible and subject to map and ref counting.
         *
         * Ideally we would have a mechanism that could explicitly express our
         * desires, but this is not currently the case, so we instead use
         * VM_PFNMAP.
         *
         * We manage the lifetime of these mappings with internal refcounts (see
         * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
         * this mapping is maintained correctly.
         */
        for (pagenum = 0; pagenum < nr_pages; pagenum++) {
                unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
                struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);

                if (page == NULL) {
                        err = -EINVAL;
                        break;
                }

                /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
                err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
                                      vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
                if (err)
                        break;
        }

#ifdef CONFIG_MMU
        /* Clear any partial mappings on error. */
        if (err)
                zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
#endif

        return err;
}

/*
 * mmap() file operation for a perf event.
 *
 * A mapping at page offset 0 maps the regular ring buffer: one leading page
 * plus either zero or a power-of-two number of data pages. If the event
 * already has a buffer of the same size it is reused, otherwise a new one is
 * allocated and attached. A mapping at any other offset maps the AUX area and
 * must match the aux_offset/aux_size advertised in the user page, sit above
 * the regular buffer and be a power-of-two number of pages.
 *
 * Newly allocated pages are charged against user->locked_vm up to the
 * perf_event_mlock sysctl limit (scaled by the number of online CPUs); the
 * remainder is charged against mm->pinned_vm and checked against
 * RLIMIT_MEMLOCK.
 *
 * Returns 0 on success or a negative errno. Nothing is accounted and
 * rb->mmap_count is dropped again when an error is detected before the
 * pages are mapped.
 */
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
        struct perf_event *event = file->private_data;
        unsigned long user_locked, user_lock_limit;
        struct user_struct *user = current_user();
        struct mutex *aux_mutex = NULL;
        struct perf_buffer *rb = NULL;
        unsigned long locked, lock_limit;
        unsigned long vma_size;
        unsigned long nr_pages;
        long user_extra = 0, extra = 0;
        int ret, flags = 0;

        /*
         * Don't allow mmap() of inherited per-task counters. This would
         * create a performance issue due to all children writing to the
         * same rb.
         */
        if (event->cpu == -1 && event->attr.inherit)
                return -EINVAL;

        if (!(vma->vm_flags & VM_SHARED))
                return -EINVAL;

        ret = security_perf_event_read(event);
        if (ret)
                return ret;

        vma_size = vma->vm_end - vma->vm_start;
        nr_pages = vma_size / PAGE_SIZE;

        if (nr_pages > INT_MAX)
                return -ENOMEM;

        if (vma_size != PAGE_SIZE * nr_pages)
                return -EINVAL;

        user_extra = nr_pages;

        mutex_lock(&event->mmap_mutex);
        /* Default result for every validation failure below. */
        ret = -EINVAL;

        if (vma->vm_pgoff == 0) {
                /* The first page of the mapping is not a data page. */
                nr_pages -= 1;

                /*
                 * If we have rb pages ensure they're a power-of-two number, so we
                 * can do bitmasks instead of modulo.
                 */
                if (nr_pages != 0 && !is_power_of_2(nr_pages))
                        goto unlock;

                WARN_ON_ONCE(event->ctx->parent_ctx);

                if (event->rb) {
                        if (data_page_nr(event->rb) != nr_pages)
                                goto unlock;

                        if (atomic_inc_not_zero(&event->rb->mmap_count)) {
                                /*
                                 * Success -- managed to mmap() the same buffer
                                 * multiple times.
                                 */
                                ret = 0;
                                /* We need the rb to map pages. */
                                rb = event->rb;
                                goto unlock;
                        }

                        /*
                         * Raced against perf_mmap_close()'s
                         * atomic_dec_and_mutex_lock(): detach the stale
                         * buffer from the event and continue as if
                         * !event->rb.
                         */
                        ring_buffer_attach(event, NULL);
                }

        } else {
                /*
                 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
                 * mapped, all subsequent mappings should have the same size
                 * and offset. Must be above the normal perf buffer.
                 */
                u64 aux_offset, aux_size;

                rb = event->rb;
                if (!rb)
                        goto aux_unlock;

                aux_mutex = &rb->aux_mutex;
                mutex_lock(aux_mutex);

                aux_offset = READ_ONCE(rb->user_page->aux_offset);
                aux_size = READ_ONCE(rb->user_page->aux_size);

                if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
                        goto aux_unlock;

                if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
                        goto aux_unlock;

                /* already mapped with a different offset */
                if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
                        goto aux_unlock;

                if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
                        goto aux_unlock;

                /* already mapped with a different size */
                if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
                        goto aux_unlock;

                if (!is_power_of_2(nr_pages))
                        goto aux_unlock;

                /*
                 * From here on every failure must go through "unlock" so
                 * that this rb->mmap_count reference is dropped again.
                 */
                if (!atomic_inc_not_zero(&rb->mmap_count))
                        goto aux_unlock;

                if (rb_has_aux(rb)) {
                        atomic_inc(&rb->aux_mmap_count);
                        ret = 0;
                        goto unlock;
                }

                /*
                 * First AUX mapping: rb->aux_mmap_count is only set once
                 * the AUX buffer has actually been allocated below, so a
                 * failed attempt does not leave a stale count behind.
                 */
        }

        user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

        /*
         * Increase the limit linearly with more CPUs:
         */
        user_lock_limit *= num_online_cpus();

        user_locked = atomic_long_read(&user->locked_vm);

        /*
         * sysctl_perf_event_mlock may have changed, so that
         *     user->locked_vm > user_lock_limit
         */
        if (user_locked > user_lock_limit)
                user_locked = user_lock_limit;
        user_locked += user_extra;

        if (user_locked > user_lock_limit) {
                /*
                 * charge locked_vm until it hits user_lock_limit;
                 * charge the rest from pinned_vm
                 */
                extra = user_locked - user_lock_limit;
                user_extra -= extra;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK);
        lock_limit >>= PAGE_SHIFT;
        locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;

        if ((locked > lock_limit) && perf_is_paranoid() &&
                !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
                goto unlock;
        }

        WARN_ON(!rb && event->rb);

        if (vma->vm_flags & VM_WRITE)
                flags |= RING_BUFFER_WRITABLE;

        if (!rb) {
                rb = rb_alloc(nr_pages,
                              event->attr.watermark ? event->attr.wakeup_watermark : 0,
                              event->cpu, flags);

                if (!rb) {
                        ret = -ENOMEM;
                        goto unlock;
                }

                atomic_set(&rb->mmap_count, 1);
                rb->mmap_user = get_current_user();
                rb->mmap_locked = extra;

                ring_buffer_attach(event, rb);

                perf_event_update_time(event);
                perf_event_init_userpage(event);
                perf_event_update_userpage(event);

                /*
                 * Only the data-buffer path reports success here. The AUX
                 * path below must keep rb_alloc_aux()'s return value.
                 */
                ret = 0;
        } else {
                ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
                                   event->attr.aux_watermark, flags);
                if (!ret) {
                        atomic_set(&rb->aux_mmap_count, 1);
                        rb->aux_mmap_locked = extra;
                }
                /*
                 * On failure ret stays non-zero: nothing is accounted and
                 * the rb->mmap_count reference taken above is dropped at
                 * "unlock".
                 */
        }

unlock:
        if (!ret) {
                atomic_long_add(user_extra, &user->locked_vm);
                atomic64_add(extra, &vma->vm_mm->pinned_vm);

                atomic_inc(&event->mmap_count);
        } else if (rb) {
                atomic_dec(&rb->mmap_count);
        }
aux_unlock:
        if (aux_mutex)
                mutex_unlock(aux_mutex);
        mutex_unlock(&event->mmap_mutex);

        /*
         * Since pinned accounting is per vm we cannot allow fork() to copy our
         * vma.
         */
        vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
        vma->vm_ops = &perf_mmap_vmops;

        if (!ret)
                ret = map_range(rb, vma);

        if (!ret && event->pmu->event_mapped)
                event->pmu->event_mapped(event, vma->vm_mm);

        return ret;
}

/*
 * fasync() file operation: forward the request to fasync_helper() for the
 * event's fasync list while holding the inode lock.
 *
 * Returns the negative error from fasync_helper(), or 0; any non-negative
 * result is reported as plain success.
 */
static int perf_fasync(int fd, struct file *filp, int on)
{
        struct perf_event *event = filp->private_data;
        struct inode *inode = file_inode(filp);
        int err;

        inode_lock(inode);
        err = fasync_helper(fd, filp, on, &event->fasync);
        inode_unlock(inode);

        return err < 0 ? err : 0;
}

/*
 * File operations table wiring the perf_*() handlers (release, read, poll,
 * ioctl, mmap, fasync) into a struct file_operations.
 */
static const struct file_operations perf_fops = {
        .release                = perf_release,
        .read                        = perf_read,
        .poll                        = perf_poll,
        .unlocked_ioctl                = perf_ioctl,
        .compat_ioctl                = perf_compat_ioctl,
        .mmap                        = perf_mmap,
        .fasync                        = perf_fasync,
};

/*
 * Perf event wakeup
 *
 * If there's data, ensure we set the poll() state and publish everything
 * to user-space before waking everybody up.
 */

void perf_event_wakeup(struct perf_event *event)
{
        ring_buffer_wakeup(event);

        /* No pending SIGIO notification for this event: nothing more to do. */
        if (!event->pending_kill)
                return;

        kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
        event->pending_kill = 0;
}

/*
 * Send the perf signal for @event to the current task, unless the event
 * does not belong to current or current is already exiting.
 */
static void perf_sigtrap(struct perf_event *event)
{
        /*
         * A mismatch is only expected when the irq_work got delayed and
         * either ctx->task or current changed in between, which can happen
         * on architectures without arch_irq_work_raise().
         */
        if (WARN_ON_ONCE(event->ctx->task != current))
                return;

        /*
         * perf_pending_task() and perf_pending_irq() can both race with the
         * task exiting; only signal a task that is still alive.
         */
        if (!(current->flags & PF_EXITING)) {
                send_sig_perf((void __user *)event->pending_addr,
                              event->orig_type, event->attr.sig_data);
        }
}

/*
 * Deliver the pending work in-event-context or follow the context.
 */
static void __perf_pending_disable(struct perf_event *event)
{
        int target_cpu = READ_ONCE(event->oncpu);

        /*
         * Event is not running anywhere: event_sched_out() has already
         * dealt with it.
         */
        if (target_cpu < 0)
                return;

        if (target_cpu != smp_processor_id()) {
                /*
                 *  CPU-A                        CPU-B
                 *
                 *  perf_event_disable_inatomic()
                 *    @pending_disable = CPU-A;
                 *    irq_work_queue();
                 *
                 *  sched-out
                 *    @pending_disable = -1;
                 *
                 *                                sched-in
                 *                                perf_event_disable_inatomic()
                 *                                  @pending_disable = CPU-B;
                 *                                  irq_work_queue(); // FAILS
                 *
                 *  irq_work_run()
                 *    perf_pending_disable()
                 *
                 * But the event runs on CPU-B and wants disabling there.
                 */
                irq_work_queue_on(&event->pending_disable_irq, target_cpu);
                return;
        }

        /* We are on the CPU the event runs on: disable it right here. */
        if (!event->pending_disable)
                return;

        event->pending_disable = 0;
        perf_event_disable_local(event);
}

/*
 * irq_work callback for event->pending_disable_irq: run
 * __perf_pending_disable() inside a software-event recursion context.
 */
static void perf_pending_disable(struct irq_work *entry)
{
        struct perf_event *event;
        int recursion_ctx;

        event = container_of(entry, struct perf_event, pending_disable_irq);

        /*
         * Failing to get a context is fine: recursion is then already
         * disabled and we cannot recurse any deeper.
         */
        recursion_ctx = perf_swevent_get_recursion_context();

        __perf_pending_disable(event);

        if (recursion_ctx < 0)
                return;

        perf_swevent_put_recursion_context(recursion_ctx);
}

/*
 * irq_work callback for event->pending_irq: deliver a pending wakeup
 * inside a software-event recursion context.
 */
static void perf_pending_irq(struct irq_work *entry)
{
        struct perf_event *event;
        int recursion_ctx;

        event = container_of(entry, struct perf_event, pending_irq);

        /*
         * Failing to get a context is fine: recursion is then already
         * disabled and we cannot recurse any deeper.
         */
        recursion_ctx = perf_swevent_get_recursion_context();

        /*
         * Wakeups are not tied to the event's context; they may be
         * delivered wherever the event currently is.
         */
        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }

        if (recursion_ctx < 0)
                return;

        perf_swevent_put_recursion_context(recursion_ctx);
}

/*
 * Task-work callback for event->pending_task: deliver a pending sigtrap
 * and drop the event reference with put_event().
 */
static void perf_pending_task(struct callback_head *head)
{
        struct perf_event *event;
        int recursion_ctx;

        event = container_of(head, struct perf_event, pending_task);

        /*
         * Failing to get a context is fine: recursion is then already
         * disabled and we cannot recurse any deeper.
         */
        recursion_ctx = perf_swevent_get_recursion_context();

        if (event->pending_work) {
                event->pending_work = 0;
                perf_sigtrap(event);
                local_dec(&event->ctx->nr_no_switch_fast);
        }

        /* Dropped unconditionally, whether or not work was pending. */
        put_event(event);

        if (recursion_ctx < 0)
                return;

        perf_swevent_put_recursion_context(recursion_ctx);
}

#ifdef CONFIG_GUEST_PERF_EVENTS
/*
 * Currently registered guest info callbacks (RCU-managed pointer), set by
 * perf_register_guest_info_callbacks() and cleared again on unregister.
 */
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;

/*
 * Static calls for the individual guest callbacks. They are pointed at the
 * registered callbacks on register and reset to __static_call_return0 on
 * unregister.
 */
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);

/*
 * Install @cbs as the guest info callbacks.
 *
 * Only one set may be registered at a time: if perf_guest_cbs is already
 * set this warns once and leaves the existing registration untouched.
 */
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
                return;

        /* Publish the pointer, then retarget the static calls at @cbs. */
        rcu_assign_pointer(perf_guest_cbs, cbs);
        static_call_update(__perf_guest_state, cbs->state);
        static_call_update(__perf_guest_get_ip, cbs->get_ip);

        /* Implementing ->handle_intel_pt_intr is optional. */
        if (cbs->handle_intel_pt_intr)
                static_call_update(__perf_guest_handle_intel_pt_intr,
                                   cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);

/*
 * Remove @cbs as the guest info callbacks.
 *
 * Warns once and does nothing if @cbs is not the currently registered set.
 * Otherwise the pointer is cleared, all three static calls are reset to
 * __static_call_return0, and the function waits for an RCU grace period
 * before returning (presumably so no reader still uses @cbs afterwards).
 */
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
        if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
                return;

        rcu_assign_pointer(perf_guest_cbs, NULL);
        static_call_update(__perf_guest_state, (void *)&__static_call_return0);
        static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
        static_call_update(__perf_guest_handle_intel_pt_intr,
                           (void *)&__static_call_return0);
        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif

static bool should_sample_guest(struct perf_event *event)
{
        return !event->attr.exclude_guest && perf_guest_state();
}

/*
 * Return the misc flags for a sample of @event: the guest variant when the
 * sample should be attributed to a guest, the regular arch variant otherwise.
 */
unsigned long perf_misc_flags(struct perf_event *event,
                              struct pt_regs *regs)
{
        unsigned long misc;

        if (should_sample_guest(event))
                misc = perf_arch_guest_misc_flags(regs);
        else
                misc = perf_arch_misc_flags(regs);

        return misc;
}

/*
 * Return the instruction pointer for a sample of @event: the guest IP when
 * the sample should be attributed to a guest, otherwise the one derived
 * from @regs.
 */
unsigned long perf_instruction_pointer(struct perf_event *event,
                                       struct pt_regs *regs)
{
        unsigned long ip;

        if (should_sample_guest(event))
                ip = perf_guest_get_ip();
        else
                ip = perf_arch_instruction_pointer(regs);

        return ip;
}

/*
 * Emit one u64 per bit set in @mask, in ascending bit order, where each
 * value is perf_reg_value() of @regs for that bit index.
 */
static void
perf_output_sample_regs(struct perf_output_handle *handle,
                        struct pt_regs *regs, u64 mask)
{
        DECLARE_BITMAP(wanted, 64);
        int idx;

        bitmap_from_u64(wanted, mask);

        for_each_set_bit(idx, wanted, sizeof(mask) * BITS_PER_BYTE) {
                u64 reg = perf_reg_value(regs, idx);

                perf_output_put(handle, reg);
        }
}

/*
 * Fill @regs_user with the user-space register set to sample:
 *  - @regs itself when it already describes user mode,
 *  - nothing (ABI_NONE / NULL) for kernel threads,
 *  - whatever perf_get_regs_user() provides otherwise.
 */
static void perf_sample_regs_user(struct perf_regs *regs_user,
                                  struct pt_regs *regs)
{
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
                return;
        }

        if (current->flags & PF_KTHREAD) {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
                regs_user->regs = NULL;
                return;
        }

        perf_get_regs_user(regs_user, regs);
}

/* Record @regs and the current task's register ABI in @regs_intr. */
static void perf_sample_regs_intr(struct perf_regs *regs_intr,
                                  struct pt_regs *regs)
{
        regs_intr->abi = perf_reg_abi(current);
        regs_intr->regs = regs;
}


/*
 * Get remaining task size from user stack pointer.
 *
 * It'd be better to take stack vma map and limit this more
 * precisely, but there's no way to get it safely under interrupt,
 * so using TASK_SIZE as limit.
 */
static u64 perf_ustack_task_size(struct pt_regs *regs)
{
        unsigned long sp = perf_user_stack_pointer(regs);

        /* Only a non-zero stack pointer below TASK_SIZE leaves room to dump. */
        if (sp && sp < TASK_SIZE)
                return TASK_SIZE - sp;

        return 0;
}

/*
 * Compute how many bytes of user stack can be dumped into a sample.
 *
 * @stack_size:  size requested by the user
 * @header_size: sample size accumulated so far
 * @regs:        user registers, or NULL when there is nothing to dump
 *
 * Returns 0 without @regs, otherwise @stack_size clamped to both the room
 * left below TASK_SIZE and the room left in a 16-bit sample size.
 */
static u16
perf_sample_ustack_size(u16 stack_size, u16 header_size,
                        struct pt_regs *regs)
{
        u64 room;

        /* No regs, no stack pointer, no dump. */
        if (!regs)
                return 0;

        /* First limit: never read past TASK_SIZE. */
        room = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
        stack_size = min(stack_size, (u16) room);

        /* Account for the static and dynamic size fields around the dump. */
        header_size += 2 * sizeof(u64);

        /* Second limit: header plus dump must not wrap the u16 sample size. */
        if ((u16) (header_size + stack_size) >= header_size)
                return stack_size;

        /* It would wrap: shrink the dump to what is left in the sample. */
        stack_size = USHRT_MAX - header_size - sizeof(u64);
        stack_size = round_up(stack_size, sizeof(u64));

        return stack_size;
}

/*
 * Write the user stack dump for a sample.
 *
 * Without @regs only a zero size is written. Otherwise the layout is:
 *   static size  - @dump_size, the requested (or best fitting) size
 *   data         - @dump_size bytes, copied from the user stack pointer
 *   dynamic size - number of bytes that were actually copied
 */
static void
perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
                          struct pt_regs *regs)
{
        unsigned long sp;
        unsigned int rem;
        u64 dyn_size;

        /* Case of a kernel thread, nothing to dump */
        if (!regs) {
                u64 zero = 0;

                perf_output_put(handle, zero);
                return;
        }

        /* Static size. */
        perf_output_put(handle, dump_size);

        /* Data; @rem is the part that could not be copied. */
        sp = perf_user_stack_pointer(regs);
        rem = __output_copy_user(handle, (void *) sp, dump_size);
        dyn_size = dump_size - rem;

        /* Skip over the uncopied remainder so the record keeps its size. */
        perf_output_skip(handle, rem);

        /* Dynamic size. */
        perf_output_put(handle, dyn_size);
}

/*
 * Decide how many bytes of AUX data to attach to a sample of @event and
 * store the result in data->aux_size.
 *
 * Returns that size: 0 when there is no usable AUX sampler event or ring
 * buffer, or when AUX sampling is already in progress; otherwise @size
 * capped to the AUX buffer size and aligned up to a u64 boundary.
 */
static unsigned long perf_prepare_sample_aux(struct perf_event *event,
                                          struct perf_sample_data *data,
                                          size_t size)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;

        data->aux_size = 0;

        if (!sampler)
                return data->aux_size;

        if (WARN_ON_ONCE(READ_ONCE(sampler->state) != PERF_EVENT_STATE_ACTIVE))
                return data->aux_size;

        if (WARN_ON_ONCE(READ_ONCE(sampler->oncpu) != smp_processor_id()))
                return data->aux_size;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return data->aux_size;

        /*
         * If this is an NMI hit inside sampling code, don't take
         * the sample. See also perf_aux_sample_output().
         */
        if (READ_ONCE(rb->aux_in_sampling)) {
                data->aux_size = 0;
        } else {
                size_t capped = min_t(size_t, size, perf_aux_size(rb));

                data->aux_size = ALIGN(capped, sizeof(u64));
        }

        ring_buffer_put(rb);

        return data->aux_size;
}

/*
 * Call the PMU's ->snapshot_aux() for @event with interrupts disabled and
 * rb->aux_in_sampling set, and return its result.
 */
static long perf_pmu_snapshot_aux(struct perf_buffer *rb,
                                 struct perf_event *event,
                                 struct perf_output_handle *handle,
                                 unsigned long size)
{
        unsigned long irq_flags;
        long copied;

        /*
         * The regular ->start()/->stop() callbacks run in IRQ mode from the
         * scheduler paths. Calling them from NMI context could race with
         * those, e.g. by re-starting an event that was just stopped, hence
         * the dedicated callback that leaves the event state alone.
         *
         * IRQs need to be disabled to prevent IPIs from racing with us.
         */
        local_irq_save(irq_flags);

        /*
         * Flag the critical section so that an NMI hitting inside it backs
         * off; see also perf_prepare_sample_aux().
         */
        WRITE_ONCE(rb->aux_in_sampling, 1);
        barrier();

        copied = event->pmu->snapshot_aux(event, handle, size);

        barrier();
        WRITE_ONCE(rb->aux_in_sampling, 0);

        local_irq_restore(irq_flags);

        return copied;
}

/*
 * Write the AUX snapshot of a sample: data->aux_size bytes taken from the
 * AUX sampler event's buffer, zero-padded up to data->aux_size.
 */
static void perf_aux_sample_output(struct perf_event *event,
                                   struct perf_output_handle *handle,
                                   struct perf_sample_data *data)
{
        struct perf_event *sampler = event->aux_event;
        struct perf_buffer *rb;
        unsigned long pad;
        long size;

        if (WARN_ON_ONCE(!sampler || !data->aux_size))
                return;

        rb = ring_buffer_get(sampler);
        if (!rb)
                return;

        size = perf_pmu_snapshot_aux(rb, sampler, handle, data->aux_size);

        /*
         * A negative size means perf_output_copy() failed (returned a
         * non-zero surplus that it didn't copy), which its current
         * implementation cannot do. If that ever changes, we'd like to
         * know, and we skip the padding.
         */
        if (!WARN_ON_ONCE(size < 0)) {
                /*
                 * The pad comes from ALIGN()ing data->aux_size up to u64 in
                 * perf_prepare_sample_aux(), so should not be more than that.
                 */
                pad = data->aux_size - size;
                if (WARN_ON_ONCE(pad >= sizeof(u64)))
                        pad = 8;

                if (pad) {
                        u64 zero = 0;

                        perf_output_copy(handle, &zero, pad);
                }
        }

        ring_buffer_put(rb);
}

/*
 * A set of common sample data types saved even for non-sample records
 * when event->attr.sample_id_all is set.
 */
#define PERF_SAMPLE_ID_ALL  (PERF_SAMPLE_TID | PERF_SAMPLE_TIME |        \
                             PERF_SAMPLE_ID | PERF_SAMPLE_STREAM_ID |        \
                             PERF_SAMPLE_CPU | PERF_SAMPLE_IDENTIFIER)

/*
 * Fill the identification fields of @data selected by @sample_type
 * (tid, time, id, stream id, cpu) for @event, and record the event's
 * sample type in @data.
 */
static void __perf_event_header__init_id(struct perf_sample_data *data,
                                         struct perf_event *event,
                                         u64 sample_type)
{
        const bool want_id =
                sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER);

        data->type = event->attr.sample_type;
        data->sample_flags |= data->type & PERF_SAMPLE_ID_ALL;

        if (sample_type & PERF_SAMPLE_TID) {
                /* namespace issues */
                data->tid_entry.pid = perf_event_pid(event, current);
                data->tid_entry.tid = perf_event_tid(event, current);
        }

        if (sample_type & PERF_SAMPLE_TIME)
                data->time = perf_event_clock(event);

        if (want_id)
                data->id = primary_event_id(event);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                data->stream_id = event->id;

        if (sample_type & PERF_SAMPLE_CPU) {
                data->cpu_entry.cpu         = raw_smp_processor_id();
                data->cpu_entry.reserved = 0;
        }
}

/*
 * For events with attr.sample_id_all set, grow @header by the event's id
 * header size and fill in the id fields of @data. Does nothing otherwise.
 */
void perf_event_header__init_id(struct perf_event_header *header,
                                struct perf_sample_data *data,
                                struct perf_event *event)
{
        if (!event->attr.sample_id_all)
                return;

        header->size += event->id_header_size;
        __perf_event_header__init_id(data, event, event->attr.sample_type);
}

/*
 * Write the id sample fields of @data selected by data->type, in the fixed
 * order: tid, time, id, stream id, cpu, identifier.
 */
static void __perf_event__output_id_sample(struct perf_output_handle *handle,
                                           struct perf_sample_data *data)
{
        const u64 type = data->type;

        if (type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        /* The identifier is the same data->id value, placed last. */
        if (type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);
}

/*
 * Write the id sample fields of @sample, but only for events that have
 * attr.sample_id_all set.
 */
void perf_event__output_id_sample(struct perf_event *event,
                                  struct perf_output_handle *handle,
                                  struct perf_sample_data *sample)
{
        if (!event->attr.sample_id_all)
                return;

        __perf_event__output_id_sample(handle, sample);
}

/*
 * Write the read values of a single (non-group) event: the count, followed
 * by the optional fields selected in attr.read_format (time enabled, time
 * running, id, lost samples), in that order.
 */
static void perf_output_read_one(struct perf_output_handle *handle,
                                 struct perf_event *event,
                                 u64 enabled, u64 running)
{
        const u64 read_format = event->attr.read_format;
        u64 rec[5];        /* count + at most four optional fields */
        int cnt = 0;

        rec[cnt++] = perf_event_count(event, has_inherit_and_sample_read(&event->attr));

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                rec[cnt++] = enabled + atomic64_read(&event->child_total_time_enabled);

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                rec[cnt++] = running + atomic64_read(&event->child_total_time_running);

        if (read_format & PERF_FORMAT_ID)
                rec[cnt++] = primary_event_id(event);

        if (read_format & PERF_FORMAT_LOST)
                rec[cnt++] = atomic64_read(&event->lost_samples);

        __output_copy(handle, rec, cnt * sizeof(u64));
}

/*
 * Write the read values of a whole event group.
 *
 * The first record holds the number of events, the optional enabled and
 * running times, and the leader's count with its optional id and lost
 * count. Each sibling then gets its own record with count, optional id and
 * optional lost count.
 */
static void perf_output_read_group(struct perf_output_handle *handle,
                                   struct perf_event *event,
                                   u64 enabled, u64 running)
{
        struct perf_event *leader = event->group_leader;
        struct perf_event *sibling;
        u64 read_format = event->attr.read_format;
        unsigned long irq_flags;
        u64 rec[6];        /* nr + two times + count + id + lost */
        int cnt = 0;
        bool self = has_inherit_and_sample_read(&event->attr);

        /*
         * With interrupts off no counter scheduling can happen
         * (context switches, timer based rotation and IPIs).
         */
        local_irq_save(irq_flags);

        rec[cnt++] = 1 + leader->nr_siblings;

        if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
                rec[cnt++] = enabled;

        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                rec[cnt++] = running;

        /* Refresh the leader unless it is @event itself or reads are skipped. */
        if ((leader != event) && !handle->skip_read)
                perf_pmu_read(leader);

        rec[cnt++] = perf_event_count(leader, self);

        if (read_format & PERF_FORMAT_ID)
                rec[cnt++] = primary_event_id(leader);

        if (read_format & PERF_FORMAT_LOST)
                rec[cnt++] = atomic64_read(&leader->lost_samples);

        __output_copy(handle, rec, cnt * sizeof(u64));

        for_each_sibling_event(sibling, leader) {
                cnt = 0;

                if ((sibling != event) && !handle->skip_read)
                        perf_pmu_read(sibling);

                rec[cnt++] = perf_event_count(sibling, self);

                if (read_format & PERF_FORMAT_ID)
                        rec[cnt++] = primary_event_id(sibling);

                if (read_format & PERF_FORMAT_LOST)
                        rec[cnt++] = atomic64_read(&sibling->lost_samples);

                __output_copy(handle, rec, cnt * sizeof(u64));
        }

        local_irq_restore(irq_flags);
}

#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
                                 PERF_FORMAT_TOTAL_TIME_RUNNING)

/*
 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
 *
 * The problem is that its both hard and excessively expensive to iterate the
 * child list, not to mention that its impossible to IPI the children running
 * on another CPU, from interrupt/NMI context.
 *
 * Instead the combination of PERF_SAMPLE_READ and inherit will track per-thread
 * counts rather than attempting to accumulate some value across all children on
 * all cores.
 */
static void perf_output_read(struct perf_output_handle *handle,
                             struct perf_event *event)
{
        u64 read_format = event->attr.read_format;
        u64 now, enabled = 0, running = 0;

        /*
         * Derive total_time_enabled and total_time_running from the
         * snapshot values taken when the event was last scheduled in.
         *
         * update_context_time() cannot simply be called here because of
         * locking issues: we may be running in NMI context.
         */
        if (read_format & PERF_FORMAT_TOTAL_TIMES)
                calc_timer_values(event, &now, &enabled, &running);

        if (!(event->attr.read_format & PERF_FORMAT_GROUP)) {
                perf_output_read_one(handle, event, enabled, running);
                return;
        }

        perf_output_read_group(handle, event, enabled, running);
}

/*
 * Write one sample record for @event through @handle.
 *
 * @header is emitted first, followed by one field for every bit set in
 * data->type, always in the order of the tests below. Most values are taken
 * from @data as already filled in; a few fields are produced by helpers
 * (perf_output_read(), perf_output_sample_regs(),
 * perf_output_sample_ustack(), perf_aux_sample_output()).
 *
 * After the record body, the ring buffer's wakeup accounting is updated for
 * events that use wakeup_events rather than a watermark.
 */
void perf_output_sample(struct perf_output_handle *handle,
                        struct perf_event_header *header,
                        struct perf_sample_data *data,
                        struct perf_event *event)
{
        u64 sample_type = data->type;

        /*
         * handle->skip_read is checked by the group read loop before it
         * calls perf_pmu_read() on siblings.
         * NOTE(review): presumably set here because a PERF_SAMPLE_READ bit
         * in sample_flags means the counts were already read - confirm.
         */
        if (data->sample_flags & PERF_SAMPLE_READ)
                handle->skip_read = 1;

        perf_output_put(handle, *header);

        if (sample_type & PERF_SAMPLE_IDENTIFIER)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_IP)
                perf_output_put(handle, data->ip);

        if (sample_type & PERF_SAMPLE_TID)
                perf_output_put(handle, data->tid_entry);

        if (sample_type & PERF_SAMPLE_TIME)
                perf_output_put(handle, data->time);

        if (sample_type & PERF_SAMPLE_ADDR)
                perf_output_put(handle, data->addr);

        if (sample_type & PERF_SAMPLE_ID)
                perf_output_put(handle, data->id);

        if (sample_type & PERF_SAMPLE_STREAM_ID)
                perf_output_put(handle, data->stream_id);

        if (sample_type & PERF_SAMPLE_CPU)
                perf_output_put(handle, data->cpu_entry);

        if (sample_type & PERF_SAMPLE_PERIOD)
                perf_output_put(handle, data->period);

        if (sample_type & PERF_SAMPLE_READ)
                perf_output_read(handle, event);

        if (sample_type & PERF_SAMPLE_CALLCHAIN) {
                /* (1 + nr) u64 words are copied, starting at the entry itself. */
                int size = 1;

                size += data->callchain->nr;
                size *= sizeof(u64);
                __output_copy(handle, data->callchain, size);
        }

        if (sample_type & PERF_SAMPLE_RAW) {
                struct perf_raw_record *raw = data->raw;

                if (raw) {
                        struct perf_raw_frag *frag = &raw->frag;

                        /* Size word first, then every fragment in chain order. */
                        perf_output_put(handle, raw->size);
                        do {
                                if (frag->copy) {
                                        __output_custom(handle, frag->copy,
                                                        frag->data, frag->size);
                                } else {
                                        __output_copy(handle, frag->data,
                                                      frag->size);
                                }
                                if (perf_raw_frag_last(frag))
                                        break;
                                frag = frag->next;
                        } while (1);
                        /* Only the last fragment's pad is skipped over. */
                        if (frag->pad)
                                __output_skip(handle, NULL, frag->pad);
                } else {
                        /* No raw data: emit a u32 size followed by one zero u32. */
                        struct {
                                u32        size;
                                u32        data;
                        } raw = {
                                .size = sizeof(u32),
                                .data = 0,
                        };
                        perf_output_put(handle, raw);
                }
        }

        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
                if (data->br_stack) {
                        size_t size;

                        size = data->br_stack->nr
                             * sizeof(struct perf_branch_entry);

                        /* nr, optional hw_idx, then the nr entries. */
                        perf_output_put(handle, data->br_stack->nr);
                        if (branch_sample_hw_index(event))
                                perf_output_put(handle, data->br_stack->hw_idx);
                        perf_output_copy(handle, data->br_stack->entries, size);
                        /*
                         * Add the extension space which is appended
                         * right after the struct perf_branch_stack.
                         */
                        if (data->br_stack_cntr) {
                                size = data->br_stack->nr * sizeof(u64);
                                perf_output_copy(handle, data->br_stack_cntr, size);
                        }
                } else {
                        /*
                         * we always store at least the value of nr
                         */
                        u64 nr = 0;
                        perf_output_put(handle, nr);
                }
        }

        if (sample_type & PERF_SAMPLE_REGS_USER) {
                u64 abi = data->regs_user.abi;

                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_user;
                        perf_output_sample_regs(handle,
                                                data->regs_user.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_STACK_USER) {
                perf_output_sample_ustack(handle,
                                          data->stack_user_size,
                                          data->regs_user.regs);
        }

        if (sample_type & PERF_SAMPLE_WEIGHT_TYPE)
                perf_output_put(handle, data->weight.full);

        if (sample_type & PERF_SAMPLE_DATA_SRC)
                perf_output_put(handle, data->data_src.val);

        if (sample_type & PERF_SAMPLE_TRANSACTION)
                perf_output_put(handle, data->txn);

        if (sample_type & PERF_SAMPLE_REGS_INTR) {
                u64 abi = data->regs_intr.abi;
                /*
                 * If there are no regs to dump, notice it through
                 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
                 */
                perf_output_put(handle, abi);

                if (abi) {
                        u64 mask = event->attr.sample_regs_intr;

                        perf_output_sample_regs(handle,
                                                data->regs_intr.regs,
                                                mask);
                }
        }

        if (sample_type & PERF_SAMPLE_PHYS_ADDR)
                perf_output_put(handle, data->phys_addr);

        if (sample_type & PERF_SAMPLE_CGROUP)
                perf_output_put(handle, data->cgroup);

        if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
                perf_output_put(handle, data->data_page_size);

        if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE)
                perf_output_put(handle, data->code_page_size);

        if (sample_type & PERF_SAMPLE_AUX) {
                /* The size word is always written; the payload only if non-zero. */
                perf_output_put(handle, data->aux_size);

                if (data->aux_size)
                        perf_aux_sample_output(event, handle, data);
        }

        /*
         * Event-count based wakeups: count this record in rb->events and,
         * once wakeup_events records have accumulated, take that many back
         * off the counter and bump rb->wakeup.
         */
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;

                if (wakeup_events) {
                        struct perf_buffer *rb = handle->rb;
                        int events = local_inc_return(&rb->events);

                        if (events >= wakeup_events) {
                                local_sub(wakeup_events, &rb->events);
                                local_inc(&rb->wakeup);
                        }
                }
        }
}

/*
 * Translate @virt to a physical address.
 *
 * Returns 0 when @virt is 0, when a kernel address is not valid or lies in
 * the vmalloc range, when the current task has no mm, or when the user page
 * cannot be obtained without faulting.
 */
static u64 perf_virt_to_phys(u64 virt)
{
        struct page *page;
        u64 phys = 0;

        if (!virt)
                return 0;

        if (virt >= TASK_SIZE) {
                void *kaddr = (void *)(uintptr_t)virt;

                if (!virt_addr_valid(kaddr))
                        return 0;

                /* If it's vmalloc()d memory, report 0 */
                if (virt >= VMALLOC_START && virt < VMALLOC_END)
                        return 0;

                return (u64)virt_to_phys(kaddr);
        }

        /*
         * User address: walk the page tables.
         * Interrupts are disabled, which prevents any tear down
         * of the page tables.
         * Only the IRQ-safe get_user_page_fast_only() is tried; if it
         * fails, the result stays 0.
         */
        if (current->mm == NULL)
                return 0;

        pagefault_disable();
        if (get_user_page_fast_only(virt, 0, &page)) {
                phys = page_to_phys(page) + virt % PAGE_SIZE;
                put_page(page);
        }
        pagefault_enable();

        return phys;
}

/*
 * Return the pagetable size of a given virtual address.
 *
 * Walks @mm's page tables for @addr one level at a time (pgd, p4d, pud, pmd,
 * pte) using lockless accessors. The walk stops at the first level that is
 * either empty/not present (result 0) or a leaf (result is that level's leaf
 * size). Without CONFIG_HAVE_GUP_FAST no walk is done and 0 is returned.
 *
 * The only caller visible here, perf_get_page_size(), runs this with IRQs
 * disabled.
 */
static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr)
{
        u64 size = 0;

#ifdef CONFIG_HAVE_GUP_FAST
        pgd_t *pgdp, pgd;
        p4d_t *p4dp, p4d;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;
        pte_t *ptep, pte;

        /* Each level is read once into a local copy and tested on that copy. */
        pgdp = pgd_offset(mm, addr);
        pgd = READ_ONCE(*pgdp);
        if (pgd_none(pgd))
                return 0;

        if (pgd_leaf(pgd))
                return pgd_leaf_size(pgd);

        p4dp = p4d_offset_lockless(pgdp, pgd, addr);
        p4d = READ_ONCE(*p4dp);
        if (!p4d_present(p4d))
                return 0;

        if (p4d_leaf(p4d))
                return p4d_leaf_size(p4d);

        pudp = pud_offset_lockless(p4dp, p4d, addr);
        pud = READ_ONCE(*pudp);
        if (!pud_present(pud))
                return 0;

        if (pud_leaf(pud))
                return pud_leaf_size(pud);

        pmdp = pmd_offset_lockless(pudp, pud, addr);
again:
        pmd = pmdp_get_lockless(pmdp);
        if (!pmd_present(pmd))
                return 0;

        if (pmd_leaf(pmd))
                return pmd_leaf_size(pmd);

        /*
         * The pte table is mapped from the local pmd copy. If mapping fails,
         * re-read the pmd and re-evaluate it from scratch.
         */
        ptep = pte_offset_map(&pmd, addr);
        if (!ptep)
                goto again;

        /* A pte that is not present leaves size at 0. */
        pte = ptep_get_lockless(ptep);
        if (pte_present(pte))
                size = __pte_leaf_size(pmd, pte);
        pte_unmap(ptep);
#endif /* CONFIG_HAVE_GUP_FAST */

        return size;
}

/*
 * Return the page-table mapping size backing @addr, or 0 for a zero @addr.
 * The lookup uses the current task's mm, falling back to init_mm.
 */
static u64 perf_get_page_size(unsigned long addr)
{
        unsigned long irq_flags;
        u64 page_size;

        if (!addr)
                return 0;

        /*
         * Software page-table walkers must disable IRQs,
         * which prevents any tear down of the page tables.
         */
        local_irq_save(irq_flags);

        /*
         * Kernel threads and the like have no mm of their own; use init_mm
         * so that kernel memory can still be found.
         */
        page_size = perf_get_pgtable_size(current->mm ?: &init_mm, addr);

        local_irq_restore(irq_flags);

        return page_size;
}

/* Shared zero-length callchain handed out when there is nothing to report. */
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };

/*
 * Collect a callchain for @event at @regs.
 *
 * Never returns NULL: when both kernel and user chains are excluded by the
 * event attributes, or when get_perf_callchain() yields nothing, the shared
 * empty entry is returned instead.
 */
struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
        const u32 max_stack = event->attr.sample_max_stack;
        bool want_kernel = !event->attr.exclude_callchain_kernel;
        bool want_user = !event->attr.exclude_callchain_user;
        /* Disallow cross-task user callchains. */
        bool crosstask = event->ctx->task && event->ctx->task != current;
        struct perf_callchain_entry *entry;

        if (!(want_kernel || want_user))
                return &__empty_callchain;

        entry = get_perf_callchain(regs, 0, want_kernel, want_user,
                                   max_stack, crosstask, true);
        if (!entry)
                return &__empty_callchain;

        return entry;
}

/* Yield @d when any bit of @s is set in @flags, and 0 otherwise. */
static __always_inline u64 __cond_set(u64 flags, u64 s, u64 d)
{
        return (flags & s) ? d : 0;
}

/*
 * Fill in the parts of @data that @event's sample_type asks for and that are
 * not already marked as done in data->sample_flags.
 *
 * Every item handled here gets its bit set in data->sample_flags, and items
 * with a variable-sized record payload add that payload's size to
 * data->dyn_size. @regs is the register state the IP, callchain and register
 * dumps are derived from.
 */
void perf_prepare_sample(struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        u64 sample_type = event->attr.sample_type;
        u64 filtered_sample_type;

        /*
         * Add the sample flags that are dependent to others.  And clear the
         * sample flags that have already been done by the PMU driver.
         */
        filtered_sample_type = sample_type;
        /* CODE_PAGE_SIZE needs the IP. */
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_CODE_PAGE_SIZE,
                                           PERF_SAMPLE_IP);
        /* DATA_PAGE_SIZE and PHYS_ADDR both need the data address. */
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_DATA_PAGE_SIZE |
                                           PERF_SAMPLE_PHYS_ADDR, PERF_SAMPLE_ADDR);
        /* STACK_USER needs the user registers. */
        filtered_sample_type |= __cond_set(sample_type, PERF_SAMPLE_STACK_USER,
                                           PERF_SAMPLE_REGS_USER);
        filtered_sample_type &= ~data->sample_flags;

        if (filtered_sample_type == 0) {
                /* Make sure it has the correct data->type for output */
                data->type = event->attr.sample_type;
                return;
        }

        __perf_event_header__init_id(data, event, filtered_sample_type);

        if (filtered_sample_type & PERF_SAMPLE_IP) {
                data->ip = perf_instruction_pointer(event, regs);
                data->sample_flags |= PERF_SAMPLE_IP;
        }

        if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN)
                perf_sample_save_callchain(data, event, regs);

        /* Nobody supplied raw data: account for the 8-byte empty placeholder. */
        if (filtered_sample_type & PERF_SAMPLE_RAW) {
                data->raw = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_RAW;
        }

        /* Nobody supplied a branch stack: only the u64 nr word is written. */
        if (filtered_sample_type & PERF_SAMPLE_BRANCH_STACK) {
                data->br_stack = NULL;
                data->dyn_size += sizeof(u64);
                data->sample_flags |= PERF_SAMPLE_BRANCH_STACK;
        }

        if (filtered_sample_type & PERF_SAMPLE_REGS_USER)
                perf_sample_regs_user(&data->regs_user, regs);

        /*
         * It cannot use the filtered_sample_type here as REGS_USER can be set
         * by STACK_USER (using __cond_set() above) and we don't want to update
         * the dyn_size if it's not requested by users.
         */
        if ((sample_type & ~data->sample_flags) & PERF_SAMPLE_REGS_USER) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                /* One u64 per register selected in the mask. */
                if (data->regs_user.regs) {
                        u64 mask = event->attr.sample_regs_user;
                        size += hweight64(mask) * sizeof(u64);
                }

                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_REGS_USER;
        }

        if (filtered_sample_type & PERF_SAMPLE_STACK_USER) {
                /*
                 * Either we need PERF_SAMPLE_STACK_USER bit to be always
                 * processed as the last one or have additional check added
                 * in case new sample type is added, because we could eat
                 * up the rest of the sample size.
                 */
                u16 stack_size = event->attr.sample_stack_user;
                u16 header_size = perf_sample_data_size(data, event);
                u16 size = sizeof(u64);

                stack_size = perf_sample_ustack_size(stack_size, header_size,
                                                     data->regs_user.regs);

                /*
                 * If there is something to dump, add space for the dump
                 * itself and for the field that tells the dynamic size,
                 * which is how many have been actually dumped.
                 */
                if (stack_size)
                        size += sizeof(u64) + stack_size;

                data->stack_user_size = stack_size;
                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_STACK_USER;
        }

        /*
         * The next four items were not provided before this point, so they
         * get a fixed default value.
         */
        if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) {
                data->weight.full = 0;
                data->sample_flags |= PERF_SAMPLE_WEIGHT_TYPE;
        }

        if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) {
                data->data_src.val = PERF_MEM_NA;
                data->sample_flags |= PERF_SAMPLE_DATA_SRC;
        }

        if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) {
                data->txn = 0;
                data->sample_flags |= PERF_SAMPLE_TRANSACTION;
        }

        if (filtered_sample_type & PERF_SAMPLE_ADDR) {
                data->addr = 0;
                data->sample_flags |= PERF_SAMPLE_ADDR;
        }

        if (filtered_sample_type & PERF_SAMPLE_REGS_INTR) {
                /* regs dump ABI info */
                int size = sizeof(u64);

                perf_sample_regs_intr(&data->regs_intr, regs);

                if (data->regs_intr.regs) {
                        u64 mask = event->attr.sample_regs_intr;

                        size += hweight64(mask) * sizeof(u64);
                }

                data->dyn_size += size;
                data->sample_flags |= PERF_SAMPLE_REGS_INTR;
        }

        /* Uses data->addr, which is in place by now (set earlier or defaulted above). */
        if (filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) {
                data->phys_addr = perf_virt_to_phys(data->addr);
                data->sample_flags |= PERF_SAMPLE_PHYS_ADDR;
        }

#ifdef CONFIG_CGROUP_PERF
        if (filtered_sample_type & PERF_SAMPLE_CGROUP) {
                struct cgroup *cgrp;

                /* protected by RCU */
                cgrp = task_css_check(current, perf_event_cgrp_id, 1)->cgroup;
                data->cgroup = cgroup_id(cgrp);
                data->sample_flags |= PERF_SAMPLE_CGROUP;
        }
#endif

        /*
         * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't
         * require PERF_SAMPLE_ADDR, the kernel implicitly retrieves data->addr,
         * but the value will not be dumped to userspace.
         */
        if (filtered_sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) {
                data->data_page_size = perf_get_page_size(data->addr);
                data->sample_flags |= PERF_SAMPLE_DATA_PAGE_SIZE;
        }

        if (filtered_sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) {
                data->code_page_size = perf_get_page_size(data->ip);
                data->sample_flags |= PERF_SAMPLE_CODE_PAGE_SIZE;
        }

        if (filtered_sample_type & PERF_SAMPLE_AUX) {
                u64 size;
                u16 header_size = perf_sample_data_size(data, event);

                header_size += sizeof(u64); /* size */

                /*
                 * Given the 16bit nature of header::size, an AUX sample can
                 * easily overflow it, what with all the preceding sample bits.
                 * Make sure this doesn't happen by using up to U16_MAX bytes
                 * per sample in total (rounded down to 8 byte boundary).
                 */
                size = min_t(size_t, U16_MAX - header_size,
                             event->attr.aux_sample_size);
                size = rounddown(size, 8);
                size = perf_prepare_sample_aux(event, data, size);

                WARN_ON_ONCE(size + header_size > U16_MAX);
                data->dyn_size += size + sizeof(u64); /* size above */
                data->sample_flags |= PERF_SAMPLE_AUX;
        }
}

/*
 * Initialise @header for a PERF_RECORD_SAMPLE record of @event: the size
 * comes from perf_sample_data_size() and the misc flags from
 * perf_misc_flags(). Warns once if the resulting size is not a multiple of 8.
 */
void perf_prepare_header(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
                         struct pt_regs *regs)
{
        header->misc = perf_misc_flags(event, regs);
        header->size = perf_sample_data_size(data, event);
        header->type = PERF_RECORD_SAMPLE;

        /*
         * If you're adding more sample types here, you likely need to do
         * something about the overflowing header::size, like repurpose the
         * lowest 3 bits of size, which should be always zero at the moment.
         * This raises a more important question, do we really need 512k sized
         * samples and why, so good argumentation is in order for whatever you
         * do here next.
         */
        WARN_ON_ONCE(header->size & 7);
}

/*
 * Move @event into the paused (@pause true) or resumed (@pause false) state.
 * The PMU callback is only invoked on an actual state change, with
 * hw.aux_paused updated before the call.
 */
static void __perf_event_aux_pause(struct perf_event *event, bool pause)
{
        bool paused = event->hw.aux_paused;

        /* Already in the requested state. */
        if (paused == pause)
                return;

        if (pause) {
                event->hw.aux_paused = 1;
                event->pmu->stop(event, PERF_EF_PAUSE);
                return;
        }

        event->hw.aux_paused = 0;
        event->pmu->start(event, PERF_EF_RESUME);
}

/*
 * Pause or resume @event via __perf_event_aux_pause().
 *
 * Does nothing (after a one-time warning) for a NULL @event, and nothing when
 * the event has no ring buffer. The state change runs with IRQs disabled and
 * is skipped if the ring buffer's aux_in_pause_resume flag is already set.
 */
static void perf_event_aux_pause(struct perf_event *event, bool pause)
{
        struct perf_buffer *rb;

        if (WARN_ON_ONCE(!event))
                return;

        /* Takes a reference that is dropped by ring_buffer_put() below. */
        rb = ring_buffer_get(event);
        if (!rb)
                return;

        scoped_guard (irqsave) {
                /*
                 * Guard against self-recursion here. Another event could trip
                 * this same from NMI context.
                 */
                if (READ_ONCE(rb->aux_in_pause_resume))
                        break;

                /*
                 * The compiler barriers keep the flag updates ordered around
                 * the pause/resume call.
                 */
                WRITE_ONCE(rb->aux_in_pause_resume, 1);
                barrier();
                __perf_event_aux_pause(event, pause);
                barrier();
                WRITE_ONCE(rb->aux_in_pause_resume, 0);
        }
        ring_buffer_put(rb);
}

/*
 * Common body of the perf_event_output*() variants: prepare the sample and
 * its header, reserve header.size bytes via @output_begin, and write the
 * record. Returns @output_begin's error code, or 0 once the record has been
 * written.
 */
static __always_inline int
__perf_event_output(struct perf_event *event,
                    struct perf_sample_data *data,
                    struct pt_regs *regs,
                    int (*output_begin)(struct perf_output_handle *,
                                        struct perf_sample_data *,
                                        struct perf_event *,
                                        unsigned int))
{
        struct perf_event_header header;
        struct perf_output_handle handle;
        int ret;

        /* protect the callchain buffers */
        rcu_read_lock();

        perf_prepare_sample(data, event, regs);
        perf_prepare_header(&header, data, event, regs);

        ret = output_begin(&handle, data, event, header.size);
        if (!ret) {
                perf_output_sample(&handle, &header, data, event);
                perf_output_end(&handle);
        }

        rcu_read_unlock();
        return ret;
}

/*
 * Output one sample for @event, reserving buffer space with
 * perf_output_begin_forward(). The result of __perf_event_output() is
 * discarded.
 */
void
perf_event_output_forward(struct perf_event *event,
                         struct perf_sample_data *data,
                         struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_forward);
}

/*
 * Output one sample for @event, reserving buffer space with
 * perf_output_begin_backward(). The result of __perf_event_output() is
 * discarded.
 */
void
perf_event_output_backward(struct perf_event *event,
                           struct perf_sample_data *data,
                           struct pt_regs *regs)
{
        __perf_event_output(event, data, regs, perf_output_begin_backward);
}

/*
 * Output one sample for @event, reserving buffer space with
 * perf_output_begin(). Returns the result of __perf_event_output(): 0 on
 * success, otherwise the error from the reservation step.
 */
int
perf_event_output(struct perf_event *event,
                  struct perf_sample_data *data,
                  struct pt_regs *regs)
{
        return __perf_event_output(event, data, regs, perf_output_begin);
}

/*
 * read event_id
 *
 * Fixed leading part of a PERF_RECORD_READ record: the record header
 * followed by the pid and tid the record is reported for.
 */

struct perf_read_event {
        struct perf_event_header        header;

        u32                                pid;
        u32                                tid;
};

/*
 * Emit a PERF_RECORD_READ record for @event on behalf of @task: the fixed
 * perf_read_event part, the event's read_format payload, and the sample id
 * trailer. The record is silently dropped if no buffer space can be reserved.
 */
static void
perf_event_read_event(struct perf_event *event,
                        struct task_struct *task)
{
        struct perf_sample_data sample;
        struct perf_output_handle handle;
        struct perf_read_event read_event = {
                .header = {
                        .type = PERF_RECORD_READ,
                        .misc = 0,
                        /* fixed part plus the read_format payload */
                        .size = sizeof(read_event) + event->read_size,
                },
                .pid = perf_event_pid(event, task),
                .tid = perf_event_tid(event, task),
        };

        perf_event_header__init_id(&read_event.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, read_event.header.size))
                return;

        perf_output_put(&handle, read_event);
        perf_output_read(&handle, event);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/* Per-event callback used by the perf_iterate_*() walkers. */
typedef void (perf_iterate_f)(struct perf_event *event, void *data);

/*
 * Call @output(event, @data) for the events on @ctx's event_list.
 *
 * With @all set every event is visited. Otherwise events whose state is
 * below PERF_EVENT_STATE_INACTIVE, or that fail event_filter_match(), are
 * skipped. The list is walked with the RCU list iterator.
 */
static void
perf_iterate_ctx(struct perf_event_context *ctx,
                   perf_iterate_f output,
                   void *data, bool all)
{
        struct perf_event *event;

        list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (!all &&
                    (event->state < PERF_EVENT_STATE_INACTIVE ||
                     !event_filter_match(event)))
                        continue;

                output(event, data);
        }
}

/*
 * Call @output(event, @data) for the eligible events on this CPU's
 * pmu_sb_events list.
 */
static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
{
        struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
        struct perf_event *event;

        list_for_each_entry_rcu(event, &pel->list, sb_list) {
                /*
                 * Skip events that are not fully formed yet; ensure that
                 * if we observe event->ctx, both event and ctx will be
                 * complete enough. See perf_install_in_context().
                 */
                if (!smp_load_acquire(&event->ctx))
                        continue;

                /* Same eligibility test as the non-"all" context walk. */
                if (event->state < PERF_EVENT_STATE_INACTIVE ||
                    !event_filter_match(event))
                        continue;

                output(event, data);
        }
}

/*
 * Iterate all events that need to receive side-band events.
 *
 * For new callers; ensure that account_pmu_sb_event() includes
 * your event, otherwise it might not get delivered.
 *
 * The whole walk runs under rcu_read_lock() with preemption disabled.
 */
static void
perf_iterate_sb(perf_iterate_f output, void *data,
               struct perf_event_context *task_ctx)
{
        struct perf_event_context *ctx;

        rcu_read_lock();
        preempt_disable();

        if (task_ctx) {
                /*
                 * If we have task_ctx != NULL we only notify the task context
                 * itself. The task_ctx is set only for EXIT events before
                 * releasing task context.
                 */
                perf_iterate_ctx(task_ctx, output, data, false);
        } else {
                /* This CPU's side-band list first, then current's context. */
                perf_iterate_sb_cpu(output, data);

                ctx = rcu_dereference(current->perf_event_ctxp);
                if (ctx)
                        perf_iterate_ctx(ctx, output, data, false);
        }

        preempt_enable();
        rcu_read_unlock();
}

/*
 * Clear all file-based filters at exec, they'll have to be
 * re-instated when/if these objects are mmapped again.
 */
static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        event->addr_filter_ranges[count].start = 0;
                        event->addr_filter_ranges[count].size = 0;
                        restart++;
                }

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        if (restart)
                perf_event_stop(event, 1);
}

void perf_event_exec(void)
{
        struct perf_event_context *ctx;

        ctx = perf_pin_task_context(current);
        if (!ctx)
                return;

        perf_event_enable_on_exec(ctx);
        perf_event_remove_on_exec(ctx);
        scoped_guard(rcu)
                perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true);

        perf_unpin_context(ctx);
        put_ctx(ctx);
}

/*
 * Argument block for __perf_event_output_stop(): @rb is the ring buffer whose
 * users should be stopped, @err receives the result of the last
 * __perf_event_stop() call made for a matching event.
 */
struct remote_output {
        struct perf_buffer        *rb;
        int                        err;
};

/*
 * perf_iterate_f callback: stop @event if it has AUX and its ring buffer
 * (looked up through the parent for inherited events) is the one named in
 * the struct remote_output passed as @data. The stop result is stored in
 * that structure's err field.
 */
static void __perf_event_output_stop(struct perf_event *event, void *data)
{
        struct remote_output *ro = data;
        struct stop_event_data sd = {
                .event        = event,
        };
        struct perf_event *owner;

        if (!has_aux(event))
                return;

        /*
         * In case of inheritance, it will be the parent that links to the
         * ring-buffer, but it will be the child that's actually using it.
         *
         * We are using event::rb to determine if the event should be stopped,
         * however this may race with ring_buffer_attach() (through set_output),
         * which will make us skip the event that actually needs to be stopped.
         * So ring_buffer_attach() has to stop an aux event before re-assigning
         * its rb pointer.
         */
        owner = event->parent ?: event;
        if (rcu_dereference(owner->rb) != ro->rb)
                return;

        ro->err = __perf_event_stop(&sd);
}

/*
 * Cross-CPU callback (@info is the struct perf_event whose ring buffer is
 * targeted): run __perf_event_output_stop() over this CPU's context and, if
 * there is one, its current task context. Returns the err value collected in
 * the remote_output, which starts out as 0.
 */
static int __perf_pmu_output_stop(void *info)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event *event = info;
        struct perf_event_context *task_ctx;
        struct remote_output ro = {
                .rb        = event->rb,
        };

        rcu_read_lock();

        perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);

        task_ctx = cpuctx->task_ctx;
        if (task_ctx)
                perf_iterate_ctx(task_ctx, __perf_event_output_stop,
                                   &ro, false);

        rcu_read_unlock();

        return ro.err;
}

/*
 * Stop the AUX events that output into @event's ring buffer.
 *
 * Walks the ring buffer's event list under RCU and, for every entry that can
 * be tied to a CPU, runs __perf_pmu_output_stop() on that CPU. If a call
 * reports -EAGAIN the whole walk is restarted from the beginning.
 */
static void perf_pmu_output_stop(struct perf_event *event)
{
        struct perf_event *iter;
        int err, cpu;

restart:
        rcu_read_lock();
        list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
                /*
                 * For per-CPU events, we need to make sure that neither they
                 * nor their children are running; for cpu==-1 events it's
                 * sufficient to stop the event itself if it's active, since
                 * it can't have children.
                 */
                cpu = iter->cpu;
                if (cpu == -1)
                        cpu = READ_ONCE(iter->oncpu);

                /* Not bound to a CPU and not currently on one: nothing to stop. */
                if (cpu == -1)
                        continue;

                err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
                if (err == -EAGAIN) {
                        /* Leave the RCU section before starting over. */
                        rcu_read_unlock();
                        goto restart;
                }
        }
        rcu_read_unlock();
}

/*
 * task tracking -- fork/exit
 *
 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
 */

/*
 * Working state for emitting one fork/exit record: the task it is about, the
 * task context passed in by the caller, and the record image (event_id) that
 * perf_event_task_output() fills in per receiving event before writing it.
 */
struct perf_task_event {
        struct task_struct                *task;
        struct perf_event_context        *task_ctx;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                ppid;
                u32                                tid;
                u32                                ptid;
                u64                                time;
        } event_id;
};

/*
 * Return 1 if @event asked for any of the record kinds that enable task
 * tracking (comm, mmap, mmap2, mmap_data or task), 0 otherwise.
 */
static int perf_event_task_match(struct perf_event *event)
{
        if (event->attr.comm || event->attr.mmap)
                return 1;

        if (event->attr.mmap2 || event->attr.mmap_data)
                return 1;

        return event->attr.task ? 1 : 0;
}

/*
 * Iteration callback: write the FORK/EXIT record described by @data
 * (a struct perf_task_event) into @event's output buffer.
 *
 * Events rejected by perf_event_task_match() are skipped.  The record's
 * header size is put back to the value it had on entry before returning,
 * so the same record can be handed to the next event unchanged.
 */
static void perf_event_task_output(struct perf_event *event,
                                   void *data)
{
        struct perf_task_event *rec = data;
        struct task_struct *tsk = rec->task;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int orig_size;

        if (!perf_event_task_match(event))
                return;

        orig_size = rec->event_id.header.size;
        perf_event_header__init_id(&rec->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              rec->event_id.header.size) == 0) {
                rec->event_id.pid = perf_event_pid(event, tsk);
                rec->event_id.tid = perf_event_tid(event, tsk);

                if (rec->event_id.header.type != PERF_RECORD_EXIT) {
                        /* PERF_RECORD_FORK: parent ids are taken from current. */
                        rec->event_id.ppid = perf_event_pid(event, current);
                        rec->event_id.ptid = perf_event_tid(event, current);
                } else {
                        /*
                         * PERF_RECORD_EXIT: parent ids come from real_parent.
                         * NOTE(review): ptid is computed with perf_event_pid(),
                         * not perf_event_tid(); kept as found - confirm intent.
                         */
                        rec->event_id.ppid = perf_event_pid(event,
                                                            tsk->real_parent);
                        rec->event_id.ptid = perf_event_pid(event,
                                                            tsk->real_parent);
                }

                rec->event_id.time = perf_event_clock(event);

                perf_output_put(&handle, rec->event_id);
                perf_event__output_id_sample(event, &handle, &sample);
                perf_output_end(&handle);
        }

        rec->event_id.header.size = orig_size;
}

/*
 * Emit a PERF_RECORD_FORK (@new non-zero) or PERF_RECORD_EXIT (@new zero)
 * record for @task via perf_iterate_sb(), passing @task_ctx along.
 *
 * Returns without doing anything when the comm, mmap and task event
 * counters are all zero.
 */
static void perf_event_task(struct task_struct *task,
                              struct perf_event_context *task_ctx,
                              int new)
{
        struct perf_task_event task_event;

        if (!(atomic_read(&nr_comm_events) ||
              atomic_read(&nr_mmap_events) ||
              atomic_read(&nr_task_events)))
                return;

        /*
         * pid, ppid, tid, ptid and time are left zeroed here; they are
         * filled per event by perf_event_task_output().
         */
        task_event = (struct perf_task_event){
                .task = task,
                .task_ctx = task_ctx,
                .event_id = {
                        .header = {
                                .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
                                .misc = 0,
                                .size = sizeof(task_event.event_id),
                        },
                },
        };

        perf_iterate_sb(perf_event_task_output, &task_event, task_ctx);
}

/*
 * Allocate data for a new task when profiling system-wide
 * events which require PMU specific data.
 *
 * @child:  the new task that may need a perf_ctx_data attached
 * @parent: the task whose perf_ctx_data supplies the kmem cache to use
 *
 * Nothing is done when global_ctx_data_ref is zero or when @parent has no
 * perf_ctx_data.  If @child already has one, it is marked global (taking a
 * reference the first time); otherwise a new one is attached.
 */
static void
perf_event_alloc_task_data(struct task_struct *child,
                           struct task_struct *parent)
{
        struct kmem_cache *ctx_cache = NULL;
        struct perf_ctx_data *cd;

        if (!refcount_read(&global_ctx_data_ref))
                return;

        /* Pick up the parent's cache pointer under RCU. */
        scoped_guard (rcu) {
                cd = rcu_dereference(parent->perf_ctx_data);
                if (cd)
                        ctx_cache = cd->ctx_cache;
        }

        if (!ctx_cache)
                return;

        /* Held until the function returns, including across the attach below. */
        guard(percpu_read)(&global_ctx_data_rwsem);
        scoped_guard (rcu) {
                cd = rcu_dereference(child->perf_ctx_data);
                if (!cd) {
                        /*
                         * A system-wide event may have been unaccounted
                         * by the time the perf_ctx_data is attached, so
                         * check the reference count again.
                         */
                        if (!refcount_read(&global_ctx_data_ref))
                                return;
                        goto attach;
                }

                /* Existing data: flag it global once and pin it. */
                if (!cd->global) {
                        cd->global = 1;
                        refcount_inc(&cd->refcount);
                }
        }

        return;
attach:
        attach_task_ctx_data(child, ctx_cache, true);
}

/*
 * Fork hook for @task: emit the fork record, emit the namespaces record,
 * then set up per-task context data using current as the parent.
 */
void perf_event_fork(struct task_struct *task)
{
        /* new == 1 selects PERF_RECORD_FORK; no task context is passed. */
        perf_event_task(task, NULL, 1);
        perf_event_namespaces(task);
        perf_event_alloc_task_data(task, current);
}

/*
 * comm tracking
 */

/*
 * State for emitting one PERF_RECORD_COMM record.  @event_id is the fixed
 * part of the record; the name in @comm is copied out right after it.
 */
struct perf_comm_event {
        /* Task whose comm is reported; source of event_id.pid / .tid. */
        struct task_struct        *task;
        /* Name buffer and its length as written to the output buffer. */
        char                        *comm;
        int                        comm_size;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
        } event_id;
};

/* Non-zero when @event requested comm records (attr.comm). */
static int perf_event_comm_match(struct perf_event *event)
{
        return event->attr.comm;
}

/*
 * Iteration callback: write the COMM record described by @data
 * (a struct perf_comm_event) into @event's output buffer - the fixed
 * event_id part first, then comm_size bytes of the name.
 *
 * Events without attr.comm are skipped.  The header size is put back to
 * its entry value before returning.
 */
static void perf_event_comm_output(struct perf_event *event,
                                   void *data)
{
        struct perf_comm_event *rec = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int orig_size;

        if (!perf_event_comm_match(event))
                return;

        orig_size = rec->event_id.header.size;
        perf_event_header__init_id(&rec->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              rec->event_id.header.size) == 0) {
                rec->event_id.pid = perf_event_pid(event, rec->task);
                rec->event_id.tid = perf_event_tid(event, rec->task);

                perf_output_put(&handle, rec->event_id);
                __output_copy(&handle, rec->comm, rec->comm_size);

                perf_event__output_id_sample(event, &handle, &sample);
                perf_output_end(&handle);
        }

        rec->event_id.header.size = orig_size;
}

/*
 * Snapshot the task's comm into a zero-filled local buffer, record its
 * u64-aligned length in @comm_event, size the record accordingly and hand
 * it to every interested event.
 */
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
        char name[TASK_COMM_LEN] = { 0 };
        unsigned int padded_len;

        strscpy(name, comm_event->task->comm);

        /* Length including the NUL, rounded up to a multiple of 8 bytes. */
        padded_len = ALIGN(strlen(name) + 1, sizeof(u64));

        comm_event->comm = name;
        comm_event->comm_size = padded_len;
        comm_event->event_id.header.size =
                        sizeof(comm_event->event_id) + padded_len;

        perf_iterate_sb(perf_event_comm_output, comm_event, NULL);
}

/*
 * Report a comm change of @task.  @exec selects the
 * PERF_RECORD_MISC_COMM_EXEC misc flag.  No-op when nr_comm_events is zero.
 */
void perf_event_comm(struct task_struct *task, bool exec)
{
        struct perf_comm_event comm_event;

        if (atomic_read(&nr_comm_events) == 0)
                return;

        /*
         * comm, comm_size, header.size, pid and tid start out zeroed; they
         * are filled in by perf_event_comm_event() and the output callback.
         */
        comm_event = (struct perf_comm_event){
                .task = task,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_COMM,
                                .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
                        },
                },
        };

        perf_event_comm_event(&comm_event);
}

/*
 * namespaces tracking
 */

/*
 * State for emitting one PERF_RECORD_NAMESPACES record; @event_id is the
 * complete record body.
 */
struct perf_namespaces_event {
        /* Task whose namespaces are reported; source of pid / tid. */
        struct task_struct                *task;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                /* Set to NR_NAMESPACES by perf_event_namespaces(). */
                u64                                nr_namespaces;
                /* One dev/ino pair per namespace index. */
                struct perf_ns_link_info        link_info[NR_NAMESPACES];
        } event_id;
};

/* Non-zero when @event requested namespaces records (attr.namespaces). */
static int perf_event_namespaces_match(struct perf_event *event)
{
        return event->attr.namespaces;
}

/*
 * Iteration callback: write the NAMESPACES record described by @data
 * (a struct perf_namespaces_event) into @event's output buffer.
 *
 * Events without attr.namespaces are skipped.  The header size is put back
 * to its entry value before returning.
 */
static void perf_event_namespaces_output(struct perf_event *event,
                                         void *data)
{
        struct perf_namespaces_event *rec = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 orig_size;

        if (!perf_event_namespaces_match(event))
                return;

        orig_size = rec->event_id.header.size;
        perf_event_header__init_id(&rec->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              rec->event_id.header.size) == 0) {
                rec->event_id.pid = perf_event_pid(event, rec->task);
                rec->event_id.tid = perf_event_tid(event, rec->task);

                perf_output_put(&handle, rec->event_id);
                perf_event__output_id_sample(event, &handle, &sample);
                perf_output_end(&handle);
        }

        rec->event_id.header.size = orig_size;
}

/*
 * Look up the namespace of @task selected by @ns_ops and store its device
 * and inode numbers in @ns_link_info.  On lookup failure @ns_link_info is
 * left untouched.
 */
static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
                                   struct task_struct *task,
                                   const struct proc_ns_operations *ns_ops)
{
        struct path ns_path;
        struct inode *ns_inode;

        if (ns_get_path(&ns_path, task, ns_ops))
                return;

        ns_inode = ns_path.dentry->d_inode;
        ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
        ns_link_info->ino = ns_inode->i_ino;

        /* Release the path reference obtained by ns_get_path(). */
        path_put(&ns_path);
}

/*
 * Emit a PERF_RECORD_NAMESPACES record for @task.
 *
 * The record always carries NR_NAMESPACES link_info slots.  The mount
 * namespace slot is always filled; every other slot is filled only when the
 * matching config option is enabled and stays zeroed otherwise.
 * No-op when nr_namespaces_events is zero.
 */
void perf_event_namespaces(struct task_struct *task)
{
        struct perf_namespaces_event namespaces_event;
        struct perf_ns_link_info *ns_link_info;

        if (!atomic_read(&nr_namespaces_events))
                return;

        /* Everything not named here (pid, tid, link_info[]) starts zeroed. */
        namespaces_event = (struct perf_namespaces_event){
                .task        = task,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_NAMESPACES,
                                .misc = 0,
                                .size = sizeof(namespaces_event.event_id),
                        },
                        /* .pid */
                        /* .tid */
                        .nr_namespaces = NR_NAMESPACES,
                        /* .link_info[NR_NAMESPACES] */
                },
        };

        ns_link_info = namespaces_event.event_id.link_info;

        perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
                               task, &mntns_operations);

#ifdef CONFIG_USER_NS
        perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
                               task, &userns_operations);
#endif
#ifdef CONFIG_NET_NS
        perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
                               task, &netns_operations);
#endif
#ifdef CONFIG_UTS_NS
        perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
                               task, &utsns_operations);
#endif
#ifdef CONFIG_IPC_NS
        perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
                               task, &ipcns_operations);
#endif
#ifdef CONFIG_PID_NS
        perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
                               task, &pidns_operations);
#endif
#ifdef CONFIG_CGROUPS
        perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
                               task, &cgroupns_operations);
#endif

        perf_iterate_sb(perf_event_namespaces_output,
                        &namespaces_event,
                        NULL);
}

/*
 * cgroup tracking
 */
#ifdef CONFIG_CGROUP_PERF

/*
 * State for emitting one PERF_RECORD_CGROUP record.  @event_id is the fixed
 * part of the record; @path_size bytes from @path are copied out after it.
 */
struct perf_cgroup_event {
        /* Path string buffer and its padded length. */
        char                                *path;
        int                                path_size;
        struct {
                struct perf_event_header        header;
                /* Set from cgroup_id() of the reported cgroup. */
                u64                                id;
                /* Marks where the path follows in the emitted record. */
                char                                path[];
        } event_id;
};

/* Non-zero when @event requested cgroup records (attr.cgroup). */
static int perf_event_cgroup_match(struct perf_event *event)
{
        return event->attr.cgroup;
}

/*
 * Iteration callback: write the CGROUP record described by @data
 * (a struct perf_cgroup_event) into @event's output buffer - the fixed
 * event_id part first, then path_size bytes of the path.
 *
 * Events without attr.cgroup are skipped.  The header size is put back to
 * its entry value before returning.
 */
static void perf_event_cgroup_output(struct perf_event *event, void *data)
{
        struct perf_cgroup_event *rec = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        u16 orig_size;

        if (!perf_event_cgroup_match(event))
                return;

        orig_size = rec->event_id.header.size;
        perf_event_header__init_id(&rec->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              rec->event_id.header.size) == 0) {
                perf_output_put(&handle, rec->event_id);
                __output_copy(&handle, rec->path, rec->path_size);

                perf_event__output_id_sample(event, &handle, &sample);
                perf_output_end(&handle);
        }

        rec->event_id.header.size = orig_size;
}

/*
 * Emit a PERF_RECORD_CGROUP record carrying the id and path of @cgrp.
 *
 * The path is built in a PATH_MAX heap buffer; when that allocation fails
 * the fixed string "//enomem" is reported instead.  No-op when
 * nr_cgroup_events is zero.
 */
static void perf_event_cgroup(struct cgroup *cgrp)
{
        struct perf_cgroup_event cgroup_event;
        char fallback[16] = "//enomem";
        char *buf;
        size_t len;

        if (atomic_read(&nr_cgroup_events) == 0)
                return;

        cgroup_event = (struct perf_cgroup_event){
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_CGROUP,
                                .misc = 0,
                                .size = sizeof(cgroup_event.event_id),
                        },
                        .id = cgroup_id(cgrp),
                },
        };

        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (buf) {
                /* Keep sizeof(u64) bytes spare for the padding done below. */
                cgroup_path(cgrp, buf, PATH_MAX - sizeof(u64));
                cgroup_event.path = buf;
        } else {
                cgroup_event.path = fallback;
        }

        /*
         * The buffer works in 8 byte units, so pad the string (NUL included)
         * up to a multiple of 8.  The padding is explicitly zeroed so no
         * stale bytes are exposed to userspace.
         */
        for (len = strlen(cgroup_event.path) + 1;
             !IS_ALIGNED(len, sizeof(u64)); len++)
                cgroup_event.path[len] = '\0';

        cgroup_event.path_size = len;
        cgroup_event.event_id.header.size += len;

        perf_iterate_sb(perf_event_cgroup_output, &cgroup_event, NULL);

        kfree(buf);
}

#endif

/*
 * mmap tracking
 */

/*
 * State for emitting one PERF_RECORD_MMAP / PERF_RECORD_MMAP2 record.
 * @event_id is the part common to both; the remaining fields are written
 * out separately by perf_event_mmap_output().
 */
struct perf_mmap_event {
        /* Mapping being reported. */
        struct vm_area_struct        *vma;

        /* Name string and its u64-padded length, copied after the fields. */
        const char                *file_name;
        int                        file_size;
        /* The next six fields are only emitted for attr.mmap2 events. */
        int                        maj, min;
        u64                        ino;
        u64                        ino_generation;
        u32                        prot, flags;
        /* Emitted instead of maj/min/ino/ino_generation when in use. */
        u8                        build_id[BUILD_ID_SIZE_MAX];
        u32                        build_id_size;

        struct {
                struct perf_event_header        header;

                u32                                pid;
                u32                                tid;
                u64                                start;
                u64                                len;
                u64                                pgoff;
        } event_id;
};

/*
 * Decide whether @event wants the mapping carried in @data
 * (a struct perf_mmap_event): executable mappings need attr.mmap or
 * attr.mmap2, all other mappings need attr.mmap_data.
 * Returns 1 on match, 0 otherwise.
 */
static int perf_event_mmap_match(struct perf_event *event,
                                 void *data)
{
        struct perf_mmap_event *mmap_event = data;

        if (mmap_event->vma->vm_flags & VM_EXEC)
                return event->attr.mmap || event->attr.mmap2;

        return event->attr.mmap_data != 0;
}

static void perf_event_mmap_output(struct perf_event *event,
                                   void *data)
{
        struct perf_mmap_event *mmap_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
        u32 type = mmap_event->event_id.header.type;
        bool use_build_id;
        int ret;

        if (!perf_event_mmap_match(event, data))
                return;

        if (event->attr.mmap2) {
                mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
                mmap_event->event_id.header.size += sizeof(mmap_event->maj);
                mmap_event->event_id.header.size += sizeof(mmap_event->min);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino);
                mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
                mmap_event->event_id.header.size += sizeof(mmap_event->prot);
                mmap_event->event_id.header.size += sizeof(mmap_event->flags);
        }

        perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                mmap_event->event_id.header.size);
        if (ret)
                goto out;

        mmap_event->event_id.pid = perf_event_pid(event, current);
        mmap_event->event_id.tid = perf_event_tid(event, current);

        use_build_id = event->attr.build_id && mmap_event->build_id_size;

        if (event->attr.mmap2 && use_build_id)
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID;

        perf_output_put(&handle, mmap_event->event_id);

        if (event->attr.mmap2) {
                if (use_build_id) {
                        u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 };

                        __output_copy(&handle, size, 4);
                        __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX);
                } else {
                        perf_output_put(&handle, mmap_event->maj);
                        perf_output_put(&handle, mmap_event->min);
                        perf_output_put(&handle, mmap_event->ino);
                        perf_output_put(&handle, mmap_event->ino_generation);
                }
                perf_output_put(&handle, mmap_event->prot);
                perf_output_put(&handle, mmap_event->flags);
        }

        __output_copy(&handle, mmap_event->file_name,
                                   mmap_event->file_size);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        mmap_event->event_id.header.size = size;
        mmap_event->event_id.header.type = type;
}

/*
 * Fill in the parts of @mmap_event that are derived from its vma
 * (prot/flags, file identity, name string, record size, optional build id)
 * and hand the record to every interested event.
 *
 * The reported name is the file path for file-backed mappings, a name
 * supplied by vm_ops->name() or arch_vma_name() when available, or one of
 * the fixed strings "[heap]", "[stack]", "//anon", "//enomem", "//toolong".
 */
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
        struct vm_area_struct *vma = mmap_event->vma;
        struct file *file = vma->vm_file;
        int maj = 0, min = 0;
        u64 ino = 0, gen = 0;
        u32 prot = 0, flags = 0;
        unsigned int size;
        char tmp[16];
        char *buf = NULL;
        char *name = NULL;

        /* Translate VM_* permission bits into PROT_* values. */
        if (vma->vm_flags & VM_READ)
                prot |= PROT_READ;
        if (vma->vm_flags & VM_WRITE)
                prot |= PROT_WRITE;
        if (vma->vm_flags & VM_EXEC)
                prot |= PROT_EXEC;

        /* Translate VM_* mapping bits into MAP_* values. */
        if (vma->vm_flags & VM_MAYSHARE)
                flags = MAP_SHARED;
        else
                flags = MAP_PRIVATE;

        if (vma->vm_flags & VM_LOCKED)
                flags |= MAP_LOCKED;
        if (is_vm_hugetlb_page(vma))
                flags |= MAP_HUGETLB;

        if (file) {
                struct inode *inode;
                dev_t dev;

                buf = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!buf) {
                        name = "//enomem";
                        goto cpy_name;
                }
                /*
                 * d_path() works from the end of the buffer backwards, so we
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
                name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
                }
                inode = file_inode(vma->vm_file);
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
                gen = inode->i_generation;
                maj = MAJOR(dev);
                min = MINOR(dev);

                /* name points into buf, which is writable: skip the copy. */
                goto got_name;
        } else {
                if (vma->vm_ops && vma->vm_ops->name)
                        name = (char *) vma->vm_ops->name(vma);
                if (!name)
                        name = (char *)arch_vma_name(vma);
                if (!name) {
                        if (vma_is_initial_heap(vma))
                                name = "[heap]";
                        else if (vma_is_initial_stack(vma))
                                name = "[stack]";
                        else
                                name = "//anon";
                }
        }

cpy_name:
        /*
         * Copy the name into the local 16-byte buffer so the padding loop
         * below has writable storage to work on.
         */
        strscpy(tmp, name);
        name = tmp;
got_name:
        /*
         * Since our buffer works in 8 byte units we need to align our string
         * size to a multiple of 8. However, we must guarantee the tail end is
         * zero'd out to avoid leaking random bits to userspace.
         */
        size = strlen(name)+1;
        while (!IS_ALIGNED(size, sizeof(u64)))
                name[size++] = '\0';

        mmap_event->file_name = name;
        mmap_event->file_size = size;
        mmap_event->maj = maj;
        mmap_event->min = min;
        mmap_event->ino = ino;
        mmap_event->ino_generation = gen;
        mmap_event->prot = prot;
        mmap_event->flags = flags;

        if (!(vma->vm_flags & VM_EXEC))
                mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;

        mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;

        /* Only parse the build id when some event asked for build ids. */
        if (atomic_read(&nr_build_id_events))
                build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size);

        perf_iterate_sb(perf_event_mmap_output,
                       mmap_event,
                       NULL);

        /* buf is NULL when no path buffer was allocated. */
        kfree(buf);
}

/*
 * Check whether @file and the byte range [@offset, @offset + @size] satisfy
 * @filter: the filter must have a dentry, its inode must be @file's inode,
 * and the filter's range must not lie entirely before or after the range.
 */
static bool perf_addr_filter_match(struct perf_addr_filter *filter,
                                     struct file *file, unsigned long offset,
                                     unsigned long size)
{
        /* d_inode(NULL) won't be equal to any mapped user-space file */
        if (!filter->path.dentry)
                return false;

        if (d_inode(filter->path.dentry) != file_inode(file))
                return false;

        /* Match unless the filter starts past the range or ends before it. */
        return !(filter->offset > offset + size) &&
               !(filter->offset + filter->size < offset);
}

/*
 * If @filter applies to @vma, compute the address range it covers inside
 * the mapping and store it in @fr.
 *
 * Returns true when @fr was filled in, false when the filter does not match
 * (in which case @fr is not written).
 */
static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
                                        struct vm_area_struct *vma,
                                        struct perf_addr_filter_range *fr)
{
        unsigned long span = vma->vm_end - vma->vm_start;
        unsigned long file_off = vma->vm_pgoff << PAGE_SHIFT;

        if (!perf_addr_filter_match(filter, vma->vm_file, file_off, span))
                return false;

        if (filter->offset >= file_off) {
                /* The filter begins inside the mapping. */
                fr->start = vma->vm_start + filter->offset - file_off;
                fr->size = min(vma->vm_end - fr->start, filter->size);
                return true;
        }

        /* The filter begins before the mapping: clip its head. */
        fr->start = vma->vm_start;
        fr->size = min(span, filter->size - (file_off - filter->offset));
        return true;
}

/*
 * Context-iteration callback: recompute @event's address filter ranges
 * against the vma passed in @data.
 *
 * Events without address filters and vmas without a backing file are
 * ignored.  If any filter produced a range, the filter generation is bumped
 * and perf_event_stop(event, 1) is called.
 */
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct vm_area_struct *vma = data;
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
        unsigned long flags;

        if (!has_addr_filter(event))
                return;

        if (!vma->vm_file)
                return;

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                /* Range slot @count belongs to the @count-th filter in the list. */
                if (perf_addr_filter_vma_adjust(filter, vma,
                                                &event->addr_filter_ranges[count]))
                        restart++;

                count++;
        }

        if (restart)
                event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        /* Called only after the lock has been released. */
        if (restart)
                perf_event_stop(event, 1);
}

/*
 * Adjust all task's events' filters to the new vma
 */
static void perf_addr_filters_adjust(struct vm_area_struct *vma)
{
        struct perf_event_context *ctx;

        /*
         * Data tracing isn't supported yet and as such there is no need
         * to keep track of anything that isn't related to executable code:
         */
        if (!(vma->vm_flags & VM_EXEC))
                return;

        rcu_read_lock();
        ctx = rcu_dereference(current->perf_event_ctxp);
        if (ctx)
                perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
        rcu_read_unlock();
}

/*
 * Report the new mapping @vma: first adjust address filters to it, then
 * emit the mmap record.  No-op when nr_mmap_events is zero.
 */
void perf_event_mmap(struct vm_area_struct *vma)
{
        struct perf_mmap_event mmap_event;

        if (atomic_read(&nr_mmap_events) == 0)
                return;

        /*
         * Only the fields known up front are set; file name/size, pid, tid,
         * header.size and the mmap2-only fields (maj, min, ino,
         * ino_generation, prot, flags) start zeroed and are filled later.
         */
        mmap_event = (struct perf_mmap_event){
                .vma = vma,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_MMAP,
                                .misc = PERF_RECORD_MISC_USER,
                        },
                        .start = vma->vm_start,
                        .len = vma->vm_end - vma->vm_start,
                        .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
                },
        };

        perf_addr_filters_adjust(vma);
        perf_event_mmap_event(&mmap_event);
}

/*
 * Write a PERF_RECORD_AUX record into @event's output buffer, carrying
 * @head as offset together with @size and @flags.  The record is silently
 * dropped when perf_output_begin() fails.
 */
void perf_event_aux_event(struct perf_event *event, unsigned long head,
                          unsigned long size, u64 flags)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                                offset;
                u64                                size;
                u64                                flags;
        } rec = {
                .header = {
                        .type = PERF_RECORD_AUX,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .offset = head,
                .size = size,
                .flags = flags,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size) != 0)
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Lost/dropped samples logging
 */
/*
 * Write a PERF_RECORD_LOST_SAMPLES record with the count @lost into
 * @event's output buffer.  The record is silently dropped when
 * perf_output_begin() fails.
 */
void perf_log_lost_samples(struct perf_event *event, u64 lost)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct {
                struct perf_event_header        header;
                u64                                lost;
        } lost_samples_event = {
                .header = {
                        .type = PERF_RECORD_LOST_SAMPLES,
                        .misc = 0,
                        .size = sizeof(lost_samples_event),
                },
                .lost = lost,
        };

        perf_event_header__init_id(&lost_samples_event.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              lost_samples_event.header.size) != 0)
                return;

        perf_output_put(&handle, lost_samples_event);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * context_switch tracking
 */

/*
 * State for emitting a context-switch record.  Depending on the receiving
 * event, either only @event_id.header or the whole @event_id is written.
 */
struct perf_switch_event {
        /* Task passed to perf_event_switch(); also tested for runnability. */
        struct task_struct        *task;
        /* The other task of the switch; source of next_prev_pid / _tid. */
        struct task_struct        *next_prev;

        struct {
                struct perf_event_header        header;
                /* Only filled and emitted for PERF_RECORD_SWITCH_CPU_WIDE. */
                u32                                next_prev_pid;
                u32                                next_prev_tid;
        } event_id;
};

/* Non-zero when @event requested context-switch records (attr.context_switch). */
static int perf_event_switch_match(struct perf_event *event)
{
        return event->attr.context_switch;
}

/*
 * Iteration callback: write the context-switch record described by @data
 * (a struct perf_switch_event) into @event's output buffer.
 *
 * Events whose context has a task get a header-only PERF_RECORD_SWITCH;
 * events without one get PERF_RECORD_SWITCH_CPU_WIDE including the
 * next/prev pid and tid.  Events without attr.context_switch are skipped.
 */
static void perf_event_switch_output(struct perf_event *event, void *data)
{
        struct perf_switch_event *sw = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        if (!perf_event_switch_match(event))
                return;

        /* Only CPU-wide events are allowed to see next/prev pid/tid */
        if (!event->ctx->task) {
                sw->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
                sw->event_id.header.size = sizeof(sw->event_id);
                sw->event_id.next_prev_pid =
                                perf_event_pid(event, sw->next_prev);
                sw->event_id.next_prev_tid =
                                perf_event_tid(event, sw->next_prev);
        } else {
                sw->event_id.header.type = PERF_RECORD_SWITCH;
                sw->event_id.header.size = sizeof(sw->event_id.header);
        }

        perf_event_header__init_id(&sw->event_id.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event,
                              sw->event_id.header.size) != 0)
                return;

        if (!event->ctx->task)
                perf_output_put(&handle, sw->event_id);
        else
                perf_output_put(&handle, sw->event_id.header);

        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * Emit a context-switch record for @task; @next_prev is the other task of
 * the switch.  For switch-out (@sched_in false) the record carries
 * PERF_RECORD_MISC_SWITCH_OUT, plus PERF_RECORD_MISC_SWITCH_OUT_PREEMPT
 * when @task is still runnable.
 *
 * The caller is expected to have checked nr_switch_events != 0.
 */
static void perf_event_switch(struct task_struct *task,
                              struct task_struct *next_prev, bool sched_in)
{
        struct perf_switch_event switch_event;

        /* type, size and next_prev_{pid,tid} are set by the output callback. */
        switch_event = (struct perf_switch_event){
                .task = task,
                .next_prev = next_prev,
                .event_id = {
                        .header = {
                                .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
                        },
                },
        };

        if (sched_in)
                goto emit;

        if (task_is_runnable(task))
                switch_event.event_id.header.misc |=
                                PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;

emit:
        perf_iterate_sb(perf_event_switch_output, &switch_event, NULL);
}

/*
 * IRQ throttle logging
 */

/*
 * Write a throttle-state record into @event's output buffer.
 *
 * @event:  event whose throttle state changed
 * @enable: non-zero emits PERF_RECORD_UNTHROTTLE, zero emits
 *          PERF_RECORD_THROTTLE
 *
 * Nothing is written when perf_output_begin() reports failure.
 */
static void perf_log_throttle(struct perf_event *event, int enable)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;

        struct {
                struct perf_event_header        header;
                u64                             time;
                u64                             id;
                u64                             stream_id;
        } rec = {
                .header = {
                        .type = enable ? PERF_RECORD_UNTHROTTLE
                                       : PERF_RECORD_THROTTLE,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .time      = perf_event_clock(event),
                .id        = primary_event_id(event),
                .stream_id = event->id,
        };

        /* header.size is only read after the id helper has seen the header. */
        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);
        perf_output_end(&handle);
}

/*
 * ksymbol register/unregister tracking
 */

/*
 * Working state for one PERF_RECORD_KSYMBOL emission. A single instance is
 * built by perf_event_ksymbol() and handed to perf_event_ksymbol_output()
 * for each event visited.
 */
struct perf_ksymbol_event {
        /* Symbol name buffer; copied into the record right after event_id. */
        const char        *name;
        /* Bytes of @name to copy: string, NUL, and zero padding to a u64 multiple. */
        int                name_len;
        /* Fixed-size part of the record, written verbatim with perf_output_put(). */
        struct {
                struct perf_event_header        header;
                u64                                addr;
                u32                                len;
                u16                                ksym_type;
                u16                                flags;
        } event_id;
};

/* Return non-zero when @event asked for ksymbol records (attr.ksymbol set). */
static int perf_event_ksymbol_match(struct perf_event *event)
{
        return event->attr.ksymbol;
}

/*
 * perf_iterate_sb() callback: write one PERF_RECORD_KSYMBOL record to @event.
 *
 * @event: candidate event; skipped unless attr.ksymbol is set
 * @data:  the struct perf_ksymbol_event prepared by perf_event_ksymbol()
 *
 * The record in @data is shared by every event the iteration visits, while
 * perf_event_header__init_id() is handed a pointer to its header and may
 * enlarge header.size for this particular event. The original size is
 * therefore saved on entry and restored on exit so that per-event additions
 * cannot accumulate from one event to the next.
 */
static void perf_event_ksymbol_output(struct perf_event *event, void *data)
{
        struct perf_ksymbol_event *ksymbol_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = ksymbol_event->event_id.header.size;
        int ret;

        if (!perf_event_ksymbol_match(event))
                return;

        perf_event_header__init_id(&ksymbol_event->event_id.header,
                                   &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                ksymbol_event->event_id.header.size);
        if (ret)
                goto out;

        /* Fixed part, then the padded name, then the optional id sample. */
        perf_output_put(&handle, ksymbol_event->event_id);
        __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        /* Undo any per-event growth before the next event sees the record. */
        ksymbol_event->event_id.header.size = size;
}

/*
 * Report registration or unregistration of a kernel symbol.
 *
 * @ksym_type:  PERF_RECORD_KSYMBOL_TYPE_* value; UNKNOWN and anything at or
 *              above PERF_RECORD_KSYMBOL_TYPE_MAX trigger a one-time warning
 *              and no record
 * @addr:       symbol address stored in the record
 * @len:        symbol length stored in the record
 * @unregister: true sets PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER
 * @sym:        symbol name; copied into a KSYM_NAME_LEN buffer with strscpy()
 *
 * Returns immediately when nr_ksymbol_events is zero.
 */
void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister,
                        const char *sym)
{
        struct perf_ksymbol_event rec;
        char name[KSYM_NAME_LEN];
        u16 flags = unregister ? PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER : 0;
        int name_len;

        if (!atomic_read(&nr_ksymbol_events))
                return;

        if (ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN ||
            ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX) {
                WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type);
                return;
        }

        /* The padding loop below stays in bounds only if the buffer is u64-sized. */
        BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64));

        strscpy(name, sym);

        /* Count the terminating NUL, then zero-pad up to a u64 boundary. */
        for (name_len = strlen(name) + 1;
             !IS_ALIGNED(name_len, sizeof(u64));
             name_len++)
                name[name_len] = '\0';

        rec = (struct perf_ksymbol_event){
                .name     = name,
                .name_len = name_len,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_KSYMBOL,
                                .size = sizeof(rec.event_id) + name_len,
                        },
                        .addr      = addr,
                        .len       = len,
                        .ksym_type = ksym_type,
                        .flags     = flags,
                },
        };

        perf_iterate_sb(perf_event_ksymbol_output, &rec, NULL);
}

/*
 * bpf program load/unload tracking
 */

/*
 * Working state for one PERF_RECORD_BPF_EVENT emission, built by
 * perf_event_bpf_event() and passed to perf_event_bpf_output().
 */
struct perf_bpf_event {
        /* Program the record describes. */
        struct bpf_prog        *prog;
        /* Record body, written verbatim with perf_output_put(). */
        struct {
                struct perf_event_header        header;
                u16                                type;
                u16                                flags;
                u32                                id;
                u8                                tag[BPF_TAG_SIZE];
        } event_id;
};

/* Return non-zero when @event asked for BPF event records (attr.bpf_event set). */
static int perf_event_bpf_match(struct perf_event *event)
{
        return event->attr.bpf_event;
}

/*
 * perf_iterate_sb() callback: write one PERF_RECORD_BPF_EVENT record to
 * @event.
 *
 * @event: candidate event; skipped unless attr.bpf_event is set
 * @data:  the struct perf_bpf_event prepared by perf_event_bpf_event()
 *
 * The record in @data is reused for every event visited, and
 * perf_event_header__init_id() receives a pointer to its header and may
 * enlarge header.size for this event. Save the size on entry and restore it
 * on exit so that such per-event growth does not leak into the next event.
 */
static void perf_event_bpf_output(struct perf_event *event, void *data)
{
        struct perf_bpf_event *bpf_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = bpf_event->event_id.header.size;
        int ret;

        if (!perf_event_bpf_match(event))
                return;

        perf_event_header__init_id(&bpf_event->event_id.header,
                                   &sample, event);
        ret = perf_output_begin(&handle, &sample, event,
                                bpf_event->event_id.header.size);
        if (ret)
                goto out;

        perf_output_put(&handle, bpf_event->event_id);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        /* Undo any per-event growth before the next event sees the record. */
        bpf_event->event_id.header.size = size;
}

static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog,
                                         enum perf_bpf_event_type type)
{
        bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD;
        int i;

        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF,
                           (u64)(unsigned long)prog->bpf_func,
                           prog->jited_len, unregister,
                           prog->aux->ksym.name);

        for (i = 1; i < prog->aux->func_cnt; i++) {
                struct bpf_prog *subprog = prog->aux->func[i];

                perf_event_ksymbol(
                        PERF_RECORD_KSYMBOL_TYPE_BPF,
                        (u64)(unsigned long)subprog->bpf_func,
                        subprog->jited_len, unregister,
                        subprog->aux->ksym.name);
        }
}

/*
 * Report a BPF program load or unload.
 *
 * @prog:  program concerned
 * @type:  only PERF_BPF_EVENT_PROG_LOAD and PERF_BPF_EVENT_PROG_UNLOAD are
 *         handled; any other value returns without doing anything
 * @flags: copied into the record
 *
 * Ksymbol records are emitted first when nr_ksymbol_events is non-zero; the
 * PERF_RECORD_BPF_EVENT record follows only when nr_bpf_events is non-zero.
 */
void perf_event_bpf_event(struct bpf_prog *prog,
                          enum perf_bpf_event_type type,
                          u16 flags)
{
        struct perf_bpf_event rec;

        if (type != PERF_BPF_EVENT_PROG_LOAD &&
            type != PERF_BPF_EVENT_PROG_UNLOAD)
                return;

        if (atomic_read(&nr_ksymbol_events))
                perf_event_bpf_emit_ksymbols(prog, type);

        if (!atomic_read(&nr_bpf_events))
                return;

        /* The tag is the last field of the record body; keep it u64-sized. */
        BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64));

        rec = (struct perf_bpf_event){
                .prog     = prog,
                .event_id = {
                        .header = {
                                .type = PERF_RECORD_BPF_EVENT,
                                .size = sizeof(rec.event_id),
                        },
                        .type  = type,
                        .flags = flags,
                        .id    = prog->aux->id,
                },
        };
        memcpy(rec.event_id.tag, prog->tag, BPF_TAG_SIZE);

        perf_iterate_sb(perf_event_bpf_output, &rec, NULL);
}

/*
 * Working state for one PERF_RECORD_TEXT_POKE emission, built by
 * perf_event_text_poke() and passed to perf_event_text_poke_output().
 *
 * The record is written as: event_id, old_len, new_len, the old bytes, the
 * new bytes, then @pad zero bytes.
 */
struct perf_text_poke_event {
        const void                *old_bytes;
        const void                *new_bytes;
        /* Zero bytes appended so the variable part ends on a u64 boundary. */
        size_t                        pad;
        u16                        old_len;
        u16                        new_len;

        /* Fixed-size head of the record. */
        struct {
                struct perf_event_header        header;

                u64                                addr;
        } event_id;
};

/* Return non-zero when @event asked for text poke records (attr.text_poke set). */
static int perf_event_text_poke_match(struct perf_event *event)
{
        return event->attr.text_poke;
}

/*
 * perf_iterate_sb() callback: write one PERF_RECORD_TEXT_POKE record to
 * @event.
 *
 * @event: candidate event; skipped unless attr.text_poke is set
 * @data:  the struct perf_text_poke_event prepared by perf_event_text_poke()
 *
 * The record in @data is reused for every event visited, and
 * perf_event_header__init_id() receives a pointer to its header and may
 * enlarge header.size for this event. Save the size on entry and restore it
 * on exit so that such per-event growth does not leak into the next event.
 */
static void perf_event_text_poke_output(struct perf_event *event, void *data)
{
        struct perf_text_poke_event *text_poke_event = data;
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = text_poke_event->event_id.header.size;
        u64 padding = 0;
        int ret;

        if (!perf_event_text_poke_match(event))
                return;

        perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);

        ret = perf_output_begin(&handle, &sample, event,
                                text_poke_event->event_id.header.size);
        if (ret)
                goto out;

        /* Fixed head, then both lengths, then both byte strings. */
        perf_output_put(&handle, text_poke_event->event_id);
        perf_output_put(&handle, text_poke_event->old_len);
        perf_output_put(&handle, text_poke_event->new_len);

        __output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
        __output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);

        /* Zero-fill up to the u64 boundary computed by the caller. */
        if (text_poke_event->pad)
                __output_copy(&handle, &padding, text_poke_event->pad);

        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
out:
        /* Undo any per-event growth before the next event sees the record. */
        text_poke_event->event_id.header.size = size;
}

void perf_event_text_poke(const void *addr, const void *old_bytes,
                          size_t old_len, const void *new_bytes, size_t new_len)
{
        struct perf_text_poke_event text_poke_event;
        size_t tot, pad;

        if (!atomic_read(&nr_text_poke_events))
                return;

        tot  = sizeof(text_poke_event.old_len) + old_len;
        tot += sizeof(text_poke_event.new_len) + new_len;
        pad  = ALIGN(tot, sizeof(u64)) - tot;

        text_poke_event = (struct perf_text_poke_event){
                .old_bytes    = old_bytes,
                .new_bytes    = new_bytes,
                .pad          = pad,
                .old_len      = old_len,
                .new_len      = new_len,
                .event_id  = {
                        .header = {
                                .type = PERF_RECORD_TEXT_POKE,
                                .misc = PERF_RECORD_MISC_KERNEL,
                                .size = sizeof(text_poke_event.event_id) + tot + pad,
                        },
                        .addr = (unsigned long)addr,
                },
        };

        perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
}

/*
 * Set PERF_ATTACH_ITRACE in @event's attach_state. perf_log_itrace_start()
 * returns early, without writing a record, once this flag is set.
 */
void perf_event_itrace_started(struct perf_event *event)
{
        event->attach_state |= PERF_ATTACH_ITRACE;
}

/*
 * Write a PERF_RECORD_ITRACE_START record carrying the current task's
 * pid/tid.
 *
 * If @event has a parent, the parent is used instead. Nothing is written when
 * the event's PMU lacks PERF_PMU_CAP_ITRACE, when PERF_ATTACH_ITRACE is
 * already set on the event, or when perf_output_begin() fails.
 */
static void perf_log_itrace_start(struct perf_event *event)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u32                             pid;
                u32                             tid;
        } rec;

        if (event->parent)
                event = event->parent;

        if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE))
                return;
        if (event->attach_state & PERF_ATTACH_ITRACE)
                return;

        /* Filled in only now: pid/tid are resolved against the final @event. */
        rec = (struct perf_aux_event){
                .header = {
                        .type = PERF_RECORD_ITRACE_START,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .pid = perf_event_pid(event, current),
                .tid = perf_event_tid(event, current),
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}

/*
 * Write a PERF_RECORD_AUX_OUTPUT_HW_ID record containing @hw_id.
 *
 * @event: event to write to; its parent is used instead when it has one
 * @hw_id: value stored in the record
 *
 * Nothing is written when perf_output_begin() fails.
 */
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
{
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        struct perf_aux_event {
                struct perf_event_header        header;
                u64                             hw_id;
        } rec;

        if (event->parent)
                event = event->parent;

        rec = (struct perf_aux_event){
                .header = {
                        .type = PERF_RECORD_AUX_OUTPUT_HW_ID,
                        .misc = 0,
                        .size = sizeof(rec),
                },
                .hw_id = hw_id,
        };

        perf_event_header__init_id(&rec.header, &sample, event);

        if (perf_output_begin(&handle, &sample, event, rec.header.size))
                return;

        perf_output_put(&handle, rec);
        perf_event__output_id_sample(event, &handle, &sample);

        perf_output_end(&handle);
}
EXPORT_SYMBOL_GPL(perf_report_aux_output_id);

/*
 * Account one interrupt against @event and throttle it if it fires too often.
 *
 * @event:    event that raised the interrupt
 * @throttle: non-zero allows this call to throttle the event
 *
 * The per-event interrupt count restarts at 1 whenever the per-CPU
 * perf_throttled_seq differs from the value last recorded in the event.
 * For attr.freq events the sample period is re-tuned when the time since the
 * previous interrupt is positive and below two ticks.
 *
 * Returns 1 when the event was throttled by this call, 0 otherwise.
 */
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 cur_seq = __this_cpu_read(perf_throttled_seq);
        int throttled = 0;

        if (hwc->interrupts_seq != cur_seq) {
                hwc->interrupts_seq = cur_seq;
                hwc->interrupts = 1;
        } else {
                hwc->interrupts++;
                if (unlikely(throttle &&
                             hwc->interrupts > max_samples_per_tick)) {
                        __this_cpu_inc(perf_throttled_count);
                        tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
                        /* Sentinel value that marks the event as throttled. */
                        hwc->interrupts = MAX_INTERRUPTS;
                        perf_log_throttle(event, 0);
                        throttled = 1;
                }
        }

        if (event->attr.freq) {
                u64 now = perf_clock();
                s64 elapsed = now - hwc->freq_time_stamp;

                hwc->freq_time_stamp = now;

                if (elapsed > 0 && elapsed < 2*TICK_NSEC)
                        perf_adjust_period(event, elapsed, hwc->last_period, true);
        }

        return throttled;
}

/*
 * Account an interrupt for @event with throttling allowed.
 * Returns the result of __perf_event_account_interrupt(): 1 when the event
 * was throttled, 0 otherwise.
 */
int perf_event_account_interrupt(struct perf_event *event)
{
        return __perf_event_account_interrupt(event, 1);
}

/*
 * Decide whether a sample taken with @regs may be attributed to @event.
 *
 * Due to interrupt latency (AKA "skid"), the kernel may be entered before
 * the overflow is taken, even if the PMU is only counting user events. Such
 * a sample is rejected when the event set exclude_kernel and @regs is not
 * user mode.
 */
static inline bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
{
        return !(event->attr.exclude_kernel && !user_mode(regs));
}

#ifdef CONFIG_BPF_SYSCALL
static int bpf_overflow_handler(struct perf_event *event,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        struct bpf_perf_event_data_kern ctx = {
                .data = data,
                .event = event,
        };
        struct bpf_prog *prog;
        int ret = 0;

        ctx.regs = perf_arch_bpf_user_pt_regs(regs);
        if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
                goto out;
        rcu_read_lock();
        prog = READ_ONCE(event->prog);
        if (prog) {
                perf_prepare_sample(data, event, regs);
                ret = bpf_prog_run(prog, &ctx);
        }
        rcu_read_unlock();
out:
        __this_cpu_dec(bpf_prog_active);

        return ret;
}

/*
 * Attach @prog to @event as its BPF overflow program.
 *
 * Returns 0 on success, or:
 *   -EINVAL  @event has an overflow_handler_context, or @prog is not of type
 *            BPF_PROG_TYPE_PERF_EVENT
 *   -EEXIST  @event already has a program
 *   -EPROTO  precise_ip event without a full callchain, and @prog calls
 *            bpf_get_[stack|stackid]
 */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
                                             struct bpf_prog *prog,
                                             u64 bpf_cookie)
{
        int full_callchain;

        /* hw breakpoint or kernel counter */
        if (event->overflow_handler_context)
                return -EINVAL;

        if (event->prog)
                return -EEXIST;

        if (prog->type != BPF_PROG_TYPE_PERF_EVENT)
                return -EINVAL;

        /* "Full" means a callchain is sampled and neither half is excluded. */
        full_callchain = (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
                         !event->attr.exclude_callchain_kernel &&
                         !event->attr.exclude_callchain_user;

        /*
         * On a perf_event with precise_ip, calling bpf_get_stack() may
         * trigger unwinder warnings and occasional crashes.
         * bpf_get_[stack|stackid] works around this issue by using the
         * callchain attached to perf_sample_data. If the perf_event does
         * not have the full (kernel and user) callchain attached to
         * perf_sample_data, do not allow attaching a BPF program that
         * calls bpf_get_[stack|stackid].
         */
        if (event->attr.precise_ip && prog->call_get_stack && !full_callchain)
                return -EPROTO;

        event->prog = prog;
        event->bpf_cookie = bpf_cookie;
        return 0;
}

/*
 * Detach the BPF program from @event, if one is attached, and drop the
 * reference with bpf_prog_put(). The pointer is cleared before the put.
 */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
        struct bpf_prog *attached = event->prog;

        if (attached) {
                event->prog = NULL;
                bpf_prog_put(attached);
        }
}
#else
/*
 * !CONFIG_BPF_SYSCALL stub: always returns 1. __perf_event_overflow() only
 * skips the rest of overflow handling when this returns 0.
 */
static inline int bpf_overflow_handler(struct perf_event *event,
                                       struct perf_sample_data *data,
                                       struct pt_regs *regs)
{
        return 1;
}

/* !CONFIG_BPF_SYSCALL stub: attaching a BPF program always fails with -EOPNOTSUPP. */
static inline int perf_event_set_bpf_handler(struct perf_event *event,
                                             struct bpf_prog *prog,
                                             u64 bpf_cookie)
{
        return -EOPNOTSUPP;
}

/* !CONFIG_BPF_SYSCALL stub: nothing to release. */
static inline void perf_event_free_bpf_handler(struct perf_event *event)
{
}
#endif

/*
 * Generic event overflow handling, sampling.
 */

/*
 * Handle one overflow of a sampling event.
 *
 * @event:    overflowing event
 * @throttle: forwarded to __perf_event_account_interrupt()
 * @data:     sample data for this overflow
 * @regs:     registers at the time of the overflow
 *
 * Non-sampling events return 0 straight away. Otherwise the interrupt is
 * accounted, the attached BPF program (if any) is consulted, event_limit is
 * decremented, sigtrap task work is queued when requested, the overflow
 * handler is invoked and a wakeup is queued for fasync users.
 *
 * Returns 1 when the event was throttled or its event_limit reached zero,
 * 0 otherwise.
 */
static int __perf_event_overflow(struct perf_event *event,
                                 int throttle, struct perf_sample_data *data,
                                 struct pt_regs *regs)
{
        /* Snapshot taken up front; 0 means no limit is being counted down. */
        int events = atomic_read(&event->event_limit);
        int ret = 0;

        /*
         * Non-sampling counters might still use the PMI to fold short
         * hardware counters, ignore those.
         */
        if (unlikely(!is_sampling_event(event)))
                return 0;

        ret = __perf_event_account_interrupt(event, throttle);

        if (event->attr.aux_pause)
                perf_event_aux_pause(event->aux_event, true);

        /* A BPF program returning 0 suppresses the rest of the handling. */
        if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT &&
            !bpf_overflow_handler(event, data, regs))
                goto out;

        /*
         * XXX event_limit might not quite work as expected on inherited
         * events
         */

        event->pending_kill = POLL_IN;
        if (events && atomic_dec_and_test(&event->event_limit)) {
                ret = 1;
                event->pending_kill = POLL_HUP;
                perf_event_disable_inatomic(event);
        }

        if (event->attr.sigtrap) {
                /*
                 * The desired behaviour of sigtrap vs invalid samples is a bit
                 * tricky; on the one hand, one should not lose the SIGTRAP if
                 * it is the first event, on the other hand, we should also not
                 * trigger the WARN or override the data address.
                 */
                bool valid_sample = sample_is_allowed(event, regs);
                unsigned int pending_id = 1;
                enum task_work_notify_mode notify_mode;

                /* Never 0: pending_work == 0 means "nothing pending". */
                if (regs)
                        pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;

                notify_mode = in_nmi() ? TWA_NMI_CURRENT : TWA_RESUME;

                if (!event->pending_work &&
                    !task_work_add(current, &event->pending_task, notify_mode)) {
                        event->pending_work = pending_id;
                        local_inc(&event->ctx->nr_no_switch_fast);
                        WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));

                        event->pending_addr = 0;
                        if (valid_sample && (data->sample_flags & PERF_SAMPLE_ADDR))
                                event->pending_addr = data->addr;

                } else if (event->attr.exclude_kernel && valid_sample) {
                        /*
                         * Should not be able to return to user space without
                         * consuming pending_work; with exceptions:
                         *
                         *  1. Where !exclude_kernel, events can overflow again
                         *     in the kernel without returning to user space.
                         *
                         *  2. Events that can overflow again before the IRQ-
                         *     work without user space progress (e.g. hrtimer).
                         *     To approximate progress (with false negatives),
                         *     check 32-bit hash of the current IP.
                         */
                        WARN_ON_ONCE(event->pending_work != pending_id);
                }
        }

        READ_ONCE(event->overflow_handler)(event, data, regs);

        if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending_irq);
        }
out:
        if (event->attr.aux_resume)
                perf_event_aux_pause(event->aux_event, false);

        return ret;
}

/*
 * Handle an overflow of @event with throttling allowed.
 * Returns the result of __perf_event_overflow().
 */
int perf_event_overflow(struct perf_event *event,
                        struct perf_sample_data *data,
                        struct pt_regs *regs)
{
        return __perf_event_overflow(event, 1, data, regs);
}

/*
 * Generic software event infrastructure
 */

/* Per-CPU hash table of active software events. */
struct swevent_htable {
        /* RCU-managed bucket array; NULL until allocated by swevent_hlist_get_cpu(). */
        struct swevent_hlist                *swevent_hlist;
        /* Held while allocating, releasing or reference-counting the bucket array. */
        struct mutex                        hlist_mutex;
        /* Number of users; the bucket array is released when this drops to zero. */
        int                                hlist_refcount;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);

/*
 * We directly increment event->count and keep a second value in
 * event->hw.period_left to count intervals. This period event
 * is kept in the range [-sample_period, 0] so that we can use the
 * sign as trigger.
 */

/*
 * Consume whole sample periods that have accumulated in hw.period_left.
 *
 * hw.last_period is reloaded from hw.sample_period first; the period used
 * for the computation is the previous last_period value.
 *
 * Returns the number of periods consumed, or 0 when period_left is negative
 * (nothing has elapsed yet).
 */
u64 perf_swevent_set_period(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 period = hwc->last_period;
        u64 nr;
        s64 seen;

        hwc->last_period = hwc->sample_period;

        seen = local64_read(&hwc->period_left);
        for (;;) {
                s64 remaining;

                if (seen < 0)
                        return 0;

                nr = div64_u64(period + seen, period);
                remaining = seen - nr * period;

                /* On failure @seen is refreshed with the current value; retry. */
                if (local64_try_cmpxchg(&hwc->period_left, &seen, remaining))
                        break;
        }

        return nr;
}

/*
 * Deliver @overflow overflows of a software event.
 *
 * @overflow: number of overflows to deliver; 0 means "compute it" via
 *            perf_swevent_set_period()
 *
 * Nothing is delivered while the event is throttled
 * (hw.interrupts == MAX_INTERRUPTS). The first delivery never throttles;
 * subsequent ones may, and delivery stops as soon as
 * __perf_event_overflow() returns non-zero.
 */
static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;
        int may_throttle = 0;

        if (!overflow)
                overflow = perf_swevent_set_period(event);

        if (hwc->interrupts == MAX_INTERRUPTS)
                return;

        while (overflow) {
                /*
                 * We inhibit the overflow from happening when
                 * hwc->interrupts == MAX_INTERRUPTS.
                 */
                if (__perf_event_overflow(event, may_throttle, data, regs))
                        break;

                may_throttle = 1;
                overflow--;
        }
}

/*
 * Account @nr occurrences of a software event and sample if due.
 *
 * The count is always added to event->count. Sampling is attempted only when
 * @regs is non-NULL and the event is a sampling event.
 */
static void perf_swevent_event(struct perf_event *event, u64 nr,
                               struct perf_sample_data *data,
                               struct pt_regs *regs)
{
        struct hw_perf_event *hwc = &event->hw;

        local64_add(nr, &event->count);

        if (!regs || !is_sampling_event(event))
                return;

        /* Fixed-period event reporting the period: one sample carrying @nr. */
        if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
                data->period = nr;
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        data->period = event->hw.last_period;

        /* Period of one: every single occurrence is an overflow. */
        if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) {
                perf_swevent_overflow(event, 1, data, regs);
                return;
        }

        /* Still negative after adding @nr: the period has not elapsed. */
        if (local64_add_negative(nr, &hwc->period_left))
                return;

        perf_swevent_overflow(event, 0, data, regs);
}

/*
 * Decide whether an occurrence must be ignored for @event.
 *
 * Returns 1 when the event is stopped (PERF_HES_STOPPED), or when @regs is
 * non-NULL and its mode is excluded by attr.exclude_user /
 * attr.exclude_kernel; 0 otherwise.
 */
int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 1;

        /* Without registers there is no mode to filter on. */
        if (!regs)
                return 0;

        if (event->attr.exclude_user && user_mode(regs))
                return 1;

        if (event->attr.exclude_kernel && !user_mode(regs))
                return 1;

        return 0;
}

/*
 * Return 1 when @event is a software event of the given @type and @event_id
 * that is not excluded for @regs, 0 otherwise. @data is unused.
 */
static int perf_swevent_match(struct perf_event *event,
                                enum perf_type_id type,
                                u32 event_id,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        return event->attr.type == type &&
               event->attr.config == event_id &&
               !perf_exclude_event(event, regs);
}

/*
 * Hash a (type, event id) pair down to SWEVENT_HLIST_BITS bits. The key
 * places @type in the upper 32 bits and @event_id in the lower 32.
 */
static inline u64 swevent_hash(u64 type, u32 event_id)
{
        return hash_64((type << 32) | event_id, SWEVENT_HLIST_BITS);
}

/* Return the bucket of @hlist that (@type, @event_id) hashes to. */
static inline struct hlist_head *
__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
{
        return hlist->heads + swevent_hash(type, event_id);
}

/*
 * Read side (events when they trigger): look up the bucket for
 * (@type, @event_id) under RCU. Returns NULL when no bucket array is
 * installed.
 */
static inline struct hlist_head *
find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
{
        struct swevent_hlist *table = rcu_dereference(swhash->swevent_hlist);

        return table ? __find_swevent_head(table, type, event_id) : NULL;
}

/*
 * Update side (event insertion and removal in the hlist): look up the bucket
 * for @event's attr.type / attr.config. Returns NULL when no bucket array is
 * installed.
 */
static inline struct hlist_head *
find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
{
        struct swevent_hlist *table;

        /*
         * Event scheduling is always serialized against hlist allocation
         * and release. Which makes the protected version suitable here.
         * The context lock guarantees that.
         */
        table = rcu_dereference_protected(swhash->swevent_hlist,
                                          lockdep_is_held(&event->ctx->lock));
        if (!table)
                return NULL;

        /* attr.config is narrowed to the u32 event id by the callee's prototype. */
        return __find_swevent_head(table, event->attr.type, event->attr.config);
}

/*
 * Dispatch @nr occurrences of software event (@type, @event_id) to every
 * matching event hashed on this CPU. Does nothing when this CPU has no
 * bucket array.
 */
static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
                                    u64 nr,
                                    struct perf_sample_data *data,
                                    struct pt_regs *regs)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hlist_head *bucket;
        struct perf_event *ev;

        rcu_read_lock();

        bucket = find_swevent_head_rcu(swhash, type, event_id);
        if (bucket) {
                /* A bucket can hold other ids too, so re-check each entry. */
                hlist_for_each_entry_rcu(ev, bucket, hlist_entry) {
                        if (!perf_swevent_match(ev, type, event_id, data, regs))
                                continue;

                        perf_swevent_event(ev, nr, data, regs);
                }
        }

        rcu_read_unlock();
}

/*
 * Per-CPU array of four pt_regs.
 * NOTE(review): presumably one slot per recursion context - confirm against
 * the users of __perf_regs.
 */
DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);

/*
 * Claim a recursion context for the current task via
 * get_recursion_context(). Callers in this file treat a negative return as
 * "do not proceed" and pass a non-negative one back to
 * perf_swevent_put_recursion_context().
 */
int perf_swevent_get_recursion_context(void)
{
        return get_recursion_context(current->perf_recursion);
}
EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);

/*
 * Release recursion context @rctx of the current task, as previously
 * returned by perf_swevent_get_recursion_context().
 */
void perf_swevent_put_recursion_context(int rctx)
{
        put_recursion_context(current->perf_recursion, rctx);
}

/*
 * Deliver @nr occurrences of PERF_TYPE_SOFTWARE event @event_id.
 *
 * @regs: must be non-NULL; a NULL value warns once and returns
 * @addr: address stored in the freshly initialised sample data
 *
 * Recursion protection and preemption control are not handled here; see
 * __perf_sw_event().
 */
void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        struct perf_sample_data sample;

        if (WARN_ON_ONCE(!regs))
                return;

        perf_sample_data_init(&sample, addr, 0);
        do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &sample, regs);
}

/*
 * Deliver a software event with preemption disabled and recursion guarded.
 * The event is silently dropped when no recursion context is available.
 */
void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
{
        int rctx;

        preempt_disable_notrace();

        rctx = perf_swevent_get_recursion_context();
        if (unlikely(rctx < 0)) {
                preempt_enable_notrace();
                return;
        }

        ___perf_sw_event(event_id, nr, regs, addr);
        perf_swevent_put_recursion_context(rctx);

        preempt_enable_notrace();
}

/*
 * pmu::read callback for software events: intentionally empty.
 * perf_swevent_event() adds to event->count directly.
 */
static void perf_swevent_read(struct perf_event *event)
{
}

/*
 * pmu::add callback: hash @event into this CPU's software event table.
 *
 * @flags: hw.state is set to 0 when PERF_EF_START is present, 1 otherwise
 *
 * Returns 0 on success, or -EINVAL (with a one-time warning) when this CPU
 * has no bucket array.
 */
static int perf_swevent_add(struct perf_event *event, int flags)
{
        struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
        struct hw_perf_event *hwc = &event->hw;
        struct hlist_head *bucket;

        if (is_sampling_event(event)) {
                hwc->last_period = hwc->sample_period;
                perf_swevent_set_period(event);
        }

        hwc->state = (flags & PERF_EF_START) ? 0 : 1;

        bucket = find_swevent_head(swhash, event);
        if (WARN_ON_ONCE(!bucket))
                return -EINVAL;

        hlist_add_head_rcu(&event->hlist_entry, bucket);
        perf_event_update_userpage(event);

        return 0;
}

/* pmu::del callback: unhash @event from its bucket (RCU-safe removal). @flags is unused. */
static void perf_swevent_del(struct perf_event *event, int flags)
{
        hlist_del_rcu(&event->hlist_entry);
}

/* pmu::start callback: clear hw.state so perf_exclude_event() stops filtering @event. */
static void perf_swevent_start(struct perf_event *event, int flags)
{
        event->hw.state = 0;
}

/* pmu::stop callback: mark @event PERF_HES_STOPPED so perf_exclude_event() filters it out. */
static void perf_swevent_stop(struct perf_event *event, int flags)
{
        event->hw.state = PERF_HES_STOPPED;
}

/*
 * Deref the hlist from the update side.
 * The lockdep expression documents that swhash->hlist_mutex is expected to
 * be held by the caller.
 */
static inline struct swevent_hlist *
swevent_hlist_deref(struct swevent_htable *swhash)
{
        return rcu_dereference_protected(swhash->swevent_hlist,
                                         lockdep_is_held(&swhash->hlist_mutex));
}

/*
 * Unpublish and free @swhash's bucket array, if it has one. The pointer is
 * cleared first and the memory is handed to kfree_rcu().
 */
static void swevent_hlist_release(struct swevent_htable *swhash)
{
        struct swevent_hlist *old = swevent_hlist_deref(swhash);

        if (old) {
                RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
                kfree_rcu(old, rcu_head);
        }
}

/*
 * Drop one reference on @cpu's software event table and release its bucket
 * array when the count reaches zero. Serialized by the table's hlist_mutex.
 */
static void swevent_hlist_put_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);

        mutex_lock(&swhash->hlist_mutex);

        swhash->hlist_refcount--;
        if (swhash->hlist_refcount == 0)
                swevent_hlist_release(swhash);

        mutex_unlock(&swhash->hlist_mutex);
}

/* Drop one reference on the software event table of every possible CPU. */
static void swevent_hlist_put(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                swevent_hlist_put_cpu(cpu);
}

/*
 * Take one reference on @cpu's software event table.
 *
 * A bucket array is allocated only when none is installed and @cpu is set in
 * perf_online_mask. The reference is taken in every case except allocation
 * failure.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails.
 */
static int swevent_hlist_get_cpu(int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        int err = 0;

        mutex_lock(&swhash->hlist_mutex);

        if (!swevent_hlist_deref(swhash) &&
            cpumask_test_cpu(cpu, perf_online_mask)) {
                struct swevent_hlist *fresh = kzalloc(sizeof(*fresh), GFP_KERNEL);

                if (fresh)
                        rcu_assign_pointer(swhash->swevent_hlist, fresh);
                else
                        err = -ENOMEM;
        }

        if (!err)
                swhash->hlist_refcount++;

        mutex_unlock(&swhash->hlist_mutex);

        return err;
}

/*
 * Take one reference on the software event table of every possible CPU,
 * under pmus_lock.
 *
 * If any CPU fails, the references already taken on the CPUs before it are
 * dropped again.
 *
 * Returns 0 on success or the error from swevent_hlist_get_cpu().
 */
static int swevent_hlist_get(void)
{
        int cpu, undo, err = 0;

        mutex_lock(&pmus_lock);

        for_each_possible_cpu(cpu) {
                err = swevent_hlist_get_cpu(cpu);
                if (err)
                        break;
        }

        if (err) {
                /* @cpu still names the CPU that failed; roll back all before it. */
                for_each_possible_cpu(undo) {
                        if (undo == cpu)
                                break;
                        swevent_hlist_put_cpu(undo);
                }
        }

        mutex_unlock(&pmus_lock);

        return err;
}

/*
 * One static key per software event id. perf_swevent_init() increments the
 * key for an event's id and sw_perf_event_destroy() decrements it.
 */
struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];

/*
 * event->destroy callback installed by perf_swevent_init(): undo its static
 * key increment and hlist references. Warns if called on an event that has
 * a parent, since only parentless events get this callback installed.
 */
static void sw_perf_event_destroy(struct perf_event *event)
{
        u64 sw_id = event->attr.config;

        WARN_ON(event->parent);

        static_key_slow_dec(&perf_swevent_enabled[sw_id]);
        swevent_hlist_put();
}

/* perf_swevent_init() below reads the .type of both clock PMUs. */
static struct pmu perf_cpu_clock; /* fwd declaration */
static struct pmu perf_task_clock;

/*
 * ->event_init for the generic software PMU.
 *
 * Returns -ENOENT when the event is not ours (wrong type, a clock event that
 * is redirected to its own PMU, or an out-of-range config), -EOPNOTSUPP for
 * branch-stack sampling, an error from swevent_hlist_get(), or 0.
 */
static int perf_swevent_init(struct perf_event *event)
{
        u64 id = event->attr.config;
        int ret;

        if (event->attr.type != PERF_TYPE_SOFTWARE)
                return -ENOENT;

        /* software events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        /* The two clock events are rewritten to their own PMU type. */
        if (id == PERF_COUNT_SW_CPU_CLOCK) {
                event->attr.type = perf_cpu_clock.type;
                return -ENOENT;
        }
        if (id == PERF_COUNT_SW_TASK_CLOCK) {
                event->attr.type = perf_task_clock.type;
                return -ENOENT;
        }

        if (id >= PERF_COUNT_SW_MAX)
                return -ENOENT;

        /* Events with a parent take no references of their own. */
        if (event->parent)
                return 0;

        ret = swevent_hlist_get();
        if (ret)
                return ret;

        static_key_slow_inc(&perf_swevent_enabled[id]);
        event->destroy = sw_perf_event_destroy;

        return 0;
}

/*
 * PMU descriptor for generic software events: lives in the software task
 * context, is flagged PERF_PMU_CAP_NO_NMI, and is initialised through
 * perf_swevent_init().
 */
static struct pmu perf_swevent = {
        .task_ctx_nr        = perf_sw_context,

        .capabilities        = PERF_PMU_CAP_NO_NMI,

        .event_init        = perf_swevent_init,
        .add                = perf_swevent_add,
        .del                = perf_swevent_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

#ifdef CONFIG_EVENT_TRACING

/*
 * ->destroy callback for tracepoint events (installed by
 * perf_tp_event_init()); forwards to perf_trace_destroy().
 */
static void tp_perf_event_destroy(struct perf_event *event)
{
        perf_trace_destroy(event);
}

/*
 * ->event_init for the tracepoint PMU.
 *
 * Returns -ENOENT for non-tracepoint events, -EOPNOTSUPP when branch-stack
 * sampling is requested, the error from perf_trace_init(), or 0 after
 * installing the destroy callback.
 */
static int perf_tp_event_init(struct perf_event *event)
{
        int ret;

        if (event->attr.type != PERF_TYPE_TRACEPOINT)
                return -ENOENT;

        /* tracepoint events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        ret = perf_trace_init(event);
        if (!ret)
                event->destroy = tp_perf_event_destroy;

        return ret;
}

/*
 * PMU descriptor for tracepoint events: add/del go through the perf_trace_*
 * helpers, start/stop/read reuse the software-event handlers.
 */
static struct pmu perf_tracepoint = {
        .task_ctx_nr        = perf_sw_context,

        .event_init        = perf_tp_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
};

/*
 * Check the raw record against the event's filter.
 *
 * Filters are only set on top-level events, so a child consults its parent.
 * Returns 1 when there is no filter or the filter accepts the record,
 * 0 otherwise.
 */
static int perf_tp_filter_match(struct perf_event *event,
                                struct perf_raw_record *raw)
{
        struct perf_event *owner = event->parent ? event->parent : event;
        void *payload = raw->frag.data;

        if (likely(!owner->filter))
                return 1;

        return filter_match_preds(owner->filter, payload) ? 1 : 0;
}

/*
 * Decide whether a tracepoint hit should be delivered to @event.
 *
 * Returns 0 if the event is stopped, if it excludes the kernel and @regs is
 * not user mode, or if the event filter rejects the record; 1 otherwise.
 */
static int perf_tp_event_match(struct perf_event *event,
                                struct perf_raw_record *raw,
                                struct pt_regs *regs)
{
        if (event->hw.state & PERF_HES_STOPPED)
                return 0;

        /* With exclude_kernel, only user-space tracepoints (uprobes) count. */
        if (event->attr.exclude_kernel && !user_mode(regs))
                return 0;

        return perf_tp_filter_match(event, raw) ? 1 : 0;
}

/*
 * Run any BPF programs attached to @call and, unless they reject the hit or
 * nobody is listening on @head, hand the record over to perf_tp_event().
 *
 * When the record is dropped here, the recursion context @rctx is released
 * by this function; otherwise perf_tp_event() is called with it.
 */
void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
                               struct trace_event_call *call, u64 count,
                               struct pt_regs *regs, struct hlist_head *head,
                               struct task_struct *task)
{
        bool drop = false;

        if (bpf_prog_array_valid(call)) {
                /* The start of the raw buffer carries the pt_regs pointer. */
                *(struct pt_regs **)raw_data = regs;
                drop = !trace_call_bpf(call, raw_data) || hlist_empty(head);
        }

        if (drop) {
                perf_swevent_put_recursion_context(rctx);
                return;
        }

        perf_tp_event(call->event.type, count, raw_data, size, regs, head,
                      rctx, task);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);

/*
 * Deliver one tracepoint record to a single event of the target task.
 *
 * The event is skipped when it is for a different tracepoint type, when it
 * uses sigtrap, or when perf_tp_event_match() rejects it.
 */
static void __perf_tp_event_target_task(u64 count, void *record,
                                        struct pt_regs *regs,
                                        struct perf_sample_data *data,
                                        struct perf_raw_record *raw,
                                        struct perf_event *event)
{
        struct trace_entry *hdr = record;

        if (event->attr.config != hdr->type)
                return;

        /* Cannot deliver synchronous signal to other task. */
        if (event->attr.sigtrap)
                return;

        if (!perf_tp_event_match(event, raw, regs))
                return;

        perf_sample_data_init(data, 0, 0);
        perf_sample_save_raw_data(data, event, raw);
        perf_swevent_event(event, count, data, regs);
}

/* Deliver the record to a group leader and then to each of its siblings. */
static void __perf_tp_event_target_group(u64 count, void *record,
                                         struct pt_regs *regs,
                                         struct perf_sample_data *data,
                                         struct perf_raw_record *raw,
                                         struct perf_event *leader)
{
        struct perf_event *member;

        __perf_tp_event_target_task(count, record, regs, data, raw, leader);
        for_each_sibling_event(member, leader)
                __perf_tp_event_target_task(count, record, regs, data, raw, member);
}

/*
 * Walk the tracepoint-PMU events of @ctx that belong to the current CPU,
 * pinned groups first and flexible groups second, and offer the record to
 * every leader and sibling.
 */
static void perf_tp_event_target_task(u64 count, void *record,
                                      struct pt_regs *regs,
                                      struct perf_sample_data *data,
                                      struct perf_raw_record *raw,
                                      struct perf_event_context *ctx)
{
        struct pmu *pmu = &perf_tracepoint;
        unsigned int cpu = smp_processor_id();
        struct perf_event *leader;

        perf_event_groups_for_cpu_pmu(leader, &ctx->pinned_groups, cpu, pmu) {
                __perf_tp_event_target_group(count, record, regs, data, raw,
                                             leader);
        }

        perf_event_groups_for_cpu_pmu(leader, &ctx->flexible_groups, cpu, pmu) {
                __perf_tp_event_target_group(count, record, regs, data, raw,
                                             leader);
        }
}

/*
 * Deliver a tracepoint record to every matching event on @head and, when a
 * target @task other than current is given, to the matching events in that
 * task's context as well. Always releases the recursion context @rctx
 * before returning.
 */
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
                   struct pt_regs *regs, struct hlist_head *head, int rctx,
                   struct task_struct *task)
{
        struct perf_raw_record raw = {
                .frag = {
                        .size = entry_size,
                        .data = record,
                },
        };
        struct perf_sample_data data;
        struct perf_event_context *ctx;
        struct perf_event *event;

        perf_trace_buf_update(record, event_type);

        hlist_for_each_entry_rcu(event, head, hlist_entry) {
                if (!perf_tp_event_match(event, &raw, regs))
                        continue;

                /*
                 * The same on-stack perf_sample_data is reused for every
                 * event. Some of its members are event-specific and must be
                 * recomputed for different swevents, so re-initialize it
                 * (and thereby data->sample_flags) each time; otherwise the
                 * next event would skip preparing its data because
                 * data->sample_flags is already set.
                 */
                perf_sample_data_init(&data, 0, 0);
                perf_sample_save_raw_data(&data, event, &raw);
                perf_swevent_event(event, count, &data, regs);
        }

        /*
         * If a target task was specified, also iterate its context and
         * deliver this event there too.
         */
        if (task && task != current) {
                rcu_read_lock();
                ctx = rcu_dereference(task->perf_event_ctxp);
                if (ctx) {
                        raw_spin_lock(&ctx->lock);
                        perf_tp_event_target_task(count, record, regs, &data,
                                                  &raw, ctx);
                        raw_spin_unlock(&ctx->lock);
                }
                rcu_read_unlock();
        }

        perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);

#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
/*
 * Flags in config, used by the dynamic kprobe and uprobe PMUs.
 * The flags must match the PMU_FORMAT_ATTR() definitions that follow.
 *
 * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 *                               if not set, create kprobe/uprobe
 *
 * The following values specify a reference counter (or semaphore in the
 * terminology of tools like dtrace, systemtap, etc.) for Userspace Statically
 * Defined Tracepoints (USDT). Currently, we use 32 bits for the offset.
 *
 * PERF_UPROBE_REF_CTR_OFFSET_BITS        # of bits in config used as the offset
 * PERF_UPROBE_REF_CTR_OFFSET_SHIFT        # of bits to shift left
 */
enum perf_probe_config {
        PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
        /* width, in bits, of the reference counter offset field */
        PERF_UPROBE_REF_CTR_OFFSET_BITS = 32,
        /* the field sits in the top bits of the 64-bit config word */
        PERF_UPROBE_REF_CTR_OFFSET_SHIFT = 64 - PERF_UPROBE_REF_CTR_OFFSET_BITS,
};

PMU_FORMAT_ATTR(retprobe, "config:0");
#endif

#ifdef CONFIG_KPROBE_EVENTS
/* Format attributes exposed by the kprobe PMU: only the retprobe flag. */
static struct attribute *kprobe_attrs[] = {
        &format_attr_retprobe.attr,
        NULL,
};

/* The attributes above are published in a group named "format". */
static struct attribute_group kprobe_format_group = {
        .name = "format",
        .attrs = kprobe_attrs,
};

/* NULL-terminated group list referenced by perf_kprobe.attr_groups. */
static const struct attribute_group *kprobe_attr_groups[] = {
        &kprobe_format_group,
        NULL,
};

/* Forward declaration: the PMU descriptor below refers to it. */
static int perf_kprobe_event_init(struct perf_event *event);
/*
 * PMU descriptor for kprobe events created through perf_event_open();
 * shares the add/del/start/stop/read handlers used by perf_tracepoint.
 */
static struct pmu perf_kprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_kprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = kprobe_attr_groups,
};

/*
 * ->event_init for the kprobe PMU.
 *
 * Returns -ENOENT if the event is for another PMU, -EACCES without
 * perfmon capability, -EOPNOTSUPP for branch-stack sampling, the error from
 * perf_kprobe_init(), or 0 after installing the destroy callback.
 */
static int perf_kprobe_event_init(struct perf_event *event)
{
        bool is_retprobe;
        int ret;

        if (event->attr.type != perf_kprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* probe events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;

        ret = perf_kprobe_init(event, is_retprobe);
        if (!ret)
                event->destroy = perf_kprobe_destroy;

        return ret;
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
/* The reference counter offset occupies config bits 32-63. */
PMU_FORMAT_ATTR(ref_ctr_offset, "config:32-63");

/* Format attributes exposed by the uprobe PMU. */
static struct attribute *uprobe_attrs[] = {
        &format_attr_retprobe.attr,
        &format_attr_ref_ctr_offset.attr,
        NULL,
};

/* The attributes above are published in a group named "format". */
static struct attribute_group uprobe_format_group = {
        .name = "format",
        .attrs = uprobe_attrs,
};

/* NULL-terminated group list referenced by perf_uprobe.attr_groups. */
static const struct attribute_group *uprobe_attr_groups[] = {
        &uprobe_format_group,
        NULL,
};

/* Forward declaration: the PMU descriptor below refers to it. */
static int perf_uprobe_event_init(struct perf_event *event);
/*
 * PMU descriptor for uprobe events created through perf_event_open();
 * shares the add/del/start/stop/read handlers used by perf_tracepoint.
 */
static struct pmu perf_uprobe = {
        .task_ctx_nr        = perf_sw_context,
        .event_init        = perf_uprobe_event_init,
        .add                = perf_trace_add,
        .del                = perf_trace_del,
        .start                = perf_swevent_start,
        .stop                = perf_swevent_stop,
        .read                = perf_swevent_read,
        .attr_groups        = uprobe_attr_groups,
};

/*
 * ->event_init for the uprobe PMU.
 *
 * Decodes the retprobe flag and the reference counter offset from
 * attr.config. Returns -ENOENT if the event is for another PMU, -EACCES
 * without perfmon capability, -EOPNOTSUPP for branch-stack sampling, the
 * error from perf_uprobe_init(), or 0 after installing the destroy callback.
 */
static int perf_uprobe_event_init(struct perf_event *event)
{
        unsigned long ref_ctr_offset;
        bool is_retprobe;
        int ret;

        if (event->attr.type != perf_uprobe.type)
                return -ENOENT;

        if (!perfmon_capable())
                return -EACCES;

        /* probe events cannot do branch sampling */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        ref_ctr_offset = event->attr.config >> PERF_UPROBE_REF_CTR_OFFSET_SHIFT;
        is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;

        ret = perf_uprobe_init(event, ref_ctr_offset, is_retprobe);
        if (!ret)
                event->destroy = perf_uprobe_destroy;

        return ret;
}
#endif /* CONFIG_UPROBE_EVENTS */

/*
 * Register the tracepoint PMU and, when the respective config options are
 * enabled, the kprobe and uprobe PMUs.
 */
static inline void perf_tp_register(void)
{
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
#ifdef CONFIG_KPROBE_EVENTS
        /* NOTE(review): -1 presumably asks for a dynamically assigned type id — confirm */
        perf_pmu_register(&perf_kprobe, "kprobe", -1);
#endif
#ifdef CONFIG_UPROBE_EVENTS
        perf_pmu_register(&perf_uprobe, "uprobe", -1);
#endif
}

/* Release the event's tracing filter via ftrace_profile_free_filter(). */
static void perf_event_free_filter(struct perf_event *event)
{
        ftrace_profile_free_filter(event);
}

/*
 * Returns true if the event belongs to the tracepoint PMU or, when the
 * respective config options are enabled, to the kprobe/uprobe PMUs created
 * with perf_event_open().
 */
static inline bool perf_event_is_tracing(struct perf_event *event)
{
        if (event->pmu == &perf_tracepoint)
                return true;
#ifdef CONFIG_KPROBE_EVENTS
        if (event->pmu == &perf_kprobe)
                return true;
#endif
#ifdef CONFIG_UPROBE_EVENTS
        if (event->pmu == &perf_uprobe)
                return true;
#endif
        return false;
}

/*
 * Attach @prog to @event.
 *
 * Non-tracing events are handed to perf_event_set_bpf_handler(). For tracing
 * events the program type must fit the kind of event:
 *   - kprobe/uprobe events take BPF_PROG_TYPE_KPROBE programs,
 *   - tracepoint and syscall tracepoint events take
 *     BPF_PROG_TYPE_TRACEPOINT programs.
 *
 * Returns -EINVAL on a kind/type mismatch, a sleepable kprobe-type program
 * on a non-uprobe, or kprobe_override on a non-kprobe; -EACCES if the
 * program reads past the tracepoint's context size; otherwise the result of
 * perf_event_attach_bpf_prog().
 */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                            u64 bpf_cookie)
{
        bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
        bool on_probe, on_tp;

        if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog, bpf_cookie);

        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
        is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
        is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
        is_syscall_tp = is_syscall_trace_event(event->tp_event);

        on_probe = is_kprobe || is_uprobe;
        on_tp = is_tracepoint || is_syscall_tp;

        /* bpf programs can only be attached to u/kprobe or tracepoint */
        if (!on_probe && !on_tp)
                return -EINVAL;

        if (on_probe && prog->type != BPF_PROG_TYPE_KPROBE)
                return -EINVAL;

        if (on_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)
                return -EINVAL;

        /* only uprobe programs are allowed to be sleepable */
        if (prog->type == BPF_PROG_TYPE_KPROBE && prog->sleepable && !is_uprobe)
                return -EINVAL;

        /* Kprobe override only works for kprobes, not uprobes. */
        if (prog->kprobe_override && !is_kprobe)
                return -EINVAL;

        if (on_tp) {
                int off = trace_event_get_offsets(event->tp_event);

                if (prog->aux->max_ctx_offset > off)
                        return -EACCES;
        }

        return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
}

/*
 * Detach the BPF program from @event, if it has one. Tracing events go
 * through perf_event_detach_bpf_prog(), all others through
 * perf_event_free_bpf_handler().
 */
void perf_event_free_bpf_prog(struct perf_event *event)
{
        if (!event->prog)
                return;

        if (perf_event_is_tracing(event))
                perf_event_detach_bpf_prog(event);
        else
                perf_event_free_bpf_handler(event);
}

#else

/* Stub for builds without CONFIG_EVENT_TRACING: nothing to register. */
static inline void perf_tp_register(void)
{
}

/* Stub for builds without CONFIG_EVENT_TRACING: there is no filter to free. */
static void perf_event_free_filter(struct perf_event *event)
{
}

/* Stub for builds without CONFIG_EVENT_TRACING: always fails with -ENOENT. */
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
                            u64 bpf_cookie)
{
        return -ENOENT;
}

/* Stub for builds without CONFIG_EVENT_TRACING: nothing to release. */
void perf_event_free_bpf_prog(struct perf_event *event)
{
}
#endif /* CONFIG_EVENT_TRACING */

#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Report a hit on breakpoint event @bp. @data is used as the pt_regs of the
 * hit. The sample address is the breakpoint address; nothing is reported
 * while bp->hw.state is non-zero or when perf_exclude_event() rejects the
 * event for these registers.
 */
void perf_bp_event(struct perf_event *bp, void *data)
{
        struct pt_regs *regs = data;
        struct perf_sample_data sample;

        perf_sample_data_init(&sample, bp->attr.bp_addr, 0);

        if (bp->hw.state)
                return;

        if (perf_exclude_event(bp, regs))
                return;

        perf_swevent_event(bp, 1, &sample, regs);
}
#endif

/*
 * Allocate a zeroed address filter and append it to @filters.
 *
 * The allocation is placed on the NUMA node of the event's CPU, using CPU 0
 * when event->cpu is -1. Returns the new filter, or NULL if the allocation
 * failed.
 */
static struct perf_addr_filter *
perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
{
        int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
        struct perf_addr_filter *fresh;

        fresh = kzalloc_node(sizeof(*fresh), GFP_KERNEL, node);
        if (fresh) {
                INIT_LIST_HEAD(&fresh->entry);
                list_add_tail(&fresh->entry, filters);
        }

        return fresh;
}

static void free_filters_list(struct list_head *filters)
{
        struct perf_addr_filter *filter, *iter;

        list_for_each_entry_safe(filter, iter, filters, entry) {
                path_put(&filter->path);
                list_del(&filter->entry);
                kfree(filter);
        }
}

/*
 * Free existing address filters and optionally install new ones
 */
static void perf_addr_filters_splice(struct perf_event *event,
                                     struct list_head *head)
{
        unsigned long flags;
        LIST_HEAD(list);

        if (!has_addr_filter(event))
                return;

        /* don't bother with children, they don't have their own filters */
        if (event->parent)
                return;

        raw_spin_lock_irqsave(&event->addr_filters.lock, flags);

        list_splice_init(&event->addr_filters.list, &list);
        if (head)
                list_splice(head, &event->addr_filters.list);

        raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);

        free_filters_list(&list);
}

/*
 * Drop all address filters of @event. Used during free paths, where there
 * is no concurrency, so the list can be peeked at without the lock.
 */
static void perf_free_addr_filters(struct perf_event *event)
{
        if (!list_empty(&event->addr_filters.list))
                perf_addr_filters_splice(event, NULL);
}

/*
 * Scan through mm's vmas and see if one of them matches the
 * @filter; if so, adjust filter's address range.
 * Called with mm::mmap_lock down for reading.
 */
static void perf_addr_filter_apply(struct perf_addr_filter *filter,
                                   struct mm_struct *mm,
                                   struct perf_addr_filter_range *fr)
{
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        for_each_vma(vmi, vma) {
                if (!vma->vm_file)
                        continue;

                if (perf_addr_filter_vma_adjust(filter, vma, fr))
                        return;
        }
}

/*
 * Update event's address range filters based on the
 * task's existing mappings, if any.
 *
 * For every filter on the event's filter list this rewrites the matching
 * slot of event->addr_filter_ranges[], bumps event->addr_filters_gen and
 * finally calls perf_event_stop(event, 1).
 */
static void perf_event_addr_filters_apply(struct perf_event *event)
{
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct task_struct *task = READ_ONCE(event->ctx->task);
        struct perf_addr_filter *filter;
        struct mm_struct *mm = NULL;
        unsigned int count = 0;
        unsigned long flags;

        /*
         * We may observe TASK_TOMBSTONE, which means that the event tear-down
         * will stop on the parent's child_mutex that our caller is also holding
         */
        if (task == TASK_TOMBSTONE)
                return;

        /*
         * File-based filters need the task's mappings: pin the mm and take
         * mmap_lock for reading before ifh->lock. Without an mm the ranges
         * are left untouched and only the final perf_event_stop() runs.
         */
        if (ifh->nr_file_filters) {
                mm = get_task_mm(task);
                if (!mm)
                        goto restart;

                mmap_read_lock(mm);
        }

        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
                        /*
                         * Adjust base offset if the filter is associated to a
                         * binary that needs to be mapped:
                         */
                        event->addr_filter_ranges[count].start = 0;
                        event->addr_filter_ranges[count].size = 0;

                        perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
                } else {
                        /* Filter without a path: use its offset/size as-is. */
                        event->addr_filter_ranges[count].start = filter->offset;
                        event->addr_filter_ranges[count].size  = filter->size;
                }

                count++;
        }

        event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);

        /* Release the mm in the reverse order of acquisition. */
        if (ifh->nr_file_filters) {
                mmap_read_unlock(mm);

                mmput(mm);
        }

restart:
        /* NOTE(review): the '1' presumably requests a restart, as the label suggests — confirm against perf_event_stop() */
        perf_event_stop(event, 1);
}

/*
 * Address range filtering: limiting the data to certain
 * instruction address ranges. Filters are ioctl()ed to us from
 * userspace as ascii strings.
 *
 * Filter string format:
 *
 * ACTION RANGE_SPEC
 * where ACTION is one of the
 *  * "filter": limit the trace to this region
 *  * "start": start tracing from this address
 *  * "stop": stop tracing at this address/region;
 * RANGE_SPEC is
 *  * for kernel addresses: <start address>[/<size>]
 *  * for object files:     <start address>[/<size>]@</path/to/object/file>
 *
 * if <size> is not specified or is zero, the range is treated as a single
 * address; not valid for ACTION=="filter".
 */
/*
 * Token ids returned by match_token() for the table below: the IF_ACT_*
 * values are filter actions, the IF_SRC_* values are range specifications.
 */
enum {
        IF_ACT_NONE = -1,
        IF_ACT_FILTER,
        IF_ACT_START,
        IF_ACT_STOP,
        IF_SRC_FILE,
        IF_SRC_KERNEL,
        IF_SRC_FILEADDR,
        IF_SRC_KERNELADDR,
};

/* Parser states: expecting an action, expecting a range, filter complete. */
enum {
        IF_STATE_ACTION = 0,
        IF_STATE_SOURCE,
        IF_STATE_END,
};

/* Patterns for the filter string tokens; terminated by the NULL pattern. */
static const match_table_t if_tokens = {
        { IF_ACT_FILTER,        "filter" },
        { IF_ACT_START,                "start" },
        { IF_ACT_STOP,                "stop" },
        { IF_SRC_FILE,                "%u/%u@%s" },
        { IF_SRC_KERNEL,        "%u/%u" },
        { IF_SRC_FILEADDR,        "%u@%s" },
        { IF_SRC_KERNELADDR,        "%u" },
        { IF_ACT_NONE,                NULL },
};

/*
 * Address filter string parser
 *
 * Splits a private copy of @fstr on spaces, commas and newlines and runs a
 * small state machine over the tokens: an ACTION token opens a new filter
 * (allocated onto @filters), a RANGE_SPEC token completes it. Completed
 * file-based filters have their path resolved and are counted in
 * event->addr_filters.nr_file_filters.
 *
 * Returns 0 on success. On failure every filter on @filters is freed and a
 * negative errno is returned: -ENOMEM for allocation failures, -EOPNOTSUPP
 * for a file-based filter on an event without a task, -EINVAL for malformed
 * input, or the error reported by kstrtoul()/kern_path().
 */
static int
perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                             struct list_head *filters)
{
        struct perf_addr_filter *filter = NULL;
        char *start, *orig, *filename = NULL;
        substring_t args[MAX_OPT_ARGS];
        int state = IF_STATE_ACTION, token;
        unsigned int kernel = 0;
        int ret = -EINVAL;

        /* strsep() modifies its input, so work on a copy. */
        orig = fstr = kstrdup(fstr, GFP_KERNEL);
        if (!fstr)
                return -ENOMEM;

        while ((start = strsep(&fstr, " ,\n")) != NULL) {
                static const enum perf_addr_filter_action_t actions[] = {
                        [IF_ACT_FILTER]        = PERF_ADDR_FILTER_ACTION_FILTER,
                        [IF_ACT_START]        = PERF_ADDR_FILTER_ACTION_START,
                        [IF_ACT_STOP]        = PERF_ADDR_FILTER_ACTION_STOP,
                };
                ret = -EINVAL;

                /* Consecutive separators yield empty tokens; skip them. */
                if (!*start)
                        continue;

                /* filter definition begins */
                if (state == IF_STATE_ACTION) {
                        filter = perf_addr_filter_new(event, filters);
                        if (!filter) {
                                /* Only an allocation failure gets us here. */
                                ret = -ENOMEM;
                                goto fail;
                        }
                }

                token = match_token(start, if_tokens, args);
                switch (token) {
                case IF_ACT_FILTER:
                case IF_ACT_START:
                case IF_ACT_STOP:
                        if (state != IF_STATE_ACTION)
                                goto fail;

                        filter->action = actions[token];
                        state = IF_STATE_SOURCE;
                        break;

                case IF_SRC_KERNELADDR:
                case IF_SRC_KERNEL:
                        kernel = 1;
                        fallthrough;

                case IF_SRC_FILEADDR:
                case IF_SRC_FILE:
                        if (state != IF_STATE_SOURCE)
                                goto fail;

                        /* Terminate the matched substring in place. */
                        *args[0].to = 0;
                        ret = kstrtoul(args[0].from, 0, &filter->offset);
                        if (ret)
                                goto fail;

                        if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
                                *args[1].to = 0;
                                ret = kstrtoul(args[1].from, 0, &filter->size);
                                if (ret)
                                        goto fail;
                        }

                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
                                /* The path follows the size, when one is given. */
                                int fpos = token == IF_SRC_FILE ? 2 : 1;

                                kfree(filename);
                                filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
                                        goto fail;
                                }
                        }

                        state = IF_STATE_END;
                        break;

                default:
                        goto fail;
                }

                /*
                 * Filter definition is fully parsed, validate and install it.
                 * Make sure that it doesn't contradict itself or the event's
                 * attribute.
                 */
                if (state == IF_STATE_END) {
                        ret = -EINVAL;

                        /*
                         * ACTION "filter" must have a non-zero length region
                         * specified.
                         */
                        if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
                            !filter->size)
                                goto fail;

                        if (!kernel) {
                                if (!filename)
                                        goto fail;

                                /*
                                 * For now, we only support file-based filters
                                 * in per-task events; doing so for CPU-wide
                                 * events requires additional context switching
                                 * trickery, since same object code will be
                                 * mapped at different virtual addresses in
                                 * different processes.
                                 */
                                ret = -EOPNOTSUPP;
                                if (!event->ctx->task)
                                        goto fail;

                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW,
                                                &filter->path);
                                if (ret)
                                        goto fail;

                                ret = -EINVAL;
                                if (!filter->path.dentry ||
                                    !S_ISREG(d_inode(filter->path.dentry)
                                             ->i_mode))
                                        goto fail;

                                event->addr_filters.nr_file_filters++;
                        }

                        /* ready to consume more filters */
                        kfree(filename);
                        filename = NULL;
                        state = IF_STATE_ACTION;
                        filter = NULL;
                        kernel = 0;
                }
        }

        /* The input ended in the middle of a filter definition. */
        if (state != IF_STATE_ACTION)
                goto fail;

        kfree(filename);
        kfree(orig);

        return 0;

fail:
        kfree(filename);
        free_filters_list(filters);
        kfree(orig);

        return ret;
}

static int
perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
{
        LIST_HEAD(filters);
        int ret;

        /*
         * Since this is called in perf_ioctl() path, we're already holding
         * ctx::mutex.
         */
        lockdep_assert_held(&event->ctx->mutex);

        if (WARN_ON_ONCE(event->parent))
                return -EINVAL;

        ret = perf_event_parse_addr_filter(event, filter_str, &filters);
        if (ret)
                goto fail_clear_files;

        ret = event->pmu->addr_filters_validate(&filters);
        if (ret)
                goto fail_free_filters;

        /* remove existing filters, if any */
        perf_addr_filters_splice(event, &filters);

        /* install new filters */
        perf_event_for_each_child(event, perf_event_addr_filters_apply);

        return ret;

fail_free_filters:
        free_filters_list(&filters);

fail_clear_files:
        event->addr_filters.nr_file_filters = 0;

        return ret;
}

/*
 * Install a filter string supplied by user space.
 *
 * Copies at most PAGE_SIZE bytes from @arg, then dispatches on the event
 * kind: tracing events use ftrace_profile_set_filter(), events with address
 * filter support use perf_event_set_addr_filter(). Any other event yields
 * -EINVAL. The copied string is freed before returning.
 */
static int perf_event_set_filter(struct perf_event *event, void __user *arg)
{
        int ret = -EINVAL;
        char *filter_str;

        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);

#ifdef CONFIG_EVENT_TRACING
        if (perf_event_is_tracing(event)) {
                struct perf_event_context *ctx = event->ctx;

                /*
                 * Beware, here be dragons!!
                 *
                 * the tracepoint muck will deadlock against ctx->mutex, but
                 * the tracepoint stuff does not actually need it. So
                 * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
                 * already have a reference on ctx.
                 *
                 * This can result in event getting moved to a different ctx,
                 * but that does not affect the tracepoint state.
                 */
                mutex_unlock(&ctx->mutex);
                ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
                mutex_lock(&ctx->mutex);
        } else
#endif
        /* With CONFIG_EVENT_TRACING this is the body of the 'else' above. */
        if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);

        kfree(filter_str);
        return ret;
}

/*
 * hrtimer based swevent callback
 */

static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
{
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_sample_data data;
        struct pt_regs *regs;
        struct perf_event *event;
        u64 period;

        event = container_of(hrtimer, struct perf_event, hw.hrtimer);

        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return HRTIMER_NORESTART;

        event->pmu->read(event);

        perf_sample_data_init(&data, 0, event->hw.last_period);
        regs = get_irq_regs();

        if (regs && !perf_exclude_event(event, regs)) {
                if (!(event->attr.exclude_idle && is_idle_task(current)))
                        if (__perf_event_overflow(event, 1, &data, regs))
                                ret = HRTIMER_NORESTART;
        }

        period = max_t(u64, 10000, event->hw.sample_period);
        hrtimer_forward_now(hrtimer, ns_to_ktime(period));

        return ret;
}

/*
 * Arm the sampling hrtimer of @event; a no-op for non-sampling events.
 *
 * If a remainder was saved in period_left it is used (and cleared), with
 * negative remainders replaced by 10000ns; otherwise the timer is armed for
 * the sample period, again with a 10000ns floor.
 */
static void perf_swevent_start_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        s64 period;

        if (!is_sampling_event(event))
                return;

        period = local64_read(&hwc->period_left);
        if (!period) {
                period = max_t(u64, 10000, hwc->sample_period);
        } else {
                if (period < 0)
                        period = 10000;

                local64_set(&hwc->period_left, 0);
        }

        hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
                      HRTIMER_MODE_REL_PINNED_HARD);
}

/*
 * Stop the sampling hrtimer of @event, remembering the time that was left
 * in period_left. A no-op for non-sampling events.
 */
static void perf_swevent_cancel_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        ktime_t left;

        if (!is_sampling_event(event))
                return;

        left = hrtimer_get_remaining(&hwc->hrtimer);
        local64_set(&hwc->period_left, ktime_to_ns(left));

        hrtimer_cancel(&hwc->hrtimer);
}

/*
 * Set up the sampling hrtimer of @event (sampling events only) and turn a
 * frequency request into a fixed period.
 */
static void perf_swevent_init_hrtimer(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        long freq;

        if (!is_sampling_event(event))
                return;

        hrtimer_setup(&hwc->hrtimer, perf_swevent_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);

        if (!event->attr.freq)
                return;

        /*
         * Since hrtimers have a fixed rate, we can do a static freq->period
         * mapping and avoid the whole period adjust feedback stuff.
         */
        freq = event->attr.sample_freq;

        event->attr.sample_period = NSEC_PER_SEC / freq;
        hwc->sample_period = event->attr.sample_period;
        hwc->last_period = hwc->sample_period;
        local64_set(&hwc->period_left, hwc->sample_period);

        /* From here on the event is treated as a plain period event. */
        event->attr.freq = 0;
}

/*
 * Software event: cpu wall time clock
 */

/*
 * Add the local_clock() time that passed since the previous update to the
 * event count and remember the current timestamp in hw.prev_count.
 */
static void cpu_clock_event_update(struct perf_event *event)
{
        u64 now = local_clock();
        s64 prev = local64_xchg(&event->hw.prev_count, now);

        local64_add(now - prev, &event->count);
}

static void cpu_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, local_clock());
        perf_swevent_start_hrtimer(event);
}

/*
 * pmu::stop for the CPU clock event: cancel the sampling timer first, then
 * account the time up to now. @flags is not used.
 */
static void cpu_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);
        cpu_clock_event_update(event);
}

static int cpu_clock_event_add(struct perf_event *event, int flags)
{
        if (flags & PERF_EF_START)
                cpu_clock_event_start(event, flags);
        perf_event_update_userpage(event);

        return 0;
}

/* pmu::del for the CPU clock event: removing the event simply stops it. */
static void cpu_clock_event_del(struct perf_event *event, int flags)
{
        cpu_clock_event_stop(event, flags);
}

/* pmu::read for the CPU clock event: fold the time elapsed so far into the count. */
static void cpu_clock_event_read(struct perf_event *event)
{
        cpu_clock_event_update(event);
}

static int cpu_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != perf_cpu_clock.type)
                return -ENOENT;

        if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
                return -ENOENT;

        /*
         * no branch sampling for software events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/* PMU descriptor wiring up the cpu_clock_event_*() callbacks. */
static struct pmu perf_cpu_clock = {
        .task_ctx_nr    = perf_sw_context,
        .capabilities   = PERF_PMU_CAP_NO_NMI,
        .dev            = PMU_NULL_DEV,

        /* event setup */
        .event_init     = cpu_clock_event_init,

        /* scheduling in and out */
        .add            = cpu_clock_event_add,
        .del            = cpu_clock_event_del,

        /* counting */
        .start          = cpu_clock_event_start,
        .stop           = cpu_clock_event_stop,
        .read           = cpu_clock_event_read,
};

/*
 * Software event: task time clock
 */

/*
 * Account the time between the previously recorded timestamp and @now to
 * the event count, and make @now the new hw.prev_count.
 */
static void task_clock_event_update(struct perf_event *event, u64 now)
{
        u64 prev = local64_xchg(&event->hw.prev_count, now);

        local64_add((s64)(now - prev), &event->count);
}

static void task_clock_event_start(struct perf_event *event, int flags)
{
        local64_set(&event->hw.prev_count, event->ctx->time);
        perf_swevent_start_hrtimer(event);
}

/*
 * pmu::stop for the task clock event: cancel the sampling timer first, then
 * account the time up to the current context time. @flags is not used.
 */
static void task_clock_event_stop(struct perf_event *event, int flags)
{
        perf_swevent_cancel_hrtimer(event);
        task_clock_event_update(event, event->ctx->time);
}

static int task_clock_event_add(struct perf_event *event, int flags)
{
        if (flags & PERF_EF_START)
                task_clock_event_start(event, flags);
        perf_event_update_userpage(event);

        return 0;
}

/*
 * pmu::del for the task clock event: stop it, always passing PERF_EF_UPDATE
 * regardless of the caller's @flags.
 */
static void task_clock_event_del(struct perf_event *event, int flags)
{
        task_clock_event_stop(event, PERF_EF_UPDATE);
}

/*
 * pmu::read for the task clock event: extend the context time by the
 * perf_clock() time that passed since the context timestamp and account up
 * to that point.
 */
static void task_clock_event_read(struct perf_event *event)
{
        u64 now = perf_clock();
        u64 since_stamp = now - event->ctx->timestamp;

        task_clock_event_update(event, event->ctx->time + since_stamp);
}

static int task_clock_event_init(struct perf_event *event)
{
        if (event->attr.type != perf_task_clock.type)
                return -ENOENT;

        if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
                return -ENOENT;

        /*
         * no branch sampling for software events
         */
        if (has_branch_stack(event))
                return -EOPNOTSUPP;

        perf_swevent_init_hrtimer(event);

        return 0;
}

/* PMU descriptor wiring up the task_clock_event_*() callbacks. */
static struct pmu perf_task_clock = {
        .task_ctx_nr    = perf_sw_context,
        .capabilities   = PERF_PMU_CAP_NO_NMI,
        .dev            = PMU_NULL_DEV,

        /* event setup */
        .event_init     = task_clock_event_init,

        /* scheduling in and out */
        .add            = task_clock_event_add,
        .del            = task_clock_event_del,

        /* counting */
        .start          = task_clock_event_start,
        .stop           = task_clock_event_stop,
        .read           = task_clock_event_read,
};

/*
 * No-op stub, installed by perf_pmu_register() for pmu_enable/pmu_disable
 * and cancel_txn when a PMU does not provide them.
 */
static void perf_pmu_nop_void(struct pmu *pmu)
{
        /* intentionally empty */
}

/*
 * No-op start_txn stub, installed by perf_pmu_register() for PMUs that have
 * neither transaction nor pmu_enable callbacks.
 */
static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
{
        /* intentionally empty */
}

/*
 * No-op commit_txn stub that always reports success (0); installed by
 * perf_pmu_register() for PMUs without transaction or pmu_enable callbacks.
 */
static int perf_pmu_nop_int(struct pmu *pmu)
{
        const int success = 0;

        return success;
}

/*
 * Default pmu::check_period: returns 0 for every @value. Installed by
 * perf_pmu_register() when the PMU has no check_period callback.
 */
static int perf_event_nop_int(struct perf_event *event, u64 value)
{
        const int success = 0;

        return success;
}

/* Flags of the transaction currently open on this CPU, for the stubs below. */
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);

/*
 * Default start_txn for PMUs that only have pmu_enable/pmu_disable: record
 * @flags for the matching commit/cancel and disable the PMU, unless flags
 * other than PERF_PMU_TXN_ADD are set.
 */
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
{
        __this_cpu_write(nop_txn_flags, flags);

        if (!(flags & ~PERF_PMU_TXN_ADD))
                perf_pmu_disable(pmu);
}

static int perf_pmu_commit_txn(struct pmu *pmu)
{
        unsigned int flags = __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (flags & ~PERF_PMU_TXN_ADD)
                return 0;

        perf_pmu_enable(pmu);
        return 0;
}

static void perf_pmu_cancel_txn(struct pmu *pmu)
{
        unsigned int flags =  __this_cpu_read(nop_txn_flags);

        __this_cpu_write(nop_txn_flags, 0);

        if (flags & ~PERF_PMU_TXN_ADD)
                return;

        perf_pmu_enable(pmu);
}

/*
 * Default pmu::event_idx: returns 0 for every event. Installed by
 * perf_pmu_register() when the PMU has no event_idx callback.
 */
static int perf_event_idx_default(struct perf_event *event)
{
        const int idx = 0;

        return idx;
}

/*
 * Let userspace know that this PMU supports address range filtering:
 */
static ssize_t nr_addr_filters_show(struct device *dev,
                                    struct device_attribute *attr,
                                    char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return sysfs_emit(page, "%d\n", pmu->nr_addr_filters);
}
DEVICE_ATTR_RO(nr_addr_filters);

/*
 * Maps PMU type ids to their struct pmu: ids are allocated and filled in by
 * perf_pmu_register() and looked up by perf_init_event().
 */
static struct idr pmu_idr;

static ssize_t
type_show(struct device *dev, struct device_attribute *attr, char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return sysfs_emit(page, "%d\n", pmu->type);
}
static DEVICE_ATTR_RO(type);

static ssize_t
perf_event_mux_interval_ms_show(struct device *dev,
                                struct device_attribute *attr,
                                char *page)
{
        struct pmu *pmu = dev_get_drvdata(dev);

        return sysfs_emit(page, "%d\n", pmu->hrtimer_interval_ms);
}

/* Serializes updates of a PMU's multiplexing interval through sysfs. */
static DEFINE_MUTEX(mux_interval_mutex);

/*
 * sysfs write side of perf_event_mux_interval_ms.
 *
 * Parses a strictly positive interval in milliseconds from @buf, records it
 * in the PMU and pushes it to the per-CPU PMU context of every online CPU,
 * where perf_mux_hrtimer_restart_ipi() is then invoked.
 *
 * Returns @count on success (also when the value is unchanged), the
 * kstrtoint() error for unparsable input, or -EINVAL for values below 1.
 */
static ssize_t
perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
                                 const char *buf, size_t count)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        int timer, cpu, ret;

        ret = kstrtoint(buf, 0, &timer);
        if (ret)
                return ret;

        if (timer < 1)
                return -EINVAL;

        /* same value, nothing to do */
        if (timer == pmu->hrtimer_interval_ms)
                return count;

        mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;

        /* update all cpuctx for this PMU */
        cpus_read_lock();
        for_each_online_cpu(cpu) {
                struct perf_cpu_pmu_context *cpc;
                cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
                /*
                 * Do the ms -> ns conversion in 64 bits: with a 32-bit long
                 * the plain product overflows for intervals above ~2147ms.
                 */
                cpc->hrtimer_interval = ns_to_ktime((u64)NSEC_PER_MSEC * timer);

                cpu_function_call(cpu, perf_mux_hrtimer_restart_ipi, cpc);
        }
        cpus_read_unlock();
        mutex_unlock(&mux_interval_mutex);

        return count;
}
static DEVICE_ATTR_RW(perf_event_mux_interval_ms);

/*
 * Return the topology cpumask that corresponds to @scope for @cpu (the
 * online mask for PERF_PMU_SCOPE_SYS_WIDE), or NULL for any other scope.
 */
static inline const struct cpumask *perf_scope_cpu_topology_cpumask(unsigned int scope, int cpu)
{
        const struct cpumask *mask = NULL;

        switch (scope) {
        case PERF_PMU_SCOPE_CORE:
                mask = topology_sibling_cpumask(cpu);
                break;
        case PERF_PMU_SCOPE_DIE:
                mask = topology_die_cpumask(cpu);
                break;
        case PERF_PMU_SCOPE_CLUSTER:
                mask = topology_cluster_cpumask(cpu);
                break;
        case PERF_PMU_SCOPE_PKG:
                mask = topology_core_cpumask(cpu);
                break;
        case PERF_PMU_SCOPE_SYS_WIDE:
                mask = cpu_online_mask;
                break;
        }

        return mask;
}

/*
 * Return the perf_online_*_mask that belongs to @scope, or NULL for a scope
 * that has no such mask.
 */
static inline struct cpumask *perf_scope_cpumask(unsigned int scope)
{
        struct cpumask *mask = NULL;

        switch (scope) {
        case PERF_PMU_SCOPE_CORE:
                mask = perf_online_core_mask;
                break;
        case PERF_PMU_SCOPE_DIE:
                mask = perf_online_die_mask;
                break;
        case PERF_PMU_SCOPE_CLUSTER:
                mask = perf_online_cluster_mask;
                break;
        case PERF_PMU_SCOPE_PKG:
                mask = perf_online_pkg_mask;
                break;
        case PERF_PMU_SCOPE_SYS_WIDE:
                mask = perf_online_sys_mask;
                break;
        }

        return mask;
}

/*
 * sysfs "cpumask" attribute: prints the mask selected by the PMU's scope in
 * list format. Emits nothing (returns 0) when the scope has no mask.
 */
static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr,
                            char *buf)
{
        struct pmu *pmu = dev_get_drvdata(dev);
        struct cpumask *scope_mask = perf_scope_cpumask(pmu->scope);

        if (!scope_mask)
                return 0;

        return cpumap_print_to_pagebuf(true, buf, scope_mask);
}

static DEVICE_ATTR_RO(cpumask);

/*
 * Default sysfs attributes of a PMU device. pmu_dev_is_visible() decides per
 * PMU which of them are exposed; it identifies attributes by address, so the
 * order of this array carries no meaning.
 */
static struct attribute *pmu_dev_attrs[] = {
        &dev_attr_type.attr,
        &dev_attr_perf_event_mux_interval_ms.attr,
        &dev_attr_nr_addr_filters.attr,
        &dev_attr_cpumask.attr,
        NULL,
};

/*
 * attribute_group::is_visible callback for pmu_dev_attrs.
 *
 * Hides "nr_addr_filters" for PMUs without address filters and "cpumask"
 * for PMUs with PERF_PMU_SCOPE_NONE; every other attribute keeps its
 * declared mode. The attribute is matched by pointer rather than by its
 * index @n, so adding or reordering entries cannot hide the wrong file.
 */
static umode_t pmu_dev_is_visible(struct kobject *kobj, struct attribute *a, int n)
{
        struct device *dev = kobj_to_dev(kobj);
        struct pmu *pmu = dev_get_drvdata(dev);

        /* nr_addr_filters */
        if (a == &dev_attr_nr_addr_filters.attr && !pmu->nr_addr_filters)
                return 0;

        /* cpumask */
        if (a == &dev_attr_cpumask.attr && pmu->scope == PERF_PMU_SCOPE_NONE)
                return 0;

        return a->mode;
}

/* The default PMU attributes together with their visibility filter. */
static struct attribute_group pmu_dev_attr_group = {
        .attrs          = pmu_dev_attrs,
        .is_visible     = pmu_dev_is_visible,
};

/* NULL-terminated group list used as pmu_bus.dev_groups. */
static const struct attribute_group *pmu_dev_groups[] = {
        &pmu_dev_attr_group,
        NULL,
};

/*
 * Non-zero while PMU devices may be created: checked by perf_pmu_register()
 * before allocating a device and by perf_pmu_free() before removing one.
 * It is set outside this part of the file.
 */
static int pmu_bus_running;

/* The "event_source" bus that PMU devices are attached to. */
static struct bus_type pmu_bus = {
        .dev_groups     = pmu_dev_groups,
        .name           = "event_source",
};

/*
 * device::release callback for PMU devices: frees the struct device that
 * pmu_dev_alloc() allocated with kzalloc().
 */
static void pmu_dev_release(struct device *dev)
{
        kfree(dev);
}

static int pmu_dev_alloc(struct pmu *pmu)
{
        int ret = -ENOMEM;

        pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
        if (!pmu->dev)
                goto out;

        pmu->dev->groups = pmu->attr_groups;
        device_initialize(pmu->dev);

        dev_set_drvdata(pmu->dev, pmu);
        pmu->dev->bus = &pmu_bus;
        pmu->dev->parent = pmu->parent;
        pmu->dev->release = pmu_dev_release;

        ret = dev_set_name(pmu->dev, "%s", pmu->name);
        if (ret)
                goto free_dev;

        ret = device_add(pmu->dev);
        if (ret)
                goto free_dev;

        if (pmu->attr_update) {
                ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update);
                if (ret)
                        goto del_dev;
        }

out:
        return ret;

del_dev:
        device_del(pmu->dev);

free_dev:
        put_device(pmu->dev);
        pmu->dev = NULL;
        goto out;
}

/*
 * Lock class keys; judging by the names they are presumably meant for the
 * per-CPU context mutex and lock. Their users are outside this part of the
 * file -- TODO confirm.
 */
static struct lock_class_key cpuctx_mutex;
static struct lock_class_key cpuctx_lock;

/*
 * Replace the entry stored under @id in @idr with @new, but only if the
 * current entry is @old.
 *
 * Returns true when the entry was replaced, false when the current entry
 * differs from @old or idr_replace() fails. This is a find followed by a
 * replace, so callers have to provide the serialization themselves.
 */
static bool idr_cmpxchg(struct idr *idr, unsigned long id, void *old, void *new)
{
        void *current_entry = idr_find(idr, id);
        void *replaced;

        if (current_entry != old)
                return false;

        replaced = idr_replace(idr, new, id);
        if (IS_ERR(replaced))
                return false;

        WARN_ON_ONCE(replaced != current_entry);
        return true;
}

/*
 * Release what perf_pmu_register() set up for @pmu: the sysfs device (if
 * one was created) and the per-CPU PMU contexts. Also copes with a PMU that
 * was only partially set up, i.e. missing per-CPU contexts.
 */
static void perf_pmu_free(struct pmu *pmu)
{
        struct perf_cpu_pmu_context *cpc;
        int cpu;

        if (pmu_bus_running && pmu->dev && pmu->dev != PMU_NULL_DEV) {
                if (pmu->nr_addr_filters)
                        device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
                device_del(pmu->dev);
                put_device(pmu->dev);
        }

        if (!pmu->cpu_pmu_context)
                return;

        for_each_possible_cpu(cpu) {
                cpc = *per_cpu_ptr(pmu->cpu_pmu_context, cpu);
                if (!cpc)
                        continue;

                /* embedded contexts are refcount managed */
                if (cpc->epc.embedded)
                        put_pmu_ctx(&cpc->epc);
                else
                        kfree(cpc);
        }
        free_percpu(pmu->cpu_pmu_context);
}

/*
 * Cleanup helper: a non-NULL pmu pointer declared with
 * __free(pmu_unregister) is passed to perf_pmu_free() when it goes out of
 * scope.
 */
DEFINE_FREE(pmu_unregister, struct pmu *, if (_T) perf_pmu_free(_T))

/*
 * Register a PMU with the perf core.
 *
 * @_pmu: the PMU to register
 * @name: name of the PMU; must not be NULL
 * @type: requested type id; a negative value has an id allocated starting
 *        at PERF_TYPE_MAX
 *
 * Allocates the type id, the sysfs device (when pmu_bus_running is set and
 * the PMU has none yet) and the per-CPU PMU contexts, fills in default
 * callbacks for the ones the PMU left unset and finally publishes the PMU
 * in pmu_idr and on the pmus list.
 *
 * Returns 0 on success or a negative errno. On every error return the
 * cleanup declared on @pmu hands the PMU to perf_pmu_free().
 */
int perf_pmu_register(struct pmu *_pmu, const char *name, int type)
{
        int cpu, max = PERF_TYPE_MAX;

        /* Any early return below tears the partially set up PMU down again. */
        struct pmu *pmu __free(pmu_unregister) = _pmu;
        guard(mutex)(&pmus_lock);

        if (WARN_ONCE(!name, "Can not register anonymous pmu.\n"))
                return -EINVAL;

        if (WARN_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE,
                      "Can not register a pmu with an invalid scope.\n"))
                return -EINVAL;

        pmu->name = name;

        if (type >= 0)
                max = type;

        /*
         * Reserve the id with a NULL entry for now; the PMU itself is only
         * stored under it by the idr_cmpxchg() at the end.
         */
        CLASS(idr_alloc, pmu_type)(&pmu_idr, NULL, max, 0, GFP_KERNEL);
        if (pmu_type.id < 0)
                return pmu_type.id;

        WARN_ON(type >= 0 && pmu_type.id != type);

        pmu->type = pmu_type.id;
        atomic_set(&pmu->exclusive_cnt, 0);

        if (pmu_bus_running && !pmu->dev) {
                int ret = pmu_dev_alloc(pmu);
                if (ret)
                        return ret;
        }

        pmu->cpu_pmu_context = alloc_percpu(struct perf_cpu_pmu_context *);
        if (!pmu->cpu_pmu_context)
                return -ENOMEM;

        /* One zeroed context per possible CPU, allocated on that CPU's node. */
        for_each_possible_cpu(cpu) {
                struct perf_cpu_pmu_context *cpc =
                        kmalloc_node(sizeof(struct perf_cpu_pmu_context),
                                     GFP_KERNEL | __GFP_ZERO,
                                     cpu_to_node(cpu));

                if (!cpc)
                        return -ENOMEM;

                *per_cpu_ptr(pmu->cpu_pmu_context, cpu) = cpc;
                __perf_init_event_pmu_context(&cpc->epc, pmu);
                __perf_mux_hrtimer_init(cpc, cpu);
        }

        if (!pmu->start_txn) {
                if (pmu->pmu_enable) {
                        /*
                         * If we have pmu_enable/pmu_disable calls, install
                         * transaction stubs that use that to try and batch
                         * hardware accesses.
                         */
                        pmu->start_txn  = perf_pmu_start_txn;
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
                        pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
        }

        /* Only after the txn decision above, which tests pmu_enable. */
        if (!pmu->pmu_enable) {
                pmu->pmu_enable  = perf_pmu_nop_void;
                pmu->pmu_disable = perf_pmu_nop_void;
        }

        if (!pmu->check_period)
                pmu->check_period = perf_event_nop_int;

        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;

        /*
         * Now that the PMU is complete, make it visible to perf_try_init_event().
         */
        if (!idr_cmpxchg(&pmu_idr, pmu->type, NULL, pmu))
                return -EINVAL;
        list_add_rcu(&pmu->entry, &pmus);

        /*
         * Success: disarm both cleanups so that neither the id nor the PMU
         * is released on return (NOTE(review): take_idr_id() is defined
         * elsewhere -- confirm it keeps the id allocated).
         */
        take_idr_id(pmu_type);
        _pmu = no_free_ptr(pmu); // let it rip
        return 0;
}
EXPORT_SYMBOL_GPL(perf_pmu_register);

void perf_pmu_unregister(struct pmu *pmu)
{
        scoped_guard (mutex, &pmus_lock) {
                list_del_rcu(&pmu->entry);
                idr_remove(&pmu_idr, pmu->type);
        }

        /*
         * We dereference the pmu list under both SRCU and regular RCU, so
         * synchronize against both of those.
         */
        synchronize_srcu(&pmus_srcu);
        synchronize_rcu();

        perf_pmu_free(pmu);
}
EXPORT_SYMBOL_GPL(perf_pmu_unregister);

static inline bool has_extended_regs(struct perf_event *event)
{
        return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) ||
               (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK);
}

/*
 * Try to initialize @event on @pmu.
 *
 * Takes a reference on the PMU's module, runs pmu->event_init() and then
 * applies the checks that are common to all PMUs (extended registers,
 * exclude flags, scope).
 *
 * Returns 0 with event->pmu pointing to @pmu on success. On failure a
 * negative errno is returned, event->pmu is reset to NULL and the module
 * reference is dropped again: -ENODEV when the module reference cannot be
 * taken or no CPU matches the PMU's scope, -EOPNOTSUPP/-EINVAL for the
 * capability checks, or whatever pmu->event_init() returned.
 */
static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
{
        struct perf_event_context *ctx = NULL;
        int ret;

        if (!try_module_get(pmu->module))
                return -ENODEV;

        /*
         * A number of pmu->event_init() methods iterate the sibling_list to,
         * for example, validate if the group fits on the PMU. Therefore,
         * if this is a sibling event, acquire the ctx->mutex to protect
         * the sibling_list.
         */
        if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
                /*
                 * This ctx->mutex can nest when we're called through
                 * inheritance. See the perf_event_ctx_lock_nested() comment.
                 */
                ctx = perf_event_ctx_lock_nested(event->group_leader,
                                                 SINGLE_DEPTH_NESTING);
                BUG_ON(!ctx);
        }

        event->pmu = pmu;
        ret = pmu->event_init(event);

        if (ctx)
                perf_event_ctx_unlock(event->group_leader, ctx);

        /* event_init() failed: there is nothing to destroy yet. */
        if (ret)
                goto err_pmu;

        if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) &&
            has_extended_regs(event)) {
                ret = -EOPNOTSUPP;
                goto err_destroy;
        }

        if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE &&
            event_has_any_exclude_flag(event)) {
                ret = -EINVAL;
                goto err_destroy;
        }

        /*
         * Scoped PMU with a CPU-bound event: the topology mask of the
         * event's CPU has to share a CPU with the PMU's online scope mask.
         */
        if (pmu->scope != PERF_PMU_SCOPE_NONE && event->cpu >= 0) {
                const struct cpumask *cpumask;
                struct cpumask *pmu_cpumask;
                int cpu;

                cpumask = perf_scope_cpu_topology_cpumask(pmu->scope, event->cpu);
                pmu_cpumask = perf_scope_cpumask(pmu->scope);

                ret = -ENODEV;
                if (!pmu_cpumask || !cpumask)
                        goto err_destroy;

                cpu = cpumask_any_and(pmu_cpumask, cpumask);
                if (cpu >= nr_cpu_ids)
                        goto err_destroy;

                event->event_caps |= PERF_EV_CAP_READ_SCOPE;
        }

        return 0;

err_destroy:
        /* Undo a successful event_init() through the event's destroy hook. */
        if (event->destroy) {
                event->destroy(event);
                event->destroy = NULL;
        }

err_pmu:
        event->pmu = NULL;
        module_put(pmu->module);
        return ret;
}

/*
 * Find the PMU that accepts @event and initialize the event on it.
 *
 * Lookup order: the parent event's PMU (if there is a parent), then the PMU
 * registered in pmu_idr under the event's type, and, only if no PMU is
 * registered under that type, every PMU on the pmus list in turn.
 *
 * Returns the PMU on success or an ERR_PTR(); -ENOENT means that no PMU
 * claimed the event. Runs under pmus_srcu for its whole duration.
 */
static struct pmu *perf_init_event(struct perf_event *event)
{
        bool extended_type = false;
        struct pmu *pmu;
        int type, ret;

        guard(srcu)(&pmus_srcu);

        /*
         * Save original type before calling pmu->event_init() since certain
         * pmus overwrites event->attr.type to forward event to another pmu.
         */
        event->orig_type = event->attr.type;

        /* Try parent's PMU first: */
        if (event->parent && event->parent->pmu) {
                pmu = event->parent->pmu;
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        return pmu;
        }

        /*
         * PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
         * are often aliases for PERF_TYPE_RAW.
         */
        type = event->attr.type;
        if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE) {
                /* The upper config bits may name a specific PMU type. */
                type = event->attr.config >> PERF_PMU_TYPE_SHIFT;
                if (!type) {
                        type = PERF_TYPE_RAW;
                } else {
                        extended_type = true;
                        event->attr.config &= PERF_HW_EVENT_MASK;
                }
        }

again:
        scoped_guard (rcu)
                pmu = idr_find(&pmu_idr, type);
        if (pmu) {
                /*
                 * The lookup type differs from the event's own type and is
                 * not RAW: the PMU has to support extended hardware types.
                 */
                if (event->attr.type != type && type != PERF_TYPE_RAW &&
                    !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
                        return ERR_PTR(-ENOENT);

                ret = perf_try_init_event(pmu, event);
                /*
                 * The PMU declined and the event's type differs from the one
                 * just tried (event_init() may have rewritten it, see
                 * above): retry the lookup with the event's type.
                 */
                if (ret == -ENOENT && event->attr.type != type && !extended_type) {
                        type = event->attr.type;
                        goto again;
                }

                if (ret)
                        return ERR_PTR(ret);

                return pmu;
        }

        /* No PMU under that type id: offer the event to every PMU. */
        list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
                ret = perf_try_init_event(pmu, event);
                if (!ret)
                        return pmu;

                /* Anything but "not mine" is a hard error for the event. */
                if (ret != -ENOENT)
                        return ERR_PTR(ret);
        }

        return ERR_PTR(-ENOENT);
}

static void attach_sb_event(struct perf_event *event)
{
        struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);

        raw_spin_lock(&pel->lock);
        list_add_rcu(&event->sb_list, &pel->list);
        raw_spin_unlock(&pel->lock);
}

/*
 * We keep a list of all !task (and therefore per-cpu) events
 * that need to receive side-band records.
 *
 * This avoids having to scan all the various PMU per-cpu contexts
 * looking for them.
 *
 * Events for which is_sb_event() is false are not put on that list.
 */
static void account_pmu_sb_event(struct perf_event *event)
{
        if (!is_sb_event(event))
                return;

        attach_sb_event(event);
}

/*
 * Freq events need the tick to stay alive (see perf_event_task_tick).
 *
 * Counts one more freq event and sets the perf tick dependency when this is
 * the first one. Compiles to nothing without CONFIG_NO_HZ_FULL.
 */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
        /* Lock so we don't race with concurrent unaccount */
        spin_lock(&nr_freq_lock);

        if (atomic_inc_return(&nr_freq_events) == 1)
                tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);

        spin_unlock(&nr_freq_lock);
#endif
}

/*
 * Count one more freq event. With nohz_full enabled this goes through
 * account_freq_event_nohz() so that the tick dependency is handled too.
 */
static void account_freq_event(void)
{
        if (!tick_nohz_full_enabled()) {
                atomic_inc(&nr_freq_events);
                return;
        }

        account_freq_event_nohz();
}


/*
 * Account a newly created event in the global counters.
 *
 * Bumps the nr_*_events counter of every side-band feature the event asks
 * for and, for events that need the scheduler hooks, takes a
 * perf_sched_count reference (enabling the perf_sched_events static branch
 * on the first one). Finally the event is put on the side-band list if it
 * qualifies. Events that have a parent are not accounted.
 */
static void account_event(struct perf_event *event)
{
        /* Set when the event needs a perf_sched_count reference. */
        bool inc = false;

        if (event->parent)
                return;

        if (event->attach_state & (PERF_ATTACH_TASK | PERF_ATTACH_SCHED_CB))
                inc = true;
        if (event->attr.mmap || event->attr.mmap_data)
                atomic_inc(&nr_mmap_events);
        if (event->attr.build_id)
                atomic_inc(&nr_build_id_events);
        if (event->attr.comm)
                atomic_inc(&nr_comm_events);
        if (event->attr.namespaces)
                atomic_inc(&nr_namespaces_events);
        if (event->attr.cgroup)
                atomic_inc(&nr_cgroup_events);
        if (event->attr.task)
                atomic_inc(&nr_task_events);
        if (event->attr.freq)
                account_freq_event();
        if (event->attr.context_switch) {
                atomic_inc(&nr_switch_events);
                inc = true;
        }
        if (has_branch_stack(event))
                inc = true;
        if (is_cgroup_event(event))
                inc = true;
        if (event->attr.ksymbol)
                atomic_inc(&nr_ksymbol_events);
        if (event->attr.bpf_event)
                atomic_inc(&nr_bpf_events);
        if (event->attr.text_poke)
                atomic_inc(&nr_text_poke_events);

        if (inc) {
                /*
                 * We need the mutex here because static_branch_enable()
                 * must complete *before* the perf_sched_count increment
                 * becomes visible.
                 */
                /* Fast path: the count is already non-zero, just add to it. */
                if (atomic_inc_not_zero(&perf_sched_count))
                        goto enabled;

                mutex_lock(&perf_sched_mutex);
                if (!atomic_read(&perf_sched_count)) {
                        static_branch_enable(&perf_sched_events);
                        /*
                         * Guarantee that all CPUs observe the key change and
                         * call the perf scheduling hooks before proceeding to
                         * install events that need them.
                         */
                        synchronize_rcu();
                }
                /*
                 * Now that we have waited for the sync_sched(), allow further
                 * increments to by-pass the mutex.
                 */
                atomic_inc(&perf_sched_count);
                mutex_unlock(&perf_sched_mutex);
        }
enabled:

        account_pmu_sb_event(event);
}

/*
 * Allocate and initialize an event structure
 *
 * @attr:             attributes of the event; copied into the new event
 * @cpu:              CPU to bind the event to; -1 is only accepted
 *                    together with a @task
 * @task:             task the event is attached to, or NULL
 * @group_leader:     leader of the event's group, or NULL to make the event
 *                    its own leader
 * @parent_event:     event this one is created from, or NULL; event caps,
 *                    clock and (absent an explicit handler) the overflow
 *                    handler are taken over from it
 * @overflow_handler: overflow callback, or NULL to pick one (see below)
 * @context:          opaque context stored for @overflow_handler
 * @cgroup_fd:        cgroup to connect the event to, or -1 for none
 *
 * Returns the new event or an ERR_PTR(). The event is declared with
 * __free(__free_event), so every error return after the allocation releases
 * it through that cleanup; only return_ptr() hands it out to the caller.
 */
static struct perf_event *
perf_event_alloc(struct perf_event_attr *attr, int cpu,
                 struct task_struct *task,
                 struct perf_event *group_leader,
                 struct perf_event *parent_event,
                 perf_overflow_handler_t overflow_handler,
                 void *context, int cgroup_fd)
{
        struct pmu *pmu;
        struct hw_perf_event *hwc;
        long err = -EINVAL;
        int node;

        /* An out-of-range CPU is only acceptable as "-1 with a task". */
        if ((unsigned)cpu >= nr_cpu_ids) {
                if (!task || cpu != -1)
                        return ERR_PTR(-EINVAL);
        }
        if (attr->sigtrap && !task) {
                /* Requires a task: avoid signalling random tasks. */
                return ERR_PTR(-EINVAL);
        }

        /* Allocate on the node of the target CPU when there is one. */
        node = (cpu >= 0) ? cpu_to_node(cpu) : -1;
        struct perf_event *event __free(__free_event) =
                kmem_cache_alloc_node(perf_event_cache, GFP_KERNEL | __GFP_ZERO, node);
        if (!event)
                return ERR_PTR(-ENOMEM);

        /*
         * Single events are their own group leaders, with an
         * empty sibling list:
         */
        if (!group_leader)
                group_leader = event;

        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);

        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->active_list);
        init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
        INIT_HLIST_NODE(&event->hlist_entry);


        init_waitqueue_head(&event->waitq);
        init_irq_work(&event->pending_irq, perf_pending_irq);
        event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable);
        init_task_work(&event->pending_task, perf_pending_task);

        mutex_init(&event->mmap_mutex);
        raw_spin_lock_init(&event->addr_filters.lock);

        atomic_long_set(&event->refcount, 1);
        event->cpu                = cpu;
        event->attr                = *attr;
        event->group_leader        = group_leader;
        event->pmu                = NULL;
        event->oncpu                = -1;

        event->parent                = parent_event;

        event->ns                = get_pid_ns(task_active_pid_ns(current));
        event->id                = atomic64_inc_return(&perf_event_id);

        event->state                = PERF_EVENT_STATE_INACTIVE;

        if (parent_event)
                event->event_caps = parent_event->event_caps;

        if (task) {
                event->attach_state = PERF_ATTACH_TASK;
                /*
                 * XXX pmu::event_init needs to know what task to account to
                 * and we cannot use the ctx information because we need the
                 * pmu before we get a ctx.
                 */
                event->hw.target = get_task_struct(task);
        }

        event->clock = &local_clock;
        if (parent_event)
                event->clock = parent_event->clock;

        /* Without an explicit handler, inherit the parent's (and its BPF prog). */
        if (!overflow_handler && parent_event) {
                overflow_handler = parent_event->overflow_handler;
                context = parent_event->overflow_handler_context;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
                if (parent_event->prog) {
                        struct bpf_prog *prog = parent_event->prog;

                        bpf_prog_inc(prog);
                        event->prog = prog;
                }
#endif
        }

        /* Still no handler: use the default output in the buffer's direction. */
        if (overflow_handler) {
                event->overflow_handler        = overflow_handler;
                event->overflow_handler_context = context;
        } else if (is_write_backward(event)){
                event->overflow_handler = perf_event_output_backward;
                event->overflow_handler_context = NULL;
        } else {
                event->overflow_handler = perf_event_output_forward;
                event->overflow_handler_context = NULL;
        }

        perf_event__state_init(event);

        pmu = NULL;

        /* Frequency-based events start out with a period of 1. */
        hwc = &event->hw;
        hwc->sample_period = attr->sample_period;
        if (attr->freq && attr->sample_freq)
                hwc->sample_period = 1;
        hwc->last_period = hwc->sample_period;

        local64_set(&hwc->period_left, hwc->sample_period);

        /*
         * We do not support PERF_SAMPLE_READ on inherited events unless
         * PERF_SAMPLE_TID is also selected, which allows inherited events to
         * collect per-thread samples.
         * See perf_output_read().
         */
        if (has_inherit_and_sample_read(attr) && !(attr->sample_type & PERF_SAMPLE_TID))
                return ERR_PTR(-EINVAL);

        if (!has_branch_stack(event))
                event->attr.branch_sample_type = 0;

        pmu = perf_init_event(event);
        if (IS_ERR(pmu))
                return (void*)pmu;

        /*
         * PERF_ATTACH_TASK_DATA is set in event_init()->hw_config().
         * The attach has to come right after perf_init_event(): otherwise an
         * error in between would make __free_event() detach a perf_ctx_data
         * that was never attached.
         */
        if (event->attach_state & PERF_ATTACH_TASK_DATA) {
                err = attach_perf_ctx_data(event);
                if (err)
                        return ERR_PTR(err);
        }

        /*
         * Disallow uncore-task events. Similarly, disallow uncore-cgroup
         * events (they don't make sense as the cgroup will be different
         * on other CPUs in the uncore mask).
         */
        if (pmu->task_ctx_nr == perf_invalid_context && (task || cgroup_fd != -1))
                return ERR_PTR(-EINVAL);

        /* aux_output needs PMU support and excludes aux_pause/aux_resume. */
        if (event->attr.aux_output &&
            (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) ||
             event->attr.aux_pause || event->attr.aux_resume))
                return ERR_PTR(-EOPNOTSUPP);

        if (event->attr.aux_pause && event->attr.aux_resume)
                return ERR_PTR(-EINVAL);

        if (event->attr.aux_start_paused) {
                if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE))
                        return ERR_PTR(-EOPNOTSUPP);
                event->hw.aux_paused = 1;
        }

        if (cgroup_fd != -1) {
                err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
                if (err)
                        return ERR_PTR(err);
        }

        err = exclusive_event_init(event);
        if (err)
                return ERR_PTR(err);

        if (has_addr_filter(event)) {
                event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
                                                    sizeof(struct perf_addr_filter_range),
                                                    GFP_KERNEL);
                if (!event->addr_filter_ranges)
                        return ERR_PTR(-ENOMEM);

                /*
                 * Clone the parent's vma offsets: they are valid until exec()
                 * even if the mm is not shared with the parent.
                 */
                if (event->parent) {
                        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);

                        raw_spin_lock_irq(&ifh->lock);
                        memcpy(event->addr_filter_ranges,
                               event->parent->addr_filter_ranges,
                               pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
                        raw_spin_unlock_irq(&ifh->lock);
                }

                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
        }

        /* Only events without a parent take a callchain buffer reference. */
        if (!event->parent) {
                if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
                        err = get_callchain_buffers(attr->sample_max_stack);
                        if (err)
                                return ERR_PTR(err);
                        event->attach_state |= PERF_ATTACH_CALLCHAIN;
                }
        }

        err = security_perf_event_alloc(event);
        if (err)
                return ERR_PTR(err);

        /* symmetric to unaccount_event() in _free_event() */
        account_event(event);

        return_ptr(event);
}

/*
 * perf_copy_attr - copy a perf_event_attr from user space and validate it
 * @uattr: user-space pointer to the attribute structure
 * @attr:  kernel-side copy to fill in; it is zeroed before anything is read
 *
 * Reads the size declared in @uattr (0 is treated as PERF_ATTR_SIZE_VER0),
 * copies that many bytes with copy_struct_from_user() and then rejects
 * non-zero reserved fields, unknown sample_type / read_format /
 * branch_sample_type bits and contradictory flag combinations.
 *
 * Besides validating, this may modify @attr: attr->size is set to the size
 * actually used, missing privilege-level bits are filled into
 * attr->branch_sample_type, and a zero attr->sample_max_stack is replaced by
 * sysctl_perf_event_max_stack.
 *
 * Returns 0 on success or a negative errno. On a size error sizeof(*attr) is
 * written back to uattr->size and -E2BIG is returned.
 */
static int perf_copy_attr(struct perf_event_attr __user *uattr,
                          struct perf_event_attr *attr)
{
        u32 size;
        int ret;

        /* Zero the full structure, so that a short copy will be nice. */
        memset(attr, 0, sizeof(*attr));

        /* The size declared by the caller decides how much gets copied. */
        ret = get_user(size, &uattr->size);
        if (ret)
                return ret;

        /* ABI compatibility quirk: */
        if (!size)
                size = PERF_ATTR_SIZE_VER0;
        if (size < PERF_ATTR_SIZE_VER0 || size > PAGE_SIZE)
                goto err_size;

        ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
        if (ret) {
                /* -E2BIG takes the size write-back path; others return as is. */
                if (ret == -E2BIG)
                        goto err_size;
                return ret;
        }

        /* Record the size that was used, including the VER0 substitution. */
        attr->size = size;

        if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3)
                return -EINVAL;

        if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
                return -EINVAL;

        if (attr->read_format & ~(PERF_FORMAT_MAX-1))
                return -EINVAL;

        if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
                u64 mask = attr->branch_sample_type;

                /* only using defined bits */
                if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
                        return -EINVAL;

                /* at least one branch bit must be set */
                if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
                        return -EINVAL;

                /* propagate priv level, when not set for branch */
                if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {

                        /* exclude_kernel checked on syscall entry */
                        if (!attr->exclude_kernel)
                                mask |= PERF_SAMPLE_BRANCH_KERNEL;

                        if (!attr->exclude_user)
                                mask |= PERF_SAMPLE_BRANCH_USER;

                        if (!attr->exclude_hv)
                                mask |= PERF_SAMPLE_BRANCH_HV;
                        /*
                         * adjust user setting (for HW filter setup)
                         */
                        attr->branch_sample_type = mask;
                }
                /* privileged levels capture (kernel, hv): check permissions */
                if (mask & PERF_SAMPLE_BRANCH_PERM_PLM) {
                        ret = perf_allow_kernel();
                        if (ret)
                                return ret;
                }
        }

        if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
                ret = perf_reg_validate(attr->sample_regs_user);
                if (ret)
                        return ret;
        }

        if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
                if (!arch_perf_have_user_stack_dump())
                        return -ENOSYS;

                /*
                 * We have __u32 type for the size, but so far
                 * we can only use __u16 as maximum due to the
                 * __u16 sample size limit.
                 */
                if (attr->sample_stack_user >= USHRT_MAX)
                        return -EINVAL;
                else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
                        return -EINVAL;
        }

        /* A zero sample_max_stack means "use the sysctl default". */
        if (!attr->sample_max_stack)
                attr->sample_max_stack = sysctl_perf_event_max_stack;

        /*
         * No early return here: the result stays in ret and is returned at
         * "out", unless one of the checks below returns -EINVAL first.
         */
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);

#ifndef CONFIG_CGROUP_PERF
        if (attr->sample_type & PERF_SAMPLE_CGROUP)
                return -EINVAL;
#endif
        /* The two weight sample formats are mutually exclusive. */
        if ((attr->sample_type & PERF_SAMPLE_WEIGHT) &&
            (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT))
                return -EINVAL;

        /* inherit_thread is only accepted together with inherit. */
        if (!attr->inherit && attr->inherit_thread)
                return -EINVAL;

        if (attr->remove_on_exec && attr->enable_on_exec)
                return -EINVAL;

        /* sigtrap is only accepted together with remove_on_exec. */
        if (attr->sigtrap && !attr->remove_on_exec)
                return -EINVAL;

out:
        return ret;

err_size:
        /*
         * Tell user space which structure size this kernel uses. The
         * put_user() result is not checked; -E2BIG is returned either way.
         */
        put_user(sizeof(*attr), &uattr->size);
        ret = -E2BIG;
        goto out;
}

/*
 * Acquire two mutexes in ascending address order, regardless of the order
 * in which the caller passed them. The second one is taken with the
 * SINGLE_DEPTH_NESTING subclass.
 */
static void mutex_lock_double(struct mutex *a, struct mutex *b)
{
        struct mutex *first = (b < a) ? b : a;
        struct mutex *second = (b < a) ? a : b;

        mutex_lock(first);
        mutex_lock_nested(second, SINGLE_DEPTH_NESTING);
}

/*
 * perf_event_set_output - redirect the output of @event
 * @event:        event whose ring buffer attachment is changed
 * @output_event: event whose ring buffer @event should share, or NULL, in
 *                which case ring_buffer_attach() is called with a NULL buffer
 *
 * Returns 0 on success. Returns -EINVAL when the two events are not
 * compatible (same event, different cpu/target/clock/write direction, or
 * AUX data from different PMUs), when @event has an active mmap(), or when
 * @output_event has no buffer that is still mmap()ed.
 */
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
        struct perf_buffer *rb = NULL;
        int ret = -EINVAL;

        /* Nothing to validate against; only event's own mutex is needed. */
        if (!output_event) {
                mutex_lock(&event->mmap_mutex);
                goto set;
        }

        /* don't allow circular references */
        if (event == output_event)
                goto out;

        /*
         * Don't allow cross-cpu buffers
         */
        if (output_event->cpu != event->cpu)
                goto out;

        /*
         * If it's not a per-cpu rb, it must be the same task.
         */
        if (output_event->cpu == -1 && output_event->hw.target != event->hw.target)
                goto out;

        /*
         * Mixing clocks in the same buffer is trouble you don't need.
         */
        if (output_event->clock != event->clock)
                goto out;

        /*
         * Either writing ring buffer from beginning or from end.
         * Mixing is not allowed.
         */
        if (is_write_backward(output_event) != is_write_backward(event))
                goto out;

        /*
         * If both events generate aux data, they must be on the same PMU
         */
        if (has_aux(event) && has_aux(output_event) &&
            event->pmu != output_event->pmu)
                goto out;

        /*
         * Hold both mmap_mutex to serialize against perf_mmap_close().  Since
         * output_event is already on rb->event_list, and the list iteration
         * restarts after every removal, it is guaranteed this new event is
         * observed *OR* if output_event is already removed, it's guaranteed we
         * observe !rb->mmap_count.
         */
        mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
        /* Can't redirect output if we've got an active mmap() */
        if (atomic_read(&event->mmap_count))
                goto unlock;

        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
                if (!rb)
                        goto unlock;

                /* did we race against perf_mmap_close() */
                if (!atomic_read(&rb->mmap_count)) {
                        ring_buffer_put(rb);
                        goto unlock;
                }
        }

        /* rb is still NULL here when no output_event was given. */
        ring_buffer_attach(event, rb);

        ret = 0;
unlock:
        /* output_event's mutex was only taken on the double-lock path. */
        mutex_unlock(&event->mmap_mutex);
        if (output_event)
                mutex_unlock(&output_event->mmap_mutex);

out:
        return ret;
}

/*
 * Select the time source of @event from the clock id @clk_id.
 *
 * CLOCK_MONOTONIC and CLOCK_MONOTONIC_RAW map to the "fast" accessors and
 * are always accepted. The remaining supported clocks are accepted only
 * when the event's PMU has PERF_PMU_CAP_NO_NMI set.
 *
 * Returns 0 on success and -EINVAL for an unknown clock id or for a clock
 * that is not permitted on this PMU. In the latter case event->clock has
 * already been overwritten.
 */
static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
{
        bool usable_in_nmi = false;

        switch (clk_id) {
        case CLOCK_MONOTONIC:
                usable_in_nmi = true;
                event->clock = &ktime_get_mono_fast_ns;
                break;

        case CLOCK_MONOTONIC_RAW:
                usable_in_nmi = true;
                event->clock = &ktime_get_raw_fast_ns;
                break;

        case CLOCK_REALTIME:
                event->clock = &ktime_get_real_ns;
                break;

        case CLOCK_BOOTTIME:
                event->clock = &ktime_get_boottime_ns;
                break;

        case CLOCK_TAI:
                event->clock = &ktime_get_clocktai_ns;
                break;

        default:
                return -EINVAL;
        }

        if (usable_in_nmi)
                return 0;

        /* Other clocks are fine only on PMUs flagged PERF_PMU_CAP_NO_NMI. */
        if (event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)
                return 0;

        return -EINVAL;
}

/*
 * Decide whether the current task may open an event described by @attr on
 * @task.
 *
 * Access is granted when perfmon_capable() holds (and, for sigtrap events,
 * CAP_KILL in the target's user namespace holds as well), or otherwise when
 * ptrace_may_access() allows it. For sigtrap events the ptrace check uses
 * the ATTACH mode instead of the READ mode.
 */
static bool
perf_check_permission(struct perf_event_attr *attr, struct task_struct *task)
{
        bool privileged = perfmon_capable();
        unsigned int mode;

        if (!attr->sigtrap) {
                mode = PTRACE_MODE_READ_REALCREDS;
        } else {
                bool may_signal;

                /*
                 * perf_event_attr::sigtrap sends signals to the other task.
                 * Require the current task to also have CAP_KILL.
                 */
                rcu_read_lock();
                may_signal = ns_capable(__task_cred(task)->user_ns, CAP_KILL);
                rcu_read_unlock();

                privileged = privileged && may_signal;

                /*
                 * Without the required capabilities we fall back to the
                 * ptrace check below; use ATTACH, since sending signals can
                 * effectively change the target task.
                 */
                mode = PTRACE_MODE_ATTACH_REALCREDS;
        }

        /*
         * The ptrace permission check is kept for backwards compatibility.
         * It also covers matching uids between the current task and the
         * target, which is why that is not checked here explicitly.
         */
        return privileged || ptrace_may_access(task, mode);
}

/**
 * sys_perf_event_open - open a performance event, associate it to a task/cpu
 *
 * @attr_uptr:  user pointer to the event attributes for monitoring/sampling
 * @pid:        target pid; with PERF_FLAG_PID_CGROUP it carries a cgroup fd
 * @cpu:        target cpu
 * @group_fd:   group leader event fd, or -1 for none
 * @flags:      perf event open flags (PERF_FLAG_*)
 *
 * Return: the new event file descriptor on success, a negative errno on
 * failure.
 */
SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_attr __user *, attr_uptr,
                pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
{
        struct perf_event *group_leader = NULL, *output_event = NULL;
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *event, *sibling;
        struct perf_event_attr attr;
        struct perf_event_context *ctx;
        struct file *event_file = NULL;
        struct task_struct *task = NULL;
        struct pmu *pmu;
        int event_fd;
        int move_group = 0;
        int err;
        int f_flags = O_RDWR;
        int cgroup_fd = -1;

        /* for future expandability... */
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;

        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;

        /* Do we allow access to perf_event_open(2) ? */
        err = security_perf_event_open(PERF_SECURITY_OPEN);
        if (err)
                return err;

        if (!attr.exclude_kernel) {
                err = perf_allow_kernel();
                if (err)
                        return err;
        }

        if (attr.namespaces) {
                if (!perfmon_capable())
                        return -EACCES;
        }

        /*
         * In frequency mode the requested rate is capped by the sysctl; in
         * period mode bit 63 of the period must be clear.
         */
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
        } else {
                if (attr.sample_period & (1ULL << 63))
                        return -EINVAL;
        }

        /* Only privileged users can get physical addresses */
        if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR)) {
                err = perf_allow_kernel();
                if (err)
                        return err;
        }

        /* REGS_INTR can leak data, lockdown must prevent this */
        if (attr.sample_type & PERF_SAMPLE_REGS_INTR) {
                err = security_locked_down(LOCKDOWN_PERF);
                if (err)
                        return err;
        }

        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
         * designates the cpu on which to monitor threads from that
         * cgroup.
         */
        if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
                return -EINVAL;

        if (flags & PERF_FLAG_FD_CLOEXEC)
                f_flags |= O_CLOEXEC;

        /*
         * Reserve the fd number up front. It is released through err_fd on
         * failure and only bound to the file by fd_install() at the very end.
         */
        event_fd = get_unused_fd_flags(f_flags);
        if (event_fd < 0)
                return event_fd;

        CLASS(fd, group)(group_fd);     // group_fd == -1 => empty
        if (group_fd != -1) {
                if (!is_perf_file(group)) {
                        err = -EBADF;
                        goto err_fd;
                }
                /*
                 * Depending on flags, the event behind group_fd is used as
                 * output target, as group leader, or as both.
                 */
                group_leader = fd_file(group)->private_data;
                if (flags & PERF_FLAG_FD_OUTPUT)
                        output_event = group_leader;
                if (flags & PERF_FLAG_FD_NO_GROUP)
                        group_leader = NULL;
        }

        /* In cgroup mode pid is an fd, so there is no task to look up. */
        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
                task = find_lively_task_by_vpid(pid);
                if (IS_ERR(task)) {
                        err = PTR_ERR(task);
                        goto err_fd;
                }
        }

        /* A per-task event must agree with its group leader on inherit. */
        if (task && group_leader &&
            group_leader->attr.inherit != attr.inherit) {
                err = -EINVAL;
                goto err_task;
        }

        if (flags & PERF_FLAG_PID_CGROUP)
                cgroup_fd = pid;

        event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
                                 NULL, NULL, cgroup_fd);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err_task;
        }

        /* Sampling is refused on PMUs flagged PERF_PMU_CAP_NO_INTERRUPT. */
        if (is_sampling_event(event)) {
                if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
                        err = -EOPNOTSUPP;
                        goto err_alloc;
                }
        }

        /*
         * Special case software events and allow them to be part of
         * any hardware group.
         */
        pmu = event->pmu;

        if (attr.use_clockid) {
                err = perf_event_set_clock(event, attr.clockid);
                if (err)
                        goto err_alloc;
        }

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        if (task) {
                err = down_read_interruptible(&task->signal->exec_update_lock);
                if (err)
                        goto err_alloc;

                /*
                 * We must hold exec_update_lock across this and any potential
                 * perf_install_in_context() call for this new event to
                 * serialize against exec() altering our credentials (and the
                 * perf_event_exit_task() that could imply).
                 */
                err = -EACCES;
                if (!perf_check_permission(&attr, task))
                        goto err_cred;
        }

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_cred;
        }

        mutex_lock(&ctx->mutex);

        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_locked;
        }

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, event->cpu);

                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_locked;
                }
        }

        if (group_leader) {
                /* Every rejection inside this block reports -EINVAL. */
                err = -EINVAL;

                /*
                 * Do not allow a recursive hierarchy (this new sibling
                 * becoming part of another group-sibling):
                 */
                if (group_leader->group_leader != group_leader)
                        goto err_locked;

                /* All events in a group should have the same clock */
                if (group_leader->clock != event->clock)
                        goto err_locked;

                /*
                 * Make sure we're both events for the same CPU;
                 * grouping events for different CPUs is broken; since
                 * you can never concurrently schedule them anyhow.
                 */
                if (group_leader->cpu != event->cpu)
                        goto err_locked;

                /*
                 * Make sure we're both on the same context; either task or cpu.
                 */
                if (group_leader->ctx != ctx)
                        goto err_locked;

                /*
                 * Only a group leader can be exclusive or pinned
                 */
                if (attr.exclusive || attr.pinned)
                        goto err_locked;

                if (is_software_event(event) &&
                    !in_software_context(group_leader)) {
                        /*
                         * If the event is a sw event, but the group_leader
                         * is on hw context.
                         *
                         * Allow the addition of software events to hw
                         * groups, this is safe because software events
                         * never fail to schedule.
                         *
                         * Note the comment that goes with struct
                         * perf_event_pmu_context.
                         */
                        pmu = group_leader->pmu_ctx->pmu;
                } else if (!is_software_event(event)) {
                        if (is_software_event(group_leader) &&
                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
                                /*
                                 * In case the group is a pure software group, and we
                                 * try to add a hardware event, move the whole group to
                                 * the hardware context.
                                 */
                                move_group = 1;
                        }

                        /* Don't allow group of multiple hw events from different pmus */
                        if (!in_software_context(group_leader) &&
                            group_leader->pmu_ctx->pmu != pmu)
                                goto err_locked;
                }
        }

        /*
         * Now that we're certain of the pmu; find the pmu_ctx.
         */
        pmu_ctx = find_get_pmu_context(pmu, ctx, event);
        if (IS_ERR(pmu_ctx)) {
                err = PTR_ERR(pmu_ctx);
                goto err_locked;
        }
        event->pmu_ctx = pmu_ctx;

        if (output_event) {
                err = perf_event_set_output(event, output_event);
                if (err)
                        goto err_context;
        }

        if (!perf_event_validate_size(event)) {
                err = -E2BIG;
                goto err_context;
        }

        if (perf_need_aux_event(event) && !perf_get_aux_event(event, group_leader)) {
                err = -EINVAL;
                goto err_context;
        }

        /*
         * Must be under the same ctx::mutex as perf_install_in_context(),
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_context;
        }

        WARN_ON_ONCE(ctx->parent_ctx);

        event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, f_flags);
        if (IS_ERR(event_file)) {
                err = PTR_ERR(event_file);
                event_file = NULL;
                goto err_context;
        }

        /*
         * This is the point of no return; we cannot fail hereafter. This is
         * where we start modifying current state.
         */

        if (move_group) {
                /* Detach the leader and all siblings, dropping their pmu_ctx. */
                perf_remove_from_context(group_leader, 0);
                put_pmu_ctx(group_leader->pmu_ctx);

                for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_pmu_ctx(sibling->pmu_ctx);
                }

                /*
                 * Install the group siblings before the group leader.
                 *
                 * Because a group leader will try and install the entire group
                 * (through the sibling list, which is still intact), we can
                 * end up with siblings installed in the wrong context.
                 *
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
                for_each_sibling_event(sibling, group_leader) {
                        sibling->pmu_ctx = pmu_ctx;
                        get_pmu_ctx(pmu_ctx);
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                }

                /*
                 * Removing from the context ends up with disabled
                 * event. What we want here is event in the initial
                 * startup state, ready to be added into the new context.
                 */
                group_leader->pmu_ctx = pmu_ctx;
                get_pmu_ctx(pmu_ctx);
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
        }

        /*
         * Precalculate sample_data sizes; do while holding ctx::mutex such
         * that we're serialized against further additions and before
         * perf_install_in_context() which is the point the event is active and
         * can use these values.
         */
        perf_event__header_size(event);
        perf_event__id_header_size(event);

        event->owner = current;

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);

        mutex_unlock(&ctx->mutex);

        if (task) {
                up_read(&task->signal->exec_update_lock);
                put_task_struct(task);
        }

        /* Track the event on the creating task's list of owned events. */
        mutex_lock(&current->perf_event_mutex);
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);

        /*
         * File reference in group guarantees that group_leader has been
         * kept alive until we place the new event on the sibling_list.
         * This ensures destruction of the group leader will find
         * the pointer to itself in perf_group_detach().
         */
        fd_install(event_fd, event_file);
        return event_fd;

        /*
         * Error unwinding: each label undoes what had been set up before the
         * corresponding failure point and then falls through to the labels
         * for the earlier steps.
         */
err_context:
        put_pmu_ctx(event->pmu_ctx);
        event->pmu_ctx = NULL; /* _free_event() */
err_locked:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_cred:
        if (task)
                up_read(&task->signal->exec_update_lock);
err_alloc:
        free_event(event);
err_task:
        if (task)
                put_task_struct(task);
err_fd:
        put_unused_fd(event_fd);
        return err;
}

/**
 * perf_event_create_kernel_counter - create and install an in-kernel event
 *
 * @attr: attributes of the counter to create
 * @cpu: cpu to which the counter is bound
 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context data could be used in overflow_handler callback
 *
 * The event is allocated without a group leader and without a cgroup, its
 * owner is set to TASK_TOMBSTONE, and it is installed into the context
 * returned by find_get_context().
 *
 * Return: the new event on success, or an ERR_PTR()-encoded negative errno
 * on failure.
 */
struct perf_event *
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                                 struct task_struct *task,
                                 perf_overflow_handler_t overflow_handler,
                                 void *context)
{
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event_context *ctx;
        struct perf_event *event;
        struct pmu *pmu;
        int err;

        /*
         * Grouping is not supported for kernel events, neither is 'AUX',
         * make sure the caller's intentions are adjusted.
         */
        if (attr->aux_output || attr->aux_action)
                return ERR_PTR(-EINVAL);

        event = perf_event_alloc(attr, cpu, task, NULL, NULL,
                                 overflow_handler, context, -1);
        if (IS_ERR(event)) {
                err = PTR_ERR(event);
                goto err;
        }

        /* Mark owner so we could distinguish it from user events. */
        event->owner = TASK_TOMBSTONE;
        pmu = event->pmu;

        if (pmu->task_ctx_nr == perf_sw_context)
                event->event_caps |= PERF_EV_CAP_SOFTWARE;

        /*
         * Get the target context (task or percpu):
         */
        ctx = find_get_context(task, event);
        if (IS_ERR(ctx)) {
                err = PTR_ERR(ctx);
                goto err_alloc;
        }

        WARN_ON_ONCE(ctx->parent_ctx);
        mutex_lock(&ctx->mutex);
        /* A tombstoned context is reported as -ESRCH. */
        if (ctx->task == TASK_TOMBSTONE) {
                err = -ESRCH;
                goto err_unlock;
        }

        pmu_ctx = find_get_pmu_context(pmu, ctx, event);
        if (IS_ERR(pmu_ctx)) {
                err = PTR_ERR(pmu_ctx);
                goto err_unlock;
        }
        event->pmu_ctx = pmu_ctx;

        if (!task) {
                /*
                 * Check if the @cpu we're creating an event for is online.
                 *
                 * We use the perf_cpu_context::ctx::mutex to serialize against
                 * the hotplug notifiers. See perf_event_{init,exit}_cpu().
                 */
                struct perf_cpu_context *cpuctx =
                        container_of(ctx, struct perf_cpu_context, ctx);
                if (!cpuctx->online) {
                        err = -ENODEV;
                        goto err_pmu_ctx;
                }
        }

        if (!exclusive_event_installable(event, ctx)) {
                err = -EBUSY;
                goto err_pmu_ctx;
        }

        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);

        return event;

        /*
         * Error unwinding: each label undoes one setup step and falls
         * through to the labels for the earlier steps.
         */
err_pmu_ctx:
        put_pmu_ctx(pmu_ctx);
        event->pmu_ctx = NULL; /* _free_event() */
err_unlock:
        mutex_unlock(&ctx->mutex);
        perf_unpin_context(ctx);
        put_ctx(ctx);
err_alloc:
        free_event(event);
err:
        return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);

static void __perf_pmu_remove(struct perf_event_context *ctx,
                              int cpu, struct pmu *pmu,
                              struct perf_event_groups *groups,
                              struct list_head *events)
{
        struct perf_event *event, *sibling;

        perf_event_groups_for_cpu_pmu(event, groups, cpu, pmu) {
                perf_remove_from_context(event, 0);
                put_pmu_ctx(event->pmu_ctx);
                list_add(&event->migrate_entry, events);

                for_each_sibling_event(sibling, event) {
                        perf_remove_from_context(sibling, 0);
                        put_pmu_ctx(sibling->pmu_ctx);
                        list_add(&sibling->migrate_entry, events);
                }
        }
}

/*
 * Install a previously removed @event into @ctx on @cpu for @pmu.
 *
 * A reference on @ctx is taken for the event, the event is rebound to @cpu
 * and to the matching pmu context, and once it has been installed the
 * reference on the context it came from is dropped.
 */
static void __perf_pmu_install_event(struct pmu *pmu,
                                     struct perf_event_context *ctx,
                                     int cpu, struct perf_event *event)
{
        /* Sampled before the install so it can be released afterwards. */
        struct perf_event_context *prev_ctx = event->ctx;

        get_ctx(ctx); /* normally find_get_context() */

        event->cpu = cpu;
        event->pmu_ctx = find_get_pmu_context(pmu, ctx, event);

        /* Any state at or above OFF is reset to INACTIVE for the install. */
        if (event->state >= PERF_EVENT_STATE_OFF)
                event->state = PERF_EVENT_STATE_INACTIVE;

        perf_install_in_context(ctx, event, cpu);

        /* event->ctx is updated and visible by now; let go of the old ctx. */
        put_ctx(prev_ctx);
}

/*
 * Install every event queued on @events into @ctx, emptying the list.
 *
 * This takes two passes. A leader enables its siblings even while those are
 * still on the old context, whereas siblings do not get enabled without a
 * leader. So the siblings are installed first and the leaders afterwards.
 */
static void __perf_pmu_install(struct perf_event_context *ctx,
                               int cpu, struct pmu *pmu, struct list_head *events)
{
        struct perf_event *pos, *next;

        /* Pass 1: siblings only; group leaders stay queued. */
        list_for_each_entry_safe(pos, next, events, migrate_entry) {
                if (pos->group_leader != pos) {
                        list_del(&pos->migrate_entry);
                        __perf_pmu_install_event(pmu, ctx, cpu, pos);
                }
        }

        /* Pass 2: whatever is left are the leaders; installing them starts the groups. */
        list_for_each_entry_safe(pos, next, events, migrate_entry) {
                list_del(&pos->migrate_entry);
                __perf_pmu_install_event(pmu, ctx, cpu, pos);
        }
}

void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
{
        struct perf_event_context *src_ctx, *dst_ctx;
        LIST_HEAD(events);

        /*
         * Since per-cpu context is persistent, no need to grab an extra
         * reference.
         */
        src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
        dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;

        /*
         * See perf_event_ctx_lock() for comments on the details
         * of swizzling perf_event::ctx.
         */
        mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);

        __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->pinned_groups, &events);
        __perf_pmu_remove(src_ctx, src_cpu, pmu, &src_ctx->flexible_groups, &events);

        if (!list_empty(&events)) {
                /*
                 * Wait for the events to quiesce before re-instating them.
                 */
                synchronize_rcu();

                __perf_pmu_install(dst_ctx, dst_cpu, pmu, &events);
        }

        mutex_unlock(&dst_ctx->mutex);
        mutex_unlock(&src_ctx->mutex);
}
EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);

/*
 * Fold the count and the enabled/running times of an inherited event into
 * the child_* accumulators of its parent.
 *
 * With attr.inherit_stat set, perf_event_read_event() is called for the
 * child first, provided its context still has a real task attached.
 */
static void sync_child_event(struct perf_event *child_event)
{
        struct perf_event *parent = child_event->parent;

        if (child_event->attr.inherit_stat) {
                struct task_struct *owner = child_event->ctx->task;

                if (owner && owner != TASK_TOMBSTONE)
                        perf_event_read_event(child_event, owner);
        }

        /* Add the child's count back onto the parent's count. */
        atomic64_add(perf_event_count(child_event, false),
                     &parent->child_count);

        /* The same goes for the time accounting. */
        atomic64_add(child_event->total_time_enabled,
                     &parent->child_total_time_enabled);
        atomic64_add(child_event->total_time_running,
                     &parent->child_total_time_running);
}

/*
 * Remove @event from its context with DETACH_EXIT set.
 *
 * An event without a parent is only removed and woken up; such events are
 * governed by their file descriptor and are kept. An inherited event is
 * additionally detached from its group and from its parent under the
 * parent's child_mutex, the parent is woken up, and the event is released
 * with put_event().
 */
static void
perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
{
        struct perf_event *parent = event->parent;

        if (!parent) {
                /*
                 * Parent events are governed by their filedesc, retain them.
                 */
                perf_remove_from_context(event, DETACH_EXIT);
                perf_event_wakeup(event);
                return;
        }

        /*
         * Do not destroy the 'original' grouping; because of the
         * context switch optimization the original events could've
         * ended up in a random child task.
         *
         * If we were to destroy the original group, all group related
         * operations would cease to function properly after this
         * random child dies.
         *
         * Do destroy all inherited groups, we don't care about those
         * and being thorough is better.
         */
        mutex_lock(&parent->child_mutex);
        perf_remove_from_context(event,
                                 DETACH_GROUP | DETACH_CHILD | DETACH_EXIT);
        mutex_unlock(&parent->child_mutex);

        /*
         * Kick perf_poll() for is_event_hup();
         */
        perf_event_wakeup(parent);

        /* Child events can be freed. */
        put_event(event);
}

/*
 * Tear down the perf event context of the exiting task @child.
 *
 * Expected to run on the exiting task itself (a warning is raised when
 * @child != current). The context is pinned, de-scheduled, detached from
 * the task and marked dead, after which every event still on its
 * event_list is handed to perf_event_exit_event(). Does nothing when the
 * task has no context.
 */
static void perf_event_exit_task_context(struct task_struct *child)
{
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        struct perf_event *child_event, *next;

        WARN_ON_ONCE(child != current);

        child_ctx = perf_pin_task_context(child);
        if (!child_ctx)
                return;

        /*
         * In order to reduce the amount of trickiness in ctx tear-down, we
         * hold ctx::mutex over the entire thing. This serializes against
         * almost everything that wants to access the ctx.
         *
         * The exception is sys_perf_event_open() /
         * perf_event_create_kernel_count() which does find_get_context()
         * without ctx::mutex (it cannot because of the move_group double mutex
         * lock thing). See the comments in perf_install_in_context().
         */
        mutex_lock(&child_ctx->mutex);

        /*
         * In a single ctx::lock section, de-schedule the events and detach the
         * context from the task such that we cannot ever get it scheduled back
         * in.
         */
        raw_spin_lock_irq(&child_ctx->lock);
        task_ctx_sched_out(child_ctx, NULL, EVENT_ALL);

        /*
         * Now that the context is inactive, destroy the task <-> ctx relation
         * and mark the context dead.
         */
        RCU_INIT_POINTER(child->perf_event_ctxp, NULL);
        put_ctx(child_ctx); /* cannot be last */
        WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
        put_task_struct(current); /* cannot be last */

        /*
         * unclone_ctx() hands back the context we were a clone of (if any);
         * its reference is dropped below, outside of ctx::lock.
         */
        clone_ctx = unclone_ctx(child_ctx);
        raw_spin_unlock_irq(&child_ctx->lock);

        if (clone_ctx)
                put_ctx(clone_ctx);

        /*
         * Report the task dead after unscheduling the events so that we
         * won't get any samples after PERF_RECORD_EXIT. We can however still
         * get a few PERF_RECORD_READ events.
         */
        perf_event_task(child, child_ctx, 0);

        /* The _safe iterator is used because each event is removed as we go. */
        list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
                perf_event_exit_event(child_event, child_ctx);

        mutex_unlock(&child_ctx->mutex);

        /* Drop the context reference that is still held at this point. */
        put_ctx(child_ctx);
}

/*
 * When a child task exits, feed back event values to parent events.
 *
 * Can be called with exec_update_lock held when called from
 * setup_new_exec().
 */
void perf_event_exit_task(struct task_struct *child)
{
        struct perf_event *event, *tmp;

        mutex_lock(&child->perf_event_mutex);
        list_for_each_entry_safe(event, tmp, &child->perf_event_list,
                                 owner_entry) {
                list_del_init(&event->owner_entry);

                /*
                 * Ensure the list deletion is visible before we clear
                 * the owner, closes a race against perf_release() where
                 * we need to serialize on the owner->perf_event_mutex.
                 */
                smp_store_release(&event->owner, NULL);
        }
        mutex_unlock(&child->perf_event_mutex);

        perf_event_exit_task_context(child);

        /*
         * The perf_event_exit_task_context calls perf_event_task
         * with child's task_ctx, which generates EXIT events for
         * child contexts and sets child->perf_event_ctxp[] to NULL.
         * At this point we need to send EXIT events to cpu contexts.
         */
        perf_event_task(child, NULL, 0);

        /*
         * Detach the perf_ctx_data for the system-wide event.
         */
        guard(percpu_read)(&global_ctx_data_rwsem);
        detach_task_ctx_data(child);
}

/*
 * Unlink the inherited @event from its parent's child_list and from @ctx,
 * then drop its reference. Events without a parent trigger a warning and
 * are left untouched.
 */
static void perf_free_event(struct perf_event *event,
                            struct perf_event_context *ctx)
{
        struct perf_event *parent_event = event->parent;

        if (WARN_ON_ONCE(!parent_event))
                return;

        /* Step 1: disappear from the parent's list of children. */
        mutex_lock(&parent_event->child_mutex);
        list_del_init(&event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        /* Step 2: leave the group and the context, under ctx::lock. */
        raw_spin_lock_irq(&ctx->lock);
        perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);

        /* Step 3: drop the event reference. */
        put_event(event);
}

/*
 * Free a context as created by inheritance by perf_event_init_task() below,
 * used by fork() when it fails.
 *
 * Even though the task has never lived, the context and events have been
 * exposed through the child_list, so we must take care tearing it all down.
 *
 * Does nothing when @task has no perf event context.
 */
void perf_event_free_task(struct task_struct *task)
{
        struct perf_event_context *ctx;
        struct perf_event *event, *tmp;

        ctx = rcu_access_pointer(task->perf_event_ctxp);
        if (!ctx)
                return;

        mutex_lock(&ctx->mutex);
        raw_spin_lock_irq(&ctx->lock);
        /*
         * Destroy the task <-> ctx relation and mark the context dead.
         *
         * This is important because even though the task hasn't been
         * exposed yet the context has been (through child_list).
         */
        RCU_INIT_POINTER(task->perf_event_ctxp, NULL);
        WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
        put_task_struct(task); /* cannot be last */
        raw_spin_unlock_irq(&ctx->lock);


        /* The _safe iterator is used because perf_free_event() unlinks each event. */
        list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
                perf_free_event(event, ctx);

        mutex_unlock(&ctx->mutex);

        /*
         * perf_event_release_kernel() could've stolen some of our
         * child events and still have them on its free_list. In that
         * case we must wait for these events to have been freed (in
         * particular all their references to this task must've been
         * dropped).
         *
         * Without this copy_process() will unconditionally free this
         * task (irrespective of its reference count) and
         * _free_event()'s put_task_struct(event->hw.target) will be a
         * use-after-free.
         *
         * Wait for all events to drop their context reference.
         */
        wait_var_event(&ctx->refcount, refcount_read(&ctx->refcount) == 1);
        put_ctx(ctx); /* must be last */
}

void perf_event_delayed_put(struct task_struct *task)
{
        WARN_ON_ONCE(task->perf_event_ctxp);
}

/*
 * Look up @fd and return its file, provided it is a perf event file.
 *
 * Returns the file with the reference taken by fget() on success, or
 * ERR_PTR(-EBADF) when @fd is not open or is not backed by perf_fops.
 */
struct file *perf_event_get(unsigned int fd)
{
        struct file *filp = fget(fd);

        if (!filp)
                goto bad_fd;

        if (filp->f_op == &perf_fops)
                return filp;

        /* Not one of ours: give back the reference from fget(). */
        fput(filp);
bad_fd:
        return ERR_PTR(-EBADF);
}

/*
 * Return the perf event stored in @file's private_data, or
 * ERR_PTR(-EINVAL) when @file is not backed by perf_fops.
 */
const struct perf_event *perf_get_event(struct file *file)
{
        if (file->f_op == &perf_fops)
                return file->private_data;

        return ERR_PTR(-EINVAL);
}

/*
 * Return a pointer to the attributes embedded in @event, or
 * ERR_PTR(-EINVAL) when @event is NULL.
 */
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
{
        if (event)
                return &event->attr;

        return ERR_PTR(-EINVAL);
}

/*
 * Check whether the caller may use kernel-level perf events.
 *
 * Returns -EACCES when sysctl_perf_event_paranoid is above 1 and the
 * caller is not perfmon_capable(); otherwise returns the result of the
 * security hook.
 */
int perf_allow_kernel(void)
{
        if (sysctl_perf_event_paranoid <= 1 || perfmon_capable())
                return security_perf_event_open(PERF_SECURITY_KERNEL);

        return -EACCES;
}
EXPORT_SYMBOL_GPL(perf_allow_kernel);

/*
 * Inherit an event from parent task to child task.
 *
 * @parent_event: the event to inherit; when it is itself an inherited
 *                event, the new event is linked to its ->parent instead
 * @parent:       parent task (not referenced in this function)
 * @parent_ctx:   parent context (not referenced in this function)
 * @child:        task the new event is allocated for
 * @group_leader: group leader passed to perf_event_alloc(), may be NULL
 * @child_ctx:    context the new event is added to
 *
 * Returns:
 *  - valid pointer on success
 *  - NULL for orphaned events
 *  - IS_ERR() on error
 */
static struct perf_event *
inherit_event(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event *group_leader,
              struct perf_event_context *child_ctx)
{
        /* Sampled from the event as passed in, before it is redirected below. */
        enum perf_event_state parent_state = parent_event->state;
        struct perf_event_pmu_context *pmu_ctx;
        struct perf_event *child_event;
        unsigned long flags;

        /*
         * Instead of creating recursive hierarchies of events,
         * we link inherited events back to the original parent,
         * which has a filp for sure, which we use as the reference
         * count:
         */
        if (parent_event->parent)
                parent_event = parent_event->parent;

        child_event = perf_event_alloc(&parent_event->attr,
                                           parent_event->cpu,
                                           child,
                                           group_leader, parent_event,
                                           NULL, NULL, -1);
        if (IS_ERR(child_event))
                return child_event;

        /* The new event takes its own reference on the child context. */
        get_ctx(child_ctx);
        child_event->ctx = child_ctx;

        pmu_ctx = find_get_pmu_context(child_event->pmu, child_ctx, child_event);
        if (IS_ERR(pmu_ctx)) {
                free_event(child_event);
                return ERR_CAST(pmu_ctx);
        }
        child_event->pmu_ctx = pmu_ctx;

        /*
         * is_orphaned_event() and list_add_tail(&parent_event->child_list)
         * must be under the same lock in order to serialize against
         * perf_event_release_kernel(), such that either we must observe
         * is_orphaned_event() or they will observe us on the child_list.
         */
        mutex_lock(&parent_event->child_mutex);
        if (is_orphaned_event(parent_event) ||
            !atomic_long_inc_not_zero(&parent_event->refcount)) {
                /* Orphaned (or dying) parent: report "nothing inherited". */
                mutex_unlock(&parent_event->child_mutex);
                free_event(child_event);
                return NULL;
        }

        /*
         * Make the child state follow the state of the parent event,
         * not its attr.disabled bit.  We hold the parent's mutex,
         * so we won't race with perf_event_{en, dis}able_family.
         */
        if (parent_state >= PERF_EVENT_STATE_INACTIVE)
                child_event->state = PERF_EVENT_STATE_INACTIVE;
        else
                child_event->state = PERF_EVENT_STATE_OFF;

        /* For frequency based events, start from the parent's current period. */
        if (parent_event->attr.freq) {
                u64 sample_period = parent_event->hw.sample_period;
                struct hw_perf_event *hwc = &child_event->hw;

                hwc->sample_period = sample_period;
                hwc->last_period   = sample_period;

                local64_set(&hwc->period_left, sample_period);
        }

        child_event->overflow_handler = parent_event->overflow_handler;
        child_event->overflow_handler_context
                = parent_event->overflow_handler_context;

        /*
         * Precalculate sample_data sizes
         */
        perf_event__header_size(child_event);
        perf_event__id_header_size(child_event);

        /*
         * Link it up in the child's context:
         */
        raw_spin_lock_irqsave(&child_ctx->lock, flags);
        add_event_to_ctx(child_event, child_ctx);
        child_event->attach_state |= PERF_ATTACH_CHILD;
        raw_spin_unlock_irqrestore(&child_ctx->lock, flags);

        /*
         * Link this into the parent event's child list
         */
        list_add_tail(&child_event->child_list, &parent_event->child_list);
        mutex_unlock(&parent_event->child_mutex);

        return child_event;
}

/*
 * Inherits an event group: first the leader, then each of its siblings.
 *
 * This will quietly suppress orphaned events; !inherit_event() is not an error.
 * This matches with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int inherit_group(struct perf_event *parent_event,
              struct task_struct *parent,
              struct perf_event_context *parent_ctx,
              struct task_struct *child,
              struct perf_event_context *child_ctx)
{
        struct perf_event *child_leader;
        struct perf_event *parent_sibling;

        child_leader = inherit_event(parent_event, parent, parent_ctx,
                                     child, NULL, child_ctx);
        if (IS_ERR(child_leader))
                return PTR_ERR(child_leader);

        /*
         * @child_leader can be NULL here because of is_orphaned_event(). In
         * this case inherit_event() will create individual events, similar
         * to what perf_group_detach() would do anyway.
         */
        for_each_sibling_event(parent_sibling, parent_event) {
                struct perf_event *child_sibling;

                child_sibling = inherit_event(parent_sibling, parent,
                                              parent_ctx, child,
                                              child_leader, child_ctx);
                if (IS_ERR(child_sibling))
                        return PTR_ERR(child_sibling);

                /* Only siblings using the leader as AUX event need more work. */
                if (!child_sibling || parent_sibling->aux_event != parent_event)
                        continue;

                if (!perf_get_aux_event(child_sibling, child_leader))
                        return -EINVAL;
        }

        if (child_leader)
                child_leader->group_generation = parent_event->group_generation;

        return 0;
}

/*
 * Creates the child task context (on first use) and tries to inherit the
 * event-group led by @event.
 *
 * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
 * inherited_all set when we 'fail' to inherit an orphaned event; this is
 * consistent with perf_event_release_kernel() removing all child events.
 *
 * Returns:
 *  - 0 on success
 *  - <0 on error
 */
static int
inherit_task_group(struct perf_event *event, struct task_struct *parent,
                   struct perf_event_context *parent_ctx,
                   struct task_struct *child,
                   u64 clone_flags, int *inherited_all)
{
        struct perf_event_context *child_ctx = child->perf_event_ctxp;
        int ret;

        if (!event->attr.inherit)
                goto not_inherited;

        /* Thread-only inheritance, but this clone is not a thread. */
        if (event->attr.inherit_thread && !(clone_flags & CLONE_THREAD))
                goto not_inherited;

        /* Do not inherit if sigtrap and signal handlers were cleared. */
        if (event->attr.sigtrap && (clone_flags & CLONE_CLEAR_SIGHAND))
                goto not_inherited;

        if (!child_ctx) {
                /*
                 * This is executed from the parent task context, so
                 * inherit events that have been marked for cloning.
                 * First allocate and initialize a context for the
                 * child.
                 */
                child_ctx = alloc_perf_context(child);
                if (!child_ctx)
                        return -ENOMEM;

                child->perf_event_ctxp = child_ctx;
        }

        ret = inherit_group(event, parent, parent_ctx, child, child_ctx);
        if (ret)
                *inherited_all = 0;

        return ret;

not_inherited:
        *inherited_all = 0;
        return 0;
}

/*
 * Initialize the perf_event context in task_struct
 *
 * Walks the pinned and flexible event groups of the parent (current) and
 * inherits each of them into @child through inherit_task_group(). When
 * every group could be inherited, the child context is marked as a clone
 * of the parent context (or of whatever the parent is a clone of).
 *
 * Returns 0 on success, including when the parent has no context, or the
 * negative error reported by inherit_task_group().
 */
static int perf_event_init_context(struct task_struct *child, u64 clone_flags)
{
        struct perf_event_context *child_ctx, *parent_ctx;
        struct perf_event_context *cloned_ctx;
        struct perf_event *event;
        struct task_struct *parent = current;
        int inherited_all = 1;
        unsigned long flags;
        int ret = 0;

        if (likely(!parent->perf_event_ctxp))
                return 0;

        /*
         * If the parent's context is a clone, pin it so it won't get
         * swapped under us.
         */
        parent_ctx = perf_pin_task_context(parent);
        if (!parent_ctx)
                return 0;

        /*
         * No need to check if parent_ctx != NULL here; since we saw
         * it non-NULL earlier, the only reason for it to become NULL
         * is if we exit, and since we're currently in the middle of
         * a fork we can't be exiting at the same time.
         */

        /*
         * Lock the parent list. No need to lock the child - not PID
         * hashed yet and not running, so nobody can access it.
         */
        mutex_lock(&parent_ctx->mutex);

        /*
         * We don't have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
        perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, clone_flags, &inherited_all);
                if (ret)
                        goto out_unlock;
        }

        /*
         * We can't hold ctx->lock when iterating the ->flexible_group list due
         * to allocations, but we need to prevent rotation because
         * rotate_ctx() will change the list from interrupt context.
         */
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);

        perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, clone_flags, &inherited_all);
                if (ret)
                        goto out_rotate_enable;
        }

out_rotate_enable:
        /*
         * Re-enable rotation on the failure path as well: bailing out
         * straight to out_unlock would leave rotate_disable set on the
         * parent's context.
         */
        raw_spin_lock_irqsave(&parent_ctx->lock, flags);
        parent_ctx->rotate_disable = 0;

        child_ctx = child->perf_event_ctxp;

        /* Never mark the child as a clone when inheritance failed (ret != 0). */
        if (!ret && child_ctx && inherited_all) {
                /*
                 * Mark the child context as a clone of the parent
                 * context, or of whatever the parent is a clone of.
                 *
                 * Note that if the parent is a clone, the holding of
                 * parent_ctx->lock avoids it from being uncloned.
                 */
                cloned_ctx = parent_ctx->parent_ctx;
                if (cloned_ctx) {
                        child_ctx->parent_ctx = cloned_ctx;
                        child_ctx->parent_gen = parent_ctx->parent_gen;
                } else {
                        child_ctx->parent_ctx = parent_ctx;
                        child_ctx->parent_gen = parent_ctx->generation;
                }
                get_ctx(child_ctx->parent_ctx);
        }

        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
out_unlock:
        mutex_unlock(&parent_ctx->mutex);

        perf_unpin_context(parent_ctx);
        put_ctx(parent_ctx);

        return ret;
}

/*
 * Reset the perf related fields of the new task @child and inherit the
 * parent's events into it. If inheritance fails, whatever was set up is
 * freed again and the error is returned; otherwise returns 0.
 */
int perf_event_init_task(struct task_struct *child, u64 clone_flags)
{
        int err;

        /* Start from a clean slate. */
        memset(child->perf_recursion, 0, sizeof(child->perf_recursion));
        child->perf_event_ctxp = NULL;
        mutex_init(&child->perf_event_mutex);
        INIT_LIST_HEAD(&child->perf_event_list);
        child->perf_ctx_data = NULL;

        err = perf_event_init_context(child, clone_flags);
        if (!err)
                return 0;

        perf_event_free_task(child);
        return err;
}

/*
 * Boot-time setup: allocate the perf online cpumasks and initialize the
 * per-CPU perf state (software event hash, sideband event list, sched
 * callback list and cpu context) of every possible CPU.
 */
static void __init perf_event_init_all_cpus(void)
{
        int cpu;

        zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
        zalloc_cpumask_var(&perf_online_core_mask, GFP_KERNEL);
        zalloc_cpumask_var(&perf_online_die_mask, GFP_KERNEL);
        zalloc_cpumask_var(&perf_online_cluster_mask, GFP_KERNEL);
        zalloc_cpumask_var(&perf_online_pkg_mask, GFP_KERNEL);
        zalloc_cpumask_var(&perf_online_sys_mask, GFP_KERNEL);

        for_each_possible_cpu(cpu) {
                struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
                struct perf_cpu_context *cpuctx;

                /* Software event hash table. */
                mutex_init(&swhash->hlist_mutex);

                /* Sideband events. */
                INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
                raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));

                /* Scheduler callbacks. */
                INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));

                /* The cpu context itself. */
                cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
                __perf_event_init_context(&cpuctx->ctx);
                lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
                cpuctx->heap_size = ARRAY_SIZE(cpuctx->heap_default);
                cpuctx->heap = cpuctx->heap_default;
        }
}

/*
 * Make sure @cpu has a software event hlist when there are users of it
 * (hlist_refcount > 0) but none has been allocated yet. An allocation
 * failure only triggers a warning; the (NULL) result is published anyway.
 */
static void perf_swevent_init_cpu(unsigned int cpu)
{
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
        struct swevent_hlist *hlist;

        mutex_lock(&swhash->hlist_mutex);

        if (swhash->hlist_refcount <= 0 || swevent_hlist_deref(swhash))
                goto unlock;

        hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
        WARN_ON(!hlist);
        rcu_assign_pointer(swhash->swevent_hlist, hlist);

unlock:
        mutex_unlock(&swhash->hlist_mutex);
}

#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
        struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context);
        struct perf_event_context *ctx = __info;
        struct perf_event *event;

        raw_spin_lock(&ctx->lock);
        ctx_sched_out(ctx, NULL, EVENT_TIME);
        list_for_each_entry(event, &ctx->event_list, event_entry)
                __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
        raw_spin_unlock(&ctx->lock);
}

/*
 * Remove @cpu from the perf online masks.
 *
 * For every PMU scope in which @cpu was the designated CPU, another CPU
 * from the same topology mask is picked as replacement (when there is
 * one), and the contexts of all PMUs with that scope are migrated to it.
 */
static void perf_event_clear_cpumask(unsigned int cpu)
{
        int target[PERF_PMU_MAX_SCOPE];
        unsigned int scope;
        struct pmu *pmu;

        cpumask_clear_cpu(cpu, perf_online_mask);

        /* Pass 1: pick a replacement CPU per scope; -1 means "none needed". */
        for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
                const struct cpumask *topo_mask = perf_scope_cpu_topology_cpumask(scope, cpu);
                struct cpumask *scope_mask = perf_scope_cpumask(scope);

                target[scope] = -1;
                if (WARN_ON_ONCE(!scope_mask || !topo_mask))
                        continue;

                if (cpumask_test_and_clear_cpu(cpu, scope_mask)) {
                        target[scope] = cpumask_any_but(topo_mask, cpu);
                        if (target[scope] < nr_cpu_ids)
                                cpumask_set_cpu(target[scope], scope_mask);
                }
        }

        /* Pass 2: migrate the scoped PMUs to their replacement CPU. */
        list_for_each_entry(pmu, &pmus, entry) {
                int dst;

                if (pmu->scope == PERF_PMU_SCOPE_NONE ||
                    WARN_ON_ONCE(pmu->scope >= PERF_PMU_MAX_SCOPE))
                        continue;

                dst = target[pmu->scope];
                if (dst >= 0 && dst < nr_cpu_ids)
                        perf_pmu_migrate_context(pmu, cpu, dst);
        }
}

/*
 * Take the perf cpu context of @cpu offline: update the cpumasks (and
 * migrate scoped PMUs), run __perf_event_exit_context() on @cpu and mark
 * the context as not online. Everything happens under pmus_lock.
 */
static void perf_event_exit_cpu_context(int cpu)
{
        struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
        struct perf_event_context *ctx = &cpuctx->ctx;

        // XXX simplify cpuctx->online
        mutex_lock(&pmus_lock);

        /*
         * Clear the cpumasks, and migrate to other CPUs if possible.
         * Must be invoked before the __perf_event_exit_context.
         */
        perf_event_clear_cpumask(cpu);

        mutex_lock(&ctx->mutex);
        smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
        cpuctx->online = 0;
        mutex_unlock(&ctx->mutex);

        mutex_unlock(&pmus_lock);
}
#else

/* Nothing to tear down without CONFIG_HOTPLUG_CPU or CONFIG_KEXEC_CORE. */
static void perf_event_exit_cpu_context(int cpu)
{
}

#endif

/*
 * Add @cpu to perf_online_mask and, where its domain does not have a
 * designated CPU yet, to the per-scope perf online masks.
 */
static void perf_event_setup_cpumask(unsigned int cpu)
{
        struct cpumask *pmu_cpumask;
        unsigned int scope;

        if (cpumask_empty(perf_online_mask)) {
                /*
                 * Early boot stage, the cpumask hasn't been set yet.
                 * The perf_online_<domain>_masks includes the first CPU of
                 * each domain. Always unconditionally set the boot CPU for
                 * the perf_online_<domain>_masks.
                 */
                for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
                        pmu_cpumask = perf_scope_cpumask(scope);
                        if (WARN_ON_ONCE(!pmu_cpumask))
                                continue;
                        cpumask_set_cpu(cpu, pmu_cpumask);
                }
        } else {
                for (scope = PERF_PMU_SCOPE_NONE + 1; scope < PERF_PMU_MAX_SCOPE; scope++) {
                        const struct cpumask *topo_mask = perf_scope_cpu_topology_cpumask(scope, cpu);

                        pmu_cpumask = perf_scope_cpumask(scope);

                        if (WARN_ON_ONCE(!pmu_cpumask || !topo_mask))
                                continue;

                        /* Claim the scope only if no CPU of this domain has it. */
                        if (!cpumask_empty(topo_mask) &&
                            cpumask_any_and(pmu_cpumask, topo_mask) >= nr_cpu_ids)
                                cpumask_set_cpu(cpu, pmu_cpumask);
                }
        }

        cpumask_set_cpu(cpu, perf_online_mask);
}

/*
 * Bring the perf state of @cpu online: set up its software event hlist,
 * update the perf online cpumasks and mark its cpu context online.
 * Always returns 0.
 */
int perf_event_init_cpu(unsigned int cpu)
{
        struct perf_cpu_context *cpuctx = per_cpu_ptr(&perf_cpu_context, cpu);
        struct perf_event_context *ctx = &cpuctx->ctx;

        perf_swevent_init_cpu(cpu);

        mutex_lock(&pmus_lock);
        perf_event_setup_cpumask(cpu);

        mutex_lock(&ctx->mutex);
        cpuctx->online = 1;
        mutex_unlock(&ctx->mutex);

        mutex_unlock(&pmus_lock);

        return 0;
}

/*
 * Take the perf state of @cpu offline. The context teardown returns
 * nothing, so this always reports success (0).
 */
int perf_event_exit_cpu(unsigned int cpu)
{
        perf_event_exit_cpu_context(cpu);

        return 0;
}

static int
perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
{
        int cpu;

        for_each_online_cpu(cpu)
                perf_event_exit_cpu(cpu);

        return NOTIFY_OK;
}

/*
 * Run the perf reboot notifier at the very last possible moment so that
 * the generic watchdog code runs as long as possible; hence the lowest
 * possible priority.
 */
static struct notifier_block perf_reboot_notifier = {
        .priority       = INT_MIN,
        .notifier_call  = perf_reboot,
};

/*
 * Boot-time initialization of the perf core: per-CPU state, the built-in
 * software PMUs, tracepoint support, the boot CPU, the reboot notifier,
 * hardware breakpoints and the perf_event slab cache.
 */
void __init perf_event_init(void)
{
        int err;

        /*
         * Build time assertion that we keep the data_head at the intended
         * location.  IOW, validation we got the __reserved[] size right.
         */
        BUILD_BUG_ON(offsetof(struct perf_event_mmap_page, data_head) != 1024);

        idr_init(&pmu_idr);

        perf_event_init_all_cpus();
        init_srcu_struct(&pmus_srcu);

        /* Built-in software PMUs. */
        perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
        perf_pmu_register(&perf_cpu_clock, "cpu_clock", -1);
        perf_pmu_register(&perf_task_clock, "task_clock", -1);
        perf_tp_register();

        perf_event_init_cpu(smp_processor_id());
        register_reboot_notifier(&perf_reboot_notifier);

        /* A hw_breakpoint failure is reported but does not stop the init. */
        err = init_hw_breakpoint();
        WARN(err, "hw_breakpoint initialization failed with: %d", err);

        perf_event_cache = KMEM_CACHE(perf_event, SLAB_PANIC);
}

/*
 * sysfs show callback for PMU event attributes: writes the attribute's
 * event string followed by a newline into @page and returns the number of
 * bytes written, or 0 when the attribute has no event string.
 */
ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
                              char *page)
{
        struct perf_pmu_events_attr *pmu_attr;

        pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
        if (!pmu_attr->event_str)
                return 0;

        return sprintf(page, "%s\n", pmu_attr->event_str);
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);

/*
 * Register the PMU bus and create a device for every already registered
 * PMU that does not have one yet. A failure to create a PMU device only
 * triggers a warning; only a bus registration failure is returned.
 */
static int __init perf_event_sysfs_init(void)
{
        struct pmu *pmu;
        int ret;

        mutex_lock(&pmus_lock);

        ret = bus_register(&pmu_bus);
        if (!ret) {
                list_for_each_entry(pmu, &pmus, entry) {
                        int err;

                        if (pmu->dev)
                                continue;

                        err = pmu_dev_alloc(pmu);
                        WARN(err, "Failed to register pmu: %s, reason %d\n", pmu->name, err);
                }
                pmu_bus_running = 1;
        }

        mutex_unlock(&pmus_lock);

        return ret;
}
device_initcall(perf_event_sysfs_init);

#ifdef CONFIG_CGROUP_PERF
/*
 * Allocate a perf_cgroup together with its per-CPU info and return its
 * embedded css, or ERR_PTR(-ENOMEM) when either allocation fails.
 * @parent_css is not used.
 */
static struct cgroup_subsys_state *
perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
        struct perf_cgroup *jc = kzalloc(sizeof(*jc), GFP_KERNEL);

        if (!jc)
                goto err_nomem;

        jc->info = alloc_percpu(struct perf_cgroup_info);
        if (!jc->info)
                goto err_free;

        return &jc->css;

err_free:
        kfree(jc);
err_nomem:
        return ERR_PTR(-ENOMEM);
}

/*
 * Release the perf_cgroup that embeds @css: first its per-CPU info, then
 * the structure itself.
 */
static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct perf_cgroup *cgrp;

        cgrp = container_of(css, struct perf_cgroup, css);
        free_percpu(cgrp->info);
        kfree(cgrp);
}

/*
 * css_online callback: hand the cgroup behind @css to perf_event_cgroup().
 * Always returns 0.
 */
static int perf_cgroup_css_online(struct cgroup_subsys_state *css)
{
        struct cgroup *cgrp = css->cgroup;

        perf_event_cgroup(cgrp);

        return 0;
}

/*
 * task_function_call() callback: with @info being the task, perform the
 * perf cgroup switch for it with preemption disabled. Always returns 0.
 */
static int __perf_cgroup_move(void *info)
{
        struct task_struct *moved = info;

        preempt_disable();
        perf_cgroup_switch(moved);
        preempt_enable();

        return 0;
}

static void perf_cgroup_attach(struct cgroup_taskset *tset)
{
        struct task_struct *task;
        struct cgroup_subsys_state *css;

        cgroup_taskset_for_each(task, css, tset)
                task_function_call(task, __perf_cgroup_move, task);
}

/* cgroup controller definition for perf events. */
struct cgroup_subsys perf_event_cgrp_subsys = {
        /* css lifetime callbacks */
        .css_alloc       = perf_cgroup_css_alloc,
        .css_online      = perf_cgroup_css_online,
        .css_free        = perf_cgroup_css_free,

        /* task migration */
        .attach          = perf_cgroup_attach,

        /*
         * Implicitly enable on dfl hierarchy so that perf events can
         * always be filtered by cgroup2 path as long as perf_event
         * controller is not mounted on a legacy hierarchy.
         */
        .implicit_on_dfl = true,
        .threaded        = true,
};
#endif /* CONFIG_CGROUP_PERF */

DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);





















































































  300 



  301 























  300 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/mmap.c
 *
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/io.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/types.h>

#include <asm/cpufeature.h>
#include <asm/page.h>

/*
 * Page protections indexed by the VM_READ/VM_WRITE/VM_EXEC/VM_SHARED
 * combination of a mapping. The two exec-only entries are switched to
 * PAGE_EXECONLY at init time when Enhanced PAN is available.
 */
static pgprot_t protection_map[16] __ro_after_init = {
        /* private mappings */
        [VM_NONE]                                  = PAGE_NONE,
        [VM_READ]                                  = PAGE_READONLY,
        [VM_WRITE]                                 = PAGE_READONLY,
        [VM_READ | VM_WRITE]                       = PAGE_READONLY,
        /* PAGE_EXECONLY if Enhanced PAN */
        [VM_EXEC]                                  = PAGE_READONLY_EXEC,
        [VM_READ | VM_EXEC]                        = PAGE_READONLY_EXEC,
        [VM_WRITE | VM_EXEC]                       = PAGE_READONLY_EXEC,
        [VM_READ | VM_WRITE | VM_EXEC]             = PAGE_READONLY_EXEC,

        /* shared mappings */
        [VM_SHARED]                                = PAGE_NONE,
        [VM_SHARED | VM_READ]                      = PAGE_READONLY,
        [VM_SHARED | VM_WRITE]                     = PAGE_SHARED,
        [VM_SHARED | VM_READ | VM_WRITE]           = PAGE_SHARED,
        /* PAGE_EXECONLY if Enhanced PAN */
        [VM_SHARED | VM_EXEC]                      = PAGE_READONLY_EXEC,
        [VM_SHARED | VM_READ | VM_EXEC]            = PAGE_READONLY_EXEC,
        [VM_SHARED | VM_WRITE | VM_EXEC]           = PAGE_SHARED_EXEC,
        [VM_SHARED | VM_READ | VM_WRITE | VM_EXEC] = PAGE_SHARED_EXEC
};

/*
 * You really shouldn't be using read() or write() on /dev/mem.  This might go
 * away in the future.
 *
 * Returns non-zero when [addr, addr + size) may be accessed, 0 otherwise.
 */
int valid_phys_addr_range(phys_addr_t addr, size_t size)
{
        /*
         * Check whether addr is covered by a memory region without the
         * MEMBLOCK_NOMAP attribute, and whether that region covers the
         * entire range. In theory, this could lead to false negatives
         * if the range is covered by distinct but adjacent memory regions
         * that only differ in other attributes. However, few of such
         * attributes have been defined, and it is debatable whether it
         * follows that /dev/mem read() calls should be able traverse
         * such boundaries.
         */
        if (!memblock_is_region_memory(addr, size))
                return 0;

        return memblock_is_map_memory(addr);
}

/*
 * Do not allow /dev/mem mappings beyond the supported physical range.
 *
 * Returns 1 when the end of the mapping (start of @pfn plus @size) has no
 * bits set outside PHYS_MASK, 0 otherwise.
 */
int valid_mmap_phys_addr_range(unsigned long pfn, size_t size)
{
        unsigned long end = (pfn << PAGE_SHIFT) + size;

        return (end & ~PHYS_MASK) == 0;
}

/*
 * Boot-time fixups of protection_map[] based on detected CPU features.
 * Always returns 0.
 */
static int __init adjust_protection_map(void)
{
        /*
         * With Enhanced PAN we can honour the execute-only permissions as
         * there is no PAN override with such mappings.
         */
        if (cpus_have_cap(ARM64_HAS_EPAN)) {
                protection_map[VM_EXEC | VM_SHARED] = PAGE_EXECONLY;
                protection_map[VM_EXEC] = PAGE_EXECONLY;
        }

        if (!lpa2_is_enabled())
                return 0;

        /* LPA2 is enabled: strip PTE_SHARED from every table entry. */
        for (int idx = 0; idx < ARRAY_SIZE(protection_map); idx++)
                pgprot_val(protection_map[idx]) &= ~PTE_SHARED;

        return 0;
}
arch_initcall(adjust_protection_map);

/*
 * Translate vma flags into the page protection value to use for the mapping.
 *
 * The base value comes from protection_map[] (or the GCS value for shadow
 * stacks); BTI, MTE and permission-overlay bits are then ORed in as
 * requested by @vm_flags.
 */
pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
        const unsigned long map_idx =
                vm_flags & (VM_READ | VM_WRITE | VM_EXEC | VM_SHARED);
        pteval_t val;

        /* Short circuit GCS to avoid bloating the table. */
        if (system_supports_gcs() && (vm_flags & VM_SHADOW_STACK))
                val = _PAGE_GCS_RO;
        else
                val = pgprot_val(protection_map[map_idx]);

        if (vm_flags & VM_ARM64_BTI)
                val |= PTE_GP;

        /*
         * A Normal Tagged memory type is returned only when (1) the user
         * asked for it with PROT_MTE in mmap() or mprotect() and (2) the
         * vma supports MTE. (1) is recorded as VM_MTE in vma->vm_flags and
         * (2) as VM_MTE_ALLOWED; the latter can only be set at mmap() time
         * because mprotect() does not accept MAP_* flags. Testing VM_MTE
         * alone is enough since arch_validate_flags() does not permit
         * (VM_MTE & !VM_MTE_ALLOWED).
         */
        if (vm_flags & VM_MTE)
                val |= PTE_ATTRINDX(MT_NORMAL_TAGGED);

#ifdef CONFIG_ARCH_HAS_PKEYS
        /* Copy the three pkey bits into the permission overlay index. */
        if (system_supports_poe()) {
                if (vm_flags & VM_PKEY_BIT0)
                        val |= PTE_PO_IDX_0;
                if (vm_flags & VM_PKEY_BIT1)
                        val |= PTE_PO_IDX_1;
                if (vm_flags & VM_PKEY_BIT2)
                        val |= PTE_PO_IDX_2;
        }
#endif

        return __pgprot(val);
}
EXPORT_SYMBOL(vm_get_page_prot);





































































    2 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * kernel/workqueue_internal.h
 *
 * Workqueue internal header file.  Only to be included by workqueue and
 * core kernel subsystems.
 */
#ifndef _KERNEL_WORKQUEUE_INTERNAL_H
#define _KERNEL_WORKQUEUE_INTERNAL_H

#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/preempt.h>

struct worker_pool;

/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * The letter prefixing each field comment below is that locking annotation.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
        /*
         * on idle list while idle, on busy hash table while busy
         *
         * The two link fields share storage (anonymous union), so only one
         * of them is meaningful at any given time.
         */
        union {
                struct list_head        entry;        /* L: while idle */
                struct hlist_node        hentry;        /* L: while busy */
        };

        /* description of the work item currently being processed */
        struct work_struct        *current_work;        /* K: work being processed and its */
        work_func_t                current_func;        /* K: function */
        struct pool_workqueue        *current_pwq;        /* K: pwq */
        u64                        current_at;        /* K: runtime at start or last wakeup */
        unsigned int                current_color;        /* K: color */

        int                        sleeping;        /* S: is worker sleeping? */

        /* used by the scheduler to determine a worker's last known identity */
        work_func_t                last_func;        /* K: last work's fn */

        struct list_head        scheduled;        /* L: scheduled works */

        struct task_struct        *task;                /* I: worker task */
        struct worker_pool        *pool;                /* A: the associated pool */
                                                /* L: for rescuers */
        struct list_head        node;                /* A: anchored at pool->workers */
                                                /* A: runs through worker->node */

        unsigned long                last_active;        /* K: last active timestamp */
        unsigned int                flags;                /* L: flags */
        int                        id;                /* I: worker id */

        /*
         * Opaque string set with work_set_desc().  Printed out with task
         * dump for debugging - WARN, BUG, panic or sysrq.
         * The buffer is a fixed WORKER_DESC_LEN bytes.
         */
        char                        desc[WORKER_DESC_LEN];

        /* used only by rescuers to point to the target workqueue */
        struct workqueue_struct        *rescue_wq;        /* I: the workqueue to rescue */
};

/**
 * current_wq_worker - return struct worker if %current is a workqueue worker
 *
 * Return: the worker attached to %current as kthread data when running in
 * task context with PF_WQ_WORKER set, %NULL otherwise.
 */
static inline struct worker *current_wq_worker(void)
{
        if (!in_task() || !(current->flags & PF_WQ_WORKER))
                return NULL;

        return kthread_data(current);
}

/*
 * Scheduler hooks for concurrency managed workqueue.  Only to be used from
 * sched/ and workqueue.c.
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
















































































































   36 




























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_NAMEI_H
#define _LINUX_NAMEI_H

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/path.h>
#include <linux/fcntl.h>
#include <linux/errno.h>

enum { MAX_NESTED_LINKS = 8 };

#define MAXSYMLINKS 40

/*
 * Type of the last component on LOOKUP_PARENT
 */
enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};

/* pathwalk mode */
#define LOOKUP_FOLLOW                BIT(0)        /* follow links at the end */
#define LOOKUP_DIRECTORY        BIT(1)        /* require a directory */
#define LOOKUP_AUTOMOUNT        BIT(2)  /* force terminal automount */
#define LOOKUP_EMPTY                BIT(3)        /* accept empty path [user_... only] */
#define LOOKUP_LINKAT_EMPTY        BIT(4) /* Linkat request with empty path. */
#define LOOKUP_DOWN                BIT(5)        /* follow mounts in the starting point */
#define LOOKUP_MOUNTPOINT        BIT(6)        /* follow mounts in the end */
#define LOOKUP_REVAL                BIT(7)        /* tell ->d_revalidate() to trust no cache */
#define LOOKUP_RCU                BIT(8)        /* RCU pathwalk mode; semi-internal */
#define LOOKUP_CACHED                BIT(9) /* Only do cached lookup */
#define LOOKUP_PARENT                BIT(10)        /* Looking up final parent in path */
/* 5 spare bits for pathwalk (bits 11-15) */

/* These tell filesystem methods that we are dealing with the final component... */
#define LOOKUP_OPEN                BIT(16)        /* ... in open */
#define LOOKUP_CREATE                BIT(17)        /* ... in object creation */
#define LOOKUP_EXCL                BIT(18)        /* ... in target must not exist */
#define LOOKUP_RENAME_TARGET        BIT(19)        /* ... in destination of rename() */

/* 4 spare bits for intent (bits 20-23) */

/* Scoping flags for lookup. */
#define LOOKUP_NO_SYMLINKS        BIT(24) /* No symlink crossing. */
#define LOOKUP_NO_MAGICLINKS        BIT(25) /* No nd_jump_link() crossing. */
#define LOOKUP_NO_XDEV                BIT(26) /* No mountpoint crossing. */
#define LOOKUP_BENEATH                BIT(27) /* No escaping from starting point. */
#define LOOKUP_IN_ROOT                BIT(28) /* Treat dirfd as fs root. */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
/* 3 spare bits for scoping (bits 29-31) */

extern int path_pts(struct path *path);

extern int user_path_at(int, const char __user *, unsigned, struct path *);

struct dentry *lookup_one_qstr_excl(const struct qstr *name,
                                    struct dentry *base,
                                    unsigned int flags);
extern int kern_path(const char *, unsigned, struct path *);

extern struct dentry *kern_path_create(int, const char *, struct path *, unsigned int);
extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int);
extern void done_path_create(struct path *, struct dentry *);
extern struct dentry *kern_path_locked(const char *, struct path *);
extern struct dentry *kern_path_locked_negative(const char *, struct path *);
extern struct dentry *user_path_locked_at(int , const char __user *, struct path *);
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
                           struct path *parent, struct qstr *last, int *type,
                           const struct path *root);
int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *,
                    unsigned int, struct path *);

extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int);
struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int);
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
                                   const char *name, struct dentry *base,
                                   int len);
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
                                            const char *name,
                                            struct dentry *base, int len);

extern int follow_down_one(struct path *);
extern int follow_down(struct path *path, unsigned int flags);
extern int follow_up(struct path *);

extern struct dentry *lock_rename(struct dentry *, struct dentry *);
extern struct dentry *lock_rename_child(struct dentry *, struct dentry *);
extern void unlock_rename(struct dentry *, struct dentry *);

/**
 * mode_strip_umask - handle vfs umask stripping
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode to be created in @dir
 *
 * For most filesystems, whether the VFS strips the umask depends on POSIX
 * ACL support. Without POSIX ACL support the umask is stripped right here.
 * With POSIX ACL support the stripping is deferred until the filesystem
 * calls posix_acl_create().
 *
 * Some filesystems (like NFSv4) also want to avoid umask stripping by the
 * VFS, but don't support POSIX ACLs. Those filesystems can set SB_I_NOUMASK
 * to get this effect without declaring that they support POSIX ACLs.
 *
 * Returns: mode
 */
static inline umode_t __must_check mode_strip_umask(const struct inode *dir, umode_t mode)
{
        /* Stripping is left to the filesystem in both of these cases. */
        if (IS_POSIXACL(dir) || (dir->i_sb->s_iflags & SB_I_NOUMASK))
                return mode;

        return mode & ~current_umask();
}

extern int __must_check nd_jump_link(const struct path *path);

/*
 * NUL-terminate the link body in @name at offset @len, clamped to @maxlen.
 */
static inline void nd_terminate_link(void *name, size_t len, size_t maxlen)
{
        char *link = name;

        link[min(len, maxlen)] = '\0';
}

/**
 * retry_estale - determine whether the caller should retry an operation
 * @error: the error that would currently be returned
 * @flags: flags being used for next lookup attempt
 *
 * Check to see if the error code was -ESTALE, and then determine whether
 * to retry the call based on whether "flags" already has LOOKUP_REVAL set.
 *
 * Returns true if the caller should try the operation again.
 */
static inline bool
retry_estale(const long error, const unsigned int flags)
{
        return unlikely(error == -ESTALE && !(flags & LOOKUP_REVAL));
}

#endif /* _LINUX_NAMEI_H */
















































































































































































































































    2 





    2 

    2 











































































































   46 


   46 






   59 



   59 
















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 ARM Ltd.
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/kvm_host.h>
#include <linux/irqchip/arm-gic-v3.h>

#include "vgic.h"

/*
 * How KVM uses GICv4 (insert rude comments here):
 *
 * The vgic-v4 layer acts as a bridge between several entities:
 * - The GICv4 ITS representation offered by the ITS driver
 * - VFIO, which is in charge of the PCI endpoint
 * - The virtual ITS, which is the only thing the guest sees
 *
 * The configuration of VLPIs is triggered by a callback from VFIO,
 * instructing KVM that a PCI device has been configured to deliver
 * MSIs to a vITS.
 *
 * kvm_vgic_v4_set_forwarding() is thus called with the routing entry,
 * and this is used to find the corresponding vITS data structures
 * (ITS instance, device, event and irq) using a process that is
 * extremely similar to the injection of an MSI.
 *
 * At this stage, we can link the guest's view of an LPI (uniquely
 * identified by the routing entry) and the host irq, using the GICv4
 * driver mapping operation. Should the mapping succeed, we've then
 * successfully upgraded the guest's LPI to a VLPI. We can then start
 * with updating GICv4's view of the property table and generating an
 * INValidation in order to kickstart the delivery of this VLPI to the
 * guest directly, without software intervention. Well, almost.
 *
 * When the PCI endpoint is deconfigured, this operation is reversed
 * with VFIO calling kvm_vgic_v4_unset_forwarding().
 *
 * Once the VLPI has been mapped, it needs to follow any change the
 * guest performs on its LPI through the vITS. For that, a number of
 * command handlers have hooks to communicate these changes to the HW:
 * - Any invalidation triggers a call to its_prop_update_vlpi()
 * - The INT command results in an irq_set_irqchip_state(), which
 *   generates an INT on the corresponding VLPI.
 * - The CLEAR command results in an irq_set_irqchip_state(), which
 *   generates a CLEAR on the corresponding VLPI.
 * - DISCARD translates into an unmap, similar to a call to
 *   kvm_vgic_v4_unset_forwarding().
 * - MOVI is translated by an update of the existing mapping, changing
 *   the target vcpu, resulting in a VMOVI being generated.
 * - MOVALL is translated by a string of mapping updates (similar to
 *   the handling of MOVI). MOVALL is horrible.
 *
 * Note that a DISCARD/MAPTI sequence emitted from the guest without
 * reprogramming the PCI endpoint after MAPTI does not result in a
 * VLPI being mapped, as there is no callback from VFIO (the guest
 * will get the interrupt via the normal SW injection). Fixing this is
 * not trivial, and requires some horrible messing with the VFIO
 * internals. Not fun. Don't do that.
 *
 * Then there is the scheduling. Each time a vcpu is about to run on a
 * physical CPU, KVM must tell the corresponding redistributor about
 * it. And if we've migrated our vcpu from one CPU to another, we must
 * tell the ITS (so that the messages reach the right redistributor).
 * This is done in two steps: first issue an irq_set_affinity() on the
 * irq corresponding to the vcpu, then call its_make_vpe_resident().
 * You must be in a non-preemptible context. On exit, a call to
 * its_make_vpe_non_resident() tells the redistributor that we're done
 * with the vcpu.
 *
 * Finally, the doorbell handling: Each vcpu is allocated an interrupt
 * which will fire each time a VLPI is made pending whilst the vcpu is
 * not running. Each time the vcpu gets blocked, the doorbell
 * interrupt gets enabled. When the vcpu is unblocked (for whatever
 * reason), the doorbell interrupt is disabled.
 */

#define DB_IRQ_FLAGS        (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING)

static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info)
{
        struct kvm_vcpu *vcpu = info;

        /* We got the message, no need to fire again */
        if (!kvm_vgic_global_state.has_gicv4_1 &&
            !irqd_irq_disabled(&irq_to_desc(irq)->irq_data))
                disable_irq_nosync(irq);

        /*
         * The v4.1 doorbell can fire concurrently with the vPE being
         * made non-resident. Ensure we only update pending_last
         * *after* the non-residency sequence has completed.
         */
        raw_spin_lock(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vpe_lock);
        vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true;
        raw_spin_unlock(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vpe_lock);

        kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
        kvm_vcpu_kick(vcpu);

        return IRQ_HANDLED;
}

static void vgic_v4_sync_sgi_config(struct its_vpe *vpe, struct vgic_irq *irq)
{
        vpe->sgi_config[irq->intid].enabled        = irq->enabled;
        vpe->sgi_config[irq->intid].group         = irq->group;
        vpe->sgi_config[irq->intid].priority        = irq->priority;
}

/*
 * Turn every SGI of @vcpu that is not already HW-backed into a HW
 * interrupt tied to a host IRQ from the vPE's SGI domain.
 */
static void vgic_v4_enable_vsgis(struct kvm_vcpu *vcpu)
{
        struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
        int sgi;

        /*
         * With GICv4.1, every virtual SGI can be directly injected. So
         * let's pretend that they are HW interrupts, tied to a host
         * IRQ. The SGI code will do its magic.
         */
        for (sgi = 0; sgi < VGIC_NR_SGIS; sgi++) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi);
                unsigned long flags;

                raw_spin_lock_irqsave(&irq->irq_lock, flags);

                /* SGIs that are already HW-backed are left alone. */
                if (!irq->hw) {
                        struct irq_desc *desc;
                        int ret;

                        irq->hw = true;
                        irq->host_irq = irq_find_mapping(vpe->sgi_domain, sgi);

                        /* Transfer the full irq state to the vPE */
                        vgic_v4_sync_sgi_config(vpe, irq);
                        desc = irq_to_desc(irq->host_irq);
                        ret = irq_domain_activate_irq(irq_desc_get_irq_data(desc),
                                                      false);
                        if (!WARN_ON(ret)) {
                                /* Transfer pending state */
                                ret = irq_set_irqchip_state(irq->host_irq,
                                                            IRQCHIP_STATE_PENDING,
                                                            irq->pending_latch);
                                WARN_ON(ret);
                                irq->pending_latch = false;
                        }
                }

                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Revert every HW-backed SGI of @vcpu to a software interrupt: read the
 * pending state back from the host IRQ into pending_latch, then deactivate
 * the host IRQ.
 */
static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
{
        int sgi;

        for (sgi = 0; sgi < VGIC_NR_SGIS; sgi++) {
                struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, sgi);
                unsigned long flags;

                raw_spin_lock_irqsave(&irq->irq_lock, flags);

                /* Nothing to undo for SGIs that are not HW-backed. */
                if (irq->hw) {
                        struct irq_desc *desc;
                        int ret;

                        irq->hw = false;
                        ret = irq_get_irqchip_state(irq->host_irq,
                                                    IRQCHIP_STATE_PENDING,
                                                    &irq->pending_latch);
                        WARN_ON(ret);

                        desc = irq_to_desc(irq->host_irq);
                        irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
                }

                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
                vgic_put_irq(vcpu->kvm, irq);
        }
}

/*
 * Apply the distributor's nassgireq setting to every vcpu of @kvm:
 * HW-backed SGIs when it is set, software SGIs otherwise. The guest is
 * halted for the duration of the switch. The caller must hold
 * kvm->arch.config_lock.
 */
void vgic_v4_configure_vsgis(struct kvm *kvm)
{
        struct vgic_dist *vgic = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
        unsigned long idx;

        lockdep_assert_held(&kvm->arch.config_lock);

        kvm_arm_halt_guest(kvm);

        kvm_for_each_vcpu(idx, vcpu, kvm) {
                if (!vgic->nassgireq)
                        vgic_v4_disable_vsgis(vcpu);
                else
                        vgic_v4_enable_vsgis(vcpu);
        }

        kvm_arm_resume_guest(kvm);
}

/*
 * Must be called with GICv4.1 and the vPE unmapped, which
 * indicates the invalidation of any VPT caches associated
 * with the vPE, thus we can get the VLPI state by peeking
 * at the VPT.
 *
 * The VPT is read as a bitmap indexed by INTID; @val is set to the bit
 * belonging to @irq.
 */
void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val)
{
        struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
        u8 *vpt = page_address(vpe->vpt_page);
        u8 byte = vpt[irq->intid / BITS_PER_BYTE];

        *val = (byte >> (irq->intid % BITS_PER_BYTE)) & 1;
}

/*
 * Install vgic_v4_doorbell_handler() as the handler for @irq, with @vcpu
 * as the cookie handed back to the handler. The interrupt is requested
 * with no flags and the name "vcpu".
 *
 * Returns whatever request_irq() returns.
 */
int vgic_v4_request_vpe_irq(struct kvm_vcpu *vcpu, int irq)
{
        return request_irq(irq, vgic_v4_doorbell_handler, 0, "vcpu", vcpu);
}

/**
 * vgic_v4_init - Initialize the GICv4 data structures
 * @kvm:        Pointer to the VM being initialized
 *
 * We may be called each time a vITS is created, or when the
 * vgic is initialized. In both cases, the number of vcpus
 * should now be fixed.
 *
 * Must be called with kvm->arch.config_lock held.
 *
 * Return: 0 on success or when there is nothing to do (no GICv4, or the
 * vPE array already exists), -ENOMEM if the vPE array cannot be allocated,
 * otherwise the error returned by its_alloc_vcpu_irqs() or
 * vgic_v4_request_vpe_irq().
 */
int vgic_v4_init(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
        int nr_vcpus, ret;
        unsigned long i;

        lockdep_assert_held(&kvm->arch.config_lock);

        if (!kvm_vgic_global_state.has_gicv4)
                return 0; /* Nothing to see here... move along. */

        /* The vPE array is already in place from an earlier call. */
        if (dist->its_vm.vpes)
                return 0;

        nr_vcpus = atomic_read(&kvm->online_vcpus);

        dist->its_vm.vpes = kcalloc(nr_vcpus, sizeof(*dist->its_vm.vpes),
                                    GFP_KERNEL_ACCOUNT);
        if (!dist->its_vm.vpes)
                return -ENOMEM;

        dist->its_vm.nr_vpes = nr_vcpus;

        /* Each slot points at the its_vpe embedded in the matching vcpu. */
        kvm_for_each_vcpu(i, vcpu, kvm)
                dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;

        ret = its_alloc_vcpu_irqs(&dist->its_vm);
        if (ret < 0) {
                kvm_err("VPE IRQ allocation failure\n");
                /*
                 * Drop the array and reset the bookkeeping so that the
                 * "already initialized" check above does not trigger on a
                 * later call.
                 */
                kfree(dist->its_vm.vpes);
                dist->its_vm.nr_vpes = 0;
                dist->its_vm.vpes = NULL;
                return ret;
        }

        /* Configure and request the doorbell interrupt of every vPE. */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                int irq = dist->its_vm.vpes[i]->irq;
                unsigned long irq_flags = DB_IRQ_FLAGS;

                /*
                 * Don't automatically enable the doorbell, as we're
                 * flipping it back and forth when the vcpu gets
                 * blocked. Also disable the lazy disabling, as the
                 * doorbell could kick us out of the guest too
                 * early...
                 *
                 * On GICv4.1, the doorbell is managed in HW and must
                 * be left enabled.
                 */
                if (kvm_vgic_global_state.has_gicv4_1)
                        irq_flags &= ~IRQ_NOAUTOEN;
                irq_set_status_flags(irq, irq_flags);

                ret = vgic_v4_request_vpe_irq(vcpu, irq);
                if (ret) {
                        kvm_err("failed to allocate vcpu IRQ%d\n", irq);
                        /*
                         * Trick: adjust the number of vpes so we know
                         * how many to nuke on teardown...
                         */
                        dist->its_vm.nr_vpes = i;
                        break;
                }
        }

        /*
         * A doorbell request failed: tear down what has been set up so far
         * (nr_vpes was trimmed above to the number of requested doorbells).
         */
        if (ret)
                vgic_v4_teardown(kvm);

        return ret;
}

/**
 * vgic_v4_teardown - Free the GICv4 data structures
 * @kvm:        Pointer to the VM being destroyed
 *
 * Releases the doorbell interrupt of each of the first nr_vpes vPEs, then
 * the vPE interrupts and the vPE array itself. Does nothing if the array
 * was never allocated. Must be called with kvm->arch.config_lock held.
 */
void vgic_v4_teardown(struct kvm *kvm)
{
        struct its_vm *vm = &kvm->arch.vgic.its_vm;
        int idx;

        lockdep_assert_held(&kvm->arch.config_lock);

        if (!vm->vpes)
                return;

        for (idx = 0; idx < vm->nr_vpes; idx++) {
                struct kvm_vcpu *owner = kvm_get_vcpu(kvm, idx);
                int db_irq = vm->vpes[idx]->irq;

                irq_clear_status_flags(db_irq, DB_IRQ_FLAGS);
                free_irq(db_irq, owner);
        }

        its_free_vcpu_irqs(vm);
        kfree(vm->vpes);
        vm->vpes = NULL;
        vm->nr_vpes = 0;
}

/*
 * Decide whether a doorbell should be requested when the vPE of @vcpu is
 * made non-resident.
 */
static inline bool vgic_v4_want_doorbell(struct kvm_vcpu *vcpu)
{
        bool want = true;

        if (!vcpu_get_flag(vcpu, IN_WFI)) {
                if (likely(!vcpu_has_nv(vcpu))) {
                        want = false;
                } else {
                        /*
                         * GICv4 hardware is only ever used for the L1. Mark
                         * the vPE (i.e. the L1 context) nonresident and
                         * request a doorbell to kick us out of the L2 when
                         * an IRQ becomes pending.
                         */
                        want = vcpu_get_flag(vcpu, IN_NESTED_ERET);
                }
        }

        return want;
}

/*
 * Make the vPE of @vcpu non-resident, asking for a doorbell when
 * vgic_v4_want_doorbell() says so. Returns 0 without doing anything when
 * direct MSIs are not supported or the vPE is not resident.
 */
int vgic_v4_put(struct kvm_vcpu *vcpu)
{
        struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;

        if (!vgic_supports_direct_msis(vcpu->kvm))
                return 0;

        if (!vpe->resident)
                return 0;

        return its_make_vpe_non_resident(vpe, vgic_v4_want_doorbell(vcpu));
}

/*
 * Make the vPE of @vcpu resident on the current CPU. Returns 0 without
 * doing anything when direct MSIs are not supported, the vPE is already
 * resident, or the vcpu is flagged IN_WFI. Otherwise returns 0 on success
 * or the first error encountered.
 */
int vgic_v4_load(struct kvm_vcpu *vcpu)
{
        struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;
        int err;

        if (!vgic_supports_direct_msis(vcpu->kvm))
                return 0;

        if (vpe->resident)
                return 0;

        if (vcpu_get_flag(vcpu, IN_WFI))
                return 0;

        /*
         * Before making the VPE resident, make sure the redistributor
         * corresponding to our current CPU expects us here. See the
         * doc in drivers/irqchip/irq-gic-v4.c to understand how this
         * turns into a VMOVP command at the ITS level.
         */
        err = irq_set_affinity(vpe->irq, cpumask_of(smp_processor_id()));
        if (err)
                return err;

        err = its_make_vpe_resident(vpe, false, vcpu->kvm->arch.vgic.enabled);
        if (err)
                return err;

        /* Nothing more to do on GICv4.1. */
        if (kvm_vgic_global_state.has_gicv4_1)
                return 0;

        /*
         * Now that the VPE is resident, let's get rid of a potential
         * doorbell interrupt that would still be pending. This is a
         * GICv4.0 only "feature"...
         */
        return irq_set_irqchip_state(vpe->irq, IRQCHIP_STATE_PENDING, false);
}

/*
 * Commit the vPE of @vcpu unless it is already marked ready.
 */
void vgic_v4_commit(struct kvm_vcpu *vcpu)
{
        struct its_vpe *vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe;

        /*
         * No need to wait for the vPE to be ready across a shallow guest
         * exit, as only a vcpu_put will invalidate it.
         */
        if (vpe->ready)
                return;

        its_commit_vpe(vpe);
}

/*
 * Look up the vITS targeted by the MSI described in @irq_entry, by
 * building a struct kvm_msi from it and handing that to
 * vgic_msi_to_its(), whose result is returned as is.
 */
static struct vgic_its *vgic_get_its(struct kvm *kvm,
                                     struct kvm_kernel_irq_routing_entry *irq_entry)
{
        struct kvm_msi msi = {
                .address_lo        = irq_entry->msi.address_lo,
                .address_hi        = irq_entry->msi.address_hi,
                .data                = irq_entry->msi.data,
                .flags                = irq_entry->msi.flags,
                .devid                = irq_entry->msi.devid,
        };

        return vgic_msi_to_its(kvm, &msi);
}

/*
 * Upgrade the guest LPI described by @irq_entry to a VLPI backed by the
 * host interrupt @virq.
 *
 * @kvm:        the VM owning the LPI
 * @virq:        host interrupt handed to its_map_vlpi() and recorded as
 *                the LPI's host_irq
 * @irq_entry:        MSI routing entry used to find the vITS and the LPI
 *
 * Returns 0 when there is nothing to do: direct MSIs not supported, no
 * matching vITS, failed DevID/EventID translation, or LPI already mapped.
 * Returns the its_map_vlpi() error if the mapping fails. If a pending
 * state had to be transferred, the result of that transfer is returned.
 */
int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq,
                               struct kvm_kernel_irq_routing_entry *irq_entry)
{
        struct vgic_its *its;
        struct vgic_irq *irq;
        struct its_vlpi_map map;
        unsigned long flags;
        int ret = 0;

        if (!vgic_supports_direct_msis(kvm))
                return 0;

        /*
         * Get the ITS, and escape early on error (not a valid
         * doorbell for any of our vITSs).
         */
        its = vgic_get_its(kvm, irq_entry);
        if (IS_ERR(its))
                return 0;

        mutex_lock(&its->its_lock);

        /*
         * Perform the actual DevID/EventID -> LPI translation.
         *
         * Silently exit if translation fails as the guest (or userspace!) has
         * managed to do something stupid. Emulated LPI injection will still
         * work if the guest figures itself out at a later time.
         */
        if (vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
                                 irq_entry->msi.data, &irq))
                goto out;

        /* Silently exit if the vLPI is already mapped */
        if (irq->hw)
                goto out;

        /*
         * Emit the mapping request. If it fails, the ITS probably
         * isn't v4 compatible, so let's silently bail out. Holding
         * the ITS lock should ensure that nothing can modify the
         * target vcpu.
         */
        map = (struct its_vlpi_map) {
                .vm                = &kvm->arch.vgic.its_vm,
                .vpe                = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe,
                .vintid                = irq->intid,
                .properties        = ((irq->priority & 0xfc) |
                                   (irq->enabled ? LPI_PROP_ENABLED : 0) |
                                   LPI_PROP_GROUP1),
                .db_enabled        = true,
        };

        ret = its_map_vlpi(virq, &map);
        if (ret)
                goto out;

        /* The mapping succeeded: the LPI is now backed by @virq. */
        irq->hw                = true;
        irq->host_irq        = virq;
        atomic_inc(&map.vpe->vlpi_count);

        /* Transfer pending state */
        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        if (irq->pending_latch) {
                /* NOTE: this overwrites ret, so a failure here is returned. */
                ret = irq_set_irqchip_state(irq->host_irq,
                                            IRQCHIP_STATE_PENDING,
                                            irq->pending_latch);
                WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq);

                /*
                 * Clear pending_latch and communicate this state
                 * change via vgic_queue_irq_unlock.
                 */
                irq->pending_latch = false;
                vgic_queue_irq_unlock(kvm, irq, flags);
        } else {
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
        }

out:
        mutex_unlock(&its->its_lock);
        return ret;
}

/*
 * Tear down the direct forwarding of the MSI described by @irq_entry,
 * which was backed by the host interrupt @virq.
 *
 * Returns 0 when there is nothing to undo (no direct MSI support, or the
 * routing entry does not resolve to one of our vITSs). Otherwise returns
 * the result of the LPI lookup or, for a mapped vLPI, of its_unmap_vlpi().
 */
int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq,
                                 struct kvm_kernel_irq_routing_entry *irq_entry)
{
        struct vgic_irq *irq;
        struct vgic_its *its;
        int ret;

        if (!vgic_supports_direct_msis(kvm))
                return 0;

        /* Not a valid doorbell for any of our vITSs: escape early. */
        its = vgic_get_its(kvm, irq_entry);
        if (IS_ERR(its))
                return 0;

        mutex_lock(&its->its_lock);

        ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid,
                                   irq_entry->msi.data, &irq);
        if (!ret) {
                /* A mapped vLPI is expected to be backed by @virq. */
                WARN_ON(irq->hw && irq->host_irq != virq);

                if (irq->hw) {
                        atomic_dec(&irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count);
                        irq->hw = false;
                        ret = its_unmap_vlpi(virq);
                }
        }

        mutex_unlock(&its->its_lock);
        return ret;
}



















   26 

   26 
   26 




































   20 







































    6 





















    2 







    9 














    9 




    1 

















    4 

    1 

    3 











   26 

    9 

   17 






































    8 

    2 

    6 



    3 

















    2 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Fault injection for both 32 and 64bit guests.
 *
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Based on arch/arm/kvm/emulate.c
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_nested.h>
#include <asm/esr.h>

static void pend_sync_exception(struct kvm_vcpu *vcpu)
{
        /* If not nesting, EL1 is the only possible exception target */
        if (likely(!vcpu_has_nv(vcpu))) {
                kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
                return;
        }

        /*
         * With NV, we need to pick between EL1 and EL2. Note that we
         * never deal with a nesting exception here, hence never
         * changing context, and the exception itself can be delayed
         * until the next entry.
         */
        switch(*vcpu_cpsr(vcpu) & PSR_MODE_MASK) {
        case PSR_MODE_EL2h:
        case PSR_MODE_EL2t:
                kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
                break;
        case PSR_MODE_EL1h:
        case PSR_MODE_EL1t:
                kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
                break;
        case PSR_MODE_EL0t:
                if (vcpu_el2_tge_is_set(vcpu))
                        kvm_pend_exception(vcpu, EXCEPT_AA64_EL2_SYNC);
                else
                        kvm_pend_exception(vcpu, EXCEPT_AA64_EL1_SYNC);
                break;
        default:
                BUG();
        }
}

/* Tell whether the exception currently flagged on @vcpu equals @target. */
static bool match_target_el(struct kvm_vcpu *vcpu, unsigned long target)
{
        unsigned long pending = vcpu_get_flag(vcpu, EXCEPT_MASK);

        return pending == target;
}

/*
 * Pend a synchronous external abort on an AArch64 exception level and
 * fill in the matching FAR/ESR pair (EL1 or EL2, depending on the target
 * selected by pend_sync_exception()).
 *
 * @is_iabt selects an instruction abort rather than a data abort, and
 * @addr is the address written to the FAR.
 */
static void inject_abt64(struct kvm_vcpu *vcpu, bool is_iabt, unsigned long addr)
{
        unsigned long cpsr = *vcpu_cpsr(vcpu);
        bool is_aarch32 = vcpu_mode_is_32bit(vcpu);
        u64 ec, esr;

        pend_sync_exception(vcpu);

        /*
         * Here, the guest runs in AArch64 mode when in EL1. If we get
         * an AArch32 fault, it means we managed to trap an EL0 fault,
         * so use the "lower EL" class in that case as well as for EL0t.
         */
        if (is_aarch32 || (cpsr & PSR_MODE_MASK) == PSR_MODE_EL0t)
                ec = ESR_ELx_EC_IABT_LOW;
        else
                ec = ESR_ELx_EC_IABT_CUR;

        /* A data abort class is obtained by ORing DABT_LOW on top */
        if (!is_iabt)
                ec |= ESR_ELx_EC_DABT_LOW;

        /* Report an external synchronous abort */
        esr = (ec << ESR_ELx_EC_SHIFT) | ESR_ELx_FSC_EXTABT;

        /* Carry over the instruction length of the trapping instruction */
        if (kvm_vcpu_trap_il_is32bit(vcpu))
                esr |= ESR_ELx_IL;

        if (!match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) {
                vcpu_write_sys_reg(vcpu, addr, FAR_EL2);
                vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
                return;
        }

        vcpu_write_sys_reg(vcpu, addr, FAR_EL1);
        vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
}

/*
 * Pend an "unknown reason" synchronous exception on an AArch64 exception
 * level and write its syndrome to ESR_EL1 or ESR_EL2, depending on the
 * target selected by pend_sync_exception().
 */
static void inject_undef64(struct kvm_vcpu *vcpu)
{
        u64 esr;

        pend_sync_exception(vcpu);

        /* Unknown exception class, plus IL for a 32bit-wide instruction */
        esr = (ESR_ELx_EC_UNKNOWN << ESR_ELx_EC_SHIFT);
        if (kvm_vcpu_trap_il_is32bit(vcpu))
                esr |= ESR_ELx_IL;

        if (!match_target_el(vcpu, unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC))) {
                vcpu_write_sys_reg(vcpu, esr, ESR_EL2);
                return;
        }

        vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
}

/* Fault status values and control bits used when injecting AArch32 aborts. */

/* External abort status code, combined with DFSR_LPAE when TTBCR_EAE is set */
#define DFSR_FSC_EXTABT_LPAE        0x10
/* External abort status code used when TTBCR_EAE is clear */
#define DFSR_FSC_EXTABT_nLPAE        0x08
/* Flag ORed into the fault status when TTBCR_EAE is set */
#define DFSR_LPAE                BIT(9)
/* Bit tested in the guest's TCR_EL1 to choose between the two encodings */
#define TTBCR_EAE                BIT(31)

/* Pend the AArch32 undefined exception (EXCEPT_AA32_UND) on @vcpu. */
static void inject_undef32(struct kvm_vcpu *vcpu)
{
        kvm_pend_exception(vcpu, EXCEPT_AA32_UND);
}

/*
 * Pend an AArch32 prefetch (@is_pabt) or data abort on @vcpu, reporting
 * an external abort at @addr.
 *
 * Modelled after the TakeDataAbortException() and
 * TakePrefetchAbortException() pseudocode.
 */
static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, u32 addr)
{
        bool lpae = vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE;
        u64 far = vcpu_read_sys_reg(vcpu, FAR_EL1);
        u32 fsr;

        /*
         * Give the guest an IMPLEMENTATION DEFINED exception. In the
         * non-LPAE case there is no need to shuffle FS[4] into DFSR[10]
         * as it's 0.
         */
        fsr = lpae ? (DFSR_LPAE | DFSR_FSC_EXTABT_LPAE) : DFSR_FSC_EXTABT_nLPAE;

        if (is_pabt) {
                kvm_pend_exception(vcpu, EXCEPT_AA32_IABT);
                /* Prefetch aborts report the address in FAR_EL1[63:32] */
                far = (far & GENMASK(31, 0)) | ((u64)addr << 32);
                vcpu_write_sys_reg(vcpu, fsr, IFSR32_EL2);
        } else {
                kvm_pend_exception(vcpu, EXCEPT_AA32_DABT);
                /* Data aborts report the address in FAR_EL1[31:0] */
                far = (far & GENMASK(63, 32)) | addr;
                vcpu_write_sys_reg(vcpu, fsr, ESR_EL1);
        }

        vcpu_write_sys_reg(vcpu, far, FAR_EL1);
}

/**
 * kvm_inject_dabt - inject a data abort into the guest
 * @vcpu: The VCPU to receive the data abort
 * @addr: The faulting address to report to the guest (lower half of
 *        FAR_EL1, i.e. the DFAR, for a 32bit EL1; FAR_ELx otherwise)
 *
 * It is assumed that this code is called from the VCPU thread and that the
 * VCPU therefore is not currently executing guest code.
 */
void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
        if (!vcpu_el1_is_32bit(vcpu)) {
                inject_abt64(vcpu, false, addr);
                return;
        }

        inject_abt32(vcpu, false, addr);
}

/**
 * kvm_inject_pabt - inject a prefetch abort into the guest
 * @vcpu: The VCPU to receive the prefetch abort
 * @addr: The faulting address to report to the guest (upper half of
 *        FAR_EL1, i.e. the IFAR, for a 32bit EL1; FAR_ELx otherwise)
 *
 * It is assumed that this code is called from the VCPU thread and that the
 * VCPU therefore is not currently executing guest code.
 */
void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr)
{
        if (!vcpu_el1_is_32bit(vcpu)) {
                inject_abt64(vcpu, true, addr);
                return;
        }

        inject_abt32(vcpu, true, addr);
}

/*
 * Inject an abort describing an Address Size Fault for the access that
 * trapped on @vcpu. The abort kind (prefetch or data) follows the trapped
 * access, and the reported address is built from the fault IPA and HFAR.
 *
 * NOTE(review): the fix-up below always edits ESR_EL1, while the injection
 * helpers may have written ESR_EL2 or IFSR32_EL2 instead - confirm this is
 * the intended behaviour for those cases.
 */
void kvm_inject_size_fault(struct kvm_vcpu *vcpu)
{
        unsigned long fault_addr, esr;

        /* Fault IPA, ORed with the low 12 bits of HFAR */
        fault_addr  = kvm_vcpu_get_fault_ipa(vcpu);
        fault_addr |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

        if (kvm_vcpu_trap_is_iabt(vcpu))
                kvm_inject_pabt(vcpu, fault_addr);
        else
                kvm_inject_dabt(vcpu, fault_addr);

        /*
         * If AArch64 or LPAE, set FSC to 0 to indicate an Address
         * Size Fault at level 0, as if exceeding PARange.
         *
         * Non-LPAE guests only get the external abort injected above,
         * as there is no way to describe the ASF.
         */
        if (!vcpu_el1_is_32bit(vcpu) ||
            (vcpu_read_sys_reg(vcpu, TCR_EL1) & TTBCR_EAE)) {
                esr = vcpu_read_sys_reg(vcpu, ESR_EL1);
                esr &= ~GENMASK_ULL(5, 0);
                vcpu_write_sys_reg(vcpu, esr, ESR_EL1);
        }
}

/**
 * kvm_inject_undefined - inject an undefined instruction into the guest
 * @vcpu: The vCPU in which to inject the exception
 *
 * Dispatches to the AArch32 or AArch64 flavour depending on the width
 * of the guest's EL1.
 *
 * It is assumed that this code is called from the VCPU thread and that the
 * VCPU therefore is not currently executing guest code.
 */
void kvm_inject_undefined(struct kvm_vcpu *vcpu)
{
        if (!vcpu_el1_is_32bit(vcpu)) {
                inject_undef64(vcpu);
                return;
        }

        inject_undef32(vcpu);
}

/**
 * kvm_set_sei_esr - record an SError syndrome and flag the SError via HCR_VSE
 * @vcpu: The vCPU targeted by the SError
 * @esr: Syndrome value; only the bits covered by ESR_ELx_ISS_MASK are kept
 *
 * The masked syndrome is handed to vcpu_set_vsesr(), and HCR_VSE is then
 * set in the vCPU's HCR value.
 */
void kvm_set_sei_esr(struct kvm_vcpu *vcpu, u64 esr)
{
        vcpu_set_vsesr(vcpu, esr & ESR_ELx_ISS_MASK);
        *vcpu_hcr(vcpu) |= HCR_VSE;
}

/**
 * kvm_inject_vabt - inject an async abort / SError into the guest
 * @vcpu: The VCPU to receive the exception
 *
 * It is assumed that this code is called from the VCPU thread and that the
 * VCPU therefore is not currently executing guest code.
 *
 * Systems with the RAS Extensions specify an imp-def ESR (ISV/IDS = 1) with
 * the remaining ISS all-zeros so that this error is not interpreted as an
 * uncategorized RAS error. Without the RAS Extensions we can't specify an ESR
 * value, so the CPU generates an imp-def value.
 *
 * The syndrome is installed through kvm_set_sei_esr(), which also sets
 * HCR_VSE for the vCPU.
 */
void kvm_inject_vabt(struct kvm_vcpu *vcpu)
{
        /* Only the ISV bit is set; every other ISS bit stays zero */
        kvm_set_sei_esr(vcpu, ESR_ELx_ISV);
}

























    3 







































    3 






































































































































    3 








    3 





    1 







    3 

    1 

    3 

    3 





    2 



    1 



    3 









    3 







  166 
















  108 

   58 
































  166 




  129 


   45 
   84 





   45 

   84 








  129 











   46 











   46 




















   59 











   59 















































































   45 








   46 
   46 
    1 
   45 



   59 





    1 
   58 
   59 














































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012-2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#include <hyp/adjust_pc.h>

#include <linux/compiler.h>
#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm_host.h>

#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>

/* Helpers extracting fields from an ICH_VTR_EL2 value @v. */

/* Low four bits of @v: index of the last list register */
#define vtr_to_max_lr_idx(v)                ((v) & 0xf)
/* Three-bit field at bits [28:26] of @v, plus one: number of preemption bits */
#define vtr_to_nr_pre_bits(v)                ((((u32)(v) >> 26) & 7) + 1)
/* 1 << (preemption bits - 5); presumably the APR register count - confirm */
#define vtr_to_nr_apr_regs(v)                (1 << (vtr_to_nr_pre_bits(v) - 5))

/*
 * Read list register number @lr. Only the low four bits of @lr are used
 * to select the register, so every value maps onto one of
 * ICH_LR0_EL2..ICH_LR15_EL2.
 */
u64 __gic_v3_get_lr(unsigned int lr)
{
        u64 val;

        switch (lr & 0xf) {
        case 0:
                val = read_gicreg(ICH_LR0_EL2);
                break;
        case 1:
                val = read_gicreg(ICH_LR1_EL2);
                break;
        case 2:
                val = read_gicreg(ICH_LR2_EL2);
                break;
        case 3:
                val = read_gicreg(ICH_LR3_EL2);
                break;
        case 4:
                val = read_gicreg(ICH_LR4_EL2);
                break;
        case 5:
                val = read_gicreg(ICH_LR5_EL2);
                break;
        case 6:
                val = read_gicreg(ICH_LR6_EL2);
                break;
        case 7:
                val = read_gicreg(ICH_LR7_EL2);
                break;
        case 8:
                val = read_gicreg(ICH_LR8_EL2);
                break;
        case 9:
                val = read_gicreg(ICH_LR9_EL2);
                break;
        case 10:
                val = read_gicreg(ICH_LR10_EL2);
                break;
        case 11:
                val = read_gicreg(ICH_LR11_EL2);
                break;
        case 12:
                val = read_gicreg(ICH_LR12_EL2);
                break;
        case 13:
                val = read_gicreg(ICH_LR13_EL2);
                break;
        case 14:
                val = read_gicreg(ICH_LR14_EL2);
                break;
        case 15:
                val = read_gicreg(ICH_LR15_EL2);
                break;
        default:
                /* The 4-bit mask above leaves no other value */
                unreachable();
        }

        return val;
}

/*
 * Write @val to list register number @lr. Only the low four bits of @lr
 * are used to select one of ICH_LR0_EL2..ICH_LR15_EL2.
 */
static void __gic_v3_set_lr(u64 val, int lr)
{
        switch (lr & 0xf) {
        case 0:
                write_gicreg(val, ICH_LR0_EL2);
                return;
        case 1:
                write_gicreg(val, ICH_LR1_EL2);
                return;
        case 2:
                write_gicreg(val, ICH_LR2_EL2);
                return;
        case 3:
                write_gicreg(val, ICH_LR3_EL2);
                return;
        case 4:
                write_gicreg(val, ICH_LR4_EL2);
                return;
        case 5:
                write_gicreg(val, ICH_LR5_EL2);
                return;
        case 6:
                write_gicreg(val, ICH_LR6_EL2);
                return;
        case 7:
                write_gicreg(val, ICH_LR7_EL2);
                return;
        case 8:
                write_gicreg(val, ICH_LR8_EL2);
                return;
        case 9:
                write_gicreg(val, ICH_LR9_EL2);
                return;
        case 10:
                write_gicreg(val, ICH_LR10_EL2);
                return;
        case 11:
                write_gicreg(val, ICH_LR11_EL2);
                return;
        case 12:
                write_gicreg(val, ICH_LR12_EL2);
                return;
        case 13:
                write_gicreg(val, ICH_LR13_EL2);
                return;
        case 14:
                write_gicreg(val, ICH_LR14_EL2);
                return;
        case 15:
                write_gicreg(val, ICH_LR15_EL2);
                return;
        }
}

/*
 * Write @val to ICH_AP0R<n>_EL2 for @n in 0..3. Any other @n results in
 * no register being written.
 */
static void __vgic_v3_write_ap0rn(u32 val, int n)
{
        if (n == 0)
                write_gicreg(val, ICH_AP0R0_EL2);
        else if (n == 1)
                write_gicreg(val, ICH_AP0R1_EL2);
        else if (n == 2)
                write_gicreg(val, ICH_AP0R2_EL2);
        else if (n == 3)
                write_gicreg(val, ICH_AP0R3_EL2);
}

/*
 * Write @val to ICH_AP1R<n>_EL2 for @n in 0..3. Any other @n results in
 * no register being written.
 */
static void __vgic_v3_write_ap1rn(u32 val, int n)
{
        if (n == 0)
                write_gicreg(val, ICH_AP1R0_EL2);
        else if (n == 1)
                write_gicreg(val, ICH_AP1R1_EL2);
        else if (n == 2)
                write_gicreg(val, ICH_AP1R2_EL2);
        else if (n == 3)
                write_gicreg(val, ICH_AP1R3_EL2);
}

/*
 * Read ICH_AP0R<n>_EL2 for @n in 0..3. Callers must not pass any other
 * value: that path is marked unreachable.
 */
static u32 __vgic_v3_read_ap0rn(int n)
{
        switch (n) {
        case 0:
                return read_gicreg(ICH_AP0R0_EL2);
        case 1:
                return read_gicreg(ICH_AP0R1_EL2);
        case 2:
                return read_gicreg(ICH_AP0R2_EL2);
        case 3:
                return read_gicreg(ICH_AP0R3_EL2);
        default:
                unreachable();
        }
}

/*
 * Read ICH_AP1R<n>_EL2 for @n in 0..3. Callers must not pass any other
 * value: that path is marked unreachable.
 */
static u32 __vgic_v3_read_ap1rn(int n)
{
        switch (n) {
        case 0:
                return read_gicreg(ICH_AP1R0_EL2);
        case 1:
                return read_gicreg(ICH_AP1R1_EL2);
        case 2:
                return read_gicreg(ICH_AP1R2_EL2);
        case 3:
                return read_gicreg(ICH_AP1R3_EL2);
        default:
                unreachable();
        }
}

/*
 * Save the list registers (LRs) of the GIC CPU interface into @cpu_if and
 * clear them in hardware.
 *
 * Only the first cpu_if->used_lrs LRs are considered. An LR whose bit is
 * set in ICH_ELRSR_EL2 is not read back: its cached copy only has the
 * ICH_LR_STATE bits cleared. Any other LR is re-read from hardware.
 */
void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
{
        u64 used_lrs = cpu_if->used_lrs;

        /*
         * Make sure stores to the GIC via the memory mapped interface
         * are now visible to the system register interface when reading the
         * LRs, and when reading back the VMCR on non-VHE systems.
         */
        if (used_lrs || !has_vhe()) {
                if (!cpu_if->vgic_sre) {
                        dsb(sy);
                        isb();
                }
        }

        if (used_lrs || cpu_if->its_vpe.its_vm) {
                int i;
                u32 elrsr;

                /* Snapshot the per-LR status bits before rewriting ICH_HCR_EL2 */
                elrsr = read_gicreg(ICH_ELRSR_EL2);

                /* Write the cached HCR value back with its En bit cleared */
                write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2);

                for (i = 0; i < used_lrs; i++) {
                        if (elrsr & (1 << i))
                                cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
                        else
                                cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);

                        /* Leave the hardware LR zeroed once it has been saved */
                        __gic_v3_set_lr(0, i);
                }
        }
}

/*
 * Load the cached ICH_HCR_EL2 value and the first cpu_if->used_lrs list
 * registers from @cpu_if into the GIC CPU interface.
 */
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
{
        u64 used_lrs = cpu_if->used_lrs;
        int i;

        if (used_lrs || cpu_if->its_vpe.its_vm) {
                write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);

                for (i = 0; i < used_lrs; i++)
                        __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
        }

        /*
         * Ensure that writes to the LRs, and on non-VHE systems ensure that
         * the write to the VMCR in __vgic_v3_activate_traps(), will have
         * reached the (re)distributors. This ensures the guest will read the
         * correct values from the memory-mapped interface.
         */
        if (used_lrs || !has_vhe()) {
                if (!cpu_if->vgic_sre) {
                        isb();
                        dsb(sy);
                }
        }
}

/*
 * Program the GIC CPU interface before running the guest described by
 * @cpu_if: set up ICC_SRE_EL1 (and ICH_VMCR_EL2 when the vgic is enabled
 * with vgic_sre clear), clear ICC_SRE_EL2.Enable, and write ICH_HCR_EL2
 * when required.
 */
void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
{
        /*
         * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
         * Group0 interrupt (as generated in GICv2 mode) to be
         * delivered as a FIQ to the guest, with potentially fatal
         * consequences. So we must make sure that ICC_SRE_EL1 has
         * been actually programmed with the value we want before
         * starting to mess with the rest of the GIC, and VMCR_EL2 in
         * particular.  This logic must be called before
         * __vgic_v3_restore_state().
         *
         * However, if the vgic is disabled (ICH_HCR_EL2.EN==0), no GIC is
         * provisioned at all. In order to prevent illegal accesses to the
         * system registers to trap to EL1 (duh), force ICC_SRE_EL1.SRE to 1
         * so that the trap bits can take effect. Yes, we *loves* the GIC.
         */
        if (!(cpu_if->vgic_hcr & ICH_HCR_EL2_En)) {
                write_gicreg(ICC_SRE_EL1_SRE, ICC_SRE_EL1);
                isb();
        } else if (!cpu_if->vgic_sre) {
                write_gicreg(0, ICC_SRE_EL1);
                isb();
                write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);


                if (has_vhe()) {
                        /*
                         * Ensure that the write to the VMCR will have reached
                         * the (re)distributors. This ensures the guest will
                         * read the correct values from the memory-mapped
                         * interface.
                         */
                        isb();
                        dsb(sy);
                }
        }

        /*
         * Prevent the guest from touching the ICC_SRE_EL1 system
         * register. Note that this may not have any effect, as
         * ICC_SRE_EL2.Enable being RAO/WI is a valid implementation.
         */
        write_gicreg(read_gicreg(ICC_SRE_EL2) & ~ICC_SRE_EL2_ENABLE,
                     ICC_SRE_EL2);

        /*
         * If we need to trap system registers, we must write
         * ICH_HCR_EL2 anyway, even if no interrupts are being
         * injected. Note that this also applies if we don't expect
         * any system register access (no vgic at all).
         */
        if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
            cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
                write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
}

/*
 * Counterpart of __vgic_v3_activate_traps(): save ICH_VMCR_EL2 for a guest
 * with vgic_sre clear, set ICC_SRE_EL2.Enable again, write 1 to
 * ICC_SRE_EL1 for such a guest, and zero ICH_HCR_EL2 under the same
 * condition that made the activation path write it.
 */
void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
{
        u64 val;

        /* Capture the VMCR value before ICC_SRE_EL1 is changed below */
        if (!cpu_if->vgic_sre) {
                cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
        }

        val = read_gicreg(ICC_SRE_EL2);
        write_gicreg(val | ICC_SRE_EL2_ENABLE, ICC_SRE_EL2);

        if (!cpu_if->vgic_sre) {
                /* Make sure ENABLE is set at EL2 before setting SRE at EL1 */
                isb();
                write_gicreg(1, ICC_SRE_EL1);
        }

        /*
         * If we were trapping system registers, we enabled the VGIC even if
         * no interrupts were being injected, and we disable it again here.
         */
        if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
            cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
                write_gicreg(0, ICH_HCR_EL2);
}

static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
{
        u64 val;
        u32 nr_pre_bits;

        val = read_gicreg(ICH_VTR_EL2);
        nr_pre_bits = vtr_to_nr_pre_bits(val);

        switch (nr_pre_bits) {
        case 7:
                cpu_if->vgic_ap0r[3] = __vgic_v3_read_ap0rn(3);
                cpu_if->vgic_ap0r[2] = __vgic_v3_read_ap0rn(2);
                fallthrough;
        case 6:
                cpu_if->vgic_ap0r[1] = __vgic_v3_read_ap0rn(1);
                fallthrough;
        default:
                cpu_if->vgic_ap0r[0] = __vgic_v3_read_ap0rn(0);
        }

        switch (nr_pre_bits) {
        case 7:
                cpu_if->vgic_ap1r[3] = __vgic_v3_read_ap1rn(3);
                cpu_if->vgic_ap1r[2] = __vgic_v3_read_ap1rn(2);
                fallthrough;
        case 6:
                cpu_if->vgic_ap1r[1] = __vgic_v3_read_ap1rn(1);
                fallthrough;
        default:
                cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0);
        }
}

static void __vgic_v3_restore_aprs(struct vgic_v3_cpu_if *cpu_if)
{
        u64 val;
        u32 nr_pre_bits;

        val = read_gicreg(ICH_VTR_EL2);
        nr_pre_bits = vtr_to_nr_pre_bits(val);

        switch (nr_pre_bits) {
        case 7:
                __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[3], 3);
                __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[2], 2);
                fallthrough;
        case 6:
                __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[1], 1);
                fallthrough;
        default:
                __vgic_v3_write_ap0rn(cpu_if->vgic_ap0r[0], 0);
        }

        switch (nr_pre_bits) {
        case 7:
                __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[3], 3);
                __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[2], 2);
                fallthrough;
        case 6:
                __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[1], 1);
                fallthrough;
        default:
                __vgic_v3_write_ap1rn(cpu_if->vgic_ap1r[0], 0);
        }
}

/*
 * Zero every list register, from index 0 up to and including the maximum
 * LR index reported by ICH_VTR_EL2.
 */
void __vgic_v3_init_lrs(void)
{
        int nr_lrs = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2)) + 1;
        int lr;

        for (lr = 0; lr < nr_lrs; lr++)
                __gic_v3_set_lr(0, lr);
}

/*
 * Return the GIC CPU configuration:
 * - [31:0]  ICH_VTR_EL2
 * - [62:32] RES0
 * - [63]    MMIO (GICv2) capable
 *
 * The MMIO capability is probed by temporarily writing 0 to ICC_SRE_EL1
 * and checking whether the SRE bit reads back as clear. The original
 * ICC_SRE_EL1 value is restored before returning.
 */
u64 __vgic_v3_get_gic_config(void)
{
        u64 val, sre = read_gicreg(ICC_SRE_EL1);
        unsigned long flags = 0;

        /*
         * To check whether we have a MMIO-based (GICv2 compatible)
         * CPU interface, we need to disable the system register
         * view.
         *
         * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates
         * that to be able to set ICC_SRE_EL1.SRE to 0, all the
         * interrupt overrides must be set. You've got to love this.
         *
         * As we always run VHE with HCR_xMO set, no extra xMO
         * manipulation is required in that case.
         *
         * To safely disable SRE, we have to prevent any interrupt
         * from firing (which would be deadly). This only makes sense
         * on VHE, as interrupts are already masked for nVHE as part
         * of the exception entry to EL2.
         */
        if (has_vhe()) {
                flags = local_daif_save();
        } else {
                sysreg_clear_set_hcr(0, HCR_AMO | HCR_FMO | HCR_IMO);
                isb();
        }

        /* Try to turn the system register view off... */
        write_gicreg(0, ICC_SRE_EL1);
        isb();

        /* ...and read back to see whether the write took effect */
        val = read_gicreg(ICC_SRE_EL1);

        /* Put the original ICC_SRE_EL1 value back */
        write_gicreg(sre, ICC_SRE_EL1);
        isb();

        if (has_vhe()) {
                local_daif_restore(flags);
        } else {
                sysreg_clear_set_hcr(HCR_AMO | HCR_FMO | HCR_IMO, 0);
                isb();
        }

        /* SRE still set after writing 0: no MMIO interface, bit 63 stays clear */
        val  = (val & ICC_SRE_EL1_SRE) ? 0 : (1ULL << 63);
        val |= read_gicreg(ICH_VTR_EL2);

        return val;
}

/* Return the current value of ICH_VMCR_EL2. */
static u64 __vgic_v3_read_vmcr(void)
{
        return read_gicreg(ICH_VMCR_EL2);
}

/* Write @vmcr to ICH_VMCR_EL2. */
static void __vgic_v3_write_vmcr(u32 vmcr)
{
        write_gicreg(vmcr, ICH_VMCR_EL2);
}

/*
 * Save the active priority registers and, when vgic_sre is set, the
 * current ICH_VMCR_EL2 value into @cpu_if. When vgic_sre is clear,
 * ICH_VMCR_EL2 is captured by __vgic_v3_deactivate_traps() instead.
 */
void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
{
        __vgic_v3_save_aprs(cpu_if);
        if (cpu_if->vgic_sre)
                cpu_if->vgic_vmcr = __vgic_v3_read_vmcr();
}

/*
 * Restore ICH_VMCR_EL2 (only when vgic_sre is set) and the active
 * priority registers from @cpu_if.
 */
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
{
        /*
         * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen
         * is dependent on ICC_SRE_EL1.SRE, and we have to perform the
         * VMCR_EL2 save/restore in the world switch.
         */
        if (cpu_if->vgic_sre)
                __vgic_v3_write_vmcr(cpu_if->vgic_vmcr);
        __vgic_v3_restore_aprs(cpu_if);
}

static int __vgic_v3_bpr_min(void)
{
        /* See Pseudocode for VPriorityGroup */
        return 8 - vtr_to_nr_pre_bits(read_gicreg(ICH_VTR_EL2));
}

/*
 * Derive the interrupt group targeted by the trapped access from the
 * CRm field of the ESR: CRm == 8 yields 0, anything else yields 1.
 */
static int __vgic_v3_get_group(struct kvm_vcpu *vcpu)
{
        u64 crm_field = kvm_vcpu_get_esr(vcpu) & ESR_ELx_SYS64_ISS_CRM_MASK;
        u8 crm = crm_field >> ESR_ELx_SYS64_ISS_CRM_SHIFT;

        if (crm == 8)
                return 0;

        return 1;
}

/*
 * Priority value used in this file as the "nothing pending/active"
 * starting point: any real priority compares numerically lower.
 */
#define GICv3_IDLE_PRIORITY        0xff

/*
 * Scan the in-use list registers for the pending-only interrupt with the
 * numerically lowest priority whose group is enabled in @vmcr.
 *
 * On success the LR index is returned and its value stored in @lr_val.
 * If no candidate exists, -1 is returned and @lr_val is set to
 * ICC_IAR1_EL1_SPURIOUS. On equal priorities the first LR wins.
 */
static int __vgic_v3_highest_priority_lr(struct kvm_vcpu *vcpu, u32 vmcr,
                                         u64 *lr_val)
{
        unsigned int nr_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
        u8 best_prio = GICv3_IDLE_PRIORITY;
        int best = -1;
        int idx;

        for (idx = 0; idx < nr_lrs; idx++) {
                u64 cur = __gic_v3_get_lr(idx);
                u8 cur_prio = (cur & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;

                /* Only LRs whose state is exactly "pending" qualify */
                if ((cur & ICH_LR_STATE) != ICH_LR_PENDING_BIT)
                        continue;

                /* The interrupt's group must be enabled in the VMCR */
                if (cur & ICH_LR_GROUP) {
                        if (!(vmcr & ICH_VMCR_ENG1_MASK))
                                continue;
                } else if (!(vmcr & ICH_VMCR_ENG0_MASK)) {
                        continue;
                }

                /* Keep it only if strictly better than what we have */
                if (cur_prio < best_prio) {
                        best_prio = cur_prio;
                        *lr_val = cur;
                        best = idx;
                }
        }

        if (best < 0)
                *lr_val = ICC_IAR1_EL1_SPURIOUS;

        return best;
}

/*
 * Find the first in-use list register holding virtual INTID @intid with
 * its active bit set. Returns the LR index and stores the LR value in
 * @lr_val; otherwise returns -1 and sets @lr_val to
 * ICC_IAR1_EL1_SPURIOUS.
 */
static int __vgic_v3_find_active_lr(struct kvm_vcpu *vcpu, int intid,
                                    u64 *lr_val)
{
        unsigned int nr_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
        int idx;

        for (idx = 0; idx < nr_lrs; idx++) {
                u64 cur = __gic_v3_get_lr(idx);

                if (!(cur & ICH_LR_ACTIVE_BIT))
                        continue;

                if ((cur & ICH_LR_VIRTUAL_ID_MASK) != intid)
                        continue;

                *lr_val = cur;
                return idx;
        }

        *lr_val = ICC_IAR1_EL1_SPURIOUS;
        return -1;
}

/*
 * Return the highest active priority recorded in the AP0Rn/AP1Rn
 * registers, rescaled by the minimum BPR, or GICv3_IDLE_PRIORITY when
 * no bit is set in any of them.
 */
static int __vgic_v3_get_highest_active_priority(void)
{
        u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
        u32 base = 0;
        int n;

        /* Each APR register covers 32 priority levels */
        for (n = 0; n < nr_apr_regs; n++, base += 32) {
                u32 bits;

                /*
                 * The ICH_AP0Rn_EL2 and ICH_AP1Rn_EL2 registers
                 * contain the active priority levels for this VCPU
                 * for the maximum number of supported priority
                 * levels, and we return the full priority level only
                 * if the BPR is programmed to its minimum, otherwise
                 * we return a combination of the priority level and
                 * subpriority, as determined by the setting of the
                 * BPR, but without the full subpriority.
                 */
                bits  = __vgic_v3_read_ap0rn(n);
                bits |= __vgic_v3_read_ap1rn(n);

                /* The lowest set bit is the highest active priority */
                if (bits)
                        return (base + __ffs(bits)) << __vgic_v3_bpr_min();
        }

        return GICv3_IDLE_PRIORITY;
}

/* Extract the BPR0 field from a VMCR value. */
static unsigned int __vgic_v3_get_bpr0(u32 vmcr)
{
        u32 field = vmcr & ICH_VMCR_BPR0_MASK;

        return field >> ICH_VMCR_BPR0_SHIFT;
}

/*
 * Return the effective BPR1 for a VMCR value. With CBPR clear this is
 * the BPR1 field itself; with CBPR set it is BPR0 + 1, capped at 7.
 */
static unsigned int __vgic_v3_get_bpr1(u32 vmcr)
{
        unsigned int bpr0;

        if (!(vmcr & ICH_VMCR_CBPR_MASK))
                return (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;

        bpr0 = __vgic_v3_get_bpr0(vmcr);

        return bpr0 < 7 ? bpr0 + 1 : bpr0;
}

/*
 * Turn a priority into a preemption level: the sub-priority bits, as
 * selected by the BPR that applies to group @grp (BPR0 + 1 for group 0,
 * the effective BPR1 otherwise), are masked out.
 */
static u8 __vgic_v3_pri_to_pre(u8 pri, u32 vmcr, int grp)
{
        unsigned int bpr = grp ? __vgic_v3_get_bpr1(vmcr)
                               : __vgic_v3_get_bpr0(vmcr) + 1;

        return pri & (GENMASK(7, 0) << bpr);
}

/*
 * Record priority @pri as active for group @grp in the matching
 * AP0Rn/AP1Rn register.
 *
 * The priority value is independent of any of the BPR values, so it is
 * normalized using the minimal BPR value. Whatever the guest does with
 * its BPR, the same priority therefore always maps to the same bit.
 */
static void __vgic_v3_set_active_priority(u8 pri, u32 vmcr, int grp)
{
        u8 pre = __vgic_v3_pri_to_pre(pri, vmcr, grp);
        u8 ap = pre >> __vgic_v3_bpr_min();
        int reg = ap / 32;      /* 32 priority bits per APR register */
        u32 cur;

        if (grp) {
                cur = __vgic_v3_read_ap1rn(reg);
                __vgic_v3_write_ap1rn(cur | BIT(ap % 32), reg);
                return;
        }

        cur = __vgic_v3_read_ap0rn(reg);
        __vgic_v3_write_ap0rn(cur | BIT(ap % 32), reg);
}

/*
 * Clear the highest active priority bit found across AP0Rn/AP1Rn and
 * return that priority rescaled to 8 bits. Returns GICv3_IDLE_PRIORITY
 * if no active priority bit is set.
 */
static int __vgic_v3_clear_highest_active_priority(void)
{
        u8 nr_apr_regs = vtr_to_nr_apr_regs(read_gicreg(ICH_VTR_EL2));
        u32 base = 0;
        int n;

        /* Each APR register covers 32 priority levels */
        for (n = 0; n < nr_apr_regs; n++, base += 32) {
                u32 grp0 = __vgic_v3_read_ap0rn(n);
                u32 grp1 = __vgic_v3_read_ap1rn(n);
                int bit0 = 32, bit1 = 32;

                if (!(grp0 || grp1))
                        continue;

                if (grp0)
                        bit0 = __ffs(grp0);
                if (grp1)
                        bit1 = __ffs(grp1);

                /*
                 * Always clear the LSB, which is the highest priority.
                 * On a tie, the group-1 bit is the one that gets cleared.
                 */
                if (bit0 < bit1) {
                        __vgic_v3_write_ap0rn(grp0 & ~BIT(bit0), n);
                        base += bit0;
                } else {
                        __vgic_v3_write_ap1rn(grp1 & ~BIT(bit1), n);
                        base += bit1;
                }

                /* Rescale to 8 bits of priority */
                return base << __vgic_v3_bpr_min();
        }

        return GICv3_IDLE_PRIORITY;
}

/*
 * Emulate a guest read of ICC_IAR0_EL1/ICC_IAR1_EL1 (interrupt
 * acknowledge).
 *
 * The highest priority pending LR is acknowledged only if its group
 * matches the accessed register, its priority is strictly below the PMR
 * and its preemption level is strictly below the highest active
 * priority. In that case the LR becomes active, the active priority is
 * recorded and the virtual INTID is written to guest register @rt.
 * In every other case @rt receives ICC_IAR1_EL1_SPURIOUS.
 */
static void __vgic_v3_read_iar(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u64 lr_val;
        u8 lr_prio, pmr;
        int lr, grp;

        grp = __vgic_v3_get_group(vcpu);

        lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);
        if (lr < 0)
                goto spurious;

        /* The best candidate belongs to the other group */
        if (grp != !!(lr_val & ICH_LR_GROUP))
                goto spurious;

        /* Masked by the priority mask register */
        pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
        lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;
        if (pmr <= lr_prio)
                goto spurious;

        /* Cannot preempt what is already active */
        if (__vgic_v3_get_highest_active_priority() <= __vgic_v3_pri_to_pre(lr_prio, vmcr, grp))
                goto spurious;

        /* Pending -> active, then record the new active priority */
        lr_val &= ~ICH_LR_STATE;
        lr_val |= ICH_LR_ACTIVE_BIT;
        __gic_v3_set_lr(lr_val, lr);
        __vgic_v3_set_active_priority(lr_prio, vmcr, grp);
        vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
        return;

spurious:
        vcpu_set_reg(vcpu, rt, ICC_IAR1_EL1_SPURIOUS);
}

/*
 * Deactivate the interrupt held in list register @lr: clear the active
 * bit in @lr_val and write it back. If the LR has ICH_LR_HW set, the
 * physical INTID it carries is also passed to gic_write_dir() first.
 */
static void __vgic_v3_clear_active_lr(int lr, u64 lr_val)
{
        u64 updated = lr_val & ~ICH_LR_ACTIVE_BIT;

        if (updated & ICH_LR_HW) {
                u32 phys_id = (updated & ICH_LR_PHYS_ID_MASK) >> ICH_LR_PHYS_ID_SHIFT;

                gic_write_dir(phys_id);
        }

        __gic_v3_set_lr(updated, lr);
}

/* Increment the EOIcount field of ICH_HCR_EL2 by one. */
static void __vgic_v3_bump_eoicount(void)
{
        u32 hcr = read_gicreg(ICH_HCR_EL2);

        write_gicreg(hcr + (1 << ICH_HCR_EL2_EOIcount_SHIFT), ICH_HCR_EL2);
}

/*
 * Emulate a guest write to ICC_DIR_EL1 (deactivate interrupt), the
 * INTID being taken from guest register @rt.
 */
static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 vid = vcpu_get_reg(vcpu, rt);
        u64 lr_val;
        int lr;

        /*
         * Nothing to be done when EOImode == 0, and no deactivate is
         * performed on an LPI either.
         */
        if (!(vmcr & ICH_VMCR_EOIM_MASK) || vid >= VGIC_MIN_LPI)
                return;

        lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
        if (lr != -1) {
                __vgic_v3_clear_active_lr(lr, lr_val);
                return;
        }

        /* Not active in any LR: account for it in EOIcount instead */
        __vgic_v3_bump_eoicount();
}

/*
 * Emulate a guest write to ICC_EOIR0_EL1/ICC_EOIR1_EL1 (end of
 * interrupt), the INTID being taken from guest register @rt.
 *
 * The highest active priority is always dropped. The matching active LR
 * is then deactivated only if EOImode == 0 (or the INTID is an LPI) and
 * both the group and the preemption level of the LR agree with what was
 * just dropped.
 */
static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 vid = vcpu_get_reg(vcpu, rt);
        u64 lr_val;
        u8 lr_prio, act_prio;
        int lr, grp;

        grp = __vgic_v3_get_group(vcpu);

        /* Drop priority in any case */
        act_prio = __vgic_v3_clear_highest_active_priority();

        lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
        if (lr == -1) {
                /* Do not bump EOIcount for LPIs that aren't in the LRs */
                if (!(vid >= VGIC_MIN_LPI))
                        __vgic_v3_bump_eoicount();
                return;
        }

        /* EOImode == 1 and not an LPI, nothing to be done here */
        if ((vmcr & ICH_VMCR_EOIM_MASK) && !(vid >= VGIC_MIN_LPI))
                return;

        lr_prio = (lr_val & ICH_LR_PRIORITY_MASK) >> ICH_LR_PRIORITY_SHIFT;

        /* If priorities or group do not match, the guest has fscked-up. */
        if (grp != !!(lr_val & ICH_LR_GROUP) ||
            __vgic_v3_pri_to_pre(lr_prio, vmcr, grp) != act_prio)
                return;

        /* Let's now perform the deactivation */
        __vgic_v3_clear_active_lr(lr, lr_val);
}

/* Emulate a read of ICC_IGRPEN0_EL1: report the VMCR group-0 enable as 0/1. */
static void __vgic_v3_read_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        bool enabled = vmcr & ICH_VMCR_ENG0_MASK;

        vcpu_set_reg(vcpu, rt, enabled);
}

/* Emulate a read of ICC_IGRPEN1_EL1: report the VMCR group-1 enable as 0/1. */
static void __vgic_v3_read_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        bool enabled = vmcr & ICH_VMCR_ENG1_MASK;

        vcpu_set_reg(vcpu, rt, enabled);
}

/*
 * Emulate a write to ICC_IGRPEN0_EL1: bit 0 of the guest value sets or
 * clears the group-0 enable in the VMCR, which is then written back.
 */
static void __vgic_v3_write_igrpen0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        bool enable = vcpu_get_reg(vcpu, rt) & 1;

        vmcr &= ~ICH_VMCR_ENG0_MASK;
        if (enable)
                vmcr |= ICH_VMCR_ENG0_MASK;

        __vgic_v3_write_vmcr(vmcr);
}

/*
 * Emulate a write to ICC_IGRPEN1_EL1: bit 0 of the guest value sets or
 * clears the group-1 enable in the VMCR, which is then written back.
 */
static void __vgic_v3_write_igrpen1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        bool enable = vcpu_get_reg(vcpu, rt) & 1;

        vmcr &= ~ICH_VMCR_ENG1_MASK;
        if (enable)
                vmcr |= ICH_VMCR_ENG1_MASK;

        __vgic_v3_write_vmcr(vmcr);
}

/* Emulate a read of ICC_BPR0_EL1 from the VMCR's BPR0 field. */
static void __vgic_v3_read_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        unsigned int bpr0 = __vgic_v3_get_bpr0(vmcr);

        vcpu_set_reg(vcpu, rt, bpr0);
}

/* Emulate a read of ICC_BPR1_EL1, returning the effective BPR1. */
static void __vgic_v3_read_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        unsigned int bpr1 = __vgic_v3_get_bpr1(vmcr);

        vcpu_set_reg(vcpu, rt, bpr1);
}

/*
 * Emulate a write to ICC_BPR0_EL1. The guest value is raised to the
 * lowest permitted BPR0 (minimum BPR - 1) if it is below it, then
 * installed in the VMCR's BPR0 field.
 */
static void __vgic_v3_write_bpr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u64 bpr = vcpu_get_reg(vcpu, rt);
        u8 lowest = __vgic_v3_bpr_min() - 1;
        u64 field;

        /* Enforce BPR limiting */
        if (bpr < lowest)
                bpr = lowest;

        field = (bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
        vmcr = (vmcr & ~ICH_VMCR_BPR0_MASK) | field;

        __vgic_v3_write_vmcr(vmcr);
}

/*
 * Emulate a write to ICC_BPR1_EL1. The write is ignored while CBPR is
 * set in the VMCR. Otherwise the guest value is raised to the minimum
 * BPR if it is below it, then installed in the VMCR's BPR1 field.
 */
static void __vgic_v3_write_bpr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u64 bpr = vcpu_get_reg(vcpu, rt);
        u8 lowest = __vgic_v3_bpr_min();
        u64 field;

        if (vmcr & ICH_VMCR_CBPR_MASK)
                return;

        /* Enforce BPR limiting */
        if (bpr < lowest)
                bpr = lowest;

        field = (bpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
        vmcr = (vmcr & ~ICH_VMCR_BPR1_MASK) | field;

        __vgic_v3_write_vmcr(vmcr);
}

/*
 * Read active priority register @n of the group targeted by the trapped
 * access (AP0Rn for group 0, AP1Rn otherwise) into guest register @rt.
 */
static void __vgic_v3_read_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
{
        int grp = __vgic_v3_get_group(vcpu);
        u32 apr = grp ? __vgic_v3_read_ap1rn(n) : __vgic_v3_read_ap0rn(n);

        vcpu_set_reg(vcpu, rt, apr);
}

/*
 * Write guest register @rt to active priority register @n of the group
 * targeted by the trapped access (AP0Rn for group 0, AP1Rn otherwise).
 */
static void __vgic_v3_write_apxrn(struct kvm_vcpu *vcpu, int rt, int n)
{
        u32 apr = vcpu_get_reg(vcpu, rt);

        if (__vgic_v3_get_group(vcpu))
                __vgic_v3_write_ap1rn(apr, n);
        else
                __vgic_v3_write_ap0rn(apr, n);
}

/* Read handler for ICC_AP{0,1}R0_EL1; @vmcr is unused. */
static void __vgic_v3_read_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_read_apxrn(vcpu, rt, 0);
}

/* Read handler for ICC_AP{0,1}R1_EL1; @vmcr is unused. */
static void __vgic_v3_read_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_read_apxrn(vcpu, rt, 1);
}

/* Read handler for ICC_AP{0,1}R2_EL1; @vmcr is unused. */
static void __vgic_v3_read_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_read_apxrn(vcpu, rt, 2);
}

/* Read handler for ICC_AP{0,1}R3_EL1; @vmcr is unused. */
static void __vgic_v3_read_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_read_apxrn(vcpu, rt, 3);
}

/* Write handler for ICC_AP{0,1}R0_EL1; @vmcr is unused. */
static void __vgic_v3_write_apxr0(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_write_apxrn(vcpu, rt, 0);
}

/* Write handler for ICC_AP{0,1}R1_EL1; @vmcr is unused. */
static void __vgic_v3_write_apxr1(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_write_apxrn(vcpu, rt, 1);
}

/* Write handler for ICC_AP{0,1}R2_EL1; @vmcr is unused. */
static void __vgic_v3_write_apxr2(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_write_apxrn(vcpu, rt, 2);
}

/* Write handler for ICC_AP{0,1}R3_EL1; @vmcr is unused. */
static void __vgic_v3_write_apxr3(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        __vgic_v3_write_apxrn(vcpu, rt, 3);
}

/*
 * Emulate a read of ICC_HPPIR0_EL1/ICC_HPPIR1_EL1: report the INTID of
 * the highest priority pending LR if it belongs to the accessed group,
 * and the spurious INTID otherwise. No state is modified.
 */
static void __vgic_v3_read_hppir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u64 lr_val;
        int grp = __vgic_v3_get_group(vcpu);
        int lr = __vgic_v3_highest_priority_lr(vcpu, vmcr, &lr_val);

        /*
         * When no LR qualifies, lr_val has already been set to the
         * spurious value by the lookup; only a group mismatch needs
         * overriding here.
         */
        if (lr != -1 && !!(lr_val & ICH_LR_GROUP) != grp)
                lr_val = ICC_IAR1_EL1_SPURIOUS;

        vcpu_set_reg(vcpu, rt, lr_val & ICH_LR_VIRTUAL_ID_MASK);
}

/* Emulate a read of ICC_PMR_EL1 from the VMCR's PMR field. */
static void __vgic_v3_read_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 pmr = vmcr & ICH_VMCR_PMR_MASK;

        pmr >>= ICH_VMCR_PMR_SHIFT;
        vcpu_set_reg(vcpu, rt, pmr);
}

/*
 * Emulate a write to ICC_PMR_EL1: install the guest value in the VMCR's
 * PMR field and write the result to ICH_VMCR_EL2.
 */
static void __vgic_v3_write_pmr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 pmr = vcpu_get_reg(vcpu, rt);
        u32 field = (pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;

        vmcr = (vmcr & ~ICH_VMCR_PMR_MASK) | field;

        write_gicreg(vmcr, ICH_VMCR_EL2);
}

/* Emulate a read of ICC_RPR_EL1: report the highest active priority. */
static void __vgic_v3_read_rpr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 running_prio = __vgic_v3_get_highest_active_priority();

        vcpu_set_reg(vcpu, rt, running_prio);
}

/*
 * Emulate a read of ICC_CTLR_EL1. PRIbits, IDbits and A3V are taken
 * from ICH_VTR_EL2; EOImode and CBPR come from the VMCR.
 */
static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 vtr = read_gicreg(ICH_VTR_EL2);
        u32 pri_bits = (vtr >> 29) & 7;
        u32 id_bits = (vtr >> 23) & 7;
        u32 a3v = (vtr >> 21) & 1;
        u32 eoimode = (vmcr & ICH_VMCR_EOIM_MASK) >> ICH_VMCR_EOIM_SHIFT;
        u32 cbpr = (vmcr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT;
        u32 ctlr;

        ctlr = (pri_bits << ICC_CTLR_EL1_PRI_BITS_SHIFT) |
               (id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT) |
               (a3v << ICC_CTLR_EL1_A3V_SHIFT) |
               (eoimode << ICC_CTLR_EL1_EOImode_SHIFT) |
               cbpr;

        vcpu_set_reg(vcpu, rt, ctlr);
}

/*
 * Emulate a write to ICC_CTLR_EL1: only CBPR and EOImode are taken from
 * the guest value and mirrored into the VMCR, which is then written to
 * ICH_VMCR_EL2.
 */
static void __vgic_v3_write_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
        u32 ctlr = vcpu_get_reg(vcpu, rt);

        vmcr &= ~ICH_VMCR_CBPR_MASK;
        if (ctlr & ICC_CTLR_EL1_CBPR_MASK)
                vmcr |= ICH_VMCR_CBPR_MASK;

        vmcr &= ~ICH_VMCR_EOIM_MASK;
        if (ctlr & ICC_CTLR_EL1_EOImode_MASK)
                vmcr |= ICH_VMCR_EOIM_MASK;

        write_gicreg(vmcr, ICH_VMCR_EL2);
}

/*
 * Tell whether a trapped ICC_* access matches a trap that is enabled in
 * the vCPU's own ICH_HCR_EL2 / HFGRTR_EL2 / HFGWTR_EL2 register state.
 *
 * Always false when the vCPU has no nested virtualization or is
 * currently in hyp context. When this returns true, the caller in this
 * file declines to emulate the access.
 * NOTE(review): presumably so the trap can be forwarded to the guest
 * hypervisor -- confirm against the caller's caller.
 */
static bool __vgic_v3_check_trap_forwarding(struct kvm_vcpu *vcpu,
                                            u32 sysreg, bool is_read)
{
        u64 ich_hcr;

        if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu))
                return false;

        ich_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);

        switch (sysreg) {
        case SYS_ICC_IGRPEN0_EL1:
                /* Fine-grained read/write traps take precedence... */
                if (is_read &&
                    (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1))
                        return true;

                if (!is_read &&
                    (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1))
                        return true;

                /* ...then fall back to the group-0 wide trap */
                fallthrough;

        case SYS_ICC_AP0Rn_EL1(0):
        case SYS_ICC_AP0Rn_EL1(1):
        case SYS_ICC_AP0Rn_EL1(2):
        case SYS_ICC_AP0Rn_EL1(3):
        case SYS_ICC_BPR0_EL1:
        case SYS_ICC_EOIR0_EL1:
        case SYS_ICC_HPPIR0_EL1:
        case SYS_ICC_IAR0_EL1:
                return ich_hcr & ICH_HCR_EL2_TALL0;

        case SYS_ICC_IGRPEN1_EL1:
                /* Same scheme as IGRPEN0, with the group-1 wide trap */
                if (is_read &&
                    (__vcpu_sys_reg(vcpu, HFGRTR_EL2) & HFGRTR_EL2_ICC_IGRPENn_EL1))
                        return true;

                if (!is_read &&
                    (__vcpu_sys_reg(vcpu, HFGWTR_EL2) & HFGWTR_EL2_ICC_IGRPENn_EL1))
                        return true;

                fallthrough;

        case SYS_ICC_AP1Rn_EL1(0):
        case SYS_ICC_AP1Rn_EL1(1):
        case SYS_ICC_AP1Rn_EL1(2):
        case SYS_ICC_AP1Rn_EL1(3):
        case SYS_ICC_BPR1_EL1:
        case SYS_ICC_EOIR1_EL1:
        case SYS_ICC_HPPIR1_EL1:
        case SYS_ICC_IAR1_EL1:
                return ich_hcr & ICH_HCR_EL2_TALL1;

        case SYS_ICC_DIR_EL1:
                /* DIR has its own trap bit, and is also covered by TC */
                if (ich_hcr & ICH_HCR_EL2_TDIR)
                        return true;

                fallthrough;

        case SYS_ICC_RPR_EL1:
        case SYS_ICC_CTLR_EL1:
        case SYS_ICC_PMR_EL1:
                return ich_hcr & ICH_HCR_EL2_TC;

        default:
                return false;
        }
}

/*
 * Emulate a trapped guest access to a GICv3 CPU interface (ICC_*)
 * system register.
 *
 * Returns 1 when the access has been dealt with here and the trapping
 * instruction skipped, 0 when it has not (VM not using a vGICv3 model,
 * trap matching __vgic_v3_check_trap_forwarding(), register not handled
 * here, or access direction not valid for the register).
 */
int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
{
        int rt;
        u64 esr;
        u32 vmcr;
        void (*fn)(struct kvm_vcpu *, u32, int);
        bool is_read;
        u32 sysreg;

        if (kern_hyp_va(vcpu->kvm)->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
                return 0;

        esr = kvm_vcpu_get_esr(vcpu);
        if (vcpu_mode_is_32bit(vcpu)) {
                /* A 32bit access failing its condition check is just skipped */
                if (!kvm_condition_valid(vcpu)) {
                        __kvm_skip_instr(vcpu);
                        return 1;
                }

                sysreg = esr_cp15_to_sysreg(esr);
        } else {
                sysreg = esr_sys64_to_sysreg(esr);
        }

        is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ;

        if (__vgic_v3_check_trap_forwarding(vcpu, sysreg, is_read))
                return 0;

        /*
         * Pick the handler. Registers that only support one direction
         * bail out (return 0) when accessed the other way around.
         */
        switch (sysreg) {
        case SYS_ICC_IAR0_EL1:
        case SYS_ICC_IAR1_EL1:
                if (unlikely(!is_read))
                        return 0;
                fn = __vgic_v3_read_iar;
                break;
        case SYS_ICC_EOIR0_EL1:
        case SYS_ICC_EOIR1_EL1:
                if (unlikely(is_read))
                        return 0;
                fn = __vgic_v3_write_eoir;
                break;
        case SYS_ICC_IGRPEN1_EL1:
                if (is_read)
                        fn = __vgic_v3_read_igrpen1;
                else
                        fn = __vgic_v3_write_igrpen1;
                break;
        case SYS_ICC_BPR1_EL1:
                if (is_read)
                        fn = __vgic_v3_read_bpr1;
                else
                        fn = __vgic_v3_write_bpr1;
                break;
        case SYS_ICC_AP0Rn_EL1(0):
        case SYS_ICC_AP1Rn_EL1(0):
                if (is_read)
                        fn = __vgic_v3_read_apxr0;
                else
                        fn = __vgic_v3_write_apxr0;
                break;
        case SYS_ICC_AP0Rn_EL1(1):
        case SYS_ICC_AP1Rn_EL1(1):
                if (is_read)
                        fn = __vgic_v3_read_apxr1;
                else
                        fn = __vgic_v3_write_apxr1;
                break;
        case SYS_ICC_AP0Rn_EL1(2):
        case SYS_ICC_AP1Rn_EL1(2):
                if (is_read)
                        fn = __vgic_v3_read_apxr2;
                else
                        fn = __vgic_v3_write_apxr2;
                break;
        case SYS_ICC_AP0Rn_EL1(3):
        case SYS_ICC_AP1Rn_EL1(3):
                if (is_read)
                        fn = __vgic_v3_read_apxr3;
                else
                        fn = __vgic_v3_write_apxr3;
                break;
        case SYS_ICC_HPPIR0_EL1:
        case SYS_ICC_HPPIR1_EL1:
                if (unlikely(!is_read))
                        return 0;
                fn = __vgic_v3_read_hppir;
                break;
        case SYS_ICC_IGRPEN0_EL1:
                if (is_read)
                        fn = __vgic_v3_read_igrpen0;
                else
                        fn = __vgic_v3_write_igrpen0;
                break;
        case SYS_ICC_BPR0_EL1:
                if (is_read)
                        fn = __vgic_v3_read_bpr0;
                else
                        fn = __vgic_v3_write_bpr0;
                break;
        case SYS_ICC_DIR_EL1:
                if (unlikely(is_read))
                        return 0;
                fn = __vgic_v3_write_dir;
                break;
        case SYS_ICC_RPR_EL1:
                if (unlikely(!is_read))
                        return 0;
                fn = __vgic_v3_read_rpr;
                break;
        case SYS_ICC_CTLR_EL1:
                if (is_read)
                        fn = __vgic_v3_read_ctlr;
                else
                        fn = __vgic_v3_write_ctlr;
                break;
        case SYS_ICC_PMR_EL1:
                if (is_read)
                        fn = __vgic_v3_read_pmr;
                else
                        fn = __vgic_v3_write_pmr;
                break;
        default:
                return 0;
        }

        /* Every handler gets a fresh VMCR snapshot and the Rt index */
        vmcr = __vgic_v3_read_vmcr();
        rt = kvm_vcpu_sys_get_rt(vcpu);
        fn(vcpu, vmcr, rt);

        __kvm_skip_instr(vcpu);

        return 1;
}




















  494 






































  263 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/* SPDX-License-Identifier: GPL-2.0 */
/* Perform sanity checking for object sizes for uaccess.h and uio.h. */
#ifndef __LINUX_UCOPYSIZE_H__
#define __LINUX_UCOPYSIZE_H__

#include <linux/bug.h>

#ifdef CONFIG_HARDENED_USERCOPY
#include <linux/jump_label.h>
extern void __check_object_size(const void *ptr, unsigned long n,
                                        bool to_user);

DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
                           validate_usercopy_range);

/*
 * Run __check_object_size() on a user copy of @n bytes at @ptr, unless
 * @n is a compile-time constant or the validate_usercopy_range static
 * branch is off.
 */
static __always_inline void check_object_size(const void *ptr, unsigned long n,
                                              bool to_user)
{
        /* Sizes known at compile time are not checked here */
        if (__builtin_constant_p(n))
                return;

        if (static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON,
                                &validate_usercopy_range))
                __check_object_size(ptr, n, to_user);
}
#else
/* No-op variant used when CONFIG_HARDENED_USERCOPY is not set. */
static inline void check_object_size(const void *ptr, unsigned long n,
                                     bool to_user)
{ }
#endif /* CONFIG_HARDENED_USERCOPY */

extern void __compiletime_error("copy source size is too small")
__bad_copy_from(void);
extern void __compiletime_error("copy destination size is too small")
__bad_copy_to(void);

void __copy_overflow(int size, unsigned long count);

/*
 * Hand an overflowing copy (object of @size bytes, @count requested)
 * to __copy_overflow(); does nothing unless CONFIG_BUG is enabled.
 */
static inline void copy_overflow(int size, unsigned long count)
{
        if (!IS_ENABLED(CONFIG_BUG))
                return;

        __copy_overflow(size, count);
}

/*
 * Sanity-check a copy of @bytes bytes to or from the kernel object at
 * @addr. @is_source tells whether @addr is the source of the copy.
 *
 * Returns false (the copy must not proceed) when the compiler-known size
 * of the object is smaller than @bytes, or when @bytes exceeds INT_MAX.
 * Otherwise runs check_object_size() and returns true.
 */
static __always_inline __must_check bool
check_copy_size(const void *addr, size_t bytes, bool is_source)
{
        /* Object size as known to the compiler; negative when unknown */
        int sz = __builtin_object_size(addr, 0);
        if (unlikely(sz >= 0 && sz < bytes)) {
                /*
                 * A non-constant length is reported via copy_overflow().
                 * With a constant length, the __bad_copy_*() functions
                 * are declared __compiletime_error, so a call surviving
                 * to this point is meant to fail the build.
                 */
                if (!__builtin_constant_p(bytes))
                        copy_overflow(sz, bytes);
                else if (is_source)
                        __bad_copy_from();
                else
                        __bad_copy_to();
                return false;
        }
        if (WARN_ON_ONCE(bytes > INT_MAX))
                return false;
        check_object_size(addr, bytes, is_source);
        return true;
}

#endif /* __LINUX_UCOPYSIZE_H__ */





















































































































































































































































































































































































































































  202 


















    8 








    8 


    8 

























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FILELOCK_H
#define _LINUX_FILELOCK_H

#include <linux/fs.h>

/*
 * File lock flag bits. Each value is a distinct power of two, so flags
 * can be OR-ed together (see FL_CLOSE_POSIX below).
 * NOTE(review): presumably stored in file_lock_core::flc_flags -- confirm.
 */
#define FL_POSIX        1
#define FL_FLOCK        2
#define FL_DELEG        4        /* NFSv4 delegation */
#define FL_ACCESS        8        /* not trying to lock, just looking */
#define FL_EXISTS        16        /* when unlocking, test for existence */
#define FL_LEASE        32        /* lease held on this file */
#define FL_CLOSE        64        /* unlock on close */
#define FL_SLEEP        128        /* A blocking lock */
#define FL_DOWNGRADE_PENDING        256 /* Lease is being downgraded */
#define FL_UNLOCK_PENDING        512 /* Lease is being broken */
#define FL_OFDLCK        1024        /* lock is "owned" by struct file */
#define FL_LAYOUT        2048        /* outstanding pNFS layout */
#define FL_RECLAIM        4096        /* reclaiming from a reboot server */

/* Both FL_POSIX and FL_CLOSE set */
#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)

/*
 * Special return value from posix_lock_file() and vfs_lock_file() for
 * asynchronous locking.
 */
#define FILE_LOCK_DEFERRED 1

struct file_lock;
struct file_lease;

/* Filesystem callbacks attached to a struct file_lock through ->fl_ops. */
struct file_lock_operations {
        void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
        void (*fl_release_private)(struct file_lock *);
};

/* Lock manager callbacks attached to a struct file_lock through ->fl_lmops. */
struct lock_manager_operations {
        void *lm_mod_owner;
        fl_owner_t (*lm_get_owner)(fl_owner_t);
        void (*lm_put_owner)(fl_owner_t);
        void (*lm_notify)(struct file_lock *);        /* unblock callback */
        int (*lm_grant)(struct file_lock *, int);
        bool (*lm_lock_expirable)(struct file_lock *cfl);
        void (*lm_expire_lock)(void);
};

/* Lease manager callbacks attached to a struct file_lease through ->fl_lmops. */
struct lease_manager_operations {
        bool (*lm_break)(struct file_lease *);
        int (*lm_change)(struct file_lease *, int, struct list_head *);
        void (*lm_setup)(struct file_lease *, void **);
        bool (*lm_breaker_owns_lease)(struct file_lease *);
};

/*
 * Per lock manager state handed to locks_start_grace() and
 * locks_end_grace().
 */
struct lock_manager {
        struct list_head list;
        /*
         * NFSv4 and up also want opens blocked during the grace period;
         * NLM doesn't care:
         */
        bool block_opens;
};

struct net;
void locks_start_grace(struct net *, struct lock_manager *);
void locks_end_grace(struct lock_manager *);
bool locks_in_grace(struct net *);
bool opens_in_grace(struct net *);

/*
 * struct file_lock has a union that some filesystems use to track
 * their own private info. The NFS side of things is defined here:
 */
#include <linux/nfs_fs_i.h>

/*
 * struct file_lock represents a generic "file lock". It's used to represent
 * POSIX byte range locks, BSD (flock) locks, and leases. It's important to
 * note that the same struct is used to represent both a request for a lock and
 * the lock itself, but the same object is never used for both.
 *
 * FIXME: should we create a separate "struct lock_request" to help distinguish
 * these two uses?
 *
 * The various i_flctx lists are ordered by:
 *
 * 1) lock owner
 * 2) lock range start
 * 3) lock range end
 *
 * Obviously, the last two criteria only matter for POSIX locks.
 */

/*
 * State common to struct file_lock and struct file_lease, both of which
 * embed it as their first member 'c'.
 */
struct file_lock_core {
        struct file_lock_core *flc_blocker;        /* The lock that is blocking us */
        struct list_head flc_list;        /* link into file_lock_context */
        struct hlist_node flc_link;        /* node in global lists */
        struct list_head flc_blocked_requests;        /* list of requests with
                                                 * ->flc_blocker pointing here
                                                 */
        struct list_head flc_blocked_member;        /* node in
                                                 * ->flc_blocker->flc_blocked_requests
                                                 */
        fl_owner_t flc_owner;
        unsigned int flc_flags;
        unsigned char flc_type;
        pid_t flc_pid;
        int flc_link_cpu;                /* what cpu's list is this on? */
        wait_queue_head_t flc_wait;
        struct file *flc_file;
};

/*
 * A lock over the byte range [fl_start, fl_end]: the common core, the
 * range itself, the filesystem and lock manager callbacks, and a union
 * of filesystem-private data (NFS, NFSv4, AFS, Ceph).
 */
struct file_lock {
        struct file_lock_core c;
        loff_t fl_start;
        loff_t fl_end;

        const struct file_lock_operations *fl_ops;        /* Callbacks for filesystems */
        const struct lock_manager_operations *fl_lmops;        /* Callbacks for lockmanagers */
        union {
                struct nfs_lock_info        nfs_fl;
                struct nfs4_lock_info        nfs4_fl;
                struct {
                        struct list_head link;        /* link in AFS vnode's pending_locks list */
                        int state;                /* state of grant or error if -ve */
                        unsigned int        debug_id;
                } afs;
                struct {
                        struct inode *inode;
                } ceph;
        } fl_u;
} __randomize_layout;

/*
 * A lease: the common core plus lease-break bookkeeping and the lease
 * manager callbacks.
 */
struct file_lease {
        struct file_lock_core c;
        struct fasync_struct *        fl_fasync; /* for lease break notifications */
        /* for lease breaks: */
        unsigned long fl_break_time;
        unsigned long fl_downgrade_time;
        const struct lease_manager_operations *fl_lmops; /* Callbacks for lease managers */
} __randomize_layout;

/*
 * Three separate lists, one per kind (flock, POSIX, lease), plus a
 * spinlock.
 * NOTE(review): flc_lock presumably protects the three lists, and
 * entries are presumably linked through file_lock_core::flc_list --
 * confirm against fs/locks.c.
 */
struct file_lock_context {
        spinlock_t                flc_lock;
        struct list_head        flc_flock;
        struct list_head        flc_posix;
        struct list_head        flc_lease;
};

#ifdef CONFIG_FILE_LOCKING
/*
 * fcntl() lock and lease entry points. These are the real implementations,
 * available when CONFIG_FILE_LOCKING is set; the #else branch of this
 * conditional provides inline stubs with the same names.
 */
int fcntl_getlk(struct file *, unsigned int, struct flock *);
int fcntl_setlk(unsigned int, struct file *, unsigned int,
                        struct flock *);

/* The 64-bit flock variants are only declared on 32-bit builds. */
#if BITS_PER_LONG == 32
int fcntl_getlk64(struct file *, unsigned int, struct flock64 *);
int fcntl_setlk64(unsigned int, struct file *, unsigned int,
                        struct flock64 *);
#endif

int fcntl_setlease(unsigned int fd, struct file *filp, int arg);
int fcntl_getlease(struct file *filp);

/* Return true if @fl's core lock type (c.flc_type) is F_UNLCK. */
static inline bool lock_is_unlock(struct file_lock *fl)
{
        return fl->c.flc_type == F_UNLCK;
}

/* Return true if @fl's core lock type (c.flc_type) is F_RDLCK. */
static inline bool lock_is_read(struct file_lock *fl)
{
        return fl->c.flc_type == F_RDLCK;
}

/* Return true if @fl's core lock type (c.flc_type) is F_WRLCK. */
static inline bool lock_is_write(struct file_lock *fl)
{
        return fl->c.flc_type == F_WRLCK;
}

/* Wake up waiters sleeping on @fl's wait queue (c.flc_wait). */
static inline void locks_wake_up(struct file_lock *fl)
{
        wake_up(&fl->c.flc_wait);
}

static inline bool locks_can_async_lock(const struct file_operations *fops)
{
        return !fops->lock || fops->fop_flags & FOP_ASYNC_LOCK;
}

/* fs/locks.c */

/* struct file_lock lifetime, copying and byte-range lock operations */
void locks_free_lock_context(struct inode *inode);
void locks_free_lock(struct file_lock *fl);
void locks_init_lock(struct file_lock *);
struct file_lock *locks_alloc_lock(void);
void locks_copy_lock(struct file_lock *, struct file_lock *);
void locks_copy_conflock(struct file_lock *, struct file_lock *);
void locks_remove_posix(struct file *, fl_owner_t);
void locks_remove_file(struct file *);
void locks_release_private(struct file_lock *);
void posix_test_lock(struct file *, struct file_lock *);
int posix_lock_file(struct file *, struct file_lock *, struct file_lock *);
int locks_delete_block(struct file_lock *);
int vfs_test_lock(struct file *, struct file_lock *);
int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *);
int vfs_cancel_lock(struct file *filp, struct file_lock *fl);
bool vfs_inode_has_locks(struct inode *inode);
int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl);

/* struct file_lease lifetime and lease operations */
void locks_init_lease(struct file_lease *);
void locks_free_lease(struct file_lease *fl);
struct file_lease *locks_alloc_lease(void);
int __break_lease(struct inode *inode, unsigned int flags, unsigned int type);
void lease_get_mtime(struct inode *, struct timespec64 *time);
int generic_setlease(struct file *, int, struct file_lease **, void **priv);
int kernel_setlease(struct file *, int, struct file_lease **, void **);
int vfs_setlease(struct file *, int, struct file_lease **, void **);
int lease_modify(struct file_lease *, int, struct list_head *);

/* lease notifier chain registration */
struct notifier_block;
int lease_register_notifier(struct notifier_block *);
void lease_unregister_notifier(struct notifier_block *);

struct files_struct;
void show_fd_locks(struct seq_file *f,
                         struct file *filp, struct files_struct *files);
bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner);

/*
 * Return @inode's lock context pointer (inode->i_flctx), loaded with
 * acquire semantics via smp_load_acquire().
 */
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        return smp_load_acquire(&inode->i_flctx);
}

#else /* !CONFIG_FILE_LOCKING */
/*
 * Stubs used when CONFIG_FILE_LOCKING is not set: the "get" operations
 * fail with -EINVAL and the "set" lock operations fail with -EACCES.
 */
static inline int fcntl_getlk(struct file *file, unsigned int cmd,
                              struct flock __user *user)
{
        return -EINVAL;
}

static inline int fcntl_setlk(unsigned int fd, struct file *file,
                              unsigned int cmd, struct flock __user *user)
{
        return -EACCES;
}

#if BITS_PER_LONG == 32
static inline int fcntl_getlk64(struct file *file, unsigned int cmd,
                                struct flock64 *user)
{
        return -EINVAL;
}

static inline int fcntl_setlk64(unsigned int fd, struct file *file,
                                unsigned int cmd, struct flock64 *user)
{
        return -EACCES;
}
#endif
/* Leases cannot be set without file locking support. */
static inline int fcntl_setlease(unsigned int fd, struct file *filp, int arg)
{
        return -EINVAL;
}

/* Without file locking support, always reports F_UNLCK. */
static inline int fcntl_getlease(struct file *filp)
{
        return F_UNLCK;
}

/*
 * !CONFIG_FILE_LOCKING stubs: the lock-type predicates always return false
 * without looking at @fl, and locks_wake_up() does nothing.
 */
static inline bool lock_is_unlock(struct file_lock *fl)
{
        return false;
}

static inline bool lock_is_read(struct file_lock *fl)
{
        return false;
}

static inline bool lock_is_write(struct file_lock *fl)
{
        return false;
}

static inline void locks_wake_up(struct file_lock *fl)
{
}

/*
 * !CONFIG_FILE_LOCKING stubs for the void lock helpers. Each one is an
 * empty function that ignores its arguments.
 */
static inline void locks_free_lock_context(struct inode *inode)
{
}

static inline void locks_init_lock(struct file_lock *fl)
{
}

static inline void locks_init_lease(struct file_lease *fl)
{
}

static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
{
}

static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
{
}

static inline void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
}

static inline void locks_remove_file(struct file *filp)
{
}

static inline void posix_test_lock(struct file *filp, struct file_lock *fl)
{
}

/*
 * !CONFIG_FILE_LOCKING stubs: requests to take a lock fail with -ENOLCK,
 * while the test/cancel/query helpers report "nothing there".
 */
static inline int posix_lock_file(struct file *filp, struct file_lock *fl,
                                  struct file_lock *conflock)
{
        return -ENOLCK;
}

/* There is never a blocked waiter to remove. */
static inline int locks_delete_block(struct file_lock *waiter)
{
        return -ENOENT;
}

static inline int vfs_test_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

static inline int vfs_lock_file(struct file *filp, unsigned int cmd,
                                struct file_lock *fl, struct file_lock *conf)
{
        return -ENOLCK;
}

static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
{
        return 0;
}

static inline bool vfs_inode_has_locks(struct inode *inode)
{
        return false;
}

static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
        return -ENOLCK;
}

/*
 * !CONFIG_FILE_LOCKING lease stubs: breaking a lease trivially succeeds
 * (returns 0), while every attempt to set or modify one fails with -EINVAL.
 */
static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
        return 0;
}

/* Leaves @time untouched. */
static inline void lease_get_mtime(struct inode *inode,
                                   struct timespec64 *time)
{
        return;
}

static inline int generic_setlease(struct file *filp, int arg,
                                    struct file_lease **flp, void **priv)
{
        return -EINVAL;
}

static inline int kernel_setlease(struct file *filp, int arg,
                               struct file_lease **lease, void **priv)
{
        return -EINVAL;
}

static inline int vfs_setlease(struct file *filp, int arg,
                               struct file_lease **lease, void **priv)
{
        return -EINVAL;
}

static inline int lease_modify(struct file_lease *fl, int arg,
                               struct list_head *dispose)
{
        return -EINVAL;
}

/*
 * !CONFIG_FILE_LOCKING stubs: nothing to show, no owner ever has blockers,
 * and an inode never has a lock context.
 */
struct files_struct;
static inline void show_fd_locks(struct seq_file *f,
                        struct file *filp, struct files_struct *files) {}
static inline bool locks_owner_has_blockers(struct file_lock_context *flctx,
                        fl_owner_t owner)
{
        return false;
}

/* Always NULL: callers must cope with a missing lock context. */
static inline struct file_lock_context *
locks_inode_context(const struct inode *inode)
{
        return NULL;
}

#endif /* !CONFIG_FILE_LOCKING */

/*
 * Walk a list of file_locks linked through their c.flc_list member.
 * @_fl is the struct file_lock cursor, @_head the list head to iterate.
 */
#define for_each_file_lock(_fl, _head)        list_for_each_entry(_fl, _head, c.flc_list)

/*
 * Convenience wrapper: resolve @filp to its inode with file_inode() and
 * hand the request to locks_lock_inode_wait(), returning its result.
 */
static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl)
{
        struct inode *inode = file_inode(filp);

        return locks_lock_inode_wait(inode, fl);
}

#ifdef CONFIG_FILE_LOCKING
/*
 * Break any lease on @inode. Returns 0 straight away when the inode has no
 * lock context or its lease list is empty; otherwise returns the result of
 * __break_lease(inode, mode, FL_LEASE).
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        struct file_lock_context *ctx = READ_ONCE(inode->i_flctx);

        /* No lock context attached: nothing to break. */
        if (!ctx)
                return 0;

        /*
         * The lease list is checked without taking a lock. The full barrier
         * makes sure any refcounts taken earlier are done before flc_lease
         * is examined; otherwise this could race with tasks trying to set a
         * new lease on this file.
         */
        smp_mb();
        if (list_empty_careful(&ctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_LEASE);
}

/*
 * Break any delegation on @inode. Returns 0 straight away when the inode
 * has no lock context or its lease list is empty; otherwise returns the
 * result of __break_lease(inode, mode, FL_DELEG).
 */
static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        struct file_lock_context *ctx = READ_ONCE(inode->i_flctx);

        /* No lock context attached: nothing to break. */
        if (!ctx)
                return 0;

        /*
         * The lease list is checked without taking a lock. The full barrier
         * makes sure any refcounts taken earlier are done before flc_lease
         * is examined; otherwise this could race with tasks trying to set a
         * new lease on this file.
         */
        smp_mb();
        if (list_empty_careful(&ctx->flc_lease))
                return 0;

        return __break_lease(inode, mode, FL_DELEG);
}

/*
 * Non-blocking attempt to break a delegation on @inode.
 *
 * When break_deleg() reports -EWOULDBLOCK and the caller supplied
 * @delegated_inode, the inode is stored there and a reference is taken with
 * ihold(). The break_deleg() result is returned unchanged in every case.
 */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        int ret = break_deleg(inode, O_WRONLY|O_NONBLOCK);

        if (ret != -EWOULDBLOCK || !delegated_inode)
                return ret;

        *delegated_inode = inode;
        ihold(inode);

        return ret;
}

/*
 * Blocking counterpart of try_break_deleg(): break the delegation on
 * *@delegated_inode with O_WRONLY (no O_NONBLOCK), then drop the inode
 * reference with iput() and reset *@delegated_inode to NULL.
 *
 * Returns the break_deleg() result. *@delegated_inode is dereferenced
 * unconditionally, so it must point to a valid inode on entry.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        int ret;

        ret = break_deleg(*delegated_inode, O_WRONLY);
        iput(*delegated_inode);
        *delegated_inode = NULL;
        return ret;
}

/*
 * Break any layout lease on @inode.
 *
 * @wait selects blocking behaviour: when false, O_NONBLOCK is added to the
 * mode passed to __break_lease(). Returns 0 when the inode has no lock
 * context or its lease list is empty, otherwise the __break_lease() result.
 */
static inline int break_layout(struct inode *inode, bool wait)
{
        struct file_lock_context *flctx;

        smp_mb();
        /*
         * Take a single snapshot of i_flctx, as break_lease() and
         * break_deleg() do, so the NULL test and the list check below are
         * guaranteed to use the same pointer value.
         */
        flctx = READ_ONCE(inode->i_flctx);
        if (!flctx || list_empty_careful(&flctx->flc_lease))
                return 0;

        return __break_lease(inode,
                        wait ? O_WRONLY : O_WRONLY | O_NONBLOCK,
                        FL_LAYOUT);
}

#else /* !CONFIG_FILE_LOCKING */
/*
 * !CONFIG_FILE_LOCKING stubs: there is never a lease, delegation or layout
 * to break, so these all succeed with 0.
 */
static inline int break_lease(struct inode *inode, unsigned int mode)
{
        return 0;
}

static inline int break_deleg(struct inode *inode, unsigned int mode)
{
        return 0;
}

/* Never returns -EWOULDBLOCK and never touches @delegated_inode. */
static inline int try_break_deleg(struct inode *inode, struct inode **delegated_inode)
{
        return 0;
}

/*
 * Calling this is treated as a bug (BUG()): the try_break_deleg() stub
 * above never stores a delegated inode to wait on.
 */
static inline int break_deleg_wait(struct inode **delegated_inode)
{
        BUG();
        return 0;
}

static inline int break_layout(struct inode *inode, bool wait)
{
        return 0;
}

#endif /* CONFIG_FILE_LOCKING */

#endif /* _LINUX_FILELOCK_H */





















































  246 


















   45 










   45 




   43 


    2 




    2 




   42 
    1 














   42 























   42 

















   34 














   34 














   34 

































   40 



   40 















   40 








   40 

   40 









   40 
































   72 










   65 











   13 











   26 












   35 





   35 

    1 



    1 



   33 
    1 

   34 








   34 





   34 
   26 












   28 




















    7 



   18 


   24 






   12 












    5 
















    7 
    4 






   11 








   24 








   23 


   24 



   18 
    7 












   63 


   63 






    1 
























   60 





    2 














   59 






















   57 


    1 











































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015, 2016 ARM Ltd.
 */

#include <linux/uaccess.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/kvm_host.h>
#include <kvm/arm_vgic.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include "vgic.h"

/*
 * Initialization rules: there are multiple stages to the vgic
 * initialization, both for the distributor and the CPU interfaces.  The basic
 * idea is that even though the VGIC is not functional or not requested from
 * user space, the critical path of the run loop can still call VGIC functions
 * that just won't do anything, without them having to check additional
 * initialization flags to ensure they don't look at uninitialized data
 * structures.
 *
 * Distributor:
 *
 * - kvm_vgic_early_init(): initialization of static data that doesn't
 *   depend on any sizing information or emulation type. No allocation
 *   is allowed there.
 *
 * - vgic_init(): allocation and initialization of the generic data
 *   structures that depend on sizing information (number of CPUs,
 *   number of interrupts). Also initializes the vcpu specific data
 *   structures. Can be executed lazily for GICv2.
 *
 * CPU Interface:
 *
 * - kvm_vgic_vcpu_init(): initialization of static data that doesn't depend
 *   on any sizing information. Private interrupts are allocated if not
 *   already allocated at vgic-creation time.
 */

/* EARLY INIT */

/**
 * kvm_vgic_early_init() - Initialize static VGIC distributor data structures
 * @kvm: The VM whose VGIC distributor should be initialized
 *
 * Only sets up static state that needs neither allocation nor sizing
 * information from userspace: the distributor's LPI xarray is initialized
 * with XA_FLAGS_LOCK_IRQ. vgic_init() calls kvm_vgic_dist_init(), which
 * takes care of the rest.
 */
void kvm_vgic_early_init(struct kvm *kvm)
{
        xa_init_flags(&kvm->arch.vgic.lpi_xa, XA_FLAGS_LOCK_IRQ);
}

/* CREATION */

static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);

/**
 * kvm_vgic_create: triggered by the instantiation of the VGIC device by
 * user space, either through the legacy KVM_CREATE_IRQCHIP ioctl (v2 only)
 * or through the generic KVM_CREATE_DEVICE API ioctl.
 * irqchip_in_kernel() tells you if this function succeeded or not.
 * @kvm: kvm struct pointer
 * @type: KVM_DEV_TYPE_ARM_VGIC_V[23]
 *
 * Must be called with kvm->lock held. Takes all vCPU locks and
 * kvm->arch.config_lock for the duration of the call.
 *
 * Return: 0 on success; -ENODEV if GICv2 emulation was requested but is
 * unavailable; -EBUSY if the vCPUs could not all be locked or one of them
 * has already run; -EEXIST if an in-kernel irqchip already exists; -E2BIG
 * if more vCPUs are online than the model supports; or the error returned
 * by vgic_allocate_private_irqs_locked().
 */
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
        struct kvm_vcpu *vcpu;
        unsigned long i;
        int ret;

        /*
         * This function is also called by the KVM_CREATE_IRQCHIP handler,
         * which had no chance yet to check the availability of the GICv2
         * emulation. So check this here again. KVM_CREATE_DEVICE does
         * the proper checks already.
         */
        if (type == KVM_DEV_TYPE_ARM_VGIC_V2 &&
                !kvm_vgic_global_state.can_emulate_gicv2)
                return -ENODEV;

        /* Must be held to avoid race with vCPU creation */
        lockdep_assert_held(&kvm->lock);

        /* -EBUSY stays the pending error until the "has run" check passes */
        ret = -EBUSY;
        if (!lock_all_vcpus(kvm))
                return ret;

        mutex_lock(&kvm->arch.config_lock);

        if (irqchip_in_kernel(kvm)) {
                ret = -EEXIST;
                goto out_unlock;
        }

        /* Refuse (with -EBUSY) once any vCPU has already run */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                if (vcpu_has_run_once(vcpu))
                        goto out_unlock;
        }
        ret = 0;

        /*
         * Limit the VM's vCPU count to what the chosen GIC model supports.
         * NOTE(review): max_vcpus stays modified even if the -E2BIG check
         * below fails - confirm this is intended.
         */
        if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
                kvm->max_vcpus = VGIC_V2_MAX_CPUS;
        else
                kvm->max_vcpus = VGIC_V3_MAX_CPUS;

        if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
                ret = -E2BIG;
                goto out_unlock;
        }

        /* Allocate the private IRQs of the vCPUs that already exist */
        kvm_for_each_vcpu(i, vcpu, kvm) {
                ret = vgic_allocate_private_irqs_locked(vcpu, type);
                if (ret)
                        break;
        }

        /* On failure, release the private IRQs of every vCPU again */
        if (ret) {
                kvm_for_each_vcpu(i, vcpu, kvm) {
                        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
                        kfree(vgic_cpu->private_irqs);
                        vgic_cpu->private_irqs = NULL;
                }

                goto out_unlock;
        }

        kvm->arch.vgic.in_kernel = true;
        kvm->arch.vgic.vgic_model = type;

        kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;

        /* v2 has a CPU interface base; v3 tracks redistributor regions */
        if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
                kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
        else
                INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);

out_unlock:
        mutex_unlock(&kvm->arch.config_lock);
        unlock_all_vcpus(kvm);
        return ret;
}

/* INIT/DESTROY */

/**
 * kvm_vgic_dist_init: initialize the dist data structures
 * @kvm: kvm struct pointer
 * @nr_spis: number of spis, frozen by caller
 *
 * Allocates the dist->spis array with @nr_spis entries and gives every SPI
 * its INTID (starting at VGIC_NR_PRIVATE_IRQS), an initial reference, vCPU 0
 * as target and the model-specific group/target defaults.
 *
 * Return: 0 on success, -ENOMEM if the SPI array cannot be allocated, or
 * -EINVAL if dist->vgic_model is neither GICv2 nor GICv3 (in which case the
 * array is freed again and dist->spis is reset to NULL).
 */
static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
        unsigned int i;        /* same signedness as nr_spis */

        dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
        if (!dist->spis)
                return -ENOMEM;

        /*
         * In the following code we do not take the irq struct lock since
         * no other action on irq structs can happen while the VGIC is
         * not initialized yet:
         * If someone wants to inject an interrupt or does a MMIO access, we
         * require prior initialization in case of a virtual GICv3 or trigger
         * initialization when using a virtual GICv2.
         */
        for (i = 0; i < nr_spis; i++) {
                struct vgic_irq *irq = &dist->spis[i];

                irq->intid = i + VGIC_NR_PRIVATE_IRQS;
                INIT_LIST_HEAD(&irq->ap_list);
                raw_spin_lock_init(&irq->irq_lock);
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu0;
                kref_init(&irq->refcount);
                switch (dist->vgic_model) {
                case KVM_DEV_TYPE_ARM_VGIC_V2:
                        irq->targets = 0;
                        irq->group = 0;
                        break;
                case KVM_DEV_TYPE_ARM_VGIC_V3:
                        irq->mpidr = 0;
                        irq->group = 1;
                        break;
                default:
                        /* Unknown model: undo the allocation and bail out */
                        kfree(dist->spis);
                        dist->spis = NULL;
                        return -EINVAL;
                }
        }
        return 0;
}

/* Default GICv3 Maintenance Interrupt INTID, as per SBSA */
#define DEFAULT_MI_INTID        25

int kvm_vgic_vcpu_nv_init(struct kvm_vcpu *vcpu)
{
        int ret;

        guard(mutex)(&vcpu->kvm->arch.config_lock);

        /*
         * Matching the tradition established with the timers, provide
         * a default PPI for the maintenance interrupt. It makes
         * things easier to reason about.
         */
        if (vcpu->kvm->arch.vgic.mi_intid == 0)
                vcpu->kvm->arch.vgic.mi_intid = DEFAULT_MI_INTID;
        ret = kvm_vgic_set_owner(vcpu, vcpu->kvm->arch.vgic.mi_intid, vcpu);

        return ret;
}

/*
 * Allocate and initialize the VGIC_NR_PRIVATE_IRQS private interrupts
 * (SGIs and PPIs) of @vcpu for a GIC of model @type.
 *
 * The caller must hold kvm->arch.config_lock. Does nothing and returns 0
 * when the array already exists. Returns -ENOMEM if the allocation fails,
 * 0 otherwise.
 */
static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        int intid;

        lockdep_assert_held(&vcpu->kvm->arch.config_lock);

        /* Already allocated on an earlier call */
        if (vgic_cpu->private_irqs)
                return 0;

        vgic_cpu->private_irqs = kcalloc(VGIC_NR_PRIVATE_IRQS,
                                         sizeof(struct vgic_irq),
                                         GFP_KERNEL_ACCOUNT);
        if (!vgic_cpu->private_irqs)
                return -ENOMEM;

        for (intid = 0; intid < VGIC_NR_PRIVATE_IRQS; intid++) {
                struct vgic_irq *irq = &vgic_cpu->private_irqs[intid];

                irq->intid = intid;
                irq->vcpu = NULL;
                irq->target_vcpu = vcpu;
                INIT_LIST_HEAD(&irq->ap_list);
                raw_spin_lock_init(&irq->irq_lock);
                kref_init(&irq->refcount);

                /*
                 * SGIs are enabled and edge-triggered, PPIs are
                 * level-triggered (and left disabled by the zeroing
                 * allocation).
                 */
                if (vgic_irq_is_sgi(intid)) {
                        irq->enabled = 1;
                        irq->config = VGIC_CONFIG_EDGE;
                } else {
                        irq->config = VGIC_CONFIG_LEVEL;
                }

                /* Model-specific group and routing defaults */
                if (type == KVM_DEV_TYPE_ARM_VGIC_V3) {
                        irq->group = 1;
                        irq->mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
                } else if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
                        irq->group = 0;
                        irq->targets = BIT(vcpu->vcpu_id);
                }
        }

        return 0;
}

/*
 * Locking wrapper: run vgic_allocate_private_irqs_locked() for @vcpu with
 * kvm->arch.config_lock held and return its result.
 */
static int vgic_allocate_private_irqs(struct kvm_vcpu *vcpu, u32 type)
{
        guard(mutex)(&vcpu->kvm->arch.config_lock);

        return vgic_allocate_private_irqs_locked(vcpu, type);
}

/**
 * kvm_vgic_vcpu_init() - Initialize static VGIC VCPU data
 * structures and register VCPU-specific KVM iodevs
 *
 * @vcpu: pointer to the VCPU being created and initialized
 *
 * Only do initialization, but do not actually enable the
 * VGIC CPU interface
 */
int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
        int ret = 0;

        vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;

        INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
        raw_spin_lock_init(&vgic_cpu->ap_list_lock);
        atomic_set(&vgic_cpu->vgic_v3.its_vpe.vlpi_count, 0);

        if (!irqchip_in_kernel(vcpu->kvm))
                return 0;

        ret = vgic_allocate_private_irqs(vcpu, dist->vgic_model);
        if (ret)
                return ret;

        /*
         * If we are creating a VCPU with a GICv3 we must also register the
         * KVM io device for the redistributor that belongs to this VCPU.
         */
        if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                mutex_lock(&vcpu->kvm->slots_lock);
                ret = vgic_register_redist_iodev(vcpu);
                mutex_unlock(&vcpu->kvm->slots_lock);
        }
        return ret;
}

/*
 * Enable the VGIC CPU interface of @vcpu using the backend that matches
 * the global VGIC type: vgic_v2_enable() for VGIC_V2, vgic_v3_enable()
 * for anything else.
 */
static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
{
        if (kvm_vgic_global_state.type != VGIC_V2) {
                vgic_v3_enable(vcpu);
                return;
        }

        vgic_v2_enable(vcpu);
}

/*
 * vgic_init: allocates and initializes dist and vcpu data structures
 * depending on two dimensioning parameters:
 * - the number of spis
 * - the number of vcpus
 * The function is generally called when nr_spis has been explicitly set
 * by the guest through the KVM DEVICE API. If not nr_spis is set to 256.
 * vgic_initialized() returns true when this function has succeeded.
 *
 * Must be called with kvm->arch.config_lock held.
 *
 * Returns 0 on success (including when the VGIC was already initialized),
 * -EBUSY if a vCPU is still in the middle of being created, or the error
 * reported by one of the initialization steps.
 */
int vgic_init(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct kvm_vcpu *vcpu;
        int ret = 0;
        unsigned long idx;

        lockdep_assert_held(&kvm->arch.config_lock);

        if (vgic_initialized(kvm))
                return 0;

        /* Are we also in the middle of creating a VCPU? */
        if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus))
                return -EBUSY;

        /* freeze the number of spis */
        if (!dist->nr_spis)
                dist->nr_spis = VGIC_NR_IRQS_LEGACY - VGIC_NR_PRIVATE_IRQS;

        ret = kvm_vgic_dist_init(kvm, dist->nr_spis);
        if (ret)
                goto out;

        /*
         * NOTE(review): from here on dist->spis is allocated, and the error
         * paths below return without freeing it. Presumably it is released
         * later by kvm_vgic_dist_destroy() - confirm that a failed call
         * cannot be retried and overwrite the pointer.
         */

        /*
         * If we have GICv4.1 enabled, unconditionally request enable the
         * v4 support so that we get HW-accelerated vSGIs. Otherwise, only
         * enable it if we present a virtual ITS to the guest.
         */
        if (vgic_supports_direct_msis(kvm)) {
                ret = vgic_v4_init(kvm);
                if (ret)
                        goto out;
        }

        kvm_for_each_vcpu(idx, vcpu, kvm)
                kvm_vgic_vcpu_enable(vcpu);

        ret = kvm_vgic_setup_default_irq_routing(kvm);
        if (ret)
                goto out;

        vgic_debug_init(kvm);

        /*
         * If userspace didn't set the GIC implementation revision,
         * default to the latest and greatest. You know you want it.
         */
        if (!dist->implementation_rev)
                dist->implementation_rev = KVM_VGIC_IMP_REV_LATEST;
        /* Set last: this is what vgic_initialized() reports as success */
        dist->initialized = true;

out:
        return ret;
}

/*
 * Tear down the distributor state of @kvm: mark it neither ready nor
 * initialized, free the SPI array, reset the base addresses, free the
 * GICv3 redistributor regions, tear down GICv4 support if it is in use
 * and destroy the LPI xarray.
 */
static void kvm_vgic_dist_destroy(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        struct vgic_redist_region *rdreg, *next;

        dist->ready = false;
        dist->initialized = false;

        kfree(dist->spis);
        dist->spis = NULL;
        dist->nr_spis = 0;
        dist->vgic_dist_base = VGIC_ADDR_UNDEF;

        if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                /* _safe iteration: each region is freed while walking */
                list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list)
                        vgic_v3_free_redist_region(kvm, rdreg);
                INIT_LIST_HEAD(&dist->rd_regions);
        } else {
                dist->vgic_cpu_base = VGIC_ADDR_UNDEF;
        }

        if (vgic_supports_direct_msis(kvm))
                vgic_v4_teardown(kvm);

        xa_destroy(&dist->lpi_xa);
}

/*
 * Release the per-vCPU VGIC state of @vcpu: flush its pending LPIs, reset
 * the ap_list head, free the private interrupts and, for a GICv3, reset
 * the redistributor base address (unregistering the redistributor iodev
 * only for vCPUs whose creation failed, see below).
 *
 * Both callers in this file hold kvm->slots_lock.
 */
static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;

        /*
         * Retire all pending LPIs on this vcpu anyway as we're
         * going to destroy it.
         */
        vgic_flush_pending_lpis(vcpu);

        INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
        kfree(vgic_cpu->private_irqs);
        vgic_cpu->private_irqs = NULL;

        if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
                /*
                 * If this vCPU is being destroyed because of a failed creation
                 * then unregister the redistributor to avoid leaving behind a
                 * dangling pointer to the vCPU struct.
                 *
                 * vCPUs that have been successfully created (i.e. added to
                 * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as
                 * this function gets called while holding kvm->arch.config_lock
                 * in the VM teardown path and would otherwise introduce a lock
                 * inversion w.r.t. kvm->srcu.
                 *
                 * vCPUs that failed creation are torn down outside of the
                 * kvm->arch.config_lock and do not get unregistered in
                 * kvm_vgic_destroy(), meaning it is both safe and necessary to
                 * do so here.
                 */
                if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu)
                        vgic_unregister_redist_iodev(vcpu);

                vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF;
        }
}

void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;

        mutex_lock(&kvm->slots_lock);
        __kvm_vgic_vcpu_destroy(vcpu);
        mutex_unlock(&kvm->slots_lock);
}

/*
 * Tear down the whole VGIC of @kvm: debug state, per-vCPU state and then
 * the distributor, all under slots_lock and config_lock. The GICv3
 * redistributor iodevs are unregistered afterwards with only slots_lock
 * held.
 */
void kvm_vgic_destroy(struct kvm *kvm)
{
        struct kvm_vcpu *vcpu;
        unsigned long i;

        mutex_lock(&kvm->slots_lock);
        mutex_lock(&kvm->arch.config_lock);

        vgic_debug_destroy(kvm);

        kvm_for_each_vcpu(i, vcpu, kvm)
                __kvm_vgic_vcpu_destroy(vcpu);

        kvm_vgic_dist_destroy(kvm);

        mutex_unlock(&kvm->arch.config_lock);

        /*
         * Done after dropping config_lock: per the comment in
         * __kvm_vgic_vcpu_destroy(), unregistering while holding it would
         * introduce a lock inversion w.r.t. kvm->srcu.
         */
        if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
                kvm_for_each_vcpu(i, vcpu, kvm)
                        vgic_unregister_redist_iodev(vcpu);

        mutex_unlock(&kvm->slots_lock);
}

/**
 * vgic_lazy_init: Lazy init is only allowed if the GIC exposed to the guest
 * is a GICv2. A GICv3 must be explicitly initialized by userspace using the
 * KVM_DEV_ARM_VGIC_GRP_CTRL KVM_DEVICE group.
 * @kvm: kvm struct pointer
 *
 * Return: 0 if the VGIC is already initialized, -EBUSY if it is not and
 * the model is not GICv2, otherwise the result of vgic_init().
 */
int vgic_lazy_init(struct kvm *kvm)
{
        int ret;

        if (likely(vgic_initialized(kvm)))
                return 0;

        /*
         * Automatic initialization is only offered for the legacy GICv2
         * case. Every other model has to be initialized explicitly, once
         * set up, through the respective KVM device call.
         */
        if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
                return -EBUSY;

        mutex_lock(&kvm->arch.config_lock);
        ret = vgic_init(kvm);
        mutex_unlock(&kvm->arch.config_lock);

        return ret;
}

/* RESOURCE MAPPING */

/**
 * kvm_vgic_map_resources - map the MMIO regions
 * @kvm: kvm struct pointer
 *
 * Map the MMIO regions depending on the VGIC model exposed to the guest
 * called on the first VCPU run.
 * Also map the virtual CPU interface into the VM.
 * v2 calls vgic_init() if not already done.
 * v3 and derivatives return an error if the VGIC is not initialized.
 * vgic_ready() returns true if this function has succeeded.
 */
int kvm_vgic_map_resources(struct kvm *kvm)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        enum vgic_type type;
        gpa_t dist_base;
        int ret = 0;

        /* Fast path: checked without taking any lock. */
        if (likely(vgic_ready(kvm)))
                return 0;

        mutex_lock(&kvm->slots_lock);
        mutex_lock(&kvm->arch.config_lock);
        /* Re-check now that both locks are held. */
        if (vgic_ready(kvm))
                goto out;

        /* No in-kernel irqchip: leave with ret == 0, nothing is mapped. */
        if (!irqchip_in_kernel(kvm))
                goto out;

        /* Model-specific mapping; everything that is not a GICv2 is treated as v3. */
        if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
                ret = vgic_v2_map_resources(kvm);
                type = VGIC_V2;
        } else {
                ret = vgic_v3_map_resources(kvm);
                type = VGIC_V3;
        }

        if (ret)
                goto out;

        /*
         * Snapshot the distributor base while config_lock is still held;
         * the iodev registration below runs with only slots_lock held.
         */
        dist_base = dist->vgic_dist_base;
        mutex_unlock(&kvm->arch.config_lock);

        ret = vgic_register_dist_iodev(kvm, dist_base, type);
        if (ret) {
                kvm_err("Unable to register VGIC dist MMIO regions\n");
                goto out_slots;
        }

        /*
         * kvm_io_bus_register_dev() guarantees all readers see the new MMIO
         * registration before returning through synchronize_srcu(), which also
         * implies a full memory barrier. As such, marking the distributor as
         * 'ready' here is guaranteed to be ordered after all vCPUs having seen
         * a completely configured distributor.
         */
        dist->ready = true;
        goto out_slots;
out:
        /* Exit path for callers that still hold config_lock. */
        mutex_unlock(&kvm->arch.config_lock);
out_slots:
        /* Any failure on the way here marks the whole VM as dead. */
        if (ret)
                kvm_vm_dead(kvm);

        mutex_unlock(&kvm->slots_lock);

        return ret;
}

/* GENERIC PROBE */

/* Enable the per-CPU vgic maintenance interrupt (trigger type 0). */
void kvm_vgic_cpu_up(void)
{
        unsigned int maint_irq = kvm_vgic_global_state.maint_irq;

        enable_percpu_irq(maint_irq, 0);
}


/* Disable the per-CPU vgic maintenance interrupt. */
void kvm_vgic_cpu_down(void)
{
        unsigned int maint_irq = kvm_vgic_global_state.maint_irq;

        disable_percpu_irq(maint_irq);
}

/*
 * Handler for the vgic maintenance interrupt.
 *
 * @data points to a slot holding a vcpu pointer, which may be NULL.
 * Always reports IRQ_HANDLED.
 */
static irqreturn_t vgic_maintenance_handler(int irq, void *data)
{
        struct kvm_vcpu **vcpu_slot = data;
        struct kvm_vcpu *vcpu = *vcpu_slot;

        /*
         * The maintenance interrupt is not guaranteed to be delivered
         * synchronously, so all it is good for is forcing an exit from
         * the VM; EOIed interrupts are dealt with on the exit path
         * (see vgic_fold_lr_state).
         *
         * Nested virtualization is the exception and needs dedicated
         * handling right here.
         */
        if (!vcpu)
                return IRQ_HANDLED;

        if (vgic_state_is_nested(vcpu))
                vgic_v3_handle_nested_maint_irq(vcpu);

        return IRQ_HANDLED;
}

/* Heap copy of the host GIC description; NULL until vgic_set_kvm_info(). */
static struct gic_kvm_info *gic_kvm_info;

/*
 * Store a private copy of @info for later use. May only be called while no
 * copy is stored (BUG otherwise). If the allocation fails, gic_kvm_info
 * simply stays NULL.
 */
void __init vgic_set_kvm_info(const struct gic_kvm_info *info)
{
        struct gic_kvm_info *copy;

        BUG_ON(gic_kvm_info != NULL);

        copy = kmalloc(sizeof(*info), GFP_KERNEL);
        if (copy)
                *copy = *info;

        gic_kvm_info = copy;
}

/**
 * kvm_vgic_init_cpu_hardware - initialize the GIC VE hardware
 *
 * Initialize the GIC VE hardware for a specific CPU. Must be called from
 * non-preemptible context (BUG otherwise).
 */
void kvm_vgic_init_cpu_hardware(void)
{
        BUG_ON(preemptible());

        /*
         * Make sure the list registers start out clear, so that only the
         * registers actually in use have to be programmed later on.
         */
        if (kvm_vgic_global_state.type != VGIC_V2) {
                kvm_call_hyp(__vgic_v3_init_lrs);
                return;
        }

        vgic_v2_init_lrs();
}

/**
 * kvm_vgic_hyp_init - populate the kvm_vgic_global_state variable
 *
 * Populates kvm_vgic_global_state according to the host GIC model and
 * accordingly calls either vgic_v2_probe() or vgic_v3_probe(), which
 * registers the KVM_DEVICE that can be instantiated by a guest later on.
 * The stored gic_kvm_info copy is freed once the probe has run, whether or
 * not the probe succeeded.
 *
 * Return: 0 on success, -ENODEV if no GIC info was stored or the GIC type
 * is unknown, -ENXIO if a maintenance IRQ is required but missing,
 * otherwise the error returned by the probe or by request_percpu_irq().
 */
int kvm_vgic_hyp_init(void)
{
        bool has_mask;
        int ret;

        if (!gic_kvm_info)
                return -ENODEV;

        /* Cached here because gic_kvm_info is freed before the last use. */
        has_mask = !gic_kvm_info->no_maint_irq_mask;

        if (has_mask && !gic_kvm_info->maint_irq) {
                kvm_err("No vgic maintenance irq\n");
                return -ENXIO;
        }

        /*
         * If we get one of these oddball non-GICs, taint the kernel,
         * as we have no idea of how they *really* behave.
         */
        if (gic_kvm_info->no_hw_deactivation) {
                kvm_info("Non-architectural vgic, tainting kernel\n");
                add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
                kvm_vgic_global_state.no_hw_deactivation = true;
        }

        switch (gic_kvm_info->type) {
        case GIC_V2:
                ret = vgic_v2_probe(gic_kvm_info);
                break;
        case GIC_V3:
                ret = vgic_v3_probe(gic_kvm_info);
                if (!ret) {
                        static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
                        kvm_info("GIC system register CPU interface enabled\n");
                }
                break;
        default:
                /* Unknown host GIC type. */
                ret = -ENODEV;
        }

        /* Recorded even when the probe failed; the copy is freed right after. */
        kvm_vgic_global_state.maint_irq = gic_kvm_info->maint_irq;

        kfree(gic_kvm_info);
        gic_kvm_info = NULL;

        if (ret)
                return ret;

        /* No mask support and no maintenance IRQ: nothing to request. */
        if (!has_mask && !kvm_vgic_global_state.maint_irq)
                return 0;

        ret = request_percpu_irq(kvm_vgic_global_state.maint_irq,
                                 vgic_maintenance_handler,
                                 "vgic", kvm_get_running_vcpus());
        if (ret) {
                kvm_err("Cannot register interrupt %d\n",
                        kvm_vgic_global_state.maint_irq);
                return ret;
        }

        kvm_info("vgic interrupt IRQ%d\n", kvm_vgic_global_state.maint_irq);
        return 0;
}










   22 



























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM printk

#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_PRINTK_H

#include <linux/tracepoint.h>

/*
 * "console" trace event: records a copy of @text (@len bytes) in the
 * dynamic array 'msg', sized len + 1 so that the terminating NUL written
 * in TP_fast_assign() fits. One trailing '\n' is dropped before the copy.
 * The event is printed as a plain "%s" string.
 */
TRACE_EVENT(console,
        TP_PROTO(const char *text, size_t len),

        TP_ARGS(text, len),

        TP_STRUCT__entry(
                __dynamic_array(char, msg, len + 1)
        ),

        TP_fast_assign(
                /*
                 * Each trace entry is printed in a new line.
                 * If the msg finishes with '\n', cut it off
                 * to avoid blank lines in the trace.
                 */
                if ((len > 0) && (text[len-1] == '\n'))
                        len -= 1;

                /* Copy the (possibly shortened) text and NUL-terminate it. */
                memcpy(__get_str(msg), text, len);
                __get_str(msg)[len] = 0;
        ),

        TP_printk("%s", __get_str(msg))
);
#endif /* _TRACE_PRINTK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Linux Socket Filter Data Structures
 */
#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__

#include <linux/atomic.h>
#include <linux/bpf.h>
#include <linux/refcount.h>
#include <linux/compat.h>
#include <linux/skbuff.h>
#include <linux/linkage.h>
#include <linux/printk.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
#include <linux/sched/clock.h>
#include <linux/capability.h>
#include <linux/set_memory.h>
#include <linux/kallsyms.h>
#include <linux/if_vlan.h>
#include <linux/vmalloc.h>
#include <linux/sockptr.h>
#include <crypto/sha1.h>
#include <linux/u64_stats_sync.h>

#include <net/sch_generic.h>

#include <asm/byteorder.h>
#include <uapi/linux/filter.h>

struct sk_buff;
struct sock;
struct seccomp_data;
struct bpf_prog_aux;
struct xdp_rxq_info;
struct xdp_buff;
struct sock_reuseport;
struct ctl_table;
struct ctl_table_header;

/* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
 * calls in BPF_CALL instruction.
 *
 * The five argument registers map to BPF_REG_1..BPF_REG_5, the context
 * to BPF_REG_6 and the frame pointer to BPF_REG_10.
 */
#define BPF_REG_ARG1        BPF_REG_1
#define BPF_REG_ARG2        BPF_REG_2
#define BPF_REG_ARG3        BPF_REG_3
#define BPF_REG_ARG4        BPF_REG_4
#define BPF_REG_ARG5        BPF_REG_5
#define BPF_REG_CTX        BPF_REG_6
#define BPF_REG_FP        BPF_REG_10

/* Additional register mappings for converted user programs.
 * Note that BPF_REG_TMP shares BPF_REG_2 with BPF_REG_ARG2.
 */
#define BPF_REG_A        BPF_REG_0
#define BPF_REG_X        BPF_REG_7
#define BPF_REG_TMP        BPF_REG_2        /* scratch reg */
#define BPF_REG_D        BPF_REG_8        /* data, callee-saved */
#define BPF_REG_H        BPF_REG_9        /* hlen, callee-saved */

/* Kernel hidden auxiliary/helper register.
 * BPF_REG_AX uses index MAX_BPF_REG, so the extended register count
 * MAX_BPF_EXT_REG is MAX_BPF_REG + 1; MAX_BPF_JIT_REG aliases it.
 */
#define BPF_REG_AX                MAX_BPF_REG
#define MAX_BPF_EXT_REG                (MAX_BPF_REG + 1)
#define MAX_BPF_JIT_REG                MAX_BPF_EXT_REG

/* unused opcode to mark special call to bpf_tail_call() helper */
#define BPF_TAIL_CALL        0xf0

/* unused opcode to mark special load instruction. Same as BPF_ABS */
#define BPF_PROBE_MEM        0x20

/* unused opcode to mark special ldsx instruction. Same as BPF_IND */
#define BPF_PROBE_MEMSX        0x40

/* unused opcode to mark special load instruction. Same as BPF_MSH */
#define BPF_PROBE_MEM32        0xa0

/* unused opcode to mark special atomic instruction.
 * NOTE(review): has the same numeric value (0xe0) as BPF_CALL_ARGS below;
 * presumably the two are only ever combined with different instruction
 * classes - confirm before adding new users.
 */
#define BPF_PROBE_ATOMIC 0xe0

/* unused opcode to mark call to interpreter with arguments */
#define BPF_CALL_ARGS        0xe0

/* unused opcode to mark speculation barrier for mitigating
 * Speculative Store Bypass
 */
#define BPF_NOSPEC        0xc0

/* As per nm, we expose JITed images as text (code) section for
 * kallsyms. That way, tools like perf can find it to match
 * addresses.
 */
#define BPF_SYM_ELF_TYPE        't'

/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK        512

/* Helper macros for filter block array initializers. */

/* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg
 *
 * Each macro expands to a 'struct bpf_insn' compound literal:
 *   OP  - ALU operation, folded into .code through BPF_OP()
 *   DST - destination register, stored in .dst_reg
 *   SRC - source register, stored in .src_reg
 *   OFF - value stored in .off (the non-_OFF variants pass 0)
 * .imm is always 0. The ALU64 forms use the BPF_ALU64 class, the ALU32
 * forms the BPF_ALU class.
 */

#define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_OP(OP) | BPF_X,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

#define BPF_ALU64_REG(OP, DST, SRC)                                \
        BPF_ALU64_REG_OFF(OP, DST, SRC, 0)

#define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_OP(OP) | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

#define BPF_ALU32_REG(OP, DST, SRC)                                \
        BPF_ALU32_REG_OFF(OP, DST, SRC, 0)

/* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32
 *
 * Same layout as the register forms, except that the source is the
 * immediate: .code carries BPF_K, .src_reg is 0 and IMM is stored in .imm.
 * The non-_OFF variants pass 0 as OFF.
 */

#define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_OP(OP) | BPF_K,        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })
#define BPF_ALU64_IMM(OP, DST, IMM)                                \
        BPF_ALU64_IMM_OFF(OP, DST, IMM, 0)

#define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_OP(OP) | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = OFF,                                        \
                .imm   = IMM })
#define BPF_ALU32_IMM(OP, DST, IMM)                                \
        BPF_ALU32_IMM_OFF(OP, DST, IMM, 0)

/* Endianness conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu()
 *
 * Encoded in the BPF_ALU class; TYPE is folded into .code through
 * BPF_SRC() and LEN is stored in .imm.
 */

#define BPF_ENDIAN(TYPE, DST, LEN)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_END | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = LEN })

/* Byte Swap, bswap16/32/64
 *
 * Unlike BPF_ENDIAN this uses the BPF_ALU64 class, always with
 * BPF_SRC(BPF_TO_LE); LEN is stored in .imm.
 */

#define BPF_BSWAP(DST, LEN)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE),        \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = LEN })

/* Short form of mov, dst_reg = src_reg
 *
 * 64-bit (BPF_ALU64) and 32-bit (BPF_ALU) register moves; .off and .imm
 * are both 0.
 */

#define BPF_MOV64_REG(DST, SRC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

#define BPF_MOV32_REG(DST, SRC)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = 0 })

/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
 * dst_reg = src_reg + <percpu_base_off>
 * BPF_ADDR_PERCPU is used as a special insn->off value.
 *
 * The encoding is a plain 64-bit register mov whose .off is set to
 * BPF_ADDR_PERCPU; insn_is_mov_percpu_addr() tests for exactly that.
 */
#define BPF_ADDR_PERCPU        (-1)

#define BPF_MOV64_PERCPU_REG(DST, SRC)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = BPF_ADDR_PERCPU,                        \
                .imm   = 0 })

/* True if @insn is a 64-bit register mov whose off field is BPF_ADDR_PERCPU. */
static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_X))
                return false;

        return insn->off == BPF_ADDR_PERCPU;
}

/* Short form of mov, dst_reg = imm32
 *
 * 64-bit (BPF_ALU64) and 32-bit (BPF_ALU) immediate moves; IMM is stored
 * in .imm, .src_reg and .off are 0.
 */

#define BPF_MOV64_IMM(DST, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_MOV32_IMM(DST, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_K,                \
                .dst_reg = DST,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Short form of movsx, dst_reg = (s8,s16,s32)src_reg
 *
 * Same .code as the plain register movs; the only difference is that OFF
 * is stored in .off.
 * NOTE(review): OFF presumably selects the source width (8, 16 or 32) -
 * confirm against the instruction set documentation.
 */

#define BPF_MOVSX64_REG(DST, SRC, OFF)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

#define BPF_MOVSX32_REG(DST, SRC, OFF)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Special form of mov32, used for doing explicit zero extension on dst.
 * DST is used as both destination and source register, and .imm = 1 is
 * the marker that insn_is_zext() checks for.
 */
#define BPF_ZEXT_REG(DST)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_X,                \
                .dst_reg = DST,                                        \
                .src_reg = DST,                                        \
                .off   = 0,                                        \
                .imm   = 1 })

/* True if @insn is a 32-bit register mov carrying the imm == 1 marker. */
static inline bool insn_is_zext(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU | BPF_MOV | BPF_X))
                return false;

        return insn->imm == 1;
}

/* An addr_space_cast from as(0) to as(1) converts bpf arena pointers into
 * pointers in the user vma. Such an insn is a 64-bit register mov with
 * off == BPF_ADDR_SPACE_CAST and imm == 1U << 16.
 */
static inline bool insn_is_cast_user(const struct bpf_insn *insn)
{
        if (insn->code != (BPF_ALU64 | BPF_MOV | BPF_X))
                return false;

        if (insn->off != BPF_ADDR_SPACE_CAST)
                return false;

        return insn->imm == 1U << 16;
}

/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
#define BPF_LD_IMM64(DST, IMM)                                        \
        BPF_LD_IMM64_RAW(DST, 0, IMM)

/* Expands to TWO comma-separated 'struct bpf_insn' initializers, so it
 * occupies two consecutive slots of an instruction array. The first
 * carries the opcode, DST, SRC and the low 32 bits of IMM; the second has
 * a zero opcode and carries the upper 32 bits of IMM.
 */
#define BPF_LD_IMM64_RAW(DST, SRC, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_DW | BPF_IMM,                \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = (__u32) (IMM) }),                        \
        ((struct bpf_insn) {                                        \
                .code  = 0, /* zero is reserved opcode */        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = ((__u64) (IMM)) >> 32 })

/* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd;
 * .src_reg is set to BPF_PSEUDO_MAP_FD and MAP_FD is passed as the immediate.
 */
#define BPF_LD_MAP_FD(DST, MAP_FD)                                \
        BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD)

/* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32
 *
 * TYPE is folded into .code through BPF_SRC(); both SRC and IMM are
 * stored regardless of TYPE.
 */

#define BPF_MOV64_RAW(TYPE, DST, SRC, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

#define BPF_MOV32_RAW(TYPE, DST, SRC, IMM)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_ALU | BPF_MOV | BPF_SRC(TYPE),        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Direct packet access, R0 = *(uint *) (skb->data + imm32)
 *
 * SIZE is folded into .code through BPF_SIZE(); IMM is stored in .imm.
 * .dst_reg and .src_reg are both 0.
 */

#define BPF_LD_ABS(SIZE, IMM)                                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS,        \
                .dst_reg = 0,                                        \
                .src_reg = 0,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32)
 *
 * As BPF_LD_ABS, with the additional SRC register stored in .src_reg.
 */

#define BPF_LD_IND(SIZE, SRC, IMM)                                \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LD | BPF_SIZE(SIZE) | BPF_IND,        \
                .dst_reg = 0,                                        \
                .src_reg = SRC,                                        \
                .off   = 0,                                        \
                .imm   = IMM })

/* Memory load, dst_reg = *(uint *) (src_reg + off16)
 *
 * SIZE is folded into .code through BPF_SIZE(); OFF is stored in .off and
 * .imm is 0. The same holds for the two macros that follow.
 */

#define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Memory load, dst_reg = *(signed size *) (src_reg + off16)
 * (BPF_MEMSX mode instead of BPF_MEM)
 */

#define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })

/* Memory store, *(uint *) (dst_reg + off16) = src_reg */

#define BPF_STX_MEM(SIZE, DST, SRC, OFF)                        \
        ((struct bpf_insn) {                                        \
                .code  = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM,        \
                .dst_reg = DST,                                        \
                .src_reg = SRC,                                        \
                .off   = OFF,                                        \
                .imm   = 0 })


/*
 * Atomic operations, selected by the OP value stored in the imm field:
 *
 *   BPF_ADD                  *(uint *) (dst_reg + off16) += src_reg
 *   BPF_AND                  *(uint *) (dst_reg + off16) &= src_reg
 *   BPF_OR                   *(uint *) (dst_reg + off16) |= src_reg
 *   BPF_XOR                  *(uint *) (dst_reg + off16) ^= src_reg
 *   BPF_ADD | BPF_FETCH      src_reg = atomic_fetch_add(dst_reg + off16, src_reg);
 *   BPF_AND | BPF_FETCH      src_reg = atomic_fetch_and(dst_reg + off16, src_reg);
 *   BPF_OR | BPF_FETCH       src_reg = atomic_fetch_or(dst_reg + off16, src_reg);
 *   BPF_XOR | BPF_FETCH      src_reg = atomic_fetch_xor(dst_reg + off16, src_reg);
 *   BPF_XCHG                 src_reg = atomic_xchg(dst_reg + off16, src_reg)
 *   BPF_CMPXCHG              r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg)
 *   BPF_LOAD_ACQ             dst_reg = smp_load_acquire(src_reg + off16)
 *   BPF_STORE_REL            smp_store_release(dst_reg + off16, src_reg)
 */
#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF)                                \
        BPF_RAW_INSN(BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, DST, SRC, OFF, OP)

/* Legacy alias: exclusive add is the BPF_ADD atomic operation. */
#define BPF_STX_XADD(SIZE, DST, SRC, OFF)                                \
        BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF)

/* Memory store of an immediate, *(uint *) (dst_reg + off16) = imm32 */
#define BPF_ST_MEM(SIZE, DST, OFF, IMM)                                        \
        BPF_RAW_INSN(BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, DST, 0, OFF, IMM)

/*
 * Jump instruction builders, all expressed through BPF_RAW_INSN().
 */

/* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */
#define BPF_JMP_REG(OP, DST, SRC, OFF)                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

/* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */
#define BPF_JMP_IMM(OP, DST, IMM, OFF)                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */
#define BPF_JMP32_REG(OP, DST, SRC, OFF)                                \
        BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_X, DST, SRC, OFF, 0)

/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */
#define BPF_JMP32_IMM(OP, DST, IMM, OFF)                                \
        BPF_RAW_INSN(BPF_JMP32 | BPF_OP(OP) | BPF_K, DST, 0, OFF, IMM)

/* Unconditional jumps, goto pc + off16 */
#define BPF_JMP_A(OFF)                                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_JA, 0, 0, OFF, 0)

/* Unconditional jumps, gotol pc + imm32 */
#define BPF_JMP32_A(IMM)                                                \
        BPF_RAW_INSN(BPF_JMP32 | BPF_JA, 0, 0, 0, IMM)

/* Relative call: src_reg carries BPF_PSEUDO_CALL, imm carries the target. */
#define BPF_CALL_REL(TGT)                                                \
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_CALL, 0, TGT)

/* Convert function address to BPF immediate (offset from __bpf_call_base) */
#define BPF_CALL_IMM(x)        ((void *)(x) - (void *)__bpf_call_base)

/* Helper call: imm is the helper's offset as computed by BPF_CALL_IMM(). */
#define BPF_EMIT_CALL(FUNC)                                                \
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_CALL_IMM(FUNC))

/* Kfunc call: src_reg carries BPF_PSEUDO_KFUNC_CALL. */
#define BPF_CALL_KFUNC(OFF, IMM)                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, OFF, IMM)

/*
 * Raw code statement block: build a struct bpf_insn compound literal
 * from explicit values for every field.
 */
#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM)                                \
        ((struct bpf_insn) { .code = CODE, .dst_reg = DST,                \
                             .src_reg = SRC, .off = OFF, .imm = IMM })

/* Program exit */
#define BPF_EXIT_INSN()                                                        \
        BPF_RAW_INSN(BPF_JMP | BPF_EXIT, 0, 0, 0, 0)

/* Speculation barrier */
#define BPF_ST_NOSPEC()                                                        \
        BPF_RAW_INSN(BPF_ST | BPF_NOSPEC, 0, 0, 0, 0)

/*
 * Internal classic blocks for direct assignment: wrap the initializer
 * produced by BPF_STMT()/BPF_JUMP() in a struct sock_filter compound literal.
 */
#define __BPF_STMT(CODE, K) ((struct sock_filter) BPF_STMT(CODE, K))
#define __BPF_JUMP(CODE, K, JT, JF) ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF))

/*
 * Map an access width in bytes to the matching BPF size modifier:
 * sizeof(u8) -> BPF_B, sizeof(u16) -> BPF_H, sizeof(u32) -> BPF_W,
 * sizeof(u64) -> BPF_DW. Any other width evaluates to -EINVAL.
 *
 * The argument is parenthesized so compound expressions (e.g. "a ? 1 : 2")
 * compare as a whole, and the temporary uses a reserved-style name so that
 * a caller variable called "bpf_size" is not captured by the macro body.
 * @bytes may be evaluated more than once and must have no side effects.
 */
#define bytes_to_bpf_size(bytes)                                \
({                                                                \
        int __bpf_size_ret = -EINVAL;                                \
                                                                \
        if ((bytes) == sizeof(u8))                                \
                __bpf_size_ret = BPF_B;                                \
        else if ((bytes) == sizeof(u16))                        \
                __bpf_size_ret = BPF_H;                                \
        else if ((bytes) == sizeof(u32))                        \
                __bpf_size_ret = BPF_W;                                \
        else if ((bytes) == sizeof(u64))                        \
                __bpf_size_ret = BPF_DW;                        \
                                                                \
        __bpf_size_ret;                                                \
})

/*
 * Inverse of bytes_to_bpf_size(): map a BPF size modifier (BPF_B, BPF_H,
 * BPF_W, BPF_DW) to the access width in bytes. Any other value evaluates
 * to -EINVAL.
 *
 * The argument is parenthesized so masked expressions such as
 * "code & 0x18" compare as a whole, and the temporary uses a reserved-style
 * name so that a caller variable called "bytes" is not captured.
 * @bpf_size may be evaluated more than once and must have no side effects.
 */
#define bpf_size_to_bytes(bpf_size)                                \
({                                                                \
        int __bpf_bytes_ret = -EINVAL;                                \
                                                                \
        if ((bpf_size) == BPF_B)                                \
                __bpf_bytes_ret = sizeof(u8);                        \
        else if ((bpf_size) == BPF_H)                                \
                __bpf_bytes_ret = sizeof(u16);                        \
        else if ((bpf_size) == BPF_W)                                \
                __bpf_bytes_ret = sizeof(u32);                        \
        else if ((bpf_size) == BPF_DW)                                \
                __bpf_bytes_ret = sizeof(u64);                        \
                                                                \
        __bpf_bytes_ret;                                        \
})

/*
 * BPF size modifier for sizeof(@type). The build fails when the width
 * has no BPF size modifier.
 */
#define BPF_SIZEOF(type)                                                \
({                                                                        \
        const int __sz = bytes_to_bpf_size(sizeof(type));                \
                                                                        \
        BUILD_BUG_ON(__sz < 0);                                                \
        __sz;                                                                \
})

/* Same as BPF_SIZEOF(), for the member @field of @type. */
#define BPF_FIELD_SIZEOF(type, field)                                        \
({                                                                        \
        const int __sz = bytes_to_bpf_size(sizeof_field(type, field));        \
                                                                        \
        BUILD_BUG_ON(__sz < 0);                                                \
        __sz;                                                                \
})

/*
 * Access width in bytes encoded in the size bits of @insn's opcode.
 * Warns at run time (and evaluates to a negative value) when the size
 * bits do not map to a known width.
 */
#define BPF_LDST_BYTES(insn)                                                \
({                                                                        \
        const int __sz = bpf_size_to_bytes(BPF_SIZE((insn)->code));        \
                                                                        \
        WARN_ON(__sz < 0);                                                \
        __sz;                                                                \
})

#define __BPF_MAP_0(m, v, ...) v
#define __BPF_MAP_1(m, v, t, a, ...) m(t, a)
#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__)
#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__)
#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__)
#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__)

#define __BPF_REG_0(...) __BPF_PAD(5)
#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4)
#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3)
#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2)
#define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1)
#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__)

#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__)
#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__)

#define __BPF_CAST(t, a)                                                       \
        (__force t)                                                               \
        (__force                                                               \
         typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long),      \
                                      (unsigned long)0, (t)0))) a
#define __BPF_V void
#define __BPF_N

#define __BPF_DECL_ARGS(t, a) t   a
#define __BPF_DECL_REGS(t, a) u64 a

#define __BPF_PAD(n)                                                               \
        __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2,       \
                  u64, __ur_3, u64, __ur_4, u64, __ur_5)

#define BPF_CALL_x(x, attr, name, ...)                                               \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__));   \
        typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__));    \
        attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__))     \
        {                                                                       \
                return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\
        }                                                                       \
        static __always_inline                                                       \
        u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__))

#define __NOATTR
#define BPF_CALL_0(name, ...)        BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_1(name, ...)        BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_2(name, ...)        BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_3(name, ...)        BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_4(name, ...)        BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__)
#define BPF_CALL_5(name, ...)        BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__)

#define NOTRACE_BPF_CALL_1(name, ...)        BPF_CALL_x(1, notrace, name, __VA_ARGS__)

/*
 * Inclusive "first ... last" byte-offset ranges over context members,
 * written with the GNU range syntax.
 *
 * bpf_ctx_range_till() spans from the start of MEMBER1 to the last byte
 * of MEMBER2; bpf_ctx_range() is the single-member case.
 */
#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2)                                \
        offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1
#define bpf_ctx_range(TYPE, MEMBER)                                                \
        bpf_ctx_range_till(TYPE, MEMBER, MEMBER)

/*
 * Range for a pointer-typed member. With 64-bit longs it is the member's
 * own range; otherwise the range is fixed at 8 bytes from the member start.
 */
#if BITS_PER_LONG == 64
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        bpf_ctx_range(TYPE, MEMBER)
#else
# define bpf_ctx_range_ptr(TYPE, MEMBER)                                        \
        offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1
#endif /* BITS_PER_LONG == 64 */

/*
 * Evaluate to offsetof(TYPE, MEMBER) and store SIZE through PTR_SIZE.
 * The build fails unless SIZE equals the actual size of MEMBER.
 */
#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE)                                \
        ({                                                                        \
                BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE));                \
                *(PTR_SIZE) = (SIZE);                                                \
                offsetof(TYPE, MEMBER);                                                \
        })

/*
 * A struct sock_filter is architecture independent.
 *
 * Compat form of a classic filter program descriptor: the filter pointer
 * is carried as a compat_uptr_t rather than a native pointer.
 */
struct compat_sock_fprog {
        u16                len;
        compat_uptr_t        filter;        /* struct sock_filter * */
};

/*
 * Classic filter program descriptor using a native pointer to the
 * struct sock_filter array; accepted by bpf_prog_create().
 */
struct sock_fprog_kern {
        u16                        len;
        struct sock_filter        *filter;
};

/* Some arches need doubleword alignment for their instructions and/or data */
#define BPF_IMAGE_ALIGNMENT 8

/*
 * Header placed in front of a program image. @image is a flexible array
 * aligned to BPF_IMAGE_ALIGNMENT. bpf_jit_binary_lock_ro() shifts @size
 * right by PAGE_SHIFT to obtain the number of pages to protect.
 */
struct bpf_binary_header {
        u32 size;
        u8 image[] __aligned(BPF_IMAGE_ALIGNMENT);
};

/*
 * Per-CPU run statistics of a program. __bpf_prog_run() increments @cnt
 * and adds the measured sched_clock() delta to @nsecs inside a @syncp
 * update section. @misses is not updated anywhere in this part of the file.
 */
struct bpf_prog_stats {
        u64_stats_t cnt;
        u64_stats_t nsecs;
        u64_stats_t misses;
        struct u64_stats_sync syncp;
} __aligned(2 * sizeof(u64));

/*
 * State handed to bpf_check_timed_may_goto().
 * NOTE(review): the exact meaning of @count and @timestamp is defined by
 * that function, which is not visible here.
 */
struct bpf_timed_may_goto {
        u64 count;
        u64 timestamp;
};

/*
 * Reference-counted wrapper around a BPF program. Carries an rcu_head
 * next to the program pointer; sk_filter_charge()/sk_filter_uncharge()
 * take this type.
 */
struct sk_filter {
        refcount_t        refcnt;
        struct rcu_head        rcu;
        struct bpf_prog        *prog;
};

/* Static key tested by __bpf_prog_run() to decide whether to collect stats. */
DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);

/* Lock and hook for nf_conn BTF struct access; both are defined elsewhere. */
extern struct mutex nf_conn_btf_access_lock;
extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
                                     const struct bpf_reg_state *reg,
                                     int off, int size);

/*
 * Dispatcher signature used by __bpf_prog_run(): receives the context, the
 * program's instruction array and the program's entry function, and
 * returns the program's result.
 */
typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx,
                                          const struct bpf_insn *insnsi,
                                          unsigned int (*bpf_func)(const void *,
                                                                   const struct bpf_insn *));

/*
 * Run @prog on @ctx through the dispatcher @dfunc and return its result.
 *
 * When bpf_stats_enabled_key is on, the run is timed with sched_clock()
 * and the per-CPU run counter and accumulated nanoseconds are updated.
 * The caller must already be in a context that cannot migrate.
 */
static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
                                          const void *ctx,
                                          bpf_dispatcher_fn dfunc)
{
        cant_migrate();

        if (static_branch_unlikely(&bpf_stats_enabled_key)) {
                struct bpf_prog_stats *pcpu;
                unsigned long irqflags;
                u64 t_begin, elapsed;
                u32 verdict;

                t_begin = sched_clock();
                verdict = dfunc(ctx, prog->insnsi, prog->bpf_func);
                elapsed = sched_clock() - t_begin;

                pcpu = this_cpu_ptr(prog->stats);
                irqflags = u64_stats_update_begin_irqsave(&pcpu->syncp);
                u64_stats_inc(&pcpu->cnt);
                u64_stats_add(&pcpu->nsecs, elapsed);
                u64_stats_update_end_irqrestore(&pcpu->syncp, irqflags);

                return verdict;
        }

        return dfunc(ctx, prog->insnsi, prog->bpf_func);
}

/* Run @prog on @ctx using the no-op dispatcher. */
static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx)
{
        const u32 verdict = __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func);

        return verdict;
}

/*
 * For callers in preemptible, and therefore migratable, context: keeps
 * the whole execution of the BPF program on a single CPU.
 *
 * migrate_disable/enable() is used explicitly to document that invoking
 * a BPF program needs no reentrancy protection against a BPF program
 * invoked from a preempting task.
 */
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
                                          const void *ctx)
{
        u32 verdict;

        migrate_disable();
        verdict = bpf_prog_run(prog, ctx);
        migrate_enable();

        return verdict;
}

/* Size of the cb scratch area exposed to programs; see bpf_skb_cb(). */
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN

/*
 * Overlay for skb->cb used by the data-pointer helpers below: the qdisc
 * control block followed by the cached data_meta and data_end pointers.
 * bpf_compute_data_pointers() asserts at build time that it fits in
 * skb->cb.
 */
struct bpf_skb_data_end {
        struct qdisc_skb_cb qdisc_cb;
        void *data_meta;
        void *data_end;
};

/*
 * Next-hop parameters embedded in struct bpf_redirect_info: an address
 * family plus an IPv4 or IPv6 next-hop address sharing one union.
 */
struct bpf_nh_params {
        u32 nh_family;
        union {
                u32 ipv4_nh;
                struct in6_addr ipv6_nh;
        };
};

/*
 * flags for bpf_redirect_info kern_flags
 *
 * The *_INIT bits record lazy initialization done by the bpf_net_ctx_get_*()
 * accessors: RI_INIT for the redirect info itself, the others for the
 * cpu map, dev map and xsk map flush lists respectively.
 */
#define BPF_RI_F_RF_NO_DIRECT        BIT(0)        /* no napi_direct on return_frame */
#define BPF_RI_F_RI_INIT        BIT(1)
#define BPF_RI_F_CPU_MAP_INIT        BIT(2)
#define BPF_RI_F_DEV_MAP_INIT        BIT(3)
#define BPF_RI_F_XSK_MAP_INIT        BIT(4)

/*
 * Redirect state kept inside struct bpf_net_context.
 *
 * Member order matters: bpf_net_ctx_get_ri() zeroes everything located
 * before @nh on first use, so @nh and @kern_flags are deliberately placed
 * last and are left untouched by that memset.
 */
struct bpf_redirect_info {
        u64 tgt_index;
        void *tgt_value;
        struct bpf_map *map;
        u32 flags;
        u32 map_id;
        enum bpf_map_type map_type;
        struct bpf_nh_params nh;
        u32 kern_flags;
};

/*
 * Context installed in current->bpf_net_context by bpf_net_ctx_set().
 * Holds the redirect info and three flush lists; each list head is
 * initialized lazily by its bpf_net_ctx_get_*_flush_list() accessor.
 */
struct bpf_net_context {
        struct bpf_redirect_info ri;
        struct list_head cpu_map_flush_list;
        struct list_head dev_map_flush_list;
        struct list_head xskmap_map_flush_list;
};

/*
 * Install @bpf_net_ctx as the current task's BPF net context.
 *
 * Returns NULL without touching anything when the task already has a
 * context. Otherwise resets the context's kern_flags, installs it and
 * returns @bpf_net_ctx.
 */
static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx)
{
        struct task_struct *task = current;

        if (task->bpf_net_context)
                return NULL;

        bpf_net_ctx->ri.kern_flags = 0;
        task->bpf_net_context = bpf_net_ctx;

        return bpf_net_ctx;
}

/*
 * Remove the current task's BPF net context. A NULL @bpf_net_ctx (as
 * returned by bpf_net_ctx_set() when a context was already present)
 * makes this a no-op.
 */
static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx)
{
        if (!bpf_net_ctx)
                return;

        current->bpf_net_context = NULL;
}

/* Return the BPF net context currently installed on this task. */
static inline struct bpf_net_context *bpf_net_ctx_get(void)
{
        struct bpf_net_context *installed = current->bpf_net_context;

        return installed;
}

/*
 * Return the redirect info of the current task's BPF net context.
 *
 * On first use (BPF_RI_F_RI_INIT not yet set) every member located
 * before ri.nh is zeroed and the flag is set; nh and kern_flags are
 * left as they are.
 */
static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();

        if (ctx->ri.kern_flags & BPF_RI_F_RI_INIT)
                return &ctx->ri;

        memset(&ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh));
        ctx->ri.kern_flags |= BPF_RI_F_RI_INIT;

        return &ctx->ri;
}

/*
 * Return the cpu map flush list of the current BPF net context,
 * initializing the list head the first time it is requested.
 */
static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *head = &ctx->cpu_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)
                return head;

        INIT_LIST_HEAD(head);
        ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT;

        return head;
}

/*
 * Return the dev map flush list of the current BPF net context,
 * initializing the list head the first time it is requested.
 */
static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *head = &ctx->dev_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)
                return head;

        INIT_LIST_HEAD(head);
        ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT;

        return head;
}

/*
 * Return the xsk map flush list of the current BPF net context,
 * initializing the list head the first time it is requested.
 */
static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        struct list_head *head = &ctx->xskmap_map_flush_list;

        if (ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)
                return head;

        INIT_LIST_HEAD(head);
        ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT;

        return head;
}

/*
 * Report which flush lists of the current BPF net context are in use.
 *
 * Each output pointer is set to the matching list only when that list
 * has been initialized (its *_INIT flag is set) and is non-empty;
 * otherwise it is set to NULL. With CONFIG_BPF_SYSCALL disabled all
 * three outputs are NULL, and the xsk list is additionally gated on
 * CONFIG_XDP_SOCKETS.
 */
static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map,
                                                        struct list_head **lh_dev,
                                                        struct list_head **lh_xsk)
{
        struct bpf_net_context *ctx = bpf_net_ctx_get();
        const u32 inited = ctx->ri.kern_flags;

        *lh_xsk = NULL;
        *lh_dev = NULL;
        *lh_map = NULL;

        if (!IS_ENABLED(CONFIG_BPF_SYSCALL))
                return;

        if ((inited & BPF_RI_F_DEV_MAP_INIT) &&
            !list_empty(&ctx->dev_map_flush_list))
                *lh_dev = &ctx->dev_map_flush_list;

        if ((inited & BPF_RI_F_CPU_MAP_INIT) &&
            !list_empty(&ctx->cpu_map_flush_list))
                *lh_map = &ctx->cpu_map_flush_list;

        if (IS_ENABLED(CONFIG_XDP_SOCKETS) &&
            (inited & BPF_RI_F_XSK_MAP_INIT) &&
            !list_empty(&ctx->xskmap_map_flush_list))
                *lh_xsk = &ctx->xskmap_map_flush_list;
}

/* Compute the linear packet data range [data, data_end) that the
 * various program types (cls_bpf, act_bpf, lwt, ...) will access,
 * and cache data_meta/data_end in skb->cb. Subsystems allowing direct
 * data access must (!) ensure that the cb[] area can be written to
 * when the BPF program is invoked (otherwise cb[] save/restore is
 * necessary).
 */
static inline void bpf_compute_data_pointers(struct sk_buff *skb)
{
        struct bpf_skb_data_end *ptrs;

        BUILD_BUG_ON(sizeof(struct bpf_skb_data_end) >
                     sizeof_field(struct sk_buff, cb));

        ptrs = (struct bpf_skb_data_end *)skb->cb;
        ptrs->data_meta = skb->data - skb_metadata_len(skb);
        ptrs->data_end = skb->data + skb_headlen(skb);
}

/* Similar to bpf_compute_data_pointers(), but only data_end is
 * recomputed, and the value previously stored in cb->data_end is handed
 * back through @saved_data_end so that it can be restored later with
 * bpf_restore_data_end().
 */
static inline void bpf_compute_and_save_data_end(
        struct sk_buff *skb, void **saved_data_end)
{
        struct bpf_skb_data_end *ptrs = (struct bpf_skb_data_end *)skb->cb;
        void *previous_end = ptrs->data_end;

        *saved_data_end = previous_end;
        ptrs->data_end = skb->data + skb_headlen(skb);
}

/* Write back the data_end value saved by bpf_compute_and_save_data_end(). */
static inline void bpf_restore_data_end(
        struct sk_buff *skb, void *saved_data_end)
{
        ((struct bpf_skb_data_end *)skb->cb)->data_end = saved_data_end;
}

/*
 * Return the cb scratch area that eBPF programs see for @skb.
 *
 * eBPF programs may read/write the skb->cb[] area to pass metadata
 * between tail calls. Because this also has to work with tc, the
 * scratch memory is mapped onto the data area of qdisc_skb_cb.
 *
 * In some socket filter cases the cb unfortunately has to be
 * saved/restored so that protocol specific skb->cb[] data is not lost.
 * In any case, since unprivileged eBPF programs can be attached to
 * sockets, the bpf_skb_cb() area must be cleared so that previous
 * contents do not leak to user space.
 */
static inline u8 *bpf_skb_cb(const struct sk_buff *skb)
{
        /* The program-visible cb, the scratch length and the qdisc data
         * area must all have the same size.
         */
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) !=
                     sizeof_field(struct qdisc_skb_cb, data));
        BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN);

        return qdisc_skb_cb(skb)->data;
}

/*
 * Run @prog on the skb passed as @ctx. If the program accesses cb, the
 * scratch area is backed up and zeroed before the run and restored
 * afterwards.
 *
 * Must be invoked with migration disabled.
 */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                         const void *ctx)
{
        const struct sk_buff *skb = ctx;
        u8 backup[BPF_SKB_CB_LEN];
        u8 *scratch = bpf_skb_cb(skb);
        u32 verdict;

        if (unlikely(prog->cb_access)) {
                memcpy(backup, scratch, sizeof(backup));
                memset(scratch, 0, sizeof(backup));
        }

        verdict = bpf_prog_run(prog, skb);

        if (unlikely(prog->cb_access))
                memcpy(scratch, backup, sizeof(backup));

        return verdict;
}

/*
 * Migration-safe wrapper around __bpf_prog_run_save_cb(): disables
 * migration for the duration of the run.
 */
static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
                                       struct sk_buff *skb)
{
        u32 verdict;

        migrate_disable();
        verdict = __bpf_prog_run_save_cb(prog, skb);
        migrate_enable();

        return verdict;
}

/*
 * Run @prog on @skb pinned to one CPU. If the program accesses cb, the
 * scratch area is zeroed first; it is not restored afterwards.
 */
static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
                                        struct sk_buff *skb)
{
        if (unlikely(prog->cb_access))
                memset(bpf_skb_cb(skb), 0, BPF_SKB_CB_LEN);

        return bpf_prog_run_pin_on_cpu(prog, skb);
}

/* Declarations for the XDP dispatcher; DECLARE_BPF_DISPATCHER is defined elsewhere. */
DECLARE_BPF_DISPATCHER(xdp)

/* Static key declared here for the master-redirect path; defined elsewhere. */
DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);

/* Implemented outside this header; takes the XDP buffer and returns a u32. */
u32 xdp_master_redirect(struct xdp_buff *xdp);

/* Implemented outside this header; takes the previous and the new program. */
void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog);

/* Size in bytes of @prog's instruction array (prog->len instructions). */
static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
{
        return sizeof(struct bpf_insn) * prog->len;
}

static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog)
{
        return round_up(bpf_prog_insn_size(prog) +
                        sizeof(__be64) + 1, SHA1_BLOCK_SIZE);
}

static inline unsigned int bpf_prog_size(unsigned int proglen)
{
        return max(sizeof(struct bpf_prog),
                   offsetof(struct bpf_prog, insns[proglen]));
}

/*
 * Tell whether @prog started life as a classic BPF program.
 *
 * Classic programs loaded on an arch without a classic BPF JIT
 * (anymore) are converted to eBPF via bpf_migrate_filter() and
 * therefore always carry the unspec program type.
 */
static inline bool bpf_prog_was_classic(const struct bpf_prog *prog)
{
        const bool type_unspec = prog->type == BPF_PROG_TYPE_UNSPEC;

        return type_unspec;
}

/*
 * Clamp @size to the machine word size (sizeof(unsigned long)) when it
 * is a larger exact multiple of it; any other size is returned as is.
 */
static inline u32 bpf_ctx_off_adjust_machine(u32 size)
{
        const u32 word = sizeof(unsigned long);

        if (size <= word || size % word != 0)
                return size;

        return word;
}

/*
 * A narrow access is acceptable when @size does not exceed
 * @size_default and has at most one bit set. @off is not consulted.
 */
static inline bool
bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default)
{
        if (size > size_default)
                return false;

        return !(size & (size - 1));
}

/*
 * Byte offset of a narrow access inside its @size_default sized field.
 * On little endian this is simply @off masked by the field size; on
 * big endian the offset is counted from the other end of the field.
 */
static inline u8
bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default)
{
        const u8 within_field = off & (size_default - 1);

#ifndef __LITTLE_ENDIAN
        return size_default - (within_field + size);
#else
        return within_field;
#endif
}

/*
 * True when an access of @size bytes at offset @off is a full,
 * 8-byte-aligned __u64 access lying entirely within @field of @type.
 *
 * @off and @size are parenthesized so that compound expressions such as
 * "base + delta" are compared and reduced modulo 8 as a whole. @off is
 * evaluated more than once and must have no side effects.
 */
#define bpf_ctx_wide_access_ok(off, size, type, field)                        \
        ((size) == sizeof(__u64) &&                                        \
         (off) >= offsetof(type, field) &&                                \
         (off) + sizeof(__u64) <= offsetofend(type, field) &&                \
         (off) % sizeof(__u64) == 0)

/*
 * Size in bytes of the filter array described by @fprog (a pointer to a
 * structure with len and filter members). The argument is parenthesized
 * so that expressions such as "&prog" or "progs + i" work as expected.
 */
#define bpf_classic_proglen(fprog) ((fprog)->len * sizeof((fprog)->filter[0]))

/*
 * Make the pages of a non-JITed program read-only and return the result
 * of set_memory_ro(). JITed programs, and builds with
 * CONFIG_BPF_JIT_ALWAYS_ON, are left alone and 0 is returned.
 */
static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp)
{
#ifdef CONFIG_BPF_JIT_ALWAYS_ON
        return 0;
#else
        if (fp->jited)
                return 0;

        set_vm_flush_reset_perms(fp);
        return set_memory_ro((unsigned long)fp, fp->pages);
#endif
}

/*
 * Apply set_memory_rox() to the pages of the JIT image that starts at
 * @hdr. The page count is hdr->size shifted right by PAGE_SHIFT.
 * Returns the result of set_memory_rox().
 */
static inline int __must_check
bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
{
        const unsigned long image_start = (unsigned long)hdr;

        set_vm_flush_reset_perms(hdr);

        return set_memory_rox(image_start, hdr->size >> PAGE_SHIFT);
}

int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);

/* Shorthand for sk_filter_trim_cap() with a cap of 1. */
static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
{
        const unsigned int cap = 1;

        return sk_filter_trim_cap(sk, skb, cap);
}

/*
 * Program runtime selection, allocation and release. Only the prototypes
 * are visible here; the implementations live outside this header.
 * bpf_prog_select_runtime() reports an error code through @err.
 */
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err);
void bpf_prog_free(struct bpf_prog *fp);

bool bpf_opcode_in_insntable(u8 code);

/* JITed line-info bookkeeping. */
void bpf_prog_fill_jited_linfo(struct bpf_prog *prog,
                               const u32 *insn_to_jit_off);
int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog);
void bpf_prog_jit_attempt_done(struct bpf_prog *prog);

/* Allocators take a size and extra gfp flags; __bpf_prog_free() releases. */
struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags);
struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
                                  gfp_t gfp_extra_flags);
void __bpf_prog_free(struct bpf_prog *fp);

/*
 * Free @fp. Despite the name, no unlock step is performed here; the
 * function simply forwards to __bpf_prog_free().
 */
static inline void bpf_prog_unlock_free(struct bpf_prog *fp)
{
        __bpf_prog_free(fp);
}

/*
 * Callback type taking a classic filter array and its length, returning
 * an int; bpf_prog_create_from_user() accepts one as @trans.
 */
typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter,
                                       unsigned int flen);

/*
 * Program creation from classic filter descriptors; the resulting program
 * is returned through @pfp. Implemented outside this header.
 */
int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
                              bpf_aux_classic_check_t trans, bool save_orig);
void bpf_prog_destroy(struct bpf_prog *fp);

/* Socket filter attach/detach/query entry points (implemented elsewhere). */
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_attach_bpf(u32 ufd, struct sock *sk);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk);
void sk_reuseport_prog_free(struct bpf_prog *prog);
int sk_detach_filter(struct sock *sk);
int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len);

/* Filter accounting against a socket (implemented elsewhere). */
bool sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);

/*
 * Base function that BPF_CALL_IMM() measures call immediates against.
 * __bpf_call_base_args is the same function cast to a six-parameter type:
 * the five u64 arguments followed by a const struct bpf_insn pointer.
 */
u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
#define __bpf_call_base_args \
        ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \
         (void *)__bpf_call_base)

/*
 * JIT entry points and capability queries. Only prototypes are visible
 * here; the bpf_jit_supports_*() functions each return a bool.
 * NOTE(review): whether these are per-architecture overrides cannot be
 * seen from this part of the file.
 */
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_jit_inlines_helper_call(s32 imm);
bool bpf_jit_supports_subprog_tailcalls(void);
bool bpf_jit_supports_percpu_insn(void);
bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
bool bpf_jit_supports_exceptions(void);
bool bpf_jit_supports_ptr_xchg(void);
bool bpf_jit_supports_arena(void);
bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena);
bool bpf_jit_supports_private_stack(void);
bool bpf_jit_supports_timed_may_goto(void);
u64 bpf_arch_uaddress_limit(void);
/* Calls @consume_fn with (cookie, ip, sp, bp) values; it returns a bool. */
void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
u64 arch_bpf_timed_may_goto(void);
u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *);
bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id);

/* Decide whether @cred may see a raw dump; the answer is exactly what
 * kallsyms_show_value() returns for those credentials.
 */
static inline bool bpf_dump_raw_ok(const struct cred *cred)
{
        /* Reconstruction of call-sites is dependent on kallsyms,
         * thus give the dump the same restriction.
         */
        return kallsyms_show_value(cred);
}

struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
                                       const struct bpf_insn *patch, u32 len);
int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);

static inline bool xdp_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_set_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT;
}

static inline void xdp_clear_return_frame_no_direct(void)
{
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();

        ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT;
}

/* Check whether a packet of @pktlen bytes may be forwarded through @fwd.
 *
 * Returns -ENETDOWN when IFF_UP is not set in fwd->flags, -EMSGSIZE when
 * @pktlen exceeds mtu + hard_header_len + VLAN_HLEN of @fwd, and 0 otherwise.
 */
static inline int xdp_ok_fwd_dev(const struct net_device *fwd,
                                 unsigned int pktlen)
{
        unsigned int max_len;

        if (unlikely(!(fwd->flags & IFF_UP)))
                return -ENETDOWN;

        max_len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN;

        return pktlen > max_len ? -EMSGSIZE : 0;
}

/* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the
 * same cpu context. Further for best results no more than a single map
 * for the do_redirect/do_flush pair should be used. This limitation is
 * because we only track one map and force a flush when the map changes.
 * This does not appear to be a real limitation for existing software.
 */
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
                            struct xdp_buff *xdp, const struct bpf_prog *prog);
int xdp_do_redirect(struct net_device *dev,
                    struct xdp_buff *xdp,
                    const struct bpf_prog *prog);
int xdp_do_redirect_frame(struct net_device *dev,
                          struct xdp_buff *xdp,
                          struct xdp_frame *xdpf,
                          const struct bpf_prog *prog);
void xdp_do_flush(void);

void bpf_warn_invalid_xdp_action(const struct net_device *dev,
                                 const struct bpf_prog *prog, u32 act);

#ifdef CONFIG_INET
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                                  struct bpf_prog *prog, struct sk_buff *skb,
                                  struct sock *migrating_sk,
                                  u32 hash);
#else
/* Stub for builds without CONFIG_INET: ignores every argument and never
 * selects a socket.
 */
static inline struct sock *
bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
                     struct bpf_prog *prog, struct sk_buff *skb,
                     struct sock *migrating_sk,
                     u32 hash)
{
        return NULL;
}
#endif

#ifdef CONFIG_BPF_JIT
extern int bpf_jit_enable;
extern int bpf_jit_harden;
extern int bpf_jit_kallsyms;
extern long bpf_jit_limit;
extern long bpf_jit_limit_max;

typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);

void bpf_jit_fill_hole_with_zero(void *area, unsigned int size);

struct bpf_binary_header *
bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
                     unsigned int alignment,
                     bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_jit_binary_free(struct bpf_binary_header *hdr);
u64 bpf_jit_alloc_exec_limit(void);
void *bpf_jit_alloc_exec(unsigned long size);
void bpf_jit_free_exec(void *addr);
void bpf_jit_free(struct bpf_prog *fp);
struct bpf_binary_header *
bpf_jit_binary_pack_hdr(const struct bpf_prog *fp);

void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns);
void bpf_prog_pack_free(void *ptr, u32 size);

/* True when @fp's ksym list node is not linked: either the node is an empty
 * list head, or its prev pointer holds LIST_POISON2.
 */
static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
{
        if (list_empty(&fp->aux->ksym.lnode))
                return true;

        return fp->aux->ksym.lnode.prev == LIST_POISON2;
}

struct bpf_binary_header *
bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image,
                          unsigned int alignment,
                          struct bpf_binary_header **rw_hdr,
                          u8 **rw_image,
                          bpf_jit_fill_hole_t bpf_fill_ill_insns);
int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header,
                                 struct bpf_binary_header *rw_header);
void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
                              struct bpf_binary_header *rw_header);

int bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                                struct bpf_jit_poke_descriptor *poke);

int bpf_jit_get_func_addr(const struct bpf_prog *prog,
                          const struct bpf_insn *insn, bool extra_pass,
                          u64 *func_addr, bool *func_addr_fixed);

struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp);
void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other);

/* Log a one-line summary of a JIT pass at error level and, when @image is
 * non-NULL, follow it with a hex dump of @proglen bytes of the image.
 */
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
                                u32 pass, void *image)
{
        pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen,
               proglen, pass, image, current->comm, task_pid_nr(current));

        /* Nothing to dump without an image. */
        if (!image)
                return;

        print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
                       16, 1, image, proglen, false);
}

/* Compile-time answer to "was CONFIG_HAVE_EBPF_JIT defined for this build?". */
static inline bool bpf_jit_is_ebpf(void)
{
#ifdef CONFIG_HAVE_EBPF_JIT
        const bool have_ebpf_jit = true;
#else
        const bool have_ebpf_jit = false;
#endif

        return have_ebpf_jit;
}

/* The eBPF JIT counts as enabled only when bpf_jit_enable is non-zero and
 * bpf_jit_is_ebpf() reports true.
 */
static inline bool ebpf_jit_enabled(void)
{
        if (!bpf_jit_enable)
                return false;

        return bpf_jit_is_ebpf();
}

/* True when @fp has its jited flag set and bpf_jit_is_ebpf() reports true. */
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        if (!fp->jited)
                return false;

        return bpf_jit_is_ebpf();
}

/* Decide whether constant blinding applies to @prog.
 *
 * Returns false unless bpf_jit_is_ebpf() holds, the program requested JITing
 * and bpf_jit_harden is non-zero. With bpf_jit_harden == 1 it additionally
 * returns false when bpf_token_capable() grants CAP_BPF for the program's
 * token.
 */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        /* Prerequisites: should anyone ever call blinding outside of
         * them, bail out here.
         */
        if (!bpf_jit_is_ebpf() || !prog->jit_requested || !bpf_jit_harden)
                return false;

        /* The capability check only runs at hardening level 1. */
        return !(bpf_jit_harden == 1 &&
                 bpf_token_capable(prog->aux->token, CAP_BPF));
}

/* kallsyms exposure of JITed programs is on only when hardening is off
 * (bpf_jit_harden == 0) and bpf_jit_kallsyms is exactly 1; any other
 * combination yields false.
 */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        return !bpf_jit_harden && bpf_jit_kallsyms == 1;
}

int __bpf_address_lookup(unsigned long addr, unsigned long *size,
                                 unsigned long *off, char *sym);
bool is_bpf_text_address(unsigned long addr);
int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
                    char *sym);
struct bpf_prog *bpf_prog_ksym_find(unsigned long addr);

/* Wrapper around __bpf_address_lookup() that also handles @modname.
 *
 * Returns whatever __bpf_address_lookup() returned. When that value is
 * non-zero and @modname is non-NULL, *@modname is set to NULL.
 */
static inline int
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        int ret = __bpf_address_lookup(addr, size, off, sym);

        if (!ret || !modname)
                return ret;

        *modname = NULL;
        return ret;
}

void bpf_prog_kallsyms_add(struct bpf_prog *fp);
void bpf_prog_kallsyms_del(struct bpf_prog *fp);

#else /* CONFIG_BPF_JIT */

/* Stub for the !CONFIG_BPF_JIT branch: the eBPF JIT is never enabled. */
static inline bool ebpf_jit_enabled(void)
{
        return false;
}

/* Stub for the !CONFIG_BPF_JIT branch: blinding is never enabled. */
static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
{
        return false;
}

/* Stub for the !CONFIG_BPF_JIT branch: no program is reported as JITed. */
static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
{
        return false;
}

/* Stub for the !CONFIG_BPF_JIT branch: always fails with -ENOTSUPP. */
static inline int
bpf_jit_add_poke_descriptor(struct bpf_prog *prog,
                            struct bpf_jit_poke_descriptor *poke)
{
        return -ENOTSUPP;
}

/* Stub for the !CONFIG_BPF_JIT branch: just forwards @fp to
 * bpf_prog_unlock_free().
 */
static inline void bpf_jit_free(struct bpf_prog *fp)
{
        bpf_prog_unlock_free(fp);
}

/* Stub for the !CONFIG_BPF_JIT branch: kallsyms support is never enabled. */
static inline bool bpf_jit_kallsyms_enabled(void)
{
        return false;
}

/* Stub for the !CONFIG_BPF_JIT branch: returns 0 and leaves @size, @off and
 * @sym untouched.
 */
static inline int
__bpf_address_lookup(unsigned long addr, unsigned long *size,
                     unsigned long *off, char *sym)
{
        return 0;
}

/* Stub for the !CONFIG_BPF_JIT branch: no address is BPF text. */
static inline bool is_bpf_text_address(unsigned long addr)
{
        return false;
}

/* Stub for the !CONFIG_BPF_JIT branch: every @symnum yields -ERANGE. */
static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value,
                                  char *type, char *sym)
{
        return -ERANGE;
}

/* Stub for the !CONFIG_BPF_JIT branch: no program is ever found. */
static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr)
{
        return NULL;
}

/* Stub for the !CONFIG_BPF_JIT branch: returns 0 and, unlike the
 * CONFIG_BPF_JIT variant, never writes *@modname.
 */
static inline int
bpf_address_lookup(unsigned long addr, unsigned long *size,
                   unsigned long *off, char **modname, char *sym)
{
        return 0;
}

/* Stub for the !CONFIG_BPF_JIT branch: nothing to add. */
static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
}

/* Stub for the !CONFIG_BPF_JIT branch: nothing to delete. */
static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp)
{
}

#endif /* CONFIG_BPF_JIT */

void bpf_prog_kallsyms_del_all(struct bpf_prog *fp);

/* Bit 15: OR'ed by bpf_anc_helper() into the code it returns for a
 * recognised ancillary load.
 */
#define BPF_ANC                BIT(15)

/* Classify the first instruction of a classic filter.
 *
 * Returns false for "RET K" and "LD W LEN". For an absolute load (word, half
 * or byte) it returns true only when the offset k selects the ALU_XOR_X
 * ancillary slot (SKF_AD_OFF + SKF_AD_ALU_XOR_X). Every other opcode yields
 * true.
 */
static inline bool bpf_needs_clear_a(const struct sock_filter *first)
{
        switch (first->code) {
        case BPF_RET | BPF_K:
        case BPF_LD | BPF_W | BPF_LEN:
                return false;

        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
                return first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X;

        default:
                return true;
        }
}

/* Translate an absolute load that addresses an ancillary slot into an
 * ancillary opcode.
 *
 * For LD W/H/B ABS instructions whose k equals SKF_AD_OFF + SKF_AD_<X> for
 * one of the listed <X>, returns BPF_ANC | SKF_AD_<X>. Every other
 * instruction, including an absolute load with an unlisted k, returns
 * ftest->code unchanged. Hits BUG_ON() if the incoming code already has the
 * BPF_ANC bit set.
 */
static inline u16 bpf_anc_helper(const struct sock_filter *ftest)
{
        BUG_ON(ftest->code & BPF_ANC);

        switch (ftest->code) {
        case BPF_LD | BPF_W | BPF_ABS:
        case BPF_LD | BPF_H | BPF_ABS:
        case BPF_LD | BPF_B | BPF_ABS:
/* Expands to one case label plus its return statement. */
#define BPF_ANCILLARY(CODE)        case SKF_AD_OFF + SKF_AD_##CODE:        \
                                return BPF_ANC | SKF_AD_##CODE
                switch (ftest->k) {
                BPF_ANCILLARY(PROTOCOL);
                BPF_ANCILLARY(PKTTYPE);
                BPF_ANCILLARY(IFINDEX);
                BPF_ANCILLARY(NLATTR);
                BPF_ANCILLARY(NLATTR_NEST);
                BPF_ANCILLARY(MARK);
                BPF_ANCILLARY(QUEUE);
                BPF_ANCILLARY(HATYPE);
                BPF_ANCILLARY(RXHASH);
                BPF_ANCILLARY(CPU);
                BPF_ANCILLARY(ALU_XOR_X);
                BPF_ANCILLARY(VLAN_TAG);
                BPF_ANCILLARY(VLAN_TAG_PRESENT);
                BPF_ANCILLARY(PAY_OFFSET);
                BPF_ANCILLARY(RANDOM);
                BPF_ANCILLARY(VLAN_TPID);
                }
                /* k matched no ancillary slot: treat as a plain load */
                fallthrough;
        default:
                return ftest->code;
        }
}

void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
                                           int k, unsigned int size);

/* Returns the constant SKF_AD_MAX. */
static inline int bpf_tell_extensions(void)
{
        return SKF_AD_MAX;
}

/* Bundles a socket, a sockaddr pointer with its length (uaddrlen), an
 * attach-type specific pointer and a scratch slot.
 */
struct bpf_sock_addr_kern {
        struct sock *sk;
        struct sockaddr *uaddr;
        /* Temporary "register" to make indirect stores to nested structures
         * defined above. We need three registers to make such a store, but
         * only two (src and dst) are available at convert_ctx_access time
         */
        u64 tmp_reg;
        void *t_ctx;        /* Attach type specific context. */
        u32 uaddrlen;
};

/* Note the layout rule attached to @temp: fields placed before it are
 * zero-initialised before the BPF program is called, @temp and anything
 * after it are not.
 */
struct bpf_sock_ops_kern {
        struct        sock *sk;
        /* args, reply and replylong share the same storage */
        union {
                u32 args[4];
                u32 reply;
                u32 replylong[4];
        };
        struct sk_buff        *syn_skb;
        struct sk_buff        *skb;
        void        *skb_data_end;
        u8        op;
        u8        is_fullsock;
        u8        is_locked_tcp_sock;
        u8        remaining_opt_len;
        u64        temp;                        /* temp and everything after is not
                                         * initialized to 0 before calling
                                         * the BPF program. New fields that
                                         * should be initialized to 0 should
                                         * be inserted before temp.
                                         * temp is scratch storage used by
                                         * sock_ops_convert_ctx_access
                                         * as temporary storage of a register.
                                         */
};

/* Bundles a ctl_table header/table pair, a current and a new value buffer
 * (each with its length), a file position pointer and a scratch slot.
 */
struct bpf_sysctl_kern {
        struct ctl_table_header *head;
        const struct ctl_table *table;
        void *cur_val;
        size_t cur_len;
        void *new_val;
        size_t new_len;
        int new_updated;
        int write;
        loff_t *ppos;
        /* Temporary "register" for indirect stores to ppos. */
        u64 tmp_reg;
};

/* Fixed-size byte buffer of BPF_SOCKOPT_KERN_BUF_SIZE (32) bytes. */
#define BPF_SOCKOPT_KERN_BUF_SIZE        32
struct bpf_sockopt_buf {
        u8                data[BPF_SOCKOPT_KERN_BUF_SIZE];
};

/* Bundles a socket, an option value window (optval..optval_end), the
 * level/optname/optlen triple, a task pointer and a scratch slot.
 */
struct bpf_sockopt_kern {
        struct sock        *sk;
        u8                *optval;
        u8                *optval_end;
        s32                level;
        s32                optname;
        s32                optlen;
        /* for retval in struct bpf_cg_run_ctx */
        struct task_struct *current_task;
        /* Temporary "register" for indirect stores.
         * NOTE(review): this comment used to say "stores to ppos", but this
         * struct has no ppos member; the wording looks copied from
         * bpf_sysctl_kern. Confirm which field the stores actually target.
         */
        u64                tmp_reg;
};

int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len);

/* Context passed to the programs run by BPF_PROG_SK_LOOKUP_RUN_ARRAY().
 *
 * bpf_sk_lookup_run_v4()/bpf_sk_lookup_run_v6() fill in family, protocol,
 * the ports, the v4 or v6 addresses and ingress_ifindex before the run, and
 * read selected_sk and no_reuseport back afterwards.
 */
struct bpf_sk_lookup_kern {
        u16                family;
        u16                protocol;
        __be16                sport;
        u16                dport;
        /* IPv4 addresses are stored by value */
        struct {
                __be32 saddr;
                __be32 daddr;
        } v4;
        /* IPv6 addresses are referenced, not copied */
        struct {
                const struct in6_addr *saddr;
                const struct in6_addr *daddr;
        } v6;
        struct sock        *selected_sk;
        u32                ingress_ifindex;
        bool                no_reuseport;
};

extern struct static_key_false bpf_sk_lookup_enabled;

/* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup.
 *
 * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and
 * SK_DROP. Their meaning is as follows:
 *
 *  SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result
 *  SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup
 *  SK_DROP                           : terminate lookup with -ECONNREFUSED
 *
 * This macro aggregates return values and selected sockets from
 * multiple BPF programs according to following rules in order:
 *
 *  1. If any program returned SK_PASS and a non-NULL ctx.selected_sk,
 *     macro result is SK_PASS and last ctx.selected_sk is used.
 *  2. If any program returned SK_DROP return value,
 *     macro result is SK_DROP.
 *  3. Otherwise result is SK_PASS and ctx.selected_sk is NULL.
 *
 * Caller must ensure that the prog array is non-NULL, and that the
 * array as well as the programs it contains remain valid.
 */
#define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func)                        \
        ({                                                                \
                struct bpf_sk_lookup_kern *_ctx = &(ctx);                \
                struct bpf_prog_array_item *_item;                        \
                struct sock *_selected_sk = NULL;                        \
                bool _no_reuseport = false;                                \
                struct bpf_prog *_prog;                                        \
                bool _all_pass = true;                                        \
                u32 _ret;                                                \
                                                                        \
                migrate_disable();                                        \
                _item = &(array)->items[0];                                \
                while ((_prog = READ_ONCE(_item->prog))) {                \
                        /* restore most recent selection */                \
                        _ctx->selected_sk = _selected_sk;                \
                        _ctx->no_reuseport = _no_reuseport;                \
                                                                        \
                        _ret = func(_prog, _ctx);                        \
                        if (_ret == SK_PASS && _ctx->selected_sk) {        \
                                /* remember last non-NULL socket */        \
                                _selected_sk = _ctx->selected_sk;        \
                                _no_reuseport = _ctx->no_reuseport;        \
                        } else if (_ret == SK_DROP && _all_pass) {        \
                                _all_pass = false;                        \
                        }                                                \
                        _item++;                                        \
                }                                                        \
                _ctx->selected_sk = _selected_sk;                        \
                _ctx->no_reuseport = _no_reuseport;                        \
                migrate_enable();                                        \
                _all_pass || _selected_sk ? SK_PASS : SK_DROP;                \
         })

/* Run the NETNS_BPF_SK_LOOKUP programs attached to @net for an IPv4 lookup.
 *
 * The program array is fetched and run inside an RCU read-side section.
 * With no array attached, *@psk is set to NULL and false is returned.
 * On an SK_PASS verdict *@psk receives the socket left in the context
 * (which may be NULL) and the context's no_reuseport flag is returned.
 * On any other verdict *@psk receives ERR_PTR(-ECONNREFUSED) and false is
 * returned.
 */
static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol,
                                        const __be32 saddr, const __be16 sport,
                                        const __be32 daddr, const u16 dport,
                                        const int ifindex, struct sock **psk)
{
        struct bpf_prog_array *progs;
        struct sock *found = NULL;
        bool skip_reuseport = false;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET,
                        .protocol        = protocol,
                        .sport                = sport,
                        .dport                = dport,
                        .v4.saddr        = saddr,
                        .v4.daddr        = daddr,
                        .ingress_ifindex        = ifindex,
                };

                if (BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run) != SK_PASS) {
                        found = ERR_PTR(-ECONNREFUSED);
                } else {
                        found = ctx.selected_sk;
                        skip_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = found;
        return skip_reuseport;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Run the NETNS_BPF_SK_LOOKUP programs attached to @net for an IPv6 lookup.
 *
 * The program array is fetched and run inside an RCU read-side section.
 * @saddr and @daddr are stored in the context as pointers, not copied.
 * With no array attached, *@psk is set to NULL and false is returned.
 * On an SK_PASS verdict *@psk receives the socket left in the context
 * (which may be NULL) and the context's no_reuseport flag is returned.
 * On any other verdict *@psk receives ERR_PTR(-ECONNREFUSED) and false is
 * returned.
 */
static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol,
                                        const struct in6_addr *saddr,
                                        const __be16 sport,
                                        const struct in6_addr *daddr,
                                        const u16 dport,
                                        const int ifindex, struct sock **psk)
{
        struct bpf_prog_array *progs;
        struct sock *found = NULL;
        bool skip_reuseport = false;

        rcu_read_lock();
        progs = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]);
        if (progs) {
                struct bpf_sk_lookup_kern ctx = {
                        .family                = AF_INET6,
                        .protocol        = protocol,
                        .sport                = sport,
                        .dport                = dport,
                        .v6.saddr        = saddr,
                        .v6.daddr        = daddr,
                        .ingress_ifindex        = ifindex,
                };

                if (BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run) != SK_PASS) {
                        found = ERR_PTR(-ECONNREFUSED);
                } else {
                        found = ctx.selected_sk;
                        skip_reuseport = ctx.no_reuseport;
                }
        }
        rcu_read_unlock();

        *psk = found;
        return skip_reuseport;
}
#endif /* IS_ENABLED(CONFIG_IPV6) */

/* Common body for map-based XDP redirects.
 *
 * @flags may only contain bits from the XDP action mask (ABORTED, DROP,
 * PASS, TX) or from @flag_mask; anything else returns XDP_ABORTED.
 * @lookup_elem resolves @index in @map and its result is stored in the
 * redirect info. If the lookup fails and BPF_F_BROADCAST is not requested,
 * the redirect state is reset and the action bits of @flags are returned.
 * Otherwise the target index, map id and map type are recorded and
 * XDP_REDIRECT is returned. The map pointer and flags are recorded only for
 * broadcast, and cleared to NULL/0 otherwise.
 */
static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index,
                                                   u64 flags, const u64 flag_mask,
                                                   void *lookup_elem(struct bpf_map *map, u32 key))
{
        const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX;
        struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
        const bool broadcast = flags & BPF_F_BROADCAST;

        /* Lower bits of the flags are used as return code on lookup failure */
        if (unlikely(flags & ~(action_mask | flag_mask)))
                return XDP_ABORTED;

        ri->tgt_value = lookup_elem(map, index);
        if (unlikely(!ri->tgt_value) && !broadcast) {
                /* A failed lookup wipes the redirect_info state entirely,
                 * so when an eBPF program performs several lookups the
                 * last one always takes precedence.
                 */
                ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */
                ri->map_type = BPF_MAP_TYPE_UNSPEC;
                return flags & action_mask;
        }

        ri->tgt_index = index;
        ri->map_id = map->id;
        ri->map_type = map->map_type;

        /* Only a broadcast redirect keeps the map pointer and flags. */
        WRITE_ONCE(ri->map, broadcast ? map : NULL);
        ri->flags = broadcast ? flags : 0;

        return XDP_REDIRECT;
}

#ifdef CONFIG_NET
int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len);
int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
                          u32 len, u64 flags);
int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len);
void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len);
void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
                      void *buf, unsigned long len, bool flush);
#else /* CONFIG_NET */
/* Stub for the !CONFIG_NET branch: always fails with -EOPNOTSUPP. */
static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset,
                                       void *to, u32 len)
{
        return -EOPNOTSUPP;
}

/* Stub for the !CONFIG_NET branch: always fails with -EOPNOTSUPP. */
static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset,
                                        const void *from, u32 len, u64 flags)
{
        return -EOPNOTSUPP;
}

/* Stub for the !CONFIG_NET branch: always fails with -EOPNOTSUPP. */
static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset,
                                       void *buf, u32 len)
{
        return -EOPNOTSUPP;
}

/* Stub for the !CONFIG_NET branch: always fails with -EOPNOTSUPP. */
static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset,
                                        void *buf, u32 len)
{
        return -EOPNOTSUPP;
}

/* Stub for the !CONFIG_NET branch: never yields a pointer. */
static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
{
        return NULL;
}

/* Stub for the !CONFIG_NET branch: copies nothing. */
static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf,
                                    unsigned long len, bool flush)
{
}
#endif /* CONFIG_NET */

#endif /* __LINUX_FILTER_H__ */













































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_UDP_TUNNEL_H
#define __NET_UDP_TUNNEL_H

#include <net/ip_tunnels.h>
#include <net/udp.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ipv6_stubs.h>
#endif

/* Parameters for udp_sock_create(): @family selects between
 * udp_sock_create4() (AF_INET) and udp_sock_create6() (AF_INET6).
 * The IPv6 members of both address unions exist only when CONFIG_IPV6 is
 * enabled.
 */
struct udp_port_cfg {
        u8                        family;

        /* Used only for kernel-created sockets */
        union {
                struct in_addr                local_ip;
#if IS_ENABLED(CONFIG_IPV6)
                struct in6_addr                local_ip6;
#endif
        };

        union {
                struct in_addr                peer_ip;
#if IS_ENABLED(CONFIG_IPV6)
                struct in6_addr                peer_ip6;
#endif
        };

        __be16                        local_udp_port;
        __be16                        peer_udp_port;
        int                        bind_ifindex;
        /* single-bit option flags */
        unsigned int                use_udp_checksums:1,
                                use_udp6_tx_checksums:1,
                                use_udp6_rx_checksums:1,
                                ipv6_v6only:1;
};

int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
                     struct socket **sockp);

#if IS_ENABLED(CONFIG_IPV6)
int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
                     struct socket **sockp);
#else
/* Stub used when CONFIG_IPV6 is not enabled.
 *
 * No IPv6 socket can be created in this configuration, so report an error
 * rather than success: returning 0 would tell the caller that *@sockp holds
 * a socket although nothing was ever stored there. The error code is the
 * one udp_sock_create() already uses for an unsupported family.
 *
 * Returns -EPFNOSUPPORT; @net, @cfg and @sockp are not touched.
 */
static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
                                   struct socket **sockp)
{
        return -EPFNOSUPPORT;
}
#endif

/* Create a UDP socket for the address family named in @cfg.
 *
 * AF_INET is handled by udp_sock_create4() and AF_INET6 by
 * udp_sock_create6(); their return value is passed through unchanged.
 * Any other family returns -EPFNOSUPPORT.
 */
static inline int udp_sock_create(struct net *net,
                                  struct udp_port_cfg *cfg,
                                  struct socket **sockp)
{
        switch (cfg->family) {
        case AF_INET:
                return udp_sock_create4(net, cfg, sockp);
        case AF_INET6:
                return udp_sock_create6(net, cfg, sockp);
        default:
                return -EPFNOSUPPORT;
        }
}

typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
                                             struct sk_buff *skb);
typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk,
                                           struct sk_buff *skb, int err,
                                           __be16 port, u32 info, u8 *payload);
typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
                                                    struct list_head *head,
                                                    struct sk_buff *skb);
typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb,
                                         int nhoff);

/* Configuration handed to setup_udp_tunnel_sock(): user data, the
 * encapsulation type, and one callback slot for each udp_tunnel_*_t
 * function-pointer type.
 */
struct udp_tunnel_sock_cfg {
        void *sk_user_data;     /* user data used by encap_rcv call back */
        /* Used for setting up udp_sock fields, see udp.h for details */
        __u8  encap_type;
        udp_tunnel_encap_rcv_t encap_rcv;
        udp_tunnel_encap_err_lookup_t encap_err_lookup;
        udp_tunnel_encap_err_rcv_t encap_err_rcv;
        udp_tunnel_encap_destroy_t encap_destroy;
        udp_tunnel_gro_receive_t gro_receive;
        udp_tunnel_gro_complete_t gro_complete;
};

/* Setup the given (UDP) sock to receive UDP encapsulated packets */
void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
                           struct udp_tunnel_sock_cfg *sock_cfg);

/* -- List of parsable UDP tunnel types --
 *
 * Adding to this list will result in serious debate.  The main issue is
 * that this list is essentially a list of workarounds for either poorly
 * designed tunnels, or poorly designed device offloads.
 *
 * The parsing supported via these types should really be used for Rx
 * traffic only as the network stack will have already inserted offsets for
 * the location of the headers in the skb.  In addition any ports that are
 * pushed should be kept within the namespace without leaking to other
 * devices such as VFs or other ports on the same device.
 *
 * It is strongly encouraged to use CHECKSUM_COMPLETE for Rx to avoid the
 * need to use this for Rx checksum offload.  It should not be necessary to
 * call this function to perform Tx offloads on outgoing traffic.
 */
enum udp_parsable_tunnel_type {
        /* each type occupies its own bit, BIT(0) through BIT(2) */
        UDP_TUNNEL_TYPE_VXLAN          = BIT(0), /* RFC 7348 */
        UDP_TUNNEL_TYPE_GENEVE          = BIT(1), /* draft-ietf-nvo3-geneve */
        UDP_TUNNEL_TYPE_VXLAN_GPE = BIT(2), /* draft-ietf-nvo3-vxlan-gpe */
};

/* Describes one tunnel port: a type, an address family, a big-endian port
 * number and a one-byte hw_priv value.
 * NOTE(review): @type presumably carries an enum udp_parsable_tunnel_type
 * bit; confirm against the users of this struct.
 */
struct udp_tunnel_info {
        unsigned short type;
        sa_family_t sa_family;
        __be16 port;
        u8 hw_priv;
};

/* Notify network devices of offloadable types */
void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
                             unsigned short type);
void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
                             unsigned short type);
void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type);
void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type);

/* Fire the NETDEV_UDP_TUNNEL_PUSH_INFO notifier for @dev, but only when the
 * device advertises NETIF_F_RX_UDP_TUNNEL_PORT. Asserts that RTNL is held.
 */
static inline void udp_tunnel_get_rx_info(struct net_device *dev)
{
        ASSERT_RTNL();
        if (dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)
                call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev);
}

/* Fire the NETDEV_UDP_TUNNEL_DROP_INFO notifier for @dev, but only when the
 * device advertises NETIF_F_RX_UDP_TUNNEL_PORT. Asserts that RTNL is held.
 */
static inline void udp_tunnel_drop_rx_info(struct net_device *dev)
{
        ASSERT_RTNL();
        if (dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)
                call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev);
}

/* Transmit the skb using UDP encapsulation.
 *
 * udp_tunnel_xmit_skb() is the IPv4 flavour (addresses are __be32) and
 * udp_tunnel6_xmit_skb() the IPv6 one (addresses are struct in6_addr).
 * Ports are passed big-endian (__be16). Note that only the IPv6 variant
 * returns a status. NOTE(review): @nocheck presumably requests that no UDP
 * checksum be generated - confirm in the implementation.
 */
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
                         __be32 src, __be32 dst, __u8 tos, __u8 ttl,
                         __be16 df, __be16 src_port, __be16 dst_port,
                         bool xnet, bool nocheck);

int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
                         struct sk_buff *skb,
                         struct net_device *dev,
                         const struct in6_addr *saddr,
                         const struct in6_addr *daddr,
                         __u8 prio, __u8 ttl, __be32 label,
                         __be16 src_port, __be16 dst_port, bool nocheck);

/* Counterpart for tunnel sockets; implemented outside this header. */
void udp_tunnel_sock_release(struct socket *sock);

/* Route lookup helpers for UDP tunnels, IPv4 (returns struct rtable *) and
 * IPv6 (returns struct dst_entry *). @saddr is a non-const pointer in both.
 * NOTE(review): it is presumably filled in with the selected source
 * address, and @dst_cache presumably caches the result - confirm in the
 * implementation, which lives outside this header.
 */
struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
                                     struct net_device *dev,
                                     struct net *net, int oif,
                                     __be32 *saddr,
                                     const struct ip_tunnel_key *key,
                                     __be16 sport, __be16 dport, u8 tos,
                                     struct dst_cache *dst_cache);
struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
                                         struct net_device *dev,
                                         struct net *net,
                                         struct socket *sock, int oif,
                                         struct in6_addr *saddr,
                                         const struct ip_tunnel_key *key,
                                         __be16 sport, __be16 dport, u8 dsfield,
                                         struct dst_cache *dst_cache);

/* Build a struct metadata_dst for a received tunnel packet; @tunnel_id is
 * big-endian (__be64). NOTE(review): @md_size is presumably the size of the
 * extra metadata/options area - confirm.
 */
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
                                    const unsigned long *flags,
                                    __be64 tunnel_id, int md_size);

#ifdef CONFIG_INET
/* Pick the UDP-tunnel GSO type that matches @udp_csum and hand @skb to
 * iptunnel_handle_offloads(), returning whatever that helper returns.
 */
static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
{
        int gso_type;

        if (udp_csum)
                gso_type = SKB_GSO_UDP_TUNNEL_CSUM;
        else
                gso_type = SKB_GSO_UDP_TUNNEL;

        return iptunnel_handle_offloads(skb, gso_type);
}
#endif

/* Enable UDP encapsulation handling for @sk.
 *
 * The enable calls run only when udp_test_and_set_bit() returns zero for
 * ENCAP_ENABLED (presumably: the flag was not set before this call). For
 * PF_INET6 sockets on IPv6-enabled builds the IPv6 stub is called first,
 * then udp_encap_enable() is called in every case.
 */
static inline void udp_tunnel_encap_enable(struct sock *sk)
{
        if (!udp_test_and_set_bit(ENCAP_ENABLED, sk)) {
#if IS_ENABLED(CONFIG_IPV6)
                if (READ_ONCE(sk->sk_family) == PF_INET6)
                        ipv6_stub->udpv6_encap_enable();
#endif
                udp_encap_enable();
        }
}

/* Number of slots in the tables[] array of struct udp_tunnel_nic_info */
#define UDP_TUNNEL_NIC_MAX_TABLES        4

/* Bit flags stored in the @flags member of struct udp_tunnel_nic_info */
enum udp_tunnel_nic_info_flags {
        /* Device callbacks may sleep */
        UDP_TUNNEL_NIC_INFO_MAY_SLEEP        = BIT(0),
        /* Device only supports offloads when it's open, all ports
         * will be removed before close and re-added after open.
         */
        UDP_TUNNEL_NIC_INFO_OPEN_ONLY        = BIT(1),
        /* Device supports only IPv4 tunnels */
        UDP_TUNNEL_NIC_INFO_IPV4_ONLY        = BIT(2),
        /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN.
         * This port must not be counted towards n_entries of any table.
         * Driver will not receive any callback associated with port 4789.
         */
        UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN        = BIT(3),
};

/* Opaque here; only pointers to it are used in this header. */
struct udp_tunnel_nic;

/* Upper bound on the number of devices that may share one port table
 * (see the kernel-doc of struct udp_tunnel_nic_info).
 */
#define UDP_TUNNEL_NIC_MAX_SHARING_DEVICES        (U16_MAX / 2)

/* Device-global state for hardware that shares one UDP tunnel port table
 * across multiple netdevs; drivers point udp_tunnel_nic_info.shared at it.
 */
struct udp_tunnel_nic_shared {
        struct udp_tunnel_nic *udp_tunnel_nic_info;

        /* NOTE(review): presumably a list of struct
         * udp_tunnel_nic_shared_node linked through their @list member -
         * confirm against the code that maintains it.
         */
        struct list_head devices;
};

/* One netdev together with a list linkage. */
struct udp_tunnel_nic_shared_node {
        struct net_device *dev;
        struct list_head list;
};

/**
 * struct udp_tunnel_nic_info - driver UDP tunnel offload information
 * @set_port:        callback for adding a new port
 * @unset_port:        callback for removing a port
 * @sync_table:        callback for syncing the entire port table at once
 * @shared:        reference to device global state (optional)
 * @flags:        device flags from enum udp_tunnel_nic_info_flags
 * @tables:        UDP port tables this device has (the array holds
 *                %UDP_TUNNEL_NIC_MAX_TABLES slots)
 * @tables.n_entries:                number of entries in this table
 * @tables.tunnel_types:        types of tunnels this table accepts
 *
 * Drivers are expected to provide either @set_port and @unset_port callbacks
 * or the @sync_table callback. Callbacks are invoked with rtnl lock held.
 *
 * Devices which (misguidedly) share the UDP tunnel port table across multiple
 * netdevs should allocate an instance of struct udp_tunnel_nic_shared and
 * point @shared at it.
 * There must never be more than %UDP_TUNNEL_NIC_MAX_SHARING_DEVICES devices
 * sharing a table.
 *
 * Known limitations:
 *  - UDP tunnel port notifications are fundamentally best-effort -
 *    it is likely the driver will both see skbs which use a UDP tunnel port,
 *    while not being a tunneled skb, and tunnel skbs from other ports -
 *    drivers should only use these ports for non-critical RX-side offloads,
 *    e.g. the checksum offload;
 *  - none of the devices care about the socket family at present, so we don't
 *    track it. Please extend this code if you care.
 */
struct udp_tunnel_nic_info {
        /* one-by-one
         * NOTE(review): the callbacks return int; presumably 0 on success
         * and a negative errno on failure - confirm against the core code.
         */
        int (*set_port)(struct net_device *dev,
                        unsigned int table, unsigned int entry,
                        struct udp_tunnel_info *ti);
        int (*unset_port)(struct net_device *dev,
                          unsigned int table, unsigned int entry,
                          struct udp_tunnel_info *ti);

        /* all at once */
        int (*sync_table)(struct net_device *dev, unsigned int table);

        struct udp_tunnel_nic_shared *shared;

        unsigned int flags;

        struct udp_tunnel_nic_table_info {
                unsigned int n_entries;
                unsigned int tunnel_types;
        } tables[UDP_TUNNEL_NIC_MAX_TABLES];
};

/* UDP tunnel module dependencies
 *
 * Tunnel drivers are expected to have a hard dependency on the udp_tunnel
 * module. NIC drivers are not, they just attach their
 * struct udp_tunnel_nic_info to the netdev and wait for callbacks to come.
 * Loading a tunnel driver will cause the udp_tunnel module to be loaded
 * and only then will all the required state structures be allocated.
 * Since we want a weak dependency from the drivers and the core to udp_tunnel
 * we call things through the following stubs.
 *
 * Each member below has a matching udp_tunnel_nic_<member>() inline wrapper
 * in this header which checks the udp_tunnel_nic_ops pointer for NULL before
 * making the call.
 */
struct udp_tunnel_nic_ops {
        /* port table access and updates */
        void (*get_port)(struct net_device *dev, unsigned int table,
                         unsigned int idx, struct udp_tunnel_info *ti);
        void (*set_port_priv)(struct net_device *dev, unsigned int table,
                              unsigned int idx, u8 priv);
        void (*add_port)(struct net_device *dev, struct udp_tunnel_info *ti);
        void (*del_port)(struct net_device *dev, struct udp_tunnel_info *ti);
        void (*reset_ntf)(struct net_device *dev);

        /* dumping a table into an skb */
        size_t (*dump_size)(struct net_device *dev, unsigned int table);
        int (*dump_write)(struct net_device *dev, unsigned int table,
                          struct sk_buff *skb);
};

/* With CONFIG_INET the ops pointer is a real variable defined elsewhere.
 * Without it the name expands to a constant NULL pointer, so every
 * "if (udp_tunnel_nic_ops)" test in the wrappers below is always false.
 */
#ifdef CONFIG_INET
extern const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;
#else
#define udp_tunnel_nic_ops        ((struct udp_tunnel_nic_ops *)NULL)
#endif

/* Fetch entry @idx of port table @table of @dev into @ti.
 *
 * @ti is always cleared first; if no ops are registered it stays all-zero.
 */
static inline void
udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
                        unsigned int idx, struct udp_tunnel_info *ti)
{
        /* Used from .sync_table, where an all-zero @ti marks an empty
         * entry. Drivers that need the details of a port at deletion time
         * should use the .set_port / .unset_port callbacks instead.
         * Clearing here also avoids uninitialized-variable warnings in
         * !CONFIG_INET builds.
         */
        memset(ti, 0, sizeof(struct udp_tunnel_info));

        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->get_port(dev, table, idx, ti);
}

/* Forward @priv for entry @idx of @table to the registered ops.
 * No-op when no ops are registered.
 */
static inline void
udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
                             unsigned int idx, u8 priv)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv);
}

/* Report a new tunnel port @ti to @dev through the registered ops.
 * Skipped when @dev lacks NETIF_F_RX_UDP_TUNNEL_PORT or no ops are
 * registered.
 */
static inline void
udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        if ((dev->features & NETIF_F_RX_UDP_TUNNEL_PORT) && udp_tunnel_nic_ops)
                udp_tunnel_nic_ops->add_port(dev, ti);
}

/* Report removal of tunnel port @ti to @dev through the registered ops.
 * Skipped when @dev lacks NETIF_F_RX_UDP_TUNNEL_PORT or no ops are
 * registered.
 */
static inline void
udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
{
        if ((dev->features & NETIF_F_RX_UDP_TUNNEL_PORT) && udp_tunnel_nic_ops)
                udp_tunnel_nic_ops->del_port(dev, ti);
}

/**
 * udp_tunnel_nic_reset_ntf() - device-originating reset notification
 * @dev: network interface device structure
 *
 * Drivers call this to tell the core that the device has lost its whole
 * UDP tunnel port state, typically because of a device reset. The core then
 * assumes no port is programmed any more and issues .set_port and
 * .sync_table callbacks as needed.
 *
 * The rtnl lock must be held by the caller; all callbacks have been issued
 * by the time this function returns.
 */
static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev)
{
        if (!udp_tunnel_nic_ops)
                return;

        udp_tunnel_nic_ops->reset_ntf(dev);
}

/* Return what the registered ops report as the dump size of @table on
 * @dev, or 0 when no ops are registered.
 */
static inline size_t
udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
{
        if (udp_tunnel_nic_ops)
                return udp_tunnel_nic_ops->dump_size(dev, table);

        return 0;
}

/* Have the registered ops write @table of @dev into @skb and return their
 * result; returns 0 when no ops are registered.
 */
static inline int
udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
                          struct sk_buff *skb)
{
        if (udp_tunnel_nic_ops)
                return udp_tunnel_nic_ops->dump_write(dev, table, skb);

        return 0;
}
#endif






















































































  164 






























































    8 


















































































   22 






































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PID_H
#define _LINUX_PID_H

#include <linux/pid_types.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <linux/sched.h>
#include <linux/wait.h>

/*
 * What is struct pid?
 *
 * A struct pid is the kernel's internal notion of a process identifier.
 * It refers to individual tasks, process groups, and sessions.  While
 * there are processes attached to it the struct pid lives in a hash
 * table, so it and then the processes that it refers to can be found
 * quickly from the numeric pid value.  The attached processes may be
 * quickly accessed by following pointers from struct pid.
 *
 * Storing pid_t values in the kernel and referring to them later has a
 * problem.  The process originally with that pid may have exited and the
 * pid allocator wrapped, and another process could have come along
 * and been assigned that pid.
 *
 * Referring to user space processes by holding a reference to struct
 * task_struct has a problem.  When the user space process exits
 * the now useless task_struct is still kept.  A task_struct plus a
 * stack consumes around 10K of low kernel memory.  More precisely
 * this is THREAD_SIZE + sizeof(struct task_struct).  By comparison
 * a struct pid is about 64 bytes.
 *
 * Holding a reference to struct pid solves both of these problems.
 * It is small so holding a reference does not consume a lot of
 * resources, and since a new struct pid is allocated when the numeric pid
 * value is reused (when pids wrap around) we don't mistakenly refer to new
 * processes.
 */


/*
 * struct upid is used to get the id of the struct pid, as it is
 * seen in a particular namespace. Later the struct pid is found with
 * find_pid_ns() using the int nr and struct pid_namespace *ns.
 */

/* NOTE(review): not used in this header; presumably the lowest pid range
 * that the allocator avoids after a wrap-around - confirm in the allocator.
 */
#define RESERVED_PIDS 300

/* One (number, namespace) pair: the numeric id @nr that a struct pid has
 * in pid namespace @ns.
 */
struct upid {
        int nr;
        struct pid_namespace *ns;
};

/* Kernel-internal process identifier object; see the "What is struct pid?"
 * comment at the top of this header for the rationale.
 */
struct pid
{
        /* reference count; get_pid() increments it */
        refcount_t count;
        /* index into numbers[] of the entry for the namespace the pid was
         * allocated in (see ns_of_pid())
         */
        unsigned int level;
        spinlock_t lock;
        /* NOTE(review): stashed/ino/pidfs_node look like pidfs bookkeeping
         * - confirm against the pidfs code.
         */
        struct dentry *stashed;
        u64 ino;
        struct rb_node pidfs_node;
        /* lists of tasks that use this pid */
        struct hlist_head tasks[PIDTYPE_MAX];
        struct hlist_head inodes;
        /* wait queue for pidfd notifications */
        wait_queue_head_t wait_pidfd;
        struct rcu_head rcu;
        /* one entry per namespace level; numbers[0].nr is the global id
         * (see pid_nr()). Flexible array, sized at allocation time.
         */
        struct upid numbers[];
};

/* Objects defined outside this header. */
extern seqcount_spinlock_t pidmap_lock_seq;
extern struct pid init_struct_pid;

/* Forward declaration: struct file is only used through pointers here. */
struct file;

/* pidfd helpers, implemented outside this header.
 * NOTE(review): the struct pid / task_struct returning lookups presumably
 * hand back a reference the caller must drop - confirm before relying on it.
 */
struct pid *pidfd_pid(const struct file *file);
struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags);
int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret);
void do_notify_pidfd(struct task_struct *task);

/* Take a reference on @pid by incrementing its refcount, and return @pid.
 * A NULL @pid is accepted and simply returned.
 */
static inline struct pid *get_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        refcount_inc(&pid->count);
        return pid;
}

/* Implemented outside this header. NOTE(review): put_pid() is presumably the
 * release counterpart of get_pid() - confirm in the implementation.
 */
extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
/* True when at least one task is linked on @pid's task list for @type.
 * @pid must not be NULL.
 */
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
        if (hlist_empty(&pid->tasks[type]))
                return false;

        return true;
}
/* Implemented outside this header. NOTE(review): going by the get_ prefix
 * these presumably return a referenced object the caller must release -
 * confirm in the implementation.
 */
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);

extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);

/*
 * these helpers must be called with the tasklist_lock write-held.
 */
/* Attach/detach/replace the struct pid of the given type on a task, or move
 * pid links between two tasks. All implemented outside this header.
 */
extern void attach_pid(struct task_struct *task, enum pid_type);
void detach_pid(struct pid **pids, struct task_struct *task, enum pid_type);
void change_pid(struct pid **pids, struct task_struct *task, enum pid_type,
                struct pid *pid);
extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
                         enum pid_type);

/*
 * look up a PID in the hash table. Must be called with the tasklist_lock
 * or rcu_read_lock() held.
 *
 * find_pid_ns() finds the pid in the namespace specified
 * find_vpid() finds the pid by its virtual id, i.e. in the current namespace
 *
 * see also find_task_by_vpid() set in include/linux/sched.h
 */
extern struct pid *find_pid_ns(int nr, struct pid_namespace *ns);
extern struct pid *find_vpid(int nr);

/*
 * Lookup a PID in the hash table, and return with its count elevated.
 */
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);

/* Allocation and release of struct pid; implemented outside this header. */
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
                             size_t set_tid_size);
extern void free_pid(struct pid *pid);
void free_pids(struct pid **pids);
extern void disable_pid_allocation(struct pid_namespace *ns);

/*
 * ns_of_pid() returns the pid namespace in which the specified pid was
 * allocated, i.e. the namespace recorded in numbers[pid->level].
 *
 * NOTE:
 *         Callers normally pass the 'struct pid' attached to a process
 *         (see attach_pid(), detach_pid()), so @pid is expected to be
 *         non-NULL. A NULL @pid yields a NULL pid-ns, which the caller
 *         must handle.
 */
static inline struct pid_namespace *ns_of_pid(struct pid *pid)
{
        if (!pid)
                return NULL;

        return pid->numbers[pid->level].ns;
}

/*
 * is_child_reaper returns true if the pid is the init process
 * of the current namespace. As this one could be checked before
 * pid_ns->child_reaper is assigned in copy_process, we check
 * with the pid number.
 */
static inline bool is_child_reaper(struct pid *pid)
{
        return pid->numbers[pid->level].nr == 1;
}

/*
 * the helpers to get the pid's id seen from different namespaces
 *
 * pid_nr()    : global id, i.e. the id seen from the init namespace;
 * pid_vnr()   : virtual id, i.e. the id seen from the pid namespace of
 *               current.
 * pid_nr_ns() : id seen from the ns specified.
 *
 * see also task_xid_nr() etc in include/linux/sched.h
 */

/* Global id of @pid (the number stored in numbers[0]); 0 for a NULL @pid. */
static inline pid_t pid_nr(struct pid *pid)
{
        return pid ? pid->numbers[0].nr : 0;
}

/* Out-of-line variants: id as seen from @ns, and as seen from the pid
 * namespace of current.
 */
pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
pid_t pid_vnr(struct pid *pid);

/*
 * Paired iteration macros. do_each_pid_task() opens a "do {" block and, when
 * @pid is non-NULL, an hlist_for_each_entry_rcu() loop over the tasks linked
 * on @pid's list for @type; while_each_pid_task() closes both. The braces
 * are split across the two macros, so they must always be used as a pair
 * around the loop body. For PIDTYPE_PID the loop is left after the first
 * entry.
 *
 * do_each_pid_thread()/while_each_pid_thread() nest a for_each_thread() loop
 * inside that, saving the list entry in tg___ and restoring @task from it
 * once the inner loop is done.
 *
 * NOTE(review): the _rcu list walk presumably requires the caller to hold
 * rcu_read_lock() or tasklist_lock - confirm against the callers.
 */
#define do_each_pid_task(pid, type, task)                                \
        do {                                                                \
                if ((pid) != NULL)                                        \
                        hlist_for_each_entry_rcu((task),                \
                                &(pid)->tasks[type], pid_links[type]) {

                        /*
                         * Both old and new leaders may be attached to
                         * the same pid in the middle of de_thread().
                         */
#define while_each_pid_task(pid, type, task)                                \
                                if (type == PIDTYPE_PID)                \
                                        break;                                \
                        }                                                \
        } while (0)

#define do_each_pid_thread(pid, type, task)                                \
        do_each_pid_task(pid, type, task) {                                \
                struct task_struct *tg___ = task;                        \
                for_each_thread(tg___, task) {

#define while_each_pid_thread(pid, type, task)                                \
                }                                                        \
                task = tg___;                                                \
        } while_each_pid_task(pid, type, task)

/* Return the struct pid stored in @task->thread_pid (no reference taken). */
static inline struct pid *task_pid(struct task_struct *task)
{
        struct pid *thread_pid = task->thread_pid;

        return thread_pid;
}

/*
 * the helpers to get the task's different pids as they are seen
 * from various namespaces
 *
 * task_xid_nr()     : global id, i.e. the id seen from the init namespace;
 * task_xid_vnr()    : virtual id, i.e. the id seen from the pid namespace of
 *                     current.
 * task_xid_nr_ns()  : id seen from the ns specified;
 *
 * see also pid_nr() etc in include/linux/pid.h
 */
/* Common out-of-line backend for the task_*_nr_ns()/task_*_vnr() wrappers
 * below. The *_vnr() wrappers pass a NULL @ns.
 */
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns);

/* Global pid of @tsk, read directly from tsk->pid. */
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
        pid_t nr = tsk->pid;

        return nr;
}

/* PIDTYPE_PID id of @tsk as seen from @ns. */
static inline pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t nr = __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);

        return nr;
}

/* Virtual PIDTYPE_PID id of @tsk; the backend is called with a NULL
 * namespace.
 */
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
        pid_t vnr = __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);

        return vnr;
}


/* Global thread-group id of @tsk, read directly from tsk->tgid. */
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
        pid_t tgid = tsk->tgid;

        return tgid;
}

/**
 * pid_alive - check that a task structure is not stale
 * @p: Task structure to be checked.
 *
 * A task counts as alive (not yet dead, at most a zombie) while its
 * thread_pid pointer is non-NULL. Once this check fails, pointers inside
 * the task structure may be stale and must not be dereferenced.
 *
 * Return: 1 if the process is alive. 0 otherwise.
 */
static inline int pid_alive(const struct task_struct *p)
{
        if (p->thread_pid)
                return 1;

        return 0;
}

/* Process-group (PIDTYPE_PGID) id of @tsk as seen from @ns. */
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);

        return pgrp;
}

/* Virtual process-group (PIDTYPE_PGID) id of @tsk; the backend is called
 * with a NULL namespace.
 */
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
        pid_t pgrp = __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);

        return pgrp;
}


/* Session (PIDTYPE_SID) id of @tsk as seen from @ns. */
static inline pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);

        return sid;
}

/* Virtual session (PIDTYPE_SID) id of @tsk; the backend is called with a
 * NULL namespace.
 */
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
        pid_t sid = __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);

        return sid;
}

/* Thread-group (PIDTYPE_TGID) id of @tsk as seen from @ns. */
static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, ns);

        return tgid;
}

/* Virtual thread-group (PIDTYPE_TGID) id of @tsk; the backend is called
 * with a NULL namespace.
 */
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
        pid_t tgid = __task_pid_nr_ns(tsk, PIDTYPE_TGID, NULL);

        return tgid;
}

/* Thread-group id of @tsk's real parent as seen from @ns.
 *
 * The parent pointer is read under rcu_read_lock(). If @tsk is no longer
 * alive (see pid_alive()) the parent is not dereferenced and 0 is returned.
 */
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
{
        pid_t ppid;

        rcu_read_lock();
        ppid = pid_alive(tsk) ?
                task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns) : 0;
        rcu_read_unlock();

        return ppid;
}

/* Parent's thread-group id of @tsk as seen from init_pid_ns. */
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
{
        pid_t ppid = task_ppid_nr_ns(tsk, &init_pid_ns);

        return ppid;
}

/* Obsolete, do not use: process-group id of @tsk as seen from init_pid_ns. */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
        pid_t pgrp = task_pgrp_nr_ns(tsk, &init_pid_ns);

        return pgrp;
}

/**
 * is_global_init - check if a task structure is init.
 * @tsk: Task structure to be checked.
 *
 * Check if a task structure is the first user space task the kernel created.
 * Because init is free to have sub-threads, the thread-group id (rather
 * than the pid) is compared against 1.
 *
 * Return: 1 if the task structure is init. 0 otherwise.
 */
static inline int is_global_init(struct task_struct *tsk)
{
        pid_t tgid = task_tgid_nr(tsk);

        return tgid == 1;
}

#endif /* _LINUX_PID_H */




























































































































  662 



















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
// SPDX-License-Identifier: GPL-2.0
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/atomic.h>
#include <linux/errseq.h>
#include <linux/log2.h>

/*
 * An errseq_t is a way of recording errors in one place, and allowing any
 * number of "subscribers" to tell whether it has changed since a previous
 * point where it was sampled.
 *
 * It's implemented as an unsigned 32-bit value. The low order bits are
 * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits
 * are used as a counter. This is done with atomics instead of locking so that
 * these functions can be called from any context.
 *
 * The general idea is for consumers to sample an errseq_t value. That value
 * can later be used to tell whether any new errors have occurred since that
 * sampling was done.
 *
 * Note that there is a risk of collisions if new errors are being recorded
 * frequently, since we have so few bits to use as a counter.
 *
 * To mitigate this, one bit is used as a flag to tell whether the value has
 * been sampled since a new value was recorded. That allows us to avoid bumping
 * the counter if no one has sampled it since the last time an error was
 * recorded.
 *
 * A new errseq_t should always be zeroed out.  An errseq_t value of all zeroes
 * is the special (but common) case where there has never been an error. An all
 * zero value thus serves as the "epoch" if one wishes to know whether there
 * has ever been an error set since it was first initialized.
 */

/* The low bits are designated for error code (max of MAX_ERRNO).
 * ERRSEQ_SHIFT is the index of the first bit above that field; errseq_set()
 * checks at build time that MAX_ERRNO + 1 is a power of two, so MAX_ERRNO
 * can double as the mask for the error field.
 */
#define ERRSEQ_SHIFT                ilog2(MAX_ERRNO + 1)

/* This bit is used as a flag to indicate whether the value has been seen */
#define ERRSEQ_SEEN                (1 << ERRSEQ_SHIFT)

/* The lowest bit of the counter (the bit directly above ERRSEQ_SEEN) */
#define ERRSEQ_CTR_INC                (1 << (ERRSEQ_SHIFT + 1))

/**
 * errseq_set - set an errseq_t for later reporting
 * @eseq: errseq_t field that should be set
 * @err: error to set (must be between -1 and -MAX_ERRNO)
 *
 * Record @err in @eseq. The sequence counter is bumped only when the value
 * currently stored has its SEEN flag set, i.e. it was sampled at some point
 * in the past.
 *
 * A newly set error always overwrites the error stored before it.
 *
 * Return: The previous value, primarily for debugging purposes. The
 * return value should not be used as a previously sampled value in later
 * calls as it will not have the SEEN flag set.
 */
errseq_t errseq_set(errseq_t *eseq, int err)
{
        errseq_t prev;

        /* MAX_ERRNO must be able to serve as a mask */
        BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1);

        prev = READ_ONCE(*eseq);

        /*
         * The error code has to fit into the low bits reserved for it. If it
         * does not, warn and record nothing. Zero is rejected as well, since
         * storing it would effectively clear a previous error.
         */
        if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO),
                                "err = %d\n", err))
                return prev;

        for (;;) {
                errseq_t next, found;

                /* Drop the old error and SEEN flag, then store the new error */
                next = (prev & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err;

                /* The counter only moves if someone has looked at the value */
                if (prev & ERRSEQ_SEEN)
                        next += ERRSEQ_CTR_INC;

                /* Nothing would change, so there is nothing to swap */
                if (next == prev)
                        return next;

                /* Try to swap the new value into place */
                found = cmpxchg(eseq, prev, next);

                /*
                 * Done if our swap went through, or if somebody else got
                 * there first with the very same value.
                 */
                if (likely(found == prev || found == next))
                        return found;

                /* Raced with a different update, retry from what we found */
                prev = found;
        }
}
EXPORT_SYMBOL(errseq_set);

/**
 * errseq_sample() - Grab current errseq_t value.
 * @eseq: Pointer to errseq_t to be sampled.
 *
 * Lets a caller initialise its own errseq_t cursor. When the stored error
 * has already been "seen", the current value is returned, so the new caller
 * will not be told about that old error. When the stored error is still
 * unseen, 0 is returned instead, so the caller will see it the next time it
 * checks for an error.
 *
 * Context: Any context.
 * Return: The current errseq value, or 0 if it has not been seen yet.
 */
errseq_t errseq_sample(errseq_t *eseq)
{
        errseq_t cur = READ_ONCE(*eseq);

        /* An unseen error must still reach this caller: report the epoch. */
        return (cur & ERRSEQ_SEEN) ? cur : 0;
}
EXPORT_SYMBOL(errseq_sample);

/**
 * errseq_check() - Has an error occurred since a particular sample point?
 * @eseq: Pointer to errseq_t value to be checked.
 * @since: Previously-sampled errseq_t from which to check.
 *
 * Read the value @eseq points to and compare it with @since. @since is
 * passed by value and is not advanced, so the stored value does not need
 * to be marked as seen.
 *
 * Return: The latest error set in the errseq_t or 0 if it hasn't changed.
 */
int errseq_check(errseq_t *eseq, errseq_t since)
{
        errseq_t now = READ_ONCE(*eseq);

        if (likely(now == since))
                return 0;

        /* Changed: report the error stored in the low bits, negated. */
        return -(now & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check);

/**
 * errseq_check_and_advance() - Check an errseq_t and advance to current value.
 * @eseq: Pointer to value being checked and reported.
 * @since: Pointer to previously-sampled errseq_t to check against and advance.
 *
 * Read the eseq value and compare it with the value @since points to. If
 * they match, nothing has changed and 0 is returned.
 *
 * Otherwise set the "seen" flag on the value read, try to swap that into
 * place as the new eseq value, store it as the new "since" value, and return
 * whatever its error portion holds.
 *
 * No locking is provided here for concurrent updates to the "since" value;
 * the caller must provide that if necessary. Because of this, callers may
 * want to do a lockless errseq_check before taking the lock and calling
 * this.
 *
 * Return: Negative errno if one has been stored, or 0 if no new error has
 * occurred.
 */
int errseq_check_and_advance(errseq_t *eseq, errseq_t *since)
{
        errseq_t cur, marked;

        /*
         * Most callers will want to use the inline wrapper to check this,
         * so that the common case of no error is handled without needing
         * to take the lock that protects the "since" value.
         */
        cur = READ_ONCE(*eseq);
        if (cur == *since)
                return 0;

        /*
         * The value changed: flag it as seen and try to publish that.
         *
         * The outcome of the swap is deliberately ignored. If it fails,
         * either a writer altered the value (bumped the counter or reset
         * the error) or another reader just set the "seen" flag. Both are
         * fine: "since" is advanced and the error is reported from the
         * value we computed.
         */
        marked = cur | ERRSEQ_SEEN;
        if (marked != cur)
                cmpxchg(eseq, cur, marked);
        *since = marked;

        return -(marked & MAX_ERRNO);
}
EXPORT_SYMBOL(errseq_check_and_advance);























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 




























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
 *
 * The code this is based on carried the following copyright notice:
 * ---
 * (C) Copyright 2001-2006
 * Alex Zeffertt, Cambridge Broadband Ltd, ajz@cambridgebroadband.com
 * Re-worked by Ben Greear <greearb@candelatech.com>
 * ---
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/rculist.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/net_tstamp.h>
#include <linux/ethtool.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/if_link.h>
#include <linux/if_macvlan.h>
#include <linux/hash.h>
#include <linux/workqueue.h>
#include <net/netdev_lock.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <linux/netpoll.h>
#include <linux/phy.h>

/* Number of hash bits used to index the per-port hash tables. */
#define MACVLAN_HASH_BITS        8
/* Number of buckets in each per-port hash table. */
#define MACVLAN_HASH_SIZE        (1<<MACVLAN_HASH_BITS)
/* NOTE(review): presumably the default cap for the port's bc_queue - confirm at the use site. */
#define MACVLAN_DEFAULT_BC_QUEUE_LEN        1000

/* Bit values stored in macvlan_port::flags. */
#define MACVLAN_F_PASSTHRU        1
#define MACVLAN_F_ADDRCHANGE        2

/*
 * Per lower-device state shared by all macvlans stacked on that device.
 *
 * @dev:               the underlying (lower) network device
 * @vlan_hash:         macvlans hashed by their own device MAC address
 * @vlans:             list of macvlans; in passthru mode the receive path
 *                     picks the first entry of this list
 * @bc_queue:          cloned frames waiting to be delivered from @bc_work
 * @bc_work:           work item that drains @bc_queue
 * @bc_queue_len_used: upper bound on the length of @bc_queue
 * @flags:             MACVLAN_F_* bits
 * @vlan_source_hash:  macvlan_source_entry objects hashed by MAC address
 * @bc_filter:         bitmap of multicast hashes whose frames are queued
 *                     on @bc_queue for deferred delivery
 * @mc_filter:         bitmap of multicast hashes whose frames are delivered
 *                     directly from the receive path
 * @perm_addr:         copy of the lower device address saved while in
 *                     passthru mode so it can be restored later
 *
 * @bc_cutoff and @count are not used by the code shown next to this
 * definition - TODO document them from their users.
 */
struct macvlan_port {
        struct net_device        *dev;
        struct hlist_head        vlan_hash[MACVLAN_HASH_SIZE];
        struct list_head        vlans;
        struct sk_buff_head        bc_queue;
        struct work_struct        bc_work;
        u32                        bc_queue_len_used;
        int                        bc_cutoff;
        u32                        flags;
        int                        count;
        struct hlist_head        vlan_source_hash[MACVLAN_HASH_SIZE];
        DECLARE_BITMAP(bc_filter, MACVLAN_MC_FILTER_SZ);
        DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
        unsigned char           perm_addr[ETH_ALEN];
};

/*
 * One allowed source MAC address for a macvlan, linked into the port's
 * vlan_source_hash.
 *
 * @hlist: linkage in the port's vlan_source_hash bucket
 * @vlan:  macvlan that frames from @addr are forwarded to
 * @addr:  the source MAC address; the array is 8 bytes long, presumably so
 *         the 64-bit helpers (macvlan_eth_hash(), ether_addr_equal_64bits())
 *         may safely read past the 6 address bytes
 * @rcu:   used to free the entry via kfree_rcu()
 */
struct macvlan_source_entry {
        struct hlist_node        hlist;
        struct macvlan_dev        *vlan;
        unsigned char                addr[6+2] __aligned(sizeof(u16));
        struct rcu_head                rcu;
};

/*
 * Per-skb private data kept in skb->cb while a frame sits on the port's
 * broadcast queue.
 *
 * @src: macvlan the frame originated from, or NULL when it came from an
 *       external address
 */
struct macvlan_skb_cb {
        const struct macvlan_dev *src;
};

/* View the start of an skb's control buffer as a struct macvlan_skb_cb. */
#define MACVLAN_SKB_CB(__skb) ((struct macvlan_skb_cb *)&((__skb)->cb[0]))

static void macvlan_port_destroy(struct net_device *dev);
static void update_port_bc_queue_len(struct macvlan_port *port);

/* Report whether MACVLAN_F_PASSTHRU is set in the port flags. */
static inline bool macvlan_passthru(const struct macvlan_port *port)
{
        return (port->flags & MACVLAN_F_PASSTHRU) != 0;
}

/* Set MACVLAN_F_PASSTHRU in the port flags. */
static inline void macvlan_set_passthru(struct macvlan_port *port)
{
        port->flags = port->flags | MACVLAN_F_PASSTHRU;
}

/* Report whether MACVLAN_F_ADDRCHANGE is set in the port flags. */
static inline bool macvlan_addr_change(const struct macvlan_port *port)
{
        return !!(port->flags & MACVLAN_F_ADDRCHANGE);
}

/* Set MACVLAN_F_ADDRCHANGE in the port flags. */
static inline void macvlan_set_addr_change(struct macvlan_port *port)
{
        port->flags = port->flags | MACVLAN_F_ADDRCHANGE;
}

/* Clear MACVLAN_F_ADDRCHANGE in the port flags. */
static inline void macvlan_clear_addr_change(struct macvlan_port *port)
{
        port->flags = port->flags & ~MACVLAN_F_ADDRCHANGE;
}

/*
 * Hash an Ethernet address into a MACVLAN_HASH_BITS wide bucket index.
 *
 * Eight bytes are loaded starting at @addr, so the buffer behind @addr must
 * be at least that long; the two bytes following the address are shifted
 * out before hashing and therefore do not influence the result.
 */
static u32 macvlan_eth_hash(const unsigned char *addr)
{
        u64 raw = get_unaligned((u64 *)addr);

#ifdef __BIG_ENDIAN
        /* address sits in the six most significant bytes */
        raw = raw >> 16;
#else
        /* address sits in the six least significant bytes */
        raw = raw << 16;
#endif
        return hash_64(raw, MACVLAN_HASH_BITS);
}

/* Fetch the macvlan port stored in @dev->rx_handler_data via rcu_dereference(). */
static struct macvlan_port *macvlan_port_get_rcu(const struct net_device *dev)
{
        struct macvlan_port *port = rcu_dereference(dev->rx_handler_data);

        return port;
}

/* Fetch the macvlan port stored in @dev->rx_handler_data via rtnl_dereference(). */
static struct macvlan_port *macvlan_port_get_rtnl(const struct net_device *dev)
{
        struct macvlan_port *port = rtnl_dereference(dev->rx_handler_data);

        return port;
}

/*
 * Find the macvlan on @port whose device address equals @addr.
 *
 * @addr is compared with ether_addr_equal_64bits() and hashed with
 * macvlan_eth_hash(), both of which read 8 bytes.
 *
 * Returns the matching macvlan, or NULL when none is hashed under @addr.
 */
static struct macvlan_dev *macvlan_hash_lookup(const struct macvlan_port *port,
                                               const unsigned char *addr)
{
        u32 bucket = macvlan_eth_hash(addr);
        struct macvlan_dev *cur;

        hlist_for_each_entry_rcu(cur, &port->vlan_hash[bucket], hlist,
                                 lockdep_rtnl_is_held()) {
                if (!ether_addr_equal_64bits(cur->dev->dev_addr, addr))
                        continue;
                return cur;
        }
        return NULL;
}

/*
 * Find the source entry that binds @addr to @vlan.
 *
 * Only entries whose address matches @addr and whose owner is exactly @vlan
 * qualify. Returns the entry, or NULL when no such binding exists.
 */
static struct macvlan_source_entry *macvlan_hash_lookup_source(
        const struct macvlan_dev *vlan,
        const unsigned char *addr)
{
        struct hlist_head *bucket;
        struct macvlan_source_entry *cur;

        bucket = &vlan->port->vlan_source_hash[macvlan_eth_hash(addr)];

        hlist_for_each_entry_rcu(cur, bucket, hlist, lockdep_rtnl_is_held()) {
                if (!ether_addr_equal_64bits(cur->addr, addr))
                        continue;
                if (cur->vlan == vlan)
                        return cur;
        }
        return NULL;
}

/*
 * Register @addr as an allowed source address for @vlan.
 *
 * Adding a binding that already exists is a no-op. On success the new entry
 * is linked into the port's source hash and vlan->macaddr_count is bumped.
 *
 * Returns 0 on success (including the already-present case) or -ENOMEM when
 * the entry cannot be allocated.
 */
static int macvlan_hash_add_source(struct macvlan_dev *vlan,
                                   const unsigned char *addr)
{
        struct macvlan_source_entry *entry;
        struct hlist_head *bucket;

        /* nothing to do when this (vlan, addr) pair is already known */
        if (macvlan_hash_lookup_source(vlan, addr))
                return 0;

        entry = kmalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return -ENOMEM;

        entry->vlan = vlan;
        ether_addr_copy(entry->addr, addr);

        bucket = &vlan->port->vlan_source_hash[macvlan_eth_hash(addr)];
        hlist_add_head_rcu(&entry->hlist, bucket);
        vlan->macaddr_count++;

        return 0;
}

/* Link @vlan into its port's vlan_hash, keyed by the current device address. */
static void macvlan_hash_add(struct macvlan_dev *vlan)
{
        u32 bucket = macvlan_eth_hash(vlan->dev->dev_addr);

        hlist_add_head_rcu(&vlan->hlist, &vlan->port->vlan_hash[bucket]);
}

/*
 * Unlink @entry from the source hash and free it through kfree_rcu().
 * The caller is responsible for adjusting the owner's macaddr_count.
 */
static void macvlan_hash_del_source(struct macvlan_source_entry *entry)
{
        hlist_del_rcu(&entry->hlist);
        kfree_rcu(entry, rcu);
}

/*
 * Unlink @vlan from its port's vlan_hash.
 *
 * When @sync is true, synchronize_rcu() is called after the removal.
 */
static void macvlan_hash_del(struct macvlan_dev *vlan, bool sync)
{
        hlist_del_rcu(&vlan->hlist);

        if (!sync)
                return;

        synchronize_rcu();
}

/*
 * Change the MAC address of @vlan's device to @addr and rehash it.
 *
 * The macvlan is removed from the hash (synchronously, i.e. with
 * synchronize_rcu()) before the address is written, then re-added under
 * the new address.
 */
static void macvlan_hash_change_addr(struct macvlan_dev *vlan,
                                        const unsigned char *addr)
{
        macvlan_hash_del(vlan, true);
        /* Now that we are unhashed it is safe to change the device
         * address without confusing packet delivery.
         */
        eth_hw_addr_set(vlan->dev, addr);
        macvlan_hash_add(vlan);
}

static bool macvlan_addr_busy(const struct macvlan_port *port,
                              const unsigned char *addr)
{
        /* Test to see if the specified address is
         * currently in use by the underlying device or
         * another macvlan.
         */
        if (!macvlan_passthru(port) && !macvlan_addr_change(port) &&
            ether_addr_equal_64bits(port->dev->dev_addr, addr))
                return true;

        if (macvlan_hash_lookup(port, addr))
                return true;

        return false;
}


/*
 * Prepare @skb for delivery to a single macvlan.
 *
 * With @local set the frame is handed to __dev_forward_skb() and its result
 * is returned. Otherwise the skb is retargeted at the macvlan device and its
 * packet type is set to broadcast or multicast depending on whether the
 * destination equals the device broadcast address; 0 is returned.
 */
static int macvlan_broadcast_one(struct sk_buff *skb,
                                 const struct macvlan_dev *vlan,
                                 const struct ethhdr *eth, bool local)
{
        struct net_device *dev = vlan->dev;

        if (!local) {
                skb->dev = dev;
                skb->pkt_type =
                        ether_addr_equal_64bits(eth->h_dest, dev->broadcast) ?
                        PACKET_BROADCAST : PACKET_MULTICAST;
                return 0;
        }

        return __dev_forward_skb(dev, skb);
}

/*
 * Derive a per-macvlan mixing value from the address of @vlan itself,
 * shifted right by L1_CACHE_SHIFT. A NULL @vlan yields 0.
 */
static u32 macvlan_hash_mix(const struct macvlan_dev *vlan)
{
        unsigned long ptr = (unsigned long)vlan;

        return (u32)(ptr >> L1_CACHE_SHIFT);
}


/*
 * Compute the multicast filter bit index for @addr as seen by @vlan.
 *
 * The last four bytes of the address are XORed with the per-macvlan mix
 * value and reduced to MACVLAN_MC_FILTER_BITS bits.
 */
static unsigned int mc_hash(const struct macvlan_dev *vlan,
                            const unsigned char *addr)
{
        u32 mixed = __get_unaligned_cpu32(addr + 2) ^ macvlan_hash_mix(vlan);

        return hash_32(mixed, MACVLAN_MC_FILTER_BITS);
}

/*
 * Deliver a clone of @skb to every macvlan on @port that
 *   - is not the originating device @src,
 *   - operates in one of the modes in the @mode bit mask, and
 *   - has the destination's hash bit set in its own mc_filter.
 *
 * ETH_P_PAUSE frames are never flooded. When @mode is exactly
 * MACVLAN_MODE_BRIDGE the clone goes through the local forwarding path
 * of macvlan_broadcast_one(); in every case a zero result from that
 * helper is followed by netif_rx(). Receive statistics are updated per
 * macvlan, counting a failed clone as a drop.
 */
static void macvlan_broadcast(struct sk_buff *skb,
                              const struct macvlan_port *port,
                              struct net_device *src,
                              enum macvlan_mode mode)
{
        const struct ethhdr *eth = eth_hdr(skb);
        const struct macvlan_dev *vlan;
        unsigned int bkt;

        if (skb->protocol == htons(ETH_P_PAUSE))
                return;

        hash_for_each_rcu(port->vlan_hash, bkt, vlan, hlist) {
                struct sk_buff *nskb;
                int err = NET_RX_DROP;

                if (vlan->dev == src)
                        continue;
                if (!(vlan->mode & mode))
                        continue;
                if (!test_bit(mc_hash(vlan, eth->h_dest), vlan->mc_filter))
                        continue;

                nskb = skb_clone(skb, GFP_ATOMIC);
                if (likely(nskb)) {
                        err = macvlan_broadcast_one(nskb, vlan, eth,
                                                    mode == MACVLAN_MODE_BRIDGE);
                        /* only push the clone up when the first step returned 0 */
                        if (!err)
                                err = netif_rx(nskb);
                }
                macvlan_count_rx(vlan, skb->len + ETH_HLEN,
                                 err == NET_RX_SUCCESS, true);
        }
}

/*
 * Flood a received multicast/broadcast frame according to where it came
 * from:
 *   - no @src (external sender): every mode receives it;
 *   - @src in VEPA mode: VEPA and bridge macvlans, except the source;
 *   - any other @src: VEPA macvlans only, except the source, since
 *     bridge ports already saw the frame on the way out.
 */
static void macvlan_multicast_rx(const struct macvlan_port *port,
                                 const struct macvlan_dev *src,
                                 struct sk_buff *skb)
{
        struct net_device *origin;
        int modes;

        if (!src) {
                /* frame comes from an external address */
                origin = NULL;
                modes = MACVLAN_MODE_PRIVATE |
                        MACVLAN_MODE_VEPA    |
                        MACVLAN_MODE_PASSTHRU|
                        MACVLAN_MODE_BRIDGE;
        } else if (src->mode == MACVLAN_MODE_VEPA) {
                /* flood to everyone except source */
                origin = src->dev;
                modes = MACVLAN_MODE_VEPA |
                        MACVLAN_MODE_BRIDGE;
        } else {
                /* VEPA ports only */
                origin = src->dev;
                modes = MACVLAN_MODE_VEPA;
        }

        macvlan_broadcast(skb, port, origin, modes);
}

static void macvlan_process_broadcast(struct work_struct *w)
{
        struct macvlan_port *port = container_of(w, struct macvlan_port,
                                                 bc_work);
        struct sk_buff *skb;
        struct sk_buff_head list;

        __skb_queue_head_init(&list);

        spin_lock_bh(&port->bc_queue.lock);
        skb_queue_splice_tail_init(&port->bc_queue, &list);
        spin_unlock_bh(&port->bc_queue.lock);

        while ((skb = __skb_dequeue(&list))) {
                const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;

                rcu_read_lock();
                macvlan_multicast_rx(port, src, skb);
                rcu_read_unlock();

                if (src)
                        dev_put(src->dev);
                consume_skb(skb);

                cond_resched();
        }
}

/*
 * Queue a clone of @skb on the port's bc_queue for deferred delivery.
 *
 * The clone remembers @src in its control buffer, and a reference is taken
 * on the source device while the clone is queued. If cloning fails or the
 * queue already holds bc_queue_len_used frames, the frame is accounted as
 * an rx drop on skb->dev. The work item is scheduled whenever a clone was
 * obtained, whether or not it fitted into the queue.
 */
static void macvlan_broadcast_enqueue(struct macvlan_port *port,
                                      const struct macvlan_dev *src,
                                      struct sk_buff *skb)
{
        struct sk_buff *nskb;
        int queued = 0;

        nskb = skb_clone(skb, GFP_ATOMIC);
        if (nskb) {
                MACVLAN_SKB_CB(nskb)->src = src;

                spin_lock(&port->bc_queue.lock);
                if (skb_queue_len(&port->bc_queue) < port->bc_queue_len_used) {
                        if (src)
                                dev_hold(src->dev);
                        __skb_queue_tail(&port->bc_queue, nskb);
                        queued = 1;
                }
                spin_unlock(&port->bc_queue.lock);

                queue_work(system_unbound_wq, &port->bc_work);

                if (queued)
                        return;

                /* queue was full: the clone is not needed after all */
                kfree_skb(nskb);
        }

        dev_core_stats_rx_dropped_inc(skb->dev);
}

static void macvlan_flush_sources(struct macvlan_port *port,
                                  struct macvlan_dev *vlan)
{
        struct macvlan_source_entry *entry;
        struct hlist_node *next;
        int i;

        hash_for_each_safe(port->vlan_source_hash, i, next, entry, hlist)
                if (entry->vlan == vlan)
                        macvlan_hash_del_source(entry);

        vlan->macaddr_count = 0;
}

/*
 * Hand a clone of @skb to one source-matched macvlan.
 *
 * Nothing happens when the macvlan device is down or the clone cannot be
 * allocated. The clone is marked PACKET_HOST when the frame's destination
 * equals the macvlan's own address, then injected with __netif_rx() and
 * accounted in the macvlan's receive statistics.
 */
static void macvlan_forward_source_one(struct sk_buff *skb,
                                       struct macvlan_dev *vlan)
{
        struct net_device *dev = vlan->dev;
        struct sk_buff *nskb;
        int len, ret;

        if (unlikely(!(dev->flags & IFF_UP)))
                return;

        nskb = skb_clone(skb, GFP_ATOMIC);
        if (!nskb)
                return;

        nskb->dev = dev;
        len = nskb->len + ETH_HLEN;

        if (ether_addr_equal_64bits(eth_hdr(skb)->h_dest, dev->dev_addr))
                nskb->pkt_type = PACKET_HOST;

        ret = __netif_rx(nskb);
        macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
}

/*
 * Forward @skb to every macvlan that lists @addr as an allowed source.
 *
 * Returns true when at least one matching macvlan has MACVLAN_FLAG_NODST
 * set, which tells the caller to consume the frame instead of continuing
 * with destination based delivery.
 */
static bool macvlan_forward_source(struct sk_buff *skb,
                                   struct macvlan_port *port,
                                   const unsigned char *addr)
{
        struct hlist_head *bucket =
                &port->vlan_source_hash[macvlan_eth_hash(addr)];
        struct macvlan_source_entry *entry;
        bool consume = false;

        hlist_for_each_entry_rcu(entry, bucket, hlist) {
                if (!ether_addr_equal_64bits(entry->addr, addr))
                        continue;

                if (entry->vlan->flags & MACVLAN_FLAG_NODST)
                        consume = true;
                macvlan_forward_source_one(skb, entry->vlan);
        }

        return consume;
}

/*
 * Receive handler: decide which macvlan, if any, gets an incoming frame.
 *
 * called under rcu_read_lock() from netif_receive_skb
 *
 * Multicast destinations: after source-based forwarding, a frame whose
 * source address belongs to a macvlan that is neither in VEPA nor bridge
 * mode is delivered to that macvlan only; otherwise the frame is flooded,
 * either deferred through the broadcast queue (bc_filter hit) or directly
 * (mc_filter hit), and RX_HANDLER_PASS is returned.
 *
 * Unicast destinations: after source-based forwarding, the target macvlan
 * is the first one on the port in passthru mode, or the one hashed under
 * the destination address otherwise. The skb is retargeted at that device
 * and RX_HANDLER_ANOTHER is returned.
 *
 * @pskb may be updated to point at a different skb.
 */
static rx_handler_result_t macvlan_handle_frame(struct sk_buff **pskb)
{
        struct macvlan_port *port;
        struct sk_buff *skb = *pskb;
        const struct ethhdr *eth = eth_hdr(skb);
        const struct macvlan_dev *vlan;
        const struct macvlan_dev *src;
        struct net_device *dev;
        unsigned int len = 0;
        int ret;
        rx_handler_result_t handle_res;

        /* Packets from dev_loopback_xmit() do not have L2 header, bail out */
        if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
                return RX_HANDLER_PASS;

        port = macvlan_port_get_rcu(skb->dev);
        if (is_multicast_ether_addr(eth->h_dest)) {
                unsigned int hash;

                /* may hand back a different skb, or NULL if it kept the frame */
                skb = ip_check_defrag(dev_net(skb->dev), skb, IP_DEFRAG_MACVLAN);
                if (!skb)
                        return RX_HANDLER_CONSUMED;
                *pskb = skb;
                /* the skb may have changed, so re-read the header pointer */
                eth = eth_hdr(skb);
                /* a source-matched macvlan with NODST set swallows the frame */
                if (macvlan_forward_source(skb, port, eth->h_source)) {
                        kfree_skb(skb);
                        return RX_HANDLER_CONSUMED;
                }
                src = macvlan_hash_lookup(port, eth->h_source);
                if (src && src->mode != MACVLAN_MODE_VEPA &&
                    src->mode != MACVLAN_MODE_BRIDGE) {
                        /* forward to original port. */
                        vlan = src;
                        ret = macvlan_broadcast_one(skb, vlan, eth, 0) ?:
                              __netif_rx(skb);
                        handle_res = RX_HANDLER_CONSUMED;
                        /* len is still 0 here, so only the packet/drop counters move */
                        goto out;
                }

                /* port-level filters are indexed without a per-macvlan mix */
                hash = mc_hash(NULL, eth->h_dest);
                if (test_bit(hash, port->bc_filter))
                        macvlan_broadcast_enqueue(port, src, skb);
                else if (test_bit(hash, port->mc_filter))
                        macvlan_multicast_rx(port, src, skb);

                return RX_HANDLER_PASS;
        }

        /* unicast: source-based forwarding first, as in the multicast case */
        if (macvlan_forward_source(skb, port, eth->h_source)) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED;
        }
        if (macvlan_passthru(port))
                vlan = list_first_or_null_rcu(&port->vlans,
                                              struct macvlan_dev, list);
        else
                vlan = macvlan_hash_lookup(port, eth->h_dest);
        /* source-mode macvlans only receive via macvlan_forward_source() */
        if (!vlan || vlan->mode == MACVLAN_MODE_SOURCE)
                return RX_HANDLER_PASS;

        dev = vlan->dev;
        if (unlikely(!(dev->flags & IFF_UP))) {
                kfree_skb(skb);
                return RX_HANDLER_CONSUMED;
        }
        /* record the length before skb_share_check() can replace the skb */
        len = skb->len + ETH_HLEN;
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb) {
                ret = NET_RX_DROP;
                handle_res = RX_HANDLER_CONSUMED;
                goto out;
        }

        *pskb = skb;
        skb->dev = dev;
        skb->pkt_type = PACKET_HOST;

        ret = NET_RX_SUCCESS;
        handle_res = RX_HANDLER_ANOTHER;
out:
        macvlan_count_rx(vlan, len, ret == NET_RX_SUCCESS, false);
        return handle_res;
}

/*
 * Transmit @skb on behalf of macvlan device @dev.
 *
 * In bridge mode, multicast frames are first flooded to the other bridge
 * macvlans on the port and then still sent out through the lower device;
 * unicast frames addressed to another bridge-mode macvlan are forwarded to
 * the lower device with dev_forward_skb() and NET_XMIT_SUCCESS is
 * returned. Everything else is queued on the lower device and the result
 * of dev_queue_xmit_accel() is returned.
 */
static int macvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
        const struct macvlan_dev *vlan = netdev_priv(dev);
        const struct macvlan_port *port = vlan->port;

        if (vlan->mode == MACVLAN_MODE_BRIDGE) {
                const struct ethhdr *eth = skb_eth_hdr(skb);

                if (is_multicast_ether_addr(eth->h_dest)) {
                        /* send to other bridge ports directly */
                        skb_reset_mac_header(skb);
                        macvlan_broadcast(skb, port, dev, MACVLAN_MODE_BRIDGE);
                } else {
                        const struct macvlan_dev *dest;

                        dest = macvlan_hash_lookup(port, eth->h_dest);
                        if (dest && dest->mode == MACVLAN_MODE_BRIDGE) {
                                /* send to lowerdev first for its network taps */
                                dev_forward_skb(vlan->lowerdev, skb);

                                return NET_XMIT_SUCCESS;
                        }
                }
        }

        /* hand the frame to the outside world through the lower device */
        skb->dev = vlan->lowerdev;
        return dev_queue_xmit_accel(skb,
                                    netdev_get_sb_channel(dev) ? dev : NULL);
}

/*
 * Send @skb through the macvlan's netpoll instance.
 *
 * Only meaningful with CONFIG_NET_POLL_CONTROLLER; without it, reaching
 * this function triggers BUG().
 */
static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb)
{
#ifdef CONFIG_NET_POLL_CONTROLLER
        return netpoll_send_skb(vlan->netpoll, skb);
#else
        BUG();
        return NETDEV_TX_OK;
#endif
}

/*
 * Transmit entry point of a macvlan device.
 *
 * Frames sent while netpoll is transmitting on @dev are diverted to
 * macvlan_netpoll_send_skb(). Otherwise the frame goes through
 * macvlan_queue_xmit() and the per-cpu statistics are updated: packet and
 * byte counters on NET_XMIT_SUCCESS or NET_XMIT_CN, the tx_dropped counter
 * on any other result. The result of the transmit is returned.
 */
static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct vlan_pcpu_stats *pcpu_stats;
        /* length is captured before the skb is handed off */
        unsigned int len = skb->len;
        int ret;

        if (unlikely(netpoll_tx_running(dev)))
                return macvlan_netpoll_send_skb(vlan, skb);

        ret = macvlan_queue_xmit(skb, dev);

        if (unlikely(ret != NET_XMIT_SUCCESS && ret != NET_XMIT_CN)) {
                this_cpu_inc(vlan->pcpu_stats->tx_dropped);
                return ret;
        }

        pcpu_stats = this_cpu_ptr(vlan->pcpu_stats);
        u64_stats_update_begin(&pcpu_stats->syncp);
        u64_stats_inc(&pcpu_stats->tx_packets);
        u64_stats_add(&pcpu_stats->tx_bytes, len);
        u64_stats_update_end(&pcpu_stats->syncp);

        return ret;
}

/*
 * Build the link-layer header using the lower device's header operations.
 *
 * When the caller gives no source address, the macvlan's own device
 * address is used. Returns whatever dev_hard_header() returns.
 */
static int macvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
                               unsigned short type, const void *daddr,
                               const void *saddr, unsigned len)
{
        const struct macvlan_dev *vlan = netdev_priv(dev);
        const void *src = saddr ? saddr : dev->dev_addr;

        return dev_hard_header(skb, vlan->lowerdev, type, daddr, src, len);
}

/*
 * Header operations for macvlan devices: header creation goes through
 * macvlan_hard_header(), everything else uses the generic Ethernet helpers.
 */
static const struct header_ops macvlan_hard_header_ops = {
        .create          = macvlan_hard_header,
        .parse                = eth_header_parse,
        .cache                = eth_header_cache,
        .cache_update        = eth_header_cache_update,
        .parse_protocol        = eth_header_parse_protocol,
};

/*
 * Bring a macvlan device up.
 *
 * Passthru ports: put the lower device into promiscuous mode (unless
 * MACVLAN_FLAG_NOPROMISC is set) and hash the macvlan.
 *
 * Other ports: refuse with -EADDRINUSE if the device address is already
 * busy on the port, try L2 forwarding offload on the lower device and fall
 * back to adding the address to its unicast filter, mirror the ALLMULTI
 * and PROMISC flags onto the lower device, then hash the macvlan.
 *
 * Returns 0 on success or a negative errno; on failure every step already
 * taken on the lower device is undone.
 */
static int macvlan_open(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;
        int err;

        if (macvlan_passthru(vlan->port)) {
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC)) {
                        err = dev_set_promiscuity(lowerdev, 1);
                        if (err < 0)
                                goto out;
                }
                /* passthru skips the address check and filter setup below */
                goto hash_add;
        }

        err = -EADDRINUSE;
        if (macvlan_addr_busy(vlan->port, dev->dev_addr))
                goto out;

        /* Attempt to populate accel_priv which is used to offload the L2
         * forwarding requests for unicast packets.
         */
        if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD)
                vlan->accel_priv =
                      lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);

        /* If earlier attempt to offload failed, or accel_priv is not
         * populated we must add the unicast address to the lower device.
         */
        if (IS_ERR_OR_NULL(vlan->accel_priv)) {
                /* normalize an error pointer so later checks can test for NULL */
                vlan->accel_priv = NULL;
                err = dev_uc_add(lowerdev, dev->dev_addr);
                if (err < 0)
                        goto out;
        }

        if (dev->flags & IFF_ALLMULTI) {
                err = dev_set_allmulti(lowerdev, 1);
                if (err < 0)
                        goto del_unicast;
        }

        if (dev->flags & IFF_PROMISC) {
                err = dev_set_promiscuity(lowerdev, 1);
                if (err < 0)
                        goto clear_multi;
        }

hash_add:
        macvlan_hash_add(vlan);
        return 0;

        /* error unwinding, in reverse order of the setup above */
clear_multi:
        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lowerdev, -1);
del_unicast:
        if (vlan->accel_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->accel_priv);
                vlan->accel_priv = NULL;
        } else {
                dev_uc_del(lowerdev, dev->dev_addr);
        }
out:
        return err;
}

/*
 * Take a macvlan device down.
 *
 * Releases the L2 forwarding offload station if one was set up, unsyncs
 * the unicast and multicast lists from the lower device, and undoes the
 * promiscuity / allmulti / unicast filter changes made for this device.
 * Finally the macvlan is removed from the port hash; the removal waits for
 * an RCU grace period unless the device is being dismantled.
 *
 * Always returns 0.
 */
static int macvlan_stop(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;

        if (vlan->accel_priv) {
                lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
                                                           vlan->accel_priv);
                vlan->accel_priv = NULL;
        }

        dev_uc_unsync(lowerdev, dev);
        dev_mc_unsync(lowerdev, dev);

        if (macvlan_passthru(vlan->port)) {
                /* passthru only ever touched the promiscuity count */
                if (!(vlan->flags & MACVLAN_FLAG_NOPROMISC))
                        dev_set_promiscuity(lowerdev, -1);
        } else {
                if (dev->flags & IFF_ALLMULTI)
                        dev_set_allmulti(lowerdev, -1);

                if (dev->flags & IFF_PROMISC)
                        dev_set_promiscuity(lowerdev, -1);

                dev_uc_del(lowerdev, dev->dev_addr);
        }

        macvlan_hash_del(vlan, !dev->dismantle);
        return 0;
}

static int macvlan_sync_address(struct net_device *dev,
                                const unsigned char *addr)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct net_device *lowerdev = vlan->lowerdev;
        struct macvlan_port *port = vlan->port;
        int err;

        if (!(dev->flags & IFF_UP)) {
                /* Just copy in the new address */
                eth_hw_addr_set(dev, addr);
        } else {
                /* Rehash and update the device filters */
                if (macvlan_addr_busy(vlan->port, addr))
                        return -EADDRINUSE;

                if (!macvlan_passthru(port)) {
                        err = dev_uc_add(lowerdev, addr);
                        if (err)
                                return err;

                        dev_uc_del(lowerdev, dev->dev_addr);
                }

                macvlan_hash_change_addr(vlan, addr);
        }
        if (macvlan_passthru(port) && !macvlan_addr_change(port)) {
                /* Since addr_change isn't set, we are here due to lower
                 * device change.  Save the lower-dev address so we can
                 * restore it later.
                 */
                ether_addr_copy(vlan->port->perm_addr,
                                lowerdev->dev_addr);
        }
        macvlan_clear_addr_change(port);
        return 0;
}

/*
 * ndo_set_mac_address handler.
 *
 * @p points to a struct sockaddr carrying the requested address.  Invalid
 * addresses yield -EADDRNOTAVAIL; an unchanged address is a no-op.  In
 * passthru mode the request is forwarded to the lower device after flagging
 * the port with addr_change; otherwise the address must be free on the port
 * (-EADDRINUSE) and is applied through macvlan_sync_address().
 */
static int macvlan_set_mac_address(struct net_device *dev, void *p)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct sockaddr *sa = p;

        if (!is_valid_ether_addr(sa->sa_data))
                return -EADDRNOTAVAIL;

        /* Nothing to do when the address does not change. */
        if (ether_addr_equal(dev->dev_addr, sa->sa_data))
                return 0;

        if (mv->mode != MACVLAN_MODE_PASSTHRU) {
                if (macvlan_addr_busy(mv->port, sa->sa_data))
                        return -EADDRINUSE;

                return macvlan_sync_address(dev, sa->sa_data);
        }

        macvlan_set_addr_change(mv->port);
        return dev_set_mac_address(mv->lowerdev, sa, NULL);
}

/*
 * ndo_change_rx_flags handler: mirror ALLMULTI/PROMISC changes of a running
 * macvlan onto the lower device.  @change is the set of flag bits that
 * changed.  Promiscuity is not propagated for passthru ports.
 */
static void macvlan_change_rx_flags(struct net_device *dev, int change)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct net_device *lower = mv->lowerdev;

        if (!(dev->flags & IFF_UP))
                return;

        if (change & IFF_ALLMULTI) {
                int delta = (dev->flags & IFF_ALLMULTI) ? 1 : -1;

                dev_set_allmulti(lower, delta);
        }

        if (!macvlan_passthru(mv->port) && (change & IFF_PROMISC)) {
                int delta = (dev->flags & IFF_PROMISC) ? 1 : -1;

                dev_set_promiscuity(lower, delta);
        }
}

static void macvlan_compute_filter(unsigned long *mc_filter,
                                   struct net_device *dev,
                                   struct macvlan_dev *vlan, int cutoff)
{
        if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
                bitmap_fill(mc_filter, MACVLAN_MC_FILTER_SZ);
        } else {
                DECLARE_BITMAP(filter, MACVLAN_MC_FILTER_SZ);
                struct netdev_hw_addr *ha;

                bitmap_zero(filter, MACVLAN_MC_FILTER_SZ);
                netdev_for_each_mc_addr(ha, dev) {
                        if (!vlan && ha->synced <= cutoff)
                                continue;

                        __set_bit(mc_hash(vlan, ha->addr), filter);
                }

                __set_bit(mc_hash(vlan, dev->broadcast), filter);

                bitmap_copy(mc_filter, filter, MACVLAN_MC_FILTER_SZ);
        }
}

static void macvlan_recompute_bc_filter(struct macvlan_dev *vlan)
{
        if (vlan->port->bc_cutoff < 0) {
                bitmap_zero(vlan->port->bc_filter, MACVLAN_MC_FILTER_SZ);
                return;
        }

        macvlan_compute_filter(vlan->port->bc_filter, vlan->lowerdev, NULL,
                               vlan->port->bc_cutoff);
}

/*
 * ndo_set_rx_mode handler: refresh this macvlan's multicast filter, push its
 * unicast/multicast lists down to the lower device, then refresh the
 * port-wide multicast and broadcast filters.
 */
static void macvlan_set_mac_lists(struct net_device *dev)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct net_device *lower = mv->lowerdev;

        macvlan_compute_filter(mv->mc_filter, dev, mv, 0);

        dev_uc_sync(lower, dev);
        dev_mc_sync(lower, dev);

        /* The port filter is slightly over-inclusive because the lower
         * device's own subscription list is folded in as well.
         *
         * Known limitation: this assumes every macvlan shares the lower
         * device's broadcast address.  A macvlan with a different
         * broadcast address breaks it - and that case is already broken,
         * since changing the broadcast address does not call us.
         *
         * A proper fix would track broadcast addresses in a list, the
         * way uc/mc addresses are tracked.
         */
        macvlan_compute_filter(mv->port->mc_filter, lower, NULL, 0);
        macvlan_recompute_bc_filter(mv);
}

static void update_port_bc_cutoff(struct macvlan_dev *vlan, int cutoff)
{
        if (vlan->port->bc_cutoff == cutoff)
                return;

        vlan->port->bc_cutoff = cutoff;
        macvlan_recompute_bc_filter(vlan);
}

static int macvlan_change_mtu(struct net_device *dev, int new_mtu)
{
        struct macvlan_dev *vlan = netdev_priv(dev);

        if (vlan->lowerdev->mtu < new_mtu)
                return -EINVAL;
        WRITE_ONCE(dev->mtu, new_mtu);
        return 0;
}

/* ndo_hwtstamp_get handler: forward the query to the real (lower) device. */
static int macvlan_hwtstamp_get(struct net_device *dev,
                                struct kernel_hwtstamp_config *cfg)
{
        return generic_hwtstamp_get_lower(macvlan_dev_real_dev(dev), cfg);
}

/*
 * ndo_hwtstamp_set handler: forward the request to the real (lower) device.
 * Only permitted for devices living in init_net; other namespaces get
 * -EOPNOTSUPP.
 */
static int macvlan_hwtstamp_set(struct net_device *dev,
                                struct kernel_hwtstamp_config *cfg,
                                struct netlink_ext_ack *extack)
{
        if (net_eq(dev_net(dev), &init_net))
                return generic_hwtstamp_set_lower(macvlan_dev_real_dev(dev),
                                                  cfg, extack);

        return -EOPNOTSUPP;
}

/*
 * macvlan network devices have devices nesting below them and are a special
 * "super class" of normal network devices; split their locks off into a
 * separate class since they always nest.
 */
static struct lock_class_key macvlan_netdev_addr_lock_key;

/* Offload bits that macvlan_init() always ORs into dev->vlan_features. */
#define ALWAYS_ON_OFFLOADS \
        (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \
         NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL)

/* Feature bits always set on the macvlan itself (init and fix_features). */
#define ALWAYS_ON_FEATURES ALWAYS_ON_OFFLOADS

/* Feature bits a macvlan may take over from its lower device. */
#define MACVLAN_FEATURES \
        (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
         NETIF_F_GSO | NETIF_F_TSO | NETIF_F_LRO | \
         NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
         NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)

/* dev->state bits that macvlan_init() copies from the lower device. */
#define MACVLAN_STATE_MASK \
        ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))

/*
 * Assign lockdep classes to a macvlan device: first the generic netdev
 * classes, then the macvlan-specific class for addr_list_lock.
 */
static void macvlan_set_lockdep_class(struct net_device *dev)
{
        netdev_lockdep_set_classes(dev);
        /* NOTE(review): presumably overrides a class the generic call set
         * on addr_list_lock, so the order matters - confirm.
         */
        lockdep_set_class(&dev->addr_list_lock,
                          &macvlan_netdev_addr_lock_key);
}

/*
 * ndo_init handler: inherit link state, feature sets, TSO limits and header
 * length from the lower device, allocate the per-CPU counters, account the
 * device on its port and take a tracked reference on the lower device.
 *
 * Returns 0 on success or -ENOMEM if the per-CPU stats cannot be allocated.
 */
static int macvlan_init(struct net_device *dev)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct net_device *lower = mv->lowerdev;

        /* Take the carrier/dormant bits from the lower device. */
        dev->state = (lower->state & MACVLAN_STATE_MASK) |
                     (dev->state & ~MACVLAN_STATE_MASK);

        dev->features = (lower->features & MACVLAN_FEATURES) |
                        ALWAYS_ON_FEATURES;
        dev->vlan_features = (lower->vlan_features & MACVLAN_FEATURES) |
                             ALWAYS_ON_OFFLOADS;
        dev->hw_features |= NETIF_F_LRO;
        /* Uses the final dev->features computed above. */
        dev->hw_enc_features |= dev->features;

        dev->hard_header_len = lower->hard_header_len;
        dev->lltx = true;
        netif_inherit_tso_max(dev, lower);
        macvlan_set_lockdep_class(dev);

        mv->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
        if (!mv->pcpu_stats)
                return -ENOMEM;

        mv->port->count++;

        /* The macvlan's own reference to the lower device; it is dropped
         * in macvlan_dev_free().
         */
        netdev_hold(lower, &mv->dev_tracker, GFP_KERNEL);

        return 0;
}

/*
 * ndo_uninit handler: free the per-CPU counters, drop this device's source
 * entries and its slot on the port.  The port is destroyed when its last
 * macvlan goes away.
 */
static void macvlan_uninit(struct net_device *dev)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct macvlan_port *port = mv->port;

        free_percpu(mv->pcpu_stats);
        macvlan_flush_sources(port, mv);

        if (--port->count == 0)
                macvlan_port_destroy(port->dev);
}

/*
 * ndo_get_stats64 handler: sum the per-CPU counters into @stats.
 *
 * The 64-bit counters are read under the u64_stats sequence and added to
 * @stats; the 32-bit error/drop counters are summed and then stored.
 * rx_dropped is reported with the same value as rx_errors.
 */
static void macvlan_dev_get_stats64(struct net_device *dev,
                                    struct rtnl_link_stats64 *stats)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        u32 rx_err_sum = 0, tx_drop_sum = 0;
        int cpu;

        if (!mv->pcpu_stats)
                return;

        for_each_possible_cpu(cpu) {
                struct vlan_pcpu_stats *pcpu = per_cpu_ptr(mv->pcpu_stats, cpu);
                u64 rx_pkts, rx_octets, rx_mcast, tx_pkts, tx_octets;
                unsigned int seq;

                /* Retry until a consistent snapshot has been read. */
                do {
                        seq = u64_stats_fetch_begin(&pcpu->syncp);
                        rx_pkts   = u64_stats_read(&pcpu->rx_packets);
                        rx_octets = u64_stats_read(&pcpu->rx_bytes);
                        rx_mcast  = u64_stats_read(&pcpu->rx_multicast);
                        tx_pkts   = u64_stats_read(&pcpu->tx_packets);
                        tx_octets = u64_stats_read(&pcpu->tx_bytes);
                } while (u64_stats_fetch_retry(&pcpu->syncp, seq));

                stats->rx_packets += rx_pkts;
                stats->rx_bytes   += rx_octets;
                stats->multicast  += rx_mcast;
                stats->tx_packets += tx_pkts;
                stats->tx_bytes   += tx_octets;

                /* These two are u32 and updated without syncp protection. */
                rx_err_sum  += READ_ONCE(pcpu->rx_errors);
                tx_drop_sum += READ_ONCE(pcpu->tx_dropped);
        }

        stats->rx_errors  = rx_err_sum;
        stats->rx_dropped = rx_err_sum;
        stats->tx_dropped = tx_drop_sum;
}

/* ndo_vlan_rx_add_vid handler: register the VLAN id on the lower device. */
static int macvlan_vlan_rx_add_vid(struct net_device *dev,
                                   __be16 proto, u16 vid)
{
        const struct macvlan_dev *mv = netdev_priv(dev);

        return vlan_vid_add(mv->lowerdev, proto, vid);
}

/*
 * ndo_vlan_rx_kill_vid handler: remove the VLAN id from the lower device.
 * Always returns 0.
 */
static int macvlan_vlan_rx_kill_vid(struct net_device *dev,
                                    __be16 proto, u16 vid)
{
        const struct macvlan_dev *mv = netdev_priv(dev);

        vlan_vid_del(mv->lowerdev, proto, vid);

        return 0;
}

/*
 * ndo_fdb_add handler.
 *
 * Unicast entries are accepted only on passthru ports; multicast entries are
 * accepted on every macvlan.  NLM_F_REPLACE is not supported.
 *
 * Returns the result of dev_uc_add_excl()/dev_mc_add_excl(), -EOPNOTSUPP for
 * unsupported requests, or -EINVAL if the address is neither unicast nor
 * multicast.
 */
static int macvlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
                           struct net_device *dev,
                           const unsigned char *addr, u16 vid,
                           u16 flags, bool *notified,
                           struct netlink_ext_ack *extack)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        const bool unicast = is_unicast_ether_addr(addr);

        if (unicast && !macvlan_passthru(mv->port))
                return -EOPNOTSUPP;

        if (flags & NLM_F_REPLACE)
                return -EOPNOTSUPP;

        if (unicast)
                return dev_uc_add_excl(dev, addr);

        if (is_multicast_ether_addr(addr))
                return dev_mc_add_excl(dev, addr);

        return -EINVAL;
}

/*
 * ndo_fdb_del handler.
 *
 * Unicast entries can be removed only on passthru ports; multicast entries
 * on every macvlan.
 *
 * Returns the result of dev_uc_del()/dev_mc_del(), -EOPNOTSUPP for a unicast
 * address on a non-passthru port, or -EINVAL if the address is neither
 * unicast nor multicast.
 */
static int macvlan_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
                           struct net_device *dev,
                           const unsigned char *addr, u16 vid, bool *notified,
                           struct netlink_ext_ack *extack)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        const bool unicast = is_unicast_ether_addr(addr);

        if (unicast && !macvlan_passthru(mv->port))
                return -EOPNOTSUPP;

        if (unicast)
                return dev_uc_del(dev, addr);

        if (is_multicast_ether_addr(addr))
                return dev_mc_del(dev, addr);

        return -EINVAL;
}

/* ethtool get_drvinfo handler: report the fixed driver name and version. */
static void macvlan_ethtool_get_drvinfo(struct net_device *dev,
                                        struct ethtool_drvinfo *drvinfo)
{
        strscpy(drvinfo->version, "0.1", sizeof(drvinfo->version));
        strscpy(drvinfo->driver, "macvlan", sizeof(drvinfo->driver));
}

/* ethtool get_link_ksettings handler: report the lower device's settings. */
static int macvlan_ethtool_get_link_ksettings(struct net_device *dev,
                                              struct ethtool_link_ksettings *cmd)
{
        const struct macvlan_dev *mv = netdev_priv(dev);
        struct net_device *lower = mv->lowerdev;

        return __ethtool_get_link_ksettings(lower, cmd);
}

/* ethtool get_ts_info handler: query the real (lower) device. */
static int macvlan_ethtool_get_ts_info(struct net_device *dev,
                                       struct kernel_ethtool_ts_info *info)
{
        return ethtool_get_ts_info_by_layer(macvlan_dev_real_dev(dev), info);
}

/*
 * ndo_fix_features handler: combine the requested @features with what the
 * lower device offers and clamp the result to the bits a macvlan supports.
 */
static netdev_features_t macvlan_fix_features(struct net_device *dev,
                                              netdev_features_t features)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        netdev_features_t lower = mv->lowerdev->features;
        netdev_features_t wanted;

        /* Within MACVLAN_FEATURES only bits present in set_features survive. */
        wanted = (features | NETIF_F_ALL_FOR_ALL) &
                 (mv->set_features | ~MACVLAN_FEATURES);

        /* LRO is taken from the lower device only if it was requested. */
        lower &= wanted | ~NETIF_F_LRO;

        features = netdev_increment_features(lower, wanted, wanted);

        return (features | ALWAYS_ON_FEATURES) &
               (ALWAYS_ON_FEATURES | MACVLAN_FEATURES);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* ndo_poll_controller handler: intentionally does nothing. */
static void macvlan_dev_poll_controller(struct net_device *dev)
{
}

/*
 * ndo_netpoll_setup handler: allocate a netpoll instance bound to the lower
 * device and remember it in the macvlan.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error from
 * __netpoll_setup().
 */
static int macvlan_dev_netpoll_setup(struct net_device *dev)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct netpoll *np;
        int rc;

        np = kzalloc(sizeof(*np), GFP_KERNEL);
        if (!np)
                return -ENOMEM;

        rc = __netpoll_setup(np, mv->lowerdev);
        if (rc) {
                kfree(np);
                return rc;
        }

        mv->netpoll = np;
        return 0;
}

/*
 * ndo_netpoll_cleanup handler: detach and free the macvlan's netpoll
 * instance, if one is set.
 */
static void macvlan_dev_netpoll_cleanup(struct net_device *dev)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct netpoll *np = mv->netpoll;

        if (np) {
                /* Clear the pointer before the instance is freed. */
                mv->netpoll = NULL;
                __netpoll_free(np);
        }
}
#endif        /* CONFIG_NET_POLL_CONTROLLER */

static int macvlan_dev_get_iflink(const struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);

        return READ_ONCE(vlan->lowerdev->ifindex);
}

/* ethtool operations exposed by macvlan devices. */
static const struct ethtool_ops macvlan_ethtool_ops = {
        .get_drvinfo            = macvlan_ethtool_get_drvinfo,
        .get_link               = ethtool_op_get_link,
        .get_link_ksettings     = macvlan_ethtool_get_link_ksettings,
        .get_ts_info            = macvlan_ethtool_get_ts_info,
};

static const struct net_device_ops macvlan_netdev_ops = {
        .ndo_init                = macvlan_init,
        .ndo_uninit                = macvlan_uninit,
        .ndo_open                = macvlan_open,
        .ndo_stop                = macvlan_stop,
        .ndo_start_xmit                = macvlan_start_xmit,
        .ndo_change_mtu                = macvlan_change_mtu,
        .ndo_fix_features        = macvlan_fix_features,
        .ndo_change_rx_flags        = macvlan_change_rx_flags,
        .ndo_set_mac_address        = macvlan_set_mac_address,
        .ndo_set_rx_mode        = macvlan_set_mac_lists,
        .ndo_get_stats64        = macvlan_dev_get_stats64,
        .ndo_validate_addr        = eth_validate_addr,
        .ndo_vlan_rx_add_vid        = macvlan_vlan_rx_add_vid,
        .ndo_vlan_rx_kill_vid        = macvlan_vlan_rx_kill_vid,
        .ndo_fdb_add                = macvlan_fdb_add,
        .ndo_fdb_del                = macvlan_fdb_del,
        .ndo_fdb_dump                = ndo_dflt_fdb_dump,
#ifdef CONFIG_NET_POLL_CONTROLLER
        .ndo_poll_controller        = macvlan_dev_poll_controller,
        .ndo_netpoll_setup        = macvlan_dev_netpoll_setup,
        .ndo_netpoll_cleanup        = macvlan_dev_netpoll_cleanup,
#endif
        .ndo_get_iflink                = macvlan_dev_get_iflink,
        .ndo_features_check        = passthru_features_check,
        .ndo_hwtstamp_get        = macvlan_hwtstamp_get,
        .ndo_hwtstamp_set        = macvlan_hwtstamp_set,
};

/*
 * priv_destructor for macvlan devices (installed by macvlan_common_setup()):
 * releases the tracked lower-device reference taken in macvlan_init().
 */
static void macvlan_dev_free(struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);

        /* Get rid of the macvlan's reference to lowerdev */
        netdev_put(vlan->lowerdev, &vlan->dev_tracker);
}

/*
 * Common net_device setup for macvlan-style devices: start from the
 * Ethernet defaults, then install the macvlan operation tables, flags and
 * destructor.
 */
void macvlan_common_setup(struct net_device *dev)
{
        ether_setup(dev);

        /* min_mtu was already set to ETH_MIN_MTU by ether_setup(). */
        dev->max_mtu = ETH_MAX_MTU;

        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
        netif_keep_dst(dev);
        dev->priv_flags |= IFF_UNICAST_FLT;

        dev->netdev_ops  = &macvlan_netdev_ops;
        dev->header_ops  = &macvlan_hard_header_ops;
        dev->ethtool_ops = &macvlan_ethtool_ops;

        dev->priv_destructor   = macvlan_dev_free;
        dev->needs_free_netdev = true;
        dev->change_proto_down = true;
}
EXPORT_SYMBOL_GPL(macvlan_common_setup);

/*
 * Setup callback for plain macvlan devices: the common macvlan setup plus
 * the IFF_NO_QUEUE private flag.
 */
static void macvlan_setup(struct net_device *dev)
{
        macvlan_common_setup(dev);
        dev->priv_flags |= IFF_NO_QUEUE;
}

/*
 * Turn @dev into a macvlan port: allocate and initialise the port state and
 * register macvlan_handle_frame() as the device's rx handler.
 *
 * Returns 0 on success, -EINVAL for non-Ethernet or loopback devices, -EBUSY
 * if an rx handler is already installed, -ENOMEM on allocation failure, or
 * the error from netdev_rx_handler_register().
 */
static int macvlan_port_create(struct net_device *dev)
{
        struct macvlan_port *port;
        unsigned int bucket;
        int err;

        if (dev->type != ARPHRD_ETHER || (dev->flags & IFF_LOOPBACK))
                return -EINVAL;

        if (netdev_is_rx_handler_busy(dev))
                return -EBUSY;

        port = kzalloc(sizeof(*port), GFP_KERNEL);
        if (!port)
                return -ENOMEM;

        port->dev = dev;
        /* Remember the device's current address as perm_addr. */
        ether_addr_copy(port->perm_addr, dev->dev_addr);
        INIT_LIST_HEAD(&port->vlans);
        for (bucket = 0; bucket < MACVLAN_HASH_SIZE; bucket++) {
                INIT_HLIST_HEAD(&port->vlan_hash[bucket]);
                INIT_HLIST_HEAD(&port->vlan_source_hash[bucket]);
        }

        port->bc_queue_len_used = 0;
        port->bc_cutoff = 1;
        skb_queue_head_init(&port->bc_queue);
        INIT_WORK(&port->bc_work, macvlan_process_broadcast);

        err = netdev_rx_handler_register(dev, macvlan_handle_frame, port);
        if (err) {
                kfree(port);
                return err;
        }

        dev->priv_flags |= IFF_MACVLAN_PORT;
        return 0;
}

static void macvlan_port_destroy(struct net_device *dev)
{
        struct macvlan_port *port = macvlan_port_get_rtnl(dev);
        struct sk_buff *skb;

        dev->priv_flags &= ~IFF_MACVLAN_PORT;
        netdev_rx_handler_unregister(dev);

        /* After this point, no packet can schedule bc_work anymore,
         * but we need to cancel it and purge left skbs if any.
         */
        cancel_work_sync(&port->bc_work);

        while ((skb = __skb_dequeue(&port->bc_queue))) {
                const struct macvlan_dev *src = MACVLAN_SKB_CB(skb)->src;

                if (src)
                        dev_put(src->dev);

                kfree_skb(skb);
        }

        /* If the lower device address has been changed by passthru
         * macvlan, put it back.
         */
        if (macvlan_passthru(port) &&
            !ether_addr_equal(port->dev->dev_addr, port->perm_addr)) {
                struct sockaddr sa;

                sa.sa_family = port->dev->type;
                memcpy(&sa.sa_data, port->perm_addr, port->dev->addr_len);
                dev_set_mac_address(port->dev, &sa, NULL);
        }

        kfree(port);
}

static int macvlan_validate(struct nlattr *tb[], struct nlattr *data[],
                            struct netlink_ext_ack *extack)
{
        struct nlattr *nla, *head;
        int rem, len;

        if (tb[IFLA_ADDRESS]) {
                if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
                        return -EINVAL;
                if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
                        return -EADDRNOTAVAIL;
        }

        if (!data)
                return 0;

        if (data[IFLA_MACVLAN_FLAGS] &&
            nla_get_u16(data[IFLA_MACVLAN_FLAGS]) & ~(MACVLAN_FLAG_NOPROMISC |
                                                      MACVLAN_FLAG_NODST))
                return -EINVAL;

        if (data[IFLA_MACVLAN_MODE]) {
                switch (nla_get_u32(data[IFLA_MACVLAN_MODE])) {
                case MACVLAN_MODE_PRIVATE:
                case MACVLAN_MODE_VEPA:
                case MACVLAN_MODE_BRIDGE:
                case MACVLAN_MODE_PASSTHRU:
                case MACVLAN_MODE_SOURCE:
                        break;
                default:
                        return -EINVAL;
                }
        }

        if (data[IFLA_MACVLAN_MACADDR_MODE]) {
                switch (nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE])) {
                case MACVLAN_MACADDR_ADD:
                case MACVLAN_MACADDR_DEL:
                case MACVLAN_MACADDR_FLUSH:
                case MACVLAN_MACADDR_SET:
                        break;
                default:
                        return -EINVAL;
                }
        }

        if (data[IFLA_MACVLAN_MACADDR]) {
                if (nla_len(data[IFLA_MACVLAN_MACADDR]) != ETH_ALEN)
                        return -EINVAL;

                if (!is_valid_ether_addr(nla_data(data[IFLA_MACVLAN_MACADDR])))
                        return -EADDRNOTAVAIL;
        }

        if (data[IFLA_MACVLAN_MACADDR_DATA]) {
                head = nla_data(data[IFLA_MACVLAN_MACADDR_DATA]);
                len = nla_len(data[IFLA_MACVLAN_MACADDR_DATA]);

                nla_for_each_attr(nla, head, len, rem) {
                        if (nla_type(nla) != IFLA_MACVLAN_MACADDR ||
                            nla_len(nla) != ETH_ALEN)
                                return -EINVAL;

                        if (!is_valid_ether_addr(nla_data(nla)))
                                return -EADDRNOTAVAIL;
                }
        }

        if (data[IFLA_MACVLAN_MACADDR_COUNT])
                return -EINVAL;

        return 0;
}

/*
 * Reconfigure the list of remote source MAC addresses of @vlan (only
 * meaningful for macvlan devices in source mode).
 *
 * @mode selects the operation: add or delete the address given in
 * IFLA_MACVLAN_MACADDR, flush all entries, or replace the whole set with
 * IFLA_MACVLAN_MACADDR plus the entries nested in IFLA_MACVLAN_MACADDR_DATA.
 *
 * Alignment note: all netlink data is aligned to 4 bytes, which suffices
 * for both ether_addr_copy and ether_addr_equal_64bits usage.
 *
 * Returns 0 on success, -EINVAL for a missing address or unknown @mode, or
 * the error from macvlan_hash_add_source().
 */
static int macvlan_changelink_sources(struct macvlan_dev *vlan, u32 mode,
                                      struct nlattr *data[])
{
        struct macvlan_source_entry *found;
        struct nlattr *cur, *list;
        char *mac = NULL;
        int err, remaining;

        if (data[IFLA_MACVLAN_MACADDR])
                mac = nla_data(data[IFLA_MACVLAN_MACADDR]);

        switch (mode) {
        case MACVLAN_MACADDR_ADD:
                if (!mac)
                        return -EINVAL;

                return macvlan_hash_add_source(vlan, mac);

        case MACVLAN_MACADDR_DEL:
                if (!mac)
                        return -EINVAL;

                found = macvlan_hash_lookup_source(vlan, mac);
                if (found) {
                        macvlan_hash_del_source(found);
                        vlan->macaddr_count--;
                }
                return 0;

        case MACVLAN_MACADDR_FLUSH:
                macvlan_flush_sources(vlan->port, vlan);
                return 0;

        case MACVLAN_MACADDR_SET:
                /* Start from an empty set, then add what was supplied. */
                macvlan_flush_sources(vlan->port, vlan);

                if (mac) {
                        err = macvlan_hash_add_source(vlan, mac);
                        if (err)
                                return err;
                }

                list = data[IFLA_MACVLAN_MACADDR_DATA];
                if (!list)
                        return 0;

                nla_for_each_attr(cur, nla_data(list), nla_len(list),
                                  remaining) {
                        mac = nla_data(cur);
                        err = macvlan_hash_add_source(vlan, mac);
                        if (err)
                                return err;
                }
                return 0;

        default:
                return -EINVAL;
        }
}

int macvlan_common_newlink(struct net_device *dev,
                           struct rtnl_newlink_params *params,
                           struct netlink_ext_ack *extack)
{
        struct net *link_net = rtnl_newlink_link_net(params);
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct nlattr **data = params->data;
        struct nlattr **tb = params->tb;
        struct net_device *lowerdev;
        struct macvlan_port *port;
        bool create = false;
        int macmode;
        int err;

        if (!tb[IFLA_LINK])
                return -EINVAL;

        lowerdev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK]));
        if (lowerdev == NULL)
                return -ENODEV;

        /* When creating macvlans or macvtaps on top of other macvlans - use
         * the real device as the lowerdev.
         */
        if (netif_is_macvlan(lowerdev))
                lowerdev = macvlan_dev_real_dev(lowerdev);

        if (!tb[IFLA_MTU])
                dev->mtu = lowerdev->mtu;
        else if (dev->mtu > lowerdev->mtu)
                return -EINVAL;

        /* MTU range: 68 - lowerdev->max_mtu */
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = lowerdev->max_mtu;

        if (!tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        if (!netif_is_macvlan_port(lowerdev)) {
                err = macvlan_port_create(lowerdev);
                if (err < 0)
                        return err;
                create = true;
        }
        port = macvlan_port_get_rtnl(lowerdev);

        /* Only 1 macvlan device can be created in passthru mode */
        if (macvlan_passthru(port)) {
                /* The macvlan port must be not created this time,
                 * still goto destroy_macvlan_port for readability.
                 */
                err = -EINVAL;
                goto destroy_macvlan_port;
        }

        vlan->lowerdev = lowerdev;
        vlan->dev      = dev;
        vlan->port     = port;
        vlan->set_features = MACVLAN_FEATURES;

        vlan->mode     = MACVLAN_MODE_VEPA;
        if (data && data[IFLA_MACVLAN_MODE])
                vlan->mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);

        if (data && data[IFLA_MACVLAN_FLAGS])
                vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);

        if (vlan->mode == MACVLAN_MODE_PASSTHRU) {
                if (port->count) {
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macvlan_set_passthru(port);
                eth_hw_addr_inherit(dev, lowerdev);
        }

        if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
                if (vlan->mode != MACVLAN_MODE_SOURCE) {
                        err = -EINVAL;
                        goto destroy_macvlan_port;
                }
                macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
                err = macvlan_changelink_sources(vlan, macmode, data);
                if (err)
                        goto destroy_macvlan_port;
        }

        vlan->bc_queue_len_req = MACVLAN_DEFAULT_BC_QUEUE_LEN;
        if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN])
                vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]);

        if (data && data[IFLA_MACVLAN_BC_CUTOFF])
                update_port_bc_cutoff(
                        vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF]));

        err = register_netdevice(dev);
        if (err < 0)
                goto destroy_macvlan_port;

        dev->priv_flags |= IFF_MACVLAN;
        err = netdev_upper_dev_link(lowerdev, dev, extack);
        if (err)
                goto unregister_netdev;

        list_add_tail_rcu(&vlan->list, &port->vlans);
        update_port_bc_queue_len(vlan->port);
        netif_stacked_transfer_operstate(lowerdev, dev);
        linkwatch_fire_event(dev);

        return 0;

unregister_netdev:
        /* macvlan_uninit would free the macvlan port */
        unregister_netdevice(dev);
        return err;
destroy_macvlan_port:
        /* the macvlan port may be freed by macvlan_uninit when fail to register.
         * so we destroy the macvlan port only when it's valid.
         */
        if (create && macvlan_port_get_rtnl(lowerdev)) {
                macvlan_flush_sources(port, vlan);
                macvlan_port_destroy(port->dev);
        }
        return err;
}
EXPORT_SYMBOL_GPL(macvlan_common_newlink);

/*
 * newlink handler for plain macvlan devices; simply delegates to
 * macvlan_common_newlink() and returns its result.
 */
static int macvlan_newlink(struct net_device *dev,
                           struct rtnl_newlink_params *params,
                           struct netlink_ext_ack *extack)
{
        return macvlan_common_newlink(dev, params, extack);
}

/*
 * Delete a macvlan device: drop its source entries (source mode only), take
 * it off the port's list, recompute the port's broadcast queue length, queue
 * the device for unregistration on @head and unlink it from the lower
 * device.
 */
void macvlan_dellink(struct net_device *dev, struct list_head *head)
{
        struct macvlan_dev *mv = netdev_priv(dev);
        struct macvlan_port *port = mv->port;

        if (mv->mode == MACVLAN_MODE_SOURCE)
                macvlan_flush_sources(port, mv);

        list_del_rcu(&mv->list);
        update_port_bc_queue_len(port);

        unregister_netdevice_queue(dev, head);
        netdev_upper_dev_unlink(mv->lowerdev, dev);
}
EXPORT_SYMBOL_GPL(macvlan_dellink);

static int macvlan_changelink(struct net_device *dev,
                              struct nlattr *tb[], struct nlattr *data[],
                              struct netlink_ext_ack *extack)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        enum macvlan_mode mode;
        bool set_mode = false;
        enum macvlan_macaddr_mode macmode;
        int ret;

        /* Validate mode, but don't set yet: setting flags may fail. */
        if (data && data[IFLA_MACVLAN_MODE]) {
                set_mode = true;
                mode = nla_get_u32(data[IFLA_MACVLAN_MODE]);
                /* Passthrough mode can't be set or cleared dynamically */
                if ((mode == MACVLAN_MODE_PASSTHRU) !=
                    (vlan->mode == MACVLAN_MODE_PASSTHRU))
                        return -EINVAL;
                if (vlan->mode == MACVLAN_MODE_SOURCE &&
                    vlan->mode != mode)
                        macvlan_flush_sources(vlan->port, vlan);
        }

        if (data && data[IFLA_MACVLAN_FLAGS]) {
                __u16 flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]);
                bool promisc = (flags ^ vlan->flags) & MACVLAN_FLAG_NOPROMISC;
                if (macvlan_passthru(vlan->port) && promisc) {
                        int err;

                        if (flags & MACVLAN_FLAG_NOPROMISC)
                                err = dev_set_promiscuity(vlan->lowerdev, -1);
                        else
                                err = dev_set_promiscuity(vlan->lowerdev, 1);
                        if (err < 0)
                                return err;
                }
                vlan->flags = flags;
        }

        if (data && data[IFLA_MACVLAN_BC_QUEUE_LEN]) {
                vlan->bc_queue_len_req = nla_get_u32(data[IFLA_MACVLAN_BC_QUEUE_LEN]);
                update_port_bc_queue_len(vlan->port);
        }

        if (data && data[IFLA_MACVLAN_BC_CUTOFF])
                update_port_bc_cutoff(
                        vlan, nla_get_s32(data[IFLA_MACVLAN_BC_CUTOFF]));

        if (set_mode)
                vlan->mode = mode;
        if (data && data[IFLA_MACVLAN_MACADDR_MODE]) {
                if (vlan->mode != MACVLAN_MODE_SOURCE)
                        return -EINVAL;
                macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]);
                ret = macvlan_changelink_sources(vlan, macmode, data);
                if (ret)
                        return ret;
        }
        return 0;
}

/*
 * Netlink space needed for the source-MAC list of @vlan: nothing when the
 * list is empty, otherwise one nest header plus one ETH_ALEN-sized
 * attribute per counted address.
 */
static size_t macvlan_get_size_mac(const struct macvlan_dev *vlan)
{
        if (!vlan->macaddr_count)
                return 0;

        /* per-address attributes, plus the IFLA_MACVLAN_MACADDR_DATA nest */
        return vlan->macaddr_count * nla_total_size(sizeof(u8) * ETH_ALEN)
                + nla_total_size(0);
}

/*
 * Upper bound on the netlink attribute space that macvlan_fill_info() may
 * consume for @dev. Every attribute fill_info can emit has to be counted
 * here, including the conditionally emitted IFLA_MACVLAN_BC_CUTOFF.
 */
static size_t macvlan_get_size(const struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);

        return (0
                + nla_total_size(4) /* IFLA_MACVLAN_MODE */
                + nla_total_size(2) /* IFLA_MACVLAN_FLAGS */
                + nla_total_size(4) /* IFLA_MACVLAN_MACADDR_COUNT */
                + macvlan_get_size_mac(vlan) /* IFLA_MACVLAN_MACADDR */
                + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN */
                + nla_total_size(4) /* IFLA_MACVLAN_BC_QUEUE_LEN_USED */
                + nla_total_size(4) /* IFLA_MACVLAN_BC_CUTOFF */
                );
}

/*
 * Emit one IFLA_MACVLAN_MACADDR attribute for every entry in source-hash
 * bucket @i of the port that belongs to @vlan.
 *
 * Returns 0 on success, 1 as soon as nla_put() fails.
 */
static int macvlan_fill_info_macaddr(struct sk_buff *skb,
                                     const struct macvlan_dev *vlan,
                                     const int i)
{
        struct hlist_head *bucket = &vlan->port->vlan_source_hash[i];
        struct macvlan_source_entry *src;

        hlist_for_each_entry_rcu(src, bucket, hlist, lockdep_rtnl_is_held()) {
                /* entries owned by other macvlans on the port are skipped */
                if (src->vlan == vlan &&
                    nla_put(skb, IFLA_MACVLAN_MACADDR, ETH_ALEN, src->addr))
                        return 1;
        }
        return 0;
}

/*
 * Dump the macvlan-specific attributes of @dev into @skb.
 *
 * Attributes are written in a fixed order: mode, flags, source address
 * count, the nested source address list (only when the count is non-zero),
 * the requested and in-use broadcast queue lengths, and finally the
 * broadcast cutoff when it differs from 1.
 *
 * Returns 0 on success or -EMSGSIZE as soon as any put fails.
 */
static int macvlan_fill_info(struct sk_buff *skb,
                                const struct net_device *dev)
{
        struct macvlan_dev *vlan = netdev_priv(dev);
        struct macvlan_port *port = vlan->port;
        struct nlattr *addrs;
        int bucket;

        if (nla_put_u32(skb, IFLA_MACVLAN_MODE, vlan->mode) ||
            nla_put_u16(skb, IFLA_MACVLAN_FLAGS, vlan->flags) ||
            nla_put_u32(skb, IFLA_MACVLAN_MACADDR_COUNT, vlan->macaddr_count))
                return -EMSGSIZE;

        if (vlan->macaddr_count > 0) {
                addrs = nla_nest_start_noflag(skb, IFLA_MACVLAN_MACADDR_DATA);
                if (!addrs)
                        return -EMSGSIZE;

                /* walk every source-hash bucket of the port */
                for (bucket = 0; bucket < MACVLAN_HASH_SIZE; bucket++)
                        if (macvlan_fill_info_macaddr(skb, vlan, bucket))
                                return -EMSGSIZE;

                nla_nest_end(skb, addrs);
        }

        if (nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN, vlan->bc_queue_len_req) ||
            nla_put_u32(skb, IFLA_MACVLAN_BC_QUEUE_LEN_USED, port->bc_queue_len_used))
                return -EMSGSIZE;

        /* a cutoff of 1 is not reported */
        if (port->bc_cutoff != 1 &&
            nla_put_s32(skb, IFLA_MACVLAN_BC_CUTOFF, port->bc_cutoff))
                return -EMSGSIZE;

        return 0;
}

/*
 * Netlink validation policy for the IFLA_MACVLAN_* attributes; installed
 * as ops->policy by macvlan_link_register().
 */
static const struct nla_policy macvlan_policy[IFLA_MACVLAN_MAX + 1] = {
        [IFLA_MACVLAN_MODE]  = { .type = NLA_U32 },
        [IFLA_MACVLAN_FLAGS] = { .type = NLA_U16 },
        [IFLA_MACVLAN_MACADDR_MODE] = { .type = NLA_U32 },
        [IFLA_MACVLAN_MACADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
        [IFLA_MACVLAN_MACADDR_DATA] = { .type = NLA_NESTED },
        [IFLA_MACVLAN_MACADDR_COUNT] = { .type = NLA_U32 },
        [IFLA_MACVLAN_BC_QUEUE_LEN] = { .type = NLA_U32 },
        /* NLA_REJECT: only ever written by macvlan_fill_info(), not accepted as input */
        [IFLA_MACVLAN_BC_QUEUE_LEN_USED] = { .type = NLA_REJECT },
        [IFLA_MACVLAN_BC_CUTOFF] = { .type = NLA_S32 },
};

/*
 * Fill in the rtnl_link_ops fields implemented by this file (validation,
 * attribute policy, changelink, size estimation and dump) and register
 * @ops with rtnetlink.
 *
 * Fields not set here are left as the caller initialized them.
 * Returns whatever rtnl_link_register() returns.
 */
int macvlan_link_register(struct rtnl_link_ops *ops)
{
        /* common fields */
        ops->validate                = macvlan_validate;
        ops->maxtype                = IFLA_MACVLAN_MAX;
        ops->policy                = macvlan_policy;
        ops->changelink                = macvlan_changelink;
        ops->get_size                = macvlan_get_size;
        ops->fill_info                = macvlan_fill_info;

        return rtnl_link_register(ops);
}
EXPORT_SYMBOL_GPL(macvlan_link_register);

/*
 * rtnl_link_ops->get_link_net callback: returns the network namespace
 * (dev_net()) of the device that macvlan_dev_real_dev() yields for @dev.
 */
static struct net *macvlan_get_link_net(const struct net_device *dev)
{
        return dev_net(macvlan_dev_real_dev(dev));
}

/*
 * Link ops for the "macvlan" kind. Only the kind-specific callbacks are
 * set here; macvlan_link_register() fills in the remaining fields
 * (validate, maxtype, policy, changelink, get_size, fill_info).
 */
static struct rtnl_link_ops macvlan_link_ops = {
        .kind                = "macvlan",
        .setup                = macvlan_setup,
        .newlink        = macvlan_newlink,
        .dellink        = macvlan_dellink,
        .get_link_net        = macvlan_get_link_net,
        .priv_size      = sizeof(struct macvlan_dev),
};

static void update_port_bc_queue_len(struct macvlan_port *port)
{
        u32 max_bc_queue_len_req = 0;
        struct macvlan_dev *vlan;

        list_for_each_entry(vlan, &port->vlans, list) {
                if (vlan->bc_queue_len_req > max_bc_queue_len_req)
                        max_bc_queue_len_req = vlan->bc_queue_len_req;
        }
        port->bc_queue_len_used = max_bc_queue_len_req;
}

/*
 * Netdevice notifier callback.
 *
 * Events for devices that are not macvlan ports are ignored. For a port,
 * the event is applied or forwarded to the macvlan devices linked on
 * port->vlans, as described per case below.
 *
 * Returns NOTIFY_DONE, or NOTIFY_BAD when the event must be vetoed
 * (address sync failure in passthru mode, or a pending type change).
 */
static int macvlan_device_event(struct notifier_block *unused,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct macvlan_dev *vlan, *next;
        struct macvlan_port *port;
        LIST_HEAD(list_kill);

        if (!netif_is_macvlan_port(dev))
                return NOTIFY_DONE;

        port = macvlan_port_get_rtnl(dev);

        switch (event) {
        case NETDEV_UP:
        case NETDEV_DOWN:
        case NETDEV_CHANGE:
                /* pass the lower device's operstate on to every macvlan */
                list_for_each_entry(vlan, &port->vlans, list)
                        netif_stacked_transfer_operstate(vlan->lowerdev,
                                                         vlan->dev);
                break;
        case NETDEV_FEAT_CHANGE:
                /* re-inherit TSO limits and recompute features per macvlan */
                list_for_each_entry(vlan, &port->vlans, list) {
                        netif_inherit_tso_max(vlan->dev, dev);
                        netdev_update_features(vlan->dev);
                }
                break;
        case NETDEV_CHANGEMTU:
                /*
                 * Only macvlans whose MTU exceeds the port's new MTU are
                 * touched; the result of dev_set_mtu() is not checked.
                 */
                list_for_each_entry(vlan, &port->vlans, list) {
                        if (vlan->dev->mtu <= dev->mtu)
                                continue;
                        dev_set_mtu(vlan->dev, dev->mtu);
                }
                break;
        case NETDEV_CHANGEADDR:
                /* address changes are only followed in passthru mode */
                if (!macvlan_passthru(port))
                        return NOTIFY_DONE;

                /* only the first macvlan on the port is synced */
                vlan = list_first_entry_or_null(&port->vlans,
                                                struct macvlan_dev,
                                                list);

                if (vlan && macvlan_sync_address(vlan->dev, dev->dev_addr))
                        return NOTIFY_BAD;

                break;
        case NETDEV_UNREGISTER:
                /* twiddle thumbs on netns device moves */
                if (dev->reg_state != NETREG_UNREGISTERING)
                        break;

                /*
                 * Queue every macvlan on the port through its own dellink
                 * op, then unregister the whole batch at once. The _safe
                 * iterator is used because entries are handed to dellink
                 * while walking the list.
                 */
                list_for_each_entry_safe(vlan, next, &port->vlans, list)
                        vlan->dev->rtnl_link_ops->dellink(vlan->dev, &list_kill);
                unregister_netdevice_many(&list_kill);
                break;
        case NETDEV_PRE_TYPE_CHANGE:
                /* Forbid underlying device to change its type. */
                return NOTIFY_BAD;

        case NETDEV_NOTIFY_PEERS:
        case NETDEV_BONDING_FAILOVER:
        case NETDEV_RESEND_IGMP:
                /* Propagate to all vlans */
                list_for_each_entry(vlan, &port->vlans, list)
                        call_netdevice_notifiers(event, vlan->dev);
                /* last case: control simply leaves the switch */
        }
        return NOTIFY_DONE;
}

/*
 * Notifier block routing netdevice events to macvlan_device_event();
 * registered in macvlan_init_module() and removed in
 * macvlan_cleanup_module().
 */
static struct notifier_block macvlan_notifier_block __read_mostly = {
        .notifier_call        = macvlan_device_event,
};

/*
 * Module init: install the netdevice notifier, then register the
 * "macvlan" rtnl link kind.
 *
 * Returns 0 on success. If either step fails its negative error code is
 * returned; a notifier that was already installed is removed again before
 * returning.
 */
static int __init macvlan_init_module(void)
{
        int err;

        /* do not go on without the notifier: port events would be missed */
        err = register_netdevice_notifier(&macvlan_notifier_block);
        if (err < 0)
                return err;

        err = macvlan_link_register(&macvlan_link_ops);
        if (err < 0)
                goto err1;
        return 0;
err1:
        unregister_netdevice_notifier(&macvlan_notifier_block);
        return err;
}

/*
 * Module exit: undo macvlan_init_module() in reverse order, first
 * unregistering the link ops and then the netdevice notifier.
 */
static void __exit macvlan_cleanup_module(void)
{
        rtnl_link_unregister(&macvlan_link_ops);
        unregister_netdevice_notifier(&macvlan_notifier_block);
}

/* Module entry and exit points. */
module_init(macvlan_init_module);
module_exit(macvlan_cleanup_module);

/* Module metadata; the alias names the "macvlan" rtnl link kind. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_DESCRIPTION("Driver for MAC address based VLANs");
MODULE_ALIAS_RTNL_LINK("macvlan");




























































  249 














  313 
    3 

  274 
   38 




  311 
















































































































































































































   33 












   33 




























































































































































































































































































   23 




   23 












   23 
   23 





   23 
   23 
   23 












   23 

   23 

   23 


   23 



   23 





   23 






























































































































































































































































































































































































































  291 


  291 

















   24 



   24 

































































































































































































































































































































































































































































































































































































































































































































































































































































































































  102 















  102 






  102 




  102 




























































































  496 









    3 


    3 
    3 
    3 

    1 


    3 



























    3 
    3 








    3 









    3 




    3 
    3 










    3 




















    3 






    1 




    3 




    3 
    3 



    3 
    3 





    2 














    1 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
// SPDX-License-Identifier: GPL-2.0-only
/*
 *        fs/libfs.c
 *        Library for filesystem writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
#include <linux/pidfs.h>

#include <linux/uaccess.h>

#include "internal.h"

/*
 * Generic ->getattr for simple filesystems.
 *
 * Fills @stat through generic_fillattr() and then overrides stat->blocks
 * with the number of pages in the inode's mapping, expressed in 512-byte
 * units. @idmap and @query_flags are not used; generic_fillattr() is
 * always called with nop_mnt_idmap. Always returns 0.
 */
int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
                   struct kstat *stat, u32 request_mask,
                   unsigned int query_flags)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = d_inode(dentry);
        unsigned long nr_pages;

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);

        /* pages -> 512-byte blocks */
        nr_pages = inode->i_mapping->nrpages;
        stat->blocks = nr_pages << (PAGE_SHIFT - 9);
        return 0;
}
EXPORT_SYMBOL(simple_getattr);

int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        u64 id = huge_encode_dev(dentry->d_sb->s_dev);

        buf->f_fsid = u64_to_fsid(id);
        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 *
 * ->d_delete callback that unconditionally returns 1; @dentry is not
 * examined.
 */
int always_delete_dentry(const struct dentry *dentry)
{
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

/*
 * Default dentry operations: only ->d_delete is set, to
 * always_delete_dentry(). simple_lookup() installs these on dentries of
 * superblocks that have no s_d_op of their own.
 */
const struct dentry_operations simple_dentry_operations = {
        .d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);

/*
 * Generic ->lookup for simple filesystems. Reaching this function means
 * the name was not found in the dcache, so the result is always negative.
 *
 * Names longer than NAME_MAX are rejected with -ENAMETOOLONG. When the
 * superblock has no s_d_op, simple_dentry_operations is installed on the
 * dentry. The dentry is then added as a negative entry via
 * d_add(dentry, NULL), except for casefolded directories on
 * CONFIG_UNICODE builds, where that step is skipped. @flags is unused.
 *
 * Returns NULL on success or an ERR_PTR() on failure.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);

        if (!dentry->d_sb->s_d_op)
                d_set_d_op(dentry, &simple_dentry_operations);

        /* casefolded directories get no negative dentry added */
        if (!(IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)))
                d_add(dentry, NULL);

        return NULL;
}
EXPORT_SYMBOL(simple_lookup);

/*
 * ->open for dcache-backed directories: allocate a cursor dentry for the
 * opened directory and keep it in file->private_data. @inode is unused.
 *
 * Returns 0 on success, -ENOMEM if d_alloc_cursor() returned NULL.
 */
int dcache_dir_open(struct inode *inode, struct file *file)
{
        struct dentry *cursor = d_alloc_cursor(file->f_path.dentry);

        file->private_data = cursor;
        if (!cursor)
                return -ENOMEM;

        return 0;
}
EXPORT_SYMBOL(dcache_dir_open);

/*
 * ->release for dcache-backed directories: drop the reference on the
 * cursor dentry stored in file->private_data by dcache_dir_open().
 * @inode is unused. Always returns 0.
 */
int dcache_dir_close(struct inode *inode, struct file *file)
{
        dput(file->private_data);
        return 0;
}
EXPORT_SYMBOL(dcache_dir_close);

/*
 * scan_positives - find the <count>th positive dentry in a sibling list
 * @cursor: the caller's cursor dentry; its d_parent is the directory whose
 *          d_lock is held while walking
 * @p:      address of the hlist link at which the scan starts
 * @count:  how many positive dentries to advance by
 * @last:   dentry whose reference is dropped (dput()) before returning;
 *          callers in this file also pass NULL
 *
 * The parent is locked at least shared by the caller.
 *
 * Walks the siblings' list starting at *@p, looking for the <count>th
 * positive entry. If found, a reference is taken on it and it is returned
 * to the caller. If no such element exists, NULL is returned.
 */
static struct dentry *scan_positives(struct dentry *cursor,
                                        struct hlist_node **p,
                                        loff_t count,
                                        struct dentry *last)
{
        struct dentry *dentry = cursor->d_parent, *found = NULL;

        spin_lock(&dentry->d_lock);
        while (*p) {
                struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
                p = &d->d_sib.next;
                // we must at least skip cursors, to avoid livelocks
                if (d->d_flags & DCACHE_DENTRY_CURSOR)
                        continue;
                if (simple_positive(d) && !--count) {
                        /*
                         * Re-check positivity under d's own lock before
                         * taking the reference.
                         */
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                found = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(found))
                                break;
                        /* it is no longer positive: take the next positive one */
                        count = 1;
                }
                if (need_resched()) {
                        /*
                         * Park the cursor right behind the current entry so
                         * the walk can resume from it, then drop the
                         * parent's lock around the reschedule.
                         */
                        if (!hlist_unhashed(&cursor->d_sib))
                                __hlist_del(&cursor->d_sib);
                        hlist_add_behind(&cursor->d_sib, &d->d_sib);
                        p = &cursor->d_sib.next;
                        spin_unlock(&dentry->d_lock);
                        cond_resched();
                        spin_lock(&dentry->d_lock);
                }
        }
        spin_unlock(&dentry->d_lock);
        dput(last);
        return found;
}

/*
 * ->llseek for dcache-backed directories.
 *
 * Only whence values 0 and 1 (SEEK_SET and SEEK_CUR) are accepted; any
 * other whence, or a negative resulting offset, yields -EINVAL. When the
 * position actually changes, the file's cursor dentry
 * (file->private_data) is unlinked and, for offsets above 2, re-linked
 * behind the positive child found by scan_positives(), all under the
 * directory's shared inode lock.
 *
 * Returns the new offset, or -EINVAL.
 */
loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
        struct dentry *dentry = file->f_path.dentry;
        switch (whence) {
                case 1:
                        /* relative seek: make the offset absolute first */
                        offset += file->f_pos;
                        fallthrough;
                case 0:
                        if (offset >= 0)
                                break;
                        /* negative absolute offset: reject like a bad whence */
                        fallthrough;
                default:
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
                struct dentry *cursor = file->private_data;
                struct dentry *to = NULL;

                inode_lock_shared(dentry->d_inode);

                /*
                 * For offsets of 2 or less the cursor is simply left
                 * unlinked (dcache_readdir() starts from the first child
                 * when pos == 2). Otherwise locate the (offset - 2)th
                 * positive child.
                 */
                if (offset > 2)
                        to = scan_positives(cursor, &dentry->d_children.first,
                                            offset - 2, NULL);
                spin_lock(&dentry->d_lock);
                hlist_del_init(&cursor->d_sib);
                if (to)
                        hlist_add_behind(&cursor->d_sib, &to->d_sib);
                spin_unlock(&dentry->d_lock);
                /* drop the reference scan_positives() took (NULL is fine) */
                dput(to);

                file->f_pos = offset;

                inode_unlock_shared(dentry->d_inode);
        }
        return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);

/*
 * ->iterate_shared for dcache-backed directories.
 *
 * The directory is locked, and every positive dentry in it is safe to
 * use: in ramfs-type trees they can only go away through unlink() or
 * rmdir(), neither of which can run while the directory lock is held.
 *
 * Emits "." and "..", then each positive child in list order. A fresh
 * walk (ctx->pos == 2) starts at the first child; otherwise it resumes
 * after the file's cursor. On exit the cursor is unlinked and, if
 * dir_emit() refused an entry, re-linked just before that entry so the
 * next call retries it. Always returns 0.
 */
int dcache_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dir = file->f_path.dentry;
        struct dentry *cursor = file->private_data;
        struct dentry *child = NULL;
        struct hlist_node **pos;

        if (!dir_emit_dots(file, ctx))
                return 0;

        pos = (ctx->pos == 2) ? &dir->d_children.first
                              : &cursor->d_sib.next;

        for (;;) {
                /* also drops the reference on the previously returned child */
                child = scan_positives(cursor, pos, 1, child);
                if (child == NULL)
                        break;

                if (!dir_emit(ctx, child->d_name.name, child->d_name.len,
                              d_inode(child)->i_ino,
                              fs_umode_to_dtype(d_inode(child)->i_mode)))
                        break;

                ctx->pos++;
                pos = &child->d_sib.next;
        }

        spin_lock(&dir->d_lock);
        hlist_del_init(&cursor->d_sib);
        if (child)
                hlist_add_before(&cursor->d_sib, &child->d_sib);
        spin_unlock(&dir->d_lock);
        dput(child);

        return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/*
 * ->read for directories: always fails with -EISDIR. None of the
 * arguments are examined.
 */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
        return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

/*
 * File operations for dcache-backed directories, wiring up the
 * cursor-based open/release/llseek/readdir helpers defined above.
 * read() is served by generic_read_dir() and fsync by noop_fsync.
 */
const struct file_operations simple_dir_operations = {
        .open                = dcache_dir_open,
        .release        = dcache_dir_close,
        .llseek                = dcache_dir_lseek,
        .read                = generic_read_dir,
        .iterate_shared        = dcache_readdir,
        .fsync                = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

/* Minimal directory inode operations: only ->lookup, via simple_lookup(). */
const struct inode_operations simple_dir_inode_operations = {
        .lookup                = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/*
 * simple_offset_add() never assigns these to a dentry: both values lie
 * outside the DIR_OFFSET_MIN..DIR_OFFSET_MAX allocation range.
 */
enum {
        DIR_OFFSET_FIRST        = 2,                /* Find first real entry */
        DIR_OFFSET_EOD                = S32_MAX,        /* NOTE(review): presumably the end-of-directory marker; confirm against users */
};

/*
 * simple_offset_add() allocation range: the inclusive bounds passed to
 * mtree_alloc_cyclic(), lying strictly between DIR_OFFSET_FIRST and
 * DIR_OFFSET_EOD.
 */
enum {
        DIR_OFFSET_MIN                = DIR_OFFSET_FIRST + 1,
        DIR_OFFSET_MAX                = DIR_OFFSET_EOD - 1,
};

/*
 * Record @offset as the directory offset of @dentry. The value is kept in
 * dentry->d_fsdata, cast to a pointer; 0 is used by callers to mean that
 * no offset is assigned.
 */
static void offset_set(struct dentry *dentry, long offset)
{
        dentry->d_fsdata = (void *)offset;
}

/*
 * Return the directory offset stored in @dentry->d_fsdata by offset_set().
 * Callers treat 0 as "no offset assigned".
 */
static long dentry2offset(struct dentry *dentry)
{
        return (long)dentry->d_fsdata;
}

/* lockdep class assigned to each offset_ctx's mt.ma_lock by simple_offset_init() */
static struct lock_class_key simple_offset_lock_class;

/**
 * simple_offset_init - initialize an offset_ctx
 * @octx: directory offset map to be initialized
 *
 * Sets the next offset to hand out to DIR_OFFSET_MIN, initializes the
 * maple tree with MT_FLAGS_ALLOC_RANGE, and places the tree's lock in the
 * simple_offset lockdep class.
 */
void simple_offset_init(struct offset_ctx *octx)
{
        octx->next_offset = DIR_OFFSET_MIN;

        mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
        /* must follow mt_init_flags(): the lock lives inside the tree */
        lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
}

/**
 * simple_offset_add - Add an entry to a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: new dentry being added
 *
 * Allocates the next free offset in the DIR_OFFSET_MIN..DIR_OFFSET_MAX
 * range, stores @dentry at that offset in @octx and records the offset in
 * the dentry.
 *
 * Returns zero on success. Returns -EBUSY if @dentry already carries a
 * non-zero offset, -ENOSPC if the allocator reported -EBUSY, or another
 * negative errno passed through from the allocator.
 */
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
        unsigned long slot;
        int err;

        /* a non-zero offset means the dentry was already added */
        if (dentry2offset(dentry))
                return -EBUSY;

        err = mtree_alloc_cyclic(&octx->mt, &slot, dentry, DIR_OFFSET_MIN,
                                 DIR_OFFSET_MAX, &octx->next_offset,
                                 GFP_KERNEL);
        if (unlikely(err < 0)) {
                /* the allocator's -EBUSY is reported to callers as -ENOSPC */
                if (err == -EBUSY)
                        err = -ENOSPC;
                return err;
        }

        offset_set(dentry, slot);
        return 0;
}

/*
 * Store @dentry at the caller-chosen @offset in @octx and, on success,
 * record that offset in the dentry.
 *
 * Returns 0 on success or the error from mtree_store(), in which case the
 * dentry's offset is left unchanged.
 */
static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry,
                                 long offset)
{
        int err = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL);

        if (!err)
                offset_set(dentry, offset);
        return err;
}

/**
 * simple_offset_remove - Remove an entry from a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: dentry being removed
 *
 * Does nothing when @dentry has no offset assigned (offset 0). Otherwise
 * the entry is erased from @octx and the dentry's offset is reset to 0.
 */
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
        long offset = dentry2offset(dentry);

        if (offset != 0) {
                mtree_erase(&octx->mt, offset);
                offset_set(dentry, 0);
        }
}

/**
 * simple_offset_rename - handle directory offsets for rename
 * @old_dir: parent directory of source entry
 * @old_dentry: dentry of source entry
 * @new_dir: parent directory of destination entry
 * @new_dentry: dentry of destination
 *
 * Caller provides appropriate serialization.
 *
 * User space expects the directory offset value of the replaced
 * (new) directory entry to be unchanged after a rename. So when
 * @new_dentry already has an offset, @old_dentry takes it over in
 * @new_dir's map; otherwise a fresh offset is allocated for @old_dentry.
 * In both cases @old_dentry is first removed from @old_dir's map.
 *
 * Returns zero on success, a negative errno value on failure.
 */
int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry,
                         struct inode *new_dir, struct dentry *new_dentry)
{
        struct offset_ctx *src_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *dst_ctx = new_dir->i_op->get_offset_ctx(new_dir);
        long target_offset = dentry2offset(new_dentry);

        simple_offset_remove(src_ctx, old_dentry);

        /* nothing is being replaced: allocate a new offset */
        if (!target_offset)
                return simple_offset_add(dst_ctx, old_dentry);

        /* hand the replaced entry's offset over to the moved dentry */
        offset_set(new_dentry, 0);
        return simple_offset_replace(dst_ctx, old_dentry, target_offset);
}

/**
 * simple_offset_rename_exchange - exchange rename with directory offsets
 * @old_dir: parent of dentry being moved
 * @old_dentry: dentry being moved
 * @new_dir: destination parent
 * @new_dentry: destination dentry
 *
 * This API preserves the directory offset values: after a successful
 * exchange, @old_dentry is stored at the offset @new_dentry had in
 * @new_dir's map, and @new_dentry at the offset @old_dentry had in
 * @old_dir's map. Caller provides appropriate serialization.
 *
 * Returns zero on success. Otherwise a negative errno is returned and the
 * rename is rolled back. The rollback is best effort: errors while
 * re-storing the original entries are ignored.
 */
int simple_offset_rename_exchange(struct inode *old_dir,
                                  struct dentry *old_dentry,
                                  struct inode *new_dir,
                                  struct dentry *new_dentry)
{
        struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
        struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
        /* Sample both offsets before the removals below reset them to 0. */
        long old_index = dentry2offset(old_dentry);
        long new_index = dentry2offset(new_dentry);
        int ret;

        simple_offset_remove(old_ctx, old_dentry);
        simple_offset_remove(new_ctx, new_dentry);

        /* Nothing has been stored yet, so a failure only needs the restore. */
        ret = simple_offset_replace(new_ctx, old_dentry, new_index);
        if (ret)
                goto out_restore;

        ret = simple_offset_replace(old_ctx, new_dentry, old_index);
        if (ret) {
                /* Undo the first store before restoring the originals. */
                simple_offset_remove(new_ctx, old_dentry);
                goto out_restore;
        }

        ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
        if (ret) {
                /* Undo both stores before restoring the originals. */
                simple_offset_remove(new_ctx, old_dentry);
                simple_offset_remove(old_ctx, new_dentry);
                goto out_restore;
        }
        return 0;

out_restore:
        /* Put each dentry back at its original offset in its original map. */
        (void)simple_offset_replace(old_ctx, old_dentry, old_index);
        (void)simple_offset_replace(new_ctx, new_dentry, new_index);
        return ret;
}

/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (eg. umount), a directory's offset map might still
 * contain entries. mtree_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
        mtree_destroy(&octx->mt);
}

/**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
 * @offset: a byte offset
 * @whence: enumerator describing the starting position for this update
 *
 * Only SEEK_SET and SEEK_CUR are accepted. SEEK_END, SEEK_DATA, and
 * SEEK_HOLE are not supported for directories, and a resulting negative
 * position is rejected.
 *
 * Returns the updated read position if successful; otherwise a
 * negative errno is returned and the read position remains unchanged.
 */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
        if (whence == SEEK_CUR)
                offset += file->f_pos;        /* make the offset absolute */
        else if (whence != SEEK_SET)
                return -EINVAL;

        if (offset < 0)
                return -EINVAL;

        return vfs_setpos(file, offset, LONG_MAX);
}

/*
 * Walk @parent's child list and return the first simple_positive() child,
 * with a reference taken via dget_dlock(), or NULL if there is none.
 *
 * The starting point depends on the arguments:
 *  - @next true:   start at the sibling following @dentry;
 *  - @dentry NULL: start at @parent's first child;
 *  - otherwise:    start at @dentry itself.
 *
 * @parent->d_lock is held for the whole walk.
 */
static struct dentry *find_positive_dentry(struct dentry *parent,
                                           struct dentry *dentry,
                                           bool next)
{
        struct dentry *found = NULL;

        spin_lock(&parent->d_lock);
        if (next)
                dentry = d_next_sibling(dentry);
        else if (!dentry)
                dentry = d_first_child(parent);
        hlist_for_each_entry_from(dentry, d_sib) {
                /* Cheap unlocked check first; skip obvious non-candidates. */
                if (!simple_positive(dentry))
                        continue;
                spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
                /* Recheck under the child's lock before taking a reference. */
                if (simple_positive(dentry))
                        found = dget_dlock(dentry);
                spin_unlock(&dentry->d_lock);
                if (likely(found))
                        break;
        }
        spin_unlock(&parent->d_lock);
        return found;
}

/*
 * Find the dentry at which a directory read starting at @offset should
 * begin. DIR_OFFSET_FIRST starts at @parent's first positive child. For
 * any other offset, the offset map is searched backwards from @offset
 * (down to DIR_OFFSET_MIN) and the child list is scanned from the entry
 * found there. Returns a referenced dentry, or NULL if nothing is found.
 */
static noinline_for_stack struct dentry *
offset_dir_lookup(struct dentry *parent, loff_t offset)
{
        struct inode *dir_inode = d_inode(parent);
        struct offset_ctx *octx = dir_inode->i_op->get_offset_ctx(dir_inode);
        struct dentry *start, *result;

        MA_STATE(mas, &octx->mt, offset, offset);

        if (offset == DIR_OFFSET_FIRST)
                return find_positive_dentry(parent, NULL, false);

        rcu_read_lock();
        start = mas_find_rev(&mas, DIR_OFFSET_MIN);
        result = find_positive_dentry(parent, start, false);
        rcu_read_unlock();

        return result;
}

/*
 * Emit one directory entry for @dentry through @ctx, using the dentry's
 * name, its inode number, and a d_type derived from the inode's mode.
 * Returns whatever dir_emit() returns.
 */
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);
        unsigned char dtype = fs_umode_to_dtype(inode->i_mode);

        return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
                        inode->i_ino, dtype);
}

/*
 * Emit directory entries starting at @ctx->pos. Iteration ends either
 * when an entry cannot be emitted (@ctx->pos is left at that entry's
 * offset so it is retried next time) or when the child list is exhausted
 * (@ctx->pos is set to DIR_OFFSET_EOD).
 */
static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dir = file->f_path.dentry;
        struct dentry *cur = offset_dir_lookup(dir, ctx->pos);

        while (cur) {
                struct dentry *following;

                ctx->pos = dentry2offset(cur);
                if (!offset_dir_emit(ctx, cur)) {
                        /* Entry was not emitted: drop our reference and stop. */
                        dput(cur);
                        return;
                }

                /* Look up the successor before releasing the current entry. */
                following = find_positive_dentry(dir, cur, true);
                dput(cur);
                cur = following;
        }

        ctx->pos = DIR_OFFSET_EOD;
}

/**
 * offset_readdir - Emit entries starting at offset @ctx->pos
 * @file: an open directory to iterate over
 * @ctx: directory iteration context
 *
 * Caller must hold @file's i_rwsem to prevent insertion or removal of
 * entries during this call.
 *
 * On entry, @ctx->pos contains an offset that represents the first entry
 * to be read from the directory.
 *
 * The "." and ".." entries are emitted first. After that, entries are
 * emitted until there are no more entries to read, or until the
 * ctx->actor indicates there is no more space in the caller's output
 * buffer.
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
 * Caller places this value in the d_off field of the last entry in the
 * user's buffer.
 *
 * Return values:
 *   %0 - Complete
 */
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dir = file->f_path.dentry;

        lockdep_assert_held(&d_inode(dir)->i_rwsem);

        /* Skip the real entries if the dots did not fit or we are at EOD. */
        if (dir_emit_dots(file, ctx) && ctx->pos != DIR_OFFSET_EOD)
                offset_iterate_dir(file, ctx);

        return 0;
}

/*
 * File operations for directories whose entries are tracked in an offset
 * map: seeking and iteration use the offset-based helpers defined in this
 * file (offset_dir_llseek(), offset_readdir()).
 */
const struct file_operations simple_offset_dir_operations = {
        .llseek                = offset_dir_llseek,
        .iterate_shared        = offset_readdir,
        .read                = generic_read_dir,
        .fsync                = noop_fsync,
};

/*
 * Return the next simple_positive() child of @parent after @prev (or the
 * first one when @prev is NULL), with a reference taken, or NULL if there
 * is none. The caller's reference on @prev is dropped before returning.
 */
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
        struct dentry *found = NULL, *pos;

        spin_lock(&parent->d_lock);
        if (prev)
                pos = d_next_sibling(prev);
        else
                pos = d_first_child(parent);

        hlist_for_each_entry_from(pos, d_sib) {
                /* Unlocked pre-check; confirmed under the child's lock below. */
                if (!simple_positive(pos))
                        continue;

                spin_lock_nested(&pos->d_lock, DENTRY_D_LOCK_NESTED);
                if (simple_positive(pos))
                        found = dget_dlock(pos);
                spin_unlock(&pos->d_lock);

                if (likely(found))
                        break;
        }
        spin_unlock(&parent->d_lock);

        dput(prev);
        return found;
}

/*
 * Remove @dentry and everything below it, depth first and without
 * recursion. The walk descends into the first remaining positive child
 * until it reaches a dentry with none, kills that one, and ascends to its
 * parent, repeating until @dentry itself has been killed.
 *
 * @callback, if not NULL, is called on each victim that is still
 * simple_positive(), with the parent's inode lock held, just before the
 * victim is unpinned with dput().
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
        /* Reference on the current position; dropped via find_next_child(). */
        struct dentry *this = dget(dentry);
        while (true) {
                struct dentry *victim = NULL, *child;
                struct inode *inode = this->d_inode;

                inode_lock(inode);
                if (d_is_dir(this))
                        inode->i_flags |= S_DEAD;
                /*
                 * find_next_child() drops the reference on @victim (the
                 * previously killed child) and returns the next one to
                 * descend into; NULL means @this has nothing left below it.
                 */
                while ((child = find_next_child(this, victim)) == NULL) {
                        // kill and ascend
                        // update metadata while it's still locked
                        inode_set_ctime_current(inode);
                        clear_nlink(inode);
                        inode_unlock(inode);
                        victim = this;
                        this = this->d_parent;
                        inode = this->d_inode;
                        inode_lock(inode);
                        if (simple_positive(victim)) {
                                d_invalidate(victim);        // avoid lost mounts
                                if (d_is_dir(victim))
                                        fsnotify_rmdir(inode, victim);
                                else
                                        fsnotify_unlink(inode, victim);
                                if (callback)
                                        callback(victim);
                                dput(victim);                // unpin it
                        }
                        if (victim == dentry) {
                                /* Top of the subtree: fix up its parent and stop. */
                                inode_set_mtime_to_ts(inode,
                                                      inode_set_ctime_current(inode));
                                if (d_is_dir(dentry))
                                        drop_nlink(inode);
                                inode_unlock(inode);
                                dput(dentry);
                                return;
                        }
                }
                /* Descend into the child that was found. */
                inode_unlock(inode);
                this = child;
        }
}
EXPORT_SYMBOL(simple_recursive_removal);

/*
 * Default super_operations used by pseudo_fs_fill_super() (when the
 * context supplies none) and by simple_fill_super(): only statfs is set.
 */
static const struct super_operations simple_super_operations = {
        .statfs                = simple_statfs,
};

/*
 * Fill in the superblock of a pseudo filesystem from the
 * pseudo_fs_context stored in @fc->fs_private and create its root
 * directory. Returns 0 on success, or -ENOMEM if the root inode or the
 * root dentry cannot be allocated.
 */
static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = fc->fs_private;
        struct inode *root_inode;

        s->s_magic = ctx->magic;
        s->s_maxbytes = MAX_LFS_FILESIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_blocksize = PAGE_SIZE;
        s->s_time_gran = 1;
        s->s_xattr = ctx->xattr;
        s->s_export_op = ctx->eops;
        /* Fall back to the default super operations if none were supplied. */
        if (ctx->ops)
                s->s_op = ctx->ops;
        else
                s->s_op = &simple_super_operations;

        root_inode = new_inode(s);
        if (!root_inode)
                return -ENOMEM;

        /*
         * since this is the first inode, make it number 1. New inodes created
         * after this must take care not to collide with it (by passing
         * max_reserved of 1 to iunique).
         */
        root_inode->i_ino = 1;
        root_inode->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
        simple_inode_init_ts(root_inode);

        s->s_root = d_make_root(root_inode);
        if (!s->s_root)
                return -ENOMEM;

        /* Installed only after the root dentry has been created. */
        s->s_d_op = ctx->dops;
        return 0;
}

/*
 * ->get_tree for pseudo filesystems: delegates to get_tree_nodev() with
 * pseudo_fs_fill_super() as the fill callback and returns its result.
 */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, pseudo_fs_fill_super);
}

/*
 * ->free for pseudo filesystems: releases the pseudo_fs_context that
 * init_pseudo() allocated and stored in @fc->fs_private.
 */
static void pseudo_fs_free(struct fs_context *fc)
{
        kfree(fc->fs_private);
}

/* fs_context operations installed by init_pseudo(). */
static const struct fs_context_operations pseudo_fs_context_ops = {
        .free                = pseudo_fs_free,
        .get_tree        = pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
                                        unsigned long magic)
{
        struct pseudo_fs_context *ctx;

        ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
        if (likely(ctx)) {
                ctx->magic = magic;
                fc->fs_private = ctx;
                fc->ops = &pseudo_fs_context_ops;
                fc->sb_flags |= SB_NOUSER;
                fc->global = true;
        }
        return ctx;
}
EXPORT_SYMBOL(init_pseudo);

/*
 * Generic ->open helper: if the inode carries private data, make it
 * available as @file->private_data. Otherwise @file->private_data is left
 * as it was. Always returns 0.
 */
int simple_open(struct inode *inode, struct file *file)
{
        if (inode->i_private)
                file->private_data = inode->i_private;
        return 0;
}
EXPORT_SYMBOL(simple_open);

int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(old_dentry);

        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        inc_nlink(inode);
        ihold(inode);
        dget(dentry);
        d_instantiate(dentry, inode);
        return 0;
}
EXPORT_SYMBOL(simple_link);

/*
 * Report whether @dentry has no simple_positive() children.
 * Returns 1 if it has none, 0 as soon as one is found. Each child is
 * examined under its own d_lock, nested inside @dentry->d_lock.
 */
int simple_empty(struct dentry *dentry)
{
        struct dentry *child;
        int empty = 1;

        spin_lock(&dentry->d_lock);
        hlist_for_each_entry(child, &dentry->d_children, d_sib) {
                int positive;

                spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                positive = simple_positive(child);
                spin_unlock(&child->d_lock);

                if (positive) {
                        empty = 0;
                        break;
                }
        }
        spin_unlock(&dentry->d_lock);

        return empty;
}
EXPORT_SYMBOL(simple_empty);

int simple_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        inode_set_mtime_to_ts(dir,
                              inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
        drop_nlink(inode);
        dput(dentry);
        return 0;
}
EXPORT_SYMBOL(simple_unlink);

/*
 * Generic ->rmdir helper. Fails with -ENOTEMPTY if @dentry still has
 * positive children. Otherwise the directory is unlinked: its inode loses
 * two links in total (one here, one in simple_unlink()) and the parent
 * @dir loses one. Returns 0 on success.
 */
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (simple_empty(dentry)) {
                drop_nlink(d_inode(dentry));
                simple_unlink(dir, dentry);
                drop_nlink(dir);
                return 0;
        }

        return -ENOTEMPTY;
}
EXPORT_SYMBOL(simple_rmdir);

/**
 * simple_rename_timestamp - update the various inode timestamps for rename
 * @old_dir: old parent directory
 * @old_dentry: dentry that is being renamed
 * @new_dir: new parent directory
 * @new_dentry: target for rename
 *
 * POSIX mandates that the old and new parent directories have their ctime and
 * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
 * their ctime updated.
 */
void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry,
                             struct inode *new_dir, struct dentry *new_dentry)
{
        struct inode *newino = d_inode(new_dentry);

        inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir));
        if (new_dir != old_dir)
                inode_set_mtime_to_ts(new_dir,
                                      inode_set_ctime_current(new_dir));
        inode_set_ctime_current(d_inode(old_dentry));
        if (newino)
                inode_set_ctime_current(newino);
}
EXPORT_SYMBOL_GPL(simple_rename_timestamp);

/*
 * Bookkeeping for an exchange rename: when a directory and a
 * non-directory swap places across two different parents, one link moves
 * from the parent that loses the directory to the parent that gains it.
 * Timestamps are then updated. Always returns 0.
 */
int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
                           struct inode *new_dir, struct dentry *new_dentry)
{
        bool old_is_dir = d_is_dir(old_dentry);
        bool new_is_dir = d_is_dir(new_dentry);

        if (old_dir != new_dir && old_is_dir != new_is_dir) {
                struct inode *loses_subdir = old_is_dir ? old_dir : new_dir;
                struct inode *gains_subdir = old_is_dir ? new_dir : old_dir;

                drop_nlink(loses_subdir);
                inc_nlink(gains_subdir);
        }

        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_rename_exchange);

/*
 * Generic ->rename helper.
 *
 * Only RENAME_NOREPLACE and RENAME_EXCHANGE are accepted in @flags; any
 * other bit yields -EINVAL. RENAME_EXCHANGE is delegated to
 * simple_rename_exchange(). Otherwise -ENOTEMPTY is returned if
 * @new_dentry has positive children; if @new_dentry is positive it is
 * unlinked, and link counts and timestamps are adjusted.
 *
 * Returns 0 on success or a negative errno.
 */
int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir,
                  struct dentry *old_dentry, struct inode *new_dir,
                  struct dentry *new_dentry, unsigned int flags)
{
        int they_are_dirs = d_is_dir(old_dentry);

        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
                return -EINVAL;

        if (flags & RENAME_EXCHANGE)
                return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        if (d_really_is_positive(new_dentry)) {
                /* An existing target is being replaced: remove it first. */
                simple_unlink(new_dir, new_dentry);
                if (they_are_dirs) {
                        /*
                         * Replacing a directory: the victim inode loses a
                         * second link and @old_dir loses one.
                         */
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                }
        } else if (they_are_dirs) {
                /* A directory moves into a free slot: transfer one link. */
                drop_nlink(old_dir);
                inc_nlink(new_dir);
        }

        simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
        return 0;
}
EXPORT_SYMBOL(simple_rename);

/**
 * simple_setattr - setattr for simple filesystem
 * @idmap: idmap of the target mount
 * @dentry: dentry
 * @iattr: iattr structure
 *
 * Returns 0 on success, -error on failure.
 *
 * simple_setattr is a simple ->setattr implementation without a proper
 * implementation of size changes: a size change is applied with
 * truncate_setsize() only.
 *
 * It can either be used for in-memory filesystems or special files
 * on simple regular filesystems.  Anything that needs to change on-disk
 * or wire state on size changes needs its own setattr method.
 */
int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
                   struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        int err = setattr_prepare(idmap, dentry, iattr);

        if (err)
                return err;

        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, iattr->ia_size);

        setattr_copy(idmap, inode, iattr);
        mark_inode_dirty(inode);

        return 0;
}
EXPORT_SYMBOL(simple_setattr);

static int simple_read_folio(struct file *file, struct folio *folio)
{
        folio_zero_range(folio, 0, folio_size(folio));
        flush_dcache_folio(folio);
        folio_mark_uptodate(folio);
        folio_unlock(folio);
        return 0;
}

/*
 * ->write_begin helper: look up or create the folio covering @pos and
 * hand it back through @foliop. If the folio is not uptodate and the
 * write does not span all of it, the parts of the folio before and after
 * the range to be written are zeroed.
 *
 * Returns 0 on success or the error encoded in the folio lookup result.
 */
int simple_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len,
                        struct folio **foliop, void **fsdata)
{
        struct folio *folio;
        size_t start;

        folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN,
                        mapping_gfp_mask(mapping));
        if (IS_ERR(folio))
                return PTR_ERR(folio);

        *foliop = folio;

        /* Nothing to zero if the folio is valid or fully overwritten. */
        if (folio_test_uptodate(folio) || len == folio_size(folio))
                return 0;

        start = offset_in_folio(folio, pos);
        folio_zero_segments(folio, 0, start,
                        start + len, folio_size(folio));
        return 0;
}
EXPORT_SYMBOL(simple_write_begin);

/**
 * simple_write_end - .write_end helper for non-block-device FSes
 * @file: See .write_end of address_space_operations
 * @mapping:                 "
 * @pos:                 "
 * @len:                 "
 * @copied:                 "
 * @folio:                 "
 * @fsdata:                 "
 *
 * simple_write_end does the minimum needed for updating a folio after
 * writing is done. It has the same API signature as the .write_end of
 * address_space_operations vector. So it can just be set onto .write_end for
 * FSes that don't need any other processing. i_mutex is assumed to be held.
 * Block based filesystems should use generic_write_end().
 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
 * is not called, so a filesystem that actually does store data in .write_inode
 * should extend on what's done here with a call to mark_inode_dirty() in the
 * case that i_size has changed.
 *
 * Use *ONLY* with simple_read_folio()
 *
 * Returns @copied.
 */
static int simple_write_end(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned copied,
                        struct folio *folio, void *fsdata)
{
        struct inode *inode = folio->mapping->host;
        loff_t end_pos = pos + copied;

        if (!folio_test_uptodate(folio)) {
                /* zero the stale part of the folio if we did a short copy */
                if (copied < len)
                        folio_zero_range(folio,
                                         offset_in_folio(folio, pos) + copied,
                                         len - copied);
                folio_mark_uptodate(folio);
        }

        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
         */
        if (inode->i_size < end_pos)
                i_size_write(inode, end_pos);

        folio_mark_dirty(folio);
        folio_unlock(folio);
        folio_put(folio);

        return copied;
}

/*
 * Provides ramfs-style behavior: data in the pagecache, but no writeback.
 * Reads zero-fill the folio (simple_read_folio()); writes go through
 * simple_write_begin()/simple_write_end().
 */
const struct address_space_operations ram_aops = {
        .read_folio        = simple_read_folio,
        .write_begin        = simple_write_begin,
        .write_end        = simple_write_end,
        .dirty_folio        = noop_dirty_folio,
};
EXPORT_SYMBOL(ram_aops);

/*
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
 * to pass it an appropriate max_reserved value to avoid collisions.
 */
int simple_fill_super(struct super_block *s, unsigned long magic,
                      const struct tree_descr *files)
{
        struct inode *inode;
        struct dentry *dentry;
        int i;

        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = magic;
        s->s_op = &simple_super_operations;
        s->s_time_gran = 1;

        inode = new_inode(s);
        if (!inode)
                return -ENOMEM;
        /*
         * because the root inode is 1, the files array must not contain an
         * entry at index 1
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
        simple_inode_init_ts(inode);
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        set_nlink(inode, 2);
        s->s_root = d_make_root(inode);
        if (!s->s_root)
                return -ENOMEM;
        for (i = 0; !files->name || files->name[0]; i++, files++) {
                if (!files->name)
                        continue;

                /* warn if it tries to conflict with the root inode */
                if (unlikely(i == 1))
                        printk(KERN_WARNING "%s: %s passed in a files array"
                                "with an index of 1!\n", __func__,
                                s->s_type->name);

                dentry = d_alloc_name(s->s_root, files->name);
                if (!dentry)
                        return -ENOMEM;
                inode = new_inode(s);
                if (!inode) {
                        dput(dentry);
                        return -ENOMEM;
                }
                inode->i_mode = S_IFREG | files->mode;
                simple_inode_init_ts(inode);
                inode->i_fop = files->ops;
                inode->i_ino = i;
                d_add(dentry, inode);
        }
        return 0;
}
EXPORT_SYMBOL(simple_fill_super);

/* Protects the *mount / *count pairs handled by simple_pin_fs() and simple_release_fs(). */
static DEFINE_SPINLOCK(pin_fs_lock);

/*
 * Take a counted pin on a kernel-internal mount of @type.
 *
 * If *@mount is not set yet, a mount is created with vfs_kern_mount() and
 * installed in *@mount. In all successful cases a reference is taken on
 * *@mount and *@count is incremented.
 *
 * Returns 0 on success or the error from vfs_kern_mount().
 */
int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt = NULL;
        spin_lock(&pin_fs_lock);
        if (unlikely(!*mount)) {
                /* The lock is dropped around the vfs_kern_mount() call. */
                spin_unlock(&pin_fs_lock);
                mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                spin_lock(&pin_fs_lock);
                /* Recheck: *mount may have been set while the lock was dropped. */
                if (!*mount)
                        *mount = mnt;
        }
        mntget(*mount);
        ++*count;
        spin_unlock(&pin_fs_lock);
        /*
         * Drop the local reference from vfs_kern_mount(); the mntget()
         * above keeps *mount pinned. mnt is NULL when no mount was created
         * here (presumably mntput(NULL) is a no-op - confirm).
         */
        mntput(mnt);
        return 0;
}
EXPORT_SYMBOL(simple_pin_fs);

/*
 * Drop one pin taken with simple_pin_fs(): decrement *@count, clear
 * *@mount when the count reaches zero, and release one reference on the
 * mount. The mntput() is done after pin_fs_lock has been dropped.
 */
void simple_release_fs(struct vfsmount **mount, int *count)
{
        struct vfsmount *pinned;

        spin_lock(&pin_fs_lock);
        pinned = *mount;
        *count -= 1;
        if (*count == 0)
                *mount = NULL;
        spin_unlock(&pin_fs_lock);

        mntput(pinned);
}
EXPORT_SYMBOL(simple_release_fs);

/**
 * simple_read_from_buffer - copy data from the buffer to user space
 * @to: the user space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The simple_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the user space address starting at @to.
 * A partial copy is reported as a short read; -EFAULT is returned only
 * when nothing at all could be copied.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t uncopied, done;

        if (pos < 0)
                return -EINVAL;
        if (!count || pos >= available)
                return 0;

        /* Clamp the request to what is left in the buffer. */
        if (count > available - pos)
                count = available - pos;

        uncopied = copy_to_user(to, from + pos, count);
        done = count - uncopied;
        if (!done)
                return -EFAULT;

        *ppos = pos + done;
        return done;
}
EXPORT_SYMBOL(simple_read_from_buffer);

/**
 * simple_write_to_buffer - copy data from user space to the buffer
 * @to: the buffer to write to
 * @available: the size of the buffer
 * @ppos: the current position in the buffer
 * @from: the user space buffer to read from
 * @count: the maximum number of bytes to read
 *
 * The simple_write_to_buffer() function reads up to @count bytes from the user
 * space address starting at @from into the buffer @to at offset @ppos.
 * A partial copy is reported as a short write; -EFAULT is returned only
 * when nothing at all could be copied.
 *
 * On success, the number of bytes written is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 **/
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                const void __user *from, size_t count)
{
        loff_t pos = *ppos;
        size_t uncopied, done;

        if (pos < 0)
                return -EINVAL;
        if (!count || pos >= available)
                return 0;

        /* Clamp the request to the room left in the buffer. */
        if (count > available - pos)
                count = available - pos;

        uncopied = copy_from_user(to + pos, from, count);
        done = count - uncopied;
        if (!done)
                return -EFAULT;

        *ppos = pos + done;
        return done;
}
EXPORT_SYMBOL(simple_write_to_buffer);

/**
 * memory_read_from_buffer - copy data from the buffer
 * @to: the kernel space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The memory_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the kernel space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or negative value is returned on error.
 **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t remaining;

        if (pos < 0)
                return -EINVAL;
        if (pos >= available)
                return 0;

        /* pos < available here, so the difference fits in a size_t. */
        remaining = available - pos;
        if (count > remaining)
                count = remaining;

        memcpy(to, from + pos, count);
        *ppos = pos + count;

        return count;
}
EXPORT_SYMBOL(memory_read_from_buffer);

/*
 * Transaction based IO.
 * The file expects a single write which triggers the transaction, and then
 * possibly a read which collects the result - which is stored in a
 * file-local buffer.
 */

/*
 * Publish the size of the response stored in the transaction buffer
 * attached to @file->private_data. @n must not exceed
 * SIMPLE_TRANSACTION_LIMIT; a larger value triggers BUG_ON().
 */
void simple_transaction_set(struct file *file, size_t n)
{
        struct simple_transaction_argresp *ar = file->private_data;

        BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

        /*
         * The barrier ensures that ar->size will really remain zero until
         * ar->data is ready for reading.
         */
        smp_mb();
        ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

/*
 * Start a transaction on @file: allocate a zeroed page as the
 * argument/response buffer, attach it to @file->private_data and copy
 * @size bytes of the request from user space into it.
 *
 * Returns a pointer to the buffer's data area, or an ERR_PTR():
 *   -EFBIG  if @size exceeds SIMPLE_TRANSACTION_LIMIT - 1,
 *   -ENOMEM if the page cannot be allocated,
 *   -EBUSY  if @file already has a buffer attached,
 *   -EFAULT if the copy from user space fails (the buffer stays attached
 *           to @file in that case).
 */
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
        struct simple_transaction_argresp *ar;
        static DEFINE_SPINLOCK(simple_transaction_lock);
        bool busy;

        if (size > SIMPLE_TRANSACTION_LIMIT - 1)
                return ERR_PTR(-EFBIG);

        ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
        if (!ar)
                return ERR_PTR(-ENOMEM);

        /* only one write allowed per open */
        spin_lock(&simple_transaction_lock);
        busy = file->private_data != NULL;
        if (!busy)
                file->private_data = ar;
        spin_unlock(&simple_transaction_lock);

        if (busy) {
                free_page((unsigned long)ar);
                return ERR_PTR(-EBUSY);
        }

        if (copy_from_user(ar->data, buf, size))
                return ERR_PTR(-EFAULT);

        return ar->data;
}
EXPORT_SYMBOL(simple_transaction_get);

/*
 * Read the response of a transaction. If no transaction buffer is
 * attached to @file, 0 is returned; otherwise up to @size bytes of the
 * ar->size bytes of response data are copied to @buf, honouring @pos.
 */
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
        struct simple_transaction_argresp *ar = file->private_data;

        if (ar)
                return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);

        return 0;
}
EXPORT_SYMBOL(simple_transaction_read);

/*
 * ->release for transaction files: frees the page attached to
 * @file->private_data by simple_transaction_get(). Always returns 0.
 */
int simple_transaction_release(struct inode *inode, struct file *file)
{
        free_page((unsigned long)file->private_data);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

/* Simple attribute files */

/*
 * Per-open state of a simple attribute file; allocated in
 * simple_attr_open() and freed in simple_attr_release().
 */
struct simple_attr {
        int (*get)(void *, u64 *);        /* reads the value; NULL if not readable */
        int (*set)(void *, u64);        /* stores a value; NULL if not writable */
        char get_buf[24];        /* enough to store a u64 and "\n\0" */
        char set_buf[24];        /* holds the text copied in by a write */
        void *data;                /* inode->i_private, passed to get/set */
        const char *fmt;        /* format for read operation */
        struct mutex mutex;        /* protects access to these buffers */
};

/*
 * simple_attr_open is called by an actual attribute open file operation
 * to set the attribute specific access operations.
 *
 * A simple_attr is allocated, filled in with @get, @set, @fmt and the
 * inode's private data, and attached to @file->private_data.
 * Returns -ENOMEM if the allocation fails, otherwise the result of
 * nonseekable_open().
 */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt)
{
        struct simple_attr *attr = kzalloc(sizeof(struct simple_attr), GFP_KERNEL);

        if (!attr)
                return -ENOMEM;

        attr->data = inode->i_private;
        attr->fmt = fmt;
        attr->get = get;
        attr->set = set;
        mutex_init(&attr->mutex);

        file->private_data = attr;

        return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

/*
 * Release hook for simple attribute files: free the per-open state that
 * simple_attr_open() stored in file->private_data.  Always returns 0.
 */
int simple_attr_release(struct inode *inode, struct file *file)
{
        struct simple_attr *sa = file->private_data;

        kfree(sa);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);        /* GPL-only?  This?  Really? */

/*
 * Read handler for simple attribute files.
 *
 * The value is fetched through the get callback and formatted into
 * get_buf on the first read; a read at a non-zero offset with a non-empty
 * get_buf is served from the text already in the buffer.
 *
 * Returns -EACCES when no get callback was registered, the error from
 * mutex_lock_interruptible() or from the get callback, and otherwise the
 * result of simple_read_from_buffer().
 */
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos)
{
        struct simple_attr *sa = file->private_data;
        size_t nbytes;
        ssize_t rc;

        if (!sa->get)
                return -EACCES;

        rc = mutex_lock_interruptible(&sa->mutex);
        if (rc)
                return rc;

        if (*ppos == 0 || sa->get_buf[0] == '\0') {
                /* first read: (re)generate the text */
                u64 value;

                rc = sa->get(sa->data, &value);
                if (rc)
                        goto unlock;

                nbytes = scnprintf(sa->get_buf, sizeof(sa->get_buf),
                                   sa->fmt, (unsigned long long)value);
        } else {
                /* continued read: reuse what is already formatted */
                nbytes = strlen(sa->get_buf);
        }

        rc = simple_read_from_buffer(buf, len, ppos, sa->get_buf, nbytes);
unlock:
        mutex_unlock(&sa->mutex);
        return rc;
}
EXPORT_SYMBOL_GPL(simple_attr_read);

/*
 * Interpret the user buffer as a number and call the set function with it.
 *
 * At most sizeof(set_buf) - 1 bytes of @buf are copied into set_buf and
 * NUL-terminated, then parsed with kstrtoll() when @is_signed is true and
 * with kstrtoull() otherwise (base 0 in both cases).  The parsed value is
 * passed to the attribute's set callback.
 *
 * Returns -EACCES when no set callback was registered, the error from
 * mutex_lock_interruptible(), -EFAULT when the copy from user space fails,
 * the parser's or the set callback's error, and @len on success.
 */
static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos, bool is_signed)
{
        struct simple_attr *attr;
        unsigned long long val;
        size_t size;
        ssize_t ret;

        attr = file->private_data;
        if (!attr->set)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        ret = -EFAULT;
        size = min(sizeof(attr->set_buf) - 1, len);
        if (copy_from_user(attr->set_buf, buf, size))
                goto out;

        attr->set_buf[size] = '\0';
        if (is_signed) {
                /*
                 * kstrtoll() stores through a 'long long *'; parse into a
                 * correctly typed local instead of aliasing the unsigned
                 * variable, then convert.  The conversion keeps the bit
                 * pattern, so the set callback sees the same value.
                 */
                long long sval;

                ret = kstrtoll(attr->set_buf, 0, &sval);
                if (ret == 0)
                        val = sval;
        } else {
                ret = kstrtoull(attr->set_buf, 0, &val);
        }
        if (ret)
                goto out;
        ret = attr->set(attr->data, val);
        if (ret == 0)
                ret = len; /* on success, claim we got the whole input */
out:
        mutex_unlock(&attr->mutex);
        return ret;
}

/*
 * Write handler for simple attribute files whose value is parsed as an
 * unsigned number.
 */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        const bool parse_signed = false;

        return simple_attr_write_xsigned(file, buf, len, ppos, parse_signed);
}
EXPORT_SYMBOL_GPL(simple_attr_write);

/*
 * Write handler for simple attribute files whose value is parsed as a
 * signed number.
 */
ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        const bool parse_signed = true;

        return simple_attr_write_xsigned(file, buf, len, ppos, parse_signed);
}
EXPORT_SYMBOL_GPL(simple_attr_write_signed);

/**
 * generic_encode_ino32_fh - generic export_operations->encode_fh function
 * @inode:   the object to encode
 * @fh:      where to store the file handle fragment
 * @max_len: in: room available at @fh, out: length used or required
 *           (in 4 byte units)
 * @parent:  parent directory inode, if wanted
 *
 * This generic encode_fh function assumes that the 32-bit inode number
 * is suitable for locating an inode, and that the generation number
 * can be used to check that it is still valid.  It places them in the
 * filehandle fragment where export_decode_fh expects to find them.
 *
 * Return: FILEID_INO32_GEN, or FILEID_INO32_GEN_PARENT when @parent was
 * encoded as well.  If @max_len is too small, FILEID_INVALID is returned
 * and *@max_len is set to the length that would have been needed.
 */
int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len,
                            struct inode *parent)
{
        struct fid *fid = (void *)fh;
        /* Two words for ino + gen, two more when the parent is encoded. */
        const int words = parent ? 4 : 2;

        if (*max_len < words) {
                *max_len = words;
                return FILEID_INVALID;
        }

        fid->i32.ino = inode->i_ino;
        fid->i32.gen = inode->i_generation;
        if (parent) {
                fid->i32.parent_ino = parent->i_ino;
                fid->i32.parent_gen = parent->i_generation;
        }

        *max_len = words;
        return parent ? FILEID_INO32_GEN_PARENT : FILEID_INO32_GEN;
}
EXPORT_SYMBOL_GPL(generic_encode_ino32_fh);

/**
 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
 * @sb:                filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:        length of the file handle
 * @fh_type:        type of file handle
 * @get_inode:        filesystem callback to retrieve inode
 *
 * For the FILEID_INO32_GEN and FILEID_INO32_GEN_PARENT handle types the
 * inode number and generation stored in @fid are passed to @get_inode, and
 * the resulting inode is handed to d_obtain_alias().  Any other type reaches
 * d_obtain_alias() with a NULL inode.  Handles with @fh_len below 2 yield
 * NULL.
 *
 * NOTE(review): the length is compared against 2, which looks like the
 * 4-byte units used by generic_encode_ino32_fh() rather than bytes; confirm
 * against the callers.
 */
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *found = NULL;

        if (fh_len < 2)
                return NULL;

        if (fh_type == FILEID_INO32_GEN ||
            fh_type == FILEID_INO32_GEN_PARENT)
                found = get_inode(sb, fid->i32.ino, fid->i32.gen);

        return d_obtain_alias(found);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);

/**
 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
 * @sb:                filesystem to do the file handle conversion on
 * @fid:        file handle to convert
 * @fh_len:        length of the file handle
 * @fh_type:        type of file handle
 * @get_inode:        filesystem callback to retrieve inode
 *
 * For FILEID_INO32_GEN_PARENT handles the parent inode number stored in
 * @fid is passed to @get_inode, together with the parent generation when
 * @fh_len is greater than 3 and a generation of 0 otherwise.  The resulting
 * inode is handed to d_obtain_alias(); any other handle type reaches
 * d_obtain_alias() with a NULL inode.  Handles with @fh_len of 2 or less
 * yield NULL.
 *
 * NOTE(review): the length thresholds (2 and 3) look like the 4-byte units
 * used by generic_encode_ino32_fh() rather than bytes; confirm against the
 * callers.
 */
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *found = NULL;

        if (fh_len <= 2)
                return NULL;

        if (fh_type == FILEID_INO32_GEN_PARENT) {
                /* The generation word is only present in longer handles. */
                u32 parent_gen = 0;

                if (fh_len > 3)
                        parent_gen = fid->i32.parent_gen;
                found = get_inode(sb, fid->i32.parent_ino, parent_gen);
        }

        return d_obtain_alias(found);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);

/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 *
 * Return: the error from writing out the range if that fails; otherwise the
 * first non-zero result among syncing the mapping buffers, syncing the inode
 * metadata (when it is dirty in a way that matters for @datasync) and the
 * final writeback error check, or 0.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
                                 int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int ret;
        int err;

        ret = file_write_and_wait_range(file, start, end);
        if (ret)
                return ret;

        inode_lock(inode);
        ret = sync_mapping_buffers(inode->i_mapping);

        /*
         * The inode itself only needs writing when it is dirty at all and,
         * for datasync, when the dirtiness matters for data integrity.
         */
        if ((inode->i_state & I_DIRTY_ALL) &&
            (!datasync || (inode->i_state & I_DIRTY_DATASYNC))) {
                err = sync_inode_metadata(inode, 1);
                if (!ret)
                        ret = err;
        }
        inode_unlock(inode);

        /* check and advance again to catch errors after syncing out buffers */
        err = file_check_and_advance_wb_err(file);
        return ret ? ret : err;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *                        with flush
 * @file:        file to synchronize
 * @start:        start offset in bytes
 * @end:        end offset in bytes (inclusive)
 * @datasync:        only synchronize essential metadata if true
 *
 * Runs __generic_file_fsync() and, if that succeeds, issues a flush to the
 * block device of the file's superblock.
 *
 * Return: the error from __generic_file_fsync(), otherwise the result of
 * blkdev_issue_flush().
 */

int generic_file_fsync(struct file *file, loff_t start, loff_t end,
                       int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err = __generic_file_fsync(file, start, end, datasync);

        return err ? err : blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);

/**
 * generic_check_addressable - Check addressability of file system
 * @blocksize_bits:        log of file system block size
 * @num_blocks:                number of blocks in file system
 *
 * Determine whether a file system with @num_blocks blocks (and a
 * block size of 2**@blocksize_bits) is addressable by the sector_t
 * and page cache of the system.
 *
 * Return: 0 if the file system is addressable or @num_blocks is 0,
 * -EINVAL if @blocksize_bits is below 9 or above PAGE_SHIFT, and
 * -EFBIG if the last block or page is out of range.
 */
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
        u64 last_fs_block = num_blocks - 1;
        u64 last_fs_page;

        if (unlikely(num_blocks == 0))
                return 0;

        if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
                return -EINVAL;

        /*
         * Only shift once blocksize_bits is known to be <= PAGE_SHIFT:
         * with a larger value the unsigned subtraction wraps and the
         * shift count exceeds the width of u64, which is undefined.
         */
        last_fs_page = last_fs_block >> (PAGE_SHIFT - blocksize_bits);

        if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
            (last_fs_page > (pgoff_t)(~0ULL))) {
                return -EFBIG;
        }
        return 0;
}
EXPORT_SYMBOL(generic_check_addressable);

/*
 * ->fsync implementation for in-memory filesystems: there is nothing to
 * write back, so every argument is ignored and success (0) is returned.
 */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        const int nothing_to_sync = 0;

        return nothing_to_sync;
}
EXPORT_SYMBOL(noop_fsync);

/*
 * Placeholder ->direct_IO callback that always fails with -EINVAL.
 *
 * iomap based filesystems perform direct I/O without going through this
 * callback, but it still has to be present in inode->a_ops so that
 * open/fcntl can tell that direct I/O is generally supported.
 */
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        const ssize_t never_called_for_io = -EINVAL;

        return never_called_for_io;
}
EXPORT_SYMBOL_GPL(noop_direct_IO);

/*
 * Thin wrapper around kfree() with a plain void(void *) signature, for
 * places where kfree itself is not assignment-compatible.
 */
void kfree_link(void *p)
{
        void *link = p;

        kfree(link);
}
EXPORT_SYMBOL(kfree_link);

struct inode *alloc_anon_inode(struct super_block *s)
{
        static const struct address_space_operations anon_aops = {
                .dirty_folio        = noop_dirty_folio,
        };
        struct inode *inode = new_inode_pseudo(s);

        if (!inode)
                return ERR_PTR(-ENOMEM);

        inode->i_ino = get_next_ino();
        inode->i_mapping->a_ops = &anon_aops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because mark_inode_dirty() will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_flags |= S_PRIVATE;
        simple_inode_init_ts(inode);
        return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);

/**
 * simple_nosetlease - generic helper for prohibiting leases
 * @filp: file pointer
 * @arg: type of lease to obtain
 * @flp: new lease supplied for insertion
 * @priv: private data for lm_setup operation
 *
 * For filesystems that do not want leases to be set on their files.
 * None of the arguments is looked at.
 *
 * Return: always -EINVAL.
 */
int
simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
                  void **priv)
{
        const int leases_refused = -EINVAL;

        return leases_refused;
}
EXPORT_SYMBOL(simple_nosetlease);

/**
 * simple_get_link - generic helper to get the target of "fast" symlinks
 * @dentry: not used here
 * @inode: the symlink inode
 * @done: not used here
 *
 * For symlink inodes that keep a pointer to the target in ->i_link.
 * NOTE: path lookup normally uses a non-NULL ->i_link directly as an
 * optimization and does not call ->get_link() at all.  The method still
 * has to be set, because it is what marks the inode_operations as
 * belonging to a symlink.
 *
 * Return: the symlink target
 */
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
                            struct delayed_call *done)
{
        const char *target = inode->i_link;

        return target;
}
EXPORT_SYMBOL(simple_get_link);

/* Inode operations for symlinks served by simple_get_link(). */
const struct inode_operations simple_symlink_inode_operations = {
        .get_link       = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);

/*
 * Operations for a permanently empty directory.
 */

/* Lookup in an empty directory: no name ever exists, so fail with -ENOENT. */
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        const long no_such_entry = -ENOENT;

        return ERR_PTR(no_such_entry);
}

/* Attribute changes on an empty directory are always refused with -EPERM. */
static int empty_dir_setattr(struct mnt_idmap *idmap,
                             struct dentry *dentry, struct iattr *attr)
{
        const int refused = -EPERM;

        return refused;
}

/* Listing xattrs of an empty directory always fails with -EOPNOTSUPP. */
static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
        const ssize_t unsupported = -EOPNOTSUPP;

        return unsupported;
}

/* Inode operations installed by make_empty_dir_inode(). */
static const struct inode_operations empty_dir_inode_operations = {
        .listxattr = empty_dir_listxattr,
        .setattr   = empty_dir_setattr,
        .lookup    = empty_dir_lookup,
};

/*
 * Seek within an empty directory.  The only entries are "." at offset 0
 * and ".." at offset 1, hence both limits handed to
 * generic_file_llseek_size() are 2.
 */
static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
        const loff_t limit = 2;

        return generic_file_llseek_size(file, offset, whence, limit, limit);
}

/*
 * Directory iteration for an empty directory: only the dot entries are
 * emitted.  The result of dir_emit_dots() is not propagated; 0 is always
 * returned.
 */
static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
        (void)dir_emit_dots(file, ctx);

        return 0;
}

/* File operations installed by make_empty_dir_inode(). */
static const struct file_operations empty_dir_operations = {
        .fsync          = noop_fsync,
        .iterate_shared = empty_dir_readdir,
        .read           = generic_read_dir,
        .llseek         = empty_dir_llseek,
};


/*
 * Turn @inode into a permanently empty directory: a world readable and
 * searchable directory owned by global root, with link count 2, no size or
 * blocks, the IOP_XATTR flag cleared and the empty_dir operation tables
 * installed.
 */
void make_empty_dir_inode(struct inode *inode)
{
        /* identity and permissions */
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        inode->i_gid = GLOBAL_ROOT_GID;
        inode->i_uid = GLOBAL_ROOT_UID;
        set_nlink(inode, 2);

        /* no device, no contents */
        inode->i_rdev = 0;
        inode->i_blocks = 0;
        inode->i_size = 0;
        inode->i_blkbits = PAGE_SHIFT;

        /* behaviour */
        inode->i_fop = &empty_dir_operations;
        inode->i_op = &empty_dir_inode_operations;
        inode->i_opflags &= ~IOP_XATTR;
}

bool is_empty_dir_inode(struct inode *inode)
{
        return (inode->i_fop == &empty_dir_operations) &&
                (inode->i_op == &empty_dir_inode_operations);
}

#if IS_ENABLED(CONFIG_UNICODE)
/**
 * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
 * @dentry:        dentry whose name we are checking against
 * @len:        len of name of dentry
 * @str:        str pointer to name of dentry
 * @name:        Name to compare against
 *
 * An exact byte-for-byte match is tried first.  If that fails and the
 * parent directory is casefolded, the names are compared with
 * utf8_strncasecmp() using the superblock's encoding; a name stored in-line
 * in the dentry is copied to a local buffer before that comparison.
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
                         const char *str, const struct qstr *name)
{
        const struct dentry *parent;
        const struct inode *dir;
        union shortname_store strbuf;
        struct qstr qstr;

        /*
         * Attempt a case-sensitive match first. It is cheaper and
         * should cover most lookups, including all the sane
         * applications that expect a case-sensitive filesystem.
         *
         * This comparison is safe under RCU because the caller
         * guarantees the consistency between str and len. See
         * __d_lookup_rcu_op_compare() for details.
         */
        if (len == name->len && !memcmp(str, name->name, len))
                return 0;

        /* Without a casefolded parent directory only exact matches count. */
        parent = READ_ONCE(dentry->d_parent);
        dir = READ_ONCE(parent->d_inode);
        if (!dir || !IS_CASEFOLDED(dir))
                return 1;

        qstr.len = len;
        qstr.name = str;
        /*
         * If the dentry name is stored in-line, then it may be concurrently
         * modified by a rename.  If this happens, the VFS will eventually retry
         * the lookup, so it doesn't matter what ->d_compare() returns.
         * However, it's unsafe to call utf8_strncasecmp() with an unstable
         * string.  Therefore, we have to copy the name into a temporary buffer.
         * As above, len is guaranteed to match str, so the shortname case
         * is exactly when str points to ->d_shortname.
         */
        if (qstr.name == dentry->d_shortname.string) {
                strbuf = dentry->d_shortname; // NUL is guaranteed to be in there
                qstr.name = strbuf.string;
                /* prevent compiler from optimizing out the temporary buffer */
                barrier();
        }

        return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
EXPORT_SYMBOL(generic_ci_d_compare);

/**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
 * @dentry:        dentry of the parent directory
 * @str:        qstr of name whose hash we should fill in
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
        const struct inode *dir = READ_ONCE(dentry->d_inode);
        struct super_block *sb = dentry->d_sb;
        const struct unicode_map *um = sb->s_encoding;
        int ret;

        if (!dir || !IS_CASEFOLDED(dir))
                return 0;

        ret = utf8_casefold_hash(um, dentry, str);
        if (ret < 0 && sb_has_strict_encoding(sb))
                return -EINVAL;
        return 0;
}
EXPORT_SYMBOL(generic_ci_d_hash);

/*
 * Dentry operations for superblocks with an encoding set; selected by
 * generic_set_sb_d_ops().  The fscrypt revalidate hook is included when
 * CONFIG_FS_ENCRYPTION is enabled.
 */
static const struct dentry_operations generic_ci_dentry_ops = {
#ifdef CONFIG_FS_ENCRYPTION
        .d_revalidate = fscrypt_d_revalidate,
#endif
        .d_compare = generic_ci_d_compare,
        .d_hash = generic_ci_d_hash,
};

/**
 * generic_ci_match() - Match a name (case-insensitively) with a dirent.
 * This is a filesystem helper for comparison with directory entries.
 * generic_ci_d_compare should be used in VFS' ->d_compare instead.
 *
 * @parent: Inode of the parent of the dirent under comparison
 * @name: name under lookup.
 * @folded_name: Optional pre-folded name under lookup.  May be NULL or
 *               have a NULL ->name, in which case @name is casefolded
 *               during the comparison.
 * @de_name: Dirent name.
 * @de_name_len: dirent name length.
 *
 * Test whether a case-insensitive directory entry matches the filename
 * being searched.  If @folded_name is provided, it is used instead of
 * recalculating the casefold of @name.
 *
 * When @parent is encrypted, the dirent name is first converted with
 * fscrypt_fname_disk_to_usr() into a temporary buffer that is freed
 * before returning.
 *
 * Return: > 0 if the directory entry matches, 0 if it doesn't match, or
 * < 0 on error.
 */
int generic_ci_match(const struct inode *parent,
                     const struct qstr *name,
                     const struct qstr *folded_name,
                     const u8 *de_name, u32 de_name_len)
{
        const struct super_block *sb = parent->i_sb;
        const struct unicode_map *um = sb->s_encoding;
        struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len);
        struct qstr dirent = QSTR_INIT(de_name, de_name_len);
        int res = 0;

        if (IS_ENCRYPTED(parent)) {
                const struct fscrypt_str encrypted_name =
                        FSTR_INIT((u8 *) de_name, de_name_len);

                if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent)))
                        return -EINVAL;

                decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL);
                if (!decrypted_name.name)
                        return -ENOMEM;
                res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name,
                                                &decrypted_name);
                if (res < 0) {
                        kfree(decrypted_name.name);
                        return res;
                }
                dirent.name = decrypted_name.name;
                dirent.len = decrypted_name.len;
        }

        /*
         * Attempt a case-sensitive match first. It is cheaper and
         * should cover most lookups, including all the sane
         * applications that expect a case-sensitive filesystem.
         */

        if (dirent.len == name->len &&
            !memcmp(name->name, dirent.name, dirent.len))
                goto out;

        /* @folded_name is optional: tolerate a NULL pointer as well. */
        if (folded_name && folded_name->name)
                res = utf8_strncasecmp_folded(um, folded_name, &dirent);
        else
                res = utf8_strncasecmp(um, name, &dirent);

out:
        /* NULL when @parent is not encrypted; kfree() accepts that. */
        kfree(decrypted_name.name);
        if (res < 0 && sb_has_strict_encoding(sb)) {
                pr_err_ratelimited("Directory contains filename that is invalid UTF-8\n");
                return 0;
        }
        /* res == 0 means the names compared equal. */
        return !res;
}
EXPORT_SYMBOL(generic_ci_match);
#endif

#ifdef CONFIG_FS_ENCRYPTION
/*
 * Dentry operations for superblocks that have ->s_cop set but no encoding;
 * selected by generic_set_sb_d_ops().
 */
static const struct dentry_operations generic_encrypted_dentry_ops = {
        .d_revalidate   = fscrypt_d_revalidate,
};
#endif

/**
 * generic_set_sb_d_ops - helper for choosing the set of
 * filesystem-wide dentry operations for the enabled features
 * @sb: superblock to be configured
 *
 * Filesystems supporting casefolding and/or fscrypt can call this
 * helper at mount-time to configure sb->s_d_op to the best set of dentry
 * operations required for the enabled features. The helper must be
 * called after these have been configured, but before the root dentry
 * is created.
 *
 * A superblock with an encoding gets the casefolding operations; failing
 * that, one with ->s_cop set gets the encryption-only operations.  In all
 * other cases sb->s_d_op is left untouched.
 */
void generic_set_sb_d_ops(struct super_block *sb)
{
#if IS_ENABLED(CONFIG_UNICODE)
        /* Casefolding takes precedence over the encryption-only table. */
        if (sb->s_encoding != NULL) {
                sb->s_d_op = &generic_ci_dentry_ops;
                return;
        }
#endif
#ifdef CONFIG_FS_ENCRYPTION
        if (sb->s_cop != NULL)
                sb->s_d_op = &generic_encrypted_dentry_ops;
#endif
}
EXPORT_SYMBOL(generic_set_sb_d_ops);

/**
 * inode_maybe_inc_iversion - increments i_version
 * @inode: inode with the i_version that should be updated
 * @force: increment the counter even if it's not necessary?
 *
 * Every time the inode is modified, the i_version field must be seen to have
 * changed by any observer.
 *
 * If "force" is set or the QUERIED flag is set, then ensure that we increment
 * the value, and clear the queried flag.
 *
 * In the common case where neither is set, then we can return "false" without
 * updating i_version.
 *
 * If this function returns false, and no other metadata has changed, then we
 * can avoid logging the metadata.
 *
 * Return: true if i_version was incremented, false if it was left alone.
 */
bool inode_maybe_inc_iversion(struct inode *inode, bool force)
{
        u64 cur, new;

        /*
         * The i_version field is not strictly ordered with any other inode
         * information, but the legacy inode_inc_iversion code used a spinlock
         * to serialize increments.
         *
         * We add a full memory barrier to ensure that any de facto ordering
         * with other state is preserved (either implicitly coming from cmpxchg
         * or explicitly from smp_mb if we don't know upfront if we will execute
         * the former).
         *
         * These barriers pair with inode_query_iversion().
         */
        cur = inode_peek_iversion_raw(inode);
        if (!force && !(cur & I_VERSION_QUERIED)) {
                /* Likely no cmpxchg ahead: fence explicitly, then re-read. */
                smp_mb();
                cur = inode_peek_iversion_raw(inode);
        }

        do {
                /* If flag is clear then we needn't do anything */
                if (!force && !(cur & I_VERSION_QUERIED))
                        return false;

                /* Since lowest bit is flag, add 2 to avoid it */
                new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
        } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
        return true;
}
EXPORT_SYMBOL(inode_maybe_inc_iversion);

/**
 * inode_query_iversion - read i_version for later use
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter. This should be used by callers that wish
 * to store the returned i_version for later comparison. This will guarantee
 * that a later query of the i_version will result in a different value if
 * anything has changed.
 *
 * In this implementation, we fetch the current value, set the QUERIED flag and
 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
 * that fails, we try again with the newly fetched value from the cmpxchg.
 *
 * Return: the counter value, i.e. the raw i_version shifted right by
 * I_VERSION_QUERIED_SHIFT.
 */
u64 inode_query_iversion(struct inode *inode)
{
        u64 cur, new;
        /* set once a cmpxchg is about to be attempted */
        bool fenced = false;

        /*
         * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with
         * inode_maybe_inc_iversion(), see that routine for more details.
         */
        cur = inode_peek_iversion_raw(inode);
        do {
                /* If flag is already set, then no need to swap */
                if (cur & I_VERSION_QUERIED) {
                        /* No cmpxchg was attempted, so fence explicitly. */
                        if (!fenced)
                                smp_mb();
                        break;
                }

                fenced = true;
                new = cur | I_VERSION_QUERIED;
        } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
        return cur >> I_VERSION_QUERIED_SHIFT;
}
EXPORT_SYMBOL(inode_query_iversion);

/*
 * Finish a direct write whose tail had to be written through the page
 * cache.
 *
 * @direct_written is the number of bytes written by direct I/O and
 * @buffered_written the result of the buffered fallback, with iocb->ki_pos
 * already positioned after the buffered part.
 *
 * Returns the total number of bytes written.  If the buffered write or the
 * subsequent writeback failed, returns the number of direct-written bytes
 * when that is non-zero and the error otherwise.
 */
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
                ssize_t direct_written, ssize_t buffered_written)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        loff_t first = iocb->ki_pos - buffered_written;
        loff_t last = iocb->ki_pos - 1;
        int err;

        /*
         * A failed buffered fallback reports the direct I/O byte count, or
         * the error code if nothing was written directly.
         *
         * Note that this differs from normal direct-io semantics, which will
         * return -EFOO even if some bytes were written.
         */
        if (unlikely(buffered_written < 0))
                return direct_written ? direct_written : buffered_written;

        /*
         * Write the page cache pages to disk and invalidate them, to
         * preserve the expected O_DIRECT semantics.
         */
        err = filemap_write_and_wait_range(mapping, first, last);
        if (err < 0) {
                /*
                 * How much reached the disk is unknown, so rewind the
                 * position and report only the direct-written bytes.
                 */
                iocb->ki_pos -= buffered_written;
                return direct_written ? direct_written : err;
        }

        invalidate_mapping_pages(mapping, first >> PAGE_SHIFT,
                                 last >> PAGE_SHIFT);
        return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);

/**
 * simple_inode_init_ts - initialize the timestamps for a new inode
 * @inode: inode to be initialized
 *
 * When a new inode is created, most filesystems set the timestamps to the
 * current time. Add a helper to do this.
 */
struct timespec64 simple_inode_init_ts(struct inode *inode)
{
        struct timespec64 ts = inode_set_ctime_current(inode);

        inode_set_atime_to_ts(inode, ts);
        inode_set_mtime_to_ts(inode, ts);
        return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);

/*
 * Try to pick up the dentry stashed at @stashed.
 *
 * Returns the dentry if one is stashed and lockref_get_not_dead()
 * succeeds on it, NULL otherwise.  The lookup runs under the RCU guard.
 */
struct dentry *stashed_dentry_get(struct dentry **stashed)
{
        struct dentry *found;

        guard(rcu)();
        found = rcu_dereference(*stashed);
        if (found && lockref_get_not_dead(&found->d_lockref))
                return found;

        return NULL;
}

static struct dentry *prepare_anon_dentry(struct dentry **stashed,
                                          struct super_block *sb,
                                          void *data)
{
        struct dentry *dentry;
        struct inode *inode;
        const struct stashed_operations *sops = sb->s_fs_info;
        int ret;

        inode = new_inode_pseudo(sb);
        if (!inode) {
                sops->put_data(data);
                return ERR_PTR(-ENOMEM);
        }

        inode->i_flags |= S_IMMUTABLE;
        inode->i_mode = S_IFREG;
        simple_inode_init_ts(inode);

        ret = sops->init_inode(inode, data);
        if (ret < 0) {
                iput(inode);
                return ERR_PTR(ret);
        }

        /* Notice when this is changed. */
        WARN_ON_ONCE(!S_ISREG(inode->i_mode));
        WARN_ON_ONCE(!IS_IMMUTABLE(inode));

        dentry = d_alloc_anon(sb);
        if (!dentry) {
                iput(inode);
                return ERR_PTR(-ENOMEM);
        }

        /* Store address of location where dentry's supposed to be stashed. */
        dentry->d_fsdata = stashed;

        /* @data is now owned by the fs */
        d_instantiate(dentry, inode);
        return dentry;
}

/*
 * Install @dentry at @stashed unless a usable dentry is already there.
 *
 * Returns @dentry if it was installed, or the previously stashed dentry if
 * lockref_get_not_dead() succeeded on it.  The loop runs under the RCU
 * guard and retries when the slot changes underneath it.
 */
static struct dentry *stash_dentry(struct dentry **stashed,
                                   struct dentry *dentry)
{
        guard(rcu)();
        for (;;) {
                struct dentry *old;

                /* Assume any old dentry was cleared out. */
                old = cmpxchg(stashed, NULL, dentry);
                if (likely(!old))
                        return dentry;

                /* Check if somebody else installed a reusable dentry. */
                if (lockref_get_not_dead(&old->d_lockref))
                        return old;

                /* There's an old dead dentry there, try to take it over. */
                if (likely(try_cmpxchg(stashed, &old, dentry)))
                        return dentry;
        }
}

/**
 * path_from_stashed - create path from stashed or new dentry
 * @stashed:    where to retrieve or stash dentry
 * @mnt:        mnt of the filesystems to use
 * @data:       data to store in inode->i_private
 * @path:       path to create
 *
 * The function tries to retrieve a stashed dentry from @stashed. If the dentry
 * is still valid then it will be reused. If the dentry isn't reusable the
 * function will allocate a new dentry and inode. It will then check again
 * whether it can reuse an existing dentry in case one has been added in the
 * meantime or update @stashed with the newly added dentry.
 *
 * Special-purpose helper for nsfs and pidfs.
 *
 * Return: On success zero and on failure a negative error is returned.
 */
int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
                      struct path *path)
{
        const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;

        path->dentry = stashed_dentry_get(stashed);
        if (path->dentry) {
                /* Reusing the stashed dentry: the caller's @data is surplus. */
                sops->put_data(data);
        } else {
                struct dentry *fresh;

                fresh = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
                if (IS_ERR(fresh))
                        return PTR_ERR(fresh);

                /* Added a new dentry. @data is now owned by the filesystem. */
                path->dentry = stash_dentry(stashed, fresh);

                /* Somebody else's dentry was picked up instead: drop ours. */
                if (path->dentry != fresh)
                        dput(fresh);
        }

        WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
        WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
        path->mnt = mntget(mnt);
        return 0;
}

/*
 * Clear the stash slot recorded in @dentry->d_fsdata, but only if the slot
 * still points at @dentry. Warns once and bails out if no slot is recorded;
 * does nothing for a dentry without an inode.
 */
void stashed_dentry_prune(struct dentry *dentry)
{
        struct dentry **slot = dentry->d_fsdata;

        if (WARN_ON_ONCE(!slot))
                return;

        /*
         * The compare-and-exchange only clears the slot when it still
         * holds @dentry: someone else might've already cleared it and
         * stashed their own dentry in there.
         */
        if (d_inode(dentry))
                cmpxchg(slot, dentry, NULL);
}
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/barrier.h
 *
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_BARRIER_H
#define __ASM_BARRIER_H

#ifndef __ASSEMBLY__

#include <linux/kasan-checks.h>

#include <asm/alternative-macros.h>

/*
 * nops(n) emits @n NOP instructions through an assembler .rept/.endr loop.
 * Note that it carries no "memory" clobber.
 */
#define __nops(n)        ".rept        " #n "\nnop\n.endr\n"
#define nops(n)                asm volatile(__nops(n))

/*
 * Event and wait instructions. Each one carries a "memory" clobber, so the
 * compiler will not move memory accesses across it.
 *
 * wfet()/wfit() are emitted as raw MSR system-register encodings with @val
 * as the register operand (presumably the timeout value, and presumably
 * spelled this way so that assemblers without the mnemonics still build --
 * TODO confirm).
 */
#define sev()                asm volatile("sev" : : : "memory")
#define wfe()                asm volatile("wfe" : : : "memory")
#define wfet(val)        asm volatile("msr s0_3_c1_c0_0, %0"        \
                                     : : "r" (val) : "memory")
#define wfi()                asm volatile("wfi" : : : "memory")
#define wfit(val)        asm volatile("msr s0_3_c1_c0_1, %0"        \
                                     : : "r" (val) : "memory")

/*
 * Barrier instructions. For dmb()/dsb() the @opt argument is stringified and
 * appended to the mnemonic, e.g. dmb(ish) emits "dmb ish". All three carry a
 * "memory" clobber and therefore also act as compiler barriers.
 */
#define isb()                asm volatile("isb" : : : "memory")
#define dmb(opt)        asm volatile("dmb " #opt : : : "memory")
#define dsb(opt)        asm volatile("dsb " #opt : : : "memory")

/* These are emitted through their HINT-space encodings (hint #17/#18/#20). */
#define psb_csync()        asm volatile("hint #17" : : : "memory")
#define __tsb_csync()        asm volatile("hint #18" : : : "memory")
#define csdb()                asm volatile("hint #20" : : : "memory")

/*
 * Data Gathering Hint:
 * This instruction prevents merging memory accesses with Normal-NC or
 * Device-GRE attributes before the hint instruction with any memory accesses
 * appearing after the hint instruction.
 */
#define dgh()                asm volatile("hint #6" : : : "memory")

/*
 * Speculation barrier built with ALTERNATIVE(): the default sequence is
 * "dsb nsh; isb", and SB_BARRIER_INSN followed by a nop is the alternative
 * selected by the ARM64_HAS_SB capability. No "memory" clobber is used here.
 */
#define spec_bar()        asm volatile(ALTERNATIVE("dsb nsh\nisb\n",                \
                                                 SB_BARRIER_INSN"nop\n",        \
                                                 ARM64_HAS_SB))

/*
 * pmr_sync(): with CONFIG_ARM64_PSEUDO_NMI this emits "dsb sy" wrapped in
 * ALTERNATIVE_CB(), keyed on ARM64_HAS_GIC_PRIO_RELAXED_SYNC with the
 * alt_cb_patch_nops callback (going by its name, the barrier is patched to
 * NOPs when that capability is present -- TODO confirm). Without the config
 * option it expands to an empty statement.
 */
#ifdef CONFIG_ARM64_PSEUDO_NMI
#define pmr_sync()                                                \
        do {                                                        \
                asm volatile(                                        \
                ALTERNATIVE_CB("dsb sy",                        \
                               ARM64_HAS_GIC_PRIO_RELAXED_SYNC,        \
                               alt_cb_patch_nops)                \
                );                                                \
        } while(0)
#else
#define pmr_sync()        do {} while (0)
#endif

/* Mandatory barriers: DSB with the sy (full), ld (load) and st (store) options. */
#define __mb()                dsb(sy)
#define __rmb()                dsb(ld)
#define __wmb()                dsb(st)

/* DMA barriers: DMB with the osh, oshld and oshst options. */
#define __dma_mb()        dmb(osh)
#define __dma_rmb()        dmb(oshld)
#define __dma_wmb()        dmb(oshst)

/* io_stop_wc() is implemented with the Data Gathering Hint, dgh(). */
#define io_stop_wc()        dgh()

/*
 * tsb_csync(): issue __tsb_csync() once, or twice when the
 * ARM64_WORKAROUND_TSB_FLUSH_FAILURE capability is set.
 */
#define tsb_csync()                                                                \
        do {                                                                        \
                /*                                                                \
                 * CPUs affected by Arm Erratum 2054223 or 2067961 need                \
                 * another TSB to ensure the trace is flushed. The barriers        \
                 * don't have to be strictly back to back, as long as the        \
                 * CPU is in trace prohibited state.                                \
                 */                                                                \
                if (cpus_have_final_cap(ARM64_WORKAROUND_TSB_FLUSH_FAILURE))        \
                        __tsb_csync();                                                \
                __tsb_csync();                                                        \
        } while (0)

/*
 * Generate a mask for array_index_nospec() that is ~0UL when 0 <= idx < sz
 * and 0 otherwise.
 *
 * The mask is computed with flag arithmetic rather than a conditional
 * branch, and csdb() is issued before the mask is returned.
 */
#define array_index_mask_nospec array_index_mask_nospec
static inline unsigned long array_index_mask_nospec(unsigned long idx,
                                                    unsigned long sz)
{
        unsigned long mask;

        /*
         * cmp sets the carry flag when idx >= sz (unsigned compare); sbc
         * then computes 0 - 0 - !carry, which is all ones when idx < sz
         * and zero otherwise.
         */
        asm volatile(
        "        cmp        %1, %2\n"
        "        sbc        %0, xzr, xzr\n"
        : "=r" (mask)
        : "r" (idx), "Ir" (sz)
        : "cc");

        csdb();
        return mask;
}

/*
 * Ensure that reads of the counter are treated the same as memory reads
 * for the purposes of ordering by subsequent memory barriers.
 *
 * This insanity brought to you by speculative system register reads,
 * out-of-order memory accesses, sequence locks and Thomas Gleixner.
 *
 * https://lore.kernel.org/r/alpine.DEB.2.21.1902081950260.1662@nanos.tec.linutronix.de/
 *
 * How the sequence works: "eor %0, %1, %1" always yields zero but makes tmp
 * depend on @val; adding that zero to sp and loading from the result (into
 * xzr, so the loaded value is discarded) gives a dummy stack load whose
 * address depends on @val.
 */
#define arch_counter_enforce_ordering(val) do {                                \
        u64 tmp, _val = (val);                                                \
                                                                        \
        asm volatile(                                                        \
        "        eor        %0, %1, %1\n"                                        \
        "        add        %0, sp, %0\n"                                        \
        "        ldr        xzr, [%0]"                                        \
        : "=r" (tmp) : "r" (_val));                                        \
} while (0)

/* SMP barriers: DMB with the ish, ishld and ishst (inner-shareable) options. */
#define __smp_mb()        dmb(ish)
#define __smp_rmb()        dmb(ishld)
#define __smp_wmb()        dmb(ishst)

/*
 * __smp_store_release(p, v): store @v to *@p using a store-release
 * instruction (STLRB, STLRH or STLR) selected by sizeof(*p).
 *
 * @p is evaluated once into __p; the other uses of *p sit inside
 * sizeof()/typeof-style operators. The value is placed in a union so it can
 * be read back as a fixed-width integer of the matching size for the asm
 * operand. The store is reported to KASAN via kasan_check_write() first.
 * Sizes other than 1, 2, 4 or 8 match no case, so no store is emitted
 * (presumably compiletime_assert_atomic_type() rejects them at build time --
 * TODO confirm).
 */
#define __smp_store_release(p, v)                                        \
do {                                                                        \
        typeof(p) __p = (p);                                                \
        union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u =        \
                { .__val = (__force __unqual_scalar_typeof(*p)) (v) };        \
        compiletime_assert_atomic_type(*p);                                \
        kasan_check_write(__p, sizeof(*p));                                \
        switch (sizeof(*p)) {                                                \
        case 1:                                                                \
                asm volatile ("stlrb %w1, %0"                                \
                                : "=Q" (*__p)                                \
                                : "rZ" (*(__u8 *)__u.__c)                \
                                : "memory");                                \
                break;                                                        \
        case 2:                                                                \
                asm volatile ("stlrh %w1, %0"                                \
                                : "=Q" (*__p)                                \
                                : "rZ" (*(__u16 *)__u.__c)                \
                                : "memory");                                \
                break;                                                        \
        case 4:                                                                \
                asm volatile ("stlr %w1, %0"                                \
                                : "=Q" (*__p)                                \
                                : "rZ" (*(__u32 *)__u.__c)                \
                                : "memory");                                \
                break;                                                        \
        case 8:                                                                \
                asm volatile ("stlr %x1, %0"                                \
                                : "=Q" (*__p)                                \
                                : "rZ" (*(__u64 *)__u.__c)                \
                                : "memory");                                \
                break;                                                        \
        }                                                                \
} while (0)

/*
 * __smp_load_acquire(p): load *@p using a load-acquire instruction (LDARB,
 * LDARH or LDAR) selected by sizeof(*p), and evaluate to the loaded value
 * cast back to typeof(*p).
 *
 * The result is written into a union through a fixed-width integer view of
 * the matching size. The access is reported to KASAN via kasan_check_read()
 * first. Sizes other than 1, 2, 4 or 8 match no case, leaving the union
 * unwritten (presumably compiletime_assert_atomic_type() rejects them at
 * build time -- TODO confirm).
 */
#define __smp_load_acquire(p)                                                \
({                                                                        \
        union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u;        \
        typeof(p) __p = (p);                                                \
        compiletime_assert_atomic_type(*p);                                \
        kasan_check_read(__p, sizeof(*p));                                \
        switch (sizeof(*p)) {                                                \
        case 1:                                                                \
                asm volatile ("ldarb %w0, %1"                                \
                        : "=r" (*(__u8 *)__u.__c)                        \
                        : "Q" (*__p) : "memory");                        \
                break;                                                        \
        case 2:                                                                \
                asm volatile ("ldarh %w0, %1"                                \
                        : "=r" (*(__u16 *)__u.__c)                        \
                        : "Q" (*__p) : "memory");                        \
                break;                                                        \
        case 4:                                                                \
                asm volatile ("ldar %w0, %1"                                \
                        : "=r" (*(__u32 *)__u.__c)                        \
                        : "Q" (*__p) : "memory");                        \
                break;                                                        \
        case 8:                                                                \
                asm volatile ("ldar %0, %1"                                \
                        : "=r" (*(__u64 *)__u.__c)                        \
                        : "Q" (*__p) : "memory");                        \
                break;                                                        \
        }                                                                \
        (typeof(*p))__u.__val;                                                \
})

/*
 * smp_cond_load_relaxed(ptr, cond_expr): loop until @cond_expr is true.
 *
 * Each iteration reads *@ptr with READ_ONCE() into VAL, which @cond_expr may
 * reference. While the condition is false, __cmpwait_relaxed(__PTR, VAL) is
 * called before the next read (presumably it waits for *ptr to change from
 * VAL -- TODO confirm). The expression evaluates to the final VAL.
 */
#define smp_cond_load_relaxed(ptr, cond_expr)                                \
({                                                                        \
        typeof(ptr) __PTR = (ptr);                                        \
        __unqual_scalar_typeof(*ptr) VAL;                                \
        for (;;) {                                                        \
                VAL = READ_ONCE(*__PTR);                                \
                if (cond_expr)                                                \
                        break;                                                \
                __cmpwait_relaxed(__PTR, VAL);                                \
        }                                                                \
        (typeof(*ptr))VAL;                                                \
})

/*
 * smp_cond_load_acquire(ptr, cond_expr): loop until @cond_expr is true,
 * reading *@ptr with smp_load_acquire() on every iteration.
 *
 * The loaded value is available to @cond_expr as VAL. While the condition
 * is false, __cmpwait_relaxed(__PTR, VAL) is called before the next read
 * (presumably it waits for *ptr to change from VAL -- TODO confirm). The
 * expression evaluates to the final VAL.
 */
#define smp_cond_load_acquire(ptr, cond_expr)                                \
({                                                                        \
        typeof(ptr) __PTR = (ptr);                                        \
        __unqual_scalar_typeof(*ptr) VAL;                                \
        for (;;) {                                                        \
                VAL = smp_load_acquire(__PTR);                                \
                if (cond_expr)                                                \
                        break;                                                \
                __cmpwait_relaxed(__PTR, VAL);                                \
        }                                                                \
        (typeof(*ptr))VAL;                                                \
})

#include <asm-generic/barrier.h>

#endif        /* __ASSEMBLY__ */

#endif        /* __ASM_BARRIER_H */

























































































   24 




   24 



























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_U64_STATS_SYNC_H
#define _LINUX_U64_STATS_SYNC_H

/*
 * Protect against 64-bit values tearing on 32-bit architectures. This is
 * typically used for statistics read/update in different subsystems.
 *
 * Key points :
 *
 * -  Use a seqcount on 32-bit
 * -  The whole thing is a no-op on 64-bit architectures.
 *
 * Usage constraints:
 *
 * 1) Write side must ensure mutual exclusion, or one seqcount update could
 *    be lost, thus blocking readers forever.
 *
 * 2) Write side must disable preemption, or a seqcount reader can preempt the
 *    writer and also spin forever.
 *
 * 3) Write side must use the _irqsave() variant if other writers, or a reader,
 *    can be invoked from an IRQ context. On 64bit systems this variant does not
 *    disable interrupts.
 *
 * 4) If reader fetches several counters, there is no guarantee the whole values
 *    are consistent w.r.t. each other (remember the second key point above:
 *    seqcounts are not used for 64bit architectures).
 *
 * 5) Readers are allowed to sleep or be preempted/interrupted: they perform
 *    pure reads.
 *
 * Usage :
 *
 * Stats producer (writer) should use the following template, provided it
 * already has exclusive access to the counters (a lock is already taken, or
 * per cpu data is used [in a non preemptable context])
 *
 *   spin_lock_bh(...) or other synchronization to get exclusive access
 *   ...
 *   u64_stats_update_begin(&stats->syncp);
 *   u64_stats_add(&stats->bytes64, len); // non atomic operation
 *   u64_stats_inc(&stats->packets64);    // non atomic operation
 *   u64_stats_update_end(&stats->syncp);
 *
 * While a consumer (reader) should use following template to get consistent
 * snapshot for each variable (but no guarantee on several ones)
 *
 * u64 tbytes, tpackets;
 * unsigned int start;
 *
 * do {
 *         start = u64_stats_fetch_begin(&stats->syncp);
 *         tbytes = u64_stats_read(&stats->bytes64); // non atomic operation
 *         tpackets = u64_stats_read(&stats->packets64); // non atomic operation
 * } while (u64_stats_fetch_retry(&stats->syncp, start));
 *
 *
 * Example of use in drivers/net/loopback.c, using per_cpu containers,
 * in BH disabled context.
 */
#include <linux/seqlock.h>

/*
 * Synchronisation state for a group of u64 stats counters. It contains a
 * seqcount only when BITS_PER_LONG == 32; otherwise the structure has no
 * members.
 */
struct u64_stats_sync {
#if BITS_PER_LONG == 32
        seqcount_t        seq;
#endif
};

#if BITS_PER_LONG == 64
#include <asm/local64.h>

/* 64-bit build: the counter is a local64_t, accessed through the local64 API. */
typedef struct {
        local64_t        v;
} u64_stats_t ;

/* Return the current value of counter @p. */
static inline u64 u64_stats_read(const u64_stats_t *p)
{
        return local64_read(&p->v);
}

/* Overwrite counter @p with @val. */
static inline void u64_stats_set(u64_stats_t *p, u64 val)
{
        local64_set(&p->v, val);
}

/* Add @val to counter @p. Note that @val is an unsigned long, not a u64. */
static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
{
        local64_add(val, &p->v);
}

/* Increment counter @p by one. */
static inline void u64_stats_inc(u64_stats_t *p)
{
        local64_inc(&p->v);
}

/*
 * 64-bit build: there is no seqcount, so every synchronisation helper is an
 * empty stub. The irqsave stub returns 0 without touching interrupts,
 * fetch_begin returns 0, and fetch_retry always returns false, so a reader
 * loop runs exactly once.
 */
static inline void u64_stats_init(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp) { }
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp) { }
static inline unsigned long __u64_stats_irqsave(void) { return 0; }
static inline void __u64_stats_irqrestore(unsigned long flags) { }
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
        return 0;
}
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
                                           unsigned int start)
{
        return false;
}

#else /* 64 bit */

/*
 * 32-bit build: the counter is a plain u64 accessed with ordinary loads and
 * stores. Nothing in these accessors takes the seqcount themselves.
 */
typedef struct {
        u64                v;
} u64_stats_t;

/* Return the current value of counter @p (plain read). */
static inline u64 u64_stats_read(const u64_stats_t *p)
{
        return p->v;
}

/* Overwrite counter @p with @val (plain write). */
static inline void u64_stats_set(u64_stats_t *p, u64 val)
{
        p->v = val;
}

/* Add @val to counter @p. Note that @val is an unsigned long, not a u64. */
static inline void u64_stats_add(u64_stats_t *p, unsigned long val)
{
        p->v += val;
}

/* Increment counter @p by one. */
static inline void u64_stats_inc(u64_stats_t *p)
{
        p->v++;
}

/*
 * 32-bit build: initialise the seqcount inside @syncp. Assigning the
 * argument to a typed local both evaluates it once and type-checks it.
 * NOTE(review): presumably a macro rather than an inline function so that
 * seqcount_init() is expanded at each call site -- confirm.
 */
#define u64_stats_init(syncp)                                \
        do {                                                \
                struct u64_stats_sync *__s = (syncp);        \
                seqcount_init(&__s->seq);                \
        } while (0)

/*
 * 32-bit build: open a write section. preempt_disable_nested() is called
 * before the seqcount write section begins.
 */
static inline void __u64_stats_update_begin(struct u64_stats_sync *syncp)
{
        preempt_disable_nested();
        write_seqcount_begin(&syncp->seq);
}

/* 32-bit build: close the write section, undoing the steps above in reverse order. */
static inline void __u64_stats_update_end(struct u64_stats_sync *syncp)
{
        write_seqcount_end(&syncp->seq);
        preempt_enable_nested();
}

/* 32-bit build: disable local interrupts and return the saved flags. */
static inline unsigned long __u64_stats_irqsave(void)
{
        unsigned long flags;

        local_irq_save(flags);
        return flags;
}

/* 32-bit build: restore the interrupt state saved by __u64_stats_irqsave(). */
static inline void __u64_stats_irqrestore(unsigned long flags)
{
        local_irq_restore(flags);
}

/* 32-bit build: start a read section and return the seqcount snapshot. */
static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
        return read_seqcount_begin(&syncp->seq);
}

/*
 * 32-bit build: report whether the read section that began with snapshot
 * @start must be retried, as decided by read_seqcount_retry().
 */
static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
                                           unsigned int start)
{
        return read_seqcount_retry(&syncp->seq, start);
}
#endif /* !64 bit */

/* Begin a writer section on @syncp; pair with u64_stats_update_end(). */
static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{
        __u64_stats_update_begin(syncp);
}

/* End a writer section started with u64_stats_update_begin(). */
static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
{
        __u64_stats_update_end(syncp);
}

/*
 * Begin a writer section using the irqsave variant: __u64_stats_irqsave()
 * runs first, then the section is opened. The returned flags must be passed
 * to u64_stats_update_end_irqrestore().
 */
static inline unsigned long u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp)
{
        unsigned long flags = __u64_stats_irqsave();

        __u64_stats_update_begin(syncp);
        return flags;
}

/*
 * End a writer section started with u64_stats_update_begin_irqsave(): the
 * section is closed first, then @flags is handed to __u64_stats_irqrestore().
 */
static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp,
                                                   unsigned long flags)
{
        __u64_stats_update_end(syncp);
        __u64_stats_irqrestore(flags);
}

/*
 * Begin a reader section on @syncp. The returned value must be passed to
 * u64_stats_fetch_retry().
 */
static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
{
        return __u64_stats_fetch_begin(syncp);
}

/*
 * Return true if the reader section that began with @start has to be
 * repeated, false if the values read can be used.
 */
static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
                                         unsigned int start)
{
        return __u64_stats_fetch_retry(syncp, start);
}

#endif /* _LINUX_U64_STATS_SYNC_H */







































































   27 


   27 


   27 










   41 



   15 




    1 
   27 





   12 





   63 



   10 












   70 



   70 


   24 






    5 



   12 






   12 




































    4 


    4 

















   37 



    3 







   36 
    4 



















































































    1 






















   36 


   36 


   36 



















































    3 













    3 
    3 





    3 
























   63 





    1 





   63 


   60 

    4 
    1 




   61 









    2 



























   58 
















   58 
   58 






   58 
















    1 
    1 





    1 





















































   57 








   58 


   58 




   58 























    3 







    3 






    2 
















    3 































































    3 






    3 




    3 







    3 




















    3 

    3 




    3 


















    3 

    3 
    3 




    3 









    3 




    3 
    3 

    3 





    3 








    3 
    3 




    3 









    3 







    3 




  163 








  162 



  163 


    3 
    3 

    3 




    1 
    3 
    3 







    3 




  162 


























  163 














  163 







    3 



    3 
    3 

    3 




  166 
  109 

  109 






   59 



  129 
   83 

   83 






   46 



    6 






    2 









    4 
















    4 







    4 
    4 







   59 







   58 

   59 




























    3 
    3 













1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015, 2016 ARM Ltd.
 */

#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/list_sort.h>
#include <linux/nospec.h>

#include <asm/kvm_hyp.h>

#include "vgic.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

/*
 * VGIC global state object, placed in the __ro_after_init section. The
 * gicv3_cpuif static key is initialised to false here.
 */
struct vgic_global kvm_vgic_global_state __ro_after_init = {
        .gicv3_cpuif = STATIC_KEY_FALSE_INIT,
};

/*
 * Locking order is always:
 * kvm->lock (mutex)
 *   vcpu->mutex (mutex)
 *     kvm->arch.config_lock (mutex)
 *       its->cmd_lock (mutex)
 *         its->its_lock (mutex)
 *           vgic_cpu->ap_list_lock                must be taken with IRQs disabled
 *             vgic_dist->lpi_xa.xa_lock        must be taken with IRQs disabled
 *               vgic_irq->irq_lock                must be taken with IRQs disabled
 *
 * As the ap_list_lock might be taken from the timer interrupt handler,
 * we have to disable IRQs before taking this lock and everything lower
 * than it.
 *
 * The config_lock has additional ordering requirements:
 * kvm->slots_lock
 *   kvm->srcu
 *     kvm->arch.config_lock
 *
 * If you need to take multiple locks, always take the upper lock first,
 * then the lower ones, e.g. first take the its_lock, then the irq_lock.
 * If you are already holding a lock and need to take a higher one, you
 * have to drop the lower ranking lock first and re-acquire it after having
 * taken the upper one.
 *
 * When taking more than one ap_list_lock at the same time, always take the
 * lowest numbered VCPU's ap_list_lock first, so:
 *   vcpuX->vcpu_id < vcpuY->vcpu_id:
 *     raw_spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock);
 *     raw_spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
 *
 * Since the VGIC must support injecting virtual interrupts from ISRs, we have
 * to use the raw_spin_lock_irqsave/raw_spin_unlock_irqrestore versions of outer
 * spinlocks for any lock that may be taken while injecting an interrupt.
 */

/*
 * Look @intid up in the VM's xarray of mapped LPIs and try to take a
 * reference on the entry found. Returns the IRQ structure with the reference
 * held, or NULL when no reference could be obtained. The caller is expected
 * to call vgic_put_irq() once it's finished with the IRQ.
 */
static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
{
        struct vgic_irq *lpi;

        /* The lookup and the reference grab both happen under RCU. */
        rcu_read_lock();
        lpi = xa_load(&kvm->arch.vgic.lpi_xa, intid);
        if (!vgic_try_get_irq_kref(lpi))
                lpi = NULL;
        rcu_read_unlock();

        return lpi;
}

/*
 * Translate a virtual interrupt ID into its struct vgic_irq.
 *
 * SPIs are returned straight out of the per-VM spis[] array; LPIs are looked
 * up through vgic_get_lpi(), which takes a reference. Any other ID yields
 * NULL. Callers are expected to call vgic_put_irq() once they're finished
 * with the IRQ.
 */
struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid)
{
        struct vgic_dist *dist = &kvm->arch.vgic;

        /* SPIs */
        if (intid >= VGIC_NR_PRIVATE_IRQS &&
            intid < (dist->nr_spis + VGIC_NR_PRIVATE_IRQS)) {
                /* Clamp the index so it stays in bounds under speculation. */
                u32 clamped = array_index_nospec(intid, dist->nr_spis + VGIC_NR_PRIVATE_IRQS);

                return &dist->spis[clamped - VGIC_NR_PRIVATE_IRQS];
        }

        /* LPIs, or nothing at all */
        return (intid >= VGIC_MIN_LPI) ? vgic_get_lpi(kvm, intid) : NULL;
}

/*
 * Per-vcpu variant of vgic_get_irq(): IDs below VGIC_NR_PRIVATE_IRQS resolve
 * to @vcpu's private_irqs[] array, everything else is forwarded to
 * vgic_get_irq() for the vcpu's VM. Warns and returns NULL if @vcpu is NULL.
 */
struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
{
        if (WARN_ON(!vcpu))
                return NULL;

        /* Not a private interrupt: defer to the VM-wide lookup. */
        if (intid >= VGIC_NR_PRIVATE_IRQS)
                return vgic_get_irq(vcpu->kvm, intid);

        /* SGIs and PPIs; clamp the index so it stays in bounds under speculation. */
        intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);

        return &vcpu->arch.vgic_cpu.private_irqs[intid];
}

/*
 * We can't do anything in here, because we lack the kvm pointer needed to
 * lock the LPI xarray and remove the item from it. So we keep this function
 * empty and use the return value of kref_put() to trigger the freeing.
 */
static void vgic_irq_release(struct kref *ref)
{
}

/*
 * Drop a reference on @irq.
 *
 * Only LPIs (intid >= VGIC_MIN_LPI) are reference counted here; for any
 * other interrupt this is a no-op. When kref_put() reports that the last
 * reference is gone, the LPI is erased from the VM's lpi_xa xarray and then
 * freed with kfree_rcu().
 */
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
        struct vgic_dist *dist = &kvm->arch.vgic;
        unsigned long flags;

        if (irq->intid < VGIC_MIN_LPI)
                return;

        /* The release callback is empty; the return value drives the teardown. */
        if (!kref_put(&irq->refcount, vgic_irq_release))
                return;

        /* The xarray lock is taken with interrupts disabled. */
        xa_lock_irqsave(&dist->lpi_xa, flags);
        __xa_erase(&dist->lpi_xa, irq->intid);
        xa_unlock_irqrestore(&dist->lpi_xa, flags);

        /* Deferred free: vgic_get_lpi() looks LPIs up under rcu_read_lock(). */
        kfree_rcu(irq, rcu);
}

/*
 * Remove every LPI from @vcpu's ap_list. Each LPI found is unlinked from the
 * list, has its vcpu pointer cleared and then has a reference dropped via
 * vgic_put_irq(). Entries that are not LPIs stay on the list.
 */
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
        struct vgic_irq *cur, *next;
        unsigned long irqflags;

        raw_spin_lock_irqsave(&cpu_if->ap_list_lock, irqflags);

        /* The _safe iterator is needed because entries are deleted while walking. */
        list_for_each_entry_safe(cur, next, &cpu_if->ap_list_head, ap_list) {
                if (cur->intid < VGIC_MIN_LPI)
                        continue;

                /* irq_lock nests inside ap_list_lock. */
                raw_spin_lock(&cur->irq_lock);
                list_del(&cur->ap_list);
                cur->vcpu = NULL;
                raw_spin_unlock(&cur->irq_lock);

                vgic_put_irq(vcpu->kvm, cur);
        }

        raw_spin_unlock_irqrestore(&cpu_if->ap_list_lock, irqflags);
}

/*
 * Set or clear the pending state of the host interrupt backing @irq
 * (irq->host_irq) through irq_set_irqchip_state(). A failure only triggers
 * a WARN; nothing is reported back to the caller.
 */
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending)
{
        WARN_ON(irq_set_irqchip_state(irq->host_irq,
                                      IRQCHIP_STATE_PENDING,
                                      pending));
}

bool vgic_get_phys_line_level(struct vgic_irq *irq)
{
        bool line_level;

        BUG_ON(!irq->hw);

        if (irq->ops && irq->ops->get_input_level)
                return irq->ops->get_input_level(irq->intid);

        WARN_ON(irq_get_irqchip_state(irq->host_irq,
                                      IRQCHIP_STATE_PENDING,
                                      &line_level));
        return line_level;
}

/*
 * Set or clear the active state of the host interrupt backing @irq.
 * Only valid for hardware-mapped interrupts (irq->hw set).
 */
void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
{
        int err;

        BUG_ON(!irq->hw);

        err = irq_set_irqchip_state(irq->host_irq, IRQCHIP_STATE_ACTIVE,
                                    active);
        WARN_ON(err);
}

/**
 * vgic_target_oracle - compute the target vcpu for an irq
 *
 * @irq:        The irq to route. Must be already locked.
 *
 * Based on the current state of the interrupt (enabled, pending,
 * active, vcpu and target_vcpu), compute the next vcpu this should be
 * given to. Return NULL if this shouldn't be injected at all.
 *
 * Requires the IRQ lock to be held.
 */
static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
{
        lockdep_assert_held(&irq->irq_lock);

        /* If the interrupt is active, it must stay on the current vcpu */
        if (irq->active)
                return irq->vcpu ? : irq->target_vcpu;

        /*
         * If the IRQ is not active but enabled and pending, we should direct
         * it to its configured target VCPU.
         * If the distributor is disabled, pending interrupts shouldn't be
         * forwarded.
         */
        if (irq->enabled && irq_is_pending(irq)) {
                if (unlikely(irq->target_vcpu &&
                             !irq->target_vcpu->kvm->arch.vgic.enabled))
                        return NULL;

                return irq->target_vcpu;
        }

        /* If neither active nor pending and enabled, then this IRQ should not
         * be queued to any VCPU.
         */
        return NULL;
}

/*
 * list_sort() comparator for the ap_list.
 *
 * The ap_list order is also the order in which LRs get populated: entries
 * at the head of the list are placed in LRs first.
 *
 * Active interrupts can never be pushed out of the LRs, since we cannot
 * reliably trap on deactivation of IRQs; they therefore sort first.
 * After that, enabled-and-pending interrupts sort ahead of the rest, and
 * among those the priority field decides; the GIC hardware takes care of
 * preemption of priority groups etc.
 *
 * Return negative if "a" sorts before "b", 0 to preserve order, and positive
 * to sort "b" before "a".
 */
static int vgic_irq_cmp(void *priv, const struct list_head *a,
                        const struct list_head *b)
{
        struct vgic_irq *first = container_of(a, struct vgic_irq, ap_list);
        struct vgic_irq *second = container_of(b, struct vgic_irq, ap_list);
        int order;

        /*
         * list_sort may hand us the same element twice when the list is
         * fairly long; bail out before trying to take its lock twice.
         */
        if (unlikely(first == second))
                return 0;

        raw_spin_lock(&first->irq_lock);
        raw_spin_lock_nested(&second->irq_lock, SINGLE_DEPTH_NESTING);

        if (first->active || second->active) {
                /* Active interrupts go first */
                order = (int)second->active - (int)first->active;
        } else {
                bool first_pend = first->enabled && irq_is_pending(first);
                bool second_pend = second->enabled && irq_is_pending(second);

                if (!first_pend || !second_pend)
                        order = (int)second_pend - (int)first_pend;
                else
                        /* Both pending and enabled, sort by priority */
                        order = first->priority - second->priority;
        }

        raw_spin_unlock(&second->irq_lock);
        raw_spin_unlock(&first->irq_lock);
        return order;
}

/*
 * Sort @vcpu's ap_list with vgic_irq_cmp().
 * Must be called with the ap_list_lock held.
 */
static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
{
        lockdep_assert_held(&vcpu->arch.vgic_cpu.ap_list_lock);

        list_sort(NULL, &vcpu->arch.vgic_cpu.ap_list_head, vgic_irq_cmp);
}

/*
 * An injection is only valid when it changes the level of a
 * level-triggered IRQ, or is a rising edge for an edge-triggered one.
 * In-kernel connected IRQ lines can only be controlled by their owner.
 */
static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owner)
{
        /* Only the registered owner may drive this line. */
        if (irq->owner != owner)
                return false;

        /* Level-triggered: only an actual change of level counts. */
        if (irq->config == VGIC_CONFIG_LEVEL)
                return irq->line_level != level;

        /* Edge-triggered: only a rising edge counts. */
        if (irq->config == VGIC_CONFIG_EDGE)
                return level;

        return false;
}

/*
 * Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
 * Do the queuing if necessary, taking the right locks in the right order.
 * Returns true when the IRQ was queued, false otherwise.
 *
 * @kvm:   The VM pointer (not referenced by the body of this function).
 * @irq:   The interrupt to queue; its irq_lock must be held on entry.
 * @flags: The interrupt flags passed back to raw_spin_unlock_irqrestore()
 *         when irq->irq_lock is released.
 *
 * Needs to be entered with the IRQ lock already held, but will return
 * with all locks dropped.
 *
 * In both the "queued" and the "already on an ap_list" cases the target
 * VCPU gets a KVM_REQ_IRQ_PENDING request and a kick.
 */
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
                           unsigned long flags) __releases(&irq->irq_lock)
{
        struct kvm_vcpu *vcpu;

        lockdep_assert_held(&irq->irq_lock);

retry:
        /* Entered (and re-entered from below) with irq->irq_lock held. */
        vcpu = vgic_target_oracle(irq);
        if (irq->vcpu || !vcpu) {
                /*
                 * If this IRQ is already on a VCPU's ap_list, then it
                 * cannot be moved or modified and there is no more work for
                 * us to do.
                 *
                 * Otherwise, if the irq is not pending and enabled, it does
                 * not need to be inserted into an ap_list and there is also
                 * no more work for us to do.
                 */
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

                /*
                 * We have to kick the VCPU here, because we could be
                 * queueing an edge-triggered interrupt for which we
                 * get no EOI maintenance interrupt. In that case,
                 * while the IRQ is already on the VCPU's AP list, the
                 * VCPU could have EOI'ed the original interrupt and
                 * won't see this one until it exits for some other
                 * reason.
                 */
                if (vcpu) {
                        kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
                        kvm_vcpu_kick(vcpu);
                }
                return false;
        }

        /*
         * We must unlock the irq lock to take the ap_list_lock where
         * we are going to insert this new pending interrupt.
         */
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        /* someone can do stuff here, which we re-check below */

        raw_spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
        raw_spin_lock(&irq->irq_lock);

        /*
         * Did something change behind our backs?
         *
         * There are two cases:
         * 1) The irq lost its pending state or was disabled behind our
         *    backs and/or it was queued to another VCPU's ap_list.
         * 2) Someone changed the affinity on this irq behind our
         *    backs and we are now holding the wrong ap_list_lock.
         *
         * In both cases, drop the locks and retry.
         */

        if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) {
                raw_spin_unlock(&irq->irq_lock);
                raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock,
                                           flags);

                /* Re-take only the irq_lock, as expected at the retry label. */
                raw_spin_lock_irqsave(&irq->irq_lock, flags);
                goto retry;
        }

        /*
         * Grab a reference to the irq to reflect the fact that it is
         * now in the ap_list. This is safe as the caller must already hold a
         * reference on the irq.
         */
        vgic_get_irq_kref(irq);
        list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
        irq->vcpu = vcpu;

        raw_spin_unlock(&irq->irq_lock);
        raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);

        kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
        kvm_vcpu_kick(vcpu);

        return true;
}

/**
 * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
 * @kvm:     The VM structure pointer
 * @vcpu:    The CPU for PPIs or NULL for global interrupts
 * @intid:   The INTID to inject a new state to.
 * @level:   Edge-triggered:  true:  to trigger the interrupt
 *                              false: to ignore the call
 *             Level-sensitive  true:  raise the input signal
 *                              false: lower the input signal
 * @owner:   The opaque pointer to the owner of the IRQ being raised to verify
 *           that the caller is allowed to inject this IRQ.  Userspace
 *           injections will have owner == NULL.
 *
 * The VGIC is not concerned with devices being active-LOW or active-HIGH for
 * level-sensitive interrupts.  You can think of the level parameter as 1
 * being HIGH and 0 being LOW and all devices being active-HIGH.
 *
 * Return: 0 on success (including when the injection is filtered out as
 * not valid), the error from vgic_lazy_init() if that fails, or -EINVAL
 * when a private interrupt is injected without a vcpu or the interrupt
 * cannot be looked up.
 */
int kvm_vgic_inject_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
                        unsigned int intid, bool level, void *owner)
{
        bool is_private = intid < VGIC_NR_PRIVATE_IRQS;
        struct vgic_irq *irq;
        unsigned long flags;
        int err;

        err = vgic_lazy_init(kvm);
        if (err)
                return err;

        /* Private interrupts can only be injected on a specific vcpu. */
        if (is_private && !vcpu)
                return -EINVAL;

        trace_vgic_update_irq_pending(vcpu ? vcpu->vcpu_idx : 0, intid, level);

        irq = is_private ? vgic_get_vcpu_irq(vcpu, intid) :
                           vgic_get_irq(kvm, intid);
        if (!irq)
                return -EINVAL;

        raw_spin_lock_irqsave(&irq->irq_lock, flags);

        if (vgic_validate_injection(irq, level, owner)) {
                if (irq->config == VGIC_CONFIG_LEVEL)
                        irq->line_level = level;
                else
                        irq->pending_latch = true;

                /* Releases irq->irq_lock for us. */
                vgic_queue_irq_unlock(kvm, irq, flags);
        } else {
                /* Nothing to see here, move along... */
                raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
        }

        vgic_put_irq(kvm, irq);

        return 0;
}

/*
 * Record @host_irq as the physical interrupt backing @irq and install @ops.
 * Returns 0 on success, -EINVAL if @host_irq has no interrupt descriptor.
 *
 * @irq->irq_lock must be held.
 */
static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
                            unsigned int host_irq,
                            struct irq_ops *ops)
{
        struct irq_desc *desc = irq_to_desc(host_irq);
        struct irq_data *root;

        if (!desc) {
                kvm_err("%s: no interrupt descriptor\n", __func__);
                return -EINVAL;
        }

        /*
         * Find the physical IRQ number corresponding to @host_irq by
         * following parent_data until there is no parent left.
         */
        for (root = irq_desc_get_irq_data(desc); root->parent_data;
             root = root->parent_data)
                ;

        irq->hw = true;
        irq->host_irq = host_irq;
        irq->hwintid = root->hwirq;
        irq->ops = ops;
        return 0;
}

/*
 * Forget the physical mapping of @irq: clear the hw flag, the hardware
 * INTID and the ops pointer.
 *
 * @irq->irq_lock must be held.
 */
static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq)
{
        irq->ops = NULL;
        irq->hwintid = 0;
        irq->hw = false;
}

/*
 * Map the host interrupt @host_irq onto the virtual interrupt @vintid of
 * @vcpu, taking the irq_lock around the update.
 * Returns the result of kvm_vgic_map_irq().
 */
int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq,
                          u32 vintid, struct irq_ops *ops)
{
        unsigned long flags;
        struct vgic_irq *irq;
        int err;

        irq = vgic_get_vcpu_irq(vcpu, vintid);
        BUG_ON(!irq);

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        err = kvm_vgic_map_irq(vcpu, irq, host_irq, ops);
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        /* Drop the reference taken by the lookup above. */
        vgic_put_irq(vcpu->kvm, irq);

        return err;
}

/**
 * kvm_vgic_reset_mapped_irq - Reset a mapped IRQ
 * @vcpu: The VCPU pointer
 * @vintid: The INTID of the interrupt
 *
 * Reset the active and pending states of a mapped interrupt.  Kernel
 * subsystems injecting mapped interrupts should reset their interrupt lines
 * when we are doing a reset of the VM.
 *
 * Interrupts that are not mapped to a physical interrupt (irq->hw clear)
 * are left untouched.
 */
void kvm_vgic_reset_mapped_irq(struct kvm_vcpu *vcpu, u32 vintid)
{
        struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, vintid);
        unsigned long flags;

        /*
         * irq->hw is written with the irq_lock held (see the map/unmap
         * paths), so test it inside the critical section as well rather
         * than racing with a concurrent map or unmap.
         */
        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        if (irq->hw) {
                irq->active = false;
                irq->pending_latch = false;
                irq->line_level = false;
        }
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        vgic_put_irq(vcpu->kvm, irq);
}

/*
 * Remove the physical mapping of virtual interrupt @vintid on @vcpu.
 * Returns -EAGAIN if the vgic is not initialized yet, 0 otherwise.
 */
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int vintid)
{
        unsigned long flags;
        struct vgic_irq *mapped;

        if (!vgic_initialized(vcpu->kvm))
                return -EAGAIN;

        mapped = vgic_get_vcpu_irq(vcpu, vintid);
        BUG_ON(!mapped);

        raw_spin_lock_irqsave(&mapped->irq_lock, flags);
        kvm_vgic_unmap_irq(mapped);
        raw_spin_unlock_irqrestore(&mapped->irq_lock, flags);

        /* Balance the lookup's reference. */
        vgic_put_irq(vcpu->kvm, mapped);

        return 0;
}

/*
 * Return the hardware INTID that virtual interrupt @vintid of @vcpu is
 * mapped to, or -1 if it has no physical mapping.
 */
int kvm_vgic_get_map(struct kvm_vcpu *vcpu, unsigned int vintid)
{
        struct vgic_irq *irq;
        unsigned long flags;
        int hwintid;

        irq = vgic_get_vcpu_irq(vcpu, vintid);

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        hwintid = irq->hw ? (int)irq->hwintid : -1;
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        vgic_put_irq(vcpu->kvm, irq);
        return hwintid;
}

/**
 * kvm_vgic_set_owner - Set the owner of an interrupt for a VM
 *
 * @vcpu:   Pointer to the VCPU (used for PPIs)
 * @intid:  The virtual INTID identifying the interrupt (PPI or SPI)
 * @owner:  Opaque pointer to the owner
 *
 * Returns 0 if intid is not already used by another in-kernel device and the
 * owner is set, otherwise returns an error code: -EAGAIN if the vgic is not
 * initialized yet, -EINVAL if @intid is neither a PPI nor a valid SPI, and
 * -EEXIST if a different owner is already registered.
 */
int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner)
{
        struct vgic_irq *irq;
        unsigned long flags;
        int ret = 0;

        if (!vgic_initialized(vcpu->kvm))
                return -EAGAIN;

        /* SGIs and LPIs cannot be wired up to any device */
        if (!irq_is_ppi(intid) && !vgic_valid_spi(vcpu->kvm, intid))
                return -EINVAL;

        irq = vgic_get_vcpu_irq(vcpu, intid);
        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        if (irq->owner && irq->owner != owner)
                ret = -EEXIST;
        else
                irq->owner = owner;
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        /*
         * Balance the lookup above, as every other lookup in this file
         * does. vgic_put_irq() returns immediately for INTIDs below
         * VGIC_MIN_LPI, which is all this function accepts, so this only
         * keeps the get/put pairing uniform.
         */
        vgic_put_irq(vcpu->kvm, irq);

        return ret;
}

/**
 * vgic_prune_ap_list - Remove non-relevant interrupts from the list
 *
 * @vcpu: The VCPU pointer
 *
 * Go over the list of "interesting" interrupts, and prune those that we
 * won't have to consider in the near future.
 *
 * Interrupts whose target (as computed by vgic_target_oracle()) is another
 * VCPU are moved to that VCPU's ap_list, and that VCPU is kicked.
 *
 * Expects to be called with interrupts disabled (checked by
 * DEBUG_SPINLOCK_BUG_ON()); the ap_list_lock is taken without irqsave.
 */
static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_irq *irq, *tmp;

        DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());

retry:
        raw_spin_lock(&vgic_cpu->ap_list_lock);

        list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) {
                struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB;
                bool target_vcpu_needs_kick = false;

                raw_spin_lock(&irq->irq_lock);

                BUG_ON(vcpu != irq->vcpu);

                target_vcpu = vgic_target_oracle(irq);

                if (!target_vcpu) {
                        /*
                         * We don't need to process this interrupt any
                         * further, move it off the list.
                         */
                        list_del(&irq->ap_list);
                        irq->vcpu = NULL;
                        raw_spin_unlock(&irq->irq_lock);

                        /*
                         * This vgic_put_irq call matches the
                         * vgic_get_irq_kref in vgic_queue_irq_unlock,
                         * where we added the LPI to the ap_list. As
                         * we remove the irq from the list, we also
                         * drop the refcount.
                         */
                        vgic_put_irq(vcpu->kvm, irq);
                        continue;
                }

                if (target_vcpu == vcpu) {
                        /* We're on the right CPU */
                        raw_spin_unlock(&irq->irq_lock);
                        continue;
                }

                /* This interrupt looks like it has to be migrated. */

                raw_spin_unlock(&irq->irq_lock);
                raw_spin_unlock(&vgic_cpu->ap_list_lock);

                /*
                 * Ensure locking order by always locking the smallest
                 * ID first.
                 */
                if (vcpu->vcpu_id < target_vcpu->vcpu_id) {
                        vcpuA = vcpu;
                        vcpuB = target_vcpu;
                } else {
                        vcpuA = target_vcpu;
                        vcpuB = vcpu;
                }

                raw_spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock);
                raw_spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock,
                                      SINGLE_DEPTH_NESTING);
                raw_spin_lock(&irq->irq_lock);

                /*
                 * If the affinity has been preserved, move the
                 * interrupt around. Otherwise, it means things have
                 * changed while the interrupt was unlocked, and we
                 * need to replay this.
                 *
                 * In all cases, we cannot trust the list not to have
                 * changed, so we restart from the beginning.
                 */
                if (target_vcpu == vgic_target_oracle(irq)) {
                        struct vgic_cpu *new_cpu = &target_vcpu->arch.vgic_cpu;

                        list_del(&irq->ap_list);
                        irq->vcpu = target_vcpu;
                        list_add_tail(&irq->ap_list, &new_cpu->ap_list_head);
                        target_vcpu_needs_kick = true;
                }

                raw_spin_unlock(&irq->irq_lock);
                raw_spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock);
                raw_spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock);

                /* Kick only after all locks have been dropped. */
                if (target_vcpu_needs_kick) {
                        kvm_make_request(KVM_REQ_IRQ_PENDING, target_vcpu);
                        kvm_vcpu_kick(target_vcpu);
                }

                goto retry;
        }

        raw_spin_unlock(&vgic_cpu->ap_list_lock);
}

/*
 * Call the GICv2 or GICv3 fold_lr_state implementation, selected by
 * kvm_vgic_global_state.type.
 */
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
{
        if (kvm_vgic_global_state.type != VGIC_V2) {
                vgic_v3_fold_lr_state(vcpu);
                return;
        }

        vgic_v2_fold_lr_state(vcpu);
}

/*
 * Program list register @lr from @irq using the GICv2 or GICv3
 * implementation, selected by kvm_vgic_global_state.type.
 *
 * Requires the irq_lock to be held.
 */
static inline void vgic_populate_lr(struct kvm_vcpu *vcpu,
                                    struct vgic_irq *irq, int lr)
{
        lockdep_assert_held(&irq->irq_lock);

        switch (kvm_vgic_global_state.type) {
        case VGIC_V2:
                vgic_v2_populate_lr(vcpu, irq, lr);
                break;
        default:
                vgic_v3_populate_lr(vcpu, irq, lr);
                break;
        }
}

/*
 * Clear list register @lr using the GICv2 or GICv3 implementation,
 * selected by kvm_vgic_global_state.type.
 */
static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
{
        if (kvm_vgic_global_state.type != VGIC_V2) {
                vgic_v3_clear_lr(vcpu, lr);
                return;
        }

        vgic_v2_clear_lr(vcpu, lr);
}

/*
 * Call the GICv2 or GICv3 set_underflow implementation, selected by
 * kvm_vgic_global_state.type.
 */
static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
{
        switch (kvm_vgic_global_state.type) {
        case VGIC_V2:
                vgic_v2_set_underflow(vcpu);
                break;
        default:
                vgic_v3_set_underflow(vcpu);
                break;
        }
}

/*
 * Sum the LR count (as given by vgic_irq_get_lr_count()) of every
 * interrupt on @vcpu's ap_list and return the total. *@multi_sgi is set
 * to true if any single interrupt counts for more than one LR, false
 * otherwise.
 *
 * Requires the ap_list_lock to be held.
 */
static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
                                 bool *multi_sgi)
{
        struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
        struct vgic_irq *cur;
        int depth = 0;

        *multi_sgi = false;

        lockdep_assert_held(&cpu_if->ap_list_lock);

        list_for_each_entry(cur, &cpu_if->ap_list_head, ap_list) {
                int lrs;

                raw_spin_lock(&cur->irq_lock);
                /* GICv2 SGIs can count for more than one... */
                lrs = vgic_irq_get_lr_count(cur);
                raw_spin_unlock(&cur->irq_lock);

                depth += lrs;
                if (lrs > 1)
                        *multi_sgi = true;
        }

        return depth;
}

/*
 * Populate the list registers from @vcpu's ap_list.
 *
 * The ap_list is sorted first when it needs more LRs than are available
 * (kvm_vgic_global_state.nr_lr) or when compute_ap_list_depth() reports a
 * multi-LR SGI. Interrupts targeting this VCPU are then written to LRs in
 * list order, any LR left over is cleared, and the number of LRs in use is
 * recorded in the v2 or v3 CPU interface state.
 *
 * Requires the VCPU's ap_list_lock to be held.
 */
static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_irq *irq;
        int count;
        bool multi_sgi;
        u8 prio = 0xff;
        int i;

        lockdep_assert_held(&vgic_cpu->ap_list_lock);

        count = compute_ap_list_depth(vcpu, &multi_sgi);
        if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
                vgic_sort_ap_list(vcpu);

        /* From here on, count is the number of LRs populated so far. */
        count = 0;

        list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
                raw_spin_lock(&irq->irq_lock);

                /*
                 * If we have multi-SGIs in the pipeline, we need to
                 * guarantee that they are all seen before any IRQ of
                 * lower priority. In that case, we need to filter out
                 * these interrupts by exiting early. This is easy as
                 * the AP list has been sorted already.
                 */
                if (multi_sgi && irq->priority > prio) {
                        raw_spin_unlock(&irq->irq_lock);
                        break;
                }

                if (likely(vgic_target_oracle(irq) == vcpu)) {
                        vgic_populate_lr(vcpu, irq, count++);

                        /* Remember the priority of the last IRQ with a source set. */
                        if (irq->source)
                                prio = irq->priority;
                }

                raw_spin_unlock(&irq->irq_lock);

                if (count == kvm_vgic_global_state.nr_lr) {
                        /* Out of LRs with entries still queued: flag underflow. */
                        if (!list_is_last(&irq->ap_list,
                                          &vgic_cpu->ap_list_head))
                                vgic_set_underflow(vcpu);
                        break;
                }
        }

        /* Nuke remaining LRs */
        for (i = count ; i < kvm_vgic_global_state.nr_lr; i++)
                vgic_clear_lr(vcpu, i);

        if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count;
        else
                vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count;
}

static inline bool can_access_vgic_from_kernel(void)
{
        /*
         * GICv2 can always be accessed from the kernel because it is
         * memory-mapped, and VHE systems can access GICv3 EL2 system
         * registers.
         */
        return !static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) || has_vhe();
}

/*
 * Save the CPU interface state through the GICv3 or GICv2 backend,
 * depending on the gicv3_cpuif static key.
 */
static inline void vgic_save_state(struct kvm_vcpu *vcpu)
{
        if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                __vgic_v3_save_state(&vcpu->arch.vgic_cpu.vgic_v3);
        else
                vgic_v2_save_state(vcpu);
}

/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
        int used_lrs;

        /* If nesting, emulate the HW effect from L0 to L1 */
        if (vgic_state_is_nested(vcpu)) {
                vgic_v3_sync_nested(vcpu);
                return;
        }

        if (vcpu_has_nv(vcpu))
                vgic_v3_nested_update_mi(vcpu);

        /* An empty ap_list_head implies used_lrs == 0 */
        if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
                return;

        if (can_access_vgic_from_kernel())
                vgic_save_state(vcpu);

        if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
        else
                used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;

        if (used_lrs)
                vgic_fold_lr_state(vcpu);
        vgic_prune_ap_list(vcpu);
}

/*
 * Restore the CPU interface state through the GICv3 or GICv2 backend,
 * depending on the gicv3_cpuif static key.
 */
static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
{
        if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                __vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
        else
                vgic_v2_restore_state(vcpu);
}

/* Flush our emulation state into the GIC hardware before entering the guest. */
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
        /*
         * If in a nested state, we must return early. Two possibilities:
         *
         * - If we have any pending IRQ for the guest and the guest
         *   expects IRQs to be handled in its virtual EL2 mode (the
         *   virtual IMO bit is set) and it is not already running in
         *   virtual EL2 mode, then we have to emulate an IRQ
         *   exception to virtual EL2.
         *
         *   We do that by placing a request to ourselves which will
         *   abort the entry procedure and inject the exception at the
         *   beginning of the run loop.
         *
         * - Otherwise, do exactly *NOTHING*. The guest state is
         *   already loaded, and we can carry on with running it.
         *
         * If we have NV, but are not in a nested state, compute the
         * maintenance interrupt state, as it may fire.
         */
        if (vgic_state_is_nested(vcpu)) {
                if (kvm_vgic_vcpu_pending_irq(vcpu))
                        kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu);

                return;
        }

        if (vcpu_has_nv(vcpu))
                vgic_v3_nested_update_mi(vcpu);

        /*
         * If there are no virtual interrupts active or pending for this
         * VCPU, then there is no work to do and we can bail out without
         * taking any lock.  There is a potential race with someone injecting
         * interrupts to the VCPU, but it is a benign race as the VCPU will
         * either observe the new interrupt before or after doing this check,
         * and introducing additional synchronization mechanism doesn't change
         * this.
         *
         * Note that we still need to go through the whole thing if anything
         * can be directly injected (GICv4).
         */
        if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
            !vgic_supports_direct_msis(vcpu->kvm))
                return;

        DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());

        if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
                raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
                vgic_flush_lr_state(vcpu);
                raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
        }

        if (can_access_vgic_from_kernel())
                vgic_restore_state(vcpu);

        if (vgic_supports_direct_msis(vcpu->kvm))
                vgic_v4_commit(vcpu);
}

/*
 * Load the vgic state for @vcpu through the GICv2 or GICv3 backend.
 *
 * Without an in-kernel irqchip, or before the vgic is initialized, only
 * the GICv3 traps are activated, and only on VHE with a GICv3 CPU interface.
 */
void kvm_vgic_load(struct kvm_vcpu *vcpu)
{
        if (likely(irqchip_in_kernel(vcpu->kvm) && vgic_initialized(vcpu->kvm))) {
                if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                        vgic_v3_load(vcpu);
                else
                        vgic_v2_load(vcpu);
                return;
        }

        if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                __vgic_v3_activate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
}

/*
 * Put the vgic state for @vcpu through the GICv2 or GICv3 backend.
 *
 * Without an in-kernel irqchip, or before the vgic is initialized, only
 * the GICv3 traps are deactivated, and only on VHE with a GICv3 CPU
 * interface.
 */
void kvm_vgic_put(struct kvm_vcpu *vcpu)
{
        if (likely(irqchip_in_kernel(vcpu->kvm) && vgic_initialized(vcpu->kvm))) {
                if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                        vgic_v3_put(vcpu);
                else
                        vgic_v2_put(vcpu);
                return;
        }

        if (has_vhe() && static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
                __vgic_v3_deactivate_traps(&vcpu->arch.vgic_cpu.vgic_v3);
}

/*
 * Tell whether @vcpu has an interrupt to take.
 *
 * Returns false when the distributor is disabled and true when the vPE's
 * pending_last flag is set. Otherwise returns true if the ap_list holds an
 * interrupt that is pending, enabled, not active, and whose priority
 * value is below the PMR read from the VMCR.
 */
int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
{
        struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
        struct vgic_vmcr vmcr;
        struct vgic_irq *cur;
        unsigned long flags;
        bool found = false;

        if (!vcpu->kvm->arch.vgic.enabled)
                return false;

        if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
                return true;

        vgic_get_vmcr(vcpu, &vmcr);

        raw_spin_lock_irqsave(&cpu_if->ap_list_lock, flags);

        list_for_each_entry(cur, &cpu_if->ap_list_head, ap_list) {
                raw_spin_lock(&cur->irq_lock);
                found = false;
                if (irq_is_pending(cur) && cur->enabled && !cur->active)
                        found = cur->priority < vmcr.pmr;
                raw_spin_unlock(&cur->irq_lock);

                /* One deliverable interrupt is enough. */
                if (found)
                        break;
        }

        raw_spin_unlock_irqrestore(&cpu_if->ap_list_lock, flags);

        return found;
}

/*
 * Request KVM_REQ_IRQ_PENDING on, and kick, every VCPU of @kvm for which
 * kvm_vgic_vcpu_pending_irq() reports a pending interrupt.
 */
void vgic_kick_vcpus(struct kvm *kvm)
{
        unsigned long idx;
        struct kvm_vcpu *vcpu;

        /*
         * We've injected an interrupt, time to find out who deserves
         * a good kick...
         */
        kvm_for_each_vcpu(idx, vcpu, kvm) {
                if (!kvm_vgic_vcpu_pending_irq(vcpu))
                        continue;

                kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
                kvm_vcpu_kick(vcpu);
        }
}

/*
 * Tell whether virtual interrupt @vintid of @vcpu is both mapped to a
 * physical interrupt and currently active. Returns false if the vgic is
 * not initialized yet.
 */
bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid)
{
        unsigned long flags;
        struct vgic_irq *irq;
        bool active_mapping;

        if (!vgic_initialized(vcpu->kvm))
                return false;

        irq = vgic_get_vcpu_irq(vcpu, vintid);

        raw_spin_lock_irqsave(&irq->irq_lock, flags);
        active_mapping = irq->hw && irq->active;
        raw_spin_unlock_irqrestore(&irq->irq_lock, flags);

        vgic_put_irq(vcpu->kvm, irq);

        return active_mapping;
}

/*
 * Level-triggered mapped IRQs are special because we only observe rising
 * edges as input to the VGIC.
 *
 * If the guest never acked the interrupt, the physical line has to be
 * sampled and the line level updated, because the device state could have
 * changed or the still pending interrupt simply needs processing later.
 *
 * The guest could also have been entered with the interrupt
 * active+pending. On the next exit the pending state is re-evaluated, as
 * injecting a now potentially stale pending state could otherwise result
 * in a spurious interrupt.
 *
 * If this lowers the level, the physical active state has to be cleared
 * too, since we would otherwise never be told when the interrupt becomes
 * asserted again.
 *
 * Another case is when the interrupt requires a helping hand on
 * deactivation (no HW deactivation, for example).
 */
void vgic_irq_handle_resampling(struct vgic_irq *irq,
                                bool lr_deactivated, bool lr_pending)
{
        bool resample;

        /* Only level-triggered mapped interrupts are handled here. */
        if (!vgic_irq_is_mapped_level(irq))
                return;

        if (unlikely(vgic_irq_needs_resampling(irq))) {
                resample = !irq->active && !irq->pending_latch;
        } else if (lr_pending || (lr_deactivated && irq->line_level)) {
                irq->line_level = vgic_get_phys_line_level(irq);
                resample = !irq->line_level;
        } else {
                resample = false;
        }

        if (resample)
                vgic_irq_set_phys_active(irq, false);
}






















































































    4 












    4 




    4 






























    3 
























    4 







    4 
























































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
// SPDX-License-Identifier: GPL-2.0-or-later

#include <linux/syscalls.h>
#include <linux/time_namespace.h>

#include "futex.h"

/*
 * Support for robust futexes: the kernel cleans up held futexes at
 * thread exit time.
 *
 * Implementation: user-space maintains a per-thread list of locks it
 * is holding. Upon do_exit(), the kernel carefully walks this list,
 * and marks all locks that are owned by this thread with the
 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
 * always manipulated with the lock held, so the list is private and
 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
 * field, to allow the kernel to clean up if the thread dies after
 * acquiring the lock, but just before it could have added itself to
 * the list. There can only be one such pending lock.
 */

/**
 * sys_set_robust_list() - Set the robust-futex list head of a task
 * @head:        userspace pointer to the list head to register
 * @len:         size of the list head as seen by userspace
 *
 * Records @head as the robust list of the calling task (current).
 *
 * Return: 0 on success, -EINVAL if @len differs from sizeof(*head).
 */
SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
                size_t, len)
{
        /* Exactly one list-head size is understood at the moment. */
        if (unlikely(sizeof(*head) != len))
                return -EINVAL;

        current->robust_list = head;
        return 0;
}

/**
 * sys_get_robust_list() - Get the robust-futex list head of a task
 * @pid:         pid of the task to query [zero for the current task]
 * @head_ptr:    userspace location that receives the list-head pointer
 * @len_ptr:     userspace location that receives the list-head size
 *
 * The task lookup, the ptrace_may_access() check and the read of the
 * task's robust_list pointer all happen inside one RCU read-side section;
 * the results are copied out to userspace after that section has ended.
 *
 * Return: -ESRCH if @pid names no task, -EPERM if the caller fails the
 * PTRACE_MODE_READ_REALCREDS access check, -EFAULT if the size cannot be
 * stored through @len_ptr, otherwise the result of storing the pointer
 * through @head_ptr.
 */
SYSCALL_DEFINE3(get_robust_list, int, pid,
                struct robust_list_head __user * __user *, head_ptr,
                size_t __user *, len_ptr)
{
        struct robust_list_head __user *list_head;
        struct task_struct *task = current;

        rcu_read_lock();

        if (pid) {
                task = find_task_by_vpid(pid);
                if (!task) {
                        rcu_read_unlock();
                        return -ESRCH;
                }
        }

        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                return -EPERM;
        }

        list_head = task->robust_list;
        rcu_read_unlock();

        /* Report the size first, then the pointer itself. */
        if (put_user(sizeof(*list_head), len_ptr))
                return -EFAULT;

        return put_user(list_head, head_ptr);
}

/*
 * do_futex - dispatch a futex operation to its implementation
 * @uaddr:      first futex word
 * @op:         operation word; the command is op & FUTEX_CMD_MASK and the
 *              internal flags are derived with futex_to_flags()
 * @val:        command specific value
 * @timeout:    timeout for the waiting/locking commands, may be NULL
 * @uaddr2:     second futex word for the requeue and wake-op commands
 * @val2:       second command specific value
 * @val3:       third command specific value; for FUTEX_WAIT, FUTEX_WAKE and
 *              FUTEX_WAIT_REQUEUE_PI it is ignored and
 *              FUTEX_BITSET_MATCH_ANY is used in its place
 *
 * Returns whatever the selected handler returns, or -ENOSYS when the
 * command is unknown or when FLAGS_CLOCKRT is set for a command other than
 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI or FUTEX_LOCK_PI2.
 */
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
{
        const int cmd = op & FUTEX_CMD_MASK;
        unsigned int flags = futex_to_flags(op);

        /* A caller-selected realtime clock is valid for three commands only. */
        if ((flags & FLAGS_CLOCKRT) &&
            cmd != FUTEX_WAIT_BITSET &&
            cmd != FUTEX_WAIT_REQUEUE_PI &&
            cmd != FUTEX_LOCK_PI2)
                return -ENOSYS;

        switch (cmd) {
        case FUTEX_WAIT:
                return futex_wait(uaddr, flags, val, timeout,
                                  FUTEX_BITSET_MATCH_ANY);
        case FUTEX_WAIT_BITSET:
                return futex_wait(uaddr, flags, val, timeout, val3);
        case FUTEX_WAKE:
                return futex_wake(uaddr, flags, val, FUTEX_BITSET_MATCH_ANY);
        case FUTEX_WAKE_BITSET:
                return futex_wake(uaddr, flags, val, val3);
        case FUTEX_REQUEUE:
                return futex_requeue(uaddr, flags, uaddr2, flags, val, val2,
                                     NULL, 0);
        case FUTEX_CMP_REQUEUE:
                return futex_requeue(uaddr, flags, uaddr2, flags, val, val2,
                                     &val3, 0);
        case FUTEX_WAKE_OP:
                return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
        case FUTEX_LOCK_PI:
                /* The legacy PI lock command always runs with FLAGS_CLOCKRT. */
                return futex_lock_pi(uaddr, flags | FLAGS_CLOCKRT, timeout, 0);
        case FUTEX_LOCK_PI2:
                return futex_lock_pi(uaddr, flags, timeout, 0);
        case FUTEX_UNLOCK_PI:
                return futex_unlock_pi(uaddr, flags);
        case FUTEX_TRYLOCK_PI:
                return futex_lock_pi(uaddr, flags, NULL, 1);
        case FUTEX_WAIT_REQUEUE_PI:
                return futex_wait_requeue_pi(uaddr, flags, val, timeout,
                                             FUTEX_BITSET_MATCH_ANY, uaddr2);
        case FUTEX_CMP_REQUEUE_PI:
                return futex_requeue(uaddr, flags, uaddr2, flags, val, val2,
                                     &val3, 1);
        default:
                return -ENOSYS;
        }
}

/*
 * Tell whether @cmd is one of the commands for which the futex syscalls in
 * this file parse their timespec argument as a timeout: FUTEX_WAIT,
 * FUTEX_LOCK_PI, FUTEX_LOCK_PI2, FUTEX_WAIT_BITSET and
 * FUTEX_WAIT_REQUEUE_PI.
 */
static __always_inline bool futex_cmd_has_timeout(u32 cmd)
{
        return cmd == FUTEX_WAIT ||
               cmd == FUTEX_LOCK_PI ||
               cmd == FUTEX_LOCK_PI2 ||
               cmd == FUTEX_WAIT_BITSET ||
               cmd == FUTEX_WAIT_REQUEUE_PI;
}

/*
 * Convert a timespec supplied by userspace into the ktime_t used for a
 * futex timeout.
 * @cmd:        futex command (already masked with FUTEX_CMD_MASK by callers)
 * @op:         operation word, inspected for FUTEX_CLOCK_REALTIME
 * @ts:         timespec to convert
 * @t:          receives the converted value; untouched on failure
 *
 * Returns 0 on success or -EINVAL when @ts fails timespec64_valid().
 */
static __always_inline int
futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
{
        ktime_t expiry;

        if (!timespec64_valid(ts))
                return -EINVAL;

        expiry = timespec64_to_ktime(*ts);

        if (cmd == FUTEX_WAIT) {
                /* The value is an offset from the current ktime_get() reading. */
                expiry = ktime_add_safe(ktime_get(), expiry);
        } else if (cmd != FUTEX_LOCK_PI && (op & FUTEX_CLOCK_REALTIME) == 0) {
                /*
                 * CLOCK_MONOTONIC based value: pass it through
                 * timens_ktime_to_host(). FUTEX_LOCK_PI is skipped here;
                 * do_futex() always runs it with FLAGS_CLOCKRT.
                 */
                expiry = timens_ktime_to_host(CLOCK_MONOTONIC, expiry);
        }

        *t = expiry;
        return 0;
}

/*
 * sys_futex - entry point of the futex() system call
 *
 * When @utime is set and the command takes a timeout, the timespec is
 * copied in and converted with futex_init_timeout(). In every case the raw
 * @utime value is additionally handed to do_futex(), cast to an integer,
 * in its val2 argument.
 *
 * Returns -EFAULT if should_fail_futex() fires or the timespec cannot be
 * read, the futex_init_timeout() error if conversion fails, otherwise the
 * result of do_futex().
 */
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                const struct __kernel_timespec __user *, utime,
                u32 __user *, uaddr2, u32, val3)
{
        const int cmd = op & FUTEX_CMD_MASK;
        struct timespec64 ts;
        ktime_t expires;
        ktime_t *timeout = NULL;

        if (utime && futex_cmd_has_timeout(cmd)) {
                int err;

                if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
                        return -EFAULT;

                if (get_timespec64(&ts, utime))
                        return -EFAULT;

                err = futex_init_timeout(cmd, op, &ts, &expires);
                if (err)
                        return err;

                timeout = &expires;
        }

        return do_futex(uaddr, op, val, timeout, uaddr2,
                        (unsigned long)utime, val3);
}

/**
 * futex_parse_waitv - Parse a waitv array from userspace
 * @futexv:      Kernel side array of waiters to be filled
 * @uwaitv:      Userspace array to be parsed
 * @nr_futexes:  Number of entries to read from @uwaitv into @futexv
 * @wake:        Wake handler stored in each entry's futex_q
 * @wake_data:   Data for the wake handler, stored next to it
 *
 * Entries are processed in order and parsing stops at the first one that
 * fails.
 *
 * Return: 0 on success, -EFAULT if an entry cannot be copied from
 * userspace, -EINVAL if an entry has flag bits outside FUTEX2_VALID_MASK,
 * a non-zero __reserved field, or flags/val that fail validation.
 */
int futex_parse_waitv(struct futex_vector *futexv,
                      struct futex_waitv __user *uwaitv,
                      unsigned int nr_futexes, futex_wake_fn *wake,
                      void *wake_data)
{
        unsigned int i;

        for (i = 0; i < nr_futexes; i++) {
                struct futex_vector *slot = &futexv[i];
                struct futex_waitv entry;
                unsigned int kflags;

                if (copy_from_user(&entry, uwaitv + i, sizeof(entry)))
                        return -EFAULT;

                if (entry.__reserved || (entry.flags & ~FUTEX2_VALID_MASK))
                        return -EINVAL;

                /* Translate the futex2 flags, then validate flags and value. */
                kflags = futex2_to_flags(entry.flags);
                if (!futex_flags_valid(kflags) ||
                    !futex_validate_input(kflags, entry.val))
                        return -EINVAL;

                /* Start from the template futex_q, then install the handler. */
                slot->q = futex_q_init;
                slot->q.wake = wake;
                slot->q.wake_data = wake_data;

                slot->w.flags = kflags;
                slot->w.val = entry.val;
                slot->w.uaddr = entry.uaddr;
        }

        return 0;
}

/*
 * Read the userspace timespec @timeout and arm the sleeper @to with it.
 *
 * A NULL @timeout is accepted and leaves @to untouched. @clockid must be
 * CLOCK_REALTIME or CLOCK_MONOTONIC.
 *
 * Returns 0 on success (or when @timeout is NULL), -EINVAL for any other
 * clock, -EFAULT if the timespec cannot be read, or the error from
 * futex_init_timeout().
 */
static int futex2_setup_timeout(struct __kernel_timespec __user *timeout,
                                clockid_t clockid, struct hrtimer_sleeper *to)
{
        int timer_flags = 0, init_op = 0;
        struct timespec64 ts;
        ktime_t expiry;
        int err;

        if (!timeout)
                return 0;

        switch (clockid) {
        case CLOCK_REALTIME:
                timer_flags = FLAGS_CLOCKRT;
                init_op = FUTEX_CLOCK_REALTIME;
                break;
        case CLOCK_MONOTONIC:
                break;
        default:
                return -EINVAL;
        }

        if (get_timespec64(&ts, timeout))
                return -EFAULT;

        /*
         * Since there's no opcode for the futex2 calls, reuse
         * FUTEX_WAIT_BITSET, which takes an absolute timeout as well.
         */
        err = futex_init_timeout(FUTEX_WAIT_BITSET, init_op, &ts, &expiry);
        if (err)
                return err;

        futex_setup_timer(&expiry, to, timer_flags, 0);
        return 0;
}

/*
 * Tear down a timeout sleeper: cancel the hrtimer embedded in @to, then
 * call destroy_hrtimer_on_stack() on it. The callers in this file pass a
 * sleeper that lives on their own stack.
 */
static inline void futex2_destroy_timeout(struct hrtimer_sleeper *to)
{
        hrtimer_cancel(&to->timer);
        destroy_hrtimer_on_stack(&to->timer);
}

/**
 * sys_futex_waitv - Wait on a list of futexes
 * @waiters:     Array of futexes to wait on
 * @nr_futexes:  Number of entries in @waiters (1..FUTEX_WAITV_MAX)
 * @flags:       Must be zero; this syscall supports no flags for now
 * @timeout:     Optional absolute timeout.
 * @clockid:     Clock to be used for the timeout, realtime or monotonic.
 *
 * Given an array of `struct futex_waitv`, wait on each uaddr. The thread wakes
 * if a futex_wake() is performed at any uaddr. The syscall returns immediately
 * if any waiter has *uaddr != val. *timeout is an optional timeout value for
 * the operation. Each waiter has individual flags. The clock used for the
 * timeout is selected with @clockid. Flags for private futexes, sizes, etc.
 * should be used on the individual flags of each waiter.
 *
 * Returns the array index of one of the woken futexes. No further information
 * is provided: any number of other futexes may also have been woken by the
 * same event, and if more than one futex was woken, the returned index may
 * refer to any one of them. (It is not necessarily the futex with the
 * smallest index, nor the one most recently woken, nor...)
 *
 * On failure: -EINVAL for non-zero @flags, a NULL @waiters or an
 * out-of-range @nr_futexes, -ENOMEM if the kernel-side array cannot be
 * allocated, or the error reported by timeout setup, parsing or waiting.
 */
SYSCALL_DEFINE5(futex_waitv, struct futex_waitv __user *, waiters,
                unsigned int, nr_futexes, unsigned int, flags,
                struct __kernel_timespec __user *, timeout, clockid_t, clockid)
{
        struct hrtimer_sleeper to;
        struct hrtimer_sleeper *sleeper = NULL;
        struct futex_vector *futexv;
        int ret;

        /* This syscall supports no flags for now */
        if (flags)
                return -EINVAL;

        if (!nr_futexes || nr_futexes > FUTEX_WAITV_MAX || !waiters)
                return -EINVAL;

        if (timeout) {
                ret = futex2_setup_timeout(timeout, clockid, &to);
                if (ret)
                        return ret;
                sleeper = &to;
        }

        futexv = kcalloc(nr_futexes, sizeof(*futexv), GFP_KERNEL);
        if (futexv) {
                ret = futex_parse_waitv(futexv, waiters, nr_futexes,
                                        futex_wake_mark, NULL);
                if (!ret)
                        ret = futex_wait_multiple(futexv, nr_futexes, sleeper);

                kfree(futexv);
        } else {
                ret = -ENOMEM;
        }

        /* The timer was armed above, so it is torn down on every path. */
        if (sleeper)
                futex2_destroy_timeout(sleeper);

        return ret;
}

/*
 * sys_futex_wake - Wake a number of futexes
 * @uaddr:       Address of the futex(es) to wake
 * @mask:        bitmask
 * @nr:          Number of the futexes to wake
 * @flags:       FUTEX2 flags
 *
 * Identical to the traditional FUTEX_WAKE_BITSET op, except it is part of the
 * futex2 family of calls. The wake is issued with FLAGS_STRICT added to
 * the translated flags.
 *
 * Returns -EINVAL if @flags has bits outside FUTEX2_VALID_MASK or if the
 * translated flags or @mask fail validation, otherwise the result of
 * futex_wake().
 */
SYSCALL_DEFINE4(futex_wake,
                void __user *, uaddr,
                unsigned long, mask,
                int, nr,
                unsigned int, flags)
{
        unsigned int kflags;

        if (flags & ~FUTEX2_VALID_MASK)
                return -EINVAL;

        kflags = futex2_to_flags(flags);
        if (!futex_flags_valid(kflags) ||
            !futex_validate_input(kflags, mask))
                return -EINVAL;

        return futex_wake(uaddr, kflags | FLAGS_STRICT, nr, mask);
}

/*
 * sys_futex_wait - Wait on a futex
 * @uaddr:       Address of the futex to wait on
 * @val:         Value of @uaddr
 * @mask:        bitmask
 * @flags:       FUTEX2 flags
 * @timeout:     Optional absolute timeout
 * @clockid:     Clock to be used for the timeout, realtime or monotonic
 *
 * Identical to the traditional FUTEX_WAIT_BITSET op, except it is part of the
 * futex2 family of calls.
 *
 * Returns -EINVAL if @flags has bits outside FUTEX2_VALID_MASK or if the
 * translated flags, @val or @mask fail validation, the error from timeout
 * setup if that fails, otherwise the result of __futex_wait().
 */
SYSCALL_DEFINE6(futex_wait,
                void __user *, uaddr,
                unsigned long, val,
                unsigned long, mask,
                unsigned int, flags,
                struct __kernel_timespec __user *, timeout,
                clockid_t, clockid)
{
        struct hrtimer_sleeper to;
        struct hrtimer_sleeper *sleeper = NULL;
        unsigned int kflags;
        int ret;

        if (flags & ~FUTEX2_VALID_MASK)
                return -EINVAL;

        kflags = futex2_to_flags(flags);
        if (!futex_flags_valid(kflags) ||
            !futex_validate_input(kflags, val) ||
            !futex_validate_input(kflags, mask))
                return -EINVAL;

        if (timeout) {
                ret = futex2_setup_timeout(timeout, clockid, &to);
                if (ret)
                        return ret;
                sleeper = &to;
        }

        ret = __futex_wait(uaddr, kflags, val, sleeper, mask);

        /* Only tear the timer down if one was set up above. */
        if (sleeper)
                futex2_destroy_timeout(sleeper);

        return ret;
}

/*
 * sys_futex_requeue - Requeue a waiter from one futex to another
 * @waiters:     array of two entries: [0] describes the source futex,
 *               [1] the destination futex
 * @flags:       must be zero
 * @nr_wake:     number of futexes to wake
 * @nr_requeue:  number of futexes to requeue
 *
 * Identical to the traditional FUTEX_CMP_REQUEUE op, except it is part of the
 * futex2 family of calls. The value compared against is the val field of
 * the source entry.
 *
 * Returns -EINVAL for non-zero @flags or a NULL @waiters, the error from
 * futex_parse_waitv() if the entries are rejected, otherwise the result of
 * futex_requeue().
 */
SYSCALL_DEFINE4(futex_requeue,
                struct futex_waitv __user *, waiters,
                unsigned int, flags,
                int, nr_wake,
                int, nr_requeue)
{
        struct futex_vector futexes[2];
        struct futex_vector *src = &futexes[0];
        struct futex_vector *dst = &futexes[1];
        u32 cmpval;
        int ret;

        if (flags || !waiters)
                return -EINVAL;

        ret = futex_parse_waitv(futexes, waiters, 2, futex_wake_mark, NULL);
        if (ret)
                return ret;

        cmpval = src->w.val;

        return futex_requeue(u64_to_user_ptr(src->w.uaddr), src->w.flags,
                             u64_to_user_ptr(dst->w.uaddr), dst->w.flags,
                             nr_wake, nr_requeue, &cmpval, 0);
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant of set_robust_list(): records @head as the calling task's
 * compat_robust_list.
 *
 * Returns 0 on success, -EINVAL if @len differs from the size of
 * struct compat_robust_list_head.
 */
COMPAT_SYSCALL_DEFINE2(set_robust_list,
                struct compat_robust_list_head __user *, head,
                compat_size_t, len)
{
        /* Only the one compat list-head size is accepted. */
        if (unlikely(sizeof(*head) != len))
                return -EINVAL;

        current->compat_robust_list = head;
        return 0;
}

/*
 * Compat variant of get_robust_list(): reports the compat_robust_list
 * pointer of the task named by @pid (the current task when @pid is zero).
 *
 * The lookup, the ptrace_may_access() check and the pointer read happen
 * inside one RCU read-side section. The pointer is converted with
 * ptr_to_compat() before it is stored through @head_ptr.
 *
 * Returns -ESRCH if @pid names no task, -EPERM if the caller fails the
 * PTRACE_MODE_READ_REALCREDS access check, -EFAULT if the size cannot be
 * stored through @len_ptr, otherwise the result of storing the pointer.
 */
COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
                        compat_uptr_t __user *, head_ptr,
                        compat_size_t __user *, len_ptr)
{
        struct compat_robust_list_head __user *list_head;
        struct task_struct *task = current;

        rcu_read_lock();

        if (pid) {
                task = find_task_by_vpid(pid);
                if (!task) {
                        rcu_read_unlock();
                        return -ESRCH;
                }
        }

        if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
                rcu_read_unlock();
                return -EPERM;
        }

        list_head = task->compat_robust_list;
        rcu_read_unlock();

        /* Report the size first, then the (compat) pointer. */
        if (put_user(sizeof(*list_head), len_ptr))
                return -EFAULT;

        return put_user(ptr_to_compat(list_head), head_ptr);
}
#endif /* CONFIG_COMPAT */

#ifdef CONFIG_COMPAT_32BIT_TIME
/*
 * sys_futex_time32 - futex() entry point taking a struct old_timespec32
 *
 * When @utime is set and the command takes a timeout, the timespec is read
 * with get_old_timespec32() and converted with futex_init_timeout(). In
 * every case the raw @utime value is additionally handed to do_futex(),
 * cast to an integer, in its val2 argument.
 *
 * Returns -EFAULT if the timespec cannot be read, the
 * futex_init_timeout() error if conversion fails, otherwise the result of
 * do_futex().
 */
SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
                const struct old_timespec32 __user *, utime, u32 __user *, uaddr2,
                u32, val3)
{
        const int cmd = op & FUTEX_CMD_MASK;
        struct timespec64 ts;
        ktime_t expires;
        ktime_t *timeout = NULL;

        if (utime && futex_cmd_has_timeout(cmd)) {
                int err;

                if (get_old_timespec32(&ts, utime))
                        return -EFAULT;

                err = futex_init_timeout(cmd, op, &ts, &expires);
                if (err)
                        return err;

                timeout = &expires;
        }

        return do_futex(uaddr, op, val, timeout, uaddr2,
                        (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */





































































































































































































































  383 



  302 








  373 








   22 



















   18 

























  384 
  370 


   18 























































































































































 1257 
















 1251 


    1 




  363 




 1255 









  387 









 1217 

  383 







   54 
 1256 
    3 
   22 




 1257 
    1 
    1 



 1260 















 1258 
    3 
   11 




 1259 





 1264 
   23 
 1260 



 1261 






















   21 









   21 
















 1279 


 1282 




   21 

















 1281 

 1282 

 1254 
 1066 









































































 1281 



 1285 

























































    1 








































    1 











    1 







    1 









































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 















































































































































































































































































































































































































































































































































































































































































































































    4 







































    3 

















































    1 




















 1280 

























 1283 

























 1284 






 1277 





 1280 

   53 





   54 
 1282 









 1277 






















































 1281 

 1258 







 1274 
 1258 










































































 1279 





















 1285 







 1255 

 1261 










  366 

 1257 
















   21 






   21 



   21 








 1280 



    4 

    4 























 1257 

 1260 

   22 
























   22 







   22 





















 1231 




































































  287 


























































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/lib/vsprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/* vsprintf.c -- Lars Wirzenius & Linus Torvalds. */
/*
 * Wirzenius wrote this portably, Torvalds fucked it up :-)
 */

/*
 * Fri Jul 13 2001 Crutcher Dunnavant <crutcher+kernel@datastacks.com>
 * - changed to provide snprintf and vsnprintf functions
 * So Feb  1 16:51:32 CET 2004 Juergen Quade <quade@hsnr.de>
 * - scnprintf and vscnprintf
 */

#include <linux/stdarg.h>
#include <linux/build_bug.h>
#include <linux/clk.h>
#include <linux/clk-provider.h>
#include <linux/errname.h>
#include <linux/module.h>        /* for KSYM_SYMBOL_LEN */
#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/kernel.h>
#include <linux/kallsyms.h>
#include <linux/math64.h>
#include <linux/uaccess.h>
#include <linux/ioport.h>
#include <linux/dcache.h>
#include <linux/cred.h>
#include <linux/rtc.h>
#include <linux/sprintf.h>
#include <linux/time.h>
#include <linux/uuid.h>
#include <linux/of.h>
#include <net/addrconf.h>
#include <linux/siphash.h>
#include <linux/compiler.h>
#include <linux/property.h>
#include <linux/notifier.h>
#ifdef CONFIG_BLOCK
#include <linux/blkdev.h>
#endif

#include "../mm/internal.h"        /* For the trace_print_flags arrays */

#include <asm/page.h>                /* for PAGE_SIZE */
#include <asm/byteorder.h>        /* cpu_to_le16 */
#include <linux/unaligned.h>

#include <linux/string_helpers.h>
#include "kstrtox.h"

/*
 * Disable pointer hashing if requested.
 *
 * When true, default_pointer() prints the raw pointer value via
 * pointer_string() instead of going through ptr_to_id().
 */
bool no_hash_pointers __ro_after_init;
EXPORT_SYMBOL_GPL(no_hash_pointers);

/*
 * Parse an unsigned number out of at most @max_chars characters of @startp.
 *
 * @endp, when non-NULL, receives the position just past the consumed text.
 * The overflow flag reported by the digit parser is discarded; only the
 * number of consumed characters is used.
 */
noinline
static unsigned long long simple_strntoull(const char *startp, char **endp, unsigned int base, size_t max_chars)
{
        unsigned long long value = 0ULL;
        const char *digits;
        const char *stop;
        size_t prefix_len;

        /* NOTE(review): presumably steps over a radix prefix and may update base. */
        digits = _parse_integer_fixup_radix(startp, &base);
        prefix_len = digits - startp;

        if (prefix_len >= max_chars) {
                /* Field too short for prefix + digit, skip over without converting */
                stop = startp + max_chars;
        } else {
                unsigned int consumed;

                consumed = _parse_integer_limit(digits, base, &value,
                                                max_chars - prefix_len);
                /* FIXME */
                stop = digits + (consumed & ~KSTRTOX_OVERFLOW);
        }

        if (endp)
                *endp = (char *)stop;

        return value;
}

/**
 * simple_strtoull - convert a string to an unsigned long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 *        (may be NULL, in which case nothing is stored)
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtoull instead.
 *
 * Thin wrapper around simple_strntoull() with the character limit set
 * to INT_MAX. The overflow indication of the underlying parser is
 * dropped there, so overflow is not reported to the caller.
 *
 * Return: the value produced by simple_strntoull()
 */
noinline
unsigned long long simple_strtoull(const char *cp, char **endp, unsigned int base)
{
        return simple_strntoull(cp, endp, base, INT_MAX);
}
EXPORT_SYMBOL(simple_strtoull);

/**
 * simple_strtoul - convert a string to an unsigned long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtoul instead.
 *
 * Return: the result of simple_strtoull() converted to unsigned long;
 * where unsigned long is narrower than unsigned long long the upper
 * bits are lost by that conversion.
 */
unsigned long simple_strtoul(const char *cp, char **endp, unsigned int base)
{
        return simple_strtoull(cp, endp, base);
}
EXPORT_SYMBOL(simple_strtoul);

/*
 * simple_strntoul - length-limited variant of simple_strtoul()
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 * @max_chars: The maximum number of characters to parse
 *
 * Forwards to simple_strntoull() and converts the result to
 * unsigned long.
 */
unsigned long simple_strntoul(const char *cp, char **endp, unsigned int base,
                              size_t max_chars)
{
        return simple_strntoull(cp, endp, base, max_chars);
}
EXPORT_SYMBOL(simple_strntoul);

/**
 * simple_strtol - convert a string to a signed long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtol instead.
 *
 * Only a leading '-' is treated as a sign; the remainder is parsed by
 * simple_strtoul() and the unsigned result is negated.
 */
long simple_strtol(const char *cp, char **endp, unsigned int base)
{
        unsigned long magnitude;

        /* No minus sign: the unsigned parse result is returned as is. */
        if (*cp != '-')
                return simple_strtoul(cp, endp, base);

        /* Step over the '-' and negate in unsigned arithmetic. */
        magnitude = simple_strtoul(cp + 1, endp, base);
        return -magnitude;
}
EXPORT_SYMBOL(simple_strtol);

/*
 * Signed, length-limited parse built on simple_strntoull().
 *
 * A leading '-' uses up one character of the @max_chars budget. With
 * cp[0] == '-' and max_chars == 1 the unsigned parser is handed a budget
 * of 0, which it handles safely. With max_chars == 0 the sign is not
 * honoured and the call is passed straight through.
 */
noinline
static long long simple_strntoll(const char *cp, char **endp, unsigned int base, size_t max_chars)
{
        unsigned long long magnitude;

        if (*cp != '-' || max_chars == 0)
                return simple_strntoull(cp, endp, base, max_chars);

        magnitude = simple_strntoull(cp + 1, endp, base, max_chars - 1);
        return -magnitude;
}

/**
 * simple_strtoll - convert a string to a signed long long
 * @cp: The start of the string
 * @endp: A pointer to the end of the parsed string will be placed here
 * @base: The number base to use
 *
 * This function has caveats. Please use kstrtoll instead.
 *
 * Thin wrapper around simple_strntoll() with the character limit set
 * to INT_MAX.
 *
 * Return: the value produced by simple_strntoll()
 */
long long simple_strtoll(const char *cp, char **endp, unsigned int base)
{
        return simple_strntoll(cp, endp, base, INT_MAX);
}
EXPORT_SYMBOL(simple_strtoll);

/*
 * Convert the run of decimal digits at *s to an int and advance *s past it.
 * The first character is consumed unconditionally, so the caller must
 * already know that it is a digit.
 */
static inline int skip_atoi(const char **s)
{
        int value = 0;

        do {
                value = value*10 + **s - '0';
                ++*s;
        } while (isdigit(**s));

        return value;
}

/*
 * Decimal conversion is by far the most typical, and is used for
 * /proc and /sys data. This directly impacts e.g. top performance
 * with many processes running. We optimize it for speed by emitting
 * two characters at a time, using a 200 byte lookup table. This
 * roughly halves the number of multiplications compared to computing
 * the digits one at a time. Implementation strongly inspired by the
 * previous version, which in turn used ideas described at
 * <http://www.cs.uiowa.edu/~jones/bcd/divide.html> (with permission
 * from the author, Douglas W. Jones).
 *
 * It turns out there is precisely one 26 bit fixed-point
 * approximation a of 64/100 for which x/100 == (x * (u64)a) >> 32
 * holds for all x in [0, 10^8-1], namely a = 0x28f5c29. The actual
 * range happens to be somewhat larger (x <= 1073741898), but that's
 * irrelevant for our purpose.
 *
 * For dividing a number in the range [10^4, 10^6-1] by 100, we still
 * need a 32x32->64 bit multiply, so we simply use the same constant.
 *
 * For dividing a number in the range [100, 10^4-1] by 100, there are
 * several options. The simplest is (x * 0x147b) >> 19, which is valid
 * for all x <= 43698.
 */

/*
 * decpair[x] holds the two ASCII digits of x (0 <= x <= 99) packed into
 * one u16: the low byte is '0' + x % 10 and the high byte is
 * '0' + x / 10 (0x3030 adds '0' to both bytes). The value goes through
 * cpu_to_le16(), so that a u16 store of an entry places the ones digit
 * at the lower address and the tens digit right after it.
 */
static const u16 decpair[100] = {
#define _(x) (__force u16) cpu_to_le16(((x % 10) | ((x / 10) << 8)) + 0x3030)
        _( 0), _( 1), _( 2), _( 3), _( 4), _( 5), _( 6), _( 7), _( 8), _( 9),
        _(10), _(11), _(12), _(13), _(14), _(15), _(16), _(17), _(18), _(19),
        _(20), _(21), _(22), _(23), _(24), _(25), _(26), _(27), _(28), _(29),
        _(30), _(31), _(32), _(33), _(34), _(35), _(36), _(37), _(38), _(39),
        _(40), _(41), _(42), _(43), _(44), _(45), _(46), _(47), _(48), _(49),
        _(50), _(51), _(52), _(53), _(54), _(55), _(56), _(57), _(58), _(59),
        _(60), _(61), _(62), _(63), _(64), _(65), _(66), _(67), _(68), _(69),
        _(70), _(71), _(72), _(73), _(74), _(75), _(76), _(77), _(78), _(79),
        _(80), _(81), _(82), _(83), _(84), _(85), _(86), _(87), _(88), _(89),
        _(90), _(91), _(92), _(93), _(94), _(95), _(96), _(97), _(98), _(99),
#undef _
};

/*
 * Write the decimal digits of r (r < 10^8) to buf, least significant
 * digit first and without leading zeros, and return the position just
 * past the last digit that was accounted for.
 *
 * This will print a single '0' even if r == 0, since we would
 * immediately jump to out_r where two 0s would be written but only
 * one of them accounted for in buf. This is needed by ip4_string
 * below. All other callers pass a non-zero value of r.
 *
 * Digits are stored in pairs through u16 stores (hence the alignment
 * requirement on buf), so the final store may touch one byte beyond
 * the returned position.
 */
static noinline_for_stack
char *put_dec_trunc8(char *buf, unsigned r)
{
        unsigned q;

        /* 1 <= r < 10^8 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^8 */
        /* q = r / 100 via fixed-point multiply; r - 100*q is r % 100 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 1 <= q < 10^6 */
        if (q < 100)
                goto out_q;

        /*  100 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 1 <= r < 10^4 */
        if (r < 100)
                goto out_r;

        /* 100 <= r < 10^4 */
        /* cheaper 32-bit approximation of r / 100, valid in this range */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
out_q:
        /* 1 <= q < 100 */
        r = q;
out_r:
        /* 1 <= r < 100 */
        /* two bytes are stored, but only one is counted when r has one digit */
        *((u16 *)buf) = decpair[r];
        buf += r < 10 ? 1 : 2;
        return buf;
}

#if BITS_PER_LONG == 64 && BITS_PER_LONG_LONG == 64
/*
 * Write exactly 8 decimal digits of r (0 <= r < 10^8) to buf, least
 * significant digit first, including leading zeros. Each step peels off
 * two digits with a fixed-point division by 100 and stores them as one
 * decpair[] entry. Returns buf + 8.
 */
static noinline_for_stack
char *put_dec_full8(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^8 */
        q = (r * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 10^6 */
        r = (q * (u64)0x28f5c29) >> 32;
        *((u16 *)buf) = decpair[q - 100*r];
        buf += 2;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;

        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
        buf += 2;
        return buf;
}

/*
 * 64-bit variant: write the decimal digits of n to buf, least
 * significant digit first, and return the position past the last digit.
 *
 * n is reduced in chunks of 10^8: each do_div() is expected to leave the
 * quotient in n and yield the remainder (per its kernel definition),
 * which is emitted as a full 8-digit group. The remaining most
 * significant part is emitted without leading zeros.
 */
static noinline_for_stack
char *put_dec(char *buf, unsigned long long n)
{
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n <= 1.6e11 */
        if (n >= 100*1000*1000)
                buf = put_dec_full8(buf, do_div(n, 100*1000*1000));
        /* 1 <= n < 1e8 */
        return put_dec_trunc8(buf, n);
}

#elif BITS_PER_LONG == 32 && BITS_PER_LONG_LONG == 64

/*
 * Write exactly 4 decimal digits of r (0 <= r < 10^4) to buf, least
 * significant digit first, including leading zeros. buf is passed by
 * value, so the caller has to advance its own pointer by 4.
 */
static void
put_dec_full4(char *buf, unsigned r)
{
        unsigned q;

        /* 0 <= r < 10^4 */
        q = (r * 0x147b) >> 19;
        *((u16 *)buf) = decpair[r - 100*q];
        buf += 2;
        /* 0 <= q < 100 */
        *((u16 *)buf) = decpair[q];
}

/*
 * Call put_dec_full4 on x % 10000, return x / 10000.
 * The approximation x/10000 == (x * 0x346DC5D7) >> 43
 * holds for all x < 1,128,869,999.  The largest value this
 * helper will ever be asked to convert is 1,125,520,955.
 * (second call in the put_dec code, assuming n is all-ones).
 *
 * The four digits are written at buf; the returned quotient is the
 * carry that the caller adds into the next 4-digit group.
 */
static noinline_for_stack
unsigned put_dec_helper4(char *buf, unsigned x)
{
        uint32_t q = (x * (uint64_t)0x346DC5D7) >> 43;

        put_dec_full4(buf, x - q * 10000);
        return q;
}

/* Based on code by Douglas W. Jones found at
 * <http://www.cs.uiowa.edu/~jones/bcd/decimal.html#sixtyfour>
 * (with permission from the author).
 * Performs no 64-bit division and hence should be fast on 32-bit machines.
 *
 * 32-bit variant: write the decimal digits of n to buf, least
 * significant digit first, and return the position past the last digit.
 * n is split into four 16-bit pieces; the result is built in groups of
 * four decimal digits, each group receiving the carry of the previous.
 */
static
char *put_dec(char *buf, unsigned long long n)
{
        uint32_t d3, d2, d1, q, h;

        /* small values fit the single-word fast path */
        if (n < 100*1000*1000)
                return put_dec_trunc8(buf, n);

        d1  = ((uint32_t)n >> 16); /* implicit "& 0xffff" */
        h   = (n >> 32);
        d2  = (h      ) & 0xffff;
        d3  = (h >> 16); /* implicit "& 0xffff" */

        /* n = 2^48 d3 + 2^32 d2 + 2^16 d1 + d0
             = 281_4749_7671_0656 d3 + 42_9496_7296 d2 + 6_5536 d1 + d0 */
        /* decimal digits 0..3: lowest 4-digit group of each power of 2^16 */
        q   = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff);
        q = put_dec_helper4(buf, q);

        /* digits 4..7, plus carry from the previous group */
        q += 7671 * d3 + 9496 * d2 + 6 * d1;
        q = put_dec_helper4(buf+4, q);

        /* digits 8..11 */
        q += 4749 * d3 + 42 * d2;
        q = put_dec_helper4(buf+8, q);

        /* whatever is left forms the most significant digits */
        q += 281 * d3;
        buf += 12;
        if (q)
                buf = put_dec_trunc8(buf, q);
        /*
         * Nothing left: the last full group may end in padding zeros,
         * which are leading zeros of the number; step back over them.
         */
        else while (buf[-1] == '0')
                --buf;

        return buf;
}

#endif

/*
 * Convert passed number to decimal string.
 * Returns the length of string.  On buffer overflow, returns 0.
 *
 * If speed is not important, use snprintf(). It's easy to read the code.
 */
int num_to_str(char *buf, int size, unsigned long long num, unsigned int width)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[sizeof(num) * 3] __aligned(2);
        int idx, len;

        /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */
        if (num <= 9) {
                tmp[0] = '0' + num;
                len = 1;
        } else {
                len = put_dec(tmp, num) - tmp;
        }

        if (len > size || width > size)
                return 0;

        if (width > len) {
                width = width - len;
                for (idx = 0; idx < width; idx++)
                        buf[idx] = ' ';
        } else {
                width = 0;
        }

        for (idx = 0; idx < len; ++idx)
                buf[idx + width] = tmp[len - idx - 1];

        return len + width;
}

/* Bit values for printf_spec.flags, as interpreted by number() and the string helpers. */
#define SIGN        1                /* value is signed (otherwise unsigned) */
#define LEFT        2                /* left justified */
#define PLUS        4                /* show '+' for non-negative signed values */
#define SPACE        8                /* show ' ' instead of '+' when PLUS is not set */
#define ZEROPAD        16                /* pad with zero, must be 16 == '0' - ' ' */
#define SMALL        32                /* use lowercase in hex (must be 32 == 0x20) */
#define SPECIAL        64                /* prefix hex with "0x", octal with "0" */

/* number() relies on these identities: ' ' + ZEROPAD == '0' and 'X' | SMALL == 'x'. */
static_assert(ZEROPAD == ('0' - ' '));
static_assert(SMALL == ('a' ^ 'A'));

/*
 * Classification of one piece of a format string.
 * NOTE(review): the code that produces and consumes these values is not
 * part of this chunk; apart from the comment on FORMAT_STATE_NONE the
 * meaning of each value is only what its name suggests.
 */
enum format_state {
        FORMAT_STATE_NONE, /* Just a string part */
        FORMAT_STATE_NUM,
        FORMAT_STATE_WIDTH,
        FORMAT_STATE_PRECISION,
        FORMAT_STATE_CHAR,
        FORMAT_STATE_STR,
        FORMAT_STATE_PTR,
        FORMAT_STATE_PERCENT_CHAR,
        FORMAT_STATE_INVALID,
};

/*
 * Formatting parameters handed by value to the output helpers.
 * The struct is packed and asserted to be exactly 8 bytes.
 * Helpers in this file treat precision == -1 and field_width == -1 as
 * "not specified" (see error_string() and pointer_string()).
 */
struct printf_spec {
        unsigned char        flags;                /* flags to number() */
        unsigned char        base;                /* number base, 8, 10 or 16 only */
        short                precision;        /* # of digits/chars */
        int                field_width;        /* width of output field */
} __packed;
static_assert(sizeof(struct printf_spec) == 8);

/* Upper bounds for the two numeric fields; 2^23 - 1 and 2^15 - 1 respectively. */
#define FIELD_WIDTH_MAX ((1 << 23) - 1)
#define PRECISION_MAX ((1 << 15) - 1)

/*
 * Format the integer @num into [buf, end) according to @spec.
 *
 * Output order: leading spaces (only when neither ZEROPAD nor LEFT is
 * set), sign, "0x" / "0" prefix (SPECIAL), zero or space padding,
 * extra zeros required by the precision, the digits, and finally
 * trailing spaces (LEFT).
 *
 * Characters that would land at or beyond @end are not stored, but the
 * position still advances, so the returned pointer reflects the space
 * the complete output would need.
 */
static noinline_for_stack
char *number(char *buf, char *end, unsigned long long num,
             struct printf_spec spec)
{
        /* put_dec requires 2-byte alignment of the buffer. */
        char tmp[3 * sizeof(num)] __aligned(2);
        char sign;
        char locase;
        int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10);
        int i;
        bool is_zero = num == 0LL;
        int field_width = spec.field_width;
        int precision = spec.precision;

        /* locase = 0 or 0x20. ORing digits or letters with 'locase'
         * produces same digits or (maybe lowercased) letters */
        locase = (spec.flags & SMALL);
        /* left justification takes precedence over zero padding */
        if (spec.flags & LEFT)
                spec.flags &= ~ZEROPAD;
        sign = 0;
        /* pick the sign character and reserve one column for it */
        if (spec.flags & SIGN) {
                if ((signed long long)num < 0) {
                        sign = '-';
                        num = -(signed long long)num;
                        field_width--;
                } else if (spec.flags & PLUS) {
                        sign = '+';
                        field_width--;
                } else if (spec.flags & SPACE) {
                        sign = ' ';
                        field_width--;
                }
        }
        /* reserve room for "0x" (always) or the octal "0" (non-zero values only) */
        if (need_pfx) {
                if (spec.base == 16)
                        field_width -= 2;
                else if (!is_zero)
                        field_width--;
        }

        /* generate full string in tmp[], in reverse order */
        i = 0;
        if (num < spec.base)
                tmp[i++] = hex_asc_upper[num] | locase;
        else if (spec.base != 10) { /* 8 or 16 */
                int mask = spec.base - 1;
                int shift = 3;

                if (spec.base == 16)
                        shift = 4;
                do {
                        tmp[i++] = (hex_asc_upper[((unsigned char)num) & mask] | locase);
                        num >>= shift;
                } while (num);
        } else { /* base 10 */
                i = put_dec(tmp, num) - tmp;
        }

        /* printing 100 using %2d gives "100", not "00" */
        if (i > precision)
                precision = i;
        /* leading space padding */
        field_width -= precision;
        if (!(spec.flags & (ZEROPAD | LEFT))) {
                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
                }
        }
        /* sign */
        if (sign) {
                if (buf < end)
                        *buf = sign;
                ++buf;
        }
        /* "0x" / "0" prefix */
        if (need_pfx) {
                if (spec.base == 16 || !is_zero) {
                        if (buf < end)
                                *buf = '0';
                        ++buf;
                }
                if (spec.base == 16) {
                        if (buf < end)
                                *buf = ('X' | locase);
                        ++buf;
                }
        }
        /* zero or space padding */
        if (!(spec.flags & LEFT)) {
                /* ' ' + ZEROPAD == '0', so c is '0' when zero padding is on */
                char c = ' ' + (spec.flags & ZEROPAD);

                while (--field_width >= 0) {
                        if (buf < end)
                                *buf = c;
                        ++buf;
                }
        }
        /* hmm even more zero padding? */
        while (i <= --precision) {
                if (buf < end)
                        *buf = '0';
                ++buf;
        }
        /* actual digits of result */
        while (--i >= 0) {
                if (buf < end)
                        *buf = tmp[i];
                ++buf;
        }
        /* trailing space padding */
        while (--field_width >= 0) {
                if (buf < end)
                        *buf = ' ';
                ++buf;
        }

        return buf;
}

static noinline_for_stack
char *special_hex_number(char *buf, char *end, unsigned long long num, int size)
{
        struct printf_spec spec;

        spec.field_width = 2 + 2 * size;        /* 0x + hex */
        spec.flags = SPECIAL | SMALL | ZEROPAD;
        spec.base = 16;
        spec.precision = -1;

        return number(buf, end, num, spec);
}

/*
 * Shift the @len characters at @buf right by @spaces positions and fill
 * the gap with blanks, never writing at or beyond @end. Text that does
 * not fit after the shift is dropped; if the blanks alone do not fit,
 * the whole remaining space is blank-filled.
 */
static void move_right(char *buf, char *end, unsigned len, unsigned spaces)
{
        size_t room;

        /* nowhere to put anything */
        if (buf >= end)
                return;

        room = end - buf;
        if (spaces >= room) {
                memset(buf, ' ', room);
                return;
        }

        /* keep only as much text as still fits behind the blanks */
        if (len > room - spaces)
                len = room - spaces;
        if (len)
                memmove(buf + spaces, buf, len);

        memset(buf, ' ', spaces);
}

/*
 * Pad an already emitted string up to the requested field width.
 * @buf: current buffer position (just past the string)
 * @n: length of string
 * @end: end of output buffer
 * @spec: for field width and flags
 *
 * Without LEFT the string is shifted right and blanks are put in front
 * of it; with LEFT the blanks are appended.
 * Returns: new buffer position after padding.
 */
static noinline_for_stack
char *widen_string(char *buf, int n, char *end, struct printf_spec spec)
{
        unsigned pad;

        /* common case: the string already fills the field */
        if (likely(n >= spec.field_width))
                return buf;

        pad = spec.field_width - n;

        if (spec.flags & LEFT) {
                for (; pad; pad--) {
                        if (buf < end)
                                *buf = ' ';
                        ++buf;
                }
                return buf;
        }

        /* right-justify: make room in front of the text */
        move_right(buf - n, end, n, pad);
        return buf + pad;
}

/*
 * Emit a string from an address that needs no validity check.
 *
 * Copies characters until the terminating NUL or until spec.precision
 * characters were taken (a negative precision effectively means no
 * limit), then applies field-width padding. Characters beyond @end are
 * counted but not stored.
 */
static char *string_nocheck(char *buf, char *end, const char *s,
                            struct printf_spec spec)
{
        int budget;
        int copied = 0;

        for (budget = spec.precision; budget; budget--) {
                char ch = *s++;

                if (ch == '\0')
                        break;
                if (buf < end)
                        *buf = ch;
                buf++;
                copied++;
        }

        return widen_string(buf, copied, end, spec);
}

/*
 * Print an error pointer: by its symbolic name when errname() knows one,
 * otherwise as a signed decimal number.
 */
static char *err_ptr(char *buf, char *end, void *ptr,
                     struct printf_spec spec)
{
        int err = PTR_ERR(ptr);
        const char *sym = errname(err);

        if (!sym) {
                /*
                 * Somebody passed ERR_PTR(-1234) or some other non-existing
                 * Efoo - or perhaps CONFIG_SYMBOLIC_ERRNAME=n. Fall back to
                 * printing it as its decimal representation.
                 */
                spec.base = 10;
                spec.flags |= SIGN;
                return number(buf, end, err, spec);
        }

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Print a short error message in place of a value.
 * Be careful: error messages must fit into the given buffer.
 */
static char *error_string(char *buf, char *end, const char *s,
                          struct printf_spec spec)
{
        /*
         * Hard limit to avoid completely insane messages: when no
         * precision was given, cap the message at two characters per
         * pointer byte. This works pretty well because most error
         * messages come from the many pointer format modifiers.
         */
        const int default_limit = 2 * sizeof(void *);

        if (spec.precision == -1)
                spec.precision = default_limit;

        return string_nocheck(buf, end, s, spec);
}

/*
 * Classify @ptr: returns "(null)" for NULL, "(efault)" for addresses in
 * the first page or in the error-value range, and NULL when the pointer
 * passes these checks.
 *
 * Do not call any complex external code here. Nested printk()/vsprintf()
 * might cause infinite loops. Failures might break printk() and would
 * be hard to debug.
 */
static const char *check_pointer_msg(const void *ptr)
{
        const unsigned long addr = (unsigned long)ptr;

        if (ptr == NULL)
                return "(null)";

        if (addr < PAGE_SIZE || IS_ERR_VALUE(ptr))
                return "(efault)";

        return NULL;
}

/*
 * Validate @ptr before it is dereferenced for printing.
 * Returns 0 when the pointer looks usable. Otherwise the matching error
 * message is written at *buf, *buf is advanced, and -EFAULT is returned.
 */
static int check_pointer(char **buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        const char *msg = check_pointer_msg(ptr);

        if (!msg)
                return 0;

        *buf = error_string(*buf, end, msg, spec);
        return -EFAULT;
}

/*
 * Print the string @s after checking the pointer; for a bad pointer the
 * error message emitted by check_pointer() is the whole output.
 */
static noinline_for_stack
char *string(char *buf, char *end, const char *s,
             struct printf_spec spec)
{
        if (check_pointer(&buf, end, s, spec) == 0)
                buf = string_nocheck(buf, end, s, spec);

        return buf;
}

/*
 * Print the raw value of @ptr in lowercase hex. When no field width was
 * given (-1), the output is zero-padded to two digits per pointer byte.
 */
static char *pointer_string(char *buf, char *end,
                            const void *ptr,
                            struct printf_spec spec)
{
        const int width_unset = (spec.field_width == -1);

        spec.base = 16;
        spec.flags |= width_unset ? (SMALL | ZEROPAD) : SMALL;
        if (width_unset)
                spec.field_width = 2 * sizeof(ptr);

        return number(buf, end, (unsigned long int)ptr, spec);
}

/*
 * Make pointers available for printing early in the boot sequence.
 * When set, ptr_to_id() uses hash_long() instead of the keyed siphash,
 * which does not depend on the random key being filled in yet.
 */
static int debug_boot_weak_hash __ro_after_init;

/* Handler for the "debug_boot_weak_hash" early parameter; @str is ignored. */
static int __init debug_boot_weak_hash_enable(char *str)
{
        debug_boot_weak_hash = 1;
        pr_info("debug_boot_weak_hash enabled\n");
        return 0;
}
early_param("debug_boot_weak_hash", debug_boot_weak_hash_enable);

/* ptr_key is only valid once filled_random_ptr_key has been observed true. */
static bool filled_random_ptr_key __read_mostly;
static siphash_key_t ptr_key __read_mostly;

/*
 * Notifier callback that fills ptr_key with random bytes and then
 * publishes it by setting filled_random_ptr_key. The arguments are
 * unused. Always returns NOTIFY_DONE.
 */
static int fill_ptr_key(struct notifier_block *nb, unsigned long action, void *data)
{
        get_random_bytes(&ptr_key, sizeof(ptr_key));

        /* Pairs with smp_rmb() before reading ptr_key. */
        /* The key must be visible before the flag that announces it. */
        smp_wmb();
        WRITE_ONCE(filled_random_ptr_key, true);
        return NOTIFY_DONE;
}

/*
 * Initcall that hands fill_ptr_key() to execute_with_initialized_rng().
 * NOTE(review): presumably the callback runs once the random number
 * generator is initialized - confirm against the helper's definition.
 */
static int __init vsprintf_init_hashval(void)
{
        /* static: the notifier block is passed by address and is not copied here */
        static struct notifier_block fill_ptr_key_nb = { .notifier_call = fill_ptr_key };
        execute_with_initialized_rng(&fill_ptr_key_nb);
        return 0;
}
subsys_initcall(vsprintf_init_hashval)

/*
 * Maps a pointer to a 32 bit unique identifier.
 *
 * Returns 0 and stores the identifier in *hashval_out on success.
 * Returns -EBUSY, leaving *hashval_out untouched, while the random
 * hashing key has not been filled in yet.
 */
static inline int __ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        unsigned long hashval;

        if (!READ_ONCE(filled_random_ptr_key))
                return -EBUSY;

        /* Pairs with smp_wmb() after writing ptr_key. */
        smp_rmb();

#ifdef CONFIG_64BIT
        hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
        /*
         * Mask off the first 32 bits, this makes explicit that we have
         * modified the address (and 32 bits is plenty for a unique ID).
         */
        hashval = hashval & 0xffffffff;
#else
        hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
#endif
        *hashval_out = hashval;
        return 0;
}

/*
 * Non-static entry point for __ptr_to_hashval().
 * Returns 0 with the hashed identifier stored in *hashval_out, or
 * -EBUSY while the hashing key is not available yet.
 */
int ptr_to_hashval(const void *ptr, unsigned long *hashval_out)
{
        return __ptr_to_hashval(ptr, hashval_out);
}

/*
 * Print a hashed identifier in place of the address @ptr.
 *
 * NULL and error pointers are printed unmodified. While the hashing key
 * is unavailable a fixed placeholder string is printed instead.
 */
static char *ptr_to_id(char *buf, char *end, const void *ptr,
                       struct printf_spec spec)
{
        unsigned long hashval;

        /*
         * Print the real pointer value for NULL and error pointers,
         * as they are not actual addresses.
         */
        if (IS_ERR_OR_NULL(ptr))
                return pointer_string(buf, end, ptr, spec);

        if (unlikely(debug_boot_weak_hash)) {
                /* When debugging early boot use non-cryptographically secure hash. */
                hashval = hash_long((unsigned long)ptr, 32);
        } else if (__ptr_to_hashval(ptr, &hashval)) {
                const char *placeholder =
                        sizeof(ptr) == 8 ? "(____ptrval____)" : "(ptrval)";

                spec.field_width = 2 * sizeof(ptr);
                /* string length must be less than default_width */
                return error_string(buf, end, placeholder, spec);
        }

        return pointer_string(buf, end, (const void *)hashval, spec);
}

/*
 * Print a pointer the default way: hashed, so that addresses are _not_
 * leaked, unless no_hash_pointers is specified on the command line, in
 * which case the raw value is printed.
 */
static char *default_pointer(char *buf, char *end, const void *ptr,
                             struct printf_spec spec)
{
        if (likely(!no_hash_pointers))
                return ptr_to_id(buf, end, ptr, spec);

        return pointer_string(buf, end, ptr, spec);
}

/*
 * Policy selector for restricted_pointer(): 0 prints like the default
 * pointer format, 1 prints the real value only to suitably privileged
 * callers, any other value always prints zeros.
 */
int kptr_restrict __read_mostly;

/*
 * Print @ptr subject to the kptr_restrict policy (the %pK format).
 * Depending on the policy and the caller, the output is the hashed
 * pointer, the real value, an all-zero value, or the error string
 * "pK-error" when the privilege check cannot be done in the current
 * context.
 */
static noinline_for_stack
char *restricted_pointer(char *buf, char *end, const void *ptr,
                         struct printf_spec spec)
{
        switch (kptr_restrict) {
        case 0:
                /* Handle as %p, hash and do _not_ leak addresses. */
                return default_pointer(buf, end, ptr, spec);
        case 1: {
                const struct cred *cred;

                /*
                 * kptr_restrict==1 cannot be used in IRQ context
                 * because its test for CAP_SYSLOG would be meaningless.
                 */
                if (in_hardirq() || in_serving_softirq() || in_nmi()) {
                        /* give the error text the same default width as a pointer */
                        if (spec.field_width == -1)
                                spec.field_width = 2 * sizeof(ptr);
                        return error_string(buf, end, "pK-error", spec);
                }

                /*
                 * Only print the real pointer value if the current
                 * process has CAP_SYSLOG and is running with the
                 * same credentials it started with. This is because
                 * access to files is checked at open() time, but %pK
                 * checks permission at read() time. We don't want to
                 * leak pointer values if a binary opens a file using
                 * %pK and then elevates privileges before reading it.
                 */
                cred = current_cred();
                if (!has_capability_noaudit(current, CAP_SYSLOG) ||
                    !uid_eq(cred->euid, cred->uid) ||
                    !gid_eq(cred->egid, cred->gid))
                        ptr = NULL;
                break;
        }
        case 2:
        default:
                /* Always print 0's for %pK */
                ptr = NULL;
                break;
        }

        /* reached with either the real value (privileged, case 1) or NULL */
        return pointer_string(buf, end, ptr, spec);
}

/*
 * Print the name of dentry @d, optionally preceded by some of its ancestors.
 *
 * fmt[1] of '2', '3' or '4' selects how many trailing path components are
 * printed; anything else means just the last one.  Components are joined
 * with '/'.  The walk up the parent chain and the copy of the names happen
 * inside a single RCU read-side section.
 */
static noinline_for_stack
char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec,
                  const char *fmt)
{
        /* array[] holds the collected component names, innermost first. */
        const char *array[4], *s;
        const struct dentry *p;
        int depth;
        int i, n;

        switch (fmt[1]) {
                case '2': case '3': case '4':
                        depth = fmt[1] - '0';
                        break;
                default:
                        depth = 1;
        }

        rcu_read_lock();
        for (i = 0; i < depth; i++, d = p) {
                /* Each dentry on the way up is validated before it is read. */
                if (check_pointer(&buf, end, d, spec)) {
                        rcu_read_unlock();
                        return buf;
                }

                p = READ_ONCE(d->d_parent);
                array[i] = READ_ONCE(d->d_name.name);
                /*
                 * A dentry that is its own parent ends the walk.  If it is
                 * not the only component, it is recorded as an empty string
                 * so that the output starts with the '/' separator.
                 */
                if (p == d) {
                        if (i)
                                array[i] = "";
                        i++;
                        break;
                }
        }
        /*
         * Emit the components from the outermost collected one down to the
         * leaf.  Output stops after spec.precision characters; a negative
         * precision never equals n, so it imposes no limit.  Characters
         * that do not fit before @end are counted but not stored.
         */
        s = array[--i];
        for (n = 0; n != spec.precision; n++, buf++) {
                char c = *s++;
                if (!c) {
                        /* End of one component: done if it was the leaf. */
                        if (!i)
                                break;
                        c = '/';
                        s = array[--i];
                }
                if (buf < end)
                        *buf = c;
        }
        rcu_read_unlock();
        return widen_string(buf, n, end, spec);
}

/*
 * Print the dentry name(s) of file @f.  After @f passes check_pointer(), the
 * work is delegated to dentry_name() on f->f_path.dentry with the same @fmt.
 */
static noinline_for_stack
char *file_dentry_name(char *buf, char *end, const struct file *f,
                        struct printf_spec spec, const char *fmt)
{
        const struct dentry *dentry;

        if (check_pointer(&buf, end, f, spec))
                return buf;

        dentry = f->f_path.dentry;
        return dentry_name(buf, end, dentry, spec, fmt);
}
#ifdef CONFIG_BLOCK
/*
 * Print the name of block device @bdev: the disk name, followed for
 * partitions by the partition number.  When the disk name itself ends in a
 * digit, a 'p' is inserted between the name and the partition number.
 */
static noinline_for_stack
char *bdev_name(char *buf, char *end, struct block_device *bdev,
                struct printf_spec spec, const char *fmt)
{
        struct gendisk *hd;

        if (check_pointer(&buf, end, bdev, spec))
                return buf;

        hd = bdev->bd_disk;
        buf = string(buf, end, hd->disk_name, spec);
        if (!bdev_is_partition(bdev))
                return buf;

        /* Keep a trailing digit of the disk name apart from the number. */
        if (isdigit(hd->disk_name[strlen(hd->disk_name) - 1])) {
                if (buf < end)
                        *buf = 'p';
                buf++;
        }

        return number(buf, end, bdev_partno(bdev), spec);
}
#endif

/*
 * Print the address @ptr symbolically.
 *
 * With CONFIG_KALLSYMS the symbol text is produced by one of the sprint_*()
 * helpers, chosen from @fmt:
 *   "Bb"           sprint_backtrace_build_id()
 *   "B"            sprint_backtrace()
 *   "Sb" / "SRb"   sprint_symbol_build_id()
 *   "s"            sprint_symbol_no_offset()
 *   anything else  sprint_symbol()
 * An 'R' in fmt[1] first passes @ptr through __builtin_extract_return_addr().
 * Without CONFIG_KALLSYMS the address is printed as a hex number.
 */
static noinline_for_stack
char *symbol_string(char *buf, char *end, void *ptr,
                    struct printf_spec spec, const char *fmt)
{
        unsigned long value;
#ifdef CONFIG_KALLSYMS
        char sym[KSYM_SYMBOL_LEN];
#endif

        if (fmt[1] == 'R')
                ptr = __builtin_extract_return_addr(ptr);
        value = (unsigned long)ptr;

#ifdef CONFIG_KALLSYMS
        if (*fmt == 'B') {
                if (fmt[1] == 'b')
                        sprint_backtrace_build_id(sym, value);
                else
                        sprint_backtrace(sym, value);
        } else if (*fmt == 'S' &&
                   (fmt[1] == 'b' || (fmt[1] == 'R' && fmt[2] == 'b'))) {
                sprint_symbol_build_id(sym, value);
        } else if (*fmt == 's') {
                sprint_symbol_no_offset(sym, value);
        } else {
                sprint_symbol(sym, value);
        }

        return string_nocheck(buf, end, sym, spec);
#else
        return special_hex_number(buf, end, value, sizeof(void *));
#endif
}

/*
 * Ready-made format specifications shared by the helpers in this file.
 * A field_width or precision of -1 means "not specified".
 */

/* String output with neither field width nor precision specified. */
static const struct printf_spec default_str_spec = {
        .field_width = -1,
        .precision = -1,
};

/*
 * Base-16 number with the SPECIAL and SMALL flags; resource_string() uses it
 * for the raw resource flags (presumably "0x"-prefixed lowercase hex).
 */
static const struct printf_spec default_flag_spec = {
        .base = 16,
        .precision = -1,
        .flags = SPECIAL | SMALL,
};

/* Base-10 number with no explicit field width. */
static const struct printf_spec default_dec_spec = {
        .base = 10,
        .precision = -1,
};

/* Base-10 number, field width 2 with ZEROPAD (used for month, day, h/m/s). */
static const struct printf_spec default_dec02_spec = {
        .base = 10,
        .field_width = 2,
        .precision = -1,
        .flags = ZEROPAD,
};

/* Base-10 number, field width 4 with ZEROPAD (used for the year). */
static const struct printf_spec default_dec04_spec = {
        .base = 10,
        .field_width = 4,
        .precision = -1,
        .flags = ZEROPAD,
};

/*
 * Print "<start_val>-<end_val>" using @spec for both numbers.  When the two
 * values are equal only a single number is printed, without the dash.
 */
static noinline_for_stack
char *hex_range(char *buf, char *end, u64 start_val, u64 end_val,
                struct printf_spec spec)
{
        buf = number(buf, end, start_val, spec);

        if (start_val != end_val) {
                if (buf < end)
                        *buf = '-';
                buf = number(buf + 1, end, end_val, spec);
        }

        return buf;
}

/*
 * Format a struct resource as "[<type> <range> <details>]".
 *
 * <type> is chosen from the IORESOURCE_* type bits in res->flags ("io", "mem",
 * "irq", "dma", "bus", or "???" when none of them is set).  With
 * fmt[0] == 'R' the interesting flag bits are decoded into words
 * (" 64bit", " pref", " window", " disabled"); otherwise, and always for
 * the "???" case, the raw flags are appended as " flags <hex>".
 *
 * The text is assembled in a local scratch buffer sized for the longest
 * expected output and then copied to @buf according to @spec.
 */
static noinline_for_stack
char *resource_string(char *buf, char *end, struct resource *res,
                      struct printf_spec spec, const char *fmt)
{
#ifndef IO_RSRC_PRINTK_SIZE
#define IO_RSRC_PRINTK_SIZE        6
#endif

#ifndef MEM_RSRC_PRINTK_SIZE
#define MEM_RSRC_PRINTK_SIZE        10
#endif
        /* Number formats for the start/end values, one per resource type. */
        static const struct printf_spec io_spec = {
                .base = 16,
                .field_width = IO_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec mem_spec = {
                .base = 16,
                .field_width = MEM_RSRC_PRINTK_SIZE,
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };
        static const struct printf_spec bus_spec = {
                .base = 16,
                .field_width = 2,
                .precision = -1,
                .flags = SMALL | ZEROPAD,
        };
        /* Format for the fixed words copied into the scratch buffer. */
        static const struct printf_spec str_spec = {
                .field_width = -1,
                .precision = 10,
                .flags = LEFT,
        };

        /* 32-bit res (sizeof==4): 10 chars in dec, 10 in hex ("0x" + 8)
         * 64-bit res (sizeof==8): 20 chars in dec, 18 in hex ("0x" + 16) */
#define RSRC_BUF_SIZE                ((2 * sizeof(resource_size_t)) + 4)
#define FLAG_BUF_SIZE                (2 * sizeof(res->flags))
#define DECODED_BUF_SIZE        sizeof("[mem - 64bit pref window disabled]")
#define RAW_BUF_SIZE                sizeof("[mem - flags 0x]")
        /* Large enough for either the decoded or the raw layout. */
        char sym[MAX(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
                     2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];

        char *p = sym, *pend = sym + sizeof(sym);
        int decode = (fmt[0] == 'R') ? 1 : 0;
        const struct printf_spec *specp;

        if (check_pointer(&buf, end, res, spec))
                return buf;

        *p++ = '[';
        /* The first matching type bit wins, in this order. */
        if (res->flags & IORESOURCE_IO) {
                p = string_nocheck(p, pend, "io  ", str_spec);
                specp = &io_spec;
        } else if (res->flags & IORESOURCE_MEM) {
                p = string_nocheck(p, pend, "mem ", str_spec);
                specp = &mem_spec;
        } else if (res->flags & IORESOURCE_IRQ) {
                p = string_nocheck(p, pend, "irq ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_DMA) {
                p = string_nocheck(p, pend, "dma ", str_spec);
                specp = &default_dec_spec;
        } else if (res->flags & IORESOURCE_BUS) {
                p = string_nocheck(p, pend, "bus ", str_spec);
                specp = &bus_spec;
        } else {
                /* Unknown type: print as memory and fall back to raw flags. */
                p = string_nocheck(p, pend, "??? ", str_spec);
                specp = &mem_spec;
                decode = 0;
        }
        /* In decoded mode an unset resource shows its size, not its range. */
        if (decode && res->flags & IORESOURCE_UNSET) {
                p = string_nocheck(p, pend, "size ", str_spec);
                p = number(p, pend, resource_size(res), *specp);
        } else {
                p = hex_range(p, pend, res->start, res->end, *specp);
        }
        if (decode) {
                if (res->flags & IORESOURCE_MEM_64)
                        p = string_nocheck(p, pend, " 64bit", str_spec);
                if (res->flags & IORESOURCE_PREFETCH)
                        p = string_nocheck(p, pend, " pref", str_spec);
                if (res->flags & IORESOURCE_WINDOW)
                        p = string_nocheck(p, pend, " window", str_spec);
                if (res->flags & IORESOURCE_DISABLED)
                        p = string_nocheck(p, pend, " disabled", str_spec);
        } else {
                p = string_nocheck(p, pend, " flags ", str_spec);
                p = number(p, pend, res->flags, default_flag_spec);
        }
        *p++ = ']';
        *p = '\0';

        /* Copy the assembled text out, applying the caller's @spec. */
        return string_nocheck(buf, end, sym, spec);
}

/*
 * Format a struct range as "[range <start>-<end>]".  Both bounds are printed
 * through hex_range() with the SPECIAL | SMALL | ZEROPAD flags and a width of
 * 2 + 2 * sizeof(range->start).  The text is built in a local buffer and then
 * copied to @buf according to @spec.
 */
static noinline_for_stack
char *range_string(char *buf, char *end, const struct range *range,
                   struct printf_spec spec, const char *fmt)
{
        char sym[sizeof("[range 0x0123456789abcdef-0x0123456789abcdef]")];
        char *cur = sym;
        char *limit = sym + sizeof(sym);
        struct printf_spec range_spec = {
                .base = 16,
                .field_width = 2 + 2 * sizeof(range->start), /* 0x + 2 * 8 */
                .precision = -1,
                .flags = SPECIAL | SMALL | ZEROPAD,
        };

        if (check_pointer(&buf, end, range, spec))
                return buf;

        cur = string_nocheck(cur, limit, "[range ", default_str_spec);
        cur = hex_range(cur, limit, range->start, range->end, range_spec);
        cur[0] = ']';
        cur[1] = '\0';

        return string_nocheck(buf, end, sym, spec);
}

/*
 * Dump bytes from @addr as two hex digits each.
 *
 * The byte count is taken from the field width, capped at 64; a field width
 * of zero prints nothing and a negative (unspecified) one prints one byte.
 * fmt[1] selects the separator placed between bytes: 'C' gives ':', 'D'
 * gives '-', 'N' gives none, anything else gives a space.
 */
static noinline_for_stack
char *hex_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                 const char *fmt)
{
        /*
         * For '%ph[CDN]' without a width the field width stays negative,
         * so the default of a single byte applies.
         */
        int nbytes = 1;
        char sep;
        int i;

        /* nothing to print */
        if (spec.field_width == 0)
                return buf;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'C')
                sep = ':';
        else if (fmt[1] == 'D')
                sep = '-';
        else if (fmt[1] == 'N')
                sep = 0;
        else
                sep = ' ';

        if (spec.field_width > 0)
                nbytes = min_t(int, spec.field_width, 64);

        for (i = 0; i < nbytes; ++i) {
                const u8 byte = addr[i];

                if (buf < end)
                        *buf = hex_asc_hi(byte);
                ++buf;
                if (buf < end)
                        *buf = hex_asc_lo(byte);
                ++buf;

                /* No separator after the final byte. */
                if (!sep || i == nbytes - 1)
                        continue;

                if (buf < end)
                        *buf = sep;
                ++buf;
        }

        return buf;
}

/*
 * Print @bitmap as comma-separated hex chunks of 32 bits each, highest
 * chunk first.  The number of bits comes from the field width (negative
 * values count as zero).  When the bit count is not a multiple of 32, the
 * leading chunk only covers the leftover bits and is printed with
 * correspondingly fewer digits.
 */
static noinline_for_stack
char *bitmap_string(char *buf, char *end, const unsigned long *bitmap,
                    struct printf_spec spec, const char *fmt)
{
        const int CHUNKSZ = 32;
        int nr_bits = max_t(int, spec.field_width, 0);
        bool need_comma = false;
        int pos, width;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        /* @spec is recycled from here on to format the individual chunks. */
        spec = (struct printf_spec){ .flags = SMALL | ZEROPAD, .base = 16 };

        /* Width in bits of the first (possibly partial) chunk. */
        width = nr_bits & (CHUNKSZ - 1);
        if (width == 0)
                width = CHUNKSZ;

        for (pos = ALIGN(nr_bits, CHUNKSZ) - CHUNKSZ; pos >= 0; pos -= CHUNKSZ) {
                u32 mask = ((1ULL << width) - 1);
                u32 val = (bitmap[pos / BITS_PER_LONG] >> (pos % BITS_PER_LONG)) & mask;

                if (need_comma) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                need_comma = true;

                /* One hex digit per started group of four bits. */
                spec.field_width = DIV_ROUND_UP(width, 4);
                buf = number(buf, end, val, spec);

                /* Every chunk after the first is a full one. */
                width = CHUNKSZ;
        }
        return buf;
}

static noinline_for_stack
char *bitmap_list_string(char *buf, char *end, const unsigned long *bitmap,
                         struct printf_spec spec, const char *fmt)
{
        int nr_bits = max_t(int, spec.field_width, 0);
        bool first = true;
        int rbot, rtop;

        if (check_pointer(&buf, end, bitmap, spec))
                return buf;

        for_each_set_bitrange(rbot, rtop, bitmap, nr_bits) {
                if (!first) {
                        if (buf < end)
                                *buf = ',';
                        buf++;
                }
                first = false;

                buf = number(buf, end, rbot, default_dec_spec);
                if (rtop == rbot + 1)
                        continue;

                if (buf < end)
                        *buf = '-';
                buf = number(++buf, end, rtop - 1, default_dec_spec);
        }
        return buf;
}

/*
 * Print the six bytes at @addr as a MAC address in lowercase hex.
 *
 * Separators are only inserted when fmt[0] is 'M'.  fmt[1] == 'F' switches
 * the separator from ':' to '-', and fmt[1] == 'R' prints the bytes in
 * reverse order.
 */
static noinline_for_stack
char *mac_address_string(char *buf, char *end, u8 *addr,
                         struct printf_spec spec, const char *fmt)
{
        char mac_addr[sizeof("xx:xx:xx:xx:xx:xx")];
        char *out = mac_addr;
        char separator = ':';
        bool reversed = false;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'F')
                separator = '-';
        else if (fmt[1] == 'R')
                reversed = true;

        for (i = 0; i < 6; i++) {
                out = hex_byte_pack(out, addr[reversed ? 5 - i : i]);

                if (fmt[0] == 'M' && i != 5)
                        *out++ = separator;
        }
        *out = '\0';

        return string_nocheck(buf, end, mac_addr, spec);
}

static noinline_for_stack
char *ip4_string(char *p, const u8 *addr, const char *fmt)
{
        int i;
        bool leading_zeros = (fmt[0] == 'i');
        int index;
        int step;

        switch (fmt[2]) {
        case 'h':
#ifdef __BIG_ENDIAN
                index = 0;
                step = 1;
#else
                index = 3;
                step = -1;
#endif
                break;
        case 'l':
                index = 3;
                step = -1;
                break;
        case 'n':
        case 'b':
        default:
                index = 0;
                step = 1;
                break;
        }
        for (i = 0; i < 4; i++) {
                char temp[4] __aligned(2);        /* hold each IP quad in reverse order */
                int digits = put_dec_trunc8(temp, addr[index]) - temp;
                if (leading_zeros) {
                        if (digits < 3)
                                *p++ = '0';
                        if (digits < 2)
                                *p++ = '0';
                }
                /* reverse the digits in the quad */
                while (digits--)
                        *p++ = temp[digits];
                if (i < 3)
                        *p++ = '.';
                index += step;
        }
        *p = '\0';

        return p;
}

/*
 * Write the 16-byte IPv6 address at @addr to @p in compressed textual form
 * and return a pointer to the terminating NUL.
 *
 * Leading zeros of each 16-bit group are dropped and the longest run of two
 * or more all-zero groups is replaced by "::".  For v4-mapped and ISATAP
 * addresses only the first six groups are printed in hex; the final four
 * bytes are appended in dotted-quad form.
 */
static noinline_for_stack
char *ip6_compressed_string(char *p, const char *addr)
{
        int i, j, range;
        /* zerolength[i]: number of consecutive zero groups starting at i */
        unsigned char zerolength[8];
        int longest = 1;
        int colonpos = -1;
        u16 word;
        u8 hi, lo;
        bool needcolon = false;
        bool useIPv4;
        struct in6_addr in6;

        /* Work on a local copy of the address bytes. */
        memcpy(&in6, addr, sizeof(struct in6_addr));

        useIPv4 = ipv6_addr_v4mapped(&in6) || ipv6_addr_is_isatap(&in6);

        memset(zerolength, 0, sizeof(zerolength));

        /* With an embedded IPv4 tail only six groups are printed in hex. */
        if (useIPv4)
                range = 6;
        else
                range = 8;

        /* find position of longest 0 run */
        for (i = 0; i < range; i++) {
                for (j = i; j < range; j++) {
                        if (in6.s6_addr16[j] != 0)
                                break;
                        zerolength[i]++;
                }
        }
        /* Strict '>' keeps the first of several equally long runs. */
        for (i = 0; i < range; i++) {
                if (zerolength[i] > longest) {
                        longest = zerolength[i];
                        colonpos = i;
                }
        }
        if (longest == 1)                /* don't compress a single 0 */
                colonpos = -1;

        /* emit address */
        for (i = 0; i < range; i++) {
                if (i == colonpos) {
                        /*
                         * Emit "::" for the zero run.  The first ':' is the
                         * separator owed by the previous group, or the
                         * leading colon when the run starts the address.
                         */
                        if (needcolon || i == 0)
                                *p++ = ':';
                        *p++ = ':';
                        needcolon = false;
                        /* Skip the remaining groups of the run. */
                        i += longest - 1;
                        continue;
                }
                if (needcolon) {
                        *p++ = ':';
                        needcolon = false;
                }
                /* hex u16 without leading 0s */
                word = ntohs(in6.s6_addr16[i]);
                hi = word >> 8;
                lo = word & 0xff;
                if (hi) {
                        if (hi > 0x0f)
                                p = hex_byte_pack(p, hi);
                        else
                                *p++ = hex_asc_lo(hi);
                        p = hex_byte_pack(p, lo);
                }
                else if (lo > 0x0f)
                        p = hex_byte_pack(p, lo);
                else
                        *p++ = hex_asc_lo(lo);
                needcolon = true;
        }

        /* Append the last four bytes as a dotted quad where applicable. */
        if (useIPv4) {
                if (needcolon)
                        *p++ = ':';
                p = ip4_string(p, &in6.s6_addr[12], "I4");
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_string(char *p, const char *addr, const char *fmt)
{
        int i;

        for (i = 0; i < 8; i++) {
                p = hex_byte_pack(p, *addr++);
                p = hex_byte_pack(p, *addr++);
                if (fmt[0] == 'I' && i != 7)
                        *p++ = ':';
        }
        *p = '\0';

        return p;
}

static noinline_for_stack
char *ip6_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char ip6_addr[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255")];

        if (fmt[0] == 'I' && fmt[2] == 'c')
                ip6_compressed_string(ip6_addr, addr);
        else
                ip6_string(ip6_addr, addr, fmt);

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format the IPv4 address at @addr with ip4_string() into a local buffer and
 * copy the result to @buf according to @spec.
 */
static noinline_for_stack
char *ip4_addr_string(char *buf, char *end, const u8 *addr,
                      struct printf_spec spec, const char *fmt)
{
        char text[sizeof("255.255.255.255")];

        ip4_string(text, addr, fmt);

        return string_nocheck(buf, end, text, spec);
}

/*
 * Format the address of the IPv6 socket address @sa, optionally followed by
 * further fields of the sockaddr.
 *
 * The letters after the first two characters of @fmt select the extras:
 *   'p'  append ":<port>"
 *   'f'  append "/<flowinfo>"
 *   's'  append "%<scope id>"
 *   'c'  use the compressed address form (only honoured when fmt[0] is 'I')
 * When any of p/f/s is requested the address is enclosed in square brackets.
 */
static noinline_for_stack
char *ip6_addr_string_sa(char *buf, char *end, const struct sockaddr_in6 *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool have_p = false, have_s = false, have_f = false, have_c = false;
        bool bracketed;
        char ip6_addr[sizeof("[xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:255.255.255.255]") +
                      sizeof(":12345") + sizeof("/123456789") +
                      sizeof("%1234567890")];
        char *p = ip6_addr, *pend = ip6_addr + sizeof(ip6_addr);
        const u8 *addr = (const u8 *) &sa->sin6_addr;
        char fmt6[2] = { fmt[0], '6' };

        /* Modifier letters start at the third character of @fmt. */
        for (fmt += 2; isalpha(*fmt); fmt++) {
                if (*fmt == 'p')
                        have_p = true;
                else if (*fmt == 'f')
                        have_f = true;
                else if (*fmt == 's')
                        have_s = true;
                else if (*fmt == 'c')
                        have_c = true;
        }

        bracketed = have_p || have_s || have_f;

        if (bracketed)
                *p++ = '[';

        if (fmt6[0] == 'I' && have_c)
                p = ip6_compressed_string(p, addr);
        else
                p = ip6_string(p, addr, fmt6);

        if (bracketed)
                *p++ = ']';

        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin6_port), spec);
        }
        if (have_f) {
                *p++ = '/';
                p = number(p, pend, ntohl(sa->sin6_flowinfo &
                                          IPV6_FLOWINFO_MASK), spec);
        }
        if (have_s) {
                *p++ = '%';
                p = number(p, pend, sa->sin6_scope_id, spec);
        }
        *p = '\0';

        return string_nocheck(buf, end, ip6_addr, spec);
}

/*
 * Format the address of the IPv4 socket address @sa, optionally followed by
 * ":<port>".
 *
 * The letters after the first two characters of @fmt are modifiers: 'p'
 * requests the port, and the last of 'h', 'l', 'n' or 'b' is handed to
 * ip4_string() as its byte-order selector.
 */
static noinline_for_stack
char *ip4_addr_string_sa(char *buf, char *end, const struct sockaddr_in *sa,
                         struct printf_spec spec, const char *fmt)
{
        bool have_p = false;
        char *p, ip4_addr[sizeof("255.255.255.255") + sizeof(":12345")];
        char *pend = ip4_addr + sizeof(ip4_addr);
        const u8 *addr = (const u8 *) &sa->sin_addr.s_addr;
        char fmt4[3] = { fmt[0], '4', 0 };

        /* Modifier letters start at the third character of @fmt. */
        for (fmt += 2; isalpha(*fmt); fmt++) {
                if (*fmt == 'p')
                        have_p = true;
                else if (*fmt == 'h' || *fmt == 'l' ||
                         *fmt == 'n' || *fmt == 'b')
                        fmt4[2] = *fmt;
        }

        p = ip4_string(ip4_addr, addr, fmt4);
        if (have_p) {
                *p++ = ':';
                p = number(p, pend, ntohs(sa->sin_port), spec);
        }
        *p = '\0';

        return string_nocheck(buf, end, ip4_addr, spec);
}

/*
 * Dispatcher for the IP address formats.  fmt[1] selects the flavour:
 *   '6'  @ptr points at a raw IPv6 address
 *   '4'  @ptr points at a raw IPv4 address
 *   'S'  @ptr points at a socket address; its sa_family picks the IPv4 or
 *        IPv6 variant, and any other family prints "(einval)"
 * Any other fmt[1] prints an error marker that names the specifier.
 */
static noinline_for_stack
char *ip_addr_string(char *buf, char *end, const void *ptr,
                     struct printf_spec spec, const char *fmt)
{
        if (check_pointer(&buf, end, ptr, spec))
                return buf;

        if (fmt[1] == '6')
                return ip6_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == '4')
                return ip4_addr_string(buf, end, ptr, spec, fmt);

        if (fmt[1] == 'S') {
                const union {
                        struct sockaddr         raw;
                        struct sockaddr_in      v4;
                        struct sockaddr_in6     v6;
                } *sa = ptr;

                if (sa->raw.sa_family == AF_INET)
                        return ip4_addr_string_sa(buf, end, &sa->v4, spec, fmt);
                if (sa->raw.sa_family == AF_INET6)
                        return ip6_addr_string_sa(buf, end, &sa->v6, spec, fmt);

                return error_string(buf, end, "(einval)", spec);
        }

        return error_string(buf, end,
                            fmt[0] == 'i' ? "(%pi?)" : "(%pI?)", spec);
}

/*
 * Print the bytes at @addr through string_escape_mem().
 *
 * The byte count comes from the field width: zero prints nothing and a
 * negative (unspecified) width means one byte.  The letters following
 * fmt[0] accumulate ESCAPE_* flags ('a', 'c', 'h', 'n', 'o', 'p', 's');
 * parsing stops at the first other character.  With no flag letters at all,
 * ESCAPE_ANY_NP is used.
 */
static noinline_for_stack
char *escaped_string(char *buf, char *end, u8 *addr, struct printf_spec spec,
                     const char *fmt)
{
        unsigned int flags = 0;
        bool parsing = true;
        int idx = 1;
        int len;

        /* nothing to print */
        if (spec.field_width == 0)
                return buf;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        while (parsing) {
                switch (fmt[idx++]) {
                case 'a':
                        flags |= ESCAPE_ANY;
                        break;
                case 'c':
                        flags |= ESCAPE_SPECIAL;
                        break;
                case 'h':
                        flags |= ESCAPE_HEX;
                        break;
                case 'n':
                        flags |= ESCAPE_NULL;
                        break;
                case 'o':
                        flags |= ESCAPE_OCTAL;
                        break;
                case 'p':
                        flags |= ESCAPE_NP;
                        break;
                case 's':
                        flags |= ESCAPE_SPACE;
                        break;
                default:
                        parsing = false;
                        break;
                }
        }

        if (flags == 0)
                flags = ESCAPE_ANY_NP;

        if (spec.field_width < 0)
                len = 1;
        else
                len = spec.field_width;

        /*
         * string_escape_mem() stores as much as fits into the space it is
         * given and returns the size the complete output would have needed,
         * so @buf may legitimately end up beyond @end.
         */
        buf += string_escape_mem(addr, len, buf, buf < end ? end - buf : 0, flags, NULL);

        return buf;
}

#pragma GCC diagnostic push
#ifndef __clang__
#pragma GCC diagnostic ignored "-Wsuggest-attribute=format"
#endif
/*
 * Expand a nested format: @va_fmt carries a format string together with a
 * pointer to its va_list.  The list is copied before use so that the
 * caller's copy is left untouched, and the expansion is written at @buf via
 * vsnprintf().
 */
static char *va_format(char *buf, char *end, struct va_format *va_fmt,
                       struct printf_spec spec)
{
        va_list args;
        int written;

        if (check_pointer(&buf, end, va_fmt, spec))
                return buf;

        va_copy(args, *va_fmt->va);
        written = vsnprintf(buf, end > buf ? end - buf : 0, va_fmt->fmt, args);
        va_end(args);

        return buf + written;
}
#pragma GCC diagnostic pop

/*
 * Print the 16 bytes at @addr as a UUID string, with dashes after the 4th,
 * 6th, 8th and 10th printed byte.
 *
 * fmt[1] selects byte order and case: 'l' and 'L' read the bytes through
 * guid_index instead of uuid_index, and the upper-case letters 'L' and 'B'
 * select upper-case hex digits.
 */
static noinline_for_stack
char *uuid_string(char *buf, char *end, const u8 *addr,
                  struct printf_spec spec, const char *fmt)
{
        char uuid[UUID_STRING_LEN + 1];
        char *p = uuid;
        const u8 *index = uuid_index;
        bool uc = false;
        int i;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        switch (fmt[1]) {
        case 'L':
                uc = true;
                index = guid_index;
                break;
        case 'l':
                index = guid_index;
                break;
        case 'B':
                uc = true;
                break;
        default:
                break;
        }

        for (i = 0; i < 16; i++) {
                const u8 byte = addr[index[i]];

                p = uc ? hex_byte_pack_upper(p, byte) : hex_byte_pack(p, byte);

                if (i == 3 || i == 5 || i == 7 || i == 9)
                        *p++ = '-';
        }
        *p = '\0';

        return string_nocheck(buf, end, uuid, spec);
}

/*
 * Print network-device related bit fields.  The only supported variant is
 * fmt[1] == 'F', which prints the netdev_features_t at @addr as a hex number
 * sized to that type; anything else yields the "(%pN?)" error marker.
 */
static noinline_for_stack
char *netdev_bits(char *buf, char *end, const void *addr,
                  struct printf_spec spec,  const char *fmt)
{
        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] != 'F')
                return error_string(buf, end, "(%pN?)", spec);

        return special_hex_number(buf, end, *(const netdev_features_t *)addr,
                                  sizeof(netdev_features_t));
}

/*
 * Print a four-character code as "<cccc> <endianness> (<hex>)".
 *
 * @fmt must continue with "cc", otherwise the "(%p4?)" error marker is
 * printed.  The four characters are taken from the value with bit 31
 * cleared, lowest byte first; bytes that are not printable ASCII are shown
 * as '.'.  Bit 31 of the original value selects "big-endian" versus
 * "little-endian", and the unmodified value follows in hex.
 */
static noinline_for_stack
char *fourcc_string(char *buf, char *end, const u32 *fourcc,
                    struct printf_spec spec, const char *fmt)
{
        char output[sizeof("0123 little-endian (0x01234567)")];
        char *p = output;
        const char *endian;
        unsigned int i;
        u32 orig, val;

        if (fmt[1] != 'c' || fmt[2] != 'c')
                return error_string(buf, end, "(%p4?)", spec);

        if (check_pointer(&buf, end, fourcc, spec))
                return buf;

        orig = get_unaligned(fourcc);
        val = orig & ~BIT(31);

        for (i = 0; i < sizeof(u32); i++, val >>= 8) {
                unsigned char c = val & 0xff;

                /* Print non-control ASCII characters as-is, dot otherwise */
                *p++ = isascii(c) && isprint(c) ? c : '.';
        }

        *p++ = ' ';
        endian = orig & BIT(31) ? "big-endian" : "little-endian";
        strcpy(p, endian);
        p += strlen(endian);

        *p++ = ' ';
        *p++ = '(';
        /* Stop two bytes early so that ')' and the NUL still fit. */
        p = special_hex_number(p, output + sizeof(output) - 2, orig, sizeof(u32));
        *p++ = ')';
        *p = '\0';

        return string(buf, end, output, spec);
}

/*
 * Print the address value stored at @addr as a hex number sized to its type:
 * a dma_addr_t when fmt[1] is 'd', a phys_addr_t for 'p' and everything else.
 */
static noinline_for_stack
char *address_val(char *buf, char *end, const void *addr,
                  struct printf_spec spec, const char *fmt)
{
        unsigned long long num;
        int size;

        if (check_pointer(&buf, end, addr, spec))
                return buf;

        if (fmt[1] == 'd') {
                num = *(const dma_addr_t *)addr;
                size = sizeof(dma_addr_t);
        } else {
                /* 'p' and any unrecognised letter */
                num = *(const phys_addr_t *)addr;
                size = sizeof(phys_addr_t);
        }

        return special_hex_number(buf, end, num, size);
}

/*
 * Print the date part of @tm as "YYYY-MM-DD".  Unless @r (raw) is set, 1900
 * is added to tm_year and 1 to tm_mon before printing.
 */
static noinline_for_stack
char *date_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        int year = tm->tm_year;
        int mon = tm->tm_mon;

        if (!r) {
                year += 1900;
                mon += 1;
        }

        buf = number(buf, end, year, default_dec04_spec);
        if (buf < end)
                *buf = '-';
        buf = number(buf + 1, end, mon, default_dec02_spec);
        if (buf < end)
                *buf = '-';

        return number(buf + 1, end, tm->tm_mday, default_dec02_spec);
}

/*
 * Print the time part of @tm as "HH:MM:SS", each field zero-padded to two
 * digits.  @r is accepted for symmetry with date_str() but is not used.
 */
static noinline_for_stack
char *time_str(char *buf, char *end, const struct rtc_time *tm, bool r)
{
        buf = number(buf, end, tm->tm_hour, default_dec02_spec);
        if (buf < end)
                *buf = ':';
        buf = number(buf + 1, end, tm->tm_min, default_dec02_spec);
        if (buf < end)
                *buf = ':';

        return number(buf + 1, end, tm->tm_sec, default_dec02_spec);
}

/*
 * Print a struct rtc_time as date, time, or both.
 *
 * Parsing starts at fmt[2]: a 'd' there restricts the output to the date and
 * a 't' to the time.  Any following 'r' letters request raw field values (no
 * year/month offsets) and 's' letters replace the 'T' between date and time
 * with a space; parsing stops at the first other character.
 */
static noinline_for_stack
char *rtc_str(char *buf, char *end, const struct rtc_time *tm,
              struct printf_spec spec, const char *fmt)
{
        bool have_t = true, have_d = true;
        bool raw = false, iso8601_separator = true;
        int count = 2;

        if (check_pointer(&buf, end, tm, spec))
                return buf;

        if (fmt[count] == 'd') {
                have_t = false;
                count++;
        } else if (fmt[count] == 't') {
                have_d = false;
                count++;
        }

        for (;; count++) {
                if (fmt[count] == 'r')
                        raw = true;
                else if (fmt[count] == 's')
                        iso8601_separator = false;
                else
                        break;
        }

        if (have_d)
                buf = date_str(buf, end, tm, raw);

        if (have_d && have_t) {
                if (buf < end)
                        *buf = iso8601_separator ? 'T' : ' ';
                buf++;
        }

        if (have_t)
                buf = time_str(buf, end, tm, raw);

        return buf;
}

static noinline_for_stack
char *time64_str(char *buf, char *end, const time64_t time,
                 struct printf_spec spec, const char *fmt)
{
        struct rtc_time rtc_time;
        struct tm tm;

        time64_to_tm(time, 0, &tm);

        rtc_time.tm_sec = tm.tm_sec;
        rtc_time.tm_min = tm.tm_min;
        rtc_time.tm_hour = tm.tm_hour;
        rtc_time.tm_mday = tm.tm_mday;
        rtc_time.tm_mon = tm.tm_mon;
        rtc_time.tm_year = tm.tm_year;
        rtc_time.tm_wday = tm.tm_wday;
        rtc_time.tm_yday = tm.tm_yday;

        rtc_time.tm_isdst = 0;

        return rtc_str(buf, end, &rtc_time, spec, fmt);
}

/*
 * Dispatch %pt[RT]...: fmt[1] selects how @ptr is interpreted.
 *
 *   'R'  @ptr is a struct rtc_time *
 *   'T'  @ptr is a time64_t *
 *
 * Any other selector prints the "(%pt?)" error string.
 */
static noinline_for_stack
char *time_and_date(char *buf, char *end, void *ptr, struct printf_spec spec,
                    const char *fmt)
{
        switch (fmt[1]) {
        case 'R':
                /* rtc_str() validates the pointer itself. */
                return rtc_str(buf, end, (const struct rtc_time *)ptr, spec, fmt);
        case 'T':
                /*
                 * The time64_t is read through @ptr right here, so the
                 * pointer has to be validated before the dereference;
                 * the check inside rtc_str() only ever sees the on-stack
                 * copy built by time64_str().
                 */
                if (check_pointer(&buf, end, ptr, spec))
                        return buf;
                return time64_str(buf, end, *(const time64_t *)ptr, spec, fmt);
        default:
                return error_string(buf, end, "(%pt?)", spec);
        }
}

/*
 * Format a struct clk pointer for %pC / %pCn.
 *
 * Without CONFIG_HAVE_CLK the "(%pC?)" error string is printed.  Otherwise
 * the output is the name returned by __clk_get_name() when
 * CONFIG_COMMON_CLK is set, or whatever ptr_to_id() prints when it is not.
 */
static noinline_for_stack
char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
            const char *fmt)
{
        if (!IS_ENABLED(CONFIG_HAVE_CLK))
                return error_string(buf, end, "(%pC?)", spec);

        if (check_pointer(&buf, end, clk, spec))
                return buf;

        /* 'n' and every other fmt[1] value select the same output. */
#ifdef CONFIG_COMMON_CLK
        return string(buf, end, __clk_get_name(clk), spec);
#else
        return ptr_to_id(buf, end, clk, spec);
#endif
}

/*
 * Print @flags as '|'-separated symbolic names taken from @names, a table
 * terminated by an entry whose ->name is NULL.
 *
 * An entry is printed only when all bits of its mask are set; those bits
 * are then cleared.  Whatever is left once the table is exhausted is
 * printed as a number using default_flag_spec.
 */
static
char *format_flags(char *buf, char *end, unsigned long flags,
                                        const struct trace_print_flags *names)
{
        while (flags && names->name) {
                const unsigned long mask = names->mask;

                if ((flags & mask) == mask) {
                        buf = string(buf, end, names->name, default_str_spec);

                        flags &= ~mask;
                        /* A separator is needed only if more output follows. */
                        if (flags) {
                                if (buf < end)
                                        *buf = '|';
                                buf++;
                        }
                }

                names++;
        }

        if (flags)
                buf = number(buf, end, flags, default_flag_spec);

        return buf;
}

/*
 * Describes one numeric field packed into a page flags word, as consumed
 * by format_page_flags(), which prints it as "name=value".
 */
struct page_flags_fields {
        /* Field width; a width of 0 makes format_page_flags() skip the entry. */
        int width;
        /* Right shift applied to the flags word to reach the field. */
        int shift;
        /* Mask applied to the shifted flags word. */
        int mask;
        /* printf_spec used to print the extracted value. */
        const struct printf_spec *spec;
        /* Label printed in front of the '='. */
        const char *name;
};

/*
 * Fields of the page flags word, in the order format_page_flags() prints
 * them.
 */
static const struct page_flags_fields pff[] = {
        {
                .width  = SECTIONS_WIDTH,
                .shift  = SECTIONS_PGSHIFT,
                .mask   = SECTIONS_MASK,
                .spec   = &default_dec_spec,
                .name   = "section",
        },
        {
                .width  = NODES_WIDTH,
                .shift  = NODES_PGSHIFT,
                .mask   = NODES_MASK,
                .spec   = &default_dec_spec,
                .name   = "node",
        },
        {
                .width  = ZONES_WIDTH,
                .shift  = ZONES_PGSHIFT,
                .mask   = ZONES_MASK,
                .spec   = &default_dec_spec,
                .name   = "zone",
        },
        {
                .width  = LAST_CPUPID_WIDTH,
                .shift  = LAST_CPUPID_PGSHIFT,
                .mask   = LAST_CPUPID_MASK,
                .spec   = &default_flag_spec,
                .name   = "lastcpupid",
        },
        {
                .width  = KASAN_TAG_WIDTH,
                .shift  = KASAN_TAG_PGSHIFT,
                .mask   = KASAN_TAG_MASK,
                .spec   = &default_flag_spec,
                .name   = "kasantag",
        },
};

static
char *format_page_flags(char *buf, char *end, unsigned long flags)
{
        unsigned long main_flags = flags & PAGEFLAGS_MASK;
        bool append = false;
        int i;

        buf = number(buf, end, flags, default_flag_spec);
        if (buf < end)
                *buf = '(';
        buf++;

        /* Page flags from the main area. */
        if (main_flags) {
                buf = format_flags(buf, end, main_flags, pageflag_names);
                append = true;
        }

        /* Page flags from the fields area */
        for (i = 0; i < ARRAY_SIZE(pff); i++) {
                /* Skip undefined fields. */
                if (!pff[i].width)
                        continue;

                /* Format: Flag Name + '=' (equals sign) + Number + '|' (separator) */
                if (append) {
                        if (buf < end)
                                *buf = '|';
                        buf++;
                }

                buf = string(buf, end, pff[i].name, default_str_spec);
                if (buf < end)
                        *buf = '=';
                buf++;
                buf = number(buf, end, (flags >> pff[i].shift) & pff[i].mask,
                             *pff[i].spec);

                append = true;
        }
        if (buf < end)
                *buf = ')';
        buf++;

        return buf;
}

/*
 * Format %pG[pvg]: @flags_ptr points at a flags word whose type is chosen
 * by fmt[1].
 *
 *   'p'  unsigned long page flags, printed by format_page_flags()
 *   'v'  unsigned long, decoded with vmaflag_names
 *   'g'  gfp_t, decoded with gfpflag_names
 *
 * Any other selector prints the "(%pG?)" error string.
 */
static noinline_for_stack
char *flags_string(char *buf, char *end, void *flags_ptr,
                   struct printf_spec spec, const char *fmt)
{
        const char kind = fmt[1];

        if (check_pointer(&buf, end, flags_ptr, spec))
                return buf;

        if (kind == 'p')
                return format_page_flags(buf, end, *(unsigned long *)flags_ptr);

        if (kind == 'v')
                return format_flags(buf, end, *(unsigned long *)flags_ptr,
                                    vmaflag_names);

        if (kind == 'g')
                return format_flags(buf, end,
                                    (__force unsigned long)(*(gfp_t *)flags_ptr),
                                    gfpflag_names);

        return error_string(buf, end, "(%pG?)", spec);
}

/*
 * Print the full name of @fwnode by emitting, for every node from the
 * most distant ancestor down to @fwnode itself, its name prefix followed
 * by its name.
 */
static noinline_for_stack
char *fwnode_full_name_string(struct fwnode_handle *fwnode, char *buf,
                              char *end)
{
        int depth = fwnode_count_parents(fwnode);

        while (depth >= 0) {
                struct fwnode_handle *node = fwnode;

                /*
                 * Only ancestors are looked up (and later released);
                 * @fwnode itself is used as-is because its refcount may
                 * be 0 here.
                 */
                if (depth)
                        node = fwnode_get_nth_parent(fwnode, depth);

                buf = string(buf, end, fwnode_get_name_prefix(node),
                             default_str_spec);
                buf = string(buf, end, fwnode_get_name(node),
                             default_str_spec);

                if (depth)
                        fwnode_handle_put(node);

                depth--;
        }

        return buf;
}

/*
 * Format a struct device_node for %pOF[fnpPFcC].
 *
 * @fmt points just past the 'O'; it must start with 'F'.  Each following
 * character from "fnpPFcC" selects one piece of output, and successive
 * pieces are separated by ':'.  If no valid selector follows, the full
 * name ('f') is printed.
 */
static noinline_for_stack
char *device_node_string(char *buf, char *end, struct device_node *dn,
                         struct printf_spec spec, const char *fmt)
{
        /* Holds the four flag characters written by the 'F' case plus NUL. */
        char tbuf[sizeof("xxxx") + 1];
        const char *p;
        int ret;
        char *buf_start = buf;
        struct property *prop;
        bool has_mult, pass;

        /*
         * Individual pieces are printed with the field width disabled;
         * the caller's spec is handed to widen_string() once at the end.
         */
        struct printf_spec str_spec = spec;
        str_spec.field_width = -1;

        if (fmt[0] != 'F')
                return error_string(buf, end, "(%pO?)", spec);

        if (!IS_ENABLED(CONFIG_OF))
                return error_string(buf, end, "(%pOF?)", spec);

        if (check_pointer(&buf, end, dn, spec))
                return buf;

        /* Simple case: no (valid) selector follows, so print the full name. */
        fmt++;
        if (fmt[0] == '\0' || strcspn(fmt,"fnpPFcC") > 0)
                fmt = "f";

        /* 'pass' is false only for the first selector: no leading ':'. */
        for (pass = false; strspn(fmt,"fnpPFcC"); fmt++, pass = true) {
                int precision;
                if (pass) {
                        if (buf < end)
                                *buf = ':';
                        buf++;
                }

                switch (*fmt) {
                case 'f':        /* full_name */
                        buf = fwnode_full_name_string(of_fwnode_handle(dn), buf,
                                                      end);
                        break;
                case 'n':        /* name */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* Temporarily limit the precision to the part before '@'. */
                        precision = str_spec.precision;
                        str_spec.precision = strchrnul(p, '@') - p;
                        buf = string(buf, end, p, str_spec);
                        str_spec.precision = precision;
                        break;
                case 'p':        /* phandle */
                        buf = number(buf, end, (unsigned int)dn->phandle, default_dec_spec);
                        break;
                case 'P':        /* path-spec */
                        p = fwnode_get_name(of_fwnode_handle(dn));
                        /* A name without a second character is printed as "/". */
                        if (!p[1])
                                p = "/";
                        buf = string(buf, end, p, str_spec);
                        break;
                case 'F':        /* flags */
                        /* One character per flag, '-' when the flag is clear. */
                        tbuf[0] = of_node_check_flag(dn, OF_DYNAMIC) ? 'D' : '-';
                        tbuf[1] = of_node_check_flag(dn, OF_DETACHED) ? 'd' : '-';
                        tbuf[2] = of_node_check_flag(dn, OF_POPULATED) ? 'P' : '-';
                        tbuf[3] = of_node_check_flag(dn, OF_POPULATED_BUS) ? 'B' : '-';
                        tbuf[4] = 0;
                        buf = string_nocheck(buf, end, tbuf, str_spec);
                        break;
                case 'c':        /* major compatible string */
                        /* Nothing is printed when the property cannot be read. */
                        ret = of_property_read_string(dn, "compatible", &p);
                        if (!ret)
                                buf = string(buf, end, p, str_spec);
                        break;
                case 'C':        /* full compatible string */
                        /* Each entry is quoted; entries are joined with ','. */
                        has_mult = false;
                        of_property_for_each_string(dn, "compatible", prop, p) {
                                if (has_mult)
                                        buf = string_nocheck(buf, end, ",", str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);
                                buf = string(buf, end, p, str_spec);
                                buf = string_nocheck(buf, end, "\"", str_spec);

                                has_mult = true;
                        }
                        break;
                default:
                        break;
                }
        }

        return widen_string(buf, buf - buf_start, end, spec);
}

/*
 * Format a struct fwnode_handle for %pfw[fP].
 *
 * @fmt points just past the 'f'; it must start with 'w', otherwise the
 * "(%pf?)" error string is printed.  A following 'P' prints the node
 * name only; 'f' or anything else prints the full name.  The caller's
 * spec is handed to widen_string() for the complete output.
 */
static noinline_for_stack
char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode,
                    struct printf_spec spec, const char *fmt)
{
        char *const buf_start = buf;
        struct printf_spec str_spec = spec;

        str_spec.field_width = -1;

        if (fmt[0] != 'w')
                return error_string(buf, end, "(%pf?)", spec);

        if (check_pointer(&buf, end, fwnode, spec))
                return buf;

        if (fmt[1] == 'P')      /* name */
                buf = string(buf, end, fwnode_get_name(fwnode), str_spec);
        else                    /* 'f' or default: full_name */
                buf = fwnode_full_name_string(fwnode, buf, end);

        return widen_string(buf, buf - buf_start, end, spec);
}

static noinline_for_stack
char *resource_or_range(const char *fmt, char *buf, char *end, void *ptr,
                        struct printf_spec spec)
{
        if (*fmt == 'r' && fmt[1] == 'a')
                return range_string(buf, end, ptr, spec, fmt);
        return resource_string(buf, end, ptr, spec, fmt);
}

/*
 * Handler for the "no_hash_pointers" early parameter.
 *
 * Sets no_hash_pointers and prints a warning banner.  If the flag is
 * already set, nothing is printed again.  @str is not used.  Always
 * returns 0.
 */
int __init no_hash_pointers_enable(char *str)
{
        if (!no_hash_pointers) {
                no_hash_pointers = true;

                pr_warn("**********************************************************\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** This system shows unhashed kernel memory addresses   **\n");
                pr_warn("** via the console, logs, and other interfaces. This    **\n");
                pr_warn("** might reduce the security of your system.            **\n");
                pr_warn("**                                                      **\n");
                pr_warn("** If you see this message and you are not debugging    **\n");
                pr_warn("** the kernel, report this immediately to your system   **\n");
                pr_warn("** administrator!                                       **\n");
                pr_warn("**                                                      **\n");
                pr_warn("**   NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE   **\n");
                pr_warn("**********************************************************\n");
        }

        return 0;
}
early_param("no_hash_pointers", no_hash_pointers_enable);

/*
 * Show a '%p' thing.  A kernel extension is that the '%p' is followed
 * by an extra set of alphanumeric characters that are extended format
 * specifiers.
 *
 * Please update scripts/checkpatch.pl when adding/removing conversion
 * characters.  (Search for "check for vsprintf extension").
 *
 * Right now we handle:
 *
 * - 'S' For symbolic direct pointers (or function descriptors) with offset
 * - 's' For symbolic direct pointers (or function descriptors) without offset
 * - '[Ss]R' as above with __builtin_extract_return_addr() translation
 * - 'S[R]b' as above with module build ID (for use in backtraces)
 * - '[Ff]' %pf and %pF were obsoleted and later removed in favor of
 *            %ps and %pS. Be careful when re-using these specifiers.
 * - 'B' For backtraced symbolic direct pointers with offset
 * - 'Bb' as above with module build ID (for use in backtraces)
 * - 'R' For decoded struct resource, e.g., [mem 0x0-0x1f 64bit pref]
 * - 'r' For raw struct resource, e.g., [mem 0x0-0x1f flags 0x201]
 * - 'ra' For struct ranges, e.g., [range 0x0000000000000000 - 0x00000000000000ff]
 * - 'b[l]' For a bitmap, the number of bits is determined by the field
 *       width which must be explicitly specified either as part of the
 *       format string '%32b[l]' or through '%*b[l]', [l] selects
 *       range-list format instead of hex format
 * - 'M' For a 6-byte MAC address, it prints the address in the
 *       usual colon-separated hex notation
 * - 'm' For a 6-byte MAC address, it prints the hex address without colons
 * - 'MF' For a 6-byte MAC FDDI address, it prints the address
 *       with a dash-separated hex notation
 * - '[mM]R' For a 6-byte MAC address, Reverse order (Bluetooth)
 * - 'I' [46] for IPv4/IPv6 addresses printed in the usual way
 *       IPv4 uses dot-separated decimal without leading 0's (1.2.3.4)
 *       IPv6 uses colon separated network-order 16 bit hex with leading 0's
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - 'i' [46] for 'raw' IPv4/IPv6 addresses
 *       IPv6 omits the colons (01020304...0f)
 *       IPv4 uses dot-separated decimal with leading 0's (010.123.045.006)
 *       [S][pfs]
 *       Generic IPv4/IPv6 address (struct sockaddr *) that falls back to
 *       [4] or [6] and is able to print port [p], flowinfo [f], scope [s]
 * - '[Ii][4S][hnbl]' IPv4 addresses in host, network, big or little endian order
 * - 'I[6S]c' for IPv6 addresses printed as specified by
 *       https://tools.ietf.org/html/rfc5952
 * - 'E[achnops]' For an escaped buffer, where rules are defined by combination
 *                of the following flags (see string_escape_mem() for the
 *                details):
 *                  a - ESCAPE_ANY
 *                  c - ESCAPE_SPECIAL
 *                  h - ESCAPE_HEX
 *                  n - ESCAPE_NULL
 *                  o - ESCAPE_OCTAL
 *                  p - ESCAPE_NP
 *                  s - ESCAPE_SPACE
 *                By default ESCAPE_ANY_NP is used.
 * - 'U' For a 16 byte UUID/GUID, it prints the UUID/GUID in the form
 *       "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
 *       Options for %pU are:
 *         b big endian lower case hex (default)
 *         B big endian UPPER case hex
 *         l little endian lower case hex
 *         L little endian UPPER case hex
 *           big endian output byte order is:
 *             [0][1][2][3]-[4][5]-[6][7]-[8][9]-[10][11][12][13][14][15]
 *           little endian output byte order is:
 *             [3][2][1][0]-[5][4]-[7][6]-[8][9]-[10][11][12][13][14][15]
 * - 'V' For a struct va_format which contains a format string * and va_list *,
 *       call vsnprintf(->format, *->va_list).
 *       Implements a "recursive vsnprintf".
 *       Do not use this feature without some mechanism to verify the
 *       correctness of the format string and va_list arguments.
 * - 'K' For a kernel pointer that should be hidden from unprivileged users.
 *       Use only for procfs, sysfs and similar files, not printk(); please
 *       read the documentation (path below) first.
 * - 'NF' For a netdev_features_t
 * - '4cc' V4L2 or DRM FourCC code, with endianness and raw numerical value.
 * - 'h[CDN]' For a variable-length buffer, it prints it as a hex string with
 *            a certain separator (' ' by default):
 *              C colon
 *              D dash
 *              N no separator
 *            The maximum supported length is 64 bytes of the input. Consider
 *            to use print_hex_dump() for the larger input.
 * - 'a[pd]' For address types [p] phys_addr_t, [d] dma_addr_t and derivatives
 *           (default assumed to be phys_addr_t, passed by reference)
 * - 'd[234]' For a dentry name (optionally 2-4 last components)
 * - 'D[234]' Same as 'd' but for a struct file
 * - 'g' For block_device name (gendisk + partition number)
 * - 't[RT][dt][r][s]' For time and date as represented by:
 *      R    struct rtc_time
 *      T    time64_t
 * - 'C' For a clock, it prints the name (Common Clock Framework) or address
 *       (legacy clock framework) of the clock
 * - 'Cn' For a clock, it prints the name (Common Clock Framework) or address
 *        (legacy clock framework) of the clock
 * - 'G' For flags to be printed as a collection of symbolic strings that would
 *       construct the specific value. Supported flags given by option:
 *       p page flags (see struct page) given as pointer to unsigned long
 *       g gfp flags (GFP_* and __GFP_*) given as pointer to gfp_t
 *       v vma flags (VM_*) given as pointer to unsigned long
 * - 'OF[fnpPcCF]'  For a device tree object
 *                  Without any optional arguments prints the full_name
 *                  f device node full_name
 *                  n device node name
 *                  p device node phandle
 *                  P device node path spec (name + @unit)
 *                  F device node flags
 *                  c major compatible string
 *                  C full compatible string
 * - 'fw[fP]'        For a firmware node (struct fwnode_handle) pointer
 *                Without an option prints the full name of the node
 *                f full name
 *                P node name, including a possible unit address
 * - 'x' For printing the address unmodified. Equivalent to "%lx".
 *       Please read the documentation (path below) before using!
 * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of
 *           bpf_trace_printk() where [ku] prefix specifies either kernel (k)
 *           or user (u) memory to probe, and:
 *              s a string, equivalent to "%s" on direct vsnprintf() use
 *
 * ** When making changes please also update:
 *        Documentation/core-api/printk-formats.rst
 *
 * Note: The default behaviour (unadorned %p) is to hash the address,
 * rendering it useful as a unique identifier.
 *
 * There is also a '%pA' format specifier, but it is only intended to be used
 * from Rust code to format core::fmt::Arguments. Do *not* use it from C.
 * See rust/kernel/print.rs for details.
 */
/*
 * Dispatch a '%p' conversion to the helper selected by the extension
 * character(s) in @fmt (which points just past the 'p').  Unrecognised
 * characters fall back to default_pointer().
 */
static noinline_for_stack
char *pointer(const char *fmt, char *buf, char *end, void *ptr,
              struct printf_spec spec)
{
        switch (*fmt) {
        case 'S':
        case 's':
                /* 'B' skips this descriptor translation step. */
                ptr = dereference_symbol_descriptor(ptr);
                fallthrough;
        case 'B':
                return symbol_string(buf, end, ptr, spec, fmt);
        case 'R':
        case 'r':
                return resource_or_range(fmt, buf, end, ptr, spec);
        case 'h':
                return hex_string(buf, end, ptr, spec, fmt);
        case 'b':
                /* 'bl' selects the list form, plain 'b' the other bitmap form. */
                switch (fmt[1]) {
                case 'l':
                        return bitmap_list_string(buf, end, ptr, spec, fmt);
                default:
                        return bitmap_string(buf, end, ptr, spec, fmt);
                }
        case 'M':                        /* Colon separated: 00:01:02:03:04:05 */
        case 'm':                        /* Contiguous: 000102030405 */
                                        /* [mM]F (FDDI) */
                                        /* [mM]R (Reverse order; Bluetooth) */
                return mac_address_string(buf, end, ptr, spec, fmt);
        case 'I':                        /* Formatted IP supported
                                         * 4:        1.2.3.4
                                         * 6:        0001:0203:...:0708
                                         * 6c:        1::708 or 1::1.2.3.4
                                         */
        case 'i':                        /* Contiguous:
                                         * 4:        001.002.003.004
                                         * 6:   000102...0f
                                         */
                return ip_addr_string(buf, end, ptr, spec, fmt);
        case 'E':
                return escaped_string(buf, end, ptr, spec, fmt);
        case 'U':
                return uuid_string(buf, end, ptr, spec, fmt);
        case 'V':
                return va_format(buf, end, ptr, spec);
        case 'K':
                return restricted_pointer(buf, end, ptr, spec);
        case 'N':
                return netdev_bits(buf, end, ptr, spec, fmt);
        case '4':
                return fourcc_string(buf, end, ptr, spec, fmt);
        case 'a':
                return address_val(buf, end, ptr, spec, fmt);
        case 'd':
                return dentry_name(buf, end, ptr, spec, fmt);
        case 't':
                return time_and_date(buf, end, ptr, spec, fmt);
        case 'C':
                return clock(buf, end, ptr, spec, fmt);
        case 'D':
                return file_dentry_name(buf, end, ptr, spec, fmt);
#ifdef CONFIG_BLOCK
        /* Without CONFIG_BLOCK, 'g' ends up in the default case below. */
        case 'g':
                return bdev_name(buf, end, ptr, spec, fmt);
#endif

        case 'G':
                return flags_string(buf, end, ptr, spec, fmt);
        case 'O':
                /* The helper receives fmt starting after the 'O'. */
                return device_node_string(buf, end, ptr, spec, fmt + 1);
        case 'f':
                /* The helper receives fmt starting after the 'f'. */
                return fwnode_string(buf, end, ptr, spec, fmt + 1);
        case 'A':
                /* Only valid when CONFIG_RUST is enabled; warn once otherwise. */
                if (!IS_ENABLED(CONFIG_RUST)) {
                        WARN_ONCE(1, "Please remove %%pA from non-Rust code\n");
                        return error_string(buf, end, "(%pA?)", spec);
                }
                return rust_fmt_argument(buf, end, ptr);
        case 'x':
                return pointer_string(buf, end, ptr, spec);
        case 'e':
                /* %pe with a non-ERR_PTR gets treated as plain %p */
                if (!IS_ERR(ptr))
                        return default_pointer(buf, end, ptr, spec);
                return err_ptr(buf, end, ptr, spec);
        case 'u':
        case 'k':
                /* Only the 's' sub-specifier is accepted after 'u'/'k'. */
                switch (fmt[1]) {
                case 's':
                        return string(buf, end, ptr, spec);
                default:
                        return error_string(buf, end, "(einval)", spec);
                }
        default:
                return default_pointer(buf, end, ptr, spec);
        }
}

/*
 * Decoder state passed to and returned from format_decode(): the current
 * position in the format string plus what the last decoded token was.
 */
struct fmt {
        const char *str;        // current position in the format string
        unsigned char state;        // enum format_state
        unsigned char size;        // size of numbers
};

/* Designated-initializer helper: table slots are indexed by (character - 32). */
#define SPEC_CHAR(x, flag) [(x)-32] = flag

/*
 * Map a printf flag character (' ', '#', '+', '-', '0') to its flag bit.
 * Returns 0 for every other character.
 */
static unsigned char spec_flag(unsigned char c)
{
        static const unsigned char spec_flag_array[] = {
                SPEC_CHAR(' ', SPACE),
                SPEC_CHAR('#', SPECIAL),
                SPEC_CHAR('+', PLUS),
                SPEC_CHAR('-', LEFT),
                SPEC_CHAR('0', ZEROPAD),
        };
        /* Characters below 32 wrap around to large values and miss the table. */
        const unsigned char slot = c - 32;

        if (slot >= sizeof(spec_flag_array))
                return 0;

        return spec_flag_array[slot];
}

/*
 * Helper function to decode a printf-style format string.
 * Each call decodes one token from the format and returns the updated
 * struct fmt: ->str is advanced past what was consumed, ->state tells
 * the caller what kind of token was found and ->size holds the size of
 * the integer argument, if any.
 * The decoded flags, field width, precision and base are returned
 * through @spec.
 *
 * 'h', 'l', or 'L' for integer fields
 * 'z' support added 23/7/1999 S.H.
 * 'z' changed to 'Z' --davidm 1/25/99
 * 'Z' changed to 'z' --adobriyan 2017-01-25
 * 't' added for ptrdiff_t
 *
 * @fmt: format string position and decoder state carried between calls
 * @spec: receives the flags (such as +, -, #), field width, precision
 *        and base (octal, hex, ...) of the decoded token
 */
/*
 * Decode the next token of a printf-style format.
 *
 * Takes the current decoder state in @fmt and returns the updated state;
 * flags, field width, precision and base of a conversion are stored in
 * @spec.  When the width or precision is given as '*', the function
 * returns early with FORMAT_STATE_WIDTH / FORMAT_STATE_PRECISION so the
 * caller can fetch the argument, store it in @spec and call again; the
 * next call resumes right after the '*'.
 */
static noinline_for_stack
struct fmt format_decode(struct fmt fmt, struct printf_spec *spec)
{
        const char *start = fmt.str;
        char flag;

        /* we finished early by reading the field width */
        if (unlikely(fmt.state == FORMAT_STATE_WIDTH)) {
                /* A negative '*' width means left-justify with |width|. */
                if (spec->field_width < 0) {
                        spec->field_width = -spec->field_width;
                        spec->flags |= LEFT;
                }
                fmt.state = FORMAT_STATE_NONE;
                goto precision;
        }

        /* we finished early by reading the precision */
        if (unlikely(fmt.state == FORMAT_STATE_PRECISION)) {
                if (spec->precision < 0)
                        spec->precision = 0;

                fmt.state = FORMAT_STATE_NONE;
                goto qualifier;
        }

        /* By default */
        fmt.state = FORMAT_STATE_NONE;

        for (; *fmt.str ; fmt.str++) {
                if (*fmt.str == '%')
                        break;
        }

        /* Return the current non-format string */
        if (fmt.str != start || !*fmt.str)
                return fmt;

        /* Process flags. This also skips the first '%' */
        spec->flags = 0;
        do {
                /* this also skips first '%' */
                flag = spec_flag(*++fmt.str);
                spec->flags |= flag;
        } while (flag);

        /* get field width */
        spec->field_width = -1;

        if (isdigit(*fmt.str))
                spec->field_width = skip_atoi(&fmt.str);
        else if (unlikely(*fmt.str == '*')) {
                /* it's the next argument */
                fmt.state = FORMAT_STATE_WIDTH;
                fmt.str++;
                return fmt;
        }

precision:
        /* get the precision */
        spec->precision = -1;
        if (unlikely(*fmt.str == '.')) {
                fmt.str++;
                if (isdigit(*fmt.str)) {
                        spec->precision = skip_atoi(&fmt.str);
                        if (spec->precision < 0)
                                spec->precision = 0;
                } else if (*fmt.str == '*') {
                        /* it's the next argument */
                        fmt.state = FORMAT_STATE_PRECISION;
                        fmt.str++;
                        return fmt;
                }
        }

qualifier:
        /* Set up default numeric format */
        spec->base = 10;
        fmt.state = FORMAT_STATE_NUM;
        fmt.size = sizeof(int);
        /*
         * One entry per possible format byte.  Length modifiers have a
         * non-zero ->size (and, where doubling the letter is allowed, the
         * doubled size in ->flags_or_double_size); conversions have a
         * non-zero ->state (and extra spec flags in ->flags_or_double_size).
         * Everything else is all-zero and therefore invalid.
         */
        static const struct format_state {
                unsigned char state;
                unsigned char size;
                unsigned char flags_or_double_size;
                unsigned char base;
        } lookup_state[256] = {
                // Length
                ['l'] = { 0, sizeof(long), sizeof(long long) },
                ['L'] = { 0, sizeof(long long) },
                ['h'] = { 0, sizeof(short), sizeof(char) },
                ['H'] = { 0, sizeof(char) },        // Questionable historical
                ['z'] = { 0, sizeof(size_t) },
                ['t'] = { 0, sizeof(ptrdiff_t) },

                // Non-numeric formats
                ['c'] = { FORMAT_STATE_CHAR },
                ['s'] = { FORMAT_STATE_STR },
                ['p'] = { FORMAT_STATE_PTR },
                ['%'] = { FORMAT_STATE_PERCENT_CHAR },

                // Numerics
                ['o'] = { FORMAT_STATE_NUM, 0, 0, 8 },
                ['x'] = { FORMAT_STATE_NUM, 0, SMALL, 16 },
                ['X'] = { FORMAT_STATE_NUM, 0, 0, 16 },
                ['d'] = { FORMAT_STATE_NUM, 0, SIGN, 10 },
                ['i'] = { FORMAT_STATE_NUM, 0, SIGN, 10 },
                ['u'] = { FORMAT_STATE_NUM, 0, 0, 10, },

                /*
                 * Since %n poses a greater security risk than
                 * utility, treat it as any other invalid or
                 * unsupported format specifier.
                 */
        };

        const struct format_state *p = lookup_state + (u8)*fmt.str;
        if (p->size) {
                fmt.size = p->size;
                /* A doubled modifier ("ll", "hh") selects the alternate size. */
                if (p->flags_or_double_size && fmt.str[0] == fmt.str[1]) {
                        fmt.size = p->flags_or_double_size;
                        fmt.str++;
                }
                fmt.str++;
                /*
                 * Index with the byte value, exactly like the first lookup:
                 * a plain 'char' index would be negative for bytes >= 0x80
                 * wherever char is signed.
                 */
                p = lookup_state + (u8)*fmt.str;
        }
        if (p->state) {
                if (p->base)
                        spec->base = p->base;
                spec->flags |= p->flags_or_double_size;
                fmt.state = p->state;
                fmt.str++;
                return fmt;
        }

        WARN_ONCE(1, "Please remove unsupported %%%c in format string\n", *fmt.str);
        fmt.state = FORMAT_STATE_INVALID;
        return fmt;
}

/*
 * Store @width in spec->field_width.  If the stored value does not read
 * back equal to @width, warn once and store @width clamped to
 * [-FIELD_WIDTH_MAX, FIELD_WIDTH_MAX] instead.
 */
static void
set_field_width(struct printf_spec *spec, int width)
{
        spec->field_width = width;

        if (!WARN_ONCE(spec->field_width != width, "field width %d too large", width))
                return;

        spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX);
}

/*
 * Store @prec in spec->precision.  If the stored value does not read
 * back equal to @prec, warn once and store @prec clamped to
 * [0, PRECISION_MAX] instead.
 */
static void
set_precision(struct printf_spec *spec, int prec)
{
        spec->precision = prec;

        if (!WARN_ONCE(spec->precision != prec, "precision %d too large", prec))
                return;

        spec->precision = clamp(prec, 0, PRECISION_MAX);
}

/*
 * Widen a 1-, 2- or 4-byte integer argument to 64 bits for printing.
 *
 * @val is the argument as fetched from the va_list, @size is the width of
 * the value in bytes.  The bits above the requested width are dropped by
 * moving the value to the top of a 32-bit word and shifting it back down:
 * arithmetically (sign-extending) when SIGN is set in @spec.flags, logically
 * (zero-extending) otherwise.
 */
static unsigned long long convert_num_spec(unsigned int val, int size, struct printf_spec spec)
{
        unsigned int spare_bits = 32 - size*8;
        unsigned int top_aligned = val << spare_bits;

        if (spec.flags & SIGN)
                return (int)top_aligned >> spare_bits;

        return top_aligned >> spare_bits;
}

/**
 * vsnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt_str: The format string to use
 * @args: Arguments for the format string
 *
 * This function generally follows C99 vsnprintf, but has some
 * extensions and a few limitations:
 *
 *  - ``%n`` is unsupported
 *  - ``%p*`` is handled by pointer()
 *
 * See pointer() or Documentation/core-api/printk-formats.rst for more
 * extensive description.
 *
 * **Please update the documentation in both places when making changes**
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 *
 * If you're not already dealing with a va_list consider using snprintf().
 */
int vsnprintf(char *buf, size_t size, const char *fmt_str, va_list args)
{
        struct printf_spec spec = {0};
        struct fmt fmt = {
                .str = fmt_str,
                .state = FORMAT_STATE_NONE,
        };
        char *dst, *dst_end;

        /*
         * Sizes above INT_MAX are rejected up front.  vsprintf() and
         * sprintf() pass INT_MAX when the real buffer size is unknown.
         */
        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        dst = buf;
        dst_end = buf + size;

        /* buf + size wrapped around: keep dst_end >= buf. */
        if (dst_end < buf) {
                dst_end = ((void *)-1);
                size = dst_end - buf;
        }

        /*
         * Every case below keeps advancing dst even once it has passed
         * dst_end, so that the final dst - buf is the untruncated length;
         * only the actual stores are guarded by dst < dst_end.
         */
        while (*fmt.str) {
                const char *literal = fmt.str;

                fmt = format_decode(fmt, &spec);

                switch (fmt.state) {
                case FORMAT_STATE_NONE: {
                        /* Plain text between conversions. */
                        int len = fmt.str - literal;

                        if (dst < dst_end) {
                                int fits = len;

                                if (dst_end - dst < fits)
                                        fits = dst_end - dst;
                                memcpy(dst, literal, fits);
                        }
                        dst += len;
                        break;
                }

                case FORMAT_STATE_NUM: {
                        unsigned long long num;

                        /* Anything wider than int is fetched as long long. */
                        if (fmt.size > sizeof(int))
                                num = va_arg(args, long long);
                        else
                                num = convert_num_spec(va_arg(args, int), fmt.size, spec);
                        dst = number(dst, dst_end, num, spec);
                        break;
                }

                case FORMAT_STATE_WIDTH:
                        set_field_width(&spec, va_arg(args, int));
                        break;

                case FORMAT_STATE_PRECISION:
                        set_precision(&spec, va_arg(args, int));
                        break;

                case FORMAT_STATE_CHAR: {
                        char c;

                        /* Leading padding unless left-justified. */
                        if (!(spec.flags & LEFT)) {
                                for (; --spec.field_width > 0; ++dst) {
                                        if (dst < dst_end)
                                                *dst = ' ';
                                }
                        }

                        c = (unsigned char) va_arg(args, int);
                        if (dst < dst_end)
                                *dst = c;
                        ++dst;

                        /* Trailing padding; a no-op if the width was used up above. */
                        for (; --spec.field_width > 0; ++dst) {
                                if (dst < dst_end)
                                        *dst = ' ';
                        }
                        break;
                }

                case FORMAT_STATE_STR:
                        dst = string(dst, dst_end, va_arg(args, char *), spec);
                        break;

                case FORMAT_STATE_PTR:
                        dst = pointer(fmt.str, dst, dst_end, va_arg(args, void *),
                                      spec);
                        /* Step over the alphanumeric characters after the conversion. */
                        while (isalnum(*fmt.str))
                                fmt.str++;
                        break;

                case FORMAT_STATE_PERCENT_CHAR:
                        if (dst < dst_end)
                                *dst = '%';
                        ++dst;
                        break;

                default:
                        /*
                         * Presumably the arguments passed gcc's type
                         * checking, but there is no safe or sane way
                         * for us to continue parsing the format and
                         * fetching from the va_list; the remaining
                         * specifiers and arguments would be out of
                         * sync.
                         */
                        goto out;
                }
        }

out:
        /* Terminate inside the buffer, overwriting the last byte if truncated. */
        if (size > 0) {
                if (dst >= dst_end)
                        dst_end[-1] = '\0';
                else
                        *dst = '\0';
        }

        /* the trailing null byte doesn't count towards the total */
        return dst - buf;
}
EXPORT_SYMBOL(vsnprintf);

/**
 * vscnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * Unlike vsnprintf(), the return value is the number of characters that
 * were actually stored in @buf, not counting the trailing '\0': at most
 * @size - 1.  If @size is 0 nothing is formatted and 0 is returned.
 *
 * If you're not already dealing with a va_list consider using scnprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vscnprintf(char *buf, size_t size, const char *fmt, va_list args)
{
        int wanted;

        if (unlikely(!size))
                return 0;

        wanted = vsnprintf(buf, size, fmt, args);

        /* Output was truncated: only size - 1 characters are in the buffer. */
        if (unlikely(wanted >= size))
                return size - 1;

        return wanted;
}
EXPORT_SYMBOL(vscnprintf);

/**
 * snprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end to vsnprintf().  The return value is the number of
 * characters which would be generated for the given input, excluding the
 * trailing null, as per ISO C99.  If the return is greater than or equal
 * to @size, the resulting string is truncated.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int snprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = vsnprintf(buf, size, fmt, ap);
        va_end(ap);

        return ret;
}
EXPORT_SYMBOL(snprintf);

/**
 * scnprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * Variadic front end to vscnprintf().  The return value is the number of
 * characters written into @buf not including the trailing '\0'.  If @size
 * is == 0 the function returns 0.
 */
int scnprintf(char *buf, size_t size, const char *fmt, ...)
{
        va_list ap;
        int written;

        va_start(ap, fmt);
        written = vscnprintf(buf, size, fmt, ap);
        va_end(ap);

        return written;
}
EXPORT_SYMBOL(scnprintf);

/**
 * vsprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @args: Arguments for the format string
 *
 * The buffer size is not known here, so vsnprintf() is called with a
 * limit of INT_MAX.  The function returns the number of characters
 * written into @buf.  Use vsnprintf() or vscnprintf() in order to avoid
 * buffer overflows.
 *
 * If you're not already dealing with a va_list consider using sprintf().
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int vsprintf(char *buf, const char *fmt, va_list args)
{
        int written = vsnprintf(buf, INT_MAX, fmt, args);

        return written;
}
EXPORT_SYMBOL(vsprintf);

/**
 * sprintf - Format a string and place it in a buffer
 * @buf: The buffer to place the result into
 * @fmt: The format string to use
 * @...: Arguments for the format string
 *
 * The buffer size is not known here, so vsnprintf() is called with a
 * limit of INT_MAX.  The function returns the number of characters
 * written into @buf.  Use snprintf() or scnprintf() in order to avoid
 * buffer overflows.
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
int sprintf(char *buf, const char *fmt, ...)
{
        va_list ap;
        int written;

        va_start(ap, fmt);
        written = vsnprintf(buf, INT_MAX, fmt, ap);
        va_end(ap);

        return written;
}
EXPORT_SYMBOL(sprintf);

#ifdef CONFIG_BINARY_PRINTF
/*
 * bprintf service:
 * vbin_printf() - VA arguments to binary data
 * bstr_printf() - Binary data to text string
 */

/**
 * vbin_printf - Parse a format string and place args' binary value in a buffer
 * @bin_buf: The buffer to place args' binary value
 * @size: The size of the buffer (in 32-bit words, not characters)
 * @fmt_str: The format string to use
 * @args: Arguments for the format string
 *
 * The format follows C99 vsnprintf, except that %n is not supported: like
 * any other invalid conversion it ends the processing of the format string.
 *
 * Integer, character and plain pointer arguments are stored in binary form.
 * ``%s`` arguments are copied including their trailing '\0'.  ``%p``
 * extensions other than the ones listed in the function body are formatted
 * by pointer() right away and stored as a nul-terminated string.
 *
 * The return value is the number of words(32bits) which would be generated for
 * the given input.
 *
 * NOTE:
 * If the return value is greater than @size, the resulting bin_buf is NOT
 * valid for bstr_printf().
 */
int vbin_printf(u32 *bin_buf, size_t size, const char *fmt_str, va_list args)
{
        struct fmt fmt = {
                .str = fmt_str,
                .state = FORMAT_STATE_NONE,
        };
        struct printf_spec spec = {0};
        char *str, *end;
        int width;

        str = (char *)bin_buf;
        end = (char *)(bin_buf + size);

/*
 * Fetch the next argument from @args, store it at the aligned write position
 * if it still fits, and advance the position either way.  Evaluates to the
 * fetched value.  8-byte types are fetched as unsigned long long, aligned to
 * sizeof(u32) only and written as two u32 halves; everything else is fetched
 * as int and aligned to its own size.
 */
#define save_arg(type)                                                  \
({                                                                      \
        unsigned long long value;                                       \
        if (sizeof(type) == 8) {                                        \
                unsigned long long val8;                                \
                str = PTR_ALIGN(str, sizeof(u32));                      \
                val8 = va_arg(args, unsigned long long);                \
                if (str + sizeof(type) <= end) {                        \
                        *(u32 *)str = *(u32 *)&val8;                    \
                        *(u32 *)(str + 4) = *((u32 *)&val8 + 1);        \
                }                                                       \
                value = val8;                                           \
        } else {                                                        \
                unsigned int val4;                                      \
                str = PTR_ALIGN(str, sizeof(type));                     \
                val4 = va_arg(args, int);                               \
                if (str + sizeof(type) <= end)                          \
                        *(typeof(type) *)str = (type)(long)val4;        \
                value = (unsigned long long)val4;                       \
        }                                                               \
        str += sizeof(type);                                            \
        value;                                                          \
})

        while (*fmt.str) {
                fmt = format_decode(fmt, &spec);

                switch (fmt.state) {
                case FORMAT_STATE_NONE:
                case FORMAT_STATE_PERCENT_CHAR:
                        /* Literal text consumes no argument. */
                        break;
                case FORMAT_STATE_INVALID:
                        goto out;

                case FORMAT_STATE_WIDTH:
                case FORMAT_STATE_PRECISION:
                        width = (int)save_arg(int);
                        /* Pointers may require the width */
                        if (*fmt.str == 'p')
                                set_field_width(&spec, width);
                        break;

                case FORMAT_STATE_CHAR:
                        save_arg(char);
                        break;

                case FORMAT_STATE_STR: {
                        const char *save_str = va_arg(args, char *);
                        const char *err_msg;
                        size_t len;

                        /* Store the error text instead of a bad pointer. */
                        err_msg = check_pointer_msg(save_str);
                        if (err_msg)
                                save_str = err_msg;

                        len = strlen(save_str) + 1;
                        /*
                         * A string that ends exactly at the end of the buffer
                         * still fits: the return value then equals @size,
                         * which callers treat as a valid buffer, so the data
                         * must really be there.
                         */
                        if (str + len <= end)
                                memcpy(str, save_str, len);
                        str += len;
                        break;
                }

                case FORMAT_STATE_PTR:
                        /* Dereferenced pointers must be done now */
                        switch (*fmt.str) {
                        /* Dereference of functions is still OK */
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                save_arg(void *);
                                break;
                        default:
                                if (!isalnum(*fmt.str)) {
                                        save_arg(void *);
                                        break;
                                }
                                str = pointer(fmt.str, str, end, va_arg(args, void *),
                                              spec);
                                /*
                                 * The text must be nul terminated.  The
                                 * terminator always counts towards the size,
                                 * so a later argument cannot overwrite it and
                                 * a truncated text makes the return value
                                 * exceed @size.
                                 */
                                if (str < end)
                                        *str = '\0';
                                else if (size)
                                        end[-1] = '\0';
                                str++;
                        }
                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt.str))
                                fmt.str++;
                        break;

                case FORMAT_STATE_NUM:
                        if (fmt.size > sizeof(int)) {
                                save_arg(long long);
                        } else {
                                save_arg(int);
                        }
                        break;
                }
        }

out:
        /* Round the byte position up to whole words. */
        return (u32 *)(PTR_ALIGN(str, sizeof(u32))) - bin_buf;
#undef save_arg
}
EXPORT_SYMBOL_GPL(vbin_printf);

/**
 * bstr_printf - Format a string from binary arguments and place it in a buffer
 * @buf: The buffer to place the result into
 * @size: The size of the buffer, including the trailing null space
 * @fmt_str: The format string to use
 * @bin_buf: Binary arguments for the format string
 *
 * This function works like C99 vsnprintf, with the difference that vsnprintf
 * gets its arguments from the stack, while bstr_printf gets them from
 * @bin_buf, a binary buffer that was generated by vbin_printf.
 *
 * The format follows C99 vsnprintf, but has some extensions:
 *  see vsnprintf comment for details.
 *
 * The return value is the number of characters which would
 * be generated for the given input, excluding the trailing
 * '\0', as per ISO C99. If you want to have the exact
 * number of characters written into @buf as return value
 * (not including the trailing '\0'), use vscnprintf(). If the
 * return is greater than or equal to @size, the resulting
 * string is truncated.
 */
int bstr_printf(char *buf, size_t size, const char *fmt_str, const u32 *bin_buf)
{
        struct fmt fmt = {
                .str = fmt_str,
                .state = FORMAT_STATE_NONE,
        };
        struct printf_spec spec = {0};
        char *str, *end;
        const char *args = (const char *)bin_buf;

        if (WARN_ON_ONCE(size > INT_MAX))
                return 0;

        str = buf;
        end = buf + size;

/*
 * Read the next value of the given type from the binary argument stream and
 * advance past it.  8-byte types are aligned to sizeof(u32) only and read as
 * two u32 halves; everything else is aligned to its own size.
 */
#define get_arg(type)                                                   \
({                                                                      \
        typeof(type) value;                                             \
        if (sizeof(type) == 8) {                                        \
                args = PTR_ALIGN(args, sizeof(u32));                    \
                *(u32 *)&value = *(u32 *)args;                          \
                *((u32 *)&value + 1) = *(u32 *)(args + 4);              \
        } else {                                                        \
                args = PTR_ALIGN(args, sizeof(type));                   \
                value = *(typeof(type) *)args;                          \
        }                                                               \
        args += sizeof(type);                                           \
        value;                                                          \
})

        /* Make sure end is always >= buf */
        if (end < buf) {
                end = ((void *)-1);
                size = end - buf;
        }

        /*
         * str keeps advancing past end once the output is truncated so that
         * the return value is the untruncated length; only the stores are
         * guarded by str < end.
         */
        while (*fmt.str) {
                const char *old_fmt = fmt.str;
                unsigned long long num;

                fmt = format_decode(fmt, &spec);
                switch (fmt.state) {
                case FORMAT_STATE_NONE: {
                        /* Plain text between conversions. */
                        int read = fmt.str - old_fmt;
                        if (str < end) {
                                int copy = read;
                                if (copy > end - str)
                                        copy = end - str;
                                memcpy(str, old_fmt, copy);
                        }
                        str += read;
                        continue;
                }

                case FORMAT_STATE_WIDTH:
                        set_field_width(&spec, get_arg(int));
                        continue;

                case FORMAT_STATE_PRECISION:
                        set_precision(&spec, get_arg(int));
                        continue;

                case FORMAT_STATE_CHAR: {
                        char c;

                        if (!(spec.flags & LEFT)) {
                                while (--spec.field_width > 0) {
                                        if (str < end)
                                                *str = ' ';
                                        ++str;
                                }
                        }
                        c = (unsigned char) get_arg(char);
                        if (str < end)
                                *str = c;
                        ++str;
                        while (--spec.field_width > 0) {
                                if (str < end)
                                        *str = ' ';
                                ++str;
                        }
                        continue;
                }

                case FORMAT_STATE_STR: {
                        /* The string itself, including its '\0', is stored inline. */
                        const char *str_arg = args;
                        args += strlen(str_arg) + 1;
                        str = string(str, end, (char *)str_arg, spec);
                        continue;
                }

                case FORMAT_STATE_PTR: {
                        bool process = false;
                        int copy, len;
                        /* Non function dereferences were already done */
                        switch (*fmt.str) {
                        case 'S':
                        case 's':
                        case 'x':
                        case 'K':
                        case 'e':
                                process = true;
                                break;
                        default:
                                if (!isalnum(*fmt.str)) {
                                        process = true;
                                        break;
                                }
                                /*
                                 * Pointer dereference was already processed:
                                 * the formatted text is stored inline.  It
                                 * has to be consumed and counted even when
                                 * the output buffer is already full, otherwise
                                 * the following arguments would be read from
                                 * the wrong offset and the returned length
                                 * would be too small.
                                 */
                                len = strlen(args);
                                if (str < end) {
                                        copy = len;
                                        if (copy > end - str)
                                                copy = end - str;
                                        memcpy(str, args, copy);
                                }
                                str += len;
                                args += len + 1;
                        }
                        if (process)
                                str = pointer(fmt.str, str, end, get_arg(void *), spec);

                        /* skip all alphanumeric pointer suffixes */
                        while (isalnum(*fmt.str))
                                fmt.str++;
                        continue;
                }

                case FORMAT_STATE_PERCENT_CHAR:
                        if (str < end)
                                *str = '%';
                        ++str;
                        continue;

                case FORMAT_STATE_INVALID:
                        goto out;

                case FORMAT_STATE_NUM:
                        /* Anything wider than int was stored as long long. */
                        if (fmt.size > sizeof(int)) {
                                num = get_arg(long long);
                        } else {
                                num = convert_num_spec(get_arg(int), fmt.size, spec);
                        }
                        str = number(str, end, num, spec);
                        continue;
                }
        } /* while(*fmt.str) */

out:
        /* Terminate inside the buffer, overwriting the last byte if truncated. */
        if (size > 0) {
                if (str < end)
                        *str = '\0';
                else
                        end[-1] = '\0';
        }

#undef get_arg

        /* the trailing null byte doesn't count towards the total */
        return str - buf;
}
EXPORT_SYMBOL_GPL(bstr_printf);

#endif /* CONFIG_BINARY_PRINTF */

/**
 * vsscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        format of buffer
 * @args:        arguments
 *
 * Walks @fmt and @buf in parallel.  Parsing stops at the end of @fmt, at the
 * first mismatch between literal text and input, when the input is exhausted
 * before a conversion, or at an invalid/unsupported conversion.
 *
 * Supported conversions are %c, %s, %[ (restricted, see the comment in the
 * function body), %o, %x, %X, %i, %d, %u, %n and %%, with an optional decimal
 * field width and the qualifiers h, hh, l, ll, L and z.  ``%*`` skips input
 * up to the next white space without storing anything.
 *
 * Return: the number of conversions whose result was stored through @args.
 * ``%n`` stores a value but is not counted.
 */
int vsscanf(const char *buf, const char *fmt, va_list args)
{
        const char *str = buf;
        char *next;
        char digit;
        int num = 0;
        u8 qualifier;
        unsigned int base;
        union {
                long long s;
                unsigned long long u;
        } val;
        s16 field_width;
        bool is_sign;

        while (*fmt) {
                /* skip any white space in format */
                /* white space in format matches any amount of
                 * white space, including none, in the input.
                 */
                if (isspace(*fmt)) {
                        fmt = skip_spaces(++fmt);
                        str = skip_spaces(str);
                }

                /* anything that is not a conversion must match exactly */
                if (*fmt != '%' && *fmt) {
                        if (*fmt++ != *str++)
                                break;
                        continue;
                }

                /* the format may have ended with the white space skipped above */
                if (!*fmt)
                        break;
                /* step over the '%' */
                ++fmt;

                /* skip this conversion.
                 * advance both strings to next white space
                 */
                if (*fmt == '*') {
                        if (!*str)
                                break;
                        while (!isspace(*fmt) && *fmt != '%' && *fmt) {
                                /* '%*[' not yet supported, invalid format */
                                if (*fmt == '[')
                                        return num;
                                fmt++;
                        }
                        while (!isspace(*str) && *str)
                                str++;
                        continue;
                }

                /* get field width; -1 means "none given" */
                field_width = -1;
                if (isdigit(*fmt)) {
                        field_width = skip_atoi(&fmt);
                        /* a width that is zero or negative after the s16 store ends parsing */
                        if (field_width <= 0)
                                break;
                }

                /* get conversion qualifier */
                /*
                 * -1 stored in a u8 matches none of the case labels of the
                 * qualifier switch at the end of the loop, so "no qualifier"
                 * ends up in its default (int) branch.
                 */
                qualifier = -1;
                if (*fmt == 'h' || _tolower(*fmt) == 'l' ||
                    *fmt == 'z') {
                        qualifier = *fmt++;
                        /* a doubled qualifier: "hh" is recorded as 'H', "ll" as 'L' */
                        if (unlikely(qualifier == *fmt)) {
                                if (qualifier == 'h') {
                                        qualifier = 'H';
                                        fmt++;
                                } else if (qualifier == 'l') {
                                        qualifier = 'L';
                                        fmt++;
                                }
                        }
                }

                if (!*fmt)
                        break;

                if (*fmt == 'n') {
                        /* return number of characters read so far */
                        *va_arg(args, int *) = str - buf;
                        ++fmt;
                        continue;
                }

                /* every remaining conversion needs input */
                if (!*str)
                        break;

                base = 10;
                is_sign = false;

                /*
                 * The string-like conversions complete inside this switch and
                 * "continue" the outer loop.  The integer conversions only
                 * select base/signedness here and "break" out of the switch
                 * into the common integer code below.
                 */
                switch (*fmt++) {
                case 'c':
                {
                        /*
                         * Copies field_width characters (default 1) or up to
                         * the end of the input, whichever comes first.  White
                         * space is not skipped and no '\0' is appended.
                         */
                        char *s = (char *)va_arg(args, char*);
                        if (field_width == -1)
                                field_width = 1;
                        do {
                                *s++ = *str++;
                        } while (--field_width > 0 && *str);
                        num++;
                }
                continue;
                case 's':
                {
                        char *s = (char *)va_arg(args, char *);
                        if (field_width == -1)
                                field_width = SHRT_MAX;
                        /* first, skip leading white space in buffer */
                        str = skip_spaces(str);

                        /* now copy until next white space */
                        while (*str && !isspace(*str) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        num++;
                }
                continue;
                /*
                 * Warning: This implementation of the '[' conversion specifier
                 * deviates from its glibc counterpart in the following ways:
                 * (1) It does NOT support ranges i.e. '-' is NOT a special
                 *     character
                 * (2) It cannot match the closing bracket ']' itself
                 * (3) A field width is required
                 * (4) '%*[' (discard matching input) is currently not supported
                 *
                 * Example usage:
                 * ret = sscanf("00:0a:95","%2[^:]:%2[^:]:%2[^:]",
                 *                buf1, buf2, buf3);
                 * if (ret < 3)
                 *    // etc..
                 */
                case '[':
                {
                        char *s = (char *)va_arg(args, char *);
                        /* one bit per possible byte value: set means "accepted" */
                        DECLARE_BITMAP(set, 256) = {0};
                        unsigned int len = 0;
                        bool negate = (*fmt == '^');

                        /* field width is required */
                        if (field_width == -1)
                                return num;

                        if (negate)
                                ++fmt;

                        /* collect the listed characters up to the closing ']' */
                        for ( ; *fmt && *fmt != ']'; ++fmt, ++len)
                                __set_bit((u8)*fmt, set);

                        /* no ']' or no character set found */
                        if (!*fmt || !len)
                                return num;
                        ++fmt;

                        if (negate) {
                                bitmap_complement(set, set, 256);
                                /* exclude null '\0' byte */
                                __clear_bit(0, set);
                        }

                        /* match must be non-empty */
                        if (!test_bit((u8)*str, set))
                                return num;

                        while (test_bit((u8)*str, set) && field_width--)
                                *s++ = *str++;
                        *s = '\0';
                        ++num;
                }
                continue;
                case 'o':
                        base = 8;
                        break;
                case 'x':
                case 'X':
                        base = 16;
                        break;
                case 'i':
                        /* base 0 is handed on to the conversion helper below */
                        base = 0;
                        fallthrough;
                case 'd':
                        is_sign = true;
                        fallthrough;
                case 'u':
                        break;
                case '%':
                        /* looking for '%' in str */
                        if (*str++ != '%')
                                return num;
                        continue;
                default:
                        /* invalid format; stop here */
                        return num;
                }

                /* have some sort of integer conversion.
                 * first, skip white space in buffer.
                 */
                str = skip_spaces(str);

                /*
                 * Look at the first digit (after an optional '-' for signed
                 * conversions) to make sure there is a number to convert.
                 * The "break" statements from here on leave the while loop.
                 */
                digit = *str;
                if (is_sign && digit == '-') {
                        /* a width of 1 leaves no room for a digit after the sign */
                        if (field_width == 1)
                                break;

                        digit = *(str + 1);
                }

                if (!digit
                    || (base == 16 && !isxdigit(digit))
                    || (base == 10 && !isdigit(digit))
                    || (base == 8 && !isodigit(digit))
                    || (base == 0 && !isdigit(digit)))
                        break;

                /* without a field width the conversion length is limited to INT_MAX */
                if (is_sign)
                        val.s = simple_strntoll(str, &next, base,
                                                field_width >= 0 ? field_width : INT_MAX);
                else
                        val.u = simple_strntoull(str, &next, base,
                                                 field_width >= 0 ? field_width : INT_MAX);

                /* store the value with the width selected by the qualifier */
                switch (qualifier) {
                case 'H':        /* that's 'hh' in format */
                        if (is_sign)
                                *va_arg(args, signed char *) = val.s;
                        else
                                *va_arg(args, unsigned char *) = val.u;
                        break;
                case 'h':
                        if (is_sign)
                                *va_arg(args, short *) = val.s;
                        else
                                *va_arg(args, unsigned short *) = val.u;
                        break;
                case 'l':
                        if (is_sign)
                                *va_arg(args, long *) = val.s;
                        else
                                *va_arg(args, unsigned long *) = val.u;
                        break;
                case 'L':
                        if (is_sign)
                                *va_arg(args, long long *) = val.s;
                        else
                                *va_arg(args, unsigned long long *) = val.u;
                        break;
                case 'z':
                        *va_arg(args, size_t *) = val.u;
                        break;
                default:
                        if (is_sign)
                                *va_arg(args, int *) = val.s;
                        else
                                *va_arg(args, unsigned int *) = val.u;
                        break;
                }
                num++;

                if (!next)
                        break;
                /* resume right after the converted number */
                str = next;
        }

        return num;
}
EXPORT_SYMBOL(vsscanf);

/**
 * sscanf - Unformat a buffer into a list of arguments
 * @buf:        input buffer
 * @fmt:        formatting of buffer
 * @...:        resulting arguments
 *
 * Variadic front end to vsscanf(); returns whatever vsscanf() returns.
 */
int sscanf(const char *buf, const char *fmt, ...)
{
        va_list ap;
        int converted;

        va_start(ap, fmt);
        converted = vsscanf(buf, fmt, ap);
        va_end(ap);

        return converted;
}
EXPORT_SYMBOL(sscanf);






















 1510 



 1513 


 1512 






 1511 








 1509 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/kernel/return_address.c
 *
 * Copyright (C) 2013 Linaro Limited
 * Author: AKASHI Takahiro <takahiro.akashi@linaro.org>
 */

#include <linux/export.h>
#include <linux/ftrace.h>
#include <linux/kprobes.h>
#include <linux/stacktrace.h>

#include <asm/stack_pointer.h>

/*
 * Cursor shared between return_address() and its stack-walk callback:
 * @level counts the entries that still have to be skipped, @addr receives
 * the pc of the entry seen once that count has dropped to zero.
 */
struct return_address_data {
        unsigned int level;
        void *addr;
};

/*
 * Stack-walk callback.  While levels remain, consume one and return true;
 * once none are left, store @pc in the cursor and return false.
 */
static bool save_return_addr(void *d, unsigned long pc)
{
        struct return_address_data *cursor = d;

        if (cursor->level) {
                cursor->level--;
                return true;
        }

        cursor->addr = (void *)pc;
        return false;
}
NOKPROBE_SYMBOL(save_return_addr);

void *return_address(unsigned int level)
{
        struct return_address_data data;

        data.level = level + 2;
        data.addr = NULL;

        arch_stack_walk(save_return_addr, &data, current, NULL);

        if (!data.level)
                return data.addr;
        else
                return NULL;
}
EXPORT_SYMBOL_GPL(return_address);
NOKPROBE_SYMBOL(return_address);



















































































































































   22 












 1393 
 1390 




 1390 
 1394 
















































































   11 






















































































































































   11 
 1392 






























































   22 


























































    8 





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Author: Paul McKenney <paulmck@linux.ibm.com>
 *           Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                Documentation/RCU/ *.txt
 *
 */

#ifndef _LINUX_SRCU_H
#define _LINUX_SRCU_H

#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/rcu_segcblist.h>

struct srcu_struct;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
                       struct lock_class_key *key);

/*
 * init_srcu_struct - CONFIG_DEBUG_LOCK_ALLOC flavour of the initializer.
 *
 * Every expansion declares its own static lock_class_key and passes it,
 * together with the stringified @ssp expression as the name, to
 * __init_srcu_struct().  The statement expression evaluates to that call's
 * return value.
 */
#define init_srcu_struct(ssp) \
({ \
        static struct lock_class_key __srcu_key; \
        \
        __init_srcu_struct((ssp), #ssp, &__srcu_key); \
})

#define __SRCU_DEP_MAP_INIT(srcu_name)        .dep_map = { .name = #srcu_name },
#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

int init_srcu_struct(struct srcu_struct *ssp);

#define __SRCU_DEP_MAP_INIT(srcu_name)
#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/* Values for SRCU Tree srcu_data ->srcu_reader_flavor, but also used by rcutorture. */
#define SRCU_READ_FLAVOR_NORMAL        0x1                // srcu_read_lock().
#define SRCU_READ_FLAVOR_NMI        0x2                // srcu_read_lock_nmisafe().
#define SRCU_READ_FLAVOR_LITE        0x4                // srcu_read_lock_lite().
#define SRCU_READ_FLAVOR_FAST        0x8                // srcu_read_lock_fast().
#define SRCU_READ_FLAVOR_ALL   (SRCU_READ_FLAVOR_NORMAL | SRCU_READ_FLAVOR_NMI | \
                                SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST) // All of the above.
#define SRCU_READ_FLAVOR_SLOWGP        (SRCU_READ_FLAVOR_LITE | SRCU_READ_FLAVOR_FAST)
                                                // Flavors requiring synchronize_rcu()
                                                // instead of smp_mb().
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);

#ifdef CONFIG_TINY_SRCU
#include <linux/srcutiny.h>
#elif defined(CONFIG_TREE_SRCU)
#include <linux/srcutree.h>
#else
#error "Unknown SRCU implementation specified to kernel configuration"
#endif

void call_srcu(struct srcu_struct *ssp, struct rcu_head *head,
                void (*func)(struct rcu_head *head));
void cleanup_srcu_struct(struct srcu_struct *ssp);
void synchronize_srcu(struct srcu_struct *ssp);

#define SRCU_GET_STATE_COMPLETED 0x1

/**
 * get_completed_synchronize_srcu - Obtain an already-completed polled cookie
 *
 * Return: SRCU_GET_STATE_COMPLETED, a cookie value that
 * poll_state_synchronize_srcu() always treats as belonging to a grace
 * period that has already completed.
 */
static inline unsigned long get_completed_synchronize_srcu(void)
{
        const unsigned long completed_cookie = SRCU_GET_STATE_COMPLETED;

        return completed_cookie;
}

unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);

// Maximum number of unsigned long values corresponding to
// not-yet-completed SRCU grace periods.
#define NUM_ACTIVE_SRCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_srcu - Compare two old-state cookies for equality
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * Both values must come from get_state_synchronize_srcu(),
 * start_poll_synchronize_srcu(), or get_completed_synchronize_srcu().
 * Structures whose lifetimes are tracked by old-state values can use this
 * to push those values to a list header, which lets the structures
 * themselves be slightly smaller.
 *
 * Return: @true when the two values are identical, @false otherwise.
 */
static inline bool same_state_synchronize_srcu(unsigned long oldstate1, unsigned long oldstate2)
{
        if (oldstate1 != oldstate2)
                return false;

        return true;
}

#ifdef CONFIG_NEED_SRCU_NMI_SAFE
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp);
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx) __releases(ssp);
#else
/*
 * Variant built when CONFIG_NEED_SRCU_NMI_SAFE is not defined: forwards to
 * __srcu_read_lock() and returns its result.
 */
static inline int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
        return __srcu_read_lock(ssp);
}
/*
 * Variant built when CONFIG_NEED_SRCU_NMI_SAFE is not defined: forwards
 * @ssp and @idx to __srcu_read_unlock().
 */
static inline void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
        __srcu_read_unlock(ssp, idx);
}
#endif /* CONFIG_NEED_SRCU_NMI_SAFE */

void srcu_init(void);

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/**
 * srcu_read_lock_held - might we be in an SRCU read-side critical section?
 * @ssp: The srcu_struct structure to check
 *
 * With CONFIG_DEBUG_LOCK_ALLOC selected, returns nonzero iff in an SRCU
 * read-side critical section.  Without CONFIG_DEBUG_LOCK_ALLOC, the
 * caller is assumed to be in an SRCU read-side critical section unless
 * it can be proven otherwise.
 *
 * debug_lockdep_rcu_enabled() is consulted first so that no false
 * positives are reported during boot or while lockdep is disabled; in
 * those cases the function simply returns 1.
 *
 * SRCU is based on its own state machine and does not rely on normal
 * RCU, so this may be called from a CPU that is offline or that is in
 * the idle loop from an RCU point of view.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        if (debug_lockdep_rcu_enabled())
                return lock_is_held(&ssp->dep_map);

        return 1;
}

/*
 * Annotations provide deadlock detection for SRCU.
 *
 * Similar to other lockdep annotations, except there is an additional
 * srcu_lock_sync(), which is basically an empty *write*-side critical section,
 * see lock_sync() for more information.
 */

/* Annotates a srcu_read_lock(): records a read acquisition of @map. */
static inline void srcu_lock_acquire(struct lockdep_map *map)
{
        lock_map_acquire_read(map);
}

/* Annotates a srcu_read_unlock(): records the release of @map. */
static inline void srcu_lock_release(struct lockdep_map *map)
{
        lock_map_release(map);
}

/* Annotates a synchronize_srcu(): passes @map to lock_map_sync(). */
static inline void srcu_lock_sync(struct lockdep_map *map)
{
        lock_map_sync(map);
}

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/*
 * Variant built without CONFIG_DEBUG_LOCK_ALLOC: @ssp is not examined and
 * the caller is always assumed to be inside a read-side critical section.
 */
static inline int srcu_read_lock_held(const struct srcu_struct *ssp)
{
        return 1;
}

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC the annotations expand to empty
 * statements; their argument is not evaluated.
 */
#define srcu_lock_acquire(m) do { } while (0)
#define srcu_lock_release(m) do { } while (0)
#define srcu_lock_sync(m) do { } while (0)

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */


/**
 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 * @c: condition to check for update-side use
 *
 * If PROVE_RCU is enabled, invoking this outside of an SRCU read-side
 * critical section will result in an RCU-lockdep splat, unless @c evaluates
 * to 1.  The @c argument will normally be a logical expression containing
 * lockdep_is_held() calls.
 *
 * The condition handed to __rcu_dereference_check() is
 * "(c) || srcu_read_lock_held(ssp)", so @ssp is only consulted when @c
 * evaluates to false.
 */
#define srcu_dereference_check(p, ssp, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || srcu_read_lock_held(ssp), __rcu)

/**
 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct, which is used to check that we
 *        really are in an SRCU read-side critical section.
 *
 * Makes srcu_dereference_check() do the dirty work, passing 0 as the
 * update-side condition.  If PROVE_RCU is enabled, invoking this outside
 * of an SRCU read-side critical section will result in an RCU-lockdep
 * splat.
 */
#define srcu_dereference(p, ssp) srcu_dereference_check((p), (ssp), 0)

/**
 * srcu_dereference_notrace - no tracing and no lockdep calls from here
 * @p: the pointer to fetch and protect for later dereferencing
 * @ssp: pointer to the srcu_struct protecting @p.
 *
 * Expands to srcu_dereference_check() with the condition hard-wired to 1.
 * Because that macro tests "(c) || srcu_read_lock_held(ssp)", the
 * srcu_read_lock_held() call is short-circuited away and @ssp is not
 * actually checked.
 */
#define srcu_dereference_notrace(p, ssp) srcu_dereference_check((p), (ssp), 1)

/**
 * srcu_read_lock - enter an SRCU read-side critical section.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * SRCU read-side critical sections may be nested.  It is however illegal
 * to call, from within one, anything that waits on an SRCU grace period
 * for the same srcu_struct, whether directly or indirectly.  One indirect
 * way of waiting on an SRCU grace period is to acquire a mutex that is
 * held elsewhere across a call to synchronize_srcu() or
 * synchronize_srcu_expedited().
 *
 * srcu_read_lock() and its matching srcu_read_unlock() must occur in the
 * same context.  For example, it is illegal to invoke srcu_read_unlock()
 * in an irq handler when the matching srcu_read_lock() ran in process
 * context, or to invoke srcu_read_unlock() from one task and the matching
 * srcu_read_lock() from another.
 *
 * Return: a value that is guaranteed to be non-negative and that must be
 * passed unaltered to the matching srcu_read_unlock().
 */
static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        idx = __srcu_read_lock(ssp);
        /* Tell lockdep about the reader only after it is registered. */
        srcu_lock_acquire(&ssp->dep_map);
        return idx;
}

/**
 * srcu_read_lock_fast - enter a light-weight SRCU read-side critical section.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Like srcu_read_lock() (which see), but for a light-weight
 * smp_mb()-free reader.
 *
 * Once srcu_read_lock_fast() has been used on an srcu_struct structure,
 * none of the other flavors may be used on it, whether before, during,
 * or after.  Grace-period auto-expediting is disabled for _fast
 * srcu_struct structures because auto-expedited grace periods invoke
 * synchronize_rcu_expedited(), IPIs and all.
 *
 * srcu_read_lock_fast() may be invoked only from contexts where RCU is
 * watching, that is, from contexts where it would be legal to invoke
 * rcu_read_lock().  Otherwise, lockdep will complain.
 *
 * Return: the per-CPU counter pointer to hand to the matching
 * srcu_read_unlock_fast().
 */
static inline struct srcu_ctr __percpu *srcu_read_lock_fast(struct srcu_struct *ssp) __acquires(ssp)
{
        struct srcu_ctr __percpu *scp;

        srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
        scp = __srcu_read_lock_fast(ssp);
        rcu_try_lock_acquire(&ssp->dep_map);
        return scp;
}

/**
 * srcu_down_read_fast - enter a semaphore-like, light-weight SRCU reader.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Enter a semaphore-like SRCU read-side critical section, but for a
 * light-weight smp_mb()-free reader.  See srcu_read_lock_fast() and
 * srcu_down_read() for more information.
 *
 * srcu_down_read_fast() and srcu_read_lock_fast() may be used
 * concurrently on the same srcu_struct.
 *
 * With CONFIG_PROVE_RCU enabled, a call from NMI context triggers a
 * WARN_ON_ONCE().
 */
static inline struct srcu_ctr __percpu *srcu_down_read_fast(struct srcu_struct *ssp) __acquires(ssp)
{
        struct srcu_ctr __percpu *scp;

        WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
        srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_FAST);
        scp = __srcu_read_lock_fast(ssp);
        return scp;
}

/**
 * srcu_read_lock_lite - enter a light-weight SRCU read-side critical section.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Like srcu_read_lock() (which see), but for a light-weight
 * smp_mb()-free reader.
 *
 * Once srcu_read_lock_lite() has been used on an srcu_struct structure,
 * none of the other flavors may be used on it, whether before, during,
 * or after.  Grace-period auto-expediting is disabled for _lite
 * srcu_struct structures because auto-expedited grace periods invoke
 * synchronize_rcu_expedited(), IPIs and all.
 *
 * srcu_read_lock_lite() may be invoked only from contexts where RCU is
 * watching, that is, from contexts where it would be legal to invoke
 * rcu_read_lock().  Otherwise, lockdep will complain.
 *
 * Return: the index to hand to the matching srcu_read_unlock_lite().
 */
static inline int srcu_read_lock_lite(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_read_flavor_force(ssp, SRCU_READ_FLAVOR_LITE);
        idx = __srcu_read_lock_lite(ssp);
        rcu_try_lock_acquire(&ssp->dep_map);
        return idx;
}

/**
 * srcu_read_lock_nmisafe - enter an SRCU read-side critical section, NMI-safely.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * Like srcu_read_lock() (which see), but done in an NMI-safe manner.
 *
 * Once srcu_read_lock_nmisafe() has been used on an srcu_struct
 * structure, none of the other flavors may be used on it, whether
 * before, during, or after.
 *
 * Return: the index to hand to the matching srcu_read_unlock_nmisafe().
 */
static inline int srcu_read_lock_nmisafe(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
        idx = __srcu_read_lock_nmisafe(ssp);
        rcu_try_lock_acquire(&ssp->dep_map);
        return idx;
}

/*
 * Reader entry point for the tracing code itself: it cannot be traced and
 * cannot invoke lockdep, so no dep_map annotation is made here.
 */
static inline notrace int
srcu_read_lock_notrace(struct srcu_struct *ssp) __acquires(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        return __srcu_read_lock(ssp);
}

/**
 * srcu_down_read - enter a semaphore-like SRCU read-side critical section.
 * @ssp: srcu_struct in which to register the new reader.
 *
 * SRCU read-side critical sections may be nested.  It is however illegal
 * to call, from within one, anything that waits on an SRCU grace period
 * for the same srcu_struct, whether directly or indirectly.  One indirect
 * way of waiting on an SRCU grace period is to acquire a mutex that is
 * held elsewhere across a call to synchronize_srcu() or
 * synchronize_srcu_expedited().  If you want lockdep to help you keep
 * this stuff straight, use srcu_read_lock() instead.
 *
 * Being semaphore-like, srcu_down_read() allows the matching
 * srcu_up_read() to be invoked from some other context, for example
 * from some other task or from an irq handler.  Neither srcu_down_read()
 * nor srcu_up_read() may be invoked from an NMI handler, though.
 *
 * Calls to srcu_down_read() may be nested in the same way calls to
 * down_read() may be nested.  srcu_down_read() and srcu_read_lock() may
 * be used concurrently on the same srcu_struct.
 *
 * Return: the index to hand to the matching srcu_up_read().
 */
static inline int srcu_down_read(struct srcu_struct *ssp) __acquires(ssp)
{
        int idx;

        WARN_ON_ONCE(in_nmi());
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        idx = __srcu_read_lock(ssp);
        return idx;
}

/**
 * srcu_read_unlock - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock().
 *
 * Exit an SRCU read-side critical section.
 */
static inline void srcu_read_unlock(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Only bit 0 may be set in @idx; anything else trips WARN_ON_ONCE(). */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        /* Drop the lockdep annotation before the reader is unregistered. */
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_read_unlock_fast - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @scp: return value from corresponding srcu_read_lock_fast().
 *
 * Exit a light-weight SRCU read-side critical section.
 */
static inline void srcu_read_unlock_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
        __releases(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
        /* Drop the lockdep annotation before the reader is unregistered. */
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_fast(ssp, scp);
}

/**
 * srcu_up_read_fast - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @scp: return value from corresponding srcu_down_read_fast().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read_fast().
 */
static inline void srcu_up_read_fast(struct srcu_struct *ssp, struct srcu_ctr __percpu *scp)
        __releases(ssp)
{
        /* With CONFIG_PROVE_RCU, complain once about use from NMI context. */
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && in_nmi());
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_FAST);
        /* No lockdep release here: srcu_down_read_fast() made no annotation. */
        __srcu_read_unlock_fast(ssp, scp);
}

/**
 * srcu_read_unlock_lite - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_lite().
 *
 * Exit a light-weight SRCU read-side critical section.
 */
static inline void srcu_read_unlock_lite(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Only bit 0 may be set in @idx; anything else trips WARN_ON_ONCE(). */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_LITE);
        /* Drop the lockdep annotation before the reader is unregistered. */
        srcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_lite(ssp, idx);
}

/**
 * srcu_read_unlock_nmisafe - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_read_lock_nmisafe().
 *
 * Exit an SRCU read-side critical section, but in an NMI-safe manner.
 */
static inline void srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Only bit 0 may be set in @idx; anything else trips WARN_ON_ONCE(). */
        WARN_ON_ONCE(idx & ~0x1);
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NMI);
        /* Drop the lockdep annotation before the reader is unregistered. */
        rcu_lock_release(&ssp->dep_map);
        __srcu_read_unlock_nmisafe(ssp, idx);
}

/*
 * Used by tracing, cannot be traced and cannot call lockdep.
 * Counterpart of srcu_read_lock_notrace(); @idx is the value it returned.
 */
static inline notrace void
srcu_read_unlock_notrace(struct srcu_struct *ssp, int idx) __releases(ssp)
{
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        __srcu_read_unlock(ssp, idx);
}

/**
 * srcu_up_read - unregister an old reader from an SRCU-protected structure.
 * @ssp: srcu_struct in which to unregister the old reader.
 * @idx: return value from corresponding srcu_down_read().
 *
 * Exit an SRCU read-side critical section, but not necessarily from
 * the same context as the matching srcu_down_read().
 */
static inline void srcu_up_read(struct srcu_struct *ssp, int idx)
        __releases(ssp)
{
        /* Only bit 0 may be set in @idx; anything else trips WARN_ON_ONCE(). */
        WARN_ON_ONCE(idx & ~0x1);
        /* Complain once about use from NMI context. */
        WARN_ON_ONCE(in_nmi());
        srcu_check_read_flavor(ssp, SRCU_READ_FLAVOR_NORMAL);
        __srcu_read_unlock(ssp, idx);
}

/**
 * smp_mb__after_srcu_read_unlock - ensure full ordering after srcu_read_unlock
 *
 * Converts the preceding srcu_read_unlock into a two-way memory barrier.
 *
 * Call this after srcu_read_unlock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_unlock will appear to happen after
 * the preceding srcu_read_unlock.
 *
 * The body is intentionally empty.
 */
static inline void smp_mb__after_srcu_read_unlock(void)
{
        /* __srcu_read_unlock has smp_mb() internally so nothing to do here. */
}

/**
 * smp_mb__after_srcu_read_lock - ensure full ordering after srcu_read_lock
 *
 * Converts the preceding srcu_read_lock into a two-way memory barrier.
 *
 * Call this after srcu_read_lock, to guarantee that all memory operations
 * that occur after smp_mb__after_srcu_read_lock will appear to happen after
 * the preceding srcu_read_lock.
 *
 * The body is intentionally empty.
 */
static inline void smp_mb__after_srcu_read_lock(void)
{
        /* __srcu_read_lock has smp_mb() internally so nothing to do here. */
}

/*
 * Lock guard named "srcu" for struct srcu_struct: the lock expression calls
 * srcu_read_lock() and keeps the returned index in the guard's ->idx
 * member; the unlock expression passes that index back to
 * srcu_read_unlock().
 */
DEFINE_LOCK_GUARD_1(srcu, struct srcu_struct,
                    _T->idx = srcu_read_lock(_T->lock),
                    srcu_read_unlock(_T->lock, _T->idx),
                    int idx)

#endif



























































    3 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM notifier

#if !defined(_TRACE_NOTIFIERS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NOTIFIERS_H

#include <linux/tracepoint.h>

/*
 * notifier_info - event class shared by the notifier tracepoints
 *
 * @cb:                callback pointer
 *
 * Stores the callback pointer in the trace entry and prints it with "%ps".
 */
DECLARE_EVENT_CLASS(notifier_info,

        TP_PROTO(void *cb),

        TP_ARGS(cb),

        TP_STRUCT__entry(
                __field(void *, cb)
        ),

        TP_fast_assign(
                __entry->cb = cb;
        ),

        TP_printk("%ps", __entry->cb)
);

/*
 * notifier_register - called upon notifier callback registration
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class.
 */
DEFINE_EVENT(notifier_info, notifier_register,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_unregister - called upon notifier callback unregistration
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class.
 */
DEFINE_EVENT(notifier_info, notifier_unregister,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

/*
 * notifier_run - called upon notifier callback execution
 *
 * @cb:                callback pointer
 *
 * Instance of the notifier_info event class.
 */
DEFINE_EVENT(notifier_info, notifier_run,

        TP_PROTO(void *cb),

        TP_ARGS(cb)
);

#endif /* _TRACE_NOTIFIERS_H */

/* This part must be outside protection */
#include <trace/define_trace.h>











































  451 















































  576 








































  398 

  399 









  213 












  213 






1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/lockref.h>

#if USE_CMPXCHG_LOCKREF

/*
 * CMPXCHG_LOOP - lockless update attempt on the "lockref" pointer that is
 * in scope at the point of expansion.
 *
 * Takes a snapshot ("old") of lockref->lock_count and, for as long as that
 * snapshot shows the embedded spinlock unlocked, copies it to "new", runs
 * CODE (which may modify "new", or leave early via return or break), and
 * tries to install "new" with a 64-bit compare-and-exchange.  SUCCESS runs
 * when the exchange succeeds.  The loop is abandoned after 100 iterations
 * or as soon as the snapshot shows the lock held; whatever follows the
 * macro in the caller then serves as the fallback.
 *
 * Note that the "cmpxchg()" reloads the "old" value for the
 * failure case.
 */
#define CMPXCHG_LOOP(CODE, SUCCESS) do {                                        \
        int retry = 100;                                                        \
        struct lockref old;                                                        \
        BUILD_BUG_ON(sizeof(old) != 8);                                                \
        old.lock_count = READ_ONCE(lockref->lock_count);                        \
        while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {          \
                struct lockref new = old;                                        \
                CODE                                                                \
                if (likely(try_cmpxchg64_relaxed(&lockref->lock_count,                \
                                                 &old.lock_count,                \
                                                 new.lock_count))) {                \
                        SUCCESS;                                                \
                }                                                                \
                if (!--retry)                                                        \
                        break;                                                        \
        }                                                                        \
} while (0)

#else

#define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0)

#endif

/**
 * lockref_get - Increments reference count unconditionally
 * @lockref: pointer to lockref structure
 *
 * This operation is only valid if you already hold a reference
 * to the object, so you know the count cannot be zero.
 */
void lockref_get(struct lockref *lockref)
{
        /* Fast path: bump the count with cmpxchg while the lock is free. */
        CMPXCHG_LOOP(
                new.count++;
        ,
                return;
        );

        /* Fallback: do the increment under the spinlock. */
        spin_lock(&lockref->lock);
        lockref->count++;
        spin_unlock(&lockref->lock);
}
EXPORT_SYMBOL(lockref_get);

/**
 * lockref_get_not_zero - Increments count unless the count is 0 or dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count was zero
 */
bool lockref_get_not_zero(struct lockref *lockref)
{
        bool retval = false;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count <= 0)
                        return false;
        ,
                return true;
        );

        spin_lock(&lockref->lock);
        if (lockref->count > 0) {
                lockref->count++;
                retval = true;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_zero);

/**
 * lockref_put_return - Decrement reference count if possible
 * @lockref: pointer to lockref structure
 *
 * Decrement the reference count and return the new value.
 * If the lockref was dead or locked, return -1.
 */
int lockref_put_return(struct lockref *lockref)
{
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 0)
                        return -1;
        ,
                return new.count;
        );
        return -1;
}
EXPORT_SYMBOL(lockref_put_return);

/**
 * lockref_put_or_lock - decrements count unless count <= 1 before decrement
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken
 *
 * When 0 is returned the spinlock is held on return and the count has not
 * been changed.
 */
bool lockref_put_or_lock(struct lockref *lockref)
{
        /* Lockless attempt; a count <= 1 leaves the loop for the locked path. */
        CMPXCHG_LOOP(
                new.count--;
                if (old.count <= 1)
                        break;
        ,
                return true;
        );

        spin_lock(&lockref->lock);
        /* Deliberately return with the lock still held, as documented above. */
        if (lockref->count <= 1)
                return false;
        lockref->count--;
        spin_unlock(&lockref->lock);
        return true;
}
EXPORT_SYMBOL(lockref_put_or_lock);

/**
 * lockref_mark_dead - mark lockref dead
 * @lockref: pointer to lockref structure
 *
 * The caller must hold @lockref->lock; this is asserted.  The count is set
 * to a negative value, which the lockref_get_not_zero() and
 * lockref_get_not_dead() checks in this file reject.
 */
void lockref_mark_dead(struct lockref *lockref)
{
        assert_spin_locked(&lockref->lock);
        lockref->count = -128;
}
EXPORT_SYMBOL(lockref_mark_dead);

/**
 * lockref_get_not_dead - Increments count unless the ref is dead
 * @lockref: pointer to lockref structure
 * Return: 1 if count updated successfully or 0 if lockref was dead
 */
bool lockref_get_not_dead(struct lockref *lockref)
{
        bool retval = false;

        CMPXCHG_LOOP(
                new.count++;
                if (old.count < 0)
                        return false;
        ,
                return true;
        );

        spin_lock(&lockref->lock);
        if (lockref->count >= 0) {
                lockref->count++;
                retval = true;
        }
        spin_unlock(&lockref->lock);
        return retval;
}
EXPORT_SYMBOL(lockref_get_not_dead);



























































































































































































































































    3 


    3 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Network interface table.
 *
 * Network interfaces (devices) do not have a security field, so we
 * maintain a table associating each interface with a SID.
 *
 * Author: James Morris <jmorris@redhat.com>
 *
 * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com>
 * Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
 *                      Paul Moore <paul@paul-moore.com>
 */
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>

#include "security.h"
#include "objsec.h"
#include "netif.h"

/*
 * Number of hash buckets.  sel_netif_hashfn() masks with
 * (SEL_NETIF_HASH_SIZE - 1), so this has to stay a power of two.
 */
#define SEL_NETIF_HASH_SIZE        64
/* Upper bound on the number of records; sel_netif_insert() enforces it. */
#define SEL_NETIF_HASH_MAX        1024

/* One cached interface record. */
struct sel_netif {
        /* Links the record into its sel_netif_hash[] bucket. */
        struct list_head list;
        /* Cached data; the code in this file uses its ns, ifindex and sid. */
        struct netif_security_struct nsec;
        /* Handed to kfree_rcu() when the record is destroyed. */
        struct rcu_head rcu_head;
};

/* Current number of records; updated by insert and destroy. */
static u32 sel_netif_total;
/* Taken (with _bh) by the slow lookup, kill and flush paths. */
static DEFINE_SPINLOCK(sel_netif_lock);
/* The table itself: one list head per bucket. */
static struct list_head sel_netif_hash[SEL_NETIF_HASH_SIZE];

/**
 * sel_netif_hashfn - Map an interface to a hash bucket
 * @ns: the network namespace
 * @ifindex: the network interface
 *
 * Description:
 * Combine the namespace pointer value with @ifindex and reduce the sum to a
 * bucket number in the range [0, SEL_NETIF_HASH_SIZE).
 *
 */
static inline u32 sel_netif_hashfn(const struct net *ns, int ifindex)
{
        const uintptr_t key = (uintptr_t)ns + ifindex;

        return key & (SEL_NETIF_HASH_SIZE - 1);
}

/**
 * sel_netif_find - Search for an interface record
 * @ns: the network namespace
 * @ifindex: the network interface
 *
 * Description:
 * Search the network interface table and return the record matching @ifindex.
 * If an entry can not be found in the table return NULL.
 *
 */
static inline struct sel_netif *sel_netif_find(const struct net *ns,
                                               int ifindex)
{
        u32 idx = sel_netif_hashfn(ns, ifindex);
        struct sel_netif *netif;

        list_for_each_entry_rcu(netif, &sel_netif_hash[idx], list)
                if (net_eq(netif->nsec.ns, ns) &&
                    netif->nsec.ifindex == ifindex)
                        return netif;

        return NULL;
}

/**
 * sel_netif_insert - Add a record to the interface table
 * @netif: the new interface record
 *
 * Description:
 * Link @netif into its hash bucket and bump the record count.  Returns zero
 * on success, or -ENOSPC when the table already holds SEL_NETIF_HASH_MAX
 * records.
 *
 */
static int sel_netif_insert(struct sel_netif *netif)
{
        struct list_head *bucket;

        if (sel_netif_total >= SEL_NETIF_HASH_MAX)
                return -ENOSPC;

        bucket = &sel_netif_hash[sel_netif_hashfn(netif->nsec.ns,
                                                  netif->nsec.ifindex)];
        list_add_rcu(&netif->list, bucket);
        sel_netif_total++;

        return 0;
}

/**
 * sel_netif_destroy - Drop a record from the interface table
 * @netif: the existing interface record
 *
 * Description:
 * Account for the removal, unlink @netif from its bucket and hand it to
 * kfree_rcu() for freeing.
 *
 */
static void sel_netif_destroy(struct sel_netif *netif)
{
        sel_netif_total--;
        list_del_rcu(&netif->list);
        kfree_rcu(netif, rcu_head);
}

/**
 * sel_netif_sid_slow - Lookup the SID of a network interface using the policy
 * @ns: the network namespace
 * @ifindex: the network interface
 * @sid: interface SID
 *
 * Description:
 * This function determines the SID of a network interface by querying the
 * security policy.  The result is added to the network interface table to
 * speedup future queries.  Returns zero on success, negative values on
 * failure.  Adding the result to the table is best effort: an allocation or
 * insertion failure does not change the return value.
 *
 */
static int sel_netif_sid_slow(struct net *ns, int ifindex, u32 *sid)
{
        int ret = 0;
        struct sel_netif *netif;
        struct sel_netif *new;
        struct net_device *dev;

        /* The device is resolved in the caller-supplied namespace @ns. */

        dev = dev_get_by_index(ns, ifindex);
        if (unlikely(dev == NULL)) {
                pr_warn("SELinux: failure in %s(), invalid network interface (%d)\n",
                        __func__, ifindex);
                return -ENOENT;
        }

        spin_lock_bh(&sel_netif_lock);
        /*
         * Look again now that the lock is held; presumably another context
         * may have added the record since the caller's lockless lookup.
         */
        netif = sel_netif_find(ns, ifindex);
        if (netif != NULL) {
                *sid = netif->nsec.sid;
                goto out;
        }

        ret = security_netif_sid(dev->name, sid);
        if (ret != 0)
                goto out;
        /* GFP_ATOMIC: the spinlock is held across the allocation. */
        new = kzalloc(sizeof(*new), GFP_ATOMIC);
        if (new) {
                new->nsec.ns = ns;
                new->nsec.ifindex = ifindex;
                new->nsec.sid = *sid;
                /* A full table just means the result is not cached. */
                if (sel_netif_insert(new))
                        kfree(new);
        }

out:
        spin_unlock_bh(&sel_netif_lock);
        /* Release the reference obtained by dev_get_by_index(). */
        dev_put(dev);
        if (unlikely(ret))
                pr_warn("SELinux: failure in %s(), unable to determine network interface label (%d)\n",
                        __func__, ifindex);
        return ret;
}

/**
 * sel_netif_sid - Lookup the SID of a network interface
 * @ns: the network namespace
 * @ifindex: the network interface
 * @sid: interface SID
 *
 * Description:
 * Try the interface table first, under rcu_read_lock().  On a hit the cached
 * SID is copied to @sid and zero is returned.  On a miss the work is passed
 * to sel_netif_sid_slow(), whose return value is returned unchanged.
 *
 */
int sel_netif_sid(struct net *ns, int ifindex, u32 *sid)
{
        struct sel_netif *cached;
        bool hit = false;

        rcu_read_lock();
        cached = sel_netif_find(ns, ifindex);
        if (likely(cached != NULL)) {
                *sid = cached->nsec.sid;
                hit = true;
        }
        rcu_read_unlock();

        if (hit)
                return 0;

        return sel_netif_sid_slow(ns, ifindex, sid);
}

/**
 * sel_netif_kill - Drop one interface from the table
 * @ns: the network namespace
 * @ifindex: the network interface
 *
 * Description:
 * Look up the record for @ns and @ifindex with sel_netif_lock held and
 * destroy it if one is found.  Does nothing when there is no such record.
 *
 */
static void sel_netif_kill(const struct net *ns, int ifindex)
{
        struct sel_netif *victim;

        rcu_read_lock();
        spin_lock_bh(&sel_netif_lock);

        victim = sel_netif_find(ns, ifindex);
        if (victim != NULL)
                sel_netif_destroy(victim);

        spin_unlock_bh(&sel_netif_lock);
        rcu_read_unlock();
}

/**
 * sel_netif_flush - Flush the entire network interface table
 *
 * Description:
 * Remove all entries from the network interface table.  Every bucket is
 * emptied while sel_netif_lock is held.
 *
 */
void sel_netif_flush(void)
{
        int idx;
        struct sel_netif *netif;
        struct sel_netif *next;

        spin_lock_bh(&sel_netif_lock);
        for (idx = 0; idx < SEL_NETIF_HASH_SIZE; idx++) {
                /*
                 * sel_netif_destroy() unlinks @netif and queues it for
                 * freeing, so the successor must be fetched before the call
                 * rather than read back from the entry that was just removed.
                 */
                list_for_each_entry_safe(netif, next, &sel_netif_hash[idx],
                                         list)
                        sel_netif_destroy(netif);
        }
        spin_unlock_bh(&sel_netif_lock);
}

/*
 * Netdevice notifier callback: when an interface goes down (NETDEV_DOWN),
 * drop its record from the interface table.  All other events are ignored.
 * Always returns NOTIFY_DONE.
 */
static int sel_netif_netdev_notifier_handler(struct notifier_block *this,
                                             unsigned long event, void *ptr)
{
        struct net_device *dev;

        if (event != NETDEV_DOWN)
                return NOTIFY_DONE;

        dev = netdev_notifier_info_to_dev(ptr);
        sel_netif_kill(dev_net(dev), dev->ifindex);

        return NOTIFY_DONE;
}

/* Notifier block registered by sel_netif_init() to receive netdevice events. */
static struct notifier_block sel_netif_netdev_notifier = {
        .notifier_call = sel_netif_netdev_notifier_handler,
};

static __init int sel_netif_init(void)
{
        int i;

        if (!selinux_enabled_boot)
                return 0;

        for (i = 0; i < SEL_NETIF_HASH_SIZE; i++)
                INIT_LIST_HEAD(&sel_netif_hash[i]);

        register_netdevice_notifier(&sel_netif_netdev_notifier);

        return 0;
}

__initcall(sel_netif_init);






























































































































































































































































































  149 


















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/ptrace.h
 *
 * Copyright (C) 1996-2003 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_PTRACE_H
#define __ASM_PTRACE_H

#include <asm/cpufeature.h>

#include <uapi/asm/ptrace.h>

/* Current Exception Level values, as contained in CurrentEL */
#define CurrentEL_EL1                (1 << 2)
#define CurrentEL_EL2                (2 << 2)

#define INIT_PSTATE_EL1 \
        (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h)
#define INIT_PSTATE_EL2 \
        (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL2h)

#include <linux/irqchip/arm-gic-v3-prio.h>

#define GIC_PRIO_IRQON                GICV3_PRIO_UNMASKED
#define GIC_PRIO_IRQOFF                GICV3_PRIO_IRQ

#define GIC_PRIO_PSR_I_SET        GICV3_PRIO_PSR_I_SET

/* Additional SPSR bits not exposed in the UABI */
#define PSR_MODE_THREAD_BIT        (1 << 0)
#define PSR_IL_BIT                (1 << 20)

/* AArch32-specific ptrace requests */
#define COMPAT_PTRACE_GETREGS                12
#define COMPAT_PTRACE_SETREGS                13
#define COMPAT_PTRACE_GET_THREAD_AREA        22
#define COMPAT_PTRACE_SET_SYSCALL        23
#define COMPAT_PTRACE_GETVFPREGS        27
#define COMPAT_PTRACE_SETVFPREGS        28
#define COMPAT_PTRACE_GETHBPREGS        29
#define COMPAT_PTRACE_SETHBPREGS        30

/* SPSR_ELx bits for exceptions taken from AArch32 */
#define PSR_AA32_MODE_MASK        0x0000001f
#define PSR_AA32_MODE_USR        0x00000010
#define PSR_AA32_MODE_FIQ        0x00000011
#define PSR_AA32_MODE_IRQ        0x00000012
#define PSR_AA32_MODE_SVC        0x00000013
#define PSR_AA32_MODE_ABT        0x00000017
#define PSR_AA32_MODE_HYP        0x0000001a
#define PSR_AA32_MODE_UND        0x0000001b
#define PSR_AA32_MODE_SYS        0x0000001f
#define PSR_AA32_T_BIT                0x00000020
#define PSR_AA32_F_BIT                0x00000040
#define PSR_AA32_I_BIT                0x00000080
#define PSR_AA32_A_BIT                0x00000100
#define PSR_AA32_E_BIT                0x00000200
#define PSR_AA32_PAN_BIT        0x00400000
#define PSR_AA32_SSBS_BIT        0x00800000
#define PSR_AA32_DIT_BIT        0x01000000
#define PSR_AA32_Q_BIT                0x08000000
#define PSR_AA32_V_BIT                0x10000000
#define PSR_AA32_C_BIT                0x20000000
#define PSR_AA32_Z_BIT                0x40000000
#define PSR_AA32_N_BIT                0x80000000
#define PSR_AA32_IT_MASK        0x0600fc00        /* If-Then execution state mask */
#define PSR_AA32_GE_MASK        0x000f0000

#ifdef CONFIG_CPU_BIG_ENDIAN
#define PSR_AA32_ENDSTATE        PSR_AA32_E_BIT
#else
#define PSR_AA32_ENDSTATE        0
#endif

/* AArch32 CPSR bits, as seen in AArch32 */
#define COMPAT_PSR_DIT_BIT        0x00200000

/*
 * These are 'magic' values for PTRACE_PEEKUSR that return info about where a
 * process is located in memory.
 */
#define COMPAT_PT_TEXT_ADDR                0x10000
#define COMPAT_PT_DATA_ADDR                0x10004
#define COMPAT_PT_TEXT_END_ADDR                0x10008

/*
 * If pt_regs.syscallno == NO_SYSCALL, then the thread is not executing
 * a syscall -- i.e., its most recent entry into the kernel from
 * userspace was not via SVC, or otherwise a tracer cancelled the syscall.
 *
 * This must have the value -1, for ABI compatibility with ptrace etc.
 */
#define NO_SYSCALL (-1)

#ifndef __ASSEMBLY__
#include <linux/bug.h>
#include <linux/types.h>

#include <asm/stacktrace/frame.h>

/* sizeof(struct user) for AArch32 */
#define COMPAT_USER_SZ        296

/* Architecturally defined mapping between AArch32 and AArch64 registers */
#define compat_usr(x)        regs[(x)]
#define compat_fp        regs[11]
#define compat_sp        regs[13]
#define compat_lr        regs[14]
#define compat_sp_hyp        regs[15]
#define compat_lr_irq        regs[16]
#define compat_sp_irq        regs[17]
#define compat_lr_svc        regs[18]
#define compat_sp_svc        regs[19]
#define compat_lr_abt        regs[20]
#define compat_sp_abt        regs[21]
#define compat_lr_und        regs[22]
#define compat_sp_und        regs[23]
#define compat_r8_fiq        regs[24]
#define compat_r9_fiq        regs[25]
#define compat_r10_fiq        regs[26]
#define compat_r11_fiq        regs[27]
#define compat_r12_fiq        regs[28]
#define compat_sp_fiq        regs[29]
#define compat_lr_fiq        regs[30]

/*
 * Convert an AArch32 CPSR view into the SPSR layout: every bit is kept as-is
 * except DIT, which moves from COMPAT_PSR_DIT_BIT to PSR_AA32_DIT_BIT.
 */
static inline unsigned long compat_psr_to_pstate(const unsigned long psr)
{
        const unsigned long dit =
                (psr & COMPAT_PSR_DIT_BIT) ? PSR_AA32_DIT_BIT : 0;

        return (psr & ~COMPAT_PSR_DIT_BIT) | dit;
}

/*
 * Convert the SPSR layout into an AArch32 CPSR view: every bit is kept as-is
 * except DIT, which moves from PSR_AA32_DIT_BIT to COMPAT_PSR_DIT_BIT.
 */
static inline unsigned long pstate_to_compat_psr(const unsigned long pstate)
{
        const unsigned long dit =
                (pstate & PSR_AA32_DIT_BIT) ? COMPAT_PSR_DIT_BIT : 0;

        return (pstate & ~PSR_AA32_DIT_BIT) | dit;
}

/*
 * This struct defines the way the registers are stored on the stack during an
 * exception. struct user_pt_regs must form a prefix of struct pt_regs.
 */
struct pt_regs {
        /* Two views of the same leading storage: uapi struct or named fields. */
        union {
                struct user_pt_regs user_regs;
                struct {
                        /* x0..x30; indexed by the pt_regs_*_reg() helpers. */
                        u64 regs[31];
                        u64 sp;
                        u64 pc;
                        u64 pstate;
                };
        };
        /* NOTE(review): presumably the x0 value at syscall entry -- confirm. */
        u64 orig_x0;
        /* NO_SYSCALL when the thread is not executing a syscall. */
        s32 syscallno;
        /* Compared against GIC_PRIO_IRQON by irqs_priority_unmasked(). */
        u32 pmr;

        u64 sdei_ttbr1;
        struct frame_record_meta stackframe;

        /* Only valid for some EL1 exceptions. */
        u64 lockdep_hardirqs;
        u64 exit_rcu;
};

/* For correct stack alignment, pt_regs has to be a multiple of 16 bytes. */
static_assert(IS_ALIGNED(sizeof(struct pt_regs), 16));

static inline bool in_syscall(struct pt_regs const *regs)
{
        return regs->syscallno != NO_SYSCALL;
}

static inline void forget_syscall(struct pt_regs *regs)
{
        regs->syscallno = NO_SYSCALL;
}

/* Offset of pstate, the last field that regs_get_register() can return. */
#define MAX_REG_OFFSET offsetof(struct pt_regs, pstate)

#define arch_has_single_step()        (1)

/* Non-zero when the AArch32 T bit is set in pstate; 0 without CONFIG_COMPAT. */
#ifdef CONFIG_COMPAT
#define compat_thumb_mode(regs) \
        (((regs)->pstate & PSR_AA32_T_BIT))
#else
#define compat_thumb_mode(regs) (0)
#endif

/* True when the mode field of pstate equals PSR_MODE_EL0t. */
#define user_mode(regs)        \
        (((regs)->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t)

/* As user_mode(), and additionally requires PSR_MODE32_BIT to be set. */
#define compat_user_mode(regs)        \
        (((regs)->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK)) == \
         (PSR_MODE32_BIT | PSR_MODE_EL0t))

/* The mode field of pstate. */
#define processor_mode(regs) \
        ((regs)->pstate & PSR_MODE_MASK)

/*
 * With IRQ priority masking in use, true only when the saved pmr equals
 * GIC_PRIO_IRQON; always true otherwise.
 */
#define irqs_priority_unmasked(regs)                                        \
        (system_uses_irq_prio_masking() ?                                \
                (regs)->pmr == GIC_PRIO_IRQON :                                \
                true)

/* True when PSR_I_BIT is clear and irqs_priority_unmasked() holds. */
#define interrupts_enabled(regs)                        \
        (!((regs)->pstate & PSR_I_BIT) && irqs_priority_unmasked(regs))

/* True when PSR_F_BIT is clear in pstate. */
#define fast_interrupts_enabled(regs) \
        (!((regs)->pstate & PSR_F_BIT))

/*
 * Return the stack pointer saved in @regs: the compat_sp slot when @regs
 * describes a compat user-mode context, the sp field otherwise.
 */
static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
        return compat_user_mode(regs) ? regs->compat_sp : regs->sp;
}

extern int regs_query_register_offset(const char *name);
extern unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
                                               unsigned int n);

/**
 * regs_get_register() - get register value from its offset
 * @regs:        pt_regs from which register value is gotten
 * @offset:        offset of the register.
 *
 * regs_get_register() returns the value of the register stored at byte
 * @offset within struct pt_regs.  An @offset that is not a multiple of 8
 * triggers a warning and is rounded down to the containing 8-byte slot.
 * Offsets that do not name one of regs[0..30], sp, pc or pstate yield 0.
 */
static inline u64 regs_get_register(struct pt_regs *regs, unsigned int offset)
{
        unsigned int slot;

        WARN_ON(offset & 7);

        /* Work in units of 8-byte slots from here on. */
        slot = offset >> 3;

        if (slot <= 30)
                return regs->regs[slot];
        if (slot == offsetof(struct pt_regs, sp) >> 3)
                return regs->sp;
        if (slot == offsetof(struct pt_regs, pc) >> 3)
                return regs->pc;
        if (slot == offsetof(struct pt_regs, pstate) >> 3)
                return regs->pstate;

        return 0;
}

/*
 * Fetch the value of architectural register number r from @regs.
 * Index 31 is treated as XZR (reads as zero), not as SP.
 */
static inline unsigned long pt_regs_read_reg(const struct pt_regs *regs, int r)
{
        if (r == 31)
                return 0;

        return regs->regs[r];
}

/*
 * Store val into architectural register number r of @regs.
 * Index 31 is treated as XZR, not as SP, so such a write is discarded.
 */
static inline void pt_regs_write_reg(struct pt_regs *regs, int r,
                                     unsigned long val)
{
        if (r == 31)
                return;

        regs->regs[r] = val;
}

/* Return the saved sp field.  Valid only for Kernel mode traps. */
static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
{
        const unsigned long sp = regs->sp;

        return sp;
}

/* Return the value held in regs[0], sign-extended from bit 31 for compat. */
static inline unsigned long regs_return_value(struct pt_regs *regs)
{
        /*
         * Audit currently calls regs_return_value() rather than
         * syscall_get_return_value(), so the sign-extension that the latter
         * performs is mirrored here until audit is converted.
         */
        if (compat_user_mode(regs))
                return sign_extend64(regs->regs[0], 31);

        return regs->regs[0];
}

static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
{
        regs->regs[0] = rc;
}

/**
 * regs_get_kernel_argument() - get Nth function argument in kernel
 * @regs:        pt_regs of that context
 * @n:                function argument number (start from 0)
 *
 * regs_get_kernel_argument() returns the @n th argument of the function
 * call, or 0 when @n is not below NR_REG_ARGUMENTS.
 *
 * Note that this picks the most likely register mapping. In very rare
 * cases it may not return correct data, for example if one of the
 * function parameters is 16 bytes or bigger. In such cases the parameter
 * cannot be accessed correctly and the register assignment of the
 * subsequent parameters is shifted.
 */
static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
                                                     unsigned int n)
{
#define NR_REG_ARGUMENTS 8
        return (n < NR_REG_ARGUMENTS) ? pt_regs_read_reg(regs, n) : 0;
}

/* We must avoid circular header include via sched.h */
struct task_struct;
int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task);

/* Return the pc field saved in @regs. */
static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
        const unsigned long pc = regs->pc;

        return pc;
}
static inline void instruction_pointer_set(struct pt_regs *regs,
                unsigned long val)
{
        regs->pc = val;
}

/* Return regs[29] as saved in @regs. */
static inline unsigned long frame_pointer(struct pt_regs *regs)
{
        const unsigned long fp = regs->regs[29];

        return fp;
}

#define procedure_link_pointer(regs)        ((regs)->regs[30])

static inline void procedure_link_pointer_set(struct pt_regs *regs,
                                           unsigned long val)
{
        procedure_link_pointer(regs) = val;
}

extern unsigned long profile_pc(struct pt_regs *regs);

#endif /* __ASSEMBLY__ */
#endif
























































































































  265 








  265 









  265 













  265 




  265 





  265 





































































































































































































































































































































































































































































































  163 


  163 













































































    6 

    6 



    6 






































































































































































































































































































































  265 





  265 
  265 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
// SPDX-License-Identifier: GPL-2.0
/*
 * kobject.c - library routines for handling generic kernel objects
 *
 * Copyright (c) 2002-2003 Patrick Mochel <mochel@osdl.org>
 * Copyright (c) 2006-2007 Greg Kroah-Hartman <greg@kroah.com>
 * Copyright (c) 2006-2007 Novell Inc.
 *
 * Please see the file Documentation/core-api/kobject.rst for critical information
 * about using the kobject interface.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/export.h>
#include <linux/stat.h>
#include <linux/slab.h>
#include <linux/random.h>

/**
 * kobject_namespace() - Return @kobj's namespace tag.
 * @kobj: kobject in question
 *
 * When kobj_ns_ops() reports namespace operations for @kobj with a type
 * other than %KOBJ_NS_TYPE_NONE, the tag is obtained from the ktype's
 * namespace() callback.  Returns %NULL otherwise.
 */
const void *kobject_namespace(const struct kobject *kobj)
{
        const struct kobj_ns_type_operations *ops = kobj_ns_ops(kobj);

        if (ops && ops->type != KOBJ_NS_TYPE_NONE)
                return kobj->ktype->namespace(kobj);

        return NULL;
}

/**
 * kobject_get_ownership() - Get sysfs ownership data for @kobj.
 * @kobj: kobject in question
 * @uid: kernel user ID for sysfs objects
 * @gid: kernel group ID for sysfs objects
 *
 * Fills @uid and @gid with the global root IDs, then lets the ktype's
 * get_ownership() callback, when one is set, override them.  The result is
 * the initial uid/gid pair to use when creating the sysfs representation of
 * the kobject; normally used to adjust ownership of objects in a container.
 */
void kobject_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct kobj_type *ktype;

        *uid = GLOBAL_ROOT_UID;
        *gid = GLOBAL_ROOT_GID;

        ktype = kobj->ktype;
        if (!ktype->get_ownership)
                return;

        ktype->get_ownership(kobj, uid, gid);
}

/* True when @type lies strictly between KOBJ_NS_TYPE_NONE and KOBJ_NS_TYPES. */
static bool kobj_ns_type_is_valid(enum kobj_ns_type type)
{
        return (type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES);
}

/*
 * Create the sysfs directory for @kobj together with the default attribute
 * groups of its ktype, if it has one.  Returns 0 on success or the error
 * reported by sysfs; a failure to create the groups removes the directory
 * again.
 */
static int create_dir(struct kobject *kobj)
{
        const struct kobj_type *ktype = get_ktype(kobj);
        const struct kobj_ns_type_operations *child_ops;
        int ret;

        ret = sysfs_create_dir_ns(kobj, kobject_namespace(kobj));
        if (ret)
                return ret;

        ret = ktype ? sysfs_create_groups(kobj, ktype->default_groups) : 0;
        if (ret) {
                sysfs_remove_dir(kobj);
                return ret;
        }

        /*
         * An ancestor going away may delete @kobj->sd.  Take an extra
         * reference so that it stays around until @kobj itself is gone.
         */
        sysfs_get(kobj->sd);

        /*
         * When @kobj has ns_ops its children must be filtered by their
         * namespace tags, so namespace support is enabled on @kobj->sd.
         */
        child_ops = kobj_child_ns_ops(kobj);
        if (!child_ops)
                return 0;

        BUG_ON(!kobj_ns_type_is_valid(child_ops->type));
        BUG_ON(!kobj_ns_type_registered(child_ops->type));

        sysfs_enable_ns(kobj->sd);

        return 0;
}

/*
 * Compute the buffer size needed for the path of @kobj: one byte per level
 * for the leading '/', plus the length of each name, plus one extra byte.
 * Returns 0 if any kobject on the way up has no name.
 */
static int get_kobj_path_length(const struct kobject *kobj)
{
        const struct kobject *node = kobj;
        int total = 1;

        /* Climb through the ancestors until there is no parent left. */
        for (;;) {
                if (!kobject_name(node))
                        return 0;
                total += strlen(kobject_name(node)) + 1;

                node = node->parent;
                if (!node)
                        break;
        }

        return total;
}

/*
 * Write the path of @kobj into @path, working backwards from the end of the
 * @length byte buffer: each level's name is copied in and then prefixed with
 * '/'.  Returns 0 on success, or -EINVAL when the names do not fit in the
 * space that is left.
 */
static int fill_kobj_path(const struct kobject *kobj, char *path, int length)
{
        const struct kobject *node = kobj;
        int pos = length - 1;

        while (node) {
                int name_len = strlen(kobject_name(node));

                /* step back far enough to hold this name and its '/' */
                pos -= name_len;
                if (pos <= 0)
                        return -EINVAL;
                memcpy(&path[pos], kobject_name(node), name_len);
                pos--;
                path[pos] = '/';

                node = node->parent;
        }

        pr_debug("'%s' (%p): %s: path = '%s'\n", kobject_name(kobj),
                 kobj, __func__, path);

        return 0;
}

/**
 * kobject_get_path() - Allocate memory and fill in the path for @kobj.
 * @kobj:        kobject in question, with which to build the path
 * @gfp_mask:        the allocation type used to allocate the path
 *
 * The required length is computed, a zeroed buffer of that size is
 * allocated and the path is written into it.  If writing the path fails,
 * the buffer is freed and the whole sequence starts over.
 *
 * Return: The newly allocated memory, caller must free with kfree().
 * %NULL if the length could not be determined or the allocation failed.
 */
char *kobject_get_path(const struct kobject *kobj, gfp_t gfp_mask)
{
        for (;;) {
                char *buf;
                int need;

                need = get_kobj_path_length(kobj);
                if (need == 0)
                        return NULL;

                buf = kzalloc(need, gfp_mask);
                if (!buf)
                        return NULL;

                if (!fill_kobj_path(kobj, buf, need))
                        return buf;

                kfree(buf);
        }
}
EXPORT_SYMBOL_GPL(kobject_get_path);

/* add the kobject to its kset's list */
static void kobj_kset_join(struct kobject *kobj)
{
        if (!kobj->kset)
                return;

        kset_get(kobj->kset);
        spin_lock(&kobj->kset->list_lock);
        list_add_tail(&kobj->entry, &kobj->kset->list);
        spin_unlock(&kobj->kset->list_lock);
}

/*
 * Unlink @kobj from its kset's member list and drop the kset reference
 * again.  Does nothing when @kobj has no kset.
 */
static void kobj_kset_leave(struct kobject *kobj)
{
        if (kobj->kset) {
                /* the member list is only modified under list_lock */
                spin_lock(&kobj->kset->list_lock);
                list_del_init(&kobj->entry);
                spin_unlock(&kobj->kset->list_lock);

                kset_put(kobj->kset);
        }
}

/*
 * Put @kobj into its freshly-initialized state: reference count set up via
 * kref_init(), empty list entry, sysfs/uevent state flags cleared and the
 * "initialized" flag set.  A NULL @kobj is ignored.
 */
static void kobject_init_internal(struct kobject *kobj)
{
        if (kobj == NULL)
                return;

        kref_init(&kobj->kref);
        INIT_LIST_HEAD(&kobj->entry);

        kobj->state_initialized = 1;
        kobj->state_in_sysfs = 0;
        kobj->state_add_uevent_sent = 0;
        kobj->state_remove_uevent_sent = 0;
}


/*
 * kobject_add_internal - hook a named kobject into the hierarchy.
 * @kobj: object to add; must already have a non-empty name
 *
 * Takes a reference on the parent (falling back to the kset's embedded
 * kobject when a kset is set but no parent is), joins the kset, and creates
 * the object's directory via create_dir().
 *
 * On failure the kset membership and the parent reference taken here are
 * undone and kobj->parent is reset to NULL.
 *
 * Return: 0 on success, -ENOENT for a NULL @kobj, -EINVAL for a missing or
 * empty name, otherwise the error returned by create_dir().
 */
static int kobject_add_internal(struct kobject *kobj)
{
        int error;
        struct kobject *parent;

        if (!kobj)
                return -ENOENT;

        if (!kobj->name || !kobj->name[0]) {
                WARN(1,
                     "kobject: (%p): attempted to be registered with empty name!\n",
                     kobj);
                return -EINVAL;
        }

        parent = kobject_get(kobj->parent);

        /* join kset if set, use it as parent if we do not already have one */
        if (kobj->kset) {
                if (!parent)
                        parent = kobject_get(&kobj->kset->kobj);
                kobj_kset_join(kobj);
                kobj->parent = parent;
        }

        pr_debug("'%s' (%p): %s: parent: '%s', set: '%s'\n",
                 kobject_name(kobj), kobj, __func__,
                 parent ? kobject_name(parent) : "<NULL>",
                 kobj->kset ? kobject_name(&kobj->kset->kobj) : "<NULL>");

        error = create_dir(kobj);
        if (error) {
                kobj_kset_leave(kobj);

                /*
                 * be noisy on error issues; this must happen while the
                 * parent reference taken above is still held, because the
                 * message reads the parent's name
                 */
                if (error == -EEXIST)
                        pr_err("%s failed for %s with -EEXIST, don't try to register things with the same name in the same directory.\n",
                               __func__, kobject_name(kobj));
                else
                        pr_err("%s failed for %s (error: %d parent: %s)\n",
                               __func__, kobject_name(kobj), error,
                               parent ? kobject_name(parent) : "'none'");

                /* only now give the parent reference back */
                kobject_put(parent);
                kobj->parent = NULL;
                return error;
        }

        kobj->state_in_sysfs = 1;
        return 0;
}

/**
 * kobject_set_name_vargs() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 * @vargs: vargs to format the string.
 *
 * If @kobj already has a name and @fmt is NULL, the existing name is kept.
 * Any '/' in the formatted name is replaced by '!'.  The previous name is
 * released with kfree_const() once the new one is ready.
 *
 * Return: 0 on success, -ENOMEM if the name could not be allocated.
 */
int kobject_set_name_vargs(struct kobject *kobj, const char *fmt,
                                  va_list vargs)
{
        const char *new_name;

        /* nothing to format and a name is already in place */
        if (!fmt && kobj->name)
                return 0;

        new_name = kvasprintf_const(GFP_KERNEL, fmt, vargs);
        if (new_name == NULL)
                return -ENOMEM;

        /*
         * Some callers pass names containing '/'.  Rewriting those needs a
         * copy we are allowed to modify, since kvasprintf_const may have
         * returned something from .rodata.
         */
        if (strchr(new_name, '/') != NULL) {
                char *writable = kstrdup(new_name, GFP_KERNEL);

                kfree_const(new_name);
                if (writable == NULL)
                        return -ENOMEM;
                new_name = strreplace(writable, '/', '!');
        }

        kfree_const(kobj->name);
        kobj->name = new_name;

        return 0;
}

/**
 * kobject_set_name() - Set the name of a kobject.
 * @kobj: struct kobject to set the name of
 * @fmt: format string used to build the name
 *
 * Variadic front end for kobject_set_name_vargs().  Once a kobject has
 * been added to the system its name must be changed with kobject_rename()
 * instead of this function.
 *
 * Return: the result of kobject_set_name_vargs().
 */
int kobject_set_name(struct kobject *kobj, const char *fmt, ...)
{
        va_list ap;
        int err;

        va_start(ap, fmt);
        err = kobject_set_name_vargs(kobj, fmt, ap);
        va_end(ap);

        return err;
}
EXPORT_SYMBOL(kobject_set_name);

/**
 * kobject_init() - Initialize a kobject structure.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 *
 * This function will properly initialize a kobject such that it can then
 * be passed to the kobject_add() call.
 *
 * A NULL @kobj or NULL @ktype is reported with an error message and a stack
 * dump, and nothing is initialized.  Initializing an already initialized
 * kobject is reported the same way, but the initialization still proceeds.
 *
 * After this function is called, the kobject MUST be cleaned up by a call
 * to kobject_put(), not by a call to kfree directly to ensure that all of
 * the memory is cleaned up properly.
 */
void kobject_init(struct kobject *kobj, const struct kobj_type *ktype)
{
        const char *err_str;

        if (!kobj) {
                err_str = "invalid kobject pointer!";
                goto error;
        }
        if (!ktype) {
                /* no trailing newline here: the format below supplies it */
                err_str = "must have a ktype to be initialized properly!";
                goto error;
        }
        if (kobj->state_initialized) {
                /* do not error out as sometimes we can recover */
                pr_err("kobject (%p): tried to init an initialized object, something is seriously wrong.\n",
                       kobj);
                dump_stack_lvl(KERN_ERR);
        }

        kobject_init_internal(kobj);
        kobj->ktype = ktype;
        return;

error:
        pr_err("kobject (%p): %s\n", kobj, err_str);
        dump_stack_lvl(KERN_ERR);
}
EXPORT_SYMBOL(kobject_init);

/*
 * Common tail of kobject_add() and kobject_init_and_add(): set the name
 * from @fmt/@vargs, record @parent, then add the object to the hierarchy.
 *
 * Returns the error from kobject_set_name_vargs() if naming fails,
 * otherwise the result of kobject_add_internal().
 */
static __printf(3, 0) int kobject_add_varg(struct kobject *kobj,
                                           struct kobject *parent,
                                           const char *fmt, va_list vargs)
{
        int err = kobject_set_name_vargs(kobj, fmt, vargs);

        if (err == 0) {
                kobj->parent = parent;
                return kobject_add_internal(kobj);
        }

        pr_err("can not set name properly!\n");
        return err;
}

/**
 * kobject_add() - The main kobject add function.
 * @kobj: the kobject to add
 * @parent: pointer to the parent of the kobject.
 * @fmt: format to name the kobject with.
 *
 * Names the kobject from @fmt and links it into the kobject hierarchy.
 *
 * A non-NULL @parent becomes the parent of @kobj.  With a NULL @parent the
 * kobject embedded in the kset assigned to @kobj is used instead; if no
 * kset is assigned either, the kobject ends up in the root of the sysfs
 * tree.
 *
 * No "add" uevent is generated here.  The caller is expected to create the
 * sysfs files the object needs and then call kobject_uevent() with the
 * UEVENT_ADD parameter so that userspace learns about the new kobject.
 *
 * Return: 0 on success or a negative error code.  -EINVAL is returned for
 *         a NULL @kobj and for a kobject that was never initialized.
 *
 *         Whether this function succeeds or fails, the object must
 *         eventually be released with kobject_put().  It must never be
 *         handed to kfree() directly, since that can leak memory.
 *
 *         In short, once this function is called, kobject_put() MUST be
 *         called when the use of the object is finished in order to
 *         properly free everything.
 */
int kobject_add(struct kobject *kobj, struct kobject *parent,
                const char *fmt, ...)
{
        va_list ap;
        int err;

        if (kobj == NULL)
                return -EINVAL;

        if (kobj->state_initialized) {
                va_start(ap, fmt);
                err = kobject_add_varg(kobj, parent, fmt, ap);
                va_end(ap);
                return err;
        }

        pr_err("kobject '%s' (%p): tried to add an uninitialized object, something is seriously wrong.\n",
               kobject_name(kobj), kobj);
        dump_stack_lvl(KERN_ERR);
        return -EINVAL;
}
EXPORT_SYMBOL(kobject_add);

/**
 * kobject_init_and_add() - Initialize a kobject structure and add it to
 *                          the kobject hierarchy.
 * @kobj: pointer to the kobject to initialize
 * @ktype: pointer to the ktype for this kobject.
 * @parent: pointer to the parent of this kobject.
 * @fmt: the name of the kobject.
 *
 * Runs kobject_init() followed by the naming and adding steps that
 * kobject_add() performs.
 *
 * The lifetime rules are those of kobject_add(): if an error is returned,
 * kobject_put() must still be called to properly clean up the memory
 * associated with the object.
 */
int kobject_init_and_add(struct kobject *kobj, const struct kobj_type *ktype,
                         struct kobject *parent, const char *fmt, ...)
{
        va_list ap;
        int err;

        kobject_init(kobj, ktype);

        va_start(ap, fmt);
        err = kobject_add_varg(kobj, parent, fmt, ap);
        va_end(ap);

        return err;
}
EXPORT_SYMBOL_GPL(kobject_init_and_add);

/**
 * kobject_rename() - Change the name of an object.
 * @kobj: object in question.
 * @new_name: object's new name
 *
 * The caller must serialize concurrent kobject_rename() calls on the same
 * kobject, and must make sure that @new_name is valid and does not clash
 * with other kobjects.
 *
 * On success a KOBJ_MOVE uevent carrying the previous path in DEVPATH_OLD
 * is sent.
 *
 * Return: 0 on success, -EINVAL for a NULL or parentless @kobj, -ENOMEM if
 * the old path or a name copy cannot be obtained, otherwise the error from
 * sysfs_rename_dir_ns().
 */
int kobject_rename(struct kobject *kobj, const char *new_name)
{
        char *uevent_env[2];
        char *old_path_env = NULL;
        const char *old_path = NULL;
        const char *to_free = NULL;
        const char *installed;
        int ret;

        kobj = kobject_get(kobj);
        if (kobj == NULL)
                return -EINVAL;
        if (kobj->parent == NULL) {
                kobject_put(kobj);
                return -EINVAL;
        }

        /* every failure before the sysfs rename is reported as -ENOMEM */
        ret = -ENOMEM;

        old_path = kobject_get_path(kobj, GFP_KERNEL);
        if (old_path == NULL)
                goto out;

        old_path_env = kmalloc(strlen(old_path) + 15, GFP_KERNEL);
        if (old_path_env == NULL)
                goto out;
        sprintf(old_path_env, "DEVPATH_OLD=%s", old_path);
        uevent_env[0] = old_path_env;
        uevent_env[1] = NULL;

        /* until the rename succeeds, the copy is what must be freed */
        installed = kstrdup_const(new_name, GFP_KERNEL);
        to_free = installed;
        if (installed == NULL)
                goto out;

        ret = sysfs_rename_dir_ns(kobj, new_name, kobject_namespace(kobj));
        if (ret)
                goto out;

        /* Install the new kobject name; the old one is freed below */
        to_free = kobj->name;
        kobj->name = installed;

        /*
         * This function is mostly/only used for network interfaces.
         * Some hotplug packages track interfaces by their name and
         * therefore want to know when the name is changed by the user.
         */
        kobject_uevent_env(kobj, KOBJ_MOVE, uevent_env);

out:
        kfree_const(to_free);
        kfree(old_path_env);
        kfree(old_path);
        kobject_put(kobj);

        return ret;
}
EXPORT_SYMBOL_GPL(kobject_rename);

/**
 * kobject_move() - Move object to another parent.
 * @kobj: object in question.
 * @new_parent: object's new parent (can be NULL)
 *
 * With a NULL @new_parent the kobject embedded in @kobj's kset, if any, is
 * used as the new parent.  On success the reference on the old parent is
 * dropped and a KOBJ_MOVE uevent carrying the previous path in DEVPATH_OLD
 * is sent.
 *
 * Return: 0 on success, -EINVAL for a NULL @kobj, -ENOMEM if the old path
 * cannot be obtained, otherwise the error from sysfs_move_dir_ns().
 */
int kobject_move(struct kobject *kobj, struct kobject *new_parent)
{
        char *uevent_env[2];
        char *old_path_env = NULL;
        const char *old_path = NULL;
        struct kobject *prev_parent;
        int ret;

        kobj = kobject_get(kobj);
        if (kobj == NULL)
                return -EINVAL;

        new_parent = kobject_get(new_parent);
        if (new_parent == NULL && kobj->kset)
                new_parent = kobject_get(&kobj->kset->kobj);

        /* every failure before the sysfs move is reported as -ENOMEM */
        ret = -ENOMEM;

        /* old object path */
        old_path = kobject_get_path(kobj, GFP_KERNEL);
        if (old_path == NULL)
                goto out;

        old_path_env = kmalloc(strlen(old_path) + 15, GFP_KERNEL);
        if (old_path_env == NULL)
                goto out;
        sprintf(old_path_env, "DEVPATH_OLD=%s", old_path);
        uevent_env[0] = old_path_env;
        uevent_env[1] = NULL;

        ret = sysfs_move_dir_ns(kobj, new_parent, kobject_namespace(kobj));
        if (ret)
                goto out;

        /*
         * The reference held in new_parent now belongs to kobj->parent;
         * clearing the local keeps the exit path from dropping it.
         */
        prev_parent = kobj->parent;
        kobj->parent = new_parent;
        new_parent = NULL;
        kobject_put(prev_parent);
        kobject_uevent_env(kobj, KOBJ_MOVE, uevent_env);
out:
        kobject_put(new_parent);
        kobject_put(kobj);
        kfree(old_path_env);
        kfree(old_path);
        return ret;
}
EXPORT_SYMBOL_GPL(kobject_move);

/*
 * Take @kobj out of sysfs and out of its kset: remove the ktype's default
 * attribute groups, send a "remove" uevent if an "add" was sent without a
 * matching "remove", remove the sysfs directory, leave the kset and clear
 * kobj->parent.  The parent reference itself is left for the caller to drop.
 */
static void __kobject_del(struct kobject *kobj)
{
        /* saved before sysfs_remove_dir() runs, put after it */
        struct kernfs_node *sd = kobj->sd;
        const struct kobj_type *ktype = get_ktype(kobj);

        if (ktype != NULL)
                sysfs_remove_groups(kobj, ktype->default_groups);

        /* send "remove" if the caller did not do it but sent "add" */
        if (!kobj->state_remove_uevent_sent && kobj->state_add_uevent_sent) {
                pr_debug("'%s' (%p): auto cleanup 'remove' event\n",
                         kobject_name(kobj), kobj);
                kobject_uevent(kobj, KOBJ_REMOVE);
        }

        sysfs_remove_dir(kobj);
        sysfs_put(sd);

        kobj->state_in_sysfs = 0;
        kobj_kset_leave(kobj);
        kobj->parent = NULL;
}

/**
 * kobject_del() - Unlink kobject from hierarchy.
 * @kobj: object.
 *
 * Counterpart of a successful kobject_add().  The parent pointer is read
 * before the unlink clears it, so that the reference on the parent can be
 * dropped afterwards.  A NULL @kobj is ignored.
 */
void kobject_del(struct kobject *kobj)
{
        if (kobj) {
                struct kobject *parent = kobj->parent;

                __kobject_del(kobj);
                kobject_put(parent);
        }
}
EXPORT_SYMBOL(kobject_del);

/**
 * kobject_get() - Increment refcount for object.
 * @kobj: object.
 *
 * Warns if @kobj has not been initialized, but takes the reference anyway.
 *
 * Return: @kobj, which may be NULL.
 */
struct kobject *kobject_get(struct kobject *kobj)
{
        if (!kobj)
                return NULL;

        WARN(!kobj->state_initialized, KERN_WARNING
                "kobject: '%s' (%p): is not initialized, yet kobject_get() is being called.\n",
             kobject_name(kobj), kobj);
        kref_get(&kobj->kref);

        return kobj;
}
EXPORT_SYMBOL(kobject_get);

/*
 * Take a reference on @kobj via kref_get_unless_zero().
 *
 * Returns @kobj when the reference was obtained, NULL when @kobj is NULL or
 * kref_get_unless_zero() refused.
 */
struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj)
{
        if (kobj && kref_get_unless_zero(&kobj->kref))
                return kobj;

        return NULL;
}
EXPORT_SYMBOL(kobject_get_unless_zero);

/*
 * kobject_cleanup - free kobject resources.
 * @kobj: object to cleanup
 *
 * Removes the object from sysfs if that has not happened yet, calls the
 * ktype's release() if there is one, frees the name and finally drops the
 * parent reference (only if the object was still in sysfs on entry).
 *
 * Name and parent are read up front because the release callbacks defined
 * in this file free the object itself.
 */
static void kobject_cleanup(struct kobject *kobj)
{
        const char *name = kobj->name;
        const struct kobj_type *ktype = get_ktype(kobj);
        struct kobject *parent = kobj->parent;

        pr_debug("'%s' (%p): %s, parent %p\n",
                 kobject_name(kobj), kobj, __func__, kobj->parent);

        if (ktype && !ktype->release)
                pr_debug("'%s' (%p): does not have a release() function, it is broken and must be fixed. See Documentation/core-api/kobject.rst.\n",
                         kobject_name(kobj), kobj);

        if (!kobj->state_in_sysfs) {
                /* avoid dropping the parent reference unnecessarily */
                parent = NULL;
        } else {
                /* remove from sysfs if the caller did not do it */
                pr_debug("'%s' (%p): auto cleanup kobject_del\n",
                         kobject_name(kobj), kobj);
                __kobject_del(kobj);
        }

        if (ktype && ktype->release) {
                pr_debug("'%s' (%p): calling ktype release\n",
                         kobject_name(kobj), kobj);
                ktype->release(kobj);
        }

        /* free name if we allocated it */
        if (name != NULL) {
                pr_debug("'%s': free name\n", name);
                kfree_const(name);
        }

        kobject_put(parent);
}

#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
/*
 * Work handler for the delayed release: map the work item back to the
 * kobject that embeds it and run the normal cleanup.
 */
static void kobject_delayed_cleanup(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct kobject *kobj = container_of(dwork, struct kobject, release);

        kobject_cleanup(kobj);
}
#endif

/*
 * kref release callback, invoked from kobject_put() when the last
 * reference goes away.
 *
 * With CONFIG_DEBUG_KOBJECT_RELEASE the cleanup is deferred through a
 * delayed work item by HZ plus a random multiple (0..3) of HZ jiffies;
 * otherwise kobject_cleanup() runs immediately.
 */
static void kobject_release(struct kref *kref)
{
        struct kobject *kobj = container_of(kref, struct kobject, kref);
#ifdef CONFIG_DEBUG_KOBJECT_RELEASE
        unsigned long delay = HZ + HZ * get_random_u32_below(4);

        /* delay is unsigned long, hence %lu */
        pr_info("'%s' (%p): %s, parent %p (delayed %lu)\n",
                kobject_name(kobj), kobj, __func__, kobj->parent, delay);
        INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);

        schedule_delayed_work(&kobj->release, delay);
#else
        kobject_cleanup(kobj);
#endif
}

/**
 * kobject_put() - Decrement refcount for object.
 * @kobj: object.
 *
 * Drops one reference; when the count reaches 0, kobject_release() runs and
 * the object is cleaned up.  Warns if @kobj has not been initialized.  A
 * NULL @kobj is ignored.
 */
void kobject_put(struct kobject *kobj)
{
        if (!kobj)
                return;

        WARN(!kobj->state_initialized, KERN_WARNING
                "kobject: '%s' (%p): is not initialized, yet kobject_put() is being called.\n",
             kobject_name(kobj), kobj);
        kref_put(&kobj->kref, kobject_release);
}
EXPORT_SYMBOL(kobject_put);

/*
 * release() callback of dynamic_kobj_ktype: the kobject was allocated on
 * its own by kobject_create(), so releasing it means freeing @kobj itself.
 */
static void dynamic_kobj_release(struct kobject *kobj)
{
        pr_debug("(%p): %s\n", kobj, __func__);
        kfree(kobj);
}

/* ktype given to kobjects allocated by kobject_create() */
static const struct kobj_type dynamic_kobj_ktype = {
        /* attributes go through the default kobj_attribute show/store */
        .sysfs_ops        = &kobj_sysfs_ops,
        /* frees the kobject itself */
        .release        = dynamic_kobj_release,
};

/**
 * kobject_create() - Create a struct kobject dynamically.
 *
 * Allocates a zeroed kobject and initializes it with dynamic_kobj_ktype,
 * whose release function frees the structure.
 *
 * Return: the new kobject, or NULL if the allocation failed.  Because
 * kobject_init() has already been called on it, the returned structure
 * must be cleaned up with kobject_put() and not kfree().
 */
static struct kobject *kobject_create(void)
{
        struct kobject *kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);

        if (kobj)
                kobject_init(kobj, &dynamic_kobj_ktype);

        return kobj;
}

/**
 * kobject_create_and_add() - Create a struct kobject dynamically and
 *                            register it with sysfs.
 * @name: the name for the kobject
 * @parent: the parent kobject of this kobject, if any.
 *
 * Allocates a kobject and adds it under @parent with the given @name.
 * When you are finished with the structure, call kobject_put(); it is
 * freed dynamically once it is no longer being used.
 *
 * Return: the new kobject, or NULL if it could not be created or added.
 */
struct kobject *kobject_create_and_add(const char *name, struct kobject *parent)
{
        struct kobject *kobj = kobject_create();
        int err;

        if (kobj == NULL)
                return NULL;

        err = kobject_add(kobj, parent, "%s", name);
        if (err == 0)
                return kobj;

        pr_warn("%s: kobject_add error: %d\n", __func__, err);
        kobject_put(kobj);
        return NULL;
}
EXPORT_SYMBOL_GPL(kobject_create_and_add);

/**
 * kset_init() - Initialize a kset for use.
 * @k: kset
 *
 * Initializes the embedded kobject together with the kset's member list
 * and the spinlock guarding that list.
 */
void kset_init(struct kset *k)
{
        kobject_init_internal(&k->kobj);
        spin_lock_init(&k->list_lock);
        INIT_LIST_HEAD(&k->list);
}

/*
 * Default kobject attribute operations.
 *
 * show: forward to the show() handler of the kobj_attribute that embeds
 * @attr.  Returns the handler's result, or -EIO if there is no handler.
 */
static ssize_t kobj_attr_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        if (!kattr->show)
                return -EIO;

        return kattr->show(kobj, kattr, buf);
}

/*
 * store: forward to the store() handler of the kobj_attribute that embeds
 * @attr.  Returns the handler's result, or -EIO if there is no handler.
 */
static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr,
                               const char *buf, size_t count)
{
        struct kobj_attribute *kattr =
                container_of(attr, struct kobj_attribute, attr);

        if (!kattr->store)
                return -EIO;

        return kattr->store(kobj, kattr, buf, count);
}

/* sysfs_ops that dispatch to the handlers of a struct kobj_attribute */
const struct sysfs_ops kobj_sysfs_ops = {
        .store        = kobj_attr_store,
        .show        = kobj_attr_show,
};
EXPORT_SYMBOL_GPL(kobj_sysfs_ops);

/**
 * kset_register() - Initialize and add a kset.
 * @k: kset.
 *
 * Initializes the kset, adds its embedded kobject to the hierarchy and
 * sends a KOBJ_ADD uevent for it.
 *
 * NOTE: If adding the kobject fails, the name stored in kset.kobj.name is
 * freed and the pointer is reset to NULL; it can not be used any more.
 *
 * Return: 0 on success, -EINVAL for a NULL @k or a kset whose kobject has
 * no ktype, otherwise the error from kobject_add_internal().
 */
int kset_register(struct kset *k)
{
        int err;

        if (k == NULL)
                return -EINVAL;

        if (k->kobj.ktype == NULL) {
                pr_err("must have a ktype to be initialized properly!\n");
                return -EINVAL;
        }

        kset_init(k);

        err = kobject_add_internal(&k->kobj);
        if (err == 0) {
                kobject_uevent(&k->kobj, KOBJ_ADD);
                return 0;
        }

        kfree_const(k->kobj.name);
        /* Set it to NULL to avoid accessing bad pointer in callers. */
        k->kobj.name = NULL;
        return err;
}
EXPORT_SYMBOL(kset_register);

/**
 * kset_unregister() - Remove a kset.
 * @k: kset.
 *
 * Unlinks the kset's embedded kobject from the hierarchy and drops a
 * reference on it.  A NULL @k is ignored.
 */
void kset_unregister(struct kset *k)
{
        if (k) {
                kobject_del(&k->kobj);
                kobject_put(&k->kobj);
        }
}
EXPORT_SYMBOL(kset_unregister);

/**
 * kset_find_obj() - Search for object in kset.
 * @kset: kset we're looking in.
 * @name: object's name.
 *
 * Walks @kset->list under @kset->list_lock and stops at the first kobject
 * whose name equals @name.
 *
 * Return: the matching kobject with a reference taken, or NULL if no
 * member has that name or the reference on the match could not be taken.
 */
struct kobject *kset_find_obj(struct kset *kset, const char *name)
{
        struct kobject *found = NULL;
        struct kobject *k;

        spin_lock(&kset->list_lock);

        list_for_each_entry(k, &kset->list, entry) {
                if (!kobject_name(k) || strcmp(kobject_name(k), name) != 0)
                        continue;

                /* only the first name match is considered */
                found = kobject_get_unless_zero(k);
                break;
        }

        spin_unlock(&kset->list_lock);
        return found;
}
EXPORT_SYMBOL_GPL(kset_find_obj);

/*
 * release() callback of kset_ktype: @kobj is embedded in a struct kset, so
 * the enclosing kset is what gets freed.
 */
static void kset_release(struct kobject *kobj)
{
        struct kset *owner = container_of(kobj, struct kset, kobj);

        pr_debug("'%s' (%p): %s\n",
                 kobject_name(kobj), kobj, __func__);

        kfree(owner);
}

/*
 * get_ownership() callback of kset_ktype: ownership is taken from the
 * parent kobject.  Without a parent, @uid and @gid are left untouched.
 */
static void kset_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
        const struct kobject *parent = kobj->parent;

        if (!parent)
                return;

        kobject_get_ownership(parent, uid, gid);
}

/* ktype of the kobject embedded in ksets allocated by kset_create() */
static const struct kobj_type kset_ktype = {
        /* frees the enclosing struct kset */
        .release        = kset_release,
        /* ownership follows the parent kobject */
        .get_ownership        = kset_get_ownership,
        .sysfs_ops        = &kobj_sysfs_ops,
};

/**
 * kset_create() - Create a struct kset dynamically.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * Allocates a kset and fills in its name, uevent operations and parent.
 * Nothing is registered yet; kset_register() makes the kset show up in
 * sysfs.  Once it has been registered, call kset_unregister() when you are
 * finished with it and the structure is freed dynamically when it is no
 * longer being used.
 *
 * Return: the new kset, or NULL if the allocation or the naming failed.
 */
static struct kset *kset_create(const char *name,
                                const struct kset_uevent_ops *uevent_ops,
                                struct kobject *parent_kobj)
{
        struct kset *kset = kzalloc(sizeof(*kset), GFP_KERNEL);

        if (kset == NULL)
                return NULL;

        if (kobject_set_name(&kset->kobj, "%s", name) != 0) {
                kfree(kset);
                return NULL;
        }

        kset->kobj.parent = parent_kobj;

        /*
         * The embedded kobject gets kset_ktype as its type and is not a
         * member of any kset itself.  That way we can properly free it
         * when it is finished being used.
         */
        kset->kobj.kset = NULL;
        kset->kobj.ktype = &kset_ktype;

        kset->uevent_ops = uevent_ops;

        return kset;
}

/**
 * kset_create_and_add() - Create a struct kset dynamically and add it to sysfs.
 *
 * @name: the name for the kset
 * @uevent_ops: a struct kset_uevent_ops for the kset
 * @parent_kobj: the parent kobject of this kset, if any.
 *
 * Allocates a kset and registers it with sysfs.  When you are finished
 * with the structure, call kset_unregister(); it is freed dynamically once
 * it is no longer being used.
 *
 * Return: the new kset, or NULL if it could not be created or registered.
 */
struct kset *kset_create_and_add(const char *name,
                                 const struct kset_uevent_ops *uevent_ops,
                                 struct kobject *parent_kobj)
{
        struct kset *kset = kset_create(name, uevent_ops, parent_kobj);

        if (kset == NULL)
                return NULL;

        if (kset_register(kset) == 0)
                return kset;

        /* registration failed: free the structure allocated above */
        kfree(kset);
        return NULL;
}
EXPORT_SYMBOL_GPL(kset_create_and_add);


/* Held around every access to kobj_ns_ops_tbl[] in the functions below. */
static DEFINE_SPINLOCK(kobj_ns_type_lock);
/* Registered namespace operations, indexed by enum kobj_ns_type. */
static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES];

/*
 * Register @ops as the namespace operations for ops->type.
 *
 * Returns 0 on success, -EINVAL if the type is not valid, or -EBUSY if
 * operations are already registered for that type.
 */
int kobj_ns_type_register(const struct kobj_ns_type_operations *ops)
{
        enum kobj_ns_type type = ops->type;
        int error;

        spin_lock(&kobj_ns_type_lock);

        if (!kobj_ns_type_is_valid(type)) {
                error = -EINVAL;
        } else if (kobj_ns_ops_tbl[type]) {
                error = -EBUSY;
        } else {
                kobj_ns_ops_tbl[type] = ops;
                error = 0;
        }

        spin_unlock(&kobj_ns_type_lock);
        return error;
}

/*
 * Report whether namespace operations are registered for @type.
 *
 * Returns 1 if @type is valid and has operations registered, 0 otherwise.
 */
int kobj_ns_type_registered(enum kobj_ns_type type)
{
        int registered;

        spin_lock(&kobj_ns_type_lock);
        registered = kobj_ns_type_is_valid(type) &&
                     kobj_ns_ops_tbl[type] != NULL;
        spin_unlock(&kobj_ns_type_lock);

        return registered;
}

/*
 * Ask @parent's ktype which namespace operations apply to its children.
 *
 * Returns the result of the ktype's child_ns_type() callback, or NULL when
 * @parent is NULL, has no ktype, or the ktype has no such callback.
 */
const struct kobj_ns_type_operations *kobj_child_ns_ops(const struct kobject *parent)
{
        if (!parent || !parent->ktype || !parent->ktype->child_ns_type)
                return NULL;

        return parent->ktype->child_ns_type(parent);
}

/*
 * Namespace operations that apply to @kobj, i.e. the ones its parent
 * declares for its children.
 */
const struct kobj_ns_type_operations *kobj_ns_ops(const struct kobject *kobj)
{
        const struct kobject *parent = kobj->parent;

        return kobj_child_ns_ops(parent);
}

/*
 * Query the current_may_mount() callback registered for @type.
 *
 * Returns the callback's answer, or true when @type is not valid or has no
 * operations registered.
 */
bool kobj_ns_current_may_mount(enum kobj_ns_type type)
{
        bool may_mount = true;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type))
                goto unlock;
        if (kobj_ns_ops_tbl[type])
                may_mount = kobj_ns_ops_tbl[type]->current_may_mount();
unlock:
        spin_unlock(&kobj_ns_type_lock);

        return may_mount;
}

/*
 * Call the grab_current_ns() callback registered for @type.
 *
 * Returns the callback's result, or NULL when @type is not valid or has no
 * operations registered.
 */
void *kobj_ns_grab_current(enum kobj_ns_type type)
{
        void *ns = NULL;

        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type))
                goto unlock;
        if (kobj_ns_ops_tbl[type])
                ns = kobj_ns_ops_tbl[type]->grab_current_ns();
unlock:
        spin_unlock(&kobj_ns_type_lock);

        return ns;
}
EXPORT_SYMBOL_GPL(kobj_ns_grab_current);

/*
 * Hand @ns to the drop_ns() callback registered for @type.  Nothing
 * happens when @type is not valid, has no operations registered, or the
 * operations have no drop_ns() callback.
 */
void kobj_ns_drop(enum kobj_ns_type type, void *ns)
{
        spin_lock(&kobj_ns_type_lock);
        if (!kobj_ns_type_is_valid(type))
                goto unlock;
        if (!kobj_ns_ops_tbl[type] || !kobj_ns_ops_tbl[type]->drop_ns)
                goto unlock;
        kobj_ns_ops_tbl[type]->drop_ns(ns);
unlock:
        spin_unlock(&kobj_ns_type_lock);
}
EXPORT_SYMBOL_GPL(kobj_ns_drop);






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


























    3 
    3 



    3 

    3 





























































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Neighbour Discovery for IPv6
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Mike Shaver                <shaver@ingenia.com>
 */

/*
 *        Changes:
 *
 *        Alexey I. Froloff                :        RFC6106 (DNSSL) support
 *        Pierre Ynard                        :        export userland ND options
 *                                                through netlink (RDNSS support)
 *        Lars Fenneberg                        :        fixed MTU setting on receipt
 *                                                of an RA.
 *        Janos Farkas                        :        kmalloc failure checks
 *        Alexey Kuznetsov                :        state machine reworked
 *                                                and moved to net/core.
 *        Pekka Savola                        :        RFC2461 validation
 *        YOSHIFUJI Hideaki @USAGI        :        Verify ND options properly
 */

#define pr_fmt(fmt) "ICMPv6: " fmt

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/sched.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/route.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#include <linux/if_addr.h>
#include <linux/if_ether.h>
#include <linux/if_arp.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/jhash.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/icmp.h>

#include <net/netlink.h>
#include <linux/rtnetlink.h>

#include <net/flow.h>
#include <net/ip6_checksum.h>
#include <net/inet_common.h>
#include <linux/proc_fs.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

/*
 * Forward declarations of the file-local callbacks that the neigh_ops
 * tables and the nd_tbl initializer in this file refer to by name.
 */
static u32 ndisc_hash(const void *pkey,
                      const struct net_device *dev,
                      __u32 *hash_rnd);
static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
static bool ndisc_allow_add(const struct net_device *dev,
                            struct netlink_ext_ack *extack);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
static int pndisc_constructor(struct pneigh_entry *n);
static void pndisc_destructor(struct pneigh_entry *n);
static void pndisc_redo(struct sk_buff *skb);
static int ndisc_is_multicast(const void *pkey);

/*
 * Neighbour operations picked by ndisc_constructor() for devices that
 * have header_ops but no header_ops->cache callback.  Output goes through
 * neigh_resolve_output(); connected neighbours use
 * neigh_connected_output().
 */
static const struct neigh_ops ndisc_generic_ops = {
        .family =                AF_INET6,
        .solicit =                ndisc_solicit,
        .error_report =                ndisc_error_report,
        .output =                neigh_resolve_output,
        .connected_output =        neigh_connected_output,
};

/*
 * Neighbour operations picked by ndisc_constructor() when the device
 * provides header_ops->cache.  Both .output and .connected_output point
 * at neigh_resolve_output().
 */
static const struct neigh_ops ndisc_hh_ops = {
        .family =                AF_INET6,
        .solicit =                ndisc_solicit,
        .error_report =                ndisc_error_report,
        .output =                neigh_resolve_output,
        .connected_output =        neigh_resolve_output,
};


/*
 * Neighbour operations picked by ndisc_constructor() for devices without
 * header_ops.  No solicit or error_report handler is set; both output
 * hooks are neigh_direct_output().
 */
static const struct neigh_ops ndisc_direct_ops = {
        .family =                AF_INET6,
        .output =                neigh_direct_output,
        .connected_output =        neigh_direct_output,
};

/*
 * The IPv6 neighbour table.  Entries are keyed by a struct in6_addr and
 * the table is wired to the ndisc_ and pndisc_ callbacks defined in this
 * file.  Exported GPL-only via EXPORT_SYMBOL_GPL() below.
 */
struct neigh_table nd_tbl = {
        .family =        AF_INET6,
        .key_len =        sizeof(struct in6_addr),
        .protocol =        cpu_to_be16(ETH_P_IPV6),
        .hash =                ndisc_hash,
        .key_eq =        ndisc_key_eq,
        .constructor =        ndisc_constructor,
        .pconstructor =        pndisc_constructor,
        .pdestructor =        pndisc_destructor,
        .proxy_redo =        pndisc_redo,
        .is_multicast =        ndisc_is_multicast,
        .allow_add  =   ndisc_allow_add,
        .id =                "ndisc_cache",
        /* Default parameter set; it points back at this table. */
        .parms = {
                .tbl                        = &nd_tbl,
                .reachable_time                = ND_REACHABLE_TIME,
                /* Per-variable defaults; time values are written as multiples of HZ. */
                .data = {
                        [NEIGH_VAR_MCAST_PROBES] = 3,
                        [NEIGH_VAR_UCAST_PROBES] = 3,
                        [NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
                        [NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
                        [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
                        [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
                        [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
                        [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
                        [NEIGH_VAR_PROXY_QLEN] = 64,
                        [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
                        /* 0.8 * HZ, spelled with integer arithmetic */
                        [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
                },
        },
        .gc_interval =          30 * HZ,
        .gc_thresh1 =         128,
        .gc_thresh2 =         512,
        .gc_thresh3 =        1024,
};
EXPORT_SYMBOL_GPL(nd_tbl);

/*
 * Append a link-layer address option to @skb.
 *
 * The option occupies __ndisc_opt_addr_space(@data_len, @pad) bytes and is
 * laid out as: one type byte, one length byte (total size in units of
 * 8 bytes), @pad zero bytes, @data_len bytes copied from @data, and
 * finally zero fill for whatever room is left.
 */
void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data,
                              int data_len, int pad)
{
        int total = __ndisc_opt_addr_space(data_len, pad);
        u8 *opt = skb_put(skb, total);
        /* The address starts after the 2-byte header and the padding. */
        u8 *addr = opt + 2 + pad;
        /* Bytes left over once header, padding and address are placed. */
        int tail = total - pad - data_len - 2;

        opt[0] = type;
        opt[1] = total >> 3;

        memset(opt + 2, 0, pad);
        memcpy(addr, data, data_len);

        if (tail > 0)
                memset(addr + data_len, 0, tail);
}
EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);

/*
 * Append a link-layer address option for the device attached to @skb,
 * using that device's address length and type-dependent padding, then
 * let ndisc_ops_fill_addr_option() run for the same device and message
 * type.
 */
static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
                                          const void *data, u8 icmp6_type)
{
        struct net_device *dev = skb->dev;
        int pad = ndisc_addr_option_pad(dev->type);

        __ndisc_fill_addr_option(skb, type, data, dev->addr_len, pad);
        ndisc_ops_fill_addr_option(dev, skb, icmp6_type);
}

/*
 * Append a target link-layer address option carrying @ha to a redirect
 * message, then hand @ops_data to the device's redirect option hook.
 */
static inline void ndisc_fill_redirect_addr_option(struct sk_buff *skb,
                                                   void *ha,
                                                   const u8 *ops_data)
{
        struct net_device *dev = skb->dev;

        ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR, ha, NDISC_REDIRECT);
        ndisc_ops_fill_redirect_addr_option(dev, skb, ops_data);
}

/*
 * Starting from @cur, step forward option by option (each step is
 * nd_opt_len * 8 bytes) and return the next option that has the same type
 * as @cur.  @end is inclusive: an option located exactly at @end may be
 * returned.  Returns NULL when either pointer is NULL, when @cur is
 * already at or past @end, or when no further match exists.
 */
static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
                                            struct nd_opt_hdr *end)
{
        int wanted;

        if (!cur || !end || cur >= end)
                return NULL;

        wanted = cur->nd_opt_type;
        for (;;) {
                cur = (void *)cur + (cur->nd_opt_len << 3);
                if (cur >= end || cur->nd_opt_type == wanted)
                        break;
        }

        if (cur > end || cur->nd_opt_type != wanted)
                return NULL;
        return cur;
}

/*
 * Return 1 when @opt has one of the option types that
 * ndisc_parse_options() records in the nd_useropts range, 0 otherwise.
 * @dev is not consulted.
 */
static inline int ndisc_is_useropt(const struct net_device *dev,
                                   struct nd_opt_hdr *opt)
{
        switch (opt->nd_opt_type) {
        case ND_OPT_PREFIX_INFO:
        case ND_OPT_RDNSS:
        case ND_OPT_DNSSL:
        case ND_OPT_6CO:
        case ND_OPT_CAPTIVE_PORTAL:
        case ND_OPT_PREF64:
                return 1;
        default:
                return 0;
        }
}

/*
 * Starting from @cur, step forward option by option and return the next
 * one accepted by ndisc_is_useropt().  @end is inclusive.  Returns NULL
 * when either pointer is NULL, when @cur is already at or past @end, or
 * when no further user option exists.
 */
static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
                                             struct nd_opt_hdr *cur,
                                             struct nd_opt_hdr *end)
{
        if (!cur || !end || cur >= end)
                return NULL;

        for (;;) {
                cur = (void *)cur + (cur->nd_opt_len << 3);
                if (cur >= end || ndisc_is_useropt(dev, cur))
                        break;
        }

        if (cur > end || !ndisc_is_useropt(dev, cur))
                return NULL;
        return cur;
}

/*
 * Walk the @opt_len bytes of ND options starting at @opt and record
 * pointers to the interesting ones in @ndopts (which is zeroed first).
 *
 * Returns @ndopts on success.  Returns NULL when @opt or @ndopts is NULL,
 * @opt_len is negative, the remaining bytes cannot hold an option header,
 * or an option declares a length of zero or one that overruns the buffer.
 */
struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
                                          u8 *opt, int opt_len,
                                          struct ndisc_options *ndopts)
{
        struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt;
        int step = 0;

        if (!nd_opt || opt_len < 0 || !ndopts)
                return NULL;

        memset(ndopts, 0, sizeof(*ndopts));

        /* Each iteration consumes exactly one option of "step" bytes. */
        for (; opt_len; opt_len -= step, nd_opt = (void *)nd_opt + step) {
                bool known = true;
                int type;

                if (opt_len < sizeof(struct nd_opt_hdr))
                        return NULL;

                /* The length field counts units of 8 bytes. */
                step = nd_opt->nd_opt_len << 3;
                if (step == 0 || opt_len < step)
                        return NULL;

                /* A device hook that claims the option ends processing of it. */
                if (ndisc_ops_parse_options(dev, nd_opt, ndopts))
                        continue;

                type = nd_opt->nd_opt_type;
                switch (type) {
                case ND_OPT_SOURCE_LL_ADDR:
                case ND_OPT_TARGET_LL_ADDR:
                case ND_OPT_MTU:
                case ND_OPT_NONCE:
                case ND_OPT_REDIRECT_HDR:
                        /* Only the first occurrence is kept. */
                        if (!ndopts->nd_opt_array[type]) {
                                ndopts->nd_opt_array[type] = nd_opt;
                        } else {
                                ND_PRINTK(2, warn,
                                          "%s: duplicated ND6 option found: type=%d\n",
                                          __func__, type);
                        }
                        break;
                case ND_OPT_PREFIX_INFO:
                        /* Track the first and the most recent prefix option. */
                        ndopts->nd_opts_pi_end = nd_opt;
                        if (!ndopts->nd_opt_array[type])
                                ndopts->nd_opt_array[type] = nd_opt;
                        break;
#ifdef CONFIG_IPV6_ROUTE_INFO
                case ND_OPT_ROUTE_INFO:
                        ndopts->nd_opts_ri_end = nd_opt;
                        if (!ndopts->nd_opts_ri)
                                ndopts->nd_opts_ri = nd_opt;
                        break;
#endif
                default:
                        known = false;
                        break;
                }

                if (ndisc_is_useropt(dev, nd_opt)) {
                        ndopts->nd_useropts_end = nd_opt;
                        if (!ndopts->nd_useropts)
                                ndopts->nd_useropts = nd_opt;
                } else if (!known) {
                        /*
                         * Unknown options must be silently ignored,
                         * to accommodate future extension to the
                         * protocol.
                         */
                        ND_PRINTK(2, notice,
                                  "%s: ignored unsupported option; type=%d, len=%d\n",
                                  __func__,
                                  type,
                                  nd_opt->nd_opt_len);
                }
        }

        return ndopts;
}

/*
 * Fill @buf with the link-layer address to use for the IPv6 multicast
 * address @addr on @dev.  Device types with a dedicated mapping helper
 * use it; for any other type the device broadcast address is copied, but
 * only when @dir is non-zero.
 *
 * Returns 0 on success (or the helper's result for ARPHRD_IPGRE) and
 * -EINVAL for an unhandled device type when @dir is zero.
 */
int ndisc_mc_map(const struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
{
        switch (dev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_IEEE802:        /* Not sure. Check it later. --ANK */
        case ARPHRD_FDDI:
                ipv6_eth_mc_map(addr, buf);
                return 0;
        case ARPHRD_ARCNET:
                ipv6_arcnet_mc_map(addr, buf);
                return 0;
        case ARPHRD_INFINIBAND:
                ipv6_ib_mc_map(addr, dev->broadcast, buf);
                return 0;
        case ARPHRD_IPGRE:
                return ipv6_ipgre_mc_map(addr, dev->broadcast, buf);
        default:
                break;
        }

        if (!dir)
                return -EINVAL;

        memcpy(buf, dev->broadcast, dev->addr_len);
        return 0;
}
EXPORT_SYMBOL(ndisc_mc_map);

/* nd_tbl.hash callback: forwards all three arguments to ndisc_hashfn(). */
static u32 ndisc_hash(const void *pkey,
                      const struct net_device *dev,
                      __u32 *hash_rnd)
{
        return ndisc_hashfn(pkey, dev, hash_rnd);
}

/* nd_tbl.key_eq callback: compares @n against @pkey via neigh_key_eq128(). */
static bool ndisc_key_eq(const struct neighbour *n, const void *pkey)
{
        return neigh_key_eq128(n, pkey);
}

static int ndisc_constructor(struct neighbour *neigh)
{
        struct in6_addr *addr = (struct in6_addr *)&neigh->primary_key;
        struct net_device *dev = neigh->dev;
        struct inet6_dev *in6_dev;
        struct neigh_parms *parms;
        bool is_multicast = ipv6_addr_is_multicast(addr);

        in6_dev = in6_dev_get(dev);
        if (!in6_dev) {
                return -EINVAL;
        }

        parms = in6_dev->nd_parms;
        __neigh_parms_put(neigh->parms);
        neigh->parms = neigh_parms_clone(parms);

        neigh->type = is_multicast ? RTN_MULTICAST : RTN_UNICAST;
        if (!dev->header_ops) {
                neigh->nud_state = NUD_NOARP;
                neigh->ops = &ndisc_direct_ops;
                neigh->output = neigh_direct_output;
        } else {
                if (is_multicast) {
                        neigh->nud_state = NUD_NOARP;
                        ndisc_mc_map(addr, neigh->ha, dev, 1);
                } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
                        neigh->nud_state = NUD_NOARP;
                        memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
                        if (dev->flags&IFF_LOOPBACK)
                                neigh->type = RTN_LOCAL;
                } else if (dev->flags&IFF_POINTOPOINT) {
                        neigh->nud_state = NUD_NOARP;
                        memcpy(neigh->ha, dev->broadcast, dev->addr_len);
                }
                if (dev->header_ops->cache)
                        neigh->ops = &ndisc_hh_ops;
                else
                        neigh->ops = &ndisc_generic_ops;
                if (neigh->nud_state&NUD_VALID)
                        neigh->output = neigh->ops->connected_output;
                else
                        neigh->output = neigh->ops->output;
        }
        in6_dev_put(in6_dev);
        return 0;
}

static int pndisc_constructor(struct pneigh_entry *n)
{
        struct in6_addr *addr = (struct in6_addr *)&n->key;
        struct in6_addr maddr;
        struct net_device *dev = n->dev;

        if (!dev || !__in6_dev_get(dev))
                return -EINVAL;
        addrconf_addr_solict_mult(addr, &maddr);
        ipv6_dev_mc_inc(dev, &maddr);
        return 0;
}

static void pndisc_destructor(struct pneigh_entry *n)
{
        struct in6_addr *addr = (struct in6_addr *)&n->key;
        struct in6_addr maddr;
        struct net_device *dev = n->dev;

        if (!dev || !__in6_dev_get(dev))
                return;
        addrconf_addr_solict_mult(addr, &maddr);
        ipv6_dev_mc_dec(dev, &maddr);
}

/*
 * nd_tbl.allow_add callback; called with rtnl held.
 *
 * Returns true when @dev has an inet6_dev whose disable_ipv6 setting is
 * zero.  Otherwise sets an error message on @extack and returns false.
 */
static bool ndisc_allow_add(const struct net_device *dev,
                            struct netlink_ext_ack *extack)
{
        struct inet6_dev *idev = __in6_dev_get(dev);

        if (idev && !idev->cnf.disable_ipv6)
                return true;

        NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device");
        return false;
}

/*
 * Allocate (GFP_ATOMIC) an skb for an ND message with @len bytes of
 * ICMPv6 payload on @dev.  Headroom for the link-layer and IPv6 headers
 * is reserved, the transport header is set at the current data pointer,
 * and the skb is charged to the netns ndisc socket.
 *
 * Returns the skb, or NULL when allocation fails.
 */
static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
                                       int len)
{
        int reserve = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr);
        struct sk_buff *skb;

        skb = alloc_skb(reserve + len + dev->needed_tailroom, GFP_ATOMIC);
        if (!skb)
                return NULL;

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IPV6);

        skb_reserve(skb, reserve);
        skb_reset_transport_header(skb);

        /* Manually assign socket ownership as we avoid calling
         * sock_alloc_send_pskb() to bypass wmem buffer limits
         */
        rcu_read_lock();
        skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk);
        rcu_read_unlock();

        return skb;
}

/*
 * Prepend an IPv6 header to @skb for an ND message.
 *
 * The traffic class comes from the device's ndisc_tclass setting (0 when
 * the device has no inet6_dev), the flow label is 0, the next header is
 * ICMPv6, and @len is stored as the payload length.
 */
static void ip6_nd_hdr(struct sk_buff *skb,
                       const struct in6_addr *saddr,
                       const struct in6_addr *daddr,
                       int hop_limit, int len)
{
        struct inet6_dev *idev;
        struct ipv6hdr *hdr;
        unsigned tclass = 0;

        rcu_read_lock();
        idev = __in6_dev_get(skb->dev);
        if (idev)
                tclass = READ_ONCE(idev->cnf.ndisc_tclass);
        rcu_read_unlock();

        skb_push(skb, sizeof(*hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        ip6_flow_hdr(hdr, tclass, 0);

        hdr->nexthdr = IPPROTO_ICMPV6;
        hdr->hop_limit = hop_limit;
        hdr->payload_len = htons(len);

        hdr->daddr = *daddr;
        hdr->saddr = *saddr;
}

/*
 * Finish and transmit an ND message whose ICMPv6 part is already in @skb.
 *
 * If @skb has no dst attached, one is obtained with icmp6_dst_alloc() for
 * the skb's device; on failure the skb is freed and nothing is sent.
 * Then the ICMPv6 checksum is filled in, the IPv6 header is prepended
 * using the ndisc socket's hop_limit, and the packet is passed through
 * the NF_INET_LOCAL_OUT hook to dst_output().  The ICMPv6 output counters
 * are bumped only when NF_HOOK() returns 0.
 */
void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
                    const struct in6_addr *saddr)
{
        struct icmp6hdr *icmp6h = icmp6_hdr(skb);
        struct dst_entry *dst = skb_dst(skb);
        struct inet6_dev *idev;
        struct net *net;
        struct sock *sk;
        int err;
        u8 type;

        /* Saved now so it can be used for stats after the skb is handed off. */
        type = icmp6h->icmp6_type;

        rcu_read_lock();

        net = dev_net_rcu(skb->dev);
        sk = net->ipv6.ndisc_sk;
        if (!dst) {
                struct flowi6 fl6;
                int oif = skb->dev->ifindex;

                icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
                dst = icmp6_dst_alloc(skb->dev, &fl6);
                if (IS_ERR(dst)) {
                        rcu_read_unlock();
                        kfree_skb(skb);
                        return;
                }

                skb_dst_set(skb, dst);
        }

        /* At this point skb->len covers only the ICMPv6 message. */
        icmp6h->icmp6_cksum = csum_ipv6_magic(saddr, daddr, skb->len,
                                              IPPROTO_ICMPV6,
                                              csum_partial(icmp6h,
                                                           skb->len, 0));

        /* skb->len is evaluated before the header is pushed: it is the payload length. */
        ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);

        idev = __in6_dev_get(dst->dev);
        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

        err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                      net, sk, skb, NULL, dst->dev,
                      dst_output);
        if (!err) {
                ICMP6MSGOUT_INC_STATS(net, idev, type);
                ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
        }

        rcu_read_unlock();
}
EXPORT_SYMBOL(ndisc_send_skb);

/*
 * Build and send a Neighbour Advertisement for @solicited_addr to @daddr.
 *
 * The source address is @solicited_addr when it is configured on @dev;
 * otherwise one is chosen with ipv6_dev_get_saddr(), and nothing is sent
 * if that fails.  @router, @solicited and @override set the matching
 * flags; @override is cleared for an optimistic address.  A target
 * link-layer address option is added when @inc_opt is set (or the
 * address's force_tllao setting is) and the device has a link-layer
 * address.
 */
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
                   const struct in6_addr *solicited_addr,
                   bool router, bool solicited, bool override, bool inc_opt)
{
        struct net *net = dev_net(dev);
        const struct in6_addr *src_addr = solicited_addr;
        struct inet6_ifaddr *ifp;
        struct in6_addr tmpaddr;
        struct sk_buff *skb;
        struct nd_msg *msg;
        int optlen = 0;

        /* for anycast or proxy, solicited_addr != src_addr */
        ifp = ipv6_get_ifaddr(net, solicited_addr, dev, 1);
        if (!ifp) {
                if (ipv6_dev_get_saddr(net, dev, daddr,
                                       inet6_sk(net->ipv6.ndisc_sk)->srcprefs,
                                       &tmpaddr))
                        return;
                src_addr = &tmpaddr;
        } else {
                if (ifp->flags & IFA_F_OPTIMISTIC)
                        override = false;
                inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao);
                in6_ifa_put(ifp);
        }

        /* Without a link-layer address there is nothing to put in the option. */
        if (!dev->addr_len)
                inc_opt = false;
        if (inc_opt)
                optlen += ndisc_opt_addr_space(dev,
                                               NDISC_NEIGHBOUR_ADVERTISEMENT);

        skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
        if (!skb)
                return;

        msg = skb_put(skb, sizeof(*msg));
        *msg = (struct nd_msg) {
                .icmph = {
                        .icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT,
                        .icmp6_router = router,
                        .icmp6_solicited = solicited,
                        .icmp6_override = override,
                },
                .target = *solicited_addr,
        };

        if (inc_opt)
                ndisc_fill_addr_option(skb, ND_OPT_TARGET_LL_ADDR,
                                       dev->dev_addr,
                                       NDISC_NEIGHBOUR_ADVERTISEMENT);

        ndisc_send_skb(skb, daddr, src_addr);
}

/*
 * Send an unsolicited Neighbour Advertisement to the link-local all-nodes
 * address for every address on @dev, except addresses that are tentative
 * and not optimistic.  Does nothing when @dev has no inet6_dev.
 */
static void ndisc_send_unsol_na(struct net_device *dev)
{
        struct inet6_dev *idev = in6_dev_get(dev);
        struct inet6_ifaddr *ifa;

        if (!idev)
                return;

        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                bool tentative = ifa->flags & IFA_F_TENTATIVE;

                /* skip tentative addresses until dad completes */
                if (tentative && !(ifa->flags & IFA_F_OPTIMISTIC))
                        continue;

                ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifa->addr,
                              /*router=*/ !!idev->cnf.forwarding,
                              /*solicited=*/ false, /*override=*/ true,
                              /*inc_opt=*/ true);
        }
        read_unlock_bh(&idev->lock);

        in6_dev_put(idev);
}

/*
 * Build (but do not send) a Neighbour Solicitation for @solicit.
 *
 * A source link-layer address option is included when the device has a
 * link-layer address and @saddr is not the unspecified address.  When
 * @nonce is non-zero an 8-byte nonce option is appended, carrying the
 * first 6 bytes of @nonce as stored in memory.
 *
 * Returns the skb, or NULL when @saddr is NULL or allocation fails.
 */
struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
                                const struct in6_addr *saddr, u64 nonce)
{
        int addr_len = dev->addr_len;
        struct sk_buff *skb;
        struct nd_msg *msg;
        bool with_sllao;
        int optlen = 0;

        if (!saddr)
                return NULL;

        with_sllao = addr_len && !ipv6_addr_any(saddr);
        if (with_sllao)
                optlen += ndisc_opt_addr_space(dev,
                                               NDISC_NEIGHBOUR_SOLICITATION);
        if (nonce)
                optlen += 8;

        skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
        if (!skb)
                return NULL;

        msg = skb_put(skb, sizeof(*msg));
        *msg = (struct nd_msg) {
                .icmph = {
                        .icmp6_type = NDISC_NEIGHBOUR_SOLICITATION,
                },
                .target = *solicit,
        };

        if (with_sllao)
                ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
                                       dev->dev_addr,
                                       NDISC_NEIGHBOUR_SOLICITATION);

        if (nonce) {
                /* type, length (one 8-byte unit), then 6 bytes of nonce */
                u8 *opt = skb_put(skb, 8);

                opt[0] = ND_OPT_NONCE;
                opt[1] = 8 >> 3;
                memcpy(opt + 2, &nonce, 6);
        }

        return skb;
}
EXPORT_SYMBOL(ndisc_ns_create);

/*
 * Build and send a Neighbour Solicitation for @solicit to @daddr.
 *
 * When @saddr is NULL the source is looked up with ipv6_get_lladdr(),
 * passing IFA_F_TENTATIVE | IFA_F_OPTIMISTIC as the flags argument; if
 * that lookup fails nothing is sent.  Nothing is sent either when the
 * message cannot be built.
 */
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
                   const struct in6_addr *daddr, const struct in6_addr *saddr,
                   u64 nonce)
{
        struct in6_addr lladdr;
        struct sk_buff *skb;

        if (!saddr) {
                if (ipv6_get_lladdr(dev, &lladdr,
                                    (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
                        return;
                saddr = &lladdr;
        }

        skb = ndisc_ns_create(dev, solicit, saddr, nonce);
        if (!skb)
                return;

        ndisc_send_skb(skb, daddr, saddr);
}

/*
 * Build and send a Router Solicitation from @saddr to @daddr on @dev.
 *
 * A source link-layer address option is included when the device has a
 * link-layer address.  With CONFIG_IPV6_OPTIMISTIC_DAD the option is also
 * left out when @saddr is not found on @dev or is an optimistic address.
 */
void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
                   const struct in6_addr *daddr)
{
        int send_sllao = dev->addr_len;
        struct sk_buff *skb;
        struct rs_msg *msg;
        int optlen = 0;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * According to section 2.2 of RFC 4429, we must not
         * send router solicitations with a sllao from
         * optimistic addresses, but we may send the solicitation
         * if we don't include the sllao.  So here we check
         * if our address is optimistic, and if so, we
         * suppress the inclusion of the sllao.
         */
        if (send_sllao) {
                struct inet6_ifaddr *ifp;

                ifp = ipv6_get_ifaddr(dev_net(dev), saddr, dev, 1);
                if (!ifp) {
                        send_sllao = 0;
                } else {
                        if (ifp->flags & IFA_F_OPTIMISTIC)
                                send_sllao = 0;
                        in6_ifa_put(ifp);
                }
        }
#endif
        if (send_sllao)
                optlen += ndisc_opt_addr_space(dev, NDISC_ROUTER_SOLICITATION);

        skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
        if (!skb)
                return;

        msg = skb_put(skb, sizeof(*msg));
        *msg = (struct rs_msg) {
                .icmph = {
                        .icmp6_type = NDISC_ROUTER_SOLICITATION,
                },
        };

        if (send_sllao)
                ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
                                       dev->dev_addr,
                                       NDISC_ROUTER_SOLICITATION);

        ndisc_send_skb(skb, daddr, saddr);
}


/*
 * Report a link failure for @skb via dst_link_failure() and then free the
 * packet.  @neigh is not used by this function.
 */
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
        /*
         *        "The sender MUST return an ICMP
         *         destination unreachable"
         */
        dst_link_failure(skb);
        /* The packet is freed here, after the failure has been reported. */
        kfree_skb(skb);
}

/* Called with locked neigh: either read or both */

static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
        struct in6_addr *saddr = NULL;
        struct in6_addr mcaddr;
        struct net_device *dev = neigh->dev;
        struct in6_addr *target = (struct in6_addr *)&neigh->primary_key;
        int probes = atomic_read(&neigh->probes);

        if (skb && ipv6_chk_addr_and_flags(dev_net(dev), &ipv6_hdr(skb)->saddr,
                                           dev, false, 1,
                                           IFA_F_TENTATIVE|IFA_F_OPTIMISTIC))
                saddr = &ipv6_hdr(skb)->saddr;
        probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
        if (probes < 0) {
                if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
                        ND_PRINTK(1, dbg,
                                  "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
                                  __func__, target);
                }
                ndisc_send_ns(dev, target, target, saddr, 0);
        } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
                neigh_app_ns(neigh);
        } else {
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(dev, target, &mcaddr, saddr, 0);
        }
}

/*
 * Look up a proxy neighbour entry for @pkey on @dev in nd_tbl.
 *
 * Returns -1 when no entry exists, otherwise 1 if the entry has
 * NTF_ROUTER set and 0 if it does not.  The lookup and the flag read are
 * both done under nd_tbl.lock (read side, BH disabled).
 */
static int pndisc_is_router(const void *pkey,
                            struct net_device *dev)
{
        struct pneigh_entry *proxy;
        int is_router;

        read_lock_bh(&nd_tbl.lock);
        proxy = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
        if (!proxy)
                is_router = -1;
        else if (proxy->flags & NTF_ROUTER)
                is_router = 1;
        else
                is_router = 0;
        read_unlock_bh(&nd_tbl.lock);

        return is_router;
}

/*
 * Update neighbour cache entry @neigh and then report the update to the
 * ndisc ops of @dev.
 *
 * @dev:        device the ND message was received on
 * @neigh:      neighbour entry to update
 * @lladdr:     link-layer address taken from the message, may be NULL
 * @new:        new NUD state passed to neigh_update()
 * @flags:      NEIGH_UPDATE_F_* flags, passed to both calls
 * @icmp6_type: ICMPv6 type of the triggering ND message
 * @ndopts:     parsed ND options of the triggering message
 */
void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
                  const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type,
                  struct ndisc_options *ndopts)
{
        /* The return value of neigh_update() is not checked here. */
        neigh_update(neigh, lladdr, new, flags, 0);
        /* report ndisc ops about neighbour update */
        ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
}

/*
 * Process a received Neighbour Solicitation.
 *
 * Validates the message, finds out whether the target is an address of
 * ours (on the receiving device or its L3 master), an anycast address or
 * a proxied address, and answers with a Neighbour Advertisement where
 * appropriate.  A solicitation whose source is the unspecified address is
 * treated as a DAD probe.
 *
 * Returns a drop reason for the caller; SKB_CONSUMED is returned only on
 * the path that answered a non-DAD solicitation.
 *
 * Reference handling: once past the address lookup, exactly one of @ifp
 * (reference from ipv6_get_ifaddr()) or @idev (reference from
 * in6_dev_get()) is held, and the "out" label releases whichever it is.
 */
static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
{
        struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
        const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
        const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
        u8 *lladdr = NULL;
        /* Number of bytes that follow the fixed part of the NS header. */
        u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
                                    offsetof(struct nd_msg, opt));
        struct ndisc_options ndopts;
        struct net_device *dev = skb->dev;
        struct inet6_ifaddr *ifp;
        struct inet6_dev *idev = NULL;
        struct neighbour *neigh;
        /* An unspecified source address marks a DAD solicitation. */
        int dad = ipv6_addr_any(saddr);
        /* -1: not yet known; resolved from proxy entry or forwarding below. */
        int is_router = -1;
        SKB_DR(reason);
        u64 nonce = 0;
        /* Set below: true when the NS was sent to a multicast destination. */
        bool inc;

        if (skb->len < sizeof(struct nd_msg))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        if (ipv6_addr_is_multicast(&msg->target)) {
                ND_PRINTK(2, warn, "NS: multicast target address\n");
                return reason;
        }

        /*
         * RFC2461 7.1.1:
         * DAD has to be destined for solicited node multicast address.
         */
        if (dad && !ipv6_addr_is_solict_mult(daddr)) {
                ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n");
                return reason;
        }

        if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
                return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

        if (ndopts.nd_opts_src_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
                if (!lladdr) {
                        ND_PRINTK(2, warn,
                                  "NS: invalid link-layer address length\n");
                        return reason;
                }

                /* RFC2461 7.1.1:
                 *        If the IP source address is the unspecified address,
                 *        there MUST NOT be source link-layer address option
                 *        in the message.
                 */
                if (dad) {
                        ND_PRINTK(2, warn,
                                  "NS: bad DAD packet (link-layer address option)\n");
                        return reason;
                }
        }
        /*
         * Only a nonce option of length 1 (one 8-byte unit) is used: the 6
         * bytes following the option header are copied into @nonce.
         */
        if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
                memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);

        inc = ipv6_addr_is_multicast(daddr);

        ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
        if (ifp) {
                /* Also entered by goto when the address is on the L3 master. */
have_ifp:
                if (ifp->flags & (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)) {
                        if (dad) {
                                if (nonce != 0 && ifp->dad_nonce == nonce) {
                                        u8 *np = (u8 *)&nonce;
                                        /* Matching nonce if looped back */
                                        ND_PRINTK(2, notice,
                                                  "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
                                                  ifp->idev->dev->name,
                                                  &ifp->addr, np);
                                        goto out;
                                }
                                /*
                                 * We are colliding with another node
                                 * who is doing DAD
                                 * so fail our DAD process
                                 */
                                /*
                                 * NOTE(review): returns without in6_ifa_put();
                                 * presumably addrconf_dad_failure() consumes
                                 * the ifp reference - confirm.
                                 */
                                addrconf_dad_failure(skb, ifp);
                                return reason;
                        } else {
                                /*
                                 * This is not a dad solicitation.
                                 * If we are an optimistic node,
                                 * we should respond.
                                 * Otherwise, we should ignore it.
                                 */
                                if (!(ifp->flags & IFA_F_OPTIMISTIC))
                                        goto out;
                        }
                }

                /* Borrowed pointer: only the ifp reference is held here. */
                idev = ifp->idev;
        } else {
                struct net *net = dev_net(dev);

                /* perhaps an address on the master device */
                if (netif_is_l3_slave(dev)) {
                        struct net_device *mdev;

                        mdev = netdev_master_upper_dev_get_rcu(dev);
                        if (mdev) {
                                ifp = ipv6_get_ifaddr(net, &msg->target, mdev, 1);
                                if (ifp)
                                        goto have_ifp;
                        }
                }

                idev = in6_dev_get(dev);
                if (!idev) {
                        /* XXX: count this drop? */
                        /* Neither ifp nor idev is held, so return directly. */
                        return reason;
                }

                /*
                 * Target is not one of our unicast addresses: accept it only
                 * if it is an anycast address, or if forwarding and proxy_ndp
                 * (global or per-device) are enabled and a proxy entry
                 * exists.  The proxy lookup also records is_router.
                 */
                if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
                    (READ_ONCE(idev->cnf.forwarding) &&
                     (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
                      READ_ONCE(idev->cnf.proxy_ndp)) &&
                     (is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
                        if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
                            skb->pkt_type != PACKET_HOST &&
                            inc &&
                            NEIGH_VAR(idev->nd_parms, PROXY_DELAY) != 0) {
                                /*
                                 * for anycast or proxy,
                                 * sender should delay its response
                                 * by a random time between 0 and
                                 * MAX_ANYCAST_DELAY_TIME seconds.
                                 * (RFC2461) -- yoshfuji
                                 */
                                /* A clone is queued; a failed clone is ignored. */
                                struct sk_buff *n = skb_clone(skb, GFP_ATOMIC);
                                if (n)
                                        pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
                                goto out;
                        }
                } else {
                        SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST);
                        goto out;
                }
        }

        /* No proxy entry decided it: use the device forwarding setting. */
        if (is_router < 0)
                is_router = READ_ONCE(idev->cnf.forwarding);

        /* DAD probe: answer to the all-nodes link-local multicast address. */
        if (dad) {
                ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
                              !!is_router, false, (ifp != NULL), true);
                goto out;
        }

        if (inc)
                NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast);
        else
                NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast);

        /*
         *        update / create cache entry
         *        for the source address
         */
        neigh = __neigh_lookup(&nd_tbl, saddr, dev,
                               !inc || lladdr || !dev->addr_len);
        if (neigh)
                ndisc_update(dev, neigh, lladdr, NUD_STALE,
                             NEIGH_UPDATE_F_WEAK_OVERRIDE|
                             NEIGH_UPDATE_F_OVERRIDE,
                             NDISC_NEIGHBOUR_SOLICITATION, &ndopts);
        /* Reply if there is a neighbour entry or the device has no header_ops. */
        if (neigh || !dev->header_ops) {
                ndisc_send_na(dev, saddr, &msg->target, !!is_router,
                              true, (ifp != NULL && inc), inc);
                if (neigh)
                        neigh_release(neigh);
                reason = SKB_CONSUMED;
        }

out:
        /* Release the single reference taken during the address lookup. */
        if (ifp)
                in6_ifa_put(ifp);
        else
                in6_dev_put(idev);
        return reason;
}

/*
 * Decide whether a Neighbour Advertisement for an address that has no
 * neighbour cache entry may create one, based on the per-device
 * accept_untracked_na setting:
 *   0 - never accept untracked NAs;
 *   1 - always create a new entry;
 *   2 - create a new entry only if @saddr is in the same subnet as an
 *       address configured on the interface that received the NA.
 * Any other value is treated like 0.
 *
 * The inet6_dev of @dev is dereferenced without a NULL check.
 */
static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
{
        struct inet6_dev *idev = __in6_dev_get(dev);
        int mode = READ_ONCE(idev->cnf.accept_untracked_na);

        if (mode == 1)
                return 1;

        if (mode == 2)
                return !!ipv6_chk_prefix(saddr, dev);

        return 0;
}

/*
 * Process a received Neighbour Advertisement.
 *
 * Validates the message, handles the case where the advertised target is
 * one of our own addresses (DAD failure while tentative, otherwise a
 * warning), and updates the neighbour cache entry for the target.  An
 * entry may be created for an untracked target under the conditions in
 * the RFC 9131 table below.
 *
 * Returns a drop reason for the caller; SKB_CONSUMED is returned only
 * when the neighbour entry was updated.
 */
static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
{
        struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
        struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
        const struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
        u8 *lladdr = NULL;
        /* Number of bytes that follow the fixed part of the NA header. */
        u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
                                    offsetof(struct nd_msg, opt));
        struct ndisc_options ndopts;
        struct net_device *dev = skb->dev;
        /* May be NULL; every use below is guarded by an idev check. */
        struct inet6_dev *idev = __in6_dev_get(dev);
        struct inet6_ifaddr *ifp;
        struct neighbour *neigh;
        SKB_DR(reason);
        u8 new_state;

        if (skb->len < sizeof(struct nd_msg))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        if (ipv6_addr_is_multicast(&msg->target)) {
                ND_PRINTK(2, warn, "NA: target address is multicast\n");
                return reason;
        }

        if (ipv6_addr_is_multicast(daddr) &&
            msg->icmph.icmp6_solicited) {
                ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
                return reason;
        }

        /* For some 802.11 wireless deployments (and possibly other networks),
         * there will be a NA proxy and unsolicited packets are attacks
         * and thus should not be accepted.
         * drop_unsolicited_na takes precedence over accept_untracked_na
         */
        if (!msg->icmph.icmp6_solicited && idev &&
            READ_ONCE(idev->cnf.drop_unsolicited_na))
                return reason;

        if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
                return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

        if (ndopts.nd_opts_tgt_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
                if (!lladdr) {
                        ND_PRINTK(2, warn,
                                  "NA: invalid link-layer address length\n");
                        return reason;
                }
        }
        /* Is the advertised target one of our own addresses on this device? */
        ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
        if (ifp) {
                if (skb->pkt_type != PACKET_LOOPBACK
                    && (ifp->flags & IFA_F_TENTATIVE)) {
                                /*
                                 * NOTE(review): returns without in6_ifa_put();
                                 * presumably addrconf_dad_failure() consumes
                                 * the ifp reference - confirm.
                                 */
                                addrconf_dad_failure(skb, ifp);
                                return reason;
                }
                /* What should we do now? The advertisement
                   is invalid, but the ndisc specs say nothing
                   about it. It could be a misconfiguration, or
                   a smart proxy agent trying to help us :-)

                   We should not print the error if the NA has been
                   received from loopback - it is just our own
                   unsolicited advertisement.
                 */
                if (skb->pkt_type != PACKET_LOOPBACK)
                        ND_PRINTK(1, warn,
                                  "NA: %pM advertised our address %pI6c on %s!\n",
                                  eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name);
                in6_ifa_put(ifp);
                return reason;
        }

        neigh = neigh_lookup(&nd_tbl, &msg->target, dev);

        /* RFC 9131 updates original Neighbour Discovery RFC 4861.
         * NAs with Target LL Address option without a corresponding
         * entry in the neighbour cache can now create a STALE neighbour
         * cache entry on routers.
         *
         *   entry accept  fwding  solicited        behaviour
         * ------- ------  ------  ---------    ----------------------
         * present      X       X         0     Set state to STALE
         * present      X       X         1     Set state to REACHABLE
         *  absent      0       X         X     Do nothing
         *  absent      1       0         X     Do nothing
         *  absent      1       1         X     Add a new STALE entry
         *
         * Note that we don't do a (daddr == all-routers-mcast) check.
         */
        new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
        if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
                if (accept_untracked_na(dev, saddr)) {
                        /* May yield an error pointer; filtered by IS_ERR() below. */
                        neigh = neigh_create(&nd_tbl, &msg->target, dev);
                        new_state = NUD_STALE;
                }
        }

        if (neigh && !IS_ERR(neigh)) {
                /* Snapshot the flags to detect a router -> host transition. */
                u8 old_flags = neigh->flags;
                struct net *net = dev_net(dev);

                /* An entry in NUD_FAILED state is left untouched. */
                if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
                        goto out;

                /*
                 * Don't update the neighbor cache entry on a proxy NA from
                 * ourselves because either the proxied node is off link or it
                 * has already sent a NA to us.
                 */
                if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
                    READ_ONCE(net->ipv6.devconf_all->forwarding) &&
                    READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
                    pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
                        /* XXX: idev->cnf.proxy_ndp */
                        goto out;
                }

                /*
                 * OVERRIDE and ISROUTER follow the override and router
                 * bits of the received NA.
                 */
                ndisc_update(dev, neigh, lladdr,
                             new_state,
                             NEIGH_UPDATE_F_WEAK_OVERRIDE|
                             (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
                             NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                             (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0),
                             NDISC_NEIGHBOUR_ADVERTISEMENT, &ndopts);

                /* NTF_ROUTER was set before the update and is clear now. */
                if ((old_flags & ~neigh->flags) & NTF_ROUTER) {
                        /*
                         * Change: router to host
                         */
                        rt6_clean_tohost(dev_net(dev),  saddr);
                }
                reason = SKB_CONSUMED;
out:
                /* Drop the reference from neigh_lookup()/neigh_create(). */
                neigh_release(neigh);
        }
        return reason;
}

/*
 * Process a received Router Solicitation.
 *
 * The message is only acted upon when the receiving device is forwarding
 * and the source address is specified.  In that case the neighbour entry
 * for the source is looked up (or created) and updated to NUD_STALE with
 * the source link-layer address option, if one was supplied.
 *
 * Returns SKB_CONSUMED when a neighbour entry was updated, otherwise a
 * drop reason.
 */
static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
{
        struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
        /* Only meaningful once the length check below has passed. */
        unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
        const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct ndisc_options ndopts;
        struct inet6_dev *idev;
        struct neighbour *neigh;
        u8 *lladdr = NULL;
        SKB_DR(reason);

        if (skb->len < sizeof(*rs_msg))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        idev = __in6_dev_get(dev);
        if (!idev) {
                ND_PRINTK(1, err, "RS: can't find in6 device\n");
                return reason;
        }

        /*
         * Don't accept RS if we're not in router mode, and don't update
         * the NCE if src = :: - that implies the source node has no ip
         * address assigned yet.
         */
        if (!READ_ONCE(idev->cnf.forwarding) || ipv6_addr_any(saddr))
                return reason;

        /* Parse ND options */
        if (!ndisc_parse_options(dev, rs_msg->opt, ndoptlen, &ndopts))
                return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

        if (ndopts.nd_opts_src_lladdr) {
                lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
                if (!lladdr)
                        return reason;
        }

        neigh = __neigh_lookup(&nd_tbl, saddr, dev, 1);
        if (!neigh)
                return reason;

        ndisc_update(dev, neigh, lladdr, NUD_STALE,
                     NEIGH_UPDATE_F_WEAK_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE|
                     NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
                     NDISC_ROUTER_SOLICITATION, &ndopts);
        neigh_release(neigh);

        return SKB_CONSUMED;
}

static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
{
        struct icmp6hdr *icmp6h = (struct icmp6hdr *)skb_transport_header(ra);
        struct sk_buff *skb;
        struct nlmsghdr *nlh;
        struct nduseroptmsg *ndmsg;
        struct net *net = dev_net(ra->dev);
        int err;
        int base_size = NLMSG_ALIGN(sizeof(struct nduseroptmsg)
                                    + (opt->nd_opt_len << 3));
        size_t msg_size = base_size + nla_total_size(sizeof(struct in6_addr));

        skb = nlmsg_new(msg_size, GFP_ATOMIC);
        if (!skb) {
                err = -ENOBUFS;
                goto errout;
        }

        nlh = nlmsg_put(skb, 0, 0, RTM_NEWNDUSEROPT, base_size, 0);
        if (!nlh) {
                goto nla_put_failure;
        }

        ndmsg = nlmsg_data(nlh);
        ndmsg->nduseropt_family = AF_INET6;
        ndmsg->nduseropt_ifindex = ra->dev->ifindex;
        ndmsg->nduseropt_icmp_type = icmp6h->icmp6_type;
        ndmsg->nduseropt_icmp_code = icmp6h->icmp6_code;
        ndmsg->nduseropt_opts_len = opt->nd_opt_len << 3;

        memcpy(ndmsg + 1, opt, opt->nd_opt_len << 3);

        if (nla_put_in6_addr(skb, NDUSEROPT_SRCADDR, &ipv6_hdr(ra)->saddr))
                goto nla_put_failure;
        nlmsg_end(skb, nlh);

        rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
        return;

nla_put_failure:
        nlmsg_free(skb);
        err = -EMSGSIZE;
errout:
        rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
}

static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
{
        struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
        bool send_ifinfo_notify = false;
        struct neighbour *neigh = NULL;
        struct ndisc_options ndopts;
        struct fib6_info *rt = NULL;
        struct inet6_dev *in6_dev;
        struct fib6_table *table;
        u32 defrtr_usr_metric;
        unsigned int pref = 0;
        __u32 old_if_flags;
        struct net *net;
        SKB_DR(reason);
        int lifetime;
        int optlen;

        __u8 *opt = (__u8 *)(ra_msg + 1);

        optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
                sizeof(struct ra_msg);

        ND_PRINTK(2, info,
                  "RA: %s, dev: %s\n",
                  __func__, skb->dev->name);
        if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
                ND_PRINTK(2, warn, "RA: source address is not link-local\n");
                return reason;
        }
        if (optlen < 0)
                return SKB_DROP_REASON_PKT_TOO_SMALL;

#ifdef CONFIG_IPV6_NDISC_NODETYPE
        if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
                ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
                return reason;
        }
#endif

        in6_dev = __in6_dev_get(skb->dev);
        if (!in6_dev) {
                ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
                          skb->dev->name);
                return reason;
        }

        if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts))
                return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

        if (!ipv6_accept_ra(in6_dev)) {
                ND_PRINTK(2, info,
                          "RA: %s, did not accept ra for dev: %s\n",
                          __func__, skb->dev->name);
                goto skip_linkparms;
        }

#ifdef CONFIG_IPV6_NDISC_NODETYPE
        /* skip link-specific parameters from interior routers */
        if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
                ND_PRINTK(2, info,
                          "RA: %s, nodetype is NODEFAULT, dev: %s\n",
                          __func__, skb->dev->name);
                goto skip_linkparms;
        }
#endif

        if (in6_dev->if_flags & IF_RS_SENT) {
                /*
                 *        flag that an RA was received after an RS was sent
                 *        out on this interface.
                 */
                in6_dev->if_flags |= IF_RA_RCVD;
        }

        /*
         * Remember the managed/otherconf flags from most recently
         * received RA message (RFC 2462) -- yoshfuji
         */
        old_if_flags = in6_dev->if_flags;
        in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED |
                                IF_RA_OTHERCONF)) |
                                (ra_msg->icmph.icmp6_addrconf_managed ?
                                        IF_RA_MANAGED : 0) |
                                (ra_msg->icmph.icmp6_addrconf_other ?
                                        IF_RA_OTHERCONF : 0);

        if (old_if_flags != in6_dev->if_flags)
                send_ifinfo_notify = true;

        if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
                ND_PRINTK(2, info,
                          "RA: %s, defrtr is false for dev: %s\n",
                          __func__, skb->dev->name);
                goto skip_defrtr;
        }

        lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
        if (lifetime != 0 &&
            lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
                ND_PRINTK(2, info,
                          "RA: router lifetime (%ds) is too short: %s\n",
                          lifetime, skb->dev->name);
                goto skip_defrtr;
        }

        /* Do not accept RA with source-addr found on local machine unless
         * accept_ra_from_local is set to true.
         */
        net = dev_net(in6_dev->dev);
        if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
            ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
                ND_PRINTK(2, info,
                          "RA from local address detected on dev: %s: default router ignored\n",
                          skb->dev->name);
                goto skip_defrtr;
        }

#ifdef CONFIG_IPV6_ROUTER_PREF
        pref = ra_msg->icmph.icmp6_router_pref;
        /* 10b is handled as if it were 00b (medium) */
        if (pref == ICMPV6_ROUTER_PREF_INVALID ||
            !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref))
                pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
        /* routes added from RAs do not use nexthop objects */
        rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
        if (rt) {
                neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
                                         rt->fib6_nh->fib_nh_dev, NULL,
                                          &ipv6_hdr(skb)->saddr);
                if (!neigh) {
                        ND_PRINTK(0, err,
                                  "RA: %s got default router without neighbour\n",
                                  __func__);
                        fib6_info_release(rt);
                        return reason;
                }
        }
        /* Set default route metric as specified by user */
        defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
        /* delete the route if lifetime is 0 or if metric needs change */
        if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
                ip6_del_rt(net, rt, false);
                rt = NULL;
        }

        ND_PRINTK(3, info, "RA: rt: %p  lifetime: %d, metric: %d, for dev: %s\n",
                  rt, lifetime, defrtr_usr_metric, skb->dev->name);
        if (!rt && lifetime) {
                ND_PRINTK(3, info, "RA: adding default router\n");

                if (neigh)
                        neigh_release(neigh);

                rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
                                         skb->dev, pref, defrtr_usr_metric,
                                         lifetime);
                if (!rt) {
                        ND_PRINTK(0, err,
                                  "RA: %s failed to add default route\n",
                                  __func__);
                        return reason;
                }

                neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
                                         rt->fib6_nh->fib_nh_dev, NULL,
                                          &ipv6_hdr(skb)->saddr);
                if (!neigh) {
                        ND_PRINTK(0, err,
                                  "RA: %s got default router without neighbour\n",
                                  __func__);
                        fib6_info_release(rt);
                        return reason;
                }
                neigh->flags |= NTF_ROUTER;
        } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
                struct nl_info nlinfo = {
                        .nl_net = net,
                };
                rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
                inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
        }

        if (rt) {
                table = rt->fib6_table;
                spin_lock_bh(&table->tb6_lock);

                fib6_set_expires(rt, jiffies + (HZ * lifetime));
                fib6_add_gc_list(rt);

                spin_unlock_bh(&table->tb6_lock);
        }
        if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 &&
            ra_msg->icmph.icmp6_hop_limit) {
                if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <=
                    ra_msg->icmph.icmp6_hop_limit) {
                        WRITE_ONCE(in6_dev->cnf.hop_limit,
                                   ra_msg->icmph.icmp6_hop_limit);
                        fib6_metric_set(rt, RTAX_HOPLIMIT,
                                        ra_msg->icmph.icmp6_hop_limit);
                } else {
                        ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
                }
        }

skip_defrtr:

        /*
         *        Update Reachable Time and Retrans Timer
         */

        if (in6_dev->nd_parms) {
                unsigned long rtime = ntohl(ra_msg->retrans_timer);

                if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
                        rtime = (rtime*HZ)/1000;
                        if (rtime < HZ/100)
                                rtime = HZ/100;
                        NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
                        in6_dev->tstamp = jiffies;
                        send_ifinfo_notify = true;
                }

                rtime = ntohl(ra_msg->reachable_time);
                if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) {
                        rtime = (rtime*HZ)/1000;

                        if (rtime < HZ/10)
                                rtime = HZ/10;

                        if (rtime != NEIGH_VAR(in6_dev->nd_parms, BASE_REACHABLE_TIME)) {
                                NEIGH_VAR_SET(in6_dev->nd_parms,
                                              BASE_REACHABLE_TIME, rtime);
                                NEIGH_VAR_SET(in6_dev->nd_parms,
                                              GC_STALETIME, 3 * rtime);
                                in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
                                in6_dev->tstamp = jiffies;
                                send_ifinfo_notify = true;
                        }
                }
        }

skip_linkparms:

        /*
         *        Process options.
         */

        if (!neigh)
                neigh = __neigh_lookup(&nd_tbl, &ipv6_hdr(skb)->saddr,
                                       skb->dev, 1);
        if (neigh) {
                u8 *lladdr = NULL;
                if (ndopts.nd_opts_src_lladdr) {
                        lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
                                                     skb->dev);
                        if (!lladdr) {
                                ND_PRINTK(2, warn,
                                          "RA: invalid link-layer address length\n");
                                goto out;
                        }
                }
                ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
                             NEIGH_UPDATE_F_WEAK_OVERRIDE|
                             NEIGH_UPDATE_F_OVERRIDE|
                             NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
                             NEIGH_UPDATE_F_ISROUTER,
                             NDISC_ROUTER_ADVERTISEMENT, &ndopts);
                reason = SKB_CONSUMED;
        }

        if (!ipv6_accept_ra(in6_dev)) {
                ND_PRINTK(2, info,
                          "RA: %s, accept_ra is false for dev: %s\n",
                          __func__, skb->dev->name);
                goto out;
        }

#ifdef CONFIG_IPV6_ROUTE_INFO
        if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
            ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
                          in6_dev->dev, 0)) {
                ND_PRINTK(2, info,
                          "RA from local address detected on dev: %s: router info ignored.\n",
                          skb->dev->name);
                goto skip_routeinfo;
        }

        if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) {
                struct nd_opt_hdr *p;
                for (p = ndopts.nd_opts_ri;
                     p;
                     p = ndisc_next_option(p, ndopts.nd_opts_ri_end)) {
                        struct route_info *ri = (struct route_info *)p;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
                        if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT &&
                            ri->prefix_len == 0)
                                continue;
#endif
                        if (ri->prefix_len == 0 &&
                            !READ_ONCE(in6_dev->cnf.accept_ra_defrtr))
                                continue;
                        if (ri->lifetime != 0 &&
                            ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
                                continue;
                        if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen))
                                continue;
                        if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen))
                                continue;
                        rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
                                      &ipv6_hdr(skb)->saddr);
                }
        }

skip_routeinfo:
#endif

#ifdef CONFIG_IPV6_NDISC_NODETYPE
        /* skip link-specific ndopts from interior routers */
        if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
                ND_PRINTK(2, info,
                          "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
                          __func__, skb->dev->name);
                goto out;
        }
#endif

        if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) {
                struct nd_opt_hdr *p;
                for (p = ndopts.nd_opts_pi;
                     p;
                     p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) {
                        addrconf_prefix_rcv(skb->dev, (u8 *)p,
                                            (p->nd_opt_len) << 3,
                                            ndopts.nd_opts_src_lladdr != NULL);
                }
        }

        if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) {
                __be32 n;
                u32 mtu;

                memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
                mtu = ntohl(n);

                if (in6_dev->ra_mtu != mtu) {
                        in6_dev->ra_mtu = mtu;
                        send_ifinfo_notify = true;
                }

                if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
                        ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
                } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
                        WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
                        fib6_metric_set(rt, RTAX_MTU, mtu);
                        rt6_mtu_change(skb->dev, mtu);
                }
        }

        if (ndopts.nd_useropts) {
                struct nd_opt_hdr *p;
                for (p = ndopts.nd_useropts;
                     p;
                     p = ndisc_next_useropt(skb->dev, p,
                                            ndopts.nd_useropts_end)) {
                        ndisc_ra_useropt(skb, p);
                }
        }

        if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
                ND_PRINTK(2, warn, "RA: invalid RA options\n");
        }
out:
        /* Send a notify if RA changed managed/otherconf flags or
         * timer settings or ra_mtu value
         */
        if (send_ifinfo_notify)
                inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);

        fib6_info_release(rt);
        if (neigh)
                neigh_release(neigh);
        return reason;
}

/*
 * Handle a received ICMPv6 Redirect.
 *
 * The message is ignored (returning the default drop reason) when it comes
 * from a host/unauthorized router node type or from a non-link-local source.
 * If the options cannot be parsed the packet is reported as having bad
 * options.  Without a Redirected Header option the redirect is applied via
 * ip6_redirect_no_header(); otherwise the skb is pulled up to the payload of
 * that option (8 bytes past the option start) and passed to icmpv6_notify().
 */
static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
{
        struct rd_msg *rdm = (struct rd_msg *)skb_transport_header(skb);
        /* Bytes between the start of the option area and the skb tail. */
        u32 optlen = skb_tail_pointer(skb) - rdm->opt;
        struct ndisc_options ndopts;
        SKB_DR(reason);
        u8 *inner;

#ifdef CONFIG_IPV6_NDISC_NODETYPE
        if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST ||
            skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
                ND_PRINTK(2, warn,
                          "Redirect: from host or unauthorized router\n");
                return reason;
        }
#endif

        if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
                ND_PRINTK(2, warn,
                          "Redirect: source address is not link-local\n");
                return reason;
        }

        if (!ndisc_parse_options(skb->dev, rdm->opt, optlen, &ndopts))
                return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;

        if (!ndopts.nd_opts_rh) {
                ip6_redirect_no_header(skb, dev_net(skb->dev),
                                       skb->dev->ifindex);
                return reason;
        }

        /* Skip the 8-byte option header to reach the embedded packet. */
        inner = (u8 *)ndopts.nd_opts_rh + 8;
        if (!pskb_pull(skb, inner - skb_transport_header(skb)))
                return SKB_DROP_REASON_PKT_TOO_SMALL;

        return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
}

/*
 * Append a Redirected Header option of rd_len bytes to skb.
 *
 * The first 8 bytes form the option header: type, length in units of
 * 8 octets, and 6 zeroed bytes.  The remaining rd_len - 8 bytes are copied
 * from orig_skb, starting at its network header offset.
 */
static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
                                           struct sk_buff *orig_skb,
                                           int rd_len)
{
        u8 *hdr = skb_put(skb, rd_len);
        u8 *payload = hdr + 8;

        memset(hdr, 0, 8);
        hdr[0] = ND_OPT_REDIRECT_HDR;
        hdr[1] = (rd_len >> 3);

        skb_copy_bits(orig_skb, skb_network_offset(orig_skb), payload,
                      rd_len - 8);
}

/*
 * Build and send an ICMPv6 Redirect to the source of @skb, advertising
 * @target as the address to use for the packet's destination.
 *
 * @skb:    the packet that triggered the redirect; its IPv6 source becomes
 *          the redirect's destination and its leading bytes are embedded in
 *          the Redirected Header option.
 * @target: address placed in the redirect's target field.
 *
 * The function returns silently (nothing is sent) on any failed check:
 * no usable device or link-local source address, a target that is neither
 * the packet's destination nor link-local unicast, a route lookup error,
 * a gateway route to the sender, a refused rate-limit check, a missing
 * neighbour entry for @target, or allocation failure.
 */
void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
{
        struct net_device *dev = skb->dev;
        struct net *net = dev_net_rcu(dev);
        struct sock *sk = net->ipv6.ndisc_sk;
        int optlen = 0;
        struct inet_peer *peer;
        struct sk_buff *buff;
        struct rd_msg *msg;
        struct in6_addr saddr_buf;
        struct rt6_info *rt;
        struct dst_entry *dst;
        struct flowi6 fl6;
        int rd_len;
        u8 ha_buf[MAX_ADDR_LEN], *ha = NULL,
           ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
        bool ret;

        /* For an L3 master device, switch to the device the packet came in on. */
        if (netif_is_l3_master(dev)) {
                dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
                if (!dev)
                        return;
        }

        /* The redirect is sent from a link-local address of dev. */
        if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
                ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
                          dev->name);
                return;
        }

        /*
         * The target must either equal the packet's destination or be a
         * link-local unicast address.
         */
        if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
            ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
                ND_PRINTK(2, warn,
                          "Redirect: target address is not link-local unicast\n");
                return;
        }

        /* Look up the route back to the original sender. */
        icmpv6_flow_init(sk, &fl6, NDISC_REDIRECT,
                         &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex);

        dst = ip6_route_output(net, NULL, &fl6);
        if (dst->error) {
                dst_release(dst);
                return;
        }
        /*
         * No dst_release() on the error path below.
         * NOTE(review): presumably xfrm_lookup() consumes the dst on
         * failure - confirm against its documentation.
         */
        dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
        if (IS_ERR(dst))
                return;

        rt = dst_rt6_info(dst);

        /* Only redirect senders reachable without a gateway. */
        if (rt->rt6i_flags & RTF_GATEWAY) {
                ND_PRINTK(2, warn,
                          "Redirect: destination is not a neighbour\n");
                goto release;
        }

        /* Per-sender check via the inet_peer entry, with a 1*HZ timeout argument. */
        peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr);
        ret = inet_peer_xrlim_allow(peer, 1*HZ);

        if (!ret)
                goto release;

        /*
         * On devices with link-layer addresses, snapshot the target's
         * hardware address (under the neighbour lock) when its entry is in
         * a NUD_VALID state, and account for the address option's size.
         */
        if (dev->addr_len) {
                struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
                if (!neigh) {
                        ND_PRINTK(2, warn,
                                  "Redirect: no neigh for target address\n");
                        goto release;
                }

                read_lock_bh(&neigh->lock);
                if (neigh->nud_state & NUD_VALID) {
                        memcpy(ha_buf, neigh->ha, dev->addr_len);
                        read_unlock_bh(&neigh->lock);
                        ha = ha_buf;
                        optlen += ndisc_redirect_opt_addr_space(dev, neigh,
                                                                ops_data_buf,
                                                                &ops_data);
                } else
                        read_unlock_bh(&neigh->lock);

                neigh_release(neigh);
        }

        /*
         * Size of the Redirected Header option: the original packet plus an
         * 8-byte option header, capped so the whole redirect fits within
         * IPV6_MIN_MTU, then rounded down to a multiple of 8.
         */
        rd_len = min_t(unsigned int,
                       IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(*msg) - optlen,
                       skb->len + 8);
        rd_len &= ~0x7;
        optlen += rd_len;

        buff = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
        if (!buff)
                goto release;

        /* Fixed part of the message; unnamed fields are zero-initialized. */
        msg = skb_put(buff, sizeof(*msg));
        *msg = (struct rd_msg) {
                .icmph = {
                        .icmp6_type = NDISC_REDIRECT,
                },
                .target = *target,
                .dest = ipv6_hdr(skb)->daddr,
        };

        /*
         *        include target_address option
         */

        if (ha)
                ndisc_fill_redirect_addr_option(buff, ha, ops_data);

        /*
         *        build redirect option and copy skb over to the new packet.
         */

        if (rd_len)
                ndisc_fill_redirect_hdr_option(buff, skb, rd_len);

        /* The dst reference is attached to buff, so it is not released here. */
        skb_dst_set(buff, dst);
        ndisc_send_skb(buff, &ipv6_hdr(skb)->saddr, &saddr_buf);
        return;

release:
        dst_release(dst);
}

/*
 * Run neighbour solicitation processing on skb, then free the skb using the
 * drop reason that processing returned.
 */
static void pndisc_redo(struct sk_buff *skb)
{
        kfree_skb_reason(skb, ndisc_recv_ns(skb));
}

/* Report whether the IPv6 address that pkey points to is multicast. */
static int ndisc_is_multicast(const void *pkey)
{
        const struct in6_addr *addr = pkey;

        return ipv6_addr_is_multicast(addr);
}

/*
 * Decide whether a received ndisc packet must be discarded before parsing.
 *
 * Returns true when the receiving device has no inet6_dev, or when the
 * packet was fragmented and the device's suppress_frag_ndisc setting is
 * enabled (a rate-limited warning is logged in that case); false otherwise.
 */
static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
{
        struct inet6_dev *in6 = __in6_dev_get(skb->dev);

        if (!in6)
                return true;

        if (!(IP6CB(skb)->flags & IP6SKB_FRAGMENTED))
                return false;

        if (!READ_ONCE(in6->cnf.suppress_frag_ndisc))
                return false;

        net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
        return true;
}

/*
 * Validate a received neighbour discovery message and dispatch it to the
 * handler for its ICMPv6 type.
 *
 * Returns a drop reason: a specific one when the packet is rejected here
 * (suppressed fragment, linearization failure, hop limit other than 255,
 * non-zero ICMPv6 code), the handler's result for a recognised type, or
 * the default reason for any other type.
 */
enum skb_drop_reason ndisc_rcv(struct sk_buff *skb)
{
        struct nd_msg *ndmsg;
        SKB_DR(reason);

        if (ndisc_suppress_frag_ndisc(skb))
                return SKB_DROP_REASON_IPV6_NDISC_FRAG;

        if (skb_linearize(skb))
                return SKB_DROP_REASON_NOMEM;

        ndmsg = (struct nd_msg *)skb_transport_header(skb);

        /* Move skb->data back to the transport header. */
        __skb_push(skb, skb->data - skb_transport_header(skb));

        if (ipv6_hdr(skb)->hop_limit != 255) {
                ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n",
                          ipv6_hdr(skb)->hop_limit);
                return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT;
        }

        if (ndmsg->icmph.icmp6_code != 0) {
                ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n",
                          ndmsg->icmph.icmp6_code);
                return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE;
        }

        switch (ndmsg->icmph.icmp6_type) {
        case NDISC_NEIGHBOUR_SOLICITATION:
                /* Clear the neighbour control block before NS processing. */
                memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
                return ndisc_recv_ns(skb);

        case NDISC_NEIGHBOUR_ADVERTISEMENT:
                return ndisc_recv_na(skb);

        case NDISC_ROUTER_SOLICITATION:
                return ndisc_recv_rs(skb);

        case NDISC_ROUTER_ADVERTISEMENT:
                return ndisc_router_discovery(skb);

        case NDISC_REDIRECT:
                return ndisc_redirect_rcv(skb);

        default:
                break;
        }

        return reason;
}

/*
 * Netdevice notifier callback for neighbour discovery.
 *
 * Reacts to address changes, device up/down, flag/carrier changes and
 * peer-notification requests by updating the ND neighbour table, running
 * fib6 garbage collection and/or sending unsolicited neighbour
 * advertisements.  Always returns NOTIFY_DONE.
 */
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct netdev_notifier_change_info *info;
        struct net *net = dev_net(dev);
        struct inet6_dev *idev;
        /* Without an inet6_dev, eviction on carrier loss stays enabled. */
        bool evict_nocarrier = true;

        switch (event) {
        case NETDEV_CHANGEADDR:
                neigh_changeaddr(&nd_tbl, dev);
                fib6_run_gc(0, net, false);
                fallthrough;
        case NETDEV_UP:
                idev = in6_dev_get(dev);
                if (idev) {
                        /* Per-device or "all" setting enables the notification. */
                        if (READ_ONCE(idev->cnf.ndisc_notify) ||
                            READ_ONCE(net->ipv6.devconf_all->ndisc_notify))
                                ndisc_send_unsol_na(dev);
                        in6_dev_put(idev);
                }
                break;
        case NETDEV_CHANGE:
                idev = in6_dev_get(dev);
                if (idev) {
                        /* Both the per-device and the "all" setting must be on. */
                        evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) &&
                                          READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier);
                        in6_dev_put(idev);
                }

                info = ptr;
                if (info->flags_changed & IFF_NOARP)
                        neigh_changeaddr(&nd_tbl, dev);
                if (evict_nocarrier && !netif_carrier_ok(dev))
                        neigh_carrier_down(&nd_tbl, dev);
                break;
        case NETDEV_DOWN:
                neigh_ifdown(&nd_tbl, dev);
                fib6_run_gc(0, net, false);
                break;
        case NETDEV_NOTIFY_PEERS:
                ndisc_send_unsol_na(dev);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/*
 * Notifier block that delivers netdevice events to ndisc_netdev_event().
 * It is registered by ndisc_late_init() and removed by
 * ndisc_late_cleanup(); its priority is five below ADDRCONF_NOTIFY_PRIORITY.
 */
static struct notifier_block ndisc_netdev_notifier = {
        .notifier_call = ndisc_netdev_event,
        .priority = ADDRCONF_NOTIFY_PRIORITY - 5,
};

#ifdef CONFIG_SYSCTL
/*
 * Log a warning that the current process touched a deprecated neighbour
 * sysctl and point it at the "_ms" replacement.
 *
 * At most five warnings are emitted in total, and none is emitted when the
 * current process name equals the one recorded by the previous warning.
 */
static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl,
                                         const char *func, const char *dev_name)
{
        static char warncomm[TASK_COMM_LEN];
        static int warned;

        if (!strcmp(warncomm, current->comm) || warned >= 5)
                return;

        /* Remember who was warned so repeats from the same name are skipped. */
        strscpy(warncomm, current->comm);
        pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n",
                warncomm, func,
                dev_name, ctl->procname,
                dev_name, ctl->procname);
        warned++;
}

/*
 * sysctl handler for the retrans_time / base_reachable_time entries and
 * their "_ms" variants.
 *
 * The deprecated non-"_ms" names trigger a warning.  The value is then
 * parsed by the neigh_proc_* helper matching the entry's unit; any other
 * entry name yields -1.  After a successful write on a per-device entry,
 * reachable_time is re-randomised when the base reachable time was the
 * value written, the device timestamp is refreshed and an RTM_NEWLINK
 * notification is sent.
 *
 * Returns the helper's result, or -1 for an unrecognised entry name.
 */
int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        const char *name = ctl->procname;
        bool old_retrans = strcmp(name, "retrans_time") == 0;
        bool old_reachable = strcmp(name, "base_reachable_time") == 0;
        struct net_device *dev = ctl->extra1;
        struct inet6_dev *idev;
        int ret;

        if (old_retrans || old_reachable)
                ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default");

        if (old_retrans)
                ret = neigh_proc_dointvec(ctl, write, buffer, lenp, ppos);
        else if (old_reachable)
                ret = neigh_proc_dointvec_jiffies(ctl, write,
                                                  buffer, lenp, ppos);
        else if (strcmp(name, "retrans_time_ms") == 0 ||
                 strcmp(name, "base_reachable_time_ms") == 0)
                ret = neigh_proc_dointvec_ms_jiffies(ctl, write,
                                                     buffer, lenp, ppos);
        else
                ret = -1;

        /* Only a successful write to a per-device entry needs follow-up. */
        if (!write || ret != 0 || !dev)
                return ret;

        idev = in6_dev_get(dev);
        if (!idev)
                return ret;

        if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME))
                idev->nd_parms->reachable_time =
                                neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME));
        WRITE_ONCE(idev->tstamp, jiffies);
        inet6_ifinfo_notify(RTM_NEWLINK, idev);
        in6_dev_put(idev);

        return ret;
}


#endif

/*
 * Per-namespace setup: create the raw ICMPv6 control socket used for
 * neighbour discovery and store it in net->ipv6.ndisc_sk.
 *
 * Returns 0 on success or the negative error from socket creation.
 */
static int __net_init ndisc_net_init(struct net *net)
{
        struct sock *sk;
        int err = inet_ctl_sock_create(&sk, PF_INET6,
                                       SOCK_RAW, IPPROTO_ICMPV6, net);

        if (err < 0) {
                ND_PRINTK(0, err,
                          "NDISC: Failed to initialize the control socket (err %d)\n",
                          err);
                return err;
        }

        net->ipv6.ndisc_sk = sk;

        /* ndisc_rcv() rejects any hop limit other than 255. */
        inet6_sk(sk)->hop_limit = 255;
        /* Do not loopback ndisc messages */
        inet6_clear_bit(MC6_LOOP, sk);

        return 0;
}

/*
 * Per-namespace teardown: destroy the control socket that
 * ndisc_net_init() stored in net->ipv6.ndisc_sk.
 */
static void __net_exit ndisc_net_exit(struct net *net)
{
        inet_ctl_sock_destroy(net->ipv6.ndisc_sk);
}

/*
 * Per-network-namespace hooks for neighbour discovery; registered by
 * ndisc_init() and unregistered by ndisc_cleanup().
 */
static struct pernet_operations ndisc_net_ops = {
        .init = ndisc_net_init,
        .exit = ndisc_net_exit,
};

/*
 * Set up neighbour discovery: register the per-namespace operations,
 * initialise the ND neighbour table and, with CONFIG_SYSCTL, register the
 * default neighbour sysctls.
 *
 * Returns 0 on success or a negative error.  On failure everything set up
 * so far is undone, in the same order ndisc_cleanup() uses, so no
 * initialised neighbour table is left behind when the caller sees an error.
 */
int __init ndisc_init(void)
{
        int err;

        err = register_pernet_subsys(&ndisc_net_ops);
        if (err)
                return err;
        /*
         * Initialize the neighbour table
         */
        neigh_table_init(NEIGH_ND_TABLE, &nd_tbl);

#ifdef CONFIG_SYSCTL
        err = neigh_sysctl_register(NULL, &nd_tbl.parms,
                                    ndisc_ifinfo_sysctl_change);
        if (err) {
                /*
                 * The table was already initialised above; clear it before
                 * dropping the pernet ops instead of leaving it set up
                 * after a failed init.
                 */
                neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl);
                unregister_pernet_subsys(&ndisc_net_ops);
        }
#endif
        return err;
}

/*
 * Register the neighbour discovery netdevice notifier.
 * Returns the result of register_netdevice_notifier().
 */
int __init ndisc_late_init(void)
{
        return register_netdevice_notifier(&ndisc_netdev_notifier);
}

/* Undo ndisc_late_init(): unregister the netdevice notifier. */
void ndisc_late_cleanup(void)
{
        unregister_netdevice_notifier(&ndisc_netdev_notifier);
}

/*
 * Tear down what ndisc_init() set up: the default neighbour sysctls (when
 * CONFIG_SYSCTL is enabled), the ND neighbour table, and the per-namespace
 * operations.
 */
void ndisc_cleanup(void)
{
#ifdef CONFIG_SYSCTL
        neigh_sysctl_unregister(&nd_tbl.parms);
#endif
        neigh_table_clear(NEIGH_ND_TABLE, &nd_tbl);
        unregister_pernet_subsys(&ndisc_net_ops);
}























   22 
























































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __KERNEL_PRINTK__
#define __KERNEL_PRINTK__

#include <linux/stdarg.h>
#include <linux/init.h>
#include <linux/kern_levels.h>
#include <linux/linkage.h>
#include <linux/ratelimit_types.h>
#include <linux/once_lite.h>

struct console;

extern const char linux_banner[];
extern const char linux_proc_banner[];

extern int oops_in_progress;        /* If set, an oops, panic(), BUG() or die() is in progress */

#define PRINTK_MAX_SINGLE_HEADER_LEN 2

/*
 * Return the level character of a printk header at the start of buffer,
 * or 0 if there is none.
 *
 * A header is KERN_SOH_ASCII followed by '0'..'7' or 'c' (KERN_CONT);
 * the second character is what gets returned.
 */
static inline int printk_get_level(const char *buffer)
{
        if (buffer[0] != KERN_SOH_ASCII || !buffer[1])
                return 0;

        switch (buffer[1]) {
        case '0' ... '7':
        case 'c':        /* KERN_CONT */
                return buffer[1];
        default:
                return 0;
        }
}

/*
 * Step over one leading printk level header, if present.
 * Returns buffer + 2 when a header is found, buffer unchanged otherwise.
 */
static inline const char *printk_skip_level(const char *buffer)
{
        return printk_get_level(buffer) ? buffer + 2 : buffer;
}

/*
 * Step over every consecutive printk level header at the start of buffer
 * and return a pointer to the first character after them.
 */
static inline const char *printk_skip_headers(const char *buffer)
{
        /* Each header is two characters long. */
        while (printk_get_level(buffer))
                buffer += 2;

        return buffer;
}

/* printk's without a loglevel use this.. */
#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT

/* We show everything that is MORE important than this.. */
#define CONSOLE_LOGLEVEL_SILENT  0 /* Mum's the word */
#define CONSOLE_LOGLEVEL_MIN         1 /* Minimum loglevel we let people use */
#define CONSOLE_LOGLEVEL_DEBUG        10 /* issue debug messages */
#define CONSOLE_LOGLEVEL_MOTORMOUTH 15        /* You can't shut this one up */

/*
 * Default used to be hard-coded at 7, quiet used to be hardcoded at 4,
 * we're now allowing both to be set from kernel config.
 */
#define CONSOLE_LOGLEVEL_DEFAULT CONFIG_CONSOLE_LOGLEVEL_DEFAULT
#define CONSOLE_LOGLEVEL_QUIET         CONFIG_CONSOLE_LOGLEVEL_QUIET

int match_devname_and_update_preferred_console(const char *match,
                                               const char *name,
                                               const short idx);

extern int console_printk[];

#define console_loglevel (console_printk[0])
#define default_message_loglevel (console_printk[1])
#define minimum_console_loglevel (console_printk[2])
#define default_console_loglevel (console_printk[3])

extern void console_verbose(void);

/* strlen("ratelimit") + 1 */
#define DEVKMSG_STR_MAX_SIZE 10
extern char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE];
struct ctl_table;

extern int suppress_printk;

/*
 * Bundles a format string with a pointer to the va_list holding its
 * arguments, so both can be passed around as a single object.
 */
struct va_format {
        const char *fmt;        /* format string */
        va_list *va;                /* pointer to the argument list for fmt */
};

/*
 * FW_BUG
 * Add this to a message where you are sure the firmware is buggy or behaves
 * really stupid or out of spec. Be aware that the responsible BIOS developer
 * should be able to fix this issue or at least get a concrete idea of the
 * problem by reading your message without the need of looking at the kernel
 * code.
 *
 * Use it for definite and high priority BIOS bugs.
 *
 * FW_WARN
 * Use it for less clear-cut cases (e.g. could the kernel have messed things
 * up already?) and medium priority BIOS bugs.
 *
 * FW_INFO
 * Use this one if you want to tell the user or vendor about something
 * suspicious, but generally harmless related to the firmware.
 *
 * Use it for information or very low priority BIOS bugs.
 */
#define FW_BUG                "[Firmware Bug]: "
#define FW_WARN                "[Firmware Warn]: "
#define FW_INFO                "[Firmware Info]: "

/*
 * HW_ERR
 * Add this to a message for hardware errors, so that user can report
 * it to hardware vendor instead of LKML or software vendor.
 */
#define HW_ERR                "[Hardware Error]: "

/*
 * DEPRECATED
 * Add this to a message whenever you want to warn user space about the use
 * of a deprecated aspect of an API so they can stop using it
 */
#define DEPRECATED        "[Deprecated]: "

/*
 * Dummy printk for disabled debugging statements to use whilst maintaining
 * gcc's format checking.
 *
 * The _printk() call sits in an "if (0)" branch, so it is never executed
 * but the compiler still sees the format string and arguments.  The whole
 * statement expression evaluates to 0.
 */
#define no_printk(fmt, ...)                                \
({                                                        \
        if (0)                                                \
                _printk(fmt, ##__VA_ARGS__);                \
        0;                                                \
})

/*
 * early_printk() is a real, externally defined function when
 * CONFIG_EARLY_PRINTK is set; otherwise it is an empty inline that keeps
 * the printf-style format attribute and does nothing.
 */
#ifdef CONFIG_EARLY_PRINTK
extern asmlinkage __printf(1, 2)
void early_printk(const char *fmt, ...);
#else
static inline __printf(1, 2) __cold
void early_printk(const char *s, ...) { }
#endif

struct dev_printk_info;

#ifdef CONFIG_PRINTK
asmlinkage __printf(4, 0)
int vprintk_emit(int facility, int level,
                 const struct dev_printk_info *dev_info,
                 const char *fmt, va_list args);

asmlinkage __printf(1, 0)
int vprintk(const char *fmt, va_list args);

asmlinkage __printf(1, 2) __cold
int _printk(const char *fmt, ...);

/*
 * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
 */
__printf(1, 2) __cold int _printk_deferred(const char *fmt, ...);

extern void __printk_deferred_enter(void);
extern void __printk_deferred_exit(void);

extern void printk_force_console_enter(void);
extern void printk_force_console_exit(void);

/*
 * The printk_deferred_enter/exit macros are available only as a hack for
 * some code paths that need to defer all printk console printing. Interrupts
 * must be disabled for the deferred duration.
 */
#define printk_deferred_enter() __printk_deferred_enter()
#define printk_deferred_exit() __printk_deferred_exit()

/*
 * Please don't use printk_ratelimit(), because it shares ratelimiting state
 * with all other unrelated printk_ratelimit() callsites.  Instead use
 * printk_ratelimited() or plain old __ratelimit().
 */
extern int __printk_ratelimit(const char *func);
#define printk_ratelimit() __printk_ratelimit(__func__)
extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                   unsigned int interval_msec);

extern int printk_delay_msec;
extern int dmesg_restrict;

extern void wake_up_klogd(void);

char *log_buf_addr_get(void);
u32 log_buf_len_get(void);
void log_buf_vmcoreinfo_setup(void);
void __init setup_log_buf(int early);
__printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...);
void dump_stack_print_info(const char *log_lvl);
void show_regs_print_info(const char *log_lvl);
extern asmlinkage void dump_stack_lvl(const char *log_lvl) __cold;
extern asmlinkage void dump_stack(void) __cold;
void printk_trigger_flush(void);
void console_try_replay_all(void);
void printk_legacy_allow_panic_sync(void);
extern bool nbcon_device_try_acquire(struct console *con);
extern void nbcon_device_release(struct console *con);
void nbcon_atomic_flush_unsafe(void);
bool pr_flush(int timeout_ms, bool reset_on_progress);
#else
/*
 * Stubs used when CONFIG_PRINTK is not set.  They keep the same signatures
 * as the declarations in the CONFIG_PRINTK branch so callers build
 * unchanged; each one either does nothing or returns a fixed value.
 */

/* The printing entry points report 0. */
static inline __printf(1, 0)
int vprintk(const char *s, va_list args)
{
        return 0;
}
static inline __printf(1, 2) __cold
int _printk(const char *s, ...)
{
        return 0;
}
static inline __printf(1, 2) __cold
int _printk_deferred(const char *s, ...)
{
        return 0;
}

/* Deferred-printing and forced-console sections are no-ops. */
static inline void printk_deferred_enter(void)
{
}

static inline void printk_deferred_exit(void)
{
}

static inline void printk_force_console_enter(void)
{
}

static inline void printk_force_console_exit(void)
{
}

/* Both rate-limit helpers always answer 0/false. */
static inline int printk_ratelimit(void)
{
        return 0;
}
static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                                          unsigned int interval_msec)
{
        return false;
}

static inline void wake_up_klogd(void)
{
}

/* Log buffer accessors: NULL address and zero length. */
static inline char *log_buf_addr_get(void)
{
        return NULL;
}

static inline u32 log_buf_len_get(void)
{
        return 0;
}

static inline void log_buf_vmcoreinfo_setup(void)
{
}

static inline void setup_log_buf(int early)
{
}

/* Stack and register dump helpers are no-ops. */
static inline __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...)
{
}

static inline void dump_stack_print_info(const char *log_lvl)
{
}

static inline void show_regs_print_info(const char *log_lvl)
{
}

static inline void dump_stack_lvl(const char *log_lvl)
{
}

static inline void dump_stack(void)
{
}
static inline void printk_trigger_flush(void)
{
}
static inline void console_try_replay_all(void)
{
}

static inline void printk_legacy_allow_panic_sync(void)
{
}

/* Acquiring an nbcon device always fails in this configuration. */
static inline bool nbcon_device_try_acquire(struct console *con)
{
        return false;
}

static inline void nbcon_device_release(struct console *con)
{
}

static inline void nbcon_atomic_flush_unsafe(void)
{
}

/* The flush stub reports true. */
static inline bool pr_flush(int timeout_ms, bool reset_on_progress)
{
        return true;
}

#endif

bool this_cpu_in_panic(void);

/*
 * Low-level primitives used by printk_cpu_sync_get_irqsave() and
 * printk_cpu_sync_put_irqrestore().
 *
 * With CONFIG_SMP they are real functions defined elsewhere. Without it they
 * are macros: the try-get always evaluates to true, and the wait and put
 * steps expand to nothing.
 */
#ifdef CONFIG_SMP
extern int __printk_cpu_sync_try_get(void);
extern void __printk_cpu_sync_wait(void);
extern void __printk_cpu_sync_put(void);

#else

#define __printk_cpu_sync_try_get() true
#define __printk_cpu_sync_wait()
#define __printk_cpu_sync_put()
#endif /* CONFIG_SMP */

/**
 * printk_cpu_sync_get_irqsave() - Disable interrupts and acquire the printk
 *                                 cpu-reentrant spinning lock.
 * @flags: Stack-allocated storage for saving local interrupt state,
 *         to be passed to printk_cpu_sync_put_irqrestore().
 *
 * If the lock is owned by another CPU, spin until it becomes available.
 * Interrupts are restored while spinning.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as a single
 * statement: it must be terminated with a semicolon and can safely be used as
 * the body of an unbraced if/else.
 *
 * CAUTION: This function must be used carefully. It does not behave like a
 * typical lock. Here are important things to watch out for...
 *
 *     * This function is reentrant on the same CPU. Therefore the calling
 *       code must not assume exclusive access to data if code accessing the
 *       data can run reentrant or within NMI context on the same CPU.
 *
 *     * If there exists usage of this function from NMI context, it becomes
 *       unsafe to perform any type of locking or spinning to wait for other
 *       CPUs after calling this function from any context. This includes
 *       using spinlocks or any other busy-waiting synchronization methods.
 */
#define printk_cpu_sync_get_irqsave(flags)                                \
        do {                                                                \
                for (;;) {                                                \
                        local_irq_save(flags);                                \
                        /* Got the lock: leave with interrupts disabled. */ \
                        if (__printk_cpu_sync_try_get())                \
                                break;                                        \
                        /* Contended: re-enable interrupts while waiting. */ \
                        local_irq_restore(flags);                        \
                        __printk_cpu_sync_wait();                        \
                }                                                        \
        } while (0)

/**
 * printk_cpu_sync_put_irqrestore() - Drop the printk cpu-reentrant spinning
 *                                    lock, then restore interrupts.
 * @flags: Interrupt state previously saved by printk_cpu_sync_get_irqsave().
 *
 * The lock is released first; the saved interrupt state is restored last.
 */
#define printk_cpu_sync_put_irqrestore(flags)                                \
        do { __printk_cpu_sync_put(); local_irq_restore(flags); } while (0)

extern int kptr_restrict;

/**
 * pr_fmt - build the format string that the pr_*() macros hand to printk
 * @fmt: format string supplied by the pr_*() caller
 *
 * The default provided here is the identity: the caller's format is used
 * unchanged. A source file may define its own pr_fmt() before this point to
 * give every pr_*() message in that file a common prefix, for example:
 *
 *        #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 *
 * which puts the module name in front of all pr_info, pr_emerg... messages
 * emitted from the file.
 */
#if !defined(pr_fmt)
#define pr_fmt(fmt) fmt
#endif

struct module;

#ifdef CONFIG_PRINTK_INDEX
/**
 * struct pi_entry - one printk index record, describing a single call site
 * @fmt: format string of the call site
 * @func: name of the enclosing function (filled from __func__)
 * @file: source file of the call site (filled from __FILE__)
 * @line: source line of the call site (filled from __LINE__)
 * @level: log level when supplied separately from @fmt, otherwise NULL
 * @subsys_fmt_prefix: prefix format added by a subsystem-specific wrapper
 *
 * Instances are created by __printk_index_emit(). The structure is declared
 * __packed, so the field order below is part of its layout.
 */
struct pi_entry {
        const char *fmt;
        const char *func;
        const char *file;
        unsigned int line;

        /*
         * While printk and pr_* have the level stored in the string at compile
         * time, some subsystems dynamically add it at runtime through the
         * format string. For these dynamic cases, we allow the subsystem to
         * tell us the level at compile time.
         *
         * NULL indicates that the level, if any, is stored in fmt.
         */
        const char *level;

        /*
         * The format string used by various subsystem specific printk()
         * wrappers to prefix the message.
         *
         * Note that the static prefix defined by the pr_fmt() macro is stored
         * directly in the message format (@fmt), not here.
         */
        const char *subsys_fmt_prefix;
} __packed;

/*
 * __printk_index_emit() - record a printk call site in the printk index.
 * @_fmt: format string of the call site
 * @_level: log level, or NULL when the level is embedded in @_fmt
 * @_subsys_fmt_prefix: subsystem prefix format, or NULL
 *
 * Only when both @_fmt and @_level are compile-time constants does this
 * define a static const struct pi_entry describing the call site (format,
 * __func__, __FILE__, __LINE__, level, prefix) together with a static pointer
 * to it that is placed in the ".printk_index" section. Otherwise the
 * expansion has no effect.
 */
#define __printk_index_emit(_fmt, _level, _subsys_fmt_prefix)                \
        do {                                                                \
                if (__builtin_constant_p(_fmt) && __builtin_constant_p(_level)) { \
                        /*
                         * We check __builtin_constant_p multiple times here
                         * for the same input because GCC will produce an error
                         * if we try to assign a static variable to fmt if it
                         * is not a constant, even with the outer if statement.
                         */                                                \
                        static const struct pi_entry _entry                \
                        __used = {                                        \
                                .fmt = __builtin_constant_p(_fmt) ? (_fmt) : NULL, \
                                .func = __func__,                        \
                                .file = __FILE__,                        \
                                .line = __LINE__,                        \
                                .level = __builtin_constant_p(_level) ? (_level) : NULL, \
                                .subsys_fmt_prefix = _subsys_fmt_prefix,\
                        };                                                \
                        static const struct pi_entry *_entry_ptr        \
                        __used __section(".printk_index") = &_entry;        \
                }                                                        \
        } while (0)

#else /* !CONFIG_PRINTK_INDEX */
/* Indexing disabled: an empty statement; the arguments are dropped unevaluated. */
#define __printk_index_emit(...) do {} while (0)
#endif /* CONFIG_PRINTK_INDEX */

/*
 * Some subsystems have their own custom printk that applies a va_format to a
 * generic format, for example, to include a device number or other metadata
 * alongside the format supplied by the caller.
 *
 * In order to store these in the way they would be emitted by the printk
 * infrastructure, the subsystem provides us with the start, fixed string, and
 * any subsequent text in the format string.
 *
 * We take a variable argument list as pr_fmt/dev_fmt/etc are sometimes passed
 * as multiple arguments (eg: `"%s: ", "blah"`), and we must only take the
 * first one. Everything after @fmt is therefore discarded; the three named
 * arguments are forwarded to __printk_index_emit() in (fmt, level,
 * subsys_fmt_prefix) order.
 *
 * subsys_fmt_prefix must be known at compile time, or compilation will fail
 * (since this is a mistake). If fmt or level is not known at compile time, no
 * index entry will be made (since this can legitimately happen).
 */
#define printk_index_subsys_emit(subsys_fmt_prefix, level, fmt, ...) \
        __printk_index_emit(fmt, level, subsys_fmt_prefix)

/*
 * printk_index_wrap() - index a call site, then perform the actual print.
 * @_p_func: printing function to call with the format and its arguments
 * @_fmt: format string
 *
 * A statement expression: the index entry is emitted first (with no separate
 * level and no subsystem prefix), then @_p_func is called. The value of the
 * whole expression is whatever @_p_func returns.
 */
#define printk_index_wrap(_p_func, _fmt, ...)                                \
        ({                                                                \
                __printk_index_emit(_fmt, NULL, NULL);                        \
                _p_func(_fmt, ##__VA_ARGS__);                                \
        })


/**
 * printk - print a kernel message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * This is printk(). It can be called from any context. We want it to work.
 *
 * printk() always goes through printk_index_wrap(), which calls _printk().
 * If printk indexing is disabled, the index step of that wrapper is an empty
 * statement and only the _printk() call remains.
 *
 * We try to grab the console_lock. If we succeed, it's easy - we log the
 * output and call the console drivers.  If we fail to get the semaphore, we
 * place the output into the log buffer and return. The current holder of
 * the console_sem will notice the new output in console_unlock(); and will
 * send it to the consoles before releasing the lock.
 *
 * One effect of this deferred printing is that code which calls printk() and
 * then changes console_loglevel may break. This is because console_loglevel
 * is inspected when the actual printing occurs.
 *
 * See also:
 * printf(3)
 *
 * See the vsnprintf() documentation for format string extensions over C99.
 */
#define printk(fmt, ...) printk_index_wrap(_printk, fmt, ##__VA_ARGS__)
/* Same wrapping as printk(), but the call is dispatched to _printk_deferred(). */
#define printk_deferred(fmt, ...)                                        \
        printk_index_wrap(_printk_deferred, fmt, ##__VA_ARGS__)

/**
 * pr_emerg - Print an emergency-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_EMERG loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_emerg(fmt, ...) printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_alert - Print an alert-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_ALERT loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_alert(fmt, ...) printk(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_crit - Print a critical-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_CRIT loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_crit(fmt, ...) printk(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_err - Print an error-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_ERR loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_warn - Print a warning-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_WARNING loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_warn(fmt, ...) printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_notice - Print a notice-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_NOTICE loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_notice(fmt, ...) printk(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
/**
 * pr_info - Print an info-level message
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_INFO loglevel placed in front of the
 * format string produced by pr_fmt().
 */
#define pr_info(fmt, ...) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)

/**
 * pr_cont - Continue a previous log message on the same line
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Expands to printk() with the KERN_CONT loglevel. Unlike the other pr_*()
 * macros, the format is passed as given, without going through pr_fmt().
 * Use it only to continue a log message that did not end in a newline
 * ('\n'); otherwise it defaults back to the KERN_DEFAULT loglevel.
 */
#define pr_cont(fmt, ...) printk(KERN_CONT fmt, ##__VA_ARGS__)

/**
 * pr_devel - Print a debug-level message, but only in DEBUG builds
 * @fmt: format string
 * @...: arguments for the format string
 *
 * When DEBUG is defined this expands to printk() at the KERN_DEBUG loglevel.
 * When it is not, the same arguments are handed to no_printk() instead.
 *
 * In both cases the format string is generated through pr_fmt().
 */
#ifndef DEBUG
#define pr_devel(fmt, ...) no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel(fmt, ...) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif


/* Driver authors: please use dev_dbg instead of pr_debug. */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#include <linux/dynamic_debug.h>

/**
 * pr_debug - Print a debug-level message conditionally
 * @fmt: format string
 * @...: arguments for the format string
 *
 * Three variants exist, selected at preprocessing time:
 *
 *  - with dynamic debug available, this expands to dynamic_pr_debug();
 *  - otherwise, with DEBUG defined, it is a printk() at KERN_DEBUG loglevel;
 *  - otherwise the arguments are handed to no_printk().
 *
 * The format string goes through pr_fmt() (dynamic_pr_debug() applies
 * pr_fmt() internally).
 */
#define pr_debug(fmt, ...) dynamic_pr_debug(fmt, ##__VA_ARGS__)
#elif defined(DEBUG)
#define pr_debug(fmt, ...) printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug(fmt, ...) no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * One-time messages (analogous to WARN_ONCE() et al).
 *
 * With CONFIG_PRINTK the print call is routed through DO_ONCE_LITE();
 * without it the arguments are simply handed to no_printk().
 */
#ifndef CONFIG_PRINTK
#define printk_once(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#else
#define printk_once(fmt, ...) DO_ONCE_LITE(printk, fmt, ##__VA_ARGS__)
#define printk_deferred_once(fmt, ...) \
        DO_ONCE_LITE(printk_deferred, fmt, ##__VA_ARGS__)
#endif

/*
 * One-time variants of the pr_<level>() macros: each one forwards to
 * printk_once() with its loglevel in front of the pr_fmt() format string.
 */
#define pr_emerg_once(fmt, ...) \
        printk_once(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_once(fmt, ...) \
        printk_once(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_once(fmt, ...) \
        printk_once(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_once(fmt, ...) \
        printk_once(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_once(fmt, ...) \
        printk_once(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_once(fmt, ...) \
        printk_once(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_once(fmt, ...) \
        printk_once(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_once, don't do that... */

/*
 * One-time pr_devel(): a KERN_DEBUG printk_once() when DEBUG is defined,
 * otherwise the arguments go to no_printk().
 */
#ifndef DEBUG
#define pr_devel_once(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_once(fmt, ...) \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * One-time pr_debug(): a KERN_DEBUG printk_once() when DEBUG is defined,
 * otherwise the arguments go to no_printk().
 *
 * If you are writing a driver, please use dev_dbg instead.
 */
#ifndef DEBUG
#define pr_debug_once(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_once(fmt, ...) \
        printk_once(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * printk_ratelimited() - printk() throttled by a per-call-site rate limit.
 *
 * With CONFIG_PRINTK, every expansion site gets its own static
 * ratelimit_state initialised with DEFAULT_RATELIMIT_INTERVAL and
 * DEFAULT_RATELIMIT_BURST, and printk() is only called when __ratelimit()
 * allows it.
 *
 * In the !PRINTK case no local ratelimit_state is created; the arguments are
 * handed to no_printk().
 */
#ifdef CONFIG_PRINTK
#define printk_ratelimited(fmt, ...)                                        \
({                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
                                                                        \
        if (__ratelimit(&_rs))                                                \
                printk(fmt, ##__VA_ARGS__);                                \
})
#else
#define printk_ratelimited(fmt, ...)                                        \
        no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Rate-limited variants of the pr_<level>() macros: each one forwards to
 * printk_ratelimited() with its loglevel in front of the pr_fmt() format
 * string.
 */
#define pr_emerg_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
#define pr_alert_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_crit_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
#define pr_err_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
#define pr_notice_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
#define pr_info_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
/* no pr_cont_ratelimited, don't do that... */

/*
 * Rate-limited pr_devel(): a KERN_DEBUG printk_ratelimited() when DEBUG is
 * defined, otherwise the arguments go to no_printk().
 */
#ifndef DEBUG
#define pr_devel_ratelimited(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_devel_ratelimited(fmt, ...) \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

/*
 * pr_debug_ratelimited() - rate-limited counterpart of pr_debug().
 *
 * Three variants, selected at preprocessing time:
 *  - dynamic debug available: each call site gets its own static ratelimit
 *    state and dynamic debug descriptor; the message is printed through
 *    __dynamic_pr_debug() only if the descriptor's branch is enabled and
 *    __ratelimit() allows it;
 *  - DEBUG defined: a KERN_DEBUG printk_ratelimited();
 *  - otherwise: the arguments are handed to no_printk().
 *
 * If you are writing a driver, please use dev_dbg instead.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
/* descriptor check is first to prevent flooding with "callbacks suppressed" */
#define pr_debug_ratelimited(fmt, ...)                                        \
do {                                                                        \
        static DEFINE_RATELIMIT_STATE(_rs,                                \
                                      DEFAULT_RATELIMIT_INTERVAL,        \
                                      DEFAULT_RATELIMIT_BURST);                \
        DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, pr_fmt(fmt));                \
        if (DYNAMIC_DEBUG_BRANCH(descriptor) &&                                \
            __ratelimit(&_rs))                                                \
                __dynamic_pr_debug(&descriptor, pr_fmt(fmt), ##__VA_ARGS__);        \
} while (0)
#elif defined(DEBUG)
#define pr_debug_ratelimited(fmt, ...)                                        \
        printk_ratelimited(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#else
#define pr_debug_ratelimited(fmt, ...) \
        no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
#endif

extern const struct file_operations kmsg_fops;

/*
 * Values for the prefix_type argument of the hex dump helpers: selects
 * whether no prefix, an address, or an offset is printed for each line.
 */
enum {
        DUMP_PREFIX_NONE        = 0,
        DUMP_PREFIX_ADDRESS     = 1,
        DUMP_PREFIX_OFFSET      = 2,
};
extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize,
                              int groupsize, char *linebuf, size_t linebuflen,
                              bool ascii);
#ifdef CONFIG_PRINTK
extern void print_hex_dump(const char *level, const char *prefix_str,
                           int prefix_type, int rowsize, int groupsize,
                           const void *buf, size_t len, bool ascii);
#else
/* !CONFIG_PRINTK stub: all arguments are ignored and nothing is printed. */
static inline void print_hex_dump(const char *level, const char *prefix_str,
                                  int prefix_type, int rowsize, int groupsize,
                                  const void *buf, size_t len, bool ascii)
{
}
/*
 * !CONFIG_PRINTK stub: all arguments are ignored and nothing is printed.
 * Note that a function-like macro of the same name is defined further down
 * in this header, so calls written after that point expand to the macro.
 */
static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
                                        const void *buf, size_t len)
{
}

#endif

/*
 * print_hex_dump_debug() - debug-only hex dump.
 *
 * Three variants, selected at preprocessing time:
 *  - dynamic debug available: forwards to dynamic_hex_dump();
 *  - DEBUG defined: forwards to print_hex_dump() at KERN_DEBUG loglevel;
 *  - otherwise: an empty inline function, so nothing is printed.
 */
#if defined(CONFIG_DYNAMIC_DEBUG) || \
        (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE))
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,        \
                             groupsize, buf, len, ascii)        \
        dynamic_hex_dump(prefix_str, prefix_type, rowsize,        \
                         groupsize, buf, len, ascii)
#elif defined(DEBUG)
#define print_hex_dump_debug(prefix_str, prefix_type, rowsize,                \
                             groupsize, buf, len, ascii)                \
        print_hex_dump(KERN_DEBUG, prefix_str, prefix_type, rowsize,        \
                       groupsize, buf, len, ascii)
#else
static inline void print_hex_dump_debug(const char *prefix_str, int prefix_type,
                                        int rowsize, int groupsize,
                                        const void *buf, size_t len, bool ascii)
{
}
#endif

/**
 * print_hex_dump_bytes - shorthand form of print_hex_dump_debug() with
 *  default params
 * @prefix_str: string to prefix each line with;
 *  caller supplies trailing spaces for alignment if desired
 * @prefix_type: controls whether prefix of an offset, address, or none
 *  is printed (%DUMP_PREFIX_OFFSET, %DUMP_PREFIX_ADDRESS, %DUMP_PREFIX_NONE)
 * @buf: data blob to dump
 * @len: number of bytes in the @buf
 *
 * Expands to print_hex_dump_debug() with rowsize of 16, groupsize of 1, and
 * ASCII output included. Depending on which print_hex_dump_debug() variant
 * is selected above, that ends up in dynamic_hex_dump(), in print_hex_dump()
 * with log level of KERN_DEBUG, or in an empty inline function.
 */
#define print_hex_dump_bytes(prefix_str, prefix_type, buf, len)        \
        print_hex_dump_debug(prefix_str, prefix_type, 16, 1, buf, len, true)

#endif





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   22 







   22 
   22 








   22 














































































   22 








   22 















   22 





   22 



















































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2022 Linutronix GmbH, John Ogness
// Copyright (C) 2022 Intel, Thomas Gleixner

#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/console.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/irqflags.h>
#include <linux/kthread.h>
#include <linux/minmax.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
#include "internal.h"
#include "printk_ringbuffer.h"
/*
 * Printk console printing implementation for consoles which do not depend
 * on the legacy style console_lock mechanism.
 *
 * The state of the console is maintained in the "nbcon_state" atomic
 * variable.
 *
 * The console is locked when:
 *
 *   - The 'prio' field contains the priority of the context that owns the
 *     console. Only higher priority contexts are allowed to take over the
 *     lock. A value of 0 (NBCON_PRIO_NONE) means the console is not locked.
 *
 *   - The 'cpu' field denotes on which CPU the console is locked. It is used
 *     to prevent busy waiting on the same CPU. Also it informs the lock owner
 *     that it has lost the lock in a more complex scenario when the lock was
 *     taken over by a higher priority context, released, and taken on another
 *     CPU with the same priority as the interrupted owner.
 *
 * The acquire mechanism uses a few more fields:
 *
 *   - The 'req_prio' field is used by the handover approach to make the
 *     current owner aware that there is a context with a higher priority
 *     waiting for the friendly handover.
 *
 *   - The 'unsafe' field allows to take over the console in a safe way in the
 *     middle of emitting a message. The field is set only when accessing some
 *     shared resources or when the console device is manipulated. It can be
 *     cleared, for example, after emitting one character when the console
 *     device is in a consistent state.
 *
 *   - The 'unsafe_takeover' field is set when a hostile takeover took the
 *     console in an unsafe state. The console will stay in the unsafe state
 *     until re-initialized.
 *
 * The acquire mechanism uses three approaches:
 *
 *   1) Direct acquire when the console is not owned or is owned by a lower
 *      priority context and is in a safe state.
 *
 *   2) Friendly handover mechanism uses a request/grant handshake. It is used
 *      when the current owner has lower priority and the console is in an
 *      unsafe state.
 *
 *      The requesting context:
 *
 *        a) Sets its priority into the 'req_prio' field.
 *
 *        b) Waits (with a timeout) for the owning context to unlock the
 *           console.
 *
 *        c) Takes the lock and clears the 'req_prio' field.
 *
 *      The owning context:
 *
 *        a) Observes the 'req_prio' field set on exit from the unsafe
 *           console state.
 *
 *        b) Gives up console ownership by clearing the 'prio' field.
 *
 *   3) Unsafe hostile takeover allows to take over the lock even when the
 *      console is in an unsafe state. It is used only in panic() by the final
 *      attempt to flush consoles in a try and hope mode.
 *
 *      Note that separate record buffers are used in panic(). As a result,
 *      the messages can be read and formatted without any risk even after
 *      using the hostile takeover in unsafe state.
 *
 * The release function simply clears the 'prio' field.
 *
 * All operations on @console::nbcon_state are atomic cmpxchg based to
 * handle concurrency.
 *
 * The acquire/release functions implement only minimal policies:
 *
 *   - Preference for higher priority contexts.
 *   - Protection of the panic CPU.
 *
 * All other policy decisions must be made at the call sites:
 *
 *   - What is marked as an unsafe section.
 *   - Whether to spin-wait if there is already an owner and the console is
 *     in an unsafe state.
 *   - Whether to attempt an unsafe hostile takeover.
 *
 * The design allows to implement the well known:
 *
 *     acquire()
 *     output_one_printk_record()
 *     release()
 *
 * The output of one printk record might be interrupted with a higher priority
 * context. The new owner is supposed to reprint the entire interrupted record
 * from scratch.
 */

/**
 * nbcon_state_set - Helper function to set the console state
 * @con:        Console to update
 * @new:        The new state to write
 *
 * Only to be used when the console is not yet or no longer visible in the
 * system. Otherwise use nbcon_state_try_cmpxchg().
 */
static inline void nbcon_state_set(struct console *con, struct nbcon_state *new)
{
        atomic_set(&ACCESS_PRIVATE(con, nbcon_state), new->atom);
}

/**
 * nbcon_state_read - Helper function to read the console state
 * @con:        Console to read
 * @state:        The state to store the result
 */
static inline void nbcon_state_read(struct console *con, struct nbcon_state *state)
{
        state->atom = atomic_read(&ACCESS_PRIVATE(con, nbcon_state));
}

/**
 * nbcon_state_try_cmpxchg() - Helper function for atomic_try_cmpxchg() on console state
 * @con:        Console to update
 * @cur:        Old/expected state
 * @new:        New state
 *
 * Return: True on success. False on fail and @cur is updated.
 */
static inline bool nbcon_state_try_cmpxchg(struct console *con, struct nbcon_state *cur,
                                           struct nbcon_state *new)
{
        return atomic_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_state), &cur->atom, new->atom);
}

/**
 * nbcon_seq_read - Read the current console sequence
 * @con:        Console to read the sequence of
 *
 * Return:        Sequence number of the next record to print on @con.
 */
u64 nbcon_seq_read(struct console *con)
{
        unsigned long nbcon_seq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_seq));

        return __ulseq_to_u64seq(prb, nbcon_seq);
}

/**
 * nbcon_seq_force - Force console sequence to a specific value
 * @con:        Console to work on
 * @seq:        Sequence number value to set
 *
 * The value actually stored is never older than the first valid record in
 * the ringbuffer.
 *
 * Only to be used during init (before registration) or in extreme situations
 * (such as panic with CONSOLE_REPLAY_ALL).
 */
void nbcon_seq_force(struct console *con, u64 seq)
{
        u64 oldest_seq = prb_first_valid_seq(prb);
        u64 valid_seq = seq;

        /*
         * Fall back to the oldest available record when the requested one
         * no longer exists. This matters most on 32bit systems: only the
         * lower 32 bits of the sequence number are stored and the upper
         * 32 bits are derived from the sequence numbers that are available
         * in the ringbuffer.
         */
        if (valid_seq < oldest_seq)
                valid_seq = oldest_seq;

        atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), __u64seq_to_ulseq(valid_seq));
}

/**
 * nbcon_seq_try_update - Try to update the console sequence number
 * @ctxt:        Pointer to an acquire context that contains
 *                all information about the acquire mode
 * @new_seq:        The new sequence number to set
 *
 * The stored sequence is only replaced when it still equals @ctxt->seq.
 * Afterwards @ctxt->seq holds the current value of @con::nbcon_seq
 * (expanded to the 64bit value). That may differ from @new_seq if
 * nbcon_seq_force() was used or the current context no longer owns the
 * console. In the latter case, it will stop printing anyway.
 */
static void nbcon_seq_try_update(struct nbcon_context *ctxt, u64 new_seq)
{
        struct console *con = ctxt->console;
        unsigned long expected = __u64seq_to_ulseq(ctxt->seq);
        unsigned long wanted = __u64seq_to_ulseq(new_seq);

        if (!atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_seq), &expected, wanted)) {
                /* Somebody else changed the sequence; adopt what is stored. */
                ctxt->seq = nbcon_seq_read(con);
                return;
        }

        ctxt->seq = new_seq;
}

/**
 * nbcon_context_try_acquire_direct - Try to acquire directly
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * Take the console when nobody owns it, or when the owner has a lower
 * priority and the console is in a safe state.
 *
 * Return:        0 on success. Otherwise, an error code on failure. Whenever
 *                the compare-and-exchange fails, @cur is refreshed and the
 *                checks are repeated against the refreshed state.
 *
 * Errors:
 *
 *        -EPERM:                A panic is in progress and this is not the panic CPU.
 *                        Or the current owner or waiter has the same or higher
 *                        priority. No acquire method can be successful in
 *                        this case.
 *
 *        -EBUSY:                The current owner has a lower priority but the console
 *                        is in an unsafe state. The caller should try using
 *                        the handover acquire method.
 */
static int nbcon_context_try_acquire_direct(struct nbcon_context *ctxt,
                                            struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state new;

        for (;;) {
                /*
                 * A panic does not mean the console is owned. Still, CPUs
                 * other than the panic CPU must never become owner while a
                 * panic is in progress. nbcon_waiter_matches() relies on
                 * this, namely on lower priorities being ignored during
                 * panic.
                 */
                if (other_cpu_in_panic())
                        return -EPERM;

                /* Neither the owner nor a waiter may be outranked or tied. */
                if (cur->prio >= ctxt->prio || cur->req_prio >= ctxt->prio)
                        return -EPERM;

                if (cur->unsafe)
                        return -EBUSY;

                /*
                 * Once an unsafe hostile takeover has happened, the console
                 * must never appear safe for a direct acquire again.
                 */
                WARN_ON_ONCE(cur->unsafe_takeover);

                /* Build the owned state on top of the observed one. */
                new.atom = cur->atom;
                new.cpu = cpu;
                new.prio = ctxt->prio;
                new.req_prio = NBCON_PRIO_NONE;
                new.unsafe = cur->unsafe_takeover;

                if (nbcon_state_try_cmpxchg(con, cur, &new))
                        break;
        }

        return 0;
}

static bool nbcon_waiter_matches(struct nbcon_state *cur, int expected_prio)
{
        /*
         * The request context is well defined by the @req_prio because:
         *
         * - Only a context with a priority higher than the owner can become
         *   a waiter.
         * - Only a context with a priority higher than the waiter can
         *   directly take over the request.
         * - There are only three priorities.
         * - Only one CPU is allowed to request PANIC priority.
         * - Lower priorities are ignored during panic() until reboot.
         *
         * As a result, the following scenario is *not* possible:
         *
         * 1. This context is currently a waiter.
         * 2. Another context with a higher priority than this context
         *    directly takes ownership.
         * 3. The higher priority context releases the ownership.
         * 4. Another lower priority context takes the ownership.
         * 5. Another context with the same priority as this context
         *    creates a request and starts waiting.
         *
         * Event #1 implies this context is EMERGENCY.
         * Event #2 implies the new context is PANIC.
         * Event #3 occurs when panic() has flushed the console.
         * Events #4 and #5 are not possible due to the other_cpu_in_panic()
         * check in nbcon_context_try_acquire_direct().
         */

        return (cur->req_prio == expected_prio);
}

/**
 * nbcon_context_try_acquire_requested - Try to acquire after having
 *                                         requested a handover
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * Helper for nbcon_context_try_acquire_handover(). It is called while the
 * console is in an unsafe state; the current owner is expected to release
 * the console when it leaves the unsafe region.
 *
 * Return:        0 on success. Otherwise an error code on failure. @cur is
 *                only refreshed when the compare-and-exchange fails.
 *
 * Errors:
 *
 *        -EPERM:                A panic is in progress and this is not the panic CPU
 *                        or this context is no longer the waiter.
 *
 *        -EBUSY:                The console is still locked. The caller should
 *                        continue waiting.
 *
 * Note: The caller must still remove the request when an error has occurred
 *       except when this context is no longer the waiter.
 */
static int nbcon_context_try_acquire_requested(struct nbcon_context *ctxt,
                                               struct nbcon_state *cur)
{
        struct console *con = ctxt->console;
        unsigned int cpu = smp_processor_id();
        struct nbcon_state new;

        /*
         * Bail out during a panic on another CPU (the caller still has to
         * remove the request!) and when somebody else became the waiter.
         * The waiter also changes after an unsafe hostile takeover.
         */
        if (other_cpu_in_panic() || !nbcon_waiter_matches(cur, ctxt->prio))
                return -EPERM;

        /* Still owned by somebody: the caller should keep waiting. */
        if (cur->prio != NBCON_PRIO_NONE)
                return -EBUSY;

        /*
         * Ownership should never have been released from within an
         * unsafe region.
         */
        WARN_ON_ONCE(cur->unsafe);

        new.atom = cur->atom;
        new.cpu = cpu;
        new.prio = ctxt->prio;
        new.req_prio = NBCON_PRIO_NONE;
        new.unsafe = cur->unsafe_takeover;

        /* Handover success. This context now owns the console. */
        if (nbcon_state_try_cmpxchg(con, cur, &new))
                return 0;

        /*
         * Losing the race is only expected when a higher priority context
         * has taken over, in which case this context is no longer the
         * waiter.
         */
        WARN_ON_ONCE(nbcon_waiter_matches(cur, ctxt->prio));
        return -EPERM;
}

/**
 * nbcon_context_try_acquire_handover - Try to acquire via handover
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * The function must be called only when the context has higher priority
 * than the current owner and the console is in an unsafe state.
 * It is the case when nbcon_context_try_acquire_direct() returns -EBUSY.
 *
 * The function sets "req_prio" field to make the current owner aware of
 * the request. Then it waits until the current owner releases the console,
 * or an even higher context takes over the request, or timeout expires.
 * The wait polls the state with udelay(1) between attempts, so
 * @ctxt->spinwait_max_us bounds the number of retries.
 *
 * The current owner checks the "req_prio" field on exit from the unsafe
 * region and releases the console. It does not touch the "req_prio" field
 * so that the console stays reserved for the waiter.
 *
 * Return:        0 on success. Otherwise, an error code on failure. Also @cur
 *                is updated to the latest state when failed to modify it.
 *
 * Errors:
 *
 *        -EPERM:                A panic is in progress and this is not the panic CPU.
 *                        Or a higher priority context has taken over the
 *                        console or the handover request.
 *
 *        -EBUSY:                The current owner is on the same CPU so that the hand
 *                        shake could not work. Or an unsafe hostile takeover
 *                        has already happened. Or the caller is not willing
 *                        to wait (zero @ctxt->spinwait_max_us). Or the console
 *                        does not enter the safe state before timeout passed.
 *                        The caller might still use the unsafe hostile
 *                        takeover when allowed.
 *
 *        -EAGAIN:        @cur has changed when creating the handover request.
 *                        The caller should retry with direct acquire.
 */
static int nbcon_context_try_acquire_handover(struct nbcon_context *ctxt,
                                              struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state new;
        int timeout;
        /* Result of the last acquire attempt; reported if the wait fails. */
        int request_err = -EBUSY;

        /*
         * Check that the handover is called when the direct acquire failed
         * with -EBUSY.
         */
        WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
        WARN_ON_ONCE(!cur->unsafe);

        /* Handover is not possible on the same CPU. */
        if (cur->cpu == cpu)
                return -EBUSY;

        /*
         * Console stays unsafe after an unsafe takeover until re-initialized.
         * Waiting is not going to help in this case.
         */
        if (cur->unsafe_takeover)
                return -EBUSY;

        /* Is the caller willing to wait? */
        if (ctxt->spinwait_max_us == 0)
                return -EBUSY;

        /*
         * Setup a request for the handover. The caller should try to acquire
         * the console directly when the current state has been modified.
         */
        new.atom = cur->atom;
        new.req_prio = ctxt->prio;
        if (!nbcon_state_try_cmpxchg(con, cur, &new))
                return -EAGAIN;

        /*
         * The cmpxchg only refreshes @cur on failure. On success, mirror the
         * freshly installed state (now carrying this request) into @cur.
         */
        cur->atom = new.atom;

        /* Wait until there is no owner and then acquire the console. */
        for (timeout = ctxt->spinwait_max_us; timeout >= 0; timeout--) {
                /* On successful acquire, this request is cleared. */
                request_err = nbcon_context_try_acquire_requested(ctxt, cur);
                if (!request_err)
                        return 0;

                /*
                 * If the acquire should be aborted, it must be ensured
                 * that the request is removed before returning to caller.
                 */
                if (request_err == -EPERM)
                        break;

                udelay(1);

                /* Re-read the state because some time has passed. */
                nbcon_state_read(con, cur);
        }

        /* Timed out or aborted. Carefully remove handover request. */
        do {
                /*
                 * No need to remove request if there is a new waiter. This
                 * can only happen if a higher priority context has taken over
                 * the console or the handover request.
                 */
                if (!nbcon_waiter_matches(cur, ctxt->prio))
                        return -EPERM;

                /* Unset request for handover. */
                new.atom = cur->atom;
                new.req_prio = NBCON_PRIO_NONE;
                if (nbcon_state_try_cmpxchg(con, cur, &new)) {
                        /*
                         * Request successfully unset. Report failure of
                         * acquiring via handover.
                         */
                        cur->atom = new.atom;
                        return request_err;
                }

                /*
                 * Unable to remove request. Try to acquire in case
                 * the owner has released the lock. A non-zero result
                 * repeats the removal attempt with the refreshed @cur.
                 */
        } while (nbcon_context_try_acquire_requested(ctxt, cur));

        /* Lucky timing. The acquire succeeded while removing the request. */
        return 0;
}

/**
 * nbcon_context_try_acquire_hostile - Acquire via unsafe hostile takeover
 * @ctxt:        The context of the caller
 * @cur:        The current console state
 *
 * Take the console regardless of the unsafe state. When the console was
 * unsafe at the time of the takeover, 'unsafe_takeover' gets set in the
 * new state.
 *
 * The takeover must be permitted through the 'allow_unsafe_takeover' field,
 * which is meant to be set only by the final attempt to flush messages in
 * panic().
 *
 * Return:        0 on success. -EPERM when not allowed by the context.
 */
static int nbcon_context_try_acquire_hostile(struct nbcon_context *ctxt,
                                             struct nbcon_state *cur)
{
        struct console *con = ctxt->console;
        unsigned int cpu = smp_processor_id();
        struct nbcon_state new;

        if (!ctxt->allow_unsafe_takeover)
                return -EPERM;

        /* Only a PANIC priority context may do unsafe hostile takeovers. */
        if (WARN_ON_ONCE(ctxt->prio != NBCON_PRIO_PANIC))
                return -EPERM;

        /*
         * Sanity check: try_acquire_direct() and try_acquire_handover()
         * must have returned -EBUSY in the right situation.
         */
        WARN_ON_ONCE(ctxt->prio <= cur->prio || ctxt->prio <= cur->req_prio);
        WARN_ON_ONCE(cur->unsafe != true);

        /* Retry against the refreshed @cur until the state is installed. */
        for (;;) {
                new.atom = cur->atom;
                new.prio = ctxt->prio;
                new.cpu = cpu;
                new.unsafe |= cur->unsafe_takeover;
                new.unsafe_takeover |= cur->unsafe;

                if (nbcon_state_try_cmpxchg(con, cur, &new))
                        break;
        }

        return 0;
}

/*
 * Buffers assigned to a context by nbcon_context_try_acquire() when the
 * acquiring CPU is the one recorded in panic_cpu. Every other context gets
 * the console's own pbufs.
 */
static struct printk_buffers panic_nbcon_pbufs;

/**
 * nbcon_context_try_acquire - Try to acquire nbcon console
 * @ctxt:        The context of the caller
 *
 * The acquire methods are tried in this order: direct acquire, friendly
 * handover, unsafe hostile takeover. A later method is only attempted when
 * the previous one reported -EBUSY. The whole sequence is restarted when
 * the handover reports -EAGAIN.
 *
 * On success, @ctxt->pbufs and @ctxt->seq are set up for printing.
 *
 * Context:        Under @ctxt->con->device_lock() or local_irq_save().
 * Return:        True if the console was acquired. False otherwise.
 *
 * If the caller allowed an unsafe hostile takeover, on success the
 * caller should check the current console state to see if it is
 * in an unsafe state. Otherwise, on success the caller may assume
 * the console is not in an unsafe state.
 */
static bool nbcon_context_try_acquire(struct nbcon_context *ctxt)
{
        unsigned int cpu = smp_processor_id();
        struct console *con = ctxt->console;
        struct nbcon_state cur;
        int err;

        nbcon_state_read(con, &cur);

        for (;;) {
                err = nbcon_context_try_acquire_direct(ctxt, &cur);
                if (err != -EBUSY)
                        break;

                err = nbcon_context_try_acquire_handover(ctxt, &cur);
                if (err == -EAGAIN)
                        continue;

                /* The hostile takeover is the last resort. */
                if (err == -EBUSY)
                        err = nbcon_context_try_acquire_hostile(ctxt, &cur);
                break;
        }

        if (err)
                return false;

        /*
         * Acquire succeeded. Pick the buffers for this context: the panic
         * CPU gets its own set, everybody else uses the console's buffers.
         */
        if (atomic_read(&panic_cpu) == cpu)
                ctxt->pbufs = &panic_nbcon_pbufs;
        else
                ctxt->pbufs = con->pbufs;

        /* Printing starts at the record the console expects next. */
        ctxt->seq = nbcon_seq_read(ctxt->console);

        return true;
}

/**
 * nbcon_owner_matches - Check whether the console owner is the expected one
 * @cur:                The current console state
 * @expected_cpu:        CPU the owner is expected to run on
 * @expected_prio:        Priority the owner is expected to have
 *
 * Return:        True if both the 'prio' and the 'cpu' field of @cur match
 *                the expected values. False otherwise.
 */
static bool nbcon_owner_matches(struct nbcon_state *cur, int expected_cpu,
                                int expected_prio)
{
        /*
         * The similar helper nbcon_waiter_matches() only has to handle the
         * EMERGENCY and PANIC priorities. Here the NORMAL priority must be
         * handled as well, which brings additional checks and constraints.
         *
         * With preemption and interrupts disabled, it is sufficient to
         * additionally verify that the owning CPU is unchanged.
         *
         * With preemption or interrupts enabled, an external synchronization
         * method *must* be used. In particular, the driver-specific locking
         * mechanism used in device_lock() (including disabling migration)
         * should be used. It rules out scenarios such as:
         *
         * 1. [Task A] owns a context with NBCON_PRIO_NORMAL on [CPU X] and
         *    is scheduled out.
         * 2. Another context takes over the lock with NBCON_PRIO_EMERGENCY
         *    and releases it.
         * 3. [Task B] acquires a context with NBCON_PRIO_NORMAL on [CPU X]
         *    and is scheduled out.
         * 4. [Task A] gets running on [CPU X] and sees that the console is
         *    still owned by a task on [CPU X] with NBCON_PRIO_NORMAL. Thus
         *    [Task A] thinks it is the owner when it is not.
         */
        return cur->prio == expected_prio && cur->cpu == expected_cpu;
}

/**
 * nbcon_context_release - Release the console
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 *
 * The 'prio' field of the console state is cleared, provided this context
 * (same CPU, same priority) is still the owner. If it is not, the state is
 * left alone. In both cases @ctxt->pbufs is reset to NULL.
 */
static void nbcon_context_release(struct nbcon_context *ctxt)
{
        struct console *con = ctxt->console;
        unsigned int cpu = smp_processor_id();
        struct nbcon_state cur;
        struct nbcon_state new;

        nbcon_state_read(con, &cur);

        /* Nothing to give up once somebody else has become the owner. */
        while (nbcon_owner_matches(&cur, cpu, ctxt->prio)) {
                new.atom = cur.atom;
                new.prio = NBCON_PRIO_NONE;

                /*
                 * A set @unsafe_takeover forces @unsafe to stay set, so
                 * the state remains permanently unsafe.
                 */
                new.unsafe |= cur.unsafe_takeover;

                if (nbcon_state_try_cmpxchg(con, &cur, &new))
                        break;
        }

        ctxt->pbufs = NULL;
}

/**
 * nbcon_context_can_proceed - Check whether ownership can proceed
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 * @cur:        The current console state
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * This must be called on entry to the unsafe state, to verify the lock is
 * still owned, and on exit from the unsafe state, so that the lock can be
 * freed for a higher priority context that requested a friendly handover.
 *
 * It may also be called within an unsafe section while the console is only
 * temporarily in a safe state, as an alternative to exiting and re-entering
 * the unsafe state.
 *
 * Finally, it may be called in safe context ahead of an expensive safe
 * operation, which would be pointless if a higher priority context has
 * taken the lock.
 *
 * A false return means the calling context no longer owns the console and
 * must not go forward. It has to back out immediately and carefully. The
 * buffer content must not be trusted any longer since it no longer belongs
 * to the calling context.
 */
static bool nbcon_context_can_proceed(struct nbcon_context *ctxt, struct nbcon_state *cur)
{
        unsigned int cpu = smp_processor_id();

        /* Ownership has been lost already. */
        if (!nbcon_owner_matches(cur, cpu, ctxt->prio))
                return false;

        /*
         * The owner proceeds when nobody is waiting. It also proceeds,
         * despite waiters, while inside an unsafe region: the handover can
         * be done on exit from that region. Otherwise the waiter would have
         * to perform an unsafe hostile takeover.
         */
        if (cur->req_prio == NBCON_PRIO_NONE || cur->unsafe)
                return true;

        /* Waiters always have higher priorities than owners. */
        WARN_ON_ONCE(cur->req_prio <= cur->prio);

        /*
         * Release and hand over. A safe takeover point, at the price of a
         * few duplicated characters or even a full line, is far better than
         * a hostile takeover. Post processing can take care of the garbage.
         */
        nbcon_context_release(ctxt);

        /*
         * Whether the waiter really took over is unknown at this point. The
         * outermost callsite has to decide whether it needs console
         * ownership to proceed. If so, it must reacquire ownership (possibly
         * hostile) before carefully proceeding.
         *
         * As this context no longer owns the console, go back all the way
         * rather than implementing reacquire heuristics in tons of places.
         */
        return false;
}

/**
 * nbcon_can_proceed - Check whether ownership can proceed
 * @wctxt:        The write context that was handed to the write function
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * It is used in nbcon_enter_unsafe() to make sure that it still owns the
 * lock. Also it is used in nbcon_exit_unsafe() to eventually free the lock
 * for a higher priority context which asked for the friendly handover.
 *
 * It can be called inside an unsafe section when the console is just
 * temporary in safe state instead of exiting and entering the unsafe state.
 *
 * Also it can be called in the safe context before doing an expensive safe
 * operation. It does not make sense to do the operation when a higher
 * priority context took the lock.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
 */
bool nbcon_can_proceed(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct console *con = ctxt->console;
        struct nbcon_state cur;

        nbcon_state_read(con, &cur);

        return nbcon_context_can_proceed(ctxt, &cur);
}
EXPORT_SYMBOL_GPL(nbcon_can_proceed);

/*
 * Convenience wrappers around __nbcon_context_update_unsafe() that set
 * (enter) or clear (exit) the unsafe bit for the context @c. Both evaluate
 * to that helper's boolean result.
 */
#define nbcon_context_enter_unsafe(c)        __nbcon_context_update_unsafe(c, true)
#define nbcon_context_exit_unsafe(c)        __nbcon_context_update_unsafe(c, false)

/**
 * __nbcon_context_update_unsafe - Update the unsafe bit in @con->nbcon_state
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 * @unsafe:        The new value for the unsafe bit
 *
 * Return:        True if the unsafe state was updated and this context still
 *                owns the console. Otherwise false if ownership was handed
 *                over or taken.
 *
 * Console owners use this to change the unsafe status of the console. The
 * bit is left untouched when it should be cleared although an unsafe
 * hostile takeover has occurred.
 *
 * A false return means the calling context no longer owns the console and
 * must not go forward. It has to back out immediately and carefully. The
 * buffer content must not be trusted any longer since it no longer belongs
 * to the calling context.
 *
 * Internal helper to avoid duplicated code.
 */
static bool __nbcon_context_update_unsafe(struct nbcon_context *ctxt, bool unsafe)
{
        struct console *con = ctxt->console;
        struct nbcon_state cur;
        struct nbcon_state new;

        nbcon_state_read(con, &cur);

        for (;;) {
                /*
                 * After an unsafe hostile takeover the unsafe bit
                 * must never be cleared again.
                 */
                if (!unsafe && cur.unsafe_takeover)
                        break;

                if (!nbcon_context_can_proceed(ctxt, &cur))
                        return false;

                new.atom = cur.atom;
                new.unsafe = unsafe;

                if (nbcon_state_try_cmpxchg(con, &cur, &new)) {
                        /* @cur is not refreshed on success; do it by hand. */
                        cur.atom = new.atom;
                        break;
                }
        }

        /* Final check against the state this context has last seen. */
        return nbcon_context_can_proceed(ctxt, &cur);
}

static void nbcon_write_context_set_buf(struct nbcon_write_context *wctxt,
                                        char *buf, unsigned int len)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct console *con = ctxt->console;
        struct nbcon_state cur;

        wctxt->outbuf = buf;
        wctxt->len = len;
        nbcon_state_read(con, &cur);
        wctxt->unsafe_takeover = cur.unsafe_takeover;
}

/**
 * nbcon_enter_unsafe - Enter an unsafe region in the driver
 * @wctxt:        The write context that was handed to the write function
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
 */
bool nbcon_enter_unsafe(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        bool is_owner;

        is_owner = nbcon_context_enter_unsafe(ctxt);
        if (!is_owner)
                nbcon_write_context_set_buf(wctxt, NULL, 0);
        return is_owner;
}
EXPORT_SYMBOL_GPL(nbcon_enter_unsafe);

/**
 * nbcon_exit_unsafe - Exit an unsafe region in the driver
 * @wctxt:        The write context that was handed to the write function
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context.
 */
bool nbcon_exit_unsafe(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        bool ret;

        ret = nbcon_context_exit_unsafe(ctxt);
        if (!ret)
                nbcon_write_context_set_buf(wctxt, NULL, 0);
        return ret;
}
EXPORT_SYMBOL_GPL(nbcon_exit_unsafe);

/**
 * nbcon_reacquire_nobuf - Get console ownership back after it was lost
 *                                while printing
 * @wctxt:        The write context that was handed to the write callback
 *
 * Ownership can disappear at any moment through handover or takeover, so
 * a printing context _must_ always be ready to back out immediately and
 * carefully. Some drivers nevertheless need to own the console again in
 * order to finalize or revert changes they made to the hardware.
 *
 * This function busy-waits until ownership has been reacquired, using the
 * same priority as the previous ownership.
 *
 * Note that a successful reacquire leaves the printing context without an
 * output buffer because that has been lost. This function cannot be used
 * to resume printing.
 */
void nbcon_reacquire_nobuf(struct nbcon_write_context *wctxt)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);

        /* Spin until the acquire attempt succeeds. */
        for (;;) {
                if (nbcon_context_try_acquire(ctxt))
                        break;
                cpu_relax();
        }

        /* The previous buffer is lost; continue without one. */
        nbcon_write_context_set_buf(wctxt, NULL, 0);
}
EXPORT_SYMBOL_GPL(nbcon_reacquire_nobuf);

/**
 * nbcon_emit_next_record - Emit a record in the acquired context
 * @wctxt:        The write context that will be handed to the write function
 * @use_atomic:        True if the write_atomic() callback is to be used
 *
 * Return:        True if this context still owns the console. False if
 *                ownership was handed over or taken.
 *
 * When this function returns false then the calling context no longer owns
 * the console and is no longer allowed to go forward. In this case it must
 * back out immediately and carefully. The buffer content is also no longer
 * trusted since it no longer belongs to the calling context. If the caller
 * wants to do more it must reacquire the console first.
 *
 * When true is returned, @wctxt->ctxt.backlog indicates whether there are
 * still records pending in the ringbuffer.
 */
static bool nbcon_emit_next_record(struct nbcon_write_context *wctxt, bool use_atomic)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct console *con = ctxt->console;
        bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
        struct printk_message pmsg = {
                .pbufs = ctxt->pbufs,
        };
        unsigned long con_dropped;
        struct nbcon_state cur;
        unsigned long dropped;
        unsigned long ulseq;

        /*
         * This function should never be called for consoles that have not
         * implemented the necessary callback for writing: i.e. legacy
         * consoles and, when atomic, nbcon consoles with no write_atomic().
         * Handle it as if ownership was lost and try to continue.
         *
         * Note that for nbcon consoles the write_thread() callback is
         * mandatory and was already checked in nbcon_alloc().
         */
        if (WARN_ON_ONCE((use_atomic && !con->write_atomic) ||
                         !(console_srcu_read_flags(con) & CON_NBCON))) {
                nbcon_context_release(ctxt);
                return false;
        }

        /*
         * The printk buffers are filled within an unsafe section. This
         * prevents NBCON_PRIO_NORMAL and NBCON_PRIO_EMERGENCY from
         * clobbering each other.
         */

        if (!nbcon_context_enter_unsafe(ctxt))
                return false;

        ctxt->backlog = printk_get_next_message(&pmsg, ctxt->seq, is_extended, true);
        /* Nothing to print: leave the unsafe section and report ownership. */
        if (!ctxt->backlog)
                return nbcon_context_exit_unsafe(ctxt);

        /*
         * @con->dropped is not protected in case of an unsafe hostile
         * takeover. In that situation the update can be racy so
         * annotate it accordingly.
         */
        con_dropped = data_race(READ_ONCE(con->dropped));

        /* Total of the console's count and the count reported for this message. */
        dropped = con_dropped + pmsg.dropped;
        if (dropped && !is_extended)
                console_prepend_dropped(&pmsg, dropped);

        /*
         * If the previous owner was assigned the same record, this context
         * has taken over ownership and is replaying the record. Prepend a
         * message to let the user know the record is replayed.
         */
        ulseq = atomic_long_read(&ACCESS_PRIVATE(con, nbcon_prev_seq));
        if (__ulseq_to_u64seq(prb, ulseq) == pmsg.seq) {
                console_prepend_replay(&pmsg);
        } else {
                /*
                 * Ensure this context is still the owner before trying to
                 * update @nbcon_prev_seq. Otherwise the value in @ulseq may
                 * not be from the previous owner and instead be some later
                 * value from the context that took over ownership.
                 */
                nbcon_state_read(con, &cur);
                if (!nbcon_context_can_proceed(ctxt, &cur))
                        return false;

                atomic_long_try_cmpxchg(&ACCESS_PRIVATE(con, nbcon_prev_seq), &ulseq,
                                        __u64seq_to_ulseq(pmsg.seq));
        }

        /* The buffers are prepared; leave the unsafe section before writing. */
        if (!nbcon_context_exit_unsafe(ctxt))
                return false;

        /* For skipped records just update seq/dropped in @con. */
        if (pmsg.outbuf_len == 0)
                goto update_con;

        /* Initialize the write context for driver callbacks. */
        nbcon_write_context_set_buf(wctxt, &pmsg.pbufs->outbuf[0], pmsg.outbuf_len);

        /* Hand the record to the driver callback selected by @use_atomic. */
        if (use_atomic)
                con->write_atomic(con, wctxt);
        else
                con->write_thread(con, wctxt);

        if (!wctxt->outbuf) {
                /*
                 * Ownership was lost and reacquired by the driver. Handle it
                 * as if ownership was lost.
                 */
                nbcon_context_release(ctxt);
                return false;
        }

        /*
         * Ownership may have been lost but _not_ reacquired by the driver.
         * This case is detected and handled when entering unsafe to update
         * dropped/seq values.
         */

        /*
         * Since any dropped message was successfully output, reset the
         * dropped count for the console.
         */
        dropped = 0;
update_con:
        /*
         * The dropped count and the sequence number are updated within an
         * unsafe section. This limits update races to the panic context and
         * allows the panic context to win.
         */

        if (!nbcon_context_enter_unsafe(ctxt))
                return false;

        if (dropped != con_dropped) {
                /* Counterpart to the READ_ONCE() above. */
                WRITE_ONCE(con->dropped, dropped);
        }

        /* Request that the sequence number move past the handled record. */
        nbcon_seq_try_update(ctxt, pmsg.seq + 1);

        return nbcon_context_exit_unsafe(ctxt);
}

/*
 * nbcon_emit_one - Acquire an nbcon console, emit one record using the
 *                        specified callback, and release the console
 * @wctxt:        An initialized write context struct to use for this context
 * @use_atomic:        True if the write_atomic() callback is to be used
 *
 * Return:        True, when a record has been printed and there are still
 *                pending records. The caller might want to continue flushing.
 *
 *                False, when there is no pending record, or when the console
 *                context cannot be acquired, or the ownership has been lost.
 *                The caller should give up. Either the job is done, cannot be
 *                done, or will be handled by the owning context.
 *
 * Internal helper that takes care of the console locking around
 * nbcon_emit_next_record(). In the non-atomic case the driver's
 * device_lock() is held for the whole operation.
 */
static bool nbcon_emit_one(struct nbcon_write_context *wctxt, bool use_atomic)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(wctxt, ctxt);
        struct console *con = ctxt->console;
        bool more_pending = false;
        unsigned long flags;

        if (!use_atomic) {
                con->device_lock(con, &flags);

                /*
                 * Ensure this stays on the CPU to make handover and
                 * takeover possible.
                 */
                cant_migrate();
        }

        /*
         * nbcon_emit_next_record() returns false when the console was
         * handed over or taken over. In both cases the context is no
         * longer valid and must not be released here.
         *
         * The higher priority printing context takes over responsibility
         * to print the pending records.
         */
        if (nbcon_context_try_acquire(ctxt) &&
            nbcon_emit_next_record(wctxt, use_atomic)) {
                nbcon_context_release(ctxt);
                more_pending = ctxt->backlog;
        }

        if (!use_atomic)
                con->device_unlock(con, flags);

        return more_pending;
}

/**
 * nbcon_kthread_should_wakeup - Decide whether a printer thread has a
 *                                        reason to wake up
 * @con:        Console to operate on
 * @ctxt:        The nbcon context from nbcon_context_try_acquire()
 *
 * Return:        True if the thread should shutdown or if the console is
 *                allowed to print and a record is available. False otherwise.
 *
 * Because a shutdown request also yields true, the thread must check
 * whether it should stop before it attempts any printing after waking up.
 */
static bool nbcon_kthread_should_wakeup(struct console *con, struct nbcon_context *ctxt)
{
        bool have_record = false;
        int srcu_cookie;

        if (kthread_should_stop())
                return true;

        srcu_cookie = console_srcu_read_lock();

        if (console_is_usable(con, console_srcu_read_flags(con), false)) {
                /* Refresh the sequence in @ctxt before probing the ringbuffer. */
                ctxt->seq = nbcon_seq_read(con);

                have_record = prb_read_valid(prb, ctxt->seq, NULL);
        }

        console_srcu_read_unlock(srcu_cookie);

        return have_record;
}

/**
 * nbcon_kthread_func - The printer thread function
 * @__console:        Console to operate on
 *
 * Alternates between sleeping on @con->rcuwait until
 * nbcon_kthread_should_wakeup() reports work, and emitting records one at
 * a time with NBCON_PRIO_NORMAL via nbcon_emit_one() in non-atomic mode
 * for as long as it reports a backlog.
 *
 * Return:        0, once kthread_should_stop() is observed.
 */
static int nbcon_kthread_func(void *__console)
{
        struct console *con = __console;
        struct nbcon_write_context wctxt = {
                .ctxt.console        = con,
                .ctxt.prio        = NBCON_PRIO_NORMAL,
        };
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
        short con_flags;
        bool backlog;
        int cookie;

wait_for_event:
        /*
         * Guarantee this task is visible on the rcuwait before
         * checking the wake condition.
         *
         * The full memory barrier within set_current_state() of
         * ___rcuwait_wait_event() pairs with the full memory
         * barrier within rcuwait_has_sleeper().
         *
         * This pairs with rcuwait_has_sleeper:A and nbcon_kthread_wake:A.
         */
        rcuwait_wait_event(&con->rcuwait,
                           nbcon_kthread_should_wakeup(con, ctxt),
                           TASK_INTERRUPTIBLE); /* LMM(nbcon_kthread_func:A) */

        do {
                /* A stop request is honoured before each record. */
                if (kthread_should_stop())
                        return 0;

                /* Stays false if the console turns out not to be usable. */
                backlog = false;

                /*
                 * Keep the srcu read lock around the entire operation so that
                 * synchronize_srcu() can guarantee that the kthread stopped
                 * or suspended printing.
                 */
                cookie = console_srcu_read_lock();

                con_flags = console_srcu_read_flags(con);

                if (console_is_usable(con, con_flags, false))
                        backlog = nbcon_emit_one(&wctxt, false);

                console_srcu_read_unlock(cookie);

                /* Rescheduling point between records, outside the SRCU section. */
                cond_resched();

        } while (backlog);

        /* No backlog reported: go back to waiting. */
        goto wait_for_event;
}

/**
 * nbcon_irq_work - irq work callback that wakes a console printer thread
 * @irq_work:        The irq work embedded in the console to operate on
 *
 * The owning console is recovered from @irq_work and passed on to
 * nbcon_kthread_wake().
 */
static void nbcon_irq_work(struct irq_work *irq_work)
{
        nbcon_kthread_wake(container_of(irq_work, struct console, irq_work));
}

/*
 * rcuwait_has_sleeper - Check for a waiter on an rcuwait
 * @w:        The rcuwait to check
 *
 * Return:        The result of rcuwait_active() for @w, evaluated after a
 *                full memory barrier.
 */
static inline bool rcuwait_has_sleeper(struct rcuwait *w)
{
        /*
         * Guarantee any new records can be seen by tasks preparing to wait
         * before this context checks if the rcuwait is empty.
         *
         * This full memory barrier pairs with the full memory barrier within
         * set_current_state() of ___rcuwait_wait_event(), which is called
         * after prepare_to_rcuwait() adds the waiter but before it has
         * checked the wait condition.
         *
         * This pairs with nbcon_kthread_func:A.
         */
        smp_mb(); /* LMM(rcuwait_has_sleeper:A) */
        return rcuwait_active(w);
}

/**
 * nbcon_kthreads_wake - Wake up printing threads using irq_work
 *
 * Does nothing unless the printer kthreads are running. Otherwise every
 * registered nbcon console whose printing thread is currently waiting gets
 * its irq_work queued.
 */
void nbcon_kthreads_wake(void)
{
        struct console *con;
        int srcu_cookie;

        if (!printk_kthreads_running)
                return;

        srcu_cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                /*
                 * Only nbcon consoles are considered, and irq_work is only
                 * scheduled if the printing thread is actively waiting.
                 * If not waiting, the thread will notice by itself that it
                 * has work to do.
                 */
                if ((console_srcu_read_flags(con) & CON_NBCON) &&
                    rcuwait_has_sleeper(&con->rcuwait))
                        irq_work_queue(&con->irq_work);
        }
        console_srcu_read_unlock(srcu_cookie);
}

/*
 * nbcon_kthread_stop - Stop a console printer thread
 * @con:        Console to operate on
 *
 * Must be called with the console list lock held. Consoles without a
 * printer thread are left untouched.
 */
void nbcon_kthread_stop(struct console *con)
{
        lockdep_assert_console_list_lock_held();

        if (con->kthread) {
                kthread_stop(con->kthread);
                /* Forget the thread so that it can be created again later. */
                con->kthread = NULL;
        }
}

/**
 * nbcon_kthread_create - Create a console printer thread
 * @con:        Console to operate on
 *
 * Return:        True if the kthread was started or already exists.
 *                Otherwise false and @con must not be registered.
 *
 * This is called once nbcon consoles are expected to be flushed by their
 * kthread. From then on messages printed with NBCON_PRIO_NORMAL are no
 * longer flushed by the legacy loop, which is why a failure here must be
 * fatal for console registration.
 *
 * If @con was already registered and this function fails, @con must be
 * unregistered before the global state variable @printk_kthreads_running
 * can be set.
 *
 * Must be called with the console list lock held.
 */
bool nbcon_kthread_create(struct console *con)
{
        struct task_struct *kt;

        lockdep_assert_console_list_lock_held();

        /* An existing printer thread is simply kept. */
        if (!con->kthread) {
                kt = kthread_run(nbcon_kthread_func, con, "pr/%s%d", con->name, con->index);
                if (WARN_ON(IS_ERR(kt))) {
                        con_printk(KERN_ERR, con, "failed to start printing thread\n");
                        return false;
                }

                con->kthread = kt;

                /*
                 * It is important that console printing threads are
                 * scheduled shortly after a printk call and with generous
                 * runtime budgets.
                 */
                sched_set_normal(kt, -20);
        }

        return true;
}

/*
 * Track the nbcon emergency nesting per CPU.
 *
 * The __initdata counter is the one handed out by
 * nbcon_get_cpu_emergency_nesting() for as long as
 * printk_percpu_data_ready() reports false.
 */
static DEFINE_PER_CPU(unsigned int, nbcon_pcpu_emergency_nesting);
static unsigned int early_nbcon_pcpu_emergency_nesting __initdata;

/**
 * nbcon_get_cpu_emergency_nesting - Look up the emergency nesting counter
 *                                        that applies to the current CPU
 *
 * Context:        For reading, any context. For writing, any context which could
 *                not be migrated to another CPU.
 * Return:        Either a pointer to the per CPU emergency nesting counter of
 *                the current CPU or to the init data during early boot.
 *
 * Reading per-CPU variables through this function is safe in any context
 * because preemption is disabled if the current CPU is in the emergency
 * state. See also nbcon_cpu_emergency_enter().
 */
static __ref unsigned int *nbcon_get_cpu_emergency_nesting(void)
{
        /*
         * The value of __printk_percpu_data_ready gets set in normal
         * context and before SMP initialization. As a result it could
         * never change while inside an nbcon emergency section.
         */
        if (printk_percpu_data_ready())
                return raw_cpu_ptr(&nbcon_pcpu_emergency_nesting);

        /* Per-CPU data is not usable yet: fall back to the early counter. */
        return &early_nbcon_pcpu_emergency_nesting;
}

/**
 * nbcon_get_default_prio - The appropriate nbcon priority to use for nbcon
 *                                printing on the current CPU
 *
 * Context:        Any context.
 * Return:        The nbcon_prio to use for acquiring an nbcon console in this
 *                context for printing.
 *
 * The function is safe for reading per-CPU data in any context because
 * preemption is disabled if the current CPU is in the emergency or panic
 * state.
 */
enum nbcon_prio nbcon_get_default_prio(void)
{
        unsigned int *cpu_emergency_nesting;

        if (this_cpu_in_panic())
                return NBCON_PRIO_PANIC;

        cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();
        if (*cpu_emergency_nesting)
                return NBCON_PRIO_EMERGENCY;

        return NBCON_PRIO_NORMAL;
}

/**
 * nbcon_legacy_emit_next_record - Print one record for an nbcon console
 *                                        in legacy contexts
 * @con:        The console to print on
 * @handover:        Will be set to true if a printk waiter has taken over the
 *                console_lock, in which case the caller is no longer holding
 *                both the console_lock and the SRCU read lock. Otherwise it
 *                is set to false.
 * @cookie:        The cookie from the SRCU read lock.
 * @use_atomic: Set true when called in an atomic or unknown context.
 *                It affects which nbcon callback will be used: write_atomic()
 *                or write_thread().
 *
 *                When false, the write_thread() callback is used and would be
 *                called in a preemptible context unless disabled by the
 *                device_lock. The legacy handover is not allowed in this mode.
 *
 * Context:        Any context except NMI.
 * Return:        True, when a record has been printed and there are still
 *                pending records. The caller might want to continue flushing.
 *
 *                False, when there is no pending record, or when the console
 *                context cannot be acquired, or the ownership has been lost.
 *                The caller should give up. Either the job is done, cannot be
 *                done, or will be handled by the owning context.
 *
 * This function is meant to be called by console_flush_all() to print records
 * on nbcon consoles from legacy context (printing via console unlocking).
 * Essentially it is the nbcon version of console_emit_next_record().
 */
bool nbcon_legacy_emit_next_record(struct console *con, bool *handover,
                                   int cookie, bool use_atomic)
{
        struct nbcon_write_context wctxt = {
                .ctxt.console        = con,
                .ctxt.prio        = nbcon_get_default_prio(),
        };
        unsigned long flags;
        bool more_pending;

        if (!use_atomic) {
                more_pending = nbcon_emit_one(&wctxt, false);

                /* Non-atomic does not perform legacy spinning handovers. */
                *handover = false;

                return more_pending;
        }

        /*
         * In an atomic or unknown context, use the same procedure as
         * in console_emit_next_record(). It allows to handover.
         */
        printk_safe_enter_irqsave(flags);
        console_lock_spinning_enable();
        stop_critical_timings();

        more_pending = nbcon_emit_one(&wctxt, true);

        start_critical_timings();
        *handover = console_lock_spinning_disable_and_check(cookie);
        printk_safe_exit_irqrestore(flags);

        return more_pending;
}

/**
 * __nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
 *                                        write_atomic() callback
 * @con:                        The nbcon console to flush
 * @stop_seq:                        Flush up until this record
 * @allow_unsafe_takeover:        True, to allow unsafe hostile takeovers
 *
 * Return:        0 if @con was flushed up to @stop_seq. Otherwise, error code
 *                on failure.
 *
 * Errors:
 *
 *        -EPERM:                Unable to acquire console ownership.
 *
 *        -EAGAIN:        Another context took over ownership while printing.
 *
 *        -ENOENT:        A record before @stop_seq is not available.
 *
 * If flushing up to @stop_seq was not successful, it only makes sense for the
 * caller to try again when -EAGAIN was returned. When -EPERM is returned,
 * this context is not allowed to acquire the console. When -ENOENT is
 * returned, it cannot be expected that the unfinalized record will become
 * available.
 */
static int __nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
                                            bool allow_unsafe_takeover)
{
        struct nbcon_write_context wctxt = {
                .ctxt.console                        = con,
                .ctxt.spinwait_max_us                = 2000,
                .ctxt.prio                        = nbcon_get_default_prio(),
                .ctxt.allow_unsafe_takeover        = allow_unsafe_takeover,
        };
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(&wctxt, ctxt);
        int ret = 0;

        if (!nbcon_context_try_acquire(ctxt))
                return -EPERM;

        while (nbcon_seq_read(con) < stop_seq) {
                /*
                 * nbcon_emit_next_record() returns false when the console was
                 * handed over or taken over. In both cases the context is no
                 * longer valid, so there is nothing to release.
                 */
                if (!nbcon_emit_next_record(&wctxt, true))
                        return -EAGAIN;

                if (ctxt->backlog)
                        continue;

                /* Are there reserved but not yet finalized records? */
                if (nbcon_seq_read(con) < stop_seq)
                        ret = -ENOENT;
                break;
        }

        nbcon_context_release(ctxt);
        return ret;
}

/**
 * nbcon_atomic_flush_pending_con - Flush specified nbcon console using its
 *                                        write_atomic() callback
 * @con:                        The nbcon console to flush
 * @stop_seq:                        Flush up until this record
 * @allow_unsafe_takeover:        True, to allow unsafe hostile takeovers
 *
 * This will stop flushing before @stop_seq if another context has ownership.
 * That context is then responsible for the flushing. Likewise, if new records
 * are added while this context was flushing and there is no other context
 * to handle the printing, this context must also flush those records.
 */
static void nbcon_atomic_flush_pending_con(struct console *con, u64 stop_seq,
                                           bool allow_unsafe_takeover)
{
        struct console_flush_type ft;
        unsigned long flags;
        int err;

        for (;;) {
                /*
                 * Atomic flushing does not use console driver synchronization
                 * (i.e. it does not hold the port lock for uart consoles).
                 * Therefore IRQs must be disabled to avoid being interrupted
                 * and then calling into a driver that will deadlock trying
                 * to acquire console ownership.
                 */
                local_irq_save(flags);

                err = __nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);

                local_irq_restore(flags);

                /*
                 * If there was a new owner (-EPERM, -EAGAIN), that context is
                 * responsible for completing.
                 *
                 * Do not wait for records not yet finalized (-ENOENT) to
                 * avoid a possible deadlock. They will either get flushed by
                 * the writer or eventually skipped on panic CPU.
                 */
                if (err)
                        return;

                /*
                 * If flushing was successful but more records are available,
                 * this context must flush those remaining records if the
                 * printer thread is not available to do it.
                 */
                printk_get_console_flush_type(&ft);
                if (ft.nbcon_offload ||
                    !prb_read_valid(prb, nbcon_seq_read(con), NULL))
                        return;

                /* Go around again with a freshly sampled stop sequence. */
                stop_seq = prb_next_reserve_seq(prb);
        }
}

/**
 * __nbcon_atomic_flush_pending - Flush all nbcon consoles using their
 *                                        write_atomic() callback
 * @stop_seq:                        Flush up until this record
 * @allow_unsafe_takeover:        True, to allow unsafe hostile takeovers
 *
 * Walks the console list under the console SRCU read lock. A console is
 * flushed only if it is an nbcon console, is usable for atomic printing,
 * and has not yet reached @stop_seq.
 */
static void __nbcon_atomic_flush_pending(u64 stop_seq, bool allow_unsafe_takeover)
{
        struct console *con;
        int srcu_cookie;

        srcu_cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                short con_flags = console_srcu_read_flags(con);

                if ((con_flags & CON_NBCON) &&
                    console_is_usable(con, con_flags, true) &&
                    nbcon_seq_read(con) < stop_seq)
                        nbcon_atomic_flush_pending_con(con, stop_seq, allow_unsafe_takeover);
        }
        console_srcu_read_unlock(srcu_cookie);
}

/**
 * nbcon_atomic_flush_pending - Flush all nbcon consoles using their
 *                                write_atomic() callback
 *
 * Flush the backlog up through the currently newest record. Any new
 * records added while flushing will not be flushed if there is another
 * context available to handle the flushing. This is to avoid one CPU
 * printing unbounded because other CPUs continue to add records.
 *
 * Unsafe hostile takeovers are not allowed by this variant. See
 * nbcon_atomic_flush_unsafe() for the variant that permits them.
 */
void nbcon_atomic_flush_pending(void)
{
        /* The stop sequence is sampled once, before any flushing starts. */
        __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), false);
}

/**
 * nbcon_atomic_flush_unsafe - Flush all nbcon consoles using their
 *        write_atomic() callback and allowing unsafe hostile takeovers
 *
 * Flush the backlog up through the currently newest record. Unsafe hostile
 * takeovers will be performed, if necessary.
 *
 * Apart from permitting unsafe hostile takeovers this makes the same call
 * as nbcon_atomic_flush_pending().
 */
void nbcon_atomic_flush_unsafe(void)
{
        /* The stop sequence is sampled once, before any flushing starts. */
        __nbcon_atomic_flush_pending(prb_next_reserve_seq(prb), true);
}

/**
 * nbcon_cpu_emergency_enter - Begin an emergency section in which printk()
 *                                messages for that CPU are flushed directly
 *
 * Context:        Any context. Disables preemption.
 *
 * When within an emergency section, printk() calls will attempt to flush any
 * pending messages in the ringbuffer. Sections may nest; each call must be
 * balanced by nbcon_cpu_emergency_exit().
 */
void nbcon_cpu_emergency_enter(void)
{
        unsigned int *nesting;

        /* Preemption is disabled before the counter is looked up. */
        preempt_disable();

        nesting = nbcon_get_cpu_emergency_nesting();
        *nesting += 1;
}

/**
 * nbcon_cpu_emergency_exit - Leave an emergency section
 *
 * Context:        Within an emergency section. Enables preemption.
 *
 * An unbalanced call triggers a one-time warning and leaves the nesting
 * counter at zero.
 */
void nbcon_cpu_emergency_exit(void)
{
        unsigned int *cpu_emergency_nesting = nbcon_get_cpu_emergency_nesting();

        /* Never let the counter drop below zero. */
        if (!WARN_ON_ONCE(*cpu_emergency_nesting == 0))
                *cpu_emergency_nesting -= 1;

        preempt_enable();
}

/**
 * nbcon_alloc - Allocate and init the nbcon console specific data
 * @con:        Console to initialize
 *
 * Return:        True if the console was fully allocated and initialized.
 *                Otherwise @con must not be registered.
 *
 * After a successful call the console must eventually be cleaned up with
 * nbcon_free() once it is no longer needed.
 */
bool nbcon_alloc(struct console *con)
{
        struct nbcon_state state = { };

        /* The write_thread() callback is mandatory. */
        if (WARN_ON(!con->write_thread))
                return false;

        rcuwait_init(&con->rcuwait);
        init_irq_work(&con->irq_work, nbcon_irq_work);
        atomic_long_set(&ACCESS_PRIVATE(con, nbcon_prev_seq), -1UL);
        nbcon_state_set(con, &state);

        /*
         * Initialize @nbcon_seq to the highest possible sequence number so
         * that practically speaking it will have nothing to print until a
         * desired initial sequence number has been set via nbcon_seq_force().
         */
        atomic_long_set(&ACCESS_PRIVATE(con, nbcon_seq), ULSEQ_MAX(prb));

        if (con->flags & CON_BOOT) {
                /*
                 * Boot console printing is synchronized with legacy console
                 * printing, so boot consoles can share the same global printk
                 * buffers.
                 */
                con->pbufs = &printk_shared_pbufs;
                return true;
        }

        /* Every other console gets printk buffers of its own. */
        con->pbufs = kmalloc(sizeof(*con->pbufs), GFP_KERNEL);
        if (!con->pbufs) {
                con_printk(KERN_ERR, con, "failed to allocate printing buffer\n");
                return false;
        }

        /* A printer thread is only needed once the kthreads are running. */
        if (printk_kthreads_running && !nbcon_kthread_create(con)) {
                kfree(con->pbufs);
                con->pbufs = NULL;
                return false;
        }

        return true;
}

/**
 * nbcon_free - Free and cleanup the nbcon console specific data
 * @con:        Console to free/cleanup nbcon data
 */
void nbcon_free(struct console *con)
{
        struct nbcon_state state = { };

        if (printk_kthreads_running)
                nbcon_kthread_stop(con);

        nbcon_state_set(con, &state);

        /* Boot consoles share global printk buffers. */
        if (!(con->flags & CON_BOOT))
                kfree(con->pbufs);

        con->pbufs = NULL;
}

/**
 * nbcon_device_try_acquire - Try to acquire nbcon console and enter unsafe
 *                                section
 * @con:        The nbcon console to acquire
 *
 * Context:        Under the locking mechanism implemented in
 *                @con->device_lock() including disabling migration.
 * Return:        True if the console was acquired. False otherwise.
 *
 * Console drivers will usually use their own internal synchronization
 * mechanism to synchronize between console printing and non-printing
 * activities (such as setting baud rates). However, nbcon console drivers
 * supporting atomic consoles may also want to mark unsafe sections when
 * performing non-printing activities in order to synchronize against their
 * write_atomic() callback.
 *
 * This function acquires the nbcon console using priority NBCON_PRIO_NORMAL
 * and marks it unsafe for handover/takeover.
 */
bool nbcon_device_try_acquire(struct console *con)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);

        cant_migrate();

        /* Start from a clean device context for every acquire attempt. */
        memset(ctxt, 0, sizeof(*ctxt));
        ctxt->console        = con;
        ctxt->prio        = NBCON_PRIO_NORMAL;

        /* Success requires both the ownership and the unsafe section. */
        return nbcon_context_try_acquire(ctxt) &&
               nbcon_context_enter_unsafe(ctxt);
}
EXPORT_SYMBOL_GPL(nbcon_device_try_acquire);

/**
 * nbcon_device_release - Exit unsafe section and release the nbcon console
 * @con:        The nbcon console acquired in nbcon_device_try_acquire()
 *
 * If leaving the unsafe section fails, the function returns without doing
 * anything else. Otherwise the context is released and, when no printer
 * offload is in effect, records that are still pending for @con are
 * flushed using whichever flush type is reported as available.
 */
void nbcon_device_release(struct console *con)
{
        struct nbcon_context *ctxt = &ACCESS_PRIVATE(con, nbcon_device_ctxt);
        struct console_flush_type ft;
        int cookie;

        if (!nbcon_context_exit_unsafe(ctxt))
                return;

        nbcon_context_release(ctxt);

        /*
         * Records may have been added while the console was locked. If the
         * printer thread is not available to print them, this context has
         * to do it. The console_srcu_read_lock is held so that the console
         * stays usable for the whole flush.
         */
        cookie = console_srcu_read_lock();
        printk_get_console_flush_type(&ft);

        /* Unusable console, offloaded printing, or nothing pending. */
        if (!console_is_usable(con, console_srcu_read_flags(con), true) ||
            ft.nbcon_offload ||
            !prb_read_valid(prb, nbcon_seq_read(con), NULL))
                goto out_unlock;

        /*
         * Prefer nbcon_atomic flushing; when it is not available, fall
         * back to the legacy loop.
         */
        if (ft.nbcon_atomic) {
                __nbcon_atomic_flush_pending_con(con, prb_next_reserve_seq(prb), false);
        } else if (ft.legacy_direct) {
                if (console_trylock())
                        console_unlock();
        } else if (ft.legacy_offload) {
                printk_trigger_flush();
        }

out_unlock:
        console_srcu_read_unlock(cookie);
}
EXPORT_SYMBOL_GPL(nbcon_device_release);


























































  477 





















  275 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* SPDX-License-Identifier: GPL-2.0 */

/*
 * This file provides wrappers with sanitizer instrumentation for bit
 * locking operations.
 *
 * To use this functionality, an arch's bitops.h file needs to define each of
 * the below bit operations with an arch_ prefix (e.g. arch_set_bit(),
 * arch___set_bit(), etc.).
 */
#ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H
#define _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H

#include <linux/instrumented.h>

/**
 * clear_bit_unlock - Clear a bit in memory, for unlock
 * @nr: the bit to clear
 * @addr: the address to start counting from
 *
 * This operation is atomic and provides release barrier semantics.
 *
 * Instrumented wrapper: the access is reported to the sanitizers first and
 * the actual work is done by arch_clear_bit_unlock().
 */
static inline void clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        kcsan_release();
        /* Report an atomic write of one long at addr + BIT_WORD(nr). */
        instrument_atomic_write(addr + BIT_WORD(nr), sizeof(long));
        arch_clear_bit_unlock(nr, addr);
}

/**
 * __clear_bit_unlock - Clears a bit in memory
 * @nr: Bit to clear
 * @addr: Address to start counting from
 *
 * This is a non-atomic operation but implies a release barrier before the
 * memory operation. It can be used for an unlock if no other CPUs can
 * concurrently modify other bits in the word.
 *
 * Instrumented wrapper around arch___clear_bit_unlock().
 */
static inline void __clear_bit_unlock(long nr, volatile unsigned long *addr)
{
        kcsan_release();
        /* Non-atomic variant: reported as a plain write of one long. */
        instrument_write(addr + BIT_WORD(nr), sizeof(long));
        arch___clear_bit_unlock(nr, addr);
}

/**
 * test_and_set_bit_lock - Set a bit and return its old value, for lock
 * @nr: Bit to set
 * @addr: Address to count from
 *
 * This operation is atomic and provides acquire barrier semantics if
 * the returned value is 0.
 * It can be used to implement bit locks.
 *
 * Instrumented wrapper around arch_test_and_set_bit_lock().
 *
 * Return: the value returned by arch_test_and_set_bit_lock().
 */
static inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr)
{
        /*
         * Unlike the unlock helpers in this file, no kcsan_release() is
         * issued here; the access is reported as an atomic read-write.
         */
        instrument_atomic_read_write(addr + BIT_WORD(nr), sizeof(long));
        return arch_test_and_set_bit_lock(nr, addr);
}

/**
 * xor_unlock_is_negative_byte - XOR a single byte in memory and test if
 * it is negative, for unlock.
 * @mask: Change the bits which are set in this mask.
 * @addr: The address of the word containing the byte to change.
 *
 * Changes some of bits 0-6 in the word pointed to by @addr.
 * This operation is atomic and provides release barrier semantics.
 * Used to optimise some folio operations which are commonly paired
 * with an unlock or end of writeback.  Bit 7 is used as PG_waiters to
 * indicate whether anybody is waiting for the unlock.
 *
 * Instrumented wrapper around arch_xor_unlock_is_negative_byte().
 *
 * Return: Whether the top bit of the byte is set.
 */
static inline bool xor_unlock_is_negative_byte(unsigned long mask,
                volatile unsigned long *addr)
{
        kcsan_release();
        /* The whole word at @addr is reported as atomically written. */
        instrument_atomic_write(addr, sizeof(long));
        return arch_xor_unlock_is_negative_byte(mask, addr);
}
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_LOCK_H */













































   54 















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_KSM_H
#define __LINUX_KSM_H
/*
 * Memory merging support.
 *
 * This code enables dynamic sharing of identical pages found in different
 * memory areas, even if they are not shared by fork().
 */

#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/sched.h>

#ifdef CONFIG_KSM
/*
 * Prototypes only: the functions below are declared for CONFIG_KSM builds
 * and are defined outside this header. The inline helpers further down
 * call __ksm_enter() and __ksm_exit() depending on the mm flags.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags);

void ksm_add_vma(struct vm_area_struct *vma);
int ksm_enable_merge_any(struct mm_struct *mm);
int ksm_disable_merge_any(struct mm_struct *mm);
int ksm_disable(struct mm_struct *mm);

int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
/*
 * To identify zeropages that were mapped by KSM, we reuse the dirty bit
 * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when
 * deduplicating memory.
 *
 * Note that @pte is expanded twice by this macro, so the argument should
 * be an expression without side effects.
 */
#define is_ksm_zero_pte(pte)        (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte))

extern atomic_long_t ksm_zero_pages;

/*
 * Account one more KSM zero page: increments both the global
 * ksm_zero_pages counter and the counter kept in @mm.
 */
static inline void ksm_map_zero_page(struct mm_struct *mm)
{
        atomic_long_inc(&ksm_zero_pages);
        atomic_long_inc(&mm->ksm_zero_pages);
}

/*
 * Undo the zero page accounting for @pte, but only if is_ksm_zero_pte()
 * recognises it as a zero page mapped by KSM. Both the global counter and
 * the counter kept in @mm are decremented in that case.
 */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
        /* Not a KSM zero page: there is nothing to account. */
        if (!is_ksm_zero_pte(pte))
                return;

        atomic_long_dec(&ksm_zero_pages);
        atomic_long_dec(&mm->ksm_zero_pages);
}

/* Return the current value of the KSM zero page counter kept in @mm. */
static inline long mm_ksm_zero_pages(struct mm_struct *mm)
{
        return atomic_long_read(&mm->ksm_zero_pages);
}

/*
 * Fork hook: when @oldmm has MMF_VM_MERGEABLE set, try to enter @mm as
 * well. The result of __ksm_enter() is deliberately ignored.
 */
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
        if (!test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
                return;

        /* Adding mm to ksm is best effort on fork. */
        __ksm_enter(mm);
}

/*
 * Exec hook: when @mm has MMF_VM_MERGE_ANY set, return the result of
 * __ksm_enter(); otherwise return 0 without doing anything.
 */
static inline int ksm_execve(struct mm_struct *mm)
{
        if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
                return 0;

        return __ksm_enter(mm);
}

/* Exit hook: call __ksm_exit() only for an mm with MMF_VM_MERGEABLE set. */
static inline void ksm_exit(struct mm_struct *mm)
{
        if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
                return;

        __ksm_exit(mm);
}

/*
 * When do_swap_page() first faults in from swap what used to be a KSM page,
 * no problem, it will be assigned to this vma's anon_vma; but thereafter,
 * it might be faulted into a different anon_vma (or perhaps to a different
 * offset in the same anon_vma).  do_swap_page() cannot do all the locking
 * needed to reconstitute a cross-anon_vma KSM page: for now it has to make
 * a copy, and leave remerging the pages to a later pass of ksmd.
 *
 * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE,
 * but what if the vma was unmerged while the page was swapped out?
 */
struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr);

/*
 * Prototypes only: these functions are declared for CONFIG_KSM builds and
 * are defined outside this header. Every parameter carries a name, as is
 * preferred for prototypes in kernel code.
 */
void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc);
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio);
void collect_procs_ksm(const struct folio *folio, const struct page *page,
                struct list_head *to_kill, int force_early);
long ksm_process_profit(struct mm_struct *mm);
bool ksm_process_mergeable(struct mm_struct *mm);

#else  /* !CONFIG_KSM */

/* !CONFIG_KSM stub: does nothing. */
static inline void ksm_add_vma(struct vm_area_struct *vma)
{
}

/* !CONFIG_KSM stub: nothing to disable, always returns 0. */
static inline int ksm_disable(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_KSM stub: does nothing. */
static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
}

/* !CONFIG_KSM stub: always returns 0. */
static inline int ksm_execve(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_KSM stub: does nothing. */
static inline void ksm_exit(struct mm_struct *mm)
{
}

/* !CONFIG_KSM stub: no zero page accounting is done. */
static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte)
{
}

/* !CONFIG_KSM stub: does nothing; @to_kill is left untouched. */
static inline void collect_procs_ksm(const struct folio *folio,
                const struct page *page, struct list_head *to_kill,
                int force_early)
{
}

#ifdef CONFIG_MMU
/*
 * Stub for builds with CONFIG_MMU but without CONFIG_KSM: always returns 0
 * and does not touch @vm_flags.
 */
static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                unsigned long end, int advice, unsigned long *vm_flags)
{
        return 0;
}

/*
 * Stub for builds with CONFIG_MMU but without CONFIG_KSM: no copy is ever
 * made, @folio is returned unchanged.
 */
static inline struct folio *ksm_might_need_to_copy(struct folio *folio,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return folio;
}

/* Stub for builds with CONFIG_MMU but without CONFIG_KSM: does nothing. */
static inline void rmap_walk_ksm(struct folio *folio,
                        struct rmap_walk_control *rwc)
{
}

/* Stub for builds with CONFIG_MMU but without CONFIG_KSM: does nothing. */
static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old)
{
}
#endif /* CONFIG_MMU */
#endif /* !CONFIG_KSM */

#endif /* __LINUX_KSM_H */








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 




    1 




    2 



    2 



    1 















































































































































































































































































































































    3 












    3 


    3 





    3 

















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
// SPDX-License-Identifier: GPL-2.0-or-later
/* linux/net/ipv4/arp.c
 *
 * Copyright (C) 1994 by Florian  La Roche
 *
 * This module implements the Address Resolution Protocol ARP (RFC 826),
 * which is used to convert IP addresses (or in the future maybe other
 * high-level addresses) into a low-level hardware address (like an Ethernet
 * address).
 *
 * Fixes:
 *                Alan Cox        :        Removed the Ethernet assumptions in
 *                                        Florian's code
 *                Alan Cox        :        Fixed some small errors in the ARP
 *                                        logic
 *                Alan Cox        :        Allow >4K in /proc
 *                Alan Cox        :        Make ARP add its own protocol entry
 *                Ross Martin     :       Rewrote arp_rcv() and arp_get_info()
 *                Stephen Henson        :        Add AX25 support to arp_get_info()
 *                Alan Cox        :        Drop data when a device is downed.
 *                Alan Cox        :        Use init_timer().
 *                Alan Cox        :        Double lock fixes.
 *                Martin Seine        :        Move the arphdr structure
 *                                        to if_arp.h for compatibility.
 *                                        with BSD based programs.
 *                Andrew Tridgell :       Added ARP netmask code and
 *                                        re-arranged proxy handling.
 *                Alan Cox        :        Changed to use notifiers.
 *                Niibe Yutaka        :        Reply for this device or proxies only.
 *                Alan Cox        :        Don't proxy across hardware types!
 *                Jonathan Naylor :        Added support for NET/ROM.
 *                Mike Shaver     :       RFC1122 checks.
 *                Jonathan Naylor :        Only lookup the hardware address for
 *                                        the correct hardware type.
 *                Germano Caronni        :        Assorted subtle races.
 *                Craig Schlenter :        Don't modify permanent entry
 *                                        during arp_rcv.
 *                Russ Nelson        :        Tidied up a few bits.
 *                Alexey Kuznetsov:        Major changes to caching and behaviour,
 *                                        eg intelligent arp probing and
 *                                        generation
 *                                        of host down events.
 *                Alan Cox        :        Missing unlock in device events.
 *                Eckes                :        ARP ioctl control errors.
 *                Alexey Kuznetsov:        Arp free fix.
 *                Manuel Rodriguez:        Gratuitous ARP.
 *              Jonathan Layes  :       Added arpd support through kerneld
 *                                      message queue (960314)
 *                Mike Shaver        :        /proc/sys/net/ipv4/arp_* support
 *                Mike McLagan    :        Routing by source
 *                Stuart Cheshire        :        Metricom and grat arp fixes
 *                                        *** FOR 2.1 clean this up ***
 *                Lawrence V. Stefani: (08/12/96) Added FDDI support.
 *                Alan Cox        :        Took the AP1000 nasty FDDI hack and
 *                                        folded into the mainstream FDDI code.
 *                                        Ack spit, Linus how did you allow that
 *                                        one in...
 *                Jes Sorensen        :        Make FDDI work again in 2.1.x and
 *                                        clean up the APFDDI & gen. FDDI bits.
 *                Alexey Kuznetsov:        new arp state machine;
 *                                        now it is in net/core/neighbour.c.
 *                Krzysztof Halasa:        Added Frame Relay ARP support.
 *                Arnaldo C. Melo :        convert /proc/net/arp to seq_file
 *                Shmulik Hen:                Split arp_send to arp_create and
 *                                        arp_xmit so intermediate drivers like
 *                                        bonding can change the skb before
 *                                        sending (e.g. insert 8021q tag).
 *                Harald Welte        :        convert to make use of jenkins hash
 *                Jesper D. Brouer:       Proxy ARP PVLAN RFC 3069 support.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/capability.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/fddidevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/init.h>
#include <linux/net.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/ax25.h>
#include <net/netrom.h>
#include <net/dst_metadata.h>
#include <net/ip_tunnels.h>

#include <linux/uaccess.h>

#include <linux/netfilter_arp.h>

/*
 *        Interface to generic neighbour cache.
 */
static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd);
static bool arp_key_eq(const struct neighbour *n, const void *pkey);
static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
static void parp_redo(struct sk_buff *skb);
static int arp_is_multicast(const void *pkey);

/*
 * Neighbour ops picked by arp_constructor() for devices that have
 * header_ops but no header_ops->cache callback.
 */
static const struct neigh_ops arp_generic_ops = {
        .family =                AF_INET,
        .solicit =                arp_solicit,
        .error_report =                arp_error_report,
        .output =                neigh_resolve_output,
        .connected_output =        neigh_connected_output,
};

/*
 * Neighbour ops picked by arp_constructor() for devices whose header_ops
 * provide a cache callback. Differs from arp_generic_ops only in that
 * connected_output is neigh_resolve_output as well.
 */
static const struct neigh_ops arp_hh_ops = {
        .family =                AF_INET,
        .solicit =                arp_solicit,
        .error_report =                arp_error_report,
        .output =                neigh_resolve_output,
        .connected_output =        neigh_resolve_output,
};

/*
 * Neighbour ops picked by arp_constructor() for devices without
 * header_ops. No solicit or error_report callbacks are set; both output
 * paths use neigh_direct_output.
 */
static const struct neigh_ops arp_direct_ops = {
        .family =                AF_INET,
        .output =                neigh_direct_output,
        .connected_output =        neigh_direct_output,
};

/*
 * The ARP neighbour table: AF_INET entries keyed by a 4 byte key, wired to
 * the arp_* callbacks of this file. Time based defaults are written as
 * multiples of HZ.
 */
struct neigh_table arp_tbl = {
        .family                = AF_INET,
        .key_len        = 4,
        .protocol        = cpu_to_be16(ETH_P_IP),
        .hash                = arp_hash,
        .key_eq                = arp_key_eq,
        .constructor        = arp_constructor,
        .proxy_redo        = parp_redo,
        .is_multicast        = arp_is_multicast,
        .id                = "arp_cache",
        .parms                = {
                .tbl                        = &arp_tbl,
                .reachable_time                = 30 * HZ,
                .data        = {
                        [NEIGH_VAR_MCAST_PROBES] = 3,
                        [NEIGH_VAR_UCAST_PROBES] = 3,
                        [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
                        [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
                        [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
                        /*
                         * NOTE(review): despite the _MS suffix this entry is
                         * initialised with 5 * HZ like its neighbours; confirm
                         * the stored unit against the neighbour core.
                         */
                        [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
                        [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
                        [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
                        [NEIGH_VAR_PROXY_QLEN] = 64,
                        [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
                        [NEIGH_VAR_PROXY_DELAY]        = (8 * HZ) / 10,
                        [NEIGH_VAR_LOCKTIME] = 1 * HZ,
                },
        },
        .gc_interval        = 30 * HZ,
        .gc_thresh1        = 128,
        .gc_thresh2        = 512,
        .gc_thresh3        = 1024,
};
EXPORT_SYMBOL(arp_tbl);

/*
 * arp_mc_map - fill @haddr with the link-layer address used for the IPv4
 * multicast address @addr on @dev.
 *
 * Ethernet, FDDI and IEEE 802 devices use ip_eth_mc_map(), InfiniBand uses
 * ip_ib_mc_map() and IPGRE uses ip_ipgre_mc_map(). For every other device
 * type the device broadcast address is copied, but only if @dir is
 * non-zero.
 *
 * Returns 0 when @haddr was filled in, -EINVAL otherwise.
 */
int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
{
        switch (dev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_FDDI:
        case ARPHRD_IEEE802:
                ip_eth_mc_map(addr, haddr);
                break;
        case ARPHRD_INFINIBAND:
                ip_ib_mc_map(addr, dev->broadcast, haddr);
                break;
        case ARPHRD_IPGRE:
                ip_ipgre_mc_map(addr, dev->broadcast, haddr);
                break;
        default:
                /* No dedicated mapping: fall back to broadcast if allowed. */
                if (!dir)
                        return -EINVAL;
                memcpy(haddr, dev->broadcast, dev->addr_len);
                break;
        }

        return 0;
}


/* Hash callback installed in arp_tbl.hash; forwards to arp_hashfn(). */
static u32 arp_hash(const void *pkey,
                    const struct net_device *dev,
                    __u32 *hash_rnd)
{
        return arp_hashfn(pkey, dev, hash_rnd);
}

/* Key comparison callback installed in arp_tbl.key_eq; forwards to neigh_key_eq32(). */
static bool arp_key_eq(const struct neighbour *neigh, const void *pkey)
{
        return neigh_key_eq32(neigh, pkey);
}

/*
 * arp_constructor - initialise a new neighbour entry; installed as
 * arp_tbl.constructor.
 *
 * Sets the entry's address type, parms, NUD state, hardware address (for
 * the cases that need no resolution), ops and output function.
 *
 * Returns 0 on success, or -EINVAL when the device has no in_device.
 */
static int arp_constructor(struct neighbour *neigh)
{
        __be32 addr;
        struct net_device *dev = neigh->dev;
        struct in_device *in_dev;
        struct neigh_parms *parms;
        u32 inaddr_any = INADDR_ANY;

        /* Loopback and point-to-point devices are keyed by INADDR_ANY. */
        if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
                memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len);

        addr = *(__be32 *)neigh->primary_key;
        rcu_read_lock();
        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev) {
                rcu_read_unlock();
                return -EINVAL;
        }

        neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr);

        /* Replace the current parms with a clone of the device's ARP parms. */
        parms = in_dev->arp_parms;
        __neigh_parms_put(neigh->parms);
        neigh->parms = neigh_parms_clone(parms);
        rcu_read_unlock();

        if (!dev->header_ops) {
                /* No link-layer header ops: nothing to resolve, send directly. */
                neigh->nud_state = NUD_NOARP;
                neigh->ops = &arp_direct_ops;
                neigh->output = neigh_direct_output;
        } else {
                /* Good devices (checked by reading texts, but only Ethernet is
                   tested)

                   ARPHRD_ETHER: (ethernet, apfddi)
                   ARPHRD_FDDI: (fddi)
                   ARPHRD_IEEE802: (tr)
                   ARPHRD_METRICOM: (strip)
                   ARPHRD_ARCNET:
                   etc. etc. etc.

                   ARPHRD_IPDDP will also work, if its author repairs it.
                   I did not do it, because this driver does not work even
                   in the old paradigm.
                 */

                /*
                 * Cases whose hardware address is known without resolution:
                 * multicast, NOARP/loopback devices, and broadcast or
                 * point-to-point.
                 */
                if (neigh->type == RTN_MULTICAST) {
                        neigh->nud_state = NUD_NOARP;
                        arp_mc_map(addr, neigh->ha, dev, 1);
                } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
                        neigh->nud_state = NUD_NOARP;
                        memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
                } else if (neigh->type == RTN_BROADCAST ||
                           (dev->flags & IFF_POINTOPOINT)) {
                        neigh->nud_state = NUD_NOARP;
                        memcpy(neigh->ha, dev->broadcast, dev->addr_len);
                }

                /* Devices with a header cache callback get arp_hh_ops. */
                if (dev->header_ops->cache)
                        neigh->ops = &arp_hh_ops;
                else
                        neigh->ops = &arp_generic_ops;

                /* Entries in a NUD_VALID state start on the connected path. */
                if (neigh->nud_state & NUD_VALID)
                        neigh->output = neigh->ops->connected_output;
                else
                        neigh->output = neigh->ops->output;
        }
        return 0;
}

/*
 * error_report callback of arp_generic_ops and arp_hh_ops: reports a link
 * failure for @skb via dst_link_failure() and then frees it with drop
 * reason SKB_DROP_REASON_NEIGH_FAILED. @neigh is not used.
 */
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
        dst_link_failure(skb);
        kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
}

/*
 * Create an ARP packet with arp_create(), attach a clone of @dst to it and
 * pass it to arp_xmit(). Nothing is sent when @dev has IFF_NOARP set or
 * when arp_create() returns NULL.
 */
static void arp_send_dst(int type, int ptype, __be32 dest_ip,
                         struct net_device *dev, __be32 src_ip,
                         const unsigned char *dest_hw,
                         const unsigned char *src_hw,
                         const unsigned char *target_hw,
                         struct dst_entry *dst)
{
        struct sk_buff *pkt = NULL;

        /* Devices flagged IFF_NOARP never get an ARP packet built. */
        if (!(dev->flags & IFF_NOARP))
                pkt = arp_create(type, ptype, dest_ip, dev, src_ip,
                                 dest_hw, src_hw, target_hw);
        if (!pkt)
                return;

        skb_dst_set(pkt, dst_clone(dst));
        arp_xmit(pkt);
}

/*
 * Create and send an ARP packet without an attached dst entry: forwards
 * all arguments to arp_send_dst() with a NULL dst.
 */
void arp_send(int type, int ptype, __be32 dest_ip,
              struct net_device *dev, __be32 src_ip,
              const unsigned char *dest_hw, const unsigned char *src_hw,
              const unsigned char *target_hw)
{
        arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw,
                     target_hw, NULL);
}
EXPORT_SYMBOL(arp_send);

/*
 * Send an ARP request for the address held in neigh->primary_key.
 *
 * @neigh: neighbour entry being resolved or probed.
 * @skb:   packet that triggered the solicitation, may be NULL.  When
 *         present, its IP source address is a candidate for the ARP
 *         sender address, subject to the device's ARP_ANNOUNCE setting.
 *
 * Depending on neigh->probes relative to the UCAST_PROBES and APP_PROBES
 * parameters, the request is sent to the cached hardware address, handed
 * to neigh_app_ns() instead of being sent, or sent with no destination
 * hardware address (dst_hw stays NULL).
 */
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
{
        __be32 saddr = 0;
        u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL;
        struct net_device *dev = neigh->dev;
        __be32 target = *(__be32 *)neigh->primary_key;
        int probes = atomic_read(&neigh->probes);
        struct in_device *in_dev;
        struct dst_entry *dst = NULL;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev) {
                rcu_read_unlock();
                return;
        }
        /* Pick the sender IP address; 0 means "not chosen yet". */
        switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
        default:
        case 0:                /* By default announce any local IP */
                if (skb && inet_addr_type_dev_table(dev_net(dev), dev,
                                          ip_hdr(skb)->saddr) == RTN_LOCAL)
                        saddr = ip_hdr(skb)->saddr;
                break;
        case 1:                /* Restrict announcements of saddr in same subnet */
                if (!skb)
                        break;
                saddr = ip_hdr(skb)->saddr;
                if (inet_addr_type_dev_table(dev_net(dev), dev,
                                             saddr) == RTN_LOCAL) {
                        /* saddr should be known to target */
                        if (inet_addr_onlink(in_dev, target, saddr))
                                break;
                }
                /* Not local, or not on-link with the target: do not use it. */
                saddr = 0;
                break;
        case 2:                /* Avoid secondary IPs, get a primary/preferred one */
                break;
        }
        rcu_read_unlock();

        /* No usable address from the packet: let the stack select one. */
        if (!saddr)
                saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);

        probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
        if (probes < 0) {
                /* Still within the unicast probe budget: target the cached address. */
                if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
                        pr_debug("trying to ucast probe in NUD_INVALID\n");
                neigh_ha_snapshot(dst_ha, neigh, dev);
                dst_hw = dst_ha;
        } else {
                probes -= NEIGH_VAR(neigh->parms, APP_PROBES);
                if (probes < 0) {
                        /* Within the application probe budget: no packet is sent here. */
                        neigh_app_ns(neigh);
                        return;
                }
        }

        /* Reuse the triggering packet's dst only if the device lacks IFF_XMIT_DST_RELEASE. */
        if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE))
                dst = skb_dst(skb);
        arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
                     dst_hw, dev->dev_addr, NULL, dst);
}

/*
 * Decide whether an ARP request from @sip for @tip must be left
 * unanswered, based on the ARP_IGNORE setting of @in_dev.
 *
 * Returns 0 when a reply may be sent and non-zero when it must not.
 */
static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
{
        struct net *net = dev_net(in_dev->dev);
        int mode = IN_DEV_ARP_IGNORE(in_dev);
        struct in_device *confirm_dev = in_dev;
        int scope = RT_SCOPE_HOST;

        /* Mode 8: do not reply. */
        if (mode == 8)
                return 1;

        /*
         * Mode 0 (the tip is already validated), the reserved modes 4-7
         * and any unknown value: reply.
         */
        if (mode < 1 || mode > 3)
                return 0;

        /*
         * Mode 1: reply only if tip is configured on the incoming interface.
         * Mode 2: as mode 1, and tip must be in the same subnet as sip.
         * Mode 3: do not reply for scope host addresses.
         */
        if (mode != 2)
                sip = 0;
        if (mode == 3) {
                scope = RT_SCOPE_LINK;
                confirm_dev = NULL;
        }

        return !inet_confirm_addr(net, confirm_dev, sip, tip, scope);
}

/*
 * Tell whether a new neighbour entry may be created from a gratuitous ARP
 * sent by @sip, based on the ARP_ACCEPT setting of @in_dev.
 *
 * Returns 1 when creation is allowed, 0 otherwise.
 */
static int arp_accept(struct in_device *in_dev, __be32 sip)
{
        struct net *net = dev_net(in_dev->dev);
        int mode = IN_DEV_ARP_ACCEPT(in_dev);

        /* Mode 1: create new entries from garp. */
        if (mode == 1)
                return 1;

        /*
         * Mode 2: create a neighbour only if sip is in the same subnet as
         * an address configured on the interface that received the garp.
         */
        if (mode == 2)
                return !!inet_confirm_addr(net, in_dev, sip, 0,
                                           RT_SCOPE_LINK);

        /* Mode 0 and anything else: don't create new entries from garp. */
        return 0;
}

/*
 * Look up an output route with ip_route_output(net, sip, tip, ...) and
 * check that it leaves through @dev.
 *
 * Returns 1 when the route lookup fails or the route uses another device
 * (the latter also bumps LINUX_MIB_ARPFILTER), and 0 when it uses @dev.
 */
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *route;
        int filtered;

        route = ip_route_output(net, sip, tip, 0,
                                l3mdev_master_ifindex_rcu(dev),
                                RT_SCOPE_UNIVERSE);
        if (IS_ERR(route))
                return 1;

        filtered = route->dst.dev != dev;
        if (filtered)
                __NET_INC_STATS(net, LINUX_MIB_ARPFILTER);

        ip_rt_put(route);
        return filtered;
}

/*
 * Check if we can use proxy ARP for this path.
 *
 * Returns 0 when the route goes back out @dev, when proxy ARP is off on
 * @in_dev, or when the incoming medium id is -1.  Returns 1 when the
 * incoming medium id is 0.  Otherwise returns non-zero only if the output
 * device has a medium id that is neither -1 nor equal to the incoming one.
 */
static inline int arp_fwd_proxy(struct in_device *in_dev,
                                struct net_device *dev, struct rtable *rt)
{
        struct in_device *out_dev;
        int in_medium;
        int out_medium;

        if (rt->dst.dev == dev || !IN_DEV_PROXY_ARP(in_dev))
                return 0;

        in_medium = IN_DEV_MEDIUM_ID(in_dev);
        if (in_medium == 0)
                return 1;
        if (in_medium == -1)
                return 0;

        /* place to check for proxy_arp for routes */

        /* An output device without an in_device counts as medium id -1. */
        out_dev = __in_dev_get_rcu(rt->dst.dev);
        out_medium = out_dev ? IN_DEV_MEDIUM_ID(out_dev) : -1;

        return out_medium != in_medium && out_medium != -1;
}

/*
 * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
 *
 * Private VLAN proxy ARP sends replies back out the same interface the
 * request arrived on.  This supports (Ethernet) switch features, such as
 * the one described in RFC 3069, where the individual ports are not
 * allowed to communicate with each other, but they are allowed to talk
 * to the upstream router.  As described in RFC 3069, these hosts can
 * still communicate through the upstream router, by proxy_arp'ing.
 *
 * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
 *
 *  This technology is known by different names:
 *    In RFC 3069 it is called VLAN Aggregation.
 *    Cisco and Allied Telesyn call it Private VLAN.
 *    Hewlett-Packard call it Source-Port filtering or port-isolation.
 *    Ericsson call it MAC-Forced Forwarding (RFC Draft).
 *
 */
static inline int arp_fwd_pvlan(struct in_device *in_dev,
                                struct net_device *dev, struct rtable *rt,
                                __be32 sip, __be32 tip)
{
        /*
         * Private VLAN is only concerned about the same ethernet segment,
         * and self probes (sip == tip) are never answered.
         */
        if (rt->dst.dev != dev || sip == tip)
                return 0;

        /* Reply only when private-VLAN proxy ARP is enabled on in_dev. */
        return IN_DEV_PROXY_ARP_PVLAN(in_dev) ? 1 : 0;
}

/*
 *        Interface to link layer: send routine and receive handler.
 */

/*
 *        Create an arp packet.
 *
 *        Allocates an skb with GFP_ATOMIC, builds the link-layer header
 *        with dev_hard_header() and fills in the ARP header and payload.
 *
 *        If src_hw is not set, dev->dev_addr is used.  If dest_hw is not
 *        set, dev->broadcast is used, i.e. we create a broadcast message.
 *        If target_hw is not set, the target hardware address field is
 *        zero-filled.  For ARPHRD_IEEE1394 devices (when
 *        CONFIG_FIREWIRE_NET is enabled) no target hardware address field
 *        is written at all.
 *
 *        Returns the new skb, or NULL if the allocation or the link-layer
 *        header construction fails.
 */
struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
                           struct net_device *dev, __be32 src_ip,
                           const unsigned char *dest_hw,
                           const unsigned char *src_hw,
                           const unsigned char *target_hw)
{
        struct sk_buff *skb;
        struct arphdr *arp;
        unsigned char *arp_ptr;
        int hlen = LL_RESERVED_SPACE(dev);
        int tlen = dev->needed_tailroom;

        /*
         *        Allocate a buffer
         */

        skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC);
        if (!skb)
                return NULL;

        /* Leave hlen bytes of headroom in front of the ARP header. */
        skb_reserve(skb, hlen);
        skb_reset_network_header(skb);
        arp = skb_put(skb, arp_hdr_len(dev));
        skb->dev = dev;
        skb->protocol = htons(ETH_P_ARP);
        if (!src_hw)
                src_hw = dev->dev_addr;
        if (!dest_hw)
                dest_hw = dev->broadcast;

        /*
         *        Fill the device header for the ARP frame
         */
        if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0)
                goto out;

        /*
         * Fill out the arp protocol part.
         *
         * The arp hardware type should match the device type, except for FDDI,
         * which (according to RFC 1390) should always equal 1 (Ethernet).
         */
        /*
         *        Exceptions everywhere. AX.25 uses the AX.25 PID value not the
         *        DIX code for the protocol. Make these device structure fields.
         */
        switch (dev->type) {
        default:
                arp->ar_hrd = htons(dev->type);
                arp->ar_pro = htons(ETH_P_IP);
                break;

#if IS_ENABLED(CONFIG_AX25)
        case ARPHRD_AX25:
                arp->ar_hrd = htons(ARPHRD_AX25);
                arp->ar_pro = htons(AX25_P_IP);
                break;

#if IS_ENABLED(CONFIG_NETROM)
        case ARPHRD_NETROM:
                arp->ar_hrd = htons(ARPHRD_NETROM);
                arp->ar_pro = htons(AX25_P_IP);
                break;
#endif
#endif

#if IS_ENABLED(CONFIG_FDDI)
        case ARPHRD_FDDI:
                arp->ar_hrd = htons(ARPHRD_ETHER);
                arp->ar_pro = htons(ETH_P_IP);
                break;
#endif
        }

        arp->ar_hln = dev->addr_len;
        arp->ar_pln = 4;
        arp->ar_op = htons(type);

        /* The payload starts right after the fixed ARP header. */
        arp_ptr = (unsigned char *)(arp + 1);

        /* Sender hardware address, then sender IP address. */
        memcpy(arp_ptr, src_hw, dev->addr_len);
        arp_ptr += dev->addr_len;
        memcpy(arp_ptr, &src_ip, 4);
        arp_ptr += 4;

        /* Target hardware address; skipped entirely for IEEE 1394. */
        switch (dev->type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
        case ARPHRD_IEEE1394:
                break;
#endif
        default:
                if (target_hw)
                        memcpy(arp_ptr, target_hw, dev->addr_len);
                else
                        memset(arp_ptr, 0, dev->addr_len);
                arp_ptr += dev->addr_len;
        }
        /* Target IP address. */
        memcpy(arp_ptr, &dest_ip, 4);

        return skb;

out:
        kfree_skb(skb);
        return NULL;
}
EXPORT_SYMBOL(arp_create);

/*
 * Continuation passed to NF_HOOK() by arp_xmit(): hands @skb to
 * dev_queue_xmit() and returns its result.  @net and @sk are unused.
 */
static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return dev_queue_xmit(skb);
}

/*
 *        Send an arp packet.
 *
 *        Runs @skb through the NFPROTO_ARP / NF_ARP_OUT netfilter hook with
 *        skb->dev as the output device and arp_xmit_finish() as the
 *        continuation.  The hook is invoked under rcu_read_lock().
 */
void arp_xmit(struct sk_buff *skb)
{
        rcu_read_lock();
        /* Send it off, maybe filter it using firewalling first.  */
        NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
                dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev,
                arp_xmit_finish);
        rcu_read_unlock();
}
EXPORT_SYMBOL(arp_xmit);

/*
 * Tell whether an ARP packet is a gratuitous ARP for a unicast address.
 *
 * The packet qualifies when tip equals sip, when (for replies) the target
 * hardware address is present and equals the source hardware address,
 * and when sip is of type RTN_UNICAST on @dev.
 *
 * *addr_type is written only if the first two checks pass; it then holds
 * the address type of sip.
 */
static bool arp_is_garp(struct net *net, struct net_device *dev,
                        int *addr_type, __be16 ar_op,
                        __be32 sip, __be32 tip,
                        unsigned char *sha, unsigned char *tha)
{
        if (tip != sip)
                return false;

        /* Gratuitous ARP _replies_ also require target hwaddr to be
         * the same as source.  IPv4 over IEEE 1394 doesn't provide
         * target hardware address field in its ARP payload, so a
         * missing tha disqualifies a reply.
         */
        if (ar_op == htons(ARPOP_REPLY) &&
            (!tha || memcmp(tha, sha, dev->addr_len) != 0))
                return false;

        *addr_type = inet_addr_type_dev_table(net, dev, sip);
        return *addr_type == RTN_UNICAST;
}

/*
 *        Process a received ARP packet (request or reply).
 *
 *        Checks the hardware and protocol types against the device type,
 *        extracts sha/sip/tha/tip from the payload, answers requests for
 *        local or proxied addresses, and updates the neighbour table from
 *        the sender fields.
 *
 *        @skb is released on every path except when it is handed to
 *        pneigh_enqueue().  Returns NET_RX_DROP when the packet fails one
 *        of the early validity checks, NET_RX_SUCCESS otherwise.
 */

static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct arphdr *arp;
        unsigned char *arp_ptr;
        struct rtable *rt;
        unsigned char *sha;
        unsigned char *tha = NULL;
        __be32 sip, tip;
        u16 dev_type = dev->type;
        int addr_type;
        struct neighbour *n;
        struct dst_entry *reply_dst = NULL;
        bool is_garp = false;

        /* arp_rcv below verifies the ARP header and verifies the device
         * is ARP'able.
         */

        if (!in_dev)
                goto out_free_skb;

        arp = arp_hdr(skb);

        /* Hardware type and protocol must be ones this device type accepts. */
        switch (dev_type) {
        default:
                if (arp->ar_pro != htons(ETH_P_IP) ||
                    htons(dev_type) != arp->ar_hrd)
                        goto out_free_skb;
                break;
        case ARPHRD_ETHER:
        case ARPHRD_FDDI:
        case ARPHRD_IEEE802:
                /*
                 * ETHERNET, and Fibre Channel (which are IEEE 802
                 * devices, according to RFC 2625) devices will accept ARP
                 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
                 * This is the case also of FDDI, where the RFC 1390 says that
                 * FDDI devices should accept ARP hardware of (1) Ethernet,
                 * however, to be more robust, we'll accept both 1 (Ethernet)
                 * or 6 (IEEE 802.2)
                 */
                if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
                     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
                    arp->ar_pro != htons(ETH_P_IP))
                        goto out_free_skb;
                break;
        case ARPHRD_AX25:
                if (arp->ar_pro != htons(AX25_P_IP) ||
                    arp->ar_hrd != htons(ARPHRD_AX25))
                        goto out_free_skb;
                break;
        case ARPHRD_NETROM:
                if (arp->ar_pro != htons(AX25_P_IP) ||
                    arp->ar_hrd != htons(ARPHRD_NETROM))
                        goto out_free_skb;
                break;
        }

        /* Understand only these message types */

        if (arp->ar_op != htons(ARPOP_REPLY) &&
            arp->ar_op != htons(ARPOP_REQUEST))
                goto out_free_skb;

/*
 *        Extract fields
 */
        arp_ptr = (unsigned char *)(arp + 1);
        sha        = arp_ptr;
        arp_ptr += dev->addr_len;
        memcpy(&sip, arp_ptr, 4);
        arp_ptr += 4;
        /* tha stays NULL for IEEE 1394, which carries no target hwaddr field. */
        switch (dev_type) {
#if IS_ENABLED(CONFIG_FIREWIRE_NET)
        case ARPHRD_IEEE1394:
                break;
#endif
        default:
                tha = arp_ptr;
                arp_ptr += dev->addr_len;
        }
        memcpy(&tip, arp_ptr, 4);
/*
 *        Check for bad requests for 127.x.x.x and requests for multicast
 *        addresses.  If this is one such, delete it.
 */
        if (ipv4_is_multicast(tip) ||
            (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
                goto out_free_skb;

 /*
  *        For some 802.11 wireless deployments (and possibly other networks),
  *        there will be an ARP proxy and gratuitous ARP frames are attacks
  *        and thus should not be accepted.
  */
        if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
                goto out_free_skb;

/*
 *     Special case: We must set Frame Relay source Q.922 address
 */
        if (dev_type == ARPHRD_DLCI)
                sha = dev->broadcast;

/*
 *  Process entry.  The idea here is we want to send a reply if it is a
 *  request for us or if it is a request for someone else that we hold
 *  a proxy for.  We want to add an entry to our cache if it is a reply
 *  to us or if it is a request for our address.
 *  (The assumption for this last is that if someone is requesting our
 *  address, they are probably intending to talk to us, so it saves time
 *  if we cache their address.  Their address is also probably not in
 *  our cache, since ours is not in their cache.)
 *
 *  Putting this another way, we only care about replies if they are to
 *  us, in which case we add them to the cache.  For requests, we care
 *  about those for us and those for our proxies.  We reply to both,
 *  and in the case of requests for us we add the requester to the arp
 *  cache.
 */

        /*
         * From this point on reply_dst may hold a reference, so every exit
         * below goes through out_free_dst (directly or via out_consume_skb).
         */
        if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb))
                reply_dst = (struct dst_entry *)
                            iptunnel_metadata_reply(skb_metadata_dst(skb),
                                                    GFP_ATOMIC);

        /* Special case: IPv4 duplicate address detection packet (RFC2131) */
        if (sip == 0) {
                if (arp->ar_op == htons(ARPOP_REQUEST) &&
                    inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL &&
                    !arp_ignore(in_dev, sip, tip))
                        arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
                                     sha, dev->dev_addr, sha, reply_dst);
                goto out_consume_skb;
        }

        if (arp->ar_op == htons(ARPOP_REQUEST) &&
            ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {

                rt = skb_rtable(skb);
                addr_type = rt->rt_type;

                if (addr_type == RTN_LOCAL) {
                        /* Request for one of our own addresses. */
                        int dont_send;

                        dont_send = arp_ignore(in_dev, sip, tip);
                        if (!dont_send && IN_DEV_ARPFILTER(in_dev))
                                dont_send = arp_filter(sip, tip, dev);
                        if (!dont_send) {
                                n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
                                if (n) {
                                        arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
                                                     sip, dev, tip, sha,
                                                     dev->dev_addr, sha,
                                                     reply_dst);
                                        neigh_release(n);
                                }
                        }
                        goto out_consume_skb;
                } else if (IN_DEV_FORWARD(in_dev)) {
                        /* Request for someone else: answer only if we proxy for it. */
                        if (addr_type == RTN_UNICAST  &&
                            (arp_fwd_proxy(in_dev, dev, rt) ||
                             arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
                             (rt->dst.dev != dev &&
                              pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
                                n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
                                if (n)
                                        neigh_release(n);

                                if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
                                    skb->pkt_type == PACKET_HOST ||
                                    NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) {
                                        arp_send_dst(ARPOP_REPLY, ETH_P_ARP,
                                                     sip, dev, tip, sha,
                                                     dev->dev_addr, sha,
                                                     reply_dst);
                                } else {
                                        /* skb is handed to the proxy queue: skip consume_skb(). */
                                        pneigh_enqueue(&arp_tbl,
                                                       in_dev->arp_parms, skb);
                                        goto out_free_dst;
                                }
                                goto out_consume_skb;
                        }
                }
        }

        /* Update our ARP tables */

        n = __neigh_lookup(&arp_tbl, &sip, dev, 0);

        /* -1 means the address type of sip has not been computed yet. */
        addr_type = -1;
        if (n || arp_accept(in_dev, sip)) {
                is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
                                      sip, tip, sha, tha);
        }

        if (arp_accept(in_dev, sip)) {
                /* Unsolicited ARP is not accepted by default.
                   It is possible, that this option should be enabled for some
                   devices (strip is candidate)
                 */
                if (!n &&
                    (is_garp ||
                     (arp->ar_op == htons(ARPOP_REPLY) &&
                      (addr_type == RTN_UNICAST ||
                       (addr_type < 0 &&
                        /* postpone calculation to as late as possible */
                        inet_addr_type_dev_table(net, dev, sip) ==
                                RTN_UNICAST)))))
                        n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
        }

        if (n) {
                int state = NUD_REACHABLE;
                int override;

                /* If several different ARP replies follows back-to-back,
                   use the FIRST one. It is possible, if several proxy
                   agents are active. Taking the first reply prevents
                   arp trashing and chooses the fastest router.
                 */
                override = time_after(jiffies,
                                      n->updated +
                                      NEIGH_VAR(n->parms, LOCKTIME)) ||
                           is_garp;

                /* Broadcast replies and request packets
                   do not assert neighbour reachability.
                 */
                if (arp->ar_op != htons(ARPOP_REPLY) ||
                    skb->pkt_type != PACKET_HOST)
                        state = NUD_STALE;
                neigh_update(n, sha, state,
                             override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0);
                neigh_release(n);
        }

out_consume_skb:
        consume_skb(skb);

out_free_dst:
        dst_release(reply_dst);
        return NET_RX_SUCCESS;

out_free_skb:
        kfree_skb(skb);
        return NET_RX_DROP;
}

/*
 * Run arp_process() on @skb again, using the network namespace of
 * skb->dev and a NULL socket.  The return value is discarded.
 */
static void parp_redo(struct sk_buff *skb)
{
        arp_process(dev_net(skb->dev), NULL, skb);
}

static int arp_is_multicast(const void *pkey)
{
        return ipv4_is_multicast(*((__be32 *)pkey));
}

/*
 *        Receive an arp request from the device layer.
 *
 *        Packets on IFF_NOARP devices, and packets of type
 *        PACKET_OTHERHOST or PACKET_LOOPBACK, are consumed silently.
 *        Otherwise the ARP header is checked for length and address
 *        sizes and the packet is passed through the NF_ARP_IN hook on
 *        to arp_process().
 */

static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
                   struct packet_type *pt, struct net_device *orig_dev)
{
        const struct arphdr *hdr;

        /* do not tweak dropwatch on an ARP we will ignore */
        if ((dev->flags & IFF_NOARP) ||
            skb->pkt_type == PACKET_OTHERHOST ||
            skb->pkt_type == PACKET_LOOPBACK) {
                consume_skb(skb);
                return NET_RX_SUCCESS;
        }

        /* On failure there is no skb left to free. */
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                return NET_RX_DROP;

        /* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
        if (!pskb_may_pull(skb, arp_hdr_len(dev)))
                goto drop;

        hdr = arp_hdr(skb);
        if (hdr->ar_hln != dev->addr_len || hdr->ar_pln != 4)
                goto drop;

        memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));

        return NF_HOOK(NFPROTO_ARP, NF_ARP_IN,
                       dev_net(dev), NULL, skb, dev, NULL,
                       arp_process);

drop:
        kfree_skb(skb);
        return NET_RX_DROP;
}

/*
 *        User level interface (ioctl)
 */

/*
 * Find the device named in r->arp_dev.
 *
 * @getarp selects the lookup helper: dev_get_by_name_rcu() when true,
 * __dev_get_by_name() when false.  If the request carries no hardware
 * address family, it is filled in from the device type.
 *
 * Returns the device, ERR_PTR(-ENODEV) if no such device exists, or
 * ERR_PTR(-EINVAL) if ATF_COM is set and the hardware address family
 * does not match the device type.
 */
static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r,
                                              bool getarp)
{
        struct net_device *dev = getarp ?
                                 dev_get_by_name_rcu(net, r->arp_dev) :
                                 __dev_get_by_name(net, r->arp_dev);

        if (!dev)
                return ERR_PTR(-ENODEV);

        /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */
        if (r->arp_ha.sa_family == 0)
                r->arp_ha.sa_family = dev->type;

        if (!(r->arp_flags & ATF_COM) || r->arp_ha.sa_family == dev->type)
                return dev;

        return ERR_PTR(-EINVAL);
}

/*
 * Resolve the device an arpreq refers to.
 *
 * A named device is looked up by name.  With no name, ATF_PUBL requests
 * yield NULL; other requests use the output device of a link-scope
 * route to the protocol address.
 *
 * Returns the device, NULL (unnamed ATF_PUBL request), or an ERR_PTR().
 */
static struct net_device *arp_req_dev(struct net *net, struct arpreq *r)
{
        struct net_device *out_dev;
        struct rtable *route;
        __be32 addr;

        if (r->arp_dev[0] != '\0')
                return arp_req_dev_by_name(net, r, false);

        if (r->arp_flags & ATF_PUBL)
                return NULL;

        addr = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;

        route = ip_route_output(net, addr, 0, 0, 0, RT_SCOPE_LINK);
        if (IS_ERR(route))
                return ERR_CAST(route);

        /* Only the device pointer is kept; the route is dropped right away. */
        out_dev = route->dst.dev;
        ip_rt_put(route);

        if (out_dev)
                return out_dev;

        return ERR_PTR(-EINVAL);
}

/*
 *        Set (create) an ARP cache entry.
 */

/*
 * Set the PROXY_ARP configuration value to @on.
 *
 * With a NULL @dev the "all" devconf entry of @net is written; otherwise
 * the setting of the device's in_device is changed.
 *
 * Returns 0 on success, -ENXIO if @dev has no in_device.
 */
static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
{
        struct in_device *in_dev;

        if (!dev) {
                IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
                return 0;
        }

        in_dev = __in_dev_get_rtnl_net(dev);
        if (!in_dev)
                return -ENXIO;

        IN_DEV_CONF_SET(in_dev, PROXY_ARP, on);
        return 0;
}

/*
 * Handle an ATF_PUBL "set" request.
 *
 * If no device was resolved and ATF_COM is set, the device is looked up
 * by hardware address.  A non-zero netmask creates a proxy neighbour
 * entry for the protocol address; a zero netmask turns proxy ARP on via
 * arp_req_set_proxy().
 *
 * Returns 0 on success, -ENODEV if the hardware address matches no
 * device, -ENOBUFS if the proxy entry cannot be obtained, or the result
 * of arp_req_set_proxy().
 */
static int arp_req_set_public(struct net *net, struct arpreq *r,
                struct net_device *dev)
{
        __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
        __be32 ip;

        if (!dev && (r->arp_flags & ATF_COM)) {
                dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
                                      r->arp_ha.sa_data);
                if (!dev)
                        return -ENODEV;
        }

        if (!mask)
                return arp_req_set_proxy(net, dev, 1);

        ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;

        return pneigh_lookup(&arp_tbl, net, &ip, dev, 1) ? 0 : -ENOBUFS;
}

/*
 * Create or update the neighbour entry described by an arpreq.
 *
 * ATF_PUBL requests are delegated to arp_req_set_public().  Otherwise
 * the hardware address family must be acceptable for the device type,
 * and the entry is updated administratively: NUD_PERMANENT when ATF_PERM
 * is set (which also forces ATF_COM), NUD_STALE otherwise.
 *
 * Returns 0 on success or a negative errno.
 */
static int arp_req_set(struct net *net, struct arpreq *r)
{
        struct neighbour *neigh;
        struct net_device *dev;
        unsigned int state;
        __be32 ip;
        int err;

        dev = arp_req_dev(net, r);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        if (r->arp_flags & ATF_PUBL)
                return arp_req_set_public(net, r, dev);

        switch (dev->type) {
#if IS_ENABLED(CONFIG_FDDI)
        case ARPHRD_FDDI:
                /*
                 * According to RFC 1390, FDDI devices should accept ARP
                 * hardware types of 1 (Ethernet).  However, to be more
                 * robust, we'll accept hardware types of either 1 (Ethernet)
                 * or 6 (IEEE 802.2).
                 */
                switch (r->arp_ha.sa_family) {
                case ARPHRD_FDDI:
                case ARPHRD_ETHER:
                case ARPHRD_IEEE802:
                        break;
                default:
                        return -EINVAL;
                }
                break;
#endif
        default:
                if (r->arp_ha.sa_family != dev->type)
                        return -EINVAL;
                break;
        }

        ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;

        neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
        if (IS_ERR(neigh))
                return PTR_ERR(neigh);

        state = NUD_STALE;
        if (r->arp_flags & ATF_PERM) {
                r->arp_flags |= ATF_COM;
                state = NUD_PERMANENT;
        }

        /* A hardware address is supplied only for complete (ATF_COM) entries. */
        err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
                           r->arp_ha.sa_data : NULL, state,
                           NEIGH_UPDATE_F_OVERRIDE |
                           NEIGH_UPDATE_F_ADMIN, 0);
        neigh_release(neigh);

        return err;
}

/*
 * Translate a neighbour's NUD state into ATF_* flags:
 * ATF_PERM | ATF_COM for a permanent entry, ATF_COM for any other valid
 * entry, and 0 otherwise.
 */
static unsigned int arp_state_to_flags(struct neighbour *neigh)
{
        unsigned int flags = 0;

        if (neigh->nud_state & NUD_PERMANENT)
                flags = ATF_PERM | ATF_COM;
        else if (neigh->nud_state & NUD_VALID)
                flags = ATF_COM;

        return flags;
}

/*
 *        Get an ARP cache entry.
 *
 *        The request must name a device.  On success the hardware address,
 *        flags, hardware address family and device name in @r are filled in.
 *
 *        Returns 0 on success, -ENODEV if no device name is given or the
 *        device does not exist, -EINVAL if the device lookup rejects the
 *        request, or -ENXIO if there is no entry or the entry is NUD_NOARP.
 */

static int arp_req_get(struct net *net, struct arpreq *r)
{
        __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
        struct net_device *dev;
        struct neighbour *neigh;
        int err;

        if (r->arp_dev[0] == '\0')
                return -ENODEV;

        dev = arp_req_dev_by_name(net, r, true);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        neigh = neigh_lookup(&arp_tbl, &ip, dev);
        if (!neigh)
                return -ENXIO;

        if (READ_ONCE(neigh->nud_state) & NUD_NOARP) {
                err = -ENXIO;
        } else {
                /* Copy address and state together under the entry's lock. */
                read_lock_bh(&neigh->lock);
                memcpy(r->arp_ha.sa_data, neigh->ha,
                       min(dev->addr_len, sizeof(r->arp_ha.sa_data_min)));
                r->arp_flags = arp_state_to_flags(neigh);
                read_unlock_bh(&neigh->lock);
                err = 0;
        }

        neigh_release(neigh);
        if (err)
                return err;

        r->arp_ha.sa_family = dev->type;
        netdev_copy_name(dev, r->arp_dev);

        return 0;
}

/*
 * Invalidate the ARP entry for @ip on @dev.
 *
 * Without @force, an entry in a NUD_VALID state is left untouched and 0
 * is returned.  Otherwise the entry is moved to NUD_FAILED (unless its
 * state has no bits other than NUD_NOARP set) and neigh_remove_one() is
 * called on it under the table lock.
 *
 * Returns -ENXIO if no entry exists or no state update was attempted,
 * otherwise the result of neigh_update().
 */
int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
{
        struct neigh_table *tbl = &arp_tbl;
        struct neighbour *neigh;
        int err = -ENXIO;

        neigh = neigh_lookup(&arp_tbl, &ip, dev);
        if (!neigh)
                return err;

        if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) {
                neigh_release(neigh);
                return 0;
        }

        if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP)
                err = neigh_update(neigh, NULL, NUD_FAILED,
                                   NEIGH_UPDATE_F_OVERRIDE |
                                   NEIGH_UPDATE_F_ADMIN, 0);

        /* The lookup reference is dropped before the removal attempt. */
        write_lock_bh(&tbl->lock);
        neigh_release(neigh);
        neigh_remove_one(neigh);
        write_unlock_bh(&tbl->lock);

        return err;
}

/*
 * Handle an ATF_PUBL "delete" request: a non-zero netmask deletes the
 * proxy neighbour entry for the protocol address, a zero netmask turns
 * proxy ARP off via arp_req_set_proxy().
 */
static int arp_req_delete_public(struct net *net, struct arpreq *r,
                struct net_device *dev)
{
        __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
        __be32 ip;

        if (!mask)
                return arp_req_set_proxy(net, dev, 0);

        ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;

        return pneigh_delete(&arp_tbl, net, &ip, dev);
}

static int arp_req_delete(struct net *net, struct arpreq *r)
{
        struct net_device *dev;
        __be32 ip;

        dev = arp_req_dev(net, r);
        if (IS_ERR(dev))
                return PTR_ERR(dev);

        if (r->arp_flags & ATF_PUBL)
                return arp_req_delete_public(net, r, dev);

        ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;

        return arp_invalidate(dev, ip, true);
}

/*
 *        Handle an ARP layer I/O control request.
 *
 *        Supports SIOCDARP, SIOCSARP and SIOCGARP.  The first two require
 *        CAP_NET_ADMIN in the namespace's user namespace and run under
 *        the per-net RTNL lock; SIOCGARP runs under rcu_read_lock() and
 *        copies the result back to user space.
 *
 *        Returns 0 on success or a negative errno.
 */

int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
        struct arpreq r;
        __be32 *netmask;
        int err;

        if (cmd != SIOCDARP && cmd != SIOCSARP && cmd != SIOCGARP)
                return -EINVAL;

        /* Only the modifying commands need the capability. */
        if (cmd != SIOCGARP && !ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (copy_from_user(&r, arg, sizeof(struct arpreq)))
                return -EFAULT;

        if (r.arp_pa.sa_family != AF_INET)
                return -EPFNOSUPPORT;

        /* ATF_NETMASK and ATF_DONTPUB are only accepted with ATF_PUBL. */
        if (!(r.arp_flags & ATF_PUBL) &&
            (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
                return -EINVAL;

        /* Without ATF_NETMASK use all-ones; otherwise only 0 or all-ones pass. */
        netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr;
        if (!(r.arp_flags & ATF_NETMASK))
                *netmask = htonl(0xFFFFFFFFUL);
        else if (*netmask && *netmask != htonl(0xFFFFFFFFUL))
                return -EINVAL;

        if (cmd == SIOCGARP) {
                rcu_read_lock();
                err = arp_req_get(net, &r);
                rcu_read_unlock();

                if (!err && copy_to_user(arg, &r, sizeof(r)))
                        err = -EFAULT;
                return err;
        }

        rtnl_net_lock(net);
        if (cmd == SIOCDARP)
                err = arp_req_delete(net, &r);
        else
                err = arp_req_set(net, &r);
        rtnl_net_unlock(net);

        return err;
}

/*
 * Netdevice notifier callback for ARP. Reacts to NETDEV_CHANGEADDR and
 * NETDEV_CHANGE; every other event is ignored. Always returns NOTIFY_DONE.
 */
static int arp_netdev_event(struct notifier_block *this, unsigned long event,
                            void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_CHANGEADDR) {
                neigh_changeaddr(&arp_tbl, dev);
                rt_cache_flush(dev_net(dev));
        } else if (event == NETDEV_CHANGE) {
                struct netdev_notifier_change_info *info = ptr;
                struct in_device *idev;
                bool evict;

                if (info->flags_changed & IFF_NOARP)
                        neigh_changeaddr(&arp_tbl, dev);

                /* With no in_device, eviction on carrier loss defaults to on. */
                idev = __in_dev_get_rtnl(dev);
                evict = !idev || IN_DEV_ARP_EVICT_NOCARRIER(idev);

                if (evict && !netif_carrier_ok(dev))
                        neigh_carrier_down(&arp_tbl, dev);
        }

        return NOTIFY_DONE;
}

/* Notifier block that delivers netdevice events to arp_netdev_event(). */
static struct notifier_block arp_netdev_notifier = {
        .notifier_call = arp_netdev_event,
};

/*
 * Take down the ARP neighbour entries of @dev via neigh_ifdown().
 *
 * Note that this is not on the notifier chain: this routine must be
 * called after the route cache has been flushed.
 */
void arp_ifdown(struct net_device *dev)
{
        neigh_ifdown(&arp_tbl, dev);
}


/*
 *        Packet handler for ETH_P_ARP frames: received packets of this
 *        type are passed to arp_rcv(). Registered by arp_init() with
 *        dev_add_pack().
 */

static struct packet_type arp_packet_type __read_mostly = {
        .type =        cpu_to_be16(ETH_P_ARP),
        .func =        arp_rcv,
};

#ifdef CONFIG_PROC_FS
#if IS_ENABLED(CONFIG_AX25)

/*
 *        ax25 -> ASCII conversion
 */
static void ax2asc2(ax25_address *a, char *buf)
{
        char *out = buf;
        int i, ssid;

        /* The first six bytes hold the callsign, each shifted left by one. */
        for (i = 0; i < 6; i++) {
                char ch = (a->ax25_call[i] >> 1) & 0x7F;

                /* Space padding is not copied. */
                if (ch == ' ')
                        continue;
                *out++ = ch;
        }

        /* The seventh byte carries a 4-bit number, printed as "-N" (0..15). */
        *out++ = '-';
        ssid = (a->ax25_call[6] >> 1) & 0x0F;
        if (ssid >= 10) {
                *out++ = '1';
                ssid -= 10;
        }
        *out++ = ssid + '0';
        *out++ = '\0';

        /* An empty callsign is shown as "*". */
        if (buf[0] == '\0' || buf[0] == '-') {
                buf[0] = '*';
                buf[1] = '\0';
        }
}
#endif /* CONFIG_AX25 */

/* Size of the buffer that receives the textual hardware address. */
#define HBUFFERLEN 30

/*
 * Emit one line describing neighbour @n into @seq: IP address, hardware
 * type, flags, hardware address, a "*" mask column and the device name.
 * n->lock is read-held while the entry is formatted.
 */
static void arp_format_neigh_entry(struct seq_file *seq,
                                   struct neighbour *n)
{
        char hbuffer[HBUFFERLEN];
        int k, j;
        char tbuf[16];
        struct net_device *dev = n->dev;
        int hatype = dev->type;

        read_lock(&n->lock);
        /* Convert hardware address to XX:XX:XX:XX ... form. */
#if IS_ENABLED(CONFIG_AX25)
        if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
                ax2asc2((ax25_address *)n->ha, hbuffer);
        else {
#endif
        /*
         * Each address byte takes three characters ("XX:"); the bound on k
         * keeps room for a full triple plus the terminator in hbuffer.
         */
        for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
                hbuffer[k++] = hex_asc_hi(n->ha[j]);
                hbuffer[k++] = hex_asc_lo(n->ha[j]);
                hbuffer[k++] = ':';
        }
        /* Overwrite the trailing ':' (if any byte was written) with NUL. */
        if (k != 0)
                --k;
        hbuffer[k] = 0;
#if IS_ENABLED(CONFIG_AX25)
        }
#endif
        sprintf(tbuf, "%pI4", n->primary_key);
        seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s     *        %s\n",
                   tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
        read_unlock(&n->lock);
}

/*
 * Emit one line describing proxy entry @n into @seq. Proxy entries are
 * always shown with flags ATF_PUBL | ATF_PERM and an all-zero hardware
 * address; an entry without a device is shown with type 0 and name "*".
 */
static void arp_format_pneigh_entry(struct seq_file *seq,
                                    struct pneigh_entry *n)
{
        struct net_device *dev = n->dev;
        const char *ifname = "*";
        int hatype = 0;
        char tbuf[16];

        if (dev) {
                hatype = dev->type;
                ifname = dev->name;
        }

        sprintf(tbuf, "%pI4", n->key);
        seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
                   tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
                   ifname);
}

/*
 * seq_file ->show callback: print the header for the start token,
 * otherwise format @v as a proxy or regular neighbour entry depending on
 * the iterator state. Always returns 0.
 */
static int arp_seq_show(struct seq_file *seq, void *v)
{
        struct neigh_seq_state *state;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "IP address       HW type     Flags       "
                              "HW address            Mask     Device\n");
                return 0;
        }

        state = seq->private;
        if (state->flags & NEIGH_SEQ_IS_PNEIGH)
                arp_format_pneigh_entry(seq, v);
        else
                arp_format_neigh_entry(seq, v);

        return 0;
}

/* seq_file ->start callback: begin iterating over arp_tbl at *@pos. */
static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
{
        /* We do not want to confuse "arp -a" with magic entries,
         * so we tell the generic iterator to skip NUD_NOARP.
         */
        return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
}

/*
 * seq_file operations for the "arp" proc entry: ARP-specific start and
 * show callbacks, generic neighbour iterator for next and stop.
 */
static const struct seq_operations arp_seq_ops = {
        .start        = arp_seq_start,
        .next        = neigh_seq_next,
        .stop        = neigh_seq_stop,
        .show        = arp_seq_show,
};
#endif /* CONFIG_PROC_FS */

/*
 * Per-netns setup: create the read-only "arp" entry in the namespace's
 * proc_net directory. Returns 0 on success or -ENOMEM if the entry could
 * not be created.
 */
static int __net_init arp_net_init(struct net *net)
{
        if (proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops,
                            sizeof(struct neigh_seq_state)))
                return 0;

        return -ENOMEM;
}

/* Per-netns teardown: remove the "arp" entry from the namespace's proc_net. */
static void __net_exit arp_net_exit(struct net *net)
{
        remove_proc_entry("arp", net->proc_net);
}

/* Per-network-namespace init/exit hooks, registered from arp_init(). */
static struct pernet_operations arp_net_ops = {
        .init = arp_net_init,
        .exit = arp_net_exit,
};

/*
 * Boot-time ARP initialisation: set up the ARP neighbour table, register
 * the ETH_P_ARP packet handler, the per-netns operations, the sysctl
 * entries (when CONFIG_SYSCTL is set) and the netdevice notifier.
 * The return values of the registration calls are not checked here.
 */
void __init arp_init(void)
{
        neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);

        dev_add_pack(&arp_packet_type);
        register_pernet_subsys(&arp_net_ops);
#ifdef CONFIG_SYSCTL
        neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
#endif
        register_netdevice_notifier(&arp_netdev_notifier);
}





































  396 

  396 











  396 
  396 

  396 







































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_LOCAL_LOCK_H
# error "Do not include directly, include linux/local_lock.h"
#endif

#include <linux/percpu-defs.h>
#include <linux/lockdep.h>

#ifndef CONFIG_PREEMPT_RT

/*
 * Non-PREEMPT_RT local lock. The structure only carries lockdep state
 * (the dep_map and the owning task) and has no members at all unless
 * CONFIG_DEBUG_LOCK_ALLOC is enabled.
 */
typedef struct {
#ifdef CONFIG_DEBUG_LOCK_ALLOC
        struct lockdep_map        dep_map;
        struct task_struct        *owner;
#endif
} local_lock_t;

/*
 * local_trylock() and local_trylock_irqsave() only work with local_trylock_t.
 *
 * llock has to stay the first member: the lock macros in this file cast
 * between local_lock_t * and local_trylock_t *. 'acquired' is written to 1
 * while the lock is held and back to 0 on release.
 */
typedef struct {
        local_lock_t        llock;
        u8                acquired;
} local_trylock_t;

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Static initialisers for the lockdep members (CONFIG_DEBUG_LOCK_ALLOC
 * builds): name the dep_map after the lock, mark it as a per-CPU lock
 * type and start with no owner.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)                \
        .dep_map = {                                        \
                .name = #lockname,                        \
                .wait_type_inner = LD_WAIT_CONFIG,        \
                .lock_type = LD_LOCK_PERCPU,                \
        },                                                \
        .owner = NULL,

/* Same as above, applied to the llock member of a local_trylock_t. */
# define LOCAL_TRYLOCK_DEBUG_INIT(lockname)                \
        .llock = { LOCAL_LOCK_DEBUG_INIT((lockname).llock) },

/*
 * Debug bookkeeping for taking @l: tell lockdep about the acquisition,
 * warn if the lock already has an owner, then record current as owner.
 */
static inline void local_lock_acquire(local_lock_t *l)
{
        lock_map_acquire(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Like local_lock_acquire(), but reports the acquisition to lockdep as a
 * trylock (lock_map_acquire_try()).
 */
static inline void local_trylock_acquire(local_lock_t *l)
{
        lock_map_acquire_try(&l->dep_map);
        DEBUG_LOCKS_WARN_ON(l->owner);
        l->owner = current;
}

/*
 * Debug bookkeeping for dropping @l: warn if the releasing task is not the
 * recorded owner, clear the owner, then tell lockdep about the release.
 */
static inline void local_lock_release(local_lock_t *l)
{
        DEBUG_LOCKS_WARN_ON(l->owner != current);
        l->owner = NULL;
        lock_map_release(&l->dep_map);
}

/* Runtime initialisation of the debug state: the lock starts unowned. */
static inline void local_lock_debug_init(local_lock_t *l)
{
        l->owner = NULL;
}
#else /* CONFIG_DEBUG_LOCK_ALLOC */
/*
 * Without CONFIG_DEBUG_LOCK_ALLOC there is no debug state: the static
 * initialisers expand to nothing and the helpers are empty.
 */
# define LOCAL_LOCK_DEBUG_INIT(lockname)
# define LOCAL_TRYLOCK_DEBUG_INIT(lockname)
static inline void local_lock_acquire(local_lock_t *l) { }
static inline void local_trylock_acquire(local_lock_t *l) { }
static inline void local_lock_release(local_lock_t *l) { }
static inline void local_lock_debug_init(local_lock_t *l) { }
#endif /* !CONFIG_DEBUG_LOCK_ALLOC */

/* Static initialisers for local_lock_t and local_trylock_t variables. */
#define INIT_LOCAL_LOCK(lockname)        { LOCAL_LOCK_DEBUG_INIT(lockname) }
#define INIT_LOCAL_TRYLOCK(lockname)        { LOCAL_TRYLOCK_DEBUG_INIT(lockname) }

/*
 * Runtime initialisation of a local lock.
 * @lock: pointer to the local_lock_t.
 *
 * Each expansion site gets its own static lock_class_key. The dep_map is
 * registered with lockdep as a per-CPU lock type (LD_LOCK_PERCPU) and the
 * debug owner is cleared.
 */
#define __local_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_PERCPU);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Runtime initialisation of a local_trylock_t through its embedded
 * local_lock_t.
 * @lock: pointer to the local_trylock_t.
 *
 * The argument is parenthesised and dereferenced explicitly, so any pointer
 * expression is accepted. Pasting "lock.llock" textually only compiled when
 * the argument was literally of the form "&x".
 */
#define __local_trylock_init(lock) __local_lock_init(&(lock)->llock)

/*
 * Same sequence as __local_lock_init(), except that the dep_map is
 * registered with lock type LD_LOCK_NORMAL instead of LD_LOCK_PERCPU.
 * @lock: pointer to the lock.
 */
#define __spinlock_nested_bh_init(lock)                                \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        debug_check_no_locks_freed((void *)lock, sizeof(*lock));\
        lockdep_init_map_type(&(lock)->dep_map, #lock, &__key,  \
                              0, LD_WAIT_CONFIG, LD_WAIT_INV,        \
                              LD_LOCK_NORMAL);                        \
        local_lock_debug_init(lock);                                \
} while (0)

/*
 * Common acquire step, used after preemption or interrupts were disabled.
 * @lock: per-CPU pointer to a local_lock_t or a local_trylock_t.
 *
 * _Generic selects on the static type of @lock: for a local_trylock_t the
 * 'acquired' flag is asserted to be clear and then set; for a plain
 * local_lock_t nothing extra is done. The debug bookkeeping in
 * local_lock_acquire() runs in both cases.
 */
#define __local_lock_acquire(lock)                                        \
        do {                                                                \
                local_trylock_t *tl;                                        \
                local_lock_t *l;                                        \
                                                                        \
                l = (local_lock_t *)this_cpu_ptr(lock);                        \
                tl = (local_trylock_t *)l;                                \
                _Generic((lock),                                        \
                        __percpu local_trylock_t *: ({                        \
                                lockdep_assert(tl->acquired == 0);        \
                                WRITE_ONCE(tl->acquired, 1);                \
                        }),                                                \
                        __percpu local_lock_t *: (void)0);                \
                local_lock_acquire(l);                                        \
        } while (0)

/* Take the lock with preemption disabled. */
#define __local_lock(lock)                                        \
        do {                                                        \
                preempt_disable();                                \
                __local_lock_acquire(lock);                        \
        } while (0)

/* Take the lock with local interrupts disabled. */
#define __local_lock_irq(lock)                                        \
        do {                                                        \
                local_irq_disable();                                \
                __local_lock_acquire(lock);                        \
        } while (0)

/* Take the lock, saving the interrupt state in @flags and disabling IRQs. */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                local_irq_save(flags);                                \
                __local_lock_acquire(lock);                        \
        } while (0)

/*
 * Try to take a local_trylock_t with preemption disabled.
 * Evaluates to non-zero on success. If 'acquired' is already set on this
 * CPU, preemption is re-enabled and the expression evaluates to 0.
 */
#define __local_trylock(lock)                                        \
        ({                                                        \
                local_trylock_t *tl;                                \
                                                                \
                preempt_disable();                                \
                tl = this_cpu_ptr(lock);                        \
                if (READ_ONCE(tl->acquired)) {                        \
                        preempt_enable();                        \
                        tl = NULL;                                \
                } else {                                        \
                        WRITE_ONCE(tl->acquired, 1);                \
                        local_trylock_acquire(                        \
                                (local_lock_t *)tl);                \
                }                                                \
                !!tl;                                                \
        })

/*
 * Try to take a local_trylock_t with interrupts disabled and their state
 * saved in @flags. Evaluates to non-zero on success. If 'acquired' is
 * already set on this CPU, the interrupt state is restored and the
 * expression evaluates to 0.
 */
#define __local_trylock_irqsave(lock, flags)                        \
        ({                                                        \
                local_trylock_t *tl;                                \
                                                                \
                local_irq_save(flags);                                \
                tl = this_cpu_ptr(lock);                        \
                if (READ_ONCE(tl->acquired)) {                        \
                        local_irq_restore(flags);                \
                        tl = NULL;                                \
                } else {                                        \
                        WRITE_ONCE(tl->acquired, 1);                \
                        local_trylock_acquire(                        \
                                (local_lock_t *)tl);                \
                }                                                \
                !!tl;                                                \
        })

/*
 * Common release step, the mirror image of __local_lock_acquire(): run the
 * debug bookkeeping first, then, for a local_trylock_t only, assert that
 * 'acquired' is set and clear it.
 * @lock: per-CPU pointer to a local_lock_t or a local_trylock_t.
 */
#define __local_lock_release(lock)                                        \
        do {                                                                \
                local_trylock_t *tl;                                        \
                local_lock_t *l;                                        \
                                                                        \
                l = (local_lock_t *)this_cpu_ptr(lock);                        \
                tl = (local_trylock_t *)l;                                \
                local_lock_release(l);                                        \
                _Generic((lock),                                        \
                        __percpu local_trylock_t *: ({                        \
                                lockdep_assert(tl->acquired == 1);        \
                                WRITE_ONCE(tl->acquired, 0);                \
                        }),                                                \
                        __percpu local_lock_t *: (void)0);                \
        } while (0)

/* Release the lock, then re-enable preemption. */
#define __local_unlock(lock)                                        \
        do {                                                        \
                __local_lock_release(lock);                        \
                preempt_enable();                                \
        } while (0)

/* Release the lock, then re-enable local interrupts. */
#define __local_unlock_irq(lock)                                \
        do {                                                        \
                __local_lock_release(lock);                        \
                local_irq_enable();                                \
        } while (0)

/* Release the lock, then restore the interrupt state saved in @flags. */
#define __local_unlock_irqrestore(lock, flags)                        \
        do {                                                        \
                __local_lock_release(lock);                        \
                local_irq_restore(flags);                        \
        } while (0)

/*
 * Lock variant for callers that already run in softirq context (asserted
 * through lockdep). Only the debug bookkeeping is performed; preemption
 * and interrupt state are not touched here.
 */
#define __local_lock_nested_bh(lock)                                \
        do {                                                        \
                lockdep_assert_in_softirq();                        \
                local_lock_acquire(this_cpu_ptr(lock));        \
        } while (0)

/* Counterpart of __local_lock_nested_bh(): debug bookkeeping only. */
#define __local_unlock_nested_bh(lock)                                \
        local_lock_release(this_cpu_ptr(lock))

#else /* !CONFIG_PREEMPT_RT */

/*
 * On PREEMPT_RT local_lock maps to a per CPU spinlock, which protects the
 * critical section while staying preemptible.
 *
 * local_trylock_t is the same spinlock type here, so both lock flavours
 * share the macros below.
 */
typedef spinlock_t local_lock_t;
typedef spinlock_t local_trylock_t;

#define INIT_LOCAL_LOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))
#define INIT_LOCAL_TRYLOCK(lockname) __LOCAL_SPIN_LOCK_UNLOCKED((lockname))

/* PREEMPT_RT: runtime initialisation of the underlying spinlock. */
#define __local_lock_init(l)                                        \
        do {                                                        \
                local_spin_lock_init((l));                        \
        } while (0)

/* Both lock flavours are spinlocks on PREEMPT_RT, so initialisation is shared. */
#define __local_trylock_init(l)                        __local_lock_init(l)

/* PREEMPT_RT: disable migration, then take this CPU's spinlock. */
#define __local_lock(__lock)                                        \
        do {                                                        \
                migrate_disable();                                \
                spin_lock(this_cpu_ptr((__lock)));                \
        } while (0)

/* PREEMPT_RT: the _irq variant does not disable interrupts. */
#define __local_lock_irq(lock)                        __local_lock(lock)

/*
 * PREEMPT_RT: the _irqsave variant does not disable interrupts either;
 * @flags is type-checked and set to 0 so that it is initialised.
 */
#define __local_lock_irqsave(lock, flags)                        \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                flags = 0;                                        \
                __local_lock(lock);                                \
        } while (0)

/* PREEMPT_RT: drop this CPU's spinlock, then re-enable migration. */
#define __local_unlock(__lock)                                        \
        do {                                                        \
                spin_unlock(this_cpu_ptr((__lock)));                \
                migrate_enable();                                \
        } while (0)

/* PREEMPT_RT: the _irq and _irqrestore variants are plain unlocks; @flags is unused. */
#define __local_unlock_irq(lock)                __local_unlock(lock)

#define __local_unlock_irqrestore(lock, flags)        __local_unlock(lock)

/*
 * PREEMPT_RT: nested-BH lock. Asserts softirq context through lockdep and
 * takes this CPU's spinlock; migrate_disable() is not called here.
 */
#define __local_lock_nested_bh(lock)                                \
do {                                                                \
        lockdep_assert_in_softirq_func();                        \
        spin_lock(this_cpu_ptr(lock));                                \
} while (0)

/* PREEMPT_RT: counterpart of __local_lock_nested_bh(). */
#define __local_unlock_nested_bh(lock)                                \
do {                                                                \
        spin_unlock(this_cpu_ptr((lock)));                        \
} while (0)

/*
 * PREEMPT_RT trylock. Always fails (evaluates to 0) in NMI or hard
 * interrupt context. Otherwise migration is disabled and
 * spin_trylock() is attempted; migration is re-enabled if that fails.
 * Evaluates to the spin_trylock() result on that path.
 */
#define __local_trylock(lock)                                        \
        ({                                                        \
                int __locked;                                        \
                                                                \
                if (in_nmi() | in_hardirq()) {                        \
                        __locked = 0;                                \
                } else {                                        \
                        migrate_disable();                        \
                        __locked = spin_trylock(this_cpu_ptr((lock)));        \
                        if (!__locked)                                \
                                migrate_enable();                \
                }                                                \
                __locked;                                        \
        })

/*
 * PREEMPT_RT: interrupts are not disabled; @flags is type-checked and set
 * to 0, and the result is that of __local_trylock().
 */
#define __local_trylock_irqsave(lock, flags)                        \
        ({                                                        \
                typecheck(unsigned long, flags);                \
                flags = 0;                                        \
                __local_trylock(lock);                                \
        })

#endif /* CONFIG_PREEMPT_RT */





























































































































































   35 











1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/slab.h>
#include <linux/fs_struct.h>
#include "internal.h"

/*
 * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
 * It can block.
 */
void set_fs_root(struct fs_struct *fs, const struct path *path)
{
        struct path old_root;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_root = fs->root;
        fs->root = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);
        if (old_root.dentry)
                path_put(&old_root);
}

/*
 * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
 * It can block.
 */
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
        struct path old_pwd;

        path_get(path);
        spin_lock(&fs->lock);
        write_seqcount_begin(&fs->seq);
        old_pwd = fs->pwd;
        fs->pwd = *path;
        write_seqcount_end(&fs->seq);
        spin_unlock(&fs->lock);

        if (old_pwd.dentry)
                path_put(&old_pwd);
}

static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
{
        if (likely(p->dentry != old->dentry || p->mnt != old->mnt))
                return 0;
        *p = *new;
        return 1;
}

/*
 * Walk every thread in the system and replace any root or pwd that equals
 * @old_root with @new_root. One reference on @new_root is taken per
 * replacement; the matching references on @old_root are dropped after
 * tasklist_lock has been released.
 */
void chroot_fs_refs(const struct path *old_root, const struct path *new_root)
{
        struct task_struct *g, *p;
        struct fs_struct *fs;
        int replaced = 0;

        read_lock(&tasklist_lock);
        for_each_process_thread(g, p) {
                task_lock(p);
                fs = p->fs;
                if (fs) {
                        int hits, i;

                        spin_lock(&fs->lock);
                        write_seqcount_begin(&fs->seq);
                        hits = replace_path(&fs->root, old_root, new_root);
                        hits += replace_path(&fs->pwd, old_root, new_root);
                        write_seqcount_end(&fs->seq);
                        for (i = 0; i < hits; i++)
                                path_get(new_root);
                        replaced += hits;
                        spin_unlock(&fs->lock);
                }
                task_unlock(p);
        }
        read_unlock(&tasklist_lock);

        for (; replaced > 0; replaced--)
                path_put(old_root);
}

/*
 * Drop the references @fs holds on its root and pwd, then return the
 * structure to fs_cachep.
 */
void free_fs_struct(struct fs_struct *fs)
{
        path_put(&fs->root);
        path_put(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
}

/*
 * Detach @tsk from its fs_struct: clear tsk->fs, drop one user and free
 * the structure if that was the last user. A task without an fs_struct is
 * left alone.
 */
void exit_fs(struct task_struct *tsk)
{
        struct fs_struct *fs = tsk->fs;
        int last_user;

        if (!fs)
                return;

        task_lock(tsk);
        spin_lock(&fs->lock);
        tsk->fs = NULL;
        last_user = (--fs->users == 0);
        spin_unlock(&fs->lock);
        task_unlock(tsk);

        /* Free outside the locks. */
        if (last_user)
                free_fs_struct(fs);
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
        struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
        /* We don't need to lock fs - think why ;-) */
        if (fs) {
                fs->users = 1;
                fs->in_exec = 0;
                spin_lock_init(&fs->lock);
                seqcount_spinlock_init(&fs->seq, &fs->lock);
                fs->umask = old->umask;

                spin_lock(&old->lock);
                fs->root = old->root;
                path_get(&fs->root);
                fs->pwd = old->pwd;
                path_get(&fs->pwd);
                spin_unlock(&old->lock);
        }
        return fs;
}

/*
 * Give the current task a private copy of its fs_struct. The old structure
 * loses one user and is freed if that was the last one.
 * Returns 0 on success or -ENOMEM if the copy could not be allocated.
 */
int unshare_fs_struct(void)
{
        struct fs_struct *old_fs = current->fs;
        struct fs_struct *copy = copy_fs_struct(old_fs);
        int last_user;

        if (!copy)
                return -ENOMEM;

        task_lock(current);
        spin_lock(&old_fs->lock);
        last_user = (--old_fs->users == 0);
        current->fs = copy;
        spin_unlock(&old_fs->lock);
        task_unlock(current);

        /* Free outside the locks. */
        if (last_user)
                free_fs_struct(old_fs);

        return 0;
}
EXPORT_SYMBOL_GPL(unshare_fs_struct);

/* Return the umask stored in the current task's fs_struct (read without locking). */
int current_umask(void)
{
        return current->fs->umask;
}
EXPORT_SYMBOL(current_umask);

/*
 * Statically initialised fs_struct: one user, unlocked lock, zeroed
 * seqcount and a umask of 0022. root and pwd are not set here.
 * To be mentioned only in INIT_TASK.
 */
struct fs_struct init_fs = {
        .users                = 1,
        .lock                = __SPIN_LOCK_UNLOCKED(init_fs.lock),
        .seq                = SEQCNT_SPINLOCK_ZERO(init_fs.seq, &init_fs.lock),
        .umask                = 0022,
};




















































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/atomic.h
 *
 * Copyright (C) 1996 Russell King.
 * Copyright (C) 2002 Deep Blue Solutions Ltd.
 * Copyright (C) 2012 ARM Ltd.
 */

#ifndef __ASM_ATOMIC_LL_SC_H
#define __ASM_ATOMIC_LL_SC_H

#include <linux/stringify.h>

/*
 * If the compiler lacks the "K" constraint (CONFIG_CC_HAS_K_CONSTRAINT is
 * not set), define K as an empty macro. The ops below that pass K as their
 * constraint then presumably end up with a plain "r" operand constraint;
 * confirm against the expansion rules of __stringify().
 */
#ifndef CONFIG_CC_HAS_K_CONSTRAINT
#define K
#endif

/*
 * AArch64 UP and SMP safe atomic ops.  We use load exclusive and
 * store exclusive to ensure that these are atomic.  We may loop
 * to ensure that the update happens.
 */

/*
 * Generate __ll_sc_atomic_<op>(i, v): apply <asm_op> with operand @i to
 * v->counter and return nothing.
 *
 * The loop loads the counter exclusively (ldxr), applies the operation,
 * stores the result exclusively (stxr) and retries while the store status
 * in tmp is non-zero. No barrier instruction and no "memory" clobber is
 * emitted. @constraint is an extra constraint letter placed in front of
 * "r" for the @i operand.
 */
#define ATOMIC_OP(op, asm_op, constraint)                                \
static __always_inline void                                                \
__ll_sc_atomic_##op(int i, atomic_t *v)                                        \
{                                                                        \
        unsigned long tmp;                                                \
        int result;                                                        \
                                                                        \
        asm volatile("// atomic_" #op "\n"                                \
        "        prfm        pstl1strm, %2\n"                                \
        "1:        ldxr        %w0, %2\n"                                        \
        "        " #asm_op "        %w0, %w0, %w3\n"                        \
        "        stxr        %w1, %w0, %2\n"                                        \
        "        cbnz        %w1, 1b\n"                                        \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)                \
        : __stringify(constraint) "r" (i));                                \
}

/*
 * Generate __ll_sc_atomic_<op>_return<name>(i, v): apply <asm_op> with
 * operand @i to v->counter and return the value that was stored.
 *
 * @name: suffix of the generated function (ordering variant)
 * @mb:   instruction text appended after the loop (may be empty)
 * @acq:  letter inserted into the load mnemonic ("ld" acq "xr")
 * @rel:  letter inserted into the store mnemonic ("st" rel "xr")
 * @cl:   clobber list of the asm statement (may be empty)
 */
#define ATOMIC_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
static __always_inline int                                                \
__ll_sc_atomic_##op##_return##name(int i, atomic_t *v)                        \
{                                                                        \
        unsigned long tmp;                                                \
        int result;                                                        \
                                                                        \
        asm volatile("// atomic_" #op "_return" #name "\n"                \
        "        prfm        pstl1strm, %2\n"                                \
        "1:        ld" #acq "xr        %w0, %2\n"                                \
        "        " #asm_op "        %w0, %w0, %w3\n"                        \
        "        st" #rel "xr        %w1, %w0, %2\n"                                \
        "        cbnz        %w1, 1b\n"                                        \
        "        " #mb                                                        \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)                \
        : __stringify(constraint) "r" (i)                                \
        : cl);                                                                \
                                                                        \
        return result;                                                        \
}

/*
 * Generate __ll_sc_atomic_fetch_<op><name>(i, v): apply <asm_op> with
 * operand @i to v->counter and return the value that was loaded, i.e. the
 * counter before the operation. The new value is computed into the
 * separate local 'val' and stored from there.
 *
 * The @name, @mb, @acq, @rel and @cl parameters have the same meaning as
 * for ATOMIC_OP_RETURN.
 */
#define ATOMIC_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint) \
static __always_inline int                                                \
__ll_sc_atomic_fetch_##op##name(int i, atomic_t *v)                        \
{                                                                        \
        unsigned long tmp;                                                \
        int val, result;                                                \
                                                                        \
        asm volatile("// atomic_fetch_" #op #name "\n"                        \
        "        prfm        pstl1strm, %3\n"                                \
        "1:        ld" #acq "xr        %w0, %3\n"                                \
        "        " #asm_op "        %w1, %w0, %w4\n"                        \
        "        st" #rel "xr        %w2, %w1, %3\n"                                \
        "        cbnz        %w2, 1b\n"                                        \
        "        " #mb                                                        \
        : "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)        \
        : __stringify(constraint) "r" (i)                                \
        : cl);                                                                \
                                                                        \
        return result;                                                        \
}

/*
 * ATOMIC_OPS(op, asm_op, constraint) - instantiate the whole 32-bit family
 * for one arithmetic operation.
 *
 * For each operation this expands ATOMIC_OP once, plus four variants each of
 * ATOMIC_OP_RETURN and ATOMIC_FETCH_OP. The four variants differ only in the
 * leading (name, mb, acq, rel, cl) arguments:
 *
 *   <no suffix> : mb = "dmb ish", rel = l, clobbers "memory"
 *   _relaxed    : no barrier, no acquire/release, no clobber
 *   _acquire    : acq = a, clobbers "memory"
 *   _release    : rel = l, clobbers "memory"
 *
 * ATOMIC_OP and ATOMIC_OP_RETURN are defined earlier in this file.
 *
 * The third argument is an inline-asm constraint letter for the operand i;
 * I and J are presumably GCC's AArch64 add/sub immediate constraints --
 * confirm against the GCC machine-constraints documentation.
 */
#define ATOMIC_OPS(...)                                                        \
        ATOMIC_OP(__VA_ARGS__)                                                \
        ATOMIC_OP_RETURN(        , dmb ish,  , l, "memory", __VA_ARGS__)\
        ATOMIC_OP_RETURN(_relaxed,        ,  ,  ,         , __VA_ARGS__)\
        ATOMIC_OP_RETURN(_acquire,        , a,  , "memory", __VA_ARGS__)\
        ATOMIC_OP_RETURN(_release,        ,  , l, "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (        , dmb ish,  , l, "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_relaxed,        ,  ,  ,         , __VA_ARGS__)\
        ATOMIC_FETCH_OP (_acquire,        , a,  , "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_release,        ,  , l, "memory", __VA_ARGS__)

ATOMIC_OPS(add, add, I)
ATOMIC_OPS(sub, sub, J)

/*
 * Redefine ATOMIC_OPS for the bitwise operations: these get ATOMIC_OP and
 * the four ATOMIC_FETCH_OP variants, but no ATOMIC_OP_RETURN variants.
 *
 * NOTE(review): K is #undef'd at the end of this file, so it is presumably
 * a macro defined earlier rather than always the literal constraint letter;
 * check its definition before relying on the constraint being present.
 */
#undef ATOMIC_OPS
#define ATOMIC_OPS(...)                                                        \
        ATOMIC_OP(__VA_ARGS__)                                                \
        ATOMIC_FETCH_OP (        , dmb ish,  , l, "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_relaxed,        ,  ,  ,         , __VA_ARGS__)\
        ATOMIC_FETCH_OP (_acquire,        , a,  , "memory", __VA_ARGS__)\
        ATOMIC_FETCH_OP (_release,        ,  , l, "memory", __VA_ARGS__)

ATOMIC_OPS(and, and, K)
ATOMIC_OPS(or, orr, K)
ATOMIC_OPS(xor, eor, K)
/*
 * GAS converts the mysterious and undocumented BIC (immediate) alias to
 * an AND (immediate) instruction with the immediate inverted. We don't
 * have a constraint for this, so fall back to register.
 */
ATOMIC_OPS(andnot, bic, )

/* The 32-bit generator macros are not needed past this point. */
#undef ATOMIC_OPS
#undef ATOMIC_FETCH_OP
#undef ATOMIC_OP_RETURN
#undef ATOMIC_OP

/*
 * ATOMIC64_OP() - generate a 64-bit LL/SC read-modify-write with no result.
 *
 * Expands to void __ll_sc_atomic64_<op>(s64 i, atomic64_t *v).
 *
 * Macro parameters:
 *   op         - operation name used in the function name and asm comment.
 *   asm_op     - mnemonic of the instruction applied to the loaded value.
 *   constraint - stringified and prepended to "r" for the input operand i.
 *
 * Generated sequence: prefetch hint, load-exclusive the counter into result,
 * result = result <asm_op> i, store-exclusive result with the status in tmp,
 * and loop back to the load while the status is non-zero. No barrier is
 * emitted and the asm statement has no clobber list.
 */
#define ATOMIC64_OP(op, asm_op, constraint)                                \
static __always_inline void                                                \
__ll_sc_atomic64_##op(s64 i, atomic64_t *v)                                \
{                                                                        \
        s64 result;                                                        \
        unsigned long tmp;                                                \
                                                                        \
        asm volatile("// atomic64_" #op "\n"                                \
        "        prfm        pstl1strm, %2\n"                                \
        "1:        ldxr        %0, %2\n"                                        \
        "        " #asm_op "        %0, %0, %3\n"                                \
        "        stxr        %w1, %0, %2\n"                                        \
        "        cbnz        %w1, 1b"                                        \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)                \
        : __stringify(constraint) "r" (i));                                \
}

/*
 * ATOMIC64_OP_RETURN() - generate a 64-bit LL/SC "<op> and return" function.
 *
 * Expands to long __ll_sc_atomic64_<op>_return<name>(s64 i, atomic64_t *v),
 * which returns the *new* value, i.e. the value that was stored back.
 *
 * Macro parameters:
 *   name       - suffix appended to the function name (may be empty).
 *   mb         - instruction emitted after the loop (may be empty).
 *   acq, rel   - letters spliced into "ld<acq>xr" / "st<rel>xr" (may be empty).
 *   cl         - clobber list for the asm statement (may be empty).
 *   op         - operation name used in the function name and asm comment.
 *   asm_op     - mnemonic of the instruction applied to the loaded value.
 *   constraint - stringified and prepended to "r" for the input operand i.
 *
 * Generated sequence: prefetch hint, load-exclusive into result,
 * result = result <asm_op> i, store-exclusive result with the status in tmp,
 * retry while the status is non-zero, then emit <mb>, if any.
 */
#define ATOMIC64_OP_RETURN(name, mb, acq, rel, cl, op, asm_op, constraint)\
static __always_inline long                                                \
__ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)                \
{                                                                        \
        s64 result;                                                        \
        unsigned long tmp;                                                \
                                                                        \
        asm volatile("// atomic64_" #op "_return" #name "\n"                \
        "        prfm        pstl1strm, %2\n"                                \
        "1:        ld" #acq "xr        %0, %2\n"                                \
        "        " #asm_op "        %0, %0, %3\n"                                \
        "        st" #rel "xr        %w1, %0, %2\n"                                \
        "        cbnz        %w1, 1b\n"                                        \
        "        " #mb                                                        \
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)                \
        : __stringify(constraint) "r" (i)                                \
        : cl);                                                                \
                                                                        \
        return result;                                                        \
}

/*
 * ATOMIC64_FETCH_OP() - generate a 64-bit LL/SC "fetch and <op>" function.
 *
 * Expands to long __ll_sc_atomic64_fetch_<op><name>(s64 i, atomic64_t *v),
 * which returns the value of v->counter as loaded *before* the operation.
 *
 * Macro parameters:
 *   name       - suffix appended to the function name (may be empty).
 *   mb         - instruction emitted after the loop (may be empty).
 *   acq, rel   - letters spliced into "ld<acq>xr" / "st<rel>xr" (may be empty).
 *   cl         - clobber list for the asm statement (may be empty).
 *   op         - operation name used in the function name and asm comment.
 *   asm_op     - mnemonic of the instruction that combines old value and i.
 *   constraint - stringified and prepended to "r" for the input operand i.
 *
 * Generated sequence: prefetch hint, load-exclusive the old value into
 * result (%0), val (%1) = result <asm_op> i (%4), store-exclusive val with
 * the status in tmp (%w2), retry while the status is non-zero, then emit
 * <mb>, if any.
 */
#define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
static __always_inline long                                                \
__ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v)                        \
{                                                                        \
        s64 result, val;                                                \
        unsigned long tmp;                                                \
                                                                        \
        asm volatile("// atomic64_fetch_" #op #name "\n"                \
        "        prfm        pstl1strm, %3\n"                                \
        "1:        ld" #acq "xr        %0, %3\n"                                \
        "        " #asm_op "        %1, %0, %4\n"                                \
        "        st" #rel "xr        %w2, %1, %3\n"                                \
        "        cbnz        %w2, 1b\n"                                        \
        "        " #mb                                                        \
        : "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)        \
        : __stringify(constraint) "r" (i)                                \
        : cl);                                                                \
                                                                        \
        return result;                                                        \
}

/*
 * ATOMIC64_OPS(op, asm_op, constraint) - instantiate the whole 64-bit family
 * for one arithmetic operation.
 *
 * Expands ATOMIC64_OP once, plus four variants each of ATOMIC64_OP_RETURN
 * and ATOMIC64_FETCH_OP. The variants differ only in the leading
 * (name, mb, acq, rel, cl) arguments:
 *
 *   <no suffix> : mb = "dmb ish", rel = l, clobbers "memory"
 *   _relaxed    : no barrier, no acquire/release, no clobber
 *   _acquire    : acq = a, clobbers "memory"
 *   _release    : rel = l, clobbers "memory"
 *
 * The third argument is the inline-asm constraint letter for the operand i.
 */
#define ATOMIC64_OPS(...)                                                \
        ATOMIC64_OP(__VA_ARGS__)                                        \
        ATOMIC64_OP_RETURN(, dmb ish,  , l, "memory", __VA_ARGS__)        \
        ATOMIC64_OP_RETURN(_relaxed,,  ,  ,         , __VA_ARGS__)        \
        ATOMIC64_OP_RETURN(_acquire,, a,  , "memory", __VA_ARGS__)        \
        ATOMIC64_OP_RETURN(_release,,  , l, "memory", __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (, dmb ish,  , l, "memory", __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (_relaxed,,  ,  ,         , __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (_acquire,, a,  , "memory", __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (_release,,  , l, "memory", __VA_ARGS__)

ATOMIC64_OPS(add, add, I)
ATOMIC64_OPS(sub, sub, J)

/*
 * Redefine ATOMIC64_OPS for the bitwise operations: these get ATOMIC64_OP
 * and the four ATOMIC64_FETCH_OP variants, but no ATOMIC64_OP_RETURN
 * variants.
 */
#undef ATOMIC64_OPS
#define ATOMIC64_OPS(...)                                                \
        ATOMIC64_OP(__VA_ARGS__)                                        \
        ATOMIC64_FETCH_OP (, dmb ish,  , l, "memory", __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (_relaxed,,  ,  ,         , __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (_acquire,, a,  , "memory", __VA_ARGS__)        \
        ATOMIC64_FETCH_OP (_release,,  , l, "memory", __VA_ARGS__)

ATOMIC64_OPS(and, and, L)
ATOMIC64_OPS(or, orr, L)
ATOMIC64_OPS(xor, eor, L)
/*
 * GAS converts the mysterious and undocumented BIC (immediate) alias to
 * an AND (immediate) instruction with the immediate inverted. We don't
 * have a constraint for this, so fall back to register.
 */
ATOMIC64_OPS(andnot, bic, )

/* The 64-bit generator macros are not needed past this point. */
#undef ATOMIC64_OPS
#undef ATOMIC64_FETCH_OP
#undef ATOMIC64_OP_RETURN
#undef ATOMIC64_OP

/*
 * __ll_sc_atomic64_dec_if_positive() - decrement @v unless the result would
 * be negative.
 * @v: the 64-bit atomic counter to operate on.
 *
 * Sequence:
 *   1. prfm: prefetch hint on the counter.
 *   2. ldxr: load-exclusive the counter into result.
 *   3. subs: result = result - 1, setting the condition flags.
 *   4. b.lt: if the subtraction compares "less than", jump to label 2,
 *      skipping both the store and the barrier.
 *   5. stlxr / cbnz: store-exclusive result; retry from step 2 while the
 *      status in tmp is non-zero.
 *   6. dmb ish: barrier, only reached after a successful store.
 *
 * The asm clobbers the condition flags ("cc") and "memory".
 *
 * Return: the loaded value minus one, whether or not it was stored. A
 * negative return therefore means the counter was left unchanged.
 */
static __always_inline s64
__ll_sc_atomic64_dec_if_positive(atomic64_t *v)
{
        s64 result;
        unsigned long tmp;

        asm volatile("// atomic64_dec_if_positive\n"
        "        prfm        pstl1strm, %2\n"
        "1:        ldxr        %0, %2\n"
        "        subs        %0, %0, #1\n"
        "        b.lt        2f\n"
        "        stlxr        %w1, %0, %2\n"
        "        cbnz        %w1, 1b\n"
        "        dmb        ish\n"
        "2:"
        : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
        :
        : "cc", "memory");

        return result;
}

/*
 * __CMPXCHG_CASE() - generate one LL/SC compare-and-exchange function.
 *
 * Expands to u<sz> __ll_sc__cmpxchg_case_<name><sz>(ptr, old, new).
 *
 * Macro parameters:
 *   w          - register-width modifier spliced into operand references
 *                ("w" or empty).
 *   sfx        - size suffix spliced into the ld/st-exclusive mnemonics
 *                (b, h or empty).
 *   name       - prefix placed before <sz> in the function name.
 *   sz         - operand size in bits; selects the u<sz> type.
 *   mb         - instruction emitted after a successful store (may be empty).
 *   acq, rel   - letters spliced into "ld<acq>xr" / "st<rel>xr" (may be empty).
 *   cl         - clobber list for the asm statement (may be empty).
 *   constraint - stringified and prepended to "r" for the operand old.
 *
 * Generated sequence:
 *   1. prfm: prefetch hint on *ptr.
 *   2. load-exclusive *ptr into oldval.
 *   3. tmp = oldval ^ old; if non-zero (mismatch) jump to label 2, skipping
 *      the store and <mb>.
 *   4. store-exclusive new; retry from step 2 while the status in tmp is
 *      non-zero.
 *   5. emit <mb>, if any.
 *
 * Returns the value loaded from *ptr; the exchange happened only if that
 * value equals old.
 */
#define __CMPXCHG_CASE(w, sfx, name, sz, mb, acq, rel, cl, constraint)        \
static __always_inline u##sz                                                \
__ll_sc__cmpxchg_case_##name##sz(volatile void *ptr,                        \
                                         unsigned long old,                \
                                         u##sz new)                        \
{                                                                        \
        unsigned long tmp;                                                \
        u##sz oldval;                                                        \
                                                                        \
        /*                                                                \
         * Sub-word sizes require explicit casting so that the compare  \
         * part of the cmpxchg doesn't end up interpreting non-zero        \
         * upper bits of the register containing "old".                        \
         */                                                                \
        if (sz < 32)                                                        \
                old = (u##sz)old;                                        \
                                                                        \
        asm volatile(                                                        \
        "        prfm        pstl1strm, %[v]\n"                                \
        "1:        ld" #acq "xr" #sfx "\t%" #w "[oldval], %[v]\n"                \
        "        eor        %" #w "[tmp], %" #w "[oldval], %" #w "[old]\n"        \
        "        cbnz        %" #w "[tmp], 2f\n"                                \
        "        st" #rel "xr" #sfx "\t%w[tmp], %" #w "[new], %[v]\n"        \
        "        cbnz        %w[tmp], 1b\n"                                        \
        "        " #mb "\n"                                                \
        "2:"                                                                \
        : [tmp] "=&r" (tmp), [oldval] "=&r" (oldval),                        \
          [v] "+Q" (*(u##sz *)ptr)                                        \
        : [old] __stringify(constraint) "r" (old), [new] "r" (new)        \
        : cl);                                                                \
                                                                        \
        return oldval;                                                        \
}

/*
 * Earlier versions of GCC (no later than 8.1.0) appear to incorrectly
 * handle the 'K' constraint for the value 4294967295 - thus we use no
 * constraint for 32 bit operations.
 *
 * NOTE(review): the instantiations below still pass K. K is #undef'd at
 * the end of this file, so it is presumably a macro that can expand to
 * nothing on affected compilers -- confirm against its definition.
 *
 * Columns: w, sfx, name, sz, mb, acq, rel, cl, constraint.
 * Four ordering flavours (no prefix, acq_, rel_, mb_) are generated for
 * each of the 8-, 16-, 32- and 64-bit sizes.
 */
__CMPXCHG_CASE(w, b,     ,  8,        ,  ,  ,         , K)
__CMPXCHG_CASE(w, h,     , 16,        ,  ,  ,         , K)
__CMPXCHG_CASE(w,  ,     , 32,        ,  ,  ,         , K)
__CMPXCHG_CASE( ,  ,     , 64,        ,  ,  ,         , L)
__CMPXCHG_CASE(w, b, acq_,  8,        , a,  , "memory", K)
__CMPXCHG_CASE(w, h, acq_, 16,        , a,  , "memory", K)
__CMPXCHG_CASE(w,  , acq_, 32,        , a,  , "memory", K)
__CMPXCHG_CASE( ,  , acq_, 64,        , a,  , "memory", L)
__CMPXCHG_CASE(w, b, rel_,  8,        ,  , l, "memory", K)
__CMPXCHG_CASE(w, h, rel_, 16,        ,  , l, "memory", K)
__CMPXCHG_CASE(w,  , rel_, 32,        ,  , l, "memory", K)
__CMPXCHG_CASE( ,  , rel_, 64,        ,  , l, "memory", L)
__CMPXCHG_CASE(w, b,  mb_,  8, dmb ish,  , l, "memory", K)
__CMPXCHG_CASE(w, h,  mb_, 16, dmb ish,  , l, "memory", K)
__CMPXCHG_CASE(w,  ,  mb_, 32, dmb ish,  , l, "memory", K)
__CMPXCHG_CASE( ,  ,  mb_, 64, dmb ish,  , l, "memory", L)

#undef __CMPXCHG_CASE

/*
 * Overlay that lets a 128-bit value be accessed either whole (full) or as
 * two 64-bit halves (low, high), so each half can be bound to its own
 * register operand in inline asm.
 *
 * NOTE(review): which half of "full" each of low/high aliases depends on
 * byte order; the field order presumably assumes little-endian -- confirm.
 */
union __u128_halves {
        u128 full;
        struct {
                u64 low, high;
        };
};

/*
 * __CMPXCHG128() - generate a 128-bit LL/SC compare-and-exchange function.
 *
 * Expands to u128 __ll_sc__cmpxchg128<name>(ptr, old, new).
 *
 * Macro parameters:
 *   name - suffix appended to the function name (may be empty).
 *   mb   - instruction emitted after a successful store (may be empty).
 *   rel  - letter spliced into "st<rel>xp" (may be empty).
 *   cl   - optional extra clobbers, appended after "cc".
 *
 * old and new are split into 64-bit halves through union __u128_halves.
 *
 * Generated sequence:
 *   1. prfm: prefetch hint on *ptr.
 *   2. ldxp: load-exclusive the pair into r.low / r.high.
 *   3. cmp + ccmp: compare r.low with o.low, then (only if equal) r.high
 *      with o.high; b.ne jumps to label 2 on any mismatch, skipping the
 *      store and <mb>.
 *   4. st<rel>xp: store-exclusive n.low / n.high; retry from step 2 while
 *      the status in tmp is non-zero.
 *   5. emit <mb>, if any.
 *
 * Returns the 128-bit value that was loaded from *ptr.
 *
 * Two flavours are instantiated below: an unsuffixed one with no barrier,
 * no release store and no extra clobber, and _mb with "dmb ish", a release
 * store and a "memory" clobber.
 */
#define __CMPXCHG128(name, mb, rel, cl...)                             \
static __always_inline u128                                                \
__ll_sc__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new)        \
{                                                                        \
        union __u128_halves r, o = { .full = (old) },                        \
                               n = { .full = (new) };                        \
       unsigned int tmp;                                               \
                                                                        \
        asm volatile("// __cmpxchg128" #name "\n"                        \
       "       prfm    pstl1strm, %[v]\n"                              \
       "1:     ldxp    %[rl], %[rh], %[v]\n"                           \
       "       cmp     %[rl], %[ol]\n"                                 \
       "       ccmp    %[rh], %[oh], 0, eq\n"                          \
       "       b.ne    2f\n"                                           \
       "       st" #rel "xp    %w[tmp], %[nl], %[nh], %[v]\n"          \
       "       cbnz    %w[tmp], 1b\n"                                  \
        "        " #mb "\n"                                                \
        "2:"                                                                \
       : [v] "+Q" (*(u128 *)ptr),                                      \
         [rl] "=&r" (r.low), [rh] "=&r" (r.high),                      \
         [tmp] "=&r" (tmp)                                             \
       : [ol] "r" (o.low), [oh] "r" (o.high),                          \
         [nl] "r" (n.low), [nh] "r" (n.high)                           \
       : "cc", ##cl);                                                  \
                                                                        \
        return r.full;                                                        \
}

__CMPXCHG128(   ,        ,  )
__CMPXCHG128(_mb, dmb ish, l, "memory")

#undef __CMPXCHG128

#undef K

#endif        /* __ASM_ATOMIC_LL_SC_H */

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   34 


















   34 






   34 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/kernel/ptrace.c
 *
 * By Ross Biro 1/23/92
 * edited by Linus Torvalds
 * ARM modifications Copyright (C) 2000 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/mm.h>
#include <linux/nospec.h>
#include <linux/smp.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/seccomp.h>
#include <linux/security.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/regset.h>
#include <linux/elf.h>
#include <linux/rseq.h>

#include <asm/compat.h>
#include <asm/cpufeature.h>
#include <asm/debug-monitors.h>
#include <asm/fpsimd.h>
#include <asm/gcs.h>
#include <asm/mte.h>
#include <asm/pointer_auth.h>
#include <asm/stacktrace.h>
#include <asm/syscall.h>
#include <asm/traps.h>
#include <asm/system_misc.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

/* One entry of the register-name to struct pt_regs offset lookup table. */
struct pt_regs_offset {
        const char *name;       /* register name; NULL marks the table end */
        int offset;             /* byte offset of the register in struct pt_regs */
};

/*
 * Initializer helpers for struct pt_regs_offset table entries:
 *   REG_OFFSET_NAME(r) - entry named after the pt_regs member r itself.
 *   REG_OFFSET_END     - terminating entry with a NULL name.
 *   GPR_OFFSET_NAME(r) - entry named "x<r>" that points at regs[r].
 */
#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
#define REG_OFFSET_END {.name = NULL, .offset = 0}
#define GPR_OFFSET_NAME(r) \
        {.name = "x" #r, .offset = offsetof(struct pt_regs, regs[r])}

/*
 * Name-to-offset table searched by regs_query_register_offset().
 *
 * Covers x0..x30, "lr" as a second name for regs[30], then sp, pc and
 * pstate. The list is terminated by REG_OFFSET_END (NULL name).
 */
static const struct pt_regs_offset regoffset_table[] = {
        GPR_OFFSET_NAME(0),
        GPR_OFFSET_NAME(1),
        GPR_OFFSET_NAME(2),
        GPR_OFFSET_NAME(3),
        GPR_OFFSET_NAME(4),
        GPR_OFFSET_NAME(5),
        GPR_OFFSET_NAME(6),
        GPR_OFFSET_NAME(7),
        GPR_OFFSET_NAME(8),
        GPR_OFFSET_NAME(9),
        GPR_OFFSET_NAME(10),
        GPR_OFFSET_NAME(11),
        GPR_OFFSET_NAME(12),
        GPR_OFFSET_NAME(13),
        GPR_OFFSET_NAME(14),
        GPR_OFFSET_NAME(15),
        GPR_OFFSET_NAME(16),
        GPR_OFFSET_NAME(17),
        GPR_OFFSET_NAME(18),
        GPR_OFFSET_NAME(19),
        GPR_OFFSET_NAME(20),
        GPR_OFFSET_NAME(21),
        GPR_OFFSET_NAME(22),
        GPR_OFFSET_NAME(23),
        GPR_OFFSET_NAME(24),
        GPR_OFFSET_NAME(25),
        GPR_OFFSET_NAME(26),
        GPR_OFFSET_NAME(27),
        GPR_OFFSET_NAME(28),
        GPR_OFFSET_NAME(29),
        GPR_OFFSET_NAME(30),
        /* "lr" is an alias: same slot as x30 */
        {.name = "lr", .offset = offsetof(struct pt_regs, regs[30])},
        REG_OFFSET_NAME(sp),
        REG_OFFSET_NAME(pc),
        REG_OFFSET_NAME(pstate),
        REG_OFFSET_END,
};

/**
 * regs_query_register_offset() - look up a register's pt_regs offset by name
 * @name:        register name to search for
 *
 * Walks regoffset_table until its NULL-named terminator, comparing each
 * entry's name with @name using strcmp().
 *
 * Return: the offset recorded for the first matching entry, or -EINVAL when
 * no entry matches.
 */
int regs_query_register_offset(const char *name)
{
        const struct pt_regs_offset *entry = regoffset_table;

        while (entry->name) {
                if (strcmp(entry->name, name) == 0)
                        return entry->offset;
                entry++;
        }

        return -EINVAL;
}

/**
 * regs_within_kernel_stack() - test whether an address lies on the stack
 * @regs:      pt_regs that supplies the kernel stack pointer.
 * @addr:      address to test.
 *
 * @addr is accepted when, after rounding down to a THREAD_SIZE boundary, it
 * matches the equally rounded kernel stack pointer of @regs. Failing that,
 * it is accepted when on_irq_stack() reports an unsigned long at @addr as
 * being on the IRQ stack.
 *
 * Return: true if either check passes, false otherwise.
 */
static bool regs_within_kernel_stack(struct pt_regs *regs, unsigned long addr)
{
        const unsigned long stack_mask = ~(THREAD_SIZE - 1);
        unsigned long addr_base = addr & stack_mask;
        unsigned long sp_base = kernel_stack_pointer(regs) & stack_mask;

        if (addr_base == sp_base)
                return true;

        return on_irq_stack(addr, sizeof(unsigned long));
}

/**
 * regs_get_kernel_stack_nth() - read the Nth word of the kernel stack
 * @regs:        pt_regs that supplies the kernel stack pointer.
 * @n:                index of the stack entry, counted in unsigned longs from
 *                the stack pointer.
 *
 * Return: the word stored @n entries above the kernel stack pointer of
 * @regs, or 0 when that slot fails the regs_within_kernel_stack() check.
 */
unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, unsigned int n)
{
        unsigned long *slot = (unsigned long *)kernel_stack_pointer(regs) + n;

        if (!regs_within_kernel_stack(regs, (unsigned long)slot))
                return 0;

        return *slot;
}

/*
 * TODO: does not yet catch signals sent when the child dies
 * in exit.c or in signal.c.
 */

/*
 * Called by kernel/ptrace.c when detaching.
 *
 * Turns off single-stepping for @child via user_disable_single_step().
 */
void ptrace_disable(struct task_struct *child)
{
        /*
         * This would be better off in core code, but PTRACE_DETACH has
         * grown its fair share of arch-specific warts and changing it
         * is likely to cause regressions on obscure architectures.
         */
        user_disable_single_step(child);
}

#ifdef CONFIG_HAVE_HW_BREAKPOINT
/*
 * Overflow handler installed on ptrace hardware breakpoints.
 *
 * Native tasks get SIGTRAP/TRAP_HWBKPT at the trigger address. For compat
 * tasks the slot that fired is encoded in si_errno: (2 * slot + 1) for a
 * breakpoint, the negation of that for a watchpoint, and 0 if @bp is not
 * found in either of the current task's slot arrays.
 */
static void ptrace_hbptriggered(struct perf_event *bp,
                                struct perf_sample_data *data,
                                struct pt_regs *regs)
{
        const char *desc = "Hardware breakpoint trap (ptrace)";
        struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp);
        int si_errno = 0;
        int slot;

        if (!is_compat_task()) {
                arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger,
                                      desc);
                return;
        }

        /* Breakpoint slots are reported as positive odd numbers */
        for (slot = 0; slot < ARM_MAX_BRP; ++slot) {
                if (current->thread.debug.hbp_break[slot] != bp)
                        continue;
                si_errno = (slot << 1) + 1;
                break;
        }

        /* Watchpoint slots are reported as negative odd numbers */
        for (slot = 0; slot < ARM_MAX_WRP; ++slot) {
                if (current->thread.debug.hbp_watch[slot] != bp)
                        continue;
                si_errno = -((slot << 1) + 1);
                break;
        }

        arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc);
}

/*
 * Drop every ptrace hardware breakpoint and watchpoint held by @tsk:
 * each populated slot is unregistered and its pointer in the
 * thread_struct is set back to NULL.
 */
void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
{
        struct thread_struct *thread = &tsk->thread;
        int slot;

        for (slot = 0; slot < ARM_MAX_BRP; slot++) {
                if (!thread->debug.hbp_break[slot])
                        continue;

                unregister_hw_breakpoint(thread->debug.hbp_break[slot]);
                thread->debug.hbp_break[slot] = NULL;
        }

        for (slot = 0; slot < ARM_MAX_WRP; slot++) {
                if (!thread->debug.hbp_watch[slot])
                        continue;

                unregister_hw_breakpoint(thread->debug.hbp_watch[slot]);
                thread->debug.hbp_watch[slot] = NULL;
        }
}

/*
 * Zero the debug state embedded in @tsk's thread_struct, which clears
 * every hbp_break/hbp_watch slot pointer stored there.
 */
void ptrace_hw_copy_thread(struct task_struct *tsk)
{
        memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
}

/*
 * Fetch the perf_event stored in slot @idx of @tsk for the given regset
 * type. The result may be NULL when the slot is empty. ERR_PTR(-EINVAL)
 * is returned for an out-of-range @idx or an unrecognised @note_type.
 */
static struct perf_event *ptrace_hbp_get_event(unsigned int note_type,
                                               struct task_struct *tsk,
                                               unsigned long idx)
{
        if (note_type == NT_ARM_HW_BREAK) {
                if (idx >= ARM_MAX_BRP)
                        return ERR_PTR(-EINVAL);

                /* Clamp the index under speculation before using it */
                idx = array_index_nospec(idx, ARM_MAX_BRP);
                return tsk->thread.debug.hbp_break[idx];
        }

        if (note_type == NT_ARM_HW_WATCH) {
                if (idx >= ARM_MAX_WRP)
                        return ERR_PTR(-EINVAL);

                idx = array_index_nospec(idx, ARM_MAX_WRP);
                return tsk->thread.debug.hbp_watch[idx];
        }

        return ERR_PTR(-EINVAL);
}

/*
 * Store @bp in slot @idx of @tsk for the given regset type.
 *
 * Returns 0 on success, or -EINVAL for an out-of-range @idx or an
 * unrecognised @note_type (in which case nothing is written).
 */
static int ptrace_hbp_set_event(unsigned int note_type,
                                struct task_struct *tsk,
                                unsigned long idx,
                                struct perf_event *bp)
{
        if (note_type == NT_ARM_HW_BREAK) {
                if (idx >= ARM_MAX_BRP)
                        return -EINVAL;

                /* Clamp the index under speculation before using it */
                idx = array_index_nospec(idx, ARM_MAX_BRP);
                tsk->thread.debug.hbp_break[idx] = bp;
                return 0;
        }

        if (note_type == NT_ARM_HW_WATCH) {
                if (idx >= ARM_MAX_WRP)
                        return -EINVAL;

                idx = array_index_nospec(idx, ARM_MAX_WRP);
                tsk->thread.debug.hbp_watch[idx] = bp;
                return 0;
        }

        return -EINVAL;
}

/*
 * Register a new, initially disabled, ptrace hardware breakpoint
 * (NT_ARM_HW_BREAK) or watchpoint (NT_ARM_HW_WATCH) for @tsk and record
 * it in slot @idx.
 *
 * Returns the new perf_event on success. Returns an ERR_PTR() when
 * @note_type is not recognised, when registration fails, or when the
 * event cannot be stored in the slot; in the last case the event that
 * was just registered is unregistered again so it is not leaked.
 */
static struct perf_event *ptrace_hbp_create(unsigned int note_type,
                                            struct task_struct *tsk,
                                            unsigned long idx)
{
        struct perf_event *bp;
        struct perf_event_attr attr;
        int err, type;

        switch (note_type) {
        case NT_ARM_HW_BREAK:
                type = HW_BREAKPOINT_X;
                break;
        case NT_ARM_HW_WATCH:
                type = HW_BREAKPOINT_RW;
                break;
        default:
                return ERR_PTR(-EINVAL);
        }

        ptrace_breakpoint_init(&attr);

        /*
         * Initialise fields to sane defaults
         * (i.e. values that will pass validation).
         */
        attr.bp_addr        = 0;
        attr.bp_len        = HW_BREAKPOINT_LEN_4;
        attr.bp_type        = type;
        attr.disabled        = 1;

        bp = register_user_hw_breakpoint(&attr, ptrace_hbptriggered, NULL, tsk);
        if (IS_ERR(bp))
                return bp;

        err = ptrace_hbp_set_event(note_type, tsk, idx, bp);
        if (err) {
                /*
                 * The slot did not take the event, so nothing else holds
                 * a pointer to it: release it here rather than leak it.
                 */
                unregister_hw_breakpoint(bp);
                return ERR_PTR(err);
        }

        return bp;
}

/*
 * Translate a decoded control register value into perf_event_attr fields.
 *
 * @attr->disabled always mirrors !ctrl.enabled. When the control value is
 * enabled, the length, type and address offset produced by
 * arch_bp_generic_fields() are applied to @attr, provided the type only
 * uses bits permitted for @note_type (execute for NT_ARM_HW_BREAK,
 * read/write for NT_ARM_HW_WATCH).
 *
 * Returns 0 on success, the error from arch_bp_generic_fields(), or
 * -EINVAL for a disallowed type or an unrecognised @note_type.
 */
static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type,
                                     struct arch_hw_breakpoint_ctrl ctrl,
                                     struct perf_event_attr *attr)
{
        int err, len, type, offset, allowed;

        attr->disabled = !ctrl.enabled;
        if (!ctrl.enabled)
                return 0;

        err = arch_bp_generic_fields(ctrl, &len, &type, &offset);
        if (err)
                return err;

        switch (note_type) {
        case NT_ARM_HW_BREAK:
                allowed = HW_BREAKPOINT_X;
                break;
        case NT_ARM_HW_WATCH:
                allowed = HW_BREAKPOINT_RW;
                break;
        default:
                return -EINVAL;
        }

        /* Refuse any type bit outside the set allowed for this regset */
        if (type & ~allowed)
                return -EINVAL;

        attr->bp_len = len;
        attr->bp_type = type;
        attr->bp_addr += offset;

        return 0;
}

/*
 * Build the resource-info word for a hardware debug regset: the value of
 * debug_monitors_arch() in bits [15:8] and the number of slots of the
 * requested kind in bits [7:0].
 *
 * Returns 0 and fills *@info, or -EINVAL (leaving *@info untouched) for
 * an unrecognised @note_type.
 */
static int ptrace_hbp_get_resource_info(unsigned int note_type, u32 *info)
{
        int slot_type;
        u8 num;

        switch (note_type) {
        case NT_ARM_HW_BREAK:
                slot_type = TYPE_INST;
                break;
        case NT_ARM_HW_WATCH:
                slot_type = TYPE_DATA;
                break;
        default:
                return -EINVAL;
        }

        num = hw_breakpoint_slots(slot_type);
        *info = ((u32)debug_monitors_arch() << 8) | num;

        return 0;
}

/*
 * Read back the encoded control register for slot @idx of @tsk.
 * An empty slot reads as 0. Returns 0 on success or the error carried
 * by ptrace_hbp_get_event().
 */
static int ptrace_hbp_get_ctrl(unsigned int note_type,
                               struct task_struct *tsk,
                               unsigned long idx,
                               u32 *ctrl)
{
        struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);

        if (IS_ERR(bp))
                return PTR_ERR(bp);

        if (bp)
                *ctrl = encode_ctrl_reg(counter_arch_bp(bp)->ctrl);
        else
                *ctrl = 0;

        return 0;
}

/*
 * Read back the address programmed into slot @idx of @tsk.
 * An empty slot reads as 0. Returns 0 on success or the error carried
 * by ptrace_hbp_get_event().
 */
static int ptrace_hbp_get_addr(unsigned int note_type,
                               struct task_struct *tsk,
                               unsigned long idx,
                               u64 *addr)
{
        struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);

        if (IS_ERR(bp))
                return PTR_ERR(bp);

        if (bp)
                *addr = counter_arch_bp(bp)->address;
        else
                *addr = 0;

        return 0;
}

/*
 * Return the event in slot @idx of @tsk, creating one with
 * ptrace_hbp_create() if the slot is currently empty. An ERR_PTR() from
 * either the lookup or the creation is passed straight back.
 */
static struct perf_event *ptrace_hbp_get_initialised_bp(unsigned int note_type,
                                                        struct task_struct *tsk,
                                                        unsigned long idx)
{
        struct perf_event *bp = ptrace_hbp_get_event(note_type, tsk, idx);

        /* Non-NULL covers both an existing event and a lookup error */
        if (bp)
                return bp;

        return ptrace_hbp_create(note_type, tsk, idx);
}

/*
 * Apply the user-supplied control register value @uctrl to slot @idx of
 * @tsk, creating the underlying event first if the slot is empty.
 *
 * Returns 0 on success, or the error from obtaining the event, from
 * validating the control value, or from modify_user_hw_breakpoint().
 */
static int ptrace_hbp_set_ctrl(unsigned int note_type,
                               struct task_struct *tsk,
                               unsigned long idx,
                               u32 uctrl)
{
        struct arch_hw_breakpoint_ctrl ctrl;
        struct perf_event_attr attr;
        struct perf_event *bp;
        int err;

        bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx);
        if (IS_ERR(bp))
                return PTR_ERR(bp);

        /* Work on a copy of the current attributes */
        attr = bp->attr;
        decode_ctrl_reg(uctrl, &ctrl);

        err = ptrace_hbp_fill_attr_ctrl(note_type, ctrl, &attr);
        if (err)
                return err;

        return modify_user_hw_breakpoint(bp, &attr);
}

/*
 * Program @addr into slot @idx of @tsk, creating the underlying event
 * first if the slot is empty.
 *
 * Returns 0 on success, or the error from obtaining the event or from
 * modify_user_hw_breakpoint().
 */
static int ptrace_hbp_set_addr(unsigned int note_type,
                               struct task_struct *tsk,
                               unsigned long idx,
                               u64 addr)
{
        struct perf_event_attr attr;
        struct perf_event *bp;

        bp = ptrace_hbp_get_initialised_bp(note_type, tsk, idx);
        if (IS_ERR(bp))
                return PTR_ERR(bp);

        /* Start from the current attributes and replace only the address */
        attr = bp->attr;
        attr.bp_addr = addr;

        return modify_user_hw_breakpoint(bp, &attr);
}

/*
 * Byte sizes of the three pieces hw_break_set() consumes per slot:
 * the address, the control word, and the padding that follows it.
 */
#define PTRACE_HBP_ADDR_SZ        sizeof(u64)
#define PTRACE_HBP_CTRL_SZ        sizeof(u32)
#define PTRACE_HBP_PAD_SZ        sizeof(u32)

/*
 * Dump the hardware debug state of @target into @to: a resource-info
 * word and a u32 of padding, followed by one (address, ctrl, padding)
 * record per slot for as long as @to has room. The regset type comes
 * from regset->core_note_type.
 *
 * Returns 0 on success or the first error from the helper lookups.
 */
static int hw_break_get(struct task_struct *target,
                        const struct user_regset *regset,
                        struct membuf to)
{
        unsigned int note_type = regset->core_note_type;
        u32 info, ctrl;
        u64 addr;
        int idx, ret;

        /* Resource info */
        ret = ptrace_hbp_get_resource_info(note_type, &info);
        if (ret)
                return ret;

        membuf_write(&to, &info, sizeof(info));
        membuf_zero(&to, sizeof(u32));

        /* (address, ctrl) registers */
        for (idx = 0; to.left; idx++) {
                ret = ptrace_hbp_get_addr(note_type, target, idx, &addr);
                if (!ret)
                        ret = ptrace_hbp_get_ctrl(note_type, target, idx,
                                                  &ctrl);
                if (ret)
                        return ret;

                membuf_store(&to, addr);
                membuf_store(&to, ctrl);
                membuf_zero(&to, sizeof(u32));
        }

        return 0;
}

/*
 * Apply user-supplied hardware debug state to @target. The regset type
 * comes from regset->core_note_type.
 *
 * The bytes that precede dbg_regs in struct user_hwdebug_state are
 * skipped. The remaining input is consumed as (address, ctrl, padding)
 * records, each applied to the next slot index through
 * ptrace_hbp_set_addr() and ptrace_hbp_set_ctrl(). Input may end after
 * an address, in which case only the address of that slot is updated.
 *
 * Returns 0 on success, -EINVAL if fewer than PTRACE_HBP_ADDR_SZ bytes
 * remain where an address is expected, or the first error from the
 * copy-in or set helpers. Slots written before an error stay written.
 */
static int hw_break_set(struct task_struct *target,
                        const struct user_regset *regset,
                        unsigned int pos, unsigned int count,
                        const void *kbuf, const void __user *ubuf)
{
        unsigned int note_type = regset->core_note_type;
        int ret, idx = 0, offset, limit;
        u32 ctrl;
        u64 addr;

        /* Resource info and pad */
        offset = offsetof(struct user_hwdebug_state, dbg_regs);
        user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, 0, offset);

        /* (address, ctrl) registers */
        limit = regset->n * regset->size;
        while (count && offset < limit) {
                /* A partial address cannot be applied */
                if (count < PTRACE_HBP_ADDR_SZ)
                        return -EINVAL;
                ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &addr,
                                         offset, offset + PTRACE_HBP_ADDR_SZ);
                if (ret)
                        return ret;
                ret = ptrace_hbp_set_addr(note_type, target, idx, addr);
                if (ret)
                        return ret;
                offset += PTRACE_HBP_ADDR_SZ;

                /* Input that stops right after an address is accepted */
                if (!count)
                        break;
                ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ctrl,
                                         offset, offset + PTRACE_HBP_CTRL_SZ);
                if (ret)
                        return ret;
                ret = ptrace_hbp_set_ctrl(note_type, target, idx, ctrl);
                if (ret)
                        return ret;
                offset += PTRACE_HBP_CTRL_SZ;

                /* Skip the padding that follows each ctrl word */
                user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
                                          offset, offset + PTRACE_HBP_PAD_SZ);
                offset += PTRACE_HBP_PAD_SZ;
                idx++;
        }

        return 0;
}
#endif        /* CONFIG_HAVE_HW_BREAKPOINT */

static int gpr_get(struct task_struct *target,
                   const struct user_regset *regset,
                   struct membuf to)
{
        struct user_pt_regs *uregs = &task_pt_regs(target)->user_regs;
        return membuf_write(&to, uregs, sizeof(*uregs));
}

/*
 * Replace @target's user_regs with user-supplied values.
 *
 * The input is overlaid on a copy of the current registers, so a short
 * write leaves the trailing registers as they were. The copy is only
 * committed if it passes valid_user_regs().
 *
 * Returns 0 on success, the copy-in error, or -EINVAL if validation
 * rejects the new register set.
 */
static int gpr_set(struct task_struct *target, const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        struct pt_regs *regs = task_pt_regs(target);
        struct user_pt_regs newregs = regs->user_regs;
        int ret;

        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newregs, 0, -1);
        if (ret)
                return ret;

        if (!valid_user_regs(&newregs, target))
                return -EINVAL;

        regs->user_regs = newregs;

        return 0;
}

/*
 * Report regset->n when the system supports FPSIMD, -ENODEV otherwise.
 */
static int fpr_active(struct task_struct *target, const struct user_regset *regset)
{
        if (system_supports_fpsimd())
                return regset->n;

        return -ENODEV;
}

/*
 * TODO: update fp accessors for lazy context switching (sync/flush hwstate)
 *
 * Bring @target's saved FPSIMD state up to date via sve_sync_to_fpsimd()
 * and copy it into @to. Returns the result of membuf_write().
 */
static int __fpr_get(struct task_struct *target,
                     const struct user_regset *regset,
                     struct membuf to)
{
        sve_sync_to_fpsimd(target);

        return membuf_write(&to, &target->thread.uw.fpsimd_state,
                            sizeof(target->thread.uw.fpsimd_state));
}

/*
 * Read @target's FPSIMD registers into @to.
 *
 * Returns -EINVAL when the system does not support FPSIMD, otherwise the
 * result of __fpr_get().
 */
static int fpr_get(struct task_struct *target, const struct user_regset *regset,
                   struct membuf to)
{
        if (!system_supports_fpsimd())
                return -EINVAL;

        /* When reading our own registers, save the current state first */
        if (target == current)
                fpsimd_preserve_current_state();

        return __fpr_get(target, regset, to);
}

/*
 * Overlay user-supplied data, located at [@start_pos, @start_pos +
 * sizeof(struct user_fpsimd_state)) within the regset, onto @target's
 * saved FPSIMD state. The saved state is only replaced if the copy-in
 * succeeds.
 *
 * Returns 0 on success or the copy-in error.
 */
static int __fpr_set(struct task_struct *target,
                     const struct user_regset *regset,
                     unsigned int pos, unsigned int count,
                     const void *kbuf, const void __user *ubuf,
                     unsigned int start_pos)
{
        struct user_fpsimd_state newstate;
        int ret;

        /*
         * Refresh target->thread.uw.fpsimd_state before taking the
         * snapshot, so that a short copyin can't resurrect stale data.
         */
        sve_sync_to_fpsimd(target);
        newstate = target->thread.uw.fpsimd_state;

        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &newstate,
                                 start_pos, start_pos + sizeof(newstate));
        if (!ret)
                target->thread.uw.fpsimd_state = newstate;

        return ret;
}

/*
 * Write user-supplied FPSIMD registers to @target.
 *
 * Returns -EINVAL when the system does not support FPSIMD, or the error
 * from __fpr_set(). On success the new values are propagated with
 * sve_sync_from_fpsimd_zeropad() and the task's FPSIMD state is flushed
 * with fpsimd_flush_task_state() before 0 is returned.
 */
static int fpr_set(struct task_struct *target, const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        int ret;

        if (!system_supports_fpsimd())
                return -EINVAL;

        ret = __fpr_set(target, regset, pos, count, kbuf, ubuf, 0);
        if (!ret) {
                sve_sync_from_fpsimd_zeropad(target);
                fpsimd_flush_task_state(target);
        }

        return ret;
}

/*
 * Emit @target's TLS regset into @to: tp_value first, then tpidr2_el0
 * when the system supports TPIDR2, or a zeroed u64 in its place when it
 * does not. Returns the result of the final membuf operation.
 */
static int tls_get(struct task_struct *target, const struct user_regset *regset,
                   struct membuf to)
{
        if (target == current)
                tls_preserve_current_state();

        membuf_store(&to, target->thread.uw.tp_value);

        if (system_supports_tpidr2())
                return membuf_store(&to, target->thread.tpidr2_el0);

        return membuf_zero(&to, sizeof(u64));
}

/*
 * Update @target's TLS registers from user-supplied data.
 *
 * The current values are loaded first so that a short write leaves the
 * untouched register unchanged. tpidr2_el0 is only read and written back
 * when the system supports TPIDR2.
 *
 * Returns 0 on success or the copy-in error, in which case nothing is
 * written to @target.
 */
static int tls_set(struct task_struct *target, const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        unsigned long tls[2];
        int ret;

        tls[0] = target->thread.uw.tp_value;
        if (system_supports_tpidr2())
                tls[1] = target->thread.tpidr2_el0;

        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, tls, 0, count);
        if (!ret) {
                target->thread.uw.tp_value = tls[0];
                if (system_supports_tpidr2())
                        target->thread.tpidr2_el0 = tls[1];
        }

        return ret;
}

/*
 * Read @target's saved FPMR value into @to.
 *
 * Returns -EINVAL when the system does not support FPMR, otherwise the
 * result of membuf_store().
 */
static int fpmr_get(struct task_struct *target, const struct user_regset *regset,
                   struct membuf to)
{
        if (!system_supports_fpmr())
                return -EINVAL;

        /* When reading our own registers, save the current state first */
        if (target == current)
                fpsimd_preserve_current_state();

        return membuf_store(&to, target->thread.uw.fpmr);
}

/*
 * Update @target's saved FPMR value from user-supplied data.
 *
 * The current value is loaded first so a short write only replaces the
 * bytes supplied. On success the task's FPSIMD state is flushed with
 * fpsimd_flush_task_state().
 *
 * Returns 0 on success, -EINVAL when the system does not support FPMR,
 * or the copy-in error.
 */
static int fpmr_set(struct task_struct *target, const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        unsigned long fpmr;
        int ret;

        if (!system_supports_fpmr())
                return -EINVAL;

        fpmr = target->thread.uw.fpmr;

        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpmr, 0, count);
        if (!ret) {
                target->thread.uw.fpmr = fpmr;
                fpsimd_flush_task_state(target);
        }

        return ret;
}

static int system_call_get(struct task_struct *target,
                           const struct user_regset *regset,
                           struct membuf to)
{
        return membuf_store(&to, task_pt_regs(target)->syscallno);
}

/*
 * Overwrite the syscallno field of @target's saved pt_regs with a
 * user-supplied value. The current value is loaded first, so a short
 * write only replaces the bytes supplied.
 *
 * Returns 0 on success or the copy-in error, in which case syscallno is
 * left untouched.
 */
static int system_call_set(struct task_struct *target,
                           const struct user_regset *regset,
                           unsigned int pos, unsigned int count,
                           const void *kbuf, const void __user *ubuf)
{
        struct pt_regs *regs = task_pt_regs(target);
        int syscallno = regs->syscallno;
        int ret;

        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &syscallno, 0, -1);
        if (!ret)
                regs->syscallno = syscallno;

        return ret;
}

#ifdef CONFIG_ARM64_SVE

static void sve_init_header_from_task(struct user_sve_header *header,
                                      struct task_struct *target,
                                      enum vec_type type)
{
        unsigned int vq;
        bool active;
        enum vec_type task_type;

        memset(header, 0, sizeof(*header));

        /* Check if the requested registers are active for the task */
        if (thread_sm_enabled(&target->thread))
                task_type = ARM64_VEC_SME;
        else
                task_type = ARM64_VEC_SVE;
        active = (task_type == type);

        switch (type) {
        case ARM64_VEC_SVE:
                if (test_tsk_thread_flag(target, TIF_SVE_VL_INHERIT))
                        header->flags |= SVE_PT_VL_INHERIT;
                break;
        case ARM64_VEC_SME:
                if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT))
                        header->flags |= SVE_PT_VL_INHERIT;
                break;
        default:
                WARN_ON_ONCE(1);
                return;
        }

        if (active) {
                if (target->thread.fp_type == FP_STATE_FPSIMD) {
                        header->flags |= SVE_PT_REGS_FPSIMD;
                } else {
                        header->flags |= SVE_PT_REGS_SVE;
                }
        }

        header->vl = task_get_vl(target, type);
        vq = sve_vq_from_vl(header->vl);

        header->max_vl = vec_max_vl(type);
        header->size = SVE_PT_SIZE(vq, header->flags);
        header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
                                      SVE_PT_REGS_SVE);
}

/*
 * Return header->size rounded up to a multiple of SVE_VQ_BYTES.
 */
static unsigned int sve_size_from_header(struct user_sve_header const *header)
{
        return ALIGN(header->size, SVE_VQ_BYTES);
}

/*
 * Shared read path for the SVE and streaming SVE regsets.
 *
 * Writes a user_sve_header built by sve_init_header_from_task() for the
 * requested vector @type, followed by register data in the format the
 * header's flags select:
 *  - SVE_PT_REGS_FPSIMD: the FPSIMD state, via __fpr_get();
 *  - SVE_PT_REGS_SVE: the contents of thread.sve_state up to the end of
 *    FFR, zero padding up to the FPSR offset, FPSR and FPCR, then zero
 *    padding up to the aligned regset size.
 *
 * Returns the result of the last membuf operation, or 0 when the flags
 * select neither format.
 */
static int sve_get_common(struct task_struct *target,
                          const struct user_regset *regset,
                          struct membuf to,
                          enum vec_type type)
{
        struct user_sve_header header;
        unsigned int vq;
        unsigned long start, end;

        /* Header */
        sve_init_header_from_task(&header, target, type);
        vq = sve_vq_from_vl(header.vl);

        membuf_write(&to, &header, sizeof(header));

        /* When reading our own registers, save the current state first */
        if (target == current)
                fpsimd_preserve_current_state();

        /* Both payload layouts must start right after the header */
        BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
        BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));

        switch ((header.flags & SVE_PT_REGS_MASK)) {
        case SVE_PT_REGS_FPSIMD:
                return __fpr_get(target, regset, to);

        case SVE_PT_REGS_SVE:
                /* Everything from the first register through FFR */
                start = SVE_PT_SVE_OFFSET;
                end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
                membuf_write(&to, target->thread.sve_state, end - start);

                /* Gap between FFR and FPSR reads as zero */
                start = end;
                end = SVE_PT_SVE_FPSR_OFFSET(vq);
                membuf_zero(&to, end - start);

                /*
                 * Copy fpsr, and fpcr which must follow contiguously in
                 * struct fpsimd_state:
                 */
                start = end;
                end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
                membuf_write(&to, &target->thread.uw.fpsimd_state.fpsr,
                             end - start);

                /* Trailing padding up to the aligned regset size */
                start = end;
                end = sve_size_from_header(&header);
                return membuf_zero(&to, end - start);

        default:
                return 0;
        }
}

/*
 * Read the SVE regset of @target into @to.
 *
 * Returns -EINVAL when the system does not support SVE, otherwise the
 * result of sve_get_common() for ARM64_VEC_SVE.
 */
static int sve_get(struct task_struct *target,
                   const struct user_regset *regset,
                   struct membuf to)
{
        if (system_supports_sve())
                return sve_get_common(target, regset, to, ARM64_VEC_SVE);

        return -EINVAL;
}

/*
 * Shared write path for the SVE and streaming SVE regsets.
 *
 * The input must start with a complete user_sve_header. The header's
 * vector length and non-format flags are applied through
 * vec_set_vector_length(); on SME-capable systems streaming mode is then
 * entered or left to match @type. The payload that follows is
 * interpreted according to the header's register format flags: either
 * FPSIMD state handed to __fpr_set(), or SVE state copied into
 * thread.sve_state followed by FPSR/FPCR.
 *
 * Returns 0 on success, -EINVAL for a short header or an unrecognised
 * @type, -EIO when register data is supplied but the vector length that
 * took effect differs from the one requested, -ENOMEM when SVE storage
 * cannot be allocated, or the error from vec_set_vector_length() or a
 * copy-in. Except for the short-header check, every exit flushes the
 * task's FPSIMD state with fpsimd_flush_task_state().
 */
static int sve_set_common(struct task_struct *target,
                          const struct user_regset *regset,
                          unsigned int pos, unsigned int count,
                          const void *kbuf, const void __user *ubuf,
                          enum vec_type type)
{
        int ret;
        struct user_sve_header header;
        unsigned int vq;
        unsigned long start, end;

        /* Header */
        if (count < sizeof(header))
                return -EINVAL;
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
                                 0, sizeof(header));
        if (ret)
                goto out;

        /*
         * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by
         * vec_set_vector_length(), which will also validate them for us:
         */
        ret = vec_set_vector_length(target, type, header.vl,
                ((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
        if (ret)
                goto out;

        /*
         * Actual VL set may be different from what the user asked
         * for, or we may have configured the _ONEXEC VL not the
         * current VL:
         */
        vq = sve_vq_from_vl(task_get_vl(target, type));

        /* Enter/exit streaming mode */
        if (system_supports_sme()) {
                u64 old_svcr = target->thread.svcr;

                switch (type) {
                case ARM64_VEC_SVE:
                        target->thread.svcr &= ~SVCR_SM_MASK;
                        break;
                case ARM64_VEC_SME:
                        target->thread.svcr |= SVCR_SM_MASK;

                        /*
                         * Disable traps and ensure there is SME storage but
                         * preserve any currently set values in ZA/ZT.
                         */
                        sme_alloc(target, false);
                        set_tsk_thread_flag(target, TIF_SME);
                        break;
                default:
                        WARN_ON_ONCE(1);
                        ret = -EINVAL;
                        goto out;
                }

                /*
                 * If we switched then invalidate any existing SVE
                 * state and ensure there's storage.
                 */
                if (target->thread.svcr != old_svcr)
                        sve_alloc(target, true);
        }

        /* Registers: FPSIMD-only case */

        BUILD_BUG_ON(SVE_PT_FPSIMD_OFFSET != sizeof(header));
        if ((header.flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD) {
                ret = __fpr_set(target, regset, pos, count, kbuf, ubuf,
                                SVE_PT_FPSIMD_OFFSET);
                /* The task is switched to FPSIMD format whatever ret is */
                clear_tsk_thread_flag(target, TIF_SVE);
                target->thread.fp_type = FP_STATE_FPSIMD;
                goto out;
        }

        /*
         * Otherwise: no registers or full SVE case.  For backwards
         * compatibility reasons we treat empty flags as SVE registers.
         */

        /*
         * If setting a different VL from the requested VL and there is
         * register data, the data layout will be wrong: don't even
         * try to set the registers in this case.
         */
        if (count && vq != sve_vq_from_vl(header.vl)) {
                ret = -EIO;
                goto out;
        }

        sve_alloc(target, true);
        if (!target->thread.sve_state) {
                ret = -ENOMEM;
                clear_tsk_thread_flag(target, TIF_SVE);
                target->thread.fp_type = FP_STATE_FPSIMD;
                goto out;
        }

        /*
         * Ensure target->thread.sve_state is up to date with target's
         * FPSIMD regs, so that a short copyin leaves trailing
         * registers unmodified.  Only enable SVE if we are
         * configuring normal SVE, a system with streaming SVE may not
         * have normal SVE.
         */
        fpsimd_sync_to_sve(target);
        if (type == ARM64_VEC_SVE)
                set_tsk_thread_flag(target, TIF_SVE);
        target->thread.fp_type = FP_STATE_SVE;

        /* Everything from the first register through FFR */
        BUILD_BUG_ON(SVE_PT_SVE_OFFSET != sizeof(header));
        start = SVE_PT_SVE_OFFSET;
        end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                 target->thread.sve_state,
                                 start, end);
        if (ret)
                goto out;

        /* Skip the gap between FFR and FPSR */
        start = end;
        end = SVE_PT_SVE_FPSR_OFFSET(vq);
        user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, start, end);

        /*
         * Copy fpsr, and fpcr which must follow contiguously in
         * struct fpsimd_state:
         */
        start = end;
        end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                 &target->thread.uw.fpsimd_state.fpsr,
                                 start, end);

out:
        fpsimd_flush_task_state(target);
        return ret;
}

/*
 * Write the SVE regset of @target from user-supplied data.
 *
 * Returns -EINVAL when the system does not support SVE, otherwise the
 * result of sve_set_common() for ARM64_VEC_SVE.
 */
static int sve_set(struct task_struct *target,
                   const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        if (system_supports_sve())
                return sve_set_common(target, regset, pos, count, kbuf, ubuf,
                                      ARM64_VEC_SVE);

        return -EINVAL;
}

#endif /* CONFIG_ARM64_SVE */

#ifdef CONFIG_ARM64_SME

/*
 * Read the streaming SVE regset of @target into @to.
 *
 * Returns -EINVAL when the system does not support SME, otherwise the
 * result of sve_get_common() for ARM64_VEC_SME.
 */
static int ssve_get(struct task_struct *target,
                   const struct user_regset *regset,
                   struct membuf to)
{
        if (system_supports_sme())
                return sve_get_common(target, regset, to, ARM64_VEC_SME);

        return -EINVAL;
}

/*
 * Write the streaming SVE regset of @target from user-supplied data.
 *
 * Returns -EINVAL when the system does not support SME, otherwise the
 * result of sve_set_common() for ARM64_VEC_SME.
 */
static int ssve_set(struct task_struct *target,
                    const struct user_regset *regset,
                    unsigned int pos, unsigned int count,
                    const void *kbuf, const void __user *ubuf)
{
        if (system_supports_sme())
                return sve_set_common(target, regset, pos, count, kbuf, ubuf,
                                      ARM64_VEC_SME);

        return -EINVAL;
}

/*
 * Read the ZA regset of @target into @to.
 *
 * Writes a user_za_header, followed by the contents of thread.sme_state
 * when ZA is enabled for the task, followed by zero padding up to the
 * header size rounded to SVE_VQ_BYTES. When ZA is not enabled only the
 * header is produced.
 *
 * Returns -EINVAL when the system does not support SME, otherwise the
 * result of the final membuf_zero().
 */
static int za_get(struct task_struct *target,
                  const struct user_regset *regset,
                  struct membuf to)
{
        struct user_za_header header;
        unsigned int vq;
        unsigned long start, end;

        if (!system_supports_sme())
                return -EINVAL;

        /* Header */
        memset(&header, 0, sizeof(header));

        if (test_tsk_thread_flag(target, TIF_SME_VL_INHERIT))
                header.flags |= ZA_PT_VL_INHERIT;

        header.vl = task_get_sme_vl(target);
        vq = sve_vq_from_vl(header.vl);
        header.max_vl = sme_max_vl();
        /*
         * NOTE(review): max_size is derived from the current VL's vq here,
         * whereas sve_init_header_from_task() derives it from max_vl.
         * Confirm whether this should be sve_vq_from_vl(header.max_vl).
         */
        header.max_size = ZA_PT_SIZE(vq);

        /* If ZA is not active there is only the header */
        if (thread_za_enabled(&target->thread))
                header.size = ZA_PT_SIZE(vq);
        else
                header.size = ZA_PT_ZA_OFFSET;

        membuf_write(&to, &header, sizeof(header));

        BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header));
        end = ZA_PT_ZA_OFFSET;

        /* When reading our own registers, save the current state first */
        if (target == current)
                fpsimd_preserve_current_state();

        /* Any register data to include? */
        if (thread_za_enabled(&target->thread)) {
                start = end;
                end = ZA_PT_SIZE(vq);
                membuf_write(&to, target->thread.sme_state, end - start);
        }

        /* Zero any trailing padding */
        start = end;
        end = ALIGN(header.size, SVE_VQ_BYTES);
        return membuf_zero(&to, end - start);
}

/*
 * Write the ZA regset of @target from user-supplied data.
 *
 * The input must start with a complete user_za_header. The header's
 * vector length and flags are applied through vec_set_vector_length(),
 * SVE and SME storage are made available, and then:
 *  - with no payload, ZA is disabled for the task;
 *  - with a payload, it is copied into thread.sme_state and ZA is
 *    marked enabled.
 *
 * Returns 0 on success, -EINVAL when SME is unsupported or the header
 * is short, -ENOMEM when storage cannot be allocated, -EIO when a
 * payload is supplied but the vector length that took effect differs
 * from the one requested, or the error from vec_set_vector_length() or
 * a copy-in. Once the header has been read, every exit goes through
 * fpsimd_flush_task_state() so that changes already made to the task
 * are never left without a flush.
 */
static int za_set(struct task_struct *target,
                  const struct user_regset *regset,
                  unsigned int pos, unsigned int count,
                  const void *kbuf, const void __user *ubuf)
{
        int ret;
        struct user_za_header header;
        unsigned int vq;
        unsigned long start, end;

        if (!system_supports_sme())
                return -EINVAL;

        /* Header */
        if (count < sizeof(header))
                return -EINVAL;
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &header,
                                 0, sizeof(header));
        if (ret)
                goto out;

        /*
         * All current ZA_PT_* flags are consumed by
         * vec_set_vector_length(), which will also validate them for
         * us:
         */
        ret = vec_set_vector_length(target, ARM64_VEC_SME, header.vl,
                ((unsigned long)header.flags) << 16);
        if (ret)
                goto out;

        /*
         * Actual VL set may be different from what the user asked
         * for, or we may have configured the _ONEXEC rather than
         * current VL:
         */
        vq = sve_vq_from_vl(task_get_sme_vl(target));

        /* Ensure there is some SVE storage for streaming mode */
        if (!target->thread.sve_state) {
                sve_alloc(target, false);
                if (!target->thread.sve_state) {
                        ret = -ENOMEM;
                        goto out;
                }
        }

        /*
         * Only flush the storage if PSTATE.ZA was not already set,
         * otherwise preserve any existing data.
         */
        sme_alloc(target, !thread_za_enabled(&target->thread));
        if (!target->thread.sme_state) {
                /*
                 * The task has already been modified above, so leave via
                 * the common exit like the other failure paths.
                 */
                ret = -ENOMEM;
                goto out;
        }

        /* If there is no data then disable ZA */
        if (!count) {
                target->thread.svcr &= ~SVCR_ZA_MASK;
                goto out;
        }

        /*
         * If setting a different VL from the requested VL and there is
         * register data, the data layout will be wrong: don't even
         * try to set the registers in this case.
         */
        if (vq != sve_vq_from_vl(header.vl)) {
                ret = -EIO;
                goto out;
        }

        BUILD_BUG_ON(ZA_PT_ZA_OFFSET != sizeof(header));
        start = ZA_PT_ZA_OFFSET;
        end = ZA_PT_SIZE(vq);
        ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                 target->thread.sme_state,
                                 start, end);
        if (ret)
                goto out;

        /* Mark ZA as active and let userspace use it */
        set_tsk_thread_flag(target, TIF_SME);
        target->thread.svcr |= SVCR_ZA_MASK;

out:
        fpsimd_flush_task_state(target);
        return ret;
}

/*
 * Regset read handler for the SME2 ZT register.
 *
 * Returns -EINVAL when the system has no SME2 support; otherwise emits
 * ZT_SIG_REG_BYTES bytes into @to and returns 0.
 */
static int zt_get(struct task_struct *target,
                  const struct user_regset *regset,
                  struct membuf to)
{
        if (!system_supports_sme2())
                return -EINVAL;

        if (!thread_za_enabled(&target->thread)) {
                /*
                 * With PSTATE.ZA clear, ZT is zeroed as soon as ZA gets
                 * enabled, so present the register contents as zero.
                 */
                membuf_zero(&to, ZT_SIG_REG_BYTES);
                return 0;
        }

        membuf_write(&to, thread_zt_state(&target->thread), ZT_SIG_REG_BYTES);
        return 0;
}

/*
 * Regset write handler for the SME2 ZT register.
 *
 * Calls sve_alloc() and, if ZA is not currently enabled, sme_alloc(), then
 * copies up to ZT_SIG_REG_BYTES from the caller into the task's ZT state.
 * On a successful copy SVCR.ZA and TIF_SME are set for the task. The
 * task's FPSIMD state is flushed once the copy has been attempted.
 *
 * Returns 0 on success, -EINVAL without SME2 support, -ENOMEM if the
 * required storage is missing after allocation, or the error reported by
 * user_regset_copyin().
 */
static int zt_set(struct task_struct *target,
                  const struct user_regset *regset,
                  unsigned int pos, unsigned int count,
                  const void *kbuf, const void __user *ubuf)
{
        int err;

        if (!system_supports_sme2())
                return -EINVAL;

        /* This may be the first use of SME: make sure SVE storage exists */
        sve_alloc(target, false);
        if (!target->thread.sve_state)
                return -ENOMEM;

        /* sme_alloc() is only invoked while ZA is not already enabled */
        if (!thread_za_enabled(&target->thread)) {
                sme_alloc(target, true);
                if (!target->thread.sme_state)
                        return -ENOMEM;
        }

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
                                 thread_zt_state(&target->thread),
                                 0, ZT_SIG_REG_BYTES);
        if (err)
                goto flush;

        target->thread.svcr |= SVCR_ZA_MASK;
        set_tsk_thread_flag(target, TIF_SME);

flush:
        fpsimd_flush_task_state(target);
        return err;
}

#endif /* CONFIG_ARM64_SME */

#ifdef CONFIG_ARM64_PTR_AUTH
static int pac_mask_get(struct task_struct *target,
                        const struct user_regset *regset,
                        struct membuf to)
{
        /*
         * The PAC bits can differ across data and instruction pointers
         * depending on TCR_EL1.TBID*, which we may make use of in future, so
         * we expose separate masks.
         */
        unsigned long mask = ptrauth_user_pac_mask();
        struct user_pac_mask uregs = {
                .data_mask = mask,
                .insn_mask = mask,
        };

        if (!system_supports_address_auth())
                return -EINVAL;

        return membuf_write(&to, &uregs, sizeof(uregs));
}

/*
 * Regset read handler for the set of enabled pointer authentication keys.
 *
 * Propagates an error value from ptrauth_get_enabled_keys() unchanged;
 * otherwise writes the value into @to.
 */
static int pac_enabled_keys_get(struct task_struct *target,
                                const struct user_regset *regset,
                                struct membuf to)
{
        long keys = ptrauth_get_enabled_keys(target);

        if (!IS_ERR_VALUE(keys))
                return membuf_write(&to, &keys, sizeof(keys));

        return keys;
}

/*
 * Regset write handler for the set of enabled pointer authentication keys.
 *
 * The current value is fetched first so that it seeds the buffer the
 * caller's data is copied over; the result is then handed to
 * ptrauth_set_enabled_keys() with PR_PAC_ENABLED_KEYS_MASK.
 *
 * Returns the error from ptrauth_get_enabled_keys() or
 * user_regset_copyin() if either fails, else the result of
 * ptrauth_set_enabled_keys().
 */
static int pac_enabled_keys_set(struct task_struct *target,
                                const struct user_regset *regset,
                                unsigned int pos, unsigned int count,
                                const void *kbuf, const void __user *ubuf)
{
        long keys = ptrauth_get_enabled_keys(target);
        int err;

        if (IS_ERR_VALUE(keys))
                return keys;

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &keys, 0,
                                 sizeof(keys));
        if (err)
                return err;

        return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK,
                                        keys);
}

#ifdef CONFIG_CHECKPOINT_RESTORE
/* Pack a key's hi/lo halves into one 128-bit value (hi in bits 127:64). */
static __uint128_t pac_key_to_user(const struct ptrauth_key *key)
{
        __uint128_t packed = key->hi;

        packed <<= 64;
        packed |= key->lo;

        return packed;
}

static struct ptrauth_key pac_key_from_user(__uint128_t ukey)
{
        struct ptrauth_key key = {
                .lo = (unsigned long)ukey,
                .hi = (unsigned long)(ukey >> 64),
        };

        return key;
}

/*
 * Convert the four address authentication keys (APIA, APIB, APDA, APDB)
 * from the in-kernel representation to the 128-bit user layout.
 */
static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys,
                                     const struct ptrauth_keys_user *keys)
{
        /* Data keys */
        ukeys->apdakey = pac_key_to_user(&keys->apda);
        ukeys->apdbkey = pac_key_to_user(&keys->apdb);

        /* Instruction keys */
        ukeys->apiakey = pac_key_to_user(&keys->apia);
        ukeys->apibkey = pac_key_to_user(&keys->apib);
}

/*
 * Convert the four address authentication keys (APIA, APIB, APDA, APDB)
 * from the 128-bit user layout to the in-kernel representation. Other
 * members of @keys are left untouched.
 */
static void pac_address_keys_from_user(struct ptrauth_keys_user *keys,
                                       const struct user_pac_address_keys *ukeys)
{
        /* Data keys */
        keys->apda = pac_key_from_user(ukeys->apdakey);
        keys->apdb = pac_key_from_user(ukeys->apdbkey);

        /* Instruction keys */
        keys->apia = pac_key_from_user(ukeys->apiakey);
        keys->apib = pac_key_from_user(ukeys->apibkey);
}

static int pac_address_keys_get(struct task_struct *target,
                                const struct user_regset *regset,
                                struct membuf to)
{
        struct ptrauth_keys_user *keys = &target->thread.keys_user;
        struct user_pac_address_keys user_keys;

        if (!system_supports_address_auth())
                return -EINVAL;

        pac_address_keys_to_user(&user_keys, keys);

        return membuf_write(&to, &user_keys, sizeof(user_keys));
}

/*
 * Regset write handler for the address authentication keys.
 *
 * The staging buffer is seeded with the task's current keys before the
 * caller's data is copied over it, and the task's keys are only replaced
 * once the copy has succeeded.
 *
 * Returns 0 on success, -EINVAL when address authentication is
 * unsupported, or the error from user_regset_copyin().
 */
static int pac_address_keys_set(struct task_struct *target,
                                const struct user_regset *regset,
                                unsigned int pos, unsigned int count,
                                const void *kbuf, const void __user *ubuf)
{
        struct ptrauth_keys_user *task_keys = &target->thread.keys_user;
        struct user_pac_address_keys staged;
        int err;

        if (!system_supports_address_auth())
                return -EINVAL;

        pac_address_keys_to_user(&staged, task_keys);

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &staged, 0, -1);
        if (!err)
                pac_address_keys_from_user(task_keys, &staged);

        return err;
}

/* Convert the generic authentication key (APGA) to the 128-bit user layout. */
static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys,
                                     const struct ptrauth_keys_user *keys)
{
        const __uint128_t packed = pac_key_to_user(&keys->apga);

        ukeys->apgakey = packed;
}

static void pac_generic_keys_from_user(struct ptrauth_keys_user *keys,
                                       const struct user_pac_generic_keys *ukeys)
{
        keys->apga = pac_key_from_user(ukeys->apgakey);
}

static int pac_generic_keys_get(struct task_struct *target,
                                const struct user_regset *regset,
                                struct membuf to)
{
        struct ptrauth_keys_user *keys = &target->thread.keys_user;
        struct user_pac_generic_keys user_keys;

        if (!system_supports_generic_auth())
                return -EINVAL;

        pac_generic_keys_to_user(&user_keys, keys);

        return membuf_write(&to, &user_keys, sizeof(user_keys));
}

/*
 * Regset write handler for the generic authentication key.
 *
 * The staging buffer is seeded with the task's current key before the
 * caller's data is copied over it, and the task's key is only replaced
 * once the copy has succeeded.
 *
 * Returns 0 on success, -EINVAL when generic authentication is
 * unsupported, or the error from user_regset_copyin().
 */
static int pac_generic_keys_set(struct task_struct *target,
                                const struct user_regset *regset,
                                unsigned int pos, unsigned int count,
                                const void *kbuf, const void __user *ubuf)
{
        struct ptrauth_keys_user *task_keys = &target->thread.keys_user;
        struct user_pac_generic_keys staged;
        int err;

        if (!system_supports_generic_auth())
                return -EINVAL;

        pac_generic_keys_to_user(&staged, task_keys);

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &staged, 0, -1);
        if (!err)
                pac_generic_keys_from_user(task_keys, &staged);

        return err;
}
#endif /* CONFIG_CHECKPOINT_RESTORE */
#endif /* CONFIG_ARM64_PTR_AUTH */

#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
/*
 * Regset read handler for the tagged address control word.
 *
 * An error value from get_tagged_addr_ctrl() triggers a one-time warning
 * and is returned as-is; otherwise the value is written into @to.
 */
static int tagged_addr_ctrl_get(struct task_struct *target,
                                const struct user_regset *regset,
                                struct membuf to)
{
        long value = get_tagged_addr_ctrl(target);

        if (WARN_ON_ONCE(IS_ERR_VALUE(value)))
                return value;

        return membuf_write(&to, &value, sizeof(value));
}

/*
 * Regset write handler for the tagged address control word.
 *
 * The current value seeds the buffer the caller's data is copied over;
 * the result is then applied through set_tagged_addr_ctrl().
 *
 * Returns the error value from get_tagged_addr_ctrl() (after a one-time
 * warning), the error from user_regset_copyin(), or the result of
 * set_tagged_addr_ctrl().
 */
static int tagged_addr_ctrl_set(struct task_struct *target,
                                const struct user_regset *regset,
                                unsigned int pos, unsigned int count,
                                const void *kbuf, const void __user *ubuf)
{
        long value = get_tagged_addr_ctrl(target);
        int err;

        if (WARN_ON_ONCE(IS_ERR_VALUE(value)))
                return value;

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &value, 0, -1);
        if (err)
                return err;

        return set_tagged_addr_ctrl(target, value);
}
#endif

#ifdef CONFIG_ARM64_POE
/*
 * Regset read handler for the task's saved POR_EL0 value.
 *
 * Returns -EINVAL when POE is unsupported, otherwise the result of writing
 * thread.por_el0 into @to.
 */
static int poe_get(struct task_struct *target,
                   const struct user_regset *regset,
                   struct membuf to)
{
        if (system_supports_poe())
                return membuf_write(&to, &target->thread.por_el0,
                                    sizeof(target->thread.por_el0));

        return -EINVAL;
}

/*
 * Regset write handler for the task's saved POR_EL0 value.
 *
 * The current value seeds the buffer the caller's data is copied over and
 * thread.por_el0 is only updated once the copy has succeeded.
 *
 * Returns 0 on success, -EINVAL when POE is unsupported, or the error from
 * user_regset_copyin().
 */
static int poe_set(struct task_struct *target,
                   const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        long por;
        int err;

        if (!system_supports_poe())
                return -EINVAL;

        por = target->thread.por_el0;

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &por, 0, -1);
        if (!err)
                target->thread.por_el0 = por;

        return err;
}
#endif

#ifdef CONFIG_ARM64_GCS
/* Fill @user_gcs from the GCS pointer, mode and lock fields of @target. */
static void task_gcs_to_user(struct user_gcs *user_gcs,
                             const struct task_struct *target)
{
        user_gcs->gcspr_el0 = target->thread.gcspr_el0;
        user_gcs->features_enabled = target->thread.gcs_el0_mode;
        user_gcs->features_locked = target->thread.gcs_el0_locked;
}

/* Store the GCS pointer, mode and lock fields of @user_gcs into @target. */
static void task_gcs_from_user(struct task_struct *target,
                               const struct user_gcs *user_gcs)
{
        target->thread.gcspr_el0 = user_gcs->gcspr_el0;
        target->thread.gcs_el0_mode = user_gcs->features_enabled;
        target->thread.gcs_el0_locked = user_gcs->features_locked;
}

static int gcs_get(struct task_struct *target,
                   const struct user_regset *regset,
                   struct membuf to)
{
        struct user_gcs user_gcs;

        if (!system_supports_gcs())
                return -EINVAL;

        if (target == current)
                gcs_preserve_current_state();

        task_gcs_to_user(&user_gcs, target);

        return membuf_write(&to, &user_gcs, sizeof(user_gcs));
}

/*
 * Regset write handler for the Guarded Control Stack state.
 *
 * The staging buffer is seeded with the task's current values before the
 * caller's data is copied over it. The new state is rejected if
 * features_enabled carries bits outside
 * PR_SHADOW_STACK_SUPPORTED_STATUS_MASK.
 *
 * Returns 0 on success, -EINVAL when GCS is unsupported or the requested
 * features are invalid, or the error from user_regset_copyin().
 */
static int gcs_set(struct task_struct *target,
                   const struct user_regset *regset,
                   unsigned int pos, unsigned int count,
                   const void *kbuf, const void __user *ubuf)
{
        struct user_gcs staged;
        int err;

        if (!system_supports_gcs())
                return -EINVAL;

        task_gcs_to_user(&staged, target);

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &staged, 0, -1);
        if (err)
                return err;

        if (staged.features_enabled & ~PR_SHADOW_STACK_SUPPORTED_STATUS_MASK)
                return -EINVAL;

        task_gcs_from_user(target, &staged);

        return 0;
}
#endif

/*
 * Indices of the native register sets. The enumerators are used as
 * designated-initializer indices in aarch64_regsets[]; the leading ones
 * (GPR, FPR, TLS, HW_BREAK, HW_WATCH, SYSTEM_CALL) are also used to index
 * aarch32_ptrace_regsets[]. Entries guarded by a config option only exist
 * when that option is enabled, so the numeric values of later enumerators
 * depend on the kernel configuration.
 */
enum aarch64_regset {
        REGSET_GPR,
        REGSET_FPR,
        REGSET_TLS,
#ifdef CONFIG_HAVE_HW_BREAKPOINT
        REGSET_HW_BREAK,
        REGSET_HW_WATCH,
#endif
        REGSET_FPMR,
        REGSET_SYSTEM_CALL,
#ifdef CONFIG_ARM64_SVE
        REGSET_SVE,
#endif
#ifdef CONFIG_ARM64_SME
        REGSET_SSVE,
        REGSET_ZA,
        REGSET_ZT,
#endif
#ifdef CONFIG_ARM64_PTR_AUTH
        REGSET_PAC_MASK,
        REGSET_PAC_ENABLED_KEYS,
#ifdef CONFIG_CHECKPOINT_RESTORE
        REGSET_PACA_KEYS,
        REGSET_PACG_KEYS,
#endif
#endif
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
        REGSET_TAGGED_ADDR_CTRL,
#endif
#ifdef CONFIG_ARM64_POE
        REGSET_POE,
#endif
#ifdef CONFIG_ARM64_GCS
        REGSET_GCS,
#endif
};

/*
 * Native (AArch64) register set table, indexed by enum aarch64_regset.
 *
 * Each entry names the core note type, the register count (.n), the size
 * and alignment of one register, and the read (.regset_get) and write
 * (.set) handlers. Entries are placed with designated initializers, so
 * their position in this source listing need not match the enum order.
 */
static const struct user_regset aarch64_regsets[] = {
        [REGSET_GPR] = {
                .core_note_type = NT_PRSTATUS,
                .n = sizeof(struct user_pt_regs) / sizeof(u64),
                .size = sizeof(u64),
                .align = sizeof(u64),
                .regset_get = gpr_get,
                .set = gpr_set
        },
        [REGSET_FPR] = {
                .core_note_type = NT_PRFPREG,
                .n = sizeof(struct user_fpsimd_state) / sizeof(u32),
                /*
                 * We pretend we have 32-bit registers because the fpsr and
                 * fpcr are 32-bits wide.
                 */
                .size = sizeof(u32),
                .align = sizeof(u32),
                .active = fpr_active,
                .regset_get = fpr_get,
                .set = fpr_set
        },
        [REGSET_TLS] = {
                .core_note_type = NT_ARM_TLS,
                .n = 2,
                .size = sizeof(void *),
                .align = sizeof(void *),
                .regset_get = tls_get,
                .set = tls_set,
        },
#ifdef CONFIG_HAVE_HW_BREAKPOINT
        /* Breakpoints and watchpoints share one pair of handlers */
        [REGSET_HW_BREAK] = {
                .core_note_type = NT_ARM_HW_BREAK,
                .n = sizeof(struct user_hwdebug_state) / sizeof(u32),
                .size = sizeof(u32),
                .align = sizeof(u32),
                .regset_get = hw_break_get,
                .set = hw_break_set,
        },
        [REGSET_HW_WATCH] = {
                .core_note_type = NT_ARM_HW_WATCH,
                .n = sizeof(struct user_hwdebug_state) / sizeof(u32),
                .size = sizeof(u32),
                .align = sizeof(u32),
                .regset_get = hw_break_get,
                .set = hw_break_set,
        },
#endif
        [REGSET_SYSTEM_CALL] = {
                .core_note_type = NT_ARM_SYSTEM_CALL,
                .n = 1,
                .size = sizeof(int),
                .align = sizeof(int),
                .regset_get = system_call_get,
                .set = system_call_set,
        },
        [REGSET_FPMR] = {
                .core_note_type = NT_ARM_FPMR,
                .n = 1,
                .size = sizeof(u64),
                .align = sizeof(u64),
                .regset_get = fpmr_get,
                .set = fpmr_set,
        },
#ifdef CONFIG_ARM64_SVE
        [REGSET_SVE] = { /* Scalable Vector Extension */
                .core_note_type = NT_ARM_SVE,
                .n = DIV_ROUND_UP(SVE_PT_SIZE(ARCH_SVE_VQ_MAX,
                                              SVE_PT_REGS_SVE),
                                  SVE_VQ_BYTES),
                .size = SVE_VQ_BYTES,
                .align = SVE_VQ_BYTES,
                .regset_get = sve_get,
                .set = sve_set,
        },
#endif
#ifdef CONFIG_ARM64_SME
        [REGSET_SSVE] = { /* Streaming mode SVE */
                .core_note_type = NT_ARM_SSVE,
                .n = DIV_ROUND_UP(SVE_PT_SIZE(SME_VQ_MAX, SVE_PT_REGS_SVE),
                                  SVE_VQ_BYTES),
                .size = SVE_VQ_BYTES,
                .align = SVE_VQ_BYTES,
                .regset_get = ssve_get,
                .set = ssve_set,
        },
        [REGSET_ZA] = { /* SME ZA */
                .core_note_type = NT_ARM_ZA,
                /*
                 * ZA is a single register but it's variably sized and
                 * the ptrace core requires that the size of any data
                 * be an exact multiple of the configured register
                 * size so report as though we had SVE_VQ_BYTES
                 * registers. These values aren't exposed to
                 * userspace.
                 */
                .n = DIV_ROUND_UP(ZA_PT_SIZE(SME_VQ_MAX), SVE_VQ_BYTES),
                .size = SVE_VQ_BYTES,
                .align = SVE_VQ_BYTES,
                .regset_get = za_get,
                .set = za_set,
        },
        [REGSET_ZT] = { /* SME ZT */
                .core_note_type = NT_ARM_ZT,
                .n = 1,
                .size = ZT_SIG_REG_BYTES,
                .align = sizeof(u64),
                .regset_get = zt_get,
                .set = zt_set,
        },
#endif
#ifdef CONFIG_ARM64_PTR_AUTH
        [REGSET_PAC_MASK] = {
                .core_note_type = NT_ARM_PAC_MASK,
                .n = sizeof(struct user_pac_mask) / sizeof(u64),
                .size = sizeof(u64),
                .align = sizeof(u64),
                .regset_get = pac_mask_get,
                /* this cannot be set dynamically */
        },
        [REGSET_PAC_ENABLED_KEYS] = {
                .core_note_type = NT_ARM_PAC_ENABLED_KEYS,
                .n = 1,
                .size = sizeof(long),
                .align = sizeof(long),
                .regset_get = pac_enabled_keys_get,
                .set = pac_enabled_keys_set,
        },
#ifdef CONFIG_CHECKPOINT_RESTORE
        /* The key regsets only exist with checkpoint/restore support */
        [REGSET_PACA_KEYS] = {
                .core_note_type = NT_ARM_PACA_KEYS,
                .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t),
                .size = sizeof(__uint128_t),
                .align = sizeof(__uint128_t),
                .regset_get = pac_address_keys_get,
                .set = pac_address_keys_set,
        },
        [REGSET_PACG_KEYS] = {
                .core_note_type = NT_ARM_PACG_KEYS,
                .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t),
                .size = sizeof(__uint128_t),
                .align = sizeof(__uint128_t),
                .regset_get = pac_generic_keys_get,
                .set = pac_generic_keys_set,
        },
#endif
#endif
#ifdef CONFIG_ARM64_TAGGED_ADDR_ABI
        [REGSET_TAGGED_ADDR_CTRL] = {
                .core_note_type = NT_ARM_TAGGED_ADDR_CTRL,
                .n = 1,
                .size = sizeof(long),
                .align = sizeof(long),
                .regset_get = tagged_addr_ctrl_get,
                .set = tagged_addr_ctrl_set,
        },
#endif
#ifdef CONFIG_ARM64_POE
        [REGSET_POE] = {
                .core_note_type = NT_ARM_POE,
                .n = 1,
                .size = sizeof(long),
                .align = sizeof(long),
                .regset_get = poe_get,
                .set = poe_set,
        },
#endif
#ifdef CONFIG_ARM64_GCS
        [REGSET_GCS] = {
                .core_note_type = NT_ARM_GCS,
                .n = sizeof(struct user_gcs) / sizeof(u64),
                .size = sizeof(u64),
                .align = sizeof(u64),
                .regset_get = gcs_get,
                .set = gcs_set,
        },
#endif
};

/* Regset view describing the native AArch64 register sets. */
static const struct user_regset_view user_aarch64_view = {
        .name = "aarch64",
        .e_machine = EM_AARCH64,
        .regsets = aarch64_regsets,
        .n = ARRAY_SIZE(aarch64_regsets)
};

/* Indices into aarch32_regsets[], the regset table behind user_aarch32_view. */
enum compat_regset {
        REGSET_COMPAT_GPR,
        REGSET_COMPAT_VFP,
};

/*
 * Read one AArch32 user register of @task by index.
 *
 * Index 15 maps to the saved pc, 16 to the saved pstate converted to a
 * compat PSR, and 17 to orig_x0; every other index reads regs[idx]
 * directly. The index is not range checked here.
 */
static inline compat_ulong_t compat_get_user_reg(struct task_struct *task, int idx)
{
        struct pt_regs *regs = task_pt_regs(task);

        if (idx == 15)
                return regs->pc;

        if (idx == 16)
                return pstate_to_compat_psr(regs->pstate);

        if (idx == 17)
                return regs->orig_x0;

        return regs->regs[idx];
}

/*
 * Regset read handler for the AArch32 general purpose registers: stores
 * consecutive registers, starting at index 0, until @to has no space left.
 * Always returns 0.
 */
static int compat_gpr_get(struct task_struct *target,
                          const struct user_regset *regset,
                          struct membuf to)
{
        int reg;

        for (reg = 0; to.left; reg++)
                membuf_store(&to, compat_get_user_reg(target, reg));

        return 0;
}

/*
 * Regset write handler for the AArch32 general purpose registers.
 *
 * @pos and @count are converted to a register range using regset->size.
 * The new values are staged in a copy of the task's pt_regs, which is
 * written back only if valid_user_regs() accepts it.
 *
 * Returns 0 on success, -EIO if the range exceeds regset->n, -EINVAL if
 * the staged registers are rejected, or -EFAULT if reading from @ubuf
 * failed. In the -EFAULT case the registers read before the fault are
 * still validated and, if valid, committed.
 */
static int compat_gpr_set(struct task_struct *target,
                          const struct user_regset *regset,
                          unsigned int pos, unsigned int count,
                          const void *kbuf, const void __user *ubuf)
{
        struct pt_regs staged;
        /* First register addressed by pos, and how many count covers */
        unsigned int first = pos / regset->size;
        unsigned int nregs = count / regset->size;
        unsigned int idx;
        int err = 0;

        if (first + nregs > regset->n)
                return -EIO;

        staged = *task_pt_regs(target);

        for (idx = first; idx < first + nregs; idx++) {
                compat_ulong_t val;

                /* Fetch the next value from whichever buffer was supplied */
                if (kbuf) {
                        memcpy(&val, kbuf, sizeof(val));
                        kbuf += sizeof(val);
                } else if (copy_from_user(&val, ubuf, sizeof(val))) {
                        err = -EFAULT;
                        break;
                } else {
                        ubuf += sizeof(val);
                }

                if (idx == 15) {
                        staged.pc = val;
                } else if (idx == 16) {
                        /* The converted PSR goes through the 32-bit temporary */
                        val = compat_psr_to_pstate(val);
                        staged.pstate = val;
                } else if (idx == 17) {
                        staged.orig_x0 = val;
                } else {
                        staged.regs[idx] = val;
                }
        }

        if (!valid_user_regs(&staged.user_regs, target))
                return -EINVAL;

        *task_pt_regs(target) = staged;

        return err;
}

/*
 * Regset read handler for the AArch32 VFP state.
 *
 * Returns -EINVAL when FPSIMD is unsupported, otherwise the result of the
 * final membuf_store() of the synthesized FPSCR.
 */
static int compat_vfp_get(struct task_struct *target,
                          const struct user_regset *regset,
                          struct membuf to)
{
        struct user_fpsimd_state *fpstate;
        compat_ulong_t fpscr;

        if (!system_supports_fpsimd())
                return -EINVAL;

        fpstate = &target->thread.uw.fpsimd_state;

        if (target == current)
                fpsimd_preserve_current_state();

        /*
         * The VFP registers sit packed together at the start of the
         * fpsimd_state, so they can be emitted in one go; only the FPSCR
         * has to be assembled separately from fpsr and fpcr.
         */
        membuf_write(&to, fpstate, VFP_STATE_SIZE - sizeof(compat_ulong_t));

        fpscr = (fpstate->fpsr & VFP_FPSCR_STAT_MASK) |
                (fpstate->fpcr & VFP_FPSCR_CTRL_MASK);

        return membuf_store(&to, fpscr);
}

/*
 * Regset write handler for the AArch32 VFP state.
 *
 * The vector registers are copied straight into the task's fpsimd_state.
 * If data remains after that, the trailing word is taken as the FPSCR and
 * split into fpsr and fpcr. The task's FPSIMD state is flushed once the
 * copy has been attempted, whether or not it succeeded.
 *
 * Returns 0 on success, -EINVAL when FPSIMD is unsupported, or the error
 * from user_regset_copyin().
 */
static int compat_vfp_set(struct task_struct *target,
                          const struct user_regset *regset,
                          unsigned int pos, unsigned int count,
                          const void *kbuf, const void __user *ubuf)
{
        struct user_fpsimd_state *fpstate;
        compat_ulong_t fpscr;
        int err, fpscr_pos;

        if (!system_supports_fpsimd())
                return -EINVAL;

        fpstate = &target->thread.uw.fpsimd_state;

        /* The FPSCR occupies the last word of the VFP state */
        fpscr_pos = VFP_STATE_SIZE - sizeof(compat_ulong_t);

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, fpstate, 0,
                                 fpscr_pos);
        if (err || !count)
                goto flush;

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpscr,
                                 fpscr_pos, VFP_STATE_SIZE);
        if (err)
                goto flush;

        fpstate->fpsr = fpscr & VFP_FPSCR_STAT_MASK;
        fpstate->fpcr = fpscr & VFP_FPSCR_CTRL_MASK;

flush:
        fpsimd_flush_task_state(target);
        return err;
}

static int compat_tls_get(struct task_struct *target,
                          const struct user_regset *regset,
                          struct membuf to)
{
        return membuf_store(&to, (compat_ulong_t)target->thread.uw.tp_value);
}

/*
 * Regset write handler for the AArch32 TLS value.
 *
 * The current value seeds the buffer the caller's data is copied over and
 * thread.uw.tp_value is only updated once the copy has succeeded.
 *
 * Returns 0 on success or the error from user_regset_copyin().
 */
static int compat_tls_set(struct task_struct *target,
                          const struct user_regset *regset, unsigned int pos,
                          unsigned int count, const void *kbuf,
                          const void __user *ubuf)
{
        compat_ulong_t tp = target->thread.uw.tp_value;
        int err;

        err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &tp, 0, -1);
        if (!err)
                target->thread.uw.tp_value = tp;

        return err;
}

/*
 * AArch32 register set table indexed by enum compat_regset. It is exposed
 * through user_aarch32_view and contains only the general purpose and VFP
 * register sets.
 */
static const struct user_regset aarch32_regsets[] = {
        [REGSET_COMPAT_GPR] = {
                .core_note_type = NT_PRSTATUS,
                .n = COMPAT_ELF_NGREG,
                .size = sizeof(compat_elf_greg_t),
                .align = sizeof(compat_elf_greg_t),
                .regset_get = compat_gpr_get,
                .set = compat_gpr_set
        },
        [REGSET_COMPAT_VFP] = {
                .core_note_type = NT_ARM_VFP,
                .n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
                .size = sizeof(compat_ulong_t),
                .align = sizeof(compat_ulong_t),
                .active = fpr_active,
                .regset_get = compat_vfp_get,
                .set = compat_vfp_set
        },
};

/* Regset view over aarch32_regsets[] (GPR and VFP only). */
static const struct user_regset_view user_aarch32_view = {
        .name = "aarch32",
        .e_machine = EM_ARM,
        .regsets = aarch32_regsets,
        .n = ARRAY_SIZE(aarch32_regsets)
};

/*
 * AArch32 register set table exposed through user_aarch32_ptrace_view.
 *
 * Unlike aarch32_regsets[], this table is indexed with the native
 * enum aarch64_regset values, and it additionally carries the TLS,
 * hardware breakpoint/watchpoint and system call register sets.
 *
 * NOTE(review): enum aarch64_regset places REGSET_FPMR before
 * REGSET_SYSTEM_CALL and this table has no [REGSET_FPMR] initializer, so
 * that slot is left zero-initialized (no handlers, zero size). Confirm
 * that consumers of the view tolerate such an empty entry.
 */
static const struct user_regset aarch32_ptrace_regsets[] = {
        [REGSET_GPR] = {
                .core_note_type = NT_PRSTATUS,
                .n = COMPAT_ELF_NGREG,
                .size = sizeof(compat_elf_greg_t),
                .align = sizeof(compat_elf_greg_t),
                .regset_get = compat_gpr_get,
                .set = compat_gpr_set
        },
        [REGSET_FPR] = {
                .core_note_type = NT_ARM_VFP,
                .n = VFP_STATE_SIZE / sizeof(compat_ulong_t),
                .size = sizeof(compat_ulong_t),
                .align = sizeof(compat_ulong_t),
                .regset_get = compat_vfp_get,
                .set = compat_vfp_set
        },
        [REGSET_TLS] = {
                .core_note_type = NT_ARM_TLS,
                .n = 1,
                .size = sizeof(compat_ulong_t),
                .align = sizeof(compat_ulong_t),
                .regset_get = compat_tls_get,
                .set = compat_tls_set,
        },
#ifdef CONFIG_HAVE_HW_BREAKPOINT
        /* Same handlers as the native breakpoint/watchpoint regsets */
        [REGSET_HW_BREAK] = {
                .core_note_type = NT_ARM_HW_BREAK,
                .n = sizeof(struct user_hwdebug_state) / sizeof(u32),
                .size = sizeof(u32),
                .align = sizeof(u32),
                .regset_get = hw_break_get,
                .set = hw_break_set,
        },
        [REGSET_HW_WATCH] = {
                .core_note_type = NT_ARM_HW_WATCH,
                .n = sizeof(struct user_hwdebug_state) / sizeof(u32),
                .size = sizeof(u32),
                .align = sizeof(u32),
                .regset_get = hw_break_get,
                .set = hw_break_set,
        },
#endif
        [REGSET_SYSTEM_CALL] = {
                .core_note_type = NT_ARM_SYSTEM_CALL,
                .n = 1,
                .size = sizeof(int),
                .align = sizeof(int),
                .regset_get = system_call_get,
                .set = system_call_set,
        },
};

/* Regset view over aarch32_ptrace_regsets[]. */
static const struct user_regset_view user_aarch32_ptrace_view = {
        .name = "aarch32",
        .e_machine = EM_ARM,
        .regsets = aarch32_ptrace_regsets,
        .n = ARRAY_SIZE(aarch32_ptrace_regsets)
};

#ifdef CONFIG_COMPAT
/*
 * Read one word of the AArch32 "user area" at byte offset @off and store
 * it to the user pointer @ret.
 *
 * Three fixed offsets report the text start, data start and text end of
 * the task's mm. Offsets inside the general purpose register set read the
 * corresponding register; any other offset below COMPAT_USER_SZ reads as
 * zero.
 *
 * Returns -EIO for an unaligned offset or one at or beyond
 * COMPAT_USER_SZ, otherwise the result of put_user().
 */
static int compat_ptrace_read_user(struct task_struct *tsk, compat_ulong_t off,
                                   compat_ulong_t __user *ret)
{
        compat_ulong_t val;

        if (off & 3)
                return -EIO;

        switch (off) {
        case COMPAT_PT_TEXT_ADDR:
                val = tsk->mm->start_code;
                break;
        case COMPAT_PT_DATA_ADDR:
                val = tsk->mm->start_data;
                break;
        case COMPAT_PT_TEXT_END_ADDR:
                val = tsk->mm->end_code;
                break;
        default:
                if (off < sizeof(compat_elf_gregset_t))
                        val = compat_get_user_reg(tsk, off >> 2);
                else if (off >= COMPAT_USER_SZ)
                        return -EIO;
                else
                        val = 0;
                break;
        }

        return put_user(val, ret);
}

/*
 * Write one word of the AArch32 "user area" at byte offset @off.
 *
 * Writes beyond the general purpose register set (but below
 * COMPAT_USER_SZ) are silently accepted and ignored. Register writes are
 * staged in a copy of the task's pt_regs and committed only if
 * valid_user_regs() accepts the result.
 *
 * Returns 0 on success, -EIO for an unaligned or out-of-range offset, or
 * -EINVAL if the resulting register state is rejected.
 */
static int compat_ptrace_write_user(struct task_struct *tsk, compat_ulong_t off,
                                    compat_ulong_t val)
{
        struct pt_regs staged;
        unsigned int reg;

        if ((off & 3) || off >= COMPAT_USER_SZ)
                return -EIO;

        if (off >= sizeof(compat_elf_gregset_t))
                return 0;

        staged = *task_pt_regs(tsk);
        reg = off / 4;

        if (reg == 15)
                staged.pc = val;
        else if (reg == 16)
                staged.pstate = compat_psr_to_pstate(val);
        else if (reg == 17)
                staged.orig_x0 = val;
        else
                staged.regs[reg] = val;

        if (!valid_user_regs(&staged.user_regs, tsk))
                return -EINVAL;

        *task_pt_regs(tsk) = staged;

        return 0;
}

#ifdef CONFIG_HAVE_HW_BREAKPOINT

/*
 * Map a virtual debug register number onto an index into a thread_info
 * breakpoint array. Positive numbers name breakpoints and negative numbers
 * name watchpoints. Registers come in (address, control) pairs, and each
 * pair corresponds to one hw_breakpoint struct, so two consecutive numbers
 * share an index. Register 0 is reserved for resource information.
 */
static int compat_ptrace_hbp_num_to_idx(compat_long_t num)
{
        compat_long_t magnitude = abs(num);

        return (magnitude - 1) >> 1;
}

/*
 * Build the resource information word reported for debug register 0.
 *
 * Layout of *kdata, most significant byte first: debug architecture,
 * watchpoint length (fixed at 8), number of watchpoint slots, number of
 * breakpoint slots. Each field is truncated to 8 bits. Always returns 0.
 */
static int compat_ptrace_hbp_get_resource_info(u32 *kdata)
{
        u8 num_brps = hw_breakpoint_slots(TYPE_INST);
        u8 num_wrps = hw_breakpoint_slots(TYPE_DATA);
        u8 debug_arch = debug_monitors_arch();
        u8 wp_len = 8;

        *kdata = ((u32)debug_arch << 24) |
                 ((u32)wp_len << 16) |
                 ((u32)num_wrps << 8) |
                 (u32)num_brps;

        return 0;
}

/*
 * Read one virtual debug register of @tsk into *kdata.
 *
 * Odd register numbers select the address half of a pair (truncated to
 * 32 bits), even numbers the control half. *kdata is written even when
 * the underlying accessor fails; in that case it carries the accessor's
 * output variable, which starts out as 0.
 *
 * Returns the result of ptrace_hbp_get_addr() or ptrace_hbp_get_ctrl().
 */
static int compat_ptrace_hbp_get(unsigned int note_type,
                                 struct task_struct *tsk,
                                 compat_long_t num,
                                 u32 *kdata)
{
        int slot = compat_ptrace_hbp_num_to_idx(num);
        int err;

        if (num & 1) {
                u64 addr = 0;

                err = ptrace_hbp_get_addr(note_type, tsk, slot, &addr);
                *kdata = (u32)addr;
                return err;
        }

        {
                u32 ctrl = 0;

                err = ptrace_hbp_get_ctrl(note_type, tsk, slot, &ctrl);
                *kdata = ctrl;
        }

        return err;
}

/*
 * Write *kdata to one virtual debug register of @tsk.
 *
 * Odd register numbers select the address half of a pair, even numbers
 * the control half.
 *
 * Returns the result of ptrace_hbp_set_addr() or ptrace_hbp_set_ctrl().
 */
static int compat_ptrace_hbp_set(unsigned int note_type,
                                 struct task_struct *tsk,
                                 compat_long_t num,
                                 u32 *kdata)
{
        int slot = compat_ptrace_hbp_num_to_idx(num);

        if (num & 1) {
                u64 addr = *kdata;

                return ptrace_hbp_set_addr(note_type, tsk, slot, addr);
        } else {
                u32 ctrl = *kdata;

                return ptrace_hbp_set_ctrl(note_type, tsk, slot, ctrl);
        }
}

/*
 * Fetch a virtual debug register and store it to the user pointer @data.
 *
 * @num selects what is read: 0 is the resource information word, negative
 * numbers are watchpoint registers, positive numbers breakpoint registers.
 *
 * Returns the error from the lookup if it fails, otherwise the result of
 * put_user().
 */
static int compat_ptrace_gethbpregs(struct task_struct *tsk, compat_long_t num,
                                    compat_ulong_t __user *data)
{
        u32 kdata;
        int err;

        if (num == 0)
                err = compat_ptrace_hbp_get_resource_info(&kdata);
        else
                err = compat_ptrace_hbp_get(num < 0 ? NT_ARM_HW_WATCH :
                                                      NT_ARM_HW_BREAK,
                                            tsk, num, &kdata);
        if (err)
                return err;

        return put_user(kdata, data);
}

/*
 * COMPAT_PTRACE_SETHBPREGS backend: fetch a 32-bit value from the user
 * pointer @data and write it to the register selected by @num.
 *
 * num == 0 is accepted and ignored (returns 0 without reading @data).
 * Negative numbers address watchpoints, positive numbers breakpoints.
 */
static int compat_ptrace_sethbpregs(struct task_struct *tsk, compat_long_t num,
                                    compat_ulong_t __user *data)
{
        unsigned int note_type;
        u32 kdata = 0;
        int ret;

        if (num == 0)
                return 0;

        ret = get_user(kdata, data);
        if (ret)
                return ret;

        note_type = (num < 0) ? NT_ARM_HW_WATCH : NT_ARM_HW_BREAK;

        return compat_ptrace_hbp_set(note_type, tsk, num, &kdata);
}
#endif        /* CONFIG_HAVE_HW_BREAKPOINT */

/*
 * Architecture hook for compat ptrace requests.
 *
 * Requests with a dedicated case below are handled here; everything else is
 * forwarded to compat_ptrace_request(). @caddr and @cdata are widened to
 * unsigned long, and @cdata is additionally converted to a user pointer with
 * compat_ptr() for the requests that treat it as one.
 */
long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
                        compat_ulong_t caddr, compat_ulong_t cdata)
{
        unsigned long addr = caddr;
        unsigned long data = cdata;
        void __user *udata = compat_ptr(data);

        switch (request) {
        case PTRACE_PEEKUSR:
                return compat_ptrace_read_user(child, addr, udata);

        case PTRACE_POKEUSR:
                return compat_ptrace_write_user(child, addr, data);

        case COMPAT_PTRACE_GETREGS:
                return copy_regset_to_user(child, &user_aarch32_view,
                                           REGSET_COMPAT_GPR, 0,
                                           sizeof(compat_elf_gregset_t),
                                           udata);

        case COMPAT_PTRACE_SETREGS:
                return copy_regset_from_user(child, &user_aarch32_view,
                                             REGSET_COMPAT_GPR, 0,
                                             sizeof(compat_elf_gregset_t),
                                             udata);

        case COMPAT_PTRACE_GET_THREAD_AREA:
                return put_user((compat_ulong_t)child->thread.uw.tp_value,
                                (compat_ulong_t __user *)udata);

        case COMPAT_PTRACE_SET_SYSCALL:
                task_pt_regs(child)->syscallno = data;
                return 0;

        case COMPAT_PTRACE_GETVFPREGS:
                return copy_regset_to_user(child, &user_aarch32_view,
                                           REGSET_COMPAT_VFP, 0,
                                           VFP_STATE_SIZE, udata);

        case COMPAT_PTRACE_SETVFPREGS:
                return copy_regset_from_user(child, &user_aarch32_view,
                                             REGSET_COMPAT_VFP, 0,
                                             VFP_STATE_SIZE, udata);

#ifdef CONFIG_HAVE_HW_BREAKPOINT
        case COMPAT_PTRACE_GETHBPREGS:
                return compat_ptrace_gethbpregs(child, addr, udata);

        case COMPAT_PTRACE_SETHBPREGS:
                return compat_ptrace_sethbpregs(child, addr, udata);
#endif

        default:
                /* The generic result is passed through int before returning. */
                return (int)compat_ptrace_request(child, request, addr, data);
        }
}
#endif /* CONFIG_COMPAT */

/*
 * Pick the regset view to use for @task.
 *
 * Core dumping of 32-bit tasks or compat ptrace requests must use the
 * user_aarch32_view compatible with arm32. Native ptrace requests on
 * 32-bit children use an extended user_aarch32_ptrace_view to allow
 * access to the TLS register. Everything else gets user_aarch64_view.
 */
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
        /* The caller itself is a compat task. */
        if (is_compat_task())
                return &user_aarch32_view;

        /* Native caller looking at a compat thread. */
        if (is_compat_thread(task_thread_info(task)))
                return &user_aarch32_ptrace_view;

        return &user_aarch64_view;
}

/*
 * Architecture hook for native ptrace requests: the two MTE tag requests go
 * to mte_ptrace_copy_tags(), everything else to the generic ptrace_request().
 */
long arch_ptrace(struct task_struct *child, long request,
                 unsigned long addr, unsigned long data)
{
        if (request == PTRACE_PEEKMTETAGS || request == PTRACE_POKEMTETAGS)
                return mte_ptrace_copy_tags(child, request, addr, data);

        return ptrace_request(child, request, addr, data);
}

/*
 * Direction of a syscall stop. report_syscall() stores this value in the
 * tracee register it temporarily clobbers (regs[12] for compat tasks,
 * regs[7] otherwise) so that entry stops and exit stops can be told apart.
 */
enum ptrace_syscall_dir {
        PTRACE_SYSCALL_ENTER = 0,
        PTRACE_SYSCALL_EXIT,
};

/*
 * Report a syscall-entry or syscall-exit stop through the ptrace helpers.
 *
 * @regs: register state; one regs[] slot is temporarily overwritten with @dir.
 * @dir:  PTRACE_SYSCALL_ENTER or PTRACE_SYSCALL_EXIT.
 *
 * On entry, forget_syscall(regs) is called if ptrace_report_syscall_entry()
 * returns non-zero. In every path the overwritten register is put back to
 * its saved value before this function returns.
 */
static void report_syscall(struct pt_regs *regs, enum ptrace_syscall_dir dir)
{
        int regno;
        unsigned long saved_reg;

        /*
         * We have some ABI weirdness here in the way that we handle syscall
         * exit stops because we indicate whether or not the stop has been
         * signalled from syscall entry or syscall exit by clobbering a general
         * purpose register (ip/r12 for AArch32, x7 for AArch64) in the tracee
         * and restoring its old value after the stop. This means that:
         *
         * - Any writes by the tracer to this register during the stop are
         *   ignored/discarded.
         *
         * - The actual value of the register is not available during the stop,
         *   so the tracer cannot save it and restore it later.
         *
         * - Syscall stops behave differently to seccomp and pseudo-step traps
         *   (the latter do not nobble any registers).
         */
        regno = (is_compat_task() ? 12 : 7);
        saved_reg = regs->regs[regno];
        regs->regs[regno] = dir;

        if (dir == PTRACE_SYSCALL_ENTER) {
                /* Entry stop: report, then undo the clobber. */
                if (ptrace_report_syscall_entry(regs))
                        forget_syscall(regs);
                regs->regs[regno] = saved_reg;
        } else if (!test_thread_flag(TIF_SINGLESTEP)) {
                /* Exit stop without single-step: report, then undo the clobber. */
                ptrace_report_syscall_exit(regs, 0);
                regs->regs[regno] = saved_reg;
        } else {
                /* Single-stepping: undo the clobber first, then report. */
                regs->regs[regno] = saved_reg;

                /*
                 * Signal a pseudo-step exception since we are stepping but
                 * tracer modifications to the registers may have rewound the
                 * state machine.
                 */
                ptrace_report_syscall_exit(regs, 1);
        }
}

/*
 * Syscall-entry hook. Runs, in order: the ptrace entry report (if
 * _TIF_SYSCALL_EMU or _TIF_SYSCALL_TRACE is set), secure_computing(), the
 * sys_enter tracepoint (if TIF_SYSCALL_TRACEPOINT is set) and
 * audit_syscall_entry().
 *
 * Returns NO_SYSCALL if _TIF_SYSCALL_EMU is set or secure_computing()
 * returns -1; otherwise returns regs->syscallno as it stands after the hooks
 * above have run.
 */
int syscall_trace_enter(struct pt_regs *regs)
{
        /*
         * Thread flags are sampled once here; TIF_SYSCALL_TRACEPOINT is
         * re-read with test_thread_flag() further down.
         */
        unsigned long flags = read_thread_flags();

        if (flags & (_TIF_SYSCALL_EMU | _TIF_SYSCALL_TRACE)) {
                report_syscall(regs, PTRACE_SYSCALL_ENTER);
                /* With syscall emulation, stop after the report. */
                if (flags & _TIF_SYSCALL_EMU)
                        return NO_SYSCALL;
        }

        /* Do the secure computing after ptrace; failures should be fast. */
        if (secure_computing() == -1)
                return NO_SYSCALL;

        if (test_thread_flag(TIF_SYSCALL_TRACEPOINT))
                trace_sys_enter(regs, regs->syscallno);

        audit_syscall_entry(regs->syscallno, regs->orig_x0, regs->regs[1],
                            regs->regs[2], regs->regs[3]);

        return regs->syscallno;
}

/*
 * Syscall-exit hook. Runs, in order: audit_syscall_exit(), the sys_exit
 * tracepoint (if _TIF_SYSCALL_TRACEPOINT is set), the ptrace exit report (if
 * _TIF_SYSCALL_TRACE or _TIF_SINGLESTEP is set) and rseq_syscall().
 *
 * All three flag tests use the single snapshot taken at the top.
 */
void syscall_trace_exit(struct pt_regs *regs)
{
        unsigned long flags = read_thread_flags();

        audit_syscall_exit(regs);

        if (flags & _TIF_SYSCALL_TRACEPOINT)
                trace_sys_exit(regs, syscall_get_return_value(current, regs));

        if (flags & (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP))
                report_syscall(regs, PTRACE_SYSCALL_EXIT);

        rseq_syscall(regs);
}

/*
 * SPSR_ELx bits which are always architecturally RES0 per ARM DDI 0487D.a.
 * We permit userspace to set SSBS (AArch64 bit 12, AArch32 bit 23) which is
 * not described in ARM DDI 0487D.a.
 * We treat PAN and UAO as RES0 bits, as they are meaningless at EL0, and may
 * be allocated an EL0 meaning in future.
 * Userspace cannot use these until they have an architectural meaning.
 * Note that this follows the SPSR_ELx format, not the AArch32 PSR format.
 * We also reserve IL for the kernel; SS is handled dynamically.
 *
 * Bits covered by each mask:
 *   SPSR_EL1_AARCH64_RES0_BITS: [63:32], [27:26], [23:22], [20:13], [5]
 *   SPSR_EL1_AARCH32_RES0_BITS: [63:32], [22], [20]
 *
 * valid_native_regs() and valid_compat_regs() clear these bits from pstate
 * before checking it.
 */
#define SPSR_EL1_AARCH64_RES0_BITS \
        (GENMASK_ULL(63, 32) | GENMASK_ULL(27, 26) | GENMASK_ULL(23, 22) | \
         GENMASK_ULL(20, 13) | GENMASK_ULL(5, 5))
#define SPSR_EL1_AARCH32_RES0_BITS \
        (GENMASK_ULL(63, 32) | GENMASK_ULL(22, 22) | GENMASK_ULL(20, 20))

/*
 * Sanitise the PSTATE of a compat (AArch32) register set in place.
 *
 * The RES0 bits are always cleared, and on systems without mixed-endian EL0
 * support the E bit is forced to match the kernel's endianness.
 *
 * Returns 1 if the resulting PSTATE is 32-bit user mode with A, I and F all
 * clear. Otherwise PSTATE is forced to a valid 32-bit EL0t value and 0 is
 * returned.
 */
static int valid_compat_regs(struct user_pt_regs *regs)
{
        regs->pstate &= ~SPSR_EL1_AARCH32_RES0_BITS;

        if (!system_supports_mixed_endian_el0()) {
                if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
                        regs->pstate |= PSR_AA32_E_BIT;
                else
                        regs->pstate &= ~PSR_AA32_E_BIT;
        }

        if (!user_mode(regs) || !(regs->pstate & PSR_MODE32_BIT) ||
            (regs->pstate & (PSR_AA32_A_BIT | PSR_AA32_I_BIT |
                             PSR_AA32_F_BIT)) != 0) {
                /*
                 * Force PSR to a valid 32-bit EL0t, preserving the same bits
                 * as arch/arm.
                 */
                regs->pstate &= PSR_AA32_N_BIT | PSR_AA32_Z_BIT |
                                PSR_AA32_C_BIT | PSR_AA32_V_BIT |
                                PSR_AA32_Q_BIT | PSR_AA32_IT_MASK |
                                PSR_AA32_GE_MASK | PSR_AA32_E_BIT |
                                PSR_AA32_T_BIT;
                regs->pstate |= PSR_MODE32_BIT;

                return 0;
        }

        return 1;
}

/*
 * Sanitise the PSTATE of a native (AArch64) register set in place.
 *
 * The RES0 bits are always cleared. Returns 1 if the resulting PSTATE is
 * 64-bit user mode with D, A, I and F all clear. Otherwise PSTATE is reduced
 * to its N, Z, C and V flags (a valid 64-bit EL0t value) and 0 is returned.
 */
static int valid_native_regs(struct user_pt_regs *regs)
{
        regs->pstate &= ~SPSR_EL1_AARCH64_RES0_BITS;

        if (!user_mode(regs) || (regs->pstate & PSR_MODE32_BIT) ||
            (regs->pstate & (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT |
                             PSR_F_BIT)) != 0) {
                /* Force PSR to a valid 64-bit EL0t */
                regs->pstate &= PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT;

                return 0;
        }

        return 1;
}

/*
 * Are the current registers suitable for user mode? (used to maintain
 * security in signal handlers)
 *
 * The single-step state of @regs is reset first; the PSTATE check is then
 * delegated to the compat or native validator depending on @task. Either
 * validator may rewrite regs->pstate.
 */
int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task)
{
        /* https://lore.kernel.org/lkml/20191118131525.GA4180@willie-the-truck */
        user_regs_reset_single_step(regs, task);

        if (!is_compat_thread(task_thread_info(task)))
                return valid_native_regs(regs);

        return valid_compat_regs(regs);
}



























  167 
  167 






















  166 














  165 












  156 

  157 










  157 


  157 

  157 






























  126 


  127 

  127 













  126 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
// SPDX-License-Identifier: GPL-2.0
/*
 * arch/arm64/kvm/fpsimd.c: Guest/host FPSIMD context coordination helpers
 *
 * Copyright 2018 Arm Limited
 * Author: Dave Martin <Dave.Martin@arm.com>
 */
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/kvm_host.h>
#include <asm/fpsimd.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/sysreg.h>

/*
 * Called on entry to KVM_RUN unless this vcpu previously ran at least
 * once and the most recent prior KVM_RUN for this vcpu was called from
 * the same task as current (highly likely).
 *
 * This is guaranteed to execute before kvm_arch_vcpu_load_fp(vcpu),
 * such that on entering hyp the relevant parts of current are already
 * mapped.
 *
 * Returns 0 when protected KVM is enabled (nothing is shared in that case),
 * otherwise the result of sharing current's fpsimd state with hyp.
 */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu)
{
        struct user_fpsimd_state *host_fp;

        /* pKVM has its own tracking of the host fpsimd state. */
        if (is_protected_kvm_enabled())
                return 0;

        /* Make sure the host task fpsimd state is visible to hyp: */
        host_fp = &current->thread.uw.fpsimd_state;

        return kvm_share_hyp(host_fp, host_fp + 1);
}

/*
 * Prepare vcpu for saving the host's FPSIMD state and loading the guest's.
 * The actual loading is done by the FPSIMD access trap taken to hyp.
 *
 * Here, we just set the correct metadata to indicate that the FPSIMD
 * state in the cpu regs (if any) belongs to current on the host.
 *
 * Does nothing beyond the current->mm check when the system has no FPSIMD
 * support.
 */
void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu)
{
        /* The calling task is required to have an mm. */
        BUG_ON(!current->mm);

        if (!system_supports_fpsimd())
                return;

        /*
         * Ensure that any host FPSIMD/SVE/SME state is saved and unbound such
         * that the host kernel is responsible for restoring this state upon
         * return to userspace, and the hyp code doesn't need to save anything.
         *
         * When the host may use SME, fpsimd_save_and_flush_cpu_state() ensures
         * that PSTATE.{SM,ZA} == {0,0}.
         */
        fpsimd_save_and_flush_cpu_state();
        /* After the flush, record the FP state as free in the host data. */
        *host_data_ptr(fp_owner) = FP_STATE_FREE;

        /* Sanity check: with SME supported, SVCR must read as zero here. */
        WARN_ON_ONCE(system_supports_sme() && read_sysreg_s(SYS_SVCR));
}

/*
 * Called just before entering the guest once we are no longer preemptible
 * and interrupts are disabled. If we have managed to run anything using
 * FP while we were preemptible (such as off the back of an interrupt),
 * then neither the host nor the guest own the FP hardware (and it was the
 * responsibility of the code that used FP to save the existing state).
 *
 * That situation is detected via TIF_FOREIGN_FPSTATE, in which case the FP
 * owner is reset to FP_STATE_FREE.
 */
void kvm_arch_vcpu_ctxflush_fp(struct kvm_vcpu *vcpu)
{
        if (!test_thread_flag(TIF_FOREIGN_FPSTATE))
                return;

        *host_data_ptr(fp_owner) = FP_STATE_FREE;
}

/*
 * Called just after exiting the guest. If the guest FPSIMD state
 * was loaded, update the host's context tracking data mark the CPU
 * FPSIMD regs as dirty and belonging to vcpu so that they will be
 * written back if the kernel clobbers them due to kernel-mode NEON
 * before re-entry into the guest.
 */
void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu)
{
        struct cpu_fp_state fp_state;

        WARN_ON_ONCE(!irqs_disabled());

        if (guest_owns_fp_regs()) {
                /*
                 * Currently we do not support SME guests so SVCR is
                 * always 0 and we just need a variable to point to.
                 */
                fp_state.st = &vcpu->arch.ctxt.fp_regs;
                fp_state.sve_state = vcpu->arch.sve_state;
                fp_state.sve_vl = vcpu->arch.sve_max_vl;
                fp_state.sme_state = NULL;
                fp_state.svcr = &__vcpu_sys_reg(vcpu, SVCR);
                fp_state.fpmr = &__vcpu_sys_reg(vcpu, FPMR);
                fp_state.fp_type = &vcpu->arch.fp_type;

                if (vcpu_has_sve(vcpu))
                        fp_state.to_save = FP_STATE_SVE;
                else
                        fp_state.to_save = FP_STATE_FPSIMD;

                fpsimd_bind_state_to_cpu(&fp_state);

                clear_thread_flag(TIF_FOREIGN_FPSTATE);
        }
}

/*
 * Write back the vcpu FPSIMD regs if they are dirty, and invalidate the
 * cpu FPSIMD regs so that they can't be spuriously reused if this vcpu
 * disappears and another task or vcpu appears that recycles the same
 * struct fpsimd_state.
 *
 * The ownership check and the flush run with local interrupts disabled;
 * the previous interrupt state is restored before returning.
 */
void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu)
{
        unsigned long flags;

        local_irq_save(flags);

        if (guest_owns_fp_regs()) {
                /*
                 * Flush (save and invalidate) the fpsimd/sve state so that if
                 * the host tries to use fpsimd/sve, it's not using stale data
                 * from the guest.
                 *
                 * Flushing the state sets the TIF_FOREIGN_FPSTATE bit for the
                 * context unconditionally, in both nVHE and VHE. This allows
                 * the kernel to restore the fpsimd/sve state, including ZCR_EL1
                 * when needed.
                 */
                fpsimd_save_and_flush_cpu_state();
        }

        local_irq_restore(flags);
}


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 


































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 */

/* Prefix applied to the format string of this file's pr_*() messages:
 * the module name followed by ": ".
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/gro.h>
#include <net/ipv6_stubs.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netdev_lock.h>
#include <net/tun_proto.h>
#include <net/vxlan.h>
#include <net/nexthop.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
#include <net/ip6_checksum.h>
#endif

#include "vxlan_private.h"

/* Driver version string. */
#define VXLAN_VERSION        "0.1"

/* Forwarding-database ageing parameters. */
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)        /* rescan interval */

/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 *
 * Exposed as the read-only (0444) module parameter "udp_port".
 */
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

/* Writable (0644) module parameter; enabled by default. */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* NOTE(review): presumably the per-network-namespace generic id of this
 * driver (see <net/netns/generic.h>); it is assigned outside this excerpt.
 */
unsigned int vxlan_net_id;

/* Zero-filled address, two bytes longer than a MAC. The padding presumably
 * exists so that 8-byte loads such as the one in eth_hash() stay inside the
 * object - confirm against the users of this symbol.
 */
const u8 all_zeros_mac[ETH_ALEN + 2];
/* Declared here so that code earlier in the file can refer to the link ops. */
static struct rtnl_link_ops vxlan_link_ops;

/* Forward declarations of helpers defined later in the file. */
static int vxlan_sock_add(struct vxlan_dev *vxlan);

static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

/* salt for hash table (mixed into the jhash in eth_vni_hash()) */
static u32 vxlan_salt __read_mostly;

static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
        return vs->flags & VXLAN_F_COLLECT_METADATA ||
               ip_tunnel_collect_metadata();
}

/* Locate the VXLAN socket in @net that matches all of: address @family,
 * local UDP @port, the receive-relevant subset of @flags
 * (VXLAN_F_RCV_FLAGS) and the bound device @ifindex (see l3mdev with
 * non-default VRF).
 *
 * Walks an RCU-protected hash chain; returns NULL when nothing matches.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
                                          __be16 port, u32 flags, int ifindex)
{
        const u32 rcv_flags = flags & VXLAN_F_RCV_FLAGS;
        struct vxlan_sock *vs;

        hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
                struct sock *sk = vs->sock->sk;

                if (inet_sk(sk)->inet_sport != port)
                        continue;
                if (vxlan_get_sk_family(vs) != family)
                        continue;
                if (vs->flags != rcv_flags)
                        continue;
                if (sk->sk_bound_dev_if != ifindex)
                        continue;

                return vs;
        }

        return NULL;
}

/* Find the VXLAN device hanging off socket @vs that matches @vni (and, for
 * IPv6 link-local configurations, the receiving interface @ifindex).
 *
 * If @vninode is non-NULL it is written on success only: the VNI-filter node
 * for devices using VXLAN_F_VNIFILTER, NULL for every other device.
 * Returns NULL when no device on the chain matches.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs,
                                           int ifindex, __be32 vni,
                                           struct vxlan_vni_node **vninode)
{
        const bool flow_based = vs->flags & VXLAN_F_COLLECT_METADATA;
        const bool vni_filter = vs->flags & VXLAN_F_VNIFILTER;
        struct vxlan_dev_node *node;

        /* For flow based devices, map all packets to VNI 0 */
        if (flow_based && !vni_filter)
                vni = 0;

        hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
                struct vxlan_dev *vxlan = node->vxlan;
                struct vxlan_vni_node *match = NULL;

                if (!vxlan)
                        continue;

                if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
                        /* Filtering devices accept only VNIs they list. */
                        match = vxlan_vnifilter_lookup(vxlan, vni);
                        if (!match)
                                continue;
                } else if (vxlan->default_dst.remote_vni != vni) {
                        continue;
                }

                /* Link-local IPv6 setups are tied to one interface. */
                if (IS_ENABLED(CONFIG_IPV6) &&
                    (vxlan->cfg.flags & VXLAN_F_IPV6_LINKLOCAL) &&
                    vxlan->cfg.remote_ifindex != ifindex)
                        continue;

                if (vninode)
                        *vninode = match;
                return vxlan;
        }

        return NULL;
}

/* Look up a VXLAN device by VNI in a network namespace: first find the
 * socket matching @family/@port/@flags/@ifindex, then search that socket's
 * VNI table. Returns NULL if either step finds nothing.
 */
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
                                        __be32 vni, sa_family_t family,
                                        __be16 port, u32 flags)
{
        struct vxlan_sock *vs = vxlan_find_sock(net, family, port, flags,
                                                ifindex);

        return vs ? vxlan_vs_find_vni(vs, ifindex, vni, NULL) : NULL;
}

/* Fill in neighbour message in skbuff.
 *
 * Appends one netlink message of @type to @skb: an ndmsg header followed by
 * the NDA_* attributes describing @fdb and, when given, the remote
 * destination @rdst. @portid, @seq and @flags are passed straight to
 * nlmsg_put().
 *
 * Returns 0 on success. Returns -EMSGSIZE when the header or any attribute
 * does not fit; in the attribute case the partially built message is
 * cancelled first.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
                          const struct vxlan_fdb *fdb,
                          u32 portid, u32 seq, int type, unsigned int flags,
                          const struct vxlan_rdst *rdst)
{
        unsigned long now = jiffies;
        struct nda_cacheinfo ci;
        bool send_ip, send_eth;
        struct nlmsghdr *nlh;
        struct nexthop *nh;
        struct ndmsg *ndm;
        int nh_family;
        u32 nh_id;

        nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
        if (nlh == NULL)
                return -EMSGSIZE;

        ndm = nlmsg_data(nlh);
        memset(ndm, 0, sizeof(*ndm));

        send_eth = send_ip = true;

        /* Snapshot the nexthop's family and id inside the RCU read section.
         * After the unlock, nh is only tested for NULL, never dereferenced,
         * and nh_family/nh_id are only read when nh was non-NULL.
         */
        rcu_read_lock();
        nh = rcu_dereference(fdb->nh);
        if (nh) {
                nh_family = nexthop_get_family(nh);
                nh_id = nh->id;
        }
        rcu_read_unlock();

        /* RTM_GETNEIGH messages take their family from the remote address
         * (AF_INET when that address is "any", in which case NDA_DST is
         * omitted) or from the nexthop, and skip NDA_LLADDR for an all-zero
         * MAC. Every other message type is reported as AF_BRIDGE.
         */
        if (type == RTM_GETNEIGH) {
                if (rdst) {
                        send_ip = !vxlan_addr_any(&rdst->remote_ip);
                        ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
                } else if (nh) {
                        ndm->ndm_family = nh_family;
                }
                send_eth = !is_zero_ether_addr(fdb->eth_addr);
        } else
                ndm->ndm_family        = AF_BRIDGE;
        ndm->ndm_state = fdb->state;
        ndm->ndm_ifindex = vxlan->dev->ifindex;
        ndm->ndm_flags = fdb->flags;
        if (rdst && rdst->offloaded)
                ndm->ndm_flags |= NTF_OFFLOADED;
        ndm->ndm_type = RTN_UNICAST;

        /* Only emitted when the device's netns differs from vxlan->net. */
        if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
            nla_put_s32(skb, NDA_LINK_NETNSID,
                        peernet2id(dev_net(vxlan->dev), vxlan->net)))
                goto nla_put_failure;

        if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
                goto nla_put_failure;
        /* A nexthop-backed entry reports NDA_NH_ID instead of the
         * per-remote attributes below.
         */
        if (nh) {
                if (nla_put_u32(skb, NDA_NH_ID, nh_id))
                        goto nla_put_failure;
        } else if (rdst) {
                if (send_ip && vxlan_nla_put_addr(skb, NDA_DST,
                                                  &rdst->remote_ip))
                        goto nla_put_failure;

                /* Port, VNI and ifindex are reported only when they are set
                 * and (for port and VNI) differ from the device defaults.
                 */
                if (rdst->remote_port &&
                    rdst->remote_port != vxlan->cfg.dst_port &&
                    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
                        goto nla_put_failure;
                if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
                    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
                        goto nla_put_failure;
                if (rdst->remote_ifindex &&
                    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
                        goto nla_put_failure;
        }

        /* Source VNI is reported only on metadata-collecting devices and
         * only when the entry has a non-zero VNI.
         */
        if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
            nla_put_u32(skb, NDA_SRC_VNI,
                        be32_to_cpu(fdb->vni)))
                goto nla_put_failure;

        /* used/updated are reported as ages relative to "now", converted
         * with jiffies_to_clock_t().
         */
        ci.ndm_used         = jiffies_to_clock_t(now - READ_ONCE(fdb->used));
        ci.ndm_confirmed = 0;
        ci.ndm_updated         = jiffies_to_clock_t(now - READ_ONCE(fdb->updated));
        ci.ndm_refcnt         = 0;

        if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
                goto nla_put_failure;

        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Upper bound on the size of one message built by vxlan_fdb_info().
 *
 * __vxlan_fdb_notify() sizes its skb from this value and treats -EMSGSIZE as
 * a bug here, so every attribute vxlan_fdb_info() can emit for a single
 * entry must be counted. That includes NDA_SRC_VNI, which is added for
 * VXLAN_F_COLLECT_METADATA devices on top of all the other attributes.
 *
 * NDA_NH_ID needs no term of its own: it is emitted instead of
 * NDA_DST/NDA_PORT/NDA_VNI/NDA_IFINDEX, which together reserve more room
 * than one u32 attribute.
 */
static inline size_t vxlan_nlmsg_size(void)
{
        return NLMSG_ALIGN(sizeof(struct ndmsg))
                + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
                + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
                + nla_total_size(sizeof(__be16)) /* NDA_PORT */
                + nla_total_size(sizeof(__be32)) /* NDA_VNI */
                + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
                + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
                + nla_total_size(sizeof(__u32)) /* NDA_SRC_VNI */
                + nla_total_size(sizeof(struct nda_cacheinfo)); /* NDA_CACHEINFO */
}

/* Build a neighbour message of @type for @fdb/@rd and multicast it to
 * RTNLGRP_NEIGH listeners in the device's network namespace.
 *
 * On allocation failure (-ENOBUFS) or a fill error, the error is recorded on
 * the group with rtnl_set_sk_err() instead of being returned.
 */
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
                               struct vxlan_rdst *rd, int type)
{
        struct net *net = dev_net(vxlan->dev);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
        if (!skb) {
                rtnl_set_sk_err(net, RTNLGRP_NEIGH, -ENOBUFS);
                return;
        }

        err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
                return;
        }

        /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
        rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

/* Populate @fdb_info for a switchdev VXLAN FDB notification from the FDB
 * entry @fdb and its remote destination @rd. @rd is dereferenced without a
 * check, so it must be valid. Only the fields assigned below are written.
 */
static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
                            const struct vxlan_fdb *fdb,
                            const struct vxlan_rdst *rd,
                            struct netlink_ext_ack *extack,
                            struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
        /* Generic switchdev notifier header. */
        fdb_info->info.dev = vxlan->dev;
        fdb_info->info.extack = extack;

        /* What the entry matches on. */
        memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
        fdb_info->vni = fdb->vni;
        fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;

        /* Where matching traffic is sent. */
        fdb_info->remote_ip = rd->remote_ip;
        fdb_info->remote_port = rd->remote_port;
        fdb_info->remote_vni = rd->remote_vni;
        fdb_info->remote_ifindex = rd->remote_ifindex;
        fdb_info->offloaded = rd->offloaded;
}

/* Raise the switchdev VXLAN FDB event for @fdb/@rd on the VXLAN device:
 * SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE when @adding, otherwise
 * SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE.
 *
 * Warns and returns 0 without notifying when @rd is NULL; otherwise returns
 * the notifier chain result converted with notifier_to_errno().
 */
static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
                                              struct vxlan_fdb *fdb,
                                              struct vxlan_rdst *rd,
                                              bool adding,
                                              struct netlink_ext_ack *extack)
{
        struct switchdev_notifier_vxlan_fdb_info info;
        enum switchdev_notifier_type event;
        int chain_ret;

        if (WARN_ON(!rd))
                return 0;

        if (adding)
                event = SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE;
        else
                event = SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;

        vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
        chain_ret = call_switchdev_notifiers(event, vxlan->dev,
                                             &info.info, extack);

        return notifier_to_errno(chain_ret);
}

/* Announce an FDB change of @type.
 *
 * When @swdev_notify is set and @rd is given, switchdev is told first for
 * RTM_NEWNEIGH and RTM_DELNEIGH. A switchdev failure on RTM_NEWNEIGH aborts
 * with that error before any netlink message is sent; the result of the
 * RTM_DELNEIGH notification is ignored. Other types skip switchdev.
 *
 * Returns 0 once the netlink notification has been attempted.
 */
static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
                            struct vxlan_rdst *rd, int type, bool swdev_notify,
                            struct netlink_ext_ack *extack)
{
        if (swdev_notify && rd) {
                if (type == RTM_NEWNEIGH) {
                        int err;

                        err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
                                                                 true, extack);
                        if (err)
                                return err;
                } else if (type == RTM_DELNEIGH) {
                        vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
                                                           false, extack);
                }
        }

        __vxlan_fdb_notify(vxlan, fdb, rd, type);
        return 0;
}

static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb f = {
                .state = NUD_STALE,
        };
        struct vxlan_rdst remote = {
                .remote_ip = *ipa, /* goes to NDA_DST */
                .remote_vni = cpu_to_be32(VXLAN_N_VID),
        };

        vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
        struct vxlan_fdb f = {
                .state = NUD_STALE,
        };
        struct vxlan_rdst remote = { };

        memcpy(f.eth_addr, eth_addr, ETH_ALEN);

        vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
}

/* Hash an Ethernet address into FDB_HASH_BITS bits.
 *
 * Loads 8 bytes starting at @addr, so the buffer must be readable for two
 * bytes beyond the 6-byte MAC; those two bytes are shifted out before
 * hashing.
 */
static u32 eth_hash(const unsigned char *addr)
{
        u64 mac48 = get_unaligned((u64 *)addr);

        /* only want 6 bytes */
#ifdef __BIG_ENDIAN
        mac48 >>= 16;
#else
        mac48 <<= 16;
#endif
        return hash_64(mac48, FDB_HASH_BITS);
}

/* Hash the pair (MAC, VNI) into an FDB bucket index below FDB_HASH_SIZE.
 * Bytes 2..5 of @addr and @vni are mixed with the module-wide salt.
 */
u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
        /* use 1 byte of OUI and 3 bytes of NIC */
        const u32 mac_tail = get_unaligned((u32 *)(addr + 2));
        const u32 mixed = jhash_2words(mac_tail, vni, vxlan_salt);

        return mixed & (FDB_HASH_SIZE - 1);
}

/* Pick the FDB bucket index for (@mac, @vni): devices with
 * VXLAN_F_COLLECT_METADATA hash MAC and VNI together, all others hash the
 * MAC alone.
 */
u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
        const bool per_vni = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;

        return per_vni ? eth_vni_hash(mac, vni) : eth_hash(mac);
}

/* Return the hash chain of @vxlan's FDB that (@mac, @vni) belongs to. */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
                                                const u8 *mac, __be32 vni)
{
        const u32 bucket = fdb_head_index(vxlan, mac, vni);

        return vxlan->fdb_head + bucket;
}

/* Look up an Ethernet address in the forwarding table.
 *
 * Walks the RCU-protected chain for (@mac, @vni). The MAC must always match;
 * the VNI is compared too only on VXLAN_F_COLLECT_METADATA devices.
 * Returns the first matching entry, or NULL.
 */
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
                                          const u8 *mac, __be32 vni)
{
        struct hlist_head *chain = vxlan_fdb_head(vxlan, mac, vni);
        struct vxlan_fdb *entry;

        hlist_for_each_entry_rcu(entry, chain, hlist) {
                if (!ether_addr_equal(mac, entry->eth_addr))
                        continue;

                if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) ||
                    vni == entry->vni)
                        return entry;
        }

        return NULL;
}

/* Like __vxlan_find_mac(), but also refreshes the entry's "used" timestamp.
 * The store is skipped when the timestamp already equals the current jiffies
 * value.
 */
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
                                        const u8 *mac, __be32 vni)
{
        struct vxlan_fdb *entry = __vxlan_find_mac(vxlan, mac, vni);
        unsigned long now;

        if (!entry)
                return NULL;

        now = jiffies;
        if (READ_ONCE(entry->used) != now)
                WRITE_ONCE(entry->used, now);

        return entry;
}

/* Find the remote of @f whose address, port, VNI and ifindex all match.
 * Returns NULL if there is none.
 *
 * caller should hold vxlan->hash_lock
 */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
                                              union vxlan_addr *ip, __be16 port,
                                              __be32 vni, __u32 ifindex)
{
        struct vxlan_rdst *remote;

        list_for_each_entry(remote, &f->remotes, list) {
                if (!vxlan_addr_equal(&remote->remote_ip, ip))
                        continue;
                if (remote->remote_port != port)
                        continue;
                if (remote->remote_vni != vni)
                        continue;
                if (remote->remote_ifindex != ifindex)
                        continue;

                return remote;
        }

        return NULL;
}

/* Look up the unicast FDB entry for (@mac, @vni) on @dev and describe it,
 * together with its first remote destination, in @fdb_info.
 *
 * @dev is used as a VXLAN device without being checked, so the caller must
 * pass one.
 *
 * Returns 0 on success, -EINVAL when @mac is a multicast or all-zero
 * address, and -ENOENT when no entry matches or the matching entry has no
 * remote destination to report. @fdb_info is written only on success.
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
                      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        /* Zero-padded copy: the FDB hash may load 8 bytes from the address,
         * which must not run past the caller's 6-byte buffer.
         */
        u8 eth_addr[ETH_ALEN + 2] = { 0 };
        struct vxlan_rdst *rdst;
        struct vxlan_fdb *f;
        int rc = 0;

        if (is_multicast_ether_addr(mac) ||
            is_zero_ether_addr(mac))
                return -EINVAL;

        ether_addr_copy(eth_addr, mac);

        rcu_read_lock();

        f = __vxlan_find_mac(vxlan, eth_addr, vni);
        /* An entry that points at a nexthop object has an empty remotes
         * list, so there is no rdst to describe. Reject it here rather
         * than hand an invalid remote to the notifier-info helper.
         */
        if (!f || rcu_access_pointer(f->nh)) {
                rc = -ENOENT;
                goto out;
        }

        rdst = first_remote_rcu(f);
        if (!rdst) {
                rc = -ENOENT;
                goto out;
        }

        vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
        rcu_read_unlock();
        return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

static int vxlan_fdb_notify_one(struct notifier_block *nb,
                                const struct vxlan_dev *vxlan,
                                const struct vxlan_fdb *f,
                                const struct vxlan_rdst *rdst,
                                struct netlink_ext_ack *extack)
{
        struct switchdev_notifier_vxlan_fdb_info fdb_info;
        int rc;

        vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
        rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
                               &fdb_info);
        return notifier_to_errno(rc);
}

/* Replay every remote of every FDB entry with VNI @vni to the notifier @nb
 * as an "add to device" event.
 *
 * Each bucket is walked with its hash lock held (BH disabled). The walk
 * stops at the first notifier error, which is returned after the lock is
 * dropped. Returns -EINVAL if @dev is not a VXLAN device, 0 when all
 * entries were replayed.
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
                     struct notifier_block *nb,
                     struct netlink_ext_ack *extack)
{
        struct vxlan_dev *vxlan;
        unsigned int bucket;

        if (!netif_is_vxlan(dev))
                return -EINVAL;
        vxlan = netdev_priv(dev);

        for (bucket = 0; bucket < FDB_HASH_SIZE; ++bucket) {
                struct vxlan_rdst *remote;
                struct vxlan_fdb *entry;
                int err = 0;

                spin_lock_bh(&vxlan->hash_lock[bucket]);
                hlist_for_each_entry(entry, &vxlan->fdb_head[bucket], hlist) {
                        if (entry->vni != vni)
                                continue;

                        list_for_each_entry(remote, &entry->remotes, list) {
                                err = vxlan_fdb_notify_one(nb, vxlan, entry,
                                                           remote, extack);
                                if (err)
                                        break;
                        }
                        if (err)
                                break;
                }
                spin_unlock_bh(&vxlan->hash_lock[bucket]);

                if (err)
                        return err;
        }

        return 0;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

/* Clear the 'offloaded' mark on every remote of every FDB entry whose
 * VNI equals @vni. Does nothing when @dev is not a VXLAN device.
 * Each hash bucket is processed with its hash_lock held (BH disabled).
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
        struct vxlan_dev *vxlan;
        unsigned int bucket;

        if (!netif_is_vxlan(dev))
                return;
        vxlan = netdev_priv(dev);

        for (bucket = 0; bucket < FDB_HASH_SIZE; ++bucket) {
                struct vxlan_fdb *fdb;

                spin_lock_bh(&vxlan->hash_lock[bucket]);
                hlist_for_each_entry(fdb, &vxlan->fdb_head[bucket], hlist) {
                        struct vxlan_rdst *remote;

                        if (fdb->vni != vni)
                                continue;

                        list_for_each_entry(remote, &fdb->remotes, list) {
                                remote->offloaded = false;
                        }
                }
                spin_unlock_bh(&vxlan->hash_lock[bucket]);
        }
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

/* Replace destination of unicast mac.
 *
 * If the requested destination is already attached to @f, or @f has no
 * remotes at all, nothing is changed and 0 is returned. Otherwise the
 * first remote is overwritten in place, its previous contents are copied
 * to @oldrd, and 1 is returned.
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
                             union vxlan_addr *ip, __be16 port, __be32 vni,
                             __u32 ifindex, struct vxlan_rdst *oldrd)
{
        struct vxlan_rdst *first;

        if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex) != NULL)
                return 0;

        first = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
        if (first == NULL)
                return 0;

        /* Snapshot the remote before it is rewritten. */
        *oldrd = *first;

        dst_cache_reset(&first->dst_cache);
        first->remote_ip = *ip;
        first->remote_port = port;
        first->remote_vni = vni;
        first->remote_ifindex = ifindex;
        first->offloaded = false;

        return 1;
}

/* Add/update destinations for multicast.
 *
 * Returns 0 when an identical remote is already attached to @f, 1 when a
 * new remote was allocated, linked at the tail of f->remotes and stored in
 * *@rdp, or -ENOMEM when either allocation fails. *@rdp is only written
 * on the "1" path.
 */
static int vxlan_fdb_append(struct vxlan_fdb *f,
                            union vxlan_addr *ip, __be16 port, __be32 vni,
                            __u32 ifindex, struct vxlan_rdst **rdp)
{
        struct vxlan_rdst *remote;

        if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex))
                return 0;

        remote = kmalloc(sizeof(*remote), GFP_ATOMIC);
        if (!remote)
                return -ENOMEM;

        if (dst_cache_init(&remote->dst_cache, GFP_ATOMIC) != 0) {
                kfree(remote);
                return -ENOMEM;
        }

        /* Fill in every field before the remote is published on the list. */
        remote->remote_ip = *ip;
        remote->remote_port = port;
        remote->offloaded = false;
        remote->remote_vni = vni;
        remote->remote_ifindex = ifindex;

        list_add_tail_rcu(&remote->list, &f->remotes);
        *rdp = remote;

        return 1;
}

/* Validate a VXLAN-GPE header and translate its Next Protocol field into
 * an Ethernet protocol stored in *@protocol.
 *
 * Returns false (packet must be dropped) when:
 *  - Next Protocol is not flagged as applied; it is mandatory for
 *    interfaces in GPE mode;
 *  - the version is not 0 ("The initial version is 0. If a receiver does
 *    not support the version indicated it MUST drop the packet.");
 *  - the O bit is set ("When the O bit is set to 1, the packet is an OAM
 *    packet and OAM processing MUST occur." OAM processing is not
 *    implemented here, so such packets are dropped);
 *  - tun_p_to_eth_p() yields 0 for the Next Protocol value.
 */
static bool vxlan_parse_gpe_proto(const struct vxlanhdr *hdr, __be16 *protocol)
{
        const struct vxlanhdr_gpe *gpe = (const struct vxlanhdr_gpe *)hdr;
        __be16 proto;

        if (!gpe->np_applied || gpe->version != 0 || gpe->oam_flag)
                return false;

        proto = tun_p_to_eth_p(gpe->next_protocol);
        *protocol = proto;

        return !!proto;
}

/* GRO-side remote checksum offload handling.
 *
 * Returns @vh untouched when the skb is already marked remcsum_offload,
 * NULL when the GRO checksum is not valid, otherwise the header pointer
 * returned by skb_gro_remcsum_process(), after marking the skb as
 * remcsum_offload.
 */
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
                                          unsigned int off,
                                          struct vxlanhdr *vh, size_t hdrlen,
                                          __be32 vni_field,
                                          struct gro_remcsum *grc,
                                          bool nopartial)
{
        size_t csum_start;
        size_t csum_offset;

        if (skb->remcsum_offload)
                return vh;
        if (!NAPI_GRO_CB(skb)->csum_valid)
                return NULL;

        /* Both values are encoded in the VNI field of the header. */
        csum_start = vxlan_rco_start(vni_field);
        csum_offset = csum_start + vxlan_rco_offset(vni_field);

        vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
                                     csum_start, csum_offset, grc,
                                     nopartial);
        skb->remcsum_offload = 1;

        return vh;
}

static struct vxlanhdr *vxlan_gro_prepare_receive(struct sock *sk,
                                                  struct list_head *head,
                                                  struct sk_buff *skb,
                                                  struct gro_remcsum *grc)
{
        struct sk_buff *p;
        struct vxlanhdr *vh, *vh2;
        unsigned int hlen, off_vx;
        struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
        __be32 flags;

        skb_gro_remcsum_init(grc);

        off_vx = skb_gro_offset(skb);
        hlen = off_vx + sizeof(*vh);
        vh = skb_gro_header(skb, hlen, off_vx);
        if (unlikely(!vh))
                return NULL;

        skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

        flags = vh->vx_flags;

        if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
                vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
                                       vh->vx_vni, grc,
                                       !!(vs->flags &
                                          VXLAN_F_REMCSUM_NOPARTIAL));

                if (!vh)
                        return NULL;
        }

        skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

        list_for_each_entry(p, head, list) {
                if (!NAPI_GRO_CB(p)->same_flow)
                        continue;

                vh2 = (struct vxlanhdr *)(p->data + off_vx);
                if (vh->vx_flags != vh2->vx_flags ||
                    vh->vx_vni != vh2->vx_vni) {
                        NAPI_GRO_CB(p)->same_flow = 0;
                        continue;
                }
        }

        return vh;
}

static struct sk_buff *vxlan_gro_receive(struct sock *sk,
                                         struct list_head *head,
                                         struct sk_buff *skb)
{
        struct sk_buff *pp = NULL;
        struct gro_remcsum grc;
        int flush = 1;

        if (vxlan_gro_prepare_receive(sk, head, skb, &grc)) {
                pp = call_gro_receive(eth_gro_receive, head, skb);
                flush = 0;
        }
        skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
        return pp;
}

/* GRO receive callback for VXLAN-GPE: the inner protocol is taken from
 * the GPE header and the skb is passed to that protocol's gro_receive
 * handler. The skb is flagged for flush when the header cannot be
 * prepared, the GPE header is rejected, or no offload handler is
 * registered for the inner protocol.
 */
static struct sk_buff *vxlan_gpe_gro_receive(struct sock *sk,
                                             struct list_head *head,
                                             struct sk_buff *skb)
{
        const struct packet_offload *ptype;
        struct sk_buff *pp = NULL;
        struct gro_remcsum grc;
        struct vxlanhdr *vh;
        __be16 protocol;
        int flush = 1;

        vh = vxlan_gro_prepare_receive(sk, head, skb, &grc);
        if (vh && vxlan_parse_gpe_proto(vh, &protocol)) {
                ptype = gro_find_receive_by_type(protocol);
                if (ptype) {
                        pp = call_gro_receive(ptype->callbacks.gro_receive,
                                              head, skb);
                        flush = 0;
                }
        }

        skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
        return pp;
}

/* GRO complete callback for plain VXLAN: finish the inner Ethernet frame
 * that starts right after the VXLAN header.
 */
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
        int inner_off = nhoff + sizeof(struct vxlanhdr);

        /* Sets 'skb->inner_mac_header' since we are always called with
         * 'skb->encapsulation' set.
         */
        return eth_gro_complete(skb, inner_off);
}

/* GRO complete callback for VXLAN-GPE: dispatch to the gro_complete
 * handler of the inner protocol named by the GPE header.
 *
 * Returns -ENOSYS when the GPE header is rejected or no handler exists
 * for the inner protocol, otherwise the handler's return value.
 */
static int vxlan_gpe_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
        struct vxlanhdr *vh = (struct vxlanhdr *)(skb->data + nhoff);
        const struct packet_offload *ptype;
        __be16 protocol;

        if (!vxlan_parse_gpe_proto(vh, &protocol))
                return -ENOSYS;

        ptype = gro_find_complete_by_type(protocol);
        if (!ptype)
                return -ENOSYS;

        return ptype->callbacks.gro_complete(skb,
                                             nhoff + sizeof(struct vxlanhdr));
}

/* Allocate (GFP_ATOMIC) and initialise a bare FDB entry for @mac.
 *
 * The entry starts with empty remote and nexthop lists and no nexthop;
 * it is not linked into the device hash table here. Returns NULL when
 * the allocation fails.
 */
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
                                         __u16 state, __be32 src_vni,
                                         __u16 ndm_flags)
{
        unsigned long now;
        struct vxlan_fdb *fdb;

        fdb = kmalloc(sizeof(*fdb), GFP_ATOMIC);
        if (fdb == NULL)
                return NULL;

        now = jiffies;

        fdb->state = state;
        fdb->flags = ndm_flags;
        fdb->used = now;
        fdb->updated = now;
        fdb->vni = src_vni;
        fdb->nh = NULL;
        RCU_INIT_POINTER(fdb->vdev, vxlan);
        INIT_LIST_HEAD(&fdb->nh_list);
        INIT_LIST_HEAD(&fdb->remotes);
        memcpy(fdb->eth_addr, mac, ETH_ALEN);

        return fdb;
}

/* Account for @f in the device's address count and publish it at the
 * head of the hash chain selected by (@mac, @src_vni).
 */
static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
                             __be32 src_vni, struct vxlan_fdb *f)
{
        struct hlist_head *chain;

        vxlan->addrcnt++;

        chain = vxlan_fdb_head(vxlan, mac, src_vni);
        hlist_add_head_rcu(&f->hlist, chain);
}

/* Point @fdb at the nexthop object identified by @nhid.
 *
 * Returns 0 when @fdb already references a nexthop with this id, 1 when
 * the nexthop was (re)assigned, or a negative errno with an extack
 * message when the nexthop is missing, deleted, not an fdb nexthop, not a
 * multipath group (-EINVAL), or of the wrong address family
 * (-EAFNOSUPPORT).
 *
 * fdb->nh is read with rtnl_dereference(), so the caller is expected to
 * hold RTNL.
 */
static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
                               u32 nhid, struct netlink_ext_ack *extack)
{
        struct nexthop *old_nh = rtnl_dereference(fdb->nh);
        struct nexthop *nh;
        int err = -EINVAL;

        /* Nothing to do if the entry already uses this nexthop. */
        if (old_nh && old_nh->id == nhid)
                return 0;

        nh = nexthop_find_by_id(vxlan->net, nhid);
        if (!nh) {
                NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
                goto err_inval;
        }

        if (!nexthop_get(nh)) {
                NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
                /* No reference was taken, so err_inval must not put it. */
                nh = NULL;
                goto err_inval;
        }
        /* From here on a reference is held; err_inval drops it. */
        if (!nexthop_is_fdb(nh)) {
                NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
                goto err_inval;
        }

        if (!nexthop_is_multipath(nh)) {
                NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
                goto err_inval;
        }

        /* check nexthop group family */
        switch (vxlan->default_dst.remote_ip.sa.sa_family) {
        case AF_INET:
                if (!nexthop_has_v4(nh)) {
                        err = -EAFNOSUPPORT;
                        NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
                        goto err_inval;
                }
                break;
        case AF_INET6:
                if (nexthop_has_v4(nh)) {
                        err = -EAFNOSUPPORT;
                        NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
                        goto err_inval;
                }
        }

        /* Detach from the previous nexthop (if any) and drop its reference
         * before publishing the new one.
         */
        if (old_nh) {
                list_del_rcu(&fdb->nh_list);
                nexthop_put(old_nh);
        }
        rcu_assign_pointer(fdb->nh, nh);
        list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
        return 1;

err_inval:
        if (nh)
                nexthop_put(nh);
        return err;
}

/* Allocate a new FDB entry and attach either a nexthop (@nhid != 0) or a
 * single remote destination to it. The entry is NOT inserted into the
 * hash table; on success it is handed back through *@fdb.
 *
 * Returns 0 on success, -ENOSPC when the configured address limit has
 * been reached, -ENOMEM when the entry cannot be allocated, or the
 * negative error from attaching the nexthop or remote.
 */
int vxlan_fdb_create(struct vxlan_dev *vxlan,
                     const u8 *mac, union vxlan_addr *ip,
                     __u16 state, __be16 port, __be32 src_vni,
                     __be32 vni, __u32 ifindex, __u16 ndm_flags,
                     u32 nhid, struct vxlan_fdb **fdb,
                     struct netlink_ext_ack *extack)
{
        struct vxlan_rdst *remote = NULL;
        struct vxlan_fdb *entry;
        int rc;

        /* An addrmax of 0 means no limit is enforced. */
        if (vxlan->cfg.addrmax && vxlan->addrcnt >= vxlan->cfg.addrmax)
                return -ENOSPC;

        netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);

        entry = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
        if (entry == NULL)
                return -ENOMEM;

        rc = nhid ? vxlan_fdb_nh_update(vxlan, entry, nhid, extack)
                  : vxlan_fdb_append(entry, ip, port, vni, ifindex, &remote);
        if (rc < 0) {
                kfree(entry);
                return rc;
        }

        *fdb = entry;
        return 0;
}

/* Release everything owned by @f: drop the nexthop reference (clearing
 * the nh and vdev pointers first), destroy and free every remote, then
 * free the entry itself.
 */
static void __vxlan_fdb_free(struct vxlan_fdb *f)
{
        struct nexthop *nh = rcu_dereference_raw(f->nh);
        struct vxlan_rdst *remote, *next;

        if (nh != NULL) {
                rcu_assign_pointer(f->nh, NULL);
                rcu_assign_pointer(f->vdev, NULL);
                nexthop_put(nh);
        }

        list_for_each_entry_safe(remote, next, &f->remotes, list) {
                dst_cache_destroy(&remote->dst_cache);
                kfree(remote);
        }

        kfree(f);
}

/* RCU callback: recover the FDB entry embedding @head and free it. */
static void vxlan_fdb_free(struct rcu_head *head)
{
        __vxlan_fdb_free(container_of(head, struct vxlan_fdb, rcu));
}

/* Remove @f from the device: decrement the address count, optionally
 * emit RTM_DELNEIGH notifications (one for a nexthop entry, otherwise one
 * per remote), unlink the entry from the hash chain and from its nexthop
 * list, and free it after an RCU grace period.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
                              bool do_notify, bool swdev_notify)
{
        netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

        vxlan->addrcnt--;

        if (do_notify && rcu_access_pointer(f->nh)) {
                vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
                                 swdev_notify, NULL);
        } else if (do_notify) {
                struct vxlan_rdst *remote;

                list_for_each_entry(remote, &f->remotes, list) {
                        vxlan_fdb_notify(vxlan, f, remote, RTM_DELNEIGH,
                                         swdev_notify, NULL);
                }
        }

        hlist_del_rcu(&f->hlist);
        list_del_rcu(&f->nh_list);
        call_rcu(&f->rcu, vxlan_fdb_free);
}

/* RCU callback: destroy the dst cache of the remote embedding @head and
 * free the remote.
 */
static void vxlan_dst_free(struct rcu_head *head)
{
        struct vxlan_rdst *remote;

        remote = container_of(head, struct vxlan_rdst, rcu);
        dst_cache_destroy(&remote->dst_cache);
        kfree(remote);
}

/* Apply a netlink update to the already existing FDB entry @f.
 *
 * @flags carries the NLM_F_* request flags and @ndm_flags the NTF_*
 * neighbour flags. Depending on them the entry's state/flags may be
 * refreshed, its (unicast) destination or nexthop replaced, or a
 * destination appended (multicast / all-zero MAC only).
 *
 * Returns 0 on success or a negative errno. If sending the notification
 * fails, a replace or append performed here is rolled back (nexthop
 * updates are not).
 */
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
                                     union vxlan_addr *ip,
                                     __u16 state, __u16 flags,
                                     __be16 port, __be32 vni,
                                     __u32 ifindex, __u16 ndm_flags,
                                     struct vxlan_fdb *f, u32 nhid,
                                     bool swdev_notify,
                                     struct netlink_ext_ack *extack)
{
        /* NTF_USE is a request modifier and is never stored in the entry. */
        __u16 fdb_flags = (ndm_flags & ~NTF_USE);
        struct vxlan_rdst *rd = NULL;
        /* Snapshot of the replaced remote, used for rollback below. */
        struct vxlan_rdst oldrd;
        int notify = 0;
        int rc = 0;
        int err;

        if (nhid && !rcu_access_pointer(f->nh)) {
                NL_SET_ERR_MSG(extack,
                               "Cannot replace an existing non nexthop fdb with a nexthop");
                return -EOPNOTSUPP;
        }

        if (nhid && (flags & NLM_F_APPEND)) {
                NL_SET_ERR_MSG(extack,
                               "Cannot append to a nexthop fdb");
                return -EOPNOTSUPP;
        }

        /* Do not allow an externally learned entry to take over an entry added
         * by the user.
         */
        if (!(fdb_flags & NTF_EXT_LEARNED) ||
            !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
                if (f->state != state) {
                        f->state = state;
                        notify = 1;
                }
                if (f->flags != fdb_flags) {
                        f->flags = fdb_flags;
                        notify = 1;
                }
        }

        if ((flags & NLM_F_REPLACE)) {
                /* Only change unicasts */
                if (!(is_multicast_ether_addr(f->eth_addr) ||
                      is_zero_ether_addr(f->eth_addr))) {
                        if (nhid) {
                                rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
                                if (rc < 0)
                                        return rc;
                        } else {
                                rc = vxlan_fdb_replace(f, ip, port, vni,
                                                       ifindex, &oldrd);
                        }
                        /* rc is 1 when something actually changed. */
                        notify |= rc;
                } else {
                        NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
                        return -EOPNOTSUPP;
                }
        }
        /* Appending is only honoured for multicast / all-zero MAC entries. */
        if ((flags & NLM_F_APPEND) &&
            (is_multicast_ether_addr(f->eth_addr) ||
             is_zero_ether_addr(f->eth_addr))) {
                rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

                if (rc < 0)
                        return rc;
                notify |= rc;
        }

        if (ndm_flags & NTF_USE)
                WRITE_ONCE(f->updated, jiffies);

        if (notify) {
                /* No remote was appended: report the entry's first remote. */
                if (rd == NULL)
                        rd = first_remote_rtnl(f);

                WRITE_ONCE(f->updated, jiffies);
                err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
                                       swdev_notify, extack);
                if (err)
                        goto err_notify;
        }

        return 0;

err_notify:
        /* Nexthop updates are not rolled back. */
        if (nhid)
                return err;
        /* Undo a replace by restoring the snapshot, or undo an append by
         * unlinking the new remote and freeing it after a grace period.
         */
        if ((flags & NLM_F_REPLACE) && rc)
                *rd = oldrd;
        else if ((flags & NLM_F_APPEND) && rc) {
                list_del_rcu(&rd->list);
                call_rcu(&rd->rcu, vxlan_dst_free);
        }
        return err;
}

/* Create a new FDB entry for @mac, insert it into the hash table and
 * announce it with RTM_NEWNEIGH.
 *
 * Returns 0 on success, -EOPNOTSUPP when NLM_F_REPLACE is used with a
 * multicast or all-zero MAC, the error from vxlan_fdb_create(), or the
 * error from the notification (in which case the freshly inserted entry
 * is destroyed again without further notification).
 */
static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
                                   const u8 *mac, union vxlan_addr *ip,
                                   __u16 state, __u16 flags,
                                   __be16 port, __be32 src_vni, __be32 vni,
                                   __u32 ifindex, __u16 ndm_flags, u32 nhid,
                                   bool swdev_notify,
                                   struct netlink_ext_ack *extack)
{
        /* NTF_USE is not stored in the entry. */
        __u16 stored_flags = ndm_flags & ~NTF_USE;
        struct vxlan_fdb *entry;
        int rc;

        /* Disallow replace to add a multicast entry */
        if ((flags & NLM_F_REPLACE) &&
            (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
                return -EOPNOTSUPP;

        netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);

        rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
                              vni, ifindex, stored_flags, nhid, &entry,
                              extack);
        if (rc < 0)
                return rc;

        vxlan_fdb_insert(vxlan, mac, src_vni, entry);

        rc = vxlan_fdb_notify(vxlan, entry, first_remote_rtnl(entry),
                              RTM_NEWNEIGH, swdev_notify, extack);
        if (rc)
                vxlan_fdb_destroy(vxlan, entry, false, false);

        return rc;
}

/* Add new entry to forwarding table -- assumes lock held.
 *
 * Looks up (@mac, @src_vni). An existing entry is updated unless
 * NLM_F_EXCL is set (-EEXIST); a missing entry is created only when
 * NLM_F_CREATE is set (-ENOENT otherwise).
 */
int vxlan_fdb_update(struct vxlan_dev *vxlan,
                     const u8 *mac, union vxlan_addr *ip,
                     __u16 state, __u16 flags,
                     __be16 port, __be32 src_vni, __be32 vni,
                     __u32 ifindex, __u16 ndm_flags, u32 nhid,
                     bool swdev_notify,
                     struct netlink_ext_ack *extack)
{
        struct vxlan_fdb *existing = __vxlan_find_mac(vxlan, mac, src_vni);

        if (!existing) {
                if (!(flags & NLM_F_CREATE))
                        return -ENOENT;

                return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
                                               port, src_vni, vni, ifindex,
                                               ndm_flags, nhid, swdev_notify,
                                               extack);
        }

        if (flags & NLM_F_EXCL) {
                netdev_dbg(vxlan->dev,
                           "lost race to create %pM\n", mac);
                return -EEXIST;
        }

        return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
                                         vni, ifindex, ndm_flags, existing,
                                         nhid, swdev_notify, extack);
}

/* Remove a single remote @rd from FDB entry @f: unlink it from the
 * remotes list, send an RTM_DELNEIGH notification for it, and free it
 * (via vxlan_dst_free) after an RCU grace period.
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
                                  struct vxlan_rdst *rd, bool swdev_notify)
{
        list_del_rcu(&rd->list);
        vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);
        call_rcu(&rd->rcu, vxlan_dst_free);
}

/* Decode the neighbour attributes in @tb into the destination
 * description used by the FDB code, falling back to the device defaults
 * for attributes that are absent.
 *
 * Outputs: @ip (remote address), @port, @src_vni, @vni, @ifindex, @nhid.
 *
 * Returns 0 on success, -EINVAL for a malformed attribute or when
 * NDA_NH_ID is combined with DST/VNI/IFINDEX/PORT, -EADDRNOTAVAIL when
 * NDA_IFINDEX names no device in the VXLAN device's netns, or the error
 * from vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
                           union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
                           __be32 *vni, u32 *ifindex, u32 *nhid,
                           struct netlink_ext_ack *extack)
{
        struct net *net = dev_net(vxlan->dev);
        int err;

        if (tb[NDA_NH_ID] &&
            (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT])) {
                NL_SET_ERR_MSG(extack, "DST, VNI, ifindex and port are mutually exclusive with NH_ID");
                return -EINVAL;
        }

        if (tb[NDA_DST]) {
                err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
                if (err) {
                        NL_SET_ERR_MSG(extack, "Unsupported address family");
                        return err;
                }
        } else {
                /* No destination given: use the wildcard address of the
                 * device's default remote family. Without CONFIG_IPV6 a
                 * non-AF_INET default leaves *ip untouched.
                 */
                union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

                if (remote->sa.sa_family == AF_INET) {
                        ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
                        ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
                } else {
                        ip->sin6.sin6_addr = in6addr_any;
                        ip->sa.sa_family = AF_INET6;
#endif
                }
        }

        /* Destination UDP port, defaulting to the device's dst_port. */
        if (tb[NDA_PORT]) {
                if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) {
                        NL_SET_ERR_MSG(extack, "Invalid vxlan port");
                        return -EINVAL;
                }
                *port = nla_get_be16(tb[NDA_PORT]);
        } else {
                *port = vxlan->cfg.dst_port;
        }

        /* Remote VNI, defaulting to the device's default remote VNI. */
        if (tb[NDA_VNI]) {
                if (nla_len(tb[NDA_VNI]) != sizeof(u32)) {
                        NL_SET_ERR_MSG(extack, "Invalid vni");
                        return -EINVAL;
                }
                *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
        } else {
                *vni = vxlan->default_dst.remote_vni;
        }

        /* Source VNI, with the same default as the remote VNI. */
        if (tb[NDA_SRC_VNI]) {
                if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) {
                        NL_SET_ERR_MSG(extack, "Invalid src vni");
                        return -EINVAL;
                }
                *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
        } else {
                *src_vni = vxlan->default_dst.remote_vni;
        }

        /* Optional egress device; it must exist, 0 means "unspecified". */
        if (tb[NDA_IFINDEX]) {
                struct net_device *tdev;

                if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) {
                        NL_SET_ERR_MSG(extack, "Invalid ifindex");
                        return -EINVAL;
                }
                *ifindex = nla_get_u32(tb[NDA_IFINDEX]);
                tdev = __dev_get_by_index(net, *ifindex);
                if (!tdev) {
                        NL_SET_ERR_MSG(extack, "Device not found");
                        return -EADDRNOTAVAIL;
                }
        } else {
                *ifindex = 0;
        }

        /* Nexthop id; presumably 0 when NDA_NH_ID is absent (default arg). */
        *nhid = nla_get_u32_default(tb[NDA_NH_ID], 0);

        return 0;
}

/* Add static entry (via netlink).
 *
 * Validates the request, parses the attributes and performs the update
 * under the hash lock of the target bucket. The entry is tagged
 * NTF_VXLAN_ADDED_BY_USER. *@notified is set to true on success.
 *
 * Returns 0 on success, -EINVAL for an unsupported neighbour state or a
 * request with neither NDA_DST nor NDA_NH_ID, -EAFNOSUPPORT when the
 * destination family differs from the device's default remote family, or
 * the error from parsing / updating.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
                         struct net_device *dev,
                         const unsigned char *addr, u16 vid, u16 flags,
                         bool *notified, struct netlink_ext_ack *extack)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        __be32 src_vni, vni;
        union vxlan_addr ip;
        u32 ifindex, nhid;
        u32 bucket;
        __be16 port;
        int err;

        if ((ndm->ndm_state & (NUD_PERMANENT | NUD_REACHABLE)) == 0) {
                pr_info("RTM_NEWNEIGH with invalid state %#x\n",
                        ndm->ndm_state);
                return -EINVAL;
        }

        if (!tb)
                return -EINVAL;
        if (!tb[NDA_DST] && !tb[NDA_NH_ID])
                return -EINVAL;

        err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
                              &nhid, extack);
        if (err)
                return err;

        if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
                return -EAFNOSUPPORT;

        bucket = fdb_head_index(vxlan, addr, src_vni);

        spin_lock_bh(&vxlan->hash_lock[bucket]);
        err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
                               port, src_vni, vni, ifindex,
                               ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
                               nhid, true, extack);
        spin_unlock_bh(&vxlan->hash_lock[bucket]);

        if (err)
                return err;

        *notified = true;
        return 0;
}

/* Delete a remote from, or the whole of, the FDB entry for
 * (@addr, @src_vni).
 *
 * When @ip is a specific (non-wildcard) address, only the matching remote
 * is targeted; if that remote is not the only one on the entry it alone
 * is removed, otherwise the whole entry is destroyed. A wildcard @ip
 * always destroys the whole entry.
 *
 * Returns -ENOENT when no entry exists for the MAC, 0 otherwise.
 * NOTE(review): 0 is also returned when a specific remote was requested
 * but not found and nothing was deleted -- confirm this is intended.
 */
int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
                       const unsigned char *addr, union vxlan_addr ip,
                       __be16 port, __be32 src_vni, __be32 vni,
                       u32 ifindex, bool swdev_notify)
{
        struct vxlan_rdst *target = NULL;
        struct vxlan_fdb *entry;

        entry = __vxlan_find_mac(vxlan, addr, src_vni);
        if (entry == NULL)
                return -ENOENT;

        if (!vxlan_addr_any(&ip)) {
                target = vxlan_fdb_find_rdst(entry, &ip, port, vni, ifindex);
                if (target == NULL)
                        return 0;
        }

        /* remove a destination if it's not the only one on the list,
         * otherwise destroy the fdb entry
         */
        if (target && !list_is_singular(&entry->remotes))
                vxlan_fdb_dst_destroy(vxlan, entry, target, swdev_notify);
        else
                vxlan_fdb_destroy(vxlan, entry, true, swdev_notify);

        return 0;
}

/* Delete entry (via netlink).
 *
 * Parses the attributes and performs the deletion under the hash lock of
 * the target bucket. *@notified is set to true when the deletion
 * reports success.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
                            struct net_device *dev,
                            const unsigned char *addr, u16 vid, bool *notified,
                            struct netlink_ext_ack *extack)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        __be32 src_vni, vni;
        union vxlan_addr ip;
        u32 ifindex, nhid;
        __be16 port;
        u32 bucket;
        int err;

        err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
                              &nhid, extack);
        if (err)
                return err;

        bucket = fdb_head_index(vxlan, addr, src_vni);

        spin_lock_bh(&vxlan->hash_lock[bucket]);
        err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
                                 true);
        spin_unlock_bh(&vxlan->hash_lock[bucket]);

        if (err)
                return err;

        *notified = true;
        return 0;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
                          struct net_device *dev,
                          struct net_device *filter_dev, int *idx)
{
        struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
        struct vxlan_dev *vxlan = netdev_priv(dev);
        unsigned int h;
        int err = 0;

        for (h = 0; h < FDB_HASH_SIZE; ++h) {
                struct vxlan_fdb *f;

                rcu_read_lock();
                hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
                        struct vxlan_rdst *rd;

                        if (rcu_access_pointer(f->nh)) {
                                if (*idx < ctx->fdb_idx)
                                        goto skip_nh;
                                err = vxlan_fdb_info(skb, vxlan, f,
                                                     NETLINK_CB(cb->skb).portid,
                                                     cb->nlh->nlmsg_seq,
                                                     RTM_NEWNEIGH,
                                                     NLM_F_MULTI, NULL);
                                if (err < 0) {
                                        rcu_read_unlock();
                                        goto out;
                                }
skip_nh:
                                *idx += 1;
                                continue;
                        }

                        list_for_each_entry_rcu(rd, &f->remotes, list) {
                                if (*idx < ctx->fdb_idx)
                                        goto skip;

                                err = vxlan_fdb_info(skb, vxlan, f,
                                                     NETLINK_CB(cb->skb).portid,
                                                     cb->nlh->nlmsg_seq,
                                                     RTM_NEWNEIGH,
                                                     NLM_F_MULTI, rd);
                                if (err < 0) {
                                        rcu_read_unlock();
                                        goto out;
                                }
skip:
                                *idx += 1;
                        }
                }
                rcu_read_unlock();
        }
out:
        return err;
}

/* Look up the FDB entry for @addr in the VNI given by NDA_VNI (or the
 * device's default remote VNI when the attribute is absent) and fill
 * @skb with an RTM_NEWNEIGH message describing the entry and its first
 * remote. The lookup and fill run under rcu_read_lock().
 *
 * Returns the result of vxlan_fdb_info(), or -ENOENT (with an extack
 * message) when no entry exists.
 */
static int vxlan_fdb_get(struct sk_buff *skb,
                         struct nlattr *tb[],
                         struct net_device *dev,
                         const unsigned char *addr,
                         u16 vid, u32 portid, u32 seq,
                         struct netlink_ext_ack *extack)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb *entry;
        __be32 vni;
        int err;

        vni = tb[NDA_VNI] ? cpu_to_be32(nla_get_u32(tb[NDA_VNI]))
                          : vxlan->default_dst.remote_vni;

        rcu_read_lock();

        entry = __vxlan_find_mac(vxlan, addr, vni);
        if (entry) {
                err = vxlan_fdb_info(skb, vxlan, entry, portid, seq,
                                     RTM_NEWNEIGH, 0, first_remote_rcu(entry));
        } else {
                NL_SET_ERR_MSG(extack, "Fdb entry not found");
                err = -ENOENT;
        }

        rcu_read_unlock();
        return err;
}

/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
 *
 * @dev:         vxlan device the packet was received on
 * @src_ip:      outer source address of the packet
 * @src_mac:     inner source MAC address
 * @src_ifindex: ifindex of the underlay device; only recorded when @src_ip
 *               is an IPv6 link-local address
 * @vni:         VNI the packet was received with
 *
 * Returns SKB_NOT_DROPPED_YET when the packet may continue up the stack, or
 * a drop reason when it must be discarded (invalid source MAC, or the MAC is
 * already owned by a static or nexthop-backed entry).
 */
static enum skb_drop_reason vxlan_snoop(struct net_device *dev,
                                        union vxlan_addr *src_ip,
                                        const u8 *src_mac, u32 src_ifindex,
                                        __be32 vni)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb *f;
        u32 ifindex = 0;

        /* Ignore packets from invalid src-address */
        if (!is_valid_ether_addr(src_mac))
                return SKB_DROP_REASON_MAC_INVALID_SOURCE;

#if IS_ENABLED(CONFIG_IPV6)
        if (src_ip->sa.sa_family == AF_INET6 &&
            (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
                ifindex = src_ifindex;
#endif

        f = __vxlan_find_mac(vxlan, src_mac, vni);
        if (likely(f)) {
                struct vxlan_rdst *rdst;
                unsigned long now = jiffies;

                /* Only dirty the cache line when the timestamp changes */
                if (READ_ONCE(f->updated) != now)
                        WRITE_ONCE(f->updated, now);

                /* Don't override an fdb with nexthop with a learnt entry.
                 *
                 * This test has to come before the first remote is looked
                 * at: an entry that points to a nexthop object may have no
                 * remote destination, in which case dereferencing the result
                 * of first_remote_rcu() would be a NULL pointer dereference.
                 */
                if (rcu_access_pointer(f->nh))
                        return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

                rdst = first_remote_rcu(f);

                if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
                           rdst->remote_ifindex == ifindex))
                        return SKB_NOT_DROPPED_YET;

                /* Don't migrate static entries, drop packets */
                if (f->state & (NUD_PERMANENT | NUD_NOARP))
                        return SKB_DROP_REASON_VXLAN_ENTRY_EXISTS;

                if (net_ratelimit())
                        netdev_info(dev,
                                    "%pM migrated from %pIS to %pIS\n",
                                    src_mac, &rdst->remote_ip.sa, &src_ip->sa);

                /* The MAC moved to another endpoint: repoint the entry */
                rdst->remote_ip = *src_ip;
                vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
        } else {
                u32 hash_index = fdb_head_index(vxlan, src_mac, vni);

                /* learned new entry */
                spin_lock(&vxlan->hash_lock[hash_index]);

                /* close off race between vxlan_flush and incoming packets */
                if (netif_running(dev))
                        vxlan_fdb_update(vxlan, src_mac, src_ip,
                                         NUD_REACHABLE,
                                         NLM_F_EXCL|NLM_F_CREATE,
                                         vxlan->cfg.dst_port,
                                         vni,
                                         vxlan->default_dst.remote_vni,
                                         ifindex, NTF_SELF, 0, true, NULL);
                spin_unlock(&vxlan->hash_lock[hash_index]);
        }

        return SKB_NOT_DROPPED_YET;
}

/* Drop one reference on @vs and, if it was the last one, unlink the socket
 * from the per-netns list and announce removal of its RX port.
 *
 * Returns true when the caller now owns the final teardown of @vs, false
 * when @vs is NULL or other references remain.
 */
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
        struct vxlan_net *vn;

        if (!vs || !refcount_dec_and_test(&vs->refcnt))
                return false;

        vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);

        spin_lock(&vn->sock_lock);
        hlist_del_rcu(&vs->hlist);
        if (vs->flags & VXLAN_F_GPE)
                udp_tunnel_notify_del_rx_port(vs->sock,
                                              UDP_TUNNEL_TYPE_VXLAN_GPE);
        else
                udp_tunnel_notify_del_rx_port(vs->sock,
                                              UDP_TUNNEL_TYPE_VXLAN);
        spin_unlock(&vn->sock_lock);

        return true;
}

/* Detach @vxlan from its IPv4 (and, when enabled, IPv6) tunnel sockets and
 * drop the references it holds on them.
 *
 * The socket pointers are read with rtnl_dereference(), so the caller is
 * presumably expected to hold RTNL — TODO confirm against the callers.
 */
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
        struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
#if IS_ENABLED(CONFIG_IPV6)
        struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

        RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
#endif

        /* Unpublish the socket pointers first, then wait in
         * synchronize_net() before anything is torn down.
         */
        RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
        synchronize_net();

        /* Remove the device from the sockets' VNI lookup structures; which
         * helper applies depends on whether VNI filtering is configured.
         */
        if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
                vxlan_vs_del_vnigrp(vxlan);
        else
                vxlan_vs_del_dev(vxlan);

        /* Only the holder of the last reference releases and frees a socket */
        if (__vxlan_sock_release_prep(sock4)) {
                udp_tunnel_sock_release(sock4->sock);
                kfree(sock4);
        }

#if IS_ENABLED(CONFIG_IPV6)
        if (__vxlan_sock_release_prep(sock6)) {
                udp_tunnel_sock_release(sock6->sock);
                kfree(sock6);
        }
#endif
}

/* Apply remote checksum offload (RCO) to a received packet when the VXLAN
 * header requests it and the skb is not already marked remcsum_offload.
 *
 * The checksum start and offset are decoded from the VNI field of the
 * header. Returns a drop reason when the bytes covering the checksum field
 * cannot be pulled, SKB_NOT_DROPPED_YET otherwise.
 */
static enum skb_drop_reason vxlan_remcsum(struct sk_buff *skb, u32 vxflags)
{
        const struct vxlanhdr *hdr = vxlan_hdr(skb);
        enum skb_drop_reason reason;
        size_t csum_start, csum_off;
        bool nopartial;

        if (skb->remcsum_offload || !(hdr->vx_flags & VXLAN_HF_RCO))
                return SKB_NOT_DROPPED_YET;

        csum_start = vxlan_rco_start(hdr->vx_vni);
        csum_off = csum_start + vxlan_rco_offset(hdr->vx_vni);

        /* The 16-bit checksum field itself must be in the linear area */
        reason = pskb_may_pull_reason(skb, csum_off + sizeof(u16));
        if (reason)
                return reason;

        nopartial = !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL);

        /* Re-read the header pointer: the pull above may have moved it */
        skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), csum_start,
                            csum_off, nopartial);
        return SKB_NOT_DROPPED_YET;
}

/* Extract Group Based Policy (GBP) information from the VXLAN header into
 * @md. Does nothing when the header does not carry the GBP flag.
 *
 * When the skb has a metadata dst attached, the dst is flagged as carrying
 * VXLAN options of size sizeof(*md). Outside of flow-based mode the GBP
 * value is also copied to skb->mark.
 */
static void vxlan_parse_gbp_hdr(struct sk_buff *skb, u32 vxflags,
                                struct vxlan_metadata *md)
{
        const struct vxlanhdr *vxh = vxlan_hdr(skb);
        const struct vxlanhdr_gbp *gbp_hdr;
        struct metadata_dst *meta;

        if (!(vxh->vx_flags & VXLAN_HF_GBP))
                return;

        gbp_hdr = (const struct vxlanhdr_gbp *)vxh;

        /* Policy id plus the two flag bits form the GBP value */
        md->gbp = ntohs(gbp_hdr->policy_id);
        if (gbp_hdr->dont_learn)
                md->gbp |= VXLAN_GBP_DONT_LEARN;
        if (gbp_hdr->policy_applied)
                md->gbp |= VXLAN_GBP_POLICY_APPLIED;

        meta = (struct metadata_dst *)skb_dst(skb);
        if (meta) {
                __set_bit(IP_TUNNEL_VXLAN_OPT_BIT,
                          meta->u.tun_info.key.tun_flags);
                meta->u.tun_info.options_len = sizeof(*md);
        }

        /* In flow-based mode, GBP is carried in dst_metadata */
        if (vxflags & VXLAN_F_COLLECT_METADATA)
                return;

        skb->mark = md->gbp;
}

/* Set up the inner Ethernet header of a decapsulated packet and, when
 * learning is enabled, feed its source MAC to vxlan_snoop().
 *
 * The ifindex of the receiving device is sampled before eth_type_trans()
 * re-targets the skb at the vxlan device.
 *
 * Returns SKB_DROP_REASON_LOCAL_MAC for frames carrying our own MAC as
 * source, otherwise SKB_NOT_DROPPED_YET or whatever vxlan_snoop() decides.
 */
static enum skb_drop_reason vxlan_set_mac(struct vxlan_dev *vxlan,
                                          struct vxlan_sock *vs,
                                          struct sk_buff *skb, __be32 vni)
{
        u32 rx_ifindex = skb->dev->ifindex;
        union vxlan_addr outer_src;
        const u8 *inner_src;

        skb_reset_mac_header(skb);
        skb->protocol = eth_type_trans(skb, vxlan->dev);
        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

        inner_src = eth_hdr(skb)->h_source;

        /* Ignore packet loops (and multicast echo) */
        if (ether_addr_equal(inner_src, vxlan->dev->dev_addr))
                return SKB_DROP_REASON_LOCAL_MAC;

        /* Nothing more to do unless source learning is enabled */
        if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
                return SKB_NOT_DROPPED_YET;

        /* Get address from the outer IP header */
        if (vxlan_get_sk_family(vs) == AF_INET) {
                outer_src.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
                outer_src.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                outer_src.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
                outer_src.sa.sa_family = AF_INET6;
#endif
        }

        return vxlan_snoop(skb->dev, &outer_src, inner_src, rx_ifindex, vni);
}

static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
                                  struct sk_buff *skb)
{
        int err = 0;

        if (vxlan_get_sk_family(vs) == AF_INET)
                err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
        else
                err = IP6_ECN_decapsulate(oiph, skb);
#endif

        if (unlikely(err) && log_ecn_error) {
                if (vxlan_get_sk_family(vs) == AF_INET)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                             &((struct iphdr *)oiph)->saddr,
                                             ((struct iphdr *)oiph)->tos);
                else
                        net_info_ratelimited("non-ECT from %pI6\n",
                                             &((struct ipv6hdr *)oiph)->saddr);
        }
        return err <= 1;
}

/* Receive handler for VXLAN-encapsulated packets arriving on a tunnel socket.
 *
 * Validates the VXLAN header, maps the VNI to a vxlan device, strips the
 * encapsulation, optionally attaches tunnel metadata, performs source
 * learning through vxlan_set_mac(), applies ECN decapsulation and finally
 * hands the inner packet to the device's GRO cells.
 *
 * The skb is always consumed: it is either passed to gro_cells_receive() or
 * freed with a drop reason. The function always returns 0.
 */
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
        struct vxlan_vni_node *vninode = NULL;
        const struct vxlanhdr *vh;
        struct vxlan_dev *vxlan;
        struct vxlan_sock *vs;
        struct vxlan_metadata _md;
        struct vxlan_metadata *md = &_md;
        __be16 protocol = htons(ETH_P_TEB);
        enum skb_drop_reason reason;
        bool raw_proto = false;
        void *oiph;
        __be32 vni = 0;
        int nh;

        /* Need UDP and VXLAN header to be present */
        reason = pskb_may_pull_reason(skb, VXLAN_HLEN);
        if (reason)
                goto drop;

        vh = vxlan_hdr(skb);
        /* VNI flag always required to be set */
        if (!(vh->vx_flags & VXLAN_HF_VNI)) {
                netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
                           ntohl(vh->vx_flags), ntohl(vh->vx_vni));
                reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
                /* Return non vxlan pkt */
                goto drop;
        }

        /* No vxlan socket state attached: reason is still zero here, so the
         * drop label reports SKB_DROP_REASON_NOT_SPECIFIED.
         */
        vs = rcu_dereference_sk_user_data(sk);
        if (!vs)
                goto drop;

        vni = vxlan_vni(vh->vx_vni);

        vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
        if (!vxlan) {
                reason = SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND;
                goto drop;
        }

        if (vh->vx_flags & vxlan->cfg.reserved_bits.vx_flags ||
            vh->vx_vni & vxlan->cfg.reserved_bits.vx_vni) {
                /* If the header uses bits besides those enabled by the
                 * netdevice configuration, treat this as a malformed packet.
                 * This behavior diverges from VXLAN RFC (RFC7348) which
                 * stipulates that bits in reserved fields are to be
                 * ignored. The approach here maintains compatibility with
                 * previous stack code, and also is more robust and provides a
                 * little more security in adding extensions to VXLAN.
                 */
                reason = SKB_DROP_REASON_VXLAN_INVALID_HDR;
                DEV_STATS_INC(vxlan->dev, rx_frame_errors);
                DEV_STATS_INC(vxlan->dev, rx_errors);
                vxlan_vnifilter_count(vxlan, vni, vninode,
                                      VXLAN_VNI_STATS_RX_ERRORS, 0);
                goto drop;
        }

        /* With GPE the inner protocol comes from the header instead of the
         * default ETH_P_TEB; a failed parse drops with an unspecified reason.
         */
        if (vxlan->cfg.flags & VXLAN_F_GPE) {
                if (!vxlan_parse_gpe_proto(vh, &protocol))
                        goto drop;
                raw_proto = true;
        }

        /* Strip the VXLAN header; the last argument is true when the device
         * and its underlay live in different network namespaces.
         */
        if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
                                   !net_eq(vxlan->net, dev_net(vxlan->dev)))) {
                reason = SKB_DROP_REASON_NOMEM;
                goto drop;
        }

        if (vxlan->cfg.flags & VXLAN_F_REMCSUM_RX) {
                reason = vxlan_remcsum(skb, vxlan->cfg.flags);
                if (unlikely(reason))
                        goto drop;
        }

        /* In metadata-collecting mode, md points into the options area of a
         * freshly allocated tunnel dst; otherwise it stays on the zeroed
         * on-stack copy.
         */
        if (vxlan_collect_metadata(vs)) {
                IP_TUNNEL_DECLARE_FLAGS(flags) = { };
                struct metadata_dst *tun_dst;

                __set_bit(IP_TUNNEL_KEY_BIT, flags);
                tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), flags,
                                         key32_to_tunnel_id(vni), sizeof(*md));

                if (!tun_dst) {
                        reason = SKB_DROP_REASON_NOMEM;
                        goto drop;
                }

                md = ip_tunnel_info_opts(&tun_dst->u.tun_info);

                skb_dst_set(skb, (struct dst_entry *)tun_dst);
        } else {
                memset(md, 0, sizeof(*md));
        }

        if (vxlan->cfg.flags & VXLAN_F_GBP)
                vxlan_parse_gbp_hdr(skb, vxlan->cfg.flags, md);
        /* Note that GBP and GPE can never be active together. This is
         * ensured in vxlan_dev_configure.
         */

        /* Ethernet payload goes through MAC setup and learning; a raw (GPE)
         * payload is attached to the vxlan device directly.
         */
        if (!raw_proto) {
                reason = vxlan_set_mac(vxlan, vs, skb, vni);
                if (reason)
                        goto drop;
        } else {
                skb_reset_mac_header(skb);
                skb->dev = vxlan->dev;
                skb->pkt_type = PACKET_HOST;
        }

        /* Save offset of outer header relative to skb->head,
         * because we are going to reset the network header to the inner header
         * and might change skb->head.
         */
        nh = skb_network_header(skb) - skb->head;

        skb_reset_network_header(skb);

        reason = pskb_inet_may_pull_reason(skb);
        if (reason) {
                DEV_STATS_INC(vxlan->dev, rx_length_errors);
                DEV_STATS_INC(vxlan->dev, rx_errors);
                vxlan_vnifilter_count(vxlan, vni, vninode,
                                      VXLAN_VNI_STATS_RX_ERRORS, 0);
                goto drop;
        }

        /* Get the outer header. */
        oiph = skb->head + nh;

        if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
                reason = SKB_DROP_REASON_IP_TUNNEL_ECN;
                DEV_STATS_INC(vxlan->dev, rx_frame_errors);
                DEV_STATS_INC(vxlan->dev, rx_errors);
                vxlan_vnifilter_count(vxlan, vni, vninode,
                                      VXLAN_VNI_STATS_RX_ERRORS, 0);
                goto drop;
        }

        rcu_read_lock();

        /* Device is down: count the packet as dropped instead of delivering */
        if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
                rcu_read_unlock();
                dev_dstats_rx_dropped(vxlan->dev);
                vxlan_vnifilter_count(vxlan, vni, vninode,
                                      VXLAN_VNI_STATS_RX_DROPS, 0);
                reason = SKB_DROP_REASON_DEV_READY;
                goto drop;
        }

        /* Account first: skb->len is read before the skb is handed over */
        dev_dstats_rx_add(vxlan->dev, skb->len);
        vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);
        gro_cells_receive(&vxlan->gro_cells, skb);

        rcu_read_unlock();

        return 0;

drop:
        /* Paths that jumped here without choosing a reason get the generic one */
        reason = reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
        /* Consume bad packet */
        kfree_skb_reason(skb, reason);
        return 0;
}

static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan;
        struct vxlan_sock *vs;
        struct vxlanhdr *hdr;
        __be32 vni;

        if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
                return -EINVAL;

        hdr = vxlan_hdr(skb);

        if (!(hdr->vx_flags & VXLAN_HF_VNI))
                return -EINVAL;

        vs = rcu_dereference_sk_user_data(sk);
        if (!vs)
                return -ENOENT;

        vni = vxlan_vni(hdr->vx_vni);
        vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL);
        if (!vxlan)
                return -ENOENT;

        return 0;
}

/* ARP suppression: try to answer an ARP request locally from the neighbour
 * table instead of sending it over the tunnel.
 *
 * A reply is generated and injected with netif_rx() when the target IP
 * resolves to a connected neighbour that is not a bridge-local FDB entry.
 * When the target is unknown and L3MISS is enabled, a miss notification is
 * raised instead.
 *
 * The request skb is always consumed and NETDEV_TX_OK is always returned.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct arphdr *parp;
        u8 *arpptr, *sha;
        __be32 sip, tip;
        struct neighbour *n;

        if (dev->flags & IFF_NOARP)
                goto out;

        if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
                dev_dstats_tx_dropped(dev);
                vxlan_vnifilter_count(vxlan, vni, NULL,
                                      VXLAN_VNI_STATS_TX_DROPS, 0);
                goto out;
        }
        parp = arp_hdr(skb);

        /* Only IPv4-over-Ethernet/802 requests with matching lengths */
        if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
             parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
            parp->ar_pro != htons(ETH_P_IP) ||
            parp->ar_op != htons(ARPOP_REQUEST) ||
            parp->ar_hln != dev->addr_len ||
            parp->ar_pln != 4)
                goto out;

        /* Walk the ARP payload: sha, sip, tha, tip */
        arpptr = (u8 *)parp + sizeof(struct arphdr);
        sha = arpptr;
        arpptr += dev->addr_len;        /* sha */
        memcpy(&sip, arpptr, sizeof(sip));
        arpptr += sizeof(sip);
        arpptr += dev->addr_len;        /* tha */
        memcpy(&tip, arpptr, sizeof(tip));

        if (ipv4_is_loopback(tip) ||
            ipv4_is_multicast(tip))
                goto out;

        n = neigh_lookup(&arp_tbl, &tip, dev);

        if (n) {
                struct vxlan_rdst *rdst = NULL;
                struct vxlan_fdb *f;
                struct sk_buff        *reply;

                if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
                        neigh_release(n);
                        goto out;
                }

                /* An FDB entry that points to a nexthop object may have no
                 * remote destination, so the first remote must be checked
                 * for NULL before its address is inspected.
                 */
                f = vxlan_find_mac(vxlan, n->ha, vni);
                if (f)
                        rdst = first_remote_rcu(f);
                if (rdst && vxlan_addr_any(&rdst->remote_ip)) {
                        /* bridge-local neighbor */
                        neigh_release(n);
                        goto out;
                }

                reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
                                n->ha, sha);

                neigh_release(n);

                if (reply == NULL)
                        goto out;

                /* Present the reply as a frame received on the device */
                skb_reset_mac_header(reply);
                __skb_pull(reply, skb_network_offset(reply));
                reply->ip_summed = CHECKSUM_UNNECESSARY;
                reply->pkt_type = PACKET_HOST;

                if (netif_rx(reply) == NET_RX_DROP) {
                        dev_dstats_rx_dropped(dev);
                        vxlan_vnifilter_count(vxlan, vni, NULL,
                                              VXLAN_VNI_STATS_RX_DROPS, 0);
                }

        } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
                union vxlan_addr ipa = {
                        .sin.sin_addr.s_addr = tip,
                        .sin.sin_family = AF_INET,
                };

                vxlan_ip_miss(dev, &ipa);
        }
out:
        consume_skb(skb);
        return NETDEV_TX_OK;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Build a Neighbour Advertisement answering the Neighbour Solicitation in
 * @request on behalf of neighbour @n.
 *
 * @request:  the solicitation; it is fully linearised here
 * @n:        neighbour entry supplying the advertised link-layer address
 * @isrouter: value for the router flag of the advertisement
 *
 * The reply is addressed to the source link-layer address option of the
 * solicitation when a complete one is present, otherwise to the Ethernet
 * source address of the request.
 *
 * Returns the new skb, or NULL when the request has no device, cannot be
 * linearised, carries a zero-length option, or allocation fails.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
        struct neighbour *n, bool isrouter)
{
        struct net_device *dev = request->dev;
        struct sk_buff *reply;
        struct nd_msg *ns, *na;
        struct ipv6hdr *pip6;
        u8 *daddr;
        int na_olen = 8; /* opt hdr + ETH_ALEN for target */
        int ns_olen;
        int i, len;

        if (dev == NULL || !pskb_may_pull(request, request->len))
                return NULL;

        len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
                sizeof(*na) + na_olen + dev->needed_tailroom;
        reply = alloc_skb(len, GFP_ATOMIC);
        if (reply == NULL)
                return NULL;

        reply->protocol = htons(ETH_P_IPV6);
        reply->dev = dev;
        skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
        skb_push(reply, sizeof(struct ethhdr));
        skb_reset_mac_header(reply);

        ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

        daddr = eth_hdr(request)->h_source;
        ns_olen = request->len - skb_network_offset(request) -
                sizeof(struct ipv6hdr) - sizeof(*ns);
        /* Walk the ND options; each length field counts units of 8 bytes */
        for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
                /* A zero length would loop forever: treat as malformed */
                if (!ns->opt[i + 1]) {
                        kfree_skb(reply);
                        return NULL;
                }
                if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
                        /* Only use the option when the whole link-layer
                         * address lies inside the packet; a truncated option
                         * would otherwise be read past the end of the data.
                         * In that case keep the Ethernet source address.
                         */
                        if (i + (int)sizeof(struct nd_opt_hdr) + ETH_ALEN <=
                            ns_olen)
                                daddr = ns->opt + i +
                                        sizeof(struct nd_opt_hdr);
                        break;
                }
        }

        /* Ethernet header */
        ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
        ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
        eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
        reply->protocol = htons(ETH_P_IPV6);

        skb_pull(reply, sizeof(struct ethhdr));
        skb_reset_network_header(reply);
        skb_put(reply, sizeof(struct ipv6hdr));

        /* IPv6 header */

        pip6 = ipv6_hdr(reply);
        memset(pip6, 0, sizeof(struct ipv6hdr));
        pip6->version = 6;
        pip6->priority = ipv6_hdr(request)->priority;
        pip6->nexthdr = IPPROTO_ICMPV6;
        pip6->hop_limit = 255;
        pip6->daddr = ipv6_hdr(request)->saddr;
        pip6->saddr = *(struct in6_addr *)n->primary_key;

        skb_pull(reply, sizeof(struct ipv6hdr));
        skb_reset_transport_header(reply);

        /* Neighbor Advertisement */
        na = skb_put_zero(reply, sizeof(*na) + na_olen);
        na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
        na->icmph.icmp6_router = isrouter;
        na->icmph.icmp6_override = 1;
        na->icmph.icmp6_solicited = 1;
        na->target = ns->target;
        ether_addr_copy(&na->opt[2], n->ha);
        na->opt[0] = ND_OPT_TARGET_LL_ADDR;
        na->opt[1] = na_olen >> 3;

        na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
                &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
                csum_partial(na, sizeof(*na)+na_olen, 0));

        pip6->payload_len = htons(sizeof(*na)+na_olen);

        /* Expose the IPv6 header again so data starts at the network header */
        skb_push(reply, sizeof(struct ipv6hdr));

        reply->ip_summed = CHECKSUM_UNNECESSARY;

        return reply;
}

/* IPv6 neighbour discovery suppression: try to answer a Neighbour
 * Solicitation locally from the neighbour table instead of sending it over
 * the tunnel.
 *
 * A Neighbour Advertisement is generated and injected with netif_rx() when
 * the target resolves to a connected neighbour that is not a bridge-local
 * FDB entry. When the target is unknown and L3MISS is enabled, a miss
 * notification is raised instead.
 *
 * The request skb is always consumed and NETDEV_TX_OK is always returned.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        const struct in6_addr *daddr;
        const struct ipv6hdr *iphdr;
        struct inet6_dev *in6_dev;
        struct neighbour *n;
        struct nd_msg *msg;

        rcu_read_lock();
        in6_dev = __in6_dev_get(dev);
        if (!in6_dev)
                goto out;

        iphdr = ipv6_hdr(skb);
        daddr = &iphdr->daddr;
        msg = (struct nd_msg *)(iphdr + 1);

        if (ipv6_addr_loopback(daddr) ||
            ipv6_addr_is_multicast(&msg->target))
                goto out;

        n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

        if (n) {
                struct vxlan_rdst *rdst = NULL;
                struct vxlan_fdb *f;
                struct sk_buff *reply;

                if (!(READ_ONCE(n->nud_state) & NUD_CONNECTED)) {
                        neigh_release(n);
                        goto out;
                }

                /* An FDB entry that points to a nexthop object may have no
                 * remote destination, so the first remote must be checked
                 * for NULL before its address is inspected.
                 */
                f = vxlan_find_mac(vxlan, n->ha, vni);
                if (f)
                        rdst = first_remote_rcu(f);
                if (rdst && vxlan_addr_any(&rdst->remote_ip)) {
                        /* bridge-local neighbor */
                        neigh_release(n);
                        goto out;
                }

                /* The router flag of the reply mirrors NTF_ROUTER on the FDB */
                reply = vxlan_na_create(skb, n,
                                        !!(f ? f->flags & NTF_ROUTER : 0));

                neigh_release(n);

                if (reply == NULL)
                        goto out;

                if (netif_rx(reply) == NET_RX_DROP) {
                        dev_dstats_rx_dropped(dev);
                        vxlan_vnifilter_count(vxlan, vni, NULL,
                                              VXLAN_VNI_STATS_RX_DROPS, 0);
                }
        } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
                union vxlan_addr ipa = {
                        .sin6.sin6_addr = msg->target,
                        .sin6.sin6_family = AF_INET6,
                };

                vxlan_ip_miss(dev, &ipa);
        }

out:
        rcu_read_unlock();
        consume_skb(skb);
        return NETDEV_TX_OK;
}
#endif

static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct neighbour *n;

        if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
                return false;

        n = NULL;
        switch (ntohs(eth_hdr(skb)->h_proto)) {
        case ETH_P_IP:
        {
                struct iphdr *pip;

                if (!pskb_may_pull(skb, sizeof(struct iphdr)))
                        return false;
                pip = ip_hdr(skb);
                n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
                if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
                        union vxlan_addr ipa = {
                                .sin.sin_addr.s_addr = pip->daddr,
                                .sin.sin_family = AF_INET,
                        };

                        vxlan_ip_miss(dev, &ipa);
                        return false;
                }

                break;
        }
#if IS_ENABLED(CONFIG_IPV6)
        case ETH_P_IPV6:
        {
                struct ipv6hdr *pip6;

                if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
                        return false;
                pip6 = ipv6_hdr(skb);
                n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
                if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
                        union vxlan_addr ipa = {
                                .sin6.sin6_addr = pip6->daddr,
                                .sin6.sin6_family = AF_INET6,
                        };

                        vxlan_ip_miss(dev, &ipa);
                        return false;
                }

                break;
        }
#endif
        default:
                return false;
        }

        if (n) {
                bool diff;

                diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
                if (diff) {
                        memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
                                dev->addr_len);
                        memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
                }
                neigh_release(n);
                return diff;
        }

        return false;
}

/* Fill the GPE part of a VXLAN header: mark the next-protocol field as
 * valid and derive its value from the Ethernet protocol @protocol.
 *
 * Returns 0 on success, or -EPFNOSUPPORT when @protocol has no GPE
 * next-protocol mapping.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, __be16 protocol)
{
        struct vxlanhdr_gpe *gpe_hdr = (struct vxlanhdr_gpe *)vxh;

        gpe_hdr->np_applied = 1;
        gpe_hdr->next_protocol = tun_p_from_eth_p(protocol);

        return gpe_hdr->next_protocol ? 0 : -EPFNOSUPPORT;
}

/* Prepend a VXLAN header to @skb and prepare it for UDP tunnel transmission.
 *
 * @skb:       packet to encapsulate
 * @dst:       route the packet will leave through (used to size headroom)
 * @iphdr_len: length of the outer IP header that will be added later
 * @vni:       VNI to place in the header
 * @md:        metadata used when the GBP extension is enabled
 * @vxflags:   VXLAN_F_* configuration flags
 * @udp_sum:   whether the outer UDP checksum is in use
 *
 * Returns 0 on success or a negative errno from the headroom, offload or
 * GPE header helpers.
 */
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
                           int iphdr_len, __be32 vni,
                           struct vxlan_metadata *md, u32 vxflags,
                           bool udp_sum)
{
        __be16 inner_protocol = htons(ETH_P_TEB);
        int gso_type = SKB_GSO_UDP_TUNNEL;
        struct vxlanhdr *vxh;
        int headroom;
        int ret;

        if (udp_sum)
                gso_type = SKB_GSO_UDP_TUNNEL_CSUM;

        /* Remote checksum offload is only usable when the checksum start is
         * small and aligned enough to be encoded, and the checksum field is
         * the UDP or TCP one.
         */
        if ((vxflags & VXLAN_F_REMCSUM_TX) &&
            skb->ip_summed == CHECKSUM_PARTIAL) {
                int csum_start = skb_checksum_start_offset(skb);
                bool start_ok, field_ok;

                start_ok = csum_start <= VXLAN_MAX_REMCSUM_START &&
                           !(csum_start & VXLAN_RCO_SHIFT_MASK);
                field_ok = skb->csum_offset == offsetof(struct udphdr, check) ||
                           skb->csum_offset == offsetof(struct tcphdr, check);

                if (start_ok && field_ok)
                        gso_type |= SKB_GSO_TUNNEL_REMCSUM;
        }

        headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
                        + VXLAN_HLEN + iphdr_len;

        /* Need space for new headers (invalidates iph ptr) */
        ret = skb_cow_head(skb, headroom);
        if (unlikely(ret))
                return ret;

        ret = iptunnel_handle_offloads(skb, gso_type);
        if (ret)
                return ret;

        vxh = __skb_push(skb, sizeof(*vxh));
        vxh->vx_flags = VXLAN_HF_VNI;
        vxh->vx_vni = vxlan_vni_field(vni);

        if (gso_type & SKB_GSO_TUNNEL_REMCSUM) {
                unsigned int rco_start;

                /* The start is relative to the end of the VXLAN header */
                rco_start = skb_checksum_start_offset(skb) -
                            sizeof(struct vxlanhdr);
                vxh->vx_vni |= vxlan_compute_rco(rco_start, skb->csum_offset);
                vxh->vx_flags |= VXLAN_HF_RCO;

                if (!skb_is_gso(skb)) {
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->encapsulation = 0;
                }
        }

        if (vxflags & VXLAN_F_GBP)
                vxlan_build_gbp_hdr(vxh, md);

        if (vxflags & VXLAN_F_GPE) {
                ret = vxlan_build_gpe_hdr(vxh, skb->protocol);
                if (ret < 0)
                        return ret;
                inner_protocol = skb->protocol;
        }

        skb_set_inner_protocol(skb, inner_protocol);
        return 0;
}

/* Bypass encapsulation if the destination is local.
 *
 * The packet is handed straight to @dst_vxlan's device as if it had been
 * received there. TX is accounted on @src_vxlan; RX (or an RX drop) is
 * accounted on @dst_vxlan. When @snoop is set and the destination has
 * learning enabled, the source MAC is learnt against the loopback address of
 * the destination's address family.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
                               struct vxlan_dev *dst_vxlan, __be32 vni,
                               bool snoop)
{
        const union vxlan_addr *remote = &dst_vxlan->default_dst.remote_ip;
        const unsigned int pkt_len = skb->len;
        union vxlan_addr lo_addr;
        struct net_device *rx_dev;

        skb->pkt_type = PACKET_HOST;
        skb->encapsulation = 0;
        skb->dev = dst_vxlan->dev;
        __skb_pull(skb, skb_network_offset(skb));

        if (remote->sa.sa_family == AF_INET) {
                lo_addr.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
                lo_addr.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                lo_addr.sin6.sin6_addr = in6addr_loopback;
                lo_addr.sa.sa_family =  AF_INET6;
#endif
        }

        rcu_read_lock();
        rx_dev = skb->dev;

        /* Destination device is down: free the packet, count an RX drop */
        if (unlikely(!(rx_dev->flags & IFF_UP))) {
                kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY);
                goto rx_drop;
        }

        if (snoop && (dst_vxlan->cfg.flags & VXLAN_F_LEARN))
                vxlan_snoop(rx_dev, &lo_addr, eth_hdr(skb)->h_source, 0, vni);

        dev_dstats_tx_add(src_vxlan->dev, pkt_len);
        vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX,
                              pkt_len);

        if (__netif_rx(skb) != NET_RX_SUCCESS)
                goto rx_drop;

        dev_dstats_rx_add(dst_vxlan->dev, pkt_len);
        vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
                              pkt_len);
        goto unlock;

rx_drop:
        dev_dstats_rx_dropped(rx_dev);
        vxlan_vnifilter_count(dst_vxlan, vni, NULL,
                              VXLAN_VNI_STATS_RX_DROPS, 0);
unlock:
        rcu_read_unlock();
}

/* Deliver @skb locally, without encapsulation, when the route to the remote
 * endpoint is a local unicast one and local bypass is enabled on @vxlan.
 *
 * Return values:
 *   0       - bypass does not apply; the caller continues with normal
 *             encapsulation and still owns @skb and @dst
 *   1       - the packet was handed to the local destination device
 *   -ENOENT - bypass applied but no local vxlan device matches; the packet
 *             was freed and a TX error counted
 *
 * In the two non-zero cases the reference on @dst has been released.
 */
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
                                 struct vxlan_dev *vxlan,
                                 int addr_family,
                                 __be16 dst_port, int dst_ifindex, __be32 vni,
                                 struct dst_entry *dst,
                                 u32 rt_flags)
{
        struct vxlan_dev *local_vxlan;

#if IS_ENABLED(CONFIG_IPV6)
        /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
         * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
         * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
         */
        BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
        /* Bypass encapsulation only if the destination is local unicast
         * and the feature is enabled.
         */
        if (!(rt_flags & RTCF_LOCAL) ||
            (rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) ||
            !(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS))
                return 0;

        dst_release(dst);

        local_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
                                     addr_family, dst_port,
                                     vxlan->cfg.flags);
        if (local_vxlan) {
                vxlan_encap_bypass(skb, vxlan, local_vxlan, vni, true);
                return 1;
        }

        DEV_STATS_INC(dev, tx_errors);
        vxlan_vnifilter_count(vxlan, vni, NULL,
                              VXLAN_VNI_STATS_TX_ERRORS, 0);
        kfree_skb_reason(skb, SKB_DROP_REASON_VXLAN_VNI_NOT_FOUND);

        return -ENOENT;
}

/* Encapsulate one skb and send it towards a single VXLAN remote.
 *
 * @skb:         frame to transmit; on every path it is either handed to a
 *               transmit/bypass helper or freed here
 * @dev:         VXLAN net device the frame is leaving through
 * @default_vni: VNI used when @rdst carries no remote_vni of its own, and
 *               for the route-short-circuit loopback case
 * @rdst:        FDB remote to send to, or NULL to take every encapsulation
 *               parameter from the tunnel info attached to @skb
 * @did_rsc:     true when route short-circuiting was applied by the caller;
 *               a remote with an "any" address is then looped back to this
 *               device instead of being dropped
 *
 * Nothing is returned: failures are accounted in the device and per-VNI
 * counters (tx_dropped for the "drop" exit, tx_errors for "tx_error").
 */
void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
                    __be32 default_vni, struct vxlan_rdst *rdst, bool did_rsc)
{
        struct dst_cache *dst_cache;
        struct ip_tunnel_info *info;
        struct ip_tunnel_key *pkey;
        struct ip_tunnel_key key;
        struct vxlan_dev *vxlan = netdev_priv(dev);
        const struct iphdr *old_iph;
        struct vxlan_metadata _md;
        struct vxlan_metadata *md = &_md;
        unsigned int pkt_len = skb->len;
        __be16 src_port = 0, dst_port;
        struct dst_entry *ndst = NULL;
        int addr_family;
        __u8 tos, ttl;
        int ifindex;
        int err;
        u32 flags = vxlan->cfg.flags;
        bool use_cache;
        bool udp_sum = false;
        bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
        enum skb_drop_reason reason;
        bool no_eth_encap;
        __be32 vni = 0;

        no_eth_encap = flags & VXLAN_F_GPE && skb->protocol != htons(ETH_P_TEB);
        reason = skb_vlan_inet_prepare(skb, no_eth_encap);
        if (reason)
                goto drop;

        reason = SKB_DROP_REASON_NOT_SPECIFIED;
        old_iph = ip_hdr(skb);

        info = skb_tunnel_info(skb);
        use_cache = ip_tunnel_dst_cache_usable(skb, info);

        if (rdst) {
                /* FDB-driven transmit: build the tunnel key from the remote
                 * and the device configuration.
                 */
                memset(&key, 0, sizeof(key));
                pkey = &key;

                if (vxlan_addr_any(&rdst->remote_ip)) {
                        if (did_rsc) {
                                /* short-circuited back to local bridge */
                                vxlan_encap_bypass(skb, vxlan, vxlan,
                                                   default_vni, true);
                                return;
                        }
                        goto drop;
                }

                addr_family = vxlan->cfg.saddr.sa.sa_family;
                dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
                vni = (rdst->remote_vni) ? : default_vni;
                ifindex = rdst->remote_ifindex;

                if (addr_family == AF_INET) {
                        key.u.ipv4.src = vxlan->cfg.saddr.sin.sin_addr.s_addr;
                        key.u.ipv4.dst = rdst->remote_ip.sin.sin_addr.s_addr;
                } else {
                        key.u.ipv6.src = vxlan->cfg.saddr.sin6.sin6_addr;
                        key.u.ipv6.dst = rdst->remote_ip.sin6.sin6_addr;
                }

                dst_cache = &rdst->dst_cache;
                md->gbp = skb->mark;
                if (flags & VXLAN_F_TTL_INHERIT) {
                        ttl = ip_tunnel_get_ttl(old_iph, skb);
                } else {
                        ttl = vxlan->cfg.ttl;
                        if (!ttl && vxlan_addr_multicast(&rdst->remote_ip))
                                ttl = 1;
                }
                tos = vxlan->cfg.tos;
                /* A configured tos of 1 selects the inner packet's DS field. */
                if (tos == 1)
                        tos = ip_tunnel_get_dsfield(old_iph, skb);
                if (tos && !info)
                        use_cache = false;

                if (addr_family == AF_INET)
                        udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
                else
                        udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
#if IS_ENABLED(CONFIG_IPV6)
                switch (vxlan->cfg.label_policy) {
                case VXLAN_LABEL_FIXED:
                        key.label = vxlan->cfg.label;
                        break;
                case VXLAN_LABEL_INHERIT:
                        key.label = ip_tunnel_get_flowlabel(old_iph, skb);
                        break;
                default:
                        DEBUG_NET_WARN_ON_ONCE(1);
                        goto drop;
                }
#endif
        } else {
                /* Metadata-driven transmit: everything comes from the
                 * tunnel info attached to the skb.
                 */
                if (!info) {
                        WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
                                  dev->name);
                        goto drop;
                }
                pkey = &info->key;
                addr_family = ip_tunnel_info_af(info);
                dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
                vni = tunnel_id_to_key32(info->key.tun_id);
                ifindex = 0;
                dst_cache = &info->dst_cache;
                if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
                        if (info->options_len < sizeof(*md))
                                goto drop;
                        md = ip_tunnel_info_opts(info);
                }
                ttl = info->key.ttl;
                tos = info->key.tos;
                udp_sum = test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
        }
        src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
                                     vxlan->cfg.port_max, true);

        /* From here on every exit must go through out_unlock or tx_error,
         * both of which drop the RCU read lock.
         */
        rcu_read_lock();
        if (addr_family == AF_INET) {
                struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
                struct rtable *rt;
                __be16 df = 0;
                __be32 saddr;

                /* The socket pointer may be NULL (no IPv4 socket bound);
                 * fail with -EIO like vxlan_fill_metadata_dst() does rather
                 * than dereferencing it.
                 */
                if (unlikely(!sock4)) {
                        err = -EIO;
                        goto tx_error;
                }

                if (!ifindex)
                        ifindex = sock4->sock->sk->sk_bound_dev_if;

                rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, ifindex,
                                           &saddr, pkey, src_port, dst_port,
                                           tos, use_cache ? dst_cache : NULL);
                if (IS_ERR(rt)) {
                        err = PTR_ERR(rt);
                        reason = SKB_DROP_REASON_IP_OUTNOROUTES;
                        goto tx_error;
                }

                if (!info) {
                        /* Bypass encapsulation if the destination is local */
                        err = encap_bypass_if_local(skb, dev, vxlan, AF_INET,
                                                    dst_port, ifindex, vni,
                                                    &rt->dst, rt->rt_flags);
                        if (err)
                                goto out_unlock;

                        if (vxlan->cfg.df == VXLAN_DF_SET) {
                                df = htons(IP_DF);
                        } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
                                struct ethhdr *eth = eth_hdr(skb);

                                if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
                                    (ntohs(eth->h_proto) == ETH_P_IP &&
                                     old_iph->frag_off & htons(IP_DF)))
                                        df = htons(IP_DF);
                        }
                } else if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT,
                                    info->key.tun_flags)) {
                        df = htons(IP_DF);
                }

                /* ndst is set only now so that tx_error releases the route
                 * exactly when this function still owns it.
                 */
                ndst = &rt->dst;
                err = skb_tunnel_check_pmtu(skb, ndst, vxlan_headroom(flags & VXLAN_F_GPE),
                                            netif_is_any_bridge_port(dev));
                if (err < 0) {
                        goto tx_error;
                } else if (err) {
                        /* A positive return is handled by looping the skb
                         * back into this device with swapped tunnel
                         * addresses instead of transmitting it.
                         */
                        if (info) {
                                struct ip_tunnel_info *unclone;

                                unclone = skb_tunnel_info_unclone(skb);
                                if (unlikely(!unclone))
                                        goto tx_error;

                                unclone->key.u.ipv4.src = pkey->u.ipv4.dst;
                                unclone->key.u.ipv4.dst = saddr;
                        }
                        vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
                        dst_release(ndst);
                        goto out_unlock;
                }

                tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
                ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
                err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
                                      vni, md, flags, udp_sum);
                if (err < 0) {
                        reason = SKB_DROP_REASON_NOMEM;
                        goto tx_error;
                }

                udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr,
                                    pkey->u.ipv4.dst, tos, ttl, df,
                                    src_port, dst_port, xnet, !udp_sum);
#if IS_ENABLED(CONFIG_IPV6)
        } else {
                struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
                struct in6_addr saddr;

                /* Same guard as the IPv4 branch: no IPv6 socket, no
                 * transmit.
                 */
                if (unlikely(!sock6)) {
                        err = -EIO;
                        goto tx_error;
                }

                if (!ifindex)
                        ifindex = sock6->sock->sk->sk_bound_dev_if;

                ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
                                              ifindex, &saddr, pkey,
                                              src_port, dst_port, tos,
                                              use_cache ? dst_cache : NULL);
                if (IS_ERR(ndst)) {
                        err = PTR_ERR(ndst);
                        /* Do not let tx_error release an error pointer. */
                        ndst = NULL;
                        reason = SKB_DROP_REASON_IP_OUTNOROUTES;
                        goto tx_error;
                }

                if (!info) {
                        u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags;

                        err = encap_bypass_if_local(skb, dev, vxlan, AF_INET6,
                                                    dst_port, ifindex, vni,
                                                    ndst, rt6i_flags);
                        if (err)
                                goto out_unlock;
                }

                err = skb_tunnel_check_pmtu(skb, ndst,
                                            vxlan_headroom((flags & VXLAN_F_GPE) | VXLAN_F_IPV6),
                                            netif_is_any_bridge_port(dev));
                if (err < 0) {
                        goto tx_error;
                } else if (err) {
                        if (info) {
                                struct ip_tunnel_info *unclone;

                                unclone = skb_tunnel_info_unclone(skb);
                                if (unlikely(!unclone))
                                        goto tx_error;

                                unclone->key.u.ipv6.src = pkey->u.ipv6.dst;
                                unclone->key.u.ipv6.dst = saddr;
                        }

                        vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
                        dst_release(ndst);
                        goto out_unlock;
                }

                tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
                ttl = ttl ? : ip6_dst_hoplimit(ndst);
                skb_scrub_packet(skb, xnet);
                err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
                                      vni, md, flags, udp_sum);
                if (err < 0) {
                        reason = SKB_DROP_REASON_NOMEM;
                        goto tx_error;
                }

                udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
                                     &saddr, &pkey->u.ipv6.dst, tos, ttl,
                                     pkey->label, src_port, dst_port, !udp_sum);
#endif
        }
        /* pkt_len was sampled before encapsulation changed skb->len. */
        vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
out_unlock:
        rcu_read_unlock();
        return;

        /* Reached only from before rcu_read_lock() was taken. */
drop:
        dev_dstats_tx_dropped(dev);
        vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
        kfree_skb_reason(skb, reason);
        return;

        /* Reached only with the RCU read lock held. */
tx_error:
        rcu_read_unlock();
        if (err == -ELOOP)
                DEV_STATS_INC(dev, collisions);
        else if (err == -ENETUNREACH)
                DEV_STATS_INC(dev, tx_carrier_errors);
        dst_release(ndst);
        DEV_STATS_INC(dev, tx_errors);
        vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
        kfree_skb_reason(skb, reason);
}

/* Transmit @skb via the nexthop object attached to FDB entry @f.
 *
 * A path is picked from the nexthop using the skb's flow hash and copied
 * into a temporary remote, which is then passed to vxlan_xmit_one(). If the
 * entry has no nexthop or no path could be selected, the skb is counted as
 * dropped and freed.
 */
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
                          struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
        struct vxlan_rdst nh_rdst;
        struct nexthop *nh;
        bool do_xmit = false;
        u32 hash;

        memset(&nh_rdst, 0, sizeof(nh_rdst));
        hash = skb_get_hash(skb);

        /* The nexthop is only touched inside this RCU section; the selected
         * path is copied out into nh_rdst.
         */
        rcu_read_lock();
        nh = rcu_dereference(f->nh);
        if (nh)
                do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
        rcu_read_unlock();

        if (likely(do_xmit)) {
                vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
                return;
        }

        dev_dstats_tx_dropped(dev);
        vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
                              VXLAN_VNI_STATS_TX_DROPS, 0);
        dev_kfree_skb(skb);
}

/* Transmit @skb via the nexthop object identified by @nhid.
 *
 * The nexthop must exist in the device's namespace and be a multipath FDB
 * nexthop. The selected path must also use the same address family as the
 * device's configured source address. Any failure counts the skb as dropped
 * and frees it. NETDEV_TX_OK is returned in every case.
 */
static netdev_tx_t vxlan_xmit_nhid(struct sk_buff *skb, struct net_device *dev,
                                   u32 nhid, __be32 vni)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_rdst nh_rdst;
        struct nexthop *nh;
        bool do_xmit = false;
        bool usable;
        u32 hash;

        memset(&nh_rdst, 0, sizeof(nh_rdst));
        hash = skb_get_hash(skb);

        rcu_read_lock();
        nh = nexthop_find_by_id(dev_net(dev), nhid);
        usable = !unlikely(!nh || !nexthop_is_fdb(nh) ||
                           !nexthop_is_multipath(nh));
        if (usable)
                do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
        rcu_read_unlock();

        if (!usable)
                goto drop;

        /* The path's address family has to match our source address. */
        if (vxlan->cfg.saddr.sa.sa_family != nh_rdst.remote_ip.sa.sa_family)
                goto drop;

        if (!likely(do_xmit))
                goto drop;

        vxlan_xmit_one(skb, dev, vni, &nh_rdst, false);
        return NETDEV_TX_OK;

drop:
        dev_dstats_tx_dropped(dev);
        vxlan_vnifilter_count(vxlan, vni, NULL,
                              VXLAN_VNI_STATS_TX_DROPS, 0);
        dev_kfree_skb(skb);
        return NETDEV_TX_OK;
}

/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 *
 * Dispatch order, as implemented below:
 *   1. metadata (collect_metadata) handling, which may transmit directly;
 *   2. ARP / IPv6 neighbour-solicitation proxying when VXLAN_F_PROXY is set;
 *   3. nexthop-id based transmit when the tunnel key carried an nhid;
 *   4. MDB lookup when VXLAN_F_MDB is set;
 *   5. FDB lookup on the destination MAC, with optional route
 *      short-circuit and fallback to the all-zeros entry.
 *
 * Returns NETDEV_TX_OK on every path handled in this function; the proxy
 * and MDB helpers supply their own return value.
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_rdst *rdst, *fdst = NULL;
        const struct ip_tunnel_info *info;
        struct vxlan_fdb *f;
        struct ethhdr *eth;
        __be32 vni = 0;
        u32 nhid = 0;
        bool did_rsc;

        info = skb_tunnel_info(skb);

        skb_reset_mac_header(skb);

        if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
                /* Bridge-mode TX info only supplies the VNI and nexthop id;
                 * the frame still goes through the lookups below.
                 */
                if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
                    info->mode & IP_TUNNEL_INFO_TX) {
                        vni = tunnel_id_to_key32(info->key.tun_id);
                        nhid = info->key.nhid;
                } else {
                        /* Plain TX info fully describes the encapsulation
                         * (rdst == NULL); anything else cannot be sent.
                         */
                        if (info && info->mode & IP_TUNNEL_INFO_TX)
                                vxlan_xmit_one(skb, dev, vni, NULL, false);
                        else
                                kfree_skb_reason(skb, SKB_DROP_REASON_TUNNEL_TXINFO);
                        return NETDEV_TX_OK;
                }
        }

        if (vxlan->cfg.flags & VXLAN_F_PROXY) {
                eth = eth_hdr(skb);
                if (ntohs(eth->h_proto) == ETH_P_ARP)
                        return arp_reduce(dev, skb, vni);
#if IS_ENABLED(CONFIG_IPV6)
                else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
                         pskb_may_pull(skb, sizeof(struct ipv6hdr) +
                                            sizeof(struct nd_msg)) &&
                         ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
                        struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

                        /* Only neighbour solicitations are proxied. */
                        if (m->icmph.icmp6_code == 0 &&
                            m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
                                return neigh_reduce(dev, skb, vni);
                }
#endif
        }

        if (nhid)
                return vxlan_xmit_nhid(skb, dev, nhid, vni);

        if (vxlan->cfg.flags & VXLAN_F_MDB) {
                struct vxlan_mdb_entry *mdb_entry;

                rcu_read_lock();
                mdb_entry = vxlan_mdb_entry_skb_get(vxlan, skb, vni);
                if (mdb_entry) {
                        netdev_tx_t ret;

                        ret = vxlan_mdb_xmit(vxlan, mdb_entry, skb);
                        rcu_read_unlock();
                        return ret;
                }
                rcu_read_unlock();
        }

        /* Re-read the Ethernet header pointer here rather than reusing the
         * one taken in the proxy branch above.
         */
        eth = eth_hdr(skb);
        f = vxlan_find_mac(vxlan, eth->h_dest, vni);
        did_rsc = false;

        if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
            (ntohs(eth->h_proto) == ETH_P_IP ||
             ntohs(eth->h_proto) == ETH_P_IPV6)) {
                did_rsc = route_shortcircuit(dev, skb);
                /* After a short-circuit, look the destination MAC up again. */
                if (did_rsc)
                        f = vxlan_find_mac(vxlan, eth->h_dest, vni);
        }

        if (f == NULL) {
                /* Fall back to the all-zeros MAC entry for this VNI. */
                f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
                if (f == NULL) {
                        if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
                            !is_multicast_ether_addr(eth->h_dest))
                                vxlan_fdb_miss(vxlan, eth->h_dest);

                        dev_dstats_tx_dropped(dev);
                        vxlan_vnifilter_count(vxlan, vni, NULL,
                                              VXLAN_VNI_STATS_TX_DROPS, 0);
                        kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
                        return NETDEV_TX_OK;
                }
        }

        if (rcu_access_pointer(f->nh)) {
                vxlan_xmit_nh(skb, dev, f,
                              (vni ? : vxlan->default_dst.remote_vni), did_rsc);
        } else {
                /* The first remote is remembered and served last with the
                 * original skb; every further remote gets a clone. A failed
                 * clone just skips that remote.
                 */
                list_for_each_entry_rcu(rdst, &f->remotes, list) {
                        struct sk_buff *skb1;

                        if (!fdst) {
                                fdst = rdst;
                                continue;
                        }
                        skb1 = skb_clone(skb, GFP_ATOMIC);
                        if (skb1)
                                vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
                }
                if (fdst)
                        vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
                else
                        kfree_skb_reason(skb, SKB_DROP_REASON_NO_TX_TARGET);
        }

        return NETDEV_TX_OK;
}

/* Ageing timer callback: scan every FDB hash bucket, destroy entries whose
 * last update is older than the configured age interval, and re-arm the
 * timer for the earliest upcoming expiry (at most FDB_AGE_INTERVAL away).
 *
 * Entries in NUD_PERMANENT or NUD_NOARP state, and entries flagged
 * NTF_EXT_LEARNED, are never aged out. Nothing is done, and the timer is not
 * re-armed, when the device is not running.
 */
static void vxlan_cleanup(struct timer_list *t)
{
        struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
        unsigned long next_expiry = jiffies + FDB_AGE_INTERVAL;
        unsigned int bucket;

        if (!netif_running(vxlan->dev))
                return;

        for (bucket = 0; bucket < FDB_HASH_SIZE; ++bucket) {
                struct hlist_node *pos, *tmp;

                spin_lock(&vxlan->hash_lock[bucket]);
                hlist_for_each_safe(pos, tmp, &vxlan->fdb_head[bucket]) {
                        struct vxlan_fdb *f;
                        unsigned long expires;

                        f = container_of(pos, struct vxlan_fdb, hlist);

                        /* Static and externally learned entries do not age. */
                        if ((f->state & (NUD_PERMANENT | NUD_NOARP)) ||
                            (f->flags & NTF_EXT_LEARNED))
                                continue;

                        expires = READ_ONCE(f->updated) +
                                  vxlan->cfg.age_interval * HZ;
                        if (!time_before_eq(expires, jiffies)) {
                                /* Still fresh: maybe pull the timer in. */
                                if (time_before(expires, next_expiry))
                                        next_expiry = expires;
                                continue;
                        }

                        netdev_dbg(vxlan->dev,
                                   "garbage collect %pM\n",
                                   f->eth_addr);
                        f->state = NUD_STALE;
                        vxlan_fdb_destroy(vxlan, f, true, true);
                }
                spin_unlock(&vxlan->hash_lock[bucket]);
        }

        mod_timer(&vxlan->age_timer, next_expiry);
}

/* Unlink @vxlan from the per-socket VNI hash lists (IPv4, and IPv6 when
 * built in), holding the per-namespace sock_lock.
 */
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
        struct vxlan_net *vn;

        vn = net_generic(vxlan->net, vxlan_net_id);

        spin_lock(&vn->sock_lock);

        hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
        hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif

        spin_unlock(&vn->sock_lock);
}

/* Link @vxlan into socket @vs through @node, hashed by the device's default
 * remote VNI. The list insertion is done under the per-namespace sock_lock.
 */
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
                             struct vxlan_dev_node *node)
{
        __be32 vni = vxlan->default_dst.remote_vni;
        struct vxlan_net *vn;

        vn = net_generic(vxlan->net, vxlan_net_id);

        /* Set the back-pointer before the node becomes visible on the list. */
        node->vxlan = vxlan;

        spin_lock(&vn->sock_lock);
        hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
        spin_unlock(&vn->sock_lock);
}

/* Device init callback: set up the VNI group (only with VXLAN_F_VNIFILTER),
 * the GRO cells and the MDB, in that order.
 *
 * Returns 0 on success. On failure, everything set up so far is torn down
 * again and the error from the failing step is returned.
 */
static int vxlan_init(struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        int err;

        if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
                err = vxlan_vnigroup_init(vxlan);
                if (err)
                        return err;
        }

        err = gro_cells_init(&vxlan->gro_cells, dev);
        if (err)
                goto undo_vnigroup;

        err = vxlan_mdb_init(vxlan);
        if (err) {
                gro_cells_destroy(&vxlan->gro_cells);
                goto undo_vnigroup;
        }

        netdev_lockdep_set_classes(dev);
        return 0;

undo_vnigroup:
        if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
                vxlan_vnigroup_uninit(vxlan);
        return err;
}

/* Remove the all-zeros-MAC FDB entry for @vni, if one exists. The lookup
 * and the removal both happen under the matching hash bucket lock.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
        u32 bucket = fdb_head_index(vxlan, all_zeros_mac, vni);
        struct vxlan_fdb *entry;

        spin_lock_bh(&vxlan->hash_lock[bucket]);

        entry = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
        if (entry)
                vxlan_fdb_destroy(vxlan, entry, true, true);

        spin_unlock_bh(&vxlan->hash_lock[bucket]);
}

static void vxlan_uninit(struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);

        vxlan_mdb_fini(vxlan);

        if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
                vxlan_vnigroup_uninit(vxlan);

        gro_cells_destroy(&vxlan->gro_cells);

        vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
}

/* Bring the device up: add the sockets, join the multicast group and, when
 * an age interval is configured, start the FDB ageing timer.
 *
 * Returns 0 on success or the error from the failing step. If joining the
 * multicast group fails, the sockets are released again.
 */
static int vxlan_open(struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        int err;

        err = vxlan_sock_add(vxlan);
        if (err < 0)
                return err;

        err = vxlan_multicast_join(vxlan);
        if (err) {
                vxlan_sock_release(vxlan);
                return err;
        }

        if (vxlan->cfg.age_interval)
                mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

        return 0;
}

/* Filter describing which FDB entries and remotes a flush should remove.
 * A zero mask, id, port or address family means "do not filter on this".
 */
struct vxlan_fdb_flush_desc {
        /* Skip the all-zeros-MAC entry of the device's configured VNI. */
        bool                    ignore_default_entry;
        /* Entry matches when (f->state & state_mask) == state. */
        unsigned long           state;
        unsigned long           state_mask;
        /* Entry matches when (f->flags & flags_mask) == flags. */
        unsigned long           flags;
        unsigned long           flags_mask;
        /* Compared against the entry's own VNI. */
        __be32                  src_vni;
        /* Compared against the id of the entry's nexthop object. */
        u32                     nhid;
        /* Per-remote filters: remote VNI, remote port, remote address. */
        __be32                  vni;
        __be16                  port;
        union vxlan_addr        dst_ip;
};

static bool vxlan_fdb_is_default_entry(const struct vxlan_fdb *f,
                                       const struct vxlan_dev *vxlan)
{
        return is_zero_ether_addr(f->eth_addr) && f->vni == vxlan->cfg.vni;
}

/* True when @f has a nexthop object whose id equals @nhid. The nexthop
 * pointer is read with rtnl_dereference().
 */
static bool vxlan_fdb_nhid_matches(const struct vxlan_fdb *f, u32 nhid)
{
        struct nexthop *nh = rtnl_dereference(f->nh);

        if (!nh)
                return false;

        return nh->id == nhid;
}

/* Decide whether FDB entry @f passes all entry-level filters in @desc.
 *
 * Each filter is applied only when it is set (non-zero mask, VNI or nexthop
 * id, or ignore_default_entry being true). The entry matches when none of
 * the set filters rejects it.
 */
static bool vxlan_fdb_flush_matches(const struct vxlan_fdb *f,
                                    const struct vxlan_dev *vxlan,
                                    const struct vxlan_fdb_flush_desc *desc)
{
        return !(desc->state_mask &&
                 (f->state & desc->state_mask) != desc->state) &&
               !(desc->flags_mask &&
                 (f->flags & desc->flags_mask) != desc->flags) &&
               !(desc->ignore_default_entry &&
                 vxlan_fdb_is_default_entry(f, vxlan)) &&
               !(desc->src_vni && f->vni != desc->src_vni) &&
               !(desc->nhid && !vxlan_fdb_nhid_matches(f, desc->nhid));
}

static bool
vxlan_fdb_flush_should_match_remotes(const struct vxlan_fdb_flush_desc *desc)
{
        return desc->vni || desc->port || desc->dst_ip.sa.sa_family;
}

/* Decide whether remote @rd passes all per-remote filters set in @desc
 * (remote VNI, remote port, remote IP). Unset filters are ignored.
 */
static bool
vxlan_fdb_flush_remote_matches(const struct vxlan_fdb_flush_desc *desc,
                               const struct vxlan_rdst *rd)
{
        return !(desc->vni && rd->remote_vni != desc->vni) &&
               !(desc->port && rd->remote_port != desc->port) &&
               !(desc->dst_ip.sa.sa_family &&
                 !vxlan_addr_equal(&rd->remote_ip, &desc->dst_ip));
}

/* Destroy every remote of @f that matches the per-remote filters in @desc.
 *
 * *@p_destroy_fdb is set to true only when at least one remote was removed
 * and the entry's remote list is empty afterwards; otherwise it is set to
 * false.
 */
static void
vxlan_fdb_flush_match_remotes(struct vxlan_fdb *f, struct vxlan_dev *vxlan,
                              const struct vxlan_fdb_flush_desc *desc,
                              bool *p_destroy_fdb)
{
        struct vxlan_rdst *rd, *next;
        bool any_removed = false;

        list_for_each_entry_safe(rd, next, &f->remotes, list) {
                if (vxlan_fdb_flush_remote_matches(desc, rd)) {
                        vxlan_fdb_dst_destroy(vxlan, f, rd, true);
                        any_removed = true;
                }
        }

        *p_destroy_fdb = any_removed && list_empty(&f->remotes);
}

/* Purge forwarding-table entries selected by @desc.
 *
 * Every hash bucket is walked under its lock. An entry that passes the
 * entry-level filters is destroyed outright, unless @desc also carries
 * per-remote filters. In that case only the matching remotes are removed,
 * and the entry itself goes away only once its last remote was flushed.
 */
static void vxlan_flush(struct vxlan_dev *vxlan,
                        const struct vxlan_fdb_flush_desc *desc)
{
        const bool per_remote = vxlan_fdb_flush_should_match_remotes(desc);
        unsigned int bucket;

        for (bucket = 0; bucket < FDB_HASH_SIZE; ++bucket) {
                struct hlist_node *pos, *tmp;

                spin_lock_bh(&vxlan->hash_lock[bucket]);
                hlist_for_each_safe(pos, tmp, &vxlan->fdb_head[bucket]) {
                        struct vxlan_fdb *f;
                        bool destroy_fdb = false;

                        f = container_of(pos, struct vxlan_fdb, hlist);
                        if (!vxlan_fdb_flush_matches(f, vxlan, desc))
                                continue;

                        if (per_remote) {
                                vxlan_fdb_flush_match_remotes(f, vxlan, desc,
                                                              &destroy_fdb);
                                /* Entry still has remotes, or none matched. */
                                if (!destroy_fdb)
                                        continue;
                        }

                        vxlan_fdb_destroy(vxlan, f, true, true);
                }
                spin_unlock_bh(&vxlan->hash_lock[bucket]);
        }
}

/* Netlink attribute policy used when parsing a bulk FDB delete request. */
static const struct nla_policy vxlan_del_bulk_policy[NDA_MAX + 1] = {
        [NDA_SRC_VNI]           = { .type = NLA_U32 },
        [NDA_NH_ID]             = { .type = NLA_U32 },
        [NDA_VNI]               = { .type = NLA_U32 },
        [NDA_PORT]              = { .type = NLA_U16 },
        /* Binary payload sized between an IPv4 and an IPv6 address. */
        [NDA_DST]               = NLA_POLICY_RANGE(NLA_BINARY,
                                                   sizeof(struct in_addr),
                                                   sizeof(struct in6_addr)),
        [NDA_NDM_STATE_MASK]    = { .type = NLA_U16 },
        [NDA_NDM_FLAGS_MASK]    = { .type = NLA_U8 },
};

/* Bit sets used by vxlan_fdb_delete_bulk() to validate a flush request:
 * - IGNORED_NDM_FLAGS are masked out of ndm_flags before any check;
 * - ALLOWED_NDM_STATES / ALLOWED_NDM_FLAGS are the only state / flag bits a
 *   request may carry; any other bit is rejected with -EINVAL.
 */
#define VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
#define VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
#define VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_EXT_LEARNED | NTF_OFFLOADED | \
                                           NTF_ROUTER)

/* Handle a bulk FDB delete (flush) request for @dev.
 *
 * The netlink message is parsed against vxlan_del_bulk_policy, the ndm
 * flags and state are validated, a flush descriptor is built from the
 * header and the optional attributes, and the FDB is flushed with it.
 *
 * Returns 0 on success, the parse error from nlmsg_parse(), -EINVAL when
 * unsupported ndm flag or state bits are set, or the error from
 * vxlan_nla_get_addr() for a bad NDA_DST attribute.
 */
static int vxlan_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
                                 struct netlink_ext_ack *extack)
{
        struct vxlan_fdb_flush_desc desc = {};
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct ndmsg *ndm = nlmsg_data(nlh);
        struct nlattr *tb[NDA_MAX + 1];
        u8 ndm_flags;
        int err;

        err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, vxlan_del_bulk_policy,
                          extack);
        if (err)
                return err;

        /* Flags in the ignored set never take part in validation/matching. */
        ndm_flags = ndm->ndm_flags & ~VXLAN_FDB_FLUSH_IGNORED_NDM_FLAGS;
        if (ndm_flags & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_FLAGS) {
                NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
                return -EINVAL;
        }
        if (ndm->ndm_state & ~VXLAN_FDB_FLUSH_ALLOWED_NDM_STATES) {
                NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set");
                return -EINVAL;
        }

        /* Entry-level filters. */
        desc.flags = ndm_flags;
        desc.state = ndm->ndm_state;
        if (tb[NDA_NDM_STATE_MASK])
                desc.state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]);
        if (tb[NDA_NDM_FLAGS_MASK])
                desc.flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]);
        if (tb[NDA_SRC_VNI])
                desc.src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
        if (tb[NDA_NH_ID])
                desc.nhid = nla_get_u32(tb[NDA_NH_ID]);

        /* Per-remote filters. */
        if (tb[NDA_VNI])
                desc.vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
        if (tb[NDA_PORT])
                desc.port = nla_get_be16(tb[NDA_PORT]);
        if (tb[NDA_DST]) {
                err = vxlan_nla_get_addr(&desc.dst_ip, tb[NDA_DST]);
                if (err) {
                        NL_SET_ERR_MSG_ATTR(extack, tb[NDA_DST],
                                            "Unsupported address family");
                        return err;
                }
        }

        vxlan_flush(vxlan, &desc);

        return 0;
}

/* Take the device down: leave the multicast group, stop the ageing timer,
 * flush the FDB entries that have neither NUD_PERMANENT nor NUD_NOARP set,
 * and release the sockets. Always returns 0.
 */
static int vxlan_stop(struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb_flush_desc desc = {
                /* Match entries with both of these state bits clear. */
                .state_mask = NUD_PERMANENT | NUD_NOARP,
                .state = 0,
                /* Default entry is deleted at vxlan_uninit. */
                .ignore_default_entry = true,
        };

        vxlan_multicast_leave(vxlan);
        timer_delete_sync(&vxlan->age_timer);
        vxlan_flush(vxlan, &desc);
        vxlan_sock_release(vxlan);

        return 0;
}

/* Stub, nothing needs to be done: the body is intentionally empty, so no
 * state changes when this is called.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}

/* Change the device MTU.
 *
 * When the default remote's ifindex resolves to a lower device, @new_mtu
 * may not exceed that device's MTU minus the VXLAN headroom. Without a lower
 * device no limit is applied here.
 *
 * Returns 0 on success or -EINVAL when @new_mtu is too large.
 */
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct net_device *lowerdev;

        lowerdev = __dev_get_by_index(vxlan->net,
                                      vxlan->default_dst.remote_ifindex);

        /* This check is different than dev->max_mtu, because it looks at
         * the lowerdev->mtu, rather than the static dev->max_mtu
         */
        if (lowerdev) {
                /* Keep the limit in an int so the comparison stays signed. */
                int max_mtu = lowerdev->mtu - vxlan_headroom(vxlan->cfg.flags);

                if (new_mtu > max_mtu)
                        return -EINVAL;
        }

        WRITE_ONCE(dev->mtu, new_mtu);
        return 0;
}

/* ndo_fill_metadata_dst handler.
 *
 * Picks the UDP source/destination ports for @skb's tunnel info, performs a
 * route lookup for the tunnel's address family and, on success, stores the
 * chosen ports in info->key.tp_src / info->key.tp_dst.
 *
 * Returns 0 on success, -EIO if the socket for the address family is not
 * set, the lookup's error code if the route lookup fails, or -EPFNOSUPPORT
 * for a non-IPv4 tunnel when CONFIG_IPV6 is disabled.
 */
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct ip_tunnel_info *info = skb_tunnel_info(skb);
        __be16 sport, dport;

        sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
                                  vxlan->cfg.port_max, true);
        /* A destination port in the tunnel key overrides the device's. */
        dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

        if (ip_tunnel_info_af(info) == AF_INET) {
                struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
                struct rtable *rt;

                if (!sock4)
                        return -EIO;

                rt = udp_tunnel_dst_lookup(skb, dev, vxlan->net, 0,
                                           &info->key.u.ipv4.src,
                                           &info->key,
                                           sport, dport, info->key.tos,
                                           &info->dst_cache);
                if (IS_ERR(rt))
                        return PTR_ERR(rt);
                /* Only the lookup result matters here; drop the route. */
                ip_rt_put(rt);
        } else {
#if IS_ENABLED(CONFIG_IPV6)
                struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
                struct dst_entry *ndst;

                if (!sock6)
                        return -EIO;

                ndst = udp_tunnel6_dst_lookup(skb, dev, vxlan->net, sock6->sock,
                                              0, &info->key.u.ipv6.src,
                                              &info->key,
                                              sport, dport, info->key.tos,
                                              &info->dst_cache);
                if (IS_ERR(ndst))
                        return PTR_ERR(ndst);
                /* Only the lookup result matters here; drop the dst. */
                dst_release(ndst);
#else /* !CONFIG_IPV6 */
                return -EPFNOSUPPORT;
#endif
        }
        info->key.tp_src = sport;
        info->key.tp_dst = dport;
        return 0;
}

static const struct net_device_ops vxlan_netdev_ether_ops = {
        .ndo_init                = vxlan_init,
        .ndo_uninit                = vxlan_uninit,
        .ndo_open                = vxlan_open,
        .ndo_stop                = vxlan_stop,
        .ndo_start_xmit                = vxlan_xmit,
        .ndo_set_rx_mode        = vxlan_set_multicast_list,
        .ndo_change_mtu                = vxlan_change_mtu,
        .ndo_validate_addr        = eth_validate_addr,
        .ndo_set_mac_address        = eth_mac_addr,
        .ndo_fdb_add                = vxlan_fdb_add,
        .ndo_fdb_del                = vxlan_fdb_delete,
        .ndo_fdb_del_bulk        = vxlan_fdb_delete_bulk,
        .ndo_fdb_dump                = vxlan_fdb_dump,
        .ndo_fdb_get                = vxlan_fdb_get,
        .ndo_mdb_add                = vxlan_mdb_add,
        .ndo_mdb_del                = vxlan_mdb_del,
        .ndo_mdb_del_bulk        = vxlan_mdb_del_bulk,
        .ndo_mdb_dump                = vxlan_mdb_dump,
        .ndo_mdb_get                = vxlan_mdb_get,
        .ndo_fill_metadata_dst        = vxlan_fill_metadata_dst,
};

/* net_device_ops for raw (non-Ethernet) VXLAN devices; installed by
 * vxlan_raw_setup(). Unlike the Ethernet table it has no address, RX-mode,
 * FDB or MDB callbacks.
 */
static const struct net_device_ops vxlan_netdev_raw_ops = {
        /* device lifecycle */
        .ndo_init               = vxlan_init,
        .ndo_uninit             = vxlan_uninit,
        .ndo_open               = vxlan_open,
        .ndo_stop               = vxlan_stop,

        /* datapath */
        .ndo_start_xmit         = vxlan_xmit,
        .ndo_fill_metadata_dst  = vxlan_fill_metadata_dst,

        /* MTU */
        .ndo_change_mtu         = vxlan_change_mtu,
};

/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to each VXLAN net_device through SET_NETDEV_DEVTYPE() in
 * vxlan_setup().
 */
static const struct device_type vxlan_type = {
        .name = "vxlan",
};

/* Walk every VXLAN socket registered in @dev's network namespace and either
 * push (@push == true) or drop (@push == false) its listening UDP port on
 * @dev via udp_tunnel_push_rx_port() / udp_tunnel_drop_rx_port().
 * GPE sockets are reported as UDP_TUNNEL_TYPE_VXLAN_GPE, all others as
 * UDP_TUNNEL_TYPE_VXLAN. The socket table is walked under vn->sock_lock.
 */
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
{
        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
        struct vxlan_sock *vs;
        unsigned int bucket;

        spin_lock(&vn->sock_lock);
        for (bucket = 0; bucket < PORT_HASH_SIZE; bucket++) {
                hlist_for_each_entry_rcu(vs, &vn->sock_list[bucket], hlist) {
                        unsigned short type = (vs->flags & VXLAN_F_GPE) ?
                                              UDP_TUNNEL_TYPE_VXLAN_GPE :
                                              UDP_TUNNEL_TYPE_VXLAN;

                        if (!push)
                                udp_tunnel_drop_rx_port(dev, vs->sock, type);
                        else
                                udp_tunnel_push_rx_port(dev, vs->sock, type);
                }
        }
        spin_unlock(&vn->sock_lock);
}

/* Initialize the device structure: random MAC and Ethernet defaults,
 * offload feature flags, MTU bounds, per-CPU stats type, the ageing timer
 * and the per-bucket FDB locks and list heads.
 */
static void vxlan_setup(struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        unsigned int bucket;

        eth_hw_addr_random(dev);
        ether_setup(dev);

        dev->needs_free_netdev = true;
        SET_NETDEV_DEVTYPE(dev, &vxlan_type);

        dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST |
                         NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE;
        dev->vlan_features = dev->features;

        dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST |
                            NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE;

        netif_keep_dst(dev);
        dev->priv_flags |= IFF_NO_QUEUE;
        dev->change_proto_down = true;
        dev->lltx = true;

        /* MTU range: 68 - 65535 */
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = ETH_MAX_MTU;

        dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;

        /* Private state: back-pointer, list linkage and ageing timer. */
        vxlan->dev = dev;
        INIT_LIST_HEAD(&vxlan->next);
        timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);

        for (bucket = 0; bucket < FDB_HASH_SIZE; bucket++) {
                spin_lock_init(&vxlan->hash_lock[bucket]);
                INIT_HLIST_HEAD(&vxlan->fdb_head[bucket]);
        }
}

/* Finish setup of an Ethernet-framed VXLAN device: install the Ethernet ops
 * table, clear IFF_TX_SKB_SHARING and allow live MAC address changes.
 */
static void vxlan_ether_setup(struct net_device *dev)
{
        dev->netdev_ops = &vxlan_netdev_ether_ops;

        dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
        dev->priv_flags &= ~IFF_TX_SKB_SHARING;
}

/* Finish setup of a raw VXLAN device: no link-layer header or address
 * (ARPHRD_NONE), point-to-point/NOARP/multicast flags, raw ops table.
 */
static void vxlan_raw_setup(struct net_device *dev)
{
        dev->netdev_ops = &vxlan_netdev_raw_ops;

        /* No L2 header on this device type. */
        dev->type = ARPHRD_NONE;
        dev->header_ops = NULL;
        dev->hard_header_len = 0;
        dev->addr_len = 0;

        dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
}

/* Netlink attribute policy for the IFLA_VXLAN_* attributes, indexed by
 * attribute type. Entries with only .len give a length for binary payloads
 * (addresses, port range); NLA_FLAG entries carry no payload.
 * vxlan_nl2flag() consults the .type field of this table to tell flag
 * attributes from u8 booleans.
 */
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
        /* NOTE(review): per the netlink policy convention, attributes from
         * strict_start_type onwards are validated strictly - confirm against
         * include/net/netlink.h.
         */
        [IFLA_VXLAN_UNSPEC]     = { .strict_start_type = IFLA_VXLAN_LOCALBYPASS },
        [IFLA_VXLAN_ID]                = { .type = NLA_U32 },
        [IFLA_VXLAN_GROUP]        = { .len = sizeof_field(struct iphdr, daddr) },
        [IFLA_VXLAN_GROUP6]        = { .len = sizeof(struct in6_addr) },
        [IFLA_VXLAN_LINK]        = { .type = NLA_U32 },
        [IFLA_VXLAN_LOCAL]        = { .len = sizeof_field(struct iphdr, saddr) },
        [IFLA_VXLAN_LOCAL6]        = { .len = sizeof(struct in6_addr) },
        [IFLA_VXLAN_TOS]        = { .type = NLA_U8 },
        [IFLA_VXLAN_TTL]        = { .type = NLA_U8 },
        [IFLA_VXLAN_LABEL]        = { .type = NLA_U32 },
        [IFLA_VXLAN_LEARNING]        = { .type = NLA_U8 },
        [IFLA_VXLAN_AGEING]        = { .type = NLA_U32 },
        [IFLA_VXLAN_LIMIT]        = { .type = NLA_U32 },
        [IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
        [IFLA_VXLAN_PROXY]        = { .type = NLA_U8 },
        [IFLA_VXLAN_RSC]        = { .type = NLA_U8 },
        [IFLA_VXLAN_L2MISS]        = { .type = NLA_U8 },
        [IFLA_VXLAN_L3MISS]        = { .type = NLA_U8 },
        [IFLA_VXLAN_COLLECT_METADATA]        = { .type = NLA_U8 },
        [IFLA_VXLAN_PORT]        = { .type = NLA_U16 },
        [IFLA_VXLAN_UDP_CSUM]        = { .type = NLA_U8 },
        [IFLA_VXLAN_UDP_ZERO_CSUM6_TX]        = { .type = NLA_U8 },
        [IFLA_VXLAN_UDP_ZERO_CSUM6_RX]        = { .type = NLA_U8 },
        [IFLA_VXLAN_REMCSUM_TX]        = { .type = NLA_U8 },
        [IFLA_VXLAN_REMCSUM_RX]        = { .type = NLA_U8 },
        [IFLA_VXLAN_GBP]        = { .type = NLA_FLAG, },
        [IFLA_VXLAN_GPE]        = { .type = NLA_FLAG, },
        [IFLA_VXLAN_REMCSUM_NOPARTIAL]        = { .type = NLA_FLAG },
        [IFLA_VXLAN_TTL_INHERIT]        = { .type = NLA_FLAG },
        [IFLA_VXLAN_DF]                = { .type = NLA_U8 },
        [IFLA_VXLAN_VNIFILTER]        = { .type = NLA_U8 },
        [IFLA_VXLAN_LOCALBYPASS]        = NLA_POLICY_MAX(NLA_U8, 1),
        [IFLA_VXLAN_LABEL_POLICY]       = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX),
        [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)),
};

/* Sanity-check link attributes (@tb) and VXLAN-specific attributes (@data).
 *
 * Checks, in order: IFLA_ADDRESS is a valid Ethernet address, IFLA_MTU is
 * within [ETH_MIN_MTU, ETH_MAX_MTU], @data is present, the VNI is below
 * VXLAN_N_VID, the source port range is ordered and the DF value is known.
 *
 * Returns 0 when everything is acceptable; otherwise -EINVAL,
 * -EADDRNOTAVAIL or -ERANGE with an extack message pointing at the
 * offending attribute.
 */
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
                          struct netlink_ext_ack *extack)
{
        struct nlattr *attr;

        attr = tb[IFLA_ADDRESS];
        if (attr) {
                if (nla_len(attr) != ETH_ALEN) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Provided link layer address is not Ethernet");
                        return -EINVAL;
                }

                if (!is_valid_ether_addr(nla_data(attr))) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Provided Ethernet address is not unicast");
                        return -EADDRNOTAVAIL;
                }
        }

        attr = tb[IFLA_MTU];
        if (attr) {
                u32 mtu = nla_get_u32(attr);

                if (mtu > ETH_MAX_MTU || mtu < ETH_MIN_MTU) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "MTU must be between 68 and 65535");
                        return -EINVAL;
                }
        }

        /* Everything below needs the VXLAN attribute table. */
        if (!data) {
                NL_SET_ERR_MSG(extack,
                               "Required attributes not provided to perform the operation");
                return -EINVAL;
        }

        attr = data[IFLA_VXLAN_ID];
        if (attr) {
                u32 id = nla_get_u32(attr);

                if (id >= VXLAN_N_VID) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "VXLAN ID must be lower than 16777216");
                        return -ERANGE;
                }
        }

        attr = data[IFLA_VXLAN_PORT_RANGE];
        if (attr) {
                const struct ifla_vxlan_port_range *range = nla_data(attr);

                if (ntohs(range->high) < ntohs(range->low)) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Invalid source port range");
                        return -EINVAL;
                }
        }

        attr = data[IFLA_VXLAN_DF];
        if (attr) {
                enum ifla_vxlan_df df = nla_get_u8(attr);

                if (df > VXLAN_DF_MAX || df < 0) {
                        NL_SET_ERR_MSG_ATTR(extack, attr,
                                            "Invalid DF attribute");
                        return -EINVAL;
                }
        }

        return 0;
}

/* ethtool get_drvinfo: report the driver name ("vxlan") and VXLAN_VERSION. */
static void vxlan_get_drvinfo(struct net_device *netdev,
                              struct ethtool_drvinfo *drvinfo)
{
        strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
        strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
}

/* ethtool get_link_ksettings: forward the query to the lower device the
 * default destination is bound to. Without a lower device, report unknown
 * speed/duplex and PORT_OTHER and return 0.
 */
static int vxlan_get_link_ksettings(struct net_device *dev,
                                    struct ethtool_link_ksettings *cmd)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct net_device *lowerdev;

        lowerdev = __dev_get_by_index(vxlan->net,
                                      vxlan->default_dst.remote_ifindex);
        if (lowerdev)
                return __ethtool_get_link_ksettings(lowerdev, cmd);

        cmd->base.speed = SPEED_UNKNOWN;
        cmd->base.duplex = DUPLEX_UNKNOWN;
        cmd->base.port = PORT_OTHER;

        return 0;
}

/* ethtool operations of VXLAN devices; assigned to dev->ethtool_ops in
 * __vxlan_dev_create().
 */
static const struct ethtool_ops vxlan_ethtool_ops = {
        .get_link               = ethtool_op_get_link,
        .get_link_ksettings     = vxlan_get_link_ksettings,
        .get_drvinfo            = vxlan_get_drvinfo,
};

/* Open the kernel UDP socket backing a VXLAN listener.
 *
 * @net:     namespace to create the socket in
 * @ipv6:    create an AF_INET6 (v6-only) socket instead of AF_INET
 * @port:    local UDP port
 * @flags:   VXLAN_F_* flags; only VXLAN_F_UDP_ZERO_CSUM6_RX is consulted
 * @ifindex: interface index to bind the socket to
 *
 * Returns the socket with UDP GSO allowed, or an ERR_PTR() carrying the
 * udp_sock_create() error.
 */
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
                                        __be16 port, u32 flags, int ifindex)
{
        struct udp_port_cfg cfg;
        struct socket *sock;
        int rc;

        memset(&cfg, 0, sizeof(cfg));

        cfg.local_udp_port = port;
        cfg.bind_ifindex = ifindex;
        cfg.family = ipv6 ? AF_INET6 : AF_INET;

        if (ipv6) {
                cfg.ipv6_v6only = 1;
                cfg.use_udp6_rx_checksums =
                    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
        }

        /* Open UDP socket */
        rc = udp_sock_create(net, &cfg, &sock);
        if (rc < 0)
                return ERR_PTR(rc);

        udp_allow_gso(sock->sk);
        return sock;
}

/* Create a new listen socket and its vxlan_sock wrapper.
 *
 * Allocates a vxlan_sock, opens the UDP socket via vxlan_create_sock(),
 * adds the wrapper to the per-namespace socket hash under vn->sock_lock,
 * notifies the RX port addition, and configures the socket as a UDP tunnel
 * encapsulation socket with the VXLAN (or VXLAN-GPE) receive/GRO callbacks.
 *
 * Returns the new vxlan_sock with refcnt 1, ERR_PTR(-ENOMEM) if allocation
 * fails, or the error from vxlan_create_sock().
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
                                              __be16 port, u32 flags,
                                              int ifindex)
{
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
        struct vxlan_sock *vs;
        struct socket *sock;
        unsigned int h;
        struct udp_tunnel_sock_cfg tunnel_cfg;

        vs = kzalloc(sizeof(*vs), GFP_KERNEL);
        if (!vs)
                return ERR_PTR(-ENOMEM);

        for (h = 0; h < VNI_HASH_SIZE; ++h)
                INIT_HLIST_HEAD(&vs->vni_list[h]);

        sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
        if (IS_ERR(sock)) {
                /* Nothing else references vs yet; plain free is enough. */
                kfree(vs);
                return ERR_CAST(sock);
        }

        vs->sock = sock;
        refcount_set(&vs->refcnt, 1);
        /* Keep only the flags covered by VXLAN_F_RCV_FLAGS. */
        vs->flags = (flags & VXLAN_F_RCV_FLAGS);

        spin_lock(&vn->sock_lock);
        hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
        udp_tunnel_notify_add_rx_port(sock,
                                      (vs->flags & VXLAN_F_GPE) ?
                                      UDP_TUNNEL_TYPE_VXLAN_GPE :
                                      UDP_TUNNEL_TYPE_VXLAN);
        spin_unlock(&vn->sock_lock);

        /* Mark socket as an encapsulation socket. */
        memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
        tunnel_cfg.sk_user_data = vs;
        tunnel_cfg.encap_type = 1;
        tunnel_cfg.encap_rcv = vxlan_rcv;
        tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
        tunnel_cfg.encap_destroy = NULL;
        /* GPE sockets use their own GRO handlers. */
        if (vs->flags & VXLAN_F_GPE) {
                tunnel_cfg.gro_receive = vxlan_gpe_gro_receive;
                tunnel_cfg.gro_complete = vxlan_gpe_gro_complete;
        } else {
                tunnel_cfg.gro_receive = vxlan_gro_receive;
                tunnel_cfg.gro_complete = vxlan_gro_complete;
        }

        setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

        return vs;
}

/* Attach @vxlan to a listen socket of one address family.
 *
 * Unless cfg.no_share is set, an existing matching socket is looked up and
 * referenced; otherwise (or if none is found) a new one is created. The
 * socket is then published in vxlan->vn6_sock or vxlan->vn4_sock and the
 * device is registered with it.
 *
 * Returns 0 on success, -EBUSY if a matching socket was found but its
 * refcount had already dropped to zero, or the vxlan_socket_create() error.
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
        struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
        bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
        struct vxlan_sock *vs = NULL;
        struct vxlan_dev_node *node;
        int l3mdev_index = 0;

        /* Resolve the L3 master of the configured lower device, if any. */
        if (vxlan->cfg.remote_ifindex)
                l3mdev_index = l3mdev_master_upper_ifindex_by_index(
                        vxlan->net, vxlan->cfg.remote_ifindex);

        if (!vxlan->cfg.no_share) {
                spin_lock(&vn->sock_lock);
                vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
                                     vxlan->cfg.dst_port, vxlan->cfg.flags,
                                     l3mdev_index);
                /* Found, but the reference could not be taken. */
                if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
                        spin_unlock(&vn->sock_lock);
                        return -EBUSY;
                }
                spin_unlock(&vn->sock_lock);
        }
        if (!vs)
                vs = vxlan_socket_create(vxlan->net, ipv6,
                                         vxlan->cfg.dst_port, vxlan->cfg.flags,
                                         l3mdev_index);
        if (IS_ERR(vs))
                return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
        if (ipv6) {
                rcu_assign_pointer(vxlan->vn6_sock, vs);
                node = &vxlan->hlist6;
        } else
#endif
        {
                rcu_assign_pointer(vxlan->vn4_sock, vs);
                node = &vxlan->hlist4;
        }

        /* Metadata devices with VNI filtering register their VNI group;
         * all others register the device node itself.
         */
        if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER))
                vxlan_vs_add_vnigrp(vxlan, vs, ipv6);
        else
                vxlan_vs_add_dev(vs, vxlan, node);

        return 0;
}

/* Set up the listen socket(s) for @vxlan.
 *
 * Metadata-collecting devices get both an IPv6 and an IPv4 socket; other
 * devices get exactly one, chosen by VXLAN_F_IPV6. An IPv6 failure other
 * than -EAFNOSUPPORT suppresses the IPv4 attempt. On any final error the
 * sockets acquired so far are released.
 *
 * Returns 0 on success or a negative errno.
 */
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
        const bool metadata = !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA);
        const bool want_v6 = metadata || (vxlan->cfg.flags & VXLAN_F_IPV6);
        bool want_v4 = metadata || !want_v6;
        int err = 0;

        RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
#if IS_ENABLED(CONFIG_IPV6)
        RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
        if (want_v6) {
                err = __vxlan_sock_add(vxlan, true);
                /* Only "family not supported" lets us fall back to IPv4. */
                if (err < 0 && err != -EAFNOSUPPORT)
                        want_v4 = false;
        }
#endif
        if (want_v4)
                err = __vxlan_sock_add(vxlan, false);

        if (err < 0)
                vxlan_sock_release(vxlan);

        return err;
}

/* Check whether another VXLAN device in @src_net already uses @vni with a
 * configuration that clashes with @conf.
 *
 * @vxlan is skipped (it may be NULL or the device being reconfigured).
 * A device clashes when it matches the VNI (through its VNI filter when
 * VXLAN_F_VNIFILTER is set, else through cfg.vni), uses the same destination
 * port and the same receive/IPv6 flags and, for link-local IPv6 configs,
 * the same remote ifindex.
 *
 * Returns -EEXIST on a clash, 0 otherwise.
 */
int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
                     struct vxlan_config *conf, __be32 vni)
{
        struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
        const u32 cmp_mask = VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6;
        struct vxlan_dev *other;

        list_for_each_entry(other, &vn->vxlan_list, next) {
                bool vni_match;

                if (other == vxlan)
                        continue;

                if (other->cfg.flags & VXLAN_F_VNIFILTER)
                        vni_match = !!vxlan_vnifilter_lookup(other, vni);
                else
                        vni_match = other->cfg.vni == vni;
                if (!vni_match)
                        continue;

                /* Any difference here means the two devices can coexist. */
                if (other->cfg.dst_port != conf->dst_port ||
                    ((other->cfg.flags ^ conf->flags) & cmp_mask) ||
                    ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
                     other->cfg.remote_ifindex != conf->remote_ifindex))
                        continue;

                return -EEXIST;
        }

        return 0;
}

/* Validate @conf and fill in its defaults.
 *
 * @src_net: namespace used to resolve conf->remote_ifindex and to search
 *           for VNI clashes
 * @conf:    configuration to check; modified in place (address families,
 *           VXLAN_F_IPV6 / VXLAN_F_IPV6_LINKLOCAL flags, default dst_port
 *           and age_interval)
 * @lower:   set to the lower device for conf->remote_ifindex, or to NULL
 *           when no ifindex is configured; not written on the earlier
 *           error returns
 * @old:     device to ignore in the VNI clash check
 * @extack:  receives a message on failure
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EPFNOSUPPORT,
 * -ENODEV, -EPERM, -EEXIST).
 */
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
                                 struct net_device **lower,
                                 struct vxlan_dev *old,
                                 struct netlink_ext_ack *extack)
{
        bool use_ipv6 = false;

        if (conf->flags & VXLAN_F_GPE) {
                /* For now, allow GPE only together with
                 * COLLECT_METADATA. This can be relaxed later; in such
                 * case, the other side of the PtP link will have to be
                 * provided.
                 */
                if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
                    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
                        NL_SET_ERR_MSG(extack,
                                       "VXLAN GPE does not support this combination of attributes");
                        return -EINVAL;
                }
        }

        /* Derive whichever address family was left unspecified. */
        if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
                /* Unless IPv6 is explicitly requested, assume IPv4 */
                conf->remote_ip.sa.sa_family = AF_INET;
                conf->saddr.sa.sa_family = AF_INET;
        } else if (!conf->remote_ip.sa.sa_family) {
                conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
        } else if (!conf->saddr.sa.sa_family) {
                conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
        }

        if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
                NL_SET_ERR_MSG(extack,
                               "Local and remote address must be from the same family");
                return -EINVAL;
        }

        if (vxlan_addr_multicast(&conf->saddr)) {
                NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
                return -EINVAL;
        }

        if (conf->saddr.sa.sa_family == AF_INET6) {
                if (!IS_ENABLED(CONFIG_IPV6)) {
                        NL_SET_ERR_MSG(extack,
                                       "IPv6 support not enabled in the kernel");
                        return -EPFNOSUPPORT;
                }
                use_ipv6 = true;
                conf->flags |= VXLAN_F_IPV6;

                /* Address scope checks are skipped for metadata devices. */
                if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
                        int local_type =
                                ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
                        int remote_type =
                                ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

                        if (local_type & IPV6_ADDR_LINKLOCAL) {
                                /* Link-local source: remote must be
                                 * link-local too, or unspecified.
                                 */
                                if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
                                    (remote_type != IPV6_ADDR_ANY)) {
                                        NL_SET_ERR_MSG(extack,
                                                       "Invalid combination of local and remote address scopes");
                                        return -EINVAL;
                                }

                                conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
                        } else {
                                /* Non-link-local source: a link-local
                                 * unicast remote is rejected.
                                 */
                                if (remote_type ==
                                    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
                                        NL_SET_ERR_MSG(extack,
                                                       "Invalid combination of local and remote address scopes");
                                        return -EINVAL;
                                }

                                conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
                        }
                }
        }

        if (conf->label && !use_ipv6) {
                NL_SET_ERR_MSG(extack,
                               "Label attribute only applies to IPv6 VXLAN devices");
                return -EINVAL;
        }

        if (conf->label_policy && !use_ipv6) {
                NL_SET_ERR_MSG(extack,
                               "Label policy only applies to IPv6 VXLAN devices");
                return -EINVAL;
        }

        if (conf->remote_ifindex) {
                struct net_device *lowerdev;

                lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
                if (!lowerdev) {
                        NL_SET_ERR_MSG(extack,
                                       "Invalid local interface, device not found");
                        return -ENODEV;
                }

#if IS_ENABLED(CONFIG_IPV6)
                if (use_ipv6) {
                        struct inet6_dev *idev = __in6_dev_get(lowerdev);

                        if (idev && idev->cnf.disable_ipv6) {
                                NL_SET_ERR_MSG(extack,
                                               "IPv6 support disabled by administrator");
                                return -EPERM;
                        }
                }
#endif

                *lower = lowerdev;
        } else {
                /* Without a lower device, multicast remotes and link-local
                 * addresses cannot be used.
                 */
                if (vxlan_addr_multicast(&conf->remote_ip)) {
                        NL_SET_ERR_MSG(extack,
                                       "Local interface required for multicast remote destination");

                        return -EINVAL;
                }

#if IS_ENABLED(CONFIG_IPV6)
                if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
                        NL_SET_ERR_MSG(extack,
                                       "Local interface required for link-local local/remote addresses");
                        return -EINVAL;
                }
#endif

                *lower = NULL;
        }

        /* Default destination port: the GPE port for GPE devices,
         * otherwise vxlan_port.
         */
        if (!conf->dst_port) {
                if (conf->flags & VXLAN_F_GPE)
                        conf->dst_port = htons(IANA_VXLAN_GPE_UDP_PORT);
                else
                        conf->dst_port = htons(vxlan_port);
        }

        if (!conf->age_interval)
                conf->age_interval = FDB_AGE_DEFAULT;

        if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) {
                NL_SET_ERR_MSG(extack,
                               "A VXLAN device with the specified VNI already exists");
                return -EEXIST;
        }

        return 0;
}

/* Apply a validated configuration to @dev.
 *
 * @dev:        VXLAN net_device being configured
 * @conf:       configuration to apply; copied into vxlan->cfg at the end
 * @lowerdev:   lower device, or NULL; when set, its ifindex, TSO limits,
 *              headroom/tailroom and MTU shape the VXLAN device
 * @src_net:    namespace recorded in vxlan->net on creation
 * @changelink: false on device creation, true when changing an existing
 *              device (skips device-type setup, conf->mtu and vxlan->net)
 */
static void vxlan_config_apply(struct net_device *dev,
                               struct vxlan_config *conf,
                               struct net_device *lowerdev,
                               struct net *src_net,
                               bool changelink)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_rdst *dst = &vxlan->default_dst;
        unsigned short needed_headroom = ETH_HLEN;
        int max_mtu = ETH_MAX_MTU;
        u32 flags = conf->flags;

        if (!changelink) {
                /* GPE devices are raw; everything else is Ethernet. */
                if (flags & VXLAN_F_GPE)
                        vxlan_raw_setup(dev);
                else
                        vxlan_ether_setup(dev);

                if (conf->mtu)
                        dev->mtu = conf->mtu;

                vxlan->net = src_net;
        }

        dst->remote_vni = conf->vni;

        memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

        if (lowerdev) {
                dst->remote_ifindex = conf->remote_ifindex;

                netif_inherit_tso_max(dev, lowerdev);

                /* Replace the ETH_HLEN default with the lower device's
                 * own header and headroom needs.
                 */
                needed_headroom = lowerdev->hard_header_len;
                needed_headroom += lowerdev->needed_headroom;

                dev->needed_tailroom = lowerdev->needed_tailroom;

                max_mtu = lowerdev->mtu - vxlan_headroom(flags);
                if (max_mtu < ETH_MIN_MTU)
                        max_mtu = ETH_MIN_MTU;

                /* On creation without an explicit MTU, use the largest
                 * one that fits the lower device.
                 */
                if (!changelink && !conf->mtu)
                        dev->mtu = max_mtu;
        }

        if (dev->mtu > max_mtu)
                dev->mtu = max_mtu;

        /* Only the local copy of the flags is changed: metadata devices
         * have their headroom computed with VXLAN_F_IPV6 set.
         */
        if (flags & VXLAN_F_COLLECT_METADATA)
                flags |= VXLAN_F_IPV6;
        needed_headroom += vxlan_headroom(flags);
        dev->needed_headroom = needed_headroom;

        memcpy(&vxlan->cfg, conf, sizeof(*conf));
}

/* Validate @conf for a new device and, if it is acceptable, apply it to
 * @dev (creation path: changelink == false).
 * Returns 0 on success or the vxlan_config_validate() error.
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
                               struct vxlan_config *conf,
                               struct netlink_ext_ack *extack)
{
        struct net_device *lowerdev;
        int err;

        err = vxlan_config_validate(src_net, conf, &lowerdev,
                                    netdev_priv(dev), extack);
        if (!err)
                vxlan_config_apply(dev, conf, lowerdev, src_net, false);

        return err;
}

/* Configure and register a new VXLAN device.
 *
 * Steps: validate/apply @conf, create (but do not yet insert) the default
 * FDB entry when the default remote address is set, register the netdevice,
 * link it under the lower device if one is configured, configure the link,
 * then insert and announce the default FDB entry and add the device to the
 * per-namespace VXLAN list.
 *
 * Returns 0 on success or a negative errno; on failure everything done so
 * far is undone, including unregistering @dev if it had been registered.
 */
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
                              struct vxlan_config *conf,
                              struct netlink_ext_ack *extack)
{
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct net_device *remote_dev = NULL;
        struct vxlan_fdb *f = NULL;
        bool unregister = false;
        struct vxlan_rdst *dst;
        int err;

        dst = &vxlan->default_dst;
        err = vxlan_dev_configure(net, dev, conf, extack);
        if (err)
                return err;

        dev->ethtool_ops = &vxlan_ethtool_ops;

        /* create an fdb entry for a valid default destination */
        if (!vxlan_addr_any(&dst->remote_ip)) {
                err = vxlan_fdb_create(vxlan, all_zeros_mac,
                                       &dst->remote_ip,
                                       NUD_REACHABLE | NUD_PERMANENT,
                                       vxlan->cfg.dst_port,
                                       dst->remote_vni,
                                       dst->remote_vni,
                                       dst->remote_ifindex,
                                       NTF_SELF, 0, &f, extack);
                if (err)
                        return err;
        }

        err = register_netdevice(dev);
        if (err)
                goto errout;
        /* From here on, failures must also unregister the device. */
        unregister = true;

        if (dst->remote_ifindex) {
                remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
                if (!remote_dev) {
                        err = -ENODEV;
                        goto errout;
                }

                err = netdev_upper_dev_link(remote_dev, dev, extack);
                if (err)
                        goto errout;
        }

        err = rtnl_configure_link(dev, NULL, 0, NULL);
        if (err < 0)
                goto unlink;

        if (f) {
                vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);

                /* notify default fdb entry */
                err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
                                       RTM_NEWNEIGH, true, extack);
                if (err) {
                        /* The entry is already inserted, so it is torn down
                         * with vxlan_fdb_destroy() here and the errout
                         * free below is bypassed.
                         */
                        vxlan_fdb_destroy(vxlan, f, false, false);
                        if (remote_dev)
                                netdev_upper_dev_unlink(remote_dev, dev);
                        goto unregister;
                }
        }

        list_add(&vxlan->next, &vn->vxlan_list);
        if (remote_dev)
                dst->remote_dev = remote_dev;
        return 0;
unlink:
        if (remote_dev)
                netdev_upper_dev_unlink(remote_dev, dev);
errout:
        /* unregister_netdevice() destroys the default FDB entry with deletion
         * notification. But the addition notification was not sent yet, so
         * destroy the entry by hand here.
         */
        if (f)
                __vxlan_fdb_free(f);
unregister:
        if (unregister)
                unregister_netdevice(dev);
        return err;
}

/* Set or clear @mask in conf->flags according to attribute @attrtype.
 *
 * Nothing happens when the attribute is absent. NLA_FLAG attributes (per
 * vxlan_policy) always set the mask; other attributes are read as a u8 and
 * set the mask when non-zero, clear it when zero.
 *
 * Returns 0, or -EOPNOTSUPP (with an extack error) when the attribute is
 * present on a changelink request but @changelink_supported is false.
 */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
                          int attrtype, unsigned long mask, bool changelink,
                          bool changelink_supported,
                          struct netlink_ext_ack *extack)
{
        bool enable;

        if (!tb[attrtype])
                return 0;

        if (changelink && !changelink_supported) {
                vxlan_flag_attr_error(attrtype, extack);
                return -EOPNOTSUPP;
        }

        /* A flag attribute has no payload: its presence means "on". */
        enable = vxlan_policy[attrtype].type == NLA_FLAG ||
                 nla_get_u8(tb[attrtype]);

        if (enable)
                conf->flags |= mask;
        else
                conf->flags &= ~mask;

        return 0;
}

/* Build a vxlan_config from netlink attributes.
 *
 * @tb:         generic link attributes; only IFLA_MTU is consulted here
 * @data:       vxlan attributes, indexed by IFLA_VXLAN_*
 * @dev:        vxlan net_device the request is for
 * @conf:       output configuration
 * @changelink: true for a changelink request, false for newlink
 * @extack:     extended ack used for error reporting
 *
 * For newlink @conf starts zeroed; for changelink it starts as a copy of the
 * device's current configuration and attributes that may not be changed are
 * rejected.
 *
 * Returns 0 on success, or -EOPNOTSUPP (attribute not changeable),
 * -EPFNOSUPPORT (IPv6 attribute without CONFIG_IPV6), -EINVAL (inconsistent
 * attributes).
 */
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
                         struct net_device *dev, struct vxlan_config *conf,
                         bool changelink, struct netlink_ext_ack *extack)
{
        /* Header bits that are in use given the requested extensions. */
        struct vxlanhdr used_bits = {
                .vx_flags = VXLAN_HF_VNI,
                .vx_vni = VXLAN_VNI_MASK,
        };
        struct vxlan_dev *vxlan = netdev_priv(dev);
        int err = 0;

        memset(conf, 0, sizeof(*conf));

        /* if changelink operation, start with old existing cfg */
        if (changelink)
                memcpy(conf, &vxlan->cfg, sizeof(*conf));

        /* NOTE: the attribute handed to NL_SET_ERR_MSG_ATTR() must come from
         * data[], the table indexed by IFLA_VXLAN_*. tb[] is indexed by
         * IFLA_* (see IFLA_MTU below), so tb[IFLA_VXLAN_x] would report an
         * unrelated attribute, or none at all.
         */
        if (data[IFLA_VXLAN_ID]) {
                __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

                if (changelink && (vni != conf->vni)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID], "Cannot change VNI");
                        return -EOPNOTSUPP;
                }
                conf->vni = vni;
        }

        if (data[IFLA_VXLAN_GROUP]) {
                if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_GROUP], "New group address family does not match old group");
                        return -EOPNOTSUPP;
                }

                conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
                conf->remote_ip.sa.sa_family = AF_INET;
        } else if (data[IFLA_VXLAN_GROUP6]) {
                if (!IS_ENABLED(CONFIG_IPV6)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
                        return -EPFNOSUPPORT;
                }

                if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
                        return -EOPNOTSUPP;
                }

                conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
                conf->remote_ip.sa.sa_family = AF_INET6;
        }

        if (data[IFLA_VXLAN_LOCAL]) {
                if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_LOCAL], "New local address family does not match old");
                        return -EOPNOTSUPP;
                }

                conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
                conf->saddr.sa.sa_family = AF_INET;
        } else if (data[IFLA_VXLAN_LOCAL6]) {
                if (!IS_ENABLED(CONFIG_IPV6)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
                        return -EPFNOSUPPORT;
                }

                if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
                        return -EOPNOTSUPP;
                }

                /* TODO: respect scope id */
                conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
                conf->saddr.sa.sa_family = AF_INET6;
        }

        if (data[IFLA_VXLAN_LINK])
                conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);

        if (data[IFLA_VXLAN_TOS])
                conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);

        if (data[IFLA_VXLAN_TTL])
                conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

        if (data[IFLA_VXLAN_TTL_INHERIT]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
                                    VXLAN_F_TTL_INHERIT, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_LABEL])
                conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
                             IPV6_FLOWLABEL_MASK;
        if (data[IFLA_VXLAN_LABEL_POLICY])
                conf->label_policy = nla_get_u32(data[IFLA_VXLAN_LABEL_POLICY]);

        if (data[IFLA_VXLAN_LEARNING]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
                                    VXLAN_F_LEARN, changelink, true,
                                    extack);
                if (err)
                        return err;
        } else if (!changelink) {
                /* default to learn on a new device */
                conf->flags |= VXLAN_F_LEARN;
        }

        if (data[IFLA_VXLAN_AGEING])
                conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);

        if (data[IFLA_VXLAN_PROXY]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
                                    VXLAN_F_PROXY, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_RSC]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
                                    VXLAN_F_RSC, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_L2MISS]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
                                    VXLAN_F_L2MISS, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_L3MISS]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
                                    VXLAN_F_L3MISS, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_LIMIT]) {
                if (changelink) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_LIMIT],
                                            "Cannot change limit");
                        return -EOPNOTSUPP;
                }
                conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
        }

        if (data[IFLA_VXLAN_COLLECT_METADATA]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
                                    VXLAN_F_COLLECT_METADATA, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_PORT_RANGE]) {
                if (!changelink) {
                        const struct ifla_vxlan_port_range *p
                                = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
                        conf->port_min = ntohs(p->low);
                        conf->port_max = ntohs(p->high);
                } else {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
                                            "Cannot change port range");
                        return -EOPNOTSUPP;
                }
        }

        if (data[IFLA_VXLAN_PORT]) {
                if (changelink) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT],
                                            "Cannot change port");
                        return -EOPNOTSUPP;
                }
                conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
        }

        if (data[IFLA_VXLAN_UDP_CSUM]) {
                if (changelink) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_UDP_CSUM],
                                            "Cannot change UDP_CSUM flag");
                        return -EOPNOTSUPP;
                }
                /* The attribute is "checksum on"; the flag is "checksum off". */
                if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
                        conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
        }

        if (data[IFLA_VXLAN_LOCALBYPASS]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LOCALBYPASS,
                                    VXLAN_F_LOCALBYPASS, changelink,
                                    true, extack);
                if (err)
                        return err;
        } else if (!changelink) {
                /* default to local bypass on a new device */
                conf->flags |= VXLAN_F_LOCALBYPASS;
        }

        if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
                                    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
                                    false, extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
                                    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
                                    false, extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_REMCSUM_TX]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
                                    VXLAN_F_REMCSUM_TX, changelink, false,
                                    extack);
                if (err)
                        return err;
        }

        if (data[IFLA_VXLAN_REMCSUM_RX]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
                                    VXLAN_F_REMCSUM_RX, changelink, false,
                                    extack);
                if (err)
                        return err;
                used_bits.vx_flags |= VXLAN_HF_RCO;
                used_bits.vx_vni |= ~VXLAN_VNI_MASK;
        }

        if (data[IFLA_VXLAN_GBP]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
                                    VXLAN_F_GBP, changelink, false, extack);
                if (err)
                        return err;
                used_bits.vx_flags |= VXLAN_GBP_USED_BITS;
        }

        if (data[IFLA_VXLAN_GPE]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
                                    VXLAN_F_GPE, changelink, false,
                                    extack);
                if (err)
                        return err;

                used_bits.vx_flags |= VXLAN_GPE_USED_BITS;
        }

        if (data[IFLA_VXLAN_RESERVED_BITS]) {
                struct vxlanhdr reserved_bits;

                if (changelink) {
                        NL_SET_ERR_MSG_ATTR(extack,
                                            data[IFLA_VXLAN_RESERVED_BITS],
                                            "Cannot change reserved_bits");
                        return -EOPNOTSUPP;
                }

                nla_memcpy(&reserved_bits, data[IFLA_VXLAN_RESERVED_BITS],
                           sizeof(reserved_bits));
                if (used_bits.vx_flags & reserved_bits.vx_flags ||
                    used_bits.vx_vni & reserved_bits.vx_vni) {
                        __be64 ub_be64, rb_be64;

                        memcpy(&ub_be64, &used_bits, sizeof(ub_be64));
                        memcpy(&rb_be64, &reserved_bits, sizeof(rb_be64));

                        NL_SET_ERR_MSG_ATTR_FMT(extack,
                                                data[IFLA_VXLAN_RESERVED_BITS],
                                                "Used bits %#018llx cannot overlap reserved bits %#018llx",
                                                be64_to_cpu(ub_be64),
                                                be64_to_cpu(rb_be64));
                        return -EINVAL;
                }

                conf->reserved_bits = reserved_bits;
        } else if (!changelink) {
                /* For backwards compatibility, only allow reserved fields to be
                 * used by VXLAN extensions if explicitly requested.
                 *
                 * Only done for a new device: on changelink every attribute
                 * that feeds used_bits is rejected above when present, so
                 * used_bits only holds the base defaults here. Recomputing
                 * from it would overwrite the reserved_bits inherited from
                 * the existing configuration.
                 */
                conf->reserved_bits = (struct vxlanhdr) {
                        .vx_flags = ~used_bits.vx_flags,
                        .vx_vni = ~used_bits.vx_vni,
                };
        }

        if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
                                    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
                                    false, extack);
                if (err)
                        return err;
        }

        if (tb[IFLA_MTU]) {
                if (changelink) {
                        NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
                                            "Cannot change mtu");
                        return -EOPNOTSUPP;
                }
                conf->mtu = nla_get_u32(tb[IFLA_MTU]);
        }

        if (data[IFLA_VXLAN_DF])
                conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

        if (data[IFLA_VXLAN_VNIFILTER]) {
                err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER,
                                    VXLAN_F_VNIFILTER, changelink, false,
                                    extack);
                if (err)
                        return err;

                if ((conf->flags & VXLAN_F_VNIFILTER) &&
                    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
                        NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER],
                                            "vxlan vnifilter only valid in collect metadata mode");
                        return -EINVAL;
                }
        }

        return 0;
}

static int vxlan_newlink(struct net_device *dev,
                         struct rtnl_newlink_params *params,
                         struct netlink_ext_ack *extack)
{
        struct net *link_net = rtnl_newlink_link_net(params);
        struct nlattr **data = params->data;
        struct nlattr **tb = params->tb;
        struct vxlan_config conf;
        int err;

        err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
        if (err)
                return err;

        return __vxlan_dev_create(link_net, dev, &conf, extack);
}

/* rtnl_link_ops->changelink handler.
 *
 * Parses the request on top of the current configuration, validates it,
 * updates the default (all-zeros MAC) FDB entry when the default remote IP
 * changed, and then applies the new configuration. Multicast membership is
 * left and re-joined around the apply step when the device is up and the
 * remote IP or remote ifindex changed.
 *
 * Returns 0 on success or a negative errno. Once the lower-device change has
 * been prepared, every early error return aborts it first.
 */
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[],
                            struct netlink_ext_ack *extack)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        bool rem_ip_changed, change_igmp;
        struct net_device *lowerdev;
        struct vxlan_config conf;
        struct vxlan_rdst *dst;
        int err;

        dst = &vxlan->default_dst;
        err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
        if (err)
                return err;

        err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
                                    vxlan, extack);
        if (err)
                return err;

        /* Same lower device as before: nothing to relink. */
        if (dst->remote_dev == lowerdev)
                lowerdev = NULL;

        err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
                                             extack);
        if (err)
                return err;

        rem_ip_changed = !vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip);
        change_igmp = vxlan->dev->flags & IFF_UP &&
                      (rem_ip_changed ||
                       dst->remote_ifindex != conf.remote_ifindex);

        /* handle default dst entry */
        if (rem_ip_changed) {
                u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);

                spin_lock_bh(&vxlan->hash_lock[hash_index]);
                /* Add the new default remote first, so that a failure
                 * leaves the old entry in place.
                 */
                if (!vxlan_addr_any(&conf.remote_ip)) {
                        err = vxlan_fdb_update(vxlan, all_zeros_mac,
                                               &conf.remote_ip,
                                               NUD_REACHABLE | NUD_PERMANENT,
                                               NLM_F_APPEND | NLM_F_CREATE,
                                               vxlan->cfg.dst_port,
                                               conf.vni, conf.vni,
                                               conf.remote_ifindex,
                                               NTF_SELF, 0, true, extack);
                        if (err) {
                                spin_unlock_bh(&vxlan->hash_lock[hash_index]);
                                netdev_adjacent_change_abort(dst->remote_dev,
                                                             lowerdev, dev);
                                return err;
                        }
                }
                /* Then drop the old default remote, if there was one. */
                if (!vxlan_addr_any(&dst->remote_ip))
                        __vxlan_fdb_delete(vxlan, all_zeros_mac,
                                           dst->remote_ip,
                                           vxlan->cfg.dst_port,
                                           dst->remote_vni,
                                           dst->remote_vni,
                                           dst->remote_ifindex,
                                           true);
                spin_unlock_bh(&vxlan->hash_lock[hash_index]);

                /* If vni filtering device, also update fdb entries of
                 * all vnis that were using default remote ip
                 */
                if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
                        err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip,
                                                         &conf.remote_ip, extack);
                        if (err) {
                                netdev_adjacent_change_abort(dst->remote_dev,
                                                             lowerdev, dev);
                                return err;
                        }
                }
        }

        /* Leave the old group while dst still holds the old remote IP;
         * an error here is kept in err and returned at the end.
         */
        if (change_igmp && vxlan_addr_multicast(&dst->remote_ip))
                err = vxlan_multicast_leave(vxlan);

        /* Ageing interval changed: fire the age timer right away. */
        if (conf.age_interval != vxlan->cfg.age_interval)
                mod_timer(&vxlan->age_timer, jiffies);

        netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
        if (lowerdev && lowerdev != dst->remote_dev)
                dst->remote_dev = lowerdev;
        vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);

        /* Join only if the leave above did not fail. NOTE(review): dst is
         * presumably updated with the new remote IP by vxlan_config_apply()
         * - confirm there.
         */
        if (!err && change_igmp &&
            vxlan_addr_multicast(&dst->remote_ip))
                err = vxlan_multicast_join(vxlan);

        return err;
}

static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_fdb_flush_desc desc = {
                /* Default entry is deleted at vxlan_uninit. */
                .ignore_default_entry = true,
        };

        vxlan_flush(vxlan, &desc);

        list_del(&vxlan->next);
        unregister_netdevice_queue(dev, head);
        if (vxlan->default_dst.remote_dev)
                netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
}

/* rtnl_link_ops->get_size handler: space reserved for the attributes listed
 * below. The value does not depend on @dev.
 */
static size_t vxlan_get_size(const struct net_device *dev)
{
        size_t size = 0;

        size += nla_total_size(sizeof(__u32));  /* IFLA_VXLAN_ID */
        size += nla_total_size(sizeof(struct in6_addr)); /* IFLA_VXLAN_GROUP{6} */
        size += nla_total_size(sizeof(__u32));  /* IFLA_VXLAN_LINK */
        size += nla_total_size(sizeof(struct in6_addr)); /* IFLA_VXLAN_LOCAL{6} */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_TTL */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_TTL_INHERIT */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_TOS */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_DF */
        size += nla_total_size(sizeof(__be32)); /* IFLA_VXLAN_LABEL */
        size += nla_total_size(sizeof(__u32));  /* IFLA_VXLAN_LABEL_POLICY */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_LEARNING */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_PROXY */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_RSC */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_L2MISS */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_L3MISS */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_COLLECT_METADATA */
        size += nla_total_size(sizeof(__u32));  /* IFLA_VXLAN_AGEING */
        size += nla_total_size(sizeof(__u32));  /* IFLA_VXLAN_LIMIT */
        size += nla_total_size(sizeof(__be16)); /* IFLA_VXLAN_PORT */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_UDP_CSUM */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_REMCSUM_TX */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_REMCSUM_RX */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_LOCALBYPASS */
        /* IFLA_VXLAN_PORT_RANGE */
        size += nla_total_size(sizeof(struct ifla_vxlan_port_range));
        size += nla_total_size(0);              /* IFLA_VXLAN_GBP */
        size += nla_total_size(0);              /* IFLA_VXLAN_GPE */
        size += nla_total_size(0);              /* IFLA_VXLAN_REMCSUM_NOPARTIAL */
        size += nla_total_size(sizeof(__u8));   /* IFLA_VXLAN_VNIFILTER */
        /* IFLA_VXLAN_RESERVED_BITS */
        size += nla_total_size(sizeof(struct vxlanhdr));

        return size;
}

/* rtnl_link_ops->fill_info handler: dump the device configuration into @skb
 * as IFLA_VXLAN_* attributes.
 *
 * Returns 0 on success or -EMSGSIZE as soon as one attribute cannot be
 * added.
 */
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        const struct vxlan_dev *vxlan = netdev_priv(dev);
        const struct vxlan_rdst *dst = &vxlan->default_dst;
        /* Port range is kept in host order in cfg and dumped via htons(). */
        struct ifla_vxlan_port_range ports = {
                .low =  htons(vxlan->cfg.port_min),
                .high = htons(vxlan->cfg.port_max),
        };

        if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
                goto nla_put_failure;

        /* Default remote address: emitted only when one is set, as GROUP
         * for AF_INET or GROUP6 otherwise (the latter only if IPv6 is built).
         */
        if (!vxlan_addr_any(&dst->remote_ip)) {
                if (dst->remote_ip.sa.sa_family == AF_INET) {
                        if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
                                            dst->remote_ip.sin.sin_addr.s_addr))
                                goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
                } else {
                        if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
                                             &dst->remote_ip.sin6.sin6_addr))
                                goto nla_put_failure;
#endif
                }
        }

        /* Lower device index is only reported when non-zero. */
        if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
                goto nla_put_failure;

        /* Local source address, same AF_INET / IPv6 split as above. */
        if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
                if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
                        if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
                                            vxlan->cfg.saddr.sin.sin_addr.s_addr))
                                goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
                } else {
                        if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
                                             &vxlan->cfg.saddr.sin6.sin6_addr))
                                goto nla_put_failure;
#endif
                }
        }

        /* Attributes that are always emitted. Flag bits are normalised to
         * 0/1 with !!; IFLA_VXLAN_UDP_CSUM is the inverse of the
         * VXLAN_F_UDP_ZERO_CSUM_TX flag, hence the single '!'.
         */
        if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
            nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
                       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
            nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
            nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
            nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
            nla_put_u32(skb, IFLA_VXLAN_LABEL_POLICY, vxlan->cfg.label_policy) ||
            nla_put_u8(skb, IFLA_VXLAN_LEARNING,
                       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
            nla_put_u8(skb, IFLA_VXLAN_PROXY,
                       !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
            nla_put_u8(skb, IFLA_VXLAN_RSC,
                       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
            nla_put_u8(skb, IFLA_VXLAN_L2MISS,
                       !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
            nla_put_u8(skb, IFLA_VXLAN_L3MISS,
                       !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
            nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
                       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
            nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
            nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
            nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
            nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
                       !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
            nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
                       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
            nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
                       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
            nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
                       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
            nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
                       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)) ||
            nla_put_u8(skb, IFLA_VXLAN_LOCALBYPASS,
                       !!(vxlan->cfg.flags & VXLAN_F_LOCALBYPASS)))
                goto nla_put_failure;

        if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
                goto nla_put_failure;

        /* Presence-only attributes: emitted only when the flag is set. */
        if (vxlan->cfg.flags & VXLAN_F_GBP &&
            nla_put_flag(skb, IFLA_VXLAN_GBP))
                goto nla_put_failure;

        if (vxlan->cfg.flags & VXLAN_F_GPE &&
            nla_put_flag(skb, IFLA_VXLAN_GPE))
                goto nla_put_failure;

        if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
            nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
                goto nla_put_failure;

        /* VNIFILTER is a u8 but is only emitted when enabled, so the value
         * written is always 1.
         */
        if (vxlan->cfg.flags & VXLAN_F_VNIFILTER &&
            nla_put_u8(skb, IFLA_VXLAN_VNIFILTER,
                       !!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
                goto nla_put_failure;

        if (nla_put(skb, IFLA_VXLAN_RESERVED_BITS,
                    sizeof(vxlan->cfg.reserved_bits),
                    &vxlan->cfg.reserved_bits))
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

static struct net *vxlan_get_link_net(const struct net_device *dev)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);

        return READ_ONCE(vxlan->net);
}

/* rtnetlink operations for links of kind "vxlan". Wires the attribute
 * policy and the create/change/delete/dump handlers defined in this file.
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
        .kind                = "vxlan",
        .maxtype        = IFLA_VXLAN_MAX,
        .policy                = vxlan_policy,
        .priv_size        = sizeof(struct vxlan_dev),
        .setup                = vxlan_setup,
        .validate        = vxlan_validate,
        .newlink        = vxlan_newlink,
        .changelink        = vxlan_changelink,
        .dellink        = vxlan_dellink,
        .get_size        = vxlan_get_size,
        .fill_info        = vxlan_fill_info,
        .get_link_net        = vxlan_get_link_net,
};

/* Create and configure a vxlan device from an in-kernel configuration.
 *
 * @net:              net namespace the link is created in
 * @name:             interface name passed to rtnl_create_link()
 * @name_assign_type: name assignment type passed to rtnl_create_link()
 * @conf:             vxlan configuration to apply
 *
 * Returns the new net_device, or an ERR_PTR() on failure. If creation fails
 * the device is freed; if the final link configuration fails the device is
 * torn down again through vxlan_dellink().
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
                                    u8 name_assign_type,
                                    struct vxlan_config *conf)
{
        /* No netlink attributes are supplied: all-NULL attribute table. */
        struct nlattr *tb[IFLA_MAX + 1] = {};
        struct net_device *ndev;
        LIST_HEAD(kill_list);
        int rc;

        ndev = rtnl_create_link(net, name, name_assign_type,
                                &vxlan_link_ops, tb, NULL);
        if (IS_ERR(ndev))
                return ndev;

        rc = __vxlan_dev_create(net, ndev, conf, NULL);
        if (rc < 0) {
                free_netdev(ndev);
                return ERR_PTR(rc);
        }

        rc = rtnl_configure_link(ndev, NULL, 0, NULL);
        if (rc >= 0)
                return ndev;

        /* Configuration failed after creation: undo via the dellink path. */
        vxlan_dellink(ndev, &kill_list);
        unregister_netdevice_many(&kill_list);
        return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);

static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
                                             struct net_device *dev)
{
        struct vxlan_dev *vxlan, *next;
        LIST_HEAD(list_kill);

        list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
                struct vxlan_rdst *dst = &vxlan->default_dst;

                /* In case we created vxlan device with carrier
                 * and we loose the carrier due to module unload
                 * we also need to remove vxlan device. In other
                 * cases, it's not necessary and remote_ifindex
                 * is 0 here, so no matches.
                 */
                if (dst->remote_ifindex == dev->ifindex)
                        vxlan_dellink(vxlan->dev, &list_kill);
        }

        unregister_netdevice_many(&list_kill);
}

/* Netdevice notifier callback.
 *
 * On NETDEV_UNREGISTER the vxlan devices bound to the departing device are
 * removed; on the UDP tunnel PUSH/DROP info events the RX ports are
 * (un)offloaded for the device. Every other event is ignored.
 * Always returns NOTIFY_DONE.
 */
static int vxlan_netdevice_event(struct notifier_block *unused,
                                 unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);

        switch (event) {
        case NETDEV_UNREGISTER:
                vxlan_handle_lowerdev_unregister(vn, dev);
                break;
        case NETDEV_UDP_TUNNEL_PUSH_INFO:
                vxlan_offload_rx_ports(dev, true);
                break;
        case NETDEV_UDP_TUNNEL_DROP_INFO:
                vxlan_offload_rx_ports(dev, false);
                break;
        default:
                break;
        }

        return NOTIFY_DONE;
}

/* Notifier block that dispatches netdevice events to
 * vxlan_netdevice_event().
 */
static struct notifier_block vxlan_notifier_block __read_mostly = {
        .notifier_call = vxlan_netdevice_event,
};

/* Copy the "offloaded" state from a switchdev notification onto the
 * matching remote destination of an FDB entry. Nothing happens when either
 * the FDB entry or the remote destination is not found. The lookup and the
 * update are done under the hash bucket lock.
 */
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
                        struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_rdst *rd;
        struct vxlan_fdb *fdb;
        u32 idx;

        idx = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);

        spin_lock_bh(&vxlan->hash_lock[idx]);

        fdb = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
        if (fdb) {
                rd = vxlan_fdb_find_rdst(fdb, &fdb_info->remote_ip,
                                         fdb_info->remote_port,
                                         fdb_info->remote_vni,
                                         fdb_info->remote_ifindex);
                if (rd)
                        rd->offloaded = fdb_info->offloaded;
        }

        spin_unlock_bh(&vxlan->hash_lock[idx]);
}

static int
vxlan_fdb_external_learn_add(struct net_device *dev,
                             struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct netlink_ext_ack *extack;
        u32 hash_index;
        int err;

        hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
        extack = switchdev_notifier_info_to_extack(&fdb_info->info);

        spin_lock_bh(&vxlan->hash_lock[hash_index]);
        err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
                               NUD_REACHABLE,
                               NLM_F_CREATE | NLM_F_REPLACE,
                               fdb_info->remote_port,
                               fdb_info->vni,
                               fdb_info->remote_vni,
                               fdb_info->remote_ifindex,
                               NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
                               0, false, extack);
        spin_unlock_bh(&vxlan->hash_lock[hash_index]);

        return err;
}

/* Delete the FDB entry described by @fdb_info from @dev, but only if the
 * entry carries NTF_EXT_LEARNED.
 *
 * Returns -ENOENT when no entry matches the MAC address and VNI, 0 when an
 * entry exists but is not flagged NTF_EXT_LEARNED (it is left alone), and
 * otherwise the result of __vxlan_fdb_delete(). The whole operation runs
 * with the hash_lock selected by fdb_head_index() held.
 */
static int
vxlan_fdb_external_learn_del(struct net_device *dev,
                             struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
        struct vxlan_dev *vxlan = netdev_priv(dev);
        u32 hash_index = fdb_head_index(vxlan, fdb_info->eth_addr,
                                        fdb_info->vni);
        struct vxlan_fdb *f;
        int err = 0;

        spin_lock_bh(&vxlan->hash_lock[hash_index]);

        f = __vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
        if (f) {
                if (f->flags & NTF_EXT_LEARNED)
                        err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
                                                 fdb_info->remote_ip,
                                                 fdb_info->remote_port,
                                                 fdb_info->vni,
                                                 fdb_info->remote_vni,
                                                 fdb_info->remote_ifindex,
                                                 false);
        } else {
                err = -ENOENT;
        }

        spin_unlock_bh(&vxlan->hash_lock[hash_index]);

        return err;
}

/* Switchdev notifier callback for VXLAN FDB events.
 *
 * - SWITCHDEV_VXLAN_FDB_OFFLOADED: record the offload state from @ptr.
 * - SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE: add the entry and, on success, mark
 *   it offloaded.
 * - SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE: delete the entry and, on success,
 *   clear the offloaded mark.
 *
 * Returns 0 for unhandled events and on success; a failed add or delete is
 * returned converted with notifier_from_errno().
 */
static int vxlan_switchdev_event(struct notifier_block *unused,
                                 unsigned long event, void *ptr)
{
        struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
        struct switchdev_notifier_vxlan_fdb_info *fdb_info;
        int err = 0;

        switch (event) {
        case SWITCHDEV_VXLAN_FDB_OFFLOADED:
                vxlan_fdb_offloaded_set(dev, ptr);
                break;
        case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
                fdb_info = ptr;
                err = vxlan_fdb_external_learn_add(dev, fdb_info);
                if (!err) {
                        fdb_info->offloaded = true;
                        vxlan_fdb_offloaded_set(dev, fdb_info);
                } else {
                        err = notifier_from_errno(err);
                }
                break;
        case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
                fdb_info = ptr;
                err = vxlan_fdb_external_learn_del(dev, fdb_info);
                if (!err) {
                        fdb_info->offloaded = false;
                        vxlan_fdb_offloaded_set(dev, fdb_info);
                } else {
                        err = notifier_from_errno(err);
                }
                break;
        }

        return err;
}

/* Notifier block whose callback is vxlan_switchdev_event(); it is
 * registered and unregistered by the module init and cleanup functions.
 */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
        .notifier_call = vxlan_switchdev_event,
};

/* Destroy every FDB entry linked on @nh's fdb_list.
 *
 * The list is walked inside an RCU read-side section. For each entry the
 * owning VXLAN device is taken from fdb->vdev, the matching hash_lock is
 * acquired, and the entry is destroyed only if it is still hashed.
 */
static void vxlan_fdb_nh_flush(struct nexthop *nh)
{
        struct vxlan_fdb *fdb;
        struct vxlan_dev *vxlan;
        u32 hash_index;

        rcu_read_lock();
        list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
                vxlan = rcu_dereference(fdb->vdev);
                /* Without a device there is no lock to take and nothing
                 * to destroy from; warn and skip the entry instead of
                 * dereferencing a NULL pointer below.
                 */
                if (WARN_ON(!vxlan))
                        continue;

                /* NOTE(review): the bucket is derived from the device's
                 * default remote VNI, whereas the switchdev helpers in
                 * this file pass the entry's own VNI - confirm this picks
                 * the lock that actually guards @fdb.
                 */
                hash_index = fdb_head_index(vxlan, fdb->eth_addr,
                                            vxlan->default_dst.remote_vni);
                spin_lock_bh(&vxlan->hash_lock[hash_index]);
                if (!hlist_unhashed(&fdb->hlist))
                        vxlan_fdb_destroy(vxlan, fdb, false, false);
                spin_unlock_bh(&vxlan->hash_lock[hash_index]);
        }
        rcu_read_unlock();
}

/* Nexthop notifier callback.
 *
 * Only NEXTHOP_EVENT_DEL is acted upon: the nexthop is looked up by the
 * namespace and id carried in @ptr and, if found, the FDB entries attached
 * to it are flushed. Always returns NOTIFY_DONE.
 */
static int vxlan_nexthop_event(struct notifier_block *nb,
                               unsigned long event, void *ptr)
{
        struct nh_notifier_info *info = ptr;
        struct nexthop *nh;

        if (event == NEXTHOP_EVENT_DEL) {
                nh = nexthop_find_by_id(info->net, info->id);
                if (nh)
                        vxlan_fdb_nh_flush(nh);
        }

        return NOTIFY_DONE;
}

/* Per-namespace init: set up the device list, the socket lock and every
 * socket hash bucket of this namespace's struct vxlan_net, then register
 * the nexthop notifier. Returns the result of that registration.
 */
static __net_init int vxlan_init_net(struct net *net)
{
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
        unsigned int h;

        for (h = 0; h < PORT_HASH_SIZE; h++)
                INIT_HLIST_HEAD(&vn->sock_list[h]);

        INIT_LIST_HEAD(&vn->vxlan_list);
        spin_lock_init(&vn->sock_lock);
        vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event;

        return register_nexthop_notifier(net, &vn->nexthop_notifier_block,
                                         NULL);
}

/* Call vxlan_dellink() with @dev_to_kill for every VXLAN device on
 * @vn->vxlan_list. The _safe iterator is used, so the walk tolerates the
 * current entry being unlinked.
 */
static void __net_exit vxlan_destroy_tunnels(struct vxlan_net *vn,
                                             struct list_head *dev_to_kill)
{
        struct vxlan_dev *vxlan;
        struct vxlan_dev *tmp;

        list_for_each_entry_safe(vxlan, tmp, &vn->vxlan_list, next) {
                vxlan_dellink(vxlan->dev, dev_to_kill);
        }
}

/* Batched namespace exit; asserts that RTNL is held.
 *
 * For each namespace on @net_list the nexthop notifier is unregistered
 * first, then all of its VXLAN devices are passed to vxlan_dellink() with
 * @dev_to_kill.
 */
static void __net_exit vxlan_exit_batch_rtnl(struct list_head *net_list,
                                             struct list_head *dev_to_kill)
{
        struct vxlan_net *vn;
        struct net *net;

        ASSERT_RTNL();
        list_for_each_entry(net, net_list, exit_list) {
                vn = net_generic(net, vxlan_net_id);
                __unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
                vxlan_destroy_tunnels(vn, dev_to_kill);
        }
}

/* Per-namespace exit check: warn (once) if any socket hash bucket of this
 * namespace still holds an entry. Nothing is freed here.
 */
static void __net_exit vxlan_exit_net(struct net *net)
{
        struct vxlan_net *vn = net_generic(net, vxlan_net_id);
        unsigned int h = 0;

        while (h < PORT_HASH_SIZE) {
                WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
                h++;
        }
}

/* Per-network-namespace operations for the driver: the init hook, the
 * batched RTNL exit hook, the final exit hook, and the id and size of the
 * per-namespace struct vxlan_net storage.
 */
static struct pernet_operations vxlan_net_ops = {
        .init = vxlan_init_net,
        .exit_batch_rtnl = vxlan_exit_batch_rtnl,
        .exit = vxlan_exit_net,
        .id   = &vxlan_net_id,
        .size = sizeof(struct vxlan_net),
};

/* Module init.
 *
 * Seeds vxlan_salt with random bytes, then registers, in order: the pernet
 * operations, the netdevice notifier, the switchdev notifier, the rtnl
 * link ops, and finally the VNI filter support. If a step fails, every
 * earlier registration is undone in reverse order and the error returned.
 */
static int __init vxlan_init_module(void)
{
        int rc;

        get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

        rc = register_pernet_subsys(&vxlan_net_ops);
        if (rc)
                return rc;

        rc = register_netdevice_notifier(&vxlan_notifier_block);
        if (rc)
                goto err_pernet;

        rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
        if (rc)
                goto err_netdev;

        rc = rtnl_link_register(&vxlan_link_ops);
        if (rc)
                goto err_switchdev;

        rc = vxlan_vnifilter_init();
        if (rc)
                goto err_link;

        return 0;

err_link:
        rtnl_link_unregister(&vxlan_link_ops);
err_switchdev:
        unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
err_netdev:
        unregister_netdevice_notifier(&vxlan_notifier_block);
err_pernet:
        unregister_pernet_subsys(&vxlan_net_ops);
        return rc;
}
late_initcall(vxlan_init_module);

/* Module exit: undo the registrations made by vxlan_init_module() in the
 * reverse of the order in which they were made.
 */
static void __exit vxlan_cleanup_module(void)
{
        vxlan_vnifilter_uninit();
        rtnl_link_unregister(&vxlan_link_ops);
        unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
        unregister_netdevice_notifier(&vxlan_notifier_block);
        unregister_pernet_subsys(&vxlan_net_ops);
        /* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);

/* Module metadata: license, version, author, description and the rtnl
 * link alias "vxlan".
 */
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");


































































































































































































































































































































































































































































































































































 1481 

















  740 















































































   34 






































































































  566 





















  353 

















  699 





















































   34 






































































































  755 

















































 1481 

























  710 































































































































































































  352 

























  650 



































































































































































































































































































































































































































































   22 















































































































































































































































































































































  533 




























  177 




















































  668 






























  458 
  442 
































  156 
    2 





























































  668 























  261 



















  619 



















  240 


























  132 











































































   34 



























  207 
























  207 




















    5 

























  151 








































































































































































  790 

















 1500 






















































































































































































  209 





















  168 

















 1496 














































































































































































































  262 

























  271 











































































































































































  209 



















  145 

























 1396 






















































































































































































































































































































  251 




















 1259 





































































































  681 

















  585 


























  476 


















































































































































  275 

































































































































































  186 
























































  142 
























  164 






























  181 
  121 

































































  142 
   22 




























   22 
   22 






















  186 


































































 1306 
















































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-fallback.sh
// DO NOT MODIFY THIS FILE DIRECTLY

#ifndef _LINUX_ATOMIC_FALLBACK_H
#define _LINUX_ATOMIC_FALLBACK_H

#include <linux/compiler.h>

/*
 * raw_xchg: alias for arch_xchg when the architecture defines it. Otherwise,
 * if arch_xchg_relaxed is defined, expand through __atomic_op_fence() (defined
 * outside this file; presumably it adds full fences around the relaxed op -
 * TODO confirm). With neither defined, every use expands to a call to
 * raw_xchg_not_implemented(), which is only declared here.
 */
#if defined(arch_xchg)
#define raw_xchg arch_xchg
#elif defined(arch_xchg_relaxed)
#define raw_xchg(...) \
        __atomic_op_fence(arch_xchg, __VA_ARGS__)
#else
extern void raw_xchg_not_implemented(void);
#define raw_xchg(...) raw_xchg_not_implemented()
#endif

/*
 * raw_xchg_acquire: selection order is arch_xchg_acquire, then
 * __atomic_op_acquire() applied to arch_xchg (taken when arch_xchg_relaxed is
 * defined), then plain arch_xchg. With none of these, uses expand to a call
 * to raw_xchg_acquire_not_implemented(), which is only declared here.
 */
#if defined(arch_xchg_acquire)
#define raw_xchg_acquire arch_xchg_acquire
#elif defined(arch_xchg_relaxed)
#define raw_xchg_acquire(...) \
        __atomic_op_acquire(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_acquire arch_xchg
#else
extern void raw_xchg_acquire_not_implemented(void);
#define raw_xchg_acquire(...) raw_xchg_acquire_not_implemented()
#endif

/*
 * raw_xchg_release: selection order is arch_xchg_release, then
 * __atomic_op_release() applied to arch_xchg (taken when arch_xchg_relaxed is
 * defined), then plain arch_xchg. With none of these, uses expand to a call
 * to raw_xchg_release_not_implemented(), which is only declared here.
 */
#if defined(arch_xchg_release)
#define raw_xchg_release arch_xchg_release
#elif defined(arch_xchg_relaxed)
#define raw_xchg_release(...) \
        __atomic_op_release(arch_xchg, __VA_ARGS__)
#elif defined(arch_xchg)
#define raw_xchg_release arch_xchg
#else
extern void raw_xchg_release_not_implemented(void);
#define raw_xchg_release(...) raw_xchg_release_not_implemented()
#endif

/*
 * raw_xchg_relaxed: alias arch_xchg_relaxed when defined, else fall back to
 * arch_xchg. With neither, uses expand to a call to
 * raw_xchg_relaxed_not_implemented(), which is only declared here.
 */
#if defined(arch_xchg_relaxed)
#define raw_xchg_relaxed arch_xchg_relaxed
#elif defined(arch_xchg)
#define raw_xchg_relaxed arch_xchg
#else
extern void raw_xchg_relaxed_not_implemented(void);
#define raw_xchg_relaxed(...) raw_xchg_relaxed_not_implemented()
#endif

/*
 * raw_cmpxchg: alias for arch_cmpxchg when the architecture defines it.
 * Otherwise, if arch_cmpxchg_relaxed is defined, expand through
 * __atomic_op_fence() (defined outside this file). With neither defined,
 * every use expands to a call to raw_cmpxchg_not_implemented(), which is only
 * declared here.
 */
#if defined(arch_cmpxchg)
#define raw_cmpxchg arch_cmpxchg
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg(...) \
        __atomic_op_fence(arch_cmpxchg, __VA_ARGS__)
#else
extern void raw_cmpxchg_not_implemented(void);
#define raw_cmpxchg(...) raw_cmpxchg_not_implemented()
#endif

/*
 * raw_cmpxchg_acquire: selection order is arch_cmpxchg_acquire, then
 * __atomic_op_acquire() applied to arch_cmpxchg (taken when
 * arch_cmpxchg_relaxed is defined), then plain arch_cmpxchg. With none of
 * these, uses expand to a call to raw_cmpxchg_acquire_not_implemented(),
 * which is only declared here.
 */
#if defined(arch_cmpxchg_acquire)
#define raw_cmpxchg_acquire arch_cmpxchg_acquire
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_acquire arch_cmpxchg
#else
extern void raw_cmpxchg_acquire_not_implemented(void);
#define raw_cmpxchg_acquire(...) raw_cmpxchg_acquire_not_implemented()
#endif

/*
 * raw_cmpxchg_release: selection order is arch_cmpxchg_release, then
 * __atomic_op_release() applied to arch_cmpxchg (taken when
 * arch_cmpxchg_relaxed is defined), then plain arch_cmpxchg. With none of
 * these, uses expand to a call to raw_cmpxchg_release_not_implemented(),
 * which is only declared here.
 */
#if defined(arch_cmpxchg_release)
#define raw_cmpxchg_release arch_cmpxchg_release
#elif defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_release(...) \
        __atomic_op_release(arch_cmpxchg, __VA_ARGS__)
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_release arch_cmpxchg
#else
extern void raw_cmpxchg_release_not_implemented(void);
#define raw_cmpxchg_release(...) raw_cmpxchg_release_not_implemented()
#endif

/*
 * raw_cmpxchg_relaxed: alias arch_cmpxchg_relaxed when defined, else fall
 * back to arch_cmpxchg. With neither, uses expand to a call to
 * raw_cmpxchg_relaxed_not_implemented(), which is only declared here.
 */
#if defined(arch_cmpxchg_relaxed)
#define raw_cmpxchg_relaxed arch_cmpxchg_relaxed
#elif defined(arch_cmpxchg)
#define raw_cmpxchg_relaxed arch_cmpxchg
#else
extern void raw_cmpxchg_relaxed_not_implemented(void);
#define raw_cmpxchg_relaxed(...) raw_cmpxchg_relaxed_not_implemented()
#endif

/*
 * raw_cmpxchg64: alias for arch_cmpxchg64 when the architecture defines it.
 * Otherwise, if arch_cmpxchg64_relaxed is defined, expand through
 * __atomic_op_fence() (defined outside this file). With neither defined,
 * every use expands to a call to raw_cmpxchg64_not_implemented(), which is
 * only declared here.
 */
#if defined(arch_cmpxchg64)
#define raw_cmpxchg64 arch_cmpxchg64
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64(...) \
        __atomic_op_fence(arch_cmpxchg64, __VA_ARGS__)
#else
extern void raw_cmpxchg64_not_implemented(void);
#define raw_cmpxchg64(...) raw_cmpxchg64_not_implemented()
#endif

/*
 * raw_cmpxchg64_acquire: selection order is arch_cmpxchg64_acquire, then
 * __atomic_op_acquire() applied to arch_cmpxchg64 (taken when
 * arch_cmpxchg64_relaxed is defined), then plain arch_cmpxchg64. With none of
 * these, uses expand to a call to raw_cmpxchg64_acquire_not_implemented(),
 * which is only declared here.
 */
#if defined(arch_cmpxchg64_acquire)
#define raw_cmpxchg64_acquire arch_cmpxchg64_acquire
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_acquire arch_cmpxchg64
#else
extern void raw_cmpxchg64_acquire_not_implemented(void);
#define raw_cmpxchg64_acquire(...) raw_cmpxchg64_acquire_not_implemented()
#endif

/*
 * raw_cmpxchg64_release: selection order is arch_cmpxchg64_release, then
 * __atomic_op_release() applied to arch_cmpxchg64 (taken when
 * arch_cmpxchg64_relaxed is defined), then plain arch_cmpxchg64. With none of
 * these, uses expand to a call to raw_cmpxchg64_release_not_implemented(),
 * which is only declared here.
 */
#if defined(arch_cmpxchg64_release)
#define raw_cmpxchg64_release arch_cmpxchg64_release
#elif defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_release(...) \
        __atomic_op_release(arch_cmpxchg64, __VA_ARGS__)
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_release arch_cmpxchg64
#else
extern void raw_cmpxchg64_release_not_implemented(void);
#define raw_cmpxchg64_release(...) raw_cmpxchg64_release_not_implemented()
#endif

/*
 * raw_cmpxchg64_relaxed: alias arch_cmpxchg64_relaxed when defined, else fall
 * back to arch_cmpxchg64. With neither, uses expand to a call to
 * raw_cmpxchg64_relaxed_not_implemented(), which is only declared here.
 */
#if defined(arch_cmpxchg64_relaxed)
#define raw_cmpxchg64_relaxed arch_cmpxchg64_relaxed
#elif defined(arch_cmpxchg64)
#define raw_cmpxchg64_relaxed arch_cmpxchg64
#else
extern void raw_cmpxchg64_relaxed_not_implemented(void);
#define raw_cmpxchg64_relaxed(...) raw_cmpxchg64_relaxed_not_implemented()
#endif

/*
 * raw_cmpxchg128: alias for arch_cmpxchg128 when the architecture defines it.
 * Otherwise, if arch_cmpxchg128_relaxed is defined, expand through
 * __atomic_op_fence() (defined outside this file). With neither defined,
 * every use expands to a call to raw_cmpxchg128_not_implemented(), which is
 * only declared here.
 */
#if defined(arch_cmpxchg128)
#define raw_cmpxchg128 arch_cmpxchg128
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128(...) \
        __atomic_op_fence(arch_cmpxchg128, __VA_ARGS__)
#else
extern void raw_cmpxchg128_not_implemented(void);
#define raw_cmpxchg128(...) raw_cmpxchg128_not_implemented()
#endif

/*
 * raw_cmpxchg128_acquire: selection order is arch_cmpxchg128_acquire, then
 * __atomic_op_acquire() applied to arch_cmpxchg128 (taken when
 * arch_cmpxchg128_relaxed is defined), then plain arch_cmpxchg128. With none
 * of these, uses expand to a call to raw_cmpxchg128_acquire_not_implemented(),
 * which is only declared here.
 */
#if defined(arch_cmpxchg128_acquire)
#define raw_cmpxchg128_acquire arch_cmpxchg128_acquire
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_acquire(...) \
        __atomic_op_acquire(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_acquire arch_cmpxchg128
#else
extern void raw_cmpxchg128_acquire_not_implemented(void);
#define raw_cmpxchg128_acquire(...) raw_cmpxchg128_acquire_not_implemented()
#endif

/*
 * raw_cmpxchg128_release: selection order is arch_cmpxchg128_release, then
 * __atomic_op_release() applied to arch_cmpxchg128 (taken when
 * arch_cmpxchg128_relaxed is defined), then plain arch_cmpxchg128. With none
 * of these, uses expand to a call to raw_cmpxchg128_release_not_implemented(),
 * which is only declared here.
 */
#if defined(arch_cmpxchg128_release)
#define raw_cmpxchg128_release arch_cmpxchg128_release
#elif defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_release(...) \
        __atomic_op_release(arch_cmpxchg128, __VA_ARGS__)
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_release arch_cmpxchg128
#else
extern void raw_cmpxchg128_release_not_implemented(void);
#define raw_cmpxchg128_release(...) raw_cmpxchg128_release_not_implemented()
#endif

/*
 * raw_cmpxchg128_relaxed: alias arch_cmpxchg128_relaxed when defined, else
 * fall back to arch_cmpxchg128. With neither, uses expand to a call to
 * raw_cmpxchg128_relaxed_not_implemented(), which is only declared here.
 */
#if defined(arch_cmpxchg128_relaxed)
#define raw_cmpxchg128_relaxed arch_cmpxchg128_relaxed
#elif defined(arch_cmpxchg128)
#define raw_cmpxchg128_relaxed arch_cmpxchg128
#else
extern void raw_cmpxchg128_relaxed_not_implemented(void);
#define raw_cmpxchg128_relaxed(...) raw_cmpxchg128_relaxed_not_implemented()
#endif

/*
 * raw_try_cmpxchg: prefer arch_try_cmpxchg, then __atomic_op_fence() when
 * arch_try_cmpxchg_relaxed is defined. The last-resort fallback is built on
 * raw_cmpxchg(): the expected value is read through @_oldp; if the value
 * returned by raw_cmpxchg() differs, it is written back through @_oldp. The
 * statement expression evaluates to whether the returned value matched the
 * expected one.
 */
#if defined(arch_try_cmpxchg)
#define raw_try_cmpxchg arch_try_cmpxchg
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg(...) \
        __atomic_op_fence(arch_try_cmpxchg, __VA_ARGS__)
#else
#define raw_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg_acquire: selection order is arch_try_cmpxchg_acquire, then
 * __atomic_op_acquire() (when arch_try_cmpxchg_relaxed is defined), then
 * plain arch_try_cmpxchg. The last-resort fallback wraps
 * raw_cmpxchg_acquire(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg_acquire)
#define raw_try_cmpxchg_acquire arch_try_cmpxchg_acquire
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_acquire arch_try_cmpxchg
#else
#define raw_try_cmpxchg_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg_release: selection order is arch_try_cmpxchg_release, then
 * __atomic_op_release() (when arch_try_cmpxchg_relaxed is defined), then
 * plain arch_try_cmpxchg. The last-resort fallback wraps
 * raw_cmpxchg_release(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg_release)
#define raw_try_cmpxchg_release arch_try_cmpxchg_release
#elif defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_release(...) \
        __atomic_op_release(arch_try_cmpxchg, __VA_ARGS__)
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_release arch_try_cmpxchg
#else
#define raw_try_cmpxchg_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg_relaxed: alias arch_try_cmpxchg_relaxed when defined, else
 * arch_try_cmpxchg. The last-resort fallback wraps raw_cmpxchg_relaxed(): on
 * mismatch the observed value is written back through @_oldp, and the
 * expression evaluates to whether the exchange matched.
 */
#if defined(arch_try_cmpxchg_relaxed)
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg_relaxed
#elif defined(arch_try_cmpxchg)
#define raw_try_cmpxchg_relaxed arch_try_cmpxchg
#else
#define raw_try_cmpxchg_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64: prefer arch_try_cmpxchg64, then __atomic_op_fence() when
 * arch_try_cmpxchg64_relaxed is defined. The last-resort fallback wraps
 * raw_cmpxchg64(): on mismatch the observed value is written back through
 * @_oldp, and the expression evaluates to whether the exchange matched.
 */
#if defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64 arch_try_cmpxchg64
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64(...) \
        __atomic_op_fence(arch_try_cmpxchg64, __VA_ARGS__)
#else
#define raw_try_cmpxchg64(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_acquire: selection order is arch_try_cmpxchg64_acquire,
 * then __atomic_op_acquire() (when arch_try_cmpxchg64_relaxed is defined),
 * then plain arch_try_cmpxchg64. The last-resort fallback wraps
 * raw_cmpxchg64_acquire(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg64_acquire)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64_acquire
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_acquire arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_release: selection order is arch_try_cmpxchg64_release,
 * then __atomic_op_release() (when arch_try_cmpxchg64_relaxed is defined),
 * then plain arch_try_cmpxchg64. The last-resort fallback wraps
 * raw_cmpxchg64_release(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg64_release)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64_release
#elif defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_release(...) \
        __atomic_op_release(arch_try_cmpxchg64, __VA_ARGS__)
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_release arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg64_relaxed: alias arch_try_cmpxchg64_relaxed when defined,
 * else arch_try_cmpxchg64. The last-resort fallback wraps
 * raw_cmpxchg64_relaxed(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg64_relaxed)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64_relaxed
#elif defined(arch_try_cmpxchg64)
#define raw_try_cmpxchg64_relaxed arch_try_cmpxchg64
#else
#define raw_try_cmpxchg64_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128: prefer arch_try_cmpxchg128, then __atomic_op_fence()
 * when arch_try_cmpxchg128_relaxed is defined. The last-resort fallback wraps
 * raw_cmpxchg128(): on mismatch the observed value is written back through
 * @_oldp, and the expression evaluates to whether the exchange matched.
 */
#if defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128 arch_try_cmpxchg128
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128(...) \
        __atomic_op_fence(arch_try_cmpxchg128, __VA_ARGS__)
#else
#define raw_try_cmpxchg128(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_acquire: selection order is arch_try_cmpxchg128_acquire,
 * then __atomic_op_acquire() (when arch_try_cmpxchg128_relaxed is defined),
 * then plain arch_try_cmpxchg128. The last-resort fallback wraps
 * raw_cmpxchg128_acquire(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg128_acquire)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128_acquire
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_acquire(...) \
        __atomic_op_acquire(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_acquire arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_acquire(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_acquire((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_release: selection order is arch_try_cmpxchg128_release,
 * then __atomic_op_release() (when arch_try_cmpxchg128_relaxed is defined),
 * then plain arch_try_cmpxchg128. The last-resort fallback wraps
 * raw_cmpxchg128_release(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg128_release)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128_release
#elif defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_release(...) \
        __atomic_op_release(arch_try_cmpxchg128, __VA_ARGS__)
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_release arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_release(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_release((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_try_cmpxchg128_relaxed: alias arch_try_cmpxchg128_relaxed when defined,
 * else arch_try_cmpxchg128. The last-resort fallback wraps
 * raw_cmpxchg128_relaxed(): on mismatch the observed value is written back
 * through @_oldp, and the expression evaluates to whether the exchange
 * matched.
 */
#if defined(arch_try_cmpxchg128_relaxed)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128_relaxed
#elif defined(arch_try_cmpxchg128)
#define raw_try_cmpxchg128_relaxed arch_try_cmpxchg128
#else
#define raw_try_cmpxchg128_relaxed(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_relaxed((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_cmpxchg_local is an unconditional alias for arch_cmpxchg_local; no
 * fallback is provided here if the architecture does not define it.
 */
#define raw_cmpxchg_local arch_cmpxchg_local

/*
 * raw_try_cmpxchg_local: alias arch_try_cmpxchg_local when defined. Otherwise
 * it is emulated with raw_cmpxchg_local(): on mismatch the observed value is
 * written back through @_oldp, and the expression evaluates to whether the
 * exchange matched.
 */
#ifdef arch_try_cmpxchg_local
#define raw_try_cmpxchg_local arch_try_cmpxchg_local
#else
#define raw_try_cmpxchg_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_cmpxchg64_local is an unconditional alias for arch_cmpxchg64_local; no
 * fallback is provided here if the architecture does not define it.
 */
#define raw_cmpxchg64_local arch_cmpxchg64_local

/*
 * raw_try_cmpxchg64_local: alias arch_try_cmpxchg64_local when defined.
 * Otherwise it is emulated with raw_cmpxchg64_local(): on mismatch the
 * observed value is written back through @_oldp, and the expression evaluates
 * to whether the exchange matched.
 */
#ifdef arch_try_cmpxchg64_local
#define raw_try_cmpxchg64_local arch_try_cmpxchg64_local
#else
#define raw_try_cmpxchg64_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg64_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_cmpxchg128_local is an unconditional alias for arch_cmpxchg128_local;
 * no fallback is provided here if the architecture does not define it.
 */
#define raw_cmpxchg128_local arch_cmpxchg128_local

/*
 * raw_try_cmpxchg128_local: alias arch_try_cmpxchg128_local when defined.
 * Otherwise it is emulated with raw_cmpxchg128_local(): on mismatch the
 * observed value is written back through @_oldp, and the expression evaluates
 * to whether the exchange matched.
 */
#ifdef arch_try_cmpxchg128_local
#define raw_try_cmpxchg128_local arch_try_cmpxchg128_local
#else
#define raw_try_cmpxchg128_local(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_cmpxchg128_local((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/*
 * raw_sync_cmpxchg is an unconditional alias for arch_sync_cmpxchg; no
 * fallback is provided here if the architecture does not define it.
 */
#define raw_sync_cmpxchg arch_sync_cmpxchg

/*
 * raw_sync_try_cmpxchg: alias arch_sync_try_cmpxchg when defined. Otherwise
 * it is emulated with raw_sync_cmpxchg(): on mismatch the observed value is
 * written back through @_oldp, and the expression evaluates to whether the
 * exchange matched.
 */
#ifdef arch_sync_try_cmpxchg
#define raw_sync_try_cmpxchg arch_sync_try_cmpxchg
#else
#define raw_sync_try_cmpxchg(_ptr, _oldp, _new) \
({ \
        typeof(*(_ptr)) *___op = (_oldp), ___o = *___op, ___r; \
        ___r = raw_sync_cmpxchg((_ptr), ___o, (_new)); \
        if (unlikely(___r != ___o)) \
                *___op = ___r; \
        likely(___r == ___o); \
})
#endif

/**
 * raw_atomic_read() - relaxed atomic load
 * @v: pointer to atomic_t
 *
 * Loads the value of @v atomically, with relaxed ordering, by forwarding
 * straight to arch_atomic_read().
 *
 * Usable from noinstr code; other callers should prefer atomic_read().
 *
 * Return: The value loaded from @v.
 */
static __always_inline int raw_atomic_read(const atomic_t *v)
{
        int val = arch_atomic_read(v);

        return val;
}

/**
 * raw_atomic_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic_t
 *
 * Loads the value of @v atomically with acquire ordering. The architecture's
 * arch_atomic_read_acquire() is used when it exists; otherwise the load is
 * either an smp_load_acquire() of the counter (native-word atomic_t) or a
 * relaxed read followed by an acquire fence.
 *
 * Usable from noinstr code; other callers should prefer atomic_read_acquire().
 *
 * Return: The value loaded from @v.
 */
static __always_inline int raw_atomic_read_acquire(const atomic_t *v)
{
#ifdef arch_atomic_read_acquire
        return arch_atomic_read_acquire(v);
#else
        int val;

        /* A native-word atomic_t can be loaded with smp_load_acquire(). */
        if (__native_word(atomic_t))
                return smp_load_acquire(&v->counter);

        /* Otherwise: relaxed read, then an explicit acquire fence. */
        val = raw_atomic_read(v);
        __atomic_acquire_fence();

        return val;
#endif
}

/**
 * raw_atomic_set() - relaxed atomic store
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Stores @i into @v atomically, with relaxed ordering, by forwarding straight
 * to arch_atomic_set().
 *
 * Usable from noinstr code; other callers should prefer atomic_set().
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set(atomic_t *v, int i)
{
        arch_atomic_set(v, i);
}

/**
 * raw_atomic_set_release() - atomic store with release ordering
 * @v: pointer to atomic_t
 * @i: int value to assign
 *
 * Stores @i into @v atomically with release ordering. The architecture's
 * arch_atomic_set_release() is used when it exists; otherwise the store is
 * either an smp_store_release() to the counter (native-word atomic_t) or a
 * release fence followed by a relaxed set.
 *
 * Usable from noinstr code; other callers should prefer atomic_set_release().
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_set_release(atomic_t *v, int i)
{
#ifdef arch_atomic_set_release
        arch_atomic_set_release(v, i);
#else
        if (!__native_word(atomic_t)) {
                /* Explicit release fence ahead of the relaxed store. */
                __atomic_release_fence();
                raw_atomic_set(v, i);
        } else {
                smp_store_release(&v->counter, i);
        }
#endif
}

/**
 * raw_atomic_add() - relaxed atomic add
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i), with relaxed ordering, by forwarding
 * straight to arch_atomic_add().
 *
 * Usable from noinstr code; other callers should prefer atomic_add().
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_add(int i, atomic_t *v)
{
        arch_atomic_add(i, v);
}

/**
 * raw_atomic_add_return() - fully ordered atomic add, returning the new value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with full ordering. Uses
 * arch_atomic_add_return() when available; otherwise the relaxed variant is
 * bracketed by the pre/post full fences. The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer atomic_add_return().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return
        return arch_atomic_add_return(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int updated;

        /* Relaxed op sandwiched between the full-fence helpers. */
        __atomic_pre_full_fence();
        updated = arch_atomic_add_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
#error "Unable to define raw_atomic_add_return"
#endif
}

/**
 * raw_atomic_add_return_acquire() - acquire-ordered atomic add, returning the new value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with acquire ordering. Preference
 * order: the arch acquire variant, the relaxed variant followed by an acquire
 * fence, then the unsuffixed arch_atomic_add_return(). The build fails if
 * none exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_add_return_acquire().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_acquire
        return arch_atomic_add_return_acquire(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int updated;

        /* Relaxed op first, acquire fence afterwards. */
        updated = arch_atomic_add_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_acquire"
#endif
}

/**
 * raw_atomic_add_return_release() - release-ordered atomic add, returning the new value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with release ordering. Preference
 * order: the arch release variant, a release fence followed by the relaxed
 * variant, then the unsuffixed arch_atomic_add_return(). The build fails if
 * none exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_add_return_release().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_release
        return arch_atomic_add_return_release(i, v);
#elif defined(arch_atomic_add_return_relaxed)
        int updated;

        /* Release fence is issued before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic_add_return_relaxed(i, v);

        return updated;
#elif defined(arch_atomic_add_return)
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_release"
#endif
}

/**
 * raw_atomic_add_return_relaxed() - relaxed atomic add, returning the new value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with relaxed ordering. Uses the arch
 * relaxed variant when available, else the unsuffixed
 * arch_atomic_add_return(). The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_add_return_relaxed().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_add_return_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_add_return_relaxed
        return arch_atomic_add_return_relaxed(i, v);
#elif defined(arch_atomic_add_return)
        /* No relaxed variant: the unsuffixed op stands in for it. */
        return arch_atomic_add_return(i, v);
#else
#error "Unable to define raw_atomic_add_return_relaxed"
#endif
}

/**
 * raw_atomic_fetch_add() - fully ordered atomic add, returning the old value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with full ordering. Uses
 * arch_atomic_fetch_add() when available; otherwise the relaxed variant is
 * bracketed by the pre/post full fences. The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer atomic_fetch_add().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add
        return arch_atomic_fetch_add(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int prev;

        /* Relaxed op sandwiched between the full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_add"
#endif
}

/**
 * raw_atomic_fetch_add_acquire() - acquire-ordered atomic add, returning the old value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with acquire ordering. Preference
 * order: the arch acquire variant, the relaxed variant followed by an acquire
 * fence, then the unsuffixed arch_atomic_fetch_add(). The build fails if none
 * exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_fetch_add_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_acquire
        return arch_atomic_fetch_add_acquire(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int prev;

        /* Relaxed op first, acquire fence afterwards. */
        prev = arch_atomic_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_acquire"
#endif
}

/**
 * raw_atomic_fetch_add_release() - release-ordered atomic add, returning the old value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with release ordering. Preference
 * order: the arch release variant, a release fence followed by the relaxed
 * variant, then the unsuffixed arch_atomic_fetch_add(). The build fails if
 * none exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_fetch_add_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_release
        return arch_atomic_fetch_add_release(i, v);
#elif defined(arch_atomic_fetch_add_relaxed)
        int prev;

        /* Release fence is issued before the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_add_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_add)
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_release"
#endif
}

/**
 * raw_atomic_fetch_add_relaxed() - relaxed atomic add, returning the old value
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + @i) with relaxed ordering. Uses the arch
 * relaxed variant when available, else the unsuffixed
 * arch_atomic_fetch_add(). The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_fetch_add_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_add_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_add_relaxed
        return arch_atomic_fetch_add_relaxed(i, v);
#elif defined(arch_atomic_fetch_add)
        /* No relaxed variant: the unsuffixed op stands in for it. */
        return arch_atomic_fetch_add(i, v);
#else
#error "Unable to define raw_atomic_fetch_add_relaxed"
#endif
}

/**
 * raw_atomic_sub() - relaxed atomic subtract
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i), with relaxed ordering, by forwarding
 * straight to arch_atomic_sub().
 *
 * Usable from noinstr code; other callers should prefer atomic_sub().
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_sub(int i, atomic_t *v)
{
        arch_atomic_sub(i, v);
}

/**
 * raw_atomic_sub_return() - fully ordered atomic subtract, returning the new value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with full ordering. Uses
 * arch_atomic_sub_return() when available; otherwise the relaxed variant is
 * bracketed by the pre/post full fences. The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer atomic_sub_return().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return
        return arch_atomic_sub_return(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int updated;

        /* Relaxed op sandwiched between the full-fence helpers. */
        __atomic_pre_full_fence();
        updated = arch_atomic_sub_return_relaxed(i, v);
        __atomic_post_full_fence();

        return updated;
#else
#error "Unable to define raw_atomic_sub_return"
#endif
}

/**
 * raw_atomic_sub_return_acquire() - acquire-ordered atomic subtract, returning the new value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with acquire ordering. Preference
 * order: the arch acquire variant, the relaxed variant followed by an acquire
 * fence, then the unsuffixed arch_atomic_sub_return(). The build fails if
 * none exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_sub_return_acquire().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_acquire
        return arch_atomic_sub_return_acquire(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int updated;

        /* Relaxed op first, acquire fence afterwards. */
        updated = arch_atomic_sub_return_relaxed(i, v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_acquire"
#endif
}

/**
 * raw_atomic_sub_return_release() - release-ordered atomic subtract, returning the new value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with release ordering. Preference
 * order: the arch release variant, a release fence followed by the relaxed
 * variant, then the unsuffixed arch_atomic_sub_return(). The build fails if
 * none exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_sub_return_release().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_release(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_release
        return arch_atomic_sub_return_release(i, v);
#elif defined(arch_atomic_sub_return_relaxed)
        int updated;

        /* Release fence is issued before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic_sub_return_relaxed(i, v);

        return updated;
#elif defined(arch_atomic_sub_return)
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_release"
#endif
}

/**
 * raw_atomic_sub_return_relaxed() - relaxed atomic subtract, returning the new value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with relaxed ordering. Uses the arch
 * relaxed variant when available, else the unsuffixed
 * arch_atomic_sub_return(). The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_sub_return_relaxed().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_sub_return_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_sub_return_relaxed
        return arch_atomic_sub_return_relaxed(i, v);
#elif defined(arch_atomic_sub_return)
        /* No relaxed variant: the unsuffixed op stands in for it. */
        return arch_atomic_sub_return(i, v);
#else
#error "Unable to define raw_atomic_sub_return_relaxed"
#endif
}

/**
 * raw_atomic_fetch_sub() - fully ordered atomic subtract, returning the old value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with full ordering. Uses
 * arch_atomic_fetch_sub() when available; otherwise the relaxed variant is
 * bracketed by the pre/post full fences. The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer atomic_fetch_sub().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub
        return arch_atomic_fetch_sub(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int prev;

        /* Relaxed op sandwiched between the full-fence helpers. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();

        return prev;
#else
#error "Unable to define raw_atomic_fetch_sub"
#endif
}

/**
 * raw_atomic_fetch_sub_acquire() - acquire-ordered atomic subtract, returning the old value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with acquire ordering. Preference
 * order: the arch acquire variant, the relaxed variant followed by an acquire
 * fence, then the unsuffixed arch_atomic_fetch_sub(). The build fails if none
 * exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_fetch_sub_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_acquire
        return arch_atomic_fetch_sub_acquire(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int prev;

        /* Relaxed op first, acquire fence afterwards. */
        prev = arch_atomic_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();

        return prev;
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic_fetch_sub_release() - release-ordered atomic subtract, returning the old value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with release ordering. Preference
 * order: the arch release variant, a release fence followed by the relaxed
 * variant, then the unsuffixed arch_atomic_fetch_sub(). The build fails if
 * none exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_fetch_sub_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_release(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_release
        return arch_atomic_fetch_sub_release(i, v);
#elif defined(arch_atomic_fetch_sub_relaxed)
        int prev;

        /* Release fence is issued before the relaxed op. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_sub_relaxed(i, v);

        return prev;
#elif defined(arch_atomic_fetch_sub)
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_release"
#endif
}

/**
 * raw_atomic_fetch_sub_relaxed() - relaxed atomic subtract, returning the old value
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v - @i) with relaxed ordering. Uses the arch
 * relaxed variant when available, else the unsuffixed
 * arch_atomic_fetch_sub(). The build fails if neither exists.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_fetch_sub_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int raw_atomic_fetch_sub_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_fetch_sub_relaxed
        return arch_atomic_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic_fetch_sub)
        /* No relaxed variant: the unsuffixed op stands in for it. */
        return arch_atomic_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic_inc() - relaxed atomic increment
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with relaxed ordering. Uses
 * arch_atomic_inc() when the architecture provides it, otherwise
 * raw_atomic_add() with an increment of 1.
 *
 * Usable from noinstr code; other callers should prefer atomic_inc().
 *
 * Return: Nothing.
 */
static __always_inline void raw_atomic_inc(atomic_t *v)
{
#ifndef arch_atomic_inc
        /* No dedicated arch op: express the increment as an add of 1. */
        raw_atomic_add(1, v);
#else
        arch_atomic_inc(v);
#endif
}

/**
 * raw_atomic_inc_return() - fully ordered atomic increment, returning the new value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with full ordering. Preference order:
 * arch_atomic_inc_return(), the relaxed variant bracketed by the pre/post
 * full fences, then raw_atomic_add_return() with an increment of 1.
 *
 * Usable from noinstr code; other callers should prefer atomic_inc_return().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return(atomic_t *v)
{
#ifdef arch_atomic_inc_return
        return arch_atomic_inc_return(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        /* Relaxed op sandwiched between the full-fence helpers. */
        __atomic_pre_full_fence();
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_post_full_fence();

        return updated;
#else
        /* No arch increment at all: add 1 with the same ordering. */
        return raw_atomic_add_return(1, v);
#endif
}

/**
 * raw_atomic_inc_return_acquire() - acquire-ordered atomic increment, returning the new value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with acquire ordering. Preference
 * order: the arch acquire variant, the relaxed variant followed by an acquire
 * fence, the unsuffixed arch_atomic_inc_return(), then
 * raw_atomic_add_return_acquire() with an increment of 1.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_inc_return_acquire().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return_acquire(atomic_t *v)
{
#ifdef arch_atomic_inc_return_acquire
        return arch_atomic_inc_return_acquire(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        /* Relaxed op first, acquire fence afterwards. */
        updated = arch_atomic_inc_return_relaxed(v);
        __atomic_acquire_fence();

        return updated;
#elif defined(arch_atomic_inc_return)
        return arch_atomic_inc_return(v);
#else
        /* No arch increment at all: add 1 with the same ordering. */
        return raw_atomic_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic_inc_return_release() - release-ordered atomic increment, returning the new value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with release ordering. Preference
 * order: the arch release variant, a release fence followed by the relaxed
 * variant, the unsuffixed arch_atomic_inc_return(), then
 * raw_atomic_add_return_release() with an increment of 1.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_inc_return_release().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return_release(atomic_t *v)
{
#ifdef arch_atomic_inc_return_release
        return arch_atomic_inc_return_release(v);
#elif defined(arch_atomic_inc_return_relaxed)
        int updated;

        /* Release fence is issued before the relaxed op. */
        __atomic_release_fence();
        updated = arch_atomic_inc_return_relaxed(v);

        return updated;
#elif defined(arch_atomic_inc_return)
        return arch_atomic_inc_return(v);
#else
        /* No arch increment at all: add 1 with the same ordering. */
        return raw_atomic_add_return_release(1, v);
#endif
}

/**
 * raw_atomic_inc_return_relaxed() - relaxed atomic increment, returning the new value
 * @v: pointer to atomic_t
 *
 * Atomically replaces @v with (@v + 1) with relaxed ordering. Preference
 * order: the arch relaxed variant, the unsuffixed arch_atomic_inc_return(),
 * then raw_atomic_add_return_relaxed() with an increment of 1.
 *
 * Usable from noinstr code; other callers should prefer
 * atomic_inc_return_relaxed().
 *
 * Return: The updated value of @v.
 */
static __always_inline int raw_atomic_inc_return_relaxed(atomic_t *v)
{
#ifdef arch_atomic_inc_return_relaxed
        return arch_atomic_inc_return_relaxed(v);
#elif defined(arch_atomic_inc_return)
        /* No relaxed variant: the unsuffixed op stands in for it. */
        return arch_atomic_inc_return(v);
#else
        /* No arch increment at all: add 1 with the same ordering. */
        return raw_atomic_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v + 1), with full ordering.
 *
 * Uses arch_atomic_fetch_inc() when it is defined; otherwise
 * arch_atomic_fetch_inc_relaxed() bracketed by the pre/post full fences;
 * otherwise raw_atomic_fetch_add() with an increment of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_inc().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_inc(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc
        prev = arch_atomic_fetch_inc(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic_fetch_add(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v + 1), with acquire ordering.
 *
 * Uses arch_atomic_fetch_inc_acquire() when it is defined; otherwise
 * arch_atomic_fetch_inc_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_fetch_inc(); otherwise raw_atomic_fetch_add_acquire() with
 * an increment of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_inc_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_inc_acquire(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc_acquire
        prev = arch_atomic_fetch_inc_acquire(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_fetch_inc_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_inc)
        prev = arch_atomic_fetch_inc(v);
#else
        prev = raw_atomic_fetch_add_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v + 1), with release ordering.
 *
 * Uses arch_atomic_fetch_inc_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_fetch_inc_relaxed(); otherwise
 * arch_atomic_fetch_inc(); otherwise raw_atomic_fetch_add_release() with
 * an increment of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_inc_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_inc_release(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc_release
        prev = arch_atomic_fetch_inc_release(v);
#elif defined(arch_atomic_fetch_inc_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        prev = arch_atomic_fetch_inc(v);
#else
        prev = raw_atomic_fetch_add_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v + 1), with relaxed ordering.
 *
 * Uses arch_atomic_fetch_inc_relaxed() when it is defined; otherwise
 * arch_atomic_fetch_inc(); otherwise raw_atomic_fetch_add_relaxed() with
 * an increment of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_inc_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_inc_relaxed(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_inc_relaxed
        prev = arch_atomic_fetch_inc_relaxed(v);
#elif defined(arch_atomic_fetch_inc)
        prev = arch_atomic_fetch_inc(v);
#else
        prev = raw_atomic_fetch_add_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with relaxed ordering.
 *
 * Uses arch_atomic_dec() when it is defined; otherwise raw_atomic_sub()
 * with a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_dec().
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_dec(atomic_t *v)
{
#ifndef arch_atomic_dec
        /* No dedicated arch op: express the decrement as a subtraction. */
        raw_atomic_sub(1, v);
#else
        arch_atomic_dec(v);
#endif
}

/**
 * raw_atomic_dec_return() - atomic decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with full ordering.
 *
 * Uses arch_atomic_dec_return() when it is defined; otherwise
 * arch_atomic_dec_return_relaxed() bracketed by the pre/post full fences;
 * otherwise raw_atomic_sub_return() with a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_dec_return().
 *
 * Return: The updated value of @v.
 */
static __always_inline int
raw_atomic_dec_return(atomic_t *v)
{
        int val;

#ifdef arch_atomic_dec_return
        val = arch_atomic_dec_return(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        val = arch_atomic_dec_return_relaxed(v);
        __atomic_post_full_fence();
#else
        val = raw_atomic_sub_return(1, v);
#endif
        return val;
}

/**
 * raw_atomic_dec_return_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with acquire ordering.
 *
 * Uses arch_atomic_dec_return_acquire() when it is defined; otherwise
 * arch_atomic_dec_return_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_dec_return(); otherwise raw_atomic_sub_return_acquire() with
 * a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_dec_return_acquire().
 *
 * Return: The updated value of @v.
 */
static __always_inline int
raw_atomic_dec_return_acquire(atomic_t *v)
{
        int val;

#ifdef arch_atomic_dec_return_acquire
        val = arch_atomic_dec_return_acquire(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        val = arch_atomic_dec_return_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_dec_return)
        val = arch_atomic_dec_return(v);
#else
        val = raw_atomic_sub_return_acquire(1, v);
#endif
        return val;
}

/**
 * raw_atomic_dec_return_release() - atomic decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with release ordering.
 *
 * Uses arch_atomic_dec_return_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_dec_return_relaxed(); otherwise
 * arch_atomic_dec_return(); otherwise raw_atomic_sub_return_release() with
 * a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_dec_return_release().
 *
 * Return: The updated value of @v.
 */
static __always_inline int
raw_atomic_dec_return_release(atomic_t *v)
{
        int val;

#ifdef arch_atomic_dec_return_release
        val = arch_atomic_dec_return_release(v);
#elif defined(arch_atomic_dec_return_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        val = arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        val = arch_atomic_dec_return(v);
#else
        val = raw_atomic_sub_return_release(1, v);
#endif
        return val;
}

/**
 * raw_atomic_dec_return_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with relaxed ordering.
 *
 * Uses arch_atomic_dec_return_relaxed() when it is defined; otherwise
 * arch_atomic_dec_return(); otherwise raw_atomic_sub_return_relaxed() with
 * a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_dec_return_relaxed().
 *
 * Return: The updated value of @v.
 */
static __always_inline int
raw_atomic_dec_return_relaxed(atomic_t *v)
{
        int val;

#ifdef arch_atomic_dec_return_relaxed
        val = arch_atomic_dec_return_relaxed(v);
#elif defined(arch_atomic_dec_return)
        val = arch_atomic_dec_return(v);
#else
        val = raw_atomic_sub_return_relaxed(1, v);
#endif
        return val;
}

/**
 * raw_atomic_fetch_dec() - atomic decrement with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with full ordering.
 *
 * Uses arch_atomic_fetch_dec() when it is defined; otherwise
 * arch_atomic_fetch_dec_relaxed() bracketed by the pre/post full fences;
 * otherwise raw_atomic_fetch_sub() with a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_dec().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_dec(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec
        prev = arch_atomic_fetch_dec(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic_fetch_sub(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_dec_acquire() - atomic decrement with acquire ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with acquire ordering.
 *
 * Uses arch_atomic_fetch_dec_acquire() when it is defined; otherwise
 * arch_atomic_fetch_dec_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_fetch_dec(); otherwise raw_atomic_fetch_sub_acquire() with
 * a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_dec_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_dec_acquire(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec_acquire
        prev = arch_atomic_fetch_dec_acquire(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_fetch_dec_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_dec)
        prev = arch_atomic_fetch_dec(v);
#else
        prev = raw_atomic_fetch_sub_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_dec_release() - atomic decrement with release ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with release ordering.
 *
 * Uses arch_atomic_fetch_dec_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_fetch_dec_relaxed(); otherwise
 * arch_atomic_fetch_dec(); otherwise raw_atomic_fetch_sub_release() with
 * a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_dec_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_dec_release(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec_release
        prev = arch_atomic_fetch_dec_release(v);
#elif defined(arch_atomic_fetch_dec_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        prev = arch_atomic_fetch_dec(v);
#else
        prev = raw_atomic_fetch_sub_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_dec_relaxed() - atomic decrement with relaxed ordering
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v - 1), with relaxed ordering.
 *
 * Uses arch_atomic_fetch_dec_relaxed() when it is defined; otherwise
 * arch_atomic_fetch_dec(); otherwise raw_atomic_fetch_sub_relaxed() with
 * a decrement of 1.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_dec_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_dec_relaxed(atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_dec_relaxed
        prev = arch_atomic_fetch_dec_relaxed(v);
#elif defined(arch_atomic_fetch_dec)
        prev = arch_atomic_fetch_dec(v);
#else
        prev = raw_atomic_fetch_sub_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic_and() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This wrapper has no fallback path: it calls arch_atomic_and()
 * unconditionally.
 *
 * Safe to use in noinstr code; prefer atomic_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_and(int i, atomic_t *v)
{
        arch_atomic_and(i, v);
}

/**
 * raw_atomic_fetch_and() - atomic bitwise AND with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & @i), with full ordering.
 *
 * Uses arch_atomic_fetch_and() when it is defined; otherwise
 * arch_atomic_fetch_and_relaxed() bracketed by the pre/post full fences.
 * If neither is defined the build stops with an #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_and().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_and(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and
        prev = arch_atomic_fetch_and(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_and"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_and_acquire() - atomic bitwise AND with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & @i), with acquire ordering.
 *
 * Uses arch_atomic_fetch_and_acquire() when it is defined; otherwise
 * arch_atomic_fetch_and_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_fetch_and(). If none is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_and_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_and_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and_acquire
        prev = arch_atomic_fetch_and_acquire(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_and)
        prev = arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_and_release() - atomic bitwise AND with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & @i), with release ordering.
 *
 * Uses arch_atomic_fetch_and_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_fetch_and_relaxed(); otherwise
 * arch_atomic_fetch_and(). If none is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_and_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_and_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and_release
        prev = arch_atomic_fetch_and_release(i, v);
#elif defined(arch_atomic_fetch_and_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        prev = arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_and_relaxed() - atomic bitwise AND with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & @i), with relaxed ordering.
 *
 * Uses arch_atomic_fetch_and_relaxed() when it is defined; otherwise
 * arch_atomic_fetch_and(). If neither is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_and_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_and_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_and_relaxed
        prev = arch_atomic_fetch_and_relaxed(i, v);
#elif defined(arch_atomic_fetch_and)
        prev = arch_atomic_fetch_and(i, v);
#else
#error "Unable to define raw_atomic_fetch_and_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic_andnot() - atomic bitwise AND NOT with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & ~@i), with relaxed
 * ordering.
 *
 * Uses arch_atomic_andnot() when it is defined; otherwise raw_atomic_and()
 * with the complemented mask.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_andnot().
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_andnot(int i, atomic_t *v)
{
#ifndef arch_atomic_andnot
        /* No dedicated arch op: AND with the inverted mask instead. */
        raw_atomic_and(~i, v);
#else
        arch_atomic_andnot(i, v);
#endif
}

/**
 * raw_atomic_fetch_andnot() - atomic bitwise AND NOT with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & ~@i), with full ordering.
 *
 * Uses arch_atomic_fetch_andnot() when it is defined; otherwise
 * arch_atomic_fetch_andnot_relaxed() bracketed by the pre/post full fences;
 * otherwise raw_atomic_fetch_and() with the complemented mask.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_andnot().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_andnot(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot
        prev = arch_atomic_fetch_andnot(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic_fetch_and(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_andnot_acquire() - atomic bitwise AND NOT with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & ~@i), with acquire
 * ordering.
 *
 * Uses arch_atomic_fetch_andnot_acquire() when it is defined; otherwise
 * arch_atomic_fetch_andnot_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_fetch_andnot(); otherwise raw_atomic_fetch_and_acquire() with
 * the complemented mask.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_andnot_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_andnot_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot_acquire
        prev = arch_atomic_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_andnot)
        prev = arch_atomic_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_and_acquire(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_andnot_release() - atomic bitwise AND NOT with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & ~@i), with release
 * ordering.
 *
 * Uses arch_atomic_fetch_andnot_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_fetch_andnot_relaxed(); otherwise
 * arch_atomic_fetch_andnot(); otherwise raw_atomic_fetch_and_release() with
 * the complemented mask.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_andnot_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_andnot_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot_release
        prev = arch_atomic_fetch_andnot_release(i, v);
#elif defined(arch_atomic_fetch_andnot_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        prev = arch_atomic_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_and_release(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_fetch_andnot_relaxed() - atomic bitwise AND NOT with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v & ~@i), with relaxed
 * ordering.
 *
 * Uses arch_atomic_fetch_andnot_relaxed() when it is defined; otherwise
 * arch_atomic_fetch_andnot(); otherwise raw_atomic_fetch_and_relaxed() with
 * the complemented mask.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_andnot_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_andnot_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_andnot_relaxed
        prev = arch_atomic_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic_fetch_andnot)
        prev = arch_atomic_fetch_andnot(i, v);
#else
        prev = raw_atomic_fetch_and_relaxed(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic_or() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * This wrapper has no fallback path: it calls arch_atomic_or()
 * unconditionally.
 *
 * Safe to use in noinstr code; prefer atomic_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_or(int i, atomic_t *v)
{
        arch_atomic_or(i, v);
}

/**
 * raw_atomic_fetch_or() - atomic bitwise OR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v | @i), with full ordering.
 *
 * Uses arch_atomic_fetch_or() when it is defined; otherwise
 * arch_atomic_fetch_or_relaxed() bracketed by the pre/post full fences.
 * If neither is defined the build stops with an #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_or().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_or(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or
        prev = arch_atomic_fetch_or(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_or"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_or_acquire() - atomic bitwise OR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v | @i), with acquire ordering.
 *
 * Uses arch_atomic_fetch_or_acquire() when it is defined; otherwise
 * arch_atomic_fetch_or_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_fetch_or(). If none is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_or_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_or_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or_acquire
        prev = arch_atomic_fetch_or_acquire(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_or)
        prev = arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_or_release() - atomic bitwise OR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v | @i), with release ordering.
 *
 * Uses arch_atomic_fetch_or_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_fetch_or_relaxed(); otherwise
 * arch_atomic_fetch_or(). If none is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_or_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_or_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or_release
        prev = arch_atomic_fetch_or_release(i, v);
#elif defined(arch_atomic_fetch_or_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        prev = arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_or_relaxed() - atomic bitwise OR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v | @i), with relaxed ordering.
 *
 * Uses arch_atomic_fetch_or_relaxed() when it is defined; otherwise
 * arch_atomic_fetch_or(). If neither is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_or_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_or_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_or_relaxed
        prev = arch_atomic_fetch_or_relaxed(i, v);
#elif defined(arch_atomic_fetch_or)
        prev = arch_atomic_fetch_or(i, v);
#else
#error "Unable to define raw_atomic_fetch_or_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic_xor() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * This wrapper has no fallback path: it calls arch_atomic_xor()
 * unconditionally.
 *
 * Safe to use in noinstr code; prefer atomic_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic_xor(int i, atomic_t *v)
{
        arch_atomic_xor(i, v);
}

/**
 * raw_atomic_fetch_xor() - atomic bitwise XOR with full ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v ^ @i), with full ordering.
 *
 * Uses arch_atomic_fetch_xor() when it is defined; otherwise
 * arch_atomic_fetch_xor_relaxed() bracketed by the pre/post full fences.
 * If neither is defined the build stops with an #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_xor().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_xor
        prev = arch_atomic_fetch_xor(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic_fetch_xor"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_xor_acquire() - atomic bitwise XOR with acquire ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v ^ @i), with acquire ordering.
 *
 * Uses arch_atomic_fetch_xor_acquire() when it is defined; otherwise
 * arch_atomic_fetch_xor_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_fetch_xor(). If none is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_xor_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor_acquire(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_xor_acquire
        prev = arch_atomic_fetch_xor_acquire(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic_fetch_xor)
        prev = arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_acquire"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_xor_release() - atomic bitwise XOR with release ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v ^ @i), with release ordering.
 *
 * Uses arch_atomic_fetch_xor_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_fetch_xor_relaxed(); otherwise
 * arch_atomic_fetch_xor(). If none is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_xor_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor_release(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_xor_release
        prev = arch_atomic_fetch_xor_release(i, v);
#elif defined(arch_atomic_fetch_xor_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic_fetch_xor)
        prev = arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_release"
#endif
        return prev;
}

/**
 * raw_atomic_fetch_xor_relaxed() - atomic bitwise XOR with relaxed ordering
 * @i: int value
 * @v: pointer to atomic_t
 *
 * Atomically replaces the value of @v with (@v ^ @i), with relaxed ordering.
 *
 * Uses arch_atomic_fetch_xor_relaxed() when it is defined; otherwise
 * arch_atomic_fetch_xor(). If neither is defined the build stops with an
 * #error.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_fetch_xor_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_xor_relaxed(int i, atomic_t *v)
{
        int prev;

#ifdef arch_atomic_fetch_xor_relaxed
        prev = arch_atomic_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic_fetch_xor)
        prev = arch_atomic_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic_fetch_xor_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic_xchg() - atomic exchange with full ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new into @v, with full ordering.
 *
 * Uses arch_atomic_xchg() when it is defined; otherwise
 * arch_atomic_xchg_relaxed() bracketed by the pre/post full fences;
 * otherwise raw_xchg() on the counter field of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_xchg().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg(atomic_t *v, int new)
{
        int prev;

#ifdef arch_atomic_xchg
        prev = arch_atomic_xchg(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_xchg_relaxed(v, new);
        __atomic_post_full_fence();
#else
        prev = raw_xchg(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic_xchg_acquire() - atomic exchange with acquire ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new into @v, with acquire ordering.
 *
 * Uses arch_atomic_xchg_acquire() when it is defined; otherwise
 * arch_atomic_xchg_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_xchg(); otherwise raw_xchg_acquire() on the counter field
 * of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_xchg_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg_acquire(atomic_t *v, int new)
{
        int prev;

#ifdef arch_atomic_xchg_acquire
        prev = arch_atomic_xchg_acquire(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_xchg_relaxed(v, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic_xchg)
        prev = arch_atomic_xchg(v, new);
#else
        prev = raw_xchg_acquire(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic_xchg_release() - atomic exchange with release ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new into @v, with release ordering.
 *
 * Uses arch_atomic_xchg_release() when it is defined; otherwise a release
 * fence followed by arch_atomic_xchg_relaxed(); otherwise
 * arch_atomic_xchg(); otherwise raw_xchg_release() on the counter field
 * of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_xchg_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg_release(atomic_t *v, int new)
{
        int prev;

#ifdef arch_atomic_xchg_release
        prev = arch_atomic_xchg_release(v, new);
#elif defined(arch_atomic_xchg_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_xchg_relaxed(v, new);
#elif defined(arch_atomic_xchg)
        prev = arch_atomic_xchg(v, new);
#else
        prev = raw_xchg_release(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic_xchg_relaxed() - atomic exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @new: int value to assign
 *
 * Atomically stores @new into @v, with relaxed ordering.
 *
 * Uses arch_atomic_xchg_relaxed() when it is defined; otherwise
 * arch_atomic_xchg(); otherwise raw_xchg_relaxed() on the counter field
 * of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_xchg_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_xchg_relaxed(atomic_t *v, int new)
{
        int prev;

#ifdef arch_atomic_xchg_relaxed
        prev = arch_atomic_xchg_relaxed(v, new);
#elif defined(arch_atomic_xchg)
        prev = arch_atomic_xchg(v, new);
#else
        prev = raw_xchg_relaxed(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with full ordering.
 * Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * Uses arch_atomic_cmpxchg() when it is defined; otherwise
 * arch_atomic_cmpxchg_relaxed() bracketed by the pre/post full fences;
 * otherwise raw_cmpxchg() on the counter field of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_cmpxchg().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg(atomic_t *v, int old, int new)
{
        int prev;

#ifdef arch_atomic_cmpxchg
        prev = arch_atomic_cmpxchg(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
#else
        prev = raw_cmpxchg(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with acquire ordering.
 * Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * Uses arch_atomic_cmpxchg_acquire() when it is defined; otherwise
 * arch_atomic_cmpxchg_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_cmpxchg(); otherwise raw_cmpxchg_acquire() on the counter
 * field of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_cmpxchg_acquire().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_acquire(atomic_t *v, int old, int new)
{
        int prev;

#ifdef arch_atomic_cmpxchg_acquire
        prev = arch_atomic_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* The fence must come after the relaxed op to give acquire ordering. */
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic_cmpxchg)
        prev = arch_atomic_cmpxchg(v, old, new);
#else
        prev = raw_cmpxchg_acquire(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with release ordering.
 * Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * Uses arch_atomic_cmpxchg_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_cmpxchg_relaxed(); otherwise
 * arch_atomic_cmpxchg(); otherwise raw_cmpxchg_release() on the counter
 * field of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_cmpxchg_release().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_release(atomic_t *v, int old, int new)
{
        int prev;

#ifdef arch_atomic_cmpxchg_release
        prev = arch_atomic_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_cmpxchg_relaxed)
        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_cmpxchg)
        prev = arch_atomic_cmpxchg(v, old, new);
#else
        prev = raw_cmpxchg_release(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with relaxed ordering.
 * Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * Uses arch_atomic_cmpxchg_relaxed() when it is defined; otherwise
 * arch_atomic_cmpxchg(); otherwise raw_cmpxchg_relaxed() on the counter
 * field of @v.
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_cmpxchg_relaxed().
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_cmpxchg_relaxed(atomic_t *v, int old, int new)
{
        int prev;

#ifdef arch_atomic_cmpxchg_relaxed
        prev = arch_atomic_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_cmpxchg)
        prev = arch_atomic_cmpxchg(v, old, new);
#else
        prev = raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic_try_cmpxchg() - atomic compare and exchange with full ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with full ordering.
 * Otherwise, @v is left unmodified, @old is updated to the current value
 * of @v, and relaxed ordering is provided.
 *
 * Uses arch_atomic_try_cmpxchg() when it is defined; otherwise
 * arch_atomic_try_cmpxchg_relaxed() bracketed by the pre/post full fences;
 * otherwise it is built on top of raw_atomic_cmpxchg().
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_try_cmpxchg().
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg
        return arch_atomic_try_cmpxchg(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* Fences on both sides of the relaxed op give full ordering. */
        __atomic_pre_full_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
        return swapped;
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg(v, expected, new);

        /* On mismatch, report the value actually observed back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_try_cmpxchg_acquire() - atomic compare and exchange with acquire ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with acquire ordering.
 * Otherwise, @v is left unmodified, @old is updated to the current value
 * of @v, and relaxed ordering is provided.
 *
 * Uses arch_atomic_try_cmpxchg_acquire() when it is defined; otherwise
 * arch_atomic_try_cmpxchg_relaxed() followed by an acquire fence; otherwise
 * arch_atomic_try_cmpxchg(); otherwise it is built on top of
 * raw_atomic_cmpxchg_acquire().
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_try_cmpxchg_acquire().
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_acquire
        return arch_atomic_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* The fence must come after the relaxed op to give acquire ordering. */
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_acquire(v, expected, new);

        /* On mismatch, report the value actually observed back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_try_cmpxchg_release() - atomic compare and exchange with release ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * If (@v == @old), atomically stores @new into @v with release ordering.
 * Otherwise, @v is left unmodified, @old is updated to the current value
 * of @v, and relaxed ordering is provided.
 *
 * Uses arch_atomic_try_cmpxchg_release() when it is defined; otherwise a
 * release fence followed by arch_atomic_try_cmpxchg_relaxed(); otherwise
 * arch_atomic_try_cmpxchg(); otherwise it is built on top of
 * raw_atomic_cmpxchg_release().
 *
 * Safe to use in noinstr code; elsewhere prefer atomic_try_cmpxchg_release().
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_release(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_release
        return arch_atomic_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic_try_cmpxchg_relaxed)
        bool swapped;

        /* The fence must come before the relaxed op to give release ordering. */
        __atomic_release_fence();
        swapped = arch_atomic_try_cmpxchg_relaxed(v, old, new);
        return swapped;
#elif defined(arch_atomic_try_cmpxchg)
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_release(v, expected, new);

        /* On mismatch, report the value actually observed back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_try_cmpxchg_relaxed() - atomic compare and exchange with relaxed ordering
 * @v: pointer to atomic_t
 * @old: pointer to int value to compare with
 * @new: int value to assign
 *
 * Compares @v against the value pointed to by @old. On a match, @v is
 * atomically set to @new with relaxed ordering. On a mismatch, @v is left
 * untouched, the current value of @v is stored through @old, and relaxed
 * ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new)
{
#ifdef arch_atomic_try_cmpxchg_relaxed
        /* The architecture supplies the relaxed variant directly. */
        return arch_atomic_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic_try_cmpxchg)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic_try_cmpxchg(v, old, new);
#else
        /* Emulate via cmpxchg: on mismatch, report the observed value in *old. */
        const int expected = *old;
        const int seen = raw_atomic_cmpxchg_relaxed(v, expected, new);

        if (unlikely(seen != expected))
                *old = seen;

        return likely(seen == expected);
#endif
}

/**
 * raw_atomic_sub_and_test() - atomic subtract and test if zero with full ordering
 * @i: int value to subtract
 * @v: pointer to atomic_t
 *
 * Atomically subtracts @i from @v with full ordering and reports whether the
 * updated value is zero.
 *
 * Safe to use in noinstr code; prefer atomic_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_sub_and_test(int i, atomic_t *v)
{
#ifndef arch_atomic_sub_and_test
        /* Generic fallback: subtract, then test the updated value. */
        return !raw_atomic_sub_return(i, v);
#else
        return arch_atomic_sub_and_test(i, v);
#endif
}

/**
 * raw_atomic_dec_and_test() - atomic decrement and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically decrements @v by one with full ordering and reports whether the
 * updated value is zero.
 *
 * Safe to use in noinstr code; prefer atomic_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_and_test(atomic_t *v)
{
#ifndef arch_atomic_dec_and_test
        /* Generic fallback: decrement, then test the updated value. */
        return !raw_atomic_dec_return(v);
#else
        return arch_atomic_dec_and_test(v);
#endif
}

/**
 * raw_atomic_inc_and_test() - atomic increment and test if zero with full ordering
 * @v: pointer to atomic_t
 *
 * Atomically increments @v by one with full ordering and reports whether the
 * updated value is zero.
 *
 * Safe to use in noinstr code; prefer atomic_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_and_test(atomic_t *v)
{
#ifndef arch_atomic_inc_and_test
        /* Generic fallback: increment, then test the updated value. */
        return !raw_atomic_inc_return(v);
#else
        return arch_atomic_inc_and_test(v);
#endif
}

/**
 * raw_atomic_add_negative() - atomic add and test if negative with full ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v with full ordering and reports whether the updated
 * value is negative.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative
        return arch_atomic_add_negative(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        bool negative;

        __atomic_pre_full_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_post_full_fence();
        return negative;
#else
        /* Generic fallback: add, then test the sign of the updated value. */
        int sum = raw_atomic_add_return(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_add_negative_acquire() - atomic add and test if negative with acquire ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v with acquire ordering and reports whether the
 * updated value is negative.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_acquire() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_acquire(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_acquire
        return arch_atomic_add_negative_acquire(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        /* Relaxed op followed by an acquire fence. */
        bool negative;

        negative = arch_atomic_add_negative_relaxed(i, v);
        __atomic_acquire_fence();
        return negative;
#elif defined(arch_atomic_add_negative)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic_add_negative(i, v);
#else
        /* Generic fallback: add, then test the sign of the updated value. */
        int sum = raw_atomic_add_return_acquire(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_add_negative_release() - atomic add and test if negative with release ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v with release ordering and reports whether the
 * updated value is negative.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_release() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_release(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_release
        return arch_atomic_add_negative_release(i, v);
#elif defined(arch_atomic_add_negative_relaxed)
        /* Release fence first, then the relaxed op. */
        bool negative;

        __atomic_release_fence();
        negative = arch_atomic_add_negative_relaxed(i, v);
        return negative;
#elif defined(arch_atomic_add_negative)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic_add_negative(i, v);
#else
        /* Generic fallback: add, then test the sign of the updated value. */
        int sum = raw_atomic_add_return_release(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_add_negative_relaxed() - atomic add and test if negative with relaxed ordering
 * @i: int value to add
 * @v: pointer to atomic_t
 *
 * Atomically adds @i to @v with relaxed ordering and reports whether the
 * updated value is negative.
 *
 * Safe to use in noinstr code; prefer atomic_add_negative_relaxed() elsewhere.
 *
 * Return: @true if the resulting value of @v is negative, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_negative_relaxed(int i, atomic_t *v)
{
#ifdef arch_atomic_add_negative_relaxed
        return arch_atomic_add_negative_relaxed(i, v);
#elif defined(arch_atomic_add_negative)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic_add_negative(i, v);
#else
        /* Generic fallback: add, then test the sign of the updated value. */
        int sum = raw_atomic_add_return_relaxed(i, v);

        return sum < 0;
#endif
}

/**
 * raw_atomic_fetch_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * When @v differs from @u, atomically adds @a to @v with full ordering.
 * When @v equals @u, @v is left unmodified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_fetch_add_unless() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline int
raw_atomic_fetch_add_unless(atomic_t *v, int a, int u)
{
#ifdef arch_atomic_fetch_add_unless
        return arch_atomic_fetch_add_unless(v, a, u);
#else
        int cur = raw_atomic_read(v);

        /*
         * Retry until either the excluded value is observed or the
         * compare-and-exchange succeeds; a failed attempt refreshes cur.
         */
        for (;;) {
                if (unlikely(cur == u))
                        break;
                if (raw_atomic_try_cmpxchg(v, &cur, cur + a))
                        break;
        }

        return cur;
#endif
}

/**
 * raw_atomic_add_unless() - atomic add unless value with full ordering
 * @v: pointer to atomic_t
 * @a: int value to add
 * @u: int value to compare with
 *
 * When @v differs from @u, atomically adds @a to @v with full ordering.
 * When @v equals @u, @v is left unmodified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_add_unless() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_add_unless(atomic_t *v, int a, int u)
{
#ifndef arch_atomic_add_unless
        /* The add happened exactly when the value seen beforehand was not @u. */
        int prev = raw_atomic_fetch_add_unless(v, a, u);

        return prev != u;
#else
        return arch_atomic_add_unless(v, a, u);
#endif
}

/**
 * raw_atomic_inc_not_zero() - atomic increment unless zero with full ordering
 * @v: pointer to atomic_t
 *
 * When @v is non-zero, atomically increments @v by one with full ordering.
 * When @v is zero, @v is left unmodified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_not_zero(atomic_t *v)
{
#ifndef arch_atomic_inc_not_zero
        /* Generic fallback: add 1 unless the current value is 0. */
        return raw_atomic_add_unless(v, 1, 0);
#else
        return arch_atomic_inc_not_zero(v);
#endif
}

/**
 * raw_atomic_inc_unless_negative() - atomic increment unless negative with full ordering
 * @v: pointer to atomic_t
 *
 * When @v is zero or positive, atomically increments @v by one with full
 * ordering. When @v is negative, @v is left unmodified and relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_inc_unless_negative(atomic_t *v)
{
#ifdef arch_atomic_inc_unless_negative
        return arch_atomic_inc_unless_negative(v);
#else
        int cur = raw_atomic_read(v);

        /* A failed compare-and-exchange refreshes cur, so re-check the sign. */
        for (;;) {
                if (unlikely(cur < 0))
                        return false;
                if (raw_atomic_try_cmpxchg(v, &cur, cur + 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_unless_positive() - atomic decrement unless positive with full ordering
 * @v: pointer to atomic_t
 *
 * When @v is zero or negative, atomically decrements @v by one with full
 * ordering. When @v is positive, @v is left unmodified and relaxed ordering
 * is provided.
 *
 * Safe to use in noinstr code; prefer atomic_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was updated, @false otherwise.
 */
static __always_inline bool
raw_atomic_dec_unless_positive(atomic_t *v)
{
#ifdef arch_atomic_dec_unless_positive
        return arch_atomic_dec_unless_positive(v);
#else
        int cur = raw_atomic_read(v);

        /* A failed compare-and-exchange refreshes cur, so re-check the sign. */
        for (;;) {
                if (unlikely(cur > 0))
                        return false;
                if (raw_atomic_try_cmpxchg(v, &cur, cur - 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic_dec_if_positive() - atomic decrement if positive with full ordering
 * @v: pointer to atomic_t
 *
 * When @v is strictly positive, atomically decrements @v by one with full
 * ordering. Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic_dec_if_positive() elsewhere.
 *
 * Return: The original value of @v minus one, regardless of whether @v was
 * updated.
 */
static __always_inline int
raw_atomic_dec_if_positive(atomic_t *v)
{
#ifdef arch_atomic_dec_if_positive
        return arch_atomic_dec_if_positive(v);
#else
        int cur = raw_atomic_read(v);
        int next;

        /*
         * Stop without storing once the decremented value would be negative;
         * otherwise retry until the compare-and-exchange succeeds.
         */
        for (;;) {
                next = cur - 1;
                if (unlikely(next < 0))
                        break;
                if (raw_atomic_try_cmpxchg(v, &cur, next))
                        break;
        }

        return next;
#endif
}

#ifdef CONFIG_GENERIC_ATOMIC64
#include <asm-generic/atomic64.h>
#endif

/**
 * raw_atomic64_read() - atomic load with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with relaxed ordering.
 *
 * This forwards unconditionally to arch_atomic64_read(); no generic fallback
 * is provided here.
 *
 * Safe to use in noinstr code; prefer atomic64_read() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read(const atomic64_t *v)
{
        return arch_atomic64_read(v);
}

/**
 * raw_atomic64_read_acquire() - atomic load with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically loads the value of @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_read_acquire() elsewhere.
 *
 * Return: The value loaded from @v.
 */
static __always_inline s64
raw_atomic64_read_acquire(const atomic64_t *v)
{
#ifdef arch_atomic64_read_acquire
        return arch_atomic64_read_acquire(v);
#else
        s64 val;

        /* Native-word sized counters can use a single load-acquire. */
        if (__native_word(atomic64_t))
                return smp_load_acquire(&v->counter);

        /* Otherwise: relaxed load followed by an acquire fence. */
        val = raw_atomic64_read(v);
        __atomic_acquire_fence();

        return val;
#endif
}

/**
 * raw_atomic64_set() - atomic set with relaxed ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with relaxed ordering.
 *
 * This forwards unconditionally to arch_atomic64_set(); no generic fallback
 * is provided here.
 *
 * Safe to use in noinstr code; prefer atomic64_set() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set(atomic64_t *v, s64 i)
{
        arch_atomic64_set(v, i);
}

/**
 * raw_atomic64_set_release() - atomic set with release ordering
 * @v: pointer to atomic64_t
 * @i: s64 value to assign
 *
 * Atomically sets @v to @i with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_set_release() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_set_release(atomic64_t *v, s64 i)
{
#ifdef arch_atomic64_set_release
        arch_atomic64_set_release(v, i);
#else
        /* Native-word sized counters can use a single store-release. */
        if (__native_word(atomic64_t)) {
                smp_store_release(&v->counter, i);
                return;
        }

        /* Otherwise: release fence followed by a relaxed store. */
        __atomic_release_fence();
        raw_atomic64_set(v, i);
#endif
}

/**
 * raw_atomic64_add() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * This forwards unconditionally to arch_atomic64_add(); no generic fallback
 * is provided here.
 *
 * Safe to use in noinstr code; prefer atomic64_add() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_add(s64 i, atomic64_t *v)
{
        arch_atomic64_add(i, v);
}

/**
 * raw_atomic64_add_return() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return
        return arch_atomic64_add_return(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_add_return_relaxed(i, v);
        __atomic_post_full_fence();
        return updated;
#else
#error "Unable to define raw_atomic64_add_return"
#endif
}

/**
 * raw_atomic64_add_return_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_acquire
        return arch_atomic64_add_return_acquire(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        /* Relaxed op followed by an acquire fence. */
        s64 updated;

        updated = arch_atomic64_add_return_relaxed(i, v);
        __atomic_acquire_fence();
        return updated;
#elif defined(arch_atomic64_add_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_acquire"
#endif
}

/**
 * raw_atomic64_add_return_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_return_release
        return arch_atomic64_add_return_release(i, v);
#elif defined(arch_atomic64_add_return_relaxed)
        /* Release fence first, then the relaxed op. */
        s64 updated;

        __atomic_release_fence();
        updated = arch_atomic64_add_return_relaxed(i, v);
        return updated;
#elif defined(arch_atomic64_add_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_release"
#endif
}

/**
 * raw_atomic64_add_return_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Uses arch_atomic64_add_return_relaxed() when defined, else
 * arch_atomic64_add_return(); the build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_add_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_add_return_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_add_return_relaxed)
        return arch_atomic64_add_return_relaxed(i, v);
#elif defined(arch_atomic64_add_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_add_return(i, v);
#else
#error "Unable to define raw_atomic64_add_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_add() - atomic add with full ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add
        return arch_atomic64_fetch_add(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        s64 prev;

        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_post_full_fence();
        return prev;
#else
#error "Unable to define raw_atomic64_fetch_add"
#endif
}

/**
 * raw_atomic64_fetch_add_acquire() - atomic add with acquire ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_acquire
        return arch_atomic64_fetch_add_acquire(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        /* Relaxed op followed by an acquire fence. */
        s64 prev;

        prev = arch_atomic64_fetch_add_relaxed(i, v);
        __atomic_acquire_fence();
        return prev;
#elif defined(arch_atomic64_fetch_add)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_acquire"
#endif
}

/**
 * raw_atomic64_fetch_add_release() - atomic add with release ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically adds @i to @v with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_add_release
        return arch_atomic64_fetch_add_release(i, v);
#elif defined(arch_atomic64_fetch_add_relaxed)
        /* Release fence first, then the relaxed op. */
        s64 prev;

        __atomic_release_fence();
        prev = arch_atomic64_fetch_add_relaxed(i, v);
        return prev;
#elif defined(arch_atomic64_fetch_add)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_release"
#endif
}

/**
 * raw_atomic64_fetch_add_relaxed() - atomic add with relaxed ordering
 * @i: s64 value to add
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v + @i) with relaxed ordering.
 *
 * Uses arch_atomic64_fetch_add_relaxed() when defined, else
 * arch_atomic64_fetch_add(); the build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_add_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_add_relaxed)
        return arch_atomic64_fetch_add_relaxed(i, v);
#elif defined(arch_atomic64_fetch_add)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_add(i, v);
#else
#error "Unable to define raw_atomic64_fetch_add_relaxed"
#endif
}

/**
 * raw_atomic64_sub() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * This forwards unconditionally to arch_atomic64_sub(); no generic fallback
 * is provided here.
 *
 * Safe to use in noinstr code; prefer atomic64_sub() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_sub(s64 i, atomic64_t *v)
{
        arch_atomic64_sub(i, v);
}

/**
 * raw_atomic64_sub_return() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically subtracts @i from @v with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return
        return arch_atomic64_sub_return(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_post_full_fence();
        return updated;
#else
#error "Unable to define raw_atomic64_sub_return"
#endif
}

/**
 * raw_atomic64_sub_return_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically subtracts @i from @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_acquire
        return arch_atomic64_sub_return_acquire(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        /* Relaxed op followed by an acquire fence. */
        s64 updated;

        updated = arch_atomic64_sub_return_relaxed(i, v);
        __atomic_acquire_fence();
        return updated;
#elif defined(arch_atomic64_sub_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_sub_return(i, v);
#else
#error "Unable to define raw_atomic64_sub_return_acquire"
#endif
}

/**
 * raw_atomic64_sub_return_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically subtracts @i from @v with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_sub_return_release
        return arch_atomic64_sub_return_release(i, v);
#elif defined(arch_atomic64_sub_return_relaxed)
        /* Release fence first, then the relaxed op. */
        s64 updated;

        __atomic_release_fence();
        updated = arch_atomic64_sub_return_relaxed(i, v);
        return updated;
#elif defined(arch_atomic64_sub_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_sub_return(i, v);
#else
#error "Unable to define raw_atomic64_sub_return_release"
#endif
}

/**
 * raw_atomic64_sub_return_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Uses arch_atomic64_sub_return_relaxed() when defined, else
 * arch_atomic64_sub_return(); the build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_sub_return_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_sub_return_relaxed)
        return arch_atomic64_sub_return_relaxed(i, v);
#elif defined(arch_atomic64_sub_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_sub_return(i, v);
#else
#error "Unable to define raw_atomic64_sub_return_relaxed"
#endif
}

/**
 * raw_atomic64_fetch_sub() - atomic subtract with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically subtracts @i from @v with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub
        return arch_atomic64_fetch_sub(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        s64 prev;

        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_post_full_fence();
        return prev;
#else
#error "Unable to define raw_atomic64_fetch_sub"
#endif
}

/**
 * raw_atomic64_fetch_sub_acquire() - atomic subtract with acquire ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically subtracts @i from @v with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_acquire
        return arch_atomic64_fetch_sub_acquire(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        /* Relaxed op followed by an acquire fence. */
        s64 prev;

        prev = arch_atomic64_fetch_sub_relaxed(i, v);
        __atomic_acquire_fence();
        return prev;
#elif defined(arch_atomic64_fetch_sub)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic64_fetch_sub_acquire"
#endif
}

/**
 * raw_atomic64_fetch_sub_release() - atomic subtract with release ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically subtracts @i from @v with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_fetch_sub_release
        return arch_atomic64_fetch_sub_release(i, v);
#elif defined(arch_atomic64_fetch_sub_relaxed)
        /* Release fence first, then the relaxed op. */
        s64 prev;

        __atomic_release_fence();
        prev = arch_atomic64_fetch_sub_relaxed(i, v);
        return prev;
#elif defined(arch_atomic64_fetch_sub)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic64_fetch_sub_release"
#endif
}

/**
 * raw_atomic64_fetch_sub_relaxed() - atomic subtract with relaxed ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v - @i) with relaxed ordering.
 *
 * Uses arch_atomic64_fetch_sub_relaxed() when defined, else
 * arch_atomic64_fetch_sub(); the build fails if neither is defined.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_sub_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v)
{
#if defined(arch_atomic64_fetch_sub_relaxed)
        return arch_atomic64_fetch_sub_relaxed(i, v);
#elif defined(arch_atomic64_fetch_sub)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_sub(i, v);
#else
#error "Unable to define raw_atomic64_fetch_sub_relaxed"
#endif
}

/**
 * raw_atomic64_inc() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_inc(atomic64_t *v)
{
#ifndef arch_atomic64_inc
        /* Generic fallback: an increment is an add of 1. */
        raw_atomic64_add(1, v);
#else
        arch_atomic64_inc(v);
#endif
}

/**
 * raw_atomic64_inc_return() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return
        return arch_atomic64_inc_return(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        s64 updated;

        __atomic_pre_full_fence();
        updated = arch_atomic64_inc_return_relaxed(v);
        __atomic_post_full_fence();
        return updated;
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_add_return(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_acquire() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_acquire
        return arch_atomic64_inc_return_acquire(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        /* Relaxed op followed by an acquire fence. */
        s64 updated;

        updated = arch_atomic64_inc_return_relaxed(v);
        __atomic_acquire_fence();
        return updated;
#elif defined(arch_atomic64_inc_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_inc_return(v);
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_add_return_acquire(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_release() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_release(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_release
        return arch_atomic64_inc_return_release(v);
#elif defined(arch_atomic64_inc_return_relaxed)
        /* Release fence first, then the relaxed op. */
        s64 updated;

        __atomic_release_fence();
        updated = arch_atomic64_inc_return_relaxed(v);
        return updated;
#elif defined(arch_atomic64_inc_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_inc_return(v);
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_add_return_release(1, v);
#endif
}

/**
 * raw_atomic64_inc_return_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_return_relaxed() elsewhere.
 *
 * Return: The updated value of @v.
 */
static __always_inline s64
raw_atomic64_inc_return_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_inc_return_relaxed
        return arch_atomic64_inc_return_relaxed(v);
#elif defined(arch_atomic64_inc_return)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_inc_return(v);
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_add_return_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc() - atomic increment with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with full ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc
        return arch_atomic64_fetch_inc(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Bracket the relaxed op with the pre/post full fences. */
        s64 prev;

        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_post_full_fence();
        return prev;
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_fetch_add(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_acquire() - atomic increment with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc_acquire() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc_acquire(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc_acquire
        return arch_atomic64_fetch_inc_acquire(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Relaxed op followed by an acquire fence. */
        s64 prev;

        prev = arch_atomic64_fetch_inc_relaxed(v);
        __atomic_acquire_fence();
        return prev;
#elif defined(arch_atomic64_fetch_inc)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_inc(v);
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_fetch_add_acquire(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_release() - atomic increment with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc_release() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc_release(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc_release
        return arch_atomic64_fetch_inc_release(v);
#elif defined(arch_atomic64_fetch_inc_relaxed)
        /* Release fence first, then the relaxed op. */
        s64 prev;

        __atomic_release_fence();
        prev = arch_atomic64_fetch_inc_relaxed(v);
        return prev;
#elif defined(arch_atomic64_fetch_inc)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_inc(v);
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_fetch_add_release(1, v);
#endif
}

/**
 * raw_atomic64_fetch_inc_relaxed() - atomic increment with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically increments @v by one with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_inc_relaxed() elsewhere.
 *
 * Return: The original value of @v.
 */
static __always_inline s64
raw_atomic64_fetch_inc_relaxed(atomic64_t *v)
{
#ifdef arch_atomic64_fetch_inc_relaxed
        return arch_atomic64_fetch_inc_relaxed(v);
#elif defined(arch_atomic64_fetch_inc)
        /* Fall back to the unsuffixed arch op. */
        return arch_atomic64_fetch_inc(v);
#else
        /* Generic fallback: an increment is an add of 1. */
        return raw_atomic64_fetch_add_relaxed(1, v);
#endif
}

/**
 * raw_atomic64_dec() - atomic decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically decrements @v by one with relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_dec() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_dec(atomic64_t *v)
{
#ifndef arch_atomic64_dec
        /* Generic fallback: a decrement is a subtract of 1. */
        raw_atomic64_sub(1, v);
#else
        arch_atomic64_dec(v);
#endif
}

/**
 * raw_atomic64_dec_return() - decrement and return with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing full ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_dec_return(); arch_atomic64_dec_return_relaxed() bracketed
 * by the pre/post full fences; and finally raw_atomic64_sub_return() with an
 * operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return
        updated = arch_atomic64_dec_return(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_post_full_fence();
#else
        updated = raw_atomic64_sub_return(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_dec_return_acquire() - decrement and return with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing acquire ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_dec_return_acquire(); arch_atomic64_dec_return_relaxed()
 * followed by an acquire fence; arch_atomic64_dec_return(); and finally
 * raw_atomic64_sub_return_acquire() with an operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return_acquire() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return_acquire(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return_acquire
        updated = arch_atomic64_dec_return_acquire(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        updated = arch_atomic64_dec_return_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_dec_return)
        updated = arch_atomic64_dec_return(v);
#else
        updated = raw_atomic64_sub_return_acquire(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_dec_return_release() - decrement and return with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing release ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_dec_return_release(); a release fence followed by
 * arch_atomic64_dec_return_relaxed(); arch_atomic64_dec_return(); and
 * finally raw_atomic64_sub_return_release() with an operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return_release() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return_release(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return_release
        updated = arch_atomic64_dec_return_release(v);
#elif defined(arch_atomic64_dec_return_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        updated = arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        updated = arch_atomic64_dec_return(v);
#else
        updated = raw_atomic64_sub_return_release(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_dec_return_relaxed() - decrement and return with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing relaxed ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_dec_return_relaxed(); arch_atomic64_dec_return(); and
 * finally raw_atomic64_sub_return_relaxed() with an operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_return_relaxed() elsewhere.
 *
 * Return: The value of @v after the decrement.
 */
static __always_inline s64
raw_atomic64_dec_return_relaxed(atomic64_t *v)
{
        s64 updated;

#ifdef arch_atomic64_dec_return_relaxed
        updated = arch_atomic64_dec_return_relaxed(v);
#elif defined(arch_atomic64_dec_return)
        updated = arch_atomic64_dec_return(v);
#else
        updated = raw_atomic64_sub_return_relaxed(1, v);
#endif
        return updated;
}

/**
 * raw_atomic64_fetch_dec() - fetch-and-decrement with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing full ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_dec(); arch_atomic64_fetch_dec_relaxed() bracketed by
 * the pre/post full fences; and finally raw_atomic64_fetch_sub() with an
 * operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec
        prev = arch_atomic64_fetch_dec(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic64_fetch_sub(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_dec_acquire() - fetch-and-decrement with acquire ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing acquire ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_dec_acquire(); arch_atomic64_fetch_dec_relaxed()
 * followed by an acquire fence; arch_atomic64_fetch_dec(); and finally
 * raw_atomic64_fetch_sub_acquire() with an operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec_acquire() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec_acquire(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec_acquire
        prev = arch_atomic64_fetch_dec_acquire(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_fetch_dec_relaxed(v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_dec)
        prev = arch_atomic64_fetch_dec(v);
#else
        prev = raw_atomic64_fetch_sub_acquire(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_dec_release() - fetch-and-decrement with release ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing release ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_dec_release(); a release fence followed by
 * arch_atomic64_fetch_dec_relaxed(); arch_atomic64_fetch_dec(); and finally
 * raw_atomic64_fetch_sub_release() with an operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec_release() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec_release(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec_release
        prev = arch_atomic64_fetch_dec_release(v);
#elif defined(arch_atomic64_fetch_dec_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        prev = arch_atomic64_fetch_dec(v);
#else
        prev = raw_atomic64_fetch_sub_release(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_dec_relaxed() - fetch-and-decrement with relaxed ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing relaxed ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_dec_relaxed(); arch_atomic64_fetch_dec(); and finally
 * raw_atomic64_fetch_sub_relaxed() with an operand of 1.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_dec_relaxed() elsewhere.
 *
 * Return: The value of @v before the decrement.
 */
static __always_inline s64
raw_atomic64_fetch_dec_relaxed(atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_dec_relaxed
        prev = arch_atomic64_fetch_dec_relaxed(v);
#elif defined(arch_atomic64_fetch_dec)
        prev = arch_atomic64_fetch_dec(v);
#else
        prev = raw_atomic64_fetch_sub_relaxed(1, v);
#endif
        return prev;
}

/**
 * raw_atomic64_and() - atomic bitwise AND with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v & @i) with relaxed ordering.
 *
 * This is a direct pass-through to arch_atomic64_and(); no fallback is
 * provided here, so the architecture must supply that operation.
 *
 * Safe to use in noinstr code; prefer atomic64_and() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_and(s64 i, atomic64_t *v)
{
        arch_atomic64_and(i, v);
}

/**
 * raw_atomic64_fetch_and() - fetch-and-AND with full ordering
 * @i: s64 mask to AND into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), providing full ordering.
 *
 * Uses arch_atomic64_fetch_and() when defined, otherwise
 * arch_atomic64_fetch_and_relaxed() bracketed by the pre/post full fences.
 * If neither is defined the build is stopped with an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and() elsewhere.
 *
 * Return: The value of @v before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and
        prev = arch_atomic64_fetch_and(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic64_fetch_and"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_and_acquire() - fetch-and-AND with acquire ordering
 * @i: s64 mask to AND into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), providing acquire ordering.
 *
 * Uses arch_atomic64_fetch_and_acquire() when defined, otherwise
 * arch_atomic64_fetch_and_relaxed() followed by an acquire fence, otherwise
 * arch_atomic64_fetch_and(). If none is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and_acquire() elsewhere.
 *
 * Return: The value of @v before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and_acquire
        prev = arch_atomic64_fetch_and_acquire(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_fetch_and_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_and)
        prev = arch_atomic64_fetch_and(i, v);
#else
#error "Unable to define raw_atomic64_fetch_and_acquire"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_and_release() - fetch-and-AND with release ordering
 * @i: s64 mask to AND into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), providing release ordering.
 *
 * Uses arch_atomic64_fetch_and_release() when defined, otherwise a release
 * fence followed by arch_atomic64_fetch_and_relaxed(), otherwise
 * arch_atomic64_fetch_and(). If none is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and_release() elsewhere.
 *
 * Return: The value of @v before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and_release
        prev = arch_atomic64_fetch_and_release(i, v);
#elif defined(arch_atomic64_fetch_and_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        prev = arch_atomic64_fetch_and(i, v);
#else
#error "Unable to define raw_atomic64_fetch_and_release"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_and_relaxed() - fetch-and-AND with relaxed ordering
 * @i: s64 mask to AND into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & @i), providing relaxed ordering.
 *
 * Uses arch_atomic64_fetch_and_relaxed() when defined, otherwise
 * arch_atomic64_fetch_and(). If neither is defined the build is stopped
 * with an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_and_relaxed() elsewhere.
 *
 * Return: The value of @v before the AND.
 */
static __always_inline s64
raw_atomic64_fetch_and_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_and_relaxed
        prev = arch_atomic64_fetch_and_relaxed(i, v);
#elif defined(arch_atomic64_fetch_and)
        prev = arch_atomic64_fetch_and(i, v);
#else
#error "Unable to define raw_atomic64_fetch_and_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic64_andnot() - clear bits in an atomic64_t with relaxed ordering
 * @i: s64 mask of bits to clear
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), providing relaxed ordering.
 *
 * Calls arch_atomic64_andnot() when the architecture defines it; otherwise
 * the operation is expressed as raw_atomic64_and() with the complement of @i.
 *
 * Safe to use in noinstr code; prefer atomic64_andnot() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_andnot(s64 i, atomic64_t *v)
{
#ifndef arch_atomic64_andnot
        raw_atomic64_and(~i, v);
#else
        arch_atomic64_andnot(i, v);
#endif
}

/**
 * raw_atomic64_fetch_andnot() - fetch-and-AND-NOT with full ordering
 * @i: s64 mask of bits to clear
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), providing full ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_andnot(); arch_atomic64_fetch_andnot_relaxed()
 * bracketed by the pre/post full fences; and finally
 * raw_atomic64_fetch_and() with the complement of @i.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
raw_atomic64_fetch_andnot(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot
        prev = arch_atomic64_fetch_andnot(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_post_full_fence();
#else
        prev = raw_atomic64_fetch_and(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_andnot_acquire() - fetch-and-AND-NOT with acquire ordering
 * @i: s64 mask of bits to clear
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), providing acquire ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_andnot_acquire(); arch_atomic64_fetch_andnot_relaxed()
 * followed by an acquire fence; arch_atomic64_fetch_andnot(); and finally
 * raw_atomic64_fetch_and_acquire() with the complement of @i.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot_acquire() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot_acquire
        prev = arch_atomic64_fetch_andnot_acquire(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_andnot)
        prev = arch_atomic64_fetch_andnot(i, v);
#else
        prev = raw_atomic64_fetch_and_acquire(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_andnot_release() - fetch-and-AND-NOT with release ordering
 * @i: s64 mask of bits to clear
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), providing release ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_andnot_release(); a release fence followed by
 * arch_atomic64_fetch_andnot_relaxed(); arch_atomic64_fetch_andnot(); and
 * finally raw_atomic64_fetch_and_release() with the complement of @i.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot_release() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot_release
        prev = arch_atomic64_fetch_andnot_release(i, v);
#elif defined(arch_atomic64_fetch_andnot_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        prev = arch_atomic64_fetch_andnot(i, v);
#else
        prev = raw_atomic64_fetch_and_release(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_andnot_relaxed() - fetch-and-AND-NOT with relaxed ordering
 * @i: s64 mask of bits to clear
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v & ~@i), providing relaxed ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_fetch_andnot_relaxed(); arch_atomic64_fetch_andnot(); and
 * finally raw_atomic64_fetch_and_relaxed() with the complement of @i.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_andnot_relaxed() elsewhere.
 *
 * Return: The value of @v before the update.
 */
static __always_inline s64
raw_atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_andnot_relaxed
        prev = arch_atomic64_fetch_andnot_relaxed(i, v);
#elif defined(arch_atomic64_fetch_andnot)
        prev = arch_atomic64_fetch_andnot(i, v);
#else
        prev = raw_atomic64_fetch_and_relaxed(~i, v);
#endif
        return prev;
}

/**
 * raw_atomic64_or() - atomic bitwise OR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v | @i) with relaxed ordering.
 *
 * This is a direct pass-through to arch_atomic64_or(); no fallback is
 * provided here, so the architecture must supply that operation.
 *
 * Safe to use in noinstr code; prefer atomic64_or() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_or(s64 i, atomic64_t *v)
{
        arch_atomic64_or(i, v);
}

/**
 * raw_atomic64_fetch_or() - fetch-and-OR with full ordering
 * @i: s64 mask to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v | @i), providing full ordering.
 *
 * Uses arch_atomic64_fetch_or() when defined, otherwise
 * arch_atomic64_fetch_or_relaxed() bracketed by the pre/post full fences.
 * If neither is defined the build is stopped with an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or() elsewhere.
 *
 * Return: The value of @v before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or
        prev = arch_atomic64_fetch_or(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic64_fetch_or"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_or_acquire() - fetch-and-OR with acquire ordering
 * @i: s64 mask to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v | @i), providing acquire ordering.
 *
 * Uses arch_atomic64_fetch_or_acquire() when defined, otherwise
 * arch_atomic64_fetch_or_relaxed() followed by an acquire fence, otherwise
 * arch_atomic64_fetch_or(). If none is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or_acquire() elsewhere.
 *
 * Return: The value of @v before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or_acquire
        prev = arch_atomic64_fetch_or_acquire(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_fetch_or_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_or)
        prev = arch_atomic64_fetch_or(i, v);
#else
#error "Unable to define raw_atomic64_fetch_or_acquire"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_or_release() - fetch-and-OR with release ordering
 * @i: s64 mask to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v | @i), providing release ordering.
 *
 * Uses arch_atomic64_fetch_or_release() when defined, otherwise a release
 * fence followed by arch_atomic64_fetch_or_relaxed(), otherwise
 * arch_atomic64_fetch_or(). If none is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or_release() elsewhere.
 *
 * Return: The value of @v before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or_release
        prev = arch_atomic64_fetch_or_release(i, v);
#elif defined(arch_atomic64_fetch_or_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        prev = arch_atomic64_fetch_or(i, v);
#else
#error "Unable to define raw_atomic64_fetch_or_release"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_or_relaxed() - fetch-and-OR with relaxed ordering
 * @i: s64 mask to OR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v | @i), providing relaxed ordering.
 *
 * Uses arch_atomic64_fetch_or_relaxed() when defined, otherwise
 * arch_atomic64_fetch_or(). If neither is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_or_relaxed() elsewhere.
 *
 * Return: The value of @v before the OR.
 */
static __always_inline s64
raw_atomic64_fetch_or_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_or_relaxed
        prev = arch_atomic64_fetch_or_relaxed(i, v);
#elif defined(arch_atomic64_fetch_or)
        prev = arch_atomic64_fetch_or(i, v);
#else
#error "Unable to define raw_atomic64_fetch_or_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic64_xor() - atomic bitwise XOR with relaxed ordering
 * @i: s64 value
 * @v: pointer to atomic64_t
 *
 * Atomically updates @v to (@v ^ @i) with relaxed ordering.
 *
 * This is a direct pass-through to arch_atomic64_xor(); no fallback is
 * provided here, so the architecture must supply that operation.
 *
 * Safe to use in noinstr code; prefer atomic64_xor() elsewhere.
 *
 * Return: Nothing.
 */
static __always_inline void
raw_atomic64_xor(s64 i, atomic64_t *v)
{
        arch_atomic64_xor(i, v);
}

/**
 * raw_atomic64_fetch_xor() - fetch-and-XOR with full ordering
 * @i: s64 mask to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v ^ @i), providing full ordering.
 *
 * Uses arch_atomic64_fetch_xor() when defined, otherwise
 * arch_atomic64_fetch_xor_relaxed() bracketed by the pre/post full fences.
 * If neither is defined the build is stopped with an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor() elsewhere.
 *
 * Return: The value of @v before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor
        prev = arch_atomic64_fetch_xor(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_post_full_fence();
#else
#error "Unable to define raw_atomic64_fetch_xor"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_xor_acquire() - fetch-and-XOR with acquire ordering
 * @i: s64 mask to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v ^ @i), providing acquire ordering.
 *
 * Uses arch_atomic64_fetch_xor_acquire() when defined, otherwise
 * arch_atomic64_fetch_xor_relaxed() followed by an acquire fence, otherwise
 * arch_atomic64_fetch_xor(). If none is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_acquire() elsewhere.
 *
 * Return: The value of @v before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor_acquire(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor_acquire
        prev = arch_atomic64_fetch_xor_acquire(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_fetch_xor)
        prev = arch_atomic64_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic64_fetch_xor_acquire"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_xor_release() - fetch-and-XOR with release ordering
 * @i: s64 mask to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v ^ @i), providing release ordering.
 *
 * Uses arch_atomic64_fetch_xor_release() when defined, otherwise a release
 * fence followed by arch_atomic64_fetch_xor_relaxed(), otherwise
 * arch_atomic64_fetch_xor(). If none is defined the build is stopped with
 * an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_release() elsewhere.
 *
 * Return: The value of @v before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor_release(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor_release
        prev = arch_atomic64_fetch_xor_release(i, v);
#elif defined(arch_atomic64_fetch_xor_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        prev = arch_atomic64_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic64_fetch_xor_release"
#endif
        return prev;
}

/**
 * raw_atomic64_fetch_xor_relaxed() - fetch-and-XOR with relaxed ordering
 * @i: s64 mask to XOR into @v
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v ^ @i), providing relaxed ordering.
 *
 * Uses arch_atomic64_fetch_xor_relaxed() when defined, otherwise
 * arch_atomic64_fetch_xor(). If neither is defined the build is stopped
 * with an #error.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_xor_relaxed() elsewhere.
 *
 * Return: The value of @v before the XOR.
 */
static __always_inline s64
raw_atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v)
{
        s64 prev;

#ifdef arch_atomic64_fetch_xor_relaxed
        prev = arch_atomic64_fetch_xor_relaxed(i, v);
#elif defined(arch_atomic64_fetch_xor)
        prev = arch_atomic64_fetch_xor(i, v);
#else
#error "Unable to define raw_atomic64_fetch_xor_relaxed"
#endif
        return prev;
}

/**
 * raw_atomic64_xchg() - exchange the value of an atomic64_t with full ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to store
 *
 * Atomically replaces @v with @new, providing full ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_xchg(); arch_atomic64_xchg_relaxed() bracketed by the
 * pre/post full fences; and finally raw_xchg() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg
        prev = arch_atomic64_xchg(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_xchg_relaxed(v, new);
        __atomic_post_full_fence();
#else
        prev = raw_xchg(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_xchg_acquire() - exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to store
 *
 * Atomically replaces @v with @new, providing acquire ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_xchg_acquire(); arch_atomic64_xchg_relaxed() followed by an
 * acquire fence; arch_atomic64_xchg(); and finally raw_xchg_acquire() on the
 * counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_acquire() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_acquire(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg_acquire
        prev = arch_atomic64_xchg_acquire(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_xchg_relaxed(v, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_xchg)
        prev = arch_atomic64_xchg(v, new);
#else
        prev = raw_xchg_acquire(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_xchg_release() - exchange with release ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to store
 *
 * Atomically replaces @v with @new, providing release ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_xchg_release(); a release fence followed by
 * arch_atomic64_xchg_relaxed(); arch_atomic64_xchg(); and finally
 * raw_xchg_release() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_release() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_release(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg_release
        prev = arch_atomic64_xchg_release(v, new);
#elif defined(arch_atomic64_xchg_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        prev = arch_atomic64_xchg(v, new);
#else
        prev = raw_xchg_release(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_xchg_relaxed() - exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @new: s64 value to store
 *
 * Atomically replaces @v with @new, providing relaxed ordering.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_xchg_relaxed(); arch_atomic64_xchg(); and finally
 * raw_xchg_relaxed() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_xchg_relaxed() elsewhere.
 *
 * Return: The value of @v before the exchange.
 */
static __always_inline s64
raw_atomic64_xchg_relaxed(atomic64_t *v, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_xchg_relaxed
        prev = arch_atomic64_xchg_relaxed(v, new);
#elif defined(arch_atomic64_xchg)
        prev = arch_atomic64_xchg(v, new);
#else
        prev = raw_xchg_relaxed(&v->counter, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg() - compare-and-exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If (@v == @old), atomically replaces @v with @new, providing full ordering.
 * Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_cmpxchg(); arch_atomic64_cmpxchg_relaxed() bracketed by the
 * pre/post full fences; and finally raw_cmpxchg() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg
        prev = arch_atomic64_cmpxchg(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
#else
        prev = raw_cmpxchg(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg_acquire() - compare-and-exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If (@v == @old), atomically replaces @v with @new, providing acquire
 * ordering. Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_cmpxchg_acquire(); arch_atomic64_cmpxchg_relaxed() followed
 * by an acquire fence; arch_atomic64_cmpxchg(); and finally
 * raw_cmpxchg_acquire() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_acquire() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg_acquire
        prev = arch_atomic64_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
        __atomic_acquire_fence();
#elif defined(arch_atomic64_cmpxchg)
        prev = arch_atomic64_cmpxchg(v, old, new);
#else
        prev = raw_cmpxchg_acquire(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg_release() - compare-and-exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If (@v == @old), atomically replaces @v with @new, providing release
 * ordering. Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_cmpxchg_release(); a release fence followed by
 * arch_atomic64_cmpxchg_relaxed(); arch_atomic64_cmpxchg(); and finally
 * raw_cmpxchg_release() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_release() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg_release
        prev = arch_atomic64_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_cmpxchg_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        prev = arch_atomic64_cmpxchg(v, old, new);
#else
        prev = raw_cmpxchg_release(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_cmpxchg_relaxed() - compare-and-exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If (@v == @old), atomically replaces @v with @new, providing relaxed
 * ordering. Otherwise, @v is left unmodified and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_cmpxchg_relaxed(); arch_atomic64_cmpxchg(); and finally
 * raw_cmpxchg_relaxed() on the counter field of @v.
 *
 * Safe to use in noinstr code; prefer atomic64_cmpxchg_relaxed() elsewhere.
 *
 * Return: The value of @v before the operation.
 */
static __always_inline s64
raw_atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new)
{
        s64 prev;

#ifdef arch_atomic64_cmpxchg_relaxed
        prev = arch_atomic64_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_cmpxchg)
        prev = arch_atomic64_cmpxchg(v, old, new);
#else
        prev = raw_cmpxchg_relaxed(&v->counter, old, new);
#endif
        return prev;
}

/**
 * raw_atomic64_try_cmpxchg() - try compare-and-exchange with full ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If @v equals the value pointed to by @old, atomically replaces @v with @new,
 * providing full ordering. Otherwise, @v is left unmodified, the value found
 * in @v is written back through @old, and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_try_cmpxchg(); arch_atomic64_try_cmpxchg_relaxed() bracketed
 * by the pre/post full fences; and finally an open-coded version built on
 * raw_atomic64_cmpxchg().
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg
        return arch_atomic64_try_cmpxchg(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        bool swapped;

        /* The relaxed operation sits between the two full fences. */
        __atomic_pre_full_fence();
        swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);
        __atomic_post_full_fence();
        return swapped;
#else
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg(v, expected, new);

        /* On a mismatch, report the observed value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_try_cmpxchg_acquire() - try compare-and-exchange with acquire ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If @v equals the value pointed to by @old, atomically replaces @v with @new,
 * providing acquire ordering. Otherwise, @v is left unmodified, the value
 * found in @v is written back through @old, and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_try_cmpxchg_acquire(); arch_atomic64_try_cmpxchg_relaxed()
 * followed by an acquire fence; arch_atomic64_try_cmpxchg(); and finally an
 * open-coded version built on raw_atomic64_cmpxchg_acquire().
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_acquire() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg_acquire
        return arch_atomic64_try_cmpxchg_acquire(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* The acquire fence is issued after the relaxed operation. */
        bool swapped = arch_atomic64_try_cmpxchg_relaxed(v, old, new);

        __atomic_acquire_fence();
        return swapped;
#elif defined(arch_atomic64_try_cmpxchg)
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_acquire(v, expected, new);

        /* On a mismatch, report the observed value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_try_cmpxchg_release() - try compare-and-exchange with release ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If @v equals the value pointed to by @old, atomically replaces @v with @new,
 * providing release ordering. Otherwise, @v is left unmodified, the value
 * found in @v is written back through @old, and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_try_cmpxchg_release(); a release fence followed by
 * arch_atomic64_try_cmpxchg_relaxed(); arch_atomic64_try_cmpxchg(); and
 * finally an open-coded version built on raw_atomic64_cmpxchg_release().
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_release() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg_release
        return arch_atomic64_try_cmpxchg_release(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg_relaxed)
        /* The release fence is issued before the relaxed operation. */
        __atomic_release_fence();
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_release(v, expected, new);

        /* On a mismatch, report the observed value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_try_cmpxchg_relaxed() - try compare-and-exchange with relaxed ordering
 * @v: pointer to atomic64_t
 * @old: pointer to the s64 value @v is expected to hold
 * @new: s64 value to store on a match
 *
 * If @v equals the value pointed to by @old, atomically replaces @v with @new,
 * providing relaxed ordering. Otherwise, @v is left unmodified, the value
 * found in @v is written back through @old, and relaxed ordering is provided.
 *
 * The implementation is picked at preprocessing time, in this order:
 * arch_atomic64_try_cmpxchg_relaxed(); arch_atomic64_try_cmpxchg(); and
 * finally an open-coded version built on raw_atomic64_cmpxchg_relaxed().
 *
 * Safe to use in noinstr code; prefer atomic64_try_cmpxchg_relaxed() elsewhere.
 *
 * Return: @true if the exchange occurred, @false otherwise.
 */
static __always_inline bool
raw_atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new)
{
#ifdef arch_atomic64_try_cmpxchg_relaxed
        return arch_atomic64_try_cmpxchg_relaxed(v, old, new);
#elif defined(arch_atomic64_try_cmpxchg)
        return arch_atomic64_try_cmpxchg(v, old, new);
#else
        const s64 expected = *old;
        const s64 seen = raw_atomic64_cmpxchg_relaxed(v, expected, new);

        /* On a mismatch, report the observed value back to the caller. */
        if (unlikely(seen != expected))
                *old = seen;
        return likely(seen == expected);
#endif
}

/**
 * raw_atomic64_sub_and_test() - subtract and test for zero with full ordering
 * @i: s64 value to subtract
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - @i), providing full ordering.
 *
 * Calls arch_atomic64_sub_and_test() when the architecture defines it;
 * otherwise the result of raw_atomic64_sub_return() is compared against zero.
 *
 * Safe to use in noinstr code; prefer atomic64_sub_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_sub_and_test(s64 i, atomic64_t *v)
{
        bool is_zero;

#ifdef arch_atomic64_sub_and_test
        is_zero = arch_atomic64_sub_and_test(i, v);
#else
        is_zero = (raw_atomic64_sub_return(i, v) == 0);
#endif
        return is_zero;
}

/**
 * raw_atomic64_dec_and_test() - decrement and test for zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v - 1), providing full ordering.
 *
 * Calls arch_atomic64_dec_and_test() when the architecture defines it;
 * otherwise the result of raw_atomic64_dec_return() is compared against zero.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_and_test(atomic64_t *v)
{
        bool is_zero;

#ifdef arch_atomic64_dec_and_test
        is_zero = arch_atomic64_dec_and_test(v);
#else
        is_zero = (raw_atomic64_dec_return(v) == 0);
#endif
        return is_zero;
}

/**
 * raw_atomic64_inc_and_test() - increment and test for zero with full ordering
 * @v: pointer to atomic64_t
 *
 * Atomically replaces @v with (@v + 1), providing full ordering.
 *
 * Calls arch_atomic64_inc_and_test() when the architecture defines it;
 * otherwise the result of raw_atomic64_inc_return() is compared against zero.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_and_test() elsewhere.
 *
 * Return: @true if the resulting value of @v is zero, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_and_test(atomic64_t *v)
{
        bool is_zero;

#ifdef arch_atomic64_inc_and_test
        is_zero = arch_atomic64_inc_and_test(v);
#else
        is_zero = (raw_atomic64_inc_return(v) == 0);
#endif
        return is_zero;
}

/**
 * raw_atomic64_add_negative() - atomically add, then test for a negative result, with full ordering
 * @i: s64 amount to add to @v
 * @v: pointer to the atomic64_t being updated
 *
 * Atomically replaces @v with (@v + @i); the operation is fully ordered.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative() elsewhere.
 *
 * Return: @true when @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_negative
        return arch_atomic64_add_negative(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        bool negative;

        /* Wrap the relaxed operation in a full fence on each side. */
        __atomic_pre_full_fence();
        negative = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_post_full_fence();

        return negative;
#else
        /* Generic fallback: add-and-return, then test the sign. */
        return raw_atomic64_add_return(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_acquire() - atomically add, then test for a negative result, with acquire ordering
 * @i: s64 amount to add to @v
 * @v: pointer to the atomic64_t being updated
 *
 * Atomically replaces @v with (@v + @i); the operation has acquire ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_acquire() elsewhere.
 *
 * Return: @true when @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_acquire(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_negative_acquire
        return arch_atomic64_add_negative_acquire(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        bool negative;

        /* Relaxed operation first, acquire fence afterwards. */
        negative = arch_atomic64_add_negative_relaxed(i, v);
        __atomic_acquire_fence();

        return negative;
#elif defined(arch_atomic64_add_negative)
        /* Only the fully ordered variant is available; use it as is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic fallback: acquire add-and-return, then test the sign. */
        return raw_atomic64_add_return_acquire(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_release() - atomically add, then test for a negative result, with release ordering
 * @i: s64 amount to add to @v
 * @v: pointer to the atomic64_t being updated
 *
 * Atomically replaces @v with (@v + @i); the operation has release ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_release() elsewhere.
 *
 * Return: @true when @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_release(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_negative_release
        return arch_atomic64_add_negative_release(i, v);
#elif defined(arch_atomic64_add_negative_relaxed)
        bool negative;

        /* Release fence first, relaxed operation afterwards. */
        __atomic_release_fence();
        negative = arch_atomic64_add_negative_relaxed(i, v);

        return negative;
#elif defined(arch_atomic64_add_negative)
        /* Only the fully ordered variant is available; use it as is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic fallback: release add-and-return, then test the sign. */
        return raw_atomic64_add_return_release(i, v) < 0;
#endif
}

/**
 * raw_atomic64_add_negative_relaxed() - atomically add, then test for a negative result, with relaxed ordering
 * @i: s64 amount to add to @v
 * @v: pointer to the atomic64_t being updated
 *
 * Atomically replaces @v with (@v + @i); the operation has relaxed ordering.
 *
 * Safe to use in noinstr code; prefer atomic64_add_negative_relaxed() elsewhere.
 *
 * Return: @true when @v is negative after the addition, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_negative_relaxed(s64 i, atomic64_t *v)
{
#ifdef arch_atomic64_add_negative_relaxed
        return arch_atomic64_add_negative_relaxed(i, v);
#elif defined(arch_atomic64_add_negative)
        /* Only the fully ordered variant is available; use it as is. */
        return arch_atomic64_add_negative(i, v);
#else
        /* Generic fallback: relaxed add-and-return, then test the sign. */
        return raw_atomic64_add_return_relaxed(i, v) < 0;
#endif
}

/**
 * raw_atomic64_fetch_add_unless() - atomically add unless @v holds a given value, with full ordering
 * @v: pointer to the atomic64_t being updated
 * @a: s64 amount to add
 * @u: s64 value that blocks the update
 *
 * When (@v != @u), atomically replaces @v with (@v + @a) with full ordering.
 * When (@v == @u), @v is left alone and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_fetch_add_unless() elsewhere.
 *
 * Return: The value @v held before the call.
 */
static __always_inline s64
raw_atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u)
{
#ifdef arch_atomic64_fetch_add_unless
        return arch_atomic64_fetch_add_unless(v, a, u);
#else
        s64 seen = raw_atomic64_read(v);

        /*
         * Retry until either the forbidden value is observed or the
         * compare-and-exchange succeeds. A failed try_cmpxchg refreshes
         * 'seen' with the current contents of @v.
         */
        for (;;) {
                if (unlikely(seen == u))
                        break;
                if (raw_atomic64_try_cmpxchg(v, &seen, seen + a))
                        break;
        }

        return seen;
#endif
}

/**
 * raw_atomic64_add_unless() - atomically add unless @v holds a given value, with full ordering
 * @v: pointer to the atomic64_t being updated
 * @a: s64 amount to add
 * @u: s64 value that blocks the update
 *
 * When (@v != @u), atomically replaces @v with (@v + @a) with full ordering.
 * When (@v == @u), @v is left alone and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_add_unless() elsewhere.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
raw_atomic64_add_unless(atomic64_t *v, s64 a, s64 u)
{
#ifdef arch_atomic64_add_unless
        return arch_atomic64_add_unless(v, a, u);
#else
        /* The update happened exactly when the prior value was not @u. */
        s64 prior = raw_atomic64_fetch_add_unless(v, a, u);

        return prior != u;
#endif
}

/**
 * raw_atomic64_inc_not_zero() - atomically increment unless @v is zero, with full ordering
 * @v: pointer to the atomic64_t being updated
 *
 * When (@v != 0), atomically replaces @v with (@v + 1) with full ordering.
 * When (@v == 0), @v is left alone and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_not_zero() elsewhere.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_not_zero(atomic64_t *v)
{
#ifdef arch_atomic64_inc_not_zero
        return arch_atomic64_inc_not_zero(v);
#else
        /* Expressed as "add 1 unless the value is 0". */
        bool updated = raw_atomic64_add_unless(v, 1, 0);

        return updated;
#endif
}

/**
 * raw_atomic64_inc_unless_negative() - atomically increment unless @v is negative, with full ordering
 * @v: pointer to the atomic64_t being updated
 *
 * When (@v >= 0), atomically replaces @v with (@v + 1) with full ordering.
 * When (@v < 0), @v is left alone and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_inc_unless_negative() elsewhere.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
raw_atomic64_inc_unless_negative(atomic64_t *v)
{
#ifdef arch_atomic64_inc_unless_negative
        return arch_atomic64_inc_unless_negative(v);
#else
        s64 seen = raw_atomic64_read(v);

        /*
         * A failed try_cmpxchg refreshes 'seen', so the sign check is
         * repeated against the latest value on every attempt.
         */
        for (;;) {
                if (unlikely(seen < 0))
                        return false;
                if (raw_atomic64_try_cmpxchg(v, &seen, seen + 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic64_dec_unless_positive() - atomically decrement unless @v is positive, with full ordering
 * @v: pointer to the atomic64_t being updated
 *
 * When (@v <= 0), atomically replaces @v with (@v - 1) with full ordering.
 * When (@v > 0), @v is left alone and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_unless_positive() elsewhere.
 *
 * Return: @true if @v was modified, @false otherwise.
 */
static __always_inline bool
raw_atomic64_dec_unless_positive(atomic64_t *v)
{
#ifdef arch_atomic64_dec_unless_positive
        return arch_atomic64_dec_unless_positive(v);
#else
        s64 seen = raw_atomic64_read(v);

        /*
         * A failed try_cmpxchg refreshes 'seen', so the sign check is
         * repeated against the latest value on every attempt.
         */
        for (;;) {
                if (unlikely(seen > 0))
                        return false;
                if (raw_atomic64_try_cmpxchg(v, &seen, seen - 1))
                        return true;
        }
#endif
}

/**
 * raw_atomic64_dec_if_positive() - atomically decrement only if @v is positive, with full ordering
 * @v: pointer to the atomic64_t being updated
 *
 * When (@v > 0), atomically replaces @v with (@v - 1) with full ordering.
 * When (@v <= 0), @v is left alone and only relaxed ordering is provided.
 *
 * Safe to use in noinstr code; prefer atomic64_dec_if_positive() elsewhere.
 *
 * Return: The old value of (@v - 1), regardless of whether @v was updated.
 */
static __always_inline s64
raw_atomic64_dec_if_positive(atomic64_t *v)
{
#ifdef arch_atomic64_dec_if_positive
        return arch_atomic64_dec_if_positive(v);
#else
        s64 seen = raw_atomic64_read(v);
        s64 target;

        /*
         * Recompute the decremented value from the freshest observation
         * each round; give up without writing once it would be negative.
         */
        for (;;) {
                target = seen - 1;
                if (unlikely(target < 0))
                        break;
                if (raw_atomic64_try_cmpxchg(v, &seen, target))
                        break;
        }

        return target;
#endif
}

#endif /* _LINUX_ATOMIC_FALLBACK_H */
// b565db590afeeff0d7c9485ccbca5bb6e155749f




















































































































   64 
   64 














  246 





























    8 









    8 














































































































































































































































  300 


  301 










  301 
  301 
  301 









































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
#include <linux/bit_spinlock.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
        struct anon_vma *root;                /* Root of this anon_vma tree */
        /*
         * Note: the anon_vma_lock_*() / anon_vma_unlock_*() helpers in this
         * header always operate on root->rwsem, not on the rwsem of the
         * anon_vma they are handed.
         */
        struct rw_semaphore rwsem;        /* W: modification, R: walking the list */
        /*
         * The refcount is taken on an anon_vma when there is no
         * guarantee that the vma of page tables will exist for
         * the duration of the operation. A caller that takes
         * the reference is responsible for clearing up the
         * anon_vma if they are the last user on release.
         * See get_anon_vma() / put_anon_vma().
         */
        atomic_t refcount;

        /*
         * Count of child anon_vmas. Equal to the count of all anon_vmas that
         * have ->parent pointing to this one, including itself.
         *
         * This counter is used when deciding whether to reuse an anon_vma
         * instead of forking a new one. See the comments in anon_vma_clone().
         */
        unsigned long num_children;
        /* Count of VMAs whose ->anon_vma pointer points to this object. */
        unsigned long num_active_vmas;

        struct anon_vma *parent;        /* Parent of this anon_vma */

        /*
         * NOTE: the LSB of the rb_root.rb_node is set by
         * mm_take_all_locks() _after_ taking the above lock. So the
         * rb_root must only be read/written after taking the above lock
         * to be sure to see a valid next pointer. The LSB bit itself
         * is serialized by a system wide lock only visible to
         * mm_take_all_locks() (mm_all_locks_mutex).
         */

        /* Interval tree of private "related" vmas */
        struct rb_root_cached rb_root;
};

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
        /* The VMA end of this link. */
        struct vm_area_struct *vma;
        /* The anon_vma end of this link. */
        struct anon_vma *anon_vma;
        /* Entry in the list of all chains that belong to the same VMA. */
        struct list_head same_vma;   /* locked by mmap_lock & page_table_lock */
        /* Node in the interval tree of the linked anon_vma. */
        struct rb_node rb;                        /* locked by anon_vma->rwsem */
        /* NOTE(review): presumably the interval-tree subtree augmentation value - confirm. */
        unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
        /* Only present with CONFIG_DEBUG_VM_RB; usage is not visible in this header. */
        unsigned long cached_vma_start, cached_vma_last;
#endif
};

/*
 * Behaviour flags; each enumerator is a distinct single bit so values can be
 * OR-ed together. Bits 0x1 and 0x2 are not assigned here.
 * NOTE(review): "ttu" presumably stands for try_to_unmap - confirm against
 * the users in mm/rmap.c.
 */
enum ttu_flags {
        TTU_SPLIT_HUGE_PMD        = 0x4,        /* split huge PMD if any */
        TTU_IGNORE_MLOCK        = 0x8,        /* ignore mlock */
        TTU_SYNC                = 0x10,        /* avoid racy checks with PVMW_SYNC */
        TTU_HWPOISON                = 0x20,        /* do convert pte to hwpoison entry */
        TTU_BATCH_FLUSH                = 0x40,        /* Batch TLB flushes where possible
                                         * and caller guarantees they will
                                         * do a final flush if necessary */
        TTU_RMAP_LOCKED                = 0x80,        /* do not grab rmap lock:
                                         * caller holds it */
};

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
        atomic_inc(&anon_vma->refcount);
}

/* Out-of-line slow path, invoked by put_anon_vma() on the final reference drop. */
void __put_anon_vma(struct anon_vma *anon_vma);

/*
 * Drop one reference on @anon_vma. When the refcount reaches zero the
 * object is handed to __put_anon_vma().
 */
static inline void put_anon_vma(struct anon_vma *anon_vma)
{
        /* Other references remain: nothing more to do. */
        if (!atomic_dec_and_test(&anon_vma->refcount))
                return;

        __put_anon_vma(anon_vma);
}

/* Acquire, for writing, the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        down_write(root_sem);
}

/*
 * Try to acquire, for writing, the rwsem of the root of @anon_vma's tree.
 *
 * Return: whatever down_write_trylock() returns for that rwsem.
 */
static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        return down_write_trylock(root_sem);
}

/* Release the write hold on the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        up_write(root_sem);
}

/* Acquire, for reading, the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        down_read(root_sem);
}

/*
 * Try to acquire, for reading, the rwsem of the root of @anon_vma's tree.
 *
 * Return: whatever down_read_trylock() returns for that rwsem.
 */
static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        return down_read_trylock(root_sem);
}

/* Release the read hold on the rwsem of the root of @anon_vma's tree. */
static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
        struct rw_semaphore *root_sem = &anon_vma->root->rwsem;

        up_read(root_sem);
}


/*
 * anon_vma helper functions.
 *
 * Only the prototypes are visible here; the parameters are unnamed in the
 * declarations, so their roles must be looked up at the definitions.
 */
void anon_vma_init(void);        /* create anon_vma_cachep */
/* Slow path of anon_vma_prepare(): called when vma->anon_vma is not yet set. */
int  __anon_vma_prepare(struct vm_area_struct *);
/* Used by anon_vma_merge() on the VMA that is being merged away. */
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

/*
 * Make sure @vma has an anon_vma attached.
 *
 * Return: 0 immediately when vma->anon_vma is already set (the expected
 * case); otherwise the return value of __anon_vma_prepare().
 */
static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
        /* Uncommon case: nothing attached yet, take the out-of-line path. */
        if (unlikely(!vma->anon_vma))
                return __anon_vma_prepare(vma);

        return 0;
}

/*
 * Drop the anon_vma links of @next when it is merged into @vma.
 *
 * Both VMAs are expected to share the same anon_vma; a mismatch trips
 * VM_BUG_ON_VMA(). Only @next is unlinked, @vma is left untouched.
 */
static inline void anon_vma_merge(struct vm_area_struct *vma,
                                  struct vm_area_struct *next)
{
        VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
        unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(const struct folio *folio);

#ifdef CONFIG_MM_ID
/*
 * Take the bit spinlock that guards the large-mapcount bookkeeping of
 * @folio: bit FOLIO_MM_IDS_LOCK_BITNUM inside folio->_mm_ids.
 * Paired with folio_unlock_large_mapcount().
 */
static __always_inline void folio_lock_large_mapcount(struct folio *folio)
{
        bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

/*
 * Release the bit spinlock taken by folio_lock_large_mapcount().
 * NOTE(review): this uses the double-underscore __bit_spin_unlock() variant
 * rather than bit_spin_unlock(); confirm its constraints in bit_spinlock.h.
 */
static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
{
        __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
}

/*
 * Read the MM ID stored in slot @idx of @folio.
 *
 * @idx must be 0 or 1 (anything else triggers a one-time VM warning).
 *
 * Return: folio->_mm_id[@idx] with everything outside MM_ID_MASK cleared.
 */
static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
{
        unsigned int id;

        VM_WARN_ON_ONCE(idx != 0 && idx != 1);

        id = folio->_mm_id[idx] & MM_ID_MASK;
        return id;
}

/*
 * Store MM ID @id in slot @idx of @folio.
 *
 * @idx must be 0 or 1 (anything else triggers a one-time VM warning).
 * Only the MM_ID_MASK portion of the slot is replaced; bits of the slot
 * outside the mask are preserved.
 */
static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
{
        VM_WARN_ON_ONCE(idx != 0 && idx != 1);

        /* Clear the old ID bits and merge in the new ID. */
        folio->_mm_id[idx] = (folio->_mm_id[idx] & ~MM_ID_MASK) | id;
}

/*
 * Debug-only consistency checks shared by the large-mapcount helpers.
 *
 * @folio: folio being (un)mapped; must be large and must not be hugetlb
 * @diff:  number of mappings being set/added/removed; must be positive
 * @mm_id: ID of the MM doing the operation; must lie in
 *         [MM_ID_MIN, MM_ID_MAX]
 *
 * Nothing is modified: every statement is a VM_WARN_ON_ONCE() assertion.
 */
static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
                int diff, mm_id_t mm_id)
{
        VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
        VM_WARN_ON_ONCE(diff <= 0);
        VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);

        /*
         * Make sure we can detect at least one complete PTE mapping of the
         * folio in a single MM as "exclusively mapped". This is primarily
         * a check on 32bit, where we currently reduce the size of the per-MM
         * mapcount to a short.
         */
        VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
        VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);

        /*
         * Per-slot invariant: a free slot (MM_ID_DUMMY) carries mapcount -1,
         * and an occupied slot carries a non-negative mapcount.
         */
        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[0] != -1);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[0] < 0);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[1] != -1);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
                        folio->_mm_id_mapcount[1] < 0);
        /* An unmapped folio must not be flagged "maybe mapped shared". */
        VM_WARN_ON_ONCE(!folio_mapped(folio) &&
                        folio_test_large_maybe_mapped_shared(folio));
}

static __always_inline void folio_set_large_mapcount(struct folio *folio,
                int mapcount, struct vm_area_struct *vma)
{
        __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);

        VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
        VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);

        /* Note: mapcounts start at -1. */
        atomic_set(&folio->_large_mapcount, mapcount - 1);
        folio->_mm_id_mapcount[0] = mapcount - 1;
        folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
}

/*
 * Add @diff mappings of @folio on behalf of the MM that owns @vma.
 *
 * Runs entirely under the folio's large-mapcount bit spinlock. Besides the
 * global _large_mapcount, it maintains the two per-MM slots
 * (_mm_id[] / _mm_id_mapcount[]) and sets FOLIO_MM_IDS_SHARED_BIT whenever
 * the bookkeeping can no longer prove that a single MM owns all mappings.
 *
 * Return: the new mapcount, i.e. the stored value plus one (stored
 * mapcounts are offset by -1).
 */
static __always_inline int folio_add_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        const mm_id_t mm_id = vma->vm_mm->mm_id;
        int new_mapcount_val;

        folio_lock_large_mapcount(folio);
        __folio_large_mapcount_sanity_checks(folio, diff, mm_id);

        /* Read-modify-write of the global count, serialised by the lock taken above. */
        new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
        atomic_set(&folio->_large_mapcount, new_mapcount_val);

        /*
         * If a folio is mapped more than once into an MM on 32bit, we
         * can in theory overflow the per-MM mapcount (although only for
         * fairly large folios), turning it negative. In that case, just
         * free up the slot and mark the folio "mapped shared", otherwise
         * we might be in trouble when unmapping pages later.
         */
        if (folio_mm_id(folio, 0) == mm_id) {
                /* This MM already owns slot 0. */
                folio->_mm_id_mapcount[0] += diff;
                if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
                        folio->_mm_id_mapcount[0] = -1;
                        folio_set_mm_id(folio, 0, MM_ID_DUMMY);
                        folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
                }
        } else if (folio_mm_id(folio, 1) == mm_id) {
                /* This MM already owns slot 1. */
                folio->_mm_id_mapcount[1] += diff;
                if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
                        folio->_mm_id_mapcount[1] = -1;
                        folio_set_mm_id(folio, 1, MM_ID_DUMMY);
                        folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
                }
        } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
                /* Slot 0 is free: claim it for this MM. */
                folio_set_mm_id(folio, 0, mm_id);
                folio->_mm_id_mapcount[0] = diff - 1;
                /* We might have other mappings already. */
                if (new_mapcount_val != diff - 1)
                        folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
        } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
                /* Slot 1 is free: claim it for this MM. */
                folio_set_mm_id(folio, 1, mm_id);
                folio->_mm_id_mapcount[1] = diff - 1;
                /* Slot 0 certainly has mappings as well. */
                folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
        }
        /* No matching and no free slot: per-MM state is left as it is. */
        folio_unlock_large_mapcount(folio);
        return new_mapcount_val + 1;
}
#define folio_add_large_mapcount folio_add_return_large_mapcount

/*
 * Remove @diff mappings of @folio on behalf of the MM that owns @vma.
 *
 * Runs entirely under the folio's large-mapcount bit spinlock. Besides the
 * global _large_mapcount, it updates the per-MM slot of this MM (if it has
 * one), frees that slot once its count drops below zero, and clears
 * FOLIO_MM_IDS_SHARED_BIT when one slot accounts for every remaining
 * mapping.
 *
 * Return: the new mapcount, i.e. the stored value plus one (stored
 * mapcounts are offset by -1).
 */
static __always_inline int folio_sub_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        const mm_id_t mm_id = vma->vm_mm->mm_id;
        int new_mapcount_val;

        folio_lock_large_mapcount(folio);
        __folio_large_mapcount_sanity_checks(folio, diff, mm_id);

        /* Read-modify-write of the global count, serialised by the lock taken above. */
        new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
        atomic_set(&folio->_large_mapcount, new_mapcount_val);

        /*
         * There are valid corner cases where we might underflow a per-MM
         * mapcount (some mappings added when no slot was free, some mappings
         * added once a slot was free), so we always set it to -1 once we go
         * negative.
         */
        if (folio_mm_id(folio, 0) == mm_id) {
                folio->_mm_id_mapcount[0] -= diff;
                /* Slot still in use: skip the exclusivity re-evaluation below. */
                if (folio->_mm_id_mapcount[0] >= 0)
                        goto out;
                folio->_mm_id_mapcount[0] = -1;
                folio_set_mm_id(folio, 0, MM_ID_DUMMY);
        } else if (folio_mm_id(folio, 1) == mm_id) {
                folio->_mm_id_mapcount[1] -= diff;
                /* Slot still in use: skip the exclusivity re-evaluation below. */
                if (folio->_mm_id_mapcount[1] >= 0)
                        goto out;
                folio->_mm_id_mapcount[1] = -1;
                folio_set_mm_id(folio, 1, MM_ID_DUMMY);
        }

        /*
         * If one MM slot owns all mappings, the folio is mapped exclusively.
         * Note that if the folio is now unmapped (new_mapcount_val == -1), both
         * slots must be free (mapcount == -1), and we'll also mark it as
         * exclusive.
         */
        if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
            folio->_mm_id_mapcount[1] == new_mapcount_val)
                folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
out:
        folio_unlock_large_mapcount(folio);
        return new_mapcount_val + 1;
}
#define folio_sub_large_mapcount folio_sub_return_large_mapcount
#else /* !CONFIG_MM_ID */
/*
 * See __folio_rmap_sanity_checks(), we might map large folios even without
 * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
 */
/*
 * !CONFIG_MM_ID variant: initialise the large mapcount of @folio to
 * @mapcount mappings. @vma is accepted for interface parity and not used.
 */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
                struct vm_area_struct *vma)
{
        /* Note: mapcounts start at -1. */
        const int stored = mapcount - 1;

        atomic_set(&folio->_large_mapcount, stored);
}

static inline void folio_add_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        atomic_add(diff, &folio->_large_mapcount);
}

/*
 * !CONFIG_MM_ID stub: the value-returning variant is not provided in this
 * configuration. The body is only BUILD_BUG(), so any call that survives to
 * code generation is meant to break the build rather than run.
 */
static inline int folio_add_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        BUILD_BUG();
}

static inline void folio_sub_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        atomic_sub(diff, &folio->_large_mapcount);
}

/*
 * !CONFIG_MM_ID stub: the value-returning variant is not provided in this
 * configuration. The body is only BUILD_BUG(), so any call that survives to
 * code generation is meant to break the build rather than run.
 */
static inline int folio_sub_return_large_mapcount(struct folio *folio,
                int diff, struct vm_area_struct *vma)
{
        BUILD_BUG();
}
#endif /* CONFIG_MM_ID */

/*
 * Single-mapping convenience wrappers: each forwards to the corresponding
 * add/sub helper with a fixed diff of 1.
 */
#define folio_inc_large_mapcount(folio, vma) \
        folio_add_large_mapcount(folio, 1, vma)
#define folio_inc_return_large_mapcount(folio, vma) \
        folio_add_return_large_mapcount(folio, 1, vma)
#define folio_dec_large_mapcount(folio, vma) \
        folio_sub_large_mapcount(folio, 1, vma)
#define folio_dec_return_large_mapcount(folio, vma) \
        folio_sub_return_large_mapcount(folio, 1, vma)

/*
 * RMAP flags, currently only relevant for some anon rmap operations.
 * Declared __bitwise, and the constants below are created with __force
 * casts from plain integers.
 */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE                ((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. (Bit 0.) */
#define RMAP_EXCLUSIVE                ((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 *
 * __folio_rmap_sanity_checks() shows what each level implies for the number
 * of pages covered by one operation.
 */
enum rmap_level {
        RMAP_LEVEL_PTE = 0,        /* no per-level size constraint is checked */
        RMAP_LEVEL_PMD,                /* folio and range must be HPAGE_PMD_NR pages */
        RMAP_LEVEL_PUD,                /* folio and range must be HPAGE_PUD_NR pages */
};

/*
 * Debug-only consistency checks for an rmap operation.
 *
 * @folio:    folio being (un)mapped; must not be hugetlb or the zero folio
 * @page:     first page of the range; must belong to @folio
 * @nr_pages: number of pages in the range; must be positive, and the last
 *            page (@page + @nr_pages - 1) must belong to @folio as well
 * @level:    mapping granularity; PMD/PUD levels additionally require the
 *            folio and the range to be exactly one PMD/PUD worth of pages
 *
 * Nothing is modified: every statement is a VM_WARN_ON_*() assertion.
 */
static inline void __folio_rmap_sanity_checks(const struct folio *folio,
                const struct page *page, int nr_pages, enum rmap_level level)
{
        /* hugetlb folios are handled separately. */
        VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

        /* When (un)mapping zeropages, we should never touch ref+mapcount. */
        VM_WARN_ON_FOLIO(is_zero_folio(folio), folio);

        /*
         * TODO: we get driver-allocated folios that have nothing to do with
         * the rmap using vm_insert_page(); therefore, we cannot assume that
         * folio_test_large_rmappable() holds for large folios. We should
         * handle any desired mapcount+stats accounting for these folios in
         * VM_MIXEDMAP VMAs separately, and then sanity-check here that
         * we really only get rmappable folios.
         */

        /* The range must be non-empty and lie entirely inside @folio. */
        VM_WARN_ON_ONCE(nr_pages <= 0);
        VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
        VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

        switch (level) {
        case RMAP_LEVEL_PTE:
                break;
        case RMAP_LEVEL_PMD:
                /*
                 * We don't support folios larger than a single PMD yet. So
                 * when RMAP_LEVEL_PMD is set, we assume that we are creating
                 * a single "entire" mapping of the folio.
                 */
                VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
                VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
                break;
        case RMAP_LEVEL_PUD:
                /*
                 * Assume that we are creating a single "entire" mapping of the
                 * folio.
                 */
                VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PUD_NR, folio);
                VM_WARN_ON_FOLIO(nr_pages != HPAGE_PUD_NR, folio);
                break;
        default:
                /* Unknown granularity. */
                VM_WARN_ON_ONCE(true);
        }
}

/*
 * rmap interfaces called when adding or removing pte of page
 *
 * Only prototypes are visible here. The *_ptes() functions take an explicit
 * nr_pages; each *_pte() macro is the single-page form and passes
 * nr_pages == 1.
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
/* Single-page form of folio_add_anon_rmap_ptes(). */
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
        folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
/* Single-page form of folio_add_file_rmap_ptes(). */
#define folio_add_file_rmap_pte(folio, page, vma) \
        folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_add_file_rmap_pud(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
                struct vm_area_struct *);
/* Single-page form of folio_remove_rmap_ptes(). */
#define folio_remove_rmap_pte(folio, page, vma) \
        folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
                struct vm_area_struct *);
void folio_remove_rmap_pud(struct folio *, struct page *,
                struct vm_area_struct *);

/* hugetlb counterparts of the anon rmap-add interfaces. */
void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
                unsigned long address);

/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
                struct vm_area_struct *vma)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

        if (PageAnonExclusive(&folio->page)) {
                if (unlikely(folio_needs_cow_for_dma(vma, folio)))
                        return -EBUSY;
                ClearPageAnonExclusive(&folio->page);
        }
        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
        return 0;
}

/*
 * hugetlb flavour of folio_try_share_anon_rmap_*(); see there for the full
 * ordering rationale.
 *
 * Clears the AnonExclusive marker on the folio's head page unless the folio
 * may be DMA-pinned. The explicit barriers are only emitted when
 * CONFIG_HAVE_GUP_FAST is enabled.
 *
 * Returns 0 if the marker was cleared, -EBUSY if the folio may be pinned.
 */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* The pin check must come before the exclusive marker is dropped. */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(&folio->page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/*
 * Account one additional mapping of a non-anonymous hugetlb folio by
 * incrementing both its entire and its large mapcount.
 */
static inline void hugetlb_add_file_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

        atomic_inc(&folio->_entire_mapcount);
        atomic_inc(&folio->_large_mapcount);
}

/*
 * Drop one mapping of a hugetlb folio by decrementing both its entire and its
 * large mapcount. Unlike hugetlb_add_file_rmap(), no anon/file check is made.
 */
static inline void hugetlb_remove_rmap(struct folio *folio)
{
        VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

        atomic_dec(&folio->_entire_mapcount);
        atomic_dec(&folio->_large_mapcount);
}

/*
 * Common helper for duplicating file rmap accounting at the given @level.
 *
 * PTE level: a small folio gets its single _mapcount bumped; a large folio
 * gets (if CONFIG_PAGE_MAPCOUNT is enabled) one per-page _mapcount increment
 * for each page in [page, page + nr_pages) plus nr_pages added to the large
 * mapcount.
 * PMD/PUD level: the entire mapcount and the large mapcount are each bumped
 * by one.
 */
static __always_inline void __folio_dup_file_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
                enum rmap_level level)
{
        struct page *cur = page;
        int remaining = nr_pages;

        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        if (level == RMAP_LEVEL_PMD || level == RMAP_LEVEL_PUD) {
                atomic_inc(&folio->_entire_mapcount);
                folio_inc_large_mapcount(folio, dst_vma);
                return;
        }
        if (level != RMAP_LEVEL_PTE)
                return;

        if (!folio_test_large(folio)) {
                atomic_inc(&folio->_mapcount);
                return;
        }

        if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) {
                /* The first page is always accounted, then the remaining ones. */
                for (;;) {
                        atomic_inc(&cur->_mapcount);
                        cur++;
                        if (--remaining <= 0)
                                break;
                }
        }
        folio_add_large_mapcount(folio, nr_pages, dst_vma);
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 * @dst_vma:        The destination vm area
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma)
{
        /* All accounting is done by the common helper at PTE granularity. */
        __folio_dup_file_rmap(folio, page, nr_pages, dst_vma, RMAP_LEVEL_PTE);
}

/*
 * Single-page form of folio_dup_file_rmap_ptes(): duplicates the PTE mapping
 * of exactly one page (@page) of @folio for @dst_vma.
 */
static __always_inline void folio_dup_file_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma)
{
        __folio_dup_file_rmap(folio, page, 1, dst_vma, RMAP_LEVEL_PTE);
}

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 * @dst_vma:        The destination vm area
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 *
 * Without CONFIG_TRANSPARENT_HUGEPAGE there are no PMD mappings to duplicate;
 * the function then only warns (once).
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /*
         * A PMD mapping is a single "entire" mapping and must be accounted at
         * RMAP_LEVEL_PMD (entire mapcount + 1, large mapcount + 1), matching
         * folio_try_dup_anon_rmap_pmd() and the _pmd add/remove interfaces.
         * Accounting it at PTE level would instead bump HPAGE_PMD_NR per-page
         * mapcounts that a later PMD-level removal never drops.
         */
        __folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, dst_vma, RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
#endif
}

/*
 * Common helper for duplicating anonymous rmap accounting at @level.
 *
 * Returns -EBUSY, without touching any counter or flag, when a page that is
 * still AnonExclusive belongs to a folio that may be pinned; the caller then
 * has to copy instead of share. Otherwise the AnonExclusive markers of the
 * affected pages are cleared, the mapcounts are raised and 0 is returned.
 */
static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma, enum rmap_level level)
{
        struct page *cur = page;
        int left = nr_pages;
        bool pinned;
        int idx;

        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /*
         * A folio the parent may have pinned must not have its mappings
         * duplicated: the child has to receive a copy right away so that the
         * pinned folio is never replaced by a later write fault. Device
         * private folios are excluded from this check.
         */
        pinned = likely(!folio_is_device_private(folio)) &&
                 unlikely(folio_needs_cow_for_dma(src_vma, folio));

        /*
         * Already-shared PTEs/PMDs need no check+clear. Only pages that are
         * still PageAnonExclusive force the copy fallback when the folio may
         * be pinned.
         */
        if (level == RMAP_LEVEL_PMD || level == RMAP_LEVEL_PUD) {
                if (PageAnonExclusive(page)) {
                        if (unlikely(pinned))
                                return -EBUSY;
                        ClearPageAnonExclusive(page);
                }
                atomic_inc(&folio->_entire_mapcount);
                folio_inc_large_mapcount(folio, dst_vma);
                return 0;
        }
        if (level != RMAP_LEVEL_PTE)
                return 0;

        /* Scan the whole range first so failure leaves everything untouched. */
        if (unlikely(pinned)) {
                for (idx = 0; idx < nr_pages; idx++) {
                        if (PageAnonExclusive(&page[idx]))
                                return -EBUSY;
                }
        }

        if (!folio_test_large(folio)) {
                if (PageAnonExclusive(page))
                        ClearPageAnonExclusive(page);
                atomic_inc(&folio->_mapcount);
                return 0;
        }

        /* The first page is always processed, then the remaining ones. */
        for (;;) {
                if (PageAnonExclusive(cur))
                        ClearPageAnonExclusive(cur);
                if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT))
                        atomic_inc(&cur->_mapcount);
                cur++;
                if (--left <= 0)
                        break;
        }
        folio_add_large_mapcount(folio, nr_pages, dst_vma);
        return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *                                  of a folio
 * @folio:        The folio to duplicate the mappings of
 * @page:        The first page to duplicate the mappings of
 * @nr_pages:        The number of pages of which the mapping will be duplicated
 * @dst_vma:        The destination vm area
 * @src_vma:        The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
                struct page *page, int nr_pages, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma)
{
        /* All checks and accounting live in the common PTE-level helper. */
        return __folio_try_dup_anon_rmap(folio, page, nr_pages, dst_vma,
                                         src_vma, RMAP_LEVEL_PTE);
}

/*
 * Single-page form of folio_try_dup_anon_rmap_ptes(): tries to duplicate the
 * PTE mapping of exactly one page. Returns whatever the common helper
 * returns (0 on success, -EBUSY otherwise).
 */
static __always_inline int folio_try_dup_anon_rmap_pte(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma)
{
        return __folio_try_dup_anon_rmap(folio, page, 1, dst_vma, src_vma,
                                         RMAP_LEVEL_PTE);
}

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *                                 of a folio
 * @folio:        The folio to duplicate the mapping of
 * @page:        The first page to duplicate the mapping of
 * @dst_vma:        The destination vm area
 * @src_vma:        The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 * Without CONFIG_TRANSPARENT_HUGEPAGE the function warns (once) and always
 * returns -EBUSY.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
                struct page *page, struct vm_area_struct *dst_vma,
                struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, dst_vma,
                                         src_vma, RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
        return -EBUSY;
#endif
}

/*
 * Common helper for marking an exclusive anonymous page possibly shared.
 *
 * @nr_pages and @level are only consumed by the sanity checks; the
 * AnonExclusive marker is cleared on @page alone.
 *
 * Device private folios get the marker cleared unconditionally. For all
 * other folios the marker is cleared only if the folio is not possibly
 * DMA-pinned.
 *
 * Returns 0 if the marker was cleared, -EBUSY if the folio may be pinned.
 */
static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
                struct page *page, int nr_pages, enum rmap_level level)
{
        VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
        VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
        __folio_rmap_sanity_checks(folio, page, nr_pages, level);

        /* device private folios cannot get pinned via GUP. */
        if (unlikely(folio_is_device_private(folio))) {
                ClearPageAnonExclusive(page);
                return 0;
        }

        /*
         * We have to make sure that when we clear PageAnonExclusive, that
         * the page is not pinned and that concurrent GUP-fast won't succeed in
         * concurrently pinning the page.
         *
         * Conceptually, PageAnonExclusive clearing consists of:
         * (A1) Clear PTE
         * (A2) Check if the page is pinned; back off if so.
         * (A3) Clear PageAnonExclusive
         * (A4) Restore PTE (optional, but certainly not writable)
         *
         * When clearing PageAnonExclusive, we cannot possibly map the page
         * writable again, because anon pages that may be shared must never
         * be writable. So in any case, if the PTE was writable it cannot
         * be writable anymore afterwards and there would be a PTE change. Only
         * if the PTE wasn't writable, there might not be a PTE change.
         *
         * Conceptually, GUP-fast pinning of an anon page consists of:
         * (B1) Read the PTE
         * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
         * (B3) Pin the mapped page
         * (B4) Check if the PTE changed by re-reading it; back off if so.
         * (B5) If the original PTE is not writable, check if
         *        PageAnonExclusive is not set; back off if so.
         *
         * If the PTE was writable, we only have to make sure that GUP-fast
         * observes a PTE change and properly backs off.
         *
         * If the PTE was not writable, we have to make sure that GUP-fast either
         * detects a (temporary) PTE change or that PageAnonExclusive is cleared
         * and properly backs off.
         *
         * Consequently, when clearing PageAnonExclusive(), we have to make
         * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
         * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
         * and (B5) happen in the right memory order.
         *
         * We assume that there might not be a memory barrier after
         * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
         * so we use explicit ones here.
         */

        /* Paired with the memory barrier in try_grab_folio(). */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb();

        /* (A2) followed by (A3): check for a pin, then drop the marker. */
        if (unlikely(folio_maybe_dma_pinned(folio)))
                return -EBUSY;
        ClearPageAnonExclusive(page);

        /*
         * This is conceptually a smp_wmb() paired with the smp_rmb() in
         * gup_must_unshare().
         */
        if (IS_ENABLED(CONFIG_HAVE_GUP_FAST))
                smp_mb__after_atomic();
        return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *                                   mapped by a PTE possibly shared to prepare
 *                                   for KSM or temporary unmapping
 * @folio:        The folio to share a mapping of
 * @page:        The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail for them.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
                struct page *page)
{
        return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *                                   range mapped by a PMD possibly shared to
 *                                   prepare for temporary unmapping
 * @folio:        The folio to share the mapping of
 * @page:        The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function cannot
 * fail for them.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise. Without CONFIG_TRANSPARENT_HUGEPAGE the function warns
 * (once) and always returns -EBUSY.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
                struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
                                           RMAP_LEVEL_PMD);
#else
        WARN_ON_ONCE(true);
        return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
                        struct mem_cgroup *memcg, unsigned long *vm_flags);

/* Unmap / migration entry points; behaviour is selected via enum ttu_flags. */
void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

/* NOTE(review): semantics of @owner and @foliop are defined by the implementation. */
struct page *make_device_exclusive(struct mm_struct *mm, unsigned long addr,
                void *owner, struct folio **foliop);

/* Avoid racy checks */
#define PVMW_SYNC                (1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION                (1 << 1)

/*
 * State of a page_vma_mapped_walk() page table walk.
 *
 * DEFINE_FOLIO_VMA_WALK() shows the expected initialisation: pfn, nr_pages
 * and pgoff describe the folio, vma/address the place to look at, and flags
 * holds PVMW_* bits. pmd, pte and ptl are not set by that initialiser; they
 * are consumed and released by page_vma_mapped_walk_done() and
 * page_vma_mapped_walk_restart().
 */
struct page_vma_mapped_walk {
        unsigned long pfn;              /* first pfn of the range being looked up */
        unsigned long nr_pages;         /* number of pages in that range */
        pgoff_t pgoff;                  /* page offset associated with the range */
        struct vm_area_struct *vma;     /* VMA being walked */
        unsigned long address;          /* address within the VMA */
        pmd_t *pmd;                     /* presumably the current PMD entry, if any */
        pte_t *pte;                     /* current PTE; unmapped in ..._walk_done() unless hugetlb */
        spinlock_t *ptl;                /* page table lock held by the walk, or NULL */
        unsigned int flags;             /* PVMW_* flags */
};

/*
 * Declare a struct page_vma_mapped_walk called @name that covers the whole
 * of @_folio (pfn, number of pages and pgoff are taken from the folio) in
 * @_vma at @_address, with PVMW_* @_flags. Members not listed here (pmd, pte,
 * ptl) are zero-initialised by the designated initialiser.
 */
#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)        \
        struct page_vma_mapped_walk name = {                                \
                .pfn = folio_pfn(_folio),                                \
                .nr_pages = folio_nr_pages(_folio),                        \
                .pgoff = folio_pgoff(_folio),                                \
                .vma = _vma,                                                \
                .address = _address,                                        \
                .flags = _flags,                                        \
        }

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
        /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
        if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
                pte_unmap(pvmw->pte);
        if (pvmw->ptl)
                spin_unlock(pvmw->ptl);
}

/**
 * page_vma_mapped_walk_restart - Restart the page table walk.
 * @pvmw: Pointer to struct page_vma_mapped_walk.
 *
 * It restarts the page table walk when changes occur in the page
 * table, such as splitting a PMD. Ensures that the PTL held during
 * the previous walk is released and resets the state to allow for
 * a new walk starting at the current address stored in pvmw->address.
 */
static inline void
page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw)
{
        WARN_ON_ONCE(!pvmw->pmd && !pvmw->pte);

        if (likely(pvmw->ptl))
                spin_unlock(pvmw->ptl);
        else
                WARN_ON_ONCE(1);

        pvmw->ptl = NULL;
        pvmw->pmd = NULL;
        pvmw->pte = NULL;
}

/* Walk step and address lookup; implemented outside this header. */
bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
unsigned long page_address_in_vma(const struct folio *folio,
                const struct page *, const struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be read-only, write-protects them too)
 *
 * Returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

/* Range-based variants operating on a mapping/pfn range rather than a folio. */
int mapping_wrprotect_range(struct address_space *mapping, pgoff_t pgoff,
                unsigned long pfn, unsigned long nr_pages);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
                      struct vm_area_struct *vma);

/*
 * Bit flags; presumably the values passed as @flags to
 * remove_migration_ptes() - TODO confirm against the implementation.
 */
enum rmp_flags {
        RMP_LOCKED                = 1 << 0,
        RMP_USE_SHARED_ZEROPAGE        = 1 << 1,
};

void remove_migration_ptes(struct folio *src, struct folio *dst, int flags);

/*
 * rmap_walk_control: controls an rmap traversal for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicates that the rmap traversal bailed out due to lock
 *            contention
 * rmap_one: executed on each vma where the page is mapped
 * done: checks the termination condition of the traversal
 * anon_lock: takes the anon_vma lock in an optimized way instead of the
 *            default one
 * invalid_vma: used to skip vmas that are of no interest
 */
struct rmap_walk_control {
        void *arg;
        bool try_lock;
        bool contended;
        /*
         * Return false if page table scanning in rmap_walk should be stopped.
         * Otherwise, return true.
         */
        bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
                                        unsigned long addr, void *arg);
        int (*done)(struct folio *folio);
        struct anon_vma *(*anon_lock)(const struct folio *folio,
                                      struct rmap_walk_control *rwc);
        bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};

/* rmap traversal entry points driven by a struct rmap_walk_control. */
void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
/* Has the same signature as the rmap_walk_control::anon_lock callback. */
struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio,
                                          struct rmap_walk_control *rwc);

#else        /* !CONFIG_MMU */

/* Without an MMU these rmap operations are no-ops. */
#define anon_vma_init()                do {} while (0)
#define anon_vma_prepare(vma)        (0)

/* !CONFIG_MMU stub: reports no references and clears *vm_flags. */
static inline int folio_referenced(struct folio *folio, int is_locked,
                                  struct mem_cgroup *memcg,
                                  unsigned long *vm_flags)
{
        *vm_flags = 0;
        return 0;
}

/* !CONFIG_MMU stub: does nothing. */
static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

/* !CONFIG_MMU stub: reports that no PTEs were cleaned. */
static inline int folio_mkclean(struct folio *folio)
{
        return 0;
}
#endif        /* CONFIG_MMU */

#endif        /* _LINUX_RMAP_H */





































    3 


















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (C) 2019 Arm Ltd. */

#ifndef __KVM_ARM_HYPERCALLS_H
#define __KVM_ARM_HYPERCALLS_H

#include <asm/kvm_emulate.h>

/* Entry point for handling an SMCCC call made by @vcpu; implemented elsewhere. */
int kvm_smccc_call_handler(struct kvm_vcpu *vcpu);

/*
 * Return the SMCCC function identifier of the pending call, i.e. the value
 * of vCPU register 0 converted to u32.
 */
static inline u32 smccc_get_function(struct kvm_vcpu *vcpu)
{
        return vcpu_get_reg(vcpu, 0);
}

/* Return the first SMCCC call argument: the value of vCPU register 1. */
static inline unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu)
{
        return vcpu_get_reg(vcpu, 1);
}

/* Return the second SMCCC call argument: the value of vCPU register 2. */
static inline unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu)
{
        return vcpu_get_reg(vcpu, 2);
}

/* Return the third SMCCC call argument: the value of vCPU register 3. */
static inline unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu)
{
        return vcpu_get_reg(vcpu, 3);
}

/*
 * Store the four SMCCC return values @a0..@a3 into vCPU registers 0..3.
 * All four registers are always written.
 */
static inline void smccc_set_retval(struct kvm_vcpu *vcpu,
                                    unsigned long a0,
                                    unsigned long a1,
                                    unsigned long a2,
                                    unsigned long a3)
{
        vcpu_set_reg(vcpu, 0, a0);
        vcpu_set_reg(vcpu, 1, a1);
        vcpu_set_reg(vcpu, 2, a2);
        vcpu_set_reg(vcpu, 3, a3);
}

/* Forward declaration: only pointers to it are used below. */
struct kvm_one_reg;

/* Per-VM hypercall state setup and teardown. */
void kvm_arm_init_hypercalls(struct kvm *kvm);
void kvm_arm_teardown_hypercalls(struct kvm *kvm);
/* Firmware register enumeration and access (userspace-facing, see __user). */
int kvm_arm_get_fw_num_regs(struct kvm_vcpu *vcpu);
int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices);
int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);

/* VM-scoped SMCCC device attribute handlers. */
int kvm_vm_smccc_has_attr(struct kvm *kvm, struct kvm_device_attr *attr);
int kvm_vm_smccc_set_attr(struct kvm *kvm, struct kvm_device_attr *attr);

#endif































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_MTE_H
#define __ASM_MTE_H

#include <asm/compiler.h>
#include <asm/mte-def.h>

#ifndef __ASSEMBLY__

#include <linux/bitfield.h>
#include <linux/kasan-enabled.h>
#include <linux/page-flags.h>
#include <linux/sched.h>
#include <linux/types.h>

#include <asm/pgtable-types.h>

/*
 * Tag manipulation helpers. They are declared regardless of
 * CONFIG_ARM64_MTE; the implementations are not part of this header.
 */
void mte_clear_page_tags(void *addr);
unsigned long mte_copy_tags_from_user(void *to, const void __user *from,
                                      unsigned long n);
unsigned long mte_copy_tags_to_user(void __user *to, void *from,
                                    unsigned long n);
/* Saving/restoring tags; the swp_entry_t variants are presumably swap-related. */
int mte_save_tags(struct page *page);
void mte_save_page_tags(const void *page_addr, void *tag_storage);
void mte_restore_tags(swp_entry_t entry, struct page *page);
void mte_restore_page_tags(void *page_addr, const void *tag_storage);
void mte_invalidate_tags(int type, pgoff_t offset);
void mte_invalidate_tags_area(int type);
/* NOTE(review): allocation returns void * while free takes char *. */
void *mte_allocate_tag_storage(void);
void mte_free_tag_storage(char *storage);

#ifdef CONFIG_ARM64_MTE

/* track which pages have valid allocation tags */
#define PG_mte_tagged        PG_arch_2
/* simple lock to avoid multiple threads tagging the same page */
#define PG_mte_lock        PG_arch_3

/*
 * Mark @page as having valid allocation tags by setting PG_mte_tagged in its
 * flags. A write barrier is issued before the flag is set. Must not be used
 * on pages of hugetlb folios (debug check).
 */
static inline void set_page_mte_tagged(struct page *page)
{
        VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));

        /*
         * Ensure that the tags written prior to this function are visible
         * before the page flags update.
         */
        smp_wmb();
        set_bit(PG_mte_tagged, &page->flags);
}

/*
 * Return true if PG_mte_tagged is set on @page. When it is, a read barrier is
 * issued before returning. Must not be used on pages of hugetlb folios
 * (debug check).
 */
static inline bool page_mte_tagged(struct page *page)
{
        bool ret = test_bit(PG_mte_tagged, &page->flags);

        VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));

        /*
         * If the page is tagged, ensure ordering with a likely subsequent
         * read of the tags.
         */
        if (ret)
                smp_rmb();
        return ret;
}

/*
 * Lock the page for tagging and return 'true' if the page can be tagged,
 * 'false' if already tagged. PG_mte_tagged is never cleared and therefore the
 * locking only happens once for page initialisation.
 *
 * The page MTE lock state:
 *
 *   Locked:        PG_mte_lock && !PG_mte_tagged
 *   Unlocked:        !PG_mte_lock || PG_mte_tagged
 *
 * Acquire semantics only if the page is tagged (returning 'false').
 *
 * When PG_mte_lock was already set, this waits until PG_mte_tagged is
 * observed in the page flags before returning 'false'.
 */
static inline bool try_page_mte_tagging(struct page *page)
{
        VM_WARN_ON_ONCE(folio_test_hugetlb(page_folio(page)));

        /* The caller that flips PG_mte_lock from clear to set owns the tagging. */
        if (!test_and_set_bit(PG_mte_lock, &page->flags))
                return true;

        /*
         * The tags are either being initialised or may have been initialised
         * already. Check if the PG_mte_tagged flag has been set or wait
         * otherwise.
         */
        smp_cond_load_acquire(&page->flags, VAL & (1UL << PG_mte_tagged));

        return false;
}

/*
 * MTE entry points available with CONFIG_ARM64_MTE. Most of them have an
 * inline stub in the !CONFIG_ARM64_MTE branch of this header;
 * mte_probe_user_range() does not.
 */
void mte_zero_clear_page_tags(void *addr);
void mte_sync_tags(pte_t pte, unsigned int nr_pages);
void mte_copy_page_tags(void *kto, const void *kfrom);
void mte_thread_init_user(void);
void mte_thread_switch(struct task_struct *next);
void mte_cpu_setup(void);
void mte_suspend_enter(void);
void mte_suspend_exit(void);
long set_mte_ctrl(struct task_struct *task, unsigned long arg);
long get_mte_ctrl(struct task_struct *task);
int mte_ptrace_copy_tags(struct task_struct *child, long request,
                         unsigned long addr, unsigned long data);
size_t mte_probe_user_range(const char __user *uaddr, size_t size);

#else /* CONFIG_ARM64_MTE */

/* unused if !CONFIG_ARM64_MTE, silence the compiler */
#define PG_mte_tagged        0

/*
 * !CONFIG_ARM64_MTE stubs: no page is ever reported as tagged, tagging is
 * never granted, and the remaining hooks do nothing.
 */
static inline void set_page_mte_tagged(struct page *page)
{
}
static inline bool page_mte_tagged(struct page *page)
{
        return false;
}
static inline bool try_page_mte_tagging(struct page *page)
{
        return false;
}
static inline void mte_zero_clear_page_tags(void *addr)
{
}
static inline void mte_sync_tags(pte_t pte, unsigned int nr_pages)
{
}
static inline void mte_copy_page_tags(void *kto, const void *kfrom)
{
}
static inline void mte_thread_init_user(void)
{
}
static inline void mte_thread_switch(struct task_struct *next)
{
}
static inline void mte_suspend_enter(void)
{
}
static inline void mte_suspend_exit(void)
{
}
/* The control accessors report 0 when MTE is not built in. */
static inline long set_mte_ctrl(struct task_struct *task, unsigned long arg)
{
        return 0;
}
static inline long get_mte_ctrl(struct task_struct *task)
{
        return 0;
}
/* Tag access via ptrace always fails with -EIO when MTE is not built in. */
static inline int mte_ptrace_copy_tags(struct task_struct *child,
                                       long request, unsigned long addr,
                                       unsigned long data)
{
        return -EIO;
}

#endif /* CONFIG_ARM64_MTE */

#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_ARM64_MTE)
/*
 * hugetlb counterpart of set_page_mte_tagged(): sets PG_mte_tagged in the
 * folio flags after a write barrier. Must only be used on hugetlb folios
 * (debug check).
 */
static inline void folio_set_hugetlb_mte_tagged(struct folio *folio)
{
        VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));

        /*
         * Ensure that the tags written prior to this function are visible
         * before the folio flags update.
         */
        smp_wmb();
        set_bit(PG_mte_tagged, &folio->flags);

}

/*
 * hugetlb counterpart of page_mte_tagged(): returns true if PG_mte_tagged is
 * set in the folio flags, issuing a read barrier in that case. Must only be
 * used on hugetlb folios (debug check).
 */
static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
{
        bool ret = test_bit(PG_mte_tagged, &folio->flags);

        VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));

        /*
         * If the folio is tagged, ensure ordering with a likely subsequent
         * read of the tags.
         */
        if (ret)
                smp_rmb();
        return ret;
}

/*
 * hugetlb counterpart of try_page_mte_tagging(): returns true if the caller
 * set PG_mte_lock and may therefore tag the folio. Otherwise waits until
 * PG_mte_tagged is observed in the folio flags and returns false.
 */
static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio)
{
        VM_WARN_ON_ONCE(!folio_test_hugetlb(folio));

        if (!test_and_set_bit(PG_mte_lock, &folio->flags))
                return true;

        /*
         * The tags are either being initialised or may have been initialised
         * already. Check if the PG_mte_tagged flag has been set or wait
         * otherwise.
         */
        smp_cond_load_acquire(&folio->flags, VAL & (1UL << PG_mte_tagged));

        return false;
}
#else
/*
 * Stubs used unless both CONFIG_HUGETLB_PAGE and CONFIG_ARM64_MTE are set:
 * hugetlb folios are never reported as tagged and tagging is never granted.
 */
static inline void folio_set_hugetlb_mte_tagged(struct folio *folio)
{
}

static inline bool folio_test_hugetlb_mte_tagged(struct folio *folio)
{
        return false;
}

static inline bool folio_try_hugetlb_mte_tagging(struct folio *folio)
{
        return false;
}
#endif

/*
 * Clear PSTATE.TCO for @task on systems that support MTE, but only when tag
 * checking can actually produce reports (see below).
 */
static inline void mte_disable_tco_entry(struct task_struct *task)
{
        if (!system_supports_mte())
                return;

        /*
         * TCO is set on exception entry, so tag checking has to be re-enabled
         * explicitly. That is only required when MTE is enabled in the kernel,
         * or when the userspace task uses synchronous or asymmetric mode
         * (SCTLR_EL1.TCF0 bit 0 is set for both). With MTE disabled in the
         * kernel and disabled or asynchronous in userspace, tag check faults
         * (including in uaccesses) are not reported, so re-enabling checking
         * can be skipped. Skipping it helps on microarchitectures where
         * re-enabling TCO is expensive.
         */
        if (!kasan_hw_tags_enabled() &&
            !(task->thread.sctlr_user & (1UL << SCTLR_EL1_TCF0_SHIFT)))
                return;

        asm volatile(SET_PSTATE_TCO(0));
}

#ifdef CONFIG_KASAN_HW_TAGS
void mte_check_tfsr_el1(void);

/* On entry, check TFSR_EL1 only when hardware tag-based KASAN is enabled. */
static inline void mte_check_tfsr_entry(void)
{
        if (kasan_hw_tags_enabled())
                mte_check_tfsr_el1();
}

/*
 * Exit-side counterpart of mte_check_tfsr_entry(): when KASAN hardware tags
 * are enabled, issue the barriers below and then run the TFSR_EL1 check.
 */
static inline void mte_check_tfsr_exit(void)
{
        if (!kasan_hw_tags_enabled())
                return;

        /*
         * The asynchronous faults are sync'ed automatically with
         * TFSR_EL1 on kernel entry but for exit an explicit dsb()
         * is required.
         */
        dsb(nsh);
        isb();

        mte_check_tfsr_el1();
}
#else
/* No-op when CONFIG_KASAN_HW_TAGS is not set. */
static inline void mte_check_tfsr_el1(void)
{
}
/* No-op when CONFIG_KASAN_HW_TAGS is not set. */
static inline void mte_check_tfsr_entry(void)
{
}
/* No-op when CONFIG_KASAN_HW_TAGS is not set. */
static inline void mte_check_tfsr_exit(void)
{
}
#endif /* CONFIG_KASAN_HW_TAGS */

#endif /* __ASSEMBLY__ */
#endif /* __ASM_MTE_H  */























  307 









































































   31 


























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM vmalloc

#if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VMALLOC_H

#include <linux/tracepoint.h>

/**
 * alloc_vmap_area - called when a new vmap allocation occurs
 * @addr:        the allocated address
 * @size:        the requested size
 * @align:        the requested alignment
 * @vstart:        start of the requested address range
 * @vend:        end of the requested address range
 * @failed:        whether the allocation failed
 *
 * This event is used for debugging purposes. It gives a developer
 * extra information about how often allocations occur and which
 * parameters are passed, for further validation.
 *
 * Note that @addr is printed in decimal ("va_start: %lu"), whereas
 * @vstart and @vend are printed in hexadecimal.
 */
TRACE_EVENT(alloc_vmap_area,

        TP_PROTO(unsigned long addr, unsigned long size, unsigned long align,
                unsigned long vstart, unsigned long vend, int failed),

        TP_ARGS(addr, size, align, vstart, vend, failed),

        TP_STRUCT__entry(
                __field(unsigned long, addr)
                __field(unsigned long, size)
                __field(unsigned long, align)
                __field(unsigned long, vstart)
                __field(unsigned long, vend)
                __field(int, failed)
        ),

        TP_fast_assign(
                __entry->addr = addr;
                __entry->size = size;
                __entry->align = align;
                __entry->vstart = vstart;
                __entry->vend = vend;
                __entry->failed = failed;
        ),

        TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d",
                __entry->addr, __entry->size, __entry->align,
                __entry->vstart, __entry->vend, __entry->failed)
);

/**
 * purge_vmap_area_lazy - called when vmap areas were lazily freed
 * @start:        purging start address
 * @end:        purging end address
 * @npurged:        number of purged vmap areas
 *
 * This event is used for debugging purposes. It gives some
 * indication about the start:end range and how many objects
 * are released.
 */
TRACE_EVENT(purge_vmap_area_lazy,

        TP_PROTO(unsigned long start, unsigned long end,
                unsigned int npurged),

        TP_ARGS(start, end, npurged),

        TP_STRUCT__entry(
                __field(unsigned long, start)
                __field(unsigned long, end)
                __field(unsigned int, npurged)
        ),

        TP_fast_assign(
                __entry->start = start;
                __entry->end = end;
                __entry->npurged = npurged;
        ),

        TP_printk("start=0x%lx end=0x%lx num_purged=%u",
                __entry->start, __entry->end, __entry->npurged)
);

/**
 * free_vmap_area_noflush - called when a vmap area is freed
 * @va_start:        start address of the VA
 * @nr_lazy:        current number of lazy pages
 * @nr_lazy_max:        maximum number of lazy pages
 *
 * This event is used for debugging purposes. It gives some
 * indication about the VA that is released, the current lazy
 * count and the maximum allowed threshold before all of them
 * are dropped.
 */
TRACE_EVENT(free_vmap_area_noflush,

        TP_PROTO(unsigned long va_start, unsigned long nr_lazy,
                unsigned long nr_lazy_max),

        TP_ARGS(va_start, nr_lazy, nr_lazy_max),

        TP_STRUCT__entry(
                __field(unsigned long, va_start)
                __field(unsigned long, nr_lazy)
                __field(unsigned long, nr_lazy_max)
        ),

        TP_fast_assign(
                __entry->va_start = va_start;
                __entry->nr_lazy = nr_lazy;
                __entry->nr_lazy_max = nr_lazy_max;
        ),

        TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu",
                __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max)
);

#endif /*  _TRACE_VMALLOC_H */

/* This part must be outside protection */
#include <trace/define_trace.h>





































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 


    3 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/rtnetlink.h>
#include <net/busy_poll.h>
#include <net/net_namespace.h>
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <net/xdp_sock.h>
#include <net/page_pool/memory_provider.h>

#include "dev.h"
#include "devmem.h"
#include "netdev-genl-gen.h"

/*
 * Resume state of a multi-part dump. It lives in the netlink callback's
 * ctx area and is obtained through netdev_dump_ctx().
 */
struct netdev_nl_dump_ctx {
        unsigned long        ifindex;        /* cursor of the device iteration */
        unsigned int        rxq_idx;        /* next Rx queue index to dump */
        unsigned int        txq_idx;        /* next Tx queue index to dump */
        unsigned int        napi_id;        /* ID of the last NAPI dumped, 0 if none */
};

/* View the callback's ctx area as this family's dump state. */
static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
{
        struct netdev_nl_dump_ctx *dump_ctx;

        /* Build-time check that the state fits into cb->ctx */
        NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx);

        dump_ctx = (struct netdev_nl_dump_ctx *)cb->ctx;

        return dump_ctx;
}

/*
 * Append one device message describing @netdev to @rsp.
 *
 * The message carries the ifindex, the XDP feature mask, the XDP Rx metadata
 * feature mask, the XSK Tx metadata feature mask and, for devices advertising
 * NETDEV_XDP_ACT_XSK_ZEROCOPY, the zero-copy maximum segment count.
 *
 * Returns 0 on success or -EMSGSIZE if the message does not fit; a partially
 * built message is cancelled before returning.
 */
static int
netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
                   const struct genl_info *info)
{
        u64 xsk_features = 0;
        u64 xdp_rx_meta = 0;
        void *hdr;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr)
                return -EMSGSIZE;

/*
 * XDP_METADATA_KFUNC_xxx expands XDP_METADATA_KFUNC once per listed entry;
 * each expansion ORs the entry's flag into xdp_rx_meta when the driver
 * provides the matching xdp_metadata_ops callback.
 */
#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
        if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
                xdp_rx_meta |= flag;
XDP_METADATA_KFUNC_xxx
#undef XDP_METADATA_KFUNC

        /* One XSK flag per Tx metadata callback the driver implements */
        if (netdev->xsk_tx_metadata_ops) {
                if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
                        xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
                if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
                        xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
                if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
                        xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
        }

        if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
            nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
                              netdev->xdp_features, NETDEV_A_DEV_PAD) ||
            nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
                              xdp_rx_meta, NETDEV_A_DEV_PAD) ||
            nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
                              xsk_features, NETDEV_A_DEV_PAD))
                goto err_cancel_msg;

        /* The segment limit is only reported for zero-copy capable devices */
        if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
                if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
                                netdev->xdp_zc_max_segs))
                        goto err_cancel_msg;
        }

        genlmsg_end(rsp, hdr);

        return 0;

err_cancel_msg:
        genlmsg_cancel(rsp, hdr);
        return -EMSGSIZE;
}

static void
netdev_genl_dev_notify(struct net_device *netdev, int cmd)
{
        struct genl_info info;
        struct sk_buff *ntf;

        if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev),
                                NETDEV_NLGRP_MGMT))
                return;

        genl_info_init_ntf(&info, &netdev_nl_family, cmd);

        ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!ntf)
                return;

        if (netdev_nl_dev_fill(netdev, ntf, &info)) {
                nlmsg_free(ntf);
                return;
        }

        genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf,
                                0, NETDEV_NLGRP_MGMT, GFP_KERNEL);
}

/*
 * Handle a device "get" request: look up the device named by
 * NETDEV_A_DEV_IFINDEX under rtnl_lock and reply with its description.
 *
 * Returns -EINVAL if the ifindex attribute is missing, -ENOMEM if the reply
 * cannot be allocated, -ENODEV if no such device exists, the fill error if
 * building the reply fails, or the result of genlmsg_reply().
 */
int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *dev;
        struct sk_buff *reply;
        u32 dev_index;
        int ret;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX))
                return -EINVAL;

        dev_index = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);

        reply = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        rtnl_lock();
        dev = __dev_get_by_index(genl_info_net(info), dev_index);
        ret = dev ? netdev_nl_dev_fill(dev, reply, info) : -ENODEV;
        rtnl_unlock();

        if (ret) {
                nlmsg_free(reply);
                return ret;
        }

        return genlmsg_reply(reply, info);
}

/*
 * Dump a description of every device in the requesting socket's namespace.
 * The iteration cursor is kept in the dump context, and the walk stops at
 * the first device whose message could not be added to @skb.
 */
int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct netdev_nl_dump_ctx *dump_ctx = netdev_dump_ctx(cb);
        struct net *netns = sock_net(skb->sk);
        struct net_device *dev;
        int ret = 0;

        rtnl_lock();
        for_each_netdev_dump(netns, dev, dump_ctx->ifindex) {
                ret = netdev_nl_dev_fill(dev, skb, genl_info_dump(cb));
                if (ret < 0)
                        break;
        }
        rtnl_unlock();

        return ret;
}

/*
 * Append one message describing @napi to @rsp.
 *
 * Nothing is written (and 0 is returned) when the owning device is not up.
 * The IRQ attribute is emitted only for a non-negative napi->irq and the PID
 * attribute only when the NAPI has a thread.
 *
 * Returns 0 on success or -EMSGSIZE if the message does not fit; a partially
 * built message is cancelled.
 */
static int
netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
                        const struct genl_info *info)
{
        void *msg_hdr;

        if (!napi->dev->up)
                return 0;

        msg_hdr = genlmsg_iput(rsp, info);
        if (!msg_hdr)
                return -EMSGSIZE;

        if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id) ||
            nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
                goto cancel_msg;

        if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq))
                goto cancel_msg;

        if (napi->thread &&
            nla_put_u32(rsp, NETDEV_A_NAPI_PID, task_pid_nr(napi->thread)))
                goto cancel_msg;

        /* Each setting is read right before its attribute is written */
        if (nla_put_s32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
                        napi_get_defer_hard_irqs(napi)) ||
            nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
                         napi_get_irq_suspend_timeout(napi)) ||
            nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
                         napi_get_gro_flush_timeout(napi)))
                goto cancel_msg;

        genlmsg_end(rsp, msg_hdr);

        return 0;

cancel_msg:
        genlmsg_cancel(rsp, msg_hdr);
        return -EMSGSIZE;
}

/*
 * Handle a NAPI "get" request for the instance named by NETDEV_A_NAPI_ID.
 *
 * Returns -EINVAL if the ID attribute is missing, -ENOMEM if the reply cannot
 * be allocated, -ENOENT if the NAPI is not found (the attribute is flagged in
 * extack) or if the fill step produced an empty reply, the fill error if
 * building the reply fails, or the result of genlmsg_reply().
 */
int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct napi_struct *target;
        struct sk_buff *reply;
        u32 id;
        int ret;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
                return -EINVAL;

        id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);

        reply = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        target = netdev_napi_by_id_lock(genl_info_net(info), id);
        if (!target) {
                NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
                ret = -ENOENT;
        } else {
                ret = netdev_nl_napi_fill_one(reply, target, info);
                netdev_unlock(target->dev);
        }

        /* A successful fill that wrote nothing also counts as "not found" */
        if (!ret && !reply->len)
                ret = -ENOENT;

        if (ret) {
                nlmsg_free(reply);
                return ret;
        }

        return genlmsg_reply(reply, info);
}

/*
 * Dump the NAPI instances of @netdev into @rsp, resuming after the instance
 * recorded in ctx->napi_id.
 *
 * Devices that are not up are skipped. Returns 0 when all remaining
 * instances were written, or the error from netdev_nl_napi_fill_one();
 * ctx->napi_id then still names the last instance that was written.
 */
static int
netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
                        const struct genl_info *info,
                        struct netdev_nl_dump_ctx *ctx)
{
        struct napi_struct *napi;
        unsigned int prev_id;
        int err = 0;

        if (!netdev->up)
                return err;

        prev_id = UINT_MAX;
        list_for_each_entry(napi, &netdev->napi_list, dev_list) {
                if (!napi_id_valid(napi->napi_id))
                        continue;

                /* Dump continuation below depends on the list being sorted */
                WARN_ON_ONCE(napi->napi_id >= prev_id);
                prev_id = napi->napi_id;

                /*
                 * IDs are expected in strictly decreasing order, so anything
                 * not below the saved ID was handled by an earlier pass.
                 */
                if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
                        continue;

                err = netdev_nl_napi_fill_one(rsp, napi, info);
                if (err)
                        return err;
                ctx->napi_id = napi->napi_id;
        }
        return err;
}

/*
 * Dump NAPI instances. With a non-zero NETDEV_A_NAPI_IFINDEX only that
 * device is dumped (-ENODEV if it does not exist); otherwise every device in
 * the socket's namespace is walked, resuming from the dump context.
 */
int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct netdev_nl_dump_ctx *dump_ctx = netdev_dump_ctx(cb);
        const struct genl_info *info = genl_info_dump(cb);
        struct net *netns = sock_net(skb->sk);
        struct net_device *dev;
        u32 dev_index = 0;
        int ret = 0;

        if (info->attrs[NETDEV_A_NAPI_IFINDEX])
                dev_index = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);

        if (dev_index) {
                dev = netdev_get_by_index_lock(netns, dev_index);
                if (!dev)
                        return -ENODEV;

                ret = netdev_nl_napi_dump_one(dev, skb, info, dump_ctx);
                netdev_unlock(dev);
                return ret;
        }

        for_each_netdev_lock_scoped(netns, dev, dump_ctx->ifindex) {
                ret = netdev_nl_napi_dump_one(dev, skb, info, dump_ctx);
                if (ret < 0)
                        break;
                /* Device fully dumped: the next one starts from scratch */
                dump_ctx->napi_id = 0;
        }

        return ret;
}

/*
 * Apply to @napi each setting whose attribute is present in the request:
 * defer-hard-irqs, IRQ suspend timeout and GRO flush timeout. Settings whose
 * attribute is absent are left untouched. Always returns 0.
 */
static int
netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
{
        struct nlattr **tb = info->attrs;

        if (tb[NETDEV_A_NAPI_DEFER_HARD_IRQS])
                napi_set_defer_hard_irqs(napi,
                                         nla_get_u32(tb[NETDEV_A_NAPI_DEFER_HARD_IRQS]));

        if (tb[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT])
                napi_set_irq_suspend_timeout(napi,
                                             nla_get_uint(tb[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]));

        if (tb[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT])
                napi_set_gro_flush_timeout(napi,
                                           nla_get_uint(tb[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]));

        return 0;
}

/*
 * Handle a NAPI "set" request for the instance named by NETDEV_A_NAPI_ID.
 *
 * Returns -EINVAL if the ID attribute is missing, -ENOENT if no such NAPI
 * exists (the attribute is flagged in extack), otherwise the result of
 * netdev_nl_napi_set_config().
 */
int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct napi_struct *target;
        unsigned int id;
        int ret;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
                return -EINVAL;

        id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);

        target = netdev_napi_by_id_lock(genl_info_net(info), id);
        if (!target) {
                NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
                return -ENOENT;
        }

        ret = netdev_nl_napi_set_config(target, info);
        netdev_unlock(target->dev);

        return ret;
}

/*
 * Add NETDEV_A_QUEUE_NAPI_ID for @napi. A NULL @napi or one without a valid
 * ID is not an error: nothing is added and 0 is returned.
 */
static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
{
        if (!napi || !napi_id_valid(napi->napi_id))
                return 0;

        return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id);
}

/*
 * Append one message describing queue @q_idx of type @q_type on @netdev.
 *
 * Every message carries the queue ID, type and ifindex. Rx and Tx queues add
 * their NAPI ID when one is attached; Rx queues also let an installed memory
 * provider add its own attributes. With CONFIG_XDP_SOCKETS, an empty
 * NETDEV_A_QUEUE_XSK nest is added for queues that have a pool set.
 *
 * The index is not range-checked here.
 *
 * Returns 0 on success or -EMSGSIZE if the message does not fit; a partially
 * built message is cancelled.
 */
static int
netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
                         u32 q_idx, u32 q_type, const struct genl_info *info)
{
        struct pp_memory_provider_params *params;
        struct netdev_rx_queue *rxq;
        struct netdev_queue *txq;
        void *hdr;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr)
                return -EMSGSIZE;

        if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) ||
            nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) ||
            nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex))
                goto nla_put_failure;

        /* Any other q_type value yields a message with only the common part */
        switch (q_type) {
        case NETDEV_QUEUE_TYPE_RX:
                rxq = __netif_get_rx_queue(netdev, q_idx);
                if (nla_put_napi_id(rsp, rxq->napi))
                        goto nla_put_failure;

                /* A memory provider, if installed, reports its own attributes */
                params = &rxq->mp_params;
                if (params->mp_ops &&
                    params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
                        goto nla_put_failure;
#ifdef CONFIG_XDP_SOCKETS
                if (rxq->pool)
                        if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
                                goto nla_put_failure;
#endif

                break;
        case NETDEV_QUEUE_TYPE_TX:
                txq = netdev_get_tx_queue(netdev, q_idx);
                if (nla_put_napi_id(rsp, txq->napi))
                        goto nla_put_failure;
#ifdef CONFIG_XDP_SOCKETS
                if (txq->pool)
                        if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
                                goto nla_put_failure;
#endif
                break;
        }

        genlmsg_end(rsp, hdr);

        return 0;

nla_put_failure:
        genlmsg_cancel(rsp, hdr);
        return -EMSGSIZE;
}

/*
 * Check that @q_id is below the device's real number of Rx or Tx queues,
 * depending on @q_type. Returns -EINVAL when it is out of range and 0
 * otherwise; any other @q_type value is accepted as-is.
 */
static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id,
                                    u32 q_type)
{
        if (q_type == NETDEV_QUEUE_TYPE_RX)
                return q_id >= netdev->real_num_rx_queues ? -EINVAL : 0;

        if (q_type == NETDEV_QUEUE_TYPE_TX &&
            q_id >= netdev->real_num_tx_queues)
                return -EINVAL;

        return 0;
}

/*
 * Validate a single-queue request and, if it is acceptable, append the queue
 * message to @rsp. Returns -ENOENT when the device is not up, the validation
 * error for an out-of-range index, or the result of the fill step.
 */
static int
netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
                     u32 q_type, const struct genl_info *info)
{
        int ret;

        if (!netdev->up)
                return -ENOENT;

        ret = netdev_nl_queue_validate(netdev, q_idx, q_type);
        if (!ret)
                ret = netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type,
                                               info);

        return ret;
}

/*
 * Handle a queue "get" request. The queue ID, queue type and ifindex
 * attributes are all required.
 *
 * Returns -EINVAL if an attribute is missing, -ENOMEM if the reply cannot be
 * allocated, -ENODEV if the device does not exist, the error from
 * netdev_nl_queue_fill(), or the result of genlmsg_reply().
 */
int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
{
        u32 queue_id, queue_type, dev_index;
        struct net_device *dev;
        struct sk_buff *reply;
        int ret;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) ||
            GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
            GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX))
                return -EINVAL;

        queue_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]);
        queue_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]);
        dev_index = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);

        reply = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!reply)
                return -ENOMEM;

        rtnl_lock();
        dev = netdev_get_by_index_lock(genl_info_net(info), dev_index);
        if (!dev) {
                ret = -ENODEV;
        } else {
                ret = netdev_nl_queue_fill(reply, dev, queue_id, queue_type,
                                           info);
                netdev_unlock(dev);
        }
        rtnl_unlock();

        if (ret) {
                nlmsg_free(reply);
                return ret;
        }

        return genlmsg_reply(reply, info);
}

/*
 * Dump all Rx queues and then all Tx queues of @netdev, starting at the
 * indices saved in @ctx. An index is advanced only after its queue has been
 * written, so on error @ctx names the queue to retry. Devices that are not
 * up are skipped.
 */
static int
netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
                         const struct genl_info *info,
                         struct netdev_nl_dump_ctx *ctx)
{
        int ret;

        if (!netdev->up)
                return 0;

        while (ctx->rxq_idx < netdev->real_num_rx_queues) {
                ret = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx,
                                               NETDEV_QUEUE_TYPE_RX, info);
                if (ret)
                        return ret;
                ctx->rxq_idx++;
        }

        while (ctx->txq_idx < netdev->real_num_tx_queues) {
                ret = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx,
                                               NETDEV_QUEUE_TYPE_TX, info);
                if (ret)
                        return ret;
                ctx->txq_idx++;
        }

        return 0;
}

/*
 * Dump queue descriptions. With a non-zero NETDEV_A_QUEUE_IFINDEX only that
 * device is dumped (-ENODEV if it does not exist); otherwise every device in
 * the socket's namespace is walked, resuming from the dump context.
 *
 * The whole operation runs under rtnl_lock, with each device additionally
 * locked while its queues are dumped.
 */
int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
        const struct genl_info *info = genl_info_dump(cb);
        struct net *net = sock_net(skb->sk);
        struct net_device *netdev;
        u32 ifindex = 0;
        int err = 0;

        if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
                ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);

        rtnl_lock();
        if (ifindex) {
                netdev = netdev_get_by_index_lock(net, ifindex);
                if (netdev) {
                        err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
                        netdev_unlock(netdev);
                } else {
                        err = -ENODEV;
                }
        } else {
                for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
                        err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
                        if (err < 0)
                                break;
                        /* Device done: the next one starts at queue 0 */
                        ctx->rxq_idx = 0;
                        ctx->txq_idx = 0;
                }
        }
        rtnl_unlock();

        return err;
}

/* Sentinel marking a counter that has not been reported */
#define NETDEV_STAT_NOT_SET                (~0ULL)

/*
 * Accumulate one block of u64 counters into another, element by element.
 *
 * @_sum:  destination counters, updated in place
 * @_add:  counters to add
 * @size:  size of each block in bytes
 *
 * An element is added only when both the destination and the source hold a
 * real value; if either is NETDEV_STAT_NOT_SET the destination is left as is.
 *
 * Only whole u64 elements are processed. A trailing partial element (a @size
 * that is not a multiple of sizeof(u64)) is ignored instead of letting the
 * byte counter wrap around and run past the end of both blocks.
 */
static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size)
{
        const u64 *add = _add;
        u64 *sum = _sum;
        size_t nr_stats = size / sizeof(*sum);
        size_t i;

        for (i = 0; i < nr_stats; i++) {
                if (add[i] != NETDEV_STAT_NOT_SET &&
                    sum[i] != NETDEV_STAT_NOT_SET)
                        sum[i] += add[i];
        }
}

/*
 * Add counter @value as attribute @attr_id, unless the counter is still
 * NETDEV_STAT_NOT_SET, in which case nothing is added and 0 is returned.
 */
static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value)
{
        if (value != NETDEV_STAT_NOT_SET)
                return nla_put_uint(rsp, attr_id, value);

        return 0;
}

/*
 * Emit the Rx counters in @rx as NETDEV_A_QSTATS_RX_* attributes.
 * Counters still equal to NETDEV_STAT_NOT_SET are skipped by
 * netdev_stat_put(). Returns 0 on success, or -EMSGSIZE as soon as one
 * attribute cannot be added.
 */
static int
netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
{
        if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))
                return -EMSGSIZE;
        return 0;
}

/*
 * Emit the Tx counters in @tx as NETDEV_A_QSTATS_TX_* attributes.
 * Counters still equal to NETDEV_STAT_NOT_SET are skipped by
 * netdev_stat_put(). Returns 0 on success, or -EMSGSIZE as soon as one
 * attribute cannot be added.
 */
static int
netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
{
        if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) ||
            netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))
                return -EMSGSIZE;
        return 0;
}

/*
 * Append one per-queue statistics message for queue @i of type @q_type.
 *
 * The counters are pre-filled with 0xff bytes, so every u64 starts out as
 * NETDEV_STAT_NOT_SET, and the driver callback then overwrites whatever it
 * reports. If the driver leaves the whole structure untouched the message is
 * cancelled and 0 is returned, so the queue is simply omitted.
 *
 * The callers are expected to have checked that the relevant
 * get_queue_stats_* callback exists; it is called unconditionally here.
 *
 * Returns 0 on success (including the "nothing reported" case) or -EMSGSIZE
 * if the message does not fit.
 */
static int
netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp,
                      u32 q_type, int i, const struct genl_info *info)
{
        const struct netdev_stat_ops *ops = netdev->stat_ops;
        struct netdev_queue_stats_rx rx;
        struct netdev_queue_stats_tx tx;
        void *hdr;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr)
                return -EMSGSIZE;
        if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) ||
            nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) ||
            nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i))
                goto nla_put_failure;

        switch (q_type) {
        case NETDEV_QUEUE_TYPE_RX:
                memset(&rx, 0xff, sizeof(rx));
                ops->get_queue_stats_rx(netdev, i, &rx);
                /* Still all 0xff: the driver reported nothing for this queue */
                if (!memchr_inv(&rx, 0xff, sizeof(rx)))
                        goto nla_cancel;
                if (netdev_nl_stats_write_rx(rsp, &rx))
                        goto nla_put_failure;
                break;
        case NETDEV_QUEUE_TYPE_TX:
                memset(&tx, 0xff, sizeof(tx));
                ops->get_queue_stats_tx(netdev, i, &tx);
                /* Still all 0xff: the driver reported nothing for this queue */
                if (!memchr_inv(&tx, 0xff, sizeof(tx)))
                        goto nla_cancel;
                if (netdev_nl_stats_write_tx(rsp, &tx))
                        goto nla_put_failure;
                break;
        }

        genlmsg_end(rsp, hdr);
        return 0;

        /* Not an error: drop the empty message and carry on */
nla_cancel:
        genlmsg_cancel(rsp, hdr);
        return 0;
nla_put_failure:
        genlmsg_cancel(rsp, hdr);
        return -EMSGSIZE;
}

/*
 * Dump per-queue stats of @netdev, RX queues first and TX queues second.
 *
 * Devices without IFF_UP are skipped. The walk resumes from ctx->rxq_idx /
 * ctx->txq_idx, and each index is advanced only after its queue was handled
 * without error. Both indices are reset to 0 once every queue has been
 * visited.
 *
 * Returns 0 on completion, or the first non-zero value returned by
 * netdev_nl_stats_queue().
 */
static int
netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
                         const struct genl_info *info,
                         struct netdev_nl_dump_ctx *ctx)
{
        const struct netdev_stat_ops *ops = netdev->stat_ops;
        int q, ret;

        if (!(netdev->flags & IFF_UP))
                return 0;

        if (ops->get_queue_stats_rx) {
                for (q = ctx->rxq_idx; q < netdev->real_num_rx_queues; q++) {
                        ret = netdev_nl_stats_queue(netdev, rsp,
                                                    NETDEV_QUEUE_TYPE_RX,
                                                    q, info);
                        if (ret)
                                return ret;
                        ctx->rxq_idx = q + 1;
                }
        }

        if (ops->get_queue_stats_tx) {
                for (q = ctx->txq_idx; q < netdev->real_num_tx_queues; q++) {
                        ret = netdev_nl_stats_queue(netdev, rsp,
                                                    NETDEV_QUEUE_TYPE_TX,
                                                    q, info);
                        if (ret)
                                return ret;
                        ctx->txq_idx = q + 1;
                }
        }

        /* Whole device done: the next device starts from queue 0 again. */
        ctx->rxq_idx = 0;
        ctx->txq_idx = 0;
        return 0;
}

static int
netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
                          const struct genl_info *info)
{
        struct netdev_queue_stats_rx rx_sum, rx;
        struct netdev_queue_stats_tx tx_sum, tx;
        const struct netdev_stat_ops *ops;
        void *hdr;
        int i;

        ops = netdev->stat_ops;
        /* Netdev can't guarantee any complete counters */
        if (!ops->get_base_stats)
                return 0;

        memset(&rx_sum, 0xff, sizeof(rx_sum));
        memset(&tx_sum, 0xff, sizeof(tx_sum));

        ops->get_base_stats(netdev, &rx_sum, &tx_sum);

        /* The op was there, but nothing reported, don't bother */
        if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
            !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum)))
                return 0;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr)
                return -EMSGSIZE;
        if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
                goto nla_put_failure;

        for (i = 0; i < netdev->real_num_rx_queues; i++) {
                memset(&rx, 0xff, sizeof(rx));
                if (ops->get_queue_stats_rx)
                        ops->get_queue_stats_rx(netdev, i, &rx);
                netdev_nl_stats_add(&rx_sum, &rx, sizeof(rx));
        }
        for (i = 0; i < netdev->real_num_tx_queues; i++) {
                memset(&tx, 0xff, sizeof(tx));
                if (ops->get_queue_stats_tx)
                        ops->get_queue_stats_tx(netdev, i, &tx);
                netdev_nl_stats_add(&tx_sum, &tx, sizeof(tx));
        }

        if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
            netdev_nl_stats_write_tx(rsp, &tx_sum))
                goto nla_put_failure;

        genlmsg_end(rsp, hdr);
        return 0;

nla_put_failure:
        genlmsg_cancel(rsp, hdr);
        return -EMSGSIZE;
}

/*
 * Dump the qstats of a single device according to @scope.
 *
 * Devices without stat_ops are skipped (return 0). Scope 0 selects the
 * per-device dump, NETDEV_QSTATS_SCOPE_QUEUE the per-queue dump. Any other
 * scope yields -EINVAL.
 */
static int
netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope,
                              struct sk_buff *skb, const struct genl_info *info,
                              struct netdev_nl_dump_ctx *ctx)
{
        if (!netdev->stat_ops)
                return 0;

        if (scope == 0)
                return netdev_nl_stats_by_netdev(netdev, skb, info);

        if (scope == NETDEV_QSTATS_SCOPE_QUEUE)
                return netdev_nl_stats_by_queue(netdev, skb, info, ctx);

        /* Should not happen, per netlink policy */
        return -EINVAL;
}

/*
 * Netlink dump callback for qstats.
 *
 * The optional NETDEV_A_QSTATS_SCOPE and NETDEV_A_QSTATS_IFINDEX request
 * attributes default to 0 when absent. A non-zero ifindex restricts the dump
 * to that device: -ENODEV if it does not exist, -EOPNOTSUPP if it has no
 * stat_ops, with the ifindex attribute flagged as the bad attribute in both
 * cases. Otherwise all devices of the namespace are walked, stopping at the
 * first negative return. The whole walk runs under rtnl_lock().
 */
int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
                                struct netlink_callback *cb)
{
        struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
        const struct genl_info *info = genl_info_dump(cb);
        struct net *net = sock_net(skb->sk);
        unsigned int ifindex = 0;
        unsigned int scope = 0;
        struct net_device *netdev;
        int err = 0;

        if (info->attrs[NETDEV_A_QSTATS_SCOPE])
                scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]);
        if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
                ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);

        rtnl_lock();
        if (!ifindex) {
                for_each_netdev_dump(net, netdev, ctx->ifindex) {
                        err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
                                                            info, ctx);
                        if (err < 0)
                                break;
                }
        } else {
                netdev = __dev_get_by_index(net, ifindex);
                if (!netdev) {
                        NL_SET_BAD_ATTR(info->extack,
                                        info->attrs[NETDEV_A_QSTATS_IFINDEX]);
                        err = -ENODEV;
                } else if (!netdev->stat_ops) {
                        NL_SET_BAD_ATTR(info->extack,
                                        info->attrs[NETDEV_A_QSTATS_IFINDEX]);
                        err = -EOPNOTSUPP;
                } else {
                        err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
                                                            info, ctx);
                }
        }
        rtnl_unlock();

        return err;
}

/*
 * Handle a bind-rx request.
 *
 * Binds the dma-buf named by NETDEV_A_DMABUF_FD to every RX queue listed in
 * the NETDEV_A_DMABUF_QUEUES attributes of device NETDEV_A_DEV_IFINDEX, adds
 * the binding to the requesting socket's private list and replies with
 * NETDEV_A_DMABUF_ID.
 *
 * Returns 0 on success or a negative errno. If anything fails after the
 * binding was created, net_devmem_unbind_dmabuf() is called on it before
 * returning.
 */
int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
        struct net_devmem_dmabuf_binding *binding;
        u32 ifindex, dmabuf_fd, rxq_idx;
        struct netdev_nl_sock *priv;
        struct net_device *netdev;
        struct sk_buff *rsp;
        struct nlattr *attr;
        int rem, err = 0;
        void *hdr;

        if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
            GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) ||
            GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES))
                return -EINVAL;

        ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
        dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);

        priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
        if (IS_ERR(priv))
                return PTR_ERR(priv);

        rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!rsp)
                return -ENOMEM;

        hdr = genlmsg_iput(rsp, info);
        if (!hdr) {
                err = -EMSGSIZE;
                goto err_genlmsg_free;
        }

        /* Lock order: socket private lock first, then the netdev lock. */
        mutex_lock(&priv->lock);

        netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
        if (!netdev) {
                err = -ENODEV;
                goto err_unlock_sock;
        }
        if (!netif_device_present(netdev))
                err = -ENODEV;
        else if (!netdev_need_ops_lock(netdev))
                err = -EOPNOTSUPP;
        if (err) {
                NL_SET_BAD_ATTR(info->extack,
                                info->attrs[NETDEV_A_DEV_IFINDEX]);
                goto err_unlock;
        }

        binding = net_devmem_bind_dmabuf(netdev, dmabuf_fd, info->extack);
        if (IS_ERR(binding)) {
                err = PTR_ERR(binding);
                goto err_unlock;
        }

        /* Every NETDEV_A_DMABUF_QUEUES attribute names one RX queue. */
        nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
                               genlmsg_data(info->genlhdr),
                               genlmsg_len(info->genlhdr), rem) {
                err = nla_parse_nested(
                        tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
                        netdev_queue_id_nl_policy, info->extack);
                if (err < 0)
                        goto err_unbind;

                if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
                    NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
                        err = -EINVAL;
                        goto err_unbind;
                }

                if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
                        NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
                        err = -EINVAL;
                        goto err_unbind;
                }

                rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);

                err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
                                                      info->extack);
                if (err)
                        goto err_unbind;
        }

        list_add(&binding->list, &priv->bindings);

        /* Do not send a reply that lacks the binding id. */
        if (nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id)) {
                err = -EMSGSIZE;
                goto err_unbind;
        }
        genlmsg_end(rsp, hdr);

        err = genlmsg_reply(rsp, info);
        if (err) {
                /*
                 * The reply path consumes rsp even when it fails, so the
                 * common error exit must not free it a second time.
                 * nlmsg_free(NULL) is a no-op.
                 */
                rsp = NULL;
                goto err_unbind;
        }

        netdev_unlock(netdev);

        mutex_unlock(&priv->lock);

        return 0;

err_unbind:
        net_devmem_unbind_dmabuf(binding);
err_unlock:
        netdev_unlock(netdev);
err_unlock_sock:
        mutex_unlock(&priv->lock);
err_genlmsg_free:
        nlmsg_free(rsp);
        return err;
}

/*
 * Set up the per-socket private state: the mutex in priv->lock and an empty
 * priv->bindings list.
 */
void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
{
        mutex_init(&priv->lock);
        INIT_LIST_HEAD(&priv->bindings);
}

/*
 * Tear down the per-socket private state: unbind every dma-buf binding still
 * on priv->bindings. Each unbind runs with priv->lock held and with the
 * owning device's netdev lock taken around the call.
 */
void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
{
        struct net_devmem_dmabuf_binding *cur, *next;

        mutex_lock(&priv->lock);
        list_for_each_entry_safe(cur, next, &priv->bindings, list) {
                /*
                 * Read the device pointer before unbinding; only this saved
                 * copy is used for the unlock afterwards.
                 */
                struct net_device *dev = cur->dev;

                netdev_lock(dev);
                net_devmem_unbind_dmabuf(cur);
                netdev_unlock(dev);
        }
        mutex_unlock(&priv->lock);
}

/*
 * Netdevice notifier callback: maps register, unregister and XDP feature
 * change events to the matching netdev genl notification. Every other event
 * is ignored. Always returns NOTIFY_OK.
 */
static int netdev_genl_netdevice_event(struct notifier_block *nb,
                                       unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_REGISTER)
                netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
        else if (event == NETDEV_UNREGISTER)
                netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
        else if (event == NETDEV_XDP_FEAT_CHANGE)
                netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);

        return NOTIFY_OK;
}

/*
 * Notifier block whose callback is netdev_genl_netdevice_event(); it is
 * registered from netdev_genl_init().
 */
static struct notifier_block netdev_genl_nb = {
        .notifier_call        = netdev_genl_netdevice_event,
};

/*
 * Init-time setup: register the netdevice notifier, then the netdev generic
 * netlink family. If the family registration fails, the notifier is
 * unregistered again before the error is returned.
 */
static int __init netdev_genl_init(void)
{
        int ret;

        ret = register_netdevice_notifier(&netdev_genl_nb);
        if (ret)
                return ret;

        ret = genl_register_family(&netdev_nl_family);
        if (ret)
                unregister_netdevice_notifier(&netdev_genl_nb);

        return ret;
}

subsys_initcall(netdev_genl_init);

























































































































































































































































   22 































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Runtime locking correctness validator
 *
 *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * see Documentation/locking/lockdep-design.rst for more details.
 */
#ifndef __LINUX_LOCKDEP_H
#define __LINUX_LOCKDEP_H

#include <linux/lockdep_types.h>
#include <linux/smp.h>
#include <asm/percpu.h>

struct task_struct;

#ifdef CONFIG_LOCKDEP

#include <linux/linkage.h>
#include <linux/list.h>
#include <linux/debug_locks.h>
#include <linux/stacktrace.h>

/*
 * Copy @from into @to, then reset every class_cache slot of the copy to NULL.
 */
static inline void lockdep_copy_map(struct lockdep_map *to,
                                    struct lockdep_map *from)
{
        int slot = 0;

        *to = *from;
        /*
         * The class cache can be modified concurrently, so the struct copy
         * above may have captured half pointers (64bit arch using 32bit copy
         * insns). Clear the caches in the copy and take the performance hit.
         *
         * XXX it doesn't work well with lockdep_set_class_and_subclass(), since
         *     that relies on cache abuse.
         */
        while (slot < NR_LOCKDEP_CACHING_CLASSES)
                to->class_cache[slot++] = NULL;
}

/*
 * Every lock has a list of other locks that were taken after it.
 * We only grow the list, never remove from it.
 *
 * One struct lock_list is one element of such a list.
 */
struct lock_list {
        /* NOTE(review): presumably the linkage into the per-lock list above - confirm */
        struct list_head                entry;
        /* NOTE(review): class / links_to look like the two ends of the dependency - confirm which is which */
        struct lock_class                *class;
        struct lock_class                *links_to;
        /* NOTE(review): presumably the stack trace recorded for this dependency - confirm */
        const struct lock_trace                *trace;
        u16                                distance;
        /* bitmap of different dependencies from head to this */
        u8                                dep;
        /* used by BFS to record whether "prev -> this" only has -(*R)-> */
        u8                                only_xr;

        /*
         * The parent field is used to implement breadth-first search, and the
         * bit 0 is reused to indicate if the lock has been accessed in BFS.
         */
        struct lock_list                *parent;
};

/**
 * struct lock_chain - lock dependency chain record
 *
 * @irq_context: the same as irq_context in held_lock below
 * @depth:       the number of held locks in this chain
 * @base:        the index in chain_hlocks for this chain
 * @entry:       the collided lock chains in lock_chain hash list
 * @chain_key:   the hash key of this lock_chain
 *
 * The three bitfields (2 + 6 + 24 bits) together fill one 32-bit
 * unsigned int.
 */
struct lock_chain {
        /* see BUILD_BUG_ON()s in add_chain_cache() */
        unsigned int                        irq_context :  2,
                                        depth       :  6,
                                        base            : 24;
        /* 4 byte hole */
        struct hlist_node                entry;
        u64                                chain_key;
};

/*
 * Initialization, self-test and debugging-output methods:
 */
extern void lockdep_init(void);
extern void lockdep_reset(void);
extern void lockdep_reset_lock(struct lockdep_map *lock);
extern void lockdep_free_key_range(void *start, unsigned long size);
extern asmlinkage void lockdep_sys_exit(void);
extern void lockdep_set_selftest_task(struct task_struct *task);

extern void lockdep_init_task(struct task_struct *task);

/*
 * Split the recursion counter in two to readily detect 'off' vs recursion.
 *
 * The low LOCKDEP_RECURSION_BITS (16) bits are covered by
 * LOCKDEP_RECURSION_MASK; LOCKDEP_OFF is the first bit above them (1U << 16).
 */
#define LOCKDEP_RECURSION_BITS        16
#define LOCKDEP_OFF                (1U << LOCKDEP_RECURSION_BITS)
#define LOCKDEP_RECURSION_MASK        (LOCKDEP_OFF - 1)

/*
 * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due
 * to header dependencies.
 *
 * lockdep_off() adds LOCKDEP_OFF to current->lockdep_recursion and
 * lockdep_on() subtracts it again, so the two are used as a balanced pair.
 */

#define lockdep_off()                                        \
do {                                                        \
        current->lockdep_recursion += LOCKDEP_OFF;        \
} while (0)

#define lockdep_on()                                        \
do {                                                        \
        current->lockdep_recursion -= LOCKDEP_OFF;        \
} while (0)

extern void lockdep_register_key(struct lock_class_key *key);
extern void lockdep_unregister_key(struct lock_class_key *key);

/*
 * These methods are used by specific locking variants (spinlocks,
 * rwlocks, mutexes and rwsems) to pass init/acquire/release events
 * to lockdep:
 */

extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name,
        struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type);

/*
 * Initialize @lock through lockdep_init_map_type() with the lock type fixed
 * to LD_LOCK_NORMAL; every other argument is forwarded unchanged.
 */
static inline void
lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
                       struct lock_class_key *key, int subclass, u8 inner, u8 outer)
{
        lockdep_init_map_type(lock, name, key, subclass, inner, outer, LD_LOCK_NORMAL);
}

/*
 * Same as lockdep_init_map_waits() with the outer wait type fixed to
 * LD_WAIT_INV; only the inner wait type is chosen by the caller.
 */
static inline void
lockdep_init_map_wait(struct lockdep_map *lock, const char *name,
                      struct lock_class_key *key, int subclass, u8 inner)
{
        lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV);
}

/*
 * Same as lockdep_init_map_wait() with the inner wait type fixed to
 * LD_WAIT_INV as well.
 */
static inline void lockdep_init_map(struct lockdep_map *lock, const char *name,
                             struct lock_class_key *key, int subclass)
{
        lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV);
}

/*
 * Reinitialize a lock key - for cases where there is special locking or
 * special initialization of locks so that the validator gets the scope
 * of dependencies wrong: they are either too broad (they need a class-split)
 * or they are too narrow (they suffer from a false class-split):
 *
 * All four macros re-run lockdep_init_map_type() on (lock)->dep_map and pass
 * the map's current wait_type_inner, wait_type_outer and lock_type back in,
 * so only the name, key and/or subclass change.
 */

/* New key, subclass 0; the class name is the stringified @key expression. */
#define lockdep_set_class(lock, key)                                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* New key with an explicit @name, subclass 0. */
#define lockdep_set_class_and_name(lock, key, name)                \
        lockdep_init_map_type(&(lock)->dep_map, name, key, 0,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* New key and subclass @sub; the class name is the stringified @key. */
#define lockdep_set_class_and_subclass(lock, key, sub)                \
        lockdep_init_map_type(&(lock)->dep_map, #key, key, sub,        \
                              (lock)->dep_map.wait_type_inner,        \
                              (lock)->dep_map.wait_type_outer,        \
                              (lock)->dep_map.lock_type)

/* Keep the map's current name and key, change only the subclass to @sub. */
#define lockdep_set_subclass(lock, sub)                                        \
        lockdep_init_map_type(&(lock)->dep_map, (lock)->dep_map.name, (lock)->dep_map.key, sub,\
                              (lock)->dep_map.wait_type_inner,                \
                              (lock)->dep_map.wait_type_outer,                \
                              (lock)->dep_map.lock_type)

/**
 * lockdep_set_novalidate_class: disable checking of lock ordering on a given
 * lock
 * @lock: Lock to mark
 *
 * Lockdep will still record that this lock has been taken, and print held
 * instances when dumping locks
 *
 * Implemented by switching the lock to the __lockdep_no_validate__ key, with
 * the stringified @lock expression as class name.
 */
#define lockdep_set_novalidate_class(lock) \
        lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock)

/**
 * lockdep_set_notrack_class: disable lockdep tracking of a given lock entirely
 * @lock: Lock to mark
 *
 * Bigger hammer than lockdep_set_novalidate_class: so far just for bcachefs,
 * which takes more locks than lockdep is able to track (48).
 *
 * Implemented by switching the lock to the __lockdep_no_track__ key, with
 * the stringified @lock expression as class name.
 */
#define lockdep_set_notrack_class(lock) \
        lockdep_set_class_and_name(lock, &__lockdep_no_track__, #lock)

/*
 * Compare locking classes
 *
 * lockdep_match_class() takes the lock object and applies lockdep_match_key()
 * to its embedded dep_map.
 */
#define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key)

/* Returns non-zero iff @lock's key pointer is exactly @key. */
static inline int lockdep_match_key(struct lockdep_map *lock,
                                    struct lock_class_key *key)
{
        return lock->key == key;
}

/*
 * Acquire a lock.
 *
 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed
 *
 * Values for check:
 *
 *   0: simple checks (freeing, held-at-exit-time, etc.)
 *   1: full validation
 */
extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
                         int trylock, int read, int check,
                         struct lockdep_map *nest_lock, unsigned long ip);

extern void lock_release(struct lockdep_map *lock, unsigned long ip);

extern void lock_sync(struct lockdep_map *lock, unsigned int subclass,
                      int read, int check, struct lockdep_map *nest_lock,
                      unsigned long ip);

/* lock_is_held_type() returns */
#define LOCK_STATE_UNKNOWN        -1
#define LOCK_STATE_NOT_HELD        0
#define LOCK_STATE_HELD                1

/*
 * Same "read" as for lock_acquire(), except -1 means any.
 */
extern int lock_is_held_type(const struct lockdep_map *lock, int read);

/* Query @lock with read == -1, i.e. held in any mode. */
static inline int lock_is_held(const struct lockdep_map *lock)
{
        return lock_is_held_type(lock, -1);
}

/* Convenience forms that take the lock object and use its embedded dep_map. */
#define lockdep_is_held(lock)                lock_is_held(&(lock)->dep_map)
#define lockdep_is_held_type(lock, r)        lock_is_held_type(&(lock)->dep_map, (r))

extern void lock_set_class(struct lockdep_map *lock, const char *name,
                           struct lock_class_key *key, unsigned int subclass,
                           unsigned long ip);

/* lock_set_class() with the __lockdep_no_validate__ key and subclass 0. */
#define lock_set_novalidate_class(l, n, i) \
        lock_set_class(l, n, &__lockdep_no_validate__, 0, i)

/* lock_set_class() that keeps @lock's current name and key, new @subclass. */
static inline void lock_set_subclass(struct lockdep_map *lock,
                unsigned int subclass, unsigned long ip)
{
        lock_set_class(lock, lock->name, lock->key, subclass, ip);
}

extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);

#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }

extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);

/* Evaluates to 0 whenever debug_locks is clear. */
#define lockdep_depth(tsk)        (debug_locks ? (tsk)->lockdep_depth : 0)

/* Warn (every time / only once) if @cond is false while debug_locks is set. */
#define lockdep_assert(cond)                \
        do { WARN_ON(debug_locks && !(cond)); } while (0)

#define lockdep_assert_once(cond)        \
        do { WARN_ON_ONCE(debug_locks && !(cond)); } while (0)

/*
 * The held / not-held assertions compare against one definite state, so a
 * LOCK_STATE_UNKNOWN (-1) result satisfies both of them.
 */
#define lockdep_assert_held(l)                \
        lockdep_assert(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)

#define lockdep_assert_not_held(l)        \
        lockdep_assert(lockdep_is_held(l) != LOCK_STATE_HELD)

/* read == 0 (exclusive) and read == 1 queries; any non-zero result passes. */
#define lockdep_assert_held_write(l)        \
        lockdep_assert(lockdep_is_held_type(l, 0))

#define lockdep_assert_held_read(l)        \
        lockdep_assert(lockdep_is_held_type(l, 1))

#define lockdep_assert_held_once(l)                \
        lockdep_assert_once(lockdep_is_held(l) != LOCK_STATE_NOT_HELD)

/* Warn once if the current task's lockdep_depth is non-zero. */
#define lockdep_assert_none_held_once()                \
        lockdep_assert_once(!current->lockdep_depth)

/* Raw value of the task's lockdep_recursion counter. */
#define lockdep_recursing(tsk)        ((tsk)->lockdep_recursion)

/* Pin helpers that take the lock object and use its embedded dep_map. */
#define lockdep_pin_lock(l)        lock_pin_lock(&(l)->dep_map)
#define lockdep_repin_lock(l,c)        lock_repin_lock(&(l)->dep_map, (c))
#define lockdep_unpin_lock(l,c)        lock_unpin_lock(&(l)->dep_map, (c))

/*
 * Must use lock_map_acquire_try() with override maps to avoid
 * lockdep thinking they participate in the block chain.
 *
 * Defines a struct lockdep_map called @_name with lock type
 * LD_LOCK_WAIT_OVERRIDE, inner wait type @_wait_type and the name string
 * "<_name>-wait-type-override".
 */
#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)        \
        struct lockdep_map _name = {                        \
                .name = #_name "-wait-type-override",        \
                .wait_type_inner = _wait_type,                \
                .lock_type = LD_LOCK_WAIT_OVERRIDE, }

#else /* !CONFIG_LOCKDEP */

/*
 * !CONFIG_LOCKDEP: the following helpers are empty inline functions so that
 * callers compile unchanged.
 */
static inline void lockdep_init_task(struct task_struct *task)
{
}

static inline void lockdep_off(void)
{
}

static inline void lockdep_on(void)
{
}

static inline void lockdep_set_selftest_task(struct task_struct *task)
{
}

/*
 * !CONFIG_LOCKDEP: the annotation entry points expand to empty statements.
 * Some stubs still evaluate their name/key arguments as (void) expressions;
 * the remaining arguments are dropped without being evaluated.
 */
# define lock_acquire(l, s, t, r, c, n, i)        do { } while (0)
# define lock_release(l, i)                        do { } while (0)
# define lock_downgrade(l, i)                        do { } while (0)
# define lock_set_class(l, n, key, s, i)        do { (void)(key); } while (0)
# define lock_set_novalidate_class(l, n, i)        do { } while (0)
# define lock_set_subclass(l, s, i)                do { } while (0)
# define lockdep_init()                                do { } while (0)
# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map_wait(lock, name, key, sub, inner) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
# define lockdep_set_class(lock, key)                do { (void)(key); } while (0)
# define lockdep_set_class_and_name(lock, key, name) \
                do { (void)(key); (void)(name); } while (0)
#define lockdep_set_class_and_subclass(lock, key, sub) \
                do { (void)(key); } while (0)
#define lockdep_set_subclass(lock, sub)                do { } while (0)

#define lockdep_set_novalidate_class(lock) do { } while (0)
#define lockdep_set_notrack_class(lock) do { } while (0)

/*
 * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP
 * case since the result is not well defined and the caller should rather
 * #ifdef the call himself.
 */

/* Without lockdep, lockdep_reset() only sets debug_locks back to 1. */
# define lockdep_reset()                do { debug_locks = 1; } while (0)
# define lockdep_free_key_range(start, size)        do { } while (0)
# define lockdep_sys_exit()                         do { } while (0)

/* Key registration does nothing without lockdep. */
static inline void lockdep_register_key(struct lock_class_key *key)
{
}

static inline void lockdep_unregister_key(struct lock_class_key *key)
{
}

/* Without lockdep the reported depth is always 0. */
#define lockdep_depth(tsk)        (0)

/*
 * Dummy forward declarations, allow users to write less ifdef-y code
 * and depend on dead code elimination.
 */
extern int lock_is_held(const void *);
extern int lockdep_is_held(const void *);
/* Always evaluates to 1 in this configuration. */
#define lockdep_is_held_type(l, r)                (1)

/* The assertions compile away; the lock-taking forms still reference @l. */
#define lockdep_assert(c)                        do { } while (0)
#define lockdep_assert_once(c)                        do { } while (0)

#define lockdep_assert_held(l)                        do { (void)(l); } while (0)
#define lockdep_assert_not_held(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_write(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_read(l)                do { (void)(l); } while (0)
#define lockdep_assert_held_once(l)                do { (void)(l); } while (0)
#define lockdep_assert_none_held_once()        do { } while (0)

#define lockdep_recursing(tsk)                        (0)

/* Pinning yields an empty cookie; repin/unpin only reference their arguments. */
#define NIL_COOKIE (struct pin_cookie){ }

#define lockdep_pin_lock(l)                        ({ struct pin_cookie cookie = { }; cookie; })
#define lockdep_repin_lock(l, c)                do { (void)(l); (void)(c); } while (0)
#define lockdep_unpin_lock(l, c)                do { (void)(l); (void)(c); } while (0)

/* Defines an empty map that may go unused (__maybe_unused). */
#define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type)        \
        struct lockdep_map __maybe_unused _name = {}

#endif /* !LOCKDEP */

/*
 * lock_set_cmp_fn() forwards the lock's embedded dep_map plus the remaining
 * arguments to lockdep_set_lock_cmp_fn() under CONFIG_PROVE_LOCKING and is a
 * no-op otherwise.
 */
#ifdef CONFIG_PROVE_LOCKING
void lockdep_set_lock_cmp_fn(struct lockdep_map *, lock_cmp_fn, lock_print_fn);

#define lock_set_cmp_fn(lock, ...)        lockdep_set_lock_cmp_fn(&(lock)->dep_map, __VA_ARGS__)
#else
#define lock_set_cmp_fn(lock, ...)        do { } while (0)
#endif

/*
 * NOTE(review): presumably hard-irq vs soft-irq context identifiers - confirm.
 * XHLOCK_CTX_NR follows the last real enumerator, so its value (2) equals
 * the number of contexts.
 */
enum xhlock_context_t {
        XHLOCK_HARD,
        XHLOCK_SOFT,
        XHLOCK_CTX_NR,
};

/*
 * To initialize a lockdep_map statically use this macro.
 * Note that _name must not be NULL.
 *
 * Only .name and .key are set; @_key is cast to void *.
 */
#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
        { .name = (_name), .key = (void *)(_key), }

/* Both helpers are empty here regardless of configuration. */
static inline void lockdep_invariant_state(bool force) {}
static inline void lockdep_free_task(struct task_struct *task) {}

#ifdef CONFIG_LOCK_STAT

extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
extern void lock_acquired(struct lockdep_map *lock, unsigned long ip);

/*
 * Try the fast path @try first. If it fails, report contention and fall back
 * to the blocking @lock. lock_acquired() is reported on both paths.
 */
#define LOCK_CONTENDED(_lock, try, lock)                        \
do {                                                                \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                lock(_lock);                                        \
        }                                                        \
        lock_acquired(&(_lock)->dep_map, _RET_IP_);                        \
} while (0)

/*
 * As LOCK_CONTENDED(), for a @lock that returns an error code: the macro
 * evaluates to that code (0 when @try succeeded), and lock_acquired() is
 * reported only when the code is 0.
 */
#define LOCK_CONTENDED_RETURN(_lock, try, lock)                        \
({                                                                \
        int ____err = 0;                                        \
        if (!try(_lock)) {                                        \
                lock_contended(&(_lock)->dep_map, _RET_IP_);        \
                ____err = lock(_lock);                                \
        }                                                        \
        if (!____err)                                                \
                lock_acquired(&(_lock)->dep_map, _RET_IP_);        \
        ____err;                                                \
})

#else /* CONFIG_LOCK_STAT */

#define lock_contended(lockdep_map, ip) do {} while (0)
#define lock_acquired(lockdep_map, ip) do {} while (0)

/* Without lock statistics @try is never used; @lock is called directly. */
#define LOCK_CONTENDED(_lock, try, lock) \
        lock(_lock)

#define LOCK_CONTENDED_RETURN(_lock, try, lock) \
        lock(_lock)

#endif /* CONFIG_LOCK_STAT */

#ifdef CONFIG_PROVE_LOCKING
extern void print_irqtrace_events(struct task_struct *curr);
#else
static inline void print_irqtrace_events(struct task_struct *curr)
{
}
#endif

/*
 * Variable used to make lockdep treat read_lock() as recursive in selftests.
 * When CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set there is no variable and
 * the name expands to the constant 0.
 */
#ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS
extern unsigned int force_read_lock_recursive;
#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */
#define force_read_lock_recursive 0
#endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */

/*
 * read_lock_is_recursive() is consulted by rwlock_acquire_read() to choose
 * between the shared and shared-recursive acquire annotations.  With
 * CONFIG_LOCKDEP it is a real function; otherwise it is the constant 0.
 */
#ifdef CONFIG_LOCKDEP
extern bool read_lock_is_recursive(void);
#else /* CONFIG_LOCKDEP */
/* If !LOCKDEP, the value is meaningless */
#define read_lock_is_recursive() 0
#endif

/*
 * For trivial one-depth nesting of a lock-class, the following
 * global define (value 1) can be used.  Subsystems with multiple
 * levels of nesting should define their own lock-nesting subclasses.
 */
#define SINGLE_DEPTH_NESTING                        1

/*
 * Map the dependency ops to NOP or to real lockdep ops, depending
 * on the per lock-class debug mode:
 */

/*
 * Thin wrappers around lock_acquire() that differ only in the fourth
 * argument: 0 for _exclusive, 1 for _shared, 2 for _shared_recursive.  The
 * fifth argument is always 1; l, s, t, n and i are forwarded unchanged.
 * NOTE(review): presumably l = lockdep map, s = subclass, t = trylock flag,
 * n = nest lock, i = caller ip -- confirm against lock_acquire()'s prototype.
 */
#define lock_acquire_exclusive(l, s, t, n, i)                lock_acquire(l, s, t, 0, 1, n, i)
#define lock_acquire_shared(l, s, t, n, i)                lock_acquire(l, s, t, 1, 1, n, i)
#define lock_acquire_shared_recursive(l, s, t, n, i)        lock_acquire(l, s, t, 2, 1, n, i)

/*
 * Spinlock annotations: always an exclusive acquire.  spin_acquire() passes
 * NULL for the nest argument, spin_acquire_nest() forwards @n.
 */
#define spin_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define spin_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define spin_release(l, i)                        lock_release(l, i)

/*
 * rwlock annotations.  rwlock_acquire() is an exclusive acquire.
 * rwlock_acquire_read() picks the shared-recursive annotation when
 * read_lock_is_recursive() is true and the plain shared one otherwise.
 */
#define rwlock_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define rwlock_acquire_read(l, s, t, i)                                        \
do {                                                                        \
        if (read_lock_is_recursive())                                        \
                lock_acquire_shared_recursive(l, s, t, NULL, i);        \
        else                                                                \
                lock_acquire_shared(l, s, t, NULL, i);                        \
} while (0)

#define rwlock_release(l, i)                        lock_release(l, i)

/*
 * seqcount annotations: seqcount_acquire() is an exclusive acquire,
 * seqcount_acquire_read() a shared-recursive one.
 */
#define seqcount_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define seqcount_acquire_read(l, s, t, i)        lock_acquire_shared_recursive(l, s, t, NULL, i)
#define seqcount_release(l, i)                        lock_release(l, i)

/*
 * Mutex annotations: always an exclusive acquire; the _nest variant forwards
 * @n instead of passing NULL.
 */
#define mutex_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define mutex_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define mutex_release(l, i)                        lock_release(l, i)

/*
 * rwsem annotations: rwsem_acquire() and rwsem_acquire_nest() are exclusive
 * acquires; rwsem_acquire_read() is a (non-recursive) shared acquire.
 */
#define rwsem_acquire(l, s, t, i)                lock_acquire_exclusive(l, s, t, NULL, i)
#define rwsem_acquire_nest(l, s, t, n, i)        lock_acquire_exclusive(l, s, t, n, i)
#define rwsem_acquire_read(l, s, t, i)                lock_acquire_shared(l, s, t, NULL, i)
#define rwsem_release(l, i)                        lock_release(l, i)

/*
 * Annotations that operate directly on a lockdep map @l, using _THIS_IP_ as
 * the ip argument and 0 as the second (s) argument.  The _try/_tryread
 * variants pass 1 instead of 0 as the third (t) argument; the read variants
 * use the shared-recursive acquire.  lock_map_sync() forwards to lock_sync().
 */
#define lock_map_acquire(l)                        lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_try(l)                        lock_acquire_exclusive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_acquire_read(l)                lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_)
#define lock_map_acquire_tryread(l)                lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_)
#define lock_map_release(l)                        lock_release(l, _THIS_IP_)
#define lock_map_sync(l)                        lock_sync(l, 0, 0, 1, NULL, _THIS_IP_)

#ifdef CONFIG_PROVE_LOCKING
/*
 * might_lock - tell lockdep that @lock may be taken at this point.
 * @lock: object whose ->dep_map is a struct lockdep_map (checked by
 *        typecheck()).
 *
 * Issues a lock_acquire() immediately followed by a lock_release() on the
 * map; the lock itself is never touched.
 */
# define might_lock(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
/*
 * might_lock_read - as might_lock(), but passes 1 as the fourth argument of
 * lock_acquire() (the value lock_acquire_shared() uses).
 */
# define might_lock_read(lock)                                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_);        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)
/*
 * might_lock_nested - as might_lock(), with @subclass passed as the second
 * argument of lock_acquire().
 * NOTE(review): the fourth argument is 1, matching might_lock_read() rather
 * than might_lock() (which passes 0) -- confirm this is intended.
 */
# define might_lock_nested(lock, subclass)                                \
do {                                                                        \
        typecheck(struct lockdep_map *, &(lock)->dep_map);                \
        lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL,                \
                     _THIS_IP_);                                        \
        lock_release(&(lock)->dep_map, _THIS_IP_);                        \
} while (0)

/* Per-CPU state read by the lockdep_assert_*() macros below. */
DECLARE_PER_CPU(int, hardirqs_enabled);
DECLARE_PER_CPU(int, hardirq_context);
DECLARE_PER_CPU(unsigned int, lockdep_recursion);

/*
 * True when debug_locks is set and this CPU's lockdep_recursion counter
 * reads zero; every assertion below is gated on it.
 */
#define __lockdep_enabled        (debug_locks && !this_cpu_read(lockdep_recursion))

/* Warn once if this CPU's hardirqs_enabled flag is clear. */
#define lockdep_assert_irqs_enabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirqs_enabled flag is set. */
#define lockdep_assert_irqs_disabled()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \
} while (0)

/* Warn once if this CPU's hardirq_context value is zero. */
#define lockdep_assert_in_irq()                                                \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \
} while (0)

/*
 * Warn once if hardirq_context is non-zero on this CPU, or if its
 * hardirqs_enabled flag is clear.
 */
#define lockdep_assert_no_hardirq()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled && (this_cpu_read(hardirq_context) || \
                                           !this_cpu_read(hardirqs_enabled))); \
} while (0)

/*
 * Warn once if preempt_count() is non-zero or this CPU's hardirqs_enabled
 * flag is clear.  Only checked when CONFIG_PREEMPT_COUNT is enabled.
 */
#define lockdep_assert_preemption_enabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() != 0                ||                \
                      !this_cpu_read(hardirqs_enabled)));                \
} while (0)

/*
 * Warn once if preempt_count() is zero while this CPU's hardirqs_enabled
 * flag is set.  Only checked when CONFIG_PREEMPT_COUNT is enabled.
 */
#define lockdep_assert_preemption_disabled()                                \
do {                                                                        \
        WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)        &&                \
                     __lockdep_enabled                        &&                \
                     (preempt_count() == 0                &&                \
                      this_cpu_read(hardirqs_enabled)));                \
} while (0)

/*
 * Acceptable for protecting per-CPU resources accessed from BH.
 * Much like in_softirq() - semantics are ambiguous, use carefully.
 *
 * Warns once unless in_softirq() is true and neither in_irq() nor in_nmi()
 * is.
 */
#define lockdep_assert_in_softirq()                                        \
do {                                                                        \
        WARN_ON_ONCE(__lockdep_enabled                        &&                \
                     (!in_softirq() || in_irq() || in_nmi()));                \
} while (0)

extern void lockdep_assert_in_softirq_func(void);

#else
/*
 * !CONFIG_PROVE_LOCKING: every might_lock*() and lockdep_assert_*() helper
 * becomes an empty statement; the arguments are not evaluated.
 */
# define might_lock(lock) do { } while (0)
# define might_lock_read(lock) do { } while (0)
# define might_lock_nested(lock, subclass) do { } while (0)

# define lockdep_assert_irqs_enabled() do { } while (0)
# define lockdep_assert_irqs_disabled() do { } while (0)
# define lockdep_assert_in_irq() do { } while (0)
# define lockdep_assert_no_hardirq() do { } while (0)

# define lockdep_assert_preemption_enabled() do { } while (0)
# define lockdep_assert_preemption_disabled() do { } while (0)
# define lockdep_assert_in_softirq() do { } while (0)
# define lockdep_assert_in_softirq_func() do { } while (0)
#endif

#ifdef CONFIG_PROVE_RAW_LOCK_NESTING

/*
 * Warn once, with the message below, when lockdep_hardirq_context() is true
 * but current has neither ->hardirq_threaded nor ->irq_config set.  The check
 * is skipped when debug_locks is clear or current->lockdep_recursion is
 * non-zero.
 */
# define lockdep_assert_RT_in_threaded_ctx() do {                        \
                WARN_ONCE(debug_locks && !current->lockdep_recursion &&        \
                          lockdep_hardirq_context() &&                        \
                          !(current->hardirq_threaded || current->irq_config),        \
                          "Not in threaded context on PREEMPT_RT as expected\n");        \
} while (0)

#else

/* Without CONFIG_PROVE_RAW_LOCK_NESTING the assertion is an empty statement. */
# define lockdep_assert_RT_in_threaded_ctx() do { } while (0)

#endif

#ifdef CONFIG_LOCKDEP
void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
#else
/* !CONFIG_LOCKDEP variant: reports nothing; all three arguments are ignored. */
static inline void lockdep_rcu_suspicious(const char *file, const int line,
                                          const char *s)
{
}
#endif

#endif /* __LINUX_LOCKDEP_H */






























   58 







































































































































































































































































   57 
    1 
    1 



   58 



























   58 


















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* I/O iterator iteration building functions.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#ifndef _LINUX_IOV_ITER_H
#define _LINUX_IOV_ITER_H

#include <linux/uio.h>
#include <linux/bvec.h>
#include <linux/folio_queue.h>

/*
 * Step-function types used by the iterate_*() helpers below.  A step function
 * receives the address of one segment (@iter_base), the amount already
 * processed before this segment (@progress), the segment length (@len) and
 * the two caller-supplied cookies; it returns the number of bytes of the
 * segment it did NOT process (0 means the whole segment was handled).
 * iov_step_f is given a kernel address, iov_ustep_f a __user address.
 */
typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len,
                             void *priv, void *priv2);
typedef size_t (*iov_ustep_f)(void __user *iter_base, size_t progress, size_t len,
                              void *priv, void *priv2);

/*
 * Handle ITER_UBUF: the request is served from a single __user buffer, so
 * one call to @step covers all of @len.
 */
static __always_inline
size_t iterate_ubuf(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_ustep_f step)
{
        void __user *addr = iter->ubuf + iter->iov_offset;
        size_t left = step(addr, 0, len, priv, priv2);
        size_t done = len - left;

        /* Advance the iterator by what the step function actually handled. */
        iter->iov_offset += done;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_IOVEC: walk the array of user-space segments, handing each
 * non-empty piece to @step until @len is used up or a step comes back short.
 */
static __always_inline
size_t iterate_iovec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                     iov_ustep_f step)
{
        const struct iovec *seg = iter->__iov;
        size_t offset = iter->iov_offset;
        size_t done = 0;

        for (;;) {
                size_t chunk = min(len, seg->iov_len - offset);

                if (likely(chunk)) {
                        size_t left = step(seg->iov_base + offset, done, chunk,
                                           priv, priv2);
                        size_t used = chunk - left;

                        done += used;
                        offset += used;
                        len -= used;
                        /* Still inside this segment: stop and remember where. */
                        if (offset < seg->iov_len)
                                break;
                }
                /* Segment fully consumed (or empty): move on to the next. */
                seg++;
                offset = 0;
                if (!len)
                        break;
        }

        /* Publish the new position back into the iterator. */
        iter->nr_segs -= seg - iter->__iov;
        iter->__iov = seg;
        iter->iov_offset = offset;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_KVEC: same walk as for ITER_IOVEC, but over kernel-address
 * segments and with the kernel step function.
 */
static __always_inline
size_t iterate_kvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct kvec *seg = iter->kvec;
        size_t offset = iter->iov_offset;
        size_t done = 0;

        for (;;) {
                size_t chunk = min(len, seg->iov_len - offset);

                if (likely(chunk)) {
                        size_t left = step(seg->iov_base + offset, done, chunk,
                                           priv, priv2);
                        size_t used = chunk - left;

                        done += used;
                        offset += used;
                        len -= used;
                        /* Still inside this segment: stop and remember where. */
                        if (offset < seg->iov_len)
                                break;
                }
                /* Segment fully consumed (or empty): move on to the next. */
                seg++;
                offset = 0;
                if (!len)
                        break;
        }

        /* Publish the new position back into the iterator. */
        iter->nr_segs -= seg - iter->kvec;
        iter->kvec = seg;
        iter->iov_offset = offset;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_BVEC: each bio_vec is processed one page at a time; the page
 * is mapped with kmap_local_page() only for the duration of the step call.
 */
static __always_inline
size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                    iov_step_f step)
{
        const struct bio_vec *bv = iter->bvec;
        size_t done = 0, within = iter->iov_offset;

        do {
                size_t pos = bv->bv_offset + within;
                size_t chunk, left, used;
                void *map;

                map = kmap_local_page(bv->bv_page + pos / PAGE_SIZE);
                /* Bounded by the request, the bio_vec and the current page. */
                chunk = min3(len,
                             (size_t)(bv->bv_len - within),
                             (size_t)(PAGE_SIZE - pos % PAGE_SIZE));
                left = step(map + pos % PAGE_SIZE, done, chunk, priv, priv2);
                kunmap_local(map);

                used = chunk - left;
                len -= used;
                done += used;
                within += used;
                if (within >= bv->bv_len) {
                        /* This bio_vec is exhausted; start the next at 0. */
                        within = 0;
                        bv++;
                }
                /* A short step ends the walk. */
                if (left)
                        break;
        } while (len);

        /* Publish the new position back into the iterator. */
        iter->nr_segs -= bv - iter->bvec;
        iter->bvec = bv;
        iter->iov_offset = within;
        iter->count -= done;
        return done;
}

/*
 * Handle ITER_FOLIOQ.
 *
 * Walk the folios held in the folio_queue chain starting at the iterator's
 * current queue segment and slot.  Each folio is processed a page at a time:
 * the page containing the current offset is mapped with kmap_local_folio(),
 * handed to @step, and unmapped again.  The walk ends when @len is used up,
 * when @step reports unprocessed bytes, or when an empty slot is found.
 *
 * Returns the number of bytes processed; the iterator's folioq, folioq_slot,
 * iov_offset and count are updated to match.
 */
static __always_inline
size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        const struct folio_queue *folioq = iter->folioq;
        unsigned int slot = iter->folioq_slot;
        size_t progress = 0, skip = iter->iov_offset;

        if (slot == folioq_nr_slots(folioq)) {
                /* The iterator may have been extended. */
                folioq = folioq->next;
                slot = 0;
        }

        do {
                struct folio *folio = folioq_folio(folioq, slot);
                size_t part, remain = 0, consumed;
                size_t fsize;
                void *base;

                if (!folio)
                        break;

                fsize = folioq_folio_size(folioq, slot);
                /*
                 * Only map and step when the offset really lies inside this
                 * folio.  If the starting offset already sits at (or beyond)
                 * the end of the folio, mapping it would expose memory past
                 * the folio to @step; just move on to the next slot instead.
                 */
                if (skip < fsize) {
                        base = kmap_local_folio(folio, skip);
                        /* Never cross a page boundary within one mapping. */
                        part = umin(len, PAGE_SIZE - skip % PAGE_SIZE);
                        remain = step(base, progress, part, priv, priv2);
                        kunmap_local(base);
                        consumed = part - remain;
                        len -= consumed;
                        progress += consumed;
                        skip += consumed;
                }
                if (skip >= fsize) {
                        /* Folio exhausted: advance to the next slot/segment. */
                        skip = 0;
                        slot++;
                        if (slot == folioq_nr_slots(folioq) && folioq->next) {
                                folioq = folioq->next;
                                slot = 0;
                        }
                }
                /* A short step ends the walk. */
                if (remain)
                        break;
        } while (len);

        iter->folioq_slot = slot;
        iter->folioq = folioq;
        iter->iov_offset = skip;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_XARRAY.
 *
 * Walk the folios stored in iter->xarray, starting at the page index that
 * corresponds to xarray_start + iov_offset, under rcu_read_lock().  Each
 * folio is processed a page at a time via kmap_local_folio().  The walk ends
 * when @len is used up, when @step reports unprocessed bytes, or when an
 * unexpected entry (value entry or hugetlb folio) is met.
 *
 * Returns the number of bytes processed; iov_offset and count are advanced
 * by that amount.
 */
static __always_inline
size_t iterate_xarray(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        struct folio *folio;
        size_t progress = 0;
        /* Absolute byte position of the first byte to process. */
        loff_t start = iter->xarray_start + iter->iov_offset;
        pgoff_t index = start / PAGE_SIZE;
        XA_STATE(xas, iter->xarray, index);

        rcu_read_lock();
        xas_for_each(&xas, folio, ULONG_MAX) {
                size_t remain, consumed, offset, part, flen;

                /* Skip entries that xas_retry() asks us to retry. */
                if (xas_retry(&xas, folio))
                        continue;
                /* Value entries and hugetlb folios are not expected here. */
                if (WARN_ON(xa_is_value(folio)))
                        break;
                if (WARN_ON(folio_test_hugetlb(folio)))
                        break;

                /* Portion of this folio that falls inside the request. */
                offset = offset_in_folio(folio, start + progress);
                flen = min(folio_size(folio) - offset, len);

                while (flen) {
                        void *base = kmap_local_folio(folio, offset);

                        /* Never cross a page boundary within one mapping. */
                        part = min_t(size_t, flen,
                                     PAGE_SIZE - offset_in_page(offset));
                        remain = step(base, progress, part, priv, priv2);
                        kunmap_local(base);

                        consumed = part - remain;
                        progress += consumed;
                        len -= consumed;

                        /* Stop on a short step or once the request is done. */
                        if (remain || len == 0)
                                goto out;
                        flen -= consumed;
                        offset += consumed;
                }
        }

out:
        rcu_read_unlock();
        iter->iov_offset += progress;
        iter->count -= progress;
        return progress;
}

/*
 * Handle ITER_DISCARD: nothing is handed to @step; the whole of @len is
 * simply accounted as processed.
 */
static __always_inline
size_t iterate_discard(struct iov_iter *iter, size_t len, void *priv, void *priv2,
                      iov_step_f step)
{
        iter->count -= len;
        return len;
}

/**
 * iterate_and_advance2 - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @priv2: More data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * Iterate over the next part of an iterator, up to the specified length
 * (clamped to the amount the iterator still holds).  The buffer is presented
 * in segments, which for kernel iteration are broken up by physical pages and
 * mapped, with the mapped address being presented.
 *
 * Two step functions must be provided: @step handles mapped kernel addresses,
 * and @ustep is given user addresses, which have the potential to fault since
 * no pinning is performed.
 *
 * The step functions are passed the address and length of the segment, @priv,
 * @priv2 and the amount of data so far iterated over (which can, for example,
 * be added to @priv to point to the right part of a second buffer).  The step
 * functions should return the amount of the segment they didn't process (i.e.
 * 0 indicates complete processing).
 *
 * This function returns the amount of data processed (i.e. 0 means nothing was
 * processed and the value of @len means processed to completion).
 */
static __always_inline
size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv,
                            void *priv2, iov_ustep_f ustep, iov_step_f step)
{
        /* Never iterate past what the iterator still holds. */
        if (unlikely(len > iter->count))
                len = iter->count;
        if (unlikely(len == 0))
                return 0;

        /* User-memory iterators take @ustep ... */
        if (likely(iter_is_ubuf(iter)))
                return iterate_ubuf(iter, len, priv, priv2, ustep);
        if (likely(iter_is_iovec(iter)))
                return iterate_iovec(iter, len, priv, priv2, ustep);

        /* ... every other type takes the kernel-address @step. */
        if (iov_iter_is_bvec(iter))
                return iterate_bvec(iter, len, priv, priv2, step);
        if (iov_iter_is_kvec(iter))
                return iterate_kvec(iter, len, priv, priv2, step);
        if (iov_iter_is_folioq(iter))
                return iterate_folioq(iter, len, priv, priv2, step);
        if (iov_iter_is_xarray(iter))
                return iterate_xarray(iter, len, priv, priv2, step);

        /* Anything left is handled as a discard iterator. */
        return iterate_discard(iter, len, priv, priv2, step);
}

/**
 * iterate_and_advance - Iterate over an iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step functions.
 * @ustep: Function for UBUF/IOVEC iterators; given __user addresses.
 * @step: Function for other iterators; given kernel addresses.
 *
 * Convenience form of iterate_and_advance2() for callers that need only one
 * private cookie: the step functions always see NULL as their second one.
 */
static __always_inline
size_t iterate_and_advance(struct iov_iter *iter, size_t len, void *priv,
                           iov_ustep_f ustep, iov_step_f step)
{
        void *priv2 = NULL;

        return iterate_and_advance2(iter, len, priv, priv2, ustep, step);
}

/**
 * iterate_and_advance_kernel - Iterate over a kernel-internal iterator
 * @iter: The iterator to iterate over.
 * @len: The amount to iterate over.
 * @priv: Data for the step function.
 * @priv2: More data for the step function.
 * @step: Step function; given kernel addresses.
 *
 * Iterate over the next part of an iterator, up to the specified length
 * (clamped to the amount the iterator still holds).  The buffer is presented
 * in segments, which are broken up by physical pages and mapped, with the
 * mapped address being presented.
 *
 * [!] Note This will only handle BVEC, KVEC, FOLIOQ, XARRAY and DISCARD-type
 * iterators; it will not handle UBUF or IOVEC-type iterators.
 *
 * A single step function, @step, must be provided; it handles mapped kernel
 * addresses.
 *
 * The step function is passed the address and length of the segment, @priv,
 * @priv2 and the amount of data so far iterated over (which can, for example,
 * be added to @priv to point to the right part of a second buffer).  The step
 * function should return the amount of the segment it didn't process (i.e. 0
 * indicates complete processing).
 *
 * This function returns the amount of data processed (i.e. 0 means nothing was
 * processed and the value of @len means processed to completion).
 */
static __always_inline
size_t iterate_and_advance_kernel(struct iov_iter *iter, size_t len, void *priv,
                                  void *priv2, iov_step_f step)
{
        /* Never iterate past what the iterator still holds. */
        if (unlikely(len > iter->count))
                len = iter->count;
        if (unlikely(len == 0))
                return 0;

        if (iov_iter_is_bvec(iter))
                return iterate_bvec(iter, len, priv, priv2, step);
        if (iov_iter_is_kvec(iter))
                return iterate_kvec(iter, len, priv, priv2, step);
        if (iov_iter_is_folioq(iter))
                return iterate_folioq(iter, len, priv, priv2, step);
        if (iov_iter_is_xarray(iter))
                return iterate_xarray(iter, len, priv, priv2, step);

        /* Anything left is handled as a discard iterator. */
        return iterate_discard(iter, len, priv, priv2, step);
}

#endif /* _LINUX_IOV_ITER_H */













































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 





    3 



























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * drivers/net/macsec.c - MACsec device
 *
 * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net>
 */

#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/module.h>
#include <crypto/aead.h>
#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/refcount.h>
#include <net/genetlink.h>
#include <net/sock.h>
#include <net/gro_cells.h>
#include <net/macsec.h>
#include <net/dst_metadata.h>
#include <net/netdev_lock.h>
#include <linux/phy.h>
#include <linux/byteorder/generic.h>
#include <linux/if_arp.h>

#include <uapi/linux/if_macsec.h>

/* SecTAG length, not counting the EtherType or the optional SCI:
 * tci_an (1) + short_length/unused (1) + packet_number (4)
 */
#define MACSEC_TAG_LEN 6

/* Layout of the start of a MACsec frame: the Ethernet header immediately
 * followed by the SecTAG fields. The struct is __packed, so no padding is
 * inserted between members.
 */
struct macsec_eth_header {
        struct ethhdr eth;
        /* SecTAG */
        u8  tci_an; /* TCI flag bits OR'ed with the association number */
#if defined(__LITTLE_ENDIAN_BITFIELD)
        u8  short_length:6, /* left at 0 unless macsec_set_shortlen() sets it */
                  unused:2; /* macsec_validate_skb() rejects non-zero values */
#elif defined(__BIG_ENDIAN_BITFIELD)
        u8        unused:2, /* macsec_validate_skb() rejects non-zero values */
            short_length:6; /* left at 0 unless macsec_set_shortlen() sets it */
#else
#error        "Please fix <asm/byteorder.h>"
#endif
        __be32 packet_number; /* big-endian, written with htonl() */
        u8 secure_channel_id[8]; /* optional */
} __packed;

/* minimum secure data length deemed "not short", see IEEE 802.1AE-2006 9.7 */
#define MIN_NON_SHORT_LEN 48

#define GCM_AES_IV_LEN 12

/* Walk the singly linked list of receive secure channels headed by
 * @secy->rx_sc and chained through ->next.
 *
 * for_each_rxsc() loads each link with rcu_dereference_bh();
 * for_each_rxsc_rtnl() loads each link with rtnl_dereference().
 *
 * @secy: pointer expression whose rx_sc member heads the list
 * @sc:   cursor lvalue; it is NULL when the loop runs to completion
 *
 * Both arguments are parenthesised in the expansion so that expression
 * arguments (e.g. "base + 1") bind with the intended precedence.
 */
#define for_each_rxsc(secy, sc)                                 \
        for ((sc) = rcu_dereference_bh((secy)->rx_sc);          \
             (sc);                                              \
             (sc) = rcu_dereference_bh((sc)->next))
#define for_each_rxsc_rtnl(secy, sc)                            \
        for ((sc) = rtnl_dereference((secy)->rx_sc);            \
             (sc);                                              \
             (sc) = rtnl_dereference((sc)->next))

/* Non-zero when @pn1 and @pn2 are equal after shifting out their low 31 bits;
 * for 32-bit packet numbers that means they share the same top bit.
 */
#define pn_same_half(pn1, pn2) ((((pn1) >> 31) == ((pn2) >> 31)))

/* IV layout written by macsec_fill_iv_xpn(): a 4-byte SSCI followed by a
 * 64-bit big-endian packet number. __packed keeps the two members adjacent.
 */
struct gcm_iv_xpn {
        union {
                u8 short_secure_channel_id[4];
                ssci_t ssci;
        };
        __be64 pn;
} __packed;

/* IV layout written by macsec_fill_iv(): the 8-byte SCI followed by a
 * 32-bit big-endian packet number.
 */
struct gcm_iv {
        union {
                u8 secure_channel_id[8];
                sci_t sci;
        };
        __be32 pn;
};

/* Default frame validation mode: strict.
 * NOTE(review): presumably applied when a SecY is first configured; the user
 * of this macro is outside this part of the file - confirm.
 */
#define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT

/* Per-CPU device statistics; struct macsec_dev holds a __percpu pointer to
 * this type. @syncp is the u64_stats_sync object stored next to the counters.
 */
struct pcpu_secy_stats {
        struct macsec_dev_stats stats;
        struct u64_stats_sync syncp;
};

/**
 * struct macsec_dev - private data of a MACsec net_device, as returned by
 *        macsec_priv()
 * @secy: SecY config
 * @real_dev: pointer to underlying netdevice
 * @dev_tracker: refcount tracker for @real_dev reference
 * @stats: per-CPU MACsec device stats
 * @secys: linked list of SecY's on the underlying device
 * @gro_cells: Generic Receive Offload cells (embedded structure, not a
 *        pointer)
 * @offload: offload mode of the MACsec device; macsec_is_offloaded() reports
 *        true for MACSEC_OFFLOAD_MAC and MACSEC_OFFLOAD_PHY
 * @insert_tx_tag: when offloading, the device requires an additional tag to
 *        be inserted
 */
struct macsec_dev {
        struct macsec_secy secy;
        struct net_device *real_dev;
        netdevice_tracker dev_tracker;
        struct pcpu_secy_stats __percpu *stats;
        struct list_head secys;
        struct gro_cells gro_cells;
        enum macsec_offload offload;
        bool insert_tx_tag;
};

/**
 * struct macsec_rxh_data - rx_handler private argument
 * @secys: linked list of SecY's on this underlying device
 *
 * This is the object macsec_data_rcu() and macsec_data_rtnl() read from a
 * net_device's rx_handler_data pointer.
 */
struct macsec_rxh_data {
        struct list_head secys;
};

/* Return @dev's netdev_priv() area typed as struct macsec_dev. */
static struct macsec_dev *macsec_priv(const struct net_device *dev)
{
        struct macsec_dev *priv = netdev_priv(dev);

        return priv;
}

/* Fetch @dev's rx_handler_data as MACsec rx-handler data, loading the
 * pointer with rcu_dereference_bh().
 */
static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev)
{
        return rcu_dereference_bh(dev->rx_handler_data);
}

/* Fetch @dev's rx_handler_data as MACsec rx-handler data, loading the
 * pointer with rtnl_dereference().
 */
static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev)
{
        return rtnl_dereference(dev->rx_handler_data);
}

/* Per-skb MACsec state kept in skb->cb (see macsec_skb_cb(), which also
 * checks at build time that this struct fits there).
 */
struct macsec_cb {
        struct aead_request *req; /* released with aead_request_free() on completion */
        union {
                struct macsec_tx_sa *tx_sa;
                struct macsec_rx_sa *rx_sa;
        };
        u8 assoc_num;
        bool valid;
        bool has_sci; /* selects the header length in macsec_msdu_len() */
};

/* Take a reference on the RX SA published at @ptr.
 *
 * Returns the SA with its refcount raised, or NULL when the pointer is NULL,
 * the SA is not active, or its refcount has already reached zero.
 */
static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr)
{
        struct macsec_rx_sa *rx_sa = rcu_dereference_bh(ptr);

        if (rx_sa && rx_sa->active && refcount_inc_not_zero(&rx_sa->refcnt))
                return rx_sa;

        return NULL;
}

/* RCU callback: free the per-CPU stats and then the RX SC embedding @head. */
static void free_rx_sc_rcu(struct rcu_head *head)
{
        struct macsec_rx_sc *sc;

        sc = container_of(head, struct macsec_rx_sc, rcu_head);
        free_percpu(sc->stats);
        kfree(sc);
}

/* Take a reference on @sc; returns @sc, or NULL if its refcount is zero. */
static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc)
{
        if (!refcount_inc_not_zero(&sc->refcnt))
                return NULL;

        return sc;
}

/* Drop a reference on @sc; the last put schedules free_rx_sc_rcu(). */
static void macsec_rxsc_put(struct macsec_rx_sc *sc)
{
        if (!refcount_dec_and_test(&sc->refcnt))
                return;

        call_rcu(&sc->rcu_head, free_rx_sc_rcu);
}

/* RCU callback: release the AEAD transform, the per-CPU stats and finally
 * the RX SA that embeds @head.
 */
static void free_rxsa(struct rcu_head *head)
{
        struct macsec_rx_sa *rx_sa;

        rx_sa = container_of(head, struct macsec_rx_sa, rcu);
        crypto_free_aead(rx_sa->key.tfm);
        free_percpu(rx_sa->stats);
        kfree(rx_sa);
}

/* Drop a reference on @sa; the last put schedules free_rxsa(). */
static void macsec_rxsa_put(struct macsec_rx_sa *sa)
{
        if (!refcount_dec_and_test(&sa->refcnt))
                return;

        call_rcu(&sa->rcu, free_rxsa);
}

/* Take a reference on the TX SA published at @ptr.
 *
 * Returns the SA with its refcount raised, or NULL when the pointer is NULL,
 * the SA is not active, or its refcount has already reached zero.
 */
static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr)
{
        struct macsec_tx_sa *tx_sa = rcu_dereference_bh(ptr);

        if (tx_sa && tx_sa->active && refcount_inc_not_zero(&tx_sa->refcnt))
                return tx_sa;

        return NULL;
}

/* RCU callback: release the AEAD transform, the per-CPU stats and finally
 * the TX SA that embeds @head.
 */
static void free_txsa(struct rcu_head *head)
{
        struct macsec_tx_sa *tx_sa;

        tx_sa = container_of(head, struct macsec_tx_sa, rcu);
        crypto_free_aead(tx_sa->key.tfm);
        free_percpu(tx_sa->stats);
        kfree(tx_sa);
}

/* Drop a reference on @sa; the last put schedules free_txsa(). */
static void macsec_txsa_put(struct macsec_tx_sa *sa)
{
        if (!refcount_dec_and_test(&sa->refcnt))
                return;

        call_rcu(&sa->rcu, free_txsa);
}

/* View @skb's control buffer as a struct macsec_cb; the build fails if the
 * struct ever outgrows skb->cb.
 */
static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb)
{
        struct macsec_cb *cb = (struct macsec_cb *)skb->cb;

        BUILD_BUG_ON(sizeof(*cb) > sizeof(skb->cb));
        return cb;
}

/* Special identifier values: an all-zero port value for the SCB and
 * all-ones markers for an undefined SCI / SSCI.
 */
#define MACSEC_PORT_SCB (0x0000)
#define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL)
#define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff)

/* SAK lengths in bytes for 128-bit and 256-bit GCM-AES keys */
#define MACSEC_GCM_AES_128_SAK_LEN 16
#define MACSEC_GCM_AES_256_SAK_LEN 32

/* Default settings.
 * NOTE(review): presumably applied when a MACsec device is created; the
 * users of these macros are outside this part of the file - confirm.
 */
#define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN
#define DEFAULT_XPN false
#define DEFAULT_SEND_SCI true
#define DEFAULT_ENCRYPT false
#define DEFAULT_ENCODING_SA 0
/* 2^30 - 1 */
#define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1))

/* Build an SCI by copying the ETH_ALEN bytes at @addr followed by the bytes
 * of @port, exactly as they sit in memory.
 */
static sci_t make_sci(const u8 *addr, __be16 port)
{
        sci_t sci;
        u8 *raw = (u8 *)&sci;

        memcpy(raw, addr, ETH_ALEN);
        memcpy(raw + ETH_ALEN, &port, sizeof(port));

        return sci;
}

/* Return the SCI for a received frame: the explicit SCI carried in the
 * SecTAG when @sci_present, otherwise one derived from the source MAC
 * address and MACSEC_PORT_ES.
 */
static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present)
{
        sci_t sci;

        if (!sci_present)
                return make_sci(hdr->eth.h_source, MACSEC_PORT_ES);

        memcpy(&sci, hdr->secure_channel_id, sizeof(hdr->secure_channel_id));
        return sci;
}

/* SecTAG length: MACSEC_TAG_LEN, plus MACSEC_SCI_LEN when the SCI is present. */
static unsigned int macsec_sectag_len(bool sci_present)
{
        unsigned int len = MACSEC_TAG_LEN;

        if (sci_present)
                len += MACSEC_SCI_LEN;

        return len;
}

/* Length of the Ethernet header plus the SecTAG. */
static unsigned int macsec_hdr_len(bool sci_present)
{
        unsigned int len = macsec_sectag_len(sci_present);

        len += ETH_HLEN;
        return len;
}

/* Length of the SecTAG plus one 16-bit EtherType field. */
static unsigned int macsec_extra_len(bool sci_present)
{
        unsigned int len = macsec_sectag_len(sci_present);

        len += sizeof(__be16);
        return len;
}

/* Build the SecTAG in @h as described by IEEE 802.1AE-2006 10.5.3.
 *
 * @h:           frame header to fill; the EtherType is set to ETH_P_MACSEC
 * @secy:        SecY providing the TX SC settings, the SCI and the ICV length
 * @pn:          packet number, stored in network byte order
 * @sci_present: whether the explicit SCI is written into the tag
 */
static void macsec_fill_sectag(struct macsec_eth_header *h,
                               const struct macsec_secy *secy, u32 pn,
                               bool sci_present)
{
        const struct macsec_tx_sc *tx_sc = &secy->tx_sc;
        u8 tci_an = tx_sc->encoding_sa;

        if (sci_present) {
                tci_an |= MACSEC_TCI_SC;
        } else {
                if (tx_sc->end_station)
                        tci_an |= MACSEC_TCI_ES;
                if (tx_sc->scb)
                        tci_an |= MACSEC_TCI_SCB;
        }

        /* with GCM, C/E clear for !encrypt, both set for encrypt */
        if (tx_sc->encrypt)
                tci_an |= MACSEC_TCI_CONFID;
        else if (secy->icv_len != MACSEC_DEFAULT_ICV_LEN)
                tci_an |= MACSEC_TCI_C;

        /* clear the whole tag first so short_length/unused start at zero */
        memset(&h->tci_an, 0, macsec_sectag_len(sci_present));
        h->eth.h_proto = htons(ETH_P_MACSEC);
        h->tci_an = tci_an;
        h->packet_number = htonl(pn);

        if (sci_present)
                memcpy(&h->secure_channel_id, &secy->sci,
                       sizeof(h->secure_channel_id));
}

/* Record @data_len in the SecTAG short_length field, but only when it is
 * below MIN_NON_SHORT_LEN; otherwise the field is left untouched.
 */
static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len)
{
        if (data_len >= MIN_NON_SHORT_LEN)
                return;

        h->short_length = data_len;
}

/* Checks if a MACsec interface is being offloaded to a hardware engine,
 * i.e. its offload mode is either MACSEC_OFFLOAD_MAC or MACSEC_OFFLOAD_PHY.
 */
static bool macsec_is_offloaded(struct macsec_dev *macsec)
{
        return macsec->offload == MACSEC_OFFLOAD_MAC ||
               macsec->offload == MACSEC_OFFLOAD_PHY;
}

/* Checks if underlying layers implement MACsec offloading functions.
 *
 * PHY offload needs a phydev with macsec_ops; MAC offload needs the
 * NETIF_F_HW_MACSEC feature and macsec_ops on the real device. Any other
 * mode, or a missing @macsec / real device, yields false.
 */
static bool macsec_check_offload(enum macsec_offload offload,
                                 struct macsec_dev *macsec)
{
        if (!macsec || !macsec->real_dev)
                return false;

        switch (offload) {
        case MACSEC_OFFLOAD_PHY:
                return macsec->real_dev->phydev &&
                       macsec->real_dev->phydev->macsec_ops;
        case MACSEC_OFFLOAD_MAC:
                return (macsec->real_dev->features & NETIF_F_HW_MACSEC) &&
                       macsec->real_dev->macsec_ops;
        default:
                return false;
        }
}

/* Return the macsec_ops for @offload without checking that offloading is
 * actually available: the PHY's ops for MACSEC_OFFLOAD_PHY, the real
 * device's ops for every other mode.
 *
 * When @ctx is non-NULL it is zeroed, its offload mode is recorded and its
 * phydev (PHY mode) or netdev (MAC mode) reference is filled in.
 */
static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload,
                                                 struct macsec_dev *macsec,
                                                 struct macsec_context *ctx)
{
        const bool phy_offload = offload == MACSEC_OFFLOAD_PHY;

        if (ctx) {
                memset(ctx, 0, sizeof(*ctx));
                ctx->offload = offload;

                if (phy_offload)
                        ctx->phydev = macsec->real_dev->phydev;
                else if (offload == MACSEC_OFFLOAD_MAC)
                        ctx->netdev = macsec->real_dev;
        }

        return phy_offload ? macsec->real_dev->phydev->macsec_ops :
                             macsec->real_dev->macsec_ops;
}

/* Returns a pointer to the MACsec ops struct for the device's current
 * offload mode and updates the MACsec context device reference if @ctx is
 * provided. Returns NULL when macsec_check_offload() rejects that mode.
 */
static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec,
                                               struct macsec_context *ctx)
{
        if (macsec_check_offload(macsec->offload, macsec))
                return __macsec_get_ops(macsec->offload, macsec, ctx);

        return NULL;
}

/* Validate a MACsec packet according to IEEE 802.1AE-2018 9.12.
 *
 * @skb:     frame whose data starts at the Ethernet header
 * @icv_len: ICV length counted into the expected frame length
 * @xpn:     when true, a packet number of zero is accepted
 *
 * Returns true if every check passes, false otherwise.
 */
static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn)
{
        struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data;
        const u8 tci = h->tci_an;
        int len = skb->len - 2 * ETH_ALEN;
        int extra_len = macsec_extra_len(!!(tci & MACSEC_TCI_SC)) + icv_len;

        /* a) It comprises at least 17 octets */
        if (skb->len <= 16)
                return false;

        /* b) MACsec EtherType: already checked */

        /* c) V bit is clear */
        if (tci & MACSEC_TCI_VERSION)
                return false;

        /* d) ES or SCB => !SC */
        if ((tci & (MACSEC_TCI_ES | MACSEC_TCI_SCB)) && (tci & MACSEC_TCI_SC))
                return false;

        /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */
        if (h->unused)
                return false;

        /* rx.pn != 0 if not XPN (figure 10-5 with 802.1AEbw-2013 amendment) */
        if (!xpn && !h->packet_number)
                return false;

        /* length check, f) g) h) i) */
        if (!h->short_length)
                return len >= extra_len + MIN_NON_SHORT_LEN;

        return len == extra_len + h->short_length;
}

/* Headroom: SecTAG including the SCI plus one EtherType (see
 * macsec_extra_len()); tailroom: the standard ICV length.
 */
#define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true))
#define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN

/* Write the XPN IV into @iv: the SSCI XOR'ed with the salt's SSCI part,
 * followed by the big-endian 64-bit @pn XOR'ed with the salt's PN part.
 */
static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn,
                               salt_t salt)
{
        struct gcm_iv_xpn *out = (struct gcm_iv_xpn *)iv;

        out->pn = cpu_to_be64(pn) ^ salt.pn;
        out->ssci = ssci ^ salt.ssci;
}

/* Write the non-XPN IV into @iv: the SCI followed by the big-endian 32-bit
 * @pn.
 */
static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn)
{
        struct gcm_iv *out = (struct gcm_iv *)iv;

        out->pn = htonl(pn);
        out->sci = sci;
}

/* View @skb's MAC header as a MACsec Ethernet header. */
static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb)
{
        unsigned char *mac = skb_mac_header(skb);

        return (struct macsec_eth_header *)mac;
}

/* React to a packet number wrap on @tx_sa: deactivate the SA and, when the
 * SecY protects frames, mark the SecY as not operational. Does no locking
 * itself; the callers in this file hold tx_sa->lock.
 */
static void __macsec_pn_wrapped(struct macsec_secy *secy,
                                struct macsec_tx_sa *tx_sa)
{
        pr_debug("PN wrapped, transitioning to !oper\n");
        tx_sa->active = false;

        if (!secy->protect_frames)
                return;

        secy->operational = false;
}

/* Exported, locked variant of __macsec_pn_wrapped(): performs the PN-wrap
 * handling for @tx_sa while holding tx_sa->lock via spin_lock_bh().
 */
void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa)
{
        spin_lock_bh(&tx_sa->lock);
        __macsec_pn_wrapped(secy, tx_sa);
        spin_unlock_bh(&tx_sa->lock);
}
EXPORT_SYMBOL_GPL(macsec_pn_wrapped);

/* Hand out the next packet number of @tx_sa and advance the counter, all
 * under tx_sa->lock.
 *
 * With XPN the full next_pn is incremented, otherwise only the lower half.
 * If next_pn reads zero after the increment, the wrap handling runs.
 *
 * Returns the packet number as it was before the increment.
 */
static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa,
                            struct macsec_secy *secy)
{
        pn_t assigned;

        spin_lock_bh(&tx_sa->lock);

        assigned = tx_sa->next_pn_halves;
        if (!secy->xpn)
                tx_sa->next_pn_halves.lower++;
        else
                tx_sa->next_pn++;

        if (!tx_sa->next_pn)
                __macsec_pn_wrapped(secy, tx_sa);

        spin_unlock_bh(&tx_sa->lock);

        return assigned;
}

/* Retarget @skb at the underlying real device of MACsec device @dev and
 * refresh its MAC header offset and protocol field.
 */
static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev)
{
        struct macsec_dev *macsec = netdev_priv(dev);

        skb->dev = macsec->real_dev;
        /* reset the MAC header before eth_hdr() is consulted below */
        skb_reset_mac_header(skb);
        skb->protocol = eth_hdr(skb)->h_proto;
}

/* Length of @skb without the MACsec header (sized according to the
 * has_sci flag stored in the skb's control block) and without the ICV of
 * the SecY that skb->dev belongs to.
 */
static unsigned int macsec_msdu_len(struct sk_buff *skb)
{
        struct macsec_secy *secy = &macsec_priv(skb->dev)->secy;
        bool has_sci = macsec_skb_cb(skb)->has_sci;

        return skb->len - macsec_hdr_len(has_sci) - secy->icv_len;
}

/* Account one transmitted frame: bump the per-CPU SC octet and packet
 * counters and the per-CPU SA packet counter, using the "Encrypted" set
 * when tx_sc->encrypt is set and the "Protected" set otherwise.
 */
static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc,
                            struct macsec_tx_sa *tx_sa)
{
        struct pcpu_tx_sc_stats *sc_stats = this_cpu_ptr(tx_sc->stats);
        unsigned int octets = macsec_msdu_len(skb);

        u64_stats_update_begin(&sc_stats->syncp);
        if (!tx_sc->encrypt) {
                sc_stats->stats.OutOctetsProtected += octets;
                sc_stats->stats.OutPktsProtected++;
                this_cpu_inc(tx_sa->stats->OutPktsProtected);
        } else {
                sc_stats->stats.OutOctetsEncrypted += octets;
                sc_stats->stats.OutPktsEncrypted++;
                this_cpu_inc(tx_sa->stats->OutPktsEncrypted);
        }
        u64_stats_update_end(&sc_stats->syncp);
}

/* Add one packet of @len bytes to @dev's software TX statistics, but only
 * when the transmit result @ret is NET_XMIT_SUCCESS or NET_XMIT_CN.
 */
static void count_tx(struct net_device *dev, int ret, int len)
{
        switch (ret) {
        case NET_XMIT_SUCCESS:
        case NET_XMIT_CN:
                dev_sw_netstats_tx_add(dev, 1, len);
                break;
        default:
                break;
        }
}

/* Completion callback installed by macsec_encrypt() for the AEAD request.
 *
 * @data is the skb being transmitted; @err is not examined. Frees the
 * request, updates the TX counters, queues the frame on the lower device
 * and drops the SA and device references taken on the submit path.
 */
static void macsec_encrypt_done(void *data, int err)
{
        struct sk_buff *skb = data;
        struct net_device *dev = skb->dev;
        struct macsec_dev *macsec = macsec_priv(dev);
        struct macsec_tx_sa *tx_sa = macsec_skb_cb(skb)->tx_sa;
        int xmit_ret;
        int tx_len;

        aead_request_free(macsec_skb_cb(skb)->req);

        rcu_read_lock_bh();

        macsec_count_tx(skb, &macsec->secy.tx_sc, tx_sa);

        /* the frame already carries SecTAG and ICV, so tx_bytes has to be
         * derived from the MSDU length plus the two MAC addresses
         */
        tx_len = macsec_msdu_len(skb) + 2 * ETH_ALEN;

        macsec_encrypt_finish(skb, dev);
        xmit_ret = dev_queue_xmit(skb);
        count_tx(dev, xmit_ret, tx_len);

        rcu_read_unlock_bh();

        macsec_txsa_put(tx_sa);
        dev_put(dev);
}

/* Allocate, in a single GFP_ATOMIC kmalloc(), an AEAD request for @tfm
 * followed by a GCM_AES_IV_LEN byte IV and a scatterlist of @num_frags
 * entries (aligned for struct scatterlist).
 *
 * On success returns the request, already bound to @tfm, and stores the
 * addresses of the IV and scatterlist areas in *@iv and *@sg. Returns NULL
 * if the allocation fails; *@iv and *@sg are left untouched in that case.
 */
static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm,
                                             unsigned char **iv,
                                             struct scatterlist **sg,
                                             int num_frags)
{
        size_t iv_offset, sg_offset, total;
        struct aead_request *req;
        void *buf;

        /* layout: [aead_request + reqsize][IV][pad][scatterlist entries] */
        iv_offset = sizeof(struct aead_request) + crypto_aead_reqsize(tfm);
        sg_offset = ALIGN(iv_offset + GCM_AES_IV_LEN,
                          __alignof__(struct scatterlist));
        total = sg_offset + sizeof(struct scatterlist) * num_frags;

        buf = kmalloc(total, GFP_ATOMIC);
        if (!buf)
                return NULL;

        req = buf;
        *iv = (unsigned char *)(buf + iv_offset);
        *sg = (struct scatterlist *)(buf + sg_offset);

        aead_request_set_tfm(req, tfm);

        return req;
}

/* Insert the SecTAG and ICV into @skb and run it through the AEAD transform
 * of the SC's current encoding SA.
 *
 * Returns the protected skb when the crypto operation finished
 * synchronously. Returns ERR_PTR(-EINPROGRESS) when the request was queued;
 * macsec_encrypt_done() then frees the request, transmits the frame and
 * drops the SA and device references taken here. Every other ERR_PTR() is
 * a failure; the error paths below free the skb themselves (the
 * skb_unshare() failure path returns with skb == NULL).
 */
static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
                                      struct net_device *dev)
{
        int ret;
        struct scatterlist *sg;
        struct sk_buff *trailer;
        unsigned char *iv;
        struct ethhdr *eth;
        struct macsec_eth_header *hh;
        size_t unprotected_len;
        struct aead_request *req;
        struct macsec_secy *secy;
        struct macsec_tx_sc *tx_sc;
        struct macsec_tx_sa *tx_sa;
        struct macsec_dev *macsec = macsec_priv(dev);
        bool sci_present;
        pn_t pn;

        secy = &macsec->secy;
        tx_sc = &secy->tx_sc;

        /* 10.5.1 TX SA assignment */
        tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]);
        if (!tx_sa) {
                secy->operational = false;
                kfree_skb(skb);
                return ERR_PTR(-EINVAL);
        }

        /* Without enough head/tail room for the SecTAG and ICV, continue
         * on an expanded copy; otherwise only run the skb through
         * skb_unshare().
         */
        if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM ||
                     skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) {
                struct sk_buff *nskb = skb_copy_expand(skb,
                                                       MACSEC_NEEDED_HEADROOM,
                                                       MACSEC_NEEDED_TAILROOM,
                                                       GFP_ATOMIC);
                if (likely(nskb)) {
                        consume_skb(skb);
                        skb = nskb;
                } else {
                        macsec_txsa_put(tx_sa);
                        kfree_skb(skb);
                        return ERR_PTR(-ENOMEM);
                }
        } else {
                skb = skb_unshare(skb, GFP_ATOMIC);
                if (!skb) {
                        macsec_txsa_put(tx_sa);
                        return ERR_PTR(-ENOMEM);
                }
        }

        /* Open a gap in front of the frame and move the two MAC addresses
         * to the new start, leaving room for the SecTAG behind them.
         */
        unprotected_len = skb->len;
        eth = eth_hdr(skb);
        sci_present = macsec_send_sci(secy);
        hh = skb_push(skb, macsec_extra_len(sci_present));
        memmove(hh, eth, 2 * ETH_ALEN);

        /* tx_sa_update_pn() returns the pre-increment PN; a value of 0 is
         * treated as "no usable packet number" and the frame is dropped.
         */
        pn = tx_sa_update_pn(tx_sa, secy);
        if (pn.full64 == 0) {
                macsec_txsa_put(tx_sa);
                kfree_skb(skb);
                return ERR_PTR(-ENOLINK);
        }
        macsec_fill_sectag(hh, secy, pn.lower, sci_present);
        macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN);

        /* reserve the space the ICV will be written to */
        skb_put(skb, secy->icv_len);

        /* the encapsulated frame must still fit the lower device's MTU */
        if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) {
                struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats);

                u64_stats_update_begin(&secy_stats->syncp);
                secy_stats->stats.OutPktsTooLong++;
                u64_stats_update_end(&secy_stats->syncp);

                macsec_txsa_put(tx_sa);
                kfree_skb(skb);
                return ERR_PTR(-EINVAL);
        }

        /* on success ret is used below as the number of scatterlist
         * entries to allocate and initialise
         */
        ret = skb_cow_data(skb, 0, &trailer);
        if (unlikely(ret < 0)) {
                macsec_txsa_put(tx_sa);
                kfree_skb(skb);
                return ERR_PTR(ret);
        }

        req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret);
        if (!req) {
                macsec_txsa_put(tx_sa);
                kfree_skb(skb);
                return ERR_PTR(-ENOMEM);
        }

        if (secy->xpn)
                macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt);
        else
                macsec_fill_iv(iv, secy->sci, pn.lower);

        sg_init_table(sg, ret);
        ret = skb_to_sgvec(skb, sg, 0, skb->len);
        if (unlikely(ret < 0)) {
                aead_request_free(req);
                macsec_txsa_put(tx_sa);
                kfree_skb(skb);
                return ERR_PTR(ret);
        }

        if (tx_sc->encrypt) {
                /* confidentiality: the MACsec header is associated data,
                 * everything between it and the ICV is encrypted
                 */
                int len = skb->len - macsec_hdr_len(sci_present) -
                          secy->icv_len;
                aead_request_set_crypt(req, sg, sg, len, iv);
                aead_request_set_ad(req, macsec_hdr_len(sci_present));
        } else {
                /* integrity only: nothing is encrypted, everything in
                 * front of the ICV is associated data
                 */
                aead_request_set_crypt(req, sg, sg, 0, iv);
                aead_request_set_ad(req, skb->len - secy->icv_len);
        }

        /* state needed by macsec_encrypt_done() on asynchronous completion */
        macsec_skb_cb(skb)->req = req;
        macsec_skb_cb(skb)->tx_sa = tx_sa;
        macsec_skb_cb(skb)->has_sci = sci_present;
        aead_request_set_callback(req, 0, macsec_encrypt_done, skb);

        /* hold the device across the (possibly asynchronous) operation */
        dev_hold(skb->dev);
        ret = crypto_aead_encrypt(req);
        if (ret == -EINPROGRESS) {
                /* request, SA and device references now belong to the
                 * completion callback
                 */
                return ERR_PTR(ret);
        } else if (ret != 0) {
                dev_put(skb->dev);
                kfree_skb(skb);
                aead_request_free(req);
                macsec_txsa_put(tx_sa);
                return ERR_PTR(-EINVAL);
        }

        dev_put(skb->dev);
        aead_request_free(req);
        macsec_txsa_put(tx_sa);

        return skb;
}

/* Replay check and receive accounting for a frame that is past the crypto
 * step (or skipped it).
 *
 * @pn is the 32-bit packet number taken from the SecTAG. Uses the rx_sa and
 * the valid flag stored in the skb's control block. Returns false when the
 * frame has to be dropped (late, or not valid and either the C bit is set
 * or validateFrames is Strict); the caller frees the skb. Returns true when
 * the frame may be delivered. For valid frames the SA's next expected PN is
 * advanced. rx_sa->lock is taken here and released on every path.
 */
static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn)
{
        struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa;
        struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats);
        struct macsec_eth_header *hdr = macsec_ethhdr(skb);
        u32 lowest_pn = 0;

        spin_lock(&rx_sa->lock);
        /* lowest acceptable PN; stays 0 while next_pn is still inside the
         * first replay window
         */
        if (rx_sa->next_pn_halves.lower >= secy->replay_window)
                lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window;

        /* Now perform replay protection check again
         * (see IEEE 802.1AE-2006 figure 10-5)
         */
        if (secy->replay_protect && pn < lowest_pn &&
            (!secy->xpn || pn_same_half(pn, lowest_pn))) {
                spin_unlock(&rx_sa->lock);
                u64_stats_update_begin(&rxsc_stats->syncp);
                rxsc_stats->stats.InPktsLate++;
                u64_stats_update_end(&rxsc_stats->syncp);
                DEV_STATS_INC(secy->netdev, rx_dropped);
                return false;
        }

        /* octet counters are only maintained when validation is enabled */
        if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) {
                unsigned int msdu_len = macsec_msdu_len(skb);
                u64_stats_update_begin(&rxsc_stats->syncp);
                if (hdr->tci_an & MACSEC_TCI_E)
                        rxsc_stats->stats.InOctetsDecrypted += msdu_len;
                else
                        rxsc_stats->stats.InOctetsValidated += msdu_len;
                u64_stats_update_end(&rxsc_stats->syncp);
        }

        if (!macsec_skb_cb(skb)->valid) {
                /* next_pn is not touched for frames that did not validate */
                spin_unlock(&rx_sa->lock);

                /* 10.6.5 */
                if (hdr->tci_an & MACSEC_TCI_C ||
                    secy->validate_frames == MACSEC_VALIDATE_STRICT) {
                        u64_stats_update_begin(&rxsc_stats->syncp);
                        rxsc_stats->stats.InPktsNotValid++;
                        u64_stats_update_end(&rxsc_stats->syncp);
                        this_cpu_inc(rx_sa->stats->InPktsNotValid);
                        DEV_STATS_INC(secy->netdev, rx_errors);
                        return false;
                }

                u64_stats_update_begin(&rxsc_stats->syncp);
                if (secy->validate_frames == MACSEC_VALIDATE_CHECK) {
                        rxsc_stats->stats.InPktsInvalid++;
                        this_cpu_inc(rx_sa->stats->InPktsInvalid);
                } else if (pn < lowest_pn) {
                        rxsc_stats->stats.InPktsDelayed++;
                } else {
                        rxsc_stats->stats.InPktsUnchecked++;
                }
                u64_stats_update_end(&rxsc_stats->syncp);
        } else {
                u64_stats_update_begin(&rxsc_stats->syncp);
                if (pn < lowest_pn) {
                        rxsc_stats->stats.InPktsDelayed++;
                } else {
                        rxsc_stats->stats.InPktsOK++;
                        this_cpu_inc(rx_sa->stats->InPktsOK);
                }
                u64_stats_update_end(&rxsc_stats->syncp);

                /* "pn + 1 >" is used instead of "pn >=" to support pn
                 * overflow in xpn: when pn + 1 wraps to 0 the first test
                 * fails and the xpn branch bumps the upper half instead.
                 */
                if (pn + 1 > rx_sa->next_pn_halves.lower) {
                        rx_sa->next_pn_halves.lower = pn + 1;
                } else if (secy->xpn &&
                           !pn_same_half(pn, rx_sa->next_pn_halves.lower)) {
                        rx_sa->next_pn_halves.upper++;
                        rx_sa->next_pn_halves.lower = pn + 1;
                }

                spin_unlock(&rx_sa->lock);
        }

        return true;
}

/* Re-initialise the skb metadata of a frame that is about to be received
 * on the macsec device @dev: packet type, protocol (via eth_type_trans()),
 * network header offset, transport header offset (only if it was not set
 * before) and MAC length.
 */
static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev)
{
        /* default classification, assigned before eth_type_trans() runs */
        skb->pkt_type = PACKET_HOST;
        skb->protocol = eth_type_trans(skb, dev);

        skb_reset_network_header(skb);
        if (!skb_transport_header_was_set(skb))
                skb_reset_transport_header(skb);
        skb_reset_mac_len(skb);
}

/* Strip the MACsec encapsulation from a received frame.
 *
 * The two MAC addresses at the start of the frame are moved forward by
 * @hdr_len bytes, those @hdr_len bytes are then pulled from the front and
 * @icv_len bytes are trimmed from the tail. The checksum state is reset to
 * CHECKSUM_NONE.
 */
static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len)
{
        unsigned char *addrs;

        skb->ip_summed = CHECKSUM_NONE;

        addrs = skb->data;
        memmove(addrs + hdr_len, addrs, 2 * ETH_ALEN);

        skb_pull(skb, hdr_len);
        pskb_trim_unique(skb, skb->len - icv_len);
}

/* Account one received frame of @len bytes in @dev's software RX
 * statistics.
 */
static void count_rx(struct net_device *dev, int len)
{
        dev_sw_netstats_rx_add(dev, len);
}

/* Completion callback installed by macsec_decrypt() for the AEAD request.
 *
 * @data is the received skb. The frame is marked valid when @err is zero,
 * then goes through macsec_post_decrypt(); frames that pass are stripped
 * of SecTAG/ICV and handed to the macsec device's gro_cells, the others
 * are freed. In both cases the request is freed and the SA, SC and device
 * references are dropped.
 */
static void macsec_decrypt_done(void *data, int err)
{
        struct sk_buff *skb = data;
        struct net_device *dev = skb->dev;
        struct macsec_dev *macsec = macsec_priv(dev);
        struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa;
        struct macsec_rx_sc *rx_sc = rx_sa->sc;
        u32 hdr_pn;
        int rx_len;

        aead_request_free(macsec_skb_cb(skb)->req);

        if (err == 0)
                macsec_skb_cb(skb)->valid = true;

        rcu_read_lock_bh();
        hdr_pn = ntohl(macsec_ethhdr(skb)->packet_number);
        if (macsec_post_decrypt(skb, &macsec->secy, hdr_pn)) {
                macsec_finalize_skb(skb, macsec->secy.icv_len,
                                    macsec_extra_len(macsec_skb_cb(skb)->has_sci));
                /* length must be sampled before the skb is handed over */
                rx_len = skb->len;
                macsec_reset_skb(skb, macsec->secy.netdev);

                if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS)
                        count_rx(dev, rx_len);

                rcu_read_unlock_bh();
        } else {
                rcu_read_unlock_bh();
                kfree_skb(skb);
        }

        macsec_rxsa_put(rx_sa);
        macsec_rxsc_put(rx_sc);
        dev_put(dev);
}

/* Run a received MACsec frame through the AEAD transform of @rx_sa.
 *
 * Returns the skb when the operation completed synchronously; the valid
 * flag in the skb's control block is true on success and stays false when
 * the transform reported -EBADMSG. Returns ERR_PTR(-EINPROGRESS) when the
 * request was queued; macsec_decrypt_done() then finishes reception.
 * Returns another ERR_PTR() on failure; the failure paths below that
 * still hold an skb free it (after a failed skb_share_check() or
 * skb_unshare() there is no skb left to free).
 */
static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
                                      struct net_device *dev,
                                      struct macsec_rx_sa *rx_sa,
                                      sci_t sci,
                                      struct macsec_secy *secy)
{
        int ret;
        struct scatterlist *sg;
        struct sk_buff *trailer;
        unsigned char *iv;
        struct aead_request *req;
        struct macsec_eth_header *hdr;
        u32 hdr_pn;
        u16 icv_len = secy->icv_len;

        macsec_skb_cb(skb)->valid = false;
        skb = skb_share_check(skb, GFP_ATOMIC);
        if (!skb)
                return ERR_PTR(-ENOMEM);

        /* on success ret is used below as the number of scatterlist
         * entries to allocate and initialise
         */
        ret = skb_cow_data(skb, 0, &trailer);
        if (unlikely(ret < 0)) {
                kfree_skb(skb);
                return ERR_PTR(ret);
        }
        req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret);
        if (!req) {
                kfree_skb(skb);
                return ERR_PTR(-ENOMEM);
        }

        hdr = (struct macsec_eth_header *)skb->data;
        hdr_pn = ntohl(hdr->packet_number);

        if (secy->xpn) {
                /* Only the lower 32 bits travel in the SecTAG: start from
                 * the SA's current upper half and bump it when the
                 * received lower half is smaller than the expected one and
                 * lies in the other half of the 32-bit space.
                 */
                pn_t recovered_pn = rx_sa->next_pn_halves;

                recovered_pn.lower = hdr_pn;
                if (hdr_pn < rx_sa->next_pn_halves.lower &&
                    !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower))
                        recovered_pn.upper++;

                macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64,
                                   rx_sa->key.salt);
        } else {
                macsec_fill_iv(iv, sci, hdr_pn);
        }

        sg_init_table(sg, ret);
        ret = skb_to_sgvec(skb, sg, 0, skb->len);
        if (unlikely(ret < 0)) {
                aead_request_free(req);
                kfree_skb(skb);
                return ERR_PTR(ret);
        }

        if (hdr->tci_an & MACSEC_TCI_E) {
                /* confidentiality: ethernet + macsec header
                 * authenticated, encrypted payload
                 */
                int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci);

                aead_request_set_crypt(req, sg, sg, len, iv);
                aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci));
                skb = skb_unshare(skb, GFP_ATOMIC);
                if (!skb) {
                        aead_request_free(req);
                        return ERR_PTR(-ENOMEM);
                }
        } else {
                /* integrity only: all headers + data authenticated */
                aead_request_set_crypt(req, sg, sg, icv_len, iv);
                aead_request_set_ad(req, skb->len - icv_len);
        }

        /* state needed by macsec_decrypt_done() on asynchronous completion */
        macsec_skb_cb(skb)->req = req;
        skb->dev = dev;
        aead_request_set_callback(req, 0, macsec_decrypt_done, skb);

        /* hold the device across the (possibly asynchronous) operation */
        dev_hold(dev);
        ret = crypto_aead_decrypt(req);
        if (ret == -EINPROGRESS) {
                /* request and device reference now belong to the callback */
                return ERR_PTR(ret);
        } else if (ret != 0) {
                /* decryption/authentication failed
                 * 10.6 if validateFrames is disabled, deliver anyway
                 */
                if (ret != -EBADMSG) {
                        kfree_skb(skb);
                        skb = ERR_PTR(ret);
                }
        } else {
                macsec_skb_cb(skb)->valid = true;
        }
        dev_put(dev);

        aead_request_free(req);

        return skb;
}

/* Look up the receive SC of @secy whose SCI equals @sci, walking the list
 * with for_each_rxsc(). Returns NULL when there is none.
 */
static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci)
{
        struct macsec_rx_sc *cur;

        for_each_rxsc(secy, cur) {
                if (cur->sci != sci)
                        continue;

                return cur;
        }

        return NULL;
}

/* Same lookup as find_rx_sc(), but walking the list with
 * for_each_rxsc_rtnl(). Returns NULL when no receive SC of @secy has
 * the SCI @sci.
 */
static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci)
{
        struct macsec_rx_sc *cur;

        for_each_rxsc_rtnl(secy, cur) {
                if (cur->sci != sci)
                        continue;

                return cur;
        }

        return NULL;
}

/* Handle a frame received on the lower device that does not carry the
 * MACsec EtherType.
 *
 * Every SecY attached to skb->dev is visited under rcu_read_lock():
 *  - offloaded and running ports either take over the skb itself
 *    (RX_HANDLER_ANOTHER) or receive a clone, depending on the metadata
 *    dst and the destination address;
 *  - software ports whose validateFrames is not Strict receive a clone and
 *    count it as InPktsUntagged; Strict ports count InPktsNoTag instead.
 * Unless a port took over the skb, RX_HANDLER_PASS is returned.
 */
static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
{
        /* Deliver to the uncontrolled port by default */
        enum rx_handler_result ret = RX_HANDLER_PASS;
        struct ethhdr *hdr = eth_hdr(skb);
        struct metadata_dst *md_dst;
        struct macsec_rxh_data *rxd;
        struct macsec_dev *macsec;
        bool is_macsec_md_dst;

        rcu_read_lock();
        rxd = macsec_data_rcu(skb->dev);
        md_dst = skb_metadata_dst(skb);
        is_macsec_md_dst = md_dst && md_dst->type == METADATA_MACSEC;

        list_for_each_entry_rcu(macsec, &rxd->secys, secys) {
                struct sk_buff *nskb;
                struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats);
                struct net_device *ndev = macsec->secy.netdev;

                /* If h/w offloading is enabled, HW decodes frames and strips
                 * the SecTAG, so we have to deduce which port to deliver to.
                 */
                if (macsec_is_offloaded(macsec) && netif_running(ndev)) {
                        const struct macsec_ops *ops;

                        ops = macsec_get_ops(macsec, NULL);

                        /* a driver that tags offloaded frames with a
                         * metadata dst never gets untagged frames here
                         */
                        if (ops->rx_uses_md_dst && !is_macsec_md_dst)
                                continue;

                        if (is_macsec_md_dst) {
                                struct macsec_rx_sc *rx_sc;

                                /* All drivers that implement MACsec offload
                                 * support using skb metadata destinations must
                                 * indicate that they do so.
                                 */
                                DEBUG_NET_WARN_ON_ONCE(!ops->rx_uses_md_dst);
                                rx_sc = find_rx_sc(&macsec->secy,
                                                   md_dst->u.macsec_info.sci);
                                if (!rx_sc)
                                        continue;
                                /* device indicated macsec offload occurred */
                                skb->dev = ndev;
                                skb->pkt_type = PACKET_HOST;
                                eth_skb_pkt_type(skb, ndev);
                                ret = RX_HANDLER_ANOTHER;
                                goto out;
                        }

                        /* This datapath is insecure because it is unable to
                         * enforce isolation of broadcast/multicast traffic and
                         * unicast traffic with promiscuous mode on the macsec
                         * netdev. Since the core stack has no mechanism to
                         * check that the hardware did indeed receive MACsec
                         * traffic, it is possible that the response handling
                         * done by the MACsec port was to a plaintext packet.
                         * This violates the MACsec protocol standard.
                         */
                        if (ether_addr_equal_64bits(hdr->h_dest,
                                                    ndev->dev_addr)) {
                                /* exact match, divert skb to this port */
                                skb->dev = ndev;
                                skb->pkt_type = PACKET_HOST;
                                ret = RX_HANDLER_ANOTHER;
                                goto out;
                        } else if (is_multicast_ether_addr_64bits(
                                           hdr->h_dest)) {
                                /* multicast frame, deliver on this port too */
                                nskb = skb_clone(skb, GFP_ATOMIC);
                                if (!nskb)
                                        break;

                                nskb->dev = ndev;
                                eth_skb_pkt_type(nskb, ndev);

                                __netif_rx(nskb);
                        } else if (ndev->flags & IFF_PROMISC) {
                                /* promiscuous port takes the frame as is */
                                skb->dev = ndev;
                                skb->pkt_type = PACKET_HOST;
                                ret = RX_HANDLER_ANOTHER;
                                goto out;
                        }

                        continue;
                }

                /* 10.6 If the management control validateFrames is not
                 * Strict, frames without a SecTAG are received, counted, and
                 * delivered to the Controlled Port
                 */
                if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) {
                        u64_stats_update_begin(&secy_stats->syncp);
                        secy_stats->stats.InPktsNoTag++;
                        u64_stats_update_end(&secy_stats->syncp);
                        DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
                        continue;
                }

                /* deliver on this port */
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        break;

                nskb->dev = ndev;

                if (__netif_rx(nskb) == NET_RX_SUCCESS) {
                        u64_stats_update_begin(&secy_stats->syncp);
                        secy_stats->stats.InPktsUntagged++;
                        u64_stats_update_end(&secy_stats->syncp);
                }
        }

out:
        rcu_read_unlock();
        return ret;
}

/* Receive-side entry point for frames arriving on the lower device.
 *
 * Frames without the MACsec EtherType are passed to handle_not_macsec().
 * For MACsec frames the receive SC is looked up by SCI across all SecYs
 * attached to skb->dev, the frame is validated, the RX SA is selected by
 * the association number, a first replay check is done, and the frame is
 * decrypted/verified (unless validateFrames is Disabled and the C bit is
 * clear). Accepted frames have SecTAG and ICV removed and are queued on the
 * macsec device's gro_cells.
 *
 * When no SC matches, the frame is cloned to every SecY that is not Strict
 * (only if the C bit is clear) and RX_HANDLER_PASS is returned.
 *
 * *@pskb is updated to reflect what happened to the skb: NULL when it was
 * consumed or dropped here, the (possibly reallocated) skb otherwise.
 */
static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        struct net_device *dev = skb->dev;
        struct macsec_eth_header *hdr;
        struct macsec_secy *secy = NULL;
        struct macsec_rx_sc *rx_sc;
        struct macsec_rx_sa *rx_sa;
        struct macsec_rxh_data *rxd;
        struct macsec_dev *macsec;
        unsigned int len;
        sci_t sci;
        u32 hdr_pn;
        bool cbit;
        struct pcpu_rx_sc_stats *rxsc_stats;
        struct pcpu_secy_stats *secy_stats;
        bool pulled_sci;
        int ret;

        /* the Ethernet header is pushed back further down */
        if (skb_headroom(skb) < ETH_HLEN)
                goto drop_direct;

        hdr = macsec_ethhdr(skb);
        if (hdr->eth.h_proto != htons(ETH_P_MACSEC))
                return handle_not_macsec(skb);

        skb = skb_unshare(skb, GFP_ATOMIC);
        *pskb = skb;
        if (!skb)
                return RX_HANDLER_CONSUMED;

        /* try for the long SecTAG (with SCI) first, fall back to the
         * short one
         */
        pulled_sci = pskb_may_pull(skb, macsec_extra_len(true));
        if (!pulled_sci) {
                if (!pskb_may_pull(skb, macsec_extra_len(false)))
                        goto drop_direct;
        }

        /* reload: hdr is not reused across the pskb_may_pull() calls */
        hdr = macsec_ethhdr(skb);

        /* Frames with a SecTAG that has the TCI E bit set but the C
         * bit clear are discarded, as this reserved encoding is used
         * to identify frames with a SecTAG that are not to be
         * delivered to the Controlled Port.
         */
        if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E)
                return RX_HANDLER_PASS;

        /* now, pull the extra length */
        if (hdr->tci_an & MACSEC_TCI_SC) {
                if (!pulled_sci)
                        goto drop_direct;
        }

        /* ethernet header is part of crypto processing */
        skb_push(skb, ETH_HLEN);

        macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC);
        macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK;
        sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci);

        rcu_read_lock();
        rxd = macsec_data_rcu(skb->dev);

        /* find the SecY owning a receive SC with this SCI and take a
         * reference on that SC
         */
        list_for_each_entry_rcu(macsec, &rxd->secys, secys) {
                struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci);

                sc = sc ? macsec_rxsc_get(sc) : NULL;

                if (sc) {
                        secy = &macsec->secy;
                        rx_sc = sc;
                        break;
                }
        }

        if (!secy)
                goto nosci;

        dev = secy->netdev;
        macsec = macsec_priv(dev);
        secy_stats = this_cpu_ptr(macsec->stats);
        rxsc_stats = this_cpu_ptr(rx_sc->stats);

        if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) {
                u64_stats_update_begin(&secy_stats->syncp);
                secy_stats->stats.InPktsBadTag++;
                u64_stats_update_end(&secy_stats->syncp);
                DEV_STATS_INC(secy->netdev, rx_errors);
                goto drop_nosa;
        }

        rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]);
        if (!rx_sa) {
                /* 10.6.1 if the SA is not in use */

                /* If validateFrames is Strict or the C bit in the
                 * SecTAG is set, discard
                 */
                if (hdr->tci_an & MACSEC_TCI_C ||
                    secy->validate_frames == MACSEC_VALIDATE_STRICT) {
                        u64_stats_update_begin(&rxsc_stats->syncp);
                        rxsc_stats->stats.InPktsNotUsingSA++;
                        u64_stats_update_end(&rxsc_stats->syncp);
                        DEV_STATS_INC(secy->netdev, rx_errors);
                        goto drop_nosa;
                }

                /* not Strict, the frame (with the SecTAG and ICV
                 * removed) is delivered to the Controlled Port.
                 */
                u64_stats_update_begin(&rxsc_stats->syncp);
                rxsc_stats->stats.InPktsUnusedSA++;
                u64_stats_update_end(&rxsc_stats->syncp);
                goto deliver;
        }

        /* First, PN check to avoid decrypting obviously wrong packets */
        hdr_pn = ntohl(hdr->packet_number);
        if (secy->replay_protect) {
                bool late;

                spin_lock(&rx_sa->lock);
                late = rx_sa->next_pn_halves.lower >= secy->replay_window &&
                       hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window);

                if (secy->xpn)
                        late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn);
                spin_unlock(&rx_sa->lock);

                if (late) {
                        u64_stats_update_begin(&rxsc_stats->syncp);
                        rxsc_stats->stats.InPktsLate++;
                        u64_stats_update_end(&rxsc_stats->syncp);
                        DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
                        goto drop;
                }
        }

        macsec_skb_cb(skb)->rx_sa = rx_sa;

        /* Disabled && !changed text => skip validation */
        if (hdr->tci_an & MACSEC_TCI_C ||
            secy->validate_frames != MACSEC_VALIDATE_DISABLED)
                skb = macsec_decrypt(skb, dev, rx_sa, sci, secy);

        if (IS_ERR(skb)) {
                /* the decrypt callback needs the reference */
                if (PTR_ERR(skb) != -EINPROGRESS) {
                        macsec_rxsa_put(rx_sa);
                        macsec_rxsc_put(rx_sc);
                }
                rcu_read_unlock();
                *pskb = NULL;
                return RX_HANDLER_CONSUMED;
        }

        if (!macsec_post_decrypt(skb, secy, hdr_pn))
                goto drop;

deliver:
        macsec_finalize_skb(skb, secy->icv_len,
                            macsec_extra_len(macsec_skb_cb(skb)->has_sci));
        /* sample the length before the skb is handed to gro_cells */
        len = skb->len;
        macsec_reset_skb(skb, secy->netdev);

        /* rx_sa is NULL when we got here through the "SA not in use" path */
        if (rx_sa)
                macsec_rxsa_put(rx_sa);
        macsec_rxsc_put(rx_sc);

        skb_orphan(skb);
        ret = gro_cells_receive(&macsec->gro_cells, skb);
        if (ret == NET_RX_SUCCESS)
                count_rx(dev, len);
        else
                DEV_STATS_INC(macsec->secy.netdev, rx_dropped);

        rcu_read_unlock();

        *pskb = NULL;
        return RX_HANDLER_CONSUMED;

        /* error unwinding: each label releases what was acquired before
         * the jump and falls through to the next one
         */
drop:
        macsec_rxsa_put(rx_sa);
drop_nosa:
        macsec_rxsc_put(rx_sc);
        rcu_read_unlock();
drop_direct:
        kfree_skb(skb);
        *pskb = NULL;
        return RX_HANDLER_CONSUMED;

nosci:
        /* 10.6.1 if the SC is not found */
        cbit = !!(hdr->tci_an & MACSEC_TCI_C);
        /* no SecY is known here, so the default ICV length is assumed */
        if (!cbit)
                macsec_finalize_skb(skb, MACSEC_DEFAULT_ICV_LEN,
                                    macsec_extra_len(macsec_skb_cb(skb)->has_sci));

        list_for_each_entry_rcu(macsec, &rxd->secys, secys) {
                struct sk_buff *nskb;

                secy_stats = this_cpu_ptr(macsec->stats);

                /* If validateFrames is Strict or the C bit in the
                 * SecTAG is set, discard
                 */
                if (cbit ||
                    macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) {
                        u64_stats_update_begin(&secy_stats->syncp);
                        secy_stats->stats.InPktsNoSCI++;
                        u64_stats_update_end(&secy_stats->syncp);
                        DEV_STATS_INC(macsec->secy.netdev, rx_errors);
                        continue;
                }

                /* not strict, the frame (with the SecTAG and ICV
                 * removed) is delivered to the Controlled Port.
                 */
                nskb = skb_clone(skb, GFP_ATOMIC);
                if (!nskb)
                        break;

                macsec_reset_skb(nskb, macsec->secy.netdev);

                ret = __netif_rx(nskb);
                if (ret == NET_RX_SUCCESS) {
                        u64_stats_update_begin(&secy_stats->syncp);
                        secy_stats->stats.InPktsUnknownSCI++;
                        u64_stats_update_end(&secy_stats->syncp);
                } else {
                        DEV_STATS_INC(macsec->secy.netdev, rx_dropped);
                }
        }

        rcu_read_unlock();
        *pskb = skb;
        return RX_HANDLER_PASS;
}

/* Allocate a "gcm(aes)" AEAD transform, load @key (@key_len bytes) into it
 * and set its authentication tag size to @icv_len.
 *
 * Returns the transform, or an ERR_PTR() carrying the error from the
 * allocation, crypto_aead_setkey() or crypto_aead_setauthsize(). The
 * transform is freed again if one of the two setup calls fails.
 */
static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len)
{
        struct crypto_aead *aead;
        int err;

        aead = crypto_alloc_aead("gcm(aes)", 0, 0);
        if (IS_ERR(aead))
                return aead;

        err = crypto_aead_setkey(aead, key, key_len);
        if (err >= 0)
                err = crypto_aead_setauthsize(aead, icv_len);

        if (err < 0) {
                crypto_free_aead(aead);
                return ERR_PTR(err);
        }

        return aead;
}

/* Initialise a freshly allocated receive SA.
 *
 * Allocates the per-CPU statistics and the AEAD transform keyed with @sak,
 * then sets the SA to inactive with an undefined SSCI, next PN 1, a
 * reference count of 1 and an initialised lock.
 *
 * Returns 0 on success, -ENOMEM if the statistics cannot be allocated, or
 * the error from macsec_alloc_tfm() (the statistics are freed again in
 * that case).
 */
static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len,
                      int icv_len)
{
        rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats);
        if (!rx_sa->stats)
                return -ENOMEM;

        rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len);
        if (!IS_ERR(rx_sa->key.tfm)) {
                spin_lock_init(&rx_sa->lock);
                refcount_set(&rx_sa->refcnt, 1);
                rx_sa->next_pn = 1;
                rx_sa->active = false;
                rx_sa->ssci = MACSEC_UNDEF_SSCI;

                return 0;
        }

        free_percpu(rx_sa->stats);
        return PTR_ERR(rx_sa->key.tfm);
}

/* Mark an RX secure association inactive and hand it to macsec_rxsa_put(). */
static void clear_rx_sa(struct macsec_rx_sa *rx_sa)
{
        /* Clear the flag before the put.
         * NOTE(review): macsec_rxsa_put() presumably may release rx_sa when
         * it drops the last reference, so rx_sa must not be touched after
         * it — confirm against its definition.
         */
        rx_sa->active = false;

        macsec_rxsa_put(rx_sa);
}

/* Tear down an RX secure channel: detach and clear every secure association
 * slot, then hand the channel itself to macsec_rxsc_put().
 *
 * Uses rtnl_dereference() to read the SA slots.
 */
static void free_rx_sc(struct macsec_rx_sc *rx_sc)
{
        struct macsec_rx_sa *rx_sa;
        int an;

        for (an = 0; an < MACSEC_NUM_AN; an++) {
                rx_sa = rtnl_dereference(rx_sc->sa[an]);

                /* Empty the slot before clearing the SA it held. */
                RCU_INIT_POINTER(rx_sc->sa[an], NULL);
                if (!rx_sa)
                        continue;

                clear_rx_sa(rx_sa);
        }

        macsec_rxsc_put(rx_sc);
}

/* Unlink the RX secure channel with the given SCI from @secy's list.
 *
 * If the unlinked channel was active, secy->n_rx_sc is decremented.
 * The channel is only unlinked, not freed.
 *
 * Returns the unlinked channel, or NULL if no channel matches @sci.
 */
static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci)
{
        struct macsec_rx_sc __rcu **link = &secy->rx_sc;
        struct macsec_rx_sc *cur;

        while ((cur = rtnl_dereference(*link)) != NULL) {
                if (cur->sci != sci) {
                        /* Not this one: advance to the next link. */
                        link = &cur->next;
                        continue;
                }

                if (cur->active)
                        secy->n_rx_sc--;
                /* Make the predecessor skip over the matching entry. */
                rcu_assign_pointer(*link, cur->next);
                return cur;
        }

        return NULL;
}

/* Create an RX secure channel for @sci on MACsec device @dev and insert it
 * at the head of the device's SecY channel list.
 *
 * The SCI must not already exist on any SecY attached to the same
 * underlying real device. If @active is true, secy->n_rx_sc is incremented.
 *
 * Returns the new channel (reference count 1), ERR_PTR(-EEXIST) if the SCI
 * is already in use, or ERR_PTR(-ENOMEM) on allocation failure.
 */
static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci,
                                         bool active)
{
        struct macsec_secy *secy = &macsec_priv(dev)->secy;
        struct macsec_rxh_data *rxd;
        struct macsec_rx_sc *new_sc;
        struct macsec_dev *other;

        /* Reject an SCI already used by any SecY on the real device. */
        rxd = macsec_data_rtnl(macsec_priv(dev)->real_dev);
        list_for_each_entry(other, &rxd->secys, secys) {
                if (find_rx_sc_rtnl(&other->secy, sci))
                        return ERR_PTR(-EEXIST);
        }

        new_sc = kzalloc(sizeof(*new_sc), GFP_KERNEL);
        if (!new_sc)
                return ERR_PTR(-ENOMEM);

        new_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats);
        if (!new_sc->stats) {
                kfree(new_sc);
                return ERR_PTR(-ENOMEM);
        }

        refcount_set(&new_sc->refcnt, 1);
        new_sc->active = active;
        new_sc->sci = sci;

        /* Fill in ->next first, then make the new entry the list head. */
        rcu_assign_pointer(new_sc->next, secy->rx_sc);
        rcu_assign_pointer(secy->rx_sc, new_sc);

        if (new_sc->active)
                secy->n_rx_sc++;

        return new_sc;
}

/* Initialise a freshly allocated TX secure association.
 *
 * Allocates the per-CPU statistics and the AEAD transform for @sak, then
 * sets the defaults: undefined SSCI, inactive, a reference count of 1 and
 * an initialised spinlock. The packet number is not set here.
 *
 * Returns 0 on success, -ENOMEM if the statistics cannot be allocated, or
 * the error from macsec_alloc_tfm() (the statistics are freed again in
 * that case; the caller still owns @tx_sa itself).
 */
static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len,
                      int icv_len)
{
        struct crypto_aead *aead;

        tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats);
        if (!tx_sa->stats)
                return -ENOMEM;

        aead = macsec_alloc_tfm(sak, key_len, icv_len);
        tx_sa->key.tfm = aead;
        if (IS_ERR(aead)) {
                free_percpu(tx_sa->stats);
                return PTR_ERR(aead);
        }

        spin_lock_init(&tx_sa->lock);
        refcount_set(&tx_sa->refcnt, 1);
        tx_sa->active = false;
        tx_sa->ssci = MACSEC_UNDEF_SSCI;

        return 0;
}

/* Mark a TX secure association inactive and hand it to macsec_txsa_put(). */
static void clear_tx_sa(struct macsec_tx_sa *tx_sa)
{
        /* Clear the flag before the put.
         * NOTE(review): macsec_txsa_put() presumably may release tx_sa when
         * it drops the last reference, so tx_sa must not be touched after
         * it — confirm against its definition.
         */
        tx_sa->active = false;

        macsec_txsa_put(tx_sa);
}

/* Generic netlink family object, declared here without an initializer.
 * NOTE(review): the initialized definition presumably appears later in the
 * file — confirm.
 */
static struct genl_family macsec_fam;

/* Resolve MACSEC_ATTR_IFINDEX to a MACsec net_device in @net.
 *
 * The caller must have checked that attrs[MACSEC_ATTR_IFINDEX] is present;
 * it is read unconditionally here.
 *
 * Returns the device, or ERR_PTR(-ENODEV) if the index does not exist in
 * @net or does not name a MACsec device.
 */
static struct net_device *get_dev_from_nl(struct net *net,
                                          struct nlattr **attrs)
{
        struct net_device *found;
        int idx;

        idx = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]);
        found = __dev_get_by_index(net, idx);
        if (!found || !netif_is_macsec(found))
                return ERR_PTR(-ENODEV);

        return found;
}

static enum macsec_offload nla_get_offload(const struct nlattr *nla)
{
        return (__force enum macsec_offload)nla_get_u8(nla);
}

static sci_t nla_get_sci(const struct nlattr *nla)
{
        return (__force sci_t)nla_get_u64(nla);
}

/* Append an sci_t to @skb as a 64-bit netlink attribute of type @attrtype,
 * using @padattr as the padding attribute type.
 *
 * Returns whatever nla_put_u64_64bit() returns.
 */
static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value,
                       int padattr)
{
        u64 raw = (__force u64)value;

        return nla_put_u64_64bit(skb, attrtype, raw, padattr);
}

static ssci_t nla_get_ssci(const struct nlattr *nla)
{
        return (__force ssci_t)nla_get_u32(nla);
}

/* Append an ssci_t to @skb as a u32 netlink attribute of type @attrtype.
 *
 * The value is force-cast to u32, the width nla_put_u32() takes and the
 * same width nla_get_ssci() reads back, rather than being widened to u64
 * and implicitly truncated at the call.
 *
 * Returns whatever nla_put_u32() returns.
 */
static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value)
{
        return nla_put_u32(skb, attrtype, (__force u32)value);
}

/* Look up the TX secure association named by a netlink request.
 *
 * @tb_sa must carry MACSEC_SA_ATTR_AN; @attrs must carry the interface
 * index (read by get_dev_from_nl()). *@assoc_num is written as soon as the
 * AN attribute has been read, even if a later check fails. *@devp, *@scp
 * and *@secyp are written only on success.
 *
 * Returns the SA, or an ERR_PTR:
 *   -EINVAL  AN attribute missing, or AN >= MACSEC_NUM_AN
 *   -ENODEV  device lookup failed, or no SA is installed for that AN
 */
static struct macsec_tx_sa *get_txsa_from_nl(struct net *net,
                                             struct nlattr **attrs,
                                             struct nlattr **tb_sa,
                                             struct net_device **devp,
                                             struct macsec_secy **secyp,
                                             struct macsec_tx_sc **scp,
                                             u8 *assoc_num)
{
        struct macsec_tx_sa *sa;
        struct macsec_secy *secy;
        struct net_device *dev;
        u8 an;

        if (!tb_sa[MACSEC_SA_ATTR_AN])
                return ERR_PTR(-EINVAL);

        an = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]);
        *assoc_num = an;

        /* The device lookup comes before the AN range check. */
        dev = get_dev_from_nl(net, attrs);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        if (an >= MACSEC_NUM_AN)
                return ERR_PTR(-EINVAL);

        secy = &macsec_priv(dev)->secy;

        sa = rtnl_dereference(secy->tx_sc.sa[an]);
        if (!sa)
                return ERR_PTR(-ENODEV);

        *devp = dev;
        *scp = &secy->tx_sc;
        *secyp = secy;

        return sa;
}

/* Look up the RX secure channel named by a netlink request.
 *
 * The device is resolved from @attrs first; the SCI is then taken from
 * @tb_rxsc and searched for in that device's SecY. *@secyp and *@devp are
 * written only on success.
 *
 * Returns the channel, or an ERR_PTR:
 *   -ENODEV  device lookup failed, or no channel has that SCI
 *   -EINVAL  MACSEC_RXSC_ATTR_SCI is missing
 */
static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net,
                                             struct nlattr **attrs,
                                             struct nlattr **tb_rxsc,
                                             struct net_device **devp,
                                             struct macsec_secy **secyp)
{
        const struct nlattr *sci_attr;
        struct macsec_rx_sc *sc;
        struct macsec_secy *secy;
        struct net_device *dev;

        dev = get_dev_from_nl(net, attrs);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        sci_attr = tb_rxsc[MACSEC_RXSC_ATTR_SCI];
        if (!sci_attr)
                return ERR_PTR(-EINVAL);

        secy = &macsec_priv(dev)->secy;
        sc = find_rx_sc_rtnl(secy, nla_get_sci(sci_attr));
        if (!sc)
                return ERR_PTR(-ENODEV);

        *secyp = secy;
        *devp = dev;

        return sc;
}

/* Look up the RX secure association named by a netlink request.
 *
 * The association number comes from @tb_sa and the channel from
 * get_rxsc_from_nl(). *@assoc_num is written as soon as the AN attribute
 * has been read. *@devp and *@secyp are filled in by get_rxsc_from_nl()
 * once the channel is found; *@scp is written only when the SA is found.
 *
 * Returns the SA, or an ERR_PTR:
 *   -EINVAL  AN attribute missing, or AN >= MACSEC_NUM_AN
 *   -ENODEV  no SA is installed for that AN
 *   other    whatever get_rxsc_from_nl() reported
 */
static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net,
                                             struct nlattr **attrs,
                                             struct nlattr **tb_rxsc,
                                             struct nlattr **tb_sa,
                                             struct net_device **devp,
                                             struct macsec_secy **secyp,
                                             struct macsec_rx_sc **scp,
                                             u8 *assoc_num)
{
        struct macsec_rx_sc *sc;
        struct macsec_rx_sa *sa;
        u8 an;

        if (!tb_sa[MACSEC_SA_ATTR_AN])
                return ERR_PTR(-EINVAL);

        an = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]);
        *assoc_num = an;
        if (an >= MACSEC_NUM_AN)
                return ERR_PTR(-EINVAL);

        sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp);
        if (IS_ERR(sc))
                return ERR_CAST(sc);

        sa = rtnl_dereference(sc->sa[an]);
        if (!sa)
                return ERR_PTR(-ENODEV);

        *scp = sc;

        return sa;
}

/* Policy for the top-level attributes of a MACsec generic netlink request:
 * an interface index plus nested RXSC, SA and offload configuration.
 */
static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = {
        [MACSEC_ATTR_IFINDEX]     = { .type = NLA_U32 },
        [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED },
        [MACSEC_ATTR_SA_CONFIG]   = { .type = NLA_NESTED },
        [MACSEC_ATTR_OFFLOAD]     = { .type = NLA_NESTED },
};

/* Policy for the attributes nested under MACSEC_ATTR_RXSC_CONFIG; used by
 * parse_rxsc_config().
 */
static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = {
        [MACSEC_RXSC_ATTR_SCI]    = { .type = NLA_U64 },
        [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 },
};

/* Policy for the attributes nested under MACSEC_ATTR_SA_CONFIG; used by
 * parse_sa_config().
 *
 * The binary attributes only carry a length bound here. The add handlers
 * and validate_add_*() compare the key, key id and salt lengths for exact
 * equality, and the add/update handlers do the same for the packet number.
 */
static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = {
        [MACSEC_SA_ATTR_AN]     = { .type = NLA_U8 },
        [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 },
        /* Packet number: at least 4 bytes. */
        [MACSEC_SA_ATTR_PN]     = NLA_POLICY_MIN_LEN(4),
        [MACSEC_SA_ATTR_KEYID]  = {
                .type = NLA_BINARY,
                .len = MACSEC_KEYID_LEN,
        },
        [MACSEC_SA_ATTR_KEY]    = {
                .type = NLA_BINARY,
                .len = MACSEC_MAX_KEY_LEN,
        },
        [MACSEC_SA_ATTR_SSCI]   = { .type = NLA_U32 },
        [MACSEC_SA_ATTR_SALT]   = {
                .type = NLA_BINARY,
                .len = MACSEC_SALT_LEN,
        },
};

/* Policy for the offload attributes: a single u8 offload type. */
static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = {
        [MACSEC_OFFLOAD_ATTR_TYPE] = {
                .type = NLA_U8,
        },
};

/* Run one offload callback against a device driver.
 *
 * A NULL @func is treated as "nothing to do" and reports success. When the
 * context says the offload target is a PHY, the callback runs with the
 * PHY device's mutex held.
 *
 * Returns 0 if @func is NULL, otherwise the callback's return value.
 */
static int macsec_offload(int (* const func)(struct macsec_context *),
                          struct macsec_context *ctx)
{
        int rc;

        if (unlikely(!func))
                return 0;

        if (ctx->offload == MACSEC_OFFLOAD_PHY)
                mutex_lock(&ctx->phydev->lock);

        rc = func(ctx);

        /* ctx->offload is re-read here rather than cached across the call. */
        if (ctx->offload == MACSEC_OFFLOAD_PHY)
                mutex_unlock(&ctx->phydev->lock);

        return rc;
}

/* Parse the nested MACSEC_ATTR_SA_CONFIG attribute into @tb_sa using
 * macsec_genl_sa_policy.
 *
 * Returns 0 on success, or -EINVAL if the attribute is missing or fails to
 * parse (the parser's own error code is not propagated).
 */
static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa)
{
        struct nlattr *nest = attrs[MACSEC_ATTR_SA_CONFIG];
        int rc;

        if (!nest)
                return -EINVAL;

        rc = nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, nest,
                                         macsec_genl_sa_policy, NULL);

        return rc ? -EINVAL : 0;
}

/* Parse the nested MACSEC_ATTR_RXSC_CONFIG attribute into @tb_rxsc using
 * macsec_genl_rxsc_policy.
 *
 * Returns 0 on success, or -EINVAL if the attribute is missing or fails to
 * parse (the parser's own error code is not propagated).
 */
static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc)
{
        struct nlattr *nest = attrs[MACSEC_ATTR_RXSC_CONFIG];
        int rc;

        if (!nest)
                return -EINVAL;

        rc = nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, nest,
                                         macsec_genl_rxsc_policy, NULL);

        return rc ? -EINVAL : 0;
}

/* Check the parsed SA attributes of an "add RX SA" request.
 *
 * Required: AN (< MACSEC_NUM_AN), KEY, and KEYID of exactly
 * MACSEC_KEYID_LEN bytes. Optional: PN (must be non-zero when present) and
 * ACTIVE (must be 0 or 1 when present).
 *
 * Returns true when the attribute set is acceptable.
 */
static bool validate_add_rxsa(struct nlattr **attrs)
{
        const struct nlattr *an = attrs[MACSEC_SA_ATTR_AN];
        const struct nlattr *keyid = attrs[MACSEC_SA_ATTR_KEYID];
        const struct nlattr *pn = attrs[MACSEC_SA_ATTR_PN];
        const struct nlattr *active = attrs[MACSEC_SA_ATTR_ACTIVE];

        /* Mandatory attributes first, so the reads below are safe. */
        if (!an || !attrs[MACSEC_SA_ATTR_KEY] || !keyid)
                return false;

        if (nla_get_u8(an) >= MACSEC_NUM_AN)
                return false;

        if (pn && nla_get_u64(pn) == 0)
                return false;

        if (active && nla_get_u8(active) > 1)
                return false;

        return nla_len(keyid) == MACSEC_KEYID_LEN;
}

/* Netlink handler: install a new RX secure association on an existing RX
 * secure channel.
 *
 * Attribute parsing and validate_add_rxsa() run before the RTNL lock is
 * taken; everything that touches the device runs with it held.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing/invalid attributes, or key, PN or salt length not
 *                matching what the SecY expects (XPN additionally requires
 *                SSCI and SALT)
 *   -EBUSY       the association number already has an SA
 *   -ENOMEM      allocation failure
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        errors from get_rxsc_from_nl(), init_rx_sa() or the
 *                offload callback
 */
static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *dev;
        struct nlattr **attrs = info->attrs;
        struct macsec_secy *secy;
        struct macsec_rx_sc *rx_sc;
        struct macsec_rx_sa *rx_sa;
        unsigned char assoc_num;
        int pn_len;
        struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
        struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
        int err;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_sa_config(attrs, tb_sa))
                return -EINVAL;

        if (parse_rxsc_config(attrs, tb_rxsc))
                return -EINVAL;

        if (!validate_add_rxsa(tb_sa))
                return -EINVAL;

        rtnl_lock();
        rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy);
        if (IS_ERR(rx_sc)) {
                rtnl_unlock();
                return PTR_ERR(rx_sc);
        }

        assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]);

        /* The key must be exactly as long as the SecY's configured key. */
        if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) {
                pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n",
                          nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len);
                rtnl_unlock();
                return -EINVAL;
        }

        /* The expected PN width depends on whether the SecY uses XPN. */
        pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN;
        if (tb_sa[MACSEC_SA_ATTR_PN] &&
            nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) {
                pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n",
                          nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len);
                rtnl_unlock();
                return -EINVAL;
        }

        /* XPN needs both an SSCI and a salt of exactly MACSEC_SALT_LEN. */
        if (secy->xpn) {
                if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) {
                        rtnl_unlock();
                        return -EINVAL;
                }

                if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) {
                        pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n",
                                  nla_len(tb_sa[MACSEC_SA_ATTR_SALT]),
                                  MACSEC_SALT_LEN);
                        rtnl_unlock();
                        return -EINVAL;
                }
        }

        /* Refuse to overwrite an SA already installed for this AN. */
        rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]);
        if (rx_sa) {
                rtnl_unlock();
                return -EBUSY;
        }

        rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL);
        if (!rx_sa) {
                rtnl_unlock();
                return -ENOMEM;
        }

        /* On failure init_rx_sa() has released what it allocated, so only
         * the SA structure itself is freed here.
         */
        err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]),
                         secy->key_len, secy->icv_len);
        if (err < 0) {
                kfree(rx_sa);
                rtnl_unlock();
                return err;
        }

        /* Override the default next_pn set by init_rx_sa() if one was given. */
        if (tb_sa[MACSEC_SA_ATTR_PN]) {
                spin_lock_bh(&rx_sa->lock);
                rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
                spin_unlock_bh(&rx_sa->lock);
        }

        if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
                rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);

        rx_sa->sc = rx_sc;

        if (secy->xpn) {
                rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]);
                nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT],
                           MACSEC_SALT_LEN);
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        err = -EOPNOTSUPP;
                        goto cleanup;
                }

                ctx.sa.assoc_num = assoc_num;
                ctx.sa.rx_sa = rx_sa;
                ctx.secy = secy;
                memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]),
                       secy->key_len);

                err = macsec_offload(ops->mdo_add_rxsa, &ctx);
                /* Wipe the on-stack key copy whether or not offload worked. */
                memzero_explicit(ctx.sa.key, secy->key_len);
                if (err)
                        goto cleanup;
        }

        /* The key id is filled in and the SA published in its channel slot
         * only after any offload has succeeded.
         */
        nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN);
        rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa);

        rtnl_unlock();

        return 0;

cleanup:
        /* The SA was never published.
         * NOTE(review): this put presumably drops the reference taken in
         * init_rx_sa() and so releases the SA — confirm.
         */
        macsec_rxsa_put(rx_sa);
        rtnl_unlock();
        return err;
}

/* Check the parsed RXSC attributes of an "add RX SC" request.
 *
 * SCI is required; ACTIVE is optional but must be 0 or 1 when present.
 *
 * Returns true when the attribute set is acceptable.
 */
static bool validate_add_rxsc(struct nlattr **attrs)
{
        const struct nlattr *active = attrs[MACSEC_RXSC_ATTR_ACTIVE];

        if (!attrs[MACSEC_RXSC_ATTR_SCI])
                return false;

        if (active && nla_get_u8(active) > 1)
                return false;

        return true;
}

/* Netlink handler: create an RX secure channel on a MACsec device.
 *
 * The channel defaults to active unless MACSEC_RXSC_ATTR_ACTIVE says
 * otherwise. If hardware offload is enabled, the new channel is passed to
 * the driver; when that fails, the channel is unlinked and freed again.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing or invalid attributes
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        errors from get_dev_from_nl(), create_rx_sc() or the
 *                offload callback
 */
static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
        struct nlattr **attrs = info->attrs;
        struct macsec_rx_sc *rx_sc;
        struct macsec_secy *secy;
        struct net_device *dev;
        bool active = true;
        sci_t sci;
        int ret;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_rxsc_config(attrs, tb_rxsc))
                return -EINVAL;

        if (!validate_add_rxsc(tb_rxsc))
                return -EINVAL;

        rtnl_lock();

        dev = get_dev_from_nl(genl_info_net(info), attrs);
        if (IS_ERR(dev)) {
                ret = PTR_ERR(dev);
                goto unlock;
        }

        secy = &macsec_priv(dev)->secy;
        sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]);

        if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE])
                active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]);

        rx_sc = create_rx_sc(dev, sci, active);
        if (IS_ERR(rx_sc)) {
                ret = PTR_ERR(rx_sc);
                goto unlock;
        }

        ret = 0;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (ops) {
                        ctx.rx_sc = rx_sc;
                        ctx.secy = secy;

                        ret = macsec_offload(ops->mdo_add_rxsc, &ctx);
                } else {
                        ret = -EOPNOTSUPP;
                }
        }

        /* Offload rejected the channel: undo the software-side insertion. */
        if (ret) {
                del_rx_sc(secy, sci);
                free_rx_sc(rx_sc);
        }

unlock:
        rtnl_unlock();
        return ret;
}

/* Check the parsed SA attributes of an "add TX SA" request.
 *
 * Required: AN (< MACSEC_NUM_AN), PN (non-zero), KEY, and KEYID of exactly
 * MACSEC_KEYID_LEN bytes. Optional: ACTIVE (must be 0 or 1 when present).
 *
 * Returns true when the attribute set is acceptable.
 */
static bool validate_add_txsa(struct nlattr **attrs)
{
        const struct nlattr *an = attrs[MACSEC_SA_ATTR_AN];
        const struct nlattr *pn = attrs[MACSEC_SA_ATTR_PN];
        const struct nlattr *keyid = attrs[MACSEC_SA_ATTR_KEYID];
        const struct nlattr *active = attrs[MACSEC_SA_ATTR_ACTIVE];

        /* Mandatory attributes first, so the reads below are safe. */
        if (!an || !pn || !attrs[MACSEC_SA_ATTR_KEY] || !keyid)
                return false;

        if (nla_get_u8(an) >= MACSEC_NUM_AN)
                return false;

        if (nla_get_u64(pn) == 0)
                return false;

        if (active && nla_get_u8(active) > 1)
                return false;

        return nla_len(keyid) == MACSEC_KEYID_LEN;
}

/* Netlink handler: install a new TX secure association on a MACsec device.
 *
 * Attribute parsing and validate_add_txsa() run before the RTNL lock is
 * taken; everything that touches the device runs with it held.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing/invalid attributes, or key, PN or salt length not
 *                matching what the SecY expects (XPN additionally requires
 *                SSCI and SALT)
 *   -EBUSY       the association number already has an SA
 *   -ENOMEM      allocation failure
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        errors from get_dev_from_nl(), init_tx_sa() or the
 *                offload callback
 */
static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info)
{
        struct net_device *dev;
        struct nlattr **attrs = info->attrs;
        struct macsec_secy *secy;
        struct macsec_tx_sc *tx_sc;
        struct macsec_tx_sa *tx_sa;
        unsigned char assoc_num;
        int pn_len;
        struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
        bool was_operational;
        int err;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_sa_config(attrs, tb_sa))
                return -EINVAL;

        if (!validate_add_txsa(tb_sa))
                return -EINVAL;

        rtnl_lock();
        dev = get_dev_from_nl(genl_info_net(info), attrs);
        if (IS_ERR(dev)) {
                rtnl_unlock();
                return PTR_ERR(dev);
        }

        secy = &macsec_priv(dev)->secy;
        tx_sc = &secy->tx_sc;

        assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]);

        /* The key must be exactly as long as the SecY's configured key. */
        if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) {
                pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n",
                          nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len);
                rtnl_unlock();
                return -EINVAL;
        }

        /* The expected PN width depends on whether the SecY uses XPN. */
        pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN;
        if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) {
                pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n",
                          nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len);
                rtnl_unlock();
                return -EINVAL;
        }

        /* XPN needs both an SSCI and a salt of exactly MACSEC_SALT_LEN. */
        if (secy->xpn) {
                if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) {
                        rtnl_unlock();
                        return -EINVAL;
                }

                if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) {
                        pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n",
                                  nla_len(tb_sa[MACSEC_SA_ATTR_SALT]),
                                  MACSEC_SALT_LEN);
                        rtnl_unlock();
                        return -EINVAL;
                }
        }

        /* Refuse to overwrite an SA already installed for this AN. */
        tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]);
        if (tx_sa) {
                rtnl_unlock();
                return -EBUSY;
        }

        tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL);
        if (!tx_sa) {
                rtnl_unlock();
                return -ENOMEM;
        }

        /* On failure init_tx_sa() has released what it allocated, so only
         * the SA structure itself is freed here.
         */
        err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]),
                         secy->key_len, secy->icv_len);
        if (err < 0) {
                kfree(tx_sa);
                rtnl_unlock();
                return err;
        }

        /* PN is mandatory for TX (checked by validate_add_txsa()). */
        spin_lock_bh(&tx_sa->lock);
        tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
        spin_unlock_bh(&tx_sa->lock);

        if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
                tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);

        /* Adding an active SA in the encoding slot makes the SecY
         * operational; remember the old value so it can be restored if the
         * offload step fails.
         */
        was_operational = secy->operational;
        if (assoc_num == tx_sc->encoding_sa && tx_sa->active)
                secy->operational = true;

        if (secy->xpn) {
                tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]);
                nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT],
                           MACSEC_SALT_LEN);
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        err = -EOPNOTSUPP;
                        goto cleanup;
                }

                ctx.sa.assoc_num = assoc_num;
                ctx.sa.tx_sa = tx_sa;
                ctx.secy = secy;
                memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]),
                       secy->key_len);

                err = macsec_offload(ops->mdo_add_txsa, &ctx);
                /* Wipe the on-stack key copy whether or not offload worked. */
                memzero_explicit(ctx.sa.key, secy->key_len);
                if (err)
                        goto cleanup;
        }

        /* The key id is filled in and the SA published in its channel slot
         * only after any offload has succeeded.
         */
        nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN);
        rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa);

        rtnl_unlock();

        return 0;

cleanup:
        /* Undo the operational change; the SA was never published.
         * NOTE(review): this put presumably drops the reference taken in
         * init_tx_sa() and so releases the SA — confirm.
         */
        secy->operational = was_operational;
        macsec_txsa_put(tx_sa);
        rtnl_unlock();
        return err;
}

/* Netlink handler: remove an RX secure association.
 *
 * An SA that is still active cannot be removed. If hardware offload is
 * enabled, the driver is asked first and the software state is only
 * changed once it has agreed.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing or unparsable attributes
 *   -EBUSY       the SA is active
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        errors from get_rxsa_from_nl() or the offload callback
 */
static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
        struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
        struct nlattr **attrs = info->attrs;
        struct macsec_rx_sa *rx_sa;
        struct macsec_rx_sc *rx_sc;
        struct macsec_secy *secy;
        struct net_device *dev;
        u8 assoc_num;
        int ret;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_sa_config(attrs, tb_sa))
                return -EINVAL;

        if (parse_rxsc_config(attrs, tb_rxsc))
                return -EINVAL;

        rtnl_lock();

        rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa,
                                 &dev, &secy, &rx_sc, &assoc_num);
        if (IS_ERR(rx_sa)) {
                ret = PTR_ERR(rx_sa);
                goto unlock;
        }

        if (rx_sa->active) {
                ret = -EBUSY;
                goto unlock;
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        ret = -EOPNOTSUPP;
                        goto unlock;
                }

                ctx.sa.assoc_num = assoc_num;
                ctx.sa.rx_sa = rx_sa;
                ctx.secy = secy;

                ret = macsec_offload(ops->mdo_del_rxsa, &ctx);
                if (ret)
                        goto unlock;
        }

        /* Empty the slot, then clear the SA that occupied it. */
        RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL);
        clear_rx_sa(rx_sa);
        ret = 0;

unlock:
        rtnl_unlock();
        return ret;
}

/* Netlink handler: delete the RX secure channel identified by
 * MACSEC_ATTR_IFINDEX and MACSEC_RXSC_ATTR_SCI.
 *
 * The channel is looked up first and is only unlinked from the SecY and
 * freed after any hardware offload has accepted the deletion. Unlinking
 * before the offload call would, on an offload failure, return an error
 * with the channel already gone from the list, the active-channel count
 * already adjusted, and the object never freed.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing or unparsable attributes
 *   -ENODEV      no such device, or no channel with that SCI
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        the error reported by the offload callback
 */
static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **attrs = info->attrs;
        struct net_device *dev;
        struct macsec_secy *secy;
        struct macsec_rx_sc *rx_sc;
        sci_t sci;
        struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
        int ret;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_rxsc_config(attrs, tb_rxsc))
                return -EINVAL;

        if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI])
                return -EINVAL;

        rtnl_lock();
        dev = get_dev_from_nl(genl_info_net(info), info->attrs);
        if (IS_ERR(dev)) {
                rtnl_unlock();
                return PTR_ERR(dev);
        }

        secy = &macsec_priv(dev)->secy;
        sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]);

        /* Only look the channel up here; it stays linked until the
         * offload step (if any) has succeeded.
         */
        rx_sc = find_rx_sc_rtnl(secy, sci);
        if (!rx_sc) {
                rtnl_unlock();
                return -ENODEV;
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        ret = -EOPNOTSUPP;
                        goto cleanup;
                }

                ctx.rx_sc = rx_sc;
                ctx.secy = secy;
                ret = macsec_offload(ops->mdo_del_rxsc, &ctx);
                if (ret)
                        goto cleanup;
        }

        /* Nothing can fail from here on: unlink, then tear down. */
        del_rx_sc(secy, sci);
        free_rx_sc(rx_sc);
        rtnl_unlock();

        return 0;

cleanup:
        /* The channel was never unlinked, so software state is unchanged. */
        rtnl_unlock();
        return ret;
}

/* Netlink handler: remove a TX secure association.
 *
 * An SA that is still active cannot be removed. If hardware offload is
 * enabled, the driver is asked first and the software state is only
 * changed once it has agreed.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing or unparsable attributes
 *   -EBUSY       the SA is active
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        errors from get_txsa_from_nl() or the offload callback
 */
static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
        struct nlattr **attrs = info->attrs;
        struct macsec_tx_sa *tx_sa;
        struct macsec_tx_sc *tx_sc;
        struct macsec_secy *secy;
        struct net_device *dev;
        u8 assoc_num;
        int ret;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_sa_config(attrs, tb_sa))
                return -EINVAL;

        rtnl_lock();

        tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa,
                                 &dev, &secy, &tx_sc, &assoc_num);
        if (IS_ERR(tx_sa)) {
                ret = PTR_ERR(tx_sa);
                goto unlock;
        }

        if (tx_sa->active) {
                ret = -EBUSY;
                goto unlock;
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        ret = -EOPNOTSUPP;
                        goto unlock;
                }

                ctx.sa.assoc_num = assoc_num;
                ctx.sa.tx_sa = tx_sa;
                ctx.secy = secy;

                ret = macsec_offload(ops->mdo_del_txsa, &ctx);
                if (ret)
                        goto unlock;
        }

        /* Empty the slot, then clear the SA that occupied it. */
        RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL);
        clear_tx_sa(tx_sa);
        ret = 0;

unlock:
        rtnl_unlock();
        return ret;
}

/* Check the parsed SA attributes of an "update SA" request.
 *
 * AN is required and must be < MACSEC_NUM_AN. KEY, KEYID, SSCI and SALT
 * must not be present. PN, when present, must be non-zero;
 * ACTIVE, when present, must be 0 or 1.
 *
 * Returns true when the attribute set is acceptable.
 */
static bool validate_upd_sa(struct nlattr **attrs)
{
        const struct nlattr *an = attrs[MACSEC_SA_ATTR_AN];
        const struct nlattr *pn = attrs[MACSEC_SA_ATTR_PN];
        const struct nlattr *active = attrs[MACSEC_SA_ATTR_ACTIVE];

        if (!an)
                return false;

        /* Key material and XPN parameters are not accepted on update. */
        if (attrs[MACSEC_SA_ATTR_KEY] || attrs[MACSEC_SA_ATTR_KEYID] ||
            attrs[MACSEC_SA_ATTR_SSCI] || attrs[MACSEC_SA_ATTR_SALT])
                return false;

        if (nla_get_u8(an) >= MACSEC_NUM_AN)
                return false;

        if (pn && nla_get_u64(pn) == 0)
                return false;

        if (active && nla_get_u8(active) > 1)
                return false;

        return true;
}

/* Netlink handler: update the packet number and/or active flag of an
 * existing TX secure association.
 *
 * The new values are written to the SA (and secy->operational is adjusted)
 * before the offload callback runs. If offload fails, the previous PN,
 * active flag and operational state are restored.
 *
 * Returns 0 on success, or a negative errno:
 *   -EINVAL      missing/invalid attributes, or PN length not matching
 *                what the SecY expects
 *   -EOPNOTSUPP  offloading is enabled but no offload ops are available
 *   other        errors from get_txsa_from_nl() or the offload callback
 */
static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **attrs = info->attrs;
        struct net_device *dev;
        struct macsec_secy *secy;
        struct macsec_tx_sc *tx_sc;
        struct macsec_tx_sa *tx_sa;
        u8 assoc_num;
        struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
        bool was_operational, was_active;
        pn_t prev_pn;
        int ret = 0;

        /* Stays zero unless a PN attribute is supplied below. */
        prev_pn.full64 = 0;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_sa_config(attrs, tb_sa))
                return -EINVAL;

        if (!validate_upd_sa(tb_sa))
                return -EINVAL;

        rtnl_lock();
        tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa,
                                 &dev, &secy, &tx_sc, &assoc_num);
        if (IS_ERR(tx_sa)) {
                rtnl_unlock();
                return PTR_ERR(tx_sa);
        }

        if (tb_sa[MACSEC_SA_ATTR_PN]) {
                int pn_len;

                /* The expected PN width depends on whether the SecY uses XPN. */
                pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN;
                if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) {
                        pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n",
                                  nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len);
                        rtnl_unlock();
                        return -EINVAL;
                }

                /* Save the old PN for rollback, then install the new one. */
                spin_lock_bh(&tx_sa->lock);
                prev_pn = tx_sa->next_pn_halves;
                tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
                spin_unlock_bh(&tx_sa->lock);
        }

        was_active = tx_sa->active;
        if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
                tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);

        /* For the encoding SA, the SecY's operational state follows the
         * SA's active flag.
         */
        was_operational = secy->operational;
        if (assoc_num == tx_sc->encoding_sa)
                secy->operational = tx_sa->active;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        ret = -EOPNOTSUPP;
                        goto cleanup;
                }

                ctx.sa.assoc_num = assoc_num;
                ctx.sa.tx_sa = tx_sa;
                /* True only when a PN was supplied and the saved previous
                 * PN was non-zero.
                 */
                ctx.sa.update_pn = !!prev_pn.full64;
                ctx.secy = secy;

                ret = macsec_offload(ops->mdo_upd_txsa, &ctx);
                if (ret)
                        goto cleanup;
        }

        rtnl_unlock();

        return 0;

cleanup:
        /* Roll back everything that was changed before the offload call. */
        if (tb_sa[MACSEC_SA_ATTR_PN]) {
                spin_lock_bh(&tx_sa->lock);
                tx_sa->next_pn_halves = prev_pn;
                spin_unlock_bh(&tx_sa->lock);
        }
        tx_sa->active = was_active;
        secy->operational = was_operational;
        rtnl_unlock();
        return ret;
}

static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr **attrs = info->attrs;
        struct net_device *dev;
        struct macsec_secy *secy;
        struct macsec_rx_sc *rx_sc;
        struct macsec_rx_sa *rx_sa;
        u8 assoc_num;
        struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
        struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
        bool was_active;
        pn_t prev_pn;
        int ret = 0;

        prev_pn.full64 = 0;

        if (!attrs[MACSEC_ATTR_IFINDEX])
                return -EINVAL;

        if (parse_rxsc_config(attrs, tb_rxsc))
                return -EINVAL;

        if (parse_sa_config(attrs, tb_sa))
                return -EINVAL;

        if (!validate_upd_sa(tb_sa))
                return -EINVAL;

        rtnl_lock();
        rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa,
                                 &dev, &secy, &rx_sc, &assoc_num);
        if (IS_ERR(rx_sa)) {
                rtnl_unlock();
                return PTR_ERR(rx_sa);
        }

        if (tb_sa[MACSEC_SA_ATTR_PN]) {
                int pn_len;

                pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN;
                if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) {
                        pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n",
                                  nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len);
                        rtnl_unlock();
                        return -EINVAL;
                }

                spin_lock_bh(&rx_sa->lock);
                prev_pn = rx_sa->next_pn_halves;
                rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]);
                spin_unlock_bh(&rx_sa->lock);
        }

        was_active = rx_sa->active;
        if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
                rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        ret = -EOPNOTSUPP;
                        goto cleanup;
                }

                ctx.sa.assoc_num = assoc_num;
                ctx.sa.rx_sa = rx_sa;
                ctx.sa.update_pn = !!prev_pn.full64;
                ctx.secy = secy;

                ret = macsec_offload(ops->mdo_upd_rxsa, &ctx);
                if (ret)
                        goto cleanup;
        }

        rtnl_unlock();
        return 0;

cleanup:
        if (tb_sa[MACSEC_SA_ATTR_PN]) {
                spin_lock_bh(&rx_sa->lock);
                rx_sa->next_pn_halves = prev_pn;
                spin_unlock_bh(&rx_sa->lock);
        }
        rx_sa->active = was_active;
        rtnl_unlock();
        return ret;
}

/* Generic-netlink handler: update an existing receive SC.
 *
 * Only the active flag (MACSEC_RXSC_ATTR_ACTIVE) is applied here; toggling
 * it also adjusts secy->n_rx_sc.  When the device is offloaded the change is
 * forwarded through mdo_upd_rxsc, and both the flag and the counter are
 * restored if that step fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
        struct nlattr **attrs = info->attrs;
        struct macsec_rx_sc *rx_sc;
        struct macsec_secy *secy;
        struct net_device *dev;
        unsigned int old_n_rx_sc;
        bool old_active;
        int ret = 0;

        if (!attrs[MACSEC_ATTR_IFINDEX] ||
            parse_rxsc_config(attrs, tb_rxsc) ||
            !validate_add_rxsc(tb_rxsc))
                return -EINVAL;

        rtnl_lock();

        rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy);
        if (IS_ERR(rx_sc)) {
                ret = PTR_ERR(rx_sc);
                goto unlock;
        }

        /* Snapshot what a failed offload would need to restore. */
        old_active = rx_sc->active;
        old_n_rx_sc = secy->n_rx_sc;

        if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) {
                bool activate = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]);

                if (rx_sc->active != activate) {
                        if (activate)
                                secy->n_rx_sc++;
                        else
                                secy->n_rx_sc--;
                }

                rx_sc->active = activate;
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(netdev_priv(dev))) {
                struct macsec_context ctx;
                const struct macsec_ops *ops;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (ops) {
                        ctx.secy = secy;
                        ctx.rx_sc = rx_sc;

                        ret = macsec_offload(ops->mdo_upd_rxsc, &ctx);
                } else {
                        ret = -EOPNOTSUPP;
                }
        }

        if (ret) {
                /* Offload failed: roll the software state back. */
                secy->n_rx_sc = old_n_rx_sc;
                rx_sc->active = old_active;
        }

unlock:
        rtnl_unlock();
        return ret;
}

/* Report whether @macsec already carries any configuration: true when the
 * SecY has an RX SC pointer set or any of the MACSEC_NUM_AN TX SA slots is
 * populated, false otherwise.
 */
static bool macsec_is_configured(struct macsec_dev *macsec)
{
        struct macsec_secy *secy = &macsec->secy;
        int an = 0;

        if (secy->rx_sc)
                return true;

        while (an < MACSEC_NUM_AN) {
                if (secy->tx_sc.sa[an])
                        return true;
                an++;
        }

        return false;
}

/* A TX tag is needed only when the device uses PHY offload and the ops
 * provide an mdo_insert_tx_tag callback.
 */
static bool macsec_needs_tx_tag(struct macsec_dev *macsec,
                                const struct macsec_ops *ops)
{
        bool phy_offload = macsec->offload == MACSEC_OFFLOAD_PHY;

        return phy_offload && ops->mdo_insert_tx_tag;
}

/* Recompute @dev's needed head/tail room as the lower device's requirement
 * plus the MACsec overhead.  The overhead comes from the offload ops when
 * macsec_get_ops() returns some, otherwise from the MACSEC_NEEDED_* defaults.
 */
static void macsec_set_head_tail_room(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        const struct macsec_ops *ops = macsec_get_ops(macsec, NULL);
        struct net_device *lower = macsec->real_dev;
        int extra_head = MACSEC_NEEDED_HEADROOM;
        int extra_tail = MACSEC_NEEDED_TAILROOM;

        if (ops) {
                extra_head = ops->needed_headroom;
                extra_tail = ops->needed_tailroom;
        }

        dev->needed_headroom = lower->needed_headroom + extra_head;
        dev->needed_tailroom = lower->needed_tailroom + extra_tail;
}

/* Copy the lower device's TSO limits onto @dev, but only while MACsec is
 * offloaded; without offload the lower device's capabilities are ignored.
 */
static void macsec_inherit_tso_max(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);

        if (!macsec_is_offloaded(macsec))
                return;

        netif_inherit_tso_max(dev, macsec->real_dev);
}

static int macsec_update_offload(struct net_device *dev, enum macsec_offload offload)
{
        enum macsec_offload prev_offload;
        const struct macsec_ops *ops;
        struct macsec_context ctx;
        struct macsec_dev *macsec;
        int ret = 0;

        macsec = macsec_priv(dev);

        /* Check if the offloading mode is supported by the underlying layers */
        if (offload != MACSEC_OFFLOAD_OFF &&
            !macsec_check_offload(offload, macsec))
                return -EOPNOTSUPP;

        /* Check if the net device is busy. */
        if (netif_running(dev))
                return -EBUSY;

        /* Check if the device already has rules configured: we do not support
         * rules migration.
         */
        if (macsec_is_configured(macsec))
                return -EBUSY;

        prev_offload = macsec->offload;

        ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? prev_offload : offload,
                               macsec, &ctx);
        if (!ops)
                return -EOPNOTSUPP;

        macsec->offload = offload;

        ctx.secy = &macsec->secy;
        ret = offload == MACSEC_OFFLOAD_OFF ? macsec_offload(ops->mdo_del_secy, &ctx)
                                            : macsec_offload(ops->mdo_add_secy, &ctx);
        if (ret) {
                macsec->offload = prev_offload;
                return ret;
        }

        macsec_set_head_tail_room(dev);
        macsec->insert_tx_tag = macsec_needs_tx_tag(macsec, ops);

        macsec_inherit_tso_max(dev);

        netdev_update_features(dev);

        return ret;
}

/* Generic-netlink handler: change the offload mode of a MACsec device.
 *
 * Requires MACSEC_ATTR_IFINDEX and a nested MACSEC_ATTR_OFFLOAD carrying
 * MACSEC_OFFLOAD_ATTR_TYPE.  Nothing is done when the requested mode equals
 * the current one.  Returns 0 on success or a negative errno.
 */
static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info)
{
        struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1];
        struct nlattr **attrs = info->attrs;
        enum macsec_offload offload;
        struct macsec_dev *macsec;
        struct net_device *dev;
        int ret = 0;

        if (!attrs[MACSEC_ATTR_IFINDEX] || !attrs[MACSEC_ATTR_OFFLOAD])
                return -EINVAL;

        if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX,
                                        attrs[MACSEC_ATTR_OFFLOAD],
                                        macsec_genl_offload_policy, NULL))
                return -EINVAL;

        rtnl_lock();

        dev = get_dev_from_nl(genl_info_net(info), attrs);
        if (IS_ERR(dev)) {
                ret = PTR_ERR(dev);
        } else {
                macsec = macsec_priv(dev);

                if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) {
                        ret = -EINVAL;
                } else {
                        offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]);

                        /* Only act on an actual mode change. */
                        if (macsec->offload != offload)
                                ret = macsec_update_offload(dev, offload);
                }
        }

        rtnl_unlock();
        return ret;
}

static void get_tx_sa_stats(struct net_device *dev, int an,
                            struct macsec_tx_sa *tx_sa,
                            struct macsec_tx_sa_stats *sum)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        int cpu;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.sa.assoc_num = an;
                        ctx.sa.tx_sa = tx_sa;
                        ctx.stats.tx_sa_stats = sum;
                        ctx.secy = &macsec_priv(dev)->secy;
                        macsec_offload(ops->mdo_get_tx_sa_stats, &ctx);
                }
                return;
        }

        for_each_possible_cpu(cpu) {
                const struct macsec_tx_sa_stats *stats =
                        per_cpu_ptr(tx_sa->stats, cpu);

                sum->OutPktsProtected += stats->OutPktsProtected;
                sum->OutPktsEncrypted += stats->OutPktsEncrypted;
        }
}

/* Append the TX SA counters in @sum to @skb as u32 attributes.
 * Returns 0 on success, -EMSGSIZE as soon as one attribute does not fit.
 */
static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum)
{
        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED,
                        sum->OutPktsProtected))
                return -EMSGSIZE;

        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED,
                        sum->OutPktsEncrypted))
                return -EMSGSIZE;

        return 0;
}

static void get_rx_sa_stats(struct net_device *dev,
                            struct macsec_rx_sc *rx_sc, int an,
                            struct macsec_rx_sa *rx_sa,
                            struct macsec_rx_sa_stats *sum)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        int cpu;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.sa.assoc_num = an;
                        ctx.sa.rx_sa = rx_sa;
                        ctx.stats.rx_sa_stats = sum;
                        ctx.secy = &macsec_priv(dev)->secy;
                        ctx.rx_sc = rx_sc;
                        macsec_offload(ops->mdo_get_rx_sa_stats, &ctx);
                }
                return;
        }

        for_each_possible_cpu(cpu) {
                const struct macsec_rx_sa_stats *stats =
                        per_cpu_ptr(rx_sa->stats, cpu);

                sum->InPktsOK         += stats->InPktsOK;
                sum->InPktsInvalid    += stats->InPktsInvalid;
                sum->InPktsNotValid   += stats->InPktsNotValid;
                sum->InPktsNotUsingSA += stats->InPktsNotUsingSA;
                sum->InPktsUnusedSA   += stats->InPktsUnusedSA;
        }
}

/* Append the RX SA counters in @sum to @skb as u32 attributes.
 * Returns 0 on success, -EMSGSIZE as soon as one attribute does not fit.
 */
static int copy_rx_sa_stats(struct sk_buff *skb,
                            struct macsec_rx_sa_stats *sum)
{
        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK))
                return -EMSGSIZE;

        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID,
                        sum->InPktsInvalid))
                return -EMSGSIZE;

        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID,
                        sum->InPktsNotValid))
                return -EMSGSIZE;

        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA,
                        sum->InPktsNotUsingSA))
                return -EMSGSIZE;

        if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA,
                        sum->InPktsUnusedSA))
                return -EMSGSIZE;

        return 0;
}

static void get_rx_sc_stats(struct net_device *dev,
                            struct macsec_rx_sc *rx_sc,
                            struct macsec_rx_sc_stats *sum)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        int cpu;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.stats.rx_sc_stats = sum;
                        ctx.secy = &macsec_priv(dev)->secy;
                        ctx.rx_sc = rx_sc;
                        macsec_offload(ops->mdo_get_rx_sc_stats, &ctx);
                }
                return;
        }

        for_each_possible_cpu(cpu) {
                const struct pcpu_rx_sc_stats *stats;
                struct macsec_rx_sc_stats tmp;
                unsigned int start;

                stats = per_cpu_ptr(rx_sc->stats, cpu);
                do {
                        start = u64_stats_fetch_begin(&stats->syncp);
                        memcpy(&tmp, &stats->stats, sizeof(tmp));
                } while (u64_stats_fetch_retry(&stats->syncp, start));

                sum->InOctetsValidated += tmp.InOctetsValidated;
                sum->InOctetsDecrypted += tmp.InOctetsDecrypted;
                sum->InPktsUnchecked   += tmp.InPktsUnchecked;
                sum->InPktsDelayed     += tmp.InPktsDelayed;
                sum->InPktsOK          += tmp.InPktsOK;
                sum->InPktsInvalid     += tmp.InPktsInvalid;
                sum->InPktsLate        += tmp.InPktsLate;
                sum->InPktsNotValid    += tmp.InPktsNotValid;
                sum->InPktsNotUsingSA  += tmp.InPktsNotUsingSA;
                sum->InPktsUnusedSA    += tmp.InPktsUnusedSA;
        }
}

/* Append the RX SC counters in @sum to @skb as 64-bit attributes (padded
 * with MACSEC_RXSC_STATS_ATTR_PAD), in a fixed order.
 * Returns 0 on success, -EMSGSIZE as soon as one attribute does not fit.
 */
static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum)
{
        const struct {
                int attrtype;
                unsigned long long value;
        } counters[] = {
                { MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, sum->InOctetsValidated },
                { MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, sum->InOctetsDecrypted },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, sum->InPktsUnchecked },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, sum->InPktsDelayed },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, sum->InPktsLate },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA },
                { MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA },
        };
        unsigned int idx;

        for (idx = 0; idx < sizeof(counters) / sizeof(counters[0]); idx++) {
                if (nla_put_u64_64bit(skb, counters[idx].attrtype,
                                      counters[idx].value,
                                      MACSEC_RXSC_STATS_ATTR_PAD))
                        return -EMSGSIZE;
        }

        return 0;
}

static void get_tx_sc_stats(struct net_device *dev,
                            struct macsec_tx_sc_stats *sum)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        int cpu;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.stats.tx_sc_stats = sum;
                        ctx.secy = &macsec_priv(dev)->secy;
                        macsec_offload(ops->mdo_get_tx_sc_stats, &ctx);
                }
                return;
        }

        for_each_possible_cpu(cpu) {
                const struct pcpu_tx_sc_stats *stats;
                struct macsec_tx_sc_stats tmp;
                unsigned int start;

                stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu);
                do {
                        start = u64_stats_fetch_begin(&stats->syncp);
                        memcpy(&tmp, &stats->stats, sizeof(tmp));
                } while (u64_stats_fetch_retry(&stats->syncp, start));

                sum->OutPktsProtected   += tmp.OutPktsProtected;
                sum->OutPktsEncrypted   += tmp.OutPktsEncrypted;
                sum->OutOctetsProtected += tmp.OutOctetsProtected;
                sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted;
        }
}

/* Append the TX SC counters in @sum to @skb as 64-bit attributes (padded
 * with MACSEC_TXSC_STATS_ATTR_PAD).
 * Returns 0 on success, -EMSGSIZE as soon as one attribute does not fit.
 */
static int copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum)
{
        if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED,
                              sum->OutPktsProtected,
                              MACSEC_TXSC_STATS_ATTR_PAD))
                return -EMSGSIZE;

        if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED,
                              sum->OutPktsEncrypted,
                              MACSEC_TXSC_STATS_ATTR_PAD))
                return -EMSGSIZE;

        if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED,
                              sum->OutOctetsProtected,
                              MACSEC_TXSC_STATS_ATTR_PAD))
                return -EMSGSIZE;

        if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED,
                              sum->OutOctetsEncrypted,
                              MACSEC_TXSC_STATS_ATTR_PAD))
                return -EMSGSIZE;

        return 0;
}

static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        int cpu;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.stats.dev_stats = sum;
                        ctx.secy = &macsec_priv(dev)->secy;
                        macsec_offload(ops->mdo_get_dev_stats, &ctx);
                }
                return;
        }

        for_each_possible_cpu(cpu) {
                const struct pcpu_secy_stats *stats;
                struct macsec_dev_stats tmp;
                unsigned int start;

                stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu);
                do {
                        start = u64_stats_fetch_begin(&stats->syncp);
                        memcpy(&tmp, &stats->stats, sizeof(tmp));
                } while (u64_stats_fetch_retry(&stats->syncp, start));

                sum->OutPktsUntagged  += tmp.OutPktsUntagged;
                sum->InPktsUntagged   += tmp.InPktsUntagged;
                sum->OutPktsTooLong   += tmp.OutPktsTooLong;
                sum->InPktsNoTag      += tmp.InPktsNoTag;
                sum->InPktsBadTag     += tmp.InPktsBadTag;
                sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI;
                sum->InPktsNoSCI      += tmp.InPktsNoSCI;
                sum->InPktsOverrun    += tmp.InPktsOverrun;
        }
}

/* Append the SecY counters in @sum to @skb as 64-bit attributes (padded with
 * MACSEC_SECY_STATS_ATTR_PAD), in a fixed order.
 * Returns 0 on success, -EMSGSIZE as soon as one attribute does not fit.
 */
static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum)
{
        const struct {
                int attrtype;
                unsigned long long value;
        } counters[] = {
                { MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, sum->OutPktsUntagged },
                { MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, sum->InPktsUntagged },
                { MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, sum->OutPktsTooLong },
                { MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, sum->InPktsNoTag },
                { MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, sum->InPktsBadTag },
                { MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, sum->InPktsUnknownSCI },
                { MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, sum->InPktsNoSCI },
                { MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, sum->InPktsOverrun },
        };
        unsigned int idx;

        for (idx = 0; idx < sizeof(counters) / sizeof(counters[0]); idx++) {
                if (nla_put_u64_64bit(skb, counters[idx].attrtype,
                                      counters[idx].value,
                                      MACSEC_SECY_STATS_ATTR_PAD))
                        return -EMSGSIZE;
        }

        return 0;
}

static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb)
{
        struct macsec_tx_sc *tx_sc = &secy->tx_sc;
        struct nlattr *secy_nest = nla_nest_start_noflag(skb,
                                                         MACSEC_ATTR_SECY);
        u64 csid;

        if (!secy_nest)
                return 1;

        switch (secy->key_len) {
        case MACSEC_GCM_AES_128_SAK_LEN:
                csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID;
                break;
        case MACSEC_GCM_AES_256_SAK_LEN:
                csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256;
                break;
        default:
                goto cancel;
        }

        if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci,
                        MACSEC_SECY_ATTR_PAD) ||
            nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE,
                              csid, MACSEC_SECY_ATTR_PAD) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) ||
            nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa))
                goto cancel;

        if (secy->replay_protect) {
                if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window))
                        goto cancel;
        }

        nla_nest_end(skb, secy_nest);
        return 0;

cancel:
        nla_nest_cancel(skb, secy_nest);
        return 1;
}

/* Build one NLM_F_MULTI MACSEC_CMD_GET_TXSC message describing @secy (the
 * SecY of @dev) into @skb, as part of the dump driven by @cb.
 *
 * Message layout, in order:
 *   - MACSEC_ATTR_IFINDEX
 *   - MACSEC_ATTR_OFFLOAD nest (offload type)
 *   - MACSEC_ATTR_SECY nest (see nla_put_secy())
 *   - MACSEC_ATTR_TXSC_STATS and MACSEC_ATTR_SECY_STATS nests
 *   - MACSEC_ATTR_TXSA_LIST: one nest per populated TX SA slot
 *   - MACSEC_ATTR_RXSC_LIST: one nest per RX SC, each holding its own stats
 *     and a MACSEC_RXSC_ATTR_SA_LIST with one nest per populated RX SA slot
 *
 * SAs and SCs are looked up with the *_rtnl accessors, so the caller is
 * expected to hold RTNL.
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or any attribute
 * does not fit; in that case the partially built message is removed with
 * genlmsg_cancel().
 */
static noinline_for_stack int
dump_secy(struct macsec_secy *secy, struct net_device *dev,
          struct sk_buff *skb, struct netlink_callback *cb)
{
        struct macsec_tx_sc_stats tx_sc_stats = {0, };
        struct macsec_tx_sa_stats tx_sa_stats = {0, };
        struct macsec_rx_sc_stats rx_sc_stats = {0, };
        struct macsec_rx_sa_stats rx_sa_stats = {0, };
        struct macsec_dev *macsec = netdev_priv(dev);
        struct macsec_dev_stats dev_stats = {0, };
        struct macsec_tx_sc *tx_sc = &secy->tx_sc;
        struct nlattr *txsa_list, *rxsc_list;
        struct macsec_rx_sc *rx_sc;
        struct nlattr *attr;
        void *hdr;
        int i, j;

        hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
                          &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC);
        if (!hdr)
                return -EMSGSIZE;

        genl_dump_check_consistent(cb, hdr);

        if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex))
                goto nla_put_failure;

        /* Current offload mode of the device. */
        attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD);
        if (!attr)
                goto nla_put_failure;
        if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload))
                goto nla_put_failure;
        nla_nest_end(skb, attr);

        if (nla_put_secy(secy, skb))
                goto nla_put_failure;

        /* TX SC statistics. */
        attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS);
        if (!attr)
                goto nla_put_failure;

        get_tx_sc_stats(dev, &tx_sc_stats);
        if (copy_tx_sc_stats(skb, &tx_sc_stats)) {
                nla_nest_cancel(skb, attr);
                goto nla_put_failure;
        }
        nla_nest_end(skb, attr);

        /* Device-level (SecY) statistics. */
        attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS);
        if (!attr)
                goto nla_put_failure;
        get_secy_stats(dev, &dev_stats);
        if (copy_secy_stats(skb, &dev_stats)) {
                nla_nest_cancel(skb, attr);
                goto nla_put_failure;
        }
        nla_nest_end(skb, attr);

        /* TX SA list.  Each entry's nest type is a running index (j, starting
         * at 1) counting populated slots; the association number itself is
         * carried inside the entry as MACSEC_SA_ATTR_AN.
         */
        txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST);
        if (!txsa_list)
                goto nla_put_failure;
        for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) {
                struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]);
                struct nlattr *txsa_nest;
                u64 pn;
                int pn_len;

                if (!tx_sa)
                        continue;

                txsa_nest = nla_nest_start_noflag(skb, j++);
                if (!txsa_nest) {
                        nla_nest_cancel(skb, txsa_list);
                        goto nla_put_failure;
                }

                attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS);
                if (!attr) {
                        nla_nest_cancel(skb, txsa_nest);
                        nla_nest_cancel(skb, txsa_list);
                        goto nla_put_failure;
                }
                /* The stats buffer is reused across SAs, so clear it first. */
                memset(&tx_sa_stats, 0, sizeof(tx_sa_stats));
                get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats);
                if (copy_tx_sa_stats(skb, &tx_sa_stats)) {
                        nla_nest_cancel(skb, attr);
                        nla_nest_cancel(skb, txsa_nest);
                        nla_nest_cancel(skb, txsa_list);
                        goto nla_put_failure;
                }
                nla_nest_end(skb, attr);

                /* With XPN the full next_pn is reported; otherwise only its
                 * lower half, with the shorter default PN length.
                 */
                if (secy->xpn) {
                        pn = tx_sa->next_pn;
                        pn_len = MACSEC_XPN_PN_LEN;
                } else {
                        pn = tx_sa->next_pn_halves.lower;
                        pn_len = MACSEC_DEFAULT_PN_LEN;
                }

                if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) ||
                    nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) ||
                    nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) ||
                    (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) ||
                    nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) {
                        nla_nest_cancel(skb, txsa_nest);
                        nla_nest_cancel(skb, txsa_list);
                        goto nla_put_failure;
                }

                nla_nest_end(skb, txsa_nest);
        }
        nla_nest_end(skb, txsa_list);

        /* RX SC list; j is reused as the running 1-based nest index. */
        rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST);
        if (!rxsc_list)
                goto nla_put_failure;

        j = 1;
        for_each_rxsc_rtnl(secy, rx_sc) {
                int k;
                struct nlattr *rxsa_list;
                struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++);

                if (!rxsc_nest) {
                        nla_nest_cancel(skb, rxsc_list);
                        goto nla_put_failure;
                }

                if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) ||
                    nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci,
                                MACSEC_RXSC_ATTR_PAD)) {
                        nla_nest_cancel(skb, rxsc_nest);
                        nla_nest_cancel(skb, rxsc_list);
                        goto nla_put_failure;
                }

                attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS);
                if (!attr) {
                        nla_nest_cancel(skb, rxsc_nest);
                        nla_nest_cancel(skb, rxsc_list);
                        goto nla_put_failure;
                }
                /* The stats buffer is reused across SCs, so clear it first. */
                memset(&rx_sc_stats, 0, sizeof(rx_sc_stats));
                get_rx_sc_stats(dev, rx_sc, &rx_sc_stats);
                if (copy_rx_sc_stats(skb, &rx_sc_stats)) {
                        nla_nest_cancel(skb, attr);
                        nla_nest_cancel(skb, rxsc_nest);
                        nla_nest_cancel(skb, rxsc_list);
                        goto nla_put_failure;
                }
                nla_nest_end(skb, attr);

                rxsa_list = nla_nest_start_noflag(skb,
                                                  MACSEC_RXSC_ATTR_SA_LIST);
                if (!rxsa_list) {
                        nla_nest_cancel(skb, rxsc_nest);
                        nla_nest_cancel(skb, rxsc_list);
                        goto nla_put_failure;
                }

                /* RX SAs of this SC; k is the running 1-based nest index.
                 *
                 * NOTE(review): unlike the TX SA loop, the failure paths
                 * below do not cancel every open nest individually (e.g.
                 * rxsa_nest is skipped when the stats nest fails).  This is
                 * presumably harmless because the enclosing nests and the
                 * whole message are cancelled anyway -- confirm.
                 */
                for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) {
                        struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]);
                        struct nlattr *rxsa_nest;
                        u64 pn;
                        int pn_len;

                        if (!rx_sa)
                                continue;

                        rxsa_nest = nla_nest_start_noflag(skb, k++);
                        if (!rxsa_nest) {
                                nla_nest_cancel(skb, rxsa_list);
                                nla_nest_cancel(skb, rxsc_nest);
                                nla_nest_cancel(skb, rxsc_list);
                                goto nla_put_failure;
                        }

                        attr = nla_nest_start_noflag(skb,
                                                     MACSEC_SA_ATTR_STATS);
                        if (!attr) {
                                nla_nest_cancel(skb, rxsa_list);
                                nla_nest_cancel(skb, rxsc_nest);
                                nla_nest_cancel(skb, rxsc_list);
                                goto nla_put_failure;
                        }
                        /* The stats buffer is reused across SAs, so clear it first. */
                        memset(&rx_sa_stats, 0, sizeof(rx_sa_stats));
                        get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats);
                        if (copy_rx_sa_stats(skb, &rx_sa_stats)) {
                                nla_nest_cancel(skb, attr);
                                nla_nest_cancel(skb, rxsa_list);
                                nla_nest_cancel(skb, rxsc_nest);
                                nla_nest_cancel(skb, rxsc_list);
                                goto nla_put_failure;
                        }
                        nla_nest_end(skb, attr);

                        /* Same PN width selection as for TX SAs. */
                        if (secy->xpn) {
                                pn = rx_sa->next_pn;
                                pn_len = MACSEC_XPN_PN_LEN;
                        } else {
                                pn = rx_sa->next_pn_halves.lower;
                                pn_len = MACSEC_DEFAULT_PN_LEN;
                        }

                        if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) ||
                            nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) ||
                            nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) ||
                            (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) ||
                            nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) {
                                nla_nest_cancel(skb, rxsa_nest);
                                nla_nest_cancel(skb, rxsc_nest);
                                nla_nest_cancel(skb, rxsc_list);
                                goto nla_put_failure;
                        }
                        nla_nest_end(skb, rxsa_nest);
                }

                nla_nest_end(skb, rxsa_list);
                nla_nest_end(skb, rxsc_nest);
        }

        nla_nest_end(skb, rxsc_list);

        genlmsg_end(skb, hdr);

        return 0;

nla_put_failure:
        /* Drop everything written for this message, header included. */
        genlmsg_cancel(skb, hdr);
        return -EMSGSIZE;
}

/* Generation counter: copied into cb->seq by macsec_dump_txsc() and
 * incremented each time a macsec link is created or deleted.
 */
static int macsec_generation = 1; /* protected by RTNL */

/* Dump handler for MACSEC_CMD_GET_TXSC.
 *
 * Walks every netdev in the socket's namespace under RTNL and emits one
 * dump_secy() record per macsec device. cb->args[0] holds the index of the
 * first device that still has to be dumped, so an interrupted dump resumes
 * where it stopped.
 *
 * Returns the current length of @skb.
 */
static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int resume_idx = cb->args[0];
        struct net_device *dev;
        int idx = 0;

        rtnl_lock();

        cb->seq = macsec_generation;

        for_each_netdev(net, dev) {
                /* Only devices at or past the resume point that are macsec
                 * interfaces get dumped; everything else is just counted.
                 */
                if (idx >= resume_idx && netif_is_macsec(dev)) {
                        struct macsec_secy *secy = &macsec_priv(dev)->secy;

                        /* Stop without advancing idx so this device is
                         * retried on the next call.
                         */
                        if (dump_secy(secy, dev, skb, cb) < 0)
                                goto out;
                }

                idx++;
        }

out:
        rtnl_unlock();
        cb->args[0] = idx;
        return skb->len;
}

/* Generic netlink operations of the macsec family.
 *
 * MACSEC_CMD_GET_TXSC is the only dump operation and the only entry without
 * GENL_ADMIN_PERM; every other command is a doit handler flagged
 * GENL_ADMIN_PERM. All entries opt out of strict and dump validation.
 */
static const struct genl_small_ops macsec_genl_ops[] = {
        {
                .cmd = MACSEC_CMD_GET_TXSC,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .dumpit = macsec_dump_txsc,
        },
        {
                .cmd = MACSEC_CMD_ADD_RXSC,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_add_rxsc,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_DEL_RXSC,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_del_rxsc,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_UPD_RXSC,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_upd_rxsc,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_ADD_TXSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_add_txsa,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_DEL_TXSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_del_txsa,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_UPD_TXSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_upd_txsa,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_ADD_RXSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_add_rxsa,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_DEL_RXSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_del_rxsa,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_UPD_RXSA,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_upd_rxsa,
                .flags = GENL_ADMIN_PERM,
        },
        {
                .cmd = MACSEC_CMD_UPD_OFFLOAD,
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = macsec_upd_offload,
                .flags = GENL_ADMIN_PERM,
        },
};

/* Generic netlink family descriptor: ties macsec_genl_ops and
 * macsec_genl_policy to the MACSEC_GENL_NAME family. The family is usable
 * from any network namespace (netnsok).
 */
static struct genl_family macsec_fam __ro_after_init = {
        .name                = MACSEC_GENL_NAME,
        .hdrsize        = 0,
        .version        = MACSEC_GENL_VERSION,
        .maxattr        = MACSEC_ATTR_MAX,
        .policy = macsec_genl_policy,
        .netnsok        = true,
        .module                = THIS_MODULE,
        .small_ops        = macsec_genl_ops,
        .n_small_ops        = ARRAY_SIZE(macsec_genl_ops),
        .resv_start_op        = MACSEC_CMD_UPD_OFFLOAD + 1,
};

/* Let the offloading device insert its TX tag into @skb.
 *
 * The frame is rejected with -EINVAL when its payload plus the head/tail room
 * requested by the offload ops would exceed the real device's MTU. Otherwise
 * the skb is made writable and handed to ops->mdo_insert_tx_tag().
 *
 * Returns @skb on success. On any failure the skb is freed and an ERR_PTR()
 * carrying the error is returned.
 */
static struct sk_buff *macsec_insert_tx_tag(struct sk_buff *skb,
                                            struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        const struct macsec_ops *ops;
        struct macsec_context ctx;
        struct phy_device *phy;
        int tagged_len;
        int ret;

        ops = macsec_get_ops(macsec, &ctx);

        /* Size of the frame (without Ethernet header) once tagged. */
        tagged_len = skb->len - ETH_HLEN + ops->needed_headroom +
                     ops->needed_tailroom;
        if (unlikely(tagged_len > macsec->real_dev->mtu)) {
                ret = -EINVAL;
                goto drop;
        }

        phy = macsec->real_dev->phydev;

        ret = skb_ensure_writable_head_tail(skb, dev);
        if (unlikely(ret < 0))
                goto drop;

        ret = ops->mdo_insert_tx_tag(phy, skb);
        if (unlikely(ret))
                goto drop;

        return skb;

drop:
        kfree_skb(skb);
        return ERR_PTR(ret);
}

/* ndo_start_xmit handler.
 *
 * Three transmit paths, in order of precedence:
 *  - offloaded device: attach the TX SC metadata dst to the skb, optionally
 *    let the device insert its TX tag, then queue on the real device;
 *  - protect_frames disabled: count the frame as OutPktsUntagged and queue
 *    it on the real device as is;
 *  - software path: drop when the SecY is not operational, otherwise
 *    encrypt via macsec_encrypt() and queue the result.
 *
 * Returns NETDEV_TX_OK when the frame was dropped or is handled
 * asynchronously, otherwise the dev_queue_xmit() result.
 */
static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct macsec_dev *macsec = netdev_priv(dev);
        struct macsec_secy *secy = &macsec->secy;
        struct pcpu_secy_stats *secy_stats;
        int ret, len;

        if (macsec_is_offloaded(netdev_priv(dev))) {
                struct metadata_dst *md_dst = secy->tx_sc.md_dst;

                /* Replace whatever dst the skb carried with the TX SC's
                 * metadata dst, taking a reference for the skb.
                 */
                skb_dst_drop(skb);
                dst_hold(&md_dst->dst);
                skb_dst_set(skb, &md_dst->dst);

                if (macsec->insert_tx_tag) {
                        /* On failure the helper has already freed the skb. */
                        skb = macsec_insert_tx_tag(skb, dev);
                        if (IS_ERR(skb)) {
                                DEV_STATS_INC(dev, tx_dropped);
                                return NETDEV_TX_OK;
                        }
                }

                skb->dev = macsec->real_dev;
                return dev_queue_xmit(skb);
        }

        /* 10.5 */
        if (!secy->protect_frames) {
                secy_stats = this_cpu_ptr(macsec->stats);
                u64_stats_update_begin(&secy_stats->syncp);
                secy_stats->stats.OutPktsUntagged++;
                u64_stats_update_end(&secy_stats->syncp);
                skb->dev = macsec->real_dev;
                /* Sample the length before the skb is handed off. */
                len = skb->len;
                ret = dev_queue_xmit(skb);
                count_tx(dev, ret, len);
                return ret;
        }

        if (!secy->operational) {
                kfree_skb(skb);
                DEV_STATS_INC(dev, tx_dropped);
                return NETDEV_TX_OK;
        }

        len = skb->len;
        skb = macsec_encrypt(skb, dev);
        if (IS_ERR(skb)) {
                /* -EINPROGRESS is not accounted as a drop. */
                if (PTR_ERR(skb) != -EINPROGRESS)
                        DEV_STATS_INC(dev, tx_dropped);
                return NETDEV_TX_OK;
        }

        macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa);

        macsec_encrypt_finish(skb, dev);
        ret = dev_queue_xmit(skb);
        count_tx(dev, ret, len);
        return ret;
}

/* Mask applied to the real device's features by macsec_fix_features() when
 * the macsec device is not offloaded.
 */
#define MACSEC_FEATURES \
        (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST)

/* Mask applied to the real device's features in macsec_dev_init() and, for
 * offloaded devices, in macsec_fix_features().
 */
#define MACSEC_OFFLOAD_FEATURES \
        (MACSEC_FEATURES | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES | \
         NETIF_F_LRO | NETIF_F_RXHASH | NETIF_F_CSUM_MASK | NETIF_F_RXCSUM)

/* ndo_init handler: set up GRO cells, derive the feature sets from the
 * underlying device, inherit unset addresses from it and take a tracked
 * reference on it.
 *
 * Returns 0 on success or the error reported by gro_cells_init().
 */
static int macsec_dev_init(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct net_device *lower = macsec->real_dev;
        int ret;

        ret = gro_cells_init(&macsec->gro_cells, dev);
        if (ret)
                return ret;

        macsec_inherit_tso_max(dev);

        /* Start from what the lower device offers, limited to the offload
         * feature mask, and always allow software GSO.
         */
        dev->hw_features = (lower->hw_features & MACSEC_OFFLOAD_FEATURES) |
                           NETIF_F_GSO_SOFTWARE;
        dev->features = (lower->features & MACSEC_OFFLOAD_FEATURES) |
                        NETIF_F_GSO_SOFTWARE;
        dev->lltx = true;
        dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;

        macsec_set_head_tail_room(dev);

        /* Addresses still at zero are taken over from the lower device. */
        if (is_zero_ether_addr(dev->dev_addr))
                eth_hw_addr_inherit(dev, lower);
        if (is_zero_ether_addr(dev->broadcast))
                memcpy(dev->broadcast, lower->broadcast, dev->addr_len);

        /* Get macsec's reference to real_dev */
        netdev_hold(lower, &macsec->dev_tracker, GFP_KERNEL);

        return 0;
}

static void macsec_dev_uninit(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);

        gro_cells_destroy(&macsec->gro_cells);
}

/* ndo_fix_features handler.
 *
 * Restricts @features to what the real device currently has enabled (within
 * the mask that applies to the offload mode), plus the software GSO and soft
 * features, which are always permitted.
 *
 * Returns the filtered feature set.
 */
static netdev_features_t macsec_fix_features(struct net_device *dev,
                                             netdev_features_t features)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        netdev_features_t allowed;

        if (macsec_is_offloaded(macsec))
                allowed = MACSEC_OFFLOAD_FEATURES;
        else
                allowed = MACSEC_FEATURES;

        allowed &= macsec->real_dev->features;
        allowed |= NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES;

        return features & allowed;
}

/* ndo_open handler.
 *
 * Registers the macsec device's MAC address on the real device, mirrors the
 * IFF_ALLMULTI / IFF_PROMISC flags onto it and, for offloaded devices, calls
 * the driver's mdo_dev_open. The carrier is switched on only when the real
 * device reports carrier.
 *
 * Every step taken on the real device is undone, in reverse order, when a
 * later step fails, including the promiscuity reference when the offload
 * step fails.
 *
 * Returns 0 on success, -EOPNOTSUPP when the device is offloaded but no
 * offload ops are found, or the error of the failing step.
 */
static int macsec_dev_open(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct net_device *real_dev = macsec->real_dev;
        int err;

        err = dev_uc_add(real_dev, dev->dev_addr);
        if (err < 0)
                return err;

        if (dev->flags & IFF_ALLMULTI) {
                err = dev_set_allmulti(real_dev, 1);
                if (err < 0)
                        goto del_unicast;
        }

        if (dev->flags & IFF_PROMISC) {
                err = dev_set_promiscuity(real_dev, 1);
                if (err < 0)
                        goto clear_allmulti;
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        err = -EOPNOTSUPP;
                        goto clear_promisc;
                }

                ctx.secy = &macsec->secy;
                err = macsec_offload(ops->mdo_dev_open, &ctx);
                if (err)
                        goto clear_promisc;
        }

        if (netif_carrier_ok(real_dev))
                netif_carrier_on(dev);

        return 0;

        /* Unwind in reverse order of acquisition. */
clear_promisc:
        if (dev->flags & IFF_PROMISC)
                dev_set_promiscuity(real_dev, -1);
clear_allmulti:
        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(real_dev, -1);
del_unicast:
        dev_uc_del(real_dev, dev->dev_addr);
        netif_carrier_off(dev);
        return err;
}

/* ndo_stop handler.
 *
 * Drops the carrier, notifies an offloading driver through mdo_dev_stop
 * (its result is ignored), then removes everything this device put on the
 * real device: synced address lists, allmulti/promiscuity references and the
 * unicast MAC address.
 *
 * Always returns 0.
 */
static int macsec_dev_stop(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct net_device *lower = macsec->real_dev;
        const struct macsec_ops *ops;
        struct macsec_context ctx;

        netif_carrier_off(dev);

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.secy = &macsec->secy;
                        macsec_offload(ops->mdo_dev_stop, &ctx);
                }
        }

        dev_mc_unsync(lower, dev);
        dev_uc_unsync(lower, dev);

        if (dev->flags & IFF_ALLMULTI)
                dev_set_allmulti(lower, -1);

        if (dev->flags & IFF_PROMISC)
                dev_set_promiscuity(lower, -1);

        dev_uc_del(lower, dev->dev_addr);

        return 0;
}

/* ndo_change_rx_flags handler.
 *
 * While the interface is up, forwards changes of IFF_ALLMULTI and
 * IFF_PROMISC to the real device: +1 when the flag is now set on @dev,
 * -1 when it was cleared. Does nothing while the interface is down.
 */
static void macsec_dev_change_rx_flags(struct net_device *dev, int change)
{
        struct net_device *lower = macsec_priv(dev)->real_dev;

        if (!(dev->flags & IFF_UP))
                return;

        if (change & IFF_ALLMULTI) {
                int delta = (dev->flags & IFF_ALLMULTI) ? 1 : -1;

                dev_set_allmulti(lower, delta);
        }

        if (change & IFF_PROMISC) {
                int delta = (dev->flags & IFF_PROMISC) ? 1 : -1;

                dev_set_promiscuity(lower, delta);
        }
}

/* ndo_set_rx_mode handler: sync this device's multicast and unicast address
 * lists to the real device.
 */
static void macsec_dev_set_rx_mode(struct net_device *dev)
{
        struct net_device *lower = macsec_priv(dev)->real_dev;

        dev_mc_sync(lower, dev);
        dev_uc_sync(lower, dev);
}

/* ndo_set_mac_address handler.
 *
 * Validates the new address, adds it to the real device's unicast list when
 * the interface is up, installs it on @dev and, for offloaded devices, pushes
 * the updated SecY to the driver through mdo_upd_secy. If the offload step
 * fails, the new unicast entry is removed again and the previous address is
 * restored on @dev.
 *
 * Returns 0 on success, -EADDRNOTAVAIL for an invalid Ethernet address,
 * -EOPNOTSUPP when the device is offloaded but no offload ops are found, or
 * the error from dev_uc_add() / macsec_offload().
 */
static int macsec_set_mac_address(struct net_device *dev, void *p)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct net_device *real_dev = macsec->real_dev;
        struct sockaddr *addr = p;
        u8  old_addr[ETH_ALEN];
        int err;

        if (!is_valid_ether_addr(addr->sa_data))
                return -EADDRNOTAVAIL;

        /* Add the new address before touching the old one, so a failure
         * here leaves everything unchanged.
         */
        if (dev->flags & IFF_UP) {
                err = dev_uc_add(real_dev, addr->sa_data);
                if (err < 0)
                        return err;
        }

        /* Remember the current address for the rollback path. */
        ether_addr_copy(old_addr, dev->dev_addr);
        eth_hw_addr_set(dev, addr->sa_data);

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (!ops) {
                        err = -EOPNOTSUPP;
                        goto restore_old_addr;
                }

                ctx.secy = &macsec->secy;
                err = macsec_offload(ops->mdo_upd_secy, &ctx);
                if (err)
                        goto restore_old_addr;
        }

        /* Success: the old address is no longer needed on the real device. */
        if (dev->flags & IFF_UP)
                dev_uc_del(real_dev, old_addr);

        return 0;

restore_old_addr:
        if (dev->flags & IFF_UP)
                dev_uc_del(real_dev, addr->sa_data);

        eth_hw_addr_set(dev, old_addr);

        return err;
}

/* ndo_change_mtu handler.
 *
 * The macsec MTU may not exceed the real device's MTU minus the MACsec
 * overhead (ICV length plus macsec_extra_len(true)). When the real device's
 * MTU is smaller than that overhead the limit is clamped to 0 rather than
 * wrapping around in unsigned arithmetic, which is also how macsec_newlink()
 * treats the same situation.
 *
 * Returns 0 on success or -ERANGE when @new_mtu is negative or above the
 * limit.
 */
static int macsec_change_mtu(struct net_device *dev, int new_mtu)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true);
        unsigned int real_mtu = macsec->real_dev->mtu;
        unsigned int limit = real_mtu > extra ? real_mtu - extra : 0;

        if (new_mtu < 0 || (unsigned int)new_mtu > limit)
                return -ERANGE;

        WRITE_ONCE(dev->mtu, new_mtu);

        return 0;
}

/* ndo_get_stats64 handler.
 *
 * Fills @s from the per-cpu software counters in dev->tstats and then
 * overrides rx_dropped, tx_dropped and rx_errors with the device-level
 * counters. Leaves @s untouched when dev->tstats is not set.
 */
static void macsec_get_stats64(struct net_device *dev,
                               struct rtnl_link_stats64 *s)
{
        if (dev->tstats) {
                dev_fetch_sw_netstats(s, dev->tstats);

                s->rx_dropped = DEV_STATS_READ(dev, rx_dropped);
                s->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
                s->rx_errors = DEV_STATS_READ(dev, rx_errors);
        }
}

static int macsec_get_iflink(const struct net_device *dev)
{
        return READ_ONCE(macsec_priv(dev)->real_dev->ifindex);
}

/* net_device_ops of a macsec interface; installed on the device by
 * macsec_setup().
 */
static const struct net_device_ops macsec_netdev_ops = {
        .ndo_init                = macsec_dev_init,
        .ndo_uninit                = macsec_dev_uninit,
        .ndo_open                = macsec_dev_open,
        .ndo_stop                = macsec_dev_stop,
        .ndo_fix_features        = macsec_fix_features,
        .ndo_change_mtu                = macsec_change_mtu,
        .ndo_set_rx_mode        = macsec_dev_set_rx_mode,
        .ndo_change_rx_flags        = macsec_dev_change_rx_flags,
        .ndo_set_mac_address        = macsec_set_mac_address,
        .ndo_start_xmit                = macsec_start_xmit,
        .ndo_get_stats64        = macsec_get_stats64,
        .ndo_get_iflink                = macsec_get_iflink,
};

/* Device type attached to macsec netdevs via SET_NETDEV_DEVTYPE() in
 * macsec_setup().
 */
static const struct device_type macsec_type = {
        .name = "macsec",
};

/* Netlink attribute policy for the IFLA_MACSEC_* link attributes: declares
 * the expected integer width of each attribute.
 */
static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = {
        [IFLA_MACSEC_SCI] = { .type = NLA_U64 },
        [IFLA_MACSEC_PORT] = { .type = NLA_U16 },
        [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 },
        [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 },
        [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 },
        [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 },
        [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 },
        [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 },
        [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 },
        [IFLA_MACSEC_ES] = { .type = NLA_U8 },
        [IFLA_MACSEC_SCB] = { .type = NLA_U8 },
        [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 },
        [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 },
        [IFLA_MACSEC_OFFLOAD] = { .type = NLA_U8 },
};

static void macsec_free_netdev(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);

        dst_release(&macsec->secy.tx_sc.md_dst->dst);
        free_percpu(macsec->stats);
        free_percpu(macsec->secy.tx_sc.stats);

        /* Get rid of the macsec's reference to real_dev */
        netdev_put(macsec->real_dev, &macsec->dev_tracker);
}

/* Link setup callback: start from Ethernet defaults, then install the macsec
 * netdev ops, destructor, MTU bounds and device type.
 */
static void macsec_setup(struct net_device *dev)
{
        ether_setup(dev);

        dev->netdev_ops = &macsec_netdev_ops;
        dev->priv_destructor = macsec_free_netdev;
        dev->needs_free_netdev = true;
        dev->priv_flags |= IFF_NO_QUEUE;
        dev->min_mtu = 0;
        dev->max_mtu = ETH_MAX_MTU;
        SET_NETDEV_DEVTYPE(dev, &macsec_type);

        /* A zero broadcast address is filled in from the real device by
         * macsec_dev_init().
         */
        eth_zero_addr(dev->broadcast);
}

static int macsec_changelink_common(struct net_device *dev,
                                    struct nlattr *data[])
{
        struct macsec_secy *secy;
        struct macsec_tx_sc *tx_sc;

        secy = &macsec_priv(dev)->secy;
        tx_sc = &secy->tx_sc;

        if (data[IFLA_MACSEC_ENCODING_SA]) {
                struct macsec_tx_sa *tx_sa;

                tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]);
                tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]);

                secy->operational = tx_sa && tx_sa->active;
        }

        if (data[IFLA_MACSEC_ENCRYPT])
                tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]);

        if (data[IFLA_MACSEC_PROTECT])
                secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]);

        if (data[IFLA_MACSEC_INC_SCI])
                tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]);

        if (data[IFLA_MACSEC_ES])
                tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]);

        if (data[IFLA_MACSEC_SCB])
                tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]);

        if (data[IFLA_MACSEC_REPLAY_PROTECT])
                secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]);

        if (data[IFLA_MACSEC_VALIDATION])
                secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]);

        if (data[IFLA_MACSEC_CIPHER_SUITE]) {
                switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) {
                case MACSEC_CIPHER_ID_GCM_AES_128:
                case MACSEC_DEFAULT_CIPHER_ID:
                        secy->key_len = MACSEC_GCM_AES_128_SAK_LEN;
                        secy->xpn = false;
                        break;
                case MACSEC_CIPHER_ID_GCM_AES_256:
                        secy->key_len = MACSEC_GCM_AES_256_SAK_LEN;
                        secy->xpn = false;
                        break;
                case MACSEC_CIPHER_ID_GCM_AES_XPN_128:
                        secy->key_len = MACSEC_GCM_AES_128_SAK_LEN;
                        secy->xpn = true;
                        break;
                case MACSEC_CIPHER_ID_GCM_AES_XPN_256:
                        secy->key_len = MACSEC_GCM_AES_256_SAK_LEN;
                        secy->xpn = true;
                        break;
                default:
                        return -EINVAL;
                }
        }

        if (data[IFLA_MACSEC_WINDOW]) {
                secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]);

                /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window
                 * for XPN cipher suites */
                if (secy->xpn &&
                    secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW)
                        return -EINVAL;
        }

        return 0;
}

/* rtnl changelink handler.
 *
 * Rejects attempts to change the cipher suite, ICV length, SCI or port of an
 * existing device. Otherwise applies @data through
 * macsec_changelink_common(), switches the offload mode when
 * IFLA_MACSEC_OFFLOAD asks for a different one, and, if the offload mode did
 * not change and the device is offloaded, pushes the updated SecY to the
 * driver via mdo_upd_secy. On any failure the SecY and TX SC are restored
 * from the copies taken at entry.
 *
 * Returns 0 on success (or when @data is NULL), -EINVAL for an immutable
 * attribute, -EOPNOTSUPP when no offload ops are found, or the error of the
 * failing step.
 */
static int macsec_changelink(struct net_device *dev, struct nlattr *tb[],
                             struct nlattr *data[],
                             struct netlink_ext_ack *extack)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        bool macsec_offload_state_change = false;
        enum macsec_offload offload;
        struct macsec_tx_sc tx_sc;
        struct macsec_secy secy;
        int ret;

        if (!data)
                return 0;

        /* These attributes cannot be changed on an existing device. */
        if (data[IFLA_MACSEC_CIPHER_SUITE] ||
            data[IFLA_MACSEC_ICV_LEN] ||
            data[IFLA_MACSEC_SCI] ||
            data[IFLA_MACSEC_PORT])
                return -EINVAL;

        /* Keep a copy of unmodified secy and tx_sc, in case the offload
         * propagation fails, to revert macsec_changelink_common.
         */
        memcpy(&secy, &macsec->secy, sizeof(secy));
        memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc));

        ret = macsec_changelink_common(dev, data);
        if (ret)
                goto cleanup;

        if (data[IFLA_MACSEC_OFFLOAD]) {
                offload = nla_get_u8(data[IFLA_MACSEC_OFFLOAD]);
                if (macsec->offload != offload) {
                        macsec_offload_state_change = true;
                        ret = macsec_update_offload(dev, offload);
                        if (ret)
                                goto cleanup;
                }
        }

        /* If h/w offloading is available, propagate to the device */
        if (!macsec_offload_state_change && macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (!ops) {
                        ret = -EOPNOTSUPP;
                        goto cleanup;
                }

                ctx.secy = &macsec->secy;
                ret = macsec_offload(ops->mdo_upd_secy, &ctx);
                if (ret)
                        goto cleanup;
        }

        return 0;

cleanup:
        /* Roll back to the state saved at entry. */
        memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc));
        memcpy(&macsec->secy, &secy, sizeof(secy));

        return ret;
}

static void macsec_del_dev(struct macsec_dev *macsec)
{
        int i;

        while (macsec->secy.rx_sc) {
                struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc);

                rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next);
                free_rx_sc(rx_sc);
        }

        for (i = 0; i < MACSEC_NUM_AN; i++) {
                struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]);

                if (sa) {
                        RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL);
                        clear_tx_sa(sa);
                }
        }
}

static void macsec_common_dellink(struct net_device *dev, struct list_head *head)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct net_device *real_dev = macsec->real_dev;

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(netdev_priv(dev), &ctx);
                if (ops) {
                        ctx.secy = &macsec->secy;
                        macsec_offload(ops->mdo_del_secy, &ctx);
                }
        }

        unregister_netdevice_queue(dev, head);
        list_del_rcu(&macsec->secys);
        macsec_del_dev(macsec);
        netdev_upper_dev_unlink(real_dev, dev);

        macsec_generation++;
}

static void macsec_dellink(struct net_device *dev, struct list_head *head)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct net_device *real_dev = macsec->real_dev;
        struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev);

        macsec_common_dellink(dev, head);

        if (list_empty(&rxd->secys)) {
                netdev_rx_handler_unregister(real_dev);
                kfree(rxd);
        }
}

/* Attach @dev to @real_dev's list of SecYs.
 *
 * When @real_dev has no macsec rx-handler data yet, it is allocated and
 * macsec_handle_frame() is registered as the rx handler first.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error from
 * netdev_rx_handler_register().
 */
static int register_macsec_dev(struct net_device *real_dev,
                               struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev);
        int ret;

        if (rxd)
                goto add_secy;

        rxd = kmalloc(sizeof(*rxd), GFP_KERNEL);
        if (!rxd)
                return -ENOMEM;

        INIT_LIST_HEAD(&rxd->secys);

        ret = netdev_rx_handler_register(real_dev, macsec_handle_frame, rxd);
        if (ret < 0) {
                kfree(rxd);
                return ret;
        }

add_secy:
        list_add_tail_rcu(&macsec->secys, &rxd->secys);
        return 0;
}

/* Report whether any SecY already attached to @dev uses @sci. */
static bool sci_exists(struct net_device *dev, sci_t sci)
{
        struct macsec_rxh_data *rxd = macsec_data_rtnl(dev);
        struct macsec_dev *entry;
        bool found = false;

        list_for_each_entry(entry, &rxd->secys, secys) {
                if (entry->secy.sci == sci) {
                        found = true;
                        break;
                }
        }

        return found;
}

/* Build an SCI from @dev's current MAC address and @port. */
static sci_t dev_to_sci(struct net_device *dev, __be16 port)
{
        sci_t sci = make_sci(dev->dev_addr, port);

        return sci;
}

/* Allocate the per-device resources of a new macsec interface and set the
 * SecY / TX SC to their defaults.
 *
 * @sci: SCI to use; MACSEC_UNDEF_SCI selects one derived from the device
 *       address with port MACSEC_PORT_ES.
 * @icv_len: ICV length to record in the SecY.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails. Nothing is freed
 * here on failure: the allocations made so far are released by
 * macsec_free_netdev().
 */
static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len)
{
        struct macsec_dev *macsec = macsec_priv(dev);
        struct macsec_secy *secy = &macsec->secy;
        struct macsec_tx_sc *tx_sc = &secy->tx_sc;

        macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats);
        if (!macsec->stats)
                return -ENOMEM;

        tx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats);
        if (!tx_sc->stats)
                return -ENOMEM;

        /* macsec and secy percpu stats will be freed when unregistering
         * net_device in macsec_free_netdev()
         */
        tx_sc->md_dst = metadata_dst_alloc(0, METADATA_MACSEC, GFP_KERNEL);
        if (!tx_sc->md_dst)
                return -ENOMEM;

        if (sci == MACSEC_UNDEF_SCI)
                sci = dev_to_sci(dev, MACSEC_PORT_ES);

        /* SecY defaults */
        secy->netdev = dev;
        secy->operational = true;
        secy->key_len = DEFAULT_SAK_LEN;
        secy->icv_len = icv_len;
        secy->validate_frames = MACSEC_VALIDATE_DEFAULT;
        secy->protect_frames = true;
        secy->replay_protect = false;
        secy->xpn = DEFAULT_XPN;
        secy->sci = sci;

        /* TX SC defaults */
        tx_sc->md_dst->u.macsec_info.sci = sci;
        tx_sc->active = true;
        tx_sc->encoding_sa = DEFAULT_ENCODING_SA;
        tx_sc->encrypt = DEFAULT_ENCRYPT;
        tx_sc->send_sci = DEFAULT_SEND_SCI;
        tx_sc->end_station = false;
        tx_sc->scb = false;

        return 0;
}

/* Lockdep class assigned to the addr_list_lock of every macsec netdev in
 * macsec_newlink().
 */
static struct lock_class_key macsec_netdev_addr_lock_key;

/* rtnl newlink handler: create a macsec device on top of the Ethernet device
 * named by IFLA_LINK.
 *
 * Steps, in order: resolve and check the real device, pick the offload mode,
 * derive the MTU, register the netdev and link it as an upper device of the
 * real device, compute the SCI, allocate the SecY state, apply the
 * IFLA_MACSEC_* attributes, inform an offloading driver, and finally attach
 * the device to the real device's rx handler data.
 *
 * Returns 0 on success or a negative errno; steps already completed after
 * register_netdevice() are unwound via the del_dev/unlink/unregister labels.
 */
static int macsec_newlink(struct net_device *dev,
                          struct rtnl_newlink_params *params,
                          struct netlink_ext_ack *extack)
{
        struct net *link_net = rtnl_newlink_link_net(params);
        struct macsec_dev *macsec = macsec_priv(dev);
        struct nlattr **data = params->data;
        struct nlattr **tb = params->tb;
        rx_handler_func_t *rx_handler;
        u8 icv_len = MACSEC_DEFAULT_ICV_LEN;
        struct net_device *real_dev;
        int err, mtu;
        sci_t sci;

        if (!tb[IFLA_LINK])
                return -EINVAL;
        real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK]));
        if (!real_dev)
                return -ENODEV;
        if (real_dev->type != ARPHRD_ETHER)
                return -EINVAL;

        dev->priv_flags |= IFF_MACSEC;

        macsec->real_dev = real_dev;

        if (data && data[IFLA_MACSEC_OFFLOAD])
                macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]);
        else
                /* MACsec offloading is off by default */
                macsec->offload = MACSEC_OFFLOAD_OFF;

        /* Check if the offloading mode is supported by the underlying layers */
        if (macsec->offload != MACSEC_OFFLOAD_OFF &&
            !macsec_check_offload(macsec->offload, macsec))
                return -EOPNOTSUPP;

        /* send_sci must be set to true when transmit sci explicitly is set */
        if ((data && data[IFLA_MACSEC_SCI]) &&
            (data && data[IFLA_MACSEC_INC_SCI])) {
                u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]);

                if (!send_sci)
                        return -EINVAL;
        }

        /* MTU is the real device's MTU minus the MACsec overhead, clamped
         * to 0 when the overhead does not fit.
         */
        if (data && data[IFLA_MACSEC_ICV_LEN])
                icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]);
        mtu = real_dev->mtu - icv_len - macsec_extra_len(true);
        if (mtu < 0)
                dev->mtu = 0;
        else
                dev->mtu = mtu;

        /* The real device must not already have a non-macsec rx handler. */
        rx_handler = rtnl_dereference(real_dev->rx_handler);
        if (rx_handler && rx_handler != macsec_handle_frame)
                return -EBUSY;

        err = register_netdevice(dev);
        if (err < 0)
                return err;

        netdev_lockdep_set_classes(dev);
        lockdep_set_class(&dev->addr_list_lock,
                          &macsec_netdev_addr_lock_key);

        err = netdev_upper_dev_link(real_dev, dev, extack);
        if (err < 0)
                goto unregister;

        /* need to be already registered so that ->init has run and
         * the MAC addr is set
         */
        if (data && data[IFLA_MACSEC_SCI])
                sci = nla_get_sci(data[IFLA_MACSEC_SCI]);
        else if (data && data[IFLA_MACSEC_PORT])
                sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT]));
        else
                sci = dev_to_sci(dev, MACSEC_PORT_ES);

        /* rx_handler set here means macsec already runs on real_dev, so
         * its SecY list can be searched for a clashing SCI.
         */
        if (rx_handler && sci_exists(real_dev, sci)) {
                err = -EBUSY;
                goto unlink;
        }

        err = macsec_add_dev(dev, sci, icv_len);
        if (err)
                goto unlink;

        if (data) {
                err = macsec_changelink_common(dev, data);
                if (err)
                        goto del_dev;
        }

        /* If h/w offloading is available, propagate to the device */
        if (macsec_is_offloaded(macsec)) {
                const struct macsec_ops *ops;
                struct macsec_context ctx;

                ops = macsec_get_ops(macsec, &ctx);
                if (ops) {
                        ctx.secy = &macsec->secy;
                        err = macsec_offload(ops->mdo_add_secy, &ctx);
                        if (err)
                                goto del_dev;

                        macsec->insert_tx_tag =
                                macsec_needs_tx_tag(macsec, ops);
                }
        }

        err = register_macsec_dev(real_dev, dev);
        if (err < 0)
                goto del_dev;

        netif_stacked_transfer_operstate(real_dev, dev);
        linkwatch_fire_event(dev);

        macsec_generation++;

        return 0;

del_dev:
        macsec_del_dev(macsec);
unlink:
        netdev_upper_dev_unlink(real_dev, dev);
unregister:
        unregister_netdevice(dev);
        return err;
}

/* Validate the IFLA_MACSEC_* attributes supplied for a macsec link.
 *
 * @tb:     generic link attributes (not examined here)
 * @data:   IFLA_MACSEC_* attributes; may be NULL, in which case there is
 *          nothing to validate
 * @extack: extended ack (not written here)
 *
 * Returns 0 when the attributes are acceptable, -EINVAL for an unsupported
 * or inconsistent value, or the error reported by macsec_alloc_tfm() when a
 * non-default ICV length cannot be instantiated.
 */
static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[],
                                struct netlink_ext_ack *extack)
{
        u8 icv_len = MACSEC_DEFAULT_ICV_LEN;
        u64 csid = MACSEC_DEFAULT_CIPHER_ID;
        bool es, scb, sci;
        int attr;

        if (!data)
                return 0;

        if (data[IFLA_MACSEC_ICV_LEN])
                icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]);

        if (data[IFLA_MACSEC_CIPHER_SUITE])
                csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]);

        /* A non-default ICV length is probed by building (and immediately
         * releasing) a throw-away transform keyed with all zeroes.
         */
        if (icv_len != MACSEC_DEFAULT_ICV_LEN) {
                char probe_key[DEFAULT_SAK_LEN] = { 0 };
                struct crypto_aead *probe_tfm;

                probe_tfm = macsec_alloc_tfm(probe_key, DEFAULT_SAK_LEN,
                                             icv_len);
                if (IS_ERR(probe_tfm))
                        return PTR_ERR(probe_tfm);
                crypto_free_aead(probe_tfm);
        }

        /* only the known cipher suite identifiers are accepted */
        if (csid != MACSEC_CIPHER_ID_GCM_AES_128 &&
            csid != MACSEC_CIPHER_ID_GCM_AES_256 &&
            csid != MACSEC_CIPHER_ID_GCM_AES_XPN_128 &&
            csid != MACSEC_CIPHER_ID_GCM_AES_XPN_256 &&
            csid != MACSEC_DEFAULT_CIPHER_ID)
                return -EINVAL;

        if (icv_len < MACSEC_MIN_ICV_LEN || icv_len > MACSEC_STD_ICV_LEN)
                return -EINVAL;

        if (data[IFLA_MACSEC_ENCODING_SA] &&
            nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN)
                return -EINVAL;

        /* every attribute strictly between ENCODING_SA and VALIDATION is
         * treated as a 0/1 flag
         */
        for (attr = IFLA_MACSEC_ENCODING_SA + 1;
             attr < IFLA_MACSEC_VALIDATION; attr++) {
                if (data[attr] && nla_get_u8(data[attr]) > 1)
                        return -EINVAL;
        }

        es  = nla_get_u8_default(data[IFLA_MACSEC_ES], false);
        sci = nla_get_u8_default(data[IFLA_MACSEC_INC_SCI], false);
        scb = nla_get_u8_default(data[IFLA_MACSEC_SCB], false);

        /* at most one of INC_SCI, ES and SCB may be set */
        if ((sci && scb) || (sci && es) || (scb && es))
                return -EINVAL;

        if (data[IFLA_MACSEC_VALIDATION] &&
            nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX)
                return -EINVAL;

        /* enabling replay protection requires a window to be supplied */
        if (data[IFLA_MACSEC_REPLAY_PROTECT] &&
            nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]) &&
            !data[IFLA_MACSEC_WINDOW])
                return -EINVAL;

        return 0;
}

static struct net *macsec_get_link_net(const struct net_device *dev)
{
        return dev_net(macsec_priv(dev)->real_dev);
}

struct net_device *macsec_get_real_dev(const struct net_device *dev)
{
        return macsec_priv(dev)->real_dev;
}
EXPORT_SYMBOL_GPL(macsec_get_real_dev);

/* Exported wrapper: report macsec_is_offloaded() for @dev's private data. */
bool macsec_netdev_is_offloaded(struct net_device *dev)
{
        struct macsec_dev *macsec = macsec_priv(dev);

        return macsec_is_offloaded(macsec);
}
EXPORT_SYMBOL_GPL(macsec_netdev_is_offloaded);

/* rtnl_link_ops->get_size: netlink space reserved for the IFLA_MACSEC_*
 * attributes, one term per attribute.  The total does not depend on @dev.
 */
static size_t macsec_get_size(const struct net_device *dev)
{
        size_t size = 0;

        size += nla_total_size_64bit(8);        /* IFLA_MACSEC_SCI */
        size += nla_total_size(1);              /* IFLA_MACSEC_ICV_LEN */
        size += nla_total_size_64bit(8);        /* IFLA_MACSEC_CIPHER_SUITE */
        size += nla_total_size(4);              /* IFLA_MACSEC_WINDOW */
        size += nla_total_size(1);              /* IFLA_MACSEC_ENCODING_SA */
        size += nla_total_size(1);              /* IFLA_MACSEC_ENCRYPT */
        size += nla_total_size(1);              /* IFLA_MACSEC_PROTECT */
        size += nla_total_size(1);              /* IFLA_MACSEC_INC_SCI */
        size += nla_total_size(1);              /* IFLA_MACSEC_ES */
        size += nla_total_size(1);              /* IFLA_MACSEC_SCB */
        size += nla_total_size(1);              /* IFLA_MACSEC_REPLAY_PROTECT */
        size += nla_total_size(1);              /* IFLA_MACSEC_VALIDATION */
        size += nla_total_size(1);              /* IFLA_MACSEC_OFFLOAD */

        return size;
}

static int macsec_fill_info(struct sk_buff *skb,
                            const struct net_device *dev)
{
        struct macsec_tx_sc *tx_sc;
        struct macsec_dev *macsec;
        struct macsec_secy *secy;
        u64 csid;

        macsec = macsec_priv(dev);
        secy = &macsec->secy;
        tx_sc = &secy->tx_sc;

        switch (secy->key_len) {
        case MACSEC_GCM_AES_128_SAK_LEN:
                csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID;
                break;
        case MACSEC_GCM_AES_256_SAK_LEN:
                csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256;
                break;
        default:
                goto nla_put_failure;
        }

        if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci,
                        IFLA_MACSEC_PAD) ||
            nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) ||
            nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE,
                              csid, IFLA_MACSEC_PAD) ||
            nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) ||
            nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) ||
            nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) ||
            nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) ||
            nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) ||
            nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) ||
            nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) ||
            nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) ||
            nla_put_u8(skb, IFLA_MACSEC_OFFLOAD, macsec->offload) ||
            0)
                goto nla_put_failure;

        if (secy->replay_protect) {
                if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window))
                        goto nla_put_failure;
        }

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/* rtnl link operations for the "macsec" link kind.  Each callback below is
 * the correspondingly named macsec_* function; private data is one
 * struct macsec_dev per device, and attributes up to IFLA_MACSEC_MAX are
 * parsed against macsec_rtnl_policy.
 */
static struct rtnl_link_ops macsec_link_ops __read_mostly = {
        .kind                = "macsec",
        .priv_size        = sizeof(struct macsec_dev),
        .maxtype        = IFLA_MACSEC_MAX,
        .policy                = macsec_rtnl_policy,
        .setup                = macsec_setup,
        .validate        = macsec_validate_attr,
        .newlink        = macsec_newlink,
        .changelink        = macsec_changelink,
        .dellink        = macsec_dellink,
        .get_size        = macsec_get_size,
        .fill_info        = macsec_fill_info,
        .get_link_net        = macsec_get_link_net,
};

static bool is_macsec_master(struct net_device *dev)
{
        return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame;
}

static int macsec_notify(struct notifier_block *this, unsigned long event,
                         void *ptr)
{
        struct net_device *real_dev = netdev_notifier_info_to_dev(ptr);
        struct macsec_rxh_data *rxd;
        struct macsec_dev *m, *n;
        LIST_HEAD(head);

        if (!is_macsec_master(real_dev))
                return NOTIFY_DONE;

        rxd = macsec_data_rtnl(real_dev);

        switch (event) {
        case NETDEV_DOWN:
        case NETDEV_UP:
        case NETDEV_CHANGE:
                list_for_each_entry_safe(m, n, &rxd->secys, secys) {
                        struct net_device *dev = m->secy.netdev;

                        netif_stacked_transfer_operstate(real_dev, dev);
                }
                break;
        case NETDEV_UNREGISTER:
                list_for_each_entry_safe(m, n, &rxd->secys, secys) {
                        macsec_common_dellink(m->secy.netdev, &head);
                }

                netdev_rx_handler_unregister(real_dev);
                kfree(rxd);

                unregister_netdevice_many(&head);
                break;
        case NETDEV_CHANGEMTU:
                list_for_each_entry(m, &rxd->secys, secys) {
                        struct net_device *dev = m->secy.netdev;
                        unsigned int mtu = real_dev->mtu - (m->secy.icv_len +
                                                            macsec_extra_len(true));

                        if (dev->mtu > mtu)
                                dev_set_mtu(dev, mtu);
                }
                break;
        case NETDEV_FEAT_CHANGE:
                list_for_each_entry(m, &rxd->secys, secys) {
                        macsec_inherit_tso_max(m->secy.netdev);
                        netdev_update_features(m->secy.netdev);
                }
                break;
        }

        return NOTIFY_OK;
}

/* Notifier block registered in macsec_init(); dispatches to macsec_notify(). */
static struct notifier_block macsec_notifier = {
        .notifier_call = macsec_notify,
};

/* Module init: register the netdevice notifier, the rtnl link ops and the
 * generic netlink family, in that order.  On failure everything registered
 * so far is unwound and the error is returned.
 */
static int __init macsec_init(void)
{
        int err;

        pr_info("MACsec IEEE 802.1AE\n");

        err = register_netdevice_notifier(&macsec_notifier);
        if (err)
                return err;

        err = rtnl_link_register(&macsec_link_ops);
        if (err)
                goto unreg_notifier;

        err = genl_register_family(&macsec_fam);
        if (!err)
                return 0;

        /* genl registration failed: undo the rtnl link ops first */
        rtnl_link_unregister(&macsec_link_ops);
unreg_notifier:
        unregister_netdevice_notifier(&macsec_notifier);
        return err;
}

/* Module exit: unregister in the reverse order of macsec_init() (generic
 * netlink family, rtnl link ops, netdevice notifier).
 */
static void __exit macsec_exit(void)
{
        genl_unregister_family(&macsec_fam);
        rtnl_link_unregister(&macsec_link_ops);
        unregister_netdevice_notifier(&macsec_notifier);
        /* NOTE(review): presumably waits for RCU callbacks queued by this
         * module to finish before its code goes away - confirm.
         */
        rcu_barrier();
}

/* Module entry and exit points. */
module_init(macsec_init);
module_exit(macsec_exit);

/* Module aliases for the "macsec" rtnl link kind and genl family name. */
MODULE_ALIAS_RTNL_LINK("macsec");
MODULE_ALIAS_GENL_FAMILY("macsec");

MODULE_DESCRIPTION("MACsec IEEE 802.1AE");
MODULE_LICENSE("GPL v2");










  307 





  307 






  306 












  307 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PGALLOC_TRACK_H
#define _LINUX_PGALLOC_TRACK_H

#if defined(CONFIG_MMU)
/*
 * Return the p4d entry for @address below @pgd.  If *@pgd is none, the p4d
 * level is allocated first with __p4d_alloc() and, on success,
 * PGTBL_PGD_MODIFIED is OR-ed into *@mod_mask.
 *
 * Returns NULL when __p4d_alloc() reports failure.
 */
static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        if (unlikely(pgd_none(*pgd))) {
                if (__p4d_alloc(mm, pgd, address) != 0)
                        return NULL;

                /* record that the pgd entry was populated */
                *mod_mask = *mod_mask | PGTBL_PGD_MODIFIED;
        }

        return p4d_offset(pgd, address);
}

/*
 * Return the pud entry for @address below @p4d.  If *@p4d is none, the pud
 * level is allocated first with __pud_alloc() and, on success,
 * PGTBL_P4D_MODIFIED is OR-ed into *@mod_mask.
 *
 * Returns NULL when __pud_alloc() reports failure.
 */
static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        if (unlikely(p4d_none(*p4d))) {
                if (__pud_alloc(mm, p4d, address) != 0)
                        return NULL;

                /* record that the p4d entry was populated */
                *mod_mask = *mod_mask | PGTBL_P4D_MODIFIED;
        }

        return pud_offset(p4d, address);
}

/*
 * Return the pmd entry for @address below @pud.  If *@pud is none, the pmd
 * level is allocated first with __pmd_alloc() and, on success,
 * PGTBL_PUD_MODIFIED is OR-ed into *@mod_mask.
 *
 * Returns NULL when __pmd_alloc() reports failure.
 */
static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
                                     unsigned long address,
                                     pgtbl_mod_mask *mod_mask)
{
        if (unlikely(pud_none(*pud))) {
                if (__pmd_alloc(mm, pud, address) != 0)
                        return NULL;

                /* record that the pud entry was populated */
                *mod_mask = *mod_mask | PGTBL_PUD_MODIFIED;
        }

        return pmd_offset(pud, address);
}
#endif /* CONFIG_MMU */

/*
 * Kernel-PTE variant of the tracking allocators, written as a macro.
 *
 * If *pmd is none, __pte_alloc_kernel(pmd) is called: a non-zero result
 * makes the whole expression evaluate to NULL; on a zero result
 * PGTBL_PMD_MODIFIED is OR-ed into *mask.  In every non-failing case the
 * expression evaluates to pte_offset_kernel(pmd, address).
 *
 * Note: the pmd argument is expanded more than once, so it must not have
 * side effects.
 */
#define pte_alloc_kernel_track(pmd, address, mask)                        \
        ((unlikely(pmd_none(*(pmd))) &&                                        \
          (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
                NULL: pte_offset_kernel(pmd, address))

#endif /* _LINUX_PGALLOC_TRACK_H */

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * vrf.c: device driver to encapsulate a VRF space
 *
 * Copyright (c) 2015 Cumulus Networks. All rights reserved.
 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
 *
 * Based on dummy, team and ipvlan drivers
 */

#include <linux/ethtool.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ip.h>
#include <linux/init.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
#include <linux/rtnetlink.h>
#include <net/rtnetlink.h>
#include <linux/u64_stats_sync.h>
#include <linux/hashtable.h>
#include <linux/spinlock_types.h>

#include <linux/inetdevice.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/route.h>
#include <net/addrconf.h>
#include <net/l3mdev.h>
#include <net/fib_rules.h>
#include <net/netdev_lock.h>
#include <net/sch_generic.h>
#include <net/netns/generic.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/inet_dscp.h>

/* driver identification strings */
#define DRV_NAME        "vrf"
#define DRV_VERSION        "1.1"

#define FIB_RULE_PREF  1000       /* default preference for FIB rules */

/* bits argument handed to DECLARE_HASHTABLE() for the per-netns vrf map */
#define HT_MAP_BITS        4
/* initval passed to jhash_1word() when hashing a table id */
#define HASH_INITVAL        ((u32)0xcafef00d)

/* Map from FIB table id to the VRF devices registered on that table.
 * Accesses to the hashtable, shared_tables and strict_mode in this file
 * are made with vmap_lock held (see vrf_map_lock()/vrf_map_unlock()).
 */
struct  vrf_map {
        /* vrf_map_elem entries, hashed by table id */
        DECLARE_HASHTABLE(ht, HT_MAP_BITS);
        spinlock_t vmap_lock;

        /* shared_tables:
         * count how many distinct tables do not comply with the strict mode
         * requirement.
         * shared_tables value must be 0 in order to enable the strict mode.
         *
         * example of the evolution of shared_tables:
         *                                                        | time
         * add  vrf0 --> table 100        shared_tables = 0       | t0
         * add  vrf1 --> table 101        shared_tables = 0       | t1
         * add  vrf2 --> table 100        shared_tables = 1       | t2
         * add  vrf3 --> table 100        shared_tables = 1       | t3
         * add  vrf4 --> table 101        shared_tables = 2       v t4
         *
         * shared_tables is a "step function" (or "staircase function")
         * and it is increased by one when the second vrf is associated to a
         * table.
         *
         * at t2, vrf0 and vrf2 are bound to table 100: shared_tables = 1.
         *
         * at t3, another dev (vrf3) is bound to the same table 100 but the
         * value of shared_tables is still 1.
         * This means that no matter how many new vrfs will register on the
         * table 100, the shared_tables will not increase (considering only
         * table 100).
         *
         * at t4, vrf4 is bound to table 101, and shared_tables = 2.
         *
         * Looking at the value of shared_tables we can immediately know if
         * the strict_mode can or cannot be enforced. Indeed, strict_mode
         * can be enforced iff shared_tables = 0.
         *
         * Conversely, shared_tables is decreased when a vrf is de-associated
         * from a table with exactly two associated vrfs.
         */
        u32 shared_tables;

        /* when set, a table may be bound to at most one VRF */
        bool strict_mode;
};

/* One entry of the vrf map: a table id and the VRFs registered on it. */
struct vrf_map_elem {
        struct hlist_node hnode;    /* linkage in vrf_map.ht */
        struct list_head vrf_list;  /* VRFs registered to this table */

        u32 table_id;               /* lookup key */
        int users;                  /* number of VRFs registered on table_id */
        int ifindex;                /* ifindex given at vrf_map_elem_init() */
};

/* id used with net_generic() to fetch the per-netns struct netns_vrf */
static unsigned int vrf_net_id;

/* per netns vrf data */
struct netns_vrf {
        /* protected by rtnl lock */
        bool add_fib_rules;

        /* table id -> VRF device map for this netns */
        struct vrf_map vmap;
        struct ctl_table_header        *ctl_hdr;
};

/* Private data of a VRF net_device (obtained with netdev_priv()). */
struct net_vrf {
        struct rtable __rcu        *rth;
        struct rt6_info        __rcu        *rt6;
#if IS_ENABLED(CONFIG_IPV6)
        struct fib6_table        *fib6_table;
#endif
        /* FIB table id this VRF is bound to; key into the vrf map */
        u32                     tb_id;

        struct list_head        me_list;   /* entry in vrf_map_elem */
        /* value reported by vrf_map_elem_get_vrf_ifindex() */
        int                        ifindex;
};

/* Account a transmit error on @vrf_dev and free @skb. */
static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
{
        ++vrf_dev->stats.tx_errors;

        kfree_skb(skb);
}

/* Return the vrf map stored in @net's per-netns vrf data. */
static struct vrf_map *netns_vrf_map(struct net *net)
{
        struct netns_vrf *nn_vrf;

        nn_vrf = net_generic(net, vrf_net_id);

        return &nn_vrf->vmap;
}

/* Return the vrf map of the netns that @dev belongs to. */
static struct vrf_map *netns_vrf_map_by_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);

        return netns_vrf_map(net);
}

/* Return the ifindex of the first VRF on @me's list, or -ENODEV when the
 * list is empty.
 */
static int vrf_map_elem_get_vrf_ifindex(struct vrf_map_elem *me)
{
        struct net_vrf *first;

        if (list_empty(&me->vrf_list))
                return -ENODEV;

        first = list_first_entry(&me->vrf_list, struct net_vrf, me_list);

        return first->ifindex;
}

/* Allocate an uninitialised map element with the given gfp @flags.
 * Returns NULL on allocation failure.
 */
static struct vrf_map_elem *vrf_map_elem_alloc(gfp_t flags)
{
        return kmalloc(sizeof(struct vrf_map_elem), flags);
}

/* Release a map element obtained from vrf_map_elem_alloc(). */
static void vrf_map_elem_free(struct vrf_map_elem *me)
{
        kfree(me);
}

/* Initialise @me with the given table id, ifindex and user count, and
 * start it off with an empty VRF list.
 */
static void vrf_map_elem_init(struct vrf_map_elem *me, int table_id,
                              int ifindex, int users)
{
        INIT_LIST_HEAD(&me->vrf_list);

        me->users = users;
        me->ifindex = ifindex;
        me->table_id = table_id;
}

/* Find the map element for @table_id in @vmap, or NULL if there is none. */
static struct vrf_map_elem *vrf_map_lookup_elem(struct vrf_map *vmap,
                                                u32 table_id)
{
        u32 hash = jhash_1word(table_id, HASH_INITVAL);
        struct vrf_map_elem *cur;

        hash_for_each_possible(vmap->ht, cur, hnode, hash) {
                /* different table ids can land in the same bucket */
                if (cur->table_id != table_id)
                        continue;

                return cur;
        }

        return NULL;
}

/* Insert @me into @vmap's hashtable, keyed by the hash of its table id. */
static void vrf_map_add_elem(struct vrf_map *vmap, struct vrf_map_elem *me)
{
        u32 hash = jhash_1word(me->table_id, HASH_INITVAL);

        hash_add(vmap->ht, &me->hnode, hash);
}

/* Remove @me from the hashtable it was added to; @me itself is not freed. */
static void vrf_map_del_elem(struct vrf_map_elem *me)
{
        hash_del(&me->hnode);
}

/* Take @vmap's spinlock. */
static void vrf_map_lock(struct vrf_map *vmap) __acquires(&vmap->vmap_lock)
{
        spin_lock(&vmap->vmap_lock);
}

/* Release @vmap's spinlock taken by vrf_map_lock(). */
static void vrf_map_unlock(struct vrf_map *vmap) __releases(&vmap->vmap_lock)
{
        spin_unlock(&vmap->vmap_lock);
}

/* called with rtnl lock held */
static int
vrf_map_register_dev(struct net_device *dev, struct netlink_ext_ack *extack)
{
        struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
        struct net_vrf *vrf = netdev_priv(dev);
        struct vrf_map_elem *new_me, *me;
        u32 table_id = vrf->tb_id;
        bool free_new_me = false;
        int users;
        int res;

        /* we pre-allocate elements used in the spin-locked section (so that we
         * keep the spinlock as short as possible).
         */
        new_me = vrf_map_elem_alloc(GFP_KERNEL);
        if (!new_me)
                return -ENOMEM;

        vrf_map_elem_init(new_me, table_id, dev->ifindex, 0);

        vrf_map_lock(vmap);

        me = vrf_map_lookup_elem(vmap, table_id);
        if (!me) {
                me = new_me;
                vrf_map_add_elem(vmap, me);
                goto link_vrf;
        }

        /* we already have an entry in the vrf_map, so it means there is (at
         * least) a vrf registered on the specific table.
         */
        free_new_me = true;
        if (vmap->strict_mode) {
                /* vrfs cannot share the same table */
                NL_SET_ERR_MSG(extack, "Table is used by another VRF");
                res = -EBUSY;
                goto unlock;
        }

link_vrf:
        users = ++me->users;
        if (users == 2)
                ++vmap->shared_tables;

        list_add(&vrf->me_list, &me->vrf_list);

        res = 0;

unlock:
        vrf_map_unlock(vmap);

        /* clean-up, if needed */
        if (free_new_me)
                vrf_map_elem_free(new_me);

        return res;
}

/* Undo vrf_map_register_dev() for @dev: unlink it from its table's map
 * entry, update the shared-table accounting and free the entry once its
 * last user is gone.  Nothing happens if the table has no entry.
 *
 * called with rtnl lock held
 */
static void vrf_map_unregister_dev(struct net_device *dev)
{
        struct vrf_map *vmap = netns_vrf_map_by_dev(dev);
        struct net_vrf *vrf = netdev_priv(dev);
        struct vrf_map_elem *elem;

        vrf_map_lock(vmap);

        elem = vrf_map_lookup_elem(vmap, vrf->tb_id);
        if (elem) {
                list_del(&vrf->me_list);

                switch (--elem->users) {
                case 1:
                        /* back to a single user: no longer shared */
                        --vmap->shared_tables;
                        break;
                case 0:
                        vrf_map_del_elem(elem);

                        /* no one will refer to this element anymore */
                        vrf_map_elem_free(elem);
                        break;
                default:
                        break;
                }
        }

        vrf_map_unlock(vmap);
}

/* Return the vrf device index associated with @table_id in @net.
 *
 * The lookup is only performed in strict mode.  Returns -EPERM when strict
 * mode is off, -ENODEV when the table has no map entry (or the entry has no
 * VRF linked), otherwise the ifindex.
 */
static int vrf_ifindex_lookup_by_table_id(struct net *net, u32 table_id)
{
        struct vrf_map *vmap = netns_vrf_map(net);
        struct vrf_map_elem *elem;
        int ifindex = -EPERM;

        vrf_map_lock(vmap);

        if (vmap->strict_mode) {
                elem = vrf_map_lookup_elem(vmap, table_id);
                if (elem)
                        ifindex = vrf_map_elem_get_vrf_ifindex(elem);
                else
                        ifindex = -ENODEV;
        }

        vrf_map_unlock(vmap);

        return ifindex;
}

/* by default VRF devices do not have a qdisc and are expected
 * to be created with only a single queue.
 */
static bool qdisc_tx_is_default(const struct net_device *dev)
{
        struct netdev_queue *txq;
        struct Qdisc *qdisc;

        if (dev->num_tx_queues > 1)
                return false;

        txq = netdev_get_tx_queue(dev, 0);
        qdisc = rcu_access_pointer(txq->qdisc);

        return !qdisc->enqueue;
}

/* Locally generated traffic addressed to a local address: attach @dst and
 * feed the packet back into the rx path, similar to what loopback does.
 * Rx byte/packet or rx-dropped counters of @dev are updated depending on
 * the __netif_rx() result. Always returns NETDEV_TX_OK.
 */
static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
                          struct dst_entry *dst)
{
        /* length is sampled up front, before the skb is modified/handed off */
        const unsigned int pkt_len = skb->len;
        int rx_rc;

        skb_orphan(skb);
        skb_dst_set(skb, dst);

        /* PACKET_LOOPBACK keeps the skb from hitting packet taps twice -
         * once on Tx and again in Rx processing
         */
        skb->pkt_type = PACKET_LOOPBACK;
        skb->protocol = eth_type_trans(skb, dev);

        rx_rc = __netif_rx(skb);
        if (unlikely(rx_rc != NET_RX_SUCCESS))
                dev_dstats_rx_dropped(dev);
        else
                dev_dstats_rx_add(dev, pkt_len);

        return NETDEV_TX_OK;
}

/* Mark @skb as IP_CT_UNTRACKED unless it already carries conntrack info. */
static void vrf_nf_set_untracked(struct sk_buff *skb)
{
        if (skb_get_nfct(skb) != 0)
                return;

        nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
}

/* Clear the conntrack state of @skb, but only when it is exactly the
 * IP_CT_UNTRACKED marker (the value vrf_nf_set_untracked() installs).
 */
static void vrf_nf_reset_ct(struct sk_buff *skb)
{
        if (skb_get_nfct(skb) != IP_CT_UNTRACKED)
                return;

        nf_reset_ct(skb);
}

#if IS_ENABLED(CONFIG_IPV6)
/* Run the IPv6 LOCAL_OUT netfilter hook for @skb on the device of its dst
 * and, when the hook returns 1, continue with dst_output(). Any other hook
 * result is returned unchanged. The untracked conntrack marker is reset
 * first.
 */
static int vrf_ip6_local_out(struct net *net, struct sock *sk,
                             struct sk_buff *skb)
{
        int verdict;

        vrf_nf_reset_ct(skb);

        verdict = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
                          sk, skb, NULL, skb_dst(skb)->dev, dst_output);
        if (unlikely(verdict != 1))
                return verdict;

        return dst_output(net, sk, skb);
}

/* Handle an IPv6 packet transmitted on the VRF device @dev.
 *
 * A flow is built from the packet's IPv6 header with @dev as the l3mdev and
 * a route lookup is done. If the resulting dst points back at @dev the
 * packet is looped to the rx path through vrf_local_xmit(); otherwise the
 * dst is attached, the data in front of the network header is pulled and
 * the packet is sent with vrf_ip6_local_out().
 *
 * Returns the vrf_local_xmit() result for the loop-back case,
 * NET_XMIT_SUCCESS when net_xmit_eval() reports no error for the output
 * result, that output result otherwise (tx_errors is bumped), and
 * NET_XMIT_DROP after vrf_tx_error() when the header cannot be pulled or
 * the lookup fails or yields the null entry.
 */
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
                                           struct net_device *dev)
{
        const struct ipv6hdr *iph;
        struct net *net = dev_net(skb->dev);
        struct flowi6 fl6;
        int ret = NET_XMIT_DROP;
        struct dst_entry *dst;
        struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;

        /* ethernet + IPv6 headers must be in the linear area before reading */
        if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
                goto err;

        iph = ipv6_hdr(skb);

        memset(&fl6, 0, sizeof(fl6));
        /* needed to match OIF rule */
        fl6.flowi6_l3mdev = dev->ifindex;
        fl6.flowi6_iif = LOOPBACK_IFINDEX;
        fl6.daddr = iph->daddr;
        fl6.saddr = iph->saddr;
        fl6.flowlabel = ip6_flowinfo(iph);
        fl6.flowi6_mark = skb->mark;
        fl6.flowi6_proto = iph->nexthdr;

        dst = ip6_dst_lookup_flow(net, NULL, &fl6, NULL);
        if (IS_ERR(dst) || dst == dst_null)
                goto err;

        /* drop the dst currently attached; it is replaced below */
        skb_dst_drop(skb);

        /* if dst.dev is the VRF device again this is locally originated traffic
         * destined to a local address. Short circuit to Rx path.
         */
        if (dst->dev == dev)
                return vrf_local_xmit(skb, dev, dst);

        skb_dst_set(skb, dst);

        /* strip the ethernet header added for pass through VRF device */
        __skb_pull(skb, skb_network_offset(skb));

        /* start the output path with a zeroed IPv6 control block */
        memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
        ret = vrf_ip6_local_out(net, skb->sk, skb);
        if (unlikely(net_xmit_eval(ret)))
                dev->stats.tx_errors++;
        else
                ret = NET_XMIT_SUCCESS;

        return ret;
err:
        vrf_tx_error(dev, skb);
        return NET_XMIT_DROP;
}
#else
/* Variant used when IPv6 is not enabled in the build: every IPv6 frame is
 * reported through vrf_tx_error() and NET_XMIT_DROP is returned.
 */
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
                                           struct net_device *dev)
{
        vrf_tx_error(dev, skb);

        return NET_XMIT_DROP;
}
#endif

/* based on ip_local_out; can't use it b/c the dst is switched pointing to us
 *
 * Resets the untracked conntrack marker, runs the IPv4 LOCAL_OUT netfilter
 * hook on the device of the skb's dst and, when the hook returns 1,
 * continues with dst_output(). Any other hook result is returned as is.
 */
static int vrf_ip_local_out(struct net *net, struct sock *sk,
                            struct sk_buff *skb)
{
        int verdict;

        vrf_nf_reset_ct(skb);

        verdict = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
                          skb, NULL, skb_dst(skb)->dev, dst_output);
        if (unlikely(verdict != 1))
                return verdict;

        return dst_output(net, sk, skb);
}

/* Handle an IPv4 packet transmitted on the VRF device @vrf_dev.
 *
 * A flow is built from the packet's IPv4 header with @vrf_dev as the l3mdev
 * and an output route lookup is done. If the route's device is @vrf_dev
 * itself the packet is looped to the rx path through vrf_local_xmit();
 * otherwise the route is attached, the data in front of the network header
 * is pulled, a missing source address is filled in and the packet is sent
 * with vrf_ip_local_out().
 *
 * Returns the vrf_local_xmit() result for the loop-back case,
 * NET_XMIT_SUCCESS when net_xmit_eval() reports no error for the output
 * result, that output result otherwise (tx_errors is bumped), and
 * NET_XMIT_DROP after vrf_tx_error() when the header cannot be pulled or
 * the route lookup fails.
 */
static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
                                           struct net_device *vrf_dev)
{
        struct iphdr *ip4h;
        int ret = NET_XMIT_DROP;
        struct flowi4 fl4;
        struct net *net = dev_net(vrf_dev);
        struct rtable *rt;

        /* ethernet + IPv4 headers must be in the linear area before reading */
        if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
                goto err;

        ip4h = ip_hdr(skb);

        memset(&fl4, 0, sizeof(fl4));
        /* needed to match OIF rule */
        fl4.flowi4_l3mdev = vrf_dev->ifindex;
        fl4.flowi4_iif = LOOPBACK_IFINDEX;
        fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h));
        fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
        fl4.flowi4_proto = ip4h->protocol;
        fl4.daddr = ip4h->daddr;
        fl4.saddr = ip4h->saddr;

        rt = ip_route_output_flow(net, &fl4, NULL);
        if (IS_ERR(rt))
                goto err;

        /* drop the dst currently attached; it is replaced below */
        skb_dst_drop(skb);

        /* if dst.dev is the VRF device again this is locally originated traffic
         * destined to a local address. Short circuit to Rx path.
         */
        if (rt->dst.dev == vrf_dev)
                return vrf_local_xmit(skb, vrf_dev, &rt->dst);

        skb_dst_set(skb, &rt->dst);

        /* strip the ethernet header added for pass through VRF device */
        __skb_pull(skb, skb_network_offset(skb));

        /* no source address in the header: select one from the egress device */
        if (!ip4h->saddr) {
                ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
                                               RT_SCOPE_LINK);
        }

        /* start the output path with a zeroed IPv4 control block */
        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
        ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
        if (unlikely(net_xmit_eval(ret)))
                vrf_dev->stats.tx_errors++;
        else
                ret = NET_XMIT_SUCCESS;

out:
        return ret;
err:
        /* ret still holds its initial NET_XMIT_DROP on this path */
        vrf_tx_error(vrf_dev, skb);
        goto out;
}

/* Dispatch a frame transmitted on the VRF device by its protocol: IPv4 and
 * IPv6 go to their outbound handlers, anything else is reported through
 * vrf_tx_error() and NET_XMIT_DROP is returned.
 */
static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
{
        if (skb->protocol == htons(ETH_P_IP))
                return vrf_process_v4_outbound(skb, dev);

        if (skb->protocol == htons(ETH_P_IPV6))
                return vrf_process_v6_outbound(skb, dev);

        vrf_tx_error(dev, skb);

        return NET_XMIT_DROP;
}

/* ndo_start_xmit handler: pass the frame to is_ip_tx_frame() and account
 * the result. NET_XMIT_SUCCESS and NET_XMIT_CN count as transmitted (with
 * the length sampled before processing); everything else counts as a tx
 * drop. The processing result is returned to the caller.
 */
static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
{
        const unsigned int pkt_len = skb->len;
        netdev_tx_t rc = is_ip_tx_frame(skb, dev);

        if (unlikely(rc != NET_XMIT_SUCCESS && rc != NET_XMIT_CN))
                dev_dstats_tx_dropped(dev);
        else
                dev_dstats_tx_add(dev, pkt_len);

        return rc;
}

/* Final step of the direct output path. When the VRF device (skb->dev) has
 * entries on its ptype_all list and the skb has at least ETH_HLEN bytes of
 * headroom, a temporary ethernet header (source = VRF device address,
 * destination = zero, protocol = skb->protocol) is pushed, the skb is
 * handed to dev_queue_xmit_nit() and the header is pulled again. The
 * untracked conntrack marker is reset in every case.
 */
static void vrf_finish_direct(struct sk_buff *skb)
{
        struct net_device *vrf_dev = skb->dev;
        struct ethhdr *eth;

        if (list_empty(&vrf_dev->ptype_all) ||
            unlikely(skb_headroom(skb) < ETH_HLEN))
                goto reset_ct;

        eth = skb_push(skb, ETH_HLEN);
        ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
        eth_zero_addr(eth->h_dest);
        eth->h_proto = skb->protocol;

        rcu_read_lock_bh();
        dev_queue_xmit_nit(skb, vrf_dev);
        rcu_read_unlock_bh();

        /* undo the push so the skb starts where it did on entry */
        skb_pull(skb, ETH_HLEN);

reset_ct:
        vrf_nf_reset_ct(skb);
}

#if IS_ENABLED(CONFIG_IPV6)
/* modelled after ip6_finish_output2
 *
 * Resolve the neighbour for the next hop of the skb's dst (creating the
 * entry if the lookup finds none) and transmit through neigh_output().
 * When no neighbour can be obtained the OUTNOROUTES counter is bumped, the
 * skb is freed and -EINVAL is returned.
 */
static int vrf_finish_output6(struct net *net, struct sock *sk,
                              struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *out_dev = dst->dev;
        const struct in6_addr *nh;
        struct neighbour *n;
        int rc;

        vrf_nf_reset_ct(skb);

        skb->dev = out_dev;
        skb->protocol = htons(ETH_P_IPV6);

        rcu_read_lock();

        nh = rt6_nexthop(dst_rt6_info(dst), &ipv6_hdr(skb)->daddr);
        n = __ipv6_neigh_lookup_noref(dst->dev, nh);
        if (unlikely(!n))
                n = __neigh_create(&nd_tbl, nh, dst->dev, false);

        if (IS_ERR(n)) {
                rcu_read_unlock();

                IP6_INC_STATS(dev_net(dst->dev),
                              ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
                kfree_skb(skb);
                return -EINVAL;
        }

        sock_confirm_neigh(skb, n);
        rc = neigh_output(n, skb, false);
        rcu_read_unlock();

        return rc;
}

/* modelled after ip6_output */
static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, skb_dst(skb)->dev,
                            vrf_finish_output6,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/* set dst on skb to send packet to us via dev_xmit path. Allows
 * packet to go through device based features such as qdisc, netfilter
 * hooks and packet sockets with skb->dev set to vrf device.
 *
 * Returns @skb with the VRF's IPv6 dst attached (a reference is taken under
 * RCU), or NULL after vrf_tx_error() when the VRF has no IPv6 dst.
 */
static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
                                            struct sk_buff *skb)
{
        struct net_vrf *vrf = netdev_priv(vrf_dev);
        struct rt6_info *rt6;

        rcu_read_lock();
        rt6 = rcu_dereference(vrf->rt6);
        if (likely(rt6))
                dst_hold(&rt6->dst);
        rcu_read_unlock();

        if (unlikely(!rt6)) {
                vrf_tx_error(vrf_dev, skb);
                return NULL;
        }

        /* swap the skb's current dst for the one pointing at the VRF */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rt6->dst);

        return skb;
}

/* okfn for the POST_ROUTING hook in vrf_output6_direct(): finish the direct
 * path on the VRF device, then hand the packet to vrf_ip6_local_out().
 */
static int vrf_output6_direct_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        int rc;

        vrf_finish_direct(skb);
        rc = vrf_ip6_local_out(net, sk, skb);

        return rc;
}

static int vrf_output6_direct(struct net *net, struct sock *sk,
                              struct sk_buff *skb)
{
        int err = 1;

        skb->protocol = htons(ETH_P_IPV6);

        if (!(IPCB(skb)->flags & IPSKB_REROUTED))
                err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
                              NULL, skb->dev, vrf_output6_direct_finish);

        if (likely(err == 1))
                vrf_finish_direct(skb);

        return err;
}

/* okfn for the LOCAL_OUT hook in vrf_ip6_out_direct(): run the direct
 * POST_ROUTING stage and, when it returns 1, send the packet with
 * vrf_ip6_local_out(). Any other result is returned unchanged.
 */
static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk,
                                     struct sk_buff *skb)
{
        int rc = vrf_output6_direct(net, sk, skb);

        if (unlikely(rc != 1))
                return rc;

        return vrf_ip6_local_out(net, sk, skb);
}

/* Direct IPv6 output path: with skb->dev set to the VRF device, run the
 * LOCAL_OUT hook and then the direct POST_ROUTING stage.
 *
 * Returns @skb when both stages return 1, NULL otherwise.
 */
static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
                                          struct sock *sk,
                                          struct sk_buff *skb)
{
        struct net *net = dev_net(vrf_dev);
        int verdict;

        skb->dev = vrf_dev;

        verdict = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
                          skb, NULL, vrf_dev, vrf_ip6_out_direct_finish);
        if (likely(verdict == 1))
                verdict = vrf_output6_direct(net, sk, skb);

        return likely(verdict == 1) ? skb : NULL;
}

static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
                                   struct sock *sk,
                                   struct sk_buff *skb)
{
        /* don't divert link scope packets */
        if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
                return skb;

        vrf_nf_set_untracked(skb);

        if (qdisc_tx_is_default(vrf_dev) ||
            IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
                return vrf_ip6_out_direct(vrf_dev, sk, skb);

        return vrf_ip6_out_redirect(vrf_dev, skb);
}

/* Detach and release the VRF's IPv6 dst; holding rtnl.
 *
 * The pointer is cleared and an RCU grace period is awaited before the old
 * dst is touched.
 */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
        struct rt6_info *old = rtnl_dereference(vrf->rt6);
        struct net *net = dev_net(dev);

        RCU_INIT_POINTER(vrf->rt6, NULL);
        synchronize_rcu();

        if (!old)
                return;

        /* move dev in dst's to loopback so this VRF device can be deleted
         * - based on dst_ifdown
         */
        netdev_ref_replace(old->dst.dev, net->loopback_dev,
                           &old->dst.dev_tracker, GFP_KERNEL);
        old->dst.dev = net->loopback_dev;
        dst_release(&old->dst);
}

/* Set up the IPv6 side of a VRF device: obtain the FIB6 table for the
 * VRF's table id and allocate the dst used to route packets out through
 * the device (output handler vrf_output6).
 *
 * Returns 0 on success or when IPv6 is disabled at runtime, -ENOMEM when
 * the table or the dst cannot be obtained.
 */
static int vrf_rt6_create(struct net_device *dev)
{
        struct net_vrf *vrf = netdev_priv(dev);
        struct net *net = dev_net(dev);
        struct rt6_info *rt6;

        /* IPv6 can be CONFIG enabled and then disabled runtime */
        if (!ipv6_mod_enabled())
                return 0;

        vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
        if (!vrf->fib6_table)
                return -ENOMEM;

        /* create a dst for routing packets out a VRF device */
        rt6 = ip6_dst_alloc(net, dev, DST_NOPOLICY | DST_NOXFRM);
        if (!rt6)
                return -ENOMEM;

        rt6->dst.output = vrf_output6;
        rcu_assign_pointer(vrf->rt6, rt6);

        return 0;
}
#else
/* Variant used when IPv6 is not enabled in the build: @skb is returned
 * unchanged.
 */
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
                                   struct sock *sk, struct sk_buff *skb)
{
        return skb;
}

/* Variant used when IPv6 is not enabled in the build: nothing to release. */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
        /* intentionally empty */
}

/* Variant used when IPv6 is not enabled in the build: nothing to create,
 * always reports success.
 */
static int vrf_rt6_create(struct net_device *dev)
{
        return 0;
}
#endif

/* modelled after ip_finish_output2
 *
 * Make sure the skb has room for the link-layer header, resolve the
 * neighbour for the route's gateway and transmit through neigh_output().
 * Returns -ENOMEM when the head cannot be expanded (tx_errors is bumped)
 * and -EINVAL, after vrf_tx_error(), when no neighbour can be obtained.
 */
static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct rtable *rt = dst_rtable(dst);
        unsigned int hh_len = LL_RESERVED_SPACE(dev);
        struct neighbour *neigh;
        bool is_v6gw = false;
        int rc;

        vrf_nf_reset_ct(skb);

        /* Be paranoid, rather than too clever. */
        if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
                skb = skb_expand_head(skb, hh_len);
                if (!skb) {
                        dev->stats.tx_errors++;
                        return -ENOMEM;
                }
        }

        rcu_read_lock();

        neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
        if (IS_ERR(neigh)) {
                rcu_read_unlock();
                vrf_tx_error(skb->dev, skb);
                return -EINVAL;
        }

        sock_confirm_neigh(skb, neigh);
        /* if crossing protocols, can not use the cached header */
        rc = neigh_output(neigh, skb, is_v6gw);
        rcu_read_unlock();

        return rc;
}

static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;

        IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);

        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
                            net, sk, skb, NULL, dev,
                            vrf_finish_output,
                            !(IPCB(skb)->flags & IPSKB_REROUTED));
}

/* set dst on skb to send packet to us via dev_xmit path. Allows
 * packet to go through device based features such as qdisc, netfilter
 * hooks and packet sockets with skb->dev set to vrf device.
 *
 * Returns @skb with the VRF's IPv4 dst attached (a reference is taken under
 * RCU), or NULL after vrf_tx_error() when the VRF has no IPv4 dst.
 */
static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
                                           struct sk_buff *skb)
{
        struct net_vrf *vrf = netdev_priv(vrf_dev);
        struct rtable *rth;

        rcu_read_lock();
        rth = rcu_dereference(vrf->rth);
        if (likely(rth))
                dst_hold(&rth->dst);
        rcu_read_unlock();

        if (unlikely(!rth)) {
                vrf_tx_error(vrf_dev, skb);
                return NULL;
        }

        /* swap the skb's current dst for the one pointing at the VRF */
        skb_dst_drop(skb);
        skb_dst_set(skb, &rth->dst);

        return skb;
}

/* okfn for the POST_ROUTING hook in vrf_output_direct(): finish the direct
 * path on the VRF device, then hand the packet to vrf_ip_local_out().
 */
static int vrf_output_direct_finish(struct net *net, struct sock *sk,
                                    struct sk_buff *skb)
{
        int rc;

        vrf_finish_direct(skb);
        rc = vrf_ip_local_out(net, sk, skb);

        return rc;
}

static int vrf_output_direct(struct net *net, struct sock *sk,
                             struct sk_buff *skb)
{
        int err = 1;

        skb->protocol = htons(ETH_P_IP);

        if (!(IPCB(skb)->flags & IPSKB_REROUTED))
                err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
                              NULL, skb->dev, vrf_output_direct_finish);

        if (likely(err == 1))
                vrf_finish_direct(skb);

        return err;
}

/* okfn for the LOCAL_OUT hook in vrf_ip_out_direct(): run the direct
 * POST_ROUTING stage and, when it returns 1, send the packet with
 * vrf_ip_local_out(). Any other result is returned unchanged.
 */
static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk,
                                    struct sk_buff *skb)
{
        int rc = vrf_output_direct(net, sk, skb);

        if (unlikely(rc != 1))
                return rc;

        return vrf_ip_local_out(net, sk, skb);
}

/* Direct IPv4 output path: with skb->dev set to the VRF device, run the
 * LOCAL_OUT hook and then the direct POST_ROUTING stage.
 *
 * Returns @skb when both stages return 1, NULL otherwise.
 */
static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
                                         struct sock *sk,
                                         struct sk_buff *skb)
{
        struct net *net = dev_net(vrf_dev);
        int verdict;

        skb->dev = vrf_dev;

        verdict = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
                          skb, NULL, vrf_dev, vrf_ip_out_direct_finish);
        if (likely(verdict == 1))
                verdict = vrf_output_direct(net, sk, skb);

        return likely(verdict == 1) ? skb : NULL;
}

static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
                                  struct sock *sk,
                                  struct sk_buff *skb)
{
        /* don't divert multicast or local broadcast */
        if (ipv4_is_multicast(ip_hdr(skb)->daddr) ||
            ipv4_is_lbcast(ip_hdr(skb)->daddr))
                return skb;

        vrf_nf_set_untracked(skb);

        if (qdisc_tx_is_default(vrf_dev) ||
            IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
                return vrf_ip_out_direct(vrf_dev, sk, skb);

        return vrf_ip_out_redirect(vrf_dev, skb);
}

/* l3mdev output entry point: dispatch on the address family in @proto.
 * Families other than AF_INET / AF_INET6 get @skb back unchanged.
 *
 * called with rcu lock held
 */
static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
                                  struct sock *sk,
                                  struct sk_buff *skb,
                                  u16 proto)
{
        if (proto == AF_INET)
                return vrf_ip_out(vrf_dev, sk, skb);

        if (proto == AF_INET6)
                return vrf_ip6_out(vrf_dev, sk, skb);

        return skb;
}

/* Detach and release the VRF's IPv4 dst; holding rtnl.
 *
 * The pointer is cleared and an RCU grace period is awaited before the old
 * dst is touched.
 */
static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf)
{
        struct rtable *old = rtnl_dereference(vrf->rth);
        struct net *net = dev_net(dev);

        RCU_INIT_POINTER(vrf->rth, NULL);
        synchronize_rcu();

        if (!old)
                return;

        /* move dev in dst's to loopback so this VRF device can be deleted
         * - based on dst_ifdown
         */
        netdev_ref_replace(old->dst.dev, net->loopback_dev,
                           &old->dst.dev_tracker, GFP_KERNEL);
        old->dst.dev = net->loopback_dev;
        dst_release(&old->dst);
}

static int vrf_rtable_create(struct net_device *dev)
{
        struct net_vrf *vrf = netdev_priv(dev);
        struct rtable *rth;

        if (!fib_new_table(dev_net(dev), vrf->tb_id))
                return -ENOMEM;

        /* create a dst for routing packets out through a VRF device */
        rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1);
        if (!rth)
                return -ENOMEM;

        rth->dst.output        = vrf_output;

        rcu_assign_pointer(vrf->rth, rth);

        return 0;
}

/**************************** device handling ********************/

/* cycle interface to flush neighbor cache and move routes across tables
 *
 * A running device is brought down and then restored to its previous
 * flags. A failure of either step is only logged.
 */
static void cycle_netdev(struct net_device *dev,
                         struct netlink_ext_ack *extack)
{
        unsigned int saved_flags = dev->flags;
        int err;

        if (!netif_running(dev))
                return;

        err = dev_change_flags(dev, saved_flags & ~IFF_UP, extack);
        if (err < 0)
                goto fail;

        err = dev_change_flags(dev, saved_flags, extack);
        if (err >= 0)
                return;

fail:
        netdev_err(dev,
                   "Failed to cycle device %s; route tables might be wrong!\n",
                   dev->name);
}

/* Enslave @port_dev to the VRF device @dev.
 *
 * The port is flagged IFF_L3MDEV_SLAVE, linked below @dev as its master and
 * then cycled. Returns 0 on success, -EOPNOTSUPP for the loopback device,
 * or the linking error (the flag is cleared again in that case).
 */
static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
                            struct netlink_ext_ack *extack)
{
        int err;

        /* do not allow loopback device to be enslaved to a VRF.
         * The vrf device acts as the loopback for the vrf.
         */
        if (port_dev == dev_net(dev)->loopback_dev) {
                NL_SET_ERR_MSG(extack,
                               "Can not enslave loopback device to a VRF");
                return -EOPNOTSUPP;
        }

        port_dev->priv_flags |= IFF_L3MDEV_SLAVE;

        err = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
        if (err < 0) {
                /* linking failed: undo the flag set above */
                port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
                return err;
        }

        cycle_netdev(port_dev, extack);

        return 0;
}

/* ndo_add_slave handler: validate @port_dev and enslave it to the VRF
 * device @dev.
 *
 * Returns -EINVAL, with an extack message, when @port_dev is itself an L3
 * master device or is already an L3 slave; otherwise the result of
 * do_vrf_add_slave().
 */
static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
                         struct netlink_ext_ack *extack)
{
        if (netif_is_l3_master(port_dev)) {
                NL_SET_ERR_MSG(extack,
                               "Can not enslave an L3 master device to a VRF");
                return -EINVAL;
        }

        /* report the reason here as well, so that userspace does not get a
         * bare -EINVAL for this rejection
         */
        if (netif_is_l3_slave(port_dev)) {
                NL_SET_ERR_MSG(extack,
                               "Device is already enslaved to an L3 master device");
                return -EINVAL;
        }

        return do_vrf_add_slave(dev, port_dev, extack);
}

/* inverse of do_vrf_add_slave: unlink @port_dev from @dev, clear its
 * IFF_L3MDEV_SLAVE flag and cycle it. Always returns 0.
 */
static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
        netdev_upper_dev_unlink(port_dev, dev);

        port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
        cycle_netdev(port_dev, NULL);

        return 0;
}

/* ndo_del_slave handler: release @port_dev from the VRF device @dev. */
static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
        int rc = do_vrf_del_slave(dev, port_dev);

        return rc;
}

/* ndo_uninit handler: release the IPv4 dst and then the IPv6 dst of the
 * VRF device.
 */
static void vrf_dev_uninit(struct net_device *dev)
{
        struct net_vrf *priv = netdev_priv(dev);

        vrf_rtable_release(dev, priv);
        vrf_rt6_release(dev, priv);
}

/* ndo_init handler: create the IPv4 and IPv6 dsts of the VRF device and set
 * its initial flags and operational state.
 *
 * Returns 0 on success and -ENOMEM when either dst cannot be created; an
 * IPv4 dst that was already created is released again on IPv6 failure.
 */
static int vrf_dev_init(struct net_device *dev)
{
        struct net_vrf *vrf = netdev_priv(dev);

        /* create the default dst which points back to us */
        if (vrf_rtable_create(dev) != 0)
                return -ENOMEM;

        if (vrf_rt6_create(dev) != 0) {
                vrf_rtable_release(dev, vrf);
                return -ENOMEM;
        }

        dev->flags = IFF_MASTER | IFF_NOARP;

        /* similarly, oper state is irrelevant; set to up to avoid confusion */
        dev->operstate = IF_OPER_UP;
        netdev_lockdep_set_classes(dev);

        return 0;
}

/* net_device callbacks implemented by the VRF driver */
static const struct net_device_ops vrf_netdev_ops = {
        /* device lifetime */
        .ndo_init = vrf_dev_init,
        .ndo_uninit = vrf_dev_uninit,
        /* enslaving / releasing ports */
        .ndo_add_slave = vrf_add_slave,
        .ndo_del_slave = vrf_del_slave,
        /* data path and addressing */
        .ndo_start_xmit = vrf_xmit,
        .ndo_set_mac_address = eth_mac_addr,
};

static u32 vrf_fib_table(const struct net_device *dev)
{
        struct net_vrf *vrf = netdev_priv(dev);

        return vrf->tb_id;
}

/* okfn passed to nf_hook() by vrf_rcv_nfhook(): frees @skb and returns 0. */
static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        kfree_skb(skb);

        return 0;
}

/* Run netfilter hook @hook of family @pf for @skb received on @dev.
 *
 * Returns @skb when nf_hook() returns 1, NULL for any other result.
 */
static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
                                      struct sk_buff *skb,
                                      struct net_device *dev)
{
        int verdict;

        verdict = nf_hook(pf, hook, dev_net(dev), NULL, skb, dev, NULL,
                          vrf_rcv_finish);

        /* on any other verdict, kfree_skb(skb) is handled by nf code */
        return verdict == 1 ? skb : NULL;
}

/* Build an ethernet header in front of the current data of @skb.
 *
 * Both the destination and the source address are set to the address of
 * @vrf_dev and the ethertype to @proto. The mac header offset and length
 * of the skb are set accordingly, skb->protocol and skb->pkt_type are
 * updated, and the data pointer is moved back past the new header before
 * returning.
 *
 * Returns 0 on success, -ENOBUFS when skb_cow_head() fails.
 */
static int vrf_prepare_mac_header(struct sk_buff *skb,
                                  struct net_device *vrf_dev, u16 proto)
{
        struct ethhdr *eth;
        int err;

        /* in general, we do not know if there is enough space in the head of
         * the packet for hosting the mac header.
         */
        err = skb_cow_head(skb, LL_RESERVED_SPACE(vrf_dev));
        if (unlikely(err))
                /* no space in the skb head */
                return -ENOBUFS;

        __skb_push(skb, ETH_HLEN);
        eth = (struct ethhdr *)skb->data;

        /* record where the header just pushed starts */
        skb_reset_mac_header(skb);
        skb_reset_mac_len(skb);

        /* we set the ethernet destination and the source addresses to the
         * address of the VRF device.
         */
        ether_addr_copy(eth->h_dest, vrf_dev->dev_addr);
        ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
        eth->h_proto = htons(proto);

        /* the destination address of the Ethernet frame corresponds to the
         * address set on the VRF interface; therefore, the packet is intended
         * to be processed locally.
         */
        skb->protocol = eth->h_proto;
        skb->pkt_type = PACKET_HOST;

        /* update the skb checksum for the ETH_HLEN bytes pushed above */
        skb_postpush_rcsum(skb, skb->data, ETH_HLEN);

        /* move the data pointer back past the header that was pushed */
        skb_pull_inline(skb, ETH_HLEN);

        return 0;
}

/* Prepare and add a mac header to the packet unless one is already in
 * place, so that packet sniffers such as tcpdump can parse the packet
 * correctly. An existing header is kept only when the skb's mac header was
 * set and @orig_dev has a header; in that case the function returns 0
 * without touching the packet. Otherwise the result of
 * vrf_prepare_mac_header() is returned.
 */
static int vrf_add_mac_header_if_unset(struct sk_buff *skb,
                                       struct net_device *vrf_dev,
                                       u16 proto, struct net_device *orig_dev)
{
        bool have_hdr;

        have_hdr = skb_mac_header_was_set(skb) && dev_has_header(orig_dev);
        if (!have_hdr)
                return vrf_prepare_mac_header(skb, vrf_dev, proto);

        return 0;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Neighbor handling is done with the actual device, so skb->dev must not
 * be flipped for ndisc packets. Only the next header directly following
 * the IPv6 header is inspected, which really fails for multiple next
 * protocols (e.g., NEXTHDR_HOP). But it is a start.
 *
 * Returns true for router/neighbour solicitations and advertisements and
 * for redirects; false otherwise, including when the ICMPv6 header cannot
 * be read.
 */
static bool ipv6_ndisc_frame(const struct sk_buff *skb)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        const struct icmp6hdr *icmph;
        struct icmp6hdr _icmph;

        if (iph->nexthdr != NEXTHDR_ICMP)
                return false;

        icmph = skb_header_pointer(skb, sizeof(*iph),
                                   sizeof(_icmph), &_icmph);
        if (!icmph)
                return false;

        switch (icmph->icmp6_type) {
        case NDISC_ROUTER_SOLICITATION:
        case NDISC_ROUTER_ADVERTISEMENT:
        case NDISC_NEIGHBOUR_SOLICITATION:
        case NDISC_NEIGHBOUR_ADVERTISEMENT:
        case NDISC_REDIRECT:
                return true;
        default:
                return false;
        }
}

/* Policy route lookup restricted to the FIB6 table of the VRF device @dev;
 * all other arguments are passed through to ip6_pol_route().
 */
static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
                                             const struct net_device *dev,
                                             struct flowi6 *fl6,
                                             int ifindex,
                                             const struct sk_buff *skb,
                                             int flags)
{
        struct net_vrf *priv = netdev_priv(dev);
        struct fib6_table *table = priv->fib6_table;

        return ip6_pol_route(net, table, ifindex, fl6, skb, flags);
}

/* Look up the input route for @skb in the VRF's table, using @ifindex as
 * the incoming interface, and attach it to the skb. The skb is left
 * without this dst when the lookup returns nothing or the null entry.
 */
static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
                              int ifindex)
{
        const struct ipv6hdr *iph = ipv6_hdr(skb);
        struct net *net = dev_net(vrf_dev);
        struct flowi6 fl6 = {
                .daddr          = iph->daddr,
                .saddr          = iph->saddr,
                .flowlabel      = ip6_flowinfo(iph),
                .flowi6_proto   = iph->nexthdr,
                .flowi6_mark    = skb->mark,
                .flowi6_iif     = ifindex,
        };
        int lookup_flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE;
        struct rt6_info *rt6;

        rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb,
                                   lookup_flags);

        if (likely(rt6) &&
            likely(&rt6->dst != &net->ipv6.ip6_null_entry->dst))
                skb_dst_set(skb, &rt6->dst);
}

/* IPv6 receive handling for a packet arriving through the VRF device
 * @vrf_dev.
 *
 * Loopback packets and non-ndisc packets that need strict handling are
 * switched to the VRF device and returned without running packet taps or
 * netfilter. Other non-ndisc packets are counted on the VRF device,
 * switched to it and shown to packet taps when the device has any; ndisc
 * packets keep their ingress device. The remaining packets then go through
 * the IPv6 PRE_ROUTING hook.
 *
 * Returns the skb to continue with, or NULL when vrf_rcv_nfhook() returns
 * NULL.
 */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
                                   struct sk_buff *skb)
{
        /* remember the ingress ifindex; skb->skb_iif is overwritten below */
        int orig_iif = skb->skb_iif;
        bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
        bool is_ndisc = ipv6_ndisc_frame(skb);

        /* loopback, multicast & non-ND link-local traffic; do not push through
         * packet taps again. Reset pkt_type for upper layers to process skb.
         * For non-loopback strict packets, determine the dst using the original
         * ifindex.
         */
        if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
                skb->dev = vrf_dev;
                skb->skb_iif = vrf_dev->ifindex;
                IP6CB(skb)->flags |= IP6SKB_L3SLAVE;

                if (skb->pkt_type == PACKET_LOOPBACK)
                        skb->pkt_type = PACKET_HOST;
                else
                        vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

                goto out;
        }

        /* if packet is NDISC then keep the ingress interface */
        if (!is_ndisc) {
                /* keep the ingress device for the mac header check below */
                struct net_device *orig_dev = skb->dev;

                dev_dstats_rx_add(vrf_dev, skb->len);
                skb->dev = vrf_dev;
                skb->skb_iif = vrf_dev->ifindex;

                /* only build a mac header when the device has packet taps */
                if (!list_empty(&vrf_dev->ptype_all)) {
                        int err;

                        err = vrf_add_mac_header_if_unset(skb, vrf_dev,
                                                          ETH_P_IPV6,
                                                          orig_dev);
                        if (likely(!err)) {
                                /* expose the mac header to the taps, then
                                 * hide it again
                                 */
                                skb_push(skb, skb->mac_len);
                                dev_queue_xmit_nit(skb, vrf_dev);
                                skb_pull(skb, skb->mac_len);
                        }
                }

                IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
        }

        /* only ndisc packets can still be strict here; the non-ndisc strict
         * case has already left through the first branch
         */
        if (need_strict)
                vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

        skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
        return skb;
}

#else
/* Variant used when IPv6 is not enabled in the build: @skb is returned
 * unchanged.
 */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
                                   struct sk_buff *skb)
{
        struct sk_buff *unchanged = skb;

        return unchanged;
}
#endif

/* IPv4 receive handling for a packet arriving through the VRF device
 * @vrf_dev.
 *
 * The skb is always switched to the VRF device and flagged IPSKB_L3SLAVE.
 * Multicast and loopback packets are returned right after that; all other
 * packets are counted on the VRF device, shown to packet taps when the
 * device has any, and passed through the IPv4 PRE_ROUTING hook.
 *
 * Returns the skb to continue with, or NULL when vrf_rcv_nfhook() returns
 * NULL.
 */
static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
                                  struct sk_buff *skb)
{
        /* keep the ingress device for the mac header check below */
        struct net_device *orig_dev = skb->dev;

        skb->dev = vrf_dev;
        skb->skb_iif = vrf_dev->ifindex;
        IPCB(skb)->flags |= IPSKB_L3SLAVE;

        /* multicast packets skip rx accounting, packet taps and the hook */
        if (ipv4_is_multicast(ip_hdr(skb)->daddr))
                goto out;

        /* loopback traffic; do not push through packet taps again.
         * Reset pkt_type for upper layers to process skb
         */
        if (skb->pkt_type == PACKET_LOOPBACK) {
                skb->pkt_type = PACKET_HOST;
                goto out;
        }

        dev_dstats_rx_add(vrf_dev, skb->len);

        /* only build a mac header when the device has packet taps */
        if (!list_empty(&vrf_dev->ptype_all)) {
                int err;

                err = vrf_add_mac_header_if_unset(skb, vrf_dev, ETH_P_IP,
                                                  orig_dev);
                if (likely(!err)) {
                        /* expose the mac header to the taps, then hide it
                         * again
                         */
                        skb_push(skb, skb->mac_len);
                        dev_queue_xmit_nit(skb, vrf_dev);
                        skb_pull(skb, skb->mac_len);
                }
        }

        skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
        return skb;
}

/* l3mdev receive entry point: dispatch on the address family in @proto.
 * Families other than AF_INET / AF_INET6 get @skb back unchanged.
 *
 * called with rcu lock held
 */
static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
                                  struct sk_buff *skb,
                                  u16 proto)
{
        if (proto == AF_INET)
                return vrf_ip_rcv(vrf_dev, skb);

        if (proto == AF_INET6)
                return vrf_ip6_rcv(vrf_dev, skb);

        return skb;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Route lookup for link-local or multicast destinations sent through an
 * interface enslaved to a VRF device. The lookup is forced into the VRF
 * table without modifying the flow struct.
 *
 * Returns the null entry's dst when the flow's output interface is the VRF
 * device itself, the dst of the route found by vrf_ip6_route_lookup()
 * otherwise, or NULL when that lookup yields nothing.
 *
 * Note: Caller to this function must hold rcu_read_lock() and no refcnt
 * is taken on the dst by this function.
 */
static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
                                              struct flowi6 *fl6)
{
        int lookup_flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_DST_NOREF;
        struct net *net = dev_net(dev);
        struct rt6_info *rt;

        /* VRF device does not have a link-local address and
         * sending packets to link-local or mcast addresses over
         * a VRF device does not make sense
         */
        if (fl6->flowi6_oif == dev->ifindex)
                return &net->ipv6.ip6_null_entry->dst;

        if (!ipv6_addr_any(&fl6->saddr))
                lookup_flags |= RT6_LOOKUP_F_HAS_SADDR;

        rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL,
                                  lookup_flags);

        return rt ? &rt->dst : NULL;
}
#endif

/* l3mdev callbacks implemented by the VRF device; installed on each device
 * in vrf_setup(). The link-scope lookup is only wired up when IPv6 is
 * enabled.
 */
static const struct l3mdev_ops vrf_l3mdev_ops = {
        .l3mdev_fib_table        = vrf_fib_table,
        .l3mdev_l3_rcv                = vrf_l3_rcv,
        .l3mdev_l3_out                = vrf_l3_out,
#if IS_ENABLED(CONFIG_IPV6)
        .l3mdev_link_scope_lookup = vrf_link_scope_lookup,
#endif
};

/* ethtool get_drvinfo callback: report the driver name and version by
 * copying DRV_NAME and DRV_VERSION into @info, bounded by the size of each
 * destination field.
 */
static void vrf_get_drvinfo(struct net_device *dev,
                            struct ethtool_drvinfo *info)
{
        strscpy(info->driver, DRV_NAME, sizeof(info->driver));
        strscpy(info->version, DRV_VERSION, sizeof(info->version));
}

/* ethtool operations for VRF devices; only driver info is provided. */
static const struct ethtool_ops vrf_ethtool_ops = {
        .get_drvinfo        = vrf_get_drvinfo,
};

/* Size of the netlink message built by vrf_fib_rule(): the aligned
 * fib_rule_hdr plus one attribute each for FRA_L3MDEV, FRA_PRIORITY and
 * FRA_PROTOCOL.
 */
static inline size_t vrf_fib_rule_nl_size(void)
{
        size_t hdr_len = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
        size_t attr_len = 0;

        attr_len += nla_total_size(sizeof(u8));        /* FRA_L3MDEV */
        attr_len += nla_total_size(sizeof(u32));       /* FRA_PRIORITY */
        attr_len += nla_total_size(sizeof(u8));        /* FRA_PROTOCOL */

        return hdr_len + attr_len;
}

/* Add or delete the l3mdev FIB rule for one address family.
 *
 * @dev:    device whose network namespace the rule lives in
 * @family: rule family (AF_INET, AF_INET6, RTNL_FAMILY_IPMR, ...)
 * @add_it: true to install the rule, false to remove it
 *
 * A netlink rule message (FR_ACT_TO_TBL, FRA_L3MDEV set, priority
 * FIB_RULE_PREF, protocol RTPROT_KERNEL) is built and passed to
 * fib_newrule() or fib_delrule(). An already existing rule on add and a
 * missing rule on delete are not treated as errors. IPv6 families are
 * silently skipped when ipv6_mod_enabled() is false.
 *
 * Returns 0 on success, -ENOMEM if the message cannot be allocated,
 * -EMSGSIZE if it cannot be filled in, or the error from the rule call.
 */
static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
{
        struct fib_rule_hdr *frh;
        struct nlmsghdr *nlh;
        struct sk_buff *skb;
        int rc = -EMSGSIZE;        /* result for every message-building failure */

        if ((family == AF_INET6 || family == RTNL_FAMILY_IP6MR) &&
            !ipv6_mod_enabled())
                return 0;

        skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
        if (!nlh)
                goto free_msg;

        /* rule only needs to appear once */
        nlh->nlmsg_flags |= NLM_F_EXCL;

        frh = nlmsg_data(nlh);
        memset(frh, 0, sizeof(*frh));
        frh->family = family;
        frh->action = FR_ACT_TO_TBL;

        if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL) ||
            nla_put_u8(skb, FRA_L3MDEV, 1) ||
            nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
                goto free_msg;

        nlmsg_end(skb, nlh);

        if (add_it) {
                rc = fib_newrule(dev_net(dev), skb, nlh, NULL, true);
                if (rc == -EEXIST)
                        rc = 0;
        } else {
                rc = fib_delrule(dev_net(dev), skb, nlh, NULL, true);
                if (rc == -ENOENT)
                        rc = 0;
        }

free_msg:
        nlmsg_free(skb);

        return rc;
}

/* Install the l3mdev FIB rules for every supported family: AF_INET,
 * AF_INET6 and, when the respective multiple-tables options are enabled,
 * RTNL_FAMILY_IPMR and RTNL_FAMILY_IP6MR.
 *
 * On failure every rule added so far is removed again, an error is logged
 * and the negative error code of the failing step is returned. Returns 0
 * when all rules are in place.
 */
static int vrf_add_fib_rules(const struct net_device *dev)
{
        int err;

        err = vrf_fib_rule(dev, AF_INET,  true);
        if (err < 0)
                goto out_err;

        err = vrf_fib_rule(dev, AF_INET6, true);
        if (err < 0)
                goto ipv6_err;

#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
        err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true);
        if (err < 0)
                goto ipmr_err;
#endif

#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
        err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
        if (err < 0)
                goto ip6mr_err;
#endif

        return 0;

        /* Unwind in reverse order of installation. The AF_INET6 rule must
         * be rolled back when either multicast step fails, so its removal
         * may not depend on CONFIG_IP_MROUTE_MULTIPLE_TABLES alone;
         * otherwise an IP6MR failure with only
         * CONFIG_IPV6_MROUTE_MULTIPLE_TABLES enabled would leave the
         * AF_INET6 rule behind.
         */
#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
ip6mr_err:
#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
        vrf_fib_rule(dev, RTNL_FAMILY_IPMR,  false);
#endif
#endif

#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
ipmr_err:
#endif
#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES) || \
    IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
        vrf_fib_rule(dev, AF_INET6,  false);
#endif

ipv6_err:
        vrf_fib_rule(dev, AF_INET,  false);

out_err:
        netdev_err(dev, "Failed to add FIB rules.\n");
        return err;
}

/* rtnl_link_ops setup callback: initialise a freshly allocated VRF
 * net_device. Starts from the generic Ethernet defaults and then installs
 * the VRF ops tables, a random MAC address, the feature/flag set and the
 * MTU limits used by VRF devices.
 */
static void vrf_setup(struct net_device *dev)
{
        ether_setup(dev);

        /* VRF specific operation tables */
        dev->netdev_ops = &vrf_netdev_ops;
        dev->l3mdev_ops = &vrf_l3mdev_ops;
        dev->ethtool_ops = &vrf_ethtool_ops;
        dev->needs_free_netdev = true;

        eth_hw_addr_random(dev);

        /* don't acquire vrf device's netif_tx_lock when transmitting */
        dev->lltx = true;

        /* don't allow vrf devices to change network namespaces. */
        dev->netns_immutable = true;

        /* NETIF_F_VLAN_CHALLENGED: it does not make sense for a VLAN to be
         * added to a vrf device; the remaining bits enable offloads.
         */
        dev->features |= NETIF_F_VLAN_CHALLENGED |
                         NETIF_F_GSO_SOFTWARE |
                         NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC |
                         NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;

        dev->hw_features = dev->features;
        dev->hw_enc_features = dev->features;

        /* IFF_NO_QUEUE: default to no qdisc; user can add if desired */
        dev->priv_flags |= IFF_NO_QUEUE |
                           IFF_NO_RX_HANDLER |
                           IFF_LIVE_ADDR_CHANGE;

        /* VRF devices do not care about MTU, but if the MTU is set
         * too low then the ipv4 and ipv6 protocols are disabled
         * which breaks networking.
         */
        dev->min_mtu = IPV6_MIN_MTU;
        dev->max_mtu = IP6_MAX_MTU;
        dev->mtu = dev->max_mtu;

        dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
}

/* rtnl_link_ops validate callback: check the optional IFLA_ADDRESS
 * attribute.
 *
 * Returns 0 when no address is supplied or the supplied one is acceptable,
 * -EINVAL when its length is not ETH_ALEN and -EADDRNOTAVAIL when it is
 * not a valid Ethernet address. An extack message is set on both errors.
 */
static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
                        struct netlink_ext_ack *extack)
{
        const struct nlattr *addr = tb[IFLA_ADDRESS];

        if (!addr)
                return 0;

        if (nla_len(addr) != ETH_ALEN) {
                NL_SET_ERR_MSG(extack, "Invalid hardware address");
                return -EINVAL;
        }

        if (!is_valid_ether_addr(nla_data(addr))) {
                NL_SET_ERR_MSG(extack, "Invalid hardware address");
                return -EADDRNOTAVAIL;
        }

        return 0;
}

/* rtnl_link_ops dellink callback: tear down a VRF device.
 *
 * Every lower device is released via vrf_del_slave(), the device is removed
 * from the VRF map, and finally the device is queued on @head for
 * unregistration.
 */
static void vrf_dellink(struct net_device *dev, struct list_head *head)
{
        struct net_device *port_dev;
        struct list_head *iter;

        netdev_for_each_lower_dev(dev, port_dev, iter)
                vrf_del_slave(dev, port_dev);

        vrf_map_unregister_dev(dev);

        unregister_netdevice_queue(dev, head);
}

/* rtnl_link_ops newlink callback: create a VRF device.
 *
 * Requires IFLA_VRF_TABLE with a table id other than RT_TABLE_UNSPEC.
 * The device is marked as an l3mdev master, registered, recorded in the
 * VRF map and, if the namespace still has add_fib_rules set, the l3mdev
 * FIB rules are installed. Any failure after registration unregisters the
 * device again.
 *
 * Returns 0 on success or a negative error code.
 */
static int vrf_newlink(struct net_device *dev,
                       struct rtnl_newlink_params *params,
                       struct netlink_ext_ack *extack)
{
        struct net_vrf *vrf = netdev_priv(dev);
        struct nlattr **data = params->data;
        struct netns_vrf *nn_vrf;
        int err;

        if (!data || !data[IFLA_VRF_TABLE]) {
                NL_SET_ERR_MSG(extack, "VRF table id is missing");
                return -EINVAL;
        }

        vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
        if (vrf->tb_id == RT_TABLE_UNSPEC) {
                NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
                                    "Invalid VRF table id");
                return -EINVAL;
        }

        dev->priv_flags |= IFF_L3MDEV_MASTER;

        err = register_netdevice(dev);
        if (err)
                return err;

        /* mapping between table_id and vrf;
         * note: such binding could not be done in the dev init function
         * because dev->ifindex id is not available yet.
         */
        vrf->ifindex = dev->ifindex;

        err = vrf_map_register_dev(dev, extack);
        if (err) {
                unregister_netdevice(dev);
                return err;
        }

        nn_vrf = net_generic(dev_net(dev), vrf_net_id);
        if (!nn_vrf->add_fib_rules)
                return 0;

        err = vrf_add_fib_rules(dev);
        if (err) {
                vrf_map_unregister_dev(dev);
                unregister_netdevice(dev);
                return err;
        }

        /* rules are in place; later VRF devices in this netns skip this */
        nn_vrf->add_fib_rules = false;

        return 0;
}

/* rtnl_link_ops get_size callback: netlink space needed for the link info
 * emitted by vrf_fillinfo(), i.e. a single u32 attribute.
 */
static size_t vrf_nl_getsize(const struct net_device *dev)
{
        return nla_total_size(sizeof(u32));  /* IFLA_VRF_TABLE */
}

/* rtnl_link_ops fill_info callback: report the device's table id as
 * IFLA_VRF_TABLE. Returns the result of nla_put_u32().
 */
static int vrf_fillinfo(struct sk_buff *skb,
                        const struct net_device *dev)
{
        struct net_vrf *vrf = netdev_priv(dev);

        return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
}

/* rtnl_link_ops get_slave_size callback: netlink space needed for the
 * per-slave info emitted by vrf_fill_slave_info(), i.e. a single u32
 * attribute.
 */
static size_t vrf_get_slave_size(const struct net_device *bond_dev,
                                 const struct net_device *slave_dev)
{
        return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
}

static int vrf_fill_slave_info(struct sk_buff *skb,
                               const struct net_device *vrf_dev,
                               const struct net_device *slave_dev)
{
        struct net_vrf *vrf = netdev_priv(vrf_dev);

        if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
                return -EMSGSIZE;

        return 0;
}

/* Netlink attribute policy for VRF link data: IFLA_VRF_TABLE is a u32. */
static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
        [IFLA_VRF_TABLE] = { .type = NLA_U32 },
};

/* rtnetlink link operations for devices of kind DRV_NAME; registered from
 * vrf_init_module(). Private data of each device is a struct net_vrf.
 */
static struct rtnl_link_ops vrf_link_ops __read_mostly = {
        .kind                = DRV_NAME,
        .priv_size        = sizeof(struct net_vrf),

        .get_size        = vrf_nl_getsize,
        .policy                = vrf_nl_policy,
        .validate        = vrf_validate,
        .fill_info        = vrf_fillinfo,

        .get_slave_size  = vrf_get_slave_size,
        .fill_slave_info = vrf_fill_slave_info,

        .newlink        = vrf_newlink,
        .dellink        = vrf_dellink,
        .setup                = vrf_setup,
        .maxtype        = IFLA_VRF_MAX,
};

/* netdevice notifier callback. Only NETDEV_UNREGISTER for a device that is
 * an l3 slave is acted upon: the device is detached from its master upper
 * device via vrf_del_slave() so the slave references are dropped.
 *
 * Always returns NOTIFY_DONE.
 */
static int vrf_device_event(struct notifier_block *unused,
                            unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        if (netif_is_l3_slave(dev)) {
                struct net_device *master = netdev_master_upper_dev_get(dev);

                vrf_del_slave(master, dev);
        }

        return NOTIFY_DONE;
}

/* Notifier block hooking vrf_device_event() into netdevice events;
 * registered from vrf_init_module().
 */
static struct notifier_block vrf_notifier_block __read_mostly = {
        .notifier_call = vrf_device_event,
};

/* Initialise a VRF map: set up its spinlock and hash table and start with
 * strict mode disabled. Always returns 0.
 */
static int vrf_map_init(struct vrf_map *vmap)
{
        spin_lock_init(&vmap->vmap_lock);
        hash_init(vmap->ht);

        vmap->strict_mode = false;

        return 0;
}

#ifdef CONFIG_SYSCTL
/* Return the current strict_mode setting of @vmap, read while holding the
 * map lock.
 */
static bool vrf_strict_mode(struct vrf_map *vmap)
{
        bool strict_mode;

        vrf_map_lock(vmap);
        strict_mode = vmap->strict_mode;
        vrf_map_unlock(vmap);

        return strict_mode;
}

/* Switch strict mode of @vmap to @new_mode under the map lock.
 *
 * Disabling always succeeds. Enabling is refused with -EBUSY while
 * vmap->shared_tables is non-zero, i.e. while some vrfs share one or more
 * tables. Asking for the mode already in effect is a no-op.
 *
 * Returns 0 on success or -EBUSY.
 */
static int vrf_strict_mode_change(struct vrf_map *vmap, bool new_mode)
{
        int res = 0;

        vrf_map_lock(vmap);

        if (vmap->strict_mode != new_mode) {
                if (new_mode && vmap->shared_tables) {
                        /* we cannot allow strict_mode because there are some
                         * vrfs that share one or more tables.
                         */
                        res = -EBUSY;
                } else {
                        /* either strict mode is being disabled, or no tables
                         * are shared among vrfs, so we can go back to 1:1
                         * association between a vrf with its table.
                         */
                        vmap->strict_mode = new_mode;
                }
        }

        vrf_map_unlock(vmap);

        return res;
}

/* sysctl handler for the "strict_mode" entry.
 *
 * The network namespace is taken from table->extra1. The value is parsed
 * or formatted through a temporary ctl_table that points at a local int
 * restricted to the range 0..1. On read the local is preloaded with the
 * current strict mode; on a successful write the parsed value is applied
 * with vrf_strict_mode_change().
 *
 * Returns 0 on success, the error from proc_dointvec_minmax(), or the
 * error from vrf_strict_mode_change().
 */
static int vrf_shared_table_handler(const struct ctl_table *table, int write,
                                    void *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net = (struct net *)table->extra1;
        struct vrf_map *vmap = netns_vrf_map(net);
        int mode_val = 0;
        struct ctl_table tmp = {
                .procname        = table->procname,
                .data                = &mode_val,
                .maxlen                = sizeof(int),
                .mode                = table->mode,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        };
        int ret;

        if (!write)
                mode_val = vrf_strict_mode(vmap);

        ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
        if (ret || !write)
                return ret;

        return vrf_strict_mode_change(vmap, (bool)mode_val);
}

/* Template for the per-netns sysctl table. vrf_netns_init_sysctl()
 * duplicates it and stores the owning network namespace in extra1 of the
 * copy.
 */
static const struct ctl_table vrf_table[] = {
        {
                .procname        = "strict_mode",
                .data                = NULL,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = vrf_shared_table_handler,
                /* set by the vrf_netns_init */
                .extra1                = NULL,
        },
};

/* Register the "net/vrf" sysctl table for @net.
 *
 * A private copy of vrf_table is allocated so that extra1 can carry the
 * namespace pointer; the resulting header is stored in nn_vrf->ctl_hdr.
 *
 * Returns 0 on success or -ENOMEM if the copy cannot be allocated or the
 * table cannot be registered (the copy is freed in that case).
 */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
        struct ctl_table *tbl = kmemdup(vrf_table, sizeof(vrf_table),
                                        GFP_KERNEL);

        if (!tbl)
                return -ENOMEM;

        /* init the extra1 parameter with the reference to current netns */
        tbl[0].extra1 = net;

        nn_vrf->ctl_hdr = register_net_sysctl_sz(net, "net/vrf", tbl,
                                                 ARRAY_SIZE(vrf_table));
        if (nn_vrf->ctl_hdr)
                return 0;

        kfree(tbl);

        return -ENOMEM;
}

/* Undo vrf_netns_init_sysctl() for @net: unregister the sysctl header and
 * free the table copy that was allocated for it.
 */
static void vrf_netns_exit_sysctl(struct net *net)
{
        struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);
        const struct ctl_table *table;

        /* fetch the table pointer from the header before unregistering it */
        table = nn_vrf->ctl_hdr->ctl_table_arg;
        unregister_net_sysctl_table(nn_vrf->ctl_hdr);
        kfree(table);
}
#else
/* Variant used when CONFIG_SYSCTL is not defined: nothing to register,
 * always succeeds.
 */
static int vrf_netns_init_sysctl(struct net *net, struct netns_vrf *nn_vrf)
{
        return 0;
}

/* Variant used when CONFIG_SYSCTL is not defined: nothing to tear down. */
static void vrf_netns_exit_sysctl(struct net *net)
{
}
#endif

/* Initialize per network namespace state: request FIB rule installation
 * for the first VRF device created in @net, set up the VRF map and
 * register the sysctl table. Returns the result of
 * vrf_netns_init_sysctl().
 */
static int __net_init vrf_netns_init(struct net *net)
{
        struct netns_vrf *nn_vrf = net_generic(net, vrf_net_id);

        nn_vrf->add_fib_rules = true;
        vrf_map_init(&nn_vrf->vmap);

        return vrf_netns_init_sysctl(net, nn_vrf);
}

/* Release per network namespace state: only the sysctl table needs to be
 * torn down here.
 */
static void __net_exit vrf_netns_exit(struct net *net)
{
        vrf_netns_exit_sysctl(net);
}

/* Per network namespace operations: init/exit hooks plus the id and size
 * of the struct netns_vrf area accessed through
 * net_generic(net, vrf_net_id).
 */
static struct pernet_operations vrf_net_ops __net_initdata = {
        .init = vrf_netns_init,
        .exit = vrf_netns_exit,
        .id   = &vrf_net_id,
        .size = sizeof(struct netns_vrf),
};

/* Module initialisation.
 *
 * Registers, in order: the netdevice notifier, the per-netns operations,
 * the l3mdev table-id lookup for L3MDEV_TYPE_VRF and the rtnetlink link
 * operations. If any step fails, the steps already completed are undone in
 * reverse order.
 *
 * Returns 0 on success or the negative error code of the failing step.
 */
static int __init vrf_init_module(void)
{
        int rc;

        /* Nothing has been set up yet, so a failure here needs no unwinding;
         * in particular the notifier must not be unregistered when its
         * registration did not succeed.
         */
        rc = register_netdevice_notifier(&vrf_notifier_block);
        if (rc < 0)
                return rc;

        rc = register_pernet_subsys(&vrf_net_ops);
        if (rc < 0)
                goto error;

        rc = l3mdev_table_lookup_register(L3MDEV_TYPE_VRF,
                                          vrf_ifindex_lookup_by_table_id);
        if (rc < 0)
                goto unreg_pernet;

        rc = rtnl_link_register(&vrf_link_ops);
        if (rc < 0)
                goto table_lookup_unreg;

        return 0;

table_lookup_unreg:
        l3mdev_table_lookup_unregister(L3MDEV_TYPE_VRF,
                                       vrf_ifindex_lookup_by_table_id);

unreg_pernet:
        unregister_pernet_subsys(&vrf_net_ops);

error:
        unregister_netdevice_notifier(&vrf_notifier_block);
        return rc;
}

/* Module entry point and metadata. */
module_init(vrf_init_module);
MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK(DRV_NAME);
MODULE_VERSION(DRV_VERSION);








































   14 


   14 












   14 










    1 



    1 



1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *        Skb ref helpers.
 *
 */

#ifndef _LINUX_SKBUFF_REF_H
#define _LINUX_SKBUFF_REF_H

#include <linux/skbuff.h>

/**
 * __skb_frag_ref - take an additional reference on a paged fragment.
 * @frag: the paged fragment
 *
 * Takes an additional reference on the paged fragment @frag by calling
 * get_page() on the fragment's page.
 */
static inline void __skb_frag_ref(skb_frag_t *frag)
{
        get_page(skb_frag_page(frag));
}

/**
 * skb_frag_ref - take an additional reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset.
 *
 * Takes an additional reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_ref(struct sk_buff *skb, int f)
{
        __skb_frag_ref(&skb_shinfo(skb)->frags[f]);
}

bool napi_pp_put_page(netmem_ref netmem);

/**
 * skb_page_unref - release a reference on a netmem-backed page.
 * @netmem: the memory reference to release
 * @recycle: try to hand the page to napi_pp_put_page() first
 *
 * With CONFIG_PAGE_POOL, if @recycle is set and napi_pp_put_page() reports
 * that it took the page, nothing more is done. In every other case the
 * reference is dropped with put_page().
 */
static inline void skb_page_unref(netmem_ref netmem, bool recycle)
{
#ifdef CONFIG_PAGE_POOL
        if (recycle && napi_pp_put_page(netmem))
                return;
#endif
        put_page(netmem_to_page(netmem));
}

/**
 * __skb_frag_unref - release a reference on a paged fragment.
 * @frag: the paged fragment
 * @recycle: recycle the page if allocated via page_pool
 *
 * Releases a reference on the paged fragment @frag
 * or recycles the page via the page_pool API. The work is delegated to
 * skb_page_unref() on the fragment's netmem reference.
 */
static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
{
        skb_page_unref(skb_frag_netmem(frag), recycle);
}

/**
 * skb_frag_unref - release a reference on a paged fragment of an skb.
 * @skb: the buffer
 * @f: the fragment offset
 *
 * Releases a reference on the @f'th paged fragment of @skb.
 */
static inline void skb_frag_unref(struct sk_buff *skb, int f)
{
        struct skb_shared_info *shinfo = skb_shinfo(skb);

        if (!skb_zcopy_managed(skb))
                __skb_frag_unref(&shinfo->frags[f], skb->pp_recycle);
}

#endif        /* _LINUX_SKBUFF_REF_H */






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        Linux NET3:        Internet Group Management Protocol  [IGMP]
 *
 *        This code implements the IGMP protocol as defined in RFC1112. There has
 *        been a further revision of this protocol since which is now supported.
 *
 *        If you have trouble with this module be careful what gcc you have used,
 *        the older version didn't come out right using gcc 2.5.8, the newer one
 *        seems to fall out with gcc 2.6.2.
 *
 *        Authors:
 *                Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 *        Fixes:
 *
 *                Alan Cox        :        Added lots of __inline__ to optimise
 *                                        the memory usage of all the tiny little
 *                                        functions.
 *                Alan Cox        :        Dumped the header building experiment.
 *                Alan Cox        :        Minor tweaks ready for multicast routing
 *                                        and extended IGMP protocol.
 *                Alan Cox        :        Removed a load of inline directives. Gcc 2.5.8
 *                                        writes utterly bogus code otherwise (sigh)
 *                                        fixed IGMP loopback to behave in the manner
 *                                        desired by mrouted, fixed the fact it has been
 *                                        broken since 1.3.6 and cleaned up a few minor
 *                                        points.
 *
 *                Chih-Jen Chang        :        Tried to revise IGMP to Version 2
 *                Tsu-Sheng Tsao                E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
 *                                        The enhancements are mainly based on Steve Deering's
 *                                         ipmulti-3.5 source code.
 *                Chih-Jen Chang        :        Added the igmp_get_mrouter_info and
 *                Tsu-Sheng Tsao                igmp_set_mrouter_info to keep track of
 *                                        the mrouted version on that device.
 *                Chih-Jen Chang        :        Added the max_resp_time parameter to
 *                Tsu-Sheng Tsao                igmp_heard_query(). Using this parameter
 *                                        to identify the multicast router version
 *                                        and do what the IGMP version 2 specified.
 *                Chih-Jen Chang        :        Added a timer to revert to IGMP V2 router
 *                Tsu-Sheng Tsao                if the specified time expired.
 *                Alan Cox        :        Stop IGMP from 0.0.0.0 being accepted.
 *                Alan Cox        :        Use GFP_ATOMIC in the right places.
 *                Christian Daudt :        igmp timer wasn't set for local group
 *                                        memberships but was being deleted,
 *                                        which caused a "del_timer() called
 *                                        from %p with timer not initialized\n"
 *                                        message (960131).
 *                Christian Daudt :        removed del_timer from
 *                                        igmp_timer_expire function (960205).
 *             Christian Daudt :       igmp_heard_report now only calls
 *                                     igmp_timer_expire if tm->running is
 *                                     true (960216).
 *                Malcolm Beattie :        ttl comparison wrong in igmp_rcv made
 *                                        igmp_heard_query never trigger. Expiry
 *                                        miscalculation fixed in igmp_heard_query
 *                                        and random() made to return unsigned to
 *                                        prevent negative expiry times.
 *                Alexey Kuznetsov:        Wrong group leaving behaviour, backport
 *                                        fix from pending 2.1.x patches.
 *                Alan Cox:                Forget to enable FDDI support earlier.
 *                Alexey Kuznetsov:        Fixed leaving groups on device down.
 *                Alexey Kuznetsov:        Accordance to igmp-v2-06 draft.
 *                David L Stevens:        IGMPv3 support, with help from
 *                                        Vinay Kulkarni
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include "igmp_internal.h"
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
#include <linux/pkt_sched.h>
#include <linux/byteorder/generic.h>

#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/addrconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/checksum.h>
#include <net/inet_common.h>
#include <linux/netfilter_ipv4.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
#ifdef CONFIG_PROC_FS
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#endif

#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */

#define IGMP_QUERY_INTERVAL                        (125*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL                (10*HZ)

#define IGMP_INITIAL_REPORT_DELAY                (1)

/* IGMP_INITIAL_REPORT_DELAY is not from the IGMP specs!
 * The IGMP specs require membership to be reported immediately after
 * joining a group, but we delay the first report by a
 * small interval. It seems more natural and still does not
 * contradict the specs, provided this delay is small enough.
 */

/*
 * IGMP_V1_SEEN() / IGMP_V2_SEEN(): true when @in_dev has to act as an
 * IGMPv1 (resp. IGMPv2) host.  That is the case when FORCE_IGMP_VERSION,
 * read either through IPV4_DEVCONF_ALL_RO() for the device's netns or
 * through IN_DEV_CONF_GET() for the device itself, equals that version,
 * or when the matching mr_vN_seen deadline is non-zero and still lies
 * in the future relative to jiffies.
 *
 * The argument is parenthesized at every use so that any pointer-valued
 * expression may be passed, not only a plain identifier.
 */
#define IGMP_V1_SEEN(in_dev) \
        (IPV4_DEVCONF_ALL_RO(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 1 || \
         IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
         ((in_dev)->mr_v1_seen && \
          time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \
        (IPV4_DEVCONF_ALL_RO(dev_net((in_dev)->dev), FORCE_IGMP_VERSION) == 2 || \
         IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
         ((in_dev)->mr_v2_seen && \
          time_before(jiffies, (in_dev)->mr_v2_seen)))

static int unsolicited_report_interval(struct in_device *in_dev)
{
        int interval_ms, interval_jiffies;

        if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
                interval_ms = IN_DEV_CONF_GET(
                        in_dev,
                        IGMPV2_UNSOLICITED_REPORT_INTERVAL);
        else /* v3 */
                interval_ms = IN_DEV_CONF_GET(
                        in_dev,
                        IGMPV3_UNSOLICITED_REPORT_INTERVAL);

        interval_jiffies = msecs_to_jiffies(interval_ms);

        /* _timer functions can't handle a delay of 0 jiffies so ensure
         *  we always return a positive value.
         */
        if (interval_jiffies <= 0)
                interval_jiffies = 1;
        return interval_jiffies;
}

static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
                              gfp_t gfp);
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
static void sf_markstate(struct ip_mc_list *pmc);
#endif
static void ip_mc_clear_src(struct ip_mc_list *pmc);
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
                         int sfcount, __be32 *psfsrc, int delta);

/*
 * Drop one reference on @im.  When the last reference goes away, the
 * reference held on im->interface is released and @im itself is freed
 * through kfree_rcu().
 */
static void ip_ma_put(struct ip_mc_list *im)
{
        if (!refcount_dec_and_test(&im->refcnt))
                return;

        in_dev_put(im->interface);
        kfree_rcu(im, rcu);
}

/*
 * Walk @pmc over every entry of in_dev->mc_list, following ->next_rcu.
 * Pointers are loaded with rcu_dereference(); the users visible in this
 * file run the loop inside rcu_read_lock()/rcu_read_unlock().
 */
#define for_each_pmc_rcu(in_dev, pmc)                                \
        for (pmc = rcu_dereference(in_dev->mc_list);                \
             pmc != NULL;                                        \
             pmc = rcu_dereference(pmc->next_rcu))

/*
 * Same walk as for_each_pmc_rcu(), but pointers are loaded with
 * rtnl_dereference() (presumably for callers that hold RTNL instead of
 * the RCU read lock -- confirm against the callers).
 */
#define for_each_pmc_rtnl(in_dev, pmc)                                \
        for (pmc = rtnl_dereference(in_dev->mc_list);                \
             pmc != NULL;                                        \
             pmc = rtnl_dereference(pmc->next_rcu))

/*
 * Free every node of the singly linked source-filter list starting at
 * @psf (linked through ->sf_next).  A NULL list is a no-op.
 */
static void ip_sf_list_clear_all(struct ip_sf_list *psf)
{
        while (psf) {
                struct ip_sf_list *doomed = psf;

                /* step forward before the current node is released */
                psf = psf->sf_next;
                kfree(doomed);
        }
}

#ifdef CONFIG_IP_MULTICAST

/*
 *        Timer management
 */

/*
 * Cancel the report timer of @im and reset its reporting state
 * (tm_running, reporter and unsolicit_count are all cleared).
 *
 * Runs under im->lock with bottom halves disabled.
 */
static void igmp_stop_timer(struct ip_mc_list *im)
{
        spin_lock_bh(&im->lock);
        /* igmp_start_timer() takes a reference on @im when it arms the
         * timer; give it back if a timer was actually removed here.
         */
        if (timer_delete(&im->timer))
                refcount_dec(&im->refcnt);
        im->tm_running = 0;
        im->reporter = 0;
        im->unsolicit_count = 0;
        spin_unlock_bh(&im->lock);
}

/*
 * Arm the report timer of @im to expire after a random delay: a random
 * value below @max_delay, plus 2 jiffies.
 *
 * Must be called with im->lock held.
 */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
        int tv = get_random_u32_below(max_delay);

        im->tm_running = 1;
        /* The armed timer holds its own reference on @im; nothing is
         * armed if the refcount has already dropped to zero.
         */
        if (refcount_inc_not_zero(&im->refcnt)) {
                /* A non-zero return from mod_timer() means the timer was
                 * already pending (per the kernel timer API), so the
                 * reference taken just above is surplus: put it back.
                 */
                if (mod_timer(&im->timer, jiffies + tv + 2))
                        ip_ma_put(im);
        }
}

/*
 * Schedule in_dev->mr_gq_timer (presumably the general-query response
 * timer -- confirm against the timer's handler) at a random delay below
 * in_dev->mr_maxdelay, plus 2 jiffies.
 */
static void igmp_gq_start_timer(struct in_device *in_dev)
{
        int tv = get_random_u32_below(in_dev->mr_maxdelay);
        unsigned long exp = jiffies + tv + 2;

        /* A timer is already running and expires no later than the newly
         * drawn expiry: leave it alone.
         */
        if (in_dev->mr_gq_running &&
            time_after_eq(exp, (in_dev->mr_gq_timer).expires))
                return;

        in_dev->mr_gq_running = 1;
        /* mod_timer() returning 0 means the timer was not pending before
         * (per the kernel timer API); a newly armed timer takes a
         * reference on @in_dev.
         */
        if (!mod_timer(&in_dev->mr_gq_timer, exp))
                in_dev_hold(in_dev);
}

/*
 * (Re)arm in_dev->mr_ifc_timer to expire after a random value below
 * @delay, plus 2 jiffies.
 */
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
        int tv = get_random_u32_below(delay);

        /* mod_timer() returning 0 means the timer was not pending before
         * (per the kernel timer API); a newly armed timer takes a
         * reference on @in_dev.
         */
        if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
                in_dev_hold(in_dev);
}

/*
 * Ensure the report timer of @im is armed, bounding its delay by
 * @max_delay.
 *
 * unsolicit_count is reset.  A pending timer that would already fire
 * in less than @max_delay jiffies keeps its expiry; otherwise a new
 * random expiry is drawn through igmp_start_timer().
 *
 * Runs under im->lock with bottom halves disabled.
 */
static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
{
        spin_lock_bh(&im->lock);
        im->unsolicit_count = 0;
        if (timer_delete(&im->timer)) {
                /* A timer was pending.  If it was due soon enough, put it
                 * straight back with its old expiry; the reference it
                 * held on @im stays in place.
                 */
                if ((long)(im->timer.expires-jiffies) < max_delay) {
                        add_timer(&im->timer);
                        im->tm_running = 1;
                        spin_unlock_bh(&im->lock);
                        return;
                }
                /* Old expiry is too far away: release the removed timer's
                 * reference; igmp_start_timer() below takes a fresh one.
                 */
                refcount_dec(&im->refcnt);
        }
        igmp_start_timer(im, max_delay);
        spin_unlock_bh(&im->lock);
}


/*
 *        Send an IGMP report.
 */

#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)


/*
 * Decide whether source @psf of group @pmc belongs in a group record of
 * the given IGMPv3 record @type.
 *
 * @gdeleted: non-zero when the group itself is being reported as deleted
 * @sdeleted: non-zero when @psf is being reported as a deleted source
 *
 * Returns non-zero to include the source, 0 to leave it out.  Unknown
 * record types yield 0.
 */
static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
        int gdeleted, int sdeleted)
{
        switch (type) {
        case IGMPV3_MODE_IS_INCLUDE:
        case IGMPV3_MODE_IS_EXCLUDE:
                /* current-state records never list deleted groups/sources */
                if (gdeleted || sdeleted)
                        return 0;
                /* when pmc->gsquery is set, only sources flagged with
                 * sf_gsresp are considered
                 */
                if (!(pmc->gsquery && !psf->sf_gsresp)) {
                        if (pmc->sfmode == MCAST_INCLUDE)
                                return 1;
                        /* don't include if this source is excluded
                         * in all filters
                         */
                        if (psf->sf_count[MCAST_INCLUDE])
                                return type == IGMPV3_MODE_IS_INCLUDE;
                        return pmc->sfcount[MCAST_EXCLUDE] ==
                                psf->sf_count[MCAST_EXCLUDE];
                }
                return 0;
        case IGMPV3_CHANGE_TO_INCLUDE:
                if (gdeleted || sdeleted)
                        return 0;
                /* listed iff the source has a non-zero INCLUDE count */
                return psf->sf_count[MCAST_INCLUDE] != 0;
        case IGMPV3_CHANGE_TO_EXCLUDE:
                if (gdeleted || sdeleted)
                        return 0;
                /* nothing to list when the group has no EXCLUDE count or
                 * the source still has an INCLUDE count
                 */
                if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
                    psf->sf_count[MCAST_INCLUDE])
                        return 0;
                /* listed only if the source's EXCLUDE count equals the
                 * group's EXCLUDE count
                 */
                return pmc->sfcount[MCAST_EXCLUDE] ==
                        psf->sf_count[MCAST_EXCLUDE];
        case IGMPV3_ALLOW_NEW_SOURCES:
                /* requires a live group and a non-zero sf_crcount */
                if (gdeleted || !psf->sf_crcount)
                        return 0;
                /* INCLUDE mode: live sources; otherwise: deleted sources */
                return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
        case IGMPV3_BLOCK_OLD_SOURCES:
                /* INCLUDE mode: every source of a deleted group, or a
                 * deleted source whose sf_crcount is non-zero
                 */
                if (pmc->sfmode == MCAST_INCLUDE)
                        return gdeleted || (psf->sf_crcount && sdeleted);
                /* otherwise: live source of a live group with a non-zero
                 * sf_crcount
                 */
                return psf->sf_crcount && !gdeleted && !sdeleted;
        }
        return 0;
}

/*
 * Count the entries of pmc->sources that is_in() selects for a group
 * record of @type with the given @gdeleted / @sdeleted flags.
 */
static int
igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
{
        struct ip_sf_list *src = pmc->sources;
        int matched = 0;

        while (src) {
                if (is_in(pmc, src, type, gdeleted, sdeleted))
                        matched++;
                src = src->sf_next;
        }
        return matched;
}

/*
 * Source address selection per RFC 3376 section 4.2.13.
 *
 * Returns fl4->saddr when it equals the ifa_local of one of the
 * addresses configured on @dev; otherwise (including when @dev has no
 * in_device) returns INADDR_ANY in network byte order.
 *
 * Uses the _rcu accessors; the caller visible in this file wraps the
 * call in rcu_read_lock()/rcu_read_unlock().
 */
static __be32 igmpv3_get_srcaddr(struct net_device *dev,
                                 const struct flowi4 *fl4)
{
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        const struct in_ifaddr *ifa;

        if (in_dev) {
                in_dev_for_each_ifa_rcu(ifa, in_dev) {
                        if (ifa->ifa_local == fl4->saddr)
                                return fl4->saddr;
                }
        }

        return htonl(INADDR_ANY);
}

/*
 * Allocate and pre-fill an skb for a new IGMPv3 membership report to be
 * sent out of @dev.
 *
 * The skb comes back routed to IGMPV3_ALL_MCR, with an IPv4 header that
 * carries a 4-byte option (IPOPT_RA) and an igmpv3_report header whose
 * group-record count is zero.  tot_len and the IGMP checksum are left
 * at 0 here.
 *
 * @dev: output device
 * @mtu: MTU to size the packet for
 *
 * Returns the new skb, or NULL if allocation or the route lookup fails.
 */
static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
{
        struct sk_buff *skb;
        struct rtable *rt;
        struct iphdr *pip;
        struct igmpv3_report *pig;
        struct net *net = dev_net(dev);
        struct flowi4 fl4;
        int hlen = LL_RESERVED_SPACE(dev);
        int tlen = dev->needed_tailroom;
        unsigned int size;

        /* Try for a full-size buffer first; on allocation failure keep
         * halving the request and give up once it drops below 256 bytes.
         */
        size = min(mtu, IP_MAX_MTU);
        while (1) {
                skb = alloc_skb(size + hlen + tlen,
                                GFP_ATOMIC | __GFP_NOWARN);
                if (skb)
                        break;
                size >>= 1;
                if (size < 256)
                        return NULL;
        }
        skb->priority = TC_PRIO_CONTROL;

        /* route lookup towards IGMPV3_ALL_MCR, bound to @dev's ifindex */
        rt = ip_route_output_ports(net, &fl4, NULL, IGMPV3_ALL_MCR, 0,
                                   0, 0,
                                   IPPROTO_IGMP, 0, dev->ifindex);
        if (IS_ERR(rt)) {
                kfree_skb(skb);
                return NULL;
        }

        skb_dst_set(skb, &rt->dst);
        skb->dev = dev;

        /* headroom for the link-layer header, tailroom for the device */
        skb_reserve(skb, hlen);
        skb_tailroom_reserve(skb, mtu, tlen);

        /* IPv4 header followed by 4 bytes of IP options */
        skb_reset_network_header(skb);
        pip = ip_hdr(skb);
        skb_put(skb, sizeof(struct iphdr) + 4);

        pip->version  = 4;
        pip->ihl      = (sizeof(struct iphdr)+4)>>2;
        pip->tos      = 0xc0;
        pip->frag_off = htons(IP_DF);
        pip->ttl      = 1;
        pip->daddr    = fl4.daddr;

        /* igmpv3_get_srcaddr() uses RCU-protected accessors */
        rcu_read_lock();
        pip->saddr    = igmpv3_get_srcaddr(dev, &fl4);
        rcu_read_unlock();

        pip->protocol = IPPROTO_IGMP;
        pip->tot_len  = 0;        /* filled in later */
        ip_select_ident(net, skb, NULL);
        /* the option bytes: type IPOPT_RA, length 4, two zero bytes */
        ((u8 *)&pip[1])[0] = IPOPT_RA;
        ((u8 *)&pip[1])[1] = 4;
        ((u8 *)&pip[1])[2] = 0;
        ((u8 *)&pip[1])[3] = 0;

        /* IGMPv3 report header starts right after the IP header+options */
        skb->transport_header = skb->network_header + sizeof(struct iphdr) + 4;
        skb_put(skb, sizeof(*pig));
        pig = igmpv3_report_hdr(skb);
        pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
        pig->resv1 = 0;
        pig->csum = 0;
        pig->resv2 = 0;
        pig->ngrec = 0;
        return skb;
}

/*
 * Finish and transmit a report built by igmpv3_newpack()/add_grec().
 *
 * The IGMP checksum is computed over everything from the transport
 * header to the current tail of @skb, then the packet is handed to
 * ip_local_out().  Returns ip_local_out()'s result.
 */
static int igmpv3_sendpack(struct sk_buff *skb)
{
        struct igmphdr *hdr = igmp_hdr(skb);
        const int len = skb_tail_pointer(skb) - skb_transport_header(skb);
        struct net *net = dev_net(skb_dst(skb)->dev);

        hdr->csum = ip_compute_csum(hdr, len);

        return ip_local_out(net, skb->sk, skb);
}

/*
 * Size in bytes of a group record of @type for @pmc: the fixed
 * igmpv3_grec header plus 4 bytes for each source that igmp_scount()
 * reports for the given @gdel / @sdel flags.
 */
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
{
        int nsrcs = igmp_scount(pmc, type, gdel, sdel);

        return sizeof(struct igmpv3_grec) + 4*nsrcs;
}

/*
 * Append an empty group-record header of @type for @pmc to the report
 * in @skb, starting a new report with igmpv3_newpack() when @skb is
 * NULL.
 *
 * The new record has zero sources and zero aux words; the report's
 * group-record count is incremented.  On success *@ppgr points at the
 * new record so the caller can fill in the source count later.
 *
 * Returns the skb holding the record, or NULL if a new packet was
 * needed and could not be created (*@ppgr is left untouched then).
 */
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
        int type, struct igmpv3_grec **ppgr, unsigned int mtu)
{
        struct net_device *dev = pmc->interface->dev;
        struct igmpv3_report *report;
        struct igmpv3_grec *rec;

        if (!skb)
                skb = igmpv3_newpack(dev, mtu);
        if (!skb)
                return NULL;

        rec = skb_put(skb, sizeof(struct igmpv3_grec));
        rec->grec_mca = pmc->multiaddr;
        rec->grec_type = type;
        rec->grec_auxwords = 0;
        rec->grec_nsrcs = 0;

        /* ngrec is kept in network byte order */
        report = igmpv3_report_hdr(skb);
        report->ngrec = htons(ntohs(report->ngrec) + 1);

        *ppgr = rec;
        return skb;
}

#define AVAILABLE(skb)        ((skb) ? skb_availroom(skb) : 0)

/*
 * Append a group record of @type for @pmc to the IGMPv3 report being
 * built in @skb.  Full packets are sent with igmpv3_sendpack() and
 * replaced by new ones as needed.
 *
 * @skb:      report under construction, or NULL if none exists yet
 * @pmc:      multicast group entry to report
 * @type:     IGMPv3 group record type
 * @gdeleted: non-zero when @pmc is a deleted group
 * @sdeleted: non-zero to walk pmc->tomb instead of pmc->sources
 *
 * Nothing is added for IGMP_ALL_HOSTS, for link-local multicast groups
 * when the igmp_llm_reports sysctl is off, or when the device MTU is
 * below IPV4_MIN_MTU.
 *
 * Returns the packet still being filled (possibly the @skb passed in),
 * or NULL when there is none or a new packet could not be created.
 * The callers visible in this file hold pmc->lock or the in_device's
 * mc_tomb_lock around the call.
 */
static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
        int type, int gdeleted, int sdeleted)
{
        struct net_device *dev = pmc->interface->dev;
        struct net *net = dev_net(dev);
        struct igmpv3_report *pih;
        struct igmpv3_grec *pgr = NULL;
        struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
        int scount, stotal, first, isquery, truncate;
        unsigned int mtu;

        if (pmc->multiaddr == IGMP_ALL_HOSTS)
                return skb;
        if (ipv4_is_local_multicast(pmc->multiaddr) &&
            !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
                return skb;

        mtu = READ_ONCE(dev->mtu);
        if (mtu < IPV4_MIN_MTU)
                return skb;

        /* isquery: current-state record types; truncate: record types
         * whose source list is cut short rather than continued in
         * another packet
         */
        isquery = type == IGMPV3_MODE_IS_INCLUDE ||
                  type == IGMPV3_MODE_IS_EXCLUDE;
        truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
                    type == IGMPV3_CHANGE_TO_EXCLUDE;

        /* scount: sources in the current record; stotal: sources written
         * by this call across all packets
         */
        stotal = scount = 0;

        psf_list = sdeleted ? &pmc->tomb : &pmc->sources;

        if (!*psf_list)
                goto empty_source;

        pih = skb ? igmpv3_report_hdr(skb) : NULL;

        /* EX and TO_EX get a fresh packet, if needed */
        if (truncate) {
                if (pih && pih->ngrec &&
                    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
                        if (skb)
                                igmpv3_sendpack(skb);
                        skb = igmpv3_newpack(dev, mtu);
                }
        }
        /* first: the next source written needs a record header first */
        first = 1;
        psf_prev = NULL;
        for (psf = *psf_list; psf; psf = psf_next) {
                __be32 *psrc;

                /* saved up front: psf may be freed further down */
                psf_next = psf->sf_next;

                if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
                        psf_prev = psf;
                        continue;
                }

                /* Based on RFC3376 5.1. Should not send source-list change
                 * records when there is a filter mode change.
                 */
                if (((gdeleted && pmc->sfmode == MCAST_EXCLUDE) ||
                     (!gdeleted && pmc->crcount)) &&
                    (type == IGMPV3_ALLOW_NEW_SOURCES ||
                     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount)
                        goto decrease_sf_crcount;

                /* clear marks on query responses */
                if (isquery)
                        psf->sf_gsresp = 0;

                /* not enough room for one more source (plus a record
                 * header if this would be the record's first source)
                 */
                if (AVAILABLE(skb) < sizeof(__be32) +
                    first*sizeof(struct igmpv3_grec)) {
                        if (truncate && !first)
                                break;         /* truncate these */
                        /* close the current record, send the packet and
                         * continue in a new one
                         */
                        if (pgr)
                                pgr->grec_nsrcs = htons(scount);
                        if (skb)
                                igmpv3_sendpack(skb);
                        skb = igmpv3_newpack(dev, mtu);
                        first = 1;
                        scount = 0;
                }
                if (first) {
                        skb = add_grhead(skb, pmc, type, &pgr, mtu);
                        first = 0;
                }
                if (!skb)
                        return NULL;
                psrc = skb_put(skb, sizeof(__be32));
                *psrc = psf->sf_inaddr;
                scount++; stotal++;
                if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
                     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
decrease_sf_crcount:
                        /* one change report for this source is accounted
                         * for; a deleted source (or a source of a deleted
                         * group) is unlinked and freed once its count
                         * reaches zero
                         */
                        psf->sf_crcount--;
                        if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
                                if (psf_prev)
                                        psf_prev->sf_next = psf->sf_next;
                                else
                                        *psf_list = psf->sf_next;
                                kfree(psf);
                                continue;
                        }
                }
                psf_prev = psf;
        }

empty_source:
        if (!stotal) {
                /* ALLOW/BLOCK records are never sent with no sources */
                if (type == IGMPV3_ALLOW_NEW_SOURCES ||
                    type == IGMPV3_BLOCK_OLD_SOURCES)
                        return skb;
                /* other record types still get a header-only record when
                 * pmc->crcount is non-zero or for a current-state record
                 */
                if (pmc->crcount || isquery) {
                        /* make sure we have room for group header */
                        if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)) {
                                igmpv3_sendpack(skb);
                                skb = NULL; /* add_grhead will get a new one */
                        }
                        skb = add_grhead(skb, pmc, type, &pgr, mtu);
                }
        }
        /* record the final source count of the last record written */
        if (pgr)
                pgr->grec_nsrcs = htons(scount);

        if (isquery)
                pmc->gsquery = 0;        /* clear query state on report */
        return skb;
}

/*
 * Send an IGMPv3 current-state report.
 *
 * With @pmc set, only that group is reported.  With @pmc NULL, every
 * group on in_dev->mc_list is reported, skipping IGMP_ALL_HOSTS and,
 * when the igmp_llm_reports sysctl is off, link-local multicast groups.
 *
 * Each group is reported as MODE_IS_EXCLUDE when its EXCLUDE filter
 * count is non-zero and as MODE_IS_INCLUDE otherwise; the record is
 * built under pmc->lock.
 *
 * Returns 0 when no packet is left to send, else the result of
 * igmpv3_sendpack() for the last packet.
 */
static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
{
        struct net *net = dev_net(in_dev->dev);
        struct sk_buff *skb = NULL;
        int type;

        if (pmc) {
                spin_lock_bh(&pmc->lock);
                type = pmc->sfcount[MCAST_EXCLUDE] ? IGMPV3_MODE_IS_EXCLUDE :
                                                     IGMPV3_MODE_IS_INCLUDE;
                skb = add_grec(skb, pmc, type, 0, 0);
                spin_unlock_bh(&pmc->lock);
        } else {
                rcu_read_lock();
                for_each_pmc_rcu(in_dev, pmc) {
                        if (pmc->multiaddr == IGMP_ALL_HOSTS)
                                continue;
                        if (ipv4_is_local_multicast(pmc->multiaddr) &&
                            !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
                                continue;

                        spin_lock_bh(&pmc->lock);
                        type = pmc->sfcount[MCAST_EXCLUDE] ?
                                IGMPV3_MODE_IS_EXCLUDE :
                                IGMPV3_MODE_IS_INCLUDE;
                        skb = add_grec(skb, pmc, type, 0, 0);
                        spin_unlock_bh(&pmc->lock);
                }
                rcu_read_unlock();
        }

        return skb ? igmpv3_sendpack(skb) : 0;
}

/*
 * Unlink and free every record of the source filter list at *@ppsf
 * whose sf_crcount is zero; records with a non-zero count are kept in
 * their original order.
 */
static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
{
        struct ip_sf_list **link = ppsf;
        struct ip_sf_list *cur;

        while ((cur = *link) != NULL) {
                if (cur->sf_crcount) {
                        /* keep it: advance to this node's next-slot */
                        link = &cur->sf_next;
                        continue;
                }
                /* splice the node out, then release it */
                *link = cur->sf_next;
                kfree(cur);
        }
}

/*
 * Free @pmc together with both of its source filter lists
 * (pmc->sources and pmc->tomb).  References held by @pmc, such as the
 * one on its interface, are not dropped here.
 */
static void kfree_pmc(struct ip_mc_list *pmc)
{
        struct ip_sf_list *active = pmc->sources;
        struct ip_sf_list *deleted = pmc->tomb;

        ip_sf_list_clear_all(active);
        ip_sf_list_clear_all(deleted);
        kfree(pmc);
}

/*
 * Build and send the pending IGMPv3 change records for @in_dev.
 *
 * Two passes share one packet chain:
 *  1. deleted groups on in_dev->mc_tomb, walked under mc_tomb_lock;
 *     entries with nothing left to report are unlinked and freed;
 *  2. live groups on in_dev->mc_list, walked under RCU with each
 *     entry's pmc->lock held.
 *
 * add_grec() sends packets as they fill up; whatever packet is left at
 * the end is sent here.
 */
static void igmpv3_send_cr(struct in_device *in_dev)
{
        struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
        struct sk_buff *skb = NULL;
        int type, dtype;

        rcu_read_lock();
        spin_lock_bh(&in_dev->mc_tomb_lock);

        /* deleted MCA's */
        pmc_prev = NULL;
        for (pmc = in_dev->mc_tomb; pmc; pmc = pmc_next) {
                /* saved up front: pmc may be freed further down */
                pmc_next = pmc->next;
                if (pmc->sfmode == MCAST_INCLUDE) {
                        /* BLOCK records for both the live (sdeleted=0) and
                         * the deleted (sdeleted=1) source lists
                         */
                        type = IGMPV3_BLOCK_OLD_SOURCES;
                        dtype = IGMPV3_BLOCK_OLD_SOURCES;
                        skb = add_grec(skb, pmc, type, 1, 0);
                        skb = add_grec(skb, pmc, dtype, 1, 1);
                }
                if (pmc->crcount) {
                        if (pmc->sfmode == MCAST_EXCLUDE) {
                                type = IGMPV3_CHANGE_TO_INCLUDE;
                                skb = add_grec(skb, pmc, type, 1, 0);
                        }
                        pmc->crcount--;
                        /* last report for this group: drop sources whose
                         * sf_crcount is zero
                         */
                        if (pmc->crcount == 0) {
                                igmpv3_clear_zeros(&pmc->tomb);
                                igmpv3_clear_zeros(&pmc->sources);
                        }
                }
                /* nothing left to report: unlink the entry, release its
                 * reference on the interface and free it
                 */
                if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
                        if (pmc_prev)
                                pmc_prev->next = pmc_next;
                        else
                                in_dev->mc_tomb = pmc_next;
                        in_dev_put(pmc->interface);
                        kfree_pmc(pmc);
                } else
                        pmc_prev = pmc;
        }
        spin_unlock_bh(&in_dev->mc_tomb_lock);

        /* change recs */
        for_each_pmc_rcu(in_dev, pmc) {
                spin_lock_bh(&pmc->lock);
                /* type is used for the live source list, dtype for the
                 * deleted source list; which one is ALLOW and which is
                 * BLOCK depends on whether the EXCLUDE count is non-zero
                 */
                if (pmc->sfcount[MCAST_EXCLUDE]) {
                        type = IGMPV3_BLOCK_OLD_SOURCES;
                        dtype = IGMPV3_ALLOW_NEW_SOURCES;
                } else {
                        type = IGMPV3_ALLOW_NEW_SOURCES;
                        dtype = IGMPV3_BLOCK_OLD_SOURCES;
                }
                skb = add_grec(skb, pmc, type, 0, 0);
                skb = add_grec(skb, pmc, dtype, 0, 1);        /* deleted sources */

                /* filter mode changes */
                if (pmc->crcount) {
                        if (pmc->sfmode == MCAST_EXCLUDE)
                                type = IGMPV3_CHANGE_TO_EXCLUDE;
                        else
                                type = IGMPV3_CHANGE_TO_INCLUDE;
                        skb = add_grec(skb, pmc, type, 0, 0);
                        pmc->crcount--;
                }
                spin_unlock_bh(&pmc->lock);
        }
        rcu_read_unlock();

        /* flush the last, partially filled packet (if any) */
        if (!skb)
                return;
        (void) igmpv3_sendpack(skb);
}

static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
        int type)
{
        struct sk_buff *skb;
        struct iphdr *iph;
        struct igmphdr *ih;
        struct rtable *rt;
        struct net_device *dev = in_dev->dev;
        struct net *net = dev_net(dev);
        __be32        group = pmc ? pmc->multiaddr : 0;
        struct flowi4 fl4;
        __be32        dst;
        int hlen, tlen;

        if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
                return igmpv3_send_report(in_dev, pmc);

        if (ipv4_is_local_multicast(group) &&
            !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
                return 0;

        if (type == IGMP_HOST_LEAVE_MESSAGE)
                dst = IGMP_ALL_ROUTER;
        else
                dst = group;

        rt = ip_route_output_ports(net, &fl4, NULL, dst, 0,
                                   0, 0,
                                   IPPROTO_IGMP, 0, dev->ifindex);
        if (IS_ERR(rt))
                return -1;

        hlen = LL_RESERVED_SPACE(dev);
        tlen = dev->needed_tailroom;
        skb = alloc_skb(IGMP_SIZE + hlen + tlen, GFP_ATOMIC);
        if (!skb) {
                ip_rt_put(rt);
                return -1;
        }
        skb->priority = TC_PRIO_CONTROL;

        skb_dst_set(skb, &rt->dst);

        skb_reserve(skb, hlen);

        skb_reset_network_header(skb);
        iph = ip_hdr(skb);
        skb_put(skb, sizeof(struct iphdr) + 4);

        iph->version  = 4;
        iph->ihl      = (sizeof(struct iphdr)+4)>>2;
        iph->tos      = 0xc0;
        iph->frag_off = htons(IP_DF);
        iph->ttl      = 1;
        iph->daddr    = dst;
        iph->saddr    = fl4.saddr;
        iph->protocol = IPPROTO_IGMP;
        ip_select_ident(net, skb, NULL);
        ((u8 *)&iph[1])[0] = IPOPT_RA;
        ((u8 *)&iph[1])[1] = 4;
        ((u8 *)&iph[1])[2] = 0;
        ((u8 *)&iph[1])[3] = 0;

        ih = skb_put(skb, sizeof(struct igmphdr));
        ih->type = type;
        ih->code = 0;
        ih->csum = 0;
        ih->group = group;
        ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));

        return ip_local_out(net, skb->sk, skb);
}

/*
 * mr_gq_timer callback: mark the general-query timer as no longer running,
 * send a report through igmpv3_send_report() with no specific group, and
 * drop one reference on the in_device.
 */
static void igmp_gq_timer_expire(struct timer_list *t)
{
        struct in_device *idev = from_timer(idev, t, mr_gq_timer);

        idev->mr_gq_running = 0;
        igmpv3_send_report(idev, NULL);
        in_dev_put(idev);
}

/*
 * mr_ifc_timer callback: send the pending change records with
 * igmpv3_send_cr(), then, if mr_ifc_count is still non-zero, decrement it
 * with cmpxchg() and re-arm the timer.  One in_device reference is dropped
 * on the way out.
 */
static void igmp_ifc_timer_expire(struct timer_list *t)
{
        struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
        u32 pending;

        igmpv3_send_cr(in_dev);

        for (;;) {
                pending = READ_ONCE(in_dev->mr_ifc_count);
                if (!pending)
                        break;

                /* somebody changed the counter under us: sample it again */
                if (cmpxchg(&in_dev->mr_ifc_count, pending,
                            pending - 1) != pending)
                        continue;

                igmp_ifc_start_timer(in_dev,
                                     unsolicited_report_interval(in_dev));
                break;
        }

        in_dev_put(in_dev);
}

/*
 * Note an interface change: unless IGMP_V1_SEEN() or IGMP_V2_SEEN() holds,
 * reload mr_ifc_count from mr_qrv (or from sysctl_igmp_qrv when mr_qrv is
 * zero) and start the interface-change timer with a delay argument of 1.
 */
static void igmp_ifc_event(struct in_device *in_dev)
{
        struct net *net = dev_net(in_dev->dev);

        if (!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
                WRITE_ONCE(in_dev->mr_ifc_count,
                           in_dev->mr_qrv ?:
                           READ_ONCE(net->ipv4.sysctl_igmp_qrv));
                igmp_ifc_start_timer(in_dev, 1);
        }
}


/*
 * Per-group report timer callback.
 *
 * Under im->lock: clear tm_running, consume one unsolicited report credit
 * (re-arming the timer while credits remain) and set the reporter flag.
 * After unlocking, send a report whose type is picked from
 * IGMP_V1_SEEN()/IGMP_V2_SEEN(), then release one reference on @im with
 * ip_ma_put().
 */
static void igmp_timer_expire(struct timer_list *t)
{
        struct ip_mc_list *im = from_timer(im, t, timer);
        struct in_device *in_dev = im->interface;
        int type;

        spin_lock(&im->lock);
        im->tm_running = 0;

        if (im->unsolicit_count) {
                im->unsolicit_count--;
                if (im->unsolicit_count)
                        igmp_start_timer(im,
                                         unsolicited_report_interval(in_dev));
        }

        im->reporter = 1;
        spin_unlock(&im->lock);

        if (IGMP_V1_SEEN(in_dev))
                type = IGMP_HOST_MEMBERSHIP_REPORT;
        else if (IGMP_V2_SEEN(in_dev))
                type = IGMPV2_HOST_MEMBERSHIP_REPORT;
        else
                type = IGMPV3_HOST_MEMBERSHIP_REPORT;
        igmp_send_report(in_dev, im, type);

        ip_ma_put(im);
}

/*
 * mark EXCLUDE-mode sources
 *
 * Counts how many entries of @pmc's source list are active exclude filters
 * (no INCLUDE count, and an EXCLUDE count equal to the group's) whose
 * address appears among the @nsrcs queried addresses in @srcs.  The walk
 * stops as soon as the count reaches @nsrcs.
 *
 * Always clears pmc->gsquery.  Returns 0 when the count equals @nsrcs
 * ("all sources excluded"), 1 otherwise.
 */
static int igmp_xmarksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
{
        struct ip_sf_list *psf;
        int i, scount = 0;

        for (psf = pmc->sources; psf && scount != nsrcs; psf = psf->sf_next) {
                /* skip inactive filters */
                if (psf->sf_count[MCAST_INCLUDE] ||
                    pmc->sfcount[MCAST_EXCLUDE] != psf->sf_count[MCAST_EXCLUDE])
                        continue;

                for (i = 0; i < nsrcs; i++) {
                        if (srcs[i] == psf->sf_inaddr) {
                                scount++;
                                break;
                        }
                }
        }

        pmc->gsquery = 0;

        /* 0: all sources excluded, 1: at least one is not */
        return scount != nsrcs;
}

/*
 * Match the @nsrcs queried addresses in @srcs against @pmc's source list.
 *
 * EXCLUDE-mode groups are handled by igmp_xmarksources().  For the other
 * mode every list entry whose address is found in @srcs gets sf_gsresp set;
 * the walk stops once @nsrcs entries have matched.
 *
 * pmc->gsquery and the return value are both 1 when at least one source
 * matched and 0 otherwise.
 */
static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
{
        struct ip_sf_list *psf;
        int i, scount = 0;

        if (pmc->sfmode == MCAST_EXCLUDE)
                return igmp_xmarksources(pmc, nsrcs, srcs);

        /* mark INCLUDE-mode sources */
        for (psf = pmc->sources; psf && scount != nsrcs; psf = psf->sf_next) {
                for (i = 0; i < nsrcs; i++) {
                        if (srcs[i] != psf->sf_inaddr)
                                continue;
                        psf->sf_gsresp = 1;
                        scount++;
                        break;
                }
        }

        if (scount) {
                pmc->gsquery = 1;
                return 1;
        }

        pmc->gsquery = 0;
        return 0;
}

/*
 * A membership report for @group was heard on @in_dev: stop the report
 * timer of the first entry on the interface's list with that address.
 *
 * Nothing is done for IGMP_ALL_HOSTS, nor for groups matching
 * ipv4_is_local_multicast() while sysctl_igmp_llm_reports is zero.
 *
 * return true if packet was dropped (this function always returns false)
 */
static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
        struct net *net = dev_net(in_dev->dev);
        struct ip_mc_list *im;

        /* Timers are only set for non-local groups */
        if (group == IGMP_ALL_HOSTS ||
            (ipv4_is_local_multicast(group) &&
             !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)))
                return false;

        rcu_read_lock();
        for_each_pmc_rcu(in_dev, im) {
                if (im->multiaddr != group)
                        continue;
                igmp_stop_timer(im);
                break;
        }
        rcu_read_unlock();

        return false;
}

/*
 * Process a received IGMP membership query of @len bytes on @in_dev.
 *
 * The query flavour is picked from @len and the current querier state:
 *  - len == 8: an IGMPv1 (code == 0) or IGMPv2 (code != 0) query.  The
 *    matching mr_v1_seen/mr_v2_seen deadline is refreshed, the
 *    interface-change timer is cancelled and the deleted-record list is
 *    flushed with igmpv3_clear_delrec().
 *  - any other length below 12: rejected.
 *  - otherwise an IGMPv3-sized query.  While IGMP_V1_SEEN()/IGMP_V2_SEEN()
 *    hold it is only used to derive max_delay; else it is parsed as a v3
 *    query, which also updates mr_maxdelay, mr_qrv, mr_qi and mr_qri.
 *
 * A v3 general query (group == 0) only starts the general-query timer.
 * In every other accepted case the memberships the query applies to get
 * their timer (re)armed through igmp_mod_timer() with max_delay.
 *
 * return true if packet was dropped
 */
static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
        int len)
{
        struct igmphdr                 *ih = igmp_hdr(skb);
        struct igmpv3_query *ih3 = igmpv3_query_hdr(skb);
        struct ip_mc_list        *im;
        __be32                        group = ih->group;
        int                        max_delay;
        int                        mark = 0;
        struct net                *net = dev_net(in_dev->dev);


        if (len == 8) {
                if (ih->code == 0) {
                        /* Alas, old v1 router presents here. */

                        max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
                        in_dev->mr_v1_seen = jiffies +
                                (in_dev->mr_qrv * in_dev->mr_qi) +
                                in_dev->mr_qri;
                        /* treated as a general query regardless of ih->group */
                        group = 0;
                } else {
                        /* v2 router present */
                        max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
                        in_dev->mr_v2_seen = jiffies +
                                (in_dev->mr_qrv * in_dev->mr_qi) +
                                in_dev->mr_qri;
                }
                /* cancel the interface change timer */
                WRITE_ONCE(in_dev->mr_ifc_count, 0);
                /* a pending timer was removed, so drop the reference tied to it */
                if (timer_delete(&in_dev->mr_ifc_timer))
                        __in_dev_put(in_dev);
                /* clear deleted report items */
                igmpv3_clear_delrec(in_dev);
        } else if (len < 12) {
                return true;        /* ignore bogus packet; freed by caller */
        } else if (IGMP_V1_SEEN(in_dev)) {
                /* This is a v3 query with v1 queriers present */
                max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
                group = 0;
        } else if (IGMP_V2_SEEN(in_dev)) {
                /* this is a v3 query with v2 queriers present;
                 * Interpretation of the max_delay code is problematic here.
                 * A real v2 host would use ih_code directly, while v3 has a
                 * different encoding. We use the v3 encoding as more likely
                 * to be intended in a v3 query.
                 */
                max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
                if (!max_delay)
                        max_delay = 1;        /* can't mod w/ 0 */
        } else { /* v3 */
                /* make the fixed part of the v3 query accessible */
                if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
                        return true;

                /* header pointer is fetched again after every pull */
                ih3 = igmpv3_query_hdr(skb);
                if (ih3->nsrcs) {
                        /* also make the source address array accessible */
                        if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)
                                           + ntohs(ih3->nsrcs)*sizeof(__be32)))
                                return true;
                        ih3 = igmpv3_query_hdr(skb);
                }

                max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
                if (!max_delay)
                        max_delay = 1;        /* can't mod w/ 0 */
                in_dev->mr_maxdelay = max_delay;

                /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
                 * received value was zero, use the default or statically
                 * configured value.
                 */
                in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
                in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;

                /* RFC3376, 8.3. Query Response Interval:
                 * The number of seconds represented by the [Query Response
                 * Interval] must be less than the [Query Interval].
                 */
                if (in_dev->mr_qri >= in_dev->mr_qi)
                        in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;

                if (!group) { /* general query */
                        if (ih3->nsrcs)
                                return true;        /* no sources allowed */
                        igmp_gq_start_timer(in_dev);
                        return false;
                }
                /* mark sources to include, if group & source-specific */
                mark = ih3->nsrcs != 0;
        }

        /*
         * - Start the timers in all of our membership records
         *   that the query applies to for the interface on
         *   which the query arrived excl. those that belong
         *   to a "local" group (224.0.0.X)
         * - For timers already running check if they need to
         *   be reset.
         * - Use the igmp->igmp_code field as the maximum
         *   delay possible
         */
        rcu_read_lock();
        for_each_pmc_rcu(in_dev, im) {
                int changed;

                /* group == 0 means the query applies to every membership */
                if (group && group != im->multiaddr)
                        continue;
                if (im->multiaddr == IGMP_ALL_HOSTS)
                        continue;
                if (ipv4_is_local_multicast(im->multiaddr) &&
                    !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
                        continue;
                spin_lock_bh(&im->lock);
                /* with a timer already pending, gsquery stays set only if
                 * it was set before and this query carries sources too
                 */
                if (im->tm_running)
                        im->gsquery = im->gsquery && mark;
                else
                        im->gsquery = mark;
                /* igmp_marksources() (and with it ih3->nsrcs / ih3->srcs) is
                 * only evaluated when gsquery is set; otherwise the timer is
                 * always updated
                 */
                changed = !im->gsquery ||
                        igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
                spin_unlock_bh(&im->lock);
                if (changed)
                        igmp_mod_timer(im, max_delay);
        }
        rcu_read_unlock();
        return false;
}

/*
 * Receive handler for IGMP packets.
 *
 * Finds the in_device of the receiving device (using IPCB(skb)->iif to
 * look up the ingress device when skb->dev is an L3 master), makes sure an
 * igmphdr can be pulled and that the checksum validates, then dispatches
 * on the IGMP type:
 *  - queries go to igmp_heard_query();
 *  - v1/v2 reports that are not our own looped-back output and arrived as
 *    multicast or broadcast go to igmp_heard_report();
 *  - IGMP_PIM is handed to pim_rcv_v1() when CONFIG_IP_PIMSM_V1 is set;
 *  - every other type is left untouched.
 *
 * Except on the pim_rcv_v1() path, which returns that function's value,
 * the skb is always released here and 0 is returned: consume_skb() when a
 * handler reported the packet as not dropped, kfree_skb() in every other
 * case (including the ignored types, since "dropped" starts out true).
 *
 * called in rcu_read_lock() section
 */
int igmp_rcv(struct sk_buff *skb)
{
        /* This basically follows the spec line by line -- see RFC1112 */
        struct igmphdr *ih;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        int len = skb->len;
        bool dropped = true;

        /* for an L3 master device, use the device recorded in the skb's iif */
        if (netif_is_l3_master(dev)) {
                dev = dev_get_by_index_rcu(dev_net(dev), IPCB(skb)->iif);
                if (!dev)
                        goto drop;
        }

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                goto drop;

        if (!pskb_may_pull(skb, sizeof(struct igmphdr)))
                goto drop;

        if (skb_checksum_simple_validate(skb))
                goto drop;

        ih = igmp_hdr(skb);
        switch (ih->type) {
        case IGMP_HOST_MEMBERSHIP_QUERY:
                dropped = igmp_heard_query(in_dev, skb, len);
                break;
        case IGMP_HOST_MEMBERSHIP_REPORT:
        case IGMPV2_HOST_MEMBERSHIP_REPORT:
                /* Is it our report looped back? */
                if (rt_is_output_route(skb_rtable(skb)))
                        break;
                /* don't rely on MC router hearing unicast reports */
                if (skb->pkt_type == PACKET_MULTICAST ||
                    skb->pkt_type == PACKET_BROADCAST)
                        dropped = igmp_heard_report(in_dev, ih->group);
                break;
        case IGMP_PIM:
#ifdef CONFIG_IP_PIMSM_V1
                return pim_rcv_v1(skb);
#endif
                /* without CONFIG_IP_PIMSM_V1 this falls through to the
                 * ignored types below
                 */
        case IGMPV3_HOST_MEMBERSHIP_REPORT:
        case IGMP_DVMRP:
        case IGMP_TRACE:
        case IGMP_HOST_LEAVE_MESSAGE:
        case IGMP_MTRACE:
        case IGMP_MTRACE_RESP:
                break;
        default:
                break;
        }

drop:
        /* single release point for the skb */
        if (dropped)
                kfree_skb(skb);
        else
                consume_skb(skb);
        return 0;
}

#endif


/*
 *        Add a filter to a device
 *
 *        @addr is translated by arp_mc_map() into a MAX_ADDR_LEN byte
 *        buffer; when that call returns 0 the result is registered on
 *        the in_device's net_device with dev_mc_add().
 */

static void ip_mc_filter_add(struct in_device *in_dev, __be32 addr)
{
        struct net_device *dev = in_dev->dev;
        char hwaddr[MAX_ADDR_LEN];

        /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
         * We will get multicast token leakage, when IFF_MULTICAST
         * is changed. This check should be done in ndo_set_rx_mode
         * routine. Something sort of:
         * if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
         * --ANK
         */
        if (!arp_mc_map(addr, hwaddr, dev, 0))
                dev_mc_add(dev, hwaddr);
}

/*
 *        Remove a filter from a device
 *
 *        Counterpart of ip_mc_filter_add(): @addr is translated by
 *        arp_mc_map() and, when that returns 0, the result is removed
 *        from the net_device with dev_mc_del().
 */

static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
{
        struct net_device *dev = in_dev->dev;
        char hwaddr[MAX_ADDR_LEN];

        if (!arp_mc_map(addr, hwaddr, dev, 0))
                dev_mc_del(dev, hwaddr);
}

#ifdef CONFIG_IP_MULTICAST
/*
 * deleted ip_mc_list manipulation
 *
 * Record that group @im has been dropped from @in_dev by putting a freshly
 * allocated entry (allocated with @gfp) at the head of in_dev->mc_tomb.
 * If the allocation fails nothing is recorded.
 */
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
                              gfp_t gfp)
{
        struct net *net = dev_net(in_dev->dev);
        struct ip_mc_list *pmc;
        struct ip_sf_list *psf;

        /* this is an "ip_mc_list" for convenience; only the fields below
         * are actually used. In particular, the refcnt and users are not
         * used for management of the delete list. Using the same structure
         * for deleted items allows change reports to use common code with
         * non-deleted or query-response MCA's.
         */
        pmc = kzalloc(sizeof(*pmc), gfp);
        if (!pmc)
                return;
        spin_lock_init(&pmc->lock);

        /* copy the state out of @im while holding its lock */
        spin_lock_bh(&im->lock);
        pmc->interface = im->interface;
        in_dev_hold(in_dev);
        pmc->multiaddr = im->multiaddr;
        pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
        pmc->sfmode = im->sfmode;
        if (pmc->sfmode == MCAST_INCLUDE) {
                /* both source lists move from @im to the record */
                pmc->tomb = im->tomb;
                pmc->sources = im->sources;
                im->sources = NULL;
                im->tomb = NULL;

                /* every moved source gets the record's retransmit count */
                for (psf = pmc->sources; psf; psf = psf->sf_next)
                        psf->sf_crcount = pmc->crcount;
        }
        spin_unlock_bh(&im->lock);

        /* push the record onto the head of the tomb list */
        spin_lock_bh(&in_dev->mc_tomb_lock);
        pmc->next = in_dev->mc_tomb;
        in_dev->mc_tomb = pmc;
        spin_unlock_bh(&in_dev->mc_tomb_lock);
}

/*
 * restore ip_mc_list deleted records
 *
 * Looks on in_dev->mc_tomb for the first record whose group address equals
 * that of @im.  If one exists it is unlinked (under mc_tomb_lock) and then,
 * under im->lock, folded back into @im: the interface pointer is copied,
 * and either the source lists are swapped in and given a fresh sf_crcount
 * (INCLUDE mode) or im->crcount is reloaded (any other mode).  The record's
 * in_device reference is dropped and the record is freed.
 */
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
        struct ip_mc_list *pmc, *pmc_prev;
        struct ip_sf_list *psf;
        struct net *net = dev_net(in_dev->dev);
        __be32 multiaddr = im->multiaddr;

        /* find and unlink the matching record, remembering its predecessor */
        spin_lock_bh(&in_dev->mc_tomb_lock);
        pmc_prev = NULL;
        for (pmc = in_dev->mc_tomb; pmc; pmc = pmc->next) {
                if (pmc->multiaddr == multiaddr)
                        break;
                pmc_prev = pmc;
        }
        if (pmc) {
                if (pmc_prev)
                        pmc_prev->next = pmc->next;
                else
                        in_dev->mc_tomb = pmc->next;
        }
        spin_unlock_bh(&in_dev->mc_tomb_lock);

        /* pmc, if any, is off the list now; merge it into im */
        spin_lock_bh(&im->lock);
        if (pmc) {
                im->interface = pmc->interface;
                if (im->sfmode == MCAST_INCLUDE) {
                        /* after the swap pmc holds im's previous lists */
                        swap(im->tomb, pmc->tomb);
                        swap(im->sources, pmc->sources);
                        for (psf = im->sources; psf; psf = psf->sf_next)
                                psf->sf_crcount = in_dev->mr_qrv ?:
                                        READ_ONCE(net->ipv4.sysctl_igmp_qrv);
                } else {
                        im->crcount = in_dev->mr_qrv ?:
                                READ_ONCE(net->ipv4.sysctl_igmp_qrv);
                }
                /* release the record's in_device reference, then the record */
                in_dev_put(pmc->interface);
                kfree_pmc(pmc);
        }
        spin_unlock_bh(&im->lock);
}

/*
 * flush ip_mc_list deleted records
 *
 * Detaches the whole in_dev->mc_tomb list under mc_tomb_lock and frees
 * each record (sources, in_device reference, the record itself) outside
 * of that lock.  Afterwards the tomb source list of every group still on
 * the interface is detached under the group's lock and cleared.
 */
static void igmpv3_clear_delrec(struct in_device *in_dev)
{
        struct ip_mc_list *pmc, *nextpmc;
        struct ip_sf_list *psf;

        spin_lock_bh(&in_dev->mc_tomb_lock);
        pmc = in_dev->mc_tomb;
        in_dev->mc_tomb = NULL;
        spin_unlock_bh(&in_dev->mc_tomb_lock);

        while (pmc) {
                /* pmc is freed below, so fetch its successor first */
                nextpmc = pmc->next;
                ip_mc_clear_src(pmc);
                in_dev_put(pmc->interface);
                kfree_pmc(pmc);
                pmc = nextpmc;
        }

        /* clear dead sources, too */
        rcu_read_lock();
        for_each_pmc_rcu(in_dev, pmc) {
                spin_lock_bh(&pmc->lock);
                psf = pmc->tomb;
                pmc->tomb = NULL;
                spin_unlock_bh(&pmc->lock);

                ip_sf_list_clear_all(psf);
        }
        rcu_read_unlock();
}
#endif

/*
 * Group @im is being dropped from its interface.
 *
 * Removes the device filter if it was loaded.  With CONFIG_IP_MULTICAST,
 * and unless the group is IGMP_ALL_HOSTS or a suppressed link-local group,
 * the report timer is stopped and, on an interface that is not dead:
 *  - nothing more happens while IGMP_V1_SEEN() holds;
 *  - while IGMP_V2_SEEN() holds a leave message is sent if the reporter
 *    flag was set;
 *  - otherwise a deleted-record is queued (allocated with @gfp) and an
 *    interface-change event is raised.
 */
static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp)
{
        struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
        struct net *net = dev_net(in_dev->dev);
        int reporter;
#endif

        if (im->loaded) {
                im->loaded = 0;
                ip_mc_filter_del(in_dev, im->multiaddr);
        }

#ifdef CONFIG_IP_MULTICAST
        if (im->multiaddr == IGMP_ALL_HOSTS ||
            (ipv4_is_local_multicast(im->multiaddr) &&
             !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)))
                return;

        /* sampled before igmp_stop_timer() runs - presumably that call may
         * change the flag; confirm against igmp_stop_timer()
         */
        reporter = im->reporter;
        igmp_stop_timer(im);

        if (in_dev->dead || IGMP_V1_SEEN(in_dev))
                return;

        if (IGMP_V2_SEEN(in_dev)) {
                if (reporter)
                        igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
                return;
        }

        /* IGMPv3 */
        igmpv3_add_delrec(in_dev, im, gfp);
        igmp_ifc_event(in_dev);
#endif
}

/* Convenience wrapper: __igmp_group_dropped() with GFP_KERNEL as the gfp argument. */
static void igmp_group_dropped(struct ip_mc_list *im)
{
        __igmp_group_dropped(im, GFP_KERNEL);
}

/*
 * Group @im has been added to its interface.
 *
 * Installs the device filter if it is not loaded yet.  With
 * CONFIG_IP_MULTICAST, and unless the group is IGMP_ALL_HOSTS, a
 * suppressed link-local group, or the interface is dead, the unsolicited
 * report count is loaded from sysctl_igmp_qrv and then either the report
 * timer is started (v1/v2 querier seen) or an interface-change event is
 * raised (v3).
 */
static void igmp_group_added(struct ip_mc_list *im)
{
        struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
        struct net *net = dev_net(in_dev->dev);
#endif

        if (!im->loaded) {
                im->loaded = 1;
                ip_mc_filter_add(in_dev, im->multiaddr);
        }

#ifdef CONFIG_IP_MULTICAST
        if (im->multiaddr == IGMP_ALL_HOSTS ||
            (ipv4_is_local_multicast(im->multiaddr) &&
             !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)) ||
            in_dev->dead)
                return;

        im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv);

        if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
                spin_lock_bh(&im->lock);
                igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
                spin_unlock_bh(&im->lock);
        } else {
                /* v3 */

                /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we
                 * should not send filter-mode change record as the mode
                 * should be from IN() to IN(A).
                 */
                if (im->sfmode == MCAST_EXCLUDE)
                        im->crcount = in_dev->mr_qrv ?:
                                READ_ONCE(net->ipv4.sysctl_igmp_qrv);

                igmp_ifc_event(in_dev);
        }
#endif
}


/*
 *        Multicast list managers
 */

/*
 * Bucket index of @im in the per-interface mc_hash table: hash_32() of the
 * group address with MC_HASH_SZ_LOG as the size argument.
 */
static u32 ip_mc_hash(const struct ip_mc_list *im)
{
        return hash_32((__force u32)im->multiaddr, MC_HASH_SZ_LOG);
}

/*
 * Make @im reachable through in_dev->mc_hash.
 *
 * If the table already exists, @im is linked at the head of its bucket.
 * Otherwise a table is created once mc_count has reached 4, filled with
 * every entry currently on the interface's list, and then published.
 * Failure to allocate the table is silently tolerated; lookups then keep
 * using the plain list.
 */
static void ip_mc_hash_add(struct in_device *in_dev,
                           struct ip_mc_list *im)
{
        struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
        struct ip_mc_list *pmc;
        u32 hash;

        if (mc_hash) {
                hash = ip_mc_hash(im);
                im->next_hash = mc_hash[hash];
                rcu_assign_pointer(mc_hash[hash], im);
                return;
        }

        /* do not use a hash table for small number of items */
        if (in_dev->mc_count < 4)
                return;

        mc_hash = kzalloc(sizeof(struct ip_mc_list *) << MC_HASH_SZ_LOG,
                          GFP_KERNEL);
        if (!mc_hash)
                return;

        /* the table is not visible to anyone yet, so plain pointer
         * initialisation is used while filling it
         */
        for_each_pmc_rtnl(in_dev, pmc) {
                hash = ip_mc_hash(pmc);
                pmc->next_hash = mc_hash[hash];
                RCU_INIT_POINTER(mc_hash[hash], pmc);
        }

        rcu_assign_pointer(in_dev->mc_hash, mc_hash);
}

static void ip_mc_hash_remove(struct in_device *in_dev,
                              struct ip_mc_list *im)
{
        struct ip_mc_list __rcu **mc_hash = rtnl_dereference(in_dev->mc_hash);
        struct ip_mc_list *aux;

        if (!mc_hash)
                return;
        mc_hash += ip_mc_hash(im);
        while ((aux = rtnl_dereference(*mc_hash)) != im)
                mc_hash = &aux->next_hash;
        *mc_hash = im->next_hash;
}

/*
 * Append one netlink message describing multicast address @im on @dev to
 * @skb.  Port id, sequence number, message type and flags come from @args.
 *
 * The message consists of an ifaddrmsg (AF_INET, prefix length 32) plus
 * IFA_MULTICAST and IFA_CACHEINFO attributes; both cacheinfo timestamps
 * are derived from im->mca_cstamp.
 *
 * Returns 0 on success, or -EMSGSIZE when the header or an attribute does
 * not fit, in which case a partially built message is cancelled.
 */
int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
                       const struct ip_mc_list *im,
                       struct inet_fill_args *args)
{
        struct ifa_cacheinfo ci;
        struct ifaddrmsg *ifm;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        ifm = nlmsg_data(nlh);
        ifm->ifa_family = AF_INET;
        ifm->ifa_prefixlen = 32;
        ifm->ifa_flags = IFA_F_PERMANENT;
        ifm->ifa_scope = RT_SCOPE_UNIVERSE;
        ifm->ifa_index = dev->ifindex;

        /* creation stamp relative to INITIAL_JIFFIES, scaled by 100 / HZ */
        ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ;
        ci.tstamp = ci.cstamp;
        ci.ifa_prefered = INFINITY_LIFE_TIME;
        ci.ifa_valid = INFINITY_LIFE_TIME;

        if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0)
                goto cancel;
        if (nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Send an @event notification about multicast address @im on @dev to the
 * RTNLGRP_IPV4_MCADDR netlink group.  If the message cannot be allocated
 * (-ENOMEM) or filled in, the error is recorded on the group with
 * rtnl_set_sk_err() instead.
 */
static void inet_ifmcaddr_notify(struct net_device *dev,
                                 const struct ip_mc_list *im, int event)
{
        struct inet_fill_args fillargs = {
                .event = event,
        };
        struct net *net = dev_net(dev);
        struct sk_buff *skb;
        int err = -ENOMEM;

        skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
                        nla_total_size(sizeof(__be32)) +
                        nla_total_size(sizeof(struct ifa_cacheinfo)),
                        GFP_KERNEL);
        if (skb) {
                err = inet_fill_ifmcaddr(skb, dev, im, &fillargs);
                if (err >= 0) {
                        rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL,
                                    GFP_KERNEL);
                        return;
                }

                /* the skb was sized for exactly this message above */
                WARN_ON_ONCE(err == -EMSGSIZE);
                nlmsg_free(skb);
        }

        rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err);
}

/*
 *        A socket has joined a multicast group on device dev.
 *
 *        If @addr is already on @in_dev's list, its user count is bumped
 *        and the source filter is updated for @mode.  Otherwise a new
 *        entry is allocated with @gfp, linked at the head of the list,
 *        hashed, merged with a matching deleted record (if any) and
 *        announced.  An allocation failure is silently ignored.
 *        Must be called with RTNL held.
 */
static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
                                unsigned int mode, gfp_t gfp)
{
        struct ip_mc_list __rcu **mc_hash;
        struct ip_mc_list *im;

        ASSERT_RTNL();

        /* look the group up: hash bucket if there is a table, list if not */
        mc_hash = rtnl_dereference(in_dev->mc_hash);
        if (mc_hash) {
                u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG);

                im = rtnl_dereference(mc_hash[hash]);
                while (im && im->multiaddr != addr)
                        im = rtnl_dereference(im->next_hash);
        } else {
                for_each_pmc_rtnl(in_dev, im) {
                        if (im->multiaddr == addr)
                                break;
                }
        }

        if (im) {
                im->users++;
                ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
                return;
        }

        im = kzalloc(sizeof(*im), gfp);
        if (!im)
                return;

        im->users = 1;
        im->interface = in_dev;
        in_dev_hold(in_dev);
        im->multiaddr = addr;
        im->mca_cstamp = jiffies;
        im->mca_tstamp = im->mca_cstamp;
        /* start in the requested filter mode, counted once */
        im->sfmode = mode;
        im->sfcount[mode] = 1;
        refcount_set(&im->refcnt, 1);
        spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
        timer_setup(&im->timer, igmp_timer_expire, 0);
#endif

        /* link at the head of the list */
        im->next_rcu = in_dev->mc_list;
        in_dev->mc_count++;
        rcu_assign_pointer(in_dev->mc_list, im);

        ip_mc_hash_add(in_dev, im);

#ifdef CONFIG_IP_MULTICAST
        igmpv3_del_delrec(in_dev, im);
#endif
        igmp_group_added(im);
        inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST);
        if (!in_dev->dead)
                ip_rt_multicast_event(in_dev);
}

/*
 * Join group @addr on @in_dev in MCAST_EXCLUDE filter mode, allocating
 * with the caller-supplied @gfp flags.
 */
void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
        ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp);
}
EXPORT_SYMBOL(__ip_mc_inc_group);

/* Join group @addr on @in_dev: __ip_mc_inc_group() with GFP_KERNEL. */
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
        __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
}
EXPORT_SYMBOL(ip_mc_inc_group);

static int ip_mc_check_iphdr(struct sk_buff *skb)
{
        const struct iphdr *iph;
        unsigned int len;
        unsigned int offset = skb_network_offset(skb) + sizeof(*iph);

        if (!pskb_may_pull(skb, offset))
                return -EINVAL;

        iph = ip_hdr(skb);

        if (iph->version != 4 || ip_hdrlen(skb) < sizeof(*iph))
                return -EINVAL;

        offset += ip_hdrlen(skb) - sizeof(*iph);

        if (!pskb_may_pull(skb, offset))
                return -EINVAL;

        iph = ip_hdr(skb);

        if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
                return -EINVAL;

        len = skb_network_offset(skb) + ntohs(iph->tot_len);
        if (skb->len < len || len < offset)
                return -EINVAL;

        skb_set_transport_header(skb, offset);

        return 0;
}

/*
 * An IGMPv3 report must at least contain a complete struct igmpv3_report
 * past the transport offset.  Returns 0 if ip_mc_may_pull() accepts that
 * length, -EINVAL otherwise.
 */
static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
{
        unsigned int len = skb_transport_offset(skb) +
                           sizeof(struct igmpv3_report);

        if (!ip_mc_may_pull(skb, len))
                return -EINVAL;

        return 0;
}

/*
 * Sanity-check an IGMP membership query.
 *
 * The transport payload must either be exactly one struct igmphdr
 * (IGMPv1/v2) or at least a pullable struct igmpv3_query.  A query with a
 * zero group field must additionally be addressed to
 * INADDR_ALLHOSTS_GROUP.  Returns 0 if acceptable, -EINVAL otherwise.
 */
static int ip_mc_check_igmp_query(struct sk_buff *skb)
{
        unsigned int transport_len = ip_transport_len(skb);
        unsigned int len;

        /* anything that is not IGMPv{1,2} sized has to be a full IGMPv3 query */
        if (transport_len != sizeof(struct igmphdr)) {
                if (transport_len < sizeof(struct igmpv3_query))
                        return -EINVAL;

                len = skb_transport_offset(skb) + sizeof(struct igmpv3_query);
                if (!ip_mc_may_pull(skb, len))
                        return -EINVAL;
        }

        /* group-specific queries carry no destination constraint here */
        if (igmp_hdr(skb)->group)
                return 0;

        /* RFC2236+RFC3376 (IGMPv2+IGMPv3) require the multicast link layer
         * all-systems destination addresses (224.0.0.1) for general queries
         */
        if (ip_hdr(skb)->daddr != htonl(INADDR_ALLHOSTS_GROUP))
                return -EINVAL;

        return 0;
}

/*
 * Per-type validation of an IGMP message.
 *
 * Leave messages and v1/v2 reports need no further checks, v3 reports and
 * queries are passed to their dedicated validators, and every other type
 * is reported as -ENOMSG.
 */
static int ip_mc_check_igmp_msg(struct sk_buff *skb)
{
        int ret;

        switch (igmp_hdr(skb)->type) {
        case IGMP_HOST_MEMBERSHIP_QUERY:
                ret = ip_mc_check_igmp_query(skb);
                break;
        case IGMPV3_HOST_MEMBERSHIP_REPORT:
                ret = ip_mc_check_igmp_reportv3(skb);
                break;
        case IGMP_HOST_MEMBERSHIP_REPORT:
        case IGMPV2_HOST_MEMBERSHIP_REPORT:
        case IGMP_HOST_LEAVE_MESSAGE:
                ret = 0;
                break;
        default:
                ret = -ENOMSG;
                break;
        }

        return ret;
}

/*
 * Function-shaped wrapper around skb_checksum_simple_validate(); it is
 * passed as the validation callback to skb_checksum_trimmed() in
 * ip_mc_check_igmp_csum().
 */
static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
{
        return skb_checksum_simple_validate(skb);
}

/*
 * Verify the IGMP checksum over the transport payload.
 *
 * The basic igmphdr must be pullable, then skb_checksum_trimmed() is run
 * over ip_transport_len() bytes with ip_mc_validate_checksum() as the
 * validator.  Returns 0 on success and -EINVAL when the header cannot be
 * pulled or skb_checksum_trimmed() returns NULL.
 */
static int ip_mc_check_igmp_csum(struct sk_buff *skb)
{
        unsigned int transport_len = ip_transport_len(skb);
        unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
        struct sk_buff *skb_chk;

        if (!ip_mc_may_pull(skb, len))
                return -EINVAL;

        skb_chk = skb_checksum_trimmed(skb, transport_len,
                                       ip_mc_validate_checksum);
        if (!skb_chk)
                return -EINVAL;

        /* a result other than the input skb is only needed for the check */
        if (skb_chk != skb)
                kfree_skb(skb_chk);

        return 0;
}

/**
 * ip_mc_check_igmp - checks whether this is a sane IGMP packet
 * @skb: the skb to validate
 *
 * Checks whether an IPv4 packet is a valid IGMP packet. If so sets
 * skb transport header accordingly and returns zero.
 *
 * The IP header is validated first, then the protocol field, then the
 * IGMP checksum and finally the type specific layout.
 *
 * -EINVAL: A broken packet was detected, i.e. it violates some internet
 *  standard
 * -ENOMSG: IP header validation succeeded but it is not an IGMP packet,
 *  or it is an IGMP packet of a type that is not checked here.
 *
 * Caller needs to set the skb network header.
 */
int ip_mc_check_igmp(struct sk_buff *skb)
{
        int ret;

        ret = ip_mc_check_iphdr(skb);
        if (ret < 0)
                return ret;

        if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
                return -ENOMSG;

        ret = ip_mc_check_igmp_csum(skb);

        return ret < 0 ? ret : ip_mc_check_igmp_msg(skb);
}
EXPORT_SYMBOL(ip_mc_check_igmp);

/*
 *        Resend IGMP JOIN report; used by netdev notifier.
 *
 *        With CONFIG_IP_MULTICAST, a report is sent right away for every
 *        group on @in_dev except IGMP_ALL_HOSTS and suppressed link-local
 *        groups.  The report type is re-evaluated for each group from
 *        IGMP_V1_SEEN()/IGMP_V2_SEEN().  Must be called with RTNL held.
 */
static void ip_mc_rejoin_groups(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
        struct net *net = dev_net(in_dev->dev);
        struct ip_mc_list *im;
        int type;

        ASSERT_RTNL();

        for_each_pmc_rtnl(in_dev, im) {
                if (im->multiaddr == IGMP_ALL_HOSTS ||
                    (ipv4_is_local_multicast(im->multiaddr) &&
                     !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports)))
                        continue;

                /* a failover is happening and switches
                 * must be notified immediately
                 */
                type = IGMPV3_HOST_MEMBERSHIP_REPORT;
                if (IGMP_V1_SEEN(in_dev))
                        type = IGMP_HOST_MEMBERSHIP_REPORT;
                else if (IGMP_V2_SEEN(in_dev))
                        type = IGMPV2_HOST_MEMBERSHIP_REPORT;

                igmp_send_report(in_dev, im, type);
        }
#endif
}

/*
 *        A socket has left a multicast group on device dev
 *
 *        Drops one user from the entry for @addr on @in_dev. When the last
 *        user goes away the entry is unhashed, unlinked from the device
 *        list, announced via RTM_DELMULTICAST and released. Nothing happens
 *        if @addr is not on the list.
 */

void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
        struct ip_mc_list __rcu **link;
        struct ip_mc_list *grp;

        ASSERT_RTNL();

        for (link = &in_dev->mc_list;
             (grp = rtnl_dereference(*link)) != NULL;
             link = &grp->next_rcu) {
                if (grp->multiaddr != addr)
                        continue;

                /* still in use by somebody else: only the count changes */
                if (--grp->users != 0)
                        return;

                ip_mc_hash_remove(in_dev, grp);
                *link = grp->next_rcu;
                in_dev->mc_count--;
                __igmp_group_dropped(grp, gfp);
                inet_ifmcaddr_notify(in_dev->dev, grp, RTM_DELMULTICAST);
                ip_mc_clear_src(grp);

                if (!in_dev->dead)
                        ip_rt_multicast_event(in_dev);

                ip_ma_put(grp);
                return;
        }
}
EXPORT_SYMBOL(__ip_mc_dec_group);

/* Device changing type */

void ip_mc_unmap(struct in_device *in_dev)
{
        struct ip_mc_list *pmc;

        ASSERT_RTNL();

        for_each_pmc_rtnl(in_dev, pmc)
                igmp_group_dropped(pmc);
}

/* Counterpart of ip_mc_unmap(): for every group on the device, discard a
 * pending delete record (multicast builds only) and report the group as
 * added again.
 */
void ip_mc_remap(struct in_device *in_dev)
{
        struct ip_mc_list *grp;

        ASSERT_RTNL();

        for_each_pmc_rtnl(in_dev, grp) {
#ifdef CONFIG_IP_MULTICAST
                igmpv3_del_delrec(in_dev, grp);
#endif
                igmp_group_added(grp);
        }
}

/* Device going down */

void ip_mc_down(struct in_device *in_dev)
{
        struct ip_mc_list *pmc;

        ASSERT_RTNL();

        for_each_pmc_rtnl(in_dev, pmc)
                igmp_group_dropped(pmc);

#ifdef CONFIG_IP_MULTICAST
        WRITE_ONCE(in_dev->mr_ifc_count, 0);
        if (timer_delete(&in_dev->mr_ifc_timer))
                __in_dev_put(in_dev);
        in_dev->mr_gq_running = 0;
        if (timer_delete(&in_dev->mr_gq_timer))
                __in_dev_put(in_dev);
#endif

        ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}

/* Restore the per-device query interval, query response interval and
 * robustness variable to their defaults; the latter is read from the
 * igmp_qrv sysctl. Does nothing without CONFIG_IP_MULTICAST.
 */
static void ip_mc_reset(struct in_device *in_dev)
{
#ifdef CONFIG_IP_MULTICAST
        struct net *net = dev_net(in_dev->dev);

        in_dev->mr_qi = IGMP_QUERY_INTERVAL;
        in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
        in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
#endif
}

/* One-time multicast setup for a new in_device: tomb lock, IGMP timers
 * (multicast builds only) and default query parameters.
 */
void ip_mc_init_dev(struct in_device *in_dev)
{
        ASSERT_RTNL();

        spin_lock_init(&in_dev->mc_tomb_lock);

#ifdef CONFIG_IP_MULTICAST
        timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
        timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
#endif

        ip_mc_reset(in_dev);
}

/* Device going up: reset query parameters, join the all-hosts group and
 * report every group on the device as added.
 */

void ip_mc_up(struct in_device *in_dev)
{
        struct ip_mc_list *grp;

        ASSERT_RTNL();

        ip_mc_reset(in_dev);
        ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);

        for_each_pmc_rtnl(in_dev, grp) {
#ifdef CONFIG_IP_MULTICAST
                /* discard a pending delete record for this group first */
                igmpv3_del_delrec(in_dev, grp);
#endif
                igmp_group_added(grp);
        }
}

/*
 *        Device is about to be destroyed: clean up.
 *
 *        Takes the device down, clears the delete records (multicast
 *        builds only) and then releases every entry left on the group list.
 */

void ip_mc_destroy_dev(struct in_device *in_dev)
{
        struct ip_mc_list *grp;

        ASSERT_RTNL();

        /* Deactivate timers */
        ip_mc_down(in_dev);
#ifdef CONFIG_IP_MULTICAST
        igmpv3_clear_delrec(in_dev);
#endif

        /* pop entries off the head until the list is empty */
        for (grp = rtnl_dereference(in_dev->mc_list); grp;
             grp = rtnl_dereference(in_dev->mc_list)) {
                in_dev->mc_list = grp->next_rcu;
                in_dev->mc_count--;
                ip_mc_clear_src(grp);
                ip_ma_put(grp);
        }
}

/* RTNL is locked
 *
 * Resolve the in_device a multicast request refers to. Lookup order:
 *   1. imr->imr_ifindex, if non-zero;
 *   2. the device owning imr->imr_address, if non-zero;
 *   3. the output device of a route towards imr->imr_multiaddr.
 * For cases 2 and 3 imr->imr_ifindex is filled in with the device found.
 * Returns NULL when no device can be determined.
 */
static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
{
        struct net_device *dev = NULL;

        if (imr->imr_ifindex)
                return inetdev_by_index(net, imr->imr_ifindex);

        if (imr->imr_address.s_addr) {
                dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
        } else {
                struct rtable *rt;

                rt = ip_route_output(net, imr->imr_multiaddr.s_addr,
                                     0, 0, 0, RT_SCOPE_UNIVERSE);
                if (!IS_ERR(rt)) {
                        dev = rt->dst.dev;
                        ip_rt_put(rt);
                }
        }

        if (!dev)
                return NULL;

        imr->imr_ifindex = dev->ifindex;
        return __in_dev_get_rtnl(dev);
}

/*
 *        Remove one reference to a single source address from a group's
 *        source list.
 *
 *        @pmc:    group entry whose ->sources list is searched
 *        @sfmode: index into sf_count[] to decrement
 *        @psfsrc: points to the source address to look up
 *
 *        Returns -ESRCH if the source is not listed or its count for
 *        @sfmode is already zero, 1 if the entry was moved to pmc->tomb
 *        (only possible with CONFIG_IP_MULTICAST), 0 otherwise.
 */

static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
        __be32 *psfsrc)
{
        struct ip_sf_list *psf, *psf_prev;
        int rv = 0;

        /* find the entry and remember its predecessor for unlinking */
        psf_prev = NULL;
        for (psf = pmc->sources; psf; psf = psf->sf_next) {
                if (psf->sf_inaddr == *psfsrc)
                        break;
                psf_prev = psf;
        }
        if (!psf || psf->sf_count[sfmode] == 0) {
                /* source filter not found, or count wrong =>  bug */
                return -ESRCH;
        }
        psf->sf_count[sfmode]--;
        if (psf->sf_count[sfmode] == 0) {
                ip_rt_multicast_event(pmc->interface);
        }
        if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
                struct in_device *in_dev = pmc->interface;
                struct net *net = dev_net(in_dev->dev);
#endif

                /* no more filters for this source */
                if (psf_prev)
                        psf_prev->sf_next = psf->sf_next;
                else
                        pmc->sources = psf->sf_next;
#ifdef CONFIG_IP_MULTICAST
                /* A source that was marked active (sf_oldin) is parked on
                 * the tomb list with a fresh retransmit count instead of
                 * being freed, unless an IGMPv1/v2 querier has been seen.
                 */
                if (psf->sf_oldin &&
                    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
                        psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
                        psf->sf_next = pmc->tomb;
                        pmc->tomb = psf;
                        rv = 1;
                } else
#endif
                        /* NOTE: this statement is the else-branch above when
                         * CONFIG_IP_MULTICAST is set and unconditional
                         * otherwise.
                         */
                        kfree(psf);
        }
        return rv;
}

/* Without CONFIG_IP_MULTICAST, igmp_ifc_event() expands to an empty
 * statement, so any call site compiled in that configuration is a no-op.
 */
#ifndef CONFIG_IP_MULTICAST
#define igmp_ifc_event(x)        do { } while (0)
#endif

/*
 * Remove a list of source filters from the interface-level state of a group.
 *
 * @in_dev:  device whose group list is searched; NULL yields -ENODEV
 * @pmca:    points to the group address
 * @sfmode:  filter mode index (used for pmc->sfcount[] and sf_count[])
 * @sfcount: number of addresses in @psfsrc
 * @psfsrc:  source addresses to remove (not touched when @sfcount is 0)
 * @delta:   when zero, the group-wide pmc->sfcount[@sfmode] is decremented
 *           as well; when non-zero only the per-source counts change
 *
 * Returns 0 on success, -ENODEV / -ESRCH when the device or group is
 * missing, -EINVAL when @delta is zero but pmc->sfcount[@sfmode] is already
 * zero, or the first negative result from ip_mc_del1_src().
 */
static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
                         int sfcount, __be32 *psfsrc, int delta)
{
        struct ip_mc_list *pmc;
        int        changerec = 0;
        int        i, err;

        if (!in_dev)
                return -ENODEV;
        rcu_read_lock();
        for_each_pmc_rcu(in_dev, pmc) {
                if (*pmca == pmc->multiaddr)
                        break;
        }
        if (!pmc) {
                /* MCA not found?? bug */
                rcu_read_unlock();
                return -ESRCH;
        }
        /* take the per-group lock before leaving the RCU read section */
        spin_lock_bh(&pmc->lock);
        rcu_read_unlock();
#ifdef CONFIG_IP_MULTICAST
        /* snapshot the current per-source state for sf_setstate() below */
        sf_markstate(pmc);
#endif
        if (!delta) {
                err = -EINVAL;
                if (!pmc->sfcount[sfmode])
                        goto out_unlock;
                pmc->sfcount[sfmode]--;
        }
        err = 0;
        /* keep going after a failure; only the first error is reported */
        for (i = 0; i < sfcount; i++) {
                int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);

                changerec |= rv > 0;
                if (!err && rv < 0)
                        err = rv;
        }
        if (pmc->sfmode == MCAST_EXCLUDE &&
            pmc->sfcount[MCAST_EXCLUDE] == 0 &&
            pmc->sfcount[MCAST_INCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
                struct ip_sf_list *psf;
                struct net *net = dev_net(in_dev->dev);
#endif

                /* filter mode change */
                pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
                /* restart the change-report counters for the new mode */
                pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
                WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
                for (psf = pmc->sources; psf; psf = psf->sf_next)
                        psf->sf_crcount = 0;
                igmp_ifc_event(pmc->interface);
        } else if (sf_setstate(pmc) || changerec) {
                /* no mode change, but some source changed state */
                igmp_ifc_event(pmc->interface);
#endif
        }
out_unlock:
        spin_unlock_bh(&pmc->lock);
        return err;
}

/*
 * Add multicast single-source filter to the interface list
 *
 * Looks up *psfsrc in pmc->sources, appending a zeroed entry if it is not
 * there yet, and bumps its count for @sfmode. Returns 0, or -ENOBUFS if a
 * new entry could not be allocated.
 */
static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
        __be32 *psfsrc)
{
        struct ip_sf_list *last = NULL;
        struct ip_sf_list *src;

        for (src = pmc->sources; src; src = src->sf_next) {
                if (src->sf_inaddr == *psfsrc)
                        break;
                last = src;
        }

        if (!src) {
                src = kzalloc(sizeof(*src), GFP_ATOMIC);
                if (!src)
                        return -ENOBUFS;
                src->sf_inaddr = *psfsrc;

                /* link at the tail, or as the head of an empty list */
                if (last)
                        last->sf_next = src;
                else
                        pmc->sources = src;
        }

        /* first reference for this mode: let routing know */
        if (++src->sf_count[sfmode] == 1)
                ip_rt_multicast_event(pmc->interface);

        return 0;
}

#ifdef CONFIG_IP_MULTICAST
/* Record in sf_oldin, for every source of @pmc, whether it currently counts
 * as active:
 *  - while the group has any EXCLUDE count: the source's EXCLUDE count
 *    equals the group's and it has no INCLUDE count;
 *  - otherwise: the source has a non-zero INCLUDE count.
 */
static void sf_markstate(struct ip_mc_list *pmc)
{
        int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
        struct ip_sf_list *src;

        for (src = pmc->sources; src; src = src->sf_next) {
                if (!pmc->sfcount[MCAST_EXCLUDE]) {
                        src->sf_oldin = src->sf_count[MCAST_INCLUDE] != 0;
                        continue;
                }
                src->sf_oldin = mca_xcount == src->sf_count[MCAST_EXCLUDE] &&
                                !src->sf_count[MCAST_INCLUDE];
        }
}

/* Compare each source's current activity with the sf_oldin value and
 * schedule change records for those that flipped:
 *  - newly active: drop a matching tomb entry and set the source's
 *    retransmit count to the device's mr_qrv;
 *  - newly inactive: clear the source's retransmit count and add or refresh
 *    a tomb entry carrying mr_qrv.
 * Returns the number of sources handled this way; a newly inactive source
 * whose tomb entry cannot be allocated is not counted.
 */
static int sf_setstate(struct ip_mc_list *pmc)
{
        int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
        int qrv = pmc->interface->mr_qrv;
        struct ip_sf_list *src;
        int changed = 0;

        for (src = pmc->sources; src; src = src->sf_next) {
                struct ip_sf_list *dead, *dead_prev = NULL;
                int new_in;

                if (pmc->sfcount[MCAST_EXCLUDE])
                        new_in = mca_xcount == src->sf_count[MCAST_EXCLUDE] &&
                                 !src->sf_count[MCAST_INCLUDE];
                else
                        new_in = src->sf_count[MCAST_INCLUDE] != 0;

                /* state did not flip: nothing to report */
                if (!!new_in == !!src->sf_oldin)
                        continue;

                /* both transitions need the tomb entry for this address */
                for (dead = pmc->tomb; dead; dead = dead->sf_next) {
                        if (dead->sf_inaddr == src->sf_inaddr)
                                break;
                        dead_prev = dead;
                }

                if (new_in) {
                        if (dead) {
                                if (dead_prev)
                                        dead_prev->sf_next = dead->sf_next;
                                else
                                        pmc->tomb = dead->sf_next;
                                kfree(dead);
                        }
                        src->sf_crcount = qrv;
                } else {
                        src->sf_crcount = 0;
                        /*
                         * add or update "delete" records if an active filter
                         * is now inactive
                         */
                        if (!dead) {
                                dead = kmalloc(sizeof(*dead), GFP_ATOMIC);
                                if (!dead)
                                        continue;
                                *dead = *src;
                                /* pmc->lock held by callers */
                                dead->sf_next = pmc->tomb;
                                pmc->tomb = dead;
                        }
                        dead->sf_crcount = qrv;
                }
                changed++;
        }
        return changed;
}
#endif

/*
 * Add multicast source filter list to the interface list
 *
 * @in_dev:  device whose group list is searched; NULL yields -ENODEV
 * @pmca:    points to the group address
 * @sfmode:  filter mode index (used for pmc->sfcount[] and sf_count[])
 * @sfcount: number of addresses in @psfsrc
 * @psfsrc:  source addresses to add (not touched when @sfcount is 0)
 * @delta:   when zero, the group-wide pmc->sfcount[@sfmode] is incremented
 *           as well; when non-zero only the per-source counts change
 *
 * Returns 0 on success, -ENODEV / -ESRCH when the device or group is
 * missing, or the error from ip_mc_add1_src(); in the latter case the
 * sources added so far and the group-wide count are rolled back.
 */
static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
                         int sfcount, __be32 *psfsrc, int delta)
{
        struct ip_mc_list *pmc;
        int        isexclude;
        int        i, err;

        if (!in_dev)
                return -ENODEV;
        rcu_read_lock();
        for_each_pmc_rcu(in_dev, pmc) {
                if (*pmca == pmc->multiaddr)
                        break;
        }
        if (!pmc) {
                /* MCA not found?? bug */
                rcu_read_unlock();
                return -ESRCH;
        }
        /* take the per-group lock before leaving the RCU read section */
        spin_lock_bh(&pmc->lock);
        rcu_read_unlock();

#ifdef CONFIG_IP_MULTICAST
        /* snapshot the current per-source state for sf_setstate() below */
        sf_markstate(pmc);
#endif
        /* remember the mode before the counts change */
        isexclude = pmc->sfmode == MCAST_EXCLUDE;
        if (!delta)
                pmc->sfcount[sfmode]++;
        err = 0;
        for (i = 0; i < sfcount; i++) {
                err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i]);
                if (err)
                        break;
        }
        if (err) {
                int j;

                /* undo: group-wide count first, then the i sources added */
                if (!delta)
                        pmc->sfcount[sfmode]--;
                for (j = 0; j < i; j++)
                        (void) ip_mc_del1_src(pmc, sfmode, &psfsrc[j]);
        } else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
                struct ip_sf_list *psf;
                struct net *net = dev_net(pmc->interface->dev);
                in_dev = pmc->interface;
#endif

                /* filter mode change */
                if (pmc->sfcount[MCAST_EXCLUDE])
                        pmc->sfmode = MCAST_EXCLUDE;
                else if (pmc->sfcount[MCAST_INCLUDE])
                        pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
                /* else no filters; keep old mode for reports */

                /* restart the change-report counters for the new mode */
                pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
                WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
                for (psf = pmc->sources; psf; psf = psf->sf_next)
                        psf->sf_crcount = 0;
                igmp_ifc_event(in_dev);
        } else if (sf_setstate(pmc)) {
                /* no mode change, but some source changed state */
                igmp_ifc_event(in_dev);
#endif
        }
        spin_unlock_bh(&pmc->lock);
        return err;
}

/* Detach both source lists (tomb and active) from @pmc and reset its filter
 * state to EXCLUDE with counts {INCLUDE: 0, EXCLUDE: 1}. The lists are
 * detached under pmc->lock and freed after the lock is dropped.
 */
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
        struct ip_sf_list *old_tomb;
        struct ip_sf_list *old_sources;

        spin_lock_bh(&pmc->lock);

        old_tomb = pmc->tomb;
        pmc->tomb = NULL;

        old_sources = pmc->sources;
        pmc->sources = NULL;

        pmc->sfmode = MCAST_EXCLUDE;
        pmc->sfcount[MCAST_INCLUDE] = 0;
        pmc->sfcount[MCAST_EXCLUDE] = 1;

        spin_unlock_bh(&pmc->lock);

        ip_sf_list_clear_all(old_tomb);
        ip_sf_list_clear_all(old_sources);
}

/* Join a multicast group
 */
static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
                              unsigned int mode)
{
        __be32 addr = imr->imr_multiaddr.s_addr;
        struct ip_mc_socklist *iml, *i;
        struct in_device *in_dev;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        int ifindex;
        int count = 0;
        int err;

        ASSERT_RTNL();

        if (!ipv4_is_multicast(addr))
                return -EINVAL;

        in_dev = ip_mc_find_dev(net, imr);

        if (!in_dev) {
                err = -ENODEV;
                goto done;
        }

        err = -EADDRINUSE;
        ifindex = imr->imr_ifindex;
        for_each_pmc_rtnl(inet, i) {
                if (i->multi.imr_multiaddr.s_addr == addr &&
                    i->multi.imr_ifindex == ifindex)
                        goto done;
                count++;
        }
        err = -ENOBUFS;
        if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships))
                goto done;
        iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
        if (!iml)
                goto done;

        memcpy(&iml->multi, imr, sizeof(*imr));
        iml->next_rcu = inet->mc_list;
        iml->sflist = NULL;
        iml->sfmode = mode;
        rcu_assign_pointer(inet->mc_list, iml);
        ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
        err = 0;
done:
        return err;
}

/* Join ASM (Any-Source Multicast) group
 *
 * Thin wrapper: joins with filter mode MCAST_EXCLUDE and returns whatever
 * __ip_mc_join_group() returns.
 */
int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
{
        return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
}
EXPORT_SYMBOL(ip_mc_join_group);

/* Join SSM (Source-Specific Multicast) group
 *
 * Thin wrapper: passes the caller-chosen filter @mode straight through to
 * __ip_mc_join_group() and returns its result.
 */
int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
                         unsigned int mode)
{
        return __ip_mc_join_group(sk, imr, mode);
}

static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
                           struct in_device *in_dev)
{
        struct ip_sf_socklist *psf = rtnl_dereference(iml->sflist);
        int err;

        if (!psf) {
                /* any-source empty exclude case */
                return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
                        iml->sfmode, 0, NULL, 0);
        }
        err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
                        iml->sfmode, psf->sl_count, psf->sl_addr, 0);
        RCU_INIT_POINTER(iml->sflist, NULL);
        /* decrease mem now to avoid the memleak warning */
        atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
        kfree_rcu(psf, rcu);
        return err;
}

int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ip_mc_socklist *iml;
        struct ip_mc_socklist __rcu **imlp;
        struct in_device *in_dev;
        struct net *net = sock_net(sk);
        __be32 group = imr->imr_multiaddr.s_addr;
        u32 ifindex;
        int ret = -EADDRNOTAVAIL;

        ASSERT_RTNL();

        in_dev = ip_mc_find_dev(net, imr);
        if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
                ret = -ENODEV;
                goto out;
        }
        ifindex = imr->imr_ifindex;
        for (imlp = &inet->mc_list;
             (iml = rtnl_dereference(*imlp)) != NULL;
             imlp = &iml->next_rcu) {
                if (iml->multi.imr_multiaddr.s_addr != group)
                        continue;
                if (ifindex) {
                        if (iml->multi.imr_ifindex != ifindex)
                                continue;
                } else if (imr->imr_address.s_addr && imr->imr_address.s_addr !=
                                iml->multi.imr_address.s_addr)
                        continue;

                (void) ip_mc_leave_src(sk, iml, in_dev);

                *imlp = iml->next_rcu;

                if (in_dev)
                        ip_mc_dec_group(in_dev, group);

                /* decrease mem now to avoid the memleak warning */
                atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
                kfree_rcu(iml, rcu);
                return 0;
        }
out:
        return ret;
}
EXPORT_SYMBOL(ip_mc_leave_group);

/*
 * Add or remove a single source address in a socket's source filter for a
 * group the socket has already joined.
 *
 * @add:     non-zero to add mreqs->imr_sourceaddr, zero to remove it
 * @omode:   filter mode the caller is operating in (MCAST_INCLUDE or
 *           MCAST_EXCLUDE)
 * @sk:      socket owning the membership
 * @mreqs:   group address, interface address and source address
 * @ifindex: interface index, or 0 to resolve via address / route
 *
 * Returns 0 on success, otherwise:
 *   -EINVAL         not a multicast address, no prior join, or @omode
 *                   differs from the mode of an existing source list
 *   -ENODEV         no device could be resolved
 *   -EADDRNOTAVAIL  removing a source that is not listed, or adding one
 *                   that already is
 *   -ENOBUFS        igmp_max_msf reached or allocation failed
 * Removing the last source of an INCLUDE filter leaves the group; in that
 * case the result of ip_mc_leave_group() is returned.
 */
int ip_mc_source(int add, int omode, struct sock *sk, struct
        ip_mreq_source *mreqs, int ifindex)
{
        int err;
        struct ip_mreqn imr;
        __be32 addr = mreqs->imr_multiaddr;
        struct ip_mc_socklist *pmc;
        struct in_device *in_dev = NULL;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_sf_socklist *psl;
        struct net *net = sock_net(sk);
        int leavegroup = 0;
        int i, j, rv;

        if (!ipv4_is_multicast(addr))
                return -EINVAL;

        ASSERT_RTNL();

        imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
        imr.imr_address.s_addr = mreqs->imr_interface;
        imr.imr_ifindex = ifindex;
        in_dev = ip_mc_find_dev(net, &imr);

        if (!in_dev) {
                err = -ENODEV;
                goto done;
        }
        /* default result for the "goto done" paths below that do not set
         * err themselves
         */
        err = -EADDRNOTAVAIL;

        for_each_pmc_rtnl(inet, pmc) {
                if ((pmc->multi.imr_multiaddr.s_addr ==
                     imr.imr_multiaddr.s_addr) &&
                    (pmc->multi.imr_ifindex == imr.imr_ifindex))
                        break;
        }
        if (!pmc) {                /* must have a prior join */
                err = -EINVAL;
                goto done;
        }
        /* if a source filter was set, must be the same mode as before */
        if (pmc->sflist) {
                if (pmc->sfmode != omode) {
                        err = -EINVAL;
                        goto done;
                }
        } else if (pmc->sfmode != omode) {
                /* allow mode switches for empty-set filters */
                /* the new mode is added before the old one is removed */
                ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
                ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0,
                        NULL, 0);
                pmc->sfmode = omode;
        }

        psl = rtnl_dereference(pmc->sflist);
        if (!add) {
                if (!psl)
                        goto done;        /* err = -EADDRNOTAVAIL */
                /* rv stays non-zero if the list is empty or has no match */
                rv = !0;
                for (i = 0; i < psl->sl_count; i++) {
                        rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
                                sizeof(__be32));
                        if (rv == 0)
                                break;
                }
                if (rv)                /* source not found */
                        goto done;        /* err = -EADDRNOTAVAIL */

                /* special case - (INCLUDE, empty) == LEAVE_GROUP */
                if (psl->sl_count == 1 && omode == MCAST_INCLUDE) {
                        leavegroup = 1;
                        goto done;
                }

                /* update the interface filter */
                ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
                        &mreqs->imr_sourceaddr, 1);

                /* close the gap left by entry i */
                for (j = i+1; j < psl->sl_count; j++)
                        psl->sl_addr[j-1] = psl->sl_addr[j];
                psl->sl_count--;
                err = 0;
                goto done;
        }
        /* else, add a new source to the filter */

        if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
                err = -ENOBUFS;
                goto done;
        }
        /* no list yet, or the list is full: grow it by IP_SFBLOCK slots */
        if (!psl || psl->sl_count == psl->sl_max) {
                struct ip_sf_socklist *newpsl;
                int count = IP_SFBLOCK;

                if (psl)
                        count += psl->sl_max;
                newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
                                      GFP_KERNEL);
                if (!newpsl) {
                        err = -ENOBUFS;
                        goto done;
                }
                newpsl->sl_max = count;
                /* equals the old sl_max, i.e. the old sl_count (list was
                 * full), or 0 when there was no list
                 */
                newpsl->sl_count = count - IP_SFBLOCK;
                if (psl) {
                        for (i = 0; i < psl->sl_count; i++)
                                newpsl->sl_addr[i] = psl->sl_addr[i];
                        /* decrease mem now to avoid the memleak warning */
                        atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
                                   &sk->sk_omem_alloc);
                }
                rcu_assign_pointer(pmc->sflist, newpsl);
                if (psl)
                        kfree_rcu(psl, rcu);
                psl = newpsl;
        }
        rv = 1;        /* > 0 for insert logic below if sl_count is 0 */
        /* The scan stops only on an exact match, so when the address is new
         * i ends up equal to sl_count and the entry is placed at the end.
         */
        for (i = 0; i < psl->sl_count; i++) {
                rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
                        sizeof(__be32));
                if (rv == 0)
                        break;
        }
        if (rv == 0)                /* address already there is an error */
                goto done;        /* err is still -EADDRNOTAVAIL here */
        for (j = psl->sl_count-1; j >= i; j--)
                psl->sl_addr[j+1] = psl->sl_addr[j];
        psl->sl_addr[i] = mreqs->imr_sourceaddr;
        psl->sl_count++;
        err = 0;
        /* update the interface list */
        ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1,
                &mreqs->imr_sourceaddr, 1);
done:
        if (leavegroup)
                err = ip_mc_leave_group(sk, &imr);
        return err;
}

/*
 * Replace a socket's whole source filter for an already joined group with
 * the mode and source list given in @msf.
 *
 * @sk:      socket owning the membership
 * @msf:     group, interface address, filter mode and source list
 * @ifindex: interface index, or 0 to resolve via address / route
 *
 * Returns 0 on success, otherwise:
 *   -EINVAL   not a multicast address, invalid filter mode, or no prior join
 *   -ENODEV   no device could be resolved
 *   -ENOBUFS  the new source list could not be allocated
 *   or the error from ip_mc_add_src() when installing a non-empty list.
 * An INCLUDE filter with zero sources leaves the group; in that case the
 * result of ip_mc_leave_group() is returned.
 */
int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
{
        int err = 0;
        struct ip_mreqn        imr;
        __be32 addr = msf->imsf_multiaddr;
        struct ip_mc_socklist *pmc;
        struct in_device *in_dev;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_sf_socklist *newpsl, *psl;
        struct net *net = sock_net(sk);
        int leavegroup = 0;

        if (!ipv4_is_multicast(addr))
                return -EINVAL;
        if (msf->imsf_fmode != MCAST_INCLUDE &&
            msf->imsf_fmode != MCAST_EXCLUDE)
                return -EINVAL;

        ASSERT_RTNL();

        imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
        imr.imr_address.s_addr = msf->imsf_interface;
        imr.imr_ifindex = ifindex;
        in_dev = ip_mc_find_dev(net, &imr);

        if (!in_dev) {
                err = -ENODEV;
                goto done;
        }

        /* special case - (INCLUDE, empty) == LEAVE_GROUP */
        if (msf->imsf_fmode == MCAST_INCLUDE && msf->imsf_numsrc == 0) {
                leavegroup = 1;
                goto done;
        }

        for_each_pmc_rtnl(inet, pmc) {
                if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
                    pmc->multi.imr_ifindex == imr.imr_ifindex)
                        break;
        }
        if (!pmc) {                /* must have a prior join */
                err = -EINVAL;
                goto done;
        }
        /* Install the new filter on the interface first; the old one is
         * removed only after that succeeded.
         */
        if (msf->imsf_numsrc) {
                newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
                                                      msf->imsf_numsrc),
                                      GFP_KERNEL);
                if (!newpsl) {
                        err = -ENOBUFS;
                        goto done;
                }
                newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
                memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
                       flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
                err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
                        msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
                if (err) {
                        /* nothing was installed: the old filter stays */
                        sock_kfree_s(sk, newpsl,
                                     struct_size(newpsl, sl_addr,
                                                 newpsl->sl_max));
                        goto done;
                }
        } else {
                /* empty source list (only reachable in EXCLUDE mode) */
                newpsl = NULL;
                (void) ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
                                     msf->imsf_fmode, 0, NULL, 0);
        }
        /* now drop the interface-level state of the previous filter */
        psl = rtnl_dereference(pmc->sflist);
        if (psl) {
                (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
                        psl->sl_count, psl->sl_addr, 0);
                /* decrease mem now to avoid the memleak warning */
                atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
                           &sk->sk_omem_alloc);
        } else {
                (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
                        0, NULL, 0);
        }
        /* publish the new list, then free the old one after a grace period */
        rcu_assign_pointer(pmc->sflist, newpsl);
        if (psl)
                kfree_rcu(psl, rcu);
        pmc->sfmode = msf->imsf_fmode;
        err = 0;
done:
        if (leavegroup)
                err = ip_mc_leave_group(sk, &imr);
        return err;
}
/*
 * Report a socket's source filter for an already joined group.
 *
 * On entry msf->imsf_numsrc is the number of source slots the caller
 * provides. On success msf->imsf_fmode holds the filter mode,
 * msf->imsf_numsrc the total number of sources in the filter, the header
 * plus at most the provided number of sources is copied to @optval, and the
 * size of what was copied is written to @optlen.
 *
 * Returns 0, -EINVAL (not a multicast address), -ENODEV (no device),
 * -EADDRNOTAVAIL (no prior join) or -EFAULT (copy to @optval/@optlen
 * failed).
 */
int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
                 sockptr_t optval, sockptr_t optlen)
{
        __be32 addr = msf->imsf_multiaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        int len, count, copycount, msf_size;
        struct ip_mc_socklist *pmc;
        struct ip_sf_socklist *psl;
        struct in_device *in_dev;
        struct ip_mreqn imr;

        ASSERT_RTNL();

        if (!ipv4_is_multicast(addr))
                return -EINVAL;

        imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
        imr.imr_address.s_addr = msf->imsf_interface;
        imr.imr_ifindex = 0;
        in_dev = ip_mc_find_dev(net, &imr);
        if (!in_dev)
                return -ENODEV;

        for_each_pmc_rtnl(inet, pmc) {
                if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
                    pmc->multi.imr_ifindex == imr.imr_ifindex)
                        break;
        }
        if (!pmc)                /* must have a prior join */
                return -EADDRNOTAVAIL;

        msf->imsf_fmode = pmc->sfmode;
        psl = rtnl_dereference(pmc->sflist);
        count = psl ? psl->sl_count : 0;

        /* copy no more than the caller has room for */
        copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
        len = flex_array_size(psl, sl_addr, copycount);
        msf->imsf_numsrc = count;
        msf_size = IP_MSFILTER_SIZE(copycount);

        if (copy_to_sockptr(optlen, &msf_size, sizeof(int)))
                return -EFAULT;
        if (copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0)))
                return -EFAULT;

        /* len is zero when there is no list, so psl is not touched then */
        if (len &&
            copy_to_sockptr_offset(optval,
                                   offsetof(struct ip_msfilter, imsf_slist_flex),
                                   psl->sl_addr, len))
                return -EFAULT;

        return 0;
}

/*
 * Copy the socket's source filter for one multicast group to user space
 * (struct group_filter layout, one sockaddr_storage per source).
 *
 * @sk:        socket whose membership list is searched
 * @gsf:       request; gf_group/gf_interface select the membership and
 *             gf_numsrc caps how many sources are copied.  gf_fmode and
 *             gf_numsrc (total sources held) are overwritten.
 * @optval:    destination buffer
 * @ss_offset: byte offset in @optval where the first source is written
 *
 * Must be called with RTNL held.
 *
 * Returns 0 on success, -EINVAL for a non-AF_INET or non-multicast group,
 * -EADDRNOTAVAIL when the socket has not joined the group, or -EFAULT when
 * a copy to @optval fails.
 */
int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
                 sockptr_t optval, size_t ss_offset)
{
        struct sockaddr_in *group = (struct sockaddr_in *)&gsf->gf_group;
        struct inet_sock *inet = inet_sk(sk);
        struct ip_mc_socklist *pmc;
        struct ip_sf_socklist *psl;
        struct sockaddr_storage ss;
        struct sockaddr_in *src = (struct sockaddr_in *)&ss;
        int idx, count, copycount;
        __be32 addr;

        ASSERT_RTNL();

        if (group->sin_family != AF_INET)
                return -EINVAL;
        addr = group->sin_addr.s_addr;
        if (!ipv4_is_multicast(addr))
                return -EINVAL;

        for_each_pmc_rtnl(inet, pmc) {
                if (pmc->multi.imr_multiaddr.s_addr == addr &&
                    pmc->multi.imr_ifindex == gsf->gf_interface)
                        break;
        }
        /* A filter can only be reported for a group that was joined. */
        if (!pmc)
                return -EADDRNOTAVAIL;

        gsf->gf_fmode = pmc->sfmode;
        psl = rtnl_dereference(pmc->sflist);
        count = 0;
        if (psl)
                count = psl->sl_count;

        /* Copy no more sources than the caller made room for. */
        if (count < gsf->gf_numsrc)
                copycount = count;
        else
                copycount = gsf->gf_numsrc;
        gsf->gf_numsrc = count;

        for (idx = 0; idx < copycount; idx++) {
                /* Zero the whole storage so no stale stack bytes are copied out. */
                memset(&ss, 0, sizeof(ss));
                src->sin_family = AF_INET;
                src->sin_addr.s_addr = psl->sl_addr[idx];
                if (copy_to_sockptr_offset(optval, ss_offset,
                                           &ss, sizeof(ss)))
                        return -EFAULT;
                ss_offset += sizeof(ss);
        }
        return 0;
}

/*
 * Decide whether the socket's multicast source filter lets a packet with
 * the given <src,dst,intf> through.
 *
 * @sk:       receiving socket
 * @loc_addr: destination (group) address of the packet
 * @rmt_addr: source address of the packet
 * @dif:      incoming interface index
 * @sdif:     secondary interface index, 0 if none
 *
 * Returns 1 to deliver, 0 to drop.  Non-multicast destinations are always
 * allowed.  Without a matching membership the result is the socket's MC_ALL
 * flag; with a membership but no source list only EXCLUDE mode allows.
 */
int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
                   int dif, int sdif)
{
        const struct inet_sock *inet = inet_sk(sk);
        struct ip_mc_socklist *pmc;
        struct ip_sf_socklist *psl;
        int listed;
        int allow;
        int i;

        if (!ipv4_is_multicast(loc_addr))
                return 1;

        rcu_read_lock();
        for_each_pmc_rcu(inet, pmc) {
                if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
                    (pmc->multi.imr_ifindex == dif ||
                     (sdif && pmc->multi.imr_ifindex == sdif)))
                        break;
        }
        if (!pmc) {
                /* Not a member: fall back to the per-socket MC_ALL setting. */
                allow = inet_test_bit(MC_ALL, sk);
                goto unlock;
        }

        psl = rcu_dereference(pmc->sflist);
        if (!psl) {
                /* Empty source list: EXCLUDE{} admits all, INCLUDE{} admits none. */
                allow = (pmc->sfmode == MCAST_EXCLUDE);
                goto unlock;
        }

        for (i = 0; i < psl->sl_count; i++) {
                if (psl->sl_addr[i] == rmt_addr)
                        break;
        }
        listed = i < psl->sl_count;

        if (pmc->sfmode == MCAST_INCLUDE && !listed)
                allow = 0;
        else if (pmc->sfmode == MCAST_EXCLUDE && listed)
                allow = 0;
        else
                allow = 1;
unlock:
        rcu_read_unlock();
        return allow;
}

/*
 *        A socket is closing: release every multicast membership it holds.
 *
 *        Takes RTNL itself.  Each entry is unlinked from the head of
 *        inet->mc_list, its source filter state is dropped via
 *        ip_mc_leave_src(), the group reference on the device (if the device
 *        can still be found by ifindex) is dropped, and the entry is freed
 *        with kfree_rcu().
 */

void ip_mc_drop_socket(struct sock *sk)
{
        struct inet_sock *inet = inet_sk(sk);
        struct net *net = sock_net(sk);
        struct ip_mc_socklist *entry;

        /* Nothing joined, so skip taking RTNL altogether. */
        if (!inet->mc_list)
                return;

        rtnl_lock();
        for (;;) {
                struct in_device *in_dev;

                entry = rtnl_dereference(inet->mc_list);
                if (entry == NULL)
                        break;

                inet->mc_list = entry->next_rcu;
                in_dev = inetdev_by_index(net, entry->multi.imr_ifindex);
                (void) ip_mc_leave_src(sk, entry, in_dev);
                if (in_dev)
                        ip_mc_dec_group(in_dev, entry->multi.imr_multiaddr.s_addr);
                /* decrease mem now to avoid the memleak warning */
                atomic_sub(sizeof(*entry), &sk->sk_omem_alloc);
                kfree_rcu(entry, rcu);
        }
        rtnl_unlock();
}

/*
 * Check whether @in_dev has joined group @mc_addr and, for non-IGMP traffic,
 * whether its per-group source state admits @src_addr.
 *
 * Called with rcu_read_lock() held.
 *
 * Returns 1 when the packet should be accepted, 0 otherwise:
 *  - group not joined on the device: 0
 *  - @proto is IPPROTO_IGMP, or @src_addr is 0 (unspecified): 1
 *  - otherwise decided from the include/exclude counters under im->lock
 */
int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u8 proto)
{
        struct ip_mc_list __rcu **mc_hash;
        struct ip_mc_list *im;
        struct ip_sf_list *psf;
        int rv;

        /* Use the hash table when the device has one, else walk the list. */
        mc_hash = rcu_dereference(in_dev->mc_hash);
        if (mc_hash) {
                u32 hash = hash_32((__force u32)mc_addr, MC_HASH_SZ_LOG);

                im = rcu_dereference(mc_hash[hash]);
                while (im != NULL && im->multiaddr != mc_addr)
                        im = rcu_dereference(im->next_hash);
        } else {
                for_each_pmc_rcu(in_dev, im) {
                        if (im->multiaddr == mc_addr)
                                break;
                }
        }

        if (!im)
                return 0;
        if (proto == IPPROTO_IGMP)
                return 1;
        if (!src_addr)
                return 1; /* unspecified source; tentatively allow */

        spin_lock_bh(&im->lock);
        psf = im->sources;
        while (psf && psf->sf_inaddr != src_addr)
                psf = psf->sf_next;
        if (psf)
                rv = psf->sf_count[MCAST_INCLUDE] ||
                        psf->sf_count[MCAST_EXCLUDE] !=
                        im->sfcount[MCAST_EXCLUDE];
        else
                rv = im->sfcount[MCAST_EXCLUDE] != 0;
        spin_unlock_bh(&im->lock);

        return rv;
}

#if defined(CONFIG_PROC_FS)
/*
 * Iterator state for the "igmp" proc file (see igmp_mc_seq_ops).
 *
 * @p:      per-net seq_file private data
 *          (NOTE(review): presumably must stay first for seq_file_net() - confirm)
 * @dev:    net_device currently being walked
 * @in_dev: in_device of @dev while a group on it is being shown, else NULL
 */
struct igmp_mc_iter_state {
        struct seq_net_private p;
        struct net_device *dev;
        struct in_device *in_dev;
};

/* Fetch the iterator state stored in the seq_file's private pointer. */
#define        igmp_mc_seq_private(seq)        ((struct igmp_mc_iter_state *)(seq)->private)

/*
 * Position the iterator on the first multicast group of the first device
 * in the namespace that has one.  Records the owning in_device in the
 * iterator state.  Returns the group, or NULL when no device has any.
 */
static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
{
        struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
        struct net *net = seq_file_net(seq);

        state->in_dev = NULL;
        for_each_netdev_rcu(net, state->dev) {
                struct in_device *in_dev = __in_dev_get_rcu(state->dev);
                struct ip_mc_list *head;

                /* Skip devices without IPv4 state or without any group. */
                if (!in_dev)
                        continue;
                head = rcu_dereference(in_dev->mc_list);
                if (!head)
                        continue;

                state->in_dev = in_dev;
                return head;
        }
        return NULL;
}

/*
 * Advance the iterator past @im: the next group on the same device, or the
 * first group of a following device.  Returns NULL (and clears
 * state->in_dev) once the device list is exhausted.
 */
static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
{
        struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
        struct ip_mc_list *next = rcu_dereference(im->next_rcu);

        while (next == NULL) {
                state->dev = next_net_device_rcu(state->dev);
                if (!state->dev) {
                        state->in_dev = NULL;
                        break;
                }
                state->in_dev = __in_dev_get_rcu(state->dev);
                if (state->in_dev)
                        next = rcu_dereference(state->in_dev->mc_list);
        }
        return next;
}

/*
 * Return the group at zero-based position @pos of the iteration, or NULL
 * when there are fewer than @pos + 1 groups.
 */
static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
{
        struct ip_mc_list *im;

        for (im = igmp_mc_get_first(seq); im && pos; --pos) {
                im = igmp_mc_get_next(seq, im);
                /* Ran out of entries: leave pos non-zero so NULL is returned. */
                if (!im)
                        break;
        }
        if (pos)
                return NULL;
        return im;
}

/*
 * seq_file ->start: enter the RCU read-side section (left again in
 * igmp_mc_seq_stop) and return the header token for position 0, or the
 * group at position *pos - 1 otherwise.
 */
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(rcu)
{
        rcu_read_lock();
        if (!*pos)
                return SEQ_START_TOKEN;
        return igmp_mc_get_idx(seq, *pos - 1);
}

/*
 * seq_file ->next: step from the header token to the first group, or from
 * group @v to its successor, and bump the position.
 */
static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ip_mc_list *im = (v == SEQ_START_TOKEN) ?
                                igmp_mc_get_first(seq) :
                                igmp_mc_get_next(seq, v);

        ++*pos;
        return im;
}

/*
 * seq_file ->stop: forget the cached device pointers and leave the RCU
 * read-side section entered by igmp_mc_seq_start().
 */
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
        __releases(rcu)
{
        struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);

        /* The pointers are not kept across the end of the RCU section. */
        state->dev = NULL;
        state->in_dev = NULL;
        rcu_read_unlock();
}

/*
 * seq_file ->show for the "igmp" proc file.
 *
 * Prints the column header for the start token.  For a group it prints a
 * per-device line first when the group is the head of the device's list,
 * then the group line (address, users, timer state, reporter flag).
 * Always returns 0.
 */
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
{
        struct igmp_mc_iter_state *state;
        struct ip_mc_list *im;
        char *querier;
        long delta;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Idx\tDevice    : Count Querier\tGroup    Users Timer\tReporter\n");
                return 0;
        }

        im = v;
        state = igmp_mc_seq_private(seq);

#ifdef CONFIG_IP_MULTICAST
        if (IGMP_V1_SEEN(state->in_dev))
                querier = "V1";
        else if (IGMP_V2_SEEN(state->in_dev))
                querier = "V2";
        else
                querier = "V3";
#else
        querier = "NONE";
#endif

        /* First group of a device: emit the device summary line. */
        if (rcu_access_pointer(state->in_dev->mc_list) == im) {
                seq_printf(seq, "%d\t%-10s: %5d %7s\n",
                           state->dev->ifindex, state->dev->name,
                           state->in_dev->mc_count, querier);
        }

        /* Remaining timer time is only reported while the timer runs. */
        delta = im->timer.expires - jiffies;
        seq_printf(seq,
                   "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n",
                   im->multiaddr, im->users,
                   im->tm_running,
                   im->tm_running ? jiffies_delta_to_clock_t(delta) : 0,
                   im->reporter);
        return 0;
}

/* seq_file callbacks backing the per-net "igmp" proc entry. */
static const struct seq_operations igmp_mc_seq_ops = {
        .start        =        igmp_mc_seq_start,
        .next        =        igmp_mc_seq_next,
        .stop        =        igmp_mc_seq_stop,
        .show        =        igmp_mc_seq_show,
};

/*
 * Iterator state for the "mcfilter" proc file (see igmp_mcf_seq_ops).
 *
 * @p:    per-net seq_file private data
 *        (NOTE(review): presumably must stay first for seq_file_net() - confirm)
 * @dev:  net_device currently being walked
 * @idev: in_device of @dev while a source on it is being shown, else NULL
 * @im:   group whose ->lock the iterator currently holds, or NULL when no
 *        lock is held
 */
struct igmp_mcf_iter_state {
        struct seq_net_private p;
        struct net_device *dev;
        struct in_device *idev;
        struct ip_mc_list *im;
};

/* Fetch the iterator state stored in the seq_file's private pointer. */
#define igmp_mcf_seq_private(seq)        ((struct igmp_mcf_iter_state *)(seq)->private)

/*
 * Position the iterator on the first source-filter entry.
 *
 * For each device only the head group of idev->mc_list is inspected here.
 * When that group has sources, the function returns its first source WITH
 * im->lock STILL HELD and records the group in state->im; the lock is
 * dropped later by igmp_mcf_get_next() or igmp_mcf_seq_stop().
 *
 * Returns NULL (no lock held, state->im == NULL) when nothing is found.
 */
static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
{
        struct net *net = seq_file_net(seq);
        struct ip_sf_list *psf = NULL;
        struct ip_mc_list *im = NULL;
        struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);

        state->idev = NULL;
        state->im = NULL;
        for_each_netdev_rcu(net, state->dev) {
                struct in_device *idev;
                idev = __in_dev_get_rcu(state->dev);
                if (unlikely(!idev))
                        continue;
                im = rcu_dereference(idev->mc_list);
                if (likely(im)) {
                        spin_lock_bh(&im->lock);
                        psf = im->sources;
                        if (likely(psf)) {
                                /* Found one: leave with im->lock held. */
                                state->im = im;
                                state->idev = idev;
                                break;
                        }
                        /* No sources on this group: release and try the next device. */
                        spin_unlock_bh(&im->lock);
                }
        }
        return psf;
}

/*
 * Advance the iterator past source @psf.
 *
 * Entered with state->im->lock held.  When the current group's source list
 * is exhausted the lock is dropped, the walk moves to the next group
 * (state->im->next) or to the first group of a following device, and the
 * new group's lock is taken before its sources are read.
 *
 * Returns the next source with state->im->lock held, or NULL with no lock
 * held and state->im == NULL once every device has been visited.
 */
static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
{
        struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);

        psf = psf->sf_next;
        while (!psf) {
                /* Done with this group: release it before moving on. */
                spin_unlock_bh(&state->im->lock);
                state->im = state->im->next;
                while (!state->im) {
                        state->dev = next_net_device_rcu(state->dev);
                        if (!state->dev) {
                                /* state->im is NULL here, so ->stop will not unlock. */
                                state->idev = NULL;
                                goto out;
                        }
                        state->idev = __in_dev_get_rcu(state->dev);
                        if (!state->idev)
                                continue;
                        state->im = rcu_dereference(state->idev->mc_list);
                }
                spin_lock_bh(&state->im->lock);
                psf = state->im->sources;
        }
out:
        return psf;
}

/*
 * Return the source entry at zero-based position @pos of the iteration, or
 * NULL when there are fewer than @pos + 1 entries.
 */
static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
{
        struct ip_sf_list *psf;

        for (psf = igmp_mcf_get_first(seq); psf && pos; --pos) {
                psf = igmp_mcf_get_next(seq, psf);
                /* Ran out of entries: leave pos non-zero so NULL is returned. */
                if (!psf)
                        break;
        }
        if (pos)
                return NULL;
        return psf;
}

/*
 * seq_file ->start: enter the RCU read-side section (left again in
 * igmp_mcf_seq_stop) and return the header token for position 0, or the
 * source entry at position *pos - 1 otherwise.
 */
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(rcu)
{
        rcu_read_lock();
        if (!*pos)
                return SEQ_START_TOKEN;
        return igmp_mcf_get_idx(seq, *pos - 1);
}

/*
 * seq_file ->next: step from the header token to the first source entry,
 * or from entry @v to its successor, and bump the position.
 */
static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ip_sf_list *psf = (v == SEQ_START_TOKEN) ?
                                 igmp_mcf_get_first(seq) :
                                 igmp_mcf_get_next(seq, v);

        ++*pos;
        return psf;
}

/*
 * seq_file ->stop: release the group lock the iterator may still hold,
 * clear the cached pointers and leave the RCU read-side section entered by
 * igmp_mcf_seq_start().
 */
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
        __releases(rcu)
{
        struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
        /* state->im is non-NULL exactly when get_first/get_next left its lock held. */
        if (likely(state->im)) {
                spin_unlock_bh(&state->im->lock);
                state->im = NULL;
        }
        state->idev = NULL;
        state->dev = NULL;
        rcu_read_unlock();
}

/*
 * seq_file ->show for the "mcfilter" proc file: prints the column header
 * for the start token, otherwise one line per source entry with device,
 * group address, source address and the include/exclude counters.
 * Always returns 0.
 */
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
{
        struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
        struct ip_sf_list *psf = v;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq, "Idx Device        MCA        SRC    INC    EXC\n");
                return 0;
        }

        seq_printf(seq,
                   "%3d %6.6s 0x%08x "
                   "0x%08x %6lu %6lu\n",
                   state->dev->ifindex, state->dev->name,
                   ntohl(state->im->multiaddr),
                   ntohl(psf->sf_inaddr),
                   psf->sf_count[MCAST_INCLUDE],
                   psf->sf_count[MCAST_EXCLUDE]);
        return 0;
}

/* seq_file callbacks backing the per-net "mcfilter" proc entry. */
static const struct seq_operations igmp_mcf_seq_ops = {
        .start        =        igmp_mcf_seq_start,
        .next        =        igmp_mcf_seq_next,
        .stop        =        igmp_mcf_seq_stop,
        .show        =        igmp_mcf_seq_show,
};

/*
 * Per-namespace setup: create the "igmp" and "mcfilter" proc entries and
 * the autojoin control socket.
 *
 * Returns 0 on success.  Every failure path returns -ENOMEM after undoing
 * what was already created; a socket creation error is logged with its
 * real code but still reported as -ENOMEM.
 */
static int __net_init igmp_net_init(struct net *net)
{
        int err;

        if (!proc_create_net("igmp", 0444, net->proc_net, &igmp_mc_seq_ops,
                        sizeof(struct igmp_mc_iter_state)))
                return -ENOMEM;

        if (!proc_create_net("mcfilter", 0444, net->proc_net,
                        &igmp_mcf_seq_ops, sizeof(struct igmp_mcf_iter_state)))
                goto drop_igmp;

        err = inet_ctl_sock_create(&net->ipv4.mc_autojoin_sk, AF_INET,
                                   SOCK_DGRAM, 0, net);
        if (err >= 0)
                return 0;

        pr_err("Failed to initialize the IGMP autojoin socket (err %d)\n",
               err);
        remove_proc_entry("mcfilter", net->proc_net);
drop_igmp:
        remove_proc_entry("igmp", net->proc_net);
        return -ENOMEM;
}

/*
 * Per-namespace teardown: remove both proc entries, then destroy the
 * autojoin control socket created by igmp_net_init().
 */
static void __net_exit igmp_net_exit(struct net *net)
{
        struct proc_dir_entry *proc_root = net->proc_net;

        remove_proc_entry("mcfilter", proc_root);
        remove_proc_entry("igmp", proc_root);
        inet_ctl_sock_destroy(net->ipv4.mc_autojoin_sk);
}

/* Per-network-namespace init/exit hooks, registered from igmp_mc_init(). */
static struct pernet_operations igmp_net_ops = {
        .init = igmp_net_init,
        .exit = igmp_net_exit,
};
#endif

/*
 * Netdevice notifier callback.  Only NETDEV_RESEND_IGMP is acted on: the
 * device's groups are rejoined if it has IPv4 state.  All other events are
 * ignored.  Always returns NOTIFY_DONE.
 */
static int igmp_netdev_event(struct notifier_block *this,
                             unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_RESEND_IGMP) {
                struct in_device *in_dev = __in_dev_get_rtnl(dev);

                if (in_dev)
                        ip_mc_rejoin_groups(in_dev);
        }
        return NOTIFY_DONE;
}

/* Netdevice notifier block, registered from igmp_mc_init(). */
static struct notifier_block igmp_notifier = {
        .notifier_call = igmp_netdev_event,
};

/*
 * Boot-time IGMP initialisation.
 *
 * With CONFIG_PROC_FS the per-net operations are registered first and then
 * the netdevice notifier; if the notifier fails the per-net registration
 * is rolled back.  Without CONFIG_PROC_FS only the notifier is registered.
 *
 * Returns 0 on success or the error from the failing registration.
 */
int __init igmp_mc_init(void)
{
#if defined(CONFIG_PROC_FS)
        int err = register_pernet_subsys(&igmp_net_ops);

        if (err)
                return err;

        err = register_netdevice_notifier(&igmp_notifier);
        if (err)
                unregister_pernet_subsys(&igmp_net_ops);
        return err;
#else
        return register_netdevice_notifier(&igmp_notifier);
#endif
}





























   22 







   22 
  902 




   22 
  899 



  892 

  899 



  895 

  896 



   22 









   22 

   22 



   22 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * printk_safe.c - Safe printk for printk-deadlock-prone contexts
 */

#include <linux/preempt.h>
#include <linux/kdb.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/kprobes.h>

#include "internal.h"

/*
 * Count of active "force console" sections, i.e. contexts where printk
 * messages are never suppressed.  Non-zero while at least one
 * printk_force_console_enter() has not been matched by its exit.
 */
static atomic_t force_con;

/* Open a force-console section; pair with printk_force_console_exit(). */
void printk_force_console_enter(void)
{
        atomic_inc(&force_con);
}

/* Close a section opened by printk_force_console_enter(). */
void printk_force_console_exit(void)
{
        atomic_dec(&force_con);
}

/* Report whether any force-console section is currently open. */
bool is_printk_force_console(void)
{
        return atomic_read(&force_con) != 0;
}

/*
 * Per-CPU counter incremented by __printk_safe_enter() and decremented by
 * __printk_safe_exit(); is_printk_legacy_deferred() treats a non-zero
 * value as one of the conditions for deferring legacy printing.
 */
static DEFINE_PER_CPU(int, printk_context);

/*
 * Enter a printk-safe section on this CPU (bumps printk_context).
 * Can be preempted by NMI.
 */
void __printk_safe_enter(void)
{
        this_cpu_inc(printk_context);
}

/*
 * Leave a printk-safe section on this CPU (drops printk_context).
 * Can be preempted by NMI.
 */
void __printk_safe_exit(void)
{
        this_cpu_dec(printk_context);
}

/*
 * Enter a deferred-printk section.  Same effect as __printk_safe_enter(),
 * preceded by a cant_migrate() check because the counter is per-CPU.
 */
void __printk_deferred_enter(void)
{
        cant_migrate();
        __printk_safe_enter();
}

/*
 * Leave a deferred-printk section.  Same effect as __printk_safe_exit(),
 * preceded by a cant_migrate() check because the counter is per-CPU.
 */
void __printk_deferred_exit(void)
{
        cant_migrate();
        __printk_safe_exit();
}

bool is_printk_legacy_deferred(void)
{
        /*
         * The per-CPU variable @printk_context can be read safely in any
         * context. CPU migration is always disabled when set.
         *
         * A context holding the printk_cpu_sync must not spin waiting for
         * another CPU. For legacy printing, it could be the console_lock
         * or the port lock.
         */
        return (force_legacy_kthread() ||
                this_cpu_read(printk_context) ||
                in_nmi() ||
                is_printk_cpu_sync_owner());
}

/*
 * printk back end taking a va_list.  With CONFIG_KGDB_KDB, output is
 * redirected to kdb while kdb is trapping printk and no kdb_printf is
 * already in progress; otherwise it goes to vprintk_default().
 * Returns whatever the chosen back end returns.
 */
asmlinkage int vprintk(const char *fmt, va_list args)
{
#ifdef CONFIG_KGDB_KDB
        /* Allow to pass printk() to kdb but avoid a recursion. */
        bool to_kdb = kdb_trap_printk && kdb_printf_cpu < 0;

        if (unlikely(to_kdb))
                return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif
        return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);

















































































































































































































































































































































































































































  338 



  150 
  206 





























  149 
  206 

  149 
  206 





   55 
   97 




























































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/mmap_lock.h>
#include <linux/srcu.h>
#include <linux/interval_tree.h>

struct mmu_notifier_subscriptions;
struct mmu_notifier;
struct mmu_notifier_range;
struct mmu_interval_notifier;

/**
 * enum mmu_notifier_event - reason for the mmu notifier callback
 * @MMU_NOTIFY_UNMAP: either a munmap() that unmaps the range or a mremap()
 * that moves the range
 *
 * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this, like
 * madvise() or replacing a page by another one, ...).
 *
 * @MMU_NOTIFY_PROTECTION_VMA: update is due to a protection change for the
 * range, i.e. using the vma access permission (vm_page_prot) to update the
 * whole range is enough; there is no need to inspect changes to the CPU page
 * table (mprotect() syscall)
 *
 * @MMU_NOTIFY_PROTECTION_PAGE: update is due to a change in the read/write
 * flag for pages in the range, so to mirror those changes the user must
 * inspect the CPU page table (from the end callback).
 *
 * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
 * access flags). The user should soft dirty the page in the end callback to
 * make sure that anyone relying on soft dirtiness catches pages that might be
 * written through non-CPU mappings.
 *
 * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal
 * that the mm refcount is zero and the range is no longer accessible.
 *
 * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal
 * a device driver to possibly ignore the invalidation if the
 * owner field matches the driver's device private pgmap owner.
 *
 * @MMU_NOTIFY_EXCLUSIVE: conversion of a page table entry to device-exclusive.
 * The owner is initialized to the value provided by the caller of
 * make_device_exclusive(), such that this caller can filter out these
 * events.
 */
enum mmu_notifier_event {
        MMU_NOTIFY_UNMAP = 0,
        MMU_NOTIFY_CLEAR,
        MMU_NOTIFY_PROTECTION_VMA,
        MMU_NOTIFY_PROTECTION_PAGE,
        MMU_NOTIFY_SOFT_DIRTY,
        MMU_NOTIFY_RELEASE,
        MMU_NOTIFY_MIGRATE,
        MMU_NOTIFY_EXCLUSIVE,
};

/*
 * NOTE(review): presumably a bit for struct mmu_notifier_range.flags that
 * marks an invalidation whose callbacks are allowed to sleep - confirm
 * against mmu_notifier_range_blockable().
 */
#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

/*
 * Callbacks a secondary-MMU user supplies through struct mmu_notifier.ops.
 * Each member is documented individually below.
 */
struct mmu_notifier_ops {
        /*
         * Called either by mmu_notifier_unregister or when the mm is
         * being destroyed by exit_mmap, always before all pages are
         * freed. This can run concurrently with other mmu notifier
         * methods (the ones invoked outside the mm context) and it
         * should tear down all secondary mmu mappings and freeze the
         * secondary mmu. If this method isn't implemented you have to
         * be sure that nothing could possibly write to the pages
         * through the secondary mmu by the time the last thread with
         * tsk->mm == mm exits.
         *
         * As a side note: the pages freed after ->release returns could
         * be immediately reallocated by the gart at an alias physical
         * address with a different cache model, so if ->release isn't
         * implemented because all _software_ driven memory accesses
         * through the secondary mmu are terminated by the time the
         * last thread of this mm quits, you also have to be sure that
         * speculative _hardware_ operations can't allocate dirty
         * cachelines in the cpu that could not be snooped and made
         * coherent with the other read and write operations happening
         * through the gart alias address, so leading to memory
         * corruption.
         */
        void (*release)(struct mmu_notifier *subscription,
                        struct mm_struct *mm);

        /*
         * clear_flush_young is called after the VM has
         * test-and-cleared the young/accessed bitflag in the
         * pte. This way the VM will provide proper aging to the
         * accesses to the page through the secondary MMUs and not
         * only to the ones through the Linux pte.
         * Start-end is necessary in case the secondary MMU is mapping the page
         * at a smaller granularity than the primary MMU.
         */
        int (*clear_flush_young)(struct mmu_notifier *subscription,
                                 struct mm_struct *mm,
                                 unsigned long start,
                                 unsigned long end);

        /*
         * clear_young is a lightweight version of clear_flush_young. Like the
         * latter, it is supposed to test-and-clear the young/accessed bitflag
         * in the secondary pte, but it may omit flushing the secondary tlb.
         */
        int (*clear_young)(struct mmu_notifier *subscription,
                           struct mm_struct *mm,
                           unsigned long start,
                           unsigned long end);

        /*
         * test_young is called to check the young/accessed bitflag in
         * the secondary pte. This is used to know if the page is
         * frequently used without actually clearing the flag or tearing
         * down the secondary mapping on the page.
         */
        int (*test_young)(struct mmu_notifier *subscription,
                          struct mm_struct *mm,
                          unsigned long address);

        /*
         * invalidate_range_start() and invalidate_range_end() must be
         * paired and are called only when the mmap_lock and/or the
         * locks protecting the reverse maps are held. If the subsystem
         * can't guarantee that no additional references are taken to
         * the pages in the range, it has to implement the
         * invalidate_range() notifier to remove any references taken
         * after invalidate_range_start().
         * NOTE(review): no member named invalidate_range() exists in this
         * struct; the sentence above may be stale - confirm which callback
         * is meant.
         *
         * Invalidation of multiple concurrent ranges may be
         * optionally permitted by the driver. Either way the
         * establishment of sptes is forbidden in the range passed to
         * invalidate_range_start/end for the whole duration of the
         * invalidate_range_start/end critical section.
         *
         * invalidate_range_start() is called when all pages in the
         * range are still mapped and have at least a refcount of one.
         *
         * invalidate_range_end() is called when all pages in the
         * range have been unmapped and the pages have been freed by
         * the VM.
         *
         * The VM will remove the page table entries and potentially
         * the page between invalidate_range_start() and
         * invalidate_range_end(). If the page must not be freed
         * because of pending I/O or other circumstances then the
         * invalidate_range_start() callback (or the initial mapping
         * by the driver) must make sure that the refcount is kept
         * elevated.
         *
         * If the driver increases the refcount when the pages are
         * initially mapped into an address space then either
         * invalidate_range_start() or invalidate_range_end() may
         * decrease the refcount. If the refcount is decreased on
         * invalidate_range_start() then the VM can free pages as page
         * table entries are removed.  If the refcount is only
         * dropped on invalidate_range_end() then the driver itself
         * will drop the last refcount but it must take care to flush
         * any secondary tlb before doing the final free on the
         * page. Pages will no longer be referenced by the linux
         * address space but may still be referenced by sptes until
         * the last refcount is dropped.
         *
         * If the range is not blockable then the callback cannot
         * sleep and has to return with -EAGAIN if sleeping would be required.
         * 0 should be returned otherwise. Please note that notifiers that can
         * fail invalidate_range_start are not allowed to implement
         * invalidate_range_end, as there is no mechanism for informing the
         * notifier that its start failed.
         */
        int (*invalidate_range_start)(struct mmu_notifier *subscription,
                                      const struct mmu_notifier_range *range);
        void (*invalidate_range_end)(struct mmu_notifier *subscription,
                                     const struct mmu_notifier_range *range);

        /*
         * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB
         * which shares page-tables with the CPU. The
         * invalidate_range_start()/end() callbacks should not be implemented as
         * arch_invalidate_secondary_tlbs() already catches the points in time
         * when an external TLB needs to be flushed.
         *
         * This requires arch_invalidate_secondary_tlbs() to be called while
         * holding the ptl spin-lock and therefore this callback is not allowed
         * to sleep.
         *
         * This is called by architecture code whenever invalidating a TLB
         * entry. It is assumed that any secondary TLB has the same rules for
         * when invalidations are required. If this is not the case architecture
         * code will need to call this explicitly when required for secondary
         * TLB invalidation.
         */
        void (*arch_invalidate_secondary_tlbs)(
                                        struct mmu_notifier *subscription,
                                        struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end);

        /*
         * These callbacks are used with the get/put interface to manage the
         * lifetime of the mmu_notifier memory. alloc_notifier() returns a new
         * notifier for use with the mm.
         *
         * free_notifier() is only called after the mmu_notifier has been
         * fully put, calls to any ops callback are prevented and no ops
         * callbacks are currently running. It is called from a SRCU callback
         * and cannot sleep.
         */
        struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
        void (*free_notifier)(struct mmu_notifier *subscription);
};

/*
 * The notifier chains are protected by mmap_lock and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_lock locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_lock is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
        /* Linkage into a notifier chain; see the traversal rules above. */
        struct hlist_node hlist;
        /* Callback table (struct mmu_notifier_ops) supplied by the subscriber. */
        const struct mmu_notifier_ops *ops;
        /* presumably the mm this subscription is attached to — TODO confirm */
        struct mm_struct *mm;
        /*
         * NOTE(review): presumably used to defer the final free;
         * free_notifier() is documented as running from an SRCU callback.
         */
        struct rcu_head rcu;
        /* presumably the get/put reference count — confirm in the .c file */
        unsigned int users;
};

/**
 * struct mmu_interval_notifier_ops
 * @invalidate: Upon return the caller must stop using any SPTEs within this
 *              range. This function can sleep. Return false only if sleeping
 *              was required but mmu_notifier_range_blockable(range) is false.
 */
struct mmu_interval_notifier_ops {
        /*
         * Arguments: the subscription being notified, the range that is
         * being invalidated, and the sequence value that the callback is
         * expected to hand to mmu_interval_set_seq().
         */
        bool (*invalidate)(struct mmu_interval_notifier *interval_sub,
                           const struct mmu_notifier_range *range,
                           unsigned long cur_seq);
};

/* Per-range subscription used with the mmu_interval_*() helpers. */
struct mmu_interval_notifier {
        /* Interval tree node; presumably holds the subscribed VA range. */
        struct interval_tree_node interval_tree;
        /* Callback table providing invalidate(). */
        const struct mmu_interval_notifier_ops *ops;
        /* presumably the mm the range belongs to — TODO confirm */
        struct mm_struct *mm;
        /* NOTE(review): looks like linkage for deferred insert/remove — confirm */
        struct hlist_node deferred_item;
        /*
         * Written by mmu_interval_set_seq() and compared against the
         * caller's sequence by mmu_interval_read_retry() and
         * mmu_interval_check_retry().
         */
        unsigned long invalidate_seq;
};

#ifdef CONFIG_MMU_NOTIFIER

/*
 * Lockdep map passed to lock_map_acquire()/lock_map_release() by
 * mmu_notifier_invalidate_range_start() and its _nonblock variant.
 */
#ifdef CONFIG_LOCKDEP
extern struct lockdep_map __mmu_notifier_invalidate_range_start_map;
#endif

/*
 * Describes one invalidation handed to the range_start/range_end helpers.
 * Filled in by mmu_notifier_range_init() or mmu_notifier_range_init_owner().
 */
struct mmu_notifier_range {
        /* Address space whose notifiers are consulted (mm_has_notifiers()). */
        struct mm_struct *mm;
        /* Bounds of the affected virtual address range. */
        unsigned long start;
        unsigned long end;
        /*
         * MMU_NOTIFIER_RANGE_* bits. MMU_NOTIFIER_RANGE_BLOCKABLE is set by
         * mmu_notifier_invalidate_range_start() and cleared by the
         * _nonblock variant before the notifiers are called.
         */
        unsigned int flags;
        /* Reason for the invalidation. */
        enum mmu_notifier_event event;
        /* Only written by mmu_notifier_range_init_owner(). */
        void *owner;
};

/*
 * mm_has_notifiers - test whether @mm has a notifier_subscriptions object.
 *
 * Returns non-zero when mm->notifier_subscriptions is set. The result is
 * annotated as unlikely for branch prediction.
 */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return unlikely(mm->notifier_subscriptions != NULL);
}

/* Out-of-line worker; mmu_notifier_get() calls it with mmap_lock write-held. */
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm);

/*
 * mmu_notifier_get - locked wrapper around mmu_notifier_get_locked()
 * @ops: callback table identifying the notifier
 * @mm: address space to operate on
 *
 * Takes @mm's mmap_lock for write around the call and returns whatever
 * mmu_notifier_get_locked() returned.
 */
static inline struct mmu_notifier *
mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
{
        struct mmu_notifier *subscription;

        mmap_write_lock(mm);
        subscription = mmu_notifier_get_locked(ops, mm);
        mmap_write_unlock(mm);

        return subscription;
}
/* Counterparts of mmu_notifier_get(); implemented out of line. */
void mmu_notifier_put(struct mmu_notifier *subscription);
void mmu_notifier_synchronize(void);

/*
 * Explicit registration interface.
 * NOTE(review): the __ variant presumably expects the caller to already
 * hold mmap_lock, mirroring mmu_notifier_get_locked() — confirm.
 */
extern int mmu_notifier_register(struct mmu_notifier *subscription,
                                 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *subscription,
                                   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *subscription,
                                    struct mm_struct *mm);

/*
 * Interval notifier interface. The value returned by
 * mmu_interval_read_begin() is the "seq" later passed to
 * mmu_interval_read_retry() / mmu_interval_check_retry().
 */
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub);
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops);
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops);
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub);

/**
 * mmu_interval_set_seq - Save the invalidation sequence
 * @interval_sub: The subscription passed to invalidate
 * @cur_seq: The cur_seq passed to the invalidate() callback
 *
 * This must be called unconditionally from the invalidate callback of a
 * struct mmu_interval_notifier_ops under the same lock that is used to call
 * mmu_interval_read_retry(). It updates the sequence number for later use by
 * mmu_interval_read_retry(). The provided cur_seq will always be odd.
 *
 * If the caller does not call mmu_interval_read_begin() or
 * mmu_interval_read_retry() then this call is not required.
 */
static inline void
mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub,
                     unsigned long cur_seq)
{
        /*
         * WRITE_ONCE because mmu_interval_check_retry() reads this field
         * with READ_ONCE and is allowed to run without the user lock.
         */
        WRITE_ONCE(interval_sub->invalidate_seq, cur_seq);
}

/**
 * mmu_interval_read_retry - End a read side critical section against a VA range
 * @interval_sub: The subscription
 * @seq: The return of the paired mmu_interval_read_begin()
 *
 * This MUST be called under a user provided lock that is also held
 * unconditionally by op->invalidate() when it calls mmu_interval_set_seq().
 *
 * Each call should be paired with a single mmu_interval_read_begin() and
 * should be used to conclude the read side.
 *
 * Returns true if an invalidation collided with this critical section, and
 * the caller should retry.
 */
static inline bool
mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub,
                        unsigned long seq)
{
        /*
         * Plain load: the caller is required to hold the same user lock
         * under which invalidate() calls mmu_interval_set_seq().
         */
        return interval_sub->invalidate_seq != seq;
}

/**
 * mmu_interval_check_retry - Test if a collision has occurred
 * @interval_sub: The subscription
 * @seq: The return of the matching mmu_interval_read_begin()
 *
 * This can be used in the critical section between mmu_interval_read_begin()
 * and mmu_interval_read_retry().  A return of true indicates an invalidation
 * has collided with this critical region and a future
 * mmu_interval_read_retry() will return true.
 *
 * False is not reliable and only suggests a collision may not have
 * occurred. It can be called many times and does not have to hold the user
 * provided lock.
 *
 * This call can be used as part of loops and other expensive operations to
 * expedite a retry.
 */
static inline bool
mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub,
                         unsigned long seq)
{
        /*
         * Lockless peek at the stored sequence, hence READ_ONCE.
         * Pairs with the WRITE_ONCE in mmu_interval_set_seq()
         */
        return READ_ONCE(interval_sub->invalidate_seq) != seq;
}

/*
 * Out-of-line implementations. The __-prefixed functions are reached
 * through the inline wrappers below, which call them only when
 * mm_has_notifiers() reports that the mm has subscriptions.
 */
extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
                                      unsigned long start,
                                      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
                                     unsigned long address);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r);
extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end);
/* Not wrapped here; replaced by a constant-false macro when notifiers are off. */
extern bool
mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);

/*
 * mmu_notifier_range_blockable - may notifiers for @range block?
 *
 * Returns true when MMU_NOTIFIER_RANGE_BLOCKABLE is set in range->flags.
 */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE) != 0;
}

/*
 * mmu_notifier_release - forward to __mmu_notifier_release() when @mm has
 * notifier subscriptions; otherwise do nothing.
 */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_release(mm);
}

/*
 * mmu_notifier_clear_flush_young - ask the notifiers of @mm about @start..@end
 *
 * Returns the result of __mmu_notifier_clear_flush_young(), or 0 when @mm
 * has no notifier subscriptions.
 */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_flush_young(mm, start, end);
}

/*
 * mmu_notifier_clear_young - ask the notifiers of @mm about @start..@end
 *
 * Returns the result of __mmu_notifier_clear_young(), or 0 when @mm has no
 * notifier subscriptions.
 */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_clear_young(mm, start, end);
}

/*
 * mmu_notifier_test_young - query the notifiers of @mm for @address
 *
 * Returns the result of __mmu_notifier_test_young(), or 0 when @mm has no
 * notifier subscriptions.
 */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        if (!mm_has_notifiers(mm))
                return 0;

        return __mmu_notifier_test_young(mm, address);
}

/*
 * mmu_notifier_invalidate_range_start - blocking start-of-invalidation hook
 * @range: the range being invalidated
 *
 * May sleep. When range->mm has notifiers, marks @range as blockable and
 * calls __mmu_notifier_invalidate_range_start(); its return value is not
 * propagated. The whole sequence runs inside the
 * __mmu_notifier_invalidate_range_start_map lockdep annotation.
 */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        might_sleep();

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (!mm_has_notifiers(range->mm))
                goto out_release;

        range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
        __mmu_notifier_invalidate_range_start(range);

out_release:
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

/*
 * This version of mmu_notifier_invalidate_range_start() avoids blocking, but it
 * can return an error if a notifier can't proceed without blocking, in which
 * case you're not allowed to modify PTEs in the specified range.
 *
 * This is mainly intended for OOM handling.
 */
/*
 * Returns 0 when range->mm has no notifiers, otherwise the result of
 * __mmu_notifier_invalidate_range_start() called with
 * MMU_NOTIFIER_RANGE_BLOCKABLE cleared in range->flags.
 */
static inline int __must_check
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        int ret;

        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        if (mm_has_notifiers(range->mm)) {
                /* Tell the callbacks they must not block. */
                range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
                ret = __mmu_notifier_invalidate_range_start(range);
        } else {
                ret = 0;
        }
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);

        return ret;
}

/*
 * mmu_notifier_invalidate_range_end - end-of-invalidation hook
 * @range: the range passed to the matching range_start call
 *
 * Annotates a possible sleep only when @range is blockable, then forwards
 * to __mmu_notifier_invalidate_range_end() if range->mm has notifiers.
 */
static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        if (mmu_notifier_range_blockable(range))
                might_sleep();

        if (!mm_has_notifiers(range->mm))
                return;

        __mmu_notifier_invalidate_range_end(range);
}

/*
 * Forward a secondary-TLB invalidation of @start..@end to
 * __mmu_notifier_arch_invalidate_secondary_tlbs() when @mm has notifier
 * subscriptions; otherwise a no-op.
 */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}

/*
 * Start @mm with no notifier subscriptions, so that mm_has_notifiers()
 * reports false until something sets mm->notifier_subscriptions.
 */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
        mm->notifier_subscriptions = NULL;
}

/*
 * Forward to __mmu_notifier_subscriptions_destroy() when @mm has notifier
 * subscriptions; otherwise do nothing.
 */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        if (!mm_has_notifiers(mm))
                return;

        __mmu_notifier_subscriptions_destroy(mm);
}


/*
 * mmu_notifier_range_init - fill in a struct mmu_notifier_range
 * @range: structure to initialise
 * @event: reason for the invalidation
 * @flags: initial MMU_NOTIFIER_RANGE_* flags
 * @mm: address space the range belongs to
 * @start: start of the range
 * @end: end of the range
 *
 * Note that range->owner is NOT written here; callers that need it must use
 * mmu_notifier_range_init_owner().
 */
static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
                                           enum mmu_notifier_event event,
                                           unsigned int flags,
                                           struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        range->event = event;
        range->mm = mm;
        range->start = start;
        range->end = end;
        range->flags = flags;
}

/*
 * mmu_notifier_range_init_owner - mmu_notifier_range_init() plus an owner
 * @owner: opaque pointer stored in range->owner
 *
 * The remaining arguments are passed straight to mmu_notifier_range_init().
 */
static inline void
mmu_notifier_range_init_owner(struct mmu_notifier_range *range,
                              enum mmu_notifier_event event,
                              unsigned int flags,
                              struct mm_struct *mm,
                              unsigned long start,
                              unsigned long end,
                              void *owner)
{
        /* Common fields first, then the one field the helper leaves alone. */
        mmu_notifier_range_init(range, event, flags, mm, start, end);
        range->owner = owner;
}

/*
 * Calls ptep_clear_flush_young() on the PTE and ORs in the result of
 * mmu_notifier_clear_flush_young() for start = address and
 * end = address + PAGE_SIZE. @__vma and @__address are copied into locals
 * and so evaluated once. The statement expression evaluates to the
 * combined int result.
 */
#define ptep_clear_flush_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_clear_flush_young(___vma, ___address, __ptep);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD-level counterpart: calls pmdp_clear_flush_young() and ORs in the
 * result of mmu_notifier_clear_flush_young() for start = address and
 * end = address + PMD_SIZE. @__vma and @__address are evaluated once.
 */
#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_clear_flush_young(___vma, ___address, __pmdp);        \
        __young |= mmu_notifier_clear_flush_young(___vma->vm_mm,        \
                                                  ___address,                \
                                                  ___address +                \
                                                        PMD_SIZE);        \
        __young;                                                        \
})

/*
 * Calls ptep_test_and_clear_young() on the PTE and ORs in the result of
 * mmu_notifier_clear_young() for start = address and
 * end = address + PAGE_SIZE. @__vma and @__address are evaluated once.
 */
#define ptep_clear_young_notify(__vma, __address, __ptep)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PAGE_SIZE);        \
        __young;                                                        \
})

/*
 * PMD-level counterpart: calls pmdp_test_and_clear_young() and ORs in the
 * result of mmu_notifier_clear_young() for start = address and
 * end = address + PMD_SIZE. @__vma and @__address are evaluated once.
 */
#define pmdp_clear_young_notify(__vma, __address, __pmdp)                \
({                                                                        \
        int __young;                                                        \
        struct vm_area_struct *___vma = __vma;                                \
        unsigned long ___address = __address;                                \
        __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
        __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,        \
                                            ___address + PMD_SIZE);        \
        __young;                                                        \
})

#else /* CONFIG_MMU_NOTIFIER */

/*
 * Reduced range descriptor used when CONFIG_MMU_NOTIFIER is off: only the
 * bounds are kept.
 */
struct mmu_notifier_range {
        unsigned long start;
        unsigned long end;
};

/*
 * _mmu_notifier_range_init - record @start and @end in @range.
 *
 * Backs the mmu_notifier_range_init*() macros in this configuration.
 */
static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
                                            unsigned long start,
                                            unsigned long end)
{
        *range = (struct mmu_notifier_range) {
                .start = start,
                .end = end,
        };
}

/*
 * Without CONFIG_MMU_NOTIFIER both initialisers keep only the bounds; the
 * event, flags, mm and owner arguments are dropped by the preprocessor and
 * are never evaluated.
 */
#define mmu_notifier_range_init(range,event,flags,mm,start,end)  \
        _mmu_notifier_range_init(range, start, end)
#define mmu_notifier_range_init_owner(range, event, flags, mm, start, \
                                        end, owner) \
        _mmu_notifier_range_init(range, start, end)

/* !CONFIG_MMU_NOTIFIER stub: every range is reported as blockable. */
static inline bool
mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
{
        return true;
}

/* !CONFIG_MMU_NOTIFIER stub: no mm ever has notifiers. */
static inline int mm_has_notifiers(struct mm_struct *mm)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: nothing to release. */
static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER stub: always returns 0. */
static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                          unsigned long start,
                                          unsigned long end)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: always returns 0. */
static inline int mmu_notifier_clear_young(struct mm_struct *mm,
                                           unsigned long start,
                                           unsigned long end)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: always returns 0. */
static inline int mmu_notifier_test_young(struct mm_struct *mm,
                                          unsigned long address)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: no notifiers to call, so this is empty. */
static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

/* !CONFIG_MMU_NOTIFIER stub: always succeeds (returns 0). */
static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
        return 0;
}

/* !CONFIG_MMU_NOTIFIER stub: no notifiers to call, so this is empty. */
static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

/* !CONFIG_MMU_NOTIFIER stub: no secondary TLB users, so this is empty. */
static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
{
}

/* !CONFIG_MMU_NOTIFIER stub: there is no subscription state to set up. */
static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm)
{
}

/* !CONFIG_MMU_NOTIFIER stub: there is no subscription state to tear down. */
static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
}

/* No notifiers: the query is a compile-time false and @r is not evaluated. */
#define mmu_notifier_range_update_to_read_only(r) false

/*
 * With notifiers compiled out the *_notify helpers are plain aliases for the
 * underlying page-table primitives.
 */
#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define        ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush

/* !CONFIG_MMU_NOTIFIER stub: nothing to wait for. */
static inline void mmu_notifier_synchronize(void)
{
}

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */


























































































































































































































































































































































































































































































































































































































































  318 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H

#include <linux/spinlock.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
#include <uapi/linux/mempolicy.h>
#include <asm/page.h>

struct notifier_block;

struct bio;

struct pagevec;

/*
 * Swap flag bits. The low 15 bits (SWAP_FLAG_PRIO_MASK) presumably carry
 * the priority value when SWAP_FLAG_PREFER is set — TODO confirm against
 * the swapon() implementation.
 */
#define SWAP_FLAG_PREFER        0x8000        /* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK        0x7fff
#define SWAP_FLAG_DISCARD        0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE        0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */

/* Union of every flag bit defined above. */
#define SWAP_FLAGS_VALID        (SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
                                 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
                                 SWAP_FLAG_DISCARD_PAGES)
/* NOTE(review): batch size; its users are not visible in this header. */
#define SWAP_BATCH 64

/*
 * current_is_kswapd - test whether the current task has PF_KSWAPD set.
 *
 * Returns the masked flag bit itself (non-zero, not normalised to 1) when
 * the flag is set, 0 otherwise.
 */
static inline int current_is_kswapd(void)
{
        return current->flags & PF_KSWAPD;
}

/*
 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
 * be swapped to.  The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
 * the type/offset into the pte as 5/27 as well.
 */
#define MAX_SWAPFILES_SHIFT        5

/*
 * Use some of the swap file numbers for other purposes. This
 * is a convenient way to hook into the VM to trigger special
 * actions on faults.
 */

/*
 * PTE markers are used to persist information onto PTEs that otherwise
 * should be a none pte.  As its name "PTE" hints, it should only be
 * applied to the leaves of pgtables.
 */
/*
 * One type is reserved for PTE markers. Its number comes after the real
 * swap files and after the hwpoison, migration and device types (each of
 * which contributes 0 when its config option is off).
 */
#define SWP_PTE_MARKER_NUM 1
#define SWP_PTE_MARKER     (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
                            SWP_MIGRATION_NUM + SWP_DEVICE_NUM)

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/mm/hmm.rst. Short description is we need struct pages for
 * device memory that is unaddressable (inaccessible) by CPU, so that we can
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_{READ|WRITE} entry.
 *
 * When a page is mapped by the device for exclusive access we set the CPU page
 * table entries to a special SWP_DEVICE_EXCLUSIVE entry.
 */
/*
 * Three consecutive types, numbered directly after the hwpoison and
 * migration types; none are reserved without CONFIG_DEVICE_PRIVATE.
 */
#ifdef CONFIG_DEVICE_PRIVATE
#define SWP_DEVICE_NUM 3
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#define SWP_DEVICE_EXCLUSIVE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+2)
#else
#define SWP_DEVICE_NUM 0
#endif

/*
 * Page migration support.
 *
 * SWP_MIGRATION_READ_EXCLUSIVE is only applicable to anonymous pages and
 * indicates that the referenced (part of an) anonymous page is exclusive to
 * a single process. For SWP_MIGRATION_WRITE, that information is implicit:
 * (parts of) anonymous pages that are mapped writable are exclusive to a
 * single process.
 */
/*
 * Three consecutive types, numbered directly after the hwpoison type; none
 * are reserved without CONFIG_MIGRATION.
 */
#ifdef CONFIG_MIGRATION
#define SWP_MIGRATION_NUM 3
#define SWP_MIGRATION_READ (MAX_SWAPFILES + SWP_HWPOISON_NUM)
#define SWP_MIGRATION_READ_EXCLUSIVE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + SWP_HWPOISON_NUM + 2)
#else
#define SWP_MIGRATION_NUM 0
#endif

/*
 * Handling of hardware poisoned pages with memory corruption.
 */
/*
 * A single type, the first one after the real swap files; not reserved
 * without CONFIG_MEMORY_FAILURE.
 */
#ifdef CONFIG_MEMORY_FAILURE
#define SWP_HWPOISON_NUM 1
#define SWP_HWPOISON                MAX_SWAPFILES
#else
#define SWP_HWPOISON_NUM 0
#endif

/*
 * Number of types left for real swap files: all 1 << MAX_SWAPFILES_SHIFT
 * type values minus those reserved for the device, migration, hwpoison and
 * PTE-marker special entries.
 */
#define MAX_SWAPFILES \
        ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
        SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
        SWP_PTE_MARKER_NUM)

/*
 * Magic header for a swap area. The first part of the union is
 * what the swap magic looks like for the old (limited to 128MB)
 * swap area format, the second part of the union adds - in the
 * old reserved area - some extra information. Note that the first
 * kilobyte is reserved for boot loader or disk label stuff...
 *
 * Having the magic at the end of the PAGE_SIZE makes detecting swap
 * areas somewhat tricky on machines that support multiple page sizes.
 * For 2.5 we'll probably want to move the magic to just beyond the
 * bootbits...
 */
union swap_header {
        /* View 1: the magic string occupies the last 10 bytes of the page. */
        struct {
                char reserved[PAGE_SIZE - 10];
                char magic[10];                        /* SWAP-SPACE or SWAPSPACE2 */
        } magic;
        /* View 2: extra information stored after the 1024-byte boot area. */
        struct {
                char                bootbits[1024];        /* Space for disklabel etc. */
                __u32                version;
                __u32                last_page;
                __u32                nr_badpages;
                unsigned char        sws_uuid[16];
                unsigned char        sws_volume[16];
                __u32                padding[117];
                /*
                 * NOTE(review): declared with one element but apparently
                 * used as the start of a longer list; MAX_SWAP_BADPAGES
                 * counts the slots from here up to magic.magic.
                 */
                __u32                badpages[1];
        } info;
};

/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim
 */
struct reclaim_state {
        /*
         * pages reclaimed outside of LRU-based reclaim; incremented by
         * mm_account_reclaimed_pages()
         */
        unsigned long reclaimed;
#ifdef CONFIG_LRU_GEN
        /* per-thread mm walk data */
        struct lru_gen_mm_walk *mm_walk;
#endif
};

/*
 * mm_account_reclaimed_pages(): account reclaimed pages outside of LRU-based
 * reclaim
 * @pages: number of pages reclaimed
 *
 * If the current process is undergoing a reclaim operation, increment the
 * number of reclaimed pages by @pages.
 */
static inline void mm_account_reclaimed_pages(unsigned long pages)
{
        if (current->reclaim_state)
                current->reclaim_state->reclaimed += pages;
}

#ifdef __KERNEL__

struct address_space;
struct sysinfo;
struct writeback_control;
struct zone;

/*
 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
 * disk blocks.  A rbtree of swap extents maps the entire swapfile (Where the
 * term `swapfile' refers to either a blockdevice or an IS_REG file). Apart
 * from setup, they're handled identically.
 *
 * We always assume that blocks are of size PAGE_SIZE.
 */
struct swap_extent {
        /* Node in the rbtree of extents described above. */
        struct rb_node rb_node;
        /* presumably the first swapfile page covered by this extent */
        pgoff_t start_page;
        /* presumably the number of PAGE_SIZE pages in this extent */
        pgoff_t nr_pages;
        /* presumably the disk block that start_page maps to */
        sector_t start_block;
};

/*
 * Maximum number of bad pages in the new format.
 */
/*
 * Number of int-sized slots between info.badpages and magic.magic in
 * union swap_header.
 */
#define MAX_SWAP_BADPAGES \
        ((offsetof(union swap_header, magic.magic) - \
          offsetof(union swap_header, info.badpages)) / sizeof(int))

enum {
        SWP_USED        = (1 << 0),        /* is slot in swap_info[] used? */
        SWP_WRITEOK        = (1 << 1),        /* ok to write to this swap?        */
        SWP_DISCARDABLE = (1 << 2),        /* blkdev support discard */
        SWP_DISCARDING        = (1 << 3),        /* now discarding a free cluster */
        SWP_SOLIDSTATE        = (1 << 4),        /* blkdev seeks are cheap */
        SWP_CONTINUED        = (1 << 5),        /* swap_map has count continuation */
        SWP_BLKDEV        = (1 << 6),        /* its a block device */
        SWP_ACTIVATED        = (1 << 7),        /* set after swap_activate success */
        SWP_FS_OPS        = (1 << 8),        /* swapfile operations go through fs */
        SWP_AREA_DISCARD = (1 << 9),        /* single-time swap area discards */
        SWP_PAGE_DISCARD = (1 << 10),        /* freed swap page-cluster discards */
        SWP_STABLE_WRITES = (1 << 11),        /* no overwrite PG_writeback pages */
        SWP_SYNCHRONOUS_IO = (1 << 12),        /* synchronous IO is efficient */
                                        /* add others here before... */
};

/*
 * NOTE(review): batch-size constants; their users are not visible in this
 * chunk. SWAP_CLUSTER_MAX_SKIPPED is SWAP_CLUSTER_MAX * 1024 and
 * COMPACT_CLUSTER_MAX is an alias of SWAP_CLUSTER_MAX.
 */
#define SWAP_CLUSTER_MAX 32UL
#define SWAP_CLUSTER_MAX_SKIPPED (SWAP_CLUSTER_MAX << 10)
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX

/* Bit flag in swap_map */
#define SWAP_HAS_CACHE        0x40        /* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED        0x80        /* Flag swap_map continuation for full count */

/*
 * Special value in first swap_map
 * (numerically, SWAP_MAP_SHMEM equals COUNT_CONTINUED | SWAP_MAP_BAD)
 */
#define SWAP_MAP_MAX        0x3e        /* Max count */
#define SWAP_MAP_BAD        0x3f        /* Note page is bad */
#define SWAP_MAP_SHMEM        0xbf        /* Owned by shmem/tmpfs */

/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX        0x7f        /* Max count */

/*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space that is SWAPFILE_CLUSTER pages long and naturally aligned on disk.
 * All free clusters are organized into a list. We fetch an entry from the
 * list to get a free cluster.
 *
 * The flags field determines if a cluster is free. This is
 * protected by cluster lock.
 */
struct swap_cluster_info {
        spinlock_t lock;        /*
                                 * Protect swap_cluster_info fields
                                 * other than list, and swap_info_struct->swap_map
                                 * elements corresponding to the swap cluster.
                                 */
        /* presumably the number of used entries in this cluster — TODO confirm */
        u16 count;
        /* Cluster state; presumably an enum swap_cluster_flags value. */
        u8 flags;
        /* presumably the allocation order this cluster serves — TODO confirm */
        u8 order;
        /* Linkage into one of the cluster lists; not covered by @lock. */
        struct list_head list;
};

/* All on-list clusters must have a non-zero flag. */
enum swap_cluster_flags {
        CLUSTER_FLAG_NONE = 0, /* For temporary off-list cluster */
        CLUSTER_FLAG_FREE,
        CLUSTER_FLAG_NONFULL,
        CLUSTER_FLAG_FRAG,
        /* Clusters with flags above are allocatable */
        /* Alias of the last allocatable state, so FULL continues at FRAG + 1. */
        CLUSTER_FLAG_USABLE = CLUSTER_FLAG_FRAG,
        CLUSTER_FLAG_FULL,
        CLUSTER_FLAG_DISCARD,
        /* Number of states; not a state itself. */
        CLUSTER_FLAG_MAX,
};

/*
 * The first page in the swap file is the swap header, which is always marked
 * bad to prevent it from being allocated as an entry. This also prevents the
 * cluster to which it belongs being marked free. Therefore 0 is safe to use as
 * a sentinel to indicate an entry is not valid.
 */
#define SWAP_ENTRY_INVALID        0

/*
 * Number of allocation orders tracked per swap device: orders 0..PMD_ORDER
 * with CONFIG_THP_SWAP, otherwise only order 0. Sizes the per-order arrays
 * in the structures below.
 */
#ifdef CONFIG_THP_SWAP
#define SWAP_NR_ORDERS                (PMD_ORDER + 1)
#else
#define SWAP_NR_ORDERS                1
#endif

/*
 * We keep using the same cluster for rotational devices so IO will be
 * sequential. The purpose is to optimize swap throughput on these devices.
 */
struct swap_sequential_cluster {
        /* One hint per allocation order, indexed 0..SWAP_NR_ORDERS-1. */
        unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
};

/*
 * The in-memory structure used to track swap areas.
 */
struct swap_info_struct {
        struct percpu_ref users;        /* indicate and keep swap device valid. */
        unsigned long        flags;                /* SWP_USED etc: see above */
        signed short        prio;                /* swap priority of this type */
        struct plist_node list;                /* entry in swap_active_head */
        signed char        type;                /* strange name for an index */
        unsigned int        max;                /* extent of the swap_map */
        unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
        unsigned long *zeromap;                /* kvmalloc'ed bitmap to track zero pages */
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
        struct list_head free_clusters; /* free clusters list */
        struct list_head full_clusters; /* full clusters list */
        struct list_head nonfull_clusters[SWAP_NR_ORDERS];
                                        /* per-order lists of clusters that contain at least one free slot */
        struct list_head frag_clusters[SWAP_NR_ORDERS];
                                        /* per-order lists of clusters that are fragmented or contended */
        atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
                                        /* per-order counters; presumably the length of frag_clusters[] - confirm */
        unsigned int pages;                /* total of usable pages of swap */
        atomic_long_t inuse_pages;        /* number of those currently in use */
        struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
        spinlock_t global_cluster_lock;        /* Serialize usage of global cluster */
        struct rb_root swap_extent_root;/* root of the swap extent rbtree */
        struct block_device *bdev;        /* swap device or bdev of swap file */
        struct file *swap_file;                /* seldom referenced */
        struct completion comp;                /* seldom referenced */
        spinlock_t lock;                /*
                                         * protect map scan related fields like
                                         * swap_map, lowest_bit, highest_bit,
                                         * inuse_pages, cluster_next,
                                         * cluster_nr, lowest_alloc,
                                         * highest_alloc, free/discard cluster
                                         * list. other fields are only changed
                                         * at swapon/swapoff, so are protected
                                         * by swap_lock. changing flags need
                                         * hold this lock and swap_lock. If
                                         * both locks need hold, hold swap_lock
                                         * first.
                                         *
                                         * NOTE(review): lowest_bit,
                                         * highest_bit, cluster_next,
                                         * cluster_nr, lowest_alloc and
                                         * highest_alloc are not members of
                                         * this structure as declared here;
                                         * the list above looks stale.
                                         */
        spinlock_t cont_lock;                /*
                                         * protect swap count continuation page
                                         * list.
                                         */
        struct work_struct discard_work; /* discard worker */
        struct work_struct reclaim_work; /* reclaim worker */
        struct list_head discard_clusters; /* discard clusters list */
        struct plist_node avail_lists[]; /*
                                           * entries in swap_avail_heads, one
                                           * entry per node.
                                           * Must be last as the number of the
                                           * array is nr_node_ids, which is not
                                           * a fixed value so have to allocate
                                           * dynamically.
                                           * And it has to be an array so that
                                           * plist_for_each_* can work.
                                           */
};

/*
 * Return the swap entry for @page: the entry stored in the page's folio
 * (folio->swap) with its value advanced by the page's index in that folio.
 */
static inline swp_entry_t page_swap_entry(struct page *page)
{
        struct folio *owner = page_folio(page);
        swp_entry_t result = owner->swap;

        /* Offset from the folio's entry by this page's position. */
        result.val += folio_page_idx(owner, page);

        return result;
}

/* linux/mm/workingset.c */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
                                bool flush);
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg);
void workingset_refault(struct folio *folio, void *shadow);
void workingset_activation(struct folio *folio);

/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;

/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)


/* linux/mm/swap.c */
void lru_note_cost(struct lruvec *lruvec, bool file,
                   unsigned int nr_io, unsigned int nr_rotated);
void lru_note_cost_refault(struct folio *);
void folio_add_lru(struct folio *);
void folio_add_lru_vma(struct folio *, struct vm_area_struct *);
void mark_page_accessed(struct page *);
void folio_mark_accessed(struct folio *);

extern atomic_t lru_disable_count;

/* True while lru_disable_count reads as non-zero. */
static inline bool lru_cache_disabled(void)
{
        return atomic_read(&lru_disable_count) != 0;
}

/*
 * Decrement lru_disable_count by one.
 * NOTE(review): presumably pairs with an earlier lru_cache_disable() - confirm.
 */
static inline void lru_cache_enable(void)
{
        atomic_dec(&lru_disable_count);
}

extern void lru_cache_disable(void);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);

/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
                                        gfp_t gfp_mask, nodemask_t *mask);

#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)
#define MIN_SWAPPINESS 0
#define MAX_SWAPPINESS 200
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                                  unsigned long nr_pages,
                                                  gfp_t gfp_mask,
                                                  unsigned int reclaim_options,
                                                  int *swappiness);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
                                                gfp_t gfp_mask, bool noswap,
                                                pg_data_t *pgdat,
                                                unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);

#ifdef CONFIG_NUMA
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#endif

void check_move_unevictable_folios(struct folio_batch *fbatch);

extern void __meminit kswapd_run(int nid);
extern void __meminit kswapd_stop(int nid);

#ifdef CONFIG_SWAP

int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
                unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
                sector_t *);

/* Return the global NR_SWAPCACHE node page-state counter. */
static inline unsigned long total_swapcache_pages(void)
{
        return global_node_page_state(NR_SWAPCACHE);
}

void free_swap_cache(struct folio *folio);
void free_page_and_swap_cache(struct page *);
void free_pages_and_swap_cache(struct encoded_page **, int);
/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;

/*
 * Swap 50% full? True when twice the current nr_swap_pages value is still
 * below total_swap_pages. Release swapcache more aggressively in that case.
 */
static inline bool vm_swap_full(void)
{
        long remaining = atomic_long_read(&nr_swap_pages);

        return total_swap_pages > remaining * 2;
}

/* Return the current value of the nr_swap_pages counter. */
static inline long get_nr_swap_pages(void)
{
        return atomic_long_read(&nr_swap_pages);
}

extern void si_swapinfo(struct sysinfo *);
int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask);
bool folio_free_swap(struct folio *folio);
void put_swap_folio(struct folio *folio, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t, int);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t entry, int nr);
extern void swap_free_nr(swp_entry_t entry, int nr_pages);
extern void free_swap_and_cache_nr(swp_entry_t entry, int nr);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t swapdev_block(int, pgoff_t);
extern int __swap_count(swp_entry_t entry);
extern bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
struct swap_info_struct *swp_swap_info(swp_entry_t entry);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_folio_sector(struct folio *folio);

/*
 * Drop one reference on @si->users.
 * NOTE(review): presumably the counterpart of get_swap_device() - confirm.
 */
static inline void put_swap_device(struct swap_info_struct *si)
{
        percpu_ref_put(&si->users);
}

#else /* CONFIG_SWAP */
/* CONFIG_SWAP=n stub: there is never a swap_info_struct, so return NULL. */
static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
        return NULL;
}

/* CONFIG_SWAP=n stub: no swap device can be obtained; always returns NULL. */
static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
        return NULL;
}

/* CONFIG_SWAP=n stub: nothing to release. */
static inline void put_swap_device(struct swap_info_struct *si)
{
}

#define get_nr_swap_pages()                        0L
#define total_swap_pages                        0L
#define total_swapcache_pages()                        0UL
#define vm_swap_full()                                0

#define si_swapinfo(val) \
        do { (val)->freeswap = (val)->totalswap = 0; } while (0)
/* only sparc can not include linux/pagemap.h in this file
 * so leave put_page and release_pages undeclared... */
#define free_page_and_swap_cache(page) \
        put_page(page)
/*
 * CONFIG_SWAP=n: forward straight to release_pages().
 *
 * The expansion deliberately carries no trailing semicolon, so the macro
 * behaves like a function call: it can be used as the body of an if/else
 * without braces or in expression context, and the caller supplies the ';'.
 */
#define free_pages_and_swap_cache(pages, nr) \
        release_pages((pages), (nr))

/* CONFIG_SWAP=n stub: no swap entries exist, so there is nothing to free. */
static inline void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
}

/* CONFIG_SWAP=n stub: no-op. */
static inline void free_swap_cache(struct folio *folio)
{
}

/* CONFIG_SWAP=n stub: does nothing and returns 0. */
static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
        return 0;
}

/* CONFIG_SWAP=n stub: no-op. */
static inline void swap_shmem_alloc(swp_entry_t swp, int nr)
{
}

/* CONFIG_SWAP=n stub: does nothing and returns 0. */
static inline int swap_duplicate(swp_entry_t swp)
{
        return 0;
}

/* CONFIG_SWAP=n stub: does nothing and returns 0. */
static inline int swapcache_prepare(swp_entry_t swp, int nr)
{
        return 0;
}

/* CONFIG_SWAP=n stub: no-op. */
static inline void swap_free_nr(swp_entry_t entry, int nr_pages)
{
}

/* CONFIG_SWAP=n stub: no-op. */
static inline void put_swap_folio(struct folio *folio, swp_entry_t swp)
{
}

/* CONFIG_SWAP=n stub: every entry reports a count of 0. */
static inline int __swap_count(swp_entry_t entry)
{
        return 0;
}

/* CONFIG_SWAP=n stub: always returns false. */
static inline bool swap_entry_swapped(struct swap_info_struct *si, swp_entry_t entry)
{
        return false;
}

/* CONFIG_SWAP=n stub: every entry reports a count of 0. */
static inline int swp_swapcount(swp_entry_t entry)
{
        return 0;
}

/* CONFIG_SWAP=n stub: swap space cannot be allocated; always fails with -EINVAL. */
static inline int folio_alloc_swap(struct folio *folio, gfp_t gfp_mask)
{
        return -EINVAL;
}

/* CONFIG_SWAP=n stub: nothing is ever freed; always returns false. */
static inline bool folio_free_swap(struct folio *folio)
{
        return false;
}

/* CONFIG_SWAP=n stub: swap extents are unsupported; always fails with -EINVAL. */
static inline int add_swap_extent(struct swap_info_struct *sis,
                                  unsigned long start_page,
                                  unsigned long nr_pages, sector_t start_block)
{
        return -EINVAL;
}
#endif /* CONFIG_SWAP */

/* Single-entry convenience wrapper: free_swap_and_cache_nr() with nr == 1. */
static inline void free_swap_and_cache(swp_entry_t entry)
{
        free_swap_and_cache_nr(entry, 1);
}

/* Single-entry convenience wrapper: swap_free_nr() with nr_pages == 1. */
static inline void swap_free(swp_entry_t entry)
{
        swap_free_nr(entry, 1);
}

#ifdef CONFIG_MEMCG
/*
 * Pick the swappiness value that applies to @memcg.
 *
 * The global vm_swappiness is used when the memory controller is on the
 * default hierarchy (cgroup2 doesn't have per-cgroup swappiness), when memcg
 * is disabled, or when @memcg is the root group. Otherwise the group's own
 * ->swappiness is returned.
 */
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
            mem_cgroup_disabled() ||
            mem_cgroup_is_root(memcg))
                return READ_ONCE(vm_swappiness);

        return READ_ONCE(memcg->swappiness);
}
#else
/* CONFIG_MEMCG=n variant: @mem is ignored and the global vm_swappiness is returned. */
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
        return READ_ONCE(vm_swappiness);
}
#endif

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp);
/*
 * Forward to __folio_throttle_swaprate() unless the memory controller is
 * disabled, in which case nothing is done.
 */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
        if (!mem_cgroup_disabled())
                __folio_throttle_swaprate(folio, gfp);
}
#else
/* Stub used unless CONFIG_SWAP, CONFIG_MEMCG and CONFIG_BLK_CGROUP are all set: no-op. */
static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
{
}
#endif

#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP)
int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry);
/*
 * Charge @entry for @folio via __mem_cgroup_try_charge_swap(), or report
 * success (0) without doing anything when the memory controller is disabled.
 */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                swp_entry_t entry)
{
        return mem_cgroup_disabled() ?
                0 : __mem_cgroup_try_charge_swap(folio, entry);
}

extern void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
/*
 * Forward to __mem_cgroup_uncharge_swap() for @nr_pages entries starting at
 * @entry; skipped entirely when the memory controller is disabled.
 */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
        if (!mem_cgroup_disabled())
                __mem_cgroup_uncharge_swap(entry, nr_pages);
}

extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct folio *folio);
#else
/* Stub used unless both CONFIG_MEMCG and CONFIG_SWAP are set: always succeeds (0). */
static inline int mem_cgroup_try_charge_swap(struct folio *folio,
                                             swp_entry_t entry)
{
        return 0;
}

/* Stub used unless both CONFIG_MEMCG and CONFIG_SWAP are set: no-op. */
static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
                                            unsigned int nr_pages)
{
}

/*
 * Stub used unless both CONFIG_MEMCG and CONFIG_SWAP are set: @memcg is
 * ignored and the global get_nr_swap_pages() value is returned.
 */
static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
        return get_nr_swap_pages();
}

/*
 * Stub used unless both CONFIG_MEMCG and CONFIG_SWAP are set: @folio is
 * ignored and the global vm_swap_full() answer is returned.
 */
static inline bool mem_cgroup_swap_full(struct folio *folio)
{
        return vm_swap_full();
}
#endif

#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2023 Isovalent */
#ifndef __NET_TCX_H
#define __NET_TCX_H

#include <linux/bpf.h>
#include <linux/bpf_mprog.h>

#include <net/sch_generic.h>

struct mini_Qdisc;

/*
 * Container for one bpf_mprog bundle plus per-entry bookkeeping.
 * tcx_entry() maps a bpf_mprog_entry back to this structure through
 * its ->parent bundle pointer.
 */
struct tcx_entry {
        struct mini_Qdisc __rcu *miniq;         /* RCU-annotated mini_Qdisc pointer */
        struct bpf_mprog_bundle bundle;         /* embedded bundle; &bundle.a is what tcx_entry_create_noprof() returns */
        u32 miniq_active;                       /* counter adjusted by tcx_miniq_inc()/tcx_miniq_dec() */
        struct rcu_head rcu;                    /* used by kfree_rcu() in tcx_entry_free() */
};

/* A bpf_link specialised for tcx; recovered from the embedded link by tcx_link(). */
struct tcx_link {
        struct bpf_link link;           /* embedded generic link */
        struct net_device *dev;         /* NOTE(review): presumably the device the link is attached to - confirm */
        u32 location;                   /* NOTE(review): presumably the attach point (ingress/egress) - confirm */
};

/*
 * Record @ingress in skb->tc_at_ingress.
 * Compiles to nothing when CONFIG_NET_XGRESS is not set.
 */
static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress)
{
#ifdef CONFIG_NET_XGRESS
        skb->tc_at_ingress = ingress;
#endif
}

#ifdef CONFIG_NET_XGRESS
/*
 * Map a bpf_mprog_entry to the tcx_entry that embeds its parent bundle.
 */
static inline struct tcx_entry *tcx_entry(struct bpf_mprog_entry *entry)
{
        return container_of(entry->parent, struct tcx_entry, bundle);
}

/* Map an embedded bpf_link back to its enclosing struct tcx_link. */
static inline struct tcx_link *tcx_link(const struct bpf_link *link)
{
        return container_of(link, struct tcx_link, link);
}

void tcx_inc(void);
void tcx_dec(void);

/* Block in synchronize_rcu() after an entry swap (see the comment in the body). */
static inline void tcx_entry_sync(void)
{
        /* bpf_mprog_entry got a/b swapped, therefore ensure that
         * there are no inflight users on the old one anymore.
         */
        synchronize_rcu();
}

/*
 * Publish @entry as @dev's ingress (dev->tcx_ingress) or egress
 * (dev->tcx_egress) entry using rcu_assign_pointer().
 * RTNL is asserted via ASSERT_RTNL().
 */
static inline void
tcx_entry_update(struct net_device *dev, struct bpf_mprog_entry *entry,
                 bool ingress)
{
        ASSERT_RTNL();
        if (ingress)
                rcu_assign_pointer(dev->tcx_ingress, entry);
        else
                rcu_assign_pointer(dev->tcx_egress, entry);
}

/*
 * Read @dev's current ingress or egress entry with rcu_dereference_rtnl().
 * RTNL is asserted via ASSERT_RTNL().
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch(struct net_device *dev, bool ingress)
{
        ASSERT_RTNL();

        if (!ingress)
                return rcu_dereference_rtnl(dev->tcx_egress);

        return rcu_dereference_rtnl(dev->tcx_ingress);
}

/*
 * Allocate a zeroed tcx_entry with GFP_KERNEL, initialise its bundle and
 * return the bundle's 'a' entry. Returns NULL when the allocation fails.
 */
static inline struct bpf_mprog_entry *tcx_entry_create_noprof(void)
{
        struct tcx_entry *fresh = kzalloc_noprof(sizeof(*fresh), GFP_KERNEL);

        if (!fresh)
                return NULL;

        bpf_mprog_bundle_init(&fresh->bundle);
        return &fresh->bundle.a;
}
#define tcx_entry_create(...)        alloc_hooks(tcx_entry_create_noprof(__VA_ARGS__))

/* Free the tcx_entry that contains @entry via kfree_rcu() on its rcu head. */
static inline void tcx_entry_free(struct bpf_mprog_entry *entry)
{
        kfree_rcu(tcx_entry(entry), rcu);
}

/*
 * Return @dev's existing ingress/egress entry, or allocate a new one when
 * none is installed.
 *
 * *@created is set to true only when a new entry was successfully allocated;
 * it is false when an existing entry is returned and when allocation fails
 * (NULL return).
 */
static inline struct bpf_mprog_entry *
tcx_entry_fetch_or_create(struct net_device *dev, bool ingress, bool *created)
{
        struct bpf_mprog_entry *found = tcx_entry_fetch(dev, ingress);

        if (found) {
                *created = false;
                return found;
        }

        found = tcx_entry_create();
        *created = found != NULL;
        return found;
}

/*
 * Call tcx_inc() and then the ingress or egress queue increment helper,
 * selected by @ingress. tcx_skeys_dec() undoes this in the reverse order.
 */
static inline void tcx_skeys_inc(bool ingress)
{
        tcx_inc();
        if (ingress)
                net_inc_ingress_queue();
        else
                net_inc_egress_queue();
}

/*
 * Call the ingress or egress queue decrement helper, selected by @ingress,
 * and then tcx_dec() - the mirror image of tcx_skeys_inc().
 */
static inline void tcx_skeys_dec(bool ingress)
{
        if (ingress)
                net_dec_ingress_queue();
        else
                net_dec_egress_queue();
        tcx_dec();
}

/* Increment miniq_active on the tcx_entry containing @entry; RTNL is asserted. */
static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        tcx_entry(entry)->miniq_active++;
}

/* Decrement miniq_active on the tcx_entry containing @entry; RTNL is asserted. */
static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        tcx_entry(entry)->miniq_active--;
}

/*
 * An entry counts as active when bpf_mprog_total() is non-zero for it or
 * when its tcx_entry has a non-zero miniq_active. RTNL is asserted.
 */
static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
{
        ASSERT_RTNL();
        return bpf_mprog_total(entry) || tcx_entry(entry)->miniq_active;
}

/*
 * Normalise a program return @code into a tcx action.
 *
 * TCX_PASS, TCX_DROP and TCX_REDIRECT are returned unchanged; for TCX_PASS
 * skb->tc_index is first loaded from the qdisc control block's tc_classid.
 * Every other value, TCX_NEXT included, maps to TCX_NEXT.
 */
static inline enum tcx_action_base tcx_action_code(struct sk_buff *skb,
                                                   int code)
{
        if (code == TCX_PASS) {
                skb->tc_index = qdisc_skb_cb(skb)->tc_classid;
                return code;
        }

        if (code == TCX_DROP || code == TCX_REDIRECT)
                return code;

        return TCX_NEXT;
}
#endif /* CONFIG_NET_XGRESS */

#if defined(CONFIG_NET_XGRESS) && defined(CONFIG_BPF_SYSCALL)
int tcx_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int tcx_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
int tcx_prog_detach(const union bpf_attr *attr, struct bpf_prog *prog);
void tcx_uninstall(struct net_device *dev, bool ingress);

int tcx_prog_query(const union bpf_attr *attr,
                   union bpf_attr __user *uattr);

/* Run tcx_uninstall() for @dev's ingress side and then its egress side; RTNL is asserted. */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
        ASSERT_RTNL();
        tcx_uninstall(dev, true);
        tcx_uninstall(dev, false);
}
#else
/* Stub used unless CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are both set: always -EINVAL. */
static inline int tcx_prog_attach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/* Stub used unless CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are both set: always -EINVAL. */
static inline int tcx_link_attach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/* Stub used unless CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are both set: always -EINVAL. */
static inline int tcx_prog_detach(const union bpf_attr *attr,
                                  struct bpf_prog *prog)
{
        return -EINVAL;
}

/* Stub used unless CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are both set: always -EINVAL. */
static inline int tcx_prog_query(const union bpf_attr *attr,
                                 union bpf_attr __user *uattr)
{
        return -EINVAL;
}

/* Stub used unless CONFIG_NET_XGRESS and CONFIG_BPF_SYSCALL are both set: no-op. */
static inline void dev_tcx_uninstall(struct net_device *dev)
{
}
#endif /* CONFIG_NET_XGRESS && CONFIG_BPF_SYSCALL */
#endif /* __NET_TCX_H */

















   21 



   21 

   21 





















































   21 





































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * The "hash function" used as the core of the ChaCha stream cipher (RFC7539)
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/bitops.h>
#include <linux/string.h>
#include <linux/unaligned.h>
#include <crypto/chacha.h>

/*
 * One ChaCha quarter-round over the four state words x[a], x[b], x[c], x[d].
 */
static inline void chacha_quarterround(u32 *x, int a, int b, int c, int d)
{
        x[a] += x[b];
        x[d] = rol32(x[d] ^ x[a], 16);

        x[c] += x[d];
        x[b] = rol32(x[b] ^ x[c], 12);

        x[a] += x[b];
        x[d] = rol32(x[d] ^ x[a], 8);

        x[c] += x[d];
        x[b] = rol32(x[b] ^ x[c], 7);
}

/*
 * Apply the ChaCha permutation to the 16-word state @x in place.
 *
 * Each loop iteration performs one double round: four column quarter-rounds
 * followed by four diagonal quarter-rounds. The four quarter-rounds of a
 * half touch disjoint words, so running them one after another gives the
 * same result as interleaving their steps.
 */
static void chacha_permute(u32 *x, int nrounds)
{
        int round;

        /* whitelist the allowed round counts */
        WARN_ON_ONCE(nrounds != 20 && nrounds != 12);

        for (round = 0; round < nrounds; round += 2) {
                /* column round */
                chacha_quarterround(x, 0, 4,  8, 12);
                chacha_quarterround(x, 1, 5,  9, 13);
                chacha_quarterround(x, 2, 6, 10, 14);
                chacha_quarterround(x, 3, 7, 11, 15);

                /* diagonal round */
                chacha_quarterround(x, 0, 5, 10, 15);
                chacha_quarterround(x, 1, 6, 11, 12);
                chacha_quarterround(x, 2, 7,  8, 13);
                chacha_quarterround(x, 3, 4,  9, 14);
        }
}

/**
 * chacha_block_generic - generate one keystream block and increment block counter
 * @state: input state matrix (16 32-bit words)
 * @stream: output keystream block (64 bytes)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * This is the ChaCha core, a function from 64-byte strings to 64-byte strings.
 * A copy of @state is permuted, the original state is added back word by word,
 * and each sum is stored to @stream as a little-endian 32-bit value.  The
 * caller has already converted the endianness of the input.  Finally the block
 * counter, word 12 of @state, is incremented.
 */
void chacha_block_generic(u32 *state, u8 *stream, int nrounds)
{
        u32 work[16];
        u8 *out = stream;
        int word;

        memcpy(work, state, sizeof(work));

        chacha_permute(work, nrounds);

        for (word = 0; word < ARRAY_SIZE(work); word++, out += sizeof(u32))
                put_unaligned_le32(work[word] + state[word], out);

        state[12]++;
}
EXPORT_SYMBOL(chacha_block_generic);

/**
 * hchacha_block_generic - abbreviated ChaCha core, for XChaCha
 * @state: input state matrix (16 32-bit words)
 * @stream: output (8 32-bit words)
 * @nrounds: number of rounds (20 or 12; 20 is recommended)
 *
 * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step
 * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf).  HChaCha
 * skips the final addition of the initial state, and outputs only words 0-3
 * and 12-15 of the permuted state.  It should not be used for streaming
 * directly.
 */
void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds)
{
        u32 work[16];

        memcpy(work, state, sizeof(work));

        chacha_permute(work, nrounds);

        /* First and last rows of the permuted matrix, four words each. */
        memcpy(stream, work, 4 * sizeof(u32));
        memcpy(stream + 4, work + 12, 4 * sizeof(u32));
}
EXPORT_SYMBOL(hchacha_block_generic);


















  681 






  251 
















  585 










 1259 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_ATOMIC_H_
#define _ASM_GENERIC_BITOPS_ATOMIC_H_

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <asm/barrier.h>

/*
 * Implementation of atomic bitops using atomic-fetch ops.
 * See Documentation/atomic_bitops.txt for details.
 */

static __always_inline void
arch_set_bit(unsigned int nr, volatile unsigned long *p)
{
        p += BIT_WORD(nr);
        raw_atomic_long_or(BIT_MASK(nr), (atomic_long_t *)p);
}

static __always_inline void
arch_clear_bit(unsigned int nr, volatile unsigned long *p)
{
        p += BIT_WORD(nr);
        raw_atomic_long_andnot(BIT_MASK(nr), (atomic_long_t *)p);
}

static __always_inline void
arch_change_bit(unsigned int nr, volatile unsigned long *p)
{
        p += BIT_WORD(nr);
        raw_atomic_long_xor(BIT_MASK(nr), (atomic_long_t *)p);
}

static __always_inline int
arch_test_and_set_bit(unsigned int nr, volatile unsigned long *p)
{
        long old;
        unsigned long mask = BIT_MASK(nr);

        p += BIT_WORD(nr);
        old = raw_atomic_long_fetch_or(mask, (atomic_long_t *)p);
        return !!(old & mask);
}

static __always_inline int
arch_test_and_clear_bit(unsigned int nr, volatile unsigned long *p)
{
        long old;
        unsigned long mask = BIT_MASK(nr);

        p += BIT_WORD(nr);
        old = raw_atomic_long_fetch_andnot(mask, (atomic_long_t *)p);
        return !!(old & mask);
}

static __always_inline int
arch_test_and_change_bit(unsigned int nr, volatile unsigned long *p)
{
        long old;
        unsigned long mask = BIT_MASK(nr);

        p += BIT_WORD(nr);
        old = raw_atomic_long_fetch_xor(mask, (atomic_long_t *)p);
        return !!(old & mask);
}

#include <asm-generic/bitops/instrumented-atomic.h>

#endif /* _ASM_GENERIC_BITOPS_ATOMIC_H_ */






















































   95 















   95 


   95 


















































   55 




   55 























































































































   43 







   43 



   43 






    7 



    7 
    7 

   25 


   25 

   11 
   11 








   24 






   24 


   24 





    8 











   14 






    2 





   24 


    3 









   19 

    3 





















































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
// SPDX-License-Identifier: GPL-2.0-only
/*
 * arch/arm64/mm/hugetlbpage.c
 *
 * Copyright (C) 2013 Linaro Ltd.
 *
 * Based on arch/x86/mm/hugetlbpage.c.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

/*
 * HugeTLB Support Matrix
 *
 * ---------------------------------------------------
 * | Page Size | CONT PTE |  PMD  | CONT PMD |  PUD  |
 * ---------------------------------------------------
 * |     4K    |   64K    |   2M  |    32M   |   1G  |
 * |    16K    |    2M    |  32M  |     1G   |       |
 * |    64K    |    2M    | 512M  |    16G   |       |
 * ---------------------------------------------------
 */

/*
 * Reserve CMA areas for the largest supported gigantic
 * huge page when requested. Any other smaller gigantic
 * huge pages could still be served from those areas.
 */
#ifdef CONFIG_CMA
/*
 * Reserve CMA for the largest gigantic huge page: PUD-sized when
 * pud_sect_supported() says so, otherwise contiguous-PMD-sized.
 */
void __init arm64_hugetlb_cma_reserve(void)
{
        int order = pud_sect_supported() ? PUD_SHIFT - PAGE_SHIFT
                                         : CONT_PMD_SHIFT - PAGE_SHIFT;

        hugetlb_cma_reserve(order);
}
#endif /* CONFIG_CMA */

/*
 * Report whether @size (in bytes) is a huge page size this file handles.
 *
 * CONT_PMD_SIZE, PMD_SIZE and CONT_PTE_SIZE are always accepted. PUD_SIZE is
 * only considered when the PMD level is not folded, and then only if
 * pud_sect_supported() returns true. Anything else is rejected.
 */
static bool __hugetlb_valid_size(unsigned long size)
{
        switch (size) {
#ifndef __PAGETABLE_PMD_FOLDED
        /* Only compiled in when the PMD level is not folded. */
        case PUD_SIZE:
                return pud_sect_supported();
#endif
        case CONT_PMD_SIZE:
        case PMD_SIZE:
        case CONT_PTE_SIZE:
                return true;
        }

        return false;
}

#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
/*
 * Tell the core whether huge pages of hstate @h can be migrated.
 *
 * Migration is supported for every size accepted by __hugetlb_valid_size();
 * any other size is reported with a warning and rejected.
 */
bool arch_hugetlb_migration_supported(struct hstate *h)
{
        size_t pagesize = huge_page_size(h);

        if (!__hugetlb_valid_size(pagesize)) {
                /* pagesize is a size_t, so print it with the 'z' length modifier. */
                pr_warn("%s: unrecognized huge page size 0x%zx\n",
                        __func__, pagesize);
                return false;
        }
        return true;
}
#endif

/*
 * Work out how many entries make up the contiguous range starting at
 * @ptep, and how much memory each entry spans.
 *
 * The tables of @mm are walked down to the pmd entry for @addr. If @ptep
 * is that very pmd entry, the range is CONT_PMDS entries of PMD_SIZE;
 * otherwise it is CONT_PTES entries of PAGE_SIZE. The per-entry span is
 * stored through @pgsize and the entry count is returned.
 */
static int find_num_contig(struct mm_struct *mm, unsigned long addr,
                           pte_t *ptep, size_t *pgsize)
{
        pgd_t *pgdp = pgd_offset(mm, addr);
        p4d_t *p4dp = p4d_offset(pgdp, addr);
        pud_t *pudp = pud_offset(p4dp, addr);
        pmd_t *pmdp = pmd_offset(pudp, addr);

        if ((pte_t *)pmdp != ptep) {
                *pgsize = PAGE_SIZE;
                return CONT_PTES;
        }

        *pgsize = PMD_SIZE;
        return CONT_PMDS;
}

/*
 * Translate a huge page @size into the number of page-table entries that
 * back it, storing the span of one entry through @pgsize.
 *
 * CONT_PMD_SIZE maps to CONT_PMDS entries of PMD_SIZE and CONT_PTE_SIZE
 * to CONT_PTES entries of PAGE_SIZE. Every other size is treated as a
 * single entry spanning @size, with a warning if the size does not pass
 * __hugetlb_valid_size().
 */
static inline int num_contig_ptes(unsigned long size, size_t *pgsize)
{
        if (size == CONT_PMD_SIZE) {
                *pgsize = PMD_SIZE;
                return CONT_PMDS;
        }

        if (size == CONT_PTE_SIZE) {
                *pgsize = PAGE_SIZE;
                return CONT_PTES;
        }

        WARN_ON(!__hugetlb_valid_size(size));
        *pgsize = size;
        return 1;
}

/*
 * Read the huge pte stored at @ptep.
 *
 * An entry that is not present, or present without the contiguous bit, is
 * returned as read. For a present contiguous entry, every entry of the
 * range is read and the returned value is the first entry with the dirty
 * and young bits set if any entry of the range has them set.
 *
 * @mm and @addr are not used by this implementation.
 */
pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        pte_t first = __ptep_get(ptep);
        size_t pgsize;
        int nr, idx;

        /* Only a present, contiguous entry has siblings to scan. */
        if (!(pte_present(first) && pte_cont(first)))
                return first;

        nr = num_contig_ptes(page_size(pte_page(first)), &pgsize);
        for (idx = 0; idx < nr; idx++) {
                pte_t cur = __ptep_get(ptep + idx);

                if (pte_dirty(cur))
                        first = pte_mkdirty(first);
                if (pte_young(cur))
                        first = pte_mkyoung(first);
        }

        return first;
}

/*
 * Changing some bits of contiguous entries requires us to follow a
 * Break-Before-Make approach, breaking the whole contiguous set
 * before we can change any entries. See ARM DDI 0487A.k_iss10775,
 * "Misprogramming of the Contiguous bit", page D4-1762.
 *
 * This helper performs the break step.
 */
static pte_t get_clear_contig(struct mm_struct *mm,
                             unsigned long addr,
                             pte_t *ptep,
                             unsigned long pgsize,
                             unsigned long ncontig)
{
        pte_t pte, tmp_pte;
        bool present;

        /* The first entry provides the value that is eventually returned. */
        pte = __ptep_get_and_clear(mm, addr, ptep);
        present = pte_present(pte);
        /*
         * Clear the remaining ncontig - 1 entries. The pre-decrement on an
         * unsigned count means callers must pass ncontig >= 1.
         */
        while (--ncontig) {
                ptep++;
                addr += pgsize;
                tmp_pte = __ptep_get_and_clear(mm, addr, ptep);
                /*
                 * Dirty/young state is folded into the result only when
                 * the first entry was present.
                 */
                if (present) {
                        if (pte_dirty(tmp_pte))
                                pte = pte_mkdirty(pte);
                        if (pte_young(tmp_pte))
                                pte = pte_mkyoung(pte);
                }
        }
        return pte;
}

/*
 * Clear a contiguous range with get_clear_contig() and then flush the TLB
 * for the address range it covered, [addr, addr + pgsize * ncontig).
 * Returns the value produced by get_clear_contig().
 */
static pte_t get_clear_contig_flush(struct mm_struct *mm,
                                    unsigned long addr,
                                    pte_t *ptep,
                                    unsigned long pgsize,
                                    unsigned long ncontig)
{
        struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
        unsigned long end = addr + (pgsize * ncontig);
        pte_t orig_pte;

        orig_pte = get_clear_contig(mm, addr, ptep, pgsize, ncontig);
        flush_tlb_range(&vma, addr, end);

        return orig_pte;
}

/*
 * Changing some bits of contiguous entries requires us to follow a
 * Break-Before-Make approach, breaking the whole contiguous set
 * before we can change any entries. See ARM DDI 0487A.k_iss10775,
 * "Misprogramming of the Contiguous bit", page D4-1762.
 *
 * This helper performs the break step for use cases where the
 * original pte is not needed.
 */
/*
 * Clear @ncontig consecutive entries starting at @ptep, each spanning
 * @pgsize bytes from @addr, then flush the TLB for the whole range that
 * was walked. The previous entry values are discarded.
 */
static void clear_flush(struct mm_struct *mm,
                             unsigned long addr,
                             pte_t *ptep,
                             unsigned long pgsize,
                             unsigned long ncontig)
{
        struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
        unsigned long cur = addr;
        unsigned long idx;

        for (idx = 0; idx < ncontig; idx++) {
                __ptep_get_and_clear(mm, cur, ptep + idx);
                cur += pgsize;
        }

        flush_tlb_range(&vma, addr, cur);
}

/*
 * Install the huge pte @pte for a huge page of size @sz whose first
 * page-table entry is @ptep and whose first address is @addr.
 *
 * num_contig_ptes() converts @sz into an entry count and a per-entry span.
 * Three cases follow:
 *  - @pte is not present: the same value is written to every entry;
 *  - @pte is present without the contiguous bit: one entry is written;
 *  - @pte is present and contiguous: the existing entries are cleared and
 *    the TLB is flushed by clear_flush() before the new entries are
 *    written, each one pointing dpfn page frames past the previous one.
 */
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                            pte_t *ptep, pte_t pte, unsigned long sz)
{
        size_t pgsize;
        int i;
        int ncontig;
        unsigned long pfn, dpfn;
        pgprot_t hugeprot;

        ncontig = num_contig_ptes(sz, &pgsize);

        /* Non-present value: replicate it verbatim into every entry. */
        if (!pte_present(pte)) {
                for (i = 0; i < ncontig; i++, ptep++, addr += pgsize)
                        __set_ptes(mm, addr, ptep, pte, 1);
                return;
        }

        /* Present but not contiguous: a single entry holds the mapping. */
        if (!pte_cont(pte)) {
                __set_ptes(mm, addr, ptep, pte, 1);
                return;
        }

        pfn = pte_pfn(pte);
        /* Number of base pages covered by one entry of the range. */
        dpfn = pgsize >> PAGE_SHIFT;
        hugeprot = pte_pgprot(pte);

        /* Break step: clear the old range and flush before rewriting it. */
        clear_flush(mm, addr, ptep, pgsize, ncontig);

        for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
                __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);
}

/*
 * Allocate the page-table levels needed to map a huge page of size @sz at
 * @addr in @mm, and return the entry that will hold it, cast to pte_t *:
 *  - PUD_SIZE:      the pud entry itself;
 *  - CONT_PTE_SIZE: the entry returned by pte_alloc_huge() below the pmd;
 *  - PMD_SIZE:      a pmd entry, obtained from huge_pmd_share() when
 *                   want_pmd_share() allows it and the pud is still empty,
 *                   otherwise from pmd_alloc();
 *  - CONT_PMD_SIZE: the pmd entry returned by pmd_alloc().
 *
 * For the two contiguous sizes a warning is raised if @addr is not aligned
 * to @sz. Returns NULL when an allocation fails or when @sz is none of
 * the sizes above.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, unsigned long sz)
{
        pgd_t *pgdp;
        p4d_t *p4dp;
        pud_t *pudp;
        pmd_t *pmdp;
        pte_t *ptep = NULL;

        pgdp = pgd_offset(mm, addr);
        p4dp = p4d_alloc(mm, pgdp, addr);
        if (!p4dp)
                return NULL;

        pudp = pud_alloc(mm, p4dp, addr);
        if (!pudp)
                return NULL;

        if (sz == PUD_SIZE) {
                ptep = (pte_t *)pudp;
        } else if (sz == (CONT_PTE_SIZE)) {
                pmdp = pmd_alloc(mm, pudp, addr);
                if (!pmdp)
                        return NULL;

                WARN_ON(addr & (sz - 1));
                ptep = pte_alloc_huge(mm, pmdp, addr);
        } else if (sz == PMD_SIZE) {
                if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
                        ptep = huge_pmd_share(mm, vma, addr, pudp);
                else
                        ptep = (pte_t *)pmd_alloc(mm, pudp, addr);
        } else if (sz == (CONT_PMD_SIZE)) {
                /* A failed pmd_alloc() is returned to the caller as NULL. */
                pmdp = pmd_alloc(mm, pudp, addr);
                WARN_ON(addr & (sz - 1));
                return (pte_t *)pmdp;
        }

        return ptep;
}

/*
 * Look up, without allocating anything, the page-table entry in @mm that
 * holds the huge page of size @sz covering @addr.
 *
 * The walk stops with NULL at a non-present pgd or p4d entry. At the pud
 * and pmd levels an empty entry yields NULL unless @sz says a huge page
 * would live at that level; a leaf or non-present entry is returned as
 * the result. For CONT_PMD_SIZE and CONT_PTE_SIZE the address is first
 * rounded down with CONT_PMD_MASK / CONT_PTE_MASK, so the returned entry
 * is the first one of the contiguous range. NULL is returned if the walk
 * reaches a pmd table entry and @sz is not CONT_PTE_SIZE.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz)
{
        pgd_t *pgdp;
        p4d_t *p4dp;
        pud_t *pudp, pud;
        pmd_t *pmdp, pmd;

        pgdp = pgd_offset(mm, addr);
        if (!pgd_present(READ_ONCE(*pgdp)))
                return NULL;

        p4dp = p4d_offset(pgdp, addr);
        if (!p4d_present(READ_ONCE(*p4dp)))
                return NULL;

        pudp = pud_offset(p4dp, addr);
        pud = READ_ONCE(*pudp);
        /* An empty pud is only a valid answer for a PUD-sized lookup. */
        if (sz != PUD_SIZE && pud_none(pud))
                return NULL;
        /* hugepage or swap? */
        if (pud_leaf(pud) || !pud_present(pud))
                return (pte_t *)pudp;
        /* table; check the next level */

        if (sz == CONT_PMD_SIZE)
                addr &= CONT_PMD_MASK;

        pmdp = pmd_offset(pudp, addr);
        pmd = READ_ONCE(*pmdp);
        /* An empty pmd is only a valid answer for PMD-level sizes. */
        if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) &&
            pmd_none(pmd))
                return NULL;
        if (pmd_leaf(pmd) || !pmd_present(pmd))
                return (pte_t *)pmdp;

        if (sz == CONT_PTE_SIZE)
                return pte_offset_huge(pmdp, (addr & CONT_PTE_MASK));

        return NULL;
}

/*
 * Return, for the huge page size of hstate @h, the size of the next
 * larger page-table span minus the huge page size itself:
 *  - PUD_SIZE (PMD level not folded, PUD sections supported):
 *    PGDIR_SIZE - PUD_SIZE;
 *  - CONT_PMD_SIZE and PMD_SIZE: PUD_SIZE minus the huge page size;
 *  - CONT_PTE_SIZE: PMD_SIZE - CONT_PTE_SIZE.
 * Any other size, including an unsupported PUD_SIZE, yields 0.
 */
unsigned long hugetlb_mask_last_page(struct hstate *h)
{
        unsigned long mask = 0UL;

        switch (huge_page_size(h)) {
#ifndef __PAGETABLE_PMD_FOLDED
        case PUD_SIZE:
                if (pud_sect_supported())
                        mask = PGDIR_SIZE - PUD_SIZE;
                break;
#endif
        case CONT_PMD_SIZE:
                mask = PUD_SIZE - CONT_PMD_SIZE;
                break;
        case PMD_SIZE:
                mask = PUD_SIZE - PMD_SIZE;
                break;
        case CONT_PTE_SIZE:
                mask = PMD_SIZE - CONT_PTE_SIZE;
                break;
        default:
                break;
        }

        return mask;
}

/*
 * Turn @entry into a huge page entry for a page of 1 << @shift bytes.
 *
 * PUD- and PMD-sized pages get the "huge" marking at their level,
 * contiguous PMD pages additionally get the contiguous marking, and
 * contiguous PTE pages only get the contiguous marking. PUD_SIZE is
 * honoured only when the PMD level is not folded and pud_sect_supported()
 * is true. For any other size a warning is logged and @entry is returned
 * unchanged.
 *
 * @flags is not used by this implementation.
 */
pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
        size_t pagesize = 1UL << shift;

#ifndef __PAGETABLE_PMD_FOLDED
        if (pagesize == PUD_SIZE && pud_sect_supported())
                return pud_pte(pud_mkhuge(pte_pud(entry)));
#endif
        if (pagesize == CONT_PMD_SIZE)
                return pmd_pte(pmd_mkhuge(pmd_mkcont(pte_pmd(entry))));

        if (pagesize == PMD_SIZE)
                return pmd_pte(pmd_mkhuge(pte_pmd(entry)));

        if (pagesize == CONT_PTE_SIZE)
                return pte_mkcont(entry);

        pr_warn("%s: unrecognized huge page size 0x%lx\n",
                __func__, pagesize);
        return entry;
}

/*
 * Clear every page-table entry backing the huge page of size @sz that
 * starts at @ptep / @addr. The entry count and per-entry span come from
 * num_contig_ptes().
 */
void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, unsigned long sz)
{
        size_t pgsize;
        int nr = num_contig_ptes(sz, &pgsize);
        int idx;

        for (idx = 0; idx < nr; idx++)
                __pte_clear(mm, addr + idx * pgsize, ptep + idx);
}

/*
 * Clear all entries backing the huge page of size @sz at @ptep / @addr
 * and return the value produced by get_clear_contig() for the range.
 * No TLB flush is issued here.
 */
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
                              pte_t *ptep, unsigned long sz)
{
        size_t pgsize;
        int ncontig = num_contig_ptes(sz, &pgsize);

        return get_clear_contig(mm, addr, ptep, pgsize, ncontig);
}

/*
 * huge_ptep_set_access_flags will update access flags (dirty, accessed)
 * and write permission.
 *
 * For a contiguous huge pte range we need to check whether or not write
 * permission has to change only on the first pte in the set. Then for
 * all the contiguous ptes we need to check whether or not there is a
 * discrepancy between dirty or young.
 */
/*
 * Decide whether installing @pte over the @ncontig entries starting at
 * @ptep would change anything.
 *
 * Returns 1 if the write permission of @pte differs from the first entry,
 * or if the dirty or young state of @pte differs from any entry of the
 * range; returns 0 otherwise.
 */
static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig)
{
        int idx;

        if (pte_write(pte) != pte_write(__ptep_get(ptep)))
                return 1;

        for (idx = 0; idx < ncontig; idx++) {
                pte_t cur = __ptep_get(ptep + idx);

                if (pte_dirty(pte) != pte_dirty(cur) ||
                    pte_young(pte) != pte_young(cur))
                        return 1;
        }

        return 0;
}

/*
 * Apply the access flags carried by @pte to the huge page entry at @ptep.
 *
 * A non-contiguous @pte is handed straight to __ptep_set_access_flags().
 * For a contiguous @pte the range is sized with find_num_contig(); if
 * __cont_access_flags_changed() finds nothing to update, 0 is returned.
 * Otherwise the whole range is cleared and flushed, the dirty/young state
 * collected from the old entries is merged into @pte, every entry is
 * rewritten, and 1 is returned.
 */
int huge_ptep_set_access_flags(struct vm_area_struct *vma,
                               unsigned long addr, pte_t *ptep,
                               pte_t pte, int dirty)
{
        int ncontig, i;
        size_t pgsize = 0;
        unsigned long pfn = pte_pfn(pte), dpfn;
        struct mm_struct *mm = vma->vm_mm;
        pgprot_t hugeprot;
        pte_t orig_pte;

        if (!pte_cont(pte))
                return __ptep_set_access_flags(vma, addr, ptep, pte, dirty);

        ncontig = find_num_contig(mm, addr, ptep, &pgsize);
        /* Number of base pages covered by one entry of the range. */
        dpfn = pgsize >> PAGE_SHIFT;

        if (!__cont_access_flags_changed(ptep, pte, ncontig))
                return 0;

        /* Break step: clear the range and flush before rewriting it. */
        orig_pte = get_clear_contig_flush(mm, addr, ptep, pgsize, ncontig);

        /* Make sure we don't lose the dirty or young state */
        if (pte_dirty(orig_pte))
                pte = pte_mkdirty(pte);

        if (pte_young(orig_pte))
                pte = pte_mkyoung(pte);

        hugeprot = pte_pgprot(pte);
        for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn)
                __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1);

        return 1;
}

/*
 * Write-protect the huge page entry at @ptep.
 *
 * A non-contiguous entry is handled by __ptep_set_wrprotect(). For a
 * contiguous entry the whole range is cleared and flushed first, the
 * value collected from it is write-protected, and every entry is then
 * rewritten with that protection and its own page frame number.
 */
void huge_ptep_set_wrprotect(struct mm_struct *mm,
                             unsigned long addr, pte_t *ptep)
{
        unsigned long base_pfn, step;
        pgprot_t prot;
        size_t pgsize;
        int nr, idx;
        pte_t pte;

        if (!pte_cont(__ptep_get(ptep))) {
                __ptep_set_wrprotect(mm, addr, ptep);
                return;
        }

        nr = find_num_contig(mm, addr, ptep, &pgsize);
        /* Number of base pages covered by one entry of the range. */
        step = pgsize >> PAGE_SHIFT;

        /* Break step first, then derive the write-protected template. */
        pte = pte_wrprotect(get_clear_contig_flush(mm, addr, ptep, pgsize, nr));

        prot = pte_pgprot(pte);
        base_pfn = pte_pfn(pte);

        for (idx = 0; idx < nr; idx++)
                __set_ptes(mm, addr + idx * pgsize, ptep + idx,
                           pfn_pte(base_pfn + idx * step, prot), 1);
}

/*
 * Clear the huge page entry at @ptep and flush the TLB for it, returning
 * the previous value.
 *
 * A contiguous entry is sized with find_num_contig() and handled by
 * get_clear_contig_flush(); anything else goes to ptep_clear_flush().
 */
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
                            unsigned long addr, pte_t *ptep)
{
        size_t pgsize;
        int ncontig;

        if (pte_cont(__ptep_get(ptep))) {
                ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
                return get_clear_contig_flush(vma->vm_mm, addr, ptep,
                                              pgsize, ncontig);
        }

        return ptep_clear_flush(vma, addr, ptep);
}

/*
 * Register the huge page sizes handled by this file with the hugetlb core
 * through hugetlb_add_hstate(). Run as an arch_initcall; always returns 0.
 */
static int __init hugetlbpage_init(void)
{
        /*
         * HugeTLB pages are supported on maximum four page table
         * levels (PUD, CONT PMD, PMD, CONT PTE) for a given base
         * page size, corresponding to hugetlb_add_hstate() calls
         * here.
         *
         * HUGE_MAX_HSTATE should at least match maximum supported
         * HugeTLB page sizes on the platform. Any new addition to
         * supported HugeTLB page sizes will also require changing
         * HUGE_MAX_HSTATE as well.
         */
        BUILD_BUG_ON(HUGE_MAX_HSTATE < 4);
        /* The PUD size is registered only when PUD sections are supported. */
        if (pud_sect_supported())
                hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);

        hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT);
        hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
        hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT);

        return 0;
}
arch_initcall(hugetlbpage_init);

/*
 * Init-time entry point reporting whether @size is a huge page size this
 * file accepts; it forwards directly to __hugetlb_valid_size().
 */
bool __init arch_hugetlb_valid_size(unsigned long size)
{
        return __hugetlb_valid_size(size);
}

/*
 * Begin a protection change on the huge page entry at @ptep: clear it and
 * return the previous value.
 *
 * On CPUs flagged with ARM64_WORKAROUND_2645198, an entry that is
 * currently user-executable is cleared with a TLB flush
 * (huge_ptep_clear_flush()). In every other case the entry is cleared
 * with huge_ptep_get_and_clear(), using the huge page size of @vma.
 */
pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
{
        /*
         * Break-before-make (BBM) is required for all user space mappings
         * when the permission changes from executable to non-executable
         * in cases where cpu is affected with errata #2645198.
         */
        if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198) &&
            pte_user_exec(__ptep_get(ptep)))
                return huge_ptep_clear_flush(vma, addr, ptep);

        return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep,
                                       huge_page_size(hstate_vma(vma)));
}

/*
 * Finish a protection change by installing @pte at @ptep with
 * set_huge_pte_at(), using the huge page size of @vma.
 *
 * @old_pte is not used by this implementation.
 */
void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep,
                                  pte_t old_pte, pte_t pte)
{
        set_huge_pte_at(vma->vm_mm, addr, ptep, pte,
                        huge_page_size(hstate_vma(vma)));
}




























































































































































































   55 






   56 


   57 






















   57 






   56 


   56 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
// SPDX-License-Identifier: GPL-2.0-or-later

#define pr_fmt(fmt) "ref_tracker: " fmt

#include <linux/export.h>
#include <linux/list_sort.h>
#include <linux/ref_tracker.h>
#include <linux/slab.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

#define REF_TRACKER_STACK_ENTRIES 16
#define STACK_BUF_SIZE 1024

/*
 * One tracked reference. ref_tracker_alloc() creates it and links it on
 * dir->list; ref_tracker_free() marks it dead and moves it to
 * dir->quarantine.
 */
struct ref_tracker {
        struct list_head        head;   /* anchor into dir->list or dir->quarantine */
        /* Set by ref_tracker_free(); a second free of the same tracker is reported. */
        bool                        dead;
        /* Stack depot handle of the call stack saved by ref_tracker_alloc(). */
        depot_stack_handle_t        alloc_stack_handle;
        /* Stack depot handle of the call stack saved by ref_tracker_free(). */
        depot_stack_handle_t        free_stack_handle;
};

/*
 * Summary of the live trackers of a directory, grouped by allocation
 * stack, as built by ref_tracker_get_stats().
 */
struct ref_tracker_dir_stats {
        /* Number of trackers found on dir->list. */
        int total;
        /* Number of entries of stacks[] that are in use. */
        int count;
        /* One bucket per distinct allocation stack that was recorded. */
        struct {
                depot_stack_handle_t stack_handle;
                /* Number of trackers sharing stack_handle. */
                unsigned int count;
        } stacks[];
};

/*
 * Build a per-allocation-stack summary of the trackers on dir->list.
 *
 * At most @limit distinct stacks get their own bucket. Trackers whose
 * stack has no bucket once all @limit buckets are taken are still counted
 * in ->total but in no bucket.
 *
 * Returns the summary, allocated with GFP_NOWAIT and to be released with
 * kfree(), or ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct ref_tracker_dir_stats *
ref_tracker_get_stats(struct ref_tracker_dir *dir, unsigned int limit)
{
        struct ref_tracker_dir_stats *stats;
        struct ref_tracker *cur;

        stats = kmalloc(struct_size(stats, stacks, limit),
                        GFP_NOWAIT | __GFP_NOWARN);
        if (!stats)
                return ERR_PTR(-ENOMEM);

        stats->total = 0;
        stats->count = 0;

        list_for_each_entry(cur, &dir->list, head) {
                depot_stack_handle_t handle = cur->alloc_stack_handle;
                int slot = 0;

                stats->total++;

                /* Look for a bucket already holding this allocation stack. */
                while (slot < stats->count &&
                       stats->stacks[slot].stack_handle != handle)
                        slot++;

                /* No bucket and no room for a new one. */
                if (slot >= limit)
                        continue;

                /* First sighting of this stack: open a new bucket. */
                if (slot == stats->count) {
                        stats->stacks[slot].stack_handle = handle;
                        stats->stacks[slot].count = 0;
                        stats->count++;
                }
                stats->stacks[slot].count++;
        }

        return stats;
}

/*
 * Output sink for pr_ostream(): with a NULL buf the text goes to pr_err(),
 * otherwise it is appended to buf.
 */
struct ostream {
        char *buf;      /* destination buffer, or NULL */
        int size, used; /* capacity of buf and number of bytes already used */
};

/*
 * Emit a formatted message to @stream.
 *
 * When the stream has a buffer, the message (with the pr_fmt() prefix) is
 * formatted into the unused part of the buffer and ->used advances by the
 * smaller of snprintf()'s return value and the space that was left.
 * Without a buffer the message is logged with pr_err().
 */
#define pr_ostream(stream, fmt, args...) \
({ \
        struct ostream *_s = (stream); \
\
        if (_s->buf) { \
                int _room = _s->size - _s->used; \
                int _n = snprintf(_s->buf + _s->used, _room, \
                                  pr_fmt(fmt), ##args); \
                _s->used += min(_n, _room); \
        } else { \
                pr_err(fmt, ##args); \
        } \
})

/*
 * Report the live trackers of @dir to stream @s, one message per distinct
 * allocation stack (at most @display_limit of them), followed by a
 * message about the trackers that got no message of their own, if any.
 *
 * Must be called with dir->lock held. Nothing is emitted when dir->list
 * is empty. If the statistics cannot be allocated, a single error message
 * is emitted instead.
 */
static void
__ref_tracker_dir_pr_ostream(struct ref_tracker_dir *dir,
                             unsigned int display_limit, struct ostream *s)
{
        struct ref_tracker_dir_stats *stats;
        unsigned int idx, unreported;
        depot_stack_handle_t handle;
        char *text;

        lockdep_assert_held(&dir->lock);

        if (list_empty(&dir->list))
                return;

        stats = ref_tracker_get_stats(dir, display_limit);
        if (IS_ERR(stats)) {
                pr_ostream(s, "%s@%pK: couldn't get stats, error %pe\n",
                           dir->name, dir, stats);
                return;
        }

        /* Scratch buffer for the stack text; the allocation may fail. */
        text = kmalloc(STACK_BUF_SIZE, GFP_NOWAIT | __GFP_NOWARN);

        unreported = stats->total;
        for (idx = 0; idx < stats->count; idx++) {
                handle = stats->stacks[idx].stack_handle;

                /* Nothing printed for this stack: show an empty string. */
                if (text && !stack_depot_snprint(handle, text, STACK_BUF_SIZE, 4))
                        text[0] = 0;

                pr_ostream(s, "%s@%pK has %d/%d users at\n%s\n", dir->name, dir,
                           stats->stacks[idx].count, stats->total, text);
                unreported -= stats->stacks[idx].count;
        }

        if (unreported)
                pr_ostream(s, "%s@%pK skipped reports about %d/%d users.\n",
                           dir->name, dir, unreported, stats->total);

        kfree(text);
        kfree(stats);
}

void ref_tracker_dir_print_locked(struct ref_tracker_dir *dir,
                                  unsigned int display_limit)
{
        struct ostream os = {};

        __ref_tracker_dir_pr_ostream(dir, display_limit, &os);
}
EXPORT_SYMBOL(ref_tracker_dir_print_locked);

/*
 * Same as ref_tracker_dir_print_locked(), but takes dir->lock (with
 * interrupts disabled) around the call.
 */
void ref_tracker_dir_print(struct ref_tracker_dir *dir,
                           unsigned int display_limit)
{
        unsigned long irqflags;

        spin_lock_irqsave(&dir->lock, irqflags);
        ref_tracker_dir_print_locked(dir, display_limit);
        spin_unlock_irqrestore(&dir->lock, irqflags);
}
EXPORT_SYMBOL(ref_tracker_dir_print);

/*
 * Format the report about the live trackers of @dir into @buf, which
 * holds @size bytes, showing at most 16 distinct allocation stacks.
 * dir->lock is taken around the formatting.
 *
 * Returns the number of bytes the stream accounted as used.
 */
int ref_tracker_dir_snprint(struct ref_tracker_dir *dir, char *buf, size_t size)
{
        struct ostream out = {
                .buf = buf,
                .size = size,
        };
        unsigned long irqflags;

        spin_lock_irqsave(&dir->lock, irqflags);
        __ref_tracker_dir_pr_ostream(dir, 16, &out);
        spin_unlock_irqrestore(&dir->lock, irqflags);

        return out.used;
}
EXPORT_SYMBOL(ref_tracker_dir_snprint);

/*
 * Tear down @dir: mark it dead, free every quarantined tracker, and
 * report and free any tracker still on dir->list.
 *
 * Warns once if live trackers were found, and once for each of the
 * untracked / no_tracker refcounts that is not back at 1.
 */
void ref_tracker_dir_exit(struct ref_tracker_dir *dir)
{
        struct ref_tracker *cur, *next;
        unsigned long irqflags;
        bool leak;

        dir->dead = true;

        spin_lock_irqsave(&dir->lock, irqflags);

        /* Each quarantined tracker released gives back one slot. */
        list_for_each_entry_safe(cur, next, &dir->quarantine, head) {
                list_del(&cur->head);
                kfree(cur);
                dir->quarantine_avail++;
        }

        /* Anything still on the live list was never released. */
        leak = !list_empty(&dir->list);
        if (leak)
                ref_tracker_dir_print_locked(dir, 16);

        list_for_each_entry_safe(cur, next, &dir->list, head) {
                list_del(&cur->head);
                kfree(cur);
        }

        spin_unlock_irqrestore(&dir->lock, irqflags);

        WARN_ON_ONCE(leak);
        WARN_ON_ONCE(refcount_read(&dir->untracked) != 1);
        WARN_ON_ONCE(refcount_read(&dir->no_tracker) != 1);
}
EXPORT_SYMBOL(ref_tracker_dir_exit);

/*
 * Start tracking a new reference in @dir.
 *
 * With a NULL @trackerp only the no_tracker refcount is incremented.
 * Otherwise a tracker is allocated, the current call stack is saved in
 * it, and it is added to dir->list; *@trackerp receives the tracker.
 * __GFP_NOFAIL is added to the tracker allocation when @gfp allows direct
 * reclaim.
 *
 * Returns 0 on success. If the tracker cannot be allocated, *@trackerp is
 * set to NULL, the untracked refcount is incremented and -ENOMEM is
 * returned. Warns once if @dir is already dead.
 */
int ref_tracker_alloc(struct ref_tracker_dir *dir,
                      struct ref_tracker **trackerp,
                      gfp_t gfp)
{
        unsigned long frames[REF_TRACKER_STACK_ENTRIES];
        gfp_t alloc_gfp = gfp | __GFP_NOWARN;
        struct ref_tracker *tracker;
        unsigned long irqflags;
        unsigned int depth;

        WARN_ON_ONCE(dir->dead);

        if (!trackerp) {
                refcount_inc(&dir->no_tracker);
                return 0;
        }

        if (gfp & __GFP_DIRECT_RECLAIM)
                alloc_gfp |= __GFP_NOFAIL;

        tracker = kzalloc(sizeof(*tracker), alloc_gfp);
        *trackerp = tracker;
        if (unlikely(!tracker)) {
                pr_err_once("memory allocation failure, unreliable refcount tracker.\n");
                refcount_inc(&dir->untracked);
                return -ENOMEM;
        }

        /* Record where the reference was taken. */
        depth = stack_trace_save(frames, ARRAY_SIZE(frames), 1);
        tracker->alloc_stack_handle = stack_depot_save(frames, depth, gfp);

        spin_lock_irqsave(&dir->lock, irqflags);
        list_add(&tracker->head, &dir->list);
        spin_unlock_irqrestore(&dir->lock, irqflags);

        return 0;
}
EXPORT_SYMBOL_GPL(ref_tracker_alloc);

int ref_tracker_free(struct ref_tracker_dir *dir,
                     struct ref_tracker **trackerp)
{
        unsigned long entries[REF_TRACKER_STACK_ENTRIES];
        depot_stack_handle_t stack_handle;
        struct ref_tracker *tracker;
        unsigned int nr_entries;
        unsigned long flags;

        WARN_ON_ONCE(dir->dead);

        if (!trackerp) {
                refcount_dec(&dir->no_tracker);
                return 0;
        }
        tracker = *trackerp;
        if (!tracker) {
                refcount_dec(&dir->untracked);
                return -EEXIST;
        }
        nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 1);
        stack_handle = stack_depot_save(entries, nr_entries,
                                        GFP_NOWAIT | __GFP_NOWARN);

        spin_lock_irqsave(&dir->lock, flags);
        if (tracker->dead) {
                pr_err("reference already released.\n");
                if (tracker->alloc_stack_handle) {
                        pr_err("allocated in:\n");
                        stack_depot_print(tracker->alloc_stack_handle);
                }
                if (tracker->free_stack_handle) {
                        pr_err("freed in:\n");
                        stack_depot_print(tracker->free_stack_handle);
                }
                spin_unlock_irqrestore(&dir->lock, flags);
                WARN_ON_ONCE(1);
                return -EINVAL;
        }
        tracker->dead = true;

        tracker->free_stack_handle = stack_handle;

        list_move_tail(&tracker->head, &dir->quarantine);
        if (!dir->quarantine_avail) {
                tracker = list_first_entry(&dir->quarantine, struct ref_tracker, head);
                list_del(&tracker->head);
        } else {
                dir->quarantine_avail--;
                tracker = NULL;
        }
        spin_unlock_irqrestore(&dir->lock, flags);

        kfree(tracker);
        return 0;
}
EXPORT_SYMBOL_GPL(ref_tracker_free);








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/* SPDX-License-Identifier: GPL-2.0-or-later */

#ifndef __DSA_USER_H
#define __DSA_USER_H

#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/list.h>
#include <linux/netpoll.h>
#include <linux/types.h>
#include <net/dsa.h>
#include <net/gro_cells.h>

struct net_device;
struct netlink_ext_ack;

extern struct notifier_block dsa_user_switchdev_notifier;
extern struct notifier_block dsa_user_switchdev_blocking_notifier;

/*
 * Private data of a DSA user network device; dsa_user_to_port() reaches
 * it through netdev_priv().
 */
struct dsa_user_priv {
        /* Copy of CPU port xmit for faster access in user transmit hot path */
        struct sk_buff *        (*xmit)(struct sk_buff *skb,
                                        struct net_device *dev);

        /* NOTE(review): presumably the GRO cells used on receive - confirm. */
        struct gro_cells        gcells;

        /* DSA port data, such as switch, port index, etc. */
        struct dsa_port                *dp;

        /* Only present when CONFIG_NET_POLL_CONTROLLER is enabled. */
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll                *netpoll;
#endif

        /* TC context */
        struct list_head        mall_tc_list;
};

/*
 * Operations on DSA user devices. Only the prototypes appear in this
 * header; the definitions are not visible here.
 */
void dsa_user_mii_bus_init(struct dsa_switch *ds);
int dsa_user_create(struct dsa_port *dp);
void dsa_user_destroy(struct net_device *user_dev);
int dsa_user_suspend(struct net_device *user_dev);
int dsa_user_resume(struct net_device *user_dev);
int dsa_user_register_notifier(void);
void dsa_user_unregister_notifier(void);
int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr);
void dsa_user_host_uc_uninstall(struct net_device *dev);
void dsa_user_sync_ha(struct net_device *dev);
void dsa_user_unsync_ha(struct net_device *dev);
void dsa_user_setup_tagger(struct net_device *user);
int dsa_user_change_mtu(struct net_device *dev, int new_mtu);
int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
                            struct netlink_ext_ack *extack);
int dsa_user_manage_vlan_filtering(struct net_device *dev,
                                   bool vlan_filtering);

static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev)
{
        struct dsa_user_priv *p = netdev_priv(dev);

        return p->dp;
}

/*
 * Return the conduit device of the DSA port behind user device @dev, as
 * given by dsa_port_to_conduit().
 */
static inline struct net_device *
dsa_user_to_conduit(const struct net_device *dev)
{
        return dsa_port_to_conduit(dsa_user_to_port(dev));
}

#endif















































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
/* SPDX-License-Identifier: GPL-1.0+ */
/*
 * Bond several ethernet interfaces into a Cisco, running 'Etherchannel'.
 *
 * Portions are (c) Copyright 1995 Simon "Guru Aleph-Null" Janes
 * NCM: Network and Communications Management, Inc.
 *
 * BUT, I'm the one who modified it for ethernet, so:
 * (c) Copyright 1999, Thomas Davis, tadavis@lbl.gov
 *
 */

#ifndef _NET_BONDING_H
#define _NET_BONDING_H

#include <linux/timer.h>
#include <linux/proc_fs.h>
#include <linux/if_bonding.h>
#include <linux/cpumask.h>
#include <linux/in6.h>
#include <linux/netpoll.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/reciprocal_div.h>
#include <linux/if_link.h>

#include <net/bond_3ad.h>
#include <net/bond_alb.h>
#include <net/bond_options.h>
#include <net/ipv6.h>
#include <net/addrconf.h>

#define BOND_MAX_ARP_TARGETS        16
#define BOND_MAX_NS_TARGETS        BOND_MAX_ARP_TARGETS

#define BOND_DEFAULT_MIIMON        100

#ifndef __long_aligned
#define __long_aligned __attribute__((aligned((sizeof(long)))))
#endif

/*
 * Per-slave logging helpers.
 *
 * Each one forwards to the matching netdev_{info,warn,dbg,err}() call on
 * @bond_dev and prefixes the message with "(slave <name>): ", where <name>
 * is taken from (slave_dev)->name.
 */
#define slave_info(bond_dev, slave_dev, fmt, ...) \
        netdev_info(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
#define slave_warn(bond_dev, slave_dev, fmt, ...) \
        netdev_warn(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
#define slave_dbg(bond_dev, slave_dev, fmt, ...) \
        netdev_dbg(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)
#define slave_err(bond_dev, slave_dev, fmt, ...) \
        netdev_err(bond_dev, "(slave %s): " fmt, (slave_dev)->name, ##__VA_ARGS__)

#define BOND_MODE(bond) ((bond)->params.mode)

/* slave list primitives */
#define bond_slave_list(bond) (&(bond)->dev->adj_list.lower)

/* True when the bond's lower-device (slave) list is not empty. */
#define bond_has_slaves(bond) (!list_empty(bond_slave_list(bond)))

/* IMPORTANT: bond_first/last_slave can return NULL in case of an empty list.
 *
 * Otherwise they return the private data of the adjacency entry at the
 * head (->next) or tail (->prev) of the bond's slave list.
 */
#define bond_first_slave(bond) \
        (bond_has_slaves(bond) ? \
                netdev_adjacent_get_private(bond_slave_list(bond)->next) : \
                NULL)
#define bond_last_slave(bond) \
        (bond_has_slaves(bond) ? \
                netdev_adjacent_get_private(bond_slave_list(bond)->prev) : \
                NULL)

/*
 * First slave of @bond, looked up via
 * netdev_lower_get_first_private_rcu() on the bond's net_device.
 *
 * Caller must have rcu_read_lock.
 */
#define bond_first_slave_rcu(bond) \
        netdev_lower_get_first_private_rcu((bond)->dev)

/* Compare @pos against the first / last slave of @bond. */
#define bond_is_first_slave(bond, pos) ((pos) == bond_first_slave(bond))
#define bond_is_last_slave(bond, pos) ((pos) == bond_last_slave(bond))

/**
 * bond_for_each_slave - iterate over all slaves
 * @bond:        the bond holding this list
 * @pos:        current slave
 * @iter:        list_head * iterator
 *
 * Caller must hold RTNL
 */
#define bond_for_each_slave(bond, pos, iter) \
        netdev_for_each_lower_private((bond)->dev, pos, iter)

/* Caller must have rcu_read_lock */
#define bond_for_each_slave_rcu(bond, pos, iter) \
        netdev_for_each_lower_private_rcu((bond)->dev, pos, iter)

#define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \
                            NETIF_F_GSO_ESP)

#ifdef CONFIG_NET_POLL_CONTROLLER
extern atomic_t netpoll_block_tx;

/* Increment the global netpoll_block_tx counter. */
static inline void block_netpoll_tx(void)
{
        atomic_inc(&netpoll_block_tx);
}

/* Decrement the global netpoll_block_tx counter. */
static inline void unblock_netpoll_tx(void)
{
        atomic_dec(&netpoll_block_tx);
}

/*
 * Report the current netpoll_block_tx count, but only while netpoll
 * transmission is running on @dev; otherwise report 0.
 */
static inline int is_netpoll_tx_blocked(struct net_device *dev)
{
        if (likely(!netpoll_tx_running(dev)))
                return 0;

        return atomic_read(&netpoll_block_tx);
}
#else
#define block_netpoll_tx()
#define unblock_netpoll_tx()
#define is_netpoll_tx_blocked(dev) (0)
#endif

/*
 * Configuration parameters of a bond. One instance is embedded in
 * struct bonding as ->params; BOND_MODE() reads ->mode from it.
 */
struct bond_params {
        int mode;
        int xmit_policy;
        int miimon;
        u8 num_peer_notif;
        u8 missed_max;
        int arp_interval;
        int arp_validate;
        int arp_all_targets;
        int use_carrier;
        int fail_over_mac;
        int updelay;
        int downdelay;
        int peer_notif_delay;
        int lacp_active;
        int lacp_fast;
        unsigned int min_links;
        int ad_select;
        char primary[IFNAMSIZ];
        int primary_reselect;
        /* A zero entry terminates the list (see bond_get_targets_ip()) */
        __be32 arp_targets[BOND_MAX_ARP_TARGETS];
        int tx_queues;
        int all_slaves_active;
        int resend_igmp;
        int lp_interval;
        int packets_per_slave;
        int tlb_dynamic_lb;
        struct reciprocal_value reciprocal_packets_per_slave;
        u16 ad_actor_sys_prio;
        u16 ad_user_port_key;
#if IS_ENABLED(CONFIG_IPV6)
        /* An all-zero address terminates the list (see bond_get_targets_ip6()) */
        struct in6_addr ns_targets[BOND_MAX_NS_TARGETS];
#endif
        int coupled_control;

        /* 2 bytes of padding : see ether_addr_equal_64bits() */
        u8 ad_actor_system[ETH_ALEN + 2];
};

/*
 * State kept for one enslaved net_device. ->bond points back to the
 * owning bond; the embedded ->kobj is what to_slave() maps back from.
 */
struct slave {
        struct net_device *dev; /* first - useful for panic debug */
        struct bonding *bond; /* our master */
        int    delay;
        /* all 4 in jiffies */
        unsigned long last_link_up;
        unsigned long last_tx;
        unsigned long last_rx;
        unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS];
        s8     link;                /* one of BOND_LINK_XXXX */
        s8     link_new_state;        /* one of BOND_LINK_XXXX */
        u8     backup:1,   /* indicates backup slave. Value corresponds with
                              BOND_STATE_ACTIVE and BOND_STATE_BACKUP */
               inactive:1, /* indicates inactive slave */
               rx_disabled:1, /* indicates whether slave's Rx is disabled */
               should_notify:1, /* indicates whether the state changed */
               should_notify_link:1; /* indicates whether the link changed */
        u8     duplex;
        u32    original_mtu;
        u32    link_failure_count;
        u32    speed;
        u16    queue_id;
        u8     perm_hwaddr[MAX_ADDR_LEN];
        int    prio;
        struct ad_slave_info *ad_info;
        struct tlb_slave_info tlb_info;
#ifdef CONFIG_NET_POLL_CONTROLLER
        struct netpoll *np; /* used by bond_netpoll_send_skb() */
#endif
        struct delayed_work notify_work;
        struct kobject kobj;
        struct rtnl_link_stats64 slave_stats;
};

/* Map a pointer to the embedded ->kobj back to its containing struct slave. */
static inline struct slave *to_slave(struct kobject *kobj)
{
        return container_of(kobj, struct slave, kobj);
}

/*
 * Array of slave pointers with its element count. struct bonding holds
 * __rcu pointers to two of these (->usable_slaves and ->all_slaves).
 */
struct bond_up_slave {
        unsigned int        count;
        struct rcu_head rcu;
        struct slave        *arr[];
};

/*
 * Link pseudo-state only used internally by monitors
 */
#define BOND_LINK_NOCHANGE -1

/*
 * List node wrapping one xfrm_state pointer.
 * NOTE(review): presumably linked on bonding->ipsec_list — confirm.
 */
struct bond_ipsec {
        struct list_head list;
        struct xfrm_state *xs;
};

/*
 * Per-bond state (one instance per bonding net_device).
 *
 * Locking policy for the slave list: take rcu_read_lock() when reading
 * it, or hold RTNL when writing it.
 */
struct bonding {
        struct   net_device *dev; /* first - useful for panic debug */
        struct   slave __rcu *curr_active_slave;
        struct   slave __rcu *current_arp_slave;
        struct   slave __rcu *primary_slave;
        struct   bond_up_slave __rcu *usable_slaves;
        struct   bond_up_slave __rcu *all_slaves;
        bool     force_primary;
        bool     notifier_ctx;
        s32      slave_cnt; /* never change this value outside the attach/detach wrappers */
        int     (*recv_probe)(const struct sk_buff *, struct bonding *,
                              struct slave *);
        /* mode_lock is used for mode-specific locking needs, currently used by:
         * 3ad mode (4) - protect against running bond_3ad_unbind_slave() and
         *                bond_3ad_state_machine_handler() concurrently and also
         *                the access to the state machine shared variables.
         * TLB mode (5) - to sync the use and modifications of its hash table
         * ALB mode (6) - to sync the use and modifications of its hash table
         */
        spinlock_t mode_lock;
        spinlock_t stats_lock;
        u32         send_peer_notif;
        u8       igmp_retrans;
#ifdef CONFIG_PROC_FS
        struct   proc_dir_entry *proc_entry;
        char     proc_file_name[IFNAMSIZ];
#endif /* CONFIG_PROC_FS */
        struct   list_head bond_list;
        u32 __percpu *rr_tx_counter;
        struct   ad_bond_info ad_info;
        struct   alb_bond_info alb_info;
        struct   bond_params params;
        struct   workqueue_struct *wq;
        struct   delayed_work mii_work;
        struct   delayed_work arp_work;
        struct   delayed_work alb_work;
        struct   delayed_work ad_work;
        struct   delayed_work mcast_work;
        struct   delayed_work slave_arr_work;
#ifdef CONFIG_DEBUG_FS
        /* debugging support via debugfs */
        struct         dentry *debug_dir;
#endif /* CONFIG_DEBUG_FS */
        struct rtnl_link_stats64 bond_stats;
#ifdef CONFIG_XFRM_OFFLOAD
        struct list_head ipsec_list;
        /* protecting ipsec_list */
        struct mutex ipsec_lock;
#endif /* CONFIG_XFRM_OFFLOAD */
        struct bpf_prog *xdp_prog;
};

/* Slave stored in @dev's rx_handler_data, fetched with rcu_dereference(). */
#define bond_slave_get_rcu(dev) \
        ((struct slave *) rcu_dereference((dev)->rx_handler_data))

/* Slave stored in @dev's rx_handler_data, fetched with rtnl_dereference(). */
#define bond_slave_get_rtnl(dev) \
        ((struct slave *) rtnl_dereference((dev)->rx_handler_data))

void bond_queue_slave_event(struct slave *slave);
void bond_lower_state_changed(struct slave *slave);

/*
 * One VLAN tag: protocol (big-endian) and VLAN id.
 * bond_verify_device_path() returns a pointer to this type.
 */
struct bond_vlan_tag {
        __be16                vlan_proto;
        unsigned short        vlan_id;
};

/*
 * Look up the slave of @bond that corresponds to @slave_dev.
 *
 * Returns NULL if the net_device does not belong to any of the bond's
 * slaves.
 *
 * Caller must hold bond lock for read
 */
static inline struct slave *bond_get_slave_by_dev(struct bonding *bond,
                                                  struct net_device *slave_dev)
{
        struct net_device *bond_dev = bond->dev;

        return netdev_lower_dev_get_private(bond_dev, slave_dev);
}

/* Return the bond that @slave belongs to (its ->bond back-pointer). */
static inline struct bonding *bond_get_bond_by_slave(struct slave *slave)
{
        struct bonding *master = slave->bond;

        return master;
}

static inline bool bond_should_override_tx_queue(struct bonding *bond)
{
        return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP ||
               BOND_MODE(bond) == BOND_MODE_ROUNDROBIN;
}

static inline bool bond_is_lb(const struct bonding *bond)
{
        return BOND_MODE(bond) == BOND_MODE_TLB ||
               BOND_MODE(bond) == BOND_MODE_ALB;
}

static inline bool bond_needs_speed_duplex(const struct bonding *bond)
{
        return BOND_MODE(bond) == BOND_MODE_8023AD || bond_is_lb(bond);
}

static inline bool bond_is_nondyn_tlb(const struct bonding *bond)
{
        return (bond_is_lb(bond) && bond->params.tlb_dynamic_lb == 0);
}

static inline bool bond_mode_can_use_xmit_hash(const struct bonding *bond)
{
        return (BOND_MODE(bond) == BOND_MODE_8023AD ||
                BOND_MODE(bond) == BOND_MODE_XOR ||
                BOND_MODE(bond) == BOND_MODE_TLB ||
                BOND_MODE(bond) == BOND_MODE_ALB);
}

static inline bool bond_mode_uses_xmit_hash(const struct bonding *bond)
{
        return (BOND_MODE(bond) == BOND_MODE_8023AD ||
                BOND_MODE(bond) == BOND_MODE_XOR ||
                bond_is_nondyn_tlb(bond));
}

/* True for every mode except 802.3ad, TLB and ALB. */
static inline bool bond_mode_uses_arp(int mode)
{
        return !(mode == BOND_MODE_8023AD ||
                 mode == BOND_MODE_TLB ||
                 mode == BOND_MODE_ALB);
}

static inline bool bond_mode_uses_primary(int mode)
{
        return mode == BOND_MODE_ACTIVEBACKUP || mode == BOND_MODE_TLB ||
               mode == BOND_MODE_ALB;
}

/* bond_mode_uses_primary() applied to the mode @bond is currently in. */
static inline bool bond_uses_primary(struct bonding *bond)
{
        const int mode = BOND_MODE(bond);

        return bond_mode_uses_primary(mode);
}

/*
 * Return the net_device of the bond's current active slave, or NULL when
 * the bond's mode does not use a primary or no active slave is set.
 * The pointer is read with rcu_dereference_rtnl().
 */
static inline struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond)
{
        struct slave *active = rcu_dereference_rtnl(bond->curr_active_slave);

        if (!bond_uses_primary(bond) || !active)
                return NULL;

        return active->dev;
}

static inline bool bond_slave_is_up(struct slave *slave)
{
        return netif_running(slave->dev) && netif_carrier_ok(slave->dev);
}

/*
 * Clear the slave's backup flag. Does nothing if it is already clear;
 * otherwise also queues a slave event and reports the lower-state change.
 */
static inline void bond_set_active_slave(struct slave *slave)
{
        if (!slave->backup)
                return;

        slave->backup = 0;
        bond_queue_slave_event(slave);
        bond_lower_state_changed(slave);
}

/*
 * Set the slave's backup flag. Does nothing if it is already set;
 * otherwise also queues a slave event and reports the lower-state change.
 */
static inline void bond_set_backup_slave(struct slave *slave)
{
        if (slave->backup)
                return;

        slave->backup = 1;
        bond_queue_slave_event(slave);
        bond_lower_state_changed(slave);
}

/*
 * Store @slave_state in the slave's backup flag.
 *
 * Nothing happens if the flag already has that value. With @notify set,
 * the change is reported right away and should_notify is cleared. Without
 * it, should_notify is toggled so that bond_slave_state_notify() can
 * report the change later.
 */
static inline void bond_set_slave_state(struct slave *slave,
                                        int slave_state, bool notify)
{
        if (slave->backup == slave_state)
                return;

        slave->backup = slave_state;

        if (!notify) {
                /* Flip the pending-notification flag. */
                slave->should_notify = !slave->should_notify;
                return;
        }

        bond_lower_state_changed(slave);
        bond_queue_slave_event(slave);
        slave->should_notify = 0;
}

/*
 * Walk all slaves of @bond: a slave whose link is BOND_LINK_UP is made
 * active, one whose link is BOND_LINK_DOWN is made backup. Slaves in any
 * other link state are left alone.
 */
static inline void bond_slave_state_change(struct bonding *bond)
{
        struct list_head *iter;
        struct slave *slave;

        bond_for_each_slave(bond, slave, iter) {
                if (slave->link == BOND_LINK_UP) {
                        bond_set_active_slave(slave);
                        continue;
                }

                if (slave->link == BOND_LINK_DOWN)
                        bond_set_backup_slave(slave);
        }
}

/*
 * Report the lower-state change for every slave that has should_notify
 * set, then clear that flag.
 */
static inline void bond_slave_state_notify(struct bonding *bond)
{
        struct list_head *iter;
        struct slave *slave;

        bond_for_each_slave(bond, slave, iter) {
                if (!slave->should_notify)
                        continue;

                bond_lower_state_changed(slave);
                slave->should_notify = 0;
        }
}

/*
 * Return the slave's backup flag (0 or 1); per the field's comment in
 * struct slave the value corresponds with BOND_STATE_ACTIVE and
 * BOND_STATE_BACKUP.
 */
static inline int bond_slave_state(struct slave *slave)
{
        return slave->backup;
}

/* True when the slave's state (backup flag) is zero. */
static inline bool bond_is_active_slave(struct slave *slave)
{
        return bond_slave_state(slave) == 0;
}

static inline bool bond_slave_can_tx(struct slave *slave)
{
        return bond_slave_is_up(slave) && slave->link == BOND_LINK_UP &&
               bond_is_active_slave(slave);
}

/*
 * Check, under rcu_read_lock(), whether the slave attached to @slave_dev
 * is active.
 *
 * NOTE(review): the slave pointer is used without a NULL check;
 * presumably callers only pass devices that are enslaved - confirm.
 */
static inline bool bond_is_active_slave_dev(const struct net_device *slave_dev)
{
        bool is_active;

        rcu_read_lock();
        is_active = bond_is_active_slave(bond_slave_get_rcu(slave_dev));
        rcu_read_unlock();

        return is_active;
}

/*
 * Copy a hardware address of @len bytes from @src to @dst.
 * Addresses of exactly ETH_ALEN bytes go through ether_addr_copy(); any
 * other length is copied with memcpy().
 */
static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len)
{
        if (len != ETH_ALEN) {
                memcpy(dst, src, len);
                return;
        }

        ether_addr_copy(dst, src);
}

#define BOND_PRI_RESELECT_ALWAYS        0
#define BOND_PRI_RESELECT_BETTER        1
#define BOND_PRI_RESELECT_FAILURE        2

#define BOND_FOM_NONE                        0
#define BOND_FOM_ACTIVE                        1
#define BOND_FOM_FOLLOW                        2

#define BOND_ARP_TARGETS_ANY                0
#define BOND_ARP_TARGETS_ALL                1

#define BOND_ARP_VALIDATE_NONE                0
#define BOND_ARP_VALIDATE_ACTIVE        (1 << BOND_STATE_ACTIVE)
#define BOND_ARP_VALIDATE_BACKUP        (1 << BOND_STATE_BACKUP)
#define BOND_ARP_VALIDATE_ALL                (BOND_ARP_VALIDATE_ACTIVE | \
                                         BOND_ARP_VALIDATE_BACKUP)
#define BOND_ARP_FILTER                        (BOND_ARP_VALIDATE_ALL + 1)
#define BOND_ARP_FILTER_ACTIVE                (BOND_ARP_VALIDATE_ACTIVE | \
                                         BOND_ARP_FILTER)
#define BOND_ARP_FILTER_BACKUP                (BOND_ARP_VALIDATE_BACKUP | \
                                         BOND_ARP_FILTER)

#define BOND_SLAVE_NOTIFY_NOW                true
#define BOND_SLAVE_NOTIFY_LATER                false

/*
 * Test the arp_validate bit that matches the slave's current state
 * (bit number = bond_slave_state()). Returns non-zero when that bit is set.
 */
static inline int slave_do_arp_validate(struct bonding *bond,
                                        struct slave *slave)
{
        const int state_bit = 1 << bond_slave_state(slave);

        return bond->params.arp_validate & state_bit;
}

/* Non-zero when the BOND_ARP_FILTER bit is set in the bond's arp_validate. */
static inline int slave_do_arp_validate_only(struct bonding *bond)
{
        const int validate = bond->params.arp_validate;

        return validate & BOND_ARP_FILTER;
}

/*
 * An IPv4 target is acceptable unless ipv4_is_lbcast() or
 * ipv4_is_zeronet() matches it. Returns 1 if acceptable, 0 otherwise.
 */
static inline int bond_is_ip_target_ok(__be32 addr)
{
        if (ipv4_is_lbcast(addr))
                return 0;

        return !ipv4_is_zeronet(addr);
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * An IPv6 target is acceptable unless it is the unspecified address, the
 * loopback address, or a multicast address. Returns 1 if acceptable,
 * 0 otherwise.
 */
static inline int bond_is_ip6_target_ok(struct in6_addr *addr)
{
        if (ipv6_addr_any(addr))
                return 0;
        if (ipv6_addr_loopback(addr))
                return 0;

        return !ipv6_addr_is_multicast(addr);
}
#endif

/* Get the oldest ARP receive time recorded on this slave for the bond's
 * arp_targets.
 *
 * Slot 0 is always considered. Later slots are considered up to the first
 * zero entry in arp_targets. Times are compared with time_before().
 */
static inline unsigned long slave_oldest_target_arp_rx(struct bonding *bond,
                                                       struct slave *slave)
{
        unsigned long oldest = slave->target_last_arp_rx[0];
        int i;

        for (i = 1; i < BOND_MAX_ARP_TARGETS; i++) {
                if (!bond->params.arp_targets[i])
                        break;

                if (time_before(slave->target_last_arp_rx[i], oldest))
                        oldest = slave->target_last_arp_rx[i];
        }

        return oldest;
}

/*
 * Last-receive time to use for @slave. When arp_all_targets is
 * BOND_ARP_TARGETS_ALL this is the oldest per-target ARP receive time;
 * otherwise it is the slave's last_rx.
 */
static inline unsigned long slave_last_rx(struct bonding *bond,
                                        struct slave *slave)
{
        if (bond->params.arp_all_targets != BOND_ARP_TARGETS_ALL)
                return slave->last_rx;

        return slave_oldest_target_arp_rx(bond, slave);
}

/* Record the current jiffies value as the slave's last_tx (WRITE_ONCE). */
static inline void slave_update_last_tx(struct slave *slave)
{
        WRITE_ONCE(slave->last_tx, jiffies);
}

/* Read the slave's last_tx (READ_ONCE), pairing with slave_update_last_tx(). */
static inline unsigned long slave_last_tx(struct slave *slave)
{
        return READ_ONCE(slave->last_tx);
}

#ifdef CONFIG_NET_POLL_CONTROLLER
/* Send @skb through the slave's netpoll instance (slave->np). */
static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave,
                                         struct sk_buff *skb)
{
        return netpoll_send_skb(slave->np, skb);
}
#else
/*
 * Variant built without CONFIG_NET_POLL_CONTROLLER: calling it hits BUG().
 * The return statement only satisfies the function's return type.
 */
static inline netdev_tx_t bond_netpoll_send_skb(const struct slave *slave,
                                         struct sk_buff *skb)
{
        BUG();
        return NETDEV_TX_OK;
}
#endif

/*
 * Mark @slave as inactive:
 *  - unless the bond is in a load-balancing mode, set the slave state to
 *    BOND_STATE_BACKUP (passing @notify through);
 *  - unless all_slaves_active is set, set the inactive flag;
 *  - in 802.3ad mode, additionally set rx_disabled.
 */
static inline void bond_set_slave_inactive_flags(struct slave *slave,
                                                 bool notify)
{
        struct bonding *bond = slave->bond;

        if (!bond_is_lb(bond))
                bond_set_slave_state(slave, BOND_STATE_BACKUP, notify);

        if (!bond->params.all_slaves_active)
                slave->inactive = 1;

        if (BOND_MODE(bond) == BOND_MODE_8023AD)
                slave->rx_disabled = 1;
}

/*
 * Set the slave state to BOND_STATE_BACKUP, passing @notify through to
 * bond_set_slave_state(). No other flag is touched here.
 */
static inline void bond_set_slave_tx_disabled_flags(struct slave *slave,
                                                 bool notify)
{
        bond_set_slave_state(slave, BOND_STATE_BACKUP, notify);
}

/*
 * Mark @slave as active: set its state to BOND_STATE_ACTIVE (passing
 * @notify through), clear the inactive flag and, in 802.3ad mode, clear
 * rx_disabled as well.
 */
static inline void bond_set_slave_active_flags(struct slave *slave,
                                               bool notify)
{
        bond_set_slave_state(slave, BOND_STATE_ACTIVE, notify);
        slave->inactive = 0;

        if (BOND_MODE(slave->bond) != BOND_MODE_8023AD)
                return;

        slave->rx_disabled = 0;
}

/* Clear the slave's rx_disabled flag. @notify is not used by this function. */
static inline void bond_set_slave_rx_enabled_flags(struct slave *slave,
                                               bool notify)
{
        slave->rx_disabled = 0;
}

/* Return the slave's inactive flag. */
static inline bool bond_is_slave_inactive(struct slave *slave)
{
        return slave->inactive;
}

/* Return the slave's rx_disabled flag. */
static inline bool bond_is_slave_rx_disabled(struct slave *slave)
{
        return slave->rx_disabled;
}

/*
 * Record @state as the slave's proposed link state (link_new_state).
 * slave->link itself is only updated by bond_commit_link_state().
 */
static inline void bond_propose_link_state(struct slave *slave, int state)
{
        slave->link_new_state = state;
}

/*
 * Copy the proposed link state (link_new_state) into slave->link.
 *
 * A proposal of BOND_LINK_NOCHANGE is ignored. With @notify set, the
 * change is reported right away and should_notify_link is cleared.
 * Without it, should_notify_link is toggled so that
 * bond_slave_link_notify() can report the change later.
 */
static inline void bond_commit_link_state(struct slave *slave, bool notify)
{
        if (slave->link_new_state == BOND_LINK_NOCHANGE)
                return;

        slave->link = slave->link_new_state;

        if (!notify) {
                /* Flip the pending-notification flag. */
                slave->should_notify_link = !slave->should_notify_link;
                return;
        }

        bond_queue_slave_event(slave);
        bond_lower_state_changed(slave);
        slave->should_notify_link = 0;
}

/* Propose @state as the new link state and commit it immediately. */
static inline void bond_set_slave_link_state(struct slave *slave, int state,
                                             bool notify)
{
        /* Same store that bond_propose_link_state() performs. */
        slave->link_new_state = state;

        bond_commit_link_state(slave, notify);
}

/*
 * For every slave that has should_notify_link set: queue a slave event,
 * report the lower-state change, and clear the flag.
 */
static inline void bond_slave_link_notify(struct bonding *bond)
{
        struct list_head *iter;
        struct slave *slave;

        bond_for_each_slave(bond, slave, iter) {
                if (!slave->should_notify_link)
                        continue;

                bond_queue_slave_event(slave);
                bond_lower_state_changed(slave);
                slave->should_notify_link = 0;
        }
}

/*
 * Run inet_confirm_addr() with RT_SCOPE_HOST on @dev's in_device, under
 * rcu_read_lock(). Returns its result, or 0 when @dev has no in_device.
 */
static inline __be32 bond_confirm_addr(struct net_device *dev, __be32 dst, __be32 local)
{
        struct in_device *idev;
        __be32 confirmed = 0;

        rcu_read_lock();

        idev = __in_dev_get_rcu(dev);
        if (idev != NULL)
                confirmed = inet_confirm_addr(dev_net(dev), idev, dst, local,
                                              RT_SCOPE_HOST);

        rcu_read_unlock();

        return confirmed;
}

/*
 * Bonding state kept per network namespace.
 * NOTE(review): presumably looked up via bond_net_id — confirm.
 */
struct bond_net {
        struct net                *net;        /* Associated network namespace */
        struct list_head        dev_list;
#ifdef CONFIG_PROC_FS
        struct proc_dir_entry        *proc_dir;
#endif
        struct class_attribute        class_attr_bonding_masters;
};

int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave);
netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev);
int bond_create(struct net *net, const char *name);
int bond_create_sysfs(struct bond_net *net);
void bond_destroy_sysfs(struct bond_net *net);
void bond_prepare_sysfs_group(struct bonding *bond);
int bond_sysfs_slave_add(struct slave *slave);
void bond_sysfs_slave_del(struct slave *slave);
void bond_xdp_set_features(struct net_device *bond_dev);
int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev,
                 struct netlink_ext_ack *extack);
int bond_release(struct net_device *bond_dev, struct net_device *slave_dev);
u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb);
int bond_set_carrier(struct bonding *bond);
void bond_select_active_slave(struct bonding *bond);
void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
void bond_create_debugfs(void);
void bond_destroy_debugfs(void);
void bond_debug_register(struct bonding *bond);
void bond_debug_unregister(struct bonding *bond);
void bond_debug_reregister(struct bonding *bond);
const char *bond_mode_name(int mode);
bool bond_xdp_check(struct bonding *bond, int mode);
void bond_setup(struct net_device *bond_dev);
unsigned int bond_get_num_tx_queues(void);
int bond_netlink_init(void);
void bond_netlink_fini(void);
struct net_device *bond_option_active_slave_get_rcu(struct bonding *bond);
const char *bond_slave_link_status(s8 link);
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
                                              struct net_device *end_dev,
                                              int level);
int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave);
void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay);
void bond_work_init_all(struct bonding *bond);

/*
 * procfs hooks. With CONFIG_PROC_FS they are declared here and defined
 * elsewhere; without it they are empty inline stubs.
 */
#ifdef CONFIG_PROC_FS
void bond_create_proc_entry(struct bonding *bond);
void bond_remove_proc_entry(struct bonding *bond);
void bond_create_proc_dir(struct bond_net *bn);
void bond_destroy_proc_dir(struct bond_net *bn);
#else
static inline void bond_create_proc_entry(struct bonding *bond)
{
}

static inline void bond_remove_proc_entry(struct bonding *bond)
{
}

static inline void bond_create_proc_dir(struct bond_net *bn)
{
}

static inline void bond_destroy_proc_dir(struct bond_net *bn)
{
}
#endif

static inline struct slave *bond_slave_has_mac(struct bonding *bond,
                                               const u8 *mac)
{
        struct list_head *iter;
        struct slave *tmp;

        bond_for_each_slave(bond, tmp, iter)
                if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
                        return tmp;

        return NULL;
}

/* Caller must hold rcu_read_lock() for read */
static inline bool bond_slave_has_mac_rcu(struct bonding *bond, const u8 *mac)
{
        struct list_head *iter;
        struct slave *tmp;

        bond_for_each_slave_rcu(bond, tmp, iter)
                if (ether_addr_equal_64bits(mac, tmp->dev->dev_addr))
                        return true;
        return false;
}

/* Search the ARP target list for @ip.
 *
 * The scan stops at the first zero entry, so with ip == 0 the result is
 * the index of the first free slot.
 * Returns the index if found, -1 if not found.
 */
static inline int bond_get_targets_ip(__be32 *targets, __be32 ip)
{
        int i;

        for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) {
                if (targets[i] == ip)
                        return i;

                /* A zero entry marks the end of the used slots. */
                if (targets[i] == 0)
                        break;
        }

        return -1;
}

#if IS_ENABLED(CONFIG_IPV6)
/*
 * Search the NS target list for @ip. An entry matches when @ip equals
 * either the entry itself or the solicited-node multicast address that
 * addrconf_addr_solict_mult() derives from it.
 *
 * The scan stops at the first unspecified (all-zero) entry.
 * Returns the index if found, -1 if not found.
 */
static inline int bond_get_targets_ip6(struct in6_addr *targets, struct in6_addr *ip)
{
        struct in6_addr mcaddr;
        int i;

        for (i = 0; i < BOND_MAX_NS_TARGETS; i++) {
                struct in6_addr *target = &targets[i];

                addrconf_addr_solict_mult(target, &mcaddr);

                if (ipv6_addr_equal(target, ip) ||
                    ipv6_addr_equal(&mcaddr, ip))
                        return i;

                if (ipv6_addr_any(target))
                        break;
        }

        return -1;
}
#endif

/* exported from bond_main.c */
extern unsigned int bond_net_id;

/* exported from bond_netlink.c */
extern struct rtnl_link_ops bond_link_ops;

/* exported from bond_sysfs_slave.c */
extern const struct sysfs_ops slave_sysfs_ops;

/* exported from bond_3ad.c */
extern const u8 lacpdu_mcast_addr[];

/*
 * Drop @skb: bump @dev's tx_dropped core statistic, free the skb with
 * dev_kfree_skb_any(), and return NET_XMIT_DROP.
 */
static inline netdev_tx_t bond_tx_drop(struct net_device *dev, struct sk_buff *skb)
{
        dev_core_stats_tx_dropped_inc(dev);
        dev_kfree_skb_any(skb);
        return NET_XMIT_DROP;
}

#endif /* _NET_BONDING_H */































    3 
   12 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __KVM_ARM_PSCI_H__
#define __KVM_ARM_PSCI_H__

#include <linux/kvm_host.h>
#include <uapi/linux/psci.h>

#define KVM_ARM_PSCI_0_1        PSCI_VERSION(0, 1)
#define KVM_ARM_PSCI_0_2        PSCI_VERSION(0, 2)
#define KVM_ARM_PSCI_1_0        PSCI_VERSION(1, 0)
#define KVM_ARM_PSCI_1_1        PSCI_VERSION(1, 1)
#define KVM_ARM_PSCI_1_2        PSCI_VERSION(1, 2)
#define KVM_ARM_PSCI_1_3        PSCI_VERSION(1, 3)

#define KVM_ARM_PSCI_LATEST        KVM_ARM_PSCI_1_3

/*
 * PSCI version exposed to @vcpu.
 *
 * Without the KVM_ARM_VCPU_PSCI_0_2 feature the answer is v0.1. With it,
 * the answer is the VM's arch.psci_version if that is non-zero, else
 * KVM_ARM_PSCI_LATEST.
 */
static inline int kvm_psci_version(struct kvm_vcpu *vcpu)
{
        if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_PSCI_0_2))
                return KVM_ARM_PSCI_0_1;

        /*
         * Our PSCI implementation stays the same across versions from
         * v0.2 onward, only adding the few mandatory functions (such
         * as FEATURES with 1.0) that are required by newer
         * revisions. It is thus safe to return the latest, unless
         * userspace has instructed us otherwise.
         */
        if (vcpu->kvm->arch.psci_version)
                return vcpu->kvm->arch.psci_version;

        return KVM_ARM_PSCI_LATEST;
}


int kvm_psci_call(struct kvm_vcpu *vcpu);

#endif /* __KVM_ARM_PSCI_H__ */






























































   61 




















   61 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef IOPRIO_H
#define IOPRIO_H

#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>

#include <uapi/linux/ioprio.h>

/*
 * Default IO priority.
 */
#define IOPRIO_DEFAULT        IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0)

/*
 * Check that a priority value has a valid class.
 */
static inline bool ioprio_valid(unsigned short ioprio)
{
        unsigned short class = IOPRIO_PRIO_CLASS(ioprio);

        return class > IOPRIO_CLASS_NONE && class <= IOPRIO_CLASS_IDLE;
}

/*
 * Convert the task's CPU scheduler nice value to an I/O priority level:
 * (nice + 20) / 5. Callers use this when the task has not set an I/O
 * priority explicitly (see __get_task_ioprio()).
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
        return (20 + task_nice(task)) / 5;
}

/*
 * Pick an I/O class for a task that hasn't asked for a specific one:
 * SCHED_IDLE tasks get IOPRIO_CLASS_IDLE, tasks with a realtime or
 * deadline policy get IOPRIO_CLASS_RT, everything else gets
 * IOPRIO_CLASS_BE.
 */
static inline int task_nice_ioclass(struct task_struct *task)
{
        if (task->policy == SCHED_IDLE)
                return IOPRIO_CLASS_IDLE;

        if (rt_or_dl_task_policy(task))
                return IOPRIO_CLASS_RT;

        return IOPRIO_CLASS_BE;
}

#ifdef CONFIG_BLOCK
/*
 * If the task has set an I/O priority, use that. Otherwise, return
 * the default I/O priority.
 *
 * Expected to be called for current task or with task_lock() held to keep
 * io_context stable.
 */
static inline int __get_task_ioprio(struct task_struct *p)
{
        struct io_context *ioc = p->io_context;
        int prio;

        if (!ioc)
                return IOPRIO_DEFAULT;

        if (p != current)
                lockdep_assert_held(&p->alloc_lock);

        prio = ioc->ioprio;
        if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
                prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
                                         task_nice_ioprio(p));
        return prio;
}
#else
/* Without CONFIG_BLOCK every task reports IOPRIO_DEFAULT. */
static inline int __get_task_ioprio(struct task_struct *p)
{
        return IOPRIO_DEFAULT;
}
#endif /* CONFIG_BLOCK */

/* Effective I/O priority of the current task (see __get_task_ioprio()). */
static inline int get_current_ioprio(void)
{
        struct task_struct *self = current;

        return __get_task_ioprio(self);
}

extern int set_task_ioprio(struct task_struct *task, int ioprio);

#ifdef CONFIG_BLOCK
extern int ioprio_check_cap(int ioprio);
#else
/*
 * Stub used without CONFIG_BLOCK: every I/O priority request is rejected
 * with -ENOTBLK, whatever @ioprio is.
 */
static inline int ioprio_check_cap(int ioprio)
{
        return -ENOTBLK;
}
#endif /* CONFIG_BLOCK */

#endif
















    5 













    5 























    5 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/lib/kasprintf.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

#include <linux/stdarg.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>

/*
 * Simplified asprintf.
 *
 * Formats @fmt with @ap into a new buffer obtained from
 * kmalloc_track_caller() with the @gfp flags. A first vsnprintf() pass on
 * a copy of @ap measures the length; a second pass fills the buffer.
 * Warns if the two passes report different lengths.
 *
 * Returns the buffer, or NULL if the allocation failed.
 */
char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
{
        unsigned int needed, written;
        va_list probe;
        char *buf;

        /* Measuring pass: consumes the copy, leaves @ap untouched. */
        va_copy(probe, ap);
        needed = vsnprintf(NULL, 0, fmt, probe);
        va_end(probe);

        buf = kmalloc_track_caller(needed + 1, gfp);
        if (buf) {
                written = vsnprintf(buf, needed + 1, fmt, ap);
                WARN(needed != written, "different return values (%u and %u) from vsnprintf(\"%s\", ...)",
                     needed, written, fmt);
        }

        return buf;
}
EXPORT_SYMBOL(kvasprintf);

/*
 * Like kvasprintf(), with two shortcuts that go through kstrdup_const():
 * a format with no '%' at all is duplicated as is, and a format that is
 * exactly "%s" duplicates its single string argument. If the string
 * involved points to rodata, this saves a memory allocation and a string
 * copy. In any case, the return value should be freed using
 * kfree_const().
 */
const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap)
{
        const char *only_arg;

        if (strchr(fmt, '%') == NULL)
                return kstrdup_const(fmt, gfp);

        if (strcmp(fmt, "%s") != 0)
                return kvasprintf(gfp, fmt, ap);

        only_arg = va_arg(ap, const char*);
        return kstrdup_const(only_arg, gfp);
}
EXPORT_SYMBOL(kvasprintf_const);

/*
 * Variadic front end to kvasprintf(): format @fmt and its arguments into
 * a newly allocated string. Returns whatever kvasprintf() returns (NULL
 * if the allocation failed).
 */
char *kasprintf(gfp_t gfp, const char *fmt, ...)
{
        va_list args;
        char *str;

        va_start(args, fmt);
        str = kvasprintf(gfp, fmt, args);
        va_end(args);

        return str;
}
EXPORT_SYMBOL(kasprintf);



































  216 


































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 */
#ifndef __ASM_WORD_AT_A_TIME_H
#define __ASM_WORD_AT_A_TIME_H

#include <linux/uaccess.h>

#ifndef __AARCH64EB__

#include <linux/bitops.h>
#include <linux/wordpart.h>

/*
 * Constants consumed by has_zero().  WORD_AT_A_TIME_CONSTANTS initialises
 * them from REPEAT_BYTE(0x01) and REPEAT_BYTE(0x80) respectively.
 */
struct word_at_a_time {
        const unsigned long one_bits;
        const unsigned long high_bits;
};

#define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) }

/*
 * has_zero - compute the zero-byte detection mask for one word.
 * @a: word to examine
 * @bits: receives the computed mask
 * @c: constants to use (one_bits and high_bits)
 *
 * Evaluates ((a - one_bits) & ~a) & high_bits, stores the result through
 * @bits and also returns it.  With one_bits = 0x01... and high_bits =
 * 0x80..., the result is non-zero exactly when @a contains a zero byte.
 */
static inline unsigned long has_zero(unsigned long a, unsigned long *bits,
                                     const struct word_at_a_time *c)
{
        unsigned long borrowed = a - c->one_bits;
        unsigned long found = borrowed & ~a & c->high_bits;

        *bits = found;
        return found;
}

/*
 * prep_zero_mask() and create_zero_mask() both expand to their @bits
 * argument unchanged; prep_zero_mask() ignores @a and @c entirely.
 */
#define prep_zero_mask(a, bits, c) (bits)
#define create_zero_mask(bits) (bits)
/*
 * __ffs(bits) divided by 8 (>> 3).  Assuming __ffs() yields the zero-based
 * index of the lowest set bit (TODO confirm), this is the index of the byte
 * holding that bit.
 */
#define find_zero(bits) (__ffs(bits) >> 3)

/*
 * zero_bytemask - mask covering everything below the first marker bit.
 * @bits: mask whose lowest set bit is the marker of interest
 *
 * Builds a mask of all bits strictly below the lowest set bit of @bits and
 * shifts it right by 7.  When the marker is a byte's top bit (0x80), the
 * result has 0xff in every byte below the marked one.
 */
static inline unsigned long zero_bytemask(unsigned long bits)
{
        unsigned long below_marker = ~bits & (bits - 1);

        return below_marker >> 7;
}

#else        /* __AARCH64EB__ */
#include <asm-generic/word-at-a-time.h>
#endif

/*
 * Load an unaligned word from kernel space.
 *
 * In the (very unlikely) case of the word being a page-crosser
 * and the next page not being mapped, take the exception and
 * return zeroes in the non-existing part.
 *
 * @addr: address of the word to load; no alignment is required.
 *
 * Returns the loaded word.
 */
static inline unsigned long load_unaligned_zeropad(const void *addr)
{
        unsigned long ret;

        /*
         * NOTE(review): presumably suppresses MTE tag-check faults for the
         * duration of the load -- confirm against the helper's definition.
         */
        __mte_enable_tco_async();

        /*
         * Load word from unaligned pointer addr.
         *
         * The exception-table entry covers the load at label 1 and names
         * label 2 as the resume point; it is also given %0 (ret) and
         * %1 (addr) as operands for the fixup.
         */
        asm(
        "1:        ldr        %0, %2\n"
        "2:\n"
        _ASM_EXTABLE_LOAD_UNALIGNED_ZEROPAD(1b, 2b, %0, %1)
        : "=&r" (ret)
        : "r" (addr), "Q" (*(unsigned long *)addr));

        /* Undo the __mte_enable_tco_async() call made above. */
        __mte_disable_tco_async();

        return ret;
}

#endif /* __ASM_WORD_AT_A_TIME_H */










































































































































































































































































































































 1509 
 1511 



   22 
   22 



 1508 
 1510 




















































































































































































































































































































































































































































































































 1508 

























  421 
 1511 

























    1 










    1 


























  835 

















  837 



  836 







































































































































































































  212 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * Read-Copy Update mechanism for mutual exclusion
 *
 * Copyright IBM Corporation, 2001
 *
 * Author: Dipankar Sarma <dipankar@in.ibm.com>
 *
 * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com>
 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
 * Papers:
 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *                http://lse.sourceforge.net/locking/rcupdate.html
 *
 */

#ifndef __LINUX_RCUPDATE_H
#define __LINUX_RCUPDATE_H

#include <linux/types.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
#include <linux/irqflags.h>
#include <linux/preempt.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/processor.h>
#include <linux/context_tracking_irq.h>

/*
 * Wraparound-tolerant comparisons of unsigned long counters: @a counts as
 * "greater than or equal to" @b when the modular difference (a) - (b) is
 * at most ULONG_MAX / 2, and as "less than" @b otherwise.
 */
#define ULONG_CMP_GE(a, b)        (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)        (ULONG_MAX / 2 < (a) - (b))

/*
 * RCU_SEQ_STATE_MASK selects the low RCU_SEQ_CTR_SHIFT bits (0x3 with a
 * shift of 2).  Presumably those bits carry state and the counter sits in
 * the bits above them -- confirm against the rcu_seq_*() users.
 */
#define RCU_SEQ_CTR_SHIFT    2
#define RCU_SEQ_STATE_MASK   ((1 << RCU_SEQ_CTR_SHIFT) - 1)

/* Exported common interfaces */
void call_rcu(struct rcu_head *head, rcu_callback_t func);
void rcu_barrier_tasks(void);
void synchronize_rcu(void);

struct rcu_gp_oldstate;
unsigned long get_completed_synchronize_rcu(void);
void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp);

// Maximum number of unsigned long values corresponding to
// not-yet-completed RCU grace periods.
#define NUM_ACTIVE_RCU_POLL_OLDSTATE 2

/**
 * same_state_synchronize_rcu - Compare two grace-period old-state values
 * @oldstate1: First old-state value.
 * @oldstate2: Second old-state value.
 *
 * Both arguments must have come from get_state_synchronize_rcu(),
 * start_poll_synchronize_rcu(), or get_completed_synchronize_rcu().
 * Returns @true when they are equal and @false when they differ.  A
 * structure whose lifetime is tracked by an old-state value can therefore
 * share one value kept in a list header instead of storing its own, which
 * makes the structure slightly smaller.
 */
static inline bool same_state_synchronize_rcu(unsigned long oldstate1, unsigned long oldstate2)
{
        if (oldstate1 != oldstate2)
                return false;

        return true;
}

#ifdef CONFIG_PREEMPT_RCU

void __rcu_read_lock(void);
void __rcu_read_unlock(void);

/*
 * Defined as a macro as it is a very low level header included from
 * areas that don't even know about current.  This gives the rcu_read_lock()
 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
 */
#define rcu_preempt_depth() READ_ONCE(current->rcu_read_lock_nesting)

#else /* #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_TINY_RCU
#define rcu_read_unlock_strict() do { } while (0)
#else
void rcu_read_unlock_strict(void);
#endif

/*
 * Read-side entry for kernels built without CONFIG_PREEMPT_RCU: the only
 * work done here is disabling preemption.
 */
static inline void __rcu_read_lock(void)
{
        preempt_disable();
}

/*
 * Read-side exit for kernels built without CONFIG_PREEMPT_RCU: re-enables
 * preemption, calling rcu_read_unlock_strict() first when
 * CONFIG_RCU_STRICT_GRACE_PERIOD is enabled.
 */
static inline void __rcu_read_unlock(void)
{
        if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
                rcu_read_unlock_strict();
        preempt_enable();
}

/*
 * Variant for kernels built without CONFIG_PREEMPT_RCU, where the
 * rcu_read_lock() nesting depth is unknowable: always returns 0.
 */
static inline int rcu_preempt_depth(void)
{
        return 0;
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_RCU_LAZY
void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func);
#else
/* Without CONFIG_RCU_LAZY, call_rcu_hurry() simply forwards to call_rcu(). */
static inline void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func)
{
        call_rcu(head, func);
}
#endif

/* Internal to kernel */
void rcu_init(void);
extern int rcu_scheduler_active;
void rcu_sched_clock_irq(int user);

#ifdef CONFIG_RCU_STALL_COMMON
void rcu_sysrq_start(void);
void rcu_sysrq_end(void);
#else /* #ifdef CONFIG_RCU_STALL_COMMON */
/* No-op stubs used when CONFIG_RCU_STALL_COMMON is not set. */
static inline void rcu_sysrq_start(void) { }
static inline void rcu_sysrq_end(void) { }
#endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */

#if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK))
void rcu_irq_work_resched(void);
#else
/* No-op stub for configurations the preceding #if condition does not select. */
static __always_inline void rcu_irq_work_resched(void) { }
#endif

#ifdef CONFIG_RCU_NOCB_CPU
void rcu_init_nohz(void);
int rcu_nocb_cpu_offload(int cpu);
int rcu_nocb_cpu_deoffload(int cpu);
void rcu_nocb_flush_deferred_wakeup(void);

#define RCU_NOCB_LOCKDEP_WARN(c, s) RCU_LOCKDEP_WARN(c, s)

#else /* #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Stubs for kernels built without CONFIG_RCU_NOCB_CPU.  Note the asymmetry:
 * offloading reports -EINVAL, whereas deoffloading reports success (0).
 */
static inline void rcu_init_nohz(void) { }
static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; }
static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; }
static inline void rcu_nocb_flush_deferred_wakeup(void) { }

/* Expands to nothing here, so neither argument is ever evaluated. */
#define RCU_NOCB_LOCKDEP_WARN(c, s)

#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Note a quasi-voluntary context switch for the benefit of RCU-tasks.
 * This is a macro rather than an inline function to avoid #include hell.
 */
#ifdef CONFIG_TASKS_RCU_GENERIC

# ifdef CONFIG_TASKS_RCU
/*
 * Clear (t)->rcu_tasks_holdout, but only when @preempt is false and the
 * flag is currently set; the READ_ONCE() test avoids the store when the
 * flag is already clear.  Note that @t is evaluated twice.
 */
# define rcu_tasks_classic_qs(t, preempt)                                \
        do {                                                                \
                if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout))        \
                        WRITE_ONCE((t)->rcu_tasks_holdout, false);        \
        } while (0)
void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func);
void synchronize_rcu_tasks(void);
void rcu_tasks_torture_stats_print(char *tt, char *tf);
# else
# define rcu_tasks_classic_qs(t, preempt) do { } while (0)
# define call_rcu_tasks call_rcu
# define synchronize_rcu_tasks synchronize_rcu
# endif

# ifdef CONFIG_TASKS_TRACE_RCU
// Bits for ->trc_reader_special.b.need_qs field.
#define TRC_NEED_QS                0x1  // Task needs a quiescent state.
#define TRC_NEED_QS_CHECKED        0x2  // Task has been checked for needing quiescent state.

u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new);
void rcu_tasks_trace_qs_blkd(struct task_struct *t);

/*
 * Quiescent-state hook for RCU Tasks Trace, applied to task @t.  The
 * reader nesting count is sampled once, then:
 *  - if need_qs equals TRC_NEED_QS and the sampled nesting is zero,
 *    rcu_trc_cmpxchg_need_qs() is asked to move need_qs from TRC_NEED_QS
 *    to TRC_NEED_QS_CHECKED;
 *  - otherwise, if the sampled nesting is non-zero and not INT_MIN, and
 *    the task's "blocked" flag is not yet set, rcu_tasks_trace_qs_blkd()
 *    is called.
 * Note that @t is evaluated several times.
 */
# define rcu_tasks_trace_qs(t)                                                        \
        do {                                                                        \
                int ___rttq_nesting = READ_ONCE((t)->trc_reader_nesting);        \
                                                                                \
                if (unlikely(READ_ONCE((t)->trc_reader_special.b.need_qs) == TRC_NEED_QS) &&        \
                    likely(!___rttq_nesting)) {                                        \
                        rcu_trc_cmpxchg_need_qs((t), TRC_NEED_QS, TRC_NEED_QS_CHECKED);        \
                } else if (___rttq_nesting && ___rttq_nesting != INT_MIN &&        \
                           !READ_ONCE((t)->trc_reader_special.b.blocked)) {        \
                        rcu_tasks_trace_qs_blkd(t);                                \
                }                                                                \
        } while (0)
void rcu_tasks_trace_torture_stats_print(char *tt, char *tf);
# else
# define rcu_tasks_trace_qs(t) do { } while (0)
# endif

/*
 * Run both quiescent-state hooks for task @t: rcu_tasks_classic_qs()
 * (which takes @preempt into account) followed by rcu_tasks_trace_qs().
 */
#define rcu_tasks_qs(t, preempt)                                        \
do {                                                                        \
        rcu_tasks_classic_qs((t), (preempt));                                \
        rcu_tasks_trace_qs(t);                                                \
} while (0)

# ifdef CONFIG_TASKS_RUDE_RCU
void synchronize_rcu_tasks_rude(void);
void rcu_tasks_rude_torture_stats_print(char *tt, char *tf);
# endif

#define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false)
void exit_tasks_rcu_start(void);
void exit_tasks_rcu_finish(void);
#else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
/*
 * Fallbacks for kernels built without CONFIG_TASKS_RCU_GENERIC: the
 * quiescent-state hooks expand to empty statements, the Tasks API names
 * alias call_rcu() and synchronize_rcu(), and the exit hooks are no-ops.
 */
#define rcu_tasks_classic_qs(t, preempt) do { } while (0)
#define rcu_tasks_qs(t, preempt) do { } while (0)
#define rcu_note_voluntary_context_switch(t) do { } while (0)
#define call_rcu_tasks call_rcu
#define synchronize_rcu_tasks synchronize_rcu
static inline void exit_tasks_rcu_start(void) { }
static inline void exit_tasks_rcu_finish(void) { }
#endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */

/**
 * rcu_trace_implies_rcu_gp - does an RCU Tasks Trace grace period imply an RCU grace period?
 *
 * As an accident of implementation, an RCU Tasks Trace grace period also
 * acts as an RCU grace period.  However, this could change at any time.
 * Code relying on this accident must call this function to verify that
 * this accident is still happening.
 *
 * You have been warned!
 */
static inline bool rcu_trace_implies_rcu_gp(void) { return true; }

/**
 * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU
 *
 * This macro resembles cond_resched(), except that it is defined to
 * report potential quiescent states to RCU-tasks even if the cond_resched()
 * machinery were to be shut off, as some advocate for PREEMPTION kernels.
 */
#define cond_resched_tasks_rcu_qs() \
do { \
        rcu_tasks_qs(current, false); \
        cond_resched(); \
} while (0)

/**
 * rcu_softirq_qs_periodic - Report RCU and RCU-Tasks quiescent states
 * @old_ts: jiffies at start of processing.
 *
 * This helper is for long-running softirq handlers, such as NAPI threads in
 * networking. The caller should initialize the variable passed in as @old_ts
 * at the beginning of the softirq handler. When invoked frequently, this macro
 * will invoke rcu_softirq_qs() every 100 milliseconds thereafter, which will
 * provide both RCU and RCU-Tasks quiescent states. Note that this macro
 * modifies its old_ts argument.
 *
 * Because regions of code that have disabled softirq act as RCU read-side
 * critical sections, this macro should be invoked with softirq (and
 * preemption) enabled.
 *
 * The macro is not needed when CONFIG_PREEMPT_RT is defined. RT kernels would
 * have more chance to invoke schedule() calls and provide necessary quiescent
 * states. By contrast, calling only cond_resched() won't achieve the same
 * effect because cond_resched() does not provide RCU-Tasks quiescent states.
 */
#define rcu_softirq_qs_periodic(old_ts) \
do { \
        if (!IS_ENABLED(CONFIG_PREEMPT_RT) && \
            time_after(jiffies, (old_ts) + HZ / 10)) { \
                preempt_disable(); \
                rcu_softirq_qs(); \
                preempt_enable(); \
                (old_ts) = jiffies; \
        } \
} while (0)

/*
 * Infrastructure to implement the synchronize_() primitives in
 * TREE_RCU and rcu_barrier_() primitives in TINY_RCU.
 */

#if defined(CONFIG_TREE_RCU)
#include <linux/rcutree.h>
#elif defined(CONFIG_TINY_RCU)
#include <linux/rcutiny.h>
#else
#error "Unknown RCU implementation specified to kernel configuration"
#endif

/*
 * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls
 * are needed for dynamic initialization and destruction of rcu_head
 * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for
 * dynamic initialization and destruction of statically allocated rcu_head
 * structures.  However, rcu_head structures allocated dynamically in the
 * heap don't need any initialization.
 */
#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
void init_rcu_head(struct rcu_head *head);
void destroy_rcu_head(struct rcu_head *head);
void init_rcu_head_on_stack(struct rcu_head *head);
void destroy_rcu_head_on_stack(struct rcu_head *head);
#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/* No-op versions used when CONFIG_DEBUG_OBJECTS_RCU_HEAD is not set. */
static inline void init_rcu_head(struct rcu_head *head) { }
static inline void destroy_rcu_head(struct rcu_head *head) { }
static inline void init_rcu_head_on_stack(struct rcu_head *head) { }
static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { }
#endif        /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU)
bool rcu_lockdep_current_cpu_online(void);
#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */
/* Unless both CONFIG_HOTPLUG_CPU and CONFIG_PROVE_RCU are set, always report true. */
static inline bool rcu_lockdep_current_cpu_online(void) { return true; }
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */

extern struct lockdep_map rcu_lock_map;
extern struct lockdep_map rcu_bh_lock_map;
extern struct lockdep_map rcu_sched_lock_map;
extern struct lockdep_map rcu_callback_map;

#ifdef CONFIG_DEBUG_LOCK_ALLOC

/*
 * Tell lockdep that @map has been acquired at the caller's location.
 * NOTE(review): the literal arguments are presumably subclass 0, trylock 0,
 * read 2, check 0 and no nest lock -- confirm against lock_acquire().
 */
static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

/*
 * Same as rcu_lock_acquire() except that the third lock_acquire() argument
 * is 1 instead of 0 (presumably the "trylock" flag -- confirm).
 */
static inline void rcu_try_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 1, 2, 0, NULL, _THIS_IP_);
}

/* Tell lockdep that @map has been released at the caller's location. */
static inline void rcu_lock_release(struct lockdep_map *map)
{
        lock_release(map, _THIS_IP_);
}

int debug_lockdep_rcu_enabled(void);
int rcu_read_lock_held(void);
int rcu_read_lock_bh_held(void);
int rcu_read_lock_sched_held(void);
int rcu_read_lock_any_held(void);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

# define rcu_lock_acquire(a)                do { } while (0)
# define rcu_try_lock_acquire(a)        do { } while (0)
# define rcu_lock_release(a)                do { } while (0)

/* Without CONFIG_DEBUG_LOCK_ALLOC this check cannot fail: always returns 1. */
static inline int rcu_read_lock_held(void)
{
        return 1;
}

/* Without CONFIG_DEBUG_LOCK_ALLOC this check cannot fail: always returns 1. */
static inline int rcu_read_lock_bh_held(void)
{
        return 1;
}

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC, approximate "held" by "the current
 * context is not preemptible".
 */
static inline int rcu_read_lock_sched_held(void)
{
        return !preemptible();
}

/*
 * Without CONFIG_DEBUG_LOCK_ALLOC, approximate "held" by "the current
 * context is not preemptible".
 */
static inline int rcu_read_lock_any_held(void)
{
        return !preemptible();
}

/* Without CONFIG_DEBUG_LOCK_ALLOC, lockdep-based RCU checking is reported as disabled. */
static inline int debug_lockdep_rcu_enabled(void)
{
        return 0;
}

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */

#ifdef CONFIG_PROVE_RCU

/**
 * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met
 * @c: condition to check
 * @s: informative message
 *
 * This checks debug_lockdep_rcu_enabled() before checking (c) to
 * prevent early boot splats due to lockdep not yet being initialized,
 * and rechecks it after checking (c) to prevent false-positive splats
 * due to races with lockdep being disabled.  See commit 3066820034b5dd
 * ("rcu: Reject RCU_LOCKDEP_WARN() false positives") for more detail.
 */
#define RCU_LOCKDEP_WARN(c, s)                                                \
        do {                                                                \
                static bool __section(".data..unlikely") __warned;        \
                if (debug_lockdep_rcu_enabled() && (c) &&                \
                    debug_lockdep_rcu_enabled() && !__warned) {                \
                        __warned = true;                                \
                        lockdep_rcu_suspicious(__FILE__, __LINE__, s);        \
                }                                                        \
        } while (0)

#ifndef CONFIG_PREEMPT_RCU
/*
 * Variant for kernels built without CONFIG_PREEMPT_RCU: emit an
 * RCU_LOCKDEP_WARN() if lockdep records rcu_lock_map as held.
 */
static inline void rcu_preempt_sleep_check(void)
{
        RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
                         "Illegal context switch in RCU read-side critical section");
}
#else // #ifndef CONFIG_PREEMPT_RCU
static inline void rcu_preempt_sleep_check(void) { }
#endif // #else // #ifndef CONFIG_PREEMPT_RCU

/*
 * Emit RCU_LOCKDEP_WARN() splats for the situations checked below, in this
 * order: rcu_preempt_sleep_check() first, then rcu_bh_lock_map being held
 * (this check is skipped when CONFIG_PREEMPT_RT is enabled), then
 * rcu_sched_lock_map being held.
 */
#define rcu_sleep_check()                                                \
        do {                                                                \
                rcu_preempt_sleep_check();                                \
                if (!IS_ENABLED(CONFIG_PREEMPT_RT))                        \
                    RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),        \
                                 "Illegal context switch in RCU-bh read-side critical section"); \
                RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),        \
                                 "Illegal context switch in RCU-sched read-side critical section"); \
        } while (0)

// Decide whether a lockdep_assert_in_rcu_*() macro should splat.  Returns
// true only if lockdep-RCU checking is enabled both before and after the
// test, and @c is set or RCU is not watching or the current CPU is not
// online according to rcu_lockdep_current_cpu_online().  The reason
// debug_lockdep_rcu_enabled() is called twice is explained in the
// RCU_LOCKDEP_WARN() header comment.
static inline bool lockdep_assert_rcu_helper(bool c)
{
        if (!debug_lockdep_rcu_enabled())
                return false;

        /* Nothing wrong: condition clear, RCU watching, CPU online. */
        if (!c && rcu_is_watching() && rcu_lockdep_current_cpu_online())
                return false;

        return debug_lockdep_rcu_enabled();
}

/**
 * lockdep_assert_in_rcu_read_lock - WARN if not protected by rcu_read_lock()
 *
 * Splats if lockdep is enabled and there is no rcu_read_lock() in effect.
 */
#define lockdep_assert_in_rcu_read_lock() \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map)))

/**
 * lockdep_assert_in_rcu_read_lock_bh - WARN if not protected by rcu_read_lock_bh()
 *
 * Splats if lockdep is enabled and there is no rcu_read_lock_bh() in effect.
 * Note that local_bh_disable() and friends do not suffice here, instead an
 * actual rcu_read_lock_bh() is required.
 */
#define lockdep_assert_in_rcu_read_lock_bh() \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_bh_lock_map)))

/**
 * lockdep_assert_in_rcu_read_lock_sched - WARN if not protected by rcu_read_lock_sched()
 *
 * Splats if lockdep is enabled and there is no rcu_read_lock_sched()
 * in effect.  Note that preempt_disable() and friends do not suffice here,
 * instead an actual rcu_read_lock_sched() is required.
 */
#define lockdep_assert_in_rcu_read_lock_sched() \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_sched_lock_map)))

/**
 * lockdep_assert_in_rcu_reader - WARN if not within some type of RCU reader
 *
 * Splats if lockdep is enabled and there is no RCU reader of any
 * type in effect.  Note that regions of code protected by things like
 * preempt_disable, local_bh_disable(), and local_irq_disable() all qualify
 * as RCU readers.
 *
 * Note that this will never trigger in PREEMPT_NONE or PREEMPT_VOLUNTARY
 * kernels that are not also built with PREEMPT_COUNT.  But if you have
 * lockdep enabled, you might as well also enable PREEMPT_COUNT.
 */
#define lockdep_assert_in_rcu_reader()                                                                \
        WARN_ON_ONCE(lockdep_assert_rcu_helper(!lock_is_held(&rcu_lock_map) &&                        \
                                               !lock_is_held(&rcu_bh_lock_map) &&                \
                                               !lock_is_held(&rcu_sched_lock_map) &&                \
                                               preemptible()))

#else /* #ifdef CONFIG_PROVE_RCU */

/*
 * Without CONFIG_PROVE_RCU every check compiles away.  RCU_LOCKDEP_WARN()
 * keeps @c inside "0 && (c)", so the condition is still parsed by the
 * compiler but, thanks to short-circuiting, never evaluated; @s is dropped.
 */
#define RCU_LOCKDEP_WARN(c, s) do { } while (0 && (c))
#define rcu_sleep_check() do { } while (0)

#define lockdep_assert_in_rcu_read_lock() do { } while (0)
#define lockdep_assert_in_rcu_read_lock_bh() do { } while (0)
#define lockdep_assert_in_rcu_read_lock_sched() do { } while (0)
#define lockdep_assert_in_rcu_reader() do { } while (0)

#endif /* #else #ifdef CONFIG_PROVE_RCU */

/*
 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
 * and rcu_assign_pointer().  Some of these could be folded into their
 * callers, but they are left separate in order to ease introduction of
 * multiple pointer markings to match different RCU implementations
 * (e.g., __srcu), should this make sense in the future.
 */

/*
 * Only under __CHECKER__: compare @p with itself cast to a pointer in
 * address space @space and discard the result.  Presumably this lets sparse
 * complain when @p lacks the @space annotation (confirm against the sparse
 * documentation).  In ordinary builds the macro expands to nothing.
 */
#ifdef __CHECKER__
#define rcu_check_sparse(p, space) \
        ((void)(((typeof(*p) space *)p) == p))
#else /* #ifdef __CHECKER__ */
#define rcu_check_sparse(p, space)
#endif /* #else #ifdef __CHECKER__ */

/*
 * Implementation of unrcu_pointer(): copy @p into the temporary named
 * @local through a __force cast, run the sparse __rcu check on @p, and
 * yield the temporary cast to a __kernel pointer.  @local is the identifier
 * to use for the temporary.
 */
#define __unrcu_pointer(p, local)                                        \
({                                                                        \
        typeof(*p) *local = (typeof(*p) *__force)(p);                        \
        rcu_check_sparse(p, __rcu);                                        \
        ((typeof(*p) __force __kernel *)(local));                         \
})
/**
 * unrcu_pointer - mark a pointer as not being RCU protected
 * @p: pointer needing to lose its __rcu property
 *
 * Converts @p from an __rcu pointer to a __kernel pointer.
 * This allows an __rcu pointer to be used with xchg() and friends.
 */
#define unrcu_pointer(p) __unrcu_pointer(p, __UNIQUE_ID(rcu))

/*
 * Read @p once with READ_ONCE() into the temporary named @local, run the
 * sparse check for address space @space, and yield the value as a __kernel
 * pointer.  No lockdep condition is checked here.
 */
#define __rcu_access_pointer(p, local, space) \
({ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * Like __rcu_access_pointer(), but additionally emits an RCU_LOCKDEP_WARN()
 * when condition @c is false.  The READ_ONCE() of @p is performed before
 * the lockdep check.
 */
#define __rcu_dereference_check(p, local, c, space) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(*p) *local = (typeof(*p) *__force)READ_ONCE(p); \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(local)); \
})
/*
 * Emit an RCU_LOCKDEP_WARN() when @c is false, run the sparse check, and
 * yield @p as a __kernel pointer.  Unlike the macros above, @p is read
 * plainly (no READ_ONCE()) and the @local parameter is not used.
 */
#define __rcu_dereference_protected(p, local, c, space) \
({ \
        RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \
        rcu_check_sparse(p, space); \
        ((typeof(*p) __force __kernel *)(p)); \
})
/*
 * Read @p once with READ_ONCE() into the temporary named @local and yield
 * it as a __kernel pointer, with neither a lockdep check nor a sparse
 * address-space check.
 */
#define __rcu_dereference_raw(p, local) \
({ \
        /* Dependency order vs. p above. */ \
        typeof(p) local = READ_ONCE(p); \
        ((typeof(*p) __force __kernel *)(local)); \
})
#define rcu_dereference_raw(p) __rcu_dereference_raw(p, __UNIQUE_ID(rcu))

/**
 * RCU_INITIALIZER() - statically initialize an RCU-protected global variable
 * @v: The value to statically initialize with.
 */
#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v)

/**
 * rcu_assign_pointer() - assign to RCU-protected pointer
 * @p: pointer to assign to
 * @v: value to assign (publish)
 *
 * Assigns the specified value to the specified RCU-protected
 * pointer, ensuring that any concurrent RCU readers will see
 * any prior initialization.
 *
 * Inserts memory barriers on architectures that require them
 * (which is most of them), and also prevents the compiler from
 * reordering the code that initializes the structure after the pointer
 * assignment.  More importantly, this call documents which pointers
 * will be dereferenced by RCU read-side code.
 *
 * In some special cases, you may use RCU_INIT_POINTER() instead
 * of rcu_assign_pointer().  RCU_INIT_POINTER() is a bit faster due
 * to the fact that it does not constrain either the CPU or the compiler.
 * That said, using RCU_INIT_POINTER() when you should have used
 * rcu_assign_pointer() is a very bad thing that results in
 * impossible-to-diagnose memory corruption.  So please be careful.
 * See the RCU_INIT_POINTER() comment header for details.
 *
 * Note that rcu_assign_pointer() evaluates each of its arguments only
 * once, appearances notwithstanding.  One of the "extra" evaluations
 * is in typeof() and the other visible only to sparse (__CHECKER__),
 * neither of which actually execute the argument.  As with most cpp
 * macros, this execute-arguments-only-once property is important, so
 * please be careful when making changes to rcu_assign_pointer() and the
 * other macros that it invokes.
 */
#define rcu_assign_pointer(p, v)                                              \
do {                                                                              \
        uintptr_t _r_a_p__v = (uintptr_t)(v);                                      \
        rcu_check_sparse(p, __rcu);                                              \
                                                                              \
        if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL)              \
                WRITE_ONCE((p), (typeof(p))(_r_a_p__v));                      \
        else                                                                      \
                smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \
} while (0)

/**
 * rcu_replace_pointer() - swap in a new value for an RCU pointer
 * @rcu_ptr: RCU-annotated pointer to update; its previous value is returned
 * @ptr: regular pointer to store into @rcu_ptr
 * @c: lockdep condition handed to rcu_dereference_protected() for the read
 *
 * Read the current value of @rcu_ptr with rcu_dereference_protected()
 * under condition @c, store @ptr into @rcu_ptr with rcu_assign_pointer(),
 * and evaluate to the value that was read beforehand.
 */
#define rcu_replace_pointer(rcu_ptr, ptr, c)                                \
({                                                                        \
        typeof(ptr) __rrp_prev =                                        \
                rcu_dereference_protected((rcu_ptr), (c));                \
                                                                        \
        rcu_assign_pointer((rcu_ptr), (ptr));                                \
        __rrp_prev;                                                        \
})

/**
 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
 * @p: The pointer to read
 *
 * Return the value of the specified RCU-protected pointer, but omit the
 * lockdep checks for being in an RCU read-side critical section.  This is
 * useful when the value of this pointer is accessed, but the pointer is
 * not dereferenced, for example, when testing an RCU-protected pointer
 * against NULL.  Although rcu_access_pointer() may also be used in cases
 * where update-side locks prevent the value of the pointer from changing,
 * you should instead use rcu_dereference_protected() for this use case.
 * Within an RCU read-side critical section, there is little reason to
 * use rcu_access_pointer().
 *
 * It is usually best to test the rcu_access_pointer() return value
 * directly in order to avoid accidental dereferences being introduced
 * by later inattentive changes.  In other words, assigning the
 * rcu_access_pointer() return value to a local variable results in an
 * accident waiting to happen.
 *
 * It is also permissible to use rcu_access_pointer() when read-side
 * access to the pointer was removed at least one grace period ago, as is
 * the case in the context of the RCU callback that is freeing up the data,
 * or after a synchronize_rcu() returns.  This can be useful when tearing
 * down multi-linked structures after a grace period has elapsed.  However,
 * rcu_dereference_protected() is normally preferred for this use case.
 *
 * The macro hands @p, a __UNIQUE_ID(rcu)-generated identifier and the
 * __rcu marker to __rcu_access_pointer(), which does the actual work.
 */
#define rcu_access_pointer(p) __rcu_access_pointer((p), __UNIQUE_ID(rcu), __rcu)

/**
 * rcu_dereference_check() - rcu_dereference with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Do an rcu_dereference(), but check that the conditions under which the
 * dereference will take place are correct.  Typically the conditions
 * indicate the various locking conditions that should be held at that
 * point.  The check should return true if the conditions are satisfied.
 * An implicit check for being in an RCU read-side critical section
 * (rcu_read_lock()) is included: the condition actually handed to
 * __rcu_dereference_check() is (@c) || rcu_read_lock_held().
 *
 * For example:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
 *
 * could be used to indicate to lockdep that foo->bar may only be dereferenced
 * if either rcu_read_lock() is held, or that the lock required to replace
 * the bar struct at foo->bar is held.
 *
 * Note that the list of conditions may also include indications of when a lock
 * need not be held, for example during initialisation or destruction of the
 * target struct:
 *
 *        bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
 *                                              atomic_read(&foo->usage) == 0);
 *
 * Inserts memory barriers on architectures that require them
 * (currently only the Alpha), prevents the compiler from refetching
 * (and from merging fetches), and, more importantly, documents exactly
 * which pointers are protected by RCU and checks that the pointer is
 * annotated as __rcu.
 */
#define rcu_dereference_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_held(), __rcu)

/**
 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-bh counterpart to rcu_dereference_check().  However,
 * please note that starting in v5.0 kernels, vanilla RCU grace periods
 * wait for local_bh_disable() regions of code in addition to regions of
 * code demarked by rcu_read_lock() and rcu_read_unlock().  This means
 * that synchronize_rcu(), call_rcu(), and friends all take not only
 * rcu_read_lock() but also rcu_read_lock_bh() into account.
 *
 * The condition actually handed to __rcu_dereference_check() is
 * (@c) || rcu_read_lock_bh_held().
 */
#define rcu_dereference_bh_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_bh_held(), __rcu)

/**
 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * This is the RCU-sched counterpart to rcu_dereference_check().
 * However, please note that starting in v5.0 kernels, vanilla RCU grace
 * periods wait for preempt_disable() regions of code in addition to
 * regions of code demarked by rcu_read_lock() and rcu_read_unlock().
 * This means that synchronize_rcu(), call_rcu(), and friends all take not
 * only rcu_read_lock() but also rcu_read_lock_sched() into account.
 *
 * The condition actually handed to __rcu_dereference_check() is
 * (@c) || rcu_read_lock_sched_held().
 */
#define rcu_dereference_sched_check(p, c) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), \
                                (c) || rcu_read_lock_sched_held(), \
                                __rcu)

/*
 * The tracing infrastructure traces RCU (we want that), but unfortunately
 * some of the RCU checks cause tracing to lock up the system.
 *
 * The no-tracing version of rcu_dereference_raw() must not call
 * rcu_read_lock_held().
 *
 * Accordingly, the condition passed to __rcu_dereference_check() here is
 * the constant 1 rather than a call to one of the *_held() helpers.
 */
#define rcu_dereference_raw_check(p) \
        __rcu_dereference_check((p), __UNIQUE_ID(rcu), 1, __rcu)

/**
 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
 * @p: The pointer to read, prior to dereferencing
 * @c: The conditions under which the dereference will take place
 *
 * Return the value of the specified RCU-protected pointer, but omit
 * the READ_ONCE().  This is useful in cases where update-side locks
 * prevent the value of the pointer from changing.  Please note that this
 * primitive does *not* prevent the compiler from repeating this reference
 * or combining it with other references, so it should not be used without
 * protection of appropriate locks.
 *
 * This function is only for update-side use.  Using this function
 * when protected only by rcu_read_lock() will result in infrequent
 * but very ugly failures.
 *
 * @c is passed to __rcu_dereference_protected() unchanged; unlike
 * rcu_dereference_check(), no rcu_read_lock_held() alternative is OR'ed in.
 */
#define rcu_dereference_protected(p, c) \
        __rcu_dereference_protected((p), __UNIQUE_ID(rcu), (c), __rcu)


/**
 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * This is a simple wrapper around rcu_dereference_check() that supplies 0
 * as the caller condition, leaving only the checks that
 * rcu_dereference_check() itself adds.
 */
#define rcu_dereference(p) rcu_dereference_check(p, 0)

/**
 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_bh_check() do the dirty work, supplying 0 as the
 * caller condition.
 */
#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)

/**
 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
 * @p: The pointer to read, prior to dereferencing
 *
 * Makes rcu_dereference_sched_check() do the dirty work, supplying 0 as
 * the caller condition.
 */
#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)

/**
 * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism
 * @p: The pointer to hand off
 *
 * This is simply an identity function, but it documents where a pointer
 * is handed off from RCU to some other synchronization mechanism, for
 * example, reference counting or locking.  In C11, it would map to
 * kill_dependency().  It could be used as follows::
 *
 *        rcu_read_lock();
 *        p = rcu_dereference(gp);
 *        long_lived = is_long_lived(p);
 *        if (long_lived) {
 *                if (!atomic_inc_not_zero(p->refcnt))
 *                        long_lived = false;
 *                else
 *                        p = rcu_pointer_handoff(p);
 *        }
 *        rcu_read_unlock();
 *
 * The macro expands to just (@p), so @p is evaluated exactly once.
 */
#define rcu_pointer_handoff(p) (p)

/**
 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
 *
 * When synchronize_rcu() is invoked on one CPU while other CPUs
 * are within RCU read-side critical sections, then the
 * synchronize_rcu() is guaranteed to block until after all the other
 * CPUs exit their critical sections.  Similarly, if call_rcu() is invoked
 * on one CPU while other CPUs are within RCU read-side critical
 * sections, invocation of the corresponding RCU callback is deferred
 * until after all the other CPUs exit their critical sections.
 *
 * Both synchronize_rcu() and call_rcu() also wait for regions of code
 * with preemption disabled, including regions of code with interrupts or
 * softirqs disabled.
 *
 * Note, however, that RCU callbacks are permitted to run concurrently
 * with new RCU read-side critical sections.  One way that this can happen
 * is via the following sequence of events: (1) CPU 0 enters an RCU
 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
 * (4) CPU 2 enters an RCU read-side critical section, (5) the RCU
 * callback is invoked.  This is legal, because the RCU read-side critical
 * section that was running concurrently with the call_rcu() (and which
 * therefore might be referencing something that the corresponding RCU
 * callback would free up) has completed before the corresponding
 * RCU callback is invoked.
 *
 * RCU read-side critical sections may be nested.  Any deferred actions
 * will be deferred until the outermost RCU read-side critical section
 * completes.
 *
 * You can avoid reading and understanding the next paragraph by
 * following this rule: don't put anything in an rcu_read_lock() RCU
 * read-side critical section that would block in a !PREEMPTION kernel.
 * But if you want the full story, read on!
 *
 * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU),
 * it is illegal to block while in an RCU read-side critical section.
 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
 * kernel builds, RCU read-side critical sections may be preempted,
 * but explicit blocking is illegal.  Finally, in preemptible RCU
 * implementations in real-time (with -rt patchset) kernel builds, RCU
 * read-side critical sections may be preempted and they may also block, but
 * only when acquiring spinlocks that are subject to priority inheritance.
 */
static __always_inline void rcu_read_lock(void)
{
        /* Enter the read-side critical section proper. */
        __rcu_read_lock();
        /* Context annotation for sparse; paired with __release(RCU). */
        __acquire(RCU);
        /* Record the acquisition against rcu_lock_map (lockdep, presumably). */
        rcu_lock_acquire(&rcu_lock_map);
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock() used illegally while idle");
}

/*
 * So where is rcu_write_lock()?  It does not exist, as there is no
 * way for writers to lock out RCU readers.  This is a feature, not
 * a bug -- this property is what provides RCU's performance benefits.
 * Of course, writers must coordinate with each other.  The normal
 * spinlock primitives work well for this, but any other technique may be
 * used as well.  RCU does not care how the writers keep out of each
 * others' way, as long as they do so.
 */

/**
 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
 *
 * In almost all situations, rcu_read_unlock() is immune from deadlock.
 * This deadlock immunity also extends to the scheduler's runqueue
 * and priority-inheritance spinlocks, courtesy of the quiescent-state
 * deferral that is carried out when rcu_read_unlock() is invoked with
 * interrupts disabled.
 *
 * The steps below undo those of rcu_read_lock() in reverse order.
 *
 * See rcu_read_lock() for more information.
 */
static inline void rcu_read_unlock(void)
{
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock() used illegally while idle");
        rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
        /* Context annotation for sparse; pairs with __acquire(RCU). */
        __release(RCU);
        /* Leave the read-side critical section proper. */
        __rcu_read_unlock();
}

/**
 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables softirqs.
 * Note that anything else that disables softirqs can also serve as an RCU
 * read-side critical section.  However, please note that this equivalence
 * applies only to v5.0 and later.  Before v5.0, rcu_read_lock() and
 * rcu_read_lock_bh() were unrelated.
 *
 * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh()
 * was invoked from some other task.
 */
static inline void rcu_read_lock_bh(void)
{
        /* Disabling softirqs is what opens the critical section here. */
        local_bh_disable();
        /* Context annotation for sparse; paired with __release(RCU_BH). */
        __acquire(RCU_BH);
        /* Record the acquisition against rcu_bh_lock_map. */
        rcu_lock_acquire(&rcu_bh_lock_map);
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_bh() used illegally while idle");
}

/**
 * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section
 *
 * Undoes the steps of rcu_read_lock_bh() in reverse order, finishing with
 * local_bh_enable().
 *
 * See rcu_read_lock_bh() for more information.
 */
static inline void rcu_read_unlock_bh(void)
{
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_bh() used illegally while idle");
        rcu_lock_release(&rcu_bh_lock_map);
        /* Context annotation for sparse; pairs with __acquire(RCU_BH). */
        __release(RCU_BH);
        local_bh_enable();
}

/**
 * rcu_read_lock_sched() - mark the beginning of an RCU-sched critical section
 *
 * This is equivalent to rcu_read_lock(), but also disables preemption.
 * Read-side critical sections can also be introduced by anything else that
 * disables preemption, including local_irq_disable() and friends.  However,
 * please note that the equivalence to rcu_read_lock() applies only to
 * v5.0 and later.  Before v5.0, rcu_read_lock() and rcu_read_lock_sched()
 * were unrelated.
 *
 * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched()
 * must occur in the same context, for example, it is illegal to invoke
 * rcu_read_unlock_sched() from process context if the matching
 * rcu_read_lock_sched() was invoked from an NMI handler.
 */
static inline void rcu_read_lock_sched(void)
{
        /* Disabling preemption is what opens the critical section here. */
        preempt_disable();
        /* Context annotation for sparse; paired with __release(RCU_SCHED). */
        __acquire(RCU_SCHED);
        /* Record the acquisition against rcu_sched_lock_map. */
        rcu_lock_acquire(&rcu_sched_lock_map);
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_lock_sched() used illegally while idle");
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Compared with rcu_read_lock_sched(), this uses the _notrace preemption
 * primitive and omits both rcu_lock_acquire() and RCU_LOCKDEP_WARN().
 */
static inline notrace void rcu_read_lock_sched_notrace(void)
{
        preempt_disable_notrace();
        __acquire(RCU_SCHED);
}

/**
 * rcu_read_unlock_sched() - marks the end of an RCU-sched critical section
 *
 * Undoes the steps of rcu_read_lock_sched() in reverse order, finishing
 * with preempt_enable().
 *
 * See rcu_read_lock_sched() for more information.
 */
static inline void rcu_read_unlock_sched(void)
{
        RCU_LOCKDEP_WARN(!rcu_is_watching(),
                         "rcu_read_unlock_sched() used illegally while idle");
        rcu_lock_release(&rcu_sched_lock_map);
        /* Context annotation for sparse; pairs with __acquire(RCU_SCHED). */
        __release(RCU_SCHED);
        preempt_enable();
}

/*
 * Used by lockdep and tracing: cannot be traced, cannot call lockdep.
 *
 * Counterpart of rcu_read_lock_sched_notrace(): drops the sparse context
 * annotation, then re-enables preemption with the _notrace primitive.
 */
static inline notrace void rcu_read_unlock_sched_notrace(void)
{
        __release(RCU_SCHED);
        preempt_enable_notrace();
}

/**
 * RCU_INIT_POINTER() - initialize an RCU protected pointer
 * @p: The pointer to be initialized.
 * @v: The value to initialized the pointer to.
 *
 * Initialize an RCU-protected pointer in special cases where readers
 * do not need ordering constraints on the CPU or the compiler.  These
 * special cases are:
 *
 * 1.        This use of RCU_INIT_POINTER() is NULLing out the pointer *or*
 * 2.        The caller has taken whatever steps are required to prevent
 *        RCU readers from concurrently accessing this pointer *or*
 * 3.        The referenced data structure has already been exposed to
 *        readers either at compile time or via rcu_assign_pointer() *and*
 *
 *        a.        You have not made *any* reader-visible changes to
 *                this structure since then *or*
 *        b.        It is OK for readers accessing this structure from its
 *                new location to see the old state of the structure.  (For
 *                example, the changes were to statistical counters or to
 *                other state where exact synchronization is not required.)
 *
 * Failure to follow these rules governing use of RCU_INIT_POINTER() will
 * result in impossible-to-diagnose memory corruption.  As in the structures
 * will look OK in crash dumps, but any concurrent RCU readers might
 * see pre-initialized values of the referenced data structure.  So
 * please be very careful how you use RCU_INIT_POINTER()!!!
 *
 * If you are creating an RCU-protected linked structure that is accessed
 * by a single external-to-structure RCU-protected pointer, then you may
 * use RCU_INIT_POINTER() to initialize the internal RCU-protected
 * pointers, but you must use rcu_assign_pointer() to initialize the
 * external-to-structure pointer *after* you have completely initialized
 * the reader-accessible portions of the linked structure.
 *
 * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no
 * ordering guarantees for either the CPU or the compiler.
 */
#define RCU_INIT_POINTER(p, v) \
        do { \
                rcu_check_sparse(p, __rcu); \
                WRITE_ONCE(p, RCU_INITIALIZER(v)); \
        } while (0)

/**
 * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer
 * @p: The name of the structure field holding the pointer to be initialized.
 * @v: The value to initialize the pointer to.
 *
 * GCC-style initialization for an RCU-protected pointer in a structure field.
 * Expands to the designated initializer ".@p = RCU_INITIALIZER(@v)".
 */
#define RCU_POINTER_INITIALIZER(p, v) \
                .p = RCU_INITIALIZER(v)

/**
 * kfree_rcu() - kfree an object after a grace period.
 * @ptr: pointer to kfree for double-argument invocations.
 * @rhf: the name of the struct rcu_head within the type of @ptr.
 *
 * Many RCU callback functions just call kfree() on the base structure.
 * These functions are trivial, but their size adds up, and furthermore
 * when they are used in a kernel module, that module must invoke the
 * high-latency rcu_barrier() function at module-unload time.
 *
 * The kfree_rcu() function handles this issue. In order to have a universal
 * callback function handling different offsets of rcu_head, the callback needs
 * to determine the starting address of the freed object, which can be a large
 * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to
 * page boundary for those, only offsets up to 4095 bytes can be accommodated.
 * If the offset is larger than 4095 bytes, a compile-time error will
 * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can
 * either fall back to use of call_rcu() or rearrange the structure to
 * position the rcu_head structure into the first 4096 bytes.
 *
 * The object to be freed can be allocated either by kmalloc() or
 * kmem_cache_alloc().
 *
 * Note that the allowable offset might decrease in the future.
 *
 * The BUILD_BUG_ON check must not involve any function calls, hence the
 * checks are done in macros here.
 *
 * kvfree_rcu() is an alias: both names expand to
 * kvfree_rcu_arg_2(@ptr, @rhf).
 */
#define kfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)
#define kvfree_rcu(ptr, rhf) kvfree_rcu_arg_2(ptr, rhf)

/**
 * kfree_rcu_mightsleep() - kfree an object after a grace period.
 * @ptr: pointer to kfree for single-argument invocations.
 *
 * When it comes to head-less variant, only one argument
 * is passed and that is just a pointer which has to be
 * freed after a grace period. Therefore the semantic is
 *
 *     kfree_rcu_mightsleep(ptr);
 *
 * where @ptr is the pointer to be freed by kvfree().
 *
 * Please note, head-less way of freeing is permitted to
 * use from a context that has to follow might_sleep()
 * annotation. Otherwise, please switch and embed the
 * rcu_head structure within the type of @ptr.
 *
 * kvfree_rcu_mightsleep() is an alias: both names expand to
 * kvfree_rcu_arg_1(@ptr).
 */
#define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)
#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr)

/*
 * Implemented in mm/slab_common.c; declared here because there is no
 * suitable header to include.
 *
 * The kvfree_rcu_arg_*() macros in this file call it with @head either
 * pointing at the rcu_head embedded in the object or NULL (head-less
 * variant), and @ptr pointing at the object itself.
 */
void kvfree_call_rcu(struct rcu_head *head, void *ptr);

/*
 * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the
 * comment of kfree_rcu() for details.
 */
#define kvfree_rcu_arg_2(ptr, rhf)                                        \
do {                                                                        \
        typeof (ptr) ___p = (ptr);                                        \
                                                                        \
        if (___p) {                                                        \
                BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096);        \
                kvfree_call_rcu(&((___p)->rhf), (void *) (___p));        \
        }                                                                \
} while (0)

/*
 * Head-less variant: hand @ptr to kvfree_call_rcu() with a NULL rcu_head.
 * @ptr is evaluated once into a local; a NULL @ptr results in no call at all.
 */
#define kvfree_rcu_arg_1(ptr)                                                \
do {                                                                        \
        typeof(ptr) ___kvfree_obj = (ptr);                                \
                                                                        \
        if (___kvfree_obj) {                                                \
                kvfree_call_rcu(NULL, (void *) (___kvfree_obj));        \
        }                                                                \
} while (0)

/*
 * Place this after a lock-acquisition primitive to guarantee that
 * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies
 * if the UNLOCK and LOCK are executed by the same CPU or if the
 * UNLOCK and LOCK operate on the same lock variable.
 *
 * With CONFIG_ARCH_WEAK_RELEASE_ACQUIRE this is smp_mb(); otherwise it
 * expands to an empty statement.
 */
#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE
#define smp_mb__after_unlock_lock()        smp_mb()  /* Full ordering for lock. */
#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */
#define smp_mb__after_unlock_lock()        do { } while (0)
#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */


/* Has the specified rcu_head structure been handed to call_rcu()? */

/**
 * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu()
 * @rhp: The rcu_head structure to initialize.
 *
 * If you intend to invoke rcu_head_after_call_rcu() to test whether a
 * given rcu_head structure has already been passed to call_rcu(), then
 * you must also invoke this rcu_head_init() function on it just after
 * allocating that structure.  Calls to this function must not race with
 * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation.
 */
static inline void rcu_head_init(struct rcu_head *rhp)
{
        /*
         * All-ones sentinel: the only ->func value other than the expected
         * callback that rcu_head_after_call_rcu() accepts without warning.
         */
        rhp->func = (rcu_callback_t)~0L;
}

/**
 * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()?
 * @rhp: The rcu_head structure to test.
 * @f: The function passed to call_rcu() along with @rhp.
 *
 * Returns @true if @rhp->func currently equals @f, that is, if @rhp has
 * been passed to call_rcu() with @f, and @false otherwise.  When returning
 * @false, emits a one-time warning unless @rhp->func still holds the
 * sentinel value stored by rcu_head_init(); this includes the case where
 * @rhp has already been invoked after a grace period.
 * Calls to this function must not race with callback invocation.  One way
 * to avoid such races is to enclose the call to rcu_head_after_call_rcu()
 * in an RCU read-side critical section that includes a read-side fetch
 * of the pointer to the structure containing @rhp.
 */
static inline bool
rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
{
        rcu_callback_t cur = READ_ONCE(rhp->func);
        bool queued = (cur == f);

        /* Neither the expected callback nor the init sentinel: complain. */
        if (!queued)
                WARN_ON_ONCE(cur != (rcu_callback_t)~0L);
        return queued;
}

/* Variables defined in kernel/ksysfs.c; only declared here. */
extern int rcu_expedited;
extern int rcu_normal;

/*
 * Guard named "rcu".  The do/while block calls rcu_read_lock() and then
 * immediately drops the sparse context annotation; the final argument is
 * the matching rcu_read_unlock().
 * NOTE(review): the entry/exit roles of the two arguments are inferred from
 * their order and the in-line comment -- confirm against the definition of
 * DEFINE_LOCK_GUARD_0().
 */
DEFINE_LOCK_GUARD_0(rcu,
        do {
                rcu_read_lock();
                /*
                 * sparse doesn't call the cleanup function,
                 * so just release immediately and don't track
                 * the context. We don't need to anyway, since
                 * the whole point of the guard is to not need
                 * the explicit unlock.
                 */
                __release(RCU);
        } while (0),
        rcu_read_unlock())

#endif /* __LINUX_RCUPDATE_H */

































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   30 







   31 




























































   31 



























































































































































































































































































































































































































































































































































































   65 
   30 
   61 




































































































































































   65 














   53 




   18 






































   31 














   31 










































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/page-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 *
 * Contains functions related to writing back dirty pages at the
 * address_space level.
 *
 * 10Apr2002        Andrew Morton
 *                Initial version
 */

#include <linux/kernel.h>
#include <linux/math64.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/init.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blkdev.h>
#include <linux/mpage.h>
#include <linux/rmap.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/syscalls.h>
#include <linux/pagevec.h>
#include <linux/timer.h>
#include <linux/sched/rt.h>
#include <linux/sched/signal.h>
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>

#include "internal.h"

/*
 * Sleep at most 200ms at a time in balance_dirty_pages().
 * The value is in jiffies (HZ/5) and is never less than one jiffy.
 */
#define MAX_PAUSE                max(HZ/5, 1)

/*
 * Try to keep balance_dirty_pages() call intervals higher than this many pages
 * by raising the pause time to max_pause when the interval falls below it.
 * The value is 128KB expressed in pages: PAGE_SHIFT - 10 is log2 of the page
 * size in KB.
 */
#define DIRTY_POLL_THRESH        (128 >> (PAGE_SHIFT - 10))

/*
 * Estimate write bandwidth or update dirty limit at 200ms intervals.
 * Like MAX_PAUSE, this is HZ/5 jiffies with a floor of one jiffy.
 */
#define BANDWIDTH_INTERVAL        max(HZ/5, 1)

/*
 * NOTE(review): binary shift used by the ratelimit calculations, presumably
 * a fixed-point scale of 1 << 10; confirm against its users.
 */
#define RATELIMIT_CALC_SHIFT        10

/*
 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
 * will look to see if it needs to force writeback or throttling.
 */
static long ratelimit_pages = 32;

/* The following parameters are exported via /proc/sys/vm */

/*
 * Start background writeback (via writeback threads) at this percentage
 * of dirtyable memory.  Only used while dirty_background_bytes is 0; it is
 * reset to 0 when dirty_background_bytes is written.
 */
static int dirty_background_ratio = 10;

/*
 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
 * dirty_background_ratio * the amount of dirtyable memory.  A non-zero value
 * takes precedence over the ratio; it is reset to 0 when
 * dirty_background_ratio is written.
 */
static unsigned long dirty_background_bytes;

/*
 * When non-zero, highmem counts as dirtyable memory: global_dirtyable_memory()
 * then does not subtract highmem_dirtyable_memory() from its total.
 */
static int vm_highmem_is_dirtyable;

/*
 * The generator of dirty data starts writeback at this percentage of
 * dirtyable memory.  Only used while vm_dirty_bytes is 0; it is reset to 0
 * when vm_dirty_bytes is changed.
 */
static int vm_dirty_ratio = 20;

/*
 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
 * vm_dirty_ratio * the amount of dirtyable memory.  A non-zero value takes
 * precedence over the ratio; it is reset to 0 when vm_dirty_ratio is changed.
 */
static unsigned long vm_dirty_bytes;

/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);

/*
 * The longest time for which data is allowed to remain dirty
 */
unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */

/*
 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
 * a full sync is triggered after this time elapses without any disk activity.
 */
int laptop_mode;

EXPORT_SYMBOL(laptop_mode);

/* End of sysctl-exported parameters */

/*
 * The system-wide writeback domain.  Global dirty throttle controls point at
 * it (see GDTC_INIT*), and dtc_dom() always returns it when cgroup writeback
 * is not configured.
 */
struct wb_domain global_wb_domain;

/*
 * Length of period for aging writeout fractions of bdis. This is an
 * arbitrarily chosen number. The longer the period, the slower fractions will
 * reflect changes in current writeout rate.  Expressed in jiffies (3 seconds).
 */
#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)

#ifdef CONFIG_CGROUP_WRITEBACK

/*
 * Designated-initializer fragments for struct dirty_throttle_control; they
 * are meant to be placed inside the braces of an initializer.
 *
 * GDTC_INIT: control for @__wb in the global domain, using the wb's
 * ->completions.
 */
#define GDTC_INIT(__wb)                .wb = (__wb),                                \
                                .dom = &global_wb_domain,                \
                                .wb_completions = &(__wb)->completions

/* GDTC_INIT_NO_WB: control in the global domain without an associated wb. */
#define GDTC_INIT_NO_WB                .dom = &global_wb_domain

/*
 * MDTC_INIT: control for @__wb in its memcg's domain, using the wb's
 * ->memcg_completions and linked to the global control @__gdtc.
 */
#define MDTC_INIT(__wb, __gdtc)        .wb = (__wb),                                \
                                .dom = mem_cgroup_wb_domain(__wb),        \
                                .wb_completions = &(__wb)->memcg_completions, \
                                .gdtc = __gdtc

/*
 * Tell whether @dtc refers to a writeback domain: a control whose ->dom
 * pointer is unset is reported as not valid.
 */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
        return !!dtc->dom;
}

/* Return the writeback domain stored in @dtc. */
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
        struct wb_domain *domain = dtc->dom;

        return domain;
}

/*
 * Return the global control that @mdtc is linked to through ->gdtc.
 * domain_dirty_limits() treats a non-NULL result as "this is a memcg control".
 */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
        struct dirty_throttle_control *global = mdtc->gdtc;

        return global;
}

/* Return the address of @wb's per-memcg-domain completion counter. */
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
        struct fprop_local_percpu *completions = &wb->memcg_completions;

        return completions;
}

/*
 * Compute the min and max ratios that apply to @wb and store them in *@minp
 * and *@maxp.  The bdi-wide ratios are scaled by @wb's share of the bdi's
 * total write bandwidth; a max ratio already at its upper bound
 * (100 * BDI_RATIO_SCALE) is left unscaled.
 */
static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
{
        unsigned long wb_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long bdi_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
        unsigned long long lo = wb->bdi->min_ratio;
        unsigned long long hi = wb->bdi->max_ratio;

        /*
         * Scale only while @wb's bandwidth is below the total: @wb may
         * already be clean by the time control reaches here, in which case
         * the total may not include its bandwidth.
         */
        if (wb_bw < bdi_bw) {
                if (lo)
                        lo = div64_ul(lo * wb_bw, bdi_bw);
                if (hi < 100 * BDI_RATIO_SCALE)
                        hi = div64_ul(hi * wb_bw, bdi_bw);
        }

        *minp = lo;
        *maxp = hi;
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * Without cgroup writeback the initializer fragments set no domain:
 * GDTC_INIT only records the wb and its ->completions, while
 * GDTC_INIT_NO_WB and MDTC_INIT expand to nothing.
 */
#define GDTC_INIT(__wb)                .wb = (__wb),                           \
                                .wb_completions = &(__wb)->completions
#define GDTC_INIT_NO_WB
#define MDTC_INIT(__wb, __gdtc)

/* Without cgroup writeback no control is ever a valid memcg control. */
static bool mdtc_valid(struct dirty_throttle_control *dtc)
{
        return false;
}

/* Without cgroup writeback every control belongs to the global domain. */
static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
{
        return &global_wb_domain;
}

/*
 * Without cgroup writeback there is no linked global control; callers such
 * as domain_dirty_limits() therefore always take their global-domain path.
 */
static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
{
        return NULL;
}

/* Without cgroup writeback there is no per-memcg completion counter. */
static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
{
        return NULL;
}

/*
 * Store the min and max ratios that apply to @wb in *@minp and *@maxp.
 * Without cgroup writeback these are the bdi's own ratios, unscaled.
 */
static void wb_min_max_ratio(struct bdi_writeback *wb,
                             unsigned long *minp, unsigned long *maxp)
{
        *minp = wb->bdi->min_ratio;
        *maxp = wb->bdi->max_ratio;
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * In a memory zone, there is a certain amount of pages we consider
 * available for the page cache, which is essentially the number of
 * free and reclaimable pages, minus some zone reserves to protect
 * lowmem and the ability to uphold the zone's watermarks without
 * requiring writeback.
 *
 * This number of dirtyable pages is the base value to which the
 * user-configurable dirty ratio is applied to obtain the effective
 * number of pages that are allowed to be actually dirtied, either
 * per individual zone or globally, using the sum of dirtyable pages
 * over all zones.
 *
 * Because the user is allowed to specify the dirty limit globally as
 * absolute number of bytes, calculating the per-zone dirty limit can
 * require translating the configured limit into a percentage of
 * global dirtyable memory first.
 */

/**
 * node_dirtyable_memory - number of dirtyable pages in a node
 * @pgdat: the node
 *
 * Adds up the free pages of every populated zone of @pgdat, takes off the
 * node's reserved pages (without going below zero) and then adds the node's
 * inactive and active file pages.
 *
 * Return: the node's number of pages potentially available for dirty
 * page cache.  This is the base value for the per-node dirty limits.
 */
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
{
        unsigned long dirtyable = 0;
        int idx;

        for (idx = 0; idx < MAX_NR_ZONES; idx++) {
                struct zone *zone = &pgdat->node_zones[idx];

                if (populated_zone(zone))
                        dirtyable += zone_page_state(zone, NR_FREE_PAGES);
        }

        /*
         * Kernel reserves must not count as dirtyable, otherwise reclaim
         * could be forced to clean pages just to balance the zones.
         */
        dirtyable -= min(dirtyable, pgdat->totalreserve_pages);

        dirtyable += node_page_state(pgdat, NR_INACTIVE_FILE);
        dirtyable += node_page_state(pgdat, NR_ACTIVE_FILE);

        return dirtyable;
}

/*
 * Count the dirtyable pages that sit in populated highmem zones: free pages
 * above each zone's high watermark plus the zone's file pages.  The result
 * is capped at @total.  Always 0 when CONFIG_HIGHMEM is not set.
 */
static unsigned long highmem_dirtyable_memory(unsigned long total)
{
#ifdef CONFIG_HIGHMEM
        unsigned long highmem_pages = 0;
        int nid;
        int zid;

        for_each_node_state(nid, N_HIGH_MEMORY) {
                for (zid = ZONE_NORMAL + 1; zid < MAX_NR_ZONES; zid++) {
                        struct zone *zone;
                        unsigned long nr_free;

                        if (!is_highmem_idx(zid))
                                continue;

                        zone = &NODE_DATA(nid)->node_zones[zid];
                        if (!populated_zone(zone))
                                continue;

                        nr_free = zone_page_state(zone, NR_FREE_PAGES);
                        /* clamp so the subtraction cannot underflow */
                        nr_free -= min(nr_free, high_wmark_pages(zone));

                        highmem_pages += nr_free;
                        highmem_pages += zone_page_state(zone, NR_ZONE_INACTIVE_FILE);
                        highmem_pages += zone_page_state(zone, NR_ZONE_ACTIVE_FILE);
                }
        }

        /*
         * The highmem count must never exceed the total dirtyable memory.
         * That should only be possible in very strange VM situations, but
         * guard against it anyway.
         */
        return min(highmem_pages, total);
#else
        return 0;
#endif
}

/**
 * global_dirtyable_memory - number of globally dirtyable pages
 *
 * Starts from the global free page count, removes the kernel reserve,
 * adds the file LRU pages and, unless vm_highmem_is_dirtyable is set,
 * removes the highmem share again.
 *
 * Return: the global number of pages potentially available for dirty
 * page cache.  This is the base value for the global dirty limits.
 * The result is always at least 1.
 */
static unsigned long global_dirtyable_memory(void)
{
        unsigned long nr_free = global_zone_page_state(NR_FREE_PAGES);
        unsigned long dirtyable;

        /*
         * Kernel reserves must not count as dirtyable, otherwise reclaim
         * could be forced to clean pages just to balance the zones.
         */
        dirtyable = nr_free - min(nr_free, totalreserve_pages);

        dirtyable += global_node_page_state(NR_INACTIVE_FILE);
        dirtyable += global_node_page_state(NR_ACTIVE_FILE);

        if (!vm_highmem_is_dirtyable)
                dirtyable -= highmem_dirtyable_memory(dirtyable);

        /* The +1 guarantees callers never see 0 */
        return dirtyable + 1;
}

/**
 * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
 * @dtc: dirty_throttle_control of interest
 *
 * Calculate @dtc->thresh and ->bg_thresh considering
 * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
 * must ensure that @dtc->avail is set before calling this function.  A
 * non-zero byte setting takes precedence over the matching ratio.
 *
 * For tasks for which rt_or_dl_task() is true, both limits are raised by a
 * quarter of their value plus 1/32 of the global domain's dirty_limit.
 * The resulting thresh is capped at UINT_MAX and bg_thresh is kept below it.
 */
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
        struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
        const unsigned long avail = dtc->avail;
        unsigned long limit_bytes = vm_dirty_bytes;
        unsigned long bg_limit_bytes = dirty_background_bytes;
        /* ratios are kept in units of 1/PAGE_SIZE for higher precision */
        unsigned long frac = (vm_dirty_ratio * PAGE_SIZE) / 100;
        unsigned long bg_frac = (dirty_background_ratio * PAGE_SIZE) / 100;
        unsigned long thresh;
        unsigned long bg_thresh;

        /* gdtc is !NULL iff @dtc is for memcg domain */
        if (gdtc) {
                unsigned long global_avail = gdtc->avail;

                /*
                 * Byte settings cannot be applied to a memcg domain as
                 * they are.  Turn them into ratios relative to the globally
                 * available memory; since ratios are per-PAGE_SIZE, that is
                 * simply bytes divided by the number of pages.
                 */
                if (limit_bytes)
                        frac = min(DIV_ROUND_UP(limit_bytes, global_avail),
                                   PAGE_SIZE);
                if (bg_limit_bytes)
                        bg_frac = min(DIV_ROUND_UP(bg_limit_bytes, global_avail),
                                      PAGE_SIZE);
                limit_bytes = 0;
                bg_limit_bytes = 0;
        }

        thresh = limit_bytes ? DIV_ROUND_UP(limit_bytes, PAGE_SIZE) :
                               (frac * avail) / PAGE_SIZE;
        bg_thresh = bg_limit_bytes ? DIV_ROUND_UP(bg_limit_bytes, PAGE_SIZE) :
                                     (bg_frac * avail) / PAGE_SIZE;

        if (rt_or_dl_task(current)) {
                bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
                thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
        }

        /*
         * Dirty throttling logic assumes the limits in page units fit into
         * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
         */
        if (thresh > UINT_MAX)
                thresh = UINT_MAX;
        /* Keeping bg_thresh below thresh also keeps it within 32 bits */
        if (bg_thresh >= thresh)
                bg_thresh = thresh / 2;

        dtc->thresh = thresh;
        dtc->bg_thresh = bg_thresh;

        /* we should eventually report the domain in the TP */
        if (gdtc)
                return;
        trace_global_dirty_state(bg_thresh, thresh);
}

/**
 * global_dirty_limits - background-writeback and dirty-throttling thresholds
 * @pbackground: out parameter for bg_thresh
 * @pdirty: out parameter for thresh
 *
 * Calculate bg_thresh and thresh for global_wb_domain.  See
 * domain_dirty_limits() for details.
 */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };

        gdtc.avail = global_dirtyable_memory();
        domain_dirty_limits(&gdtc);

        *pbackground = gdtc.bg_thresh;
        *pdirty = gdtc.thresh;
}

/**
 * node_dirty_limit - maximum number of dirty pages allowed in a node
 * @pgdat: the node
 *
 * Return: the maximum number of dirty pages allowed in a node, based
 * on the node's dirtyable memory.
 */
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
{
        unsigned long node_memory = node_dirtyable_memory(pgdat);
        struct task_struct *tsk = current;
        unsigned long dirty;

        if (vm_dirty_bytes)
                dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) *
                        node_memory / global_dirtyable_memory();
        else
                dirty = vm_dirty_ratio * node_memory / 100;

        if (rt_or_dl_task(tsk))
                dirty += dirty / 4;

        /*
         * Dirty throttling logic assumes the limits in page units fit into
         * 32-bits. This gives 16TB dirty limits max which is hopefully enough.
         */
        return min_t(unsigned long, dirty, UINT_MAX);
}

/**
 * node_dirty_ok - tells whether a node is within its dirty limits
 * @pgdat: the node to check
 *
 * Compares the node's dirty plus writeback page counts against
 * node_dirty_limit().
 *
 * Return: %true when the dirty pages in @pgdat are within the node's
 * dirty limit, %false if the limit is exceeded.
 */
bool node_dirty_ok(struct pglist_data *pgdat)
{
        unsigned long limit = node_dirty_limit(pgdat);
        unsigned long nr_dirty = node_page_state(pgdat, NR_FILE_DIRTY);
        unsigned long nr_writeback = node_page_state(pgdat, NR_WRITEBACK);

        return nr_dirty + nr_writeback <= limit;
}

#ifdef CONFIG_SYSCTL
/*
 * Handler for accesses to dirty_background_ratio.  The value itself is
 * handled by proc_dointvec_minmax(); after a successful write the byte-based
 * background limit is disabled so that the ratio takes effect.
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int dirty_background_ratio_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

        if (err || !write)
                return err;

        dirty_background_bytes = 0;
        return 0;
}

/*
 * Handler for accesses to dirty_background_bytes.  A written value whose
 * page count would not fit in 32 bits is rejected with -ERANGE and the
 * previous value is restored.  After an accepted write the ratio-based
 * background limit is disabled.
 *
 * Returns 0 on success, -ERANGE as described above, or the error from
 * proc_doulongvec_minmax().
 */
static int dirty_background_bytes_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long prev_bytes = dirty_background_bytes;
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (err || !write)
                return err;

        if (DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE) > UINT_MAX) {
                dirty_background_bytes = prev_bytes;
                return -ERANGE;
        }

        dirty_background_ratio = 0;
        return 0;
}

/*
 * Handler for accesses to vm_dirty_ratio.  The value itself is handled by
 * proc_dointvec_minmax().  When a write actually changes the ratio, the
 * byte-based limit is disabled and the ratelimit is recomputed.
 *
 * Returns the result of proc_dointvec_minmax().
 */
static int dirty_ratio_handler(const struct ctl_table *table, int write, void *buffer,
                size_t *lenp, loff_t *ppos)
{
        int old_ratio = vm_dirty_ratio;
        int ret;

        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
                /*
                 * Clear the byte limit before recomputing.
                 * domain_dirty_limits() prefers vm_dirty_bytes over
                 * vm_dirty_ratio whenever the former is non-zero, so doing
                 * this the other way round would let the recalculation see
                 * the stale byte limit instead of the ratio just written.
                 * NOTE(review): assumes writeback_set_ratelimit() derives its
                 * values via global_dirty_limits(); confirm.
                 */
                vm_dirty_bytes = 0;
                writeback_set_ratelimit();
        }
        return ret;
}

/*
 * Handler for accesses to vm_dirty_bytes.  Nothing further happens unless a
 * write actually changes the value.  A new value whose page count would not
 * fit in 32 bits is rejected with -ERANGE and the previous value is
 * restored; otherwise the ratelimit is recomputed and the ratio-based limit
 * is disabled.
 *
 * Returns 0 on success, -ERANGE as described above, or the error from
 * proc_doulongvec_minmax().
 */
static int dirty_bytes_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned long prev_bytes = vm_dirty_bytes;
        int err = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);

        if (err || !write || vm_dirty_bytes == prev_bytes)
                return err;

        if (DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) > UINT_MAX) {
                vm_dirty_bytes = prev_bytes;
                return -ERANGE;
        }

        writeback_set_ratelimit();
        vm_dirty_ratio = 0;
        return 0;
}
#endif

/*
 * Return the time one completions period after @cur_time.  A result of 0 is
 * replaced by 1, because a period_time of 0 means that period switching is
 * turned off.
 */
static unsigned long wp_next_time(unsigned long cur_time)
{
        unsigned long next = cur_time + VM_COMPLETIONS_PERIOD_LEN;

        /* 0 has a special meaning... */
        return next ? next : 1;
}

/*
 * Account @nr writeout completions to @completions within @dom, limited by
 * @max_prop_frac, and (re)start the domain's period timer if period
 * switching is currently turned off (dom->period_time == 0).
 */
static void wb_domain_writeout_add(struct wb_domain *dom,
                                   struct fprop_local_percpu *completions,
                                   unsigned int max_prop_frac, long nr)
{
        __fprop_add_percpu_max(&dom->completions, completions,
                               max_prop_frac, nr);
        /* First event after period switching was turned off? */
        if (unlikely(!dom->period_time)) {
                /*
                 * We can race with other wb_domain_writeout_add calls here but
                 * it does not cause any harm since the resulting time when
                 * timer will fire and what is in dom->period_time will be
                 * roughly the same.
                 */
                dom->period_time = wp_next_time(jiffies);
                mod_timer(&dom->period_timer, dom->period_time);
        }
}

/*
 * Account @nr completed writeouts for @wb: bump its WB_WRITTEN statistic,
 * add to the global domain's completion count and, if @wb has a memcg
 * domain, to that domain's count as well.  Called from
 * __folio_end_writeback().
 */
static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
{
        struct wb_domain *memcg_dom;

        wb_stat_mod(wb, WB_WRITTEN, nr);
        wb_domain_writeout_add(&global_wb_domain, &wb->completions,
                               wb->bdi->max_prop_frac, nr);

        memcg_dom = mem_cgroup_wb_domain(wb);
        if (!memcg_dom)
                return;

        wb_domain_writeout_add(memcg_dom, wb_memcg_completions(wb),
                               wb->bdi->max_prop_frac, nr);
}

/*
 * Account a single completed writeout for @wb.  Local interrupts are
 * disabled around the update and restored to their previous state afterwards.
 */
void wb_writeout_inc(struct bdi_writeback *wb)
{
        unsigned long irq_flags;

        local_irq_save(irq_flags);
        __wb_writeout_add(wb, 1);
        local_irq_restore(irq_flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);

/*
 * Period timer callback of a wb_domain: age the domain's completion
 * fractions and re-arm the timer while there is still something to age.
 *
 * On an idle system the callback can run long after it was scheduled because
 * the timer is deferrable, so the periods missed in the meantime are
 * accounted for as well.
 */
static void writeout_period(struct timer_list *t)
{
        struct wb_domain *domain = from_timer(domain, t, period_timer);
        int missed = (jiffies - domain->period_time) /
                                                 VM_COMPLETIONS_PERIOD_LEN;

        if (!fprop_new_period(&domain->completions, missed + 1)) {
                /*
                 * Aging has zeroed all fractions. Stop wasting CPU on period
                 * updates.
                 */
                domain->period_time = 0;
                return;
        }

        domain->period_time = wp_next_time(domain->period_time +
                        missed * VM_COMPLETIONS_PERIOD_LEN);
        mod_timer(&domain->period_timer, domain->period_time);
}

/*
 * Initialise @dom from scratch: zero it, set up its lock, its deferrable
 * period timer and its dirty limit timestamp, then initialise the completion
 * counters using @gfp.
 *
 * Returns the result of fprop_global_init().
 */
int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
{
        memset(dom, 0, sizeof(*dom));

        dom->dirty_limit_tstamp = jiffies;
        spin_lock_init(&dom->lock);
        timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE);

        return fprop_global_init(&dom->completions, gfp);
}

#ifdef CONFIG_CGROUP_WRITEBACK
/*
 * Tear down @dom: the period timer is deleted synchronously first, because
 * its callback (writeout_period()) works on dom->completions, and only then
 * are the completion counters destroyed.
 */
void wb_domain_exit(struct wb_domain *dom)
{
        timer_delete_sync(&dom->period_timer);
        fprop_global_destroy(&dom->completions);
}
#endif

/*
 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
 * registered backing devices, which, for obvious reasons, cannot
 * exceed 100%.  It is kept in BDI_RATIO_SCALE units and is updated under
 * bdi_lock by __bdi_set_min_ratio().
 */
static unsigned int bdi_min_ratio;

/*
 * Check that @pages does not exceed the globally dirtyable memory.
 *
 * Returns 0 when it fits, -EINVAL otherwise.
 */
static int bdi_check_pages_limit(unsigned long pages)
{
        return pages > global_dirtyable_memory() ? -EINVAL : 0;
}

/*
 * Express @pages as a share of the current global dirty threshold, in
 * BDI_RATIO_SCALE units per percent.
 *
 * If the global dirty threshold is 0, -EINVAL is returned.  Note that it
 * travels through the unsigned return type, i.e. it shows up as a very
 * large value.
 */
static unsigned long bdi_ratio_from_pages(unsigned long pages)
{
        unsigned long bg_limit;
        unsigned long dirty_limit;

        global_dirty_limits(&bg_limit, &dirty_limit);
        if (dirty_limit == 0)
                return -EINVAL;

        return div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_limit);
}

static u64 bdi_get_bytes(unsigned int ratio)
{
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        u64 bytes;

        global_dirty_limits(&background_thresh, &dirty_thresh);
        bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100;

        return bytes;
}

/*
 * Set @bdi's minimum dirty share to @min_ratio (scaled by BDI_RATIO_SCALE)
 * and keep the global sum bdi_min_ratio in step. All updates are made
 * under bdi_lock.
 *
 * Returns 0 on success, -EINVAL if @min_ratio is above 100%, above the
 * bdi's max_ratio, or if raising it would bring the global sum to 100% or
 * more.
 */
static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        unsigned int grow;
        int ret = -EINVAL;

        if (min_ratio > 100 * BDI_RATIO_SCALE)
                return ret;

        spin_lock_bh(&bdi_lock);

        if (min_ratio > bdi->max_ratio)
                goto unlock;

        if (min_ratio < bdi->min_ratio) {
                /* Shrinking: give the difference back to the global sum. */
                bdi_min_ratio -= bdi->min_ratio - min_ratio;
        } else {
                /* Growing (or unchanged): the global sum must stay below 100%. */
                grow = min_ratio - bdi->min_ratio;
                if (bdi_min_ratio + grow >= 100 * BDI_RATIO_SCALE)
                        goto unlock;
                bdi_min_ratio += grow;
        }
        bdi->min_ratio = min_ratio;
        ret = 0;
unlock:
        spin_unlock_bh(&bdi_lock);

        return ret;
}

/*
 * Set @bdi's maximum dirty share to @max_ratio (scaled by BDI_RATIO_SCALE)
 * and recompute max_prop_frac, the same limit expressed on the
 * FPROP_FRAC_BASE scale. The update is made under bdi_lock.
 *
 * Returns 0 on success, -EINVAL if @max_ratio is above 100% or below the
 * bdi's current min_ratio.
 */
static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{
        int ret = -EINVAL;

        if (max_ratio > 100 * BDI_RATIO_SCALE)
                return ret;

        spin_lock_bh(&bdi_lock);
        if (max_ratio >= bdi->min_ratio) {
                bdi->max_ratio = max_ratio;
                bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) /
                                                (100 * BDI_RATIO_SCALE);
                ret = 0;
        }
        spin_unlock_bh(&bdi_lock);

        return ret;
}

/*
 * Set @bdi's minimum dirty share from a value that is already expressed in
 * units of 1/BDI_RATIO_SCALE percent; it is passed to __bdi_set_min_ratio()
 * unchanged. Returns that function's result.
 */
int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        return __bdi_set_min_ratio(bdi, min_ratio);
}

/*
 * Set @bdi's maximum dirty share from a value that is already expressed in
 * units of 1/BDI_RATIO_SCALE percent; it is passed to __bdi_set_max_ratio()
 * unchanged. Returns that function's result.
 */
int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio)
{
        return __bdi_set_max_ratio(bdi, max_ratio);
}

int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
        return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE);
}

int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio)
{
        return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE);
}
EXPORT_SYMBOL(bdi_set_max_ratio);

/*
 * Report @bdi's minimum dirty share as a byte count, by converting
 * bdi->min_ratio with bdi_get_bytes().
 */
u64 bdi_get_min_bytes(struct backing_dev_info *bdi)
{
        return bdi_get_bytes(bdi->min_ratio);
}

/*
 * Set @bdi's minimum dirty share from a byte count.
 *
 * @min_bytes is converted to pages, checked against the dirtyable memory
 * limit, turned into a scaled ratio of the current dirty threshold and then
 * applied with __bdi_set_min_ratio().
 *
 * Returns 0 on success or a negative errno (-EINVAL) if the value does not
 * fit, exceeds the limit, or is rejected when it is applied.
 */
int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes)
{
        int ret;
        u64 nr_pages = min_bytes >> PAGE_SHIFT;
        unsigned long pages = nr_pages;
        long min_ratio;

        /*
         * unsigned long can be narrower than u64. A page count that does
         * not survive the conversion must be refused rather than silently
         * truncated into something that passes the limit check below.
         */
        if (pages != nr_pages)
                return -EINVAL;

        ret = bdi_check_pages_limit(pages);
        if (ret)
                return ret;

        /* bdi_ratio_from_pages() reports failure as a negative value. */
        min_ratio = bdi_ratio_from_pages(pages);
        if (min_ratio < 0)
                return min_ratio;
        return __bdi_set_min_ratio(bdi, min_ratio);
}

/*
 * Report @bdi's maximum dirty share as a byte count, by converting
 * bdi->max_ratio with bdi_get_bytes().
 */
u64 bdi_get_max_bytes(struct backing_dev_info *bdi)
{
        return bdi_get_bytes(bdi->max_ratio);
}

/*
 * Set @bdi's maximum dirty share from a byte count.
 *
 * @max_bytes is converted to pages, checked against the dirtyable memory
 * limit, turned into a scaled ratio of the current dirty threshold and then
 * applied with __bdi_set_max_ratio().
 *
 * Returns 0 on success or a negative errno (-EINVAL) if the value does not
 * fit, exceeds the limit, or is rejected when it is applied.
 */
int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes)
{
        int ret;
        u64 nr_pages = max_bytes >> PAGE_SHIFT;
        unsigned long pages = nr_pages;
        long max_ratio;

        /*
         * unsigned long can be narrower than u64. A page count that does
         * not survive the conversion must be refused rather than silently
         * truncated into something that passes the limit check below.
         */
        if (pages != nr_pages)
                return -EINVAL;

        ret = bdi_check_pages_limit(pages);
        if (ret)
                return ret;

        /* bdi_ratio_from_pages() reports failure as a negative value. */
        max_ratio = bdi_ratio_from_pages(pages);
        if (max_ratio < 0)
                return max_ratio;
        return __bdi_set_max_ratio(bdi, max_ratio);
}

/*
 * Switch the BDI_CAP_STRICTLIMIT capability of @bdi on (@strict_limit == 1)
 * or off (@strict_limit == 0). The capability word is modified under
 * bdi_lock.
 *
 * Returns 0 on success, -EINVAL for any other value of @strict_limit.
 */
int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit)
{
        if (strict_limit != 0 && strict_limit != 1)
                return -EINVAL;

        spin_lock_bh(&bdi_lock);
        if (!strict_limit)
                bdi->capabilities &= ~BDI_CAP_STRICTLIMIT;
        else
                bdi->capabilities |= BDI_CAP_STRICTLIMIT;
        spin_unlock_bh(&bdi_lock);

        return 0;
}

/* Midpoint between the dirty threshold and the background threshold. */
static unsigned long dirty_freerun_ceiling(unsigned long thresh,
                                           unsigned long bg_thresh)
{
        unsigned long sum = thresh + bg_thresh;

        return sum >> 1;
}

/* The larger of @thresh and the domain's current dirty_limit. */
static unsigned long hard_dirty_limit(struct wb_domain *dom,
                                      unsigned long thresh)
{
        unsigned long dom_limit = dom->dirty_limit;

        return thresh > dom_limit ? thresh : dom_limit;
}

/*
 * Memory which can be further allocated to a memcg domain is capped by
 * system-wide clean memory excluding the amount being used in the domain.
 *
 * mdtc->avail becomes @filepages plus the smaller of @headroom and that
 * remaining clean memory.
 */
static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
                            unsigned long filepages, unsigned long headroom)
{
        struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
        unsigned long own_clean, sys_clean, spare;

        /* Every subtraction saturates at zero instead of wrapping. */
        own_clean = filepages > mdtc->dirty ? filepages - mdtc->dirty : 0;
        sys_clean = gdtc->avail > gdtc->dirty ? gdtc->avail - gdtc->dirty : 0;
        spare = sys_clean > own_clean ? sys_clean - own_clean : 0;

        mdtc->avail = filepages + min(headroom, spare);
}

/* A control with no associated global control is itself the global one. */
static inline bool dtc_is_global(struct dirty_throttle_control *dtc)
{
        return !mdtc_gdtc(dtc);
}

/*
 * Fill in dtc->avail and dtc->dirty for the domain @dtc belongs to.
 *
 * Dirty background will ignore pages being written as we're trying to
 * decide whether to put more under writeback, hence @include_writeback
 * selects whether pages under writeback are added to dtc->dirty.
 */
static void domain_dirty_avail(struct dirty_throttle_control *dtc,
                               bool include_writeback)
{
        unsigned long filepages = 0, headroom = 0, writeback = 0;

        if (!dtc_is_global(dtc)) {
                /* memcg domain: numbers come from the cgroup's wb stats */
                mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom,
                                    &dtc->dirty, &writeback);
                if (include_writeback)
                        dtc->dirty += writeback;
                mdtc_calc_avail(dtc, filepages, headroom);
                return;
        }

        /* global domain: numbers come from the node page state counters */
        dtc->avail = global_dirtyable_memory();
        dtc->dirty = global_node_page_state(NR_FILE_DIRTY);
        if (include_writeback)
                dtc->dirty += global_node_page_state(NR_WRITEBACK);
}

/**
 * __wb_calc_thresh - @wb's share of dirty threshold
 * @dtc: dirty_throttle_context of interest
 * @thresh: dirty throttling or dirty background threshold of wb_domain in @dtc
 *
 * Note that balance_dirty_pages() will only seriously take dirty throttling
 * threshold as a hard limit when sleeping max_pause per page is not enough
 * to keep the dirty pages under control. For example, when the device is
 * completely stalled due to some error conditions, or when there are 1000
 * dd tasks writing to a slow 10MB/s USB key.
 * In the other normal situations, it acts more gently by throttling the tasks
 * more (rather than completely block them) when the wb dirty pages go high.
 *
 * It allocates high/low dirty limits to fast/slow devices, in order to prevent
 * - starving fast devices
 * - piling up dirty pages (that will take long time to sync) on slow devices
 *
 * The wb's share of dirty limit will be adapting to its throughput and
 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
 *
 * Return: @wb's dirty limit in pages. For dirty throttling limit, the term
 * "dirty" in the context of dirty balancing includes all PG_dirty and
 * PG_writeback pages.
 */
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc,
                                      unsigned long thresh)
{
        struct wb_domain *dom = dtc_dom(dtc);
        struct bdi_writeback *wb = dtc->wb;
        u64 wb_thresh;
        u64 wb_max_thresh;
        unsigned long numerator, denominator;
        unsigned long wb_min_ratio, wb_max_ratio;

        /*
         * Calculate this wb's share of the thresh ratio.
         */
        fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
                              &numerator, &denominator);

        wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE);
        wb_thresh *= numerator;
        wb_thresh = div64_ul(wb_thresh, denominator);

        wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio);

        wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE);

        /*
         * It's very possible that wb_thresh is close to 0 not because the
         * device is slow, but that it has remained inactive for long time.
         * Honour such devices a reasonable good (hopefully IO efficient)
         * threshold, so that the occasional writes won't be blocked and active
         * writes can rampup the threshold quickly.
         */
        if (thresh > dtc->dirty) {
                if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT))
                        wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 100);
                else
                        wb_thresh = max(wb_thresh, (thresh - dtc->dirty) / 8);
        }

        wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE);
        if (wb_thresh > wb_max_thresh)
                wb_thresh = wb_max_thresh;

        return wb_thresh;
}

/*
 * Compute @wb's share of @thresh using a global dirty_throttle_control:
 * the control is initialised for @wb, its avail/dirty numbers are filled in
 * with writeback pages included, and the result of __wb_calc_thresh() is
 * returned.
 */
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

        domain_dirty_avail(&gdtc, true);
        return __wb_calc_thresh(&gdtc, thresh);
}

/*
 * Compute @wb's share of the dirty threshold of its memcg domain.
 *
 * The global control is filled in before the memcg one because the memcg
 * "avail" calculation reads the global avail/dirty numbers. mdtc.thresh is
 * presumably set by domain_dirty_limits() (defined elsewhere in this file)
 * before it is handed to __wb_calc_thresh().
 */
unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
        struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };

        domain_dirty_avail(&gdtc, true);
        domain_dirty_avail(&mdtc, true);
        domain_dirty_limits(&mdtc);

        return __wb_calc_thresh(&mdtc, mdtc.thresh);
}

/*
 *                           setpoint - dirty 3
 *        f(dirty) := 1.0 + (----------------)
 *                           limit - setpoint
 *
 * it's a 3rd order polynomial that subjects to
 *
 * (1) f(freerun)  = 2.0 => rampup dirty_ratelimit reasonably fast
 * (2) f(setpoint) = 1.0 => the balance point
 * (3) f(limit)    = 0   => the hard limit
 * (4) df/dx      <= 0         => negative feedback control
 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
 *     => fast response on large errors; small oscillation near setpoint
 */
static long long pos_ratio_polynom(unsigned long setpoint,
                                          unsigned long dirty,
                                          unsigned long limit)
{
        long long cubed;
        long x;

        /*
         * x = (setpoint - dirty) / (limit - setpoint) in fixed point with
         * RATELIMIT_CALC_SHIFT fractional bits; "| 1" keeps the divisor
         * non-zero.
         */
        x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
                      (limit - setpoint) | 1);

        /* x^3, shifting back to the fixed-point scale after each product */
        cubed = x;
        cubed = cubed * x >> RATELIMIT_CALC_SHIFT;
        cubed = cubed * x >> RATELIMIT_CALC_SHIFT;

        /* 1.0 + x^3 */
        cubed += 1 << RATELIMIT_CALC_SHIFT;

        /* the result is confined to [0, 2.0] */
        return clamp(cubed, 0LL, 2LL << RATELIMIT_CALC_SHIFT);
}

/*
 * Dirty position control.
 *
 * (o) global/bdi setpoints
 *
 * We want the dirty pages be balanced around the global/wb setpoints.
 * When the number of dirty pages is higher/lower than the setpoint, the
 * dirty position control ratio (and hence task dirty ratelimit) will be
 * decreased/increased to bring the dirty pages back to the setpoint.
 *
 *     pos_ratio = 1 << RATELIMIT_CALC_SHIFT
 *
 *     if (dirty < setpoint) scale up   pos_ratio
 *     if (dirty > setpoint) scale down pos_ratio
 *
 *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
 *     if (wb_dirty > wb_setpoint) scale down pos_ratio
 *
 *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
 *
 * (o) global control line
 *
 *     ^ pos_ratio
 *     |
 *     |            |<===== global dirty control scope ======>|
 * 2.0  * * * * * * *
 *     |            .*
 *     |            . *
 *     |            .   *
 *     |            .     *
 *     |            .        *
 *     |            .            *
 * 1.0 ................................*
 *     |            .                  .     *
 *     |            .                  .          *
 *     |            .                  .              *
 *     |            .                  .                 *
 *     |            .                  .                    *
 *   0 +------------.------------------.----------------------*------------->
 *           freerun^          setpoint^                 limit^   dirty pages
 *
 * (o) wb control line
 *
 *     ^ pos_ratio
 *     |
 *     |            *
 *     |              *
 *     |                *
 *     |                  *
 *     |                    * |<=========== span ============>|
 * 1.0 .......................*
 *     |                      . *
 *     |                      .   *
 *     |                      .     *
 *     |                      .       *
 *     |                      .         *
 *     |                      .           *
 *     |                      .             *
 *     |                      .               *
 *     |                      .                 *
 *     |                      .                   *
 *     |                      .                     *
 * 1/4 ...............................................* * * * * * * * * * * *
 *     |                      .                         .
 *     |                      .                           .
 *     |                      .                             .
 *   0 +----------------------.-------------------------------.------------->
 *                wb_setpoint^                    x_intercept^
 *
 * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
 * be smoothly throttled down to normal if it starts high in situations like
 * - start writing to a slow SD card and a fast disk at the same time. The SD
 *   card's wb_dirty may rush to many times higher than wb_setpoint.
 * - the wb dirty thresh drops quickly due to change of JBOD workload
 */
/*
 * Compute dtc->pos_ratio (fixed point, RATELIMIT_CALC_SHIFT fractional bits)
 * from the global and per-wb dirty positions held in @dtc. dtc->limit is
 * updated as a side effect.
 */
static void wb_position_ratio(struct dirty_throttle_control *dtc)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = dtc->limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long wb_thresh = dtc->wb_thresh;
        unsigned long x_intercept;
        unsigned long setpoint;                /* dirty pages' target balance point */
        unsigned long wb_setpoint;
        unsigned long span;
        long long pos_ratio;                /* for scaling up/down the rate limit */
        long x;

        /* Result for every early return below. */
        dtc->pos_ratio = 0;

        /* At or above the hard limit: leave pos_ratio at 0. */
        if (unlikely(dtc->dirty >= limit))
                return;

        /*
         * global setpoint
         *
         * See comment for pos_ratio_polynom().
         */
        setpoint = (freerun + limit) / 2;
        pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);

        /*
         * The strictlimit feature is a tool preventing mistrusted filesystems
         * from growing a large number of dirty pages before throttling. For
         * such filesystems balance_dirty_pages always checks wb counters
         * against wb limits. Even if global "nr_dirty" is under "freerun".
         * This is especially important for fuse which sets bdi->max_ratio to
         * 1% by default. Without strictlimit feature, fuse writeback may
         * consume arbitrary amount of RAM because it is accounted in
         * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
         *
         * Here, in wb_position_ratio(), we calculate pos_ratio based on
         * two values: wb_dirty and wb_thresh. Let's consider an example:
         * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
         * limits are set by default to 10% and 20% (background and throttle).
         * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
         * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
         * about ~6K pages (as the average of background and throttle wb
         * limits). The 3rd order polynomial will provide positive feedback if
         * wb_dirty is under wb_setpoint and vice versa.
         *
         * Note, that we cannot use global counters in these calculations
         * because we want to throttle process writing to a strictlimit wb
         * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
         * in the example above).
         */
        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                long long wb_pos_ratio;

                /* wb already at or over its own threshold: pos_ratio stays 0 */
                if (dtc->wb_dirty >= wb_thresh)
                        return;

                wb_setpoint = dirty_freerun_ceiling(wb_thresh,
                                                    dtc->wb_bg_thresh);

                /* degenerate setpoint: pos_ratio stays 0 */
                if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
                        return;

                wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
                                                 wb_thresh);

                /*
                 * Typically, for strictlimit case, wb_setpoint << setpoint
                 * and pos_ratio >> wb_pos_ratio. In the other words global
                 * state ("dirty") is not limiting factor and we have to
                 * make decision based on wb counters. But there is an
                 * important case when global pos_ratio should get precedence:
                 * global limits are exceeded (e.g. due to activities on other
                 * wb's) while given strictlimit wb is below limit.
                 *
                 * "pos_ratio * wb_pos_ratio" would work for the case above,
                 * but it would look too non-natural for the case of all
                 * activity in the system coming from a single strictlimit wb
                 * with bdi->max_ratio == 100%.
                 *
                 * Note that min() below somewhat changes the dynamics of the
                 * control system. Normally, pos_ratio value can be well over 3
                 * (when globally we are at freerun and wb is well below wb
                 * setpoint). Now the maximum pos_ratio in the same situation
                 * is 2. We might want to tweak this if we observe the control
                 * system is too slow to adapt.
                 */
                dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
                return;
        }

        /*
         * We have computed basic pos_ratio above based on global situation. If
         * the wb is over/under its share of dirty pages, we want to scale
         * pos_ratio further down/up. That is done by the following mechanism.
         */

        /*
         * wb setpoint
         *
         *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
         *
         *                        x_intercept - wb_dirty
         *                     := --------------------------
         *                        x_intercept - wb_setpoint
         *
         * The main wb control line is a linear function that subjects to
         *
         * (1) f(wb_setpoint) = 1.0
         * (2) k = - 1 / (8 * write_bw)  (in single wb case)
         *     or equally: x_intercept = wb_setpoint + 8 * write_bw
         *
         * For single wb case, the dirty pages are observed to fluctuate
         * regularly within range
         *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
         * for various filesystems, where (2) can yield in a reasonable 12.5%
         * fluctuation range for pos_ratio.
         *
         * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
         * own size, so move the slope over accordingly and choose a slope that
         * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
         */
        if (unlikely(wb_thresh > dtc->thresh))
                wb_thresh = dtc->thresh;
        /*
         * scale global setpoint to wb's:
         *        wb_setpoint = setpoint * wb_thresh / thresh
         *
         * x is wb_thresh / thresh in 16-bit fixed point; "| 1" keeps the
         * divisor non-zero.
         */
        x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
        wb_setpoint = setpoint * (u64)x >> 16;
        /*
         * Use span=(8*write_bw) in single wb case as indicated by
         * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
         *
         *        wb_thresh                    thresh - wb_thresh
         * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
         *         thresh                           thresh
         */
        span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
        x_intercept = wb_setpoint + span;

        /*
         * Follow the linear control line until it has dropped to 1/4, then
         * stay at 1/4 of the global pos_ratio.
         */
        if (dtc->wb_dirty < x_intercept - span / 4) {
                pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
                                      (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;

        /*
         * wb reserve area, safeguard against dirty pool underrun and disk idle
         * It may push the desired control point of global dirty pages higher
         * than setpoint.
         *
         * Below wb_thresh / 2 the ratio is boosted by x_intercept / wb_dirty,
         * with the boost capped at a factor of 8.
         */
        x_intercept = wb_thresh / 2;
        if (dtc->wb_dirty < x_intercept) {
                if (dtc->wb_dirty > x_intercept / 8)
                        pos_ratio = div_u64(pos_ratio * x_intercept,
                                            dtc->wb_dirty);
                else
                        pos_ratio *= 8;
        }

        dtc->pos_ratio = pos_ratio;
}

/*
 * Update @wb's write bandwidth estimates from the pages written since the
 * previous sample.
 *
 * @wb:      writeback context being updated
 * @elapsed: jiffies since the previous sample
 * @written: current value of the written-pages counter
 *
 * The new estimate is stored in wb->write_bandwidth and a further smoothed
 * value in wb->avg_write_bandwidth. When the wb has dirty IO, the change in
 * the average is also applied to wb->bdi->tot_write_bandwidth.
 */
static void wb_update_write_bandwidth(struct bdi_writeback *wb,
                                      unsigned long elapsed,
                                      unsigned long written)
{
        /* Power-of-two averaging period so the division below is a shift. */
        const unsigned long period = roundup_pow_of_two(3 * HZ);
        unsigned long avg = wb->avg_write_bandwidth;
        unsigned long old = wb->write_bandwidth;
        u64 bw;

        /*
         * bw = written * HZ / elapsed
         *
         *                   bw * elapsed + write_bandwidth * (period - elapsed)
         * write_bandwidth = ---------------------------------------------------
         *                                          period
         *
         * @written may have decreased due to folio_redirty_for_writepage().
         * Avoid underflowing @bw calculation.
         */
        bw = written - min(written, wb->written_stamp);
        bw *= HZ;
        if (unlikely(elapsed > period)) {
                /*
                 * The sample spans more than a whole period: use the plain
                 * rate for both estimates and skip the smoothing.
                 */
                bw = div64_ul(bw, elapsed);
                avg = bw;
                goto out;
        }
        bw += (u64)wb->write_bandwidth * (period - elapsed);
        bw >>= ilog2(period);

        /*
         * one more level of smoothing, for filtering out sudden spikes
         *
         * avg only moves towards old (by 1/8 of the gap) when the new
         * estimate bw lies on the same side of old or equals it.
         */
        if (avg > old && old >= (unsigned long)bw)
                avg -= (avg - old) >> 3;

        if (avg < old && old <= (unsigned long)bw)
                avg += (old - avg) >> 3;

out:
        /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
        avg = max(avg, 1LU);
        if (wb_has_dirty_io(wb)) {
                /* Apply the change in this wb's average to the bdi total. */
                long delta = avg - wb->avg_write_bandwidth;
                WARN_ON_ONCE(atomic_long_add_return(delta,
                                        &wb->bdi->tot_write_bandwidth) <= 0);
        }
        wb->write_bandwidth = bw;
        WRITE_ONCE(wb->avg_write_bandwidth, avg);
}

/*
 * Move the domain's dirty_limit towards dtc->thresh: jump up to a higher
 * threshold at once, but approach a lower one gradually.
 */
static void update_dirty_limit(struct dirty_throttle_control *dtc)
{
        struct wb_domain *dom = dtc_dom(dtc);
        unsigned long cur = dom->dirty_limit;
        unsigned long target = dtc->thresh;

        /* Threshold went up: follow it in a single step. */
        if (cur < target) {
                dom->dirty_limit = target;
                return;
        }

        /*
         * Threshold went down: close 1/32 of the gap per call. The target
         * is the higher of thresh and the dirty count, because thresh may
         * drop below dirty; dom->dirty_limit exists precisely so that there
         * is a limit which stays above the dirty pages.
         */
        target = max(target, dtc->dirty);
        if (cur > target)
                dom->dirty_limit = cur - ((cur - target) >> 5);
}

/*
 * Refresh the domain's dirty_limit if at least BANDWIDTH_INTERVAL jiffies
 * have passed since the last refresh. @now is the caller's jiffies sample.
 */
static void domain_update_dirty_limit(struct dirty_throttle_control *dtc,
                                      unsigned long now)
{
        struct wb_domain *dom = dtc_dom(dtc);

        /* Unlocked peek first, so the common case takes no lock at all. */
        if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                return;

        spin_lock(&dom->lock);

        /* Re-check under the lock; the timestamp may have moved meanwhile. */
        if (!time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                goto unlock;

        update_dirty_limit(dtc);
        dom->dirty_limit_tstamp = now;
unlock:
        spin_unlock(&dom->lock);
}

/*
 * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
 *
 * Normal wb tasks will be curbed at or below it in long term.
 * Obviously it should be around (write_bw / N) when there are N dd tasks.
 *
 * @dtc:     throttle control supplying thresholds, dirty counts and pos_ratio
 * @dirtied: current value of the wb's dirtied-pages counter
 * @elapsed: jiffies since the previous sample; used as a divisor, so it must
 *           be non-zero
 *
 * Updates wb->dirty_ratelimit and wb->balanced_dirty_ratelimit and emits the
 * bdi_dirty_ratelimit trace event.
 */
static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
                                      unsigned long dirtied,
                                      unsigned long elapsed)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long dirty = dtc->dirty;
        unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
        unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long setpoint = (freerun + limit) / 2;
        unsigned long write_bw = wb->avg_write_bandwidth;
        unsigned long dirty_ratelimit = wb->dirty_ratelimit;
        unsigned long dirty_rate;
        unsigned long task_ratelimit;
        unsigned long balanced_dirty_ratelimit;
        unsigned long step;
        unsigned long x;
        unsigned long shift;

        /*
         * The dirty rate will match the writeout rate in long term, except
         * when dirty pages are truncated by userspace or re-dirtied by FS.
         */
        dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;

        /*
         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
         */
        task_ratelimit = (u64)dirty_ratelimit *
                                        dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
        task_ratelimit++; /* it helps rampup dirty_ratelimit from tiny values */

        /*
         * A linear estimation of the "balanced" throttle rate. The theory is,
         * if there are N dd tasks, each throttled at task_ratelimit, the wb's
         * dirty_rate will be measured to be (N * task_ratelimit). So the below
         * formula will yield the balanced rate limit (write_bw / N).
         *
         * Note that the expanded form is not a pure rate feedback:
         *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate)              (1)
         * but also takes pos_ratio into account:
         *        rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio  (2)
         *
         * (1) is not realistic because pos_ratio also takes part in balancing
         * the dirty rate.  Consider the state
         *        pos_ratio = 0.5                                              (3)
         *        rate = 2 * (write_bw / N)                                    (4)
         * If (1) is used, it will stuck in that state! Because each dd will
         * be throttled at
         *        task_ratelimit = pos_ratio * rate = (write_bw / N)           (5)
         * yielding
         *        dirty_rate = N * task_ratelimit = write_bw                   (6)
         * put (6) into (1) we get
         *        rate_(i+1) = rate_(i)                                        (7)
         *
         * So we end up using (2) to always keep
         *        rate_(i+1) ~= (write_bw / N)                                 (8)
         * regardless of the value of pos_ratio. As long as (8) is satisfied,
         * pos_ratio is able to drive itself to 1.0, which is not only where
         * the dirty count meet the setpoint, but also where the slope of
         * pos_ratio is most flat and hence task_ratelimit is least fluctuated.
         *
         * "| 1" keeps the divisor non-zero when no pages were dirtied.
         */
        balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
                                           dirty_rate | 1);
        /*
         * balanced_dirty_ratelimit ~= (write_bw / N) <= write_bw
         */
        if (unlikely(balanced_dirty_ratelimit > write_bw))
                balanced_dirty_ratelimit = write_bw;

        /*
         * We could safely do this and return immediately:
         *
         *        wb->dirty_ratelimit = balanced_dirty_ratelimit;
         *
         * However to get a more stable dirty_ratelimit, the below elaborated
         * code makes use of task_ratelimit to filter out singular points and
         * limit the step size.
         *
         * The below code essentially only uses the relative value of
         *
         *        task_ratelimit - dirty_ratelimit
         *        = (pos_ratio - 1) * dirty_ratelimit
         *
         * which reflects the direction and size of dirty position error.
         */

        /*
         * dirty_ratelimit will follow balanced_dirty_ratelimit iff
         * task_ratelimit is on the same side of dirty_ratelimit, too.
         * For example, when
         * - dirty_ratelimit > balanced_dirty_ratelimit
         * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
         * lowering dirty_ratelimit will help meet both the position and rate
         * control targets. Otherwise, don't update dirty_ratelimit if it will
         * only help meet the rate target. After all, what the users ultimately
         * feel and care are stable dirty rate and small position error.
         *
         * |task_ratelimit - dirty_ratelimit| is used to limit the step size
         * and filter out the singular points of balanced_dirty_ratelimit. Which
         * keeps jumping around randomly and can even leap far away at times
         * due to the small 200ms estimation period of dirty_rate (we want to
         * keep that period small to reduce time lags).
         */
        step = 0;

        /*
         * For strictlimit case, calculations above were based on wb counters
         * and limits (starting from pos_ratio = wb_position_ratio() and up to
         * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
         * Hence, to calculate "step" properly, we have to use wb_dirty as
         * "dirty" and wb_setpoint as "setpoint".
         */
        if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
                dirty = dtc->wb_dirty;
                setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
        }

        if (dirty < setpoint) {
                /*
                 * Below setpoint: only step up, and only as far as the
                 * lowest of the three rate estimates.
                 */
                x = min3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit < x)
                        step = x - dirty_ratelimit;
        } else {
                /*
                 * At or above setpoint: only step down, and only as far as
                 * the highest of the three rate estimates.
                 */
                x = max3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit > x)
                        step = dirty_ratelimit - x;
        }

        /*
         * Don't pursue 100% rate matching. It's impossible since the balanced
         * rate itself is constantly fluctuating. So decrease the track speed
         * when it gets close to the target. Helps eliminate pointless tremors.
         *
         * The shift count is checked against BITS_PER_LONG before it is used;
         * larger counts simply mean no step at all.
         */
        shift = dirty_ratelimit / (2 * step + 1);
        if (shift < BITS_PER_LONG)
                step = DIV_ROUND_UP(step >> shift, 8);
        else
                step = 0;

        if (dirty_ratelimit < balanced_dirty_ratelimit)
                dirty_ratelimit += step;
        else
                dirty_ratelimit -= step;

        /* The stored ratelimit is never allowed to be 0. */
        WRITE_ONCE(wb->dirty_ratelimit, max(dirty_ratelimit, 1UL));
        wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;

        trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
}

/*
 * Refresh the bandwidth statistics of gdtc->wb while holding wb->list_lock.
 *
 * The write bandwidth estimate is always updated.  When @update_ratelimit is
 * set, the domain dirty limit and the wb dirty ratelimit are refreshed too,
 * first for the global domain (@gdtc) and then, if @mdtc is non-NULL, for the
 * memcg domain.  Finally the dirtied/written/time stamps of the wb are moved
 * forward to the values sampled here.
 */
static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
                                  struct dirty_throttle_control *mdtc,
                                  bool update_ratelimit)
{
        struct bdi_writeback *wb = gdtc->wb;
        unsigned long stamp = jiffies;
        unsigned long nr_dirtied, nr_written;
        unsigned long delta;

        spin_lock(&wb->list_lock);

        /*
         * The lockless elapsed-time checks are racy, and the delayed update
         * after IO completion does not check at all (so that written pages
         * get accounted reasonably quickly).  Clamp the interval to at least
         * one jiffy to avoid division errors.
         */
        delta = max(stamp - wb->bw_time_stamp, 1UL);
        nr_dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
        nr_written = percpu_counter_read(&wb->stat[WB_WRITTEN]);

        if (update_ratelimit) {
                domain_update_dirty_limit(gdtc, stamp);
                wb_update_dirty_ratelimit(gdtc, nr_dirtied, delta);
        }

        /*
         * @mdtc is always NULL if !CGROUP_WRITEBACK, but the compiler cannot
         * work that out by itself; the IS_ENABLED() test lets it drop this
         * branch.
         */
        if (update_ratelimit && IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
                domain_update_dirty_limit(mdtc, stamp);
                wb_update_dirty_ratelimit(mdtc, nr_dirtied, delta);
        }

        wb_update_write_bandwidth(wb, delta, nr_written);

        /* Start the next estimation interval from what was sampled above. */
        wb->dirtied_stamp = nr_dirtied;
        wb->written_stamp = nr_written;
        WRITE_ONCE(wb->bw_time_stamp, stamp);

        spin_unlock(&wb->list_lock);
}

/*
 * Update the write bandwidth estimate of @wb.
 *
 * Builds a global-domain throttle control for @wb and calls
 * __wb_update_bandwidth() with no memcg domain and update_ratelimit == false,
 * so the dirty limit and dirty ratelimit are left untouched.
 */
void wb_update_bandwidth(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };

        __wb_update_bandwidth(&gdtc, NULL, false);
}

/* Interval after which we consider wb idle and don't estimate bandwidth */
#define WB_BANDWIDTH_IDLE_JIF (HZ)

/*
 * Restart the bandwidth estimation window of @wb after an idle period.
 *
 * If more than WB_BANDWIDTH_IDLE_JIF jiffies have passed since the last
 * bandwidth time stamp and no inodes are under writeback on @wb, reset the
 * dirtied/written stamps to the current counters and the time stamp to now,
 * so the idle time is not counted in the next estimate.
 */
static void wb_bandwidth_estimate_start(struct bdi_writeback *wb)
{
        unsigned long now = jiffies;
        unsigned long idle = now - READ_ONCE(wb->bw_time_stamp);

        /* Recently updated: keep the current window. */
        if (idle <= WB_BANDWIDTH_IDLE_JIF)
                return;

        /* Writeback is still in flight: not idle. */
        if (atomic_read(&wb->writeback_inodes))
                return;

        spin_lock(&wb->list_lock);
        wb->dirtied_stamp = wb_stat(wb, WB_DIRTIED);
        wb->written_stamp = wb_stat(wb, WB_WRITTEN);
        WRITE_ONCE(wb->bw_time_stamp, now);
        spin_unlock(&wb->list_lock);
}

/*
 * Number of pages a task may dirty before balance_dirty_pages_ratelimited()
 * looks again at whether dirty throttling has to start.
 *
 * A too small interval makes big NUMA machines call the expensive
 * global_zone_page_state() too often, so the interval is scaled near-sqrt to
 * the safety margin, i.e. the number of pages that may still be dirtied
 * without exceeding the dirty limits (@thresh - @dirty).
 *
 * Returns 1 when there is no margin left (@thresh <= @dirty).
 */
static unsigned long dirty_poll_interval(unsigned long dirty,
                                         unsigned long thresh)
{
        unsigned long margin;

        if (thresh <= dirty)
                return 1;

        margin = thresh - dirty;

        /* Halving the log2 gives a power of two close to sqrt(margin). */
        return 1UL << (ilog2(margin) >> 1);
}

/*
 * Upper bound, in jiffies, for a single throttling pause on @wb.
 *
 * The result grows with @wb_dirty, shrinks with the averaged write bandwidth
 * of @wb, and is capped at MAX_PAUSE.
 */
static unsigned long wb_max_pause(struct bdi_writeback *wb,
                                  unsigned long wb_dirty)
{
        unsigned long write_bw = READ_ONCE(wb->avg_write_bandwidth);
        unsigned long scaled_bw;
        unsigned long pause;

        /*
         * Limit pause time for small memory systems. If the task sleeps for
         * too long, a small pool of dirty/writeback pages may go empty and
         * the disk may go idle.
         *
         * 8 serves as the safety ratio.
         */
        scaled_bw = write_bw / roundup_pow_of_two(1 + HZ / 8);
        pause = wb_dirty / (1 + scaled_bw);
        pause++;

        return min_t(unsigned long, pause, MAX_PAUSE);
}

/*
 * Compute the minimal pause time (in jiffies) for a dirtier on @wb and, via
 * @nr_dirtied_pause, the number of pages the task should dirty before it is
 * checked again.
 *
 * @max_pause:        upper bound of a single pause, in jiffies
 * @task_ratelimit:   ratelimit the next pause will be computed from
 * @dirty_ratelimit:  the more stable base ratelimit of the wb
 * @nr_dirtied_pause: output, target number of dirtied pages per pause
 */
static long wb_min_pause(struct bdi_writeback *wb,
                         long max_pause,
                         unsigned long task_ratelimit,
                         unsigned long dirty_ratelimit,
                         int *nr_dirtied_pause)
{
        long bw_order = ilog2(READ_ONCE(wb->avg_write_bandwidth));
        long rate_order = ilog2(READ_ONCE(wb->dirty_ratelimit));
        long target;            /* target pause */
        long est_pause;         /* estimated next pause */
        int nr_pages;           /* target nr_dirtied_pause */

        /* Aim at a 10ms pause in the 1-dd case. */
        target = max(1, HZ / 100);

        /*
         * With concurrent dirtiers, lengthen the pause to cut CPU overhead:
         * (N * 10ms) on 2^N concurrent tasks.
         */
        if (bw_order > rate_order)
                target += (bw_order - rate_order) * (10 * HZ) / 1024;

        /*
         * This is a bit convoluted. The next nr_dirtied_pause is based on
         * the much more stable dirty_ratelimit, but the next pause time will
         * be computed from task_ratelimit, and the two may depart
         * considerably at times. In particular, if task_ratelimit goes below
         * dirty_ratelimit/2 while the target pause is max_pause, the next
         * pause would be max_pause*2 _trimmed down_ to max_pause. Then
         * task_ratelimit is not executed faithfully, which could eventually
         * bring down dirty_ratelimit.
         *
         * Two rules fix this up:
         * 1) estimate the next pause time and, if necessary, use a lower
         *    nr_dirtied_pause so as not to exceed max_pause. When this
         *    happens, nr_dirtied_pause will be "dancing" with task_ratelimit.
         * 2) limit the target pause time to max_pause/2, so that the normal
         *    small fluctuations of task_ratelimit do not trigger rule (1)
         *    and nr_dirtied_pause stays as stable as dirty_ratelimit.
         */
        target = min(target, 1 + max_pause / 2);
        nr_pages = dirty_ratelimit * target / roundup_pow_of_two(HZ);

        /*
         * A tiny nr_dirtied_pause was found to hurt I/O performance in the
         * test case fio-mmap-randwrite-64k, which does 16*{sync read, async
         * write}. When the 16 consecutive reads are often interrupted by a
         * dirty throttling pause during the async writes, cfq goes idle
         * (deadline is fine). So push nr_dirtied_pause as high as possible
         * until it reaches DIRTY_POLL_THRESH=32 pages.
         */
        if (nr_pages < DIRTY_POLL_THRESH) {
                target = max_pause;
                nr_pages = dirty_ratelimit * target / roundup_pow_of_two(HZ);
                if (nr_pages > DIRTY_POLL_THRESH) {
                        nr_pages = DIRTY_POLL_THRESH;
                        target = HZ * DIRTY_POLL_THRESH / dirty_ratelimit;
                }
        }

        /* Rule (1): shrink the page target if the pause would be too long. */
        est_pause = HZ * nr_pages / (task_ratelimit + 1);
        if (est_pause > max_pause) {
                target = max_pause;
                nr_pages = task_ratelimit * target / roundup_pow_of_two(HZ);
        }

        *nr_dirtied_pause = nr_pages;

        /*
         * The minimal pause time will normally be half the target pause time.
         */
        if (nr_pages >= DIRTY_POLL_THRESH)
                return 1 + target / 2;

        return target;
}

/*
 * Fill in the per-wb fields of @dtc: wb_thresh, wb_bg_thresh and wb_dirty.
 *
 * wb_bg_thresh is wb_thresh scaled by bg_thresh / thresh (0 when thresh is
 * 0). wb_dirty is the sum of the wb's reclaimable and writeback counters.
 */
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
        struct bdi_writeback *wb = dtc->wb;
        unsigned long nr_reclaimable;
        unsigned long nr_writeback;

        /*
         * wb_thresh is not treated as a limiting factor the way
         * dirty_thresh is, because
         * - in a JBOD setup, wb_thresh can fluctuate a lot
         * - in a system with HDD and USB key, the USB key may somehow
         *   go into state (wb_dirty >> wb_thresh) either because
         *   wb_dirty starts high, or because wb_thresh drops low.
         *   In this case we don't want to hard throttle the USB key
         *   dirtiers for 100 seconds until wb_dirty drops under
         *   wb_thresh. Instead the auxiliary wb control line in
         *   wb_position_ratio() will let the dirtier task progress
         *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
         */
        dtc->wb_thresh = __wb_calc_thresh(dtc, dtc->thresh);
        if (dtc->thresh)
                dtc->wb_bg_thresh = div_u64((u64)dtc->wb_thresh * dtc->bg_thresh,
                                            dtc->thresh);
        else
                dtc->wb_bg_thresh = 0;

        /*
         * To avoid the stacked BDI deadlock, the 'dirty' pages must be
         * counted accurately when the threshold is low.
         *
         * Otherwise it would be possible to get thresh+n pages
         * reported dirty, even though there are thresh-m pages
         * actually dirty; with m+n sitting in the percpu
         * deltas.
         */
        if (dtc->wb_thresh < 2 * wb_stat_error()) {
                nr_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
                nr_writeback = wb_stat_sum(wb, WB_WRITEBACK);
        } else {
                nr_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
                nr_writeback = wb_stat(wb, WB_WRITEBACK);
        }

        dtc->wb_dirty = nr_reclaimable + nr_writeback;
}

/*
 * Dirty poll interval for the domain described by @dtc.
 *
 * With @strictlimit the interval is derived from the per-wb dirty count and
 * threshold, otherwise from the domain-wide ones.
 */
static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc,
                                      bool strictlimit)
{
        if (strictlimit)
                return dirty_poll_interval(dtc->wb_dirty, dtc->wb_thresh);

        return dirty_poll_interval(dtc->dirty, dtc->thresh);
}

/*
 * Decide whether the domain of @dtc is in the freerun state and record the
 * result in dtc->freerun.
 *
 * Throttle only when the background writeback cannot catch up. For
 * !strictlimit this avoids (excessively) small writeouts while the wb limits
 * are ramping up.
 *
 * For strictlimit the decision is based on the wb counters and limits. Small
 * writeouts while the wb limits are ramping up are the price we consciously
 * pay for strictlimit-ing.
 */
static void domain_dirty_freerun(struct dirty_throttle_control *dtc,
                                 bool strictlimit)
{
        if (unlikely(strictlimit)) {
                /* Per-wb numbers must be computed before they are compared. */
                wb_dirty_limits(dtc);
                dtc->freerun = dtc->wb_dirty <=
                        dirty_freerun_ceiling(dtc->wb_thresh,
                                              dtc->wb_bg_thresh);
                return;
        }

        dtc->freerun = dtc->dirty <=
                dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
}

/*
 * Run the domain-level calculations for @dtc, in order: the dirtyable/dirty
 * amounts (domain_dirty_avail), the domain thresholds (domain_dirty_limits)
 * and finally the freerun decision (domain_dirty_freerun), which is stored
 * in dtc->freerun.
 */
static void balance_domain_limits(struct dirty_throttle_control *dtc,
                                  bool strictlimit)
{
        domain_dirty_avail(dtc, true);
        domain_dirty_limits(dtc);
        domain_dirty_freerun(dtc, strictlimit);
}

/*
 * Per-wb freerun check, result stored in dtc->freerun.
 *
 * For strictlimit nothing is computed here (freerun is simply cleared) since
 * that case was already handled in domain_dirty_freerun(). Otherwise the
 * per-wb limits are calculated, and only PF_LOCAL_THROTTLE tasks can be
 * reported as freerunning.
 */
static void wb_dirty_freerun(struct dirty_throttle_control *dtc,
                             bool strictlimit)
{
        dtc->freerun = false;

        if (strictlimit)
                return;

        wb_dirty_limits(dtc);

        /*
         * LOCAL_THROTTLE tasks must not be throttled when below the per-wb
         * freerun ceiling.
         */
        if (current->flags & PF_LOCAL_THROTTLE)
                dtc->freerun = dtc->wb_dirty <
                               dirty_freerun_ceiling(dtc->wb_thresh,
                                                     dtc->wb_bg_thresh);
}

/*
 * Set dtc->dirty_exceeded: the wb must be over its own threshold and, unless
 * @strictlimit is set, the domain must be over its threshold as well.
 */
static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc,
                                     bool strictlimit)
{
        bool wb_over = dtc->wb_dirty > dtc->wb_thresh;
        bool domain_over = dtc->dirty > dtc->thresh;

        dtc->dirty_exceeded = wb_over && (domain_over || strictlimit);
}

/*
 * Run the per-wb calculations for @dtc.
 *
 * The fields dirty_exceeded and pos_ratio are not updated when the wb turns
 * out to be in freerun state. Please don't use these invalid fields in the
 * freerun case.
 */
static void balance_wb_limits(struct dirty_throttle_control *dtc,
                              bool strictlimit)
{
        wb_dirty_freerun(dtc, strictlimit);

        if (!dtc->freerun) {
                wb_dirty_exceeded(dtc, strictlimit);
                wb_position_ratio(dtc);
        }
}

/*
 * balance_dirty_pages() must be called by processes which are generating dirty
 * data.  It looks at the number of dirty pages in the machine and will force
 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
 * If we're over `background_thresh' then the writeback threads are woken to
 * perform some writeout.
 *
 * @wb:            the bdi_writeback the caller is dirtying pages against
 * @pages_dirtied: number of pages the caller dirtied since it was last
 *                 throttled; used to compute the length of the pause
 * @flags:         BDP flags; with BDP_ASYNC set the function does not sleep
 *                 and returns -EAGAIN at the point where it would have paused
 *
 * The loop is left when the domain(s) or the wb are in freerun state, when
 * the computed pause is shorter than the minimal pause, after a pause with a
 * non-zero task_ratelimit, when wb_dirty has dropped to wb_stat_error() or
 * below, or when a fatal signal is pending.
 *
 * Return: 0, or -EAGAIN if BDP_ASYNC was given and a pause was required.
 */
static int balance_dirty_pages(struct bdi_writeback *wb,
                               unsigned long pages_dirtied, unsigned int flags)
{
        struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
        struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
        struct dirty_throttle_control * const gdtc = &gdtc_stor;
        /* mdtc stays NULL when there is no valid memcg domain for @wb */
        struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
                                                     &mdtc_stor : NULL;
        struct dirty_throttle_control *sdtc;
        unsigned long nr_dirty;
        long period;
        long pause;
        long max_pause;
        long min_pause;
        int nr_dirtied_pause;
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
        struct backing_dev_info *bdi = wb->bdi;
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;
        int ret = 0;

        for (;;) {
                unsigned long now = jiffies;

                nr_dirty = global_node_page_state(NR_FILE_DIRTY);

                balance_domain_limits(gdtc, strictlimit);
                if (mdtc) {
                        /*
                         * If @wb belongs to !root memcg, repeat the same
                         * basic calculations for the memcg domain.
                         */
                        balance_domain_limits(mdtc, strictlimit);
                }

                /*
                 * In laptop mode, we wait until hitting the higher threshold
                 * before starting background writeout, and then write out all
                 * the way down to the lower threshold.  So slow writers cause
                 * minimal disk activity.
                 *
                 * In normal mode, we start background writeout at the lower
                 * background_thresh, to keep the amount of dirty memory low.
                 */
                if (!laptop_mode && nr_dirty > gdtc->bg_thresh &&
                    !writeback_in_progress(wb))
                        wb_start_background_writeback(wb);

                /*
                 * If memcg domain is in effect, @dirty should be under
                 * both global and memcg freerun ceilings.
                 */
                if (gdtc->freerun && (!mdtc || mdtc->freerun)) {
                        unsigned long intv;
                        unsigned long m_intv;

                        /*
                         * Also entered by goto from the per-wb freerun checks
                         * further down.  No pause: reset the task's throttling
                         * state and pick the smaller poll interval of the two
                         * domains as the next nr_dirtied_pause.
                         */
free_running:
                        intv = domain_poll_intv(gdtc, strictlimit);
                        m_intv = ULONG_MAX;

                        current->dirty_paused_when = now;
                        current->nr_dirtied = 0;
                        if (mdtc)
                                m_intv = domain_poll_intv(mdtc, strictlimit);
                        current->nr_dirtied_pause = min(intv, m_intv);
                        break;
                }

                /* Start writeback even when in laptop mode */
                if (unlikely(!writeback_in_progress(wb)))
                        wb_start_background_writeback(wb);

                mem_cgroup_flush_foreign(wb);

                /*
                 * Calculate global domain's pos_ratio and select the
                 * global dtc by default.
                 */
                balance_wb_limits(gdtc, strictlimit);
                if (gdtc->freerun)
                        goto free_running;
                sdtc = gdtc;

                if (mdtc) {
                        /*
                         * If memcg domain is in effect, calculate its
                         * pos_ratio.  @wb should satisfy constraints from
                         * both global and memcg domains.  Choose the one
                         * w/ lower pos_ratio.
                         */
                        balance_wb_limits(mdtc, strictlimit);
                        if (mdtc->freerun)
                                goto free_running;
                        if (mdtc->pos_ratio < gdtc->pos_ratio)
                                sdtc = mdtc;
                }

                wb->dirty_exceeded = gdtc->dirty_exceeded ||
                                     (mdtc && mdtc->dirty_exceeded);
                /* Refresh bandwidth/ratelimit once per BANDWIDTH_INTERVAL. */
                if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
                                           BANDWIDTH_INTERVAL))
                        __wb_update_bandwidth(gdtc, mdtc, true);

                /* throttle according to the chosen dtc */
                dirty_ratelimit = READ_ONCE(wb->dirty_ratelimit);
                task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
                max_pause = wb_max_pause(wb, sdtc->wb_dirty);
                min_pause = wb_min_pause(wb, max_pause,
                                         task_ratelimit, dirty_ratelimit,
                                         &nr_dirtied_pause);

                /*
                 * A zero task_ratelimit cannot be used as a divisor below;
                 * sleep for the maximum pause instead.
                 */
                if (unlikely(task_ratelimit == 0)) {
                        period = max_pause;
                        pause = max_pause;
                        goto pause;
                }
                period = HZ * pages_dirtied / task_ratelimit;
                pause = period;
                /* Credit the time that passed since the previous pause ended. */
                if (current->dirty_paused_when)
                        pause -= now - current->dirty_paused_when;
                /*
                 * For less than 1s think time (ext3/4 may block the dirtier
                 * for up to 800ms from time to time on 1-HDD; so does xfs,
                 * however at much less frequency), try to compensate it in
                 * future periods by updating the virtual time; otherwise just
                 * do a reset, as it may be a light dirtier.
                 */
                if (pause < min_pause) {
                        trace_balance_dirty_pages(wb,
                                                  sdtc,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
                                                  period,
                                                  min(pause, 0L),
                                                  start_time);
                        if (pause < -HZ) {
                                current->dirty_paused_when = now;
                                current->nr_dirtied = 0;
                        } else if (period) {
                                current->dirty_paused_when += period;
                                current->nr_dirtied = 0;
                        } else if (current->nr_dirtied_pause <= pages_dirtied)
                                current->nr_dirtied_pause += pages_dirtied;
                        break;
                }
                if (unlikely(pause > max_pause)) {
                        /* for occasional dropped task_ratelimit */
                        now += min(pause - max_pause, max_pause);
                        pause = max_pause;
                }

pause:
                trace_balance_dirty_pages(wb,
                                          sdtc,
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
                                          period,
                                          pause,
                                          start_time);
                /* Async callers must not sleep here; tell them to retry. */
                if (flags & BDP_ASYNC) {
                        ret = -EAGAIN;
                        break;
                }
                __set_current_state(TASK_KILLABLE);
                bdi->last_bdp_sleep = jiffies;
                io_schedule_timeout(pause);

                current->dirty_paused_when = now + pause;
                current->nr_dirtied = 0;
                current->nr_dirtied_pause = nr_dirtied_pause;

                /*
                 * This is typically equal to (dirty < thresh) and can also
                 * keep "1000+ dd on a slow USB stick" under control.
                 */
                if (task_ratelimit)
                        break;

                /*
                 * In the case of an unresponsive NFS server and the NFS dirty
                 * pages exceeds dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
                 * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
                 * more page. However wb_dirty has accounting errors.  So use
                 * the larger and more IO friendly wb_stat_error.
                 */
                if (sdtc->wb_dirty <= wb_stat_error())
                        break;

                if (fatal_signal_pending(current))
                        break;
        }
        return ret;
}

/*
 * Per-CPU counter checked against ratelimit_pages in
 * balance_dirty_pages_ratelimited_flags().  It is reset there when the
 * current task has reached its own ratelimit or when the counter itself has
 * reached ratelimit_pages.
 * NOTE(review): the place where it is incremented is not in this part of the
 * file - confirm before relying on its exact meaning.
 */
static DEFINE_PER_CPU(int, bdp_ratelimits);

/*
 * Normal tasks are throttled by
 *        loop {
 *                dirty tsk->nr_dirtied_pause pages;
 *                take a snap in balance_dirty_pages();
 *        }
 * However there is a worst case. If every task exits immediately after it
 * has dirtied (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will
 * never be called to throttle the page dirtying. The solution is to save the
 * not yet throttled page dirties in dirty_throttle_leaks on task exit and
 * charge them randomly to the running tasks. This works well for the above
 * worst case, as a new task will pick up and accumulate the old task's
 * leaked dirty count and eventually get throttled.
 */
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;

/**
 * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
 * @mapping: address_space which was dirtied.
 * @flags: BDP flags.
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * Nothing is done for a bdi without BDI_CAP_WRITEBACK.
 *
 * See balance_dirty_pages_ratelimited() for details.
 *
 * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
 * indicate that memory is out of balance and the caller must wait
 * for I/O to complete.  Otherwise, it will return 0 to indicate
 * that either memory was already in balance, or it was able to sleep
 * until the amount of dirty memory returned to balance.
 */
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
                                        unsigned int flags)
{
        struct inode *host = mapping->host;
        struct backing_dev_info *bdi = inode_to_bdi(host);
        struct bdi_writeback *wb = NULL;
        int *bdp_count;
        int *leaked;
        int ratelimit;
        int ret = 0;

        if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
                return 0;

        /* Use the cgroup wb when available, else the bdi's embedded wb. */
        if (inode_cgwb_enabled(host))
                wb = wb_get_create_current(bdi, GFP_KERNEL);
        if (!wb)
                wb = &bdi->wb;

        /*
         * Once the dirty limits are exceeded, poll much more often: at most
         * every 32KB worth of pages.
         */
        ratelimit = current->nr_dirtied_pause;
        if (wb->dirty_exceeded)
                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

        preempt_disable();

        /*
         * Keep one CPU from accumulating too many dirtied pages without
         * calling into balance_dirty_pages(). That can happen when 1000+
         * tasks all start dirtying pages at exactly the same time and hence
         * all honour a too large initial task->nr_dirtied_pause.
         */
        bdp_count = this_cpu_ptr(&bdp_ratelimits);
        if (unlikely(current->nr_dirtied >= ratelimit)) {
                *bdp_count = 0;
        } else if (unlikely(*bdp_count >= ratelimit_pages)) {
                *bdp_count = 0;
                ratelimit = 0;
        }

        /*
         * Pick up the pages dirtied by exited tasks. This keeps lots of
         * short-lived tasks (eg. gcc invocations in a kernel build) from
         * escaping the dirty throttling and livelocking other long-running
         * dirtiers.
         */
        leaked = this_cpu_ptr(&dirty_throttle_leaks);
        if (*leaked > 0 && current->nr_dirtied < ratelimit) {
                unsigned long nr_charged;

                nr_charged = min(*leaked, ratelimit - current->nr_dirtied);
                *leaked -= nr_charged;
                current->nr_dirtied += nr_charged;
        }

        preempt_enable();

        if (unlikely(current->nr_dirtied >= ratelimit))
                ret = balance_dirty_pages(wb, current->nr_dirtied, flags);

        wb_put(wb);
        return ret;
}
EXPORT_SYMBOL_GPL(balance_dirty_pages_ratelimited_flags);

/**
 * balance_dirty_pages_ratelimited - balance dirty memory state.
 * @mapping: address_space which was dirtied.
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
 * dirty state and will initiate writeback if needed.
 *
 * Once we're over the dirty memory limit we decrease the ratelimiting
 * by a lot, to prevent individual processes from overshooting the limit
 * by (ratelimit_pages) each.
 *
 * This is balance_dirty_pages_ratelimited_flags() called with no flags; its
 * return value is discarded.
 */
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
        balance_dirty_pages_ratelimited_flags(mapping, 0);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);

/*
 * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty
 * and thresh, but it's for background writeback.
 */
static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc)
{
        struct bdi_writeback *wb = dtc->wb;

        dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh);
        if (dtc->wb_bg_thresh < 2 * wb_stat_error())
                dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE);
        else
                dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE);
}

/*
 * Return true if the domain of @dtc is over its background threshold, or,
 * failing that, if the wb is over its per-wb background threshold.
 */
static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc)
{
        domain_dirty_avail(dtc, false);
        domain_dirty_limits(dtc);
        if (dtc->dirty > dtc->bg_thresh)
                return true;

        /* The domain is fine; check the wb's own share. */
        wb_bg_dirty_limits(dtc);

        return dtc->wb_dirty > dtc->wb_bg_thresh;
}

/**
 * wb_over_bg_thresh - does @wb need to be written back?
 * @wb: bdi_writeback of interest
 *
 * Determines whether background writeback should keep writing @wb or it's
 * clean enough.  The global domain is checked first; the memcg domain is
 * consulted only if it is valid and the global check did not already say so.
 *
 * Return: %true if writeback should continue.
 */
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
        struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
        struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };

        if (domain_over_bg_thresh(&gdtc))
                return true;

        return mdtc_valid(&mdtc) && domain_over_bg_thresh(&mdtc);
}

#ifdef CONFIG_SYSCTL
/*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 *
 * Returns the result of proc_dointvec().
 */
static int dirty_writeback_centisecs_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
{
        unsigned int prev_interval = dirty_writeback_interval;
        int err;

        err = proc_dointvec(table, write, buffer, length, ppos);
        if (err || !write)
                return err;

        /*
         * Writing 0 to dirty_writeback_interval will disable periodic writeback
         * and a different non-zero value will wakeup the writeback threads.
         * wb_wakeup_delayed() would be more appropriate, but it's a pain to
         * iterate over all bdis and wbs.
         * The reason we do this is to make the change take effect immediately.
         */
        if (dirty_writeback_interval &&
            dirty_writeback_interval != prev_interval)
                wakeup_flusher_threads(WB_REASON_PERIODIC);

        return err;
}
#endif

/*
 * Timer callback for a bdi's laptop_mode_wb_timer: recover the owning
 * backing_dev_info from the timer @t and wake up its flusher threads with
 * reason WB_REASON_LAPTOP_TIMER.
 */
void laptop_mode_timer_fn(struct timer_list *t)
{
        struct backing_dev_info *backing_dev_info =
                from_timer(backing_dev_info, t, laptop_mode_wb_timer);

        wakeup_flusher_threads_bdi(backing_dev_info, WB_REASON_LAPTOP_TIMER);
}

/*
 * We've spun up the disk and we're in laptop mode: schedule writeback
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 *
 * The laptop_mode_wb_timer of @info is (re)armed to expire laptop_mode
 * jiffies from now.
 */
void laptop_io_completion(struct backing_dev_info *info)
{
        mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}

/*
 * We're in laptop mode and we've just synced. The sync's writes will have
 * caused another writeback to be scheduled by laptop_io_completion.
 * Nothing needs to be written back anymore, so we unschedule the writeback.
 */
void laptop_sync_completion(void)
{
        struct backing_dev_info *bdi;

        rcu_read_lock();

        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
                timer_delete(&bdi->laptop_mode_wb_timer);

        rcu_read_unlock();
}

/*
 * If ratelimit_pages is too high then we can get into dirty-data overload
 * if a large number of processes all perform writes at the same time.
 *
 * ratelimit_pages is therefore set to a level which ensures that when all
 * CPUs are dirtying in parallel, we cannot go more than 3% (1/32) over the
 * dirty memory thresholds.  It never goes below 16 pages.
 *
 * The global domain's dirty_limit is set to the current dirty threshold as
 * well.
 */

void writeback_set_ratelimit(void)
{
        unsigned long bg_thresh;
        unsigned long thresh;

        global_dirty_limits(&bg_thresh, &thresh);
        global_wb_domain.dirty_limit = thresh;

        /* Split 1/32 of the threshold evenly between the online CPUs. */
        ratelimit_pages = thresh / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
}

/*
 * CPU hotplug callback: recompute ratelimit_pages, which depends on the
 * number of online CPUs.  @cpu is unused.  Always returns 0.
 */
static int page_writeback_cpu_online(unsigned int cpu)
{
        writeback_set_ratelimit();
        return 0;
}

#ifdef CONFIG_SYSCTL

/*
 * Lower bound for vm_dirty_bytes (two pages); referenced through .extra1 of
 * the "dirty_bytes" entry below.  This is needed for the
 * proc_doulongvec_minmax of vm_dirty_bytes.
 */
static const unsigned long dirty_bytes_min = 2 * PAGE_SIZE;

/*
 * Writeback related sysctl entries.  The table is registered under "vm" by
 * page_writeback_init().  Where present, .extra1/.extra2 hold the minimum and
 * maximum accepted values.
 */
static const struct ctl_table vm_page_writeback_sysctls[] = {
        {
                .procname   = "dirty_background_ratio",
                .data       = &dirty_background_ratio,
                .maxlen     = sizeof(dirty_background_ratio),
                .mode       = 0644,
                .proc_handler   = dirty_background_ratio_handler,
                .extra1     = SYSCTL_ZERO,
                .extra2     = SYSCTL_ONE_HUNDRED,
        },
        {
                .procname   = "dirty_background_bytes",
                .data       = &dirty_background_bytes,
                .maxlen     = sizeof(dirty_background_bytes),
                .mode       = 0644,
                .proc_handler   = dirty_background_bytes_handler,
                .extra1     = SYSCTL_LONG_ONE,
        },
        {
                .procname   = "dirty_ratio",
                .data       = &vm_dirty_ratio,
                .maxlen     = sizeof(vm_dirty_ratio),
                .mode       = 0644,
                .proc_handler   = dirty_ratio_handler,
                .extra1     = SYSCTL_ZERO,
                .extra2     = SYSCTL_ONE_HUNDRED,
        },
        {
                .procname   = "dirty_bytes",
                .data       = &vm_dirty_bytes,
                .maxlen     = sizeof(vm_dirty_bytes),
                .mode       = 0644,
                .proc_handler   = dirty_bytes_handler,
                .extra1     = (void *)&dirty_bytes_min,
        },
        {
                /* custom handler: a changed non-zero value wakes the flushers */
                .procname   = "dirty_writeback_centisecs",
                .data       = &dirty_writeback_interval,
                .maxlen     = sizeof(dirty_writeback_interval),
                .mode       = 0644,
                .proc_handler   = dirty_writeback_centisecs_handler,
        },
        {
                .procname   = "dirty_expire_centisecs",
                .data       = &dirty_expire_interval,
                .maxlen     = sizeof(dirty_expire_interval),
                .mode       = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1     = SYSCTL_ZERO,
        },
#ifdef CONFIG_HIGHMEM
        {
                .procname        = "highmem_is_dirtyable",
                .data                = &vm_highmem_is_dirtyable,
                .maxlen                = sizeof(vm_highmem_is_dirtyable),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#endif
        {
                /* handled by proc_dointvec_jiffies */
                .procname        = "laptop_mode",
                .data                = &laptop_mode,
                .maxlen                = sizeof(laptop_mode),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
};
#endif

/*
 * Called early on to tune the page writeback dirty limits.
 *
 * We used to scale dirty pages according to how total memory
 * related to pages that could be allocated for buffers.
 *
 * However, that was when we used "dirty_ratio" to scale with
 * all memory, and we don't do that any more. "dirty_ratio"
 * is now applied to total non-HIGHPAGE memory, and as such we can't
 * get into the old insane situation any more where we had
 * large amounts of dirty pages compared to a small amount of
 * non-HIGHMEM memory.
 *
 * But we might still want to scale the dirty_ratio by how
 * much memory the box has..
 */
void __init page_writeback_init(void)
{
        /* A non-zero return from wb_domain_init() trips BUG_ON(). */
        BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));

        /*
         * page_writeback_cpu_online() is registered twice: as the third
         * (startup) argument for the dynamic online state and as the
         * fourth (teardown) argument for the writeback-dead state.
         */
        cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/writeback:online",
                          page_writeback_cpu_online, NULL);
        cpuhp_setup_state(CPUHP_MM_WRITEBACK_DEAD, "mm/writeback:dead", NULL,
                          page_writeback_cpu_online);
#ifdef CONFIG_SYSCTL
        /* Register the writeback sysctl table under the "vm" directory. */
        register_sysctl_init("vm", vm_page_writeback_sysctls);
#endif
}

/**
 * tag_pages_for_writeback - tag pages to be written by writeback
 * @mapping: address space structure to write
 * @start: starting page index
 * @end: ending page index (inclusive)
 *
 * This function scans the page range from @start to @end (inclusive) and tags
 * all pages that have DIRTY tag set with a special TOWRITE tag.  The caller
 * can then use the TOWRITE tag to identify pages eligible for writeback.
 * This mechanism is used to avoid livelocking of writeback by a process
 * steadily creating new dirty pages in the file (thus it is important for this
 * function to be quick so that it can tag pages faster than a dirtying process
 * can create them).
 */
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end)
{
        XA_STATE(xas, &mapping->i_pages, start);
        unsigned int nr_tagged = 0;
        void *entry;

        xas_lock_irq(&xas);
        xas_for_each_marked(&xas, entry, end, PAGECACHE_TAG_DIRTY) {
                xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
                nr_tagged++;

                /*
                 * Every XA_CHECK_SCHED tagged entries, pause the walk and
                 * drop the lock so that we can reschedule if needed.
                 */
                if (nr_tagged % XA_CHECK_SCHED == 0) {
                        xas_pause(&xas);
                        xas_unlock_irq(&xas);
                        cond_resched();
                        xas_lock_irq(&xas);
                }
        }
        xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);

/*
 * Decide whether a folio picked up by the writeback walk should be written
 * now.  The caller passes the folio locked.  Returns true when the folio
 * still belongs to @mapping, is dirty, is not (or is no longer) under
 * writeback and had its dirty flag cleared for I/O; returns false when the
 * caller should skip it.
 */
static bool folio_prepare_writeback(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio)
{
        /*
         * The folio was truncated or invalidated.  Skipping it is fine even
         * for data integrity operations: it disappeared concurrently, so
         * the operation carries no real expectation about it, even if a
         * new dirty folio now sits at the same pagecache index.
         */
        if (unlikely(folio->mapping != mapping))
                return false;

        /* Already cleaned by somebody else? */
        if (!folio_test_dirty(folio))
                return false;

        if (folio_test_writeback(folio)) {
                /* Wait for in-flight writeback unless this is WB_SYNC_NONE. */
                if (wbc->sync_mode != WB_SYNC_NONE)
                        folio_wait_writeback(folio);
                else
                        return false;
        }
        BUG_ON(folio_test_writeback(folio));

        return folio_clear_dirty_for_io(folio);
}

/*
 * Pick the pagecache tag the writeback walk looks up: TOWRITE for
 * WB_SYNC_ALL or tagged_writepages writeback, DIRTY otherwise.
 */
static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
{
        if (wbc->sync_mode != WB_SYNC_ALL && !wbc->tagged_writepages)
                return PAGECACHE_TAG_DIRTY;
        return PAGECACHE_TAG_TOWRITE;
}

/*
 * Last page index the writeback walk may visit: the page holding
 * wbc->range_end, or -1 converted to pgoff_t for range_cyclic writeback.
 */
static pgoff_t wbc_end(struct writeback_control *wbc)
{
        if (!wbc->range_cyclic)
                return wbc->range_end >> PAGE_SHIFT;
        return -1;
}

/*
 * Return the next folio to write back, locked and prepared by
 * folio_prepare_writeback(), or NULL once the tagged lookup finds no more
 * folios up to wbc_end().  Folios that fail preparation are unlocked and
 * skipped.
 */
static struct folio *writeback_get_folio(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct folio *folio;

        for (;;) {
                folio = folio_batch_next(&wbc->fbatch);
                if (!folio) {
                        /* Batch exhausted: release it and look up a new one. */
                        folio_batch_release(&wbc->fbatch);
                        cond_resched();
                        filemap_get_folios_tag(mapping, &wbc->index,
                                        wbc_end(wbc), wbc_to_tag(wbc),
                                        &wbc->fbatch);
                        folio = folio_batch_next(&wbc->fbatch);
                        if (!folio)
                                return NULL;
                }

                folio_lock(folio);
                if (unlikely(!folio_prepare_writeback(mapping, wbc, folio))) {
                        folio_unlock(folio);
                        continue;
                }
                break;
        }

        trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
        return folio;
}

/**
 * writeback_iter - iterate folios of a mapping for writeback
 * @mapping: address space structure to write
 * @wbc: writeback context
 * @folio: previously iterated folio (%NULL to start)
 * @error: in-out pointer for writeback errors (see below)
 *
 * This function returns the next folio for the writeback operation described by
 * @wbc on @mapping and should be called in a while loop in the ->writepages
 * implementation.
 *
 * To start the writeback operation, %NULL is passed in the @folio argument, and
 * for every subsequent iteration the folio returned previously should be passed
 * back in.
 *
 * If there was an error in the per-folio writeback inside the writeback_iter()
 * loop, @error should be set to the error value.
 *
 * Once the writeback described in @wbc has finished, this function will return
 * %NULL and if there was an error in any iteration restore it to @error.
 *
 * Note: callers should not manually break out of the loop using break or goto
 * but must keep calling writeback_iter() until it returns %NULL.
 *
 * Return: the folio to write or %NULL if the loop is done.
 */
struct folio *writeback_iter(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio, int *error)
{
        if (!folio) {
                /* First call of the loop: reset the iteration state in @wbc. */
                folio_batch_init(&wbc->fbatch);
                wbc->saved_err = *error = 0;

                /*
                 * For range cyclic writeback we remember where we stopped so
                 * that we can continue where we stopped.
                 *
                 * For non-cyclic writeback we always start at the beginning of
                 * the passed in range.
                 */
                if (wbc->range_cyclic)
                        wbc->index = mapping->writeback_index;
                else
                        wbc->index = wbc->range_start >> PAGE_SHIFT;

                /*
                 * To avoid livelocks when other processes dirty new pages, we
                 * first tag pages which should be written back and only then
                 * start writing them.
                 *
                 * For data-integrity writeback we have to be careful so that we
                 * do not miss some pages (e.g., because some other process has
                 * cleared the TOWRITE tag we set).  The rule we follow is that
                 * TOWRITE tag can be cleared only by the process clearing the
                 * DIRTY tag (and submitting the page for I/O).
                 */
                if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
                        tag_pages_for_writeback(mapping, wbc->index,
                                        wbc_end(wbc));
        } else {
                /* Charge the folio passed back in against the page budget. */
                wbc->nr_to_write -= folio_nr_pages(folio);

                /* Warn once if the caller reported a positive value. */
                WARN_ON_ONCE(*error > 0);

                /*
                 * For integrity writeback we have to keep going until we have
                 * written all the folios we tagged for writeback above, even if
                 * we run past wbc->nr_to_write or encounter errors.
                 * We stash away the first error we encounter in wbc->saved_err
                 * so that it can be retrieved when we're done.  This is because
                 * the file system may still have state to clear for each folio.
                 *
                 * For background writeback we exit as soon as we run past
                 * wbc->nr_to_write or encounter the first error.
                 */
                if (wbc->sync_mode == WB_SYNC_ALL) {
                        if (*error && !wbc->saved_err)
                                wbc->saved_err = *error;
                } else {
                        if (*error || wbc->nr_to_write <= 0)
                                goto done;
                }
        }

        folio = writeback_get_folio(mapping, wbc);
        if (!folio) {
                /*
                 * To avoid deadlocks between range_cyclic writeback and callers
                 * that hold pages in PageWriteback to aggregate I/O until
                 * the writeback iteration finishes, we do not loop back to the
                 * start of the file.  Doing so causes a page lock/page
                 * writeback access order inversion - we should only ever lock
                 * multiple pages in ascending page->index order, and looping
                 * back to the start of the file violates that rule and causes
                 * deadlocks.
                 */
                if (wbc->range_cyclic)
                        mapping->writeback_index = 0;

                /*
                 * Return the first error we encountered (if there was any) to
                 * the caller.
                 */
                *error = wbc->saved_err;
        }
        return folio;

done:
        /*
         * Early exit, reached only when sync_mode is not WB_SYNC_ALL.  For
         * range cyclic writeback, record the index following the last folio
         * so that the next pass starts from there, then drop whatever is
         * left in the batch.  *error is left as the caller set it.
         */
        if (wbc->range_cyclic)
                mapping->writeback_index = folio_next_index(folio);
        folio_batch_release(&wbc->fbatch);
        return NULL;
}
EXPORT_SYMBOL_GPL(writeback_iter);

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * Return: %0 on success, negative error code otherwise
 *
 * Note: please use writeback_iter() instead.
 */
int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data)
{
        struct folio *folio;
        int error;

        /* writeback_iter() zeroes error on the initial (NULL folio) call. */
        for (folio = writeback_iter(mapping, wbc, NULL, &error);
             folio;
             folio = writeback_iter(mapping, wbc, folio, &error)) {
                error = writepage(folio, wbc, data);
                if (error != AOP_WRITEPAGE_ACTIVATE)
                        continue;

                /* Not an error: unlock the folio here and carry on. */
                folio_unlock(folio);
                error = 0;
        }

        return error;
}
EXPORT_SYMBOL(write_cache_pages);

/*
 * Write back @mapping one folio at a time through ->writepage, inside a
 * block plug.  Each per-folio result is recorded on the mapping with
 * mapping_set_error(); the value left in err by the final writeback_iter()
 * call is returned.
 */
static int writeback_use_writepage(struct address_space *mapping,
                struct writeback_control *wbc)
{
        struct blk_plug plug;
        struct folio *folio;
        int err;

        blk_start_plug(&plug);
        folio = writeback_iter(mapping, wbc, NULL, &err);
        while (folio) {
                err = mapping->a_ops->writepage(&folio->page, wbc);
                if (err == AOP_WRITEPAGE_ACTIVATE) {
                        /* Not an error: unlock the folio here and carry on. */
                        folio_unlock(folio);
                        err = 0;
                }
                mapping_set_error(mapping, err);
                folio = writeback_iter(mapping, wbc, folio, &err);
        }
        blk_finish_plug(&plug);

        return err;
}

/*
 * Write back @mapping as described by @wbc, using ->writepages when the
 * mapping provides it and falling back to ->writepage otherwise.  Returns 0
 * immediately when wbc->nr_to_write is not positive; otherwise returns the
 * result of the last writeback attempt.
 */
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
        struct bdi_writeback *wb;
        int ret;

        if (wbc->nr_to_write <= 0)
                return 0;

        wb = inode_to_wb_wbc(mapping->host, wbc);
        wb_bandwidth_estimate_start(wb);

        for (;;) {
                /* chardevs and other special files have neither method */
                ret = 0;
                if (mapping->a_ops->writepages)
                        ret = mapping->a_ops->writepages(mapping, wbc);
                else if (mapping->a_ops->writepage)
                        ret = writeback_use_writepage(mapping, wbc);

                /* Only -ENOMEM under WB_SYNC_ALL is retried. */
                if (!(ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL))
                        break;

                /*
                 * Lacking an allocation context or the locality or writeback
                 * state of any of the inode's pages, throttle based on
                 * writeback activity on the local node. It's as good a
                 * guess as any.
                 */
                reclaim_throttle(NODE_DATA(numa_node_id()),
                        VMSCAN_THROTTLE_WRITEBACK);
        }

        /*
         * Usually few pages are written by now from those we've just submitted
         * but if there's constant writeback being submitted, this makes sure
         * writeback bandwidth is updated once in a while.
         */
        if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
                                   BANDWIDTH_INTERVAL))
                wb_update_bandwidth(wb);

        return ret;
}

/*
 * For address_spaces which do not use buffers nor write back.
 */
bool noop_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        /* Cheap unlocked check first; only then the atomic test-and-set. */
        if (folio_test_dirty(folio))
                return false;

        /* True only if this call is the one that set the dirty flag. */
        return !folio_test_set_dirty(folio);
}
EXPORT_SYMBOL(noop_dirty_folio);

/*
 * Helper function for set_page_dirty family.
 *
 * NOTE: This relies on being atomic wrt interrupts.
 */
static void folio_account_dirtied(struct folio *folio,
                struct address_space *mapping)
{
        struct inode *inode = mapping->host;
        struct bdi_writeback *wb;
        long nr;

        trace_writeback_dirty_folio(folio, mapping);

        /* Nothing to account when the mapping cannot do writeback. */
        if (!mapping_can_writeback(mapping))
                return;

        nr = folio_nr_pages(folio);

        inode_attach_wb(inode, folio);
        wb = inode_to_wb(inode);

        /* Memory, per-wb and per-task dirty counters, in units of pages. */
        __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
        __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
        __node_stat_mod_folio(folio, NR_DIRTIED, nr);
        wb_stat_mod(wb, WB_RECLAIMABLE, nr);
        wb_stat_mod(wb, WB_DIRTIED, nr);
        task_io_account_write(nr * PAGE_SIZE);
        current->nr_dirtied += nr;
        __this_cpu_add(bdp_ratelimits, nr);

        mem_cgroup_track_foreign_dirty(folio, wb);
}

/*
 * Helper function for deaccounting dirty page without writeback.
 *
 */
void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
{
        long nr_pages = folio_nr_pages(folio);

        /* Take the folio's pages back out of the dirty counters. */
        lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr_pages);
        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr_pages);
        wb_stat_mod(wb, WB_RECLAIMABLE, -nr_pages);

        /* The write is cancelled rather than performed. */
        task_io_account_cancelled_write(nr_pages * PAGE_SIZE);
}

/*
 * Mark the folio dirty, and set it dirty in the page cache.
 *
 * If warn is true, then emit a warning if the folio is not uptodate and has
 * not been truncated.
 *
 * It is the caller's responsibility to prevent the folio from being truncated
 * while this function is in progress, although it may have been truncated
 * before this function is called.  Most callers have the folio locked.
 * A few have the folio blocked from truncation through other means (e.g.
 * zap_vma_pages() has it mapped and is holding the page table lock).
 * When called from mark_buffer_dirty(), the filesystem should hold a
 * reference to the buffer_head that is being marked dirty, which causes
 * try_to_free_buffers() to fail.
 */
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
                             int warn)
{
        unsigned long irq_flags;

        xa_lock_irqsave(&mapping->i_pages, irq_flags);

        /* A cleared folio->mapping means we raced with truncate. */
        if (!folio->mapping)
                goto out_unlock;

        WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
        folio_account_dirtied(folio, mapping);
        __xa_set_mark(&mapping->i_pages, folio_index(folio),
                        PAGECACHE_TAG_DIRTY);

out_unlock:
        xa_unlock_irqrestore(&mapping->i_pages, irq_flags);
}

/**
 * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
 * @mapping: Address space this folio belongs to.
 * @folio: Folio to be marked as dirty.
 *
 * Filesystems which do not use buffer heads should call this function
 * from their dirty_folio address space operation.  It ignores the
 * contents of folio_get_private(), so if the filesystem marks individual
 * blocks as dirty, the filesystem should handle that itself.
 *
 * This is also sometimes used by filesystems which use buffer_heads when
 * a single buffer is being dirtied: we want to set the folio dirty in
 * that case, but not all the buffers.  This is a "bottom-up" dirtying,
 * whereas block_dirty_folio() is a "top-down" dirtying.
 *
 * The caller must ensure this doesn't race with truncation.  Most will
 * simply hold the folio lock, but e.g. zap_pte_range() calls with the
 * folio mapped and the pte lock held, which also locks out truncation.
 */
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        bool was_dirty = folio_test_set_dirty(folio);

        if (!was_dirty) {
                /* Newly dirtied: tag it in the page cache and account it. */
                __folio_mark_dirty(folio, mapping, !folio_test_private(folio));

                /* !PageAnon && !swapper_space */
                if (mapping->host)
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }

        return !was_dirty;
}
EXPORT_SYMBOL(filemap_dirty_folio);

/**
 * folio_redirty_for_writepage - Decline to write a dirty folio.
 * @wbc: The writeback control.
 * @folio: The folio.
 *
 * When a writepage implementation decides that it doesn't want to write
 * @folio for some reason, it should call this function, unlock @folio and
 * return 0.
 *
 * Return: True if we redirtied the folio.  False if someone else dirtied
 * it first.
 */
bool folio_redirty_for_writepage(struct writeback_control *wbc,
                struct folio *folio)
{
        struct address_space *mapping = folio->mapping;
        struct wb_lock_cookie cookie = {};
        long nr = folio_nr_pages(folio);
        struct bdi_writeback *wb;
        struct inode *inode;
        bool redirtied;

        wbc->pages_skipped += nr;
        redirtied = filemap_dirty_folio(mapping, folio);

        if (!mapping || !mapping_can_writeback(mapping))
                return redirtied;

        /*
         * Subtract the folio's pages from the task, node and wb "dirtied"
         * counters, i.e. the ones folio_account_dirtied() increments.
         */
        inode = mapping->host;
        wb = unlocked_inode_to_wb_begin(inode, &cookie);
        current->nr_dirtied -= nr;
        node_stat_mod_folio(folio, NR_DIRTIED, -nr);
        wb_stat_mod(wb, WB_DIRTIED, -nr);
        unlocked_inode_to_wb_end(inode, &cookie);

        return redirtied;
}
EXPORT_SYMBOL(folio_redirty_for_writepage);

/**
 * folio_mark_dirty - Mark a folio as being modified.
 * @folio: The folio.
 *
 * The folio may not be truncated while this function is running.
 * Holding the folio lock is sufficient to prevent truncation, but some
 * callers cannot acquire a sleeping lock.  These callers instead hold
 * the page table lock for a page table which contains at least one page
 * in this folio.  Truncation will block on the page table lock as it
 * unmaps pages before removing the folio from its mapping.
 *
 * Return: True if the folio was newly dirtied, false if it was already dirty.
 */
bool folio_mark_dirty(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        /* No mapping: only the folio's own dirty flag can be set. */
        if (!likely(mapping))
                return noop_dirty_folio(mapping, folio);

        /*
         * readahead/folio_deactivate could remain
         * PG_readahead/PG_reclaim due to race with folio_end_writeback
         * About readahead, if the folio is written, the flags would be
         * reset. So no problem.
         * About folio_deactivate, if the folio is redirtied,
         * the flag will be reset. So no problem. but if the
         * folio is used by readahead it will confuse readahead
         * and make it restart the size rampup process. But it's
         * a trivial problem.
         */
        if (folio_test_reclaim(folio))
                folio_clear_reclaim(folio);

        /* Delegate to the mapping's own dirty_folio operation. */
        return mapping->a_ops->dirty_folio(mapping, folio);
}
EXPORT_SYMBOL(folio_mark_dirty);

/*
 * folio_mark_dirty() is racy if the caller has no reference against
 * folio->mapping->host, and if the folio is unlocked.  This is because another
 * CPU could truncate the folio off the mapping and then free the mapping.
 *
 * Usually, the folio _is_ locked, or the caller is a user-space process which
 * holds a reference on the inode by having an open file.
 *
 * In other cases, the folio should be locked before running folio_mark_dirty().
 */
bool folio_mark_dirty_lock(struct folio *folio)
{
        bool newly_dirtied;

        /* Hold the folio lock across the call, as described above. */
        folio_lock(folio);
        newly_dirtied = folio_mark_dirty(folio);
        folio_unlock(folio);

        return newly_dirtied;
}
EXPORT_SYMBOL(folio_mark_dirty_lock);

/*
 * This cancels just the dirty bit on the kernel page itself, it does NOT
 * actually remove dirty bits on any mmap's that may be around. It also
 * leaves the page tagged dirty, so any sync activity will still find it on
 * the dirty lists, and in particular, clear_page_dirty_for_io() will still
 * look at the dirty bits in the VM.
 *
 * Doing this should *normally* only ever be done when a page is truncated,
 * and is not actually mapped anywhere at all. However, fs/buffer.c does
 * this when it notices that somebody has cleaned out all the buffers on a
 * page without actually doing it through the VM. Can you say "ext3 is
 * horribly ugly"? Thought you could.
 */
void __folio_cancel_dirty(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);
        struct wb_lock_cookie cookie = {};
        struct bdi_writeback *wb;
        struct inode *inode;

        /* Without writeback there is no accounting: just clear the flag. */
        if (!mapping_can_writeback(mapping)) {
                folio_clear_dirty(folio);
                return;
        }

        inode = mapping->host;
        wb = unlocked_inode_to_wb_begin(inode, &cookie);

        /* Only undo the accounting if the flag was actually set. */
        if (folio_test_clear_dirty(folio))
                folio_account_cleaned(folio, wb);

        unlocked_inode_to_wb_end(inode, &cookie);
}
EXPORT_SYMBOL(__folio_cancel_dirty);

/*
 * Clear a folio's dirty flag, while caring for dirty memory accounting.
 * Returns true if the folio was previously dirty.
 *
 * This is for preparing to put the folio under writeout.  We leave
 * the folio tagged as dirty in the xarray so that a concurrent
 * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
 * The ->writepage implementation will run either folio_start_writeback()
 * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
 * and xarray dirty tag back into sync.
 *
 * This incoherency between the folio's dirty flag and xarray tag is
 * unfortunate, but it only exists while the folio is locked.
 */
bool folio_clear_dirty_for_io(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);
        bool ret = false;

        /* The folio must be locked by the caller. */
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (mapping && mapping_can_writeback(mapping)) {
                struct inode *inode = mapping->host;
                struct bdi_writeback *wb;
                struct wb_lock_cookie cookie = {};

                /*
                 * Yes, Virginia, this is indeed insane.
                 *
                 * We use this sequence to make sure that
                 *  (a) we account for dirty stats properly
                 *  (b) we tell the low-level filesystem to
                 *      mark the whole folio dirty if it was
                 *      dirty in a pagetable. Only to then
                 *  (c) clean the folio again and return true to
                 *      cause the writeback.
                 *
                 * This way we avoid all nasty races with the
                 * dirty bit in multiple places and clearing
                 * them concurrently from different threads.
                 *
                 * Note! Normally the "folio_mark_dirty(folio)"
                 * has no effect on the actual dirty bit - since
                 * that will already usually be set. But we
                 * need the side effects, and it can help us
                 * avoid races.
                 *
                 * We basically use the folio "master dirty bit"
                 * as a serialization point for all the different
                 * threads doing their things.
                 */
                if (folio_mkclean(folio))
                        folio_mark_dirty(folio);
                /*
                 * We carefully synchronise fault handlers against
                 * installing a dirty pte and marking the folio dirty
                 * at this point.  We do this by having them hold the
                 * page lock while dirtying the folio, and folios are
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
                wb = unlocked_inode_to_wb_begin(inode, &cookie);
                if (folio_test_clear_dirty(folio)) {
                        long nr = folio_nr_pages(folio);
                        /*
                         * Same three counters folio_account_cleaned()
                         * decrements, but without cancelling the task's
                         * write accounting.
                         */
                        lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
                        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
                        wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
                        ret = true;
                }
                unlocked_inode_to_wb_end(inode, &cookie);
                return ret;
        }
        /*
         * No mapping, or the mapping cannot do writeback: there is no dirty
         * accounting to adjust, so only the folio flag is cleared.
         */
        return folio_test_clear_dirty(folio);
}
EXPORT_SYMBOL(folio_clear_dirty_for_io);

/*
 * Increment @wb's writeback_inodes counter.  Called from
 * __folio_start_writeback() when the mapping had no entry tagged
 * PAGECACHE_TAG_WRITEBACK before the current folio.
 */
static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
        atomic_inc(&wb->writeback_inodes);
}

/*
 * Decrement @wb's writeback_inodes counter and, if @wb is still registered,
 * queue its delayed bandwidth-update work.
 */
static void wb_inode_writeback_end(struct bdi_writeback *wb)
{
        unsigned long irq_flags;

        atomic_dec(&wb->writeback_inodes);

        /*
         * Make sure estimate of writeback throughput gets updated after
         * writeback completed. We delay the update by BANDWIDTH_INTERVAL
         * (which is the interval other bandwidth updates use for batching) so
         * that if multiple inodes end writeback at a similar time, they get
         * batched into one bandwidth update.
         */
        spin_lock_irqsave(&wb->work_lock, irq_flags);
        if (test_bit(WB_registered, &wb->state))
                queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
        spin_unlock_irqrestore(&wb->work_lock, irq_flags);
}

/*
 * Finish writeback accounting for @folio: toggle PG_writeback, drop the
 * WRITEBACK tag from the page cache when the mapping uses writeback tags,
 * and update the per-wb and global writeback counters.
 *
 * Returns the result of folio_xor_flags_has_waiters(); presumably this says
 * whether someone is waiting on the folio - confirm against its definition.
 */
bool __folio_end_writeback(struct folio *folio)
{
        long nr = folio_nr_pages(folio);
        struct address_space *mapping = folio_mapping(folio);
        bool ret;

        if (mapping && mapping_use_writeback_tags(mapping)) {
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;

                /* Flag and tag are changed together under the i_pages lock. */
                xa_lock_irqsave(&mapping->i_pages, flags);
                ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
                __xa_clear_mark(&mapping->i_pages, folio_index(folio),
                                        PAGECACHE_TAG_WRITEBACK);
                if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                        struct bdi_writeback *wb = inode_to_wb(inode);

                        wb_stat_mod(wb, WB_WRITEBACK, -nr);
                        __wb_writeout_add(wb, nr);
                        /* No entry is tagged WRITEBACK in this mapping any more. */
                        if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
                                wb_inode_writeback_end(wb);
                }

                /* Same "last writeback entry" check, for the superblock side. */
                if (mapping->host && !mapping_tagged(mapping,
                                                     PAGECACHE_TAG_WRITEBACK))
                        sb_clear_inode_writeback(mapping->host);

                xa_unlock_irqrestore(&mapping->i_pages, flags);
        } else {
                /* No writeback tags to maintain: only toggle the folio flag. */
                ret = folio_xor_flags_has_waiters(folio, 1 << PG_writeback);
        }

        /* These counters are updated on both paths. */
        lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
        node_stat_mod_folio(folio, NR_WRITTEN, nr);

        return ret;
}

/*
 * Mark @folio as under writeback: set PG_writeback, set the WRITEBACK tag in
 * the page cache when the mapping uses writeback tags, and update the per-wb
 * and global writeback counters.  The folio must be locked and must not
 * already be under writeback.
 *
 * @keep_write: when false, the TOWRITE tag is cleared for this folio; when
 * true it is left in place.
 */
void __folio_start_writeback(struct folio *folio, bool keep_write)
{
        long nr = folio_nr_pages(folio);
        struct address_space *mapping = folio_mapping(folio);
        int access_ret;

        VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
        VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

        if (mapping && mapping_use_writeback_tags(mapping)) {
                XA_STATE(xas, &mapping->i_pages, folio_index(folio));
                struct inode *inode = mapping->host;
                struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
                bool on_wblist;

                xas_lock_irqsave(&xas, flags);
                /*
                 * NOTE(review): presumably positions xas on the folio's entry
                 * so the mark operations below act on it - confirm against
                 * the XArray API.
                 */
                xas_load(&xas);
                folio_test_set_writeback(folio);

                /*
                 * Sampled before our own mark is set, so this tells whether
                 * the mapping already had an entry tagged WRITEBACK.
                 */
                on_wblist = mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);

                xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
                if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
                        struct bdi_writeback *wb = inode_to_wb(inode);

                        wb_stat_mod(wb, WB_WRITEBACK, nr);
                        if (!on_wblist)
                                wb_inode_writeback_start(wb);
                }

                /*
                 * We can come through here when swapping anonymous
                 * folios, so we don't necessarily have an inode to
                 * track for sync.
                 */
                if (mapping->host && !on_wblist)
                        sb_mark_inode_writeback(mapping->host);
                /* Bring the DIRTY tag back in line with the folio's flag. */
                if (!folio_test_dirty(folio))
                        xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
                if (!keep_write)
                        xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
                xas_unlock_irqrestore(&xas, flags);
        } else {
                /* No writeback tags to maintain: only set the folio flag. */
                folio_test_set_writeback(folio);
        }

        /* These counters are updated on both paths. */
        lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
        zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);

        access_ret = arch_make_folio_accessible(folio);
        /*
         * If writeback has been triggered on a page that cannot be made
         * accessible, it is too late to recover here.
         */
        VM_BUG_ON_FOLIO(access_ret != 0, folio);
}
EXPORT_SYMBOL(__folio_start_writeback);

/**
 * folio_wait_writeback - Block until writeback of a folio has completed.
 * @folio: The folio whose writeback to wait for.
 *
 * Each time the writeback flag is observed set, the wait is traced and
 * the caller sleeps on PG_writeback; the flag is then re-checked, and the
 * function returns once it is observed clear.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 */
void folio_wait_writeback(struct folio *folio)
{
        for (;;) {
                if (!folio_test_writeback(folio))
                        return;

                trace_folio_wait_writeback(folio, folio_mapping(folio));
                folio_wait_bit(folio, PG_writeback);
        }
}
EXPORT_SYMBOL_GPL(folio_wait_writeback);

/**
 * folio_wait_writeback_killable - Killable wait for folio writeback.
 * @folio: The folio whose writeback to wait for.
 *
 * Same loop as folio_wait_writeback(), except that each sleep uses
 * folio_wait_bit_killable(); a non-zero result from that sleep ends the
 * wait early.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
 */
int folio_wait_writeback_killable(struct folio *folio)
{
        int err = 0;

        while (!err && folio_test_writeback(folio)) {
                trace_folio_wait_writeback(folio, folio_mapping(folio));
                /* Any non-zero result of the killable wait is reported as -EINTR. */
                if (folio_wait_bit_killable(folio, PG_writeback))
                        err = -EINTR;
        }

        return err;
}
EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);

/**
 * folio_wait_stable() - wait for writeback only if stable writes are needed.
 * @folio: The folio to wait on.
 *
 * Asks mapping_stable_writes() about the folio's mapping.  If the mapping
 * requires folio contents to be held stable during writeback, wait for
 * any pending writeback to complete; otherwise return immediately.
 *
 * Context: Sleeps.  Must be called in process context and with
 * no spinlocks held.  Caller should hold a reference on the folio.
 * If the folio is not locked, writeback may start again after writeback
 * has finished.
 */
void folio_wait_stable(struct folio *folio)
{
        /* Nothing to wait for unless the mapping asks for stable writes. */
        if (!mapping_stable_writes(folio_mapping(folio)))
                return;

        folio_wait_writeback(folio);
}
EXPORT_SYMBOL_GPL(folio_wait_stable);



















































































  351 






































  209 





























  243 




























  244 


























    3 



















































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2022 Christian Brauner <brauner@kernel.org> */

#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
#include <linux/seq_file.h>

#include "internal.h"

/*
 * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
 * never from raw values. These are just internal helpers.
 *
 * Each macro expands to a compound literal of the corresponding type
 * initialized with @val. In this file they wrap the result of
 * map_id_down() in make_vfsuid() and make_vfsgid().
 */
#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }

/*
 * struct mnt_idmap - a mount's idmapping
 * @uid_map: uid mapping; alloc_mnt_idmap() fills it from a user namespace's
 *           uid_map
 * @gid_map: gid mapping; alloc_mnt_idmap() fills it from a user namespace's
 *           gid_map
 * @count: reference count; set to 1 on allocation, changed by
 *         mnt_idmap_get() and mnt_idmap_put(), which free the structure
 *         when the last reference is dropped
 */
struct mnt_idmap {
        struct uid_gid_map uid_map;
        struct uid_gid_map gid_map;
        refcount_t count;
};

/*
 * Carries the initial idmapping of 0:0:4294967295 which is an identity
 * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is
 * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
 *
 * The helpers in this file recognize this object by address: the mapping
 * helpers return their input unchanged for it, and mnt_idmap_get() and
 * mnt_idmap_put() never touch its reference count.
 */
struct mnt_idmap nop_mnt_idmap = {
        .count        = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);

/*
 * Carries the invalid idmapping of a full 0-4294967295 {g,u}id range.
 * This means that all {g,u}ids are mapped to INVALID_VFS{G,U}ID.
 *
 * The helpers in this file recognize this object by address: the mapping
 * helpers return the matching INVALID_* value for it without consulting
 * any map, and mnt_idmap_get() and mnt_idmap_put() never touch its
 * reference count.
 */
struct mnt_idmap invalid_mnt_idmap = {
        .count        = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(invalid_mnt_idmap);

/**
 * initial_idmapping - check whether this is the initial mapping
 * @ns: user namespace whose idmapping is checked
 *
 * Check whether this is the initial mapping, mapping 0 to 0, 1 to 1,
 * [...], 1000 to 1000 [...].
 *
 * The check is a pointer comparison against &init_user_ns; the contents
 * of the namespace's maps are not inspected.
 *
 * Return: true if this is the initial mapping, false if not.
 */
static inline bool initial_idmapping(const struct user_namespace *ns)
{
        return ns == &init_user_ns;
}

/**
 * make_vfsuid - map a filesystem kuid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kuid : kuid to be mapped
 *
 * Take a @kuid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kuid to be reported to userspace.
 *
 * If @idmap is &nop_mnt_idmap the mount is not idmapped and @kuid is
 * returned unchanged. If @idmap is &invalid_mnt_idmap nothing can be
 * mapped and INVALID_VFSUID is returned.
 * If initial_idmapping() tells us that the filesystem is not mounted with
 * an idmapping, the value of @kuid won't change when calling from_kuid(),
 * so the raw value is retrieved via __kuid_val() directly.
 *
 * Return: @kuid mapped according to @idmap.
 * If @kuid has no mapping in @fs_userns, INVALID_VFSUID is returned.
 */
vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
                     struct user_namespace *fs_userns,
                     kuid_t kuid)
{
        uid_t fs_uid;

        if (idmap == &nop_mnt_idmap)
                return VFSUIDT_INIT(kuid);
        if (idmap == &invalid_mnt_idmap)
                return INVALID_VFSUID;

        /* Translate the kuid into the filesystem's own id space first. */
        fs_uid = initial_idmapping(fs_userns) ? __kuid_val(kuid)
                                              : from_kuid(fs_userns, kuid);
        if (fs_uid == (uid_t)-1)
                return INVALID_VFSUID;

        /* Then map that id down through the mount's uid map. */
        return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, fs_uid));
}
EXPORT_SYMBOL_GPL(make_vfsuid);

/**
 * make_vfsgid - map a filesystem kgid according to an idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @kgid : kgid to be mapped
 *
 * Take a @kgid and remap it from @fs_userns into @idmap. Use this
 * function when preparing a @kgid to be reported to userspace.
 *
 * If @idmap is &nop_mnt_idmap the mount is not idmapped and @kgid is
 * returned unchanged. If @idmap is &invalid_mnt_idmap nothing can be
 * mapped and INVALID_VFSGID is returned.
 * If initial_idmapping() tells us that the filesystem is not mounted with
 * an idmapping, the value of @kgid won't change when calling from_kgid(),
 * so the raw value is retrieved via __kgid_val() directly.
 *
 * Return: @kgid mapped according to @idmap.
 * If @kgid has no mapping in @fs_userns, INVALID_VFSGID is returned.
 */
vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
                     struct user_namespace *fs_userns, kgid_t kgid)
{
        gid_t fs_gid;

        if (idmap == &nop_mnt_idmap)
                return VFSGIDT_INIT(kgid);
        if (idmap == &invalid_mnt_idmap)
                return INVALID_VFSGID;

        /* Translate the kgid into the filesystem's own id space first. */
        fs_gid = initial_idmapping(fs_userns) ? __kgid_val(kgid)
                                              : from_kgid(fs_userns, kgid);
        if (fs_gid == (gid_t)-1)
                return INVALID_VFSGID;

        /* Then map that id down through the mount's gid map. */
        return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, fs_gid));
}
EXPORT_SYMBOL_GPL(make_vfsgid);

/**
 * from_vfsuid - map a vfsuid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsuid : vfsuid to be mapped
 *
 * Map @vfsuid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsuid to inode->i_uid.
 *
 * For &nop_mnt_idmap the value is passed through unchanged; for
 * &invalid_mnt_idmap, or when the mount's uid map has no entry for
 * @vfsuid, INVALID_UID is returned.
 *
 * Return: @vfsuid mapped into the filesystem idmapping
 */
kuid_t from_vfsuid(struct mnt_idmap *idmap,
                   struct user_namespace *fs_userns, vfsuid_t vfsuid)
{
        uid_t fs_uid;

        if (idmap == &nop_mnt_idmap)
                return AS_KUIDT(vfsuid);
        if (idmap == &invalid_mnt_idmap)
                return INVALID_UID;

        /* Undo the mount's mapping to get the id as the filesystem sees it. */
        fs_uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
        if (fs_uid == (uid_t)-1)
                return INVALID_UID;

        /* With the initial idmapping the raw value already is the kuid. */
        return initial_idmapping(fs_userns) ? KUIDT_INIT(fs_uid)
                                            : make_kuid(fs_userns, fs_uid);
}
EXPORT_SYMBOL_GPL(from_vfsuid);

/**
 * from_vfsgid - map a vfsgid into the filesystem idmapping
 * @idmap: the mount's idmapping
 * @fs_userns: the filesystem's idmapping
 * @vfsgid : vfsgid to be mapped
 *
 * Map @vfsgid into the filesystem idmapping. This function has to be used in
 * order to e.g. write @vfsgid to inode->i_gid.
 *
 * For &nop_mnt_idmap the value is passed through unchanged; for
 * &invalid_mnt_idmap, or when the mount's gid map has no entry for
 * @vfsgid, INVALID_GID is returned.
 *
 * Return: @vfsgid mapped into the filesystem idmapping
 */
kgid_t from_vfsgid(struct mnt_idmap *idmap,
                   struct user_namespace *fs_userns, vfsgid_t vfsgid)
{
        gid_t fs_gid;

        if (idmap == &nop_mnt_idmap)
                return AS_KGIDT(vfsgid);
        if (idmap == &invalid_mnt_idmap)
                return INVALID_GID;

        /* Undo the mount's mapping to get the id as the filesystem sees it. */
        fs_gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
        if (fs_gid == (gid_t)-1)
                return INVALID_GID;

        /* With the initial idmapping the raw value already is the kgid. */
        return initial_idmapping(fs_userns) ? KGIDT_INIT(fs_gid)
                                            : make_kgid(fs_userns, fs_gid);
}
EXPORT_SYMBOL_GPL(from_vfsgid);

/**
 * vfsgid_in_group_p() - check whether a vfsgid matches the caller's groups
 * @vfsgid: the mnt gid to match
 *
 * This function can be used to determine whether @vfsgid matches any of the
 * caller's groups. Without CONFIG_MULTIUSER every @vfsgid is reported as
 * matching.
 *
 * Return: 1 if vfsgid matches caller's groups, 0 if not.
 */
int vfsgid_in_group_p(vfsgid_t vfsgid)
{
#ifdef CONFIG_MULTIUSER
        return in_group_p(AS_KGIDT(vfsgid));
#else
        return 1;
#endif
}
EXPORT_SYMBOL_GPL(vfsgid_in_group_p);

/*
 * copy_mnt_idmap - copy one uid/gid map
 * @map_from: source map
 * @map_to: destination map
 *
 * Maps with at most UID_GID_MAP_MAX_BASE_EXTENTS extents are copied by
 * plain structure assignment. Larger maps get their own duplicates of the
 * forward and reverse extent arrays.
 *
 * Return: 0 on success, -EINVAL if @map_from has no extents, -ENOMEM if
 * duplicating an extent array fails.
 */
static int copy_mnt_idmap(struct uid_gid_map *map_from,
                          struct uid_gid_map *map_to)
{
        struct uid_gid_extent *forward, *reverse;
        u32 nr_extents = READ_ONCE(map_from->nr_extents);
        /* Pairs with smp_wmb() when writing the idmapping. */
        smp_rmb();

        /*
         * Don't blindly copy @map_from into @map_to. If @nr_extents was
         * zero when we read it, no idmapping had been written at that
         * point, but someone could be writing one right now and a copy
         * might then end up with inconsistent data. So just don't do
         * anything at all.
         */
        if (nr_extents == 0)
                return -EINVAL;

        /*
         * Here we know that nr_extents is greater than zero which means
         * a map has been written. Since idmappings can't be changed
         * once they have been written we know that we can safely copy
         * from @map_from into @map_to.
         */

        /* Small maps are copied with a plain structure assignment. */
        if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                *map_to = *map_from;
                return 0;
        }

        /* Larger maps: duplicate the forward and reverse extent arrays. */
        forward = kmemdup_array(map_from->forward, nr_extents,
                                sizeof(struct uid_gid_extent),
                                GFP_KERNEL_ACCOUNT);
        if (!forward)
                return -ENOMEM;

        reverse = kmemdup_array(map_from->reverse, nr_extents,
                                sizeof(struct uid_gid_extent),
                                GFP_KERNEL_ACCOUNT);
        if (!reverse) {
                /* Don't leak the forward copy made above. */
                kfree(forward);
                return -ENOMEM;
        }

        /*
         * The idmapping isn't exposed anywhere so we don't need to care
         * about ordering between extent pointers and @nr_extents
         * initialization.
         */
        map_to->forward = forward;
        map_to->reverse = reverse;
        map_to->nr_extents = nr_extents;
        return 0;
}

static void free_mnt_idmap(struct mnt_idmap *idmap)
{
        if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(idmap->uid_map.forward);
                kfree(idmap->uid_map.reverse);
        }
        if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(idmap->gid_map.forward);
                kfree(idmap->gid_map.reverse);
        }
        kfree(idmap);
}

/*
 * alloc_mnt_idmap - create an idmapping from a user namespace
 * @mnt_userns: user namespace whose uid and gid maps are copied
 *
 * Allocates a zeroed struct mnt_idmap holding one reference and copies
 * the uid map and then the gid map of @mnt_userns into it.
 *
 * Return: the new idmapping, ERR_PTR(-ENOMEM) if the allocation fails, or
 * an ERR_PTR() carrying the error returned by copy_mnt_idmap().
 */
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
{
        struct mnt_idmap *new;
        int err;

        new = kzalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
        if (!new)
                return ERR_PTR(-ENOMEM);

        refcount_set(&new->count, 1);

        err = copy_mnt_idmap(&mnt_userns->uid_map, &new->uid_map);
        if (err)
                goto out_free;

        err = copy_mnt_idmap(&mnt_userns->gid_map, &new->gid_map);
        if (err)
                goto out_free;

        return new;

out_free:
        /* Also releases whatever the uid copy may already have allocated. */
        free_mnt_idmap(new);
        return ERR_PTR(err);
}

/**
 * mnt_idmap_get - get a reference to an idmapping
 * @idmap: the idmap to bump the reference on
 *
 * If @idmap is neither @nop_mnt_idmap nor @invalid_mnt_idmap bump the
 * reference count. Those two static idmappings are not reference counted
 * here.
 *
 * Return: @idmap, with its reference count bumped unless it is
 * @nop_mnt_idmap or @invalid_mnt_idmap.
 */
struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap)
{
        if (idmap == &nop_mnt_idmap || idmap == &invalid_mnt_idmap)
                return idmap;

        refcount_inc(&idmap->count);
        return idmap;
}
EXPORT_SYMBOL_GPL(mnt_idmap_get);

/**
 * mnt_idmap_put - put a reference to an idmapping
 * @idmap: the idmap to put the reference on
 *
 * If @idmap is neither @nop_mnt_idmap nor @invalid_mnt_idmap, drop one
 * reference and free the idmapping if we're the last user.
 */
void mnt_idmap_put(struct mnt_idmap *idmap)
{
        if (idmap == &nop_mnt_idmap || idmap == &invalid_mnt_idmap)
                return;

        if (refcount_dec_and_test(&idmap->count))
                free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);

/**
 * statmount_mnt_idmap - write a mount's idmapping into a seq_file
 * @idmap: the mount's idmapping
 * @seq: seq_file the mappings are written to
 * @uid_map: true to emit the uid mappings, false to emit the gid mappings
 *
 * Every extent of the selected map whose whole range can be resolved in
 * the caller's user namespace is written as "<first> <lower> <count>",
 * where <lower> is the start of the range as seen by the caller. One
 * extra byte is consumed after each mapping as a separator. Extents that
 * cannot be resolved are skipped.
 *
 * Return: the number of mappings written, 0 if @idmap is not a valid
 * idmapping according to is_valid_mnt_idmap(), or -EAGAIN if @seq
 * overflowed.
 */
int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map)
{
        struct uid_gid_map *map, *map_up;
        u32 idx, nr_mappings;

        if (!is_valid_mnt_idmap(idmap))
                return 0;

        /*
         * Idmappings are shown relative to the caller's idmapping.
         * This is both the most intuitive and most useful solution.
         */
        if (uid_map) {
                map = &idmap->uid_map;
                map_up = &current_user_ns()->uid_map;
        } else {
                map = &idmap->gid_map;
                map_up = &current_user_ns()->gid_map;
        }

        for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) {
                uid_t lower;
                struct uid_gid_extent *extent;

                /* Small maps keep their extents inline, larger ones in ->forward. */
                if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        extent = &map->extent[idx];
                else
                        extent = &map->forward[idx];

                /*
                 * Verify that the whole range of the mapping can be
                 * resolved in the caller's idmapping. If it cannot be
                 * resolved skip the mapping.
                 */
                lower = map_id_range_up(map_up, extent->lower_first, extent->count);
                if (lower == (uid_t) -1)
                        continue;

                seq_printf(seq, "%u %u %u", extent->first, lower, extent->count);

                /*
                 * An overflowing seq_printf() leaves count == size, which is
                 * exactly what seq_has_overflowed() tests for. Check before
                 * bumping count: otherwise the increment below would push
                 * count past size and the overflow would go unnoticed.
                 */
                if (seq_has_overflowed(seq))
                        return -EAGAIN;

                seq->count++; /* mappings are separated by \0 */
                /* The separator itself may have used up the last byte. */
                if (seq_has_overflowed(seq))
                        return -EAGAIN;

                nr_mappings++;
        }

        return nr_mappings;
}



































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2023 Arm Ltd.
 */

#ifndef _ASM_ARM64_POR_H
#define _ASM_ARM64_POR_H

#include <asm/sysreg.h>

/*
 * Initial POR_EL0 value: POR_ELx_PERM_PREP() applied to index 0 with
 * POE_RWX.
 * NOTE(review): presumably this grants read/write/execute to pkey 0 and
 * nothing to any other pkey - confirm against POR_ELx_PERM_PREP().
 */
#define POR_EL0_INIT        POR_ELx_PERM_PREP(0, POE_RWX)

/*
 * Extract the permission field for @pkey from the overlay register value
 * @por and report whether its POE_R bit is set.
 */
static inline bool por_elx_allows_read(u64 por, u8 pkey)
{
        const u8 overlay = POR_ELx_PERM_GET(pkey, por);

        return (overlay & POE_R) != 0;
}

/*
 * Extract the permission field for @pkey from the overlay register value
 * @por and report whether its POE_W bit is set.
 */
static inline bool por_elx_allows_write(u64 por, u8 pkey)
{
        const u8 overlay = POR_ELx_PERM_GET(pkey, por);

        return (overlay & POE_W) != 0;
}

/*
 * Extract the permission field for @pkey from the overlay register value
 * @por and report whether its POE_X bit is set.
 */
static inline bool por_elx_allows_exec(u64 por, u8 pkey)
{
        const u8 overlay = POR_ELx_PERM_GET(pkey, por);

        return (overlay & POE_X) != 0;
}

#endif /* _ASM_ARM64_POR_H */

























































































































































































































   33 



   31 
   33 
   33 
   33 



































































































   31 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   33 





   32 

   33 




















   33 






















   32 


































   33 

   33 







































   33 










   33 






   32 










   32 















   33 
   33 





















    7 











    7 








    7 











   33 











   31 





















   33 























   33 
    7 













   32 
   33 





   25 



   26 


   25 









   25 







































































































































































































































































   33 






   31 
















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
// SPDX-License-Identifier: GPL-2.0-or-later
/* audit.c -- Auditing support
 * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
 * System-call specific features have moved to auditsc.c
 *
 * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
 * All Rights Reserved.
 *
 * Written by Rickard E. (Rik) Faith <faith@redhat.com>
 *
 * Goals: 1) Integrate fully with Security Modules.
 *          2) Minimal run-time overhead:
 *             a) Minimal when syscall auditing is disabled (audit_enable=0).
 *             b) Small when syscall auditing is enabled and no audit record
 *                is generated (defer as much work as possible to record
 *                generation time):
 *                i) context is allocated,
 *                ii) names from getname are stored without a copy, and
 *                iii) inode information stored from path_lookup.
 *          3) Ability to disable syscall auditing at boot time (audit=0).
 *          4) Usable by other parts of the kernel (if audit_log* is called,
 *             then a syscall record will be generated automatically for the
 *             current syscall).
 *          5) Netlink interface to user-space.
 *          6) Support low-overhead kernel-based filtering to minimize the
 *             information that must be passed to user-space.
 *
 * Audit userspace, documentation, tests, and bug/issue trackers:
 *         https://github.com/linux-audit
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/file.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/pid.h>

#include <linux/audit.h>

#include <net/sock.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
#include <linux/security.h>
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <net/netns/generic.h>

#include "audit.h"

/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
 * (Initialization happens after skb_init is called.) */
#define AUDIT_DISABLED                -1
#define AUDIT_UNINITIALIZED        0
#define AUDIT_INITIALIZED        1
static int        audit_initialized = AUDIT_UNINITIALIZED;

/* Current audit state; updated through audit_set_enabled(). */
u32                audit_enabled = AUDIT_OFF;
/* Sticky flag: audit_set_enabled() ORs in !!state on every successful change,
 * so once a non-zero state has been applied it is never cleared there. */
bool                audit_ever_enabled = !!AUDIT_OFF;

EXPORT_SYMBOL_GPL(audit_enabled);

/* Default state when kernel boots without any parameters.  Also consulted by
 * kauditd_hold_skb(): when zero, unsendable records are dropped rather than
 * queued. */
static u32        audit_default = AUDIT_OFF;

/* If auditing cannot proceed, audit_failure selects what happens
 * (see audit_panic()). */
static u32        audit_failure = AUDIT_FAIL_PRINTK;

/* private audit network namespace index, used with net_generic() in
 * audit_get_sk() */
static unsigned int audit_net_id;

/**
 * struct audit_net - audit private network namespace data
 * @sk: communication socket
 *
 * Description:
 * One instance per network namespace; audit_get_sk() looks it up with
 * net_generic(net, audit_net_id) and returns @sk.
 */
struct audit_net {
        struct sock *sk;
};

/**
 * struct auditd_connection - kernel/auditd connection state
 * @pid: auditd PID (reference taken in auditd_set(), dropped in
 *       auditd_conn_free())
 * @portid: netlink portid
 * @net: the associated network namespace (reference taken in auditd_set(),
 *       dropped in auditd_conn_free())
 * @rcu: RCU head, used to defer freeing via call_rcu()
 *
 * Description:
 * This struct is RCU protected; you must either hold the RCU lock for reading
 * or the associated spinlock for writing.
 */
struct auditd_connection {
        struct pid *pid;
        u32 portid;
        struct net *net;
        struct rcu_head rcu;
};
/* The current auditd connection, NULL when no daemon is registered.  Replaced
 * in auditd_set() and cleared in auditd_reset(), both under auditd_conn_lock. */
static struct auditd_connection __rcu *auditd_conn;
/* Serializes writers of auditd_conn. */
static DEFINE_SPINLOCK(auditd_conn_lock);

/* If audit_rate_limit is non-zero, limit the rate of sending audit records
 * to that number per second.  This prevents DoS attacks, but results in
 * audit records being dropped.  Zero disables the check entirely (see
 * audit_rate_check()). */
static u32        audit_rate_limit;

/* Number of outstanding audit_buffers allowed.
 * When set to zero, this means unlimited.  The same limit bounds the retry
 * and hold queues in kauditd_retry_skb() and kauditd_hold_skb(). */
static u32        audit_backlog_limit = 64;
/* Default backlog wait time: 60 seconds expressed in jiffies. */
#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
static u32        audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;

/* The identity of the user shutting down the audit system. */
static kuid_t                audit_sig_uid = INVALID_UID;
static pid_t                audit_sig_pid = -1;
static struct lsm_prop        audit_sig_lsm;

/* Records can be lost in several ways:
   0) [suppressed in audit_alloc]
   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
   2) out of memory in audit_log_move [alloc_skb]
   3) suppressed due to audit_rate_limit
   4) suppressed due to audit_backlog_limit
   The counter is incremented by every call to audit_log_lost().
*/
static atomic_t        audit_lost = ATOMIC_INIT(0);

/* Monotonically increasing sum of time the kernel has spent
 * waiting while the backlog limit is exceeded.
 */
static atomic_t audit_backlog_wait_time_actual = ATOMIC_INIT(0);

/* Hash for inode-based rules */
struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];

/* Slab cache; NOTE(review): presumably backs struct audit_buffer allocations,
 * the users are outside this section - confirm. */
static struct kmem_cache *audit_buffer_cache;

/* queue msgs to send via kauditd_task */
static struct sk_buff_head audit_queue;
/* queue msgs due to temporary unicast send problems */
static struct sk_buff_head audit_retry_queue;
/* queue msgs waiting for new auditd connection */
static struct sk_buff_head audit_hold_queue;

/* queue servicing thread */
static struct task_struct *kauditd_task;
/* kauditd_thread() sleeps here until audit_queue is non-empty */
static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);

/* waitqueue for callers who are blocked on the audit backlog; woken by
 * kauditd_thread() after each pass over the queues */
static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);

/* Audit feature state.  Starts with mask set to -1 and with no bits set in
 * either features or lock. */
static struct audit_features af = {.vers = AUDIT_FEATURE_VERSION,
                                   .mask = -1,
                                   .features = 0,
                                   .lock = 0,};

/* Human-readable names of the two audit features.
 * NOTE(review): presumably indexed by feature number - confirm against the
 * AUDIT_FEATURE_* definitions. */
static char *audit_feature_names[2] = {
        "only_unset_loginuid",
        "loginuid_immutable",
};

/**
 * struct audit_ctl_mutex - serialize requests from userspace
 * @lock: the mutex used for locking
 * @owner: the task which owns the lock; set to current by audit_ctl_lock()
 *         after the mutex is taken and reset to NULL by audit_ctl_unlock()
 *         before it is released
 *
 * Description:
 * This is the lock struct used to ensure we only process userspace requests
 * in an orderly fashion.  We can't simply use a mutex/lock here because we
 * need to track lock ownership so we don't end up blocking the lock owner in
 * audit_log_start() or similar.
 */
static struct audit_ctl_mutex {
        struct mutex lock;
        void *owner;
} audit_cmd_mutex;

/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
 * audit records.  Since printk uses a 1024 byte buffer, this buffer
 * should be at least that large. */
#define AUDIT_BUFSIZ 1024

/* The audit_buffer is used when formatting an audit record.  The caller
 * locks briefly to get the record off the freelist or to allocate the
 * buffer, and locks briefly to send the buffer to the netlink layer or
 * to place it on a transmit queue.  Multiple audit_buffers can be in
 * use simultaneously. */
struct audit_buffer {
        struct sk_buff       *skb;        /* formatted skb ready to send */
        struct audit_context *ctx;        /* NULL or associated context */
        gfp_t                     gfp_mask;        /* allocation flags; NOTE(review): presumably those given to audit_log_start() - confirm */
};

/**
 * struct audit_reply - a pending netlink reply to a userspace request
 * @portid: netlink portid of the requester, copied from the request skb
 * @net: network namespace of the requesting socket; a reference is taken in
 *       audit_send_reply() and dropped in audit_free_reply()
 * @skb: the reply message built by audit_make_reply()
 *
 * Description:
 * Allocated by audit_send_reply() and handed to audit_send_reply_thread(),
 * which sends @skb and then frees the struct.
 */
struct audit_reply {
        __u32 portid;
        struct net *net;
        struct sk_buff *skb;
};

/**
 * auditd_test_task - Check to see if a given task is an audit daemon
 * @task: the task to check
 *
 * Description:
 * Return 1 if the task is a registered audit daemon, 0 otherwise.
 */
int auditd_test_task(struct task_struct *task)
{
        int rc;
        struct auditd_connection *ac;

        rcu_read_lock();
        ac = rcu_dereference(auditd_conn);
        rc = (ac && ac->pid == task_tgid(task) ? 1 : 0);
        rcu_read_unlock();

        return rc;
}

/**
 * audit_ctl_lock - Take the audit control lock
 */
void audit_ctl_lock(void)
{
        mutex_lock(&audit_cmd_mutex.lock);
        audit_cmd_mutex.owner = current;
}

/**
 * audit_ctl_unlock - Drop the audit control lock
 */
void audit_ctl_unlock(void)
{
        audit_cmd_mutex.owner = NULL;
        mutex_unlock(&audit_cmd_mutex.lock);
}

/**
 * audit_ctl_owner_current - Test to see if the current task owns the lock
 *
 * Description:
 * Returns true when the owner recorded by audit_ctl_lock() is the current
 * task, false otherwise (including when nobody holds the lock).
 */
static bool audit_ctl_owner_current(void)
{
        return audit_cmd_mutex.owner == current;
}

/**
 * auditd_pid_vnr - Return the auditd PID relative to the namespace
 *
 * Description:
 * Looks up the current auditd connection under the RCU read lock and
 * translates its PID with pid_vnr().  Returns 0 if there is no connection or
 * the connection has no PID.
 */
static pid_t auditd_pid_vnr(void)
{
        const struct auditd_connection *conn;
        pid_t vnr = 0;

        rcu_read_lock();
        conn = rcu_dereference(auditd_conn);
        if (conn && conn->pid)
                vnr = pid_vnr(conn->pid);
        rcu_read_unlock();

        return vnr;
}

/**
 * audit_get_sk - Return the audit socket for the given network namespace
 * @net: the destination network namespace
 *
 * Description:
 * Returns the sock pointer if valid, NULL otherwise.  The caller must ensure
 * that a reference is held for the network namespace while the sock is in use.
 */
static struct sock *audit_get_sk(const struct net *net)
{
        struct audit_net *aunet;

        if (!net)
                return NULL;

        aunet = net_generic(net, audit_net_id);
        return aunet->sk;
}

void audit_panic(const char *message)
{
        switch (audit_failure) {
        case AUDIT_FAIL_SILENT:
                break;
        case AUDIT_FAIL_PRINTK:
                if (printk_ratelimit())
                        pr_err("%s\n", message);
                break;
        case AUDIT_FAIL_PANIC:
                panic("audit: %s\n", message);
                break;
        }
}

/*
 * audit_rate_check - decide whether another record may be sent right now
 *
 * Returns 1 when rate limiting is off (audit_rate_limit == 0) or the message
 * counter is still below the limit.  Once the limit is reached a record is
 * only allowed if more than HZ jiffies have passed since the counter was last
 * reset, in which case the counter restarts; otherwise 0 is returned.
 */
static inline int audit_rate_check(void)
{
        static unsigned long        window_start = 0;
        static int                sent             = 0;
        static DEFINE_SPINLOCK(lock);
        unsigned long                irqflags;
        unsigned long                now;
        int                        allowed             = 1;

        if (!audit_rate_limit)
                return 1;

        spin_lock_irqsave(&lock, irqflags);
        if (++sent >= audit_rate_limit) {
                now = jiffies;
                if (time_after(now, window_start + HZ)) {
                        /* start counting afresh from this moment */
                        window_start = now;
                        sent = 0;
                } else {
                        allowed = 0;
                }
        }
        spin_unlock_irqrestore(&lock, irqflags);

        return allowed;
}

/**
 * audit_log_lost - conditionally log lost audit message event
 * @message: the message stating reason for lost audit message
 *
 * Emit at least 1 message per second, even if audit_rate_check is
 * throttling.
 * Always increment the lost messages counter.
*/
void audit_log_lost(const char *message)
{
        static unsigned long        last_msg = 0;
        static DEFINE_SPINLOCK(lock);
        unsigned long                flags;
        unsigned long                now;
        int                        print;

        atomic_inc(&audit_lost);

        print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);

        if (!print) {
                spin_lock_irqsave(&lock, flags);
                now = jiffies;
                if (time_after(now, last_msg + HZ)) {
                        print = 1;
                        last_msg = now;
                }
                spin_unlock_irqrestore(&lock, flags);
        }

        if (print) {
                if (printk_ratelimit())
                        pr_warn("audit_lost=%u audit_rate_limit=%u audit_backlog_limit=%u\n",
                                atomic_read(&audit_lost),
                                audit_rate_limit,
                                audit_backlog_limit);
                audit_panic(message);
        }
}

/*
 * audit_log_config_change - emit an AUDIT_CONFIG_CHANGE record
 * @function_name: name of the setting being changed
 * @new: requested value
 * @old: current value
 * @allow_changes: whether the caller intends to permit the change
 *
 * Returns 0 if the record was written or no audit buffer could be obtained,
 * otherwise the error from audit_log_task_context().  When that call fails
 * the record is written with res=0 regardless of @allow_changes.
 */
static int audit_log_config_change(char *function_name, u32 new, u32 old,
                                   int allow_changes)
{
        struct audit_buffer *ab;
        int err;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (unlikely(!ab))
                return 0;

        audit_log_format(ab, "op=set %s=%u old=%u ", function_name, new, old);
        audit_log_session_info(ab);

        err = audit_log_task_context(ab);
        /* something weird happened getting the context: report a denial */
        audit_log_format(ab, " res=%d", err ? 0 : allow_changes);
        audit_log_end(ab);

        return err;
}

/*
 * audit_do_config_change - apply a change to one audit tunable
 * @function_name: name of the setting, used in the audit record
 * @to_change: the variable to update
 * @new: the value to store
 *
 * The change is refused when auditing is locked.  Unless auditing is off, a
 * config-change record is logged first and a logging failure also vetoes the
 * change.  Returns 0 on success, the logging error if there was one, or
 * -EPERM when the change was refused for any other reason.
 */
static int audit_do_config_change(char *function_name, u32 *to_change, u32 new)
{
        const u32 old = *to_change;
        int permitted = (audit_enabled != AUDIT_LOCKED);
        int rc = 0;

        if (audit_enabled != AUDIT_OFF) {
                rc = audit_log_config_change(function_name, new, old, permitted);
                if (rc)
                        permitted = 0;
        }

        /* refused: keep the logging error if any, else report -EPERM */
        if (!permitted)
                return rc ? rc : -EPERM;

        *to_change = new;
        return rc;
}

/* Set audit_rate_limit to @limit via audit_do_config_change(), which logs the
 * change and may refuse it; returns that function's result. */
static int audit_set_rate_limit(u32 limit)
{
        return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
}

/* Set audit_backlog_limit to @limit via audit_do_config_change(), which logs
 * the change and may refuse it; returns that function's result. */
static int audit_set_backlog_limit(u32 limit)
{
        return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}

/* Set audit_backlog_wait_time to @timeout via audit_do_config_change(), which
 * logs the change and may refuse it; returns that function's result. */
static int audit_set_backlog_wait_time(u32 timeout)
{
        return audit_do_config_change("audit_backlog_wait_time",
                                      &audit_backlog_wait_time, timeout);
}

/*
 * audit_set_enabled - change the global audit state
 * @state: requested state, must not exceed AUDIT_LOCKED
 *
 * Returns -EINVAL for an out-of-range @state, otherwise the result of
 * audit_do_config_change().  On success audit_ever_enabled is latched if
 * @state is non-zero.
 */
static int audit_set_enabled(u32 state)
{
        int err;

        if (state > AUDIT_LOCKED)
                return -EINVAL;

        err = audit_do_config_change("audit_enabled", &audit_enabled, state);
        if (err)
                return err;

        audit_ever_enabled |= !!state;
        return 0;
}

/*
 * audit_set_failure - change the audit failure policy
 * @state: one of AUDIT_FAIL_SILENT, AUDIT_FAIL_PRINTK or AUDIT_FAIL_PANIC
 *
 * Returns -EINVAL for any other value, otherwise the result of
 * audit_do_config_change().
 */
static int audit_set_failure(u32 state)
{
        switch (state) {
        case AUDIT_FAIL_SILENT:
        case AUDIT_FAIL_PRINTK:
        case AUDIT_FAIL_PANIC:
                break;
        default:
                return -EINVAL;
        }

        return audit_do_config_change("audit_failure", &audit_failure, state);
}

/**
 * auditd_conn_free - RCU helper to release an auditd connection struct
 * @rcu: RCU head embedded in the connection being released
 *
 * Description:
 * Recovers the enclosing auditd_connection, drops its PID and network
 * namespace references, then frees the memory.
 */
static void auditd_conn_free(struct rcu_head *rcu)
{
        struct auditd_connection *conn =
                container_of(rcu, struct auditd_connection, rcu);

        put_pid(conn->pid);
        put_net(conn->net);
        kfree(conn);
}

/**
 * auditd_set - Set/Reset the auditd connection state
 * @pid: auditd PID
 * @portid: auditd netlink portid
 * @net: auditd network namespace pointer
 * @skb: the netlink command from the audit daemon
 * @ack: netlink ack flag, cleared if ack'd here
 *
 * Description:
 * This function will obtain and drop network namespace references as
 * necessary.  Returns zero on success, negative values on failure: -EINVAL
 * if @pid or @net is NULL, -ENOMEM if the new connection struct cannot be
 * allocated.
 */
static int auditd_set(struct pid *pid, u32 portid, struct net *net,
                      struct sk_buff *skb, bool *ack)
{
        unsigned long flags;
        struct auditd_connection *ac_old, *ac_new;
        struct nlmsghdr *nlh;

        if (!pid || !net)
                return -EINVAL;

        ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL);
        if (!ac_new)
                return -ENOMEM;
        /* the connection holds its own references; they are released in
         * auditd_conn_free() */
        ac_new->pid = get_pid(pid);
        ac_new->portid = portid;
        ac_new->net = get_net(net);

        /* send the ack now to avoid a race with the queue backlog */
        if (*ack) {
                nlh = nlmsg_hdr(skb);
                netlink_ack(skb, nlh, 0, NULL);
                *ack = false;
        }

        /* swap in the new connection under the writer lock */
        spin_lock_irqsave(&auditd_conn_lock, flags);
        ac_old = rcu_dereference_protected(auditd_conn,
                                           lockdep_is_held(&auditd_conn_lock));
        rcu_assign_pointer(auditd_conn, ac_new);
        spin_unlock_irqrestore(&auditd_conn_lock, flags);

        /* free the previous connection only after current RCU readers finish */
        if (ac_old)
                call_rcu(&ac_old->rcu, auditd_conn_free);

        return 0;
}

/**
 * kauditd_printk_skb - Print the audit record to the ring buffer
 * @skb: audit record
 *
 * Whatever the reason, this packet may not make it to the auditd connection
 * so write it via printk so the information isn't completely lost.  Records
 * of type AUDIT_EOE are skipped and the output is rate limited.
 */
static void kauditd_printk_skb(struct sk_buff *skb)
{
        struct nlmsghdr *hdr = nlmsg_hdr(skb);
        char *payload = nlmsg_data(hdr);

        if (hdr->nlmsg_type == AUDIT_EOE)
                return;
        if (!printk_ratelimit())
                return;

        pr_notice("type=%d %s\n", hdr->nlmsg_type, payload);
}

/**
 * kauditd_rehold_skb - Handle an audit record send failure in the hold queue
 * @skb: audit record
 * @error: error code (unused)
 *
 * Description:
 * This should only be used by the kauditd_thread when it fails to flush the
 * hold queue.  The record is appended to the tail of the hold queue without
 * checking the backlog limit.
 */
static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
        /* put the record back in the queue */
        skb_queue_tail(&audit_hold_queue, skb);
}

/**
 * kauditd_hold_skb - Queue an audit record, waiting for auditd
 * @skb: audit record
 * @error: error code
 *
 * Description:
 * Called when we have not yet given up on sending the record but things are
 * not looking good.  The record is first written via printk.  If auditing is
 * off by default it is then dropped silently.  Otherwise it is parked: on the
 * retry queue for -EAGAIN failures, on the hold queue for everything else.
 * If the chosen queue is already at the backlog limit, a lost record is
 * reported and the record is dropped.
 */
static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
        struct sk_buff_head *target;
        const char *overflow_msg;

        /* at this point it is uncertain if we will ever send this to auditd so
         * try to send the message via printk before we go any further */
        kauditd_printk_skb(skb);

        /* can we just silently drop the message? */
        if (!audit_default) {
                kfree_skb(skb);
                return;
        }

        /* the hold queue is only for when the daemon goes away completely;
         * -EAGAIN failures go back onto the retry queue instead */
        if (error == -EAGAIN) {
                target = &audit_retry_queue;
                overflow_msg = "kauditd retry queue overflow";
        } else {
                target = &audit_hold_queue;
                overflow_msg = "kauditd hold queue overflow";
        }

        /* queue the record if the backlog is unlimited or there is room */
        if (!audit_backlog_limit ||
            skb_queue_len(target) < audit_backlog_limit) {
                skb_queue_tail(target, skb);
                return;
        }

        /* we have no other options - drop the message */
        audit_log_lost(overflow_msg);
        kfree_skb(skb);
}

/**
 * kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
 * @skb: audit record
 * @error: error code (unused)
 *
 * Description:
 * Not as serious as kauditd_hold_skb() as we still have a connected auditd,
 * but for some reason we are having problems sending it audit records so
 * queue the given record and attempt to resend.  If the retry queue has
 * reached the backlog limit the record is printed, counted as lost and freed.
 */
static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
        const bool full = audit_backlog_limit &&
                          skb_queue_len(&audit_retry_queue) >= audit_backlog_limit;

        if (full) {
                /* we have to drop the record, send it via printk as a last
                 * effort */
                kauditd_printk_skb(skb);
                audit_log_lost("kauditd retry queue overflow");
                kfree_skb(skb);
                return;
        }

        skb_queue_tail(&audit_retry_queue, skb);
}

/**
 * auditd_reset - Disconnect the auditd connection
 * @ac: auditd connection state
 *
 * Description:
 * Break the auditd/kauditd connection and move all the queued records into the
 * hold queue in case auditd reconnects.  It is important to note that the @ac
 * pointer should never be dereferenced inside this function as it may be NULL
 * or invalid, you can only compare the memory address!  If @ac is NULL then
 * the connection will always be reset.  If @ac is non-NULL and no longer
 * matches the registered connection, nothing is changed.
 */
static void auditd_reset(const struct auditd_connection *ac)
{
        unsigned long flags;
        struct sk_buff *skb;
        struct auditd_connection *ac_old;

        /* if it isn't already broken, break the connection */
        spin_lock_irqsave(&auditd_conn_lock, flags);
        ac_old = rcu_dereference_protected(auditd_conn,
                                           lockdep_is_held(&auditd_conn_lock));
        /* address comparison only - @ac itself is never dereferenced */
        if (ac && ac != ac_old) {
                /* someone already registered a new auditd connection */
                spin_unlock_irqrestore(&auditd_conn_lock, flags);
                return;
        }
        rcu_assign_pointer(auditd_conn, NULL);
        spin_unlock_irqrestore(&auditd_conn_lock, flags);

        /* free the old connection only after current RCU readers finish */
        if (ac_old)
                call_rcu(&ac_old->rcu, auditd_conn_free);

        /* flush the retry queue to the hold queue, but don't touch the main
         * queue since we need to process that normally for multicast */
        while ((skb = skb_dequeue(&audit_retry_queue)))
                kauditd_hold_skb(skb, -ECONNREFUSED);
}

/**
 * auditd_send_unicast_skb - Send a record via unicast to auditd
 * @skb: audit record
 *
 * Description:
 * Send a skb to the audit daemon, returns positive/zero values on success and
 * negative values on failure; in all cases the skb will be consumed by this
 * function.  If the send results in -ECONNREFUSED the connection with auditd
 * will be reset.  This function may sleep so callers should not hold any locks
 * where this would cause a problem.  When no auditd connection is registered
 * the skb is freed and -ECONNREFUSED is returned.
 */
static int auditd_send_unicast_skb(struct sk_buff *skb)
{
        int rc;
        u32 portid;
        struct net *net;
        struct sock *sk;
        struct auditd_connection *ac;

        /* NOTE: we can't call netlink_unicast while in the RCU section so
         *       take a reference to the network namespace and grab local
         *       copies of the namespace, the sock, and the portid; the
         *       namespace and sock aren't going to go away while we hold a
         *       reference and if the portid does become invalid after the RCU
         *       section netlink_unicast() should safely return an error */

        rcu_read_lock();
        ac = rcu_dereference(auditd_conn);
        if (!ac) {
                /* no daemon registered: free the record ourselves */
                rcu_read_unlock();
                kfree_skb(skb);
                rc = -ECONNREFUSED;
                goto err;
        }
        net = get_net(ac->net);
        sk = audit_get_sk(net);
        portid = ac->portid;
        rcu_read_unlock();

        rc = netlink_unicast(sk, skb, portid, 0);
        put_net(net);
        if (rc < 0)
                goto err;

        return rc;

err:
        /* @ac is outside the RCU section here; it is only passed on for the
         * address comparison done in auditd_reset() */
        if (ac && rc == -ECONNREFUSED)
                auditd_reset(ac);
        return rc;
}

/**
 * kauditd_send_queue - Helper for kauditd_thread to flush skb queues
 * @sk: the sending sock
 * @portid: the netlink destination
 * @queue: the skb queue to process
 * @retry_limit: limit on number of netlink unicast failures
 * @skb_hook: per-skb hook for additional processing
 * @err_hook: hook called if the skb fails the netlink unicast send
 *
 * Description:
 * Run through the given queue and attempt to send the audit records to auditd,
 * returns zero on success, negative values on failure.  It is up to the caller
 * to ensure that the @sk is valid for the duration of this function.
 *
 * Only the records that were on @queue when the function was entered are
 * processed.  After a send has failed for good, the remaining records are no
 * longer sent but handed straight to @err_hook with -ECONNREFUSED.  A final
 * failure with -EAGAIN is not reported as an error to the caller.
 */
static int kauditd_send_queue(struct sock *sk, u32 portid,
                              struct sk_buff_head *queue,
                              unsigned int retry_limit,
                              void (*skb_hook)(struct sk_buff *skb),
                              void (*err_hook)(struct sk_buff *skb, int error))
{
        int rc = 0;
        struct sk_buff *skb = NULL;
        struct sk_buff *skb_tail;
        unsigned int failed = 0;

        /* NOTE: kauditd_thread takes care of all our locking, we just use
         *       the netlink info passed to us (e.g. sk and portid) */

        /* remember the tail at entry so the loop stops after that record,
         * even if an err_hook puts records back onto this same queue */
        skb_tail = skb_peek_tail(queue);
        while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
                /* call the skb_hook for each skb we touch */
                if (skb_hook)
                        (*skb_hook)(skb);

                /* can we send to anyone via unicast? */
                if (!sk) {
                        if (err_hook)
                                (*err_hook)(skb, -ECONNREFUSED);
                        continue;
                }

retry:
                /* grab an extra skb reference in case of error */
                skb_get(skb);
                rc = netlink_unicast(sk, skb, portid, 0);
                if (rc < 0) {
                        /* send failed - try a few times unless fatal error */
                        if (++failed >= retry_limit ||
                            rc == -ECONNREFUSED || rc == -EPERM) {
                                /* stop sending; later records take the
                                 * !sk path above */
                                sk = NULL;
                                if (err_hook)
                                        (*err_hook)(skb, rc);
                                if (rc == -EAGAIN)
                                        rc = 0;
                                /* continue to drain the queue */
                                continue;
                        } else
                                goto retry;
                } else {
                        /* skb sent - drop the extra reference and continue */
                        consume_skb(skb);
                        failed = 0;
                }
        }

        /* positive send results are collapsed to zero */
        return (rc >= 0 ? 0 : rc);
}

/*
 * kauditd_send_multicast_skb - Send a record to any multicast listeners
 * @skb: audit record
 *
 * Description:
 * Write a multicast message to anyone listening in the initial network
 * namespace.  This function doesn't consume an skb as might be expected since
 * it has to copy it anyways.
 */
static void kauditd_send_multicast_skb(struct sk_buff *skb)
{
        struct sk_buff *copy;
        struct sock *sock = audit_get_sk(&init_net);
        struct nlmsghdr *nlh;

        /* NOTE: we are not taking an additional reference for init_net since
         *       we don't have to worry about it going away */

        if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
                return;

        /*
         * The seemingly wasteful skb_copy() rather than bumping the refcount
         * using skb_get() is necessary because non-standard mods are made to
         * the skb by the original kaudit unicast socket send routine.  The
         * existing auditd daemon assumes this breakage.  Fixing this would
         * require co-ordinating a change in the established protocol between
         * the kaudit kernel subsystem and the auditd userspace code.  There is
         * no reason for new multicast clients to continue with this
         * non-compliance.
         */
        copy = skb_copy(skb, GFP_KERNEL);
        if (!copy)
                return;
        nlh = nlmsg_hdr(copy);
        nlh->nlmsg_len = skb->len;

        nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
}

/**
 * kauditd_thread - Worker thread to send audit records to userspace
 * @dummy: unused
 *
 * Description:
 * Loops until the kthread is asked to stop.  Each pass, if an auditd
 * connection exists, tries to flush the hold queue and then the retry queue
 * to it; it then always processes the main queue (multicast plus unicast
 * attempt), wakes anyone waiting on the backlog and sleeps until the main
 * queue has records again.  Always returns 0.
 */
static int kauditd_thread(void *dummy)
{
        int rc;
        u32 portid = 0;
        struct net *net = NULL;
        struct sock *sk = NULL;
        struct auditd_connection *ac;

#define UNICAST_RETRIES 5

        set_freezable();
        while (!kthread_should_stop()) {
                /* NOTE: see the lock comments in auditd_send_unicast_skb() */
                rcu_read_lock();
                ac = rcu_dereference(auditd_conn);
                if (!ac) {
                        /* no daemon: sk is still NULL, go straight to the
                         * main queue */
                        rcu_read_unlock();
                        goto main_queue;
                }
                net = get_net(ac->net);
                sk = audit_get_sk(net);
                portid = ac->portid;
                rcu_read_unlock();

                /* attempt to flush the hold queue */
                rc = kauditd_send_queue(sk, portid,
                                        &audit_hold_queue, UNICAST_RETRIES,
                                        NULL, kauditd_rehold_skb);
                if (rc < 0) {
                        sk = NULL;
                        auditd_reset(ac);
                        goto main_queue;
                }

                /* attempt to flush the retry queue */
                rc = kauditd_send_queue(sk, portid,
                                        &audit_retry_queue, UNICAST_RETRIES,
                                        NULL, kauditd_hold_skb);
                if (rc < 0) {
                        sk = NULL;
                        auditd_reset(ac);
                        goto main_queue;
                }

main_queue:
                /* process the main queue - do the multicast send and attempt
                 * unicast, dump failed record sends to the retry queue; if
                 * sk == NULL due to previous failures we will just do the
                 * multicast send and move the record to the hold queue */
                rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
                                        kauditd_send_multicast_skb,
                                        (sk ?
                                         kauditd_retry_skb : kauditd_hold_skb));
                /* ac is only handed on for the address comparison done in
                 * auditd_reset() */
                if (ac && rc < 0)
                        auditd_reset(ac);
                sk = NULL;

                /* drop our netns reference, no auditd sends past this line */
                if (net) {
                        put_net(net);
                        net = NULL;
                }

                /* we have processed all the queues so wake everyone */
                wake_up(&audit_backlog_wait);

                /* NOTE: we want to wake up if there is anything on the queue,
                 *       regardless of if an auditd is connected, as we need to
                 *       do the multicast send and rotate records from the
                 *       main queue to the retry/hold queues */
                wait_event_freezable(kauditd_wait,
                                     (skb_queue_len(&audit_queue) ? 1 : 0));
        }

        return 0;
}

/**
 * audit_send_list_thread - kthread body that sends a queued list of replies
 * @_dest: the struct audit_netlink_list describing what to send and where
 *
 * Description:
 * Waits for the audit control lock to become available, then unicasts every
 * skb queued on the list to the recorded portid.  Send errors are ignored.
 * Afterwards the network namespace reference is dropped and the list struct
 * is freed.  Always returns 0.
 */
int audit_send_list_thread(void *_dest)
{
        struct audit_netlink_list *list = _dest;
        struct sock *sk = audit_get_sk(list->net);
        struct sk_buff *skb;

        /* wait for parent to finish and send an ACK */
        audit_ctl_lock();
        audit_ctl_unlock();

        for (;;) {
                skb = __skb_dequeue(&list->q);
                if (skb == NULL)
                        break;
                netlink_unicast(sk, skb, list->portid, 0);
        }

        put_net(list->net);
        kfree(list);

        return 0;
}

/**
 * audit_make_reply - build a netlink reply message
 * @seq: sequence number
 * @type: audit message type, used unless @done is set
 * @done: when non-zero the message type becomes NLMSG_DONE
 * @multi: when non-zero the NLM_F_MULTI flag is set
 * @payload: payload data
 * @size: payload size
 *
 * Description:
 * Allocates a skb, adds a netlink header and copies @size bytes of @payload
 * behind it.  Returns the skb, or NULL if allocation or header construction
 * fails.
 */
struct sk_buff *audit_make_reply(int seq, int type, int done,
                                 int multi, const void *payload, int size)
{
        const int        nl_flags = multi ? NLM_F_MULTI : 0;
        const int        msg_type = done ? NLMSG_DONE : type;
        struct nlmsghdr        *nlh;
        struct sk_buff        *skb;

        skb = nlmsg_new(size, GFP_KERNEL);
        if (!skb)
                return NULL;

        nlh = nlmsg_put(skb, 0, seq, msg_type, size, nl_flags);
        if (!nlh) {
                kfree_skb(skb);
                return NULL;
        }

        memcpy(nlmsg_data(nlh), payload, size);
        return skb;
}

/*
 * audit_free_reply - release a struct audit_reply
 * @reply: the reply to free, may be NULL
 *
 * Frees the attached skb, drops the network namespace reference if one is
 * held and frees the struct itself.
 */
static void audit_free_reply(struct audit_reply *reply)
{
        if (reply) {
                kfree_skb(reply->skb);
                if (reply->net)
                        put_net(reply->net);
                kfree(reply);
        }
}

/*
 * audit_send_reply_thread - kthread body that delivers one audit reply
 * @arg: the struct audit_reply to send
 *
 * Waits for the audit control lock to become available, unicasts the reply
 * skb to the requester and then frees the reply.  Always returns 0.
 */
static int audit_send_reply_thread(void *arg)
{
        struct audit_reply *rep = arg;
        struct sock *sk;

        audit_ctl_lock();
        audit_ctl_unlock();

        /* Ignore failure. It'll only happen if the sender goes away,
           because our timeout is set to infinite. */
        sk = audit_get_sk(rep->net);
        netlink_unicast(sk, rep->skb, rep->portid, 0);

        /* the skb has been handed to netlink; don't free it again below */
        rep->skb = NULL;
        audit_free_reply(rep);

        return 0;
}

/**
 * audit_send_reply - send an audit reply message via netlink
 * @request_skb: skb of request we are replying to (used to target the reply)
 * @seq: sequence number
 * @type: audit message type
 * @done: done (last) flag
 * @multi: multi-part message flag
 * @payload: payload data
 * @size: payload size
 *
 * Builds the reply message, records the requester's network namespace and
 * portid, and starts a kthread that performs the actual send.  If any step
 * fails the reply is released and nothing is sent; no error is reported to
 * the caller.
 */
static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
                             int multi, const void *payload, int size)
{
        struct audit_reply *reply;
        struct task_struct *worker;

        reply = kzalloc(sizeof(*reply), GFP_KERNEL);
        if (!reply)
                return;

        reply->skb = audit_make_reply(seq, type, done, multi, payload, size);
        if (reply->skb) {
                reply->net = get_net(sock_net(NETLINK_CB(request_skb).sk));
                reply->portid = NETLINK_CB(request_skb).portid;

                worker = kthread_run(audit_send_reply_thread, reply,
                                     "audit_send_reply");
                /* on success the thread owns and frees the reply */
                if (!IS_ERR(worker))
                        return;
        }

        audit_free_reply(reply);
}

/*
 * Check that the sender of an incoming audit control message holds the
 * CAP_AUDIT_ capability required for @msg_type.
 *
 * Returns 0 when the message may be processed, otherwise a negative errno.
 */
static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
{
        /*
         * Only the initial user namespace is supported for now.
         *
         * ECONNREFUSED is returned deliberately: it makes userspace believe
         * that audit is not configured into the kernel.  Many PAM stacks are
         * set up (following distro defaults) to refuse login when messages
         * cannot be sent to audit; with ECONNREFUSED PAM assumes audit is
         * absent and lets login proceed, whereas EPERM would make userspace
         * reject all logins.  This should be removed once non-init
         * namespaces are supported!!
         */
        if (current_user_ns() != &init_user_ns)
                return -ECONNREFUSED;

        switch (msg_type) {
        case AUDIT_LIST:
        case AUDIT_ADD:
        case AUDIT_DEL:
                return -EOPNOTSUPP;
        case AUDIT_GET:
        case AUDIT_SET:
        case AUDIT_GET_FEATURE:
        case AUDIT_SET_FEATURE:
        case AUDIT_LIST_RULES:
        case AUDIT_ADD_RULE:
        case AUDIT_DEL_RULE:
        case AUDIT_SIGNAL_INFO:
        case AUDIT_TTY_GET:
        case AUDIT_TTY_SET:
        case AUDIT_TRIM:
        case AUDIT_MAKE_EQUIV:
                /* auditd and auditctl are only supported in the initial
                 * pid namespace for now */
                if (task_active_pid_ns(current) != &init_pid_ns)
                        return -EPERM;

                return netlink_capable(skb, CAP_AUDIT_CONTROL) ? 0 : -EPERM;
        case AUDIT_USER:
        case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
        case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
                return netlink_capable(skb, CAP_AUDIT_WRITE) ? 0 : -EPERM;
        default:  /* bad msg */
                return -EINVAL;
        }
}

/*
 * Start an audit record of type @msg_type and emit the common prefix
 * (pid, uid, session info, task context) describing the current task.
 *
 * *@ab receives the new buffer, or NULL when nothing should be logged:
 * either auditing is disabled (AUDIT_USER_AVC is exempt from that check)
 * or audit_log_start() returned no buffer.
 */
static void audit_log_common_recv_msg(struct audit_context *context,
                                        struct audit_buffer **ab, u16 msg_type)
{
        struct audit_buffer *buf = NULL;

        if (audit_enabled || msg_type == AUDIT_USER_AVC)
                buf = audit_log_start(context, GFP_KERNEL, msg_type);

        *ab = buf;
        if (!buf)
                return;

        audit_log_format(buf, "pid=%d uid=%u ",
                         task_tgid_nr(current),
                         from_kuid(&init_user_ns, current_uid()));
        audit_log_session_info(buf);
        audit_log_task_context(buf);
}

/*
 * Convenience wrapper around audit_log_common_recv_msg() that passes a
 * NULL audit context.  *@ab is set by the callee and may end up NULL.
 */
static inline void audit_log_user_recv_msg(struct audit_buffer **ab,
                                           u16 msg_type)
{
        audit_log_common_recv_msg(NULL, ab, msg_type);
}

/*
 * Test whether feature number @i is set in the global feature state
 * (af.features).  The result is the masked bit itself, i.e. zero when the
 * feature is clear and non-zero (not normalized to 1) when it is set.
 */
static int is_audit_feature_set(int i)
{
        return af.features & AUDIT_FEATURE_TO_MASK(i);
}


/*
 * Answer an AUDIT_GET_FEATURE request by replying with a copy of the
 * global feature state (af), echoing the request's sequence number.
 * Always returns 0; a failure to send the reply is not reported.
 */
static int audit_get_feature(struct sk_buff *skb)
{
        u32 request_seq = nlmsg_hdr(skb)->nlmsg_seq;

        audit_send_reply(skb, request_seq, AUDIT_GET_FEATURE,
                         0, 0, &af, sizeof(af));
        return 0;
}

/*
 * Emit an AUDIT_FEATURE_CHANGE record for feature index @which.
 * The feature and lock arguments are masked bit values; they are logged
 * as 0/1.  @res is logged as given (callers pass 1 for an applied change
 * and 0 for a rejected one).  Nothing is logged while auditing is off.
 */
static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature,
                                     u32 old_lock, u32 new_lock, int res)
{
        struct audit_buffer *ab;

        if (audit_enabled == AUDIT_OFF)
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_FEATURE_CHANGE);
        if (ab) {
                audit_log_task_info(ab);
                audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
                                 audit_feature_names[which],
                                 !!old_feature, !!new_feature,
                                 !!old_lock, !!new_lock,
                                 res);
                audit_log_end(ab);
        }
}

/*
 * Apply a feature change request to the global feature state (af).
 *
 * Only features selected in @uaf->mask are considered.  The request is
 * validated as a whole first: if it would flip any feature that is already
 * locked, the attempt is logged and -EPERM is returned without changing
 * anything.  Otherwise every selected feature is set or cleared as asked and
 * any requested lock bits are added (locks are never removed here).
 *
 * Returns 0 on success or -EPERM as described above.
 */
static int audit_set_feature(struct audit_features *uaf)
{
        int idx;

        BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));

        /* if there is ever a version 2 we should handle that here */

        /* pass 1: reject the request if it touches a locked feature */
        for (idx = 0; idx <= AUDIT_LAST_FEATURE; idx++) {
                u32 bit = AUDIT_FEATURE_TO_MASK(idx);
                u32 was_on, want_on, was_locked;

                /* feature not part of this request */
                if (!(uaf->mask & bit))
                        continue;

                was_on = af.features & bit;
                want_on = uaf->features & bit;
                was_locked = af.lock & bit;

                if (!was_locked || want_on == was_on)
                        continue;

                audit_log_feature_change(idx, was_on, want_on, was_locked,
                                         (uaf->lock | af.lock) & bit, 0);
                return -EPERM;
        }

        /* pass 2: everything checked out, commit the changes */
        for (idx = 0; idx <= AUDIT_LAST_FEATURE; idx++) {
                u32 bit = AUDIT_FEATURE_TO_MASK(idx);
                u32 was_on, want_on, was_locked, lock_bits;

                /* feature not part of this request */
                if (!(uaf->mask & bit))
                        continue;

                was_on = af.features & bit;
                want_on = uaf->features & bit;
                was_locked = af.lock & bit;
                lock_bits = (uaf->lock | af.lock) & bit;

                if (want_on != was_on)
                        audit_log_feature_change(idx, was_on, want_on,
                                                 was_locked, lock_bits, 1);

                if (want_on)
                        af.features |= bit;
                else
                        af.features &= ~bit;
                af.lock |= lock_bits;
        }

        return 0;
}

/*
 * Send an AUDIT_REPLACE message carrying the virtual pid number of @pid
 * to the registered auditd via auditd_send_unicast_skb().
 *
 * Returns -ENOMEM when the message cannot be built, otherwise whatever
 * auditd_send_unicast_skb() returns.
 */
static int audit_replace(struct pid *pid)
{
        pid_t pvnr = pid_vnr(pid);
        struct sk_buff *skb;

        skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr));

        return skb ? auditd_send_unicast_skb(skb) : -ENOMEM;
}

/**
 * audit_receive_msg - process a single audit netlink control message
 * @skb: socket buffer the message arrived in
 * @nlh: netlink header of the message to process
 * @ack: handed to auditd_set() when a new auditd registers; the caller
 *       consults it afterwards to decide whether an ACK is still needed
 *
 * The sender is vetted with audit_netlink_ok() first, then the message is
 * dispatched on its type.  The common exit path returns a negative errno
 * or 0.  Note that two AUDIT_SET requests return early with a counter
 * value instead: a mask of exactly AUDIT_STATUS_LOST returns the previous
 * lost count and a mask of exactly AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL
 * returns the previous accumulated wait time.
 */
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                             bool *ack)
{
        u32                        seq;
        void                        *data;
        int                        data_len;
        int                        err;
        struct audit_buffer        *ab;
        u16                        msg_type = nlh->nlmsg_type;
        struct audit_sig_info   *sig_data;
        struct lsm_context        lsmctx = { NULL, 0, 0 };

        err = audit_netlink_ok(skb, msg_type);
        if (err)
                return err;

        seq  = nlh->nlmsg_seq;
        data = nlmsg_data(nlh);
        data_len = nlmsg_len(nlh);

        switch (msg_type) {
        case AUDIT_GET: {
                /* reply with a snapshot of the current audit status */
                struct audit_status        s;
                memset(&s, 0, sizeof(s));
                s.enabled                   = audit_enabled;
                s.failure                   = audit_failure;
                /* NOTE: use pid_vnr() so the PID is relative to the current
                 *       namespace */
                s.pid                           = auditd_pid_vnr();
                s.rate_limit                   = audit_rate_limit;
                s.backlog_limit                   = audit_backlog_limit;
                s.lost                           = atomic_read(&audit_lost);
                s.backlog                   = skb_queue_len(&audit_queue);
                s.feature_bitmap           = AUDIT_FEATURE_BITMAP_ALL;
                s.backlog_wait_time           = audit_backlog_wait_time;
                s.backlog_wait_time_actual = atomic_read(&audit_backlog_wait_time_actual);
                audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
                break;
        }
        case AUDIT_SET: {
                /* each bit in s.mask selects one setting to apply; settings
                 * applied before a failing one are not rolled back */
                struct audit_status        s;
                memset(&s, 0, sizeof(s));
                /* guard against past and future API changes */
                memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
                if (s.mask & AUDIT_STATUS_ENABLED) {
                        err = audit_set_enabled(s.enabled);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_FAILURE) {
                        err = audit_set_failure(s.failure);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_PID) {
                        /* NOTE: we are using the vnr PID functions below
                         *       because the s.pid value is relative to the
                         *       namespace of the caller; at present this
                         *       doesn't matter much since you can really only
                         *       run auditd from the initial pid namespace, but
                         *       something to keep in mind if this changes */
                        pid_t new_pid = s.pid;
                        pid_t auditd_pid;
                        struct pid *req_pid = task_tgid(current);

                        /* Sanity check - PID values must match. Setting
                         * pid to 0 is how auditd ends auditing. */
                        if (new_pid && (new_pid != pid_vnr(req_pid)))
                                return -EINVAL;

                        /* test the auditd connection */
                        audit_replace(req_pid);

                        auditd_pid = auditd_pid_vnr();
                        if (auditd_pid) {
                                /* replacing a healthy auditd is not allowed */
                                if (new_pid) {
                                        audit_log_config_change("audit_pid",
                                                        new_pid, auditd_pid, 0);
                                        return -EEXIST;
                                }
                                /* only current auditd can unregister itself */
                                if (pid_vnr(req_pid) != auditd_pid) {
                                        audit_log_config_change("audit_pid",
                                                        new_pid, auditd_pid, 0);
                                        return -EACCES;
                                }
                        }

                        if (new_pid) {
                                /* register a new auditd connection */
                                err = auditd_set(req_pid,
                                                 NETLINK_CB(skb).portid,
                                                 sock_net(NETLINK_CB(skb).sk),
                                                 skb, ack);
                                if (audit_enabled != AUDIT_OFF)
                                        audit_log_config_change("audit_pid",
                                                                new_pid,
                                                                auditd_pid,
                                                                err ? 0 : 1);
                                if (err)
                                        return err;

                                /* try to process any backlog */
                                wake_up_interruptible(&kauditd_wait);
                        } else {
                                if (audit_enabled != AUDIT_OFF)
                                        audit_log_config_change("audit_pid",
                                                                new_pid,
                                                                auditd_pid, 1);

                                /* unregister the auditd connection */
                                auditd_reset(NULL);
                        }
                }
                if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
                        err = audit_set_rate_limit(s.rate_limit);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_BACKLOG_LIMIT) {
                        err = audit_set_backlog_limit(s.backlog_limit);
                        if (err < 0)
                                return err;
                }
                if (s.mask & AUDIT_STATUS_BACKLOG_WAIT_TIME) {
                        /* NOTE(review): this compares against nlmsg_len,
                         * which is the full message length and not data_len
                         * - confirm this is the intended size check */
                        if (sizeof(s) > (size_t)nlh->nlmsg_len)
                                return -EINVAL;
                        if (s.backlog_wait_time > 10*AUDIT_BACKLOG_WAIT_TIME)
                                return -EINVAL;
                        err = audit_set_backlog_wait_time(s.backlog_wait_time);
                        if (err < 0)
                                return err;
                }
                /* the two resets below only trigger when their bit is the
                 * sole bit in the mask; they return the old counter value */
                if (s.mask == AUDIT_STATUS_LOST) {
                        u32 lost = atomic_xchg(&audit_lost, 0);

                        audit_log_config_change("lost", 0, lost, 1);
                        return lost;
                }
                if (s.mask == AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL) {
                        u32 actual = atomic_xchg(&audit_backlog_wait_time_actual, 0);

                        audit_log_config_change("backlog_wait_time_actual", 0, actual, 1);
                        return actual;
                }
                break;
        }
        case AUDIT_GET_FEATURE:
                err = audit_get_feature(skb);
                if (err)
                        return err;
                break;
        case AUDIT_SET_FEATURE:
                if (data_len < sizeof(struct audit_features))
                        return -EINVAL;
                err = audit_set_feature(data);
                if (err)
                        return err;
                break;
        case AUDIT_USER:
        case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
        case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
                /* user messages are dropped while auditing is disabled,
                 * except for AUDIT_USER_AVC */
                if (!audit_enabled && msg_type != AUDIT_USER_AVC)
                        return 0;
                /* exit early if there isn't at least one character to print */
                if (data_len < 2)
                        return -EINVAL;

                err = audit_filter(msg_type, AUDIT_FILTER_USER);
                if (err == 1) { /* match or error */
                        char *str = data;

                        err = 0;
                        if (msg_type == AUDIT_USER_TTY) {
                                err = tty_audit_push();
                                if (err)
                                        break;
                        }
                        audit_log_user_recv_msg(&ab, msg_type);
                        if (msg_type != AUDIT_USER_TTY) {
                                /* ensure NULL termination */
                                str[data_len - 1] = '\0';
                                audit_log_format(ab, " msg='%.*s'",
                                                 AUDIT_MESSAGE_TEXT_MAX,
                                                 str);
                        } else {
                                audit_log_format(ab, " data=");
                                /* do not log a trailing NUL as part of the data */
                                if (str[data_len - 1] == '\0')
                                        data_len--;
                                audit_log_n_untrustedstring(ab, str, data_len);
                        }
                        audit_log_end(ab);
                }
                break;
        case AUDIT_ADD_RULE:
        case AUDIT_DEL_RULE:
                if (data_len < sizeof(struct audit_rule_data))
                        return -EINVAL;
                /* rule changes are refused, and the attempt logged, while
                 * the configuration is locked */
                if (audit_enabled == AUDIT_LOCKED) {
                        audit_log_common_recv_msg(audit_context(), &ab,
                                                  AUDIT_CONFIG_CHANGE);
                        audit_log_format(ab, " op=%s audit_enabled=%d res=0",
                                         msg_type == AUDIT_ADD_RULE ?
                                                "add_rule" : "remove_rule",
                                         audit_enabled);
                        audit_log_end(ab);
                        return -EPERM;
                }
                err = audit_rule_change(msg_type, seq, data, data_len);
                break;
        case AUDIT_LIST_RULES:
                err = audit_list_rules_send(skb, seq);
                break;
        case AUDIT_TRIM:
                audit_trim_trees();
                audit_log_common_recv_msg(audit_context(), &ab,
                                          AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=trim res=1");
                audit_log_end(ab);
                break;
        case AUDIT_MAKE_EQUIV: {
                /* payload layout: two u32 sizes followed by the two strings */
                void *bufp = data;
                u32 sizes[2];
                size_t msglen = data_len;
                char *old, *new;

                err = -EINVAL;
                if (msglen < 2 * sizeof(u32))
                        break;
                memcpy(sizes, bufp, 2 * sizeof(u32));
                bufp += 2 * sizeof(u32);
                msglen -= 2 * sizeof(u32);
                old = audit_unpack_string(&bufp, &msglen, sizes[0]);
                if (IS_ERR(old)) {
                        err = PTR_ERR(old);
                        break;
                }
                new = audit_unpack_string(&bufp, &msglen, sizes[1]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        kfree(old);
                        break;
                }
                /* OK, here comes... */
                err = audit_tag_tree(old, new);

                audit_log_common_recv_msg(audit_context(), &ab,
                                          AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=make_equiv old=");
                audit_log_untrustedstring(ab, old);
                audit_log_format(ab, " new=");
                audit_log_untrustedstring(ab, new);
                audit_log_format(ab, " res=%d", !err);
                audit_log_end(ab);
                kfree(old);
                kfree(new);
                break;
        }
        case AUDIT_SIGNAL_INFO:
                /* lsmctx stays zero-length unless audit_sig_lsm is set */
                if (lsmprop_is_set(&audit_sig_lsm)) {
                        err = security_lsmprop_to_secctx(&audit_sig_lsm,
                                                         &lsmctx);
                        if (err < 0)
                                return err;
                }
                /* reply is sized to hold the context string after the
                 * fixed fields */
                sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len),
                                   GFP_KERNEL);
                if (!sig_data) {
                        if (lsmprop_is_set(&audit_sig_lsm))
                                security_release_secctx(&lsmctx);
                        return -ENOMEM;
                }
                sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
                sig_data->pid = audit_sig_pid;
                if (lsmprop_is_set(&audit_sig_lsm)) {
                        memcpy(sig_data->ctx, lsmctx.context, lsmctx.len);
                        security_release_secctx(&lsmctx);
                }
                audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
                                 sig_data, struct_size(sig_data, ctx,
                                                       lsmctx.len));
                kfree(sig_data);
                break;
        case AUDIT_TTY_GET: {
                struct audit_tty_status s;
                unsigned int t;

                t = READ_ONCE(current->signal->audit_tty);
                s.enabled = t & AUDIT_TTY_ENABLE;
                s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);

                audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
                break;
        }
        case AUDIT_TTY_SET: {
                struct audit_tty_status s, old;
                /* NOTE: this declaration shadows the function-level ab */
                struct audit_buffer        *ab;
                unsigned int t;

                memset(&s, 0, sizeof(s));
                /* guard against past and future API changes */
                memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
                /* check if new data is valid */
                if ((s.enabled != 0 && s.enabled != 1) ||
                    (s.log_passwd != 0 && s.log_passwd != 1))
                        err = -EINVAL;

                /* on invalid input only read the current flags for the log
                 * record; otherwise swap in the new flags and keep the old */
                if (err)
                        t = READ_ONCE(current->signal->audit_tty);
                else {
                        /* log_passwd is 0 or 1 here, so -log_passwd is
                         * either 0 or all ones and selects the flag bit */
                        t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD);
                        t = xchg(&current->signal->audit_tty, t);
                }
                old.enabled = t & AUDIT_TTY_ENABLE;
                old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD);

                audit_log_common_recv_msg(audit_context(), &ab,
                                          AUDIT_CONFIG_CHANGE);
                audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d"
                                 " old-log_passwd=%d new-log_passwd=%d res=%d",
                                 old.enabled, s.enabled, old.log_passwd,
                                 s.log_passwd, !err);
                audit_log_end(ab);
                break;
        }
        default:
                err = -EINVAL;
                break;
        }

        /* positive values left in err (e.g. a filter result) map to 0 */
        return err < 0 ? err : 0;
}

/**
 * audit_receive - receive messages from a netlink control socket
 * @skb: the message buffer
 *
 * Walks every well-formed netlink message in @skb under the audit control
 * lock and hands each one to audit_receive_msg(); parsing stops at the
 * first malformed message and the remainder is discarded.  After the lock
 * is dropped the sender is made to wait if the audit queue is over its
 * backlog limit.
 */
static void audit_receive(struct sk_buff *skb)
{
        struct nlmsghdr *msg = nlmsg_hdr(skb);
        /*
         * remaining MUST be signed so that nlmsg_next can decrement it
         * below 0 when nlmsg_len was not aligned
         */
        int remaining = skb->len;

        audit_ctl_lock();
        for (; nlmsg_ok(msg, remaining); msg = nlmsg_next(msg, &remaining)) {
                bool ack = msg->nlmsg_flags & NLM_F_ACK;
                int err = audit_receive_msg(skb, msg, &ack);

                /* ack on error, or when the user asked for an ack and
                 * audit_receive_msg did not already send one */
                if (err || ack)
                        netlink_ack(skb, msg, err, NULL);
        }
        audit_ctl_unlock();

        /* blocking is not allowed under the ctrl lock, so the sender is
         * penalized here instead */
        if (audit_backlog_limit &&
            (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
                DECLARE_WAITQUEUE(wait, current);

                /* give kauditd a chance to flush the queue */
                wake_up_interruptible(&kauditd_wait);

                add_wait_queue_exclusive(&audit_backlog_wait, &wait);
                set_current_state(TASK_UNINTERRUPTIBLE);
                schedule_timeout(audit_backlog_wait_time);
                remove_wait_queue(&audit_backlog_wait, &wait);
        }
}

/* Log information about who is connecting to the audit multicast socket */
static void audit_log_multicast(int group, const char *op, int err)
{
        const struct cred *cred;
        struct tty_struct *tty;
        char comm[sizeof(current->comm)];
        struct audit_buffer *ab;

        if (!audit_enabled)
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_EVENT_LISTENER);
        if (!ab)
                return;

        cred = current_cred();
        tty = audit_get_tty();
        audit_log_format(ab, "pid=%u uid=%u auid=%u tty=%s ses=%u",
                         task_tgid_nr(current),
                         from_kuid(&init_user_ns, cred->uid),
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
                         tty ? tty_name(tty) : "(none)",
                         audit_get_sessionid(current));
        audit_put_tty(tty);
        audit_log_task_context(ab); /* subj= */
        audit_log_format(ab, " comm=");
        audit_log_untrustedstring(ab, get_task_comm(comm, current));
        audit_log_d_path_exe(ab, current->mm); /* exe= */
        audit_log_format(ab, " nl-mcgrp=%d op=%s res=%d", group, op, !err);
        audit_log_end(ab);
}

/*
 * Custom bind hook run on netlink socket group connect or bind requests.
 * The caller needs CAP_AUDIT_READ; the attempt is logged either way.
 * Returns 0 when allowed, -EPERM otherwise.  @net is unused.
 */
static int audit_multicast_bind(struct net *net, int group)
{
        int err = capable(CAP_AUDIT_READ) ? 0 : -EPERM;

        audit_log_multicast(group, "connect", err);

        return err;
}

/*
 * Unbind hook for the audit netlink socket (installed as .unbind in
 * audit_net_init()): logs a "disconnect" listener event for @group with a
 * successful result.  @net is unused.
 */
static void audit_multicast_unbind(struct net *net, int group)
{
        audit_log_multicast(group, "disconnect", 0);
}

/*
 * Per-network-namespace setup: create the NETLINK_AUDIT kernel socket for
 * @net and store it in the namespace's audit_net data.
 *
 * Returns 0 on success; if the socket cannot be created audit_panic() is
 * invoked and -ENOMEM is returned.
 */
static int __net_init audit_net_init(struct net *net)
{
        struct audit_net *aunet = net_generic(net, audit_net_id);
        struct netlink_kernel_cfg cfg = {
                .input        = audit_receive,
                .bind        = audit_multicast_bind,
                .unbind        = audit_multicast_unbind,
                .flags        = NL_CFG_F_NONROOT_RECV,
                .groups        = AUDIT_NLGRP_MAX,
        };

        aunet->sk = netlink_kernel_create(net, NETLINK_AUDIT, &cfg);
        if (aunet->sk) {
                /* keep sends from stalling for long if auditd is
                 * blocked/stopped */
                aunet->sk->sk_sndtimeo = HZ / 10;
                return 0;
        }

        audit_panic("cannot initialize netlink socket in namespace");
        return -ENOMEM;
}

/*
 * Per-network-namespace teardown: release the audit netlink socket that
 * audit_net_init() stored in the namespace's audit_net data.
 */
static void __net_exit audit_net_exit(struct net *net)
{
        struct audit_net *aunet = net_generic(net, audit_net_id);

        /* NOTE: you would think that we would want to check the auditd
         * connection and potentially reset it here if it lives in this
         * namespace, but since the auditd connection tracking struct holds a
         * reference to this namespace (see auditd_set()) we are only ever
         * going to get here after that connection has been released */

        netlink_kernel_release(aunet->sk);
}

/*
 * Per-network-namespace operations for audit, registered from audit_init():
 * .init/.exit create and release the namespace's audit netlink socket,
 * .id/.size describe the struct audit_net storage looked up through
 * net_generic() with audit_net_id.
 */
static struct pernet_operations audit_net_ops __net_initdata = {
        .init = audit_net_init,
        .exit = audit_net_exit,
        .id = &audit_net_id,
        .size = sizeof(struct audit_net),
};

/*
 * Initialize audit support at boot time (run as a postcore initcall).
 *
 * Does nothing when audit was disabled beforehand (audit_initialized ==
 * AUDIT_DISABLED, which audit_enable() sets for "audit=off").  Otherwise
 * sets up the buffer cache, the record queues, the inode hash and the
 * command mutex, registers the per-namespace netlink socket, starts the
 * kauditd thread and logs an "initialized" record.  Always returns 0;
 * failure to start kauditd panics.
 */
static int __init audit_init(void)
{
        int i;

        if (audit_initialized == AUDIT_DISABLED)
                return 0;

        audit_buffer_cache = KMEM_CACHE(audit_buffer, SLAB_PANIC);

        skb_queue_head_init(&audit_queue);
        skb_queue_head_init(&audit_retry_queue);
        skb_queue_head_init(&audit_hold_queue);

        for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
                INIT_LIST_HEAD(&audit_inode_hash[i]);

        mutex_init(&audit_cmd_mutex.lock);
        audit_cmd_mutex.owner = NULL;

        pr_info("initializing netlink subsys (%s)\n",
                str_enabled_disabled(audit_default));
        /* NOTE(review): the return value of register_pernet_subsys() is
         * not checked here */
        register_pernet_subsys(&audit_net_ops);

        /* from here on audit_log_start() no longer bails out early */
        audit_initialized = AUDIT_INITIALIZED;

        kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
        if (IS_ERR(kauditd_task)) {
                int err = PTR_ERR(kauditd_task);
                panic("audit: failed to start the kauditd thread (%d)\n", err);
        }

        audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL,
                "state=initialized audit_enabled=%u res=1",
                 audit_enabled);

        return 0;
}
postcore_initcall(audit_init);

/*
 * Process kernel command-line parameter at boot time.
 * audit={0|off} or audit={1|on}.
 *
 * Any other value is reported and treated as "on".  Turning audit off this
 * way also marks the subsystem as disabled (audit_initialized), which makes
 * audit_init() skip its work.  Always returns 1.
 */
static int __init audit_enable(char *str)
{
        const char *state;

        if (!strcasecmp(str, "off") || !strcmp(str, "0")) {
                audit_default = AUDIT_OFF;
        } else {
                /* anything that is not an explicit "on" is flagged, then
                 * handled like "on" */
                if (strcasecmp(str, "on") && strcmp(str, "1"))
                        pr_err("audit: invalid 'audit' parameter value (%s)\n", str);
                audit_default = AUDIT_ON;
        }

        if (audit_default == AUDIT_OFF)
                audit_initialized = AUDIT_DISABLED;

        if (audit_set_enabled(audit_default) != 0)
                pr_err("audit: error setting audit state (%d)\n",
                       audit_default);

        if (audit_default)
                state = "enabled (after initialization)";
        else
                state = "disabled (until reboot)";
        pr_info("%s\n", state);

        return 1;
}
__setup("audit=", audit_enable);

/*
 * Process kernel command-line parameter at boot time.
 * audit_backlog_limit=<n>
 *
 * @str is parsed with kstrtouint() using base 0 (auto-detected base).  On a
 * parse failure the current limit is kept and reported; otherwise the new
 * limit is stored and reported.  Always returns 1.
 */
static int __init audit_backlog_limit_set(char *str)
{
        u32 audit_backlog_limit_arg;

        pr_info("audit_backlog_limit: ");
        if (kstrtouint(str, 0, &audit_backlog_limit_arg)) {
                pr_cont("using default of %u, unable to parse %s\n",
                        audit_backlog_limit, str);
                return 1;
        }

        audit_backlog_limit = audit_backlog_limit_arg;
        /* the limit is unsigned: use %u, as in the failure message above,
         * so values above INT_MAX are not printed as negative numbers */
        pr_cont("%u\n", audit_backlog_limit);

        return 1;
}
__setup("audit_backlog_limit=", audit_backlog_limit_set);

/*
 * Free an audit buffer: drop its skb and return the buffer itself to
 * audit_buffer_cache.  A NULL @ab is accepted and ignored.
 */
static void audit_buffer_free(struct audit_buffer *ab)
{
        if (ab) {
                kfree_skb(ab->skb);
                kmem_cache_free(audit_buffer_cache, ab);
        }
}

/*
 * Allocate an audit buffer from audit_buffer_cache together with an skb of
 * AUDIT_BUFSIZ payload that already carries a netlink header of the given
 * @type.  @ctx and @gfp_mask are recorded in the buffer.
 *
 * Returns the new buffer, or NULL if any allocation step fails (nothing is
 * leaked in that case).
 */
static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx,
                                               gfp_t gfp_mask, int type)
{
        struct audit_buffer *buf = kmem_cache_alloc(audit_buffer_cache, gfp_mask);

        if (!buf)
                return NULL;

        buf->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
        if (!buf->skb || !nlmsg_put(buf->skb, 0, 0, type, 0, 0)) {
                /* releases the skb too, if one was obtained */
                audit_buffer_free(buf);
                return NULL;
        }

        buf->ctx = ctx;
        buf->gfp_mask = gfp_mask;

        return buf;
}

/**
 * audit_serial - compute a serial number for the audit record
 *
 * Hands out the next value of a function-local atomic counter.
 *
 * Audit records are written to user-space as soon as they are generated,
 * so one complete audit record may be written in several pieces.
 * User-space tools use the record's timestamp together with this serial
 * number to work out which pieces belong to the same record.  The
 * (timestamp,serial) tuple is unique for each syscall and is live from
 * syscall entry to syscall exit.
 *
 * NOTE: An alternative would be to keep the formatted records on the audit
 * context (for records that have one) and emit them all at syscall exit.
 * That could, however, delay the reporting of significant errors until
 * syscall exit (or forever, if the system halts).
 */
unsigned int audit_serial(void)
{
        static atomic_t next_serial = ATOMIC_INIT(0);
        unsigned int value = atomic_inc_return(&next_serial);

        return value;
}

/*
 * Fill in the timestamp @t and serial number @serial for a new record.
 * When @ctx is given and auditsc_get_stamp() reports success, the values
 * it stored are used as they are; otherwise the current coarse real time
 * and a fresh audit_serial() value are used.
 */
static inline void audit_get_stamp(struct audit_context *ctx,
                                   struct timespec64 *t, unsigned int *serial)
{
        if (ctx && auditsc_get_stamp(ctx, t, serial))
                return;

        ktime_get_coarse_real_ts64(t);
        *serial = audit_serial();
}

/**
 * audit_log_start - obtain an audit buffer
 * @ctx: audit_context (may be NULL)
 * @gfp_mask: type of allocation
 * @type: audit message type
 *
 * Returns audit_buffer pointer on success or NULL on error.
 *
 * Obtain an audit buffer.  This routine does locking to obtain the
 * audit buffer, but then no locking is required for calls to
 * audit_log_*format.  If the task (ctx) is a task that is currently in a
 * syscall, then the syscall is marked as auditable and an audit record
 * will be written at syscall exit.  If there is no associated task, then
 * task context (ctx) should be NULL.
 */
struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
                                     int type)
{
        struct audit_buffer *ab;
        struct timespec64 t;
        unsigned int serial;

        /* nothing is logged unless the subsystem reports AUDIT_INITIALIZED */
        if (audit_initialized != AUDIT_INITIALIZED)
                return NULL;

        /* give up when audit_filter() returns 0 for the exclude filter */
        if (unlikely(!audit_filter(type, AUDIT_FILTER_EXCLUDE)))
                return NULL;

        /* NOTE: don't ever fail/sleep on these two conditions:
         * 1. auditd generated record - since we need auditd to drain the
         *    queue; also, when we are checking for auditd, compare PIDs using
         *    task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
         *    using a PID anchored in the caller's namespace
         * 2. generator holding the audit_cmd_mutex - we don't want to block
         *    while holding the mutex, although we do penalize the sender
         *    later in audit_receive() when it is safe to block
         */
        if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
                /* wait budget for this call; overwritten after each sleep */
                long stime = audit_backlog_wait_time;

                /* a backlog limit of 0 disables the check entirely */
                while (audit_backlog_limit &&
                       (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
                        /* wake kauditd to try and flush the queue */
                        wake_up_interruptible(&kauditd_wait);

                        /* sleep if we are allowed and we haven't exhausted our
                         * backlog wait limit */
                        if (gfpflags_allow_blocking(gfp_mask) && (stime > 0)) {
                                long rtime = stime;

                                DECLARE_WAITQUEUE(wait, current);

                                add_wait_queue_exclusive(&audit_backlog_wait,
                                                         &wait);
                                set_current_state(TASK_UNINTERRUPTIBLE);
                                /* stime now holds schedule_timeout()'s result;
                                 * once it is <= 0 the else branch is taken */
                                stime = schedule_timeout(rtime);
                                /* record requested minus returned time */
                                atomic_add(rtime - stime, &audit_backlog_wait_time_actual);
                                remove_wait_queue(&audit_backlog_wait, &wait);
                        } else {
                                /* cannot (or may no longer) wait: report the
                                 * overflow and drop this record */
                                if (audit_rate_check() && printk_ratelimit())
                                        pr_warn("audit_backlog=%d > audit_backlog_limit=%d\n",
                                                skb_queue_len(&audit_queue),
                                                audit_backlog_limit);
                                audit_log_lost("backlog limit exceeded");
                                return NULL;
                        }
                }
        }

        ab = audit_buffer_alloc(ctx, gfp_mask, type);
        if (!ab) {
                audit_log_lost("out of memory in audit_log_start");
                return NULL;
        }

        audit_get_stamp(ab->ctx, &t, &serial);
        /* cancel dummy context to enable supporting records */
        if (ctx)
                ctx->dummy = 0;
        /* record prefix: seconds.milliseconds:serial */
        audit_log_format(ab, "audit(%llu.%03lu:%u): ",
                         (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial);

        return ab;
}

/**
 * audit_expand - expand skb in the audit buffer
 * @ab: audit_buffer
 * @extra: space to add at tail of the skb
 *
 * Returns 0 (no space) on failed expansion, or available space if
 * successful.
 */
static inline int audit_expand(struct audit_buffer *ab, int extra)
{
        struct sk_buff *skb = ab->skb;
        int tail_before = skb_tailroom(skb);
        int rc = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
        int tail_after;

        /* A failed expansion is reported as a lost record; 0 means "no space". */
        if (rc < 0) {
                audit_log_lost("out of memory in audit_expand");
                return 0;
        }

        /* Account the extra tailroom in the skb's truesize. */
        tail_after = skb_tailroom(skb);
        skb->truesize += tail_after - tail_before;
        return tail_after;
}

/*
 * Format an audit message into the audit buffer.  If there isn't enough
 * room in the audit buffer, more room will be allocated and vsnprintf
 * will be called a second time.  Currently, we assume that a printk
 * can't format message larger than 1024 bytes, so we don't either.
 */
static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
                              va_list args)
{
        int len, avail;
        struct sk_buff *skb;
        va_list args2;

        if (!ab)
                return;

        BUG_ON(!ab->skb);
        skb = ab->skb;
        avail = skb_tailroom(skb);
        if (avail == 0) {
                avail = audit_expand(ab, AUDIT_BUFSIZ);
                /* args2 has not been copied yet, so skip va_end(args2) */
                if (!avail)
                        goto out;
        }
        /* keep a second copy: the first vsnprintf() below consumes args */
        va_copy(args2, args);
        len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
        /* len >= avail: the output did not fit in the available tailroom */
        if (len >= avail) {
                /* The printk buffer is 1024 bytes long, so if we get
                 * here and AUDIT_BUFSIZ is at least 1024, then we can
                 * log everything that printk could have logged. */
                avail = audit_expand(ab,
                        max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
                if (!avail)
                        goto out_va_end;
                /* retry into the enlarged buffer using the saved copy */
                len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
        }
        /* commit the formatted bytes to the skb */
        if (len > 0)
                skb_put(skb, len);
out_va_end:
        va_end(args2);
out:
        return;
}

/**
 * audit_log_format - format a message into the audit buffer.
 * @ab: audit_buffer
 * @fmt: format string
 * @...: optional parameters matching @fmt string
 *
 * All the work is done in audit_log_vformat.
 */
void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
{
        va_list ap;

        /* A NULL buffer means there is nothing to append to. */
        if (ab) {
                va_start(ap, fmt);
                audit_log_vformat(ab, fmt, ap);
                va_end(ap);
        }
}

/**
 * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
 * @ab: the audit_buffer
 * @buf: buffer to convert to hex
 * @len: length of @buf to be converted
 *
 * No return value; failure to expand is silently ignored.
 *
 * This function will take the passed buf and convert it into a string of
 * ascii hex digits. The new string is placed onto the skb.
 */
void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
                size_t len)
{
        int room, hex_len;
        unsigned char *out;
        const unsigned char *in = buf;
        size_t left = len;
        struct sk_buff *skb;

        if (!ab)
                return;

        BUG_ON(!ab->skb);
        skb = ab->skb;
        room = skb_tailroom(skb);
        /* two hex digits per input byte */
        hex_len = len<<1;
        if (hex_len >= room) {
                /* Round the buffer request up to the next multiple */
                hex_len = AUDIT_BUFSIZ*(((hex_len-room)/AUDIT_BUFSIZ) + 1);
                room = audit_expand(ab, hex_len);
                /* expansion failed: drop the data silently */
                if (!room)
                        return;
        }

        out = skb_tail_pointer(skb);
        while (left--)
                out = hex_byte_pack_upper(out, *in++);
        /* terminate the string; the NUL is not counted in skb_put() */
        *out = 0;
        skb_put(skb, len << 1); /* new string is twice the old string */
}

/*
 * Format a string of no more than slen characters into the audit buffer,
 * enclosed in quote marks.
 */
void audit_log_n_string(struct audit_buffer *ab, const char *string,
                        size_t slen)
{
        int room, needed;
        unsigned char *tail;
        struct sk_buff *skb;

        if (!ab)
                return;

        BUG_ON(!ab->skb);
        skb = ab->skb;
        room = skb_tailroom(skb);
        /* payload + opening quote + closing quote + NUL */
        needed = slen + 3;
        if (needed > room) {
                room = audit_expand(ab, needed);
                if (!room)
                        return;
        }

        /* lay out: '"' <slen bytes> '"' '\0' */
        tail = skb_tail_pointer(skb);
        tail[0] = '"';
        memcpy(tail + 1, string, slen);
        tail[slen + 1] = '"';
        tail[slen + 2] = 0;
        skb_put(skb, slen + 2);        /* don't include null terminator */
}

/**
 * audit_string_contains_control - does a string need to be logged in hex
 * @string: string to be checked
 * @len: max length of the string to check
 */
bool audit_string_contains_control(const char *string, size_t len)
{
        const unsigned char *bytes = (const unsigned char *)string;
        size_t idx;

        /*
         * Only bytes in 0x21..0x7e other than '"' are acceptable; anything
         * else (space, control, DEL, high-bit bytes, double quote) triggers
         * a "true" result.
         */
        for (idx = 0; idx < len; idx++) {
                unsigned char c = bytes[idx];

                if (c < 0x21 || c > 0x7e || c == '"')
                        return true;
        }
        return false;
}

/**
 * audit_log_n_untrustedstring - log a string that may contain random characters
 * @ab: audit_buffer
 * @string: string to be logged
 * @len: length of string (not including trailing null)
 *
 * This code will escape a string that is passed to it if the string
 * contains a control character, unprintable character, double quote mark,
 * or a space. Unescaped strings will start and end with a double quote mark.
 * Strings that are escaped are printed in hex (2 digits per char).
 *
 * The caller specifies the number of characters in the string to log, which may
 * or may not be the entire string.
 */
void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
                                 size_t len)
{
        /* Clean strings are emitted quoted; everything else goes out as hex. */
        if (!audit_string_contains_control(string, len)) {
                audit_log_n_string(ab, string, len);
                return;
        }

        audit_log_n_hex(ab, string, len);
}

/**
 * audit_log_untrustedstring - log a string that may contain random characters
 * @ab: audit_buffer
 * @string: string to be logged
 *
 * Same as audit_log_n_untrustedstring(), except that strlen is used to
 * determine string length.
 */
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
        /* Log the whole NUL-terminated string. */
        size_t len = strlen(string);

        audit_log_n_untrustedstring(ab, string, len);
}

/*
 * Helper to print the escaped d_path of @path into @ab, optionally
 * preceded by @prefix.  A NULL @ab is accepted and results in a no-op,
 * matching the other audit_log_* helpers.  If the scratch buffer cannot
 * be allocated, or d_path() fails, a fixed placeholder string is logged
 * in place of the path.
 */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
                      const struct path *path)
{
        /* We will allow 11 spaces for ' (deleted)' to be appended */
        const int buflen = PATH_MAX + 11;
        char *p, *pathname;

        /* ab->gfp_mask is read below, so a NULL buffer must bail out here */
        if (!ab)
                return;

        if (prefix)
                audit_log_format(ab, "%s", prefix);

        pathname = kmalloc(buflen, ab->gfp_mask);
        if (!pathname) {
                audit_log_format(ab, "\"<no_memory>\"");
                return;
        }
        p = d_path(path, pathname, buflen);
        if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
                /* FIXME: can we save some information here? */
                audit_log_format(ab, "\"<too_long>\"");
        } else
                audit_log_untrustedstring(ab, p);
        kfree(pathname);
}

void audit_log_session_info(struct audit_buffer *ab)
{
        unsigned int sessionid = audit_get_sessionid(current);
        uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));

        audit_log_format(ab, "auid=%u ses=%u", auid, sessionid);
}

/* Append " key=<key>" to @ab, or " key=(null)" when no key is given. */
void audit_log_key(struct audit_buffer *ab, char *key)
{
        audit_log_format(ab, " key=");

        if (!key) {
                audit_log_format(ab, "(null)");
                return;
        }

        audit_log_untrustedstring(ab, key);
}

int audit_log_task_context(struct audit_buffer *ab)
{
        struct lsm_prop prop;
        struct lsm_context ctx;
        int error;

        security_current_getlsmprop_subj(&prop);
        if (!lsmprop_is_set(&prop))
                return 0;

        error = security_lsmprop_to_secctx(&prop, &ctx);
        if (error < 0) {
                if (error != -EINVAL)
                        goto error_path;
                return 0;
        }

        audit_log_format(ab, " subj=%s", ctx.context);
        security_release_secctx(&ctx);
        return 0;

error_path:
        audit_panic("error in audit_log_task_context");
        return error;
}
EXPORT_SYMBOL(audit_log_task_context);

void audit_log_d_path_exe(struct audit_buffer *ab,
                          struct mm_struct *mm)
{
        struct file *exe_file;

        if (!mm)
                goto out_null;

        exe_file = get_mm_exe_file(mm);
        if (!exe_file)
                goto out_null;

        audit_log_d_path(ab, " exe=", &exe_file->f_path);
        fput(exe_file);
        return;
out_null:
        audit_log_format(ab, " exe=(null)");
}

/*
 * Return the current task's tty as obtained through tty_kref_get(), or
 * NULL when current->signal is NULL.  The lookup is done with the
 * sighand siglock held and interrupts saved/restored.  Pair with
 * audit_put_tty(), which calls tty_kref_put().
 */
struct tty_struct *audit_get_tty(void)
{
        struct tty_struct *tty = NULL;
        unsigned long flags;

        spin_lock_irqsave(&current->sighand->siglock, flags);
        if (current->signal)
                tty = tty_kref_get(current->signal->tty);
        spin_unlock_irqrestore(&current->sighand->siglock, flags);
        return tty;
}

/* Counterpart of audit_get_tty(): hands @tty to tty_kref_put(). */
void audit_put_tty(struct tty_struct *tty)
{
        tty_kref_put(tty);
}

/*
 * Append a description of the current task to @ab: parent pid, pid,
 * loginuid, the uid/gid credential set, tty name, session id, comm,
 * executable path and the task context.  All uids/gids are translated
 * relative to init_user_ns.  Does nothing when @ab is NULL.
 */
void audit_log_task_info(struct audit_buffer *ab)
{
        const struct cred *cred;
        char comm[sizeof(current->comm)];
        struct tty_struct *tty;

        if (!ab)
                return;

        cred = current_cred();
        tty = audit_get_tty();
        audit_log_format(ab,
                         " ppid=%d pid=%d auid=%u uid=%u gid=%u"
                         " euid=%u suid=%u fsuid=%u"
                         " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
                         task_ppid_nr(current),
                         task_tgid_nr(current),
                         from_kuid(&init_user_ns, audit_get_loginuid(current)),
                         from_kuid(&init_user_ns, cred->uid),
                         from_kgid(&init_user_ns, cred->gid),
                         from_kuid(&init_user_ns, cred->euid),
                         from_kuid(&init_user_ns, cred->suid),
                         from_kuid(&init_user_ns, cred->fsuid),
                         from_kgid(&init_user_ns, cred->egid),
                         from_kgid(&init_user_ns, cred->sgid),
                         from_kgid(&init_user_ns, cred->fsgid),
                         tty ? tty_name(tty) : "(none)",
                         audit_get_sessionid(current));
        /* the tty name has been formatted; the tty is no longer needed */
        audit_put_tty(tty);
        audit_log_format(ab, " comm=");
        /* comm goes through the untrusted-string path (quoted or hex) */
        audit_log_untrustedstring(ab, get_task_comm(comm, current));
        audit_log_d_path_exe(ab, current->mm);
        /* return value of audit_log_task_context() is not checked here */
        audit_log_task_context(ab);
}
EXPORT_SYMBOL(audit_log_task_info);

/**
 * audit_log_path_denied - report a path restriction denial
 * @type: audit message type (AUDIT_ANOM_LINK, AUDIT_ANOM_CREAT, etc)
 * @operation: specific operation name
 */
void audit_log_path_denied(int type, const char *operation)
{
        struct audit_buffer *ab;

        /* Nothing to do when auditing is off or the context is a dummy. */
        if (!audit_enabled)
                return;
        if (audit_dummy_context())
                return;

        /* Record layout: operation, then subject info, then a fixed res=0. */
        ab = audit_log_start(audit_context(), GFP_KERNEL, type);
        if (ab) {
                audit_log_format(ab, "op=%s", operation);
                audit_log_task_info(ab);
                audit_log_format(ab, " res=0");
                audit_log_end(ab);
        }
}

/* global counter which is incremented every time something logs in */
static atomic_t session_id = ATOMIC_INIT(0);

/*
 * Decide whether the current task may change its loginuid to @loginuid.
 * Returns 0 when allowed and -EPERM otherwise.
 */
static int audit_set_loginuid_perm(kuid_t loginuid)
{
        /* an unset loginuid may be set without any privilege */
        if (!audit_loginuid_set(current))
                return 0;

        /* LOGINUID_IMMUTABLE: once set, a change is never allowed */
        if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
                return -EPERM;

        /* changing an already-set loginuid requires CAP_AUDIT_CONTROL */
        if (!capable(CAP_AUDIT_CONTROL))
                return -EPERM;

        /* without ONLY_UNSET_LOGINUID any new value is acceptable */
        if (!is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID))
                return 0;

        /* with it, only an unset (i.e. an invalid uid) is permitted */
        return uid_valid(loginuid) ? -EPERM : 0;
}

/*
 * Emit an AUDIT_LOGIN record describing a loginuid change attempt by the
 * current task: pid, uid, task context, old and new loginuid, tty, old
 * and new session id, and the outcome.  Nothing is logged when auditing
 * is disabled or no buffer can be obtained.
 */
static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,
                                   unsigned int oldsessionid,
                                   unsigned int sessionid, int rc)
{
        struct audit_buffer *ab;
        uid_t uid, oldloginuid, loginuid;
        struct tty_struct *tty;

        if (!audit_enabled)
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_LOGIN);
        if (!ab)
                return;

        /* translate kernel uids into init_user_ns values for printing */
        uid = from_kuid(&init_user_ns, task_uid(current));
        oldloginuid = from_kuid(&init_user_ns, koldloginuid);
        loginuid = from_kuid(&init_user_ns, kloginuid);
        tty = audit_get_tty();

        audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
        audit_log_task_context(ab);
        /* res is !rc: 1 when rc is 0, 0 otherwise */
        audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
                         oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
                         oldsessionid, sessionid, !rc);
        audit_put_tty(tty);
        audit_log_end(ab);
}

/**
 * audit_set_loginuid - set current task's loginuid
 * @loginuid: loginuid value
 *
 * Returns 0 on success, or the non-zero result of
 * audit_set_loginuid_perm() when the change is refused.  In both cases
 * the attempt is passed to audit_log_set_loginuid().
 *
 * Called (set) from fs/proc/base.c::proc_loginuid_write().
 */
int audit_set_loginuid(kuid_t loginuid)
{
        kuid_t old_auid = audit_get_loginuid(current);
        unsigned int old_ses = audit_get_sessionid(current);
        unsigned int new_ses = AUDIT_SID_UNSET;
        int rc;

        rc = audit_set_loginuid_perm(loginuid);
        if (!rc) {
                /* a valid uid gets a new session id; otherwise it stays unset */
                if (uid_valid(loginuid)) {
                        new_ses = (unsigned int)atomic_inc_return(&session_id);
                        /* never hand out the reserved "unset" value */
                        if (unlikely(new_ses == AUDIT_SID_UNSET))
                                new_ses = (unsigned int)atomic_inc_return(&session_id);
                }

                current->sessionid = new_ses;
                current->loginuid = loginuid;
        }

        audit_log_set_loginuid(old_auid, loginuid, old_ses, new_ses, rc);
        return rc;
}

/**
 * audit_signal_info - record signal info for shutting down audit subsystem
 * @sig: signal value
 * @t: task being signaled
 *
 * If the audit subsystem is being terminated, record the task (pid)
 * and uid that is doing that.
 */
int audit_signal_info(int sig, struct task_struct *t)
{
        kuid_t uid = current_uid(), auid;

        /* Only signals aimed at the audit daemon task are recorded. */
        if (auditd_test_task(t)) {
                switch (sig) {
                case SIGTERM:
                case SIGHUP:
                case SIGUSR1:
                case SIGUSR2:
                        audit_sig_pid = task_tgid_nr(current);
                        /* prefer the sender's loginuid, fall back to its uid */
                        auid = audit_get_loginuid(current);
                        audit_sig_uid = uid_valid(auid) ? auid : uid;
                        security_current_getlsmprop_subj(&audit_sig_lsm);
                        break;
                default:
                        break;
                }
        }

        return audit_signal_info_syscall(t);
}

/**
 * audit_log_end - end one audit record
 * @ab: the audit_buffer
 *
 * We can not do a netlink send inside an irq context because it blocks (last
 * arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed on a
 * queue and a kthread is scheduled to remove them from the queue outside the
 * irq context.  May be called in any context.
 */
void audit_log_end(struct audit_buffer *ab)
{
        struct sk_buff *skb;
        struct nlmsghdr *nlh;

        if (!ab)
                return;

        if (!audit_rate_check()) {
                audit_log_lost("rate limit exceeded");
        } else {
                /* detach the skb so freeing the buffer leaves it alone */
                skb = ab->skb;
                ab->skb = NULL;

                /* setup the netlink header, see the comments in
                 * kauditd_send_multicast_skb() for length quirks */
                nlh = nlmsg_hdr(skb);
                nlh->nlmsg_len = skb->len - NLMSG_HDRLEN;

                /* queue the netlink packet and poke the kauditd thread */
                skb_queue_tail(&audit_queue, skb);
                wake_up_interruptible(&kauditd_wait);
        }

        audit_buffer_free(ab);
}

/**
 * audit_log - Log an audit record
 * @ctx: audit context
 * @gfp_mask: type of allocation
 * @type: audit message type
 * @fmt: format string to use
 * @...: variable parameters matching the format string
 *
 * This is a convenience function that calls audit_log_start,
 * audit_log_vformat, and audit_log_end.  It may be called
 * in any context.
 */
void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
               const char *fmt, ...)
{
        struct audit_buffer *ab = audit_log_start(ctx, gfp_mask, type);
        va_list ap;

        /* no buffer, no record */
        if (!ab)
                return;

        va_start(ap, fmt);
        audit_log_vformat(ab, fmt, ap);
        va_end(ap);
        audit_log_end(ab);
}

EXPORT_SYMBOL(audit_log_start);
EXPORT_SYMBOL(audit_log_end);
EXPORT_SYMBOL(audit_log_format);
EXPORT_SYMBOL(audit_log);































































































































































































































































































































































  312 







































  482 









    6 



























































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_SPINLOCK_H
#define __LINUX_SPINLOCK_H
#define __LINUX_INSIDE_SPINLOCK_H

/*
 * include/linux/spinlock.h - generic spinlock/rwlock declarations
 *
 * here's the role of the various spinlock/rwlock related include files:
 *
 * on SMP builds:
 *
 *  asm/spinlock_types.h: contains the arch_spinlock_t/arch_rwlock_t and the
 *                        initializers
 *
 *  linux/spinlock_types_raw.h:
 *                          The raw types and initializers
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  asm/spinlock.h:       contains the arch_spin_*()/etc. lowlevel
 *                        implementations, mostly inline assembly code
 *
 *   (also included on UP-debug builds:)
 *
 *  linux/spinlock_api_smp.h:
 *                        contains the prototypes for the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 *
 * on UP builds:
 *
 *  linux/spinlock_types_up.h:
 *                        contains the generic, simplified UP spinlock type.
 *                        (which is an empty structure on non-debug builds)
 *
 *  linux/spinlock_types_raw.h:
 *                          The raw RT types and initializers
 *  linux/spinlock_types.h:
 *                        defines the generic type and initializers
 *
 *  linux/spinlock_up.h:
 *                        contains the arch_spin_*()/etc. version of UP
 *                        builds. (which are NOPs on non-debug, non-preempt
 *                        builds)
 *
 *   (included on UP-non-debug builds:)
 *
 *  linux/spinlock_api_up.h:
 *                        builds the _spin_*() APIs.
 *
 *  linux/spinlock.h:     builds the final spin_*() APIs.
 */

#include <linux/typecheck.h>
#include <linux/preempt.h>
#include <linux/linkage.h>
#include <linux/compiler.h>
#include <linux/irqflags.h>
#include <linux/thread_info.h>
#include <linux/stringify.h>
#include <linux/bottom_half.h>
#include <linux/lockdep.h>
#include <linux/cleanup.h>
#include <asm/barrier.h>
#include <asm/mmiowb.h>


/*
 * Must define these before including other files, inline functions need them
 */
#define LOCK_SECTION_NAME ".text..lock."KBUILD_BASENAME

#define LOCK_SECTION_START(extra)               \
        ".subsection 1\n\t"                     \
        extra                                   \
        ".ifndef " LOCK_SECTION_NAME "\n\t"     \
        LOCK_SECTION_NAME ":\n\t"               \
        ".endif\n"

#define LOCK_SECTION_END                        \
        ".previous\n\t"

#define __lockfunc __section(".spinlock.text")

/*
 * Pull the arch_spinlock_t and arch_rwlock_t definitions:
 */
#include <linux/spinlock_types.h>

/*
 * Pull the arch_spin*() functions/declarations (UP-nondebug doesn't need them):
 */
#ifdef CONFIG_SMP
# include <asm/spinlock.h>
#else
# include <linux/spinlock_up.h>
#endif

/*
 * raw_spin_lock_init(lock) - initialise a raw spinlock.
 *
 * With CONFIG_DEBUG_SPINLOCK the macro declares a static lock_class_key
 * at the point of use and passes it, together with the stringified lock
 * expression and LD_WAIT_SPIN, to __raw_spin_lock_init().  Without it the
 * lock is simply assigned __RAW_SPIN_LOCK_UNLOCKED(lock).
 */
#ifdef CONFIG_DEBUG_SPINLOCK
  extern void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name,
                                   struct lock_class_key *key, short inner);

# define raw_spin_lock_init(lock)                                        \
do {                                                                        \
        static struct lock_class_key __key;                                \
                                                                        \
        __raw_spin_lock_init((lock), #lock, &__key, LD_WAIT_SPIN);        \
} while (0)

#else
# define raw_spin_lock_init(lock)                                \
        do { *(lock) = __RAW_SPIN_LOCK_UNLOCKED(lock); } while (0)
#endif

/* Forward to arch_spin_is_locked() on the embedded raw_lock. */
#define raw_spin_is_locked(lock)        arch_spin_is_locked(&(lock)->raw_lock)

/*
 * Forward to arch_spin_is_contended() when the architecture defines it;
 * otherwise evaluate @lock (to avoid unused warnings) and yield 0.
 */
#ifdef arch_spin_is_contended
#define raw_spin_is_contended(lock)        arch_spin_is_contended(&(lock)->raw_lock)
#else
#define raw_spin_is_contended(lock)        (((void)(lock), 0))
#endif /*arch_spin_is_contended*/

/*
 * smp_mb__after_spinlock() provides the equivalent of a full memory barrier
 * between program-order earlier lock acquisitions and program-order later
 * memory accesses.
 *
 * This guarantees that the following two properties hold:
 *
 *   1) Given the snippet:
 *
 *          { X = 0;  Y = 0; }
 *
 *          CPU0                                CPU1
 *
 *          WRITE_ONCE(X, 1);                WRITE_ONCE(Y, 1);
 *          spin_lock(S);                        smp_mb();
 *          smp_mb__after_spinlock();        r1 = READ_ONCE(X);
 *          r0 = READ_ONCE(Y);
 *          spin_unlock(S);
 *
 *      it is forbidden that CPU0 does not observe CPU1's store to Y (r0 = 0)
 *      and CPU1 does not observe CPU0's store to X (r1 = 0); see the comments
 *      preceding the call to smp_mb__after_spinlock() in __schedule() and in
 *      try_to_wake_up().
 *
 *   2) Given the snippet:
 *
 *  { X = 0;  Y = 0; }
 *
 *  CPU0                CPU1                                CPU2
 *
 *  spin_lock(S);        spin_lock(S);                        r1 = READ_ONCE(Y);
 *  WRITE_ONCE(X, 1);        smp_mb__after_spinlock();        smp_rmb();
 *  spin_unlock(S);        r0 = READ_ONCE(X);                r2 = READ_ONCE(X);
 *                        WRITE_ONCE(Y, 1);
 *                        spin_unlock(S);
 *
 *      it is forbidden that CPU0's critical section executes before CPU1's
 *      critical section (r0 = 1), CPU2 observes CPU1's store to Y (r1 = 1)
 *      and CPU2 does not observe CPU0's store to X (r2 = 0); see the comments
 *      preceding the calls to smp_rmb() in try_to_wake_up() for similar
 *      snippets but "projected" onto two CPUs.
 *
 * Property (2) upgrades the lock to an RCsc lock.
 *
 * Since most load-store architectures implement ACQUIRE with an smp_mb() after
 * the LL/SC loop, they need no further barriers. Similarly all our TSO
 * architectures imply an smp_mb() for each atomic instruction and equally don't
 * need more.
 *
 * Architectures that can implement ACQUIRE better need to take care.
 */
/* Fallback used only when no earlier definition exists: expands to kcsan_mb(). */
#ifndef smp_mb__after_spinlock
#define smp_mb__after_spinlock()        kcsan_mb()
#endif

#ifdef CONFIG_DEBUG_SPINLOCK
 extern void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock);
 extern int do_raw_spin_trylock(raw_spinlock_t *lock);
 extern void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock);
#else
/*
 * Non-CONFIG_DEBUG_SPINLOCK lock path: annotate the acquisition with
 * __acquire(), take the architecture lock, then call mmiowb_spin_lock().
 */
static inline void do_raw_spin_lock(raw_spinlock_t *lock) __acquires(lock)
{
        __acquire(lock);
        arch_spin_lock(&lock->raw_lock);
        mmiowb_spin_lock();
}

/*
 * Non-CONFIG_DEBUG_SPINLOCK trylock path.  Returns the result of
 * arch_spin_trylock(); mmiowb_spin_lock() is only called when that
 * result is non-zero.
 */
static inline int do_raw_spin_trylock(raw_spinlock_t *lock)
{
        int acquired = arch_spin_trylock(&(lock)->raw_lock);

        if (!acquired)
                return acquired;

        mmiowb_spin_lock();
        return acquired;
}

/*
 * Non-CONFIG_DEBUG_SPINLOCK unlock path: call mmiowb_spin_unlock() first,
 * release the architecture lock, then annotate with __release().
 */
static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
{
        mmiowb_spin_unlock();
        arch_spin_unlock(&lock->raw_lock);
        __release(lock);
}
#endif

/*
 * Define the various spin_lock methods.  Note we define these
 * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
 * various methods are defined as nops in the case they are not
 * required.
 */
/* Conditional acquisition: wraps _raw_spin_trylock() in __cond_lock(). */
#define raw_spin_trylock(lock)        __cond_lock(lock, _raw_spin_trylock(lock))

/* Unconditional acquisition via _raw_spin_lock(). */
#define raw_spin_lock(lock)        _raw_spin_lock(lock)

/*
 * Nested variants: with CONFIG_DEBUG_LOCK_ALLOC they forward the subclass
 * or the nest lock's dep_map (type-checked as struct lockdep_map *) to the
 * dedicated _raw_spin_lock_nested()/_raw_spin_lock_nest_lock() calls;
 * without it both collapse to _raw_spin_lock().
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# define raw_spin_lock_nested(lock, subclass) \
        _raw_spin_lock_nested(lock, subclass)

# define raw_spin_lock_nest_lock(lock, nest_lock)                        \
         do {                                                                \
                 typecheck(struct lockdep_map *, &(nest_lock)->dep_map);\
                 _raw_spin_lock_nest_lock(lock, &(nest_lock)->dep_map);        \
         } while (0)
#else
/*
 * Always evaluate the 'subclass' argument to avoid that the compiler
 * warns about set-but-not-used variables when building with
 * CONFIG_DEBUG_LOCK_ALLOC=n and with W=1.
 */
# define raw_spin_lock_nested(lock, subclass)                \
        _raw_spin_lock(((void)(subclass), (lock)))
# define raw_spin_lock_nest_lock(lock, nest_lock)        _raw_spin_lock(lock)
#endif

/*
 * raw_spin_lock_irqsave() and its nested variant.  In every configuration
 * @flags is type-checked to be an unsigned long.
 *
 * With CONFIG_SMP or CONFIG_DEBUG_SPINLOCK, _raw_spin_lock_irqsave() is
 * used as a function returning the saved flags, which are assigned to
 * @flags; the nested variant passes @subclass on only when
 * CONFIG_DEBUG_LOCK_ALLOC is set.  Otherwise _raw_spin_lock_irqsave() is
 * invoked with @flags as a second argument, and the nested variant is an
 * alias of the plain one.
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)

#define raw_spin_lock_irqsave(lock, flags)                        \
        do {                                                \
                typecheck(unsigned long, flags);        \
                flags = _raw_spin_lock_irqsave(lock);        \
        } while (0)

#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave_nested(lock, subclass);        \
        } while (0)
#else
#define raw_spin_lock_irqsave_nested(lock, flags, subclass)                \
        do {                                                                \
                typecheck(unsigned long, flags);                        \
                flags = _raw_spin_lock_irqsave(lock);                        \
        } while (0)
#endif

#else

#define raw_spin_lock_irqsave(lock, flags)                \
        do {                                                \
                typecheck(unsigned long, flags);        \
                _raw_spin_lock_irqsave(lock, flags);        \
        } while (0)

#define raw_spin_lock_irqsave_nested(lock, flags, subclass)        \
        raw_spin_lock_irqsave(lock, flags)

#endif

/* One-to-one mappings of the raw_spin_*() API onto the _raw_spin_*() layer. */
#define raw_spin_lock_irq(lock)                _raw_spin_lock_irq(lock)
#define raw_spin_lock_bh(lock)                _raw_spin_lock_bh(lock)
#define raw_spin_unlock(lock)                _raw_spin_unlock(lock)
#define raw_spin_unlock_irq(lock)        _raw_spin_unlock_irq(lock)

/* 'flags' must be an unsigned long (enforced by typecheck()). */
#define raw_spin_unlock_irqrestore(lock, flags)                \
        do {                                                        \
                typecheck(unsigned long, flags);                \
                _raw_spin_unlock_irqrestore(lock, flags);        \
        } while (0)
#define raw_spin_unlock_bh(lock)        _raw_spin_unlock_bh(lock)

/* Conditional acquire: _raw_spin_trylock_bh() wrapped in __cond_lock(). */
#define raw_spin_trylock_bh(lock) \
        __cond_lock(lock, _raw_spin_trylock_bh(lock))

/*
 * Disable local interrupts, then try to take the lock. Evaluates to 1 on
 * success (interrupts stay disabled); on failure interrupts are enabled
 * again and the expression evaluates to 0.
 */
#define raw_spin_trylock_irq(lock) \
({ \
        local_irq_disable(); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_enable(); 0;  }); \
})

/*
 * Save the interrupt state into 'flags' via local_irq_save(), then try to
 * take the lock. Evaluates to 1 on success; on failure the saved state is
 * restored with local_irq_restore() and the expression evaluates to 0.
 */
#define raw_spin_trylock_irqsave(lock, flags) \
({ \
        local_irq_save(flags); \
        raw_spin_trylock(lock) ? \
        1 : ({ local_irq_restore(flags); 0; }); \
})

#ifndef CONFIG_PREEMPT_RT
/* Include rwlock functions for !RT */
#include <linux/rwlock.h>
#endif

/*
 * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
 */
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
# include <linux/spinlock_api_smp.h>
#else
# include <linux/spinlock_api_up.h>
#endif

/* Non PREEMPT_RT kernel, map to raw spinlocks: */
#ifndef CONFIG_PREEMPT_RT

/*
 * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
 */

/*
 * Return the raw spinlock embedded in @lock. Because the parameter is
 * typed as spinlock_t *, macros that route their argument through this
 * helper get the argument type checked by the compiler.
 */
static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
{
        return &lock->rlock;
}

#ifdef CONFIG_DEBUG_SPINLOCK

/*
 * Debug variant: each expansion site gets its own static lock_class_key,
 * and the stringified lock expression is passed as the name to
 * __raw_spin_lock_init().
 */
# define spin_lock_init(lock)                                        \
do {                                                                \
        static struct lock_class_key __key;                        \
                                                                \
        __raw_spin_lock_init(spinlock_check(lock),                \
                             #lock, &__key, LD_WAIT_CONFIG);        \
} while (0)

#else

/*
 * Non-debug variant: spinlock_check() is called only for its type check,
 * then the lock is overwritten with __SPIN_LOCK_UNLOCKED().
 */
# define spin_lock_init(_lock)                        \
do {                                                \
        spinlock_check(_lock);                        \
        *(_lock) = __SPIN_LOCK_UNLOCKED(_lock);        \
} while (0)

#endif

/* Acquire @lock by calling raw_spin_lock() on its embedded rlock. */
static __always_inline void spin_lock(spinlock_t *lock)
{
        raw_spin_lock(&lock->rlock);
}

/* Acquire @lock by calling raw_spin_lock_bh() on its embedded rlock. */
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
        raw_spin_lock_bh(&lock->rlock);
}

/*
 * Try to acquire @lock; returns the result of raw_spin_trylock() on the
 * embedded rlock.
 */
static __always_inline int spin_trylock(spinlock_t *lock)
{
        return raw_spin_trylock(&lock->rlock);
}

/*
 * Nested-acquire variants. Both route 'lock' through spinlock_check() and
 * then defer to the matching raw_spin_lock_*() macro.
 */
#define spin_lock_nested(lock, subclass)                        \
do {                                                                \
        raw_spin_lock_nested(spinlock_check(lock), subclass);        \
} while (0)

#define spin_lock_nest_lock(lock, nest_lock)                                \
do {                                                                        \
        raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);        \
} while (0)

/* Acquire @lock by calling raw_spin_lock_irq() on its embedded rlock. */
static __always_inline void spin_lock_irq(spinlock_t *lock)
{
        raw_spin_lock_irq(&lock->rlock);
}

/*
 * These stay macros because 'flags' is written by the underlying
 * raw_spin_lock_irqsave*() macros and therefore has to be passed by name.
 */
#define spin_lock_irqsave(lock, flags)                                \
do {                                                                \
        raw_spin_lock_irqsave(spinlock_check(lock), flags);        \
} while (0)

#define spin_lock_irqsave_nested(lock, flags, subclass)                        \
do {                                                                        \
        raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
} while (0)

/* Release @lock by calling raw_spin_unlock() on its embedded rlock. */
static __always_inline void spin_unlock(spinlock_t *lock)
{
        raw_spin_unlock(&lock->rlock);
}

/* Release @lock by calling raw_spin_unlock_bh() on its embedded rlock. */
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
        raw_spin_unlock_bh(&lock->rlock);
}

/* Release @lock by calling raw_spin_unlock_irq() on its embedded rlock. */
static __always_inline void spin_unlock_irq(spinlock_t *lock)
{
        raw_spin_unlock_irq(&lock->rlock);
}

/*
 * Release @lock by calling raw_spin_unlock_irqrestore() on its embedded
 * rlock, passing @flags through unchanged.
 */
static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
{
        raw_spin_unlock_irqrestore(&lock->rlock, flags);
}

/*
 * Try to acquire @lock; returns the result of raw_spin_trylock_bh() on
 * the embedded rlock.
 */
static __always_inline int spin_trylock_bh(spinlock_t *lock)
{
        return raw_spin_trylock_bh(&lock->rlock);
}

/*
 * Try to acquire @lock; returns the result of raw_spin_trylock_irq() on
 * the embedded rlock (1 when taken, 0 otherwise).
 */
static __always_inline int spin_trylock_irq(spinlock_t *lock)
{
        return raw_spin_trylock_irq(&lock->rlock);
}

/*
 * Macro rather than function because 'flags' is written by
 * raw_spin_trylock_irqsave(); evaluates to that macro's result.
 */
#define spin_trylock_irqsave(lock, flags)                        \
({                                                                \
        raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
})

/**
 * spin_is_locked() - Check whether a spinlock is locked.
 * @lock: Pointer to the spinlock.
 *
 * This function is NOT required to provide any memory ordering
 * guarantees; it could be used for debugging purposes or, when
 * additional synchronization is needed, accompanied with other
 * constructs (memory barriers) enforcing the synchronization.
 *
 * Returns: 1 if @lock is locked, 0 otherwise.
 *
 * Note that the function only tells you that the spinlock is
 * seen to be locked, not that it is locked on your CPU.
 *
 * Further, on CONFIG_SMP=n builds with CONFIG_DEBUG_SPINLOCK=n,
 * the return value is always 0 (see include/linux/spinlock_up.h).
 * Therefore you should not rely heavily on the return value.
 */
static __always_inline int spin_is_locked(spinlock_t *lock)
{
        /* Defers entirely to the raw lock embedded in @lock. */
        return raw_spin_is_locked(&lock->rlock);
}

/* Returns the result of raw_spin_is_contended() on the embedded rlock. */
static __always_inline int spin_is_contended(spinlock_t *lock)
{
        return raw_spin_is_contended(&lock->rlock);
}

/* Applies assert_raw_spin_locked() to the rlock embedded in 'lock'. */
#define assert_spin_locked(lock)        assert_raw_spin_locked(&(lock)->rlock)

#else  /* !CONFIG_PREEMPT_RT */
# include <linux/spinlock_rt.h>
#endif /* CONFIG_PREEMPT_RT */

/*
 * Does a critical section need to be broken due to another
 * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
 * but a general need for low latency)
 */
static inline int spin_needbreak(spinlock_t *lock)
{
        /* Contention is only consulted when the preemption model allows it. */
        return preempt_model_preemptible() ? spin_is_contended(lock) : 0;
}

/*
 * Check if a rwlock is contended.
 * Returns non-zero if there is another task waiting on the rwlock.
 * Returns zero if the lock is not contended or the system / underlying
 * rwlock implementation does not support contention detection.
 * Technically does not depend on CONFIG_PREEMPTION, but a general need
 * for low latency.
 */
static inline int rwlock_needbreak(rwlock_t *lock)
{
        /* Contention is only consulted when the preemption model allows it. */
        return preempt_model_preemptible() ? rwlock_is_contended(lock) : 0;
}

/*
 * Pull the atomic_t declaration:
 * (asm-mips/atomic.h needs above definitions)
 */
#include <linux/atomic.h>
/**
 * atomic_dec_and_lock - lock on reaching reference count zero
 * @atomic: the atomic counter
 * @lock: the spinlock in question
 *
 * Decrements @atomic by 1.  If the result is 0, returns true and locks
 * @lock.  Returns false for all other cases.
 */
extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
#define atomic_dec_and_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_lock(atomic, lock))

/*
 * irqsave variant: 'flags' is passed by name to the macro, which hands its
 * address to the helper.
 */
extern int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_lock_irqsave(atomic, lock, &(flags)))

/* Same interface as atomic_dec_and_lock(), but for a raw_spinlock_t. */
extern int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock);
#define atomic_dec_and_raw_lock(atomic, lock) \
                __cond_lock(lock, _atomic_dec_and_raw_lock(atomic, lock))

/* Same interface as atomic_dec_and_lock_irqsave(), but for a raw_spinlock_t. */
extern int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
                                        unsigned long *flags);
#define atomic_dec_and_raw_lock_irqsave(atomic, lock, flags) \
                __cond_lock(lock, _atomic_dec_and_raw_lock_irqsave(atomic, lock, &(flags)))

/*
 * Bucket spinlock allocation. Callers are expected to use the
 * alloc_bucket_spinlocks() macro below, which supplies the name and the
 * lock class key arguments of __alloc_bucket_spinlocks().
 */
int __alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *lock_mask,
                             size_t max_size, unsigned int cpu_mult,
                             gfp_t gfp, const char *name,
                             struct lock_class_key *key);

/*
 * Each expansion site defines its own static lock_class_key and passes the
 * stringified 'locks' expression as the name. Evaluates to the return
 * value of __alloc_bucket_spinlocks().
 */
#define alloc_bucket_spinlocks(locks, lock_mask, max_size, cpu_mult, gfp)    \
        ({                                                                     \
                static struct lock_class_key key;                             \
                int ret;                                                     \
                                                                             \
                ret = __alloc_bucket_spinlocks(locks, lock_mask, max_size,   \
                                               cpu_mult, gfp, #locks, &key); \
                ret;                                                             \
        })

void free_bucket_spinlocks(spinlock_t *locks);

/*
 * Lock guard definitions for raw_spinlock_t. Each DEFINE_LOCK_GUARD_1()
 * names the guard, the lock type, the acquire expression and the release
 * expression; the irqsave variant additionally declares an
 * 'unsigned long flags' member referenced as _T->flags. The _COND entries
 * add a "_try" flavour built on the corresponding trylock expression.
 */
DEFINE_LOCK_GUARD_1(raw_spinlock, raw_spinlock_t,
                    raw_spin_lock(_T->lock),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock, _try, raw_spin_trylock(_T->lock))

/* Nested acquire, always with SINGLE_DEPTH_NESTING as the subclass. */
DEFINE_LOCK_GUARD_1(raw_spinlock_nested, raw_spinlock_t,
                    raw_spin_lock_nested(_T->lock, SINGLE_DEPTH_NESTING),
                    raw_spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irq, raw_spinlock_t,
                    raw_spin_lock_irq(_T->lock),
                    raw_spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irq, _try, raw_spin_trylock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_bh, raw_spinlock_t,
                    raw_spin_lock_bh(_T->lock),
                    raw_spin_unlock_bh(_T->lock))

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_bh, _try, raw_spin_trylock_bh(_T->lock))

DEFINE_LOCK_GUARD_1(raw_spinlock_irqsave, raw_spinlock_t,
                    raw_spin_lock_irqsave(_T->lock, _T->flags),
                    raw_spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1_COND(raw_spinlock_irqsave, _try,
                         raw_spin_trylock_irqsave(_T->lock, _T->flags))

/*
 * Lock guard definitions for spinlock_t: plain, _irq, _bh and _irqsave
 * flavours, each paired with a "_try" conditional variant built on the
 * matching spin_trylock*() call. The irqsave guard carries an
 * 'unsigned long flags' member referenced as _T->flags.
 */
DEFINE_LOCK_GUARD_1(spinlock, spinlock_t,
                    spin_lock(_T->lock),
                    spin_unlock(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock, _try, spin_trylock(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irq, spinlock_t,
                    spin_lock_irq(_T->lock),
                    spin_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock_irq, _try,
                         spin_trylock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_bh, spinlock_t,
                    spin_lock_bh(_T->lock),
                    spin_unlock_bh(_T->lock))

DEFINE_LOCK_GUARD_1_COND(spinlock_bh, _try,
                         spin_trylock_bh(_T->lock))

DEFINE_LOCK_GUARD_1(spinlock_irqsave, spinlock_t,
                    spin_lock_irqsave(_T->lock, _T->flags),
                    spin_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1_COND(spinlock_irqsave, _try,
                         spin_trylock_irqsave(_T->lock, _T->flags))

/*
 * Lock guard definitions for rwlock_t, reader side first and writer side
 * second: plain, _irq and _irqsave flavours. No conditional ("_try")
 * variants are defined for rwlocks here.
 */
DEFINE_LOCK_GUARD_1(read_lock, rwlock_t,
                    read_lock(_T->lock),
                    read_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(read_lock_irq, rwlock_t,
                    read_lock_irq(_T->lock),
                    read_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(read_lock_irqsave, rwlock_t,
                    read_lock_irqsave(_T->lock, _T->flags),
                    read_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

DEFINE_LOCK_GUARD_1(write_lock, rwlock_t,
                    write_lock(_T->lock),
                    write_unlock(_T->lock))

DEFINE_LOCK_GUARD_1(write_lock_irq, rwlock_t,
                    write_lock_irq(_T->lock),
                    write_unlock_irq(_T->lock))

DEFINE_LOCK_GUARD_1(write_lock_irqsave, rwlock_t,
                    write_lock_irqsave(_T->lock, _T->flags),
                    write_unlock_irqrestore(_T->lock, _T->flags),
                    unsigned long flags)

#undef __LINUX_INSIDE_SPINLOCK_H
#endif /* __LINUX_SPINLOCK_H */
































    1 

    1 

    4 









    1 
















    1 









    1 








    7 


    7 










  246 






  246 













   24 

   24 




    4 



    1 

















    3 













   11 



    1 




   10 
    5 
    6 







    5 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
// SPDX-License-Identifier: GPL-2.0
/*
 * KVM coalesced MMIO
 *
 * Copyright (c) 2008 Bull S.A.S.
 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
 *
 *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
 *
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/kvm.h>

#include "coalesced_mmio.h"

/*
 * Map a generic kvm_io_device back to the kvm_coalesced_mmio_dev that
 * embeds it as its 'dev' member.
 */
static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
{
        return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
}

/*
 * Report whether the access (addr, len) lies entirely inside the zone
 * (zone.addr, zone.size) registered for @dev, i.e. whether it is in a
 * batchable area. Returns 1 if so, 0 otherwise.
 */
static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
                                   gpa_t addr, int len)
{
        /* Reject negative lengths and ranges whose end wraps around. */
        if (len < 0 || addr + len < addr)
                return 0;

        /* Both the start and the end must fall within the zone. */
        return addr >= dev->zone.addr &&
               addr + len <= dev->zone.addr + dev->zone.size;
}

/*
 * kvm_io_device write callback: append the write (addr, len, val) to the
 * VM's coalesced ring.
 *
 * Returns 0 once the entry has been queued, or -EOPNOTSUPP when the access
 * is outside this device's zone, when ring->last is out of range, or when
 * the ring has no free entry.
 */
static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
                                struct kvm_io_device *this, gpa_t addr,
                                int len, const void *val)
{
        struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
        struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
        int ret = -EOPNOTSUPP;
        __u32 slot;

        if (!coalesced_mmio_in_range(dev, addr, len))
                return ret;

        spin_lock(&dev->kvm->ring_lock);

        /*
         * ring->last names the entry to fill. Userspace may have set it to
         * an out-of-range value, so validate it before using it as an index.
         */
        slot = READ_ONCE(ring->last);
        if (slot >= KVM_COALESCED_MMIO_MAX)
                goto out_unlock;

        /*
         * One entry is always left unused so that userspace can tell an
         * empty ring from a full one; bail out if this write would fill it.
         */
        if ((slot + 1) % KVM_COALESCED_MMIO_MAX == READ_ONCE(ring->first))
                goto out_unlock;

        /* Populate the first free entry of the ring. */
        ring->coalesced_mmio[slot].phys_addr = addr;
        ring->coalesced_mmio[slot].len = len;
        memcpy(ring->coalesced_mmio[slot].data, val, len);
        ring->coalesced_mmio[slot].pio = dev->zone.pio;

        /* Write barrier between filling the entry and advancing 'last'. */
        smp_wmb();
        ring->last = (slot + 1) % KVM_COALESCED_MMIO_MAX;
        ret = 0;

out_unlock:
        spin_unlock(&dev->kvm->ring_lock);
        return ret;
}

/*
 * kvm_io_device destructor callback: take the zone device off the list it
 * is linked on and free it.
 */
static void coalesced_mmio_destructor(struct kvm_io_device *this)
{
        struct kvm_coalesced_mmio_dev *zone_dev = to_mmio(this);

        /* Unlink first; the list node lives inside the object being freed. */
        list_del(&zone_dev->list);
        kfree(zone_dev);
}

/*
 * I/O device callbacks for a coalesced zone. Only writes and destruction
 * are handled; no read callback is installed.
 */
static const struct kvm_io_device_ops coalesced_mmio_ops = {
        .write      = coalesced_mmio_write,
        .destructor = coalesced_mmio_destructor,
};

/*
 * Set up the per-VM coalesced MMIO state: a zeroed page used as the ring,
 * the ring lock, and the (initially empty) list of coalesced zones.
 *
 * Returns 0 on success or -ENOMEM if the ring page cannot be allocated.
 */
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
        struct page *ring_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);

        if (!ring_page)
                return -ENOMEM;

        kvm->coalesced_mmio_ring = page_address(ring_page);

        /*
         * ring_lock serialises access to the coalesced ring. The zone list
         * needs no lock of its own: devices are only registered and
         * unregistered with kvm->slots_lock held.
         */
        spin_lock_init(&kvm->ring_lock);
        INIT_LIST_HEAD(&kvm->coalesced_zones);

        return 0;
}

/* Free the ring page of @kvm, if one was ever set up. */
void kvm_coalesced_mmio_free(struct kvm *kvm)
{
        if (!kvm->coalesced_mmio_ring)
                return;

        free_page((unsigned long)kvm->coalesced_mmio_ring);
}

/*
 * Register @zone as a coalesced region on the PIO bus (zone->pio == 1) or
 * the MMIO bus (zone->pio == 0).
 *
 * Returns 0 on success, -EINVAL for any other zone->pio value, -ENOMEM if
 * the device cannot be allocated, or the negative error reported by
 * kvm_io_bus_register_dev().
 */
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
                                         struct kvm_coalesced_mmio_zone *zone)
{
        struct kvm_coalesced_mmio_dev *dev;
        int ret;

        if (!(zone->pio == 0 || zone->pio == 1))
                return -EINVAL;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
        if (!dev)
                return -ENOMEM;

        kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
        dev->kvm = kvm;
        dev->zone = *zone;

        /* Bus registration and the zone list are both done under slots_lock. */
        mutex_lock(&kvm->slots_lock);
        ret = kvm_io_bus_register_dev(kvm,
                                zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS,
                                zone->addr, zone->size, &dev->dev);
        if (ret >= 0)
                list_add_tail(&dev->list, &kvm->coalesced_zones);
        mutex_unlock(&kvm->slots_lock);

        if (ret < 0) {
                /* The device never made it onto the bus or the list. */
                kfree(dev);
                return ret;
        }

        return 0;
}

/*
 * Unregister every coalesced zone of the same kind (PIO vs. MMIO) as @zone
 * for which coalesced_mmio_in_range() accepts (@zone->addr, @zone->size).
 *
 * Returns -EINVAL if zone->pio is neither 0 nor 1, and 0 otherwise.
 */
int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
                                           struct kvm_coalesced_mmio_zone *zone)
{
        struct kvm_coalesced_mmio_dev *dev, *tmp;

        if (!(zone->pio == 0 || zone->pio == 1))
                return -EINVAL;

        mutex_lock(&kvm->slots_lock);

        list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
                if (zone->pio != dev->zone.pio)
                        continue;
                if (!coalesced_mmio_in_range(dev, zone->addr, zone->size))
                        continue;

                /*
                 * When unregistering fails, every device on the bus has
                 * been destroyed, the target included. No zones remain,
                 * so the walk simply stops instead of restarting.
                 */
                if (kvm_io_bus_unregister_dev(kvm,
                                zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev))
                        break;
        }

        mutex_unlock(&kvm->slots_lock);

        /*
         * The result of kvm_io_bus_unregister_dev() is deliberately not
         * reported: as far as userspace is concerned the coalesced MMIO
         * is most definitely unregistered.
         */
        return 0;
}





































  248 
















   26 







































































































































































































  247 






























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_MM_H
#define _LINUX_SCHED_MM_H

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/mm_types.h>
#include <linux/gfp.h>
#include <linux/sync_core.h>
#include <linux/sched/coredump.h>

/*
 * Routines for handling mm_structs
 */
extern struct mm_struct *mm_alloc(void);

/**
 * mmgrab() - Pin a &struct mm_struct.
 * @mm: The &struct mm_struct to pin.
 *
 * Make sure that @mm will not get freed even after the owning task
 * exits. This doesn't guarantee that the associated address space
 * will still exist later on and mmget_not_zero() has to be used before
 * accessing it.
 *
 * This is a preferred way to pin @mm for a longer/unbounded amount
 * of time.
 *
 * Use mmdrop() to release the reference acquired by mmgrab().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmgrab(struct mm_struct *mm)
{
        /* The pin is a reference on mm_count (not mm_users). */
        atomic_inc(&mm->mm_count);
}

/*
 * Barrier to issue after mmgrab(); implemented as smp_mb__after_atomic()
 * because mmgrab() is an atomic_inc().
 */
static inline void smp_mb__after_mmgrab(void)
{
        smp_mb__after_atomic();
}

extern void __mmdrop(struct mm_struct *mm);

/*
 * Drop one reference on mm->mm_count and hand @mm to __mmdrop() when that
 * was the last one.
 */
static inline void mmdrop(struct mm_struct *mm)
{
        /*
         * atomic_dec_and_test() implies a full barrier. The membarrier
         * system call requires that barrier before returning to
         * user-space, after storing to rq->curr.
         */
        bool last_ref = atomic_dec_and_test(&mm->mm_count);

        if (unlikely(last_ref))
                __mmdrop(mm);
}

#ifdef CONFIG_PREEMPT_RT
/*
 * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is
 * by far the least expensive way to do that.
 */
static inline void __mmdrop_delayed(struct rcu_head *rhp)
{
        /* @rhp is the delayed_drop member embedded in the mm to release. */
        __mmdrop(container_of(rhp, struct mm_struct, delayed_drop));
}

/*
 * Invoked from finish_task_switch(). Delegates the heavy lifting on RT
 * kernels via RCU.
 */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        /* The decrement provides a full memory barrier. See mmdrop() */
        if (!atomic_dec_and_test(&mm->mm_count))
                return;

        /* Last reference: defer the actual __mmdrop() to an RCU callback. */
        call_rcu(&mm->delayed_drop, __mmdrop_delayed);
}
#else
/* Without CONFIG_PREEMPT_RT this is plain mmdrop(). */
static inline void mmdrop_sched(struct mm_struct *mm)
{
        mmdrop(mm);
}
#endif

/* Helpers for lazy TLB mm refcounting */
/* Take an mm_count reference only if lazy TLB refcounting is configured. */
static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT))
                return;

        mmgrab(mm);
}

/*
 * Counterpart of mmgrab_lazy_tlb(): drop the reference when lazy TLB
 * refcounting is configured, otherwise only issue a memory barrier.
 */
static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                /*
                 * mmdrop_lazy_tlb() must provide a full memory barrier;
                 * see the membarrier comment in finish_task_switch(),
                 * which relies on this.
                 */
                smp_mb();
                return;
        }

        mmdrop(mm);
}

/* Like mmdrop_lazy_tlb(), but drops the reference through mmdrop_sched(). */
static inline void mmdrop_lazy_tlb_sched(struct mm_struct *mm)
{
        if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_REFCOUNT)) {
                smp_mb(); /* see mmdrop_lazy_tlb() above */
                return;
        }

        mmdrop_sched(mm);
}

/**
 * mmget() - Pin the address space associated with a &struct mm_struct.
 * @mm: The address space to pin.
 *
 * Make sure that the address space of the given &struct mm_struct doesn't
 * go away. This does not protect against parts of the address space being
 * modified or freed, however.
 *
 * Never use this function to pin this address space for an
 * unbounded/indefinite amount of time.
 *
 * Use mmput() to release the reference acquired by mmget().
 *
 * See also <Documentation/mm/active_mm.rst> for an in-depth explanation
 * of &mm_struct.mm_count vs &mm_struct.mm_users.
 */
static inline void mmget(struct mm_struct *mm)
{
        /* The pin is a reference on mm_users (not mm_count). */
        atomic_inc(&mm->mm_users);
}

/*
 * Take a reference on mm->mm_users unless it is already zero. Returns the
 * result of atomic_inc_not_zero(): true if the reference was taken.
 */
static inline bool mmget_not_zero(struct mm_struct *mm)
{
        return atomic_inc_not_zero(&mm->mm_users);
}

/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
#ifdef CONFIG_MMU
/* same as above but performs the slow path from the async context. Can
 * be called from the atomic context as well
 */
void mmput_async(struct mm_struct *);
#endif

/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
 * Grab a reference to a task's mm, if it is not already going away
 * and ptrace_may_access with the mode parameter passed to it
 * succeeds.
 */
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current task's stale references to the old mm_struct on exit() */
extern void exit_mm_release(struct task_struct *, struct mm_struct *);
/* Remove the current task's stale references to the old mm_struct on exec() */
extern void exec_mm_release(struct task_struct *, struct mm_struct *);

#ifdef CONFIG_MEMCG
extern void mm_update_next_owner(struct mm_struct *mm);
#else
/* No-op stub used when CONFIG_MEMCG is not set. */
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MMU
/*
 * Defaults used when the architecture does not define its own versions:
 * the mmap end is TASK_SIZE regardless of the arguments, and the mmap base
 * is returned unchanged.
 */
#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr, len, flags)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

extern void arch_pick_mmap_layout(struct mm_struct *mm,
                                  struct rlimit *rlim_stack);

unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
                       unsigned long len, unsigned long pgoff,
                       unsigned long flags, vm_flags_t vm_flags);
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                               unsigned long len, unsigned long pgoff,
                               unsigned long flags, vm_flags_t);

unsigned long mm_get_unmapped_area(struct mm_struct *mm, struct file *filp,
                                   unsigned long addr, unsigned long len,
                                   unsigned long pgoff, unsigned long flags);

unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm,
                                           struct file *filp,
                                           unsigned long addr,
                                           unsigned long len,
                                           unsigned long pgoff,
                                           unsigned long flags,
                                           vm_flags_t vm_flags);

unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
                          unsigned long len, unsigned long pgoff,
                          unsigned long flags, vm_flags_t vm_flags);
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
                                  unsigned long len, unsigned long pgoff,
                                  unsigned long flags, vm_flags_t vm_flags);
#else
/* No-op stub used when CONFIG_MMU is not set. */
static inline void arch_pick_mmap_layout(struct mm_struct *mm,
                                         struct rlimit *rlim_stack) {}
#endif

/*
 * Report whether @tsk has a vfork completion pending (tsk->vfork_done is
 * set) and still shares its mm with its real parent.
 */
static inline bool in_vfork(struct task_struct *tsk)
{
        bool ret = false;

        /*
         * RCU is needed to access ->real_parent if CLONE_VM was used along
         * with CLONE_PARENT.
         *
         * real_parent->mm is compared with tsk->mm because CLONE_VFORK
         * does not imply CLONE_VM.
         *
         * CLONE_VFORK can be combined with CLONE_PARENT/CLONE_THREAD, so
         * ->real_parent is not necessarily the task doing vfork(), and in
         * theory task_lock() cannot be relied on to dereference it.
         *
         * In that case the real_parent->mm == tsk->mm check cannot be
         * trusted either and may yield a false negative. We do not care:
         * if init or another oom-unkillable task does this, it should
         * blame itself.
         */
        rcu_read_lock();
        if (tsk->vfork_done)
                ret = rcu_dereference(tsk->real_parent)->mm == tsk->mm;
        rcu_read_unlock();

        return ret;
}

/*
 * Applies per-task gfp context to the given allocation flags.
 * PF_MEMALLOC_NOIO implies GFP_NOIO
 * PF_MEMALLOC_NOFS implies GFP_NOFS
 * PF_MEMALLOC_PIN  implies !GFP_MOVABLE
 */
static inline gfp_t current_gfp_context(gfp_t flags)
{
        const unsigned int memalloc_mask =
                PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN;
        unsigned int pflags = READ_ONCE(current->flags);

        /* Common case: no per-task restriction, flags pass through as-is. */
        if (!unlikely(pflags & memalloc_mask))
                return flags;

        /*
         * NOIO implies both NOIO and NOFS and is the weaker context, so it
         * always takes precedence over NOFS.
         */
        if (pflags & PF_MEMALLOC_NOIO)
                flags &= ~(__GFP_IO | __GFP_FS);
        else if (pflags & PF_MEMALLOC_NOFS)
                flags &= ~__GFP_FS;

        /* Applied independently of the NOIO/NOFS handling above. */
        if (pflags & PF_MEMALLOC_PIN)
                flags &= ~__GFP_MOVABLE;

        return flags;
}

/*
 * fs_reclaim annotations: real functions with CONFIG_LOCKDEP, empty inline
 * stubs without it.
 */
#ifdef CONFIG_LOCKDEP
extern void __fs_reclaim_acquire(unsigned long ip);
extern void __fs_reclaim_release(unsigned long ip);
extern void fs_reclaim_acquire(gfp_t gfp_mask);
extern void fs_reclaim_release(gfp_t gfp_mask);
#else
static inline void __fs_reclaim_acquire(unsigned long ip) { }
static inline void __fs_reclaim_release(unsigned long ip) { }
static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
#endif

/*
 * Every memory-allocation retry loop should call memalloc_retry_wait() and
 * pass the flags of the most constrained allocation attempt that might have
 * failed. This documents where such loops are and gives one central place
 * to tune the waiting as the MM implementation changes.
 */
static inline void memalloc_retry_wait(gfp_t gfp_flags)
{
        long timeout;

        /*
         * io_schedule_timeout() is used because waiting for memory typically
         * includes waiting for dirty pages to be written out, which
         * requires IO.
         */
        __set_current_state(TASK_UNINTERRUPTIBLE);
        gfp_flags = current_gfp_context(gfp_flags);

        if (gfpflags_allow_blocking(gfp_flags) &&
            !(gfp_flags & __GFP_NORETRY)) {
                /* Probably waited already, no need for much more */
                timeout = 1;
        } else {
                /*
                 * Probably didn't wait, and has now released a lock,
                 * so now is a good time to wait
                 */
                timeout = HZ/50;
        }

        io_schedule_timeout(timeout);
}

/**
 * might_alloc - Mark possible allocation sites
 * @gfp_mask: gfp_t flags that would be used to allocate
 *
 * Similar to might_sleep() and other annotations, this can be used in functions
 * that might allocate, but often don't. Compiles to nothing without
 * CONFIG_LOCKDEP. Includes a conditional might_sleep() if @gfp_mask allows
 * blocking.
 */
static inline void might_alloc(gfp_t gfp_mask)
{
        /* Acquire and immediately release the fs_reclaim annotation. */
        fs_reclaim_acquire(gfp_mask);
        fs_reclaim_release(gfp_mask);

        might_sleep_if(gfpflags_allow_blocking(gfp_mask));
}

/**
 * memalloc_flags_save - Add a PF_* flag to current->flags, save old value
 * @flags: PF_* bits to set in current->flags
 *
 * This allows PF_* flags to be conveniently added, irrespective of current
 * value, and then the old version restored with memalloc_flags_restore().
 *
 * Return: the subset of @flags that was not already set, i.e. exactly the
 * bits memalloc_flags_restore() has to clear again.
 */
static inline unsigned memalloc_flags_save(unsigned flags)
{
        /* Bits requested by the caller that the task does not carry yet. */
        unsigned newly_set = flags & ~current->flags;

        current->flags |= flags;

        return newly_set;
}

/**
 * memalloc_flags_restore - Clear PF_* flags from current->flags
 * @flags: bits to clear, normally the value returned by memalloc_flags_save()
 */
static inline void memalloc_flags_restore(unsigned flags)
{
        current->flags &= ~flags;
}

/**
 * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope.
 *
 * This function marks the beginning of the GFP_NOIO allocation scope.
 * All further allocations will implicitly drop __GFP_IO flag and so
 * they are safe for the IO critical section from the allocation recursion
 * point of view. Use memalloc_noio_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_noio_restore.
 */
static inline unsigned int memalloc_noio_save(void)
{
        /* memalloc_flags_save() reports only the bits it newly set. */
        return memalloc_flags_save(PF_MEMALLOC_NOIO);
}

/**
 * memalloc_noio_restore - Ends the implicit GFP_NOIO scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function.
 * Always make sure that the given flags is the return value from the
 * pairing memalloc_noio_save call.
 */
static inline void memalloc_noio_restore(unsigned int flags)
{
        /* Clears exactly the bits in @flags from current->flags. */
        memalloc_flags_restore(flags);
}

/**
 * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope.
 *
 * This function marks the beginning of the GFP_NOFS allocation scope.
 * All further allocations will implicitly drop __GFP_FS flag and so
 * they are safe for the FS critical section from the allocation recursion
 * point of view. Use memalloc_nofs_restore to end the scope with flags
 * returned by this function.
 *
 * Context: This function is safe to be used from any context.
 * Return: The saved flags to be passed to memalloc_nofs_restore.
 */
static inline unsigned int memalloc_nofs_save(void)
{
        /* memalloc_flags_save() reports only the bits it newly set. */
        return memalloc_flags_save(PF_MEMALLOC_NOFS);
}

/**
 * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function.
 * Always make sure that the given flags is the return value from the
 * pairing memalloc_nofs_save call.
 */
static inline void memalloc_nofs_restore(unsigned int flags)
{
        /* Clears exactly the bits in @flags from current->flags. */
        memalloc_flags_restore(flags);
}

/**
 * memalloc_noreclaim_save - Marks implicit __GFP_MEMALLOC scope.
 *
 * This function marks the beginning of the __GFP_MEMALLOC allocation scope.
 * All further allocations will implicitly add the __GFP_MEMALLOC flag, which
 * prevents entering reclaim and allows access to all memory reserves. This
 * should only be used when the caller guarantees the allocation will allow more
 * memory to be freed very shortly, i.e. it needs to allocate some memory in
 * the process of freeing memory, and cannot reclaim due to potential recursion.
 *
 * Users of this scope have to be extremely careful to not deplete the reserves
 * completely and implement a throttling mechanism which controls the
 * consumption of the reserve based on the amount of freed memory. Usage of a
 * pre-allocated pool (e.g. mempool) should be always considered before using
 * this scope.
 *
 * Individual allocations under the scope can opt out using __GFP_NOMEMALLOC
 *
 * Context: This function should not be used in an interrupt context as that one
 *          does not give PF_MEMALLOC access to reserves.
 *          See __gfp_pfmemalloc_flags().
 * Return: The saved flags to be passed to memalloc_noreclaim_restore.
 */
static inline unsigned int memalloc_noreclaim_save(void)
{
        /* memalloc_flags_save() reports only the bits it newly set. */
        return memalloc_flags_save(PF_MEMALLOC);
}

/**
 * memalloc_noreclaim_restore - Ends the implicit __GFP_MEMALLOC scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit __GFP_MEMALLOC scope started by memalloc_noreclaim_save
 * function. Always make sure that the given flags is the return value from the
 * pairing memalloc_noreclaim_save call.
 */
static inline void memalloc_noreclaim_restore(unsigned int flags)
{
        /* Clears exactly the bits in @flags from current->flags. */
        memalloc_flags_restore(flags);
}

/**
 * memalloc_pin_save - Marks implicit ~__GFP_MOVABLE scope.
 *
 * This function marks the beginning of the ~__GFP_MOVABLE allocation scope.
 * All further allocations will implicitly remove the __GFP_MOVABLE flag, which
 * will constrain the allocations to zones that allow long term pinning, i.e.
 * not ZONE_MOVABLE zones.
 *
 * Return: The saved flags to be passed to memalloc_pin_restore.
 */
static inline unsigned int memalloc_pin_save(void)
{
        /* memalloc_flags_save() reports only the bits it newly set. */
        return memalloc_flags_save(PF_MEMALLOC_PIN);
}

/**
 * memalloc_pin_restore - Ends the implicit ~__GFP_MOVABLE scope.
 * @flags: Flags to restore.
 *
 * Ends the implicit ~__GFP_MOVABLE scope started by memalloc_pin_save function.
 * Always make sure that the given flags is the return value from the pairing
 * memalloc_pin_save call.
 */
static inline void memalloc_pin_restore(unsigned int flags)
{
        /* Clears exactly the bits in @flags from current->flags. */
        memalloc_flags_restore(flags);
}

#ifdef CONFIG_MEMCG
DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg);
/**
 * set_active_memcg - Starts the remote memcg charging scope.
 * @memcg: memcg to charge.
 *
 * This function marks the beginning of the remote memcg charging scope. All the
 * __GFP_ACCOUNT allocations till the end of the scope will be charged to the
 * given memcg.
 *
 * Please, make sure that caller has a reference to the passed memcg structure,
 * so its lifetime is guaranteed to exceed the scope between two
 * set_active_memcg() calls.
 *
 * NOTE: This function can nest. Users must save the return value and
 * reset the previous value after their own charging scope is over.
 */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        struct mem_cgroup *prev;

        /* Task context: the active memcg lives in the task itself. */
        if (in_task()) {
                prev = current->active_memcg;
                current->active_memcg = memcg;
                return prev;
        }

        /* Otherwise the per-CPU int_active_memcg slot is used. */
        prev = this_cpu_read(int_active_memcg);
        this_cpu_write(int_active_memcg, memcg);
        return prev;
}
#else
/* !CONFIG_MEMCG stub: @memcg is ignored and NULL is always returned. */
static inline struct mem_cgroup *
set_active_memcg(struct mem_cgroup *memcg)
{
        return NULL;
}
#endif

#ifdef CONFIG_MEMBARRIER
/*
 * Single-bit masks. MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE is tested
 * against mm->membarrier_state in membarrier_mm_sync_core_before_usermode();
 * NOTE(review): the other bits presumably live in the same field - confirm.
 */
enum {
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY                = (1U << 0),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED                        = (1U << 1),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY                        = (1U << 2),
        MEMBARRIER_STATE_GLOBAL_EXPEDITED                        = (1U << 3),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY        = (1U << 4),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE                = (1U << 5),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY                = (1U << 6),
        MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ                        = (1U << 7),
};

/*
 * Single-bit flag masks.
 * NOTE(review): not referenced in this header; presumably passed to the
 * membarrier implementation to select SYNC_CORE / RSEQ behaviour - confirm.
 */
enum {
        MEMBARRIER_FLAG_SYNC_CORE        = (1U << 0),
        MEMBARRIER_FLAG_RSEQ                = (1U << 1),
};

#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
#include <asm/membarrier.h>
#endif

/*
 * Call sync_core_before_usermode() when @mm is the current task's mm and has
 * MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE set in its membarrier_state.
 */
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
        /*
         * The atomic_read() below prevents CSE. Bailing out here first
         * should help the compiler generate more efficient code on
         * architectures where sync_core_before_usermode() is a no-op.
         */
        if (!IS_ENABLED(CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE))
                return;

        if (current->mm == mm &&
            unlikely(atomic_read(&mm->membarrier_state) &
                     MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))
                sync_core_before_usermode();
}

extern void membarrier_exec_mmap(struct mm_struct *mm);

extern void membarrier_update_current_mm(struct mm_struct *next_mm);

#else
/* !CONFIG_MEMBARRIER: every membarrier hook is an empty inline stub. */
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
                                             struct mm_struct *next,
                                             struct task_struct *tsk)
{
}
#endif
static inline void membarrier_exec_mmap(struct mm_struct *mm)
{
}
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
{
}
#endif

#endif /* _LINUX_SCHED_MM_H */



















































































































































  165 





  165 


















































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * IRQ subsystem internal functions and variables:
 *
 * Do not ever include this file from anything else than
 * kernel/irq/. Do not even think about using any information outside
 * of this file for your non core code.
 */
#include <linux/irqdesc.h>
#include <linux/kernel_stat.h>
#include <linux/pm_runtime.h>
#include <linux/sched/clock.h>

#ifdef CONFIG_SPARSE_IRQ
# define MAX_SPARSE_IRQS        INT_MAX
#else
# define MAX_SPARSE_IRQS        NR_IRQS
#endif

#define istate core_internal_state__do_not_mess_with_it

extern bool noirqdebug;

extern struct irqaction chained_action;

/*
 * Bits used by threaded handlers:
 * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
 * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
 * IRQTF_AFFINITY  - irq thread is requested to adjust affinity
 * IRQTF_FORCED_THREAD  - irq action is force threaded
 * IRQTF_READY     - signals that irq thread is ready
 *
 * The enumerators carry no explicit values, so they are the consecutive
 * integers 0..4 (bit numbers rather than masks).
 */
enum {
        IRQTF_RUNTHREAD,
        IRQTF_WARNED,
        IRQTF_AFFINITY,
        IRQTF_FORCED_THREAD,
        IRQTF_READY,
};

/*
 * Bit masks for desc->core_internal_state__do_not_mess_with_it
 *
 * IRQS_AUTODETECT                - autodetection in progress
 * IRQS_SPURIOUS_DISABLED        - was disabled due to spurious interrupt
 *                                  detection
 * IRQS_POLL_INPROGRESS                - polling in progress
 * IRQS_ONESHOT                        - irq is not unmasked in primary handler
 * IRQS_REPLAY                        - irq has been resent and will not be resent
 *                                   again until the handler has run and cleared
 *                                   this flag.
 * IRQS_WAITING                        - irq is waiting
 * IRQS_PENDING                        - irq needs to be resent and should be resent
 *                                   at the next available opportunity.
 * IRQS_SUSPENDED                - irq is suspended
 * IRQS_TIMINGS                        - timing statistics were allocated for the irq;
 *                                   record_irq_time() only stores a timestamp
 *                                   when this is set
 * IRQS_NMI                        - irq line is used to deliver NMIs
 * IRQS_SYSFS                        - descriptor has been added to sysfs
 */
enum {
        IRQS_AUTODETECT                = 0x00000001,
        IRQS_SPURIOUS_DISABLED        = 0x00000002,
        IRQS_POLL_INPROGRESS        = 0x00000008,
        IRQS_ONESHOT                = 0x00000020,
        IRQS_REPLAY                = 0x00000040,
        IRQS_WAITING                = 0x00000080,
        IRQS_PENDING                = 0x00000200,
        IRQS_SUSPENDED                = 0x00000800,
        IRQS_TIMINGS                = 0x00001000,
        IRQS_NMI                = 0x00002000,
        IRQS_SYSFS                = 0x00004000,
};

#include "debug.h"
#include "settings.h"

extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
extern void __disable_irq(struct irq_desc *desc);
extern void __enable_irq(struct irq_desc *desc);

#define IRQ_RESEND        true
#define IRQ_NORESEND        false

#define IRQ_START_FORCE        true
#define IRQ_START_COND        false

extern int irq_activate(struct irq_desc *desc);
extern int irq_activate_and_startup(struct irq_desc *desc, bool resend);
extern int irq_startup(struct irq_desc *desc, bool resend, bool force);

extern void irq_shutdown(struct irq_desc *desc);
extern void irq_shutdown_and_deactivate(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
extern void unmask_threaded_irq(struct irq_desc *desc);

/* irq_mark_irq() is an empty stub with CONFIG_SPARSE_IRQ, out of line otherwise. */
#ifdef CONFIG_SPARSE_IRQ
static inline void irq_mark_irq(unsigned int irq) { }
#else
extern void irq_mark_irq(unsigned int irq);
#endif

irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);

/* Resending of interrupts :*/
int check_irq_resend(struct irq_desc *desc, bool inject);
void clear_irq_resend(struct irq_desc *desc);
void irq_resend_init(struct irq_desc *desc);
bool irq_wait_for_poll(struct irq_desc *desc);
void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);

void wake_threads_waitq(struct irq_desc *desc);

/*
 * procfs registration hooks for irqs and their handlers. Without
 * CONFIG_PROC_FS they collapse to empty inline stubs.
 */
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
extern void register_handler_proc(unsigned int irq, struct irqaction *action);
extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
#else
static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
static inline void register_handler_proc(unsigned int irq,
                                         struct irqaction *action) { }
static inline void unregister_handler_proc(unsigned int irq,
                                           struct irqaction *action) { }
#endif

extern bool irq_can_set_affinity_usr(unsigned int irq);

extern int irq_do_set_affinity(struct irq_data *data,
                               const struct cpumask *dest, bool force);

/* Without CONFIG_SMP irq_setup_affinity() does nothing and returns 0. */
#ifdef CONFIG_SMP
extern int irq_setup_affinity(struct irq_desc *desc);
#else
static inline int irq_setup_affinity(struct irq_desc *desc) { return 0; }
#endif

/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
        if (unlikely(desc->irq_data.chip->irq_bus_lock))
                desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
}

static inline void chip_bus_sync_unlock(struct irq_desc *desc)
{
        if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
                desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
}

#define _IRQ_DESC_CHECK                (1 << 0)
#define _IRQ_DESC_PERCPU        (1 << 1)

#define IRQ_GET_DESC_CHECK_GLOBAL        (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU        (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)

/*
 * Iterate @act over the action list of @desc: start at (desc)->action and
 * follow ->next until it is NULL. Both arguments are parenthesized so that
 * expressions such as &some_desc or array elements expand correctly; @desc
 * is evaluated once, @act on every iteration.
 */
#define for_each_action_of_desc(desc, act)                        \
        for ((act) = (desc)->action; (act); (act) = (act)->next)

struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
                    unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);

/* __irq_get_desc_lock() with bus == true; pair with irq_put_desc_busunlock(). */
static inline struct irq_desc *
irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
{
        return __irq_get_desc_lock(irq, flags, true, check);
}

/* __irq_put_desc_unlock() with bus == true. */
static inline void
irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
{
        __irq_put_desc_unlock(desc, flags, true);
}

/* __irq_get_desc_lock() with bus == false; pair with irq_put_desc_unlock(). */
static inline struct irq_desc *
irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
{
        return __irq_get_desc_lock(irq, flags, false, check);
}

/* __irq_put_desc_unlock() with bus == false. */
static inline void
irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
{
        __irq_put_desc_unlock(desc, flags, false);
}

/*
 * Local shorthand for the private state word in d->common. It is only
 * meant for the accessors below and is #undef'd right after them.
 */
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)

/* Return the whole state word of @d. */
static inline unsigned int irqd_get(struct irq_data *d)
{
        return __irqd_to_state(d);
}

/*
 * Manipulation functions for irq_data.state
 *
 * All of them are plain (non-atomic) read-modify-write operations.
 */
static inline void irqd_set_move_pending(struct irq_data *d)
{
        __irqd_to_state(d) |= IRQD_SETAFFINITY_PENDING;
}

static inline void irqd_clr_move_pending(struct irq_data *d)
{
        __irqd_to_state(d) &= ~IRQD_SETAFFINITY_PENDING;
}

static inline void irqd_set_managed_shutdown(struct irq_data *d)
{
        __irqd_to_state(d) |= IRQD_MANAGED_SHUTDOWN;
}

static inline void irqd_clr_managed_shutdown(struct irq_data *d)
{
        __irqd_to_state(d) &= ~IRQD_MANAGED_SHUTDOWN;
}

/* Clear every bit of @mask in the state word. */
static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
        __irqd_to_state(d) &= ~mask;
}

/* Set every bit of @mask in the state word. */
static inline void irqd_set(struct irq_data *d, unsigned int mask)
{
        __irqd_to_state(d) |= mask;
}

/* True when at least one bit of @mask is set in the state word. */
static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
{
        return __irqd_to_state(d) & mask;
}

/* Set IRQD_IRQ_DISABLED in the irq_data embedded in @desc. */
static inline void irq_state_set_disabled(struct irq_desc *desc)
{
        irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
}

/* Set IRQD_IRQ_MASKED in the irq_data embedded in @desc. */
static inline void irq_state_set_masked(struct irq_desc *desc)
{
        irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
}

#undef __irqd_to_state

/*
 * Bump this CPU's counter for @desc (desc->kstat_irqs->cnt) and this CPU's
 * kstat.irqs_sum, both via __this_cpu_inc().
 */
static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
        __this_cpu_inc(desc->kstat_irqs->cnt);
        __this_cpu_inc(kstat.irqs_sum);
}

/* As __kstat_incr_irqs_this_cpu(), and additionally increments desc->tot_count. */
static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
        __kstat_incr_irqs_this_cpu(desc);
        desc->tot_count++;
}

/* Return the node reported by irq_common_data_get_node() for @desc. */
static inline int irq_desc_get_node(struct irq_desc *desc)
{
        return irq_common_data_get_node(&desc->irq_common_data);
}

/* Non-zero when the action installed on @desc is the shared chained_action. */
static inline int irq_desc_is_chained(struct irq_desc *desc)
{
        if (!desc->action)
                return 0;

        return desc->action == &chained_action;
}

/* True when IRQS_NMI is set in the internal state of @desc. */
static inline bool irq_is_nmi(struct irq_desc *desc)
{
        return (desc->istate & IRQS_NMI) != 0;
}

/*
 * Power-management hooks. Without CONFIG_PM_SLEEP irq_pm_check_wakeup()
 * always reports false and the install/remove hooks do nothing.
 */
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
#else
static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
static inline void
irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
static inline void
irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
#endif

#ifdef CONFIG_IRQ_TIMINGS

#define IRQ_TIMINGS_SHIFT        5
#define IRQ_TIMINGS_SIZE        (1 << IRQ_TIMINGS_SHIFT)
#define IRQ_TIMINGS_MASK        (IRQ_TIMINGS_SIZE - 1)

/**
 * struct irq_timings - irq timings storing structure
 * @values: a circular buffer of u64 encoded <timestamp,irq> values
 * @count: running count of stored values; irq_timings_push() increments it
 *         on every store and uses (count & IRQ_TIMINGS_MASK) as the slot
 *         index, so it is not capped at IRQ_TIMINGS_SIZE by this header
 */
struct irq_timings {
        u64        values[IRQ_TIMINGS_SIZE];
        int        count;
};

DECLARE_PER_CPU(struct irq_timings, irq_timings);

extern void irq_timings_free(int irq);
extern int irq_timings_alloc(int irq);

/* Clear IRQS_TIMINGS on @desc, then free the timing data of its irq number. */
static inline void irq_remove_timings(struct irq_desc *desc)
{
        desc->istate &= ~IRQS_TIMINGS;

        irq_timings_free(irq_desc_get_irq(desc));
}

/*
 * Allocate timing statistics for the irq behind @desc and mark the
 * descriptor with IRQS_TIMINGS on success.
 *
 * Nothing is done for actions flagged __IRQF_TIMER. An allocation failure is
 * only reported with a warning; the descriptor is then left without
 * IRQS_TIMINGS.
 */
static inline void irq_setup_timings(struct irq_desc *desc, struct irqaction *act)
{
        int irq = irq_desc_get_irq(desc);
        int ret;

        /*
         * We don't need the measurement because the idle code already
         * knows the next expiry event.
         */
        if (act->flags & __IRQF_TIMER)
                return;

        /*
         * In case the timing allocation fails, we just want to warn,
         * not fail, so letting the system boot anyway.
         */
        ret = irq_timings_alloc(irq);
        if (ret) {
                /* Terminate the message with a newline like other log lines. */
                pr_warn("Failed to allocate irq timing stats for irq%d (%d)\n",
                        irq, ret);
                return;
        }

        desc->istate |= IRQS_TIMINGS;
}

extern void irq_timings_enable(void);
extern void irq_timings_disable(void);

DECLARE_STATIC_KEY_FALSE(irq_timing_enabled);

/*
 * Pack an interrupt number and its timestamp into a single u64 to keep the
 * storage small: the timestamp is shifted up by 16 bits and the irq number
 * is OR-ed into the low bits. A 48 bit time stamp and a 16 bit IRQ number
 * are plenty; nobody cares about an IRQ after 78 hours of idle time.
 */
static inline u64 irq_timing_encode(u64 timestamp, int irq)
{
        u64 encoded = timestamp << 16;

        encoded |= irq;

        return encoded;
}

/*
 * Split an encoded value: the low 16 bits are returned as the irq number and
 * the remaining upper bits are stored in *@timestamp.
 */
static inline int irq_timing_decode(u64 value, u64 *timestamp)
{
        int irq = value & U16_MAX;

        *timestamp = value >> 16;

        return irq;
}

/*
 * Store the encoded <@ts, @irq> pair in this CPU's irq_timings buffer. The
 * slot is count modulo the buffer size, so old entries get overwritten.
 */
static __always_inline void irq_timings_push(u64 ts, int irq)
{
        struct irq_timings *buf = this_cpu_ptr(&irq_timings);
        int slot = buf->count & IRQ_TIMINGS_MASK;

        buf->values[slot] = irq_timing_encode(ts, irq);
        buf->count++;
}

/*
 * The function record_irq_time is only called in one place in the
 * interrupts handler. We want this function always inline so the code
 * inside is embedded in the function and the static key branching
 * code can act at the higher level. Without the explicit
 * __always_inline we can end up with a function call and a small
 * overhead in the hotpath for nothing.
 *
 * A timestamp is pushed only when the irq_timing_enabled static key is on
 * and @desc carries IRQS_TIMINGS.
 */
static __always_inline void record_irq_time(struct irq_desc *desc)
{
        if (!static_branch_likely(&irq_timing_enabled))
                return;

        /* Stamp with local_clock() and the irq number of this descriptor. */
        if (desc->istate & IRQS_TIMINGS)
                irq_timings_push(local_clock(), irq_desc_get_irq(desc));
}
#else
/* !CONFIG_IRQ_TIMINGS: the timing hooks are empty inline stubs. */
static inline void irq_remove_timings(struct irq_desc *desc) {}
static inline void irq_setup_timings(struct irq_desc *desc,
                                     struct irqaction *act) {}
static inline void record_irq_time(struct irq_desc *desc) {}
#endif /* CONFIG_IRQ_TIMINGS */


/* irq_init_generic_chip() is an empty stub without CONFIG_GENERIC_IRQ_CHIP. */
#ifdef CONFIG_GENERIC_IRQ_CHIP
void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
                           int num_ct, unsigned int irq_base,
                           void __iomem *reg_base, irq_flow_handler_t handler);
#else
static inline void
irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
                      int num_ct, unsigned int irq_base,
                      void __iomem *reg_base, irq_flow_handler_t handler) { }
#endif /* CONFIG_GENERIC_IRQ_CHIP */

#ifdef CONFIG_GENERIC_PENDING_IRQ
/* True unless the chip of @data has IRQCHIP_MOVE_DEFERRED set in its flags. */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
        return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED);
}
/* Report irqd_is_setaffinity_pending() for @data. */
static inline bool irq_move_pending(struct irq_data *data)
{
        return irqd_is_setaffinity_pending(data);
}
/* Copy @mask into desc->pending_mask. */
static inline void
irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
{
        cpumask_copy(desc->pending_mask, mask);
}
/* Copy desc->pending_mask into @mask. */
static inline void
irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
{
        cpumask_copy(mask, desc->pending_mask);
}
/* Return desc->pending_mask. */
static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
        return desc->pending_mask;
}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
void irq_force_complete_move(struct irq_desc *desc);
#else /* CONFIG_GENERIC_PENDING_IRQ */
/*
 * !CONFIG_GENERIC_PENDING_IRQ fallbacks: irqs can always be moved, a move is
 * never pending, there is no pending mask (NULL), and the copy/fixup/force
 * helpers do nothing.
 */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
        return true;
}
static inline bool irq_move_pending(struct irq_data *data)
{
        return false;
}
static inline void
irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
{
}
static inline void
irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
{
}
static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
        return NULL;
}
static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
        return false;
}
static inline void irq_force_complete_move(struct irq_desc *desc) { }
#endif /* !CONFIG_GENERIC_PENDING_IRQ */

/*
 * Fallbacks used unless both CONFIG_IRQ_DOMAIN and
 * CONFIG_IRQ_DOMAIN_HIERARCHY are enabled: activation only toggles the
 * "activated" state of @data and cannot fail; @reserve is ignored.
 */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve)
{
        irqd_set_activated(data);
        return 0;
}
static inline void irq_domain_deactivate_irq(struct irq_data *data)
{
        irqd_clr_activated(data);
}
#endif

/*
 * Return irqd->parent_data when CONFIG_IRQ_DOMAIN_HIERARCHY is enabled,
 * NULL otherwise.
 */
static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
{
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
        return irqd->parent_data;
#else
        return NULL;
#endif
}

#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>

/*
 * struct irq_bit_descr - pairs a bit mask with its printable name
 * @mask: the bit mask value
 * @name: the name of the mask; BIT_MASK_DESCR() fills it with a string
 *        literal, hence the const qualifier
 */
struct irq_bit_descr {
        unsigned int        mask;
        const char        *name;
};

/* Build an irq_bit_descr initializer whose name is the spelling of @m. */
#define BIT_MASK_DESCR(m)        { .mask = (m), .name = #m }

void irq_debug_show_bits(struct seq_file *m, int ind, unsigned int state,
                         const struct irq_bit_descr *sd, int size);

void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *desc);
/* Remove desc->debugfs_file from debugfs and free desc->dev_name. */
static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
{
        debugfs_remove(desc->debugfs_file);
        kfree(desc->dev_name);
}
void irq_debugfs_copy_devname(int irq, struct device *dev);
/* irq_domain_debugfs_init() is an empty stub without CONFIG_IRQ_DOMAIN. */
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
static inline void irq_domain_debugfs_init(struct dentry *root)
{
}
# endif
#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
/* !CONFIG_GENERIC_IRQ_DEBUGFS: the debugfs hooks are empty inline stubs. */
static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
{
}
static inline void irq_remove_debugfs_entry(struct irq_desc *d)
{
}
static inline void irq_debugfs_copy_devname(int irq, struct device *dev)
{
}
#endif /* CONFIG_GENERIC_IRQ_DEBUGFS */









































































































































































































































































    4 

    4 













    4 











































    4 

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FUTEX_H
#define _FUTEX_H

#include <linux/futex.h>
#include <linux/rtmutex.h>
#include <linux/sched/wake_q.h>
#include <linux/compat.h>
#include <linux/uaccess.h>

#ifdef CONFIG_PREEMPT_RT
#include <linux/rcuwait.h>
#endif

#include <asm/futex.h>

/*
 * Futex flags used to encode options to functions and preserve them across
 * restarts.
 *
 * The low two bits (FLAGS_SIZE_MASK) hold the futex size as a power-of-two
 * exponent: futex_size() returns 1 << (flags & FLAGS_SIZE_MASK) bytes, so
 * FLAGS_SIZE_8 describes a one byte futex and FLAGS_SIZE_64 an eight byte
 * one.
 */
#define FLAGS_SIZE_8                0x0000
#define FLAGS_SIZE_16                0x0001
#define FLAGS_SIZE_32                0x0002
#define FLAGS_SIZE_64                0x0003

#define FLAGS_SIZE_MASK                0x0003

/* Set by futex_to_flags()/futex2_to_flags() when the private bit is absent */
#ifdef CONFIG_MMU
# define FLAGS_SHARED                0x0010
#else
/*
 * NOMMU does not have per process address space. Let the compiler optimize
 * code away.
 */
# define FLAGS_SHARED                0x0000
#endif
/* Set by futex_to_flags() when the op carries FUTEX_CLOCK_REALTIME */
#define FLAGS_CLOCKRT                0x0020
#define FLAGS_HAS_TIMEOUT        0x0040
/* Set by futex2_to_flags() when FUTEX2_NUMA is requested */
#define FLAGS_NUMA                0x0080
#define FLAGS_STRICT                0x0100

/*
 * FUTEX_ to FLAGS_
 *
 * Build the internal flag word for a legacy futex op: the size is always
 * FLAGS_SIZE_32, FLAGS_SHARED is added unless FUTEX_PRIVATE_FLAG is set and
 * FLAGS_CLOCKRT is added when FUTEX_CLOCK_REALTIME is set.
 */
static inline unsigned int futex_to_flags(unsigned int op)
{
        const unsigned int shared = (op & FUTEX_PRIVATE_FLAG) ? 0 : FLAGS_SHARED;
        const unsigned int clockrt = (op & FUTEX_CLOCK_REALTIME) ? FLAGS_CLOCKRT : 0;

        return FLAGS_SIZE_32 | shared | clockrt;
}

#define FUTEX2_VALID_MASK (FUTEX2_SIZE_MASK | FUTEX2_PRIVATE)

/*
 * FUTEX2_ to FLAGS_
 *
 * The bits selected by FUTEX2_SIZE_MASK are taken over unmodified.
 * FLAGS_SHARED is added unless FUTEX2_PRIVATE is set and FLAGS_NUMA is added
 * when FUTEX2_NUMA is set.
 */
static inline unsigned int futex2_to_flags(unsigned int flags2)
{
        unsigned int result = flags2 & FUTEX2_SIZE_MASK;

        result |= (flags2 & FUTEX2_PRIVATE) ? 0 : FLAGS_SHARED;
        result |= (flags2 & FUTEX2_NUMA) ? FLAGS_NUMA : 0;

        return result;
}

/* Size of the futex in bytes, decoded from the size bits of @flags. */
static inline unsigned int futex_size(unsigned int flags)
{
        const unsigned int order = flags & FLAGS_SIZE_MASK;

        return 1 << order;
}

/*
 * Check whether the size encoded in @flags is acceptable for the caller.
 * Returns true only for FLAGS_SIZE_32.
 */
static inline bool futex_flags_valid(unsigned int flags)
{
        const unsigned int size = flags & FLAGS_SIZE_MASK;

        /* Only 64bit futexes for 64bit code */
        if ((!IS_ENABLED(CONFIG_64BIT) || in_compat_syscall()) &&
            size == FLAGS_SIZE_64)
                return false;

        /* Only 32bit futexes are implemented -- for now */
        return size == FLAGS_SIZE_32;
}

/*
 * Check that @val fits into a futex of the size encoded in @flags.
 * Returns false when @val has bits set above the futex width.
 */
static inline bool futex_validate_input(unsigned int flags, u64 val)
{
        const int width = 8 * futex_size(flags);

        /* A full 64bit wide futex can hold any u64; never shift by 64 */
        if (width >= 64)
                return true;

        return !(val >> width);
}

/*
 * should_fail_futex() - with CONFIG_FAIL_FUTEX the function is only declared
 * here and defined elsewhere; without it the inline stub below always
 * returns false.
 */
#ifdef CONFIG_FAIL_FUTEX
extern bool should_fail_futex(bool fshared);
#else
static inline bool should_fail_futex(bool fshared)
{
        return false;
}
#endif

/*
 * Hash buckets are shared by all the futex_keys that hash to the same
 * location.  Each key may have multiple futex_q structures, one for each task
 * waiting on a futex.
 */
struct futex_hash_bucket {
        /* Adjusted by futex_hb_waiters_inc()/_dec(), read by _pending() */
        atomic_t waiters;
        /* Bucket lock; taken pairwise by double_lock_hb() */
        spinlock_t lock;
        /* NOTE(review): presumably heads the futex_q::list nodes -- confirm */
        struct plist_head chain;
} ____cacheline_aligned_in_smp;

/*
 * Priority Inheritance state:
 */
struct futex_pi_state {
        /*
         * list of 'owned' pi_state instances - these have to be
         * cleaned up in do_exit() if the task exits prematurely:
         */
        struct list_head list;

        /*
         * The PI object:
         */
        struct rt_mutex_base pi_mutex;

        /* Task this pi_state is accounted to (see the 'owned' list above) */
        struct task_struct *owner;
        /*
         * Reference count. NOTE(review): presumably managed through
         * get_pi_state()/put_pi_state() declared below -- confirm.
         */
        refcount_t refcount;

        /* Futex key this PI state belongs to */
        union futex_key key;
} __randomize_layout;

struct futex_q;
typedef void (futex_wake_fn)(struct wake_q_head *wake_q, struct futex_q *q);

/**
 * struct futex_q - The hashed futex queue entry, one per waiting task
 * @list:                priority-sorted list of tasks waiting on this futex
 * @task:                the task waiting on the futex
 * @lock_ptr:                the hash bucket lock
 * @wake:                the wake handler for this queue
 * @wake_data:                data associated with the wake handler
 * @key:                the key the futex is hashed on
 * @pi_state:                optional priority inheritance state
 * @rt_waiter:                rt_waiter storage for use with requeue_pi
 * @requeue_pi_key:        the requeue_pi target futex key
 * @bitset:                bitset for the optional bitmasked wakeup
 * @requeue_state:        State field for futex_requeue_pi()
 * @requeue_wait:        RCU wait for futex_requeue_pi() (RT only)
 *
 * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so
 * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
 * the second.
 *
 * PI futexes are typically woken before they are removed from the hash list via
 * the rt_mutex code. See futex_unqueue_pi().
 */
struct futex_q {
        struct plist_node list;

        struct task_struct *task;
        spinlock_t *lock_ptr;
        futex_wake_fn *wake;
        void *wake_data;
        union futex_key key;
        struct futex_pi_state *pi_state;
        struct rt_mutex_waiter *rt_waiter;
        union futex_key *requeue_pi_key;
        u32 bitset;
        atomic_t requeue_state;
        /*
         * Only present on PREEMPT_RT; <linux/rcuwait.h> is included under
         * the same condition at the top of this header.
         */
#ifdef CONFIG_PREEMPT_RT
        struct rcuwait requeue_wait;
#endif
} __randomize_layout;

extern const struct futex_q futex_q_init;

/*
 * Kind of access requested for a futex word; passed as the @rw argument of
 * get_futex_key().
 */
enum futex_access {
        FUTEX_READ,
        FUTEX_WRITE
};

extern int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key,
                         enum futex_access rw);

extern struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
                  int flags, u64 range_ns);

extern struct futex_hash_bucket *futex_hash(union futex_key *key);

/**
 * futex_match - Check whether two futex keys are equal
 * @key1:        Pointer to key1
 * @key2:        Pointer to key2
 *
 * Two keys are equal when their both.word, both.ptr and both.offset members
 * all compare equal. A NULL pointer never matches anything.
 *
 * Return 1 if two futex_keys are equal, 0 otherwise.
 */
static inline int futex_match(union futex_key *key1, union futex_key *key2)
{
        if (!key1 || !key2)
                return 0;

        if (key1->both.word != key2->both.word)
                return 0;

        if (key1->both.ptr != key2->both.ptr)
                return 0;

        return key1->both.offset == key2->both.offset;
}

extern int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
                            struct futex_q *q, struct futex_hash_bucket **hb);
extern void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
                                   struct hrtimer_sleeper *timeout);
extern bool __futex_wake_mark(struct futex_q *q);
extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q);

extern int fault_in_user_writeable(u32 __user *uaddr);
extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key);

/*
 * Run futex_atomic_cmpxchg_inatomic() on @uaddr with page faults disabled
 * for the duration of the call and hand its return value back unchanged.
 */
static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
{
        int err;

        pagefault_disable();
        err = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
        pagefault_enable();

        return err;
}

/*
 * This does a plain atomic user space read, and the user pointer has
 * already been verified earlier by get_futex_key() to be both aligned
 * and actually in user space, just like futex_atomic_cmpxchg_inatomic().
 *
 * We still want to avoid any speculation, and while __get_user() is
 * the traditional model for this, it's actually slower than doing
 * this manually these days.
 *
 * We could just have a per-architecture special function for it,
 * the same way we do futex_atomic_cmpxchg_inatomic(), but rather
 * than force everybody to do that, write it out long-hand using
 * the low-level user-access infrastructure.
 *
 * This looks a bit overkill, but generally just results in a couple
 * of instructions.
 *
 * Returns 0 with the value stored in *dest, or -EFAULT with *dest left
 * untouched.
 */
static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from)
{
        u32 val;

        /* Open the user access window: masked variant if available */
        if (can_do_masked_user_access())
                from = masked_user_access_begin(from);
        else if (!user_read_access_begin(from, sizeof(*from)))
                return -EFAULT;
        /* Efault is handed to unsafe_get_user() as its failure label */
        unsafe_get_user(val, from, Efault);
        user_read_access_end();
        *dest = val;
        return 0;
Efault:
        /* The access window is still open here and has to be closed too */
        user_read_access_end();
        return -EFAULT;
}

/*
 * Read the futex word at @from into *@dest through futex_read_inatomic()
 * with page faults disabled around the access. Returns whatever
 * futex_read_inatomic() returned.
 */
static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
{
        int err;

        pagefault_disable();
        err = futex_read_inatomic(dest, from);
        pagefault_enable();

        return err;
}

extern void __futex_unqueue(struct futex_q *q);
extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
                                struct task_struct *task);
extern int futex_unqueue(struct futex_q *q);

/**
 * futex_queue() - Enqueue the futex_q on the futex_hash_bucket
 * @q:        The futex_q to enqueue
 * @hb:        The destination hash bucket
 * @task:        Task queueing this futex
 *
 * The hb->lock must be held by the caller, and is released here. A call to
 * futex_queue() is typically paired with exactly one call to futex_unqueue().  The
 * exceptions involve the PI related operations, which may use futex_unqueue_pi()
 * or nothing if the unqueue is done as part of the wake process and the unqueue
 * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
 * an example).
 *
 * Note that @task may be NULL, for async usage of futexes.
 */
static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
                               struct task_struct *task)
        __releases(&hb->lock)
{
        /* Enqueue while still holding hb->lock, then drop it for the caller */
        __futex_queue(q, hb, task);
        spin_unlock(&hb->lock);
}

extern void futex_unqueue_pi(struct futex_q *q);

extern void wait_for_owner_exiting(int ret, struct task_struct *exiting);

/*
 * Reflects a new waiter being added to the waitqueue.
 *
 * Compiles to nothing on !CONFIG_SMP builds.
 */
static inline void futex_hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
        atomic_inc(&hb->waiters);
        /*
         * Full barrier (A), see the ordering comment above.
         *
         * NOTE(review): no such ordering comment is visible in this header;
         * presumably it lives with the futex wait/wake implementation --
         * confirm and adjust the reference.
         */
        smp_mb__after_atomic();
#endif
}

/*
 * Reflects a waiter being removed from the waitqueue by wakeup
 * paths.
 *
 * Compiles to nothing on !CONFIG_SMP builds. Unlike futex_hb_waiters_inc()
 * no barrier follows the atomic operation.
 */
static inline void futex_hb_waiters_dec(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
        atomic_dec(&hb->waiters);
#endif
}

/*
 * Report the number of waiters accounted on @hb.
 *
 * On SMP this is the current value of hb->waiters, read after a full
 * barrier. On !CONFIG_SMP the counter is not maintained and 1 is returned
 * unconditionally.
 */
static inline int futex_hb_waiters_pending(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
        /*
         * Full barrier (B), see the ordering comment above.
         *
         * NOTE(review): no such ordering comment is visible in this header;
         * presumably it lives with the futex wait/wake implementation --
         * confirm and adjust the reference.
         */
        smp_mb();
        return atomic_read(&hb->waiters);
#else
        return 1;
#endif
}

extern struct futex_hash_bucket *futex_q_lock(struct futex_q *q);
extern void futex_q_unlock(struct futex_hash_bucket *hb);


extern int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                union futex_key *key,
                                struct futex_pi_state **ps,
                                struct task_struct *task,
                                struct task_struct **exiting,
                                int set_waiters);

extern int refill_pi_state_cache(void);
extern void get_pi_state(struct futex_pi_state *pi_state);
extern void put_pi_state(struct futex_pi_state *pi_state);
extern int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked);

/*
 * Express the locking dependencies for lockdep:
 *
 * Lock two hash buckets. The bucket with the lower address is always locked
 * first; if both pointers refer to the same bucket its lock is taken once.
 */
static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
        /* Order by address; only the local copies of the pointers are swapped */
        if (hb1 > hb2)
                swap(hb1, hb2);

        spin_lock(&hb1->lock);
        /* Second lock is annotated as nested for lockdep */
        if (hb1 != hb2)
                spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
}

/*
 * Unlock two hash buckets. If both pointers refer to the same bucket its
 * lock is released only once.
 */
static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
        spin_unlock(&hb1->lock);
        if (hb1 != hb2)
                spin_unlock(&hb2->lock);
}

/* syscalls */

extern int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32
                                 val, ktime_t *abs_time, u32 bitset, u32 __user
                                 *uaddr2);

extern int futex_requeue(u32 __user *uaddr1, unsigned int flags1,
                         u32 __user *uaddr2, unsigned int flags2,
                         int nr_wake, int nr_requeue,
                         u32 *cmpval, int requeue_pi);

extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
                        struct hrtimer_sleeper *to, u32 bitset);

extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
                      ktime_t *abs_time, u32 bitset);

/**
 * struct futex_vector - Auxiliary struct for futex_waitv()
 * @w: Userspace provided data
 * @q: Kernel side data
 *
 * Struct used to build an array with all data needed for futex_waitv()
 */
struct futex_vector {
        struct futex_waitv w;
        struct futex_q q;
};

extern int futex_parse_waitv(struct futex_vector *futexv,
                             struct futex_waitv __user *uwaitv,
                             unsigned int nr_futexes, futex_wake_fn *wake,
                             void *wake_data);

extern int futex_wait_multiple_setup(struct futex_vector *vs, int count,
                                     int *woken);

extern int futex_unqueue_multiple(struct futex_vector *v, int count);

extern int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
                               struct hrtimer_sleeper *to);

extern int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset);

extern int futex_wake_op(u32 __user *uaddr1, unsigned int flags,
                         u32 __user *uaddr2, int nr_wake, int nr_wake2, int op);

extern int futex_unlock_pi(u32 __user *uaddr, unsigned int flags);

extern int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock);

#endif /* _FUTEX_H */







































































    3 









1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
 * Copyright (c) 2017 Pengutronix, Marc Kleine-Budde <kernel@pengutronix.de>
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#ifndef CAN_ML_H
#define CAN_ML_H

#include <linux/can.h>
#include <linux/list.h>
#include <linux/netdevice.h>

/* Number of entries in can_dev_rcv_lists::rx_sff: one per CAN_SFF_ID_BITS value */
#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
/* Number of entries in can_dev_rcv_lists::rx_eff: 2^CAN_EFF_RCV_HASH_BITS */
#define CAN_EFF_RCV_HASH_BITS 10
#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS)

/* Indices into can_dev_rcv_lists::rx; RX_MAX is the array size */
enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };

/* Per-device receive lists: hlist heads grouped into three arrays */
struct can_dev_rcv_lists {
        /* One list per RX_* index above */
        struct hlist_head rx[RX_MAX];
        /* CAN_SFF_RCV_ARRAY_SZ lists */
        struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
        /* CAN_EFF_RCV_ARRAY_SZ lists */
        struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
        /* NOTE(review): presumably the total number of queued receivers -- confirm */
        int entries;
};

/*
 * CAN private data attached to a net_device; stored and fetched through
 * can_set_ml_priv()/can_get_ml_priv() under the ML_PRIV_CAN type.
 */
struct can_ml_priv {
        struct can_dev_rcv_lists dev_rcv_lists;
        /*
         * NOTE(review): the guard tests CAN_J1939, not a CONFIG_ symbol;
         * presumably that macro comes from <linux/can.h> -- confirm.
         */
#ifdef CAN_J1939
        struct j1939_priv *j1939_priv;
#endif
};

/*
 * Fetch the CAN private data of @dev: whatever netdev_get_ml_priv() returns
 * for the ML_PRIV_CAN type.
 */
static inline struct can_ml_priv *can_get_ml_priv(struct net_device *dev)
{
        struct can_ml_priv *ml_priv = netdev_get_ml_priv(dev, ML_PRIV_CAN);

        return ml_priv;
}

/*
 * Attach @ml_priv to @dev as its CAN private data by forwarding to
 * netdev_set_ml_priv() with the ML_PRIV_CAN type.
 */
static inline void can_set_ml_priv(struct net_device *dev,
                                   struct can_ml_priv *ml_priv)
{
        netdev_set_ml_priv(dev, ml_priv, ML_PRIV_CAN);
}

#endif /* CAN_ML_H */














































    2 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/* SPDX-License-Identifier: GPL-2.0-only */

#ifndef __KVM_IODEV_H__
#define __KVM_IODEV_H__

#include <linux/kvm_types.h>
#include <linux/errno.h>

struct kvm_io_device;
struct kvm_vcpu;

/**
 * kvm_io_device_ops are called under kvm slots_lock.
 * read and write handlers return 0 if the transaction has been handled,
 * or non-zero to have it passed to the next device.
 *
 * read and write may be left NULL: kvm_iodevice_read() and
 * kvm_iodevice_write() then return -EOPNOTSUPP without calling anything.
 **/
struct kvm_io_device_ops {
        /* Read @len bytes at @addr from device @this into @val */
        int (*read)(struct kvm_vcpu *vcpu,
                    struct kvm_io_device *this,
                    gpa_t addr,
                    int len,
                    void *val);
        /* Write @len bytes from @val to device @this at @addr */
        int (*write)(struct kvm_vcpu *vcpu,
                     struct kvm_io_device *this,
                     gpa_t addr,
                     int len,
                     const void *val);
        /* NOTE(review): presumably invoked when the device is torn down -- confirm */
        void (*destructor)(struct kvm_io_device *this);
};


/* An I/O device as seen by KVM: just a pointer to its operations table. */
struct kvm_io_device {
        /* Set by kvm_iodevice_init(); dereferenced by kvm_iodevice_read()/_write() */
        const struct kvm_io_device_ops *ops;
};

/* Initialize @dev by installing @ops as its operations table. */
static inline void kvm_iodevice_init(struct kvm_io_device *dev,
                                     const struct kvm_io_device_ops *ops)
{
        dev->ops = ops;
}

/*
 * Forward a read of @l bytes at @addr to the device's read handler and
 * return its result. Returns -EOPNOTSUPP when the device has no read
 * handler.
 */
static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu,
                                    struct kvm_io_device *dev, gpa_t addr,
                                    int l, void *v)
{
        if (!dev->ops->read)
                return -EOPNOTSUPP;

        return dev->ops->read(vcpu, dev, addr, l, v);
}

/*
 * Forward a write of @l bytes at @addr to the device's write handler and
 * return its result. Returns -EOPNOTSUPP when the device has no write
 * handler.
 */
static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
                                     struct kvm_io_device *dev, gpa_t addr,
                                     int l, const void *v)
{
        if (!dev->ops->write)
                return -EOPNOTSUPP;

        return dev->ops->write(vcpu, dev, addr, l, v);
}

#endif /* __KVM_IODEV_H__ */





















































































































































































































































































































































































































































































   43 






  206 


  206 

  206 



















  206 














  205 





  206 




  206 




    4 
  206 











  206 


  205 



  206 



  206 



  206 




  206 


    4 
  206 





  119 




  119 


  119 




  119 









  246 



  246 






  246 

























   72 



















  194 



  246 




  246 






  246 





























  246 


  246 

  246 




























































































   24 















   24 
   24 






   24 











   24 








































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/mmu_notifier.c
 *
 *  Copyright (C) 2008  Qumranet, Inc.
 *  Copyright (C) 2008  SGI
 *             Christoph Lameter <cl@linux.com>
 */

#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/interval_tree.h>
#include <linux/srcu.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

#include "vma.h"

/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);

/*
 * Lockdep map named "mmu_notifier_invalidate_range_start". Only built with
 * CONFIG_LOCKDEP; it has external linkage so code outside this file can
 * refer to it.
 */
#ifdef CONFIG_LOCKDEP
struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
        .name = "mmu_notifier_invalidate_range_start"
};
#endif

/*
 * The mmu_notifier_subscriptions structure is allocated and installed in
 * mm->notifier_subscriptions inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_subscriptions {
        /* all mmu notifiers registered in this mm are queued in this list */
        struct hlist_head list;
        bool has_itree;
        /* to serialize the list modifications and hlist_unhashed */
        spinlock_t lock;
        /* Odd while an itree invalidation is in progress, see below */
        unsigned long invalidate_seq;
        /* Raised in mn_itree_inv_start_range(), dropped in mn_itree_inv_end() */
        unsigned long active_invalidate_ranges;
        /* Interval tree holding the mmu_interval_notifier nodes */
        struct rb_root_cached itree;
        /* Woken by mn_itree_inv_end() once the last invalidation finished */
        wait_queue_head_t wq;
        /* itree inserts/removes queued until mn_itree_inv_end() runs them */
        struct hlist_head deferred_list;
};

/*
 * This is a collision-retry read-side/write-side 'lock', a lot like a
 * seqcount, however this allows multiple write-sides to hold it at
 * once. Conceptually the write side is protecting the values of the PTEs in
 * this mm, such that PTES cannot be read into SPTEs (shadow PTEs) while any
 * writer exists.
 *
 * Note that the core mm creates nested invalidate_range_start()/end() regions
 * within the same thread, and runs invalidate_range_start()/end() in parallel
 * on multiple CPUs. This is designed to not reduce concurrency or block
 * progress on the mm side.
 *
 * As a secondary function, holding the full write side also serves to prevent
 * writers for the itree, this is an optimization to avoid extra locking
 * during invalidate_range_start/end notifiers.
 *
 * The write side has two states, fully excluded:
 *  - mm->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == True (odd)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is not allowed to change
 *
 * And partially excluded:
 *  - mm->active_invalidate_ranges != 0
 *  - subscriptions->invalidate_seq & 1 == False (even)
 *  - some range on the mm_struct is being invalidated
 *  - the itree is allowed to change
 *
 * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
 *    seq |= 1  # Begin writing
 *    seq++     # Release the writing state
 *    seq & 1   # True if a writer exists
 *
 * The latter state avoids some expensive work on inv_end in the common case of
 * no mmu_interval_notifier monitoring the VA.
 */
/*
 * True while the interval tree is fully write-excluded, i.e. while
 * invalidate_seq is odd. The caller must hold subscriptions->lock.
 */
static bool
mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
{
        unsigned long writer_bit;

        lockdep_assert_held(&subscriptions->lock);
        writer_bit = subscriptions->invalidate_seq & 1;

        return writer_bit;
}

/*
 * Account a new active invalidation and look up the first interval
 * subscription overlapping @range (the lookup runs from range->start to
 * range->end - 1). When one is found invalidate_seq is made odd.
 *
 * *@seq receives the resulting invalidate_seq. Returns the first overlapping
 * subscription, or NULL if there is none. All of this happens under
 * subscriptions->lock.
 */
static struct mmu_interval_notifier *
mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
                         const struct mmu_notifier_range *range,
                         unsigned long *seq)
{
        struct interval_tree_node *node;
        struct mmu_interval_notifier *res = NULL;

        spin_lock(&subscriptions->lock);
        /* Counted even when nothing overlaps; mn_itree_inv_end() undoes it */
        subscriptions->active_invalidate_ranges++;
        node = interval_tree_iter_first(&subscriptions->itree, range->start,
                                        range->end - 1);
        if (node) {
                /* Begin writing: seq becomes (or stays) odd */
                subscriptions->invalidate_seq |= 1;
                res = container_of(node, struct mmu_interval_notifier,
                                   interval_tree);
        }

        *seq = subscriptions->invalidate_seq;
        spin_unlock(&subscriptions->lock);
        return res;
}

/*
 * Continue the walk started by mn_itree_inv_start_range(): return the next
 * interval subscription after @interval_sub that overlaps @range, or NULL
 * when the iteration is exhausted.
 */
static struct mmu_interval_notifier *
mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
                  const struct mmu_notifier_range *range)
{
        struct interval_tree_node *next_node =
                interval_tree_iter_next(&interval_sub->interval_tree,
                                        range->start, range->end - 1);

        return next_node ? container_of(next_node,
                                        struct mmu_interval_notifier,
                                        interval_tree)
                         : NULL;
}

/*
 * Drop one active invalidation. If it was the last one and invalidate_seq is
 * odd, make the sequence even again, run the queued interval tree inserts
 * and removes, and wake everybody sleeping on subscriptions->wq.
 */
static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
{
        struct mmu_interval_notifier *interval_sub;
        struct hlist_node *next;

        spin_lock(&subscriptions->lock);
        /* Nothing to do while other ranges are active or seq is even */
        if (--subscriptions->active_invalidate_ranges ||
            !mn_itree_is_invalidating(subscriptions)) {
                spin_unlock(&subscriptions->lock);
                return;
        }

        /* Make invalidate_seq even */
        subscriptions->invalidate_seq++;

        /*
         * The inv_end incorporates a deferred mechanism like rtnl_unlock().
         * Adds and removes are queued until the final inv_end happens then
         * they are progressed. This arrangement for tree updates is used to
         * avoid using a blocking lock during invalidate_range_start.
         */
        hlist_for_each_entry_safe(interval_sub, next,
                                  &subscriptions->deferred_list,
                                  deferred_item) {
                /* An empty rb node means a pending insert, otherwise a removal */
                if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
                        interval_tree_insert(&interval_sub->interval_tree,
                                             &subscriptions->itree);
                else
                        interval_tree_remove(&interval_sub->interval_tree,
                                             &subscriptions->itree);
                hlist_del(&interval_sub->deferred_item);
        }
        spin_unlock(&subscriptions->lock);

        /* Done outside the spinlock */
        wake_up_all(&subscriptions->wq);
}

/**
 * mmu_interval_read_begin - Begin a read side critical section against a VA
 *                           range
 * @interval_sub: The interval subscription
 *
 * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
 * collision-retry scheme similar to seqcount for the VA range under
 * subscription. If the mm invokes invalidation during the critical section
 * then mmu_interval_read_retry() will return true.
 *
 * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
 * require a blocking context.  The critical region formed by this can sleep,
 * and the required 'user_lock' can also be a sleeping lock.
 *
 * The caller is required to provide a 'user_lock' to serialize both teardown
 * and setup.
 *
 * The return value should be passed to mmu_interval_read_retry().
 */
unsigned long
mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
{
        struct mmu_notifier_subscriptions *subscriptions =
                interval_sub->mm->notifier_subscriptions;
        unsigned long seq;
        bool is_invalidating;

        /*
         * If the subscription has a different seq value under the user_lock
         * than we started with then it has collided.
         *
         * If the subscription currently has the same seq value as the
         * subscriptions seq, then it is currently between
         * invalidate_start/end and is colliding.
         *
         * The locking looks broadly like this:
         *   mn_itree_inv_start():                 mmu_interval_read_begin():
         *                                         spin_lock
         *                                          seq = READ_ONCE(interval_sub->invalidate_seq);
         *                                          seq == subs->invalidate_seq
         *                                         spin_unlock
         *    spin_lock
         *     seq = ++subscriptions->invalidate_seq
         *    spin_unlock
         *     op->invalidate():
         *       user_lock
         *        mmu_interval_set_seq()
         *         interval_sub->invalidate_seq = seq
         *       user_unlock
         *
         *                          [Required: mmu_interval_read_retry() == true]
         *
         *   mn_itree_inv_end():
         *    spin_lock
         *     seq = ++subscriptions->invalidate_seq
         *    spin_unlock
         *
         *                                        user_lock
         *                                         mmu_interval_read_retry():
         *                                          interval_sub->invalidate_seq != seq
         *                                        user_unlock
         *
         * Barriers are not needed here as any races here are closed by an
         * eventual mmu_interval_read_retry(), which provides a barrier via the
         * user_lock.
         */
        spin_lock(&subscriptions->lock);
        /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
        seq = READ_ONCE(interval_sub->invalidate_seq);
        /*
         * Sample both sequence numbers under the same lock hold: equality
         * means this subscription is currently being invalidated.
         */
        is_invalidating = seq == subscriptions->invalidate_seq;
        spin_unlock(&subscriptions->lock);

        /*
         * interval_sub->invalidate_seq must always be set to an odd value via
         * mmu_interval_set_seq() using the provided cur_seq from
         * mn_itree_inv_start_range(). This ensures that if seq does wrap we
         * will always clear the below sleep in some reasonable time as
         * subscriptions->invalidate_seq is even in the idle state.
         */
        /*
         * The map is taken and dropped straight away, with nothing done in
         * between. NOTE(review): presumably a lockdep-only annotation so that
         * callers which could reach the sleep below from inside an
         * invalidation are reported - confirm against the map's definition.
         */
        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
        /*
         * Sleep (outside the spinlock) until the global sequence has moved
         * past the value sampled above. The wait is a single pass; the result
         * is not re-validated here.
         */
        if (is_invalidating)
                wait_event(subscriptions->wq,
                           READ_ONCE(subscriptions->invalidate_seq) != seq);

        /*
         * Notice that mmu_interval_read_retry() can already be true at this
         * point, avoiding loops here allows the caller to provide a global
         * time bound.
         */

        return seq;
}
EXPORT_SYMBOL_GPL(mmu_interval_read_begin);

static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
                             struct mm_struct *mm)
{
        /* A blockable release event covering the whole address space. */
        struct mmu_notifier_range range = {
                .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
                .event = MMU_NOTIFY_RELEASE,
                .mm = mm,
                .start = 0,
                .end = ULONG_MAX,
        };
        struct mmu_interval_notifier *sub;
        unsigned long cur_seq;

        /*
         * Walk every interval subscription matched by the range and run its
         * invalidate op with the sequence number handed back by the start
         * call.
         */
        sub = mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
        while (sub) {
                bool ok = sub->ops->invalidate(sub, &range, cur_seq);

                /* The range is blockable, so a refusal is unexpected. */
                WARN_ON(!ok);
                sub = mn_itree_inv_next(sub, &range);
        }

        mn_itree_inv_end(subscriptions);
}

/*
 * This function can't run concurrently against mmu_notifier_register
 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
 * in parallel despite there being no task using this mm any more,
 * through the vmas outside of the exit_mmap context, such as with
 * vmtruncate. This serializes against mmu_notifier_unregister with
 * the notifier_subscriptions->lock in addition to SRCU and it serializes
 * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
 * can't go away from under us as exit_mmap holds an mm_count pin
 * itself.
 */
static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
                             struct mm_struct *mm)
{
        struct mmu_notifier *subscription;
        int id;

        /*
         * SRCU here will block mmu_notifier_unregister until
         * ->release returns.
         */
        id = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu))
                /*
                 * If ->release runs before mmu_notifier_unregister it must be
                 * handled, as it's the only way for the driver to flush all
                 * existing sptes and stop the driver from establishing any more
                 * sptes before all the pages in the mm are freed.
                 */
                if (subscription->ops->release)
                        subscription->ops->release(subscription, mm);

        spin_lock(&subscriptions->lock);
        while (unlikely(!hlist_empty(&subscriptions->list))) {
                subscription = hlist_entry(subscriptions->list.first,
                                           struct mmu_notifier, hlist);
                /*
                 * We arrived before mmu_notifier_unregister so
                 * mmu_notifier_unregister will do nothing other than to wait
                 * for ->release to finish and for mmu_notifier_unregister to
                 * return.
                 */
                hlist_del_init_rcu(&subscription->hlist);
        }
        spin_unlock(&subscriptions->lock);
        srcu_read_unlock(&srcu, id);

        /*
         * synchronize_srcu here prevents mmu_notifier_release from returning to
         * exit_mmap (which would proceed with freeing all pages in the mm)
         * until the ->release method returns, if it was invoked by
         * mmu_notifier_unregister.
         *
         * The notifier_subscriptions can't go away from under us because
         * one mm_count is held by exit_mmap.
         */
        synchronize_srcu(&srcu);
}

void __mmu_notifier_release(struct mm_struct *mm)
{
        struct mmu_notifier_subscriptions *subs = mm->notifier_subscriptions;

        /* Interval-tree subscriptions are released ahead of the hlist ones. */
        if (subs->has_itree)
                mn_itree_release(subs, mm);

        /* Nothing more to do when no hlist notifier is registered. */
        if (hlist_empty(&subs->list))
                return;

        mn_hlist_release(subs, mm);
}

/*
 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 * unmap the address and return 1 or 0 depending if the mapping previously
 * existed or not.
 */
int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
                                        unsigned long start,
                                        unsigned long end)
{
        struct mmu_notifier *sub;
        int srcu_idx;
        int young = 0;

        /* OR together the result of every notifier implementing the op. */
        srcu_idx = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(sub, &mm->notifier_subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu)) {
                if (!sub->ops->clear_flush_young)
                        continue;

                young |= sub->ops->clear_flush_young(sub, mm, start, end);
        }
        srcu_read_unlock(&srcu, srcu_idx);

        return young;
}

int __mmu_notifier_clear_young(struct mm_struct *mm,
                               unsigned long start,
                               unsigned long end)
{
        struct mmu_notifier *sub;
        int srcu_idx;
        int young = 0;

        /* OR together the result of every notifier implementing the op. */
        srcu_idx = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(sub, &mm->notifier_subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu)) {
                if (!sub->ops->clear_young)
                        continue;

                young |= sub->ops->clear_young(sub, mm, start, end);
        }
        srcu_read_unlock(&srcu, srcu_idx);

        return young;
}

int __mmu_notifier_test_young(struct mm_struct *mm,
                              unsigned long address)
{
        struct mmu_notifier *sub;
        int srcu_idx;
        int young = 0;

        srcu_idx = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(sub, &mm->notifier_subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu)) {
                if (!sub->ops->test_young)
                        continue;

                /* The first notifier reporting "young" ends the walk. */
                young = sub->ops->test_young(sub, mm, address);
                if (young)
                        break;
        }
        srcu_read_unlock(&srcu, srcu_idx);

        return young;
}

static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
                               const struct mmu_notifier_range *range)
{
        struct mmu_interval_notifier *sub;
        unsigned long cur_seq;

        sub = mn_itree_inv_start_range(subscriptions, range, &cur_seq);
        for (; sub; sub = mn_itree_inv_next(sub, range)) {
                if (sub->ops->invalidate(sub, range, cur_seq))
                        continue;

                /*
                 * A refusal is only legitimate for a non-blockable range.
                 * For a blockable one, warn and carry on with the remaining
                 * subscriptions.
                 */
                if (WARN_ON(mmu_notifier_range_blockable(range)))
                        continue;

                /*
                 * On -EAGAIN the non-blocking caller is not allowed to call
                 * invalidate_range_end(), so close the invalidation here.
                 */
                mn_itree_inv_end(subscriptions);
                return -EAGAIN;
        }

        return 0;
}

static int mn_hlist_invalidate_range_start(
        struct mmu_notifier_subscriptions *subscriptions,
        struct mmu_notifier_range *range)
{
        struct mmu_notifier *sub;
        int srcu_idx;
        int ret = 0;

        srcu_idx = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(sub, &subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu)) {
                const struct mmu_notifier_ops *ops = sub->ops;
                int err;

                if (!ops->invalidate_range_start)
                        continue;

                /* Bracket the callback when the range must not block. */
                if (!mmu_notifier_range_blockable(range))
                        non_block_start();
                err = ops->invalidate_range_start(sub, range);
                if (!mmu_notifier_range_blockable(range))
                        non_block_end();

                if (!err)
                        continue;

                pr_info("%pS callback failed with %d in %sblockable context.\n",
                        ops->invalidate_range_start, err,
                        mmu_notifier_range_blockable(range) ? "" : "non-");
                /* The only acceptable failure is -EAGAIN when non-blockable. */
                WARN_ON(mmu_notifier_range_blockable(range) ||
                        err != -EAGAIN);
                /*
                 * Every notifier is still called after an EAGAIN, and a
                 * notifier cannot tell that its own start method failed, so
                 * a start that can return EAGAIN must not have an end too.
                 */
                WARN_ON(ops->invalidate_range_end);
                ret = err;
        }

        if (ret) {
                /*
                 * Only reachable for a non-blocking range.  With several
                 * notifiers of which at least one failed start, those whose
                 * start succeeded still expect their end callback; issue it
                 * now.
                 */
                hlist_for_each_entry_rcu(sub, &subscriptions->list, hlist,
                                         srcu_read_lock_held(&srcu)) {
                        if (sub->ops->invalidate_range_end)
                                sub->ops->invalidate_range_end(sub, range);
                }
        }
        srcu_read_unlock(&srcu, srcu_idx);

        return ret;
}

int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
        struct mmu_notifier_subscriptions *subs =
                range->mm->notifier_subscriptions;

        /* Interval-tree subscribers go first; their failure ends the call. */
        if (subs->has_itree) {
                int err = mn_itree_invalidate(subs, range);

                if (err)
                        return err;
        }

        if (hlist_empty(&subs->list))
                return 0;

        return mn_hlist_invalidate_range_start(subs, range);
}

static void
mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
                        struct mmu_notifier_range *range)
{
        struct mmu_notifier *sub;
        int srcu_idx;

        srcu_idx = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(sub, &subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu)) {
                if (!sub->ops->invalidate_range_end)
                        continue;

                /* Bracket the callback when the range must not block. */
                if (!mmu_notifier_range_blockable(range))
                        non_block_start();
                sub->ops->invalidate_range_end(sub, range);
                if (!mmu_notifier_range_blockable(range))
                        non_block_end();
        }
        srcu_read_unlock(&srcu, srcu_idx);
}

void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
        struct mmu_notifier_subscriptions *subs =
                range->mm->notifier_subscriptions;

        /* Both end paths run with the range_start lock map held. */
        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);

        if (subs->has_itree)
                mn_itree_inv_end(subs);
        if (!hlist_empty(&subs->list))
                mn_hlist_invalidate_end(subs, range);

        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}

void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
{
        struct mmu_notifier *sub;
        int srcu_idx;

        /* Forward the range to every notifier that implements the op. */
        srcu_idx = srcu_read_lock(&srcu);
        hlist_for_each_entry_rcu(sub, &mm->notifier_subscriptions->list, hlist,
                                 srcu_read_lock_held(&srcu)) {
                if (!sub->ops->arch_invalidate_secondary_tlbs)
                        continue;

                sub->ops->arch_invalidate_secondary_tlbs(sub, mm, start, end);
        }
        srcu_read_unlock(&srcu, srcu_idx);
}

/*
 * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
 * write mode. A NULL subscription signals the notifier is being registered for
 * itree mode.
 */
int __mmu_notifier_register(struct mmu_notifier *subscription,
                            struct mm_struct *mm)
{
        /* Stays NULL unless this call allocates the per-mm container. */
        struct mmu_notifier_subscriptions *subscriptions = NULL;
        int ret;

        mmap_assert_write_locked(mm);
        BUG_ON(atomic_read(&mm->mm_users) <= 0);

        /*
         * Subsystems should only register for invalidate_secondary_tlbs() or
         * invalidate_range_start()/end() callbacks, not both.
         */
        if (WARN_ON_ONCE(subscription &&
                         (subscription->ops->arch_invalidate_secondary_tlbs &&
                         (subscription->ops->invalidate_range_start ||
                          subscription->ops->invalidate_range_end))))
                return -EINVAL;

        if (!mm->notifier_subscriptions) {
                /*
                 * kmalloc cannot be called under mm_take_all_locks(), but we
                 * know that mm->notifier_subscriptions can't change while we
                 * hold the write side of the mmap_lock.
                 */
                subscriptions = kzalloc(
                        sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
                if (!subscriptions)
                        return -ENOMEM;

                INIT_HLIST_HEAD(&subscriptions->list);
                spin_lock_init(&subscriptions->lock);
                /*
                 * An even starting value: mmu_interval_read_begin() relies on
                 * invalidate_seq being even in the idle state.
                 */
                subscriptions->invalidate_seq = 2;
                subscriptions->itree = RB_ROOT_CACHED;
                init_waitqueue_head(&subscriptions->wq);
                INIT_HLIST_HEAD(&subscriptions->deferred_list);
        }

        ret = mm_take_all_locks(mm);
        if (unlikely(ret))
                goto out_clean;

        /*
         * Serialize the update against mmu_notifier_unregister. A
         * side note: mmu_notifier_release can't run concurrently with
         * us because we hold the mm_users pin (either implicitly as
         * current->mm or explicitly with get_task_mm() or similar).
         * We can't race against any other mmu notifier method either
         * thanks to mm_take_all_locks().
         *
         * release semantics on the initialization of the
         * mmu_notifier_subscriptions's contents are provided for unlocked
         * readers.  acquire can only be used while holding the mmgrab or
         * mmget, and is safe because once created the
         * mmu_notifier_subscriptions is not freed until the mm is destroyed.
         * As above, users holding the mmap_lock or one of the
         * mm_take_all_locks() do not need to use acquire semantics.
         */
        if (subscriptions)
                smp_store_release(&mm->notifier_subscriptions, subscriptions);

        if (subscription) {
                /* Pairs with the mmdrop in mmu_notifier_unregister_* */
                mmgrab(mm);
                subscription->mm = mm;
                /* The registering caller holds the first reference. */
                subscription->users = 1;

                spin_lock(&mm->notifier_subscriptions->lock);
                hlist_add_head_rcu(&subscription->hlist,
                                   &mm->notifier_subscriptions->list);
                spin_unlock(&mm->notifier_subscriptions->lock);
        } else
                /* NULL subscription: only flag the mm as using the itree. */
                mm->notifier_subscriptions->has_itree = true;

        mm_drop_all_locks(mm);
        BUG_ON(atomic_read(&mm->mm_users) <= 0);
        return 0;

out_clean:
        /*
         * Reached only when mm_take_all_locks() failed, i.e. before the
         * container could be published; it is NULL if none was allocated
         * by this call.
         */
        kfree(subscriptions);
        return ret;
}
EXPORT_SYMBOL_GPL(__mmu_notifier_register);

/**
 * mmu_notifier_register - Register a notifier on a mm
 * @subscription: The notifier to attach
 * @mm: The mm to attach the notifier to
 *
 * Must not hold mmap_lock nor any other VM related lock when calling
 * this registration function. Must also ensure mm_users can't go down
 * to zero while this runs to avoid races with mmu_notifier_release,
 * so mm has to be current->mm or the mm should be pinned safely such
 * as with get_task_mm(). If the mm is not current->mm, the mm_users
 * pin should be released by calling mmput after mmu_notifier_register
 * returns.
 *
 * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
 * unregister the notifier.
 *
 * While the caller has a mmu_notifier get the subscription->mm pointer will remain
 * valid, and can be converted to an active mm pointer via mmget_not_zero().
 */
int mmu_notifier_register(struct mmu_notifier *subscription,
                          struct mm_struct *mm)
{
        int err;

        /* The locked variant requires mmap_lock held for write. */
        mmap_write_lock(mm);
        err = __mmu_notifier_register(subscription, mm);
        mmap_write_unlock(mm);

        return err;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);

static struct mmu_notifier *
find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
{
        struct mmu_notifier *iter;
        struct mmu_notifier *found = NULL;

        /*
         * Look for the first notifier on the mm's list registered with @ops
         * and take a reference on it.  Yields NULL when there is none and
         * ERR_PTR(-EOVERFLOW) when the user count is already UINT_MAX.
         */
        spin_lock(&mm->notifier_subscriptions->lock);
        hlist_for_each_entry_rcu(iter,
                                 &mm->notifier_subscriptions->list, hlist,
                                 lockdep_is_held(&mm->notifier_subscriptions->lock)) {
                if (iter->ops != ops)
                        continue;

                if (unlikely(iter->users == UINT_MAX)) {
                        found = ERR_PTR(-EOVERFLOW);
                } else {
                        iter->users++;
                        found = iter;
                }
                break;
        }
        spin_unlock(&mm->notifier_subscriptions->lock);

        return found;
}

/**
 * mmu_notifier_get_locked - Return the single struct mmu_notifier for
 *                           the mm & ops
 * @ops: The operations struct being subscribed with
 * @mm: The mm to attach notifiers to
 *
 * This function either allocates a new mmu_notifier via
 * ops->alloc_notifier(), or returns an already existing notifier on the
 * list. The value of the ops pointer is used to determine when two notifiers
 * are the same.
 *
 * Each call to mmu_notifier_get() must be paired with a call to
 * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
 *
 * While the caller has a mmu_notifier get the mm pointer will remain valid,
 * and can be converted to an active mm pointer via mmget_not_zero().
 */
struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
                                             struct mm_struct *mm)
{
        struct mmu_notifier *sub;
        int err;

        mmap_assert_write_locked(mm);

        /* Reuse a notifier already on the list for these ops, if any. */
        if (mm->notifier_subscriptions) {
                sub = find_get_mmu_notifier(mm, ops);
                if (sub)
                        return sub;
        }

        /* Otherwise have the ops allocate a fresh one and register it. */
        sub = ops->alloc_notifier(mm);
        if (IS_ERR(sub))
                return sub;
        sub->ops = ops;

        err = __mmu_notifier_register(sub, mm);
        if (err) {
                /* Registration failed: return the allocation to the ops. */
                sub->ops->free_notifier(sub);
                return ERR_PTR(err);
        }

        return sub;
}
EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);

/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
{
        BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
        kfree(mm->notifier_subscriptions);
        mm->notifier_subscriptions = LIST_POISON1; /* debug */
}

/*
 * This releases the mm_count pin automatically and frees the mm
 * structure if it was the last user of it. It serializes against
 * running mmu notifiers with SRCU and against mmu_notifier_unregister
 * with the unregister lock + SRCU. All sptes must be dropped before
 * calling mmu_notifier_unregister. ->release or any other notifier
 * method may be invoked concurrently with mmu_notifier_unregister,
 * and only after mmu_notifier_unregister returned we're guaranteed
 * that ->release or any other method can't run anymore.
 */
void mmu_notifier_unregister(struct mmu_notifier *subscription,
                             struct mm_struct *mm)
{
        BUG_ON(atomic_read(&mm->mm_count) <= 0);

        /*
         * Still hashed means nobody has removed the notifier from the list
         * yet; mn_hlist_release() unhashes entries with hlist_del_init_rcu()
         * once it has called ->release on them.
         */
        if (!hlist_unhashed(&subscription->hlist)) {
                /*
                 * SRCU here will force exit_mmap to wait for ->release to
                 * finish before freeing the pages.
                 */
                int id;

                id = srcu_read_lock(&srcu);
                /*
                 * exit_mmap will block in mmu_notifier_release to guarantee
                 * that ->release is called before freeing the pages.
                 */
                if (subscription->ops->release)
                        subscription->ops->release(subscription, mm);
                srcu_read_unlock(&srcu, id);

                spin_lock(&mm->notifier_subscriptions->lock);
                /*
                 * Can not use list_del_rcu() since __mmu_notifier_release
                 * can delete it before we hold the lock.
                 */
                hlist_del_init_rcu(&subscription->hlist);
                spin_unlock(&mm->notifier_subscriptions->lock);
        }

        /*
         * Wait for any running method to finish, of course including
         * ->release if it was run by mmu_notifier_release instead of us.
         */
        synchronize_srcu(&srcu);

        BUG_ON(atomic_read(&mm->mm_count) <= 0);

        /* Drops the mm_count reference taken by mmgrab() at registration. */
        mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
        struct mmu_notifier *sub = container_of(rcu, struct mmu_notifier, rcu);
        /* Read the mm out first: the notifier is handed to its free op next. */
        struct mm_struct *owner_mm = sub->mm;

        sub->ops->free_notifier(sub);

        /* Pairs with the get in __mmu_notifier_register() */
        mmdrop(owner_mm);
}

/**
 * mmu_notifier_put - Release the reference on the notifier
 * @subscription: The notifier to act on
 *
 * This function must be paired with each mmu_notifier_get(), it releases the
 * reference obtained by the get. If this is the last reference then the
 * process to free the notifier will be run asynchronously.
 *
 * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
 * when the mm_struct is destroyed. Instead free_notifier is always called to
 * release any resources held by the user.
 *
 * As ops->release is not guaranteed to be called, the user must ensure that
 * all sptes are dropped, and no new sptes can be established before
 * mmu_notifier_put() is called.
 *
 * This function can be called from the ops->release callback, however the
 * caller must still ensure it is called pairwise with mmu_notifier_get().
 *
 * Modules calling this function must call mmu_notifier_synchronize() in
 * their __exit functions to ensure the async work is completed.
 */
void mmu_notifier_put(struct mmu_notifier *subscription)
{
        struct mm_struct *mm = subscription->mm;

        spin_lock(&mm->notifier_subscriptions->lock);
        if (WARN_ON(!subscription->users) || --subscription->users)
                goto out_unlock;
        hlist_del_init_rcu(&subscription->hlist);
        spin_unlock(&mm->notifier_subscriptions->lock);

        call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);
        return;

out_unlock:
        spin_unlock(&mm->notifier_subscriptions->lock);
}
EXPORT_SYMBOL_GPL(mmu_notifier_put);

/*
 * Initialise @interval_sub to cover [start, start + length - 1] on @mm and
 * either insert it into the subscriptions' interval tree or queue it on the
 * deferred list, depending on whether an invalidation is in progress.
 *
 * Returns 0 on success, -EOVERFLOW if @length is zero or the inclusive end of
 * the range overflows, and -EINVAL if mm_users is not positive.
 */
static int __mmu_interval_notifier_insert(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
        unsigned long length, const struct mmu_interval_notifier_ops *ops)
{
        interval_sub->mm = mm;
        interval_sub->ops = ops;
        /*
         * Start with a cleared rb node: the deferred-list processing in
         * mn_itree_inv_end() tests RB_EMPTY_NODE() to choose between insert
         * and remove.
         */
        RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
        interval_sub->interval_tree.start = start;
        /*
         * Note that the representation of the intervals in the interval tree
         * considers the ending point as contained in the interval.
         */
        if (length == 0 ||
            check_add_overflow(start, length - 1,
                               &interval_sub->interval_tree.last))
                return -EOVERFLOW;

        /* Must call with a mmget() held */
        if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
                return -EINVAL;

        /* pairs with mmdrop in mmu_interval_notifier_remove() */
        mmgrab(mm);

        /*
         * If some invalidate_range_start/end region is going on in parallel
         * we don't know what VA ranges are affected, so we must assume this
         * new range is included.
         *
         * If the itree is invalidating then we are not allowed to change
         * it. Retrying until invalidation is done is tricky due to the
         * possibility for live lock, instead defer the add to
         * mn_itree_inv_end() so this algorithm is deterministic.
         *
         * In all cases the value for the interval_sub->invalidate_seq should be
         * odd, see mmu_interval_read_begin()
         */
        spin_lock(&subscriptions->lock);
        if (subscriptions->active_invalidate_ranges) {
                if (mn_itree_is_invalidating(subscriptions))
                        hlist_add_head(&interval_sub->deferred_item,
                                       &subscriptions->deferred_list);
                else {
                        /*
                         * Force the global sequence odd so that the value
                         * copied into the subscription below is odd too.
                         */
                        subscriptions->invalidate_seq |= 1;
                        interval_tree_insert(&interval_sub->interval_tree,
                                             &subscriptions->itree);
                }
                /*
                 * Equal to the global seq, so mmu_interval_read_begin()
                 * treats the new subscription as currently invalidating.
                 */
                interval_sub->invalidate_seq = subscriptions->invalidate_seq;
        } else {
                WARN_ON(mn_itree_is_invalidating(subscriptions));
                /*
                 * The starting seq for a subscription not under invalidation
                 * should be odd, not equal to the current invalidate_seq and
                 * invalidate_seq should not 'wrap' to the new seq any time
                 * soon.
                 */
                interval_sub->invalidate_seq =
                        subscriptions->invalidate_seq - 1;
                interval_tree_insert(&interval_sub->interval_tree,
                                     &subscriptions->itree);
        }
        spin_unlock(&subscriptions->lock);
        return 0;
}

/**
 * mmu_interval_notifier_insert - Insert an interval notifier
 * @interval_sub: Interval subscription to register
 * @start: Starting virtual address to monitor
 * @length: Length of the range to monitor
 * @mm: mm_struct to attach to
 * @ops: Interval notifier operations to be called on matching events
 *
 * This function subscribes the interval notifier for notifications from the
 * mm.  Upon return the ops related to mmu_interval_notifier will be called
 * whenever an event that intersects with the given range occurs.
 *
 * Upon return the interval_sub may not be present in the interval tree yet.
 * The caller must use the normal interval notifier read flow via
 * mmu_interval_read_begin() to establish SPTEs for this range.
 */
int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
                                 struct mm_struct *mm, unsigned long start,
                                 unsigned long length,
                                 const struct mmu_interval_notifier_ops *ops)
{
        struct mmu_notifier_subscriptions *subs;

        might_lock(&mm->mmap_lock);

        subs = smp_load_acquire(&mm->notifier_subscriptions);
        if (!(subs && subs->has_itree)) {
                /* Register in itree mode (NULL notifier) before inserting. */
                int err = mmu_notifier_register(NULL, mm);

                if (err)
                        return err;
                subs = mm->notifier_subscriptions;
        }

        return __mmu_interval_notifier_insert(interval_sub, mm, subs, start,
                                              length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);

/*
 * Variant of mmu_interval_notifier_insert() for callers that already hold
 * the mmap lock of @mm for writing (asserted below).  Registration goes
 * through __mmu_notifier_register() instead of mmu_notifier_register().
 *
 * Returns 0 on success, otherwise the error reported by the registration or
 * insertion step.
 */
int mmu_interval_notifier_insert_locked(
        struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
        unsigned long start, unsigned long length,
        const struct mmu_interval_notifier_ops *ops)
{
        struct mmu_notifier_subscriptions *subs = mm->notifier_subscriptions;

        mmap_assert_write_locked(mm);

        if (!(subs && subs->has_itree)) {
                /* No usable interval tree yet: register with a NULL notifier. */
                int err = __mmu_notifier_register(NULL, mm);

                if (err)
                        return err;
                /* Pick up the subscriptions pointer after registration. */
                subs = mm->notifier_subscriptions;
        }

        return __mmu_interval_notifier_insert(interval_sub, mm, subs, start,
                                              length, ops);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);

/*
 * Report whether the invalidate sequence of @subscriptions has moved on from
 * @seq.  The counter is sampled while holding subscriptions->lock.
 */
static bool
mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
                          unsigned long seq)
{
        bool released;

        spin_lock(&subscriptions->lock);
        released = (seq != subscriptions->invalidate_seq);
        spin_unlock(&subscriptions->lock);

        return released;
}

/**
 * mmu_interval_notifier_remove - Remove an interval notifier
 * @interval_sub: Interval subscription to unregister
 *
 * This function must be paired with mmu_interval_notifier_insert(). It cannot
 * be called from any ops callback.
 *
 * Once this returns ops callbacks are no longer running on other CPUs and
 * will not be called in future.
 *
 * Context: May sleep (annotated with might_sleep()).
 */
void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
{
        struct mm_struct *mm = interval_sub->mm;
        struct mmu_notifier_subscriptions *subscriptions =
                mm->notifier_subscriptions;
        /* Stays 0 unless the removal is deferred and has to be waited for. */
        unsigned long seq = 0;

        might_sleep();

        spin_lock(&subscriptions->lock);
        if (mn_itree_is_invalidating(subscriptions)) {
                /*
                 * remove is being called after insert put this on the
                 * deferred list, but before the deferred list was processed.
                 */
                if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) {
                        hlist_del(&interval_sub->deferred_item);
                } else {
                        /*
                         * The node is in the tree: queue it on the deferred
                         * list and record the current sequence so the wait
                         * below can tell when it has changed.
                         */
                        hlist_add_head(&interval_sub->deferred_item,
                                       &subscriptions->deferred_list);
                        seq = subscriptions->invalidate_seq;
                }
        } else {
                /* Not invalidating: the node is removed from the tree now. */
                WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb));
                interval_tree_remove(&interval_sub->interval_tree,
                                     &subscriptions->itree);
        }
        spin_unlock(&subscriptions->lock);

        /*
         * The possible sleep on progress in the invalidation requires the
         * caller not hold any locks held by invalidation callbacks.
         */
        lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
        lock_map_release(&__mmu_notifier_invalidate_range_start_map);
        /* Deferred removal: sleep until invalidate_seq differs from @seq. */
        if (seq)
                wait_event(subscriptions->wq,
                           mmu_interval_seq_released(subscriptions, seq));

        /* pairs with mmgrab in mmu_interval_notifier_insert() */
        mmdrop(mm);
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);

/**
 * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
 *
 * This function ensures that all outstanding async SRCU work from
 * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
 * associated with an unused mmu_notifier will no longer be called.
 *
 * Before using the caller must ensure that all of its mmu_notifiers have been
 * fully released via mmu_notifier_put().
 *
 * Modules using the mmu_notifier_put() API should call this in their __exit
 * function to avoid module unloading races.
 */
void mmu_notifier_synchronize(void)
{
        /* Wait for the SRCU domain used by this file to quiesce. */
        synchronize_srcu(&srcu);
}
EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);

































































































































































































































































































































































































































































































































































































    4 







































































































































































  334 

  335 























































  317 

























































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
/* SPDX-License-Identifier: GPL-2.0+ */
#ifndef _LINUX_MAPLE_TREE_H
#define _LINUX_MAPLE_TREE_H
/*
 * Maple Tree - An RCU-safe adaptive tree for storing ranges
 * Copyright (c) 2018-2022 Oracle
 * Authors:     Liam R. Howlett <Liam.Howlett@Oracle.com>
 *              Matthew Wilcox <willy@infradead.org>
 */

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>
/* #define CONFIG_MAPLE_RCU_DISABLED */

/*
 * Allocated nodes are mutable until they have been inserted into the tree,
 * at which time they cannot change their type until they have been removed
 * from the tree and an RCU grace period has passed.
 *
 * Removed nodes have their ->parent set to point to themselves.  RCU readers
 * check ->parent before relying on the value that they loaded from the
 * slots array.  This lets us reuse the slots array for the RCU head.
 *
 * Nodes in the tree point to their parent unless bit 0 is set.
 */
/*
 * Slot counts for each node layout.  The counts depend on the word size; the
 * trailing comments give the intended size of the resulting node.
 * MAPLE_ALLOC_SLOTS sizes the slot[] array of struct maple_alloc.
 */
#if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64)
/* 64bit sizes */
#define MAPLE_NODE_SLOTS        31        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        16        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        10        /* 240 bytes */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 1)
#else
/* 32bit sizes */
#define MAPLE_NODE_SLOTS        63        /* 256 bytes including ->parent */
#define MAPLE_RANGE64_SLOTS        32        /* 256 bytes */
#define MAPLE_ARANGE64_SLOTS        21        /* 240 bytes */
#define MAPLE_ALLOC_SLOTS        (MAPLE_NODE_SLOTS - 2)
#endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */

/*
 * NOTE(review): presumably selects the low 8 bits of a 256-byte aligned node
 * pointer - confirm against the users of this mask.
 */
#define MAPLE_NODE_MASK                255UL

/*
 * The node->parent of the root node has bit 0 set and the rest of the pointer
 * is a pointer to the tree itself.  No more bits are available in this pointer
 * (on m68k, the data structure may only be 2-byte aligned).
 *
 * Internal non-root nodes can only have maple_range_* nodes as parents.  The
 * parent pointer is 256B aligned like all other tree nodes.  When storing 32
 * or 64 bit values, the offset can fit into 4 bits.  The 16 bit values need an
 * extra bit to store the offset.  This extra bit comes from a reuse of the last
 * bit in the node type.  This is possible by using bit 1 to indicate if bit 2
 * is part of the type or the slot.
 *
 * Once the type is decided, the decision of an allocation range type or a
 * range type is done by examining the immutable tree flag for the
 * MT_FLAGS_ALLOC_RANGE flag.
 *
 *  Node types:
 *   0x??1 = Root
 *   0x?00 = 16 bit nodes
 *   0x010 = 32 bit nodes
 *   0x110 = 64 bit nodes
 *
 *  Slot size and location in the parent pointer:
 *   type  : slot location
 *   0x??1 : Root
 *   0x?00 : 16 bit values, type in 0-1, slot in 2-6
 *   0x010 : 32 bit values, type in 0-2, slot in 3-6
 *   0x110 : 64 bit values, type in 0-2, slot in 3-6
 */

/*
 * This metadata is used to optimize the gap updating code and in reverse
 * searching for gaps or any other code that needs to find the end of the data.
 */
struct maple_metadata {
        unsigned char end;        /* presumably the offset of the end of the data - confirm */
        unsigned char gap;        /* presumably the offset of the largest gap - confirm */
};

/*
 * Leaf nodes do not store pointers to nodes, they store user data.  Users may
 * store almost any bit pattern.  As noted above, the optimisation of storing an
 * entry at 0 in the root pointer cannot be done for data which have the bottom
 * two bits set to '10'.  We also reserve values with the bottom two bits set to
 * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use.  Some APIs
 * return errnos as a negative errno shifted right by two bits and the bottom
 * two bits set to '10', and while choosing to store these values in the array
 * is not an error, it may lead to confusion if you're testing for an error with
 * mas_is_err().
 *
 * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits
 * 3-6), bit 2 is reserved.  That leaves bits 0-1 unused for now.
 *
 * In regular B-Tree terms, pivots are called keys.  The term pivot is used to
 * indicate that the tree is specifying ranges.  Pivots may appear in the
 * subtree with an entry attached to the value whereas keys are unique to a
 * specific position of a B-tree.  Pivot values are inclusive of the slot with
 * the same index.
 */

/*
 * Range node layout: MAPLE_RANGE64_SLOTS slots with one fewer pivots.  The
 * union places the metadata where the final slot would be, so the two share
 * storage.
 */
struct maple_range_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_RANGE64_SLOTS - 1];
        union {
                /* Full view: every slot is available. */
                void __rcu *slot[MAPLE_RANGE64_SLOTS];
                /* Metadata view: pad covers all but the last slot. */
                struct {
                        void __rcu *pad[MAPLE_RANGE64_SLOTS - 1];
                        struct maple_metadata meta;
                };
        };
};

/*
 * At tree creation time, the user can specify that they're willing to trade off
 * storing fewer entries in a tree in return for storing more information in
 * each node.
 *
 * The maple tree supports recording the largest range of NULL entries available
 * in this node, also called gaps.  This optimises the tree for allocating a
 * range.
 *
 * Unlike struct maple_range_64, the metadata here has its own field rather
 * than sharing storage with the last slot.
 */
struct maple_arange_64 {
        struct maple_pnode *parent;
        unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1];
        void __rcu *slot[MAPLE_ARANGE64_SLOTS];
        unsigned long gap[MAPLE_ARANGE64_SLOTS];        /* one gap value per slot */
        struct maple_metadata meta;
};

/*
 * Node-shaped container for nodes allocated ahead of an operation.  The
 * comment on struct ma_state in this file describes how these are chained
 * through slot[0].
 */
struct maple_alloc {
        unsigned long total;                /* total number of nodes allocated */
        unsigned char node_count;        /* number of allocated nodes held in this node */
        unsigned int request_count;        /* presumably the number of nodes still requested - confirm */
        struct maple_alloc *slot[MAPLE_ALLOC_SLOTS];
};

/*
 * Minimal node overlay holding a parent pointer and a link to another node.
 * NOTE(review): presumably used to chain nodes that were cut from the tree
 * (see struct ma_topiary) - confirm at the users.
 */
struct maple_topiary {
        struct maple_pnode *parent;
        struct maple_enode *next; /* Overlaps the pivot */
};

/*
 * Node types.  NOTE(review): the mapping below to the node kinds listed in
 * the comment above struct maple_node is inferred from the names - confirm.
 */
enum maple_type {
        maple_dense,
        maple_leaf_64,                /* presumably a leaf range node */
        maple_range_64,                /* presumably a non-leaf range node */
        maple_arange_64,        /* presumably a non-leaf allocation range node */
};

/*
 * The type of store needed for an operation; recorded in
 * ma_state.store_type.  wr_invalid is the value MA_STATE() starts with.
 */
enum store_type {
        wr_invalid,
        wr_new_root,
        wr_store_root,
        wr_exact_fit,
        wr_spanning_store,
        wr_split_store,
        wr_rebalance,
        wr_append,
        wr_node_store,
        wr_slot_store,
};

/**
 * DOC: Maple tree flags
 *
 * * MT_FLAGS_ALLOC_RANGE        - Track gaps in this tree
 * * MT_FLAGS_USE_RCU                - Operate in RCU mode
 * * MT_FLAGS_HEIGHT_OFFSET        - The position of the tree height in the flags
 * * MT_FLAGS_HEIGHT_MASK        - The mask for the maple tree height value
 * * MT_FLAGS_LOCK_MASK                - How the mt_lock is used
 * * MT_FLAGS_LOCK_IRQ                - Acquired irq-safe
 * * MT_FLAGS_LOCK_BH                - Acquired bh-safe
 * * MT_FLAGS_LOCK_EXTERN        - mt_lock is not used
 *
 * MAPLE_HEIGHT_MAX        The largest height that can be stored
 */
#define MT_FLAGS_ALLOC_RANGE        0x01
#define MT_FLAGS_USE_RCU        0x02
/* A bit position (shift count), not a mask: the height field starts at bit 2. */
#define MT_FLAGS_HEIGHT_OFFSET        0x02
#define MT_FLAGS_HEIGHT_MASK        0x7C
#define MT_FLAGS_LOCK_MASK        0x300
#define MT_FLAGS_LOCK_IRQ        0x100
#define MT_FLAGS_LOCK_BH        0x200
#define MT_FLAGS_LOCK_EXTERN        0x300
/* NOTE(review): not described in the DOC list above; meaning not visible here. */
#define MT_FLAGS_ALLOC_WRAPPED        0x0800

/* Equals MT_FLAGS_HEIGHT_MASK >> MT_FLAGS_HEIGHT_OFFSET. */
#define MAPLE_HEIGHT_MAX        31


/* The node type is kept in bits 3-6: shift by 3, then mask four bits. */
#define MAPLE_NODE_TYPE_MASK        0x0F
#define MAPLE_NODE_TYPE_SHIFT        0x03

/* Upper bound of the values reserved for internal use (see the leaf node notes). */
#define MAPLE_RESERVED_RANGE        4096

/*
 * Lockdep helpers for trees protected by an external lock.
 *
 * With CONFIG_LOCKDEP, ma_external_lock points at the lockdep map of the
 * external lock; the *_is_held() checks pass when no external lock has been
 * registered or when lockdep reports it held.  Without CONFIG_LOCKDEP the
 * checks are constant 1 and the setters do nothing.
 */
#ifdef CONFIG_LOCKDEP
typedef struct lockdep_map *lockdep_map_p;
#define mt_lock_is_held(mt)                                             \
        (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock))

/* NOTE(review): the 0 presumably selects the write/exclusive hold type - confirm. */
#define mt_write_lock_is_held(mt)                                        \
        (!(mt)->ma_external_lock ||                                        \
         lock_is_held_type((mt)->ma_external_lock, 0))

/* @mt is a pointer to the tree; @lock is a pointer to a lock with a dep_map. */
#define mt_set_external_lock(mt, lock)                                        \
        (mt)->ma_external_lock = &(lock)->dep_map

/* Note: takes the tree object itself, not a pointer to it. */
#define mt_on_stack(mt)                        (mt).ma_external_lock = NULL
#else
typedef struct { /* nothing */ } lockdep_map_p;
#define mt_lock_is_held(mt)                1
#define mt_write_lock_is_held(mt)        1
#define mt_set_external_lock(mt, lock)        do { } while (0)
#define mt_on_stack(mt)                        do { } while (0)
#endif

/*
 * If the tree contains a single entry at index 0, it is usually stored in
 * tree->ma_root.  To optimise for the page cache, an entry which ends in '00',
 * '01' or '11' is stored in the root, but an entry which ends in '10' will be
 * stored in a node.  Bits 3-6 are used to store enum maple_type.
 *
 * The flags are used both to store some immutable information about this tree
 * (set at tree creation time) and dynamic information set under the spinlock.
 *
 * Another use of flags are to indicate global states of the tree.  This is the
 * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in
 * RCU mode.  This mode was added to allow the tree to reuse nodes instead of
 * re-allocating and RCU freeing nodes when there is a single user.
 */
struct maple_tree {
        /* Either the tree's own spinlock or a handle for an external lock. */
        union {
                spinlock_t        ma_lock;
                lockdep_map_p        ma_external_lock;
        };
        unsigned int        ma_flags;        /* MT_FLAGS_* bits */
        void __rcu      *ma_root;        /* NULL when the tree is empty (see mtree_empty()) */
};

/**
 * MTREE_INIT() - Initialize a maple tree
 * @name: The maple tree name
 * @__flags: The maple tree flags
 *
 * Expands to a designated initializer for a struct maple_tree: the internal
 * spinlock is unlocked, the flags are set to @__flags and the root is NULL.
 */
#define MTREE_INIT(name, __flags) {                                        \
        .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock),                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}

/**
 * MTREE_INIT_EXT() - Initialize a maple tree with an external lock.
 * @name: The tree name
 * @__flags: The maple tree flags
 * @__lock: The external lock
 *
 * With CONFIG_LOCKDEP the lockdep map of @__lock is recorded in
 * ma_external_lock.  Without it, @__lock is ignored and this is the same as
 * MTREE_INIT().
 */
#ifdef CONFIG_LOCKDEP
#define MTREE_INIT_EXT(name, __flags, __lock) {                                \
        .ma_external_lock = &(__lock).dep_map,                                \
        .ma_flags = (__flags),                                                \
        .ma_root = NULL,                                                \
}
#else
#define MTREE_INIT_EXT(name, __flags, __lock)        MTREE_INIT(name, __flags)
#endif

/* Define a struct maple_tree called @name, initialised with no flags set. */
#define DEFINE_MTREE(name)                                                \
        struct maple_tree name = MTREE_INIT(name, 0)

/*
 * Wrappers around the tree's internal spinlock (ma_lock).  @mt is a pointer
 * to the struct maple_tree; @subclass is passed through to
 * spin_lock_nested().
 *
 * mtree_lock_nested() names its first parameter @mt so that the expansion
 * uses the caller's argument rather than whatever variable called "mt"
 * happens to be in scope.
 */
#define mtree_lock(mt)                spin_lock((&(mt)->ma_lock))
#define mtree_lock_nested(mt, subclass) \
                spin_lock_nested((&(mt)->ma_lock), subclass)
#define mtree_unlock(mt)        spin_unlock((&(mt)->ma_lock))

/*
 * The Maple Tree squeezes various bits in at various points which aren't
 * necessarily obvious.  Usually, this is done by observing that pointers are
 * N-byte aligned and thus the bottom log_2(N) bits are available for use.  We
 * don't use the high bits of pointers to store additional information because
 * we don't know what bits are unused on any given architecture.
 *
 * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8
 * low bits for our own purposes.  Nodes are currently of 4 types:
 * 1. Single pointer (Range is 0-0)
 * 2. Non-leaf Allocation Range nodes
 * 3. Non-leaf Range nodes
 * 4. Leaf Range nodes
 *
 * All nodes consist of a number of node slots, pivots, and a parent pointer.
 */

struct maple_node {
        /* Every member of this union is a different view of the same storage. */
        union {
                /* Generic view: parent pointer followed by slots. */
                struct {
                        struct maple_pnode *parent;
                        void __rcu *slot[MAPLE_NODE_SLOTS];
                };
                /*
                 * View with an RCU head: pad sits where ->parent is, and the
                 * remaining fields share storage with the slot array.
                 */
                struct {
                        void *pad;
                        struct rcu_head rcu;
                        struct maple_enode *piv_parent;
                        unsigned char parent_slot;
                        enum maple_type type;
                        unsigned char slot_len;
                        unsigned int ma_flags;
                };
                struct maple_range_64 mr64;
                struct maple_arange_64 ma64;
                struct maple_alloc alloc;
        };
};

/*
 * More complicated stores can cause two nodes to become one or three and
 * potentially alter the height of the tree.  Either half of the tree may need
 * to be rebalanced against the other.  The ma_topiary struct is used to track
 * which nodes have been 'cut' from the tree so that the change can be done
 * safely at a later date.  This is done to support RCU.
 */
struct ma_topiary {
        struct maple_enode *head;        /* presumably the first node of the cut list - confirm */
        struct maple_enode *tail;        /* presumably the last node of the cut list - confirm */
        struct maple_tree *mtree;        /* the tree passed to MA_TOPIARY() */
};

void *mtree_load(struct maple_tree *mt, unsigned long index);

int mtree_insert(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
int mtree_insert_range(struct maple_tree *mt, unsigned long first,
                unsigned long last, void *entry, gfp_t gfp);
int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);
int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);
int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp,
                void *entry, unsigned long size, unsigned long min,
                unsigned long max, gfp_t gfp);

int mtree_store_range(struct maple_tree *mt, unsigned long first,
                      unsigned long last, void *entry, gfp_t gfp);
int mtree_store(struct maple_tree *mt, unsigned long index,
                void *entry, gfp_t gfp);
void *mtree_erase(struct maple_tree *mt, unsigned long index);

int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);
int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp);

void mtree_destroy(struct maple_tree *mt);
void __mt_destroy(struct maple_tree *mt);

/**
 * mtree_empty() - Check whether a tree holds no entries.
 * @mt: Maple Tree.
 *
 * The check is a single test of the root pointer.
 *
 * Context: Any context.
 * Return: %true if the tree contains only NULL pointers.
 */
static inline bool mtree_empty(const struct maple_tree *mt)
{
        return !mt->ma_root;
}

/* Advanced API */

/*
 * Maple State Status
 *
 * ma_active:    the maple state is pointing to a node and offset and can
 *               continue operating on the tree.
 * ma_start:     we have not searched the tree.
 * ma_root:      we have searched the tree and the entry we found lives in
 *               the root of the tree (ie it has index 0, length 1 and is the
 *               only entry in the tree).
 * ma_none:      we have searched the tree and there is no node in the tree
 *               for this entry.  For example, we searched for index 1 in an
 *               empty tree.  Or we have a tree which points to a full leaf
 *               node and we searched for an entry which is larger than can be
 *               contained in that leaf node.
 * ma_pause:     the data within the maple state may be stale; restart the
 *               operation.
 * ma_overflow:  the search has reached the upper limit of the search.
 * ma_underflow: the search has reached the lower limit of the search.
 * ma_error:     there was an error; check the node for the error number.
 */
enum maple_status {
        ma_active,
        ma_start,
        ma_root,
        ma_none,
        ma_pause,
        ma_overflow,
        ma_underflow,
        ma_error,
};

/*
 * The maple state is defined in the struct ma_state and is used to keep track
 * of information during operations, and even between operations when using the
 * advanced API.
 *
 * If state->node has bit 0 set then it references a tree location which is not
 * a node (eg the root).  If bit 1 is set, the rest of the bits are a negative
 * errno.  Bit 2 (the 'unallocated slots' bit) is clear.  Bits 3-6 indicate the
 * node type.
 *
 * state->alloc either has a request number of nodes or an allocated node.  If
 * state->alloc has a requested number of nodes, the first bit will be set (0x1)
 * and the remaining bits are the value.  If state->alloc is a node, then the
 * node will be of type maple_alloc.  maple_alloc has MAPLE_NODE_SLOTS - 1 for
 * storing more allocated nodes, a total number of nodes allocated, and the
 * node_count in this node.  node_count is the number of allocated nodes in this
 * node.  The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further
 * nodes into state->alloc->slot[0]'s node.  Nodes are taken from state->alloc
 * by removing a node from the state->alloc node until state->alloc->node_count
 * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted
 * to state->alloc.  Nodes are pushed onto state->alloc by putting the current
 * state->alloc into the pushed node's slot[0].
 *
 * The state also contains the implied min/max of the state->node, the depth of
 * this search, and the offset. The implied min/max are either from the parent
 * node or are 0-oo for the root node.  The depth is incremented or decremented
 * every time a node is walked down or up.  The offset is the slot/pivot of
 * interest in the node - either for reading or writing.
 *
 * When returning a value the maple state index and last respectively contain
 * the start and end of the range for the entry.  Ranges are inclusive in the
 * Maple Tree.
 *
 * The status of the state is used to determine how the next action should treat
 * the state.  For instance, if the status is ma_start then the next action
 * should start at the root of the tree and walk down.  If the status is
 * ma_pause then the node may be stale data and should be discarded.  If the
 * status is ma_overflow, then the last action hit the upper limit.
 *
 */
/* Per-operation cursor into a maple tree; see the description above. */
struct ma_state {
        struct maple_tree *tree;        /* The tree we're operating in */
        unsigned long index;                /* The index we're operating on - range start */
        unsigned long last;                /* The last index we're operating on - range end */
        struct maple_enode *node;        /* The node containing this entry */
        unsigned long min;                /* The minimum index of this node - implied pivot min */
        unsigned long max;                /* The maximum index of this node - implied pivot max */
        struct maple_alloc *alloc;        /* Allocated nodes for this operation */
        enum maple_status status;        /* The status of the state (active, start, none, etc) */
        unsigned char depth;                /* depth of tree descent during write */
        unsigned char offset;                /* The slot/pivot of interest in the node */
        unsigned char mas_flags;        /* NOTE(review): meaning not visible in this header */
        unsigned char end;                /* The end of the node */
        enum store_type store_type;        /* The type of store needed for this operation */
};

/*
 * Write-side state: wraps a struct ma_state together with the decoded node,
 * the range being written and the entry involved.  Set up with MA_WR_STATE().
 */
struct ma_wr_state {
        struct ma_state *mas;
        struct maple_node *node;        /* Decoded mas->node */
        unsigned long r_min;                /* range min */
        unsigned long r_max;                /* range max */
        enum maple_type type;                /* mas->node type */
        unsigned char offset_end;        /* The offset where the write ends */
        unsigned long *pivots;                /* mas->node->pivots pointer */
        unsigned long end_piv;                /* The pivot at the offset end */
        void __rcu **slots;                /* mas->node->slots pointer */
        void *entry;                        /* The entry to write */
        void *content;                        /* The existing entry that is being overwritten */
};

/*
 * Lock helpers that reach the tree's internal spinlock through a maple
 * state.  @mas is a pointer to a struct ma_state whose ->tree is set.
 */
#define mas_lock(mas)           spin_lock(&((mas)->tree->ma_lock))
#define mas_lock_nested(mas, subclass) \
                spin_lock_nested(&((mas)->tree->ma_lock), subclass)
#define mas_unlock(mas)         spin_unlock(&((mas)->tree->ma_lock))

/*
 * Special values for ma_state.node.
 * MA_ERROR represents an errno.  After dropping the lock and attempting
 * to resolve the error, the walk would have to be restarted from the
 * top of the tree as the tree may have been modified.
 *
 * The value is encoded as (err << 2) with the bottom two bits set to '10'.
 * @err is parenthesized so that the cast and shift apply to the whole
 * argument even when it is a compound expression.
 */
#define MA_ERROR(err) \
                ((struct maple_enode *)(((unsigned long)(err) << 2) | 2UL))

/*
 * Declare a struct ma_state called @name for tree @mt (a pointer) covering
 * the range @first to @end.  The state begins in ma_start with no node, the
 * implied limits 0 and ULONG_MAX, no allocations and store type wr_invalid.
 * Members not listed here are zero-initialised by the language rules.
 */
#define MA_STATE(name, mt, first, end)                                        \
        struct ma_state name = {                                        \
                .tree = mt,                                                \
                .index = first,                                                \
                .last = end,                                                \
                .node = NULL,                                                \
                .status = ma_start,                                        \
                .min = 0,                                                \
                .max = ULONG_MAX,                                        \
                .alloc = NULL,                                                \
                .mas_flags = 0,                                                \
                .store_type = wr_invalid,                                \
        }

/*
 * Declare a struct ma_wr_state called @name that wraps @ma_state (a pointer
 * to a struct ma_state) and will write @wr_entry.  The existing content
 * starts out NULL; the remaining members are zero-initialised.
 */
#define MA_WR_STATE(name, ma_state, wr_entry)                                \
        struct ma_wr_state name = {                                        \
                .mas = ma_state,                                        \
                .content = NULL,                                        \
                .entry = wr_entry,                                        \
        }

/*
 * Declare an empty struct ma_topiary called @name (head and tail NULL) that
 * is bound to @tree, a pointer to a struct maple_tree.
 */
#define MA_TOPIARY(name, tree)                                                \
        struct ma_topiary name = {                                        \
                .head = NULL,                                                \
                .tail = NULL,                                                \
                .mtree = tree,                                                \
        }

void *mas_walk(struct ma_state *mas);
void *mas_store(struct ma_state *mas, void *entry);
void *mas_erase(struct ma_state *mas);
int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp);
void mas_store_prealloc(struct ma_state *mas, void *entry);
void *mas_find(struct ma_state *mas, unsigned long max);
void *mas_find_range(struct ma_state *mas, unsigned long max);
void *mas_find_rev(struct ma_state *mas, unsigned long min);
/* Reverse-direction search: as with mas_find_rev(), the limit is a lower bound. */
void *mas_find_range_rev(struct ma_state *mas, unsigned long min);
int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp);
int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp,
                void *entry, unsigned long range_lo, unsigned long range_hi,
                unsigned long *next, gfp_t gfp);

bool mas_nomem(struct ma_state *mas, gfp_t gfp);
void mas_pause(struct ma_state *mas);
void maple_tree_init(void);
void mas_destroy(struct ma_state *mas);
int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries);

void *mas_prev(struct ma_state *mas, unsigned long min);
/* Backward step: as with mas_prev(), the limit is a lower bound. */
void *mas_prev_range(struct ma_state *mas, unsigned long min);
void *mas_next(struct ma_state *mas, unsigned long max);
void *mas_next_range(struct ma_state *mas, unsigned long max);

int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max,
                   unsigned long size);
/*
 * This finds an empty area from the highest address to the lowest.
 * AKA "Topdown" version.
 */
int mas_empty_area_rev(struct ma_state *mas, unsigned long min,
                       unsigned long max, unsigned long size);

/*
 * Prepare @mas for a fresh walk of @tree at the single index @addr.  The
 * whole state is zeroed first; index and last are then both set to @addr,
 * the upper limit is opened to ULONG_MAX and the status becomes ma_start.
 */
static inline void mas_init(struct ma_state *mas, struct maple_tree *tree,
                            unsigned long addr)
{
        memset(mas, 0, sizeof(*mas));

        mas->node = NULL;
        mas->status = ma_start;
        mas->max = ULONG_MAX;
        mas->last = addr;
        mas->index = addr;
        mas->tree = tree;
}

static inline bool mas_is_active(struct ma_state *mas)
{
        return mas->status == ma_active;
}

/* Return true when the state's status field equals ma_error. */
static inline bool mas_is_err(struct ma_state *mas)
{
        return mas->status == ma_error;
}

/**
 * mas_reset() - Put a Maple Tree operation state back to its start state.
 * @mas: Maple Tree operation state.
 *
 * Clears the cached node and sets the status to ma_start, discarding any
 * error or walk position, so the next walk begins at the root again.
 * Call this before reusing a ma_state after the lock has been dropped.
 *
 * Context: Any context.
 */
static __always_inline void mas_reset(struct ma_state *mas)
{
        mas->node = NULL;
        mas->status = ma_start;
}

/**
 * mas_for_each() - Iterate over a range of the maple tree.
 * @__mas: Maple Tree operation state (struct ma_state pointer)
 * @__entry: Entry retrieved from the tree
 * @__max: maximum index to retrieve from the tree
 *
 * Expands to a while loop that assigns the result of mas_find() to @__entry
 * and stops as soon as that result is NULL.
 *
 * When returned, mas->index and mas->last will hold the entire range for the
 * entry.
 *
 * Note: may return the zero entry.
 */
#define mas_for_each(__mas, __entry, __max) \
        while (((__entry) = mas_find((__mas), (__max))) != NULL)

/**
 * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order.
 * @__mas: Maple Tree operation state (struct ma_state pointer)
 * @__entry: Entry retrieved from the tree
 * @__min: minimum index to retrieve from the tree
 *
 * Expands to a while loop that assigns the result of mas_find_rev() to
 * @__entry and stops as soon as that result is NULL.
 *
 * When returned, mas->index and mas->last will hold the entire range for the
 * entry.
 *
 * Note: may return the zero entry.
 */
#define mas_for_each_rev(__mas, __entry, __min) \
        while (((__entry) = mas_find_rev((__mas), (__min))) != NULL)

#ifdef CONFIG_DEBUG_MAPLE_TREE
/*
 * Output format selector passed to mt_dump().
 * NOTE(review): presumably decimal vs. hexadecimal number formatting - confirm
 * against the mt_dump() implementation.
 */
enum mt_dump_format {
        mt_dump_dec,
        mt_dump_hex,
};

/*
 * Counters updated by the MT_/MAS_/MAS_WR_ BUG_ON and WARN_ON macros below:
 * every check increments maple_tree_tests_run, and a check whose condition is
 * false also increments maple_tree_tests_passed.
 */
extern atomic_t maple_tree_tests_run;
extern atomic_t maple_tree_tests_passed;

/* Debug helpers available only with CONFIG_DEBUG_MAPLE_TREE. */
void mt_dump(const struct maple_tree *mt, enum mt_dump_format format);
void mas_dump(const struct ma_state *mas);
void mas_wr_dump(const struct ma_wr_state *wr_mas);
void mt_validate(struct maple_tree *mt);
void mt_cache_shrink(void);
/*
 * MT_BUG_ON() - Counted debug check on a maple tree.
 * @__tree: tree passed to mt_dump() when the check fires
 * @__x: condition; true means failure
 *
 * Always increments maple_tree_tests_run.  When @__x is true it logs the
 * function and line, dumps the tree in hex, prints the pass/run counters and
 * a stack trace; otherwise it increments maple_tree_tests_passed.  This debug
 * variant does not call BUG_ON().
 *
 * Note: @__x is evaluated a second time for the log line when it is true, so
 * avoid conditions with side effects.  It is printed with %u.
 */
#define MT_BUG_ON(__tree, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MAS_BUG_ON() - Counted debug check on a maple state.
 * @__mas: maple state; dumped with mas_dump(), and its tree with mt_dump()
 * @__x: condition; true means failure
 *
 * Always increments maple_tree_tests_run.  When @__x is true it logs the
 * function and line, dumps the state and its tree, prints the pass/run
 * counters and a stack trace; otherwise it increments
 * maple_tree_tests_passed.  This debug variant does not call BUG_ON().
 *
 * Note: @__x is evaluated a second time for the log line when it is true, so
 * avoid conditions with side effects.  It is printed with %u.
 */
#define MAS_BUG_ON(__mas, __x) do {                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MAS_WR_BUG_ON() - Counted debug check on a maple write state.
 * @__wrmas: write state; dumped with mas_wr_dump(), its ->mas with mas_dump()
 *           and that state's tree with mt_dump()
 * @__x: condition; true means failure
 *
 * Always increments maple_tree_tests_run.  When @__x is true it logs the
 * function and line, emits the three dumps, prints the pass/run counters and
 * a stack trace; otherwise it increments maple_tree_tests_passed.  This debug
 * variant does not call BUG_ON().
 *
 * Note: @__x is evaluated a second time for the log line when it is true, so
 * avoid conditions with side effects.  It is printed with %u.
 */
#define MAS_WR_BUG_ON(__wrmas, __x) do {                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__x) {                                                        \
                pr_info("BUG at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
} while (0)

/*
 * MT_WARN_ON() - Counted debug warning on a maple tree.
 * @__tree: tree passed to mt_dump() when the check fires
 * @__x: condition; true means failure
 *
 * Always increments maple_tree_tests_run.  When @__x is true it logs the
 * function and line, dumps the tree in hex, prints the pass/run counters and
 * a stack trace; otherwise it increments maple_tree_tests_passed.
 *
 * The statement expression evaluates to the truth value of @__x, wrapped in
 * unlikely(), so it can be used directly as an if () condition.
 *
 * The local is named __mt_warn_ret rather than "ret" so that a condition
 * mentioning the caller's own "ret" is not captured by the macro's
 * declaration (which would read an uninitialised variable).
 *
 * Note: @__x is evaluated a second time for the log line when it is true, so
 * avoid conditions with side effects.  It is printed with %u.
 */
#define MT_WARN_ON(__tree, __x)  ({                                        \
        int __mt_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mt_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mt_dump(__tree, mt_dump_hex);                                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mt_warn_ret);                                        \
})

/*
 * MAS_WARN_ON() - Counted debug warning on a maple state.
 * @__mas: maple state; dumped with mas_dump(), and its tree with mt_dump()
 * @__x: condition; true means failure
 *
 * Always increments maple_tree_tests_run.  When @__x is true it logs the
 * function and line, dumps the state and its tree, prints the pass/run
 * counters and a stack trace; otherwise it increments
 * maple_tree_tests_passed.
 *
 * The statement expression evaluates to the truth value of @__x, wrapped in
 * unlikely(), so it can be used directly as an if () condition.
 *
 * The local is named __mas_warn_ret rather than "ret" so that a condition
 * mentioning the caller's own "ret" is not captured by the macro's
 * declaration (which would read an uninitialised variable).
 *
 * Note: @__x is evaluated a second time for the log line when it is true, so
 * avoid conditions with side effects.  It is printed with %u.
 */
#define MAS_WARN_ON(__mas, __x) ({                                        \
        int __mas_warn_ret = !!(__x);                                        \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_warn_ret) {                                                \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_dump(__mas);                                        \
                mt_dump((__mas)->tree, mt_dump_hex);                        \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_warn_ret);                                        \
})

/*
 * MAS_WR_WARN_ON() - Counted debug warning on a maple write state.
 * @__wrmas: write state; dumped with mas_wr_dump(), its ->mas with mas_dump()
 *           and that state's tree with mt_dump()
 * @__x: condition; true means failure
 *
 * Always increments maple_tree_tests_run.  When @__x is true it logs the
 * function and line, emits the three dumps, prints the pass/run counters and
 * a stack trace; otherwise it increments maple_tree_tests_passed.
 *
 * The statement expression evaluates to the truth value of @__x, wrapped in
 * unlikely(), so it can be used directly as an if () condition.
 *
 * The local is named __mas_wr_warn_ret rather than "ret" so that a condition
 * mentioning the caller's own "ret" is not captured by the macro's
 * declaration (which would read an uninitialised variable).
 *
 * Note: @__x is evaluated a second time for the log line when it is true, so
 * avoid conditions with side effects.  It is printed with %u.
 */
#define MAS_WR_WARN_ON(__wrmas, __x) ({                                        \
        int __mas_wr_warn_ret = !!(__x);                                \
        atomic_inc(&maple_tree_tests_run);                                \
        if (__mas_wr_warn_ret) {                                        \
                pr_info("WARN at %s:%d (%u)\n",                                \
                __func__, __LINE__, __x);                                \
                mas_wr_dump(__wrmas);                                        \
                mas_dump((__wrmas)->mas);                                \
                mt_dump((__wrmas)->mas->tree, mt_dump_hex);                \
                pr_info("Pass: %u Run:%u\n",                                \
                        atomic_read(&maple_tree_tests_passed),                \
                        atomic_read(&maple_tree_tests_run));                \
                dump_stack();                                                \
        } else {                                                        \
                atomic_inc(&maple_tree_tests_passed);                        \
        }                                                                \
        unlikely(__mas_wr_warn_ret);                                        \
})
#else
/*
 * Without CONFIG_DEBUG_MAPLE_TREE the checks collapse to plain BUG_ON() /
 * WARN_ON() on the condition; the tree/state argument is ignored and no
 * counters are touched.  (The MAS_WR_* fallbacks name their first parameter
 * __mas where the debug versions use __wrmas; it is unused either way.)
 */
#define MT_BUG_ON(__tree, __x)                BUG_ON(__x)
#define MAS_BUG_ON(__mas, __x)                BUG_ON(__x)
#define MAS_WR_BUG_ON(__mas, __x)        BUG_ON(__x)
#define MT_WARN_ON(__tree, __x)                WARN_ON(__x)
#define MAS_WARN_ON(__mas, __x)                WARN_ON(__x)
#define MAS_WR_WARN_ON(__mas, __x)        WARN_ON(__x)
#endif /* CONFIG_DEBUG_MAPLE_TREE */

/**
 * __mas_set_range() - Narrow the Maple Tree operation state to a sub-range
 * of its current location.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * Overwrites mas->index and mas->last without resetting the walk state.
 * A warning is raised if the state is active and @start does not lie within
 * the range the state currently covers.
 * Use mas_set_range() instead when the position in the tree is unknown.
 */
static inline void __mas_set_range(struct ma_state *mas, unsigned long start,
                unsigned long last)
{
        /* An active state must already cover @start: index <= start <= last */
        MAS_WARN_ON(mas, mas_is_active(mas) &&
                   !(mas->index <= start && start <= mas->last));

        mas->last = last;
        mas->index = start;
}

/**
 * mas_set_range() - Point the Maple Tree operation state at a new range.
 * @mas: Maple Tree operation state.
 * @start: New start of range in the Maple Tree.
 * @last: New end of range in the Maple Tree.
 *
 * Resets the state first, so the next operation walks from the top of the
 * tree, and then records [@start, @last] as the range of interest.
 * See mas_next() to move to an adjacent index instead.
 */
static inline void mas_set_range(struct ma_state *mas, unsigned long start,
                                 unsigned long last)
{
        /* Reset before narrowing so __mas_set_range() sees an inactive state. */
        mas_reset(mas);

        __mas_set_range(mas, start, last);
}

/**
 * mas_set() - Point the Maple Tree operation state at a single index.
 * @mas: Maple Tree operation state.
 * @index: New index into the Maple Tree.
 *
 * Equivalent to mas_set_range() with @index as both start and end, so the
 * next operation walks from the top of the tree.  See mas_next() to move to
 * an adjacent index instead.
 */
static inline void mas_set(struct ma_state *mas, unsigned long index)
{
        /* A single index is the degenerate range [index, index]. */
        mas_set_range(mas, index, index);
}

/*
 * Return true when the lock bits of the tree's flags (ma_flags masked with
 * MT_FLAGS_LOCK_MASK) equal MT_FLAGS_LOCK_EXTERN.
 */
static inline bool mt_external_lock(const struct maple_tree *mt)
{
        return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN;
}

/**
 * mt_init_flags() - Initialise an empty maple tree with flags.
 * @mt: Maple Tree
 * @flags: maple tree flags.
 *
 * If you need to initialise a Maple Tree with special flags (eg, an
 * allocation tree), use this function.
 *
 * Stores @flags, initialises the tree's own spinlock unless the flags select
 * an external lock, and sets the root pointer to NULL.
 *
 * Context: Any context.
 */
static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags)
{
        /* Flags go first: mt_external_lock() below reads them. */
        mt->ma_flags = flags;
        if (!mt_external_lock(mt))
                spin_lock_init(&mt->ma_lock);
        /* An empty tree has a NULL root. */
        rcu_assign_pointer(mt->ma_root, NULL);
}

/**
 * mt_init() - Initialise an empty maple tree.
 * @mt: Maple Tree
 *
 * Initialises @mt as an empty Maple Tree with no flags set; shorthand for
 * mt_init_flags(@mt, 0).
 *
 * Context: Any context.
 */
static inline void mt_init(struct maple_tree *mt)
{
        mt_init_flags(mt, 0);
}

/*
 * mt_in_rcu() - Report whether the tree is in RCU mode.
 * @mt: The Maple Tree
 *
 * Always false when CONFIG_MAPLE_RCU_DISABLED is defined; otherwise true
 * when MT_FLAGS_USE_RCU is set in the tree's flags.
 */
static inline bool mt_in_rcu(struct maple_tree *mt)
{
#ifdef CONFIG_MAPLE_RCU_DISABLED
        return false;
#else
        return mt->ma_flags & MT_FLAGS_USE_RCU;
#endif
}

/**
 * mt_clear_in_rcu() - Switch the tree to non-RCU mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is not in RCU mode.  For a tree using its own
 * lock, the flag is cleared under mtree_lock().  For an externally locked
 * tree, the flag is cleared directly after warning if mt_lock_is_held()
 * reports that the lock is not held.
 */
static inline void mt_clear_in_rcu(struct maple_tree *mt)
{
        if (!mt_in_rcu(mt))
                return;

        if (!mt_external_lock(mt)) {
                /* Internal lock: take it around the flag update. */
                mtree_lock(mt);
                mt->ma_flags &= ~MT_FLAGS_USE_RCU;
                mtree_unlock(mt);
                return;
        }

        /* External lock: the caller is expected to hold it already. */
        WARN_ON(!mt_lock_is_held(mt));
        mt->ma_flags &= ~MT_FLAGS_USE_RCU;
}

/**
 * mt_set_in_rcu() - Switch the tree to RCU safe mode.
 * @mt: The Maple Tree
 *
 * Does nothing if the tree is already in RCU mode.  For a tree using its own
 * lock, the flag is set under mtree_lock().  For an externally locked tree,
 * the flag is set directly after warning if mt_lock_is_held() reports that
 * the lock is not held.
 */
static inline void mt_set_in_rcu(struct maple_tree *mt)
{
        if (mt_in_rcu(mt))
                return;

        if (!mt_external_lock(mt)) {
                /* Internal lock: take it around the flag update. */
                mtree_lock(mt);
                mt->ma_flags |= MT_FLAGS_USE_RCU;
                mtree_unlock(mt);
                return;
        }

        /* External lock: the caller is expected to hold it already. */
        WARN_ON(!mt_lock_is_held(mt));
        mt->ma_flags |= MT_FLAGS_USE_RCU;
}

/*
 * Return the height field stored in the tree's flags: the bits selected by
 * MT_FLAGS_HEIGHT_MASK, shifted down by MT_FLAGS_HEIGHT_OFFSET.
 */
static inline unsigned int mt_height(const struct maple_tree *mt)
{
        return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET;
}

void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max);
void *mt_find_after(struct maple_tree *mt, unsigned long *index,
                    unsigned long max);
void *mt_prev(struct maple_tree *mt, unsigned long index,  unsigned long min);
void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max);

/**
 * mt_for_each - Iterate over each entry starting at index until max.
 * @__tree: The Maple Tree
 * @__entry: The current entry
 * @__index: The index to start the search from. Subsequently used as iterator.
 * @__max: The maximum limit for @__index
 *
 * Expands to a for loop that fetches the first entry with mt_find() and each
 * following one with mt_find_after(); the loop ends when the returned entry
 * is NULL.  @__index must be an lvalue because its address is passed to both
 * helpers.
 *
 * This iterator skips all entries which resolve to a NULL pointer,
 * e.g. entries which have been reserved with XA_ZERO_ENTRY.
 *
 * Every macro argument is parenthesised, matching mas_for_each(), so that
 * expression arguments expand safely.
 */
#define mt_for_each(__tree, __entry, __index, __max) \
        for ((__entry) = mt_find((__tree), &(__index), (__max)); \
                (__entry); \
                (__entry) = mt_find_after((__tree), &(__index), (__max)))

#endif /*_LINUX_MAPLE_TREE_H */
























































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_DAX_H
#define _LINUX_DAX_H

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/radix-tree.h>

typedef unsigned long dax_entry_t;

struct dax_device;
struct gendisk;
struct iomap_ops;
struct iomap_iter;
struct iomap;

/*
 * Access mode passed to the direct_access operation and dax_direct_access().
 * NOTE(review): presumably DAX_RECOVERY_WRITE selects access for the
 * recovery_write path while DAX_ACCESS is the ordinary mode - confirm.
 */
enum dax_access_mode {
        DAX_ACCESS,
        DAX_RECOVERY_WRITE,
};

/* Callbacks a driver supplies for a dax_device (passed to alloc_dax()). */
struct dax_operations {
        /*
         * direct_access: translate a device-relative
         * logical-page-offset into an absolute physical pfn. Return the
         * number of pages available for DAX at that pfn.
         */
        long (*direct_access)(struct dax_device *, pgoff_t, long,
                        enum dax_access_mode, void **, pfn_t *);
        /* zero_page_range: required operation; zero a range of pages. */
        int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
        /*
         * recovery_write: recover a poisoned range by DAX device driver
         * capable of clearing poison.
         */
        size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff,
                        void *addr, size_t bytes, struct iov_iter *iter);
};

/* Callbacks supplied by the holder of a dax_device. */
struct dax_holder_operations {
        /*
         * notify_failure - notify memory failure into inner holder device
         * @dax_dev: the dax device which contains the holder
         * @offset: offset on this dax device where memory failure occurs
         * @len: length of this memory failure event
         * @mf_flags: action flags for memory failure handler
         */
        int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
                        u64 len, int mf_flags);
};

#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool dax_synchronous(struct dax_device *dax_dev);
void set_dax_nocache(struct dax_device *dax_dev);
void set_dax_nomc(struct dax_device *dax_dev);
void set_dax_synchronous(struct dax_device *dax_dev);
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
                void *addr, size_t bytes, struct iov_iter *i);
/*
 * Check whether the given mapping is supported by the file / underlying
 * device.
 *
 * A mapping without VM_SYNC is always supported.  A VM_SYNC mapping is
 * supported only when the mapped file's inode is DAX and the device reports
 * itself as synchronous.
 */
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
                                             struct dax_device *dax_dev)
{
        if (vma->vm_flags & VM_SYNC)
                return IS_DAX(file_inode(vma->vm_file)) &&
                       dax_synchronous(dax_dev);

        return true;
}
#else
/*
 * Stubs used when CONFIG_DAX is not enabled.  They perform no work and
 * return fixed values.
 */

/* No holder without DAX support: always NULL. */
static inline void *dax_holder(struct dax_device *dax_dev)
{
        return NULL;
}
/* Allocation is unsupported: always ERR_PTR(-EOPNOTSUPP), never NULL. */
static inline struct dax_device *alloc_dax(void *private,
                const struct dax_operations *ops)
{
        return ERR_PTR(-EOPNOTSUPP);
}
/* No-op. */
static inline void put_dax(struct dax_device *dax_dev)
{
}
/* No-op. */
static inline void kill_dax(struct dax_device *dax_dev)
{
}
/* No-op. */
static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
}
/* Always false. */
static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
        return false;
}
/* Always true in this stub. */
static inline bool dax_synchronous(struct dax_device *dax_dev)
{
        return true;
}
/* No-op. */
static inline void set_dax_nocache(struct dax_device *dax_dev)
{
}
/* No-op. */
static inline void set_dax_nomc(struct dax_device *dax_dev)
{
}
/* No-op. */
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
/* Without DAX only mappings that do not request VM_SYNC are supported. */
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
                                struct dax_device *dax_dev)
{
        return !(vma->vm_flags & VM_SYNC);
}
/* Nothing can be written: always reports 0 bytes. */
static inline size_t dax_recovery_write(struct dax_device *dax_dev,
                pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
        return 0;
}
#endif

struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
                void *holder, const struct dax_holder_operations *ops);
void fs_put_dax(struct dax_device *dax_dev, void *holder);
#else
/*
 * Stubs used unless both CONFIG_BLOCK and CONFIG_FS_DAX are defined.
 */

/* Nothing to register: always reports success (0). */
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
        return 0;
}
/* No-op. */
static inline void dax_remove_host(struct gendisk *disk)
{
}
/* No dax device can be looked up: always NULL; @start_off is not written. */
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
                u64 *start_off, void *holder,
                const struct dax_holder_operations *ops)
{
        return NULL;
}
/* No-op. */
static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
}
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */

#if IS_ENABLED(CONFIG_FS_DAX)
int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc);

struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
dax_entry_t dax_lock_folio(struct folio *folio);
void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
                unsigned long index, struct page **page);
void dax_unlock_mapping_entry(struct address_space *mapping,
                unsigned long index, dax_entry_t cookie);
#else
/* Stub for !CONFIG_FS_DAX: never reports a busy page (always NULL). */
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
        return NULL;
}

static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
{
        return NULL;
}

/* Stub for !CONFIG_FS_DAX: writeback is unsupported, always -EOPNOTSUPP. */
static inline int dax_writeback_mapping_range(struct address_space *mapping,
                struct dax_device *dax_dev, struct writeback_control *wbc)
{
        return -EOPNOTSUPP;
}

/*
 * Stub for !CONFIG_FS_DAX: returns an all-ones cookie (~0UL) when the inode
 * hosting the folio's mapping is DAX, and 0 otherwise.  No lock is taken.
 */
static inline dax_entry_t dax_lock_folio(struct folio *folio)
{
        return IS_DAX(folio->mapping->host) ? ~0UL : 0;
}

/* Stub for !CONFIG_FS_DAX: no-op. */
static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
}

/* Stub for !CONFIG_FS_DAX: always returns cookie 0; @page is not written. */
static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
                unsigned long index, struct page **page)
{
        return 0;
}

/* Stub for !CONFIG_FS_DAX: no-op. */
static inline void dax_unlock_mapping_entry(struct address_space *mapping,
                unsigned long index, dax_entry_t cookie)
{
}
#endif

int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
                const struct iomap_ops *ops);
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
                const struct iomap_ops *ops);
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
                const struct iomap_ops *ops);

/*
 * A page is idle when it exists and its reference count is zero.
 * A NULL @page is never idle.
 */
static inline bool dax_page_is_idle(struct page *page)
{
        if (!page)
                return false;

        return page_ref_count(page) == 0;
}

#if IS_ENABLED(CONFIG_DAX)
int dax_read_lock(void);
void dax_read_unlock(int id);
#else
/* Stub for !CONFIG_DAX: takes no lock and always returns id 0. */
static inline int dax_read_lock(void)
{
        return 0;
}

/* Stub for !CONFIG_DAX: no-op; @id is ignored. */
static inline void dax_read_unlock(int id)
{
}
#endif /* CONFIG_DAX */

#if !IS_ENABLED(CONFIG_FS_DAX)
/*
 * Stub for !CONFIG_FS_DAX: there is no layout to break, so this always
 * returns 0 and never invokes @cb.
 */
static inline int __must_check dax_break_layout(struct inode *inode,
                            loff_t start, loff_t end, void (cb)(struct inode *))
{
        return 0;
}

/* Stub for !CONFIG_FS_DAX: no-op. */
static inline void dax_break_layout_final(struct inode *inode)
{
}
#endif

bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
                enum dax_access_mode mode, void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
                size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
                        size_t nr_pages);
int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
                int mf_flags);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);

ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
                const struct iomap_ops *ops);
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
                    pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
                unsigned int order, pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
void dax_delete_mapping_range(struct address_space *mapping,
                                loff_t start, loff_t end);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
                                      pgoff_t index);
int __must_check dax_break_layout(struct inode *inode, loff_t start,
                                loff_t end, void (cb)(struct inode *));
/*
 * Break the layout of the whole inode: calls dax_break_layout() over the
 * byte range [0, LLONG_MAX] with the same callback and returns its result.
 */
static inline int __must_check dax_break_layout_inode(struct inode *inode,
                                                void (cb)(struct inode *))
{
        return dax_break_layout(inode, 0, LLONG_MAX, cb);
}
void dax_break_layout_final(struct inode *inode);
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
                                  struct inode *dest, loff_t destoff,
                                  loff_t len, bool *is_same,
                                  const struct iomap_ops *ops);
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                              struct file *file_out, loff_t pos_out,
                              loff_t *len, unsigned int remap_flags,
                              const struct iomap_ops *ops);
/*
 * Return true when the mapping has a host inode and that inode is DAX.
 * A mapping with no host is never a DAX mapping.
 */
static inline bool dax_mapping(struct address_space *mapping)
{
        if (!mapping->host)
                return false;

        return IS_DAX(mapping->host);
}

/*
 * dax has two personalities, memory and block, and hwpoison is reported
 * according to whichever one is currently visible.
 * In block mode (e.g. block IO) a dax hwpoison hit is reported as -EIO.
 * In memory mode (e.g. a page fault) it is reported as -EHWPOISON, which
 * leads to VM_FAULT_HWPOISON.
 *
 * This helper converts a memory-side error to its block-side form:
 * -EHWPOISON becomes -EIO, any other value is returned unchanged.
 */
static inline int dax_mem2blk_err(int err)
{
        if (err == -EHWPOISON)
                return -EIO;

        return err;
}

#ifdef CONFIG_DEV_DAX_HMEM_DEVICES
void hmem_register_resource(int target_nid, struct resource *r);
#else
/* Stub used when CONFIG_DEV_DAX_HMEM_DEVICES is not defined: no-op. */
static inline void hmem_register_resource(int target_nid, struct resource *r)
{
}
#endif

typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
                            const struct resource *res);
int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
#endif

















































































































































































































































































































    1 


  506 













  506 





  506 








  506 



















   42 






























   42 





   38 








   42 
















  498 

  500 
















   38 

   40 






















    2 

    2 

    2 



















  499 

  500 
















   38 

   40 





















    2 

    2 

    2 



















    5 

    5 
















































































































































































































































































































































































































































































































































































































































































































































































































    1 
    1 





































    1 













    1 























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

/* Slab cache that create_user_ns() allocates struct user_namespace from. */
static struct kmem_cache *user_ns_cachep __ro_after_init;
/*
 * Serializes writes to the id maps; create_user_ns() also takes it while
 * copying the parent's flags into a new namespace.
 */
static DEFINE_MUTEX(userns_state_mutex);

/*
 * Forward declarations.  free_user_ns() is defined further down and is
 * installed as the work handler in create_user_ns(); new_idmap_permitted()
 * is presumably defined later in the file -- not visible from here.
 */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *map);
static void free_user_ns(struct work_struct *work);

/*
 * Charge one user namespace to @uid in @ns via the UCOUNT_USER_NAMESPACES
 * counter.  The inc_ucount() result is handed back unchanged; the caller in
 * this file treats a NULL result as failure.
 */
static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid)
{
        struct ucounts *charged;

        charged = inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES);
        return charged;
}

/*
 * Undo one inc_user_namespaces() charge: drop a UCOUNT_USER_NAMESPACES
 * count from @ucounts.
 */
static void dec_user_namespaces(struct ucounts *ucounts)
{
        /* Plain call: a void function must not "return" an expression. */
        dec_ucount(ucounts, UCOUNT_USER_NAMESPACES);
}

/*
 * Point @cred at @user_ns and reset its capability state.
 *
 * The credentials start out with the same capabilities as init, but since
 * those capabilities are bound to the new user namespace they are useless
 * for doing anything outside of it.
 */
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
{
        cred->securebits = SECUREBITS_DEFAULT;

        /* Full bounding, permitted and effective sets ... */
        cred->cap_bset = CAP_FULL_SET;
        cred->cap_permitted = CAP_FULL_SET;
        cred->cap_effective = CAP_FULL_SET;

        /* ... but nothing inheritable or ambient. */
        cred->cap_inheritable = CAP_EMPTY_SET;
        cred->cap_ambient = CAP_EMPTY_SET;

#ifdef CONFIG_KEYS
        /* Drop the request-key authorisation reference, if any. */
        key_put(cred->request_key_auth);
        cred->request_key_auth = NULL;
#endif

        /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
        cred->user_ns = user_ns;
}

/*
 * Return the RLIMIT_NPROC value that is currently enforced for the calling
 * task: RLIM_INFINITY for the global root uid running in the initial user
 * namespace, and the task's RLIMIT_NPROC setting for everyone else.
 */
static unsigned long enforced_nproc_rlimit(void)
{
        /* RLIMIT_NPROC is not enforced for global root in init_user_ns. */
        if (uid_eq(current_uid(), GLOBAL_ROOT_UID) &&
            (current_user_ns() == &init_user_ns))
                return RLIM_INFINITY;

        return rlimit(RLIMIT_NPROC);
}

/*
 * Create a new user namespace, deriving the creator from the user in the
 * passed credentials, and replacing that user with the new root user for the
 * new namespace.
 *
 * This is called by copy_creds(), which will finish setting the target task's
 * credentials.  It is also called from unshare_userns() in this file.
 *
 * On success @new is switched over to the new namespace (see
 * set_cred_user_ns()) and 0 is returned.  On failure @new is left untouched
 * and a negative errno is returned:
 *   -ENOSPC  the parent is nested too deeply or the namespace count charge
 *            for the creator was refused,
 *   -EPERM   the caller is chrooted, or the creator's euid/egid has no
 *            mapping in the parent namespace,
 *   -ENOMEM  the namespace or its sysctls could not be allocated,
 *   or whatever security_create_user_ns() / ns_alloc_inum() reported.
 */
int create_user_ns(struct cred *new)
{
        struct user_namespace *ns, *parent_ns = new->user_ns;
        kuid_t owner = new->euid;
        kgid_t group = new->egid;
        struct ucounts *ucounts;
        int ret, i;

        /* Refuse to nest any deeper once the parent is past level 32. */
        ret = -ENOSPC;
        if (parent_ns->level > 32)
                goto fail;

        /* Charge the new namespace to the creator; still -ENOSPC on refusal. */
        ucounts = inc_user_namespaces(parent_ns, owner);
        if (!ucounts)
                goto fail;

        /*
         * Verify that we can not violate the policy of which files
         * may be accessed that is specified by the root directory,
         * by verifying that the root directory is at the root of the
         * mount namespace which allows all files to be accessed.
         */
        ret = -EPERM;
        if (current_chrooted())
                goto fail_dec;

        /* The creator needs a mapping in the parent user namespace
         * or else we won't be able to reasonably tell userspace who
         * created a user_namespace.
         */
        ret = -EPERM;
        if (!kuid_has_mapping(parent_ns, owner) ||
            !kgid_has_mapping(parent_ns, group))
                goto fail_dec;

        ret = security_create_user_ns(new);
        if (ret < 0)
                goto fail_dec;

        ret = -ENOMEM;
        ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL);
        if (!ns)
                goto fail_dec;

        /*
         * Remember whether the creator had CAP_SETFCAP in its effective set;
         * verify_root_map() consults this when the namespace writes its own
         * uid map.
         */
        ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP);
        ret = ns_alloc_inum(&ns->ns);
        if (ret)
                goto fail_free;
        ns->ns.ops = &userns_operations;

        refcount_set(&ns->ns.count, 1);
        /* Leave the new->user_ns reference with the new user namespace. */
        ns->parent = parent_ns;
        ns->level = parent_ns->level + 1;
        ns->owner = owner;
        ns->group = group;
        /* free_user_ns() runs from a workqueue; see __put_user_ns(). */
        INIT_WORK(&ns->work, free_user_ns);
        /* Start every ucount limit at INT_MAX ... */
        for (i = 0; i < UCOUNT_COUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
        /* ... then set the rlimit-backed maxima from the creator's rlimits. */
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit());
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
        set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
        /* The charge taken above is released again in free_user_ns(). */
        ns->ucounts = ucounts;

        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
        mutex_lock(&userns_state_mutex);
        ns->flags = parent_ns->flags;
        mutex_unlock(&userns_state_mutex);

#ifdef CONFIG_KEYS
        INIT_LIST_HEAD(&ns->keyring_name_list);
        init_rwsem(&ns->keyring_sem);
#endif
        ret = -ENOMEM;
        if (!setup_userns_sysctls(ns))
                goto fail_keyring;

        /* Everything is set up: hand the namespace to the credentials. */
        set_cred_user_ns(new, ns);
        return 0;

        /* Error unwinding: each label undoes one more step, newest first. */
fail_keyring:
#ifdef CONFIG_PERSISTENT_KEYRINGS
        key_put(ns->persistent_keyring_register);
#endif
        ns_free_inum(&ns->ns);
fail_free:
        kmem_cache_free(user_ns_cachep, ns);
fail_dec:
        dec_user_namespaces(ucounts);
fail:
        return ret;
}

/*
 * Handle the CLONE_NEWUSER part of an unshare request.
 *
 * When @unshare_flags does not contain CLONE_NEWUSER this is a no-op that
 * returns 0 and leaves *@new_cred alone.  Otherwise a fresh set of
 * credentials is prepared and moved into a newly created user namespace;
 * on success they are stored in *@new_cred and 0 is returned.  On failure
 * the prepared credentials are released and a negative errno is returned
 * (-ENOMEM if they could not be prepared, else the create_user_ns() error).
 */
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
        struct cred *fresh;
        int rc;

        if ((unshare_flags & CLONE_NEWUSER) == 0)
                return 0;

        fresh = prepare_creds();
        if (!fresh)
                return -ENOMEM;

        rc = create_user_ns(fresh);
        if (rc) {
                /* Creation failed: nobody else holds these credentials. */
                put_cred(fresh);
                return rc;
        }

        *new_cred = fresh;
        return 0;
}

/*
 * Work handler (installed by INIT_WORK() in create_user_ns()) that tears
 * down a user namespace.
 *
 * After freeing @ns the reference it held on its parent is dropped; if that
 * was the parent's last reference the parent is torn down in the same loop,
 * and so on up the chain.
 */
static void free_user_ns(struct work_struct *work)
{
        struct user_namespace *parent, *ns =
                container_of(work, struct user_namespace, work);

        do {
                /* Fetch what is needed after ns itself has been freed. */
                struct ucounts *ucounts = ns->ucounts;
                parent = ns->parent;
                /*
                 * The forward/reverse arrays are only heap allocated once a
                 * map has grown beyond UID_GID_MAP_MAX_BASE_EXTENTS entries
                 * (see insert_extent() and sort_idmaps()).
                 */
                if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->gid_map.forward);
                        kfree(ns->gid_map.reverse);
                }
                if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->uid_map.forward);
                        kfree(ns->uid_map.reverse);
                }
                if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                        kfree(ns->projid_map.forward);
                        kfree(ns->projid_map.reverse);
                }
#if IS_ENABLED(CONFIG_BINFMT_MISC)
                kfree(ns->binfmt_misc);
#endif
                retire_userns_sysctls(ns);
                key_free_user_ns(ns);
                ns_free_inum(&ns->ns);
                kmem_cache_free(user_ns_cachep, ns);
                /* Give back the charge taken in create_user_ns(). */
                dec_user_namespaces(ucounts);
                ns = parent;
                /* Keep going only while we just dropped the parent's last ref. */
        } while (refcount_dec_and_test(&parent->ns.count));
}

/*
 * Queue @ns->work so that free_user_ns() tears the namespace down from a
 * workqueue instead of the caller's context.
 * NOTE(review): presumably invoked once the last reference to @ns has been
 * dropped -- the caller is not visible here, confirm against put_user_ns().
 */
void __put_user_ns(struct user_namespace *ns)
{
        schedule_work(&ns->work);
}
EXPORT_SYMBOL(__put_user_ns);

/*
 * struct idmap_key - holds the information necessary to find an idmapping in a
 * sorted idmap array. It is passed to cmp_map_id() as first argument.
 *
 * The key describes the id range [id, id + count - 1]; cmp_map_id() only
 * reports a match for an extent that contains the whole range.
 */
struct idmap_key {
        bool map_up; /* true  -> id from kid; false -> kid from id */
        u32 id; /* id to find */
        u32 count; /* number of consecutive ids, starting at id, to find */
};

/*
 * cmp_map_id - bsearch() comparison callback used to locate an idmapping.
 * @k: the search key, a struct idmap_key
 * @e: the array element under test, a struct uid_gid_extent
 *
 * Returns 0 when the key's range [id, id + count - 1] lies completely inside
 * the extent, -1 when either end of the range is below the extent's start,
 * and 1 otherwise.
 */
static int cmp_map_id(const void *k, const void *e)
{
        const struct idmap_key *key = k;
        const struct uid_gid_extent *el = e;
        /* map_id_up() matches on lower ids, map_id_down() on upper ids */
        u32 ext_lo = key->map_up ? el->lower_first : el->first;
        u32 ext_hi = ext_lo + el->count - 1;
        u32 key_lo = key->id;
        u32 key_hi = key->id + key->count - 1;

        if (key_lo < ext_lo || key_hi < ext_lo)
                return -1;

        /* Both ends are at or above ext_lo here; check the upper bound. */
        if (key_lo <= ext_hi && key_hi <= ext_hi)
                return 0;

        return 1;
}

/*
 * map_id_range_down_max - Binary search the sorted forward array of @map for
 * the extent that contains [@id, @id + @count - 1].
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the matching extent or NULL.
 */
static struct uid_gid_extent *
map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        struct idmap_key key = {
                .map_up = false,
                .id     = id,
                .count  = count,
        };

        return bsearch(&key, map->forward, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * map_id_range_down_base - Linearly scan the static extent array of @map for
 * the extent whose upper range contains [@id, @id + @count - 1].
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first matching extent or NULL.
 */
static struct uid_gid_extent *
map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        const u32 range_end = id + count - 1;
        unsigned i;

        for (i = 0; i < extents; i++) {
                struct uid_gid_extent *cur = &map->extent[i];
                u32 lo = cur->first;
                u32 hi = lo + cur->count - 1;

                /* Both ends of the requested range must fall in [lo, hi]. */
                if (id < lo || id > hi)
                        continue;
                if (range_end < lo || range_end > hi)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * Translate the range [@id, @id + @count - 1] downwards through @map, i.e.
 * from the ids in an extent's "first" range to its "lower_first" range.
 *
 * Returns the mapped value of @id, or (u32)-1 when no single extent covers
 * the whole range.
 */
static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
{
        unsigned nr = map->nr_extents;
        struct uid_gid_extent *hit;

        /* Read the extent count before the extents themselves. */
        smp_rmb();

        if (nr > UID_GID_MAP_MAX_BASE_EXTENTS)
                hit = map_id_range_down_max(nr, map, id, count);
        else
                hit = map_id_range_down_base(nr, map, id, count);

        if (!hit)
                return (u32) -1;

        return (id - hit->first) + hit->lower_first;
}

/* Translate the single id @id downwards through @map; (u32)-1 if unmapped. */
u32 map_id_down(struct uid_gid_map *map, u32 id)
{
        const u32 single_id = 1;

        return map_id_range_down(map, id, single_id);
}

/*
 * map_id_range_up_base - Linearly scan the static extent array of @map for
 * the extent whose lower range contains [@id, @id + @count - 1].
 * Can only be called if number of mappings is equal or less than
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the first matching extent or NULL.
 */
static struct uid_gid_extent *
map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        const u32 range_end = id + count - 1;
        unsigned i;

        for (i = 0; i < extents; i++) {
                struct uid_gid_extent *cur = &map->extent[i];
                u32 lo = cur->lower_first;
                u32 hi = lo + cur->count - 1;

                /* Both ends of the requested range must fall in [lo, hi]. */
                if (id < lo || id > hi)
                        continue;
                if (range_end < lo || range_end > hi)
                        continue;

                return cur;
        }

        return NULL;
}

/*
 * map_id_range_up_max - Binary search the sorted reverse array of @map for
 * the extent whose lower range contains [@id, @id + @count - 1].
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * Returns the matching extent or NULL.
 */
static struct uid_gid_extent *
map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
{
        struct idmap_key key = {
                .map_up = true,
                .id     = id,
                .count  = count,
        };

        return bsearch(&key, map->reverse, extents,
                       sizeof(struct uid_gid_extent), cmp_map_id);
}

/*
 * Translate the range [@id, @id + @count - 1] upwards through @map, i.e.
 * from the ids in an extent's "lower_first" range to its "first" range.
 *
 * Returns the mapped value of @id, or (u32)-1 when no single extent covers
 * the whole range.
 */
u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count)
{
        unsigned nr = map->nr_extents;
        struct uid_gid_extent *hit;

        /* Read the extent count before the extents themselves. */
        smp_rmb();

        if (nr > UID_GID_MAP_MAX_BASE_EXTENTS)
                hit = map_id_range_up_max(nr, map, id, count);
        else
                hit = map_id_range_up_base(nr, map, id, count);

        if (!hit)
                return (u32) -1;

        return (id - hit->lower_first) + hit->first;
}

/* Translate the single id @id upwards through @map; (u32)-1 if unmapped. */
u32 map_id_up(struct uid_gid_map *map, u32 id)
{
        const u32 single_id = 1;

        return map_id_range_up(map, id, single_id);
}

/**
 *        make_kuid - Convert a user-namespace uid pair into a kuid.
 *        @ns:  User namespace in which @uid is expressed
 *        @uid: User identifier as seen inside @ns
 *
 *        Translates the (@ns, @uid) pair through the uid map of @ns
 *        and returns the resulting kernel internal kuid.
 *
 *        If @ns has no mapping for @uid, INVALID_UID is returned.
 *        Callers are expected to check for and handle that case;
 *        uid_valid() can be used for the check.
 */
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
{
        /* Kernel-global value of the uid, or (u32)-1 if unmapped */
        u32 kernel_id = map_id_down(&ns->uid_map, uid);

        return KUIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kuid);

/**
 *        from_kuid - Convert a kuid into a uid of a given user namespace.
 *        @targ: The user namespace the uid should be expressed in.
 *        @kuid: The kernel internal uid to convert.
 *
 *        Translates @kuid through the uid map of @targ and returns
 *        the uid as it is seen inside @targ.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @targ has no mapping for @kuid, (uid_t)-1 is returned.
 */
uid_t from_kuid(struct user_namespace *targ, kuid_t kuid)
{
        /* Raw kernel-global value carried by the kuid */
        u32 kernel_id = __kuid_val(kuid);

        return map_id_up(&targ->uid_map, kernel_id);
}
EXPORT_SYMBOL(from_kuid);

/**
 *        from_kuid_munged - Convert a kuid into a uid, never failing.
 *        @targ: The user namespace the uid should be expressed in.
 *        @kuid: The kernel internal uid to convert.
 *
 *        Translates @kuid through the uid map of @targ and returns
 *        the uid as it is seen inside @targ.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kuid(), from_kuid_munged() never fails and always
 *        returns a valid uid.  That makes it appropriate for syscalls
 *        like stat and getuid, where failing the system call or
 *        failing to provide a valid uid is not an option.
 *
 *        If @targ has no mapping for @kuid, overflowuid is returned.
 */
uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid)
{
        uid_t uid = from_kuid(targ, kuid);

        if (uid != (uid_t) -1)
                return uid;

        /* Unmapped: substitute the overflow uid. */
        return overflowuid;
}
EXPORT_SYMBOL(from_kuid_munged);

/**
 *        make_kgid - Convert a user-namespace gid pair into a kgid.
 *        @ns:  User namespace in which @gid is expressed
 *        @gid: Group identifier as seen inside @ns
 *
 *        Translates the (@ns, @gid) pair through the gid map of @ns
 *        and returns the resulting kernel internal kgid.
 *
 *        If @ns has no mapping for @gid, INVALID_GID is returned.
 *        Callers are expected to check for and handle that case;
 *        gid_valid() can be used for the check.
 */
kgid_t make_kgid(struct user_namespace *ns, gid_t gid)
{
        /* Kernel-global value of the gid, or (u32)-1 if unmapped */
        u32 kernel_id = map_id_down(&ns->gid_map, gid);

        return KGIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kgid);

/**
 *        from_kgid - Convert a kgid into a gid of a given user namespace.
 *        @targ: The user namespace the gid should be expressed in.
 *        @kgid: The kernel internal gid to convert.
 *
 *        Translates @kgid through the gid map of @targ and returns
 *        the gid as it is seen inside @targ.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @targ has no mapping for @kgid, (gid_t)-1 is returned.
 */
gid_t from_kgid(struct user_namespace *targ, kgid_t kgid)
{
        /* Raw kernel-global value carried by the kgid */
        u32 kernel_id = __kgid_val(kgid);

        return map_id_up(&targ->gid_map, kernel_id);
}
EXPORT_SYMBOL(from_kgid);

/**
 *        from_kgid_munged - Convert a kgid into a gid, never failing.
 *        @targ: The user namespace the gid should be expressed in.
 *        @kgid: The kernel internal gid to convert.
 *
 *        Translates @kgid through the gid map of @targ and returns
 *        the gid as it is seen inside @targ.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kgid(), from_kgid_munged() never fails and always
 *        returns a valid gid.  That makes it appropriate for syscalls
 *        like stat and getgid, where failing the system call or
 *        failing to provide a valid gid is not an option.
 *
 *        If @targ has no mapping for @kgid, overflowgid is returned.
 */
gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid)
{
        gid_t gid = from_kgid(targ, kgid);

        if (gid != (gid_t) -1)
                return gid;

        /* Unmapped: substitute the overflow gid. */
        return overflowgid;
}
EXPORT_SYMBOL(from_kgid_munged);

/**
 *        make_kprojid - Convert a user-namespace projid pair into a kprojid.
 *        @ns:  User namespace in which @projid is expressed
 *        @projid: Project identifier as seen inside @ns
 *
 *        Translates the (@ns, @projid) pair through the projid map of
 *        @ns and returns the resulting kernel internal kprojid.
 *
 *        If @ns has no mapping for @projid, INVALID_PROJID is
 *        returned.  Callers are expected to check for and handle that
 *        case; projid_valid() can be used for the check.
 */
kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid)
{
        /* Kernel-global value of the projid, or (u32)-1 if unmapped */
        u32 kernel_id = map_id_down(&ns->projid_map, projid);

        return KPROJIDT_INIT(kernel_id);
}
EXPORT_SYMBOL(make_kprojid);

/**
 *        from_kprojid - Convert a kprojid into a projid of a given user namespace.
 *        @targ: The user namespace the projid should be expressed in.
 *        @kprojid: The kernel internal project identifier to convert.
 *
 *        Translates @kprojid through the projid map of @targ and
 *        returns the projid as it is seen inside @targ.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        If @targ has no mapping for @kprojid, (projid_t)-1 is returned.
 */
projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid)
{
        /* Raw kernel-global value carried by the kprojid */
        u32 kernel_id = __kprojid_val(kprojid);

        return map_id_up(&targ->projid_map, kernel_id);
}
EXPORT_SYMBOL(from_kprojid);

/**
 *        from_kprojid_munged - Convert a kprojid into a projid, never failing.
 *        @targ: The user namespace the projid should be expressed in.
 *        @kprojid: The kernel internal projid to convert.
 *
 *        Translates @kprojid through the projid map of @targ and
 *        returns the projid as it is seen inside @targ.
 *
 *        There is always a mapping into the initial user_namespace.
 *
 *        Unlike from_kprojid(), from_kprojid_munged() never fails and
 *        always returns a valid projid.  That makes it appropriate
 *        for syscalls like stat, where failing the system call or
 *        failing to provide a valid projid is not an option.
 *
 *        If @targ has no mapping for @kprojid, OVERFLOW_PROJID is returned.
 */
projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid)
{
        projid_t projid = from_kprojid(targ, kprojid);

        if (projid != (projid_t) -1)
                return projid;

        /* Unmapped: substitute the overflow projid. */
        return OVERFLOW_PROJID;
}
EXPORT_SYMBOL(from_kprojid_munged);


static int uid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_extent *extent = v;
        struct user_namespace *lower_ns;
        uid_t lower;

        lower_ns = seq_user_ns(seq);
        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                extent->first,
                lower,
                extent->count);

        return 0;
}

static int gid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_extent *extent = v;
        struct user_namespace *lower_ns;
        gid_t lower;

        lower_ns = seq_user_ns(seq);
        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                extent->first,
                lower,
                extent->count);

        return 0;
}

static int projid_m_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_extent *extent = v;
        struct user_namespace *lower_ns;
        projid_t lower;

        lower_ns = seq_user_ns(seq);
        if ((lower_ns == ns) && lower_ns->parent)
                lower_ns = lower_ns->parent;

        lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first));

        seq_printf(seq, "%10u %10u %10u\n",
                extent->first,
                lower,
                extent->count);

        return 0;
}

/*
 * Common seq_file ->start() helper: return the extent of @map at position
 * *@ppos, or NULL once the position is past the last extent.
 *
 * Small maps live in the embedded extent array, larger ones in the forward
 * array.
 */
static void *m_start(struct seq_file *seq, loff_t *ppos,
                     struct uid_gid_map *map)
{
        unsigned nr = map->nr_extents;
        loff_t idx = *ppos;
        struct uid_gid_extent *table;

        /* Read the extent count before the extents themselves. */
        smp_rmb();

        if (idx >= nr)
                return NULL;

        if (nr > UID_GID_MAP_MAX_BASE_EXTENTS)
                table = map->forward;
        else
                table = map->extent;

        return &table[idx];
}

/* seq_file ->start() for the uid map of the namespace in seq->private. */
static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_map *map = &ns->uid_map;

        return m_start(seq, ppos, map);
}

/* seq_file ->start() for the gid map of the namespace in seq->private. */
static void *gid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_map *map = &ns->gid_map;

        return m_start(seq, ppos, map);
}

/* seq_file ->start() for the projid map of the namespace in seq->private. */
static void *projid_m_start(struct seq_file *seq, loff_t *ppos)
{
        struct user_namespace *ns = seq->private;
        struct uid_gid_map *map = &ns->projid_map;

        return m_start(seq, ppos, map);
}

/*
 * seq_file ->next(): advance the position by one and look the new position
 * up through the sequence's own ->start() callback.
 */
static void *m_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;

        return seq->op->start(seq, pos);
}

/* seq_file ->stop(): intentionally empty. */
static void m_stop(struct seq_file *seq, void *v)
{
        /* Nothing to undo: m_start() takes no locks or references. */
}

/* seq_file iterator over the uid map extents of a user namespace. */
const struct seq_operations proc_uid_seq_operations = {
        .start  = uid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = uid_m_show,
};

/* seq_file iterator over the gid map extents of a user namespace. */
const struct seq_operations proc_gid_seq_operations = {
        .start  = gid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = gid_m_show,
};

/* seq_file iterator over the projid map extents of a user namespace. */
const struct seq_operations proc_projid_seq_operations = {
        .start  = projid_m_start,
        .next   = m_next,
        .stop   = m_stop,
        .show   = projid_m_show,
};

static bool mappings_overlap(struct uid_gid_map *new_map,
                             struct uid_gid_extent *extent)
{
        u32 upper_first, lower_first, upper_last, lower_last;
        unsigned idx;

        upper_first = extent->first;
        lower_first = extent->lower_first;
        upper_last = upper_first + extent->count - 1;
        lower_last = lower_first + extent->count - 1;

        for (idx = 0; idx < new_map->nr_extents; idx++) {
                u32 prev_upper_first, prev_lower_first;
                u32 prev_upper_last, prev_lower_last;
                struct uid_gid_extent *prev;

                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        prev = &new_map->extent[idx];
                else
                        prev = &new_map->forward[idx];

                prev_upper_first = prev->first;
                prev_lower_first = prev->lower_first;
                prev_upper_last = prev_upper_first + prev->count - 1;
                prev_lower_last = prev_lower_first + prev->count - 1;

                /* Does the upper range intersect a previous extent? */
                if ((prev_upper_first <= upper_last) &&
                    (prev_upper_last >= upper_first))
                        return true;

                /* Does the lower range intersect a previous extent? */
                if ((prev_lower_first <= lower_last) &&
                    (prev_lower_last >= lower_first))
                        return true;
        }
        return false;
}

/*
 * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
 * Takes care to allocate a 4K block of memory if the number of mappings exceeds
 * UID_GID_MAP_MAX_BASE_EXTENTS.
 *
 * @extent is copied to the end of @map and the extent count is bumped.
 * Returns 0 on success or -ENOMEM if the larger array cannot be allocated.
 * No upper bound is checked here.
 * NOTE(review): presumably the caller keeps nr_extents below
 * UID_GID_MAP_MAX_EXTENTS -- confirm against map_write().
 */
static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
{
        struct uid_gid_extent *dest;

        /* The embedded array is exactly full: move over to a heap array. */
        if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
                struct uid_gid_extent *forward;

                /* Allocate memory for 340 mappings. */
                forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS,
                                        sizeof(struct uid_gid_extent),
                                        GFP_KERNEL);
                if (!forward)
                        return -ENOMEM;

                /* Copy over memory. Only set up memory for the forward pointer.
                 * Defer the memory setup for the reverse pointer.
                 */
                memcpy(forward, map->extent,
                       map->nr_extents * sizeof(map->extent[0]));

                /*
                 * Only now publish the pointers; the copy above must come
                 * first.  NOTE(review): presumably because extent[] shares
                 * storage with forward/reverse -- confirm in the struct
                 * definition.
                 */
                map->forward = forward;
                map->reverse = NULL;
        }

        /* Pick the array that currently holds the extents. */
        if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
                dest = &map->extent[map->nr_extents];
        else
                dest = &map->forward[map->nr_extents];

        *dest = *extent;
        map->nr_extents++;
        return 0;
}

/*
 * sort() comparison callback for the forward array: orders extents by
 * ascending ->first.  Returns -1, 0 or 1.
 */
static int cmp_extents_forward(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        return (lhs->first > rhs->first) - (lhs->first < rhs->first);
}

/*
 * sort() comparison callback for the reverse array: orders extents by
 * ascending ->lower_first.  Returns -1, 0 or 1.
 */
static int cmp_extents_reverse(const void *a, const void *b)
{
        const struct uid_gid_extent *lhs = a;
        const struct uid_gid_extent *rhs = b;

        return (lhs->lower_first > rhs->lower_first) -
               (lhs->lower_first < rhs->lower_first);
}

/*
 * sort_idmaps - Sorts an array of idmap entries.
 * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
 */
static int sort_idmaps(struct uid_gid_map *map)
{
        if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                return 0;

        /* Sort forward array. */
        sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_forward, NULL);

        /* Only copy the memory from forward we actually need. */
        map->reverse = kmemdup_array(map->forward, map->nr_extents,
                                     sizeof(struct uid_gid_extent), GFP_KERNEL);
        if (!map->reverse)
                return -ENOMEM;

        /* Sort reverse array. */
        sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
             cmp_extents_reverse, NULL);

        return 0;
}

/**
 * verify_root_map() - check the uid 0 mapping
 * @file: idmapping file
 * @map_ns: user namespace of the target process
 * @new_map: requested idmap
 *
 * If a process requests mapping parent uid 0 into the new ns, verify that the
 * process writing the map had the CAP_SETFCAP capability as the target process
 * will be able to write fscaps that are valid in ancestor user namespaces.
 *
 * A map without an extent whose lower range starts at id 0 is always allowed.
 *
 * Return: true if the mapping is allowed, false if not.
 */
static bool verify_root_map(const struct file *file,
                            struct user_namespace *map_ns,
                            struct uid_gid_map *new_map)
{
        const struct user_namespace *file_ns = file->f_cred->user_ns;
        struct uid_gid_extent *extent0 = NULL;
        unsigned idx;   /* unsigned, like nr_extents which it is compared to */

        /* Look for an extent that maps parent id 0. */
        for (idx = 0; idx < new_map->nr_extents; idx++) {
                struct uid_gid_extent *cur;

                if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        cur = &new_map->extent[idx];
                else
                        cur = &new_map->forward[idx];

                if (cur->lower_first == 0) {
                        extent0 = cur;
                        break;
                }
        }

        if (!extent0)
                return true;

        if (map_ns == file_ns) {
                /* The process unshared its ns and is writing to its own
                 * /proc/self/uid_map.  User already has full capabilities in
                 * the new namespace.  Verify that the parent had CAP_SETFCAP
                 * when it unshared.
                 */
                return file_ns->parent_could_setfcap;
        }

        /* Process p1 is writing to uid_map of p2, who is in a child
         * user namespace to p1's.  Verify that the opener of the map
         * file has CAP_SETFCAP against the parent of the new map
         * namespace.
         */
        return file_ns_capable(file, map_ns->parent, CAP_SETFCAP);
}

/**
 * map_write() - parse and install an id mapping written by userspace
 * @file: file being written; its ->private_data is the seq_file whose
 *        ->private is the user namespace the map belongs to
 * @buf: user buffer holding one "first lower_first count" triple per line
 * @count: number of bytes in @buf; must be smaller than PAGE_SIZE
 * @ppos: file position; must be 0 on entry, set to @count on success
 * @cap_setid: capability governing this map (CAP_SETUID, CAP_SETGID), or a
 *             value for which cap_valid() is false when none is required
 * @map: the map to install into; must still be empty (nr_extents == 0)
 * @parent_map: map used to translate each extent's lower ids via
 *              map_id_range_down()
 *
 * Return: @count on success.  -EINVAL for a bad offset/size, a malformed
 * line, a wrapping/zero-length/overlapping extent, too many extents, or an
 * empty map.  -EPERM if the map was already written, the opener lacks
 * CAP_SYS_ADMIN over the target namespace (only checked when @cap_setid is
 * valid), new_idmap_permitted() refuses the map, or an extent cannot be
 * mapped down through @parent_map.  Errors from memdup_user_nul(),
 * insert_extent() and sort_idmaps() are passed through.
 */
static ssize_t map_write(struct file *file, const char __user *buf,
                         size_t count, loff_t *ppos,
                         int cap_setid,
                         struct uid_gid_map *map,
                         struct uid_gid_map *parent_map)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *map_ns = seq->private;
        struct uid_gid_map new_map;
        unsigned idx;
        struct uid_gid_extent extent;
        char *kbuf, *pos, *next_line;
        ssize_t ret;

        /* Only allow < page size writes at the beginning of the file */
        if ((*ppos != 0) || (count >= PAGE_SIZE))
                return -EINVAL;

        /* Slurp in the user data (NUL-terminated kernel copy) */
        kbuf = memdup_user_nul(buf, count);
        if (IS_ERR(kbuf))
                return PTR_ERR(kbuf);

        /*
         * The userns_state_mutex serializes all writes to any given map.
         *
         * Any map is only ever written once.
         *
         * An id map fits within 1 cache line on most architectures.
         *
         * On read nothing needs to be done unless you are on an
         * architecture with a crazy cache coherency model like alpha.
         *
         * There is a one time data dependency between reading the
         * count of the extents and the values of the extents.  The
         * desired behavior is to see the values of the extents that
         * were written before the count of the extents.
         *
         * To achieve this smp_wmb() is used to guarantee the write
         * order and smp_rmb() guarantees that we don't have crazy
         * architectures returning stale data.
         */
        mutex_lock(&userns_state_mutex);

        /* Build the map in a local copy; it is only published at the end. */
        memset(&new_map, 0, sizeof(struct uid_gid_map));

        ret = -EPERM;
        /* Only allow one successful write to the map */
        if (map->nr_extents != 0)
                goto out;

        /*
         * Adjusting namespace settings requires capabilities on the target.
         * The check is skipped when cap_setid is not a valid capability.
         */
        if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN))
                goto out;

        /* Parse the user data, one extent per line */
        ret = -EINVAL;
        pos = kbuf;
        for (; pos; pos = next_line) {

                /*
                 * Find the end of line and ensure I don't look past it.
                 * The newline is overwritten with NUL; a newline that is
                 * the last character of the buffer ends the loop.
                 */
                next_line = strchr(pos, '\n');
                if (next_line) {
                        *next_line = '\0';
                        next_line++;
                        if (*next_line == '\0')
                                next_line = NULL;
                }

                /* First field: start of the range inside the namespace */
                pos = skip_spaces(pos);
                extent.first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* Second field: start of the range in the parent */
                pos = skip_spaces(pos);
                extent.lower_first = simple_strtoul(pos, &pos, 10);
                if (!isspace(*pos))
                        goto out;

                /* Third field: length; may be followed by end of line */
                pos = skip_spaces(pos);
                extent.count = simple_strtoul(pos, &pos, 10);
                if (*pos && !isspace(*pos))
                        goto out;

                /* Verify there is not trailing junk on the line */
                pos = skip_spaces(pos);
                if (*pos != '\0')
                        goto out;

                /* Verify we have been given valid starting values */
                if ((extent.first == (u32) -1) ||
                    (extent.lower_first == (u32) -1))
                        goto out;

                /* Verify count is not zero and does not cause the
                 * extent to wrap
                 */
                if ((extent.first + extent.count) <= extent.first)
                        goto out;
                if ((extent.lower_first + extent.count) <=
                     extent.lower_first)
                        goto out;

                /* Do the ranges in extent overlap any previous extents? */
                if (mappings_overlap(&new_map, &extent))
                        goto out;

                /*
                 * Refuse further input once this extent would bring the
                 * map to UID_GID_MAP_MAX_EXTENTS and more lines follow.
                 */
                if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
                    (next_line != NULL))
                        goto out;

                ret = insert_extent(&new_map, &extent);
                if (ret < 0)
                        goto out;
                /* Restore the default error for the next iteration */
                ret = -EINVAL;
        }
        /* Be very certain the new map actually exists */
        if (new_map.nr_extents == 0)
                goto out;

        ret = -EPERM;
        /* Validate the user is allowed to use user id's mapped to. */
        if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map))
                goto out;

        ret = -EPERM;
        /* Map the lower ids from the parent user namespace to the
         * kernel global id space.
         */
        for (idx = 0; idx < new_map.nr_extents; idx++) {
                struct uid_gid_extent *e;
                u32 lower_first;

                /* Small maps use ->extent[], larger ones use ->forward[] */
                if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
                        e = &new_map.extent[idx];
                else
                        e = &new_map.forward[idx];

                lower_first = map_id_range_down(parent_map,
                                                e->lower_first,
                                                e->count);

                /* Fail if we can not map the specified extent to
                 * the kernel global id space.
                 */
                if (lower_first == (u32) -1)
                        goto out;

                e->lower_first = lower_first;
        }

        /*
         * If we want to use binary search for lookup, this clones the extent
         * array and sorts both copies.
         */
        ret = sort_idmaps(&new_map);
        if (ret < 0)
                goto out;

        /* Install the map: extents first, count last (see comment above) */
        if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
                memcpy(map->extent, new_map.extent,
                       new_map.nr_extents * sizeof(new_map.extent[0]));
        } else {
                map->forward = new_map.forward;
                map->reverse = new_map.reverse;
        }
        smp_wmb();
        map->nr_extents = new_map.nr_extents;

        *ppos = count;
        ret = count;
out:
        /*
         * On failure with a map too large for the inline array, release
         * the separately allocated forward/reverse arrays and make sure
         * the target map is left empty.
         */
        if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
                kfree(new_map.forward);
                kfree(new_map.reverse);
                map->forward = NULL;
                map->reverse = NULL;
                map->nr_extents = 0;
        }

        mutex_unlock(&userns_state_mutex);
        kfree(kbuf);
        return ret;
}

/*
 * Write handler for the uid map: check where the write comes from, then let
 * map_write() parse and install the uid mapping.
 *
 * Returns -EPERM when the target namespace has no parent, or when the
 * namespace reported by seq_user_ns() is neither the target namespace nor
 * its parent; otherwise returns the result of map_write().
 */
ssize_t proc_uid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *target = seq->private;
        struct user_namespace *ns_of_seq = seq_user_ns(seq);

        /* Without a parent there is no parent map to translate through. */
        if (target->parent == NULL)
                return -EPERM;

        /* Only the namespace itself or its direct parent may write. */
        if (ns_of_seq != target && ns_of_seq != target->parent)
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETUID,
                         &target->uid_map, &target->parent->uid_map);
}

/*
 * Write handler for the gid map: check where the write comes from, then let
 * map_write() parse and install the gid mapping.
 *
 * Returns -EPERM when the target namespace has no parent, or when the
 * namespace reported by seq_user_ns() is neither the target namespace nor
 * its parent; otherwise returns the result of map_write().
 */
ssize_t proc_gid_map_write(struct file *file, const char __user *buf,
                           size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *target = seq->private;
        struct user_namespace *ns_of_seq = seq_user_ns(seq);

        /* Without a parent there is no parent map to translate through. */
        if (target->parent == NULL)
                return -EPERM;

        /* Only the namespace itself or its direct parent may write. */
        if (ns_of_seq != target && ns_of_seq != target->parent)
                return -EPERM;

        return map_write(file, buf, size, ppos, CAP_SETGID,
                         &target->gid_map, &target->parent->gid_map);
}

/*
 * Write handler for the project id map: check where the write comes from,
 * then let map_write() parse and install the projid mapping.
 *
 * Returns -EPERM when the target namespace has no parent, or when the
 * namespace reported by seq_user_ns() is neither the target namespace nor
 * its parent; otherwise returns the result of map_write().
 */
ssize_t proc_projid_map_write(struct file *file, const char __user *buf,
                              size_t size, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *target = seq->private;
        struct user_namespace *ns_of_seq = seq_user_ns(seq);

        /* Without a parent there is no parent map to translate through. */
        if (target->parent == NULL)
                return -EPERM;

        /* Only the namespace itself or its direct parent may write. */
        if (ns_of_seq != target && ns_of_seq != target->parent)
                return -EPERM;

        /*
         * Anyone can set any valid project id, no capability needed:
         * -1 is passed in place of a capability number.
         */
        return map_write(file, buf, size, ppos, -1,
                         &target->projid_map, &target->parent->projid_map);
}

/*
 * Decide whether the opener of @file may install @new_map for @ns.
 *
 * @cap_setid is CAP_SETUID or CAP_SETGID for uid/gid maps, or a value that
 * fails cap_valid() for maps that need no capability.
 *
 * Returns true if the mapping is permitted, false otherwise.
 */
static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map *new_map)
{
        const struct cred *opener = file->f_cred;
        bool single_id;

        /* uid maps must additionally pass the parent-uid-0 check. */
        if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map))
                return false;

        /*
         * Unprivileged case: exactly one id is mapped and the opener's euid
         * owns the namespace.  Only allow what would have been allowed
         * without the establishment of unprivileged mappings.
         */
        single_id = (new_map->nr_extents == 1) &&
                    (new_map->extent[0].count == 1);
        if (single_id && uid_eq(ns->owner, opener->euid)) {
                u32 lower = new_map->extent[0].lower_first;

                if (cap_setid == CAP_SETUID) {
                        /* Mapping the opener's own euid is always fine. */
                        kuid_t mapped_uid = make_kuid(ns->parent, lower);

                        if (uid_eq(mapped_uid, opener->euid))
                                return true;
                } else if (cap_setid == CAP_SETGID) {
                        /*
                         * Mapping the opener's own egid is fine only while
                         * USERNS_SETGROUPS_ALLOWED is clear for @ns.
                         */
                        kgid_t mapped_gid = make_kgid(ns->parent, lower);

                        if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) &&
                            gid_eq(mapped_gid, opener->egid))
                                return true;
                }
        }

        /* Allow anyone to set a mapping that doesn't require privilege */
        if (!cap_valid(cap_setid))
                return true;

        /*
         * Otherwise both the current task and the opener of the id file
         * need the capability (CAP_SETUID or CAP_SETGID) over the parent
         * user namespace.
         */
        return ns_capable(ns->parent, cap_setid) &&
               file_ns_capable(file, ns->parent, cap_setid);
}

/*
 * seq_file show handler: print "allow" or "deny" (followed by a newline)
 * according to whether USERNS_SETGROUPS_ALLOWED is set in the namespace
 * flags.  Always returns 0.
 */
int proc_setgroups_show(struct seq_file *seq, void *v)
{
        struct user_namespace *ns = seq->private;
        /* Single snapshot of the flags; no lock is taken here. */
        unsigned long snapshot = READ_ONCE(ns->flags);
        const char *state;

        if (snapshot & USERNS_SETGROUPS_ALLOWED)
                state = "allow";
        else
                state = "deny";

        seq_printf(seq, "%s\n", state);
        return 0;
}

/*
 * Write handler for the setgroups control file.  Accepts "allow" or "deny",
 * optionally followed by whitespace, written at offset 0.
 *
 * Returns @count on success, -EINVAL for a bad offset/length or an
 * unrecognised string, -EFAULT if the user buffer cannot be copied, and
 * -EPERM if the requested transition is refused.
 */
ssize_t proc_setgroups_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        struct seq_file *seq = file->private_data;
        struct user_namespace *ns = seq->private;
        char kbuf[8];
        const char *rest;
        bool want_allow;
        bool accepted = false;

        /* Only allow a very narrow range of strings to be written */
        if (*ppos != 0 || count >= sizeof(kbuf))
                return -EINVAL;

        /* Fetch and terminate what was written */
        if (copy_from_user(kbuf, buf, count))
                return -EFAULT;
        kbuf[count] = '\0';

        /* Work out which keyword was requested */
        if (strncmp(kbuf, "allow", 5) == 0) {
                rest = kbuf + 5;
                want_allow = true;
        } else if (strncmp(kbuf, "deny", 4) == 0) {
                rest = kbuf + 4;
                want_allow = false;
        } else {
                return -EINVAL;
        }

        /* Nothing but whitespace may follow the keyword */
        rest = skip_spaces(rest);
        if (*rest != '\0')
                return -EINVAL;

        mutex_lock(&userns_state_mutex);
        if (want_allow) {
                /*
                 * "allow" only succeeds while setgroups is still allowed;
                 * re-enabling it after it was disabled is refused.
                 */
                accepted = (ns->flags & USERNS_SETGROUPS_ALLOWED) != 0;
        } else if (ns->gid_map.nr_extents == 0) {
                /*
                 * "deny" is only honoured before the gid_map has been
                 * written; it clears the flag permanently.
                 */
                ns->flags &= ~USERNS_SETGROUPS_ALLOWED;
                accepted = true;
        }
        mutex_unlock(&userns_state_mutex);

        if (!accepted)
                return -EPERM;

        /* Report a successful write */
        *ppos = count;
        return count;
}

/*
 * Report whether setgroups may be used in @ns: a gid mapping must have been
 * established and USERNS_SETGROUPS_ALLOWED must still be set.  Both are
 * sampled under userns_state_mutex.
 */
bool userns_may_setgroups(const struct user_namespace *ns)
{
        bool gid_map_written;
        bool flag_set;

        mutex_lock(&userns_state_mutex);
        /* It is not safe to use setgroups before a gid mapping exists. */
        gid_map_written = (ns->gid_map.nr_extents != 0);
        flag_set = (ns->flags & USERNS_SETGROUPS_ALLOWED) != 0;
        mutex_unlock(&userns_state_mutex);

        return gid_map_written && flag_set;
}

/*
 * Returns true if @child is the same namespace as @ancestor or one of its
 * descendants.
 *
 * Climbs the ->parent chain from @child until reaching a namespace whose
 * level is no deeper than @ancestor's, then compares the two pointers.
 */
bool in_userns(const struct user_namespace *ancestor,
               const struct user_namespace *child)
{
        const struct user_namespace *cursor = child;

        while (cursor->level > ancestor->level)
                cursor = cursor->parent;

        return cursor == ancestor;
}

/*
 * Returns true if the current task's user namespace is @target_ns or a
 * descendant of it (see in_userns()).
 */
bool current_in_userns(const struct user_namespace *target_ns)
{
        const struct user_namespace *mine = current_user_ns();

        return in_userns(target_ns, mine);
}
EXPORT_SYMBOL(current_in_userns);

/* Convert an embedded ns_common back to its containing user_namespace. */
static inline struct user_namespace *to_user_ns(struct ns_common *ns)
{
        struct user_namespace *outer;

        outer = container_of(ns, struct user_namespace, ns);
        return outer;
}

/*
 * Take a reference on @task's user namespace and return its ns_common,
 * or NULL if get_user_ns() handed back NULL.  The task's credentials are
 * only dereferenced inside the RCU read-side section.
 */
static struct ns_common *userns_get(struct task_struct *task)
{
        struct user_namespace *pinned;

        rcu_read_lock();
        pinned = get_user_ns(__task_cred(task)->user_ns);
        rcu_read_unlock();

        if (!pinned)
                return NULL;

        return &pinned->ns;
}

/* Drop a reference on the user namespace that embeds @ns. */
static void userns_put(struct ns_common *ns)
{
        struct user_namespace *user_ns = to_user_ns(ns);

        put_user_ns(user_ns);
}

static int userns_install(struct nsset *nsset, struct ns_common *ns)
{
        struct user_namespace *user_ns = to_user_ns(ns);
        struct cred *cred;

        /* Don't allow gaining capabilities by reentering
         * the same user namespace.
         */
        if (user_ns == current_user_ns())
                return -EINVAL;

        /* Tasks that share a thread group must share a user namespace */
        if (!thread_group_empty(current))
                return -EINVAL;

        if (current->fs->users != 1)
                return -EINVAL;

        if (!ns_capable(user_ns, CAP_SYS_ADMIN))
                return -EPERM;

        cred = nsset_cred(nsset);
        if (!cred)
                return -EINVAL;

        put_user_ns(cred->user_ns);
        set_cred_user_ns(cred, get_user_ns(user_ns));

        if (set_cred_ucounts(cred) < 0)
                return -EINVAL;

        return 0;
}

/*
 * Return the ns_common of the user namespace that owns @ns, with a
 * reference taken, provided that owner is the current user namespace or
 * one of its descendants.  Returns ERR_PTR(-EPERM) otherwise, including
 * when the ->owner() op reports no owner.
 */
struct ns_common *ns_get_owner(struct ns_common *ns)
{
        struct user_namespace *viewer_ns = current_user_ns();
        struct user_namespace *owner = ns->ops->owner(ns);
        struct user_namespace *walk = owner;

        /* Climb from the owner looking for the caller's namespace. */
        while (walk && walk != viewer_ns)
                walk = walk->parent;

        /* Ran off the top of the hierarchy without finding it. */
        if (!walk)
                return ERR_PTR(-EPERM);

        return &get_user_ns(owner)->ns;
}

static struct user_namespace *userns_owner(struct ns_common *ns)
{
        return to_user_ns(ns)->parent;
}

/*
 * Namespace operations table for user namespaces.  Since ->owner() yields
 * the parent namespace, ->get_parent can reuse the generic ns_get_owner().
 */
const struct proc_ns_operations userns_operations = {
        .name           = "user",
        .type           = CLONE_NEWUSER,
        .get            = userns_get,
        .put            = userns_put,
        .install        = userns_install,
        .owner          = userns_owner,
        .get_parent     = ns_get_owner,
};

/*
 * Boot-time setup: create the slab cache used for struct user_namespace.
 * NOTE(review): the result is not checked; presumably SLAB_PANIC makes
 * creation failure fatal - confirm against the slab documentation.
 */
static __init int user_namespaces_init(void)
{
        user_ns_cachep = KMEM_CACHE(user_namespace,
                                    SLAB_ACCOUNT | SLAB_PANIC);

        return 0;
}
subsys_initcall(user_namespaces_init);











































































































































































































































































































































































































   40 














   39 
   40 






























































































   39 




  572 





  566 















   39 





   40 







   40 





   39 

   40 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
 * Authors: David Chinner and Glauber Costa
 *
 * Generic LRU infrastructure
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
#include "slab.h"
#include "internal.h"

#ifdef CONFIG_MEMCG
/* All registered memcg-aware list_lrus, linked through list_lru::list. */
static LIST_HEAD(memcg_list_lrus);
/* Protects additions to, removals from and walks of memcg_list_lrus. */
static DEFINE_MUTEX(list_lrus_mutex);

/* Report whether @lru was set up as memcg aware. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        bool aware = lru->memcg_aware;

        return aware;
}

/*
 * Add a memcg-aware @lru to the global memcg_list_lrus list.  Lrus that are
 * not memcg aware are not tracked.
 */
static void list_lru_register(struct list_lru *lru)
{
        if (list_lru_memcg_aware(lru)) {
                mutex_lock(&list_lrus_mutex);
                list_add(&lru->list, &memcg_list_lrus);
                mutex_unlock(&list_lrus_mutex);
        }
}

/*
 * Remove a memcg-aware @lru from the global memcg_list_lrus list.  Lrus
 * that are not memcg aware were never added, so nothing is done for them.
 */
static void list_lru_unregister(struct list_lru *lru)
{
        if (list_lru_memcg_aware(lru)) {
                mutex_lock(&list_lrus_mutex);
                list_del(&lru->list);
                mutex_unlock(&list_lrus_mutex);
        }
}

/* Return the shrinker id recorded in @lru. */
static int lru_shrinker_id(struct list_lru *lru)
{
        int id = lru->shrinker_id;

        return id;
}

/*
 * Find the per-node list for (@nid, @idx).
 *
 * For a memcg-aware @lru and a non-negative @idx, the per-memcg structure
 * is looked up in lru->xa; NULL is returned when no entry is stored there.
 * In every other case the node's own list is returned.
 */
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
        struct list_lru_memcg *mlru;

        if (!list_lru_memcg_aware(lru) || idx < 0)
                return &lru->node[nid].lru;

        mlru = xa_load(&lru->xa, idx);
        if (!mlru)
                return NULL;

        return &mlru->node[nid];
}

/*
 * Find and lock the list_lru_one for (@nid, @memcg).
 *
 * @irq selects spin_lock_irq() over spin_lock().  A list whose nr_items
 * reads LONG_MIN is treated as dead and is unlocked again.  When no live
 * list is found: with @skip_empty set, NULL is returned; otherwise the
 * lookup is retried with the parent of @memcg.
 *
 * On success the list is returned with its lock held and the RCU read lock
 * already dropped; the caller releases the lock with unlock_list_lru().
 */
static inline struct list_lru_one *
lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                       bool irq, bool skip_empty)
{
        struct list_lru_one *l;
        long nr_items;

        /* The lookup below runs inside an RCU read-side section. */
        rcu_read_lock();
again:
        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
        if (likely(l)) {
                if (irq)
                        spin_lock_irq(&l->lock);
                else
                        spin_lock(&l->lock);
                nr_items = READ_ONCE(l->nr_items);
                /* LONG_MIN marks a dead list; anything else is usable. */
                if (likely(nr_items != LONG_MIN)) {
                        rcu_read_unlock();
                        return l;
                }
                /* Dead list: drop its lock and fall through to retry. */
                if (irq)
                        spin_unlock_irq(&l->lock);
                else
                        spin_unlock(&l->lock);
        }
        /*
         * Caller may simply bail out if raced with reparenting or
         * may iterate through the list_lru and expect empty slots.
         */
        if (skip_empty) {
                rcu_read_unlock();
                return NULL;
        }
        /* A missing or dead list is only expected for a dying memcg. */
        VM_WARN_ON(!css_is_dying(&memcg->css));
        memcg = parent_mem_cgroup(memcg);
        goto again;
}

/*
 * Release the lock of @l.  @irq_off must match how the lock was taken:
 * true pairs with spin_lock_irq(), false with spin_lock().
 */
static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
{
        if (!irq_off) {
                spin_unlock(&l->lock);
                return;
        }

        spin_unlock_irq(&l->lock);
}
#else
/* !CONFIG_MEMCG: there is no memcg lru list to maintain, so do nothing. */
static void list_lru_register(struct list_lru *lru)
{
}

/* !CONFIG_MEMCG: nothing was registered, so there is nothing to remove. */
static void list_lru_unregister(struct list_lru *lru)
{
}

/* !CONFIG_MEMCG: always reports -1 as the shrinker id. */
static int lru_shrinker_id(struct list_lru *lru)
{
        return -1;
}

/* !CONFIG_MEMCG: no lru is ever memcg aware. */
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
        return false;
}

static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
{
        return &lru->node[nid].lru;
}

/*
 * !CONFIG_MEMCG: lock and return the node's own list.  @memcg and
 * @skip_empty are ignored; @irq selects spin_lock_irq() over spin_lock().
 * Never returns NULL.
 */
static inline struct list_lru_one *
lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                       bool irq, bool skip_empty)
{
        struct list_lru_one *l = &lru->node[nid].lru;

        if (!irq)
                spin_lock(&l->lock);
        else
                spin_lock_irq(&l->lock);

        return l;
}

/*
 * Release the lock of @l.  @irq_off must match how the lock was taken:
 * true pairs with spin_lock_irq(), false with spin_lock().
 */
static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off)
{
        if (!irq_off) {
                spin_unlock(&l->lock);
                return;
        }

        spin_unlock_irq(&l->lock);
}
#endif /* CONFIG_MEMCG */

/*
 * Add @item to the tail of the list for (@nid, @memcg) unless it is already
 * on a list (i.e. its list_head is not empty).
 *
 * The caller must ensure the memcg lifetime.
 *
 * Returns true if the item was added, false otherwise.
 */
bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
                  struct mem_cgroup *memcg)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;

        l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
        if (!l)
                return false;

        /* Already linked somewhere: leave it alone. */
        if (!list_empty(item)) {
                unlock_list_lru(l, false);
                return false;
        }

        list_add_tail(item, &l->list);
        /* Set shrinker bit if the first element was added */
        if (l->nr_items++ == 0)
                set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
        unlock_list_lru(l, false);

        /* The per-node total is atomic and updated outside the list lock. */
        atomic_long_inc(&nlru->nr_items);
        return true;
}

/*
 * Add @item to @lru, deriving the node id from the page that holds @item
 * and, for memcg-aware lrus, the memcg from mem_cgroup_from_slab_obj().
 * The memcg lookup and the add are done under the RCU read lock.
 *
 * Returns what list_lru_add() returns.
 */
bool list_lru_add_obj(struct list_lru *lru, struct list_head *item)
{
        int nid = page_to_nid(virt_to_page(item));
        bool added;

        if (!list_lru_memcg_aware(lru))
                return list_lru_add(lru, item, nid, NULL);

        rcu_read_lock();
        added = list_lru_add(lru, item, nid, mem_cgroup_from_slab_obj(item));
        rcu_read_unlock();

        return added;
}
EXPORT_SYMBOL_GPL(list_lru_add_obj);

/*
 * Remove @item from the list for (@nid, @memcg) if it is currently linked
 * (i.e. its list_head is not empty).
 *
 * The caller must ensure the memcg lifetime.
 *
 * Returns true if the item was removed, false otherwise.
 */
bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
                  struct mem_cgroup *memcg)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l;

        l = lock_list_lru_of_memcg(lru, nid, memcg, false, false);
        if (!l)
                return false;

        /* Not on any list: nothing to remove. */
        if (list_empty(item)) {
                unlock_list_lru(l, false);
                return false;
        }

        list_del_init(item);
        l->nr_items--;
        unlock_list_lru(l, false);

        /* The per-node total is atomic and updated outside the list lock. */
        atomic_long_dec(&nlru->nr_items);
        return true;
}

/*
 * Remove @item from @lru, deriving the node id from the page that holds
 * @item and, for memcg-aware lrus, the memcg from
 * mem_cgroup_from_slab_obj().  The memcg lookup and the removal are done
 * under the RCU read lock.
 *
 * Returns what list_lru_del() returns.
 */
bool list_lru_del_obj(struct list_lru *lru, struct list_head *item)
{
        int nid = page_to_nid(virt_to_page(item));
        bool removed;

        if (!list_lru_memcg_aware(lru))
                return list_lru_del(lru, item, nid, NULL);

        rcu_read_lock();
        removed = list_lru_del(lru, item, nid, mem_cgroup_from_slab_obj(item));
        rcu_read_unlock();

        return removed;
}
EXPORT_SYMBOL_GPL(list_lru_del_obj);

/*
 * Unlink @item (leaving it re-initialised) and account for its removal in
 * @list->nr_items.  No locking is done here.
 */
void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
{
        list->nr_items -= 1;
        list_del_init(item);
}
EXPORT_SYMBOL_GPL(list_lru_isolate);

/*
 * Move @item onto @head and account for its removal in @list->nr_items.
 * No locking is done here.
 */
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head)
{
        list->nr_items -= 1;
        list_move(item, head);
}
EXPORT_SYMBOL_GPL(list_lru_isolate_move);

/*
 * Return the number of items on the list for (@nid, @memcg).
 *
 * The count is sampled without taking the list lock.  A missing list
 * counts as 0, and a negative sample is also reported as 0.
 */
unsigned long list_lru_count_one(struct list_lru *lru,
                                 int nid, struct mem_cgroup *memcg)
{
        struct list_lru_one *l;
        long nr = 0;

        rcu_read_lock();
        l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
        if (l)
                nr = READ_ONCE(l->nr_items);
        rcu_read_unlock();

        if (unlikely(nr < 0))
                return 0;

        return nr;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);

/* Return the per-node item total kept in lru->node[@nid].nr_items. */
unsigned long list_lru_count_node(struct list_lru *lru, int nid)
{
        return atomic_long_read(&lru->node[nid].nr_items);
}
EXPORT_SYMBOL_GPL(list_lru_count_node);

/*
 * Walk the list for (@nid, @memcg), calling @isolate on each item until the
 * list is exhausted or *@nr_to_walk reaches zero.
 *
 * @isolate:    callback invoked as isolate(item, list, cb_arg); its return
 *              value decides what happens to the item and to the walk
 * @cb_arg:     opaque argument passed through to @isolate
 * @nr_to_walk: remaining item budget, decremented once per visited item
 * @irq_off:    take/release the list lock with the _irq variants
 *
 * Returns the number of items for which the callback reported
 * LRU_REMOVED or LRU_REMOVED_RETRY.  Returns early (with whatever was
 * isolated so far) if the list cannot be locked.
 */
static unsigned long
__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                    list_lru_walk_cb isolate, void *cb_arg,
                    unsigned long *nr_to_walk, bool irq_off)
{
        struct list_lru_node *nlru = &lru->node[nid];
        struct list_lru_one *l = NULL;
        struct list_head *item, *n;
        unsigned long isolated = 0;

restart:
        /* skip_empty=true: a missing or dead list ends the walk. */
        l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true);
        if (!l)
                return isolated;
        list_for_each_safe(item, n, &l->list) {
                enum lru_status ret;

                /*
                 * decrement nr_to_walk first so that we don't livelock if we
                 * get stuck on large numbers of LRU_RETRY items
                 */
                if (!*nr_to_walk)
                        break;
                --*nr_to_walk;

                ret = isolate(item, l, cb_arg);
                switch (ret) {
                /*
                 * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru
                 * lock. List traversal will have to restart from scratch.
                 */
                case LRU_RETRY:
                        goto restart;
                case LRU_REMOVED_RETRY:
                        fallthrough;
                case LRU_REMOVED:
                        /*
                         * Only the per-node total is adjusted here;
                         * presumably the callback already updated the
                         * per-list count (e.g. via list_lru_isolate()).
                         */
                        isolated++;
                        atomic_long_dec(&nlru->nr_items);
                        if (ret == LRU_REMOVED_RETRY)
                                goto restart;
                        break;
                case LRU_ROTATE:
                        list_move_tail(item, &l->list);
                        break;
                case LRU_SKIP:
                        break;
                case LRU_STOP:
                        /* Lock already dropped by the callback: skip unlock. */
                        goto out;
                default:
                        BUG();
                }
        }
        unlock_list_lru(l, irq_off);
out:
        return isolated;
}

/*
 * Walk the list for (@nid, @memcg) using the plain (non-irq) lock variants.
 * See __list_lru_walk_one() for the meaning of the arguments and result.
 */
unsigned long
list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                  list_lru_walk_cb isolate, void *cb_arg,
                  unsigned long *nr_to_walk)
{
        const bool irq_off = false;

        return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg,
                                   nr_to_walk, irq_off);
}
EXPORT_SYMBOL_GPL(list_lru_walk_one);

/*
 * Walk the list for (@nid, @memcg) using the _irq lock variants.
 * See __list_lru_walk_one() for the meaning of the arguments and result.
 */
unsigned long
list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
                      list_lru_walk_cb isolate, void *cb_arg,
                      unsigned long *nr_to_walk)
{
        const bool irq_off = true;

        return __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg,
                                   nr_to_walk, irq_off);
}

/*
 * Walk every list of @lru on node @nid: first the list used for a NULL
 * memcg, then (CONFIG_MEMCG and memcg-aware lrus only) the list of each
 * memcg that has an entry in lru->xa, until *@nr_to_walk is used up.
 *
 * Returns the total number of items isolated.
 */
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
                                 list_lru_walk_cb isolate, void *cb_arg,
                                 unsigned long *nr_to_walk)
{
        long isolated = list_lru_walk_one(lru, nid, NULL, isolate, cb_arg,
                                          nr_to_walk);

#ifdef CONFIG_MEMCG
        if (*nr_to_walk && list_lru_memcg_aware(lru)) {
                struct list_lru_memcg *mlru;
                struct mem_cgroup *memcg;
                unsigned long index;

                xa_for_each(&lru->xa, index, mlru) {
                        bool pinned;

                        /* Pin the memcg so it stays valid during the walk. */
                        rcu_read_lock();
                        memcg = mem_cgroup_from_id(index);
                        pinned = mem_cgroup_tryget(memcg);
                        rcu_read_unlock();
                        if (!pinned)
                                continue;

                        isolated += __list_lru_walk_one(lru, nid, memcg,
                                                        isolate, cb_arg,
                                                        nr_to_walk, false);
                        mem_cgroup_put(memcg);

                        /* Budget exhausted: stop visiting further memcgs. */
                        if (!*nr_to_walk)
                                break;
                }
        }
#endif

        return isolated;
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

/*
 * Initialise one list_lru_one: empty list, zero item count, fresh lock.
 * With CONFIG_LOCKDEP, the lock is put in @lru's lock class when one is
 * provided via lru->key.
 */
static void init_one_lru(struct list_lru *lru, struct list_lru_one *l)
{
        l->nr_items = 0;
        INIT_LIST_HEAD(&l->list);
        spin_lock_init(&l->lock);
#ifdef CONFIG_LOCKDEP
        if (lru->key != NULL)
                lockdep_set_class(&l->lock, lru->key);
#endif
}

#ifdef CONFIG_MEMCG
/*
 * Allocate a list_lru_memcg with room for nr_node_ids per-node lists and
 * initialise the list of every node visited by for_each_node().
 *
 * Returns the new structure, or NULL if the allocation fails.
 */
static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp)
{
        struct list_lru_memcg *mlru;
        int nid;

        mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
        if (mlru) {
                for_each_node(nid)
                        init_one_lru(lru, &mlru->node[nid]);
        }

        return mlru;
}

/*
 * Record whether @lru is memcg aware and, if so, initialise its xarray
 * with XA_FLAGS_LOCK_IRQ.
 */
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
        lru->memcg_aware = memcg_aware;
        if (!memcg_aware)
                return;

        xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ);
}

/*
 * Free every list_lru_memcg stored in lru->xa and clear its slot, holding
 * the xarray lock with interrupts disabled.  Does nothing for lrus that
 * are not memcg aware.
 */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
        XA_STATE(xas, &lru->xa, 0);
        struct list_lru_memcg *entry;

        if (list_lru_memcg_aware(lru)) {
                xas_lock_irq(&xas);
                xas_for_each(&xas, entry, ULONG_MAX) {
                        kfree(entry);
                        xas_store(&xas, NULL);
                }
                xas_unlock_irq(&xas);
        }
}

/*
 * Move everything on @src (one per-node list of the memcg being reparented)
 * onto the list that @dst_memcg owns for node @nid in @lru, then mark @src
 * as dead.
 *
 * The src lock is taken first, with interrupts disabled; the dst lock nests
 * inside it and is annotated with SINGLE_DEPTH_NESTING.
 */
static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid,
                                        struct list_lru_one *src,
                                        struct mem_cgroup *dst_memcg)
{
        int dst_idx = dst_memcg->kmemcg_id;
        struct list_lru_one *dst;

        spin_lock_irq(&src->lock);
        /* The destination list is looked up only once src is locked */
        dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
        spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING);

        list_splice_init(&src->list, &dst->list);
        if (src->nr_items) {
                /* A negative count here would mean src was already dead */
                WARN_ON(src->nr_items < 0);
                dst->nr_items += src->nr_items;
                /* dst gained items: flag it for the lru's shrinker on this node */
                set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
        }
        /* Mark the list_lru_one dead */
        src->nr_items = LONG_MIN;

        spin_unlock(&dst->lock);
        spin_unlock_irq(&src->lock);
}

/*
 * Hand all list_lru items owned by @memcg over to @parent.
 *
 * For every lru on memcg_list_lrus, the list_lru_memcg stored at @memcg's
 * kmemcg_id is removed from the xarray, each of its per-node lists is
 * spliced into @parent's matching list, and the now empty object is freed
 * through kvfree_rcu(). The whole walk runs under list_lrus_mutex.
 */
void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
{
        struct list_lru *lru;
        int i;

        mutex_lock(&list_lrus_mutex);
        list_for_each_entry(lru, &memcg_list_lrus, list) {
                struct list_lru_memcg *mlru;
                XA_STATE(xas, &lru->xa, memcg->kmemcg_id);

                /*
                 * Lock the Xarray to ensure no on going list_lru_memcg
                 * allocation and further allocation will see css_is_dying().
                 */
                xas_lock_irq(&xas);
                mlru = xas_store(&xas, NULL);
                xas_unlock_irq(&xas);
                /* Nothing was ever allocated for this memcg on this lru */
                if (!mlru)
                        continue;

                /*
                 * With Xarray value set to NULL, holding the lru lock below
                 * prevents list_lru_{add,del,isolate} from touching the lru,
                 * safe to reparent.
                 */
                for_each_node(i)
                        memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent);

                /*
                 * Here all list_lrus corresponding to the cgroup are guaranteed
                 * to remain empty, we can safely free this lru, any further
                 * memcg_list_lru_alloc() call will simply bail out.
                 */
                kvfree_rcu(mlru, rcu);
        }
        mutex_unlock(&list_lrus_mutex);
}

static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
                                            struct list_lru *lru)
{
        int idx = memcg->kmemcg_id;

        return idx < 0 || xa_load(&lru->xa, idx);
}

/*
 * Make sure @memcg and all of its ancestors have a list_lru_memcg installed
 * in @lru's xarray.
 *
 * Returns 0 straight away when @lru is not memcg aware or @memcg is already
 * populated, -ENOMEM if a list_lru_memcg cannot be allocated, and otherwise
 * the xas_error() state of the xarray cursor once the loops are done.
 */
int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
                         gfp_t gfp)
{
        unsigned long flags;
        struct list_lru_memcg *mlru = NULL;
        struct mem_cgroup *pos, *parent;
        XA_STATE(xas, &lru->xa, 0);

        if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
                return 0;

        /* Only keep the caller's flags that are part of GFP_RECLAIM_MASK */
        gfp &= GFP_RECLAIM_MASK;
        /*
         * Because the list_lru can be reparented to the parent cgroup's
         * list_lru, we should make sure that this cgroup and all its
         * ancestors have allocated list_lru_memcg.
         */
        do {
                /*
                 * Keep looking for the farthest ancestor that is not yet
                 * populated, stopping at memcg itself when all of its
                 * ancestors are.
                 */
                pos = memcg;
                parent = parent_mem_cgroup(pos);
                while (!memcg_list_lru_allocated(parent, lru)) {
                        pos = parent;
                        parent = parent_mem_cgroup(pos);
                }

                /* Reuse an allocation that a previous iteration did not install */
                if (!mlru) {
                        mlru = memcg_init_list_lru_one(lru, gfp);
                        if (!mlru)
                                return -ENOMEM;
                }
                xas_set(&xas, pos->kmemcg_id);
                do {
                        xas_lock_irqsave(&xas, flags);
                        /*
                         * Install only into an empty slot of a cgroup that is
                         * not dying. Once the store succeeds the xarray owns
                         * mlru, so forget the local pointer.
                         */
                        if (!xas_load(&xas) && !css_is_dying(&pos->css)) {
                                xas_store(&xas, mlru);
                                if (!xas_error(&xas))
                                        mlru = NULL;
                        }
                        xas_unlock_irqrestore(&xas, flags);
                } while (xas_nomem(&xas, gfp));
        } while (pos != memcg && !css_is_dying(&pos->css));

        /* Drop an allocation that never made it into the xarray */
        if (unlikely(mlru))
                kfree(mlru);

        return xas_error(&xas);
}
#else
/* !CONFIG_MEMCG stub: there is no per-memcg state to set up. */
static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
{
}

/* !CONFIG_MEMCG stub: there is no per-memcg state to tear down. */
static void memcg_destroy_list_lru(struct list_lru *lru)
{
}
#endif /* CONFIG_MEMCG */

/*
 * Initialise @lru: record the shrinker id (memcg builds only), allocate the
 * per-node array and set up each node's list, configure memcg awareness and
 * finally register the lru.
 *
 * Returns 0 on success, or -ENOMEM if the per-node array cannot be
 * allocated.
 */
int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker)
{
        int nid;

#ifdef CONFIG_MEMCG
        /* -1 stands for "no shrinker attached" */
        lru->shrinker_id = shrinker ? shrinker->id : -1;

        /* With kmem accounting disabled the lru cannot be memcg aware */
        if (mem_cgroup_kmem_disabled())
                memcg_aware = false;
#endif

        lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
        if (!lru->node)
                return -ENOMEM;

        for_each_node(nid)
                init_one_lru(lru, &lru->node[nid].lru);

        memcg_init_list_lru(lru, memcg_aware);
        list_lru_register(lru);

        return 0;
}
EXPORT_SYMBOL_GPL(__list_lru_init);

/*
 * Tear down @lru: unregister it, free any per-memcg lists and release the
 * per-node array. Calling this on an lru whose node array is NULL (never
 * initialised, or already destroyed) is a no-op.
 */
void list_lru_destroy(struct list_lru *lru)
{
        /* Already destroyed or not yet initialized? */
        if (!lru->node)
                return;

        list_lru_unregister(lru);
        memcg_destroy_list_lru(lru);

#ifdef CONFIG_MEMCG
        lru->shrinker_id = -1;
#endif

        /* A NULL node array is what marks the lru as destroyed */
        kfree(lru->node);
        lru->node = NULL;
}
EXPORT_SYMBOL_GPL(list_lru_destroy);















































  248 





































































































































































































































































































































































































































































































































































































































































































































































































   97 



   97 

   97 

































































































































































































































   68 


   68 

   68 










   45 


   45 

   44 
























   24 


   24 





















































































































































































































































































































































































































































































































































































































































































   25 
   24 










   25 



   25 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 - Columbia University and Linaro Ltd.
 * Author: Jintack Lim <jintack.lim@linaro.org>
 */

#include <linux/bitfield.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>

#include <asm/fixmap.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/sysreg.h>

#include "sys_regs.h"

/*
 * Software "TLB" entry describing a translation of the guest's VNCR_EL2.
 * NOTE(review): presumably one instance per vcpu - confirm against the
 * users of this structure.
 */
struct vncr_tlb {
        /* The guest's VNCR_EL2 */
        u64                        gva;
        /* Stage-1 walk parameters and walk result for this entry */
        struct s1_walk_info        wi;
        struct s1_walk_result        wr;

        /* NOTE(review): presumably the host PA backing @gva - confirm */
        u64                        hpa;

        /* -1 when not mapped on a CPU */
        int                        cpu;

        /*
         * true if the TLB is valid. Can only be changed with the
         * mmu_lock held.
         */
        bool                        valid;
};

/*
 * Number of live shadow S2 MMUs per vcpu: the nested_mmus array is sized
 * as the number of online vcpus multiplied by this value. This is a
 * trade-off between memory usage and the potential number of different
 * sets of S2 PTs in the guests. Running out of S2 MMUs only affects
 * performance (we will invalidate them more often).
 */
#define S2_MMU_PER_VCPU                2

/*
 * Reset the VM-wide nested virtualisation state: no shadow S2 MMU array
 * allocated yet, and a VNCR map count of zero.
 */
void kvm_init_nested(struct kvm *kvm)
{
        atomic_set(&kvm->arch.vncr_map_count, 0);

        kvm->arch.nested_mmus_size = 0;
        kvm->arch.nested_mmus = NULL;
}

/*
 * Initialise @mmu as a nested (shadow) stage-2 MMU of @kvm, sized from the
 * PA range exposed to the guest. Returns whatever kvm_init_stage2_mmu()
 * returns.
 */
static int init_nested_s2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu)
{
        /*
         * We only initialise the IPA range on the canonical MMU, which
         * defines the contract between KVM and userspace on where the
         * "hardware" is in the IPA space. This affects the validity of MMIO
         * exits forwarded to userspace, for example.
         *
         * For nested S2s, we use the PARange as exposed to the guest, as it
         * is allowed to use it at will to expose whatever memory map it
         * wants to its own guests as it would be on real HW.
         */
        return kvm_init_stage2_mmu(kvm, mmu, kvm_get_pa_bits(kvm));
}

/*
 * Set up the per-vcpu and per-VM state needed for nested virtualisation.
 *
 * Allocates the vcpu's zeroed vncr_array page if it does not exist yet,
 * grows kvm->arch.nested_mmus to S2_MMU_PER_VCPU entries per online vcpu
 * and initialises the newly added entries.
 *
 * Returns 0 on success, -EINVAL if the E2H0 vcpu feature is requested
 * without the ARM64_HAS_HCR_NV1 capability, -ENOMEM on allocation failure,
 * or the error returned by init_nested_s2_mmu().
 */
int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvm_s2_mmu *tmp;
        int num_mmus, ret = 0;

        if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features) &&
            !cpus_have_final_cap(ARM64_HAS_HCR_NV1))
                return -EINVAL;

        /* The page is kept across calls, so only allocate it once */
        if (!vcpu->arch.ctxt.vncr_array)
                vcpu->arch.ctxt.vncr_array = (u64 *)__get_free_page(GFP_KERNEL_ACCOUNT |
                                                                    __GFP_ZERO);

        if (!vcpu->arch.ctxt.vncr_array)
                return -ENOMEM;

        /*
         * Let's treat memory allocation failures as benign: If we fail to
         * allocate anything, return an error and keep the allocated array
         * alive. Userspace may try to recover by initializing the vcpu
         * again, and there is no reason to affect the whole VM for this.
         */
        num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
        tmp = kvrealloc(kvm->arch.nested_mmus,
                        size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus),
                        GFP_KERNEL_ACCOUNT | __GFP_ZERO);
        if (!tmp)
                return -ENOMEM;

        /* After the swap, tmp holds the previous array pointer */
        swap(kvm->arch.nested_mmus, tmp);

        /*
         * If we went through a reallocation, adjust the MMU back-pointers in
         * the previously initialised kvm_pgtable structures.
         */
        if (kvm->arch.nested_mmus != tmp)
                for (int i = 0; i < kvm->arch.nested_mmus_size; i++)
                        kvm->arch.nested_mmus[i].pgt->mmu = &kvm->arch.nested_mmus[i];

        /* Initialise only the entries added by this call; stop at the first error */
        for (int i = kvm->arch.nested_mmus_size; !ret && i < num_mmus; i++)
                ret = init_nested_s2_mmu(kvm, &kvm->arch.nested_mmus[i]);

        if (ret) {
                /* Undo the new entries and release this vcpu's vncr_array */
                for (int i = kvm->arch.nested_mmus_size; i < num_mmus; i++)
                        kvm_free_stage2_pgd(&kvm->arch.nested_mmus[i]);

                free_page((unsigned long)vcpu->arch.ctxt.vncr_array);
                vcpu->arch.ctxt.vncr_array = NULL;

                return ret;
        }

        /* Publish the new size only once every new entry is initialised */
        kvm->arch.nested_mmus_size = num_mmus;

        return 0;
}

/* Parameters of a software walk of the guest hypervisor's stage-2 tables */
struct s2_walk_info {
        /*
         * Callback reading the 64bit descriptor located at @pa into @desc;
         * a negative return value aborts the walk. @data is the cookie
         * stored below.
         */
        int             (*read_desc)(phys_addr_t pa, u64 *desc, void *data);
        /* Opaque cookie handed to read_desc() */
        void             *data;
        /* Translation table base; kvm_walk_nested_s2() loads it from VTTBR_EL2 */
        u64             baddr;
        /* Maximum output address size, in bits */
        unsigned int max_oa_bits;
        /* log2 of the granule size: 12, 14 or 16 */
        unsigned int pgshift;
        /* VTCR_EL2.SL0, from which the starting level is derived */
        unsigned int sl;
        /* VTCR_EL2.T0SZ; the input address size is 64 - t0sz bits */
        unsigned int t0sz;
        /* Descriptors are big-endian (taken from SCTLR_EL2.EE) */
        bool             be;
};

/* Merge the two low bits of the walk @level into the fault status code @fsc */
static u32 compute_fsc(int level, u32 fsc)
{
        const u32 level_bits = level & 0x3;

        return fsc | level_bits;
}

/*
 * Build a stage-2 fault syndrome: the vcpu's current ESR with its FSC field
 * replaced by @fsc combined with @level.
 */
static int esr_s2_fault(struct kvm_vcpu *vcpu, int level, u32 fsc)
{
        u32 esr = kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC;

        return esr | compute_fsc(level, fsc);
}

/* Size, in bits, of the input address range described by the walk's T0SZ */
static int get_ia_size(struct s2_walk_info *wi)
{
        unsigned int ia_bits = 64 - wi->t0sz;

        return ia_bits;
}

/*
 * Validate the starting @level of a stage-2 walk against the granule size,
 * the input address size and the number of entries that the starting-level
 * table would hold.
 *
 * Returns 0 if the configuration is acceptable, -EFAULT otherwise.
 */
static int check_base_s2_limits(struct s2_walk_info *wi,
                                int level, int input_size, int stride)
{
        const int ia_size = get_ia_size(wi);
        const unsigned long granule = BIT(wi->pgshift);
        int start_size;

        /* Check translation limits */
        if (granule == SZ_64K) {
                if (level == 0 || (level == 1 && ia_size <= 42))
                        return -EFAULT;
        } else if (granule == SZ_16K) {
                if (level == 0 || (level == 1 && ia_size <= 40))
                        return -EFAULT;
        } else if (granule == SZ_4K) {
                if (level < 0 || (level == 0 && ia_size <= 42))
                        return -EFAULT;
        }

        /* Check input size limits */
        if (input_size > ia_size)
                return -EFAULT;

        /* Check number of entries in starting level table */
        start_size = input_size - ((3 - level) * stride + wi->pgshift);
        if (start_size < 1)
                return -EFAULT;
        if (start_size > stride + 4)
                return -EFAULT;

        return 0;
}

/*
 * Check that @output fits in the walk's maximum output address size.
 * Returns 0 when it does, -1 when any bit from max_oa_bits up to bit 47
 * is set.
 */
static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
{
        const unsigned int oa_bits = wi->max_oa_bits;

        /* With a 48bit output size there are no upper bits to check */
        if (oa_bits == 48)
                return 0;

        return (output & GENMASK_ULL(47, oa_bits)) ? -1 : 0;
}

/*
 * This is essentially a C-version of the pseudo code from the ARM ARM
 * AArch64.TranslationTableWalk  function.  I strongly recommend looking at
 * that pseudocode in trying to understand this.
 *
 * Walks the stage-2 tables described by @wi for the address @ipa and fills
 * in @out.
 *
 * Returns 0 when the translation succeeds, 1 when the walk hits a fault
 * that is reported through @out->esr (and, except for a bad base address,
 * @out->desc), and a negative error code when the configuration is invalid
 * or reading a descriptor fails.
 *
 * Must be called with the kvm->srcu read lock held
 */
static int walk_nested_s2_pgd(phys_addr_t ipa,
                              struct s2_walk_info *wi, struct kvm_s2_trans *out)
{
        int first_block_level, level, stride, input_size, base_lower_bound;
        phys_addr_t base_addr;
        unsigned int addr_top, addr_bottom;
        u64 desc;  /* page table entry */
        int ret;
        phys_addr_t paddr;

        /* Derive the starting level and the first level that allows blocks */
        switch (BIT(wi->pgshift)) {
        default:
        case SZ_64K:
        case SZ_16K:
                level = 3 - wi->sl;
                first_block_level = 2;
                break;
        case SZ_4K:
                level = 2 - wi->sl;
                first_block_level = 1;
                break;
        }

        /* Each table level resolves (pgshift - 3) bits: 8 byte descriptors */
        stride = wi->pgshift - 3;
        input_size = get_ia_size(wi);
        if (input_size > 48 || input_size < 25)
                return -EFAULT;

        ret = check_base_s2_limits(wi, level, input_size, stride);
        if (WARN_ON(ret))
                return ret;

        /* Keep only the base address bits that apply to the starting table */
        base_lower_bound = 3 + input_size - ((3 - level) * stride +
                           wi->pgshift);
        base_addr = wi->baddr & GENMASK_ULL(47, base_lower_bound);

        if (check_output_size(wi, base_addr)) {
                out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
                return 1;
        }

        addr_top = input_size - 1;

        while (1) {
                phys_addr_t index;

                /* Byte offset of the descriptor selected by ipa at this level */
                addr_bottom = (3 - level) * stride + wi->pgshift;
                index = (ipa & GENMASK_ULL(addr_top, addr_bottom))
                        >> (addr_bottom - 3);

                paddr = base_addr | index;
                ret = wi->read_desc(paddr, &desc, wi->data);
                if (ret < 0)
                        return ret;

                /*
                 * Handle reversed descriptors if endianness differs between
                 * the host and the guest hypervisor.
                 */
                if (wi->be)
                        desc = be64_to_cpu((__force __be64)desc);
                else
                        desc = le64_to_cpu((__force __le64)desc);

                /* Check for valid descriptor at this point */
                if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
                        out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
                        out->desc = desc;
                        return 1;
                }

                /* We're at the final level or block translation level */
                if ((desc & 3) == 1 || level == 3)
                        break;

                if (check_output_size(wi, desc)) {
                        out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
                        out->desc = desc;
                        return 1;
                }

                /* Table descriptor: move on to the next level table */
                base_addr = desc & GENMASK_ULL(47, wi->pgshift);

                level += 1;
                addr_top = addr_bottom - 1;
        }

        /* A block descriptor above the first level allowing blocks is a fault */
        if (level < first_block_level) {
                out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
                out->desc = desc;
                return 1;
        }

        if (check_output_size(wi, desc)) {
                out->esr = compute_fsc(level, ESR_ELx_FSC_ADDRSZ);
                out->desc = desc;
                return 1;
        }

        /* Bit 10 clear is reported as an access flag fault */
        if (!(desc & BIT(10))) {
                out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
                out->desc = desc;
                return 1;
        }

        addr_bottom += contiguous_bit_shift(desc, wi, level);

        /* Calculate and return the result */
        paddr = (desc & GENMASK_ULL(47, addr_bottom)) |
                (ipa & GENMASK_ULL(addr_bottom - 1, 0));
        out->output = paddr;
        out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
        /* Descriptor bits 6 and 7 carry the read and write permissions */
        out->readable = desc & (0b01 << 6);
        out->writable = desc & (0b10 << 6);
        out->level = level;
        out->desc = desc;
        return 0;
}

/*
 * read_desc() callback for guest stage-2 walks: fetch one descriptor from
 * guest memory at @pa. @data is the vcpu on whose behalf the walk runs.
 */
static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data)
{
        struct kvm_vcpu *vcpu = data;
        struct kvm *kvm = vcpu->kvm;

        return kvm_read_guest(kvm, pa, desc, sizeof(*desc));
}

/*
 * Decode a VTCR_EL2 value into the walk parameters it controls: granule
 * shift, T0SZ, starting level field and maximum output address size.
 */
static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
{
        const u64 tg0 = vtcr & VTCR_EL2_TG0_MASK;

        if (tg0 == VTCR_EL2_TG0_4K)
                wi->pgshift = 12;
        else if (tg0 == VTCR_EL2_TG0_16K)
                wi->pgshift = 14;
        else        /* 64k, or IMPDEF: treat any other value as 64k */
                wi->pgshift = 16;

        wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
        wi->sl = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr);

        /* Global limit for now, should eventually be per-VM */
        wi->max_oa_bits = min(get_kvm_ipa_limit(),
                              ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr)));
}

/*
 * Translate @gipa through the guest hypervisor's stage-2 page tables, as
 * described by its VTTBR_EL2 and VTCR_EL2 (with SCTLR_EL2.EE selecting the
 * descriptor endianness).
 *
 * Returns 0 with @result filled in on success, or straight away (with only
 * @result->esr cleared) when the vcpu has no NV support. Any other value is
 * what walk_nested_s2_pgd() returned, in which case the non-FSC bits of the
 * vcpu's current ESR are merged into @result->esr.
 */
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
                       struct kvm_s2_trans *result)
{
        u64 vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
        struct s2_walk_info wi;
        int ret;

        result->esr = 0;

        if (!vcpu_has_nv(vcpu))
                return 0;

        /* Describe the walk: descriptor accessor, table base, VTCR controls */
        wi.read_desc = read_guest_s2_desc;
        wi.data = vcpu;
        wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
        vtcr_to_walk_info(vtcr, &wi);
        wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;

        ret = walk_nested_s2_pgd(gipa, &wi, result);
        if (!ret)
                return 0;

        /* The walk only provided the FSC; take the rest from the live ESR */
        result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);

        return ret;
}

/*
 * Convert a TTL hint (granule in bits [3:2], level in bits [1:0]) into the
 * size of the mapping it describes. Returns 0 when the encoding carries no
 * size information.
 */
static unsigned int ttl_to_size(u8 ttl)
{
        /*
         * Mapping size for each (granule, level) pair. Entries that are not
         * listed are zero, meaning "no size information".
         */
        static const unsigned int size_by_ttl[4][4] = {
                [TLBI_TTL_TG_4K] = {
                        [1] = SZ_1G,
                        [2] = SZ_2M,
                        [3] = SZ_4K,
                },
                [TLBI_TTL_TG_16K] = {
                        [2] = SZ_32M,
                        [3] = SZ_16K,
                },
                /* No 52bit IPA support, hence nothing at level 1 */
                [TLBI_TTL_TG_64K] = {
                        [2] = SZ_512M,
                        [3] = SZ_64K,
                },
        };
        const unsigned int level = ttl & 3;
        const unsigned int gran = (ttl >> 2) & 3;

        return size_by_ttl[gran][level];
}

/*
 * Build a TTL value from a granule shift (12, 14 or 16) and a level: the
 * granule encoding lands in bits [3:2] and the level in bits [1:0].
 * Any other shift is a bug.
 */
static u8 pgshift_level_to_ttl(u16 shift, u8 level)
{
        u8 gran;

        if (shift == 12)
                gran = TLBI_TTL_TG_4K;
        else if (shift == 14)
                gran = TLBI_TTL_TG_16K;
        else if (shift == 16)
                gran = TLBI_TTL_TG_64K;
        else
                BUG();

        return (gran << 2) | (level & 3);
}

/*
 * Compute the equivalent of the TTL field by parsing the shadow PT.  The
 * granule size is extracted from the cached VTCR_EL2.TG0 while the level is
 * retrieved from first entry carrying the level as a tag.
 *
 * The shadow S2 is probed at @addr aligned down to each block size of the
 * granule in turn, smallest first. Returns the TTL built from the first
 * valid leaf carrying a non-zero level tag, provided the range it describes
 * includes @addr; returns 0 otherwise, or when every block size has been
 * tried without finding such a leaf.
 *
 * The caller must hold the mmu_lock for write.
 */
static u8 get_guest_mapping_ttl(struct kvm_s2_mmu *mmu, u64 addr)
{
        u64 tmp, sz = 0, vtcr = mmu->tlb_vtcr;
        kvm_pte_t pte;
        u8 ttl, level;

        lockdep_assert_held_write(&kvm_s2_mmu_to_kvm(mmu)->mmu_lock);

        /* Seed the TTL with the granule encoding; the level is added later */
        switch (vtcr & VTCR_EL2_TG0_MASK) {
        case VTCR_EL2_TG0_4K:
                ttl = (TLBI_TTL_TG_4K << 2);
                break;
        case VTCR_EL2_TG0_16K:
                ttl = (TLBI_TTL_TG_16K << 2);
                break;
        case VTCR_EL2_TG0_64K:
        default:            /* IMPDEF: treat any other value as 64k */
                ttl = (TLBI_TTL_TG_64K << 2);
                break;
        }

        tmp = addr;

again:
        /* Iteratively compute the block sizes for a particular granule size */
        switch (vtcr & VTCR_EL2_TG0_MASK) {
        case VTCR_EL2_TG0_4K:
                if        (sz < SZ_4K)        sz = SZ_4K;
                else if (sz < SZ_2M)        sz = SZ_2M;
                else if (sz < SZ_1G)        sz = SZ_1G;
                else                        sz = 0;
                break;
        case VTCR_EL2_TG0_16K:
                if        (sz < SZ_16K)        sz = SZ_16K;
                else if (sz < SZ_32M)        sz = SZ_32M;
                else                        sz = 0;
                break;
        case VTCR_EL2_TG0_64K:
        default:            /* IMPDEF: treat any other value as 64k */
                if        (sz < SZ_64K)        sz = SZ_64K;
                else if (sz < SZ_512M)        sz = SZ_512M;
                else                        sz = 0;
                break;
        }

        /* Every block size has been tried without success */
        if (sz == 0)
                return 0;

        /* Align the probe address down to the current block size */
        tmp &= ~(sz - 1);
        if (kvm_pgtable_get_leaf(mmu->pgt, tmp, &pte, NULL))
                goto again;
        if (!(pte & PTE_VALID))
                goto again;
        /* A zero tag means this entry carries no level information */
        level = FIELD_GET(KVM_NV_GUEST_MAP_SZ, pte);
        if (!level)
                goto again;

        ttl |= level;

        /*
         * We now have found some level information in the shadow S2. Check
         * that the resulting range is actually including the original IPA.
         */
        sz = ttl_to_size(ttl);
        if (addr < (tmp + sz))
                return ttl;

        return 0;
}

/*
 * Work out how many bytes a stage-2 TLB invalidation described by @val
 * should cover on the shadow MMU @mmu.
 *
 * The TTL field of @val is used when it is non-zero and the guest has
 * FEAT_TTL; otherwise the shadow stage-2 is consulted for a level hint.
 * If neither yields a size, the largest block size of the granule cached
 * in mmu->tlb_vtcr is returned.
 */
unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        u8 ttl = FIELD_GET(TLBI_TTL_MASK, val);
        unsigned long range;

        /* No usable TTL: ask the shadow S2 for a hint instead */
        if (!ttl || !kvm_has_feat(kvm, ID_AA64MMFR2_EL1, TTL, IMP))
                ttl = get_guest_mapping_ttl(mmu,
                                            (val & GENMASK_ULL(35, 0)) << 12);

        range = ttl_to_size(ttl);
        if (range)
                return range;

        /* Still nothing: fall back to the maximum extent for the granule */
        switch (mmu->tlb_vtcr & VTCR_EL2_TG0_MASK) {
        case VTCR_EL2_TG0_4K:
                range = SZ_1G;
                break;
        case VTCR_EL2_TG0_16K:
                range = SZ_32M;
                break;
        case VTCR_EL2_TG0_64K:
        default:    /* IMPDEF: treat any other value as 64k */
                /*
                 * 52bit IPA is not supported in nested yet. Once it is,
                 * this should become 4TB.
                 */
                range = SZ_512M;
                break;
        }

        WARN_ON(!range);
        return range;
}

/*
 * We can have multiple *different* MMU contexts with the same VMID:
 *
 * - S2 being enabled or not, hence differing by the HCR_EL2.VM bit
 *
 * - Multiple vcpus using private S2s (huh huh...), hence differing by the
 *   VTTBR_EL2.BADDR address
 *
 * - A combination of the above...
 *
 * We can always identify which MMU context to pick at run-time.  However,
 * TLB invalidation involving a VMID must take action on all the TLBs using
 * this particular VMID. This translates into applying the same invalidation
 * operation to all the contexts that are using this VMID. Moar phun!
 */
/*
 * Invoke @tlbi_callback, with @info, on every valid shadow stage-2 MMU of
 * @kvm whose cached VTTBR carries @vmid. The walk is done with
 * kvm->mmu_lock held for write.
 */
void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
                                const union tlbi_info *info,
                                void (*tlbi_callback)(struct kvm_s2_mmu *,
                                                      const union tlbi_info *))
{
        int idx;

        write_lock(&kvm->mmu_lock);

        for (idx = 0; idx < kvm->arch.nested_mmus_size; idx++) {
                struct kvm_s2_mmu *ctx = &kvm->arch.nested_mmus[idx];

                /* Only valid contexts tagged with this VMID are of interest */
                if (kvm_s2_mmu_valid(ctx) && vmid == get_vmid(ctx->tlb_vttbr))
                        tlbi_callback(ctx, info);
        }

        write_unlock(&kvm->mmu_lock);
}

/*
 * Search the shadow stage-2 MMU contexts of the vcpu's VM for one matching
 * the vcpu's current VTTBR_EL2, VTCR_EL2 and HCR_EL2.VM state.
 *
 * Must be called with kvm->mmu_lock held for write. Returns the matching
 * context, or NULL when there is none.
 */
struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        u64 vttbr, vtcr, hcr;
        bool s2_on;
        int idx;

        lockdep_assert_held_write(&kvm->mmu_lock);

        vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
        vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
        hcr = vcpu_read_sys_reg(vcpu, HCR_EL2);

        s2_on = hcr & HCR_VM;

        /* The CnP bit takes no part in the vttbr match */
        vttbr &= ~VTTBR_CNP_BIT;

        /*
         * The matching rule depends on whether the guest has S2 enabled:
         *
         * - S2 enabled: the context must be S2-enabled too and match the
         *   full VTTBR (VMID+BADDR) as well as VTCR, which keeps us safe
         *   from TLB conflicts (a broken guest cannot generate them),
         *
         * - S2 disabled: the context must be S2-disabled and only the VMID
         *   has to match, as TLBs are tagged by VMID even when S2
         *   translation is off.
         */
        for (idx = 0; idx < kvm->arch.nested_mmus_size; idx++) {
                struct kvm_s2_mmu *ctx = &kvm->arch.nested_mmus[idx];

                if (!kvm_s2_mmu_valid(ctx))
                        continue;

                /* Both sides must agree on S2 being enabled or not */
                if (!s2_on != !ctx->nested_stage2_enabled)
                        continue;

                if (s2_on) {
                        if (vttbr == ctx->tlb_vttbr && vtcr == ctx->tlb_vtcr)
                                return ctx;
                } else {
                        if (get_vmid(vttbr) == get_vmid(ctx->tlb_vttbr))
                                return ctx;
                }
        }

        return NULL;
}

/*
 * Return a shadow stage-2 MMU context for the vcpu's current
 * VTTBR_EL2/VTCR_EL2/HCR_EL2.VM state, with its refcnt incremented.
 *
 * An existing matching context (see lookup_s2_mmu()) is reused. Otherwise
 * an unreferenced context is picked round-robin, flagged for unmap if it
 * was still valid, and re-keyed to the vcpu's current state.
 *
 * Must be called with kvm->mmu_lock held for write.
 */
static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvm_s2_mmu *s2_mmu;
        int i;

        lockdep_assert_held_write(&vcpu->kvm->mmu_lock);

        /* Fast path: a context already matches the vcpu's state */
        s2_mmu = lookup_s2_mmu(vcpu);
        if (s2_mmu)
                goto out;

        /*
         * Make sure we don't always search from the same point, or we
         * will always reuse a potentially active context, leaving
         * free contexts unused.
         */
        /* Visit each slot at most once, wrapping around the array */
        for (i = kvm->arch.nested_mmus_next;
             i < (kvm->arch.nested_mmus_size + kvm->arch.nested_mmus_next);
             i++) {
                s2_mmu = &kvm->arch.nested_mmus[i % kvm->arch.nested_mmus_size];

                if (atomic_read(&s2_mmu->refcnt) == 0)
                        break;
        }
        BUG_ON(atomic_read(&s2_mmu->refcnt)); /* We have struct MMUs to spare */

        /* Set the scene for the next search */
        kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size;

        /* Make sure we don't forget to do the laundry */
        if (kvm_s2_mmu_valid(s2_mmu))
                s2_mmu->pending_unmap = true;

        /*
         * The virtual VMID (modulo CnP) will be used as a key when matching
         * an existing kvm_s2_mmu.
         *
         * We cache VTCR at allocation time, once and for all. It'd be great
         * if the guest didn't screw that one up, as this is not very
         * forgiving...
         */
        s2_mmu->tlb_vttbr = vcpu_read_sys_reg(vcpu, VTTBR_EL2) & ~VTTBR_CNP_BIT;
        s2_mmu->tlb_vtcr = vcpu_read_sys_reg(vcpu, VTCR_EL2);
        s2_mmu->nested_stage2_enabled = vcpu_read_sys_reg(vcpu, HCR_EL2) & HCR_VM;

out:
        /* Both the reuse and the allocation path take a reference */
        atomic_inc(&s2_mmu->refcnt);

        /*
         * Set the vCPU request to perform an unmap, even if the pending unmap
         * originates from another vCPU. This guarantees that the MMU has been
         * completely unmapped before any vCPU actually uses it, and allows
         * multiple vCPUs to lend a hand with completing the unmap.
         */
        if (s2_mmu->pending_unmap)
                kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu);

        return s2_mmu;
}

/*
 * Put a shadow stage-2 MMU context in its initial state: unreferenced,
 * with nested stage-2 disabled, and marked invalid.
 */
void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
{
        atomic_set(&mmu->refcnt, 0);
        mmu->nested_stage2_enabled = false;

        /* A cached VTTBR with CnP set is how an invalid entry is encoded */
        mmu->tlb_vttbr = VTTBR_CNP_BIT;
}

void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
{
        /*
         * If the vCPU kept its reference on the MMU after the last put,
         * keep rolling with it.
         */
        if (is_hyp_ctxt(vcpu)) {
                if (!vcpu->arch.hw_mmu)
                        vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu;
        } else {
                if (!vcpu->arch.hw_mmu) {
                        scoped_guard(write_lock, &vcpu->kvm->mmu_lock)
                                vcpu->arch.hw_mmu = get_s2_mmu_nested(vcpu);
                }

                if (__vcpu_sys_reg(vcpu, HCR_EL2) & HCR_NV)
                        kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
        }
}

void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu)
{
        /* Unconditionally drop the VNCR mapping if we have one */
        if (host_data_test_flag(L1_VNCR_MAPPED)) {
                BUG_ON(vcpu->arch.vncr_tlb->cpu != smp_processor_id());
                BUG_ON(is_hyp_ctxt(vcpu));

                clear_fixmap(vncr_fixmap(vcpu->arch.vncr_tlb->cpu));
                vcpu->arch.vncr_tlb->cpu = -1;
                host_data_clear_flag(L1_VNCR_MAPPED);
                atomic_dec(&vcpu->kvm->arch.vncr_map_count);
        }

        /*
         * Keep a reference on the associated stage-2 MMU if the vCPU is
         * scheduling out and not in WFI emulation, suggesting it is likely to
         * reuse the MMU sometime soon.
         */
        if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI))
                return;

        if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu))
                atomic_dec(&vcpu->arch.hw_mmu->refcnt);

        vcpu->arch.hw_mmu = NULL;
}

/*
 * Decide whether a stage-2 permission fault must be forwarded to the next
 * level hypervisor, based on the guest's own stage-2 translation @trans.
 *
 * trans->esr is always reset to 0, and is set to the syndrome to inject
 * when the fault is forwarded. Returns non-zero if the fault is to be
 * forwarded, 0 otherwise.
 */
int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
{
        bool denied;

        trans->esr = 0;

        if (!kvm_vcpu_trap_is_permission_fault(vcpu))
                return 0;

        /* Check the access type against what the guest's S2 allows */
        if (kvm_vcpu_trap_is_iabt(vcpu))
                denied = !kvm_s2_trans_executable(trans);
        else if (kvm_is_write_fault(vcpu))
                denied = !trans->writable;
        else
                denied = !trans->readable;

        if (!denied)
                return 0;

        trans->esr = esr_s2_fault(vcpu, trans->level, ESR_ELx_FSC_PERM);
        return 1;
}

/*
 * Inject a stage-2 fault described by @esr_el2 into the guest hypervisor.
 *
 * The fault addresses recorded in vcpu->arch.fault are copied to the
 * guest's FAR_EL2 and HPFAR_EL2 before the synchronous exception is
 * injected. Returns the result of kvm_inject_nested_sync().
 */
int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2)
{
        vcpu_write_sys_reg(vcpu, vcpu->arch.fault.far_el2, FAR_EL2);
        vcpu_write_sys_reg(vcpu, vcpu->arch.fault.hpfar_el2, HPFAR_EL2);

        return kvm_inject_nested_sync(vcpu, esr_el2);
}

/*
 * Mark the VNCR pseudo-TLB entry @vt invalid and, if it is currently
 * associated with a CPU (vt->cpu != -1), clear that CPU's VNCR fixmap slot.
 */
static void invalidate_vncr(struct vncr_tlb *vt)
{
        vt->valid = false;
        if (vt->cpu != -1)
                clear_fixmap(vncr_fixmap(vt->cpu));
}

/*
 * Invalidate every vcpu's VNCR pseudo-TLB entry whose translation output
 * range overlaps the IPA range [@start, @end).
 *
 * Nothing is done unless the VM exposes NV2. Must be called with
 * kvm->mmu_lock held for write.
 */
static void kvm_invalidate_vncr_ipa(struct kvm *kvm, u64 start, u64 end)
{
        struct kvm_vcpu *vcpu;
        unsigned long i;

        lockdep_assert_held_write(&kvm->mmu_lock);

        if (!kvm_has_feat(kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
                return;

        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
                u64 ipa_start, ipa_end, ipa_size;

                /*
                 * Careful here: We end-up here from an MMU notifier,
                 * and this can race against a vcpu not being onlined
                 * yet, without the pseudo-TLB being allocated.
                 *
                 * Skip those, as they obviously don't participate in
                 * the invalidation at this stage.
                 */
                if (!vt)
                        continue;

                if (!vt->valid)
                        continue;

                ipa_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
                                                            vt->wr.level));
                /*
                 * Round the output address down to the base of the block
                 * mapping it. The mask must be inverted: keeping only the
                 * low bits would yield the offset inside the block and make
                 * the overlap check below meaningless.
                 */
                ipa_start = vt->wr.pa & ~(ipa_size - 1);
                ipa_end = ipa_start + ipa_size;

                /* No overlap between [ipa_start, ipa_end) and [start, end) */
                if (ipa_end <= start || ipa_start >= end)
                        continue;

                invalidate_vncr(vt);
        }
}

/*
 * Decoded description of a stage-1 TLB invalidation, as filled in by
 * compute_s1_tlbi_range() and consumed by invalidate_vncr_va().
 */
struct s1e2_tlbi_scope {
        enum {
                TLBI_ALL,       /* matches every entry */
                TLBI_VA,        /* matches on VA range, and on ASID for nG entries */
                TLBI_VAA,       /* matches on VA range only */
                TLBI_ASID,      /* matches nG entries with the given ASID */
        } type;

        u16 asid;               /* ASID to match (TLBI_VA, TLBI_ASID) */
        u64 va;                 /* start of the VA range (TLBI_VA, TLBI_VAA) */
        u64 size;               /* length of the VA range in bytes */
};

/*
 * Invalidate every vcpu's VNCR pseudo-TLB entry that falls within the
 * stage-1 invalidation described by @scope.
 *
 * Must be called with kvm->mmu_lock held for write.
 */
static void invalidate_vncr_va(struct kvm *kvm,
                               struct s1e2_tlbi_scope *scope)
{
        struct kvm_vcpu *vcpu;
        unsigned long i;

        lockdep_assert_held_write(&kvm->mmu_lock);

        kvm_for_each_vcpu(i, vcpu, kvm) {
                struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
                u64 va_start, va_end, va_size;

                /*
                 * A vcpu may have no pseudo-TLB at all:
                 * kvm_vcpu_allocate_vncr_tlb() only allocates one when NV2
                 * is exposed. Skip such vcpus, as kvm_invalidate_vncr_ipa()
                 * does.
                 */
                if (!vt)
                        continue;

                if (!vt->valid)
                        continue;

                va_size = ttl_to_size(pgshift_level_to_ttl(vt->wi.pgshift,
                                                           vt->wr.level));
                /*
                 * Round the cached VA down to the base of the block mapping
                 * it. The mask must be inverted: keeping only the low bits
                 * would yield the offset inside the block instead.
                 */
                va_start = vt->gva & ~(va_size - 1);
                va_end = va_start + va_size;

                switch (scope->type) {
                case TLBI_ALL:
                        break;

                case TLBI_VA:
                        /* Range must overlap... */
                        if (va_end <= scope->va ||
                            va_start >= (scope->va + scope->size))
                                continue;
                        /* ...and non-global entries must match the ASID */
                        if (vt->wr.nG && vt->wr.asid != scope->asid)
                                continue;
                        break;

                case TLBI_VAA:
                        /* Any ASID: only the range matters */
                        if (va_end <= scope->va ||
                            va_start >= (scope->va + scope->size))
                                continue;
                        break;

                case TLBI_ASID:
                        /* Global entries are not affected by ASID ops */
                        if (!vt->wr.nG || vt->wr.asid != scope->asid)
                                continue;
                        break;
                }

                invalidate_vncr(vt);
        }
}

/*
 * Decode the stage-1 TLBI instruction @inst with operand @val into @scope.
 *
 * @scope is only written for the instructions listed below; for any other
 * encoding it is left as provided by the caller. @vcpu is currently unused.
 *
 * For the by-VA (non-range) operations, the operand carries VA[55:12] in
 * bits [43:0], with the TTL and ASID fields above it. The VA is therefore
 * extracted from those bits only and sign-extended from bit 55, so that it
 * can be compared against the sign-extended VA cached in the VNCR
 * pseudo-TLB. When the TTL gives no size, a 1GB range is assumed.
 */
static void compute_s1_tlbi_range(struct kvm_vcpu *vcpu, u32 inst, u64 val,
                                  struct s1e2_tlbi_scope *scope)
{
        switch (inst) {
        case OP_TLBI_ALLE2:
        case OP_TLBI_ALLE2IS:
        case OP_TLBI_ALLE2OS:
        case OP_TLBI_VMALLE1:
        case OP_TLBI_VMALLE1IS:
        case OP_TLBI_VMALLE1OS:
        case OP_TLBI_ALLE2NXS:
        case OP_TLBI_ALLE2ISNXS:
        case OP_TLBI_ALLE2OSNXS:
        case OP_TLBI_VMALLE1NXS:
        case OP_TLBI_VMALLE1ISNXS:
        case OP_TLBI_VMALLE1OSNXS:
                scope->type = TLBI_ALL;
                break;
        case OP_TLBI_VAE2:
        case OP_TLBI_VAE2IS:
        case OP_TLBI_VAE2OS:
        case OP_TLBI_VAE1:
        case OP_TLBI_VAE1IS:
        case OP_TLBI_VAE1OS:
        case OP_TLBI_VAE2NXS:
        case OP_TLBI_VAE2ISNXS:
        case OP_TLBI_VAE2OSNXS:
        case OP_TLBI_VAE1NXS:
        case OP_TLBI_VAE1ISNXS:
        case OP_TLBI_VAE1OSNXS:
        case OP_TLBI_VALE2:
        case OP_TLBI_VALE2IS:
        case OP_TLBI_VALE2OS:
        case OP_TLBI_VALE1:
        case OP_TLBI_VALE1IS:
        case OP_TLBI_VALE1OS:
        case OP_TLBI_VALE2NXS:
        case OP_TLBI_VALE2ISNXS:
        case OP_TLBI_VALE2OSNXS:
        case OP_TLBI_VALE1NXS:
        case OP_TLBI_VALE1ISNXS:
        case OP_TLBI_VALE1OSNXS:
                scope->type = TLBI_VA;
                scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
                if (!scope->size)
                        scope->size = SZ_1G;
                /* Drop the TTL/ASID bits and sign-extend VA[55:12] */
                scope->va = sign_extend64((val & GENMASK_ULL(43, 0)) << 12, 55);
                scope->va &= ~(scope->size - 1);
                scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
                break;
        case OP_TLBI_ASIDE1:
        case OP_TLBI_ASIDE1IS:
        case OP_TLBI_ASIDE1OS:
        case OP_TLBI_ASIDE1NXS:
        case OP_TLBI_ASIDE1ISNXS:
        case OP_TLBI_ASIDE1OSNXS:
                scope->type = TLBI_ASID;
                scope->asid = FIELD_GET(TLBIR_ASID_MASK, val);
                break;
        case OP_TLBI_VAAE1:
        case OP_TLBI_VAAE1IS:
        case OP_TLBI_VAAE1OS:
        case OP_TLBI_VAAE1NXS:
        case OP_TLBI_VAAE1ISNXS:
        case OP_TLBI_VAAE1OSNXS:
        case OP_TLBI_VAALE1:
        case OP_TLBI_VAALE1IS:
        case OP_TLBI_VAALE1OS:
        case OP_TLBI_VAALE1NXS:
        case OP_TLBI_VAALE1ISNXS:
        case OP_TLBI_VAALE1OSNXS:
                scope->type = TLBI_VAA;
                scope->size = ttl_to_size(FIELD_GET(TLBI_TTL_MASK, val));
                if (!scope->size)
                        scope->size = SZ_1G;
                /* Drop the TTL/ASID bits and sign-extend VA[55:12] */
                scope->va = sign_extend64((val & GENMASK_ULL(43, 0)) << 12, 55);
                scope->va &= ~(scope->size - 1);
                break;
        case OP_TLBI_RVAE2:
        case OP_TLBI_RVAE2IS:
        case OP_TLBI_RVAE2OS:
        case OP_TLBI_RVAE1:
        case OP_TLBI_RVAE1IS:
        case OP_TLBI_RVAE1OS:
        case OP_TLBI_RVAE2NXS:
        case OP_TLBI_RVAE2ISNXS:
        case OP_TLBI_RVAE2OSNXS:
        case OP_TLBI_RVAE1NXS:
        case OP_TLBI_RVAE1ISNXS:
        case OP_TLBI_RVAE1OSNXS:
        case OP_TLBI_RVALE2:
        case OP_TLBI_RVALE2IS:
        case OP_TLBI_RVALE2OS:
        case OP_TLBI_RVALE1:
        case OP_TLBI_RVALE1IS:
        case OP_TLBI_RVALE1OS:
        case OP_TLBI_RVALE2NXS:
        case OP_TLBI_RVALE2ISNXS:
        case OP_TLBI_RVALE2OSNXS:
        case OP_TLBI_RVALE1NXS:
        case OP_TLBI_RVALE1ISNXS:
        case OP_TLBI_RVALE1OSNXS:
                /* Range operations: base, size and ASID come from the decoder */
                scope->type = TLBI_VA;
                scope->va = decode_range_tlbi(val, &scope->size, &scope->asid);
                break;
        case OP_TLBI_RVAAE1:
        case OP_TLBI_RVAAE1IS:
        case OP_TLBI_RVAAE1OS:
        case OP_TLBI_RVAAE1NXS:
        case OP_TLBI_RVAAE1ISNXS:
        case OP_TLBI_RVAAE1OSNXS:
        case OP_TLBI_RVAALE1:
        case OP_TLBI_RVAALE1IS:
        case OP_TLBI_RVAALE1OS:
        case OP_TLBI_RVAALE1NXS:
        case OP_TLBI_RVAALE1ISNXS:
        case OP_TLBI_RVAALE1OSNXS:
                /* Range operations for any ASID: no ASID is decoded */
                scope->type = TLBI_VAA;
                scope->va = decode_range_tlbi(val, &scope->size, NULL);
                break;
        }
}

/*
 * Apply the stage-1 TLBI instruction @inst (operand @val) to the VNCR
 * pseudo-TLBs of all vcpus in the VM.
 *
 * The scope starts out zeroed and is filled in by compute_s1_tlbi_range();
 * the invalidation then runs with kvm->mmu_lock held for write.
 */
void kvm_handle_s1e2_tlbi(struct kvm_vcpu *vcpu, u32 inst, u64 val)
{
        struct s1e2_tlbi_scope scope = {};

        compute_s1_tlbi_range(vcpu, inst, val, &scope);

        /* Held until the function returns */
        guard(write_lock)(&vcpu->kvm->mmu_lock);
        invalidate_vncr_va(vcpu->kvm, &scope);
}

/*
 * Write-protect the whole IPA space of every valid shadow stage-2 MMU, then
 * invalidate all VNCR pseudo-TLB entries within the VM's IPA space.
 *
 * Must be called with kvm->mmu_lock held for write.
 */
void kvm_nested_s2_wp(struct kvm *kvm)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        for (int idx = 0; idx < kvm->arch.nested_mmus_size; idx++) {
                struct kvm_s2_mmu *ctx = &kvm->arch.nested_mmus[idx];

                if (!kvm_s2_mmu_valid(ctx))
                        continue;

                kvm_stage2_wp_range(ctx, 0, kvm_phys_size(ctx));
        }

        kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
}

/*
 * Unmap the whole IPA space of every valid shadow stage-2 MMU, then
 * invalidate all VNCR pseudo-TLB entries within the VM's IPA space.
 * @may_block is passed through to kvm_stage2_unmap_range().
 *
 * Must be called with kvm->mmu_lock held for write.
 */
void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        for (int idx = 0; idx < kvm->arch.nested_mmus_size; idx++) {
                struct kvm_s2_mmu *ctx = &kvm->arch.nested_mmus[idx];

                if (!kvm_s2_mmu_valid(ctx))
                        continue;

                kvm_stage2_unmap_range(ctx, 0, kvm_phys_size(ctx), may_block);
        }

        kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
}

/*
 * Call kvm_stage2_flush_range() over the whole IPA space of every valid
 * shadow stage-2 MMU.
 *
 * Must be called with kvm->mmu_lock held for write.
 */
void kvm_nested_s2_flush(struct kvm *kvm)
{
        lockdep_assert_held_write(&kvm->mmu_lock);

        for (int idx = 0; idx < kvm->arch.nested_mmus_size; idx++) {
                struct kvm_s2_mmu *ctx = &kvm->arch.nested_mmus[idx];

                if (!kvm_s2_mmu_valid(ctx))
                        continue;

                kvm_stage2_flush_range(ctx, 0, kvm_phys_size(ctx));
        }
}

/*
 * Release all the stage-2 state of @kvm: free the page tables of every
 * shadow MMU, free the shadow MMU array itself, and uninitialise the VM's
 * own stage-2 MMU.
 *
 * A shadow MMU that is still referenced triggers a warning and its page
 * tables are not freed.
 */
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
        for (int idx = 0; idx < kvm->arch.nested_mmus_size; idx++) {
                struct kvm_s2_mmu *ctx = &kvm->arch.nested_mmus[idx];

                /* Nobody should be holding a reference at this point */
                if (WARN_ON(atomic_read(&ctx->refcnt)))
                        continue;

                kvm_free_stage2_pgd(ctx);
        }

        kvfree(kvm->arch.nested_mmus);
        kvm->arch.nested_mmus = NULL;
        kvm->arch.nested_mmus_size = 0;
        kvm_uninit_stage2_mmu(kvm);
}

/*
 * Dealing with VNCR_EL2 exposed by the *guest* is a complicated matter:
 *
 * - We introduce an internal representation of a vcpu-private TLB,
 *   representing the mapping between the guest VA contained in VNCR_EL2,
 *   the IPA the guest's EL2 PTs point to, and the actual PA this lives at.
 *
 * - On translation fault from a nested VNCR access, we create such a TLB.
 *   If there is no mapping to describe, the guest inherits the fault.
 *   Crucially, no actual mapping is done at this stage.
 *
 * - On vcpu_load() in a non-HYP context with HCR_EL2.NV==1, if the above
 *   TLB exists, we map it in the fixmap for this CPU, and run with it. We
 *   have to respect the permissions dictated by the guest, but not the
 *   memory type (FWB is a must).
 *
 * - Note that we usually don't do a vcpu_load() on the back of a fault
 *   (unless we are preempted), so the resolution of a translation fault
 *   must go via a request that will map the VNCR page in the fixmap.
 *   vcpu_load() might as well use the same mechanism.
 *
 * - On vcpu_put() in a non-HYP context with HCR_EL2.NV==1, if the TLB was
 *   mapped, we unmap it. Yes it is that simple. The TLB still exists
 *   though, and may be reused at a later load.
 *
 * - On permission fault, we simply forward the fault to the guest's EL2.
 *   Get out of my way.
 *
 * - On any TLBI for the EL2&0 translation regime, we must find any TLB that
 *   intersects with the TLBI request, invalidate it, and unmap the page
 *   from the fixmap. Because we need to look at all the vcpu-private TLBs,
 *   this requires some wide-ranging locking to ensure that nothing races
 *   against it. This may require some refcounting to avoid the search when
 *   no such TLB is present.
 *
 * - On MMU notifiers, we must invalidate our TLB in a similar way, but
 *   looking at the IPA instead. The funny part is that there may not be a
 *   stage-2 mapping for this page if L1 hasn't accessed it using LD/ST
 *   instructions.
 */

int kvm_vcpu_allocate_vncr_tlb(struct kvm_vcpu *vcpu)
{
        if (!kvm_has_feat(vcpu->kvm, ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY))
                return 0;

        vcpu->arch.vncr_tlb = kzalloc(sizeof(*vcpu->arch.vncr_tlb),
                                      GFP_KERNEL_ACCOUNT);
        if (!vcpu->arch.vncr_tlb)
                return -ENOMEM;

        return 0;
}

/*
 * Return the guest's VNCR_EL2 value as a virtual address, sign-extended
 * from bit 48.
 */
static u64 read_vncr_el2(struct kvm_vcpu *vcpu)
{
        return (u64)sign_extend64(__vcpu_sys_reg(vcpu, VNCR_EL2), 48);
}

/*
 * Populate the vcpu's VNCR pseudo-TLB by walking the guest's EL2&0 stage-1
 * page tables for the address held in VNCR_EL2, then faulting in the page
 * backing the resulting guest physical address.
 *
 * On success the entry is marked valid, not bound to any CPU (cpu == -1),
 * and a KVM_REQ_MAP_L1_VNCR_EL2 request is raised so that the page gets
 * mapped later on.
 *
 * Returns 0 on success, the error from __kvm_translate_va() if the walk
 * failed, -EFAULT if the page could not be faulted in with the required
 * access, or -EAGAIN if an MMU invalidation raced with the fault-in.
 */
static int kvm_translate_vncr(struct kvm_vcpu *vcpu)
{
        bool write_fault, writable;
        unsigned long mmu_seq;
        struct vncr_tlb *vt;
        struct page *page;
        u64 va, pfn, gfn;
        int ret;

        vt = vcpu->arch.vncr_tlb;

        /*
         * If we're about to walk the EL2 S1 PTs, we must invalidate the
         * current TLB, as it could be sampled from another vcpu doing a
         * TLBI *IS. A real CPU wouldn't do that, but we only keep a single
         * translation, so not much of a choice.
         *
         * We also prepare the next walk whilst we're at it.
         */
        scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
                invalidate_vncr(vt);

                vt->wi = (struct s1_walk_info) {
                        .regime        = TR_EL20,
                        .as_el0        = false,
                        .pan        = false,
                };
                vt->wr = (struct s1_walk_result){};
        }

        /* SRCU read side held until the function returns */
        guard(srcu)(&vcpu->kvm->srcu);

        va =  read_vncr_el2(vcpu);

        ret = __kvm_translate_va(vcpu, &vt->wi, &vt->wr, va);
        if (ret)
                return ret;

        write_fault = kvm_is_write_fault(vcpu);

        /*
         * Snapshot the invalidation sequence before faulting the page in,
         * so that a concurrent invalidation is caught by the retry check
         * below.
         */
        mmu_seq = vcpu->kvm->mmu_invalidate_seq;
        smp_rmb();

        gfn = vt->wr.pa >> PAGE_SHIFT;
        pfn = kvm_faultin_pfn(vcpu, gfn, write_fault, &writable, &page);
        if (is_error_noslot_pfn(pfn) || (write_fault && !writable))
                return -EFAULT;

        scoped_guard(write_lock, &vcpu->kvm->mmu_lock) {
                /*
                 * NOTE(review): this early return does not call
                 * kvm_release_faultin_page() for the page obtained above,
                 * unlike the success path -- confirm whether a release is
                 * needed here.
                 */
                if (mmu_invalidate_retry(vcpu->kvm, mmu_seq))
                        return -EAGAIN;

                vt->gva = va;
                vt->hpa = pfn << PAGE_SHIFT;
                vt->valid = true;
                vt->cpu = -1;

                kvm_make_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu);
                kvm_release_faultin_page(vcpu->kvm, page, false, vt->wr.pw);
        }

        /* The guest may write to the page through the mapping */
        if (vt->wr.pw)
                mark_page_dirty(vcpu->kvm, gfn);

        return 0;
}

/*
 * Inject a permission fault into the guest hypervisor for a VNCR access,
 * reusing the current ESR but with the fault status code replaced by a
 * permission fault at the level recorded in the pseudo-TLB walk result.
 */
static void inject_vncr_perm(struct kvm_vcpu *vcpu)
{
        struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
        u64 esr = kvm_vcpu_get_esr(vcpu);

        /* Adjust the fault level to reflect that of the guest's */
        esr &= ~ESR_ELx_FSC;
        esr |= FIELD_PREP(ESR_ELx_FSC,
                          ESR_ELx_FSC_PERM_L(vt->wr.level));

        kvm_inject_nested_sync(vcpu, esr);
}

/*
 * Check whether the vcpu's VNCR pseudo-TLB entry describes the current
 * VNCR_EL2 translation.
 *
 * Returns true when the entry is valid, its cached guest VA equals the
 * current VNCR_EL2 address and, for non-global (nG) entries, the current
 * ASID equals the one recorded in the walk result. These are the same
 * matching rules kvm_map_l1_vncr() applies before mapping the page.
 *
 * Must be called with kvm->mmu_lock held for read.
 */
static bool kvm_vncr_tlb_lookup(struct kvm_vcpu *vcpu)
{
        struct vncr_tlb *vt = vcpu->arch.vncr_tlb;

        lockdep_assert_held_read(&vcpu->kvm->mmu_lock);

        if (!vt->valid)
                return false;

        if (read_vncr_el2(vcpu) != vt->gva)
                return false;

        if (vt->wr.nG) {
                u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
                /* TCR_EL2.A1 selects which TTBR provides the ASID */
                u64 ttbr = ((tcr & TCR_A1) ?
                            vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
                            vcpu_read_sys_reg(vcpu, TTBR0_EL2));
                u16 asid;

                asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
                /* Only 8 bits of ASID unless 16bit ASIDs are in use */
                if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
                    !(tcr & TCR_ASID16))
                        asid &= GENMASK(7, 0);

                /* A hit requires the ASIDs to be equal, not different */
                return asid == vt->wr.asid;
        }

        return true;
}

/*
 * Handle an abort taken on a VNCR-redirected access.
 *
 * - Permission faults are forwarded to the guest hypervisor.
 * - Translation faults populate the pseudo-TLB via kvm_translate_vncr()
 *   when no matching entry exists. If a matching entry does exist, the
 *   fault is turned into a permission fault for the guest.
 * - Anything else triggers a one-off warning.
 *
 * Always returns 1.
 */
int kvm_handle_vncr_abort(struct kvm_vcpu *vcpu)
{
        struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
        u64 esr = kvm_vcpu_get_esr(vcpu);

        /*
         * Check the VNCR syndrome bit itself. ESR_ELx_VNCR_SHIFT is the
         * bit position, not a mask, so it must be turned into one first.
         */
        BUG_ON(!(esr & BIT(ESR_ELx_VNCR_SHIFT)));

        if (esr_fsc_is_permission_fault(esr)) {
                inject_vncr_perm(vcpu);
        } else if (esr_fsc_is_translation_fault(esr)) {
                bool valid;
                int ret;

                scoped_guard(read_lock, &vcpu->kvm->mmu_lock)
                        valid = kvm_vncr_tlb_lookup(vcpu);

                if (!valid)
                        ret = kvm_translate_vncr(vcpu);
                else
                        ret = -EPERM;

                switch (ret) {
                case -EAGAIN:
                case -ENOMEM:
                        /* Let's try again... */
                        break;
                case -EFAULT:
                case -EINVAL:
                case -ENOENT:
                case -EACCES:
                        /*
                         * Translation failed, inject the corresponding
                         * exception back to EL2.
                         */
                        BUG_ON(!vt->wr.failed);

                        /* Report the fault status recorded by the walk */
                        esr &= ~ESR_ELx_FSC;
                        esr |= FIELD_PREP(ESR_ELx_FSC, vt->wr.fst);

                        kvm_inject_nested_sync(vcpu, esr);
                        break;
                case -EPERM:
                        /* Hack to deal with POE until we get kernel support */
                        inject_vncr_perm(vcpu);
                        break;
                case 0:
                        break;
                }
        } else {
                WARN_ONCE(1, "Unhandled VNCR abort, ESR=%llx\n", esr);
        }

        return 1;
}

/*
 * Map the page described by the vcpu's VNCR pseudo-TLB into the current
 * CPU's VNCR fixmap slot, provided the entry still matches the vcpu's
 * state (not in hyp context, entry valid, same VNCR_EL2 address and, for
 * nG entries, same ASID).
 *
 * Runs with preemption disabled and kvm->mmu_lock held for read, both for
 * the duration of the function. Entries that are not readable are never
 * mapped.
 */
static void kvm_map_l1_vncr(struct kvm_vcpu *vcpu)
{
        struct vncr_tlb *vt = vcpu->arch.vncr_tlb;
        pgprot_t prot;

        guard(preempt)();
        guard(read_lock)(&vcpu->kvm->mmu_lock);

        /*
         * The request to map VNCR may have raced against some other
         * event, such as an interrupt, and may not be valid anymore.
         */
        if (is_hyp_ctxt(vcpu))
                return;

        /*
         * Check that the pseudo-TLB is valid and that VNCR_EL2 still
         * contains the expected value. If it doesn't, we simply bail out
         * without a mapping -- a transformed MSR/MRS will generate the
         * fault and allows us to populate the pseudo-TLB.
         */
        if (!vt->valid)
                return;

        if (read_vncr_el2(vcpu) != vt->gva)
                return;

        if (vt->wr.nG) {
                u64 tcr = vcpu_read_sys_reg(vcpu, TCR_EL2);
                /* TCR_EL2.A1 selects which TTBR provides the ASID */
                u64 ttbr = ((tcr & TCR_A1) ?
                            vcpu_read_sys_reg(vcpu, TTBR1_EL2) :
                            vcpu_read_sys_reg(vcpu, TTBR0_EL2));
                u16 asid;

                asid = FIELD_GET(TTBR_ASID_MASK, ttbr);
                /* Only 8 bits of ASID unless 16bit ASIDs are in use */
                if (!kvm_has_feat_enum(vcpu->kvm, ID_AA64MMFR0_EL1, ASIDBITS, 16) ||
                    !(tcr & TCR_ASID16))
                        asid &= GENMASK(7, 0);

                /* Non-global entry belonging to another ASID: no mapping */
                if (asid != vt->wr.asid)
                        return;
        }

        /* Bind the entry to this CPU, even if nothing ends up being mapped */
        vt->cpu = smp_processor_id();

        /* Derive the kernel mapping permissions from the guest's */
        if (vt->wr.pw && vt->wr.pr)
                prot = PAGE_KERNEL;
        else if (vt->wr.pr)
                prot = PAGE_KERNEL_RO;
        else
                prot = PAGE_NONE;

        /*
         * We can't map write-only (or no permission at all) in the kernel,
         * but the guest can do it if using POE, so we'll have to turn a
         * translation fault into a permission fault at runtime.
         * FIXME: WO doesn't work at all, need POE support in the kernel.
         */
        if (pgprot_val(prot) != pgprot_val(PAGE_NONE)) {
                __set_fixmap(vncr_fixmap(vt->cpu), vt->hpa, prot);
                host_data_set_flag(L1_VNCR_MAPPED);
                atomic_inc(&vcpu->kvm->arch.vncr_map_count);
        }
}

/*
 * Our emulated CPU doesn't support all the possible features. For the
 * sake of simplicity (and probably mental sanity), wipe out a number
 * of feature bits we don't intend to support for the time being.
 * This list should get updated as new features get added to the NV
 * support, and new extension to the architecture.
 *
 * @kvm: the VM, consulted for the KVM_ARM_VCPU_HAS_EL2_E2H0 vcpu feature
 * @reg: encoding of the ID register being limited (SYS_ID_*)
 * @val: candidate value for that register
 *
 * Returns the limited value. Registers not listed below are returned
 * unchanged.
 */
u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
{
        switch (reg) {
        case SYS_ID_AA64ISAR0_EL1:
                /* Support everything but TME */
                val &= ~ID_AA64ISAR0_EL1_TME;
                break;

        case SYS_ID_AA64ISAR1_EL1:
                /* Support everything but LS64 and Spec Invalidation */
                val &= ~(ID_AA64ISAR1_EL1_LS64        |
                         ID_AA64ISAR1_EL1_SPECRES);
                break;

        case SYS_ID_AA64PFR0_EL1:
                /* No RME, AMU, MPAM, S-EL2, or RAS */
                /* The EL0-EL3 fields are cleared too, and rebuilt below */
                val &= ~(ID_AA64PFR0_EL1_RME        |
                         ID_AA64PFR0_EL1_AMU        |
                         ID_AA64PFR0_EL1_MPAM        |
                         ID_AA64PFR0_EL1_SEL2        |
                         ID_AA64PFR0_EL1_RAS        |
                         ID_AA64PFR0_EL1_EL3        |
                         ID_AA64PFR0_EL1_EL2        |
                         ID_AA64PFR0_EL1_EL1        |
                         ID_AA64PFR0_EL1_EL0);
                /* 64bit only at any EL */
                val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL0, IMP);
                val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL1, IMP);
                val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL2, IMP);
                val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, EL3, IMP);
                break;

        case SYS_ID_AA64PFR1_EL1:
                /* Only support BTI, SSBS, CSV2_frac */
                val &= (ID_AA64PFR1_EL1_BT        |
                        ID_AA64PFR1_EL1_SSBS        |
                        ID_AA64PFR1_EL1_CSV2_frac);
                break;

        case SYS_ID_AA64MMFR0_EL1:
                /* Hide ExS, Secure Memory */
                /* The TGRANx_2 fields are cleared too, and rebuilt below */
                val &= ~(ID_AA64MMFR0_EL1_EXS                |
                         ID_AA64MMFR0_EL1_TGRAN4_2        |
                         ID_AA64MMFR0_EL1_TGRAN16_2        |
                         ID_AA64MMFR0_EL1_TGRAN64_2        |
                         ID_AA64MMFR0_EL1_SNSMEM);

                /* Hide CNTPOFF if present */
                val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, ECV, IMP);

                /* Disallow unsupported S2 page sizes */
                switch (PAGE_SIZE) {
                case SZ_64K:
                        val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, NI);
                        fallthrough;
                case SZ_16K:
                        val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, NI);
                        fallthrough;
                case SZ_4K:
                        /* Support everything */
                        break;
                }

                /*
                 * Since we can't support a guest S2 page size smaller
                 * than the host's own page size (due to KVM only
                 * populating its own S2 using the kernel's page
                 * size), advertise the limitation using FEAT_GTG.
                 */
                switch (PAGE_SIZE) {
                case SZ_4K:
                        val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN4_2, IMP);
                        fallthrough;
                case SZ_16K:
                        val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN16_2, IMP);
                        fallthrough;
                case SZ_64K:
                        val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR0_EL1, TGRAN64_2, IMP);
                        break;
                }

                /* Cap PARange to 48bits */
                val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR0_EL1, PARANGE, 48);
                break;

        case SYS_ID_AA64MMFR1_EL1:
                /* Only keep HCX, PAN, LO, HPDS, VH and VMIDBits */
                val &= (ID_AA64MMFR1_EL1_HCX        |
                        ID_AA64MMFR1_EL1_PAN        |
                        ID_AA64MMFR1_EL1_LO        |
                        ID_AA64MMFR1_EL1_HPDS        |
                        ID_AA64MMFR1_EL1_VH        |
                        ID_AA64MMFR1_EL1_VMIDBits);
                /* FEAT_E2H0 implies no VHE */
                if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
                        val &= ~ID_AA64MMFR1_EL1_VH;
                break;

        case SYS_ID_AA64MMFR2_EL1:
                /*
                 * Hide BBM, ST, CCIDX, VARange and the field at bits
                 * [47:44]; TTL is cleared here and forced below.
                 */
                val &= ~(ID_AA64MMFR2_EL1_BBM        |
                         ID_AA64MMFR2_EL1_TTL        |
                         GENMASK_ULL(47, 44)        |
                         ID_AA64MMFR2_EL1_ST        |
                         ID_AA64MMFR2_EL1_CCIDX        |
                         ID_AA64MMFR2_EL1_VARange);

                /* Force TTL support */
                val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR2_EL1, TTL, IMP);
                break;

        case SYS_ID_AA64MMFR4_EL1:
                /*
                 * You get EITHER
                 *
                 * - FEAT_VHE without FEAT_E2H0
                 * - FEAT_NV limited to FEAT_NV2
                 * - HCR_EL2.NV1 being RES0
                 *
                 * OR
                 *
                 * - FEAT_E2H0 without FEAT_VHE nor FEAT_NV
                 *
                 * Life is too short for anything else.
                 */
                /* The incoming value is discarded in both cases */
                if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features)) {
                        val = 0;
                } else {
                        val = SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, NV_frac, NV2_ONLY);
                        val |= SYS_FIELD_PREP_ENUM(ID_AA64MMFR4_EL1, E2H0, NI_NV1);
                }
                break;

        case SYS_ID_AA64DFR0_EL1:
                /* Only limited support for PMU, Debug, BPs, WPs, and HPMN0 */
                val &= (ID_AA64DFR0_EL1_PMUVer        |
                        ID_AA64DFR0_EL1_WRPs        |
                        ID_AA64DFR0_EL1_BRPs        |
                        ID_AA64DFR0_EL1_DebugVer|
                        ID_AA64DFR0_EL1_HPMN0);

                /* Cap Debug to ARMv8.1 */
                val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1, DebugVer, VHE);
                break;
        }

        return val;
}

/*
 * Sanitise the value @v of system register @sr using the VM's RES0/RES1
 * masks: RES0 bits are cleared and RES1 bits are set.
 *
 * @v is returned untouched if the VM has no masks allocated.
 */
u64 kvm_vcpu_apply_reg_masks(const struct kvm_vcpu *vcpu,
                             enum vcpu_sysreg sr, u64 v)
{
        struct kvm_sysreg_masks *masks = vcpu->kvm->arch.sysreg_masks;

        if (!masks)
                return v;

        /* The mask array is indexed from the first sanitised register */
        sr -= __SANITISED_REG_START__;

        return (v & ~masks->mask[sr].res0) | masks->mask[sr].res1;
}

/*
 * Record the RES0/RES1 masks of sanitised register @sr in the VM's mask
 * table. @sr must be a compile-time constant inside the sanitised register
 * range; both conditions are enforced at build time.
 */
static __always_inline void set_sysreg_masks(struct kvm *kvm, int sr, u64 res0, u64 res1)
{
        struct kvm_sysreg_masks *table = kvm->arch.sysreg_masks;
        int slot = sr - __SANITISED_REG_START__;

        BUILD_BUG_ON(!__builtin_constant_p(sr));
        BUILD_BUG_ON(sr < __SANITISED_REG_START__);
        BUILD_BUG_ON(sr >= NR_SYS_REGS);

        table->mask[slot].res0 = res0;
        table->mask[slot].res1 = res1;
}

/*
 * Populate the per-VM table of RES0/RES1 masks for the sanitised system
 * registers, then touch every sanitised register of @vcpu.
 *
 * The table is allocated and filled only once per VM: if it already exists
 * the function skips straight to the per-vcpu loop at "out". For each
 * register, bits are added to res0 when the feature that controls them is
 * not exposed to the guest (as reported by kvm_has_feat() and friends).
 *
 * Must be called with kvm->arch.config_lock held.
 *
 * Returns 0 on success, or -ENOMEM if the mask table cannot be allocated.
 */
int kvm_init_nv_sysregs(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        u64 res0, res1;

        lockdep_assert_held(&kvm->arch.config_lock);

        /* Masks are per-VM: nothing to compute if a previous call did it. */
        if (kvm->arch.sysreg_masks)
                goto out;

        kvm->arch.sysreg_masks = kzalloc(sizeof(*(kvm->arch.sysreg_masks)),
                                         GFP_KERNEL_ACCOUNT);
        if (!kvm->arch.sysreg_masks)
                return -ENOMEM;

        /* VTTBR_EL2 */
        res0 = res1 = 0;
        if (!kvm_has_feat_enum(kvm, ID_AA64MMFR1_EL1, VMIDBits, 16))
                res0 |= GENMASK(63, 56);
        if (!kvm_has_feat(kvm, ID_AA64MMFR2_EL1, CnP, IMP))
                res0 |= VTTBR_CNP_BIT;
        set_sysreg_masks(kvm, VTTBR_EL2, res0, res1);

        /* VTCR_EL2 */
        res0 = GENMASK(63, 32) | GENMASK(30, 20);
        res1 = BIT(31);
        set_sysreg_masks(kvm, VTCR_EL2, res0, res1);

        /* VMPIDR_EL2 */
        res0 = GENMASK(63, 40) | GENMASK(30, 24);
        res1 = BIT(31);
        set_sysreg_masks(kvm, VMPIDR_EL2, res0, res1);

        /*
         * For the registers below, both masks are obtained from
         * get_reg_fixed_bits() rather than being open-coded here.
         */

        /* HCR_EL2 */
        get_reg_fixed_bits(kvm, HCR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HCR_EL2, res0, res1);

        /* HCRX_EL2 */
        get_reg_fixed_bits(kvm, HCRX_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HCRX_EL2, res0, res1);

        /* HFG[RW]TR_EL2 */
        get_reg_fixed_bits(kvm, HFGRTR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HFGRTR_EL2, res0, res1);
        get_reg_fixed_bits(kvm, HFGWTR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HFGWTR_EL2, res0, res1);

        /* HDFG[RW]TR_EL2 */
        get_reg_fixed_bits(kvm, HDFGRTR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HDFGRTR_EL2, res0, res1);
        get_reg_fixed_bits(kvm, HDFGWTR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HDFGWTR_EL2, res0, res1);

        /* HFGITR_EL2 */
        get_reg_fixed_bits(kvm, HFGITR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HFGITR_EL2, res0, res1);

        /* HAFGRTR_EL2 - not a lot to see here */
        get_reg_fixed_bits(kvm, HAFGRTR_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HAFGRTR_EL2, res0, res1);

        /* HFG[RW]TR2_EL2 */
        get_reg_fixed_bits(kvm, HFGRTR2_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HFGRTR2_EL2, res0, res1);
        get_reg_fixed_bits(kvm, HFGWTR2_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HFGWTR2_EL2, res0, res1);

        /* HDFG[RW]TR2_EL2 */
        get_reg_fixed_bits(kvm, HDFGRTR2_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HDFGRTR2_EL2, res0, res1);
        get_reg_fixed_bits(kvm, HDFGWTR2_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HDFGWTR2_EL2, res0, res1);

        /* HFGITR2_EL2 */
        get_reg_fixed_bits(kvm, HFGITR2_EL2, &res0, &res1);
        set_sysreg_masks(kvm, HFGITR2_EL2, res0, res1);

        /*
         * From here on, each register starts from its architectural
         * RES0/RES1 constants and res0 grows for every absent feature.
         */

        /* TCR2_EL2 */
        res0 = TCR2_EL2_RES0;
        res1 = TCR2_EL2_RES1;
        if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, D128, IMP))
                res0 |= (TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1 | TCR2_EL2_D128);
        if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, MEC, IMP))
                res0 |= TCR2_EL2_AMEC1 | TCR2_EL2_AMEC0;
        if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, HAFDBS, HAFT))
                res0 |= TCR2_EL2_HAFT;
        if (!kvm_has_feat(kvm, ID_AA64PFR1_EL1, THE, IMP))
                res0 |= TCR2_EL2_PTTWI | TCR2_EL2_PnCH;
        if (!kvm_has_feat(kvm, ID_AA64MMFR3_EL1, AIE, IMP))
                res0 |= TCR2_EL2_AIE;
        if (!kvm_has_s1poe(kvm))
                res0 |= TCR2_EL2_POE | TCR2_EL2_E0POE;
        if (!kvm_has_s1pie(kvm))
                res0 |= TCR2_EL2_PIE;
        /* Without VHE, these fields are additionally treated as RES0. */
        if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
                res0 |= (TCR2_EL2_E0POE | TCR2_EL2_D128 |
                         TCR2_EL2_AMEC1 | TCR2_EL2_DisCH0 | TCR2_EL2_DisCH1);
        set_sysreg_masks(kvm, TCR2_EL2, res0, res1);

        /* SCTLR_EL1 */
        res0 = SCTLR_EL1_RES0;
        res1 = SCTLR_EL1_RES1;
        if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, PAN, PAN3))
                res0 |= SCTLR_EL1_EPAN;
        set_sysreg_masks(kvm, SCTLR_EL1, res0, res1);

        /* MDCR_EL2 */
        res0 = MDCR_EL2_RES0;
        res1 = MDCR_EL2_RES1;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, IMP))
                res0 |= (MDCR_EL2_HPMN | MDCR_EL2_TPMCR |
                         MDCR_EL2_TPM | MDCR_EL2_HPME);
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, IMP))
                res0 |= MDCR_EL2_E2PB | MDCR_EL2_TPMS;
        if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, SPMU, IMP))
                res0 |= MDCR_EL2_EnSPM;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P1))
                res0 |= MDCR_EL2_HPMD;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceFilt, IMP))
                res0 |= MDCR_EL2_TTRF;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P5))
                res0 |= MDCR_EL2_HCCD | MDCR_EL2_HLP;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, TraceBuffer, IMP))
                res0 |= MDCR_EL2_E2TB;
        if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, FGT, IMP))
                res0 |= MDCR_EL2_TDCC;
        /* MTPME is RES0 if MTPMU is absent, or if EL3 is implemented. */
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, MTPMU, IMP) ||
            kvm_has_feat(kvm, ID_AA64PFR0_EL1, EL3, IMP))
                res0 |= MDCR_EL2_MTPME;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMUVer, V3P7))
                res0 |= MDCR_EL2_HPMFZO;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSS, IMP))
                res0 |= MDCR_EL2_PMSSE;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, PMSVer, V1P2))
                res0 |= MDCR_EL2_HPMFZS;
        if (!kvm_has_feat(kvm, ID_AA64DFR1_EL1, EBEP, IMP))
                res0 |= MDCR_EL2_PMEE;
        if (!kvm_has_feat(kvm, ID_AA64DFR0_EL1, DebugVer, V8P9))
                res0 |= MDCR_EL2_EBWE;
        if (!kvm_has_feat(kvm, ID_AA64DFR2_EL1, STEP, IMP))
                res0 |= MDCR_EL2_EnSTEPOP;
        set_sysreg_masks(kvm, MDCR_EL2, res0, res1);

        /* CNTHCTL_EL2 */
        res0 = GENMASK(63, 20);
        res1 = 0;
        if (!kvm_has_feat(kvm, ID_AA64PFR0_EL1, RME, IMP))
                res0 |= CNTHCTL_CNTPMASK | CNTHCTL_CNTVMASK;
        /* The ECV checks nest: the inner one only runs when CNTPOFF is absent. */
        if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, CNTPOFF)) {
                res0 |= CNTHCTL_ECV;
                if (!kvm_has_feat(kvm, ID_AA64MMFR0_EL1, ECV, IMP))
                        res0 |= (CNTHCTL_EL1TVT | CNTHCTL_EL1TVCT |
                                 CNTHCTL_EL1NVPCT | CNTHCTL_EL1NVVCT);
        }
        if (!kvm_has_feat(kvm, ID_AA64MMFR1_EL1, VH, IMP))
                res0 |= GENMASK(11, 8);
        set_sysreg_masks(kvm, CNTHCTL_EL2, res0, res1);

        /* ICH_HCR_EL2 */
        res0 = ICH_HCR_EL2_RES0;
        res1 = ICH_HCR_EL2_RES1;
        /* TDIR depends on what the host GIC reports in ICH_VTR_EL2.TDS. */
        if (!(kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_TDS))
                res0 |= ICH_HCR_EL2_TDIR;
        /* No GICv4 is presented to the guest */
        res0 |= ICH_HCR_EL2_DVIM | ICH_HCR_EL2_vSGIEOICount;
        set_sysreg_masks(kvm, ICH_HCR_EL2, res0, res1);

        /* VNCR_EL2 */
        set_sysreg_masks(kvm, VNCR_EL2, VNCR_EL2_RES0, VNCR_EL2_RES1);

out:
        /*
         * Evaluate every sanitised register of this vcpu and discard the
         * result. NOTE(review): presumably __vcpu_sys_reg() applies the
         * masks to the stored value as a side effect - confirm against
         * its definition.
         */
        for (enum vcpu_sysreg sr = __SANITISED_REG_START__; sr < NR_SYS_REGS; sr++)
                (void)__vcpu_sys_reg(vcpu, sr);

        return 0;
}

void check_nested_vcpu_requests(struct kvm_vcpu *vcpu)
{
        if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) {
                struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu;

                write_lock(&vcpu->kvm->mmu_lock);
                if (mmu->pending_unmap) {
                        kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true);
                        mmu->pending_unmap = false;
                }
                write_unlock(&vcpu->kvm->mmu_lock);
        }

        if (kvm_check_request(KVM_REQ_MAP_L1_VNCR_EL2, vcpu))
                kvm_map_l1_vncr(vcpu);

        /* Must be last, as may switch context! */
        if (kvm_check_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu))
                kvm_inject_nested_irq(vcpu);
}
























































































  818 









































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2016 ARM Ltd.
 */
#ifndef __ASM_PGTABLE_PROT_H
#define __ASM_PGTABLE_PROT_H

#include <asm/memory.h>
#include <asm/pgtable-hwdef.h>

#include <linux/const.h>

/*
 * Software defined PTE bits definition.
 */
#define PTE_WRITE                (PTE_DBM)                 /* same as DBM (51) */
#define PTE_SWP_EXCLUSIVE        (_AT(pteval_t, 1) << 2)         /* only for swp ptes */
#define PTE_DIRTY                (_AT(pteval_t, 1) << 55)
#define PTE_SPECIAL                (_AT(pteval_t, 1) << 56)
#define PTE_DEVMAP                (_AT(pteval_t, 1) << 57)

/*
 * PTE_PRESENT_INVALID=1 & PTE_VALID=0 indicates that the pte's fields should be
 * interpreted according to the HW layout by SW but any attempted HW access to
 * the address will result in a fault. pte_present() returns true.
 */
#define PTE_PRESENT_INVALID        (PTE_NG)                 /* only when !PTE_VALID */

#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_WP
#define PTE_UFFD_WP                (_AT(pteval_t, 1) << 58) /* uffd-wp tracking */
#define PTE_SWP_UFFD_WP                (_AT(pteval_t, 1) << 3)         /* only for swp ptes */
#else
#define PTE_UFFD_WP                (_AT(pteval_t, 0))
#define PTE_SWP_UFFD_WP                (_AT(pteval_t, 0))
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */

#define _PROT_DEFAULT                (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED)

#define PROT_DEFAULT                (PTE_TYPE_PAGE | PTE_MAYBE_NG | PTE_MAYBE_SHARED | PTE_AF)
#define PROT_SECT_DEFAULT        (PMD_TYPE_SECT | PMD_MAYBE_NG | PMD_MAYBE_SHARED | PMD_SECT_AF)

#define PROT_DEVICE_nGnRnE        (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE))
#define PROT_DEVICE_nGnRE        (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE))
#define PROT_NORMAL_NC                (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_NC))
#define PROT_NORMAL                (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL))
#define PROT_NORMAL_TAGGED        (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_WRITE | PTE_ATTRINDX(MT_NORMAL_TAGGED))

#define PROT_SECT_DEVICE_nGnRE        (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_DEVICE_nGnRE))
#define PROT_SECT_NORMAL        (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PTE_WRITE | PMD_ATTRINDX(MT_NORMAL))
#define PROT_SECT_NORMAL_EXEC        (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL))

#define _PAGE_DEFAULT                (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL))

#define _PAGE_KERNEL                (PROT_NORMAL)
#define _PAGE_KERNEL_RO                ((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY)
#define _PAGE_KERNEL_ROX        ((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY)
#define _PAGE_KERNEL_EXEC        (PROT_NORMAL & ~PTE_PXN)
#define _PAGE_KERNEL_EXEC_CONT        ((PROT_NORMAL & ~PTE_PXN) | PTE_CONT)

#define _PAGE_SHARED                (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE)
#define _PAGE_SHARED_EXEC        (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_WRITE)
#define _PAGE_READONLY                (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
#define _PAGE_READONLY_EXEC        (_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN)
#define _PAGE_EXECONLY                (_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN)

#ifndef __ASSEMBLY__

#include <asm/cpufeature.h>
#include <asm/pgtable-types.h>
#include <asm/rsi.h>

extern bool arm64_use_ng_mappings;
extern unsigned long prot_ns_shared;

#define PROT_NS_SHARED                (is_realm_world() ? prot_ns_shared : 0)

#define PTE_MAYBE_NG                (arm64_use_ng_mappings ? PTE_NG : 0)
#define PMD_MAYBE_NG                (arm64_use_ng_mappings ? PMD_SECT_NG : 0)

#ifndef CONFIG_ARM64_LPA2
#define lpa2_is_enabled()        false
#define PTE_MAYBE_SHARED        PTE_SHARED
#define PMD_MAYBE_SHARED        PMD_SECT_S
#define PHYS_MASK_SHIFT                (CONFIG_ARM64_PA_BITS)
#else
/* Report whether the DS bit is set in the value returned by read_tcr(). */
static inline bool __pure lpa2_is_enabled(void)
{
        return (read_tcr() & TCR_DS) != 0;
}

#define PTE_MAYBE_SHARED        (lpa2_is_enabled() ? 0 : PTE_SHARED)
#define PMD_MAYBE_SHARED        (lpa2_is_enabled() ? 0 : PMD_SECT_S)
#define PHYS_MASK_SHIFT                (lpa2_is_enabled() ? CONFIG_ARM64_PA_BITS : 48)
#endif

/*
 * Highest possible physical address supported.
 */
#define PHYS_MASK                ((UL(1) << PHYS_MASK_SHIFT) - 1)

/*
 * If we have userspace only BTI we don't want to mark kernel pages
 * guarded even if the system does support BTI.
 */
#define PTE_MAYBE_GP                (system_supports_bti_kernel() ? PTE_GP : 0)

#define PAGE_KERNEL                __pgprot(_PAGE_KERNEL)
#define PAGE_KERNEL_RO                __pgprot(_PAGE_KERNEL_RO)
#define PAGE_KERNEL_ROX                __pgprot(_PAGE_KERNEL_ROX)
#define PAGE_KERNEL_EXEC        __pgprot(_PAGE_KERNEL_EXEC)
#define PAGE_KERNEL_EXEC_CONT        __pgprot(_PAGE_KERNEL_EXEC_CONT)

/*
 * Evaluate to the stage-2 memory attribute field for @attr: the
 * MT_S2_FWB_<attr> encoding when @has_fwb is true, the MT_S2_<attr>
 * encoding otherwise. @attr is token-pasted, so it must be a bare
 * attribute name suffix, not an expression.
 */
#define PAGE_S2_MEMATTR(attr, has_fwb)                                        \
        ({                                                                \
                u64 __val;                                                \
                if (has_fwb)                                                \
                        __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr);        \
                else                                                        \
                        __val = PTE_S2_MEMATTR(MT_S2_ ## attr);                \
                __val;                                                        \
         })

#define PAGE_NONE                __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PRESENT_INVALID | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN)
/* shared+writable pages are clean by default, hence PTE_RDONLY|PTE_WRITE */
#define PAGE_SHARED                __pgprot(_PAGE_SHARED)
#define PAGE_SHARED_EXEC        __pgprot(_PAGE_SHARED_EXEC)
#define PAGE_READONLY                __pgprot(_PAGE_READONLY)
#define PAGE_READONLY_EXEC        __pgprot(_PAGE_READONLY_EXEC)
#define PAGE_EXECONLY                __pgprot(_PAGE_EXECONLY)

#endif /* __ASSEMBLY__ */

/*
 * Build the 4-bit permission-indirection index of a PTE value: bit n of the
 * result is the PTE bit found at position PTE_PI_IDX_n.
 *
 * The argument is parenthesised at every use so that compound expressions
 * (e.g. "a | b") are masked as a whole rather than being split by operator
 * precedence. The macro stays a pure constant expression; note that @pte is
 * evaluated four times, so it must not have side effects.
 */
#define pte_pi_index(pte) ( \
        (((pte) & BIT(PTE_PI_IDX_3)) >> (PTE_PI_IDX_3 - 3)) | \
        (((pte) & BIT(PTE_PI_IDX_2)) >> (PTE_PI_IDX_2 - 2)) | \
        (((pte) & BIT(PTE_PI_IDX_1)) >> (PTE_PI_IDX_1 - 1)) | \
        (((pte) & BIT(PTE_PI_IDX_0)) >> (PTE_PI_IDX_0 - 0)))

/*
 * Page types used via Permission Indirection Extension (PIE). PIE uses
 * the USER, DBM, PXN and UXN bits to generate an index which is used
 * to look up the actual permission in PIR_ELx and PIRE0_EL1. We define
 * combinations we use on non-PIE systems with the same encoding, for
 * convenience these are listed here as comments as are the unallocated
 * encodings.
 */

/* 0: PAGE_DEFAULT                                                  */
/* 1:                                                      PTE_USER */
/* 2:                                          PTE_WRITE            */
/* 3:                                          PTE_WRITE | PTE_USER */
/* 4: PAGE_EXECONLY                  PTE_PXN                        */
/* 5: PAGE_READONLY_EXEC             PTE_PXN |             PTE_USER */
/* 6:                                PTE_PXN | PTE_WRITE            */
/* 7: PAGE_SHARED_EXEC               PTE_PXN | PTE_WRITE | PTE_USER */
/* 8: PAGE_KERNEL_ROX      PTE_UXN                                  */
/* 9: PAGE_GCS_RO          PTE_UXN |                       PTE_USER */
/* a: PAGE_KERNEL_EXEC     PTE_UXN |           PTE_WRITE            */
/* b: PAGE_GCS             PTE_UXN |           PTE_WRITE | PTE_USER */
/* c: PAGE_KERNEL_RO       PTE_UXN | PTE_PXN                        */
/* d: PAGE_READONLY        PTE_UXN | PTE_PXN |             PTE_USER */
/* e: PAGE_KERNEL          PTE_UXN | PTE_PXN | PTE_WRITE            */
/* f: PAGE_SHARED          PTE_UXN | PTE_PXN | PTE_WRITE | PTE_USER */

#define _PAGE_GCS        (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_WRITE | PTE_USER)
#define _PAGE_GCS_RO        (_PAGE_DEFAULT | PTE_NG | PTE_UXN | PTE_USER)

#define PAGE_GCS        __pgprot(_PAGE_GCS)
#define PAGE_GCS_RO        __pgprot(_PAGE_GCS_RO)

/*
 * PIE_E0: the OR of one PIRx_ELx_PERM_PREP() entry per page type listed
 * below, each pairing the page type's permission-indirection index with a
 * PIE_* permission encoding.
 */
#define PIE_E0        ( \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS),           PIE_GCS)  | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO),        PIE_R)   | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY),      PIE_X_O) | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_RX_O)  | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC),   PIE_RWX_O) | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY),      PIE_R_O)   | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED),        PIE_RW_O))

/*
 * PIE_E1: same construction as PIE_E0, but with a different permission per
 * page type and with additional entries for the _PAGE_KERNEL* types.
 */
#define PIE_E1        ( \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS),           PIE_NONE_O) | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_GCS_RO),        PIE_NONE_O) | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_EXECONLY),      PIE_NONE_O) | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY_EXEC), PIE_R)      | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED_EXEC),   PIE_RW)     | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_READONLY),      PIE_R)      | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_SHARED),        PIE_RW)     | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_ROX),    PIE_RX)     | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_EXEC),   PIE_RWX)    | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL_RO),     PIE_R)      | \
        PIRx_ELx_PERM_PREP(pte_pi_index(_PAGE_KERNEL),        PIE_RW))

#endif /* __ASM_PGTABLE_PROT_H */






























































































 1514 


















 1518 






















































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_IRQFLAGS_H
#define __ASM_IRQFLAGS_H

#include <asm/barrier.h>
#include <asm/ptrace.h>
#include <asm/sysreg.h>

/*
 * Aarch64 has flags for masking: Debug, Asynchronous (serror), Interrupts and
 * FIQ exceptions, in the 'daif' register. We mask and unmask them in 'daif'
 * order:
 * Masking debug exceptions causes all other exceptions to be masked too.
 * Masking SError masks IRQ/FIQ, but not debug exceptions. IRQ and FIQ are
 * always masked and unmasked together, and have no side effects for other
 * flags. Keeping to this order makes it easier for entry.S to know which
 * exceptions should be unmasked.
 */

/*
 * Unmask IRQ and FIQ through the DAIF flags ("daifclr" with immediate #3).
 * The compiler barriers on either side keep the compiler from moving memory
 * accesses across the unmask.
 */
static __always_inline void __daif_local_irq_enable(void)
{
        barrier();
        asm volatile("msr daifclr, #3");
        barrier();
}

/*
 * Unmask interrupts by writing GIC_PRIO_IRQON to ICC_PMR_EL1, followed by
 * pmr_sync().
 */
static __always_inline void __pmr_local_irq_enable(void)
{
        /* Debug builds only: warn if PMR holds an unexpected value on entry. */
        if (IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING)) {
                u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
                WARN_ON_ONCE(pmr != GIC_PRIO_IRQON && pmr != GIC_PRIO_IRQOFF);
        }

        barrier();
        write_sysreg_s(GIC_PRIO_IRQON, SYS_ICC_PMR_EL1);
        pmr_sync();
        barrier();
}

/* Unmask interrupts using whichever masking scheme the system runs with. */
static inline void arch_local_irq_enable(void)
{
        if (!system_uses_irq_prio_masking()) {
                __daif_local_irq_enable();
                return;
        }

        __pmr_local_irq_enable();
}

/*
 * Mask IRQ and FIQ through the DAIF flags ("daifset" with immediate #3).
 * The compiler barriers on either side keep the compiler from moving memory
 * accesses across the mask.
 */
static __always_inline void __daif_local_irq_disable(void)
{
        barrier();
        asm volatile("msr daifset, #3");
        barrier();
}

/*
 * Mask interrupts by writing GIC_PRIO_IRQOFF to ICC_PMR_EL1. Unlike the
 * enable path, no pmr_sync() is issued after the write.
 */
static __always_inline void __pmr_local_irq_disable(void)
{
        /* Debug builds only: warn if PMR holds an unexpected value on entry. */
        if (IS_ENABLED(CONFIG_ARM64_DEBUG_PRIORITY_MASKING)) {
                u32 pmr = read_sysreg_s(SYS_ICC_PMR_EL1);
                WARN_ON_ONCE(pmr != GIC_PRIO_IRQON && pmr != GIC_PRIO_IRQOFF);
        }

        barrier();
        write_sysreg_s(GIC_PRIO_IRQOFF, SYS_ICC_PMR_EL1);
        barrier();
}

/* Mask interrupts using whichever masking scheme the system runs with. */
static inline void arch_local_irq_disable(void)
{
        if (!system_uses_irq_prio_masking()) {
                __daif_local_irq_disable();
                return;
        }

        __pmr_local_irq_disable();
}

/* Read the current DAIF register value. */
static __always_inline unsigned long __daif_local_save_flags(void)
{
        unsigned long flags = read_sysreg(daif);

        return flags;
}

/* Read the current ICC_PMR_EL1 value. */
static __always_inline unsigned long __pmr_local_save_flags(void)
{
        unsigned long flags = read_sysreg_s(SYS_ICC_PMR_EL1);

        return flags;
}

/*
 * Return the current interrupt mask state: the PMR value when priority
 * masking is in use, the DAIF value otherwise.
 */
static inline unsigned long arch_local_save_flags(void)
{
        return system_uses_irq_prio_masking() ? __pmr_local_save_flags()
                                              : __daif_local_save_flags();
}

/* True when the I bit is set in the saved DAIF value @flags. */
static __always_inline bool __daif_irqs_disabled_flags(unsigned long flags)
{
        return (flags & PSR_I_BIT) != 0;
}

/* True when the saved PMR value @flags is anything but GIC_PRIO_IRQON. */
static __always_inline bool __pmr_irqs_disabled_flags(unsigned long flags)
{
        return !(flags == GIC_PRIO_IRQON);
}

/* Decode saved @flags according to the masking scheme in use. */
static inline bool arch_irqs_disabled_flags(unsigned long flags)
{
        return system_uses_irq_prio_masking() ? __pmr_irqs_disabled_flags(flags)
                                              : __daif_irqs_disabled_flags(flags);
}

/* True when the current DAIF value has interrupts masked. */
static __always_inline bool __daif_irqs_disabled(void)
{
        unsigned long flags = __daif_local_save_flags();

        return __daif_irqs_disabled_flags(flags);
}

/* True when the current PMR value has interrupts masked. */
static __always_inline bool __pmr_irqs_disabled(void)
{
        unsigned long flags = __pmr_local_save_flags();

        return __pmr_irqs_disabled_flags(flags);
}

/* Report whether interrupts are currently masked on this CPU. */
static inline bool arch_irqs_disabled(void)
{
        return system_uses_irq_prio_masking() ? __pmr_irqs_disabled()
                                              : __daif_irqs_disabled();
}

/* Snapshot DAIF, mask interrupts, and return the snapshot. */
static __always_inline unsigned long __daif_local_irq_save(void)
{
        const unsigned long saved = __daif_local_save_flags();

        __daif_local_irq_disable();
        return saved;
}

/* Snapshot PMR, mask interrupts if they were enabled, and return the snapshot. */
static __always_inline unsigned long __pmr_local_irq_save(void)
{
        const unsigned long saved = __pmr_local_save_flags();

        /*
         * Several distinct PMR values mean "interrupts disabled"; if one of
         * them is already in place, keep it rather than overwriting it.
         */
        if (__pmr_irqs_disabled_flags(saved))
                return saved;

        __pmr_local_irq_disable();
        return saved;
}

/* Mask interrupts and return the previous mask state for a later restore. */
static inline unsigned long arch_local_irq_save(void)
{
        return system_uses_irq_prio_masking() ? __pmr_local_irq_save()
                                              : __daif_local_irq_save();
}

/*
 * Write the previously saved @flags back to the DAIF register, bracketed by
 * compiler barriers.
 */
static __always_inline void __daif_local_irq_restore(unsigned long flags)
{
        barrier();
        write_sysreg(flags, daif);
        barrier();
}

/*
 * Write the previously saved @flags back to ICC_PMR_EL1 and issue
 * pmr_sync(), bracketed by compiler barriers.
 */
static __always_inline void __pmr_local_irq_restore(unsigned long flags)
{
        barrier();
        write_sysreg_s(flags, SYS_ICC_PMR_EL1);
        pmr_sync();
        barrier();
}

/*
 * Restore an interrupt mask state previously obtained from
 * arch_local_save_flags() or arch_local_irq_save().
 */
static inline void arch_local_irq_restore(unsigned long flags)
{
        if (!system_uses_irq_prio_masking()) {
                __daif_local_irq_restore(flags);
                return;
        }

        __pmr_local_irq_restore(flags);
}

#endif /* __ASM_IRQFLAGS_H */









































  268 



  268 
  268 

  267 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * generic net pointers
 */

#ifndef __NET_GENERIC_H__
#define __NET_GENERIC_H__

#include <linux/bug.h>
#include <linux/rcupdate.h>
#include <net/net_namespace.h>

/*
 * Generic net pointers are to be used by modules to put some private
 * stuff on the struct net without explicit struct net modification
 *
 * The rules are simple:
 * 1. set pernet_operations->id.  After register_pernet_device you
 *    will have the id of your private pointer.
 * 2. set pernet_operations->size to have the code allocate and free
 *    a private structure pointed to from struct net.
 * 3. do not change this pointer while the net is alive;
 * 4. do not try to have any private reference on the net_generic object.
 *
 * After accomplishing all of the above, the private pointer can be
 * accessed with the net_generic() call.
 */

/*
 * Per-net table of private pointers.
 *
 * The header fields (s) and the pointer array (ptr) live in the same union,
 * so the first array slots share storage with the header.
 * NOTE(review): presumably valid ids start past the header - confirm against
 * the allocation code.
 */
struct net_generic {
        union {
                struct {
                        unsigned int len;	/* presumably the number of ptr[] slots - confirm */
                        struct rcu_head rcu;	/* RCU callback head */
                } s;

                /* Private pointers, indexed by id in net_generic(). */
                DECLARE_FLEX_ARRAY(void *, ptr);
        };
};

/*
 * Return the private pointer stored under @id for @net.
 *
 * The generic table is dereferenced under rcu_read_lock(); only the slot
 * value is carried out of the read-side section. No bounds check is made
 * on @id.
 */
static inline void *net_generic(const struct net *net, unsigned int id)
{
        struct net_generic *gen;
        void *priv;

        rcu_read_lock();
        gen = rcu_dereference(net->gen);
        priv = gen->ptr[id];
        rcu_read_unlock();

        return priv;
}
#endif


























    9 






































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_LE_H_
#define _ASM_GENERIC_BITOPS_LE_H_

#include <asm/types.h>
#include <asm/byteorder.h>

#if defined(__LITTLE_ENDIAN)

#define BITOP_LE_SWIZZLE        0

#elif defined(__BIG_ENDIAN)

#define BITOP_LE_SWIZZLE        ((BITS_PER_LONG-1) & ~0x7)

#endif


/* Test little-endian bit @nr at @addr; the index is XORed with BITOP_LE_SWIZZLE. */
static inline int test_bit_le(int nr, const void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        return test_bit(bit, addr);
}

/* set_bit() on little-endian bit @nr at @addr (index XORed with BITOP_LE_SWIZZLE). */
static inline void set_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        set_bit(bit, addr);
}

/* clear_bit() on little-endian bit @nr at @addr (index XORed with BITOP_LE_SWIZZLE). */
static inline void clear_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        clear_bit(bit, addr);
}

/* __set_bit() on little-endian bit @nr at @addr (index XORed with BITOP_LE_SWIZZLE). */
static inline void __set_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        __set_bit(bit, addr);
}

/* __clear_bit() on little-endian bit @nr at @addr (index XORed with BITOP_LE_SWIZZLE). */
static inline void __clear_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        __clear_bit(bit, addr);
}

/* test_and_set_bit() on little-endian bit @nr at @addr; returns its result. */
static inline int test_and_set_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        return test_and_set_bit(bit, addr);
}

/* test_and_clear_bit() on little-endian bit @nr at @addr; returns its result. */
static inline int test_and_clear_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        return test_and_clear_bit(bit, addr);
}

/* __test_and_set_bit() on little-endian bit @nr at @addr; returns its result. */
static inline int __test_and_set_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        return __test_and_set_bit(bit, addr);
}

/* __test_and_clear_bit() on little-endian bit @nr at @addr; returns its result. */
static inline int __test_and_clear_bit_le(int nr, void *addr)
{
        int bit = nr ^ BITOP_LE_SWIZZLE;

        return __test_and_clear_bit(bit, addr);
}

#endif /* _ASM_GENERIC_BITOPS_LE_H_ */



























  331 






















































  331 



















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 *  bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 *
 * @bitnum: index of the bit used as the lock
 * @addr:   word(s) containing the lock bit
 *
 * Returns with preemption disabled. When neither CONFIG_SMP nor
 * CONFIG_DEBUG_SPINLOCK is set, the bit itself is never touched and only
 * the preemption count is changed.
 */
static __always_inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Allow preemption while waiting for the holder to release. */
                preempt_enable();
                do {
                        cpu_relax();
                } while (test_bit(bitnum, addr));
                preempt_disable();
        }
#endif
        __acquire(bitlock);
}

/*
 * Try once to take the bit lock, without spinning.
 *
 * Return true if it was acquired (preemption is then left disabled);
 * returns 0 with preemption re-enabled if the bit was already set.
 */
static __always_inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Lost the race: undo the preempt_disable() above. */
                preempt_enable();
                return 0;
        }
#endif
        __acquire(bitlock);
        return 1;
}

/*
 *  bit-based spin_unlock()
 *
 * Clears the lock bit with clear_bit_unlock() and re-enables preemption.
 * With CONFIG_DEBUG_SPINLOCK, unlocking a bit that is not set is a BUG.
 */
static __always_inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 *  bit-based spin_unlock()
 *  non-atomic version, which can be used eg. if the bit lock itself is
 *  protecting the rest of the flags in the word.
 *
 * Identical to bit_spin_unlock() except that the bit is cleared with
 * __clear_bit_unlock().
 */
static __always_inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
        BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        __clear_bit_unlock(bitnum, addr);
#endif
        preempt_enable();
        __release(bitlock);
}

/*
 * Return true if the lock is held.
 *
 * The answer comes from the lock bit itself when CONFIG_SMP or
 * CONFIG_DEBUG_SPINLOCK is set. Otherwise the bit is never written by the
 * lock functions, so the result is preempt_count() when
 * CONFIG_PREEMPT_COUNT is available, and unconditionally 1 if not.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
        return preempt_count();
#else
        return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */












































  330 



































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 */
#ifndef __ASM_PERCPU_H
#define __ASM_PERCPU_H

#include <linux/preempt.h>

#include <asm/alternative.h>
#include <asm/cmpxchg.h>
#include <asm/stack_pointer.h>
#include <asm/sysreg.h>

/*
 * Store @off as this CPU's per-cpu offset. The value is written to
 * tpidr_el1, or to tpidr_el2 when the ARM64_HAS_VIRT_HOST_EXTN alternative
 * is selected. The "memory" clobber keeps the compiler from moving memory
 * accesses across the register write.
 */
static inline void set_my_cpu_offset(unsigned long off)
{
        asm volatile(ALTERNATIVE("msr tpidr_el1, %0",
                                 "msr tpidr_el2, %0",
                                 ARM64_HAS_VIRT_HOST_EXTN)
                        :: "r" (off) : "memory");
}

/* Per-cpu offset as seen by hyp code: the current value of tpidr_el2. */
static inline unsigned long __hyp_my_cpu_offset(void)
{
        /*
         * Preemption is disabled while non-VHE hyp code runs, so unlike
         * __kern_my_cpu_offset there is no need to hazard this register
         * read against barrier().
         */
        unsigned long off = read_sysreg(tpidr_el2);

        return off;
}

/*
 * Read this CPU's per-cpu offset from tpidr_el1, or from tpidr_el2 when the
 * ARM64_HAS_VIRT_HOST_EXTN alternative is selected (the same registers that
 * set_my_cpu_offset() writes).
 */
static inline unsigned long __kern_my_cpu_offset(void)
{
        unsigned long off;

        /*
         * We want to allow caching the value, so avoid using volatile and
         * instead use a fake stack read to hazard against barrier().
         */
        asm(ALTERNATIVE("mrs %0, tpidr_el1",
                        "mrs %0, tpidr_el2",
                        ARM64_HAS_VIRT_HOST_EXTN)
                : "=r" (off) :
                /* Dummy memory input: the "fake stack read" described above. */
                "Q" (*(const unsigned long *)current_stack_pointer));

        return off;
}

/*
 * Select the per-cpu offset accessor: the tpidr_el2-only variant when
 * building with __KVM_NVHE_HYPERVISOR__ defined, the alternative-patched
 * kernel variant otherwise.
 */
#ifdef __KVM_NVHE_HYPERVISOR__
#define __my_cpu_offset __hyp_my_cpu_offset()
#else
#define __my_cpu_offset __kern_my_cpu_offset()
#endif

/*
 * PERCPU_RW_OPS(sz) - generate __percpu_read_<sz>() and __percpu_write_<sz>().
 *
 * Both helpers access *ptr as a u<sz> through READ_ONCE()/WRITE_ONCE(). The
 * read helper returns the value widened to unsigned long; the write helper
 * truncates @val to u<sz> before storing it.
 */
#define PERCPU_RW_OPS(sz)                                                \
static inline unsigned long __percpu_read_##sz(void *ptr)                \
{                                                                        \
        return READ_ONCE(*(u##sz *)ptr);                                \
}                                                                        \
                                                                        \
static inline void __percpu_write_##sz(void *ptr, unsigned long val)        \
{                                                                        \
        WRITE_ONCE(*(u##sz *)ptr, (u##sz)val);                                \
}

/*
 * __PERCPU_OP_CASE - generate __percpu_<name>_case_<sz>(ptr, val), a
 * read-modify-write of the u<sz> at @ptr that returns nothing.
 *
 * @w:       register-width modifier used in the asm operands ("w" or empty)
 * @sfx:     size suffix appended to ldxr/stxr ("b", "h" or empty)
 * @name:    operation name used in the generated function name
 * @sz:      operand size in bits
 * @op_llsc: ALU instruction used between ldxr and stxr in the LL/SC variant
 * @op_lse:  single instruction used in the LSE variant
 *
 * The LL/SC sequence is four instructions (load-exclusive, op,
 * store-exclusive, retry branch while the store reports failure in @loop).
 * The LSE sequence is one instruction padded with three nops, so both
 * alternatives have the same length.
 */
#define __PERCPU_OP_CASE(w, sfx, name, sz, op_llsc, op_lse)                \
static inline void                                                        \
__percpu_##name##_case_##sz(void *ptr, unsigned long val)                \
{                                                                        \
        unsigned int loop;                                                \
        u##sz tmp;                                                        \
                                                                        \
        asm volatile (ARM64_LSE_ATOMIC_INSN(                                \
        /* LL/SC */                                                        \
        "1:        ldxr" #sfx "\t%" #w "[tmp], %[ptr]\n"                        \
                #op_llsc "\t%" #w "[tmp], %" #w "[tmp], %" #w "[val]\n"        \
        "        stxr" #sfx "\t%w[loop], %" #w "[tmp], %[ptr]\n"                \
        "        cbnz        %w[loop], 1b",                                        \
        /* LSE atomics */                                                \
                #op_lse "\t%" #w "[val], %[ptr]\n"                        \
                __nops(3))                                                \
        : [loop] "=&r" (loop), [tmp] "=&r" (tmp),                        \
          [ptr] "+Q"(*(u##sz *)ptr)                                        \
        : [val] "r" ((u##sz)(val)));                                        \
}

/*
 * __PERCPU_RET_OP_CASE - generate __percpu_<name>_return_case_<sz>(ptr, val),
 * a read-modify-write of the u<sz> at @ptr that returns the value left in
 * @ret by the asm sequence.
 *
 * Parameters are as for __PERCPU_OP_CASE. In the LL/SC variant @ret holds the
 * value that was stored. In the LSE variant @op_lse writes a value into @ret
 * (presumably the previous memory contents - confirm against the instruction
 * reference) and @op_llsc is applied again on top of it with @val; two nops
 * pad the sequence to the LL/SC length.
 */
#define __PERCPU_RET_OP_CASE(w, sfx, name, sz, op_llsc, op_lse)                \
static inline u##sz                                                        \
__percpu_##name##_return_case_##sz(void *ptr, unsigned long val)        \
{                                                                        \
        unsigned int loop;                                                \
        u##sz ret;                                                        \
                                                                        \
        asm volatile (ARM64_LSE_ATOMIC_INSN(                                \
        /* LL/SC */                                                        \
        "1:        ldxr" #sfx "\t%" #w "[ret], %[ptr]\n"                        \
                #op_llsc "\t%" #w "[ret], %" #w "[ret], %" #w "[val]\n"        \
        "        stxr" #sfx "\t%w[loop], %" #w "[ret], %[ptr]\n"                \
        "        cbnz        %w[loop], 1b",                                        \
        /* LSE atomics */                                                \
                #op_lse "\t%" #w "[val], %" #w "[ret], %[ptr]\n"        \
                #op_llsc "\t%" #w "[ret], %" #w "[ret], %" #w "[val]\n"        \
                __nops(2))                                                \
        : [loop] "=&r" (loop), [ret] "=&r" (ret),                        \
          [ptr] "+Q"(*(u##sz *)ptr)                                        \
        : [val] "r" ((u##sz)(val)));                                        \
                                                                        \
        return ret;                                                        \
}

/*
 * PERCPU_OP() / PERCPU_RET_OP() - expand the corresponding *_CASE generator
 * for all four operand sizes. The 8-, 16- and 32-bit cases use "w" register
 * operands (with "b"/"h" load/store suffixes for the two narrow sizes); the
 * 64-bit case uses neither modifier nor suffix.
 */
#define PERCPU_OP(name, op_llsc, op_lse)                                \
        __PERCPU_OP_CASE(w, b, name,  8, op_llsc, op_lse)                \
        __PERCPU_OP_CASE(w, h, name, 16, op_llsc, op_lse)                \
        __PERCPU_OP_CASE(w,  , name, 32, op_llsc, op_lse)                \
        __PERCPU_OP_CASE( ,  , name, 64, op_llsc, op_lse)

#define PERCPU_RET_OP(name, op_llsc, op_lse)                                \
        __PERCPU_RET_OP_CASE(w, b, name,  8, op_llsc, op_lse)                \
        __PERCPU_RET_OP_CASE(w, h, name, 16, op_llsc, op_lse)                \
        __PERCPU_RET_OP_CASE(w,  , name, 32, op_llsc, op_lse)                \
        __PERCPU_RET_OP_CASE( ,  , name, 64, op_llsc, op_lse)

/*
 * Instantiate the helpers for every size: plain read/write, add, and-not
 * (bic / stclr), or (orr / stset) and add-return (add / ldadd).
 */
PERCPU_RW_OPS(8)
PERCPU_RW_OPS(16)
PERCPU_RW_OPS(32)
PERCPU_RW_OPS(64)
PERCPU_OP(add, add, stadd)
PERCPU_OP(andnot, bic, stclr)
PERCPU_OP(or, orr, stset)
PERCPU_RET_OP(add, add, ldadd)

/* The generator macros are not needed past this point. */
#undef PERCPU_RW_OPS
#undef __PERCPU_OP_CASE
#undef __PERCPU_RET_OP_CASE
#undef PERCPU_OP
#undef PERCPU_RET_OP

/*
 * It would be nice to avoid the conditional call into the scheduler when
 * re-enabling preemption for preemptible kernels, but doing that in a way
 * which builds inside a module would mean messing directly with the preempt
 * count. If you do this, peterz and tglx will hunt you down.
 *
 * Not to mention it'll break the actual preemption model for missing a
 * preemption point when TIF_NEED_RESCHED gets set while preemption is
 * disabled.
 */

/*
 * _pcp_protect(op, pcp, ...) - call op() on this CPU's instance of @pcp with
 * preemption disabled around the call. The first argument passed to op() is
 * raw_cpu_ptr(&(pcp)); the remaining arguments are forwarded unchanged. At
 * least one extra argument is required, and any result of op() is discarded.
 */
#define _pcp_protect(op, pcp, ...)                                        \
({                                                                        \
        preempt_disable_notrace();                                        \
        op(raw_cpu_ptr(&(pcp)), __VA_ARGS__);                                \
        preempt_enable_notrace();                                        \
})

/*
 * _pcp_protect_return(op, pcp, args...) - like _pcp_protect(), but the
 * statement expression evaluates to op()'s result cast to typeof(pcp).
 * The extra arguments are optional (note the "##args"), which the read
 * accessors rely on.
 */
#define _pcp_protect_return(op, pcp, args...)                                \
({                                                                        \
        typeof(pcp) __retval;                                                \
        preempt_disable_notrace();                                        \
        __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);        \
        preempt_enable_notrace();                                        \
        __retval;                                                        \
})

/*
 * this_cpu_read_N(): read this CPU's instance of @pcp, where N is the size
 * in bytes (the helpers are named after the size in bits).
 */
#define this_cpu_read_1(pcp)                \
        _pcp_protect_return(__percpu_read_8, pcp)
#define this_cpu_read_2(pcp)                \
        _pcp_protect_return(__percpu_read_16, pcp)
#define this_cpu_read_4(pcp)                \
        _pcp_protect_return(__percpu_read_32, pcp)
#define this_cpu_read_8(pcp)                \
        _pcp_protect_return(__percpu_read_64, pcp)

/*
 * this_cpu_write_N(): store @val into this CPU's instance of @pcp, where N
 * is the size in bytes.
 *
 * @val is parenthesized before the cast so that the cast applies to the
 * whole expression. Without the parentheses an argument such as "x < 0"
 * would expand to "(unsigned long)x < 0", which is always false.
 */
#define this_cpu_write_1(pcp, val)        \
        _pcp_protect(__percpu_write_8, pcp, (unsigned long)(val))
#define this_cpu_write_2(pcp, val)        \
        _pcp_protect(__percpu_write_16, pcp, (unsigned long)(val))
#define this_cpu_write_4(pcp, val)        \
        _pcp_protect(__percpu_write_32, pcp, (unsigned long)(val))
#define this_cpu_write_8(pcp, val)        \
        _pcp_protect(__percpu_write_64, pcp, (unsigned long)(val))

/*
 * this_cpu_add_N(): add @val to this CPU's instance of @pcp, where N is the
 * size in bytes. No value is produced.
 */
#define this_cpu_add_1(pcp, val)        \
        _pcp_protect(__percpu_add_case_8, pcp, val)
#define this_cpu_add_2(pcp, val)        \
        _pcp_protect(__percpu_add_case_16, pcp, val)
#define this_cpu_add_4(pcp, val)        \
        _pcp_protect(__percpu_add_case_32, pcp, val)
#define this_cpu_add_8(pcp, val)        \
        _pcp_protect(__percpu_add_case_64, pcp, val)

/*
 * this_cpu_add_return_N(): as above, but evaluates to the result of the
 * matching __percpu_add_return_case_<bits>() helper, cast to typeof(pcp).
 */
#define this_cpu_add_return_1(pcp, val)        \
        _pcp_protect_return(__percpu_add_return_case_8, pcp, val)
#define this_cpu_add_return_2(pcp, val)        \
        _pcp_protect_return(__percpu_add_return_case_16, pcp, val)
#define this_cpu_add_return_4(pcp, val)        \
        _pcp_protect_return(__percpu_add_return_case_32, pcp, val)
#define this_cpu_add_return_8(pcp, val)        \
        _pcp_protect_return(__percpu_add_return_case_64, pcp, val)

/*
 * this_cpu_and_N(): AND @val into this CPU's instance of @pcp, where N is
 * the size in bytes.
 *
 * The operation is built on the and-not helpers, so the mask is complemented
 * here. @val is parenthesized so that the complement covers the whole
 * expression. Without the parentheses an argument such as "a | b" would
 * expand to "~a | b" and clear the wrong bits.
 */
#define this_cpu_and_1(pcp, val)        \
        _pcp_protect(__percpu_andnot_case_8, pcp, ~(val))
#define this_cpu_and_2(pcp, val)        \
        _pcp_protect(__percpu_andnot_case_16, pcp, ~(val))
#define this_cpu_and_4(pcp, val)        \
        _pcp_protect(__percpu_andnot_case_32, pcp, ~(val))
#define this_cpu_and_8(pcp, val)        \
        _pcp_protect(__percpu_andnot_case_64, pcp, ~(val))

/*
 * this_cpu_or_N(): OR @val into this CPU's instance of @pcp, where N is the
 * size in bytes. No value is produced.
 */
#define this_cpu_or_1(pcp, val)                \
        _pcp_protect(__percpu_or_case_8, pcp, val)
#define this_cpu_or_2(pcp, val)                \
        _pcp_protect(__percpu_or_case_16, pcp, val)
#define this_cpu_or_4(pcp, val)                \
        _pcp_protect(__percpu_or_case_32, pcp, val)
#define this_cpu_or_8(pcp, val)                \
        _pcp_protect(__percpu_or_case_64, pcp, val)

/*
 * this_cpu_xchg_N(): every size maps to xchg_relaxed() on this CPU's
 * instance of @pcp, run with preemption disabled.
 */
#define this_cpu_xchg_1(pcp, val)        \
        _pcp_protect_return(xchg_relaxed, pcp, val)
#define this_cpu_xchg_2(pcp, val)        \
        _pcp_protect_return(xchg_relaxed, pcp, val)
#define this_cpu_xchg_4(pcp, val)        \
        _pcp_protect_return(xchg_relaxed, pcp, val)
#define this_cpu_xchg_8(pcp, val)        \
        _pcp_protect_return(xchg_relaxed, pcp, val)

/*
 * this_cpu_cmpxchg_N(): every size maps to cmpxchg_relaxed() on this CPU's
 * instance of @pcp, run with preemption disabled.
 */
#define this_cpu_cmpxchg_1(pcp, o, n)        \
        _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
#define this_cpu_cmpxchg_2(pcp, o, n)        \
        _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
#define this_cpu_cmpxchg_4(pcp, o, n)        \
        _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)
#define this_cpu_cmpxchg_8(pcp, o, n)        \
        _pcp_protect_return(cmpxchg_relaxed, pcp, o, n)

/* The 64-bit variant is simply the 8-byte case. */
#define this_cpu_cmpxchg64(pcp, o, n)        this_cpu_cmpxchg_8(pcp, o, n)

/*
 * this_cpu_cmpxchg128(pcp, o, n) - 128-bit compare-and-exchange on this
 * CPU's instance of @pcp.
 *
 * @o and @n are each evaluated once into u128 temporaries before preemption
 * is disabled. The statement expression evaluates to the value returned by
 * cmpxchg128_local().
 */
#define this_cpu_cmpxchg128(pcp, o, n)                                        \
({                                                                        \
        typedef typeof(pcp) pcp_op_T__;                                        \
        u128 old__, new__, ret__;                                        \
        pcp_op_T__ *ptr__;                                                \
        old__ = o;                                                        \
        new__ = n;                                                        \
        preempt_disable_notrace();                                        \
        ptr__ = raw_cpu_ptr(&(pcp));                                        \
        ret__ = cmpxchg128_local((void *)ptr__, old__, new__);                \
        preempt_enable_notrace();                                        \
        ret__;                                                                \
})

/*
 * For nVHE hyp builds, per_cpu_offset() is routed to the external
 * __hyp_per_cpu_offset() and __per_cpu_offset is defined as an empty macro.
 * NOTE(review): presumably this must precede the asm-generic/percpu.h include
 * so that its default definitions are skipped - confirm against that header.
 */
#ifdef __KVM_NVHE_HYPERVISOR__
extern unsigned long __hyp_per_cpu_offset(unsigned int cpu);
#define __per_cpu_offset
#define per_cpu_offset(cpu)        __hyp_per_cpu_offset((cpu))
#endif

#include <asm-generic/percpu.h>

/*
 * Redefine macros for nVHE hyp under DEBUG_PREEMPT to avoid its dependencies:
 * this_cpu_ptr, __this_cpu_read and __this_cpu_write are mapped straight onto
 * their raw_cpu_* counterparts.
 */
#if defined(__KVM_NVHE_HYPERVISOR__) && defined(CONFIG_DEBUG_PREEMPT)
#undef        this_cpu_ptr
#define        this_cpu_ptr                raw_cpu_ptr
#undef        __this_cpu_read
#define        __this_cpu_read                raw_cpu_read
#undef        __this_cpu_write
#define        __this_cpu_write        raw_cpu_write
#endif

#endif /* __ASM_PERCPU_H */












































































































































































































































  165 










  109 
   77 
   77 















































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/writeback.h
 */
#ifndef WRITEBACK_H
#define WRITEBACK_H

#include <linux/sched.h>
#include <linux/workqueue.h>
#include <linux/fs.h>
#include <linux/flex_proportions.h>
#include <linux/backing-dev-defs.h>
#include <linux/blk_types.h>
#include <linux/pagevec.h>

struct bio;

DECLARE_PER_CPU(int, dirty_throttle_leaks);

/*
 * The global dirty threshold is normally equal to the global dirty limit,
 * except when the system suddenly allocates a lot of anonymous memory and
 * knocks down the global dirty threshold quickly, in which case the global
 * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
 */
#define DIRTY_SCOPE                8

struct backing_dev_info;

/*
 * fs/fs-writeback.c
 */
/* How much a writeback pass waits; stored in writeback_control.sync_mode. */
enum writeback_sync_modes {
        WB_SYNC_NONE,        /* Don't wait on anything */
        WB_SYNC_ALL,        /* Wait on every mapping */
};

/*
 * A control structure which tells the writeback code what to do.  These are
 * always on the stack, and hence need no locking.  They are always initialised
 * in a manner such that unspecified fields are set to zero.
 */
struct writeback_control {
        /* public fields that can be set and/or consumed by the caller: */
        long nr_to_write;                /* Write this many pages, and decrement
                                           this for each page written */
        long pages_skipped;                /* Pages which were not written */

        /*
         * For a_ops->writepages(): if start or end are non-zero then this is
         * a hint that the filesystem need only write out the pages inside that
         * byterange.  The byte at `end' is included in the writeout request.
         */
        loff_t range_start;
        loff_t range_end;

        enum writeback_sync_modes sync_mode;

        unsigned for_kupdate:1;                /* A kupdate writeback */
        unsigned for_background:1;        /* A background writeback */
        unsigned tagged_writepages:1;        /* tag-and-write to avoid livelock */
        unsigned for_reclaim:1;                /* Invoked from the page allocator */
        unsigned range_cyclic:1;        /* range_start is cyclic */
        unsigned for_sync:1;                /* sync(2) WB_SYNC_ALL writeback */
        unsigned unpinned_netfs_wb:1;        /* Cleared I_PINNING_NETFS_WB */

        /*
         * When writeback IOs are bounced through async layers, only the
         * initial synchronous phase should be accounted towards inode
         * cgroup ownership arbitration to avoid confusion.  Later stages
         * can set the following flag to disable the accounting.
         */
        unsigned no_cgroup_owner:1;

        /*
         * To enable batching of swap writes to non-block-device backends,
         * "swap_plug" can be set to point to a 'struct swap_iocb *'.  When
         * all swap writes have been submitted, if that swap_iocb is not NULL,
         * swap_write_unplug() should be called.
         */
        struct swap_iocb **swap_plug;

        /* Target list for splitting a large folio */
        struct list_head *list;

        /* internal fields used by the ->writepages implementation: */
        struct folio_batch fbatch;
        pgoff_t index;
        int saved_err;

#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback *wb;        /* wb this writeback is issued under */
        struct inode *inode;                /* inode being written out */

        /* foreign inode detection, see wbc_detach_inode() */
        int wb_id;                        /* current wb id */
        int wb_lcand_id;                /* last foreign candidate wb id */
        int wb_tcand_id;                /* this foreign candidate wb id */
        size_t wb_bytes;                /* bytes written by current wb */
        size_t wb_lcand_bytes;                /* bytes written by last candidate */
        size_t wb_tcand_bytes;                /* bytes written by this candidate */
#endif
};

/*
 * Translate a writeback_control into block-layer request flags:
 * WB_SYNC_ALL writeback yields REQ_SYNC, kupdate or background writeback
 * yields REQ_BACKGROUND, and anything else yields no flags.
 */
static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc)
{
        /* Data-integrity writeback takes precedence over the hints below. */
        if (wbc->sync_mode == WB_SYNC_ALL)
                return REQ_SYNC;

        if (wbc->for_kupdate || wbc->for_background)
                return REQ_BACKGROUND;

        return 0;
}

/*
 * wbc_blkcg_css(wbc) - blkcg css to use for @wbc: the css of the attached wb
 * when there is one, blkcg_root_css otherwise. Without cgroup writeback it is
 * always blkcg_root_css.
 */
#ifdef CONFIG_CGROUP_WRITEBACK
#define wbc_blkcg_css(wbc) \
        ((wbc)->wb ? (wbc)->wb->blkcg_css : blkcg_root_css)
#else
#define wbc_blkcg_css(wbc)                (blkcg_root_css)
#endif /* CONFIG_CGROUP_WRITEBACK */

/*
 * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
 * and are measured against each other in.  There always is one global
 * domain, global_wb_domain, that every wb in the system is a member of.
 * This allows measuring the relative bandwidth of each wb to distribute
 * dirtyable memory accordingly.
 */
struct wb_domain {
        /* Protects dirty_limit_tstamp and dirty_limit (see below). */
        spinlock_t lock;

        /*
         * Scale the writeback cache size proportional to the relative
         * writeout speed.
         *
         * We do this by keeping a floating proportion between BDIs, based
         * on page writeback completions [end_page_writeback()]. Those
         * devices that write out pages fastest will get the larger share,
         * while the slower will get a smaller share.
         *
         * We use page writeout completions because we are interested in
         * getting rid of dirty pages. Having them written out is the
         * primary goal.
         *
         * We introduce a concept of time, a period over which we measure
         * these events, because demand can/will vary over time. The length
         * of this period itself is measured in page writeback completions.
         */
        struct fprop_global completions;
        struct timer_list period_timer;        /* timer for aging of completions */
        unsigned long period_time;

        /*
         * The dirtyable memory and dirty threshold could be suddenly
         * knocked down by a large amount (eg. on the startup of KVM in a
         * swapless system). This may throw the system into deep dirty
         * exceeded state and throttle heavy/light dirtiers alike. To
         * retain good responsiveness, maintain global_dirty_limit for
         * tracking slowly down to the knocked down dirty threshold.
         *
         * Both fields are protected by ->lock.
         */
        unsigned long dirty_limit_tstamp;
        unsigned long dirty_limit;
};

/**
 * wb_domain_size_changed - memory available to a wb_domain has changed
 * @dom: wb_domain of interest
 *
 * This function should be called when the amount of memory available to
 * @dom has changed.  It resets @dom's dirty limit parameters to prevent
 * the past values which don't match the current configuration from skewing
 * dirty throttling.  Without this, when memory size of a wb_domain is
 * greatly reduced, the dirty throttling logic may allow too many pages to
 * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
 * that situation.
 */
static inline void wb_domain_size_changed(struct wb_domain *dom)
{
        spin_lock(&dom->lock);
        dom->dirty_limit_tstamp = jiffies;
        dom->dirty_limit = 0;
        spin_unlock(&dom->lock);
}

/*
 * fs/fs-writeback.c
 */        
/* Forward declaration; only pointers to it are used in this header. */
struct bdi_writeback;
/*
 * Superblock-wide and flusher-thread writeback entry points. Each
 * enum wb_reason argument identifies why the writeback was requested.
 */
void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
                                                        enum wb_reason reason);
void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason);
void sync_inodes_sb(struct super_block *);
void wakeup_flusher_threads(enum wb_reason reason);
void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
                                enum wb_reason reason);
/* Per-inode helpers. */
void inode_wait_for_writeback(struct inode *inode);
void inode_io_list_del(struct inode *inode);

/*
 * Wait until I_NEW is no longer set in @inode->i_state. The inode
 * definitions used here come from <linux/fs.h>, which this header includes.
 */
static inline void wait_on_inode(struct inode *inode)
{
        wait_var_event(inode_state_wait_address(inode, __I_NEW),
                       !(READ_ONCE(inode->i_state) & I_NEW));
}

#ifdef CONFIG_CGROUP_WRITEBACK

#include <linux/cgroup.h>
#include <linux/bio.h>

/*
 * Out-of-line cgroup writeback helpers, only declared when
 * CONFIG_CGROUP_WRITEBACK is enabled.
 */
void __inode_attach_wb(struct inode *inode, struct folio *folio);
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
                              size_t bytes);
int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
                           enum wb_reason reason, struct wb_completion *done);
void cgroup_writeback_umount(struct super_block *sb);
bool cleanup_offline_cgwb(struct bdi_writeback *wb);

/**
 * inode_attach_wb - associate an inode with its wb
 * @inode: inode of interest
 * @folio: folio being dirtied (may be NULL)
 *
 * If @inode doesn't have its wb, associate it with the wb matching the
 * memcg of @folio or, if @folio is NULL, %current.  May be called w/ or w/o
 * @inode->i_lock.
 */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
        /* Already associated with a wb: nothing to do. */
        if (inode->i_wb)
                return;

        __inode_attach_wb(inode, folio);
}

/**
 * inode_detach_wb - disassociate an inode from its wb
 * @inode: inode of interest
 *
 * @inode is being freed.  Detach from its wb.
 */
static inline void inode_detach_wb(struct inode *inode)
{
        /* No wb attached, so there is no reference to drop. */
        if (!inode->i_wb)
                return;

        /* Detaching is only expected once the inode is marked I_CLEAR. */
        WARN_ON_ONCE(!(inode->i_state & I_CLEAR));
        wb_put(inode->i_wb);
        inode->i_wb = NULL;
}

/* Out-of-line when cgroup writeback is enabled; an empty stub otherwise. */
void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                struct inode *inode);

/**
 * wbc_init_bio - writeback specific initialization of bio
 * @wbc: writeback_control for the writeback in progress
 * @bio: bio to be initialized
 *
 * @bio belongs to the writeback that @wbc controls.  Apply the cgroup
 * writeback context to it by associating it with the blkcg css of @wbc's
 * wb.  Must be called after the bio has been associated with a device.
 */
static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
        /*
         * The pageout() path intentionally leaves @wbc unattached to the
         * inode being written out, so that it never blocks behind a slow
         * cgroup.  Ultimately, pageout() should kick off regular writeback
         * instead of writing things out itself.
         */
        if (!wbc->wb)
                return;

        bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css);
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/*
 * No-op stand-ins used when CONFIG_CGROUP_WRITEBACK is disabled, so callers
 * need no #ifdefs of their own. Every function below has an empty body.
 */
static inline void inode_attach_wb(struct inode *inode, struct folio *folio)
{
}

static inline void inode_detach_wb(struct inode *inode)
{
}

static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
                                               struct inode *inode)
{
}

static inline void wbc_detach_inode(struct writeback_control *wbc)
{
}

static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
{
}

static inline void wbc_account_cgroup_owner(struct writeback_control *wbc,
                                            struct folio *folio, size_t bytes)
{
}

static inline void cgroup_writeback_umount(struct super_block *sb)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

/*
 * mm/page-writeback.c
 */
/* consolidated parameters for balance_dirty_pages() and its subroutines */
struct dirty_throttle_control {
#ifdef CONFIG_CGROUP_WRITEBACK
        struct wb_domain        *dom;
        struct dirty_throttle_control *gdtc;        /* only set in memcg dtc's */
#endif
        struct bdi_writeback        *wb;
        struct fprop_local_percpu *wb_completions;

        /* Domain-wide figures. */
        unsigned long                avail;                /* dirtyable */
        unsigned long                dirty;                /* file_dirty + write + nfs */
        unsigned long                thresh;                /* dirty threshold */
        unsigned long                bg_thresh;        /* dirty background threshold */
        unsigned long                limit;                /* hard dirty limit */

        unsigned long                wb_dirty;        /* per-wb counterparts */
        unsigned long                wb_thresh;
        unsigned long                wb_bg_thresh;

        /* NOTE(review): the three fields below carry no description here;
         * their meaning is defined by the users in mm/page-writeback.c. */
        unsigned long                pos_ratio;
        bool                        freerun;
        bool                        dirty_exceeded;
};

/* Laptop-mode hooks and dirty-state helpers. */
void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
void laptop_mode_timer_fn(struct timer_list *t);
bool node_dirty_ok(struct pglist_data *pgdat);
/* wb_domain setup; teardown is only declared with cgroup writeback. */
int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom);
#endif

/* The single global domain that every wb is a member of. */
extern struct wb_domain global_wb_domain;

/* These are exported to sysctl. */
extern unsigned int dirty_writeback_interval;
extern unsigned int dirty_expire_interval;
extern int laptop_mode;

/* Dirty threshold computation. */
void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
unsigned long cgwb_calc_thresh(struct bdi_writeback *wb);

void wb_update_bandwidth(struct bdi_writeback *wb);

/* Invoke balance dirty pages in async mode. */
/* NOTE(review): presumably passed in @flags of
 * balance_dirty_pages_ratelimited_flags() - confirm in mm/page-writeback.c. */
#define BDP_ASYNC 0x0001

void balance_dirty_pages_ratelimited(struct address_space *mapping);
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
                unsigned int flags);

bool wb_over_bg_thresh(struct bdi_writeback *wb);

/* Folio-by-folio writeback iteration over @mapping, driven by @wbc. */
struct folio *writeback_iter(struct address_space *mapping,
                struct writeback_control *wbc, struct folio *folio, int *error);

/* Per-folio callback type taken by write_cache_pages(). */
typedef int (*writepage_t)(struct folio *folio, struct writeback_control *wbc,
                                void *data);

int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
                             pgoff_t start, pgoff_t end);

/* Dirtying and re-dirtying helpers. */
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio);
bool folio_redirty_for_writepage(struct writeback_control *, struct folio *);
bool redirty_page_for_writepage(struct writeback_control *, struct page *);

/* Per-superblock tracking of inodes under writeback. */
void sb_mark_inode_writeback(struct inode *inode);
void sb_clear_inode_writeback(struct inode *inode);

#endif                /* WRITEBACK_H */























































































  203 
    1 




























 1045 








































































































































  202 



  202 

  202 
















 1229 



 1233 
 1045 
 1192 
    3 















































































































































































































































































































  202 

























































































































 1232 















 1231 









 1234 

 1235 
















  202 




















    8 
  195 

  148 
   60 






  201 
















    8 




















    8 












    8 
    8 



    8 










































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
// SPDX-License-Identifier: GPL-2.0
/*
 * security/tomoyo/file.c
 *
 * Copyright (C) 2005-2011  NTT DATA CORPORATION
 */

#include "common.h"
#include <linux/slab.h>

/*
 * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by a single-path operation type; the value is the MAC index used
 * when looking up the profile mode for that operation. Unlike the other
 * mapping tables in this file, this one is file-local (static).
 */
static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = {
        [TOMOYO_TYPE_EXECUTE]    = TOMOYO_MAC_FILE_EXECUTE,
        /* read, write and append all share the single "open" MAC index. */
        [TOMOYO_TYPE_READ]       = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_WRITE]      = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_APPEND]     = TOMOYO_MAC_FILE_OPEN,
        [TOMOYO_TYPE_UNLINK]     = TOMOYO_MAC_FILE_UNLINK,
        [TOMOYO_TYPE_GETATTR]    = TOMOYO_MAC_FILE_GETATTR,
        [TOMOYO_TYPE_RMDIR]      = TOMOYO_MAC_FILE_RMDIR,
        [TOMOYO_TYPE_TRUNCATE]   = TOMOYO_MAC_FILE_TRUNCATE,
        [TOMOYO_TYPE_SYMLINK]    = TOMOYO_MAC_FILE_SYMLINK,
        [TOMOYO_TYPE_CHROOT]     = TOMOYO_MAC_FILE_CHROOT,
        [TOMOYO_TYPE_UMOUNT]     = TOMOYO_MAC_FILE_UMOUNT,
};

/*
 * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by a device-node creation operation type (block or character
 * device); used in this file both for the profile mode lookup and for
 * selecting the keyword printed in the audit log.
 */
const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = {
        [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK,
        [TOMOYO_TYPE_MKCHAR]  = TOMOYO_MAC_FILE_MKCHAR,
};

/*
 * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index".
 *
 * Indexed by a two-path operation type (link, rename, pivot_root); used in
 * this file to select the keyword printed in the audit log.
 */
const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = {
        [TOMOYO_TYPE_LINK]       = TOMOYO_MAC_FILE_LINK,
        [TOMOYO_TYPE_RENAME]     = TOMOYO_MAC_FILE_RENAME,
        [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT,
};

/*
 * Mapping table from "enum tomoyo_path_number_acl_index" to
 * "enum tomoyo_mac_index".
 *
 * Indexed by a path-plus-number operation type; used in this file both for
 * the profile mode lookup and for selecting the keyword printed in the audit
 * log.
 */
const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = {
        [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE,
        [TOMOYO_TYPE_MKDIR]  = TOMOYO_MAC_FILE_MKDIR,
        [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO,
        [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK,
        [TOMOYO_TYPE_IOCTL]  = TOMOYO_MAC_FILE_IOCTL,
        [TOMOYO_TYPE_CHMOD]  = TOMOYO_MAC_FILE_CHMOD,
        [TOMOYO_TYPE_CHOWN]  = TOMOYO_MAC_FILE_CHOWN,
        [TOMOYO_TYPE_CHGRP]  = TOMOYO_MAC_FILE_CHGRP,
};

/**
 * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union".
 *
 * @ptr: Pointer to "struct tomoyo_name_union".
 *
 * Hands both members of @ptr (the group and the filename) to their
 * respective put helpers. No NULL check is performed here; callers in this
 * file invoke it even after a failed parse, so the helpers presumably
 * tolerate NULL members - verify against their definitions.
 *
 * Returns nothing.
 */
void tomoyo_put_name_union(struct tomoyo_name_union *ptr)
{
        /* Release the group member first, then the filename member. */
        tomoyo_put_group(ptr->group);
        tomoyo_put_name(ptr->filename);
}

/**
 * tomoyo_compare_name_union - Match a pathname against "struct tomoyo_name_union".
 *
 * @name: Pointer to "struct tomoyo_path_info" holding the pathname to test.
 * @ptr:  Pointer to "struct tomoyo_name_union" to test against.
 *
 * When @ptr carries a group, the decision (and the returned entry) comes from
 * tomoyo_path_matches_group(). Otherwise @name is matched against the single
 * pattern in @ptr->filename.
 *
 * Returns the matching "struct tomoyo_path_info" on a match, NULL otherwise.
 */
const struct tomoyo_path_info *
tomoyo_compare_name_union(const struct tomoyo_path_info *name,
                          const struct tomoyo_name_union *ptr)
{
        /* Single-pattern case: report the pattern itself when it matches. */
        if (!ptr->group)
                return tomoyo_path_matches_pattern(name, ptr->filename) ?
                        ptr->filename : NULL;

        /* Group case: delegate to the group matcher. */
        return tomoyo_path_matches_group(name, ptr->group);
}

/**
 * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union".
 *
 * @ptr: Pointer to "struct tomoyo_number_union".
 *
 * Only the group member is handed to a put helper; the numeric range stored
 * in @ptr is not touched by this function.
 *
 * Returns nothing.
 */
void tomoyo_put_number_union(struct tomoyo_number_union *ptr)
{
        tomoyo_put_group(ptr->group);
}

/**
 * tomoyo_compare_number_union - Match a number against "struct tomoyo_number_union".
 *
 * @value: Number to check.
 * @ptr:   Pointer to "struct tomoyo_number_union".
 *
 * When @ptr carries a group, @value is passed as both ends of the range to
 * tomoyo_number_matches_group(). Otherwise @value must lie within the
 * inclusive range @ptr->values[0] .. @ptr->values[1].
 *
 * Returns true if @value matches @ptr, false otherwise.
 */
bool tomoyo_compare_number_union(const unsigned long value,
                                 const struct tomoyo_number_union *ptr)
{
        /* No group: plain inclusive range check, lower bound first. */
        if (!ptr->group)
                return ptr->values[0] <= value && value <= ptr->values[1];

        return tomoyo_number_matches_group(value, value, ptr->group);
}

/**
 * tomoyo_add_slash - Append a trailing '/' unless already a directory name.
 *
 * @buf: Pointer to "struct tomoyo_path_info".
 *
 * Does nothing when @buf->is_dir is already set. Otherwise appends "/" in
 * place and refreshes the cached fields of @buf via tomoyo_fill_path_info().
 *
 * @buf must be generated by tomoyo_encode() because this function does not
 * allocate memory for adding '/'.
 *
 * Returns nothing.
 */
static void tomoyo_add_slash(struct tomoyo_path_info *buf)
{
        if (!buf->is_dir) {
                /*
                 * In-place append is OK because tomoyo_encode() reserves
                 * space for appending "/".
                 */
                strcat((char *) buf->name, "/");
                tomoyo_fill_path_info(buf);
        }
}

/**
 * tomoyo_get_realpath - Resolve a "struct path" into a pathname string.
 *
 * @buf:  Pointer to "struct tomoyo_path_info" to fill in.
 * @path: Pointer to "struct path".
 *
 * Stores the result of tomoyo_realpath_from_path() in @buf->name. On success
 * the remaining fields of @buf are filled by tomoyo_fill_path_info(). On
 * failure @buf->name is left NULL. Callers in this file kfree() @buf->name
 * when done.
 *
 * Returns true on success, false otherwise.
 */
static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path)
{
        buf->name = tomoyo_realpath_from_path(path);
        if (!buf->name)
                return false;

        tomoyo_fill_path_info(buf);
        return true;
}

/**
 * tomoyo_audit_path_log - Audit a single-path request.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Formats the request as "file <keyword> <pathname>" and hands it to
 * tomoyo_supervisor(). The keyword is looked up in tomoyo_path_keyword by the
 * operation stored in @r.
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_path_log(struct tomoyo_request_info *r)
{
        const char *keyword = tomoyo_path_keyword[r->param.path.operation];
        const char *pathname = r->param.path.filename->name;

        return tomoyo_supervisor(r, "file %s %s\n", keyword, pathname);
}

/**
 * tomoyo_audit_path2_log - Audit path/path request log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_path2_log(struct tomoyo_request_info *r)
{
        return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords
                                 [tomoyo_pp2mac[r->param.path2.operation]],
                                 r->param.path2.filename1->name,
                                 r->param.path2.filename2->name);
}

/**
 * tomoyo_audit_mkdev_log - Audit path/number/number/number request log.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r)
{
        return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n",
                                 tomoyo_mac_keywords
                                 [tomoyo_pnnn2mac[r->param.mkdev.operation]],
                                 r->param.mkdev.filename->name,
                                 r->param.mkdev.mode, r->param.mkdev.major,
                                 r->param.mkdev.minor);
}

/**
 * tomoyo_audit_path_number_log - Audit a path-plus-number request.
 *
 * @r: Pointer to "struct tomoyo_request_info".
 *
 * The number is rendered with a radix chosen by operation: octal for
 * create/mkdir/mkfifo/mksock/chmod, hexadecimal for ioctl, and decimal for
 * everything else. The resulting line is "file <keyword> <pathname> <number>".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r)
{
        const u8 type = r->param.path_number.operation;
        /* Decimal unless the operation below says otherwise. */
        u8 radix = TOMOYO_VALUE_TYPE_DECIMAL;
        char number[64];

        switch (type) {
        case TOMOYO_TYPE_IOCTL:
                radix = TOMOYO_VALUE_TYPE_HEXADECIMAL;
                break;
        case TOMOYO_TYPE_CREATE:
        case TOMOYO_TYPE_MKDIR:
        case TOMOYO_TYPE_MKFIFO:
        case TOMOYO_TYPE_MKSOCK:
        case TOMOYO_TYPE_CHMOD:
                radix = TOMOYO_VALUE_TYPE_OCTAL;
                break;
        default:
                break;
        }
        tomoyo_print_ulong(number, sizeof(number),
                           r->param.path_number.number, radix);
        return tomoyo_supervisor(r, "file %s %s %s\n",
                                 tomoyo_mac_keywords[tomoyo_pn2mac[type]],
                                 r->param.path_number.filename->name, number);
}

/**
 * tomoyo_check_path_acl - Test one ACL entry against a single-path request.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * The entry grants the request when its permission bitmask contains the
 * requested operation and its name union matches the requested pathname.
 *
 * To be able to use wildcard for domain transition, this function records
 * the matching entry in @r->param.path.matched_path whenever the permission
 * bit is present (NULL if the name did not match). Since the caller holds
 * tomoyo_read_lock(), it is safe to set matching entry.
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path_acl(struct tomoyo_request_info *r,
                                  const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path_acl *acl =
                container_of(ptr, typeof(*acl), head);

        /* Operation not covered by this entry: matched_path is untouched. */
        if (!(acl->perm & (1 << r->param.path.operation)))
                return false;

        r->param.path.matched_path =
                tomoyo_compare_name_union(r->param.path.filename, &acl->name);
        return r->param.path.matched_path != NULL;
}

/**
 * tomoyo_check_path_number_acl - Check permission for path number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r,
                                         const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path_number_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.path_number.operation)) &&
                tomoyo_compare_number_union(r->param.path_number.number,
                                            &acl->number) &&
                tomoyo_compare_name_union(r->param.path_number.filename,
                                          &acl->name);
}

/**
 * tomoyo_check_path2_acl - Check permission for path path operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_path2_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.path2.operation)) &&
                tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1)
                && tomoyo_compare_name_union(r->param.path2.filename2,
                                             &acl->name2);
}

/**
 * tomoyo_check_mkdev_acl - Check permission for path number number number operation.
 *
 * @r:   Pointer to "struct tomoyo_request_info".
 * @ptr: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if granted, false otherwise.
 */
static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r,
                                   const struct tomoyo_acl_info *ptr)
{
        const struct tomoyo_mkdev_acl *acl =
                container_of(ptr, typeof(*acl), head);

        return (acl->perm & (1 << r->param.mkdev.operation)) &&
                tomoyo_compare_number_union(r->param.mkdev.mode,
                                            &acl->mode) &&
                tomoyo_compare_number_union(r->param.mkdev.major,
                                            &acl->major) &&
                tomoyo_compare_number_union(r->param.mkdev.minor,
                                            &acl->minor) &&
                tomoyo_compare_name_union(r->param.mkdev.filename,
                                          &acl->name);
}

/**
 * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Only the name unions are compared; the permission bitmasks are ignored so
 * that entries differing only in permissions can be merged.
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a,
                                 const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path_acl *lhs =
                container_of(a, typeof(*lhs), head);
        const struct tomoyo_path_acl *rhs =
                container_of(b, typeof(*rhs), head);

        return tomoyo_same_name_union(&lhs->name, &rhs->name);
}

/**
 * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a,
                                  struct tomoyo_acl_info *b,
                                  const bool is_delete)
{
        u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head)
                ->perm;
        u16 perm = READ_ONCE(*a_perm);
        const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_path_acl(const u16 perm,
                                  struct tomoyo_acl_param *param)
{
        struct tomoyo_path_acl e = {
                .head.type = TOMOYO_TYPE_PATH_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path_acl,
                                             tomoyo_merge_path_acl);
        tomoyo_put_name_union(&e.name);
        return error;
}

/**
 * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a,
                                         const struct tomoyo_acl_info *b)
{
        const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->mode, &p2->mode) &&
                tomoyo_same_number_union(&p1->major, &p2->major) &&
                tomoyo_same_number_union(&p1->minor, &p2->minor);
}

/**
 * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a,
                                   struct tomoyo_acl_info *b,
                                   const bool is_delete)
{
        u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl,
                                         head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head)
                ->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_mkdev_acl(const u8 perm,
                                   struct tomoyo_acl_param *param)
{
        struct tomoyo_mkdev_acl e = {
                .head.type = TOMOYO_TYPE_MKDEV_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.mode) ||
            !tomoyo_parse_number_union(param, &e.major) ||
            !tomoyo_parse_number_union(param, &e.minor))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_mkdev_acl,
                                             tomoyo_merge_mkdev_acl);
        tomoyo_put_name_union(&e.name);
        tomoyo_put_number_union(&e.mode);
        tomoyo_put_number_union(&e.major);
        tomoyo_put_number_union(&e.minor);
        return error;
}

/**
 * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a,
                                  const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->name1, &p2->name1) &&
                tomoyo_same_name_union(&p1->name2, &p2->name2);
}

/**
 * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a,
                                   struct tomoyo_acl_info *b,
                                   const bool is_delete)
{
        u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head)
                ->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_path2_acl(const u8 perm,
                                   struct tomoyo_acl_param *param)
{
        struct tomoyo_path2_acl e = {
                .head.type = TOMOYO_TYPE_PATH2_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name1) ||
            !tomoyo_parse_name_union(param, &e.name2))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path2_acl,
                                             tomoyo_merge_path2_acl);
        tomoyo_put_name_union(&e.name1);
        tomoyo_put_name_union(&e.name2);
        return error;
}

/**
 * tomoyo_path_permission - Check permission for single path operation.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @operation: Type of operation (index into tomoyo_p2mac).
 * @filename:  Filename to check.
 *
 * Recomputes @r->type and @r->mode for @operation. If the resulting mode is
 * disabled, the request is allowed without consulting the ACL. Otherwise the
 * ACL check and audit are repeated for as long as the audit step answers
 * TOMOYO_RETRY_REQUEST.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation,
                                  const struct tomoyo_path_info *filename)
{
        r->type = tomoyo_p2mac[operation];
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type);
        if (r->mode == TOMOYO_CONFIG_DISABLED)
                return 0;

        r->param_type = TOMOYO_TYPE_PATH_ACL;
        r->param.path.filename = filename;
        r->param.path.operation = operation;
        for (;;) {
                int error;

                tomoyo_check_acl(r, tomoyo_check_path_acl);
                error = tomoyo_audit_path_log(r);
                /* Anything other than a retry answer is the final verdict. */
                if (error != TOMOYO_RETRY_REQUEST)
                        return error;
        }
}

/**
 * tomoyo_execute_permission - Check permission for execute operation.
 *
 * @r:         Pointer to "struct tomoyo_request_info".
 * @filename:  Filename to check.
 *
 * Besides the permission check, this records in @r->ee->transition the
 * transit preference of the matched ACL entry's condition, or NULL when no
 * entry matched or the matched entry has no condition.
 *
 * Unlike the other checks in this file there is no retry loop here, so
 * whatever tomoyo_audit_path_log() returns is handed back to the caller.
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_execute_permission(struct tomoyo_request_info *r,
                              const struct tomoyo_path_info *filename)
{
        /*
         * Unlike other permission checks, the ACL is consulted regardless of
         * profile mode settings in order to check for domain transition
         * preference.
         */
        r->type = TOMOYO_MAC_FILE_EXECUTE;
        r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type);
        r->param_type = TOMOYO_TYPE_PATH_ACL;
        r->param.path.filename = filename;
        r->param.path.operation = TOMOYO_TYPE_EXECUTE;
        tomoyo_check_acl(r, tomoyo_check_path_acl);

        if (r->matched_acl && r->matched_acl->cond)
                r->ee->transition = r->matched_acl->cond->transit;
        else
                r->ee->transition = NULL;

        /* Auditing (and thus denial) only happens when the mode is enabled. */
        if (r->mode == TOMOYO_CONFIG_DISABLED)
                return 0;
        return tomoyo_audit_path_log(r);
}

/**
 * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b except permission bits, false otherwise.
 */
static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a,
                                        const struct tomoyo_acl_info *b)
{
        const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1),
                                                               head);
        const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2),
                                                               head);

        return tomoyo_same_name_union(&p1->name, &p2->name) &&
                tomoyo_same_number_union(&p1->number, &p2->number);
}

/**
 * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry.
 *
 * @a:         Pointer to "struct tomoyo_acl_info".
 * @b:         Pointer to "struct tomoyo_acl_info".
 * @is_delete: True for @a &= ~@b, false for @a |= @b.
 *
 * Returns true if @a is empty, false otherwise.
 */
static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a,
                                         struct tomoyo_acl_info *b,
                                         const bool is_delete)
{
        u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl,
                                          head)->perm;
        u8 perm = READ_ONCE(*a_perm);
        const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head)
                ->perm;

        if (is_delete)
                perm &= ~b_perm;
        else
                perm |= b_perm;
        WRITE_ONCE(*a_perm, perm);
        return !perm;
}

/**
 * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL.
 *
 * @perm:  Permission.
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 */
static int tomoyo_update_path_number_acl(const u8 perm,
                                         struct tomoyo_acl_param *param)
{
        struct tomoyo_path_number_acl e = {
                .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL,
                .perm = perm
        };
        int error;

        if (!tomoyo_parse_name_union(param, &e.name) ||
            !tomoyo_parse_number_union(param, &e.number))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_path_number_acl,
                                             tomoyo_merge_path_number_acl);
        tomoyo_put_name_union(&e.name);
        tomoyo_put_number_union(&e.number);
        return error;
}

/**
 * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp".
 *
 * @type:   Type of operation.
 * @path:   Pointer to "struct path".
 * @number: Number.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path_number_perm(const u8 type, const struct path *path,
                            unsigned long number)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error = -ENOMEM;
        struct tomoyo_path_info buf;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf, path))
                goto out;
        r.obj = &obj;
        if (type == TOMOYO_TYPE_MKDIR)
                tomoyo_add_slash(&buf);
        r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL;
        r.param.path_number.operation = type;
        r.param.path_number.filename = &buf;
        r.param.path_number.number = number;
        do {
                tomoyo_check_acl(&r, tomoyo_check_path_number_acl);
                error = tomoyo_audit_path_number_log(&r);
        } while (error == TOMOYO_RETRY_REQUEST);
        kfree(buf.name);
 out:
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_check_open_permission - Check permission for "read" and "write".
 *
 * @domain: Pointer to "struct tomoyo_domain_info".
 * @path:   Pointer to "struct path".
 * @flag:   Flags for open().
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 const struct path *path, const int flag)
{
        const u8 acc_mode = ACC_MODE(flag);
        int error = 0;
        struct tomoyo_path_info buf;
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int idx;

        buf.name = NULL;
        r.mode = TOMOYO_CONFIG_DISABLED;
        idx = tomoyo_read_lock();
        if (acc_mode &&
            tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN)
            != TOMOYO_CONFIG_DISABLED) {
                if (!tomoyo_get_realpath(&buf, path)) {
                        error = -ENOMEM;
                        goto out;
                }
                r.obj = &obj;
                if (acc_mode & MAY_READ)
                        error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ,
                                                       &buf);
                if (!error && (acc_mode & MAY_WRITE))
                        error = tomoyo_path_permission(&r, (flag & O_APPEND) ?
                                                       TOMOYO_TYPE_APPEND :
                                                       TOMOYO_TYPE_WRITE,
                                                       &buf);
        }
 out:
        kfree(buf.name);
        tomoyo_read_unlock(idx);
        if (r.mode != TOMOYO_CONFIG_ENFORCING)
                error = 0;
        return error;
}

/**
 * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount".
 *
 * @operation: Type of operation.
 * @path:      Pointer to "struct path".
 * @target:    Symlink's target if @operation is TOMOYO_TYPE_SYMLINK,
 *             NULL otherwise.
 *
 * Returns 0 on success, negative value otherwise.
 */
int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target)
{
        struct tomoyo_request_info r;
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        int error;
        struct tomoyo_path_info buf;
        bool is_enforce;
        struct tomoyo_path_info symlink_target;
        int idx;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation])
            == TOMOYO_CONFIG_DISABLED)
                return 0;
        is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING);
        error = -ENOMEM;
        buf.name = NULL;
        idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf, path))
                goto out;
        r.obj = &obj;
        switch (operation) {
        case TOMOYO_TYPE_RMDIR:
        case TOMOYO_TYPE_CHROOT:
                tomoyo_add_slash(&buf);
                break;
        case TOMOYO_TYPE_SYMLINK:
                symlink_target.name = tomoyo_encode(target);
                if (!symlink_target.name)
                        goto out;
                tomoyo_fill_path_info(&symlink_target);
                obj.symlink_target = &symlink_target;
                break;
        }
        error = tomoyo_path_permission(&r, operation, &buf);
        if (operation == TOMOYO_TYPE_SYMLINK)
                kfree(symlink_target.name);
 out:
        kfree(buf.name);
        tomoyo_read_unlock(idx);
        if (!is_enforce)
                error = 0;
        return error;
}

/**
 * tomoyo_mkdev_perm - Permission check for "mkblock" and "mkchar" requests.
 *
 * @operation: Requested operation.
 *             (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK)
 * @path:      Pointer to "struct path".
 * @mode:      Create mode.
 * @dev:       Device number. It is passed through new_decode_dev() and then
 *             split with MAJOR()/MINOR().
 *
 * Returns 0 if the check is disabled or the mode is not
 * TOMOYO_CONFIG_ENFORCING. Otherwise returns -ENOMEM if the pathname could
 * not be obtained, or the value returned by tomoyo_audit_mkdev_log().
 */
int tomoyo_mkdev_perm(const u8 operation, const struct path *path,
                      const unsigned int mode, unsigned int dev)
{
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path->mnt, .dentry = path->dentry },
        };
        struct tomoyo_request_info r;
        struct tomoyo_path_info buf;
        int lock_idx;
        int ret = -ENOMEM;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation]) ==
            TOMOYO_CONFIG_DISABLED)
                return 0;
        lock_idx = tomoyo_read_lock();
        /* Without a pathname there is nothing to check; keep -ENOMEM. */
        if (!tomoyo_get_realpath(&buf, path))
                goto unlock;
        dev = new_decode_dev(dev);
        r.obj = &obj;
        r.param_type = TOMOYO_TYPE_MKDEV_ACL;
        r.param.mkdev.filename = &buf;
        r.param.mkdev.operation = operation;
        r.param.mkdev.mode = mode;
        r.param.mkdev.major = MAJOR(dev);
        r.param.mkdev.minor = MINOR(dev);
        tomoyo_check_acl(&r, tomoyo_check_mkdev_acl);
        ret = tomoyo_audit_mkdev_log(&r);
        kfree(buf.name);
 unlock:
        tomoyo_read_unlock(lock_idx);
        /* Outside enforcing mode every outcome is reported as success. */
        return r.mode == TOMOYO_CONFIG_ENFORCING ? ret : 0;
}

/**
 * tomoyo_path2_perm - Permission check for "rename", "link" and "pivot_root".
 *
 * @operation: Requested operation.
 * @path1:     Pointer to "struct path" of the first pathname.
 * @path2:     Pointer to "struct path" of the second pathname.
 *
 * The ACL check is repeated for as long as tomoyo_audit_path2_log() returns
 * TOMOYO_RETRY_REQUEST.
 *
 * Returns 0 if the check is disabled or the mode is not
 * TOMOYO_CONFIG_ENFORCING. Otherwise returns -ENOMEM if either pathname could
 * not be obtained, or the last value returned by tomoyo_audit_path2_log().
 */
int tomoyo_path2_perm(const u8 operation, const struct path *path1,
                      const struct path *path2)
{
        struct tomoyo_obj_info obj = {
                .path1 = { .mnt = path1->mnt, .dentry = path1->dentry },
                .path2 = { .mnt = path2->mnt, .dentry = path2->dentry }
        };
        struct tomoyo_request_info r;
        struct tomoyo_path_info buf1;
        struct tomoyo_path_info buf2;
        int lock_idx;
        int ret = -ENOMEM;

        if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation]) ==
            TOMOYO_CONFIG_DISABLED)
                return 0;
        /* Lets both kfree() calls under "out" run on any early exit. */
        buf1.name = NULL;
        buf2.name = NULL;
        lock_idx = tomoyo_read_lock();
        if (!tomoyo_get_realpath(&buf1, path1))
                goto out;
        if (!tomoyo_get_realpath(&buf2, path2))
                goto out;
        /*
         * Both names get tomoyo_add_slash() for pivot_root always, and for
         * rename/link only when the first path is a directory.
         */
        if (operation == TOMOYO_TYPE_PIVOT_ROOT ||
            ((operation == TOMOYO_TYPE_RENAME ||
              operation == TOMOYO_TYPE_LINK) &&
             d_is_dir(path1->dentry))) {
                tomoyo_add_slash(&buf1);
                tomoyo_add_slash(&buf2);
        }
        r.obj = &obj;
        r.param_type = TOMOYO_TYPE_PATH2_ACL;
        r.param.path2.operation = operation;
        r.param.path2.filename1 = &buf1;
        r.param.path2.filename2 = &buf2;
        for (;;) {
                tomoyo_check_acl(&r, tomoyo_check_path2_acl);
                ret = tomoyo_audit_path2_log(&r);
                if (ret != TOMOYO_RETRY_REQUEST)
                        break;
        }
 out:
        kfree(buf1.name);
        kfree(buf2.name);
        tomoyo_read_unlock(lock_idx);
        /* Outside enforcing mode every outcome is reported as success. */
        return r.mode == TOMOYO_CONFIG_ENFORCING ? ret : 0;
}

/**
 * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry.
 *
 * @a: Pointer to "struct tomoyo_acl_info".
 * @b: Pointer to "struct tomoyo_acl_info".
 *
 * Returns true if @a == @b, false otherwise.
 */
static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a,
                                  const struct tomoyo_acl_info *b)
{
        const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head);
        const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head);

        return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) &&
                tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) &&
                tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) &&
                tomoyo_same_number_union(&p1->flags, &p2->flags);
}

/**
 * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Returns 0 on success, negative value otherwise.
 *
 * Caller holds tomoyo_read_lock().
 */
static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param)
{
        struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL };
        int error;

        if (!tomoyo_parse_name_union(param, &e.dev_name) ||
            !tomoyo_parse_name_union(param, &e.dir_name) ||
            !tomoyo_parse_name_union(param, &e.fs_type) ||
            !tomoyo_parse_number_union(param, &e.flags))
                error = -EINVAL;
        else
                error = tomoyo_update_domain(&e.head, sizeof(e), param,
                                             tomoyo_same_mount_acl, NULL);
        tomoyo_put_name_union(&e.dev_name);
        tomoyo_put_name_union(&e.dir_name);
        tomoyo_put_name_union(&e.fs_type);
        tomoyo_put_number_union(&e.flags);
        return error;
}

/**
 * tomoyo_write_file - Update file related list.
 *
 * @param: Pointer to "struct tomoyo_acl_param".
 *
 * Reads one token from @param and matches it, via tomoyo_permstr(), against
 * each keyword group in turn: path, path2, path_number, mkdev, and finally
 * the mount keyword. The first group with at least one match selects the
 * update function; matching keywords of that group are passed as a bitmask.
 *
 * Returns the value of the selected update function, or -EINVAL if the token
 * matches no keyword.
 *
 * Caller holds tomoyo_read_lock().
 */
int tomoyo_write_file(struct tomoyo_acl_param *param)
{
        const char *token = tomoyo_read_token(param);
        u16 mask = 0;
        u8 i;

        /* Bit i of mask is set when the token matches keyword i of a group. */
        for (i = 0; i < TOMOYO_MAX_PATH_OPERATION; i++) {
                if (tomoyo_permstr(token, tomoyo_path_keyword[i]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_path_acl(mask, param);

        for (i = 0; i < TOMOYO_MAX_PATH2_OPERATION; i++) {
                if (tomoyo_permstr(token,
                                   tomoyo_mac_keywords[tomoyo_pp2mac[i]]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_path2_acl(mask, param);

        for (i = 0; i < TOMOYO_MAX_PATH_NUMBER_OPERATION; i++) {
                if (tomoyo_permstr(token,
                                   tomoyo_mac_keywords[tomoyo_pn2mac[i]]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_path_number_acl(mask, param);

        for (i = 0; i < TOMOYO_MAX_MKDEV_OPERATION; i++) {
                if (tomoyo_permstr(token,
                                   tomoyo_mac_keywords[tomoyo_pnnn2mac[i]]))
                        mask |= 1 << i;
        }
        if (mask)
                return tomoyo_update_mkdev_acl(mask, param);

        if (!tomoyo_permstr(token,
                            tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT]))
                return -EINVAL;
        return tomoyo_update_mount_acl(param);
}

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_
#define _ASM_GENERIC_BITOPS_BUILTIN___FFS_H_

/**
 * __ffs - find first bit in word.
 * @word: The word to search
 *
 * Thin wrapper around the compiler's __builtin_ctzl(); the builtin's result
 * is returned as an unsigned int.
 *
 * Undefined if no bit exists, so code should check against 0 first.
 */
static __always_inline unsigned int __ffs(unsigned long word)
{
        const unsigned int first_set = __builtin_ctzl(word);

        return first_set;
}

#endif



















































































































































































































   95 









































  115 



  116 










































   41 

   16 

   16 
   27 







   41 

   16 

   16 
   27 











    8 
   17 




































   16 



    5 
   11 


























   64 







    8 
   17 








   80 



   26 






   58 








   56 






















   99 


























































































































   66 








   66 























   66 





   66 




































   66 










   66 















   66 
























































































































   66 





   66 


   66 





   66 










































   85 







   85 





















































































































































































































   41 





    3 














    3 







  153 
























  153 






   84 








   86 
   86 





   86 



















   16 





   41 









    3 




















   42 





















































































































































































   35 


























































































































































































































































































































































































































   81 


   80 
   80 















































































































































































































































































































































































































































   95 


   52 



   43 








































































   35 




   35 










   35 





















































   61 























   61 





   61 
















   61 


   61 
   61 
   61 
























   61 


































   61 

































































































   35 





   35 
   26 










    9 


































    9 






































   35 

















































































































































































































































































































   35 







   35 































   35 
   35 

















   35 













   35 


   35 










































































   35 


   35 






   35 


































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   86 


   86 



























   61 
























































   24 








   23 
   23 




















    5 
    5 










































































































































































































































































































































































































































   24 












   24 








   24 



   24 
   24 

    8 



   16 










   16 





































































































   24 














   24 





   81 
   62 


   23 
   24 

   24 




   81 


   62 












   24 





   24 

























































































































































































































































































































































   35 



   35 








































    8 





























    9 

   25 


















   35 












































































































   43 





   43 










































   43 




































   43 









   43 




















   43 





















































































































































































































































































































































































































































































































































  153 

















  148 






  153 








   58 




















   61 






   58 



    3 







   61 







   61 




























































   61 





   58 
    3 
    3 






   86 

















   86 


















































   25 




   22 









    3 



















   23 











































































   16 


   16 


   16 

    3 

   13 








































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
7544
7545
7546
7547
7548
7549
7550
7551
7552
7553
7554
7555
7556
7557
7558
7559
7560
7561
7562
7563
7564
7565
7566
7567
7568
7569
7570
7571
7572
7573
7574
7575
7576
7577
7578
7579
7580
7581
7582
7583
7584
7585
7586
7587
7588
7589
7590
7591
7592
7593
7594
7595
7596
7597
7598
7599
7600
7601
7602
7603
7604
7605
7606
7607
7608
7609
7610
7611
7612
7613
7614
7615
7616
7617
7618
7619
7620
7621
7622
7623
7624
7625
7626
7627
7628
7629
7630
7631
7632
7633
7634
7635
7636
7637
7638
7639
7640
7641
7642
7643
7644
7645
7646
7647
7648
7649
7650
7651
7652
7653
7654
7655
7656
7657
7658
7659
7660
7661
7662
7663
7664
7665
7666
7667
7668
7669
7670
7671
7672
7673
7674
7675
7676
7677
7678
7679
7680
7681
7682
7683
7684
7685
7686
7687
7688
7689
7690
7691
7692
7693
7694
7695
7696
7697
7698
7699
7700
7701
7702
7703
7704
7705
7706
7707
7708
7709
7710
7711
7712
7713
7714
7715
7716
7717
7718
7719
7720
7721
7722
7723
7724
7725
7726
7727
7728
7729
7730
7731
7732
7733
7734
7735
7736
7737
7738
7739
7740
7741
7742
7743
7744
7745
7746
7747
7748
7749
7750
7751
7752
7753
7754
7755
7756
7757
7758
7759
7760
7761
7762
7763
7764
7765
7766
7767
7768
7769
7770
7771
7772
7773
7774
7775
7776
7777
7778
7779
7780
7781
7782
7783
7784
7785
7786
7787
7788
7789
7790
7791
7792
7793
7794
7795
7796
7797
7798
7799
7800
7801
7802
7803
7804
7805
7806
7807
7808
7809
7810
7811
7812
7813
7814
7815
7816
7817
7818
7819
7820
7821
7822
7823
7824
7825
7826
7827
7828
7829
7830
7831
7832
7833
7834
7835
7836
7837
7838
7839
7840
7841
7842
7843
7844
7845
7846
7847
7848
7849
7850
7851
7852
7853
7854
7855
7856
7857
7858
7859
7860
7861
7862
7863
7864
7865
7866
7867
7868
7869
7870
7871
7872
7873
7874
7875
7876
7877
7878
7879
7880
7881
7882
7883
7884
7885
7886
7887
7888
7889
7890
7891
7892
7893
7894
7895
7896
7897
7898
7899
7900
7901
7902
7903
7904
7905
7906
7907
7908
7909
7910
7911
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic hugetlb support.
 * (C) Nadia Yvette Chambers, April 2004
 */
#include <linux/list.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/highmem.h>
#include <linux/mmu_notifier.h>
#include <linux/nodemask.h>
#include <linux/pagemap.h>
#include <linux/mempolicy.h>
#include <linux/compiler.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
#include <linux/memblock.h>
#include <linux/minmax.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
#include <linux/string_helpers.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
#include <linux/cma.h>
#include <linux/migrate.h>
#include <linux/nospec.h>
#include <linux/delayacct.h>
#include <linux/memory.h>
#include <linux/mm_inline.h>
#include <linux/padata.h>

#include <asm/page.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/setup.h>

#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/node.h>
#include <linux/page_owner.h>
#include "internal.h"
#include "hugetlb_vmemmap.h"
#include "hugetlb_cma.h"
#include <linux/page-isolation.h>

/*
 * Table of huge page size states, with room for HUGE_MAX_HSTATE entries.
 * NOTE(review): hugetlb_max_hstate and default_hstate_idx look like the
 * number of populated hstates[] entries and the index of the default one;
 * confirm against their users.
 */
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];

/*
 * Init-only (__initdata) boot state: an array of MAX_NUMNODES list heads
 * and an array of HUGE_MAX_HSTATE counters.
 * NOTE(review): presumably indexed by node id and by hstate index
 * respectively; confirm against the boot-time allocation code.
 */
__initdata struct list_head huge_boot_pages[MAX_NUMNODES];
static unsigned long hstate_boot_nrinvalid[HUGE_MAX_HSTATE] __initdata;

/*
 * Due to ordering constraints across the init code for various
 * architectures, hugetlb hstate cmdline parameters can't simply
 * be early_param. early_param might call the setup function
 * before valid hugetlb page sizes are determined, leading to
 * incorrect rejection of valid hugepagesz= options.
 *
 * So, record the parameters early and consume them whenever the
 * init code is ready for them, by calling hugetlb_parse_params().
 */

/* one (hugepagesz=,hugepages=) pair per hstate, one default_hugepagesz */
#define HUGE_MAX_CMDLINE_ARGS        (2 * HUGE_MAX_HSTATE + 1)

/*
 * One recorded command line parameter: a value string together with the
 * setup handler that takes such a string.
 */
struct hugetlb_cmdline {
        char *val;                /* string handed to ->setup() */
        int (*setup)(char *val);  /* handler for this parameter */
};

/*
 * for command line parsing
 *
 * All of this state is init-only (__initdata).
 */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
/* Starts out true; the setup handlers are expected to update it. */
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
static unsigned long hugepage_allocation_threads __initdata;

/*
 * Storage for parameters recorded before they can be parsed.
 * NOTE(review): hstate_cmdline_buf presumably holds copies of the value
 * strings and hugetlb_params[] the (val, setup) records, with the two
 * *_index variables tracking the next free position in each; confirm in
 * hugetlb_add_param().
 */
static char hstate_cmdline_buf[COMMAND_LINE_SIZE] __initdata;
static int hstate_cmdline_index __initdata;
static struct hugetlb_cmdline hugetlb_params[HUGE_MAX_CMDLINE_ARGS] __initdata;
static int hugetlb_param_index __initdata;
static __init int hugetlb_add_param(char *s, int (*setup)(char *val));
static __init void hugetlb_parse_params(void);

/*
 * hugetlb_early_param(str, func) - hook @func up to command line option @str.
 *
 * Defines an __init wrapper named func##args that forwards the option value
 * and @func to hugetlb_add_param(), then registers that wrapper (not @func
 * itself) with early_param().
 */
#define hugetlb_early_param(str, func) \
static __init int func##args(char *s) \
{ \
        return hugetlb_add_param(s, func); \
} \
early_param(str, func##args)

/*
 * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
 * free_huge_pages, and surplus_huge_pages.
 */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);

/*
 * Serializes faults on the same logical page.  This is used to
 * prevent spurious OOMs when the hugepage pool is fully utilized.
 *
 * Both variables are __ro_after_init.
 * NOTE(review): num_fault_mutexes is presumably the number of entries in
 * hugetlb_fault_mutex_table; confirm where the table is allocated.
 */
static int num_fault_mutexes __ro_after_init;
struct mutex *hugetlb_fault_mutex_table __ro_after_init;

/* Forward declarations of static helpers that are used before they are defined */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                unsigned long start, unsigned long end);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);

/*
 * Release @folio through the path that matches how it is flagged: folios
 * for which folio_test_hugetlb_cma() is true are handed to
 * hugetlb_cma_free_folio(), every other folio is dropped with folio_put().
 */
static void hugetlb_free_folio(struct folio *folio)
{
        if (!folio_test_hugetlb_cma(folio))
                folio_put(folio);
        else
                hugetlb_cma_free_folio(folio);
}

/*
 * Tell whether @spool may be released.
 *
 * A subpool with a non-zero handle count is never free.  Otherwise the
 * answer depends on which limit is enabled (a limit of -1 is treated as
 * "not enabled"):
 *  - maximum enabled: free once no pages are in use;
 *  - only minimum enabled: free once the whole minimum is back in
 *    rsv_hpages;
 *  - neither enabled: free.
 */
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
        if (spool->count != 0)
                return false;

        /* No accounting of either kind: nothing can still be outstanding. */
        if (spool->max_hpages == -1 && spool->min_hpages == -1)
                return true;

        /* Maximum size accounting takes precedence when it is enabled. */
        if (spool->max_hpages != -1)
                return !spool->used_hpages;

        /* Only minimum size accounting is enabled. */
        return spool->min_hpages == spool->rsv_hpages;
}

/*
 * Drop spool->lock, restoring the interrupt state saved in @irq_flags, and
 * free the subpool if nothing needs it any more.
 *
 * The caller must hold spool->lock.  The "is it free?" decision is sampled
 * while the lock is still held: once the lock is dropped, a subpool that was
 * NOT free may be released by a concurrent final put, so its fields must not
 * be read again.  If it WAS free, no pages are in use and no other handle
 * remains, so this context is the only one left that can touch it and the
 * accesses below are safe.
 */
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
                                                unsigned long irq_flags)
{
        bool release = subpool_is_free(spool);

        spin_unlock_irqrestore(&spool->lock, irq_flags);

        if (!release)
                return;

        /*
         * No pages are used and no other handles to the subpool remain:
         * give up any reservations based on minimum size and free the
         * subpool.
         */
        if (spool->min_hpages != -1)
                hugetlb_acct_memory(spool->hstate, -spool->min_hpages);
        kfree(spool);
}

/*
 * Allocate and initialise a subpool for hstate @h.
 *
 * The new subpool starts with a handle count of 1 and records @max_hpages
 * and @min_hpages as its limits.  When @min_hpages is not -1 the minimum is
 * charged up front through hugetlb_acct_memory(); rsv_hpages is then set to
 * @min_hpages.
 *
 * Returns the new subpool, or NULL if the allocation or the up-front charge
 * fails.
 */
struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
                                                long min_hpages)
{
        struct hugepage_subpool *spool = kzalloc(sizeof(*spool), GFP_KERNEL);

        if (!spool)
                return NULL;

        spin_lock_init(&spool->lock);
        spool->count = 1;
        spool->hstate = h;
        spool->max_hpages = max_hpages;
        spool->min_hpages = min_hpages;

        /* Charge the minimum size now; undo the allocation if that fails. */
        if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages))
                goto out_free;

        spool->rsv_hpages = min_hpages;
        return spool;

out_free:
        kfree(spool);
        return NULL;
}

/*
 * Drop one handle on @spool.  The count is decremented under spool->lock
 * (it is a BUG for it to already be zero) and the lock is then released via
 * unlock_or_release_subpool(), which may free the subpool.
 */
void hugepage_put_subpool(struct hugepage_subpool *spool)
{
        unsigned long irq_flags;

        spin_lock_irqsave(&spool->lock, irq_flags);
        BUG_ON(spool->count == 0);
        spool->count -= 1;
        unlock_or_release_subpool(spool, irq_flags);
}

/*
 * Subpool accounting for allocating and reserving @delta pages.
 *
 * Returns -ENOMEM when the subpool's maximum size would be exceeded.
 * Otherwise returns how many pages the global pools still have to be
 * adjusted (upward) by.  That number differs from @delta only when the
 * subpool maintains a minimum size: pages covered by reserves already
 * taken on the subpool's behalf are not charged again.
 *
 * A NULL @spool means there is no subpool accounting; @delta is returned
 * unchanged.
 */
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                      long delta)
{
        long adjust = delta;

        if (!spool)
                return delta;

        spin_lock_irq(&spool->lock);

        /* maximum size accounting */
        if (spool->max_hpages != -1) {
                if ((spool->used_hpages + delta) > spool->max_hpages) {
                        adjust = -ENOMEM;
                        goto out_unlock;
                }
                spool->used_hpages += delta;
        }

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->rsv_hpages) {
                if (delta <= spool->rsv_hpages) {
                        /* reserves already accounted for */
                        spool->rsv_hpages -= delta;
                        adjust = 0;
                } else {
                        /*
                         * The request is larger than the reserves already
                         * taken on behalf of the subpool: use them all up
                         * and report only the remainder.
                         */
                        adjust = delta - spool->rsv_hpages;
                        spool->rsv_hpages = 0;
                }
        }

out_unlock:
        spin_unlock_irq(&spool->lock);
        return adjust;
}

/*
 * Subpool accounting for freeing and unreserving @delta pages.
 *
 * Returns the number of global page reservations that must be dropped.
 * That number differs from @delta only when the subpool maintains a
 * minimum size: pages needed to refill the subpool's reserve stay charged.
 *
 * A NULL @spool means there is no subpool accounting; @delta is returned
 * unchanged.
 */
static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                       long delta)
{
        unsigned long irq_flags;
        long drop = delta;

        if (!spool)
                return delta;

        spin_lock_irqsave(&spool->lock, irq_flags);

        /* maximum size accounting */
        if (spool->max_hpages != -1)
                spool->used_hpages -= delta;

        /* minimum size accounting */
        if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
                if (spool->rsv_hpages + delta > spool->min_hpages) {
                        /* Refill the reserve to the minimum, drop the excess. */
                        drop = spool->rsv_hpages + delta - spool->min_hpages;
                        spool->rsv_hpages = spool->min_hpages;
                } else {
                        /* Everything returned goes back into the reserve. */
                        drop = 0;
                        spool->rsv_hpages += delta;
                }
        }

        /*
         * This also frees the subpool if hugetlbfs_put_super could not do
         * so earlier because of an outstanding quota reference.
         */
        unlock_or_release_subpool(spool, irq_flags);

        return drop;
}

/*
 * Return the subpool recorded in the hugetlbfs superblock info of the
 * superblock that @inode belongs to.
 */
static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
{
        return HUGETLBFS_SB(inode->i_sb)->spool;
}

/* Return the hugepage subpool of the file that backs @vma. */
static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
{
        struct inode *inode = file_inode(vma->vm_file);

        return subpool_inode(inode);
}

/*
 * hugetlb vma_lock helper routines
 */
/*
 * Take the hugetlb lock covering @vma for read.  Shareable vmas use the
 * hugetlb_vma_lock hung off vm_private_data; private vmas that pass
 * __vma_private_lock() use the semaphore in their resv_map.  Any other vma
 * is left untouched.
 */
void hugetlb_vma_lock_read(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *shared_lock;
        struct resv_map *private_map;

        if (__vma_shareable_lock(vma)) {
                shared_lock = vma->vm_private_data;
                down_read(&shared_lock->rw_sema);
                return;
        }

        if (__vma_private_lock(vma)) {
                private_map = vma_resv_map(vma);
                down_read(&private_map->rw_sema);
        }
}

/*
 * Release a read hold on the hugetlb lock covering @vma.  Selects the
 * semaphore the same way hugetlb_vma_lock_read() does; a vma with neither
 * kind of lock is left untouched.
 */
void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *shared_lock;
        struct resv_map *private_map;

        if (__vma_shareable_lock(vma)) {
                shared_lock = vma->vm_private_data;
                up_read(&shared_lock->rw_sema);
                return;
        }

        if (__vma_private_lock(vma)) {
                private_map = vma_resv_map(vma);
                up_read(&private_map->rw_sema);
        }
}

/*
 * Take the hugetlb lock covering @vma for write.  Shareable vmas use the
 * hugetlb_vma_lock hung off vm_private_data; private vmas that pass
 * __vma_private_lock() use the semaphore in their resv_map.  Any other vma
 * is left untouched.
 */
void hugetlb_vma_lock_write(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *shared_lock;
        struct resv_map *private_map;

        if (__vma_shareable_lock(vma)) {
                shared_lock = vma->vm_private_data;
                down_write(&shared_lock->rw_sema);
                return;
        }

        if (__vma_private_lock(vma)) {
                private_map = vma_resv_map(vma);
                down_write(&private_map->rw_sema);
        }
}

/*
 * Release a write hold on the hugetlb lock covering @vma.  Selects the
 * semaphore the same way hugetlb_vma_lock_write() does; a vma with neither
 * kind of lock is left untouched.
 */
void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *shared_lock;
        struct resv_map *private_map;

        if (__vma_shareable_lock(vma)) {
                shared_lock = vma->vm_private_data;
                up_write(&shared_lock->rw_sema);
                return;
        }

        if (__vma_private_lock(vma)) {
                private_map = vma_resv_map(vma);
                up_write(&private_map->rw_sema);
        }
}

/*
 * Try to take the hugetlb lock covering @vma for write without sleeping.
 *
 * Returns the result of down_write_trylock() on the selected semaphore, or
 * 1 when @vma has neither a shareable nor a private lock (there is nothing
 * to take, so the attempt counts as a success).
 */
int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *shared_lock;
        struct resv_map *private_map;

        if (__vma_shareable_lock(vma)) {
                shared_lock = vma->vm_private_data;
                return down_write_trylock(&shared_lock->rw_sema);
        }

        if (!__vma_private_lock(vma))
                return 1;

        private_map = vma_resv_map(vma);
        return down_write_trylock(&private_map->rw_sema);
}

/*
 * Lockdep assertion that the hugetlb lock covering @vma is held.  Nothing
 * is asserted for a vma that has neither a shareable nor a private lock.
 */
void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *shared_lock;
        struct resv_map *private_map;

        if (__vma_shareable_lock(vma)) {
                shared_lock = vma->vm_private_data;
                lockdep_assert_held(&shared_lock->rw_sema);
                return;
        }

        if (__vma_private_lock(vma)) {
                private_map = vma_resv_map(vma);
                lockdep_assert_held(&private_map->rw_sema);
        }
}

/*
 * kref release callback: free the hugetlb_vma_lock that embeds @kref as its
 * 'refs' member.
 */
void hugetlb_vma_lock_release(struct kref *kref)
{
        kfree(container_of(kref, struct hugetlb_vma_lock, refs));
}

/*
 * Detach @vma_lock from its vma, release its semaphore from write mode and
 * drop one reference on the structure.  The up_write() below means the
 * caller is expected to hold vma_lock->rw_sema for write on entry.
 */
static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
{
        struct vm_area_struct *owner = vma_lock->vma;

        /*
         * The put below may or may not free the vma_lock structure, but in
         * either case it is no longer attached to the vma, so clear the
         * pointers in both directions.  The semaphore synchronizes access
         * to the vma_lock->vma field.
         */
        vma_lock->vma = NULL;
        owner->vm_private_data = NULL;
        up_write(&vma_lock->rw_sema);
        kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
}

static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
{
        if (__vma_shareable_lock(vma)) {
                struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

                __hugetlb_vma_unlock_write_put(vma_lock);
        } else if (__vma_private_lock(vma)) {
                struct resv_map *resv_map = vma_resv_map(vma);

                /* no free for anon vmas, but still need to unlock */
                up_write(&resv_map->rw_sema);
        }
}

/*
 * Detach and release the per-vma lock of @vma, if it has one.  The lock
 * structure only exists for shareable vmas; a NULL @vma, a non-shareable
 * vma, or one without a lock attached is a no-op.
 */
static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *vma_lock;

        if (!vma || !__vma_shareable_lock(vma))
                return;

        vma_lock = vma->vm_private_data;
        if (!vma_lock)
                return;

        /* Take the semaphore for write so the helper can detach and put. */
        down_write(&vma_lock->rw_sema);
        __hugetlb_vma_unlock_write_put(vma_lock);
}

/*
 * Allocate a hugetlb_vma_lock and attach it to @vma through
 * vm_private_data.  Only done for vmas with VM_MAYSHARE set that do not
 * already have private data; anything else (including a NULL @vma) is a
 * no-op.  Allocation failure is not reported to the caller, only warned
 * about once.
 */
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *new_lock;

        /*
         * Only establish in (flags) sharable vmas, and we should never get
         * here with non-NULL vm_private_data.
         */
        if (!vma || !(vma->vm_flags & VM_MAYSHARE) || vma->vm_private_data)
                return;

        new_lock = kmalloc(sizeof(*new_lock), GFP_KERNEL);
        if (new_lock) {
                kref_init(&new_lock->refs);
                init_rwsem(&new_lock->rw_sema);
                new_lock->vma = vma;
                vma->vm_private_data = new_lock;
                return;
        }

        /*
         * If we can not allocate structure, then vma can not
         * participate in pmd sharing.  This is only a possible
         * performance enhancement and memory saving issue.
         * However, the lock is also used to synchronize page
         * faults with truncation.  If the lock is not present,
         * unlikely races could leave pages in a file past i_size
         * until the file is removed.  Warn in the unlikely case of
         * allocation failure.
         */
        pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
}

/*
 * Pop the first file_region off resv->region_cache, set its range to
 * [from, to) and hand it to the caller.  The cache must not be empty
 * (checked with VM_BUG_ON).
 */
static struct file_region *
get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
        struct file_region *entry;

        VM_BUG_ON(resv->region_cache_count <= 0);

        entry = list_first_entry(&resv->region_cache, struct file_region, link);
        list_del(&entry->link);
        resv->region_cache_count--;

        entry->from = from;
        entry->to = to;

        return entry;
}

/*
 * Copy the cgroup uncharge info (reservation counter and css) from @rg to
 * @nrg, taking an extra css reference when @rg has a css.  Compiles to
 * nothing without CONFIG_CGROUP_HUGETLB.
 */
static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
                                              struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        /* The copy gets its own reference on the css it points at. */
        if (rg->css)
                css_get(rg->css);

        nrg->css = rg->css;
        nrg->reservation_counter = rg->reservation_counter;
#endif
}

/*
 * Record in @nrg which hugetlb cgroup reservation counter and css it
 * should later be uncharged from.  With a NULL @h_cg both fields are
 * cleared.  Compiles to nothing without CONFIG_CGROUP_HUGETLB.
 */
static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
                                                struct hstate *h,
                                                struct resv_map *resv,
                                                struct file_region *nrg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (!h_cg) {
                nrg->reservation_counter = NULL;
                nrg->css = NULL;
                return;
        }

        nrg->reservation_counter = &h_cg->rsvd_hugepage[hstate_index(h)];
        nrg->css = &h_cg->css;

        /*
         * The caller will hold exactly one h_cg->css reference for the
         * whole contiguous reservation region. But this area might be
         * scattered when there are already some file_regions reside in
         * it. As a result, many file_regions may share only one css
         * reference. In order to ensure that one file_region must hold
         * exactly one h_cg->css reference, we should do css_get for
         * each file_region and leave the reference held by caller
         * untouched.
         */
        css_get(&h_cg->css);

        /* First charged entry fixes pages_per_hpage for the whole map. */
        if (!resv->pages_per_hpage)
                resv->pages_per_hpage = pages_per_huge_page(h);

        /* pages_per_hpage should be the same for all entries in a resv_map. */
        VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
#endif
}

/*
 * Drop the css reference held by @rg, if it has one.  Compiles to nothing
 * without CONFIG_CGROUP_HUGETLB.
 */
static void put_uncharge_info(struct file_region *rg)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (!rg->css)
                return;

        css_put(rg->css);
#endif
}

/*
 * Report whether @rg and @org would be uncharged from the same place, i.e.
 * both their css and their reservation counter match.  Always true without
 * CONFIG_CGROUP_HUGETLB.
 */
static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (rg->reservation_counter != org->reservation_counter)
                return false;

        return rg->css == org->css;
#else
        return true;
#endif
}

/*
 * Merge @rg with its list neighbours in resv->regions where possible.
 *
 * A neighbour is merged when it is a real entry (not the list head), its
 * range touches @rg's range exactly, and has_same_uncharge_info() says both
 * carry the same cgroup uncharge info.  Each merge extends the neighbour,
 * unlinks and frees the absorbed region and drops that region's uncharge
 * reference, so @rg may have been freed by the time this returns.
 */
static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
{
        struct file_region *nrg, *prg;

        /* Try to fold rg into the region immediately before it. */
        prg = list_prev_entry(rg, link);
        if (&prg->link != &resv->regions && prg->to == rg->from &&
            has_same_uncharge_info(prg, rg)) {
                prg->to = rg->to;

                list_del(&rg->link);
                put_uncharge_info(rg);
                kfree(rg);

                /* Continue with the surviving (extended) predecessor. */
                rg = prg;
        }

        /* Try to fold rg (possibly the predecessor now) into its successor. */
        nrg = list_next_entry(rg, link);
        if (&nrg->link != &resv->regions && nrg->from == rg->to &&
            has_same_uncharge_info(nrg, rg)) {
                nrg->from = rg->from;

                list_del(&rg->link);
                put_uncharge_info(rg);
                kfree(rg);
        }
}

/*
 * Account for, or actually insert, the range [from, to) after list
 * position @rg.
 *
 * With a non-NULL @regions_needed this only counts: the counter is bumped
 * by one and the list is not modified.  With a NULL @regions_needed an
 * entry is taken from the map's cache, tagged with cgroup uncharge info,
 * linked in after @rg and coalesced with its neighbours.
 *
 * Returns the number of pages in the range (to - from) in both modes.
 */
static inline long
hugetlb_resv_map_add(struct resv_map *map, struct list_head *rg, long from,
                     long to, struct hstate *h, struct hugetlb_cgroup *cg,
                     long *regions_needed)
{
        struct file_region *entry;

        if (regions_needed) {
                *regions_needed += 1;
                return to - from;
        }

        entry = get_file_region_entry_from_cache(map, from, to);
        record_hugetlb_cgroup_uncharge_info(cg, h, map, entry);
        list_add(&entry->link, rg);
        coalesce_file_region(map, entry);

        return to - from;
}

/*
 * Walk resv->regions and account for every gap inside [f, t) that is not
 * yet covered by an existing file_region.
 *
 * Must be called with resv->lock held.
 *
 * Calling this with regions_needed != NULL will count the number of pages
 * to be added but will not modify the linked list. And regions_needed will
 * indicate the number of file_regions needed in the cache to carry out to add
 * the regions for this range.
 *
 * Calling this with regions_needed == NULL actually inserts the missing
 * ranges, taking entries from the resv_map cache via hugetlb_resv_map_add().
 *
 * Returns the number of pages in [f, t) not previously represented in the map.
 */
static long add_reservation_in_range(struct resv_map *resv, long f, long t,
                                     struct hugetlb_cgroup *h_cg,
                                     struct hstate *h, long *regions_needed)
{
        long add = 0;
        struct list_head *head = &resv->regions;
        /* Everything in [f, last_accounted_offset) has been dealt with. */
        long last_accounted_offset = f;
        struct file_region *iter, *trg = NULL;
        /* List position after which the trailing gap, if any, is inserted. */
        struct list_head *rg = NULL;

        if (regions_needed)
                *regions_needed = 0;

        /* In this loop, we essentially handle an entry for the range
         * [last_accounted_offset, iter->from), at every iteration, with some
         * bounds checking.
         */
        list_for_each_entry_safe(iter, trg, head, link) {
                /* Skip irrelevant regions that start before our range. */
                if (iter->from < f) {
                        /* If this region ends after the last accounted offset,
                         * then we need to update last_accounted_offset.
                         */
                        if (iter->to > last_accounted_offset)
                                last_accounted_offset = iter->to;
                        continue;
                }

                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
                if (iter->from >= t) {
                        rg = iter->link.prev;
                        break;
                }

                /* Add an entry for last_accounted_offset -> iter->from, and
                 * update last_accounted_offset.
                 */
                if (iter->from > last_accounted_offset)
                        add += hugetlb_resv_map_add(resv, iter->link.prev,
                                                    last_accounted_offset,
                                                    iter->from, h, h_cg,
                                                    regions_needed);

                last_accounted_offset = iter->to;
        }

        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
        /* No region starts at or after t: the tail goes at the list end. */
        if (!rg)
                rg = head->prev;
        if (last_accounted_offset < t)
                add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
                                            t, h, h_cg, regions_needed);

        return add;
}

/*
 * Top up resv->region_cache so that it holds at least
 * (resv->adds_in_progress + regions_needed) file_region entries.
 *
 * Must be called with resv->lock acquired.  The lock is dropped around the
 * GFP_KERNEL allocations and re-taken before the cache is updated.
 *
 * Returns 0 with resv->lock held on success.  On allocation failure returns
 * -ENOMEM with resv->lock NOT held: the failure is detected while the lock
 * is dropped and it is not re-acquired, so callers must not unlock again.
 */
static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
{
        LIST_HEAD(allocated_regions);
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;

        VM_BUG_ON(regions_needed < 0);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
         *
         * This is a while loop because when we drop the lock, some other call
         * to region_add or region_del may have consumed some region_entries,
         * so we keep looping here until we finally have enough entries for
         * (adds_in_progress + regions_needed).
         */
        while (resv->region_cache_count <
               (resv->adds_in_progress + regions_needed)) {
                to_allocate = resv->adds_in_progress + regions_needed -
                              resv->region_cache_count;

                /* At this point, we should have enough entries in the cache
                 * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);

                spin_unlock(&resv->lock);
                for (i = 0; i < to_allocate; i++) {
                        trg = kmalloc(sizeof(*trg), GFP_KERNEL);
                        if (!trg)
                                goto out_of_memory;
                        list_add(&trg->link, &allocated_regions);
                }

                spin_lock(&resv->lock);

                /*
                 * Use the _init variant: the loop may run again, and a plain
                 * list_splice() would leave allocated_regions still pointing
                 * at entries that now belong to region_cache, so the next
                 * list_add() on it would corrupt the cache list.
                 */
                list_splice_init(&allocated_regions, &resv->region_cache);
                resv->region_cache_count += to_allocate;
        }

        return 0;

out_of_memory:
        /* Lock is not held here; only the private, unpublished list is freed. */
        list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
                list_del(&rg->link);
                kfree(rg);
        }
        return -ENOMEM;
}

/*
 * Add the huge page range represented by [f, t) to the reserve
 * map.  Regions will be taken from the cache to fill in this range.
 * Sufficient regions should exist in the cache due to the previous
 * call to region_chg with the same range, but in some cases the cache will not
 * have sufficient entries due to races with other code doing region_add or
 * region_del.  The extra needed entries will be allocated.
 *
 * regions_needed is the out value provided by a previous call to region_chg.
 *
 * Return the number of new huge pages added to the map.  This number is greater
 * than or equal to zero.  If file_region entries needed to be allocated for
 * this operation and we were not able to allocate, it returns -ENOMEM.
 * region_add of regions of length 1 never allocate file_regions and cannot
 * fail; region_chg will always allocate at least 1 entry and a region_add for
 * 1 page will only require at most 1 entry.
 *
 * Takes and releases resv->lock internally.  On the -ENOMEM path the lock has
 * already been dropped by allocate_file_region_entries(), and adds_in_progress
 * is left unchanged by this function.
 */
static long region_add(struct resv_map *resv, long f, long t,
                       long in_regions_needed, struct hstate *h,
                       struct hugetlb_cgroup *h_cg)
{
        long add = 0, actual_regions_needed = 0;

        spin_lock(&resv->lock);
retry:

        /* Count how many regions are actually needed to execute this add. */
        add_reservation_in_range(resv, f, t, NULL, NULL,
                                 &actual_regions_needed);

        /*
         * Check for sufficient descriptors in the cache to accommodate
         * this add operation. Note that actual_regions_needed may be greater
         * than in_regions_needed, as the resv_map may have been modified since
         * the region_chg call. In this case, we need to make sure that we
         * allocate extra entries, such that we have enough for all the
         * existing adds_in_progress, plus the excess needed for this
         * operation.
         */
        if (actual_regions_needed > in_regions_needed &&
            resv->region_cache_count <
                    resv->adds_in_progress +
                            (actual_regions_needed - in_regions_needed)) {
                /* region_add operation of range 1 should never need to
                 * allocate file_region entries.
                 */
                VM_BUG_ON(t - f <= 1);

                /* Returns with the lock dropped on failure, so no unlock. */
                if (allocate_file_region_entries(
                            resv, actual_regions_needed - in_regions_needed)) {
                        return -ENOMEM;
                }

                /* The lock was dropped meanwhile; recount from scratch. */
                goto retry;
        }

        /* Second pass with regions_needed == NULL really modifies the map. */
        add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);

        /* This add is done: give back the slots region_chg accounted for. */
        resv->adds_in_progress -= in_regions_needed;

        spin_unlock(&resv->lock);
        return add;
}

/*
 * Examine the existing reserve map and determine how many huge pages in
 * the range [f, t) are NOT currently represented.  This is the first half
 * of a two step operation: a later region_add() actually inserts the range,
 * or region_abort() cancels it.  region_chg itself does not change the
 * number of huge pages represented by the map; it only makes sure the
 * region cache holds enough spare file_region entries (at least one) for
 * the follow up region_add().
 *
 * *out_regions_needed is set to the number of entries added to
 * resv->adds_in_progress.  The same value must be passed to the follow up
 * region_add() or region_abort() call for proper accounting.
 *
 * Returns the number of huge pages that need to be added to the existing
 * reservation map for the range [f, t); this is greater than or equal to
 * zero.  Returns -ENOMEM if the needed cache entries can not be allocated;
 * in that case adds_in_progress is not incremented.
 */
static long region_chg(struct resv_map *resv, long f, long t,
                       long *out_regions_needed)
{
        long missing;

        spin_lock(&resv->lock);

        /* Counting pass only: the region list is not modified here. */
        missing = add_reservation_in_range(resv, f, t, NULL, NULL,
                                           out_regions_needed);

        /* Always reserve at least one cache entry for the later add. */
        if (!*out_regions_needed)
                *out_regions_needed = 1;

        /*
         * On failure the helper returns with resv->lock already dropped,
         * so there is nothing to unlock on this path.
         */
        if (allocate_file_region_entries(resv, *out_regions_needed))
                return -ENOMEM;

        resv->adds_in_progress += *out_regions_needed;

        spin_unlock(&resv->lock);

        return missing;
}

/*
 * Abort the in progress add operation.  The adds_in_progress field
 * of the resv_map keeps track of the operations in progress between
 * calls to region_chg and region_add.  Operations are sometimes
 * aborted after the call to region_chg.  In such cases, region_abort
 * is called to decrement the adds_in_progress counter. regions_needed
 * is the value provided by the region_chg call through its
 * out_regions_needed argument; it is used to decrement the
 * adds_in_progress counter.
 *
 * NOTE: The range arguments [f, t) are not needed or used in this
 * routine.  They are kept to make reading the calling code easier as
 * arguments will match the associated region_chg call.
 */
static void region_abort(struct resv_map *resv, long f, long t,
                         long regions_needed)
{
        spin_lock(&resv->lock);
        /* region_chg left at least one entry in the cache for this add. */
        VM_BUG_ON(!resv->region_cache_count);
        resv->adds_in_progress -= regions_needed;
        spin_unlock(&resv->lock);
}

/*
 * Delete the specified range [f, t) from the reserve map.  If the
 * t parameter is LONG_MAX, this indicates that ALL regions after f
 * should be deleted.  Locate the regions which intersect [f, t)
 * and either trim, delete or split the existing regions.
 *
 * Returns the number of huge pages deleted from the reserve map.
 * In the normal case, the return value is zero or more.  In the
 * case where a region must be split, a new region descriptor must
 * be allocated.  If the allocation fails, -ENOMEM will be returned.
 * NOTE: If the parameter t == LONG_MAX, then we will never split
 * a region and possibly return -ENOMEM.  Callers specifying
 * t == LONG_MAX do not need to check for -ENOMEM error.
 *
 * Takes and releases resv->lock internally; every return path leaves the
 * lock released.
 */
static long region_del(struct resv_map *resv, long f, long t)
{
        struct list_head *head = &resv->regions;
        struct file_region *rg, *trg;
        /* Spare descriptor for a split; survives across the retry below. */
        struct file_region *nrg = NULL;
        long del = 0;

retry:
        spin_lock(&resv->lock);
        list_for_each_entry_safe(rg, trg, head, link) {
                /*
                 * Skip regions before the range to be deleted.  file_region
                 * ranges are normally of the form [from, to).  However, there
                 * may be a "placeholder" entry in the map which is of the form
                 * (from, to) with from == to.  Check for placeholder entries
                 * at the beginning of the range to be deleted.
                 */
                if (rg->to <= f && (rg->to != rg->from || rg->to != f))
                        continue;

                /* Regions from here on start at or past t: nothing left. */
                if (rg->from >= t)
                        break;

                if (f > rg->from && t < rg->to) { /* Must split region */
                        /*
                         * Check for an entry in the cache before dropping
                         * lock and attempting allocation.
                         */
                        if (!nrg &&
                            resv->region_cache_count > resv->adds_in_progress) {
                                nrg = list_first_entry(&resv->region_cache,
                                                        struct file_region,
                                                        link);
                                list_del(&nrg->link);
                                resv->region_cache_count--;
                        }

                        /*
                         * No spare cache entry: allocate one with the lock
                         * dropped, then rescan the list from the start since
                         * it may have changed in the meantime.
                         */
                        if (!nrg) {
                                spin_unlock(&resv->lock);
                                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                                if (!nrg)
                                        return -ENOMEM;
                                goto retry;
                        }

                        del += t - f;
                        hugetlb_cgroup_uncharge_file_region(
                                resv, rg, t - f, false);

                        /* New entry for end of split region */
                        nrg->from = t;
                        nrg->to = rg->to;

                        copy_hugetlb_cgroup_uncharge_info(nrg, rg);

                        INIT_LIST_HEAD(&nrg->link);

                        /* Original entry is trimmed */
                        rg->to = f;

                        list_add(&nrg->link, &rg->link);
                        /* Ownership moved to the list; skip the kfree below. */
                        nrg = NULL;
                        break;
                }

                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
                        /*
                         * NOTE(review): the final 'true' presumably tells the
                         * helper the whole region is going away - confirm
                         * against hugetlb_cgroup_uncharge_file_region().
                         */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - rg->from, true);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
                }

                if (f <= rg->from) {        /* Trim beginning of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            t - rg->from, false);

                        del += t - rg->from;
                        rg->from = t;
                } else {                /* Trim end of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
                                                            rg->to - f, false);

                        del += rg->to - f;
                        rg->to = f;
                }
        }

        spin_unlock(&resv->lock);
        /* Free a spare that was obtained but never needed; NULL is fine. */
        kfree(nrg);
        return del;
}

/*
 * A rare out of memory error was encountered which prevented removal of
 * the reserve map region for a page.  The huge page itself was freed and
 * removed from the page cache.  This routine re-takes one page from the
 * subpool of @inode and, if the subpool asks for it, one page from the
 * global reserve, so that the reserve map entry which could not be deleted
 * appears as a "reserved" entry instead of simply dangling with incorrect
 * counts.  A warning is printed when the counts could not be fixed up.
 */
void hugetlb_fix_reserve_counts(struct inode *inode)
{
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust = hugepage_subpool_get_pages(spool, 1);
        bool reserved;

        if (rsv_adjust > 0)
                /* A global reservation is needed as well; it may fail. */
                reserved = !hugetlb_acct_memory(hstate_inode(inode), 1);
        else
                /* Zero: the subpool covered it.  Negative: it failed. */
                reserved = (rsv_adjust == 0);

        if (reserved)
                return;

        pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}

/*
 * Count and return the number of huge pages in the reserve map that
 * intersect with the range [f, t).  Takes resv->lock for the duration of
 * the walk.
 */
static long region_count(struct resv_map *resv, long f, long t)
{
        struct file_region *rg;
        long overlap = 0;

        spin_lock(&resv->lock);

        /* Locate each segment we overlap with, and count that overlap. */
        list_for_each_entry(rg, &resv->regions, link) {
                long start, end;

                /* Ends at or before f: keep looking. */
                if (rg->to <= f)
                        continue;

                /* Starts at or after t: stop the walk. */
                if (rg->from >= t)
                        break;

                /* Clamp the region to [f, t). */
                start = rg->from > f ? rg->from : f;
                end = rg->to < t ? rg->to : t;

                overlap += end - start;
        }

        spin_unlock(&resv->lock);

        return overlap;
}

/*
 * Convert @address within @vma to the page offset within the mapping,
 * expressed in huge page units of hstate @h.
 */
static pgoff_t vma_hugecache_offset(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
{
        /* Huge pages between the start of the vma and the address. */
        unsigned long idx_in_vma =
                (address - vma->vm_start) >> huge_page_shift(h);
        /* vm_pgoff shifted down by the huge page order. */
        unsigned long vma_start_idx = vma->vm_pgoff >> huge_page_order(h);

        return idx_in_vma + vma_start_idx;
}

/**
 * vma_kernel_pagesize - Page size granularity for this VMA.
 * @vma: The user mapping.
 *
 * Folios in this VMA will be aligned to, and at least the size of the
 * number of bytes returned by this function.
 *
 * Return: The default size of the folios allocated when backing a VMA.
 */
unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
{
        if (vma->vm_ops && vma->vm_ops->pagesize)
                return vma->vm_ops->pagesize(vma);
        return PAGE_SIZE;
}
EXPORT_SYMBOL_GPL(vma_kernel_pagesize);

/*
 * Return the page size being used by the MMU to back a VMA. In the majority
 * of cases, the page size used by the kernel matches the MMU size. On
 * architectures where it differs, an architecture-specific 'strong'
 * version of this symbol is required.
 *
 * This default is declared __weak and simply forwards to
 * vma_kernel_pagesize().
 */
__weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
        return vma_kernel_pagesize(vma);
}

/*
 * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
 * bits of the reservation map pointer, which are always clear due to
 * alignment.
 */
#define HPAGE_RESV_OWNER    (1UL << 0)
#define HPAGE_RESV_UNMAPPED (1UL << 1)
#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

/*
 * These helpers are used to track how many pages are reserved for
 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 * is guaranteed to have their future faults succeed.
 *
 * With the exception of hugetlb_dup_vma_private() which is called at fork(),
 * the reserve counters are updated with the hugetlb_lock held. It is safe
 * to reset the VMA at fork() time as it is not in use yet and there is no
 * chance of the global counters getting corrupted as a result of the values.
 *
 * The private mapping reservation is represented in a subtly different
 * manner to a shared mapping.  A shared mapping has a region map associated
 * with the underlying file; this region map represents the backing file
 * pages which have ever had a reservation assigned, and this persists even
 * after the page is instantiated.  A private mapping has a region map
 * associated with the original mmap which is attached to all VMAs which
 * reference it; this region map represents those offsets which have consumed
 * a reservation, i.e. where pages have been instantiated.
 */
/* Return vma->vm_private_data as an integer so flag bits can be tested. */
static unsigned long get_vma_private_data(struct vm_area_struct *vma)
{
        return (unsigned long)vma->vm_private_data;
}

/* Store @value (pointer bits plus any flag bits) in vma->vm_private_data. */
static void set_vma_private_data(struct vm_area_struct *vma,
                                                        unsigned long value)
{
        vma->vm_private_data = (void *)value;
}

/*
 * Set the map-wide cgroup uncharge info of @resv_map from @h_cg and @h.
 * When either is NULL the three fields are reset to NULL/0 instead.
 * Compiles to nothing without CONFIG_CGROUP_HUGETLB.
 */
static void
resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
                                          struct hugetlb_cgroup *h_cg,
                                          struct hstate *h)
{
#ifdef CONFIG_CGROUP_HUGETLB
        if (h_cg && h) {
                resv_map->reservation_counter =
                        &h_cg->rsvd_hugepage[hstate_index(h)];
                resv_map->pages_per_hpage = pages_per_huge_page(h);
                resv_map->css = &h_cg->css;
        } else {
                resv_map->reservation_counter = NULL;
                resv_map->pages_per_hpage = 0;
                resv_map->css = NULL;
        }
#endif
}

/*
 * Allocate and initialize an empty reserve map.  The map starts with one
 * reference, no regions, no adds in progress and a region cache holding a
 * single spare file_region.
 *
 * Returns the new map, or NULL if either allocation fails.
 */
struct resv_map *resv_map_alloc(void)
{
        struct resv_map *map;
        struct file_region *spare;

        map = kmalloc(sizeof(*map), GFP_KERNEL);
        spare = kmalloc(sizeof(*spare), GFP_KERNEL);
        if (!map || !spare)
                goto out_free;

        kref_init(&map->refs);
        spin_lock_init(&map->lock);
        init_rwsem(&map->rw_sema);
        INIT_LIST_HEAD(&map->regions);

        map->adds_in_progress = 0;

        /*
         * Initialize these to 0. On shared mappings, 0's here indicate these
         * fields don't do cgroup accounting. On private mappings, these will be
         * re-initialized to the proper values, to indicate that hugetlb cgroup
         * reservations are to be un-charged from here.
         */
        resv_map_set_hugetlb_cgroup_uncharge_info(map, NULL, NULL);

        /* Seed the cache with the one entry allocated above. */
        INIT_LIST_HEAD(&map->region_cache);
        list_add(&spare->link, &map->region_cache);
        map->region_cache_count = 1;

        return map;

out_free:
        /* kfree(NULL) is a no-op, so whichever allocation worked is freed. */
        kfree(map);
        kfree(spare);
        return NULL;
}

void resv_map_release(struct kref *ref)
{
        struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
        struct list_head *head = &resv_map->region_cache;
        struct file_region *rg, *trg;

        /* Clear out any active regions before we release the map. */
        region_del(resv_map, 0, LONG_MAX);

        /* ... and any entries left in the cache */
        list_for_each_entry_safe(rg, trg, head, link) {
                list_del(&rg->link);
                kfree(rg);
        }

        VM_BUG_ON(resv_map->adds_in_progress);

        kfree(resv_map);
}

static inline struct resv_map *inode_resv_map(struct inode *inode)
{
        /*
         * At inode evict time, i_mapping may not point to the original
         * address space within the inode.  This original address space
         * contains the pointer to the resv_map.  So, always use the
         * address space embedded within the inode.
         * The VERY common case is inode->mapping == &inode->i_data but,
         * this may not be true for device special inodes.
         */
        return (struct resv_map *)(&inode->i_data)->i_private_data;
}

/*
 * Look up the reservation map that applies to a hugetlb @vma.
 *
 * VM_MAYSHARE mappings use the map attached to the backing inode; all other
 * mappings keep the map pointer in the vma private data, with the
 * HPAGE_RESV_MASK flag bits masked off.
 */
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
{
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

        if (!(vma->vm_flags & VM_MAYSHARE)) {
                unsigned long priv = get_vma_private_data(vma);

                return (struct resv_map *)(priv & ~HPAGE_RESV_MASK);
        }

        return inode_resv_map(vma->vm_file->f_mapping->host);
}

/*
 * Store @map as the private data of a hugetlb @vma.  Only valid for
 * mappings without VM_MAYSHARE (asserted).
 */
static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
{
        unsigned long raw = (unsigned long)map;

        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
        VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

        set_vma_private_data(vma, raw);
}

/*
 * OR @flags into the private data word of a hugetlb @vma.  Only valid for
 * mappings without VM_MAYSHARE (asserted).
 */
static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
        unsigned long priv;

        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
        VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);

        priv = get_vma_private_data(vma);
        priv |= flags;
        set_vma_private_data(vma, priv);
}

/*
 * Return 1 if any bit of @flag is set in the private data word of the
 * hugetlb @vma, 0 otherwise.
 */
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
{
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

        return !!(get_vma_private_data(vma) & flag);
}

bool __vma_private_lock(struct vm_area_struct *vma)
{
        return !(vma->vm_flags & VM_MAYSHARE) &&
                get_vma_private_data(vma) & ~HPAGE_RESV_MASK &&
                is_vma_resv_set(vma, HPAGE_RESV_OWNER);
}

/*
 * Reset vm_private_data of a hugetlb @vma.
 *
 * - MAP_PRIVATE style mappings (no VM_MAYSHARE): the private data is the
 *   reserve map, which does not apply to children, so it is always
 *   cleared.  Faults generated by the children are not guaranteed to
 *   succeed, even if read-only.
 * - VM_MAYSHARE mappings: the private data is a per-vma semaphore that may
 *   be allocated in a subsequent call to hugetlb_vm_op_open.  It is only
 *   cleared when the structure does not belong to this vma; clearing a
 *   pointer that is associated with this vma would leak the structure.
 *   That situation arises when called via clear_vma_resv_huge_pages()
 *   after hugetlb_vm_op_open has already allocated a new structure.
 */
void hugetlb_dup_vma_private(struct vm_area_struct *vma)
{
        struct hugetlb_vma_lock *lock;

        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);

        if (!(vma->vm_flags & VM_MAYSHARE)) {
                vma->vm_private_data = NULL;
                return;
        }

        lock = vma->vm_private_data;
        if (lock && lock->vma != vma)
                vma->vm_private_data = NULL;
}

/*
 * Reset and decrement one ref on hugepage private reservation.
 * Called with mm->mmap_lock writer semaphore held.
 * This function should be only used by move_vma() and operate on
 * same sized vma. It should never come here with last ref on the
 * reservation.
 */
void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
        /*
         * Clear the old hugetlb private page reservation.
         * It has already been transferred to new_vma.
         *
         * During a mremap() operation of a hugetlb vma we call move_vma()
         * which copies vma into new_vma and unmaps vma. After the copy
         * operation both new_vma and vma share a reference to the resv_map
         * struct, and at that point vma is about to be unmapped. We don't
         * want to return the reservation to the pool at unmap of vma because
         * the reservation still lives on in new_vma, so simply decrement the
         * ref here and remove the resv_map reference from this vma.
         */
        struct resv_map *reservations = vma_resv_map(vma);

        if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                resv_map_put_hugetlb_cgroup_uncharge_info(reservations);
                kref_put(&reservations->refs, resv_map_release);
        }

        hugetlb_dup_vma_private(vma);
}

/*
 * Put @folio on the free list of its node in @h and account it as free.
 * Caller holds hugetlb_lock; the folio must have a zero reference count.
 */
static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio)
{
        const int node = folio_nid(folio);
        struct list_head *freelist = &h->hugepage_freelists[node];

        lockdep_assert_held(&hugetlb_lock);
        VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);

        list_move(&folio->lru, freelist);
        h->free_huge_pages_node[node]++;
        h->free_huge_pages++;
        folio_set_hugetlb_freed(folio);
}

/*
 * Take the first usable folio off the free list of node @nid in @h.
 *
 * A free folio is skipped when the current task has PF_MEMALLOC_PIN set and
 * the folio is not long-term pinnable, when it is hwpoisoned, or when its
 * page is migrate-isolated.  The chosen folio is moved to the active list,
 * its refcount is unfrozen to 1 and the free counters are decremented.
 *
 * Caller holds hugetlb_lock.  Returns NULL if no folio on the node qualifies.
 */
static struct folio *dequeue_hugetlb_folio_node_exact(struct hstate *h,
                                                                int nid)
{
        const bool pinned_alloc = (current->flags & PF_MEMALLOC_PIN) != 0;
        struct folio *candidate;

        lockdep_assert_held(&hugetlb_lock);

        list_for_each_entry(candidate, &h->hugepage_freelists[nid], lru) {
                if ((pinned_alloc && !folio_is_longterm_pinnable(candidate)) ||
                    folio_test_hwpoison(candidate) ||
                    is_migrate_isolate_page(&candidate->page))
                        continue;

                list_move(&candidate->lru, &h->hugepage_activelist);
                folio_ref_unfreeze(candidate, 1);
                folio_clear_hugetlb_freed(candidate);
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
                return candidate;
        }

        return NULL;
}

/*
 * Dequeue a free folio of @h, walking the zonelist of node @nid restricted
 * to @nmask and to zones the cpuset allows for @gfp_mask.
 *
 * The whole walk is repeated if read_mems_allowed_retry() reports that the
 * allowed memory nodes changed while walking.  Returns NULL if no node
 * yields a folio.
 */
static struct folio *dequeue_hugetlb_folio_nodemask(struct hstate *h, gfp_t gfp_mask,
                                                        int nid, nodemask_t *nmask)
{
        unsigned int cookie;
        struct zonelist *zonelist;
        struct zoneref *zref;
        struct zone *zone;
        int last_nid = NUMA_NO_NODE;

        /*
         * 'nid' should not be NUMA_NO_NODE.  Catch any such misuse and
         * rectify it by falling back to the local node.
         */
        if (nid == NUMA_NO_NODE)
                nid = numa_node_id();

        zonelist = node_zonelist(nid, gfp_mask);

        do {
                cookie = read_mems_allowed_begin();

                for_each_zone_zonelist_nodemask(zone, zref, zonelist,
                                                gfp_zone(gfp_mask), nmask) {
                        struct folio *folio;
                        int zone_nid;

                        if (!cpuset_zone_allowed(zone, gfp_mask))
                                continue;

                        /*
                         * The pool is node aware rather than zone aware,
                         * so do not ask the same node twice in a row.
                         */
                        zone_nid = zone_to_nid(zone);
                        if (zone_nid == last_nid)
                                continue;
                        last_nid = zone_nid;

                        folio = dequeue_hugetlb_folio_node_exact(h, last_nid);
                        if (folio)
                                return folio;
                }
        } while (unlikely(read_mems_allowed_retry(cookie)));

        return NULL;
}

/* Number of free huge pages in @h minus the number that are reserved. */
static unsigned long available_huge_pages(struct hstate *h)
{
        unsigned long nr_free = h->free_huge_pages;
        unsigned long nr_resv = h->resv_huge_pages;

        return nr_free - nr_resv;
}

/*
 * Dequeue a free folio of @h for a fault at @address in @vma, honouring the
 * memory policy reported by huge_node().
 *
 * A non-zero @gbl_chg means the allocation needs a page that was not
 * reserved beforehand; in that case the call fails with NULL unless
 * available_huge_pages() is non-zero.
 */
static struct folio *dequeue_hugetlb_folio_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, long gbl_chg)
{
        struct folio *folio = NULL;
        struct mempolicy *policy;
        nodemask_t *allowed;
        gfp_t gfp;
        int preferred_nid;

        /* An unreserved allocation needs at least one available page. */
        if (gbl_chg && !available_huge_pages(h))
                return NULL;

        gfp = htlb_alloc_mask(h);
        preferred_nid = huge_node(vma, address, gfp, &policy, &allowed);

        if (mpol_is_preferred_many(policy)) {
                folio = dequeue_hugetlb_folio_nodemask(h, gfp,
                                                        preferred_nid, allowed);

                /* If that came back empty, fall back to all nodes below. */
                allowed = NULL;
        }

        if (!folio)
                folio = dequeue_hugetlb_folio_nodemask(h, gfp,
                                                        preferred_nid, allowed);

        mpol_cond_put(policy);
        return folio;
}

/*
 * Shared helper for hstate_next_node_to_{alloc|free}.
 *
 * A previous allocation or free may have been done against a different
 * nodes_allowed mask, so h->next_node_to_{alloc|free} can lie outside
 * *nodes_allowed.  Return the node that follows @nid within the mask so
 * that alloc and free always operate on an allowed node.
 */
static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
{
        int next = next_node_in(nid, *nodes_allowed);

        VM_BUG_ON(next >= MAX_NUMNODES);

        return next;
}

/*
 * Return @nid if it is set in *@nodes_allowed, otherwise the next allowed
 * node after it.
 */
static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
{
        if (node_isset(nid, *nodes_allowed))
                return nid;

        return next_node_allowed(nid, nodes_allowed);
}

/*
 * Return the saved node ["this node"] from which a persistent huge page
 * should be allocated for the pool, and advance *@next_node to the node to
 * use next time, wrapping at the end of the node mask.
 */
static int hstate_next_node_to_alloc(int *next_node,
                                        nodemask_t *nodes_allowed)
{
        int this_node;

        VM_BUG_ON(!nodes_allowed);

        this_node = get_valid_node_allowed(*next_node, nodes_allowed);
        *next_node = next_node_allowed(this_node, nodes_allowed);

        return this_node;
}

/*
 * Helper for remove_pool_hugetlb_folio(): return the saved node ["this
 * node"] from which a huge page should be freed.  The saved node id in @h
 * is advanced unconditionally, whether or not a free huge page is later
 * found there, so the next attempt addresses the next node.
 */
static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
{
        int this_node;

        VM_BUG_ON(!nodes_allowed);

        this_node = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
        h->next_nid_to_free = next_node_allowed(this_node, nodes_allowed);

        return this_node;
}

/*
 * Loop over allocation candidate nodes.
 *
 * The body runs at most nodes_weight(*mask) times, with @nr_nodes used as
 * the countdown variable.  Before each pass @node is assigned the result of
 * hstate_next_node_to_alloc(next_node, mask), which also advances
 * *@next_node.  The "|| 1" keeps the loop condition true when the returned
 * node id is 0.  @mask is evaluated more than once.
 */
#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask)                \
        for (nr_nodes = nodes_weight(*mask);                                \
                nr_nodes > 0 &&                                                \
                ((node = hstate_next_node_to_alloc(next_node, mask)) || 1);        \
                nr_nodes--)

/*
 * Loop over candidate nodes for freeing.
 *
 * Same shape as for_each_node_mask_to_alloc(), but @node comes from
 * hstate_next_node_to_free(hs, mask), which advances the saved free-node
 * id kept in the hstate @hs.
 */
#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)                \
        for (nr_nodes = nodes_weight(*mask);                                \
                nr_nodes > 0 &&                                                \
                ((node = hstate_next_node_to_free(hs, mask)) || 1);        \
                nr_nodes--)

#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
#ifdef CONFIG_CONTIG_ALLOC
/*
 * Allocate a gigantic folio for @h and return it with its refcount frozen.
 *
 * The hugetlb CMA allocator is tried first.  If it yields nothing and CMA
 * is not the exclusive source, folio_alloc_gigantic() is used.  Should the
 * refcount freeze fail, the folio is freed and the whole sequence is tried
 * one more time before giving up with NULL.
 */
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
{
        int order = huge_page_order(h);
        int attempt;

        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();

        /* One initial attempt plus a single retry. */
        for (attempt = 0; attempt < 2; attempt++) {
                struct folio *folio;

                folio = hugetlb_cma_alloc_folio(h, gfp_mask, nid, nodemask);
                if (!folio) {
                        if (hugetlb_cma_exclusive_alloc())
                                return NULL;

                        folio = folio_alloc_gigantic(order, gfp_mask, nid,
                                                     nodemask);
                        if (!folio)
                                return NULL;
                }

                if (folio_ref_freeze(folio, 1))
                        return folio;

                pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio));
                hugetlb_free_folio(folio);
        }

        return NULL;
}

#else /* !CONFIG_CONTIG_ALLOC */
/* Without CONFIG_CONTIG_ALLOC no gigantic folio can be allocated. */
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
{
        return NULL;
}
#endif /* CONFIG_CONTIG_ALLOC */

#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
/* Without CONFIG_ARCH_HAS_GIGANTIC_PAGE no gigantic folio can be allocated. */
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
{
        return NULL;
}
#endif

/*
 * Take a hugetlb folio off whatever list it is on and drop it from the
 * pool counters of @h.
 *
 * If the folio still has its vmemmap (it is not vmemmap optimized), the
 * hugetlb flag is cleared here so the folio appears as a plain compound
 * page.  Otherwise the flag stays set until vmemmap has been allocated.
 * When @adjust_surplus is true the surplus counters are decremented too.
 *
 * Must be called with hugetlb_lock held.
 */
static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
                                                        bool adjust_surplus)
{
        const int node = folio_nid(folio);

        VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio);
        VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);

        lockdep_assert_held(&hugetlb_lock);

        /* Nothing is done for gigantic pages without runtime support. */
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;

        list_del(&folio->lru);

        /* A folio marked freed is counted in the free totals; undo that. */
        if (folio_test_hugetlb_freed(folio)) {
                folio_clear_hugetlb_freed(folio);
                h->free_huge_pages--;
                h->free_huge_pages_node[node]--;
        }

        if (adjust_surplus) {
                h->surplus_huge_pages--;
                h->surplus_huge_pages_node[node]--;
        }

        /*
         * The hugetlb flag may only be cleared once vmemmap pages exist.
         * Otherwise someone (memory error handling) may try to write to
         * tail struct pages.
         */
        if (!folio_test_hugetlb_vmemmap_optimized(folio))
                __folio_clear_hugetlb(folio);

        h->nr_huge_pages--;
        h->nr_huge_pages_node[node]--;
}

/*
 * Put a vmemmap-optimized folio back into the pool of @h: bump the pool
 * counters (and the surplus counters when @adjust_surplus is true), mark
 * the folio as hugetlb again and enqueue it on the free list.
 *
 * Caller holds hugetlb_lock.
 */
static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
                             bool adjust_surplus)
{
        const int node = folio_nid(folio);

        VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio);

        lockdep_assert_held(&hugetlb_lock);

        INIT_LIST_HEAD(&folio->lru);
        h->nr_huge_pages++;
        h->nr_huge_pages_node[node]++;

        if (adjust_surplus) {
                h->surplus_huge_pages++;
                h->surplus_huge_pages_node[node]++;
        }

        __folio_set_hugetlb(folio);
        folio_change_private(folio, NULL);
        /*
         * folio_change_private(folio, NULL) just wiped the
         * hugetlb_vmemmap_optimized flag, so it has to be set again.
         */
        folio_set_hugetlb_vmemmap_optimized(folio);

        arch_clear_hugetlb_flags(folio);
        enqueue_hugetlb_folio(h, folio);
}

/*
 * Hand a folio that was already removed from the pool of @h back to the
 * low level allocator via hugetlb_free_folio().
 *
 * The folio is left untouched (not freed) in two cases: gigantic pages
 * without runtime support, and folios whose raw hwpoison information is
 * marked unreliable.  If the folio is vmemmap optimized and restoring its
 * vmemmap fails, it is put back into the pool as a surplus page instead of
 * being freed.
 *
 * This function acquires hugetlb_lock with spin_lock_irq() itself.
 */
static void __update_and_free_hugetlb_folio(struct hstate *h,
                                                struct folio *folio)
{
        /* Sampled up front: true means vmemmap has to be restored first. */
        bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);

        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;

        /*
         * If we don't know which subpages are hwpoisoned, we can't free
         * the hugepage, so it's leaked intentionally.
         */
        if (folio_test_hugetlb_raw_hwp_unreliable(folio))
                return;

        /*
         * If folio is not vmemmap optimized (!clear_flag), then the folio
         * is no longer identified as a hugetlb page.  hugetlb_vmemmap_restore_folio
         * can only be passed hugetlb pages and will BUG otherwise.
         */
        if (clear_flag && hugetlb_vmemmap_restore_folio(h, folio)) {
                spin_lock_irq(&hugetlb_lock);
                /*
                 * If we cannot allocate vmemmap pages, just refuse to free the
                 * page and put the page back on the hugetlb free list and treat
                 * as a surplus page.
                 */
                add_hugetlb_folio(h, folio, true);
                spin_unlock_irq(&hugetlb_lock);
                return;
        }

        /*
         * If vmemmap pages were allocated above, then we need to clear the
         * hugetlb flag under the hugetlb lock.
         */
        if (folio_test_hugetlb(folio)) {
                spin_lock_irq(&hugetlb_lock);
                __folio_clear_hugetlb(folio);
                spin_unlock_irq(&hugetlb_lock);
        }

        /*
         * Move PageHWPoison flag from head page to the raw error pages,
         * which makes any healthy subpages reusable.
         */
        if (unlikely(folio_test_hwpoison(folio)))
                folio_clear_hugetlb_hwpoison(folio);

        /* Give the folio a refcount of 1 again before freeing it. */
        folio_ref_unfreeze(folio, 1);

        hugetlb_free_folio(folio);
}

/*
 * As update_and_free_hugetlb_folio() can be called under any context, we
 * cannot use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
 * actual freeing to a workqueue to avoid using GFP_ATOMIC to allocate
 * the vmemmap pages.
 *
 * free_hpage_workfn() locklessly retrieves the linked list of pages to be
 * freed and frees them one-by-one. As the folio->mapping pointer is going
 * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
 * structure of a lockless linked list of huge pages to be freed.
 */
static LLIST_HEAD(hpage_freelist);

/*
 * Work function behind free_hpage_work: detach everything currently queued
 * on hpage_freelist and free each folio through
 * __update_and_free_hugetlb_folio(), rescheduling between folios.
 */
static void free_hpage_workfn(struct work_struct *work)
{
        struct llist_node *node;

        /* Take the whole pending list in one go. */
        node = llist_del_all(&hpage_freelist);

        while (node) {
                struct folio *folio;
                struct hstate *h;

                /* The list node is the folio's ->mapping field itself. */
                folio = container_of((struct address_space **)node,
                                     struct folio, mapping);
                /* Advance before ->mapping (the node storage) is overwritten. */
                node = node->next;
                folio->mapping = NULL;
                /*
                 * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in
                 * folio_hstate() is going to trigger because a previous call to
                 * remove_hugetlb_folio() will clear the hugetlb bit, so do
                 * not use folio_hstate() directly.
                 */
                h = size_to_hstate(folio_size(folio));

                __update_and_free_hugetlb_folio(h, folio);

                cond_resched();
        }
}
static DECLARE_WORK(free_hpage_work, free_hpage_workfn);

/*
 * Wait for the deferred free work to finish, but only when @h is vmemmap
 * optimizable; otherwise nothing is done.
 */
static inline void flush_free_hpage_work(struct hstate *h)
{
        if (!hugetlb_vmemmap_optimizable(h))
                return;

        flush_work(&free_hpage_work);
}

/*
 * Free @folio, which was already removed from the pool of @h.
 *
 * The free happens synchronously unless the folio is vmemmap optimized and
 * @atomic is true; in that case it is queued on hpage_freelist and handled
 * later by free_hpage_work.
 */
static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio,
                                 bool atomic)
{
        if (folio_test_hugetlb_vmemmap_optimized(folio) && atomic) {
                /*
                 * Defer the free so that vmemmap pages need not be
                 * allocated with GFP_ATOMIC.
                 *
                 * schedule_work() is only needed when hpage_freelist was
                 * empty before this add.  Otherwise it was already called
                 * and the work function has not picked up the list yet.
                 */
                if (llist_add((struct llist_node *)&folio->mapping,
                              &hpage_freelist))
                        schedule_work(&free_hpage_work);
                return;
        }

        __update_and_free_hugetlb_folio(h, folio);
}

/*
 * Recovery step used after a bulk vmemmap restore of @folio_list failed.
 *
 * @non_hvo_folios holds the folios whose vmemmap is already in place.  If
 * that list is non-empty, all of those folios are freed.  Otherwise each
 * folio on @folio_list is restored individually until one succeeds and is
 * freed; folios that fail are returned to the pool as surplus pages.
 * The caller is expected to retry the bulk operation afterwards.
 */
static void bulk_vmemmap_restore_error(struct hstate *h,
                                        struct list_head *folio_list,
                                        struct list_head *non_hvo_folios)
{
        struct folio *folio, *t_folio;

        if (!list_empty(non_hvo_folios)) {
                /*
                 * Free any restored hugetlb pages so that restore of the
                 * entire list can be retried.
                 * The idea is that in the common case of ENOMEM errors freeing
                 * hugetlb pages with vmemmap we will free up memory so that we
                 * can allocate vmemmap for more hugetlb pages.
                 */
                list_for_each_entry_safe(folio, t_folio, non_hvo_folios, lru) {
                        list_del(&folio->lru);
                        spin_lock_irq(&hugetlb_lock);
                        __folio_clear_hugetlb(folio);
                        spin_unlock_irq(&hugetlb_lock);
                        update_and_free_hugetlb_folio(h, folio, false);
                        cond_resched();
                }
        } else {
                /*
                 * In the case where there are no folios which can be
                 * immediately freed, we loop through the list trying to restore
                 * vmemmap individually in the hope that someone elsewhere may
                 * have done something to cause success (such as freeing some
                 * memory).  If unable to restore a hugetlb page, the hugetlb
                 * page is made a surplus page and removed from the list.
                 * If we are able to restore vmemmap and free one hugetlb page,
                 * we quit processing the list to retry the bulk operation.
                 */
                list_for_each_entry_safe(folio, t_folio, folio_list, lru)
                        if (hugetlb_vmemmap_restore_folio(h, folio)) {
                                /* Restore failed: back to the pool as surplus. */
                                list_del(&folio->lru);
                                spin_lock_irq(&hugetlb_lock);
                                add_hugetlb_folio(h, folio, true);
                                spin_unlock_irq(&hugetlb_lock);
                        } else {
                                /* Restore worked: free this one and stop. */
                                list_del(&folio->lru);
                                spin_lock_irq(&hugetlb_lock);
                                __folio_clear_hugetlb(folio);
                                spin_unlock_irq(&hugetlb_lock);
                                update_and_free_hugetlb_folio(h, folio, false);
                                cond_resched();
                                break;
                        }
        }
}

/*
 * Free every folio on @folio_list, all of which belong to @h.
 *
 * vmemmap is restored for the whole list first, with
 * bulk_vmemmap_restore_error() invoked and the restore retried for as long
 * as it reports an error.  The folios that end up on the local
 * non_hvo_folios list are then freed one by one.
 */
static void update_and_free_pages_bulk(struct hstate *h,
                                                struct list_head *folio_list)
{
        long ret;
        struct folio *folio, *t_folio;
        LIST_HEAD(non_hvo_folios);

        /*
         * First allocate required vmemmap (if necessary) for all folios.
         * Carefully handle errors and free up any available hugetlb pages
         * in an effort to make forward progress.
         */
retry:
        ret = hugetlb_vmemmap_restore_folios(h, folio_list, &non_hvo_folios);
        if (ret < 0) {
                bulk_vmemmap_restore_error(h, folio_list, &non_hvo_folios);
                goto retry;
        }

        /*
         * At this point, list should be empty, ret should be >= 0 and there
         * should only be pages on the non_hvo_folios list.
         * Do note that the non_hvo_folios list could be empty.
         * Without HVO enabled, ret will be 0 and there is no need to call
         * __folio_clear_hugetlb as this was done previously.
         */
        VM_WARN_ON(!list_empty(folio_list));
        VM_WARN_ON(ret < 0);
        if (!list_empty(&non_hvo_folios) && ret) {
                /* Clear the hugetlb flag on all folios in one lock cycle. */
                spin_lock_irq(&hugetlb_lock);
                list_for_each_entry(folio, &non_hvo_folios, lru)
                        __folio_clear_hugetlb(folio);
                spin_unlock_irq(&hugetlb_lock);
        }

        list_for_each_entry_safe(folio, t_folio, &non_hvo_folios, lru) {
                update_and_free_hugetlb_folio(h, folio, false);
                cond_resched();
        }
}

struct hstate *size_to_hstate(unsigned long size)
{
        struct hstate *h;

        for_each_hstate(h) {
                if (huge_page_size(h) == size)
                        return h;
        }
        return NULL;
}

/*
 * Release a hugetlb folio that has no remaining references or mappings
 * (both are asserted below).
 *
 * Subpool and cgroup charges are undone and a consumed reservation is
 * restored where required.  The folio is then either removed from the pool
 * and freed (temporary folios, or when its node currently has surplus
 * pages) or put back on the free list of its hstate.
 */
void free_huge_folio(struct folio *folio)
{
        /*
         * Can't pass hstate in here because it is called from the
         * generic mm code.
         */
        struct hstate *h = folio_hstate(folio);
        int nid = folio_nid(folio);
        struct hugepage_subpool *spool = hugetlb_folio_subpool(folio);
        bool restore_reserve;
        unsigned long flags;

        VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
        VM_BUG_ON_FOLIO(folio_mapcount(folio), folio);

        /* Detach the folio from its subpool and mapping. */
        hugetlb_set_folio_subpool(folio, NULL);
        if (folio_test_anon(folio))
                __ClearPageAnonExclusive(&folio->page);
        folio->mapping = NULL;
        /* Remember the flag before clearing it; it is acted upon below. */
        restore_reserve = folio_test_hugetlb_restore_reserve(folio);
        folio_clear_hugetlb_restore_reserve(folio);

        /*
         * If HPageRestoreReserve was set on page, page allocation consumed a
         * reservation.  If the page was associated with a subpool, there
         * would have been a page reserved in the subpool before allocation
         * via hugepage_subpool_get_pages().  Since we are 'restoring' the
         * reservation, do not call hugepage_subpool_put_pages() as this will
         * remove the reserved page from the subpool.
         */
        if (!restore_reserve) {
                /*
                 * A return code of zero implies that the subpool will be
                 * under its minimum size if the reservation is not restored
                 * after page is free.  Therefore, force restore_reserve
                 * operation.
                 */
                if (hugepage_subpool_put_pages(spool, 1) == 0)
                        restore_reserve = true;
        }

        /* Everything from here to the unlock runs under hugetlb_lock. */
        spin_lock_irqsave(&hugetlb_lock, flags);
        folio_clear_hugetlb_migratable(folio);
        hugetlb_cgroup_uncharge_folio(hstate_index(h),
                                     pages_per_huge_page(h), folio);
        hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
                                          pages_per_huge_page(h), folio);
        lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h));
        mem_cgroup_uncharge(folio);
        if (restore_reserve)
                h->resv_huge_pages++;

        if (folio_test_hugetlb_temporary(folio)) {
                /* Temporary folio: remove it without touching surplus counts. */
                remove_hugetlb_folio(h, folio, false);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_hugetlb_folio(h, folio, true);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                remove_hugetlb_folio(h, folio, true);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
                update_and_free_hugetlb_folio(h, folio, true);
        } else {
                /* Keep the folio in the pool: back onto the free list. */
                arch_clear_hugetlb_flags(folio);
                enqueue_hugetlb_folio(h, folio);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
        }
}

/*
 * Count one additional huge page on node @nid in the pool totals of @h.
 * Must be called with the hugetlb lock held.
 */
static void __prep_account_new_huge_page(struct hstate *h, int nid)
{
        lockdep_assert_held(&hugetlb_lock);

        h->nr_huge_pages_node[nid] += 1;
        h->nr_huge_pages += 1;
}

/*
 * Give a freshly allocated folio its initial hugetlb state: set the hugetlb
 * flag, initialise the lru list head, and clear the subpool and both
 * hugetlb cgroup pointers.  @h is not used by this function.
 */
static void init_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
        /* NOTE(review): presumably must precede the setters below - confirm. */
        __folio_set_hugetlb(folio);
        INIT_LIST_HEAD(&folio->lru);
        hugetlb_set_folio_subpool(folio, NULL);
        set_hugetlb_cgroup(folio, NULL);
        set_hugetlb_cgroup_rsvd(folio, NULL);
}

/*
 * Initialise @folio as a new hugetlb folio, then pass it to
 * hugetlb_vmemmap_optimize_folio().  No pool accounting is done here.
 */
static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio)
{
        init_new_hugetlb_folio(h, folio);
        hugetlb_vmemmap_optimize_folio(h, folio);
}

/*
 * Prepare @folio as a new hugetlb folio and account it on node @nid in @h.
 * hugetlb_lock is taken here, and only around the accounting step.
 */
static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid)
{
        __prep_new_hugetlb_folio(h, folio);
        spin_lock_irq(&hugetlb_lock);
        __prep_account_new_huge_page(h, nid);
        spin_unlock_irq(&hugetlb_lock);
}

/*
 * Find the address space (mapping) of @folio and lock it in write mode.
 *
 * The folio is locked on entry, so folio_mapping() is stable.  Because of
 * lock ordering only a write trylock can be attempted.
 *
 * Returns the locked mapping, or NULL when the folio has no mapping or the
 * trylock did not succeed.
 */
struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio)
{
        struct address_space *mapping = folio_mapping(folio);

        if (mapping && !i_mmap_trylock_write(mapping))
                return NULL;

        return mapping;
}

/*
 * Allocate one folio of huge_page_order(@h) from the buddy allocator and
 * return it with its reference count frozen.
 *
 * @h:                  hstate that supplies the allocation order
 * @gfp_mask:           base GFP flags; __GFP_RETRY_MAYFAIL is added when
 *                      trying hard
 * @nid:                node to allocate from; NUMA_NO_NODE selects
 *                      numa_mem_id()
 * @nmask:              node mask handed to __folio_alloc()
 * @node_alloc_noretry: optional bitmap of nodes on which a previous
 *                      try-hard allocation failed; may be NULL
 *
 * Returns the folio, or NULL on failure.  HTLB_BUDDY_PGALLOC or
 * HTLB_BUDDY_PGALLOC_FAIL is counted accordingly.
 */
static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
                nodemask_t *node_alloc_noretry)
{
        int order = huge_page_order(h);
        struct folio *folio;
        bool alloc_try_hard = true;
        bool retry = true;

        /*
         * Resolve NUMA_NO_NODE before the node id is used as a bitmap
         * index.  The noretry bitmap is set and cleared below using the
         * resolved id, so it must be tested with the same id; testing it
         * with NUMA_NO_NODE would index outside the bitmap.
         */
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();

        /*
         * By default we always try hard to allocate the folio with
         * __GFP_RETRY_MAYFAIL flag.  However, if we are allocating folios in
         * a loop (to adjust global huge page counts) and previous allocation
         * failed, do not continue to try hard on the same node.  Use the
         * node_alloc_noretry bitmap to manage this state information.
         */
        if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
                alloc_try_hard = false;
        if (alloc_try_hard)
                gfp_mask |= __GFP_RETRY_MAYFAIL;
retry:
        folio = __folio_alloc(gfp_mask, order, nid, nmask);
        /* Ensure hugetlb folio won't have large_rmappable flag set. */
        if (folio)
                folio_clear_large_rmappable(folio);

        /* Someone else holds a reference: drop ours and maybe try again. */
        if (folio && !folio_ref_freeze(folio, 1)) {
                folio_put(folio);
                if (retry) {        /* retry once */
                        retry = false;
                        goto retry;
                }
                /* WOW!  twice in a row. */
                pr_warn("HugeTLB unexpected inflated folio ref count\n");
                folio = NULL;
        }

        /*
         * If we did not specify __GFP_RETRY_MAYFAIL, but still got a
         * folio this indicates an overall state change.  Clear bit so
         * that we resume normal 'try hard' allocations.
         */
        if (node_alloc_noretry && folio && !alloc_try_hard)
                node_clear(nid, *node_alloc_noretry);

        /*
         * If we tried hard to get a folio but failed, set bit so that
         * subsequent attempts will not try as hard until there is an
         * overall state change.
         */
        if (node_alloc_noretry && !folio && alloc_try_hard)
                node_set(nid, *node_alloc_noretry);

        if (!folio) {
                __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
                return NULL;
        }

        __count_vm_event(HTLB_BUDDY_PGALLOC);
        return folio;
}

/*
 * Allocate a new folio for @h, using the gigantic allocator for gigantic
 * hstates and the buddy allocator otherwise, and give it its initial
 * hugetlb state.  No vmemmap optimization or pool accounting is done here.
 *
 * Returns NULL if the allocation fails.
 */
static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
                nodemask_t *node_alloc_noretry)
{
        struct folio *folio;

        folio = hstate_is_gigantic(h) ?
                alloc_gigantic_folio(h, gfp_mask, nid, nmask) :
                alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask,
                                          node_alloc_noretry);
        if (!folio)
                return NULL;

        init_new_hugetlb_folio(h, folio);
        return folio;
}

/*
 * Common helper for allocating a fresh hugetlb page.  Every specific
 * allocator should obtain new hugetlb pages through this function.
 *
 * The folio is prepared and accounted on the node it was actually
 * allocated on.  Note that the returned page is 'frozen': the ref count of
 * the head page and all tail pages is zero.
 */
static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
        struct folio *folio;

        folio = hstate_is_gigantic(h) ?
                alloc_gigantic_folio(h, gfp_mask, nid, nmask) :
                alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);

        if (folio)
                prep_new_hugetlb_folio(h, folio, folio_nid(folio));

        return folio;
}

/*
 * Run bulk vmemmap optimization over @folio_list, then account every folio
 * on it and move it to the free lists of @h.
 */
static void prep_and_add_allocated_folios(struct hstate *h,
                                        struct list_head *folio_list)
{
        struct folio *cur, *next;
        unsigned long irq_flags;

        /* Hand the whole list to the vmemmap optimizer in one call. */
        hugetlb_vmemmap_optimize_folios(h, folio_list);

        /* A single lock cycle covers adding all new pool pages. */
        spin_lock_irqsave(&hugetlb_lock, irq_flags);
        list_for_each_entry_safe(cur, next, folio_list, lru) {
                __prep_account_new_huge_page(h, folio_nid(cur));
                enqueue_hugetlb_folio(h, cur);
        }
        spin_unlock_irqrestore(&hugetlb_lock, irq_flags);
}

/*
 * Allocate a fresh hugetlb page in a node interleaved manner: each allowed
 * node is tried in turn, with __GFP_THISNODE, until one allocation
 * succeeds.  The page is not added to a pool here; that happens later.
 *
 * Returns NULL if no allowed node could provide a page.
 */
static struct folio *alloc_pool_huge_folio(struct hstate *h,
                                        nodemask_t *nodes_allowed,
                                        nodemask_t *node_alloc_noretry,
                                        int *next_node)
{
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        struct folio *folio = NULL;
        int nr_nodes, node;

        for_each_node_mask_to_alloc(next_node, nr_nodes, node, nodes_allowed) {
                folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, node,
                                        nodes_allowed, node_alloc_noretry);
                if (folio)
                        break;
        }

        return folio;
}

/*
 * Remove huge page from pool from next node to free.  Attempt to keep
 * persistent huge pages more or less balanced over allowed nodes.
 * This routine only 'removes' the hugetlb page.  The caller must make
 * an additional call to free the page to low level allocators.
 * Called with hugetlb_lock locked.
 *
 * Returns the removed folio, or NULL if no eligible node had a free folio.
 */
static struct folio *remove_pool_hugetlb_folio(struct hstate *h,
                nodemask_t *nodes_allowed, bool acct_surplus)
{
        struct folio *folio = NULL;
        int nr_nodes, node;

        lockdep_assert_held(&hugetlb_lock);
        for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                struct list_head *freelist;

                /*
                 * If we're returning unused surplus pages, only examine
                 * nodes with surplus pages.
                 */
                if (acct_surplus && !h->surplus_huge_pages_node[node])
                        continue;

                freelist = &h->hugepage_freelists[node];
                if (list_empty(freelist))
                        continue;

                /* Take the folio at the head of this node's free list */
                folio = list_entry(freelist->next, struct folio, lru);
                remove_hugetlb_folio(h, folio, acct_surplus);
                break;
        }

        return folio;
}

/*
 * Dissolve a given free hugetlb folio into free buddy pages. This function
 * does nothing for in-use hugetlb folios and non-hugetlb folios.
 * This function returns values like below:
 *
 *  -ENOMEM: failed to allocate vmemmap pages to free the freed hugepages
 *           when the system is under memory pressure and the feature of
 *           freeing unused vmemmap pages associated with each hugetlb page
 *           is enabled.
 *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
 *           (allocated or reserved.)
 *       0:  successfully dissolved free hugepages or the page is not a
 *           hugepage (considered as already dissolved)
 *
 * hugetlb_lock is taken and released internally; it is not held on return.
 */
int dissolve_free_hugetlb_folio(struct folio *folio)
{
        int rc = -EBUSY;

retry:
        /* Not to disrupt normal path by vainly holding hugetlb_lock */
        if (!folio_test_hugetlb(folio))
                return 0;

        spin_lock_irq(&hugetlb_lock);
        /* Re-check under the lock: the unlocked test above can be stale */
        if (!folio_test_hugetlb(folio)) {
                rc = 0;
                goto out;
        }

        /* Only folios with a zero ref count are candidates for dissolving */
        if (!folio_ref_count(folio)) {
                struct hstate *h = folio_hstate(folio);
                bool adjust_surplus = false;

                /* Nothing available in this hstate: bail out, rc is -EBUSY */
                if (!available_huge_pages(h))
                        goto out;

                /*
                 * We should make sure that the page is already on the free list
                 * when it is dissolved.
                 */
                if (unlikely(!folio_test_hugetlb_freed(folio))) {
                        spin_unlock_irq(&hugetlb_lock);
                        cond_resched();

                        /*
                         * Theoretically, we should return -EBUSY when we
                         * encounter this race. In fact, we have a chance
                         * to successfully dissolve the page if we do a
                         * retry. Because the race window is quite small.
                         * If we seize this opportunity, it is an optimization
                         * for increasing the success rate of dissolving page.
                         */
                        goto retry;
                }

                /*
                 * Remember whether the surplus counters were adjusted so the
                 * failure path below can undo the removal symmetrically.
                 */
                if (h->surplus_huge_pages_node[folio_nid(folio)])
                        adjust_surplus = true;
                remove_hugetlb_folio(h, folio, adjust_surplus);
                h->max_huge_pages--;
                spin_unlock_irq(&hugetlb_lock);

                /*
                 * Normally update_and_free_hugetlb_folio will allocate required vmemmap
                 * before freeing the page.  update_and_free_hugetlb_folio will fail to
                 * free the page if it can not allocate required vmemmap.  We
                 * need to adjust max_huge_pages if the page is not freed.
                 * Attempt to allocate vmemmap here so that we can take
                 * appropriate action on failure.
                 *
                 * The folio_test_hugetlb check here is because
                 * remove_hugetlb_folio will clear hugetlb folio flag for
                 * non-vmemmap optimized hugetlb folios.
                 */
                if (folio_test_hugetlb(folio)) {
                        rc = hugetlb_vmemmap_restore_folio(h, folio);
                        if (rc) {
                                /*
                                 * Restore failed: put the folio back and undo
                                 * the max_huge_pages decrement.  The lock
                                 * taken here is dropped at the 'out' label.
                                 */
                                spin_lock_irq(&hugetlb_lock);
                                add_hugetlb_folio(h, folio, adjust_surplus);
                                h->max_huge_pages++;
                                goto out;
                        }
                } else
                        rc = 0;

                /* Lock is not held here; return directly instead of via 'out' */
                update_and_free_hugetlb_folio(h, folio, false);
                return rc;
        }
out:
        spin_unlock_irq(&hugetlb_lock);
        return rc;
}

/*
 * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
 * make specified memory blocks removable from the system.
 * Note that this will dissolve a free gigantic hugepage completely, if any
 * part of it lies within the given range.
 * Also note that if dissolve_free_hugetlb_folio() returns with an error, all
 * free hugetlb folios that were dissolved before that error are lost.
 */
int dissolve_free_hugetlb_folios(unsigned long start_pfn, unsigned long end_pfn)
{
        unsigned long pfn;
        struct folio *folio;
        int rc = 0;
        unsigned int order;
        struct hstate *h;

        if (!hugepages_supported())
                return rc;

        order = huge_page_order(&default_hstate);
        for_each_hstate(h)
                order = min(order, huge_page_order(h));

        for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
                folio = pfn_folio(pfn);
                rc = dissolve_free_hugetlb_folio(folio);
                if (rc)
                        break;
        }

        return rc;
}

/*
 * Allocates a fresh surplus page from the page allocator.
 *
 * Gigantic hstates are refused outright.  The surplus limit
 * (nr_overcommit_huge_pages) is checked twice: once before allocating, and
 * once more after the allocation because hugetlb_lock is dropped in between.
 *
 * Returns the new folio, already accounted as surplus, or NULL.
 */
static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
                                gfp_t gfp_mask, int nid, nodemask_t *nmask)
{
        struct folio *folio;
        bool at_limit;

        if (hstate_is_gigantic(h))
                return NULL;

        /* Do not bother allocating if the surplus limit is already reached */
        spin_lock_irq(&hugetlb_lock);
        at_limit = h->surplus_huge_pages >= h->nr_overcommit_huge_pages;
        spin_unlock_irq(&hugetlb_lock);
        if (at_limit)
                return NULL;

        folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
        if (!folio)
                return NULL;

        hugetlb_vmemmap_optimize_folio(h, folio);

        spin_lock_irq(&hugetlb_lock);
        /*
         * nr_huge_pages needs to be adjusted within the same lock cycle
         * as surplus_pages, otherwise it might confuse
         * persistent_huge_pages() momentarily.
         */
        __prep_account_new_huge_page(h, folio_nid(folio));

        /*
         * We could have raced with the pool size change.
         * Double check that and simply deallocate the new page
         * if we would end up overcommitting the surpluses. Abuse
         * temporary page to workaround the nasty free_huge_folio
         * codeflow
         */
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                folio_set_hugetlb_temporary(folio);
                spin_unlock_irq(&hugetlb_lock);
                free_huge_folio(folio);
                return NULL;
        }

        /* Account the folio as surplus, globally and on its node */
        h->surplus_huge_pages++;
        h->surplus_huge_pages_node[folio_nid(folio)]++;
        spin_unlock_irq(&hugetlb_lock);

        return folio;
}

/*
 * Allocate a fresh, temporary hugetlb folio.  Gigantic hstates are not
 * supported here and yield NULL.  On success the folio is returned with a
 * ref count of one and flagged as temporary.
 */
static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
                                     int nid, nodemask_t *nmask)
{
        struct folio *folio = NULL;

        if (!hstate_is_gigantic(h))
                folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);

        if (folio) {
                /* fresh huge pages are frozen */
                folio_ref_unfreeze(folio, 1);
                /*
                 * We do not account these pages as surplus because they are
                 * only temporary and will be released properly on the last
                 * reference
                 */
                folio_set_hugetlb_temporary(folio);
        }

        return folio;
}

/*
 * Use the VMA's mpolicy to allocate a huge page from the buddy.
 *
 * For a "preferred many" policy a first attempt is made against the policy's
 * nodemask with __GFP_DIRECT_RECLAIM and __GFP_NOFAIL masked off; if that
 * yields nothing, a second attempt is made with the full gfp mask and no
 * nodemask restriction.  The mempolicy reference obtained from huge_node()
 * is dropped with mpol_cond_put() before returning.
 */
static
struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
                struct vm_area_struct *vma, unsigned long addr)
{
        gfp_t gfp_mask = htlb_alloc_mask(h);
        struct folio *folio = NULL;
        struct mempolicy *mpol;
        nodemask_t *nodemask;
        int nid;

        nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);

        if (mpol_is_preferred_many(mpol)) {
                gfp_t light_gfp = gfp_mask;

                light_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
                folio = alloc_surplus_hugetlb_folio(h, light_gfp, nid, nodemask);

                /* Fallback to all nodes if page==NULL */
                nodemask = NULL;
        }

        if (!folio)
                folio = alloc_surplus_hugetlb_folio(h, gfp_mask, nid, nodemask);

        mpol_cond_put(mpol);
        return folio;
}

/*
 * Dequeue a free hugetlb folio from @h and consume one global reservation
 * for it.
 *
 * @h:             hstate to allocate from
 * @preferred_nid: node to try first
 * @nmask:         allowed nodes, passed to dequeue_hugetlb_folio_nodemask()
 * @gfp_mask:      gfp flags, passed to dequeue_hugetlb_folio_nodemask()
 *
 * Returns the dequeued folio, or NULL when there is no outstanding
 * reservation to consume or no suitable free folio is available.
 */
struct folio *alloc_hugetlb_folio_reserve(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask)
{
        struct folio *folio = NULL;

        spin_lock_irq(&hugetlb_lock);
        /*
         * Check for a reservation before touching the free list.  Dequeueing
         * first and validating afterwards would either trip an assertion
         * with the folio already removed from the pool, or decrement
         * resv_huge_pages without a matching reservation.  Fail the
         * allocation gracefully instead.
         */
        if (h->resv_huge_pages) {
                folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
                                                       preferred_nid, nmask);
                if (folio)
                        h->resv_huge_pages--;
        }
        spin_unlock_irq(&hugetlb_lock);

        return folio;
}

/*
 * folio migration callback function
 *
 * First try to take an already free folio from the pool of @h.  If none can
 * be dequeued, fall back to allocating a temporary folio via
 * alloc_migrate_hugetlb_folio().  When @allow_alloc_fallback is false the
 * fallback allocation is pinned to @preferred_nid with __GFP_THISNODE.
 */
struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask, bool allow_alloc_fallback)
{
        struct folio *folio = NULL;

        spin_lock_irq(&hugetlb_lock);
        if (available_huge_pages(h))
                folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
                                                preferred_nid, nmask);
        spin_unlock_irq(&hugetlb_lock);

        if (folio)
                return folio;

        /* We cannot fallback to other nodes, as we could break the per-node pool. */
        if (!allow_alloc_fallback)
                gfp_mask |= __GFP_THISNODE;

        return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
}

/*
 * Return the nodemask of the current task's mempolicy when that policy is
 * MPOL_BIND and applies to the zone selected by @gfp, otherwise NULL.
 * Without CONFIG_NUMA this always returns NULL.
 */
static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
{
#ifdef CONFIG_NUMA
        struct mempolicy *mpol = get_task_policy(current);

        /*
         * Only enforce MPOL_BIND policy which overlaps with cpuset policy
         * (from policy_nodemask) specifically for hugetlb case
         */
        if (mpol->mode != MPOL_BIND)
                return NULL;
        if (!apply_policy_zone(mpol, gfp_zone(gfp)))
                return NULL;
        if (!cpuset_nodemask_valid_mems_allowed(&mpol->nodes))
                return NULL;

        return &mpol->nodes;
#else
        return NULL;
#endif
}

/*
 * Increase the hugetlb pool such that it can accommodate a reservation
 * of size 'delta'.
 *
 * Must be called with hugetlb_lock held.  The lock is dropped while surplus
 * folios are allocated and while leftovers are freed, and is held again when
 * the function returns.
 *
 * Returns 0 once the reservation has been committed (resv_huge_pages grown by
 * 'delta'), or -ENOMEM if not enough surplus folios could be allocated; in
 * that case everything allocated here is freed again.
 */
static int gather_surplus_pages(struct hstate *h, long delta)
        __must_hold(&hugetlb_lock)
{
        LIST_HEAD(surplus_list);
        struct folio *folio, *tmp;
        int ret;
        long i;
        long needed, allocated;
        bool alloc_ok = true;
        int node;
        nodemask_t *mbind_nodemask, alloc_nodemask;

        /*
         * Nodes we may allocate from: the cpuset's allowed nodes, further
         * restricted by the task's MPOL_BIND nodemask when there is one.
         */
        mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
        if (mbind_nodemask)
                nodes_and(alloc_nodemask, *mbind_nodemask, cpuset_current_mems_allowed);
        else
                alloc_nodemask = cpuset_current_mems_allowed;

        lockdep_assert_held(&hugetlb_lock);
        /* Fast path: the free pages already cover existing + new reservations */
        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
        if (needed <= 0) {
                h->resv_huge_pages += delta;
                return 0;
        }

        allocated = 0;

        ret = -ENOMEM;
retry:
        /* Allocation happens with hugetlb_lock dropped */
        spin_unlock_irq(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                folio = NULL;

                /* Prioritize current node */
                if (node_isset(numa_mem_id(), alloc_nodemask))
                        folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
                                        numa_mem_id(), NULL);

                /* Otherwise try every other allowed node in turn */
                if (!folio) {
                        for_each_node_mask(node, alloc_nodemask) {
                                if (node == numa_mem_id())
                                        continue;
                                folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
                                                node, NULL);
                                if (folio)
                                        break;
                        }
                }
                if (!folio) {
                        alloc_ok = false;
                        break;
                }
                list_add(&folio->lru, &surplus_list);
                cond_resched();
        }
        /* 'i' is the number of folios obtained in this pass */
        allocated += i;

        /*
         * After retaking hugetlb_lock, we need to recalculate 'needed'
         * because either resv_huge_pages or free_huge_pages may have changed.
         */
        spin_lock_irq(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) -
                        (h->free_huge_pages + allocated);
        if (needed > 0) {
                if (alloc_ok)
                        goto retry;
                /*
                 * We were not able to allocate enough pages to
                 * satisfy the entire reservation so we free what
                 * we've allocated so far.
                 */
                goto free;
        }
        /*
         * The surplus_list now contains _at_least_ the number of extra pages
         * needed to accommodate the reservation.  Add the appropriate number
         * of pages to the hugetlb pool and free the extras back to the buddy
         * allocator.  Commit the entire reservation here to prevent another
         * process from stealing the pages as they are added to the pool but
         * before they are reserved.
         */
        needed += allocated;
        h->resv_huge_pages += delta;
        ret = 0;

        /*
         * Free the needed pages to the hugetlb pool.
         * NOTE(review): presumably enqueue_hugetlb_folio() moves the folio
         * off surplus_list, so only the extras remain for the loop under
         * 'free' -- confirm against its definition.
         */
        list_for_each_entry_safe(folio, tmp, &surplus_list, lru) {
                if ((--needed) < 0)
                        break;
                /* Add the page to the hugetlb allocator */
                enqueue_hugetlb_folio(h, folio);
        }
free:
        spin_unlock_irq(&hugetlb_lock);

        /*
         * Free unnecessary surplus pages to the buddy allocator.
         * Pages have no ref count, call free_huge_folio directly.
         */
        list_for_each_entry_safe(folio, tmp, &surplus_list, lru)
                free_huge_folio(folio);
        /* Re-take the lock so the caller sees it held, as on entry */
        spin_lock_irq(&hugetlb_lock);

        return ret;
}

/*
 * This routine has two main purposes:
 * 1) Decrement the reservation count (resv_huge_pages) by the value passed
 *    in unused_resv_pages.  This corresponds to the prior adjustments made
 *    to the associated reservation map.
 * 2) Free any unused surplus pages that may have been allocated to satisfy
 *    the reservation.  As many as unused_resv_pages may be freed.
 */
static void return_unused_surplus_pages(struct hstate *h,
                                        unsigned long unused_resv_pages)
{
        unsigned long nr_pages;
        LIST_HEAD(page_list);

        lockdep_assert_held(&hugetlb_lock);
        /* Uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;

        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                goto out;

        /*
         * Part (or even all) of the reservation could have been backed
         * by pre-allocated pages. Only free surplus pages.
         */
        nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

        /*
         * We want to release as many surplus pages as possible, spread
         * evenly across all nodes with memory. Iterate across these nodes
         * until we can no longer free unreserved surplus pages. This occurs
         * when the nodes with surplus pages have no free pages.
         * remove_pool_hugetlb_folio() will balance the freed pages across the
         * on-line nodes with memory and will handle the hstate accounting.
         */
        while (nr_pages--) {
                struct folio *folio;

                folio = remove_pool_hugetlb_folio(h, &node_states[N_MEMORY], 1);
                if (!folio)
                        goto out;

                list_add(&folio->lru, &page_list);
        }

out:
        spin_unlock_irq(&hugetlb_lock);
        update_and_free_pages_bulk(h, &page_list);
        spin_lock_irq(&hugetlb_lock);
}


/*
 * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
 * are used by the huge page allocation routines to manage reservations.
 *
 * vma_needs_reservation is called to determine if the huge page at addr
 * within the vma has an associated reservation.  If a reservation is
 * needed, the value 1 is returned.  The caller is then responsible for
 * managing the global reservation and subpool usage counts.  After
 * the huge page has been allocated, vma_commit_reservation is called
 * to add the page to the reservation map.  If the page allocation fails,
 * the reservation must be ended instead of committed.  vma_end_reservation
 * is called in such cases.
 *
 * In the normal case, vma_commit_reservation returns the same value
 * as the preceding vma_needs_reservation call.  The only time this
 * is not the case is if a reserve map was changed between calls.  It
 * is the responsibility of the caller to notice the difference and
 * take appropriate action.
 *
 * vma_add_reservation is used in error paths where a reservation must
 * be restored when a newly allocated huge page must be freed.  It is
 * to be called after calling vma_needs_reservation to determine if a
 * reservation exists.
 *
 * vma_del_reservation is used in error paths where an entry in the reserve
 * map was created during huge page allocation and must be removed.  It is to
 * be called after calling vma_needs_reservation to determine if a reservation
 * exists.
 */
enum vma_resv_mode {
        /* Query via region_chg() whether a reservation is needed */
        VMA_NEEDS_RESV,
        /* Commit the pending change with region_add() */
        VMA_COMMIT_RESV,
        /* Drop the pending change with region_abort() */
        VMA_END_RESV,
        /*
         * Error path: region_add() for VM_MAYSHARE mappings,
         * region_abort() + region_del() otherwise
         */
        VMA_ADD_RESV,
        /*
         * Error path: region_abort() + region_del() for VM_MAYSHARE mappings,
         * region_add() otherwise
         */
        VMA_DEL_RESV,
};
static long __vma_reservation_common(struct hstate *h,
                                struct vm_area_struct *vma, unsigned long addr,
                                enum vma_resv_mode mode)
{
        struct resv_map *resv;
        pgoff_t idx;
        long ret;
        long dummy_out_regions_needed;

        resv = vma_resv_map(vma);
        if (!resv)
                return 1;

        idx = vma_hugecache_offset(h, vma, addr);
        switch (mode) {
        case VMA_NEEDS_RESV:
                ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
                /* We assume that vma_reservation_* routines always operate on
                 * 1 page, and that adding to resv map a 1 page entry can only
                 * ever require 1 region.
                 */
                VM_BUG_ON(dummy_out_regions_needed != 1);
                break;
        case VMA_COMMIT_RESV:
                ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
                /* region_add calls of range 1 should never fail. */
                VM_BUG_ON(ret < 0);
                break;
        case VMA_END_RESV:
                region_abort(resv, idx, idx + 1, 1);
                ret = 0;
                break;
        case VMA_ADD_RESV:
                if (vma->vm_flags & VM_MAYSHARE) {
                        ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
                        /* region_add calls of range 1 should never fail. */
                        VM_BUG_ON(ret < 0);
                } else {
                        region_abort(resv, idx, idx + 1, 1);
                        ret = region_del(resv, idx, idx + 1);
                }
                break;
        case VMA_DEL_RESV:
                if (vma->vm_flags & VM_MAYSHARE) {
                        region_abort(resv, idx, idx + 1, 1);
                        ret = region_del(resv, idx, idx + 1);
                } else {
                        ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
                        /* region_add calls of range 1 should never fail. */
                        VM_BUG_ON(ret < 0);
                }
                break;
        default:
                BUG();
        }

        if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV)
                return ret;
        /*
         * We know private mapping must have HPAGE_RESV_OWNER set.
         *
         * In most cases, reserves always exist for private mappings.
         * However, a file associated with mapping could have been
         * hole punched or truncated after reserves were consumed.
         * As subsequent fault on such a range will not use reserves.
         * Subtle - The reserve map for private mappings has the
         * opposite meaning than that of shared mappings.  If NO
         * entry is in the reserve map, it means a reservation exists.
         * If an entry exists in the reserve map, it means the
         * reservation has already been consumed.  As a result, the
         * return value of this routine is the opposite of the
         * value returned from reserve map manipulation routines above.
         */
        if (ret > 0)
                return 0;
        if (ret == 0)
                return 1;
        return ret;
}

/*
 * Determine whether the huge page at @addr within @vma needs a reservation.
 * Thin wrapper: runs __vma_reservation_common() in VMA_NEEDS_RESV mode and
 * returns its result unchanged.
 */
static long vma_needs_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
}

/*
 * Commit the reservation for the huge page at @addr within @vma.
 * Thin wrapper: runs __vma_reservation_common() in VMA_COMMIT_RESV mode and
 * returns its result unchanged.
 */
static long vma_commit_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
}

/*
 * End (rather than commit) the reservation operation for the huge page at
 * @addr within @vma.  Thin wrapper: runs __vma_reservation_common() in
 * VMA_END_RESV mode; the result is deliberately discarded.
 */
static void vma_end_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
}

/*
 * Error-path helper for the huge page at @addr within @vma.
 * Thin wrapper: runs __vma_reservation_common() in VMA_ADD_RESV mode and
 * returns its result unchanged.
 */
static long vma_add_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
}

/*
 * Error-path helper for the huge page at @addr within @vma.
 * Thin wrapper: runs __vma_reservation_common() in VMA_DEL_RESV mode and
 * returns its result unchanged.
 */
static long vma_del_reservation(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long addr)
{
        return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV);
}

/*
 * This routine is called to restore reservation information on error paths.
 * It should ONLY be called for folios allocated via alloc_hugetlb_folio(),
 * and the hugetlb mutex should remain held when calling this routine.
 *
 * It handles two specific cases:
 * 1) A reservation was in place and the folio consumed the reservation.
 *    hugetlb_restore_reserve is set in the folio.
 * 2) No reservation was in place for the page, so hugetlb_restore_reserve is
 *    not set.  However, alloc_hugetlb_folio always updates the reserve map.
 *
 * In case 1, free_huge_folio later in the error path will increment the
 * global reserve count.  But, free_huge_folio does not have enough context
 * to adjust the reservation map.  This case deals primarily with private
 * mappings.  Adjust the reserve map here to be consistent with global
 * reserve count adjustments to be made by free_huge_folio.  Make sure the
 * reserve map indicates there is a reservation present.
 *
 * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio.
 */
void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
                        unsigned long address, struct folio *folio)
{
        long rc = vma_needs_reservation(h, vma, address);

        /* Case 1: the folio consumed a reservation */
        if (folio_test_hugetlb_restore_reserve(folio)) {
                if (unlikely(rc < 0)) {
                        /*
                         * Rare out of memory condition in reserve map
                         * manipulation.  Clear hugetlb_restore_reserve so
                         * that global reserve count will not be incremented
                         * by free_huge_folio.  This will make it appear
                         * as though the reservation for this folio was
                         * consumed.  This may prevent the task from
                         * faulting in the folio at a later time.  This
                         * is better than inconsistent global huge page
                         * accounting of reserve counts.
                         */
                        folio_clear_hugetlb_restore_reserve(folio);
                        return;
                }

                if (rc)
                        (void)vma_add_reservation(h, vma, address);
                else
                        vma_end_reservation(h, vma, address);
                return;
        }

        /* Case 2: no reservation was consumed by the folio */
        if (rc > 0) {
                /* No reservation present, do nothing */
                vma_end_reservation(h, vma, address);
                return;
        }

        if (rc < 0) {
                /*
                 * Rare out of memory condition from
                 * vma_needs_reservation call.  Memory allocation is
                 * only attempted if a new entry is needed.  Therefore,
                 * this implies there is not an entry in the
                 * reserve map.
                 *
                 * For shared mappings, no entry in the map indicates
                 * no reservation.  We are done.
                 *
                 * For private mappings, no entry indicates
                 * a reservation is present.  Since we can
                 * not add an entry, set hugetlb_restore_reserve
                 * on the folio so reserve count will be
                 * incremented when freed.  This reserve will
                 * be consumed on a subsequent allocation.
                 */
                if (!(vma->vm_flags & VM_MAYSHARE))
                        folio_set_hugetlb_restore_reserve(folio);
                return;
        }

        /*
         * rc == 0: this indicates there is an entry in the reserve map
         * not added by alloc_hugetlb_folio.  We know it was added
         * before the alloc_hugetlb_folio call, otherwise
         * hugetlb_restore_reserve would be set on the folio.
         * Remove the entry so that a subsequent allocation
         * does not consume a reservation.
         */
        rc = vma_del_reservation(h, vma, address);
        if (rc < 0) {
                /*
                 * VERY rare out of memory condition.  Since
                 * we can not delete the entry, set
                 * hugetlb_restore_reserve so that the reserve
                 * count will be incremented when the folio
                 * is freed.  This reserve will be consumed
                 * on a subsequent allocation.
                 */
                folio_set_hugetlb_restore_reserve(folio);
        }
}

/*
 * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
 * the old one
 * @h: struct hstate old page belongs to
 * @old_folio: Old folio to dissolve
 * @list: List to isolate the page in case we need to
 *
 * The replacement folio is allocated on the same node as @old_folio
 * (__GFP_THISNODE).  hugetlb_lock is taken and released internally and is
 * not held on return.
 *
 * Returns 0 on success, otherwise negated error:
 *   -ENOMEM if the replacement folio could not be allocated,
 *   -EBUSY  if @old_folio is in use and could not be isolated onto @list.
 */
static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
                        struct folio *old_folio, struct list_head *list)
{
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        int nid = folio_nid(old_folio);
        struct folio *new_folio = NULL;
        int ret = 0;

retry:
        /* Every pass re-evaluates the state of old_folio under the lock */
        spin_lock_irq(&hugetlb_lock);
        if (!folio_test_hugetlb(old_folio)) {
                /*
                 * Freed from under us. Drop new_folio too.
                 */
                goto free_new;
        } else if (folio_ref_count(old_folio)) {
                bool isolated;

                /*
                 * Someone has grabbed the folio, try to isolate it here.
                 * Fail with -EBUSY if not possible.
                 */
                spin_unlock_irq(&hugetlb_lock);
                isolated = folio_isolate_hugetlb(old_folio, list);
                ret = isolated ? 0 : -EBUSY;
                /* Re-take the lock because 'free_new' unlocks it */
                spin_lock_irq(&hugetlb_lock);
                goto free_new;
        } else if (!folio_test_hugetlb_freed(old_folio)) {
                /*
                 * Folio's refcount is 0 but it has not been enqueued in the
                 * freelist yet. Race window is small, so we can succeed here if
                 * we retry.
                 */
                spin_unlock_irq(&hugetlb_lock);
                cond_resched();
                goto retry;
        } else {
                if (!new_folio) {
                        /*
                         * First time here: allocate the replacement with the
                         * lock dropped, then start over so that old_folio is
                         * re-validated before it is replaced.
                         */
                        spin_unlock_irq(&hugetlb_lock);
                        new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid,
                                                              NULL, NULL);
                        if (!new_folio)
                                return -ENOMEM;
                        __prep_new_hugetlb_folio(h, new_folio);
                        goto retry;
                }

                /*
                 * Ok, old_folio is still a genuine free hugepage. Remove it from
                 * the freelist and decrease the counters. These will be
                 * incremented again when calling __prep_account_new_huge_page()
                 * and enqueue_hugetlb_folio() for new_folio. The counters will
                 * remain stable since this happens under the lock.
                 */
                remove_hugetlb_folio(h, old_folio, false);

                /*
                 * Ref count on new_folio is already zero as it was dropped
                 * earlier.  It can be directly added to the pool free list.
                 */
                __prep_account_new_huge_page(h, nid);
                enqueue_hugetlb_folio(h, new_folio);

                /*
                 * Folio has been replaced, we can safely free the old one.
                 */
                spin_unlock_irq(&hugetlb_lock);
                update_and_free_hugetlb_folio(h, old_folio, false);
        }

        return ret;

free_new:
        /* Entered with hugetlb_lock held; release a now-unneeded replacement */
        spin_unlock_irq(&hugetlb_lock);
        if (new_folio)
                update_and_free_hugetlb_folio(h, new_folio, false);

        return ret;
}

/*
 * Deal with a hugetlb page that is in the way of the caller: isolate it onto
 * @list when it has a non-zero refcount, or hand it to
 * alloc_and_dissolve_hugetlb_folio() when its refcount is zero.
 *
 * Returns 0 on success (also when the page turns out not to be a hugetlb
 * page any more), -ENOMEM for gigantic hstates, -EBUSY when neither action
 * could be taken, or the result of alloc_and_dissolve_hugetlb_folio().
 */
int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
{
        struct folio *folio = page_folio(page);
        struct hstate *h;

        /*
         * The page may have been dissolved behind our back, so its hugetlb
         * state is only trusted while hugetlb_lock is held.  Losing that
         * race is reported as success, as if we had dissolved it ourselves.
         */
        spin_lock_irq(&hugetlb_lock);
        if (!folio_test_hugetlb(folio)) {
                spin_unlock_irq(&hugetlb_lock);
                return 0;
        }
        h = folio_hstate(folio);
        spin_unlock_irq(&hugetlb_lock);

        /*
         * Gigantic pages and alloc_contig_range depend on each other in a
         * cycle, so refuse them.  -ENOMEM makes the caller bail out at once
         * instead of retrying.
         */
        if (hstate_is_gigantic(h))
                return -ENOMEM;

        /* Referenced folio: try to isolate it onto the caller's list. */
        if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
                return 0;

        /* Unreferenced folio: replace it with a fresh one and free it. */
        if (!folio_ref_count(folio))
                return alloc_and_dissolve_hugetlb_folio(h, folio, list);

        return -EBUSY;
}

/*
 *  replace_free_hugepage_folios - Replace free hugepage folios in a given pfn
 *  range with new folios.
 *  @start_pfn: start pfn of the given pfn range
 *  @end_pfn: end pfn of the given pfn range
 *  Returns 0 on success, otherwise negated error.
 */
int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
{
        struct hstate *h;
        struct folio *folio;
        int ret = 0;

        LIST_HEAD(isolate_list);

        while (start_pfn < end_pfn) {
                folio = pfn_folio(start_pfn);
                if (folio_test_hugetlb(folio)) {
                        h = folio_hstate(folio);
                } else {
                        start_pfn++;
                        continue;
                }

                if (!folio_ref_count(folio)) {
                        ret = alloc_and_dissolve_hugetlb_folio(h, folio,
                                                               &isolate_list);
                        if (ret)
                                break;

                        putback_movable_pages(&isolate_list);
                }
                start_pfn++;
        }

        return ret;
}

/*
 * If anything is queued on hpage_freelist, wait for free_hpage_work to
 * finish; otherwise return immediately.
 */
void wait_for_freed_hugetlb_folios(void)
{
        if (!llist_empty(&hpage_freelist))
                flush_work(&free_hpage_work);
}

/*
 * Result of the per-vma reservation check performed at the start of
 * alloc_hugetlb_folio().
 */
typedef enum {
        /*
         * For either 0/1: we checked the per-vma resv map, and one resv
         * count can either be reused (0), or an extra one is needed (1).
         */
        MAP_CHG_REUSE = 0,
        MAP_CHG_NEEDED = 1,
        /*
         * The per-vma resv count cannot be used, hence a new resv
         * count is enforced.
         *
         * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
         * that currently vma_needs_reservation() has an unwanted side
         * effect: it requires either end() or commit() to complete the
         * transaction.  Hence it needs to be differentiated from NEEDED.
         */
        MAP_CHG_ENFORCED = 2,
} map_chg_state;

/*
 * alloc_hugetlb_folio - allocate a hugetlb folio for @addr in @vma.
 * @vma: mapping the folio is allocated for; selects the hstate and subpool
 * @addr: address within @vma the folio is allocated for
 * @cow_from_owner: bypass the per-vma reservation map (see NOTE below)
 *
 * Sequence: decide whether a per-vma reservation exists, take a subpool page
 * if it does not, charge the hugetlb cgroup(s), then dequeue a folio from the
 * pool or fall back to a fresh buddy allocation.  Finally the reservation is
 * committed and the folio is charged via mem_cgroup_charge_hugetlb().
 *
 * Returns the folio on success.  Returns ERR_PTR(-ENOMEM) when
 * vma_needs_reservation() fails or when mem_cgroup_charge_hugetlb() returns
 * -ENOMEM, and ERR_PTR(-ENOSPC) for every other failure (subpool, cgroup
 * charge, or no folio available).
 *
 * NOTE! "cow_from_owner" represents a very hacky usage only used in CoW
 * faults of hugetlb private mappings on top of a non-page-cache folio (in
 * which case even if there's a private vma resv map it won't cover such
 * allocation).  New call sites should (probably) never set it to true!!
 * When it's set, the allocation will bypass all vma level reservations.
 */
struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
                                    unsigned long addr, bool cow_from_owner)
{
        struct hugepage_subpool *spool = subpool_vma(vma);
        struct hstate *h = hstate_vma(vma);
        struct folio *folio;
        long retval, gbl_chg;
        map_chg_state map_chg;
        int ret, idx;
        struct hugetlb_cgroup *h_cg = NULL;
        gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;

        idx = hstate_index(h);

        /* Whether we need a separate per-vma reservation? */
        if (cow_from_owner) {
                /*
                 * Special case!  Since it's a CoW on top of a reserved
                 * page, the private resv map doesn't count.  So it cannot
                 * consume the per-vma resv map even if it's reserved.
                 */
                map_chg = MAP_CHG_ENFORCED;
        } else {
                /*
                 * Examine the region/reserve map to determine if the process
                 * has a reservation for the page to be allocated.  A return
                 * code of zero indicates a reservation exists (no change).
                 */
                retval = vma_needs_reservation(h, vma, addr);
                if (retval < 0)
                        return ERR_PTR(-ENOMEM);
                map_chg = retval ? MAP_CHG_NEEDED : MAP_CHG_REUSE;
        }

        /*
         * Whether we need a separate global reservation?
         *
         * Processes that did not create the mapping will have no
         * reserves as indicated by the region/reserve map. Check
         * that the allocation will not exceed the subpool limit.
         * Or if it can get one from the pool reservation directly.
         */
        if (map_chg) {
                gbl_chg = hugepage_subpool_get_pages(spool, 1);
                if (gbl_chg < 0)
                        goto out_end_reservation;
        } else {
                /*
                 * If we have the vma reservation ready, no need for extra
                 * global reservation.
                 */
                gbl_chg = 0;
        }

        /*
         * If this allocation is not consuming a per-vma reservation,
         * charge the hugetlb cgroup now.
         */
        if (map_chg) {
                ret = hugetlb_cgroup_charge_cgroup_rsvd(
                        idx, pages_per_huge_page(h), &h_cg);
                if (ret)
                        goto out_subpool_put;
        }

        /* The non-reservation cgroup charge is taken unconditionally. */
        ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
        if (ret)
                goto out_uncharge_cgroup_reservation;

        spin_lock_irq(&hugetlb_lock);
        /*
         * gbl_chg is passed to indicate whether or not a page must be taken
         * from the global free pool (global change).  gbl_chg == 0 indicates
         * a reservation exists for the allocation.
         */
        folio = dequeue_hugetlb_folio_vma(h, vma, addr, gbl_chg);
        if (!folio) {
                /* Nothing to dequeue: drop the lock for the buddy allocation. */
                spin_unlock_irq(&hugetlb_lock);
                folio = alloc_buddy_hugetlb_folio_with_mpol(h, vma, addr);
                if (!folio)
                        goto out_uncharge_cgroup;
                spin_lock_irq(&hugetlb_lock);
                list_add(&folio->lru, &h->hugepage_activelist);
                folio_ref_unfreeze(folio, 1);
                /* Fall through */
        }

        /*
         * Either dequeued or buddy-allocated folio needs to add special
         * mark to the folio when it consumes a global reservation.
         */
        if (!gbl_chg) {
                folio_set_hugetlb_restore_reserve(folio);
                h->resv_huge_pages--;
        }

        hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, folio);
        /*
         * If allocation is not consuming a reservation, also store the
         * hugetlb_cgroup pointer on the page.
         */
        if (map_chg) {
                hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
                                                  h_cg, folio);
        }

        spin_unlock_irq(&hugetlb_lock);

        hugetlb_set_folio_subpool(folio, spool);

        if (map_chg != MAP_CHG_ENFORCED) {
                /* commit() is only needed if the map_chg is not enforced */
                retval = vma_commit_reservation(h, vma, addr);
                /*
                 * Check for possible race conditions. When it happens..
                 * The page was added to the reservation map between
                 * vma_needs_reservation and vma_commit_reservation.
                 * This indicates a race with hugetlb_reserve_pages.
                 * Adjust for the subpool count incremented above AND
                 * in hugetlb_reserve_pages for the same page.  Also,
                 * the reservation count added in hugetlb_reserve_pages
                 * no longer applies.
                 */
                if (unlikely(map_chg == MAP_CHG_NEEDED && retval == 0)) {
                        long rsv_adjust;

                        rsv_adjust = hugepage_subpool_put_pages(spool, 1);
                        hugetlb_acct_memory(h, -rsv_adjust);
                        /*
                         * NOTE(review): map_chg is MAP_CHG_NEEDED here, so
                         * this inner test is always true.
                         */
                        if (map_chg) {
                                spin_lock_irq(&hugetlb_lock);
                                hugetlb_cgroup_uncharge_folio_rsvd(
                                    hstate_index(h), pages_per_huge_page(h),
                                    folio);
                                spin_unlock_irq(&hugetlb_lock);
                        }
                }
        }

        ret = mem_cgroup_charge_hugetlb(folio, gfp);
        /*
         * Unconditionally increment NR_HUGETLB here. If it turns out that
         * mem_cgroup_charge_hugetlb failed, then immediately free the page and
         * decrement NR_HUGETLB.
         */
        lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h));

        if (ret == -ENOMEM) {
                free_huge_folio(folio);
                return ERR_PTR(-ENOMEM);
        }

        return folio;

        /*
         * Error unwinding: each label undoes one step and falls through to
         * the labels for the steps taken before it.
         */
out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
out_uncharge_cgroup_reservation:
        if (map_chg)
                hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
                                                    h_cg);
out_subpool_put:
        if (map_chg)
                hugepage_subpool_put_pages(spool, 1);
out_end_reservation:
        /* Close the transaction opened by vma_needs_reservation(). */
        if (map_chg != MAP_CHG_ENFORCED)
                vma_end_reservation(h, vma, addr);
        return ERR_PTR(-ENOSPC);
}

/*
 * Allocate one boot-time huge page of size @h and queue it on
 * huge_boot_pages[] for later processing.
 *
 * The page comes from hugetlb_cma_alloc_bootmem() when early CMA is in use
 * for @h, otherwise from memblock: strictly on @nid when @node_exact is set,
 * or with @nid as a preference when it is not.
 *
 * Returns the huge_bootmem_page placed at the start of the allocation, or
 * NULL if nothing could be allocated.
 */
static __init void *alloc_bootmem(struct hstate *h, int nid, bool node_exact)
{
        struct huge_bootmem_page *bpage;
        int list_nid = nid;

        if (hugetlb_early_cma(h)) {
                bpage = hugetlb_cma_alloc_bootmem(h, &list_nid, node_exact);
                if (!bpage)
                        return NULL;
        } else {
                if (node_exact) {
                        bpage = memblock_alloc_exact_nid_raw(huge_page_size(h),
                                        huge_page_size(h), 0,
                                        MEMBLOCK_ALLOC_ACCESSIBLE, nid);
                } else {
                        bpage = memblock_alloc_try_nid_raw(huge_page_size(h),
                                        huge_page_size(h), 0,
                                        MEMBLOCK_ALLOC_ACCESSIBLE, nid);
                }
                if (!bpage)
                        return NULL;

                /*
                 * For pre-HVO to work correctly, pages need to be on the
                 * list of the node they were actually allocated from.
                 * memblock_alloc_try_nid_raw may have fallen back to another
                 * node, so look the real node up from the address.
                 */
                if (!node_exact)
                        list_nid = early_pfn_to_nid(PHYS_PFN(virt_to_phys(bpage)));

                bpage->flags = 0;
                bpage->cma = NULL;
        }

        /*
         * The huge_bootmem_page lives at the beginning of the huge page
         * itself (until gather_bootmem puts the pages into the mem_map).
         * Keep it on a private list for now because mem_map is not up yet.
         */
        INIT_LIST_HEAD(&bpage->list);
        list_add(&bpage->list, &huge_boot_pages[list_nid]);
        bpage->hstate = h;

        return bpage;
}

int alloc_bootmem_huge_page(struct hstate *h, int nid)
        __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
        struct huge_bootmem_page *m = NULL; /* initialize for clang */
        int nr_nodes, node = nid;

        /* do node specific alloc */
        if (nid != NUMA_NO_NODE) {
                m = alloc_bootmem(h, node, true);
                if (!m)
                        return 0;
                goto found;
        }

        /* allocate from next node when distributing huge pages */
        for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, &node_states[N_ONLINE]) {
                m = alloc_bootmem(h, node, false);
                if (!m)
                        return 0;
                goto found;
        }

found:

        /*
         * Only initialize the head struct page in memmap_init_reserved_pages,
         * rest of the struct pages will be initialized by the HugeTLB
         * subsystem itself.
         * The head struct page is used to get folio information by the HugeTLB
         * subsystem like zone id and node id.
         */
        memblock_reserved_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
                huge_page_size(h) - PAGE_SIZE);

        return 1;
}

/*
 * Initialize the tail struct pages of a hugepage with page indices in
 * [start_page_number, end_page_number): each one is set up as a single page
 * in the folio's zone and node, turned into a compound tail of @folio, and
 * has its refcount frozen from 1.
 */
static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
                                        unsigned long start_page_number,
                                        unsigned long end_page_number)
{
        const unsigned long head_pfn = folio_pfn(folio);
        const enum zone_type zone = zone_idx(folio_zone(folio));
        const int nid = folio_nid(folio);
        unsigned long idx;

        for (idx = start_page_number; idx < end_page_number; idx++) {
                unsigned long pfn = head_pfn + idx;
                struct page *tail = pfn_to_page(pfn);
                int frozen;

                __init_single_page(tail, pfn, zone, nid);
                prep_compound_tail(&folio->page, idx);
                /* Keep the call outside VM_BUG_ON so it always runs. */
                frozen = page_ref_freeze(tail, 1);
                VM_BUG_ON(!frozen);
        }
}

/*
 * Turn a boot-allocated page into the head of a hugetlb compound page of
 * order huge_page_order(@h), initializing the tail struct pages with indices
 * 1 .. @nr_pages - 1 along the way.
 */
static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
                                              struct hstate *h,
                                              unsigned long nr_pages)
{
        int frozen;

        /* Head page: no longer reserved, marked as head, refcount frozen. */
        __folio_clear_reserved(folio);
        __folio_set_head(folio);
        frozen = folio_ref_freeze(folio, 1);
        VM_BUG_ON(!frozen);

        /* Tail pages that need initializing, then the compound metadata. */
        hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
        prep_compound_head(&folio->page, huge_page_order(h));
}

/* True if the HUGE_BOOTMEM_HVO flag is set on this bootmem page. */
static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
{
        return (m->flags & HUGE_BOOTMEM_HVO) != 0;
}

/* True if the HUGE_BOOTMEM_CMA flag is set on this bootmem page. */
static bool __init hugetlb_bootmem_page_earlycma(struct huge_bootmem_page *m)
{
        return (m->flags & HUGE_BOOTMEM_CMA) != 0;
}

/*
 * Pageblocks allocated from memblock and marked 'noinit' may not have a
 * migrate type yet.  Walk the folio one pageblock at a time and give each
 * block the default type (MIGRATE_MOVABLE), or initialize it as a CMA
 * pageblock when the folio came from an early CMA reservation.
 *
 * In case of vmemmap optimized folios, the tail vmemmap pages are mapped
 * read-only, but that's ok - for sparse vmemmap this does not write to
 * the page structure.
 */
static void __init hugetlb_bootmem_init_migratetype(struct folio *folio,
                                                          struct hstate *h)
{
        const unsigned long nr_pages = pages_per_huge_page(h);
        unsigned long offset;

        WARN_ON_ONCE(!pageblock_aligned(folio_pfn(folio)));

        for (offset = 0; offset < nr_pages; offset += pageblock_nr_pages) {
                struct page *block = folio_page(folio, offset);

                if (!folio_test_hugetlb_cma(folio)) {
                        set_pageblock_migratetype(block, MIGRATE_MOVABLE);
                        continue;
                }
                init_cma_pageblock(block);
        }
}

/*
 * Finish preparing the bootmem folios on @folio_list (all belonging to @h)
 * and put them into the hugetlb pool: attempt vmemmap optimization in bulk,
 * initialize the remaining tail pages of folios that were not optimized,
 * set pageblock migrate types, then account and enqueue each folio.
 */
static void __init prep_and_add_bootmem_folios(struct hstate *h,
                                        struct list_head *folio_list)
{
        struct folio *folio, *next;
        unsigned long irq_flags;

        /* Send list for bulk vmemmap optimization processing */
        hugetlb_vmemmap_optimize_bootmem_folios(h, folio_list);

        list_for_each_entry_safe(folio, next, folio_list, lru) {
                bool optimized = folio_test_hugetlb_vmemmap_optimized(folio);

                /*
                 * HVO did not happen for this folio, so every tail struct
                 * page has to be initialized.  This is early in boot with
                 * no contention to worry about.
                 */
                if (!optimized)
                        hugetlb_folio_init_tail_vmemmap(folio,
                                        HUGETLB_VMEMMAP_RESERVE_PAGES,
                                        pages_per_huge_page(h));

                hugetlb_bootmem_init_migratetype(folio, h);

                /* Lock per folio rather than per list for better parallelism */
                spin_lock_irqsave(&hugetlb_lock, irq_flags);
                __prep_account_new_huge_page(h, folio_nid(folio));
                enqueue_hugetlb_folio(h, folio);
                spin_unlock_irqrestore(&hugetlb_lock, irq_flags);
        }
}

/*
 * Check whether bootmem huge page @m, queued for node @nid, is usable with
 * respect to zones.  Pages flagged HUGE_BOOTMEM_ZONES_VALID are accepted
 * without checking.  Early-CMA pages are validated through their CMA area;
 * all others must not intersect zone boundaries.
 *
 * Every page found invalid bumps hstate_boot_nrinvalid[] for its hstate.
 */
bool __init hugetlb_bootmem_page_zones_valid(int nid,
                                             struct huge_bootmem_page *m)
{
        bool valid;

        /* Already validated, skip check. */
        if (m->flags & HUGE_BOOTMEM_ZONES_VALID)
                return true;

        if (hugetlb_bootmem_page_earlycma(m)) {
                valid = cma_validate_zones(m->cma);
        } else {
                unsigned long start_pfn = virt_to_phys(m) >> PAGE_SHIFT;

                valid = !pfn_range_intersects_zones(nid, start_pfn,
                                pages_per_huge_page(m->hstate));
        }

        if (!valid)
                hstate_boot_nrinvalid[hstate_index(m->hstate)]++;

        return valid;
}

/*
 * Give back a bootmem huge page that turned out to be invalid (it
 * intersects with multiple zones).
 *
 * Because of that intersection the range cannot be freed in one operation;
 * each base page is initialized for @nid and freed individually.
 */
static void __init hugetlb_bootmem_free_invalid_page(int nid, struct page *page,
                                             struct hstate *h)
{
        const unsigned long npages = pages_per_huge_page(h);
        unsigned long i;

        for (i = 0; i < npages; i++) {
                struct page *cur = page + i;

                __init_page_from_nid(page_to_pfn(cur), nid);
                free_reserved_page(cur);
        }
}

/*
 * Put bootmem huge pages into the standard lists after mem_map is up.
 * Note: This only applies to gigantic (order > MAX_PAGE_ORDER) pages.
 *
 * Walks huge_boot_pages[@nid].  Entries that fail zone validation are freed
 * back page by page.  Valid entries are turned into hugetlb folios and
 * batched on a local list, which is flushed through
 * prep_and_add_bootmem_folios() whenever the hstate changes and once more at
 * the end.
 */
static void __init gather_bootmem_prealloc_node(unsigned long nid)
{
        LIST_HEAD(folio_list);
        struct huge_bootmem_page *m, *tm;
        struct hstate *h = NULL, *prev_h = NULL;

        list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
                struct page *page = virt_to_page(m);
                struct folio *folio = (void *)page;

                h = m->hstate;
                if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
                        /*
                         * Can't use this page. Initialize the
                         * page structures if that hasn't already
                         * been done, and give them to the page
                         * allocator.
                         */
                        hugetlb_bootmem_free_invalid_page(nid, page, h);
                        continue;
                }

                /*
                 * It is possible to have multiple huge page sizes (hstates)
                 * in this list.  If so, process each size separately.
                 */
                if (h != prev_h && prev_h != NULL)
                        prep_and_add_bootmem_folios(prev_h, &folio_list);
                prev_h = h;

                VM_BUG_ON(!hstate_is_gigantic(h));
                WARN_ON(folio_ref_count(folio) != 1);

                hugetlb_folio_init_vmemmap(folio, h,
                                           HUGETLB_VMEMMAP_RESERVE_PAGES);
                init_new_hugetlb_folio(h, folio);

                if (hugetlb_bootmem_page_prehvo(m))
                        /*
                         * If pre-HVO was done, just set the
                         * flag, the HVO code will then skip
                         * this folio.
                         */
                        folio_set_hugetlb_vmemmap_optimized(folio);

                if (hugetlb_bootmem_page_earlycma(m))
                        folio_set_hugetlb_cma(folio);

                list_add(&folio->lru, &folio_list);

                /*
                 * We need to restore the 'stolen' pages to totalram_pages
                 * in order to fix confusing memory reports from free(1) and
                 * other side-effects, like CommitLimit going negative.
                 *
                 * For CMA pages, this is done in init_cma_pageblock
                 * (via hugetlb_bootmem_init_migratetype), so skip it here.
                 */
                if (!folio_test_hugetlb_cma(folio))
                        adjust_managed_page_count(page, pages_per_huge_page(h));
                cond_resched();
        }

        /*
         * Flush what is left using prev_h, the hstate of the folios that are
         * actually on folio_list.  'h' is also overwritten by entries that
         * were skipped as invalid, so it may name a different hstate than
         * the queued folios if the list ended with such entries.
         */
        prep_and_add_bootmem_folios(prev_h, &folio_list);
}

/*
 * padata worker: gather the bootmem huge pages of every node id in
 * [@start, @end).  @arg is unused.
 */
static void __init gather_bootmem_prealloc_parallel(unsigned long start,
                                                    unsigned long end, void *arg)
{
        int nid = start;

        while (nid < end) {
                gather_bootmem_prealloc_node(nid);
                nid++;
        }
}

/*
 * Gather bootmem huge pages for all node ids through padata, one node id
 * per chunk at minimum and with at most one thread per node that has memory.
 */
static void __init gather_bootmem_prealloc(void)
{
        struct padata_mt_job job = {
                .thread_fn      = gather_bootmem_prealloc_parallel,
                .fn_arg         = NULL,
                /* work range: node ids 0 .. nr_node_ids - 1 */
                .start          = 0,
                .size           = nr_node_ids,
                /* chunking */
                .align          = 1,
                .min_chunk      = 1,
                /* threading */
                .max_threads    = num_node_state(N_MEMORY),
                .numa_aware     = true,
        };

        padata_do_multithreaded(&job);
}

/*
 * Allocate the huge pages requested for node @nid
 * (h->max_huge_pages_node[nid]).  Gigantic pages come from bootmem; other
 * sizes are allocated on exactly this node and added to the pool in one
 * batch.  If fewer pages than requested could be allocated, warn and shrink
 * both the per-node and the global maximum to what was obtained.
 */
static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
{
        LIST_HEAD(folio_list);
        unsigned long nr_done;
        char size_str[32];

        for (nr_done = 0; nr_done < h->max_huge_pages_node[nid]; ++nr_done) {
                if (hstate_is_gigantic(h)) {
                        if (!alloc_bootmem_huge_page(h, nid))
                                break;
                } else {
                        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
                        struct folio *folio;

                        folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
                                        &node_states[N_MEMORY], NULL);
                        if (!folio)
                                break;
                        list_add(&folio->lru, &folio_list);
                }
                cond_resched();
        }

        if (!list_empty(&folio_list))
                prep_and_add_allocated_folios(h, &folio_list);

        /* Everything requested was allocated: nothing to report. */
        if (nr_done == h->max_huge_pages_node[nid])
                return;

        string_get_size(huge_page_size(h), 1, STRING_UNITS_2, size_str,
                        sizeof(size_str));
        pr_warn("HugeTLB: allocating %u of page size %s failed node%d.  Only allocated %lu hugepages.\n",
                h->max_huge_pages_node[nid], size_str, nid, nr_done);

        /* Drop the shortfall from the global target, then cap the node's. */
        h->max_huge_pages -= (h->max_huge_pages_node[nid] - nr_done);
        h->max_huge_pages_node[nid] = nr_done;
}

/*
 * Run the per-node allocation for every online node that has a non-zero
 * node-specific request.  Returns true if at least one node had such a
 * request.
 */
static bool __init hugetlb_hstate_alloc_pages_specific_nodes(struct hstate *h)
{
        bool any_node_request = false;
        int nid;

        for_each_online_node(nid) {
                if (h->max_huge_pages_node[nid] > 0) {
                        any_node_request = true;
                        hugetlb_hstate_alloc_pages_onenode(h, nid);
                }
        }

        return any_node_request;
}

/*
 * If fewer than h->max_huge_pages pages were allocated, warn about the
 * shortfall and lower h->max_huge_pages to @allocated.
 */
static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated, struct hstate *h)
{
        char size_str[32];

        if (allocated >= h->max_huge_pages)
                return;

        string_get_size(huge_page_size(h), 1, STRING_UNITS_2, size_str,
                        sizeof(size_str));
        pr_warn("HugeTLB: allocating %lu of page size %s failed.  Only allocated %lu hugepages.\n",
                h->max_huge_pages, size_str, allocated);
        h->max_huge_pages = allocated;
}

/*
 * padata worker: allocate up to (@end - @start) huge pages for the hstate
 * passed in @arg, stopping at the first failed allocation, then add
 * whatever was obtained to the pool.
 */
static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
{
        struct hstate *h = arg;
        int remaining = end - start;
        int next_node = first_online_node;
        nodemask_t node_alloc_noretry;
        LIST_HEAD(folio_list);

        /* Bit mask controlling how hard we retry per-node allocations.*/
        nodes_clear(node_alloc_noretry);

        while (remaining-- > 0) {
                struct folio *folio;

                folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
                                              &node_alloc_noretry, &next_node);
                if (!folio)
                        break;

                list_move(&folio->lru, &folio_list);
                cond_resched();
        }

        prep_and_add_allocated_folios(h, &folio_list);
}

/*
 * Allocate up to h->max_huge_pages gigantic pages from bootmem without a
 * node preference.  Stops at the first failure and returns the number of
 * pages allocated.
 */
static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
{
        unsigned long nr_allocated = 0;

        while (nr_allocated < h->max_huge_pages) {
                if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
                        break;
                cond_resched();
                nr_allocated++;
        }

        return nr_allocated;
}

/*
 * Allocate h->max_huge_pages non-gigantic huge pages at boot, spreading the
 * work over several threads through padata, and log how long it took.
 * Returns h->nr_huge_pages as it stands after the job has run.
 */
static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
{
        unsigned long jiffies_start, jiffies_end;
        struct padata_mt_job job = {
                .thread_fn      = hugetlb_pages_alloc_boot_node,
                .fn_arg         = h,
                .start          = 0,
                .size           = h->max_huge_pages,
                .align          = 1,
                .numa_aware     = true,
        };

        /*
         * job.max_threads is 25% of the available cpu threads by default.
         *
         * On large servers with terabytes of memory, huge page allocation
         * can consume a considerable amount of time.
         *
         * Tests below show how long it takes to allocate 1 TiB of memory
         * with 2MiB huge pages. Using more threads can significantly
         * improve allocation time.
         *
         * +-----------------------+-------+-------+-------+-------+-------+
         * | threads               |   8   |   16  |   32  |   64  |   128 |
         * +-----------------------+-------+-------+-------+-------+-------+
         * | skylake      144 cpus |   44s |   22s |   16s |   19s |   20s |
         * | cascade lake 192 cpus |   39s |   20s |   11s |   10s |    9s |
         * +-----------------------+-------+-------+-------+-------+-------+
         */
        if (hugepage_allocation_threads == 0) {
                /* No explicit setting: a quarter of the CPUs, at least one. */
                hugepage_allocation_threads = num_online_cpus() / 4;
                hugepage_allocation_threads = max(hugepage_allocation_threads, 1);
        }

        job.max_threads = hugepage_allocation_threads;
        job.min_chunk   = h->max_huge_pages / hugepage_allocation_threads;

        jiffies_start = jiffies;
        padata_do_multithreaded(&job);
        jiffies_end = jiffies;

        pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
                jiffies_to_msecs(jiffies_end - jiffies_start),
                hugepage_allocation_threads);

        return h->nr_huge_pages;
}

/*
 * Boot-time allocation of the huge pages requested for @h.
 *
 * NOTE: the calling context differs between gigantic and non-gigantic
 * pages.
 * - Gigantic pages: called early in the boot process; pages come from
 *   memblock or something similar and are only added to the pools later,
 *   by gather_bootmem_prealloc.
 * - Non-gigantic pages: called later in the boot process, once all of mm
 *   is up and functional; pages come from buddy and are added to the
 *   hugetlb pools right away.
 */
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
        unsigned long allocated;

        /*
         * With hugetlb_cma enabled, gigantic pages can only be allocated at
         * boot when early CMA reservations are available for this hstate.
         */
        if (hstate_is_gigantic(h) && hugetlb_cma_total_size() &&
            !hugetlb_early_cma(h)) {
                pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
                return;
        }

        /* Node-specific requests, if any, take the place of balancing. */
        if (hugetlb_hstate_alloc_pages_specific_nodes(h))
                return;

        /* Otherwise allocate balanced across nodes. */
        allocated = hstate_is_gigantic(h) ?
                        hugetlb_gigantic_pages_alloc_boot(h) :
                        hugetlb_pages_alloc_boot(h);

        hugetlb_hstate_alloc_pages_errcheck(allocated, h);
}

/*
 * For every hstate: allocate its boot-time pages (non-gigantic sizes only
 * here) and pick its demote order, i.e. the largest order among the other
 * hstates that is still smaller than its own.
 */
static void __init hugetlb_init_hstates(void)
{
        struct hstate *h, *candidate;

        for_each_hstate(h) {
                /* oversize hugepages were init'ed in early boot */
                if (!hstate_is_gigantic(h))
                        hugetlb_hstate_alloc_pages(h);

                /*
                 * Set demote order for each hstate.  Note that
                 * h->demote_order is initially 0.
                 * - We can not demote gigantic pages if runtime freeing
                 *   is not supported, so skip this.
                 * - If CMA allocation is possible, we can not demote
                 *   HUGETLB_PAGE_ORDER or smaller size pages.
                 */
                if ((hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) ||
                    (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER))
                        continue;

                for_each_hstate(candidate) {
                        if (candidate == h)
                                continue;
                        if (candidate->order >= h->order)
                                continue;
                        if (candidate->order > h->demote_order)
                                h->demote_order = candidate->order;
                }
        }
}

/*
 * Log a boot-time summary for every hstate.  The number of bootmem pages
 * discarded as invalid is subtracted from h->max_huge_pages first.
 */
static void __init report_hugepages(void)
{
        struct hstate *h;

        for_each_hstate(h) {
                unsigned long nrinvalid = hstate_boot_nrinvalid[hstate_index(h)];
                char size_str[32];

                h->max_huge_pages -= nrinvalid;

                string_get_size(huge_page_size(h), 1, STRING_UNITS_2, size_str,
                                sizeof(size_str));
                pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n",
                        size_str, h->free_huge_pages);
                if (nrinvalid)
                        pr_info("HugeTLB: %s page size: %lu invalid page%s discarded\n",
                                        size_str, nrinvalid,
                                        nrinvalid > 1 ? "s" : "");
                pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n",
                        hugetlb_vmemmap_optimizable_size(h) / SZ_1K, size_str);
        }
}

#ifdef CONFIG_HIGHMEM
/*
 * Free non-highmem huge pages from the free lists of the allowed nodes for
 * as long as h->nr_huge_pages is above @count.  Does nothing for gigantic
 * hstates.
 *
 * Must be called with hugetlb_lock held.  The lock is dropped while the
 * collected pages are freed and re-taken before returning.
 */
static void try_to_free_low(struct hstate *h, unsigned long count,
                                                nodemask_t *nodes_allowed)
{
        LIST_HEAD(page_list);
        int nid;

        lockdep_assert_held(&hugetlb_lock);
        if (hstate_is_gigantic(h))
                return;

        /*
         * Collect pages to be freed on a list, and free after dropping lock
         */
        for_each_node_mask(nid, *nodes_allowed) {
                struct list_head *free_list = &h->hugepage_freelists[nid];
                struct folio *folio, *tmp;

                list_for_each_entry_safe(folio, tmp, free_list, lru) {
                        if (count >= h->nr_huge_pages)
                                goto release;
                        if (!folio_test_highmem(folio)) {
                                remove_hugetlb_folio(h, folio, false);
                                list_add(&folio->lru, &page_list);
                        }
                }
        }

release:
        spin_unlock_irq(&hugetlb_lock);
        update_and_free_pages_bulk(h, &page_list);
        spin_lock_irq(&hugetlb_lock);
}
#else
/* Without CONFIG_HIGHMEM there is nothing to do. */
static inline void try_to_free_low(struct hstate *h, unsigned long count,
                                                nodemask_t *nodes_allowed)
{
}
#endif

/*
 * adjust_pool_surplus - move the surplus page count up or down by one.
 * @h: hstate to adjust
 * @nodes_allowed: nodes that may be picked for the adjustment
 * @delta: +1 to add a surplus page, -1 to remove one
 *
 * A node is chosen through the hstate's round-robin node iterators so the
 * per-node surplus counters stay balanced.  Both the global and the chosen
 * node's surplus counter are changed by @delta.
 *
 * Must be called with hugetlb_lock held.
 * Returns 1 if an adjustment was made, 0 if no suitable node was found.
 */
static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
                                int delta)
{
        int nr_nodes, node;

        lockdep_assert_held(&hugetlb_lock);
        VM_BUG_ON(delta != -1 && delta != 1);

        if (delta >= 0) {
                /*
                 * Adding a surplus page: the node must have more huge
                 * pages than it already counts as surplus.
                 */
                for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                        if (h->nr_huge_pages_node[node] >
                                        h->surplus_huge_pages_node[node])
                                goto adjust;
                }
        } else {
                /* Removing a surplus page: the node must have one. */
                for_each_node_mask_to_alloc(&h->next_nid_to_alloc, nr_nodes, node, nodes_allowed) {
                        if (h->surplus_huge_pages_node[node] != 0)
                                goto adjust;
                }
        }

        /* No node in the mask can take the adjustment. */
        return 0;

adjust:
        h->surplus_huge_pages_node[node] += delta;
        h->surplus_huge_pages += delta;
        return 1;
}

/* Number of huge pages in @h that are not accounted as surplus. */
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)

/*
 * set_max_huge_pages - grow or shrink the persistent pool of an hstate.
 * @h: hstate to resize
 * @count: requested number of persistent huge pages.  For a node specific
 *         request this is the per-node target; it is converted to a global
 *         target below.
 * @nid: node the request applies to, or NUMA_NO_NODE for a global request
 * @nodes_allowed: nodes on which pages may be allocated or freed
 *
 * Takes h->resize_lock and hugetlb_lock itself; hugetlb_lock is dropped and
 * re-taken around allocation and freeing.  On exit h->max_huge_pages is set
 * to the number of persistent pages actually present.
 *
 * Returns -ENOMEM if the noretry node mask can not be allocated, -EINVAL if
 * a gigantic pool would have to grow without CONFIG_CONTIG_ALLOC, and 0
 * otherwise.  Note that 0 is also returned when allocation stopped early
 * (allocation failure or pending signal) before @count was reached.
 */
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                              nodemask_t *nodes_allowed)
{
        unsigned long persistent_free_count;
        unsigned long min_count;
        unsigned long allocated;
        struct folio *folio;
        LIST_HEAD(page_list);
        NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);

        /*
         * Bit mask controlling how hard we retry per-node allocations.
         * If we can not allocate the bit mask, do not attempt to allocate
         * the requested huge pages.
         */
        if (node_alloc_noretry)
                nodes_clear(*node_alloc_noretry);
        else
                return -ENOMEM;

        /*
         * resize_lock mutex prevents concurrent adjustments to number of
         * pages in hstate via the proc/sysfs interfaces.
         */
        mutex_lock(&h->resize_lock);
        flush_free_hpage_work(h);
        spin_lock_irq(&hugetlb_lock);

        /*
         * Check for a node specific request.
         * Changing node specific huge page count may require a corresponding
         * change to the global count.  In any case, the passed node mask
         * (nodes_allowed) will restrict alloc/free to the specified node.
         */
        if (nid != NUMA_NO_NODE) {
                unsigned long old_count = count;

                /*
                 * Global target = per-node target + persistent pages that
                 * live on all the other nodes.
                 */
                count += persistent_huge_pages(h) -
                         (h->nr_huge_pages_node[nid] -
                          h->surplus_huge_pages_node[nid]);
                /*
                 * User may have specified a large count value which caused the
                 * above calculation to overflow.  In this case, they wanted
                 * to allocate as many huge pages as possible.  Set count to
                 * largest possible value to align with their intention.
                 */
                if (count < old_count)
                        count = ULONG_MAX;
        }

        /*
         * Gigantic pages runtime allocation depend on the capability for large
         * page range allocation.
         * If the system does not provide this feature, return an error when
         * the user tries to allocate gigantic pages but let the user free the
         * boottime allocated gigantic pages.
         */
        if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                if (count > persistent_huge_pages(h)) {
                        spin_unlock_irq(&hugetlb_lock);
                        mutex_unlock(&h->resize_lock);
                        NODEMASK_FREE(node_alloc_noretry);
                        return -EINVAL;
                }
                /* Fall through to decrease pool */
        }

        /*
         * Increase the pool size
         * First take pages out of surplus state.  Then make up the
         * remaining difference by allocating fresh huge pages.
         *
         * We might race with alloc_surplus_hugetlb_folio() here and be unable
         * to convert a surplus huge page to a normal huge page. That is
         * not critical, though, it just means the overall size of the
         * pool might be one hugepage larger than it needs to be, but
         * within all the constraints specified by the sysctls.
         */
        while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, -1))
                        break;
        }

        /*
         * Freshly allocated folios are parked on page_list and only counted
         * in 'allocated' until they are added to the pool below.
         */
        allocated = 0;
        while (count > (persistent_huge_pages(h) + allocated)) {
                /*
                 * If this allocation races such that we no longer need the
                 * page, free_huge_folio will handle it by freeing the page
                 * and reducing the surplus.
                 */
                spin_unlock_irq(&hugetlb_lock);

                /* yield cpu to avoid soft lockup */
                cond_resched();

                folio = alloc_pool_huge_folio(h, nodes_allowed,
                                                node_alloc_noretry,
                                                &h->next_nid_to_alloc);
                if (!folio) {
                        /* Keep what was allocated so far, then give up. */
                        prep_and_add_allocated_folios(h, &page_list);
                        spin_lock_irq(&hugetlb_lock);
                        goto out;
                }

                list_add(&folio->lru, &page_list);
                allocated++;

                /* Bail for signals. Probably ctrl-c from user */
                if (signal_pending(current)) {
                        prep_and_add_allocated_folios(h, &page_list);
                        spin_lock_irq(&hugetlb_lock);
                        goto out;
                }

                spin_lock_irq(&hugetlb_lock);
        }

        /* Add allocated pages to the pool */
        if (!list_empty(&page_list)) {
                spin_unlock_irq(&hugetlb_lock);
                prep_and_add_allocated_folios(h, &page_list);
                spin_lock_irq(&hugetlb_lock);
        }

        /*
         * Decrease the pool size
         * First return free pages to the buddy allocator (being careful
         * to keep enough around to satisfy reservations).  Then place
         * pages into surplus state as needed so the pool will shrink
         * to the desired size as pages become free.
         *
         * By placing pages into the surplus state independent of the
         * overcommit value, we are allowing the surplus pool size to
         * exceed overcommit. There are few sane options here. Since
         * alloc_surplus_hugetlb_folio() is checking the global counter,
         * though, we'll note that we're not allowed to exceed surplus
         * and won't grow the pool anywhere else. Not until one of the
         * sysctls are changed, or the surplus pages go out of use.
         *
         * min_count is the expected number of persistent pages, we
         * shouldn't calculate min_count by using
         * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
         * because there may exist free surplus huge pages, and this will
         * lead to subtracting twice. Free surplus huge pages come from HVO
         * failing to restore vmemmap, see comments in the callers of
         * hugetlb_vmemmap_restore_folio(). Thus, we should calculate
         * persistent free count first.
         */
        persistent_free_count = h->free_huge_pages;
        if (h->free_huge_pages > persistent_huge_pages(h)) {
                if (h->free_huge_pages > h->surplus_huge_pages)
                        persistent_free_count -= h->surplus_huge_pages;
                else
                        persistent_free_count = 0;
        }
        min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
        /* Never shrink below what the caller asked for. */
        min_count = max(count, min_count);
        try_to_free_low(h, min_count, nodes_allowed);

        /*
         * Collect pages to be removed on list without dropping lock
         */
        while (min_count < persistent_huge_pages(h)) {
                folio = remove_pool_hugetlb_folio(h, nodes_allowed, 0);
                if (!folio)
                        break;

                list_add(&folio->lru, &page_list);
        }
        /* free the pages after dropping lock */
        spin_unlock_irq(&hugetlb_lock);
        update_and_free_pages_bulk(h, &page_list);
        flush_free_hpage_work(h);
        spin_lock_irq(&hugetlb_lock);

        /*
         * Pages that could not be freed right now are accounted as surplus
         * until the persistent count matches the request.
         */
        while (count < persistent_huge_pages(h)) {
                if (!adjust_pool_surplus(h, nodes_allowed, 1))
                        break;
        }
out:
        /* Record what was actually achieved, not what was asked for. */
        h->max_huge_pages = persistent_huge_pages(h);
        spin_unlock_irq(&hugetlb_lock);
        mutex_unlock(&h->resize_lock);

        NODEMASK_FREE(node_alloc_noretry);

        return 0;
}

/*
 * demote_free_hugetlb_folios - split folios of hstate @src into @dst folios.
 * @src: hstate the folios on @src_list were removed from
 * @dst: smaller hstate that receives the resulting folios
 * @src_list: folios to demote
 *
 * The vmemmap of the folios on @src_list is restored first.  Every folio
 * that is no longer vmemmap-optimized afterwards is taken off @src_list,
 * split into pages_per_huge_page(src) / pages_per_huge_page(dst) folios of
 * order dst->order and added to @dst's pool.  Folios that are still
 * vmemmap-optimized stay on @src_list for the caller to deal with.
 *
 * Takes dst->resize_lock internally.
 * Returns the value returned by hugetlb_vmemmap_restore_folios().
 */
static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
                                       struct list_head *src_list)
{
        long rc;
        struct folio *folio, *next;
        LIST_HEAD(dst_list);
        LIST_HEAD(ret_list);

        rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
        /* Put whatever was handed back on ret_list onto src_list again. */
        list_splice_init(&ret_list, src_list);

        /*
         * Taking target hstate mutex synchronizes with set_max_huge_pages.
         * Without the mutex, pages added to target hstate could be marked
         * as surplus.
         *
         * Note that we already hold src->resize_lock.  To prevent deadlock,
         * use the convention of always taking larger size hstate mutex first.
         */
        mutex_lock(&dst->resize_lock);

        list_for_each_entry_safe(folio, next, src_list, lru) {
                int i;

                /* Still optimized: can not be split, leave it on src_list. */
                if (folio_test_hugetlb_vmemmap_optimized(folio))
                        continue;

                list_del(&folio->lru);

                split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
                pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));

                /* Step through the source folio one destination folio at a time. */
                for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
                        struct page *page = folio_page(folio, i);
                        /* Careful: see __split_huge_page_tail() */
                        struct folio *new_folio = (struct folio *)page;

                        clear_compound_head(page);
                        prep_compound_page(page, dst->order);

                        new_folio->mapping = NULL;
                        init_new_hugetlb_folio(dst, new_folio);
                        list_add(&new_folio->lru, &dst_list);
                }
        }

        prep_and_add_allocated_folios(dst, &dst_list);

        mutex_unlock(&dst->resize_lock);

        return rc;
}

/*
 * demote_pool_huge_page - demote free huge pages of @src to its demote size.
 * @src: hstate to take free pages from
 * @nodes_allowed: nodes whose free lists may be used
 * @nr_to_demote: maximum number of @src pages to demote
 *
 * Must be called with hugetlb_lock held; the lock is dropped and re-taken
 * around the actual split for each node that is visited.
 *
 * Returns the number of @src pages demoted (> 0), -EINVAL if @src has no
 * demote order, a negative value propagated from
 * demote_free_hugetlb_folios(), or -EBUSY if nothing could be demoted.
 */
static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
                                  unsigned long nr_to_demote)
        __must_hold(&hugetlb_lock)
{
        int nr_nodes, node;
        struct hstate *dst;
        long rc = 0;
        long nr_demoted = 0;

        lockdep_assert_held(&hugetlb_lock);

        /* We should never get here if no demote order */
        if (!src->demote_order) {
                pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
                return -EINVAL;                /* internal error */
        }
        dst = size_to_hstate(PAGE_SIZE << src->demote_order);

        for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
                LIST_HEAD(list);
                struct folio *folio, *next;

                /* Pull up to nr_to_demote non-poisoned free folios off this node. */
                list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
                        if (folio_test_hwpoison(folio))
                                continue;

                        remove_hugetlb_folio(src, folio, false);
                        list_add(&folio->lru, &list);

                        if (++nr_demoted == nr_to_demote)
                                break;
                }

                spin_unlock_irq(&hugetlb_lock);

                rc = demote_free_hugetlb_folios(src, dst, &list);

                spin_lock_irq(&hugetlb_lock);

                /*
                 * Anything still on the list was not demoted: give it back
                 * to the source hstate and undo the optimistic count.
                 */
                list_for_each_entry_safe(folio, next, &list, lru) {
                        list_del(&folio->lru);
                        add_hugetlb_folio(src, folio, false);

                        nr_demoted--;
                }

                if (rc < 0 || nr_demoted == nr_to_demote)
                        break;
        }

        /*
         * Not absolutely necessary, but for consistency update max_huge_pages
         * based on pool changes for the demoted page.
         */
        src->max_huge_pages -= nr_demoted;
        /* One src page yields 2^(src order - dst order) dst pages. */
        dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));

        if (rc < 0)
                return rc;

        if (nr_demoted)
                return nr_demoted;
        /*
         * Only way to get here is if all pages on free lists are poisoned.
         * Return -EBUSY so that caller will not retry.
         */
        return -EBUSY;
}

/*
 * Helpers declaring a static kobj_attribute called <_name>_attr for an
 * hstate sysfs file: read-only, write-only and read-write respectively.
 */
#define HSTATE_ATTR_RO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RO(_name)

#define HSTATE_ATTR_WO(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_WO(_name)

#define HSTATE_ATTR(_name) \
        static struct kobj_attribute _name##_attr = __ATTR_RW(_name)

/* Global "hugepages" directory kobject. */
static struct kobject *hugepages_kobj;
/* Per-hstate kobjects below hugepages_kobj, indexed by hstate index. */
static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];

/* Defined further down; the implementation depends on CONFIG_NUMA. */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);

static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
{
        int i;

        for (i = 0; i < HUGE_MAX_HSTATE; i++)
                if (hstate_kobjs[i] == kobj) {
                        if (nidp)
                                *nidp = NUMA_NO_NODE;
                        return &hstates[i];
                }

        return kobj_to_node_hstate(kobj, nidp);
}

/*
 * nr_hugepages_show_common - print the number of huge pages for @kobj.
 *
 * Emits the global count for a global hstate kobject and the per-node
 * count for a node specific one, followed by a newline.
 */
static ssize_t nr_hugepages_show_common(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        int nid;
        struct hstate *h = kobj_to_hstate(kobj, &nid);
        unsigned long total;

        total = (nid != NUMA_NO_NODE) ? h->nr_huge_pages_node[nid]
                                      : h->nr_huge_pages;

        return sysfs_emit(buf, "%lu\n", total);
}

/*
 * __nr_hugepages_store_common - apply a new huge page count to an hstate.
 * @obey_mempolicy: for global requests, try to restrict the operation to
 *                  the node mask from init_nodemask_of_mempolicy()
 * @h: hstate to resize
 * @nid: target node, or NUMA_NO_NODE for a global request
 * @count: requested number of pages
 * @len: value to return on success (length of the sysfs write)
 *
 * Returns @len on success, -EINVAL for gigantic hstates when runtime
 * gigantic page support is missing, or the error from set_max_huge_pages().
 */
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
                                           struct hstate *h, int nid,
                                           unsigned long count, size_t len)
{
        nodemask_t nodes_allowed;
        nodemask_t *n_mask = &nodes_allowed;
        int err;

        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return -EINVAL;

        if (nid != NUMA_NO_NODE) {
                /*
                 * Node specific request.  count adjustment happens in
                 * set_max_huge_pages() after acquiring hugetlb_lock.
                 */
                init_nodemask_of_node(&nodes_allowed, nid);
        } else if (!obey_mempolicy ||
                   !init_nodemask_of_mempolicy(&nodes_allowed)) {
                /*
                 * Global hstate attribute with no usable mempolicy mask:
                 * operate on every node that has memory.
                 */
                n_mask = &node_states[N_MEMORY];
        }

        err = set_max_huge_pages(h, count, nid, n_mask);
        if (err)
                return err;

        return len;
}

/*
 * nr_hugepages_store_common - parse a decimal page count from @buf and
 * apply it to the hstate behind @kobj.
 *
 * Returns the kstrtoul() error for malformed input, otherwise the result
 * of __nr_hugepages_store_common().
 */
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
                                         struct kobject *kobj, const char *buf,
                                         size_t len)
{
        unsigned long count;
        struct hstate *h;
        int nid;
        int err = kstrtoul(buf, 10, &count);

        if (err != 0)
                return err;

        /* Resolve the hstate first: this also fills in nid. */
        h = kobj_to_hstate(kobj, &nid);

        return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
}

/* sysfs show handler for "nr_hugepages": defers to the common helper. */
static ssize_t nr_hugepages_show(struct kobject *kobj,
                                       struct kobj_attribute *attr, char *buf)
{
        ssize_t written = nr_hugepages_show_common(kobj, attr, buf);

        return written;
}

/*
 * sysfs store handler for "nr_hugepages": resize the pool without taking
 * the caller's memory policy into account.
 */
static ssize_t nr_hugepages_store(struct kobject *kobj,
               struct kobj_attribute *attr, const char *buf, size_t len)
{
        const bool obey_mempolicy = false;

        return nr_hugepages_store_common(obey_mempolicy, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages);

#ifdef CONFIG_NUMA

/*
 * "nr_hugepages_mempolicy" hstate attribute: like "nr_hugepages", but
 * persistent huge page alloc/free may be constrained by the mempolicy.
 * Reading it reports the same value as "nr_hugepages".
 */
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
                                           struct kobj_attribute *attr,
                                           char *buf)
{
        ssize_t written = nr_hugepages_show_common(kobj, attr, buf);

        return written;
}

/*
 * sysfs store handler for "nr_hugepages_mempolicy": resize the pool,
 * asking the common helper to honour the memory policy.
 */
static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
               struct kobj_attribute *attr, const char *buf, size_t len)
{
        const bool obey_mempolicy = true;

        return nr_hugepages_store_common(obey_mempolicy, kobj, buf, len);
}
HSTATE_ATTR(nr_hugepages_mempolicy);
#endif


/* sysfs show handler: print the hstate's nr_overcommit_huge_pages value. */
static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        struct hstate *h;

        h = kobj_to_hstate(kobj, NULL);

        return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
}

/*
 * sysfs store handler: set the hstate's nr_overcommit_huge_pages value.
 *
 * Rejects gigantic hstates with -EINVAL, returns the kstrtoul() error for
 * malformed input, and otherwise stores the value under hugetlb_lock and
 * returns @count.
 */
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
                struct kobj_attribute *attr, const char *buf, size_t count)
{
        struct hstate *h = kobj_to_hstate(kobj, NULL);
        unsigned long limit;
        int ret;

        if (hstate_is_gigantic(h))
                return -EINVAL;

        ret = kstrtoul(buf, 10, &limit);
        if (ret != 0)
                return ret;

        spin_lock_irq(&hugetlb_lock);
        h->nr_overcommit_huge_pages = limit;
        spin_unlock_irq(&hugetlb_lock);

        return count;
}
HSTATE_ATTR(nr_overcommit_hugepages);

/*
 * sysfs show handler for "free_hugepages": print the global free count for
 * a global hstate kobject, or the per-node free count for a node one.
 */
static ssize_t free_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        int nid;
        struct hstate *h = kobj_to_hstate(kobj, &nid);
        unsigned long nr_free;

        nr_free = (nid != NUMA_NO_NODE) ? h->free_huge_pages_node[nid]
                                        : h->free_huge_pages;

        return sysfs_emit(buf, "%lu\n", nr_free);
}
HSTATE_ATTR_RO(free_hugepages);

/* sysfs show handler for "resv_hugepages": print h->resv_huge_pages. */
static ssize_t resv_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        struct hstate *h;

        h = kobj_to_hstate(kobj, NULL);

        return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);

/*
 * sysfs show handler for "surplus_hugepages": print the global surplus
 * count for a global hstate kobject, or the per-node one for a node kobject.
 */
static ssize_t surplus_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        int nid;
        struct hstate *h = kobj_to_hstate(kobj, &nid);
        unsigned long nr_surplus;

        nr_surplus = (nid != NUMA_NO_NODE) ? h->surplus_huge_pages_node[nid]
                                           : h->surplus_huge_pages;

        return sysfs_emit(buf, "%lu\n", nr_surplus);
}
HSTATE_ATTR_RO(surplus_hugepages);

/*
 * demote_store - sysfs store handler for the write-only "demote" file.
 * @kobj: global or per-node hstate kobject that was written to
 * @attr: unused
 * @buf: decimal number of huge pages to demote
 * @len: length of the write, returned on success
 *
 * Demotes up to the requested number of free huge pages of this hstate to
 * its demote size.  A write to a per-node kobject only uses that node's
 * free list; a global write may use every node with memory.
 *
 * Demotion stops as soon as the free page count no longer exceeds the
 * number of reserved pages.
 *
 * Returns @len on success (even if fewer pages than requested were
 * demoted), the kstrtoul() error for malformed input, or the negative
 * value returned by demote_pool_huge_page().
 */
static ssize_t demote_store(struct kobject *kobj,
               struct kobj_attribute *attr, const char *buf, size_t len)
{
        unsigned long nr_demote;
        unsigned long nr_free;
        nodemask_t nodes_allowed, *n_mask;
        struct hstate *h;
        int err;
        int nid;

        err = kstrtoul(buf, 10, &nr_demote);
        if (err)
                return err;
        h = kobj_to_hstate(kobj, &nid);

        if (nid != NUMA_NO_NODE) {
                init_nodemask_of_node(&nodes_allowed, nid);
                n_mask = &nodes_allowed;
        } else {
                n_mask = &node_states[N_MEMORY];
        }

        /* Synchronize with other sysfs operations modifying huge pages */
        mutex_lock(&h->resize_lock);
        spin_lock_irq(&hugetlb_lock);

        while (nr_demote) {
                long rc;

                /*
                 * Check for available pages to demote each time through the
                 * loop as demote_pool_huge_page will drop hugetlb_lock.
                 */
                if (nid != NUMA_NO_NODE)
                        nr_free = h->free_huge_pages_node[nid];
                else
                        nr_free = h->free_huge_pages;

                /*
                 * Compare rather than subtract: both counters are unsigned,
                 * so "free - reserved" would wrap to a huge value whenever
                 * fewer pages are free than are reserved (easily the case
                 * for a single node's free count) and the "nothing
                 * available" check would never trigger.
                 */
                if (nr_free <= h->resv_huge_pages)
                        break;

                rc = demote_pool_huge_page(h, n_mask, nr_demote);
                if (rc < 0) {
                        err = rc;
                        break;
                }

                nr_demote -= rc;
        }

        spin_unlock_irq(&hugetlb_lock);
        mutex_unlock(&h->resize_lock);

        if (err)
                return err;
        return len;
}
HSTATE_ATTR_WO(demote);

/*
 * sysfs show handler for "demote_size": print the size, in kB, that pages
 * of this hstate are demoted to (PAGE_SIZE << demote_order).
 */
static ssize_t demote_size_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
{
        struct hstate *h;
        unsigned long size_kb;

        h = kobj_to_hstate(kobj, NULL);
        size_kb = (PAGE_SIZE << h->demote_order) / SZ_1K;

        return sysfs_emit(buf, "%lukB\n", size_kb);
}

/*
 * sysfs store handler for "demote_size": choose the page size this hstate
 * is demoted to.
 *
 * The size in @buf is parsed with memparse() and must be that of an
 * existing hstate whose order is at least HUGETLB_PAGE_ORDER and strictly
 * smaller than the order of the hstate being configured; otherwise
 * -EINVAL is returned.  On success the new demote order is stored under
 * resize_lock and @count is returned.
 */
static ssize_t demote_size_store(struct kobject *kobj,
                                        struct kobj_attribute *attr,
                                        const char *buf, size_t count)
{
        unsigned long size = (unsigned long)memparse(buf, NULL);
        struct hstate *target = size_to_hstate(size);
        struct hstate *h;
        unsigned int order;

        /* The requested size must belong to a known, large enough hstate. */
        if (!target || target->order < HUGETLB_PAGE_ORDER)
                return -EINVAL;
        order = target->order;

        /* demote order must be smaller than hstate order */
        h = kobj_to_hstate(kobj, NULL);
        if (!(order < h->order))
                return -EINVAL;

        /* resize_lock synchronizes access to demote size and writes */
        mutex_lock(&h->resize_lock);
        h->demote_order = order;
        mutex_unlock(&h->resize_lock);

        return count;
}
HSTATE_ATTR(demote_size);

/*
 * Attributes of every global hstate directory.  The mempolicy variant only
 * exists with CONFIG_NUMA.
 */
static struct attribute *hstate_attrs[] = {
        &nr_hugepages_attr.attr,
        &nr_overcommit_hugepages_attr.attr,
        &free_hugepages_attr.attr,
        &resv_hugepages_attr.attr,
        &surplus_hugepages_attr.attr,
#ifdef CONFIG_NUMA
        &nr_hugepages_mempolicy_attr.attr,
#endif
        NULL,
};

static const struct attribute_group hstate_attr_group = {
        .attrs = hstate_attrs,
};

/*
 * Demote interface.  Kept in its own group because it is only created for
 * hstates that have a demote order (see hugetlb_sysfs_add_hstate()).
 */
static struct attribute *hstate_demote_attrs[] = {
        &demote_size_attr.attr,
        &demote_attr.attr,
        NULL,
};

static const struct attribute_group hstate_demote_attr_group = {
        .attrs = hstate_demote_attrs,
};

/*
 * hugetlb_sysfs_add_hstate - create the sysfs directory for one hstate.
 * @h: hstate to expose
 * @parent: kobject the new directory is created under
 * @hstate_kobjs: table the new kobject is stored in, at index hstate_index(h)
 * @hstate_attr_group: attribute group to populate the directory with
 *
 * If @h has a demote order the demote attribute group is added as well.
 * On any failure everything created here is torn down again and the table
 * slot is left NULL.
 *
 * Returns 0 on success, -ENOMEM if the kobject could not be created, or
 * the error from sysfs_create_group().
 */
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
                                    struct kobject **hstate_kobjs,
                                    const struct attribute_group *hstate_attr_group)
{
        int hi = hstate_index(h);
        struct kobject *kobj;
        int retval;

        kobj = kobject_create_and_add(h->name, parent);
        hstate_kobjs[hi] = kobj;
        if (kobj == NULL)
                return -ENOMEM;

        retval = sysfs_create_group(kobj, hstate_attr_group);
        if (retval)
                goto put_kobj;

        /* The demote files only make sense when there is a demote order. */
        if (!h->demote_order)
                return 0;

        retval = sysfs_create_group(kobj, &hstate_demote_attr_group);
        if (!retval)
                return 0;

        pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
        sysfs_remove_group(kobj, hstate_attr_group);
put_kobj:
        kobject_put(kobj);
        hstate_kobjs[hi] = NULL;
        return retval;
}

#ifdef CONFIG_NUMA
/*
 * Set once hugetlb_sysfs_init() has created the global hstate directories;
 * hugetlb_register_node() does nothing before that.
 */
static bool hugetlb_sysfs_initialized __ro_after_init;

/*
 * node_hstate/s - associate per node hstate attributes, via their kobjects,
 * with node devices in node_devices[] using a parallel array.  The array
 * index of a node device or _hstate == node id.
 * This is here to avoid any static dependency of the node device driver, in
 * the base kernel, on the hugetlb module.
 */
struct node_hstate {
        /* "hugepages" directory below the node device; NULL if unregistered */
        struct kobject                *hugepages_kobj;
        /* per-hstate directories below hugepages_kobj, indexed by hstate index */
        struct kobject                *hstate_kobjs[HUGE_MAX_HSTATE];
};
static struct node_hstate node_hstates[MAX_NUMNODES];

/*
 * A subset of global hstate attributes for node devices
 */
static struct attribute *per_node_hstate_attrs[] = {
        &nr_hugepages_attr.attr,
        &free_hugepages_attr.attr,
        &surplus_hugepages_attr.attr,
        NULL,
};

static const struct attribute_group per_node_hstate_attr_group = {
        .attrs = per_node_hstate_attrs,
};

/*
 * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
 * Returns node id via non-NULL nidp.
 */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
        int nid;

        for (nid = 0; nid < nr_node_ids; nid++) {
                struct node_hstate *nhs = &node_hstates[nid];
                int i;
                for (i = 0; i < HUGE_MAX_HSTATE; i++)
                        if (nhs->hstate_kobjs[i] == kobj) {
                                if (nidp)
                                        *nidp = nid;
                                return &hstates[i];
                        }
        }

        BUG();
        return NULL;
}

/*
 * hugetlb_unregister_node - remove the hstate attributes of one node device.
 * @node: node device to clean up
 *
 * Removes every per-hstate directory that exists for the node (including
 * the demote group for hstates with a demote order) and then the node's
 * "hugepages" directory itself.  Does nothing if no hstate attributes are
 * attached to the node.
 */
void hugetlb_unregister_node(struct node *node)
{
        struct node_hstate *nhs = &node_hstates[node->dev.id];
        struct hstate *h;

        if (nhs->hugepages_kobj == NULL)
                return;                /* nothing was registered */

        for_each_hstate(h) {
                struct kobject **slot = &nhs->hstate_kobjs[hstate_index(h)];

                if (*slot == NULL)
                        continue;

                if (h->demote_order != 0)
                        sysfs_remove_group(*slot, &hstate_demote_attr_group);
                sysfs_remove_group(*slot, &per_node_hstate_attr_group);
                kobject_put(*slot);
                *slot = NULL;
        }

        kobject_put(nhs->hugepages_kobj);
        nhs->hugepages_kobj = NULL;
}


/*
 * Register hstate attributes for a single node device.
 * No-op if attributes already registered.
 */
void hugetlb_register_node(struct node *node)
{
        struct hstate *h;
        struct node_hstate *nhs = &node_hstates[node->dev.id];
        int err;

        if (!hugetlb_sysfs_initialized)
                return;

        if (nhs->hugepages_kobj)
                return;                /* already allocated */

        nhs->hugepages_kobj = kobject_create_and_add("hugepages",
                                                        &node->dev.kobj);
        if (!nhs->hugepages_kobj)
                return;

        for_each_hstate(h) {
                err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
                                                nhs->hstate_kobjs,
                                                &per_node_hstate_attr_group);
                if (err) {
                        pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
                                h->name, node->dev.id);
                        hugetlb_unregister_node(node);
                        break;
                }
        }
}

/*
 * hugetlb init time: walk all online nodes and register the hstate
 * attributes for each node's device.  All on-line nodes should have
 * registered their associated device by this time.
 */
static void __init hugetlb_register_all_nodes(void)
{
        int nid;

        for_each_online_node(nid) {
                struct node *node = node_devices[nid];

                hugetlb_register_node(node);
        }
}
#else        /* !CONFIG_NUMA */

/*
 * !CONFIG_NUMA: there are no per-node hstate kobjects, so reaching this
 * lookup at all is a bug.
 */
static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
{
        BUG();
        if (nidp)
                *nidp = -1;
        return NULL;
}

/* !CONFIG_NUMA: no node devices to register attributes for. */
static void hugetlb_register_all_nodes(void) { }

#endif

/*
 * hugetlb_sysfs_init - create the global hugetlb sysfs hierarchy.
 *
 * Creates the "hugepages" directory under mm_kobj and one sub-directory
 * per hstate; a failure for an individual hstate is only logged.  After
 * that the per-node attributes are registered.  If the top-level directory
 * can not be created nothing else is done.
 */
static void __init hugetlb_sysfs_init(void)
{
        struct hstate *h;

        hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
        if (hugepages_kobj == NULL)
                return;

        for_each_hstate(h) {
                if (hugetlb_sysfs_add_hstate(h, hugepages_kobj,
                                             hstate_kobjs, &hstate_attr_group))
                        pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
        }

#ifdef CONFIG_NUMA
        /* From here on hugetlb_register_node() is allowed to do its work. */
        hugetlb_sysfs_initialized = true;
#endif
        hugetlb_register_all_nodes();
}

/*
 * hugetlb_sysctl_init() is only forward-declared here when CONFIG_SYSCTL is
 * set (its definition is not in this part of the file); otherwise it is an
 * empty stub.
 */
#ifdef CONFIG_SYSCTL
static void hugetlb_sysctl_init(void);
#else
static inline void hugetlb_sysctl_init(void) { }
#endif

/*
 * Boot-time initialisation of hugetlb, registered as a subsys initcall.
 *
 * Steps, in order:
 *  - bail out (with a warning if hugetlb command-line parameters were
 *    given) when huge pages are not supported;
 *  - make sure the HPAGE_SIZE hstate exists and, when no default size was
 *    parsed, make it the default and apply an implicit hugepages= count;
 *  - initialise the hstates, gather boot-time preallocations and report;
 *  - set up sysfs, cgroup files and sysctl;
 *  - allocate and initialise the fault mutex table.
 *
 * Always returns 0.  Failure to allocate the fault mutex table is fatal
 * (BUG_ON).
 */
static int __init hugetlb_init(void)
{
        int i;

        BUILD_BUG_ON(sizeof_field(struct page, private) * BITS_PER_BYTE <
                        __NR_HPAGEFLAGS);

        if (!hugepages_supported()) {
                if (hugetlb_max_hstate || default_hstate_max_huge_pages)
                        pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
                return 0;
        }

        /*
         * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists.  Some
         * architectures depend on setup being done here.
         */
        hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
        if (!parsed_default_hugepagesz) {
                /*
                 * If we did not parse a default huge page size, set
                 * default_hstate_idx to HPAGE_SIZE hstate. And, if the
                 * number of huge pages for this default size was implicitly
                 * specified, set that here as well.
                 * Note that the implicit setting will overwrite an explicit
                 * setting.  A warning will be printed in this case.
                 */
                default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
                if (default_hstate_max_huge_pages) {
                        if (default_hstate.max_huge_pages) {
                                char buf[32];

                                /* Size the output by the buffer itself, not a literal. */
                                string_get_size(huge_page_size(&default_hstate),
                                        1, STRING_UNITS_2, buf, sizeof(buf));
                                pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
                                        default_hstate.max_huge_pages, buf);
                                pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
                                        default_hstate_max_huge_pages);
                        }
                        default_hstate.max_huge_pages =
                                default_hstate_max_huge_pages;

                        /* Carry over the per-node counts given with the implicit hugepages=. */
                        for_each_online_node(i)
                                default_hstate.max_huge_pages_node[i] =
                                        default_hugepages_in_node[i];
                }
        }

        hugetlb_cma_check();
        hugetlb_init_hstates();
        gather_bootmem_prealloc();
        report_hugepages();

        hugetlb_sysfs_init();
        hugetlb_cgroup_file_init();
        hugetlb_sysctl_init();

#ifdef CONFIG_SMP
        num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus());
#else
        num_fault_mutexes = 1;
#endif
        /* Element size follows the table's own type. */
        hugetlb_fault_mutex_table =
                kmalloc_array(num_fault_mutexes,
                              sizeof(*hugetlb_fault_mutex_table),
                              GFP_KERNEL);
        BUG_ON(!hugetlb_fault_mutex_table);

        for (i = 0; i < num_fault_mutexes; i++)
                mutex_init(&hugetlb_fault_mutex_table[i]);
        return 0;
}
subsys_initcall(hugetlb_init);

/*
 * Weak default, overridden by architectures that support more huge page
 * sizes: only HPAGE_SIZE is accepted here.
 */
bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
        if (size != HPAGE_SIZE)
                return false;

        return true;
}

/*
 * Register an hstate for huge pages of the given @order.
 *
 * Does nothing when an hstate of that size already exists.  Otherwise the
 * next slot of hstates[] is claimed and initialised (resize lock, order,
 * mask, free/active lists, name) and parsed_hstate is pointed at it.
 * Running out of slots, or an @order below order_base_2(__NR_USED_SUBPAGE),
 * is a BUG.
 */
void __init hugetlb_add_hstate(unsigned int order)
{
        struct hstate *hs;
        unsigned long nid;

        if (size_to_hstate(PAGE_SIZE << order) != NULL)
                return;

        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order < order_base_2(__NR_USED_SUBPAGE));

        hs = &hstates[hugetlb_max_hstate];
        hugetlb_max_hstate++;

        __mutex_init(&hs->resize_lock, "resize mutex", &hs->resize_key);

        /* The order is stored before huge_page_size(hs) is used below. */
        hs->order = order;
        hs->mask = ~(huge_page_size(hs) - 1);

        for (nid = 0; nid < MAX_NUMNODES; nid++)
                INIT_LIST_HEAD(&hs->hugepage_freelists[nid]);
        INIT_LIST_HEAD(&hs->hugepage_activelist);

        snprintf(hs->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                 huge_page_size(hs) / SZ_1K);

        parsed_hstate = hs;
}

/*
 * Weak default: report that node-specific huge page allocation is
 * supported.  hugepages_setup() consults this before accepting the
 * "<node>:<count>" form of the hugepages= parameter.
 */
bool __init __weak hugetlb_node_alloc_supported(void)
{
        return true;
}

/*
 * Reset the huge page counts (total and per node) collected so far for
 * the target that hugepages= is currently filling in: parsed_hstate when
 * at least one hstate exists, the "default" counters otherwise.
 */
static void __init hugepages_clear_pages_in_node(void)
{
        if (hugetlb_max_hstate) {
                parsed_hstate->max_huge_pages = 0;
                memset(parsed_hstate->max_huge_pages_node, 0,
                       sizeof(parsed_hstate->max_huge_pages_node));
                return;
        }

        default_hstate_max_huge_pages = 0;
        memset(default_hugepages_in_node, 0,
               sizeof(default_hugepages_in_node));
}

/*
 * Record a hugetlb command-line parameter for later processing.
 *
 * The value string @s is copied (including its terminating NUL) into
 * hstate_cmdline_buf and paired with its @setup callback in the next free
 * hugetlb_params[] slot.
 *
 * Returns 0 on success, -EINVAL when either the parameter table or the
 * string buffer has no room left.
 */
static __init int hugetlb_add_param(char *s, int (*setup)(char *))
{
        size_t nbytes;
        char *copy;

        if (hugetlb_param_index >= HUGE_MAX_CMDLINE_ARGS)
                return -EINVAL;

        nbytes = strlen(s) + 1;
        if (nbytes + hstate_cmdline_index > sizeof(hstate_cmdline_buf))
                return -EINVAL;

        copy = hstate_cmdline_buf + hstate_cmdline_index;
        memcpy(copy, s, nbytes);
        hstate_cmdline_index += nbytes;

        hugetlb_params[hugetlb_param_index].setup = setup;
        hugetlb_params[hugetlb_param_index].val = copy;
        hugetlb_param_index++;

        return 0;
}

/*
 * Replay every recorded hugetlb command-line parameter, in the order it
 * was recorded, by invoking its setup callback on the saved value.  The
 * callbacks' return values are not examined.  Finishes by validating the
 * hugetlb CMA parameters.
 */
static __init void hugetlb_parse_params(void)
{
        int idx;

        for (idx = 0; idx < hugetlb_param_index; idx++)
                hugetlb_params[idx].setup(hugetlb_params[idx].val);

        hugetlb_cma_validate_params();
}

/*
 * hugepages command line processing
 * hugepages normally follows a valid hugepagesz or default_hugepagesz
 * specification.  If not, ignore the hugepages value.  hugepages can also
 * be the first huge page command line option, in which case it implicitly
 * specifies the number of huge pages for the default size.
 *
 * Two value formats are accepted:
 *   <count>                              - a single total
 *   <node>:<count>[,<node>:<count>...]   - per-node counts, summed into
 *                                          the total
 *
 * Returns 0 on success, -EINVAL when the value is ignored because it does
 * not follow a valid hugepagesz or is malformed, and 1 when it is ignored
 * because it is a repeated hugepages= or node-specific allocation is not
 * supported.
 */
static int __init hugepages_setup(char *s)
{
        unsigned long *mhp;
        /* Target of the previous successful hugepages=, to detect repeats. */
        static unsigned long *last_mhp;
        int node = NUMA_NO_NODE;
        int count;
        unsigned long tmp;
        char *p = s;

        if (!parsed_valid_hugepagesz) {
                pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
                /* Re-arm so that only this one hugepages= is dropped. */
                parsed_valid_hugepagesz = true;
                return -EINVAL;
        }

        /*
         * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
         * yet, so this hugepages= parameter goes to the "default hstate".
         * Otherwise, it goes with the previously parsed hugepagesz or
         * default_hugepagesz.
         */
        else if (!hugetlb_max_hstate)
                mhp = &default_hstate_max_huge_pages;
        else
                mhp = &parsed_hstate->max_huge_pages;

        if (mhp == last_mhp) {
                pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
                return 1;
        }

        while (*p) {
                count = 0;
                /* %n stores how many characters the number consumed. */
                if (sscanf(p, "%lu%n", &tmp, &count) != 1)
                        goto invalid;
                /* Parameter is node format */
                if (p[count] == ':') {
                        if (!hugetlb_node_alloc_supported()) {
                                pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
                                return 1;
                        }
                        /* Here tmp holds the node id; it must be in range and online. */
                        if (tmp >= MAX_NUMNODES || !node_online(tmp))
                                goto invalid;
                        node = array_index_nospec(tmp, MAX_NUMNODES);
                        p += count + 1;
                        /* Parse hugepages */
                        if (sscanf(p, "%lu%n", &tmp, &count) != 1)
                                goto invalid;
                        if (!hugetlb_max_hstate)
                                default_hugepages_in_node[node] = tmp;
                        else
                                parsed_hstate->max_huge_pages_node[node] = tmp;
                        /* Per-node counts accumulate into the total. */
                        *mhp += tmp;
                        /* Go on to parse the next node, if any */
                        if (p[count] == ',')
                                p += count + 1;
                        else
                                break;
                } else {
                        /* A plain count is only valid as the whole value. */
                        if (p != s)
                                goto invalid;
                        *mhp = tmp;
                        break;
                }
        }

        last_mhp = mhp;

        return 0;

invalid:
        pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
        /* Drop any counts already stored while parsing this value. */
        hugepages_clear_pages_in_node();
        return -EINVAL;
}
hugetlb_early_param("hugepages", hugepages_setup);

/*
 * hugepagesz command line processing
 * A specific huge page size can only be specified once with hugepagesz.
 * hugepagesz is followed by hugepages on the command line.  The global
 * variable 'parsed_valid_hugepagesz' records whether the most recent
 * hugepagesz argument was valid.
 *
 * Returns 0 when the size was accepted, -EINVAL when it is unsupported or
 * a disallowed repeat.
 */
static int __init hugepagesz_setup(char *s)
{
        unsigned long size;
        struct hstate *existing;

        parsed_valid_hugepagesz = false;
        size = (unsigned long)memparse(s, NULL);

        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
                return -EINVAL;
        }

        existing = size_to_hstate(size);
        if (!existing) {
                /* First time this size is seen: create its hstate. */
                hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
                parsed_valid_hugepagesz = true;
                return 0;
        }

        /*
         * An hstate for this size already exists.  That is normally an
         * error; the one permitted case is that it is the default hstate
         * (set through default_hugepagesz) and no number of huge pages
         * has been specified for it yet.
         */
        if (!(parsed_default_hugepagesz && existing == &default_hstate &&
              !default_hstate.max_huge_pages)) {
                pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
                return -EINVAL;
        }

        /*
         * Nothing to add, but point parsed_hstate at the existing hstate
         * so that a following hugepages= applies to it.
         */
        parsed_hstate = existing;
        parsed_valid_hugepagesz = true;
        return 0;
}
hugetlb_early_param("hugepagesz", hugepagesz_setup);

/*
 * default_hugepagesz command line input
 * Only one instance of default_hugepagesz is allowed on the command line.
 *
 * Returns 0 when the size was accepted, -EINVAL when default_hugepagesz
 * was already given or the size is unsupported.
 */
static int __init default_hugepagesz_setup(char *s)
{
        unsigned long size;
        int nid;

        parsed_valid_hugepagesz = false;
        if (parsed_default_hugepagesz) {
                pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
                return -EINVAL;
        }

        size = (unsigned long)memparse(s, NULL);
        if (!arch_hugetlb_valid_size(size)) {
                pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
                return -EINVAL;
        }

        hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
        parsed_valid_hugepagesz = true;
        parsed_default_hugepagesz = true;
        default_hstate_idx = hstate_index(size_to_hstate(size));

        /*
         * A leading hugepages=X (before any size parameter) leaves its
         * count in default_hstate_max_huge_pages.  Nothing more to do if
         * no such count was given.
         */
        if (!default_hstate_max_huge_pages)
                return 0;

        default_hstate.max_huge_pages = default_hstate_max_huge_pages;

        /*
         * Since this is an early parameter, we can't check NUMA node
         * state yet, so walk all MAX_NUMNODES slots and copy the
         * non-zero per-node counts.
         */
        for (nid = 0; nid < MAX_NUMNODES; nid++) {
                if (default_hugepages_in_node[nid] == 0)
                        continue;

                default_hstate.max_huge_pages_node[nid] =
                        default_hugepages_in_node[nid];
        }
        default_hstate_max_huge_pages = 0;

        return 0;
}
hugetlb_early_param("default_hugepagesz", default_hugepagesz_setup);

/* Set to true by hugetlb_bootmem_alloc() once it has run to completion. */
static bool __hugetlb_bootmem_allocated __initdata;

/* Report whether hugetlb_bootmem_alloc() has already completed. */
bool __init hugetlb_bootmem_allocated(void)
{
        return __hugetlb_bootmem_allocated;
}

/*
 * One-shot boot-time step: initialise the per-node huge_boot_pages lists,
 * process the recorded hugetlb command-line parameters, seed each
 * hstate's next-node cursors with the first online node, and allocate
 * pages for gigantic hstates.  Later calls return immediately.
 */
void __init hugetlb_bootmem_alloc(void)
{
        struct hstate *hs;
        int nid;

        if (__hugetlb_bootmem_allocated)
                return;

        for (nid = 0; nid < MAX_NUMNODES; nid++)
                INIT_LIST_HEAD(&huge_boot_pages[nid]);

        hugetlb_parse_params();

        for_each_hstate(hs) {
                hs->next_nid_to_alloc = first_online_node;
                hs->next_nid_to_free = first_online_node;

                if (!hstate_is_gigantic(hs))
                        continue;

                hugetlb_hstate_alloc_pages(hs);
        }

        __hugetlb_bootmem_allocated = true;
}

/*
 * hugepage_alloc_threads command line parsing.
 *
 * When set, use this specific number of threads for the boot
 * allocation of hugepages.  A value that does not parse as an unsigned
 * long, or that is zero, is ignored.  Always returns 1.
 */
static int __init hugepage_alloc_threads_setup(char *s)
{
        unsigned long nthreads;

        if (kstrtoul(s, 0, &nthreads) == 0 && nthreads != 0)
                hugepage_allocation_threads = nthreads;

        return 1;
}
__setup("hugepage_alloc_threads=", hugepage_alloc_threads_setup);

/*
 * Sum the free huge pages of @h over the nodes in
 * cpuset_current_mems_allowed.  When policy_mbind_nodemask() returns a
 * nodemask, only nodes that are also set in it are counted.
 */
static unsigned int allowed_mems_nr(struct hstate *h)
{
        unsigned int *free_per_node = h->free_huge_pages_node;
        nodemask_t *mbind_mask = policy_mbind_nodemask(htlb_alloc_mask(h));
        unsigned int total = 0;
        int nid;

        for_each_node_mask(nid, cpuset_current_mems_allowed) {
                if (mbind_mask && !node_isset(nid, *mbind_mask))
                        continue;

                total += free_per_node[nid];
        }

        return total;
}

#ifdef CONFIG_SYSCTL
/*
 * Run proc_doulongvec_minmax() against a caller-supplied variable.
 *
 * To avoid races with __do_proc_doulongvec_minmax(), @table itself is
 * never modified: a local copy is made and only that copy's ->data is
 * redirected to @out.
 */
static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
                                          void *buffer, size_t *length,
                                          loff_t *ppos, unsigned long *out)
{
        struct ctl_table local_table = *table;

        local_table.data = out;

        return proc_doulongvec_minmax(&local_table, write, buffer, length, ppos);
}

/*
 * Shared implementation of the nr_hugepages sysctl handlers, operating on
 * the default hstate.
 *
 * A read reports the current max_huge_pages.  A write parses the new
 * value and passes it, together with @obey_mempolicy, to
 * __nr_hugepages_store_common().
 *
 * Returns -EOPNOTSUPP when huge pages are unsupported, otherwise the
 * result of the proc helper or of the store.
 */
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
                         const struct ctl_table *table, int write,
                         void *buffer, size_t *length, loff_t *ppos)
{
        struct hstate *h = &default_hstate;
        unsigned long count = h->max_huge_pages;
        int rc;

        if (!hugepages_supported())
                return -EOPNOTSUPP;

        rc = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
                                            &count);
        if (rc != 0 || !write)
                return rc;

        rc = __nr_hugepages_store_common(obey_mempolicy, h,
                                         NUMA_NO_NODE, count, *length);
        return rc;
}

/* Handler for "nr_hugepages": the common handler with obey_mempolicy off. */
static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
                          void *buffer, size_t *length, loff_t *ppos)
{
        const bool obey_mempolicy = false;

        return hugetlb_sysctl_handler_common(obey_mempolicy, table, write,
                                             buffer, length, ppos);
}

#ifdef CONFIG_NUMA
/*
 * Handler for "nr_hugepages_mempolicy": the common handler with
 * obey_mempolicy on.
 */
static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
                          void *buffer, size_t *length, loff_t *ppos)
{
        const bool obey_mempolicy = true;

        return hugetlb_sysctl_handler_common(obey_mempolicy, table, write,
                                             buffer, length, ppos);
}
#endif /* CONFIG_NUMA */

/*
 * Handler for "nr_overcommit_hugepages", operating on the default hstate.
 *
 * A read reports nr_overcommit_huge_pages.  A write is rejected with
 * -EINVAL when the default hstate is gigantic; otherwise the parsed value
 * is stored under hugetlb_lock.
 *
 * Returns -EOPNOTSUPP when huge pages are unsupported, otherwise the
 * result of the proc helper.
 */
static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
                void *buffer, size_t *length, loff_t *ppos)
{
        struct hstate *h = &default_hstate;
        unsigned long value;
        int rc;

        if (!hugepages_supported())
                return -EOPNOTSUPP;

        value = h->nr_overcommit_huge_pages;

        if (write && hstate_is_gigantic(h))
                return -EINVAL;

        rc = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
                                            &value);
        if (rc != 0 || !write)
                return rc;

        spin_lock_irq(&hugetlb_lock);
        h->nr_overcommit_huge_pages = value;
        spin_unlock_irq(&hugetlb_lock);

        return rc;
}

/*
 * sysctl entries for hugetlb; registered under "vm" by
 * hugetlb_sysctl_init().  Entries with .data == NULL have handlers that
 * supply their own storage via proc_hugetlb_doulongvec_minmax().
 */
static const struct ctl_table hugetlb_table[] = {
        {
                .procname        = "nr_hugepages",
                .data                = NULL,
                .maxlen                = sizeof(unsigned long),
                .mode                = 0644,
                .proc_handler        = hugetlb_sysctl_handler,
        },
#ifdef CONFIG_NUMA
        /* Same as nr_hugepages, but the handler passes obey_mempolicy=true. */
        {
                .procname       = "nr_hugepages_mempolicy",
                .data           = NULL,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &hugetlb_mempolicy_sysctl_handler,
        },
#endif
        /* Plain integer backed directly by sysctl_hugetlb_shm_group. */
        {
                .procname        = "hugetlb_shm_group",
                .data                = &sysctl_hugetlb_shm_group,
                .maxlen                = sizeof(gid_t),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "nr_overcommit_hugepages",
                .data                = NULL,
                .maxlen                = sizeof(unsigned long),
                .mode                = 0644,
                .proc_handler        = hugetlb_overcommit_handler,
        },
};

/* Register the hugetlb_table sysctl entries under "vm". */
static void __init hugetlb_sysctl_init(void)
{
        register_sysctl_init("vm", hugetlb_table);
}
#endif /* CONFIG_SYSCTL */

/*
 * Emit the hugetlb lines of a meminfo-style report into @m.
 *
 * The HugePages_* and Hugepagesize lines describe the default hstate
 * only; the final "Hugetlb:" line is the memory of all hstates combined,
 * in kB.  Nothing is printed when huge pages are unsupported.
 */
void hugetlb_report_meminfo(struct seq_file *m)
{
        unsigned long total_bytes = 0;
        struct hstate *hs;

        if (!hugepages_supported())
                return;

        for_each_hstate(hs) {
                unsigned long nr_pages = hs->nr_huge_pages;

                total_bytes += huge_page_size(hs) * nr_pages;

                if (hs != &default_hstate)
                        continue;

                seq_printf(m,
                           "HugePages_Total:   %5lu\n"
                           "HugePages_Free:    %5lu\n"
                           "HugePages_Rsvd:    %5lu\n"
                           "HugePages_Surp:    %5lu\n"
                           "Hugepagesize:   %8lu kB\n",
                           nr_pages,
                           hs->free_huge_pages,
                           hs->resv_huge_pages,
                           hs->surplus_huge_pages,
                           huge_page_size(hs) / SZ_1K);
        }

        seq_printf(m, "Hugetlb:        %8lu kB\n", total_bytes / SZ_1K);
}

/*
 * Emit the per-node huge page counters of the default hstate for node
 * @nid into @buf at offset @len, via sysfs_emit_at().
 *
 * Returns the value of sysfs_emit_at(), or 0 when huge pages are
 * unsupported.
 */
int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
        struct hstate *dflt;

        if (!hugepages_supported())
                return 0;

        dflt = &default_hstate;

        return sysfs_emit_at(buf, len,
                             "Node %d HugePages_Total: %5u\n"
                             "Node %d HugePages_Free:  %5u\n"
                             "Node %d HugePages_Surp:  %5u\n",
                             nid, dflt->nr_huge_pages_node[nid],
                             nid, dflt->free_huge_pages_node[nid],
                             nid, dflt->surplus_huge_pages_node[nid]);
}

void hugetlb_show_meminfo_node(int nid)
{
        struct hstate *h;

        if (!hugepages_supported())
                return;

        for_each_hstate(h)
                printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
                        nid,
                        h->nr_huge_pages_node[nid],
                        h->free_huge_pages_node[nid],
                        h->surplus_huge_pages_node[nid],
                        huge_page_size(h) / SZ_1K);
}

/*
 * Print the "HugetlbPages:" line for @mm into @m: the mm's hugetlb_usage
 * counter passed through K().
 */
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
{
        seq_printf(m, "HugetlbPages:\t%8lu kB\n",
                   K(atomic_long_read(&mm->hugetlb_usage)));
}

/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
        struct hstate *h;
        unsigned long nr_total_pages = 0;

        for_each_hstate(h)
                nr_total_pages += h->nr_huge_pages * pages_per_huge_page(h);
        return nr_total_pages;
}

/*
 * Adjust the accounting of @h by @delta huge pages, under hugetlb_lock.
 *
 * @delta > 0: gather that many surplus pages; fails with -ENOMEM when
 *             gathering fails, or when @delta exceeds allowed_mems_nr(),
 *             in which case the gathered pages are handed back.
 * @delta < 0: return -@delta unused surplus pages.
 * @delta == 0: nothing to do.
 *
 * Returns 0 on success, -ENOMEM on failure.
 */
static int hugetlb_acct_memory(struct hstate *h, long delta)
{
        int ret = 0;

        if (delta == 0)
                return 0;

        spin_lock_irq(&hugetlb_lock);
        /*
         * When cpuset is configured, strict hugetlb page reservation
         * breaks down because the accounting is done on a global
         * variable: the reservation is not checked against the pages
         * actually available to the current cpuset, so an application can
         * still be OOM-killed for lack of free huge pages in its cpuset.
         * Enforcing strict accounting with cpuset is almost impossible
         * (or too ugly), since tasks and memory nodes can move between
         * cpusets dynamically.
         *
         * Changing the semantics of shared hugetlb mappings under cpuset
         * is undesirable, so to preserve some of them we fall back to
         * checking against the currently free pages as a best attempt.
         *
         * Memory policy also determines which nodes the kernel allocates
         * from on a NUMA system, so the current task's policy is taken
         * into account in the same way.
         */
        if (delta < 0) {
                return_unused_surplus_pages(h, (unsigned long) -delta);
        } else if (gather_surplus_pages(h, delta) < 0) {
                ret = -ENOMEM;
        } else if (delta > allowed_mems_nr(h)) {
                return_unused_surplus_pages(h, delta);
                ret = -ENOMEM;
        }
        spin_unlock_irq(&hugetlb_lock);

        return ret;
}

/*
 * vm_ops->open for hugetlb VMAs: take a reference on an owned reservation
 * map and give sharable mappings their own vma_lock.
 */
static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
        struct resv_map *resv = vma_resv_map(vma);

        /*
         * HPAGE_RESV_OWNER indicates a private mapping.  The new VMA
         * shares its sibling's reservation map if there is one.  A valid
         * map pointer is only ever seen here when the VMA is being copied
         * from another VMA that still exists; that VMA holds a reference,
         * so the map cannot go away before this call completes and a new
         * reference can be taken without further locking.
         */
        if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
                kref_get(&resv->refs);
        }

        /*
         * The vma_lock structure of a sharable mapping is specific to its
         * VMA.  A pointer inherited through vm_area_dup belongs to another
         * VMA: clear it and allocate a new structure.  A lock that already
         * points back at this VMA is left alone and only warned about.
         */
        if (vma->vm_flags & VM_MAYSHARE) {
                struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;

                if (!vma_lock) {
                        hugetlb_vma_lock_alloc(vma);
                } else if (vma_lock->vma != vma) {
                        vma->vm_private_data = NULL;
                        hugetlb_vma_lock_alloc(vma);
                } else {
                        pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
                }
        }
}

static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
        struct hstate *h = hstate_vma(vma);
        struct resv_map *resv;
        struct hugepage_subpool *spool = subpool_vma(vma);
        unsigned long reserve, start, end;
        long gbl_reserve;

        hugetlb_vma_lock_free(vma);

        resv = vma_resv_map(vma);
        if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return;

        start = vma_hugecache_offset(h, vma, vma->vm_start);
        end = vma_hugecache_offset(h, vma, vma->vm_end);

        reserve = (end - start) - region_count(resv, start, end);
        hugetlb_cgroup_uncharge_counter(resv, start, end);
        if (reserve) {
                /*
                 * Decrement reserve counts.  The global reserve count may be
                 * adjusted if the subpool has a minimum size.
                 */
                gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
                hugetlb_acct_memory(h, -gbl_reserve);
        }

        kref_put(&resv->refs, resv_map_release);
}

/*
 * vm_ops->may_split for hugetlb VMAs.
 *
 * Returns -EINVAL when @addr is not aligned to the VMA's huge page size,
 * 0 otherwise.  When the split point is not PUD_SIZE aligned, PMDs in the
 * surrounding PUD_SIZE interval are unshared first, provided that
 * interval lies entirely inside the VMA.
 */
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long pud_start, pud_end;

        if (addr & ~(huge_page_mask(hstate_vma(vma))))
                return -EINVAL;

        /*
         * PMD sharing is only possible for PUD_SIZE-aligned address ranges
         * in HugeTLB VMAs.  A PUD_SIZE-aligned split point keeps that
         * alignment, so there is nothing to unshare.
         */
        if (!(addr & ~PUD_MASK))
                return 0;

        /*
         * This is called right before the VMA is split.  PMDs would need
         * unsharing in both the old and the new VMA, so do it once now,
         * before the split.
         */
        pud_start = addr & PUD_MASK;
        pud_end = pud_start + PUD_SIZE;

        if (pud_start >= vma->vm_start && pud_end <= vma->vm_end)
                hugetlb_unshare_pmds(vma, pud_start, pud_end);

        return 0;
}

/* vm_ops->pagesize: the huge page size of the hstate backing @vma. */
static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
        struct hstate *h = hstate_vma(vma);

        return huge_page_size(h);
}

/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we
 * get this far.
 */
static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf)
{
        BUG();
        return 0;
}

/*
 * When a new function is introduced to vm_operations_struct and added
 * to hugetlb_vm_ops, please consider adding the function to shm_vm_ops
 * as well.  Under the System V memory model, mappings created via
 * shmget/shmat with "huge page" specified are backed by hugetlbfs files,
 * and their original vm_ops are overwritten with shm_vm_ops.
 */
const struct vm_operations_struct hugetlb_vm_ops = {
        /* Must never be reached; see hugetlb_vm_op_fault(). */
        .fault = hugetlb_vm_op_fault,
        .open = hugetlb_vm_op_open,
        .close = hugetlb_vm_op_close,
        /* Rejects split points that are not huge page aligned. */
        .may_split = hugetlb_vm_op_split,
        .pagesize = hugetlb_vm_op_pagesize,
};

/*
 * Build a huge PTE for @page using @vma's page protection.
 *
 * The entry is made writable and dirty only when @try_mkwrite is set and
 * the VMA has VM_WRITE; otherwise it is write-protected.  In both cases
 * it is marked young and then passed through arch_make_huge_pte().
 */
static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                bool try_mkwrite)
{
        unsigned int shift = huge_page_shift(hstate_vma(vma));
        pte_t entry = mk_huge_pte(page, vma->vm_page_prot);

        if (try_mkwrite && (vma->vm_flags & VM_WRITE))
                entry = huge_pte_mkwrite(huge_pte_mkdirty(entry));
        else
                entry = huge_pte_wrprotect(entry);

        entry = pte_mkyoung(entry);

        return arch_make_huge_pte(entry, shift, vma->vm_flags);
}

/*
 * Make the huge PTE at @ptep dirty and writable.  update_mmu_cache() is
 * called only when huge_ptep_set_access_flags() returns non-zero.
 */
static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry = huge_ptep_get(vma->vm_mm, address, ptep);

        entry = huge_pte_mkdirty(entry);
        entry = huge_pte_mkwrite(entry);

        if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1))
                update_mmu_cache(vma, address, ptep);
}

/* Make the huge PTE writable, but only when the VMA itself has VM_WRITE. */
static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
                                         unsigned long address, pte_t *ptep)
{
        if (!(vma->vm_flags & VM_WRITE))
                return;

        set_huge_ptep_writable(vma, address, ptep);
}

/*
 * Return true when @pte is neither none nor present and the swap entry
 * decoded from it is a migration entry.
 */
bool is_hugetlb_entry_migration(pte_t pte)
{
        if (huge_pte_none(pte))
                return false;
        if (pte_present(pte))
                return false;

        return is_migration_entry(pte_to_swp_entry(pte));
}

/*
 * Return true when @pte is neither none nor present and the swap entry
 * decoded from it is a hwpoison entry.
 */
bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
        if (huge_pte_none(pte))
                return false;
        if (pte_present(pte))
                return false;

        return is_hwpoison_entry(pte_to_swp_entry(pte));
}

/*
 * Install @new_folio at @ptep / @addr in @vma.
 *
 * The folio is marked uptodate and added to the anon rmap, the new PTE is
 * built with try_mkwrite set and inherits the uffd-wp bit from @old when
 * the VMA has userfaultfd write-protection, and the mm's hugetlb counter
 * is increased by one huge page's worth of base pages.  Finally the folio
 * is flagged as migratable.
 */
static void
hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
                      struct folio *new_folio, pte_t old, unsigned long sz)
{
        struct mm_struct *mm = vma->vm_mm;
        pte_t entry = make_huge_pte(vma, &new_folio->page, true);

        __folio_mark_uptodate(new_folio);
        hugetlb_add_new_anon_rmap(new_folio, vma, addr);

        if (userfaultfd_wp(vma) && huge_pte_uffd_wp(old))
                entry = huge_pte_mkuffd_wp(entry);

        set_huge_pte_at(mm, addr, ptep, entry, sz);
        hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), mm);
        folio_set_hugetlb_migratable(new_folio);
}

/*
 * Copy the hugetlb page table entries of @src_vma (owned by @src) into
 * @dst_vma (owned by @dst), walking the source VMA one huge page at a time.
 *
 * For a COW mapping the source range is bracketed by an mmu notifier
 * invalidation and src->write_protect_seq; otherwise the source hugetlb vma
 * lock is held for read across the walk.
 *
 * Per source entry the loop handles: none (skipped), hwpoison and migration
 * swap entries, pte markers, and present pages.  A present anonymous folio
 * whose rmap cannot be duplicated is copied into a freshly allocated folio
 * with both page table locks dropped.
 *
 * Returns 0 on success, -ENOMEM when a destination page table entry cannot
 * be allocated, or the error produced by alloc_hugetlb_folio() /
 * copy_user_large_folio() on the early-copy path.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *dst_vma,
                            struct vm_area_struct *src_vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct folio *pte_folio;
        unsigned long addr;
        bool cow = is_cow_mapping(src_vma->vm_flags);
        struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
        struct mmu_notifier_range range;
        unsigned long last_addr_mask;
        int ret = 0;

        if (cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src,
                                        src_vma->vm_start,
                                        src_vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
                vma_assert_write_locked(src_vma);
                raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
                 * For shared mappings the vma lock must be held before
                 * calling hugetlb_walk() in the src vma. Otherwise, the
                 * returned ptep could go away if part of a shared pmd and
                 * another thread calls huge_pmd_unshare.
                 */
                hugetlb_vma_lock_read(src_vma);
        }

        last_addr_mask = hugetlb_mask_last_page(h);
        for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
                spinlock_t *src_ptl, *dst_ptl;
                src_pte = hugetlb_walk(src_vma, addr, sz);
                if (!src_pte) {
                        /*
                         * No source page table here.  NOTE(review): OR-ing in
                         * last_addr_mask presumably jumps to the last huge
                         * page covered by the missing table so that the loop
                         * increment steps past it - confirm against
                         * hugetlb_mask_last_page().
                         */
                        addr |= last_addr_mask;
                        continue;
                }
                dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
                }

                /*
                 * If the pagetables are shared don't copy or take references.
                 *
                 * dst_pte == src_pte is the common case of src/dest sharing.
                 * However, src could have 'unshared' and dst shares with
                 * another vma. So page_count of ptep page is checked instead
                 * to reliably determine whether pte is shared.
                 */
                if (page_count(virt_to_page(dst_pte)) > 1) {
                        addr |= last_addr_mask;
                        continue;
                }

                /* Destination lock first, then the source lock nested. */
                dst_ptl = huge_pte_lock(h, dst, dst_pte);
                src_ptl = huge_pte_lockptr(h, src, src_pte);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
again:
                if (huge_pte_none(entry)) {
                        /*
                         * Skip if src entry none.
                         */
                        ;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
                        if (!userfaultfd_wp(dst_vma))
                                entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_pte_at(dst, addr, dst_pte, entry, sz);
                } else if (unlikely(is_hugetlb_entry_migration(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
                        bool uffd_wp = pte_swp_uffd_wp(entry);

                        if (!is_readable_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
                                 */
                                swp_entry = make_readable_migration_entry(
                                                        swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
                                if (userfaultfd_wp(src_vma) && uffd_wp)
                                        entry = pte_swp_mkuffd_wp(entry);
                                set_huge_pte_at(src, addr, src_pte, entry, sz);
                        }
                        if (!userfaultfd_wp(dst_vma))
                                entry = huge_pte_clear_uffd_wp(entry);
                        set_huge_pte_at(dst, addr, dst_pte, entry, sz);
                } else if (unlikely(is_pte_marker(entry))) {
                        pte_marker marker = copy_pte_marker(
                                pte_to_swp_entry(entry), dst_vma);

                        /* A zero marker means nothing is installed in dst. */
                        if (marker)
                                set_huge_pte_at(dst, addr, dst_pte,
                                                make_pte_marker(marker), sz);
                } else {
                        entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
                        pte_folio = page_folio(pte_page(entry));
                        folio_get(pte_folio);

                        /*
                         * Failing to duplicate the anon rmap is a rare case
                         * where we see pinned hugetlb pages while they're
                         * prone to COW. We need to do the COW earlier during
                         * fork.
                         *
                         * When pre-allocating the page or copying data, we
                         * need to be without the pgtable locks since we could
                         * sleep during the process.
                         */
                        if (!folio_test_anon(pte_folio)) {
                                hugetlb_add_file_rmap(pte_folio);
                        } else if (hugetlb_try_dup_anon_rmap(pte_folio, src_vma)) {
                                pte_t src_pte_old = entry;
                                struct folio *new_folio;

                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                /* Do not use reserve as it's private owned */
                                new_folio = alloc_hugetlb_folio(dst_vma, addr, false);
                                if (IS_ERR(new_folio)) {
                                        /* Both ptlocks are already dropped here. */
                                        folio_put(pte_folio);
                                        ret = PTR_ERR(new_folio);
                                        break;
                                }
                                ret = copy_user_large_folio(new_folio, pte_folio,
                                                            addr, dst_vma);
                                folio_put(pte_folio);
                                if (ret) {
                                        /* Both ptlocks are already dropped here. */
                                        folio_put(new_folio);
                                        break;
                                }

                                /* Install the new hugetlb folio if src pte stable */
                                dst_ptl = huge_pte_lock(h, dst, dst_pte);
                                src_ptl = huge_pte_lockptr(h, src, src_pte);
                                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                                entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
                                if (!pte_same(src_pte_old, entry)) {
                                        /*
                                         * The source entry changed while the
                                         * locks were dropped: discard the copy
                                         * and re-dispatch on the new entry with
                                         * both locks held again.
                                         */
                                        restore_reserve_on_error(h, dst_vma, addr,
                                                                new_folio);
                                        folio_put(new_folio);
                                        /* huge_ptep of dst_pte won't change as in child */
                                        goto again;
                                }
                                hugetlb_install_folio(dst_vma, dst_pte, addr,
                                                      new_folio, src_pte_old, sz);
                                spin_unlock(src_ptl);
                                spin_unlock(dst_ptl);
                                continue;
                        }

                        if (cow) {
                                /*
                                 * No need to notify as we are downgrading page
                                 * table protection not changing it to point
                                 * to a new page.
                                 *
                                 * See Documentation/mm/mmu_notifier.rst
                                 */
                                huge_ptep_set_wrprotect(src, addr, src_pte);
                                entry = huge_pte_wrprotect(entry);
                        }

                        if (!userfaultfd_wp(dst_vma))
                                entry = huge_pte_clear_uffd_wp(entry);

                        set_huge_pte_at(dst, addr, dst_pte, entry, sz);
                        hugetlb_count_add(npages, dst);
                }
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
        }

        /* Undo whichever protection was set up before the walk. */
        if (cow) {
                raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        } else {
                hugetlb_vma_unlock_read(src_vma);
        }

        return ret;
}

/*
 * Move one huge PTE from @src_pte (mapping @old_addr) to @dst_pte (mapping
 * @new_addr) within @vma's mm.
 *
 * When vma_has_uffd_without_event_remap() is true for @vma, uffd-wp state is
 * not carried over: a uffd-wp marker becomes a cleared destination entry, and
 * the uffd-wp bit is stripped from present and swap entries.
 */
static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
                          unsigned long new_addr, pte_t *src_pte, pte_t *dst_pte,
                          unsigned long sz)
{
        const bool drop_uffd_wp = vma_has_uffd_without_event_remap(vma);
        struct mm_struct *mm = vma->vm_mm;
        struct hstate *h = hstate_vma(vma);
        spinlock_t *dst_lock = huge_pte_lock(h, mm, dst_pte);
        spinlock_t *src_lock = huge_pte_lockptr(h, mm, src_pte);
        const bool two_locks = (src_lock != dst_lock);
        pte_t entry;

        /*
         * We don't have to worry about the ordering of src and dst ptlocks
         * because exclusive mmap_lock (or the i_mmap_lock) prevents deadlock.
         */
        if (two_locks)
                spin_lock_nested(src_lock, SINGLE_DEPTH_NESTING);

        entry = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);

        if (!drop_uffd_wp) {
                set_huge_pte_at(mm, new_addr, dst_pte, entry, sz);
        } else if (pte_marker_uffd_wp(entry)) {
                huge_pte_clear(mm, new_addr, dst_pte, sz);
        } else {
                if (pte_present(entry))
                        entry = huge_pte_clear_uffd_wp(entry);
                else if (is_swap_pte(entry))
                        entry = pte_swp_clear_uffd_wp(entry);
                set_huge_pte_at(mm, new_addr, dst_pte, entry, sz);
        }

        if (two_locks)
                spin_unlock(src_lock);
        spin_unlock(dst_lock);
}

/*
 * Move the hugetlb page table entries covering [@old_addr, @old_addr + @len)
 * in @vma to @new_addr in @new_vma, one huge page at a time.
 *
 * The walk runs with the hugetlb vma lock and the mapping's i_mmap lock both
 * held for write, inside an mmu notifier invalidation of the (possibly
 * widened) source range.
 *
 * Returns how far the source address advanced from its initial value
 * (len + old_addr - old_end).  If huge_pte_alloc() fails part-way the loop
 * stops early and the returned value is smaller than @len.
 */
int move_hugetlb_page_tables(struct vm_area_struct *vma,
                             struct vm_area_struct *new_vma,
                             unsigned long old_addr, unsigned long new_addr,
                             unsigned long len)
{
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping = vma->vm_file->f_mapping;
        unsigned long sz = huge_page_size(h);
        struct mm_struct *mm = vma->vm_mm;
        unsigned long old_end = old_addr + len;
        unsigned long last_addr_mask;
        pte_t *src_pte, *dst_pte;
        struct mmu_notifier_range range;
        bool shared_pmd = false;

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, old_addr,
                                old_end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
        /*
         * In case of shared PMDs, we should cover the maximum possible
         * range.
         */
        flush_cache_range(vma, range.start, range.end);

        mmu_notifier_invalidate_range_start(&range);
        last_addr_mask = hugetlb_mask_last_page(h);
        /* Prevent race with file truncation */
        hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(mapping);
        for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                src_pte = hugetlb_walk(vma, old_addr, sz);
                if (!src_pte) {
                        /*
                         * No source page table: advance both addresses by the
                         * same mask so they stay in step.
                         */
                        old_addr |= last_addr_mask;
                        new_addr |= last_addr_mask;
                        continue;
                }
                /* Nothing mapped at this source address, nothing to move. */
                if (huge_pte_none(huge_ptep_get(mm, old_addr, src_pte)))
                        continue;

                if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
                        /* Remember so the wider range is flushed below. */
                        shared_pmd = true;
                        old_addr |= last_addr_mask;
                        new_addr |= last_addr_mask;
                        continue;
                }

                dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
                if (!dst_pte)
                        break;

                move_huge_pte(vma, old_addr, new_addr, src_pte, dst_pte, sz);
        }

        /*
         * old_end - len is the original old_addr; the full notifier range is
         * flushed instead when a shared PMD was unshared.
         */
        if (shared_pmd)
                flush_hugetlb_tlb_range(vma, range.start, range.end);
        else
                flush_hugetlb_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
        hugetlb_vma_unlock_write(vma);

        return len + old_addr - old_end;
}

/*
 * Unmap the hugetlb pages mapped by @vma in [@start, @end), recording the
 * work in the mmu_gather @tlb.
 *
 * @start and @end must be huge-page aligned (enforced with BUG_ON).
 * @ref_page, when non-NULL, restricts the operation to that single page: all
 * other pages are left mapped, the VMA is flagged HPAGE_RESV_UNMAPPED and the
 * walk stops once the page has been unmapped.
 * @zap_flags: when ZAP_FLAG_DROP_MARKER is not set, entries that carried
 * uffd-wp state are replaced by a PTE_MARKER_UFFD_WP marker instead of being
 * cleared outright.
 *
 * NOTE(review): the comment at the end of the function relies on the caller
 * holding i_mmap_rwsem - confirm against the callers.
 */
void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                            unsigned long start, unsigned long end,
                            struct page *ref_page, zap_flags_t zap_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        spinlock_t *ptl;
        struct page *page;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
        bool adjust_reservation = false;
        unsigned long last_addr_mask;
        bool force_flush = false;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~huge_page_mask(h));
        BUG_ON(end & ~huge_page_mask(h));

        /*
         * This is a hugetlb vma, all the pte entries should point
         * to huge page.
         */
        tlb_change_page_size(tlb, sz);
        tlb_start_vma(tlb, vma);

        last_addr_mask = hugetlb_mask_last_page(h);
        address = start;
        for (; address < end; address += sz) {
                ptep = hugetlb_walk(vma, address, sz);
                if (!ptep) {
                        address |= last_addr_mask;
                        continue;
                }

                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, address, ptep)) {
                        spin_unlock(ptl);
                        tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
                        force_flush = true;
                        address |= last_addr_mask;
                        continue;
                }

                pte = huge_ptep_get(mm, address, ptep);
                if (huge_pte_none(pte)) {
                        spin_unlock(ptl);
                        continue;
                }

                /*
                 * Migrating hugepage or HWPoisoned hugepage is already
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
                        /*
                         * If the pte was wr-protected by uffd-wp in any of the
                         * swap forms, meanwhile the caller does not want to
                         * drop the uffd-wp bit in this zap, then replace the
                         * pte with a marker.
                         */
                        if (pte_swp_uffd_wp_any(pte) &&
                            !(zap_flags & ZAP_FLAG_DROP_MARKER))
                                set_huge_pte_at(mm, address, ptep,
                                                make_pte_marker(PTE_MARKER_UFFD_WP),
                                                sz);
                        else
                                huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }

                page = pte_page(pte);
                /*
                 * If a reference page is supplied, it is because a specific
                 * page is being unmapped, not a range. Ensure the page we
                 * are about to unmap is the actual page of interest.
                 */
                if (ref_page) {
                        if (page != ref_page) {
                                spin_unlock(ptl);
                                continue;
                        }
                        /*
                         * Mark the VMA as having unmapped its page so that
                         * future faults in this VMA will fail rather than
                         * looking like data was lost
                         */
                        set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
                }

                pte = huge_ptep_get_and_clear(mm, address, ptep, sz);
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
                /* Leave a uffd-wp pte marker if needed */
                if (huge_pte_uffd_wp(pte) &&
                    !(zap_flags & ZAP_FLAG_DROP_MARKER))
                        set_huge_pte_at(mm, address, ptep,
                                        make_pte_marker(PTE_MARKER_UFFD_WP),
                                        sz);
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                hugetlb_remove_rmap(page_folio(page));

                /*
                 * The decision below is per folio.  Reset the flag on every
                 * iteration so that a folio which does not get its
                 * restore_reserve bit set never inherits the reservation
                 * adjustment decided for an earlier folio in this walk.
                 */
                adjust_reservation = false;

                /*
                 * Restore the reservation for anonymous page, otherwise the
                 * backing page could be stolen by someone.
                 * If we are freeing a surplus page, do not set the restore
                 * reservation bit.
                 */
                if (!h->surplus_huge_pages && __vma_private_lock(vma) &&
                    folio_test_anon(page_folio(page))) {
                        folio_set_hugetlb_restore_reserve(page_folio(page));
                        /* Reservation to be adjusted after the spin lock */
                        adjust_reservation = true;
                }

                spin_unlock(ptl);

                /*
                 * Adjust the reservation for the region that will have the
                 * reserve restored. Keep in mind that vma_needs_reservation() changes
                 * resv->adds_in_progress if it succeeds. If this is not done,
                 * do_exit() will not see it, and will keep the reservation
                 * forever.
                 */
                if (adjust_reservation) {
                        int rc = vma_needs_reservation(h, vma, address);

                        if (rc < 0)
                                /* Presumably allocate_file_region_entries failed
                                 * to allocate a file_region struct. Clear
                                 * hugetlb_restore_reserve so that global reserve
                                 * count will not be incremented by free_huge_folio.
                                 * Act as if we consumed the reservation.
                                 */
                                folio_clear_hugetlb_restore_reserve(page_folio(page));
                        else if (rc)
                                vma_add_reservation(h, vma, address);
                }

                tlb_remove_page_size(tlb, page, huge_page_size(h));
                /*
                 * Bail out after unmapping reference page if supplied
                 */
                if (ref_page)
                        break;
        }
        tlb_end_vma(tlb, vma);

        /*
         * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
         * could defer the flush until now, since by holding i_mmap_rwsem we
         * guaranteed that the last reference would not be dropped. But we must
         * do the flushing before we return, as otherwise i_mmap_rwsem will be
         * dropped and the last reference to the shared PMDs page might be
         * dropped as well.
         *
         * In theory we could defer the freeing of the PMD pages as well, but
         * huge_pmd_unshare() relies on the exact page_count for the PMD page to
         * detect sharing, so we cannot defer the release of the page either.
         * Instead, do flush now.
         */
        if (force_flush)
                tlb_flush_mmu_tlbonly(tlb);
}

/*
 * Take the locks needed before zapping a range of the hugetlb VMA @vma.
 *
 * @start/@end are passed to adjust_range_if_pmd_sharing_possible(), which
 * may update them in place.  On return the hugetlb vma lock and the
 * mapping's i_mmap lock are both held for write, unless @vma has no backing
 * file, in which case nothing is done.
 */
void __hugetlb_zap_begin(struct vm_area_struct *vma,
                         unsigned long *start, unsigned long *end)
{
        if (!vma->vm_file)        /* hugetlbfs_file_mmap error */
                return;

        adjust_range_if_pmd_sharing_possible(vma, start, end);
        hugetlb_vma_lock_write(vma);
        /* vm_file was verified non-NULL above; no need to test it again. */
        i_mmap_lock_write(vma->vm_file->f_mapping);
}

/*
 * Release the locks taken by __hugetlb_zap_begin() for @vma.
 *
 * @details may be NULL, in which case no zap flags are assumed.  With
 * ZAP_FLAG_UNMAP set (final unmap) the vma lock is unlocked and freed;
 * otherwise it is only unlocked.  The i_mmap lock is dropped last.  Nothing
 * is done when @vma has no backing file.
 */
void __hugetlb_zap_end(struct vm_area_struct *vma,
                       struct zap_details *details)
{
        zap_flags_t zap_flags = details ? details->zap_flags : 0;

        if (!vma->vm_file)        /* hugetlbfs_file_mmap error */
                return;

        if (zap_flags & ZAP_FLAG_UNMAP) {        /* final unmap */
                /*
                 * Unlock and free the vma lock before releasing i_mmap_rwsem.
                 * When the vma_lock is freed, this makes the vma ineligible
                 * for pmd sharing.  And, i_mmap_rwsem is required to set up
                 * pmd sharing.  This is important as page tables for this
                 * unmapped range will be asynchronously deleted.  If the page
                 * tables are shared, there will be issues when accessed by
                 * someone else.
                 */
                __hugetlb_vma_unlock_write_free(vma);
        } else {
                hugetlb_vma_unlock_write(vma);
        }

        /* vm_file was verified non-NULL above; no need to test it again. */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
}

/*
 * Unmap [@start, @end) of the hugetlb VMA @vma, providing the mmu notifier
 * bracketing and the mmu_gather that __unmap_hugepage_range() needs.
 *
 * @ref_page and @zap_flags are passed through unchanged.
 */
void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page,
                          zap_flags_t zap_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_notifier_range notifier;
        struct mmu_gather gather;

        /* The notified range may be widened when PMD sharing is possible. */
        mmu_notifier_range_init(&notifier, MMU_NOTIFY_CLEAR, 0, mm, start, end);
        adjust_range_if_pmd_sharing_possible(vma, &notifier.start,
                                             &notifier.end);
        mmu_notifier_invalidate_range_start(&notifier);
        tlb_gather_mmu(&gather, mm);

        __unmap_hugepage_range(&gather, vma, start, end, ref_page, zap_flags);

        mmu_notifier_invalidate_range_end(&notifier);
        tlb_finish_mmu(&gather);
}

/*
 * Called when the original mapper of a MAP_PRIVATE mapping, which owns the
 * reserve for @page, fails to COW it.  @page is unmapped from the other
 * VMAs of the same file so that children faulting on the same region get
 * SIGKILLed instead.
 *
 * @address is rounded down to a huge page boundary before use.
 */
static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                              struct page *page, unsigned long address)
{
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct vm_area_struct *other;
        unsigned long hpage_end;
        pgoff_t pgoff;

        address &= huge_page_mask(h);
        hpage_end = address + huge_page_size(h);

        /*
         * vm_pgoff counts PAGE_SIZE units, which is why this differs from
         * the HPAGE_SIZE based index used for page cache lookups.
         */
        pgoff = vma->vm_pgoff + ((address - vma->vm_start) >> PAGE_SHIFT);

        /*
         * Hold the mapping lock across the whole interval tree walk. The
         * mapping is expected to be shared by all the VMAs visited, and
         * __unmap_hugepage_range() is reached with the lock already held.
         */
        i_mmap_lock_write(mapping);
        vma_interval_tree_foreach(other, &mapping->i_mmap, pgoff, pgoff) {
                /*
                 * Three kinds of VMA are left alone:
                 *  - the faulting VMA itself;
                 *  - shared VMAs: they have their own reserves and do not
                 *    affect MAP_PRIVATE accounting, but may map this page;
                 *  - VMAs that own their reserves (HPAGE_RESV_OWNER).
                 *
                 * Everything else has the page unmapped and gets marked so
                 * that a later fault kills the task: a fresh no-page fault
                 * could otherwise install a zeroed page in place of the data
                 * present since fork, which would look like data corruption.
                 */
                if (other != vma &&
                    !(other->vm_flags & VM_MAYSHARE) &&
                    !is_vma_resv_set(other, HPAGE_RESV_OWNER)) {
                        unmap_hugepage_range(other, address, hpage_end,
                                             page, 0);
                }
        }
        i_mmap_unlock_write(mapping);
}

/*
 * hugetlb_wp() should be called with page lock of the original hugepage held.
 * Called with hugetlb_fault_mutex_table held and pte_page locked so we
 * cannot race with other handlers or page migration.
 * Keep the pte_same checks anyway to make transition from the mutex easier.
 *
 * Handles a write-protect (or FAULT_FLAG_UNSHARE) fault on the huge PTE
 * described by @vmf.  Depending on the state of the mapped folio the PTE is
 * either made writable in place, or the folio is copied into a newly
 * allocated one that replaces it in the page table.
 *
 * @pagecache_folio is compared against the mapped folio to decide whether
 * the allocation is a COW from the reserve owner.
 *
 * vmf->ptl is held on entry and held again on every return path; it is
 * dropped internally around allocation and copying.
 *
 * Returns 0 when the fault was handled (or a race made it moot), otherwise
 * a vm_fault_t error.
 */
static vm_fault_t hugetlb_wp(struct folio *pagecache_folio,
                       struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
        pte_t pte = huge_ptep_get(mm, vmf->address, vmf->pte);
        struct hstate *h = hstate_vma(vma);
        struct folio *old_folio;
        struct folio *new_folio;
        bool cow_from_owner = 0;
        vm_fault_t ret = 0;
        struct mmu_notifier_range range;

        /*
         * Never handle CoW for uffd-wp protected pages.  It should be only
         * handled when the uffd-wp protection is removed.
         *
         * Note that only the CoW optimization path (in hugetlb_no_page())
         * can trigger this, because hugetlb_fault() will always resolve
         * uffd-wp bit first.
         */
        if (!unshare && huge_pte_uffd_wp(pte))
                return 0;

        /* Let's take out MAP_SHARED mappings first. */
        if (vma->vm_flags & VM_MAYSHARE) {
                set_huge_ptep_writable(vma, vmf->address, vmf->pte);
                return 0;
        }

        old_folio = page_folio(pte_page(pte));

        delayacct_wpcopy_start();

retry_avoidcopy:
        /*
         * If no-one else is actually using this page, we're the exclusive
         * owner and can reuse this page.
         *
         * Note that we don't rely on the (safer) folio refcount here, because
         * copying the hugetlb folio when there are unexpected (temporary)
         * folio references could harm simple fork()+exit() users when
         * we run out of free hugetlb folios: we would have to kill processes
         * in scenarios that used to work. As a side effect, there can still
         * be leaks between processes, for example, with FOLL_GET users.
         */
        if (folio_mapcount(old_folio) == 1 && folio_test_anon(old_folio)) {
                if (!PageAnonExclusive(&old_folio->page)) {
                        folio_move_anon_rmap(old_folio, vma);
                        SetPageAnonExclusive(&old_folio->page);
                }
                /* An unshare fault must not make the PTE writable. */
                if (likely(!unshare))
                        set_huge_ptep_maybe_writable(vma, vmf->address,
                                                     vmf->pte);

                delayacct_wpcopy_end();
                return 0;
        }
        VM_BUG_ON_PAGE(folio_test_anon(old_folio) &&
                       PageAnonExclusive(&old_folio->page), &old_folio->page);

        /*
         * If the process that created a MAP_PRIVATE mapping is about to
         * perform a COW due to a shared page count, attempt to satisfy
         * the allocation without using the existing reserves. The pagecache
         * page is used to determine if the reserve at this address was
         * consumed or not. If reserves were used, a partial faulted mapping
         * at the time of fork() could consume its reserves on COW instead
         * of the full address range.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
                        old_folio != pagecache_folio)
                cow_from_owner = true;

        /* Reference held on the old folio while the ptl is dropped below. */
        folio_get(old_folio);

        /*
         * Drop page table lock as buddy allocator may be called. It will
         * be acquired again before returning to the caller, as expected.
         */
        spin_unlock(vmf->ptl);
        new_folio = alloc_hugetlb_folio(vma, vmf->address, cow_from_owner);

        if (IS_ERR(new_folio)) {
                /*
                 * If a process owning a MAP_PRIVATE mapping fails to COW,
                 * it is due to references held by a child and an insufficient
                 * huge page pool. To guarantee the original mappers
                 * reliability, unmap the page from child processes. The child
                 * may get SIGKILLed if it later faults.
                 */
                if (cow_from_owner) {
                        struct address_space *mapping = vma->vm_file->f_mapping;
                        pgoff_t idx;
                        u32 hash;

                        folio_put(old_folio);
                        /*
                         * Drop hugetlb_fault_mutex and vma_lock before
                         * unmapping.  unmapping needs to hold vma_lock
                         * in write mode.  Dropping vma_lock in read mode
                         * here is OK as COW mappings do not interact with
                         * PMD sharing.
                         *
                         * Reacquire both after unmap operation.
                         */
                        idx = vma_hugecache_offset(h, vma, vmf->address);
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
                        hugetlb_vma_unlock_read(vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);

                        unmap_ref_private(mm, vma, &old_folio->page,
                                        vmf->address);

                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        hugetlb_vma_lock_read(vma);
                        spin_lock(vmf->ptl);
                        /* Re-walk: vmf->pte is refreshed after the locks were dropped. */
                        vmf->pte = hugetlb_walk(vma, vmf->address,
                                        huge_page_size(h));
                        if (likely(vmf->pte &&
                                   pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte)))
                                goto retry_avoidcopy;
                        /*
                         * race occurs while re-acquiring page table
                         * lock, and our job is done.
                         */
                        delayacct_wpcopy_end();
                        return 0;
                }

                ret = vmf_error(PTR_ERR(new_folio));
                goto out_release_old;
        }

        /*
         * When the original hugepage is shared one, it does not have
         * anon_vma prepared.
         */
        ret = __vmf_anon_prepare(vmf);
        if (unlikely(ret))
                goto out_release_all;

        if (copy_user_large_folio(new_folio, old_folio, vmf->real_address, vma)) {
                ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h));
                goto out_release_all;
        }
        __folio_mark_uptodate(new_folio);

        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, vmf->address,
                                vmf->address + huge_page_size(h));
        mmu_notifier_invalidate_range_start(&range);

        /*
         * Retake the page table lock to check for racing updates
         * before the page tables are altered
         */
        spin_lock(vmf->ptl);
        vmf->pte = hugetlb_walk(vma, vmf->address, huge_page_size(h));
        if (likely(vmf->pte && pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), pte))) {
                /* The new PTE is writable only for a real write fault. */
                pte_t newpte = make_huge_pte(vma, &new_folio->page, !unshare);

                /* Break COW or unshare */
                huge_ptep_clear_flush(vma, vmf->address, vmf->pte);
                hugetlb_remove_rmap(old_folio);
                hugetlb_add_new_anon_rmap(new_folio, vma, vmf->address);
                if (huge_pte_uffd_wp(pte))
                        newpte = huge_pte_mkuffd_wp(newpte);
                set_huge_pte_at(mm, vmf->address, vmf->pte, newpte,
                                huge_page_size(h));
                folio_set_hugetlb_migratable(new_folio);
                /* Make the old page be freed below */
                new_folio = old_folio;
        }
        spin_unlock(vmf->ptl);
        mmu_notifier_invalidate_range_end(&range);
out_release_all:
        /*
         * No restore in case of successful pagetable update (Break COW or
         * unshare)
         */
        if (new_folio != old_folio)
                restore_reserve_on_error(h, vma, vmf->address, new_folio);
        /*
         * After a successful update new_folio aliases old_folio, so this put
         * and the one below both act on the old folio.
         */
        folio_put(new_folio);
out_release_old:
        folio_put(old_folio);

        spin_lock(vmf->ptl); /* Caller expects lock to be held */

        delayacct_wpcopy_end();
        return ret;
}

/*
 * Report whether the page cache of @vma's file holds a folio at the index
 * that backs @address.  The reference taken by the lookup is dropped before
 * returning.
 */
bool hugetlbfs_pagecache_present(struct hstate *h,
                                 struct vm_area_struct *vma, unsigned long address)
{
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct folio *found;

        found = filemap_get_folio(mapping, linear_page_index(vma, address));
        if (!IS_ERR(found)) {
                folio_put(found);
                return true;
        }

        return false;
}

/*
 * Insert @folio into the page cache of @mapping.
 *
 * @idx is given in huge page units and is scaled by the huge page order of
 * the owning inode's hstate before insertion.  The folio is marked locked
 * before it is inserted and is still locked when this function returns
 * successfully; on failure the locked flag is cleared again.
 *
 * On success the folio's restore_reserve flag is cleared, the folio is
 * marked dirty and the inode's block count is increased by one huge page.
 *
 * Return: 0 on success, otherwise the error from __filemap_add_folio().
 */
int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
                           pgoff_t idx)
{
        struct inode *host = mapping->host;
        struct hstate *hs = hstate_inode(host);
        int rc;

        __folio_set_locked(folio);
        rc = __filemap_add_folio(mapping, folio, idx << huge_page_order(hs),
                                 GFP_KERNEL, NULL);
        if (unlikely(rc)) {
                __folio_clear_locked(folio);
                return rc;
        }

        folio_clear_hugetlb_restore_reserve(folio);

        /*
         * Keep the folio dirty so that code paths which are not hugetlbfs
         * specific will not remove it from the cache/file.
         */
        folio_mark_dirty(folio);

        /* i_blocks is updated under the inode's i_lock. */
        spin_lock(&host->i_lock);
        host->i_blocks += blocks_per_huge_page(hs);
        spin_unlock(&host->i_lock);

        return 0;
}

/*
 * Release the hugetlb locks held for this fault and pass the fault on to
 * handle_userfault() with the given @reason.
 *
 * Return: whatever handle_userfault() returns.
 */
static inline vm_fault_t hugetlb_handle_userfault(struct vm_fault *vmf,
                                                  struct address_space *mapping,
                                                  unsigned long reason)
{
        const u32 slot = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);

        /*
         * Both the vma_lock and the hugetlb_fault_mutex have to be released
         * before userfault handling starts.  Handling the userfault may also
         * drop mmap_lock, so any vma operation past this point needs care.
         */
        hugetlb_vma_unlock_read(vmf->vma);
        mutex_unlock(&hugetlb_fault_mutex_table[slot]);

        return handle_userfault(vmf, reason);
}

/*
 * Re-read the huge pte at @ptep while holding its page table lock and
 * compare it with @old_pte.
 *
 * Return: true when the pte still equals @old_pte, false when it has
 * changed or is in the middle of changing.
 */
static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, unsigned long addr,
                               pte_t *ptep, pte_t old_pte)
{
        spinlock_t *lock = huge_pte_lock(h, mm, ptep);
        bool unchanged = pte_same(huge_ptep_get(mm, addr, ptep), old_pte);

        spin_unlock(lock);
        return unchanged;
}

/*
 * Handle a hugetlb fault on an address whose pte is none (or holds a pte
 * marker that is treated like none).
 *
 * @mapping: address_space backing the faulting vma
 * @vmf:     fault descriptor; vmf->pte and vmf->orig_pte are read here and
 *           must have been filled in by the caller
 *
 * The folio is looked up in the page cache.  If none is found, a new folio
 * is allocated and zeroed; for VM_MAYSHARE mappings it is added to the page
 * cache, otherwise it is mapped as a new anonymous folio.  The pte is then
 * installed under the page table lock, provided it still equals
 * vmf->orig_pte.  Faults in userfaultfd MISSING/MINOR ranges are handed to
 * hugetlb_handle_userfault() instead.
 *
 * Locking: the hugetlb fault mutex for (mapping, vmf->pgoff) and the vma
 * read lock are released on every return path, either at the "out" label
 * or inside hugetlb_handle_userfault().
 *
 * Return: 0 on success or when the pte changed under us (so the fault is
 * simply retried); otherwise VM_FAULT_* bits, including whatever
 * hugetlb_wp() returns for a write fault on a non-VM_SHARED mapping.
 */
static vm_fault_t hugetlb_no_page(struct address_space *mapping,
                        struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        struct hstate *h = hstate_vma(vma);
        vm_fault_t ret = VM_FAULT_SIGBUS;
        /* Set when the folio is to be mapped via hugetlb_add_new_anon_rmap() */
        int anon_rmap = 0;
        unsigned long size;
        struct folio *folio;
        pte_t new_pte;
        /*
         * new_folio: the folio was allocated by this call.
         * new_pagecache_folio: it was additionally added to the page cache.
         */
        bool new_folio, new_pagecache_folio = false;
        u32 hash = hugetlb_fault_mutex_hash(mapping, vmf->pgoff);

        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
         * COW/unsharing. Warn that such a situation has occurred as it may not
         * be obvious.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
                goto out;
        }

        /*
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
        new_folio = false;
        folio = filemap_lock_hugetlb_folio(h, mapping, vmf->pgoff);
        if (IS_ERR(folio)) {
                /* Faults at or beyond i_size fail with the default SIGBUS */
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                if (vmf->pgoff >= size)
                        goto out;
                /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
                        /*
                         * Since hugetlb_no_page() was examining pte
                         * without pgtable lock, we need to re-test under
                         * lock because the pte may not be stable and could
                         * have changed from under us.  Try to detect
                         * either changed or during-changing ptes and retry
                         * properly when needed.
                         *
                         * Note that userfaultfd is actually fine with
                         * false positives (e.g. caused by pte changed),
                         * but not wrong logical events (e.g. caused by
                         * reading a pte during changing).  The latter can
                         * confuse the userspace, so the strictness is very
                         * much preferred.  E.g., MISSING event should
                         * never happen on the page after UFFDIO_COPY has
                         * correctly installed the page and returned.
                         */
                        if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
                                ret = 0;
                                goto out;
                        }

                        /* Drops the vma lock and fault mutex itself */
                        return hugetlb_handle_userfault(vmf, mapping,
                                                        VM_UFFD_MISSING);
                }

                if (!(vma->vm_flags & VM_MAYSHARE)) {
                        ret = __vmf_anon_prepare(vmf);
                        if (unlikely(ret))
                                goto out;
                }

                folio = alloc_hugetlb_folio(vma, vmf->address, false);
                if (IS_ERR(folio)) {
                        /*
                         * Returning error will result in faulting task being
                         * sent SIGBUS.  The hugetlb fault mutex prevents two
                         * tasks from racing to fault in the same page which
                         * could result in false unable to allocate errors.
                         * Page migration does not take the fault mutex, but
                         * does a clear then write of pte's under page table
                         * lock.  Page fault code could race with migration,
                         * notice the clear pte and try to allocate a page
                         * here.  Before returning error, get ptl and make
                         * sure there really is no pte entry.
                         */
                        if (hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte))
                                ret = vmf_error(PTR_ERR(folio));
                        else
                                ret = 0;
                        goto out;
                }
                folio_zero_user(folio, vmf->real_address);
                __folio_mark_uptodate(folio);
                new_folio = true;

                if (vma->vm_flags & VM_MAYSHARE) {
                        int err = hugetlb_add_to_page_cache(folio, mapping,
                                                        vmf->pgoff);
                        if (err) {
                                /*
                                 * err can't be -EEXIST which implies someone
                                 * else consumed the reservation since hugetlb
                                 * fault mutex is held when adding a hugetlb
                                 * page to the page cache. So it's safe to call
                                 * restore_reserve_on_error() here.
                                 */
                                restore_reserve_on_error(h, vma, vmf->address,
                                                        folio);
                                folio_put(folio);
                                ret = VM_FAULT_SIGBUS;
                                goto out;
                        }
                        new_pagecache_folio = true;
                } else {
                        /*
                         * Lock the folio so that the common tail below can
                         * unconditionally folio_unlock() it.
                         */
                        folio_lock(folio);
                        anon_rmap = 1;
                }
        } else {
                /*
                 * If memory error occurs between mmap() and fault, some process
                 * don't have hwpoisoned swap entry for errored virtual address.
                 * So we need to block hugepage fault by PG_hwpoison bit check.
                 */
                if (unlikely(folio_test_hwpoison(folio))) {
                        ret = VM_FAULT_HWPOISON_LARGE |
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                        goto backout_unlocked;
                }

                /* Check for page in userfault range. */
                if (userfaultfd_minor(vma)) {
                        folio_unlock(folio);
                        folio_put(folio);
                        /* See comment in userfaultfd_missing() block above */
                        if (!hugetlb_pte_stable(h, mm, vmf->address, vmf->pte, vmf->orig_pte)) {
                                ret = 0;
                                goto out;
                        }
                        return hugetlb_handle_userfault(vmf, mapping,
                                                        VM_UFFD_MINOR);
                }
        }

        /*
         * If we are going to COW a private mapping later, we examine the
         * pending reservations for this page now. This will ensure that
         * any allocations necessary to record that reservation occur outside
         * the spinlock.
         */
        if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                if (vma_needs_reservation(h, vma, vmf->address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto backout_unlocked;
                }
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, vmf->address);
        }

        vmf->ptl = huge_pte_lock(h, mm, vmf->pte);
        ret = 0;
        /* If pte changed from under us, retry */
        if (!pte_same(huge_ptep_get(mm, vmf->address, vmf->pte), vmf->orig_pte))
                goto backout;

        if (anon_rmap)
                hugetlb_add_new_anon_rmap(folio, vma, vmf->address);
        else
                hugetlb_add_file_rmap(folio);
        new_pte = make_huge_pte(vma, &folio->page, vma->vm_flags & VM_SHARED);
        /*
         * If this pte was previously wr-protected, keep it wr-protected even
         * if populated.
         */
        if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
                new_pte = huge_pte_mkuffd_wp(new_pte);
        set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));

        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((vmf->flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_wp(folio, vmf);
        }

        spin_unlock(vmf->ptl);

        /*
         * Only set hugetlb_migratable in newly allocated pages.  Existing pages
         * found in the pagecache may not have hugetlb_migratable if they have
         * been isolated for migration.
         */
        if (new_folio)
                folio_set_hugetlb_migratable(folio);

        folio_unlock(folio);
out:
        /* Common exit: release the vma read lock and the fault mutex */
        hugetlb_vma_unlock_read(vma);

        /*
         * We must check to release the per-VMA lock. __vmf_anon_prepare() is
         * the only way ret can be set to VM_FAULT_RETRY.
         */
        if (unlikely(ret & VM_FAULT_RETRY))
                vma_end_read(vma);

        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        return ret;

backout:
        spin_unlock(vmf->ptl);
backout_unlocked:
        /*
         * A folio allocated here that never reached the page cache gets its
         * reservation state restored before the folio is released.
         */
        if (new_folio && !new_pagecache_folio)
                restore_reserve_on_error(h, vma, vmf->address, folio);

        folio_unlock(folio);
        folio_put(folio);
        goto out;
}

#ifdef CONFIG_SMP
/*
 * Pick the slot of the hugetlb fault mutex table that serializes faults on
 * huge page index @idx of @mapping.
 *
 * The mapping pointer and the index are hashed together with jhash2() and
 * the result is masked with (num_fault_mutexes - 1).
 */
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
        unsigned long key[2] = { (unsigned long) mapping, idx };
        u32 nwords = sizeof(key) / sizeof(u32);

        /* NOTE(review): the mask presumes num_fault_mutexes is a power of two */
        return jhash2((u32 *)&key, nwords, 0) & (num_fault_mutexes - 1);
}
#else
/*
 * Uniprocessor builds use one mutex for everything, so skip the hashing
 * cost and always select slot 0.
 */
u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
        return 0;
}
#endif

/*
 * Entry point for page faults on hugetlb vmas.
 *
 * @mm:      address space the fault occurred in
 * @vma:     hugetlb vma covering @address
 * @address: faulting virtual address; it is rounded down to a huge page
 *           boundary for vmf.address, the raw value is kept in
 *           vmf.real_address
 * @flags:   FAULT_FLAG_* bits describing the access
 *
 * Takes the hugetlb fault mutex hashed on (mapping, page offset) and the vma
 * read lock, allocates the huge pte and then dispatches on its value:
 *  - none or pte marker: poisoned and guard markers fail immediately,
 *    everything else goes to hugetlb_no_page(), which drops both locks;
 *  - migration entry: wait for the migration and return 0;
 *  - hwpoison entry: return VM_FAULT_HWPOISON_LARGE with the hstate index;
 *  - present pte: handle userfaultfd write-protect, call hugetlb_wp() for a
 *    write/unshare fault on a non-writable pte, or otherwise update the
 *    young/dirty bits of the pte.
 *
 * Return: 0 or a combination of VM_FAULT_* bits.
 */
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, unsigned int flags)
{
        vm_fault_t ret;
        u32 hash;
        struct folio *folio = NULL;
        struct folio *pagecache_folio = NULL;
        struct hstate *h = hstate_vma(vma);
        struct address_space *mapping;
        /* Set when folio_trylock() failed and we should wait before returning */
        int need_wait_lock = 0;
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & huge_page_mask(h),
                .real_address = address,
                .flags = flags,
                .pgoff = vma_hugecache_offset(h, vma,
                                address & huge_page_mask(h)),
                /* TODO: Track hugetlb faults using vm_fault */

                /*
                 * Some fields may not be initialized, be careful as it may
                 * be hard to debug if called functions make assumptions
                 */
        };

        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
        mapping = vma->vm_file->f_mapping;
        hash = hugetlb_fault_mutex_hash(mapping, vmf.pgoff);
        mutex_lock(&hugetlb_fault_mutex_table[hash]);

        /*
         * Acquire vma lock before calling huge_pte_alloc and hold
         * until finished with vmf.pte.  This prevents huge_pmd_unshare from
         * being called elsewhere and making the vmf.pte no longer valid.
         */
        hugetlb_vma_lock_read(vma);
        vmf.pte = huge_pte_alloc(mm, vma, vmf.address, huge_page_size(h));
        if (!vmf.pte) {
                hugetlb_vma_unlock_read(vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                return VM_FAULT_OOM;
        }

        /* Snapshot taken without the page table lock; rechecked under it below */
        vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
        if (huge_pte_none_mostly(vmf.orig_pte)) {
                if (is_pte_marker(vmf.orig_pte)) {
                        pte_marker marker =
                                pte_marker_get(pte_to_swp_entry(vmf.orig_pte));

                        if (marker & PTE_MARKER_POISONED) {
                                ret = VM_FAULT_HWPOISON_LARGE |
                                      VM_FAULT_SET_HINDEX(hstate_index(h));
                                goto out_mutex;
                        } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
                                /* This isn't supported in hugetlb. */
                                ret = VM_FAULT_SIGSEGV;
                                goto out_mutex;
                        }
                }

                /*
                 * Other PTE markers should be handled the same way as none PTE.
                 *
                 * hugetlb_no_page will drop vma lock and hugetlb fault
                 * mutex internally, which make us return immediately.
                 */
                return hugetlb_no_page(mapping, &vmf);
        }

        ret = 0;

        /*
         * vmf.orig_pte could be a migration/hwpoison entry at this
         * point, so this check prevents the kernel from going below assuming
         * that we have an active hugepage in pagecache. This goto expects
         * the 2nd page fault, and is_hugetlb_entry_(migration|hwpoisoned)
         * check will properly handle it.
         */
        if (!pte_present(vmf.orig_pte)) {
                if (unlikely(is_hugetlb_entry_migration(vmf.orig_pte))) {
                        /*
                         * Release the hugetlb fault lock now, but retain
                         * the vma lock, because it is needed to guard the
                         * huge_pte_lockptr() later in
                         * migration_entry_wait_huge(). The vma lock will
                         * be released there.
                         */
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        migration_entry_wait_huge(vma, vmf.address, vmf.pte);
                        return 0;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(vmf.orig_pte)))
                        ret = VM_FAULT_HWPOISON_LARGE |
                            VM_FAULT_SET_HINDEX(hstate_index(h));
                /* Any other non-present entry leaves with ret == 0 */
                goto out_mutex;
        }

        /*
         * If we are going to COW/unshare the mapping later, we examine the
         * pending reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. Also lookup the pagecache page now as it is used to
         * determine if a reservation has been consumed.
         */
        if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
            !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(vmf.orig_pte)) {
                if (vma_needs_reservation(h, vma, vmf.address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, vmf.address);

                pagecache_folio = filemap_lock_hugetlb_folio(h, mapping,
                                                             vmf.pgoff);
                /* A failed lookup is not an error here; continue without it */
                if (IS_ERR(pagecache_folio))
                        pagecache_folio = NULL;
        }

        vmf.ptl = huge_pte_lock(h, mm, vmf.pte);

        /* Check for a racing update before calling hugetlb_wp() */
        if (unlikely(!pte_same(vmf.orig_pte, huge_ptep_get(mm, vmf.address, vmf.pte))))
                goto out_ptl;

        /* Handle userfault-wp first, before trying to lock more pages */
        if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(mm, vmf.address, vmf.pte)) &&
            (flags & FAULT_FLAG_WRITE) && !huge_pte_write(vmf.orig_pte)) {
                if (!userfaultfd_wp_async(vma)) {
                        /* Drop every lock taken so far before handing off */
                        spin_unlock(vmf.ptl);
                        if (pagecache_folio) {
                                folio_unlock(pagecache_folio);
                                folio_put(pagecache_folio);
                        }
                        hugetlb_vma_unlock_read(vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        return handle_userfault(&vmf, VM_UFFD_WP);
                }

                /* Async mode: clear the uffd-wp bit in place and carry on */
                vmf.orig_pte = huge_pte_clear_uffd_wp(vmf.orig_pte);
                set_huge_pte_at(mm, vmf.address, vmf.pte, vmf.orig_pte,
                                huge_page_size(hstate_vma(vma)));
                /* Fallthrough to CoW */
        }

        /*
         * hugetlb_wp() requires page locks of pte_page(vmf.orig_pte) and
         * pagecache_folio, so here we need take the former one
         * when folio != pagecache_folio or !pagecache_folio.
         */
        folio = page_folio(pte_page(vmf.orig_pte));
        if (folio != pagecache_folio)
                if (!folio_trylock(folio)) {
                        need_wait_lock = 1;
                        goto out_ptl;
                }

        folio_get(folio);

        if (flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
                if (!huge_pte_write(vmf.orig_pte)) {
                        ret = hugetlb_wp(pagecache_folio, &vmf);
                        goto out_put_page;
                } else if (likely(flags & FAULT_FLAG_WRITE)) {
                        vmf.orig_pte = huge_pte_mkdirty(vmf.orig_pte);
                }
        }
        vmf.orig_pte = pte_mkyoung(vmf.orig_pte);
        if (huge_ptep_set_access_flags(vma, vmf.address, vmf.pte, vmf.orig_pte,
                                                flags & FAULT_FLAG_WRITE))
                update_mmu_cache(vma, vmf.address, vmf.pte);
out_put_page:
        /* pagecache_folio, if it is the same folio, is unlocked below */
        if (folio != pagecache_folio)
                folio_unlock(folio);
        folio_put(folio);
out_ptl:
        spin_unlock(vmf.ptl);

        if (pagecache_folio) {
                folio_unlock(pagecache_folio);
                folio_put(pagecache_folio);
        }
out_mutex:
        hugetlb_vma_unlock_read(vma);

        /*
         * We must check to release the per-VMA lock. __vmf_anon_prepare() in
         * hugetlb_wp() is the only way ret can be set to VM_FAULT_RETRY.
         */
        if (unlikely(ret & VM_FAULT_RETRY))
                vma_end_read(vma);

        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        /*
         * Generally it's safe to hold refcount during waiting page lock. But
         * here we just wait to defer the next page fault to avoid busy loop and
         * the page is not used after unlocked before returning from the current
         * page fault. So we are safe from accessing freed page, even if we wait
         * here without taking refcount.
         */
        if (need_wait_lock)
                folio_wait_locked(folio);
        return ret;
}

#ifdef CONFIG_USERFAULTFD
/*
 * Allocate a hugetlb folio for @address in @vma using the node and nodemask
 * that huge_node() selects for the vma.
 *
 * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
 *
 * Return: whatever alloc_hugetlb_folio_nodemask() returns.
 */
static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
                struct vm_area_struct *vma, unsigned long address)
{
        gfp_t gfp = htlb_alloc_mask(h);
        struct mempolicy *policy;
        nodemask_t *allowed;
        struct folio *tmp;
        int nid;

        nid = huge_node(vma, address, gfp, &policy, &allowed);

        /*
         * The folio allocated here is only a temporary holder for the copied
         * content; that content is later copied once more into the final
         * hugetlb folio, which is the one consuming a reservation.  Passing
         * false for alloc_fallback states that breaking the per-node hugetlb
         * pool is not allowed for this allocation.
         */
        tmp = alloc_hugetlb_folio_nodemask(h, nid, allowed, gfp, false);
        mpol_cond_put(policy);

        return tmp;
}

/*
 * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
 * with modifications for hugetlb pages.
 *
 * @dst_pte:  huge pte slot to populate
 * @dst_vma:  destination vma
 * @dst_addr: destination address the pte maps
 * @src_addr: user address the data is copied from (copy path only)
 * @flags:    MFILL_ATOMIC_* mode and modifiers; the POISON and CONTINUE modes
 *            and the WP modifier are examined here
 * @foliop:   in/out.  When *@foliop is NULL on a copy and the copy from user
 *            space fails, a temporary folio is stored here and -ENOENT is
 *            returned so the caller can fill it outside the lock and call
 *            again.  When *@foliop is non-NULL on entry, its contents are
 *            copied into a newly allocated folio, the reference is dropped
 *            and *@foliop is reset to NULL.
 *
 * Return: 0 on success, or a negative errno:
 *   -EEXIST  the pte (or, for shared mappings, the page cache) is already
 *            populated;
 *   -ENOMEM  a folio could not be allocated;
 *   -EFAULT  CONTINUE found no page cache folio, or the index lies at or
 *            beyond i_size for a shared copy;
 *   -ENOENT  see @foliop above;
 *   -EIO     the folio is hwpoisoned;
 *   or the error returned by copy_user_large_folio() or
 *   hugetlb_add_to_page_cache().
 */
int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr,
                             unsigned long src_addr,
                             uffd_flags_t flags,
                             struct folio **foliop)
{
        struct mm_struct *dst_mm = dst_vma->vm_mm;
        bool is_continue = uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE);
        bool wp_enabled = (flags & MFILL_ATOMIC_WP);
        struct hstate *h = hstate_vma(dst_vma);
        struct address_space *mapping = dst_vma->vm_file->f_mapping;
        pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
        unsigned long size = huge_page_size(h);
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
        pte_t _dst_pte;
        spinlock_t *ptl;
        int ret = -ENOMEM;
        struct folio *folio;
        /* Decides file vs. anon rmap and whether the reserve is restored on error */
        bool folio_in_pagecache = false;

        /* POISON mode only installs a marker pte; no folio is involved */
        if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
                ptl = huge_pte_lock(h, dst_mm, dst_pte);

                /* Don't overwrite any existing PTEs (even markers) */
                if (!huge_pte_none(huge_ptep_get(dst_mm, dst_addr, dst_pte))) {
                        spin_unlock(ptl);
                        return -EEXIST;
                }

                _dst_pte = make_pte_marker(PTE_MARKER_POISONED);
                set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);

                /* No need to invalidate - it was non-present before */
                update_mmu_cache(dst_vma, dst_addr, dst_pte);

                spin_unlock(ptl);
                return 0;
        }

        if (is_continue) {
                /* CONTINUE maps the folio that is already in the page cache */
                ret = -EFAULT;
                folio = filemap_lock_hugetlb_folio(h, mapping, idx);
                if (IS_ERR(folio))
                        goto out;
                folio_in_pagecache = true;
        } else if (!*foliop) {
                /* If a folio already exists, then it's UFFDIO_COPY for
                 * a non-missing case. Return -EEXIST.
                 */
                if (vm_shared &&
                    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
                        ret = -EEXIST;
                        goto out;
                }

                folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
                if (IS_ERR(folio)) {
                        ret = -ENOMEM;
                        goto out;
                }

                ret = copy_folio_from_user(folio, (const void __user *) src_addr,
                                           false);

                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
                        /* Free the allocated folio which may have
                         * consumed a reservation.
                         */
                        restore_reserve_on_error(h, dst_vma, dst_addr, folio);
                        folio_put(folio);

                        /* Allocate a temporary folio to hold the copied
                         * contents.
                         */
                        folio = alloc_hugetlb_folio_vma(h, dst_vma, dst_addr);
                        if (!folio) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        *foliop = folio;
                        /* Set the outparam foliop and return to the caller to
                         * copy the contents outside the lock. Don't free the
                         * folio.
                         */
                        goto out;
                }
        } else {
                /*
                 * Second pass: *foliop holds the data the caller copied
                 * outside the lock.  Every exit from this branch drops that
                 * temporary folio and clears *foliop.
                 */
                if (vm_shared &&
                    hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
                        folio_put(*foliop);
                        ret = -EEXIST;
                        *foliop = NULL;
                        goto out;
                }

                folio = alloc_hugetlb_folio(dst_vma, dst_addr, false);
                if (IS_ERR(folio)) {
                        folio_put(*foliop);
                        ret = -ENOMEM;
                        *foliop = NULL;
                        goto out;
                }
                ret = copy_user_large_folio(folio, *foliop, dst_addr, dst_vma);
                folio_put(*foliop);
                *foliop = NULL;
                if (ret) {
                        folio_put(folio);
                        goto out;
                }
        }

        /*
         * If we just allocated a new page, we need a memory barrier to ensure
         * that preceding stores to the page become visible before the
         * set_pte_at() write. The memory barrier inside __folio_mark_uptodate
         * is what we need.
         *
         * In the case where we have not allocated a new page (is_continue),
         * the page must already be uptodate. UFFDIO_CONTINUE already includes
         * an earlier smp_wmb() to ensure that prior stores will be visible
         * before the set_pte_at() write.
         */
        if (!is_continue)
                __folio_mark_uptodate(folio);
        else
                WARN_ON_ONCE(!folio_test_uptodate(folio));

        /* Add shared, newly allocated pages to the page cache. */
        if (vm_shared && !is_continue) {
                ret = -EFAULT;
                if (idx >= (i_size_read(mapping->host) >> huge_page_shift(h)))
                        goto out_release_nounlock;

                /*
                 * Serialization between remove_inode_hugepages() and
                 * hugetlb_add_to_page_cache() below happens through the
                 * hugetlb_fault_mutex_table that here must be held by
                 * the caller.
                 */
                ret = hugetlb_add_to_page_cache(folio, mapping, idx);
                if (ret)
                        goto out_release_nounlock;
                folio_in_pagecache = true;
        }

        ptl = huge_pte_lock(h, dst_mm, dst_pte);

        ret = -EIO;
        if (folio_test_hwpoison(folio))
                goto out_release_unlock;

        /*
         * We allow to overwrite a pte marker: consider when both MISSING|WP
         * registered, we firstly wr-protect a none pte which has no page cache
         * page backing it, then access the page.
         */
        ret = -EEXIST;
        if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
                goto out_release_unlock;

        if (folio_in_pagecache)
                hugetlb_add_file_rmap(folio);
        else
                hugetlb_add_new_anon_rmap(folio, dst_vma, dst_addr);

        /*
         * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
         * with wp flag set, don't set pte write bit.
         */
        _dst_pte = make_huge_pte(dst_vma, &folio->page,
                                 !wp_enabled && !(is_continue && !vm_shared));
        /*
         * Always mark UFFDIO_COPY page dirty; note that this may not be
         * extremely important for hugetlbfs for now since swapping is not
         * supported, but we should still be clear in that this page cannot be
         * thrown away at will, even if write bit not set.
         */
        _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);

        if (wp_enabled)
                _dst_pte = huge_pte_mkuffd_wp(_dst_pte);

        set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte, size);

        hugetlb_count_add(pages_per_huge_page(h), dst_mm);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);

        spin_unlock(ptl);
        if (!is_continue)
                folio_set_hugetlb_migratable(folio);
        /*
         * The folio is locked only if it came from the page cache lookup
         * (CONTINUE) or was added to the page cache above (shared copy).
         */
        if (vm_shared || is_continue)
                folio_unlock(folio);
        ret = 0;
out:
        return ret;
out_release_unlock:
        spin_unlock(ptl);
        if (vm_shared || is_continue)
                folio_unlock(folio);
out_release_nounlock:
        /* Folios that never reached the page cache get their reserve restored */
        if (!folio_in_pagecache)
                restore_reserve_on_error(h, dst_vma, dst_addr, folio);
        folio_put(folio);
        goto out;
}
#endif /* CONFIG_USERFAULTFD */

/*
 * hugetlb_change_protection - apply @newprot to the huge page table entries
 * that map [@address, @end) in @vma.
 *
 * @cp_flags: MM_CP_UFFD_WP asks for userfaultfd write-protection of the range
 * (none entries get a PTE_MARKER_UFFD_WP marker, allocating page tables if
 * needed); MM_CP_UFFD_WP_RESOLVE asks for that protection to be removed.
 *
 * The walk runs with the hugetlb vma lock and the mapping's i_mmap lock held
 * for write, bracketed by an mmu notifier invalidation.  Every entry is
 * examined and updated under its page table lock.
 *
 * Return: when the number of entries counted as changed is positive, that
 * count shifted left by h->order; otherwise the raw value, i.e. 0 when
 * nothing was counted or -ENOMEM when a page table could not be allocated
 * for a uffd-wp marker.
 */
long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);
        long pages = 0, psize = huge_page_size(h);
        bool shared_pmd = false;
        struct mmu_notifier_range range;
        unsigned long last_addr_mask;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;

        /*
         * In the case of shared PMDs, the area to flush could be beyond
         * start/end.  Set range.start/range.end to cover the maximum possible
         * range if PMD sharing is possible.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
                                0, mm, start, end);
        adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);

        BUG_ON(address >= end);
        flush_cache_range(vma, range.start, range.end);

        mmu_notifier_invalidate_range_start(&range);
        hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        last_addr_mask = hugetlb_mask_last_page(h);
        for (; address < end; address += psize) {
                spinlock_t *ptl;
                ptep = hugetlb_walk(vma, address, psize);
                if (!ptep) {
                        if (!uffd_wp) {
                                /*
                                 * No page table here: jump to the last huge
                                 * page it would cover so that the loop
                                 * increment moves on to the next table.
                                 */
                                address |= last_addr_mask;
                                continue;
                        }
                        /*
                         * Userfaultfd wr-protect requires pgtable
                         * pre-allocations to install pte markers.
                         */
                        ptep = huge_pte_alloc(mm, vma, address, psize);
                        if (!ptep) {
                                pages = -ENOMEM;
                                break;
                        }
                }
                ptl = huge_pte_lock(h, mm, ptep);
                if (huge_pmd_unshare(mm, vma, address, ptep)) {
                        /*
                         * When uffd-wp is enabled on the vma, unshare
                         * shouldn't happen at all.  Warn about it if it
                         * happened due to some reason.
                         */
                        WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        /* Remember to flush the widened range below. */
                        shared_pmd = true;
                        address |= last_addr_mask;
                        continue;
                }
                pte = huge_ptep_get(mm, address, ptep);
                if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
                        /* Nothing to do. */
                } else if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
                        struct page *page = pfn_swap_entry_to_page(entry);
                        pte_t newpte = pte;

                        /*
                         * A writable migration entry is downgraded to a
                         * readable one (exclusive for anon pages).
                         */
                        if (is_writable_migration_entry(entry)) {
                                if (PageAnon(page))
                                        entry = make_readable_exclusive_migration_entry(
                                                                swp_offset(entry));
                                else
                                        entry = make_readable_migration_entry(
                                                                swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
                                pages++;
                        }

                        if (uffd_wp)
                                newpte = pte_swp_mkuffd_wp(newpte);
                        else if (uffd_wp_resolve)
                                newpte = pte_swp_clear_uffd_wp(newpte);
                        /* Only write the entry back if it actually changed. */
                        if (!pte_same(pte, newpte))
                                set_huge_pte_at(mm, address, ptep, newpte, psize);
                } else if (unlikely(is_pte_marker(pte))) {
                        /*
                         * Do nothing on a poison marker; page is
                         * corrupted, permissions do not apply.  Here
                         * pte_marker_uffd_wp()==true implies !poison
                         * because they're mutually exclusive.
                         */
                        if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
                                /* Safe to modify directly (non-present->none). */
                                huge_pte_clear(mm, address, ptep, psize);
                } else if (!huge_pte_none(pte)) {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));

                        /*
                         * Remaining non-none entry: rewrite its protection
                         * bits between the modify_prot start/commit pair.
                         */
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        if (uffd_wp)
                                pte = huge_pte_mkuffd_wp(pte);
                        else if (uffd_wp_resolve)
                                pte = huge_pte_clear_uffd_wp(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
                } else {
                        /* None pte */
                        if (unlikely(uffd_wp))
                                /* Safe to modify directly (none->non-present). */
                                set_huge_pte_at(mm, address, ptep,
                                                make_pte_marker(PTE_MARKER_UFFD_WP),
                                                psize);
                }
                spin_unlock(ptl);
        }
        /*
         * Must flush TLB before releasing i_mmap_rwsem: x86's huge_pmd_unshare
         * may have cleared our pud entry and done put_page on the page table:
         * once we release i_mmap_rwsem, another task can do the final put_page
         * and that page table be reused and filled with junk.  If we actually
         * did unshare a page of pmds, flush the range corresponding to the pud.
         */
        if (shared_pmd)
                flush_hugetlb_tlb_range(vma, range.start, range.end);
        else
                flush_hugetlb_tlb_range(vma, start, end);
        /*
         * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are
         * downgrading page table protection not changing it to point to a new
         * page.
         *
         * See Documentation/mm/mmu_notifier.rst
         */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        hugetlb_vma_unlock_write(vma);
        mmu_notifier_invalidate_range_end(&range);

        /* A negative or zero count is passed through unscaled. */
        return pages > 0 ? (pages << h->order) : pages;
}

/*
 * hugetlb_reserve_pages - reserve huge pages for the range [@from, @to) of
 * @inode.
 *
 * @from and @to are used as huge page indexes: "to - from" is treated as a
 * huge page count.  @vma may be NULL, in which case the request is handled
 * like a shared mapping.  @vm_flags is consulted for VM_NORESERVE only.
 *
 * The reservation is charged, in order, to the hugetlb cgroup, the subpool
 * and the global hstate accounting; a failure at any stage unwinds the
 * earlier stages through the labels at the bottom.
 *
 * Return true if reservation was successful, false otherwise.
 */
bool hugetlb_reserve_pages(struct inode *inode,
                                        long from, long to,
                                        struct vm_area_struct *vma,
                                        vm_flags_t vm_flags)
{
        /* -1 marks "region_chg()/region_add() has not succeeded yet". */
        long chg = -1, add = -1;
        struct hstate *h = hstate_inode(inode);
        struct hugepage_subpool *spool = subpool_inode(inode);
        struct resv_map *resv_map;
        struct hugetlb_cgroup *h_cg = NULL;
        long gbl_reserve, regions_needed = 0;

        /* This should never happen */
        if (from > to) {
                VM_WARN(1, "%s called with a negative range\n", __func__);
                return false;
        }

        /*
         * vma specific semaphore used for pmd sharing and fault/truncation
         * synchronization
         */
        hugetlb_vma_lock_alloc(vma);

        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
         * without using reserves
         */
        if (vm_flags & VM_NORESERVE)
                return true;

        /*
         * Shared mappings base their reservation on the number of pages that
         * are already allocated on behalf of the file. Private mappings need
         * to reserve the full area even if read-only as mprotect() may be
         * called to make the mapping read-write. Assume !vma is a shm mapping
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
                /*
                 * resv_map can not be NULL as hugetlb_reserve_pages is only
                 * called for inodes for which resv_maps were created (see
                 * hugetlbfs_get_inode).
                 */
                resv_map = inode_resv_map(inode);

                chg = region_chg(resv_map, from, to, &regions_needed);
        } else {
                /* Private mapping. */
                resv_map = resv_map_alloc();
                if (!resv_map)
                        goto out_err;

                chg = to - from;

                /* The vma now owns the freshly allocated reserve map. */
                set_vma_resv_map(vma, resv_map);
                set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
        }

        /* region_chg() failed. */
        if (chg < 0)
                goto out_err;

        if (hugetlb_cgroup_charge_cgroup_rsvd(hstate_index(h),
                                chg * pages_per_huge_page(h), &h_cg) < 0)
                goto out_err;

        if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
                /* For private mappings, the hugetlb_cgroup uncharge info hangs
                 * off the resv_map.
                 */
                resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
        }

        /*
         * There must be enough pages in the subpool for the mapping. If
         * the subpool has a minimum size, there may be some global
         * reservations already in place (gbl_reserve).
         */
        gbl_reserve = hugepage_subpool_get_pages(spool, chg);
        if (gbl_reserve < 0)
                goto out_uncharge_cgroup;

        /*
         * Check enough hugepages are available for the reservation.
         * Hand the pages back to the subpool if there are not
         */
        if (hugetlb_acct_memory(h, gbl_reserve) < 0)
                goto out_put_pages;

        /*
         * Account for the reservations made. Shared mappings record regions
         * that have reservations as they are shared by multiple VMAs.
         * When the last VMA disappears, the region map says how much
         * the reservation was and the page cache tells how much of
         * the reservation was consumed. Private mappings are per-VMA and
         * only the consumed reservations are tracked. When the VMA
         * disappears, the original reservation is the VMA size and the
         * consumed reservations are stored in the map. Hence, nothing
         * else has to be done for private mappings here
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
                add = region_add(resv_map, from, to, regions_needed, h, h_cg);

                if (unlikely(add < 0)) {
                        /* Undo the global accounting, then unwind the rest. */
                        hugetlb_acct_memory(h, -gbl_reserve);
                        goto out_put_pages;
                } else if (unlikely(chg > add)) {
                        /*
                         * pages in this range were added to the reserve
                         * map between region_chg and region_add.  This
                         * indicates a race with alloc_hugetlb_folio.  Adjust
                         * the subpool and reserve counts modified above
                         * based on the difference.
                         */
                        long rsv_adjust;

                        /*
                         * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
                         * reference to h_cg->css. See comment below for detail.
                         */
                        hugetlb_cgroup_uncharge_cgroup_rsvd(
                                hstate_index(h),
                                (chg - add) * pages_per_huge_page(h), h_cg);

                        rsv_adjust = hugepage_subpool_put_pages(spool,
                                                                chg - add);
                        hugetlb_acct_memory(h, -rsv_adjust);
                } else if (h_cg) {
                        /*
                         * The file_regions will hold their own reference to
                         * h_cg->css. So we should release the reference held
                         * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
                         * done.
                         */
                        hugetlb_cgroup_put_rsvd_cgroup(h_cg);
                }
        }
        return true;

out_put_pages:
        /* put back original number of pages, chg */
        (void)hugepage_subpool_put_pages(spool, chg);
out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                            chg * pages_per_huge_page(h), h_cg);
out_err:
        hugetlb_vma_lock_free(vma);
        if (!vma || vma->vm_flags & VM_MAYSHARE)
                /* Only call region_abort if the region_chg succeeded but the
                 * region_add failed or didn't run.
                 */
                if (chg >= 0 && add < 0)
                        region_abort(resv_map, from, to, regions_needed);
        /* Drop the private reserve map that was attached to the vma above. */
        if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                kref_put(&resv_map->refs, resv_map_release);
                set_vma_resv_map(vma, NULL);
        }
        return false;
}

/*
 * hugetlb_unreserve_pages - drop the reservations recorded for [@start, @end)
 * of @inode and account for @freed huge pages that were released.
 *
 * Return: 0 on success, or the negative value returned by region_del() when
 * the reserve map could not be updated.
 */
long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                                long freed)
{
        struct hugepage_subpool *spool = subpool_inode(inode);
        struct resv_map *resv_map = inode_resv_map(inode);
        struct hstate *h = hstate_inode(inode);
        long released;
        long chg;

        /*
         * This can run from the evict inode path for every hugetlbfs inode,
         * so there may be no reserve map at all; treat that as "no regions
         * removed".
         *
         * region_del() itself may fail in the rare case where a region has
         * to be split and no further region descriptor can be allocated
         * (it will not fail when end == LONG_MAX).
         */
        chg = resv_map ? region_del(resv_map, start, end) : 0;
        if (chg < 0)
                return chg;

        spin_lock(&inode->i_lock);
        inode->i_blocks -= blocks_per_huge_page(h) * freed;
        spin_unlock(&inode->i_lock);

        /*
         * A subpool with a minimum size may adjust how many global
         * reservations are actually handed back.
         *
         * Without a reserve map freed is 0, so chg - freed cannot go
         * negative.
         */
        released = hugepage_subpool_put_pages(spool, chg - freed);
        hugetlb_acct_memory(h, -released);

        return 0;
}

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
/*
 * page_table_shareable - decide whether @svma's PMD page table covering file
 * offset @idx may be shared with @vma at @addr.
 *
 * Return: the address within @svma that maps @idx when sharing is possible,
 * 0 otherwise.
 */
static unsigned long page_table_shareable(struct vm_area_struct *svma,
                                struct vm_area_struct *vma,
                                unsigned long addr, pgoff_t idx)
{
        /* Address in svma that corresponds to page offset idx. */
        unsigned long saddr = svma->vm_start +
                                ((idx - svma->vm_pgoff) << PAGE_SHIFT);
        /* PUD-sized, PUD-aligned window around saddr. */
        unsigned long sbase = saddr & PUD_MASK;
        unsigned long s_end = sbase + PUD_SIZE;
        /* Segments may share even if only one of them is marked locked. */
        unsigned long flags = vma->vm_flags & ~VM_LOCKED_MASK;
        unsigned long sflags = svma->vm_flags & ~VM_LOCKED_MASK;

        /* Both addresses must select the same slot in the PMD table. */
        if (pmd_index(addr) != pmd_index(saddr))
                return 0;

        /* Permissions (modulo the locked bits) must be identical. */
        if (flags != sflags)
                return 0;

        /* The whole page table page has to lie inside svma. */
        if (!range_in_vma(svma, sbase, s_end))
                return 0;

        /* A vma_lock (vm_private_data) is required for sharing. */
        if (!svma->vm_private_data)
                return 0;

        return saddr;
}

/*
 * want_pmd_share - report whether the PMD page table covering @addr in @vma
 * is a candidate for sharing.
 */
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
        unsigned long pud_start = addr & PUD_MASK;
        unsigned long pud_end = pud_start + PUD_SIZE;

#ifdef CONFIG_USERFAULTFD
        if (uffd_disable_huge_pmd_share(vma))
                return false;
#endif
        /*
         * Needs a VM_MAYSHARE mapping that carries a vma lock
         * (vm_private_data) ...
         */
        if (!(vma->vm_flags & VM_MAYSHARE) || !vma->vm_private_data)
                return false;

        /* ... and the PUD-aligned range around addr must fit in the vma. */
        return range_in_vma(vma, pud_start, pud_end);
}

/*
 * If the [*start, *end) range inside @vma could be mapped through a shared
 * pmd, widen it to PUD_SIZE boundaries so that it covers everything such a
 * shared mapping may reach.  Otherwise the range is left untouched.
 */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
        /* Largest PUD-aligned span fully contained in the vma. */
        unsigned long pud_lo = ALIGN(vma->vm_start, PUD_SIZE);
        unsigned long pud_hi = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

        if (!(vma->vm_flags & VM_MAYSHARE))
                return;

        /* The vma has to span at least one aligned PUD_SIZE area ... */
        if (pud_hi <= pud_lo)
                return;

        /* ... and the range must at least partially fall within it. */
        if (*end <= pud_lo || *start >= pud_hi)
                return;

        /* Worst case: round both ends out to PUD alignment. */
        if (*start > pud_lo)
                *start = ALIGN_DOWN(*start, PUD_SIZE);

        if (*end < pud_hi)
                *end = ALIGN(*end, PUD_SIZE);
}

/*
 * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
 * and returns the corresponding pte. While this is not necessary for the
 * !shared pmd case because we can allocate the pmd later as well, it makes the
 * code much cleaner. pmd allocation is essential for the shared case because
 * pud has to be populated inside the same i_mmap_rwsem section - otherwise
 * racing tasks could either miss the sharing (see huge_pte_offset) or select a
 * bad pmd for sharing.
 *
 * Return: whatever pmd_alloc() yields for @addr under @pud, cast to pte_t *.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
{
        struct address_space *mapping = vma->vm_file->f_mapping;
        /* File page offset that @addr maps in @vma. */
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
        struct vm_area_struct *svma;
        unsigned long saddr;
        pte_t *spte = NULL;
        pte_t *pte;

        i_mmap_lock_read(mapping);
        /* Look at every other vma that maps the same file offset. */
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;

                saddr = page_table_shareable(svma, vma, addr, idx);
                if (saddr) {
                        spte = hugetlb_walk(svma, saddr,
                                            vma_mmu_pagesize(svma));
                        if (spte) {
                                /* Count ourselves as a sharer of that pmd page. */
                                ptdesc_pmd_pts_inc(virt_to_ptdesc(spte));
                                break;
                        }
                }
        }

        /* Nothing to share with: fall through to a plain pmd_alloc(). */
        if (!spte)
                goto out;

        spin_lock(&mm->page_table_lock);
        if (pud_none(*pud)) {
                /* Point our pud at the page that holds the found entry. */
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));
                mm_inc_nr_pmds(mm);
        } else {
                /* The pud is already populated: undo the count taken above. */
                ptdesc_pmd_pts_dec(virt_to_ptdesc(spte));
        }
        spin_unlock(&mm->page_table_lock);
out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
        i_mmap_unlock_read(mapping);
        return pte;
}

/*
 * Detach @mm from a shared pmd page table.
 *
 * Must be called with the page table lock held; the mapping's i_mmap lock
 * (write) and the hugetlb vma lock are asserted here.
 *
 * Return: 1 if a shared pte page was successfully unmapped,
 *         0 if the underlying pte page is not shared, or this is the last
 *           user.
 */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                        unsigned long addr, pte_t *ptep)
{
        unsigned long hpage_size = huge_page_size(hstate_vma(vma));
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d = p4d_offset(pgd, addr);
        pud_t *pud = pud_offset(p4d, addr);
        struct ptdesc *pt;

        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
        hugetlb_vma_assert_locked(vma);

        /* Nothing to unshare unless the vma uses PMD-sized huge pages. */
        if (hpage_size != PMD_SIZE)
                return 0;

        /* A zero share count means nobody else uses this pmd page. */
        pt = virt_to_ptdesc(ptep);
        if (!ptdesc_pmd_pts_count(pt))
                return 0;

        pud_clear(pud);
        ptdesc_pmd_pts_dec(pt);
        mm_dec_nr_pmds(mm);
        return 1;
}

#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */

/*
 * Stub used when CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING is off: no pmd page is
 * ever shared, so there is no entry to hand back.
 */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
{
        return NULL;
}

/*
 * Stub used when CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING is off: always reports
 * that nothing was unshared.
 */
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                                unsigned long addr, pte_t *ptep)
{
        return 0;
}

/*
 * Stub used when CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING is off: the range is
 * left exactly as the caller passed it.
 */
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
{
}

/*
 * Stub used when CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING is off: sharing is
 * never wanted.
 */
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
        return false;
}
#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */

#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pte_t *pte = NULL;

        pgd = pgd_offset(mm, addr);
        p4d = p4d_alloc(mm, pgd, addr);
        if (!p4d)
                return NULL;
        pud = pud_alloc(mm, p4d, addr);
        if (pud) {
                if (sz == PUD_SIZE) {
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
                        if (want_pmd_share(vma, addr) && pud_none(*pud))
                                pte = huge_pmd_share(mm, vma, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
        }

        if (pte) {
                pte_t pteval = ptep_get_lockless(pte);

                BUG_ON(pte_present(pteval) && !pte_huge(pteval));
        }

        return pte;
}

/*
 * huge_pte_offset() - Walk the page table to find the hugepage entry for
 * address @addr
 *
 * Return: Pointer to the page table entry (PUD when @sz is PUD_SIZE, PMD
 * otherwise) for address @addr.  NULL if the pgd or p4d entry is not
 * present, or if @sz is not PUD_SIZE and the pud entry is not present.
 */
pte_t *huge_pte_offset(struct mm_struct *mm,
                       unsigned long addr, unsigned long sz)
{
        pgd_t *pgd = pgd_offset(mm, addr);
        p4d_t *p4d;
        pud_t *pud;

        if (!pgd_present(*pgd))
                return NULL;

        p4d = p4d_offset(pgd, addr);
        if (!p4d_present(*p4d))
                return NULL;

        pud = pud_offset(p4d, addr);

        /* At this size the pud is pud huge, non-present or none. */
        if (sz == PUD_SIZE)
                return (pte_t *)pud;

        /* Going one level further needs a valid pud entry. */
        if (!pud_present(*pud))
                return NULL;

        /* The pmd is pmd huge, non-present or none. */
        return (pte_t *)pmd_offset(pud, addr);
}

/*
 * Return a mask that, OR-ed into an address, moves it to the last huge page
 * mapped by the same page table page.  Linear scans over an address range
 * use it to skip over page table entries that are not present.
 * Architectures whose huge page to page table relationship differs can
 * define their own version of this routine.
 */
unsigned long hugetlb_mask_last_page(struct hstate *h)
{
        const unsigned long hp_size = huge_page_size(h);

        if (hp_size == PUD_SIZE)
                return P4D_SIZE - PUD_SIZE;

        if (hp_size == PMD_SIZE)
                return PUD_SIZE - PMD_SIZE;

        return 0UL;
}

#else

/*
 * Weak default for the "last huge page in this page table page" mask;
 * architectures can provide their own version.  Only PMD-sized huge pages
 * get a non-zero mask, and only when pmd page table sharing is configured.
 */
__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
{
        unsigned long mask = 0UL;

#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
        if (huge_page_size(h) == PMD_SIZE)
                mask = PUD_SIZE - PMD_SIZE;
#endif
        return mask;
}

#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */

/**
 * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio
 * @folio: the folio to isolate
 * @list: the list to add the folio to on success
 *
 * Isolate an allocated (refcount > 0) hugetlb folio: clear its "migratable"
 * flag and move it from the list it is currently on to the tail of @list,
 * all under hugetlb_lock.
 *
 * Isolation fails if @folio is not a hugetlb folio, is not currently marked
 * migratable (for example because it is already isolated), or if no
 * reference could be taken on it.
 *
 * On success, an additional folio reference is held that must be dropped
 * using folio_putback_hugetlb() to undo the isolation.
 *
 * Return: True if isolation worked, otherwise False.
 */
bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list)
{
        bool isolated = false;

        spin_lock_irq(&hugetlb_lock);
        /* The reference is only attempted once both flag checks pass. */
        if (folio_test_hugetlb(folio) &&
            folio_test_hugetlb_migratable(folio) &&
            folio_try_get(folio)) {
                folio_clear_hugetlb_migratable(folio);
                list_move_tail(&folio->lru, list);
                isolated = true;
        }
        spin_unlock_irq(&hugetlb_lock);

        return isolated;
}

/*
 * get_hwpoison_hugetlb_folio - try to take a reference on a hugetlb folio for
 * hwpoison handling.
 *
 * *@hugetlb is set to whether @folio tested as hugetlb under hugetlb_lock.
 *
 * Return: 0 if @folio is not hugetlb or is a freed hugetlb folio; the result
 * of folio_try_get() if the folio is migratable or @unpoison is set;
 * -EBUSY otherwise.
 */
int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison)
{
        int ret = 0;

        *hugetlb = false;
        spin_lock_irq(&hugetlb_lock);
        if (!folio_test_hugetlb(folio))
                goto unlock;

        *hugetlb = true;
        /* Freed folio: report 0 without taking a reference. */
        if (folio_test_hugetlb_freed(folio))
                goto unlock;

        if (folio_test_hugetlb_migratable(folio) || unpoison)
                ret = folio_try_get(folio);
        else
                ret = -EBUSY;
unlock:
        spin_unlock_irq(&hugetlb_lock);
        return ret;
}

/*
 * Locked wrapper: runs __get_huge_page_for_hwpoison() with hugetlb_lock held
 * (interrupts disabled) and passes its return value through unchanged.
 */
int get_huge_page_for_hwpoison(unsigned long pfn, int flags,
                                bool *migratable_cleared)
{
        int rc;

        spin_lock_irq(&hugetlb_lock);
        rc = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared);
        spin_unlock_irq(&hugetlb_lock);

        return rc;
}

/**
 * folio_putback_hugetlb - unisolate a hugetlb folio
 * @folio: the isolated hugetlb folio
 *
 * Putback/un-isolate a hugetlb folio that was previously isolated using
 * folio_isolate_hugetlb(): mark it migratable again and move it to the tail
 * of its hstate's active list.
 *
 * Drops the additional folio reference obtained through
 * folio_isolate_hugetlb().
 */
void folio_putback_hugetlb(struct folio *folio)
{
        struct list_head *active;

        spin_lock_irq(&hugetlb_lock);
        folio_set_hugetlb_migratable(folio);
        active = &folio_hstate(folio)->hugepage_activelist;
        list_move_tail(&folio->lru, active);
        spin_unlock_irq(&hugetlb_lock);

        folio_put(folio);
}

/*
 * move_hugetlb_state - transfer hugetlb bookkeeping from @old_folio to
 * @new_folio after a successful migration.
 *
 * @reason is recorded as the page owner migrate reason of @new_folio.
 *
 * Besides migrating the cgroup charge, this hands the "temporary" status
 * (and, when the folios live on different nodes, the per-node surplus
 * count) over to the old folio, and finally marks the new folio migratable
 * and puts it on its hstate's active list.
 */
void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason)
{
        struct hstate *h = folio_hstate(old_folio);

        hugetlb_cgroup_migrate(old_folio, new_folio);
        set_page_owner_migrate_reason(&new_folio->page, reason);

        /*
         * transfer temporary state of the new hugetlb folio. This is
         * reverse to other transitions because the newpage is going to
         * be final while the old one will be freed so it takes over
         * the temporary status.
         *
         * Also note that we have to transfer the per-node surplus state
         * here as well otherwise the global surplus count will not match
         * the per-node's.
         */
        if (folio_test_hugetlb_temporary(new_folio)) {
                int old_nid = folio_nid(old_folio);
                int new_nid = folio_nid(new_folio);

                folio_set_hugetlb_temporary(old_folio);
                folio_clear_hugetlb_temporary(new_folio);

                /*
                 * There is no need to transfer the per-node surplus state
                 * when we do not cross the node.  Do not return early in
                 * that case though: the new folio still has to be made
                 * migratable and added to the active list below.
                 */
                if (new_nid != old_nid) {
                        spin_lock_irq(&hugetlb_lock);
                        if (h->surplus_huge_pages_node[old_nid]) {
                                h->surplus_huge_pages_node[old_nid]--;
                                h->surplus_huge_pages_node[new_nid]++;
                        }
                        spin_unlock_irq(&hugetlb_lock);
                }
        }

        /*
         * Our old folio is isolated and has "migratable" cleared until it
         * is putback. As migration succeeded, set the new folio "migratable"
         * and add it to the active list.
         */
        spin_lock_irq(&hugetlb_lock);
        folio_set_hugetlb_migratable(new_folio);
        list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist);
        spin_unlock_irq(&hugetlb_lock);
}

/*
 * hugetlb_unshare_pmds - drop @vma's use of shared pmd page tables in
 * [@start, @end), stepping through the range one PUD_SIZE at a time.
 *
 * Does nothing for vmas without VM_MAYSHARE or for an empty range.
 */
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                                   unsigned long start,
                                   unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        struct hstate *h = hstate_vma(vma);
        unsigned long hpage_size = huge_page_size(h);
        struct mmu_notifier_range range;
        unsigned long addr;

        if (!(vma->vm_flags & VM_MAYSHARE) || start >= end)
                return;

        flush_cache_range(vma, start, end);
        /*
         * adjust_range_if_pmd_sharing_possible() is not needed here: the
         * range is expected to be PUD_SIZE aligned already.
         */
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                start, end);
        mmu_notifier_invalidate_range_start(&range);
        hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (addr = start; addr < end; addr += PUD_SIZE) {
                pte_t *ptep = hugetlb_walk(vma, addr, hpage_size);
                spinlock_t *ptl;

                /* No page table at this address, nothing to unshare. */
                if (!ptep)
                        continue;

                ptl = huge_pte_lock(h, mm, ptep);
                huge_pmd_unshare(mm, vma, addr, ptep);
                spin_unlock(ptl);
        }
        flush_hugetlb_tlb_range(vma, start, end);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
        hugetlb_vma_unlock_write(vma);
        /*
         * mmu_notifier_arch_invalidate_secondary_tlbs() is not required, see
         * Documentation/mm/mmu_notifier.rst.
         */
        mmu_notifier_invalidate_range_end(&range);
}

/*
 * Unconditionally remove every shared pmd page table entry that @vma, a
 * hugetlbfs memory range, may hold.  Only the PUD_SIZE-aligned interior of
 * the vma is processed.
 */
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
        unsigned long first = ALIGN(vma->vm_start, PUD_SIZE);
        unsigned long last = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

        hugetlb_unshare_pmds(vma, first, last);
}






















  164 

   42 




  147 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
// SPDX-License-Identifier: GPL-2.0
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

/*
 * This is an implementation of the notion of "decrement a
 * reference count, and return locked if it decremented to zero".
 *
 * NOTE NOTE NOTE! This is _not_ equivalent to
 *
 *        if (atomic_dec_and_test(&atomic)) {
 *                spin_lock(&lock);
 *                return 1;
 *        }
 *        return 0;
 *
 * because the spin-lock and the decrement must be
 * "atomic".
 */
int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
        int reached_zero;

        /* Fast path: decrement without the lock as long as the old value is not 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: the final decrement has to be done with the lock held. */
        spin_lock(lock);
        reached_zero = atomic_dec_and_test(atomic);
        if (!reached_zero)
                spin_unlock(lock);

        /* Returns 1 with the lock still held, 0 with the lock released. */
        return reached_zero;
}

EXPORT_SYMBOL(_atomic_dec_and_lock);

int _atomic_dec_and_lock_irqsave(atomic_t *atomic, spinlock_t *lock,
                                 unsigned long *flags)
{
        int reached_zero;

        /* Fast path: decrement without the lock as long as the old value is not 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: take the lock (saving the IRQ state in *flags) first. */
        spin_lock_irqsave(lock, *flags);
        reached_zero = atomic_dec_and_test(atomic);
        if (!reached_zero)
                spin_unlock_irqrestore(lock, *flags);

        /* Returns 1 with the lock still held, 0 with the lock released. */
        return reached_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_lock_irqsave);

int _atomic_dec_and_raw_lock(atomic_t *atomic, raw_spinlock_t *lock)
{
        int reached_zero;

        /* Fast path: decrement without the lock as long as the old value is not 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: the final decrement has to be done with the raw lock held. */
        raw_spin_lock(lock);
        reached_zero = atomic_dec_and_test(atomic);
        if (!reached_zero)
                raw_spin_unlock(lock);

        /* Returns 1 with the lock still held, 0 with the lock released. */
        return reached_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock);

int _atomic_dec_and_raw_lock_irqsave(atomic_t *atomic, raw_spinlock_t *lock,
                                     unsigned long *flags)
{
        int reached_zero;

        /* Fast path: decrement without the lock as long as the old value is not 1. */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: take the raw lock (saving the IRQ state in *flags) first. */
        raw_spin_lock_irqsave(lock, *flags);
        reached_zero = atomic_dec_and_test(atomic);
        if (!reached_zero)
                raw_spin_unlock_irqrestore(lock, *flags);

        /* Returns 1 with the lock still held, 0 with the lock released. */
        return reached_zero;
}
EXPORT_SYMBOL(_atomic_dec_and_raw_lock_irqsave);












































































































































































































































































































































































































































































































    3 


    3 












































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h>
#include <net/inet_dscp.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_flow_table.h>

struct nft_flow_offload {
        struct nft_flowtable        *flowtable;
};

/* Pick the transmit type for @dst: XFRM if dst_xfrm() is set, NEIGH otherwise. */
static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
{
        return dst_xfrm(dst) ? FLOW_OFFLOAD_XMIT_XFRM : FLOW_OFFLOAD_XMIT_NEIGH;
}

/*
 * Fill in the default forwarding information for direction @dir from
 * @dst_cache: the route and its transmit type go into tuple[dir], while the
 * route's device becomes the input interface of the opposite direction.
 */
static void nft_default_forward_path(struct nf_flow_route *route,
                                     struct dst_entry *dst_cache,
                                     enum ip_conntrack_dir dir)
{
        const int in_ifindex = dst_cache->dev->ifindex;

        route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
        route->tuple[dir].dst = dst_cache;
        route->tuple[!dir].in.ifindex = in_ifindex;
}

/*
 * True when @dev is a non-NULL, non-loopback ARPHRD_ETHER device whose
 * address is ETH_ALEN bytes long and passes is_valid_ether_addr().
 */
static bool nft_is_valid_ether_device(const struct net_device *dev)
{
        return dev &&
               !(dev->flags & IFF_LOOPBACK) &&
               dev->type == ARPHRD_ETHER &&
               dev->addr_len == ETH_ALEN &&
               is_valid_ether_addr(dev->dev_addr);
}

/*
 * Resolve the device path stack behind @dst_cache for direction @dir.
 *
 * For valid Ethernet devices the neighbour of the peer address is looked up
 * first and its hardware address copied into @ha; the lookup must succeed and
 * the entry must be NUD_VALID, otherwise -1 is returned.  For any other device
 * the neighbour lookup is skipped and @ha is passed on as received.
 *
 * Returns the result of dev_fill_forward_path(), or -1 as described above.
 */
static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
                                     const struct dst_entry *dst_cache,
                                     const struct nf_conn *ct,
                                     enum ip_conntrack_dir dir, u8 *ha,
                                     struct net_device_path_stack *stack)
{
        const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
        struct net_device *dev = dst_cache->dev;

        if (nft_is_valid_ether_device(dev)) {
                struct neighbour *neigh;
                u8 state;

                neigh = dst_neigh_lookup(dst_cache, daddr);
                if (!neigh)
                        return -1;

                /* Snapshot state and address under the neighbour lock. */
                read_lock_bh(&neigh->lock);
                state = neigh->nud_state;
                ether_addr_copy(ha, neigh->ha);
                read_unlock_bh(&neigh->lock);
                neigh_release(neigh);

                if (!(state & NUD_VALID))
                        return -1;
        }

        return dev_fill_forward_path(dev, ha, stack);
}

/*
 * Result of walking a device path stack in nft_dev_path_info(); consumed by
 * nft_dev_forward_path() to fill in the flow route.
 */
struct nft_forward_info {
        /* last device seen on the path; NULL means the path is not usable */
        const struct net_device *indev;
        /* first VLAN/PPPoE device on the path, falls back to indev */
        const struct net_device *outdev;
        /* copy of indev taken once the walk has finished */
        const struct net_device *hw_outdev;
        /* encapsulations (VLAN/PPPoE/bridge VLAN tag) collected on the path */
        struct id {
                __u16        id;
                __be16        proto;
        } encap[NF_FLOW_TABLE_ENCAP_MAX];
        /* number of valid entries in encap[] */
        u8 num_encaps;
        /* bitmask over encap[] indexes, set for DEV_PATH_BR_VLAN_UNTAG_HW */
        u8 ingress_vlans;
        /* source MAC: address of the first device that provided one */
        u8 h_source[ETH_ALEN];
        /* destination MAC: neighbour address, or PPPoE peer when present */
        u8 h_dest[ETH_ALEN];
        /* set to FLOW_OFFLOAD_XMIT_DIRECT when the path allows direct xmit */
        enum flow_offload_xmit_type xmit_type;
};

/*
 * Translate the device path stack in @stack into @info.
 *
 * @ha is the destination hardware address used as the default for
 * info->h_dest (a PPPoE path entry overrides it).  While walking the stack the
 * function records the last device as info->indev, collects VLAN/PPPoE/bridge
 * encapsulations into info->encap[] and selects FLOW_OFFLOAD_XMIT_DIRECT when
 * a bridge is on the path or when @flowtable has hardware offload enabled and
 * the final device is a valid Ethernet device.
 *
 * info->indev == NULL on return tells the caller that the path cannot be
 * used.  The caller is expected to pass a zero-initialised @info.
 */
static void nft_dev_path_info(const struct net_device_path_stack *stack,
                              struct nft_forward_info *info,
                              unsigned char *ha, struct nf_flowtable *flowtable)
{
        const struct net_device_path *path;
        int i;

        memcpy(info->h_dest, ha, ETH_ALEN);

        for (i = 0; i < stack->num_paths; i++) {
                path = &stack->path[i];
                switch (path->type) {
                case DEV_PATH_ETHERNET:
                case DEV_PATH_DSA:
                case DEV_PATH_VLAN:
                case DEV_PATH_PPPOE:
                        info->indev = path->dev;
                        if (is_zero_ether_addr(info->h_source))
                                memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

                        if (path->type == DEV_PATH_ETHERNET)
                                break;
                        if (path->type == DEV_PATH_DSA) {
                                /* Terminate the walk at the DSA entry. */
                                i = stack->num_paths;
                                break;
                        }

                        /* DEV_PATH_VLAN and DEV_PATH_PPPOE */
                        if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
                                info->indev = NULL;
                                break;
                        }
                        if (!info->outdev)
                                info->outdev = path->dev;
                        info->encap[info->num_encaps].id = path->encap.id;
                        info->encap[info->num_encaps].proto = path->encap.proto;
                        info->num_encaps++;
                        if (path->type == DEV_PATH_PPPOE)
                                memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
                        break;
                case DEV_PATH_BRIDGE:
                        if (is_zero_ether_addr(info->h_source))
                                memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);

                        switch (path->bridge.vlan_mode) {
                        case DEV_PATH_BR_VLAN_UNTAG_HW:
                                /*
                                 * Without a collected encapsulation there is
                                 * no index to flag; BIT(-1) would be an
                                 * undefined shift.
                                 */
                                if (info->num_encaps)
                                        info->ingress_vlans |= BIT(info->num_encaps - 1);
                                break;
                        case DEV_PATH_BR_VLAN_TAG:
                                /*
                                 * encap[] is full: give up on this path rather
                                 * than writing past the array.  Return right
                                 * away so that a later path entry cannot
                                 * repopulate info->indev.
                                 */
                                if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
                                        info->indev = NULL;
                                        return;
                                }
                                info->encap[info->num_encaps].id = path->bridge.vlan_id;
                                info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
                                info->num_encaps++;
                                break;
                        case DEV_PATH_BR_VLAN_UNTAG:
                                /*
                                 * num_encaps is a u8: decrementing it from 0
                                 * would wrap to 255 and later be used as a
                                 * loop bound over encap[].  Treat the path as
                                 * unusable instead.
                                 */
                                if (!info->num_encaps) {
                                        info->indev = NULL;
                                        return;
                                }
                                info->num_encaps--;
                                break;
                        case DEV_PATH_BR_VLAN_KEEP:
                                break;
                        }
                        info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
                        break;
                default:
                        info->indev = NULL;
                        break;
                }
        }
        if (!info->outdev)
                info->outdev = info->indev;

        info->hw_outdev = info->indev;

        if (nf_flowtable_hw_offload(flowtable) &&
            nft_is_valid_ether_device(info->indev))
                info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
}

/* Report whether @dev is the device of one of the hooks on @ft's hook list. */
static bool nft_flowtable_find_dev(const struct net_device *dev,
                                   struct nft_flowtable *ft)
{
        struct nft_hook *entry;

        list_for_each_entry_rcu(entry, &ft->hook_list, list) {
                if (entry->ops.dev == dev)
                        return true;
        }

        return false;
}

/*
 * Refine the default route of direction @dir with the information found by
 * walking the device path behind its dst.
 *
 * Nothing is changed when the path cannot be resolved, yields no input device,
 * or that device is not hooked to flowtable @ft.  Otherwise the input side of
 * the opposite direction is updated, and the output side of @dir is switched
 * to direct transmit when the path walk asked for it.
 */
static void nft_dev_forward_path(struct nf_flow_route *route,
                                 const struct nf_conn *ct,
                                 enum ip_conntrack_dir dir,
                                 struct nft_flowtable *ft)
{
        const struct dst_entry *dst = route->tuple[dir].dst;
        struct net_device_path_stack stack;
        struct nft_forward_info info = {};
        unsigned char ha[ETH_ALEN];
        int n;

        if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) < 0)
                return;

        nft_dev_path_info(&stack, &info, ha, &ft->data);
        if (!info.indev)
                return;
        if (!nft_flowtable_find_dev(info.indev, ft))
                return;

        /* Input side of the reverse direction. */
        route->tuple[!dir].in.ifindex = info.indev->ifindex;
        route->tuple[!dir].in.num_encaps = info.num_encaps;
        route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
        for (n = 0; n < info.num_encaps; n++) {
                route->tuple[!dir].in.encap[n].id = info.encap[n].id;
                route->tuple[!dir].in.encap[n].proto = info.encap[n].proto;
        }

        if (info.xmit_type != FLOW_OFFLOAD_XMIT_DIRECT)
                return;

        /* Output side of this direction: bypass the neighbour layer. */
        memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
        memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
        route->tuple[dir].out.ifindex = info.outdev->ifindex;
        route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
        route->tuple[dir].xmit_type = info.xmit_type;
}

/*
 * Build the route description for both directions of connection @ct.
 *
 * The direction @dir reuses the dst already attached to the packet; the
 * opposite direction is obtained with a route lookup towards the source
 * address of this direction's tuple.
 *
 * Returns 0 on success, with one reference on each dst stored in @route, or
 * -ENOENT when the packet's dst cannot be held or the reverse route lookup
 * yields nothing (no reference is kept in that case).
 */
static int nft_flow_route(const struct nft_pktinfo *pkt,
                          const struct nf_conn *ct,
                          struct nf_flow_route *route,
                          enum ip_conntrack_dir dir,
                          struct nft_flowtable *ft)
{
        struct dst_entry *this_dst = skb_dst(pkt->skb);
        struct dst_entry *other_dst = NULL;
        struct flowi fl;

        /* Describe the reverse flow: addresses and interfaces are swapped. */
        memset(&fl, 0, sizeof(fl));
        switch (nft_pf(pkt)) {
        case NFPROTO_IPV4:
                fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
                fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
                fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
                fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
                fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb)));
                fl.u.ip4.flowi4_mark = pkt->skb->mark;
                fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
                break;
        case NFPROTO_IPV6:
                fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
                fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6;
                fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
                fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
                fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
                fl.u.ip6.flowi6_mark = pkt->skb->mark;
                fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC;
                break;
        }

        /* Take our own reference on the packet's dst before storing it. */
        if (!dst_hold_safe(this_dst))
                return -ENOENT;

        nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt));
        if (!other_dst) {
                dst_release(this_dst);
                return -ENOENT;
        }

        nft_default_forward_path(route, this_dst, dir);
        nft_default_forward_path(route, other_dst, !dir);

        /* Device path discovery only applies when both sides use neigh xmit. */
        if (route->tuple[dir].xmit_type        == FLOW_OFFLOAD_XMIT_NEIGH &&
            route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
                nft_dev_forward_path(route, ct, dir, ft);
                nft_dev_forward_path(route, ct, !dir, ft);
        }

        return 0;
}

/*
 * Decide whether @skb must not be offloaded: packets carrying a secpath and
 * IPv4 packets with IP options are skipped.
 */
static bool nft_flow_offload_skip(struct sk_buff *skb, int family)
{
        if (skb_sec_path(skb))
                return true;

        if (family != NFPROTO_IPV4)
                return false;

        if (unlikely(IPCB(skb)->opt.optlen))
                return true;

        return false;
}

/*
 * Once a flow is offloaded conntrack no longer sees every packet, so TCP
 * window validation is relaxed by flagging both seen[] entries as liberal.
 */
static void flow_offload_ct_tcp(struct nf_conn *ct)
{
        unsigned int idx;

        spin_lock_bh(&ct->lock);
        for (idx = 0; idx < 2; idx++)
                ct->proto.tcp.seen[idx].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
        spin_unlock_bh(&ct->lock);
}

/*
 * Evaluate the "flow_offload" expression for one packet.
 *
 * If the packet belongs to an eligible, confirmed conntrack entry that is not
 * offloaded yet, a flow entry is allocated, given its route and added to the
 * expression's flowtable.  On success the function returns without touching
 * the verdict; on every other path the verdict is set to NFT_BREAK.
 */
static void nft_flow_offload_eval(const struct nft_expr *expr,
                                  struct nft_regs *regs,
                                  const struct nft_pktinfo *pkt)
{
        struct nft_flow_offload *priv = nft_expr_priv(expr);
        struct nf_flowtable *flowtable = &priv->flowtable->data;
        struct tcphdr _tcph, *tcph = NULL;
        struct nf_flow_route route = {};
        enum ip_conntrack_info ctinfo;
        struct flow_offload *flow;
        enum ip_conntrack_dir dir;
        struct nf_conn *ct;
        int ret;

        if (nft_flow_offload_skip(pkt->skb, nft_pf(pkt)))
                goto out;

        ct = nf_ct_get(pkt->skb, &ctinfo);
        if (!ct)
                goto out;

        /* Only TCP, UDP and (optionally) GRE connections are offloaded. */
        switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) {
        case IPPROTO_TCP:
                tcph = skb_header_pointer(pkt->skb, nft_thoff(pkt),
                                          sizeof(_tcph), &_tcph);
                /* TCP: established connections only, never on FIN/RST. */
                if (unlikely(!tcph || tcph->fin || tcph->rst ||
                             !nf_conntrack_tcp_established(ct)))
                        goto out;
                break;
        case IPPROTO_UDP:
                break;
#ifdef CONFIG_NF_CT_PROTO_GRE
        case IPPROTO_GRE: {
                struct nf_conntrack_tuple *tuple;

                /* GRE: no NAT and no keys in the original tuple. */
                if (ct->status & IPS_NAT_MASK)
                        goto out;
                tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
                /* No support for GRE v1 */
                if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
                        goto out;
                break;
        }
#endif
        default:
                goto out;
        }

        /* Connections with a helper, sequence adjustment or NAT clash stay in the slow path. */
        if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
            ct->status & (IPS_SEQ_ADJUST | IPS_NAT_CLASH))
                goto out;

        if (!nf_ct_is_confirmed(ct))
                goto out;

        /* Claim the conntrack entry; bail out if the bit was already set. */
        if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
                goto out;

        dir = CTINFO2DIR(ctinfo);
        if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
                goto err_flow_route;

        flow = flow_offload_alloc(ct);
        if (!flow)
                goto err_flow_alloc;

        flow_offload_route_init(flow, &route);
        if (tcph)
                flow_offload_ct_tcp(ct);

        __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
        ret = flow_offload_add(flowtable, flow);
        if (ret < 0)
                goto err_flow_add;

        return;

        /* Error unwinding: undo the steps above in reverse order. */
err_flow_add:
        flow_offload_free(flow);
err_flow_alloc:
        dst_release(route.tuple[dir].dst);
        dst_release(route.tuple[!dir].dst);
err_flow_route:
        clear_bit(IPS_OFFLOAD_BIT, &ct->status);
out:
        regs->verdict.code = NFT_BREAK;
}

/*
 * Restrict the expression to the IPv4, IPv6 and inet families (anything else
 * yields -EOPNOTSUPP) and to chains attached to the forward hook.
 */
static int nft_flow_offload_validate(const struct nft_ctx *ctx,
                                     const struct nft_expr *expr)
{
        const unsigned int allowed_hooks = 1 << NF_INET_FORWARD;

        switch (ctx->family) {
        case NFPROTO_IPV4:
        case NFPROTO_IPV6:
        case NFPROTO_INET:
                break;
        default:
                return -EOPNOTSUPP;
        }

        return nft_chain_validate_hooks(ctx->chain, allowed_hooks);
}

/*
 * Netlink attribute policy: the flowtable name is a string of at most
 * NFT_NAME_MAXLEN - 1 bytes.
 */
static const struct nla_policy nft_flow_offload_policy[NFTA_FLOW_MAX + 1] = {
        [NFTA_FLOW_TABLE_NAME]        = { .type = NLA_STRING,
                                    .len = NFT_NAME_MAXLEN - 1 },
};

/*
 * Initialise a "flow_offload" expression from its netlink attributes.
 *
 * Looks up the flowtable named by NFTA_FLOW_TABLE_NAME in the table of @ctx,
 * takes a use reference on it, stores it in the expression's private data and
 * takes a conntrack netns reference for the context's family.
 *
 * Returns 0 on success, -EINVAL when the name attribute is missing, the
 * lookup error when the flowtable does not exist, -EMFILE when the use
 * counter cannot be incremented, or the error from nf_ct_netns_get().
 */
static int nft_flow_offload_init(const struct nft_ctx *ctx,
                                 const struct nft_expr *expr,
                                 const struct nlattr * const tb[])
{
        struct nft_flow_offload *priv = nft_expr_priv(expr);
        u8 genmask = nft_genmask_next(ctx->net);
        struct nft_flowtable *flowtable;
        int err;

        if (!tb[NFTA_FLOW_TABLE_NAME])
                return -EINVAL;

        flowtable = nft_flowtable_lookup(ctx->net, ctx->table,
                                         tb[NFTA_FLOW_TABLE_NAME], genmask);
        if (IS_ERR(flowtable))
                return PTR_ERR(flowtable);

        if (!nft_use_inc(&flowtable->use))
                return -EMFILE;

        priv->flowtable = flowtable;

        err = nf_ct_netns_get(ctx->net, ctx->family);
        if (err < 0) {
                /*
                 * Drop the use reference taken above so a failed init does
                 * not pin the flowtable.
                 * NOTE(review): assumes the nf_tables core does not call
                 * ->deactivate/->destroy after ->init fails -- confirm.
                 */
                nft_use_dec_restore(&flowtable->use);
                return err;
        }

        return err;
}

/* Hand the expression's flowtable to the core for deactivation in @phase. */
static void nft_flow_offload_deactivate(const struct nft_ctx *ctx,
                                        const struct nft_expr *expr,
                                        enum nft_trans_phase phase)
{
        struct nft_flow_offload *offload = nft_expr_priv(expr);
        struct nft_flowtable *ft = offload->flowtable;

        nf_tables_deactivate_flowtable(ctx, ft, phase);
}

static void nft_flow_offload_activate(const struct nft_ctx *ctx,
                                      const struct nft_expr *expr)
{
        struct nft_flow_offload *priv = nft_expr_priv(expr);

        nft_use_inc_restore(&priv->flowtable->use);
}

/* Release the conntrack netns reference taken by nft_flow_offload_init(). */
static void nft_flow_offload_destroy(const struct nft_ctx *ctx,
                                     const struct nft_expr *expr)
{
        nf_ct_netns_put(ctx->net, ctx->family);
}

/*
 * Dump the expression to netlink: emits the flowtable name attribute.
 * Returns 0 on success, -1 when the attribute cannot be added to @skb.
 */
static int nft_flow_offload_dump(struct sk_buff *skb,
                                 const struct nft_expr *expr, bool reset)
{
        struct nft_flow_offload *offload = nft_expr_priv(expr);
        const char *ft_name = offload->flowtable->name;

        if (nla_put_string(skb, NFTA_FLOW_TABLE_NAME, ft_name))
                return -1;

        return 0;
}

/* Forward declaration: the ops below and the type refer to each other. */
static struct nft_expr_type nft_flow_offload_type;
/* Callback table wiring the functions above into the nf_tables core. */
static const struct nft_expr_ops nft_flow_offload_ops = {
        .type                = &nft_flow_offload_type,
        .size                = NFT_EXPR_SIZE(sizeof(struct nft_flow_offload)),
        .eval                = nft_flow_offload_eval,
        .init                = nft_flow_offload_init,
        .activate        = nft_flow_offload_activate,
        .deactivate        = nft_flow_offload_deactivate,
        .destroy        = nft_flow_offload_destroy,
        .validate        = nft_flow_offload_validate,
        .dump                = nft_flow_offload_dump,
        .reduce                = NFT_REDUCE_READONLY,
};

/* Expression type registered under the name "flow_offload". */
static struct nft_expr_type nft_flow_offload_type __read_mostly = {
        .name                = "flow_offload",
        .ops                = &nft_flow_offload_ops,
        .policy                = nft_flow_offload_policy,
        .maxattr        = NFTA_FLOW_MAX,
        .owner                = THIS_MODULE,
};

/*
 * Netdevice notifier callback: on NETDEV_DOWN, clean up the flow table state
 * for the affected device.  All events are answered with NOTIFY_DONE.
 */
static int flow_offload_netdev_event(struct notifier_block *this,
                                     unsigned long event, void *ptr)
{
        struct net_device *netdev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_DOWN)
                nf_flow_table_cleanup(netdev);

        return NOTIFY_DONE;
}

/* Notifier block registered at module init and unregistered at module exit. */
static struct notifier_block flow_offload_netdev_notifier = {
        .notifier_call        = flow_offload_netdev_event,
};

/*
 * Module init: register the netdevice notifier, then the expression type.
 * The notifier is unregistered again if the expression cannot be registered.
 */
static int __init nft_flow_offload_module_init(void)
{
        int ret;

        ret = register_netdevice_notifier(&flow_offload_netdev_notifier);
        if (ret)
                return ret;

        ret = nft_register_expr(&nft_flow_offload_type);
        if (ret < 0) {
                unregister_netdevice_notifier(&flow_offload_netdev_notifier);
                return ret;
        }

        return 0;
}

/* Module exit: undo nft_flow_offload_module_init() in reverse order. */
static void __exit nft_flow_offload_module_exit(void)
{
        nft_unregister_expr(&nft_flow_offload_type);
        unregister_netdevice_notifier(&flow_offload_netdev_notifier);
}

/* Module entry points. */
module_init(nft_flow_offload_module_init);
module_exit(nft_flow_offload_module_exit);

/* Module metadata; the alias matches the "flow_offload" expression name. */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_EXPR("flow_offload");
MODULE_DESCRIPTION("nftables hardware flow offload module");













































































































































































































































































































































































































































































































































































































    3 



    3 


































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface
 *
 * Copyright (c) 2019 Volkswagen Group Electronic Research
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Volkswagen nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * Alternatively, provided that this notice is retained in full, this
 * software may be distributed under the terms of the GNU General
 * Public License ("GPL") version 2, in which case the provisions of the
 * GPL apply INSTEAD OF those given above.
 *
 * The provided data structures and external interfaces from this code
 * are not restricted to be used by modules with a GPL compatible license.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/gw.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>

/* module name string; reused below for the module alias and inside the
 * help text of the 'max_hops' parameter
 */
#define CAN_GW_NAME "can-gw"

MODULE_DESCRIPTION("PF_CAN netlink gateway");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
MODULE_ALIAS(CAN_GW_NAME);

/* documented value range and default of the 'max_hops' module parameter */
#define CGW_MIN_HOPS 1
#define CGW_MAX_HOPS 6
#define CGW_DEFAULT_HOPS 1

/* Upper bound for the per-frame hop counter: can_can_gw_rcv() deletes frames
 * whose hop counter has reached this value, and cgw_parse_attr() rejects
 * per-job hop limits above it. Exported with permissions 0444.
 */
static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS;
module_param(max_hops, uint, 0444);
MODULE_PARM_DESC(max_hops,
                 "maximum " CAN_GW_NAME " routing hops for CAN frames "
                 "(valid values: " __stringify(CGW_MIN_HOPS) "-"
                 __stringify(CGW_MAX_HOPS) " hops, "
                 "default: " __stringify(CGW_DEFAULT_HOPS) ")");

/* netdevice notifier block of this module; its setup is not part of this
 * section of the file
 */
static struct notifier_block notifier;
/* slab cache that struct cgw_job objects are allocated from and freed to */
static struct kmem_cache *cgw_cache __read_mostly;

/* structure that contains the (on-the-fly) CAN frame modifications */
struct cf_mod {
        /* operand frames for the four operations, filled in by
         * cgw_parse_attr() from the netlink attributes
         */
        struct {
                struct canfd_frame and;
                struct canfd_frame or;
                struct canfd_frame xor;
                struct canfd_frame set;
        } modframe;
        /* CGW_MOD_* bit masks as received from user space: which frame
         * elements each operation is applied to (0 = operation unused)
         */
        struct {
                u8 and;
                u8 or;
                u8 xor;
                u8 set;
        } modtype;
        /* modification functions in execution order; the list ends at the
         * first NULL entry or after MAX_MODFUNCTIONS entries
         */
        void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf,
                                          struct cf_mod *mod);

        /* CAN frame checksum calculation after CAN frame modifications */
        struct {
                struct cgw_csum_xor xor;
                struct cgw_csum_crc8 crc8;
        } csum;
        /* selected checksum handlers; NULL when the respective checksum
         * is not configured
         */
        struct {
                void (*xor)(struct canfd_frame *cf,
                            struct cgw_csum_xor *xor);
                void (*crc8)(struct canfd_frame *cf,
                             struct cgw_csum_crc8 *crc8);
        } csumfunc;
        /* user supplied identifier (CGW_MOD_UID); a non-zero value lets
         * cgw_create_job() find an existing job to update
         */
        u32 uid;
};

/* So far we just support CAN -> CAN routing and frame modifications.
 *
 * The internal can_can_gw structure contains data and attributes for
 * a CAN -> CAN gateway job.
 */
struct can_can_gw {
        /* CAN id/mask filter used when registering the receive callback */
        struct can_filter filter;
        /* interface index of the source device (CGW_SRC_IF) */
        int src_idx;
        /* interface index of the destination device (CGW_DST_IF) */
        int dst_idx;
};

/* list entry for CAN gateways jobs */
struct cgw_job {
        /* linkage in the per-netns job list (net->can.cgw_list) */
        struct hlist_node list;
        /* used to defer freeing of the job via call_rcu() */
        struct rcu_head rcu;
        /* frames successfully handed to can_send() */
        u32 handled_frames;
        /* frames not forwarded: destination down, no memory, send error */
        u32 dropped_frames;
        /* frames discarded due to misconfiguration: hop limit reached or
         * modified length not fitting the frame
         */
        u32 deleted_frames;
        /* frame modifications and checksum updates of this job */
        struct cf_mod mod;
        union {
                /* CAN frame data source */
                struct net_device *dev;
        } src;
        union {
                /* CAN frame data destination */
                struct net_device *dev;
        } dst;
        union {
                struct can_can_gw ccgw;
                /* tbc */
        };
        /* gateway type as reported in rtcanmsg.gwtype */
        u8 gwtype;
        /* job specific hop limit (CGW_LIM_HOPS), 0 = not set */
        u8 limit_hops;
        /* CGW_FLAGS_* as given in rtcanmsg.flags */
        u16 flags;
};

/* modification functions that are invoked in the hot path in can_can_gw_rcv */

/* MODFUNC(func, op) - generate one frame modification function
 *
 * Expands to 'static void func(struct canfd_frame *cf, struct cf_mod *mod)'
 * whose body is the single statement 'op'. The generated functions have the
 * signature of the entries in cf_mod.modfunc[].
 */
#define MODFUNC(func, op) static void func(struct canfd_frame *cf, \
                                           struct cf_mod *mod) { op ; }

/* AND / OR / XOR / SET on can_id, len, flags and on the first 8 data bytes,
 * the latter being accessed as one u64 value
 */
MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id)
MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len)
MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags)
MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data)
MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id)
MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len)
MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags)
MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data)
MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id)
MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len)
MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags)
MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data)
MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id)
MODFUNC(mod_set_len, cf->len = mod->modframe.set.len)
MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags)
MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data)

static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod)
{
        int i;

        for (i = 0; i < CANFD_MAX_DLEN; i += 8)
                *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i);
}

static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod)
{
        int i;

        for (i = 0; i < CANFD_MAX_DLEN; i += 8)
                *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i);
}

static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod)
{
        int i;

        for (i = 0; i < CANFD_MAX_DLEN; i += 8)
                *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i);
}

/* SET: overwrite the complete CAN FD payload (CANFD_MAX_DLEN bytes) of @cf
 * with the payload of the 'set' operand frame of @mod
 */
static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod)
{
        memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN);
}

/* retrieve valid CC DLC value and store it into 'len' */
static void mod_retrieve_ccdlc(struct canfd_frame *cf)
{
        struct can_frame *frame = (struct can_frame *)cf;
        unsigned int raw_dlc = frame->len8_dlc;

        /* len8_dlc only carries information for frames with len ==
         * CAN_MAX_DLEN and is only taken over when it is a valid raw DLC
         * value from 9 .. 15; in every other case 'len' stays untouched
         */
        if (frame->len == CAN_MAX_DLEN &&
            raw_dlc > CAN_MAX_DLEN && raw_dlc <= CAN_MAX_RAW_DLC)
                frame->len = raw_dlc;
}

/* convert valid CC DLC value in 'len' into struct can_frame elements */
static void mod_store_ccdlc(struct canfd_frame *cf)
{
        struct can_frame *frame = (struct can_frame *)cf;

        /* start without any raw DLC information */
        frame->len8_dlc = 0;

        /* Only a raw DLC from 9 .. 15 found in 'len' needs to be split into
         * len8_dlc + len. Plain lengths 0 .. 8 are already correct and
         * potentially broken values (> CAN_MAX_RAW_DLC) are left as they
         * are, as these are caught in can_can_gw_rcv().
         */
        if (frame->len > CAN_MAX_DLEN && frame->len <= CAN_MAX_RAW_DLC) {
                frame->len8_dlc = frame->len;
                frame->len = CAN_MAX_DLEN;
        }
}

/* AND operation on the Classic CAN DLC: fold a valid len8_dlc into 'len',
 * apply the 'and' operand to 'len' and split the result back into the
 * len / len8_dlc elements of struct can_frame
 */
static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
{
        mod_retrieve_ccdlc(cf);
        mod_and_len(cf, mod);
        mod_store_ccdlc(cf);
}

/* OR operation on the Classic CAN DLC: fold a valid len8_dlc into 'len',
 * apply the 'or' operand to 'len' and split the result back into the
 * len / len8_dlc elements of struct can_frame
 */
static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
{
        mod_retrieve_ccdlc(cf);
        mod_or_len(cf, mod);
        mod_store_ccdlc(cf);
}

/* XOR operation on the Classic CAN DLC: fold a valid len8_dlc into 'len',
 * apply the 'xor' operand to 'len' and split the result back into the
 * len / len8_dlc elements of struct can_frame
 */
static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
{
        mod_retrieve_ccdlc(cf);
        mod_xor_len(cf, mod);
        mod_store_ccdlc(cf);
}

/* SET operation on the Classic CAN DLC: 'len' is overwritten with the 'set'
 * operand, so the received DLC does not need to be retrieved first; the new
 * value is then split into the len / len8_dlc elements of struct can_frame
 */
static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
{
        mod_set_len(cf, mod);
        mod_store_ccdlc(cf);
}

/* Copy a Classic CAN operand frame @src (as received from user space) into
 * the CAN FD sized operand storage @dst. Only can_id, len and the first
 * 8 data bytes are written; all other bytes of @dst are left as they are.
 */
static void canframecpy(struct canfd_frame *dst, struct can_frame *src)
{
        /* Copy the struct members separately to ensure that no uninitialized
         * data are copied in the 3 bytes hole of the struct. This is needed
         * to make easy compares of the data in the struct cf_mod.
         */

        dst->can_id = src->can_id;
        dst->len = src->len;
        /* the 8 payload bytes are transferred as one 64 bit value */
        *(u64 *)dst->data = *(u64 *)src->data;
}

/* Copy a CAN FD operand frame @src (as received from user space) into the
 * operand storage @dst. Only can_id, flags, len and the CANFD_MAX_DLEN data
 * bytes are written; all other bytes of @dst are left as they are.
 */
static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src)
{
        /* Copy the struct members separately to ensure that no uninitialized
         * data are copied in the 2 bytes hole of the struct. This is needed
         * to make easy compares of the data in the struct cf_mod.
         */

        dst->can_id = src->can_id;
        dst->flags = src->flags;
        dst->len = src->len;
        memcpy(dst->data, src->data, CANFD_MAX_DLEN);
}

/* Validate the three checksum indices given by user space.
 *
 * @fr, @to, @re: from / to / result index of the checksum definition
 * @r:            netlink request header; CGW_FLAGS_CAN_FD in its flags
 *                selects the CAN FD data length as index bound
 *
 * Returns 0 when all indices are valid, -EINVAL otherwise.
 */
static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r)
{
        int limit = (r->flags & CGW_FLAGS_CAN_FD) ? CANFD_MAX_DLEN :
                                                    CAN_MAX_DLEN;

        /* Each index is either absolute (0 .. limit - 1) or relative to the
         * received length (-1 .. -limit), e.g. for a received length of 8:
         * -1 => data[7], -3 => data[5], -8 => data[0]
         */
        if (fr < -limit || fr >= limit)
                return -EINVAL;

        if (to < -limit || to >= limit)
                return -EINVAL;

        if (re < -limit || re >= limit)
                return -EINVAL;

        return 0;
}

/* Resolve a checksum index: a negative @idx counts back from @rx_len, any
 * other value is taken as is. The result is still negative when a relative
 * index reaches further back than @rx_len.
 */
static inline int calc_idx(int idx, int rx_len)
{
        return idx < 0 ? rx_len + idx : idx;
}

/* XOR checksum for definitions containing at least one index relative to
 * the received length: XOR the data bytes from the 'from' index to the 'to'
 * index (in either direction) onto the initial value and store the result
 * at the 'result' index. Nothing is done when an index resolves to a
 * position before the start of the data.
 */
static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
        int first = calc_idx(xor->from_idx, cf->len);
        int last = calc_idx(xor->to_idx, cf->len);
        int dest = calc_idx(xor->result_idx, cf->len);
        u8 sum = xor->init_xor_val;
        int step;
        int pos;

        if (first < 0 || last < 0 || dest < 0)
                return;

        /* walk from 'first' to 'last' inclusive, upwards or downwards */
        step = (first <= last) ? 1 : -1;
        for (pos = first; pos != last + step; pos += step)
                sum ^= cf->data[pos];

        cf->data[dest] = sum;
}

/* XOR checksum with absolute indices and from_idx <= to_idx: XOR
 * data[from_idx] .. data[to_idx] in ascending order onto the initial value
 * and store the result in data[result_idx]
 */
static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
        u8 sum = xor->init_xor_val;
        int pos = xor->from_idx;

        while (pos <= xor->to_idx) {
                sum ^= cf->data[pos];
                pos++;
        }

        cf->data[xor->result_idx] = sum;
}

/* XOR checksum with absolute indices and from_idx > to_idx: XOR
 * data[from_idx] .. data[to_idx] in descending order onto the initial value
 * and store the result in data[result_idx]
 */
static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
        u8 sum = xor->init_xor_val;
        int pos = xor->from_idx;

        while (pos >= xor->to_idx) {
                sum ^= cf->data[pos];
                pos--;
        }

        cf->data[xor->result_idx] = sum;
}

/* CRC8 checksum for definitions containing at least one index relative to
 * the received length.
 *
 * @cf:   CAN frame to update; cf->len is the base for relative indices
 * @crc8: checksum definition (indices, table, init/final values, profile)
 *
 * The from / to / result indices are resolved with calc_idx() first. The
 * resolved values - not the raw, potentially negative values from the
 * definition - must be used for every access to cf->data[], as the raw
 * values would address memory in front of the data array. Nothing is done
 * when an index resolves to a position before the start of the data.
 */
static void cgw_csum_crc8_rel(struct canfd_frame *cf,
                              struct cgw_csum_crc8 *crc8)
{
        int from = calc_idx(crc8->from_idx, cf->len);
        int to = calc_idx(crc8->to_idx, cf->len);
        int res = calc_idx(crc8->result_idx, cf->len);
        u8 crc = crc8->init_crc_val;
        int i;

        if (from < 0 || to < 0 || res < 0)
                return;

        /* feed the data bytes from 'from' to 'to' in the given direction */
        if (from <= to) {
                for (i = from; i <= to; i++)
                        crc = crc8->crctab[crc ^ cf->data[i]];
        } else {
                for (i = from; i >= to; i--)
                        crc = crc8->crctab[crc ^ cf->data[i]];
        }

        /* optional profile specific final table lookup */
        switch (crc8->profile) {
        case CGW_CRC8PRF_1U8:
                crc = crc8->crctab[crc ^ crc8->profile_data[0]];
                break;

        case CGW_CRC8PRF_16U8:
                crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
                break;

        case CGW_CRC8PRF_SFFID_XOR:
                crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
                                   (cf->can_id >> 8 & 0xFF)];
                break;
        }

        cf->data[res] = crc ^ crc8->final_xor_val;
}

/* CRC8 checksum with absolute indices and from_idx <= to_idx: run
 * data[from_idx] .. data[to_idx] in ascending order through the CRC table,
 * apply the optional profile step and store the result XORed with
 * final_xor_val in data[result_idx]
 */
static void cgw_csum_crc8_pos(struct canfd_frame *cf,
                              struct cgw_csum_crc8 *crc8)
{
        u8 sum = crc8->init_crc_val;
        int pos = crc8->from_idx;

        while (pos <= crc8->to_idx) {
                sum = crc8->crctab[sum ^ cf->data[pos]];
                pos++;
        }

        /* optional profile specific final table lookup */
        if (crc8->profile == CGW_CRC8PRF_1U8) {
                sum = crc8->crctab[sum ^ crc8->profile_data[0]];
        } else if (crc8->profile == CGW_CRC8PRF_16U8) {
                sum = crc8->crctab[sum ^
                                   crc8->profile_data[cf->data[1] & 0xF]];
        } else if (crc8->profile == CGW_CRC8PRF_SFFID_XOR) {
                sum = crc8->crctab[sum ^ (cf->can_id & 0xFF) ^
                                   ((cf->can_id >> 8) & 0xFF)];
        }

        cf->data[crc8->result_idx] = sum ^ crc8->final_xor_val;
}

/* CRC8 checksum with absolute indices and from_idx > to_idx: run
 * data[from_idx] .. data[to_idx] in descending order through the CRC table,
 * apply the optional profile step and store the result XORed with
 * final_xor_val in data[result_idx]
 */
static void cgw_csum_crc8_neg(struct canfd_frame *cf,
                              struct cgw_csum_crc8 *crc8)
{
        u8 sum = crc8->init_crc_val;
        int pos = crc8->from_idx;

        while (pos >= crc8->to_idx) {
                sum = crc8->crctab[sum ^ cf->data[pos]];
                pos--;
        }

        /* optional profile specific final table lookup */
        if (crc8->profile == CGW_CRC8PRF_1U8) {
                sum = crc8->crctab[sum ^ crc8->profile_data[0]];
        } else if (crc8->profile == CGW_CRC8PRF_16U8) {
                sum = crc8->crctab[sum ^
                                   crc8->profile_data[cf->data[1] & 0xF]];
        } else if (crc8->profile == CGW_CRC8PRF_SFFID_XOR) {
                sum = crc8->crctab[sum ^ (cf->can_id & 0xFF) ^
                                   ((cf->can_id >> 8) & 0xFF)];
        }

        cf->data[crc8->result_idx] = sum ^ crc8->final_xor_val;
}

/* the receive & process & send function
 *
 * Receive callback of a CAN -> CAN gateway job, registered by
 * cgw_register_filter().
 *
 * @skb:  the received CAN skb; it is not modified here, all changes are
 *        applied to a copy or clone of it
 * @data: the struct cgw_job the callback has been registered for
 *
 * The frame is checked against the job settings (frame type, hop limit,
 * destination state, incoming interface), duplicated, optionally modified
 * and checksummed and finally sent on the destination device. The result
 * is accounted in the handled / dropped / deleted counters of the job.
 */
static void can_can_gw_rcv(struct sk_buff *skb, void *data)
{
        struct cgw_job *gwj = (struct cgw_job *)data;
        struct canfd_frame *cf;
        struct sk_buff *nskb;
        int modidx = 0;

        /* process strictly Classic CAN or CAN FD frames */
        if (gwj->flags & CGW_FLAGS_CAN_FD) {
                if (!can_is_canfd_skb(skb))
                        return;
        } else {
                if (!can_is_can_skb(skb))
                        return;
        }

        /* Do not handle CAN frames routed more than 'max_hops' times.
         * In general we should never catch this delimiter which is intended
         * to cover a misconfiguration protection (e.g. circular CAN routes).
         *
         * The Controller Area Network controllers only accept CAN frames with
         * correct CRCs - which are not visible in the controller registers.
         * According to skbuff.h documentation the csum_start element for IP
         * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY.
         * Only CAN skbs can be processed here which already have this property.
         */

        /* the hop counter is kept in the otherwise unused csum_start element */
#define cgw_hops(skb) ((skb)->csum_start)

        BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY);

        if (cgw_hops(skb) >= max_hops) {
                /* indicate deleted frames due to misconfiguration */
                gwj->deleted_frames++;
                return;
        }

        /* no forwarding to a destination device that is not up */
        if (!(gwj->dst.dev->flags & IFF_UP)) {
                gwj->dropped_frames++;
                return;
        }

        /* is sending the skb back to the incoming interface not allowed? */
        if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) &&
            can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex)
                return;

        /* clone the given skb, which has not been done in can_rcv()
         *
         * When there is at least one modification function activated,
         * we need to copy the skb as we want to modify skb->data.
         */
        if (gwj->mod.modfunc[0])
                nskb = skb_copy(skb, GFP_ATOMIC);
        else
                nskb = skb_clone(skb, GFP_ATOMIC);

        if (!nskb) {
                gwj->dropped_frames++;
                return;
        }

        /* put the incremented hop counter in the cloned skb */
        cgw_hops(nskb) = cgw_hops(skb) + 1;

        /* first processing of this CAN frame -> adjust to private hop limit
         *
         * The counter is preset so that the frame reaches 'max_hops' after
         * passing 'limit_hops' gateways including this one.
         */
        if (gwj->limit_hops && cgw_hops(nskb) == 1)
                cgw_hops(nskb) = max_hops - gwj->limit_hops + 1;

        nskb->dev = gwj->dst.dev;

        /* pointer to modifiable CAN frame */
        cf = (struct canfd_frame *)nskb->data;

        /* perform preprocessed modification functions if there are any */
        while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx])
                (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod);

        /* Has the CAN frame been modified? */
        if (modidx) {
                /* get available space for the processed CAN frame type */
                int max_len = nskb->len - offsetof(struct canfd_frame, data);

                /* dlc may have changed, make sure it fits to the CAN frame */
                if (cf->len > max_len) {
                        /* delete frame due to misconfiguration */
                        gwj->deleted_frames++;
                        kfree_skb(nskb);
                        return;
                }

                /* check for checksum updates */
                if (gwj->mod.csumfunc.crc8)
                        (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8);

                if (gwj->mod.csumfunc.xor)
                        (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor);
        }

        /* clear the skb timestamp if not configured the other way */
        if (!(gwj->flags & CGW_FLAGS_CAN_SRC_TSTAMP))
                nskb->tstamp = 0;

        /* send to netdevice */
        if (can_send(nskb, gwj->flags & CGW_FLAGS_CAN_ECHO))
                gwj->dropped_frames++;
        else
                gwj->handled_frames++;
}

/* Register can_can_gw_rcv() as receive callback on the source device of
 * @gwj in @net, using the CAN id / mask filter of the job and the job itself
 * as callback data. Returns the result of can_rx_register().
 */
static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj)
{
        return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id,
                               gwj->ccgw.filter.can_mask, can_can_gw_rcv,
                               gwj, "gw", NULL);
}

/* Counterpart of cgw_register_filter(): remove the receive callback of @gwj
 * from its source device in @net, passing the same filter, callback and
 * callback data that were used for the registration.
 */
static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj)
{
        can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id,
                          gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj);
}

/* RCU callback passed to call_rcu(): return the job that embeds @rcu_head
 * to the cgw_cache slab cache
 */
static void cgw_job_free_rcu(struct rcu_head *rcu_head)
{
        /* the rcu_head is a member of struct cgw_job - get the job from it */
        struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu);

        kmem_cache_free(cgw_cache, gwj);
}

/* Netdevice notifier callback.
 *
 * When a CAN device gets unregistered, every gateway job of the device's
 * network namespace that uses this device as source or destination is
 * removed from the job list, its receive filter is unregistered and the
 * job is freed after an RCU grace period. All other events and non-CAN
 * devices are ignored. Always returns NOTIFY_DONE.
 */
static int cgw_notifier(struct notifier_block *nb,
                        unsigned long msg, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct net *net = dev_net(dev);
        struct hlist_node *tmp;
        struct cgw_job *job;

        if (dev->type != ARPHRD_CAN || msg != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        ASSERT_RTNL();

        hlist_for_each_entry_safe(job, tmp, &net->can.cgw_list, list) {
                /* skip jobs that do not reference the vanishing device */
                if (job->src.dev != dev && job->dst.dev != dev)
                        continue;

                hlist_del(&job->list);
                cgw_unregister_filter(net, job);
                call_rcu(&job->rcu, cgw_job_free_rcu);
        }

        return NOTIFY_DONE;
}

/* Add one netlink message describing the gateway job @gwj to @skb.
 *
 * @skb:   netlink message buffer to append to
 * @gwj:   gateway job to be reported
 * @type:  netlink message type
 * @pid:   netlink port id of the receiver
 * @seq:   netlink sequence number
 * @flags: netlink message flags
 *
 * The message consists of a struct rtcanmsg followed by attributes. Only
 * values differing from their defaults (non-zero counters, configured
 * modifications, checksums, filter, ...) are added as attributes.
 *
 * Returns 0 on success or -EMSGSIZE when the message does not fit into
 * @skb; in that case the partly built message is removed again.
 */
static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
                       u32 pid, u32 seq, int flags)
{
        struct rtcanmsg *rtcan;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags);
        if (!nlh)
                return -EMSGSIZE;

        rtcan = nlmsg_data(nlh);
        rtcan->can_family = AF_CAN;
        rtcan->gwtype = gwj->gwtype;
        rtcan->flags = gwj->flags;

        /* add statistics if available */

        if (gwj->handled_frames) {
                if (nla_put_u32(skb, CGW_HANDLED, gwj->handled_frames) < 0)
                        goto cancel;
        }

        if (gwj->dropped_frames) {
                if (nla_put_u32(skb, CGW_DROPPED, gwj->dropped_frames) < 0)
                        goto cancel;
        }

        if (gwj->deleted_frames) {
                if (nla_put_u32(skb, CGW_DELETED, gwj->deleted_frames) < 0)
                        goto cancel;
        }

        /* check non default settings of attributes */

        if (gwj->limit_hops) {
                if (nla_put_u8(skb, CGW_LIM_HOPS, gwj->limit_hops) < 0)
                        goto cancel;
        }

        /* Report the configured modifications. The operand frames are always
         * stored as struct canfd_frame; for Classic CAN jobs only the leading
         * sizeof(mb.cf) bytes of them are copied into the attribute.
         */
        if (gwj->flags & CGW_FLAGS_CAN_FD) {
                struct cgw_fdframe_mod mb;

                if (gwj->mod.modtype.and) {
                        memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.and;
                        if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0)
                                goto cancel;
                }

                if (gwj->mod.modtype.or) {
                        memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.or;
                        if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0)
                                goto cancel;
                }

                if (gwj->mod.modtype.xor) {
                        memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.xor;
                        if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0)
                                goto cancel;
                }

                if (gwj->mod.modtype.set) {
                        memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.set;
                        if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0)
                                goto cancel;
                }
        } else {
                struct cgw_frame_mod mb;

                if (gwj->mod.modtype.and) {
                        memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.and;
                        if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
                                goto cancel;
                }

                if (gwj->mod.modtype.or) {
                        memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.or;
                        if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
                                goto cancel;
                }

                if (gwj->mod.modtype.xor) {
                        memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.xor;
                        if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
                                goto cancel;
                }

                if (gwj->mod.modtype.set) {
                        memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
                        mb.modtype = gwj->mod.modtype.set;
                        if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
                                goto cancel;
                }
        }

        if (gwj->mod.uid) {
                if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0)
                        goto cancel;
        }

        /* a set checksum function pointer indicates a configured checksum */
        if (gwj->mod.csumfunc.crc8) {
                if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN,
                            &gwj->mod.csum.crc8) < 0)
                        goto cancel;
        }

        if (gwj->mod.csumfunc.xor) {
                if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN,
                            &gwj->mod.csum.xor) < 0)
                        goto cancel;
        }

        if (gwj->gwtype == CGW_TYPE_CAN_CAN) {
                /* the filter is only reported when it is not all zero */
                if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) {
                        if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter),
                                    &gwj->ccgw.filter) < 0)
                                goto cancel;
                }

                if (nla_put_u32(skb, CGW_SRC_IF, gwj->ccgw.src_idx) < 0)
                        goto cancel;

                if (nla_put_u32(skb, CGW_DST_IF, gwj->ccgw.dst_idx) < 0)
                        goto cancel;
        }

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        /* drop the incomplete message from the skb */
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE
 *
 * cb->args[0] holds the number of jobs already dumped by previous calls;
 * these are skipped. Dumping stops at the first job that does not fit into
 * @skb anymore, so that the next call resumes with exactly this job.
 * Returns the length of the data placed in @skb.
 */
static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct net *net = sock_net(skb->sk);
        int first = cb->args[0];
        struct cgw_job *job = NULL;
        int pos = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(job, &net->can.cgw_list, list) {
                /* only jobs at or behind the resume position are dumped */
                if (pos >= first &&
                    cgw_put_job(skb, job, RTM_NEWROUTE,
                                NETLINK_CB(cb->skb).portid,
                                cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
                        break;

                pos++;
        }
        rcu_read_unlock();

        cb->args[0] = pos;

        return skb->len;
}

/* netlink attribute policy handed to nlmsg_parse_deprecated() in
 * cgw_parse_attr(): structure carrying attributes are described by the size
 * of their structure, scalar attributes by their netlink type
 */
static const struct nla_policy cgw_policy[CGW_MAX + 1] = {
        [CGW_MOD_AND]        = { .len = sizeof(struct cgw_frame_mod) },
        [CGW_MOD_OR]        = { .len = sizeof(struct cgw_frame_mod) },
        [CGW_MOD_XOR]        = { .len = sizeof(struct cgw_frame_mod) },
        [CGW_MOD_SET]        = { .len = sizeof(struct cgw_frame_mod) },
        [CGW_CS_XOR]        = { .len = sizeof(struct cgw_csum_xor) },
        [CGW_CS_CRC8]        = { .len = sizeof(struct cgw_csum_crc8) },
        [CGW_SRC_IF]        = { .type = NLA_U32 },
        [CGW_DST_IF]        = { .type = NLA_U32 },
        [CGW_FILTER]        = { .len = sizeof(struct can_filter) },
        [CGW_LIM_HOPS]        = { .type = NLA_U8 },
        [CGW_MOD_UID]        = { .type = NLA_U32 },
        [CGW_FDMOD_AND]        = { .len = sizeof(struct cgw_fdframe_mod) },
        [CGW_FDMOD_OR]        = { .len = sizeof(struct cgw_fdframe_mod) },
        [CGW_FDMOD_XOR]        = { .len = sizeof(struct cgw_fdframe_mod) },
        [CGW_FDMOD_SET]        = { .len = sizeof(struct cgw_fdframe_mod) },
};

/* check for common and gwtype specific attributes
 *
 * @nlh:        netlink request (struct rtcanmsg followed by attributes)
 * @mod:        output: modification / checksum setup; always cleared first
 * @gwtype:     gateway type whose specific attributes are to be parsed
 * @gwtypeattr: output: gateway type specific data; for CGW_TYPE_CAN_CAN
 *              this is a struct can_can_gw which is cleared before use
 * @limhops:    output: hop limit; only written when CGW_LIM_HOPS is given
 *
 * Returns 0 on success - which includes the CGW_TYPE_CAN_CAN case of both
 * interface indices being 0 - or a negative error code: the error from
 * nlmsg_parse_deprecated(), -EINVAL for an invalid hop limit or invalid
 * checksum indices, -ENODEV for missing or half specified interfaces.
 */
static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
                          u8 gwtype, void *gwtypeattr, u8 *limhops)
{
        struct nlattr *tb[CGW_MAX + 1];
        struct rtcanmsg *r = nlmsg_data(nlh);
        int modidx = 0;
        int err = 0;

        /* initialize modification & checksum data space */
        memset(mod, 0, sizeof(*mod));

        err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb,
                                     CGW_MAX, cgw_policy, NULL);
        if (err < 0)
                return err;

        if (tb[CGW_LIM_HOPS]) {
                *limhops = nla_get_u8(tb[CGW_LIM_HOPS]);

                /* a job specific limit can not exceed the global limit */
                if (*limhops < 1 || *limhops > max_hops)
                        return -EINVAL;
        }

        /* check for AND/OR/XOR/SET modifications
         *
         * The modification functions are appended to mod->modfunc[] in the
         * order they are checked here (AND, OR, XOR, SET and within each
         * operation id, length, flags, data), which is also the order they
         * are executed in can_can_gw_rcv().
         */
        if (r->flags & CGW_FLAGS_CAN_FD) {
                struct cgw_fdframe_mod mb;

                if (tb[CGW_FDMOD_AND]) {
                        nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN);

                        canfdframecpy(&mod->modframe.and, &mb.cf);
                        mod->modtype.and = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_and_id;

                        if (mb.modtype & CGW_MOD_LEN)
                                mod->modfunc[modidx++] = mod_and_len;

                        if (mb.modtype & CGW_MOD_FLAGS)
                                mod->modfunc[modidx++] = mod_and_flags;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_and_fddata;
                }

                if (tb[CGW_FDMOD_OR]) {
                        nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN);

                        canfdframecpy(&mod->modframe.or, &mb.cf);
                        mod->modtype.or = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_or_id;

                        if (mb.modtype & CGW_MOD_LEN)
                                mod->modfunc[modidx++] = mod_or_len;

                        if (mb.modtype & CGW_MOD_FLAGS)
                                mod->modfunc[modidx++] = mod_or_flags;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_or_fddata;
                }

                if (tb[CGW_FDMOD_XOR]) {
                        nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN);

                        canfdframecpy(&mod->modframe.xor, &mb.cf);
                        mod->modtype.xor = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_xor_id;

                        if (mb.modtype & CGW_MOD_LEN)
                                mod->modfunc[modidx++] = mod_xor_len;

                        if (mb.modtype & CGW_MOD_FLAGS)
                                mod->modfunc[modidx++] = mod_xor_flags;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_xor_fddata;
                }

                if (tb[CGW_FDMOD_SET]) {
                        nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN);

                        canfdframecpy(&mod->modframe.set, &mb.cf);
                        mod->modtype.set = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_set_id;

                        if (mb.modtype & CGW_MOD_LEN)
                                mod->modfunc[modidx++] = mod_set_len;

                        if (mb.modtype & CGW_MOD_FLAGS)
                                mod->modfunc[modidx++] = mod_set_flags;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_set_fddata;
                }
        } else {
                /* Classic CAN: no flags element, the DLC is modified with
                 * the len8_dlc aware functions and the data is 8 bytes long
                 */
                struct cgw_frame_mod mb;

                if (tb[CGW_MOD_AND]) {
                        nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);

                        canframecpy(&mod->modframe.and, &mb.cf);
                        mod->modtype.and = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_and_id;

                        if (mb.modtype & CGW_MOD_DLC)
                                mod->modfunc[modidx++] = mod_and_ccdlc;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_and_data;
                }

                if (tb[CGW_MOD_OR]) {
                        nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);

                        canframecpy(&mod->modframe.or, &mb.cf);
                        mod->modtype.or = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_or_id;

                        if (mb.modtype & CGW_MOD_DLC)
                                mod->modfunc[modidx++] = mod_or_ccdlc;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_or_data;
                }

                if (tb[CGW_MOD_XOR]) {
                        nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);

                        canframecpy(&mod->modframe.xor, &mb.cf);
                        mod->modtype.xor = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_xor_id;

                        if (mb.modtype & CGW_MOD_DLC)
                                mod->modfunc[modidx++] = mod_xor_ccdlc;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_xor_data;
                }

                if (tb[CGW_MOD_SET]) {
                        nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);

                        canframecpy(&mod->modframe.set, &mb.cf);
                        mod->modtype.set = mb.modtype;

                        if (mb.modtype & CGW_MOD_ID)
                                mod->modfunc[modidx++] = mod_set_id;

                        if (mb.modtype & CGW_MOD_DLC)
                                mod->modfunc[modidx++] = mod_set_ccdlc;

                        if (mb.modtype & CGW_MOD_DATA)
                                mod->modfunc[modidx++] = mod_set_data;
                }
        }

        /* check for checksum operations after CAN frame modifications
         *
         * Checksum and uid attributes are only evaluated when at least one
         * modification function has been set up above.
         */
        if (modidx) {
                if (tb[CGW_CS_CRC8]) {
                        struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]);

                        err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
                                                 c->result_idx, r);
                        if (err)
                                return err;

                        nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8],
                                   CGW_CS_CRC8_LEN);

                        /* select dedicated processing function to reduce
                         * runtime operations in receive hot path.
                         */
                        if (c->from_idx < 0 || c->to_idx < 0 ||
                            c->result_idx < 0)
                                mod->csumfunc.crc8 = cgw_csum_crc8_rel;
                        else if (c->from_idx <= c->to_idx)
                                mod->csumfunc.crc8 = cgw_csum_crc8_pos;
                        else
                                mod->csumfunc.crc8 = cgw_csum_crc8_neg;
                }

                if (tb[CGW_CS_XOR]) {
                        struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]);

                        err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
                                                 c->result_idx, r);
                        if (err)
                                return err;

                        nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR],
                                   CGW_CS_XOR_LEN);

                        /* select dedicated processing function to reduce
                         * runtime operations in receive hot path.
                         */
                        if (c->from_idx < 0 || c->to_idx < 0 ||
                            c->result_idx < 0)
                                mod->csumfunc.xor = cgw_csum_xor_rel;
                        else if (c->from_idx <= c->to_idx)
                                mod->csumfunc.xor = cgw_csum_xor_pos;
                        else
                                mod->csumfunc.xor = cgw_csum_xor_neg;
                }

                if (tb[CGW_MOD_UID])
                        nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32));
        }

        if (gwtype == CGW_TYPE_CAN_CAN) {
                /* check CGW_TYPE_CAN_CAN specific attributes */
                struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr;

                memset(ccgw, 0, sizeof(*ccgw));

                /* check for can_filter in attributes */
                if (tb[CGW_FILTER])
                        nla_memcpy(&ccgw->filter, tb[CGW_FILTER],
                                   sizeof(struct can_filter));

                /* error code for all interface related failures below */
                err = -ENODEV;

                /* specifying two interfaces is mandatory */
                if (!tb[CGW_SRC_IF] || !tb[CGW_DST_IF])
                        return err;

                ccgw->src_idx = nla_get_u32(tb[CGW_SRC_IF]);
                ccgw->dst_idx = nla_get_u32(tb[CGW_DST_IF]);

                /* both indices set to 0 for flushing all routing entries */
                if (!ccgw->src_idx && !ccgw->dst_idx)
                        return 0;

                /* only one index set to 0 is an error */
                if (!ccgw->src_idx || !ccgw->dst_idx)
                        return err;
        }

        /* add the checks for other gwtypes here */

        return 0;
}

/* RTM_NEWROUTE handler: add a CAN gateway job or, when a job with the
 * same non-zero uid already exists, overwrite that job's modifications.
 *
 * Returns 0 on success or a negative errno value.
 */
static int cgw_create_job(struct sk_buff *skb,  struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct can_can_gw ccgw;
        struct cgw_job *gwj;
        struct rtcanmsg *r;
        struct cf_mod mod;
        u8 limhops = 0;
        int err;

        if (!netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (nlmsg_len(nlh) < sizeof(*r))
                return -EINVAL;

        r = nlmsg_data(nlh);
        if (r->can_family != AF_CAN)
                return -EPFNOSUPPORT;

        /* CAN -> CAN is the only routing type supported so far */
        if (r->gwtype != CGW_TYPE_CAN_CAN)
                return -EINVAL;

        err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
        if (err < 0)
                return err;

        if (mod.uid) {
                ASSERT_RTNL();

                /* a job carrying the same uid gets updated in place */
                hlist_for_each_entry(gwj, &net->can.cgw_list, list) {
                        if (gwj->mod.uid == mod.uid) {
                                /* interfaces & filters have to match */
                                if (memcmp(&gwj->ccgw, &ccgw,
                                           sizeof(ccgw)) != 0)
                                        return -EINVAL;

                                /* replace modifications, softirqs off */
                                local_bh_disable();
                                memcpy(&gwj->mod, &mod, sizeof(mod));
                                local_bh_enable();
                                return 0;
                        }
                }
        }

        /* creating a job requires two non-zero interface indices */
        if (!(ccgw.src_idx && ccgw.dst_idx))
                return -ENODEV;

        gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL);
        if (!gwj)
                return -ENOMEM;

        gwj->handled_frames = 0;
        gwj->dropped_frames = 0;
        gwj->deleted_frames = 0;
        gwj->flags = r->flags;
        gwj->gwtype = r->gwtype;
        gwj->limit_hops = limhops;

        /* take over the parsed data byte by byte; cgw_remove_job() later
         * matches jobs with memcmp() on these structures
         */
        memcpy(&gwj->mod, &mod, sizeof(mod));
        memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw));

        /* both interfaces must exist and be CAN devices */
        err = -ENODEV;

        gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx);
        if (!gwj->src.dev || gwj->src.dev->type != ARPHRD_CAN)
                goto out;

        gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx);
        if (!gwj->dst.dev || gwj->dst.dev->type != ARPHRD_CAN)
                goto out;

        /* sending back to the incoming interface needs an explicit flag */
        if (gwj->src.dev == gwj->dst.dev &&
            !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) {
                err = -EINVAL;
                goto out;
        }

        ASSERT_RTNL();

        err = cgw_register_filter(net, gwj);
        if (err)
                goto out;

        hlist_add_head_rcu(&gwj->list, &net->can.cgw_list);
        return 0;

out:
        /* every path reaching this label carries a non-zero error code */
        kmem_cache_free(cgw_cache, gwj);
        return err;
}

/* Remove every gateway job of @net: unlink it from the job list, call
 * cgw_unregister_filter() for it and queue cgw_job_free_rcu() via
 * call_rcu(). The RTNL must be held by the caller.
 */
static void cgw_remove_all_jobs(struct net *net)
{
        struct hlist_node *tmp;
        struct cgw_job *job;

        ASSERT_RTNL();

        hlist_for_each_entry_safe(job, tmp, &net->can.cgw_list, list) {
                hlist_del(&job->list);
                cgw_unregister_filter(net, job);
                call_rcu(&job->rcu, cgw_job_free_rcu);
        }
}

/* RTM_DELROUTE handler: remove the first gateway job matching the
 * request, or all jobs when both interface indices are zero.
 *
 * Returns 0 on success, -EINVAL when no job matched, or another
 * negative errno value for a rejected request.
 */
static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
                          struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct can_can_gw ccgw;
        struct hlist_node *nx;
        struct cgw_job *gwj;
        struct rtcanmsg *r;
        struct cf_mod mod;
        u8 limhops = 0;
        int err;

        if (!netlink_capable(skb, CAP_NET_ADMIN))
                return -EPERM;

        if (nlmsg_len(nlh) < sizeof(*r))
                return -EINVAL;

        r = nlmsg_data(nlh);
        if (r->can_family != AF_CAN)
                return -EPFNOSUPPORT;

        /* CAN -> CAN is the only routing type supported so far */
        if (r->gwtype != CGW_TYPE_CAN_CAN)
                return -EINVAL;

        err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
        if (err < 0)
                return err;

        /* both interface indices zero => flush all entries */
        if (!(ccgw.src_idx || ccgw.dst_idx)) {
                cgw_remove_all_jobs(net);
                return 0;
        }

        ASSERT_RTNL();

        /* only the first matching entry is removed */
        hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
                if (gwj->flags != r->flags || gwj->limit_hops != limhops)
                        continue;

                if (gwj->mod.uid || mod.uid) {
                        /* a uid on either side: the uids decide the match */
                        if (gwj->mod.uid != mod.uid)
                                continue;
                } else if (memcmp(&gwj->mod, &mod, sizeof(mod))) {
                        /* no uid => modifications must be identical */
                        continue;
                }

                /* r->gwtype == CGW_TYPE_CAN_CAN was verified above */
                if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
                        continue;

                hlist_del(&gwj->list);
                cgw_unregister_filter(net, gwj);
                call_rcu(&gwj->rcu, cgw_job_free_rcu);
                return 0;
        }

        /* nothing matched the request */
        return -EINVAL;
}

/* Per network namespace setup: start with an empty gateway job list. */
static int __net_init cangw_pernet_init(struct net *net)
{
        struct hlist_head *jobs = &net->can.cgw_list;

        INIT_HLIST_HEAD(jobs);
        return 0;
}

/* Remove the gateway jobs of every namespace on @net_list while holding
 * the RTNL once for the whole batch.
 */
static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list)
{
        struct net *ns;

        rtnl_lock();
        list_for_each_entry(ns, net_list, exit_list) {
                cgw_remove_all_jobs(ns);
        }
        rtnl_unlock();
}

/* Per network namespace hooks: cangw_pernet_init() sets up the empty job
 * list, cangw_pernet_exit_batch() removes all gateway jobs again.
 */
static struct pernet_operations cangw_pernet_ops = {
        .init = cangw_pernet_init,
        .exit_batch = cangw_pernet_exit_batch,
};

/* PF_CAN rtnetlink handlers: create, remove and dump gateway jobs. */
static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = {
        {
                .owner = THIS_MODULE,
                .protocol = PF_CAN,
                .msgtype = RTM_NEWROUTE,
                .doit = cgw_create_job,
        },
        {
                .owner = THIS_MODULE,
                .protocol = PF_CAN,
                .msgtype = RTM_DELROUTE,
                .doit = cgw_remove_job,
        },
        {
                .owner = THIS_MODULE,
                .protocol = PF_CAN,
                .msgtype = RTM_GETROUTE,
                .dumpit = cgw_dump_jobs,
        },
};

/* Module setup: register the pernet hooks, create the job cache, install
 * the netdevice notifier and register the rtnetlink handlers. On failure
 * the steps already completed are undone in reverse order.
 */
static __init int cgw_module_init(void)
{
        int ret;

        /* keep the module parameter within the supported range */
        max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);

        pr_info("can: netlink gateway - max_hops=%d\n", max_hops);

        ret = register_pernet_subsys(&cangw_pernet_ops);
        if (ret)
                return ret;

        cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job),
                                      0, 0, NULL);
        if (!cgw_cache) {
                ret = -ENOMEM;
                goto out_cache_create;
        }

        /* hook up the netdevice notifier */
        notifier.notifier_call = cgw_notifier;
        ret = register_netdevice_notifier(&notifier);
        if (ret)
                goto out_register_notifier;

        ret = rtnl_register_many(cgw_rtnl_msg_handlers);
        if (!ret)
                return 0;

        /* rtnetlink registration failed: unwind everything set up so far */
        unregister_netdevice_notifier(&notifier);
out_register_notifier:
        kmem_cache_destroy(cgw_cache);
out_cache_create:
        unregister_pernet_subsys(&cangw_pernet_ops);

        return ret;
}

/* Module teardown: undo the registrations made by cgw_module_init() and
 * release the job cache once all queued RCU callbacks have finished.
 */
static __exit void cgw_module_exit(void)
{
        /* drop the PF_CAN rtnetlink handlers before anything else */
        rtnl_unregister_all(PF_CAN);

        unregister_netdevice_notifier(&notifier);

        /* the pernet exit hook (cangw_pernet_exit_batch()) removes the
         * remaining jobs, which queues cgw_job_free_rcu() via call_rcu()
         */
        unregister_pernet_subsys(&cangw_pernet_ops);
        rcu_barrier(); /* Wait for completion of call_rcu()'s */

        /* destroyed only after the pending RCU callbacks have completed */
        kmem_cache_destroy(cgw_cache);
}

module_init(cgw_module_init);
module_exit(cgw_module_exit);
























































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * workqueue.h --- work queue handling for Linux.
 */

#ifndef _LINUX_WORKQUEUE_H
#define _LINUX_WORKQUEUE_H

#include <linux/timer.h>
#include <linux/linkage.h>
#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/threads.h>
#include <linux/atomic.h>
#include <linux/cpumask_types.h>
#include <linux/rcupdate.h>
#include <linux/workqueue_types.h>

/*
 * The first word of a work item (->data) is the work queue pointer and
 * the flags rolled into one; see enum work_bits for the layout.
 * work_data_bits() yields that word as an 'unsigned long *' so it can be
 * passed to bit helpers, e.g. test_bit() in work_pending().
 */
#define work_data_bits(work) ((unsigned long *)(&(work)->data))

/*
 * Bit positions and field widths used to pack state into the work item's
 * data word. The word is interpreted in one of two ways depending on
 * WORK_STRUCT_PWQ; both layouts are described below.
 */
enum work_bits {
        WORK_STRUCT_PENDING_BIT        = 0,        /* work item is pending execution */
        WORK_STRUCT_INACTIVE_BIT,        /* work item is inactive */
        WORK_STRUCT_PWQ_BIT,                /* data points to pwq */
        WORK_STRUCT_LINKED_BIT,                /* next work is linked to this one */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC_BIT,                /* static initializer (debugobjects) */
#endif
        /* number of STRUCT flag bits: 4, or 5 with CONFIG_DEBUG_OBJECTS_WORK */
        WORK_STRUCT_FLAG_BITS,

        /* color for workqueue flushing */
        WORK_STRUCT_COLOR_SHIFT        = WORK_STRUCT_FLAG_BITS,
        WORK_STRUCT_COLOR_BITS        = 4,

        /*
         * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/
         * debugobjects turned off. This makes pwqs aligned to 256 bytes (512
         * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors.
         *
         * MSB
         * [ pwq pointer ] [ flush color ] [ STRUCT flags ]
         *                     4 bits        4 or 5 bits
         */
        WORK_STRUCT_PWQ_SHIFT        = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS,

        /*
         * data contains off-queue information when !WORK_STRUCT_PWQ.
         *
         * MSB
         * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ]
         *                  16 bits          1 bit        4 or 5 bits
         */
        WORK_OFFQ_FLAG_SHIFT        = WORK_STRUCT_FLAG_BITS,
        WORK_OFFQ_BH_BIT        = WORK_OFFQ_FLAG_SHIFT,
        WORK_OFFQ_FLAG_END,
        WORK_OFFQ_FLAG_BITS        = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT,

        WORK_OFFQ_DISABLE_SHIFT        = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS,
        WORK_OFFQ_DISABLE_BITS        = 16,

        /*
         * When a work item is off queue, the high bits encode off-queue flags
         * and the last pool it was on. Cap pool ID to 31 bits and use the
         * highest number to indicate that no pool is associated.
         */
        WORK_OFFQ_POOL_SHIFT        = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS,
        WORK_OFFQ_LEFT                = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT,
        WORK_OFFQ_POOL_BITS        = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31,
};

/*
 * Mask form of the WORK_STRUCT_*_BIT positions from enum work_bits.
 * WORK_STRUCT_STATIC is defined as 0 without CONFIG_DEBUG_OBJECTS_WORK so
 * it can be OR-ed in unconditionally (see WORK_DATA_STATIC_INIT()).
 */
enum work_flags {
        WORK_STRUCT_PENDING        = 1 << WORK_STRUCT_PENDING_BIT,
        WORK_STRUCT_INACTIVE        = 1 << WORK_STRUCT_INACTIVE_BIT,
        WORK_STRUCT_PWQ                = 1 << WORK_STRUCT_PWQ_BIT,
        WORK_STRUCT_LINKED        = 1 << WORK_STRUCT_LINKED_BIT,
#ifdef CONFIG_DEBUG_OBJECTS_WORK
        WORK_STRUCT_STATIC        = 1 << WORK_STRUCT_STATIC_BIT,
#else
        WORK_STRUCT_STATIC        = 0,
#endif
};

/* Miscellaneous workqueue constants. */
enum wq_misc_consts {
        /* number of flush colors representable in WORK_STRUCT_COLOR_BITS */
        WORK_NR_COLORS                = (1 << WORK_STRUCT_COLOR_BITS),

        /* not bound to any CPU, prefer the local CPU */
        WORK_CPU_UNBOUND        = NR_CPUS,

        /* bit mask for work_busy() return values */
        WORK_BUSY_PENDING        = 1 << 0,
        WORK_BUSY_RUNNING        = 1 << 1,

        /* maximum string length for set_worker_desc() */
        WORKER_DESC_LEN                = 32,
};

/*
 * Convenience constants - of type 'unsigned long', not 'enum'!
 *
 * WORK_OFFQ_BH            - mask of the off-queue BH flag bit
 * WORK_OFFQ_FLAG_MASK     - mask covering all off-queue flag bits
 * WORK_OFFQ_DISABLE_MASK  - mask covering the disable depth field
 * WORK_OFFQ_POOL_NONE     - highest pool ID, meaning "no pool associated"
 * WORK_STRUCT_NO_POOL     - WORK_OFFQ_POOL_NONE shifted into the pool ID field
 * WORK_STRUCT_PWQ_MASK    - clears the low WORK_STRUCT_PWQ_SHIFT bits, leaving
 *                           the pwq pointer part of the data word
 */
#define WORK_OFFQ_BH                (1ul << WORK_OFFQ_BH_BIT)
#define WORK_OFFQ_FLAG_MASK        (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT)
#define WORK_OFFQ_DISABLE_MASK        (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT)
#define WORK_OFFQ_POOL_NONE        ((1ul << WORK_OFFQ_POOL_BITS) - 1)
#define WORK_STRUCT_NO_POOL        (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT)
#define WORK_STRUCT_PWQ_MASK        (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1))

/* Initial ->data value: off queue with no pool associated. */
#define WORK_DATA_INIT()        ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL)
/* Same as WORK_DATA_INIT() but also marks the item as statically initialized. */
#define WORK_DATA_STATIC_INIT()        \
        ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC))

/*
 * A work item combined with a timer. The initializers in this header set
 * the timer callback to delayed_work_timer_fn(); use to_delayed_work() to
 * get from the embedded ->work back to this structure.
 */
struct delayed_work {
        struct work_struct work;
        struct timer_list timer;

        /* target workqueue and CPU ->timer uses to queue ->work */
        struct workqueue_struct *wq;
        int cpu;
};

/*
 * A work item combined with an rcu_head. Use to_rcu_work() to get from
 * the embedded ->work back to this structure.
 * NOTE(review): presumably ->work is queued once an RCU grace period has
 * elapsed - the queueing code is not visible here, confirm there.
 */
struct rcu_work {
        struct work_struct work;
        struct rcu_head rcu;

        /* target workqueue ->rcu uses to queue ->work */
        struct workqueue_struct *wq;
};

/*
 * CPU affinity scopes for unbound workqueues, selected through
 * workqueue_attrs.affn_scope.
 */
enum wq_affn_scope {
        WQ_AFFN_DFL,                        /* use system default */
        WQ_AFFN_CPU,                        /* one pod per CPU */
        WQ_AFFN_SMT,                        /* one pod per SMT */
        WQ_AFFN_CACHE,                        /* one pod per LLC */
        WQ_AFFN_NUMA,                        /* one pod per NUMA node */
        WQ_AFFN_SYSTEM,                        /* one pod across the whole system */

        WQ_AFFN_NR_TYPES,                /* number of scopes listed above */
};

/**
 * struct workqueue_attrs - A struct for workqueue attributes.
 *
 * This can be used to change attributes of an unbound workqueue.
 */
struct workqueue_attrs {
        /**
         * @nice: nice level
         */
        int nice;

        /**
         * @cpumask: allowed CPUs
         *
         * Work items in this workqueue are affine to these CPUs and not allowed
         * to execute on other CPUs. A pool serving a workqueue must have the
         * same @cpumask.
         */
        cpumask_var_t cpumask;

        /**
         * @__pod_cpumask: internal attribute used to create per-pod pools
         *
         * Internal use only.
         *
         * Per-pod unbound worker pools are used to improve locality. Always a
         * subset of ->cpumask. A workqueue can be associated with multiple
         * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
         * of a pool's @__pod_cpumask is strict depends on @affn_strict.
         */
        cpumask_var_t __pod_cpumask;

        /**
         * @affn_strict: affinity scope is strict
         *
         * If clear, workqueue will make a best-effort attempt at starting the
         * worker inside @__pod_cpumask but the scheduler is free to migrate it
         * outside.
         *
         * If set, workers are only allowed to run inside @__pod_cpumask.
         */
        bool affn_strict;

        /*
         * The fields below aren't properties of a worker_pool. They only
         * modify how :c:func:`apply_workqueue_attrs` selects pools and thus
         * don't participate in pool hash calculations or equality comparisons.
         *
         * If @affn_strict is set, @cpumask isn't a property of a worker_pool
         * either.
         */

        /**
         * @affn_scope: unbound CPU affinity scope
         *
         * CPU pods are used to improve execution locality of unbound work
         * items. There are multiple pod types, one for each wq_affn_scope, and
         * every CPU in the system belongs to one pod in every pod type. CPUs
         * that belong to the same pod share the worker pool. For example,
         * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker
         * pool for each NUMA node.
         */
        enum wq_affn_scope affn_scope;

        /**
         * @ordered: work items must be executed one by one in queueing order
         */
        bool ordered;
};

/* Map an embedded work_struct back to the delayed_work containing it. */
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
{
        struct delayed_work *dwork;

        dwork = container_of(work, struct delayed_work, work);
        return dwork;
}

/* Map an embedded work_struct back to the rcu_work containing it. */
static inline struct rcu_work *to_rcu_work(struct work_struct *work)
{
        struct rcu_work *rwork;

        rwork = container_of(work, struct rcu_work, work);
        return rwork;
}

/*
 * Thin wrapper holding nothing but a work item.
 * NOTE(review): its users are not visible in this part of the header -
 * confirm the intended use before relying on it.
 */
struct execute_work {
        struct work_struct work;
};

#ifdef CONFIG_LOCKDEP
/*
 * Designated initializer for the lockdep_map member of a statically
 * defined work item; expands to nothing without CONFIG_LOCKDEP.
 *
 * NB: because we have to copy the lockdep_map, setting _key
 * here is required, otherwise it could get initialised to the
 * copy of the lockdep_map!
 */
#define __WORK_INIT_LOCKDEP_MAP(n, k) \
        .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k),
#else
#define __WORK_INIT_LOCKDEP_MAP(n, k)
#endif

/*
 * Static initializer for a work_struct named @n running @f: ->data starts
 * as "no pool, statically initialized", ->entry is an empty list pointing
 * at itself, and the lockdep map (if enabled) is named after @n.
 */
#define __WORK_INITIALIZER(n, f) {                                        \
        .data = WORK_DATA_STATIC_INIT(),                                \
        .entry        = { &(n).entry, &(n).entry },                                \
        .func = (f),                                                        \
        __WORK_INIT_LOCKDEP_MAP(#n, &(n))                                \
        }

/*
 * Static initializer for a delayed_work named @n running @f. The timer
 * calls delayed_work_timer_fn() and always carries TIMER_IRQSAFE in
 * addition to the caller supplied @tflags.
 */
#define __DELAYED_WORK_INITIALIZER(n, f, tflags) {                        \
        .work = __WORK_INITIALIZER((n).work, (f)),                        \
        .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\
                                     (tflags) | TIMER_IRQSAFE),                \
        }

/* Define and statically initialize a work item @n that runs @f. */
#define DECLARE_WORK(n, f)                                                \
        struct work_struct n = __WORK_INITIALIZER(n, f)

/* Define and statically initialize a delayed work item @n that runs @f. */
#define DECLARE_DELAYED_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0)

/* Like DECLARE_DELAYED_WORK() but the timer is created with TIMER_DEFERRABLE. */
#define DECLARE_DEFERRABLE_WORK(n, f)                                        \
        struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE)

/*
 * debugobjects hooks for work items. With CONFIG_DEBUG_OBJECTS_WORK the
 * helpers are real functions defined elsewhere and work_static() reports
 * whether WORK_STRUCT_STATIC is set in the data word; without it they
 * collapse to empty inline stubs and work_static() always returns 0.
 */
#ifdef CONFIG_DEBUG_OBJECTS_WORK
extern void __init_work(struct work_struct *work, int onstack);
extern void destroy_work_on_stack(struct work_struct *work);
extern void destroy_delayed_work_on_stack(struct delayed_work *work);
static inline unsigned int work_static(struct work_struct *work)
{
        return *work_data_bits(work) & WORK_STRUCT_STATIC;
}
#else
static inline void __init_work(struct work_struct *work, int onstack) { }
static inline void destroy_work_on_stack(struct work_struct *work) { }
static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { }
static inline unsigned int work_static(struct work_struct *work) { return 0; }
#endif

/*
 * initialize all of a work item in one go
 *
 * @_work:    work item to initialize
 * @_func:    function stored in ->func
 * @_onstack: passed through to __init_work()
 * @_key:     lock class key for the lockdep map; unused without
 *            CONFIG_LOCKDEP
 *
 * NOTE! No point in using "atomic_long_set()": using a direct
 * assignment of the work data initializer allows the compiler
 * to generate better code.
 */
#ifdef CONFIG_LOCKDEP
#define __INIT_WORK_KEY(_work, _func, _onstack, _key)                        \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#else
#define __INIT_WORK_KEY(_work, _func, _onstack, _key)                        \
        do {                                                                \
                __init_work((_work), _onstack);                                \
                (_work)->data = (atomic_long_t) WORK_DATA_INIT();        \
                INIT_LIST_HEAD(&(_work)->entry);                        \
                (_work)->func = (_func);                                \
        } while (0)
#endif

/*
 * Like __INIT_WORK_KEY() but supplies a static lock class key that is
 * private to each place the macro is expanded. The key is marked
 * __maybe_unused because __INIT_WORK_KEY() ignores it without
 * CONFIG_LOCKDEP.
 */
#define __INIT_WORK(_work, _func, _onstack)                                \
        do {                                                                \
                static __maybe_unused struct lock_class_key __key;        \
                                                                        \
                __INIT_WORK_KEY(_work, _func, _onstack, &__key);        \
        } while (0)

/* Initialize a work item that does not live on the stack. */
#define INIT_WORK(_work, _func)                                                \
        __INIT_WORK((_work), (_func), 0)

/* Initialize a work item that lives on the stack (_onstack = 1). */
#define INIT_WORK_ONSTACK(_work, _func)                                        \
        __INIT_WORK((_work), (_func), 1)

/* On-stack variant taking a caller supplied lock class key. */
#define INIT_WORK_ONSTACK_KEY(_work, _func, _key)                        \
        __INIT_WORK_KEY((_work), (_func), 1, _key)

/*
 * Initialize a delayed work item: the embedded work item via INIT_WORK()
 * and the timer with delayed_work_timer_fn() as callback. TIMER_IRQSAFE
 * is always added to the caller supplied @_tflags.
 */
#define __INIT_DELAYED_WORK(_work, _func, _tflags)                        \
        do {                                                                \
                INIT_WORK(&(_work)->work, (_func));                        \
                __init_timer(&(_work)->timer,                                \
                             delayed_work_timer_fn,                        \
                             (_tflags) | TIMER_IRQSAFE);                \
        } while (0)

/*
 * On-stack counterpart of __INIT_DELAYED_WORK(): uses the _ONSTACK /
 * _on_stack initializers for both the work item and the timer.
 */
#define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags)                \
        do {                                                                \
                INIT_WORK_ONSTACK(&(_work)->work, (_func));                \
                __init_timer_on_stack(&(_work)->timer,                        \
                                      delayed_work_timer_fn,                \
                                      (_tflags) | TIMER_IRQSAFE);        \
        } while (0)

/* Delayed work with no extra timer flags. */
#define INIT_DELAYED_WORK(_work, _func)                                        \
        __INIT_DELAYED_WORK(_work, _func, 0)

#define INIT_DELAYED_WORK_ONSTACK(_work, _func)                                \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0)

/* Delayed work whose timer is created with TIMER_DEFERRABLE. */
#define INIT_DEFERRABLE_WORK(_work, _func)                                \
        __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE)

#define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func)                        \
        __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)

/* RCU work: only the embedded work item is initialized here. */
#define INIT_RCU_WORK(_work, _func)                                        \
        INIT_WORK(&(_work)->work, (_func))

#define INIT_RCU_WORK_ONSTACK(_work, _func)                                \
        INIT_WORK_ONSTACK(&(_work)->work, (_func))

/**
 * work_pending - Find out whether a work item is currently pending
 * @work: The work item in question
 *
 * Evaluates to the state of %WORK_STRUCT_PENDING_BIT in the work item's
 * data word.
 */
#define work_pending(work) \
        test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))

/**
 * delayed_work_pending - Find out whether a delayable work item is currently
 * pending
 * @w: The work item in question
 *
 * Checks the work item embedded in @w with work_pending().
 */
#define delayed_work_pending(w) \
        work_pending(&(w)->work)

/*
 * Workqueue flags and constants.  For details, please refer to
 * Documentation/core-api/workqueue.rst.
 */
enum wq_flags {
        WQ_BH                        = 1 << 0, /* execute in bottom half (softirq) context */
        WQ_UNBOUND                = 1 << 1, /* not bound to any cpu */
        WQ_FREEZABLE                = 1 << 2, /* freeze during suspend */
        WQ_MEM_RECLAIM                = 1 << 3, /* may be used for memory reclaim */
        WQ_HIGHPRI                = 1 << 4, /* high priority */
        WQ_CPU_INTENSIVE        = 1 << 5, /* cpu intensive workqueue */
        WQ_SYSFS                = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */

        /*
         * Per-cpu workqueues are generally preferred because they tend to
         * show better performance thanks to cache locality.  Per-cpu
         * workqueues exclude the scheduler from choosing the CPU to
         * execute the worker threads, which has an unfortunate side effect
         * of increasing power consumption.
         *
         * The scheduler considers a CPU idle if it doesn't have any task
         * to execute and tries to keep idle cores idle to conserve power;
         * however, for example, a per-cpu work item scheduled from an
         * interrupt handler on an idle CPU will force the scheduler to
         * execute the work item on that CPU breaking the idleness, which in
         * turn may lead to more scheduling choices which are sub-optimal
         * in terms of power consumption.
         *
         * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default
         * but become unbound if workqueue.power_efficient kernel param is
         * specified.  Per-cpu workqueues which are identified to
         * contribute significantly to power consumption are marked with
         * this flag, and enabling the power_efficient mode leads to
         * noticeable power saving at the cost of a small performance
         * disadvantage.
         *
         * http://thread.gmane.org/gmane.linux.kernel/1480396
         */
        WQ_POWER_EFFICIENT        = 1 << 7,

        __WQ_DESTROYING                = 1 << 15, /* internal: workqueue is destroying */
        __WQ_DRAINING                = 1 << 16, /* internal: workqueue is draining */
        __WQ_ORDERED                = 1 << 17, /* internal: workqueue is ordered */
        __WQ_LEGACY                = 1 << 18, /* internal: create*_workqueue() */

        /* BH wq only allows the following flags */
        __WQ_BH_ALLOWS                = WQ_BH | WQ_HIGHPRI,
};

/* Limits and defaults for the number of active work items (max_active). */
enum wq_consts {
        WQ_MAX_ACTIVE                = 2048,          /* I like 2048, better ideas? */
        WQ_UNBOUND_MAX_ACTIVE        = WQ_MAX_ACTIVE,
        WQ_DFL_ACTIVE                = WQ_MAX_ACTIVE / 2,

        /*
         * Per-node default cap on min_active. Unless explicitly set, min_active
         * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see
         * workqueue_struct->min_active definition.
         */
        WQ_DFL_MIN_ACTIVE        = 8,
};

/*
 * System-wide workqueues which are always present.
 *
 * system_wq is the one used by schedule[_delayed]_work[_on]().
 * Multi-CPU multi-threaded.  There are users which expect relatively
 * short queue flush time.  Don't queue works which can run for too
 * long.
 *
 * system_highpri_wq is similar to system_wq but for work items which
 * require WQ_HIGHPRI.
 *
 * system_long_wq is similar to system_wq but may host long running
 * works.  Queue flushing might take relatively long.
 *
 * system_unbound_wq is an unbound workqueue.  Workers are not bound to
 * any specific CPU, not concurrency managed, and all queued works are
 * executed immediately as long as max_active limit is not reached and
 * resources are available.
 *
 * system_freezable_wq is equivalent to system_wq except that it's
 * freezable.
 *
 * *_power_efficient_wq are inclined towards saving power and converted
 * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise,
 * they are the same as their non-power-efficient counterparts - e.g.
 * system_power_efficient_wq is identical to system_wq if
 * 'wq_power_efficient' is disabled.  See WQ_POWER_EFFICIENT for more info.
 *
 * system_bh[_highpri]_wq are convenience interfaces to softirq. BH work
 * items are executed in the queueing CPU's BH context in the queueing order.
 */
extern struct workqueue_struct *system_wq;
extern struct workqueue_struct *system_highpri_wq;
extern struct workqueue_struct *system_long_wq;
extern struct workqueue_struct *system_unbound_wq;
extern struct workqueue_struct *system_freezable_wq;
extern struct workqueue_struct *system_power_efficient_wq;
extern struct workqueue_struct *system_freezable_power_efficient_wq;
extern struct workqueue_struct *system_bh_wq;
extern struct workqueue_struct *system_bh_highpri_wq;

/*
 * NOTE(review): presumably the softirq-side entry points used by BH
 * workqueues (WQ_BH) - the implementations are not visible in this header,
 * confirm against the workqueue core before relying on this.
 */
void workqueue_softirq_action(bool highpri);
void workqueue_softirq_dead(unsigned int cpu);

/**
 * alloc_workqueue - allocate a workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @...: args for @fmt
 *
 * For a per-cpu workqueue, @max_active limits the number of in-flight work
 * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be
 * executing at most one work item for the workqueue.
 *
 * For unbound workqueues, @max_active limits the number of in-flight work items
 * for the whole system. e.g. @max_active of 16 indicates that there can be
 * at most 16 work items executing for the workqueue in the whole system.
 *
 * As sharing the same active counter for an unbound workqueue across multiple
 * NUMA nodes can be expensive, @max_active is distributed to each NUMA node
 * according to the proportion of the number of online CPUs and enforced
 * independently.
 *
 * Depending on online CPU distribution, a node may end up with per-node
 * max_active which is significantly lower than @max_active, which can lead to
 * deadlocks if the per-node concurrency limit is lower than the maximum number
 * of interdependent work items for the workqueue.
 *
 * To guarantee forward progress regardless of online CPU distribution, the
 * concurrency limit on every node is guaranteed to be equal to or greater than
 * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means
 * that the sum of per-node max_active's may be larger than @max_active.
 *
 * For detailed information on %WQ_* flags, please refer to
 * Documentation/core-api/workqueue.rst.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
__printf(1, 4) struct workqueue_struct *
alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...);

#ifdef CONFIG_LOCKDEP
/**
 * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags
 * @max_active: max in-flight work items, 0 for default
 * @lockdep_map: user-defined lockdep_map
 * @...: args for @fmt
 *
 * Same as alloc_workqueue() but with a user-defined lockdep_map. Useful for
 * workqueues created with the same purpose and to avoid leaking a lockdep_map
 * on each workqueue creation.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
__printf(1, 5) struct workqueue_struct *
alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active,
                            struct lockdep_map *lockdep_map, ...);

/**
 * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with
 * user-defined lockdep_map
 *
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @lockdep_map: user-defined lockdep_map
 * @...: args for @fmt
 *
 * Ordered counterpart of alloc_workqueue_lockdep_map(): the request is
 * forwarded with WQ_UNBOUND | __WQ_ORDERED added to @flags and a max_active
 * of 1, using the caller's @lockdep_map. Useful for workqueues created with
 * the same purpose and to avoid leaking a lockdep_map on each workqueue
 * creation.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, ...)        \
        alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags),        \
                                    1, lockdep_map, ##__VA_ARGS__)
#endif

/**
 * alloc_ordered_workqueue - allocate an ordered workqueue
 * @fmt: printf format for the name of the workqueue
 * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful)
 * @...: args for @fmt
 *
 * An ordered workqueue executes at most one work item at any given time,
 * in the order the items were queued.  It is implemented as an unbound
 * workqueue (WQ_UNBOUND | __WQ_ORDERED) with @max_active of one.
 *
 * RETURNS:
 * Pointer to the allocated workqueue on success, %NULL on failure.
 */
#define alloc_ordered_workqueue(fmt, flags, ...)                        \
        alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1,        \
                        ##__VA_ARGS__)

/*
 * Legacy creation helpers.  All of them tag the workqueue with __WQ_LEGACY
 * and WQ_MEM_RECLAIM and pass @name through a "%s" format, so @name itself is
 * never interpreted as a format string.
 */

/* create_workqueue - __WQ_LEGACY | WQ_MEM_RECLAIM workqueue, max_active of 1. */
#define create_workqueue(name)                                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
/* create_freezable_workqueue - as above plus WQ_FREEZABLE | WQ_UNBOUND. */
#define create_freezable_workqueue(name)                                \
        alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |        \
                        WQ_MEM_RECLAIM, 1, (name))
/* create_singlethread_workqueue - ordered variant, via alloc_ordered_workqueue(). */
#define create_singlethread_workqueue(name)                                \
        alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

/*
 * from_work - map a work item pointer back to the object embedding it
 * @var: pointer variable whose pointee type is the embedding type; it is
 *       only inspected through typeof()
 * @callback_work: pointer to the embedded work member
 * @work_fieldname: name of that member inside the embedding type
 *
 * Expands to container_of(@callback_work, typeof(*@var), @work_fieldname).
 */
#define from_work(var, callback_work, work_fieldname)        \
        container_of(callback_work, typeof(*var), work_fieldname)

extern void destroy_workqueue(struct workqueue_struct *wq);

struct workqueue_attrs *alloc_workqueue_attrs(void);
void free_workqueue_attrs(struct workqueue_attrs *attrs);
int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs);
extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask);

extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
                        struct work_struct *work);
extern bool queue_work_node(int node, struct workqueue_struct *wq,
                            struct work_struct *work);
extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *work, unsigned long delay);
extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
                        struct delayed_work *dwork, unsigned long delay);
extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork);

extern void __flush_workqueue(struct workqueue_struct *wq);
extern void drain_workqueue(struct workqueue_struct *wq);

extern int schedule_on_each_cpu(work_func_t func);

int execute_in_process_context(work_func_t fn, struct execute_work *);

extern bool flush_work(struct work_struct *work);
extern bool cancel_work(struct work_struct *work);
extern bool cancel_work_sync(struct work_struct *work);

extern bool flush_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work(struct delayed_work *dwork);
extern bool cancel_delayed_work_sync(struct delayed_work *dwork);

extern bool disable_work(struct work_struct *work);
extern bool disable_work_sync(struct work_struct *work);
extern bool enable_work(struct work_struct *work);

extern bool disable_delayed_work(struct delayed_work *dwork);
extern bool disable_delayed_work_sync(struct delayed_work *dwork);
extern bool enable_delayed_work(struct delayed_work *dwork);

extern bool flush_rcu_work(struct rcu_work *rwork);

extern void workqueue_set_max_active(struct workqueue_struct *wq,
                                     int max_active);
extern void workqueue_set_min_active(struct workqueue_struct *wq,
                                     int min_active);
extern struct work_struct *current_work(void);
extern bool current_is_workqueue_rescuer(void);
extern bool workqueue_congested(int cpu, struct workqueue_struct *wq);
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
extern void show_all_workqueues(void);
extern void show_freezable_workqueues(void);
extern void show_one_workqueue(struct workqueue_struct *wq);
extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task);

/**
 * queue_work - queue work on a workqueue
 * @wq: workqueue to use
 * @work: work to queue
 *
 * Returns %false if @work was already on a queue, %true otherwise.
 *
 * We queue the work to the CPU on which it was submitted, but if the CPU dies
 * it can be processed by another CPU.
 *
 * Memory-ordering properties:  If it returns %true, guarantees that all stores
 * preceding the call to queue_work() in the program order will be visible from
 * the CPU which will execute @work by the time such work executes, e.g.,
 *
 * { x is initially 0 }
 *
 *   CPU0                                CPU1
 *
 *   WRITE_ONCE(x, 1);                        [ @work is being executed ]
 *   r0 = queue_work(wq, work);                  r1 = READ_ONCE(x);
 *
 * Forbids: r0 == true && r1 == 0
 */
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
        /* No explicit CPU is requested; queue_work_on() gets WORK_CPU_UNBOUND. */
        const int cpu = WORK_CPU_UNBOUND;

        return queue_work_on(cpu, wq, work);
}

/**
 * queue_delayed_work - queue work on a workqueue after delay
 * @wq: workqueue to use
 * @dwork: delayable work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
 */
static inline bool queue_delayed_work(struct workqueue_struct *wq,
                                      struct delayed_work *dwork,
                                      unsigned long delay)
{
        /* Same request as queue_delayed_work_on(), with no CPU pinned. */
        const int cpu = WORK_CPU_UNBOUND;

        return queue_delayed_work_on(cpu, wq, dwork, delay);
}

/**
 * mod_delayed_work - modify delay of or queue a delayed work
 * @wq: workqueue to use
 * @dwork: work to queue
 * @delay: number of jiffies to wait before queueing
 *
 * mod_delayed_work_on() on local CPU.
 */
static inline bool mod_delayed_work(struct workqueue_struct *wq,
                                    struct delayed_work *dwork,
                                    unsigned long delay)
{
        /* Forward to mod_delayed_work_on() without naming a specific CPU. */
        const int cpu = WORK_CPU_UNBOUND;

        return mod_delayed_work_on(cpu, wq, dwork, delay);
}

/**
 * schedule_work_on - put work task on a specific cpu
 * @cpu: cpu to put the work task on
 * @work: job to be done
 *
 * Queues @work on @cpu using the kernel-global workqueue (system_wq).
 *
 * Return: the value returned by queue_work_on() for this request.
 */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
        struct workqueue_struct *global_wq = system_wq;

        return queue_work_on(cpu, global_wq, work);
}

/**
 * schedule_work - put work task in global workqueue
 * @work: job to be done
 *
 * Returns %false if @work was already on the kernel-global workqueue and
 * %true otherwise.
 *
 * This puts a job in the kernel-global workqueue if it was not already
 * queued and leaves it in the same position on the kernel-global
 * workqueue otherwise.
 *
 * Shares the same memory-ordering properties of queue_work(), cf. the
 * DocBook header of queue_work().
 */
static inline bool schedule_work(struct work_struct *work)
{
        /* The kernel-global workqueue is system_wq. */
        struct workqueue_struct *global_wq = system_wq;

        return queue_work(global_wq, work);
}

/**
 * enable_and_queue_work - Enable and queue a work item on a specific workqueue
 * @wq: The target workqueue
 * @work: The work item to be enabled and queued
 *
 * This function combines the operations of enable_work() and queue_work(),
 * providing a convenient way to enable and queue a work item in a single call.
 * It invokes enable_work() on @work and then queues it if the disable depth
 * reached 0. Returns %true if the disable depth reached 0 and @work is queued,
 * and %false otherwise.
 *
 * Note that @work is always queued when disable depth reaches zero. If the
 * desired behavior is queueing only if certain events took place while @work is
 * disabled, the user should implement the necessary state tracking and perform
 * explicit conditional queueing after enable_work().
 */
static inline bool enable_and_queue_work(struct workqueue_struct *wq,
                                         struct work_struct *work)
{
        /* enable_work() reported false: do not queue anything. */
        if (!enable_work(work))
                return false;

        /* queue_work()'s own return value is not propagated to the caller. */
        queue_work(wq, work);
        return true;
}

/*
 * Detect attempt to flush system-wide workqueues at compile time when possible.
 * Warn attempt to flush system-wide workqueues at runtime.
 *
 * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp
 * for reasons and steps for converting system-wide workqueues into local workqueues.
 */
extern void __warn_flushing_systemwide_wq(void)
        __compiletime_warning("Please avoid flushing system-wide workqueues.");

/*
 * flush_scheduled_work - flush system_wq (deprecated)
 *
 * Please stop using this; it will be removed in the near future.  Every
 * expansion calls __warn_flushing_systemwide_wq(), which is declared with
 * __compiletime_warning(), and then flushes system_wq through
 * __flush_workqueue().
 */
#define flush_scheduled_work()                                                \
({                                                                        \
        __warn_flushing_systemwide_wq();                                \
        __flush_workqueue(system_wq);                                        \
})

/*
 * flush_workqueue - flush @wq, flagging flushes of system-wide workqueues
 *
 * @wq is evaluated once into a local.  For each system workqueue listed
 * below, if the comparison against it is a compile-time constant
 * (__builtin_constant_p()) and true, __warn_flushing_systemwide_wq() is
 * called; that function is declared with __compiletime_warning().  The flush
 * itself is always performed by __flush_workqueue().
 *
 * NOTE(review): system_bh_wq and system_bh_highpri_wq are not part of the
 * list - confirm whether that is intentional.
 */
#define flush_workqueue(wq)                                                \
({                                                                        \
        struct workqueue_struct *_wq = (wq);                                \
                                                                        \
        if ((__builtin_constant_p(_wq == system_wq) &&                        \
             _wq == system_wq) ||                                        \
            (__builtin_constant_p(_wq == system_highpri_wq) &&                \
             _wq == system_highpri_wq) ||                                \
            (__builtin_constant_p(_wq == system_long_wq) &&                \
             _wq == system_long_wq) ||                                        \
            (__builtin_constant_p(_wq == system_unbound_wq) &&                \
             _wq == system_unbound_wq) ||                                \
            (__builtin_constant_p(_wq == system_freezable_wq) &&        \
             _wq == system_freezable_wq) ||                                \
            (__builtin_constant_p(_wq == system_power_efficient_wq) &&        \
             _wq == system_power_efficient_wq) ||                        \
            (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \
             _wq == system_freezable_power_efficient_wq))                \
                __warn_flushing_systemwide_wq();                        \
        __flush_workqueue(_wq);                                                \
})

/**
 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
 * @cpu: cpu to use
 * @dwork: job to be done
 * @delay: number of jiffies to wait
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue on the specified CPU.
 */
static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
                                            unsigned long delay)
{
        /* The kernel-global workqueue is system_wq. */
        struct workqueue_struct *global_wq = system_wq;

        return queue_delayed_work_on(cpu, global_wq, dwork, delay);
}

/**
 * schedule_delayed_work - put work task in global workqueue after delay
 * @dwork: job to be done
 * @delay: number of jiffies to wait or 0 for immediate execution
 *
 * After waiting for a given time this puts a job in the kernel-global
 * workqueue.
 */
static inline bool schedule_delayed_work(struct delayed_work *dwork,
                                         unsigned long delay)
{
        /* Delegate to queue_delayed_work() on the kernel-global system_wq. */
        struct workqueue_struct *global_wq = system_wq;

        return queue_delayed_work(global_wq, dwork, delay);
}

#ifndef CONFIG_SMP
static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
        /* !CONFIG_SMP variant: @cpu is not consulted, @fn runs in place. */
        long ret = fn(arg);

        return ret;
}
static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg)
{
        /* !CONFIG_SMP variant: @cpu is not consulted, @fn runs in place. */
        long ret = fn(arg);

        return ret;
}
#else
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key);
/*
 * work_on_cpu - call work_on_cpu_key() with a per-call-site lock class key
 *
 * A new key is defined for each caller to make sure the work
 * associated with the function doesn't share its locking class.  The key is
 * a block-scope static, so every expansion site has its own object.  The
 * macro evaluates to the value returned by work_on_cpu_key().
 */
#define work_on_cpu(_cpu, _fn, _arg)                        \
({                                                        \
        static struct lock_class_key __key;                \
                                                        \
        work_on_cpu_key(_cpu, _fn, _arg, &__key);        \
})

long work_on_cpu_safe_key(int cpu, long (*fn)(void *),
                          void *arg, struct lock_class_key *key);

/*
 * work_on_cpu_safe - call work_on_cpu_safe_key() with a per-call-site key
 *
 * A new key is defined for each caller to make sure the work
 * associated with the function doesn't share its locking class.  The key is
 * a block-scope static, so every expansion site has its own object.  The
 * macro evaluates to the value returned by work_on_cpu_safe_key().
 */
#define work_on_cpu_safe(_cpu, _fn, _arg)                \
({                                                        \
        static struct lock_class_key __key;                \
                                                        \
        work_on_cpu_safe_key(_cpu, _fn, _arg, &__key);        \
})
#endif /* CONFIG_SMP */

#ifdef CONFIG_FREEZER
extern void freeze_workqueues_begin(void);
extern bool freeze_workqueues_busy(void);
extern void thaw_workqueues(void);
#endif /* CONFIG_FREEZER */

#ifdef CONFIG_SYSFS
int workqueue_sysfs_register(struct workqueue_struct *wq);
#else        /* CONFIG_SYSFS */
static inline int workqueue_sysfs_register(struct workqueue_struct *wq)
{
        /* !CONFIG_SYSFS stub: nothing is registered, 0 is always returned. */
        return 0;
}
#endif        /* CONFIG_SYSFS */

#ifdef CONFIG_WQ_WATCHDOG
void wq_watchdog_touch(int cpu);
#else        /* CONFIG_WQ_WATCHDOG */
static inline void wq_watchdog_touch(int cpu)
{
        /* !CONFIG_WQ_WATCHDOG stub: no-op. */
}
#endif        /* CONFIG_WQ_WATCHDOG */

#ifdef CONFIG_SMP
int workqueue_prepare_cpu(unsigned int cpu);
int workqueue_online_cpu(unsigned int cpu);
int workqueue_offline_cpu(unsigned int cpu);
#endif

void __init workqueue_init_early(void);
void __init workqueue_init(void);
void __init workqueue_init_topology(void);

#endif

















































































































































































































































  265 








































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM sock

#if !defined(_TRACE_SOCK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SOCK_H

#include <net/sock.h>
#include <net/ipv6.h>
#include <linux/tracepoint.h>
#include <linux/ipv6.h>
#include <linux/tcp.h>
#include <trace/events/net_probe_common.h>

/*
 * Name tables written as lists of EM()/EMe() entries, one per constant, with
 * EMe() marking the last entry.  They carry no meaning by themselves; the
 * result depends on how EM()/EMe() are defined at the point of expansion.
 */

/* Socket address families that get a symbolic name. */
#define family_names                        \
                EM(AF_INET)                                \
                EMe(AF_INET6)

/* The protocol traced by inet_sock_set_state */
#define inet_protocol_names                \
                EM(IPPROTO_TCP)                        \
                EM(IPPROTO_DCCP)                \
                EM(IPPROTO_SCTP)                \
                EMe(IPPROTO_MPTCP)

/* TCP_* state constants that get a symbolic name. */
#define tcp_state_names                        \
                EM(TCP_ESTABLISHED)                \
                EM(TCP_SYN_SENT)                \
                EM(TCP_SYN_RECV)                \
                EM(TCP_FIN_WAIT1)                \
                EM(TCP_FIN_WAIT2)                \
                EM(TCP_TIME_WAIT)                \
                EM(TCP_CLOSE)                        \
                EM(TCP_CLOSE_WAIT)                \
                EM(TCP_LAST_ACK)                \
                EM(TCP_LISTEN)                        \
                EM(TCP_CLOSING)                        \
                EMe(TCP_NEW_SYN_RECV)

/* SK_MEM_* kinds, printed by sock_exceed_buf_limit. */
#define skmem_kind_names                        \
                EM(SK_MEM_SEND)                        \
                EMe(SK_MEM_RECV)

/*
 * enums need to be exported to user space
 *
 * First pass: every EM()/EMe() entry becomes TRACE_DEFINE_ENUM(a); and the
 * four name tables are expanded right here.
 */
#undef EM
#undef EMe
#define EM(a)       TRACE_DEFINE_ENUM(a);
#define EMe(a)      TRACE_DEFINE_ENUM(a);

family_names
inet_protocol_names
tcp_state_names
skmem_kind_names

/*
 * Second definition: every entry becomes a { value, "name" } pair.  EM() adds
 * a trailing comma and EMe() does not, so a table expands to a comma-separated
 * list without a dangling comma.
 */
#undef EM
#undef EMe
#define EM(a)       { a, #a },
#define EMe(a)      { a, #a }

/*
 * show_*() helpers: pass @val and the matching name table to
 * __print_symbolic().  The tables expand with the { value, "name" }
 * definition of EM()/EMe() that is in effect at this point.
 */

/* Symbolic name lookup in family_names. */
#define show_family_name(val)                        \
        __print_symbolic(val, family_names)

/* Symbolic name lookup in inet_protocol_names. */
#define show_inet_protocol_name(val)    \
        __print_symbolic(val, inet_protocol_names)

/* Symbolic name lookup in tcp_state_names. */
#define show_tcp_state_name(val)        \
        __print_symbolic(val, tcp_state_names)

/* Symbolic name lookup in skmem_kind_names. */
#define show_skmem_kind_names(val)        \
        __print_symbolic(val, skmem_kind_names)

/*
 * sock_rcvqueue_full - record three counters for @sk and @skb
 *
 * Stores sk->sk_rmem_alloc (atomic_read), skb->truesize and sk->sk_rcvbuf
 * (READ_ONCE) as sampled when the event fires, and prints them as
 * rmem_alloc, truesize and sk_rcvbuf.
 */
TRACE_EVENT(sock_rcvqueue_full,

        TP_PROTO(struct sock *sk, struct sk_buff *skb),

        TP_ARGS(sk, skb),

        TP_STRUCT__entry(
                __field(int, rmem_alloc)
                __field(unsigned int, truesize)
                __field(int, sk_rcvbuf)
        ),

        TP_fast_assign(
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->truesize   = skb->truesize;
                __entry->sk_rcvbuf  = READ_ONCE(sk->sk_rcvbuf);
        ),

        TP_printk("rmem_alloc=%d truesize=%u sk_rcvbuf=%d",
                __entry->rmem_alloc, __entry->truesize, __entry->sk_rcvbuf)
);

/*
 * sock_exceed_buf_limit - record memory accounting state of @sk and @prot
 *
 * Stores the protocol name, the three prot->sysctl_mem[] values, the
 * @allocated argument, the values returned by sk_get_rmem0()/sk_get_wmem0(),
 * sk->sk_rmem_alloc, sk->sk_wmem_alloc, sk->sk_wmem_queued and @kind.
 * @kind is printed symbolically through show_skmem_kind_names().
 */
TRACE_EVENT(sock_exceed_buf_limit,

        TP_PROTO(struct sock *sk, struct proto *prot, long allocated, int kind),

        TP_ARGS(sk, prot, allocated, kind),

        TP_STRUCT__entry(
                __array(char, name, 32)
                __array(long, sysctl_mem, 3)
                __field(long, allocated)
                __field(int, sysctl_rmem)
                __field(int, rmem_alloc)
                __field(int, sysctl_wmem)
                __field(int, wmem_alloc)
                __field(int, wmem_queued)
                __field(int, kind)
        ),

        TP_fast_assign(
                /*
                 * Bound the copy by the destination array itself so the
                 * limit cannot drift from the __array() size above.
                 */
                strscpy(__entry->name, prot->name, sizeof(__entry->name));
                __entry->sysctl_mem[0] = READ_ONCE(prot->sysctl_mem[0]);
                __entry->sysctl_mem[1] = READ_ONCE(prot->sysctl_mem[1]);
                __entry->sysctl_mem[2] = READ_ONCE(prot->sysctl_mem[2]);
                __entry->allocated = allocated;
                __entry->sysctl_rmem = sk_get_rmem0(sk, prot);
                __entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
                __entry->sysctl_wmem = sk_get_wmem0(sk, prot);
                __entry->wmem_alloc = refcount_read(&sk->sk_wmem_alloc);
                __entry->wmem_queued = READ_ONCE(sk->sk_wmem_queued);
                __entry->kind = kind;
        ),

        TP_printk("proto:%s sysctl_mem=%ld,%ld,%ld allocated=%ld sysctl_rmem=%d rmem_alloc=%d sysctl_wmem=%d wmem_alloc=%d wmem_queued=%d kind=%s",
                __entry->name,
                __entry->sysctl_mem[0],
                __entry->sysctl_mem[1],
                __entry->sysctl_mem[2],
                __entry->allocated,
                __entry->sysctl_rmem,
                __entry->rmem_alloc,
                __entry->sysctl_wmem,
                __entry->wmem_alloc,
                __entry->wmem_queued,
                show_skmem_kind_names(__entry->kind)
        )
);

/*
 * inet_sock_set_state - record a state change of an inet socket
 *
 * Stores the socket pointer, @oldstate and @newstate, sk_family,
 * sk_protocol, and both ports converted with ntohs().  The IPv4 source and
 * destination addresses are copied as raw __be32 values into the 4-byte
 * saddr/daddr arrays.  TP_STORE_ADDRS() is defined outside this file
 * (NOTE(review): presumably it fills saddr_v6/daddr_v6 - confirm in
 * trace/events/net_probe_common.h).
 *
 * Family, protocol and both states are printed symbolically; skaddr is
 * recorded but not part of the printed format.
 */
TRACE_EVENT(inet_sock_set_state,

        TP_PROTO(const struct sock *sk, const int oldstate, const int newstate),

        TP_ARGS(sk, oldstate, newstate),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(int, oldstate)
                __field(int, newstate)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->skaddr = sk;
                __entry->oldstate = oldstate;
                __entry->newstate = newstate;

                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* Store the IPv4 addresses unchanged, as 32-bit big-endian words. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c oldstate=%s newstate=%s",
                        show_family_name(__entry->family),
                        show_inet_protocol_name(__entry->protocol),
                        __entry->sport, __entry->dport,
                        __entry->saddr, __entry->daddr,
                        __entry->saddr_v6, __entry->daddr_v6,
                        show_tcp_state_name(__entry->oldstate),
                        show_tcp_state_name(__entry->newstate))
);

/*
 * inet_sk_error_report - record sk->sk_err together with socket addressing
 *
 * Stores sk_err, sk_family, sk_protocol and both ports converted with
 * ntohs().  The IPv4 source and destination addresses are copied as raw
 * __be32 values into the 4-byte saddr/daddr arrays.  TP_STORE_ADDRS() is
 * defined outside this file (NOTE(review): presumably it fills
 * saddr_v6/daddr_v6 - confirm in trace/events/net_probe_common.h).
 *
 * Family and protocol are printed symbolically, the error as a plain integer.
 */
TRACE_EVENT(inet_sk_error_report,

        TP_PROTO(const struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(int, error)
                __field(__u16, sport)
                __field(__u16, dport)
                __field(__u16, family)
                __field(__u16, protocol)
                __array(__u8, saddr, 4)
                __array(__u8, daddr, 4)
                __array(__u8, saddr_v6, 16)
                __array(__u8, daddr_v6, 16)
        ),

        TP_fast_assign(
                const struct inet_sock *inet = inet_sk(sk);
                __be32 *p32;

                __entry->error = sk->sk_err;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->sport = ntohs(inet->inet_sport);
                __entry->dport = ntohs(inet->inet_dport);

                /* Store the IPv4 addresses unchanged, as 32-bit big-endian words. */
                p32 = (__be32 *) __entry->saddr;
                *p32 = inet->inet_saddr;

                p32 = (__be32 *) __entry->daddr;
                *p32 =  inet->inet_daddr;

                TP_STORE_ADDRS(__entry, inet->inet_saddr, inet->inet_daddr,
                               sk->sk_v6_rcv_saddr, sk->sk_v6_daddr);
        ),

        TP_printk("family=%s protocol=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 saddrv6=%pI6c daddrv6=%pI6c error=%d",
                  show_family_name(__entry->family),
                  show_inet_protocol_name(__entry->protocol),
                  __entry->sport, __entry->dport,
                  __entry->saddr, __entry->daddr,
                  __entry->saddr_v6, __entry->daddr_v6,
                  __entry->error)
);

/*
 * sk_data_ready - record which code location fired the event for @sk
 *
 * Stores the socket pointer, sk_family, sk_protocol and the value of
 * _RET_IP_.  Family and protocol are printed numerically and the stored
 * address is printed with %ps as "func"; skaddr is recorded but not part of
 * the printed format.
 */
TRACE_EVENT(sk_data_ready,

        TP_PROTO(const struct sock *sk),

        TP_ARGS(sk),

        TP_STRUCT__entry(
                __field(const void *, skaddr)
                __field(__u16, family)
                __field(__u16, protocol)
                __field(unsigned long, ip)
        ),

        TP_fast_assign(
                __entry->skaddr = sk;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->ip = _RET_IP_;
        ),

        TP_printk("family=%u protocol=%u func=%ps",
                  __entry->family, __entry->protocol, (void *)__entry->ip)
);

/*
 * sock send/recv msg length
 *
 * Event class that stores the socket pointer, sk_family, sk_protocol, @ret
 * and @flags.  @ret is split in two when printed:
 *   - "length" is @ret when it is positive and MSG_PEEK is not set in
 *     @flags, otherwise 0;
 *   - "error" is @ret when it is negative, otherwise 0.
 */
DECLARE_EVENT_CLASS(sock_msg_length,

        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags),

        TP_STRUCT__entry(
                __field(void *, sk)
                __field(__u16, family)
                __field(__u16, protocol)
                __field(int, ret)
                __field(int, flags)
        ),

        TP_fast_assign(
                __entry->sk = sk;
                __entry->family = sk->sk_family;
                __entry->protocol = sk->sk_protocol;
                __entry->ret = ret;
                __entry->flags = flags;
        ),

        TP_printk("sk address = %p, family = %s protocol = %s, length = %d, error = %d, flags = 0x%x",
                  __entry->sk, show_family_name(__entry->family),
                  show_inet_protocol_name(__entry->protocol),
                  !(__entry->flags & MSG_PEEK) ?
                  (__entry->ret > 0 ? __entry->ret : 0) : 0,
                  __entry->ret < 0 ? __entry->ret : 0,
                  __entry->flags)
);

/* sock_send_length - instance of the sock_msg_length class, same arguments. */
DEFINE_EVENT(sock_msg_length, sock_send_length,
        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags)
);

/* sock_recv_length - instance of the sock_msg_length class, same arguments. */
DEFINE_EVENT(sock_msg_length, sock_recv_length,
        TP_PROTO(struct sock *sk, int ret, int flags),

        TP_ARGS(sk, ret, flags)
);
#endif /* _TRACE_SOCK_H */

/* This part must be outside protection */
#include <trace/define_trace.h>



















































































































































































































































































































































































































































  509 








































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_CGROUP_H
#define _LINUX_CGROUP_H
/*
 *  cgroup interface
 *
 *  Copyright (C) 2003 BULL SA
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 */

#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/refcount.h>
#include <linux/kernel_stat.h>

#include <linux/cgroup-defs.h>

struct kernel_clone_args;

/*
 * All weight knobs on the default hierarchy should use the following min,
 * default and max values.  The default value is the logarithmic center of
 * MIN and MAX and allows 100x to be expressed in both directions.
 */
#define CGROUP_WEIGHT_MIN                1
#define CGROUP_WEIGHT_DFL                100
#define CGROUP_WEIGHT_MAX                10000

#ifdef CONFIG_CGROUPS

/* css_task_iter flag bits */
enum {
        /* walk only threadgroup leaders */
        CSS_TASK_ITER_PROCS    = (1U << 0),
        /* walk all threaded css_sets in the domain */
        CSS_TASK_ITER_THREADED = (1U << 1),
        /* internal flags */
        CSS_TASK_ITER_SKIPPED  = (1U << 16),
};

/*
 * Iteration state handed to css_task_iter_start(), css_task_iter_next()
 * and css_task_iter_end().  Callers should treat it as an opaque object
 * and never touch the members directly.
 */
struct css_task_iter {
        struct cgroup_subsys *ss;
        unsigned int flags;

        struct list_head *cset_pos;
        struct list_head *cset_head;

        struct list_head *tcset_pos;
        struct list_head *tcset_head;

        struct list_head *task_pos;

        struct list_head *cur_tasks_head;
        struct css_set *cur_cset;
        struct css_set *cur_dcset;
        struct task_struct *cur_task;

        /* css_set->task_iters */
        struct list_head iters_node;
};

extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern spinlock_t css_set_lock;

#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

#define SUBSYS(_x)                                                                \
        extern struct static_key_true _x ## _cgrp_subsys_enabled_key;                \
        extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
#include <linux/cgroup_subsys.h>
#undef SUBSYS

/**
 * cgroup_subsys_enabled - fast test on whether a subsys is enabled
 * @ss: subsystem in question
 *
 * @ss must be the bare identifier of the subsystem variable: "_enabled_key"
 * is token-pasted onto it to name the static key that is tested.
 */
#define cgroup_subsys_enabled(ss) static_branch_likely(&ss ## _enabled_key)

/**
 * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
 * @ss: subsystem in question
 *
 * @ss must be the bare identifier of the subsystem variable: "_on_dfl_key"
 * is token-pasted onto it to name the static key that is tested.
 */
#define cgroup_subsys_on_dfl(ss) static_branch_likely(&ss ## _on_dfl_key)

bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup,
                                         struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
                                             struct cgroup_subsys *ss);
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
                                                       struct cgroup_subsys *ss);

struct cgroup *cgroup_get_from_path(const char *path);
struct cgroup *cgroup_get_from_fd(int fd);
struct cgroup *cgroup_v1v2_get_from_fd(int fd);

int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);

int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
int cgroup_rm_cftypes(struct cftype *cfts);
void cgroup_file_notify(struct cgroup_file *cfile);
void cgroup_file_show(struct cgroup_file *cfile, bool show);

int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk);

void cgroup_fork(struct task_struct *p);
extern int cgroup_can_fork(struct task_struct *p,
                           struct kernel_clone_args *kargs);
extern void cgroup_cancel_fork(struct task_struct *p,
                               struct kernel_clone_args *kargs);
extern void cgroup_post_fork(struct task_struct *p,
                             struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);

int cgroup_init_early(void);
int cgroup_init(void);

int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v);

/*
 * Iteration helpers and macros.
 */

struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
                                           struct cgroup_subsys_state *parent);
struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
                                                    struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
                                                     struct cgroup_subsys_state *css);

struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
                                         struct cgroup_subsys_state **dst_cssp);
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
                                        struct cgroup_subsys_state **dst_cssp);

void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
                         struct css_task_iter *it);
struct task_struct *css_task_iter_next(struct css_task_iter *it);
void css_task_iter_end(struct css_task_iter *it);

/**
 * css_for_each_child - iterate through children of a css
 * @pos: the css * to use as the loop cursor
 * @parent: css whose children to walk
 *
 * Walk @parent's children.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 *
 * This expands to a for statement; @pos and @parent are evaluated on every
 * step, so neither argument should have side effects.
 */
#define css_for_each_child(pos, parent)                                        \
        for ((pos) = css_next_child(NULL, (parent)); (pos);                \
             (pos) = css_next_child((pos), (parent)))

/**
 * css_for_each_descendant_pre - pre-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Walk @css's descendants.  @css is included in the iteration and the
 * first node to be visited.  Must be called under rcu_read_lock().
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * For example, the following guarantees that a descendant can't escape
 * state updates of its ancestors.
 *
 * my_online(@css)
 * {
 *        Lock @css's parent and @css;
 *        Inherit state from the parent;
 *        Unlock both.
 * }
 *
 * my_update_state(@css)
 * {
 *        css_for_each_descendant_pre(@pos, @css) {
 *                Lock @pos;
 *                if (@pos == @css)
 *                        Update @css's state;
 *                else
 *                        Verify @pos is alive and inherit state from its parent;
 *                Unlock @pos;
 *        }
 * }
 *
 * As long as the inheriting step, including checking the parent state, is
 * enclosed inside @pos locking, double-locking the parent isn't necessary
 * while inheriting.  The state update to the parent is guaranteed to be
 * visible by walking order and, as long as inheriting operations to the
 * same @pos are atomic to each other, multiple updates racing each other
 * still result in the correct state.  It's guaranteed that at least one
 * inheritance happens for any css after the latest update to its parent.
 *
 * If checking parent's state requires locking the parent, each inheriting
 * iteration should lock and unlock both @pos->parent and @pos.
 *
 * Alternatively, a subsystem may choose to use a single global lock to
 * synchronize ->css_online() and ->css_offline() against tree-walking
 * operations.
 *
 * It is allowed to temporarily drop RCU read lock during iteration.  The
 * caller is responsible for ensuring that @pos remains accessible until
 * the start of the next iteration by, for example, bumping the css refcnt.
 */
#define css_for_each_descendant_pre(pos, css)                                \
        for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_pre((pos), (css)))

/**
 * css_for_each_descendant_post - post-order walk of a css's descendants
 * @pos: the css * to use as the loop cursor
 * @css: css whose descendants to walk
 *
 * Similar to css_for_each_descendant_pre() but performs post-order
 * traversal instead.  @css is included in the iteration and the last
 * node to be visited.
 *
 * If a subsystem synchronizes ->css_online() and the start of iteration, a
 * css which finished ->css_online() is guaranteed to be visible in the
 * future iterations and will stay visible until the last reference is put.
 * A css which hasn't finished ->css_online() or already finished
 * ->css_offline() may show up during traversal.  It's each subsystem's
 * responsibility to synchronize against on/offlining.
 *
 * Note that the walk visibility guarantee example described in pre-order
 * walk doesn't apply the same to post-order walks.
 */
#define css_for_each_descendant_post(pos, css)                                \
        for ((pos) = css_next_descendant_post(NULL, (css)); (pos);        \
             (pos) = css_next_descendant_post((pos), (css)))

/**
 * cgroup_taskset_for_each - iterate cgroup_taskset
 * @task: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * @tset may contain multiple tasks and they may belong to multiple
 * processes.
 *
 * On the v2 hierarchy, there may be tasks from multiple processes and they
 * may not share the source or destination csses.
 *
 * On traditional hierarchies, when there are multiple tasks in @tset, if a
 * task of a process is in @tset, all tasks of the process are in @tset.
 * Also, all are guaranteed to share the same source and destination csses.
 *
 * Iteration is not in any specific order.
 *
 * The address of @dst_css is passed to cgroup_taskset_first() and
 * cgroup_taskset_next() on every step, so @dst_css must be an lvalue.
 */
#define cgroup_taskset_for_each(task, dst_css, tset)                        \
        for ((task) = cgroup_taskset_first((tset), &(dst_css));                \
             (task);                                                        \
             (task) = cgroup_taskset_next((tset), &(dst_css)))

/**
 * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
 * @leader: the loop cursor
 * @dst_css: the destination css
 * @tset: taskset to iterate
 *
 * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
 * may not contain any.
 *
 * The macro walks every task in @tset and ends in a dangling "else", so the
 * statement that follows it runs only for tasks that are their own
 * ->group_leader; all other tasks hit the empty "if" branch and are skipped.
 */
#define cgroup_taskset_for_each_leader(leader, dst_css, tset)                \
        for ((leader) = cgroup_taskset_first((tset), &(dst_css));        \
             (leader);                                                        \
             (leader) = cgroup_taskset_next((tset), &(dst_css)))        \
                if ((leader) != (leader)->group_leader)                        \
                        ;                                                \
                else

/*
 * Inline functions.
 */

#ifdef CONFIG_DEBUG_CGROUP_REF
void css_get(struct cgroup_subsys_state *css);
void css_get_many(struct cgroup_subsys_state *css, unsigned int n);
bool css_tryget(struct cgroup_subsys_state *css);
bool css_tryget_online(struct cgroup_subsys_state *css);
void css_put(struct cgroup_subsys_state *css);
void css_put_many(struct cgroup_subsys_state *css, unsigned int n);
#else
#define CGROUP_REF_FN_ATTRS        static inline
#define CGROUP_REF_EXPORT(fn)
#include <linux/cgroup_refcnt.h>
#endif

static inline u64 cgroup_id(const struct cgroup *cgrp)
{
        return cgrp->kn->id;
}

/**
 * css_is_dying - test whether the specified css is dying
 * @css: target css
 *
 * Returns %true when CSS_DYING is set in @css->flags, i.e. @css is being
 * offlined, is already offline, or is merely scheduled to be offlined.
 *
 * The ->css_online() and ->css_offline() callbacks are sufficient in most
 * cases.  The actual offline operations are RCU delayed though, so a css
 * can still look alive after cgroup removal has scheduled its offlining.
 * Use this test when that delay would affect user visible semantics, e.g.
 * when behavior has to be synchronous with respect to cgroup removal.
 */
static inline bool css_is_dying(struct cgroup_subsys_state *css)
{
        return (css->flags & CSS_DYING) != 0;
}

/* Take a reference on @cgrp through its embedded self css. */
static inline void cgroup_get(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self_css = &cgrp->self;

        css_get(self_css);
}

/*
 * Try to take a reference on @cgrp through its embedded self css.
 * Returns the result of css_tryget().
 */
static inline bool cgroup_tryget(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self_css = &cgrp->self;

        return css_tryget(self_css);
}

/* Drop a reference on @cgrp through its embedded self css. */
static inline void cgroup_put(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *self_css = &cgrp->self;

        css_put(self_css);
}

extern struct mutex cgroup_mutex;

/* Acquire the global cgroup_mutex; pair with cgroup_unlock(). */
static inline void cgroup_lock(void)
{
        mutex_lock(&cgroup_mutex);
}

/* Release the global cgroup_mutex taken by cgroup_lock(). */
static inline void cgroup_unlock(void)
{
        mutex_unlock(&cgroup_mutex);
}

/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * A task's css_set is RCU protected, initialized and exited while holding
 * task_lock(), and can only be modified while holding both cgroup_mutex
 * and task_lock() while the task is alive.  This macro verifies that the
 * caller is inside proper critical section and returns @task's css_set.
 *
 * The caller can also specify additional allowed conditions via @__c, such
 * as locks used during the cgroup_subsys::attach() methods.
 *
 * The verification only exists with CONFIG_PROVE_RCU.  Without it the
 * macro is a plain rcu_dereference() and @__c is not evaluated at all, so
 * @__c must not have side effects.
 */
#ifdef CONFIG_PROVE_RCU
#define task_css_set_check(task, __c)                                        \
        rcu_dereference_check((task)->cgroups,                                \
                rcu_read_lock_sched_held() ||                                \
                lockdep_is_held(&cgroup_mutex) ||                        \
                lockdep_is_held(&css_set_lock) ||                        \
                ((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c)                                        \
        rcu_dereference((task)->cgroups)
#endif

/**
 * task_css_check - obtain css for (task, subsys) w/ extra access conds
 * @task: the target task
 * @subsys_id: the target subsystem ID
 * @__c: extra condition expression to be passed to rcu_dereference_check()
 *
 * Return the cgroup_subsys_state for the (@task, @subsys_id) pair.  The
 * synchronization rules are the same as task_css_set_check().
 *
 * @subsys_id is used directly as the index into the css_set's ->subsys[]
 * array; no range check is performed here.
 */
#define task_css_check(task, subsys_id, __c)                                \
        task_css_set_check((task), (__c))->subsys[(subsys_id)]

/**
 * task_css_set - obtain a task's css_set
 * @task: the task to obtain css_set for
 *
 * Same as task_css_set_check() with no extra access condition allowed.
 */
static inline struct css_set *task_css_set(struct task_struct *task)
{
        struct css_set *cset = task_css_set_check(task, false);

        return cset;
}

/**
 * task_css - obtain css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * See task_css_check().
 */
static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
                                                   int subsys_id)
{
        return task_css_check(task, subsys_id, false);
}

/**
 * task_get_css - find and get the css for (task, subsys)
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Look up the css for the (@task, @subsys_id) combination and take a
 * reference on it, retrying until the reference is obtained.  A valid css
 * is always returned, but it may already have been offlined.
 */
static inline struct cgroup_subsys_state *
task_get_css(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *css;

        rcu_read_lock();
        css = task_css(task, subsys_id);
        /*
         * css_tryget_online() must not be used here.  A task with
         * PF_EXITING set may stay associated with an offline css; if such
         * a task called this function, css_tryget_online() would fail
         * forever and the loop would never terminate.
         */
        while (!likely(css_tryget(css))) {
                cpu_relax();
                /* the task may have moved, look the css up again */
                css = task_css(task, subsys_id);
        }
        rcu_read_unlock();

        return css;
}

/**
 * task_css_is_root - test whether a task belongs to the root css
 * @task: the target task
 * @subsys_id: the target subsystem ID
 *
 * Compare @task's css for @subsys_id against the one recorded in
 * init_css_set.  May be invoked in any context.
 */
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
        struct cgroup_subsys_state *root_css = init_css_set.subsys[subsys_id];

        return task_css_check(task, subsys_id, true) == root_css;
}

static inline struct cgroup *task_cgroup(struct task_struct *task,
                                         int subsys_id)
{
        return task_css(task, subsys_id)->cgroup;
}

static inline struct cgroup *task_dfl_cgroup(struct task_struct *task)
{
        return task_css_set(task)->dfl_cgrp;
}

/*
 * Return the cgroup that embeds the parent of @cgrp's self css, or NULL
 * when the self css has no parent.
 */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        struct cgroup_subsys_state *pcss = cgrp->self.parent;

        return pcss ? container_of(pcss, struct cgroup, self) : NULL;
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Returns %true if @cgrp is a descendant of @ancestor or is @ancestor
 * itself.  Safe to call as long as both @cgrp and @ancestor are
 * accessible.
 */
static inline bool cgroup_is_descendant(struct cgroup *cgrp,
                                        struct cgroup *ancestor)
{
        /* cgroups on different hierarchies are never related */
        if (cgrp->root != ancestor->root)
                return false;

        /* an ancestor can't sit deeper than its descendant */
        if (cgrp->level < ancestor->level)
                return false;

        return cgrp->ancestors[ancestor->level] == ancestor;
}

/**
 * cgroup_ancestor - find ancestor of cgroup
 * @cgrp: cgroup to find ancestor of
 * @ancestor_level: level of ancestor to find starting from root
 *
 * Return the entry of @cgrp->ancestors[] at @ancestor_level, counted from
 * the root.  Return NULL if @ancestor_level is negative or greater than
 * @cgrp->level, i.e. @cgrp has no ancestor at that level.
 *
 * This function is safe to call as long as @cgrp is accessible.
 */
static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
{
        if (ancestor_level >= 0 && ancestor_level <= cgrp->level)
                return cgrp->ancestors[ancestor_level];

        return NULL;
}

/**
 * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry
 * @task: the task to be tested
 * @ancestor: possible ancestor of @task's cgroup
 *
 * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor.
 * It follows all the same rules as cgroup_is_descendant, and only applies
 * to the default hierarchy.
 */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        struct css_set *cset = task_css_set(task);

        return cgroup_is_descendant(cset->dfl_cgrp, ancestor);
}

/*
 * Returns true when the sum of @cgrp's populated counters is non-zero.
 * The counters are read without any synchronization, so the result can
 * only be used as a hint.
 */
static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
        return (cgrp->nr_populated_csets +
                cgrp->nr_populated_domain_children +
                cgrp->nr_populated_threaded_children) != 0;
}

/* Return the inode number of the kernfs node backing @cgrp. */
static inline ino_t cgroup_ino(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_ino(kn);
}

/* cft/css accessors for cftype->write() operation */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
        return of->kn->priv;
}

struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);

/*
 * cft/css accessors for cftype->seq_*() operations.
 * seq_cft() treats @seq->private as the kernfs_open_file and hands it to
 * of_cft().
 */
static inline struct cftype *seq_cft(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_cft(of);
}

/* Treat @seq->private as the kernfs_open_file and hand it to of_css(). */
static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
        struct kernfs_open_file *of = seq->private;

        return of_css(of);
}

/*
 * Name / path handling functions.  All are thin wrappers around the kernfs
 * counterparts and can be called under any context.
 */

/* kernfs_name() on @cgrp's kernfs node; @buf and @buflen pass through. */
static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_name(kn, buf, buflen);
}

/* kernfs_path() on @cgrp's kernfs node; @buf and @buflen pass through. */
static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen)
{
        struct kernfs_node *kn = cgrp->kn;

        return kernfs_path(kn, buf, buflen);
}

/* pr_cont_kernfs_name() on @cgrp's kernfs node. */
static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_name(kn);
}

/* pr_cont_kernfs_path() on @cgrp's kernfs node. */
static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
{
        struct kernfs_node *kn = cgrp->kn;

        pr_cont_kernfs_path(kn);
}

bool cgroup_psi_enabled(void);

static inline void cgroup_init_kthreadd(void)
{
        /*
         * kthreadd is inherited by all kthreads, keep it in the root so
         * that the new kthreads are guaranteed to stay in the root until
         * initialization is finished.
         */
        current->no_cgroup_migration = 1;
}

static inline void cgroup_kthread_ready(void)
{
        /*
         * This kthread finished initialization.  The creator should have
         * set PF_NO_SETAFFINITY if this kthread should stay in the root.
         */
        current->no_cgroup_migration = 0;
}

void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
struct cgroup *cgroup_get_from_id(u64 id);
#else /* !CONFIG_CGROUPS */

/*
 * CONFIG_CGROUPS is disabled: the structures are only forward declared and
 * the functions below are stubs so that callers build without #ifdefs.
 */
struct cgroup_subsys_state;
struct cgroup;

/* every cgroup reports the same fixed id */
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
/* reference counting and locking are no-ops */
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
static inline void cgroup_lock(void) {}
static inline void cgroup_unlock(void) {}
/* nothing to attach; reports success */
static inline int cgroup_attach_task_all(struct task_struct *from,
                                         struct task_struct *t) { return 0; }
/* no stats can be built; always fails with -EINVAL */
static inline int cgroupstats_build(struct cgroupstats *stats,
                                    struct dentry *dentry) { return -EINVAL; }

/* fork/exit hooks: no-ops, cgroup_can_fork() always allows the fork */
static inline void cgroup_fork(struct task_struct *p) {}
static inline int cgroup_can_fork(struct task_struct *p,
                                  struct kernel_clone_args *kargs) { return 0; }
static inline void cgroup_cancel_fork(struct task_struct *p,
                                      struct kernel_clone_args *kargs) {}
static inline void cgroup_post_fork(struct task_struct *p,
                                    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}

/* initialization hooks: no-ops that report success */
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
static inline void cgroup_init_kthreadd(void) {}
static inline void cgroup_kthread_ready(void) {}

/* no cgroup has a parent */
static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
        return NULL;
}

/* PSI accounting for cgroups is never enabled */
static inline bool cgroup_psi_enabled(void)
{
        return false;
}

/* every task is reported as being under any @ancestor */
static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                               struct cgroup *ancestor)
{
        return true;
}

/* no path to report; @buf is left untouched */
static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
{}
#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUPS
/*
 * cgroup scalable recursive statistics.
 */
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
void cgroup_rstat_flush(struct cgroup *cgrp);

/*
 * Basic resource stats.
 */
#ifdef CONFIG_CGROUP_CPUACCT
void cpuacct_charge(struct task_struct *tsk, u64 cputime);
void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
#else
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
static inline void cpuacct_account_field(struct task_struct *tsk, int index,
                                         u64 val) {}
#endif

void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec);
void __cgroup_account_cputime_field(struct cgroup *cgrp,
                                    enum cpu_usage_stat index, u64 delta_exec);

/*
 * Charge @delta_exec of CPU time consumed by @task: first to cpuacct, then,
 * if @task's default hierarchy cgroup has a parent, to that cgroup through
 * __cgroup_account_cputime().
 */
static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec)
{
        struct cgroup *dfl_cgrp;

        cpuacct_charge(task, delta_exec);

        dfl_cgrp = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl_cgrp))
                return;

        __cgroup_account_cputime(dfl_cgrp, delta_exec);
}

/*
 * Charge @delta_exec to the @index cpu usage field for @task: first to
 * cpuacct, then, if @task's default hierarchy cgroup has a parent, to that
 * cgroup through __cgroup_account_cputime_field().
 */
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec)
{
        struct cgroup *dfl_cgrp;

        cpuacct_account_field(task, index, delta_exec);

        dfl_cgrp = task_dfl_cgroup(task);
        if (!cgroup_parent(dfl_cgrp))
                return;

        __cgroup_account_cputime_field(dfl_cgrp, index, delta_exec);
}

#else        /* !CONFIG_CGROUPS */

static inline void cgroup_account_cputime(struct task_struct *task,
                                          u64 delta_exec) {}
static inline void cgroup_account_cputime_field(struct task_struct *task,
                                                enum cpu_usage_stat index,
                                                u64 delta_exec) {}

#endif        /* CONFIG_CGROUPS */

/*
 * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
 * definition in cgroup-defs.h.
 */
#ifdef CONFIG_SOCK_CGROUP_DATA

void cgroup_sk_alloc(struct sock_cgroup_data *skcd);
void cgroup_sk_clone(struct sock_cgroup_data *skcd);
void cgroup_sk_free(struct sock_cgroup_data *skcd);

/* Return the cgroup pointer stored in @skcd. */
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd)
{
        struct cgroup *cgrp = skcd->cgroup;

        return cgrp;
}

#else        /* !CONFIG_SOCK_CGROUP_DATA */

static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {}
static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}

#endif        /* CONFIG_SOCK_CGROUP_DATA */

/*
 * A cgroup namespace.  The reference count lives in ns.count and is
 * managed with get_cgroup_ns() / put_cgroup_ns().
 */
struct cgroup_namespace {
        struct ns_common ns;
        struct user_namespace *user_ns;
        struct ucounts *ucounts;
        struct css_set *root_cset;
};

extern struct cgroup_namespace init_cgroup_ns;

#ifdef CONFIG_CGROUPS

void free_cgroup_ns(struct cgroup_namespace *ns);

struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
                                        struct user_namespace *user_ns,
                                        struct cgroup_namespace *old_ns);

int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
                   struct cgroup_namespace *ns);

#else /* !CONFIG_CGROUPS */

/* Without CONFIG_CGROUPS there is nothing to free. */
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
/*
 * Without CONFIG_CGROUPS no new namespace is created: @old_ns is handed
 * back unchanged and @flags and @user_ns are ignored.
 */
static inline struct cgroup_namespace *
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
               struct cgroup_namespace *old_ns)
{
        return old_ns;
}

#endif /* !CONFIG_CGROUPS */

/* Take a reference on @ns; a NULL @ns is ignored. */
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        refcount_inc(&ns->ns.count);
}

/*
 * Drop a reference on @ns and free it with free_cgroup_ns() once the count
 * reaches zero.  A NULL @ns is ignored.
 */
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
{
        if (!ns)
                return;

        if (refcount_dec_and_test(&ns->ns.count))
                free_cgroup_ns(ns);
}

#ifdef CONFIG_CGROUPS

void cgroup_enter_frozen(void);
void cgroup_leave_frozen(bool always_leave);
void cgroup_update_frozen(struct cgroup *cgrp);
void cgroup_freeze(struct cgroup *cgrp, bool freeze);
void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src,
                                 struct cgroup *dst);

/* Report whether @task's ->frozen flag is set. */
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return task->frozen != 0;
}

#else /* !CONFIG_CGROUPS */

/* Without CONFIG_CGROUPS freezing is a no-op and no task is ever frozen. */
static inline void cgroup_enter_frozen(void) { }
static inline void cgroup_leave_frozen(bool always_leave) { }
static inline bool cgroup_task_frozen(struct task_struct *task)
{
        return false;
}

#endif /* !CONFIG_CGROUPS */

#ifdef CONFIG_CGROUP_BPF
/* Take a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_get(ref);
}

/* Drop a reference on @cgrp's bpf percpu refcount. */
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
        struct percpu_ref *ref = &cgrp->bpf.refcnt;

        percpu_ref_put(ref);
}

#else /* CONFIG_CGROUP_BPF */

static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}

#endif /* CONFIG_CGROUP_BPF */

struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id);

struct cgroup_of_peak *of_peak(struct kernfs_open_file *of);

#endif /* _LINUX_CGROUP_H */







































   89 





 1098 



























































































































   34 
























































































































































































































































  237 
















































   35 

   34 












































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/uaccess.h
 *
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_UACCESS_H
#define __ASM_UACCESS_H

#include <asm/alternative.h>
#include <asm/kernel-pgtable.h>
#include <asm/sysreg.h>

/*
 * User space memory access functions
 */
#include <linux/bitops.h>
#include <linux/kasan-checks.h>
#include <linux/string.h>

#include <asm/asm-extable.h>
#include <asm/cpufeature.h>
#include <asm/mmu.h>
#include <asm/mte.h>
#include <asm/ptrace.h>
#include <asm/memory.h>
#include <asm/extable.h>

static inline int __access_ok(const void __user *ptr, unsigned long size);

/*
 * access_ok() - check that a user range lies inside the user address space.
 * @addr: start of the range; the pointer may be untagged before the check
 * @size: length of the range in bytes
 *
 * Returns 1 if the range is valid, 0 otherwise.
 *
 * The check is equivalent to:
 * (u65)addr + (u65)size <= (u65)TASK_SIZE_MAX
 */
static inline int access_ok(const void __user *addr, unsigned long size)
{
        /*
         * A kernel thread performing asynchronous I/O does not carry the
         * TIF_TAGGED_ADDR flag of the process that owns the mm, so kernel
         * threads always get the address untagged before it is checked.
         */
        if (IS_ENABLED(CONFIG_ARM64_TAGGED_ADDR_ABI)) {
                if ((current->flags & PF_KTHREAD) ||
                    test_thread_flag(TIF_TAGGED_ADDR))
                        addr = untagged_addr(addr);
        }

        return likely(__access_ok(addr, size));
}
#define access_ok access_ok

#include <asm-generic/access_ok.h>

/*
 * User access enabling/disabling.
 */
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
 * Point TTBR0_EL1 at the reserved page directory and clear the ASID field of
 * TTBR1_EL1.  The whole sequence runs with local interrupts disabled and is
 * followed by an ISB before interrupts are restored.
 */
static inline void __uaccess_ttbr0_disable(void)
{
        unsigned long flags, ttbr;

        local_irq_save(flags);
        ttbr = read_sysreg(ttbr1_el1);
        ttbr &= ~TTBR_ASID_MASK;
        /* reserved_pg_dir placed before swapper_pg_dir */
        write_sysreg(ttbr - RESERVED_SWAPPER_OFFSET, ttbr0_el1);
        /* Set reserved ASID */
        write_sysreg(ttbr, ttbr1_el1);
        isb();
        local_irq_restore(flags);
}

/*
 * Load the current thread's saved ttbr0 value into TTBR0_EL1 and copy its
 * ASID bits into TTBR1_EL1.  TTBR1_EL1 is written before TTBR0_EL1, and a
 * single ISB follows both writes.
 */
static inline void __uaccess_ttbr0_enable(void)
{
        unsigned long flags, ttbr0, ttbr1;

        /*
         * Disable interrupts to avoid preemption between reading the 'ttbr0'
         * variable and the MSR. A context switch could trigger an ASID
         * roll-over and an update of 'ttbr0'.
         */
        local_irq_save(flags);
        ttbr0 = READ_ONCE(current_thread_info()->ttbr0);

        /* Restore active ASID */
        ttbr1 = read_sysreg(ttbr1_el1);
        ttbr1 &= ~TTBR_ASID_MASK;                /* safety measure */
        ttbr1 |= ttbr0 & TTBR_ASID_MASK;
        write_sysreg(ttbr1, ttbr1_el1);

        /* Restore user page table */
        write_sysreg(ttbr0, ttbr0_el1);
        isb();
        local_irq_restore(flags);
}

/*
 * Run __uaccess_ttbr0_disable() when system_uses_ttbr0_pan() reports true.
 *
 * Returns true if the TTBR0 switch was performed, false if it was skipped.
 */
static inline bool uaccess_ttbr0_disable(void)
{
        if (system_uses_ttbr0_pan()) {
                __uaccess_ttbr0_disable();
                return true;
        }

        return false;
}

/*
 * Run __uaccess_ttbr0_enable() when system_uses_ttbr0_pan() reports true.
 *
 * Returns true if the TTBR0 switch was performed, false if it was skipped.
 */
static inline bool uaccess_ttbr0_enable(void)
{
        if (system_uses_ttbr0_pan()) {
                __uaccess_ttbr0_enable();
                return true;
        }

        return false;
}
#else
/*
 * Stub used when CONFIG_ARM64_SW_TTBR0_PAN is not set: no TTBR0 switch is
 * done, and false is returned to say so.
 */
static inline bool uaccess_ttbr0_disable(void)
{
        return false;
}

/*
 * Stub used when CONFIG_ARM64_SW_TTBR0_PAN is not set: no TTBR0 switch is
 * done, and false is returned to say so.
 */
static inline bool uaccess_ttbr0_enable(void)
{
        return false;
}
#endif

/*
 * Emit an ALTERNATIVE() sequence whose default instruction is "nop" and whose
 * replacement is SET_PSTATE_PAN(0), keyed on ARM64_HAS_PAN and
 * CONFIG_ARM64_PAN.
 */
static inline void __uaccess_disable_hw_pan(void)
{
        asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN,
                        CONFIG_ARM64_PAN));
}

/*
 * Emit an ALTERNATIVE() sequence whose default instruction is "nop" and whose
 * replacement is SET_PSTATE_PAN(1), keyed on ARM64_HAS_PAN and
 * CONFIG_ARM64_PAN.
 */
static inline void __uaccess_enable_hw_pan(void)
{
        asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN,
                        CONFIG_ARM64_PAN));
}

/*
 * Close the privileged user-access window: call mte_disable_tco(), then
 * disable access through the TTBR0 switch or, when that switch is not in use,
 * by enabling hardware PAN.
 */
static inline void uaccess_disable_privileged(void)
{
        mte_disable_tco();

        /* Hardware PAN is only touched when the TTBR0 switch was skipped. */
        if (!uaccess_ttbr0_disable())
                __uaccess_enable_hw_pan();
}

/*
 * Open the privileged user-access window: call mte_enable_tco(), then
 * enable access through the TTBR0 switch or, when that switch is not in use,
 * by disabling hardware PAN.
 */
static inline void uaccess_enable_privileged(void)
{
        mte_enable_tco();

        /* Hardware PAN is only touched when the TTBR0 switch was skipped. */
        if (!uaccess_ttbr0_enable())
                __uaccess_disable_hw_pan();
}

/*
 * Sanitize a uaccess pointer such that it cannot reach any kernel address.
 *
 * Clearing bit 55 ensures the pointer cannot address any portion of the TTBR1
 * address range (i.e. any kernel address), and either the pointer falls within
 * the TTBR0 address range or must cause a fault.
 *
 * uaccess_mask_ptr() is the type-preserving form: it casts the masked result
 * back to the type of its argument.  __uaccess_mask_ptr() takes and returns
 * untyped user pointers.
 */
#define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr)
static inline void __user *__uaccess_mask_ptr(const void __user *ptr)
{
        void __user *safe_ptr;

        /* BIC: safe_ptr = ptr with the BIT(55) immediate cleared. */
        asm volatile(
        "        bic        %0, %1, %2\n"
        : "=r" (safe_ptr)
        : "r" (ptr),
          "i" (BIT(55))
        );

        return safe_ptr;
}

/*
 * The "__xxx" versions of the user access functions do not verify the address
 * space - it must have been done previously with a separate "access_ok()"
 * call.
 *
 * The "__xxx_error" versions set the third argument to -EFAULT if an error
 * occurs, and leave it unchanged on success.
 */
#ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
/*
 * __get_mem_asm() - emit one load of @x from [@addr].
 * @load:  load mnemonic (string literal)
 * @reg:   operand-modifier prefix such as "%w" or "%x", pasted before the
 *         operand number
 * @x:     lvalue that receives the loaded value
 * @addr:  address to load from
 * @label: C label used as the exception-table target
 * @type:  token pasted into the _ASM_EXTABLE_<type>ACCESS macro name
 *
 * This variant uses asm_goto_output(): the exception-table entry for the
 * load at local label 1 names @label (%l2) directly as its target.
 */
#define __get_mem_asm(load, reg, x, addr, label, type)                        \
        asm_goto_output(                                                \
        "1:        " load "        " reg "0, [%1]\n"                        \
        _ASM_EXTABLE_##type##ACCESS(1b, %l2)                                \
        : "=r" (x)                                                        \
        : "r" (addr) : : label)
#else
/*
 * Fallback for compilers without asm goto outputs.  Same parameters as the
 * variant above.  A local error variable starts at 0 and is passed to the
 * _ASM_EXTABLE_<type>ACCESS_ERR_ZERO entry together with the destination
 * register; after the asm statement, a non-zero error jumps to @label.
 * NOTE(review): the ERR_ZERO name suggests the fixup also zeroes the
 * destination register - confirm against asm-extable.h.
 */
#define __get_mem_asm(load, reg, x, addr, label, type) do {                \
        int __gma_err = 0;                                                \
        asm volatile(                                                        \
        "1:        " load "        " reg "1, [%2]\n"                        \
        "2:\n"                                                                \
        _ASM_EXTABLE_##type##ACCESS_ERR_ZERO(1b, 2b, %w0, %w1)                \
        : "+r" (__gma_err), "=r" (x)                                        \
        : "r" (addr));                                                        \
        if (__gma_err) goto label; } while (0)
#endif

/*
 * __raw_get_mem() - size-dispatched load through __get_mem_asm().
 * @ldr:   base load mnemonic; "b"/"h" is appended for 1- and 2-byte accesses
 * @x:     lvalue that receives the result
 * @ptr:   pointer whose pointee size (1, 2, 4 or 8 bytes) selects the access
 * @label: C label jumped to when the load faults
 * @type:  forwarded to __get_mem_asm()
 *
 * The value is loaded into an unsigned long temporary and then cast to the
 * pointee type of @ptr before being assigned to @x.  Any other pointee size
 * hits BUILD_BUG().
 */
#define __raw_get_mem(ldr, x, ptr, label, type)                                        \
do {                                                                                \
        unsigned long __gu_val;                                                        \
        switch (sizeof(*(ptr))) {                                                \
        case 1:                                                                        \
                __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), label, type);        \
                break;                                                                \
        case 2:                                                                        \
                __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), label, type);        \
                break;                                                                \
        case 4:                                                                        \
                __get_mem_asm(ldr, "%w", __gu_val, (ptr), label, type);                \
                break;                                                                \
        case 8:                                                                        \
                __get_mem_asm(ldr, "%x",  __gu_val, (ptr), label, type);        \
                break;                                                                \
        default:                                                                \
                BUILD_BUG();                                                        \
        }                                                                        \
        (x) = (__force __typeof__(*(ptr)))__gu_val;                                \
} while (0)

/*
 * We must not call into the scheduler between uaccess_ttbr0_enable() and
 * uaccess_ttbr0_disable(). As `x` and `ptr` could contain blocking functions,
 * we must evaluate these outside of the critical section.
 *
 * To that end @ptr is copied into a local before the window is opened, and
 * the loaded value is held in a local and only assigned to @x after
 * uaccess_ttbr0_disable().  The load uses "ldtr" with the U extable type.
 * On a fault the window is closed first and control then jumps to @label;
 * @x is not written on that path.
 */
#define __raw_get_user(x, ptr, label)                                        \
do {                                                                        \
        __typeof__(*(ptr)) __user *__rgu_ptr = (ptr);                        \
        __typeof__(x) __rgu_val;                                        \
        __chk_user_ptr(ptr);                                                \
        do {                                                                \
                __label__ __rgu_failed;                                        \
                uaccess_ttbr0_enable();                                        \
                __raw_get_mem("ldtr", __rgu_val, __rgu_ptr, __rgu_failed, U);        \
                uaccess_ttbr0_disable();                                \
                (x) = __rgu_val;                                        \
                break;                                                        \
        __rgu_failed:                                                        \
                uaccess_ttbr0_disable();                                \
                goto label;                                                \
        } while (0);                                                        \
} while (0)

/*
 * __get_user_error() - checked read of one value from user space.
 * @x:   lvalue that receives the value
 * @ptr: user pointer to read from
 * @err: lvalue set to -EFAULT on failure and left untouched on success
 *
 * The pointer is range-checked with access_ok() and then passed through
 * uaccess_mask_ptr() before the load.  A failed range check and a faulting
 * load share one failure path (the __gu_failed label sits inside the else
 * branch), which zeroes @x and sets @err.
 */
#define __get_user_error(x, ptr, err)                                        \
do {                                                                        \
        __label__ __gu_failed;                                                \
        __typeof__(*(ptr)) __user *__p = (ptr);                                \
        might_fault();                                                        \
        if (access_ok(__p, sizeof(*__p))) {                                \
                __p = uaccess_mask_ptr(__p);                                \
                __raw_get_user((x), __p, __gu_failed);                        \
        } else {                                                        \
        __gu_failed:                                                        \
                (x) = (__force __typeof__(x))0; (err) = -EFAULT;        \
        }                                                                \
} while (0)

/*
 * __get_user() - statement-expression wrapper around __get_user_error().
 *
 * Evaluates to 0 on success, or to -EFAULT when __get_user_error() reports a
 * failure.  get_user is defined as an alias of __get_user.
 */
#define __get_user(x, ptr)                                                \
({                                                                        \
        int __gu_err = 0;                                                \
        __get_user_error((x), (ptr), __gu_err);                                \
        __gu_err;                                                        \
})

#define get_user        __get_user

/*
 * We must not call into the scheduler between __mte_enable_tco_async() and
 * __mte_disable_tco_async(). As `dst` and `src` may contain blocking
 * functions, we must evaluate these outside of the critical section.
 *
 * @dst and @src are therefore copied into locals first.  The load uses "ldr"
 * with the K extable type and stores into *(type *)dst.  On a fault,
 * __mte_disable_tco_async() is called before jumping to @err_label.
 */
#define __get_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        __typeof__(dst) __gkn_dst = (dst);                                \
        __typeof__(src) __gkn_src = (src);                                \
        do {                                                                 \
                __label__ __gkn_label;                                        \
                                                                        \
                __mte_enable_tco_async();                                \
                __raw_get_mem("ldr", *((type *)(__gkn_dst)),                \
                      (__force type *)(__gkn_src), __gkn_label, K);        \
                __mte_disable_tco_async();                                \
                break;                                                        \
        __gkn_label:                                                        \
                __mte_disable_tco_async();                                \
                goto err_label;                                                \
        } while (0);                                                        \
} while (0)

/*
 * __put_mem_asm() - emit one store of @x to [@addr] using asm goto.
 * @store: store mnemonic (string literal)
 * @reg:   operand-modifier prefix such as "%w" or "%x"
 * @x:     value to store ("rZ" constraint)
 * @addr:  address to store to
 * @label: C label used as the exception-table target (%l2)
 * @type:  token pasted into the _ASM_EXTABLE_<type>ACCESS macro name
 *
 * The exception-table entry covers the store at local label 1.  Local label
 * 2 is emitted but is not referenced by that entry.
 */
#define __put_mem_asm(store, reg, x, addr, label, type)                        \
        asm goto(                                                        \
        "1:        " store "        " reg "0, [%1]\n"                        \
        "2:\n"                                                                \
        _ASM_EXTABLE_##type##ACCESS(1b, %l2)                                \
        : : "rZ" (x), "r" (addr) : : label)

/*
 * __raw_put_mem() - size-dispatched store through __put_mem_asm().
 * @str:   base store mnemonic; "b"/"h" is appended for 1- and 2-byte accesses
 * @x:     value to store; converted once to the pointee type of @ptr
 * @ptr:   destination pointer whose pointee size (1, 2, 4 or 8 bytes)
 *         selects the access
 * @label: C label jumped to when the store faults
 * @type:  forwarded to __put_mem_asm()
 *
 * Any other pointee size hits BUILD_BUG().
 */
#define __raw_put_mem(str, x, ptr, label, type)                                        \
do {                                                                                \
        __typeof__(*(ptr)) __pu_val = (x);                                        \
        switch (sizeof(*(ptr))) {                                                \
        case 1:                                                                        \
                __put_mem_asm(str "b", "%w", __pu_val, (ptr), label, type);        \
                break;                                                                \
        case 2:                                                                        \
                __put_mem_asm(str "h", "%w", __pu_val, (ptr), label, type);        \
                break;                                                                \
        case 4:                                                                        \
                __put_mem_asm(str, "%w", __pu_val, (ptr), label, type);                \
                break;                                                                \
        case 8:                                                                        \
                __put_mem_asm(str, "%x", __pu_val, (ptr), label, type);                \
                break;                                                                \
        default:                                                                \
                BUILD_BUG();                                                        \
        }                                                                        \
} while (0)

/*
 * We must not call into the scheduler between uaccess_ttbr0_enable() and
 * uaccess_ttbr0_disable(). As `x` and `ptr` could contain blocking functions,
 * we must evaluate these outside of the critical section.
 *
 * Both @x and @ptr are therefore copied into locals before the window is
 * opened.  The store uses "sttr" with the U extable type.  On a fault the
 * window is closed first and control then jumps to @label.
 */
#define __raw_put_user(x, ptr, label)                                        \
do {                                                                        \
        __label__ __rpu_failed;                                                \
        __typeof__(*(ptr)) __user *__rpu_ptr = (ptr);                        \
        __typeof__(*(ptr)) __rpu_val = (x);                                \
        __chk_user_ptr(__rpu_ptr);                                        \
                                                                        \
        do {                                                                \
                uaccess_ttbr0_enable();                                        \
                __raw_put_mem("sttr", __rpu_val, __rpu_ptr, __rpu_failed, U);        \
                uaccess_ttbr0_disable();                                \
                break;                                                        \
        __rpu_failed:                                                        \
                uaccess_ttbr0_disable();                                \
                goto label;                                                \
        } while (0);                                                        \
} while (0)

/*
 * __put_user_error() - checked write of one value to user space.
 * @x:   value to write
 * @ptr: user pointer to write to
 * @err: lvalue set to -EFAULT on failure and left untouched on success
 *
 * The pointer is range-checked with access_ok() and then passed through
 * uaccess_mask_ptr() before the store.  A failed range check and a faulting
 * store share one failure path (the __pu_failed label sits inside the else
 * branch).
 */
#define __put_user_error(x, ptr, err)                                        \
do {                                                                        \
        __label__ __pu_failed;                                                \
        __typeof__(*(ptr)) __user *__p = (ptr);                                \
        might_fault();                                                        \
        if (access_ok(__p, sizeof(*__p))) {                                \
                __p = uaccess_mask_ptr(__p);                                \
                __raw_put_user((x), __p, __pu_failed);                        \
        } else        {                                                        \
        __pu_failed:                                                        \
                (err) = -EFAULT;                                        \
        }                                                                \
} while (0)

/*
 * __put_user() - statement-expression wrapper around __put_user_error().
 *
 * Evaluates to 0 on success, or to -EFAULT when __put_user_error() reports a
 * failure.  put_user is defined as an alias of __put_user.
 */
#define __put_user(x, ptr)                                                \
({                                                                        \
        int __pu_err = 0;                                                \
        __put_user_error((x), (ptr), __pu_err);                                \
        __pu_err;                                                        \
})

#define put_user        __put_user

/*
 * We must not call into the scheduler between __mte_enable_tco_async() and
 * __mte_disable_tco_async(). As `dst` and `src` may contain blocking
 * functions, we must evaluate these outside of the critical section.
 *
 * @dst and @src are therefore copied into locals first.  The store uses "str"
 * with the K extable type, writing *(type *)src to (type *)dst.  On a fault,
 * __mte_disable_tco_async() is called before jumping to @err_label.
 */
#define __put_kernel_nofault(dst, src, type, err_label)                        \
do {                                                                        \
        __typeof__(dst) __pkn_dst = (dst);                                \
        __typeof__(src) __pkn_src = (src);                                \
                                                                        \
        do {                                                                \
                __label__ __pkn_err;                                        \
                __mte_enable_tco_async();                                \
                __raw_put_mem("str", *((type *)(__pkn_src)),                \
                              (__force type *)(__pkn_dst), __pkn_err, K);        \
                __mte_disable_tco_async();                                \
                break;                                                        \
        __pkn_err:                                                        \
                __mte_disable_tco_async();                                \
                goto err_label;                                                \
        } while (0);                                                        \
} while(0)

/*
 * raw_copy_from_user() - copy from user space with no access_ok() check here.
 *
 * Wraps __arch_copy_from_user() in a uaccess_ttbr0_enable()/disable() pair
 * and masks the source pointer with __uaccess_mask_ptr().  Evaluates to the
 * value returned by __arch_copy_from_user().
 * NOTE(review): that value is presumably the number of bytes not copied -
 * confirm against the assembly implementation.
 */
extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n);
#define raw_copy_from_user(to, from, n)                                        \
({                                                                        \
        unsigned long __acfu_ret;                                        \
        uaccess_ttbr0_enable();                                                \
        __acfu_ret = __arch_copy_from_user((to),                        \
                                      __uaccess_mask_ptr(from), (n));        \
        uaccess_ttbr0_disable();                                        \
        __acfu_ret;                                                        \
})

/*
 * raw_copy_to_user() - copy to user space with no access_ok() check here.
 *
 * Wraps __arch_copy_to_user() in a uaccess_ttbr0_enable()/disable() pair and
 * masks the destination pointer with __uaccess_mask_ptr().  Evaluates to the
 * value returned by __arch_copy_to_user().
 * NOTE(review): that value is presumably the number of bytes not copied -
 * confirm against the assembly implementation.
 */
extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n);
#define raw_copy_to_user(to, from, n)                                        \
({                                                                        \
        unsigned long __actu_ret;                                        \
        uaccess_ttbr0_enable();                                                \
        __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to),        \
                                    (from), (n));                        \
        uaccess_ttbr0_disable();                                        \
        __actu_ret;                                                        \
})

/*
 * user_access_begin() - validate a user range and open the access window.
 * @ptr: start of the user range
 * @len: length of the range in bytes
 *
 * Returns true after calling uaccess_ttbr0_enable() when access_ok() accepts
 * the range; returns false without opening the window otherwise.
 */
static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len)
{
        if (likely(access_ok(ptr, len))) {
                uaccess_ttbr0_enable();
                return true;
        }

        return false;
}
#define user_access_begin(a,b)        user_access_begin(a,b)
/* Close the window opened by user_access_begin(). */
#define user_access_end()        uaccess_ttbr0_disable()
/*
 * Unchecked single-value store/load.  These expand straight to
 * __raw_put_mem()/__raw_get_mem() with the pointer masked by
 * uaccess_mask_ptr(); they do not call access_ok() and do not toggle the
 * TTBR0 window themselves.  A fault jumps to @label.
 */
#define unsafe_put_user(x, ptr, label) \
        __raw_put_mem("sttr", x, uaccess_mask_ptr(ptr), label, U)
#define unsafe_get_user(x, ptr, label) \
        __raw_get_mem("ldtr", x, uaccess_mask_ptr(ptr), label, U)

/*
 * KCSAN uses these to save and restore ttbr state.
 * We do not support KCSAN with ARM64_SW_TTBR0_PAN, so
 * they are no-ops.
 */
static inline unsigned long user_access_save(void)
{
        /* There is no state to capture here; always report 0. */
        return 0;
}

static inline void user_access_restore(unsigned long enabled)
{
        /* Intentionally empty: @enabled is ignored. */
}

/*
 * We want the unsafe accessors to always be inlined and use
 * the error labels - thus the macro games.
 *
 * unsafe_copy_loop() stores @type-sized units from @src to @dst with
 * unsafe_put_user() for as long as at least sizeof(type) bytes remain.
 * @dst, @src and @len are modified in place, so they must be lvalues; a
 * faulting store jumps to @label.
 */
#define unsafe_copy_loop(dst, src, len, type, label)                                \
        while (len >= sizeof(type)) {                                                \
                unsafe_put_user(*(type *)(src),(type __user *)(dst),label);        \
                dst += sizeof(type);                                                \
                src += sizeof(type);                                                \
                len -= sizeof(type);                                                \
        }

/*
 * unsafe_copy_to_user() - copy @_len bytes to user space without checks.
 *
 * The arguments are copied into locals, then the data is written in
 * progressively smaller units: as many u64 stores as fit, then u32, u16 and
 * finally u8 for the remainder.  A faulting store jumps to @label.
 */
#define unsafe_copy_to_user(_dst,_src,_len,label)                        \
do {                                                                        \
        char __user *__ucu_dst = (_dst);                                \
        const char *__ucu_src = (_src);                                        \
        size_t __ucu_len = (_len);                                        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label);        \
        unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label);        \
} while (0)

/*
 * Empty feature-selection macros; they are not referenced in this header.
 * NOTE(review): presumably consumed by the generic uaccess code to choose
 * inline copy_{to,from}_user() variants - confirm in linux/uaccess.h.
 */
#define INLINE_COPY_TO_USER
#define INLINE_COPY_FROM_USER

extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n);

/*
 * __clear_user() - range-checked wrapper around __arch_clear_user().
 * @to: start of the user range
 * @n:  length of the range in bytes
 *
 * Returns @n unchanged when access_ok() rejects the range; otherwise returns
 * whatever __arch_clear_user() returns for the masked pointer.
 * NOTE(review): that value is presumably the number of bytes left
 * uncleared - confirm against the assembly implementation.
 */
static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n)
{
        unsigned long remaining;

        if (!access_ok(to, n))
                return n;

        uaccess_ttbr0_enable();
        remaining = __arch_clear_user(__uaccess_mask_ptr(to), n);
        uaccess_ttbr0_disable();

        return remaining;
}
#define clear_user        __clear_user

/*
 * String helpers for user memory; only the prototypes are visible here.
 * NOTE(review): exact return conventions (length vs. negative errno) are
 * defined by the implementations - confirm there before relying on them.
 */
extern long strncpy_from_user(char *dest, const char __user *src, long count);

extern __must_check long strnlen_user(const char __user *str, long n);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
extern unsigned long __must_check __copy_user_flushcache(void *to, const void __user *from, unsigned long n);

/*
 * __copy_from_user_flushcache() - copy from user via __copy_user_flushcache().
 * @dst:  kernel destination buffer, reported to KASAN as written
 * @src:  user source pointer, masked with __uaccess_mask_ptr()
 * @size: number of bytes to copy
 *
 * Returns the result of __copy_user_flushcache(), converted to int.
 */
static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
        unsigned long left;

        kasan_check_write(dst, size);
        left = __copy_user_flushcache(dst, __uaccess_mask_ptr(src), size);

        return left;
}
#endif

#ifdef CONFIG_ARCH_HAS_SUBPAGE_FAULTS

/*
 * probe_subpage_writeable() - probe a user range via mte_probe_user_range().
 * @uaddr: start of the user range
 * @size:  length of the range in bytes
 *
 * Return 0 on success, the number of bytes not probed otherwise.  When
 * system_supports_mte() is false no probing is done and 0 is returned.
 */
static inline size_t probe_subpage_writeable(const char __user *uaddr,
                                             size_t size)
{
        if (system_supports_mte())
                return mte_probe_user_range(uaddr, size);

        return 0;
}

#endif /* CONFIG_ARCH_HAS_SUBPAGE_FAULTS */

#ifdef CONFIG_ARM64_GCS

/*
 * gcssttr() - issue a GCSSTTR store of @val to the user address @addr.
 *
 * The instruction is emitted as a raw .inst word whose encoding fixes the
 * registers (x1 = value, x0 = address), so both operands are pinned to those
 * registers with explicit register variables.
 *
 * Returns 0 if the store did not fault.  On a fault the value written into
 * err by the _ASM_EXTABLE_UACCESS_ERR fixup is returned.
 * NOTE(review): that value is presumably -EFAULT - confirm against
 * asm-extable.h.
 */
static inline int gcssttr(unsigned long __user *addr, unsigned long val)
{
        register unsigned long __user *_addr __asm__ ("x0") = addr;
        register unsigned long _val __asm__ ("x1") = val;
        int err = 0;

        /* GCSSTTR x1, x0 */
        asm volatile(
                "1: .inst 0xd91f1c01\n"
                "2: \n"
                _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w0)
                : "+r" (err)
                : "rZ" (_val), "r" (_addr)
                : "memory");

        return err;
}

/*
 * put_user_gcs() - range-checked GCS store of @val to user address @addr.
 * @val:  value to store
 * @addr: user destination
 * @err:  set to -EFAULT if access_ok() rejects the address, or to the
 *        non-zero result of gcssttr(); left untouched on success
 */
static inline void put_user_gcs(unsigned long val, unsigned long __user *addr,
                                int *err)
{
        int ret;

        if (access_ok((char __user *)addr, sizeof(u64))) {
                uaccess_ttbr0_enable();
                ret = gcssttr(addr, val);
                if (ret)
                        *err = ret;
                uaccess_ttbr0_disable();
        } else {
                *err = -EFAULT;
        }
}


#endif /* CONFIG_ARM64_GCS */

#endif /* __ASM_UACCESS_H */









































































































   26 































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/xattr.h

  Extended attributes handling.

  Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org>
  Copyright (c) 2001-2002 Silicon Graphics, Inc.  All Rights Reserved.
  Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
*/
#ifndef _LINUX_XATTR_H
#define _LINUX_XATTR_H


#include <linux/slab.h>
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/user_namespace.h>
#include <uapi/linux/xattr.h>

/* List of all xattr_args "versions". */
#define XATTR_ARGS_SIZE_VER0        16 /* sizeof first published struct */
#define XATTR_ARGS_SIZE_LATEST        XATTR_ARGS_SIZE_VER0

struct inode;
struct dentry;

/*
 * Return true when @name is exactly XATTR_NAME_POSIX_ACL_ACCESS or
 * XATTR_NAME_POSIX_ACL_DEFAULT, false otherwise.  @name must not be NULL.
 */
static inline bool is_posix_acl_xattr(const char *name)
{
        if (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS))
                return true;

        return !strcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT);
}

/*
 * struct xattr_handler: When @name is set, match attributes with exactly that
 * name.  When @prefix is set instead, match attributes with that prefix and
 * with a non-empty suffix.
 *
 * @list is optional: xattr_handler_can_list() treats a NULL @list as
 * "listable" and otherwise returns what @list reports for the dentry.
 * NOTE(review): the return conventions of @get and @set (byte count vs.
 * negative errno) are not visible in this header - confirm in the callers.
 */
struct xattr_handler {
        const char *name;
        const char *prefix;
        int flags;      /* fs private flags */
        /* optional listing filter; see xattr_handler_can_list() */
        bool (*list)(struct dentry *dentry);
        /* read an attribute value into @buffer (at most @size bytes) */
        int (*get)(const struct xattr_handler *, struct dentry *dentry,
                   struct inode *inode, const char *name, void *buffer,
                   size_t size);
        /* store an attribute value taken from @buffer (@size bytes) */
        int (*set)(const struct xattr_handler *,
                   struct mnt_idmap *idmap, struct dentry *dentry,
                   struct inode *inode, const char *name, const void *buffer,
                   size_t size, int flags);
};

/**
 * xattr_handler_can_list - check whether xattr can be listed
 * @handler: handler for this type of xattr, may be NULL
 * @dentry: dentry whose inode xattr to list
 *
 * A NULL @handler can never be listed.  A handler without a ->list callback
 * can always be listed.  Otherwise the ->list callback decides for @dentry.
 *
 * Return: true if xattr can be listed, false if not.
 */
static inline bool xattr_handler_can_list(const struct xattr_handler *handler,
                                          struct dentry *dentry)
{
        if (!handler)
                return false;

        if (!handler->list)
                return true;

        return handler->list(dentry);
}

/*
 * Declared here, defined elsewhere.
 * NOTE(review): presumably maps a handler plus the name passed to its
 * callbacks back to the full attribute name - confirm at the definition.
 */
const char *xattr_full_name(const struct xattr_handler *, const char *);

/* A single extended attribute: a name plus a value buffer and its length. */
struct xattr {
        const char *name;       /* attribute name */
        void *value;            /* attribute value buffer */
        size_t value_len;       /* length of @value in bytes */
};

/*
 * Prototypes for the xattr get/set/remove/list entry points.  Only the
 * declarations are visible in this header.
 * NOTE(review): by naming convention the "__"-prefixed and "_locked"/
 * "_noperm" variants look like lower-level forms of the plain vfs_*()
 * calls - confirm the locking and permission rules at the definitions.
 */
ssize_t __vfs_getxattr(struct dentry *, struct inode *, const char *, void *, size_t);
ssize_t vfs_getxattr(struct mnt_idmap *, struct dentry *, const char *,
                     void *, size_t);
ssize_t vfs_listxattr(struct dentry *d, char *list, size_t size);
int __vfs_setxattr(struct mnt_idmap *, struct dentry *, struct inode *,
                   const char *, const void *, size_t, int);
int __vfs_setxattr_noperm(struct mnt_idmap *, struct dentry *,
                          const char *, const void *, size_t, int);
int __vfs_setxattr_locked(struct mnt_idmap *, struct dentry *,
                          const char *, const void *, size_t, int,
                          struct inode **);
int vfs_setxattr(struct mnt_idmap *, struct dentry *, const char *,
                 const void *, size_t, int);
int __vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);
int __vfs_removexattr_locked(struct mnt_idmap *, struct dentry *,
                             const char *, struct inode **);
int vfs_removexattr(struct mnt_idmap *, struct dentry *, const char *);

ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size);
int vfs_getxattr_alloc(struct mnt_idmap *idmap,
                       struct dentry *dentry, const char *name,
                       char **xattr_value, size_t size, gfp_t flags);

int xattr_supports_user_prefix(struct inode *inode);

/*
 * Return the handler's match string: ->prefix when it is set, otherwise
 * ->name.  @handler must not be NULL.
 */
static inline const char *xattr_prefix(const struct xattr_handler *handler)
{
        if (handler->prefix)
                return handler->prefix;

        return handler->name;
}

/*
 * Container for a set of simple_xattr entries: a red-black tree root plus a
 * reader/writer lock.
 * NOTE(review): @lock presumably protects @rb_root - confirm in the
 * simple_xattr_*() implementations.
 */
struct simple_xattrs {
        struct rb_root rb_root;
        rwlock_t lock;
};

/*
 * One entry of a simple_xattrs tree.  The value is stored inline in the
 * trailing flexible array member.
 * NOTE(review): @size is presumably the length of @value in bytes - confirm
 * in simple_xattr_alloc().
 */
struct simple_xattr {
        struct rb_node rb_node; /* tree linkage */
        char *name;             /* attribute name */
        size_t size;
        char value[];           /* inline value bytes */
};

/*
 * Prototypes for the simple_xattr helpers; only the declarations are visible
 * in this header.
 * NOTE(review): ownership of the simple_xattr returned by
 * simple_xattr_set() and the meaning of @freed_space are defined by the
 * implementations - confirm there.
 */
void simple_xattrs_init(struct simple_xattrs *xattrs);
void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space);
size_t simple_xattr_space(const char *name, size_t size);
struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
void simple_xattr_free(struct simple_xattr *xattr);
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
                     void *buffer, size_t size);
struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs,
                                      const char *name, const void *value,
                                      size_t size, int flags);
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
                          char *buffer, size_t size);
void simple_xattr_add(struct simple_xattrs *xattrs,
                      struct simple_xattr *new_xattr);
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name);

#endif        /* _LINUX_XATTR_H */








































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#ifndef RXE_VERBS_H
#define RXE_VERBS_H

#include <linux/interrupt.h>
#include <linux/workqueue.h>
#include "rxe_pool.h"
#include "rxe_task.h"
#include "rxe_hw_counters.h"

/* Compare two partition keys.
 *
 * Returns 1 when the low 15 bits of both keys are non-zero and equal and
 * at least one of the keys has bit 15 (0x8000) set; returns 0 otherwise.
 */
static inline int pkey_match(u16 key1, u16 key2)
{
        u16 base1 = key1 & 0x7fff;
        u16 base2 = key2 & 0x7fff;

        /* a zero base never matches, and the bases must be identical */
        if (base1 == 0 || base1 != base2)
                return 0;

        /* at least one side has to carry bit 15 */
        return ((key1 | key2) & 0x8000) ? 1 : 0;
}

/* Compare two packet sequence numbers.
 *
 * The unsigned difference is shifted left by 8 bits, which discards its
 * top 8 bits, and the result is then read back as a signed 32-bit value:
 *
 *   >0 if psn_a > psn_b
 *    0 if psn_a == psn_b
 *   <0 if psn_a < psn_b
 */
static inline int psn_compare(u32 psn_a, u32 psn_b)
{
        s32 shifted = (psn_a - psn_b) << 8;

        return shifted;
}

/* rxe user context object. to_ruc() maps the embedded ibuc member back
 * to this structure; elem is the object's rxe pool element.
 */
struct rxe_ucontext {
        struct ib_ucontext ibuc;
        struct rxe_pool_elem        elem;
};

/* rxe protection domain object. to_rpd() maps the embedded ibpd member
 * back to this structure; elem is the object's rxe pool element.
 */
struct rxe_pd {
        struct ib_pd            ibpd;
        struct rxe_pool_elem        elem;
};

/* rxe address handle object. to_rah() maps the embedded ibah member back
 * to this structure. av holds the rxe address vector for the handle.
 * NOTE(review): is_user presumably marks handles created from user space
 * and ah_num looks like the handle's index - confirm against the users.
 */
struct rxe_ah {
        struct ib_ah                ibah;
        struct rxe_pool_elem        elem;
        struct rxe_av                av;
        bool                        is_user;
        int                        ah_num;
};

/* Storage for one completion entry. The anonymous union overlays a
 * struct ib_wc and a struct ib_uverbs_wc, so only one form is valid at a
 * time. NOTE(review): presumably chosen by whether the CQ is a kernel or
 * a user CQ - confirm.
 */
struct rxe_cqe {
        union {
                struct ib_wc                ibwc;
                struct ib_uverbs_wc        uibwc;
        };
};

/* rxe completion queue object. to_rcq() maps the embedded ibcq member
 * back to this structure. The entries live in *queue.
 * NOTE(review): cq_lock presumably serialises access to the queue, and
 * num_wq presumably counts work queues attached to this CQ - confirm.
 */
struct rxe_cq {
        struct ib_cq                ibcq;
        struct rxe_pool_elem        elem;
        struct rxe_queue        *queue;
        spinlock_t                cq_lock;
        u8                        notify;
        bool                        is_user;
        atomic_t                num_wq;
};

/* States of a work queue entry. The code that moves an entry between
 * these states is not part of this header.
 */
enum wqe_state {
        wqe_state_posted,
        wqe_state_processing,
        wqe_state_pending,
        wqe_state_done,
        wqe_state_error,
};

/* Send queue: the max_* limits, the backing rxe_queue and the spinlock
 * that guards that queue.
 */
struct rxe_sq {
        int                        max_wr;
        int                        max_sge;
        int                        max_inline;
        spinlock_t                sq_lock; /* guard queue */
        struct rxe_queue        *queue;
};

/* Receive queue: the max_* limits and the backing rxe_queue. The producer
 * and consumer sides of the queue are guarded by separate spinlocks.
 */
struct rxe_rq {
        int                        max_wr;
        int                        max_sge;
        spinlock_t                producer_lock; /* guard queue producer */
        spinlock_t                consumer_lock; /* guard queue consumer */
        struct rxe_queue        *queue;
};

/* rxe shared receive queue object. to_rsrq() maps the embedded ibsrq
 * member back to this structure. The receive queue itself is the
 * embedded rq; pd points at the owning protection domain object.
 * NOTE(review): the meaning of limit and error is not visible here.
 */
struct rxe_srq {
        struct ib_srq                ibsrq;
        struct rxe_pool_elem        elem;
        struct rxe_pd                *pd;
        struct rxe_rq                rq;
        u32                        srq_num;

        int                        limit;
        int                        error;
};

/* Per-QP state stored as the req member of struct rxe_qp.
 * NOTE(review): presumably owned by the requester; the wait_* and need_*
 * members look like flags but their exact semantics are defined by the
 * code that uses them, not by this header.
 */
struct rxe_req_info {
        int                        wqe_index;
        u32                        psn;
        int                        opcode;
        atomic_t                rd_atomic;
        int                        wait_fence;
        int                        need_rd_atomic;
        int                        wait_psn;
        int                        need_retry;
        int                        wait_for_rnr_timer;
        int                        noack_pkts;
        int                        again;
};

/* Per-QP state stored as the comp member of struct rxe_qp.
 * NOTE(review): presumably owned by the completer; retry_cnt and
 * rnr_retry look like remaining-retry counters - confirm.
 */
struct rxe_comp_info {
        u32                        psn;
        int                        opcode;
        int                        timeout;
        int                        timeout_retry;
        int                        started_retry;
        u32                        retry_cnt;
        u32                        rnr_retry;
};

/* Responder states. The RESPST_ERR_* values name individual error
 * conditions. The state machine that consumes these values is not part
 * of this header.
 */
enum resp_states {
        RESPST_NONE,
        RESPST_GET_REQ,
        RESPST_CHK_PSN,
        RESPST_CHK_OP_SEQ,
        RESPST_CHK_OP_VALID,
        RESPST_CHK_RESOURCE,
        RESPST_CHK_LENGTH,
        RESPST_CHK_RKEY,
        RESPST_EXECUTE,
        RESPST_READ_REPLY,
        RESPST_ATOMIC_REPLY,
        RESPST_ATOMIC_WRITE_REPLY,
        RESPST_PROCESS_FLUSH,
        RESPST_COMPLETE,
        RESPST_ACKNOWLEDGE,
        RESPST_CLEANUP,
        RESPST_DUPLICATE_REQUEST,
        RESPST_ERR_MALFORMED_WQE,
        RESPST_ERR_UNSUPPORTED_OPCODE,
        RESPST_ERR_MISALIGNED_ATOMIC,
        RESPST_ERR_PSN_OUT_OF_SEQ,
        RESPST_ERR_MISSING_OPCODE_FIRST,
        RESPST_ERR_MISSING_OPCODE_LAST_C,
        RESPST_ERR_MISSING_OPCODE_LAST_D1E,
        RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
        RESPST_ERR_RNR,
        RESPST_ERR_RKEY_VIOLATION,
        RESPST_ERR_INVALIDATE_RKEY,
        RESPST_ERR_LENGTH,
        RESPST_ERR_CQ_OVERFLOW,
        RESPST_ERROR,
        RESPST_DONE,
        RESPST_EXIT,
};

/* State of a responder resource; stored in the state member of
 * struct resp_res.
 */
enum rdatm_res_state {
        rdatm_res_state_next,
        rdatm_res_state_new,
        rdatm_res_state_replay,
};

/* One responder resource. The common header records the PSN range and
 * state; the anonymous union carries the data for exactly one of an
 * atomic, a read or a flush operation.
 * NOTE(review): the active union member is presumably selected by type -
 * confirm against the code that fills these in.
 */
struct resp_res {
        int                        type;
        int                        replay;
        u32                        first_psn;
        u32                        last_psn;
        u32                        cur_psn;
        enum rdatm_res_state        state;

        union {
                struct {
                        u64                orig_val;
                } atomic;
                struct {
                        u64                va_org;
                        u32                rkey;
                        u32                length;
                        u64                va;
                        u32                resid;
                } read;
                struct {
                        u32                length;
                        u64                va;
                        u8                type;
                        u8                level;
                } flush;
        };
};

/* Per-QP state stored as the resp member of struct rxe_qp. The members
 * are grouped below by the kind of request they apply to.
 */
struct rxe_resp_info {
        u32                        msn;
        u32                        psn;
        u32                        ack_psn;
        int                        opcode;
        int                        drop_msg;
        int                        goto_error;
        int                        sent_psn_nak;
        enum ib_wc_status        status;
        u8                        aeth_syndrome;

        /* Receive only */
        struct rxe_recv_wqe        *wqe;

        /* RDMA read / atomic only */
        u64                        va;
        u64                        offset;
        struct rxe_mr                *mr;
        u32                        resid;
        u32                        rkey;
        u32                        length;

        /* SRQ only: a receive WQE followed by room for RXE_MAX_SGE
         * scatter/gather entries
         */
        struct {
                struct rxe_recv_wqe        wqe;
                struct ib_sge                sge[RXE_MAX_SGE];
        } srq_wqe;

        /* Responder resources. It's a circular list where the oldest
         * resource is dropped first.
         */
        struct resp_res                *resources;
        unsigned int                res_head;
        unsigned int                res_tail;
        struct resp_res                *res;
};

/* rxe queue pair object. to_rqp() maps the embedded ibqp member back to
 * this structure. Besides the verbs-level attributes it aggregates the
 * send and receive queues, pointers to the related PD/SRQ/CQ objects,
 * the packet queues and tasks, and the req/comp/resp state blocks.
 */
struct rxe_qp {
        struct ib_qp                ibqp;
        struct rxe_pool_elem        elem;
        struct ib_qp_attr        attr;
        unsigned int                valid;
        unsigned int                mtu;
        bool                        is_user;

        /* related verbs objects */
        struct rxe_pd                *pd;
        struct rxe_srq                *srq;
        struct rxe_cq                *scq;
        struct rxe_cq                *rcq;

        enum ib_sig_type        sq_sig_type;

        struct rxe_sq                sq;
        struct rxe_rq                rq;

        struct socket                *sk;
        u32                        dst_cookie;
        u16                        src_port;

        /* primary and alternate address vectors */
        struct rxe_av                pri_av;
        struct rxe_av                alt_av;

        atomic_t                mcg_num;

        struct sk_buff_head        req_pkts;
        struct sk_buff_head        resp_pkts;

        struct rxe_task                send_task;
        struct rxe_task                recv_task;

        struct rxe_req_info        req;
        struct rxe_comp_info        comp;
        struct rxe_resp_info        resp;

        atomic_t                ssn;
        atomic_t                skb_out;
        int                        need_req_skb;

        /* Timer for retransmitting packet when ACKs have been lost. RC
         * only. The requester sets it when it is not already
         * started. The responder resets it whenever an ack is
         * received.
         */
        struct timer_list retrans_timer;
        u64 qp_timeout_jiffies;

        /* Timer for handling RNR NAKS. */
        struct timer_list rnr_nak_timer;

        spinlock_t                state_lock; /* guard requester and completer */

        struct execute_work        cleanup_work;
};

/* Access-flag masks assembled from IB_ACCESS_* bits:
 *
 * RXE_ACCESS_REMOTE       - remote read, remote write and remote atomic
 * RXE_ACCESS_SUPPORTED_MR - the remote set plus local write, MW bind,
 *                           on-demand, both flush flags and the optional
 *                           bits
 * RXE_ACCESS_SUPPORTED_QP - identical to the MR mask
 * RXE_ACCESS_SUPPORTED_MW - the MR mask plus IB_ZERO_BASED
 */
enum {
        RXE_ACCESS_REMOTE        = IB_ACCESS_REMOTE_READ
                                | IB_ACCESS_REMOTE_WRITE
                                | IB_ACCESS_REMOTE_ATOMIC,
        RXE_ACCESS_SUPPORTED_MR        = RXE_ACCESS_REMOTE
                                | IB_ACCESS_LOCAL_WRITE
                                | IB_ACCESS_MW_BIND
                                | IB_ACCESS_ON_DEMAND
                                | IB_ACCESS_FLUSH_GLOBAL
                                | IB_ACCESS_FLUSH_PERSISTENT
                                | IB_ACCESS_OPTIONAL,
        RXE_ACCESS_SUPPORTED_QP        = RXE_ACCESS_SUPPORTED_MR,
        RXE_ACCESS_SUPPORTED_MW        = RXE_ACCESS_SUPPORTED_MR
                                | IB_ZERO_BASED,
};

/* Memory region states; stored in the state member of struct rxe_mr.
 * enum rxe_mw_state reuses these numeric values.
 */
enum rxe_mr_state {
        RXE_MR_STATE_INVALID,
        RXE_MR_STATE_FREE,
        RXE_MR_STATE_VALID,
};

/* Direction selector for MR copy helpers: data moves either into the MR
 * object (RXE_TO_MR_OBJ) or out of it (RXE_FROM_MR_OBJ). The helpers
 * that take this value are declared elsewhere.
 */
enum rxe_mr_copy_dir {
        RXE_TO_MR_OBJ,
        RXE_FROM_MR_OBJ,
};

/* Kind of MR lookup being performed.
 * NOTE(review): presumably local means lookup by lkey and remote means
 * lookup by rkey - confirm against the lookup helper.
 */
enum rxe_mr_lookup_type {
        RXE_LOOKUP_LOCAL,
        RXE_LOOKUP_REMOTE,
};

/* Mask of the MR re-registration flags rxe accepts: changing the PD and
 * changing the access flags.
 */
enum rxe_rereg {
        RXE_MR_REREG_SUPPORTED        = IB_MR_REREG_PD
                                | IB_MR_REREG_ACCESS,
};

/* Tell whether @rkey belongs to a memory window.
 *
 * The index is the rkey with its low 8 bits shifted out. Returns 1 when
 * that index lies within [RXE_MIN_MW_INDEX, RXE_MAX_MW_INDEX], 0 if not.
 */
static inline int rkey_is_mw(u32 rkey)
{
        u32 idx = rkey >> 8;

        if (idx < RXE_MIN_MW_INDEX)
                return 0;

        return idx <= RXE_MAX_MW_INDEX;
}

/* rxe memory region object. to_rmr() maps the embedded ibmr member back
 * to this structure. Note that, unlike most other rxe objects in this
 * header, the pool element comes first and ibmr second.
 * NOTE(review): page_list is presumably the xarray of pages backing the
 * region, and num_mw the number of memory windows bound to it - confirm.
 */
struct rxe_mr {
        struct rxe_pool_elem        elem;
        struct ib_mr                ibmr;

        struct ib_umem                *umem;

        u32                        lkey;
        u32                        rkey;
        enum rxe_mr_state        state;
        int                        access;
        atomic_t                num_mw;

        unsigned int                page_offset;
        unsigned int                page_shift;
        u64                        page_mask;

        u32                        num_buf;
        u32                        nbuf;

        struct xarray                page_list;
};

/* Return the page size recorded in @mr's ib_mr, or PAGE_SIZE when @mr is
 * NULL.
 */
static inline unsigned int mr_page_size(struct rxe_mr *mr)
{
        if (!mr)
                return PAGE_SIZE;

        return mr->ibmr.page_size;
}

/* Memory window states. Each value is defined as the corresponding
 * rxe_mr_state value, so MW and MR states are numerically identical.
 */
enum rxe_mw_state {
        RXE_MW_STATE_INVALID        = RXE_MR_STATE_INVALID,
        RXE_MW_STATE_FREE        = RXE_MR_STATE_FREE,
        RXE_MW_STATE_VALID        = RXE_MR_STATE_VALID,
};

/* rxe memory window object. to_rmw() maps the embedded ibmw member back
 * to this structure. mr points at an rxe_mr; qp is only used for type 2
 * windows.
 * NOTE(review): lock presumably protects state and the binding members
 * (mr, rkey, access, addr, length) - confirm.
 */
struct rxe_mw {
        struct ib_mw                ibmw;
        struct rxe_pool_elem        elem;
        spinlock_t                lock;
        enum rxe_mw_state        state;
        struct rxe_qp                *qp; /* Type 2 only */
        struct rxe_mr                *mr;
        u32                        rkey;
        int                        access;
        u64                        addr;
        u64                        length;
};

/* Multicast group. It is linked into a red-black tree through node and
 * reference counted through ref_cnt; mgid is the group's GID and rxe the
 * owning device.
 * NOTE(review): qp_list presumably heads the list of rxe_mca entries for
 * attached QPs, with qp_num counting them - confirm.
 */
struct rxe_mcg {
        struct rb_node                node;
        struct kref                ref_cnt;
        struct rxe_dev                *rxe;
        struct list_head        qp_list;
        union ib_gid                mgid;
        atomic_t                qp_num;
        u32                        qkey;
        u16                        pkey;
};

/* List entry that refers to one QP.
 * NOTE(review): presumably one entry per QP attached to a multicast
 * group, linked on that group's qp_list - confirm.
 */
struct rxe_mca {
        struct list_head        qp_list;
        struct rxe_qp                *qp;
};

/* Per-port state: the verbs port attributes, the port GUID and subnet
 * prefix (both big-endian), an MTU cap and the index of the GSI QP.
 */
struct rxe_port {
        struct ib_port_attr        attr;
        __be64                        port_guid;
        __be64                        subnet_prefix;
        spinlock_t                port_lock; /* guard port */
        unsigned int                mtu_cap;
        /* special QPs */
        u32                        qp_gsi_index;
};

/* Port number used with the embedded struct rxe_port below; it is the
 * port passed to ib_device_get_netdev() by rxe_ib_device_get_netdev().
 */
#define        RXE_PORT        1
/* rxe device object. to_rdev() maps the embedded ib_dev member back to
 * this structure. It owns one object pool per verbs object type, the
 * multicast group tree, the pending-mmap bookkeeping, the statistics
 * counters and a single port.
 */
struct rxe_dev {
        struct ib_device        ib_dev;
        struct ib_device_attr        attr;
        int                        max_ucontext;
        int                        max_inline_data;
        struct mutex                usdev_lock;

        char                        raw_gid[ETH_ALEN];

        /* one pool per object type */
        struct rxe_pool                uc_pool;
        struct rxe_pool                pd_pool;
        struct rxe_pool                ah_pool;
        struct rxe_pool                srq_pool;
        struct rxe_pool                qp_pool;
        struct rxe_pool                cq_pool;
        struct rxe_pool                mr_pool;
        struct rxe_pool                mw_pool;

        /* multicast support */
        spinlock_t                mcg_lock;
        struct rb_root                mcg_tree;
        atomic_t                mcg_num;
        atomic_t                mcg_attach;

        spinlock_t                pending_lock; /* guard pending_mmaps */
        struct list_head        pending_mmaps;

        spinlock_t                mmap_offset_lock; /* guard mmap_offset */
        u64                        mmap_offset;

        /* indexed by enum rxe_counters, see rxe_counter_inc() */
        atomic64_t                stats_counters[RXE_NUM_OF_COUNTERS];

        struct rxe_port                port;
};

/* Look up the net_device associated with port RXE_PORT of @dev by
 * calling ib_device_get_netdev() and return whatever it returns.
 */
static inline struct net_device *rxe_ib_device_get_netdev(struct ib_device *dev)
{
        struct net_device *ndev = ib_device_get_netdev(dev, RXE_PORT);

        return ndev;
}

static inline void rxe_counter_inc(struct rxe_dev *rxe, enum rxe_counters index)
{
        atomic64_inc(&rxe->stats_counters[index]);
}

/* Convert an ib_device pointer to the rxe_dev that embeds it; a NULL
 * input yields NULL.
 */
static inline struct rxe_dev *to_rdev(struct ib_device *dev)
{
        if (!dev)
                return NULL;

        return container_of(dev, struct rxe_dev, ib_dev);
}

/* Convert an ib_ucontext pointer to the rxe_ucontext that embeds it; a
 * NULL input yields NULL.
 */
static inline struct rxe_ucontext *to_ruc(struct ib_ucontext *uc)
{
        if (!uc)
                return NULL;

        return container_of(uc, struct rxe_ucontext, ibuc);
}

/* Convert an ib_pd pointer to the rxe_pd that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_pd *to_rpd(struct ib_pd *pd)
{
        if (!pd)
                return NULL;

        return container_of(pd, struct rxe_pd, ibpd);
}

/* Convert an ib_ah pointer to the rxe_ah that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_ah *to_rah(struct ib_ah *ah)
{
        if (!ah)
                return NULL;

        return container_of(ah, struct rxe_ah, ibah);
}

/* Convert an ib_srq pointer to the rxe_srq that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_srq *to_rsrq(struct ib_srq *srq)
{
        if (!srq)
                return NULL;

        return container_of(srq, struct rxe_srq, ibsrq);
}

/* Convert an ib_qp pointer to the rxe_qp that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_qp *to_rqp(struct ib_qp *qp)
{
        if (!qp)
                return NULL;

        return container_of(qp, struct rxe_qp, ibqp);
}

/* Convert an ib_cq pointer to the rxe_cq that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_cq *to_rcq(struct ib_cq *cq)
{
        if (!cq)
                return NULL;

        return container_of(cq, struct rxe_cq, ibcq);
}

/* Convert an ib_mr pointer to the rxe_mr that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_mr *to_rmr(struct ib_mr *mr)
{
        if (!mr)
                return NULL;

        return container_of(mr, struct rxe_mr, ibmr);
}

/* Convert an ib_mw pointer to the rxe_mw that embeds it; a NULL input
 * yields NULL.
 */
static inline struct rxe_mw *to_rmw(struct ib_mw *mw)
{
        if (!mw)
                return NULL;

        return container_of(mw, struct rxe_mw, ibmw);
}

/* Return the rxe_pd behind the pd pointer stored in @ah's ib_ah. @ah
 * itself must not be NULL.
 */
static inline struct rxe_pd *rxe_ah_pd(struct rxe_ah *ah)
{
        struct ib_pd *ibpd = ah->ibah.pd;

        return to_rpd(ibpd);
}

/* Return the rxe_pd behind the pd pointer stored in @mr's ib_mr. @mr
 * itself must not be NULL.
 */
static inline struct rxe_pd *mr_pd(struct rxe_mr *mr)
{
        struct ib_pd *ibpd = mr->ibmr.pd;

        return to_rpd(ibpd);
}

/* Return the rxe_pd behind the pd pointer stored in @mw's ib_mw. @mw
 * itself must not be NULL.
 */
static inline struct rxe_pd *rxe_mw_pd(struct rxe_mw *mw)
{
        struct ib_pd *ibpd = mw->ibmw.pd;

        return to_rpd(ibpd);
}

/* Defined outside this header.
 * NOTE(review): presumably registers @rxe as an IB device named
 * @ibdev_name on top of @ndev and returns 0 or a negative errno -
 * confirm against the definition.
 */
int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name,
                                                struct net_device *ndev);

#endif /* RXE_VERBS_H */





































































































































    2 

















    1 



    1 








































    1 






    1 















    1 



    1 


















































    1 

    2 

    2 


    3 

    2 












    7 
    1 

    6 








    3 
    1 

    2 















    1 



    1 










    1 
















    3 



    3 



    2 






    2 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO-KVM bridge pseudo device
 *
 * Copyright (C) 2013 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kvm_host.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include "vfio.h"

#ifdef CONFIG_SPAPR_TCE_IOMMU
#include <asm/kvm_ppc.h>
#endif

/*
 * One tracked vfio file. Entries are linked on kvm_vfio.file_list through
 * node, and each entry holds its own reference on file (taken with
 * get_file() on add, dropped with fput() on removal).
 */
struct kvm_vfio_file {
        struct list_head node;
        struct file *file;
#ifdef CONFIG_SPAPR_TCE_IOMMU
        /* looked up lazily by the SPAPR TCE attribute; NULL until then */
        struct iommu_group *iommu_group;
#endif
};

/*
 * Private data of the kvm-vfio device: the list of tracked files, the
 * mutex taken around list updates, and the last noncoherent-DMA state
 * that was reported to the architecture code.
 */
struct kvm_vfio {
        struct list_head file_list;
        struct mutex lock;
        bool noncoherent;
};

/*
 * Forward @kvm to vfio_file_set_kvm() for @file. The vfio symbol is
 * resolved at run time with symbol_get(); if it is not available the
 * call is silently skipped.
 */
static void kvm_vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
        void (*set_kvm)(struct file *file, struct kvm *kvm);

        set_kvm = symbol_get(vfio_file_set_kvm);
        if (set_kvm) {
                set_kvm(file, kvm);
                symbol_put(vfio_file_set_kvm);
        }
}

static bool kvm_vfio_file_enforced_coherent(struct file *file)
{
        bool (*fn)(struct file *file);
        bool ret;

        fn = symbol_get(vfio_file_enforced_coherent);
        if (!fn)
                return false;

        ret = fn(file);

        symbol_put(vfio_file_enforced_coherent);

        return ret;
}

static bool kvm_vfio_file_is_valid(struct file *file)
{
        bool (*fn)(struct file *file);
        bool ret;

        fn = symbol_get(vfio_file_is_valid);
        if (!fn)
                return false;

        ret = fn(file);

        symbol_put(vfio_file_is_valid);

        return ret;
}

#ifdef CONFIG_SPAPR_TCE_IOMMU
/*
 * Return what vfio_file_iommu_group() reports for @file. The vfio symbol
 * is resolved at run time; NULL is returned when it cannot be resolved.
 */
static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file)
{
        struct iommu_group *(*lookup)(struct file *file);
        struct iommu_group *grp = NULL;

        lookup = symbol_get(vfio_file_iommu_group);
        if (lookup) {
                grp = lookup(file);
                symbol_put(vfio_file_iommu_group);
        }

        return grp;
}

/*
 * Detach @kvf's iommu group from @kvm's SPAPR TCE handling, drop the
 * group reference and clear the cached pointer. Warns once and does
 * nothing if no group is cached.
 */
static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm,
                                             struct kvm_vfio_file *kvf)
{
        struct iommu_group *grp;

        if (WARN_ON_ONCE(!kvf->iommu_group))
                return;

        grp = kvf->iommu_group;
        kvm_spapr_tce_release_iommu_group(kvm, grp);
        iommu_group_put(grp);
        kvf->iommu_group = NULL;
}
#endif

/*
 * Recompute whether any tracked file is noncoherent and tell the
 * architecture code when that answer changes.
 *
 * Files may share IOMMU domains, so adding or removing one can alter the
 * coherency of files already on the list. Rather than track that, every
 * file is retested and the scan stops at the first noncoherent one. The
 * result is cached in kv->noncoherent, so the register/unregister hooks
 * are only called when the cached value actually changes.
 */
static void kvm_vfio_update_coherency(struct kvm_device *dev)
{
        struct kvm_vfio *kv = dev->private;
        struct kvm_vfio_file *kvf;
        bool noncoherent = false;

        list_for_each_entry(kvf, &kv->file_list, node) {
                noncoherent = !kvm_vfio_file_enforced_coherent(kvf->file);
                if (noncoherent)
                        break;
        }

        /* nothing to report if the state did not change */
        if (noncoherent == kv->noncoherent)
                return;

        kv->noncoherent = noncoherent;
        if (noncoherent)
                kvm_arch_register_noncoherent_dma(dev->kvm);
        else
                kvm_arch_unregister_noncoherent_dma(dev->kvm);
}

static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd)
{
        struct kvm_vfio *kv = dev->private;
        struct kvm_vfio_file *kvf;
        struct file *filp;
        int ret = 0;

        filp = fget(fd);
        if (!filp)
                return -EBADF;

        /* Ensure the FD is a vfio FD. */
        if (!kvm_vfio_file_is_valid(filp)) {
                ret = -EINVAL;
                goto out_fput;
        }

        mutex_lock(&kv->lock);

        list_for_each_entry(kvf, &kv->file_list, node) {
                if (kvf->file == filp) {
                        ret = -EEXIST;
                        goto out_unlock;
                }
        }

        kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT);
        if (!kvf) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        kvf->file = get_file(filp);
        list_add_tail(&kvf->node, &kv->file_list);

        kvm_arch_start_assignment(dev->kvm);
        kvm_vfio_file_set_kvm(kvf->file, dev->kvm);
        kvm_vfio_update_coherency(dev);

out_unlock:
        mutex_unlock(&kv->lock);
out_fput:
        fput(filp);
        return ret;
}

/*
 * Stop tracking the vfio file behind @fd.
 *
 * Returns 0 when the file was found and removed, -EBADF if @fd is not an
 * open file and -ENOENT if the file is not on the list. Coherency is
 * re-evaluated in both the found and the not-found case.
 */
static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd)
{
        struct kvm_vfio *kv = dev->private;
        struct kvm_vfio_file *kvf;
        int ret = -ENOENT;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return -EBADF;

        mutex_lock(&kv->lock);

        list_for_each_entry(kvf, &kv->file_list, node) {
                if (kvf->file == fd_file(f)) {
                        list_del(&kvf->node);
                        kvm_arch_end_assignment(dev->kvm);
#ifdef CONFIG_SPAPR_TCE_IOMMU
                        kvm_spapr_tce_release_vfio_group(dev->kvm, kvf);
#endif
                        kvm_vfio_file_set_kvm(kvf->file, NULL);
                        /* drop the reference the list entry was holding */
                        fput(kvf->file);
                        kfree(kvf);
                        ret = 0;
                        /* kvf is gone, so the walk must stop here */
                        break;
                }
        }

        kvm_vfio_update_coherency(dev);

        mutex_unlock(&kv->lock);
        return ret;
}

#ifdef CONFIG_SPAPR_TCE_IOMMU
/*
 * Attach the iommu group of a tracked vfio file to a SPAPR TCE table.
 *
 * @arg points at a user-space struct kvm_vfio_spapr_tce naming the group
 * fd and the table fd. Returns -EFAULT if it cannot be copied in, -EBADF
 * if the group fd is not an open file, -ENOENT if that file is not
 * tracked, -EIO if no iommu group can be obtained for it, and otherwise
 * the result of kvm_spapr_tce_attach_iommu_group().
 */
static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev,
                                       void __user *arg)
{
        struct kvm_vfio *kv = dev->private;
        struct kvm_vfio_spapr_tce param;
        struct kvm_vfio_file *kvf;
        int ret = -ENOENT;

        if (copy_from_user(&param, arg, sizeof(param)))
                return -EFAULT;

        CLASS(fd, f)(param.groupfd);
        if (fd_empty(f))
                return -EBADF;

        mutex_lock(&kv->lock);

        list_for_each_entry(kvf, &kv->file_list, node) {
                if (kvf->file != fd_file(f))
                        continue;

                /* look the group up on first use and cache it */
                if (!kvf->iommu_group) {
                        kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file);
                        if (WARN_ON_ONCE(!kvf->iommu_group)) {
                                ret = -EIO;
                                break;
                        }
                }

                ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd,
                                                       kvf->iommu_group);
                break;
        }

        mutex_unlock(&kv->lock);
        return ret;
}
#endif

/*
 * Dispatch one KVM_DEV_VFIO_FILE attribute.
 *
 * For ADD and DEL, @arg points at a user-space int32_t file descriptor,
 * and -EFAULT is returned if it cannot be read. The SPAPR TCE attribute
 * (when configured) receives @arg unchanged. Unknown attributes yield
 * -ENXIO.
 */
static int kvm_vfio_set_file(struct kvm_device *dev, long attr,
                             void __user *arg)
{
        int32_t __user *argp = arg;
        int32_t fd;

        if (attr == KVM_DEV_VFIO_FILE_ADD || attr == KVM_DEV_VFIO_FILE_DEL) {
                if (get_user(fd, argp))
                        return -EFAULT;

                if (attr == KVM_DEV_VFIO_FILE_ADD)
                        return kvm_vfio_file_add(dev, fd);

                return kvm_vfio_file_del(dev, fd);
        }

#ifdef CONFIG_SPAPR_TCE_IOMMU
        if (attr == KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE)
                return kvm_vfio_file_set_spapr_tce(dev, arg);
#endif

        return -ENXIO;
}

/*
 * .set_attr handler. Only the KVM_DEV_VFIO_FILE group is supported; any
 * other group yields -ENXIO.
 */
static int kvm_vfio_set_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
{
        if (attr->group != KVM_DEV_VFIO_FILE)
                return -ENXIO;

        return kvm_vfio_set_file(dev, attr->attr,
                                 u64_to_user_ptr(attr->addr));
}

/*
 * .has_attr handler. Returns 0 for each attribute of the
 * KVM_DEV_VFIO_FILE group that kvm_vfio_set_file() handles and -ENXIO
 * for everything else.
 */
static int kvm_vfio_has_attr(struct kvm_device *dev,
                             struct kvm_device_attr *attr)
{
        if (attr->group != KVM_DEV_VFIO_FILE)
                return -ENXIO;

        switch (attr->attr) {
        case KVM_DEV_VFIO_FILE_ADD:
        case KVM_DEV_VFIO_FILE_DEL:
#ifdef CONFIG_SPAPR_TCE_IOMMU
        case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE:
#endif
                return 0;
        default:
                return -ENXIO;
        }
}

/*
 * .release handler. Drops every tracked file, re-evaluates coherency on
 * the now-empty list, then frees the private data and the device itself.
 */
static void kvm_vfio_release(struct kvm_device *dev)
{
        struct kvm_vfio *kv = dev->private;
        struct kvm_vfio_file *kvf, *next;
        struct kvm *kvm = dev->kvm;

        list_for_each_entry_safe(kvf, next, &kv->file_list, node) {
#ifdef CONFIG_SPAPR_TCE_IOMMU
                kvm_spapr_tce_release_vfio_group(kvm, kvf);
#endif
                kvm_vfio_file_set_kvm(kvf->file, NULL);
                /* drop the reference the list entry was holding */
                fput(kvf->file);
                list_del(&kvf->node);
                kfree(kvf);
                kvm_arch_end_assignment(kvm);
        }

        kvm_vfio_update_coherency(dev);

        kfree(kv);
        kfree(dev); /* alloc by kvm_ioctl_create_device, free by .release */
}

/*
 * Forward declaration: kvm_vfio_ops refers to kvm_vfio_create(), and
 * kvm_vfio_create() in turn compares against &kvm_vfio_ops.
 */
static int kvm_vfio_create(struct kvm_device *dev, u32 type);

/* Device operations registered for KVM_DEV_TYPE_VFIO. */
static const struct kvm_device_ops kvm_vfio_ops = {
        .name = "kvm-vfio",
        .create = kvm_vfio_create,
        .release = kvm_vfio_release,
        .set_attr = kvm_vfio_set_attr,
        .has_attr = kvm_vfio_has_attr,
};

/*
 * .create handler. Allocates and initialises the private data for a new
 * kvm-vfio device. The caller must hold dev->kvm->lock.
 *
 * Returns 0 on success, -EBUSY if the VM already has a kvm-vfio device
 * and -ENOMEM on allocation failure. @type is not used.
 */
static int kvm_vfio_create(struct kvm_device *dev, u32 type)
{
        struct kvm_device *other;
        struct kvm_vfio *kv;

        lockdep_assert_held(&dev->kvm->lock);

        /* Only one VFIO "device" per VM */
        list_for_each_entry(other, &dev->kvm->devices, vm_node) {
                if (other->ops == &kvm_vfio_ops)
                        return -EBUSY;
        }

        kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT);
        if (!kv)
                return -ENOMEM;

        mutex_init(&kv->lock);
        INIT_LIST_HEAD(&kv->file_list);

        dev->private = kv;

        return 0;
}

/*
 * Register the kvm-vfio device operations for KVM_DEV_TYPE_VFIO and
 * return the result of kvm_register_device_ops().
 */
int kvm_vfio_ops_init(void)
{
        int ret = kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO);

        return ret;
}

/* Unregister the device operations registered by kvm_vfio_ops_init(). */
void kvm_vfio_ops_exit(void)
{
        kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO);
}


































































































































































































































   26 




  267 





  248 

  267 










  267 





































  266 










   98 




   57 









































  248 
  238 

  246 























   98 




   98 













   98 

   57 



























  247 





  225 


  248 

   49 












   75 
  224 

  248 













  225 




























































































  267 
  267 


  266 



















  247 

  247 



















   54 




























  267 



  266 
    9 

  260 
  260 



  219 

  255 


  200 













  114 




   13 

    7 
  104 















  202 
   87 
   52 


























   74 






























   97 




   77 
   66 









   98 












  247 






  241 




  230 






  248 















  247 

































  247 



  247 


  244 
  172 







  247 
    1 





  230 


   16 





  248 















    9 
   43 




   46 


    4 






   49 













  248 
   47 

  247 




   71 






  247 




   90 





































   26 
















   18 















   24 

   24 









   26 







    9 









   26 
    7 








    9 

   26 
























































  248 











  248 

  247 






















































  247 


   74 
  174 
































  246 





  248 

  248 






  247 




  247 
   74 





  247 

  248 




  248 


  246 





















   26 



   26 
   26 

   26 

















   26 
































































































































































































































































































   26 



   26 



















  248 
















  247 
  248 























   26 










































































  247 












  247 











  247 













  248 










  248 








  248 


















  248 
  247 











  247 





























  247 

  247 


























  247 



  248 





  248 










































































































































































































































































































































   26 












   26 







   26 






   26 







   26 














   33 





   32 



































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009                SUSE Linux Products GmbH
 * Copyright (C) 2009                Tejun Heo <tj@kernel.org>
 *
 * Copyright (C) 2017                Facebook Inc.
 * Copyright (C) 2017                Dennis Zhou <dennis@kernel.org>
 *
 * The percpu allocator handles both static and dynamic areas.  Percpu
 * areas are allocated in chunks which are divided into units.  There is
 * a 1-to-1 mapping for units to possible cpus.  These units are grouped
 * based on NUMA properties of the machine.
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
 * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
 *  -------------------  ......  -------------------  ....  ------------
 *
 * Allocation is done by offsets into a unit's address space.  I.e., an
 * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0,
 * c1:u1, c1:u2, etc.  On NUMA machines, the mapping may be non-linear
 * and even sparse.  Access is handled by configuring percpu base
 * registers according to the cpu to unit mappings and offsetting the
 * base address using pcpu_unit_size.
 *
 * There is special consideration for the first chunk which must handle
 * the static percpu variables in the kernel image as allocation services
 * are not online yet.  In short, the first chunk is structured like so:
 *
 *                  <Static | [Reserved] | Dynamic>
 *
 * The static data is copied from the original section managed by the
 * linker.  The reserved section, if non-zero, primarily manages static
 * percpu variables from kernel modules.  Finally, the dynamic section
 * takes care of normal allocations.
 *
 * The allocator organizes chunks into lists according to free size and
 * memcg-awareness.  To make a percpu allocation memcg-aware the __GFP_ACCOUNT
 * flag should be passed.  All memcg-aware allocations are sharing one set
 * of chunks and all unaccounted allocations and allocations performed
 * by processes belonging to the root memory cgroup are using the second set.
 *
 * The allocator tries to allocate from the fullest chunk first. Each chunk
 * is managed by a bitmap with metadata blocks.  The allocation map is updated
 * on every allocation and free to reflect the current state while the boundary
 * map is only updated on allocation.  Each metadata block contains
 * information to help mitigate the need to iterate over large portions
 * of the bitmap.  The reverse mapping from page to chunk is stored in
 * the page's private field.  Lastly, units are lazily backed and grow
 * in unison.
 *
 * There is a unique conversion that goes on here between bytes and bits.
 * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE.  The chunk
 * tracks the number of pages it is responsible for in nr_pages.  Helper
 * functions are used to convert between bytes, bits, and blocks.
 * All hints are managed in bits unless explicitly stated.
 *
 * To use this allocator, arch code should do the following:
 *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
 *
 * - use pcpu_setup_first_chunk() during percpu area initialization to
 *   setup the first chunk containing the kernel static percpu area
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bitmap.h>
#include <linux/cpumask.h>
#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/memcontrol.h>

#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/io.h>

#define CREATE_TRACE_POINTS
#include <trace/events/percpu.h>

#include "percpu-internal.h"

/*
 * The slots are sorted by the size of the biggest continuous free area.
 * 1-15 bytes share the same slot (see __pcpu_size_to_slot()).
 */
#define PCPU_SLOT_BASE_SHIFT                5
/* chunks in slots below this are subject to being sidelined on failed alloc */
#define PCPU_SLOT_FAIL_THRESHOLD        3

/*
 * Low and high watermarks for the number of empty populated pages that
 * the balance work tries to maintain for atomic allocations.
 */
#define PCPU_EMPTY_POP_PAGES_LOW        2
#define PCPU_EMPTY_POP_PAGES_HIGH        4

#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr)                                        \
        (void __percpu *)((unsigned long)(addr) -                        \
                          (unsigned long)pcpu_base_addr        +                \
                          (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
#define __pcpu_ptr_to_addr(ptr)                                                \
        (void __force *)((unsigned long)(ptr) +                                \
                         (unsigned long)pcpu_base_addr -                \
                         (unsigned long)__per_cpu_start)
#endif
#else        /* CONFIG_SMP */
/* on UP, it's always identity mapped */
#define __addr_to_pcpu_ptr(addr)        (void __percpu *)(addr)
#define __pcpu_ptr_to_addr(ptr)                (void __force *)(ptr)
#endif        /* CONFIG_SMP */

/*
 * Allocator geometry and chunk-list slot indices.  All of these are marked
 * __ro_after_init.
 * NOTE(review): presumably assigned once during first chunk setup, which is
 * not visible in this part of the file - confirm.
 *
 * pcpu_unit_pages is the per-unit page stride used by pcpu_page_idx(), and
 * a free area of exactly pcpu_unit_size bytes maps to pcpu_free_slot in
 * pcpu_size_to_slot().
 */
static int pcpu_unit_pages __ro_after_init;
static int pcpu_unit_size __ro_after_init;
static int pcpu_nr_units __ro_after_init;
static int pcpu_atom_size __ro_after_init;
int pcpu_nr_slots __ro_after_init;
static int pcpu_free_slot __ro_after_init;
int pcpu_sidelined_slot __ro_after_init;
int pcpu_to_depopulate_slot __ro_after_init;
static size_t pcpu_chunk_struct_size __ro_after_init;

/* cpus with the lowest and highest unit addresses */
static unsigned int pcpu_low_unit_cpu __ro_after_init;
static unsigned int pcpu_high_unit_cpu __ro_after_init;

/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __ro_after_init;

static const int *pcpu_unit_map __ro_after_init;                /* cpu -> unit */
const unsigned long *pcpu_unit_offsets __ro_after_init;        /* cpu -> unit offset */

/* group information, used for vm allocation */
static int pcpu_nr_groups __ro_after_init;
static const unsigned long *pcpu_group_offsets __ro_after_init;
static const size_t *pcpu_group_sizes __ro_after_init;

/*
 * The first chunk which always exists.  Note that unlike other
 * chunks, this one can be allocated and mapped in several different
 * ways and thus often doesn't live in the vmalloc area.
 */
struct pcpu_chunk *pcpu_first_chunk __ro_after_init;

/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  When the reserved
 * region doesn't exist, the following variable is NULL.
 */
struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;

DEFINE_SPINLOCK(pcpu_lock);        /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex);        /* chunk create/destroy, [de]pop, map ext */

struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */

/*
 * The number of empty populated pages, protected by pcpu_lock.
 * The reserved chunk doesn't contribute to the count.
 */
int pcpu_nr_empty_pop_pages;

/*
 * The number of populated pages in use by the allocator, protected by
 * pcpu_lock.  This number is kept per a unit per chunk (i.e. when a page gets
 * allocated/deallocated, it is allocated/deallocated in all units of a chunk
 * and increments/decrements this count by 1).
 */
static unsigned long pcpu_nr_populated;

/*
 * Balance work is used to populate or destroy chunks asynchronously.  We
 * try to keep the number of populated free pages between
 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
 * empty chunk.
 */
static void pcpu_balance_workfn(struct work_struct *work);
static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
static bool pcpu_async_enabled __read_mostly;
static bool pcpu_atomic_alloc_failed;

/* Queue pcpu_balance_work, but only while pcpu_async_enabled is set. */
static void pcpu_schedule_balance_work(void)
{
        if (!pcpu_async_enabled)
                return;

        schedule_work(&pcpu_balance_work);
}

/**
 * pcpu_addr_in_chunk - check if the address is served from this chunk
 * @chunk: chunk of interest, may be NULL
 * @addr: percpu address
 *
 * The served range starts at base_addr + start_offset and ends just before
 * base_addr + nr_pages * PAGE_SIZE - end_offset.
 *
 * RETURNS:
 * True if the address is served from this chunk, false otherwise
 * (including when @chunk is NULL).
 */
static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr)
{
        void *first, *limit;

        if (!chunk)
                return false;

        first = chunk->base_addr + chunk->start_offset;
        limit = chunk->base_addr + chunk->nr_pages * PAGE_SIZE -
                chunk->end_offset;

        /* below the served range */
        if (addr < first)
                return false;

        /* the upper bound is exclusive */
        return addr < limit;
}

/*
 * Map a size in bytes to a slot number computed from fls(size) and
 * PCPU_SLOT_BASE_SHIFT.  The result is never smaller than 1.
 */
static int __pcpu_size_to_slot(int size)
{
        int slot = fls(size) - PCPU_SLOT_BASE_SHIFT + 2;

        return max(slot, 1);
}

/*
 * Slot for a free area of @size bytes.  A size equal to pcpu_unit_size maps
 * to pcpu_free_slot; every other size goes through __pcpu_size_to_slot().
 */
static int pcpu_size_to_slot(int size)
{
        return size == pcpu_unit_size ? pcpu_free_slot :
                                        __pcpu_size_to_slot(size);
}

/*
 * Pick the slot for @chunk.  Slot 0 is returned when the chunk-level contig
 * hint is zero or fewer than PCPU_MIN_ALLOC_SIZE bytes are free; otherwise
 * the slot comes from the contig hint converted from bits to bytes.
 */
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
        const struct pcpu_block_md *md = &chunk->chunk_md;

        if (md->contig_hint == 0)
                return 0;

        if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE)
                return 0;

        return pcpu_size_to_slot(md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}

/* record the owning chunk's pointer in the page's private field */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
        unsigned long cookie = (unsigned long)pcpu;

        page->private = cookie;
}

/* read the page's private field back as a chunk pointer */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
        unsigned long cookie = page->private;

        return (struct pcpu_chunk *)cookie;
}

/*
 * Flat page index of @page_idx inside @cpu's unit: the unit number from
 * pcpu_unit_map times pcpu_unit_pages, plus @page_idx.
 */
static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
{
        int unit = pcpu_unit_map[cpu];

        return unit * pcpu_unit_pages + page_idx;
}

/* Byte offset of page @page_idx of @cpu's unit, relative to the chunk base. */
static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx)
{
        unsigned long unit_off = pcpu_unit_offsets[cpu];

        return unit_off + (page_idx << PAGE_SHIFT);
}

/*
 * Address of page @page_idx in @cpu's unit of @chunk: the chunk's base
 * address plus the unit page offset.
 */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
                                     unsigned int cpu, int page_idx)
{
        unsigned long base = (unsigned long)chunk->base_addr;

        return base + pcpu_unit_page_offset(cpu, page_idx);
}

/*
 * The following are helper functions to help access bitmaps and convert
 * between bitmap offsets to address offsets.
 */
/* Pointer to the alloc_map word holding the first bit of block @index. */
static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index)
{
        unsigned long *map = chunk->alloc_map;

        return &map[index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG];
}

/* Index of the block that contains chunk bit offset @off. */
static unsigned long pcpu_off_to_block_index(int off)
{
        unsigned long block_index = off / PCPU_BITMAP_BLOCK_BITS;

        return block_index;
}

/* Offset of chunk bit offset @off within its block (masks the low bits). */
static unsigned long pcpu_off_to_block_off(int off)
{
        unsigned long within_block = off & (PCPU_BITMAP_BLOCK_BITS - 1);

        return within_block;
}

/* Chunk bit offset for offset @off inside block @index. */
static unsigned long pcpu_block_off_to_off(int index, int off)
{
        unsigned long chunk_off = index * PCPU_BITMAP_BLOCK_BITS + off;

        return chunk_off;
}

/**
 * pcpu_check_block_hint - check against the contig hint
 * @block: block of interest
 * @bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Tests whether @bits, plus the padding needed to align the start of the
 * contig hint to @align, fit inside the block's contig hint.  A chunk keeps
 * its hints in the same structure as a block, so this can also check
 * against the chunk's contig hint.
 */
static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
                                  size_t align)
{
        /* bits skipped at the front of the hint to reach an aligned start */
        int pad = ALIGN(block->contig_hint_start, align) -
                  block->contig_hint_start;

        if (pad + bits > block->contig_hint)
                return false;

        return true;
}

/*
 * pcpu_next_hint - determine which hint to use
 * @block: block of interest
 * @alloc_bits: size of allocation
 *
 * Chooses the block offset to start scanning from.  Normally that is
 * first_free, so allocations are served first fit.  When the scan_hint is
 * known to be too small for the request, scanning can resume right after
 * it, since the contig_hint remains available as the fallback.
 */
static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
{
        /* no scan hint recorded: nothing to skip */
        if (!block->scan_hint)
                return block->first_free;

        /*
         * The contig_hint must lie after the scan_hint (possibly not true
         * iff contig_hint == scan_hint), otherwise skipping could step
         * over it.
         */
        if (block->contig_hint_start <= block->scan_hint_start)
                return block->first_free;

        /* the request might still fit in the scan_hint area */
        if (alloc_bits <= block->scan_hint)
                return block->first_free;

        /* resume just past the end of the scan_hint area */
        return block->scan_hint_start + block->scan_hint;
}

/**
 * pcpu_next_md_free_region - finds the next hint free area
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Helper function for pcpu_for_each_md_free_region.  Starting at the block
 * that contains *@bit_off, it walks the chunk's metadata blocks and reports
 * either a block's contig_hint or a free run stitched together from one
 * block's right_free and the left_free of the blocks that follow.  @bit_off
 * and @bits are updated in-place to be consumed in the loop; *@bits stays 0
 * when the starting block is already past the last block of the chunk.
 */
static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off,
                                     int *bits)
{
        int idx = pcpu_off_to_block_index(*bit_off);
        int min_start = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *md = chunk->md_blocks + idx;

        *bits = 0;
        for (; idx < pcpu_chunk_nr_blocks(chunk); md++, idx++) {
                if (*bits) {
                        /* a run carried over from the previous block */
                        *bits += md->left_free;
                        /* done unless left_free spans this whole block */
                        if (md->left_free != PCPU_BITMAP_BLOCK_BITS)
                                return;
                        continue;
                }

                /*
                 * Report this block's contig hint when it exists, starts
                 * at or after the offset we were asked to resume from, and
                 * ends before the block boundary.  A hint that reaches the
                 * boundary is the same area as right_free; it may spill
                 * into the next block and is left to the carry-over path
                 * above.
                 */
                *bits = md->contig_hint;
                if (*bits && md->contig_hint_start >= min_start &&
                    *bits + md->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) {
                        *bit_off = pcpu_block_off_to_off(idx,
                                                md->contig_hint_start);
                        return;
                }

                /* the resume offset only applies to the first block */
                min_start = 0;

                /* start a candidate run from the free tail of this block */
                *bits = md->right_free;
                *bit_off = (idx + 1) * PCPU_BITMAP_BLOCK_BITS - md->right_free;
        }
}

/**
 * pcpu_next_fit_region - finds fit areas for a given allocation request
 * @chunk: chunk of interest
 * @alloc_bits: size of allocation
 * @align: alignment of area (max PAGE_SIZE)
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finds the next free region that is viable for use with a given size and
 * alignment.  This only returns if there is a valid area to be used for this
 * allocation.  block->first_free is returned if the allocation request fits
 * within the block to see if the request can be fulfilled prior to the contig
 * hint.
 */
static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits,
                                 int align, int *bit_off, int *bits)
{
        /* split the incoming chunk offset into (block index, offset in block) */
        int i = pcpu_off_to_block_index(*bit_off);
        int block_off = pcpu_off_to_block_off(*bit_off);
        struct pcpu_block_md *block;

        *bits = 0;
        for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk);
             block++, i++) {
                /*
                 * handles contig area across blocks
                 *
                 * *bits is only non-zero here when the previous iteration
                 * left an aligned right-edge area that was too small on its
                 * own; *bit_off still points at the start of that area.
                 */
                if (*bits) {
                        *bits += block->left_free;
                        if (*bits >= alloc_bits)
                                return;
                        /* block is entirely free - keep accumulating */
                        if (block->left_free == PCPU_BITMAP_BLOCK_BITS)
                                continue;
                }

                /*
                 * check block->contig_hint
                 *
                 * *bits temporarily holds the padding needed to align the
                 * start of the contig hint.
                 */
                *bits = ALIGN(block->contig_hint_start, align) -
                        block->contig_hint_start;
                /*
                 * This uses the block offset to determine if this has been
                 * checked in the prior iteration.
                 */
                if (block->contig_hint &&
                    block->contig_hint_start >= block_off &&
                    block->contig_hint >= *bits + alloc_bits) {
                        /*
                         * NOTE(review): pcpu_next_hint() is defined outside
                         * this view; presumably it returns the offset in the
                         * block from which the scan should begin - confirm.
                         */
                        int start = pcpu_next_hint(block, alloc_bits);

                        /*
                         * Report the span from @start through the end of
                         * the aligned allocation inside the contig hint.
                         */
                        *bits += alloc_bits + block->contig_hint_start -
                                 start;
                        *bit_off = pcpu_block_off_to_off(i, start);
                        return;
                }
                /* reset to satisfy the second predicate above */
                block_off = 0;

                /*
                 * Fall back to the aligned part of the free area at the
                 * right edge of the block.  If it is too small, it is
                 * carried into the next iteration through *bits / *bit_off.
                 */
                *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free,
                                 align);
                *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off;
                *bit_off = pcpu_block_off_to_off(i, *bit_off);
                if (*bits >= alloc_bits)
                        return;
        }

        /* no valid offsets were found - fail condition */
        *bit_off = pcpu_chunk_map_bits(chunk);
}

/*
 * Metadata free area iterators.  These perform aggregation of free areas
 * based on the metadata blocks and return the offset @bit_off and size in
 * bits of the free area @bits.  pcpu_for_each_fit_region only returns when
 * a fit is found for the allocation request.
 *
 * Both macros take @bit_off and @bits as lvalues: they are passed by address
 * to the pcpu_next_*() helper, which updates them in place, and the loop
 * ends once @bit_off reaches pcpu_chunk_map_bits(@chunk).  The caller must
 * initialize @bit_off to the offset to start searching from.
 *
 * pcpu_for_each_md_free_region advances by @bits + 1 between iterations while
 * pcpu_for_each_fit_region advances by @bits only.
 * NOTE(review): the extra bit is presumably the allocated bit that terminates
 * a free region - confirm.
 */
#define pcpu_for_each_md_free_region(chunk, bit_off, bits)                \
        for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits));        \
             (bit_off) < pcpu_chunk_map_bits((chunk));                        \
             (bit_off) += (bits) + 1,                                        \
             pcpu_next_md_free_region((chunk), &(bit_off), &(bits)))

#define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits)     \
        for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits));                                      \
             (bit_off) < pcpu_chunk_map_bits((chunk));                              \
             (bit_off) += (bits),                                              \
             pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \
                                  &(bits)))

/**
 * pcpu_mem_zalloc - allocate memory
 * @size: bytes to allocate
 * @gfp: allocation flags
 *
 * Allocate @size bytes.  If @size is no larger than PAGE_SIZE,
 * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
 * This is to facilitate passing through whitelisted flags.  The
 * returned memory is always zeroed.
 *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
{
        /* nothing can be allocated (and we warn once) before slab is up */
        if (WARN_ON_ONCE(!slab_is_available()))
                return NULL;

        /* anything bigger than a page is served by vmalloc, zeroed */
        if (size > PAGE_SIZE)
                return __vmalloc(size, gfp | __GFP_ZERO);

        /* up to and including one page: zeroed slab allocation */
        return kzalloc(size, gfp);
}

/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_zalloc().
 */
static void pcpu_mem_free(void *ptr)
{
        /*
         * pcpu_mem_zalloc() hands out either kzalloc() or __vmalloc()
         * memory depending on the size, so release through kvfree() rather
         * than through one specific allocator's free routine.
         */
        kvfree(ptr);
}

/*
 * Move @chunk onto the chunk list for @slot, at the head when @move_front
 * is true and at the tail otherwise.  The reserved chunk is never moved.
 */
static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
                              bool move_front)
{
        /* the reserved chunk does not live on the slot lists */
        if (chunk == pcpu_reserved_chunk)
                return;

        if (!move_front) {
                list_move_tail(&chunk->list, &pcpu_chunk_lists[slot]);
                return;
        }

        list_move(&chunk->list, &pcpu_chunk_lists[slot]);
}

/* Move @chunk to the front of the chunk list for @slot. */
static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
{
        /* move_front == true: place the chunk at the head of the slot list */
        __pcpu_chunk_move(chunk, slot, true);
}

/**
 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
 * @chunk: chunk of interest
 * @oslot: the previous slot it was on
 *
 * This function is called after an allocation or free changed @chunk.
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
        const int nslot = pcpu_chunk_slot(chunk);

        /*
         * Isolated chunks stay where they are, and there is nothing to do
         * when the slot did not change.
         */
        if (chunk->isolated || nslot == oslot)
                return;

        /* moving to a higher slot goes to the front, otherwise to the tail */
        __pcpu_chunk_move(chunk, nslot, nslot > oslot);
}

/*
 * Mark @chunk isolated and park it on the to-depopulate list.  The first
 * time a chunk is isolated, its empty populated pages are removed from the
 * global pcpu_nr_empty_pop_pages count.  Must be called with pcpu_lock held.
 */
static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        /* only adjust the accounting on the not-isolated -> isolated edge */
        if (!chunk->isolated) {
                pcpu_nr_empty_pop_pages -= chunk->nr_empty_pop_pages;
                chunk->isolated = true;
        }

        /* the list move happens even if the chunk was already isolated */
        list_move(&chunk->list, &pcpu_chunk_lists[pcpu_to_depopulate_slot]);
}

/*
 * Undo pcpu_isolate_chunk(): clear the isolated flag, add the chunk's empty
 * populated pages back to the global count and put the chunk back on its
 * regular slot list.  No-op for chunks that are not isolated.  Must be called
 * with pcpu_lock held.
 */
static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
{
        lockdep_assert_held(&pcpu_lock);

        if (!chunk->isolated)
                return;

        /* the flag must be cleared first: relocate skips isolated chunks */
        chunk->isolated = false;
        pcpu_nr_empty_pop_pages += chunk->nr_empty_pop_pages;

        /* -1 never equals a computed slot here, forcing the list move */
        pcpu_chunk_relocate(chunk, -1);
}

/*
 * pcpu_update_empty_pages - update empty page counters
 * @chunk: chunk of interest
 * @nr: nr of empty pages
 *
 * This is used to keep track of the empty pages now based on the premise
 * a md_block covers a page.  The hint update functions recognize if a block
 * is made full or broken to calculate deltas for keeping track of free pages.
 */
static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
        /* the per-chunk counter is always adjusted */
        chunk->nr_empty_pop_pages += nr;

        /* the reserved chunk and isolated chunks are not counted globally */
        if (chunk == pcpu_reserved_chunk || chunk->isolated)
                return;

        pcpu_nr_empty_pop_pages += nr;
}

/*
 * pcpu_region_overlap - determines if two regions overlap
 * @a: start of first region, inclusive
 * @b: end of first region, exclusive
 * @x: start of second region, inclusive
 * @y: end of second region, exclusive
 *
 * This is used to determine if the hint region [a, b) overlaps with the
 * allocated region [x, y).
 */
static inline bool pcpu_region_overlap(int a, int b, int x, int y)
{
        /*
         * [a, b) and [x, y) are reported as disjoint when either one starts
         * at or after the exclusive end of the other.
         */
        if (a >= y || x >= b)
                return false;

        return true;
}

/**
 * pcpu_block_update - updates a block given a free area
 * @block: block of interest
 * @start: start offset in block
 * @end: end offset in block
 *
 * Updates a block given a known free area.  The region [start, end) is
 * expected to be the entirety of the free area within a block.  Chooses
 * the best starting offset if the contig hints are equal.
 */
static void pcpu_block_update(struct pcpu_block_md *block, int start, int end)
{
        /* size of the free area [start, end) */
        int contig = end - start;

        block->first_free = min(block->first_free, start);
        /* the area touches the left edge of the block */
        if (start == 0)
                block->left_free = contig;

        /* the area touches the right edge of the block */
        if (end == block->nr_bits)
                block->right_free = contig;

        /* case 1: the area is larger than the current contig_hint */
        if (contig > block->contig_hint) {
                /* promote the old contig_hint to be the new scan_hint */
                if (start > block->contig_hint_start) {
                        if (block->contig_hint > block->scan_hint) {
                                block->scan_hint_start =
                                        block->contig_hint_start;
                                block->scan_hint = block->contig_hint;
                        } else if (start < block->scan_hint_start) {
                                /*
                                 * The old contig_hint == scan_hint.  But, the
                                 * new contig is larger so hold the invariant
                                 * scan_hint_start < contig_hint_start.
                                 */
                                block->scan_hint = 0;
                        }
                } else {
                        /*
                         * The new contig_hint starts at or before the old
                         * one, so the scan_hint is dropped rather than
                         * promoted.
                         */
                        block->scan_hint = 0;
                }
                block->contig_hint_start = start;
                block->contig_hint = contig;
        /* case 2: same size as the contig_hint - pick the better start */
        } else if (contig == block->contig_hint) {
                /*
                 * __ffs() is only evaluated on non-zero values here: the
                 * first operand guards contig_hint_start and !start
                 * short-circuits a zero start.
                 */
                if (block->contig_hint_start &&
                    (!start ||
                     __ffs(start) > __ffs(block->contig_hint_start))) {
                        /* start has a better alignment so use it */
                        block->contig_hint_start = start;
                        if (start < block->scan_hint_start &&
                            block->contig_hint > block->scan_hint)
                                block->scan_hint = 0;
                } else if (start > block->scan_hint_start ||
                           block->contig_hint > block->scan_hint) {
                        /*
                         * Knowing contig == contig_hint, update the scan_hint
                         * if it is farther than or larger than the current
                         * scan_hint.
                         */
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        /* case 3: smaller than the contig_hint - may only affect scan_hint */
        } else {
                /*
                 * The region is smaller than the contig_hint.  So only update
                 * the scan_hint if it is larger than or equal and farther than
                 * the current scan_hint.
                 */
                if ((start < block->contig_hint_start &&
                     (contig > block->scan_hint ||
                      (contig == block->scan_hint &&
                       start > block->scan_hint_start)))) {
                        block->scan_hint_start = start;
                        block->scan_hint = contig;
                }
        }
}

/*
 * pcpu_block_update_scan - update a block given a free area from a scan
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of free area
 *
 * Finding the final allocation spot first goes through pcpu_find_block_fit()
 * to find a block that can hold the allocation and then pcpu_alloc_area()
 * where a scan is used.  When allocations require specific alignments,
 * we can inadvertently create holes which will not be seen in the alloc
 * or free paths.
 *
 * This takes a given free area hole and updates a block as it may change the
 * scan_hint.  We need to scan backwards to ensure we don't miss free bits
 * from alignment.
 */
static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
                                   int bits)
{
        const int hole_start = pcpu_off_to_block_off(bit_off);
        const int hole_end = hole_start + bits;
        int blk_idx, last_set, free_start;

        /* holes that run past the end of their block are ignored */
        if (hole_end > PCPU_BITMAP_BLOCK_BITS)
                return;

        blk_idx = pcpu_off_to_block_index(bit_off);

        /*
         * Alignment may have skipped free bits in front of the hole, so look
         * backwards for the closest allocated bit.  find_last_bit() returning
         * its size argument means no allocated bit precedes the hole.
         */
        last_set = find_last_bit(pcpu_index_alloc_map(chunk, blk_idx),
                                 hole_start);
        if (last_set == hole_start)
                free_start = 0;
        else
                free_start = last_set + 1;

        pcpu_block_update(chunk->md_blocks + blk_idx, free_start, hole_end);
}

/**
 * pcpu_chunk_refresh_hint - updates metadata about a chunk
 * @chunk: chunk of interest
 * @full_scan: if we should scan from the beginning
 *
 * Iterates over the metadata blocks to find the largest contig area.
 * A full scan can be avoided on the allocation path as this is triggered
 * if we broke the contig_hint.  In doing so, the scan_hint will be before
 * the contig_hint or after if the scan_hint == contig_hint.  This cannot
 * be prevented on freeing as we want to find the largest area possibly
 * spanning blocks.
 */
static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, bits = 0;

        if (full_scan || !chunk_md->scan_hint) {
                /* start over from the first free bit with no contig_hint */
                bit_off = chunk_md->first_free;
                chunk_md->contig_hint = 0;
        } else {
                /*
                 * The scan_hint becomes the contig_hint and scanning resumes
                 * right behind it.
                 */
                bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint;
                chunk_md->contig_hint_start = chunk_md->scan_hint_start;
                chunk_md->contig_hint = chunk_md->scan_hint;
                chunk_md->scan_hint = 0;
        }

        /* feed every remaining free region into the chunk level hints */
        pcpu_for_each_md_free_region(chunk, bit_off, bits)
                pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}

/**
 * pcpu_block_refresh_hint
 * @chunk: chunk of interest
 * @index: index of the metadata block
 *
 * Scans over the block beginning at first_free and updates the block
 * metadata accordingly.
 */
static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
        unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
        struct pcpu_block_md *block = chunk->md_blocks + index;
        unsigned int rs, re;        /* free range start, free range end */

        if (!block->scan_hint) {
                /* no scan_hint to fall back on - rescan from first_free */
                rs = block->first_free;
                block->contig_hint = 0;
        } else {
                /* the scan_hint takes over as contig_hint; resume behind it */
                rs = block->scan_hint_start + block->scan_hint;
                block->contig_hint_start = block->scan_hint_start;
                block->contig_hint = block->scan_hint;
                block->scan_hint = 0;
        }

        /* pcpu_block_update() sets this again if a range reaches the end */
        block->right_free = 0;

        /* walk the remaining free ranges and fold them into the hints */
        for_each_clear_bitrange_from(rs, re, alloc_map, PCPU_BITMAP_BLOCK_BITS)
                pcpu_block_update(block, rs, re);
}

/**
 * pcpu_block_update_hint_alloc - update hint on allocation path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the allocation path.  The metadata only has to be
 * refreshed by a full scan iff the chunk's contig hint is broken.  Block level
 * scans are required if the block's contig hint is broken.
 */
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
                                         int bits)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the new allocation */
        int s_off, e_off;        /* block offsets of the new allocation */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Update s_block.
         *
         * A contig_hint covering the whole block means the block was empty
         * before this allocation, so it counts as one empty page lost.
         */
        if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;

        /*
         * block->first_free must be updated if the allocation takes its place.
         * If the allocation breaks the contig_hint, a scan is required to
         * restore this hint.
         */
        if (s_off == s_block->first_free)
                s_block->first_free = find_next_zero_bit(
                                        pcpu_index_alloc_map(chunk, s_index),
                                        PCPU_BITMAP_BLOCK_BITS,
                                        s_off + bits);

        /* the allocation landed inside the scan_hint - invalidate it */
        if (pcpu_region_overlap(s_block->scan_hint_start,
                                s_block->scan_hint_start + s_block->scan_hint,
                                s_off,
                                s_off + bits))
                s_block->scan_hint = 0;

        if (pcpu_region_overlap(s_block->contig_hint_start,
                                s_block->contig_hint_start +
                                s_block->contig_hint,
                                s_off,
                                s_off + bits)) {
                /* block contig hint is broken - scan to fix it */
                if (!s_off)
                        s_block->left_free = 0;
                pcpu_block_refresh_hint(chunk, s_index);
        } else {
                /* update left and right contig manually */
                s_block->left_free = min(s_block->left_free, s_off);
                if (s_index == e_index)
                        s_block->right_free = min_t(int, s_block->right_free,
                                        PCPU_BITMAP_BLOCK_BITS - e_off);
                else
                        /* allocation runs through the end of s_block */
                        s_block->right_free = 0;
        }

        /*
         * Update e_block.
         */
        if (s_index != e_index) {
                if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;

                /*
                 * When the allocation is across blocks, the end is along
                 * the left part of the e_block.
                 */
                e_block->first_free = find_next_zero_bit(
                                pcpu_index_alloc_map(chunk, e_index),
                                PCPU_BITMAP_BLOCK_BITS, e_off);

                if (e_off == PCPU_BITMAP_BLOCK_BITS) {
                        /*
                         * reset the block
                         *
                         * The allocation covers e_block up to its last bit.
                         * Advancing e_block makes the in-between loop below
                         * (which stops before e_block) clear this block too.
                         */
                        e_block++;
                } else {
                        if (e_off > e_block->scan_hint_start)
                                e_block->scan_hint = 0;

                        /* the allocation occupies the left edge of e_block */
                        e_block->left_free = 0;
                        if (e_off > e_block->contig_hint_start) {
                                /* contig hint is broken - scan to fix it */
                                pcpu_block_refresh_hint(chunk, e_index);
                        } else {
                                e_block->right_free =
                                        min_t(int, e_block->right_free,
                                              PCPU_BITMAP_BLOCK_BITS - e_off);
                        }
                }

                /*
                 * update in-between md_blocks
                 *
                 * Every block strictly between s_index and e_index is counted
                 * as an empty page that is now used.
                 */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->scan_hint = 0;
                        block->contig_hint = 0;
                        block->left_free = 0;
                        block->right_free = 0;
                }
        }

        /*
         * If the allocation is not atomic, some blocks may not be
         * populated with pages, while we account it here.  The number
         * of pages will be added back with pcpu_chunk_populated()
         * when populating pages.
         */
        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, -nr_empty_pages);

        /* chunk level: same overlap checks, using chunk offsets */
        if (pcpu_region_overlap(chunk_md->scan_hint_start,
                                chunk_md->scan_hint_start +
                                chunk_md->scan_hint,
                                bit_off,
                                bit_off + bits))
                chunk_md->scan_hint = 0;

        /*
         * The only time a full chunk scan is required is if the chunk
         * contig hint is broken.  Otherwise, it means a smaller space
         * was used and therefore the chunk contig hint is still correct.
         */
        if (pcpu_region_overlap(chunk_md->contig_hint_start,
                                chunk_md->contig_hint_start +
                                chunk_md->contig_hint,
                                bit_off,
                                bit_off + bits))
                pcpu_chunk_refresh_hint(chunk, false);
}

/**
 * pcpu_block_update_hint_free - updates the block hints on the free path
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of request
 *
 * Updates metadata for the free path.  This avoids a blind block
 * refresh by making use of the block contig hints.  If this fails, it scans
 * forward and backward to determine the extent of the free area.  This is
 * capped at the boundary of blocks.
 *
 * A chunk update is triggered if a page becomes free, a block becomes free,
 * or the free spans across blocks.  This tradeoff is to minimize iterating
 * over the block metadata to update chunk_md->contig_hint.
 * chunk_md->contig_hint may be off by up to a page, but it will never be more
 * than the available space.  If the contig hint is contained in one block, it
 * will be accurate.
 */
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
                                        int bits)
{
        int nr_empty_pages = 0;
        struct pcpu_block_md *s_block, *e_block, *block;
        int s_index, e_index;        /* block indexes of the freed allocation */
        int s_off, e_off;        /* block offsets of the freed allocation */
        int start, end;                /* start and end of the whole free area */

        /*
         * Calculate per block offsets.
         * The calculation uses an inclusive range, but the resulting offsets
         * are [start, end).  e_index always points to the last block in the
         * range.
         */
        s_index = pcpu_off_to_block_index(bit_off);
        e_index = pcpu_off_to_block_index(bit_off + bits - 1);
        s_off = pcpu_off_to_block_off(bit_off);
        e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1;

        s_block = chunk->md_blocks + s_index;
        e_block = chunk->md_blocks + e_index;

        /*
         * Check if the freed area aligns with the block->contig_hint.
         * If it does, then the scan to find the beginning/end of the
         * larger free area can be avoided.
         *
         * start and end refer to beginning and end of the free area
         * within each their respective blocks.  This is not necessarily
         * the entire free area as it may span blocks past the beginning
         * or end of the block.
         */
        start = s_off;
        if (s_off == s_block->contig_hint + s_block->contig_hint_start) {
                /* the freed area directly follows the contig hint */
                start = s_block->contig_hint_start;
        } else {
                /*
                 * Scan backwards to find the extent of the free area.
                 * find_last_bit returns the starting bit, so if the start bit
                 * is returned, that means there was no last bit and the
                 * beginning of the block up to the freed area is free.
                 */
                int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index),
                                          start);
                start = (start == l_bit) ? 0 : l_bit + 1;
        }

        end = e_off;
        if (e_off == e_block->contig_hint_start)
                /* the freed area directly precedes the contig hint */
                end = e_block->contig_hint_start + e_block->contig_hint;
        else
                end = find_next_bit(pcpu_index_alloc_map(chunk, e_index),
                                    PCPU_BITMAP_BLOCK_BITS, end);

        /*
         * update s_block
         *
         * When the free spans blocks, the area in s_block runs to the end
         * of that block; otherwise it ends at @end.
         */
        e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
        if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
                nr_empty_pages++;
        pcpu_block_update(s_block, start, e_off);

        /* freeing across blocks */
        if (s_index != e_index) {
                /* update e_block */
                if (end == PCPU_BITMAP_BLOCK_BITS)
                        nr_empty_pages++;
                pcpu_block_update(e_block, 0, end);

                /* reset md_blocks in the middle */
                nr_empty_pages += (e_index - s_index - 1);
                for (block = s_block + 1; block < e_block; block++) {
                        block->first_free = 0;
                        block->scan_hint = 0;
                        block->contig_hint_start = 0;
                        block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
                        block->left_free = PCPU_BITMAP_BLOCK_BITS;
                        block->right_free = PCPU_BITMAP_BLOCK_BITS;
                }
        }

        if (nr_empty_pages)
                pcpu_update_empty_pages(chunk, nr_empty_pages);

        /*
         * Refresh chunk metadata when the free makes a block free or spans
         * across blocks.  The contig_hint may be off by up to a page, but if
         * the contig_hint is contained in a block, it will be accurate with
         * the else condition below.
         *
         * In the else branch s_index == e_index, and both @start and @end
         * are offsets within that block.  The chunk level metadata works on
         * chunk offsets, so both ends must be converted; passing the block
         * relative @end alongside a chunk relative start would make
         * pcpu_block_update() compute a bogus (negative for s_index > 0)
         * size and silently skip the hint update.
         */
        if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
                pcpu_chunk_refresh_hint(chunk, true);
        else
                pcpu_block_update(&chunk->chunk_md,
                                  pcpu_block_off_to_off(s_index, start),
                                  pcpu_block_off_to_off(s_index, end));
}

/**
 * pcpu_is_populated - determines if the region is populated
 * @chunk: chunk of interest
 * @bit_off: chunk offset
 * @bits: size of area
 * @next_off: return value for the next offset to start searching
 *
 * For atomic allocations, check if the backing pages are populated.
 *
 * RETURNS:
 * Bool if the backing pages are populated.
 * When false, @next_off lets pcpu_find_block_fit skip the unpopulated pages.
 */
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
                              int *next_off)
{
        /* page range [first_pg, end_pg) backing the requested bit range */
        unsigned int first_pg = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
        unsigned int end_pg = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
        unsigned int hole, hole_end;

        /* look for the first page in the range that is not populated */
        hole = find_next_zero_bit(chunk->populated, end_pg, first_pg);
        if (hole >= end_pg)
                return true;

        /* find where the unpopulated run ends, capped at end_pg */
        hole_end = find_next_bit(chunk->populated, end_pg, hole + 1);

        /* convert that page index back into an allocation-unit offset */
        *next_off = hole_end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
        return false;
}

/**
 * pcpu_find_block_fit - finds the block index to start searching
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE bytes)
 * @pop_only: use populated regions only
 *
 * Given a chunk and an allocation spec, find the offset to begin searching
 * for a free region.  This iterates over the bitmap metadata blocks to
 * find an offset that will be guaranteed to fit the requirements.  It is
 * not quite first fit as if the allocation does not fit in the contig hint
 * of a block or chunk, it is skipped.  This errs on the side of caution
 * to prevent excess iteration.  Poor alignment can cause the allocator to
 * skip over blocks and chunks that have valid free areas.
 *
 * RETURNS:
 * The offset in the bitmap to begin searching.
 * -1 if no offset is found.
 */
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
                               size_t align, bool pop_only)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off, bits = 0, next_off;

        /*
         * Shortcut: a request that does not pass the chunk level hint check
         * is rejected without scanning, on the assumption that memory is
         * tight and a new chunk will be created soon anyway.
         */
        if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
                return -1;

        bit_off = pcpu_next_hint(chunk_md, alloc_bits);
        pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
                /* any fitting region will do when population is irrelevant */
                if (!pop_only)
                        break;

                if (pcpu_is_populated(chunk, bit_off, bits, &next_off))
                        break;

                /* restart the search behind the unpopulated pages */
                bit_off = next_off;
                bits = 0;
        }

        /* the iterator parks bit_off at the map size when nothing fits */
        return (bit_off == pcpu_chunk_map_bits(chunk)) ? -1 : bit_off;
}

/*
 * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
 * @map: the address to base the search on
 * @size: the bitmap size in bits
 * @start: the bitnumber to start searching at
 * @nr: the number of zeroed bits we're looking for
 * @align_mask: alignment mask for zero area
 * @largest_off: offset of the largest area skipped
 * @largest_bits: size of the largest area skipped
 *
 * The @align_mask should be one less than a power of 2.
 *
 * This is a modified version of bitmap_find_next_zero_area_off() to remember
 * the largest area that was skipped.  This is imperfect, but in general is
 * good enough.  The largest remembered region is the largest failed region
 * seen.  This does not include anything we possibly skipped due to alignment.
 * pcpu_block_update_scan() does scan backwards to try and recover what was
 * lost to alignment.  While this can cause scanning to miss earlier possible
 * free areas, smaller allocations will eventually fill those holes.
 */
static unsigned long pcpu_find_zero_area(unsigned long *map,
                                         unsigned long size,
                                         unsigned long start,
                                         unsigned long nr,
                                         unsigned long align_mask,
                                         unsigned long *largest_off,
                                         unsigned long *largest_bits)
{
        unsigned long cand, cand_end, blocker, gap;

        for (;;) {
                /* next free bit, rounded up to the requested alignment */
                cand = find_next_zero_bit(map, size, start);
                cand = __ALIGN_MASK(cand, align_mask);

                cand_end = cand + nr;
                if (cand_end > size)
                        return cand_end;

                /* is there an allocated bit inside [cand, cand_end)? */
                blocker = find_next_bit(map, cand_end, cand);
                if (blocker >= cand_end)
                        return cand;

                /*
                 * The candidate is too small.  Remember it if it is the
                 * largest failed area so far, or ties the largest and has a
                 * better aligned offset.
                 */
                gap = blocker - cand;
                if (gap > *largest_bits ||
                    (gap == *largest_bits && *largest_off &&
                     (!cand || __ffs(cand) > __ffs(*largest_off)))) {
                        *largest_off = cand;
                        *largest_bits = gap;
                }

                /* continue right behind the allocated bit that blocked us */
                start = blocker + 1;
        }
}

/**
 * pcpu_alloc_area - allocates an area from a pcpu_chunk
 * @chunk: chunk of interest
 * @alloc_bits: size of request in allocation units
 * @align: alignment of area (max PAGE_SIZE)
 * @start: bit_off to start searching
 *
 * Scans @chunk's allocation map from @start for a free run of @alloc_bits
 * bits that honors @align.  The scan window ends at
 * @start + @alloc_bits + PCPU_BITMAP_BLOCK_BITS or at the end of the
 * chunk's map, whichever comes first.  The scan is needed because, when
 * the request fits within the block's contig hint, @start will be
 * block->first_free in an attempt to fill the allocation prior to breaking
 * the contig hint.  Once a free area is confirmed, the allocation map, the
 * boundary map, the free byte count, the chunk's first free bit and the
 * block hints are updated and the chunk is relocated.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated addr offset in @chunk on success.
 * -1 if no matching area is found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
                           size_t align, int start)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        unsigned long hole_off = 0, hole_bits = 0;
        size_t align_mask = 0;
        int scan_end, bit_off, oslot;

        lockdep_assert_held(&pcpu_lock);

        if (align)
                align_mask = align - 1;

        /* slot before any change, handed to pcpu_chunk_relocate() below */
        oslot = pcpu_chunk_slot(chunk);

        /* look for a fit inside the bounded scan window */
        scan_end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
                         pcpu_chunk_map_bits(chunk));
        bit_off = pcpu_find_zero_area(chunk->alloc_map, scan_end, start,
                                      alloc_bits, align_mask,
                                      &hole_off, &hole_bits);
        if (bit_off >= scan_end)
                return -1;

        /* pass on the largest unused area the search stepped over, if any */
        if (hole_bits != 0)
                pcpu_block_update_scan(chunk, hole_off, hole_bits);

        /* mark the area as allocated */
        bitmap_set(chunk->alloc_map, bit_off, alloc_bits);

        /* boundary bits at both ends of the area, none in between */
        set_bit(bit_off, chunk->bound_map);
        bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1);
        set_bit(bit_off + alloc_bits, chunk->bound_map);

        chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;

        /* the first free bit only moves when the area started right on it */
        if (chunk_md->first_free == bit_off) {
                chunk_md->first_free =
                        find_next_zero_bit(chunk->alloc_map,
                                           pcpu_chunk_map_bits(chunk),
                                           bit_off + alloc_bits);
        }

        pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits);
        pcpu_chunk_relocate(chunk, oslot);

        /* callers get a byte offset, not a bit offset */
        return bit_off * PCPU_MIN_ALLOC_SIZE;
}

/**
 * pcpu_free_area - frees the corresponding offset
 * @chunk: chunk of interest
 * @off: addr offset into chunk
 *
 * The length of the allocation is not passed in; it is recovered from the
 * boundary bitmap as the distance from @off's bit to the next set boundary
 * bit.  That many bits are cleared in the allocation map, and the free
 * byte count, first free bit, block hints and chunk slot are refreshed.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Number of freed bytes.
 */
static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
        struct pcpu_block_md *chunk_md = &chunk->chunk_md;
        int bit_off = off / PCPU_MIN_ALLOC_SIZE;
        int next_bound, nr_bits, freed, oslot;

        lockdep_assert_held(&pcpu_lock);
        pcpu_stats_area_dealloc(chunk);

        /* slot before any change, handed to pcpu_chunk_relocate() below */
        oslot = pcpu_chunk_slot(chunk);

        /* the next boundary bit after the start marks the end of the area */
        next_bound = find_next_bit(chunk->bound_map,
                                   pcpu_chunk_map_bits(chunk), bit_off + 1);
        nr_bits = next_bound - bit_off;
        bitmap_clear(chunk->alloc_map, bit_off, nr_bits);

        /* account the returned bytes */
        freed = nr_bits * PCPU_MIN_ALLOC_SIZE;
        chunk->free_bytes += freed;

        /* pull the first free bit back if this area lies before it */
        if (bit_off < chunk_md->first_free)
                chunk_md->first_free = bit_off;

        pcpu_block_update_hint_free(chunk, bit_off, nr_bits);
        pcpu_chunk_relocate(chunk, oslot);

        return freed;
}

/*
 * Initialize @block's metadata for a block of @nr_bits bits: the contig
 * hint and both the left and right free counts are set to @nr_bits, while
 * the scan hint and the first free bit are set to 0.
 */
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
{
        /* size of the block in bits */
        block->nr_bits = nr_bits;

        /* free counts measured from either edge, and the first free bit */
        block->left_free = nr_bits;
        block->right_free = nr_bits;
        block->first_free = 0;

        /* hints: one contiguous run of nr_bits, no scan hint */
        block->contig_hint = nr_bits;
        block->scan_hint = 0;
}

static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
        struct pcpu_block_md *md_block;

        /* init the chunk's block */
        pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));

        for (md_block = chunk->md_blocks;
             md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
             md_block++)
                pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS);
}

/**
 * pcpu_alloc_first_chunk - creates chunks that serve the first chunk
 * @tmp_addr: the start of the region served
 * @map_size: size of the region served
 *
 * This is responsible for creating the chunks that serve the first chunk.  The
 * base_addr is @tmp_addr aligned down to a page boundary while the region end
 * is aligned up to one.  The gaps between the page aligned region and the
 * region actually served are recorded in start_offset and end_offset, and the
 * corresponding bits are marked as allocated in the bitmaps so they are never
 * handed out.  All this is done to appease the bitmap allocator in avoiding
 * partial blocks.
 *
 * The chunk and its maps come from memblock_alloc_or_panic(); no allocation
 * result is checked here.  The chunk is marked immutable and all of its pages
 * are recorded as populated.
 *
 * RETURNS:
 * Chunk serving the region at @tmp_addr of @map_size.
 */
static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
                                                         int map_size)
{
        struct pcpu_chunk *chunk;
        unsigned long aligned_addr;
        int start_offset, offset_bits, region_size, region_bits;
        size_t alloc_size;

        /* region calculations */
        aligned_addr = tmp_addr & PAGE_MASK;

        /* bytes between the page boundary and the start of the served region */
        start_offset = tmp_addr - aligned_addr;
        /* whole pages needed to cover the leading gap plus the served region */
        region_size = ALIGN(start_offset + map_size, PAGE_SIZE);

        /* allocate chunk, including its trailing populated bitmap */
        alloc_size = struct_size(chunk, populated,
                                 BITS_TO_LONGS(region_size >> PAGE_SHIFT));
        chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        INIT_LIST_HEAD(&chunk->list);

        chunk->base_addr = (void *)aligned_addr;
        chunk->start_offset = start_offset;
        /* bytes between the end of the served region and the region end */
        chunk->end_offset = region_size - chunk->start_offset - map_size;

        /* nr_pages must be set before pcpu_chunk_map_bits() is asked */
        chunk->nr_pages = region_size >> PAGE_SHIFT;
        region_bits = pcpu_chunk_map_bits(chunk);

        alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        /* the boundary map is one bit longer: bit region_bits is set below */
        alloc_size =
                BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);
#ifdef NEED_PCPUOBJ_EXT
        /* first chunk is free to use */
        chunk->obj_exts = NULL;
#endif
        pcpu_init_md_blocks(chunk);

        /* manage populated page bitmap */
        chunk->immutable = true;
        bitmap_fill(chunk->populated, chunk->nr_pages);
        chunk->nr_populated = chunk->nr_pages;
        chunk->nr_empty_pop_pages = chunk->nr_pages;

        /* only the served region counts as free, not the hidden gaps */
        chunk->free_bytes = map_size;

        if (chunk->start_offset) {
                /* hide the beginning of the bitmap */
                offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map, 0, offset_bits);
                /* boundary bits on both ends of the hidden leading area */
                set_bit(0, chunk->bound_map);
                set_bit(offset_bits, chunk->bound_map);

                chunk->chunk_md.first_free = offset_bits;

                pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
        }

        if (chunk->end_offset) {
                /* hide the end of the bitmap */
                offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE;
                bitmap_set(chunk->alloc_map,
                           pcpu_chunk_map_bits(chunk) - offset_bits,
                           offset_bits);
                /* boundary bits on both ends of the hidden trailing area */
                set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE,
                        chunk->bound_map);
                set_bit(region_bits, chunk->bound_map);

                pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk)
                                             - offset_bits, offset_bits);
        }

        return chunk;
}

/*
 * Allocate a chunk descriptor for pcpu_unit_pages pages together with its
 * allocation map, boundary map, block metadata and, when need_pcpuobj_ext()
 * says so, its pcpuobj_ext array.  Everything comes from pcpu_mem_zalloc()
 * using @gfp.  On success the block metadata is initialized and the whole
 * chunk is accounted as free.
 *
 * Returns the new chunk, or NULL if any of the allocations fails, in which
 * case whatever was allocated before the failure is released again.
 */
static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
{
        struct pcpu_chunk *chunk;
        size_t bytes;
        int region_bits;

        chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
        if (!chunk)
                return NULL;

        INIT_LIST_HEAD(&chunk->list);
        chunk->nr_pages = pcpu_unit_pages;
        region_bits = pcpu_chunk_map_bits(chunk);

        /* allocation map: one bit per allocation unit */
        bytes = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
        chunk->alloc_map = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->alloc_map)
                goto free_chunk;

        /* boundary map: one bit more than the allocation map */
        bytes = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
        chunk->bound_map = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->bound_map)
                goto free_alloc_map;

        /* per-block metadata */
        bytes = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
        chunk->md_blocks = pcpu_mem_zalloc(bytes, gfp);
        if (!chunk->md_blocks)
                goto free_bound_map;

#ifdef NEED_PCPUOBJ_EXT
        if (need_pcpuobj_ext()) {
                /* one pcpuobj_ext per bit of the allocation map */
                bytes = pcpu_chunk_map_bits(chunk) * sizeof(struct pcpuobj_ext);
                chunk->obj_exts = pcpu_mem_zalloc(bytes, gfp);
                if (!chunk->obj_exts)
                        goto free_md_blocks;
        }
#endif

        pcpu_init_md_blocks(chunk);

        /* a fresh chunk is entirely free */
        chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;

        return chunk;

        /* unwind in reverse order of allocation */
#ifdef NEED_PCPUOBJ_EXT
free_md_blocks:
        pcpu_mem_free(chunk->md_blocks);
#endif
free_bound_map:
        pcpu_mem_free(chunk->bound_map);
free_alloc_map:
        pcpu_mem_free(chunk->alloc_map);
free_chunk:
        pcpu_mem_free(chunk);

        return NULL;
}

/*
 * Release @chunk and the metadata hanging off it (pcpuobj_ext array when
 * NEED_PCPUOBJ_EXT is defined, block metadata, boundary map, allocation
 * map) through pcpu_mem_free().  A NULL @chunk is accepted and ignored.
 */
static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
        if (chunk) {
#ifdef NEED_PCPUOBJ_EXT
                pcpu_mem_free(chunk->obj_exts);
#endif
                pcpu_mem_free(chunk->md_blocks);
                pcpu_mem_free(chunk->bound_map);
                pcpu_mem_free(chunk->alloc_map);

                /* the descriptor itself goes last */
                pcpu_mem_free(chunk);
        }
}

/**
 * pcpu_chunk_populated - post-population bookkeeping
 * @chunk: pcpu_chunk which got populated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Records that pages in [@page_start,@page_end) of @chunk are now
 * populated: the pages are set in the chunk's populated bitmap, the
 * per-chunk and global populated page counts grow by the size of the
 * range, and the empty populated page accounting is updated with the same
 * amount.  Must be called after each successful population.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
                                 int page_end)
{
        int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* flag the range in the chunk's populated bitmap */
        bitmap_set(chunk->populated, page_start, nr_pages);

        /* per-chunk and global populated page counts */
        chunk->nr_populated += nr_pages;
        pcpu_nr_populated += nr_pages;

        pcpu_update_empty_pages(chunk, nr_pages);
}

/**
 * pcpu_chunk_depopulated - post-depopulation bookkeeping
 * @chunk: pcpu_chunk which got depopulated
 * @page_start: the start page
 * @page_end: the end page
 *
 * Records that pages in [@page_start,@page_end) of @chunk are no longer
 * populated: the pages are cleared in the chunk's populated bitmap, the
 * per-chunk and global populated page counts shrink by the size of the
 * range, and the empty populated page accounting is reduced by the same
 * amount.  Must be called after each successful depopulation.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
                                   int page_start, int page_end)
{
        int nr_pages = page_end - page_start;

        lockdep_assert_held(&pcpu_lock);

        /* drop the range from the chunk's populated bitmap */
        bitmap_clear(chunk->populated, page_start, nr_pages);

        /* per-chunk and global populated page counts */
        chunk->nr_populated -= nr_pages;
        pcpu_nr_populated -= nr_pages;

        pcpu_update_empty_pages(chunk, -nr_pages);
}

/*
 * Chunk management implementation.
 *
 * To allow different implementations, chunk alloc/free and
 * [de]population are implemented in a separate file which is pulled
 * into this file and compiled together.  The following functions
 * should be implemented.
 *
 * pcpu_populate_chunk       - populate the specified range of a chunk
 * pcpu_depopulate_chunk     - depopulate the specified range of a chunk
 * pcpu_post_unmap_tlb_flush - flush tlb for the specified range of a chunk
 * pcpu_create_chunk         - create a new chunk
 * pcpu_destroy_chunk        - destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page         - translate address to its struct page
 * pcpu_verify_alloc_info    - check alloc_info is acceptable during init
 *
 * The prototypes below are declared up front; the definitions come from
 * whichever implementation file is included further down.
 */
static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
                               int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
                                  int page_start, int page_end);
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
                                      int page_start, int page_end);
static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);

/* CONFIG_NEED_PER_CPU_KM selects percpu-km.c, otherwise percpu-vm.c is used */
#ifdef CONFIG_NEED_PER_CPU_KM
#include "percpu-km.c"
#else
#include "percpu-vm.c"
#endif

/**
 * pcpu_chunk_addr_search - determine chunk containing specified address
 * @addr: address for which the chunk needs to be determined.
 *
 * This is an internal function that handles all but static allocations.
 * Static percpu address values should never be passed into the allocator.
 * The first chunk and the reserved chunk are checked directly; any other
 * address is resolved through the page it maps to.
 *
 * RETURNS:
 * The address of the found chunk.
 */
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
        void *unit_addr;
        struct page *page;

        /* dynamic region, i.e. the first chunk */
        if (pcpu_addr_in_chunk(pcpu_first_chunk, addr))
                return pcpu_first_chunk;

        /* reserved region */
        if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr))
                return pcpu_reserved_chunk;

        /*
         * @addr is relative to unit0, which might be unused and thus
         * unmapped.  Shift it into the unit space of the current
         * processor before looking it up in the vmalloc space.  Any
         * possible cpu id works here, so neither preemption nor cpu
         * hotplug is a concern.
         */
        unit_addr = addr + pcpu_unit_offsets[raw_smp_processor_id()];
        page = pcpu_addr_to_page(unit_addr);

        return pcpu_get_page_chunk(page);
}

#ifdef CONFIG_MEMCG
/*
 * Charge a percpu allocation of @size bytes to the current obj_cgroup
 * before the allocation is attempted.
 *
 * Nothing is charged, and true is returned, when memcg_kmem_online() is
 * false, when @gfp lacks __GFP_ACCOUNT, or when there is no current
 * obj_cgroup.  Otherwise pcpu_obj_full_size(@size) is charged; on success
 * the charged obj_cgroup is stored in *@objcgp and true is returned, on
 * failure false is returned and *@objcgp is left untouched.
 */
static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
                                      struct obj_cgroup **objcgp)
{
        struct obj_cgroup *objcg;

        /* accounting must be both available and requested */
        if (!(memcg_kmem_online() && (gfp & __GFP_ACCOUNT)))
                return true;

        objcg = current_obj_cgroup();
        if (objcg) {
                if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size)) != 0)
                        return false;

                /* hand the charged cgroup back to the caller */
                *objcgp = objcg;
        }

        return true;
}

/*
 * Settle the charge taken by pcpu_memcg_pre_alloc_hook() once the outcome
 * of the allocation is known.  Does nothing when @objcg is NULL.
 *
 * When @chunk is non-NULL and has an obj_exts array, a reference on @objcg
 * is taken, @objcg is recorded in the slot for @off and MEMCG_PERCPU_B is
 * increased by pcpu_obj_full_size(@size).  In every other case (including
 * the NULL @chunk passed by failure paths) the charge is returned.
 */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
        if (!objcg)
                return;

        /* nowhere to record the owner: give the charge back */
        if (unlikely(!chunk || !chunk->obj_exts)) {
                obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
                return;
        }

        /* the slot keeps its own reference on the cgroup */
        obj_cgroup_get(objcg);
        chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg;

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        pcpu_obj_full_size(size));
        rcu_read_unlock();
}

/*
 * Undo the memcg accounting of the area at @off in @chunk when it is
 * freed.  Does nothing when @chunk has no obj_exts array or when no
 * obj_cgroup is recorded for @off.  Otherwise the slot is cleared, the
 * charge of pcpu_obj_full_size(@size) is returned, MEMCG_PERCPU_B is
 * decreased by the same amount and the slot's reference on the obj_cgroup
 * is dropped.
 */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        struct pcpuobj_ext *ext;
        struct obj_cgroup *objcg;

        if (unlikely(!chunk->obj_exts))
                return;

        /* detach the owner from the slot, if there is one */
        ext = &chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT];
        objcg = ext->cgroup;
        if (!objcg)
                return;
        ext->cgroup = NULL;

        obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));

        rcu_read_lock();
        mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
                        -pcpu_obj_full_size(size));
        rcu_read_unlock();

        /* pairs with the reference taken when the owner was recorded */
        obj_cgroup_put(objcg);
}

#else /* CONFIG_MEMCG */
/*
 * Without CONFIG_MEMCG there is nothing to charge: the pre-alloc hook
 * always lets the allocation proceed and never writes *@objcgp.
 */
static bool
pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
{
        return true;
}

/* Without CONFIG_MEMCG the post-alloc hook is a no-op. */
static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
                                       struct pcpu_chunk *chunk, int off,
                                       size_t size)
{
}

/* Without CONFIG_MEMCG the free hook is a no-op. */
static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif /* CONFIG_MEMCG */

#ifdef CONFIG_MEM_ALLOC_PROFILING
/*
 * Account @size bytes against current->alloc_tag for the area at @off in
 * @chunk.  Skipped when memory allocation profiling is disabled or when
 * @chunk has no obj_exts array.
 */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
        if (!mem_alloc_profiling_enabled())
                return;

        if (unlikely(!chunk->obj_exts))
                return;

        alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag,
                      current->alloc_tag, size);
}

/*
 * Remove @size bytes from the tag recorded for the area at @off in @chunk.
 * Skipped when memory allocation profiling is disabled or when @chunk has
 * no obj_exts array.
 */
static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
        if (!mem_alloc_profiling_enabled())
                return;

        if (unlikely(!chunk->obj_exts))
                return;

        alloc_tag_sub(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size);
}
#else
/* Without CONFIG_MEM_ALLOC_PROFILING the alloc tag hook is a no-op. */
static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off,
                                      size_t size)
{
}

/* Without CONFIG_MEM_ALLOC_PROFILING the free tag hook is a no-op. */
static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
{
}
#endif

/**
 * pcpu_alloc_noprof - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 * @gfp: allocation flags
 *
 * Allocate percpu area of @size bytes aligned at @align.  If @gfp doesn't
 * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN
 * then no warning will be triggered on invalid or failed allocation
 * requests.
 *
 * @size is rounded up to a multiple of PCPU_MIN_ALLOC_SIZE and @align is
 * raised to at least PCPU_MIN_ALLOC_SIZE.  Non-atomic allocations hold
 * pcpu_alloc_mutex for their whole duration; the mutex is taken killably
 * unless @gfp has __GFP_NOFAIL.  Non-atomic allocations populate any
 * missing pages of the area, and may create a new chunk when no existing
 * chunk can serve the request.  The returned area is zeroed for every
 * possible CPU.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved,
                                 gfp_t gfp)
{
        gfp_t pcpu_gfp;
        bool is_atomic;
        bool do_warn;
        struct obj_cgroup *objcg = NULL;
        /* number of failure warnings left before they are silenced */
        static int warn_limit = 10;
        struct pcpu_chunk *chunk, *next;
        const char *err;
        int slot, off, cpu, ret;
        unsigned long flags;
        void __percpu *ptr;
        size_t bits, bit_align;

        gfp = current_gfp_context(gfp);
        /* whitelisted flags that can be passed to the backing allocators */
        pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
        is_atomic = !gfpflags_allow_blocking(gfp);
        do_warn = !(gfp & __GFP_NOWARN);

        /*
         * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE,
         * therefore alignment must be a minimum of that many bytes.
         * An allocation may have internal fragmentation from rounding up
         * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes.
         */
        if (unlikely(align < PCPU_MIN_ALLOC_SIZE))
                align = PCPU_MIN_ALLOC_SIZE;

        /* from here on size and alignment are also tracked in bitmap bits */
        size = ALIGN(size, PCPU_MIN_ALLOC_SIZE);
        bits = size >> PCPU_MIN_ALLOC_SHIFT;
        bit_align = align >> PCPU_MIN_ALLOC_SHIFT;

        /* reject empty, oversized, over-aligned or non power of 2 requests */
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE ||
                     !is_power_of_2(align))) {
                WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n",
                     size, align);
                return NULL;
        }

        /*
         * Run the memcg pre-alloc hook; a false return fails the allocation.
         * Every later failure path calls pcpu_memcg_post_alloc_hook() with a
         * NULL chunk.
         */
        if (unlikely(!pcpu_memcg_pre_alloc_hook(size, gfp, &objcg)))
                return NULL;

        if (!is_atomic) {
                /*
                 * pcpu_balance_workfn() allocates memory under this mutex,
                 * and it may wait for memory reclaim. Allow current task
                 * to become OOM victim, in case of memory pressure.
                 */
                if (gfp & __GFP_NOFAIL) {
                        mutex_lock(&pcpu_alloc_mutex);
                } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
                        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
                        return NULL;
                }
        }

        spin_lock_irqsave(&pcpu_lock, flags);

        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;

                off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic);
                if (off < 0) {
                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }

                off = pcpu_alloc_area(chunk, bits, bit_align, off);
                if (off >= 0)
                        goto area_found;

                /* reserved requests never fall back to the normal chunks */
                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }

restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
                list_for_each_entry_safe(chunk, next, &pcpu_chunk_lists[slot],
                                         list) {
                        off = pcpu_find_block_fit(chunk, bits, bit_align,
                                                  is_atomic);
                        if (off < 0) {
                                /* no fit: chunks in low slots are moved to slot 0 */
                                if (slot < PCPU_SLOT_FAIL_THRESHOLD)
                                        pcpu_chunk_move(chunk, 0);
                                continue;
                        }

                        /* a fit hint may still fail; then try the next chunk */
                        off = pcpu_alloc_area(chunk, bits, bit_align, off);
                        if (off >= 0) {
                                pcpu_reintegrate_chunk(chunk);
                                goto area_found;
                        }
                }
        }

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* atomic allocations cannot create a chunk; give up here */
        if (is_atomic) {
                err = "atomic alloc failed, no space left";
                goto fail;
        }

        /* No space left.  Create a new chunk. */
        if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) {
                chunk = pcpu_create_chunk(pcpu_gfp);
                if (!chunk) {
                        err = "failed to allocate new chunk";
                        goto fail;
                }

                spin_lock_irqsave(&pcpu_lock, flags);
                pcpu_chunk_relocate(chunk, -1);
        } else {
                /* the free slot is not empty; just rescan with the lock held */
                spin_lock_irqsave(&pcpu_lock, flags);
        }

        goto restart;

area_found:
        pcpu_stats_area_alloc(chunk, size);

        /* running low on empty populated pages: kick the balance work */
        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
                pcpu_schedule_balance_work();

        spin_unlock_irqrestore(&pcpu_lock, flags);

        /* populate if not all pages are already there */
        if (!is_atomic) {
                unsigned int page_end, rs, re;

                rs = PFN_DOWN(off);
                page_end = PFN_UP(off + size);

                /* walk every unpopulated page range covered by the area */
                for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
                        WARN_ON(chunk->immutable);

                        ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);

                        /*
                         * The lock is retaken before ret is checked so that
                         * the failure path can free the area under pcpu_lock
                         * and leave through fail_unlock.
                         */
                        spin_lock_irqsave(&pcpu_lock, flags);
                        if (ret) {
                                pcpu_free_area(chunk, off);
                                err = "failed to populate";
                                goto fail_unlock;
                        }
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irqrestore(&pcpu_lock, flags);
                }

                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* clear the areas and return address relative to base address */
        for_each_possible_cpu(cpu)
                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);

        ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
        kmemleak_alloc_percpu(ptr, size, gfp);

        trace_percpu_alloc_percpu(_RET_IP_, reserved, is_atomic, size, align,
                                  chunk->base_addr, off, ptr,
                                  pcpu_obj_full_size(size), gfp);

        /* success: hand the chunk and offset to the memcg post-alloc hook */
        pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);

        pcpu_alloc_tag_alloc_hook(chunk, off, size);

        return ptr;

        /* every path that reaches the labels below has set err */
fail_unlock:
        spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
        trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);

        if (do_warn && warn_limit) {
                pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
                        size, align, is_atomic, err);
                if (!is_atomic)
                        dump_stack();
                if (!--warn_limit)
                        pr_info("limit reached, disable warning\n");
        }

        if (is_atomic) {
                /* see the flag handling in pcpu_balance_workfn() */
                pcpu_atomic_alloc_failed = true;
                pcpu_schedule_balance_work();
        } else {
                mutex_unlock(&pcpu_alloc_mutex);
        }

        /* failure: pass a NULL chunk to the memcg post-alloc hook */
        pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);

        return NULL;
}
EXPORT_SYMBOL_GPL(pcpu_alloc_noprof);

/**
 * pcpu_balance_free - manage the amount of free chunks
 * @empty_only: free chunks only if there are no populated pages
 *
 * Walks the chunks in the free slot and destroys all of them except the
 * one at the head of the list.  If @empty_only is %false, every other
 * chunk is reclaimed regardless of its populated pages.  Otherwise only
 * chunks whose nr_empty_pop_pages is zero are reclaimed.  Selected chunks
 * are depopulated range by range and then destroyed with pcpu_lock
 * dropped; the lock is only retaken briefly for the depopulation
 * bookkeeping and again before returning.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_free(bool empty_only)
{
        struct list_head *free_head = &pcpu_chunk_lists[pcpu_free_slot];
        struct pcpu_chunk *chunk, *next;
        LIST_HEAD(to_free);

        lockdep_assert_held(&pcpu_lock);

        /*
         * There's no reason to keep around multiple unused chunks and VM
         * areas can be scarce.  Collect every free chunk but one.
         */
        list_for_each_entry_safe(chunk, next, free_head, list) {
                WARN_ON(chunk->immutable);

                /* whatever sits at the head of the free slot is spared */
                if (list_first_entry(free_head, struct pcpu_chunk, list) == chunk)
                        continue;

                /* in empty_only mode keep chunks with empty populated pages */
                if (empty_only && chunk->nr_empty_pop_pages != 0)
                        continue;

                list_move(&chunk->list, &to_free);
        }

        if (list_empty(&to_free))
                return;

        /* the collected chunks are torn down without pcpu_lock held */
        spin_unlock_irq(&pcpu_lock);
        list_for_each_entry_safe(chunk, next, &to_free, list) {
                unsigned int rs, re;

                for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        pcpu_depopulate_chunk(chunk, rs, re);

                        /* the bookkeeping itself requires pcpu_lock */
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_depopulated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                pcpu_destroy_chunk(chunk);
                cond_resched();
        }
        spin_lock_irq(&pcpu_lock);
}

/**
 * pcpu_balance_populated - manage the amount of populated pages
 *
 * Maintain a certain amount of populated pages to satisfy atomic allocations.
 * It is possible that this is called when physical memory is scarce causing
 * OOM killer to be triggered.  We should avoid doing so until an actual
 * allocation causes the failure as it is possible that requests can be
 * serviced from already backed regions.
 *
 * Pages are populated in existing chunks first, walking the slots upwards
 * from the one matching PAGE_SIZE.  If the target is still not met once all
 * slots have been visited, a new chunk is created and the whole procedure is
 * retried; a failed population or a failed chunk creation ends the attempt.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 */
static void pcpu_balance_populated(void)
{
        /* gfp flags passed to underlying allocators */
        const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
        struct pcpu_chunk *chunk;
        int slot, nr_to_pop, ret;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Ensure there are certain number of free populated pages for
         * atomic allocs.  Fill up from the most packed so that atomic
         * allocs don't increase fragmentation.  If atomic allocation
         * failed previously, always populate the maximum amount.  This
         * should prevent atomic allocs larger than PAGE_SIZE from keeping
         * failing indefinitely; however, large atomic allocs are not
         * something we support properly and can be highly unreliable and
         * inefficient.
         */
retry_pop:
        if (pcpu_atomic_alloc_failed) {
                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
                /* best effort anyway, don't worry about synchronization */
                pcpu_atomic_alloc_failed = false;
        } else {
                /* top up to the high mark, never below 0 or above the mark */
                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
                                  pcpu_nr_empty_pop_pages,
                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
        }

        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
                unsigned int nr_unpop = 0, rs, re;

                if (!nr_to_pop)
                        break;

                /* pick the first chunk in this slot with unpopulated pages */
                list_for_each_entry(chunk, &pcpu_chunk_lists[slot], list) {
                        nr_unpop = chunk->nr_pages - chunk->nr_populated;
                        if (nr_unpop)
                                break;
                }

                /* nr_unpop stays 0 when the slot has no such chunk */
                if (!nr_unpop)
                        continue;

                /* @chunk can't go away while pcpu_alloc_mutex is held */
                for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
                        /* populate at most what is still needed from this range */
                        int nr = min_t(int, re - rs, nr_to_pop);

                        /* pcpu_lock is dropped around the population itself */
                        spin_unlock_irq(&pcpu_lock);
                        ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                        if (!ret) {
                                nr_to_pop -= nr;
                                pcpu_chunk_populated(chunk, rs, rs + nr);
                        } else {
                                /* population failed: stop trying altogether */
                                nr_to_pop = 0;
                        }

                        if (!nr_to_pop)
                                break;
                }
        }

        if (nr_to_pop) {
                /* ran out of chunks to populate, create a new one and retry */
                spin_unlock_irq(&pcpu_lock);
                chunk = pcpu_create_chunk(gfp);
                cond_resched();
                spin_lock_irq(&pcpu_lock);
                if (chunk) {
                        pcpu_chunk_relocate(chunk, -1);
                        goto retry_pop;
                }
        }
}

/**
 * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
 *
 * Scan over chunks in the depopulate list and try to release unused populated
 * pages back to the system.  Depopulated chunks are sidelined to prevent
 * repopulating these pages unless required.  Fully free chunks are reintegrated
 * and freed accordingly (1 is kept around).  If we drop below the empty
 * populated pages threshold, reintegrate the chunk if it has empty free pages.
 * Each chunk is scanned in the reverse order to keep populated pages close to
 * the beginning of the chunk.
 *
 * Every chunk taken off the to_depopulate list leaves that list before the
 * next iteration: it is either handed to pcpu_reintegrate_chunk() or moved
 * to the tail of the sidelined list.
 *
 * CONTEXT:
 * pcpu_lock (can be dropped temporarily)
 *
 */
static void pcpu_reclaim_populated(void)
{
        struct pcpu_chunk *chunk;
        struct pcpu_block_md *block;
        int freed_page_start, freed_page_end;
        int i, end;
        bool reintegrate;

        lockdep_assert_held(&pcpu_lock);

        /*
         * Once a chunk is isolated to the to_depopulate list, the chunk is no
         * longer discoverable to allocations which may populate pages.  The
         * only other accessor is the free path which only returns area back to
         * the allocator not touching the populated bitmap.
         */
        while ((chunk = list_first_entry_or_null(
                        &pcpu_chunk_lists[pcpu_to_depopulate_slot],
                        struct pcpu_chunk, list))) {
                WARN_ON(chunk->immutable);

                /*
                 * Scan chunk's pages in the reverse order to keep populated
                 * pages close to the beginning of the chunk.
                 *
                 * freed_page_start/freed_page_end accumulate the span that
                 * covers every range depopulated from this chunk.  They start
                 * out as an empty span (start > end) so the TLB flush below is
                 * skipped when nothing was freed.  end == -1 means no range of
                 * empty populated pages is currently being collected.
                 */
                freed_page_start = chunk->nr_pages;
                freed_page_end = 0;
                reintegrate = false;
                for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
                        /* no more work to do */
                        if (chunk->nr_empty_pop_pages == 0)
                                break;

                        /* reintegrate chunk to prevent atomic alloc failures */
                        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) {
                                reintegrate = true;
                                break;
                        }

                        /*
                         * If the page is empty and populated, start or
                         * extend the (i, end) range.  If i == 0, decrease
                         * i and perform the depopulation to cover the last
                         * (first) page in the chunk.
                         */
                        block = chunk->md_blocks + i;
                        if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
                            test_bit(i, chunk->populated)) {
                                if (end == -1)
                                        end = i;
                                if (i > 0)
                                        continue;
                                /*
                                 * i becomes -1 here so that i + 1 below is
                                 * page 0; the loop's own i-- then ends the
                                 * scan.
                                 */
                                i--;
                        }

                        /* depopulate if there is an active range */
                        if (end == -1)
                                continue;

                        /*
                         * Page i is not part of the range, so the pages to
                         * release are [i + 1, end + 1).  pcpu_lock is dropped
                         * around the depopulation and the reschedule point and
                         * retaken before the chunk's bookkeeping is updated.
                         */
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_depopulate_chunk(chunk, i + 1, end + 1);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);

                        pcpu_chunk_depopulated(chunk, i + 1, end + 1);
                        freed_page_start = min(freed_page_start, i + 1);
                        freed_page_end = max(freed_page_end, end + 1);

                        /* reset the range and continue */
                        end = -1;
                }

                /* batch tlb flush per chunk to amortize cost */
                if (freed_page_start < freed_page_end) {
                        spin_unlock_irq(&pcpu_lock);
                        pcpu_post_unmap_tlb_flush(chunk,
                                                  freed_page_start,
                                                  freed_page_end);
                        cond_resched();
                        spin_lock_irq(&pcpu_lock);
                }

                /*
                 * A chunk that hit the low-watermark check above, or that is
                 * entirely free, goes back through pcpu_reintegrate_chunk();
                 * anything else is parked on the sidelined list.
                 */
                if (reintegrate || chunk->free_bytes == pcpu_unit_size)
                        pcpu_reintegrate_chunk(chunk);
                else
                        list_move_tail(&chunk->list,
                                       &pcpu_chunk_lists[pcpu_sidelined_slot]);
        }
}

/**
 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
 * @work: unused
 *
 * Run one balancing pass with pcpu_alloc_mutex and pcpu_lock held: trim free
 * chunks, reclaim pages from chunks queued for depopulation, top up the
 * populated pages and finally trim free chunks once more.  An important thing
 * to consider is when pages are freed and how they contribute to the global
 * counts.
 */
static void pcpu_balance_workfn(struct work_struct *work)
{
        unsigned int noio_flags;

        /*
         * Enforce GFP_NOIO allocations for the whole pass because we have
         * pcpu_alloc users constrained to GFP_NOIO/NOFS contexts and they
         * could form lock dependency through pcpu_alloc_mutex.
         */
        noio_flags = memalloc_noio_save();

        mutex_lock(&pcpu_alloc_mutex);
        spin_lock_irq(&pcpu_lock);

        /*
         * pcpu_balance_free() runs twice: the first call may trim pages that
         * count towards pcpu_nr_empty_pop_pages, which may cause us to grow
         * other chunks.  pcpu_reclaim_populated() in between gets the chance
         * to move fully free chunks to the active list so the second call
         * can free them if appropriate.
         */
        pcpu_balance_free(false);
        pcpu_reclaim_populated();
        pcpu_balance_populated();
        pcpu_balance_free(true);

        spin_unlock_irq(&pcpu_lock);
        mutex_unlock(&pcpu_alloc_mutex);

        memalloc_noio_restore(noio_flags);
}

/**
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
 * Return the percpu area @ptr to its chunk.  A NULL @ptr is ignored.  When
 * the free leaves work for the balancer (a surplus fully free chunk, or a
 * chunk that should be reclaimed), the balance work is scheduled after
 * pcpu_lock has been released.
 *
 * CONTEXT:
 * Can be called from atomic context.
 */
void free_percpu(void __percpu *ptr)
{
        struct pcpu_chunk *chunk;
        unsigned long irq_flags;
        bool kick_balance = false;
        int area_off, area_size;
        void *addr;

        if (!ptr)
                return;

        kmemleak_free_percpu(ptr);

        /* locate the owning chunk and the area's offset inside it */
        addr = __pcpu_ptr_to_addr(ptr);
        chunk = pcpu_chunk_addr_search(addr);
        area_off = addr - chunk->base_addr;

        spin_lock_irqsave(&pcpu_lock, irq_flags);

        area_size = pcpu_free_area(chunk, area_off);
        pcpu_alloc_tag_free_hook(chunk, area_off, area_size);
        pcpu_memcg_free_hook(chunk, area_off, area_size);

        if (chunk->isolated || chunk->free_bytes != pcpu_unit_size) {
                /*
                 * Either the chunk still has allocations or it is isolated.
                 * An isolated chunk may be in the process of being
                 * reclaimed; whether anything more is needed is left to
                 * pcpu_should_reclaim_chunk().
                 */
                if (pcpu_should_reclaim_chunk(chunk)) {
                        pcpu_isolate_chunk(chunk);
                        kick_balance = true;
                }
        } else {
                struct pcpu_chunk *other;

                /*
                 * This chunk just became fully free.  If the free slot holds
                 * any chunk besides this one there is more than one fully
                 * free chunk, so wake up the grim reaper.
                 */
                list_for_each_entry(other, &pcpu_chunk_lists[pcpu_free_slot],
                                    list) {
                        if (other == chunk)
                                continue;

                        kick_balance = true;
                        break;
                }
        }

        trace_percpu_free_percpu(chunk->base_addr, area_off, ptr);

        spin_unlock_irqrestore(&pcpu_lock, irq_flags);

        if (kick_balance)
                pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);

/*
 * Test whether @addr lies inside the static percpu region of any possible
 * CPU's unit.  On a match, and if @can_addr is non-NULL, *@can_addr is set to
 * the matching offset applied to the boot CPU's unit, i.e. one address per
 * static percpu variable regardless of which CPU's copy was passed in.
 */
bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
{
#ifdef CONFIG_SMP
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        void *va = (void *)addr;
        unsigned int cpu;

        for_each_possible_cpu(cpu) {
                void *unit_start = per_cpu_ptr(base, cpu);

                /* not within this cpu's static area, try the next one */
                if (va < unit_start || va >= unit_start + static_size)
                        continue;

                if (can_addr) {
                        unsigned long boot_start;

                        boot_start = (unsigned long)
                                per_cpu_ptr(base, get_boot_cpu_id());
                        *can_addr = (unsigned long)(va - unit_start) +
                                    boot_start;
                }
                return true;
        }
#endif
        /* on UP, can't distinguish from other static vars, always false */
        return false;
}

/**
 * is_kernel_percpu_address - test whether address is from static percpu area
 * @addr: address to test
 *
 * Check whether @addr falls inside the in-kernel static percpu area.  Static
 * percpu areas of modules are not covered by this test; use
 * is_module_percpu_address() for those.
 *
 * RETURNS:
 * %true if @addr is from in-kernel static percpu area, %false otherwise.
 */
bool is_kernel_percpu_address(unsigned long addr)
{
        /* the caller has no use for the canonical address */
        bool is_static = __is_kernel_percpu_address(addr, NULL);

        return is_static;
}

/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
 * Translate @addr, a dereferenceable address obtained via one of the percpu
 * access macros, into its physical address.  The caller is responsible for
 * ensuring @addr stays valid until this function finishes.
 *
 * The first chunk is set up specially: it is either embedded in the linear
 * address space or vmalloc mapped.  For every later chunk the backing
 * allocator (currently either vm or km) provides the translation.
 *
 * The addr can be translated simply without checking if it falls into the
 * first chunk. But the current code reflects better how percpu allocator
 * actually works, and the verification can discover both bugs in percpu
 * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current
 * code.
 *
 * RETURNS:
 * The physical address for @addr.
 */
phys_addr_t per_cpu_ptr_to_phys(void *addr)
{
        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
        const unsigned long uaddr = (unsigned long)addr;
        unsigned long first_low, first_high;
        bool in_first_chunk = false;
        unsigned int cpu;

        /*
         * Bounds of the first chunk over all units, measured against full
         * chunk sizes.  pcpu_base_addr points to the beginning of the first
         * chunk including the static region.  Assumes good intent as the
         * first chunk may not be full (ie. < pcpu_unit_pages in size).
         */
        first_low = (unsigned long)pcpu_base_addr +
                    pcpu_unit_page_offset(pcpu_low_unit_cpu, 0);
        first_high = (unsigned long)pcpu_base_addr +
                     pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages);

        /*
         * The range pre-check isn't strictly necessary but speeds up
         * lookups of addresses which aren't in the first chunk.
         */
        if (uaddr >= first_low && uaddr < first_high) {
                for_each_possible_cpu(cpu) {
                        void *unit_start = per_cpu_ptr(base, cpu);

                        if (addr < unit_start ||
                            addr >= unit_start + pcpu_unit_size)
                                continue;

                        in_first_chunk = true;
                        break;
                }
        }

        /* later chunks: ask the backing allocator for the page */
        if (!in_first_chunk)
                return page_to_phys(pcpu_addr_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk, vmalloc mapped */
        if (is_vmalloc_addr(addr))
                return page_to_phys(vmalloc_to_page(addr)) +
                       offset_in_page(addr);

        /* first chunk, embedded in the linear mapping */
        return __pa(addr);
}

/**
 * pcpu_alloc_alloc_info - allocate percpu allocation info
 * @nr_groups: the number of groups
 * @nr_units: the number of units
 *
 * Allocate a single memblock region holding an ai with room for @nr_groups
 * groups, followed by a cpu_map array with @nr_units entries.  The returned
 * ai's groups[0].cpu_map points at that array and every entry is set to
 * NR_CPUS.  It's the caller's responsibility to initialize cpu_map
 * pointer of other groups.
 *
 * RETURNS:
 * Pointer to the allocated pcpu_alloc_info on success, NULL on
 * failure.
 */
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
                                                      int nr_units)
{
        struct pcpu_alloc_info *ai;
        size_t map_off, total, alloc_bytes;
        int unit;

        /* the cpu_map array follows the groups[] array, suitably aligned */
        map_off = ALIGN(struct_size(ai, groups, nr_groups),
                        __alignof__(ai->groups[0].cpu_map[0]));
        total = map_off + nr_units * sizeof(ai->groups[0].cpu_map[0]);

        /* whole pages are allocated and the rounded size is recorded */
        alloc_bytes = PFN_ALIGN(total);

        ai = memblock_alloc(alloc_bytes, PAGE_SIZE);
        if (!ai)
                return NULL;

        ai->groups[0].cpu_map = (void *)ai + map_off;

        for (unit = 0; unit < nr_units; unit++)
                ai->groups[0].cpu_map[unit] = NR_CPUS;

        ai->nr_groups = nr_groups;
        ai->__ai_size = alloc_bytes;

        return ai;
}

/**
 * pcpu_free_alloc_info - free percpu allocation info
 * @ai: pcpu_alloc_info to free
 *
 * Give @ai, as obtained from pcpu_alloc_alloc_info(), back to memblock.  The
 * size released is the one recorded in @ai->__ai_size.
 */
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
        size_t ai_size = ai->__ai_size;

        memblock_free(ai, ai_size);
}

/**
 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
 * @lvl: loglevel
 * @ai: allocation info to dump
 *
 * Print a summary line with the sizes stored in @ai, followed by the
 * group/cpu assignment of every unit, using loglevel @lvl.  Unit slots
 * holding NR_CPUS are printed as dashes.
 */
static void pcpu_dump_alloc_info(const char *lvl,
                                 const struct pcpu_alloc_info *ai)
{
        char empty_str[] = "--------";
        int group_width = 1, cpu_width = 1;
        int alloc = 0, alloc_end = 0;
        int upa, apl;        /* units per alloc, allocs per line */
        int width, group, digits, cut;

        /* number of decimal digits needed for group and cpu numbers */
        digits = ai->nr_groups;
        for (digits /= 10; digits; digits /= 10)
                group_width++;

        digits = num_possible_cpus();
        for (digits /= 10; digits; digits /= 10)
                cpu_width++;

        /* trim the placeholder to the cpu column width */
        cut = min_t(int, cpu_width, sizeof(empty_str) - 1);
        empty_str[cut] = '\0';

        upa = ai->alloc_size / ai->unit_size;
        width = upa * (cpu_width + 1) + group_width + 3;
        apl = rounddown_pow_of_two(max(60 / width, 1));

        printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
               lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
               ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);

        for (group = 0; group < ai->nr_groups; group++) {
                const struct pcpu_group_info *gi = &ai->groups[group];
                int unit = 0, unit_end = 0;

                BUG_ON(gi->nr_units % upa);

                alloc_end += gi->nr_units / upa;
                while (alloc < alloc_end) {
                        /* start a new output line every apl allocations */
                        if (alloc % apl == 0) {
                                pr_cont("\n");
                                printk("%spcpu-alloc: ", lvl);
                        }
                        pr_cont("[%0*d] ", group_width, group);

                        unit_end += upa;
                        while (unit < unit_end) {
                                unsigned int cpu = gi->cpu_map[unit];

                                if (cpu == NR_CPUS)
                                        pr_cont("%s ", empty_str);
                                else
                                        pr_cont("%0*d ", cpu_width, cpu);
                                unit++;
                        }
                        alloc++;
                }
        }
        pr_cont("\n");
}

/**
 * pcpu_setup_first_chunk - initialize the first percpu chunk
 * @ai: pcpu_alloc_info describing how the percpu area is shaped
 * @base_addr: mapped address
 *
 * Initialize the first percpu chunk which contains the kernel static
 * percpu area.  This function is to be called from arch percpu area
 * setup path.
 *
 * @ai contains all information necessary to initialize the first
 * chunk and prime the dynamic percpu allocator.
 *
 * @ai->static_size is the size of static percpu area.
 *
 * @ai->reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @ai->dyn_size determines the number of bytes available for dynamic
 * allocation in the first chunk.  The area between @ai->static_size +
 * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
 *
 * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
 * and equal to or larger than @ai->static_size + @ai->reserved_size +
 * @ai->dyn_size.
 *
 * @ai->atom_size is the allocation atom size and used as alignment
 * for vm areas.
 *
 * @ai->alloc_size is the allocation size and always multiple of
 * @ai->atom_size.  This is larger than @ai->atom_size if
 * @ai->unit_size is larger than @ai->atom_size.
 *
 * @ai->nr_groups and @ai->groups describe virtual memory layout of
 * percpu areas.  Units which should be colocated are put into the
 * same group.  Dynamic VM areas will be allocated according to these
 * groupings.  If @ai->nr_groups is zero, a single group containing
 * all units is assumed.
 *
 * The caller should have mapped the first chunk at @base_addr and
 * copied static data to each unit.
 *
 * The first chunk will always contain a static and a dynamic region.
 * However, the static region is not managed by any chunk.  If the first
 * chunk also contains a reserved region, it is served by two chunks -
 * one for the reserved region and one for the dynamic region.  They
 * share the same vm, but use offset regions in the area allocation map.
 * The chunk serving the dynamic region is circulated in the chunk slots
 * and available for dynamic allocation like any other chunk.
 */
void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
                                   void *base_addr)
{
        size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        size_t static_size, dyn_size;
        unsigned long *group_offsets;
        size_t *group_sizes;
        unsigned long *unit_off;
        unsigned int cpu;
        int *unit_map;
        int group, unit, i;
        unsigned long tmp_addr;
        size_t alloc_size;

/*
 * On a failed sanity check, log the failing condition, the possible cpu mask
 * and the contents of @ai at emergency level before calling BUG().
 */
#define PCPU_SETUP_BUG_ON(cond)        do {                                        \
        if (unlikely(cond)) {                                                \
                pr_emerg("failed to initialize, %s\n", #cond);                \
                pr_emerg("cpu_possible_mask=%*pb\n",                        \
                         cpumask_pr_args(cpu_possible_mask));                \
                pcpu_dump_alloc_info(KERN_EMERG, ai);                        \
                BUG();                                                        \
        }                                                                \
} while (0)

        /* sanity checks */
        PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
#ifdef CONFIG_SMP
        PCPU_SETUP_BUG_ON(!ai->static_size);
        PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start));
#endif
        PCPU_SETUP_BUG_ON(!base_addr);
        PCPU_SETUP_BUG_ON(offset_in_page(base_addr));
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size));
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE));
        PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE);
        PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE));
        PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) ||
                            IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE)));
        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);

        /* process group information and build config tables accordingly */
        alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
        group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
        group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
        unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
        unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES);

        /* UINT_MAX marks a cpu that has not been assigned a unit yet */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++)
                unit_map[cpu] = UINT_MAX;

        /* NR_CPUS means the lowest/highest unit cpu is not determined yet */
        pcpu_low_unit_cpu = NR_CPUS;
        pcpu_high_unit_cpu = NR_CPUS;

        /*
         * Walk every unit slot of every group.  When the inner loop is done
         * i equals gi->nr_units, so the outer "unit += i" advances the
         * running unit index past the whole group.
         */
        for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
                const struct pcpu_group_info *gi = &ai->groups[group];

                group_offsets[group] = gi->base_offset;
                group_sizes[group] = gi->nr_units * ai->unit_size;

                for (i = 0; i < gi->nr_units; i++) {
                        cpu = gi->cpu_map[i];
                        /* slot without a cpu, nothing to record */
                        if (cpu == NR_CPUS)
                                continue;

                        /* the cpu must be valid, possible and not seen before */
                        PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids);
                        PCPU_SETUP_BUG_ON(!cpu_possible(cpu));
                        PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX);

                        unit_map[cpu] = unit + i;
                        unit_off[cpu] = gi->base_offset + i * ai->unit_size;

                        /* determine low/high unit_cpu */
                        if (pcpu_low_unit_cpu == NR_CPUS ||
                            unit_off[cpu] < unit_off[pcpu_low_unit_cpu])
                                pcpu_low_unit_cpu = cpu;
                        if (pcpu_high_unit_cpu == NR_CPUS ||
                            unit_off[cpu] > unit_off[pcpu_high_unit_cpu])
                                pcpu_high_unit_cpu = cpu;
                }
        }
        pcpu_nr_units = unit;

        /* every possible cpu must have ended up with a unit */
        for_each_possible_cpu(cpu)
                PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX);

        /* we're done parsing the input, undefine BUG macro and dump config */
#undef PCPU_SETUP_BUG_ON
        pcpu_dump_alloc_info(KERN_DEBUG, ai);

        /* publish the tables built above */
        pcpu_nr_groups = ai->nr_groups;
        pcpu_group_offsets = group_offsets;
        pcpu_group_sizes = group_sizes;
        pcpu_unit_map = unit_map;
        pcpu_unit_offsets = unit_off;

        /* determine basic parameters */
        pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
        pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
        pcpu_atom_size = ai->atom_size;
        /* a chunk struct ends with a populated bitmap of one bit per unit page */
        pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, populated,
                                             BITS_TO_LONGS(pcpu_unit_pages));

        pcpu_stats_save_ai(ai);

        /*
         * Allocate chunk slots.  The slots after the active slots are:
         *   sidelined_slot - isolated, depopulated chunks
         *   free_slot - fully free chunks
         *   to_depopulate_slot - isolated, chunks to depopulate
         */
        pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
        pcpu_free_slot = pcpu_sidelined_slot + 1;
        pcpu_to_depopulate_slot = pcpu_free_slot + 1;
        pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
        pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots *
                                          sizeof(pcpu_chunk_lists[0]),
                                          SMP_CACHE_BYTES);

        for (i = 0; i < pcpu_nr_slots; i++)
                INIT_LIST_HEAD(&pcpu_chunk_lists[i]);

        /*
         * The end of the static region needs to be aligned with the
         * minimum allocation size as this offsets the reserved and
         * dynamic region.  The first chunk ends page aligned by
         * expanding the dynamic region, therefore the dynamic region
         * can be shrunk to compensate while still staying above the
         * configured sizes.
         */
        static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE);
        dyn_size = ai->dyn_size - (static_size - ai->static_size);

        /*
         * Initialize first chunk:
         * This chunk is broken up into 3 parts:
         *                < static | [reserved] | dynamic >
         * - static - there is no backing chunk because these allocations can
         *   never be freed.
         * - reserved (pcpu_reserved_chunk) - exists primarily to serve
         *   allocations from module load.
         * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first
         *   chunk.
         */
        tmp_addr = (unsigned long)base_addr + static_size;
        /* the reserved chunk is only created when a reserved size was given */
        if (ai->reserved_size)
                pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr,
                                                ai->reserved_size);
        tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size;
        pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size);

        /* only the dynamic chunk is accounted and placed into a slot here */
        pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages;
        pcpu_chunk_relocate(pcpu_first_chunk, -1);

        /* include all regions of the first chunk */
        pcpu_nr_populated += PFN_DOWN(size_sum);

        pcpu_stats_chunk_alloc();
        trace_percpu_create_chunk(base_addr);

        /* we're done */
        pcpu_base_addr = base_addr;
}

#ifdef CONFIG_SMP

/* names of the first chunk allocators, indexed by enum pcpu_fc */
const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = {
        [PCPU_FC_AUTO]        = "auto",
        [PCPU_FC_EMBED]        = "embed",
        [PCPU_FC_PAGE]        = "page",
};

/*
 * First chunk allocator selection; stays PCPU_FC_AUTO unless the
 * "percpu_alloc" early parameter picks a different one.
 */
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;

/*
 * Parse the "percpu_alloc" early parameter.  Only the allocators built into
 * this configuration are recognized; any other string is reported with a
 * warning and otherwise ignored.  A missing argument yields -EINVAL, every
 * other case returns 0.
 */
static int __init percpu_alloc_setup(char *str)
{
        if (!str)
                return -EINVAL;

#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
        if (!strcmp(str, "embed")) {
                pcpu_chosen_fc = PCPU_FC_EMBED;
                return 0;
        }
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
        if (!strcmp(str, "page")) {
                pcpu_chosen_fc = PCPU_FC_PAGE;
                return 0;
        }
#endif

        pr_warn("unknown allocator %s specified\n", str);
        return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);

/*
 * pcpu_embed_first_chunk() is used by the generic percpu setup.
 * Build it if needed by the arch config or the generic setup is going
 * to be used.
 */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
        !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
#define BUILD_EMBED_FIRST_CHUNK
#endif

/* build pcpu_page_first_chunk() iff needed by the arch config */
#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#define BUILD_PAGE_FIRST_CHUNK
#endif

/* pcpu_build_alloc_info() is used by both embed and page first chunk */
#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
/**
 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 *
 * This function determines grouping of units, their mappings to cpus
 * and other parameters considering needed percpu size, allocation
 * atom size and distances between CPUs.
 *
 * Groups are always multiples of atom size and CPUs which are of
 * LOCAL_DISTANCE both ways are grouped together and share space for
 * units in the same group.  The returned configuration is guaranteed
 * to have CPUs on different nodes on different groups and >=75% usage
 * of allocated virtual address space.
 *
 * If @cpu_distance_fn is NULL, all possible CPUs end up in a single group.
 *
 * RETURNS:
 * On success, pointer to the new allocation_info is returned.  On
 * failure, ERR_PTR value is returned.
 */
static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
                                size_t reserved_size, size_t dyn_size,
                                size_t atom_size,
                                pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
        /* scratch state: group_map is indexed by cpu, group_cnt by group */
        static int group_map[NR_CPUS] __initdata;
        static int group_cnt[NR_CPUS] __initdata;
        static struct cpumask mask __initdata;
        const size_t static_size = __per_cpu_end - __per_cpu_start;
        int nr_groups = 1, nr_units = 0;
        size_t size_sum, min_unit_size, alloc_size;
        int upa, max_upa, best_upa;        /* units_per_alloc */
        int last_allocs, group, unit;
        unsigned int cpu, tcpu;
        struct pcpu_alloc_info *ai;
        unsigned int *cpu_map;

        /* this function may be called multiple times */
        memset(group_map, 0, sizeof(group_map));
        memset(group_cnt, 0, sizeof(group_cnt));
        cpumask_clear(&mask);

        /* calculate size_sum and ensure dyn_size is enough for early alloc */
        size_sum = PFN_ALIGN(static_size + reserved_size +
                            max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
        /* the page-alignment slack is handed to the dynamic area */
        dyn_size = size_sum - static_size - reserved_size;

        /*
         * Determine min_unit_size, alloc_size and max_upa such that
         * alloc_size is multiple of atom_size and is the smallest
         * which can accommodate 4k aligned segments which are equal to
         * or larger than min_unit_size.
         */
        min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

        /* determine the maximum # of units that can fit in an allocation */
        alloc_size = roundup(min_unit_size, atom_size);
        upa = alloc_size / min_unit_size;
        /* shrink upa until alloc_size splits evenly into page-aligned units */
        while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                upa--;
        max_upa = upa;

        cpumask_copy(&mask, cpu_possible_mask);

        /* group cpus according to their proximity */
        for (group = 0; !cpumask_empty(&mask); group++) {
                /* pop the group's first cpu */
                cpu = cpumask_first(&mask);
                group_map[cpu] = group;
                group_cnt[group]++;
                cpumask_clear_cpu(cpu, &mask);

                /*
                 * Pull in every remaining cpu that is LOCAL_DISTANCE from the
                 * group's first cpu in both directions, or every remaining
                 * cpu if no distance callback was supplied.
                 */
                for_each_cpu(tcpu, &mask) {
                        if (!cpu_distance_fn ||
                            (cpu_distance_fn(cpu, tcpu) == LOCAL_DISTANCE &&
                             cpu_distance_fn(tcpu, cpu) == LOCAL_DISTANCE)) {
                                group_map[tcpu] = group;
                                group_cnt[group]++;
                                cpumask_clear_cpu(tcpu, &mask);
                        }
                }
        }
        nr_groups = group;

        /*
         * Wasted space is caused by a ratio imbalance of upa to group_cnt.
         * Expand the unit_size until we use >= 75% of the units allocated.
         * Related to atom_size, which could be much larger than the unit_size.
         */
        last_allocs = INT_MAX;
        best_upa = 0;
        for (upa = max_upa; upa; upa--) {
                int allocs = 0, wasted = 0;

                /* only consider upa values that yield page-aligned units */
                if (alloc_size % upa || (offset_in_page(alloc_size / upa)))
                        continue;

                /* count allocations needed and unit slots left without a cpu */
                for (group = 0; group < nr_groups; group++) {
                        int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
                        allocs += this_allocs;
                        wasted += this_allocs * upa - group_cnt[group];
                }

                /*
                 * Don't accept if wastage is over 1/3.  The
                 * greater-than comparison ensures upa==1 always
                 * passes the following check.
                 */
                if (wasted > num_possible_cpus() / 3)
                        continue;

                /* and then don't consume more memory */
                if (allocs > last_allocs)
                        break;
                last_allocs = allocs;
                best_upa = upa;
        }
        BUG_ON(!best_upa);
        upa = best_upa;

        /* allocate and fill alloc_info */
        /* each group's unit count is rounded up to a whole number of allocs */
        for (group = 0; group < nr_groups; group++)
                nr_units += roundup(group_cnt[group], upa);

        ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
        if (!ai)
                return ERR_PTR(-ENOMEM);
        cpu_map = ai->groups[0].cpu_map;

        /* carve the single cpu_map array into consecutive per-group slices */
        for (group = 0; group < nr_groups; group++) {
                ai->groups[group].cpu_map = cpu_map;
                cpu_map += roundup(group_cnt[group], upa);
        }

        ai->static_size = static_size;
        ai->reserved_size = reserved_size;
        ai->dyn_size = dyn_size;
        ai->unit_size = alloc_size / upa;
        ai->atom_size = atom_size;
        ai->alloc_size = alloc_size;

        for (group = 0, unit = 0; group < nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];

                /*
                 * Initialize base_offset as if all groups are located
                 * back-to-back.  The caller should update this to
                 * reflect actual allocation.
                 */
                gi->base_offset = unit * ai->unit_size;

                /*
                 * Fill the leading slots with the group's cpus; the slots
                 * added by the roundup below are not written here.
                 */
                for_each_possible_cpu(cpu)
                        if (group_map[cpu] == group)
                                gi->cpu_map[gi->nr_units++] = cpu;
                gi->nr_units = roundup(gi->nr_units, upa);
                unit += gi->nr_units;
        }
        BUG_ON(unit != nr_units);

        return ai;
}

/*
 * Boot-time allocation of @size bytes (aligned to @align) for @cpu's part of
 * the first chunk.  Every request passes __pa(MAX_DMA_ADDRESS) as the
 * memblock address goal.
 *
 * With CONFIG_NUMA, @cpu_to_nd_fn (optional) names the node to allocate on.
 * If there is no callback, or the node it returns is NUMA_NO_NODE, offline
 * or has no NODE_DATA, the allocation is made without a node constraint and
 * that fact is logged.
 *
 * Returns whatever the memblock allocator returned; callers check for NULL.
 */
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
                                   pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
        int node = cpu_to_nd_fn ? cpu_to_nd_fn(cpu) : NUMA_NO_NODE;
        void *ptr;

        /* usable node: allocate node-local and be done */
        if (node != NUMA_NO_NODE && node_online(node) && NODE_DATA(node)) {
                ptr = memblock_alloc_try_nid(size, align, goal,
                                             MEMBLOCK_ALLOC_ACCESSIBLE,
                                             node);

                pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
                         cpu, size, node, (u64)__pa(ptr));
                return ptr;
        }

        /* no usable node for this cpu: take memory from anywhere */
        ptr = memblock_alloc_from(size, align, goal);
        pr_info("cpu %d has no node %d or node-local memory\n",
                cpu, node);
        pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
                 cpu, size, (u64)__pa(ptr));
        return ptr;
#else
        return memblock_alloc_from(size, align, goal);
#endif
}

/*
 * Hand @size bytes starting at @ptr back through memblock_free().  Used in
 * this file to release first-chunk memory obtained from pcpu_fc_alloc(),
 * either whole or just the unused part of an allocation.
 */
static void __init pcpu_fc_free(void *ptr, size_t size)
{
        memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */

#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: minimum free size for dynamic allocation in bytes
 * @atom_size: allocation atom size
 * @cpu_distance_fn: callback to determine distance between cpus, optional
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * by calling pcpu_fc_alloc and used as-is without being mapped into
 * vmalloc area.  Allocations are always whole multiples of @atom_size
 * aligned to @atom_size.
 *
 * This enables the first chunk to piggy back on the linear physical
 * mapping which often uses larger page size.  Please note that this
 * can result in very sparse cpu->unit mapping on NUMA machines thus
 * requiring large vmalloc address space.  Don't use this allocator if
 * vmalloc space is not orders of magnitude larger than distances
 * between node memory addresses (ie. 32bit NUMA machines).
 *
 * @dyn_size specifies the minimum dynamic area size.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned using pcpu_fc_free.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        /* start at the top so that min() below settles on the lowest area */
        void *base = (void *)ULONG_MAX;
        void **areas = NULL;
        struct pcpu_alloc_info *ai;
        size_t size_sum, areas_size;
        unsigned long max_distance;
        int group, i, highest_group, rc = 0;

        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
                                   cpu_distance_fn);
        if (IS_ERR(ai))
                return PTR_ERR(ai);

        /* bytes of each unit that are actually used */
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
        /* one pointer per group; the same size is passed to memblock_free() */
        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));

        areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
        if (!areas) {
                rc = -ENOMEM;
                goto out_free;
        }

        /* allocate, copy and determine base address & max_distance */
        highest_group = 0;
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                unsigned int cpu = NR_CPUS;
                void *ptr;

                /*
                 * Find the first unit of the group that maps to a real cpu
                 * (NR_CPUS marks an unused unit); that cpu is passed to
                 * pcpu_fc_alloc() for the whole group's allocation.
                 */
                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
                        cpu = gi->cpu_map[i];
                BUG_ON(cpu == NR_CPUS);

                /* allocate space for the whole group */
                ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
                if (!ptr) {
                        rc = -ENOMEM;
                        goto out_free_areas;
                }
                /* kmemleak tracks the percpu allocations separately */
                kmemleak_ignore_phys(__pa(ptr));
                areas[group] = ptr;

                /* track the lowest area (base) and the group placed highest */
                base = min(ptr, base);
                if (ptr > areas[highest_group])
                        highest_group = group;
        }
        /* span from the lowest area to the end of the highest group */
        max_distance = areas[highest_group] - base;
        max_distance += ai->unit_size * ai->groups[highest_group].nr_units;

        /* warn if maximum distance is further than 75% of vmalloc space */
        if (max_distance > VMALLOC_TOTAL * 3 / 4) {
                pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n",
                                max_distance, VMALLOC_TOTAL);
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
                /* and fail if we have fallback */
                rc = -EINVAL;
                goto out_free_areas;
#endif
        }

        /*
         * Copy data and free unused parts.  This should happen after all
         * allocations are complete; otherwise, we may end up with
         * overlapping groups.
         */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        if (gi->cpu_map[i] == NR_CPUS) {
                                /* unused unit, free whole */
                                pcpu_fc_free(ptr, ai->unit_size);
                                continue;
                        }
                        /* copy and return the unused part */
                        memcpy(ptr, __per_cpu_start, ai->static_size);
                        pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
                }
        }

        /* base address is now known, determine group base offsets */
        for (group = 0; group < ai->nr_groups; group++) {
                ai->groups[group].base_offset = areas[group] - base;
        }

        pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
                PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
                ai->dyn_size, ai->unit_size);

        pcpu_setup_first_chunk(ai, base);
        /* success: rc is still 0; only ai and areas[] are released below */
        goto out_free;

out_free_areas:
        /*
         * Release every group area allocated so far.  Entries that were
         * never assigned are skipped; this presumably relies on
         * memblock_alloc() returning zeroed memory - confirm.
         */
        for (group = 0; group < ai->nr_groups; group++)
                if (areas[group])
                        pcpu_fc_free(areas[group],
                                ai->groups[group].nr_units * ai->unit_size);
out_free:
        pcpu_free_alloc_info(ai);
        if (areas)
                memblock_free(areas, areas_size);
        return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */

#ifdef BUILD_PAGE_FIRST_CHUNK
#include <asm/pgalloc.h>

/*
 * Fallback page-table sizes: any level whose *_TABLE_SIZE macro has not
 * been defined already defaults to PAGE_SIZE.  pcpu_populate_pte() passes
 * these as both the size and the alignment of the tables it allocates.
 */
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE PAGE_SIZE
#endif

#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE PAGE_SIZE
#endif
/*
 * Make sure the kernel page-table path for @addr is populated down to the
 * PTE table.  Walking from the pgd, every level found empty gets a table
 * from memblock_alloc_or_panic() (size and alignment both taken from the
 * matching *_TABLE_SIZE) which is then hooked into init_mm.
 *
 * Declared __weak, so another definition may be linked in its place.
 */
void __init __weak pcpu_populate_pte(unsigned long addr)
{
        pgd_t *pgd = pgd_offset_k(addr);
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        /* pgd -> p4d */
        if (pgd_none(*pgd)) {
                p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
                pgd_populate(&init_mm, pgd, p4d);
        }
        p4d = p4d_offset(pgd, addr);

        /* p4d -> pud */
        if (p4d_none(*p4d)) {
                pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
                p4d_populate(&init_mm, p4d, pud);
        }
        pud = pud_offset(p4d, addr);

        /* pud -> pmd */
        if (pud_none(*pud)) {
                pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
                pud_populate(&init_mm, pud, pmd);
        }
        pmd = pmd_offset(pud, addr);

        /* pmd -> pte table; nothing left to do if it is already there */
        if (pmd_present(*pmd))
                return;

        pte = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
        pmd_populate_kernel(&init_mm, pmd, pte);
}

/**
 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
 * @reserved_size: the size of reserved percpu area in bytes
 * @cpu_to_nd_fn: callback to convert cpu to its node, optional
 *
 * This is a helper to ease setting up page-remapped first percpu
 * chunk and can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
        /* static: handed to vm_area_register_early() and used after return */
        static struct vm_struct vm;
        struct pcpu_alloc_info *ai;
        char psize_str[16];
        int unit_pages;
        size_t pages_size;
        struct page **pages;
        int unit, i, j, rc = 0;
        int upa;
        int nr_g0_units;

        /* page size in KiB, only used in the log messages below */
        snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);

        ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL);
        if (IS_ERR(ai))
                return PTR_ERR(ai);
        /* built without a distance callback, so exactly one group is expected */
        BUG_ON(ai->nr_groups != 1);
        /* units per allocation; group 0 must hold the cpu count rounded up to it */
        upa = ai->alloc_size/ai->unit_size;
        nr_g0_units = roundup(num_possible_cpus(), upa);
        if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
                pcpu_free_alloc_info(ai);
                return -EINVAL;
        }

        unit_pages = ai->unit_size >> PAGE_SHIFT;

        /* unaligned allocations can't be freed, round up to page size */
        pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
                               sizeof(pages[0]));
        pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES);

        /*
         * allocate pages: pages[] is filled unit by unit, unit_pages
         * entries each, which is the layout the mapping loop below indexes
         */
        j = 0;
        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned int cpu = ai->groups[0].cpu_map[unit];
                for (i = 0; i < unit_pages; i++) {
                        void *ptr;

                        ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
                        if (!ptr) {
                                pr_warn("failed to allocate %s page for cpu%u\n",
                                                psize_str, cpu);
                                goto enomem;
                        }
                        /* kmemleak tracks the percpu allocations separately */
                        kmemleak_ignore_phys(__pa(ptr));
                        pages[j++] = virt_to_page(ptr);
                }
        }

        /* allocate vm area, map the pages and copy static data */
        vm.flags = VM_ALLOC;
        vm.size = num_possible_cpus() * ai->unit_size;
        vm_area_register_early(&vm, PAGE_SIZE);

        for (unit = 0; unit < num_possible_cpus(); unit++) {
                unsigned long unit_addr =
                        (unsigned long)vm.addr + unit * ai->unit_size;

                for (i = 0; i < unit_pages; i++)
                        pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));

                /* pte already populated, the following shouldn't fail */
                rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
                                      unit_pages);
                if (rc < 0)
                        panic("failed to map percpu area, err=%d\n", rc);

                flush_cache_vmap_early(unit_addr, unit_addr + ai->unit_size);

                /* copy static data */
                memcpy((void *)unit_addr, __per_cpu_start, ai->static_size);
        }

        /* we're ready, commit */
        pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
                unit_pages, psize_str, ai->static_size,
                ai->reserved_size, ai->dyn_size);

        pcpu_setup_first_chunk(ai, vm.addr);
        /* success: skip the page rollback, only free the bookkeeping */
        goto out_free_ar;

enomem:
        /* give back the j pages obtained so far, newest first */
        while (--j >= 0)
                pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
        rc = -ENOMEM;
out_free_ar:
        memblock_free(pages, pages_size);
        pcpu_free_alloc_info(ai);
        return rc;
}
#endif /* BUILD_PAGE_FIRST_CHUNK */

#ifndef        CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Generic SMP percpu area setup.
 *
 * The first chunk is set up with the embedding helper because that is the
 * closest match to the original non-dynamic generic percpu setup.  Many
 * archs have addressing restrictions and might fail if the percpu area
 * ended up far away from its previous location.  In non-NUMA cases
 * embedding is also generally good TLB-wise, since the percpu area can
 * piggy back on the physical linear mapping, which uses large page
 * mappings on applicable archs.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

void __init setup_per_cpu_areas(void)
{
        unsigned long base_delta;
        unsigned int cpu;

        /*
         * Always reserve area for module percpu variables.  That's
         * what the legacy allocator did.
         */
        if (pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                   PERCPU_DYNAMIC_RESERVE,
                                   PAGE_SIZE, NULL, NULL) < 0)
                panic("Failed to initialize percpu areas.");

        /* distance from the static percpu section to the chunk base */
        base_delta = (unsigned long)pcpu_base_addr -
                     (unsigned long)__per_cpu_start;

        for_each_possible_cpu(cpu) {
                __per_cpu_offset[cpu] = base_delta + pcpu_unit_offsets[cpu];
        }
}
#endif        /* CONFIG_HAVE_SETUP_PER_CPU_AREA */

#else        /* CONFIG_SMP */

/*
 * UP percpu area setup.
 *
 * UP always uses the km-based percpu allocator with identity mapping.
 * Static percpu variables are indistinguishable from ordinary static
 * variables and need no special preparation, so the first chunk set up
 * here consists of a single unit holding only the dynamic area.
 */
void __init setup_per_cpu_areas(void)
{
        const size_t unit_size =
                roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
                                         PERCPU_DYNAMIC_RESERVE));
        struct pcpu_alloc_info *ai = pcpu_alloc_alloc_info(1, 1);
        void *fc = memblock_alloc_from(unit_size, PAGE_SIZE,
                                       __pa(MAX_DMA_ADDRESS));

        if (!(ai && fc))
                panic("Failed to allocate memory for percpu areas.");
        /* kmemleak tracks the percpu allocations separately */
        kmemleak_ignore_phys(__pa(fc));

        /* one group with one unit, mapped to cpu 0 */
        ai->groups[0].cpu_map[0] = 0;
        ai->groups[0].nr_units = 1;

        /* the whole unit is dynamic area; atom and allocation size match it */
        ai->alloc_size = unit_size;
        ai->atom_size = unit_size;
        ai->unit_size = unit_size;
        ai->dyn_size = unit_size;

        pcpu_setup_first_chunk(ai, fc);
        pcpu_free_alloc_info(ai);
}

#endif        /* CONFIG_SMP */

/*
 * pcpu_nr_pages - calculate total number of populated backing pages
 *
 * This reflects the number of pages populated to back chunks.  Metadata is
 * excluded in the number exposed in meminfo as the number of backing pages
 * scales with the number of cpus and can quickly outweigh the memory used for
 * metadata.  It also keeps this calculation nice and simple.
 *
 * The value is the product of pcpu_nr_populated and pcpu_nr_units.
 *
 * RETURNS:
 * Total number of populated backing pages in use by the allocator.
 */
unsigned long pcpu_nr_pages(void)
{
        return pcpu_nr_populated * pcpu_nr_units;
}

/*
 * Percpu allocator is initialized early during boot when neither slab nor
 * workqueue is available.  Plug async management until everything is up
 * and running.
 *
 * Registered with subsys_initcall(); sets pcpu_async_enabled and always
 * returns 0.
 */
static int __init percpu_enable_async(void)
{
        pcpu_async_enabled = true;
        return 0;
}
subsys_initcall(percpu_enable_async);

























































































































  207 








  208 


  209 










































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Fixmap manipulation code
 */

#include <linux/bug.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/libfdt.h>
#include <linux/memory.h>
#include <linux/mm.h>
#include <linux/sizes.h>

#include <asm/fixmap.h>
#include <asm/kernel-pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

/* ensure that the fixmap region does not grow down into the PCI I/O region */
static_assert(FIXADDR_TOT_START > PCI_IO_END);

/*
 * Table counts for the range FIXADDR_TOT_START..FIXADDR_TOP at PMD and PUD
 * granularity, as computed by SPAN_NR_ENTRIES().
 */
#define NR_BM_PTE_TABLES \
        SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PMD_SHIFT)
#define NR_BM_PMD_TABLES \
        SPAN_NR_ENTRIES(FIXADDR_TOT_START, FIXADDR_TOP, PUD_SHIFT)

/*
 * Only a single bm_pmd table is declared below; this assert presumably
 * guarantees that the whole fixmap range fits in it.
 */
static_assert(NR_BM_PMD_TABLES == 1);

/* Index of @addr relative to FIXADDR_TOT_START, in units of 1 << @shift. */
#define __BM_TABLE_IDX(addr, shift) \
        (((addr) >> (shift)) - (FIXADDR_TOT_START >> (shift)))

/* Which bm_pte[] table covers @addr. */
#define BM_PTE_TABLE_IDX(addr)        __BM_TABLE_IDX(addr, PMD_SHIFT)

/* Statically allocated, page-aligned tables backing the fixmap region. */
static pte_t bm_pte[NR_BM_PTE_TABLES][PTRS_PER_PTE] __page_aligned_bss;
static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss __maybe_unused;
static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused;

/*
 * Return the bm_pte entry for fixmap virtual address @addr: pick the table
 * covering @addr, then the slot given by pte_index().
 */
static inline pte_t *fixmap_pte(unsigned long addr)
{
        pte_t *table = bm_pte[BM_PTE_TABLE_IDX(addr)];

        return table + pte_index(addr);
}

/*
 * Point the PMD entry @pmdp at the static bm_pte table covering @addr,
 * unless the entry is already populated.
 */
static void __init early_fixmap_init_pte(pmd_t *pmdp, unsigned long addr)
{
        pmd_t entry = READ_ONCE(*pmdp);
        pte_t *table;

        /* something is installed already: leave it alone */
        if (!pmd_none(entry))
                return;

        table = bm_pte[BM_PTE_TABLE_IDX(addr)];
        __pmd_populate(pmdp, __pa_symbol(table),
                       PMD_TYPE_TABLE | PMD_TABLE_AF);
}

/*
 * Install bm_pmd under @pudp if that entry is empty, then walk the PMD
 * entries covering [@addr, @end) and set up the PTE table for each.
 */
static void __init early_fixmap_init_pmd(pud_t *pudp, unsigned long addr,
                                         unsigned long end)
{
        pud_t entry = READ_ONCE(*pudp);
        pmd_t *pmdp;

        if (pud_none(entry))
                __pud_populate(pudp, __pa_symbol(bm_pmd),
                               PUD_TYPE_TABLE | PUD_TABLE_AF);

        pmdp = pmd_offset_kimg(pudp, addr);

        /* the first PMD entry is always handled, even if @addr == @end */
        for (;;) {
                unsigned long next = pmd_addr_end(addr, end);

                early_fixmap_init_pte(pmdp, addr);

                pmdp++;
                addr = next;
                if (addr == end)
                        break;
        }
}


/*
 * Install bm_pud under @p4dp if that entry is empty, then continue the
 * fixmap setup at PMD level for [@addr, @end).
 */
static void __init early_fixmap_init_pud(p4d_t *p4dp, unsigned long addr,
                                         unsigned long end)
{
        p4d_t entry = READ_ONCE(*p4dp);

        if (p4d_none(entry)) {
                __p4d_populate(p4dp, __pa_symbol(bm_pud),
                               P4D_TYPE_TABLE | P4D_TABLE_AF);
        } else if (CONFIG_PGTABLE_LEVELS > 3 &&
                   p4d_page_paddr(entry) != __pa_symbol(bm_pud)) {
                /*
                 * We only end up here if the kernel mapping and the fixmap
                 * share the top level pgd entry, which should only happen on
                 * 16k/4 levels configurations.
                 */
                BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
        }

        early_fixmap_init_pmd(pud_offset_kimg(p4dp, addr), addr, end);
}

/*
 * Set up the page-table path for the whole fixmap range,
 * FIXADDR_TOT_START up to FIXADDR_TOP, using the static bm_* tables.
 *
 * The p*d_populate functions call virt_to_phys implicitly so they can't be used
 * directly on kernel symbols (bm_p*d). This function is called too early to use
 * lm_alias so __p*d_populate functions must be used to populate with the
 * physical address from __pa_symbol.
 */
void __init early_fixmap_init(void)
{
        const unsigned long start = FIXADDR_TOT_START;
        pgd_t *pgdp = pgd_offset_k(start);
        p4d_t *p4dp = p4d_offset_kimg(pgdp, start);

        early_fixmap_init_pud(p4dp, start, FIXADDR_TOP);
}

/*
 * Install or remove the fixmap mapping for slot @idx.
 *
 * A non-zero @flags maps the page frame of @phys with those protections.
 * A zero @flags clears the entry and flushes the kernel TLB for that page.
 * @idx must lie strictly between FIX_HOLE and __end_of_fixed_addresses.
 *
 * Unusually, this is also called in IRQ context (ghes_iounmap_irq) so if we
 * ever need to use IPIs for TLB broadcasting, then we're in trouble here.
 */
void __set_fixmap(enum fixed_addresses idx,
                  phys_addr_t phys, pgprot_t flags)
{
        unsigned long vaddr;
        pte_t *entry;

        BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses);

        vaddr = __fix_to_virt(idx);
        entry = fixmap_pte(vaddr);

        /* empty protection bits mean "unmap" */
        if (!pgprot_val(flags)) {
                __pte_clear(&init_mm, vaddr, entry);
                flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE);
                return;
        }

        __set_pte(entry, pfn_pte(phys >> PAGE_SHIFT, flags));
}

/*
 * Map the flattened device tree located at physical address @dt_phys into
 * the FIX_FDT fixmap slot with protection @prot.
 *
 * @dt_phys: physical address of the FDT; must be non-zero and aligned to
 *           MIN_FDT_ALIGN
 * @size:    out parameter, receives the total size read from the FDT header
 *           (written once the magic has been validated)
 * @prot:    protection bits for the mapping
 *
 * Returns the virtual address of the FDT, or NULL if @dt_phys is unset or
 * misaligned, the magic does not match, or the size exceeds MAX_FDT_SIZE.
 */
void *__init fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot)
{
        const u64 dt_virt_base = __fix_to_virt(FIX_FDT);
        phys_addr_t dt_phys_base;
        unsigned int totalsize;
        int offset;
        void *dt_virt;

        /*
         * Check whether the physical FDT address is set and meets the minimum
         * alignment requirement. Since we are relying on MIN_FDT_ALIGN to be
         * at least 8 bytes so that we can always access the magic and size
         * fields of the FDT header after mapping the first chunk, double check
         * here if that is indeed the case.
         */
        BUILD_BUG_ON(MIN_FDT_ALIGN < 8);
        if (!dt_phys || dt_phys % MIN_FDT_ALIGN)
                return NULL;

        dt_phys_base = round_down(dt_phys, PAGE_SIZE);
        offset = dt_phys % PAGE_SIZE;
        dt_virt = (void *)dt_virt_base + offset;

        /* map the first chunk so we can read the size from the header */
        create_mapping_noalloc(dt_phys_base, dt_virt_base, PAGE_SIZE, prot);

        if (fdt_magic(dt_virt) != FDT_MAGIC)
                return NULL;

        /*
         * Bound-check the header's size as an unsigned value.  Checking it
         * only after storing it in the int behind @size would let a value
         * that does not fit in int turn negative, slip past the
         * MAX_FDT_SIZE test and then become a huge length in the unsigned
         * comparison and mapping request below.
         */
        totalsize = fdt_totalsize(dt_virt);
        *size = totalsize;
        if (totalsize > MAX_FDT_SIZE)
                return NULL;

        /* the FDT extends beyond the first page: map all of it */
        if (offset + *size > PAGE_SIZE) {
                create_mapping_noalloc(dt_phys_base, dt_virt_base,
                                       offset + *size, prot);
        }

        return dt_virt;
}


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    4 










    4 






















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
// SPDX-License-Identifier: GPL-2.0
/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
#include <linux/part_stat.h>
#include <linux/sched/isolation.h>

#include <trace/events/block.h>

#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"

/*
 * File-local per-CPU state.
 * NOTE(review): the users of these objects are outside the part of the file
 * reviewed here. Going by the names and types, blk_cpu_done looks like a
 * per-CPU lockless list of completed requests and blk_cpu_csd the
 * call_single_data_t used to reach another CPU -- confirm against the
 * completion path.
 */
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
/* NOTE(review): presumably serialises CPU-hotplug bookkeeping -- confirm. */
static DEFINE_MUTEX(blk_mq_cpuhp_lock);

/*
 * Forward declarations of static helpers; their definitions are not in the
 * part of the file shown here.
 */
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
static void blk_mq_request_bypass_insert(struct request *rq,
                blk_insert_t flags);
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list);
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                         struct io_comp_batch *iob, unsigned int flags);

/*
 * Check if any of the ctx, dispatch list or elevator
 * have pending work in this hardware queue.
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
        return !list_empty_careful(&hctx->dispatch) ||
                sbitmap_any_bit_set(&hctx->ctx_map) ||
                        blk_mq_sched_has_work(hctx);
}

/*
 * Flag @ctx as having pending work in @hctx by setting the ctx's bit in
 * hctx->ctx_map. The bit index is the ctx's slot for this hctx type.
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
                                     struct blk_mq_ctx *ctx)
{
        const int idx = ctx->index_hw[hctx->type];

        /* already marked: skip the write (presumably to avoid a redundant store) */
        if (sbitmap_test_bit(&hctx->ctx_map, idx))
                return;

        sbitmap_set_bit(&hctx->ctx_map, idx);
}

/*
 * Clear the bit in hctx->ctx_map that marks @ctx as having pending work
 * in @hctx.
 */
static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
                                      struct blk_mq_ctx *ctx)
{
        sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw[hctx->type]);
}

/*
 * Accumulator passed (as the opaque priv pointer) to blk_mq_check_inflight()
 * while the busy tags of a queue are iterated.
 */
struct mq_inflight {
        /* device to count for; a partition is matched against rq->part */
        struct block_device *part;
        /* in-flight request counts, indexed by rq_data_dir(rq) */
        unsigned int inflight[2];
};

/*
 * Busy-tag iterator callback: count @rq in the struct mq_inflight passed
 * via @priv if it is accounted for I/O statistics, belongs to the device
 * being counted and is currently in flight.
 *
 * Always returns true (presumably telling the iterator to keep going --
 * confirm against blk_mq_queue_tag_busy_iter()).
 */
static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
        struct mq_inflight *acc = priv;

        /* only requests that take part in I/O accounting are counted */
        if (!(rq->rq_flags & RQF_IO_STAT))
                return true;

        /* when counting for a partition, skip requests of other partitions */
        if (bdev_is_partition(acc->part) && rq->part != acc->part)
                return true;

        if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
                return true;

        acc->inflight[rq_data_dir(rq)]++;
        return true;
}

unsigned int blk_mq_in_flight(struct request_queue *q,
                struct block_device *part)
{
        struct mq_inflight mi = { .part = part };

        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);

        return mi.inflight[0] + mi.inflight[1];
}

void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
                unsigned int inflight[2])
{
        struct mq_inflight mi = { .part = part };

        blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
        inflight[0] = mi.inflight[0];
        inflight[1] = mi.inflight[1];
}

#ifdef CONFIG_LOCKDEP
/*
 * Record @owner as the task that owns the freeze of @q (CONFIG_LOCKDEP
 * builds only).
 *
 * Returns true only when @owner is non-NULL and the queue was not frozen
 * yet (mq_freeze_depth == 0), i.e. when ownership was newly recorded.
 * For a nested freeze by the recorded owner only the owner depth is
 * bumped and false is returned.
 */
static bool blk_freeze_set_owner(struct request_queue *q,
                                 struct task_struct *owner)
{
        /* non-owner freezes are never tracked */
        if (!owner)
                return false;

        if (q->mq_freeze_depth) {
                /* already frozen: just account a nested freeze by the owner */
                if (q->mq_freeze_owner == owner)
                        q->mq_freeze_owner_depth += 1;
                return false;
        }

        /* outermost freeze: remember the owner and snapshot queue state */
        q->mq_freeze_owner = owner;
        q->mq_freeze_owner_depth = 1;
        q->mq_freeze_disk_dead = !q->disk ||
                test_bit(GD_DEAD, &q->disk->state) ||
                !blk_queue_registered(q);
        q->mq_freeze_queue_dying = blk_queue_dying(q);
        return true;
}

/*
 * Account one unfreeze against the recorded freeze owner (CONFIG_LOCKDEP
 * builds only).
 *
 * Returns true only when the caller is the recorded owner and this was its
 * last outstanding freeze; the owner is then cleared. Calls from any other
 * task leave the bookkeeping untouched and return false.
 */
static bool blk_unfreeze_check_owner(struct request_queue *q)
{
        if (q->mq_freeze_owner != current)
                return false;

        q->mq_freeze_owner_depth--;
        if (q->mq_freeze_owner_depth != 0)
                return false;

        q->mq_freeze_owner = NULL;
        return true;
}

#else

/*
 * !CONFIG_LOCKDEP stub: no freeze owner is tracked, so ownership is never
 * reported as newly recorded.
 */
static bool blk_freeze_set_owner(struct request_queue *q,
                                 struct task_struct *owner)
{
        return false;
}

/*
 * !CONFIG_LOCKDEP stub: with no owner tracking there is never a "last
 * unfreeze by the owner" to report.
 */
static bool blk_unfreeze_check_owner(struct request_queue *q)
{
        return false;
}
#endif

/*
 * Begin freezing @q on behalf of @owner (NULL for a non-owner freeze).
 *
 * The freeze depth is incremented under mq_freeze_lock. On the transition
 * from 0 to 1 the queue usage counter is killed while the lock is still
 * held; after the lock is dropped the hardware queues of an mq queue are
 * run.
 *
 * Returns the result of blk_freeze_set_owner(), i.e. whether freeze
 * ownership was newly recorded for @owner.
 */
bool __blk_freeze_queue_start(struct request_queue *q,
                              struct task_struct *owner)
{
        bool owner_recorded;
        bool outermost;

        mutex_lock(&q->mq_freeze_lock);
        owner_recorded = blk_freeze_set_owner(q, owner);
        outermost = (++q->mq_freeze_depth == 1);
        if (outermost)
                percpu_ref_kill(&q->q_usage_counter);
        mutex_unlock(&q->mq_freeze_lock);

        /* run the queues only after the lock has been released */
        if (outermost && queue_is_mq(q))
                blk_mq_run_hw_queues(q, false);

        return owner_recorded;
}

void blk_freeze_queue_start(struct request_queue *q)
{
        if (__blk_freeze_queue_start(q, current))
                blk_freeze_acquire_lock(q);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start);

/*
 * Sleep on q->mq_freeze_wq until percpu_ref_is_zero() reports that
 * q->q_usage_counter has dropped to zero. There is no timeout; see
 * blk_mq_freeze_queue_wait_timeout() for the bounded variant.
 */
void blk_mq_freeze_queue_wait(struct request_queue *q)
{
        wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);

/*
 * Bounded variant of blk_mq_freeze_queue_wait(): sleep on q->mq_freeze_wq
 * until q->q_usage_counter is zero or @timeout expires.
 *
 * @timeout is handed to wait_event_timeout() unchanged (presumably in
 * jiffies -- confirm against callers). The return value is whatever
 * wait_event_timeout() yields, converted to int.
 */
int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
                                     unsigned long timeout)
{
        return wait_event_timeout(q->mq_freeze_wq,
                                        percpu_ref_is_zero(&q->q_usage_counter),
                                        timeout);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);

/*
 * Freeze @q synchronously: start the freeze with the current task as owner
 * (blk_freeze_queue_start()) and then block until the queue usage counter
 * has reached zero (blk_mq_freeze_queue_wait()).
 */
void blk_mq_freeze_queue_nomemsave(struct request_queue *q)
{
        blk_freeze_queue_start(q);
        blk_mq_freeze_queue_wait(q);
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_nomemsave);

/*
 * Drop one freeze reference on @q.
 *
 * @force_atomic: when true, set the force_atomic flag in the usage
 *                counter's data before the counter may be resurrected.
 *
 * When the freeze depth reaches zero the usage counter is resurrected and
 * every waiter on mq_freeze_wq is woken. A depth that goes negative
 * triggers a one-time warning. All of this happens under mq_freeze_lock.
 *
 * Returns the result of blk_unfreeze_check_owner(), i.e. whether this was
 * the last unfreeze by the recorded freeze owner.
 */
bool __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
        bool owner_done;

        mutex_lock(&q->mq_freeze_lock);

        if (force_atomic)
                q->q_usage_counter.data->force_atomic = true;

        q->mq_freeze_depth--;
        WARN_ON_ONCE(q->mq_freeze_depth < 0);

        if (q->mq_freeze_depth == 0) {
                /* fully unfrozen: revive the counter, then wake the waiters */
                percpu_ref_resurrect(&q->q_usage_counter);
                wake_up_all(&q->mq_freeze_wq);
        }

        owner_done = blk_unfreeze_check_owner(q);

        mutex_unlock(&q->mq_freeze_lock);

        return owner_done;
}

void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q)
{
        if (__blk_mq_unfreeze_queue(q, false))
                blk_unfreeze_release_lock(q);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_nomemrestore);

/*
 * non_owner variant of blk_freeze_queue_start
 *
 * Unlike blk_freeze_queue_start, the queue doesn't need to be unfrozen
 * by the same task.  This is fragile and should not be used if at all
 * possible.
 *
 * A NULL owner is passed, so no freeze owner is recorded, and the return
 * value of __blk_freeze_queue_start() is ignored: blk_freeze_acquire_lock()
 * is never called on this path.
 */
void blk_freeze_queue_start_non_owner(struct request_queue *q)
{
        __blk_freeze_queue_start(q, NULL);
}
EXPORT_SYMBOL_GPL(blk_freeze_queue_start_non_owner);

/*
 * non_owner variant of blk_mq_unfreeze_queue
 *
 * Drops one freeze reference through __blk_mq_unfreeze_queue() without
 * forcing atomic mode. The return value is ignored, so
 * blk_unfreeze_release_lock() is never called on this path.
 */
void blk_mq_unfreeze_queue_non_owner(struct request_queue *q)
{
        __blk_mq_unfreeze_queue(q, false);
}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue_non_owner);

/*
 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
 * mpt3sas driver such that this function can be removed.
 */
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
        unsigned long flags;

        spin_lock_irqsave(&q->queue_lock, flags);
        if (!q->quiesce_depth++)
                blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
        spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

/**
 * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
 * @set: tag_set to wait on
 *
 * Tag sets flagged BLK_MQ_F_BLOCKING are waited on with synchronize_srcu()
 * on the set's srcu; all others with synchronize_rcu().
 *
 * Note: it is the driver's responsibility to make sure that quiesce has
 * been started on one or more of the request_queues of the tag_set.  This
 * function only waits for the quiesce on those request_queues that had
 * the quiesce flag set using blk_mq_quiesce_queue_nowait.
 */
void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
        if (set->flags & BLK_MQ_F_BLOCKING) {
                synchronize_srcu(set->srcu);
                return;
        }

        synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

/**
 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
 * @q: request queue.
 *
 * Marks the queue quiesced and, for mq queues, waits for the quiesce to
 * take effect on the queue's tag set.
 *
 * Note: this function does not prevent the struct request end_io()
 * callback function from being invoked. Once this function has returned,
 * no dispatch can happen until the queue is unquiesced via
 * blk_mq_unquiesce_queue().
 */
void blk_mq_quiesce_queue(struct request_queue *q)
{
        blk_mq_quiesce_queue_nowait(q);

        /* non-mq queues have nothing to wait for */
        if (!queue_is_mq(q))
                return;

        blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

/**
 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
 * @q: request queue.
 *
 * Drops one quiesce reference under q->queue_lock. When the depth reaches
 * zero, QUEUE_FLAG_QUIESCED is cleared and, after the lock is released,
 * the hardware queues are run asynchronously. An unbalanced call (depth
 * already <= 0) triggers a one-time warning and changes nothing.
 */
void blk_mq_unquiesce_queue(struct request_queue *q)
{
        unsigned long irq_flags;
        bool last_unquiesce = false;

        spin_lock_irqsave(&q->queue_lock, irq_flags);
        if (!WARN_ON_ONCE(q->quiesce_depth <= 0)) {
                q->quiesce_depth--;
                if (q->quiesce_depth == 0) {
                        blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
                        last_unquiesce = true;
                }
        }
        spin_unlock_irqrestore(&q->queue_lock, irq_flags);

        if (!last_unquiesce)
                return;

        /* dispatch requests which are inserted during quiescing */
        blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);

/*
 * Quiesce every request queue that shares @set, except those for which
 * blk_queue_skip_tagset_quiesce() is true. The queues are marked under
 * set->tag_list_lock; a single wait for the whole tag set follows once the
 * lock has been dropped.
 */
void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
{
        struct request_queue *queue;

        mutex_lock(&set->tag_list_lock);
        list_for_each_entry(queue, &set->tag_list, tag_set_list) {
                if (blk_queue_skip_tagset_quiesce(queue))
                        continue;
                blk_mq_quiesce_queue_nowait(queue);
        }
        mutex_unlock(&set->tag_list_lock);

        blk_mq_wait_quiesce_done(set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);

/*
 * Counterpart of blk_mq_quiesce_tagset(): unquiesce every request queue
 * that shares @set, skipping those for which
 * blk_queue_skip_tagset_quiesce() is true. Runs under set->tag_list_lock.
 */
void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
{
        struct request_queue *queue;

        mutex_lock(&set->tag_list_lock);
        list_for_each_entry(queue, &set->tag_list, tag_set_list) {
                if (blk_queue_skip_tagset_quiesce(queue))
                        continue;
                blk_mq_unquiesce_queue(queue);
        }
        mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);

void blk_mq_wake_waiters(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i)
                if (blk_mq_hw_queue_mapped(hctx))
                        blk_mq_tag_wakeup_all(hctx->tags, true);
}

/*
 * Initialise @rq from scratch for use on @q.
 *
 * The whole request is zeroed first; afterwards only the fields whose
 * default is not all-zero are filled in: the owning queue, "no tag"
 * markers for both tags, an invalid sector, the start timestamp, the
 * list/hash/rb-tree linkage and the crypto defaults.
 */
void blk_rq_init(struct request_queue *q, struct request *rq)
{
        memset(rq, 0, sizeof(*rq));

        /* scalar fields */
        rq->q = q;
        rq->tag = BLK_MQ_NO_TAG;
        rq->internal_tag = BLK_MQ_NO_TAG;
        rq->__sector = (sector_t) -1;
        rq->start_time_ns = blk_time_get_ns();

        /* linkage: not on any list, hash or rb-tree yet */
        INIT_LIST_HEAD(&rq->queuelist);
        INIT_HLIST_NODE(&rq->hash);
        RB_CLEAR_NODE(&rq->rb_node);

        blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);

/*
 * Record the allocation timestamp once the allocated request is actually
 * used: rq->alloc_time_ns becomes @alloc_time_ns when the queue has
 * allocation-time tracking enabled, and 0 otherwise. Compiles to nothing
 * without CONFIG_BLK_RQ_ALLOC_TIME.
 */
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
#ifdef CONFIG_BLK_RQ_ALLOC_TIME
        rq->alloc_time_ns = blk_queue_rq_alloc_time(rq->q) ? alloc_time_ns : 0;
#endif
}

/*
 * Prepare the preallocated request that backs @tag for a new allocation.
 *
 * @data: allocation context supplying the queue, the software and hardware
 *        contexts, the command flags and the request flags.  RQF_PM is
 *        added to data->rq_flags here when BLK_MQ_REQ_PM was requested.
 * @tags: tag map; the request is taken from tags->static_rqs[@tag].
 * @tag:  the tag that was just allocated.
 *
 * Returns the initialised request with its reference count set to 1.
 */
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                struct blk_mq_tags *tags, unsigned int tag)
{
        struct blk_mq_ctx *ctx = data->ctx;
        struct blk_mq_hw_ctx *hctx = data->hctx;
        struct request_queue *q = data->q;
        struct request *rq = tags->static_rqs[tag];

        rq->q = q;
        rq->mq_ctx = ctx;
        rq->mq_hctx = hctx;
        rq->cmd_flags = data->cmd_flags;

        /* Fold the PM allocation flag into the request flags. */
        if (data->flags & BLK_MQ_REQ_PM)
                data->rq_flags |= RQF_PM;
        rq->rq_flags = data->rq_flags;

        /*
         * The allocated tag is stored either as the scheduler (internal)
         * tag or as the driver tag; the other one is marked as unset.
         */
        if (data->rq_flags & RQF_SCHED_TAGS) {
                rq->tag = BLK_MQ_NO_TAG;
                rq->internal_tag = tag;
        } else {
                rq->tag = tag;
                rq->internal_tag = BLK_MQ_NO_TAG;
        }
        rq->timeout = 0;

        /* Clear per-I/O state that may be left over in this request. */
        rq->part = NULL;
        rq->io_start_time_ns = 0;
        rq->stats_sectors = 0;
        rq->nr_phys_segments = 0;
        rq->nr_integrity_segments = 0;
        rq->end_io = NULL;
        rq->end_io_data = NULL;

        blk_crypto_rq_set_defaults(rq);
        INIT_LIST_HEAD(&rq->queuelist);
        /* tag was already set */
        WRITE_ONCE(rq->deadline, 0);
        req_ref_set(rq, 1);

        /* Scheduler-managed requests also get their elevator state set up. */
        if (rq->rq_flags & RQF_USE_SCHED) {
                struct elevator_queue *e = data->q->elevator;

                INIT_HLIST_NODE(&rq->hash);
                RB_CLEAR_NODE(&rq->rb_node);

                /* ->prepare_request is an optional elevator hook. */
                if (e->type->ops.prepare_request)
                        e->type->ops.prepare_request(rq);
        }

        return rq;
}

/*
 * Try to allocate up to data->nr_tags requests in one go.
 *
 * Every request obtained is initialised and pushed onto the head of
 * data->cached_rqs; one request is then popped from that list and returned.
 * data->nr_tags is reduced by the number of requests actually obtained.
 *
 * Returns NULL when blk_mq_get_tags() hands back an empty tag mask.
 */
static inline struct request *
__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
{
        unsigned int tag, tag_offset;
        struct blk_mq_tags *tags;
        struct request *rq;
        unsigned long tag_mask;
        int i, nr = 0;

        tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
        if (unlikely(!tag_mask))
                return NULL;

        tags = blk_mq_tags_from_data(data);
        /* Bit i set in tag_mask stands for tag number tag_offset + i. */
        for (i = 0; tag_mask; i++) {
                if (!(tag_mask & (1UL << i)))
                        continue;
                tag = tag_offset + i;
                prefetch(tags->static_rqs[tag]);
                tag_mask &= ~(1UL << i);
                rq = blk_mq_rq_ctx_init(data, tags, tag);
                rq_list_add_head(data->cached_rqs, rq);
                nr++;
        }
        /* Active-request accounting is skipped for scheduler tags. */
        if (!(data->rq_flags & RQF_SCHED_TAGS))
                blk_mq_add_active_requests(data->hctx, nr);
        /* caller already holds a reference, add for remainder */
        percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
        data->nr_tags -= nr;

        return rq_list_pop(data->cached_rqs);
}

/*
 * Allocate a request for the queue described by @data.
 *
 * When data->nr_tags is larger than one a batched allocation is attempted
 * first (extra requests end up on data->cached_rqs); if that yields nothing
 * the function falls back to allocating a single tag.
 *
 * Returns the request, or NULL when no tag could be obtained and the
 * allocation must not wait (BLK_MQ_REQ_NOWAIT, which is also implied by
 * REQ_NOWAIT in data->cmd_flags).
 */
static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
        struct request_queue *q = data->q;
        u64 alloc_time_ns = 0;
        struct request *rq;
        unsigned int tag;

        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = blk_time_get_ns();

        if (data->cmd_flags & REQ_NOWAIT)
                data->flags |= BLK_MQ_REQ_NOWAIT;

retry:
        /* (Re)select the software and hardware context on every attempt. */
        data->ctx = blk_mq_get_ctx(q);
        data->hctx = blk_mq_map_queue(data->cmd_flags, data->ctx);

        if (q->elevator) {
                /*
                 * All requests use scheduler tags when an I/O scheduler is
                 * enabled for the queue.
                 */
                data->rq_flags |= RQF_SCHED_TAGS;

                /*
                 * Flush/passthrough requests are special and go directly to the
                 * dispatch list.
                 */
                if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
                    !blk_op_is_passthrough(data->cmd_flags)) {
                        struct elevator_mq_ops *ops = &q->elevator->type->ops;

                        WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);

                        data->rq_flags |= RQF_USE_SCHED;
                        /* ->limit_depth is an optional elevator hook. */
                        if (ops->limit_depth)
                                ops->limit_depth(data->cmd_flags, data);
                }
        } else {
                blk_mq_tag_busy(data->hctx);
        }

        if (data->flags & BLK_MQ_REQ_RESERVED)
                data->rq_flags |= RQF_RESV;

        /*
         * Try batched alloc if we want more than 1 tag.
         */
        if (data->nr_tags > 1) {
                rq = __blk_mq_alloc_requests_batch(data);
                if (rq) {
                        blk_mq_rq_time_init(rq, alloc_time_ns);
                        return rq;
                }
                /* Batch failed: fall back to a single-tag allocation. */
                data->nr_tags = 1;
        }

        /*
         * Waiting allocations only fail because of an inactive hctx.  In that
         * case just retry the hctx assignment and tag allocation as CPU hotplug
         * should have migrated us to an online CPU by now.
         */
        tag = blk_mq_get_tag(data);
        if (tag == BLK_MQ_NO_TAG) {
                if (data->flags & BLK_MQ_REQ_NOWAIT)
                        return NULL;
                /*
                 * Give up the CPU and sleep for a short time to ensure
                 * that threads using a realtime scheduling class are
                 * migrated off the CPU, and thus off the hctx that is
                 * going away.
                 */
                msleep(3);
                goto retry;
        }

        /* Active-request accounting is skipped for scheduler tags. */
        if (!(data->rq_flags & RQF_SCHED_TAGS))
                blk_mq_inc_active_requests(data->hctx);
        rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        return rq;
}

/*
 * Allocate requests for @plug's request cache and return one of them.
 *
 * Up to plug->nr_ios requests are asked for (the value is captured in the
 * initializer below, before plug->nr_ios is reset to 1); surplus requests
 * are stored on plug->cached_rqs.
 *
 * Returns NULL when the queue cannot be entered or when no request could
 * be allocated.  On allocation failure the queue reference taken by
 * blk_queue_enter() is dropped again.
 */
static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
                                            struct blk_plug *plug,
                                            blk_opf_t opf,
                                            blk_mq_req_flags_t flags)
{
        struct blk_mq_alloc_data data = {
                .q                = q,
                .flags                = flags,
                .cmd_flags        = opf,
                .nr_tags        = plug->nr_ios,
                .cached_rqs        = &plug->cached_rqs,
        };
        struct request *rq;

        if (blk_queue_enter(q, flags))
                return NULL;

        /* The batch size has been consumed; later fills ask for one. */
        plug->nr_ios = 1;

        rq = __blk_mq_alloc_requests(&data);
        if (unlikely(!rq))
                blk_queue_exit(q);
        return rq;
}

/*
 * Try to satisfy an allocation from the current task's plug cache.
 *
 * If the cache holds requests, the first one is used provided it belongs
 * to @q, maps to the hardware context type required by @opf, and agrees
 * with @opf on whether the operation is a flush.  If the cache is empty it
 * is refilled, unless the plug only asks for a single I/O.
 *
 * Returns a request with cmd_flags set to @opf, or NULL when the cache
 * cannot be used (no plug, mismatch, or refill failure).
 */
static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
                                                   blk_opf_t opf,
                                                   blk_mq_req_flags_t flags)
{
        struct blk_plug *plug = current->plug;
        struct request *rq;

        if (!plug)
                return NULL;

        if (!rq_list_empty(&plug->cached_rqs)) {
                /* Only the request at the front of the cache is considered. */
                rq = rq_list_peek(&plug->cached_rqs);
                if (!rq || rq->q != q ||
                    blk_mq_get_hctx_type(opf) != rq->mq_hctx->type ||
                    op_is_flush(rq->cmd_flags) != op_is_flush(opf))
                        return NULL;

                rq_list_pop(&plug->cached_rqs);
                blk_mq_rq_time_init(rq, blk_time_get_ns());
        } else {
                /* Nothing cached: refill only when batching was requested. */
                if (plug->nr_ios == 1)
                        return NULL;

                rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
                if (!rq)
                        return NULL;
        }

        rq->cmd_flags = opf;
        INIT_LIST_HEAD(&rq->queuelist);
        return rq;
}

/*
 * Allocate a request for @q with operation/flags @opf.
 *
 * The plug cache of the current task is tried first; otherwise the queue
 * is entered and a single request is allocated.
 *
 * Returns the request with no bio attached, zero data length and an
 * invalid sector, or an ERR_PTR(): the error from blk_queue_enter(), or
 * -EWOULDBLOCK when no request could be allocated.
 */
struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
                blk_mq_req_flags_t flags)
{
        struct request *rq = blk_mq_alloc_cached_request(q, opf, flags);

        if (!rq) {
                struct blk_mq_alloc_data data = {
                        .q                = q,
                        .flags                = flags,
                        .cmd_flags        = opf,
                        .nr_tags        = 1,
                };
                int err = blk_queue_enter(q, flags);

                if (err)
                        return ERR_PTR(err);

                rq = __blk_mq_alloc_requests(&data);
                if (!rq) {
                        /* Drop the queue reference taken just above. */
                        blk_queue_exit(q);
                        return ERR_PTR(-EWOULDBLOCK);
                }
        }

        rq->bio = rq->biotail = NULL;
        rq->__sector = (sector_t) -1;
        rq->__data_len = 0;
        return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_request);

/*
 * Allocate a request tied to the hardware context with index @hctx_idx.
 *
 * @flags must contain both BLK_MQ_REQ_NOWAIT and BLK_MQ_REQ_RESERVED.
 *
 * Returns the request (no bio attached, zero data length, invalid sector)
 * or an ERR_PTR():
 *   -EINVAL       a required flag is missing (also triggers a one-time WARN)
 *   -EIO          @hctx_idx is not below q->nr_hw_queues
 *   -EXDEV        the hardware context is unmapped or has no online CPU
 *   -EWOULDBLOCK  no tag is available
 *   or the error returned by blk_queue_enter().
 */
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
        blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
        struct blk_mq_alloc_data data = {
                .q                = q,
                .flags                = flags,
                .cmd_flags        = opf,
                .nr_tags        = 1,
        };
        u64 alloc_time_ns = 0;
        struct request *rq;
        unsigned int cpu;
        unsigned int tag;
        int ret;

        /* alloc_time includes depth and tag waits */
        if (blk_queue_rq_alloc_time(q))
                alloc_time_ns = blk_time_get_ns();

        /*
         * If the tag allocator sleeps we could get an allocation for a
         * different hardware context.  No need to complicate the low level
         * allocator for this for the rare use case of a command tied to
         * a specific queue.
         */
        if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
            WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
                return ERR_PTR(-EINVAL);

        if (hctx_idx >= q->nr_hw_queues)
                return ERR_PTR(-EIO);

        ret = blk_queue_enter(q, flags);
        if (ret)
                return ERR_PTR(ret);

        /*
         * Check if the hardware context is actually mapped to anything.
         * If not tell the caller that it should skip this queue.
         */
        ret = -EXDEV;
        data.hctx = xa_load(&q->hctx_table, hctx_idx);
        if (!blk_mq_hw_queue_mapped(data.hctx))
                goto out_queue_exit;
        /* Use the first online CPU in the hctx's mask for the sw context. */
        cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
        if (cpu >= nr_cpu_ids)
                goto out_queue_exit;
        data.ctx = __blk_mq_get_ctx(q, cpu);

        if (q->elevator)
                data.rq_flags |= RQF_SCHED_TAGS;
        else
                blk_mq_tag_busy(data.hctx);

        /* Always true here: BLK_MQ_REQ_RESERVED was verified above. */
        if (flags & BLK_MQ_REQ_RESERVED)
                data.rq_flags |= RQF_RESV;

        ret = -EWOULDBLOCK;
        tag = blk_mq_get_tag(&data);
        if (tag == BLK_MQ_NO_TAG)
                goto out_queue_exit;
        /* Active-request accounting is skipped for scheduler tags. */
        if (!(data.rq_flags & RQF_SCHED_TAGS))
                blk_mq_inc_active_requests(data.hctx);
        rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
        blk_mq_rq_time_init(rq, alloc_time_ns);
        rq->__data_len = 0;
        rq->__sector = (sector_t) -1;
        rq->bio = rq->biotail = NULL;
        return rq;

out_queue_exit:
        /* Drop the reference taken by blk_queue_enter() above. */
        blk_queue_exit(q);
        return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

static void blk_mq_finish_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        blk_zone_finish_request(rq);

        if (rq->rq_flags & RQF_USE_SCHED) {
                q->elevator->type->ops.finish_request(rq);
                /*
                 * For postflush request that may need to be
                 * completed twice, we should clear this flag
                 * to avoid double finish_request() on the rq.
                 */
                rq->rq_flags &= ~RQF_USE_SCHED;
        }
}

/*
 * Release the resources held by @rq: crypto state, its driver tag and/or
 * scheduler tag, and finally a queue usage reference via blk_queue_exit().
 *
 * The queue, software context, hardware context and scheduler tag are
 * read into locals up front, and rq->mq_hctx is cleared before the tags
 * are put; the rest of the function does not read them from @rq again.
 */
static void __blk_mq_free_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        const int sched_tag = rq->internal_tag;

        blk_crypto_free_request(rq);
        blk_pm_mark_last_busy(rq);
        rq->mq_hctx = NULL;

        /* A driver tag also counts as an active request on the hctx. */
        if (rq->tag != BLK_MQ_NO_TAG) {
                blk_mq_dec_active_requests(hctx);
                blk_mq_put_tag(hctx->tags, ctx, rq->tag);
        }
        if (sched_tag != BLK_MQ_NO_TAG)
                blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
        blk_mq_sched_restart(hctx);
        blk_queue_exit(q);
}

/*
 * Drop a reference on @rq and free it when that was the last one.
 *
 * Before the reference is dropped the request is run through the finish
 * hooks, laptop-mode completion accounting (non-passthrough requests
 * only), rq_qos_done(), and its state is set to MQ_RQ_IDLE.
 */
void blk_mq_free_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        blk_mq_finish_request(rq);

        if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
                laptop_io_completion(q->disk->bdi);

        rq_qos_done(q, rq);

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        /* Only the holder of the final reference releases the tags. */
        if (req_ref_put_and_test(rq))
                __blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);

/*
 * Free every request still sitting in @plug's request cache.  Requests
 * are popped one at a time until the cache is empty.
 */
void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
        struct request *cached;

        for (;;) {
                cached = rq_list_pop(&plug->cached_rqs);
                if (cached == NULL)
                        break;
                blk_mq_free_request(cached);
        }
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
        printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
                rq->q->disk ? rq->q->disk->disk_name : "?",
                (__force unsigned long long) rq->cmd_flags);

        printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
               (unsigned long long)blk_rq_pos(rq),
               blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
        printk(KERN_INFO "  bio %p, biotail %p, len %u\n",
               rq->bio, rq->biotail, blk_rq_bytes(rq));
}
EXPORT_SYMBOL(blk_dump_rq_flags);

/*
 * Add @bytes (converted to 512-byte sectors) to the sector statistics of
 * the partition attached to @req.  Does nothing unless the request is
 * marked RQF_IO_STAT.
 */
static void blk_account_io_completion(struct request *req, unsigned int bytes)
{
        int grp;

        if (!(req->rq_flags & RQF_IO_STAT))
                return;

        grp = op_stat_group(req_op(req));

        part_stat_lock();
        part_stat_add(req->part, sectors[grp], bytes >> 9);
        part_stat_unlock();
}

/*
 * Emit a rate-limited KERN_ERR line describing a failed request: status
 * string, device name ("?" when the queue has no disk), sector, operation
 * code and name, the command flags without the operation bits, the number
 * of physical segments and the I/O priority class.
 */
static void blk_print_req_error(struct request *req, blk_status_t status)
{
        printk_ratelimited(KERN_ERR
                "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
                "phys_seg %u prio class %u\n",
                blk_status_to_str(status),
                req->q->disk ? req->q->disk->disk_name : "?",
                blk_rq_pos(req), (__force u32)req_op(req),
                blk_op_str(req_op(req)),
                (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
                req->nr_phys_segments,
                IOPRIO_PRIO_CLASS(req_get_ioprio(req)));
}

/*
 * Fully end IO on a request. Does not support partial completions, or
 * errors.
 *
 * The completion is traced with BLK_STS_OK and the full byte count of the
 * request.  A request without a bio has nothing further to complete.
 */
static void blk_complete_request(struct request *req)
{
        const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
        int total_bytes = blk_rq_bytes(req);
        struct bio *bio = req->bio;

        trace_block_rq_complete(req, BLK_STS_OK, total_bytes);

        if (!bio)
                return;

        if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
                blk_integrity_complete(req, total_bytes);

        /*
         * Upper layers may call blk_crypto_evict_key() anytime after the last
         * bio_endio().  Therefore, the keyslot must be released before that.
         */
        blk_crypto_rq_put_keyslot(req);

        blk_account_io_completion(req, total_bytes);

        /* Walk the bio chain; bi_next is read before the bio may be ended. */
        do {
                struct bio *next = bio->bi_next;

                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);

                blk_zone_update_request_bio(req, bio);

                /* Bios of a flush sequence are not ended here. */
                if (!is_flush)
                        bio_endio(bio);
                bio = next;
        } while (bio);

        /*
         * Reset counters so that the request stacking driver
         * can find how many bytes remain in the request
         * later.
         */
        if (!req->end_io) {
                req->bio = NULL;
                req->__data_len = 0;
        }
}

/**
 * blk_update_request - Complete multiple bytes without completing the request
 * @req:      the request being processed
 * @error:    block status code
 * @nr_bytes: number of bytes to complete for @req
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @req, but doesn't complete
 *     the request structure even if @req doesn't have leftover.
 *     If @req has leftover, sets it up for the next range of segments.
 *
 *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
 *     %false return from this function.
 *
 * Note:
 *     The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
 *     except in the consistency check at the end of this function.
 *
 * Return:
 *     %false - this request doesn't have any more data
 *     %true  - this request has more data
 **/
bool blk_update_request(struct request *req, blk_status_t error,
                unsigned int nr_bytes)
{
        bool is_flush = req->rq_flags & RQF_FLUSH_SEQ;
        bool quiet = req->rq_flags & RQF_QUIET;
        int total_bytes;

        trace_block_rq_complete(req, error, nr_bytes);

        /* A request without a bio has no data left to complete. */
        if (!req->bio)
                return false;

        if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
            error == BLK_STS_OK)
                blk_integrity_complete(req, nr_bytes);

        /*
         * Upper layers may call blk_crypto_evict_key() anytime after the last
         * bio_endio().  Therefore, the keyslot must be released before that.
         */
        if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
                __blk_crypto_rq_put_keyslot(req);

        /*
         * Report errors only for non-passthrough, non-quiet requests, and
         * only while the disk is not marked GD_DEAD.
         */
        if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) &&
            !test_bit(GD_DEAD, &req->q->disk->state)) {
                blk_print_req_error(req, error);
                trace_block_rq_error(req, error, nr_bytes);
        }

        blk_account_io_completion(req, nr_bytes);

        /* Consume @nr_bytes from the front of the request's bio chain. */
        total_bytes = 0;
        while (req->bio) {
                struct bio *bio = req->bio;
                unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);

                if (unlikely(error))
                        bio->bi_status = error;

                if (bio_bytes == bio->bi_iter.bi_size) {
                        /* Whole bio consumed: unlink it from the request. */
                        req->bio = bio->bi_next;
                } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) {
                        /*
                         * Partial zone append completions cannot be supported
                         * as the BIO fragments may end up not being written
                         * sequentially.
                         */
                        bio->bi_status = BLK_STS_IOERR;
                }

                /* Completion has already been traced */
                bio_clear_flag(bio, BIO_TRACE_COMPLETION);
                if (unlikely(quiet))
                        bio_set_flag(bio, BIO_QUIET);

                bio_advance(bio, bio_bytes);

                /* Don't actually finish bio if it's part of flush sequence */
                if (!bio->bi_iter.bi_size) {
                        blk_zone_update_request_bio(req, bio);
                        if (!is_flush)
                                bio_endio(bio);
                }

                total_bytes += bio_bytes;
                nr_bytes -= bio_bytes;

                if (!nr_bytes)
                        break;
        }

        /*
         * completely done
         */
        if (!req->bio) {
                /*
                 * Reset counters so that the request stacking driver
                 * can find how many bytes remain in the request
                 * later.
                 */
                req->__data_len = 0;
                return false;
        }

        req->__data_len -= total_bytes;

        /* update sector only for requests with clear definition of sector */
        if (!blk_rq_is_passthrough(req))
                req->__sector += total_bytes >> 9;

        /* mixed attributes always follow the first bio */
        if (req->rq_flags & RQF_MIXED_MERGE) {
                req->cmd_flags &= ~REQ_FAILFAST_MASK;
                req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
        }

        if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
                /*
                 * If total number of sectors is less than the first segment
                 * size, something has gone terribly wrong.
                 */
                if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
                        blk_dump_rq_flags(req, "request botched");
                        req->__data_len = blk_rq_cur_bytes(req);
                }

                /* recalculate the number of segments */
                req->nr_phys_segments = blk_recalc_rq_segments(req);
        }

        return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);

/*
 * Account the completion of @req in the statistics of its partition:
 * I/O ticks, the per-group I/O count and duration (@now minus the
 * request's start time), and the in-flight counter.
 */
static inline void blk_account_io_done(struct request *req, u64 now)
{
        int grp;

        trace_block_io_done(req);

        /*
         * Only requests with RQF_IO_STAT set and RQF_FLUSH_SEQ clear are
         * accounted: flush_rq isn't accounted as a normal IO on queueing
         * nor completion, accounting the containing request is enough.
         */
        if ((req->rq_flags & (RQF_IO_STAT|RQF_FLUSH_SEQ)) != RQF_IO_STAT)
                return;

        grp = op_stat_group(req_op(req));

        part_stat_lock();
        update_io_ticks(req->part, jiffies, true);
        part_stat_inc(req->part, ios[grp]);
        part_stat_add(req->part, nsecs[grp], now - req->start_time_ns);
        part_stat_local_dec(req->part, in_flight[op_is_write(req_op(req))]);
        part_stat_unlock();
}

/*
 * Decide whether a passthrough request should be included in I/O
 * statistics.  Returns true only when all of the following hold:
 *
 *  - the queue has passthrough statistics enabled;
 *  - the request carries a bio (requests without one transfer no data);
 *  - that bio has a block device attached, since stats are accumulated
 *    in the bdev (most drivers do not set it for passthrough requests,
 *    but nvme is one that will);
 *  - the payload size is a multiple of the logical block size.  We don't
 *    know what a passthrough command does, but this filters out most
 *    commands whose payload doesn't represent sector access.
 */
static inline bool blk_rq_passthrough_stats(struct request *req)
{
        struct bio *bio = req->bio;

        if (!blk_queue_passthrough_stat(req->q) || !bio || !bio->bi_bdev)
                return false;

        return !(blk_rq_bytes(req) &
                 (bdev_logical_block_size(bio->bi_bdev) - 1));
}

/*
 * Begin I/O accounting for @req: mark it RQF_IO_STAT, stamp its start
 * time, pick the partition to account against and bump that partition's
 * in-flight counter.
 *
 * Nothing is accounted when the queue has I/O statistics disabled, or for
 * passthrough requests that blk_rq_passthrough_stats() rejects.
 */
static inline void blk_account_io_start(struct request *req)
{
        trace_block_io_start(req);

        if (!blk_queue_io_stat(req->q) ||
            (blk_rq_is_passthrough(req) && !blk_rq_passthrough_stats(req)))
                return;

        req->rq_flags |= RQF_IO_STAT;
        req->start_time_ns = blk_time_get_ns();

        /*
         * All non-passthrough requests are created from a bio with one
         * exception: when a flush command that is part of a flush sequence
         * generated by the state machine in blk-flush.c is cloned onto the
         * lower device by dm-multipath we can get here without a bio.
         * In that case fall back to the disk's part0.
         */
        req->part = req->bio ? req->bio->bi_bdev : req->q->disk->part0;

        part_stat_lock();
        update_io_ticks(req->part, jiffies, false);
        part_stat_local_inc(req->part, in_flight[op_is_write(req_op(req))]);
        part_stat_unlock();
}

/*
 * Completion-time accounting for @rq using timestamp @now: block stats
 * (only for requests marked RQF_STATS), the scheduler completion hook and
 * the partition I/O statistics.
 */
static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
{
        const bool has_stats = rq->rq_flags & RQF_STATS;

        if (has_stats)
                blk_stat_add(rq, now);

        blk_mq_sched_completed_request(rq, now);
        blk_account_io_done(rq, now);
}

/*
 * End @rq with status @error without updating its bios.
 *
 * Accounting is done when the request needs a time stamp, then the finish
 * hooks run.  If the request has an ->end_io handler it is invoked, and
 * the request is freed only if the handler returns RQ_END_IO_FREE.
 * Requests without a handler are always freed.
 */
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
        if (blk_mq_need_time_stamp(rq))
                __blk_mq_end_request_acct(rq, blk_time_get_ns());

        blk_mq_finish_request(rq);

        if (rq->end_io) {
                rq_qos_done(rq->q, rq);
                /* Any other return value means @rq must not be freed here. */
                if (rq->end_io(rq, error) != RQ_END_IO_FREE)
                        return;
        }

        blk_mq_free_request(rq);
}
EXPORT_SYMBOL(__blk_mq_end_request);

/*
 * Complete all bytes of @rq with status @error and end the request.
 * It is a BUG() if blk_update_request() reports leftover data after the
 * full byte count of the request was passed in.
 */
void blk_mq_end_request(struct request *rq, blk_status_t error)
{
        bool leftover = blk_update_request(rq, error, blk_rq_bytes(rq));

        if (leftover)
                BUG();

        __blk_mq_end_request(rq, error);
}
EXPORT_SYMBOL(blk_mq_end_request);

/* Capacity of the on-stack tag array used for batched completion. */
#define TAG_COMP_BATCH                32

/*
 * Release @nr_tags driver tags of @hctx in one go: the active-request
 * count is reduced, the tags listed in @tag_array are put back, and the
 * same number of queue usage references is dropped.
 */
static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
                                          int *tag_array, int nr_tags)
{
        blk_mq_sub_active_requests(hctx, nr_tags);
        blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
        percpu_ref_put_many(&hctx->queue->q_usage_counter, nr_tags);
}

/*
 * Complete every request on @iob->req_list successfully.
 *
 * Each request is completed, accounted (when @iob->need_ts is set; a
 * single timestamp is shared by the whole batch) and finished.  Driver
 * tags of requests that can be freed are collected per hardware context
 * and released in groups of at most TAG_COMP_BATCH.
 */
void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
        int tags[TAG_COMP_BATCH], nr_tags = 0;
        struct blk_mq_hw_ctx *cur_hctx = NULL;
        struct request *rq;
        u64 now = 0;

        if (iob->need_ts)
                now = blk_time_get_ns();

        while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
                prefetch(rq->bio);
                prefetch(rq->rq_next);

                blk_complete_request(rq);
                if (iob->need_ts)
                        __blk_mq_end_request_acct(rq, now);

                blk_mq_finish_request(rq);

                rq_qos_done(rq->q, rq);

                /*
                 * If end_io handler returns NONE, then it still has
                 * ownership of the request.
                 */
                if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
                        continue;

                WRITE_ONCE(rq->state, MQ_RQ_IDLE);
                /* Someone else still holds a reference: not ours to free. */
                if (!req_ref_put_and_test(rq))
                        continue;

                blk_crypto_free_request(rq);
                blk_pm_mark_last_busy(rq);

                /*
                 * Flush the collected tags when the array is full or when
                 * this request belongs to a different hardware context.
                 */
                if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
                        if (cur_hctx)
                                blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
                        nr_tags = 0;
                        cur_hctx = rq->mq_hctx;
                }
                tags[nr_tags++] = rq->tag;
        }

        /* Release whatever is left from the final group. */
        if (nr_tags)
                blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
}
EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);

/*
 * Detach all requests queued on @list, reverse the detached chain, and
 * invoke the driver's ->complete handler on each request in that order.
 */
static void blk_complete_reqs(struct llist_head *list)
{
        struct llist_node *first;
        struct request *rq, *tmp;

        first = llist_del_all(list);
        first = llist_reverse_order(first);

        llist_for_each_entry_safe(rq, tmp, first, ipi_list)
                rq->q->mq_ops->complete(rq);
}

/* Complete the requests queued on the current CPU's blk_cpu_done list. */
static __latent_entropy void blk_done_softirq(void)
{
        struct llist_head *pending = this_cpu_ptr(&blk_cpu_done);

        blk_complete_reqs(pending);
}

/*
 * Complete the requests still queued on @cpu's blk_cpu_done list.
 * Always returns 0.
 */
static int blk_softirq_cpu_dead(unsigned int cpu)
{
        struct llist_head *pending = &per_cpu(blk_cpu_done, cpu);

        blk_complete_reqs(pending);
        return 0;
}

/*
 * Raise BLOCK_SOFTIRQ on the CPU this runs on.  @data is unused.
 * NOTE(review): presumably installed as the function of the per-CPU
 * blk_cpu_csd call-single data used by blk_mq_complete_send_ipi(); the
 * setup is outside this view - confirm.
 */
static void __blk_mq_complete_request_remote(void *data)
{
        __raise_softirq_irqoff(BLOCK_SOFTIRQ);
}

/*
 * Decide whether the completion of @rq should be redirected, via IPI, to
 * the CPU of the request's software context (rq->mq_ctx->cpu).
 *
 * Returns false (complete locally) when SMP is disabled, the queue does
 * not have QUEUE_FLAG_SAME_COMP set, interrupts are force-threaded, the
 * current CPU already is the target, or - unless QUEUE_FLAG_SAME_FORCE is
 * set - the two CPUs share a cache and have equal capacity.  Otherwise
 * returns whether the target CPU is online.
 */
static inline bool blk_mq_complete_need_ipi(struct request *rq)
{
        int cpu = raw_smp_processor_id();

        if (!IS_ENABLED(CONFIG_SMP) ||
            !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
                return false;
        /*
         * With force threaded interrupts enabled, raising softirq from an SMP
         * function call will always result in waking the ksoftirqd thread.
         * This is probably worse than completing the request on a different
         * cache domain.
         */
        if (force_irqthreads())
                return false;

        /* same CPU or cache domain and capacity?  Complete locally */
        if (cpu == rq->mq_ctx->cpu ||
            (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
             cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
             cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
                return false;

        /* don't try to IPI to an offline CPU */
        return cpu_online(rq->mq_ctx->cpu);
}

/*
 * Queue @rq on the blk_cpu_done list of the CPU of its software context.
 * The asynchronous cross-CPU call is issued only when llist_add() returns
 * true.
 */
static void blk_mq_complete_send_ipi(struct request *rq)
{
        unsigned int cpu = rq->mq_ctx->cpu;

        if (!llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
                return;

        smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
}

/*
 * Queue @rq on the current CPU's blk_cpu_done list and raise
 * BLOCK_SOFTIRQ when llist_add() returns true.  Preemption is disabled
 * around the per-CPU list access.
 */
static void blk_mq_raise_softirq(struct request *rq)
{
        preempt_disable();
        if (llist_add(&rq->ipi_list, this_cpu_ptr(&blk_cpu_done)))
                raise_softirq(BLOCK_SOFTIRQ);
        preempt_enable();
}

/*
 * Mark @rq as MQ_RQ_COMPLETE and try to hand its completion off.
 *
 * Returns true when the completion was queued for later processing (by
 * IPI to another CPU, or by raising the block softirq on this CPU), and
 * false when the caller has to complete the request itself.
 */
bool blk_mq_complete_request_remote(struct request *rq)
{
        WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);

        /*
         * For request which hctx has only one ctx mapping,
         * or a polled request, always complete locally,
         * it's pointless to redirect the completion.
         */
        if ((rq->mq_hctx->nr_ctx == 1 &&
             rq->mq_ctx->cpu == raw_smp_processor_id()) ||
             rq->cmd_flags & REQ_POLLED)
                return false;

        if (blk_mq_complete_need_ipi(rq)) {
                blk_mq_complete_send_ipi(rq);
                return true;
        }

        /* Single hardware queue: defer the completion to softirq context. */
        if (rq->q->nr_hw_queues == 1) {
                blk_mq_raise_softirq(rq);
                return true;
        }
        return false;
}
EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:                the request being processed
 *
 * Description:
 *        Hand the completion of @rq to blk_mq_complete_request_remote().
 *        If that function did not take over the completion, the driver's
 *        ->complete operation is called directly.
 **/
void blk_mq_complete_request(struct request *rq)
{
        if (blk_mq_complete_request_remote(rq))
                return;

        rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

/**
 * blk_mq_start_request - Start processing a request
 * @rq: Pointer to request to be started
 *
 * Function used by device drivers to notify the block layer that a request
 * is going to be processed now, so blk layer can do proper initializations
 * such as starting the timeout timer.
 *
 * The request is expected to be in the MQ_RQ_IDLE state (a warning is
 * emitted once otherwise) and is moved to MQ_RQ_IN_FLIGHT.
 */
void blk_mq_start_request(struct request *rq)
{
        struct request_queue *q = rq->q;

        trace_block_rq_issue(rq);

        /* Collect issue-time statistics for non-passthrough requests. */
        if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
            !blk_rq_is_passthrough(rq)) {
                rq->io_start_time_ns = blk_time_get_ns();
                rq->stats_sectors = blk_rq_sectors(rq);
                rq->rq_flags |= RQF_STATS;
                rq_qos_issue(q, rq);
        }

        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);

        blk_add_timer(rq);
        WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
        /* Record the request in the driver tag -> request table. */
        rq->mq_hctx->tags->rqs[rq->tag] = rq;

        if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
                blk_integrity_prepare(rq);

        /* For polled bios, store the hardware queue number as the cookie. */
        if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
                WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
EXPORT_SYMBOL(blk_mq_start_request);

/*
 * Maximum number of requests allowed on the plug list.
 *
 * A plug that spans multiple queues is allowed twice
 * BLK_MAX_REQUEST_COUNT requests.  This is important for md arrays to
 * benefit from merging requests.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
        return plug->multiple_queues ? BLK_MAX_REQUEST_COUNT * 2 :
                                       BLK_MAX_REQUEST_COUNT;
}

static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
{
        struct request *last = rq_list_peek(&plug->mq_list);

        if (!plug->rq_count) {
                trace_block_plug(rq->q);
        } else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
                   (!blk_queue_nomerges(rq->q) &&
                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                blk_mq_flush_plug_list(plug, false);
                last = NULL;
                trace_block_plug(rq->q);
        }

        if (!plug->multiple_queues && last && last->q != rq->q)
                plug->multiple_queues = true;
        /*
         * Any request allocated from sched tags can't be issued to
         * ->queue_rqs() directly
         */
        if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
                plug->has_elevator = true;
        rq_list_add_tail(&plug->mq_list, rq);
        plug->rq_count++;
}

/**
 * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
 * @rq:                request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Queue a fully prepared passthrough request for execution without
 *    waiting for it to complete. When the current task has an active plug
 *    and @at_head is false, the request is only added to that plug.
 *    Otherwise it is inserted right away and the hardware queue is run.
 *
 * Note:
 *    Warns if called with interrupts disabled or with a request that is
 *    not a passthrough request.
 */
void blk_execute_rq_nowait(struct request *rq, bool at_head)
{
        /* Sample the hctx up front; it is used after the insert below. */
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        WARN_ON(irqs_disabled());
        WARN_ON(!blk_rq_is_passthrough(rq));

        blk_account_io_start(rq);

        if (at_head || !current->plug) {
                blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
                /* Run asynchronously when the hctx is flagged as blocking. */
                blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
                return;
        }

        blk_add_rq_to_plug(current->plug, rq);
}
EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);

/*
 * On-stack wait context used by blk_execute_rq(); it is reached by the
 * end_io callback through rq->end_io_data.
 */
struct blk_rq_wait {
        /* signalled by blk_end_sync_rq() when the request ends */
        struct completion done;
        /* status handed to blk_end_sync_rq(), returned by blk_execute_rq() */
        blk_status_t ret;
};

/*
 * end_io callback installed by blk_execute_rq(): store the completion status
 * in the waiter found via rq->end_io_data and wake it up.
 *
 * Always returns RQ_END_IO_NONE.
 */
static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
{
        struct blk_rq_wait *wait = rq->end_io_data;

        /* Publish the status before signalling the completion. */
        wait->ret = ret;
        complete(&wait->done);
        return RQ_END_IO_NONE;
}

/*
 * Return true if @rq is assigned to a hardware context of type
 * HCTX_TYPE_POLL, false if it has no hardware context or one of another type.
 */
bool blk_rq_is_poll(struct request *rq)
{
        return rq->mq_hctx && rq->mq_hctx->type == HCTX_TYPE_POLL;
}
EXPORT_SYMBOL_GPL(blk_rq_is_poll);

/*
 * Poll @rq's hardware context until @wait has been completed, giving up the
 * CPU via cond_resched() after every poll. At least one poll is always done.
 */
static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
{
        for (;;) {
                blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
                cond_resched();
                if (completion_done(wait))
                        break;
        }
}

/**
 * blk_execute_rq - insert a request into queue for execution
 * @rq:                request to insert
 * @at_head:    insert request at head or tail of queue
 *
 * Description:
 *    Insert a fully prepared passthrough request (at the head when @at_head
 *    is true), run the hardware queue and wait until the request has
 *    completed. Requests on a polled hardware context are waited for by
 *    polling; all others sleep in blk_wait_io().
 * Return: The blk_status_t result provided to blk_mq_end_request().
 */
blk_status_t blk_execute_rq(struct request *rq, bool at_head)
{
        /* Sample the hctx up front; it is used after the insert below. */
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        struct blk_rq_wait sync = {
                .done = COMPLETION_INITIALIZER_ONSTACK(sync.done),
        };

        WARN_ON(irqs_disabled());
        WARN_ON(!blk_rq_is_passthrough(rq));

        /* blk_end_sync_rq() fills in sync.ret and completes sync.done. */
        rq->end_io = blk_end_sync_rq;
        rq->end_io_data = &sync;

        blk_account_io_start(rq);
        blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
        blk_mq_run_hw_queue(hctx, false);

        if (!blk_rq_is_poll(rq))
                blk_wait_io(&sync.done);
        else
                blk_rq_poll_completion(rq, &sync.done);

        return sync.ret;
}
EXPORT_SYMBOL(blk_execute_rq);

/*
 * Common requeue preparation: drop the driver tag, emit the requeue
 * tracepoint, notify rq-qos and, if the request had been started, put it
 * back into the MQ_RQ_IDLE state and clear RQF_TIMED_OUT.
 */
static void __blk_mq_requeue_request(struct request *rq)
{
        blk_mq_put_driver_tag(rq);

        trace_block_rq_requeue(rq);
        rq_qos_requeue(rq->q, rq);

        /* Nothing to reset for a request that was never started. */
        if (!blk_mq_request_started(rq))
                return;

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        rq->rq_flags &= ~RQF_TIMED_OUT;
}

/*
 * Put @rq on its queue's requeue list.
 *
 * The request is first reset through __blk_mq_requeue_request() and reported
 * to the I/O scheduler, then appended to q->requeue_list under
 * q->requeue_lock. When @kick_requeue_list is true the requeue work is
 * kicked right away.
 */
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
        struct request_queue *queue = rq->q;
        unsigned long irq_flags;

        __blk_mq_requeue_request(rq);

        /* this request will be re-inserted to io scheduler queue */
        blk_mq_sched_requeue_request(rq);

        spin_lock_irqsave(&queue->requeue_lock, irq_flags);
        list_add_tail(&rq->queuelist, &queue->requeue_list);
        spin_unlock_irqrestore(&queue->requeue_lock, irq_flags);

        if (!kick_requeue_list)
                return;

        blk_mq_kick_requeue_list(queue);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, requeue_work.work);
        LIST_HEAD(rq_list);
        LIST_HEAD(flush_list);
        struct request *rq;

        spin_lock_irq(&q->requeue_lock);
        list_splice_init(&q->requeue_list, &rq_list);
        list_splice_init(&q->flush_list, &flush_list);
        spin_unlock_irq(&q->requeue_lock);

        while (!list_empty(&rq_list)) {
                rq = list_entry(rq_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                /*
                 * If RQF_DONTPREP is set, the request has been started by the
                 * driver already and might have driver-specific data allocated
                 * already.  Insert it into the hctx dispatch list to avoid
                 * block layer merges for the request.
                 */
                if (rq->rq_flags & RQF_DONTPREP)
                        blk_mq_request_bypass_insert(rq, 0);
                else
                        blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
        }

        while (!list_empty(&flush_list)) {
                rq = list_entry(flush_list.next, struct request, queuelist);
                list_del_init(&rq->queuelist);
                blk_mq_insert_request(rq, 0);
        }

        blk_mq_run_hw_queues(q, false);
}

/*
 * Schedule @q's requeue work on kblockd with no delay and without binding it
 * to a particular CPU.
 */
void blk_mq_kick_requeue_list(struct request_queue *q)
{
        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
}
EXPORT_SYMBOL(blk_mq_kick_requeue_list);

/*
 * Schedule @q's requeue work on kblockd after @msecs milliseconds, without
 * binding it to a particular CPU.
 */
void blk_mq_delay_kick_requeue_list(struct request_queue *q,
                                    unsigned long msecs)
{
        unsigned long delay = msecs_to_jiffies(msecs);

        kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, delay);
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);

static bool blk_is_flush_data_rq(struct request *rq)
{
        return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
}

/*
 * Tag iterator callback for blk_mq_queue_inflight(); @priv points to the
 * bool result.
 *
 * If we find a request that isn't idle we know the queue is busy
 * as it's checked in the iter; set the result and return false to stop the
 * iteration. Return true to keep iterating.
 */
static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
        bool *busy = priv;

        if (!blk_mq_request_started(rq))
                return true;

        /*
         * In case of queue quiesce, if one flush data request is completed,
         * don't count it as inflight given the flush sequence is suspended,
         * and the original flush data request is invisible to driver, just
         * like other pending requests because of quiesce
         */
        if (blk_queue_quiesced(rq->q) && blk_is_flush_data_rq(rq) &&
            blk_mq_request_completed(rq))
                return true;

        *busy = true;
        return false;
}

/*
 * Return true if the busy-tag iteration over @q finds at least one request
 * that blk_mq_rq_inflight() counts as in flight.
 */
bool blk_mq_queue_inflight(struct request_queue *q)
{
        bool in_use = false;

        blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &in_use);

        return in_use;
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);

static void blk_mq_rq_timed_out(struct request *req)
{
        req->rq_flags |= RQF_TIMED_OUT;
        if (req->q->mq_ops->timeout) {
                enum blk_eh_timer_return ret;

                ret = req->q->mq_ops->timeout(req);
                if (ret == BLK_EH_DONE)
                        return;
                WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
        }

        blk_add_timer(req);
}

/* State shared between blk_mq_timeout_work() and its tag iterator callbacks. */
struct blk_expired_data {
        /* set by blk_mq_check_expired() once an expired request is found */
        bool has_timedout_rq;
        /* earliest deadline among unexpired in-flight requests; 0 if none */
        unsigned long next;
        /* jiffies value sampled when the timeout work started */
        unsigned long timeout_start;
};

/*
 * Decide whether @rq has expired relative to @expired->timeout_start.
 *
 * Only requests in the MQ_RQ_IN_FLIGHT state that are not already marked
 * RQF_TIMED_OUT are considered. For a request that has not expired yet,
 * @expired->next is lowered to its deadline when that is earlier than the
 * value recorded so far (0 meaning nothing is recorded).
 *
 * Returns true if the request has expired, false otherwise.
 */
static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
        unsigned long deadline;

        if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT ||
            (rq->rq_flags & RQF_TIMED_OUT))
                return false;

        deadline = READ_ONCE(rq->deadline);
        if (time_after_eq(expired->timeout_start, deadline))
                return true;

        if (expired->next == 0 || time_after(expired->next, deadline))
                expired->next = deadline;

        return false;
}

/*
 * Release a reference on @rq.
 *
 * For a flush request the ->end_io() handler is called with status 0 and
 * the request is freed if it returns RQ_END_IO_FREE. For any other request
 * the reference count is dropped and the request is freed when it reaches
 * zero.
 */
void blk_mq_put_rq_ref(struct request *rq)
{
        if (!is_flush_rq(rq)) {
                if (req_ref_put_and_test(rq))
                        __blk_mq_free_request(rq);
                return;
        }

        if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
                blk_mq_free_request(rq);
}

/*
 * Tag iterator callback used for the first pass of the timeout work: stop
 * (return false) at the first expired request and record that one exists;
 * otherwise return true to continue. @priv is a struct blk_expired_data.
 */
static bool blk_mq_check_expired(struct request *rq, void *priv)
{
        struct blk_expired_data *expired = priv;

        /*
         * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
         * be reallocated underneath the timeout handler's processing, then
         * the expire check is reliable. If the request is not expired, then
         * it was completed and reallocated as a new request after returning
         * from blk_mq_check_expired().
         */
        if (!blk_mq_req_expired(rq, expired))
                return true;

        expired->has_timedout_rq = true;
        return false;
}

/*
 * Tag iterator callback used for the second pass of the timeout work: run
 * timeout handling on every expired request. Always returns true so that
 * all requests are visited. @priv is a struct blk_expired_data.
 */
static bool blk_mq_handle_expired(struct request *rq, void *priv)
{
        struct blk_expired_data *state = priv;

        if (!blk_mq_req_expired(rq, state))
                return true;

        blk_mq_rq_timed_out(rq);
        return true;
}

/*
 * Work handler for q->timeout_work: look for expired in-flight requests,
 * hand them to blk_mq_rq_timed_out(), and then either re-arm q->timeout for
 * the earliest remaining deadline or mark every mapped hardware context
 * idle.
 */
static void blk_mq_timeout_work(struct work_struct *work)
{
        struct request_queue *q =
                container_of(work, struct request_queue, timeout_work);
        /* All other fields start out zeroed. */
        struct blk_expired_data expired = {
                .timeout_start = jiffies,
        };
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        /* A deadlock might occur if a request is stuck requiring a
         * timeout at the same time a queue freeze is waiting
         * completion, since the timeout code would not be able to
         * acquire the queue reference here.
         *
         * That's why we don't use blk_queue_enter here; instead, we use
         * percpu_ref_tryget directly, because we need to be able to
         * obtain a reference even in the short window between the queue
         * starting to freeze, by dropping the first reference in
         * blk_freeze_queue_start, and the moment the last request is
         * consumed, marked by the instant q_usage_counter reaches
         * zero.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        /* check if there is any timed-out request */
        blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
        if (expired.has_timedout_rq) {
                /*
                 * Before walking tags, we must ensure any submit started
                 * before the current time has finished. Since the submit
                 * uses srcu or rcu, wait for a synchronization point to
                 * ensure all running submits have finished
                 */
                blk_mq_wait_quiesce_done(q->tag_set);

                /*
                 * The first pass stopped at the first expired request, so
                 * recompute the next deadline from scratch in this pass.
                 */
                expired.next = 0;
                blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
        }

        if (expired.next != 0) {
                mod_timer(&q->timeout, expired.next);
        } else {
                /*
                 * Request timeouts are handled as a forward rolling timer. If
                 * we end up here it means that no requests are pending and
                 * also that no request has been pending for a while. Mark
                 * each hctx as idle.
                 */
                queue_for_each_hw_ctx(q, hctx, i) {
                        /* the hctx may be unmapped, so check it here */
                        if (blk_mq_hw_queue_mapped(hctx))
                                blk_mq_tag_idle(hctx);
                }
        }
        /* Drop the reference taken by percpu_ref_tryget() above. */
        blk_queue_exit(q);
}

/* Argument bundle passed to flush_busy_ctx() through the sbitmap iterator. */
struct flush_busy_ctx_data {
        /* hardware context whose software queues are being flushed */
        struct blk_mq_hw_ctx *hctx;
        /* destination list that receives the spliced requests */
        struct list_head *list;
};

/*
 * sbitmap iterator callback: under the software queue's lock, move all
 * requests of the software queue at index @bitnr (for the hctx's type) to
 * the tail of the destination list and clear its bit in @sb.
 *
 * @data is a struct flush_busy_ctx_data. Always returns true so the
 * iteration continues.
 */
static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        struct flush_busy_ctx_data *args = data;
        struct blk_mq_ctx *ctx = args->hctx->ctxs[bitnr];

        spin_lock(&ctx->lock);
        list_splice_tail_init(&ctx->rq_lists[args->hctx->type], args->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&ctx->lock);

        return true;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch
 */
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
        struct flush_busy_ctx_data data = {
                .hctx = hctx,
                .list = list,
        };

        sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
}

/* Argument bundle passed to dispatch_rq_from_ctx() through the sbitmap iterator. */
struct dispatch_rq_data {
        /* hardware context whose software queues are searched */
        struct blk_mq_hw_ctx *hctx;
        /* output: the dequeued request, left NULL if none was found */
        struct request *rq;
};

static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
                void *data)
{
        struct dispatch_rq_data *dispatch_data = data;
        struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
        struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
        enum hctx_type type = hctx->type;

        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[type])) {
                dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next);
                list_del_init(&dispatch_data->rq->queuelist);
                if (list_empty(&ctx->rq_lists[type]))
                        sbitmap_clear_bit(sb, bitnr);
        }
        spin_unlock(&ctx->lock);

        return !dispatch_data->rq;
}

struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
                                        struct blk_mq_ctx *start)
{
        unsigned off = start ? start->index_hw[hctx->type] : 0;
        struct dispatch_rq_data data = {
                .hctx = hctx,
                .rq   = NULL,
        };

        __sbitmap_for_each_set(&hctx->ctx_map, off,
                               dispatch_rq_from_ctx, &data);

        return data.rq;
}

/*
 * Try to allocate a driver tag for @rq.
 *
 * The tag comes from the reserved bitmap when the request's internal tag is
 * a reserved scheduler tag. Otherwise it comes from the regular bitmap,
 * subject to hctx_may_queue(), and is offset by the number of reserved tags.
 *
 * Returns true and sets rq->tag on success, false if no tag was obtained.
 */
bool __blk_mq_alloc_driver_tag(struct request *rq)
{
        struct sbitmap_queue *bt;
        unsigned int tag_offset;
        int tag;

        blk_mq_tag_busy(rq->mq_hctx);

        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
                bt = &rq->mq_hctx->tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = &rq->mq_hctx->tags->bitmap_tags;
                tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
                if (!hctx_may_queue(rq->mq_hctx, bt))
                        return false;
        }

        tag = __sbitmap_queue_get(bt);
        if (tag == BLK_MQ_NO_TAG)
                return false;

        rq->tag = tag + tag_offset;
        blk_mq_inc_active_requests(rq->mq_hctx);

        return true;
}

/*
 * Wake function of hctx->dispatch_wait. If the entry is still queued, remove
 * it and drop the ws_active count of the hctx's regular tag bitmap, all
 * under dispatch_wait_lock. Then run the hardware queue asynchronously.
 *
 * Always returns 1.
 */
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
                                int flags, void *key)
{
        struct blk_mq_hw_ctx *hctx =
                container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);

        spin_lock(&hctx->dispatch_wait_lock);
        if (!list_empty(&wait->entry)) {
                list_del_init(&wait->entry);
                atomic_dec(&hctx->tags->bitmap_tags.ws_active);
        }
        spin_unlock(&hctx->dispatch_wait_lock);

        blk_mq_run_hw_queue(hctx, true);

        return 1;
}

/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 *
 * Returns true if a driver tag was obtained for @rq after all. Returns false
 * if @hctx's dispatch wait entry was already queued, or if the retried
 * allocation failed (in the shared case the entry then stays on the wait
 * queue).
 */
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
                                 struct request *rq)
{
        struct sbitmap_queue *sbq;
        struct wait_queue_head *wq;
        wait_queue_entry_t *wait;
        bool ret;

        /* Non-shared tags: the restart mark is enough, no wait queue needed. */
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
            !(blk_mq_is_shared_tags(hctx->flags))) {
                blk_mq_sched_mark_restart_hctx(hctx);

                /*
                 * It's possible that a tag was freed in the window between the
                 * allocation failure and adding the hardware queue to the wait
                 * queue.
                 *
                 * Don't clear RESTART here, someone else could have set it.
                 * At most this will cost an extra queue run.
                 */
                return blk_mq_get_driver_tag(rq);
        }

        /* Lockless check first: nothing to do if we are already waiting. */
        wait = &hctx->dispatch_wait;
        if (!list_empty_careful(&wait->entry))
                return false;

        /* Wait on the bitmap that matches the kind of tag @rq needs. */
        if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
                sbq = &hctx->tags->breserved_tags;
        else
                sbq = &hctx->tags->bitmap_tags;
        wq = &bt_wait_ptr(sbq, hctx)->wait;

        /* Lock order: the wait queue lock first, then dispatch_wait_lock. */
        spin_lock_irq(&wq->lock);
        spin_lock(&hctx->dispatch_wait_lock);
        /* Recheck under the locks; someone may have queued the entry. */
        if (!list_empty(&wait->entry)) {
                spin_unlock(&hctx->dispatch_wait_lock);
                spin_unlock_irq(&wq->lock);
                return false;
        }

        atomic_inc(&sbq->ws_active);
        wait->flags &= ~WQ_FLAG_EXCLUSIVE;
        __add_wait_queue(wq, wait);

        /*
         * Add one explicit barrier since blk_mq_get_driver_tag() may
         * not imply barrier in case of failure.
         *
         * Order adding us to wait queue and allocating driver tag.
         *
         * The pair is the one implied in sbitmap_queue_wake_up() which
         * orders clearing sbitmap tag bits and waitqueue_active() in
         * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
         *
         * Otherwise, re-order of adding wait queue and getting driver tag
         * may cause __sbitmap_queue_wake_up() to wake up nothing because
         * the waitqueue_active() may not observe us in wait queue.
         */
        smp_mb();

        /*
         * It's possible that a tag was freed in the window between the
         * allocation failure and adding the hardware queue to the wait
         * queue.
         */
        ret = blk_mq_get_driver_tag(rq);
        if (!ret) {
                /* Still no tag: stay on the wait queue and report failure. */
                spin_unlock(&hctx->dispatch_wait_lock);
                spin_unlock_irq(&wq->lock);
                return false;
        }

        /*
         * We got a tag, remove ourselves from the wait queue to ensure
         * someone else gets the wakeup.
         */
        list_del_init(&wait->entry);
        atomic_dec(&sbq->ws_active);
        spin_unlock(&hctx->dispatch_wait_lock);
        spin_unlock_irq(&wq->lock);

        return true;
}

#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT  8
#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR  4
/*
 * Fold one dispatch outcome into hctx->dispatch_busy using an Exponential
 * Weighted Moving Average (EWMA):
 * - the previous value is weighted 7/8 and the new sample 1/8, so the
 *   average decays exponentially once the queue stops being busy
 * - a busy sample contributes 1 << 4 rather than 1, to avoid the result
 *   getting too small (0); the exact factor doesn't matter because the
 *   EWMA decreases exponentially
 */
static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
        unsigned int avg = hctx->dispatch_busy;

        /* Already at zero and not busy: nothing to update. */
        if (!busy && !avg)
                return;

        avg = avg * (BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1);
        avg += busy ? 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR : 0;

        hctx->dispatch_busy = avg / BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT;
}

/* Delay passed to blk_mq_delay_run_hw_queue() when a rerun is deferred. */
#define BLK_MQ_RESOURCE_DELAY        3                /* ms units */

/*
 * Put @rq back at the head of @list and run the common requeue preparation
 * on it via __blk_mq_requeue_request().
 */
static void blk_mq_handle_dev_resource(struct request *rq,
                                       struct list_head *list)
{
        list_add(&rq->queuelist, list);
        __blk_mq_requeue_request(rq);
}

/* Result of blk_mq_prep_dispatch_rq(). */
enum prep_dispatch {
        /* the request is ready to be passed to the driver */
        PREP_DISPATCH_OK,
        /* no driver tag could be obtained */
        PREP_DISPATCH_NO_TAG,
        /* no dispatch budget could be obtained */
        PREP_DISPATCH_NO_BUDGET,
};

/*
 * Acquire what @rq needs before it can be handed to the driver: a dispatch
 * budget (only when @need_budget is true) and a driver tag.
 *
 * Returns PREP_DISPATCH_OK on success, PREP_DISPATCH_NO_BUDGET if the budget
 * could not be obtained (any driver tag held by @rq is released), or
 * PREP_DISPATCH_NO_TAG if no driver tag could be obtained (a budget taken
 * here is given back).
 */
static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
                                                  bool need_budget)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
        int token = -1;

        if (need_budget) {
                token = blk_mq_get_dispatch_budget(rq->q);
                if (token < 0) {
                        blk_mq_put_driver_tag(rq);
                        return PREP_DISPATCH_NO_BUDGET;
                }
                blk_mq_set_rq_budget_token(rq, token);
        }

        /*
         * If the initial allocation attempt fails, we need to rerun the
         * hardware queue when a tag is freed. The waitqueue takes care of
         * that. If the queue is run before the caller adds this entry back
         * on the dispatch list, the caller re-runs it.
         */
        if (blk_mq_get_driver_tag(rq) || blk_mq_mark_tag_wait(hctx, rq))
                return PREP_DISPATCH_OK;

        /*
         * All budgets not got from this function will be put
         * together during handling partial dispatch
         */
        if (need_budget)
                blk_mq_put_dispatch_budget(rq->q, token);

        return PREP_DISPATCH_NO_TAG;
}

/*
 * Give back the dispatch budget of every request on @list that holds one
 * (a budget token >= 0). Used for budgets that were allocated before the
 * call to blk_mq_dispatch_rq_list().
 */
static void blk_mq_release_budgets(struct request_queue *q,
                struct list_head *list)
{
        struct request *rq;

        list_for_each_entry(rq, list, queuelist) {
                int token = blk_mq_get_rq_budget_token(rq);

                if (token < 0)
                        continue;

                blk_mq_put_dispatch_budget(q, token);
        }
}

/*
 * blk_mq_commit_rqs will notify driver using bd->last that there is no
 * more requests. (See comment in struct blk_mq_ops for commit_rqs for
 * details)
 * Attention, we should explicitly call this in unusual cases:
 *  1) did not queue everything initially scheduled to queue
 *  2) the last attempt to queue a request failed
 *
 * Nothing is done when the driver has no ->commit_rqs() or when @queued is
 * zero.
 */
static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
                              bool from_schedule)
{
        if (!hctx->queue->mq_ops->commit_rqs || !queued)
                return;

        trace_block_unplug(hctx->queue, queued, !from_schedule);
        hctx->queue->mq_ops->commit_rqs(hctx);
}

/*
 * Send the requests on @list to the driver through ->queue_rq(), one at a
 * time, until the list is empty, preparation of a request fails, or the
 * driver reports a resource shortage.
 *
 * @hctx:       hardware context all requests on @list belong to
 * @list:       requests to dispatch; leftovers are moved to hctx->dispatch
 * @nr_budgets: when non-zero, no budget is acquired here for the requests;
 *              the count is decremented for each request passed to the
 *              driver and, if requests are left over while it is still
 *              non-zero, the budgets of the remaining requests are released
 *
 * Returns true if we did some work AND can potentially do more. Returns
 * false if @list was empty on entry or if requests had to be left on
 * hctx->dispatch.
 */
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
                             unsigned int nr_budgets)
{
        enum prep_dispatch prep;
        struct request_queue *q = hctx->queue;
        struct request *rq;
        int queued;
        blk_status_t ret = BLK_STS_OK;
        bool needs_resource = false;

        if (list_empty(list))
                return false;

        /*
         * Now process all the entries, sending them to the driver.
         */
        queued = 0;
        do {
                struct blk_mq_queue_data bd;

                rq = list_first_entry(list, struct request, queuelist);

                WARN_ON_ONCE(hctx != rq->mq_hctx);
                /* Budget is acquired here only when none was passed in. */
                prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
                if (prep != PREP_DISPATCH_OK)
                        break;

                list_del_init(&rq->queuelist);

                bd.rq = rq;
                /* Tell the driver whether this is the last request we have. */
                bd.last = list_empty(list);

                /*
                 * once the request is queued to lld, no need to cover the
                 * budget any more
                 */
                if (nr_budgets)
                        nr_budgets--;
                ret = q->mq_ops->queue_rq(hctx, &bd);
                switch (ret) {
                case BLK_STS_OK:
                        queued++;
                        break;
                case BLK_STS_RESOURCE:
                        needs_resource = true;
                        fallthrough;
                case BLK_STS_DEV_RESOURCE:
                        /* Put the request back on @list and stop. */
                        blk_mq_handle_dev_resource(rq, list);
                        goto out;
                default:
                        /* Any other status ends the request with that status. */
                        blk_mq_end_request(rq, ret);
                }
        } while (!list_empty(list));
out:
        /* If we didn't flush the entire list, we could have told the driver
         * there was more coming, but that turned out to be a lie.
         */
        if (!list_empty(list) || ret != BLK_STS_OK)
                blk_mq_commit_rqs(hctx, queued, false);

        /*
         * Any items that need requeuing? Stuff them into hctx->dispatch,
         * that is where we will continue on next queue run.
         */
        if (!list_empty(list)) {
                bool needs_restart;
                /* For non-shared tags, the RESTART check will suffice */
                bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
                        ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
                        blk_mq_is_shared_tags(hctx->flags));

                /* Return budgets that were handed in but not consumed. */
                if (nr_budgets)
                        blk_mq_release_budgets(q, list);

                spin_lock(&hctx->lock);
                list_splice_tail_init(list, &hctx->dispatch);
                spin_unlock(&hctx->lock);

                /*
                 * Order adding requests to hctx->dispatch and checking
                 * SCHED_RESTART flag. The pair of this smp_mb() is the one
                 * in blk_mq_sched_restart(). Avoid restart code path to
                 * miss the new added requests to hctx->dispatch, meantime
                 * SCHED_RESTART is observed here.
                 */
                smp_mb();

                /*
                 * If SCHED_RESTART was set by the caller of this function and
                 * it is no longer set that means that it was cleared by another
                 * thread and hence that a queue rerun is needed.
                 *
                 * If 'no_tag' is set, that means that we failed getting
                 * a driver tag with an I/O scheduler attached. If our dispatch
                 * waitqueue is no longer active, ensure that we run the queue
                 * AFTER adding our entries back to the list.
                 *
                 * If no I/O scheduler has been configured it is possible that
                 * the hardware queue got stopped and restarted before requests
                 * were pushed back onto the dispatch list. Rerun the queue to
                 * avoid starvation. Notes:
                 * - blk_mq_run_hw_queue() checks whether or not a queue has
                 *   been stopped before rerunning a queue.
                 * - Some but not all block drivers stop a queue before
                 *   returning BLK_STS_RESOURCE. Two exceptions are scsi-mq
                 *   and dm-rq.
                 *
                 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART
                 * bit is set, run queue after a delay to avoid IO stalls
                 * that could otherwise occur if the queue is idle.  We'll do
                 * similar if we couldn't get budget or couldn't lock a zone
                 * and SCHED_RESTART is set.
                 */
                needs_restart = blk_mq_sched_needs_restart(hctx);
                if (prep == PREP_DISPATCH_NO_BUDGET)
                        needs_resource = true;
                if (!needs_restart ||
                    (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
                        blk_mq_run_hw_queue(hctx, true);
                else if (needs_resource)
                        blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);

                blk_mq_update_dispatch_busy(hctx, true);
                return false;
        }

        blk_mq_update_dispatch_busy(hctx, false);
        return true;
}

/*
 * Return the first CPU in @hctx's cpumask that is also online. If there is
 * none, return the first CPU of the cpumask regardless of its online state.
 */
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
{
        int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

        if (cpu < nr_cpu_ids)
                return cpu;

        return cpumask_first(hctx->cpumask);
}

/*
 * ->next_cpu is always calculated from hctx->cpumask, so simply use
 * it for speeding up the check
 *
 * Returns true when ->next_cpu is not a valid CPU number (>= nr_cpu_ids),
 * which is taken to mean that the cpumask is empty.
 */
static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx)
{
        return hctx->next_cpu >= nr_cpu_ids;
}

/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 *
 * Returns the CPU on which to run the hctx work, or WORK_CPU_UNBOUND when
 * the work should not be bound to a CPU.
 */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{
        bool tried = false;
        int next_cpu = hctx->next_cpu;

        /*
         * Switch to unbound if the queue has a single hardware queue or
         * there are no allowable CPUs in this hctx
         */
        if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx))
                return WORK_CPU_UNBOUND;

        /* Batch exhausted: advance to the next online CPU in the mask. */
        if (--hctx->next_cpu_batch <= 0) {
select_cpu:
                next_cpu = cpumask_next_and(next_cpu, hctx->cpumask,
                                cpu_online_mask);
                /* Ran past the end of the mask: wrap around to the start. */
                if (next_cpu >= nr_cpu_ids)
                        next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }

        /*
         * Do unbound schedule if we can't find a online CPU for this hctx,
         * and it should only happen in the path of handling CPU DEAD.
         */
        if (!cpu_online(next_cpu)) {
                /* Retry the selection once before giving up. */
                if (!tried) {
                        tried = true;
                        goto select_cpu;
                }

                /*
                 * Make sure to re-select CPU next time once after CPUs
                 * in hctx->cpumask become online again.
                 */
                hctx->next_cpu = next_cpu;
                hctx->next_cpu_batch = 1;
                return WORK_CPU_UNBOUND;
        }

        hctx->next_cpu = next_cpu;
        return next_cpu;
}

/**
 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
 * @hctx: Pointer to the hardware queue to run.
 * @msecs: Milliseconds of delay to wait before running the queue.
 *
 * Schedule @hctx's run work on kblockd after @msecs milliseconds, on the
 * CPU chosen by blk_mq_hctx_next_cpu(). Nothing is scheduled if the
 * hardware queue is stopped.
 */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
        int cpu;

        if (unlikely(blk_mq_hctx_stopped(hctx)))
                return;

        cpu = blk_mq_hctx_next_cpu(hctx);
        kblockd_mod_delayed_work_on(cpu, &hctx->run_work,
                                    msecs_to_jiffies(msecs));
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);

/*
 * Return true if @hctx should be run now: its queue is not quiesced and the
 * hctx has pending work. The check is evaluated inside
 * __blk_mq_run_dispatch_ops().
 */
static inline bool blk_mq_hw_queue_need_run(struct blk_mq_hw_ctx *hctx)
{
        bool need_run;

        /*
         * When queue is quiesced, we may be switching io scheduler, or
         * updating nr_hw_queues, or other things, and we can't run queue
         * any more, even blk_mq_hctx_has_pending() can't be called safely.
         *
         * And queue will be rerun in blk_mq_unquiesce_queue() if it is
         * quiesced.
         */
        __blk_mq_run_dispatch_ops(hctx->queue, false,
                need_run = !blk_queue_quiesced(hctx->queue) &&
                blk_mq_hctx_has_pending(hctx));
        return need_run;
}

/**
 * blk_mq_run_hw_queue - Start to run a hardware queue.
 * @hctx: Pointer to the hardware queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Check if the request queue is not in a quiesced state and if there are
 * pending requests to be sent. If this is true, run the queue to send requests
 * to hardware.
 *
 * The queue is run inline only when @async is false and the current CPU is
 * in @hctx's cpumask; otherwise the run is handed to
 * blk_mq_delay_run_hw_queue() with no delay.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        bool need_run;

        /*
         * We can't run the queue inline with interrupts disabled.
         */
        WARN_ON_ONCE(!async && in_interrupt());

        might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);

        need_run = blk_mq_hw_queue_need_run(hctx);
        if (!need_run) {
                unsigned long flags;

                /*
                 * Synchronize with blk_mq_unquiesce_queue(), because we check
                 * if hw queue is quiesced locklessly above, we need to use
                 * ->queue_lock to make sure we see the up-to-date status to
                 * not miss rerunning the hw queue.
                 */
                spin_lock_irqsave(&hctx->queue->queue_lock, flags);
                need_run = blk_mq_hw_queue_need_run(hctx);
                spin_unlock_irqrestore(&hctx->queue->queue_lock, flags);

                if (!need_run)
                        return;
        }

        /* Defer to kblockd for async runs or when on a CPU outside the mask. */
        if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
                blk_mq_delay_run_hw_queue(hctx, 0);
                return;
        }

        blk_mq_run_dispatch_ops(hctx->queue,
                                blk_mq_sched_dispatch_requests(hctx));
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);

/*
 * Return the preferred hctx to dispatch from (if any) for an IO scheduler
 * that is not multi-queue aware.
 *
 * Such a scheduler does not respect hardware queues when dispatching, so
 * there is no point in bothering with multiple HW queues: running several of
 * them only causes lock contention inside the scheduler and pointless cache
 * bouncing. Dispatch from the default-type hctx for the current CPU instead.
 * NULL is returned when that hctx is stopped.
 */
static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
{
        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
        struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];

        return blk_mq_hctx_stopped(hctx) ? NULL : hctx;
}

/**
 * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
 * @q: Pointer to the request queue to run.
 * @async: If we want to run the queue asynchronously.
 *
 * Stopped hardware queues are skipped. When the IO scheduler prefers a
 * single hctx, the other ones are only run if their dispatch list is not
 * empty.
 */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
        struct blk_mq_hw_ctx *sq_hctx = NULL;
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        if (blk_queue_sq_sched(q))
                sq_hctx = blk_mq_get_sq_hctx(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (blk_mq_hctx_stopped(hctx))
                        continue;
                /*
                 * Skip this hctx if the IO scheduler prefers another one
                 * and this one holds no requests that bypass the scheduler.
                 */
                if (sq_hctx && sq_hctx != hctx &&
                    list_empty_careful(&hctx->dispatch))
                        continue;
                blk_mq_run_hw_queue(hctx, async);
        }
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);

/**
 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
 * @q: Pointer to the request queue to run.
 * @msecs: Milliseconds of delay to wait before running the queues.
 *
 * Stopped hardware queues and those whose run_work is already pending are
 * left alone. When the IO scheduler prefers a single hctx, the other ones
 * are only scheduled if their dispatch list is not empty.
 */
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
        struct blk_mq_hw_ctx *sq_hctx = NULL;
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        if (blk_queue_sq_sched(q))
                sq_hctx = blk_mq_get_sq_hctx(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                /*
                 * Besides stopped queues, also skip a hctx whose run_work is
                 * already pending and leave its delay untouched. Otherwise
                 * a hctx can stall if another hctx keeps re-delaying its
                 * work before the work executes.
                 */
                if (blk_mq_hctx_stopped(hctx) ||
                    delayed_work_pending(&hctx->run_work))
                        continue;
                /*
                 * Skip this hctx if the IO scheduler prefers another one
                 * and this one holds no requests that bypass the scheduler.
                 */
                if (sq_hctx && sq_hctx != hctx &&
                    list_empty_careful(&hctx->dispatch))
                        continue;
                blk_mq_delay_run_hw_queue(hctx, msecs);
        }
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);

/*
 * Mark @hctx as stopped by setting BLK_MQ_S_STOPPED in its state.
 *
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queue() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        /* Cancel the delayed run_work before flagging the queue as stopped. */
        cancel_delayed_work(&hctx->run_work);

        set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

/*
 * This function is often used for pausing .queue_rq() by driver when
 * there isn't enough resource or some conditions aren't satisfied, and
 * BLK_STS_RESOURCE is usually returned.
 *
 * We do not guarantee that dispatch can be drained or blocked
 * after blk_mq_stop_hw_queues() returns. Please use
 * blk_mq_quiesce_queue() for that requirement.
 */
void blk_mq_stop_hw_queues(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
        clear_bit(BLK_MQ_S_STOPPED, &hctx->state);

        blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_hw_queues(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_start_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queues);

/*
 * Restart @hctx only if it is currently stopped: clear BLK_MQ_S_STOPPED and
 * run the queue, asynchronously when @async is true. A hctx that is not
 * stopped is left untouched.
 */
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
        if (blk_mq_hctx_stopped(hctx)) {
                clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
                /*
                 * Pairs with the smp_mb() in blk_mq_hctx_stopped() to order
                 * the clearing of BLK_MQ_S_STOPPED above and the checking of
                 * dispatch list in the subsequent routine.
                 */
                smp_mb__after_atomic();
                blk_mq_run_hw_queue(hctx, async);
        }
}
EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);

/*
 * Restart every stopped hardware queue of @q. A hctx is run asynchronously
 * when @async is true or when the hctx has BLK_MQ_F_BLOCKING set.
 */
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long idx;

        queue_for_each_hw_ctx(q, hctx, idx) {
                bool run_async = async || (hctx->flags & BLK_MQ_F_BLOCKING);

                blk_mq_start_stopped_hw_queue(hctx, run_async);
        }
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

/*
 * Work handler for hctx->run_work: recover the owning hctx from @work and
 * dispatch its requests.
 */
static void blk_mq_run_work_fn(struct work_struct *work)
{
        struct blk_mq_hw_ctx *hctx;

        hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
        blk_mq_run_dispatch_ops(hctx->queue,
                                blk_mq_sched_dispatch_requests(hctx));
}

/**
 * blk_mq_request_bypass_insert - Insert a request at dispatch list.
 * @rq: Pointer to request to be inserted.
 * @flags: BLK_MQ_INSERT_*
 *
 * Adds @rq to the dispatch list of its hctx under hctx->lock, at the head
 * when BLK_MQ_INSERT_AT_HEAD is set in @flags and at the tail otherwise.
 *
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 */
static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        spin_lock(&hctx->lock);
        if (!(flags & BLK_MQ_INSERT_AT_HEAD))
                list_add_tail(&rq->queuelist, &hctx->dispatch);
        else
                list_add(&rq->queuelist, &hctx->dispatch);
        spin_unlock(&hctx->lock);
}

/*
 * Insert the requests on @list, all of which must belong to @ctx, and run
 * @hctx afterwards.
 *
 * Unless @run_queue_async is set or the hw queue is busy, the requests are
 * first issued directly. Anything left on @list is spliced onto the tail of
 * the ctx list for the hctx type. The final queue run is asynchronous if
 * @run_queue_async was set or any inserted request carries REQ_NOWAIT.
 */
static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
                struct blk_mq_ctx *ctx, struct list_head *list,
                bool run_queue_async)
{
        struct request *rq;

        /*
         * Try to issue requests directly if the hw queue isn't busy to save an
         * extra enqueue & dequeue to the sw queue.
         */
        if (!hctx->dispatch_busy && !run_queue_async) {
                blk_mq_run_dispatch_ops(hctx->queue,
                        blk_mq_try_issue_list_directly(hctx, list));
                if (list_empty(list)) {
                        blk_mq_run_hw_queue(hctx, run_queue_async);
                        return;
                }
        }

        /*
         * preemption doesn't flush plug list, so it's possible ctx->cpu is
         * offline now
         */
        list_for_each_entry(rq, list, queuelist) {
                BUG_ON(rq->mq_ctx != ctx);
                trace_block_rq_insert(rq);
                if (rq->cmd_flags & REQ_NOWAIT)
                        run_queue_async = true;
        }

        spin_lock(&ctx->lock);
        list_splice_tail_init(list, &ctx->rq_lists[hctx->type]);
        blk_mq_hctx_mark_pending(hctx, ctx);
        spin_unlock(&ctx->lock);

        blk_mq_run_hw_queue(hctx, run_queue_async);
}

/*
 * Queue @rq for later dispatch. This function itself does not run the
 * hardware queue; the callers in this file follow it with
 * blk_mq_run_hw_queue().
 *
 * Where the request ends up depends on what it is:
 *  - a passthrough request goes to hctx->dispatch, honouring @flags;
 *  - a REQ_OP_FLUSH request goes to the head of hctx->dispatch;
 *  - with an elevator attached, the request is handed to the scheduler's
 *    ->insert_requests() hook together with @flags;
 *  - otherwise it is added to ctx->rq_lists[hctx->type] (head or tail
 *    according to BLK_MQ_INSERT_AT_HEAD in @flags) and the ctx is marked
 *    pending in the hctx.
 */
static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
{
        struct request_queue *q = rq->q;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        if (blk_rq_is_passthrough(rq)) {
                /*
                 * Passthrough request have to be added to hctx->dispatch
                 * directly.  The device may be in a situation where it can't
                 * handle FS request, and always returns BLK_STS_RESOURCE for
                 * them, which gets them added to hctx->dispatch.
                 *
                 * If a passthrough request is required to unblock the queues,
                 * and it is added to the scheduler queue, there is no chance to
                 * dispatch it given we prioritize requests in hctx->dispatch.
                 */
                blk_mq_request_bypass_insert(rq, flags);
        } else if (req_op(rq) == REQ_OP_FLUSH) {
                /*
                 * Firstly normal IO request is inserted to scheduler queue or
                 * sw queue, meantime we add flush request to dispatch queue(
                 * hctx->dispatch) directly and there is at most one in-flight
                 * flush request for each hw queue, so it doesn't matter to add
                 * flush request to tail or front of the dispatch queue.
                 *
                 * Secondly in case of NCQ, flush request belongs to non-NCQ
                 * command, and queueing it will fail when there is any
                 * in-flight normal IO request(NCQ command). When adding flush
                 * rq to the front of hctx->dispatch, it is easier to introduce
                 * extra time to flush rq's latency because of S_SCHED_RESTART
                 * compared with adding to the tail of dispatch queue, then
                 * chance of flush merge is increased, and less flush requests
                 * will be issued to controller. It is observed that ~10% time
                 * is saved in blktests block/004 on disk attached to AHCI/NCQ
                 * drive when adding flush rq to the front of hctx->dispatch.
                 *
                 * Simply queue flush rq to the front of hctx->dispatch so that
                 * intensive flush workloads can benefit in case of NCQ HW.
                 */
                blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
        } else if (q->elevator) {
                LIST_HEAD(list);

                /* A request going to the scheduler must not own a tag yet. */
                WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);

                /* The scheduler hook takes a list, so wrap the single rq. */
                list_add(&rq->queuelist, &list);
                q->elevator->type->ops.insert_requests(hctx, &list, flags);
        } else {
                trace_block_rq_insert(rq);

                spin_lock(&ctx->lock);
                if (flags & BLK_MQ_INSERT_AT_HEAD)
                        list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
                else
                        list_add_tail(&rq->queuelist,
                                      &ctx->rq_lists[hctx->type]);
                blk_mq_hctx_mark_pending(hctx, ctx);
                spin_unlock(&ctx->lock);
        }
}

/*
 * Initialize @rq from @bio: the bio becomes the only bio of the request,
 * and sector, data length and segment counts are copied over. Read-ahead
 * bios additionally get all REQ_FAILFAST_MASK flags set on the request.
 * Finishes by preparing the crypto context and starting IO accounting.
 */
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
                unsigned int nr_segs)
{
        int crypt_err;

        if (bio->bi_opf & REQ_RAHEAD)
                rq->cmd_flags |= REQ_FAILFAST_MASK;

        rq->biotail = bio;
        rq->bio = bio;
        rq->__sector = bio->bi_iter.bi_sector;
        rq->__data_len = bio->bi_iter.bi_size;
        rq->nr_phys_segments = nr_segs;
        if (bio_integrity(bio)) {
                rq->nr_integrity_segments =
                        blk_rq_count_integrity_sg(rq->q, bio);
        }

        /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
        crypt_err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
        WARN_ON_ONCE(crypt_err);

        blk_account_io_start(rq);
}

/*
 * Hand @rq to the driver's ->queue_rq() and update the dispatch-busy state
 * of @hctx from the outcome. @last is passed to the driver through
 * blk_mq_queue_data.
 *
 * Returns the driver status unchanged. On BLK_STS_RESOURCE and
 * BLK_STS_DEV_RESOURCE the request has been requeued and the hctx marked
 * busy; the caller still has to put the request on a list. For any other
 * status the hctx is marked not busy, and on error the caller may kill the
 * request.
 */
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
                                            struct request *rq, bool last)
{
        struct blk_mq_queue_data bd = {
                .rq = rq,
                .last = last,
        };
        blk_status_t status;

        status = rq->q->mq_ops->queue_rq(hctx, &bd);
        if (status == BLK_STS_RESOURCE || status == BLK_STS_DEV_RESOURCE) {
                blk_mq_update_dispatch_busy(hctx, true);
                __blk_mq_requeue_request(rq);
        } else {
                blk_mq_update_dispatch_busy(hctx, false);
        }

        return status;
}

/*
 * Acquire both a dispatch budget and a driver tag for @rq.
 *
 * Returns true when both were obtained. On failure nothing is held: a budget
 * that was already taken is put back if no driver tag could be had.
 */
static bool blk_mq_get_budget_and_tag(struct request *rq)
{
        int token = blk_mq_get_dispatch_budget(rq->q);

        if (token < 0)
                return false;

        blk_mq_set_rq_budget_token(rq, token);
        if (blk_mq_get_driver_tag(rq))
                return true;

        blk_mq_put_dispatch_budget(rq->q, token);
        return false;
}

/**
 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
 * @hctx: Pointer of the associated hardware queue.
 * @rq: Pointer to request to be sent.
 *
 * If the hardware queue is stopped, the request queue is quiesced, the
 * request has to go through the scheduler, or no budget and driver tag can
 * be obtained, the request is inserted via blk_mq_insert_request() and the
 * hardware queue is run. Otherwise the request is sent directly to the
 * device driver. If the driver reports a resource shortage, the request is
 * inserted at the hctx->dispatch queue so it can be tried again later;
 * requests on that queue have higher priority. Any other error ends the
 * request with that status.
 */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
                struct request *rq)
{
        blk_status_t status;

        if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
                blk_mq_insert_request(rq, 0);
                blk_mq_run_hw_queue(hctx, false);
                return;
        }

        if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
                blk_mq_insert_request(rq, 0);
                blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
                return;
        }

        status = __blk_mq_issue_directly(hctx, rq, true);
        if (status == BLK_STS_OK)
                return;

        if (status == BLK_STS_RESOURCE || status == BLK_STS_DEV_RESOURCE) {
                blk_mq_request_bypass_insert(rq, 0);
                blk_mq_run_hw_queue(hctx, false);
                return;
        }

        blk_mq_end_request(rq, status);
}

/*
 * Issue @rq straight to the driver of its hctx, passing @last along.
 *
 * If the hctx is stopped or the queue is quiesced, the request is inserted
 * and the queue run instead, and BLK_STS_OK is returned. BLK_STS_RESOURCE is
 * returned when no budget or driver tag is available; otherwise the result
 * of __blk_mq_issue_directly() is returned.
 */
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
        struct blk_mq_hw_ctx *hctx = rq->mq_hctx;

        if (!blk_mq_hctx_stopped(hctx) && !blk_queue_quiesced(rq->q)) {
                if (!blk_mq_get_budget_and_tag(rq))
                        return BLK_STS_RESOURCE;
                return __blk_mq_issue_directly(hctx, rq, last);
        }

        blk_mq_insert_request(rq, 0);
        blk_mq_run_hw_queue(hctx, false);
        return BLK_STS_OK;
}

/*
 * Pop the requests off @plug->mq_list one by one and issue each directly to
 * its driver.
 *
 * The requests accepted so far are committed whenever the hctx changes from
 * one request to the next. A resource shortage stops the loop: the request
 * is put on the dispatch list, its hw queue is run, and the rest of the plug
 * list is left for the caller. Requests failing with any other error are
 * ended with that status and the loop carries on.
 */
static void blk_mq_plug_issue_direct(struct blk_plug *plug)
{
        struct blk_mq_hw_ctx *hctx = NULL;
        blk_status_t ret = BLK_STS_OK;
        struct request *rq;
        int queued = 0;

        while ((rq = rq_list_pop(&plug->mq_list))) {
                bool last = rq_list_empty(&plug->mq_list);

                if (hctx != rq->mq_hctx) {
                        if (hctx) {
                                blk_mq_commit_rqs(hctx, queued, false);
                                queued = 0;
                        }
                        hctx = rq->mq_hctx;
                }

                ret = blk_mq_request_issue_directly(rq, last);
                if (ret == BLK_STS_OK) {
                        queued++;
                        continue;
                }

                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                        blk_mq_request_bypass_insert(rq, 0);
                        blk_mq_run_hw_queue(hctx, false);
                        break;
                }

                blk_mq_end_request(rq, ret);
        }

        /* Commit only if the last issue attempt did not succeed. */
        if (ret != BLK_STS_OK)
                blk_mq_commit_rqs(hctx, queued, false);
}

/*
 * Hand the whole plug list to the driver's ->queue_rqs() hook, unless @q is
 * quiesced, in which case nothing is done. The caller must have checked
 * that the hook exists.
 */
static void __blk_mq_flush_plug_list(struct request_queue *q,
                                     struct blk_plug *plug)
{
        if (!blk_queue_quiesced(q))
                q->mq_ops->queue_rqs(&plug->mq_list);
}

/*
 * Dispatch one batch of requests from @plug->mq_list.
 *
 * The first request popped defines the batch: every later request with the
 * same hctx, the same ctx and the same passthrough-ness joins it, everything
 * else is moved to a requeue list that becomes the new @plug->mq_list. The
 * caller is expected to call again until the plug list is empty.
 *
 * @plug->mq_list must not be empty on entry, since the first popped request
 * is dereferenced unconditionally.
 *
 * @from_sched is passed on as the "async" argument when the hardware queue
 * is run, and inverted for the unplug trace event.
 */
static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
{
        struct blk_mq_hw_ctx *this_hctx = NULL;
        struct blk_mq_ctx *this_ctx = NULL;
        struct rq_list requeue_list = {};
        unsigned int depth = 0;
        bool is_passthrough = false;
        LIST_HEAD(list);

        do {
                struct request *rq = rq_list_pop(&plug->mq_list);

                if (!this_hctx) {
                        /* First request: it decides what this batch is. */
                        this_hctx = rq->mq_hctx;
                        this_ctx = rq->mq_ctx;
                        is_passthrough = blk_rq_is_passthrough(rq);
                } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
                           is_passthrough != blk_rq_is_passthrough(rq)) {
                        /* Not part of this batch, keep it for a later call. */
                        rq_list_add_tail(&requeue_list, rq);
                        continue;
                }
                list_add_tail(&rq->queuelist, &list);
                depth++;
        } while (!rq_list_empty(&plug->mq_list));

        /* Whatever did not match stays plugged. */
        plug->mq_list = requeue_list;
        trace_block_unplug(this_hctx->queue, depth, !from_sched);

        /* Hold a queue usage reference for the duration of the insert. */
        percpu_ref_get(&this_hctx->queue->q_usage_counter);
        /* passthrough requests should never be issued to the I/O scheduler */
        if (is_passthrough) {
                spin_lock(&this_hctx->lock);
                list_splice_tail_init(&list, &this_hctx->dispatch);
                spin_unlock(&this_hctx->lock);
                blk_mq_run_hw_queue(this_hctx, from_sched);
        } else if (this_hctx->queue->elevator) {
                this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
                                &list, 0);
                blk_mq_run_hw_queue(this_hctx, from_sched);
        } else {
                /* No elevator: blk_mq_insert_requests() also runs the queue. */
                blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
        }
        percpu_ref_put(&this_hctx->queue->q_usage_counter);
}

/*
 * Submit all requests held in @plug->mq_list.
 *
 * @plug: plug whose request list is flushed.
 * @from_schedule: forwarded to blk_mq_dispatch_plug_list(); when true the
 *                 direct-issue fast path is skipped.
 *
 * When !plug->multiple_queues, !plug->has_elevator and !@from_schedule, the
 * list is first offered to the driver's ->queue_rqs() hook (if there is
 * one) and then issued directly request by request. Whatever is still on
 * the list afterwards is dispatched batch by batch.
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
        struct request *rq;
        unsigned int depth;

        /*
         * We may have been called recursively midway through handling
         * plug->mq_list via a schedule() in the driver's queue_rq() callback.
         * To avoid mq_list changing under our feet, clear rq_count early and
         * bail out specifically if rq_count is 0 rather than checking
         * whether the mq_list is empty.
         */
        if (plug->rq_count == 0)
                return;
        depth = plug->rq_count;
        plug->rq_count = 0;

        if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
                struct request_queue *q;

                rq = rq_list_peek(&plug->mq_list);
                q = rq->q;
                trace_block_unplug(q, depth, true);

                /*
                 * Peek first request and see if we have a ->queue_rqs() hook.
                 * If we do, we can dispatch the whole plug list in one go. We
                 * already know at this point that all requests belong to the
                 * same queue, caller must ensure that's the case.
                 */
                if (q->mq_ops->queue_rqs) {
                        blk_mq_run_dispatch_ops(q,
                                __blk_mq_flush_plug_list(q, plug));
                        if (rq_list_empty(&plug->mq_list))
                                return;
                }

                /* Issue what ->queue_rqs() left behind, or everything. */
                blk_mq_run_dispatch_ops(q,
                                blk_mq_plug_issue_direct(plug));
                if (rq_list_empty(&plug->mq_list))
                        return;
        }

        /* Slow path: one call per (hctx, ctx, passthrough) batch. */
        do {
                blk_mq_dispatch_plug_list(plug, from_schedule);
        } while (!rq_list_empty(&plug->mq_list));
}

/*
 * Take the requests off @list one at a time and issue each directly to the
 * driver.
 *
 * A resource shortage stops the loop: the request is put on the dispatch
 * list of its hctx and the remaining requests stay on @list for the caller.
 * @hctx is run here only if nothing is left on @list. Requests failing with
 * any other error are ended with that status and the loop carries on.
 */
static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
                struct list_head *list)
{
        blk_status_t ret = BLK_STS_OK;
        int queued = 0;

        while (!list_empty(list)) {
                struct request *rq;

                rq = list_first_entry(list, struct request, queuelist);
                list_del_init(&rq->queuelist);

                ret = blk_mq_request_issue_directly(rq, list_empty(list));
                if (ret == BLK_STS_OK) {
                        queued++;
                        continue;
                }

                if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
                        blk_mq_request_bypass_insert(rq, 0);
                        if (list_empty(list))
                                blk_mq_run_hw_queue(hctx, false);
                        break;
                }

                blk_mq_end_request(rq, ret);
        }

        /* Commit only if the last issue attempt did not succeed. */
        if (ret != BLK_STS_OK)
                blk_mq_commit_rqs(hctx, queued, false);
}

/*
 * Try to merge @bio into an existing request, first through the plug and
 * then through the scheduler. No attempt is made when merging is disabled
 * on @q or @bio is not mergeable. Returns true if the bio was merged.
 */
static bool blk_mq_attempt_bio_merge(struct request_queue *q,
                                     struct bio *bio, unsigned int nr_segs)
{
        if (blk_queue_nomerges(q) || !bio_mergeable(bio))
                return false;

        return blk_attempt_plug_merge(q, bio, nr_segs) ||
               blk_mq_sched_bio_merge(q, bio, nr_segs);
}

static struct request *blk_mq_get_new_requests(struct request_queue *q,
                                               struct blk_plug *plug,
                                               struct bio *bio)
{
        struct blk_mq_alloc_data data = {
                .q                = q,
                .nr_tags        = 1,
                .cmd_flags        = bio->bi_opf,
        };
        struct request *rq;

        rq_qos_throttle(q, bio);

        if (plug) {
                data.nr_tags = plug->nr_ios;
                plug->nr_ios = 1;
                data.cached_rqs = &plug->cached_rqs;
        }

        rq = __blk_mq_alloc_requests(&data);
        if (unlikely(!rq))
                rq_qos_cleanup(q, bio);
        return rq;
}

/*
 * Check if there is a suitable cached request and return it.
 *
 * The first request cached in @plug is suitable when it belongs to @q, when
 * its hctx type equals the type wanted for @opf (a default-type hctx is also
 * accepted for a read), and when it agrees with @opf on being a flush.
 * The request is only peeked at, not removed. Returns NULL if there is no
 * plug or no suitable request.
 */
static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
                struct request_queue *q, blk_opf_t opf)
{
        enum hctx_type type = blk_mq_get_hctx_type(opf);
        enum hctx_type cached_type;
        struct request *rq;

        if (!plug)
                return NULL;

        rq = rq_list_peek(&plug->cached_rqs);
        if (!rq || rq->q != q)
                return NULL;

        cached_type = rq->mq_hctx->type;
        if (type != cached_type) {
                if (type != HCTX_TYPE_READ || cached_type != HCTX_TYPE_DEFAULT)
                        return NULL;
        }

        if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
                return NULL;

        return rq;
}

/*
 * Take the cached request @rq, previously peeked from @plug, into use for
 * @bio: remove it from the plug's cache, run the rq-qos throttle hook, and
 * reset its start time, command flags and queuelist. Warns once if @rq is
 * not the request at the head of the cache.
 */
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
                struct bio *bio)
{
        struct request *popped = rq_list_pop(&plug->cached_rqs);

        WARN_ON_ONCE(popped != rq);

        /*
         * The entry has to be popped before throttling: if any qos
         * ->throttle() ends up blocking, the plug gets flushed and the
         * cached_rq list is killed along with it.
         */
        rq_qos_throttle(rq->q, bio);

        blk_mq_rq_time_init(rq, blk_time_get_ns());
        rq->cmd_flags = bio->bi_opf;
        INIT_LIST_HEAD(&rq->queuelist);
}

/*
 * Return true if the size of @bio or its start offset in bytes is not a
 * multiple of the logical block size of @q.
 */
static bool bio_unaligned(const struct bio *bio, struct request_queue *q)
{
        unsigned int bs_mask = queue_logical_block_size(q) - 1;

        /* .bi_sector of any zero sized bio need to be initialized */
        return (bio->bi_iter.bi_size & bs_mask) ||
               ((bio->bi_iter.bi_sector << SECTOR_SHIFT) & bs_mask);
}

/**
 * blk_mq_submit_bio - Create and send a request to block device.
 * @bio: Bio pointer.
 *
 * Builds up a request structure for @bio on the request queue of its block
 * device and sends it to the device. The request may not be queued directly
 * to hardware if:
 * * This request can be merged with another one
 * * We want to place request at plug queue for possible future merging
 * * There is an IO scheduler active at this queue
 *
 * It will not queue the request if there is an error with the bio, or at the
 * request creation.
 */
void blk_mq_submit_bio(struct bio *bio)
{
        struct request_queue *q = bdev_get_queue(bio->bi_bdev);
        struct blk_plug *plug = current->plug;
        const int is_sync = op_is_sync(bio->bi_opf);
        struct blk_mq_hw_ctx *hctx;
        unsigned int nr_segs;
        struct request *rq;
        blk_status_t ret;

        /*
         * If the plug has a cached request for this queue, try to use it.
         */
        rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);

        /*
         * A BIO that was released from a zone write plug has already been
         * through the preparation in this function, already holds a reference
         * on the queue usage counter, and is the only write BIO in-flight for
         * the target zone. Go straight to preparing a request for it.
         */
        if (bio_zone_write_plugging(bio)) {
                nr_segs = bio->__bi_nr_segments;
                /*
                 * Both the BIO and the cached request hold a queue
                 * reference; drop the surplus one.
                 */
                if (rq)
                        blk_queue_exit(q);
                goto new_request;
        }

        bio = blk_queue_bounce(bio, q);

        /*
         * The cached request already holds a q_usage_counter reference and we
         * don't have to acquire a new one if we use it.
         */
        if (!rq) {
                if (unlikely(bio_queue_enter(bio)))
                        return;
        }

        /*
         * Device reconfiguration may change logical block size or reduce the
         * number of poll queues, so the checks for alignment and poll support
         * have to be done with queue usage counter held.
         */
        if (unlikely(bio_unaligned(bio, q))) {
                bio_io_error(bio);
                goto queue_exit;
        }

        if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
                bio->bi_status = BLK_STS_NOTSUPP;
                bio_endio(bio);
                goto queue_exit;
        }

        bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
        if (!bio)
                goto queue_exit;

        if (!bio_integrity_prep(bio))
                goto queue_exit;

        /* A merged bio needs no request of its own. */
        if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
                goto queue_exit;

        if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
                goto queue_exit;

new_request:
        if (rq) {
                blk_mq_use_cached_rq(rq, plug, bio);
        } else {
                rq = blk_mq_get_new_requests(q, plug, bio);
                if (unlikely(!rq)) {
                        if (bio->bi_opf & REQ_NOWAIT)
                                bio_wouldblock_error(bio);
                        goto queue_exit;
                }
        }

        trace_block_getrq(bio);

        rq_qos_track(q, rq, bio);

        blk_mq_bio_to_request(rq, bio, nr_segs);

        ret = blk_crypto_rq_get_keyslot(rq);
        if (ret != BLK_STS_OK) {
                /*
                 * NOTE(review): presumably blk_mq_free_request() also drops
                 * the queue reference owned by the request, hence the plain
                 * return instead of queue_exit - confirm.
                 */
                bio->bi_status = ret;
                bio_endio(bio);
                blk_mq_free_request(rq);
                return;
        }

        if (bio_zone_write_plugging(bio))
                blk_zone_write_plug_init_request(rq);

        /* A true return from blk_insert_flush() means we are done here. */
        if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
                return;

        /* With a plug, just collect the request; it is issued at unplug. */
        if (plug) {
                blk_add_rq_to_plug(plug, rq);
                return;
        }

        /*
         * Insert and run the queue asynchronously when the request has to
         * use the scheduler, or when the hw queue is busy and either there
         * is only one hw queue or the bio is not sync. Otherwise try to
         * issue the request straight to the driver.
         */
        hctx = rq->mq_hctx;
        if ((rq->rq_flags & RQF_USE_SCHED) ||
            (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
                blk_mq_insert_request(rq, 0);
                blk_mq_run_hw_queue(hctx, true);
        } else {
                blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
        }
        return;

queue_exit:
        /*
         * Don't drop the queue reference if we were trying to use a cached
         * request and thus didn't acquire one.
         */
        if (!rq)
                blk_queue_exit(q);
}

#ifdef CONFIG_BLK_MQ_STACKING
/**
 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
 * @rq: the request being queued
 *
 * Validates @rq against the size and segment limits of its queue, obtains
 * a crypto keyslot, starts IO accounting and issues the request directly to
 * the driver.
 *
 * Return: %BLK_STS_NOTSUPP if the request exceeds a maximum size of zero
 * sectors; %BLK_STS_IOERR if it exceeds a non-zero size limit or the segment
 * limit, or if should_fail_request() says so; the keyslot error if no
 * keyslot could be obtained; otherwise the status returned by
 * blk_mq_request_issue_directly().
 */
blk_status_t blk_insert_cloned_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        unsigned int max_sectors = blk_queue_get_max_sectors(rq);
        unsigned int max_segments = blk_rq_get_max_segments(rq);
        blk_status_t ret;

        if (blk_rq_sectors(rq) > max_sectors) {
                /*
                 * SCSI device does not have a good way to return if
                 * Write Same/Zero is actually supported. If a device rejects
                 * a non-read/write command (discard, write same,etc.) the
                 * low-level device driver will set the relevant queue limit to
                 * 0 to prevent blk-lib from issuing more of the offending
                 * operations. Commands queued prior to the queue limit being
                 * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
                 * errors being propagated to upper layers.
                 */
                if (max_sectors == 0)
                        return BLK_STS_NOTSUPP;

                printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
                        __func__, blk_rq_sectors(rq), max_sectors);
                return BLK_STS_IOERR;
        }

        /*
         * The queue settings related to segment counting may differ from the
         * original queue.
         */
        rq->nr_phys_segments = blk_recalc_rq_segments(rq);
        if (rq->nr_phys_segments > max_segments) {
                printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
                        __func__, rq->nr_phys_segments, max_segments);
                return BLK_STS_IOERR;
        }

        if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
                return BLK_STS_IOERR;

        ret = blk_crypto_rq_get_keyslot(rq);
        if (ret != BLK_STS_OK)
                return ret;

        blk_account_io_start(rq);

        /*
         * Since we have a scheduler attached on the top device,
         * bypass a potential scheduler on the bottom device for
         * insert.
         */
        blk_mq_run_dispatch_ops(q,
                        ret = blk_mq_request_issue_directly(rq, true));
        /* Close the accounting started above if the issue did not succeed. */
        if (ret)
                blk_account_io_done(rq, blk_time_get_ns());
        return ret;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);

/**
 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
 * @rq: the clone request to be cleaned up
 *
 * Description:
 *     Detach every bio from @rq, one after the other, and drop a reference
 *     on each with bio_put(). @rq->bio is NULL afterwards.
 */
void blk_rq_unprep_clone(struct request *rq)
{
        struct bio *bio;

        for (bio = rq->bio; bio != NULL; bio = rq->bio) {
                /* Unlink first, the bio may be gone after bio_put(). */
                rq->bio = bio->bi_next;
                bio_put(bio);
        }
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);

/**
 * blk_rq_prep_clone - Helper function to setup clone request
 * @rq: the request to be setup
 * @rq_src: original request to be cloned
 * @bs: bio_set that bios for clone are allocated from; &fs_bio_set is used
 *      when this is NULL
 * @gfp_mask: memory allocation mask for bio
 * @bio_ctr: optional setup function to be called for each clone bio.
 *           Returns %0 for success, non %0 for failure.
 * @data: private data to be passed to @bio_ctr
 *
 * Description:
 *     Clones every bio of @rq_src onto @rq and copies the position, length,
 *     special payload and segment counts of @rq_src to @rq.
 *     The pages referenced by the original bios are not copied; the cloned
 *     bios point to the same pages.  Cloned bios must therefore be completed
 *     before the original bios, which means the caller must complete @rq
 *     before @rq_src.
 *
 * Return:
 *     %0 on success.  -ENOMEM if a bio could not be cloned, @bio_ctr reported
 *     a failure, or blk_crypto_rq_bio_prep() failed; in that case every bio
 *     already attached to @rq has been released via blk_rq_unprep_clone().
 */
int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
                      struct bio_set *bs, gfp_t gfp_mask,
                      int (*bio_ctr)(struct bio *, struct bio *, void *),
                      void *data)
{
        struct bio *orig;

        /* Fall back to the fs bio_set when the caller supplied none. */
        if (!bs)
                bs = &fs_bio_set;

        __rq_for_each_bio(orig, rq_src) {
                struct bio *clone;

                clone = bio_alloc_clone(rq->q->disk->part0, orig, gfp_mask, bs);
                if (!clone)
                        goto free_and_out;

                if (bio_ctr && bio_ctr(clone, orig, data)) {
                        /* Not linked into @rq yet, so release it here. */
                        bio_put(clone);
                        goto free_and_out;
                }

                /* Append the clone to the tail of @rq's bio chain. */
                if (!rq->bio)
                        rq->bio = clone;
                else
                        rq->biotail->bi_next = clone;
                rq->biotail = clone;
        }

        /* Copy attributes of the original request to the clone request. */
        rq->__sector = blk_rq_pos(rq_src);
        rq->__data_len = blk_rq_bytes(rq_src);
        if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
                rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
                rq->special_vec = rq_src->special_vec;
        }
        rq->nr_phys_segments = rq_src->nr_phys_segments;
        rq->nr_integrity_segments = rq_src->nr_integrity_segments;

        /* Crypto preparation is only attempted when there is a bio. */
        if (!rq->bio)
                return 0;
        if (blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
                goto free_and_out;

        return 0;

free_and_out:
        blk_rq_unprep_clone(rq);

        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
#endif /* CONFIG_BLK_MQ_STACKING */

/*
 * Move every bio of @rq onto the tail of @list and leave @rq without bios.
 * @rq->__data_len is reset to zero whether or not @rq carried any bio.
 * The request must not have been partially completed before.
 */
void blk_steal_bios(struct bio_list *list, struct request *rq)
{
        struct bio *first = rq->bio;

        if (first) {
                /* Chain onto the existing tail, or start the list. */
                if (!list->tail)
                        list->head = first;
                else
                        list->tail->bi_next = first;
                list->tail = rq->biotail;

                rq->biotail = NULL;
                rq->bio = NULL;
        }

        rq->__data_len = 0;
}
EXPORT_SYMBOL_GPL(blk_steal_bios);

/* Size in bytes of an allocation of 2^@order pages. */
static size_t order_to_size(unsigned int order)
{
        size_t page_bytes = PAGE_SIZE;

        return page_bytes << order;
}

/*
 * Called before freeing the request pool in @tags: clear every entry of
 * @drv_tags->rqs[] that points into a page owned by @tags.
 */
static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
                                    struct blk_mq_tags *tags)
{
        unsigned long irq_flags;
        struct page *pg;

        /* Nothing to clear if the driver tags are not initialized ... */
        if (!drv_tags)
                return;
        /* ... or if the mapping belongs to the driver tags themselves. */
        if (drv_tags == tags)
                return;

        list_for_each_entry(pg, &tags->page_list, lru) {
                unsigned long first = (unsigned long)page_address(pg);
                unsigned long limit = first + order_to_size(pg->private);
                int tag;

                for (tag = 0; tag < drv_tags->nr_tags; tag++) {
                        struct request *rq = drv_tags->rqs[tag];
                        unsigned long addr = (unsigned long)rq;

                        /* Skip requests that do not live in this page. */
                        if (addr < first || addr >= limit)
                                continue;

                        WARN_ON_ONCE(req_ref_read(rq) != 0);
                        cmpxchg(&drv_tags->rqs[tag], rq, NULL);
                }
        }

        /*
         * Wait until all pending iterations are done.
         *
         * The request references have been cleared, and that is guaranteed
         * to be observed once ->lock has been released.
         */
        spin_lock_irqsave(&drv_tags->lock, irq_flags);
        spin_unlock_irqrestore(&drv_tags->lock, irq_flags);
}

/*
 * Release the requests backing @tags: run the driver's ->exit_request()
 * on each static request (when both exist), clear stale pointers in the
 * driver tags, then free every page on @tags->page_list.
 * Does nothing if @tags->page_list is empty.
 */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
                     unsigned int hctx_idx)
{
        struct blk_mq_tags *drv_tags;

        if (list_empty(&tags->page_list))
                return;

        drv_tags = blk_mq_is_shared_tags(set->flags) ?
                        set->shared_tags : set->tags[hctx_idx];

        if (tags->static_rqs && set->ops->exit_request) {
                int tag;

                for (tag = 0; tag < tags->nr_tags; tag++) {
                        struct request *rq = tags->static_rqs[tag];

                        if (rq) {
                                set->ops->exit_request(set, rq, hctx_idx);
                                tags->static_rqs[tag] = NULL;
                        }
                }
        }

        blk_mq_clear_rq_mapping(drv_tags, tags);

        while (!list_empty(&tags->page_list)) {
                struct page *pg = list_first_entry(&tags->page_list,
                                                   struct page, lru);

                list_del_init(&pg->lru);
                /*
                 * Drop the kmemleak object that blk_mq_alloc_rqs()
                 * registered for this page.
                 */
                kmemleak_free(page_address(pg));
                __free_pages(pg, pg->private);
        }
}

/*
 * Free the two request-pointer arrays of @tags (->rqs and ->static_rqs),
 * clearing each pointer once it has been freed, and then hand @tags itself
 * to blk_mq_free_tags().
 */
void blk_mq_free_rq_map(struct blk_mq_tags *tags)
{
        kfree(tags->rqs);
        tags->rqs = NULL;
        kfree(tags->static_rqs);
        tags->static_rqs = NULL;

        blk_mq_free_tags(tags);
}

/*
 * Find the queue map of @set whose range [queue_offset,
 * queue_offset + nr_queues) contains @hctx_idx and return its index as the
 * hctx type.  Falls back to HCTX_TYPE_DEFAULT when no map covers it.
 */
static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
                unsigned int hctx_idx)
{
        int type;

        for (type = 0; type < set->nr_maps; type++) {
                unsigned int first = set->map[type].queue_offset;
                unsigned int past_last = first + set->map[type].nr_queues;

                if (first <= hctx_idx && hctx_idx < past_last)
                        return type;
        }

        return HCTX_TYPE_DEFAULT;
}

/*
 * Look up the node for hardware queue @hctx_idx: resolve which queue map
 * the index belongs to, then ask blk_mq_hw_queue_to_node() about that map.
 * Callers in this file treat a NUMA_NO_NODE result as "use set->numa_node".
 */
static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
                unsigned int hctx_idx)
{
        enum hctx_type type = hctx_idx_to_type(set, hctx_idx);

        return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
}

static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
                                               unsigned int hctx_idx,
                                               unsigned int nr_tags,
                                               unsigned int reserved_tags)
{
        int node = blk_mq_get_hctx_node(set, hctx_idx);
        struct blk_mq_tags *tags;

        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        tags = blk_mq_init_tags(nr_tags, reserved_tags, set->flags, node);
        if (!tags)
                return NULL;

        tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
                                 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                 node);
        if (!tags->rqs)
                goto err_free_tags;

        tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
                                        GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
                                        node);
        if (!tags->static_rqs)
                goto err_free_rqs;

        return tags;

err_free_rqs:
        kfree(tags->rqs);
err_free_tags:
        blk_mq_free_tags(tags);
        return NULL;
}

/*
 * Run the driver's optional ->init_request() hook on @rq and, if that
 * succeeds (or is absent), mark the request MQ_RQ_IDLE.
 *
 * Returns 0 on success or the non-zero value returned by the hook.
 */
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
                               unsigned int hctx_idx, int node)
{
        if (set->ops->init_request) {
                int err = set->ops->init_request(set, rq, hctx_idx, node);

                if (err)
                        return err;
        }

        WRITE_ONCE(rq->state, MQ_RQ_IDLE);
        return 0;
}

/*
 * Allocate the memory backing @depth requests for @tags and initialize
 * each request.
 *
 * Requests are carved out of page allocations of at most order 4 that are
 * linked on @tags->page_list; a pointer to every request is stored in
 * @tags->static_rqs[] and blk_mq_init_request() is run on it.
 *
 * Returns 0 on success.  On a failed page allocation or a failed request
 * initialization everything is released through blk_mq_free_rqs() and
 * -ENOMEM is returned.
 */
static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
                            struct blk_mq_tags *tags,
                            unsigned int hctx_idx, unsigned int depth)
{
        unsigned int i, j, entries_per_page, max_order = 4;
        int node = blk_mq_get_hctx_node(set, hctx_idx);
        size_t rq_size, left;

        if (node == NUMA_NO_NODE)
                node = set->numa_node;

        INIT_LIST_HEAD(&tags->page_list);

        /*
         * rq_size is the size of the request plus driver payload, rounded
         * to the cacheline size
         */
        rq_size = round_up(sizeof(struct request) + set->cmd_size,
                                cache_line_size());
        /* Bytes still needed for the requests not yet placed. */
        left = rq_size * depth;

        /* i counts requests set up so far; it advances in the inner loop. */
        for (i = 0; i < depth; ) {
                int this_order = max_order;
                struct page *page;
                int to_do;
                void *p;

                /*
                 * Shrink the order while the next smaller allocation would
                 * still hold everything that is left.
                 */
                while (this_order && left < order_to_size(this_order - 1))
                        this_order--;

                /*
                 * Try the chosen order and fall back to smaller ones on
                 * failure.  Give up once order 0 has failed or the next
                 * smaller order could not hold a single request.
                 */
                do {
                        page = alloc_pages_node(node,
                                GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
                                this_order);
                        if (page)
                                break;
                        if (!this_order--)
                                break;
                        if (order_to_size(this_order) < rq_size)
                                break;
                } while (1);

                if (!page)
                        goto fail;

                /* Remember the order; it is read back when the page is freed. */
                page->private = this_order;
                list_add_tail(&page->lru, &tags->page_list);

                p = page_address(page);
                /*
                 * Allow kmemleak to scan these pages as they contain pointers
                 * to additional allocations like via ops->init_request().
                 */
                kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
                entries_per_page = order_to_size(this_order) / rq_size;
                to_do = min(entries_per_page, depth - i);
                left -= to_do * rq_size;
                for (j = 0; j < to_do; j++) {
                        struct request *rq = p;

                        tags->static_rqs[i] = rq;
                        if (blk_mq_init_request(set, rq, hctx_idx, node)) {
                                /*
                                 * Clear the slot so the cleanup below skips
                                 * this request when calling ->exit_request().
                                 */
                                tags->static_rqs[i] = NULL;
                                goto fail;
                        }

                        p += rq_size;
                        i++;
                }
        }
        return 0;

fail:
        blk_mq_free_rqs(set, tags, hctx_idx);
        return -ENOMEM;
}

/* State shared with blk_mq_has_request() while iterating over tags. */
struct rq_iter_data {
        struct blk_mq_hw_ctx *hctx;     /* hardware queue being looked for */
        bool has_rq;                    /* set once a request on @hctx is seen */
};

/*
 * Tag iterator callback: record in @data (a struct rq_iter_data) whether
 * @rq belongs to the hardware queue being searched for.
 *
 * Returns false as soon as a matching request is found and true otherwise.
 */
static bool blk_mq_has_request(struct request *rq, void *data)
{
        struct rq_iter_data *iter = data;

        if (rq->mq_hctx == iter->hctx) {
                iter->has_rq = true;
                return false;
        }

        return true;
}

/*
 * Report whether any request found by blk_mq_all_tag_iter() belongs to
 * @hctx.  The scheduler tags are iterated when present, the hardware
 * queue's own tags otherwise.
 */
static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
{
        struct rq_iter_data data = { .hctx = hctx };
        struct blk_mq_tags *tags;

        if (hctx->sched_tags)
                tags = hctx->sched_tags;
        else
                tags = hctx->tags;

        blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
        return data.has_rq;
}

/*
 * Return true if some online CPU other than @this_cpu is mapped to @hctx.
 */
static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx,
                unsigned int this_cpu)
{
        enum hctx_type type = hctx->type;
        int cpu;

        /*
         * hctx->cpumask has to rule out isolated CPUs, but userspace still
         * might submit IOs on these isolated CPUs, so consult the queue map
         * instead to check whether all CPUs mapped to this hctx are offline.
         */
        for_each_online_cpu(cpu) {
                struct blk_mq_hw_ctx *mapped;

                mapped = blk_mq_map_queue_type(hctx->queue, type, cpu);

                /* this hctx has at least one other online CPU */
                if (mapped == hctx && this_cpu != cpu)
                        return true;
        }

        return false;
}

static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);

        if (blk_mq_hctx_has_online_cpu(hctx, cpu))
                return 0;

        /*
         * Prevent new request from being allocated on the current hctx.
         *
         * The smp_mb__after_atomic() Pairs with the implied barrier in
         * test_and_set_bit_lock in sbitmap_get().  Ensures the inactive flag is
         * seen once we return from the tag allocator.
         */
        set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        smp_mb__after_atomic();

        /*
         * Try to grab a reference to the queue and wait for any outstanding
         * requests.  If we could not grab a reference the queue has been
         * frozen and there are no requests.
         */
        if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
                while (blk_mq_hctx_has_requests(hctx))
                        msleep(5);
                percpu_ref_put(&hctx->queue->q_usage_counter);
        }

        return 0;
}

/*
 * Check whether @cpu is mapped to @hctx according to the queue map.
 *
 * Isolated CPUs have been ruled out from hctx->cpumask, which is supposed
 * to be used for scheduling kworkers only.  For any other purpose, call
 * this helper to find out whether a CPU belongs to the specified hctx.
 */
static bool blk_mq_cpu_mapped_to_hctx(unsigned int cpu,
                const struct blk_mq_hw_ctx *hctx)
{
        return blk_mq_map_queue_type(hctx->queue, hctx->type, cpu) == hctx;
}

/*
 * CPU hotplug callback, reached through the hctx's cpuhp_online node.
 * Clears BLK_MQ_S_INACTIVE when @cpu is mapped to the hctx.
 * Always returns 0.
 */
static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
                        struct blk_mq_hw_ctx, cpuhp_online);

        if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
                return 0;

        clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
        return 0;
}

/*
 * 'cpu' is going away. splice any existing rq_list entries from this
 * software queue to the hw queue dispatch list, and ensure that it
 * gets run.
 */
static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        LIST_HEAD(tmp);
        enum hctx_type type;

        hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
        if (!blk_mq_cpu_mapped_to_hctx(cpu, hctx))
                return 0;

        ctx = __blk_mq_get_ctx(hctx->queue, cpu);
        type = hctx->type;

        spin_lock(&ctx->lock);
        if (!list_empty(&ctx->rq_lists[type])) {
                list_splice_init(&ctx->rq_lists[type], &tmp);
                blk_mq_hctx_clear_pending(hctx, ctx);
        }
        spin_unlock(&ctx->lock);

        if (list_empty(&tmp))
                return 0;

        spin_lock(&hctx->lock);
        list_splice_tail_init(&tmp, &hctx->dispatch);
        spin_unlock(&hctx->lock);

        blk_mq_run_hw_queue(hctx, true);
        return 0;
}

/*
 * Unregister @hctx's cpuhp instances that are currently hashed and
 * re-initialize their hlist nodes.  The ONLINE instance is left alone
 * for BLK_MQ_F_STACKING queues.  Caller must hold blk_mq_cpuhp_lock.
 */
static void __blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
        struct hlist_node *online = &hctx->cpuhp_online;
        struct hlist_node *dead = &hctx->cpuhp_dead;

        lockdep_assert_held(&blk_mq_cpuhp_lock);

        if (!(hctx->flags & BLK_MQ_F_STACKING) && !hlist_unhashed(online)) {
                cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                                    online);
                INIT_HLIST_NODE(online);
        }

        if (!hlist_unhashed(dead)) {
                cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, dead);
                INIT_HLIST_NODE(dead);
        }
}

/* Locked wrapper: run __blk_mq_remove_cpuhp() under blk_mq_cpuhp_lock. */
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
        mutex_lock(&blk_mq_cpuhp_lock);
        __blk_mq_remove_cpuhp(hctx);
        mutex_unlock(&blk_mq_cpuhp_lock);
}

/*
 * Register @hctx's cpuhp instances that are not hashed yet.  The ONLINE
 * instance is not registered for BLK_MQ_F_STACKING queues.
 * Caller must hold blk_mq_cpuhp_lock.
 */
static void __blk_mq_add_cpuhp(struct blk_mq_hw_ctx *hctx)
{
        struct hlist_node *online = &hctx->cpuhp_online;
        struct hlist_node *dead = &hctx->cpuhp_dead;

        lockdep_assert_held(&blk_mq_cpuhp_lock);

        if (!(hctx->flags & BLK_MQ_F_STACKING) && hlist_unhashed(online))
                cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
                                online);

        if (hlist_unhashed(dead))
                cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, dead);
}

/*
 * Run __blk_mq_remove_cpuhp() on every hctx linked on @head through
 * ->hctx_list.  Caller must hold blk_mq_cpuhp_lock.
 */
static void __blk_mq_remove_cpuhp_list(struct list_head *head)
{
        struct blk_mq_hw_ctx *hctx;

        lockdep_assert_held(&blk_mq_cpuhp_lock);

        list_for_each_entry(hctx, head, hctx_list)
                __blk_mq_remove_cpuhp(hctx);
}

/*
 * Unregister cpuhp callbacks from exited hw queues, i.e. the hctxs parked
 * on q->unused_hctx_list.
 *
 * Safe to call if this `request_queue` is live
 */
static void blk_mq_remove_hw_queues_cpuhp(struct request_queue *q)
{
        LIST_HEAD(hctx_list);

        /*
         * Move the unused hctxs to a private list so they can be walked
         * without holding the spinlock.
         * NOTE(review): presumably needed because blk_mq_cpuhp_lock is a
         * mutex and cannot be taken under the spinlock -- confirm.
         */
        spin_lock(&q->unused_hctx_lock);
        list_splice_init(&q->unused_hctx_list, &hctx_list);
        spin_unlock(&q->unused_hctx_lock);

        mutex_lock(&blk_mq_cpuhp_lock);
        __blk_mq_remove_cpuhp_list(&hctx_list);
        mutex_unlock(&blk_mq_cpuhp_lock);

        /* Hand the hctxs back to the queue's unused list. */
        spin_lock(&q->unused_hctx_lock);
        list_splice(&hctx_list, &q->unused_hctx_list);
        spin_unlock(&q->unused_hctx_lock);
}

/*
 * Register cpuhp callbacks for all hw queues of @q.  Instances that are
 * already registered are skipped by __blk_mq_add_cpuhp().
 *
 * Safe to call if this `request_queue` is live
 */
static void blk_mq_add_hw_queues_cpuhp(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        mutex_lock(&blk_mq_cpuhp_lock);
        queue_for_each_hw_ctx(q, hctx, i)
                __blk_mq_add_cpuhp(hctx);
        mutex_unlock(&blk_mq_cpuhp_lock);
}

/*
 * Before freeing a hw queue, clear every reference to its flush request
 * in tags->rqs[] to avoid a potential use-after-free.
 */
static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
                unsigned int queue_depth, struct request *flush_rq)
{
        unsigned long irq_flags;
        int tag;

        /* The hw queue may not be mapped yet */
        if (!tags)
                return;

        WARN_ON_ONCE(req_ref_read(flush_rq) != 0);

        tag = 0;
        while (tag < queue_depth) {
                cmpxchg(&tags->rqs[tag], flush_rq, NULL);
                tag++;
        }

        /*
         * Wait until all pending iterations are done.
         *
         * The request reference has been cleared, and that is guaranteed
         * to be observed once ->lock has been released.
         */
        spin_lock_irqsave(&tags->lock, irq_flags);
        spin_unlock_irqrestore(&tags->lock, irq_flags);
}

/*
 * Tear down hardware queue @hctx (index @hctx_idx) of @q: idle its tags if
 * it is mapped, clear stale flush-request pointers, run the driver's
 * optional ->exit_request() and ->exit_hctx() hooks, remove the hctx from
 * q->hctx_table and park it on q->unused_hctx_list.
 *
 * hctx->ctxs will be freed in queue's release handler
 */
static void blk_mq_exit_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct request *flush_rq = hctx->fq->flush_rq;

        if (blk_mq_hw_queue_mapped(hctx))
                blk_mq_tag_idle(hctx);

        /* Only done once blk_queue_init_done() reports true for @q. */
        if (blk_queue_init_done(q))
                blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
                                set->queue_depth, flush_rq);
        if (set->ops->exit_request)
                set->ops->exit_request(set, flush_rq, hctx_idx);

        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);

        xa_erase(&q->hctx_table, hctx_idx);

        /* The hctx is not freed here; it is kept on the unused list. */
        spin_lock(&q->unused_hctx_lock);
        list_add(&hctx->hctx_list, &q->unused_hctx_list);
        spin_unlock(&q->unused_hctx_lock);
}

/*
 * Unregister the cpuhp callbacks of, and exit, the hardware queues of @q
 * with an index below @nr_queue.
 */
static void blk_mq_exit_hw_queues(struct request_queue *q,
                struct blk_mq_tag_set *set, int nr_queue)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i) {
                /* Stop once @nr_queue hardware queues have been handled. */
                if (i == nr_queue)
                        break;
                blk_mq_remove_cpuhp(hctx);
                blk_mq_exit_hctx(q, set, hctx, i);
        }
}

/*
 * Bring up hardware queue @hctx as index @hctx_idx of @q: bind it to
 * set->tags[@hctx_idx], run the driver's optional ->init_hctx() hook,
 * initialize the flush request and insert the hctx into q->hctx_table.
 *
 * Returns 0 on success and -1 on failure.  On failure the steps that had
 * already succeeded are undone through the matching ->exit_request() /
 * ->exit_hctx() hooks.
 */
static int blk_mq_init_hctx(struct request_queue *q,
                struct blk_mq_tag_set *set,
                struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
{
        hctx->queue_num = hctx_idx;

        hctx->tags = set->tags[hctx_idx];

        if (set->ops->init_hctx &&
            set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
                goto fail;

        if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
                                hctx->numa_node))
                goto exit_hctx;

        if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
                goto exit_flush_rq;

        return 0;

        /* Unwind in reverse order of initialization. */
 exit_flush_rq:
        if (set->ops->exit_request)
                set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
 exit_hctx:
        if (set->ops->exit_hctx)
                set->ops->exit_hctx(hctx, hctx_idx);
 fail:
        return -1;
}

/*
 * Allocate a hardware queue context for @q and initialize the fields that
 * do not depend on its index: cpumask, run work, locks, lists, cpuhp
 * nodes, the ctxs array, the ctx bitmap, the dispatch wait entry, the
 * flush queue and the kobject.
 *
 * Returns the new hctx, or NULL if any allocation failed; in that case
 * everything allocated so far has been freed again.
 */
static struct blk_mq_hw_ctx *
blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
                int node)
{
        struct blk_mq_hw_ctx *hctx;
        gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;

        hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
        if (!hctx)
                goto fail_alloc_hctx;

        if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node))
                goto free_hctx;

        atomic_set(&hctx->nr_active, 0);
        /*
         * The two allocations above used @node as passed in; from here on
         * NUMA_NO_NODE is replaced by the tag set's node.
         */
        if (node == NUMA_NO_NODE)
                node = set->numa_node;
        hctx->numa_node = node;

        INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
        spin_lock_init(&hctx->lock);
        INIT_LIST_HEAD(&hctx->dispatch);
        INIT_HLIST_NODE(&hctx->cpuhp_dead);
        INIT_HLIST_NODE(&hctx->cpuhp_online);
        hctx->queue = q;
        /* Inherit the set's flags, minus the per-queue "shared" bit. */
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;

        INIT_LIST_HEAD(&hctx->hctx_list);

        /*
         * Allocate space for all possible cpus to avoid allocation at
         * runtime
         */
        hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *),
                        gfp, node);
        if (!hctx->ctxs)
                goto free_cpumask;

        if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
                                gfp, node, false, false))
                goto free_ctxs;
        hctx->nr_ctx = 0;

        spin_lock_init(&hctx->dispatch_wait_lock);
        init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
        INIT_LIST_HEAD(&hctx->dispatch_wait.entry);

        hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
        if (!hctx->fq)
                goto free_bitmap;

        blk_mq_hctx_kobj_init(hctx);

        return hctx;

        /* Error unwind: release resources in reverse order of acquisition. */
 free_bitmap:
        sbitmap_free(&hctx->ctx_map);
 free_ctxs:
        kfree(hctx->ctxs);
 free_cpumask:
        free_cpumask_var(hctx->cpumask);
 free_hctx:
        kfree(hctx);
 fail_alloc_hctx:
        return NULL;
}

/*
 * Initialize the per-CPU software queue of @q for every possible CPU
 * (cpu number, lock, request lists, back pointer to @q) and, when there is
 * more than one hardware queue, assign a node to hardware queues that do
 * not have one yet.
 */
static void blk_mq_init_cpu_queues(struct request_queue *q,
                                   unsigned int nr_hw_queues)
{
        struct blk_mq_tag_set *set = q->tag_set;
        unsigned int cpu, map_idx;

        for_each_possible_cpu(cpu) {
                struct blk_mq_ctx *swq = per_cpu_ptr(q->queue_ctx, cpu);
                int type;

                swq->cpu = cpu;
                spin_lock_init(&swq->lock);
                for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++)
                        INIT_LIST_HEAD(&swq->rq_lists[type]);

                swq->queue = q;

                /*
                 * Set local node, IFF we have more than one hw queue. If
                 * not, we remain on the home node of the device
                 */
                for (map_idx = 0; map_idx < set->nr_maps; map_idx++) {
                        struct blk_mq_hw_ctx *hctx;

                        hctx = blk_mq_map_queue_type(q, map_idx, cpu);
                        if (nr_hw_queues <= 1)
                                continue;
                        if (hctx->numa_node == NUMA_NO_NODE)
                                hctx->numa_node = cpu_to_node(cpu);
                }
        }
}

/*
 * Allocate a tag map of @depth entries for hardware queue @hctx_idx and
 * the requests behind it.
 *
 * Returns the tags, or NULL if either step failed (a map whose request
 * allocation failed is freed again).
 */
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
                                             unsigned int hctx_idx,
                                             unsigned int depth)
{
        struct blk_mq_tags *tags;

        tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
        if (!tags)
                return NULL;

        if (blk_mq_alloc_rqs(set, tags, hctx_idx, depth)) {
                blk_mq_free_rq_map(tags);
                return NULL;
        }

        return tags;
}

/*
 * Populate set->tags[@hctx_idx]: point it at the shared tags when the set
 * uses shared tags, otherwise allocate a fresh map and requests of
 * set->queue_depth entries.
 *
 * Returns true if the slot ends up with valid tags, false otherwise.
 */
static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
                                       int hctx_idx)
{
        struct blk_mq_tags *tags;

        if (blk_mq_is_shared_tags(set->flags)) {
                set->tags[hctx_idx] = set->shared_tags;
                return true;
        }

        tags = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth);
        set->tags[hctx_idx] = tags;

        return tags != NULL;
}

/*
 * Free the requests of @tags and then the tag map itself.
 * A NULL @tags is accepted and ignored.
 */
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
                             struct blk_mq_tags *tags,
                             unsigned int hctx_idx)
{
        if (!tags)
                return;

        blk_mq_free_rqs(set, tags, hctx_idx);
        blk_mq_free_rq_map(tags);
}

/*
 * Release set->tags[@hctx_idx].  The map and requests are only freed when
 * the set does not use shared tags; the slot is cleared in either case.
 */
static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
                                      unsigned int hctx_idx)
{
        if (!blk_mq_is_shared_tags(set->flags))
                blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);

        set->tags[hctx_idx] = NULL;
}

/*
 * (Re)build the mapping between the per-CPU software queues of @q and its
 * hardware queues, under q->elevator_lock:
 *
 *  1. reset cpumask, nr_ctx and dispatch_from of every hardware queue;
 *  2. for every possible CPU and every queue map, resolve the hardware
 *     queue, allocating its tags on demand, and record the software queue
 *     in it;
 *  3. for every hardware queue, either release it (no software queue
 *     mapped) or finish its setup (tags, ctx_map size, cpumask, next_cpu).
 */
static void blk_mq_map_swqueue(struct request_queue *q)
{
        unsigned int j, hctx_idx;
        unsigned long i;
        struct blk_mq_hw_ctx *hctx;
        struct blk_mq_ctx *ctx;
        struct blk_mq_tag_set *set = q->tag_set;

        mutex_lock(&q->elevator_lock);

        queue_for_each_hw_ctx(q, hctx, i) {
                cpumask_clear(hctx->cpumask);
                hctx->nr_ctx = 0;
                hctx->dispatch_from = NULL;
        }

        /*
         * Map software to hardware queues.
         *
         * If the cpu isn't present, the cpu is mapped to first hctx.
         */
        for_each_possible_cpu(i) {

                ctx = per_cpu_ptr(q->queue_ctx, i);
                for (j = 0; j < set->nr_maps; j++) {
                        /* A map without queues falls back to the default type. */
                        if (!set->map[j].nr_queues) {
                                ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                                HCTX_TYPE_DEFAULT, i);
                                continue;
                        }
                        hctx_idx = set->map[j].mq_map[i];
                        /* unmapped hw queue can be remapped after CPU topo changed */
                        if (!set->tags[hctx_idx] &&
                            !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
                                /*
                                 * If tags initialization fail for some hctx,
                                 * that hctx won't be brought online.  In this
                                 * case, remap the current ctx to hctx[0] which
                                 * is guaranteed to always have tags allocated
                                 */
                                set->map[j].mq_map[i] = 0;
                        }

                        hctx = blk_mq_map_queue_type(q, j, i);
                        ctx->hctxs[j] = hctx;
                        /*
                         * If the CPU is already set in the mask, then we've
                         * mapped this one already. This can happen if
                         * devices share queues across queue maps.
                         */
                        if (cpumask_test_cpu(i, hctx->cpumask))
                                continue;

                        cpumask_set_cpu(i, hctx->cpumask);
                        hctx->type = j;
                        ctx->index_hw[hctx->type] = hctx->nr_ctx;
                        hctx->ctxs[hctx->nr_ctx++] = ctx;

                        /*
                         * If the nr_ctx type overflows, we have exceeded the
                         * amount of sw queues we can support.
                         */
                        BUG_ON(!hctx->nr_ctx);
                }

                /* Types beyond set->nr_maps all use the default-type hctx. */
                for (; j < HCTX_MAX_TYPES; j++)
                        ctx->hctxs[j] = blk_mq_map_queue_type(q,
                                        HCTX_TYPE_DEFAULT, i);
        }

        queue_for_each_hw_ctx(q, hctx, i) {
                int cpu;

                /*
                 * If no software queues are mapped to this hardware queue,
                 * disable it and free the request entries.
                 */
                if (!hctx->nr_ctx) {
                        /* Never unmap queue 0.  We need it as a
                         * fallback in case of a new remap fails
                         * allocation
                         */
                        if (i)
                                __blk_mq_free_map_and_rqs(set, i);

                        hctx->tags = NULL;
                        continue;
                }

                hctx->tags = set->tags[i];
                WARN_ON(!hctx->tags);

                /*
                 * Set the map size to the number of mapped software queues.
                 * This is more accurate and more efficient than looping
                 * over all possibly mapped software queues.
                 */
                sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

                /*
                 * Rule out isolated CPUs from hctx->cpumask to avoid
                 * running block kworker on isolated CPUs
                 */
                for_each_cpu(cpu, hctx->cpumask) {
                        if (cpu_is_isolated(cpu))
                                cpumask_clear_cpu(cpu, hctx->cpumask);
                }

                /*
                 * Initialize batch roundrobin counts
                 */
                hctx->next_cpu = blk_mq_first_mapped_cpu(hctx);
                hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
        }

        mutex_unlock(&q->elevator_lock);
}

/*
 * Set or clear BLK_MQ_F_TAG_QUEUE_SHARED on every hardware queue of @q.
 * When clearing, blk_mq_tag_idle() is called on the hctx first.
 *
 * Caller needs to ensure that we're either frozen/quiesced, or that
 * the queue isn't live yet.
 */
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (!shared) {
                        blk_mq_tag_idle(hctx);
                        hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
                        continue;
                }

                hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
        }
}

/*
 * Propagate the shared/unshared state to every queue on set->tag_list.
 * Each queue is frozen while its hardware queue flags are updated.
 * Caller must hold set->tag_list_lock.
 */
static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
                                         bool shared)
{
        struct request_queue *q;
        unsigned int memflags;

        lockdep_assert_held(&set->tag_list_lock);

        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                /* memflags carries the freeze's return value to the unfreeze. */
                memflags = blk_mq_freeze_queue(q);
                queue_set_hctx_shared(q, shared);
                blk_mq_unfreeze_queue(q, memflags);
        }
}

static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;

        mutex_lock(&set->tag_list_lock);
        list_del(&q->tag_set_list);
        if (list_is_singular(&set->tag_list)) {
                /* just transitioned to unshared */
                set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
                /* update existing queue */
                blk_mq_update_tag_set_shared(set, false);
        }
        mutex_unlock(&set->tag_list_lock);
        INIT_LIST_HEAD(&q->tag_set_list);
}

static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
                                     struct request_queue *q)
{
        mutex_lock(&set->tag_list_lock);

        /*
         * Check to see if we're transitioning to shared (from 1 to 2 queues).
         */
        if (!list_empty(&set->tag_list) &&
            !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
                set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
                /* update existing queue */
                blk_mq_update_tag_set_shared(set, true);
        }
        if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
                queue_set_hctx_shared(q, true);
        list_add_tail(&q->tag_set_list, &set->tag_list);

        mutex_unlock(&set->tag_list_lock);
}

/* All allocations will be freed in release handler of q->mq_kobj */
static int blk_mq_alloc_ctxs(struct request_queue *q)
{
        struct blk_mq_ctxs *ctxs;
        int cpu;

        ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL);
        if (!ctxs)
                return -ENOMEM;

        ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx);
        if (!ctxs->queue_ctx)
                goto fail;

        for_each_possible_cpu(cpu) {
                struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu);
                ctx->ctxs = ctxs;
        }

        q->mq_kobj = &ctxs->kobj;
        q->queue_ctx = ctxs->queue_ctx;

        return 0;
 fail:
        kfree(ctxs);
        return -ENOMEM;
}

/*
 * It is the actual release handler for mq, but we do it from
 * request queue's release handler for avoiding use-after-free
 * and headache because q->mq_kobj shouldn't have been introduced,
 * but we can't group ctx/kctx kobj without it.
 *
 * Drops the kobject reference of every hctx parked on q->unused_hctx_list,
 * destroys the hctx xarray and finally tears down the mq sysfs objects.
 */
void blk_mq_release(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx, *tmp;
        unsigned long idx;

        /* complain once about any hctx in the table that is not on a list */
        queue_for_each_hw_ctx(q, hctx, idx) {
                WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
        }

        /* all hctx are in .unused_hctx_list now */
        list_for_each_entry_safe(hctx, tmp, &q->unused_hctx_list, hctx_list) {
                list_del_init(&hctx->hctx_list);
                kobject_put(&hctx->kobj);
        }

        xa_destroy(&q->hctx_table);

        /*
         * .mq_kobj and the sw queues' kobjects share their lifetime with
         * the request queue, so release them here.
         */
        blk_mq_sysfs_deinit(q);
}

/*
 * Allocate a request queue and initialise it for blk-mq on top of @set.
 *
 * @lim may be NULL, in which case zero-initialised limits are used.
 * BLK_FEAT_IO_STAT and BLK_FEAT_NOWAIT are always requested; BLK_FEAT_POLL
 * is added when the set has more maps than HCTX_TYPE_POLL.
 *
 * Returns the new queue, or an ERR_PTR() on failure.
 */
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata)
{
        struct queue_limits default_lim = { };
        struct request_queue *q;
        int err;

        if (!lim)
                lim = &default_lim;

        lim->features |= BLK_FEAT_IO_STAT | BLK_FEAT_NOWAIT;
        if (set->nr_maps > HCTX_TYPE_POLL)
                lim->features |= BLK_FEAT_POLL;

        q = blk_alloc_queue(lim, set->numa_node);
        if (IS_ERR(q))
                return q;

        q->queuedata = queuedata;

        err = blk_mq_init_allocated_queue(set, q);
        if (!err)
                return q;

        /* mq initialisation failed: drop the queue reference again */
        blk_put_queue(q);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(blk_mq_alloc_queue);

/**
 * blk_mq_destroy_queue - shutdown a request queue
 * @q: request queue to shutdown
 *
 * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
 * requests will be failed with -ENODEV. The caller is responsible for dropping
 * the reference from blk_mq_alloc_queue() by calling blk_put_queue().
 *
 * Context: can sleep
 */
void blk_mq_destroy_queue(struct request_queue *q)
{
        /* Warn once if @q is not an mq queue or is still registered. */
        WARN_ON_ONCE(!queue_is_mq(q));
        WARN_ON_ONCE(blk_queue_registered(q));

        might_sleep();

        /*
         * Flag the queue as dying, start draining it and wait until the
         * freeze has completed.
         */
        blk_queue_flag_set(QUEUE_FLAG_DYING, q);
        blk_queue_start_drain(q);
        blk_mq_freeze_queue_wait(q);

        /*
         * Synchronise with / cancel outstanding queue work (requeue and
         * per-hctx run work) before the hardware queues are exited and the
         * queue is removed from its tag set.
         */
        blk_sync_queue(q);
        blk_mq_cancel_work_sync(q);
        blk_mq_exit_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);

/*
 * Allocate a blk-mq request queue on @set together with a gendisk that owns
 * it (GD_OWNS_QUEUE is set on the disk).
 *
 * Returns the disk, the ERR_PTR() from queue allocation, or
 * ERR_PTR(-ENOMEM) when the disk itself cannot be allocated.
 */
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
                struct queue_limits *lim, void *queuedata,
                struct lock_class_key *lkclass)
{
        struct request_queue *q = blk_mq_alloc_queue(set, lim, queuedata);
        struct gendisk *disk;

        if (IS_ERR(q))
                return ERR_CAST(q);

        disk = __alloc_disk_node(q, set->numa_node, lkclass);
        if (disk) {
                set_bit(GD_OWNS_QUEUE, &disk->state);
                return disk;
        }

        /* no disk: shut the queue down again and drop our reference */
        blk_mq_destroy_queue(q);
        blk_put_queue(q);
        return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(__blk_mq_alloc_disk);

/*
 * Allocate a gendisk for an already existing queue @q. A queue reference is
 * taken for the disk and dropped again if the disk allocation fails.
 *
 * Returns the disk, or NULL if the reference could not be taken or the
 * allocation failed.
 */
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
                struct lock_class_key *lkclass)
{
        struct gendisk *disk = NULL;

        if (blk_get_queue(q)) {
                disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
                if (!disk)
                        blk_put_queue(q);
        }

        return disk;
}
EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);

/*
 * Only hctx removed from cpuhp list can be reused
 */
static bool blk_mq_hctx_is_reusable(struct blk_mq_hw_ctx *hctx)
{
        return hlist_unhashed(&hctx->cpuhp_online) &&
                hlist_unhashed(&hctx->cpuhp_dead);
}

/*
 * Obtain an initialised hardware context for index @hctx_idx on @node.
 *
 * A reusable hctx with a matching NUMA node is taken from
 * q->unused_hctx_list when one exists; otherwise a new one is allocated.
 * Returns NULL if allocation or initialisation fails; in the latter case
 * the hctx kobject reference is dropped.
 */
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
                struct blk_mq_tag_set *set, struct request_queue *q,
                int hctx_idx, int node)
{
        struct blk_mq_hw_ctx *hctx = NULL, *cand;

        /* reuse dead hctx first */
        spin_lock(&q->unused_hctx_lock);
        list_for_each_entry(cand, &q->unused_hctx_list, hctx_list) {
                if (cand->numa_node != node || !blk_mq_hctx_is_reusable(cand))
                        continue;

                /* safe to unlink here: the walk stops right after */
                list_del_init(&cand->hctx_list);
                hctx = cand;
                break;
        }
        spin_unlock(&q->unused_hctx_lock);

        if (!hctx) {
                hctx = blk_mq_alloc_hctx(q, set, node);
                if (!hctx)
                        return NULL;
        }

        if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) {
                kobject_put(&hctx->kobj);
                return NULL;
        }

        return hctx;
}

/*
 * (Re)build the hardware contexts of @q for indices 0..set->nr_hw_queues-1,
 * then exit any hctx left in q->hctx_table beyond the resulting queue count.
 */
static void __blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                                     struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i, j;

        for (i = 0; i < set->nr_hw_queues; i++) {
                /* only assigned (and only read) when old_hctx is non-NULL */
                int old_node;
                int node = blk_mq_get_hctx_node(set, i);
                struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);

                /* Exit the existing hctx at this index, remembering its node. */
                if (old_hctx) {
                        old_node = old_hctx->numa_node;
                        blk_mq_exit_hctx(q, set, old_hctx, i);
                }

                if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
                        /* Nothing to fall back to for a brand-new index. */
                        if (!old_hctx)
                                break;
                        pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
                                        node, old_node);
                        /* Retry on the node the previous hctx lived on. */
                        hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
                        WARN_ON_ONCE(!hctx);
                }
        }
        /*
         * Increasing nr_hw_queues fails. Free the newly allocated
         * hctxs and keep the previous q->nr_hw_queues.
         */
        if (i != set->nr_hw_queues) {
                j = q->nr_hw_queues;
        } else {
                /* Every index was set up: adopt the set's queue count. */
                j = i;
                q->nr_hw_queues = set->nr_hw_queues;
        }

        /* j is both the start index and the iteration cursor here. */
        xa_for_each_start(&q->hctx_table, j, hctx, j)
                blk_mq_exit_hctx(q, set, hctx, j);
}

/*
 * Reallocate the hardware contexts of @q and refresh their cpuhp
 * registration. When @lock is true, q->elevator_lock is held around the
 * reallocation.
 */
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
                                   struct request_queue *q, bool lock)
{
        /* protect against switching io scheduler */
        if (lock)
                mutex_lock(&q->elevator_lock);

        __blk_mq_realloc_hw_ctxs(set, q);

        if (lock)
                mutex_unlock(&q->elevator_lock);

        /* drop cpuhp callbacks of hctxs that were exited ... */
        blk_mq_remove_hw_queues_cpuhp(q);

        /* ... and register them for the newly initialised ones */
        blk_mq_add_hw_queues_cpuhp(q);
}

/*
 * Initialise an already allocated request queue @q for blk-mq operation on
 * top of tag set @set. Returns 0 on success and -ENOMEM on failure, in
 * which case q->mq_ops is reset to NULL.
 */
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
                struct request_queue *q)
{
        /* mark the queue as mq asap */
        q->mq_ops = set->ops;

        /*
         * ->tag_set has to be setup before initialize hctx, which cpuhp
         * handler needs it for checking queue mapping
         */
        q->tag_set = set;

        if (blk_mq_alloc_ctxs(q))
                goto err_exit;

        /* init q->mq_kobj and sw queues' kobjects */
        blk_mq_sysfs_init(q);

        INIT_LIST_HEAD(&q->unused_hctx_list);
        spin_lock_init(&q->unused_hctx_lock);

        xa_init(&q->hctx_table);

        /* A resulting count of zero hardware queues is treated as failure. */
        blk_mq_realloc_hw_ctxs(set, q, false);
        if (!q->nr_hw_queues)
                goto err_hctxs;

        /* Request timeout: the set's value, or 30 seconds when it is unset. */
        INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
        blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

        q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

        INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
        INIT_LIST_HEAD(&q->flush_list);
        INIT_LIST_HEAD(&q->requeue_list);
        spin_lock_init(&q->requeue_lock);

        q->nr_requests = set->queue_depth;

        /* Set up the sw queues, join the tag set, then map sw to hw queues. */
        blk_mq_init_cpu_queues(q, set->nr_hw_queues);
        blk_mq_add_queue_tag_set(set, q);
        blk_mq_map_swqueue(q);
        return 0;

err_hctxs:
        blk_mq_release(q);
err_exit:
        q->mq_ops = NULL;
        return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);

/*
 * Exit the hardware queues of @q and detach it from its tag set.
 *
 * tags can _not_ be used after returning from blk_mq_exit_queue
 */
void blk_mq_exit_queue(struct request_queue *q)
{
        struct blk_mq_tag_set *tag_set = q->tag_set;
        unsigned int nr_hctx = tag_set->nr_hw_queues;

        /*
         * Order matters: exiting the hw queues checks
         * hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED, while removing the queue
         * from the tag set may clear that flag.
         */
        blk_mq_exit_hw_queues(q, tag_set, nr_hctx);
        blk_mq_del_queue_tag_set(q);
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
        int i;

        if (blk_mq_is_shared_tags(set->flags)) {
                set->shared_tags = blk_mq_alloc_map_and_rqs(set,
                                                BLK_MQ_NO_HCTX_IDX,
                                                set->queue_depth);
                if (!set->shared_tags)
                        return -ENOMEM;
        }

        for (i = 0; i < set->nr_hw_queues; i++) {
                if (!__blk_mq_alloc_map_and_rqs(set, i))
                        goto out_unwind;
                cond_resched();
        }

        return 0;

out_unwind:
        while (--i >= 0)
                __blk_mq_free_map_and_rqs(set, i);

        if (blk_mq_is_shared_tags(set->flags)) {
                blk_mq_free_map_and_rqs(set, set->shared_tags,
                                        BLK_MQ_NO_HCTX_IDX);
        }

        return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. If an attempt
 * fails, the depth is halved and the allocation retried until the depth
 * would drop below reserved_tags + BLK_MQ_TAG_MIN. set->queue_depth is
 * updated to reflect the depth that was finally allocated.
 */
static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
{
        const unsigned int requested = set->queue_depth;
        int err;

        for (;;) {
                err = __blk_mq_alloc_rq_maps(set);
                if (!err)
                        break;

                /* retry with half the depth, unless that is too shallow */
                set->queue_depth >>= 1;
                if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
                        err = -ENOMEM;
                        break;
                }
                if (!set->queue_depth)
                        break;
        }

        if (!set->queue_depth || err) {
                pr_err("blk-mq: failed to allocate request map\n");
                return -ENOMEM;
        }

        if (requested != set->queue_depth)
                pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
                                                requested, set->queue_depth);

        return 0;
}

/*
 * Recompute the CPU to hardware queue mapping of @set, using the driver's
 * ->map_queues() callback when provided and blk_mq_map_queues() otherwise.
 */
static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
        int type;

        /*
         * blk_mq_map_queues() and multiple .map_queues() implementations
         * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
         * number of hardware queues.
         */
        if (set->nr_maps == 1)
                set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;

        if (!set->ops->map_queues) {
                /* the generic mapping only handles a single map */
                BUG_ON(set->nr_maps > 1);
                blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
                return;
        }

        /*
         * transport .map_queues is usually done in the following
         * way:
         *
         * for (queue = 0; queue < set->nr_hw_queues; queue++) {
         *         mask = get_cpu_mask(queue)
         *         for_each_cpu(cpu, mask)
         *                 set->map[x].mq_map[cpu] = queue;
         * }
         *
         * When we need to remap, the table has to be cleared for
         * killing stale mapping since one CPU may not be mapped
         * to any hw queue.
         */
        for (type = 0; type < set->nr_maps; type++)
                blk_mq_clear_mq_map(&set->map[type]);

        set->ops->map_queues(set);
}

/*
 * Make set->tags large enough for @new_nr_hw_queues and record the new
 * queue count in the set.
 *
 * When growing, a larger pointer array is allocated, the existing entries
 * are copied over and maps/requests are allocated for the new indices. If
 * one of those allocations fails, the ones made by this call are freed and
 * -ENOMEM is returned with set->nr_hw_queues left unchanged. When not
 * growing, only the count is updated.
 */
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
                                       int new_nr_hw_queues)
{
        struct blk_mq_tags **grown;
        int hw;

        if (set->nr_hw_queues < new_nr_hw_queues) {
                grown = kcalloc_node(new_nr_hw_queues,
                                     sizeof(struct blk_mq_tags *),
                                     GFP_KERNEL, set->numa_node);
                if (!grown)
                        return -ENOMEM;

                if (set->tags)
                        memcpy(grown, set->tags,
                               set->nr_hw_queues * sizeof(*set->tags));
                kfree(set->tags);
                set->tags = grown;

                for (hw = set->nr_hw_queues; hw < new_nr_hw_queues; hw++) {
                        if (!__blk_mq_alloc_map_and_rqs(set, hw)) {
                                /* undo the entries added by this call */
                                while (--hw >= set->nr_hw_queues)
                                        __blk_mq_free_map_and_rqs(set, hw);
                                return -ENOMEM;
                        }
                        cond_resched();
                }
        }

        set->nr_hw_queues = new_nr_hw_queues;
        return 0;
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it's too large. In that case, the set
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
        int i, ret;

        BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

        /* Reject sets without hw queues or with an unusable depth. */
        if (!set->nr_hw_queues)
                return -EINVAL;
        if (!set->queue_depth)
                return -EINVAL;
        if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
                return -EINVAL;

        /* ->queue_rq is mandatory. */
        if (!set->ops->queue_rq)
                return -EINVAL;

        /* ->get_budget and ->put_budget must be provided together or not at all. */
        if (!set->ops->get_budget ^ !set->ops->put_budget)
                return -EINVAL;

        if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
                pr_info("blk-mq: reduced tag depth to %u\n",
                        BLK_MQ_MAX_DEPTH);
                set->queue_depth = BLK_MQ_MAX_DEPTH;
        }

        /* An unset nr_maps defaults to a single map. */
        if (!set->nr_maps)
                set->nr_maps = 1;
        else if (set->nr_maps > HCTX_MAX_TYPES)
                return -EINVAL;

        /*
         * If a crashdump is active, then we are potentially in a very
         * memory constrained environment. Limit us to 64 tags to prevent
         * using too much memory.
         */
        if (is_kdump_kernel())
                set->queue_depth = min(64U, set->queue_depth);

        /*
         * There is no use for more h/w queues than cpus if we just have
         * a single map
         */
        if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
                set->nr_hw_queues = nr_cpu_ids;

        /* Sets flagged BLK_MQ_F_BLOCKING get their own SRCU structure. */
        if (set->flags & BLK_MQ_F_BLOCKING) {
                set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
                if (!set->srcu)
                        return -ENOMEM;
                ret = init_srcu_struct(set->srcu);
                if (ret)
                        goto out_free_srcu;
        }

        /* Every failure from here on reports -ENOMEM unless ret is overwritten. */
        ret = -ENOMEM;
        set->tags = kcalloc_node(set->nr_hw_queues,
                                 sizeof(struct blk_mq_tags *), GFP_KERNEL,
                                 set->numa_node);
        if (!set->tags)
                goto out_cleanup_srcu;

        /* One CPU -> hw queue table (nr_cpu_ids entries) per map. */
        for (i = 0; i < set->nr_maps; i++) {
                set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
                                                  sizeof(set->map[i].mq_map[0]),
                                                  GFP_KERNEL, set->numa_node);
                if (!set->map[i].mq_map)
                        goto out_free_mq_map;
                set->map[i].nr_queues = set->nr_hw_queues;
        }

        blk_mq_update_queue_map(set);

        ret = blk_mq_alloc_set_map_and_rqs(set);
        if (ret)
                goto out_free_mq_map;

        mutex_init(&set->tag_list_lock);
        INIT_LIST_HEAD(&set->tag_list);

        return 0;

        /* Error unwinding: each label releases what was set up before the failure. */
out_free_mq_map:
        for (i = 0; i < set->nr_maps; i++) {
                kfree(set->map[i].mq_map);
                set->map[i].mq_map = NULL;
        }
        kfree(set->tags);
        set->tags = NULL;
out_cleanup_srcu:
        if (set->flags & BLK_MQ_F_BLOCKING)
                cleanup_srcu_struct(set->srcu);
out_free_srcu:
        if (set->flags & BLK_MQ_F_BLOCKING)
                kfree(set->srcu);
        return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

/*
 * Allocate and initialize a tagset for a simple single-queue device.
 *
 * @set is zeroed first, then filled in with one hardware queue, one map,
 * no NUMA node preference and the caller's @ops, @queue_depth and
 * @set_flags. Returns the result of blk_mq_alloc_tag_set().
 */
int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
                const struct blk_mq_ops *ops, unsigned int queue_depth,
                unsigned int set_flags)
{
        memset(set, 0, sizeof(*set));

        /* caller-provided settings */
        set->ops = ops;
        set->queue_depth = queue_depth;
        set->flags = set_flags;

        /* fixed single-queue topology */
        set->nr_hw_queues = 1;
        set->nr_maps = 1;
        set->numa_node = NUMA_NO_NODE;

        return blk_mq_alloc_tag_set(set);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);

/*
 * Release everything owned by @set: the per-hw-queue maps and requests,
 * the shared tags (if used), the CPU mapping tables, the tags array and,
 * for BLK_MQ_F_BLOCKING sets, the SRCU structure.
 */
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
        int idx;

        for (idx = 0; idx < set->nr_hw_queues; idx++)
                __blk_mq_free_map_and_rqs(set, idx);

        if (blk_mq_is_shared_tags(set->flags))
                blk_mq_free_map_and_rqs(set, set->shared_tags,
                                        BLK_MQ_NO_HCTX_IDX);

        for (idx = 0; idx < set->nr_maps; idx++) {
                kfree(set->map[idx].mq_map);
                set->map[idx].mq_map = NULL;
        }

        kfree(set->tags);
        set->tags = NULL;

        if (!(set->flags & BLK_MQ_F_BLOCKING))
                return;

        cleanup_srcu_struct(set->srcu);
        kfree(set->srcu);
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int ret;
        unsigned long i;

        if (WARN_ON_ONCE(!q->mq_freeze_depth))
                return -EINVAL;

        if (!set)
                return -EINVAL;

        if (q->nr_requests == nr)
                return 0;

        blk_mq_quiesce_queue(q);

        ret = 0;
        queue_for_each_hw_ctx(q, hctx, i) {
                if (!hctx->tags)
                        continue;
                /*
                 * If we're using an MQ scheduler, just update the scheduler
                 * queue depth. This is similar to what the old code would do.
                 */
                if (hctx->sched_tags) {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
                                                      nr, true);
                } else {
                        ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
                                                      false);
                }
                if (ret)
                        break;
                if (q->elevator && q->elevator->type->ops.depth_updated)
                        q->elevator->type->ops.depth_updated(hctx);
        }
        if (!ret) {
                q->nr_requests = nr;
                if (blk_mq_is_shared_tags(set->flags)) {
                        if (q->elevator)
                                blk_mq_tag_update_sched_shared_tags(q);
                        else
                                blk_mq_tag_resize_shared_tags(set, nr);
                }
        }

        blk_mq_unquiesce_queue(q);

        return ret;
}

/*
 * request_queue and elevator_type pair.
 * It is just used by __blk_mq_update_nr_hw_queues to cache
 * the elevator_type associated with a request_queue.
 */
struct blk_mq_qe_pair {
        struct list_head node;          /* link in the caller's local list */
        struct request_queue *q;        /* queue whose elevator was disabled */
        struct elevator_type *type;     /* elevator type to switch back to */
};

/*
 * Cache the elevator_type in qe pair list and switch the
 * io scheduler to 'none'.
 *
 * Returns false only when the bookkeeping entry cannot be allocated. A
 * queue that has no elevator is left alone and reported as success.
 */
static bool blk_mq_elv_switch_none(struct list_head *head,
                struct request_queue *q)
{
        struct blk_mq_qe_pair *pair;

        pair = kmalloc(sizeof(*pair), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
        if (!pair)
                return false;

        /* Accessing q->elevator needs protection from ->elevator_lock. */
        mutex_lock(&q->elevator_lock);

        if (q->elevator) {
                INIT_LIST_HEAD(&pair->node);
                pair->q = q;
                pair->type = q->elevator->type;
                /* keep a reference to the elevator module as we'll switch back */
                __elevator_get(pair->type);
                list_add(&pair->node, head);
                elevator_disable(q);
        } else {
                /* nothing to remember for this queue */
                kfree(pair);
        }

        mutex_unlock(&q->elevator_lock);

        return true;
}

static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
                                                struct request_queue *q)
{
        struct blk_mq_qe_pair *qe;

        list_for_each_entry(qe, head, node)
                if (qe->q == q)
                        return qe;

        return NULL;
}

/*
 * Switch @q back to the elevator type that blk_mq_elv_switch_none() cached
 * on @head, and free the cache entry. Queues without an entry are left
 * untouched.
 */
static void blk_mq_elv_switch_back(struct list_head *head,
                                  struct request_queue *q)
{
        struct blk_mq_qe_pair *pair = blk_lookup_qe_pair(head, q);
        struct elevator_type *saved_type;

        if (!pair)
                return;

        /* take what we need, then retire the bookkeeping entry */
        saved_type = pair->type;
        list_del(&pair->node);
        kfree(pair);

        mutex_lock(&q->elevator_lock);
        elevator_switch(q, saved_type);
        /* drop the reference acquired in blk_mq_elv_switch_none */
        elevator_put(saved_type);
        mutex_unlock(&q->elevator_lock);
}

/*
 * Change the number of hardware queues of @set to @nr_hw_queues and rebuild
 * the hardware contexts and queue mappings of every queue on the set.
 * Must be called with set->tag_list_lock held.
 */
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
                                                        int nr_hw_queues)
{
        struct request_queue *q;
        LIST_HEAD(head);
        int prev_nr_hw_queues = set->nr_hw_queues;
        unsigned int memflags;
        int i;

        lockdep_assert_held(&set->tag_list_lock);

        /* Clamp to the number of CPUs for single-map sets; ignore no-op requests. */
        if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
                nr_hw_queues = nr_cpu_ids;
        if (nr_hw_queues < 1)
                return;
        if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
                return;

        /* Freeze every queue of the set; allocations below run in NOIO scope. */
        memflags = memalloc_noio_save();
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_freeze_queue_nomemsave(q);

        /*
         * Switch IO scheduler to 'none', cleaning up the data associated
         * with the previous scheduler. We will switch back once we are done
         * updating the new sw to hw queue mappings.
         */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                if (!blk_mq_elv_switch_none(&head, q))
                        goto switch_back;

        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_debugfs_unregister_hctxs(q);
                blk_mq_sysfs_unregister_hctxs(q);
        }

        /* On failure the set keeps its previous nr_hw_queues. */
        if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
                goto reregister;

fallback:
        blk_mq_update_queue_map(set);
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_realloc_hw_ctxs(set, q, true);

                /*
                 * The queue could not follow the set: free the tags added
                 * for the new indices, restore the previous count and redo
                 * the mapping for all queues from the start.
                 */
                if (q->nr_hw_queues != set->nr_hw_queues) {
                        /* NOTE(review): shadows the function-scope 'i' */
                        int i = prev_nr_hw_queues;

                        pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
                                        nr_hw_queues, prev_nr_hw_queues);
                        for (; i < set->nr_hw_queues; i++)
                                __blk_mq_free_map_and_rqs(set, i);

                        set->nr_hw_queues = prev_nr_hw_queues;
                        goto fallback;
                }
                blk_mq_map_swqueue(q);
        }

reregister:
        list_for_each_entry(q, &set->tag_list, tag_set_list) {
                blk_mq_sysfs_register_hctxs(q);
                blk_mq_debugfs_register_hctxs(q);
        }

switch_back:
        /* Restore the cached elevators; queues without an entry are skipped. */
        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_elv_switch_back(&head, q);

        list_for_each_entry(q, &set->tag_list, tag_set_list)
                blk_mq_unfreeze_queue_nomemrestore(q);
        memalloc_noio_restore(memflags);

        /* Free the excess tags when nr_hw_queues shrink. */
        for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
                __blk_mq_free_map_and_rqs(set, i);
}

/*
 * Locked wrapper: takes set->tag_list_lock around
 * __blk_mq_update_nr_hw_queues().
 */
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
        mutex_lock(&set->tag_list_lock);
        __blk_mq_update_nr_hw_queues(set, nr_hw_queues);
        mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

/*
 * Poll @hctx for completions through the driver's ->poll() callback.
 *
 * Returns the positive value reported by ->poll(), 1 when the task is (or
 * has been made) runnable, or 0 when polling stopped without either.
 * The task state is TASK_RUNNING on the paths that explicitly set it
 * before returning.
 */
static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                         struct io_comp_batch *iob, unsigned int flags)
{
        /* Task state as it was on entry, used for the signal check below. */
        long state = get_current_state();
        int ret;

        do {
                ret = q->mq_ops->poll(hctx, iob);
                if (ret > 0) {
                        __set_current_state(TASK_RUNNING);
                        return ret;
                }

                /* A pending signal (for the entry state) makes the task runnable. */
                if (signal_pending_state(state, current))
                        __set_current_state(TASK_RUNNING);
                if (task_is_running(current))
                        return 1;

                /* Stop on a poll error or when only a single pass was requested. */
                if (ret < 0 || (flags & BLK_POLL_ONESHOT))
                        break;
                cpu_relax();
        } while (!need_resched());

        __set_current_state(TASK_RUNNING);
        return 0;
}

/*
 * Poll the hardware context stored at index @cookie in q->hctx_table.
 * Returns 0 without polling when blk_mq_can_poll() says the queue cannot
 * be polled.
 */
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
                struct io_comp_batch *iob, unsigned int flags)
{
        struct blk_mq_hw_ctx *hctx;

        if (!blk_mq_can_poll(q))
                return 0;

        hctx = xa_load(&q->hctx_table, cookie);
        return blk_hctx_poll(q, hctx, iob, flags);
}

/*
 * Poll the hardware context that @rq is mapped to.
 *
 * Returns 0 when @rq is not a polled request or when a reference on the
 * queue usage counter cannot be obtained; otherwise returns the result of
 * blk_hctx_poll(). The reference is held only for the duration of the poll.
 */
int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
                unsigned int poll_flags)
{
        struct request_queue *q = rq->q;
        int ret = 0;

        if (blk_rq_is_poll(rq) && percpu_ref_tryget(&q->q_usage_counter)) {
                ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
                blk_queue_exit(q);
        }

        return ret;
}
EXPORT_SYMBOL_GPL(blk_rq_poll);

unsigned int blk_mq_rq_cpu(struct request *rq)
{
        return rq->mq_ctx->cpu;
}
EXPORT_SYMBOL(blk_mq_rq_cpu);

void blk_mq_cancel_work_sync(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;

        cancel_delayed_work_sync(&q->requeue_work);

        queue_for_each_hw_ctx(q, hctx, i)
                cancel_delayed_work_sync(&hctx->run_work);
}

/*
 * Subsystem init: set up the per-CPU completion lists and CSDs, register
 * the block softirq handler and install the CPU hotplug states used by
 * blk-mq. Always returns 0.
 */
static int __init blk_mq_init(void)
{
        int cpu;

        /* the two per-CPU objects are independent, so initialise them together */
        for_each_possible_cpu(cpu) {
                init_llist_head(&per_cpu(blk_cpu_done, cpu));
                INIT_CSD(&per_cpu(blk_cpu_csd, cpu),
                         __blk_mq_complete_request_remote, NULL);
        }

        open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);

        cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
                                  "block/softirq:dead", NULL,
                                  blk_softirq_cpu_dead);
        cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
                                blk_mq_hctx_notify_dead);
        cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
                                blk_mq_hctx_notify_online,
                                blk_mq_hctx_notify_offline);

        return 0;
}
subsys_initcall(blk_mq_init);




































   37 

















   34 




































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
/* SPDX-License-Identifier: GPL-2.0 */
/* Freezer declarations */

#ifndef FREEZER_H_INCLUDED
#define FREEZER_H_INCLUDED

#include <linux/debug_locks.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>

#ifdef CONFIG_FREEZER
DECLARE_STATIC_KEY_FALSE(freezer_active);

extern bool pm_freezing;                /* PM freezing in effect */
extern bool pm_nosig_freezing;                /* PM nosig freezing in effect */

/*
 * Timeout for stopping processes
 */
extern unsigned int freeze_timeout_msecs;

/*
 * Check if a process has been frozen
 */
extern bool frozen(struct task_struct *p);

extern bool freezing_slow_path(struct task_struct *p);

/*
 * Check if there is a request to freeze a process.
 *
 * While the freezer_active static key is off this is a constant false;
 * otherwise the decision is delegated to freezing_slow_path().
 */
static inline bool freezing(struct task_struct *p)
{
        if (!static_branch_unlikely(&freezer_active))
                return false;

        return freezing_slow_path(p);
}

/* Takes and releases task alloc lock using task_lock() */
extern void __thaw_task(struct task_struct *t);

extern bool __refrigerator(bool check_kthr_stop);
extern int freeze_processes(void);
extern int freeze_kernel_threads(void);
extern void thaw_processes(void);
extern void thaw_kernel_threads(void);

/*
 * Enter the refrigerator if a freeze of the current task is requested.
 * May sleep. Returns false when no freeze is pending, otherwise the result
 * of __refrigerator(false).
 */
static inline bool try_to_freeze(void)
{
        might_sleep();

        if (unlikely(freezing(current))) {
                /* tasks without PF_NOFREEZE must not hold locks here */
                if (!(current->flags & PF_NOFREEZE))
                        debug_check_no_locks_held();
                return __refrigerator(false);
        }

        return false;
}

extern bool freeze_task(struct task_struct *p);
extern bool set_freezable(void);

#ifdef CONFIG_CGROUP_FREEZER
extern bool cgroup_freezing(struct task_struct *task);
#else /* !CONFIG_CGROUP_FREEZER */
/* Stub for !CONFIG_CGROUP_FREEZER: always reports false. */
static inline bool cgroup_freezing(struct task_struct *task)
{
        return false;
}
#endif /* !CONFIG_CGROUP_FREEZER */

#else /* !CONFIG_FREEZER */
/*
 * Stubs for !CONFIG_FREEZER: the query helpers report false, the thaw
 * helpers do nothing and the freeze entry points return -ENOSYS.
 */
static inline bool frozen(struct task_struct *p) { return false; }
static inline bool freezing(struct task_struct *p) { return false; }
static inline void __thaw_task(struct task_struct *t) {}

static inline bool __refrigerator(bool check_kthr_stop) { return false; }
static inline int freeze_processes(void) { return -ENOSYS; }
static inline int freeze_kernel_threads(void) { return -ENOSYS; }
static inline void thaw_processes(void) {}
static inline void thaw_kernel_threads(void) {}

/* Without a freezer there is never a pending freeze request. */
static inline bool try_to_freeze(void) { return false; }

/*
 * !CONFIG_FREEZER stub. Returns bool (always false, like the other bool
 * stubs in this branch) so that its type matches the CONFIG_FREEZER
 * prototype "bool set_freezable(void)"; callers that use the return value
 * then build in both configurations. Callers that ignore it are unaffected.
 */
static inline bool set_freezable(void) { return false; }

#endif /* !CONFIG_FREEZER */

#endif        /* FREEZER_H_INCLUDED */























   13 




   13 

    7 
    7 

















    7 


    2 


    6 
















    4 



    5 







    5 

    5 



















   67 



   57 
   14 








   26 



   26 















   44 
    2 
    3 






    2 

    5 
   37 

   43 





   34 



   19 







   51 





   45 
    2 




   46 










   46 








    7 



   41 



   44 























   43 










  248 







  247 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
// SPDX-License-Identifier: GPL-2.0-only
/*
 * irqchip.c: Common API for in kernel interrupt controllers
 * Copyright (c) 2007, Intel Corporation.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 * Copyright (c) 2013, Alexander Graf <agraf@suse.de>
 *
 * This file is derived from virt/kvm/irq_comm.c.
 *
 * Authors:
 *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
 *   Alexander Graf <agraf@suse.de>
 */

#include <linux/kvm_host.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/export.h>
#include <trace/events/kvm.h>

int kvm_irq_map_gsi(struct kvm *kvm,
                    struct kvm_kernel_irq_routing_entry *entries, int gsi)
{
        struct kvm_irq_routing_table *irq_rt;
        struct kvm_kernel_irq_routing_entry *e;
        int n = 0;

        irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
                                        lockdep_is_held(&kvm->irq_lock));
        if (irq_rt && gsi < irq_rt->nr_rt_entries) {
                hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
                        entries[n] = *e;
                        ++n;
                }
        }

        return n;
}

/*
 * Look up the value recorded in the routing table for (@irqchip, @pin).
 *
 * The table is read with srcu_dereference() on kvm->irq_srcu. Neither index
 * is range-checked here and the table pointer is used without a NULL test.
 */
int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
        struct kvm_irq_routing_table *table =
                srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);

        return table->chip[irqchip][pin];
}

/*
 * Inject the MSI described by userspace in @msi.
 *
 * Returns -EINVAL when there is no in-kernel irqchip or when @msi carries a
 * flag other than KVM_MSI_VALID_DEVID; otherwise returns the result of
 * kvm_set_msi() on a temporary routing entry built from @msi.
 */
int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
{
        struct kvm_kernel_irq_routing_entry route;

        if (!kvm_arch_irqchip_in_kernel(kvm))
                return -EINVAL;
        if (msi->flags & ~KVM_MSI_VALID_DEVID)
                return -EINVAL;

        /* Only the msi member of the on-stack entry is filled in. */
        route.msi.address_lo = msi->address_lo;
        route.msi.address_hi = msi->address_hi;
        route.msi.data = msi->data;
        route.msi.flags = msi->flags;
        route.msi.devid = msi->devid;

        return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1, false);
}

/*
 * Signal @irq at @level through every route currently mapped to it.
 *
 * Return value:
 *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
 *  = 0   Interrupt was coalesced (previous irq is still pending)
 *  > 0   Number of CPUs interrupt was delivered to
 */
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
                bool line_status)
{
        struct kvm_kernel_irq_routing_entry routes[KVM_NR_IRQCHIPS];
        int delivered = -1;
        int nr_routes, srcu_idx;

        trace_kvm_set_irq(irq, level, irq_source_id);

        /*
         * Not possible to detect if the guest uses the PIC or the
         * IOAPIC.  So set the bit in both. The guest will ignore
         * writes to the unused one.
         *
         * The routes are copied out under SRCU; the ->set() callbacks
         * below run on the local copies after the read lock is dropped.
         */
        srcu_idx = srcu_read_lock(&kvm->irq_srcu);
        nr_routes = kvm_irq_map_gsi(kvm, routes, irq);
        srcu_read_unlock(&kvm->irq_srcu, srcu_idx);

        /* Walk the copied routes from the last one down to the first. */
        for (nr_routes--; nr_routes >= 0; nr_routes--) {
                struct kvm_kernel_irq_routing_entry *route = &routes[nr_routes];
                int r = route->set(route, kvm, irq_source_id, level,
                                   line_status);

                /* A negative result leaves the running total untouched. */
                if (r < 0)
                        continue;

                if (delivered < 0)
                        delivered = r;
                else
                        delivered += r;
        }

        return delivered;
}

/*
 * Free routing table @rt together with every entry still linked into its
 * per-GSI lists. A NULL @rt is accepted and ignored.
 */
static void free_irq_routing_table(struct kvm_irq_routing_table *rt)
{
        struct kvm_kernel_irq_routing_entry *entry;
        struct hlist_node *next;
        int gsi;

        if (!rt)
                return;

        for (gsi = 0; gsi < rt->nr_rt_entries; ++gsi) {
                /* _safe variant: the current entry is freed inside the loop. */
                hlist_for_each_entry_safe(entry, next, &rt->map[gsi], link) {
                        hlist_del(&entry->link);
                        kfree(entry);
                }
        }

        kfree(rt);
}

/*
 * Release the VM's routing table.
 *
 * Called only during vm destruction. Nobody can use the pointer at this
 * stage, so it is read with a plain rcu_access_pointer().
 */
void kvm_free_irq_routing(struct kvm *kvm)
{
        free_irq_routing_table(rcu_access_pointer(kvm->irq_routing));
}

static int setup_routing_entry(struct kvm *kvm,
                               struct kvm_irq_routing_table *rt,
                               struct kvm_kernel_irq_routing_entry *e,
                               const struct kvm_irq_routing_entry *ue)
{
        struct kvm_kernel_irq_routing_entry *ei;
        int r;
        u32 gsi = array_index_nospec(ue->gsi, KVM_MAX_IRQ_ROUTES);

        /*
         * Do not allow GSI to be mapped to the same irqchip more than once.
         * Allow only one to one mapping between GSI and non-irqchip routing.
         */
        hlist_for_each_entry(ei, &rt->map[gsi], link)
                if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
                    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
                    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
                        return -EINVAL;

        e->gsi = gsi;
        e->type = ue->type;
        r = kvm_set_routing_entry(kvm, e, ue);
        if (r)
                return r;
        if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
                rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi;

        hlist_add_head(&e->link, &rt->map[e->gsi]);

        return 0;
}

/*
 * Weak default for the hook invoked from kvm_set_irq_routing() while
 * kvm->irq_lock is held. It does nothing; being weak, a non-weak definition
 * of the same symbol elsewhere takes precedence at link time.
 */
void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm)
{
        /* Intentionally empty. */
}

/*
 * Weak default: always returns true. Being weak, a non-weak definition of
 * the same symbol elsewhere takes precedence at link time.
 */
bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm)
{
        /* @kvm is not examined by the default implementation. */
        return true;
}

/*
 * Build a new routing table from the @nr userspace entries at @ue and
 * install it as kvm->irq_routing, freeing the table it replaces.
 *
 * Returns 0 on success, -EINVAL for an out-of-range GSI or invalid entry
 * flags, -ENOMEM on allocation failure, or the error returned by
 * setup_routing_entry(). On failure the installed table is left untouched.
 *
 * NOTE(review): @flags is not referenced anywhere in this function.
 */
int kvm_set_irq_routing(struct kvm *kvm,
                        const struct kvm_irq_routing_entry *ue,
                        unsigned nr,
                        unsigned flags)
{
        struct kvm_irq_routing_table *new, *old;
        struct kvm_kernel_irq_routing_entry *e;
        u32 i, j, nr_rt_entries = 0;
        int r;

        /* Validate every GSI up front and find the largest one. */
        for (i = 0; i < nr; ++i) {
                if (ue[i].gsi >= KVM_MAX_IRQ_ROUTES)
                        return -EINVAL;
                nr_rt_entries = max(nr_rt_entries, ue[i].gsi);
        }

        /* map[] is indexed by GSI, so it needs (largest GSI + 1) slots. */
        nr_rt_entries += 1;

        new = kzalloc(struct_size(new, map, nr_rt_entries), GFP_KERNEL_ACCOUNT);
        if (!new)
                return -ENOMEM;

        new->nr_rt_entries = nr_rt_entries;
        /* -1 marks a chip pin that has no GSI assigned. */
        for (i = 0; i < KVM_NR_IRQCHIPS; i++)
                for (j = 0; j < KVM_IRQCHIP_NUM_PINS; j++)
                        new->chip[i][j] = -1;

        /*
         * One kernel entry per userspace entry. Note that @ue is used as a
         * cursor here: it is advanced at the end of each successful
         * iteration rather than being indexed by i.
         */
        for (i = 0; i < nr; ++i) {
                r = -ENOMEM;
                e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT);
                if (!e)
                        goto out;

                r = -EINVAL;
                /* MSI routes may carry KVM_MSI_VALID_DEVID; others no flags. */
                switch (ue->type) {
                case KVM_IRQ_ROUTING_MSI:
                        if (ue->flags & ~KVM_MSI_VALID_DEVID)
                                goto free_entry;
                        break;
                default:
                        if (ue->flags)
                                goto free_entry;
                        break;
                }
                r = setup_routing_entry(kvm, new, e, ue);
                if (r)
                        goto free_entry;
                ++ue;
        }

        /* Publish the new table and run the update hooks under irq_lock. */
        mutex_lock(&kvm->irq_lock);
        old = rcu_dereference_protected(kvm->irq_routing, 1);
        rcu_assign_pointer(kvm->irq_routing, new);
        kvm_irq_routing_update(kvm);
        kvm_arch_irq_routing_update(kvm);
        mutex_unlock(&kvm->irq_lock);

        kvm_arch_post_irq_routing_update(kvm);

        /* Wait for irq_srcu readers before the old table is freed below. */
        synchronize_srcu_expedited(&kvm->irq_srcu);

        /*
         * Reuse the common exit path: after this assignment "out" frees the
         * old table instead of the one that was just installed.
         */
        new = old;
        r = 0;
        goto out;

free_entry:
        /* @e was allocated but never linked into the table. */
        kfree(e);
out:
        /* Frees the half-built table on failure, the old table on success. */
        free_irq_routing_table(new);

        return r;
}

/*
 * Install an empty IRQ routing table by default, so that no additional setup
 * is needed when userspace-driven IRQ routing is activated and so that
 * kvm->irq_routing is guaranteed to be non-NULL.
 *
 * Returns 0 on success or -ENOMEM if the table cannot be allocated.
 */
int kvm_init_irq_routing(struct kvm *kvm)
{
        struct kvm_irq_routing_table *rt;

        rt = kzalloc(struct_size(rt, map, 1), GFP_KERNEL_ACCOUNT);
        if (!rt)
                return -ENOMEM;

        rt->nr_rt_entries = 1;

        /* Fill the whole chip[][] array with 0xff bytes (-1 per int). */
        memset(rt->chip, -1,
               sizeof(int) * KVM_NR_IRQCHIPS * KVM_IRQCHIP_NUM_PINS);

        RCU_INIT_POINTER(kvm->irq_routing, rt);

        return 0;
}














































































































































































































































































    1 




















































































































































































































































































































































































































































































































































    1 
















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FORTIFY_STRING_H_
#define _LINUX_FORTIFY_STRING_H_

#include <linux/bitfield.h>
#include <linux/bug.h>
#include <linux/const.h>
#include <linux/limits.h>

/*
 * __FORTIFY_INLINE: the attribute set applied to every fortified wrapper
 * defined in this header.
 * __RENAME(x): asm label that binds a declaration to the symbol named x.
 */
#define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable
#define __RENAME(x) __asm__(#x)

/*
 * A fortify "reason" packs two fields into one value: bit 0 holds the
 * access direction (FORTIFY_READ or FORTIFY_WRITE) and bits 7:1 hold the
 * function identifier. FORTIFY_REASON() builds the value; the _DIR and
 * _FUNC macros extract the fields again.
 */
#define FORTIFY_REASON_DIR(r)                FIELD_GET(BIT(0), r)
#define FORTIFY_REASON_FUNC(r)                FIELD_GET(GENMASK(7, 1), r)
#define FORTIFY_REASON(func, write)        (FIELD_PREP(BIT(0), write) | \
                                         FIELD_PREP(GENMASK(7, 1), func))

/*
 * Overridden by KUnit tests.
 *
 * The default fortify_panic() encodes @func and @write into a reason value
 * and hands it to __fortify_panic() together with @avail and @size. The
 * @retfail argument is not used by this default definition.
 */
#ifndef fortify_panic
# define fortify_panic(func, write, avail, size, retfail)        \
         __fortify_panic(FORTIFY_REASON(func, write), avail, size)
#endif
/* Default fortify_warn_once() forwards all of its arguments to WARN_ONCE(). */
#ifndef fortify_warn_once
# define fortify_warn_once(x...)        WARN_ONCE(x)
#endif

/* Access direction values stored in bit 0 of a fortify reason. */
#define FORTIFY_READ                 0
#define FORTIFY_WRITE                 1

/*
 * X-macro list of the function names known to the fortify reporting code.
 * @macro is applied to each name in turn; every expansion is followed by a
 * comma, including the last one.
 */
#define EACH_FORTIFY_FUNC(macro)        \
        macro(strncpy),                        \
        macro(strnlen),                        \
        macro(strlen),                        \
        macro(strscpy),                        \
        macro(strlcat),                        \
        macro(strcat),                        \
        macro(strncat),                        \
        macro(memset),                        \
        macro(memcpy),                        \
        macro(memmove),                        \
        macro(memscan),                        \
        macro(memcmp),                        \
        macro(memchr),                        \
        macro(memchr_inv),                \
        macro(kmemdup),                        \
        macro(strcpy),                        \
        macro(UNKNOWN),

/* Turns a list entry into its enumerator name, e.g. FORTIFY_FUNC_strncpy. */
#define MAKE_FORTIFY_FUNC(func)        FORTIFY_FUNC_##func

/* One FORTIFY_FUNC_<name> enumerator per list entry, in list order. */
enum fortify_func {
        EACH_FORTIFY_FUNC(MAKE_FORTIFY_FUNC)
};

void __fortify_report(const u8 reason, const size_t avail, const size_t size);
void __fortify_panic(const u8 reason, const size_t avail, const size_t size) __cold __noreturn;
void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)");
void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)");
void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?");
void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)");
void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?");

/*
 * Evaluates to the length of the string at @p when the compiler can
 * establish it at build time, and to SIZE_MAX otherwise.
 *
 * A length is only produced when all of the following hold: the size of
 * the object behind @p is known (__member_size() is not SIZE_MAX), its
 * first character is a compile-time constant, and its last byte is a
 * compile-time constant NUL. Note that @p appears twice in the expansion.
 */
#define __compiletime_strlen(p)                                        \
({                                                                \
        char *__p = (char *)(p);                                \
        size_t __ret = SIZE_MAX;                                \
        const size_t __p_size = __member_size(p);                \
        if (__p_size != SIZE_MAX &&                                \
            __builtin_constant_p(*__p)) {                        \
                size_t __p_len = __p_size - 1;                        \
                if (__builtin_constant_p(__p[__p_len]) &&        \
                    __p[__p_len] == '\0')                        \
                        __ret = __builtin_strlen(__p);                \
        }                                                        \
        __ret;                                                        \
})

/*
 * Choose what each __underlying_*() name resolves to.
 *
 * When __SANITIZE_ADDRESS__ is defined the names are declared as external
 * functions and bound with __RENAME() to real symbols; for the three
 * mem*() routines that is either the plain symbol or the __asan_ /
 * __hwasan_ prefixed one, depending on the configuration tested below.
 * Otherwise the names are macros for the compiler builtins, except that
 * the three mem*() routines map to the __msan_ functions when
 * __SANITIZE_MEMORY__ is defined.
 */
#if defined(__SANITIZE_ADDRESS__)

#if !defined(CONFIG_CC_HAS_KASAN_MEMINTRINSIC_PREFIX) && !defined(CONFIG_GENERIC_ENTRY)
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy);
#elif defined(CONFIG_KASAN_GENERIC)
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__asan_memset);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memmove);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__asan_memcpy);
#else /* CONFIG_KASAN_SW_TAGS */
extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(__hwasan_memset);
extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memmove);
extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(__hwasan_memcpy);
#endif

/* The remaining routines always bind to the plain symbol in this branch. */
extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr);
extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp);
extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat);
extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy);
extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen);
extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat);
extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy);

#else

#if defined(__SANITIZE_MEMORY__)
/*
 * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the
 * corresponding __msan_XXX functions.
 */
#include <linux/kmsan_string.h>
#define __underlying_memcpy        __msan_memcpy
#define __underlying_memmove        __msan_memmove
#define __underlying_memset        __msan_memset
#else
#define __underlying_memcpy        __builtin_memcpy
#define __underlying_memmove        __builtin_memmove
#define __underlying_memset        __builtin_memset
#endif

/* Everything else maps straight to the compiler builtin. */
#define __underlying_memchr        __builtin_memchr
#define __underlying_memcmp        __builtin_memcmp
#define __underlying_strcat        __builtin_strcat
#define __underlying_strcpy        __builtin_strcpy
#define __underlying_strlen        __builtin_strlen
#define __underlying_strncat        __builtin_strncat
#define __underlying_strncpy        __builtin_strncpy

#endif

/**
 * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking
 *
 * @dst: Destination memory address to write to
 * @src: Source memory address to read from
 * @bytes: How many bytes to write to @dst from @src
 * @justification: Free-form text or comment describing why the use is needed
 *
 * This should be used for corner cases where the compiler cannot do the
 * right thing, or during transitions between APIs, etc. It should be used
 * very rarely, and includes a place for justification detailing where bounds
 * checking has happened, and why existing solutions cannot be employed.
 *
 * The macro expands to a direct __underlying_memcpy() call. @justification
 * does not appear in the expansion; it exists only for the reader.
 */
#define unsafe_memcpy(dst, src, bytes, justification)                \
        __underlying_memcpy(dst, src, bytes)

/*
 * Clang's use of __builtin_*object_size() within inlines needs hinting via
 * __pass_*object_size(). The preference is to only ever use type 1 (member
 * size, rather than struct size), but there remain some stragglers using
 * type 0 that will be converted in the future.
 *
 * POS requests type 1 and POS0 requests type 0. The dynamic variants are
 * used whenever the compiler provides __builtin_dynamic_object_size.
 */
#if __has_builtin(__builtin_dynamic_object_size)
#define POS                        __pass_dynamic_object_size(1)
#define POS0                        __pass_dynamic_object_size(0)
#else
#define POS                        __pass_object_size(1)
#define POS0                        __pass_object_size(0)
#endif

/*
 * True only when the compiler can evaluate "bounds < length" at build time
 * and the comparison holds. A comparison that cannot be constant-folded
 * yields false, whatever the run-time values turn out to be.
 */
#define __compiletime_lessthan(bounds, length)        (        \
        __builtin_constant_p((bounds) < (length)) &&        \
        (bounds) < (length)                                \
)

/**
 * strncpy - Copy a string to memory with non-guaranteed NUL padding
 *
 * @p: pointer to destination of copy
 * @q: pointer to NUL-terminated source string to copy
 * @size: bytes to write at @p
 *
 * If strlen(@q) >= @size, the copy of @q will stop after @size bytes,
 * and @p will NOT be NUL-terminated
 *
 * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes
 * will be written to @p until @size total bytes have been written.
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * over-reads of @q, it cannot defend against writing unterminated
 * results to @p. Using strncpy() remains ambiguous and fragile.
 * Instead, please choose an alternative, so that the expectation
 * of @p's contents is unambiguous:
 *
 * +--------------------+--------------------+------------+
 * | **p** needs to be: | padded to **size** | not padded |
 * +====================+====================+============+
 * |     NUL-terminated | strscpy_pad()      | strscpy()  |
 * +--------------------+--------------------+------------+
 * | not NUL-terminated | strtomem_pad()     | strtomem() |
 * +--------------------+--------------------+------------+
 *
 * Note strscpy*()'s differing return values for detecting truncation,
 * and strtomem*()'s expectation that the destination is marked with
 * __nonstring when it is a character array.
 *
 */
__FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3)
char *strncpy(char * const POS p, const char *q, __kernel_size_t size)
{
        const size_t p_size = __member_size(p);

        /* Build-time error when the compiler can prove size exceeds p. */
        if (__compiletime_lessthan(p_size, size))
                __write_overflow();
        /* Run-time check for values that are only known during execution. */
        if (p_size < size)
                fortify_panic(FORTIFY_FUNC_strncpy, FORTIFY_WRITE, p_size, size, p);
        return __underlying_strncpy(p, q, size);
}

extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen);
/**
 * strnlen - Return bounded count of characters in a NUL-terminated string
 *
 * @p: pointer to NUL-terminated string to count.
 * @maxlen: maximum number of characters to count.
 *
 * Returns number of characters in @p (NOT including the final NUL), or
 * @maxlen, if no NUL has been found up to there.
 *
 */
__FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen)
{
        const size_t p_size = __member_size(p);
        /* SIZE_MAX unless the length of p could be established at build time. */
        const size_t p_len = __compiletime_strlen(p);
        size_t ret;

        /* We can take compile-time actions when maxlen is const. */
        if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) {
                /* If p is const, we can use its compile-time-known len. */
                if (maxlen >= p_size)
                        return p_len;
        }

        /* Do not check characters beyond the end of p. */
        ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size);
        /*
         * No NUL was found within the whole of p, and the scan was stopped
         * by the size of p rather than by the caller's maxlen.
         */
        if (p_size <= ret && maxlen != ret)
                fortify_panic(FORTIFY_FUNC_strnlen, FORTIFY_READ, p_size, ret + 1, ret);
        return ret;
}

/*
 * Defined after fortified strnlen to reuse it. However, it must still be
 * possible for strlen() to be used on compile-time strings for use in
 * static initializers (i.e. as a constant expression).
 */
/**
 * strlen - Return count of characters in a NUL-terminated string
 *
 * @p: pointer to NUL-terminated string to count.
 *
 * Do not use this function unless the string length is known at
 * compile-time. When @p is unterminated, this function may crash
 * or return unexpected counts that could lead to memory content
 * exposures. Prefer strnlen().
 *
 * Returns number of characters in @p (NOT including the final NUL).
 *
 */
#define strlen(p)                                                        \
        __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)),        \
                __builtin_strlen(p), __fortify_strlen(p))
/* Checked path, selected when strlen(p) is not a constant expression. */
__FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1)
__kernel_size_t __fortify_strlen(const char * const POS p)
{
        const size_t p_size = __member_size(p);
        __kernel_size_t ret;

        /* Give up if we don't know how large p is. */
        if (p_size == SIZE_MAX)
                return __underlying_strlen(p);
        /* Bound the scan by the size of p so it cannot run past the end. */
        ret = strnlen(p, p_size);
        /* Reaching p_size means no NUL terminator was found inside p. */
        if (p_size <= ret)
                fortify_panic(FORTIFY_FUNC_strlen, FORTIFY_READ, p_size, ret + 1, ret);
        return ret;
}

/* Defined after fortified strnlen() to reuse it. */
extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(sized_strscpy);
/*
 * Fortified wrapper around the out-of-line sized_strscpy(), which is
 * reached here through __real_strscpy().
 *
 * Copies the string at @q into @p, bounded by @size. Returns the source
 * length on the compile-time fast path, and otherwise whatever
 * __real_strscpy() returns.
 */
__FORTIFY_INLINE ssize_t sized_strscpy(char * const POS p, const char * const POS q, size_t size)
{
        /* Use string size rather than possible enclosing struct size. */
        const size_t p_size = __member_size(p);
        const size_t q_size = __member_size(q);
        size_t len;

        /* If we cannot get size of p and q default to call strscpy. */
        if (p_size == SIZE_MAX && q_size == SIZE_MAX)
                return __real_strscpy(p, q, size);

        /*
         * If size can be known at compile time and is greater than
         * p_size, generate a compile time write overflow error.
         */
        if (__compiletime_lessthan(p_size, size))
                __write_overflow();

        /* Short-circuit for compile-time known-safe lengths. */
        if (__compiletime_lessthan(p_size, SIZE_MAX)) {
                len = __compiletime_strlen(q);

                /* Source length is known and fits, NUL included: plain copy. */
                if (len < SIZE_MAX && __compiletime_lessthan(len, size)) {
                        __underlying_memcpy(p, q, len + 1);
                        return len;
                }
        }

        /*
         * This call protects from read overflow, because len will default to q
         * length if it is smaller than size.
         */
        len = strnlen(q, size);
        /*
         * If len equals size, we will copy only size bytes which leads to
         * -E2BIG being returned.
         * Otherwise we will copy len + 1 because of the final '\0'.
         */
        len = len == size ? size : len + 1;

        /*
         * Generate a runtime write overflow error if len is greater than
         * p_size.
         */
        if (p_size < len)
                fortify_panic(FORTIFY_FUNC_strscpy, FORTIFY_WRITE, p_size, len, -E2BIG);

        /*
         * We can now safely call vanilla strscpy because we are protected from:
         * 1. Read overflow thanks to call to strnlen().
         * 2. Write overflow thanks to above ifs.
         */
        return __real_strscpy(p, q, len);
}

/* Defined after fortified strlen() to reuse it. */
extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat);
/**
 * strlcat - Append a string to an existing string
 *
 * @p: pointer to %NUL-terminated string to append to
 * @q: pointer to %NUL-terminated string to append from
 * @avail: Maximum bytes available in @p
 *
 * Appends %NUL-terminated string @q after the %NUL-terminated
 * string at @p, but will not write beyond @avail bytes total,
 * potentially truncating the copy from @q. @p will stay
 * %NUL-terminated only if a %NUL already existed within
 * the @avail bytes of @p. If so, the resulting number of
 * bytes copied from @q will be at most "@avail - strlen(@p) - 1".
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * read and write overflows, this is only possible when the sizes
 * of @p and @q are known to the compiler. Prefer building the
 * string with formatting, via scnprintf(), seq_buf, or similar.
 *
 * Returns total bytes that _would_ have been contained by @p
 * regardless of truncation, similar to snprintf(). If return
 * value is >= @avail, the string has been truncated.
 *
 */
__FORTIFY_INLINE
size_t strlcat(char * const POS p, const char * const POS q, size_t avail)
{
        const size_t p_size = __member_size(p);
        const size_t q_size = __member_size(q);
        size_t p_len, copy_len;
        size_t actual, wanted;

        /* Give up immediately if both buffer sizes are unknown. */
        if (p_size == SIZE_MAX && q_size == SIZE_MAX)
                return __real_strlcat(p, q, avail);

        /*
         * wanted is the untruncated result length, which is what gets
         * returned; actual is the length that will really be stored in p.
         */
        p_len = strnlen(p, avail);
        copy_len = strlen(q);
        wanted = actual = p_len + copy_len;

        /* Cannot append any more: report truncation. */
        if (avail <= p_len)
                return wanted;

        /* Give up if string is already overflowed. */
        if (p_size <= p_len)
                fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_READ, p_size, p_len + 1, wanted);

        if (actual >= avail) {
                /* Shorten the copy so the terminating NUL fits within avail. */
                copy_len = avail - p_len - 1;
                actual = p_len + copy_len;
        }

        /* Give up if copy will overflow. */
        if (p_size <= actual)
                fortify_panic(FORTIFY_FUNC_strlcat, FORTIFY_WRITE, p_size, actual + 1, wanted);
        /* copy_len bytes are copied without a NUL; it is written separately. */
        __underlying_memcpy(p + p_len, q, copy_len);
        p[actual] = '\0';

        return wanted;
}

/* Defined after fortified strlcat() to reuse it. */
/**
 * strcat - Append a string to an existing string
 *
 * @p: pointer to NUL-terminated string to append to
 * @q: pointer to NUL-terminated source string to append from
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * read and write overflows, this is only possible when the
 * destination buffer size is known to the compiler. Prefer
 * building the string with formatting, via scnprintf() or similar.
 * At the very least, use strncat().
 *
 * Returns @p.
 *
 */
__FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2)
char *strcat(char * const POS p, const char *q)
{
        const size_t p_size = __member_size(p);
        /* The append itself is done by strlcat(), bounded by the size of p. */
        const size_t wanted = strlcat(p, q, p_size);

        /* A result of p_size or more means the full string did not fit in p. */
        if (p_size <= wanted)
                fortify_panic(FORTIFY_FUNC_strcat, FORTIFY_WRITE, p_size, wanted + 1, p);
        return p;
}

/**
 * strncat - Append a string to an existing string
 *
 * @p: pointer to NUL-terminated string to append to
 * @q: pointer to source string to append from
 * @count: Maximum bytes to read from @q
 *
 * Appends at most @count bytes from @q (stopping at the first
 * NUL byte) after the NUL-terminated string at @p. @p will be
 * NUL-terminated.
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * read and write overflows, this is only possible when the sizes
 * of @p and @q are known to the compiler. Prefer building the
 * string with formatting, via scnprintf() or similar.
 *
 * Returns @p.
 *
 */
/* Defined after fortified strlen() and strnlen() to reuse them. */
__FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3)
char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count)
{
        const size_t p_size = __member_size(p);
        const size_t q_size = __member_size(q);
        size_t p_len, copy_len, total;

        /* Nothing can be checked when neither buffer size is known. */
        if (p_size == SIZE_MAX && q_size == SIZE_MAX)
                return __underlying_strncat(p, q, count);
        p_len = strlen(p);
        copy_len = strnlen(q, count);
        /* Bytes p must hold: existing string, appended bytes and the NUL. */
        total = p_len + copy_len + 1;
        if (p_size < total)
                fortify_panic(FORTIFY_FUNC_strncat, FORTIFY_WRITE, p_size, total, p);
        /* Append without a terminator, then write the NUL explicitly. */
        __underlying_memcpy(p + p_len, q, copy_len);
        p[p_len + copy_len] = '\0';
        return p;
}

/*
 * Bounds checks performed before a fortified memset().
 *
 * @size:         number of bytes about to be written
 * @p_size:       size of the destination's enclosing object, as supplied by
 *                the caller (SIZE_MAX is treated as "unknown" below)
 * @p_size_field: size of the destination field, as supplied by the caller
 *
 * Emits compile-time diagnostics when @size is a constant expression and
 * calls fortify_panic() at run time when @size exceeds a known @p_size.
 * Otherwise returns false.
 */
__FORTIFY_INLINE bool fortify_memset_chk(__kernel_size_t size,
                                         const size_t p_size,
                                         const size_t p_size_field)
{
        if (__builtin_constant_p(size)) {
                /*
                 * Length argument is a constant expression, so we
                 * can perform compile-time bounds checking where
                 * buffer sizes are also known at compile time.
                 */

                /* Error when size is larger than enclosing struct. */
                if (__compiletime_lessthan(p_size_field, p_size) &&
                    __compiletime_lessthan(p_size, size))
                        __write_overflow();

                /* Warn when write size is larger than dest field. */
                if (__compiletime_lessthan(p_size_field, size))
                        __write_overflow_field(p_size_field, size);
        }
        /*
         * At this point, length argument may not be a constant expression,
         * so run-time bounds checking can be done where buffer sizes are
         * known. (This is not an "else" because the above checks may only
         * be compile-time warnings, and we want to still warn for run-time
         * overflows.)
         */

        /*
         * Always stop accesses beyond the struct that contains the
         * field, when the buffer's remaining size is known.
         * (The SIZE_MAX test is to optimize away checks where the buffer
         * lengths are unknown.)
         */
        if (p_size != SIZE_MAX && p_size < size)
                fortify_panic(FORTIFY_FUNC_memset, FORTIFY_WRITE, p_size, size, true);
        return false;
}

/*
 * Run fortify_memset_chk() on the requested length and then perform the
 * fill with __underlying_memset(). @size is converted to size_t once and
 * that same value is used for both the check and the fill; the value of
 * the whole expression is the result of __underlying_memset().
 */
#define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({        \
        size_t __fortify_size = (size_t)(size);                                \
        fortify_memset_chk(__fortify_size, p_size, p_size_field),        \
        __underlying_memset(p, c, __fortify_size);                        \
})

/*
 * __struct_size() vs __member_size() must be captured here to avoid
 * evaluating argument side-effects further into the macro layers.
 *
 * The fortified memset() wrapper is only defined when CONFIG_KMSAN is
 * not set.
 */
#ifndef CONFIG_KMSAN
#define memset(p, c, s) __fortify_memset_chk(p, c, s,                        \
                __struct_size(p), __member_size(p))
#endif

/*
 * To make sure the compiler can enforce protection against buffer overflows,
 * memcpy(), memmove(), and memset() must not be used beyond individual
 * struct members. If you need to copy across multiple members, please use
 * struct_group() to create a named mirror of an anonymous struct union.
 * (e.g. see struct sk_buff.) Read overflow checking is currently only
 * done when a write overflow is also present, or when building with W=1.
 *
 * Mitigation coverage matrix
 *                                        Bounds checking at:
 *                                        +-------+-------+-------+-------+
 *                                        | Compile time  |   Run time    |
 * memcpy() argument sizes:                | write | read  | write | read  |
 *        dest     source   length      +-------+-------+-------+-------+
 * memcpy(known,   known,   constant)        |   y   |   y   |  n/a  |  n/a  |
 * memcpy(known,   unknown, constant)        |   y   |   n   |  n/a  |   V   |
 * memcpy(known,   known,   dynamic)        |   n   |   n   |   B   |   B   |
 * memcpy(known,   unknown, dynamic)        |   n   |   n   |   B   |   V   |
 * memcpy(unknown, known,   constant)        |   n   |   y   |   V   |  n/a  |
 * memcpy(unknown, unknown, constant)        |   n   |   n   |   V   |   V   |
 * memcpy(unknown, known,   dynamic)        |   n   |   n   |   V   |   B   |
 * memcpy(unknown, unknown, dynamic)        |   n   |   n   |   V   |   V   |
 *                                        +-------+-------+-------+-------+
 *
 * y = perform deterministic compile-time bounds checking
 * n = cannot perform deterministic compile-time bounds checking
 * n/a = no run-time bounds checking needed since compile-time deterministic
 * B = can perform run-time bounds checking (currently unimplemented)
 * V = vulnerable to run-time overflow (will need refactoring to solve)
 *
 */
/*
 * Bounds checks shared by the fortified memcpy() and memmove() wrappers.
 *
 * @size:         number of bytes about to be copied
 * @p_size:       size of the destination's enclosing object
 * @q_size:       size of the source's enclosing object
 * @p_size_field: size of the destination field
 * @q_size_field: size of the source field
 * @func:         FORTIFY_FUNC_* identifier passed through to fortify_panic()
 *
 * A size of SIZE_MAX is treated as "unknown" by the run-time checks.
 *
 * Emits compile-time diagnostics when @size is a constant expression,
 * calls fortify_panic() when @size exceeds a known enclosing object size,
 * and returns true when the write is larger than a known destination
 * field that is smaller than its enclosing object (the caller turns that
 * into a one-time "field-spanning write" warning). Otherwise returns false.
 */
__FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size,
                                         const size_t p_size,
                                         const size_t q_size,
                                         const size_t p_size_field,
                                         const size_t q_size_field,
                                         const u8 func)
{
        if (__builtin_constant_p(size)) {
                /*
                 * Length argument is a constant expression, so we
                 * can perform compile-time bounds checking where
                 * buffer sizes are also known at compile time.
                 */

                /* Error when size is larger than enclosing struct. */
                if (__compiletime_lessthan(p_size_field, p_size) &&
                    __compiletime_lessthan(p_size, size))
                        __write_overflow();
                if (__compiletime_lessthan(q_size_field, q_size) &&
                    __compiletime_lessthan(q_size, size))
                        __read_overflow2();

                /* Warn when write size argument larger than dest field. */
                if (__compiletime_lessthan(p_size_field, size))
                        __write_overflow_field(p_size_field, size);
                /*
                 * Warn for source field over-read when building with W=1
                 * or when an over-write happened, so both can be fixed at
                 * the same time.
                 */
                if ((IS_ENABLED(KBUILD_EXTRA_WARN1) ||
                     __compiletime_lessthan(p_size_field, size)) &&
                    __compiletime_lessthan(q_size_field, size))
                        __read_overflow2_field(q_size_field, size);
        }
        /*
         * At this point, length argument may not be a constant expression,
         * so run-time bounds checking can be done where buffer sizes are
         * known. (This is not an "else" because the above checks may only
         * be compile-time warnings, and we want to still warn for run-time
         * overflows.)
         */

        /*
         * Always stop accesses beyond the struct that contains the
         * field, when the buffer's remaining size is known.
         * (The SIZE_MAX test is to optimize away checks where the buffer
         * lengths are unknown.)
         *
         * Each report carries the size of the buffer that failed its
         * check: the destination for a write, the source for a read.
         */
        if (p_size != SIZE_MAX && p_size < size)
                fortify_panic(func, FORTIFY_WRITE, p_size, size, true);
        else if (q_size != SIZE_MAX && q_size < size)
                fortify_panic(func, FORTIFY_READ, q_size, size, true);

        /*
         * Warn when writing beyond destination field size.
         *
         * Note the implementation of __builtin_*object_size() behaves
         * like sizeof() when not directly referencing a flexible
         * array member, which means there will be many bounds checks
         * that will appear at run-time, without a way for them to be
         * detected at compile-time (as can be done when the destination
         * is specifically the flexible array member).
         * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832
         */
        if (p_size_field != SIZE_MAX &&
            p_size != p_size_field && p_size_field < size)
                return true;

        return false;
}

/*
 * To work around what seems to be an optimizer bug, the macro arguments
 * need to have const copies or the values end up changed by the time they
 * reach fortify_warn_once(). See commit 6f7630b1b5bc ("fortify: Capture
 * __bos() results in const temp vars") for more details.
 *
 * The macro runs fortify_memcpy_chk() on the captured sizes, hands its
 * result to fortify_warn_once() together with the field-spanning write
 * message, and finally calls __underlying_<op>() with the requested size.
 * @op selects both the FORTIFY_FUNC_<op> identifier and the underlying
 * routine.
 */
#define __fortify_memcpy_chk(p, q, size, p_size, q_size,                \
                             p_size_field, q_size_field, op) ({                \
        const size_t __fortify_size = (size_t)(size);                        \
        const size_t __p_size = (p_size);                                \
        const size_t __q_size = (q_size);                                \
        const size_t __p_size_field = (p_size_field);                        \
        const size_t __q_size_field = (q_size_field);                        \
        /* Keep a mutable version of the size for the final copy. */        \
        size_t __copy_size = __fortify_size;                                \
        fortify_warn_once(fortify_memcpy_chk(__fortify_size, __p_size,        \
                                     __q_size, __p_size_field,                \
                                     __q_size_field, FORTIFY_FUNC_ ##op), \
                  #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \
                  __fortify_size,                                        \
                  "field \"" #p "\" at " FILE_LINE,                        \
                  __p_size_field);                                        \
        /* Hide only the run-time size from value range tracking to */        \
        /* silence compile-time false positive bounds warnings. */        \
        if (!__builtin_constant_p(__copy_size))                                \
                OPTIMIZER_HIDE_VAR(__copy_size);                        \
        __underlying_##op(p, q, __copy_size);                                \
})

/*
 * Notes about compile-time buffer size detection:
 *
 * With these types...
 *
 *        struct middle {
 *                u16 a;
 *                u8 middle_buf[16];
 *                int b;
 *        };
 *        struct end {
 *                u16 a;
 *                u8 end_buf[16];
 *        };
 *        struct flex {
 *                int a;
 *                u8 flex_buf[];
 *        };
 *
 *        void func(TYPE *ptr) { ... }
 *
 * Cases where destination size cannot be currently detected:
 * - the size of ptr's object (seemingly by design, gcc & clang fail):
 *        __builtin_object_size(ptr, 1) == SIZE_MAX
 * - the size of flexible arrays in ptr's obj (by design, dynamic size):
 *        __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX
 * - the size of ANY array at the end of ptr's obj (gcc and clang bug):
 *        __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX
 *        https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836
 *
 * Cases where destination size is currently detected:
 * - the size of non-array members within ptr's object:
 *        __builtin_object_size(ptr->a, 1) == 2
 * - the size of non-flexible-array in the middle of ptr's obj:
 *        __builtin_object_size(ptr->middle_buf, 1) == 16
 *
 */

/*
 * __struct_size() vs __member_size() must be captured here to avoid
 * evaluating argument side-effects further into the macro layers.
 *
 * Both wrappers pass the __struct_size() and __member_size() results for
 * the destination and the source, followed by the operation name, which
 * __fortify_memcpy_chk() uses to pick FORTIFY_FUNC_<op> and
 * __underlying_<op>().
 */
#define memcpy(p, q, s)  __fortify_memcpy_chk(p, q, s,                        \
                __struct_size(p), __struct_size(q),                        \
                __member_size(p), __member_size(q),                        \
                memcpy)
#define memmove(p, q, s)  __fortify_memcpy_chk(p, q, s,                        \
                __struct_size(p), __struct_size(q),                        \
                __member_size(p), __member_size(q),                        \
                memmove)

extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan);
__FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size)
{
        /* Size of the object @p points into, per __struct_size(). */
        const size_t avail = __struct_size(p);

        /* Compile-time diagnostic for a scan longer than the object. */
        if (__compiletime_lessthan(avail, size))
                __read_overflow();
        /* Run-time check before handing off to the real memscan(). */
        if (size > avail)
                fortify_panic(FORTIFY_FUNC_memscan, FORTIFY_READ, avail, size, NULL);
        return __real_memscan(p, c, size);
}

__FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3)
int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size)
{
        const size_t lhs_size = __struct_size(p);
        const size_t rhs_size = __struct_size(q);

        /* Compile-time diagnostics are only attempted for a constant length. */
        if (__builtin_constant_p(size)) {
                if (__compiletime_lessthan(lhs_size, size))
                        __read_overflow();
                if (__compiletime_lessthan(rhs_size, size))
                        __read_overflow2();
        }

        /* Run-time checks: @p is examined first, then @q. */
        if (size > lhs_size)
                fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, lhs_size, size, INT_MIN);
        else if (size > rhs_size)
                fortify_panic(FORTIFY_FUNC_memcmp, FORTIFY_READ, rhs_size, size, INT_MIN);
        return __underlying_memcmp(p, q, size);
}

__FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3)
void *memchr(const void * const POS0 p, int c, __kernel_size_t size)
{
        /* Size of the object @p points into, per __struct_size(). */
        const size_t avail = __struct_size(p);

        /* Compile-time diagnostic for a search longer than the object. */
        if (__compiletime_lessthan(avail, size))
                __read_overflow();
        /* Run-time check before calling the underlying memchr(). */
        if (size > avail)
                fortify_panic(FORTIFY_FUNC_memchr, FORTIFY_READ, avail, size, NULL);
        return __underlying_memchr(p, c, size);
}

void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv);
__FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size)
{
        /* Size of the object @p points into, per __struct_size(). */
        const size_t avail = __struct_size(p);

        /* Compile-time diagnostic for a search longer than the object. */
        if (__compiletime_lessthan(avail, size))
                __read_overflow();
        /* Run-time check before calling the real memchr_inv(). */
        if (size > avail)
                fortify_panic(FORTIFY_FUNC_memchr_inv, FORTIFY_READ, avail, size, NULL);
        return __real_memchr_inv(p, c, size);
}

extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof)
                                                                    __realloc_size(2);
__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp)
{
        /* Size of the source object, per __struct_size(). */
        const size_t src_size = __struct_size(p);

        /* Compile-time diagnostic for duplicating more than the source holds. */
        if (__compiletime_lessthan(src_size, size))
                __read_overflow();
        /*
         * Run-time check. The value handed to fortify_panic() as its last
         * argument is a zero-length duplicate of @p.
         */
        if (size > src_size)
                fortify_panic(FORTIFY_FUNC_kmemdup, FORTIFY_READ, src_size, size,
                              __real_kmemdup(p, 0, gfp));
        return __real_kmemdup(p, size, gfp);
}
#define kmemdup(...)        alloc_hooks(kmemdup_noprof(__VA_ARGS__))

/**
 * strcpy - Copy a string into another string buffer
 *
 * @p: pointer to destination of copy
 * @q: pointer to NUL-terminated source string to copy
 *
 * Do not use this function. While FORTIFY_SOURCE tries to avoid
 * overflows, this is only possible when the sizes of @q and @p are
 * known to the compiler. Prefer strscpy(), though note its different
 * return values for detecting truncation.
 *
 * Returns @p.
 *
 */
/* Defined after fortified strlen to reuse it. */
__FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2)
char *strcpy(char * const POS p, const char * const POS q)
{
        const size_t dst_size = __member_size(p);
        const size_t src_size = __member_size(q);
        size_t needed;

        /* With both sizes unknown there is nothing to check against. */
        if (__builtin_constant_p(dst_size) &&
            __builtin_constant_p(src_size) &&
            dst_size == SIZE_MAX && src_size == SIZE_MAX)
                return __underlying_strcpy(p, q);

        /* Bytes to copy: the source string plus its terminating NUL. */
        needed = strlen(q) + 1;

        /* Constant sizes are diagnosed at compile time. */
        if (__compiletime_lessthan(dst_size, needed))
                __write_overflow();
        /* Everything else is caught at run time. */
        if (needed > dst_size)
                fortify_panic(FORTIFY_FUNC_strcpy, FORTIFY_WRITE, dst_size, needed, p);

        __underlying_memcpy(p, q, needed);
        return p;
}

/* Don't use these outside the FORTIFY_SOURCE implementation */
#undef __underlying_memchr
#undef __underlying_memcmp
#undef __underlying_strcat
#undef __underlying_strcpy
#undef __underlying_strlen
#undef __underlying_strncat
#undef __underlying_strncpy

#undef POS
#undef POS0

#endif /* _LINUX_FORTIFY_STRING_H_ */



























   35 









   24 








   24 














































   24 





   24 










   24 
   24 
   24 





















   24 




































   24 






















   11 




























































































   11 


   11 


   11 






    9 




    2 








   11 









    2 


    9 


    1 








   24 


























  158 

   35 
   35 
   35 
  157 
   85 
   85 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Fault injection for both 32 and 64bit guests.
 *
 * Copyright (C) 2012,2013 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 *
 * Based on arch/arm/kvm/emulate.c
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <hyp/adjust_pc.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>

#if !defined (__KVM_NVHE_HYPERVISOR__) && !defined (__KVM_VHE_HYPERVISOR__)
#error Hypervisor code only!
#endif

/*
 * Read a guest system register. Nested-virt vcpus go through
 * vcpu_read_sys_reg(); otherwise the value comes from the CPU when
 * __vcpu_read_sys_reg_from_cpu() can supply it, and from the in-memory
 * vcpu copy when it cannot.
 */
static inline u64 __vcpu_read_sys_reg(const struct kvm_vcpu *vcpu, int reg)
{
        u64 hw_val;

        if (unlikely(vcpu_has_nv(vcpu)))
                return vcpu_read_sys_reg(vcpu, reg);

        if (!__vcpu_read_sys_reg_from_cpu(reg, &hw_val))
                return __vcpu_sys_reg(vcpu, reg);

        return hw_val;
}

/*
 * Write a guest system register. Nested-virt vcpus go through
 * vcpu_write_sys_reg(); otherwise the value is written to the CPU when
 * __vcpu_write_sys_reg_to_cpu() accepts it, and to the in-memory vcpu
 * copy when it does not.
 */
static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
{
        if (unlikely(vcpu_has_nv(vcpu))) {
                vcpu_write_sys_reg(vcpu, val, reg);
                return;
        }

        if (__vcpu_write_sys_reg_to_cpu(val, reg))
                return;

        __vcpu_sys_reg(vcpu, reg) = val;
}

/*
 * Store @val as the saved program status for an exception taken to
 * @target_mode. Only nested-virt vcpus distinguish SPSR_EL1 from
 * SPSR_EL2; every other case writes the EL1 SPSR, either through the
 * CPU register (VHE) or the in-memory vcpu copy.
 */
static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long target_mode,
                              u64 val)
{
        if (unlikely(vcpu_has_nv(vcpu))) {
                if (target_mode != PSR_MODE_EL1h)
                        vcpu_write_sys_reg(vcpu, val, SPSR_EL2);
                else
                        vcpu_write_sys_reg(vcpu, val, SPSR_EL1);
                return;
        }

        if (!has_vhe()) {
                __vcpu_sys_reg(vcpu, SPSR_EL1) = val;
                return;
        }

        write_sysreg_el1(val, SYS_SPSR);
}

/*
 * Store @val as the abort-mode SPSR: in the in-memory vcpu context
 * without VHE, in the spsr_abt system register with VHE.
 */
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
{
        if (!has_vhe()) {
                vcpu->arch.ctxt.spsr_abt = val;
                return;
        }

        write_sysreg(val, spsr_abt);
}

/*
 * Store @val as the undefined-mode SPSR: in the in-memory vcpu context
 * without VHE, in the spsr_und system register with VHE.
 */
static void __vcpu_write_spsr_und(struct kvm_vcpu *vcpu, u64 val)
{
        if (!has_vhe()) {
                vcpu->arch.ctxt.spsr_und = val;
                return;
        }

        write_sysreg(val, spsr_und);
}

/*
 * This performs the exception entry at a given EL (@target_mode), stashing PC
 * and PSTATE into ELR and SPSR respectively, and computing the new PC/PSTATE.
 * The EL passed to this function *must* be a non-secure, privileged mode with
 * bit 0 being set (PSTATE.SP == 1).
 *
 * When an exception is taken, most PSTATE fields are left unchanged in the
 * handler. However, some are explicitly overridden (e.g. M[4:0]). Luckily all
 * of the inherited bits have the same position in the AArch64/AArch32 SPSR_ELx
 * layouts, so we don't need to shuffle these for exceptions from AArch32 EL0.
 *
 * For the SPSR_ELx layout for AArch64, see ARM DDI 0487E.a page C5-429.
 * For the SPSR_ELx layout for AArch32, see ARM DDI 0487E.a page C5-426.
 *
 * Here we manipulate the fields in order of the AArch64 SPSR_ELx layout, from
 * MSB to LSB.
 */
static void enter_exception64(struct kvm_vcpu *vcpu, unsigned long target_mode,
                              enum exception_type type)
{
        unsigned long sctlr, vbar, saved, pstate, src_mode;
        u64 vector;

        src_mode = *vcpu_cpsr(vcpu) & (PSR_MODE_MASK | PSR_MODE32_BIT);

        /* Pick the vector group from where the exception is coming from. */
        if (src_mode == target_mode) {
                vector = CURRENT_EL_SP_ELx_VECTOR;
        } else if ((src_mode | PSR_MODE_THREAD_BIT) == target_mode) {
                vector = CURRENT_EL_SP_EL0_VECTOR;
        } else if (src_mode & PSR_MODE32_BIT) {
                vector = LOWER_EL_AArch32_VECTOR;
        } else {
                vector = LOWER_EL_AArch64_VECTOR;
        }

        /*
         * Fetch the target EL's vector base and SCTLR, and stash the
         * current PC in the target EL's ELR.
         */
        switch (target_mode) {
        case PSR_MODE_EL2h:
                vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL2);
                sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL2);
                __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL2);
                break;
        case PSR_MODE_EL1h:
                vbar = __vcpu_read_sys_reg(vcpu, VBAR_EL1);
                sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
                __vcpu_write_sys_reg(vcpu, *vcpu_pc(vcpu), ELR_EL1);
                break;
        default:
                /* Any other target mode is a caller bug. */
                BUG();
        }

        *vcpu_pc(vcpu) = vbar + vector + type;

        saved = *vcpu_cpsr(vcpu);

        /*
         * Fields carried over unchanged from the interrupted context:
         * N, Z, C, V, DIT and PAN.
         *
         * Fields that end up zero simply because they are never set here:
         * - PSTATE.UAO  (ARM DDI 0487E.a, page D5-2579)
         * - PSTATE.SS   (ARM DDI 0487E.a, page D2-2452)
         * - PSTATE.IL   (ARM DDI 0487E.a, page D1-2306)
         * - PSTATE.BTYPE (ARM DDI 0487E.a, pages D1-2293 to D1-2294)
         */
        pstate = saved & (PSR_N_BIT | PSR_Z_BIT | PSR_C_BIT | PSR_V_BIT |
                          PSR_DIT_BIT | PSR_PAN_BIT);

        /* TCO is set whenever the VM has MTE. */
        if (kvm_has_mte(kern_hyp_va(vcpu->kvm)))
                pstate |= PSR_TCO_BIT;

        /*
         * PSTATE.PAN is unchanged unless SCTLR_ELx.SPAN == 0b0, in which
         * case it is set. SCTLR_ELx.SPAN is RES1 when ARMv8.1-PAN is not
         * implemented. See ARM DDI 0487E.a, page D5-2578.
         */
        if (!(sctlr & SCTLR_EL1_SPAN))
                pstate |= PSR_PAN_BIT;

        /*
         * PSTATE.SSBS is set to SCTLR_ELx.DSSBS upon any exception to
         * AArch64. See ARM DDI 0487E.a, page D13-3258.
         */
        if (sctlr & SCTLR_ELx_DSSBS)
                pstate |= PSR_SSBS_BIT;

        /* D, A, I and F are all set on entry. */
        pstate |= PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT;

        pstate |= target_mode;

        /* Install the new PSTATE and save the old one in the target SPSR. */
        *vcpu_cpsr(vcpu) = pstate;
        __vcpu_write_spsr(vcpu, target_mode, saved);
}

/*
 * When an exception is taken, most CPSR fields are left unchanged in the
 * handler. However, some are explicitly overridden (e.g. M[4:0]).
 *
 * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
 * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
 * obsoleted by the ARMv7 virtualization extensions and is RES0.
 *
 * For the SPSR layout seen from AArch32, see:
 * - ARM DDI 0406C.d, page B1-1148
 * - ARM DDI 0487E.a, page G8-6264
 *
 * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
 * - ARM DDI 0487E.a, page C5-426
 *
 * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
 * MSB to LSB.
 */
static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
{
        u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
        unsigned long cur, next;

        cur = *vcpu_cpsr(vcpu);

        /*
         * Fields inherited from the interrupted context. A and F may
         * additionally be forced on below, depending on the target mode;
         * PAN may be forced on depending on SCTLR.SPAN.
         *
         * Fields that end up zero simply because they are never set here:
         * - CPSR.IT[7:0] (ARM DDI 0487E.a, section G1.12.3;
         *                 ARM DDI 0406C.d, section B1.8.3)
         * - CPSR.IL      (ARM DDI 0487E.a, page G1-5527)
         * SS does not exist in AArch32, so it is ignored.
         */
        next = cur & (PSR_AA32_N_BIT | PSR_AA32_Z_BIT | PSR_AA32_C_BIT |
                      PSR_AA32_V_BIT | PSR_AA32_Q_BIT | PSR_AA32_DIT_BIT |
                      PSR_AA32_PAN_BIT | PSR_AA32_GE_MASK |
                      PSR_AA32_A_BIT | PSR_AA32_F_BIT);

        /*
         * CPSR.SSBS is set to SCTLR.DSSBS upon any exception.
         * See ARM DDI 0487E.a, page G8-6244.
         */
        if (sctlr & BIT(31))
                next |= PSR_AA32_SSBS_BIT;

        /*
         * CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0, in which case it
         * is set. SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented.
         * See ARM DDI 0487E.a, page G8-6246.
         */
        if (!(sctlr & BIT(23)))
                next |= PSR_AA32_PAN_BIT;

        /*
         * CPSR.E is set to SCTLR.EE upon any exception.
         * See ARM DDI 0487E.a, page G8-6245 and
         * ARM DDI 0406C.d, page B4-1701.
         */
        if (sctlr & BIT(25))
                next |= PSR_AA32_E_BIT;

        /*
         * CPSR.A is unchanged upon an exception to Undefined or Supervisor
         * and set upon an exception to any other mode.
         * See ARM DDI 0487E.a, pages G1-5515 to G1-5516 and
         * ARM DDI 0406C.d, page B1-1182.
         */
        if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
                next |= PSR_AA32_A_BIT;

        /*
         * CPSR.I is set upon any exception.
         * See ARM DDI 0487E.a, pages G1-5515 to G1-5516 and
         * ARM DDI 0406C.d, page B1-1182.
         */
        next |= PSR_AA32_I_BIT;

        /*
         * CPSR.F is set upon an exception to FIQ and unchanged upon an
         * exception to any other mode.
         * See ARM DDI 0487E.a, pages G1-5515 to G1-5516 and
         * ARM DDI 0406C.d, page B1-1182.
         */
        if (mode == PSR_AA32_MODE_FIQ)
                next |= PSR_AA32_F_BIT;

        /*
         * CPSR.T is set to SCTLR.TE upon any exception.
         * See ARM DDI 0487E.a, page G8-5514 and
         * ARM DDI 0406C.d, page B1-1181.
         */
        if (sctlr & BIT(30))
                next |= PSR_AA32_T_BIT;

        next |= mode;

        return next;
}

/*
 * Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
 *
 * Offsets that enter_exception32() adds to the current PC to form the
 * return address it stores in compat_lr_abt/compat_lr_und. The row is
 * selected by the vector offset divided by 4, the column by whether the
 * T bit was set in the CPSR when the exception was taken (column 1) or
 * not (column 0).
 */
static const u8 return_offsets[8][2] = {
        [0] = { 0, 0 },                /* Reset, unused */
        [1] = { 4, 2 },                /* Undefined */
        [2] = { 0, 0 },                /* SVC, unused */
        [3] = { 4, 4 },                /* Prefetch abort */
        [4] = { 8, 8 },                /* Data abort */
        [5] = { 0, 0 },                /* HVC, unused */
        [6] = { 4, 4 },                /* IRQ, unused */
        [7] = { 4, 4 },                /* FIQ, unused */
};

/*
 * Perform an AArch32 exception entry into @mode through the vector at
 * @vect_offset: install the new CPSR, save the old CPSR and the return
 * address in the registers of the target mode, and redirect the PC to
 * the exception vector.
 */
static void enter_exception32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
{
        /* Snapshot the pre-exception CPSR before it is replaced below. */
        const unsigned long old_cpsr = *vcpu_cpsr(vcpu);
        const bool thumb = (old_cpsr & PSR_AA32_T_BIT);
        const u32 sctlr = __vcpu_read_sys_reg(vcpu, SCTLR_EL1);
        u32 lr;

        *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);

        /* Return address: current PC plus the per-vector, per-ISA offset. */
        lr = *vcpu_pc(vcpu) + return_offsets[vect_offset >> 2][thumb];

        /* KVM only enters the ABT and UND modes, so only deal with those */
        if (mode == PSR_AA32_MODE_ABT) {
                __vcpu_write_spsr_abt(vcpu, host_spsr_to_spsr32(old_cpsr));
                vcpu_gp_regs(vcpu)->compat_lr_abt = lr;
        } else if (mode == PSR_AA32_MODE_UND) {
                __vcpu_write_spsr_und(vcpu, host_spsr_to_spsr32(old_cpsr));
                vcpu_gp_regs(vcpu)->compat_lr_und = lr;
        }

        /*
         * Branch to the exception vector: SCTLR bit 13 selects the fixed
         * 0xffff0000 base, otherwise the base is read from VBAR_EL1.
         */
        if (sctlr & (1 << 13))
                vect_offset += 0xffff0000;
        else
                vect_offset += __vcpu_read_sys_reg(vcpu, VBAR_EL1);

        *vcpu_pc(vcpu) = vect_offset;
}

/*
 * Deliver the exception recorded in the vcpu's EXCEPT_MASK flags, using
 * the AArch32 or AArch64 entry sequence depending on whether EL1 is
 * 32-bit. Unrecognised exception encodings are ignored.
 */
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
        if (!vcpu_el1_is_32bit(vcpu)) {
                switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
                case unpack_vcpu_flag(EXCEPT_AA64_EL1_SYNC):
                        enter_exception64(vcpu, PSR_MODE_EL1h, except_type_sync);
                        break;

                case unpack_vcpu_flag(EXCEPT_AA64_EL2_SYNC):
                        enter_exception64(vcpu, PSR_MODE_EL2h, except_type_sync);
                        break;

                case unpack_vcpu_flag(EXCEPT_AA64_EL2_IRQ):
                        enter_exception64(vcpu, PSR_MODE_EL2h, except_type_irq);
                        break;

                default:
                        /*
                         * Only EL1_SYNC and EL2_{SYNC,IRQ} make sense
                         * so far. Everything else gets silently
                         * ignored.
                         */
                        break;
                }
                return;
        }

        /* 32-bit EL1: the third argument is the vector offset. */
        switch (vcpu_get_flag(vcpu, EXCEPT_MASK)) {
        case unpack_vcpu_flag(EXCEPT_AA32_UND):
                enter_exception32(vcpu, PSR_AA32_MODE_UND, 4);
                break;
        case unpack_vcpu_flag(EXCEPT_AA32_IABT):
                enter_exception32(vcpu, PSR_AA32_MODE_ABT, 12);
                break;
        case unpack_vcpu_flag(EXCEPT_AA32_DABT):
                enter_exception32(vcpu, PSR_AA32_MODE_ABT, 16);
                break;
        default:
                /* Unknown encoding: nothing is injected. */
                break;
        }
}

/*
 * Apply the PC adjustment requested through the vcpu flags.
 *
 * @vcpu: vcpu whose PENDING_EXCEPTION / INCREMENT_PC flags are consumed.
 *
 * A pending exception takes priority: it is injected, then both
 * PENDING_EXCEPTION and EXCEPT_MASK are cleared, and INCREMENT_PC is not
 * examined. Otherwise, if INCREMENT_PC is set, the current instruction is
 * skipped and that flag is cleared. With neither flag set nothing happens.
 */
void __kvm_adjust_pc(struct kvm_vcpu *vcpu)
{
        if (vcpu_get_flag(vcpu, PENDING_EXCEPTION)) {
                kvm_inject_exception(vcpu);
                vcpu_clear_flag(vcpu, PENDING_EXCEPTION);
                vcpu_clear_flag(vcpu, EXCEPT_MASK);
                return;
        }

        if (!vcpu_get_flag(vcpu, INCREMENT_PC))
                return;

        kvm_skip_instr(vcpu);
        vcpu_clear_flag(vcpu, INCREMENT_PC);
}















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   26 





   24 














   26 







   26 
















   26 
   26 






   26 





   26 



































































































































































































































































   22 




   22 

   22 





















    1 
   21 































































































































    6 
    6 


















    6 



































































































































































































































































































































































































































  265 


  265 

  265 
  265 

  265 

  265 
  265 



































































































































































    1 





    1 







    1 

    1 


    1 



    1 
    1 






































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET                An implementation of the TCP/IP protocol suite for the LINUX
 *                operating system.  INET is implemented using the  BSD Socket
 *                interface as the means of communication with the user level.
 *
 *                Generic socket support routines. Memory allocators, socket lock/release
 *                handler for protocols to use and generic option handler.
 *
 * Authors:        Ross Biro
 *                Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *                Florian La Roche, <flla@stud.uni-sb.de>
 *                Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *                Alan Cox        :         Numerous verify_area() problems
 *                Alan Cox        :        Connecting on a connecting socket
 *                                        now returns an error for tcp.
 *                Alan Cox        :        sock->protocol is set correctly.
 *                                        and is not sometimes left as 0.
 *                Alan Cox        :        connect handles icmp errors on a
 *                                        connect properly. Unfortunately there
 *                                        is a restart syscall nasty there. I
 *                                        can't match BSD without hacking the C
 *                                        library. Ideas urgently sought!
 *                Alan Cox        :        Disallow bind() to addresses that are
 *                                        not ours - especially broadcast ones!!
 *                Alan Cox        :        Socket 1024 _IS_ ok for users. (fencepost)
 *                Alan Cox        :        sock_wfree/sock_rfree don't destroy sockets,
 *                                        instead they leave that for the DESTROY timer.
 *                Alan Cox        :        Clean up error flag in accept
 *                Alan Cox        :        TCP ack handling is buggy, the DESTROY timer
 *                                        was buggy. Put a remove_sock() in the handler
 *                                        for memory when we hit 0. Also altered the timer
 *                                        code. The ACK stuff can wait and needs major
 *                                        TCP layer surgery.
 *                Alan Cox        :        Fixed TCP ack bug, removed remove sock
 *                                        and fixed timer/inet_bh race.
 *                Alan Cox        :        Added zapped flag for TCP
 *                Alan Cox        :        Move kfree_skb into skbuff.c and tidied up surplus code
 *                Alan Cox        :        for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *                Alan Cox        :        kfree_s calls now are kfree_skbmem so we can track skb resources
 *                Alan Cox        :        Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *                Alan Cox        :        Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *                Rick Sladkey        :        Relaxed UDP rules for matching packets.
 *                C.E.Hawkins        :        IFF_PROMISC/SIOCGHWADDR support
 *        Pauline Middelink        :        identd support
 *                Alan Cox        :        Fixed connect() taking signals I think.
 *                Alan Cox        :        SO_LINGER supported
 *                Alan Cox        :        Error reporting fixes
 *                Anonymous        :        inet_create tidied up (sk->reuse setting)
 *                Alan Cox        :        inet sockets don't set sk->type!
 *                Alan Cox        :        Split socket option code
 *                Alan Cox        :        Callbacks
 *                Alan Cox        :        Nagle flag for Charles & Johannes stuff
 *                Alex                :        Removed restriction on inet fioctl
 *                Alan Cox        :        Splitting INET from NET core
 *                Alan Cox        :        Fixed bogus SO_TYPE handling in getsockopt()
 *                Adam Caldwell        :        Missing return in SO_DONTROUTE/SO_DEBUG code
 *                Alan Cox        :        Split IP from generic code
 *                Alan Cox        :        New kfree_skbmem()
 *                Alan Cox        :        Make SO_DEBUG superuser only.
 *                Alan Cox        :        Allow anyone to clear SO_DEBUG
 *                                        (compatibility fix)
 *                Alan Cox        :        Added optimistic memory grabbing for AF_UNIX throughput.
 *                Alan Cox        :        Allocator for a socket is settable.
 *                Alan Cox        :        SO_ERROR includes soft errors.
 *                Alan Cox        :        Allow NULL arguments on some SO_ opts
 *                Alan Cox        :         Generic socket allocation to make hooks
 *                                        easier (suggested by Craig Metz).
 *                Michael Pall        :        SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *                Jay Schulist        :        Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *                Andi Kleen        :        Add sock_kmalloc()/sock_kfree_s()
 *                Andi Kleen        :        Fix write_space callback
 *                Chris Evans        :        Security fixes - signedness again
 *                Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include "dev.h"

/* List head for protocols and the mutex named after it.  Presumably the
 * mutex serializes access to proto_list -- confirm against the users
 * further down in this file.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations; both functions are static, so their definitions
 * live later in this translation unit.
 */
static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had when the socket was
 * created and the current process has the capability @cap in the user
 * namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
                   struct user_namespace *user_ns, int cap)
{
        return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
                ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test whether the opener of the socket had the capability @cap when the
 * socket was created, and whether the current process has it as well, in
 * all user namespaces.  This is sk_ns_capable() evaluated against
 * &init_user_ns.
 */
bool sk_capable(const struct sock *sk, int cap)
{
        struct user_namespace *root_ns = &init_user_ns;

        return sk_ns_capable(sk, root_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test whether the opener of the socket had the capability @cap when the
 * socket was created, and whether the current process has it as well, over
 * the network namespace the socket is a member of.  This is
 * sk_ns_capable() evaluated against sock_net(@sk)->user_ns.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
        struct user_namespace *net_user_ns = sock_net(sk)->user_ns;

        return sk_ns_capable(sk, net_user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays hold AF_MAX entries.  The "_kern_" variants are
 * presumably the keys used for internal (kernel) sockets -- confirm at the
 * point where the keys are assigned.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (We pre-construct these
 * strings at build time, so that runtime initialization of socket
 * locks is fast.)
 *
 * _sock_locks(x) expands to a comma-separated list of string literals.
 * Each one is the prefix literal @x followed by a family name, joined by
 * adjacent string-literal concatenation.  Two slots carry the bare numbers
 * "27" and "28" instead of an AF_* name, and the list is terminated by an
 * "AF_MAX" entry.  Every table below is declared with AF_MAX + 1 slots and
 * differs only in the prefix passed to the macro.
 */

#define _sock_locks(x)                                                  \
  x "AF_UNSPEC",        x "AF_UNIX"     ,        x "AF_INET"     , \
  x "AF_AX25"  ,        x "AF_IPX"      ,        x "AF_APPLETALK", \
  x "AF_NETROM",        x "AF_BRIDGE"   ,        x "AF_ATMPVC"   , \
  x "AF_X25"   ,        x "AF_INET6"    ,        x "AF_ROSE"     , \
  x "AF_DECnet",        x "AF_NETBEUI"  ,        x "AF_SECURITY" , \
  x "AF_KEY"   ,        x "AF_NETLINK"  ,        x "AF_PACKET"   , \
  x "AF_ASH"   ,        x "AF_ECONET"   ,        x "AF_ATMSVC"   , \
  x "AF_RDS"   ,        x "AF_SNA"      ,        x "AF_IRDA"     , \
  x "AF_PPPOX" ,        x "AF_WANPIPE"  ,        x "AF_LLC"      , \
  x "27"       ,        x "28"          ,        x "AF_CAN"      , \
  x "AF_TIPC"  ,        x "AF_BLUETOOTH",        x "IUCV"        , \
  x "AF_RXRPC" ,        x "AF_ISDN"     ,        x "AF_PHONET"   , \
  x "AF_IEEE802154",        x "AF_CAIF"        ,        x "AF_ALG"      , \
  x "AF_NFC"   ,        x "AF_VSOCK"    ,        x "AF_KCM"      , \
  x "AF_QIPCRTR",        x "AF_SMC"        ,        x "AF_XDP"        , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
        _sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
        _sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
        _sock_locks("clock-")
};

/* Same tables again with a "k-" prefix in front of each name. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
        _sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
        _sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
        _sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
        _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
        _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
        _sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key.
 *
 * Each array holds AF_MAX entries.  af_kern_callback_keys is presumably
 * the callback-lock key set for kernel sockets, by analogy with the
 * "k-" prefixed name tables -- confirm at the use site.
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
 *
 * The maxima start out as SK_WMEM_MAX / SK_RMEM_MAX, and the defaults are
 * initialised from those same two constants.  Only the maxima are exported.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Static key, initially false.  sk_set_memalloc() increments it and
 * sk_clear_memalloc() decrements it.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Flag @sk with %SOCK_MEMALLOC, add __GFP_MEMALLOC to its allocation mask
 * and bump the memalloc_socks_key static key.  The flag exists to give the
 * socket access to emergency reserves; it is the admin's responsibility to
 * adjust min_free_kbytes to meet the requirements.
 */
void sk_set_memalloc(struct sock *sk)
{
        sock_set_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation = sk->sk_allocation | __GFP_MEMALLOC;
        static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

/**
 * sk_clear_memalloc - clears %SOCK_MEMALLOC
 * @sk: socket to clear it on
 *
 * Undo sk_set_memalloc(): drop the flag, strip __GFP_MEMALLOC from the
 * allocation mask, decrement the memalloc_socks_key static key and finally
 * reclaim socket memory.
 */
void sk_clear_memalloc(struct sock *sk)
{
        sock_reset_flag(sk, SOCK_MEMALLOC);
        sk->sk_allocation = sk->sk_allocation & ~__GFP_MEMALLOC;
        static_branch_dec(&memalloc_socks_key);

        /*
         * A SOCK_MEMALLOC socket may ignore rmem limits so that swapping
         * keeps making forward progress.  The flag can be cleared (last
         * swapfile deactivated) while such rmem allocations still exist,
         * which risks leaving the socket unusable for exceeding the rmem
         * limits.  Reclaim the reserves and obey rmem limits again.
         */
        sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

/*
 * Run the socket's backlog receive handler for @skb inside a
 * memalloc_noreclaim_save()/memalloc_noreclaim_restore() section.
 * Only valid for SOCK_MEMALLOC sockets; anything else is a BUG.
 *
 * Returns whatever sk->sk_backlog_rcv() returns.
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
        unsigned int saved_flags;
        int rc;

        /* these should have been dropped before queueing */
        BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

        saved_flags = memalloc_noreclaim_save();
        rc = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
                                tcp_v6_do_rcv,
                                tcp_v4_do_rcv,
                                sk, skb);
        memalloc_noreclaim_restore(saved_flags);

        return rc;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

/*
 * Invoke the socket's own sk_error_report callback, then emit the
 * inet_sk_error_report tracepoint for AF_INET and AF_INET6 sockets.
 */
void sk_error_report(struct sock *sk)
{
        int family;

        sk->sk_error_report(sk);

        /* Sampled after the callback, as the tracepoint is inet-only. */
        family = sk->sk_family;
        if (family == AF_INET || family == AF_INET6)
                trace_inet_sk_error_report(sk);
}
EXPORT_SYMBOL(sk_error_report);

/*
 * Convert a timeout in jiffies into a timeval and store it at @optval.
 *
 * @timeo:       timeout in jiffies; MAX_SCHEDULE_TIMEOUT is reported as 0/0
 * @optval:      destination buffer, written through a plain pointer
 * @old_timeval: select the legacy layout (old_timeval32 for compat
 *               syscalls without 64-bit time, __kernel_old_timeval
 *               otherwise) instead of __kernel_sock_timeval
 *
 * Returns the size in bytes of the structure that was written.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
        struct __kernel_sock_timeval tv = { .tv_sec = 0, .tv_usec = 0 };

        if (timeo != MAX_SCHEDULE_TIMEOUT) {
                tv.tv_sec = timeo / HZ;
                tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
        }

        if (!old_timeval) {
                *(struct __kernel_sock_timeval *)optval = tv;
                return sizeof(tv);
        }

        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };

                *(struct old_timeval32 *)optval = tv32;
                return sizeof(tv32);
        } else {
                struct __kernel_old_timeval old_tv;

                old_tv.tv_sec = tv.tv_sec;
                old_tv.tv_usec = tv.tv_usec;
                *(struct __kernel_old_timeval *)optval = old_tv;
                return sizeof(old_tv);
        }
}
EXPORT_SYMBOL(sock_get_timeout);

/*
 * Fetch a timeval from @optval into @tv, accepting whichever layout the
 * caller uses: __kernel_sock_timeval, or one of the two legacy layouts when
 * @old_timeval is set (old_timeval32 for compat syscalls without 64-bit
 * time, __kernel_old_timeval otherwise).
 *
 * Returns 0 on success, -EINVAL if @optlen is smaller than the selected
 * layout, or -EFAULT if the copy fails.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
                           sockptr_t optval, int optlen, bool old_timeval)
{
        if (!old_timeval) {
                if (optlen < sizeof(*tv))
                        return -EINVAL;
                if (copy_from_sockptr(tv, optval, sizeof(*tv)))
                        return -EFAULT;
                return 0;
        }

        if (in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
                struct old_timeval32 tv32;

                if (optlen < sizeof(tv32))
                        return -EINVAL;
                if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
                        return -EFAULT;

                tv->tv_sec = tv32.tv_sec;
                tv->tv_usec = tv32.tv_usec;
        } else {
                struct __kernel_old_timeval old_tv;

                if (optlen < sizeof(old_tv))
                        return -EINVAL;
                if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
                        return -EFAULT;

                tv->tv_sec = old_tv.tv_sec;
                tv->tv_usec = old_tv.tv_usec;
        }

        return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

/*
 * Read a timeval from @optval and store the equivalent timeout, in jiffies,
 * at @timeo_p.
 *
 *  - tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM.
 *  - A negative tv_sec stores 0, logs a rate-limited notice (at most ten
 *    times) and still returns 0.
 *  - An all-zero timeval, or a tv_sec at or beyond
 *    MAX_SCHEDULE_TIMEOUT / HZ - 1, stores MAX_SCHEDULE_TIMEOUT.
 *
 * Errors from sock_copy_user_timeval() are passed through unchanged.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
                            bool old_timeval)
{
        struct __kernel_sock_timeval tv;
        long timeo = MAX_SCHEDULE_TIMEOUT;
        int err;

        err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
        if (err)
                return err;

        if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
                return -EDOM;

        if (tv.tv_sec < 0) {
                static int warned __read_mostly;

                WRITE_ONCE(*timeo_p, 0);
                if (warned < 10 && net_ratelimit()) {
                        warned++;
                        pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
                                __func__, current->comm, task_pid_nr(current));
                }
                return 0;
        }

        /* Sub-jiffy remainders are rounded up rather than truncated. */
        if ((tv.tv_sec || tv.tv_usec) &&
            (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
                timeo = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
                                                      USEC_PER_SEC / HZ);

        WRITE_ONCE(*timeo_p, timeo);
        return 0;
}

static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
        return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
                sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
}

/* Every family except AF_UNSPEC and AF_UNIX needs netstamp. */
static bool sock_needs_netstamp(const struct sock *sk)
{
        const int family = sk->sk_family;

        return family != AF_UNSPEC && family != AF_UNIX;
}

/*
 * Clear the bits in @flags from sk->sk_flags.  If at least one of them was
 * set, the socket's family needs netstamp, and no SK_FLAGS_TIMESTAMP bit
 * is left afterwards, call net_disable_timestamp().
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
        /* Nothing was enabled, so there is nothing to undo. */
        if (!(sk->sk_flags & flags))
                return;

        sk->sk_flags &= ~flags;

        if (!sock_needs_netstamp(sk))
                return;
        if (sk->sk_flags & SK_FLAGS_TIMESTAMP)
                return;

        net_disable_timestamp();
}


/*
 * Append @skb to @sk's receive queue.
 *
 * Returns 0 on success.  Returns -ENOMEM when sk_rmem_alloc has already
 * reached sk_rcvbuf, or -ENOBUFS when sk_rmem_schedule() refuses the
 * buffer; sk_drops is incremented in both failure cases and @skb is not
 * freed here.
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
        struct sk_buff_head *queue = &sk->sk_receive_queue;
        unsigned long irqflags;

        /* Receive buffer already full. */
        if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
                atomic_inc(&sk->sk_drops);
                trace_sock_rcvqueue_full(sk, skb);
                return -ENOMEM;
        }

        /* Memory accounting refused this buffer. */
        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
                atomic_inc(&sk->sk_drops);
                return -ENOBUFS;
        }

        skb->dev = NULL;
        skb_set_owner_r(skb, sk);

        /* We are leaving the RCU protected region: make sure we do not
         * leak a non-refcounted dst.
         */
        skb_dst_force(skb);

        spin_lock_irqsave(&queue->lock, irqflags);
        sock_skb_set_dropcount(sk, skb);
        __skb_queue_tail(queue, skb);
        spin_unlock_irqrestore(&queue->lock, irqflags);

        if (!sock_flag(sk, SOCK_DEAD))
                sk->sk_data_ready(sk);

        return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
                              enum skb_drop_reason *reason)
{
        enum skb_drop_reason drop_reason;
        int err;

        err = sk_filter(sk, skb);
        if (err) {
                drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
                goto out;
        }
        err = __sock_queue_rcv_skb(sk, skb);
        switch (err) {
        case -ENOMEM:
                drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
                break;
        case -ENOBUFS:
                drop_reason = SKB_DROP_REASON_PROTO_MEM;
                break;
        default:
                drop_reason = SKB_NOT_DROPPED_YET;
                break;
        }
out:
        if (reason)
                *reason = drop_reason;
        return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);

/*
 * __sk_receive_skb - filter @skb and hand it to @sk, directly or via backlog
 * @sk: destination socket
 * @skb: buffer to deliver; freed with kfree_skb() on every discard path
 * @nested: non-zero selects bh_lock_sock_nested() over bh_lock_sock()
 * @trim_cap: forwarded to sk_filter_trim_cap()
 * @refcounted: when true, sock_put() is called on @sk before returning
 *
 * Returns NET_RX_SUCCESS, except when sk_backlog_rcv() is invoked, in which
 * case its result is returned.  Note that the discard paths leave the
 * return code at NET_RX_SUCCESS as well.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
                     const int nested, unsigned int trim_cap, bool refcounted)
{
        int rc = NET_RX_SUCCESS;

        /* Rejected by the socket filter: drop. */
        if (sk_filter_trim_cap(sk, skb, trim_cap))
                goto discard_and_relse;

        skb->dev = NULL;

        /* Receive queues are full: account the drop before taking any lock. */
        if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }
        if (nested)
                bh_lock_sock_nested(sk);
        else
                bh_lock_sock(sk);
        if (!sock_owned_by_user(sk)) {
                /*
                 * trylock + unlock semantics:
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

                rc = sk_backlog_rcv(sk, skb);

                mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
        } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
                /* Socket is owned by user and the backlog refused the skb. */
                bh_unlock_sock(sk);
                atomic_inc(&sk->sk_drops);
                goto discard_and_relse;
        }

        bh_unlock_sock(sk);
out:
        if (refcounted)
                sock_put(sk);
        return rc;
discard_and_relse:
        kfree_skb(skb);
        goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
                                                          u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
                                                           u32));

/*
 * Return the dst obtained from __sk_dst_get() if it is still usable.
 *
 * A dst marked obsolete is revalidated through its ops->check() hook with
 * @cookie.  If that returns NULL the socket's tx queue mapping, pending
 * confirm flag and cached dst pointer are cleared, the dst is released and
 * NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = __sk_dst_get(sk);

        /* Nothing cached, or cached and not obsolete: hand it back as is. */
        if (!dst || !dst->obsolete)
                return dst;

        if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie))
                return dst;

        sk_tx_queue_clear(sk);
        WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
        RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
        dst_release(dst);

        return NULL;
}
EXPORT_SYMBOL(__sk_dst_check);

/*
 * Like __sk_dst_check(), but starts from sk_dst_get() and invalidates the
 * cached route through sk_dst_reset().  Returns the dst, or NULL after
 * releasing it when the ops->check() revalidation fails.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
        struct dst_entry *dst = sk_dst_get(sk);

        if (!dst || !dst->obsolete)
                return dst;

        if (INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
                               dst, cookie))
                return dst;

        sk_dst_reset(sk);
        dst_release(dst);

        return NULL;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 * Bind @sk to the interface with index @ifindex.
 *
 * Returns 0 on success, -EPERM if the socket is already bound and the
 * caller lacks CAP_NET_RAW in the netns' user namespace, -EINVAL for a
 * negative @ifindex, or -ENOPROTOOPT when built without CONFIG_NETDEVICES.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);

        /* Sorry... */
        if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
                return -EPERM;

        if (ifindex < 0)
                return -EINVAL;

        /* Paired with all READ_ONCE() done locklessly. */
        WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

        if (sk->sk_prot->rehash)
                sk->sk_prot->rehash(sk);
        sk_dst_reset(sk);

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Bind @sk to interface index @ifindex.  When @lock_sk is true the socket
 * lock is taken around the operation; otherwise no lock is taken here.
 * Returns the result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
        int ret;

        if (!lock_sk)
                return sock_bindtoindex_locked(sk, ifindex);

        lock_sock(sk);
        ret = sock_bindtoindex_locked(sk, ifindex);
        release_sock(sk);

        return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

/*
 * SO_BINDTODEVICE handler: bind @sk to the device whose name is passed in
 * @optval.
 *
 * Returns -EINVAL for a negative @optlen, -EFAULT if the name cannot be
 * copied, -ENODEV if no device of that name exists, -ENOPROTOOPT without
 * CONFIG_NETDEVICES, otherwise the result of sock_bindtoindex_locked().
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
#ifdef CONFIG_NETDEVICES
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index = 0;
        int ret;

        if (optlen < 0)
                return -EINVAL;

        /* Bind this socket to a particular device like "eth0", as
         * specified in the passed interface name.  If the name is ""
         * or the option length is zero the socket is not bound.
         *
         * Over-long input is truncated so that the zero-filled buffer
         * always stays NUL terminated.
         */
        if (optlen > IFNAMSIZ - 1)
                optlen = IFNAMSIZ - 1;
        memset(devname, 0, sizeof(devname));

        if (copy_from_sockptr(devname, optval, optlen))
                return -EFAULT;

        if (devname[0] != '\0') {
                struct net_device *dev;

                /* Only the ifindex is needed, so no device reference is
                 * taken; it is sampled inside the RCU section.
                 */
                rcu_read_lock();
                dev = dev_get_by_name_rcu(net, devname);
                if (dev)
                        index = dev->ifindex;
                rcu_read_unlock();

                if (!dev)
                        return -ENODEV;
        }

        sockopt_lock_sock(sk);
        ret = sock_bindtoindex_locked(sk, index);
        sockopt_release_sock(sk);

        return ret;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * SO_BINDTODEVICE getter: report the name of the device @sk is bound to.
 *
 * An unbound socket reports a length of 0 and writes no name.  Otherwise
 * the buffer must hold at least IFNAMSIZ bytes (-EINVAL if not) and
 * receives the NUL terminated name, with the reported length including
 * the terminator.
 *
 * Returns 0 on success, -EFAULT if a copy out fails, the error from
 * netdev_get_name(), or -ENOPROTOOPT without CONFIG_NETDEVICES.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
                                sockptr_t optlen, int len)
{
#ifdef CONFIG_NETDEVICES
        int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
        struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int ret;

        if (bound_dev_if == 0) {
                len = 0;
        } else {
                if (len < IFNAMSIZ)
                        return -EINVAL;

                ret = netdev_get_name(net, devname, bound_dev_if);
                if (ret)
                        return ret;

                len = strlen(devname) + 1;

                if (copy_to_sockptr(optval, devname, len))
                        return -EFAULT;
        }

        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;

        return 0;
#else
        return -ENOPROTOOPT;
#endif
}

/*
 * Report the multicast loop setting that applies to @sk.
 *
 * Returns false while dev_recursion_level() is non-zero and true when @sk
 * is NULL.  For AF_INET (and AF_INET6 when IPv6 is enabled) the socket's
 * MC_LOOP / MC6_LOOP bit decides.  Any other family triggers a one-time
 * warning and returns true.
 */
bool sk_mc_loop(const struct sock *sk)
{
        if (dev_recursion_level())
                return false;

        if (!sk)
                return true;

        /* IPV6_ADDRFORM can change sk->sk_family under us. */
        switch (READ_ONCE(sk->sk_family)) {
        case AF_INET:
                return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
        case AF_INET6:
                return inet6_test_bit(MC6_LOOP, sk);
#endif
        default:
                WARN_ON_ONCE(1);
                return true;
        }
}
EXPORT_SYMBOL(sk_mc_loop);

/* Set sk->sk_reuse to SK_CAN_REUSE while holding the socket lock. */
void sock_set_reuseaddr(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuse = SK_CAN_REUSE;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

/* Set sk->sk_reuseport to true while holding the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
        lock_sock(sk);
        sk->sk_reuseport = true;
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

/*
 * Turn on SOCK_LINGER with a linger time of zero, under the socket lock.
 */
void sock_no_linger(struct sock *sk)
{
        lock_sock(sk);
        WRITE_ONCE(sk->sk_lingertime, 0);
        sock_set_flag(sk, SOCK_LINGER);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

/*
 * Store @priority in sk->sk_priority using WRITE_ONCE().  Unlike the
 * neighbouring setters, this one does not take the socket lock.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
        WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

/*
 * Set the send timeout of @sk to @secs seconds, under the socket lock.
 * A value of zero, or one at or beyond MAX_SCHEDULE_TIMEOUT / HZ - 1,
 * selects MAX_SCHEDULE_TIMEOUT.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
        long timeo = MAX_SCHEDULE_TIMEOUT;

        lock_sock(sk);
        if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
                timeo = secs * HZ;
        WRITE_ONCE(sk->sk_sndtimeo, timeo);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

/*
 * Apply receive timestamp settings to @sk.
 *
 * @val: value for SOCK_RCVTSTAMP
 * @new: value for SOCK_TSTAMP_NEW, applied only when @val is true
 * @ns:  together with @val, the value for SOCK_RCVTSTAMPNS
 *
 * When @val is true SOCK_TIMESTAMP is enabled as well.  When @val is false
 * SOCK_TSTAMP_NEW is left untouched.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
        sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
        sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);

        if (!val)
                return;

        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
}

/*
 * Under the socket lock, call __sock_set_timestamps() with val = true,
 * new = false and ns = true.
 */
void sock_enable_timestamps(struct sock *sk)
{
        lock_sock(sk);
        __sock_set_timestamps(sk, true, false, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);

/*
 * Map one of the four SO_TIMESTAMP* option names onto the matching
 * (new, ns) pair and apply @valbool through __sock_set_timestamps().
 * Any other @optname is silently ignored.  No locking is done here.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
        bool new_fmt, nsec;

        switch (optname) {
        case SO_TIMESTAMP_OLD:
                new_fmt = false;
                nsec = false;
                break;
        case SO_TIMESTAMP_NEW:
                new_fmt = true;
                nsec = false;
                break;
        case SO_TIMESTAMPNS_OLD:
                new_fmt = false;
                nsec = true;
                break;
        case SO_TIMESTAMPNS_NEW:
                new_fmt = true;
                nsec = true;
                break;
        default:
                return;
        }

        __sock_set_timestamps(sk, valbool, new_fmt, nsec);
}

/*
 * Bind @sk to the PHC with index @phc_index.
 *
 * The socket must be bound to a device, and @phc_index must appear in the
 * vclock list that ethtool_get_phc_vclocks() reports for that device.
 *
 * Returns 0 on success, -EOPNOTSUPP if the socket has no (resolvable)
 * bound device, or -EINVAL if @phc_index is not in the list.
 */
static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
        struct net *net = sock_net(sk);
        struct net_device *dev = NULL;
        bool found = false;
        int *vclocks;
        int nr, idx;

        if (sk->sk_bound_dev_if)
                dev = dev_get_by_index(net, sk->sk_bound_dev_if);

        if (!dev) {
                pr_err("%s: sock not bind to device\n", __func__);
                return -EOPNOTSUPP;
        }

        nr = ethtool_get_phc_vclocks(dev, &vclocks);
        dev_put(dev);

        for (idx = 0; idx < nr && !found; idx++)
                found = (vclocks[idx] == phc_index);

        /* The array is only freed when entries were reported. */
        if (nr > 0)
                kfree(vclocks);

        if (!found)
                return -EINVAL;

        WRITE_ONCE(sk->sk_bind_phc, phc_index);

        return 0;
}

/*
 * sock_set_timestamping - validate and apply a set of SOF_TIMESTAMPING_* flags
 * @sk: socket to configure
 * @optname: option being set; SO_TIMESTAMPING_NEW selects SOCK_TSTAMP_NEW
 * @timestamping: requested flags, plus the PHC index used with
 *                SOF_TIMESTAMPING_BIND_PHC
 *
 * Returns 0 on success, -EINVAL for an unsupported or inconsistent flag
 * combination, or the error from sock_timestamping_bind_phc().
 *
 * Note that sk_tskey may already have been reinitialised when one of the
 * later checks fails; sk_tsflags itself is only written on success.
 */
int sock_set_timestamping(struct sock *sk, int optname,
                          struct so_timestamping timestamping)
{
        int val = timestamping.flags;
        int ret;

        /* Reject any bit outside the supported mask. */
        if (val & ~SOF_TIMESTAMPING_MASK)
                return -EINVAL;

        /* OPT_ID_TCP is only accepted together with OPT_ID. */
        if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
            !(val & SOF_TIMESTAMPING_OPT_ID))
                return -EINVAL;

        /* OPT_ID is being newly enabled (it is not yet set in sk_tsflags):
         * initialise the key counter.  TCP sockets start from write_seq or
         * snd_una depending on OPT_ID_TCP and are refused in the CLOSE and
         * LISTEN states; everything else starts from zero.
         */
        if (val & SOF_TIMESTAMPING_OPT_ID &&
            !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
                if (sk_is_tcp(sk)) {
                        if ((1 << sk->sk_state) &
                            (TCPF_CLOSE | TCPF_LISTEN))
                                return -EINVAL;
                        if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
                                atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
                        else
                                atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
                } else {
                        atomic_set(&sk->sk_tskey, 0);
                }
        }

        /* OPT_STATS is only accepted together with OPT_TSONLY. */
        if (val & SOF_TIMESTAMPING_OPT_STATS &&
            !(val & SOF_TIMESTAMPING_OPT_TSONLY))
                return -EINVAL;

        if (val & SOF_TIMESTAMPING_BIND_PHC) {
                ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
                if (ret)
                        return ret;
        }

        /* All checks passed: publish the flags and derived socket flags. */
        WRITE_ONCE(sk->sk_tsflags, val);
        sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
        sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));

        /* Keep the RX software timestamp enable state in sync with the flag. */
        if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
                sock_enable_timestamp(sk,
                                      SOCK_TIMESTAMPING_RX_SOFTWARE);
        else
                sock_disable_timestamp(sk,
                                       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
        return 0;
}

#if defined(CONFIG_CGROUP_BPF)
/*
 * Build a bpf_sock_ops_kern context for operation @op on full socket @sk,
 * attach @skb to it and run the CGROUP_SOCK_OPS programs on it.
 */
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
        struct bpf_sock_ops_kern sock_ops;

        /* Only the part of the struct that precedes ->temp is zeroed. */
        memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
        sock_ops.op = op;
        sock_ops.is_fullsock = 1;
        sock_ops.sk = sk;
        bpf_skops_init_skb(&sock_ops, skb, 0);
        __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
}
#endif

/*
 * Under the socket lock, call the protocol's keepalive hook with true (if
 * the protocol provides one) and then set SOCK_KEEPOPEN.
 */
void sock_set_keepalive(struct sock *sk)
{
        lock_sock(sk);
        if (sk->sk_prot->keepalive)
                sk->sk_prot->keepalive(sk, true);
        sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

/*
 * Set the receive buffer size of @sk from the user supplied @val and mark
 * it as user-locked (SOCK_RCVBUF_LOCK).  The stored value is twice @val,
 * but never less than SOCK_MIN_RCVBUF.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
        int doubled;

        /* Cap val so that val * 2 still fits into an int; otherwise
         * max_t() below would see the product as a negative value.
         */
        val = min_t(int, val, INT_MAX / 2);
        sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

        /* The value is doubled on the way in to account for "struct
         * sk_buff" and similar overhead.  Applications assume that their
         * SO_RCVBUF setting allows that much actual data to be received,
         * and are unaware that these overheads are charged against the
         * receive buffer during socket buffer allocation.
         *
         * After considering the possible alternatives, returning the
         * value actually used from getsockopt is the most desirable
         * behavior.
         */
        doubled = val * 2;
        WRITE_ONCE(sk->sk_rcvbuf, max_t(int, doubled, SOCK_MIN_RCVBUF));
}

/* Locked wrapper: run __sock_set_rcvbuf() while holding the socket lock. */
void sock_set_rcvbuf(struct sock *sk, int val)
{
        lock_sock(sk);
        __sock_set_rcvbuf(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

/*
 * Update sk->sk_mark to @val.  The cached route is reset only when the
 * mark actually changes.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
        if (val == sk->sk_mark)
                return;

        WRITE_ONCE(sk->sk_mark, val);
        sk_dst_reset(sk);
}

/* Locked wrapper: run __sock_set_mark() while holding the socket lock. */
void sock_set_mark(struct sock *sk, u32 val)
{
        lock_sock(sk);
        __sock_set_mark(sk, val);
        release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 * Give back @bytes of the socket's reserved memory.  Only whole pages are
 * released: @bytes is rounded down to a multiple of PAGE_SIZE first.
 */
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
        /* Round down bytes to multiple of pages */
        bytes = round_down(bytes, PAGE_SIZE);

        /* Releasing more than is reserved only warns; the subtraction
         * below is still performed.
         */
        WARN_ON(bytes > sk->sk_reserved_mem);
        WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
        sk_mem_reclaim(sk);
}

/*
 * Reserve @bytes (rounded to pages by sk_mem_pages()) of memory for @sk by
 * pre-charging both the memory cgroup and the protocol's accounting.
 *
 * Returns 0 on success or when @bytes is zero, -EOPNOTSUPP when memcg
 * socket accounting is unavailable for this socket, or -ENOMEM when the
 * memcg charge fails or the pre-charge would push the protocol above
 * sk_prot_mem_limits(sk, 1).
 */
static int sock_reserve_memory(struct sock *sk, int bytes)
{
        int nr_pages;

        if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
                return -EOPNOTSUPP;

        if (!bytes)
                return 0;

        nr_pages = sk_mem_pages(bytes);

        /* pre-charge to memcg */
        if (!mem_cgroup_charge_skmem(sk->sk_memcg, nr_pages,
                                     GFP_KERNEL | __GFP_RETRY_MAYFAIL))
                return -ENOMEM;

        /* pre-charge to forward_alloc */
        sk_memory_allocated_add(sk, nr_pages);

        /* If the system goes into memory pressure with this precharge,
         * undo both charges and give up.
         */
        if (sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 1)) {
                sk_memory_allocated_sub(sk, nr_pages);
                mem_cgroup_uncharge_skmem(sk->sk_memcg, nr_pages);
                return -ENOMEM;
        }

        sk_forward_alloc_add(sk, nr_pages << PAGE_SHIFT);
        WRITE_ONCE(sk->sk_reserved_mem,
                   sk->sk_reserved_mem + (nr_pages << PAGE_SHIFT));

        return 0;
}

#ifdef CONFIG_PAGE_POOL

/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
 * in 1 syscall. The limit exists to limit the amount of memory the kernel
 * allocates to copy these tokens, and to prevent looping over the frags for
 * too long.
 */
#define MAX_DONTNEED_TOKENS 128
#define MAX_DONTNEED_FRAGS 1024

/*
 * SO_DEVMEM_DONTNEED handler: release the frags named by an array of
 * struct dmabuf_token passed in @optval.
 *
 * @optlen must be a multiple of sizeof(struct dmabuf_token) and cover at
 * most MAX_DONTNEED_TOKENS tokens.  Each token names token_count
 * consecutive ids starting at token_start; every id is erased from
 * sk->sk_user_frags and the entry found there is put back with
 * napi_pp_put_page().  Processing stops once MAX_DONTNEED_FRAGS ids have
 * been visited.
 *
 * Returns the number of frags released, -EBADF for non-TCP sockets,
 * -EINVAL for a bad @optlen, -ENOMEM if the token copy cannot be
 * allocated, or -EFAULT if the tokens cannot be copied in.
 */
static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
{
        unsigned int num_tokens, i, j, k, netmem_num = 0;
        struct dmabuf_token *tokens;
        int ret = 0, num_frags = 0;
        netmem_ref netmems[16];

        if (!sk_is_tcp(sk))
                return -EBADF;

        if (optlen % sizeof(*tokens) ||
            optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
                return -EINVAL;

        num_tokens = optlen / sizeof(*tokens);
        tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
        if (!tokens)
                return -ENOMEM;

        if (copy_from_sockptr(tokens, optval, optlen)) {
                kvfree(tokens);
                return -EFAULT;
        }

        xa_lock_bh(&sk->sk_user_frags);
        for (i = 0; i < num_tokens; i++) {
                for (j = 0; j < tokens[i].token_count; j++) {
                        /* The limit counts every id visited, found or not. */
                        if (++num_frags > MAX_DONTNEED_FRAGS)
                                goto frag_limit_reached;

                        netmem_ref netmem = (__force netmem_ref)__xa_erase(
                                &sk->sk_user_frags, tokens[i].token_start + j);

                        /* Unknown ids and non net_iov entries are skipped
                         * and not counted in the return value.
                         */
                        if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
                                continue;

                        /* Collect erased entries and put them in batches
                         * with the xarray lock dropped.
                         */
                        netmems[netmem_num++] = netmem;
                        if (netmem_num == ARRAY_SIZE(netmems)) {
                                xa_unlock_bh(&sk->sk_user_frags);
                                for (k = 0; k < netmem_num; k++)
                                        WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
                                netmem_num = 0;
                                xa_lock_bh(&sk->sk_user_frags);
                        }
                        ret++;
                }
        }

frag_limit_reached:
        xa_unlock_bh(&sk->sk_user_frags);
        /* Flush whatever is left in the final, partially filled batch. */
        for (k = 0; k < netmem_num; k++)
                WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));

        kvfree(tokens);
        return ret;
}
#endif

/*
 * Take the socket lock on behalf of a setsockopt() path, unless the
 * call originates from a bpf program.
 */
void sockopt_lock_sock(struct sock *sk)
{
        /* A set current->bpf_ctx means setsockopt() is being called from
         * a bpf prog; bpf has already made sure the sk lock is held
         * before calling setsockopt(), so it must not be taken again.
         */
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);

/*
 * Counterpart of sockopt_lock_sock(): release the socket lock, except
 * when running with a bpf context, in which case sockopt_lock_sock()
 * did not take it either.
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);

bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
        return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);

bool sockopt_capable(int cap)
{
        return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);

/*
 * Check that @value is one of the clock ids accepted here:
 * CLOCK_REALTIME, CLOCK_MONOTONIC or CLOCK_TAI.
 *
 * Returns 0 when the clock id is accepted, -EINVAL otherwise.
 */
static int sockopt_validate_clockid(__kernel_clockid_t value)
{
        if (value == CLOCK_REALTIME ||
            value == CLOCK_MONOTONIC ||
            value == CLOCK_TAI)
                return 0;

        return -EINVAL;
}

/*
 * sk_setsockopt - set a generic, socket-level option on @sk.
 *
 * This is meant for all protocols to use and covers goings on
 * at the socket level. Everything here is generic.
 *
 * @sk:      socket to configure
 * @level:   option level; not consulted by this function
 * @optname: SO_* option to set
 * @optval:  option value (user or kernel pointer)
 * @optlen:  length of @optval in bytes
 *
 * Processing happens in three stages:
 *  1. SO_BINDTODEVICE is dispatched immediately.
 *  2. For everything else at least sizeof(int) bytes are required; the
 *     leading int is copied into 'val' ('valbool' is its truth value).
 *     Options that do not need the socket lock are handled in the first
 *     switch and return directly.
 *  3. Remaining options run between sockopt_lock_sock() and
 *     sockopt_release_sock(); they set 'ret' and break.
 *
 * Returns 0 on success or a negative errno (-ENOPROTOOPT for unknown
 * options).  SO_DEVMEM_DONTNEED returns whatever sock_devmem_dontneed()
 * returns and SO_PEEK_OFF whatever the protocol's set_peek_off() returns.
 */
int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen)
{
        struct so_timestamping timestamping;
        struct socket *sock = sk->sk_socket;
        struct sock_txtime sk_txtime;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *        SO_BINDTODEVICE is dispatched before the generic int
         *        argument below is validated and copied in.
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        valbool = val ? 1 : 0;

        /* handle options which do not require locking the socket. */
        switch (optname) {
        case SO_PRIORITY:
                if (sk_set_prio_allowed(sk, val)) {
                        sock_set_priority(sk, val);
                        return 0;
                }
                return -EPERM;
        case SO_PASSSEC:
                assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
                return 0;
        case SO_PASSCRED:
                assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
                return 0;
        case SO_PASSPIDFD:
                assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
                return 0;
        /* These options cannot be set through this function. */
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                if (val < 0)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_ll_usec, val);
                return 0;
        case SO_PREFER_BUSY_POLL:
                /* Enabling requires CAP_NET_ADMIN; disabling does not. */
                if (valbool && !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
                return 0;
        case SO_BUSY_POLL_BUDGET:
                /* Raising the budget above its current value requires
                 * CAP_NET_ADMIN; the privilege check runs before the
                 * range check.
                 */
                if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
                    !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                if (val < 0 || val > U16_MAX)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                return 0;
#endif
        case SO_MAX_PACING_RATE:
                {
                /* An int-sized value of ~0U is widened to ~0UL. */
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
                unsigned long pacing_rate;

                /* Where unsigned long is wider than int and the caller
                 * supplied enough bytes, re-read the value at full width.
                 */
                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        return -EFAULT;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                /* Pairs with READ_ONCE() from sk_getsockopt() */
                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
                /* Lower the current pacing rate if it exceeds the new max. */
                pacing_rate = READ_ONCE(sk->sk_pacing_rate);
                if (ulval < pacing_rate)
                        WRITE_ONCE(sk->sk_pacing_rate, ulval);
                return 0;
                }
        case SO_TXREHASH:
                if (val < -1 || val > 1)
                        return -EINVAL;
                /* The "default" value is replaced by the per-netns sysctl. */
                if ((u8)val == SOCK_TXREHASH_DEFAULT)
                        val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
                /* Paired with READ_ONCE() in tcp_rtx_synack()
                 * and sk_getsockopt().
                 */
                WRITE_ONCE(sk->sk_txrehash, (u8)val);
                return 0;
        case SO_PEEK_OFF:
                {
                int (*set_peek_off)(struct sock *sk, int val);

                /* Only supported when the protocol ops provide a handler. */
                set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
                if (set_peek_off)
                        ret = set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                return ret;
                }
#ifdef CONFIG_PAGE_POOL
        case SO_DEVMEM_DONTNEED:
                return sock_devmem_dontneed(sk, optval, optlen);
#endif
        }

        /* Everything below runs with the socket lock held (unless called
         * from a bpf context, see sockopt_lock_sock()); cases must break
         * rather than return so that the lock is released at the end.
         */
        sockopt_lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !sockopt_capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                if (valbool && !sk_is_inet(sk))
                        ret = -EOPNOTSUPP;
                else
                        sk->sk_reuseport = valbool;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                /* Privileged variant: skips the sysctl_wmem_max clamp
                 * applied by SO_SNDBUF and joins at set_sndbuf.
                 */
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
                break;

        case SO_RCVBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                /* Let the protocol react first, if it has a hook. */
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;        /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff) {
                        sock_reset_flag(sk, SOCK_LINGER);
                } else {
                        unsigned long t_sec = ling.l_linger;

                        /* Clamp so that t_sec * HZ cannot exceed
                         * MAX_SCHEDULE_TIMEOUT.
                         */
                        if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
                                WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
                        else
                                WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        case SO_BSDCOMPAT:
                /* Accepted, but nothing is changed. */
                break;

        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
                sock_set_timestamp(sk, optname, valbool);
                break;

        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                /* A caller passing exactly sizeof(struct so_timestamping)
                 * supplies the whole struct; any other length is treated
                 * as a plain int of flags with the rest zeroed.
                 */
                if (optlen == sizeof(timestamping)) {
                        if (copy_from_sockptr(&timestamping, optval,
                                              sizeof(timestamping))) {
                                ret = -EFAULT;
                                break;
                        }
                } else {
                        memset(&timestamping, 0, sizeof(timestamping));
                        timestamping.flags = val;
                }
                ret = sock_set_timestamping(sk, optname, timestamping);
                break;

        case SO_RCVLOWAT:
                {
                int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

                /* Negative values are treated as INT_MAX. */
                if (val < 0)
                        val = INT_MAX;
                if (sock)
                        set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
                /* Prefer the protocol's handler; otherwise store the value
                 * directly, with 0 stored as 1.
                 */
                if (set_rcvlowat)
                        ret = set_rcvlowat(sk, val);
                else
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;
                }
        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                       optlen, optname == SO_RCVTIMEO_OLD);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                                       optlen, optname == SO_SNDTIMEO_OLD);
                break;

        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                /* The argument must be exactly a u32 (passed to
                 * sk_attach_bpf() as 'ufd').
                 */
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                /* Same argument shape as SO_ATTACH_BPF: exactly a u32. */
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                /* Once the filter is locked it cannot be unlocked again. */
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_MARK:
                /* Either CAP_NET_RAW or CAP_NET_ADMIN in the socket's
                 * network namespace is sufficient.
                 */
                if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;
        case SO_RCVMARK:
                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
                break;

        case SO_RCVPRIORITY:
                sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;


        case SO_INCOMING_CPU:
                reuseport_update_incoming_cpu(sk, val);
                break;

        case SO_CNX_ADVICE:
                /* Only the value 1 triggers an action; others are ignored. */
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                /* Allowed for PF_INET/PF_INET6 TCP and UDP datagram sockets
                 * and for PF_RDS; every other combination is rejected.
                 */
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!(sk_is_tcp(sk) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -EOPNOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -EOPNOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                /* Requires exactly a struct sock_txtime carrying only
                 * known flags.
                 */
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safe guards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                ret = sockopt_validate_clockid(sk_txtime.clockid);
                if (ret)
                        break;

                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        case SO_BUF_LOCK:
                if (val & ~SOCK_BUF_LOCK_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* Replace only the bits covered by SOCK_BUF_LOCK_MASK and
                 * keep all other user lock bits.
                 */
                sk->sk_userlocks = val | (sk->sk_userlocks &
                                          ~SOCK_BUF_LOCK_MASK);
                break;

        case SO_RESERVE_MEM:
        {
                int delta;

                if (val < 0) {
                        ret = -EINVAL;
                        break;
                }

                /* Adjust by the difference between the requested and the
                 * currently reserved amount: release on shrink, reserve
                 * on growth.
                 */
                delta = val - sk->sk_reserved_mem;
                if (delta < 0)
                        sock_release_reserved_memory(sk, -delta);
                else
                        ret = sock_reserve_memory(sk, delta);
                break;
        }

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        sockopt_release_sock(sk);
        return ret;
}

/*
 * struct socket flavoured entry point: resolves the underlying
 * struct sock and forwards all arguments to sk_setsockopt().
 */
int sock_setsockopt(struct socket *sock, int level, int optname,
                    sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        return sk_setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_setsockopt);

/*
 * Fetch the peer credentials of @sk via get_cred() while holding
 * sk->sk_peer_lock.
 *
 * The result may be NULL (callers in this file check for that) and is
 * expected to be dropped with put_cred() once the caller is done.
 */
static const struct cred *sk_get_peer_cred(struct sock *sk)
{
        const struct cred *peer_cred;

        spin_lock(&sk->sk_peer_lock);
        peer_cred = get_cred(sk->sk_peer_cred);
        spin_unlock(&sk->sk_peer_lock);
        return peer_cred;
}

/*
 * Fill @ucred from @pid and @cred.
 *
 * The pid is translated with pid_vnr().  Without credentials both uid
 * and gid are reported as -1; otherwise the effective uid/gid are
 * mapped into the current user namespace.
 */
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
                          struct ucred *ucred)
{
        struct user_namespace *ns;

        ucred->pid = pid_vnr(pid);

        if (!cred) {
                ucred->gid = -1;
                ucred->uid = -1;
                return;
        }

        ns = current_user_ns();
        ucred->uid = from_kuid_munged(ns, cred->euid);
        ucred->gid = from_kgid_munged(ns, cred->egid);
}

/*
 * Copy every group id in @src to @dst as a packed array of gid_t,
 * each one mapped into the current user namespace first.
 *
 * Returns 0 on success or -EFAULT as soon as one copy fails.
 */
static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
        struct user_namespace *ns = current_user_ns();
        size_t offset = 0;
        int idx = 0;

        while (idx < src->ngroups) {
                gid_t gid = from_kgid_munged(ns, src->gid[idx]);

                if (copy_to_sockptr_offset(dst, offset, &gid, sizeof(gid)))
                        return -EFAULT;

                offset += sizeof(gid);
                idx++;
        }

        return 0;
}

/*
 * sk_getsockopt - read a generic, socket-level option of @sk.
 *
 * @sk:      socket to query
 * @level:   option level; not consulted by this function
 * @optname: SO_* option to read
 * @optval:  destination buffer for the option value
 * @optlen:  in: size of @optval as an int; out: number of bytes written
 *
 * The caller's buffer length is read from @optlen first (negative
 * lengths give -EINVAL).  Most cases store their result in the zeroed
 * union 'v' and set 'lv' to the result size (sizeof(int) by default);
 * the common tail then copies min(len, lv) bytes to @optval and writes
 * that length back through @optlen.  Cases producing data that does not
 * fit in 'v' copy to @optval themselves and jump to 'lenout', or return
 * directly.
 *
 * Returns 0 on success or a negative errno (-ENOPROTOOPT for options
 * not handled here).
 */
int sk_getsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, sockptr_t optlen)
{
        struct socket *sock = sk->sk_socket;

        /* Scratch storage shared by all fixed-size option values. */
        union {
                int val;
                u64 val64;
                unsigned long ulval;
                struct linger ling;
                struct old_timeval32 tm32;
                struct __kernel_old_timeval tm;
                struct  __kernel_sock_timeval stm;
                struct sock_txtime txtime;
                struct so_timestamping timestamping;
        } v;

        /* lv: size of the value produced; len: size the caller offers. */
        int lv = sizeof(int);
        int len;

        if (copy_from_sockptr(&len, optlen, sizeof(int)))
                return -EFAULT;
        if (len < 0)
                return -EINVAL;

        /* Zero the whole union so that bytes a case does not set are
         * copied out as zero.
         */
        memset(&v, 0, sizeof(v));

        switch (optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
                break;

        case SO_DONTROUTE:
                v.val = sock_flag(sk, SOCK_LOCALROUTE);
                break;

        case SO_BROADCAST:
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;

        case SO_SNDBUF:
                v.val = READ_ONCE(sk->sk_sndbuf);
                break;

        case SO_RCVBUF:
                v.val = READ_ONCE(sk->sk_rcvbuf);
                break;

        case SO_REUSEADDR:
                v.val = sk->sk_reuse;
                break;

        case SO_REUSEPORT:
                v.val = sk->sk_reuseport;
                break;

        case SO_KEEPALIVE:
                v.val = sock_flag(sk, SOCK_KEEPOPEN);
                break;

        case SO_TYPE:
                v.val = sk->sk_type;
                break;

        case SO_PROTOCOL:
                v.val = sk->sk_protocol;
                break;

        case SO_DOMAIN:
                v.val = sk->sk_family;
                break;

        case SO_ERROR:
                /* Report the negated result of sock_error(); if that is
                 * zero, fall back to the soft error, which is fetched and
                 * reset to 0 in one step.
                 */
                v.val = -sock_error(sk);
                if (v.val == 0)
                        v.val = xchg(&sk->sk_err_soft, 0);
                break;

        case SO_OOBINLINE:
                v.val = sock_flag(sk, SOCK_URGINLINE);
                break;

        case SO_NO_CHECK:
                v.val = sk->sk_no_check_tx;
                break;

        case SO_PRIORITY:
                v.val = READ_ONCE(sk->sk_priority);
                break;

        case SO_LINGER:
                lv                = sizeof(v.ling);
                v.ling.l_onoff        = sock_flag(sk, SOCK_LINGER);
                v.ling.l_linger        = READ_ONCE(sk->sk_lingertime) / HZ;
                break;

        case SO_BSDCOMPAT:
                /* Nothing is stored: reads back as 0 from the zeroed union. */
                break;

        case SO_TIMESTAMP_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
                                !sock_flag(sk, SOCK_TSTAMP_NEW) &&
                                !sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;

        case SO_TIMESTAMPNS_OLD:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMP_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPNS_NEW:
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
                break;

        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                lv = sizeof(v.timestamping);
                /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
                 * returning the flags when they were set through the same option.
                 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
                 */
                if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
                        v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
                        v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
                }
                break;

        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                /* sock_get_timeout() fills 'v' and returns the size used. */
                lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
                                      SO_RCVTIMEO_OLD == optname);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
                                      SO_SNDTIMEO_OLD == optname);
                break;

        case SO_RCVLOWAT:
                v.val = READ_ONCE(sk->sk_rcvlowat);
                break;

        case SO_SNDLOWAT:
                /* Always reported as 1. */
                v.val = 1;
                break;

        case SO_PASSCRED:
                v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
                break;

        case SO_PASSPIDFD:
                v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
                break;

        case SO_PEERCRED:
        {
                struct ucred peercred;
                if (len > sizeof(peercred))
                        len = sizeof(peercred);

                /* Read peer pid and cred under sk_peer_lock. */
                spin_lock(&sk->sk_peer_lock);
                cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
                spin_unlock(&sk->sk_peer_lock);

                if (copy_to_sockptr(optval, &peercred, len))
                        return -EFAULT;
                goto lenout;
        }

        case SO_PEERPIDFD:
        {
                struct pid *peer_pid;
                struct file *pidfd_file = NULL;
                int pidfd;

                if (len > sizeof(pidfd))
                        len = sizeof(pidfd);

                spin_lock(&sk->sk_peer_lock);
                peer_pid = get_pid(sk->sk_peer_pid);
                spin_unlock(&sk->sk_peer_lock);

                if (!peer_pid)
                        return -ENODATA;

                pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
                put_pid(peer_pid);
                if (pidfd < 0)
                        return pidfd;

                /* The fd is installed only after both copies to the caller
                 * succeeded; on failure the fd number and the file are
                 * released again.
                 */
                if (copy_to_sockptr(optval, &pidfd, len) ||
                    copy_to_sockptr(optlen, &len, sizeof(int))) {
                        put_unused_fd(pidfd);
                        fput(pidfd_file);

                        return -EFAULT;
                }

                fd_install(pidfd, pidfd_file);
                return 0;
        }

        case SO_PEERGROUPS:
        {
                const struct cred *cred;
                int ret, n;

                cred = sk_get_peer_cred(sk);
                if (!cred)
                        return -ENODATA;

                n = cred->group_info->ngroups;
                /* Buffer too small: report the required size through
                 * optlen and fail with -ERANGE.
                 */
                if (len < n * sizeof(gid_t)) {
                        len = n * sizeof(gid_t);
                        put_cred(cred);
                        return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
                }
                len = n * sizeof(gid_t);

                ret = groups_to_user(optval, cred->group_info);
                put_cred(cred);
                if (ret)
                        return ret;
                goto lenout;
        }

        case SO_PEERNAME:
        {
                struct sockaddr_storage address;

                lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
                if (lv < 0)
                        return -ENOTCONN;
                /* The caller's length must not exceed the name length. */
                if (lv < len)
                        return -EINVAL;
                if (copy_to_sockptr(optval, &address, len))
                        return -EFAULT;
                goto lenout;
        }

        /* Dubious BSD thing... Probably nobody even uses it, but
         * the UNIX standard wants it for whatever reason... -DaveM
         */
        case SO_ACCEPTCONN:
                v.val = sk->sk_state == TCP_LISTEN;
                break;

        case SO_PASSSEC:
                v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
                break;

        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock,
                                                         optval, optlen, len);

        case SO_MARK:
                v.val = READ_ONCE(sk->sk_mark);
                break;

        case SO_RCVMARK:
                v.val = sock_flag(sk, SOCK_RCVMARK);
                break;

        case SO_RCVPRIORITY:
                v.val = sock_flag(sk, SOCK_RCVPRIORITY);
                break;

        case SO_RXQ_OVFL:
                v.val = sock_flag(sk, SOCK_RXQ_OVFL);
                break;

        case SO_WIFI_STATUS:
                v.val = sock_flag(sk, SOCK_WIFI_STATUS);
                break;

        case SO_PEEK_OFF:
                /* Only readable when the protocol supports setting it. */
                if (!READ_ONCE(sock->ops)->set_peek_off)
                        return -EOPNOTSUPP;

                v.val = READ_ONCE(sk->sk_peek_off);
                break;
        case SO_NOFCS:
                v.val = sock_flag(sk, SOCK_NOFCS);
                break;

        case SO_BINDTODEVICE:
                return sock_getbindtodevice(sk, optval, optlen, len);

        case SO_GET_FILTER:
                /* sk_get_filter() writes to optval itself; its non-negative
                 * return value becomes the reported length.
                 */
                len = sk_get_filter(sk, optval, len);
                if (len < 0)
                        return len;

                goto lenout;

        case SO_LOCK_FILTER:
                v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
                break;

        case SO_BPF_EXTENSIONS:
                v.val = bpf_tell_extensions();
                break;

        case SO_SELECT_ERR_QUEUE:
                v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
                break;

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_ll_usec);
                break;
        case SO_PREFER_BUSY_POLL:
                v.val = READ_ONCE(sk->sk_prefer_busy_poll);
                break;
#endif

        case SO_MAX_PACING_RATE:
                /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
                if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
                        lv = sizeof(v.ulval);
                        v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
                } else {
                        /* 32bit version */
                        v.val = min_t(unsigned long, ~0U,
                                      READ_ONCE(sk->sk_max_pacing_rate));
                }
                break;

        case SO_INCOMING_CPU:
                v.val = READ_ONCE(sk->sk_incoming_cpu);
                break;

        case SO_MEMINFO:
        {
                u32 meminfo[SK_MEMINFO_VARS];

                sk_get_meminfo(sk, meminfo);

                /* Truncate to what the caller's buffer can hold. */
                len = min_t(unsigned int, len, sizeof(meminfo));
                if (copy_to_sockptr(optval, &meminfo, len))
                        return -EFAULT;

                goto lenout;
        }

#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_INCOMING_NAPI_ID:
                v.val = READ_ONCE(sk->sk_napi_id);

                /* aggregate non-NAPI IDs down to 0 */
                if (!napi_id_valid(v.val))
                        v.val = 0;

                break;
#endif

        case SO_COOKIE:
                /* Needs room for a full u64; larger buffers are accepted. */
                lv = sizeof(u64);
                if (len < lv)
                        return -EINVAL;
                v.val64 = sock_gen_cookie(sk);
                break;

        case SO_ZEROCOPY:
                v.val = sock_flag(sk, SOCK_ZEROCOPY);
                break;

        case SO_TXTIME:
                lv = sizeof(v.txtime);
                v.txtime.clockid = sk->sk_clockid;
                v.txtime.flags |= sk->sk_txtime_deadline_mode ?
                                  SOF_TXTIME_DEADLINE_MODE : 0;
                v.txtime.flags |= sk->sk_txtime_report_errors ?
                                  SOF_TXTIME_REPORT_ERRORS : 0;
                break;

        case SO_BINDTOIFINDEX:
                v.val = READ_ONCE(sk->sk_bound_dev_if);
                break;

        case SO_NETNS_COOKIE:
                /* Unlike SO_COOKIE, the length must be exactly sizeof(u64). */
                lv = sizeof(u64);
                if (len != lv)
                        return -EINVAL;
                v.val64 = sock_net(sk)->net_cookie;
                break;

        case SO_BUF_LOCK:
                v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
                break;

        case SO_RESERVE_MEM:
                v.val = READ_ONCE(sk->sk_reserved_mem);
                break;

        case SO_TXREHASH:
                /* Paired with WRITE_ONCE() in sk_setsockopt() */
                v.val = READ_ONCE(sk->sk_txrehash);
                break;

        default:
                /* We implement the SO_SNDLOWAT etc to not be settable
                 * (1003.1g 7).
                 */
                return -ENOPROTOOPT;
        }

        /* Common tail: never copy more than the value's size. */
        if (len > lv)
                len = lv;
        if (copy_to_sockptr(optval, &v, len))
                return -EFAULT;
lenout:
        /* Report the number of bytes actually written back to the caller. */
        if (copy_to_sockptr(optlen, &len, sizeof(int)))
                return -EFAULT;
        return 0;
}

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
        unsigned int family;

        sk_owner_clear(sk);
        family = sk->sk_family;

        /* Sockets flagged sk_kern_sock take their lock names and keys from
         * the *_kern_* tables; every other socket uses the regular tables.
         * Both sets are indexed by the socket's address family.
         */
        if (!sk->sk_kern_sock) {
                sock_lock_init_class_and_name(sk,
                                              af_family_slock_key_strings[family],
                                              af_family_slock_keys + family,
                                              af_family_key_strings[family],
                                              af_family_keys + family);
                return;
        }

        sock_lock_init_class_and_name(sk,
                                      af_family_kern_slock_key_strings[family],
                                      af_family_kern_slock_keys + family,
                                      af_family_kern_key_strings[family],
                                      af_family_kern_keys + family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
        const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
        /* Saved now and put back once both copies below are done. */
        void *sptr = nsk->sk_security;
#endif

        /* If we move sk_tx_queue_mapping out of the private section,
         * we must check if sk_tx_queue_clear() is called after
         * sock_copy() in sk_clone_lock().
         */
        BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
                     offsetof(struct sock, sk_dontcopy_begin) ||
                     offsetof(struct sock, sk_tx_queue_mapping) >=
                     offsetof(struct sock, sk_dontcopy_end));

        /* First part: everything located before sk_dontcopy_begin. */
        memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

        /* Second part: from sk_dontcopy_end up to the protocol's obj_size.
         * The [sk_dontcopy_begin, sk_dontcopy_end) range is left untouched.
         */
        unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
                      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
                      /* alloc is larger than struct, see sk_prot_alloc() */);

#ifdef CONFIG_SECURITY_NETWORK
        /* Restore nsk's own pointer, then let the security layer clone state. */
        nsk->sk_security = sptr;
        security_sk_clone(osk, nsk);
#endif
}

/*
 * Allocate the memory for a sock of protocol @prot.
 *
 * The object comes from prot->slab when the protocol has one, otherwise from
 * kmalloc() with prot->obj_size. On success the security layer has been
 * asked to set up the sock (security_sk_alloc()) and a reference on
 * prot->owner has been taken. Returns NULL if any step fails; in that case
 * everything obtained so far has been released again.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
                int family)
{
        struct kmem_cache *slab = prot->slab;
        struct sock *sk;

        if (slab) {
                /* __GFP_ZERO is masked out for the slab allocation; when
                 * want_init_on_alloc() asks for initialisation it is done
                 * by sk_prot_clear_nulls() instead.
                 */
                sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
                if (sk && want_init_on_alloc(priority))
                        sk_prot_clear_nulls(sk, prot->obj_size);
        } else {
                sk = kmalloc(prot->obj_size, priority);
        }

        if (!sk)
                return NULL;

        if (security_sk_alloc(sk, family, priority))
                goto out_free;

        if (!try_module_get(prot->owner))
                goto out_free_sec;

        return sk;

out_free_sec:
        security_sk_free(sk);
out_free:
        if (slab)
                kmem_cache_free(slab, sk);
        else
                kfree(sk);
        return NULL;
}

/*
 * Release a sock obtained from sk_prot_alloc(): drop its cgroup, memcg and
 * security state and its owner, hand the memory back to the allocator it
 * came from, and finally put the module reference on the protocol's owner.
 */
static void sk_prot_free(struct proto *prot, struct sock *sk)
{
        /* Read both up front; they are needed after @sk is gone. */
        struct module *owner = prot->owner;
        struct kmem_cache *slab = prot->slab;

        cgroup_sk_free(&sk->sk_cgrp_data);
        mem_cgroup_sk_free(sk);
        security_sk_free(sk);

        sk_owner_put(sk);

        if (!slab)
                kfree(sk);
        else
                kmem_cache_free(slab, sk);

        module_put(owner);
}

/**
 *        sk_alloc - All socket objects are allocated here
 *        @net: the applicable net namespace
 *        @family: protocol family
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *        @prot: struct proto associated with this new sock instance
 *        @kern: is this to be a kernel socket?
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot, int kern)
{
        struct sock *sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

        if (!sk)
                return NULL;

        sk->sk_family = family;
        /*
         * The reason sk_prot_creator exists is explained by the comment
         * in the struct sock definition -acme
         */
        sk->sk_prot_creator = prot;
        sk->sk_prot = prot;
        sk->sk_kern_sock = kern;
        sock_lock_init(sk);

        /* Only non-kernel sockets hold a counted netns reference. */
        sk->sk_net_refcnt = !kern;
        if (likely(sk->sk_net_refcnt)) {
                get_net_track(net, &sk->ns_tracker, priority);
                sock_inuse_add(net, 1);
        } else {
                net_passive_inc(net);
                __netns_tracker_alloc(net, &sk->ns_tracker,
                                      false, priority);
        }

        sock_net_set(sk, net);
        refcount_set(&sk->sk_wmem_alloc, 1);

        mem_cgroup_sk_alloc(sk);
        cgroup_sk_alloc(&sk->sk_cgrp_data);
        sock_update_classid(&sk->sk_cgrp_data);
        sock_update_netprioidx(&sk->sk_cgrp_data);
        sk_tx_queue_clear(sk);

        return sk;
}
EXPORT_SYMBOL(sk_alloc);

/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
        /* @head is the sk_rcu member embedded in the sock being destroyed. */
        struct sock *sk = container_of(head, struct sock, sk_rcu);
        struct net *net = sock_net(sk);
        struct sk_filter *filter;

        /* Give the socket's own destructor, if any, the first turn. */
        if (sk->sk_destruct)
                sk->sk_destruct(sk);

        /* Detach the filter; the access is checked against sk_wmem_alloc
         * having reached zero.
         */
        filter = rcu_dereference_check(sk->sk_filter,
                                       refcount_read(&sk->sk_wmem_alloc) == 0);
        if (filter) {
                sk_filter_uncharge(sk, filter);
                RCU_INIT_POINTER(sk->sk_filter, NULL);
        }

        sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
        bpf_sk_storage_free(sk);
#endif

        /* Option memory still accounted at this point is reported as a leak. */
        if (atomic_read(&sk->sk_omem_alloc))
                pr_debug("%s: optmem leakage (%d bytes) detected\n",
                         __func__, atomic_read(&sk->sk_omem_alloc));

        /* Drop the reference on the cached page fragment, if one is held. */
        if (sk->sk_frag.page) {
                put_page(sk->sk_frag.page);
                sk->sk_frag.page = NULL;
        }

        /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
        put_cred(sk->sk_peer_cred);
        put_pid(sk->sk_peer_pid);

        /* Undo whichever kind of netns hold this socket has (counted or
         * passive), mirroring the two branches in sk_alloc().
         */
        if (likely(sk->sk_net_refcnt)) {
                put_net_track(net, &sk->ns_tracker);
        } else {
                __netns_tracker_free(net, &sk->ns_tracker, false);
                net_passive_dec(net);
        }
        /* Free through sk_prot_creator rather than sk_prot. */
        sk_prot_free(sk->sk_prot_creator, sk);
}

/*
 * Convert a socket's passive, tracker-only hold on its netns into a counted
 * reference, and account the socket as in use in that netns.
 */
void sk_net_refcnt_upgrade(struct sock *sk)
{
        struct net *netns = sock_net(sk);

        /* A socket that already holds a counted reference is unexpected
         * here; warn once and carry on.
         */
        WARN_ON_ONCE(sk->sk_net_refcnt);

        /* Give up the passive hold and its tracker ... */
        __netns_tracker_free(netns, &sk->ns_tracker, false);
        net_passive_dec(netns);

        /* ... and replace it with a counted one. */
        sk->sk_net_refcnt = 1;
        get_net_track(netns, &sk->ns_tracker, GFP_KERNEL);
        sock_inuse_add(netns, 1);
}
EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);

/*
 * Destroy @sk through __sk_destruct(), either immediately or via call_rcu().
 * The RCU path is taken when the socket has SOCK_RCU_FREE set, or when it
 * had to be detached from a reuseport group first.
 */
void sk_destruct(struct sock *sk)
{
        bool defer = sock_flag(sk, SOCK_RCU_FREE);

        if (rcu_access_pointer(sk->sk_reuseport_cb)) {
                reuseport_detach_sock(sk);
                defer = true;
        }

        if (!defer) {
                __sk_destruct(&sk->sk_rcu);
                return;
        }

        call_rcu(&sk->sk_rcu, __sk_destruct);
}

/*
 * Last step of freeing a socket: undo the in-use accounting for sockets that
 * hold a counted netns reference, then either hand the socket to the
 * sock_diag destroy broadcast (only for such sockets, and only when there
 * are destroy listeners) or destroy it directly.
 */
static void __sk_free(struct sock *sk)
{
        if (likely(sk->sk_net_refcnt))
                sock_inuse_add(sock_net(sk), -1);

        if (likely(!sk->sk_net_refcnt ||
                   !sock_diag_has_destroy_listeners(sk))) {
                sk_destruct(sk);
                return;
        }

        sock_diag_broadcast_destroy(sk);
}

void sk_free(struct sock *sk)
{
        /*
         * Drop our own unit of sk_wmem_alloc. If the count does not hit
         * zero, packets are still sitting in some tx queue and
         * sock_wfree() will call __sk_free(sk) later on our behalf.
         */
        if (!refcount_dec_and_test(&sk->sk_wmem_alloc))
                return;

        __sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Initialise the receive, write and error queues and the callback lock of
 * @sk, and give each of their locks a per-address-family lockdep class and
 * name. Kernel sockets use a separate class table for the callback lock.
 */
static void sk_init_common(struct sock *sk)
{
        const unsigned int family = sk->sk_family;

        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);

        rwlock_init(&sk->sk_callback_lock);

        lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
                                   af_rlock_keys + family,
                                   af_family_rlock_key_strings[family]);
        lockdep_set_class_and_name(&sk->sk_write_queue.lock,
                                   af_wlock_keys + family,
                                   af_family_wlock_key_strings[family]);
        lockdep_set_class_and_name(&sk->sk_error_queue.lock,
                                   af_elock_keys + family,
                                   af_family_elock_key_strings[family]);

        if (!sk->sk_kern_sock) {
                lockdep_set_class_and_name(&sk->sk_callback_lock,
                                           af_callback_keys + family,
                                           af_family_clock_key_strings[family]);
                return;
        }

        lockdep_set_class_and_name(&sk->sk_callback_lock,
                                   af_kern_callback_keys + family,
                                   af_family_kern_clock_key_strings[family]);
}

/**
 *        sk_clone_lock - clone a socket, and lock its clone
 *        @sk: the socket to clone
 *        @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *        Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
        struct proto *prot = READ_ONCE(sk->sk_prot);
        struct sk_filter *filter;
        bool is_charged = true;
        struct sock *newsk;

        newsk = sk_prot_alloc(prot, priority, sk->sk_family);
        if (!newsk)
                goto out;

        /* Start from a raw copy of the parent, then reset per-socket state. */
        sock_copy(newsk, sk);

        newsk->sk_prot_creator = prot;

        /* Take the clone's own hold on the netns, of the kind selected by
         * its sk_net_refcnt flag.
         */
        if (likely(newsk->sk_net_refcnt)) {
                get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
                sock_inuse_add(sock_net(newsk), 1);
        } else {
                /* Kernel sockets are not elevating the struct net refcount.
                 * Instead, use a tracker to more easily detect if a layer
                 * is not properly dismantling its kernel sockets at netns
                 * destroy time.
                 */
                net_passive_inc(sock_net(newsk));
                __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
                                      false, priority);
        }
        sk_node_init(&newsk->sk_node);
        sock_lock_init(newsk);
        /* The clone is returned locked; see the function's header comment. */
        bh_lock_sock(newsk);
        newsk->sk_backlog.head        = newsk->sk_backlog.tail = NULL;
        newsk->sk_backlog.len = 0;

        atomic_set(&newsk->sk_rmem_alloc, 0);

        /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
        refcount_set(&newsk->sk_wmem_alloc, 1);

        atomic_set(&newsk->sk_omem_alloc, 0);
        sk_init_common(newsk);

        /* Clear cached route, memory accounting and send state in the clone. */
        newsk->sk_dst_cache        = NULL;
        newsk->sk_dst_pending_confirm = 0;
        newsk->sk_wmem_queued        = 0;
        newsk->sk_forward_alloc = 0;
        newsk->sk_reserved_mem  = 0;
        atomic_set(&newsk->sk_drops, 0);
        newsk->sk_send_head        = NULL;
        /* Keep the parent's user locks except SOCK_BINDPORT_LOCK. */
        newsk->sk_userlocks        = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
        atomic_set(&newsk->sk_zckey, 0);

        sock_reset_flag(newsk, SOCK_DONE);

        /* sk->sk_memcg will be populated at accept() time */
        newsk->sk_memcg = NULL;

        cgroup_sk_clone(&newsk->sk_cgrp_data);

        rcu_read_lock();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                /* though it's an empty new sock, the charging may fail
                 * if sysctl_optmem_max was changed between creation of
                 * original socket and cloning
                 */
                is_charged = sk_filter_charge(newsk, filter);
        RCU_INIT_POINTER(newsk->sk_filter, filter);
        rcu_read_unlock();

        if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
                /* We need to make sure that we don't uncharge the new
                 * socket if we couldn't charge it in the first place
                 * as otherwise we uncharge the parent's filter.
                 */
                if (!is_charged)
                        RCU_INIT_POINTER(newsk->sk_filter, NULL);
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }
        RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

        if (bpf_sk_storage_clone(sk, newsk)) {
                sk_free_unlock_clone(newsk);
                newsk = NULL;
                goto out;
        }

        /* Clear sk_user_data if parent had the pointer tagged
         * as not suitable for copying when cloning.
         */
        if (sk_user_data_is_nocopy(newsk))
                newsk->sk_user_data = NULL;

        newsk->sk_err           = 0;
        newsk->sk_err_soft = 0;
        newsk->sk_priority = 0;
        newsk->sk_incoming_cpu = raw_smp_processor_id();

        /* Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&newsk->sk_refcnt, 2);

        /* The clone is not attached to any struct socket or wait queue yet. */
        sk_set_socket(newsk, NULL);
        sk_tx_queue_clear(newsk);
        RCU_INIT_POINTER(newsk->sk_wq, NULL);

        if (newsk->sk_prot->sockets_allocated)
                sk_sockets_allocated_inc(newsk);

        if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
                net_enable_timestamp();
out:
        /* NULL on allocation or clone failure, otherwise the locked clone. */
        return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_free_unlock_clone(struct sock *sk)
{
        /* The clone is still a raw copy of its parent, so clear the
         * destructor pointer before unlocking and doing a plain sk_free().
         */
        sk->sk_destruct = NULL;
        bh_unlock_sock(sk);
        sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);

/*
 * Work out the GSO size limit for @sk on the device behind @dst.
 *
 * The device's gso_max_size is used for AF_INET6 sockets whose receive
 * address is not v4-mapped (only when IPv6 is enabled), gso_ipv4_max_size
 * otherwise. Non-TCP sockets are capped at GSO_LEGACY_MAX_SIZE. The result
 * has MAX_TCP_HEADER + 1 subtracted from it.
 */
static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
{
        bool use_v6_limit = false;
        u32 limit;

#if IS_ENABLED(CONFIG_IPV6)
        if (sk->sk_family == AF_INET6 &&
            !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
                use_v6_limit = true;
#endif
        /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
        if (use_v6_limit)
                limit = READ_ONCE(dst->dev->gso_max_size);
        else
                limit = READ_ONCE(dst->dev->gso_ipv4_max_size);

        if (limit > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
                limit = GSO_LEGACY_MAX_SIZE;

        return limit - (MAX_TCP_HEADER + 1);
}

/*
 * Derive the socket's route capabilities (sk_route_caps) and GSO limits
 * from the device behind @dst, then install @dst on the socket with
 * sk_dst_set().
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
        /* Default used whenever the GSO limits are not taken from the device. */
        u32 max_segs = 1;

        sk->sk_route_caps = dst->dev->features;
        if (sk_is_tcp(sk)) {
                struct inet_connection_sock *icsk = inet_csk(sk);

                /* TCP sockets always get NETIF_F_GSO and pick up the
                 * route's RTAX_QUICKACK metric.
                 */
                sk->sk_route_caps |= NETIF_F_GSO;
                icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
        }
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        /* A socket with sk_gso_disabled set loses every GSO capability bit. */
        if (unlikely(sk->sk_gso_disabled))
                sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
        if (sk_can_gso(sk)) {
                if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
                        /* Extra header space without xfrm offload: no GSO. */
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
                } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
                        sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
                        /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
                        max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
                }
        }
        sk->sk_gso_max_segs = max_segs;
        sk_dst_set(sk, dst);
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *        Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;
        /* Amount to give back to sk_wmem_alloc in the final step below. */
        unsigned int len = skb->truesize;
        bool free;

        if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
                /* Sockets freed via RCU that use the default write-space
                 * callback: subtract and notify inside one RCU read-side
                 * section, and free afterwards if the count hit zero.
                 * NOTE(review): presumably RCU is what keeps sk usable for
                 * the notification once the count is dropped - confirm.
                 */
                if (sock_flag(sk, SOCK_RCU_FREE) &&
                    sk->sk_write_space == sock_def_write_space) {
                        rcu_read_lock();
                        free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
                        sock_def_write_space_wfree(sk);
                        rcu_read_unlock();
                        if (unlikely(free))
                                __sk_free(sk);
                        return;
                }

                /*
                 * Keep a reference on sk_wmem_alloc, this will be released
                 * after sk_write_space() call
                 */
                WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
                sk->sk_write_space(sk);
                /* Only the single unit kept above is left to release. */
                len = 1;
        }
        /*
         * if sk_wmem_alloc reaches 0, we must finish what sk_free()
         * could not do because of in-flight packets
         */
        if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
                __sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
        struct sock *owner = skb->sk;

        /* Return the skb's truesize; whoever brings the count to zero
         * finishes freeing the socket.
         */
        if (!refcount_sub_and_test(skb->truesize, &owner->sk_wmem_alloc))
                return;

        __sk_free(owner);
}

/*
 * Make @sk the write-side owner of @skb: any previous owner is dropped with
 * skb_orphan(), sock_wfree() becomes the skb's destructor, and the skb's
 * truesize is added to sk->sk_wmem_alloc.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
        skb_orphan(skb);
#ifdef CONFIG_INET
        /* Sockets that are not full sockets take the edemux path instead. */
        if (unlikely(!sk_fullsock(sk)))
                return skb_set_owner_edemux(skb, sk);
#endif
        skb->sk = sk;
        skb->destructor = sock_wfree;
        skb_set_hash_from_sk(skb, sk);
        /*
         * We used to take a refcount on sk, but following operation
         * is enough to guarantee sk_free() won't free this sock until
         * all in-flight packets are completed
         */
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
        /* Drivers depend on in-order delivery for crypto offload,
         * partial orphan breaks out-of-order-OK logic.
         */
        if (skb_is_decrypted(skb))
                return false;

        return (skb->destructor == sock_wfree ||
                (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
        /* Pure TCP ACKs are left exactly as they are. */
        if (skb_is_tcp_pure_ack(skb))
                return;

        /* Fall back to a full orphan when the skb does not qualify for the
         * partial variant, or when re-owning it through
         * skb_set_owner_sk_safe() does not succeed.
         */
        if (!can_skb_orphan_partial(skb) ||
            !skb_set_owner_sk_safe(skb, skb->sk))
                skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
        unsigned int truesize = skb->truesize;
        struct sock *owner = skb->sk;

        /* Give the skb's size back to the receive-side accounting. */
        atomic_sub(truesize, &owner->sk_rmem_alloc);
        sk_mem_uncharge(owner, truesize);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
        /* Drop one reference on the owning socket.
         * NOTE(review): presumably the one taken when this destructor was
         * installed on the skb - confirm against the callers.
         */
        sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
        struct sock *sk = skb->sk;

        /* Nothing was taken for sockets that are not refcounted. */
        if (!sk_is_refcounted(sk))
                return;

        /* The common case: simply drop the reference. */
        if (sk->sk_state != TCP_NEW_SYN_RECV || !inet_reqsk(sk)->syncookie) {
                sock_gen_put(sk);
                return;
        }

        /* Request socks flagged as syncookie are freed directly, with the
         * listener pointer cleared first.
         */
        inet_reqsk(sk)->rsk_listener = NULL;
        reqsk_free(inet_reqsk(sk));
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

/*
 * Return the uid of the inode behind @sk's struct socket, or GLOBAL_ROOT_UID
 * when the sock has no struct socket attached. sk_callback_lock is held for
 * reading, with bottom halves disabled, while sk_socket is examined.
 */
kuid_t sock_i_uid(struct sock *sk)
{
        kuid_t uid = GLOBAL_ROOT_UID;

        read_lock_bh(&sk->sk_callback_lock);
        if (sk->sk_socket)
                uid = SOCK_INODE(sk->sk_socket)->i_uid;
        read_unlock_bh(&sk->sk_callback_lock);

        return uid;
}
EXPORT_SYMBOL(sock_i_uid);

/*
 * Return the inode number behind @sk's struct socket, or 0 when the sock has
 * no struct socket attached. Takes sk_callback_lock for reading without
 * touching bottom halves itself; sock_i_ino() is the variant that disables
 * them around this call.
 */
unsigned long __sock_i_ino(struct sock *sk)
{
        unsigned long ino = 0;

        read_lock(&sk->sk_callback_lock);
        if (sk->sk_socket)
                ino = SOCK_INODE(sk->sk_socket)->i_ino;
        read_unlock(&sk->sk_callback_lock);

        return ino;
}
EXPORT_SYMBOL(__sock_i_ino);

/*
 * Same lookup as __sock_i_ino(), performed with bottom halves disabled
 * around the call. Returns the inode number, or 0 if there is no socket.
 */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long ino;

        local_bh_disable();
        ino = __sock_i_ino(sk);
        local_bh_enable();
        return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
                             gfp_t priority)
{
        struct sk_buff *skb;

        /* Unless forced, refuse once the send buffer limit is reached. */
        if (!force &&
            refcount_read(&sk->sk_wmem_alloc) >= READ_ONCE(sk->sk_sndbuf))
                return NULL;

        skb = alloc_skb(size, priority);
        if (skb)
                skb_set_owner_w(skb, sk);

        return skb;
}
EXPORT_SYMBOL(sock_wmalloc);

/* skb destructor installed by sock_omalloc(): returns the skb's truesize to
 * the owning socket's option-memory accounting.
 */
static void sock_ofree(struct sk_buff *skb)
{
        atomic_sub(skb->truesize, &skb->sk->sk_omem_alloc);
}

/*
 * Allocate an skb of @size bytes charged to @sk's option memory
 * (sk_omem_alloc). Returns NULL when the charge would push the socket past
 * the netns' sysctl_optmem_max, or when the allocation itself fails. The
 * charge is undone by sock_ofree() when the skb is destroyed.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
                             gfp_t priority)
{
        struct sk_buff *skb = NULL;

        /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
        if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) <=
            READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
                skb = alloc_skb(size, priority);

        if (skb) {
                atomic_add(skb->truesize, &sk->sk_omem_alloc);
                skb->sk = sk;
                skb->destructor = sock_ofree;
        }

        return skb;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
        int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
        void *mem;

        /* Reject requests that are too large on their own, or that would
         * bring the socket's option memory up to the limit.
         */
        if ((unsigned int)size > optmem_max ||
            atomic_read(&sk->sk_omem_alloc) + size >= optmem_max)
                return NULL;

        /* Charge before allocating, to avoid the race if kmalloc
         * might sleep; the charge is rolled back on failure.
         */
        atomic_add(size, &sk->sk_omem_alloc);
        mem = kmalloc(size, priority);
        if (!mem)
                atomic_sub(size, &sk->sk_omem_alloc);

        return mem;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Duplicate the input "src" memory block using the socket's
 * option memory buffer.
 */
void *sock_kmemdup(struct sock *sk, const void *src,
                   int size, gfp_t priority)
{
        void *copy = sock_kmalloc(sk, size, priority);

        /* NULL is passed straight through when the allocation is refused. */
        if (!copy)
                return NULL;

        memcpy(copy, src, size);
        return copy;
}
EXPORT_SYMBOL(sock_kmemdup);

/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
                                  const bool nullify)
{
        /* A NULL block is warned about once; accounting is left alone. */
        if (WARN_ON_ONCE(!mem))
                return;

        if (!nullify)
                kfree(mem);
        else
                kfree_sensitive(mem);

        /* Return the block's size to the socket's option-memory count. */
        atomic_sub(size, &sk->sk_omem_alloc);
}

/* Free an option memory block of @size bytes with kfree() and subtract
 * @size from the socket's sk_omem_alloc. A NULL @mem triggers a one-time
 * warning and is otherwise ignored.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);

/* Like sock_kfree_s(), but the block is released with kfree_sensitive()
 * (presumably so that its contents are wiped before the memory is freed).
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);

/* Sleep until there may be room in @sk's send buffer.
 *
 * This is almost wait_for_tcp_memory() minus the release_sock()/lock_sock()
 * pair; arguably those locks should be removed for datagram sockets.
 *
 * The wait ends when the timeout is exhausted, a signal is pending, the
 * write allocation drops below sk_sndbuf, the send side is shut down, or a
 * socket error is set. Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                /* Queue ourselves first, then re-check every wake condition
                 * before actually sleeping.
                 */
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        break;
                if (READ_ONCE(sk->sk_err))
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);
        return timeo;
}


/*
 *        Generic send/receive buffer handlers
 */

/*
 * Allocate an skb for sending on @sk, waiting for send-buffer space if
 * needed.
 *
 * @header_len, @data_len and @max_page_order are passed on to
 * alloc_skb_with_frags(). @noblock is used to select the send timeout via
 * sock_sndtimeo().
 *
 * Returns the new skb, owned by @sk (skb_set_owner_w()), or NULL. On the
 * failure paths in this function *@errcode is set to: the pending socket
 * error, -EPIPE if the send side is shut down, -EAGAIN if no timeout is
 * left, or sock_intr_errno() if a signal is pending. If the final
 * allocation fails, @errcode is whatever alloc_skb_with_frags() stored.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        goto failure;

                /* Room in the send buffer: go and allocate. */
                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                /* No room: flag the socket as out of space before either
                 * giving up or sleeping.
                 */
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

/*
 * Apply one control message to the send-time cookie @sockc.
 *
 * The message is dispatched on cmsg->cmsg_type; the caller in this file,
 * sock_cmsg_send(), only passes messages whose level is SOL_SOCKET.
 *
 * Returns 0 on success, -EINVAL for an unknown type, a wrong payload length
 * or a value that is not acceptable, and -EPERM when the caller lacks the
 * privilege needed for SO_MARK or SO_PRIORITY.
 */
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc)
{
        u32 tsflags;

        BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));

        switch (cmsg->cmsg_type) {
        case SO_MARK:
                /* Needs CAP_NET_RAW or CAP_NET_ADMIN in the netns' user ns. */
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
                break;
        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;

                /* Only the TX record flags may be supplied per message. */
                tsflags = *(u32 *)CMSG_DATA(cmsg);
                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
                        return -EINVAL;

                /* Replace the TX record bits, keep everything else. */
                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
                sockc->tsflags |= tsflags;
                break;
        case SCM_TXTIME:
                /* Only accepted on sockets that have SOCK_TXTIME set. */
                if (!sock_flag(sk, SOCK_TXTIME))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
                        return -EINVAL;
                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
                break;
        case SCM_TS_OPT_ID:
                /* Rejected for TCP, and requires SOF_TIMESTAMPING_OPT_ID
                 * to be set in the socket's timestamping flags.
                 */
                if (sk_is_tcp(sk))
                        return -EINVAL;
                tsflags = READ_ONCE(sk->sk_tsflags);
                if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
                sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
                break;
        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
        case SCM_RIGHTS:
        case SCM_CREDENTIALS:
                break;
        case SO_PRIORITY:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
                        return -EPERM;
                sockc->priority = *(u32 *)CMSG_DATA(cmsg);
                break;
        default:
                return -EINVAL;
        }
        return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

/*
 * Walk every control message attached to @msg and apply the SOL_SOCKET ones
 * to @sockc via __sock_cmsg_send(). Messages of any other level are skipped.
 *
 * Returns 0 on success, -EINVAL for a malformed control message, or the
 * first error reported by __sock_cmsg_send().
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
                   struct sockcm_cookie *sockc)
{
        struct cmsghdr *cmsg;

        for_each_cmsghdr(cmsg, msg) {
                int err;

                if (!CMSG_OK(msg, cmsg))
                        return -EINVAL;

                if (cmsg->cmsg_level == SOL_SOCKET) {
                        err = __sock_cmsg_send(sk, cmsg, sockc);
                        if (err)
                                return err;
                }
        }

        return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

/* Invoke the protocol's enter_memory_pressure hook, if it provides one. */
static void sk_enter_memory_pressure(struct sock *sk)
{
        if (sk->sk_prot->enter_memory_pressure)
                sk->sk_prot->enter_memory_pressure(sk);
}

/*
 * Leave memory pressure for @sk's protocol: call its leave_memory_pressure
 * hook when there is one, otherwise clear the protocol's memory_pressure
 * word directly if it exists and is currently non-zero.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
        unsigned long *pressure;

        if (sk->sk_prot->leave_memory_pressure) {
                INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
                                     tcp_leave_memory_pressure, sk);
                return;
        }

        pressure = sk->sk_prot->memory_pressure;
        if (pressure && READ_ONCE(*pressure))
                WRITE_ONCE(*pressure, 0);
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less or equal than PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        if (pfrag->page) {
                /* A reference count of one means ours is the only
                 * reference, so the page can be reused from the start.
                 */
                if (page_ref_count(pfrag->page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                /* Otherwise keep using it while @sz bytes still fit. */
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                /* Exhausted: drop our reference and get a fresh page. */
                put_page(pfrag->page);
        }

        pfrag->offset = 0;
        /* Try a high-order page first, unless that is compiled out
         * (SKB_FRAG_PAGE_ORDER == 0) or disabled through the static key.
         */
        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                          __GFP_COMP | __GFP_NOWARN |
                                          __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }
        /* Fall back to a single page using the caller's gfp flags as given. */
        pfrag->page = alloc_page(gfp);
        if (likely(pfrag->page)) {
                pfrag->size = PAGE_SIZE;
                return true;
        }
        /* Both attempts failed; pfrag->page is NULL at this point. */
        return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

/**
 * sk_page_frag_refill - refill a page_frag on behalf of a socket
 * @sk: socket; its sk_allocation is used as the allocation mask
 * @pfrag: page_frag to check and, if needed, refill
 *
 * Asks skb_page_frag_refill() for at least 32 bytes. If that fails, the
 * socket enters memory pressure and sk_stream_moderate_sndbuf() is called.
 *
 * Return: true if @pfrag is usable, false otherwise.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
        bool refilled = skb_page_frag_refill(32U, pfrag, sk->sk_allocation);

        if (likely(refilled))
                return true;

        sk_enter_memory_pressure(sk);
        sk_stream_moderate_sndbuf(sk);
        return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);

/*
 * Sleep until the socket is no longer owned by a user context.
 *
 * Entered and left with sk->sk_lock.slock held; the spinlock is dropped
 * around each schedule() and re-taken before ownership is re-checked.
 */
void __lock_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        DEFINE_WAIT(wait);

        do {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
        } while (sock_owned_by_user(sk));

        finish_wait(&sk->sk_lock.wq, &wait);
}

/*
 * Hand every skb queued on the socket backlog to sk_backlog_rcv().
 *
 * Entered and left with sk->sk_lock.slock held; the spinlock is released
 * while each detached batch of skbs is being processed.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;

        /* The head is re-read after each batch, once the lock is held again. */
        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Detach the whole list before dropping the lock. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                do {
                        /* Save the successor before the skb is unlinked. */
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        cond_resched();

                        skb = next;
                } while (skb != NULL);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we cannot loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}

/*
 * Process the socket backlog with __release_sock() and then call the
 * protocol's release_cb, if it has one. Takes sk->sk_lock.slock itself
 * (__release_sock() drops and re-takes it while processing).
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs
 * from @skb.
 *
 * Return: the value produced by sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        /* SOCKWQ_ASYNC_WAITDATA is set only for the duration of the wait. */
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *        __sk_mem_raise_allocated - increase memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @amt: pages to allocate
 *        @kind: allocation type
 *
 *        Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *        Unlike the globally shared limits among the sockets under same protocol,
 *        consuming the budget of a memcg won't have direct effect on other ones.
 *        So be optimistic about memcg's tolerance, and leave the callers to decide
 *        whether or not to raise allocated through sk_under_memory_pressure() or
 *        its variants.
 *
 *        Return: 1 if the allocation is accepted, 0 if it is suppressed. On
 *        suppression the @amt pages added here (and any memcg charge made
 *        here) are subtracted again before returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
        struct proto *prot = sk->sk_prot;
        bool charged = false;
        long allocated;

        /* Account first; the checks below decide whether to keep it. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);

        if (memcg) {
                if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
                        goto suppress_allocation;
                charged = true;
        }

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* Guarantee minimum buffer size under pressure (either global
         * or memcg) to make sure features described in RFC 7323 (TCP
         * Extensions for High Performance) work properly.
         *
         * This rule does NOT apply once the global or memcg hard limit
         * is exceeded, or else a DoS attack could be mounted by spawning
         * lots of sockets whose usage is under the minimum buffer size.
         */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                /* Stream sockets are judged by sk_wmem_queued, all others
                 * by sk_wmem_alloc.
                 */
                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                /* The following 'average' heuristic is within the
                 * scope of global accounting, so it only makes
                 * sense for global memory pressure.
                 */
                if (!sk_under_global_memory_pressure(sk))
                        return 1;

                /* Try to be fair among all the sockets under global
                 * pressure by allowing the ones that are below average
                 * usage to raise.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg && !charged) {
                                mem_cgroup_charge_skmem(memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Refused: undo the accounting done at the top of the function. */
        sk_memory_allocated_sub(sk, amt);

        if (charged)
                mem_cgroup_uncharge_skmem(memcg, amt);

        return 0;
}

/**
 *        __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @size: memory size to allocate
 *        @kind: allocation type
 *
 *        If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *        rmem allocation. This function assumes that protocols which have
 *        memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *        Return: the result of __sk_mem_raise_allocated(); when that is 0 the
 *        sk_forward_alloc credit added here has been taken back.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
        int pages = sk_mem_pages(size);
        int bytes = pages << PAGE_SHIFT;
        int allowed;

        /* Credit sk_forward_alloc up front, roll back if the raise fails. */
        sk_forward_alloc_add(sk, bytes);
        allowed = __sk_mem_raise_allocated(sk, size, pages, kind);
        if (!allowed)
                sk_forward_alloc_add(sk, -bytes);

        return allowed;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *        __sk_mem_reduce_allocated - reclaim memory_allocated
 *        @sk: socket
 *        @amount: number of quanta
 *
 *        Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc.
 *        Also uncharges the socket's memcg when socket memcg accounting is
 *        enabled, and leaves memory pressure once the protocol's allocation
 *        has dropped below limit 0.
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
        sk_memory_allocated_sub(sk, amount);

        if (mem_cgroup_sockets_enabled && sk->sk_memcg)
                mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

        if (!sk_under_global_memory_pressure(sk))
                return;

        if (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))
                sk_leave_memory_pressure(sk);
}

/**
 *        __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *        @sk: socket
 *        @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
        /* Whole pages contained in @amount; the remainder is ignored. */
        int pages = amount >> PAGE_SHIFT;

        sk_forward_alloc_add(sk, -(pages << PAGE_SHIFT));
        __sk_mem_reduce_allocated(sk, pages);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/* Store @val in sk->sk_peek_off with WRITE_ONCE(). Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_peek_off, val);
        return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

/* bind stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

/* connect stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

/* socketpair stub for protocols without support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

/* accept stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
                   struct proto_accept_arg *arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

/* getname stub for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

/* ioctl stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

/* listen stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

/* shutdown stub for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

/* sendmsg stub for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

/* Like sock_no_sendmsg(), but takes a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

/* recvmsg stub for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

/* mmap stub for protocols without mmap support. Unlike the other
 * sock_no_*() stubs in this group it returns -ENODEV, not -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

/*
 * When a file is received (via SCM_RIGHTS, etc) and it is backed by a
 * socket, refresh that socket's cgroup data: the netprio index and the
 * classid.
 */
void __receive_sock(struct file *file)
{
        struct socket *sock = sock_from_file(file);

        /* sock_from_file() gave no socket: nothing to update. */
        if (!sock)
                return;

        sock_update_netprioidx(&sock->sk->sk_cgrp_data);
        sock_update_classid(&sock->sk->sk_cgrp_data);
}

/*
 *        Default Socket Callbacks
 */

/* Default sk_state_change callback (installed by sock_init_data_uid()):
 * wake all interruptible sleepers on the socket's wait queue.
 */
static void sock_def_wakeup(struct sock *sk)
{
        struct socket_wq *wq;

        /* sk->sk_wq is read with rcu_dereference(), hence the RCU section. */
        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_all(&wq->wait);
        rcu_read_unlock();
}

/* Default sk_error_report callback (installed by sock_init_data_uid()):
 * wake pollers with EPOLLERR and send the async POLL_ERR notification.
 */
static void sock_def_error_report(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_poll(&wq->wait, EPOLLERR);
        /* Issued whether or not a sleeper was found. */
        sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
        rcu_read_unlock();
}

/* Default sk_data_ready callback (installed by sock_init_data_uid()):
 * emit the sk_data_ready tracepoint, wake pollers waiting for readable
 * events and send the async POLL_IN notification.
 */
void sock_def_readable(struct sock *sk)
{
        struct socket_wq *wq;

        trace_sk_data_ready(sk);

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (skwq_has_sleeper(wq))
                wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
                                                EPOLLRDNORM | EPOLLRDBAND);
        /* Issued whether or not a sleeper was found. */
        sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
        rcu_read_unlock();
}

/* Default sk_write_space callback (installed by sock_init_data_uid()):
 * if sock_writeable() holds, wake pollers waiting for writable events and
 * send the async POLL_OUT notification. Otherwise nothing is done.
 */
static void sock_def_write_space(struct sock *sk)
{
        struct socket_wq *wq;

        rcu_read_lock();

        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                wq = rcu_dereference(sk->sk_wq);
                if (skwq_has_sleeper(wq))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }

        rcu_read_unlock();
}

/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 *
 * Differences from sock_def_write_space() visible here: it takes no RCU
 * read lock of its own, and it replaces skwq_has_sleeper() with an
 * explicit smp_mb__after_atomic() followed by waitqueue_active().
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                struct socket_wq *wq = rcu_dereference(sk->sk_wq);

                /* rely on refcount_sub from sock_wfree() */
                smp_mb__after_atomic();
                if (wq && waitqueue_active(&wq->wait))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
}

/* Default sk_destruct callback (installed by sock_init_data_uid()):
 * intentionally does nothing.
 */
static void sock_def_destruct(struct sock *sk)
{
}

/*
 * Call send_sigurg() on the file behind the socket and, if it returns
 * non-zero, send the async POLL_PRI notification. Does nothing when the
 * sock has no struct socket or that socket has no file.
 */
void sk_send_sigurg(struct sock *sk)
{
        if (!sk->sk_socket || !sk->sk_socket->file)
                return;

        if (send_sigurg(sk->sk_socket->file))
                sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

/*
 * (Re)arm @timer to fire at @expires. When mod_timer() returns 0 a
 * reference on @sk is taken with sock_hold().
 * NOTE(review): presumably 0 means the timer was not already pending, so
 * each pending timer holds exactly one socket reference - confirm.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        if (mod_timer(timer, expires))
                return;

        sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

/*
 * Delete @timer; when timer_delete() returns non-zero, drop one reference
 * on @sk with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        if (!timer_delete(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

/*
 * Same as sk_stop_timer(), but uses timer_delete_sync(); when that
 * returns non-zero, one reference on @sk is dropped with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        if (!timer_delete_sync(timer))
                return;

        __sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);

/**
 * sock_init_data_uid - set the generic fields of a new sock to defaults
 * @sock: socket to attach to @sk, or NULL
 * @sk: sock to initialise
 * @uid: value stored in sk->sk_uid
 *
 * Installs the sock_def_*() callbacks, default buffer sizes and timeouts,
 * links @sk and @sock to each other when @sock is non-NULL, and finally
 * sets sk_refcnt to 1.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =        NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation        =        GFP_KERNEL;
        sk->sk_rcvbuf                =        READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf                =        READ_ONCE(sysctl_wmem_default);
        sk->sk_state                =        TCP_CLOSE;
        sk->sk_use_task_frag        =        true;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a socket: inherit its type, share its wait queue and point
         * the socket back at this sock. Without one there is no wait queue.
         */
        if (sock) {
                sk->sk_type        =        sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =        sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid        =        uid;

        /* Default callbacks. */
        sk->sk_state_change        =        sock_def_wakeup;
        sk->sk_data_ready        =        sock_def_readable;
        sk->sk_write_space        =        sock_def_write_space;
        sk->sk_error_report        =        sock_def_error_report;
        sk->sk_destruct                =        sock_def_destruct;

        sk->sk_frag.page        =        NULL;
        sk->sk_frag.offset        =        0;
        sk->sk_peek_off                =        -1;

        sk->sk_peer_pid         =        NULL;
        sk->sk_peer_cred        =        NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending        =        0;
        sk->sk_rcvlowat                =        1;
        sk->sk_rcvtimeo                =        MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo                =        MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id                =        0;
        sk->sk_ll_usec                =        READ_ONCE(sysctl_net_busy_read);
#endif

        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);

/**
 * sock_init_data - sock_init_data_uid() with the uid chosen automatically
 * @sock: socket to attach to @sk, or NULL
 * @sk: sock to initialise
 *
 * The uid is the i_uid of @sock's inode when @sock is given, otherwise
 * kuid 0 mapped in the user namespace of @sk's network namespace.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
        kuid_t uid;

        if (sock)
                uid = SOCK_INODE(sock)->i_uid;
        else
                uid = make_kuid(sock_net(sk)->user_ns, 0);

        sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);

/**
 * lock_sock_nested - take ownership of a socket, sleeping if needed
 * @sk: socket to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * If the socket is already owned, waits in __lock_sock() until it is
 * released, then marks it owned. May sleep.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sock_owned_by_user_nocheck(sk))
                __lock_sock(sk);
        /* Ownership is recorded while slock is still held. */
        sk->sk_lock.owned = 1;
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);

/**
 * release_sock - give up ownership of a socket
 * @sk: socket to release
 *
 * Under sk->sk_lock.slock: processes the backlog if it is non-empty,
 * calls the protocol's release_cb if present, releases ownership and
 * wakes any task waiting on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        /* A non-NULL tail means skbs are queued on the backlog. */
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        sock_release_ownership(sk);
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * __lock_sock_fast - lock a socket, preferring the spinlock-only fast path
 * @sk: socket to lock
 *
 * Return: false on the fast path, in which case sk->sk_lock.slock is
 * still held and bottom halves are disabled; true on the slow path, in
 * which case the socket is marked owned and the spinlock has been
 * released.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sock_owned_by_user_nocheck(sk)) {
                /*
                 * Fast path return with bottom halves disabled and
                 * sock::sk_lock.slock held.
                 *
                 * The 'mutex' is not contended and holding
                 * sock::sk_lock.slock prevents all other lockers to
                 * proceed so the corresponding unlock_sock_fast() can
                 * avoid the slow path of release_sock() completely and
                 * just release slock.
                 *
                 * From a semantical POV this is equivalent to 'acquiring'
                 * the 'mutex', hence the corresponding lockdep
                 * mutex_release() has to happen in the fast path of
                 * unlock_sock_fast().
                 */
                return false;
        }

        /* Slow path: wait for the current owner, then take ownership. */
        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        __acquire(&sk->sk_lock.slock);
        spin_unlock_bh(&sk->sk_lock.slock);
        return true;
}
EXPORT_SYMBOL(__lock_sock_fast);

/**
 * sock_gettstamp - copy the socket's stored timestamp to user space
 * @sock: socket to query
 * @userstamp: user buffer receiving the timestamp
 * @timeval: true to report microseconds instead of nanoseconds
 * @time32: true to use the old 32-bit layout (CONFIG_COMPAT_32BIT_TIME only)
 *
 * Enables SOCK_TIMESTAMP on the socket. If the stored stamp has a seconds
 * value of 0, the current real time is stored and reported instead.
 *
 * Return: -ENOENT if the stored stamp has a seconds value of -1,
 * otherwise the result of copying the value to @userstamp.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
                   bool timeval, bool time32)
{
        struct sock *sk = sock->sk;
        struct timespec64 ts;

        sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec64(sock_read_timestamp(sk));
        if (ts.tv_sec == -1)
                return -ENOENT;
        if (ts.tv_sec == 0) {
                ktime_t kt = ktime_get_real();
                sock_write_timestamp(sk, kt);
                ts = ktime_to_timespec64(kt);
        }

        /* From here on tv_nsec carries microseconds when @timeval is set. */
        if (timeval)
                ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
        if (time32)
                return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
        /* beware of padding in sparc64 timeval */
        if (timeval && !in_compat_syscall()) {
                /* Kernel-stack copy: must not carry the __user annotation,
                 * it is the kernel-side source of copy_to_user().
                 */
                struct __kernel_old_timeval tv = {
                        .tv_sec = ts.tv_sec,
                        .tv_usec = ts.tv_nsec,
                };
                if (copy_to_user(userstamp, &tv, sizeof(tv)))
                        return -EFAULT;
                return 0;
        }
#endif
        return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);

/*
 * Set timestamp flag @flag on @sk if it is not already set, and call
 * net_enable_timestamp() when the socket needs net timestamping and no
 * SK_FLAGS_TIMESTAMP bit was set before.
 */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
        unsigned long previous_flags;

        if (sock_flag(sk, flag))
                return;

        previous_flags = sk->sk_flags;
        sock_set_flag(sk, flag);

        /*
         * we just set one of the two flags which require net
         * time stamping, but time stamping might have been on
         * already because of the other one
         */
        if (sock_needs_netstamp(sk) &&
            !(previous_flags & SK_FLAGS_TIMESTAMP))
                net_enable_timestamp();
}

/**
 * sock_recv_errqueue - receive one message from the socket error queue
 * @sk: socket
 * @msg: destination message
 * @len: maximum number of payload bytes to copy
 * @level: cmsg level for the extended error control message
 * @type: cmsg type for the extended error control message
 *
 * Dequeues one error skb, copies up to @len bytes of it into @msg
 * (setting MSG_TRUNC when the skb is longer), attaches the timestamp and
 * the extended error as control data and sets MSG_ERRQUEUE. The skb is
 * freed in every case.
 *
 * Return: number of bytes copied, -EAGAIN if the error queue is empty,
 * or the error returned by skb_copy_datagram_msg().
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
                       int level, int type)
{
        struct sock_exterr_skb *serr;
        struct sk_buff *skb;
        int copied, err;

        skb = sock_dequeue_err_skb(sk);
        if (skb == NULL)
                return -EAGAIN;

        copied = skb->len;
        if (copied > len) {
                msg->msg_flags |= MSG_TRUNC;
                copied = len;
        }

        err = skb_copy_datagram_msg(skb, 0, msg, copied);
        if (!err) {
                sock_recv_timestamp(msg, sk, skb);

                serr = SKB_EXT_ERR(skb);
                put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

                msg->msg_flags |= MSG_ERRQUEUE;
                err = copied;
        }

        kfree_skb(skb);
        return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *        Get a socket option on a socket.
 *
 *        FIX: POSIX 1003.1g is very ambiguous here. It states that
 *        asynchronous errors should be reported by getsockopt. We assume
 *        this means if you specify SO_ERROR (otherwise what is the point of it).
 *
 *        Forwards to the getsockopt method of the socket's protocol and
 *        returns its result.
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
                           char __user *optval, int __user *optlen)
{
        struct sock *sk = sock->sk;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

/*
 *        Receive a message through the protocol's recvmsg method.
 *
 *        On success (result >= 0) msg->msg_namelen is set to the address
 *        length reported by the protocol; on error it is left untouched.
 *        Returns the protocol's result.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
                        int flags)
{
        struct sock *sk = sock->sk;
        int addr_len = 0;
        int err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);

        if (err < 0)
                return err;

        msg->msg_namelen = addr_len;
        return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *        Set socket options on an inet socket.
 *
 *        Forwards to the setsockopt method of the socket's protocol and
 *        returns its result.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
                           sockptr_t optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;

        /* IPV6_ADDRFORM can change sk->sk_prot under us. */
        return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

/*
 * Common release path: run the protocol's destroy hook (if any), unhash
 * the sock, orphan it, free its xfrm policy and drop one reference.
 */
void sk_common_release(struct sock *sk)
{
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to socket. But net still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * At this point the socket cannot receive new packets, but it is
         * possible that some packets are in flight because some CPU runs
         * the receiver and did its hash table lookup before we unhashed the
         * socket. They will reach the receive queue and will be purged by
         * the socket destructor.
         *
         * Also we still have packets pending on receive queue and probably,
         * our own packets waiting in device queues. sock_destroy will drain
         * receive queue, but transmitted packets will delay socket destruction
         * until the last reference will be released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

/**
 * sk_get_meminfo - snapshot a socket's memory counters
 * @sk: socket to read
 * @mem: output array with room for SK_MEMINFO_VARS u32 entries
 *
 * The array is zeroed first and then filled in at the SK_MEMINFO_*
 * indices used below.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
        mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
/* One bit per inuse_idx slot handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
        int cpu, idx = prot->inuse_idx;
        int res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

        return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

int sock_inuse_get(struct net *net)
{
        int cpu, res = 0;

        for_each_possible_cpu(cpu)
                res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;

        return res;
}

EXPORT_SYMBOL_GPL(sock_inuse_get);

/* Per-netns init: allocate the per-CPU prot_inuse counters.
 * Returns 0 on success, -ENOMEM if the allocation fails.
 */
static int __net_init sock_inuse_init_net(struct net *net)
{
        net->core.prot_inuse = alloc_percpu(struct prot_inuse);

        return net->core.prot_inuse ? 0 : -ENOMEM;
}

/* Per-netns exit: free the counters allocated by sock_inuse_init_net(). */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
}

/* Per-netns hooks for the in-use counters; registered by net_inuse_init(). */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};

/* Core initcall: register net_inuse_ops; panics if registration fails. */
static __init int net_inuse_init(void)
{
        int err = register_pernet_subsys(&net_inuse_ops);

        if (err)
                panic("Cannot initialize net inuse counters");

        return 0;
}

core_initcall(net_inuse_init);

/*
 * Pick the first free slot in proto_inuse_idx for @prot.
 *
 * Index PROTO_INUSE_NR - 1 is never handed out: getting it is treated as
 * exhaustion. In that case prot->inuse_idx keeps that value, which is
 * what release_proto_idx() checks before clearing a bit.
 *
 * Returns 0 on success, -ENOSPC when no slot is available.
 */
static int assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
                pr_err("PROTO_INUSE_NR exhausted\n");
                return -ENOSPC;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
        return 0;
}

/*
 * Give @prot's slot back to proto_inuse_idx. Index PROTO_INUSE_NR - 1 is
 * the value left behind by a failed assign_proto_idx(), so it is skipped.
 */
static void release_proto_idx(struct proto *prot)
{
        if (prot->inuse_idx == PROTO_INUSE_NR - 1)
                return;

        clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
/* !CONFIG_PROC_FS: no in-use index is tracked; always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}

/* !CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}

#endif

/*
 * Undo tw_prot_init(): free the slab name and destroy the timewait slab,
 * clearing both pointers. A NULL @twsk_prot is accepted and ignored.
 */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
        if (twsk_prot) {
                kfree(twsk_prot->twsk_slab_name);
                twsk_prot->twsk_slab_name = NULL;
                kmem_cache_destroy(twsk_prot->twsk_slab);
                twsk_prot->twsk_slab = NULL;
        }
}

/*
 * Create the timewait sock slab cache for @prot, named "tw_sock_<name>".
 * Does nothing when the protocol has no twsk_prot.
 *
 * Returns 0 on success or when there is nothing to do, -ENOMEM on
 * failure. Nothing is freed here on failure; proto_register() calls
 * tw_prot_cleanup() on its error path.
 */
static int tw_prot_init(const struct proto *prot)
{
        struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

        if (!twsk_prot)
                return 0;

        twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
                                              prot->name);
        if (!twsk_prot->twsk_slab_name)
                return -ENOMEM;

        twsk_prot->twsk_slab =
                kmem_cache_create(twsk_prot->twsk_slab_name,
                                  twsk_prot->twsk_obj_size, 0,
                                  SLAB_ACCOUNT | prot->slab_flags,
                                  NULL);
        if (!twsk_prot->twsk_slab) {
                pr_crit("%s: Can't create timewait sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }

        return 0;
}

/*
 * Undo req_prot_init(): free the slab name and destroy the request sock
 * slab, clearing both pointers. A NULL @rsk_prot is accepted and ignored.
 */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
        if (rsk_prot) {
                kfree(rsk_prot->slab_name);
                rsk_prot->slab_name = NULL;
                kmem_cache_destroy(rsk_prot->slab);
                rsk_prot->slab = NULL;
        }
}

/*
 * Create the request sock slab cache for @prot, named
 * "request_sock_<name>". Does nothing when the protocol has no rsk_prot.
 *
 * Returns 0 on success or when there is nothing to do, -ENOMEM on
 * failure. Nothing is freed here on failure; proto_register() calls
 * req_prot_cleanup() on its error path.
 */
static int req_prot_init(const struct proto *prot)
{
        struct request_sock_ops *rsk_prot = prot->rsk_prot;

        if (!rsk_prot)
                return 0;

        rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
                                        prot->name);
        if (!rsk_prot->slab_name)
                return -ENOMEM;

        rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
                                           rsk_prot->obj_size, 0,
                                           SLAB_ACCOUNT | prot->slab_flags,
                                           NULL);

        if (!rsk_prot->slab) {
                pr_crit("%s: Can't create request sock SLAB cache!\n",
                        prot->name);
                return -ENOMEM;
        }
        return 0;
}

/**
 * proto_register - register a protocol
 * @prot: protocol to register
 * @alloc_slab: non-zero to create the sock, request sock and timewait
 *              sock slab caches for @prot
 *
 * On success @prot has an inuse index assigned and is linked on
 * proto_list. On failure every cache created here is destroyed again.
 *
 * Return: 0 on success; -EINVAL if @prot has memory_allocated but lacks
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS if any slab cache setup fails
 * (the helpers' own error codes are not propagated); otherwise the error
 * from assign_proto_idx().
 */
int proto_register(struct proto *prot, int alloc_slab)
{
        int ret = -ENOBUFS;

        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        /* Index assignment and list insertion happen under proto_list_mutex. */
        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

        /* Error unwinding; each label falls through to the next one. */
out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);

/*
 * Undo proto_register(): release the protocol's index and unlink it from
 * proto_list under proto_list_mutex, then destroy its sock slab cache and
 * run the request-sock / timewait-sock cleanup helpers.
 */
void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        /* Cache teardown is done after the mutex has been dropped. */
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);

/*
 * Request the sock_diag handler module for a family / protocol pair.
 *
 * @family:   address family
 * @protocol: protocol number, or 0 to request the family-level handler
 *
 * With @protocol == 0 the family must be registered (sock_is_registered()),
 * otherwise -ENOENT is returned.  With CONFIG_INET, an AF_INET protocol
 * other than IPPROTO_RAW that is within the inet_protos[] table but has no
 * handler installed also yields -ENOENT, so no module load is attempted
 * for it.  In all other cases the result of request_module() is returned.
 */
int sock_load_diag_module(int family, int protocol)
{
        if (!protocol) {
                if (!sock_is_registered(family))
                        return -ENOENT;

                return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
                                      NETLINK_SOCK_DIAG, family);
        }

#ifdef CONFIG_INET
        /*
         * @protocol is a signed int: bound it on both sides before using it
         * as an index, so a negative value can never read outside
         * inet_protos[].
         */
        if (family == AF_INET &&
            protocol != IPPROTO_RAW &&
            protocol >= 0 &&
            protocol < MAX_INET_PROTOS &&
            !rcu_access_pointer(inet_protos[protocol]))
                return -ENOENT;
#endif

        return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
                              NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);

#ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex and position the iterator at
 * *pos within proto_list (list head included, via seq_list_start_head()).
 * The mutex is released in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}

/* seq_file ->next: advance from @v to the following proto_list element. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}

/* seq_file ->stop: drop the proto_list_mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}

/* Flag character for a method pointer: 'y' when set, 'n' when NULL. */
static char proto_method_implemented(const void *method)
{
        if (method)
                return 'y';

        return 'n';
}
/*
 * Memory currently accounted to @proto, or -1 when the protocol does not
 * provide a memory_allocated counter.
 */
static long sock_prot_memory_allocated(struct proto *proto)
{
        if (proto->memory_allocated == NULL)
                return -1L;

        return proto_memory_allocated(proto);
}

/*
 * Textual memory-pressure state of @proto: "NI" when the protocol has no
 * memory_pressure field set, otherwise "yes" or "no" according to
 * proto_memory_pressure().
 */
static const char *sock_prot_memory_pressure(struct proto *proto)
{
        if (proto->memory_pressure == NULL)
                return "NI";

        if (proto_memory_pressure(proto))
                return "yes";

        return "no";
}

/*
 * Print one row for @proto: name, obj_size, in-use socket count for the
 * seq_file's network namespace, allocated memory, memory-pressure state,
 * max_header, whether a slab cache exists, and the owning module's name.
 * These are followed by eighteen y/n flags, one per method pointer, in the
 * same order as the two-letter column codes printed by proto_seq_show()
 * (cl co di ac io in de sh ss gs se re bi br ha uh gp em).
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}

/*
 * seq_file ->show: the list head itself stands for the header row; any
 * other element is a struct proto and gets a data row.  Always returns 0.
 */
static int proto_seq_show(struct seq_file *seq, void *v)
{
        if (v != &proto_list) {
                proto_seq_printf(seq, list_entry(v, struct proto, node));
                return 0;
        }

        seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                   "protocol", "size", "sockets", "memory",
                   "press", "maxhdr", "slab", "module",
                   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
        return 0;
}

/* seq_file iterator over proto_list, used for the "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};

/*
 * Per-netns init: create the read-only (0444) "protocols" entry under the
 * namespace's proc_net directory.  Returns 0 on success, -ENOMEM when the
 * entry could not be created.
 */
static __net_init int proto_init_net(struct net *net)
{
        if (proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
                            sizeof(struct seq_net_private)))
                return 0;

        return -ENOMEM;
}

/* Per-netns exit: remove the "protocols" entry created by proto_init_net(). */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}


/* Per-network-namespace hooks that create/remove the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};

/*
 * Boot-time hook (registered with subsys_initcall below): register
 * proto_net_ops as a pernet subsystem and return the registration result.
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Busy-poll termination test for the socket passed as @p.
 *
 * Returns true as soon as the socket's receive queue is non-empty, or, for
 * UDP sockets, its reader_queue is non-empty.  Otherwise the answer is
 * whatever sk_busy_loop_timeout() reports for @start_time.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
        struct sock *sk = p;

        if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
            (sk_is_udp(sk) &&
             !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)))
                return true;

        return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

/*
 * Forward to the protocol's bind_add handler.  Returns -EOPNOTSUPP when the
 * protocol does not provide one, otherwise the handler's return value.
 */
int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
{
        if (sk->sk_prot->bind_add)
                return sk->sk_prot->bind_add(sk, addr, addr_len);

        return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy 'size' bytes from userspace into @karg, run the protocol ioctl on
 * that kernel copy and, if it succeeds, copy 'size' bytes back to
 * userspace.  Returns -EFAULT when either copy fails, the ioctl's own
 * return value when that is non-zero, and 0 otherwise.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
                     void __user *arg, void *karg, size_t size)
{
        int err;

        if (copy_from_user(karg, arg, size))
                return -EFAULT;

        err = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
        if (err)
                return err;

        return copy_to_user(arg, karg, size) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* Output-only ioctl helper: nothing is read from userspace.  The protocol
 * ioctl fills a zero-initialised int in kernel memory, and that int is
 * written back to @arg only when the ioctl returned 0.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
        int karg = 0;
        int ret;

        ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
        if (!ret)
                ret = put_user(karg, (int __user *)arg);

        return ret;
}

/* Front end for sock ioctls.  It exists so that protocol ioctl callbacks
 * are handed kernel memory rather than userspace pointers: data is copied
 * in from userspace where the protocol/ioctl needs it and the result is
 * copied back out.
 *
 * Raw IPv4, raw IPv6 and phonet sockets get first refusal through their
 * dedicated helpers; a result <= 0 from those is final, a positive result
 * (or no matching helper) falls through to sock_ioctl_out().
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
        bool is_raw = sk->sk_type == SOCK_RAW;
        int rc = 1;

        if (is_raw && sk->sk_family == AF_INET)
                rc = ipmr_sk_ioctl(sk, cmd, arg);
        else if (is_raw && sk->sk_family == AF_INET6)
                rc = ip6mr_sk_ioctl(sk, cmd, arg);
        else if (sk_is_phonet(sk))
                rc = phonet_sk_ioctl(sk, cmd, arg);

        /* Not handled above: use the default handler */
        if (rc > 0)
                return sock_ioctl_out(sk, cmd, arg);

        /* The ioctl was processed, hand back its value */
        return rc;
}
EXPORT_SYMBOL(sk_ioctl);

/*
 * Init-time layout check for struct sock: assert, one field at a time, that
 * each listed member sits in the cacheline group it is expected to be in
 * (sock_write_rx, sock_read_rx, sock_read_rxtx, sock_write_rxtx,
 * sock_write_tx, sock_read_tx).  Always returns 0; the checking itself is
 * done by CACHELINE_ASSERT_GROUP_MEMBER().
 */
static int __init sock_struct_check(void)
{
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

        /* sk_omem_alloc is asserted once; a repeated assertion adds nothing. */
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
        CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
        return 0;
}

core_initcall(sock_struct_check);



























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   32 



   33 
   32 





























































   33 



























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
// SPDX-License-Identifier: GPL-2.0-or-later
/* auditfilter.c -- filtering of audit events
 *
 * Copyright 2003-2004 Red Hat, Inc.
 * Copyright 2005 Hewlett-Packard Development Company, L.P.
 * Copyright 2005 IBM Corporation
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/audit.h>
#include <linux/kthread.h>
#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/netlink.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include "audit.h"

/*
 * Locking model:
 *
 * audit_filter_mutex:
 *                Synchronizes writes and blocking reads of audit's filterlist
 *                data.  Rcu is used to traverse the filterlist and access
 *                contents of structs audit_entry, audit_watch and opaque
 *                LSM rules during filtering.  If modified, these structures
 *                must be copied and replace their counterparts in the filterlist.
 *                An audit_parent struct is not accessed during filtering, so may
 *                be written directly provided audit_filter_mutex is held.
 */

/* Audit filter lists, defined in <linux/audit.h>.
 * One list head per filter, each initialised to an empty list.  The #if
 * below breaks the build if AUDIT_NR_FILTERS stops being 8, as a reminder
 * to extend this initialiser.
 */
struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
        LIST_HEAD_INIT(audit_filter_list[0]),
        LIST_HEAD_INIT(audit_filter_list[1]),
        LIST_HEAD_INIT(audit_filter_list[2]),
        LIST_HEAD_INIT(audit_filter_list[3]),
        LIST_HEAD_INIT(audit_filter_list[4]),
        LIST_HEAD_INIT(audit_filter_list[5]),
        LIST_HEAD_INIT(audit_filter_list[6]),
        LIST_HEAD_INIT(audit_filter_list[7]),
#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
/* File-local companion array: one empty list head per filter, eight
 * explicit initialisers to go with AUDIT_NR_FILTERS == 8 (the value the
 * build check in audit_filter_list's initialiser insists on).
 */
static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
        LIST_HEAD_INIT(audit_rules_list[0]),
        LIST_HEAD_INIT(audit_rules_list[1]),
        LIST_HEAD_INIT(audit_rules_list[2]),
        LIST_HEAD_INIT(audit_rules_list[3]),
        LIST_HEAD_INIT(audit_rules_list[4]),
        LIST_HEAD_INIT(audit_rules_list[5]),
        LIST_HEAD_INIT(audit_rules_list[6]),
        LIST_HEAD_INIT(audit_rules_list[7]),
};

DEFINE_MUTEX(audit_filter_mutex);

/*
 * Release the LSM data attached to one rule field.  For the subject/object
 * LSM field types listed in the switch, free the lsm_str string and hand
 * lsm_rule to security_audit_rule_free(); fields of any other type are
 * left untouched.
 */
static void audit_free_lsm_field(struct audit_field *f)
{
        switch (f->type) {
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
                kfree(f->lsm_str);
                security_audit_rule_free(f->lsm_rule);
        }
}

/*
 * Free an audit_entry and everything hanging off its rule: the watch
 * reference (if any), per-field LSM data, the fields array, the filter key
 * and finally the entry itself.
 */
static inline void audit_free_rule(struct audit_entry *e)
{
        struct audit_krule *krule = &e->rule;
        int idx = 0;

        /* a rule may have no watch attached */
        if (krule->watch)
                audit_put_watch(krule->watch);

        if (krule->fields) {
                while (idx < krule->field_count) {
                        audit_free_lsm_field(&krule->fields[idx]);
                        idx++;
                }
        }

        kfree(krule->fields);
        kfree(krule->filterkey);
        kfree(e);
}

/*
 * Free the audit_entry that embeds @head as its rcu member.  The signature
 * (a lone struct rcu_head *) suggests use as an RCU callback -- the call
 * sites are not visible here.
 */
void audit_free_rule_rcu(struct rcu_head *head)
{
        struct audit_entry *e = container_of(head, struct audit_entry, rcu);
        audit_free_rule(e);
}

/* Allocate a zeroed audit filterlist entry together with a zeroed array of
 * @field_count fields.  Returns NULL if either allocation fails (nothing is
 * leaked in that case).
 */
static inline struct audit_entry *audit_init_entry(u32 field_count)
{
        struct audit_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);

        if (unlikely(!entry))
                return NULL;

        entry->rule.fields = kcalloc(field_count, sizeof(*entry->rule.fields),
                                     GFP_KERNEL);
        if (unlikely(!entry->rule.fields)) {
                kfree(entry);
                return NULL;
        }

        return entry;
}

/* Pull a @len-byte string out of the rule buffer at *@bufp into a freshly
 * allocated, NUL-terminated copy, then advance *@bufp and shrink *@remain
 * by @len.
 *
 * Returns the copy, or ERR_PTR(-EINVAL) for a NULL buffer, a zero length or
 * a length exceeding *@remain, ERR_PTR(-ENAMETOOLONG) for a length above
 * PATH_MAX, ERR_PTR(-ENOMEM) on allocation failure.  On error neither
 * *@bufp nor *@remain is modified.
 */
char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
{
        char *copy;

        if (!*bufp || len == 0 || len > *remain)
                return ERR_PTR(-EINVAL);

        /* PATH_MAX is the longest valid length among the string fields
         * implemented so far.
         */
        if (len > PATH_MAX)
                return ERR_PTR(-ENAMETOOLONG);

        copy = kmalloc(len + 1, GFP_KERNEL);
        if (unlikely(!copy))
                return ERR_PTR(-ENOMEM);

        memcpy(copy, *bufp, len);
        copy[len] = '\0';

        *remain -= len;
        *bufp += len;

        return copy;
}

/* Record @f as the rule's inode field.
 *
 * Accepted only on the EXIT and URING_EXIT lists, only when the rule does
 * not already carry an inode field, watch or tree, and only with the
 * equal / not-equal operators.  Returns 0 on success, -EINVAL otherwise.
 */
static inline int audit_to_inode(struct audit_krule *krule,
                                 struct audit_field *f)
{
        if (krule->listnr != AUDIT_FILTER_EXIT &&
            krule->listnr != AUDIT_FILTER_URING_EXIT)
                return -EINVAL;

        if (krule->inode_f || krule->watch || krule->tree)
                return -EINVAL;

        if (f->op != Audit_equal && f->op != Audit_not_equal)
                return -EINVAL;

        krule->inode_f = f;
        return 0;
}

/* Per-class syscall bitmasks, indexed by class number.  A slot stays NULL
 * until audit_register_class() installs a bitmask for that class.
 */
static __u32 *classes[AUDIT_SYSCALL_CLASSES];

/*
 * Register the syscall bitmask for one syscall class.
 *
 * @class: class number; must lie in [0, AUDIT_SYSCALL_CLASSES) and must not
 *         have been registered already
 * @list:  syscall numbers belonging to the class, terminated by ~0U
 *
 * Each syscall number must be below
 * AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES; the top
 * AUDIT_SYSCALL_CLASSES bits of a rule mask are used as class selectors
 * (see audit_to_entry_common()).
 *
 * Returns 0 on success, -ENOMEM if the bitmask cannot be allocated, or
 * -EINVAL for an out-of-range syscall number, an out-of-range class or a
 * class that is already registered.
 */
int __init audit_register_class(int class, unsigned *list)
{
        __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
        if (!p)
                return -ENOMEM;
        while (*list != ~0U) {
                unsigned n = *list++;
                if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
                        kfree(p);
                        return -EINVAL;
                }
                p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
        }
        /* class is signed: reject negatives too, they would index before
         * the start of classes[].
         */
        if (class < 0 || class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
                kfree(p);
                return -EINVAL;
        }
        classes[class] = p;
        return 0;
}

/*
 * Test whether @syscall belongs to syscall class @class.
 *
 * Returns non-zero (the matching bit of the class bitmask) when it does,
 * and 0 when it does not, when @syscall is beyond the bitmask, or when
 * @class is out of range or has no registered bitmask.
 */
int audit_match_class(int class, unsigned syscall)
{
        if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32))
                return 0;
        /* class is signed: a negative value must not be used as an index. */
        if (unlikely(class < 0 || class >= AUDIT_SYSCALL_CLASSES ||
                     !classes[class]))
                return 0;
        return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
}

#ifdef CONFIG_AUDITSYSCALL
/*
 * Returns 0 if @mask shares at least one set bit with the bitmask
 * registered for @class, and 1 if there is no overlap or no bitmask is
 * registered for @class.
 */
static inline int audit_match_class_bits(int class, u32 *mask)
{
        __u32 *class_mask = classes[class];
        int word;

        if (!class_mask)
                return 1;

        for (word = 0; word < AUDIT_BITMASK_SIZE; word++) {
                if (mask[word] & class_mask[word])
                        return 0;
        }

        return 1;
}

static int audit_match_signal(struct audit_entry *entry)
{
        struct audit_field *arch = entry->rule.arch_f;

        if (!arch) {
                /* When arch is unspecified, we must check both masks on biarch
                 * as syscall number alone is ambiguous. */
                return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
                                               entry->rule.mask) &&
                        audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
                                               entry->rule.mask));
        }

        switch (audit_classify_arch(arch->val)) {
        case 0: /* native */
                return (audit_match_class_bits(AUDIT_CLASS_SIGNAL,
                                               entry->rule.mask));
        case 1: /* 32bit on biarch */
                return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32,
                                               entry->rule.mask));
        default:
                return 1;
        }
}
#endif

/* Common user-space to kernel rule translation.
 *
 * Validates the list number, action and field count of @rule, allocates an
 * audit_entry with room for rule->field_count fields, and copies flags,
 * list number, action, field count and the syscall mask into it.  Class
 * selector bits in the mask are then expanded into the syscalls of the
 * corresponding registered class.
 *
 * Returns the new entry, or ERR_PTR(-EINVAL) when validation fails (unknown
 * or deprecated list, deprecated or unknown action, too many fields), or
 * ERR_PTR(-ENOMEM) when allocation fails.
 */
static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *rule)
{
        unsigned listnr;
        struct audit_entry *entry;
        int i, err;

        err = -EINVAL;
        /* The list number is the flags word with the PREPEND bit removed. */
        listnr = rule->flags & ~AUDIT_FILTER_PREPEND;
        switch (listnr) {
        default:
                goto exit_err;
#ifdef CONFIG_AUDITSYSCALL
        case AUDIT_FILTER_ENTRY:
                pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
                goto exit_err;
        case AUDIT_FILTER_EXIT:
        case AUDIT_FILTER_URING_EXIT:
        case AUDIT_FILTER_TASK:
#endif
        case AUDIT_FILTER_USER:
        case AUDIT_FILTER_EXCLUDE:
        case AUDIT_FILTER_FS:
                ;
        }
        if (unlikely(rule->action == AUDIT_POSSIBLE)) {
                pr_err("AUDIT_POSSIBLE is deprecated\n");
                goto exit_err;
        }
        if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
                goto exit_err;
        if (rule->field_count > AUDIT_MAX_FIELDS)
                goto exit_err;

        err = -ENOMEM;
        entry = audit_init_entry(rule->field_count);
        if (!entry)
                goto exit_err;

        entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND;
        entry->rule.listnr = listnr;
        entry->rule.action = rule->action;
        entry->rule.field_count = rule->field_count;

        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                entry->rule.mask[i] = rule->mask[i];

        /*
         * Class i is selected by bit (AUDIT_BITMASK_SIZE * 32 - i - 1), i.e.
         * the selectors occupy the top of the mask.  Each selector that is
         * set is cleared and, if the class is registered, replaced by the
         * class's own syscall bits.
         */
        for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
                int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
                __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
                __u32 *class;

                if (!(*p & AUDIT_BIT(bit)))
                        continue;
                *p &= ~AUDIT_BIT(bit);
                class = classes[i];
                if (class) {
                        int j;
                        for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
                                entry->rule.mask[j] |= class[j];
                }
        }

        return entry;

exit_err:
        return ERR_PTR(err);
}

/* Maps each kernel-internal operator index (Audit_equal ... Audit_ge) to
 * the corresponding AUDIT_* operator value; audit_to_op() searches it in
 * the reverse direction.
 */
static u32 audit_ops[] =
{
        [Audit_equal] = AUDIT_EQUAL,
        [Audit_not_equal] = AUDIT_NOT_EQUAL,
        [Audit_bitmask] = AUDIT_BIT_MASK,
        [Audit_bittest] = AUDIT_BIT_TEST,
        [Audit_lt] = AUDIT_LESS_THAN,
        [Audit_gt] = AUDIT_GREATER_THAN,
        [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL,
        [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL,
};

/*
 * Translate an AUDIT_* operator value into its internal operator index by
 * scanning audit_ops[] from Audit_equal upwards.  Returns Audit_bad when
 * @op matches no entry.
 */
static u32 audit_to_op(u32 op)
{
        u32 idx = Audit_equal;

        while (idx < Audit_bad && audit_ops[idx] != op)
                idx++;

        return idx;
}

/* Check whether audit field @f is valid for the rule held in @entry.
 *
 * The checks run in four stages: (1) field types that are tied to specific
 * filter lists, (2) the restricted set of field types allowed on the FS
 * list, (3) which operators each field type accepts, and (4) value ranges
 * for a few field types.
 *
 * Returns 0 if the field passes every stage, -EINVAL otherwise.
 */
static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
        /* Stage 1: field types restricted to (or barred from) certain lists */
        switch (f->type) {
        case AUDIT_MSGTYPE:
                if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE &&
                    entry->rule.listnr != AUDIT_FILTER_USER)
                        return -EINVAL;
                break;
        case AUDIT_FSTYPE:
                if (entry->rule.listnr != AUDIT_FILTER_FS)
                        return -EINVAL;
                break;
        case AUDIT_PERM:
                if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
                        return -EINVAL;
                break;
        }

        /* Stage 2: the FS list only takes FSTYPE and FILTERKEY fields */
        switch (entry->rule.listnr) {
        case AUDIT_FILTER_FS:
                switch (f->type) {
                case AUDIT_FSTYPE:
                case AUDIT_FILTERKEY:
                        break;
                default:
                        return -EINVAL;
                }
        }

        /* Check for valid field type and op */
        switch (f->type) {
        case AUDIT_ARG0:
        case AUDIT_ARG1:
        case AUDIT_ARG2:
        case AUDIT_ARG3:
        case AUDIT_PERS: /* <uapi/linux/personality.h> */
        case AUDIT_DEVMINOR:
                /* all ops are valid */
                break;
        case AUDIT_UID:
        case AUDIT_EUID:
        case AUDIT_SUID:
        case AUDIT_FSUID:
        case AUDIT_LOGINUID:
        case AUDIT_OBJ_UID:
        case AUDIT_GID:
        case AUDIT_EGID:
        case AUDIT_SGID:
        case AUDIT_FSGID:
        case AUDIT_OBJ_GID:
        case AUDIT_PID:
        case AUDIT_MSGTYPE:
        case AUDIT_PPID:
        case AUDIT_DEVMAJOR:
        case AUDIT_EXIT:
        case AUDIT_SUCCESS:
        case AUDIT_INODE:
        case AUDIT_SESSIONID:
        case AUDIT_SUBJ_SEN:
        case AUDIT_SUBJ_CLR:
        case AUDIT_OBJ_LEV_LOW:
        case AUDIT_OBJ_LEV_HIGH:
        case AUDIT_SADDR_FAM:
                /* bit ops are only useful on syscall args */
                if (f->op == Audit_bitmask || f->op == Audit_bittest)
                        return -EINVAL;
                break;
        case AUDIT_SUBJ_USER:
        case AUDIT_SUBJ_ROLE:
        case AUDIT_SUBJ_TYPE:
        case AUDIT_OBJ_USER:
        case AUDIT_OBJ_ROLE:
        case AUDIT_OBJ_TYPE:
        case AUDIT_WATCH:
        case AUDIT_DIR:
        case AUDIT_FILTERKEY:
        case AUDIT_LOGINUID_SET:
        case AUDIT_ARCH:
        case AUDIT_FSTYPE:
        case AUDIT_PERM:
        case AUDIT_FILETYPE:
        case AUDIT_FIELD_COMPARE:
        case AUDIT_EXE:
                /* only equal and not equal valid ops */
                if (f->op != Audit_not_equal && f->op != Audit_equal)
                        return -EINVAL;
                break;
        default:
                /* field not recognized */
                return -EINVAL;
        }

        /* Check for select valid field values */
        switch (f->type) {
        case AUDIT_LOGINUID_SET:
                /* boolean: only 0 or 1 */
                if ((f->val != 0) && (f->val != 1))
                        return -EINVAL;
                break;
        case AUDIT_PERM:
                /* only the low four bits may be set */
                if (f->val & ~15)
                        return -EINVAL;
                break;
        case AUDIT_FILETYPE:
                /* only bits within S_IFMT may be set */
                if (f->val & ~S_IFMT)
                        return -EINVAL;
                break;
        case AUDIT_FIELD_COMPARE:
                if (f->val > AUDIT_MAX_FIELD_COMPARE)
                        return -EINVAL;
                break;
        case AUDIT_SADDR_FAM:
                if (f->val >= AF_MAX)
                        return -EINVAL;
                break;
        default:
                break;
        }

        return 0;
}

/*
 * Translate struct audit_rule_data to kernel's rule representation.
 *
 * @data:   rule as supplied by the caller; per-field type, operator and value
 *          are read from data->fields[], data->fieldflags[] and
 *          data->values[].  For string-valued fields data->values[i] is the
 *          string length and the bytes are pulled from data->buf through
 *          audit_unpack_string().
 * @datasz: total size of @data in bytes; everything beyond the fixed
 *          struct audit_rule_data is treated as string payload.
 *
 * Returns the new entry on success.  On failure an ERR_PTR() is returned:
 * either the one produced by audit_to_entry_common(), or the error of the
 * first field that failed to validate or convert (in which case the
 * partially built entry is released before returning).
 */
static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
                                               size_t datasz)
{
        int err = 0;
        struct audit_entry *entry;
        void *bufp;
        /* NOTE(review): assumes datasz >= sizeof(struct audit_rule_data);
         * the unsigned subtraction would wrap otherwise - confirm that the
         * caller checks this. */
        size_t remain = datasz - sizeof(struct audit_rule_data);
        int i;
        char *str;
        struct audit_fsnotify_mark *audit_mark;

        entry = audit_to_entry_common(data);
        if (IS_ERR(entry))
                goto exit_nofree;

        bufp = data->buf;
        for (i = 0; i < data->field_count; i++) {
                struct audit_field *f = &entry->rule.fields[i];
                u32 f_val;

                /* default error for the bare "goto exit_free" below */
                err = -EINVAL;

                f->op = audit_to_op(data->fieldflags[i]);
                if (f->op == Audit_bad)
                        goto exit_free;

                f->type = data->fields[i];
                f_val = data->values[i];

                /* Support legacy tests for a valid loginuid */
                if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
                        f->type = AUDIT_LOGINUID_SET;
                        f_val = 0;
                        entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
                }

                err = audit_field_valid(entry, f);
                if (err)
                        goto exit_free;

                /* err is 0 here; re-arm the default for the conversions
                 * below that bail out without setting it themselves */
                err = -EINVAL;
                switch (f->type) {
                case AUDIT_LOGINUID:
                case AUDIT_UID:
                case AUDIT_EUID:
                case AUDIT_SUID:
                case AUDIT_FSUID:
                case AUDIT_OBJ_UID:
                        /* the raw value is interpreted in the caller's
                         * user namespace */
                        f->uid = make_kuid(current_user_ns(), f_val);
                        if (!uid_valid(f->uid))
                                goto exit_free;
                        break;
                case AUDIT_GID:
                case AUDIT_EGID:
                case AUDIT_SGID:
                case AUDIT_FSGID:
                case AUDIT_OBJ_GID:
                        f->gid = make_kgid(current_user_ns(), f_val);
                        if (!gid_valid(f->gid))
                                goto exit_free;
                        break;
                case AUDIT_ARCH:
                        f->val = f_val;
                        /* remember the arch field for quick access */
                        entry->rule.arch_f = f;
                        break;
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        /* the field keeps the string from here on */
                        f->lsm_str = str;
                        err = security_audit_rule_init(f->type, f->op, str,
                                                       (void **)&f->lsm_rule,
                                                       GFP_KERNEL);
                        /* Keep currently invalid fields around in case they
                         * become valid after a policy reload. */
                        if (err == -EINVAL) {
                                pr_warn("audit rule for LSM \'%s\' is invalid\n",
                                        str);
                                err = 0;
                        } else if (err)
                                goto exit_free;
                        break;
                case AUDIT_WATCH:
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        err = audit_to_watch(&entry->rule, str, f_val, f->op);
                        if (err) {
                                /* str is only freed here on failure;
                                 * presumably audit_to_watch() keeps it on
                                 * success - TODO confirm */
                                kfree(str);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        break;
                case AUDIT_DIR:
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        err = audit_make_tree(&entry->rule, str, f->op);
                        /* the unpacked path is released whatever the result */
                        kfree(str);
                        if (err)
                                goto exit_free;
                        entry->rule.buflen += f_val;
                        break;
                case AUDIT_INODE:
                        f->val = f_val;
                        err = audit_to_inode(&entry->rule, f);
                        if (err)
                                goto exit_free;
                        break;
                case AUDIT_FILTERKEY:
                        /* at most one key per rule, bounded in length */
                        if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
                                goto exit_free;
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        entry->rule.filterkey = str;
                        break;
                case AUDIT_EXE:
                        /* at most one exe path per rule, bounded by PATH_MAX */
                        if (entry->rule.exe || f_val > PATH_MAX)
                                goto exit_free;
                        str = audit_unpack_string(&bufp, &remain, f_val);
                        if (IS_ERR(str)) {
                                err = PTR_ERR(str);
                                goto exit_free;
                        }
                        audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
                        if (IS_ERR(audit_mark)) {
                                kfree(str);
                                err = PTR_ERR(audit_mark);
                                goto exit_free;
                        }
                        entry->rule.buflen += f_val;
                        entry->rule.exe = audit_mark;
                        break;
                default:
                        /* plain numeric field: store the value as is */
                        f->val = f_val;
                        break;
                }
        }

        /* A not-equal inode test is not kept as the rule's inode shortcut;
         * presumably because inode_f is what audit_find_rule() uses to pick
         * a hash bucket, which only makes sense for an equality test. */
        if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
                entry->rule.inode_f = NULL;

exit_nofree:
        /* success, or the ERR_PTR() handed back by audit_to_entry_common() */
        return entry;

exit_free:
        if (entry->rule.tree)
                audit_put_tree(entry->rule.tree); /* that's the temporary one */
        if (entry->rule.exe)
                audit_remove_mark(entry->rule.exe); /* that's the template one */
        audit_free_rule(entry);
        return ERR_PTR(err);
}

/* Append a filter field's string (without its NUL terminator) at *bufp and
 * step *bufp past the copied bytes.  Returns the number of bytes written. */
static inline size_t audit_pack_string(void **bufp, const char *str)
{
        void *dst = *bufp;
        const size_t nbytes = strlen(str);

        memcpy(dst, str, nbytes);
        *bufp = dst + nbytes;

        return nbytes;
}

/* Translate kernel rule representation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
        struct audit_rule_data *data;
        void *bufp;
        int i;

        data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
        if (unlikely(!data))
                return NULL;
        memset(data, 0, sizeof(*data));

        data->flags = krule->flags | krule->listnr;
        data->action = krule->action;
        data->field_count = krule->field_count;
        bufp = data->buf;
        for (i = 0; i < data->field_count; i++) {
                struct audit_field *f = &krule->fields[i];

                data->fields[i] = f->type;
                data->fieldflags[i] = audit_ops[f->op];
                switch (f->type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        data->buflen += data->values[i] =
                                audit_pack_string(&bufp, f->lsm_str);
                        break;
                case AUDIT_WATCH:
                        data->buflen += data->values[i] =
                                audit_pack_string(&bufp,
                                                  audit_watch_path(krule->watch));
                        break;
                case AUDIT_DIR:
                        data->buflen += data->values[i] =
                                audit_pack_string(&bufp,
                                                  audit_tree_path(krule->tree));
                        break;
                case AUDIT_FILTERKEY:
                        data->buflen += data->values[i] =
                                audit_pack_string(&bufp, krule->filterkey);
                        break;
                case AUDIT_EXE:
                        data->buflen += data->values[i] =
                                audit_pack_string(&bufp, audit_mark_path(krule->exe));
                        break;
                case AUDIT_LOGINUID_SET:
                        if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
                                data->fields[i] = AUDIT_LOGINUID;
                                data->values[i] = AUDIT_UID_UNSET;
                                break;
                        }
                        fallthrough;        /* if set */
                default:
                        data->values[i] = f->val;
                }
        }
        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                data->mask[i] = krule->mask[i];

        return data;
}

/* Compare two rules in kernel format.  Considered success if rules
 * don't match. */
static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
{
        int i;

        if (a->flags != b->flags ||
            a->pflags != b->pflags ||
            a->listnr != b->listnr ||
            a->action != b->action ||
            a->field_count != b->field_count)
                return 1;

        for (i = 0; i < a->field_count; i++) {
                if (a->fields[i].type != b->fields[i].type ||
                    a->fields[i].op != b->fields[i].op)
                        return 1;

                switch (a->fields[i].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str))
                                return 1;
                        break;
                case AUDIT_WATCH:
                        if (strcmp(audit_watch_path(a->watch),
                                   audit_watch_path(b->watch)))
                                return 1;
                        break;
                case AUDIT_DIR:
                        if (strcmp(audit_tree_path(a->tree),
                                   audit_tree_path(b->tree)))
                                return 1;
                        break;
                case AUDIT_FILTERKEY:
                        /* both filterkeys exist based on above type compare */
                        if (strcmp(a->filterkey, b->filterkey))
                                return 1;
                        break;
                case AUDIT_EXE:
                        /* both paths exist based on above type compare */
                        if (strcmp(audit_mark_path(a->exe),
                                   audit_mark_path(b->exe)))
                                return 1;
                        break;
                case AUDIT_UID:
                case AUDIT_EUID:
                case AUDIT_SUID:
                case AUDIT_FSUID:
                case AUDIT_LOGINUID:
                case AUDIT_OBJ_UID:
                        if (!uid_eq(a->fields[i].uid, b->fields[i].uid))
                                return 1;
                        break;
                case AUDIT_GID:
                case AUDIT_EGID:
                case AUDIT_SGID:
                case AUDIT_FSGID:
                case AUDIT_OBJ_GID:
                        if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
                                return 1;
                        break;
                default:
                        if (a->fields[i].val != b->fields[i].val)
                                return 1;
                }
        }

        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                if (a->mask[i] != b->mask[i])
                        return 1;

        return 0;
}

/* Duplicate LSM field information.  The lsm_rule is opaque, so must be
 * re-initialized. */
static inline int audit_dupe_lsm_field(struct audit_field *df,
                                           struct audit_field *sf)
{
        int ret;
        char *lsm_str;

        /* our own copy of lsm_str */
        lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
        if (unlikely(!lsm_str))
                return -ENOMEM;
        df->lsm_str = lsm_str;

        /* our own (refreshed) copy of lsm_rule */
        ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
                                       (void **)&df->lsm_rule, GFP_KERNEL);
        /* Keep currently invalid fields around in case they
         * become valid after a policy reload. */
        if (ret == -EINVAL) {
                pr_warn("audit rule for LSM \'%s\' is invalid\n",
                        df->lsm_str);
                ret = 0;
        }

        return ret;
}

/* Duplicate an audit rule.  This will be a deep copy with the exception
 * of the watch - that pointer is carried over.  The LSM specific fields
 * will be updated in the copy.  The point is to be able to replace the old
 * rule with the new rule in the filterlist, then free the old rule.
 * The rlist element is undefined; list manipulations are handled apart from
 * the initial copy. */
struct audit_entry *audit_dupe_rule(struct audit_krule *old)
{
        u32 fcount = old->field_count;
        struct audit_entry *entry;
        struct audit_krule *new;
        char *fk;
        int i, err = 0;

        entry = audit_init_entry(fcount);
        if (unlikely(!entry))
                return ERR_PTR(-ENOMEM);

        new = &entry->rule;
        new->flags = old->flags;
        new->pflags = old->pflags;
        new->listnr = old->listnr;
        new->action = old->action;
        for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
                new->mask[i] = old->mask[i];
        new->prio = old->prio;
        new->buflen = old->buflen;
        new->inode_f = old->inode_f;
        new->field_count = old->field_count;

        /*
         * note that we are OK with not refcounting here; audit_match_tree()
         * never dereferences tree and we can't get false positives there
         * since we'd have to have rule gone from the list *and* removed
         * before the chunks found by lookup had been allocated, i.e. before
         * the beginning of list scan.
         */
        new->tree = old->tree;
        memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);

        /* deep copy this information, updating the lsm_rule fields, because
         * the originals will all be freed when the old rule is freed. */
        for (i = 0; i < fcount; i++) {
                switch (new->fields[i].type) {
                case AUDIT_SUBJ_USER:
                case AUDIT_SUBJ_ROLE:
                case AUDIT_SUBJ_TYPE:
                case AUDIT_SUBJ_SEN:
                case AUDIT_SUBJ_CLR:
                case AUDIT_OBJ_USER:
                case AUDIT_OBJ_ROLE:
                case AUDIT_OBJ_TYPE:
                case AUDIT_OBJ_LEV_LOW:
                case AUDIT_OBJ_LEV_HIGH:
                        err = audit_dupe_lsm_field(&new->fields[i],
                                                       &old->fields[i]);
                        break;
                case AUDIT_FILTERKEY:
                        fk = kstrdup(old->filterkey, GFP_KERNEL);
                        if (unlikely(!fk))
                                err = -ENOMEM;
                        else
                                new->filterkey = fk;
                        break;
                case AUDIT_EXE:
                        err = audit_dupe_exe(new, old);
                        break;
                }
                if (err) {
                        if (new->exe)
                                audit_remove_mark(new->exe);
                        audit_free_rule(entry);
                        return ERR_PTR(err);
                }
        }

        if (old->watch) {
                audit_get_watch(old->watch);
                new->watch = old->watch;
        }

        return entry;
}

/* Find an existing audit rule.
 * Caller must hold audit_filter_mutex to prevent stale rule data. */
static struct audit_entry *audit_find_rule(struct audit_entry *entry,
                                           struct list_head **p)
{
        struct audit_entry *e, *found = NULL;
        struct list_head *list;
        int h;

        if (entry->rule.inode_f) {
                h = audit_hash_ino(entry->rule.inode_f->val);
                *p = list = &audit_inode_hash[h];
        } else if (entry->rule.watch) {
                /* we don't know the inode number, so must walk entire hash */
                for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
                        list = &audit_inode_hash[h];
                        list_for_each_entry(e, list, list)
                                if (!audit_compare_rule(&entry->rule, &e->rule)) {
                                        found = e;
                                        goto out;
                                }
                }
                goto out;
        } else {
                *p = list = &audit_filter_list[entry->rule.listnr];
        }

        list_for_each_entry(e, list, list)
                if (!audit_compare_rule(&entry->rule, &e->rule)) {
                        found = e;
                        goto out;
                }

out:
        return found;
}

/* Running priority counters used by audit_add_rule() for rules on the
 * AUDIT_FILTER_EXIT and AUDIT_FILTER_URING_EXIT lists: a prepended rule
 * takes ++prio_high, any other rule takes --prio_low. */
static u64 prio_low = ~0ULL/2;
static u64 prio_high = ~0ULL/2 - 1;

/*
 * Add rule to given filterlist if not a duplicate.
 *
 * Takes audit_filter_mutex for the whole operation.  Returns 0 on success,
 * -EEXIST if an identical rule is already present, or the error returned by
 * audit_add_watch() / audit_add_tree_rule().  On the -EEXIST and watch
 * failure paths the rule's tree reference (if any) is dropped here; the
 * entry itself is not freed by this function.
 */
static inline int audit_add_rule(struct audit_entry *entry)
{
        struct audit_entry *e;
        struct audit_watch *watch = entry->rule.watch;
        struct audit_tree *tree = entry->rule.tree;
        struct list_head *list;
        int err = 0;
#ifdef CONFIG_AUDITSYSCALL
        int dont_count = 0;

        /* If any of these, don't count towards total */
        switch (entry->rule.listnr) {
        case AUDIT_FILTER_USER:
        case AUDIT_FILTER_EXCLUDE:
        case AUDIT_FILTER_FS:
                dont_count = 1;
        }
#endif

        mutex_lock(&audit_filter_mutex);
        /* NOTE(review): for a watch rule without an inode field
         * audit_find_rule() does not write 'list'; it is presumably filled
         * in by audit_add_watch() below - confirm. */
        e = audit_find_rule(entry, &list);
        if (e) {
                mutex_unlock(&audit_filter_mutex);
                err = -EEXIST;
                /* normally audit_add_tree_rule() will free it on failure */
                if (tree)
                        audit_put_tree(tree);
                return err;
        }

        if (watch) {
                /* audit_filter_mutex is dropped and re-taken during this call */
                err = audit_add_watch(&entry->rule, &list);
                if (err) {
                        mutex_unlock(&audit_filter_mutex);
                        /*
                         * normally audit_add_tree_rule() will free it
                         * on failure
                         */
                        if (tree)
                                audit_put_tree(tree);
                        return err;
                }
        }
        if (tree) {
                err = audit_add_tree_rule(&entry->rule);
                if (err) {
                        mutex_unlock(&audit_filter_mutex);
                        return err;
                }
        }

        /* default priority; only the exit lists get an ordered priority
         * taken from the prio_high / prio_low counters */
        entry->rule.prio = ~0ULL;
        if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
            entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
                if (entry->rule.flags & AUDIT_FILTER_PREPEND)
                        entry->rule.prio = ++prio_high;
                else
                        entry->rule.prio = --prio_low;
        }

        /* The rule goes on two lists: audit_rules_list[] (plain list) and
         * the RCU-updated filter list selected above.  The PREPEND flag only
         * steers the insertion and is cleared once it has been honoured. */
        if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
                list_add(&entry->rule.list,
                         &audit_rules_list[entry->rule.listnr]);
                list_add_rcu(&entry->list, list);
                entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
        } else {
                list_add_tail(&entry->rule.list,
                              &audit_rules_list[entry->rule.listnr]);
                list_add_tail_rcu(&entry->list, list);
        }
#ifdef CONFIG_AUDITSYSCALL
        if (!dont_count)
                audit_n_rules++;

        /* audit_signals is bumped when audit_match_signal() returns 0 */
        if (!audit_match_signal(entry))
                audit_signals++;
#endif
        mutex_unlock(&audit_filter_mutex);

        return err;
}

/*
 * Remove an existing rule from filterlist.
 *
 * @entry is only a template used to look the installed rule up; the rule
 * that is unlinked and freed (after an RCU grace period, via call_rcu()) is
 * the matching one found on the lists.  Returns 0 on success or -ENOENT if
 * no installed rule matches.  The tree reference held by @entry, if any, is
 * dropped in either case.
 */
int audit_del_rule(struct audit_entry *entry)
{
        struct audit_entry  *e;
        struct audit_tree *tree = entry->rule.tree;
        struct list_head *list;
        int ret = 0;
#ifdef CONFIG_AUDITSYSCALL
        int dont_count = 0;

        /* If any of these, don't count towards total */
        switch (entry->rule.listnr) {
        case AUDIT_FILTER_USER:
        case AUDIT_FILTER_EXCLUDE:
        case AUDIT_FILTER_FS:
                dont_count = 1;
        }
#endif

        mutex_lock(&audit_filter_mutex);
        e = audit_find_rule(entry, &list);
        if (!e) {
                ret = -ENOENT;
                goto out;
        }

        /* detach whatever auxiliary objects the installed rule carries */
        if (e->rule.watch)
                audit_remove_watch_rule(&e->rule);

        if (e->rule.tree)
                audit_remove_tree_rule(&e->rule);

        if (e->rule.exe)
                audit_remove_mark_rule(&e->rule);

#ifdef CONFIG_AUDITSYSCALL
        if (!dont_count)
                audit_n_rules--;

        /* NOTE(review): this is evaluated on the lookup template 'entry',
         * not on the installed rule 'e'; it mirrors the check done in
         * audit_add_rule() - confirm this is intended. */
        if (!audit_match_signal(entry))
                audit_signals--;
#endif

        /* unlink from both lists; the memory is released through call_rcu() */
        list_del_rcu(&e->list);
        list_del(&e->rule.list);
        call_rcu(&e->rcu, audit_free_rule_rcu);

out:
        mutex_unlock(&audit_filter_mutex);

        if (tree)
                audit_put_tree(tree);        /* that's the temporary one */

        return ret;
}

/* List rules using struct audit_rule_data. */
static void audit_list_rules(int seq, struct sk_buff_head *q)
{
        struct sk_buff *skb;
        struct audit_krule *r;
        int i;

        /* This is a blocking read, so use audit_filter_mutex instead of rcu
         * iterator to sync with list writers. */
        for (i = 0; i < AUDIT_NR_FILTERS; i++) {
                list_for_each_entry(r, &audit_rules_list[i], list) {
                        struct audit_rule_data *data;

                        data = audit_krule_to_data(r);
                        if (unlikely(!data))
                                break;
                        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
                                               data,
                                               struct_size(data, buf, data->buflen));
                        if (skb)
                                skb_queue_tail(q, skb);
                        kfree(data);
                }
        }
        skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
        if (skb)
                skb_queue_tail(q, skb);
}

/* Log rule additions and removals */
static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
        struct audit_buffer *ab;

        if (!audit_enabled)
                return;

        ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_CONFIG_CHANGE);
        if (!ab)
                return;
        audit_log_session_info(ab);
        audit_log_task_context(ab);
        audit_log_format(ab, " op=%s", action);
        audit_log_key(ab, rule->filterkey);
        audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
        audit_log_end(ab);
}

/**
 * audit_rule_change - add or remove an audit rule
 * @type: audit message type (AUDIT_ADD_RULE or AUDIT_DEL_RULE)
 * @seq: netlink audit message sequence (serial) number
 * @data: payload data, parsed as a struct audit_rule_data
 * @datasz: size of payload data
 *
 * Return: 0 on success; -EINVAL (with a warning) for any other @type;
 * otherwise the error from parsing the payload or from adding/removing
 * the rule.
 */
int audit_rule_change(int type, int seq, void *data, size_t datasz)
{
        struct audit_entry *entry;
        int err;

        if (type != AUDIT_ADD_RULE && type != AUDIT_DEL_RULE) {
                WARN_ON(1);
                return -EINVAL;
        }

        entry = audit_data_to_entry(data, datasz);
        if (IS_ERR(entry))
                return PTR_ERR(entry);

        if (type == AUDIT_ADD_RULE) {
                err = audit_add_rule(entry);
                audit_log_rule_change("add_rule", &entry->rule, !err);
                /* a successfully added entry is kept, not released */
                if (!err)
                        return 0;
        } else {
                err = audit_del_rule(entry);
                audit_log_rule_change("remove_rule", &entry->rule, !err);
        }

        /* failed add, or delete (where the entry was only a template) */
        if (entry->rule.exe)
                audit_remove_mark(entry->rule.exe);
        audit_free_rule(entry);

        return err;
}

/**
 * audit_list_rules_send - list the audit rules
 * @request_skb: skb of request we are replying to (used to target the reply)
 * @seq: netlink audit message sequence (serial) number
 *
 * Return: 0 once the sender thread has been started, -ENOMEM if the
 * destination descriptor cannot be allocated, or the kthread_run() error.
 */
int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
        struct audit_netlink_list *dest;
        struct task_struct *tsk;

        /* The replies are queued here and sent from a separate thread.
         * Sending them directly could fill the available socket buffer
         * space and deadlock waiting for auditctl to read from it, which
         * never happens if we are running in the context of auditctl
         * trying to _send_ the request. */

        dest = kmalloc(sizeof(*dest), GFP_KERNEL);
        if (!dest)
                return -ENOMEM;

        skb_queue_head_init(&dest->q);
        dest->portid = NETLINK_CB(request_skb).portid;
        dest->net = get_net(sock_net(NETLINK_CB(request_skb).sk));

        mutex_lock(&audit_filter_mutex);
        audit_list_rules(seq, &dest->q);
        mutex_unlock(&audit_filter_mutex);

        tsk = kthread_run(audit_send_list_thread, dest, "audit_send_list");
        if (!IS_ERR(tsk))
                return 0;

        /* no thread took dest over: undo everything set up above */
        skb_queue_purge(&dest->q);
        put_net(dest->net);
        kfree(dest);

        return PTR_ERR(tsk);
}

/* Apply audit operator @op to two u32 values.  Returns the result of the
 * comparison (for Audit_bitmask the raw value of left & right); an
 * unrecognized operator yields 0. */
int audit_comparator(u32 left, u32 op, u32 right)
{
        int res = 0;

        switch (op) {
        case Audit_equal:
                res = (left == right);
                break;
        case Audit_not_equal:
                res = (left != right);
                break;
        case Audit_lt:
                res = (left < right);
                break;
        case Audit_le:
                res = (left <= right);
                break;
        case Audit_gt:
                res = (left > right);
                break;
        case Audit_ge:
                res = (left >= right);
                break;
        case Audit_bitmask:
                /* any common bit set */
                res = (left & right);
                break;
        case Audit_bittest:
                /* every bit of right set in left */
                res = ((left & right) == right);
                break;
        default:
                break;
        }

        return res;
}

/* Apply audit operator @op to two kuid_t values.  Only the equality and
 * ordering operators are meaningful; anything else yields 0. */
int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
        if (op == Audit_equal)
                return uid_eq(left, right);
        if (op == Audit_not_equal)
                return !uid_eq(left, right);
        if (op == Audit_lt)
                return uid_lt(left, right);
        if (op == Audit_le)
                return uid_lte(left, right);
        if (op == Audit_gt)
                return uid_gt(left, right);
        if (op == Audit_ge)
                return uid_gte(left, right);

        /* Audit_bitmask, Audit_bittest and unknown operators never match */
        return 0;
}

/* Apply audit operator @op to two kgid_t values.  Only the equality and
 * ordering operators are meaningful; anything else yields 0. */
int audit_gid_comparator(kgid_t left, u32 op, kgid_t right)
{
        if (op == Audit_equal)
                return gid_eq(left, right);
        if (op == Audit_not_equal)
                return !gid_eq(left, right);
        if (op == Audit_lt)
                return gid_lt(left, right);
        if (op == Audit_le)
                return gid_lte(left, right);
        if (op == Audit_gt)
                return gid_gt(left, right);
        if (op == Audit_ge)
                return gid_gte(left, right);

        /* Audit_bitmask, Audit_bittest and unknown operators never match */
        return 0;
}

/**
 * parent_len - find the length of the parent portion of a pathname
 * @path: pathname of which to determine length
 *
 * Trailing slashes are ignored when locating the last component.  The
 * returned length includes the slash that separates the parent from the
 * last component; it is 0 when @path is empty or contains no such slash.
 */
int parent_len(const char *path)
{
        int idx;
        int total = strlen(path);

        if (!total)
                return 0;

        idx = total - 1;

        /* disregard trailing slashes */
        for (; idx > 0 && path[idx] == '/'; idx--)
                ;

        /* step back over the last component, stopping at a slash or at
         * the first character */
        for (; idx > 0 && path[idx] != '/'; idx--)
                ;

        /* landed on a slash? then it belongs to the parent part */
        return path[idx] == '/' ? idx + 1 : idx;
}

/**
 * audit_compare_dname_path - compare given dentry name with last component in
 *                               given path. Return of 0 indicates a match.
 * @dname:        dentry name that we're comparing
 * @path:        full pathname that we're comparing
 * @parentlen:        length of the parent if known. Passing in AUDIT_NAME_FULL
 *                 here indicates that we must compute this value.
 *
 * Trailing slashes after the last component of @path are ignored.
 *
 * Return: 0 if the last component of @path equals @dname, non-zero
 * otherwise.
 */
int audit_compare_dname_path(const struct qstr *dname, const char *path, int parentlen)
{
        int dlen, pathlen;
        const char *p;

        dlen = dname->len;
        pathlen = strlen(path);
        /* a path shorter than the name cannot end in it */
        if (pathlen < dlen)
                return 1;

        if (parentlen == AUDIT_NAME_FULL)
                parentlen = parent_len(path);

        p = path + parentlen;

        /*
         * handle trailing slashes; stop once nothing is left of the last
         * component (e.g. path "/", where the parent part covers the whole
         * string) so that we never look at bytes in front of p
         */
        pathlen -= parentlen;
        while (pathlen > 0 && p[pathlen - 1] == '/')
                pathlen--;

        if (pathlen != dlen)
                return 1;

        return memcmp(p, dname->name, dlen);
}

/*
 * audit_filter - run the current task / message type through a filter list
 * @msgtype:  message type compared against AUDIT_MSGTYPE fields
 * @listtype: index into audit_filter_list[] selecting the list to walk
 *
 * The list is walked under rcu_read_lock().  For each entry the fields are
 * evaluated in order; evaluation of an entry stops at the first field that
 * compares as zero.  The first entry whose evaluation ends with a positive
 * result ends the walk: the return value becomes 0 if that entry's action
 * is AUDIT_NEVER or @listtype is AUDIT_FILTER_EXCLUDE.
 *
 * An unhandled field type or a negative comparison result aborts the walk
 * with the return value unchanged.
 *
 * Return: 1 (the default) unless a matching entry set it to 0 as above.
 */
int audit_filter(int msgtype, unsigned int listtype)
{
        struct audit_entry *entry;
        int verdict = 1; /* Audit by default */

        rcu_read_lock();
        list_for_each_entry_rcu(entry, &audit_filter_list[listtype], list) {
                int match = 0;
                int idx;

                for (idx = 0; idx < entry->rule.field_count; idx++) {
                        struct audit_field *fld = &entry->rule.fields[idx];
                        struct lsm_prop prop = { };

                        switch (fld->type) {
                        case AUDIT_PID:
                                match = audit_comparator(task_tgid_nr(current),
                                                         fld->op, fld->val);
                                break;
                        case AUDIT_UID:
                                match = audit_uid_comparator(current_uid(),
                                                             fld->op, fld->uid);
                                break;
                        case AUDIT_GID:
                                match = audit_gid_comparator(current_gid(),
                                                             fld->op, fld->gid);
                                break;
                        case AUDIT_LOGINUID:
                                match = audit_uid_comparator(audit_get_loginuid(current),
                                                             fld->op, fld->uid);
                                break;
                        case AUDIT_LOGINUID_SET:
                                match = audit_comparator(audit_loginuid_set(current),
                                                         fld->op, fld->val);
                                break;
                        case AUDIT_MSGTYPE:
                                match = audit_comparator(msgtype, fld->op, fld->val);
                                break;
                        case AUDIT_SUBJ_USER:
                        case AUDIT_SUBJ_ROLE:
                        case AUDIT_SUBJ_TYPE:
                        case AUDIT_SUBJ_SEN:
                        case AUDIT_SUBJ_CLR:
                                /* no LSM rule attached: keep the previous result */
                                if (!fld->lsm_rule)
                                        break;
                                security_current_getlsmprop_subj(&prop);
                                match = security_audit_rule_match(&prop, fld->type,
                                                                  fld->op,
                                                                  fld->lsm_rule);
                                break;
                        case AUDIT_EXE:
                                match = audit_exe_compare(current, entry->rule.exe);
                                if (fld->op == Audit_not_equal)
                                        match = !match;
                                break;
                        default:
                                /* field type not handled here: give up */
                                goto out_unlock;
                        }

                        if (match < 0) /* error */
                                goto out_unlock;
                        if (match == 0)
                                break; /* this entry does not apply */
                }

                /* entries with no positive result are passed over */
                if (match <= 0)
                        continue;

                if (listtype == AUDIT_FILTER_EXCLUDE ||
                    entry->rule.action == AUDIT_NEVER)
                        verdict = 0;
                break;
        }
out_unlock:
        rcu_read_unlock();
        return verdict;
}

/*
 * update_lsm_rule - replace one rule with a freshly duplicated copy
 * @r: rule embedded in the audit_entry to be replaced
 *
 * Does nothing (returns 0) when security_audit_rule_known() reports false
 * for @r.  Otherwise the rule is duplicated with audit_dupe_rule() and any
 * exe mark on the old entry is removed.  On success the copy takes the old
 * entry's place on its lists; if duplication failed the old entry is simply
 * unlinked.  Either way the old entry is handed to call_rcu() for freeing.
 *
 * Return: 0 on success, or the PTR_ERR() value from the failed duplication.
 */
static int update_lsm_rule(struct audit_krule *r)
{
        struct audit_entry *old = container_of(r, struct audit_entry, rule);
        struct audit_entry *fresh;
        int rc = 0;

        if (!security_audit_rule_known(r))
                return 0;

        fresh = audit_dupe_rule(r);
        if (old->rule.exe)
                audit_remove_mark(old->rule.exe);

        if (!IS_ERR(fresh)) {
                /* swap the copy in wherever the old rule was linked */
                if (r->watch || r->tree)
                        list_replace_init(&r->rlist, &fresh->rule.rlist);
                list_replace_rcu(&old->list, &fresh->list);
                list_replace(&r->list, &fresh->rule.list);
        } else {
                /* no replacement available: report and drop the old rule */
                rc = PTR_ERR(fresh);
                audit_panic("error updating LSM filters");
                if (r->watch)
                        list_del(&r->rlist);
                list_del_rcu(&old->list);
                list_del(&r->list);
        }
        call_rcu(&old->rcu, audit_free_rule_rcu);

        return rc;
}

/* Re-initialize the lsm_rule field of every applicable rule.
 * Each of the AUDIT_NR_FILTERS rule lists is traversed and every rule is
 * passed to update_lsm_rule(), which copies a rule containing LSM specific
 * filter fields, re-initializes the LSM field and replaces the old rule
 * with the updated one.  The first non-zero result is returned; later
 * rules are still processed after an error. */
int audit_update_lsm_rules(void)
{
        struct audit_krule *rule, *next;
        int first_err = 0;
        int idx;

        /* writers are serialized by audit_filter_mutex */
        mutex_lock(&audit_filter_mutex);

        for (idx = 0; idx < AUDIT_NR_FILTERS; idx++) {
                /* _safe variant: update_lsm_rule() unlinks the current rule */
                list_for_each_entry_safe(rule, next, &audit_rules_list[idx], list) {
                        int rc = update_lsm_rule(rule);

                        if (rc && !first_err)
                                first_err = rc;
                }
        }

        mutex_unlock(&audit_filter_mutex);

        return first_err;
}
























































































































   21 
   22 





































































































































































   22 













   22 
   22 













   22 








   22 

   22 

    1 

   22 










   22 

   22 










   22 

















   22 






















































































































































































   22 


















































































































































































































































































































































































































































































































































































































































































































































































   21 
























































































   21 


   21 



   21 






























   20 




























    2 


   21 








   21 























   21 







    2 




















   21 




















































































































































































































































































































































































   21 












   21 

























   21 
























   21 

































   22 





   22 












    1 




    1 















    1 








































































































   22 











   22 


























   22 
   22 
























   22 






   22 







   22 

   22 




   22 








   22 














   22 










   22 
















   22 

   22 


   22 


   22 


   22 



























   22 










   22 







   22 






   22 



   22 








































   22 












   22 







   22 

    1 



   22 



   22 



   22 

    1 














    1 
   22 
   22 












   22 





   21 











































































































































































































































































































































































































   22 

    1 

   21 












   22 

   22 
































































































   22 






















   22 










   21 







   21 
























   21 
   20 



   21 
   21 



























   22 








   22 










   21 



























   21 














   21 









































   22 















   22 
   22 








   22 


   22 








   22 











   21 
   22 

   22 




   21 





   22 
   22 

































   22 
















   22 















   22 




   22 

























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































   22 















    2 

   22 


   22 

















































    1 






























































































































































































































































































































































   22 



















































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/kernel/printk.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 * Modified to make sys_syslog() more flexible: added commands to
 * return the last 4k of kernel messages, regardless of whether
 * they've been read or not.  Added option to suppress kernel printk's
 * to the console.  Added hook for sending the console messages
 * elsewhere, in preparation for a serial line console (someday).
 * Ted Ts'o, 2/11/93.
 * Modified for sysctl support, 1/8/97, Chris Horn.
 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
 *     manfred@colorfullife.com
 * Rewrote bits to get rid of console_lock
 *        01Mar01 Andrew Morton
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/nmi.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/syscore_ops.h>
#include <linux/vmcore_info.h>
#include <linux/ratelimit.h>
#include <linux/kmsg_dump.h>
#include <linux/syslog.h>
#include <linux/cpu.h>
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/ctype.h>
#include <linux/uio.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/task_stack.h>

#include <linux/uaccess.h>
#include <asm/sections.h>

#include <trace/events/initcall.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>

#include "printk_ringbuffer.h"
#include "console_cmdline.h"
#include "braille.h"
#include "internal.h"

/*
 * Console log level settings. The array has four slots; the comment next
 * to each initializer names the setting stored in that slot.
 */
int console_printk[4] = {
        CONSOLE_LOGLEVEL_DEFAULT,        /* console_loglevel */
        MESSAGE_LOGLEVEL_DEFAULT,        /* default_message_loglevel */
        CONSOLE_LOGLEVEL_MIN,                /* minimum_console_loglevel */
        CONSOLE_LOGLEVEL_DEFAULT,        /* default_console_loglevel */
};
EXPORT_SYMBOL_GPL(console_printk);

atomic_t ignore_console_lock_warning __read_mostly = ATOMIC_INIT(0);
EXPORT_SYMBOL(ignore_console_lock_warning);

EXPORT_TRACEPOINT_SYMBOL_GPL(console);

/*
 * Low level drivers may need that to know if they can schedule in
 * their unblank() callback or not. So let's export it.
 */
int oops_in_progress;
EXPORT_SYMBOL(oops_in_progress);

/*
 * console_mutex protects console_list updates and console->flags updates.
 * The flags are synchronized only for consoles that are registered, i.e.
 * accessible via the console list.
 */
static DEFINE_MUTEX(console_mutex);

/*
 * console_sem protects updates to console->seq
 * and also provides serialization for console printing.
 */
static DEFINE_SEMAPHORE(console_sem, 1);
HLIST_HEAD(console_list);
EXPORT_SYMBOL_GPL(console_list);
DEFINE_STATIC_SRCU(console_srcu);

/*
 * System may need to suppress printk message under certain
 * circumstances, like after kernel panic happens.
 */
int __read_mostly suppress_printk;

#ifdef CONFIG_LOCKDEP
static struct lockdep_map console_lock_dep_map = {
        .name = "console_lock"
};

/* Lockdep assertion that the caller holds console_mutex (the console list lock). */
void lockdep_assert_console_list_lock_held(void)
{
        lockdep_assert_held(&console_mutex);
}
EXPORT_SYMBOL(lockdep_assert_console_list_lock_held);
#endif

#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Debug helper: report whether srcu_read_lock_held() considers the caller
 * to be inside a console_srcu read-side critical section.
 */
bool console_srcu_read_lock_is_held(void)
{
        return srcu_read_lock_held(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock_is_held);
#endif

/* Bit positions used to build the DEVKMSG_LOG_MASK_* values below. */
enum devkmsg_log_bits {
        __DEVKMSG_LOG_BIT_ON = 0,
        __DEVKMSG_LOG_BIT_OFF,
        __DEVKMSG_LOG_BIT_LOCK,
};

/*
 * Mask values stored in devkmsg_log:
 *   ON   - userspace logging via /dev/kmsg is not ratelimited
 *   OFF  - userspace logging via /dev/kmsg is ignored
 *   LOCK - the setting came from the kernel command line and can no
 *          longer be changed through sysctl
 */
enum devkmsg_log_masks {
        DEVKMSG_LOG_MASK_ON             = BIT(__DEVKMSG_LOG_BIT_ON),
        DEVKMSG_LOG_MASK_OFF            = BIT(__DEVKMSG_LOG_BIT_OFF),
        DEVKMSG_LOG_MASK_LOCK           = BIT(__DEVKMSG_LOG_BIT_LOCK),
};

/* Keep both the 'on' and 'off' bits clear, i.e. ratelimit by default: */
#define DEVKMSG_LOG_MASK_DEFAULT        0

static unsigned int __read_mostly devkmsg_log = DEVKMSG_LOG_MASK_DEFAULT;

/*
 * Parse a devkmsg mode keyword and apply it to devkmsg_log.
 *
 * @str is matched by prefix against "on", "off" and "ratelimit", in that
 * order. On a match devkmsg_log is overwritten with the corresponding mask
 * and the length of the matched keyword is returned, so the caller can
 * detect trailing characters. Returns -EINVAL for NULL or unknown input.
 */
static int __control_devkmsg(char *str)
{
        unsigned int mode;
        size_t matched;

        if (!str)
                return -EINVAL;

        matched = str_has_prefix(str, "on");
        mode = DEVKMSG_LOG_MASK_ON;

        if (!matched) {
                matched = str_has_prefix(str, "off");
                mode = DEVKMSG_LOG_MASK_OFF;
        }

        if (!matched) {
                matched = str_has_prefix(str, "ratelimit");
                mode = DEVKMSG_LOG_MASK_DEFAULT;
        }

        if (!matched)
                return -EINVAL;

        devkmsg_log = mode;
        return matched;
}

/*
 * Handler for the "printk.devkmsg=" kernel command line parameter.
 *
 * Applies the requested mode, mirrors it into the sysctl string and then
 * sets the LOCK bit. Always returns 1; a bad option string is only
 * reported with a warning and leaves the current setting untouched.
 */
static int __init control_devkmsg(char *str)
{
        if (__control_devkmsg(str) < 0) {
                pr_warn("printk.devkmsg: bad option string '%s'\n", str);
                return 1;
        }

        /*
         * Keep the sysctl string in sync with the new mode. Nothing to do
         * for "ratelimit", which is the initial content of the string.
         */
        switch (devkmsg_log) {
        case DEVKMSG_LOG_MASK_ON:
                strscpy(devkmsg_log_str, "on");
                break;
        case DEVKMSG_LOG_MASK_OFF:
                strscpy(devkmsg_log_str, "off");
                break;
        default:
                break;
        }

        /*
         * A value given on the kernel command line is meant to be permanent
         * for the whole runtime of the system. Setting the LOCK bit is a
         * precautionary measure that makes the sysctl handler reject any
         * later attempt by userspace to change it.
         */
        devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;

        return 1;
}
__setup("printk.devkmsg=", control_devkmsg);

/*
 * Textual form of the devkmsg mode, kept in sync with devkmsg_log by
 * control_devkmsg() and parsed by the sysctl handler. Starts out as
 * "ratelimit", matching DEVKMSG_LOG_MASK_DEFAULT.
 */
char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit";
#if defined(CONFIG_PRINTK) && defined(CONFIG_SYSCTL)
/*
 * sysctl handler for the devkmsg mode string.
 *
 * Reads are passed straight to proc_dostring(). Writes are refused with
 * -EINVAL once the mode has been locked from the kernel command line.
 * Otherwise the new string is stored by proc_dostring() and then parsed;
 * if it is not exactly one of the known keywords, both the mode and the
 * string are rolled back and -EINVAL is returned.
 *
 * NOTE(review): this relies on @table pointing proc_dostring() at
 * devkmsg_log_str; the table definition is not visible in this section.
 */
int devkmsg_sysctl_set_loglvl(const struct ctl_table *table, int write,
                              void *buffer, size_t *lenp, loff_t *ppos)
{
        char saved_str[DEVKMSG_STR_MAX_SIZE];
        unsigned int saved_mode;
        int rc;

        if (!write)
                return proc_dostring(table, write, buffer, lenp, ppos);

        if (devkmsg_log & DEVKMSG_LOG_MASK_LOCK)
                return -EINVAL;

        /* Remember the current state so a bad write can be undone. */
        saved_mode = devkmsg_log;
        strscpy(saved_str, devkmsg_log_str);

        rc = proc_dostring(table, write, buffer, lenp, ppos);
        if (rc)
                return rc;

        /*
         * Accept the write only if the string is a known keyword and the
         * keyword length plus one equals *lenp, i.e. nothing follows the
         * keyword apart from the single extra byte counted in *lenp.
         */
        rc = __control_devkmsg(devkmsg_log_str);
        if (rc >= 0 && rc + 1 == *lenp)
                return 0;

        /* Unknown keyword or trailing garbage: restore the old setting. */
        devkmsg_log = saved_mode;
        strscpy(devkmsg_log_str, saved_str);

        return -EINVAL;
}
#endif /* CONFIG_PRINTK && CONFIG_SYSCTL */

/**
 * console_list_lock - Lock the console list
 *
 * For console list or console->flags updates
 *
 * Takes console_mutex. Must not be called from within a console_srcu
 * read-side critical section; a one-time warning is issued if that can be
 * detected.
 */
void console_list_lock(void)
{
        /*
         * In unregister_console() and console_force_preferred_locked(),
         * synchronize_srcu() is called with the console_list_lock held.
         * Therefore it is not allowed that the console_list_lock is taken
         * with the srcu_lock held.
         *
         * Detecting if this context is really in the read-side critical
         * section is only possible if the appropriate debug options are
         * enabled.
         */
        WARN_ON_ONCE(debug_lockdep_rcu_enabled() &&
                     srcu_read_lock_held(&console_srcu));

        mutex_lock(&console_mutex);
}
EXPORT_SYMBOL(console_list_lock);

/**
 * console_list_unlock - Unlock the console list
 *
 * Counterpart to console_list_lock(); releases console_mutex.
 */
void console_list_unlock(void)
{
        mutex_unlock(&console_mutex);
}
EXPORT_SYMBOL(console_list_unlock);

/**
 * console_srcu_read_lock - Register a new reader for the
 *        SRCU-protected console list
 *
 * Use for_each_console_srcu() to iterate the console list
 *
 * Implemented with the NMI-safe SRCU read lock variant on console_srcu.
 *
 * Context: Any context.
 * Return: A cookie to pass to console_srcu_read_unlock().
 */
int console_srcu_read_lock(void)
        __acquires(&console_srcu)
{
        return srcu_read_lock_nmisafe(&console_srcu);
}
EXPORT_SYMBOL(console_srcu_read_lock);

/**
 * console_srcu_read_unlock - Unregister an old reader from
 *        the SRCU-protected console list
 * @cookie: cookie returned from console_srcu_read_lock()
 *
 * Counterpart to console_srcu_read_lock(); uses the matching NMI-safe
 * SRCU unlock variant.
 */
void console_srcu_read_unlock(int cookie)
        __releases(&console_srcu)
{
        srcu_read_unlock_nmisafe(&console_srcu, cookie);
}
EXPORT_SYMBOL(console_srcu_read_unlock);

/*
 * Helper macros to handle lockdep when locking/unlocking console_sem. We use
 * macros instead of functions so that _RET_IP_ contains useful information.
 *
 * down_console_sem() acquires console_sem with down() and then records the
 * acquisition in console_lock_dep_map.
 */
#define down_console_sem() do { \
        down(&console_sem);\
        mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\
} while (0)

/*
 * Try to take console_sem without blocking.
 *
 * @ip: caller address recorded with the lockdep acquisition.
 *
 * Returns 0 when the semaphore was taken (and the acquisition was recorded
 * in console_lock_dep_map), 1 when it could not be taken.
 */
static int __down_trylock_console_sem(unsigned long ip)
{
        int lock_failed;
        unsigned long flags;

        /*
         * Here and in __up_console_sem() we need to be in safe mode,
         * because spindump/WARN/etc from under console ->lock will
         * deadlock in printk()->down_trylock_console_sem() otherwise.
         */
        printk_safe_enter_irqsave(flags);
        lock_failed = down_trylock(&console_sem);
        printk_safe_exit_irqrestore(flags);

        if (lock_failed)
                return 1;
        /* Only tell lockdep about the lock once it is really held. */
        mutex_acquire(&console_lock_dep_map, 0, 1, ip);
        return 0;
}
/* Wrapper that passes the caller's return address as @ip. */
#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_)

/*
 * Release console_sem.
 *
 * @ip: caller address passed to the lockdep release.
 *
 * The lockdep release is done first; the semaphore itself is released
 * inside a printk-safe section (see __down_trylock_console_sem() for why).
 */
static void __up_console_sem(unsigned long ip)
{
        unsigned long flags;

        mutex_release(&console_lock_dep_map, ip);

        printk_safe_enter_irqsave(flags);
        up(&console_sem);
        printk_safe_exit_irqrestore(flags);
}
/* Wrapper that passes the caller's return address as @ip. */
#define up_console_sem() __up_console_sem(_RET_IP_)

/* Return true if panic_cpu holds a valid CPU, i.e. a panic is in progress. */
static bool panic_in_progress(void)
{
        return unlikely(atomic_read(&panic_cpu) != PANIC_CPU_INVALID);
}

/*
 * Return true if a panic is in progress on the current CPU, i.e. panic_cpu
 * equals the ID of the CPU executing this code.
 */
bool this_cpu_in_panic(void)
{
        /*
         * We can use raw_smp_processor_id() here because it is impossible for
         * the task to be migrated to the panic_cpu, or away from it. If
         * panic_cpu has already been set, and we're not currently executing on
         * that CPU, then we never will be.
         */
        return unlikely(atomic_read(&panic_cpu) == raw_smp_processor_id());
}

/*
 * Return true if a panic is in progress on a remote CPU.
 *
 * On true, the local CPU should immediately release any printing resources
 * that may be needed by the panic CPU.
 */
bool other_cpu_in_panic(void)
{
        return (panic_in_progress() && !this_cpu_in_panic());
}

/*
 * This is used for debugging the mess that is the VT code by
 * keeping track if we have the console semaphore held. It's
 * definitely not the perfect debug tool (we don't know if _WE_
 * hold it and are racing, but it helps tracking those weird code
 * paths in the console code where we end up in places I want
 * locked without the console semaphore held).
 */
static int console_locked;

/*
 *        Array of consoles built from command line options (console=)
 */

#define MAX_CMDLINECONSOLES 8

static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];

static int preferred_console = -1;
int console_set_on_cmdline;
EXPORT_SYMBOL(console_set_on_cmdline);

/* Flag: console code may call schedule() */
static int console_may_schedule;

/* Bit flags for console_msg_format; DEFAULT means no flag is set. */
enum con_msg_format_flags {
        MSG_FORMAT_DEFAULT        = 0,
        MSG_FORMAT_SYSLOG        = (1 << 0),
};

static int console_msg_format = MSG_FORMAT_DEFAULT;

/*
 * The printk log buffer consists of a sequenced collection of records, each
 * containing variable length message text. Every record also contains its
 * own meta-data (@info).
 *
 * Every record meta-data carries the timestamp in microseconds, as well as
 * the standard userspace syslog level and syslog facility. The usual kernel
 * messages use LOG_KERN; userspace-injected messages always carry a matching
 * syslog facility, by default LOG_USER. The origin of every message can be
 * reliably determined that way.
 *
 * The human readable log message of a record is available in @text, the
 * length of the message text in @text_len. The stored message is not
 * terminated.
 *
 * Optionally, a record can carry a dictionary of properties (key/value
 * pairs), to provide userspace with a machine-readable message context.
 *
 * Examples for well-defined, commonly used property names are:
 *   DEVICE=b12:8               device identifier
 *                                b12:8         block dev_t
 *                                c127:3        char dev_t
 *                                n8            netdev ifindex
 *                                +sound:card0  subsystem:devname
 *   SUBSYSTEM=pci              driver-core subsystem name
 *
 * Valid characters in property names are [a-zA-Z0-9.-_]. Property names
 * and values are terminated by a '\0' character.
 *
 * Example of record values:
 *   record.text_buf                = "it's a line" (unterminated)
 *   record.info.seq                = 56
 *   record.info.ts_nsec            = 36863
 *   record.info.text_len           = 11
 *   record.info.facility           = 0 (LOG_KERN)
 *   record.info.flags              = 0
 *   record.info.level              = 3 (LOG_ERR)
 *   record.info.caller_id          = 299 (task 299)
 *   record.info.dev_info.subsystem = "pci" (terminated)
 *   record.info.dev_info.device    = "+pci:0000:00:01.0" (terminated)
 *
 * The 'struct printk_info' buffer must never be directly exported to
 * userspace, it is a kernel-private implementation detail that might
 * need to be changed in the future, when the requirements change.
 *
 * /dev/kmsg exports the structured data in the following line format:
 *   "<level>,<sequnum>,<timestamp>,<contflag>[,additional_values, ... ];<message text>\n"
 *
 * Users of the export format should ignore possible additional values
 * separated by ',', and find the message after the ';' character.
 *
 * The optional key/value pairs are attached as continuation lines starting
 * with a space character and terminated by a newline. All possible
 * non-printable characters are escaped in the "\xff" notation.
 */

/* syslog_lock protects syslog_* variables and write access to clear_seq. */
static DEFINE_MUTEX(syslog_lock);

/*
 * Specifies if a legacy console is registered. If legacy consoles are
 * present, it is necessary to perform the console lock/unlock dance
 * whenever console flushing should occur.
 */
bool have_legacy_console;

/*
 * Specifies if an nbcon console is registered. If nbcon consoles are present,
 * synchronous printing of legacy consoles will not occur during panic until
 * the backtrace has been stored to the ringbuffer.
 */
bool have_nbcon_console;

/*
 * Specifies if a boot console is registered. If boot consoles are present,
 * nbcon consoles cannot print simultaneously and must be synchronized by
 * the console lock. This is because boot consoles and nbcon consoles may
 * have mapped the same hardware.
 */
bool have_boot_console;

/* See printk_legacy_allow_panic_sync() for details. */
bool legacy_allow_panic_sync;

#ifdef CONFIG_PRINTK
DECLARE_WAIT_QUEUE_HEAD(log_wait);
static DECLARE_WAIT_QUEUE_HEAD(legacy_wait);
/* All 3 protected by @syslog_lock. */
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static size_t syslog_partial;
static bool syslog_time;

/* True when _all_ printer threads are available for printing. */
bool printk_kthreads_running;

/*
 * A u64 sequence number that can be read without locking. Two copies of
 * the value are kept and the latch selects which one a reader uses; see
 * latched_seq_write() and latched_seq_read_nolock().
 */
struct latched_seq {
        /* Latch sequence counter; its low bit selects the copy to read. */
        seqcount_latch_t        latch;
        /* The two copies of the value. */
        u64                        val[2];
};

/*
 * The next printk record to read after the last 'clear' command. There are
 * two copies (updated with seqcount_latch) so that reads can locklessly
 * access a valid value. Writers are synchronized by @syslog_lock.
 */
static struct latched_seq clear_seq = {
        .latch                = SEQCNT_LATCH_ZERO(clear_seq.latch),
        .val[0]                = 0,
        .val[1]                = 0,
};

/*
 * Split a syslog prefix value: the low 3 bits are the log level, the
 * following 8 bits are the facility.
 */
#define LOG_LEVEL(v)                ((v) & 0x07)
#define LOG_FACILITY(v)                ((v) >> 3 & 0xff)

/* record buffer */
#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
#define LOG_BUF_LEN_MAX ((u32)1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;

/*
 * Define the average message size. This only affects the number of
 * descriptors that will be available. Underestimating is better than
 * overestimating (too many available descriptors is better than not enough).
 */
#define PRB_AVGBITS 5        /* 32 character average length */

#if CONFIG_LOG_BUF_SHIFT <= PRB_AVGBITS
#error CONFIG_LOG_BUF_SHIFT value too small.
#endif
_DEFINE_PRINTKRB(printk_rb_static, CONFIG_LOG_BUF_SHIFT - PRB_AVGBITS,
                 PRB_AVGBITS, &__log_buf[0]);

static struct printk_ringbuffer printk_rb_dynamic;

struct printk_ringbuffer *prb = &printk_rb_static;

/*
 * We cannot access per-CPU data (e.g. per-CPU flush irq_work) before
 * per_cpu_areas are initialised. This variable is set to true when
 * it's safe to access per-CPU data.
 */
static bool __printk_percpu_data_ready __ro_after_init;

/* Return true once it is safe for printk code to access per-CPU data. */
bool printk_percpu_data_ready(void)
{
        return __printk_percpu_data_ready;
}

/*
 * Store @val into both copies of @ls.
 *
 * The latch is advanced between the two stores, so the copies are never
 * both being modified at once and a lockless reader can always find one
 * valid copy. The statement order must therefore not be changed.
 *
 * Must be called under syslog_lock.
 */
static void latched_seq_write(struct latched_seq *ls, u64 val)
{
        write_seqcount_latch_begin(&ls->latch);
        ls->val[0] = val;
        write_seqcount_latch(&ls->latch);
        ls->val[1] = val;
        write_seqcount_latch_end(&ls->latch);
}

/*
 * Read the value of @ls without taking any lock.
 *
 * The low bit of the latch sequence selects which of the two copies to
 * read; the read is repeated if the latch changed in the meantime.
 *
 * Can be called from any context.
 */
static u64 latched_seq_read_nolock(struct latched_seq *ls)
{
        unsigned int seq;
        unsigned int idx;
        u64 val;

        do {
                seq = read_seqcount_latch(&ls->latch);
                idx = seq & 0x1;
                val = ls->val[idx];
        } while (read_seqcount_latch_retry(&ls->latch, seq));

        return val;
}

/*
 * Return log buffer address, i.e. the current value of log_buf (which
 * initially points at the static __log_buf array).
 */
char *log_buf_addr_get(void)
{
        return log_buf;
}

/*
 * Return log buffer size, i.e. the current value of log_buf_len (initially
 * __LOG_BUF_LEN).
 */
u32 log_buf_len_get(void)
{
        return log_buf_len;
}

/*
 * Define how much of the log buffer we could take at maximum. The value
 * must be greater than two. Note that only half of the buffer is available
 * when the index points to the middle.
 */
#define MAX_LOG_TAKE_PART 4
static const char trunc_msg[] = "<truncated>";

/*
 * Shrink an oversized message so it can be stored.
 *
 * @text_len:      in/out, length of the message text
 * @trunc_msg_len: out, length of the truncation marker to append, or 0
 *
 * The text is capped at log_buf_len / MAX_LOG_TAKE_PART so that a single
 * message cannot occupy the whole buffer (it might otherwise be removed
 * too soon). If the remaining text is at least as long as trunc_msg, room
 * for the marker is carved out of the text; otherwise no marker is used.
 */
static void truncate_msg(u16 *text_len, u16 *trunc_msg_len)
{
        const u32 cap = log_buf_len / MAX_LOG_TAKE_PART;

        *text_len = (*text_len > cap) ? cap : *text_len;

        *trunc_msg_len = strlen(trunc_msg);

        /* Not enough text to sacrifice for the marker: drop the marker. */
        if (*text_len < *trunc_msg_len) {
                *trunc_msg_len = 0;
                return;
        }

        *text_len -= *trunc_msg_len;
}

int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);

/*
 * Decide whether syslog action @type requires privilege.
 *
 * Returns 1 if the action is restricted and 0 if it is open to everybody.
 * With dmesg_restrict set every action is restricted; otherwise only
 * "read all" and "get buffer size" are unrestricted.
 */
static int syslog_action_restricted(int type)
{
        if (dmesg_restrict)
                return 1;

        switch (type) {
        case SYSLOG_ACTION_READ_ALL:
        case SYSLOG_ACTION_SIZE_BUFFER:
                return 0;
        default:
                return 1;
        }
}

/*
 * Check whether the caller may perform syslog action @type.
 *
 * @type:   SYSLOG_ACTION_* being requested
 * @source: where the request comes from (e.g. SYSLOG_FROM_PROC)
 *
 * Returns -EPERM when the action is restricted and the caller lacks
 * CAP_SYSLOG, otherwise the result of security_syslog().
 */
static int check_syslog_permissions(int type, int source)
{
        /*
         * For /proc/kmsg the capability checks were already done at open
         * time, so only the open itself is checked here for that source.
         */
        if (!(source == SYSLOG_FROM_PROC && type != SYSLOG_ACTION_OPEN) &&
            syslog_action_restricted(type) &&
            !capable(CAP_SYSLOG))
                return -EPERM;

        return security_syslog(type);
}

/*
 * Append @c at the write cursor *@pp and advance the cursor, unless the
 * cursor has already reached the end pointer @e, in which case nothing
 * is written.
 */
static void append_char(char **pp, char *e, char c)
{
        char *cur = *pp;

        if (cur >= e)
                return;

        *cur = c;
        *pp = cur + 1;
}

/*
 * Format the extended record header
 * "<prefix>,<seq>,<timestamp usec>,<cont flag>[,caller=...];" into @buf.
 *
 * @buf:  destination buffer
 * @size: size of @buf
 * @info: meta-data of the record
 *
 * The prefix is (facility << 3) | level and the continuation flag is 'c'
 * if LOG_CONT is set, '-' otherwise. With CONFIG_PRINTK_CALLER a caller
 * field is added, tagged 'C' when the top bit of caller_id is set and 'T'
 * otherwise. Returns what scnprintf() returns.
 */
static ssize_t info_print_ext_header(char *buf, size_t size,
                                     struct printk_info *info)
{
        char caller[20];
        u64 usecs = info->ts_nsec;
        const char cont_mark = (info->flags & LOG_CONT) ? 'c' : '-';

        /* do_div() divides in place: nanoseconds become microseconds. */
        do_div(usecs, 1000);

#ifdef CONFIG_PRINTK_CALLER
        snprintf(caller, sizeof(caller), ",caller=%c%u",
                 info->caller_id & 0x80000000 ? 'C' : 'T',
                 info->caller_id & ~0x80000000);
#else
        caller[0] = '\0';
#endif

        return scnprintf(buf, size, "%u,%llu,%llu,%c%s;",
                         (info->facility << 3) | info->level, info->seq,
                         usecs, cont_mark, caller);
}

/*
 * Copy @text_len bytes of @text into @buf, escaping every byte that is
 * below ' ', at or above 127, or a backslash as "\xNN", and finish with
 * the terminator character @endc.
 *
 * Nothing is written beyond @buf + @size. Returns the number of bytes
 * placed in @buf.
 */
static ssize_t msg_add_ext_text(char *buf, size_t size,
                                const char *text, size_t text_len,
                                unsigned char endc)
{
        char *out = buf;
        char *const limit = buf + size;
        size_t pos = 0;

        while (pos < text_len) {
                unsigned char ch = text[pos++];

                /* Printable characters other than '\\' are copied verbatim. */
                if (ch >= ' ' && ch < 127 && ch != '\\') {
                        append_char(&out, limit, ch);
                        continue;
                }

                out += scnprintf(out, limit - out, "\\x%02x", ch);
        }

        append_char(&out, limit, endc);

        return out - buf;
}

/*
 * Append one dictionary line " KEY=VALUE\n" to @buf, with key and value
 * escaped by msg_add_ext_text().
 *
 * An empty @val produces no output at all. Returns the number of bytes
 * added to @buf.
 */
static ssize_t msg_add_dict_text(char *buf, size_t size,
                                 const char *key, const char *val)
{
        const size_t val_len = strlen(val);
        ssize_t used = 0;

        if (val_len == 0)
                return 0;

        /* A leading space marks the line as a dictionary continuation. */
        used += msg_add_ext_text(buf + used, size - used, "", 0, ' ');
        used += msg_add_ext_text(buf + used, size - used, key, strlen(key), '=');
        used += msg_add_ext_text(buf + used, size - used, val, val_len, '\n');

        return used;
}

/*
 * Format the body of an extended record into @buf: the escaped message
 * text terminated by a newline, followed by the "SUBSYSTEM" and "DEVICE"
 * dictionary lines when @dev_info is given.
 *
 * Returns the number of bytes placed in @buf.
 */
static ssize_t msg_print_ext_body(char *buf, size_t size,
                                  char *text, size_t text_len,
                                  struct dev_printk_info *dev_info)
{
        ssize_t used = msg_add_ext_text(buf, size, text, text_len, '\n');

        if (dev_info) {
                used += msg_add_dict_text(buf + used, size - used,
                                          "SUBSYSTEM", dev_info->subsystem);
                used += msg_add_dict_text(buf + used, size - used,
                                          "DEVICE", dev_info->device);
        }

        return used;
}

/*
 * /dev/kmsg - userspace message inject/listen interface
 *
 * Per-open state, allocated in devkmsg_open() and stored in
 * file->private_data.
 */
struct devkmsg_user {
        /* Sequence number of the next record this reader will get. */
        atomic64_t seq;
        /* Ratelimit state applied to writes when logging is not "on". */
        struct ratelimit_state rs;
        /* Taken by devkmsg_read() for the duration of a read. */
        struct mutex lock;
        /* Buffers handed to printk_get_next_message(); outbuf is what gets copied to userspace. */
        struct printk_buffers pbufs;
};

/*
 * printf-style wrapper around vprintk_emit() used for messages written to
 * /dev/kmsg. Passes @facility and @level through, NULL as the third
 * argument, and returns vprintk_emit()'s result.
 */
static __printf(3, 4) __cold
int devkmsg_emit(int facility, int level, const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = vprintk_emit(facility, level, NULL, fmt, args);
        va_end(args);

        return r;
}

/*
 * Write handler for /dev/kmsg: inject one userspace message into the log.
 *
 * Returns the number of bytes in the write on success, and also when the
 * message is silently dropped because user logging is off or ratelimited.
 * Returns -EINVAL if the write is longer than PRINTKRB_RECORD_MAX,
 * -ENOMEM if the temporary buffer cannot be allocated and -EFAULT if the
 * data cannot be copied in.
 */
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
        struct devkmsg_user *user = iocb->ki_filp->private_data;
        const size_t len = iov_iter_count(from);
        int facility = 1;        /* LOG_USER */
        int level = default_message_loglevel;
        ssize_t ret = len;
        char *buf, *line;

        if (len > PRINTKRB_RECORD_MAX)
                return -EINVAL;

        /* Ignore when user logging is disabled. */
        if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
                return ret;

        /* Ratelimit when not explicitly enabled. */
        if (!(devkmsg_log & DEVKMSG_LOG_MASK_ON) &&
            !___ratelimit(&user->rs, current->comm))
                return ret;

        /* One extra byte so the message can be handled as a C string. */
        buf = kmalloc(len + 1, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        buf[len] = '\0';
        if (!copy_from_iter_full(buf, len, from)) {
                ret = -EFAULT;
                goto out_free;
        }

        /*
         * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace
         * the decimal value represents 32bit, the lower 3 bit are the log
         * level, the rest are the log facility.
         *
         * If no prefix or no userspace facility is specified, we
         * enforce LOG_USER, to be able to reliably distinguish
         * kernel-generated messages from userspace-injected ones.
         */
        line = buf;
        if (buf[0] == '<') {
                char *endp = NULL;
                unsigned int prefix = simple_strtoul(buf + 1, &endp, 10);

                if (endp && *endp == '>') {
                        level = LOG_LEVEL(prefix);
                        if (LOG_FACILITY(prefix) != 0)
                                facility = LOG_FACILITY(prefix);
                        line = endp + 1;
                }
        }

        devkmsg_emit(facility, level, "%s", line);

out_free:
        kfree(buf);
        return ret;
}

/*
 * Read handler for /dev/kmsg: hand at most one formatted record to
 * userspace per call.
 *
 * Returns the length of the record copied to @buf, or:
 *   the mutex_lock_interruptible()/wait_event_interruptible() error if
 *           either of them was interrupted,
 *   -EAGAIN if no record is available and the file is O_NONBLOCK,
 *   -EPIPE  if records were dropped since the last read (the reader is
 *           repositioned to the record that was found),
 *   -EINVAL if @count is smaller than the formatted record,
 *   -EFAULT if copying to userspace failed.
 *
 * Note that the reader's position is advanced past the record before the
 * -EINVAL and -EFAULT checks, so the record is not offered again.
 */
static ssize_t devkmsg_read(struct file *file, char __user *buf,
                            size_t count, loff_t *ppos)
{
        struct devkmsg_user *user = file->private_data;
        char *outbuf = &user->pbufs.outbuf[0];
        struct printk_message pmsg = {
                .pbufs = &user->pbufs,
        };
        ssize_t ret;

        /* Serialize readers sharing this open file. */
        ret = mutex_lock_interruptible(&user->lock);
        if (ret)
                return ret;

        if (!printk_get_next_message(&pmsg, atomic64_read(&user->seq), true, false)) {
                if (file->f_flags & O_NONBLOCK) {
                        ret = -EAGAIN;
                        goto out;
                }

                /*
                 * Guarantee this task is visible on the waitqueue before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * prepare_to_wait_event() pairs with the full memory barrier
                 * within wq_has_sleeper().
                 *
                 * This pairs with __wake_up_klogd:A.
                 */
                ret = wait_event_interruptible(log_wait,
                                printk_get_next_message(&pmsg, atomic64_read(&user->seq), true,
                                                        false)); /* LMM(devkmsg_read:A) */
                if (ret)
                        goto out;
        }

        if (pmsg.dropped) {
                /* our last seen message is gone, return error and reset */
                atomic64_set(&user->seq, pmsg.seq);
                ret = -EPIPE;
                goto out;
        }

        /* The next read starts after the record just fetched. */
        atomic64_set(&user->seq, pmsg.seq + 1);

        if (pmsg.outbuf_len > count) {
                ret = -EINVAL;
                goto out;
        }

        if (copy_to_user(buf, outbuf, pmsg.outbuf_len)) {
                ret = -EFAULT;
                goto out;
        }
        ret = pmsg.outbuf_len;
out:
        mutex_unlock(&user->lock);
        return ret;
}

/*
 * Be careful when modifying this function!!!
 *
 * Only few operations are supported because the device works only with the
 * entire variable length messages (records). Non-standard values are
 * returned in the other cases and has been this way for quite some time.
 * User space applications might depend on this behavior.
 *
 * Summary of the visible behavior:
 *   - any non-zero @offset fails with -ESPIPE;
 *   - SEEK_SET, SEEK_DATA and SEEK_END reposition the reader and return 0;
 *   - any other @whence returns -EINVAL.
 */
static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
{
        struct devkmsg_user *user = file->private_data;
        loff_t ret = 0;

        if (offset)
                return -ESPIPE;

        switch (whence) {
        case SEEK_SET:
                /* the first record */
                atomic64_set(&user->seq, prb_first_valid_seq(prb));
                break;
        case SEEK_DATA:
                /*
                 * The first record after the last SYSLOG_ACTION_CLEAR,
                 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
                 * changes no global state, and does not clear anything.
                 */
                atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq));
                break;
        case SEEK_END:
                /* after the last record */
                atomic64_set(&user->seq, prb_next_seq(prb));
                break;
        default:
                ret = -EINVAL;
        }
        return ret;
}

/*
 * Poll handler for /dev/kmsg.
 *
 * Registers the caller on log_wait and reports readability when a record at
 * or after the reader's sequence number can be read. If the record found is
 * not the one the reader expected, the error/priority bits are raised too.
 */
static __poll_t devkmsg_poll(struct file *file, poll_table *wait)
{
        struct devkmsg_user *user = file->private_data;
        struct printk_info info;

        poll_wait(file, &log_wait, wait);

        /* Nothing to read yet. */
        if (!prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL))
                return 0;

        if (info.seq == atomic64_read(&user->seq))
                return EPOLLIN|EPOLLRDNORM;

        /* return error when data has vanished underneath us */
        return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
}

/*
 * Open handler for /dev/kmsg.
 *
 * Fails with -EPERM when the device is switched off via devkmsg_log, with
 * the error from check_syslog_permissions() for openers that may read, and
 * with -ENOMEM when the per-open context cannot be allocated. On success the
 * context is stored in file->private_data, positioned at the first valid
 * record.
 */
static int devkmsg_open(struct inode *inode, struct file *file)
{
        struct devkmsg_user *user;

        if (devkmsg_log & DEVKMSG_LOG_MASK_OFF)
                return -EPERM;

        /* write-only openers skip the read permission check */
        if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
                int err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
                                                   SYSLOG_FROM_READER);

                if (err)
                        return err;
        }

        user = kvmalloc(sizeof(*user), GFP_KERNEL);
        if (!user)
                return -ENOMEM;

        ratelimit_default_init(&user->rs);
        ratelimit_set_flags(&user->rs, RATELIMIT_MSG_ON_RELEASE);

        mutex_init(&user->lock);

        /* Start reading at the first valid record. */
        atomic64_set(&user->seq, prb_first_valid_seq(prb));

        file->private_data = user;

        return 0;
}

/*
 * Release handler for /dev/kmsg: tear down and free the per-open context
 * that devkmsg_open() stored in file->private_data. Always returns 0.
 */
static int devkmsg_release(struct inode *inode, struct file *file)
{
        struct devkmsg_user *ctx = file->private_data;

        ratelimit_state_exit(&ctx->rs);
        mutex_destroy(&ctx->lock);
        kvfree(ctx);

        return 0;
}

/* File operations backing the /dev/kmsg device. */
const struct file_operations kmsg_fops = {
        /* lifetime of the per-open context */
        .open = devkmsg_open,
        .release = devkmsg_release,
        /* data path */
        .read = devkmsg_read,
        .write_iter = devkmsg_write,
        /* positioning and readiness */
        .llseek = devkmsg_llseek,
        .poll = devkmsg_poll,
};

#ifdef CONFIG_VMCORE_INFO
/*
 * This appends the listed symbols to /proc/vmcore
 *
 * /proc/vmcore is used by various utilities, like crash and makedumpfile to
 * obtain access to symbols that are otherwise very difficult to locate.  These
 * symbols are specifically used so that utilities can access and extract the
 * dmesg log from a vmcore file after a crash.
 *
 * Besides the symbols, the sizes and field offsets of the ringbuffer
 * structures are exported, grouped per structure below.
 */
void log_buf_vmcoreinfo_setup(void)
{
        /* Never dereferenced: only used as a sizeof() operand below. */
        struct dev_printk_info *dev_info = NULL;

        /* Symbols needed to locate the ringbuffer and the clear position. */
        VMCOREINFO_SYMBOL(prb);
        VMCOREINFO_SYMBOL(printk_rb_static);
        VMCOREINFO_SYMBOL(clear_seq);

        /*
         * Export struct size and field offsets. User space tools can
         * parse it and detect any changes to structure down the line.
         */

        /* struct printk_ringbuffer */
        VMCOREINFO_STRUCT_SIZE(printk_ringbuffer);
        VMCOREINFO_OFFSET(printk_ringbuffer, desc_ring);
        VMCOREINFO_OFFSET(printk_ringbuffer, text_data_ring);
        VMCOREINFO_OFFSET(printk_ringbuffer, fail);

        /* struct prb_desc_ring */
        VMCOREINFO_STRUCT_SIZE(prb_desc_ring);
        VMCOREINFO_OFFSET(prb_desc_ring, count_bits);
        VMCOREINFO_OFFSET(prb_desc_ring, descs);
        VMCOREINFO_OFFSET(prb_desc_ring, infos);
        VMCOREINFO_OFFSET(prb_desc_ring, head_id);
        VMCOREINFO_OFFSET(prb_desc_ring, tail_id);

        /* struct prb_desc */
        VMCOREINFO_STRUCT_SIZE(prb_desc);
        VMCOREINFO_OFFSET(prb_desc, state_var);
        VMCOREINFO_OFFSET(prb_desc, text_blk_lpos);

        /* struct prb_data_blk_lpos */
        VMCOREINFO_STRUCT_SIZE(prb_data_blk_lpos);
        VMCOREINFO_OFFSET(prb_data_blk_lpos, begin);
        VMCOREINFO_OFFSET(prb_data_blk_lpos, next);

        /* struct printk_info */
        VMCOREINFO_STRUCT_SIZE(printk_info);
        VMCOREINFO_OFFSET(printk_info, seq);
        VMCOREINFO_OFFSET(printk_info, ts_nsec);
        VMCOREINFO_OFFSET(printk_info, text_len);
        VMCOREINFO_OFFSET(printk_info, caller_id);
        VMCOREINFO_OFFSET(printk_info, dev_info);

        /* struct dev_printk_info, including the sizes of its two members */
        VMCOREINFO_STRUCT_SIZE(dev_printk_info);
        VMCOREINFO_OFFSET(dev_printk_info, subsystem);
        VMCOREINFO_LENGTH(printk_info_subsystem, sizeof(dev_info->subsystem));
        VMCOREINFO_OFFSET(dev_printk_info, device);
        VMCOREINFO_LENGTH(printk_info_device, sizeof(dev_info->device));

        /* struct prb_data_ring */
        VMCOREINFO_STRUCT_SIZE(prb_data_ring);
        VMCOREINFO_OFFSET(prb_data_ring, size_bits);
        VMCOREINFO_OFFSET(prb_data_ring, data);
        VMCOREINFO_OFFSET(prb_data_ring, head_lpos);
        VMCOREINFO_OFFSET(prb_data_ring, tail_lpos);

        /* atomic_long_t */
        VMCOREINFO_SIZE(atomic_long_t);
        VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter);

        /* struct latched_seq */
        VMCOREINFO_STRUCT_SIZE(latched_seq);
        VMCOREINFO_OFFSET(latched_seq, val);
}
#endif

/* log_buf_len requested on the kernel cmdline (0 when nothing is pending) */
static unsigned long __initdata new_log_buf_len;

/*
 * Record a request to grow the log buffer to @size bytes.
 *
 * The ring buffer is scaled by powers of 2: @size is capped at
 * LOG_BUF_LEN_MAX, rounded up to a power of two and remembered in
 * @new_log_buf_len only when it exceeds the current log_buf_len.
 */
static void __init log_buf_len_update(u64 size)
{
        const u64 limit = (u64)LOG_BUF_LEN_MAX;

        if (size > limit) {
                size = limit;
                pr_err("log_buf over 2G is not supported.\n");
        }

        /* Zero stays zero. */
        if (size != 0)
                size = roundup_pow_of_two(size);

        /* Requests that would not enlarge the buffer are ignored. */
        if (size <= log_buf_len)
                return;

        new_log_buf_len = (unsigned long)size;
}

/*
 * Early handler for the "log_buf_len" parameter. It is too early to resize
 * the buffer here, so the parsed size is only remembered for later.
 *
 * Return: 0 on success, -EINVAL when the parameter has no value.
 */
static int __init log_buf_len_setup(char *str)
{
        if (!str)
                return -EINVAL;

        log_buf_len_update(memparse(str, &str));

        return 0;
}
early_param("log_buf_len", log_buf_len_setup);

#ifdef CONFIG_SMP
#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)

/*
 * Request a larger log buffer on systems with many possible CPUs.
 *
 * Every possible CPU beyond the first contributes __LOG_CPU_MAX_BUF_LEN
 * bytes. A new size is requested only when the sum of those contributions
 * exceeds half of the default buffer size.
 */
static void __init log_buf_add_cpu(void)
{
        unsigned int nr_cpus = num_possible_cpus();
        unsigned int cpu_extra;

        /*
         * archs should set up cpu_possible_bits properly with
         * set_cpu_possible() after setup_arch() but just in
         * case lets ensure this is valid.
         */
        if (nr_cpus == 1)
                return;

        cpu_extra = (nr_cpus - 1) * __LOG_CPU_MAX_BUF_LEN;

        /* by default this will only continue through for large > 64 CPUs */
        if (cpu_extra > __LOG_BUF_LEN / 2) {
                pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
                        __LOG_CPU_MAX_BUF_LEN);
                pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
                        cpu_extra);
                pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);

                log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
        }
}
#else /* !CONFIG_SMP */
static inline void log_buf_add_cpu(void) {}
#endif /* CONFIG_SMP */

/*
 * Set the __printk_percpu_data_ready flag. setup_log_buf() calls this when
 * it is invoked with @early == 0.
 */
static void __init set_percpu_data_ready(void)
{
        __printk_percpu_data_ready = true;
}

/*
 * Copy the record @r (text and meta data) into the ringbuffer @rb.
 *
 * Return: the value of prb_record_text_space() for the new entry, or 0 when
 * no space could be reserved in @rb.
 */
static unsigned int __init add_to_rb(struct printk_ringbuffer *rb,
                                     struct printk_record *r)
{
        const struct printk_info *src = r->info;
        struct prb_reserved_entry e;
        struct printk_record dest_r;
        struct printk_info *dst;

        prb_rec_init_wr(&dest_r, src->text_len);

        if (!prb_reserve(&e, rb, &dest_r))
                return 0;

        /* The text first ... */
        memcpy(dest_r.text_buf, r->text_buf, src->text_len);

        /* ... then the meta data, field by field. */
        dst = dest_r.info;
        dst->text_len = src->text_len;
        dst->facility = src->facility;
        dst->level = src->level;
        dst->flags = src->flags;
        dst->ts_nsec = src->ts_nsec;
        dst->caller_id = src->caller_id;
        memcpy(&dst->dev_info, &src->dev_info, sizeof(dst->dev_info));

        prb_final_commit(&e);

        return prb_record_text_space(&e);
}

/* Scratch text buffer used while copying records in setup_log_buf(). */
static char setup_text_buf[PRINTKRB_RECORD_MAX] __initdata;

/*
 * Log the size of the log buffer text data together with the size of the
 * meta data (one struct prb_desc plus one struct printk_info per descriptor).
 */
static void print_log_buf_usage_stats(void)
{
        const size_t per_desc_size = sizeof(struct prb_desc) +
                                     sizeof(struct printk_info);
        unsigned int descs_count = log_buf_len >> PRB_AVGBITS;
        size_t meta_data_size = descs_count * per_desc_size;

        pr_info("log buffer data + meta data: %u + %zu = %zu bytes\n",
                log_buf_len, meta_data_size, log_buf_len + meta_data_size);
}

/*
 * Replace the static boot-time log buffer with a dynamically allocated one
 * of @new_log_buf_len bytes and copy all existing records into it.
 *
 * @early: zero when called after percpu areas are initialised. In that case
 *         set_percpu_data_ready() is called and, unless a new size has
 *         already been requested, log_buf_add_cpu() may request one.
 *
 * Nothing is done when the buffer has already been replaced
 * (log_buf != __log_buf) or when no new size is pending. If any allocation
 * fails, the static buffer stays in use.
 */
void __init setup_log_buf(int early)
{
        struct printk_info *new_infos;
        unsigned int new_descs_count;
        struct prb_desc *new_descs;
        struct printk_info info;
        struct printk_record r;
        unsigned int text_size;
        size_t new_descs_size;
        size_t new_infos_size;
        unsigned long flags;
        char *new_log_buf;
        unsigned int free;
        u64 seq;

        /*
         * Some archs call setup_log_buf() multiple times - first is very
         * early, e.g. from setup_arch(), and second - when percpu_areas
         * are initialised.
         */
        if (!early)
                set_percpu_data_ready();

        /* The dynamic buffer is already in place. */
        if (log_buf != __log_buf)
                return;

        if (!early && !new_log_buf_len)
                log_buf_add_cpu();

        if (!new_log_buf_len) {
                /* Show the memory stats only once. */
                if (!early)
                        goto out;

                return;
        }

        /* Number of descriptors for the new buffer, derived from its size. */
        new_descs_count = new_log_buf_len >> PRB_AVGBITS;
        if (new_descs_count == 0) {
                pr_err("new_log_buf_len: %lu too small\n", new_log_buf_len);
                goto out;
        }

        /* Three allocations: text data, descriptors and record infos. */
        new_log_buf = memblock_alloc(new_log_buf_len, LOG_ALIGN);
        if (unlikely(!new_log_buf)) {
                pr_err("log_buf_len: %lu text bytes not available\n",
                       new_log_buf_len);
                goto out;
        }

        new_descs_size = new_descs_count * sizeof(struct prb_desc);
        new_descs = memblock_alloc(new_descs_size, LOG_ALIGN);
        if (unlikely(!new_descs)) {
                pr_err("log_buf_len: %zu desc bytes not available\n",
                       new_descs_size);
                goto err_free_log_buf;
        }

        new_infos_size = new_descs_count * sizeof(struct printk_info);
        new_infos = memblock_alloc(new_infos_size, LOG_ALIGN);
        if (unlikely(!new_infos)) {
                pr_err("log_buf_len: %zu info bytes not available\n",
                       new_infos_size);
                goto err_free_descs;
        }

        /* Reader record used to pull entries out of the static buffer. */
        prb_rec_init_rd(&r, &info, &setup_text_buf[0], sizeof(setup_text_buf));

        prb_init(&printk_rb_dynamic,
                 new_log_buf, ilog2(new_log_buf_len),
                 new_descs, ilog2(new_descs_count),
                 new_infos);

        /* Copy and switch buffers with local interrupts disabled. */
        local_irq_save(flags);

        log_buf_len = new_log_buf_len;
        log_buf = new_log_buf;
        new_log_buf_len = 0;

        /*
         * @free tracks how much of the static buffer size is left after
         * subtracting the text space of each copied record, clamped at 0.
         */
        free = __LOG_BUF_LEN;
        prb_for_each_record(0, &printk_rb_static, seq, &r) {
                text_size = add_to_rb(&printk_rb_dynamic, &r);
                if (text_size > free)
                        free = 0;
                else
                        free -= text_size;
        }

        /* From here on, new messages go to the dynamic buffer. */
        prb = &printk_rb_dynamic;

        local_irq_restore(flags);

        /*
         * Copy any remaining messages that might have appeared from
         * NMI context after copying but before switching to the
         * dynamic buffer.
         */
        prb_for_each_record(seq, &printk_rb_static, seq, &r) {
                text_size = add_to_rb(&printk_rb_dynamic, &r);
                if (text_size > free)
                        free = 0;
                else
                        free -= text_size;
        }

        /* Report records of the static buffer that were not reached. */
        if (seq != prb_next_seq(&printk_rb_static)) {
                pr_err("dropped %llu messages\n",
                       prb_next_seq(&printk_rb_static) - seq);
        }

        print_log_buf_usage_stats();
        pr_info("early log buf free: %u(%u%%)\n",
                free, (free * 100) / __LOG_BUF_LEN);
        return;

        /* Error unwind: release in reverse order of allocation. */
err_free_descs:
        memblock_free(new_descs, new_descs_size);
err_free_log_buf:
        memblock_free(new_log_buf, new_log_buf_len);
out:
        print_log_buf_usage_stats();
}

/* When true, suppress_message_printing() never suppresses a message. */
static bool __read_mostly ignore_loglevel;

/*
 * Early handler for the "ignore_loglevel" parameter: set the flag and log
 * that the loglevel setting is being ignored. Always returns 0.
 */
static int __init ignore_loglevel_setup(char *str)
{
        ignore_loglevel = true;
        pr_info("debug: ignoring loglevel setting.\n");

        return 0;
}

early_param("ignore_loglevel", ignore_loglevel_setup);
/* Also exposed as a module parameter: world-readable, owner-writable. */
module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel,
                 "ignore loglevel setting (prints all kernel messages to the console)");

static bool suppress_message_printing(int level)
{
        return (level >= console_loglevel && !ignore_loglevel);
}

#ifdef CONFIG_BOOT_PRINTK_DELAY

static int boot_delay; /* msecs delay after each printk during bootup */
static unsigned long long loops_per_msec;        /* based on boot_delay */

/*
 * Early handler for the "boot_delay" parameter (milliseconds to delay after
 * each printk during boot).
 *
 * Computes @loops_per_msec from preset_lpj (or a guess when it is not set)
 * and parses the delay. Values that are negative or larger than 10 seconds
 * are considered erroneous and disable the delay.
 *
 * Always returns 0.
 */
static int __init boot_delay_setup(char *str)
{
        unsigned long lpj;

        lpj = preset_lpj ? preset_lpj : 1000000;        /* some guess */
        loops_per_msec = (unsigned long long)lpj / 1000 * HZ;

        get_option(&str, &boot_delay);
        /*
         * Reject out-of-range values. A negative delay must not get through:
         * boot_delay_msec() multiplies it as unsigned long long, which would
         * turn it into an enormous busy-wait loop count.
         */
        if (boot_delay < 0 || boot_delay > 10 * 1000)
                boot_delay = 0;

        pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, "
                "HZ: %d, loops_per_msec: %llu\n",
                boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
        return 0;
}
early_param("boot_delay", boot_delay_setup);

/*
 * Busy-wait for @boot_delay milliseconds after a printk during boot.
 *
 * No delay happens when boot_delay is 0, once system_state has reached
 * SYSTEM_RUNNING, or when the message of the given @level is suppressed
 * (and printing to the console is not forced).
 */
static void boot_delay_msec(int level)
{
        unsigned long long loops;
        unsigned long deadline;
        bool suppress = !is_printk_force_console() &&
                        suppress_message_printing(level);

        if (boot_delay == 0 || system_state >= SYSTEM_RUNNING)
                return;
        if (suppress)
                return;

        loops = (unsigned long long)loops_per_msec * boot_delay;
        deadline = jiffies + msecs_to_jiffies(boot_delay);

        while (loops--) {
                cpu_relax();
                /*
                 * use (volatile) jiffies to prevent
                 * compiler reduction; loop termination via jiffies
                 * is secondary and may or may not happen.
                 */
                if (time_after(jiffies, deadline))
                        break;
                touch_nmi_watchdog();
        }
}
#else
/* No-op stub used when CONFIG_BOOT_PRINTK_DELAY is not enabled. */
static inline void boot_delay_msec(int level)
{
}
#endif

/*
 * Whether a timestamp is added to the prefix of printed records. Defaults
 * to CONFIG_PRINTK_TIME and is exposed as the module parameter "time"
 * (world-readable, owner-writable).
 */
static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);

/*
 * Write the syslog prefix "<level>" for @level into @buf.
 *
 * Return: number of characters written, not counting the terminator.
 */
static size_t print_syslog(unsigned int level, char *buf)
{
        int written = sprintf(buf, "<%u>", level);

        return written;
}

/*
 * Write the timestamp @ts (nanoseconds) into @buf as "[seconds.microseconds]".
 *
 * Return: number of characters written, not counting the terminator.
 */
static size_t print_time(u64 ts, char *buf)
{
        /* do_div() turns @ts into whole seconds and yields the rest in ns. */
        unsigned long frac_nsec = do_div(ts, 1000000000);
        unsigned long secs = (unsigned long)ts;

        return sprintf(buf, "[%5lu.%06lu]", secs, frac_nsec / 1000);
}

#ifdef CONFIG_PRINTK_CALLER
/*
 * Write the caller id prefix into @buf. The top bit of @id selects the
 * letter ('C' when set, 'T' otherwise); the remaining bits are printed as
 * a decimal number.
 *
 * Return: number of characters written, not counting the terminator.
 */
static size_t print_caller(u32 id, char *buf)
{
        const char kind = (id & 0x80000000) ? 'C' : 'T';
        char caller[12];

        snprintf(caller, sizeof(caller), "%c%u", kind, id & ~0x80000000);

        return sprintf(buf, "[%6s]", caller);
}
#else
#define print_caller(id, buf) 0
#endif

/*
 * Build the line prefix for a record into @buf.
 *
 * @info:   meta data of the record
 * @syslog: add the "<N>" syslog prefix
 * @time:   add the timestamp
 * @buf:    destination buffer
 *
 * The caller id is added when CONFIG_PRINTK_CALLER is enabled. A separating
 * space and a terminator follow whenever a timestamp or a caller id is
 * present.
 *
 * Return: length of the prefix, not counting the terminator.
 */
static size_t info_print_prefix(const struct printk_info *info, bool syslog,
                                bool time, char *buf)
{
        char *pos = buf;

        if (syslog)
                pos += print_syslog((info->facility << 3) | info->level, pos);

        if (time)
                pos += print_time(info->ts_nsec, pos);

        pos += print_caller(info->caller_id, pos);

        if (IS_ENABLED(CONFIG_PRINTK_CALLER) || time) {
                *pos++ = ' ';
                *pos = '\0';
        }

        return pos - buf;
}

/*
 * Prepare the record for printing. The text is shifted within the given
 * buffer to avoid a need for another one. The following operations are
 * done:
 *
 *   - Add prefix for each line.
 *   - Drop truncated lines that no longer fit into the buffer.
 *   - Add the trailing newline that has been removed in vprintk_store().
 *   - Add a string terminator.
 *
 * @r:      record to prepare; @r->text_buf is modified in place
 * @syslog: include the syslog prefix in each line prefix
 * @time:   include the timestamp in each line prefix
 *
 * Since the produced string is always terminated, the maximum possible
 * return value is @r->text_buf_size - 1;
 *
 * Return: The length of the updated/prepared text, including the added
 * prefixes and the newline. The terminator is not counted. The dropped
 * line(s) are not counted.
 */
static size_t record_print_text(struct printk_record *r, bool syslog,
                                bool time)
{
        size_t text_len = r->info->text_len;
        size_t buf_size = r->text_buf_size;
        char *text = r->text_buf;
        char prefix[PRINTK_PREFIX_MAX];
        bool truncated = false;
        size_t prefix_len;
        size_t line_len;
        size_t len = 0;
        char *next;

        /*
         * If the message was truncated because the buffer was not large
         * enough, treat the available text as if it were the full text.
         */
        if (text_len > buf_size)
                text_len = buf_size;

        /* The same prefix is used for every line of the record. */
        prefix_len = info_print_prefix(r->info, syslog, time, prefix);

        /*
         * @text_len: bytes of unprocessed text
         * @line_len: bytes of current line _without_ newline
         * @text:     pointer to beginning of current line
         * @len:      number of bytes prepared in r->text_buf
         */
        for (;;) {
                next = memchr(text, '\n', text_len);
                if (next) {
                        line_len = next - text;
                } else {
                        /* Drop truncated line(s). */
                        if (truncated)
                                break;
                        line_len = text_len;
                }

                /*
                 * Truncate the text if there is not enough space to add the
                 * prefix and a trailing newline and a terminator.
                 */
                if (len + prefix_len + text_len + 1 + 1 > buf_size) {
                        /* Drop even the current line if no space. */
                        if (len + prefix_len + line_len + 1 + 1 > buf_size)
                                break;

                        text_len = buf_size - len - prefix_len - 1 - 1;
                        truncated = true;
                }

                /*
                 * Shift all remaining text to make room for the prefix.
                 * Source and destination may overlap, hence memmove().
                 */
                memmove(text + prefix_len, text, text_len);
                memcpy(text, prefix, prefix_len);

                /*
                 * Increment the prepared length to include the text and
                 * prefix that were just moved+copied. Also increment for the
                 * newline at the end of this line. If this is the last line,
                 * there is no newline, but it will be added immediately below.
                 */
                len += prefix_len + line_len + 1;
                if (text_len == line_len) {
                        /*
                         * This is the last line. Add the trailing newline
                         * removed in vprintk_store().
                         */
                        text[prefix_len + line_len] = '\n';
                        break;
                }

                /*
                 * Advance beyond the added prefix and the related line with
                 * its newline.
                 */
                text += prefix_len + line_len + 1;

                /*
                 * The remaining text has only decreased by the line with its
                 * newline.
                 *
                 * Note that @text_len can become zero. It happens when @text
                 * ended with a newline (either due to truncation or the
                 * original string ending with "\n\n"). The loop is correctly
                 * repeated and (if not truncated) an empty line with a prefix
                 * will be prepared.
                 */
                text_len -= line_len + 1;
        }

        /*
         * If a buffer was provided, it will be terminated. Space for the
         * string terminator is guaranteed to be available. The terminator is
         * not counted in the return value.
         */
        if (buf_size > 0)
                r->text_buf[len] = 0;

        return len;
}

/*
 * Compute the size of a record's printed text without formatting the text.
 *
 * @info:       meta data of the record
 * @line_count: number of lines in the record text
 * @syslog:     whether the syslog prefix is part of each line prefix
 * @time:       whether the timestamp is part of each line prefix
 *
 * Return: text length plus one prefix per line plus the final newline.
 */
static size_t get_record_print_text_size(struct printk_info *info,
                                         unsigned int line_count,
                                         bool syslog, bool time)
{
        char prefix[PRINTK_PREFIX_MAX];
        size_t prefix_len = info_print_prefix(info, syslog, time, prefix);
        size_t total;

        /* Every line is preceded with a prefix. */
        total = prefix_len * line_count;

        /*
         * The intermediate newlines are already part of the text; only the
         * final trailing newline needs to be accounted for in addition.
         */
        total += info->text_len + 1;

        return total;
}

/*
 * Beginning with @start_seq, find the first record where it and all following
 * records up to (but not including) @max_seq fit into @size.
 *
 * @max_seq is simply an upper bound and does not need to exist. If the caller
 * does not require an upper bound, -1 can be used for @max_seq.
 *
 * @syslog and @time select which prefix parts are included when the printed
 * size of each record is computed.
 *
 * Return: the sequence number of the first fitting record.
 */
static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size,
                                  bool syslog, bool time)
{
        struct printk_info info;
        unsigned int line_count;
        size_t len = 0;
        u64 seq;

        /* Determine the size of the records up to @max_seq. */
        prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
                if (info.seq >= max_seq)
                        break;
                len += get_record_print_text_size(&info, line_count, syslog, time);
        }

        /*
         * Adjust the upper bound for the next loop to avoid subtracting
         * lengths that were never added.
         */
        if (seq < max_seq)
                max_seq = seq;

        /*
         * Move first record forward until length fits into the buffer. Ignore
         * newest messages that were not counted in the above cycle. Messages
         * might appear and get lost in the meantime. This is a best effort
         * that prevents an infinite loop that could occur with a retry.
         */
        prb_for_each_info(start_seq, prb, seq, &info, &line_count) {
                if (len <= size || info.seq >= max_seq)
                        break;
                /* Drop this record from the total and try the next one. */
                len -= get_record_print_text_size(&info, line_count, syslog, time);
        }

        return seq;
}

/*
 * Copy formatted records, starting at @syslog_seq, into the user buffer
 * @buf of @size bytes. Waits (interruptibly) until the record at
 * @syslog_seq can be read.
 *
 * The caller is responsible for making sure @size is greater than 0.
 *
 * Return: the number of bytes copied, -ENOMEM when the temporary text buffer
 * cannot be allocated, -EFAULT when the very first copy to user space fails,
 * or the non-zero result of wait_event_interruptible().
 */
static int syslog_print(char __user *buf, int size)
{
        struct printk_info info;
        struct printk_record r;
        char *text;
        int len = 0;
        u64 seq;

        text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);

        mutex_lock(&syslog_lock);

        /*
         * Wait for the @syslog_seq record to be available. @syslog_seq may
         * change while waiting.
         */
        do {
                seq = syslog_seq;

                /* Do not hold syslog_lock while sleeping. */
                mutex_unlock(&syslog_lock);
                /*
                 * Guarantee this task is visible on the waitqueue before
                 * checking the wake condition.
                 *
                 * The full memory barrier within set_current_state() of
                 * prepare_to_wait_event() pairs with the full memory barrier
                 * within wq_has_sleeper().
                 *
                 * This pairs with __wake_up_klogd:A.
                 */
                len = wait_event_interruptible(log_wait,
                                prb_read_valid(prb, seq, NULL)); /* LMM(syslog_print:A) */
                mutex_lock(&syslog_lock);

                /* The wait did not complete normally; pass its result on. */
                if (len)
                        goto out;
        } while (syslog_seq != seq);

        /*
         * Copy records that fit into the buffer. The above cycle makes sure
         * that the first record is always available.
         */
        do {
                size_t n;
                size_t skip;
                int err;

                if (!prb_read_valid(prb, syslog_seq, &r))
                        break;

                if (r.info->seq != syslog_seq) {
                        /* message is gone, move to next valid one */
                        syslog_seq = r.info->seq;
                        syslog_partial = 0;
                }

                /*
                 * To keep reading/counting partial line consistent,
                 * use printk_time value as of the beginning of a line.
                 */
                if (!syslog_partial)
                        syslog_time = printk_time;

                /* Bytes of this record already returned by an earlier read. */
                skip = syslog_partial;
                n = record_print_text(&r, true, syslog_time);
                if (n - syslog_partial <= size) {
                        /* message fits into buffer, move forward */
                        syslog_seq = r.info->seq + 1;
                        n -= syslog_partial;
                        syslog_partial = 0;
                } else if (!len){
                        /* partial read(), remember position */
                        n = size;
                        syslog_partial += n;
                } else
                        /* Something was already copied; do not split this record. */
                        n = 0;

                if (!n)
                        break;

                /* syslog_lock is dropped around the copy to user space. */
                mutex_unlock(&syslog_lock);
                err = copy_to_user(buf, text + skip, n);
                mutex_lock(&syslog_lock);

                if (err) {
                        /* Report the fault only if nothing has been copied yet. */
                        if (!len)
                                len = -EFAULT;
                        break;
                }

                len += n;
                size -= n;
                buf += n;
        } while (size);
out:
        mutex_unlock(&syslog_lock);
        kfree(text);
        return len;
}

/*
 * Copy formatted records to the user buffer @buf of @size bytes, starting
 * at the record chosen by find_first_fitting_seq() from the clear position
 * onwards.
 *
 * @clear: when true, clear_seq is updated afterwards (under syslog_lock).
 *
 * Return: the number of bytes copied, -ENOMEM when the temporary text buffer
 * cannot be allocated, or -EFAULT when a copy to user space fails.
 */
static int syslog_print_all(char __user *buf, int size, bool clear)
{
        struct printk_info info;
        struct printk_record r;
        char *text;
        int len = 0;
        u64 seq;
        bool time;

        text = kmalloc(PRINTK_MESSAGE_MAX, GFP_KERNEL);
        if (!text)
                return -ENOMEM;

        /* Use one consistent timestamp setting for the whole dump. */
        time = printk_time;
        /*
         * Find first record that fits, including all following records,
         * into the user-provided buffer for this dump.
         */
        seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1,
                                     size, true, time);

        prb_rec_init_rd(&r, &info, text, PRINTK_MESSAGE_MAX);

        prb_for_each_record(seq, prb, seq, &r) {
                int textlen;

                textlen = record_print_text(&r, true, time);

                /*
                 * This record no longer fits and is not copied.
                 * NOTE(review): @seq is stepped back before leaving the loop
                 * and is later stored in clear_seq when @clear is set -
                 * confirm this is the intended position.
                 */
                if (len + textlen > size) {
                        seq--;
                        break;
                }

                if (copy_to_user(buf + len, text, textlen))
                        len = -EFAULT;
                else
                        len += textlen;

                /* Stop on the first failed copy. */
                if (len < 0)
                        break;
        }

        if (clear) {
                mutex_lock(&syslog_lock);
                latched_seq_write(&clear_seq, seq);
                mutex_unlock(&syslog_lock);
        }

        kfree(text);
        return len;
}

/*
 * Set clear_seq to the sequence number following the last record, under
 * syslog_lock. No record is removed from the ringbuffer here.
 */
static void syslog_clear(void)
{
        mutex_lock(&syslog_lock);
        latched_seq_write(&clear_seq, prb_next_seq(prb));
        mutex_unlock(&syslog_lock);
}

/*
 * Backend of the syslog interface: perform the action selected by @type.
 *
 * @type:   one of the SYSLOG_ACTION_* values
 * @buf:    user buffer for the read actions
 * @len:    size of @buf for the read actions; the new console loglevel for
 *          SYSLOG_ACTION_CONSOLE_LEVEL
 * @source: origin of the request (e.g. SYSLOG_FROM_READER, SYSLOG_FROM_PROC)
 *
 * Return: the error from check_syslog_permissions() if access is denied;
 * otherwise a byte count for the read actions, a size or count for the
 * SYSLOG_ACTION_SIZE_* actions, 0 for the remaining supported actions, or a
 * negative error code (-EINVAL for invalid arguments or an unknown @type,
 * -EFAULT for an inaccessible buffer).
 */
int do_syslog(int type, char __user *buf, int len, int source)
{
        struct printk_info info;
        bool clear = false;
        /* LOGLEVEL_DEFAULT means that no console loglevel is saved. */
        static int saved_console_loglevel = LOGLEVEL_DEFAULT;
        int error;

        error = check_syslog_permissions(type, source);
        if (error)
                return error;

        /* From here on @error is 0 unless an action below changes it. */
        switch (type) {
        case SYSLOG_ACTION_CLOSE:        /* Close log */
                break;
        case SYSLOG_ACTION_OPEN:        /* Open log */
                break;
        case SYSLOG_ACTION_READ:        /* Read from log */
                if (!buf || len < 0)
                        return -EINVAL;
                if (!len)
                        return 0;
                if (!access_ok(buf, len))
                        return -EFAULT;
                error = syslog_print(buf, len);
                break;
        /* Read/clear last kernel messages */
        case SYSLOG_ACTION_READ_CLEAR:
                clear = true;
                fallthrough;
        /* Read last kernel messages */
        case SYSLOG_ACTION_READ_ALL:
                if (!buf || len < 0)
                        return -EINVAL;
                if (!len)
                        return 0;
                if (!access_ok(buf, len))
                        return -EFAULT;
                error = syslog_print_all(buf, len, clear);
                break;
        /* Clear ring buffer */
        case SYSLOG_ACTION_CLEAR:
                syslog_clear();
                break;
        /* Disable logging to console */
        case SYSLOG_ACTION_CONSOLE_OFF:
                /* Save the current level only once. */
                if (saved_console_loglevel == LOGLEVEL_DEFAULT)
                        saved_console_loglevel = console_loglevel;
                console_loglevel = minimum_console_loglevel;
                break;
        /* Enable logging to console */
        case SYSLOG_ACTION_CONSOLE_ON:
                /* Restore the saved level, if there is one. */
                if (saved_console_loglevel != LOGLEVEL_DEFAULT) {
                        console_loglevel = saved_console_loglevel;
                        saved_console_loglevel = LOGLEVEL_DEFAULT;
                }
                break;
        /* Set level of messages printed to console */
        case SYSLOG_ACTION_CONSOLE_LEVEL:
                /* @len carries the requested level for this action. */
                if (len < 1 || len > 8)
                        return -EINVAL;
                if (len < minimum_console_loglevel)
                        len = minimum_console_loglevel;
                console_loglevel = len;
                /* Implicitly re-enable logging to console */
                saved_console_loglevel = LOGLEVEL_DEFAULT;
                break;
        /* Number of chars in the log buffer */
        case SYSLOG_ACTION_SIZE_UNREAD:
                mutex_lock(&syslog_lock);
                if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) {
                        /* No unread messages. */
                        mutex_unlock(&syslog_lock);
                        return 0;
                }
                if (info.seq != syslog_seq) {
                        /* messages are gone, move to first one */
                        syslog_seq = info.seq;
                        syslog_partial = 0;
                }
                if (source == SYSLOG_FROM_PROC) {
                        /*
                         * Short-cut for poll(/"proc/kmsg") which simply checks
                         * for pending data, not the size; return the count of
                         * records, not the length.
                         */
                        error = prb_next_seq(prb) - syslog_seq;
                } else {
                        /*
                         * A partially read first record is counted with the
                         * timestamp setting it was started with.
                         */
                        bool time = syslog_partial ? syslog_time : printk_time;
                        unsigned int line_count;
                        u64 seq;

                        prb_for_each_info(syslog_seq, prb, seq, &info,
                                          &line_count) {
                                error += get_record_print_text_size(&info, line_count,
                                                                    true, time);
                                time = printk_time;
                        }
                        /* Exclude the bytes of the first record already read. */
                        error -= syslog_partial;
                }
                mutex_unlock(&syslog_lock);
                break;
        /* Size of the log buffer */
        case SYSLOG_ACTION_SIZE_BUFFER:
                error = log_buf_len;
                break;
        default:
                error = -EINVAL;
                break;
        }

        return error;
}

/*
 * syslog system call: forwards its arguments to do_syslog() with
 * SYSLOG_FROM_READER as the source and returns its result.
 */
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
        return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}

/*
 * Special console_lock variants that help to reduce the risk of soft-lockups.
 * They allow passing console_lock to another printk() call using a busy wait.
 */

#ifdef CONFIG_LOCKDEP
/* Lockdep annotation for the section in which busy waiting is allowed. */
static struct lockdep_map console_owner_dep_map = {
        .name = "console_owner"
};
#endif

/* Taken around updates of @console_owner. */
static DEFINE_RAW_SPINLOCK(console_owner_lock);
/* Task currently inside the section opened by console_lock_spinning_enable(). */
static struct task_struct *console_owner;
/* NOTE(review): presumably set while another context busy waits for the lock - confirm against its users. */
static bool console_waiter;

/**
 * console_lock_spinning_enable - open the window in which a busy waiter
 *        may take over console_lock
 *
 * From here until console_lock_spinning_disable_and_check() the
 * console_lock owner behaves like a spinlock holder: it must not sleep,
 * because another context may be spinning on it, and it must be prepared
 * to give the lock away when the section ends.
 */
void console_lock_spinning_enable(void)
{
        /*
         * No hand-over during panic(): the panic CPU wants to keep the lock
         * and non-panic CPUs abandon the flush anyway. The panic CPU should
         * also stay away from console_owner_lock because taking it might
         * cause a deadlock, so publishing the owner is skipped entirely.
         */
        if (!panic_in_progress()) {
                raw_spin_lock(&console_owner_lock);
                console_owner = current;
                raw_spin_unlock(&console_owner_lock);
        }

        /*
         * The lockdep annotation is kept even in panic. That is the easiest
         * way to prevent false lockdep reports without handling races in a
         * lockless way. A waiter may spin on us once console_owner is set.
         */
        spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}

/**
 * console_lock_spinning_disable_and_check - mark end of code where another
 *        thread was able to busy wait and check if there is a waiter
 * @cookie: cookie returned from console_srcu_read_lock()
 *
 * This is called at the end of the section where spinning is allowed.
 * It has two functions. First, it is a signal that it is no longer
 * safe to start busy waiting for the lock. Second, it checks if
 * there is a busy waiter and passes the lock rights to that waiter.
 *
 * Important: Callers lose both the console_lock and the SRCU read lock if
 *        there was a busy waiter. They must not touch items synchronized by
 *        console_lock or SRCU read lock in this case.
 *
 * Return: 1 if the lock rights were passed, 0 otherwise.
 */
int console_lock_spinning_disable_and_check(int cookie)
{
        int waiter;

        /*
         * Ignore spinning waiters during panic() because they might get stopped
         * or blocked at any time.
         *
         * It is safe because nobody is allowed to start spinning during panic
         * in the first place. If there has been a waiter then non panic CPUs
         * might stay spinning. They would get stopped anyway. The panic context
         * will never start spinning and an interrupted spin on panic CPU will
         * never continue.
         */
        if (panic_in_progress()) {
                /* Keep lockdep happy. */
                spin_release(&console_owner_dep_map, _THIS_IP_);
                return 0;
        }

        /*
         * Sample the waiter flag and clear the owner under the same lock
         * that console_trylock_spinning() holds while registering itself,
         * so no new waiter can register once the owner is cleared.
         */
        raw_spin_lock(&console_owner_lock);
        waiter = READ_ONCE(console_waiter);
        console_owner = NULL;
        raw_spin_unlock(&console_owner_lock);

        /* Nobody was spinning: the caller keeps console_lock. */
        if (!waiter) {
                spin_release(&console_owner_dep_map, _THIS_IP_);
                return 0;
        }

        /* The waiter is now free to continue */
        WRITE_ONCE(console_waiter, false);

        spin_release(&console_owner_dep_map, _THIS_IP_);

        /*
         * Preserve lockdep lock ordering. Release the SRCU read lock before
         * releasing the console_lock.
         */
        console_srcu_read_unlock(cookie);

        /*
         * Hand off console_lock to waiter. The waiter will perform
         * the up(). After this, the waiter is the console_lock owner.
         * Only the lockdep state is dropped here; the semaphore itself
         * is not released by this function.
         */
        mutex_release(&console_lock_dep_map, _THIS_IP_);
        return 1;
}

/**
 * console_trylock_spinning - try to get console_lock by busy waiting
 *
 * This allows to busy wait for the console_lock when the current
 * owner is running in specially marked sections. It means that
 * the current owner is running and cannot reschedule until it
 * is ready to lose the lock.
 *
 * Return: 1 if we got the lock, 0 otherwise
 */
static int console_trylock_spinning(void)
{
        struct task_struct *owner = NULL;
        bool waiter;
        bool spin = false;
        unsigned long flags;

        /* Fast path: the lock was free, no handover is needed. */
        if (console_trylock())
                return 1;

        /*
         * It's unsafe to spin once a panic has begun. If we are the
         * panic CPU, we may have already halted the owner of the
         * console_sem. If we are not the panic CPU, then we should
         * avoid taking console_sem, so the panic CPU has a better
         * chance of cleanly acquiring it later.
         */
        if (panic_in_progress())
                return 0;

        printk_safe_enter_irqsave(flags);

        /*
         * Register as the single waiter, but only if there is an owner
         * in a spinning-enabled section, no other waiter yet, and the
         * owner is not the current task.
         */
        raw_spin_lock(&console_owner_lock);
        owner = READ_ONCE(console_owner);
        waiter = READ_ONCE(console_waiter);
        if (!waiter && owner && owner != current) {
                WRITE_ONCE(console_waiter, true);
                spin = true;
        }
        raw_spin_unlock(&console_owner_lock);

        /*
         * If there is an active printk() writing to the
         * consoles, instead of having it write our data too,
         * see if we can offload that load from the active
         * printer, and do some printing ourselves.
         * Go into a spin only if there isn't already a waiter
         * spinning, and there is an active printer, and
         * that active printer isn't us (recursive printk?).
         */
        if (!spin) {
                printk_safe_exit_irqrestore(flags);
                return 0;
        }

        /* We spin waiting for the owner to release us */
        spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
        /* Owner will clear console_waiter on hand off */
        while (READ_ONCE(console_waiter))
                cpu_relax();
        spin_release(&console_owner_dep_map, _THIS_IP_);

        printk_safe_exit_irqrestore(flags);
        /*
         * The owner passed the console lock to us.
         * Since we did not spin on console lock, annotate
         * this as a trylock. Otherwise lockdep will
         * complain.
         */
        mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_);

        /*
         * Update @console_may_schedule for trylock because the previous
         * owner may have been schedulable.
         */
        console_may_schedule = 0;

        return 1;
}

/*
 * Recursion is tracked separately on each CPU. If NMIs are supported, an
 * additional NMI context per CPU is also separately tracked. Until per-CPU
 * data is available, a separate "early tracking" is performed.
 */
/* Per-CPU nesting counter for non-NMI context. */
static DEFINE_PER_CPU(u8, printk_count);
/* Shared counter used while printk_percpu_data_ready() is still false. */
static u8 printk_count_early;
#ifdef CONFIG_HAVE_NMI
/* Per-CPU nesting counter for NMI context. */
static DEFINE_PER_CPU(u8, printk_count_nmi);
/* NMI counterpart of @printk_count_early. */
static u8 printk_count_nmi_early;
#endif

/*
 * Recursion is limited to keep the output sane. printk() should not require
 * more than 1 level of recursion (allowing, for example, printk() to trigger
 * a WARN), but a higher value is used in case some printk-internal errors
 * exist, such as the ringbuffer validation checks failing.
 */
#define PRINTK_MAX_RECURSION 3

/*
 * Pick the recursion counter that belongs to the caller: the NMI counter
 * when running in NMI context (if NMIs are supported), the regular one
 * otherwise. In both cases the per-CPU instance is used once per-CPU data
 * is ready and the single "early" instance before that.
 */
static u8 *__printk_recursion_counter(void)
{
#ifdef CONFIG_HAVE_NMI
        if (in_nmi())
                return printk_percpu_data_ready() ?
                        this_cpu_ptr(&printk_count_nmi) :
                        &printk_count_nmi_early;
#endif
        return printk_percpu_data_ready() ?
                this_cpu_ptr(&printk_count) :
                &printk_count_early;
}

/*
 * Enter recursion tracking. Interrupts are disabled to simplify tracking.
 * The caller must check the boolean return value to see if the recursion is
 * allowed. On failure, interrupts are not disabled.
 *
 * @recursion_ptr must be a variable of type (u8 *) and is the same variable
 * that is passed to printk_exit_irqrestore(). It is set by this macro to the
 * counter returned by __printk_recursion_counter().
 *
 * Entry is refused when the selected counter already exceeds
 * PRINTK_MAX_RECURSION; otherwise the counter is incremented and the macro
 * evaluates to true with interrupts still disabled.
 */
#define printk_enter_irqsave(recursion_ptr, flags)        \
({                                                        \
        bool success = true;                                \
                                                        \
        typecheck(u8 *, recursion_ptr);                        \
        local_irq_save(flags);                                \
        (recursion_ptr) = __printk_recursion_counter();        \
        if (*(recursion_ptr) > PRINTK_MAX_RECURSION) {        \
                local_irq_restore(flags);                \
                success = false;                        \
        } else {                                        \
                (*(recursion_ptr))++;                        \
        }                                                \
        success;                                        \
})

/*
 * Exit recursion tracking, restoring interrupts.
 *
 * Decrements the counter that printk_enter_irqsave() incremented and then
 * restores the interrupt state saved in @flags. Must only be used after
 * printk_enter_irqsave() succeeded, with the same @recursion_ptr and @flags.
 */
#define printk_exit_irqrestore(recursion_ptr, flags)        \
        do {                                                \
                typecheck(u8 *, recursion_ptr);                \
                (*(recursion_ptr))--;                        \
                local_irq_restore(flags);                \
        } while (0)

/* Artificial delay, in milliseconds, applied by printk_delay(); 0 disables it. */
int printk_delay_msec __read_mostly;

/*
 * Slow down message emission: apply the boot delay for @level first and
 * then, if printk_delay_msec is set, busy wait that many milliseconds.
 * The NMI watchdog is touched after every millisecond of waiting.
 */
static inline void printk_delay(int level)
{
        int remaining;

        boot_delay_msec(level);

        if (likely(!printk_delay_msec))
                return;

        for (remaining = printk_delay_msec; remaining; remaining--) {
                mdelay(1);
                touch_nmi_watchdog();
        }
}

/*
 * Identify the context that is calling printk(): the PID of the current
 * task when in task context, otherwise the processor id with bit 31 set.
 */
static inline u32 printk_caller_id(void)
{
        if (in_task())
                return task_pid_nr(current);

        return 0x80000000 + smp_processor_id();
}

/**
 * printk_parse_prefix - Parse level and control flags.
 *
 * @text:     The terminated text message.
 * @level:    A pointer to the current level value, will be updated.
 * @flags:    A pointer to the current printk_info flags, will be updated.
 *
 * Consecutive two-byte prefixes at the start of @text are consumed until
 * printk_get_level() no longer recognizes one or the text ends.
 *
 * @level may be NULL if the caller is not interested in the parsed value.
 * Otherwise the variable pointed to by @level must be set to
 * LOGLEVEL_DEFAULT in order to be updated with the parsed value; only the
 * first parsed level is therefore stored.
 *
 * @flags may be NULL if the caller is not interested in the parsed value.
 * Otherwise the variable pointed to by @flags will be OR'd with the parsed
 * value.
 *
 * Return: The length of the parsed level and control flags.
 */
u16 printk_parse_prefix(const char *text, int *level,
                        enum printk_info_flags *flags)
{
        u16 consumed = 0;

        for (; *text; text += 2, consumed += 2) {
                const int code = printk_get_level(text);

                if (!code)
                        break;

                if (code >= '0' && code <= '7') {
                        /* Numeric log level; keep the first one seen. */
                        if (level && *level == LOGLEVEL_DEFAULT)
                                *level = code - '0';
                } else if (code == 'c' && flags) {
                        /* KERN_CONT */
                        *flags |= LOG_CONT;
                }
        }

        return consumed;
}

/*
 * Format a message into @text (at most @size bytes including the
 * terminator) and post-process it:
 *
 *  - a trailing newline is dropped from the length and recorded as
 *    LOG_NEWLINE in @flags,
 *  - for facility 0 the leading log level / control prefix is removed by
 *    shifting the remaining text to the start of the buffer,
 *  - the final text is passed to the console tracepoint.
 *
 * Returns the length of the resulting text.
 */
__printf(5, 0)
static u16 printk_sprint(char *text, u16 size, int facility,
                         enum printk_info_flags *flags, const char *fmt,
                         va_list args)
{
        u16 len = vscnprintf(text, size, fmt, args);

        /* Mark and strip a trailing newline. */
        if (len > 0 && text[len - 1] == '\n') {
                *flags |= LOG_NEWLINE;
                len--;
        }

        /* Strip log level and control flags. */
        if (!facility) {
                const u16 skip = printk_parse_prefix(text, NULL, NULL);

                if (skip > 0) {
                        len -= skip;
                        memmove(text, &text[skip], len);
                }
        }

        trace_console(text, len);

        return len;
}

/*
 * Format a message and store it as a record in the printk ringbuffer (prb).
 *
 * @facility: stored in the record; for facility 0 the log level and control
 *            prefix embedded in the message text is parsed and stripped.
 * @level:    log level; LOGLEVEL_DEFAULT is replaced by the parsed level or,
 *            failing that, by default_message_loglevel.
 * @dev_info: optional device information copied into the record; when
 *            present the record is always treated as newline-terminated.
 * @fmt/@args: printf-style message.
 *
 * A message carrying LOG_CONT is first tried as an extension of the last
 * record via prb_reserve_in_last(); if that fails it is stored as a new
 * record like any other message.
 *
 * Return: the number of text bytes stored (including any truncation
 * marker), or 0 if the recursion limit was hit or no space could be
 * reserved.
 */
__printf(4, 0)
int vprintk_store(int facility, int level,
                  const struct dev_printk_info *dev_info,
                  const char *fmt, va_list args)
{
        struct prb_reserved_entry e;
        enum printk_info_flags flags = 0;
        struct printk_record r;
        unsigned long irqflags;
        u16 trunc_msg_len = 0;
        char prefix_buf[8];
        u8 *recursion_ptr;
        u16 reserve_size;
        va_list args2;
        u32 caller_id;
        u16 text_len;
        int ret = 0;
        u64 ts_nsec;

        /* Too deeply nested: drop the message (interrupts stay untouched). */
        if (!printk_enter_irqsave(recursion_ptr, irqflags))
                return 0;

        /*
         * Since the duration of printk() can vary depending on the message
         * and state of the ringbuffer, grab the timestamp now so that it is
         * close to the call of printk(). This provides a more deterministic
         * timestamp with respect to the caller.
         */
        ts_nsec = local_clock();

        caller_id = printk_caller_id();

        /*
         * The sprintf needs to come first since the syslog prefix might be
         * passed in as a parameter. An extra byte must be reserved so that
         * later the vscnprintf() into the reserved buffer has room for the
         * terminating '\0', which is not counted by vsnprintf().
         *
         * Only the first bytes are actually written to @prefix_buf; the
         * return value of vsnprintf() is what provides the size to reserve.
         */
        va_copy(args2, args);
        reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1;
        va_end(args2);

        if (reserve_size > PRINTKRB_RECORD_MAX)
                reserve_size = PRINTKRB_RECORD_MAX;

        /* Extract log level or control flags. */
        if (facility == 0)
                printk_parse_prefix(&prefix_buf[0], &level, &flags);

        if (level == LOGLEVEL_DEFAULT)
                level = default_message_loglevel;

        if (dev_info)
                flags |= LOG_NEWLINE;

        if (is_printk_force_console())
                flags |= LOG_FORCE_CON;

        /* Continuation: try to append the text to the last record. */
        if (flags & LOG_CONT) {
                prb_rec_init_wr(&r, reserve_size);
                if (prb_reserve_in_last(&e, prb, &r, caller_id, PRINTKRB_RECORD_MAX)) {
                        /* Write behind the text that is already stored. */
                        text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size,
                                                 facility, &flags, fmt, args);
                        r.info->text_len += text_len;

                        if (flags & LOG_FORCE_CON)
                                r.info->flags |= LOG_FORCE_CON;

                        /* A newline ends the record; otherwise it stays extendable. */
                        if (flags & LOG_NEWLINE) {
                                r.info->flags |= LOG_NEWLINE;
                                prb_final_commit(&e);
                        } else {
                                prb_commit(&e);
                        }

                        ret = text_len;
                        goto out;
                }
        }

        /*
         * Explicitly initialize the record before every prb_reserve() call.
         * prb_reserve_in_last() and prb_reserve() purposely invalidate the
         * structure when they fail.
         */
        prb_rec_init_wr(&r, reserve_size);
        if (!prb_reserve(&e, prb, &r)) {
                /* truncate the message if it is too long for empty buffer */
                truncate_msg(&reserve_size, &trunc_msg_len);

                prb_rec_init_wr(&r, reserve_size + trunc_msg_len);
                if (!prb_reserve(&e, prb, &r))
                        goto out;
        }

        /* fill message */
        text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &flags, fmt, args);
        /* Append the truncation marker directly behind the formatted text. */
        if (trunc_msg_len)
                memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len);
        r.info->text_len = text_len + trunc_msg_len;
        r.info->facility = facility;
        r.info->level = level & 7;
        r.info->flags = flags & 0x1f;
        r.info->ts_nsec = ts_nsec;
        r.info->caller_id = caller_id;
        if (dev_info)
                memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info));

        /* A message without a trailing newline can be continued. */
        if (!(flags & LOG_NEWLINE))
                prb_commit(&e);
        else
                prb_final_commit(&e);

        ret = text_len + trunc_msg_len;
out:
        /* Every path past the successful enter must leave recursion tracking. */
        printk_exit_irqrestore(recursion_ptr, irqflags);
        return ret;
}

/*
 * One-way switch: from now on legacy consoles may print directly from the
 * printk() caller context on a panic CPU. After flipping the switch, an
 * immediate flush of the legacy consoles is attempted from this context,
 * provided the flush type allows direct legacy printing and the console
 * lock can be taken without waiting.
 */
void printk_legacy_allow_panic_sync(void)
{
        struct console_flush_type ft;

        legacy_allow_panic_sync = true;

        printk_get_console_flush_type(&ft);

        /* console_unlock() does the actual flushing. */
        if (ft.legacy_direct && console_trylock())
                console_unlock();
}

/*
 * When true, vprintk_emit() does not drop messages from non-panic CPUs
 * while another CPU is in panic().
 */
bool __read_mostly debug_non_panic_cpus;

#ifdef CONFIG_PRINTK_CALLER
/*
 * Handler for the "debug_non_panic_cpus" early parameter. The parameter
 * takes no value: its mere presence enables the option (@str is unused).
 */
static int __init debug_non_panic_cpus_setup(char *str)
{
        debug_non_panic_cpus = true;
        pr_info("allow messages from non-panic CPUs in panic()\n");

        return 0;
}
early_param("debug_non_panic_cpus", debug_non_panic_cpus_setup);
/* Also exposed as a bool module parameter with mode 0644. */
module_param(debug_non_panic_cpus, bool, 0644);
MODULE_PARM_DESC(debug_non_panic_cpus,
                 "allow messages from non-panic CPUs in panic()");
#endif

/*
 * Store a message in the ringbuffer and trigger console output.
 *
 * The message is stored via vprintk_store(). Afterwards the consoles are
 * flushed or woken according to the flush type reported by
 * printk_get_console_flush_type(): nbcon consoles are flushed atomically
 * or their kthreads are woken, legacy consoles are either printed directly
 * from this context or deferred.
 *
 * @level == LOGLEVEL_SCHED marks a call from the scheduler: the level is
 * reset to LOGLEVEL_DEFAULT and direct legacy printing is converted into
 * deferred printing.
 *
 * Return: the value returned by vprintk_store(), or 0 if the message was
 * suppressed before being stored.
 */
asmlinkage int vprintk_emit(int facility, int level,
                            const struct dev_printk_info *dev_info,
                            const char *fmt, va_list args)
{
        struct console_flush_type ft;
        int printed_len;

        /* Suppress unimportant messages after panic happens */
        if (unlikely(suppress_printk))
                return 0;

        /*
         * The messages on the panic CPU are the most important. If
         * non-panic CPUs are generating any messages, they will be
         * silently dropped.
         */
        if (other_cpu_in_panic() &&
            !debug_non_panic_cpus &&
            !panic_triggering_all_cpu_backtrace)
                return 0;

        printk_get_console_flush_type(&ft);

        /* If called from the scheduler, we can not call up(). */
        if (level == LOGLEVEL_SCHED) {
                level = LOGLEVEL_DEFAULT;
                ft.legacy_offload |= ft.legacy_direct;
                ft.legacy_direct = false;
        }

        printk_delay(level);

        printed_len = vprintk_store(facility, level, dev_info, fmt, args);

        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        if (ft.nbcon_offload)
                nbcon_kthreads_wake();

        if (ft.legacy_direct) {
                /*
                 * The caller may be holding system-critical or
                 * timing-sensitive locks. Disable preemption during
                 * printing of all remaining records to all consoles so that
                 * this context can return as soon as possible. Hopefully
                 * another printk() caller will take over the printing.
                 */
                preempt_disable();
                /*
                 * Try to acquire and then immediately release the console
                 * semaphore. The release will print out buffers. With the
                 * spinning variant, this context tries to take over the
                 * printing from another printing context.
                 */
                if (console_trylock_spinning())
                        console_unlock();
                preempt_enable();
        }

        /* Deferred legacy printing; otherwise only wake klogd. */
        if (ft.legacy_offload)
                defer_console_output();
        else
                wake_up_klogd();

        return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);

/*
 * Default vprintk backend: emit the message through vprintk_emit() with
 * facility 0, LOGLEVEL_DEFAULT and no device information. Returns the
 * value returned by vprintk_emit().
 */
int vprintk_default(const char *fmt, va_list args)
{
        return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
}
EXPORT_SYMBOL_GPL(vprintk_default);

/*
 * Variadic front end: collects the arguments into a va_list and forwards
 * them to vprintk(). Returns the value returned by vprintk().
 */
asmlinkage __visible int _printk(const char *fmt, ...)
{
        va_list args;
        int r;

        va_start(args, fmt);
        r = vprintk(fmt, args);
        va_end(args);

        return r;
}
EXPORT_SYMBOL(_printk);

static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);

#else /* CONFIG_PRINTK */

/*
 * Stubs for builds without CONFIG_PRINTK: there is no ringbuffer, so the
 * record accessors report "nothing available" and flushing trivially
 * succeeds.
 */
#define printk_time                false

#define prb_read_valid(rb, seq, r)        false
#define prb_first_valid_seq(rb)                0
#define prb_next_seq(rb)                0

static u64 syslog_seq;

/* Nothing can be pending, so report success immediately. */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }

#endif /* CONFIG_PRINTK */

#ifdef CONFIG_EARLY_PRINTK
/* Console used by early_printk(); NULL until one has been set. */
struct console *early_console;

/*
 * Format a message into a 512 byte on-stack buffer and hand it straight
 * to the write() callback of @early_console. Longer messages are cut to
 * fit the buffer. Does nothing when no early console is set.
 */
asmlinkage __visible void early_printk(const char *fmt, ...)
{
        char msg[512];
        va_list args;
        int len;

        if (early_console == NULL)
                return;

        va_start(args, fmt);
        len = vscnprintf(msg, sizeof(msg), fmt, args);
        va_end(args);

        early_console->write(early_console, msg, len);
}
#endif

/*
 * Remember that console @c was requested by the user on the command line.
 *
 * The flag is only ever set, never cleared: calling this with
 * @user_specified == false leaves the entry untouched, so a console that
 * is added a second time by SPCR or the device tree keeps its mark.
 */
static void set_user_specified(struct console_cmdline *c, bool user_specified)
{
        if (user_specified) {
                c->user_specified = true;
                /* At least one console defined by the user on the command line. */
                console_set_on_cmdline = 1;
        }
}

/*
 * Record a console in the console_cmdline[] table.
 *
 * The console is identified by @name + @idx and/or by @devname. If a
 * matching entry already exists, it only becomes the preferred console
 * (unless braille options are given) and may get marked as user specified.
 * Otherwise the first unused entry is filled in.
 *
 * Return: 0 on success, -EINVAL for a missing identifier or a negative
 * index combined with a name, -E2BIG when the table is full.
 */
static int __add_preferred_console(const char *name, const short idx,
                                   const char *devname, char *options,
                                   char *brl_options, bool user_specified)
{
        struct console_cmdline *entry = console_cmdline;
        int slot = 0;

        /*
         * At least one identifier is required. A signed short index is
         * used by struct console so that device drivers can indicate a not
         * yet assigned index or port, but a negative index is not valid
         * when the console name and index are defined on the command line.
         */
        if ((!name && !devname) || (name && idx < 0))
                return -EINVAL;

        /*
         * Walk the entries in use to see whether this tty is already
         * registered; the walk ends on the first free slot, if any.
         */
        while (slot < MAX_CMDLINECONSOLES &&
               (entry->name[0] || entry->devname[0])) {
                bool already_known;

                already_known = (name && !strcmp(entry->name, name) &&
                                 entry->index == idx) ||
                                (devname && !strcmp(entry->devname, devname));
                if (already_known) {
                        if (!brl_options)
                                preferred_console = slot;
                        set_user_specified(entry, user_specified);
                        return 0;
                }

                slot++;
                entry++;
        }

        /* Every entry is in use and none of them matched. */
        if (slot == MAX_CMDLINECONSOLES)
                return -E2BIG;

        if (!brl_options)
                preferred_console = slot;
        if (name)
                strscpy(entry->name, name);
        if (devname)
                strscpy(entry->devname, devname);
        entry->options = options;
        set_user_specified(entry, user_specified);
        braille_set_options(entry, brl_options);

        entry->index = idx;
        return 0;
}

/*
 * Handler for the "console_msg_format=" boot parameter. The values
 * "syslog" and "default" select the corresponding message format; any
 * other value leaves the format unchanged. Always returns 1.
 */
static int __init console_msg_format_setup(char *str)
{
        if (strcmp(str, "syslog") == 0)
                console_msg_format = MSG_FORMAT_SYSLOG;
        else if (strcmp(str, "default") == 0)
                console_msg_format = MSG_FORMAT_DEFAULT;

        return 1;
}
__setup("console_msg_format=", console_msg_format_setup);

/*
 * Set up a console.  Called via do_early_param() in init/main.c
 * for each "console=" parameter in the boot command line.
 *
 * @str is split into a console name (or a DEVNAME:0.0 style device name),
 * an index and an options string, which are then registered as a user
 * specified preferred console. @str is modified in place: the comma in
 * front of the options is replaced by a terminator.
 *
 * Always returns 1; the result of __add_preferred_console() is ignored.
 */
static int __init console_setup(char *str)
{
        /* NOTE(review): the extra 4 bytes presumably cover the "ttyS" prefix added below - confirm. */
        static_assert(sizeof(console_cmdline[0].devname) >= sizeof(console_cmdline[0].name) + 4);
        char buf[sizeof(console_cmdline[0].devname)];
        char *brl_options = NULL;
        char *ttyname = NULL;
        char *devname = NULL;
        char *options;
        char *s;
        int idx;

        /*
         * console="" or console=null have been suggested as a way to
         * disable console output. Use ttynull that has been created
         * for exactly this purpose.
         */
        if (str[0] == 0 || strcmp(str, "null") == 0) {
                __add_preferred_console("ttynull", 0, NULL, NULL, NULL, true);
                return 1;
        }

        if (_braille_console_setup(&str, &brl_options))
                return 1;

        /*
         * For a DEVNAME:0.0 style console the character device is unknown early.
         * Exactly one of @devname and @ttyname ends up pointing at @buf.
         */
        if (strchr(str, ':'))
                devname = buf;
        else
                ttyname = buf;

        /*
         * Decode str into name, index, options.
         * A tty name starting with a digit is shorthand for "ttyS<digits>".
         * Note that @buf still contains the ",options" part at this point.
         */
        if (ttyname && isdigit(str[0]))
                scnprintf(buf, sizeof(buf), "ttyS%s", str);
        else
                strscpy(buf, str);

        /* Terminate @str at the first comma; @options points behind it. */
        options = strchr(str, ',');
        if (options)
                *(options++) = 0;

#ifdef __sparc__
        if (!strcmp(str, "ttya"))
                strscpy(buf, "ttyS0");
        if (!strcmp(str, "ttyb"))
                strscpy(buf, "ttyS1");
#endif

        /*
         * Find the end of the name in @buf: the first digit for tty names,
         * or the comma that starts the options.
         */
        for (s = buf; *s; s++)
                if ((ttyname && isdigit(*s)) || *s == ',')
                        break;

        /* @idx will get defined when devname matches. */
        if (devname)
                idx = -1;
        else
                idx = simple_strtoul(s, NULL, 10);

        /* Cut the index (and options) off so that @buf holds only the name. */
        *s = 0;

        __add_preferred_console(ttyname, idx, devname, options, brl_options, true);
        return 1;
}
__setup("console=", console_setup);

/**
 * add_preferred_console - add a device to the list of preferred consoles.
 * @name: device name
 * @idx: device index
 * @options: options for this console
 *
 * The last preferred console added will be used for kernel messages
 * and stdin/out/err for init.  Normally this is used by console_setup
 * above to handle user-supplied console arguments; however it can also
 * be used by arch-specific code either to override the user or more
 * commonly to provide a default console (ie from PROM variables) when
 * the user has not supplied one.
 *
 * Consoles added through this function are not marked as user specified
 * and carry neither a device name nor braille options.
 *
 * Return: 0 on success, -EINVAL if @name is NULL or @idx is negative,
 * -E2BIG if the table of preferred consoles is full.
 */
int add_preferred_console(const char *name, const short idx, char *options)
{
        return __add_preferred_console(name, idx, NULL, options, NULL, false);
}

/**
 * match_devname_and_update_preferred_console - Update a preferred console
 *        when matching devname is found.
 * @devname: DEVNAME:0.0 style device name
 * @name: Name of the corresponding console driver, e.g. "ttyS"
 * @idx: Console index, e.g. port number.
 *
 * Looks for a console that was preferred via the console=DEVNAME:0.0
 * command line option and whose device name equals @devname. When one is
 * found, the still missing console driver name and index are filled in so
 * that a later register_console() call can find (match) and enable the
 * device.
 *
 * This is meant for driver subsystems that initialize devices with
 * already known DEVNAME:0.0 style names and that can predict which
 * console driver name and index the device will get associated with.
 *
 * Return: 0 on success, -EINVAL for a missing or empty @devname or @name
 * or a negative @idx, -ENOENT when no preferred console uses @devname.
 */
int match_devname_and_update_preferred_console(const char *devname,
                                               const char *name,
                                               const short idx)
{
        struct console_cmdline *entry;
        int slot;

        if (!devname || !strlen(devname))
                return -EINVAL;
        if (!name || !strlen(name))
                return -EINVAL;
        if (idx < 0)
                return -EINVAL;

        for (slot = 0, entry = console_cmdline;
             slot < MAX_CMDLINECONSOLES;
             slot++, entry++) {
                /* The first unused entry ends the list. */
                if (!entry->name[0] && !entry->devname[0])
                        break;

                if (strcmp(devname, entry->devname) != 0)
                        continue;

                pr_info("associate the preferred console \"%s\" with \"%s%d\"\n",
                        devname, name, idx);
                strscpy(entry->name, name);
                entry->index = idx;
                return 0;
        }

        return -ENOENT;
}
EXPORT_SYMBOL_GPL(match_devname_and_update_preferred_console);

/*
 * When false, console_suspend_all() and console_resume_all() return
 * without doing anything. Enabled by default.
 */
bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);

/*
 * Handler for the "no_console_suspend" boot parameter: its presence turns
 * console suspending off (@str is unused). Always returns 1.
 */
static int __init console_suspend_disable(char *str)
{
        console_suspend_enabled = false;
        return 1;
}
__setup("no_console_suspend", console_suspend_disable);
/* The same switch, exposed as the "console_suspend" module parameter. */
module_param_named(console_suspend, console_suspend_enabled,
                bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
        " and hibernate operations");

/* When set, console_verbose() leaves the console loglevel alone. */
static bool printk_console_no_auto_verbose;

/*
 * Raise the console loglevel to CONSOLE_LOGLEVEL_MOTORMOUTH. Nothing is
 * changed when the current console loglevel is 0 or when the automatic
 * raise has been disabled via console_no_auto_verbose.
 */
void console_verbose(void)
{
        if (!console_loglevel || printk_console_no_auto_verbose)
                return;

        console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH;
}
EXPORT_SYMBOL_GPL(console_verbose);

module_param_named(console_no_auto_verbose, printk_console_no_auto_verbose, bool, 0644);
MODULE_PARM_DESC(console_no_auto_verbose, "Disable console loglevel raise to highest on oops/panic/etc");

/**
 * console_suspend_all - suspend the console subsystem
 *
 * This disables printk() while we go into suspend states.
 *
 * Pending messages are flushed first (waiting up to 1000 ms), then every
 * registered console is flagged with CON_SUSPENDED. Does nothing when
 * console suspending is disabled via @console_suspend_enabled.
 */
void console_suspend_all(void)
{
        struct console *con;

        if (!console_suspend_enabled)
                return;
        pr_info("Suspending console(s) (use no_console_suspend to debug)\n");
        /* Give the announcement above a chance to reach the consoles. */
        pr_flush(1000, true);

        console_list_lock();
        for_each_console(con)
                console_srcu_write_flags(con, con->flags | CON_SUSPENDED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All printing
         * contexts must be able to see that they are suspended so that it
         * is guaranteed that all printing has stopped when this function
         * completes.
         */
        synchronize_srcu(&console_srcu);
}

/**
 * console_resume_all - resume the console subsystem
 *
 * Counterpart of console_suspend_all(): clears CON_SUSPENDED on every
 * registered console, kicks the offloaded printing contexts and then
 * flushes pending messages (waiting up to 1000 ms). Does nothing when
 * console suspending is disabled via @console_suspend_enabled.
 */
void console_resume_all(void)
{
        struct console_flush_type ft;
        struct console *con;

        if (!console_suspend_enabled)
                return;

        console_list_lock();
        for_each_console(con)
                console_srcu_write_flags(con, con->flags & ~CON_SUSPENDED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All printing
         * contexts must be able to see they are no longer suspended so
         * that they are guaranteed to wake up and resume printing.
         */
        synchronize_srcu(&console_srcu);

        /* Wake whichever offloaded printers the current flush type uses. */
        printk_get_console_flush_type(&ft);
        if (ft.nbcon_offload)
                nbcon_kthreads_wake();
        if (ft.legacy_offload)
                defer_console_output();

        pr_flush(1000, true);
}

/**
 * console_cpu_notify - print deferred console messages after CPU hotplug
 * @cpu: unused
 *
 * Messages printed from a CPU that is not online yet only reach the
 * console if there are CON_ANYTIME consoles. This function is called
 * when a new CPU comes online (or fails to come up) or goes offline, and
 * flushes whatever is pending. Nothing is flushed while
 * cpuhp_tasks_frozen is set.
 *
 * Return: always 0.
 */
static int console_cpu_notify(unsigned int cpu)
{
        struct console_flush_type ft;

        if (cpuhp_tasks_frozen)
                return 0;

        printk_get_console_flush_type(&ft);

        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        /* Releasing the console lock flushes the legacy consoles. */
        if (ft.legacy_direct && console_trylock())
                console_unlock();

        return 0;
}

/**
 * console_lock - block the console subsystem from printing
 *
 * Acquires a lock which guarantees that no consoles will
 * be in or enter their write() callback.
 *
 * While another CPU is in panic, the caller does not take the lock at
 * all and instead sleeps in one second steps.
 *
 * Can sleep, returns nothing.
 */
void console_lock(void)
{
        might_sleep();

        /* On panic, the console_lock must be left to the panic cpu. */
        while (other_cpu_in_panic())
                msleep(1000);

        down_console_sem();
        console_locked = 1;
        /* Unlike console_trylock(), this variant marks the owner as schedulable. */
        console_may_schedule = 1;
}
EXPORT_SYMBOL(console_lock);

/**
 * console_trylock - try to block the console subsystem from printing
 *
 * Try to acquire a lock which guarantees that no consoles will
 * be in or enter their write() callback.
 *
 * returns 1 on success, and 0 on failure to acquire the lock.
 */
int console_trylock(void)
{
        /*
         * Two reasons to fail, checked in this order:
         *   - another CPU is in panic: the console_lock must be left to
         *     the panic CPU, so the semaphore is not even attempted
         *   - the semaphore could not be taken without blocking
         */
        if (other_cpu_in_panic() || down_trylock_console_sem())
                return 0;

        /* Record ownership; a trylock owner never reschedules. */
        console_locked = 1;
        console_may_schedule = 0;

        return 1;
}
EXPORT_SYMBOL(console_trylock);

/*
 * Report the @console_locked flag: non-zero after console_lock() or a
 * successful console_trylock(), zero again once __console_unlock() has run.
 * The flag is read without taking any lock.
 */
int is_console_locked(void)
{
        return console_locked;
}
EXPORT_SYMBOL(is_console_locked);

/*
 * Release the console_lock without flushing any records.
 *
 * @console_locked is cleared before the semaphore is released, i.e. while
 * this context still owns the lock.
 */
static void __console_unlock(void)
{
        console_locked = 0;
        up_console_sem();
}

#ifdef CONFIG_PRINTK

/*
 * Prepend the message in @pmsg->pbufs->outbuf. This is achieved by shifting
 * the existing message over and inserting the scratchbuf message.
 *
 * @pmsg is the original printk message.
 * @fmt is the printf format of the message which will prepend the existing one.
 *
 * If there is not enough space in @pmsg->pbufs->outbuf, the existing
 * message text will be sufficiently truncated.
 *
 * If @pmsg->pbufs->outbuf is modified, @pmsg->outbuf_len is updated.
 */
__printf(2, 3)
static void console_prepend_message(struct printk_message *pmsg, const char *fmt, ...)
{
        struct printk_buffers *bufs = pmsg->pbufs;
        char *prefix = bufs->scratchbuf;
        char *text = bufs->outbuf;
        const size_t prefix_sz = sizeof(bufs->scratchbuf);
        const size_t outbuf_sz = sizeof(bufs->outbuf);
        size_t len;
        va_list ap;

        /* Format the text to prepend into the scratch buffer first. */
        va_start(ap, fmt);
        len = vscnprintf(prefix, prefix_sz, fmt, ap);
        va_end(ap);

        /*
         * The output buffer must be able to hold the prepended text plus at
         * least the prefix of the original message. Otherwise warn (once)
         * and leave the message untouched. This is only a theoretical
         * concern for somebody using a minimalist buffer.
         */
        if (WARN_ON_ONCE(len + PRINTK_PREFIX_MAX >= outbuf_sz))
                return;

        /* Shorten the existing message if both do not fit; keep it terminated. */
        if (pmsg->outbuf_len + len >= outbuf_sz) {
                pmsg->outbuf_len = outbuf_sz - (len + 1);
                text[pmsg->outbuf_len] = 0;
        }

        /* Shift the message (terminator included), then insert in front. */
        memmove(text + len, text, pmsg->outbuf_len + 1);
        memcpy(text, prefix, len);

        pmsg->outbuf_len += len;
}

/*
 * Prepend the message in @pmsg->pbufs->outbuf with a "dropped message".
 * @pmsg->outbuf_len is updated appropriately.
 *
 * @pmsg is the printk message to prepend.
 *
 * @dropped is the dropped count to report in the dropped message.
 */
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped)
{
        /* The notice is a complete line of its own (it ends with '\n'). */
        console_prepend_message(pmsg, "** %lu printk messages dropped **\n", dropped);
}

/*
 * Prepend the message in @pmsg->pbufs->outbuf with a "replay message".
 * @pmsg->outbuf_len is updated appropriately.
 *
 * @pmsg is the printk message to prepend.
 */
void console_prepend_replay(struct printk_message *pmsg)
{
        /* The notice is a complete line of its own (it ends with '\n'). */
        console_prepend_message(pmsg, "** replaying previous printk message **\n");
}

/*
 * Fetch a record from the ringbuffer and format it for console output. If
 * the requested record is no longer available, a later record is used.
 *
 * @pmsg receives the formatted result. @pmsg->pbufs must point to a
 * struct printk_buffers.
 *
 * @seq is the sequence number of the record to read and format. If it is
 * not available, the next valid record is read.
 *
 * @is_extended specifies if the message should be formatted for extended
 * console output.
 *
 * @may_suppress specifies if records may be skipped based on loglevel.
 *
 * Returns false if no record is available. Otherwise true and all fields
 * of @pmsg are valid. (See the documentation of struct printk_message
 * for information about the @pmsg fields.)
 */
bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
                             bool is_extended, bool may_suppress)
{
        struct printk_buffers *bufs = pmsg->pbufs;
        char *scratch = bufs->scratchbuf;
        char *out = bufs->outbuf;
        const size_t scratch_sz = sizeof(bufs->scratchbuf);
        const size_t out_sz = sizeof(bufs->outbuf);
        struct printk_info info;
        struct printk_record rec;
        bool forced;
        size_t n;

        /*
         * Extended formatting needs source and destination to be distinct,
         * so the ringbuffer text is read into the scratch buffer. Normal
         * formatting happens in-place, so the text is read straight into
         * the output buffer.
         */
        if (is_extended)
                prb_rec_init_rd(&rec, &info, scratch, scratch_sz);
        else
                prb_rec_init_rd(&rec, &info, out, out_sz);

        if (!prb_read_valid(prb, seq, &rec))
                return false;

        pmsg->seq = rec.info->seq;
        /* Distance between the requested and the actually read record. */
        pmsg->dropped = rec.info->seq - seq;
        forced = rec.info->flags & LOG_FORCE_CON;

        /*
         * Records that are not forced onto consoles and that have a level
         * above the console loglevel are skipped: report them as valid but
         * with a formatted length of 0.
         */
        if (!forced && may_suppress && suppress_message_printing(rec.info->level)) {
                pmsg->outbuf_len = 0;
                return true;
        }

        if (!is_extended) {
                n = record_print_text(&rec, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
        } else {
                n = info_print_ext_header(out, out_sz, rec.info);
                n += msg_print_ext_body(out + n, out_sz - n,
                                        rec.text_buf, rec.info->text_len, &rec.info->dev_info);
        }

        pmsg->outbuf_len = n;
        return true;
}

/*
 * Legacy console printing from printk() caller context does not respect
 * raw_spinlock/spinlock nesting. For !PREEMPT_RT the lockdep warning is a
 * false positive. For PREEMPT_RT the false positive condition does not
 * occur.
 *
 * This map is used to temporarily establish LD_WAIT_SLEEP context for the
 * console write() callback when legacy printing to avoid false positive
 * lockdep complaints, thus allowing lockdep to continue to function for
 * real issues.
 */
#ifndef CONFIG_PREEMPT_RT
static DEFINE_WAIT_OVERRIDE_MAP(printk_legacy_map, LD_WAIT_SLEEP);

/* Begin the LD_WAIT_SLEEP override that wraps a legacy write() callback. */
static inline void printk_legacy_allow_spinlock_enter(void)
{
        lock_map_acquire_try(&printk_legacy_map);
}

/* End the override started by printk_legacy_allow_spinlock_enter(). */
static inline void printk_legacy_allow_spinlock_exit(void)
{
        lock_map_release(&printk_legacy_map);
}
#else
/* PREEMPT_RT: no override map is defined, both helpers are no-ops. */
static inline void printk_legacy_allow_spinlock_enter(void) { }
static inline void printk_legacy_allow_spinlock_exit(void) { }
#endif /* !CONFIG_PREEMPT_RT */

/*
 * Used as the printk buffers for non-panic, serialized console printing.
 * This is for legacy (!CON_NBCON) as well as all boot (CON_BOOT) consoles.
 * Its usage requires the console_lock held.
 *
 * console_emit_next_record() formats each record into these buffers and
 * hands the output buffer to the console write() callback.
 */
struct printk_buffers printk_shared_pbufs;

/*
 * Print one record for the given console. The record printed is whatever
 * record is the next available record for the given console.
 *
 * @handover will be set to true if a printk waiter has taken over the
 * console_lock, in which case the caller is no longer holding both the
 * console_lock and the SRCU read lock. Otherwise it is set to false.
 *
 * @cookie is the cookie from the SRCU read lock.
 *
 * Returns false if the given console has no next record to print, otherwise
 * true.
 *
 * Requires the console_lock and the SRCU read lock.
 */
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
        bool is_extended = console_srcu_read_flags(con) & CON_EXTENDED;
        char *outbuf = &printk_shared_pbufs.outbuf[0];
        struct printk_message pmsg = {
                .pbufs = &printk_shared_pbufs,
        };
        unsigned long flags;

        /* Only the handover-capable path below can set this to true. */
        *handover = false;

        /* Loglevel suppression is always allowed here (@may_suppress = true). */
        if (!printk_get_next_message(&pmsg, con->seq, is_extended, true))
                return false;

        /* Accumulate the gap between the requested and the read record. */
        con->dropped += pmsg.dropped;

        /* Skip messages of formatted length 0. */
        if (pmsg.outbuf_len == 0) {
                con->seq = pmsg.seq + 1;
                goto skip;
        }

        /*
         * Report the accumulated dropped count in front of the message and
         * reset it. This is not done for extended consoles.
         */
        if (con->dropped && !is_extended) {
                console_prepend_dropped(&pmsg, con->dropped);
                con->dropped = 0;
        }

        /* Write everything out to the hardware. */

        if (force_legacy_kthread() && !panic_in_progress()) {
                /*
                 * With forced threading this function is in a task context
                 * (either legacy kthread or get_init_console_seq()). There
                 * is no need for concern about printk reentrance, handovers,
                 * or lockdep complaints.
                 */

                con->write(con, outbuf, pmsg.outbuf_len);
                con->seq = pmsg.seq + 1;
        } else {
                /*
                 * While actively printing out messages, if another printk()
                 * were to occur on another CPU, it may wait for this one to
                 * finish. This task can not be preempted if there is a
                 * waiter waiting to take over.
                 *
                 * Interrupts are disabled because the hand over to a waiter
                 * must not be interrupted until the hand over is completed
                 * (@console_waiter is cleared).
                 */
                printk_safe_enter_irqsave(flags);
                console_lock_spinning_enable();

                /* Do not trace print latency. */
                stop_critical_timings();

                /* The lockdep override covers only the write() callback. */
                printk_legacy_allow_spinlock_enter();
                con->write(con, outbuf, pmsg.outbuf_len);
                printk_legacy_allow_spinlock_exit();

                start_critical_timings();

                /* The record counts as printed before any handover is checked. */
                con->seq = pmsg.seq + 1;

                *handover = console_lock_spinning_disable_and_check(cookie);
                printk_safe_exit_irqrestore(flags);
        }
skip:
        /* A record was consumed, whether it was written or skipped. */
        return true;
}

#else

/*
 * !CONFIG_PRINTK stub: there is never a record to print, so no progress is
 * reported and no handover can happen.
 */
static bool console_emit_next_record(struct console *con, bool *handover, int cookie)
{
        *handover = false;
        return false;
}

/* !CONFIG_PRINTK stub: nothing to check. */
static inline void printk_kthreads_check_locked(void) { }

#endif /* CONFIG_PRINTK */

/*
 * Print out all remaining records to all consoles.
 *
 * @do_cond_resched is set by the caller. It can be true only in schedulable
 * context.
 *
 * @next_seq is set to the sequence number after the last available record.
 * The value is valid only when this function returns true. It means that all
 * usable consoles are completely flushed.
 *
 * @handover will be set to true if a printk waiter has taken over the
 * console_lock, in which case the caller is no longer holding the
 * console_lock. Otherwise it is set to false.
 *
 * Returns true when there was at least one usable console and all messages
 * were flushed to all usable consoles. A returned false informs the caller
 * that everything was not flushed (either there were no usable consoles or
 * another context has taken over printing or it is a panic situation and this
 * is not the panic CPU). Regardless the reason, the caller should assume it
 * is not useful to immediately try again.
 *
 * Requires the console_lock.
 */
static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handover)
{
        struct console_flush_type ft;
        bool any_usable = false;
        struct console *con;
        bool any_progress;
        int cookie;

        *next_seq = 0;
        *handover = false;

        /*
         * Each pass emits at most one record per console. Passes repeat
         * until a full pass makes no progress on any console.
         */
        do {
                any_progress = false;

                /* Re-evaluated on every pass. */
                printk_get_console_flush_type(&ft);

                cookie = console_srcu_read_lock();
                for_each_console_srcu(con) {
                        short flags = console_srcu_read_flags(con);
                        u64 printk_seq;
                        bool progress;

                        /*
                         * console_flush_all() is only responsible for nbcon
                         * consoles when the nbcon consoles cannot print via
                         * their atomic or threaded flushing.
                         */
                        if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
                                continue;

                        if (!console_is_usable(con, flags, !do_cond_resched))
                                continue;
                        any_usable = true;

                        /* Emit one record; then read back the console's position. */
                        if (flags & CON_NBCON) {
                                progress = nbcon_legacy_emit_next_record(con, handover, cookie,
                                                                         !do_cond_resched);
                                printk_seq = nbcon_seq_read(con);
                        } else {
                                progress = console_emit_next_record(con, handover, cookie);
                                printk_seq = con->seq;
                        }

                        /*
                         * If a handover has occurred, the SRCU read lock
                         * is already released.
                         */
                        if (*handover)
                                return false;

                        /* Track the next of the highest seq flushed. */
                        if (printk_seq > *next_seq)
                                *next_seq = printk_seq;

                        if (!progress)
                                continue;
                        any_progress = true;

                        /* Allow panic_cpu to take over the consoles safely. */
                        if (other_cpu_in_panic())
                                goto abandon;

                        if (do_cond_resched)
                                cond_resched();
                }
                console_srcu_read_unlock(cookie);
        } while (any_progress);

        return any_usable;

abandon:
        /* Left the list walk early: the SRCU read lock is still held here. */
        console_srcu_read_unlock(cookie);
        return false;
}

/*
 * Flush pending records to the consoles and release the console_lock.
 *
 * After unlocking, the ringbuffer is re-checked. If a newer record exists
 * and the console_lock can be re-taken with console_trylock(), the
 * flush-and-unlock cycle is repeated.
 *
 * Requires the console_lock. On return this context no longer holds it
 * (it was either released here or handed over to a printk waiter).
 */
static void __console_flush_and_unlock(void)
{
        bool do_cond_resched;
        bool handover;
        bool flushed;
        u64 next_seq;

        /*
         * Console drivers are called with interrupts disabled, so
         * @console_may_schedule should be cleared before; however, we may
         * end up dumping a lot of lines, for example, if called from
         * console registration path, and should invoke cond_resched()
         * between lines if allowable.  Not doing so can cause a very long
         * scheduling stall on a slow console leading to RCU stall and
         * softlockup warnings which exacerbate the issue with more
         * messages practically incapacitating the system. Therefore, create
         * a local to use for the printing loop.
         */
        do_cond_resched = console_may_schedule;

        do {
                /*
                 * Cleared on every iteration: a successful console_trylock()
                 * in the loop condition also leaves it at 0.
                 */
                console_may_schedule = 0;

                flushed = console_flush_all(do_cond_resched, &next_seq, &handover);
                /* After a handover the lock already belongs to the waiter. */
                if (!handover)
                        __console_unlock();

                /*
                 * Abort if there was a failure to flush all messages to all
                 * usable consoles. Either it is not possible to flush (in
                 * which case it would be an infinite loop of retrying) or
                 * another context has taken over printing.
                 */
                if (!flushed)
                        break;

                /*
                 * Some context may have added new records after
                 * console_flush_all() but before unlocking the console.
                 * Re-check if there is a new record to flush. If the trylock
                 * fails, another context is already handling the printing.
                 */
        } while (prb_read_valid(prb, next_seq, NULL) && console_trylock());
}

/**
 * console_unlock - unblock the legacy console subsystem from printing
 *
 * Releases the console_lock which the caller holds to block printing of
 * the legacy console subsystem.
 *
 * While the console_lock was held, console output may have been buffered
 * by printk(). If this is the case, console_unlock() emits the output on
 * legacy consoles prior to releasing the lock.
 *
 * console_unlock(); may be called from any context.
 */
void console_unlock(void)
{
        struct console_flush_type ft;

        printk_get_console_flush_type(&ft);
        if (ft.legacy_direct)
                __console_flush_and_unlock();
        else
                __console_unlock();
}
EXPORT_SYMBOL(console_unlock);

/**
 * console_conditional_schedule - yield the CPU if required
 *
 * If the console code is currently allowed to sleep, and
 * if this CPU should yield the CPU to another task, do
 * so here.
 *
 * Must be called within console_lock();.
 */
void __sched console_conditional_schedule(void)
{
        /* Only a console_lock owner that is allowed to sleep may yield. */
        if (!console_may_schedule)
                return;

        cond_resched();
}
EXPORT_SYMBOL(console_conditional_schedule);

/*
 * Invoke the unblank() callback of every enabled, non-suspended console
 * that implements one. The callbacks run with the console_lock held.
 *
 * When @oops_in_progress is set the lock is only tried (and never from
 * NMI); if it cannot be taken, nothing is unblanked.
 */
void console_unblank(void)
{
        bool found_unblank = false;
        struct console *c;
        int cookie;

        /*
         * First check if there are any consoles implementing the unblank()
         * callback. If not, there is no reason to continue and take the
         * console lock, which in particular can be dangerous if
         * @oops_in_progress is set.
         */
        cookie = console_srcu_read_lock();
        for_each_console_srcu(c) {
                short flags = console_srcu_read_flags(c);

                if (flags & CON_SUSPENDED)
                        continue;

                if ((flags & CON_ENABLED) && c->unblank) {
                        found_unblank = true;
                        break;
                }
        }
        console_srcu_read_unlock(cookie);
        if (!found_unblank)
                return;

        /*
         * Stop console printing because the unblank() callback may
         * assume the console is not within its write() callback.
         *
         * If @oops_in_progress is set, this may be an atomic context.
         * In that case, attempt a trylock as best-effort.
         */
        if (oops_in_progress) {
                /* Semaphores are not NMI-safe. */
                if (in_nmi())
                        return;

                /*
                 * Attempting to trylock the console lock can deadlock
                 * if another CPU was stopped while modifying the
                 * semaphore. "Hope and pray" that this is not the
                 * current situation.
                 */
                if (down_trylock_console_sem() != 0)
                        return;
        } else
                console_lock();

        /*
         * Both acquisition paths end up here. The raw semaphore trylock did
         * not set the bookkeeping, and console_lock() set
         * @console_may_schedule to 1, which is overridden to 0 here.
         */
        console_locked = 1;
        console_may_schedule = 0;

        /* Second walk, now with the console_lock held: do the unblanking. */
        cookie = console_srcu_read_lock();
        for_each_console_srcu(c) {
                short flags = console_srcu_read_flags(c);

                if (flags & CON_SUSPENDED)
                        continue;

                if ((flags & CON_ENABLED) && c->unblank)
                        c->unblank();
        }
        console_srcu_read_unlock(cookie);

        console_unlock();

        /* Waiting for the flush is skipped when @oops_in_progress is set. */
        if (!oops_in_progress)
                pr_flush(1000, true);
}

/*
 * Rewind all consoles to the oldest available record.
 *
 * IMPORTANT: The function is safe only when called under
 *            console_lock(). It is not enforced because
 *            it is used as a best effort in panic().
 */
static void __console_rewind_all(void)
{
        const u64 oldest = prb_first_valid_seq(prb);
        struct console *con;
        int cookie;

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (!(console_srcu_read_flags(con) & CON_NBCON)) {
                        /*
                         * A plain store to @seq is safe only under
                         * console_lock(). On panic, legacy consoles are
                         * handled as best effort only.
                         */
                        con->seq = oldest;
                        continue;
                }

                /* nbcon consoles are repositioned via their own helper. */
                nbcon_seq_force(con, oldest);
        }
        console_srcu_read_unlock(cookie);
}

/**
 * console_flush_on_panic - flush console content on panic
 * @mode: flush all messages in buffer or just the pending ones
 *
 * Immediately output all pending messages no matter what.
 */
void console_flush_on_panic(enum con_flush_mode mode)
{
        struct console_flush_type ft;
        bool handover;
        u64 next_seq;

        /*
         * Ignore the console lock and flush out the messages. Attempting a
         * trylock would not be useful because:
         *
         *   - if it is contended, it must be ignored anyway
         *   - console_lock() and console_trylock() block and fail
         *     respectively in panic for non-panic CPUs
         *   - semaphores are not NMI-safe
         */

        /*
         * If another context is holding the console lock,
         * @console_may_schedule might be set. Clear it so that
         * this context does not call cond_resched() while flushing.
         */
        console_may_schedule = 0;

        if (mode == CONSOLE_REPLAY_ALL)
                __console_rewind_all();

        printk_get_console_flush_type(&ft);
        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        /* Flush legacy consoles once allowed, even when dangerous. */
        if (legacy_allow_panic_sync)
                console_flush_all(false, &next_seq, &handover);
}

/*
 * Return the console tty driver structure and its associated index
 */
struct tty_driver *console_device(int *index)
{
        struct tty_driver *found = NULL;
        struct console *con;
        int cookie;

        /*
         * The console_lock serializes the device() callback with other
         * console operations. For example, fg_console is modified under
         * console_lock when switching vt.
         */
        console_lock();

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                if (con->device) {
                        /* The first console that yields a driver wins. */
                        found = con->device(con, index);
                        if (found)
                                break;
                }
        }
        console_srcu_read_unlock(cookie);

        console_unlock();

        return found;
}

/*
 * Prevent further output on the passed console device so that (for example)
 * serial drivers can suspend console output before suspending a port, and can
 * re-enable output afterwards.
 */
void console_suspend(struct console *console)
{
        /* Try to flush this console (1000 passed as timeout) before disabling it. */
        __pr_flush(console, 1000, true);
        /* Flag updates are done under the console list lock. */
        console_list_lock();
        console_srcu_write_flags(console, console->flags & ~CON_ENABLED);
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. All contexts must
         * be able to see that this console is disabled so that (for example)
         * the caller can suspend the port without risk of another context
         * using the port.
         */
        synchronize_srcu(&console_srcu);
}
EXPORT_SYMBOL(console_suspend);

/*
 * Re-enable output on the passed console device: set CON_ENABLED again,
 * kick the responsible printing context and flush the console.
 */
void console_resume(struct console *console)
{
        struct console_flush_type ft;
        bool is_nbcon;

        console_list_lock();
        console_srcu_write_flags(console, console->flags | CON_ENABLED);
        /* Sample the console type while the list lock is still held. */
        is_nbcon = console->flags & CON_NBCON;
        console_list_unlock();

        /*
         * Ensure that all SRCU list walks have completed. The related
         * printing context must be able to see it is enabled so that
         * it is guaranteed to wake up and resume printing.
         */
        synchronize_srcu(&console_srcu);

        /* Wake the printer that handles this console, if printing is offloaded. */
        printk_get_console_flush_type(&ft);
        if (is_nbcon && ft.nbcon_offload)
                nbcon_kthread_wake(console);
        else if (ft.legacy_offload)
                defer_console_output();

        __pr_flush(console, 1000, true);
}
EXPORT_SYMBOL(console_resume);

#ifdef CONFIG_PRINTK
/* Forward declaration: used by printk_kthreads_check_locked() below. */
static int unregister_console_locked(struct console *console);

/* True when system boot is far enough to create printer threads. */
static bool printk_kthreads_ready __ro_after_init;

/*
 * The legacy printer thread, or NULL if none is running. Set by
 * legacy_kthread_create() and cleared by printk_kthreads_check_locked().
 */
static struct task_struct *printk_legacy_kthread;

static bool legacy_kthread_should_wakeup(void)
{
        struct console_flush_type ft;
        struct console *con;
        bool ret = false;
        int cookie;

        if (kthread_should_stop())
                return true;

        printk_get_console_flush_type(&ft);

        cookie = console_srcu_read_lock();
        for_each_console_srcu(con) {
                short flags = console_srcu_read_flags(con);
                u64 printk_seq;

                /*
                 * The legacy printer thread is only responsible for nbcon
                 * consoles when the nbcon consoles cannot print via their
                 * atomic or threaded flushing.
                 */
                if ((flags & CON_NBCON) && (ft.nbcon_atomic || ft.nbcon_offload))
                        continue;

                if (!console_is_usable(con, flags, false))
                        continue;

                if (flags & CON_NBCON) {
                        printk_seq = nbcon_seq_read(con);
                } else {
                        /*
                         * It is safe to read @seq because only this
                         * thread context updates @seq.
                         */
                        printk_seq = con->seq;
                }

                if (prb_read_valid(prb, printk_seq, NULL)) {
                        ret = true;
                        break;
                }
        }
        console_srcu_read_unlock(cookie);

        return ret;
}

/*
 * Main loop of the legacy printer thread: sleep until
 * legacy_kthread_should_wakeup() reports work (or a stop request), then
 * flush the consoles under the console_lock.
 */
static int legacy_kthread_func(void *unused)
{
        while (true) {
                wait_event_interruptible(legacy_wait, legacy_kthread_should_wakeup());

                /* Leave the loop as soon as the thread is asked to stop. */
                if (kthread_should_stop())
                        break;

                /* The flush helper releases the lock taken here. */
                console_lock();
                __console_flush_and_unlock();
        }

        return 0;
}

/*
 * Start the legacy printer thread and remember it in
 * @printk_legacy_kthread. Returns true on success, false (after a warning
 * and an error message) if the thread could not be started.
 *
 * Must be called under console_list_lock().
 */
static bool legacy_kthread_create(void)
{
        struct task_struct *kt;

        lockdep_assert_console_list_lock_held();

        kt = kthread_run(legacy_kthread_func, NULL, "pr/legacy");
        if (!WARN_ON(IS_ERR(kt))) {
                printk_legacy_kthread = kt;

                /*
                 * Console printing threads should get scheduled shortly
                 * after a printk call and with generous runtime budgets.
                 */
                sched_set_normal(kt, -20);

                return true;
        }

        pr_err("failed to start legacy printing thread\n");
        return false;
}

/**
 * printk_kthreads_shutdown - shutdown all threaded printers
 *
 * On system shutdown all threaded printers are stopped. This allows printk
 * to transition back to atomic printing, thus providing a robust mechanism
 * for the final shutdown/reboot messages to be output.
 */
static void printk_kthreads_shutdown(void)
{
        struct console *con;

        console_list_lock();

        /* Nothing to stop if the printer threads are not running. */
        if (!printk_kthreads_running)
                goto unlock;

        printk_kthreads_running = false;

        for_each_console(con) {
                if (!(con->flags & CON_NBCON))
                        continue;

                nbcon_kthread_stop(con);
        }

        /*
         * A thread may have been stopped in the middle of printing a
         * backlog. Flush whatever records were left over.
         */
        nbcon_atomic_flush_pending();

unlock:
        console_list_unlock();
}

/*
 * Syscore hooks for printk: only the shutdown hook is set. The structure
 * is registered by printk_set_kthreads_ready().
 */
static struct syscore_ops printk_syscore_ops = {
        .shutdown = printk_kthreads_shutdown,
};

/*
 * If appropriate, start nbcon kthreads and set @printk_kthreads_running.
 * If any kthreads fail to start, those consoles are unregistered.
 *
 * Must be called under console_list_lock().
 */
static void printk_kthreads_check_locked(void)
{
        struct hlist_node *tmp;
        struct console *con;

        lockdep_assert_console_list_lock_held();

        /* Too early in boot: printk_set_kthreads_ready() has not run yet. */
        if (!printk_kthreads_ready)
                return;

        /*
         * Part 1: the legacy printer thread. It is created only when
         * force_legacy_kthread() asks for it and a legacy or boot console
         * exists. It is stopped once no such console is left.
         */
        if (have_legacy_console || have_boot_console) {
                if (!printk_legacy_kthread &&
                    force_legacy_kthread() &&
                    !legacy_kthread_create()) {
                        /*
                         * All legacy consoles must be unregistered. If there
                         * are any nbcon consoles, they will set up their own
                         * kthread.
                         */
                        hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                                if (con->flags & CON_NBCON)
                                        continue;

                                unregister_console_locked(con);
                        }
                }
        } else if (printk_legacy_kthread) {
                kthread_stop(printk_legacy_kthread);
                printk_legacy_kthread = NULL;
        }

        /*
         * Part 2: the per-console nbcon printer threads.
         *
         * Printer threads cannot be started as long as any boot console is
         * registered because there is no way to synchronize the hardware
         * registers between boot console code and regular console code.
         * It can only be known that there will be no new boot consoles when
         * an nbcon console is registered.
         */
        if (have_boot_console || !have_nbcon_console) {
                /* Clear flag in case all nbcon consoles unregistered. */
                printk_kthreads_running = false;
                return;
        }

        /* Already started by an earlier call. */
        if (printk_kthreads_running)
                return;

        /* The _safe iterator is needed: the loop body may unregister @con. */
        hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                if (!(con->flags & CON_NBCON))
                        continue;

                if (!nbcon_kthread_create(con))
                        unregister_console_locked(con);
        }

        printk_kthreads_running = true;
}

/*
 * Early initcall: register the printk syscore ops, mark printer threads as
 * creatable and run the kthread check for the consoles registered so far.
 * Always returns 0.
 */
static int __init printk_set_kthreads_ready(void)
{
        register_syscore_ops(&printk_syscore_ops);

        /* printk_kthreads_check_locked() requires the console list lock. */
        console_list_lock();
        printk_kthreads_ready = true;
        printk_kthreads_check_locked();
        console_list_unlock();

        return 0;
}
early_initcall(printk_set_kthreads_ready);
#endif /* CONFIG_PRINTK */

/* Non-zero once the "keep_bootcon" early parameter has been seen. */
static int __read_mostly keep_bootcon;

/*
 * Handler for the "keep_bootcon" early parameter. @str is not used; the
 * mere presence of the parameter sets @keep_bootcon. Always returns 0.
 */
static int __init keep_bootcon_setup(char *str)
{
        keep_bootcon = 1;
        pr_info("debug: skip boot console de-registration.\n");

        return 0;
}

early_param("keep_bootcon", keep_bootcon_setup);

/*
 * Run the setup() callback of @newcon (if it has one) with @options while
 * holding the console_lock. Returns the callback's result, or 0 when the
 * console has no setup() callback.
 */
static int console_call_setup(struct console *newcon, char *options)
{
        int ret = 0;

        if (newcon->setup) {
                /* The console_lock synchronizes with a possible boot console. */
                console_lock();
                ret = newcon->setup(newcon, options);
                console_unlock();
        }

        return ret;
}

/*
 * This is called by register_console() to try to match the newly
 * registered console with any of the ones selected by either the
 * command line or add_preferred_console() and to set it up and
 * enable it.
 *
 * @newcon:         The console being registered.
 * @user_specified: Only console_cmdline[] entries whose user_specified
 *                  flag equals this value are considered.
 *
 * Care needs to be taken with consoles that are statically enabled,
 * such as netconsole.
 *
 * Returns 0 when @newcon matched an entry and was enabled, when it was
 * taken over by _braille_register_console(), or when it was already
 * enabled and is accepted without a match. Returns the error of the
 * setup() callback when that fails, and -ENOENT when nothing matched.
 */
static int try_enable_preferred_console(struct console *newcon,
                                        bool user_specified)
{
        struct console_cmdline *c;
        bool slot_user_specified;
        int i, err;

        for (i = 0, c = console_cmdline;
             i < MAX_CMDLINECONSOLES && (c->name[0] || c->devname[0]);
             i++, c++) {
                /* Console not yet initialized? */
                if (!c->name[0])
                        continue;
                if (c->user_specified != user_specified)
                        continue;
                if (!newcon->match ||
                    newcon->match(newcon, c->name, c->index, c->options) != 0) {
                        /* default matching */
                        BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name));
                        if (strcmp(c->name, newcon->name) != 0)
                                continue;
                        if (newcon->index >= 0 &&
                            newcon->index != c->index)
                                continue;
                        if (newcon->index < 0)
                                newcon->index = c->index;

                        /* Braille consoles are handled separately. */
                        if (_braille_register_console(newcon, c))
                                return 0;

                        err = console_call_setup(newcon, c->options);
                        if (err)
                                return err;
                }
                newcon->flags |= CON_ENABLED;
                if (i == preferred_console)
                        newcon->flags |= CON_CONSDEV;
                return 0;
        }

        /*
         * Some consoles, such as pstore and netconsole, can be enabled even
         * without matching. Accept the pre-enabled consoles only when match()
         * and setup() had a chance to be called.
         *
         * The loop above ends either at the first unused slot or, when every
         * slot is in use, with @c pointing one element past the end of
         * console_cmdline[]. The latter must not be dereferenced. Treat a
         * full table like an unused slot, whose user_specified flag is
         * expected to be false (NOTE(review): assumes unused slots are
         * zero-initialized; confirm against the console_cmdline definition).
         */
        if (newcon->flags & CON_ENABLED) {
                slot_user_specified = false;
                if (i < MAX_CMDLINECONSOLES)
                        slot_user_specified = c->user_specified;

                if (slot_user_specified == user_specified)
                        return 0;
        }

        return -ENOENT;
}

/*
 * Try to enable @newcon unconditionally, i.e. without consulting the
 * preferred console table.
 *
 * A negative index is normalized to 0 first. If the setup() callback
 * fails, the console is left untouched otherwise. On success the console
 * is marked CON_ENABLED, and additionally CON_CONSDEV when it provides a
 * device() callback.
 */
static void try_enable_default_console(struct console *newcon)
{
        if (newcon->index < 0)
                newcon->index = 0;

        if (console_call_setup(newcon, NULL))
                return;

        if (newcon->device)
                newcon->flags |= CON_ENABLED | CON_CONSDEV;
        else
                newcon->flags |= CON_ENABLED;
}

/*
 * Return the starting sequence number for a newly registered console.
 *
 * @newcon:             The console being registered.
 * @bootcon_registered: True if at least one boot console is registered.
 *
 * Consoles with CON_PRINTBUFFER or CON_BOOT start at @syslog_seq. All
 * other consoles start with the next record added to the ringbuffer,
 * unless boot consoles are about to be unregistered, in which case the
 * consoles are flushed first so that no records get lost in the handover.
 */
static u64 get_init_console_seq(struct console *newcon, bool bootcon_registered)
{
        const short bootcon_mask = CON_BOOT | CON_ENABLED;
        struct console *con;
        bool handover;
        u64 init_seq;

        if (newcon->flags & (CON_PRINTBUFFER | CON_BOOT)) {
                /* Get a consistent copy of @syslog_seq. */
                mutex_lock(&syslog_lock);
                init_seq = syslog_seq;
                mutex_unlock(&syslog_lock);

                return init_seq;
        }

        /* Begin with next message added to ringbuffer. */
        init_seq = prb_next_seq(prb);

        /*
         * If any enabled boot consoles are due to be unregistered shortly,
         * some may not be caught up and may be the same device as @newcon.
         * Since it is not known which boot console is the same device,
         * flush all consoles and, if necessary, start with the message of
         * the enabled boot console that is the furthest behind.
         *
         * Nothing to do when no boot console will go away.
         */
        if (!bootcon_registered || keep_bootcon)
                return init_seq;

        /*
         * Hold the console_lock to stop console printing and guarantee
         * safe access to console->seq.
         */
        console_lock();

        /*
         * Flush all consoles and set the console to start at the next
         * unprinted sequence number.
         */
        if (console_flush_all(true, &init_seq, &handover))
                goto out_unlock;

        /*
         * Flushing failed. Just choose the lowest sequence of the enabled
         * boot consoles.
         *
         * If there was a handover, this context no longer holds the
         * console_lock, so take it again.
         */
        if (handover)
                console_lock();

        init_seq = prb_next_seq(prb);
        for_each_console(con) {
                u64 seq;

                /* Only enabled boot consoles are of interest. */
                if ((con->flags & bootcon_mask) != bootcon_mask)
                        continue;

                seq = (con->flags & CON_NBCON) ? nbcon_seq_read(con) : con->seq;

                if (seq < init_seq)
                        init_seq = seq;
        }

out_unlock:
        console_unlock();

        return init_seq;
}

/*
 * Resolve the first node of console_list to its struct console, i.e. the
 * console at the head of the list. Callers in this file either check
 * hlist_empty(&console_list) beforehand or know a console is registered.
 */
#define console_first()                                \
        hlist_entry(console_list.first, struct console, node)

/* Defined further below; needed by register_console(). */
static int unregister_console_locked(struct console *console);

/*
 * The console driver calls this routine during kernel initialization
 * to register the console printing procedure with printk() and to
 * print any messages that were printed by the kernel before the
 * console driver was initialized.
 *
 * This can happen pretty early during the boot process (because of
 * early_printk) - sometimes before setup_arch() completes - be careful
 * of what kernel features are used - they may not be initialised yet.
 *
 * There are two types of consoles - bootconsoles (early_printk) and
 * "real" consoles (everything which is not a bootconsole) which are
 * handled differently.
 *  - Any number of bootconsoles can be registered at any time.
 *  - As soon as a "real" console is registered, all bootconsoles
 *    will be unregistered automatically.
 *  - Once a "real" console is registered, any attempt to register a
 *    bootconsole will be rejected.
 *
 * @newcon: The console to register.
 *
 * Nothing is returned. The console is silently left unregistered when it
 * is already in the list (a warning is emitted), when it is a boot console
 * arriving after a real console, when the nbcon buffers cannot be
 * allocated, when it is not enabled by any preference (or its setup
 * fails), or when it is a Braille console.
 */
void register_console(struct console *newcon)
{
        /* The driver lock is only taken for nbcon consoles with write_atomic(). */
        bool use_device_lock = (newcon->flags & CON_NBCON) && newcon->write_atomic;
        bool bootcon_registered = false;
        bool realcon_registered = false;
        struct console *con;
        unsigned long flags;
        u64 init_seq;
        int err;

        console_list_lock();

        /*
         * Walk the current list: refuse double registration and remember
         * which kinds of consoles (boot/real) are already present.
         */
        for_each_console(con) {
                if (WARN(con == newcon, "console '%s%d' already registered\n",
                                         con->name, con->index)) {
                        goto unlock;
                }

                if (con->flags & CON_BOOT)
                        bootcon_registered = true;
                else
                        realcon_registered = true;
        }

        /* Do not register boot consoles when there already is a real one. */
        if ((newcon->flags & CON_BOOT) && realcon_registered) {
                pr_info("Too late to register bootconsole %s%d\n",
                        newcon->name, newcon->index);
                goto unlock;
        }

        if (newcon->flags & CON_NBCON) {
                /*
                 * Ensure the nbcon console buffers can be allocated
                 * before modifying any global data.
                 */
                if (!nbcon_alloc(newcon))
                        goto unlock;
        }

        /*
         * See if we want to enable this console driver by default.
         *
         * Nope when a console is preferred by the command line, device
         * tree, or SPCR.
         *
         * The first real console with tty binding (driver) wins. More
         * consoles might get enabled before the right one is found.
         *
         * Note that a console with tty binding will have CON_CONSDEV
         * flag set and will be first in the list.
         */
        if (preferred_console < 0) {
                if (hlist_empty(&console_list) || !console_first()->device ||
                    console_first()->flags & CON_BOOT) {
                        try_enable_default_console(newcon);
                }
        }

        /* See if this console matches one we selected on the command line */
        err = try_enable_preferred_console(newcon, true);

        /* If not, try to match against the platform default(s) */
        if (err == -ENOENT)
                err = try_enable_preferred_console(newcon, false);

        /* printk() messages are not printed to the Braille console. */
        if (err || newcon->flags & CON_BRL) {
                /* Undo the nbcon_alloc() above before bailing out. */
                if (newcon->flags & CON_NBCON)
                        nbcon_free(newcon);
                goto unlock;
        }

        /*
         * If we have a bootconsole, and are switching to a real console,
         * don't print everything out again, since when the boot console, and
         * the real console are the same physical device, it's annoying to
         * see the beginning boot messages twice
         */
        if (bootcon_registered &&
            ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
                newcon->flags &= ~CON_PRINTBUFFER;
        }

        /* Reset @dropped and determine the first record this console prints. */
        newcon->dropped = 0;
        init_seq = get_init_console_seq(newcon, bootcon_registered);

        /* Record the console type globally and store its start sequence. */
        if (newcon->flags & CON_NBCON) {
                have_nbcon_console = true;
                nbcon_seq_force(newcon, init_seq);
        } else {
                have_legacy_console = true;
                newcon->seq = init_seq;
        }

        if (newcon->flags & CON_BOOT)
                have_boot_console = true;

        /*
         * If another context is actively using the hardware of this new
         * console, it will not be aware of the nbcon synchronization. This
         * is a risk that two contexts could access the hardware
         * simultaneously if this new console is used for atomic printing
         * and the other context is still using the hardware.
         *
         * Use the driver synchronization to ensure that the hardware is not
         * in use while this new console transitions to being registered.
         */
        if (use_device_lock)
                newcon->device_lock(newcon, &flags);

        /*
         * Put this console in the list - keep the
         * preferred driver at the head of the list.
         */
        if (hlist_empty(&console_list)) {
                /* Ensure CON_CONSDEV is always set for the head. */
                newcon->flags |= CON_CONSDEV;
                hlist_add_head_rcu(&newcon->node, &console_list);

        } else if (newcon->flags & CON_CONSDEV) {
                /* Only the new head can have CON_CONSDEV set. */
                console_srcu_write_flags(console_first(), console_first()->flags & ~CON_CONSDEV);
                hlist_add_head_rcu(&newcon->node, &console_list);

        } else {
                /* Not preferred: insert right behind the current head. */
                hlist_add_behind_rcu(&newcon->node, console_list.first);
        }

        /*
         * No need to synchronize SRCU here! The caller does not rely
         * on all contexts being able to see the new console before
         * register_console() completes.
         */

        /* This new console is now registered. */
        if (use_device_lock)
                newcon->device_unlock(newcon, flags);

        console_sysfs_notify();

        /*
         * By unregistering the bootconsoles after we enable the real console
         * we get the "console xxx enabled" message on all the consoles -
         * boot consoles, real consoles, etc - this is to ensure that end
         * users know there might be something in the kernel's log buffer that
         * went to the bootconsole (that they do not see on the real console)
         */
        con_printk(KERN_INFO, newcon, "enabled\n");
        if (bootcon_registered &&
            ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
            !keep_bootcon) {
                struct hlist_node *tmp;

                /* The _safe iterator is needed because entries get removed. */
                hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                        if (con->flags & CON_BOOT)
                                unregister_console_locked(con);
                }
        }

        /* Changed console list, may require printer threads to start/stop. */
        printk_kthreads_check_locked();
unlock:
        console_list_unlock();
}
EXPORT_SYMBOL(register_console);

/*
 * Remove @console from the console list and release what was set up for it
 * at registration time.
 *
 * Must be called under console_list_lock().
 *
 * Returns:
 *  - a negative value if _braille_unregister_console() fails,
 *  - 0 if _braille_unregister_console() handled the console (returned > 0),
 *  - -ENODEV if @console is not registered (CON_ENABLED is cleared anyway),
 *  - otherwise the result of the console's exit() callback if it has one,
 *    else 0.
 */
static int unregister_console_locked(struct console *console)
{
        /* The driver lock is only taken for nbcon consoles with write_atomic(). */
        bool use_device_lock = (console->flags & CON_NBCON) && console->write_atomic;
        bool found_legacy_con = false;
        bool found_nbcon_con = false;
        bool found_boot_con = false;
        unsigned long flags;
        struct console *c;
        int res;

        lockdep_assert_console_list_lock_held();

        /* Note: this is logged before it is known whether @console is registered. */
        con_printk(KERN_INFO, console, "disabled\n");

        res = _braille_unregister_console(console);
        if (res < 0)
                return res;
        if (res > 0)
                return 0;

        /*
         * A registered and usable console gets up to 1000 ms (restarted on
         * progress) to print its pending records before it is removed.
         */
        if (!console_is_registered_locked(console))
                res = -ENODEV;
        else if (console_is_usable(console, console->flags, true))
                __pr_flush(console, 1000, true);

        /* Disable it unconditionally */
        console_srcu_write_flags(console, console->flags & ~CON_ENABLED);

        if (res < 0)
                return res;

        /*
         * Use the driver synchronization to ensure that the hardware is not
         * in use while this console transitions to being unregistered.
         */
        if (use_device_lock)
                console->device_lock(console, &flags);

        hlist_del_init_rcu(&console->node);

        if (use_device_lock)
                console->device_unlock(console, flags);

        /*
         * <HISTORICAL>
         * If this isn't the last console and it has CON_CONSDEV set, we
         * need to set it on the next preferred console.
         * </HISTORICAL>
         *
         * The above makes no sense as there is no guarantee that the next
         * console has any device attached. Oh well....
         */
        if (!hlist_empty(&console_list) && console->flags & CON_CONSDEV)
                console_srcu_write_flags(console_first(), console_first()->flags | CON_CONSDEV);

        /*
         * Ensure that all SRCU list walks have completed. All contexts
         * must not be able to see this console in the list so that any
         * exit/cleanup routines can be performed safely.
         */
        synchronize_srcu(&console_srcu);

        if (console->flags & CON_NBCON)
                nbcon_free(console);

        console_sysfs_notify();

        if (console->exit)
                res = console->exit(console);

        /*
         * With this console gone, the global flags tracking registered
         * console types may have changed. Update them.
         */
        for_each_console(c) {
                if (c->flags & CON_BOOT)
                        found_boot_con = true;

                if (c->flags & CON_NBCON)
                        found_nbcon_con = true;
                else
                        found_legacy_con = true;
        }
        /* The flags are only ever cleared here; registration sets them. */
        if (!found_boot_con)
                have_boot_console = found_boot_con;
        if (!found_legacy_con)
                have_legacy_console = found_legacy_con;
        if (!found_nbcon_con)
                have_nbcon_console = found_nbcon_con;

        /* Changed console list, may require printer threads to start/stop. */
        printk_kthreads_check_locked();

        return res;
}

/*
 * Unregister @console.
 *
 * Takes console_list_lock around unregister_console_locked() and returns
 * that function's result unchanged.
 */
int unregister_console(struct console *console)
{
        int res;

        console_list_lock();
        res = unregister_console_locked(console);
        console_list_unlock();
        return res;
}
EXPORT_SYMBOL(unregister_console);

/**
 * console_force_preferred_locked - force a registered console preferred
 * @con: The registered console to force preferred.
 *
 * Moves @con to the head of the console list and transfers CON_CONSDEV
 * from the previous head to it. Nothing happens if @con is not registered
 * or already is the head.
 *
 * Must be called under console_list_lock().
 */
void console_force_preferred_locked(struct console *con)
{
        struct console *old_head;

        if (!console_is_registered_locked(con))
                return;

        old_head = console_first();
        if (old_head == con) {
                /* Already preferred. */
                return;
        }

        /*
         * Unlink without re-initializing the node: hlist_unhashed_lockless()
         * style checks keep reporting the console as registered although it
         * is off the list for a moment.
         */
        hlist_del_rcu(&con->node);

        /*
         * Wait for all SRCU list walks to finish. Only then is it safe to
         * re-insert the console at the front and rewrite its forward
         * pointer.
         */
        synchronize_srcu(&console_srcu);

        con->flags |= CON_CONSDEV;
        WARN_ON(!con->device);

        /* Only the new head can have CON_CONSDEV set. */
        console_srcu_write_flags(old_head, old_head->flags & ~CON_CONSDEV);
        hlist_add_head_rcu(&con->node, &console_list);
}
EXPORT_SYMBOL(console_force_preferred_locked);

/*
 * Early console initialization. Very little kernel infrastructure can be
 * relied upon at this point, so only the basics are done here and the
 * complex setup is left for later.
 */
void __init console_init(void)
{
        initcall_entry_t *entry;

#ifdef CONFIG_NULL_TTY_DEFAULT_CONSOLE
        if (!console_set_on_cmdline)
                add_preferred_console("ttynull", 0, NULL);
#endif

        /* Setup the default TTY line discipline. */
        n_tty_init();

        /*
         * Run every console initcall so that later boot stages have a
         * console to report problems on.
         */
        trace_initcall_level("console");
        for (entry = __con_initcall_start; entry < __con_initcall_end; entry++) {
                initcall_t fn = initcall_from_entry(entry);
                int rc;

                trace_initcall_start(fn);
                rc = fn();
                trace_initcall_finish(fn, rc);
        }
}

/*
 * Some boot consoles access data that is in the init section and which will
 * be discarded after the initcalls have been run. To make sure that no code
 * will access this data, unregister the boot consoles in a late initcall.
 *
 * If for some reason, such as deferred probe or the driver being a loadable
 * module, the real console hasn't registered yet at this point, there will
 * be a brief interval in which no messages are logged to the console, which
 * makes it difficult to diagnose problems that occur during this time.
 *
 * To mitigate this problem somewhat, only unregister consoles whose memory
 * intersects with the init section. Note that all other boot consoles will
 * get unregistered when the real preferred console is registered.
 */
static int __init printk_late_init(void)
{
        struct hlist_node *tmp;
        struct console *con;
        int ret;

        console_list_lock();
        hlist_for_each_entry_safe(con, tmp, &console_list, node) {
                if (!(con->flags & CON_BOOT))
                        continue;

                /* Check addresses that might be used for enabled consoles. */
                if (init_section_intersects(con, sizeof(*con)) ||
                    init_section_contains(con->write, 0) ||
                    init_section_contains(con->read, 0) ||
                    init_section_contains(con->device, 0) ||
                    init_section_contains(con->unblank, 0) ||
                    init_section_contains(con->data, 0)) {
                        /*
                         * Please, consider moving the reported consoles out
                         * of the init section.
                         */
                        pr_warn("bootconsole [%s%d] uses init memory and must be disabled even before the real one is ready\n",
                                con->name, con->index);
                        unregister_console_locked(con);
                }
        }
        console_list_unlock();

        ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL,
                                        console_cpu_notify);
        WARN_ON(ret < 0);
        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "printk:online",
                                        console_cpu_notify, NULL);
        WARN_ON(ret < 0);
        printk_sysctl_init();
        return 0;
}
late_initcall(printk_late_init);

#if defined CONFIG_PRINTK
/*
 * Wait until the consoles have printed all records reserved so far.
 *
 * @con:               Console to wait for, or NULL to wait for all consoles.
 * @timeout_ms:        Time budget, converted with msecs_to_jiffies().
 * @reset_on_progress: Restart the budget whenever the backlog changes.
 *
 * Returns true if no usable console (of those waited for) is behind,
 * false on timeout or when called before SYSTEM_SCHEDULING.
 */
static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
{
        const unsigned long timeout_jiffies = msecs_to_jiffies(timeout_ms);
        unsigned long remaining_jiffies = timeout_jiffies;
        struct console_flush_type ft;
        u64 last_diff = 0;
        u64 diff;
        u64 seq;

        /* Sorry, pr_flush() will not work this early. */
        if (system_state < SYSTEM_SCHEDULING)
                return false;

        might_sleep();

        seq = prb_next_reserve_seq(prb);

        /* Flush the consoles so that records up to @seq are printed. */
        printk_get_console_flush_type(&ft);
        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();
        if (ft.legacy_direct) {
                console_lock();
                console_unlock();
        }

        for (;;) {
                unsigned long begin_jiffies;
                unsigned long slept_jiffies;
                struct console *c;
                int cookie;

                diff = 0;

                /*
                 * Hold the console_lock to guarantee safe access to
                 * console->seq. Releasing console_lock flushes more
                 * records in case @seq is still not printed on all
                 * usable consoles.
                 *
                 * Holding the console_lock is not necessary if there
                 * are no legacy or boot consoles. However, such a
                 * console could register at any time. Always hold the
                 * console_lock as a precaution rather than
                 * synchronizing against register_console().
                 */
                console_lock();

                cookie = console_srcu_read_lock();
                for_each_console_srcu(c) {
                        u64 printk_seq;
                        short flags;

                        if (con && con != c)
                                continue;

                        flags = console_srcu_read_flags(c);

                        /*
                         * Only usable consoles can be expected to make
                         * forward progress, so only they contribute to
                         * @diff.
                         */
                        if (console_is_usable(c, flags, true) ||
                            console_is_usable(c, flags, false)) {
                                printk_seq = (flags & CON_NBCON) ?
                                             nbcon_seq_read(c) : c->seq;

                                if (printk_seq < seq)
                                        diff += seq - printk_seq;
                        }
                }
                console_srcu_read_unlock(cookie);

                if (reset_on_progress && diff != last_diff)
                        remaining_jiffies = timeout_jiffies;

                console_unlock();

                /* Note: @diff is 0 if there are no usable consoles. */
                if (!diff || !remaining_jiffies)
                        break;

                /* msleep(1) might sleep much longer. Check time by jiffies. */
                begin_jiffies = jiffies;
                msleep(1);
                slept_jiffies = jiffies - begin_jiffies;

                remaining_jiffies -= min(slept_jiffies, remaining_jiffies);

                last_diff = diff;
        }

        return !diff;
}

/**
 * pr_flush() - Wait for printing threads to catch up.
 *
 * @timeout_ms:        The maximum time (in ms) to wait.
 * @reset_on_progress: Reset the timeout if forward progress is seen.
 *
 * A value of 0 for @timeout_ms means no waiting will occur. A value of -1
 * represents infinite waiting.
 *
 * If @reset_on_progress is true, the timeout will be reset whenever any
 * printer has been seen to make some forward progress.
 *
 * This waits for all consoles: it calls __pr_flush() without a specific
 * console. As a consequence it also returns false, without waiting, when
 * called before the system reached SYSTEM_SCHEDULING.
 *
 * Context: Process context. May sleep while acquiring console lock.
 * Return: true if all usable printers are caught up.
 */
bool pr_flush(int timeout_ms, bool reset_on_progress)
{
        return __pr_flush(NULL, timeout_ms, reset_on_progress);
}

/*
 * Delayed printk version, for scheduler-internal messages:
 *
 * The PRINTK_PENDING_* bits are accumulated in the per-CPU @printk_pending
 * word by __wake_up_klogd() and consumed by wake_up_klogd_work_func().
 */
/* Wake up the tasks sleeping on log_wait. */
#define PRINTK_PENDING_WAKEUP        0x01
/* Trigger console printing from the irq_work. */
#define PRINTK_PENDING_OUTPUT        0x02

/* Per-CPU mask of PRINTK_PENDING_* requests that still need handling. */
static DEFINE_PER_CPU(int, printk_pending);

/*
 * irq_work callback: atomically fetch and clear this CPU's pending
 * requests and act on them.
 *
 * PRINTK_PENDING_OUTPUT either wakes the legacy printer kthread (when
 * legacy printing is forced into the kthread and the kthread exists) or
 * prints directly by taking and releasing the console lock.
 * PRINTK_PENDING_WAKEUP wakes the waiters on log_wait.
 */
static void wake_up_klogd_work_func(struct irq_work *irq_work)
{
        const int pending = this_cpu_xchg(printk_pending, 0);

        if (pending & PRINTK_PENDING_OUTPUT) {
                if (!force_legacy_kthread()) {
                        /* Releasing the console lock does the printing. */
                        if (console_trylock())
                                console_unlock();
                } else if (printk_legacy_kthread) {
                        wake_up_interruptible(&legacy_wait);
                }
        }

        if (pending & PRINTK_PENDING_WAKEUP)
                wake_up_interruptible(&log_wait);
}

/*
 * Per-CPU irq_work that runs wake_up_klogd_work_func(). It is queued by
 * __wake_up_klogd() on the CPU that requested the wakeup/output.
 */
static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
        IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);

static void __wake_up_klogd(int val)
{
        if (!printk_percpu_data_ready())
                return;

        preempt_disable();
        /*
         * Guarantee any new records can be seen by tasks preparing to wait
         * before this context checks if the wait queue is empty.
         *
         * The full memory barrier within wq_has_sleeper() pairs with the full
         * memory barrier within set_current_state() of
         * prepare_to_wait_event(), which is called after ___wait_event() adds
         * the waiter but before it has checked the wait condition.
         *
         * This pairs with devkmsg_read:A and syslog_print:A.
         */
        if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
            (val & PRINTK_PENDING_OUTPUT)) {
                this_cpu_or(printk_pending, val);
                irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
        }
        preempt_enable();
}

/**
 * wake_up_klogd - Wake kernel logging daemon
 *
 * Use this function when new records have been added to the ringbuffer
 * and the console printing of those records has already occurred or is
 * known to be handled by some other context. This function will only
 * wake the logging daemon.
 *
 * Only PRINTK_PENDING_WAKEUP is requested, so the irq_work is queued only
 * if a task is sleeping on log_wait, and nothing happens while the printk
 * per-CPU data is not ready (see __wake_up_klogd()).
 *
 * Context: Any context.
 */
void wake_up_klogd(void)
{
        __wake_up_klogd(PRINTK_PENDING_WAKEUP);
}

/**
 * defer_console_output - Wake kernel logging daemon and trigger
 *        console printing in a deferred context
 *
 * Use this function when new records have been added to the ringbuffer,
 * this context is responsible for console printing those records, but
 * the current context is not allowed to perform the console printing.
 * Trigger an irq_work context to perform the console printing. This
 * function also wakes the logging daemon.
 *
 * Because PRINTK_PENDING_OUTPUT is requested, the irq_work is queued even
 * if no task is sleeping on log_wait. Nothing happens while the printk
 * per-CPU data is not ready (see __wake_up_klogd()).
 *
 * Context: Any context.
 */
void defer_console_output(void)
{
        /*
         * New messages may have been added directly to the ringbuffer
         * using vprintk_store(), so wake any waiters as well.
         */
        __wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
}

/*
 * Request console printing from a deferred context. This is a plain alias
 * for defer_console_output().
 */
void printk_trigger_flush(void)
{
        defer_console_output();
}

/*
 * va_list variant of the deferred printk: hands @fmt and @args to
 * vprintk_emit() together with LOGLEVEL_SCHED and returns its result.
 */
int vprintk_deferred(const char *fmt, va_list args)
{
        return vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args);
}

/*
 * Variadic front end of vprintk_deferred(): packs the arguments into a
 * va_list and returns whatever vprintk_deferred() returns.
 */
int _printk_deferred(const char *fmt, ...)
{
        va_list ap;
        int ret;

        va_start(ap, fmt);
        ret = vprintk_deferred(fmt, ap);
        va_end(ap);

        return ret;
}

/*
 * printk rate limiting, lifted from the networking subsystem.
 *
 * This enforces a rate limit of at most 10 kernel messages every
 * 5 seconds (interval 5 * HZ, burst 10), so that callers using
 * printk_ratelimit() cannot be abused to flood the log.
 */
DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);

/*
 * Apply the global printk_ratelimit_state to the caller named by @func.
 * Returns the result of ___ratelimit() unchanged (presumably non-zero
 * when the message may be printed; see ___ratelimit()).
 */
int __printk_ratelimit(const char *func)
{
        return ___ratelimit(&printk_ratelimit_state, func);
}
EXPORT_SYMBOL(__printk_ratelimit);

/**
 * printk_timed_ratelimit - caller-controlled printk ratelimiting
 * @caller_jiffies: pointer to caller's state
 * @interval_msecs: minimum interval between prints
 *
 * printk_timed_ratelimit() returns true if more than @interval_msecs
 * milliseconds have elapsed since the last time printk_timed_ratelimit()
 * returned true. It also returns true when *@caller_jiffies is 0, which
 * is the case on first use with zero-initialized state. Whenever true is
 * returned, *@caller_jiffies is updated to the current jiffies.
 */
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
                        unsigned int interval_msecs)
{
        const unsigned long last = *caller_jiffies;
        const unsigned long elapsed = jiffies - last;

        if (!last || elapsed > msecs_to_jiffies(interval_msecs)) {
                *caller_jiffies = jiffies;
                return true;
        }

        return false;
}
EXPORT_SYMBOL(printk_timed_ratelimit);

/*
 * List of registered kmsg dumpers. Additions and removals are done under
 * @dump_list_lock; kmsg_dump_desc() walks the list under rcu_read_lock().
 */
static DEFINE_SPINLOCK(dump_list_lock);
static LIST_HEAD(dump_list);

/**
 * kmsg_dump_register - register a kernel log dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Adds a kernel log dumper to the system. The dump callback in the
 * structure will be called when the kernel oopses or panics and must be
 * set.
 *
 * Return: zero on success, %-EINVAL if no dump callback is set, or
 * %-EBUSY if @dumper is already registered.
 */
int kmsg_dump_register(struct kmsg_dumper *dumper)
{
        unsigned long irqflags;
        int ret;

        /* A dumper is useless without its dump callback. */
        if (!dumper->dump)
                return -EINVAL;

        spin_lock_irqsave(&dump_list_lock, irqflags);
        if (dumper->registered) {
                /* Don't allow registering multiple times */
                ret = -EBUSY;
        } else {
                dumper->registered = 1;
                list_add_tail_rcu(&dumper->list, &dump_list);
                ret = 0;
        }
        spin_unlock_irqrestore(&dump_list_lock, irqflags);

        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_register);

/**
 * kmsg_dump_unregister - unregister a kmsg dumper.
 * @dumper: pointer to the kmsg_dumper structure
 *
 * Removes a dump device from the system. Returns zero on success and
 * %-EINVAL otherwise.
 */
int kmsg_dump_unregister(struct kmsg_dumper *dumper)
{
        unsigned long flags;
        int err = -EINVAL;

        spin_lock_irqsave(&dump_list_lock, flags);
        if (dumper->registered) {
                dumper->registered = 0;
                list_del_rcu(&dumper->list);
                err = 0;
        }
        spin_unlock_irqrestore(&dump_list_lock, flags);
        synchronize_rcu();

        return err;
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);

static bool always_kmsg_dump;
module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);

/*
 * Map a kmsg dump reason to a human readable name. Reasons without a
 * dedicated name yield "Unknown".
 */
const char *kmsg_dump_reason_str(enum kmsg_dump_reason reason)
{
        const char *name;

        switch (reason) {
        case KMSG_DUMP_PANIC:
                name = "Panic";
                break;
        case KMSG_DUMP_OOPS:
                name = "Oops";
                break;
        case KMSG_DUMP_EMERG:
                name = "Emergency";
                break;
        case KMSG_DUMP_SHUTDOWN:
                name = "Shutdown";
                break;
        default:
                name = "Unknown";
                break;
        }

        return name;
}
EXPORT_SYMBOL_GPL(kmsg_dump_reason_str);

/**
 * kmsg_dump_desc - dump kernel log to kernel message dumpers.
 * @reason: the reason (oops, panic etc) for dumping
 * @desc: a short string to describe what caused the panic or oops. Can be NULL
 * if no additional description is available.
 *
 * Call the dump() callback of each registered dumper whose max_reason
 * admits @reason. The callback can retrieve the kmsg records with
 * kmsg_dump_get_line() or kmsg_dump_get_buffer().
 */
void kmsg_dump_desc(enum kmsg_dump_reason reason, const char *desc)
{
        struct kmsg_dump_detail detail = {
                .reason = reason,
                .description = desc,
        };
        struct kmsg_dumper *dumper;

        rcu_read_lock();
        list_for_each_entry_rcu(dumper, &dump_list, list) {
                enum kmsg_dump_reason limit = dumper->max_reason;

                /*
                 * If client has not provided a specific max_reason, default
                 * to KMSG_DUMP_OOPS, unless always_kmsg_dump was set.
                 */
                if (limit == KMSG_DUMP_UNDEF) {
                        if (always_kmsg_dump)
                                limit = KMSG_DUMP_MAX;
                        else
                                limit = KMSG_DUMP_OOPS;
                }

                /* invoke dumper which will iterate over records */
                if (reason <= limit)
                        dumper->dump(dumper, &detail);
        }
        rcu_read_unlock();
}

/**
 * kmsg_dump_get_line - retrieve one kmsg log line
 * @iter: kmsg dump iterator
 * @syslog: include the "<4>" prefixes
 * @line: buffer to copy the line to; may be NULL, in which case no text is
 *        copied and only a length is computed
 * @size: maximum size of the buffer
 * @len: if not NULL, receives the length computed for the record; it is
 *       set to 0 when no record could be read
 *
 * Start at the beginning of the kmsg buffer, with the oldest kmsg
 * record, and copy one record into the provided buffer.
 *
 * Consecutive calls will return the next available record moving
 * towards the end of the buffer with the youngest messages.
 *
 * A return value of FALSE indicates that there are no more records to
 * read.
 */
bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog,
                        char *line, size_t size, size_t *len)
{
        u64 min_seq = latched_seq_read_nolock(&clear_seq);
        struct printk_info info;
        unsigned int line_count;
        struct printk_record r;
        size_t l = 0;
        bool ret = false;

        /* Never start before the sequence number read from clear_seq. */
        if (iter->cur_seq < min_seq)
                iter->cur_seq = min_seq;

        prb_rec_init_rd(&r, &info, line, size);

        /* Read text or count text lines? */
        if (line) {
                if (!prb_read_valid(prb, iter->cur_seq, &r))
                        goto out;
                l = record_print_text(&r, syslog, printk_time);
        } else {
                /* No buffer: only the record info and line count are read. */
                if (!prb_read_valid_info(prb, iter->cur_seq,
                                         &info, &line_count)) {
                        goto out;
                }
                l = get_record_print_text_size(&info, line_count, syslog,
                                               printk_time);

        }

        /*
         * Continue after the record that was just read.
         * NOTE(review): r.info presumably points at the local @info set up
         * by prb_rec_init_rd() above, which the NULL-buffer branch relies
         * on - confirm.
         */
        iter->cur_seq = r.info->seq + 1;
        ret = true;
out:
        /* Reported on success and on failure alike (0 on failure). */
        if (len)
                *len = l;
        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_line);

/**
 * kmsg_dump_get_buffer - copy kmsg log lines
 * @iter: kmsg dump iterator
 * @syslog: include the "<4>" prefixes
 * @buf: buffer to copy the line to
 * @size: maximum size of the buffer
 * @len_out: if not NULL, receives the total length of the text placed
 *           into @buf (0 when nothing was copied)
 *
 * Start at the end of the kmsg buffer and fill the provided buffer
 * with as many of the *youngest* kmsg records that fit into it.
 * If the buffer is large enough, all available kmsg records will be
 * copied with a single call.
 *
 * Consecutive calls will fill the buffer with the next block of
 * available older records, not including the earlier retrieved ones.
 *
 * A return value of FALSE indicates that there are no more records to
 * read.
 */
bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog,
                          char *buf, size_t size, size_t *len_out)
{
        u64 min_seq = latched_seq_read_nolock(&clear_seq);
        struct printk_info info;
        struct printk_record r;
        u64 seq;
        u64 next_seq;
        size_t len = 0;
        bool ret = false;
        bool time = printk_time;

        /* Without a usable buffer nothing can be copied: fail with length 0. */
        if (!buf || !size)
                goto out;

        /* Never start before the sequence number read from clear_seq. */
        if (iter->cur_seq < min_seq)
                iter->cur_seq = min_seq;

        if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) {
                if (info.seq != iter->cur_seq) {
                        /* messages are gone, move to first available one */
                        iter->cur_seq = info.seq;
                }
        }

        /* last entry */
        if (iter->cur_seq >= iter->next_seq)
                goto out;

        /*
         * Find first record that fits, including all following records,
         * into the user-provided buffer for this dump. Pass in size-1
         * because this function (by way of record_print_text()) will
         * not write more than size-1 bytes of text into @buf.
         */
        seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq,
                                     size - 1, syslog, time);

        /*
         * Next kmsg_dump_get_buffer() invocation will dump block of
         * older records stored right before this one.
         */
        next_seq = seq;

        prb_rec_init_rd(&r, &info, buf, size);

        /* Copy records from @seq up to, but not including, iter->next_seq. */
        prb_for_each_record(seq, prb, seq, &r) {
                if (r.info->seq >= iter->next_seq)
                        break;

                len += record_print_text(&r, syslog, time);

                /* Adjust record to store to remaining buffer space. */
                prb_rec_init_rd(&r, &info, buf + len, size - len);
        }

        /* The block just copied becomes the upper bound for the next call. */
        iter->next_seq = next_seq;
        ret = true;
out:
        if (len_out)
                *len_out = len;
        return ret;
}
EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);

/**
 * kmsg_dump_rewind - reset the iterator
 * @iter: kmsg dump iterator
 *
 * Reset the dumper's iterator so that kmsg_dump_get_line() and
 * kmsg_dump_get_buffer() can be called again and used multiple
 * times within the same dumper.dump() callback.
 */
void kmsg_dump_rewind(struct kmsg_dump_iter *iter)
{
        /* Lower bound: the sequence number currently held in clear_seq. */
        iter->cur_seq = latched_seq_read_nolock(&clear_seq);
        /* Upper bound: whatever prb_next_seq() reports for the ringbuffer. */
        iter->next_seq = prb_next_seq(prb);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);

/**
 * console_try_replay_all - try to replay kernel log on consoles
 *
 * Attempt to take the console lock without blocking. On success, rewind
 * all consoles and trigger whichever flush methods the current console
 * flush type asks for, so that all available records in the printk buffer
 * are replayed. If the lock cannot be taken, nothing is replayed.
 *
 * Context: Any, except for NMI.
 */
void console_try_replay_all(void)
{
        struct console_flush_type ft;

        printk_get_console_flush_type(&ft);

        if (!console_trylock())
                return;

        __console_rewind_all();

        if (ft.nbcon_atomic)
                nbcon_atomic_flush_pending();

        if (ft.nbcon_offload)
                nbcon_kthreads_wake();

        if (ft.legacy_offload)
                defer_console_output();

        /* Consoles are flushed as part of console_unlock(). */
        console_unlock();
}
#endif

#ifdef CONFIG_SMP
/*
 * State of the printk cpu-reentrant spinning lock: the number of the owning
 * CPU (-1 while no CPU owns the lock) and the count of nested acquisitions
 * the owner has made on top of its first one.
 */
static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);

/* Report whether the calling CPU is recorded as the printk cpu sync owner. */
bool is_printk_cpu_sync_owner(void)
{
        int owner = atomic_read(&printk_cpu_sync_owner);

        return owner == raw_smp_processor_id();
}

/**
 * __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
 *                            spinning lock is not owned by any CPU.
 *
 * Relaxes the CPU at least once, then keeps spinning until the owner
 * field reads -1.
 *
 * Context: Any context.
 */
void __printk_cpu_sync_wait(void)
{
        for (;;) {
                cpu_relax();
                if (atomic_read(&printk_cpu_sync_owner) == -1)
                        break;
        }
}
EXPORT_SYMBOL(__printk_cpu_sync_wait);

/**
 * __printk_cpu_sync_try_get() - Try to acquire the printk cpu-reentrant
 *                               spinning lock.
 *
 * If no processor has the lock, the calling processor takes the lock and
 * becomes the owner. If the calling processor is already the owner of the
 * lock, this function succeeds immediately.
 *
 * Context: Any context. Expects interrupts to be disabled.
 * Return: 1 on success, otherwise 0.
 */
int __printk_cpu_sync_try_get(void)
{
        int cpu;
        int old;

        cpu = smp_processor_id();

        /*
         * Guarantee loads and stores from this CPU when it is the lock owner
         * are _not_ visible to the previous lock owner. This pairs with
         * __printk_cpu_sync_put:B.
         *
         * Memory barrier involvement:
         *
         * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
         * then __printk_cpu_sync_put:A can never read from
         * __printk_cpu_sync_try_get:B.
         *
         * Relies on:
         *
         * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
         * of the previous CPU
         *    matching
         * ACQUIRE from __printk_cpu_sync_try_get:A to
         * __printk_cpu_sync_try_get:B of this CPU
         */
        old = atomic_cmpxchg_acquire(&printk_cpu_sync_owner, -1,
                                     cpu); /* LMM(__printk_cpu_sync_try_get:A) */
        if (old == -1) {
                /*
                 * This CPU is now the owner and begins loading/storing
                 * data: LMM(__printk_cpu_sync_try_get:B)
                 */
                return 1;

        } else if (old == cpu) {
                /*
                 * This CPU is already the owner. Count the nested
                 * acquisition so that the matching
                 * __printk_cpu_sync_put() only drops the nesting level.
                 */
                atomic_inc(&printk_cpu_sync_nested);
                return 1;
        }

        /* The owner field held some other CPU: the lock was not taken. */
        return 0;
}
EXPORT_SYMBOL(__printk_cpu_sync_try_get);

/**
 * __printk_cpu_sync_put() - Release the printk cpu-reentrant spinning lock.
 *
 * The calling processor must be the owner of the lock.
 *
 * Context: Any context. Expects interrupts to be disabled.
 */
void __printk_cpu_sync_put(void)
{
        /*
         * A non-zero nesting count means this call undoes a nested
         * acquisition: drop one level and leave the owner field alone.
         */
        if (atomic_read(&printk_cpu_sync_nested)) {
                atomic_dec(&printk_cpu_sync_nested);
                return;
        }

        /*
         * This CPU is finished loading/storing data:
         * LMM(__printk_cpu_sync_put:A)
         */

        /*
         * Guarantee loads and stores from this CPU when it was the
         * lock owner are visible to the next lock owner. This pairs
         * with __printk_cpu_sync_try_get:A.
         *
         * Memory barrier involvement:
         *
         * If __printk_cpu_sync_try_get:A reads from __printk_cpu_sync_put:B,
         * then __printk_cpu_sync_try_get:B reads from __printk_cpu_sync_put:A.
         *
         * Relies on:
         *
         * RELEASE from __printk_cpu_sync_put:A to __printk_cpu_sync_put:B
         * of this CPU
         *    matching
         * ACQUIRE from __printk_cpu_sync_try_get:A to
         * __printk_cpu_sync_try_get:B of the next CPU
         */
        atomic_set_release(&printk_cpu_sync_owner,
                           -1); /* LMM(__printk_cpu_sync_put:B) */
}
EXPORT_SYMBOL(__printk_cpu_sync_put);
#endif /* CONFIG_SMP */



















  111 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_CURRENT_H
#define __ASM_CURRENT_H

#include <linux/compiler.h>

#ifndef __ASSEMBLY__

struct task_struct;

/*
 * Return the current task pointer, which is read from the sp_el0 system
 * register and cast to a task_struct pointer.
 *
 * We don't use read_sysreg() as we want the compiler to cache the value where
 * possible.
 */
static __always_inline struct task_struct *get_current(void)
{
        unsigned long sp_el0;

        /* Plain (non-volatile) asm without inputs, per the note above. */
        asm ("mrs %0, sp_el0" : "=r" (sp_el0));

        return (struct task_struct *)sp_el0;
}

/* Each use of 'current' expands to a get_current() call. */
#define current get_current()

#endif /* __ASSEMBLY__ */

#endif /* __ASM_CURRENT_H */













































   95 































   95 

   95 

















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
/**
 * css_get - obtain a reference on the specified css
 * @css: target css
 *
 * The caller must already have a reference. Nothing is done for a css
 * that has CSS_NO_REF set in its flags.
 */
CGROUP_REF_FN_ATTRS
void css_get(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get(&css->refcnt);
}
CGROUP_REF_EXPORT(css_get)

/**
 * css_get_many - obtain references on the specified css
 * @css: target css
 * @n: number of references to get
 *
 * The caller must already have a reference. Nothing is done for a css
 * that has CSS_NO_REF set in its flags.
 */
CGROUP_REF_FN_ATTRS
void css_get_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_get_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_get_many)

/**
 * css_tryget - try to obtain a reference on the specified css
 * @css: target css
 *
 * Obtain a reference on @css unless it already has reached zero and is
 * being released.  Whether @css is online or offline does not matter
 * here.  The caller has to make sure @css is accessible, but holding a
 * reference is not required - RCU protected access is sufficient.
 *
 * Return: %true if a reference was obtained, or if @css has CSS_NO_REF
 * set; %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget)

/**
 * css_tryget_online - try to obtain a reference on the specified css if online
 * @css: target css
 *
 * Obtain a reference on @css if it's online.  The caller has to make sure
 * @css is accessible, but holding a reference is not required - RCU
 * protected access is sufficient.
 *
 * Return: %true if a reference was obtained, or if @css has CSS_NO_REF
 * set; %false otherwise.
 */
CGROUP_REF_FN_ATTRS
bool css_tryget_online(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return true;

        return percpu_ref_tryget_live(&css->refcnt);
}
CGROUP_REF_EXPORT(css_tryget_online)

/**
 * css_put - put a css reference
 * @css: target css
 *
 * Put a reference obtained via css_get() and css_tryget_online(). Nothing
 * is done for a css that has CSS_NO_REF set in its flags.
 */
CGROUP_REF_FN_ATTRS
void css_put(struct cgroup_subsys_state *css)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put(&css->refcnt);
}
CGROUP_REF_EXPORT(css_put)

/**
 * css_put_many - put css references
 * @css: target css
 * @n: number of references to put
 *
 * Put references obtained via css_get() and css_tryget_online(). Nothing
 * is done for a css that has CSS_NO_REF set in its flags.
 */
CGROUP_REF_FN_ATTRS
void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
{
        if (css->flags & CSS_NO_REF)
                return;

        percpu_ref_put_many(&css->refcnt, n);
}
CGROUP_REF_EXPORT(css_put_many)

































































































































































































































































    1 





    1 















































































    1 










    3 






    2 


    1 


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VGIC system registers handling functions for AArch64 mode
 */

#include <linux/irqchip/arm-gic-v3.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include "vgic/vgic.h"
#include "sys_regs.h"

/*
 * Userspace write of ICC_CTLR_EL1.
 *
 * The PRIbits and IDbits fields may only lower (or keep) the values
 * currently recorded for the vCPU, and the SEIS and A3V fields must match
 * what ICH_VTR_EL2 reports for the host. CBPR and EOImode are stored in
 * the VMCR.
 *
 * All fields are validated before anything is written, so a rejected
 * write leaves the vCPU state untouched.
 *
 * Returns 0 on success, -EINVAL if any field is not acceptable.
 */
static int set_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 val)
{
        u32 req_pri_bits, req_id_bits, host_seis, host_a3v, seis, a3v;
        struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
        struct vgic_vmcr vmcr;

        vgic_get_vmcr(vcpu, &vmcr);

        /*
         * Disallow restoring VM state if not supported by this
         * hardware.
         */
        req_pri_bits = FIELD_GET(ICC_CTLR_EL1_PRI_BITS_MASK, val) + 1;
        if (req_pri_bits > vgic_v3_cpu->num_pri_bits)
                return -EINVAL;

        req_id_bits = FIELD_GET(ICC_CTLR_EL1_ID_BITS_MASK, val);
        if (req_id_bits > vgic_v3_cpu->num_id_bits)
                return -EINVAL;

        host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2);
        seis = FIELD_GET(ICC_CTLR_EL1_SEIS_MASK, val);
        if (host_seis != seis)
                return -EINVAL;

        host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2);
        a3v = FIELD_GET(ICC_CTLR_EL1_A3V_MASK, val);
        if (host_a3v != a3v)
                return -EINVAL;

        /* Everything checked out: commit the new configuration. */
        vgic_v3_cpu->num_pri_bits = req_pri_bits;
        vgic_v3_cpu->num_id_bits = req_id_bits;

        /*
         * Here set VMCR.CTLR in ICC_CTLR_EL1 layout.
         * The vgic_set_vmcr() will convert to ICH_VMCR layout.
         */
        vmcr.cbpr = FIELD_GET(ICC_CTLR_EL1_CBPR_MASK, val);
        vmcr.eoim = FIELD_GET(ICC_CTLR_EL1_EOImode_MASK, val);
        vgic_set_vmcr(vcpu, &vmcr);

        return 0;
}

/*
 * Userspace read of ICC_CTLR_EL1: assemble the register value from the
 * per-vCPU PRIbits/IDbits, the host's SEIS/A3V bits taken from ICH_VTR_EL2
 * and the CBPR/EOImode state held in the VMCR.
 */
static int get_gic_ctlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 *valp)
{
        struct vgic_cpu *cpu_if = &vcpu->arch.vgic_cpu;
        struct vgic_vmcr vmcr;
        u32 host_seis, host_a3v;
        u64 ctlr = 0;

        vgic_get_vmcr(vcpu, &vmcr);

        ctlr |= FIELD_PREP(ICC_CTLR_EL1_PRI_BITS_MASK, cpu_if->num_pri_bits - 1);
        ctlr |= FIELD_PREP(ICC_CTLR_EL1_ID_BITS_MASK, cpu_if->num_id_bits);

        host_seis = FIELD_GET(ICH_VTR_EL2_SEIS, kvm_vgic_global_state.ich_vtr_el2);
        ctlr |= FIELD_PREP(ICC_CTLR_EL1_SEIS_MASK, host_seis);

        host_a3v = FIELD_GET(ICH_VTR_EL2_A3V, kvm_vgic_global_state.ich_vtr_el2);
        ctlr |= FIELD_PREP(ICC_CTLR_EL1_A3V_MASK, host_a3v);

        /*
         * The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
         * Extract it directly using ICC_CTLR_EL1 reg definitions.
         */
        ctlr |= FIELD_PREP(ICC_CTLR_EL1_CBPR_MASK, vmcr.cbpr);
        ctlr |= FIELD_PREP(ICC_CTLR_EL1_EOImode_MASK, vmcr.eoim);

        *valp = ctlr;

        return 0;
}

/* Userspace write of ICC_PMR_EL1: replace the pmr field of the VMCR. */
static int set_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                       u64 val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        state.pmr = FIELD_GET(ICC_PMR_EL1_MASK, val);

        vgic_set_vmcr(vcpu, &state);
        return 0;
}

/* Userspace read of ICC_PMR_EL1: report the pmr field of the VMCR. */
static int get_gic_pmr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                       u64 *val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        *val = FIELD_PREP(ICC_PMR_EL1_MASK, state.pmr);
        return 0;
}

/* Userspace write of ICC_BPR0_EL1: replace the bpr field of the VMCR. */
static int set_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        state.bpr = FIELD_GET(ICC_BPR0_EL1_MASK, val);

        vgic_set_vmcr(vcpu, &state);
        return 0;
}

/* Userspace read of ICC_BPR0_EL1: report the bpr field of the VMCR. */
static int get_gic_bpr0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 *val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        *val = FIELD_PREP(ICC_BPR0_EL1_MASK, state.bpr);
        return 0;
}

/*
 * Userspace write of ICC_BPR1_EL1: replace the abpr field of the VMCR.
 * The write is silently ignored (and still reported as successful) while
 * the VMCR has cbpr set.
 */
static int set_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);
        if (state.cbpr)
                return 0;

        state.abpr = FIELD_GET(ICC_BPR1_EL1_MASK, val);
        vgic_set_vmcr(vcpu, &state);

        return 0;
}

/*
 * Userspace read of ICC_BPR1_EL1. While the VMCR has cbpr set the value is
 * derived from bpr (bpr + 1, capped at 7); otherwise the abpr field is
 * reported.
 */
static int get_gic_bpr1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 *val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        if (state.cbpr)
                *val = min((state.bpr + 1), 7U);
        else
                *val = FIELD_PREP(ICC_BPR1_EL1_MASK, state.abpr);

        return 0;
}

/* Userspace write of ICC_IGRPEN0_EL1: replace the grpen0 field of the VMCR. */
static int set_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        state.grpen0 = FIELD_GET(ICC_IGRPEN0_EL1_MASK, val);

        vgic_set_vmcr(vcpu, &state);
        return 0;
}

/* Userspace read of ICC_IGRPEN0_EL1: report the grpen0 field of the VMCR. */
static int get_gic_grpen0(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 *val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        *val = FIELD_PREP(ICC_IGRPEN0_EL1_MASK, state.grpen0);
        return 0;
}

/* Userspace write of ICC_IGRPEN1_EL1: replace the grpen1 field of the VMCR. */
static int set_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 val)
{
        struct vgic_vmcr state;

        vgic_get_vmcr(vcpu, &state);

        state.grpen1 = FIELD_GET(ICC_IGRPEN1_EL1_MASK, val);

        vgic_set_vmcr(vcpu, &state);
        return 0;
}

/*
 * Userspace read of ICC_IGRPEN1_EL1: report the grpen1 field of the VMCR.
 *
 * The register value is built with FIELD_PREP(), i.e. the stored field is
 * placed at the position given by ICC_IGRPEN1_EL1_MASK. This mirrors
 * get_gic_grpen0() and the other getters in this file; FIELD_GET() is the
 * operation for the opposite direction (extracting a field from a register
 * value) and is what set_gic_grpen1() uses.
 */
static int get_gic_grpen1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                          u64 *val)
{
        struct vgic_vmcr vmcr;

        vgic_get_vmcr(vcpu, &vmcr);
        *val = FIELD_PREP(ICC_IGRPEN1_EL1_MASK, vmcr.grpen1);

        return 0;
}

/*
 * Store @val into slot @idx of the vCPU's saved active-priority registers:
 * vgic_ap0r[] when @apr is zero, vgic_ap1r[] otherwise. @idx is not
 * range-checked here.
 */
static void set_apr_reg(struct kvm_vcpu *vcpu, u64 val, u8 apr, u8 idx)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;

        if (!apr)
                cpu_if->vgic_ap0r[idx] = val;
        else
                cpu_if->vgic_ap1r[idx] = val;
}

/*
 * Fetch slot @idx of the vCPU's saved active-priority registers:
 * vgic_ap0r[] when @apr is zero, vgic_ap1r[] otherwise. @idx is not
 * range-checked here.
 */
static u64 get_apr_reg(struct kvm_vcpu *vcpu, u8 apr, u8 idx)
{
        struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;

        if (!apr)
                return cpu_if->vgic_ap0r[idx];

        return cpu_if->vgic_ap1r[idx];
}

/*
 * Userspace write of ICC_AP0R<n>_EL1. The register index is taken from the
 * low two bits of Op2; indices above vgic_v3_max_apr_idx() are rejected
 * with -EINVAL.
 */
static int set_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 val)
{
        u8 n = r->Op2 & 3;

        if (n <= vgic_v3_max_apr_idx(vcpu)) {
                set_apr_reg(vcpu, val, 0, n);
                return 0;
        }

        return -EINVAL;
}

/*
 * Userspace read of ICC_AP0R<n>_EL1. The register index is taken from the
 * low two bits of Op2; indices above vgic_v3_max_apr_idx() are rejected
 * with -EINVAL and leave *val untouched.
 */
static int get_gic_ap0r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 *val)
{
        u8 n = r->Op2 & 3;

        if (n <= vgic_v3_max_apr_idx(vcpu)) {
                *val = get_apr_reg(vcpu, 0, n);
                return 0;
        }

        return -EINVAL;
}

/*
 * Userspace write of ICC_AP1R<n>_EL1. The register index is taken from the
 * low two bits of Op2; indices above vgic_v3_max_apr_idx() are rejected
 * with -EINVAL.
 */
static int set_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 val)
{
        u8 n = r->Op2 & 3;

        if (n <= vgic_v3_max_apr_idx(vcpu)) {
                set_apr_reg(vcpu, val, 1, n);
                return 0;
        }

        return -EINVAL;
}

/*
 * Userspace read of ICC_AP1R<n>_EL1. The register index is taken from the
 * low two bits of Op2; indices above vgic_v3_max_apr_idx() are rejected
 * with -EINVAL and leave *val untouched.
 */
static int get_gic_ap1r(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                        u64 *val)
{
        u8 n = r->Op2 & 3;

        if (n <= vgic_v3_max_apr_idx(vcpu)) {
                *val = get_apr_reg(vcpu, 1, n);
                return 0;
        }

        return -EINVAL;
}

/*
 * Userspace write of ICC_SRE_EL1. Nothing is stored; the write is only
 * accepted when the SRE bit is set in @val, otherwise -EINVAL is returned.
 */
static int set_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                       u64 val)
{
        return (val & ICC_SRE_EL1_SRE) ? 0 : -EINVAL;
}

/* Userspace read of ICC_SRE_EL1: report the vCPU's saved vgic_sre value. */
static int get_gic_sre(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r,
                       u64 *val)
{
        *val = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre;

        return 0;
}

/*
 * ICC_* CPU interface registers that userspace may access, each with the
 * handlers used for writes (.set_user) and reads (.get_user). The table is
 * searched by register id in vgic_v3_has_cpu_sysregs_attr() and
 * vgic_v3_cpu_sysregs_uaccess().
 *
 * NOTE(review): the lookup helpers may depend on the order of the entries -
 * confirm before inserting or reordering rows.
 */
static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
        { SYS_DESC(SYS_ICC_PMR_EL1),
          .set_user = set_gic_pmr, .get_user = get_gic_pmr, },
        { SYS_DESC(SYS_ICC_BPR0_EL1),
          .set_user = set_gic_bpr0, .get_user = get_gic_bpr0, },
        { SYS_DESC(SYS_ICC_AP0R0_EL1),
          .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, },
        { SYS_DESC(SYS_ICC_AP0R1_EL1),
          .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, },
        { SYS_DESC(SYS_ICC_AP0R2_EL1),
          .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, },
        { SYS_DESC(SYS_ICC_AP0R3_EL1),
          .set_user = set_gic_ap0r, .get_user = get_gic_ap0r, },
        { SYS_DESC(SYS_ICC_AP1R0_EL1),
          .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, },
        { SYS_DESC(SYS_ICC_AP1R1_EL1),
          .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, },
        { SYS_DESC(SYS_ICC_AP1R2_EL1),
          .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, },
        { SYS_DESC(SYS_ICC_AP1R3_EL1),
          .set_user = set_gic_ap1r, .get_user = get_gic_ap1r, },
        { SYS_DESC(SYS_ICC_BPR1_EL1),
          .set_user = set_gic_bpr1, .get_user = get_gic_bpr1, },
        { SYS_DESC(SYS_ICC_CTLR_EL1),
          .set_user = set_gic_ctlr, .get_user = get_gic_ctlr, },
        { SYS_DESC(SYS_ICC_SRE_EL1),
          .set_user = set_gic_sre, .get_user = get_gic_sre, },
        { SYS_DESC(SYS_ICC_IGRPEN0_EL1),
          .set_user = set_gic_grpen0, .get_user = get_gic_grpen0, },
        { SYS_DESC(SYS_ICC_IGRPEN1_EL1),
          .set_user = set_gic_grpen1, .get_user = get_gic_grpen1, },
};

/*
 * Convert a VGIC CPU sysreg device attribute into an ARM64_SYS_REG()
 * register id by extracting the Op0/Op1/CRn/CRm/Op2 fields from @attr.
 */
static u64 attr_to_id(u64 attr)
{
        u64 op0 = FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP0_MASK, attr);
        u64 op1 = FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP1_MASK, attr);
        u64 crn = FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRN_MASK, attr);
        u64 crm = FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_CRM_MASK, attr);
        u64 op2 = FIELD_GET(KVM_REG_ARM_VGIC_SYSREG_OP2_MASK, attr);

        return ARM64_SYS_REG(op0, op1, crn, crm, op2);
}

/*
 * Check whether the register named by @attr is one of the ICC registers
 * listed in gic_v3_icc_reg_descs.
 *
 * Returns 0 if it is, -ENXIO otherwise.
 */
int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr)
{
        if (!get_reg_by_id(attr_to_id(attr->attr), gic_v3_icc_reg_descs,
                           ARRAY_SIZE(gic_v3_icc_reg_descs)))
                return -ENXIO;

        return 0;
}

/*
 * Perform a userspace read or write of one ICC register. The device
 * attribute is turned into a kvm_one_reg and handed to the generic sysreg
 * set/get helpers together with gic_v3_icc_reg_descs.
 *
 * Returns whatever the selected helper returns.
 */
int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu,
                                struct kvm_device_attr *attr,
                                bool is_write)
{
        struct kvm_one_reg reg = {
                .id = attr_to_id(attr->attr),
                .addr = attr->addr,
        };

        if (!is_write)
                return kvm_sys_reg_get_user(vcpu, &reg, gic_v3_icc_reg_descs,
                                            ARRAY_SIZE(gic_v3_icc_reg_descs));

        return kvm_sys_reg_set_user(vcpu, &reg, gic_v3_icc_reg_descs,
                                    ARRAY_SIZE(gic_v3_icc_reg_descs));
}




















































































































































   98 



























  165 










































































  165 




























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/backing-dev.h
 *
 * low-level device information and state which is propagated up through
 * to high-level code.
 */

#ifndef _LINUX_BACKING_DEV_H
#define _LINUX_BACKING_DEV_H

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/device.h>
#include <linux/writeback.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>

/*
 * Take an additional reference on @bdi (via its refcnt kref) and return
 * @bdi so the call can be used inline in an expression.
 */
static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
{
        kref_get(&bdi->refcnt);
        return bdi;
}

/* backing_dev_info lookup and reference dropping. */
struct backing_dev_info *bdi_get_by_id(u64 id);
void bdi_put(struct backing_dev_info *bdi);

/*
 * Registration. bdi_register() takes a printf-style format plus arguments,
 * bdi_register_va() the same format with a va_list; both are annotated
 * with __printf() for format checking.
 */
__printf(2, 3)
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
__printf(2, 0)
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt,
                    va_list args);
void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner);
void bdi_unregister(struct backing_dev_info *bdi);

struct backing_dev_info *bdi_alloc(int node_id);

/* bdi_writeback helpers. */
void wb_start_background_writeback(struct bdi_writeback *wb);
void wb_workfn(struct work_struct *work);

void wb_wait_for_completion(struct wb_completion *done);

/* Globals defined outside this header. */
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;

extern struct workqueue_struct *bdi_wq;

/* Report whether the WB_has_dirty_io bit is set in @wb->state. */
static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
{
        return test_bit(WB_has_dirty_io, &wb->state);
}

/*
 * Report whether @bdi has dirty IO, judged by a non-zero
 * tot_write_bandwidth.
 */
static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
{
        /*
         * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
         * any dirty wbs.  See wb_update_write_bandwidth().
         */
        return atomic_long_read(&bdi->tot_write_bandwidth) != 0;
}

/*
 * Add @amount (which may be negative) to the per-cpu counter for @item in
 * @wb, using WB_STAT_BATCH as the batch size.
 */
static inline void wb_stat_mod(struct bdi_writeback *wb,
                                 enum wb_stat_item item, s64 amount)
{
        percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
}

/* Increment the @item statistic of @wb by one. */
static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        wb_stat_mod(wb, item, 1);
}

/* Decrement the @item statistic of @wb by one. */
static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        wb_stat_mod(wb, item, -1);
}

/*
 * Read the @item statistic of @wb through percpu_counter_read_positive().
 * See wb_stat_error() for the maximal error of a stat counter.
 */
static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
{
        return percpu_counter_read_positive(&wb->stat[item]);
}

/* Read the @item statistic of @wb through percpu_counter_sum_positive(). */
static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
{
        return percpu_counter_sum_positive(&wb->stat[item]);
}

extern void wb_writeout_inc(struct bdi_writeback *wb);

/*
 * Upper bound on the error of a wb stat counter: 1 on uniprocessor
 * builds, nr_cpu_ids * WB_STAT_BATCH on SMP builds.
 */
static inline unsigned long wb_stat_error(void)
{
#ifndef CONFIG_SMP
        return 1;
#else
        return nr_cpu_ids * WB_STAT_BATCH;
#endif
}

/*
 * BDI ratio is expressed as part per 1000000 for finer granularity.
 *
 * NOTE(review): 10000 looks like the factor that turns a percentage into
 * parts per 1000000 (100 * 10000 == 1000000) - confirm against the users
 * of BDI_RATIO_SCALE.
 */
#define BDI_RATIO_SCALE 10000

/* Accessors for the per-bdi min/max limits and the strict-limit setting. */
u64 bdi_get_min_bytes(struct backing_dev_info *bdi);
u64 bdi_get_max_bytes(struct backing_dev_info *bdi);
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio);
int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio);
int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes);
int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes);
int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit);

/*
 * Flags in backing_dev_info::capability
 *
 * BDI_CAP_WRITEBACK:                Supports dirty page writeback, and dirty pages
 *                                should contribute to accounting
 * BDI_CAP_WRITEBACK_ACCT:        Automatically account writeback pages
 * BDI_CAP_STRICTLIMIT:                Keep number of dirty pages below bdi threshold
 */
#define BDI_CAP_WRITEBACK                (1 << 0)
#define BDI_CAP_WRITEBACK_ACCT                (1 << 1)
#define BDI_CAP_STRICTLIMIT                (1 << 2)

extern struct backing_dev_info noop_backing_dev_info;

int bdi_init(struct backing_dev_info *bdi);

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @wb: bdi_writeback of interest
 *
 * Determine whether there is writeback waiting to be handled against a
 * bdi_writeback.
 */
static inline bool writeback_in_progress(struct bdi_writeback *wb)
{
        /* The answer is the WB_writeback_running bit of @wb->state. */
        const bool running = test_bit(WB_writeback_running, &wb->state);

        return running;
}

struct backing_dev_info *inode_to_bdi(struct inode *inode);

static inline bool mapping_can_writeback(struct address_space *mapping)
{
        return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK;
}

#ifdef CONFIG_CGROUP_WRITEBACK

struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css);
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
                                    struct cgroup_subsys_state *memcg_css,
                                    gfp_t gfp);
void wb_memcg_offline(struct mem_cgroup *memcg);
void wb_blkcg_offline(struct cgroup_subsys_state *css);

/**
 * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
 * @inode: inode of interest
 *
 * Cgroup writeback requires support from the filesystem.  Also, both memcg and
 * iocg have to be on the default hierarchy.  Test whether all conditions are
 * met.
 *
 * Note that the test result may change dynamically on the same inode
 * depending on how memcg and iocg are configured.
 */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        struct backing_dev_info *bdi = inode_to_bdi(inode);

        return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
                cgroup_subsys_on_dfl(io_cgrp_subsys) &&
                (bdi->capabilities & BDI_CAP_WRITEBACK) &&
                (inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

/**
 * wb_find_current - find wb for %current on a bdi
 * @bdi: bdi of interest
 *
 * Find the wb of @bdi which matches both the memcg and blkcg of %current.
 * Must be called under rcu_read_lock() which protects the returned wb.
 * NULL if not found.
 */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct cgroup_subsys_state *memcg_css;
        struct bdi_writeback *wb;

        memcg_css = task_css(current, memory_cgrp_id);
        /* A memcg css without a parent is served by the wb embedded in @bdi. */
        if (!memcg_css->parent)
                return &bdi->wb;

        /* Otherwise look the wb up in @bdi->cgwb_tree, keyed by the css id. */
        wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);

        /*
         * %current's blkcg equals the effective blkcg of its memcg.  No
         * need to use the relatively expensive cgroup_get_e_css().
         */
        if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
                return wb;
        /* Nothing in the tree, or the wb's blkcg css is not %current's. */
        return NULL;
}

/**
 * wb_get_create_current - get or create wb for %current on a bdi
 * @bdi: bdi of interest
 * @gfp: allocation mask
 *
 * Equivalent to wb_get_create() on %current's memcg.  This function is
 * called from a relatively hot path and optimizes the common cases using
 * wb_find_current().
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct bdi_writeback *wb;

        /*
         * Fast path: look the wb up under rcu_read_lock() and try to take a
         * reference.  A failed wb_tryget() is handled exactly like a miss.
         */
        rcu_read_lock();
        wb = wb_find_current(bdi);
        if (wb && unlikely(!wb_tryget(wb)))
                wb = NULL;
        rcu_read_unlock();

        /*
         * Slow path: hold a reference on %current's memcg css across the
         * wb_get_create() call and drop it again afterwards.
         */
        if (unlikely(!wb)) {
                struct cgroup_subsys_state *memcg_css;

                memcg_css = task_get_css(current, memory_cgrp_id);
                wb = wb_get_create(bdi, memcg_css, gfp);
                css_put(memcg_css);
        }
        return wb;
}

/**
 * inode_to_wb - determine the wb of an inode
 * @inode: inode of interest
 *
 * Returns the wb @inode is currently associated with.  The caller must be
 * holding either @inode->i_lock, the i_pages lock, or the
 * associated wb's list_lock.
 */
static inline struct bdi_writeback *inode_to_wb(const struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
        /*
         * Lockdep builds only: warn once when debug_locks is still set, the
         * superblock carries SB_I_CGROUPWB, and none of inode->i_lock, the
         * i_pages xa_lock or the current wb's list_lock is held.
         */
        WARN_ON_ONCE(debug_locks &&
                     (inode->i_sb->s_iflags & SB_I_CGROUPWB) &&
                     (!lockdep_is_held(&inode->i_lock) &&
                      !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) &&
                      !lockdep_is_held(&inode->i_wb->list_lock)));
#endif
        /* The association itself is just the inode's i_wb pointer. */
        return inode->i_wb;
}

/*
 * Pick the wb to use for @inode during the writeback described by @wbc:
 * the wb recorded in @wbc when there is one, else the bdi's embedded wb.
 */
static inline struct bdi_writeback *inode_to_wb_wbc(
                                struct inode *inode,
                                struct writeback_control *wbc)
{
        if (wbc->wb)
                return wbc->wb;

        /*
         * Nothing is attached to the wbc, which means cgroup writeback was
         * disabled when the wbc started; fall back to the default wb.
         */
        return &inode_to_bdi(inode)->wb;
}

/**
 * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
 * @inode: target inode
 * @cookie: output param, to be passed to the end function
 *
 * The caller wants to access the wb associated with @inode but isn't
 * holding inode->i_lock, the i_pages lock or wb->list_lock.  This
 * function determines the wb associated with @inode and ensures that the
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
 * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
 * can't sleep during the transaction.  IRQs may or may not be disabled on
 * return.
 */
static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        /* Held until unlocked_inode_to_wb_end() drops it. */
        rcu_read_lock();

        /*
         * Paired with store_release in inode_switch_wbs_work_fn() and
         * ensures that we see the new wb if we see cleared I_WB_SWITCH.
         */
        cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;

        /*
         * I_WB_SWITCH was set: take the i_pages lock instead, stashing the
         * saved IRQ flags in @cookie for unlocked_inode_to_wb_end().
         */
        if (unlikely(cookie->locked))
                xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags);

        /*
         * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages
         * lock.  inode_to_wb() will bark.  Deref directly.
         */
        return inode->i_wb;
}

/**
 * unlocked_inode_to_wb_end - end inode wb access transaction
 * @inode: target inode
 * @cookie: @cookie from unlocked_inode_to_wb_begin()
 */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
        if (unlikely(cookie->locked))
                xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags);

        rcu_read_unlock();
}

#else        /* CONFIG_CGROUP_WRITEBACK */

/* !CONFIG_CGROUP_WRITEBACK stub: cgroup writeback is never enabled. */
static inline bool inode_cgwb_enabled(struct inode *inode)
{
        return false;
}

/* !CONFIG_CGROUP_WRITEBACK stub: always the wb embedded in @bdi. */
static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
{
        struct bdi_writeback *embedded = &bdi->wb;

        return embedded;
}

/*
 * !CONFIG_CGROUP_WRITEBACK stub: nothing is created and @gfp is unused;
 * the wb embedded in @bdi is handed back.
 */
static inline struct bdi_writeback *
wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
{
        struct bdi_writeback *embedded = &bdi->wb;

        return embedded;
}

static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
{
        return &inode_to_bdi(inode)->wb;
}

static inline struct bdi_writeback *inode_to_wb_wbc(
                                struct inode *inode,
                                struct writeback_control *wbc)
{
        return inode_to_wb(inode);
}


static inline struct bdi_writeback *
unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
{
        return inode_to_wb(inode);
}

/* !CONFIG_CGROUP_WRITEBACK stub: the begin side took nothing, so no-op. */
static inline void unlocked_inode_to_wb_end(struct inode *inode,
                                            struct wb_lock_cookie *cookie)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally empty. */
static inline void wb_memcg_offline(struct mem_cgroup *memcg)
{
}

/* !CONFIG_CGROUP_WRITEBACK stub: intentionally empty. */
static inline void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
}

#endif        /* CONFIG_CGROUP_WRITEBACK */

const char *bdi_dev_name(struct backing_dev_info *bdi);

#endif        /* _LINUX_BACKING_DEV_H */


















































    2 
    3 
    1 







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_FILEATTR_H
#define _LINUX_FILEATTR_H

/* Flags shared between flags/xflags */
#define FS_COMMON_FL \
        (FS_SYNC_FL | FS_IMMUTABLE_FL | FS_APPEND_FL | \
         FS_NODUMP_FL |        FS_NOATIME_FL | FS_DAX_FL | \
         FS_PROJINHERIT_FL)

/* The FS_XFLAG_* counterparts of the FS_COMMON_FL bits, in the same order. */
#define FS_XFLAG_COMMON \
        (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND | \
         FS_XFLAG_NODUMP | FS_XFLAG_NOATIME | FS_XFLAG_DAX | \
         FS_XFLAG_PROJINHERIT)

/*
 * Merged interface for miscellaneous file attributes.  'flags' originates from
 * ext* and 'fsx_flags' from xfs.  There's some overlap between the two, which
 * is handled by the VFS helpers, so filesystems are free to implement just one
 * or both of these sub-interfaces.
 */
struct fileattr {
        u32        flags;                /* flags (FS_IOC_GETFLAGS/FS_IOC_SETFLAGS) */
        /* struct fsxattr: */
        u32        fsx_xflags;        /* xflags field value (get/set) */
        u32        fsx_extsize;        /* extsize field value (get/set)*/
        u32        fsx_nextents;        /* nextents field value (get)        */
        u32        fsx_projid;        /* project identifier (get/set) */
        u32        fsx_cowextsize;        /* CoW extsize field value (get/set)*/
        /* selectors: */
        /* NOTE(review): presumably marks 'flags' as populated - confirm */
        bool        flags_valid:1;
        /* fileattr_has_fsx() only looks at the fsx_* fields when this is set */
        bool        fsx_valid:1;
};

int copy_fsxattr_to_user(const struct fileattr *fa, struct fsxattr __user *ufa);

void fileattr_fill_xflags(struct fileattr *fa, u32 xflags);
void fileattr_fill_flags(struct fileattr *fa, u32 flags);

/**
 * fileattr_has_fsx - check for extended flags/attributes
 * @fa:                fileattr pointer
 *
 * Return: true if any attributes are present that are not represented in
 * ->flags.
 */
static inline bool fileattr_has_fsx(const struct fileattr *fa)
{
        return fa->fsx_valid &&
                ((fa->fsx_xflags & ~FS_XFLAG_COMMON) || fa->fsx_extsize != 0 ||
                 fa->fsx_projid != 0 ||        fa->fsx_cowextsize != 0);
}

int vfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
                     struct fileattr *fa);

#endif /* _LINUX_FILEATTR_H */



























































































































































































































































































































































































    4 


    4 

    2 


    2 









































































































    4 









    2 


    1 








    1 




















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include <linux/dax.h>
#include <linux/overflow.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

/*
 * Performs necessary checks before doing a clone.
 *
 * Can adjust amount of bytes to clone via @req_count argument.
 * Returns appropriate error code that caller should return or
 * zero in case the clone should be allowed.
 */
static int generic_remap_checks(struct file *file_in, loff_t pos_in,
                                struct file *file_out, loff_t pos_out,
                                loff_t *req_count, unsigned int remap_flags)
{
        struct inode *inode_in = file_in->f_mapping->host;
        struct inode *inode_out = file_out->f_mapping->host;
        /* Unsigned working copy; the wrap check below relies on that. */
        uint64_t count = *req_count;
        /* Block-rounded length, used only for the same-file overlap test. */
        uint64_t bcount;
        loff_t size_in, size_out;
        /* Alignment is judged against the destination sb's block size. */
        loff_t bs = inode_out->i_sb->s_blocksize;
        int ret;

        /* The start of both ranges must be aligned to an fs block. */
        if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
                return -EINVAL;

        /* Ensure offsets don't wrap. */
        if (pos_in + count < pos_in || pos_out + count < pos_out)
                return -EINVAL;

        size_in = i_size_read(inode_in);
        size_out = i_size_read(inode_out);

        /* Dedupe requires both ranges to be within EOF. */
        if ((remap_flags & REMAP_FILE_DEDUP) &&
            (pos_in >= size_in || pos_in + count > size_in ||
             pos_out >= size_out || pos_out + count > size_out))
                return -EINVAL;

        /* Ensure the infile range is within the infile. */
        if (pos_in >= size_in)
                return -EINVAL;
        /* Clamp the request so it does not run past the infile's EOF. */
        count = min(count, size_in - (uint64_t)pos_in);

        /*
         * generic_write_check_limits() receives &count, so it may adjust
         * the length as well as reject the destination range outright.
         */
        ret = generic_write_check_limits(file_out, pos_out, &count);
        if (ret)
                return ret;

        /*
         * If the user wanted us to link to the infile's EOF, round up to the
         * next block boundary for this check.
         *
         * Otherwise, make sure the count is also block-aligned, having
         * already confirmed the starting offsets' block alignment.
         */
        if (pos_in + count == size_in &&
            (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
                bcount = ALIGN(size_in, bs) - pos_in;
        } else {
                if (!IS_ALIGNED(count, bs))
                        count = ALIGN_DOWN(count, bs);
                bcount = count;
        }

        /* Don't allow overlapped cloning within the same file. */
        if (inode_in == inode_out &&
            pos_out + bcount > pos_in &&
            pos_out < pos_in + bcount)
                return -EINVAL;

        /*
         * We shortened the request but the caller can't deal with that, so
         * bounce the request back to userspace.
         */
        if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
                return -EINVAL;

        /* Hand the (possibly shortened) length back to the caller. */
        *req_count = count;
        return 0;
}

/*
 * Validate the byte range [@pos, @pos + @len) of @file for a remap and run
 * the permission hooks for it.
 *
 * @write selects MAY_WRITE over MAY_READ as the access mask.  Returns
 * -EINVAL for a negative @pos or @len or when @pos + @len overflows,
 * otherwise whatever security_file_permission() or, if that passes,
 * fsnotify_file_area_perm() returns.
 */
int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write)
{
        loff_t end;
        int access;
        int err;

        /* Negative offsets and lengths are never acceptable. */
        if (unlikely(pos < 0 || len < 0))
                return -EINVAL;

        /* The end of the range has to be representable as well. */
        if (unlikely(check_add_overflow(pos, len, &end)))
                return -EINVAL;

        access = write ? MAY_WRITE : MAY_READ;

        err = security_file_permission(file, access);
        if (err)
                return err;

        return fsnotify_file_area_perm(file, access, &pos, len);
}
EXPORT_SYMBOL_GPL(remap_verify_area);

/*
 * Ensure that we don't remap a partial EOF block in the middle of something
 * else.  Assume that the offsets have already been checked for block
 * alignment.
 *
 * For clone we only link a partial EOF block above or at the destination file's
 * EOF.  For deduplication we accept a partial EOF block only if it ends at the
 * destination file's EOF (can not link it into the middle of a file).
 *
 * Shorten the request if possible.
 */
static int generic_remap_check_len(struct inode *inode_in,
                                   struct inode *inode_out,
                                   loff_t pos_out,
                                   loff_t *len,
                                   unsigned int remap_flags)
{
        u64 blkmask = i_blocksize(inode_in) - 1;

        /* A block-aligned length never contains a partial block. */
        if (!(*len & blkmask))
                return 0;

        /*
         * The partial tail is acceptable as long as the request does not
         * end before the destination's EOF.
         */
        if (pos_out + *len >= i_size_read(inode_out))
                return 0;

        /*
         * The tail would land in the middle of the destination, so the
         * request has to lose it.  Callers that cannot cope with a shorter
         * length get an error instead.
         */
        if (!(remap_flags & REMAP_FILE_CAN_SHORTEN))
                return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;

        *len &= ~blkmask;
        return 0;
}

/* Read a page's worth of file data into the page cache. */
static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
{
        /* Convert the byte offset into a page-cache index. */
        pgoff_t index = pos >> PAGE_SHIFT;

        return read_mapping_folio(file->f_mapping, index, file);
}

/*
 * Lock two folios, ensuring that we lock in offset order if the folios
 * are from the same file.
 */
static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2)
{
        struct folio *first = folio1;
        struct folio *second = folio2;

        /* The folio with the lower index is always locked first. */
        if (first->index > second->index) {
                first = folio2;
                second = folio1;
        }

        folio_lock(first);
        /* Both arguments may name the same folio; lock it only once. */
        if (second != first)
                folio_lock(second);
}

/* Unlock two folios, being careful not to unlock the same folio twice. */
static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
{
        folio_unlock(folio1);
        if (folio1 != folio2)
                folio_unlock(folio2);
}

/*
 * Compare extents of two files to see if they are the same.
 * Caller must have locked both inodes to prevent write races.
 */
static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
                                         struct file *dest, loff_t dstoff,
                                         loff_t len, bool *is_same)
{
        bool same = true;
        /* Returned as-is if a chunk length ever computes as <= 0. */
        int error = -EINVAL;

        while (len) {
                struct folio *src_folio, *dst_folio;
                void *src_addr, *dst_addr;
                /*
                 * Compare one chunk per iteration: no further than the end
                 * of the current page in either file ...
                 */
                loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff),
                                     PAGE_SIZE - offset_in_page(dstoff));

                /* ... and no more than what is left of the request. */
                cmp_len = min(cmp_len, len);
                if (cmp_len <= 0)
                        goto out_error;

                src_folio = vfs_dedupe_get_folio(src, srcoff);
                if (IS_ERR(src_folio)) {
                        error = PTR_ERR(src_folio);
                        goto out_error;
                }
                dst_folio = vfs_dedupe_get_folio(dest, dstoff);
                if (IS_ERR(dst_folio)) {
                        error = PTR_ERR(dst_folio);
                        /* Drop the source folio reference taken above. */
                        folio_put(src_folio);
                        goto out_error;
                }

                vfs_lock_two_folios(src_folio, dst_folio);

                /*
                 * Now that we've locked both folios, make sure they're still
                 * mapped to the file data we're interested in.  If not,
                 * someone is invalidating pages on us and we lose.
                 */
                if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
                    src_folio->mapping != src->f_mapping ||
                    dst_folio->mapping != dest->f_mapping) {
                        same = false;
                        goto unlock;
                }

                /* Map both folios at the offsets being compared. */
                src_addr = kmap_local_folio(src_folio,
                                        offset_in_folio(src_folio, srcoff));
                dst_addr = kmap_local_folio(dst_folio,
                                        offset_in_folio(dst_folio, dstoff));

                flush_dcache_folio(src_folio);
                flush_dcache_folio(dst_folio);

                if (memcmp(src_addr, dst_addr, cmp_len))
                        same = false;

                /* Unmap in the reverse order of mapping. */
                kunmap_local(dst_addr);
                kunmap_local(src_addr);
unlock:
                vfs_unlock_two_folios(src_folio, dst_folio);
                folio_put(dst_folio);
                folio_put(src_folio);

                /* A mismatch (or invalidated folio) ends the comparison. */
                if (!same)
                        break;

                srcoff += cmp_len;
                dstoff += cmp_len;
                len -= cmp_len;
        }

        /* A finished comparison returns 0; the verdict goes in *is_same. */
        *is_same = same;
        return 0;

out_error:
        /* *is_same is left unwritten on the error path. */
        return error;
}

/*
 * Check that the two inodes are eligible for cloning, the ranges make
 * sense, and then flush all dirty data.  Caller must ensure that the
 * inodes have been locked against any other modifications.
 *
 * If there's an error, then the usual negative error code is returned.
 * Otherwise returns 0 with *len set to the request length.
 */
int
__generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                struct file *file_out, loff_t pos_out,
                                loff_t *len, unsigned int remap_flags,
                                const struct iomap_ops *dax_read_ops)
{
        struct inode *inode_in = file_inode(file_in);
        struct inode *inode_out = file_inode(file_out);
        bool same_inode = (inode_in == inode_out);
        int ret;

        /* Don't touch certain kinds of inodes */
        if (IS_IMMUTABLE(inode_out))
                return -EPERM;

        if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
                return -ETXTBSY;

        /* Don't reflink dirs, pipes, sockets... */
        if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
                return -EISDIR;
        if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
                return -EINVAL;

        /* Zero length dedupe exits immediately; reflink goes to EOF. */
        if (*len == 0) {
                loff_t isize = i_size_read(inode_in);

                if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
                        return 0;
                if (pos_in > isize)
                        return -EINVAL;
                /* Expand the zero length to "everything up to the infile's EOF". */
                *len = isize - pos_in;
                if (*len == 0)
                        return 0;
        }

        /* Check that we don't violate system file offset limits. */
        ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
                        remap_flags);
        /* An error, or a request shortened to nothing, ends the prep here. */
        if (ret || *len == 0)
                return ret;

        /* Wait for the completion of any pending IOs on both files */
        inode_dio_wait(inode_in);
        if (!same_inode)
                inode_dio_wait(inode_out);

        /* Write back and wait on the source range ... */
        ret = filemap_write_and_wait_range(inode_in->i_mapping,
                        pos_in, pos_in + *len - 1);
        if (ret)
                return ret;

        /* ... and on the destination range. */
        ret = filemap_write_and_wait_range(inode_out->i_mapping,
                        pos_out, pos_out + *len - 1);
        if (ret)
                return ret;

        /*
         * Check that the extents are the same.
         */
        if (remap_flags & REMAP_FILE_DEDUP) {
                bool                is_same = false;

                /*
                 * Non-DAX source inodes are compared through the page cache.
                 * DAX ones need the caller-supplied @dax_read_ops; without
                 * them the dedupe request is refused.
                 */
                if (!IS_DAX(inode_in))
                        ret = vfs_dedupe_file_range_compare(file_in, pos_in,
                                        file_out, pos_out, *len, &is_same);
                else if (dax_read_ops)
                        ret = dax_dedupe_file_range_compare(inode_in, pos_in,
                                        inode_out, pos_out, *len, &is_same,
                                        dax_read_ops);
                else
                        return -EINVAL;
                if (ret)
                        return ret;
                /* Differing contents are reported as -EBADE. */
                if (!is_same)
                        return -EBADE;
        }

        /* Reject or trim a partial trailing block that ends short of dest EOF. */
        ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
                        remap_flags);
        if (ret || *len == 0)
                return ret;

        /* If can't alter the file contents, we're done. */
        if (!(remap_flags & REMAP_FILE_DEDUP))
                ret = file_modified(file_out);

        return ret;
}

/*
 * Wrapper around __generic_remap_file_range_prep() for callers that have no
 * DAX read ops to supply: NULL is passed for @dax_read_ops.
 */
int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
                                  struct file *file_out, loff_t pos_out,
                                  loff_t *len, unsigned int remap_flags)
{
        const struct iomap_ops *no_dax_ops = NULL;

        return __generic_remap_file_range_prep(file_in, pos_in, file_out,
                                               pos_out, len, remap_flags,
                                               no_dax_ops);
}
EXPORT_SYMBOL(generic_remap_file_range_prep);

/*
 * Clone @len bytes at @pos_in of @file_in into @file_out at @pos_out using
 * the source file's ->remap_file_range() method.
 *
 * Returns the method's non-negative result on success, after sending
 * fsnotify access/modify events.  Otherwise returns a negative errno:
 * -EXDEV when the files sit on different superblocks, -EOPNOTSUPP when the
 * method is missing, or the error from one of the checks or the method.
 */
loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
                            struct file *file_out, loff_t pos_out,
                            loff_t len, unsigned int remap_flags)
{
        loff_t cloned;
        int err;

        /* Dedupe requests must not arrive through this entry point. */
        WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);

        if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
                return -EXDEV;

        err = generic_file_rw_checks(file_in, file_out);
        if (err < 0)
                return err;

        if (!file_in->f_op->remap_file_range)
                return -EOPNOTSUPP;

        /* Source range first, destination range only if that passed. */
        err = remap_verify_area(file_in, pos_in, len, false);
        if (!err)
                err = remap_verify_area(file_out, pos_out, len, true);
        if (err)
                return err;

        /* The method call is bracketed by file_start_write()/file_end_write(). */
        file_start_write(file_out);
        cloned = file_in->f_op->remap_file_range(file_in, pos_in,
                        file_out, pos_out, len, remap_flags);
        file_end_write(file_out);

        /* Notifications are sent for successful clones only. */
        if (cloned >= 0) {
                fsnotify_access(file_in);
                fsnotify_modify(file_out);
        }
        return cloned;
}
EXPORT_SYMBOL(vfs_clone_file_range);

/* Check whether we are allowed to dedupe the destination file */
static bool may_dedupe_file(struct file *file)
{
        struct mnt_idmap *idmap = file_mnt_idmap(file);
        struct inode *inode = file_inode(file);

        /*
         * Any one of the following is sufficient, tested in this order:
         * CAP_SYS_ADMIN, a file opened for writing, the caller's fsuid
         * matching the (idmapped) inode owner, or write permission on the
         * inode.
         */
        return capable(CAP_SYS_ADMIN) ||
               (file->f_mode & FMODE_WRITE) ||
               vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid()) ||
               !inode_permission(idmap, inode, MAY_WRITE);
}

/*
 * Dedupe @len bytes at @src_pos of @src_file against @dst_file at @dst_pos
 * through the destination's ->remap_file_range() method, which is called
 * with REMAP_FILE_DEDUP or'ed into @remap_flags.
 *
 * Returns the method's result, 0 for a zero-length request, or a negative
 * errno from one of the checks below.
 */
loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
                                 struct file *dst_file, loff_t dst_pos,
                                 loff_t len, unsigned int remap_flags)
{
        loff_t ret;

        WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
                                     REMAP_FILE_CAN_SHORTEN));

        /*
         * Redundant when the caller is vfs_dedupe_file_range(), but other
         * callers need these checks and this path is not performance
         * sensitive.
         */
        ret = remap_verify_area(src_file, src_pos, len, false);
        if (ret)
                return ret;

        ret = remap_verify_area(dst_file, dst_pos, len, true);
        if (ret)
                return ret;

        /*
         * Ordering matters here: this has to come after remap_verify_area()
         * because of sb_start_write(), and before may_dedupe_file() because
         * the mount's MAY_WRITE needs to be checked with
         * mnt_get_write_access_file() held.
         */
        ret = mnt_want_write_file(dst_file);
        if (ret)
                return ret;

        if (!may_dedupe_file(dst_file)) {
                ret = -EPERM;
        } else if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb) {
                ret = -EXDEV;
        } else if (S_ISDIR(file_inode(dst_file)->i_mode)) {
                ret = -EISDIR;
        } else if (!dst_file->f_op->remap_file_range) {
                ret = -EINVAL;
        } else if (len == 0) {
                /* Nothing to dedupe; succeed without calling the method. */
                ret = 0;
        } else {
                ret = dst_file->f_op->remap_file_range(src_file, src_pos,
                                dst_file, dst_pos, len,
                                remap_flags | REMAP_FILE_DEDUP);
        }

        /* Every path that took mount write access drops it again here. */
        mnt_drop_write_file(dst_file);

        return ret;
}
EXPORT_SYMBOL(vfs_dedupe_file_range_one);

/*
 * Dedupe the source range described by @same against each destination
 * listed in same->info[].
 *
 * The source must be opened for reading, be a regular file, implement
 * ->remap_file_range(), and the range must lie within i_size. The length is
 * capped at 1G per request.
 *
 * Per-destination results are reported through info->status and
 * info->bytes_deduped:
 *   - every entry starts as FILE_DEDUPE_RANGE_SAME with 0 bytes deduped;
 *   - an empty fd yields -EBADF, a nonzero reserved field -EINVAL;
 *   - -EBADE from vfs_dedupe_file_range_one() becomes
 *     FILE_DEDUPE_RANGE_DIFFERS, any other negative value is stored as-is;
 *   - otherwise bytes_deduped is set to the (capped) request length.
 *
 * Processing stops early once a fatal signal is pending.
 *
 * Returns 0 once the per-destination loop has run, or a negative errno when
 * one of the up-front checks on the source fails.
 */
int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
{
        struct inode *src = file_inode(file);
        struct file_dedupe_range_info *info;
        u16 count = same->dest_count;
        u64 off;
        u64 len;
        int ret;
        int i;

        if (!(file->f_mode & FMODE_READ))
                return -EINVAL;

        if (same->reserved1 || same->reserved2)
                return -EINVAL;

        off = same->src_offset;
        len = same->src_length;

        if (S_ISDIR(src->i_mode))
                return -EISDIR;
        if (!S_ISREG(src->i_mode))
                return -EINVAL;
        if (!file->f_op->remap_file_range)
                return -EOPNOTSUPP;

        ret = remap_verify_area(file, off, len, false);
        if (ret < 0)
                return ret;

        if (off + len > i_size_read(src))
                return -EINVAL;

        /* Arbitrary 1G limit on a single dedupe request, can be raised. */
        len = min_t(u64, len, 1 << 30);

        /* Give every output slot a sane default before doing any work. */
        for (i = 0, info = same->info; i < count; i++, info++) {
                info->bytes_deduped = 0ULL;
                info->status = FILE_DEDUPE_RANGE_SAME;
        }

        for (i = 0, info = same->info; i < count; i++, info++) {
                CLASS(fd, dst_fd)(info->dest_fd);

                if (fd_empty(dst_fd)) {
                        info->status = -EBADF;
                } else if (info->reserved) {
                        info->status = -EINVAL;
                } else {
                        loff_t deduped;

                        deduped = vfs_dedupe_file_range_one(file, off,
                                                            fd_file(dst_fd),
                                                            info->dest_offset,
                                                            len,
                                                            REMAP_FILE_CAN_SHORTEN);
                        if (deduped == -EBADE)
                                info->status = FILE_DEDUPE_RANGE_DIFFERS;
                        else if (deduped < 0)
                                info->status = deduped;
                        else
                                info->bytes_deduped = len;
                }

                /* Checked after every destination, including skipped ones. */
                if (fatal_signal_pending(current))
                        break;
        }

        return 0;
}
EXPORT_SYMBOL(vfs_dedupe_file_range);


























   54 







   54 


































































  159 


















  159 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Based on arch/arm/include/asm/pgalloc.h
 *
 * Copyright (C) 2000-2001 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_PGALLOC_H
#define __ASM_PGALLOC_H

#include <asm/pgtable-hwdef.h>
#include <asm/processor.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

#define __HAVE_ARCH_PGD_FREE
#define __HAVE_ARCH_PUD_FREE
#include <asm-generic/pgalloc.h>

/* Size in bytes of PTRS_PER_PGD entries of type pgd_t. */
#define PGD_SIZE        (PTRS_PER_PGD * sizeof(pgd_t))

#if CONFIG_PGTABLE_LEVELS > 2

/*
 * Write a PUD entry at @pudp built from the physical address @pmdp combined
 * with the attribute bits in @prot.
 */
static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
{
        pud_t entry = __pud(__phys_to_pud_val(pmdp) | prot);

        set_pud(pudp, entry);
}

/*
 * Point the PUD entry @pudp at the PMD table @pmdp (converted with __pa()).
 * The entry is always PUD_TYPE_TABLE | PUD_TABLE_AF; PUD_TABLE_UXN is added
 * for init_mm and PUD_TABLE_PXN for every other mm.
 */
static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp)
{
        pudval_t prot = PUD_TYPE_TABLE | PUD_TABLE_AF;

        if (mm == &init_mm)
                prot |= PUD_TABLE_UXN;
        else
                prot |= PUD_TABLE_PXN;

        __pud_populate(pudp, __pa(pmdp), prot);
}
#else
/*
 * Stub used when CONFIG_PGTABLE_LEVELS is not greater than 2: the body is
 * only BUILD_BUG(), so this variant is not meant to be called.
 */
static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot)
{
        BUILD_BUG();
}
#endif        /* CONFIG_PGTABLE_LEVELS > 2 */

#if CONFIG_PGTABLE_LEVELS > 3

/*
 * Write a P4D entry at @p4dp built from the physical address @pudp combined
 * with @prot. Does nothing unless pgtable_l4_enabled() is true.
 */
static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
{
        if (!pgtable_l4_enabled())
                return;

        set_p4d(p4dp, __p4d(__phys_to_p4d_val(pudp) | prot));
}

/*
 * Point the P4D entry @p4dp at the PUD table @pudp (converted with __pa()).
 * The entry is always P4D_TYPE_TABLE | P4D_TABLE_AF; P4D_TABLE_UXN is added
 * for init_mm and P4D_TABLE_PXN for every other mm.
 */
static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp)
{
        p4dval_t prot = P4D_TYPE_TABLE | P4D_TABLE_AF;

        if (mm == &init_mm)
                prot |= P4D_TABLE_UXN;
        else
                prot |= P4D_TABLE_PXN;

        __p4d_populate(p4dp, __pa(pudp), prot);
}

/*
 * Release the PUD table @pud through __pud_free(), but only when
 * pgtable_l4_enabled() is true; otherwise this is a no-op.
 */
static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
        if (pgtable_l4_enabled())
                __pud_free(mm, pud);
}
#else
/*
 * Stub used when CONFIG_PGTABLE_LEVELS is not greater than 3: the body is
 * only BUILD_BUG(), so this variant is not meant to be called.
 */
static inline void __p4d_populate(p4d_t *p4dp, phys_addr_t pudp, p4dval_t prot)
{
        BUILD_BUG();
}
#endif        /* CONFIG_PGTABLE_LEVELS > 3 */

#if CONFIG_PGTABLE_LEVELS > 4

/*
 * Write a PGD entry at @pgdp built from the physical address @p4dp combined
 * with @prot. Does nothing unless pgtable_l5_enabled() is true.
 */
static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
{
        if (!pgtable_l5_enabled())
                return;

        set_pgd(pgdp, __pgd(__phys_to_pgd_val(p4dp) | prot));
}

/*
 * Point the PGD entry @pgdp at the P4D table @p4dp (converted with __pa()).
 * The entry is always PGD_TYPE_TABLE | PGD_TABLE_AF; PGD_TABLE_UXN is added
 * for init_mm and PGD_TABLE_PXN for every other mm.
 */
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp)
{
        pgdval_t prot = PGD_TYPE_TABLE | PGD_TABLE_AF;

        if (mm == &init_mm)
                prot |= PGD_TABLE_UXN;
        else
                prot |= PGD_TABLE_PXN;

        __pgd_populate(pgdp, __pa(p4dp), prot);
}

#else
/*
 * Stub used when CONFIG_PGTABLE_LEVELS is not greater than 4: the body is
 * only BUILD_BUG(), so this variant is not meant to be called.
 */
static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t p4dp, pgdval_t prot)
{
        BUILD_BUG();
}
#endif        /* CONFIG_PGTABLE_LEVELS > 4 */

/* Declarations only; the definitions are not part of this header. */
extern pgd_t *pgd_alloc(struct mm_struct *mm);
extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp);

/*
 * Write a PMD entry at @pmdp built from the physical address @ptep combined
 * with the attribute bits in @prot.
 */
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t ptep,
                                  pmdval_t prot)
{
        pmd_t entry = __pmd(__phys_to_pmd_val(ptep) | prot);

        set_pmd(pmdp, entry);
}

/*
 * Populate the pmdp entry with a pointer to the pte.  This pmd is part
 * of the mm address space.
 *
 * @mm must be NULL or &init_mm (enforced with VM_BUG_ON). The entry is
 * written as PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN.
 */
static inline void
pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
{
        const pmdval_t prot = PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_UXN;

        VM_BUG_ON(mm && mm != &init_mm);
        __pmd_populate(pmdp, __pa(ptep), prot);
}

/*
 * Populate the pmdp entry with the page table page @ptep, converted with
 * page_to_phys().
 *
 * @mm must not be &init_mm (enforced with VM_BUG_ON). The entry is written
 * as PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN.
 */
static inline void
pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
{
        const pmdval_t prot = PMD_TYPE_TABLE | PMD_TABLE_AF | PMD_TABLE_PXN;

        VM_BUG_ON(mm == &init_mm);
        __pmd_populate(pmdp, page_to_phys(ptep), prot);
}

#endif






















































































































































































































































































































    3 
    3 































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 



    3 




    3 


































    3 











































    3 




























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































    3 
    3 









    3 


    3 



    3 

















    3 









    3 




















    3 















    3 










    3 
    3 







    3 
    3 




    3 






























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517
5518
5519
5520
5521
5522
5523
5524
5525
5526
5527
5528
5529
5530
5531
5532
5533
5534
5535
5536
5537
5538
5539
5540
5541
5542
5543
5544
5545
5546
5547
5548
5549
5550
5551
5552
5553
5554
5555
5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
5605
5606
5607
5608
5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
5708
5709
5710
5711
5712
5713
5714
5715
5716
5717
5718
5719
5720
5721
5722
5723
5724
5725
5726
5727
5728
5729
5730
5731
5732
5733
5734
5735
5736
5737
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
5753
5754
5755
5756
5757
5758
5759
5760
5761
5762
5763
5764
5765
5766
5767
5768
5769
5770
5771
5772
5773
5774
5775
5776
5777
5778
5779
5780
5781
5782
5783
5784
5785
5786
5787
5788
5789
5790
5791
5792
5793
5794
5795
5796
5797
5798
5799
5800
5801
5802
5803
5804
5805
5806
5807
5808
5809
5810
5811
5812
5813
5814
5815
5816
5817
5818
5819
5820
5821
5822
5823
5824
5825
5826
5827
5828
5829
5830
5831
5832
5833
5834
5835
5836
5837
5838
5839
5840
5841
5842
5843
5844
5845
5846
5847
5848
5849
5850
5851
5852
5853
5854
5855
5856
5857
5858
5859
5860
5861
5862
5863
5864
5865
5866
5867
5868
5869
5870
5871
5872
5873
5874
5875
5876
5877
5878
5879
5880
5881
5882
5883
5884
5885
5886
5887
5888
5889
5890
5891
5892
5893
5894
5895
5896
5897
5898
5899
5900
5901
5902
5903
5904
5905
5906
5907
5908
5909
5910
5911
5912
5913
5914
5915
5916
5917
5918
5919
5920
5921
5922
5923
5924
5925
5926
5927
5928
5929
5930
5931
5932
5933
5934
5935
5936
5937
5938
5939
5940
5941
5942
5943
5944
5945
5946
5947
5948
5949
5950
5951
5952
5953
5954
5955
5956
5957
5958
5959
5960
5961
5962
5963
5964
5965
5966
5967
5968
5969
5970
5971
5972
5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
6348
6349
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
6371
6372
6373
6374
6375
6376
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
6438
6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468
6469
6470
6471
6472
6473
6474
6475
6476
6477
6478
6479
6480
6481
6482
6483
6484
6485
6486
6487
6488
6489
6490
6491
6492
6493
6494
6495
6496
6497
6498
6499
6500
6501
6502
6503
6504
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736
6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973
6974
6975
6976
6977
6978
6979
6980
6981
6982
6983
6984
6985
6986
6987
6988
6989
6990
6991
6992
6993
6994
6995
6996
6997
6998
6999
7000
7001
7002
7003
7004
7005
7006
7007
7008
7009
7010
7011
7012
7013
7014
7015
7016
7017
7018
7019
7020
7021
7022
7023
7024
7025
7026
7027
7028
7029
7030
7031
7032
7033
7034
7035
7036
7037
7038
7039
7040
7041
7042
7043
7044
7045
7046
7047
7048
7049
7050
7051
7052
7053
7054
7055
7056
7057
7058
7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
7074
7075
7076
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087
7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101
7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
7138
7139
7140
7141
7142
7143
7144
7145
7146
7147
7148
7149
7150
7151
7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
7170
7171
7172
7173
7174
7175
7176
7177
7178
7179
7180
7181
7182
7183
7184
7185
7186
7187
7188
7189
7190
7191
7192
7193
7194
7195
7196
7197
7198
7199
7200
7201
7202
7203
7204
7205
7206
7207
7208
7209
7210
7211
7212
7213
7214
7215
7216
7217
7218
7219
7220
7221
7222
7223
7224
7225
7226
7227
7228
7229
7230
7231
7232
7233
7234
7235
7236
7237
7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
7248
7249
7250
7251
7252
7253
7254
7255
7256
7257
7258
7259
7260
7261
7262
7263
7264
7265
7266
7267
7268
7269
7270
7271
7272
7273
7274
7275
7276
7277
7278
7279
7280
7281
7282
7283
7284
7285
7286
7287
7288
7289
7290
7291
7292
7293
7294
7295
7296
7297
7298
7299
7300
7301
7302
7303
7304
7305
7306
7307
7308
7309
7310
7311
7312
7313
7314
7315
7316
7317
7318
7319
7320
7321
7322
7323
7324
7325
7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
7345
7346
7347
7348
7349
7350
7351
7352
7353
7354
7355
7356
7357
7358
7359
7360
7361
7362
7363
7364
7365
7366
7367
7368
7369
7370
7371
7372
7373
7374
7375
7376
7377
7378
7379
7380
7381
7382
7383
7384
7385
7386
7387
7388
7389
7390
7391
7392
7393
7394
7395
7396
7397
7398
7399
7400
7401
7402
7403
7404
7405
7406
7407
7408
7409
7410
7411
7412
7413
7414
7415
7416
7417
7418
7419
7420
7421
7422
7423
7424
7425
7426
7427
7428
7429
7430
7431
7432
7433
7434
7435
7436
7437
7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
7454
7455
7456
7457
7458
7459
7460
7461
7462
7463
7464
7465
7466
7467
7468
7469
7470
7471
7472
7473
7474
7475
7476
7477
7478
7479
7480
7481
7482
7483
7484
7485
7486
7487
7488
7489
7490
7491
7492
7493
7494
7495
7496
7497
7498
7499
7500
7501
7502
7503
7504
7505
7506
7507
7508
7509
7510
7511
7512
7513
7514
7515
7516
7517
7518
7519
7520
7521
7522
7523
7524
7525
7526
7527
7528
7529
7530
7531
7532
7533
7534
7535
7536
7537
7538
7539
7540
7541
7542
7543
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *        IPv6 Address [auto]configuration
 *        Linux INET6 implementation
 *
 *        Authors:
 *        Pedro Roque                <roque@di.fc.ul.pt>
 *        Alexey Kuznetsov        <kuznet@ms2.inr.ac.ru>
 */

/*
 *        Changes:
 *
 *        Janos Farkas                        :        delete timer on ifdown
 *        <chexum@bankinf.banki.hu>
 *        Andi Kleen                        :        kill double kfree on module
 *                                                unload.
 *        Maciej W. Rozycki                :        FDDI support
 *        sekiya@USAGI                        :        Don't send too many RS
 *                                                packets.
 *        yoshfuji@USAGI                        :       Fixed interval between DAD
 *                                                packets.
 *        YOSHIFUJI Hideaki @USAGI        :        improved accuracy of
 *                                                address validation timer.
 *        YOSHIFUJI Hideaki @USAGI        :        Privacy Extensions (RFC3041)
 *                                                support.
 *        Yuji SEKIYA @USAGI                :        Don't assign a same IPv6
 *                                                address on a same interface.
 *        YOSHIFUJI Hideaki @USAGI        :        ARCnet support
 *        YOSHIFUJI Hideaki @USAGI        :        convert /proc/net/if_inet6 to
 *                                                seq_file.
 *        YOSHIFUJI Hideaki @USAGI        :        improved source address
 *                                                selection; consider scope,
 *                                                status etc.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/if_arcnet.h>
#include <linux/if_infiniband.h>
#include <linux/route.h>
#include <linux/inetdevice.h>
#include <linux/init.h>
#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <linux/capability.h>
#include <linux/delay.h>
#include <linux/notifier.h>
#include <linux/string.h>
#include <linux/hash.h>

#include <net/ip_tunnels.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>

#include <net/6lowpan.h>
#include <net/firewire.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/ndisc.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/l3mdev.h>
#include <net/netdev_lock.h>
#include <linux/if_tunnel.h>
#include <linux/rtnetlink.h>
#include <linux/netconf.h>
#include <linux/random.h>
#include <linux/uaccess.h>
#include <linux/unaligned.h>

#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>
#include <linux/ioam6.h>

#define IPV6_MAX_STRLEN \
        sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")

/*
 * Convert a jiffies timestamp into the number of 1/100 second units that
 * elapsed since INITIAL_JIFFIES. The result is truncated to 32 bits by the
 * return type.
 */
static inline u32 cstamp_delta(unsigned long cstamp)
{
        unsigned long ticks = cstamp - INITIAL_JIFFIES;

        return ticks * 100UL / HZ;
}

/*
 * Randomise the initial retransmission time (RFC 3315, section 14).
 * @irt: initial retransmission time
 *
 * Returns @irt scaled by a random factor in 0.9 .. 1.1. The factor is drawn
 * in parts per million so that the whole computation stays in integers.
 */
static inline s32 rfc3315_s14_backoff_init(s32 irt)
{
        u32 factor_ppm = get_random_u32_inclusive(900000, 1100000);
        u64 scaled = factor_ppm * (u64)irt;

        do_div(scaled, 1000000);

        return (s32)scaled;
}

/*
 * Compute the next retransmission timeout (RFC 3315, section 14).
 * @rt:  current retransmission timeout
 * @mrt: maximum retransmission time
 *
 * Returns @rt scaled by a random factor in 1.9 .. 2.1. If that exceeds @mrt,
 * returns @mrt scaled by a random factor in 0.9 .. 1.1 instead. Factors are
 * drawn in parts per million to keep the arithmetic in integers.
 */
static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
{
        u64 next = get_random_u32_inclusive(1900000, 2100000) * (u64)rt;

        do_div(next, 1000000);
        if ((s32)next <= mrt)
                return (s32)next;

        /* Doubling overshot the ceiling: restart from a jittered 'mrt'. */
        next = get_random_u32_inclusive(900000, 1100000) * (u64)mrt;
        do_div(next, 1000000);

        return (s32)next;
}

/*
 * Per-device sysctl hooks. With CONFIG_SYSCTL the functions are only
 * declared here and defined elsewhere in this file; without it they
 * collapse to the no-op stubs below so callers need no #ifdef of their own.
 */
#ifdef CONFIG_SYSCTL
static int addrconf_sysctl_register(struct inet6_dev *idev);
static void addrconf_sysctl_unregister(struct inet6_dev *idev);
#else
/* Stub: nothing to register, always reports success (0). */
static inline int addrconf_sysctl_register(struct inet6_dev *idev)
{
        return 0;
}

/* Stub: nothing was registered, so there is nothing to tear down. */
static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
{
}
#endif

static void ipv6_gen_rnd_iid(struct in6_addr *addr);

static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
static int ipv6_count_addresses(const struct inet6_dev *idev);
static int ipv6_generate_stable_address(struct in6_addr *addr,
                                        u8 dad_count,
                                        const struct inet6_dev *idev);

#define IN6_ADDR_HSIZE_SHIFT        8
#define IN6_ADDR_HSIZE                (1 << IN6_ADDR_HSIZE_SHIFT)

static void addrconf_verify(struct net *net);
static void addrconf_verify_rtnl(struct net *net);

static struct workqueue_struct *addrconf_wq;

static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);

static void addrconf_type_change(struct net_device *dev,
                                 unsigned long event);
static int addrconf_ifdown(struct net_device *dev, bool unregister);

static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
                                                  int plen,
                                                  const struct net_device *dev,
                                                  u32 flags, u32 noflags,
                                                  bool no_gw);

static void addrconf_dad_start(struct inet6_ifaddr *ifp);
static void addrconf_dad_work(struct work_struct *w);
static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
                                   bool send_na);
static void addrconf_dad_run(struct inet6_dev *idev, bool restart);
static void addrconf_rs_timer(struct timer_list *t);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);

static void inet6_prefix_notify(int event, struct inet6_dev *idev,
                                struct prefix_info *pinfo);

/*
 * Statically initialised IPv6 device configuration.
 *
 * The values are the same as in ipv6_devconf_dflt below, except for
 * accept_dad, which is 0 here and 1 there.
 *
 * NOTE(review): presumably this is the initial content of the "all"
 * configuration of a network namespace - confirm against the per-net
 * init code, which is outside this part of the file.
 */
static struct ipv6_devconf ipv6_devconf __read_mostly = {
        .forwarding                = 0,
        .hop_limit                = IPV6_DEFAULT_HOPLIMIT,
        .mtu6                        = IPV6_MIN_MTU,
        .accept_ra                = 1,
        .accept_redirects        = 1,
        .autoconf                = 1,
        .force_mld_version        = 0,
        /* Values built from HZ are expressed in jiffies. */
        .mldv1_unsolicited_report_interval = 10 * HZ,
        .mldv2_unsolicited_report_interval = HZ,
        .dad_transmits                = 1,
        .rtr_solicits                = MAX_RTR_SOLICITATIONS,
        .rtr_solicit_interval        = RTR_SOLICITATION_INTERVAL,
        .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
        .rtr_solicit_delay        = MAX_RTR_SOLICITATION_DELAY,
        .use_tempaddr                = 0,
        .temp_valid_lft                = TEMP_VALID_LIFETIME,
        .temp_prefered_lft        = TEMP_PREFERRED_LIFETIME,
        .regen_min_advance        = REGEN_MIN_ADVANCE,
        .regen_max_retry        = REGEN_MAX_RETRY,
        .max_desync_factor        = MAX_DESYNC_FACTOR,
        .max_addresses                = IPV6_MAX_ADDRESSES,
        .accept_ra_defrtr        = 1,
        .ra_defrtr_metric        = IP6_RT_PRIO_USER,
        .accept_ra_from_local        = 0,
        .accept_ra_min_hop_limit= 1,
        .accept_ra_min_lft        = 0,
        .accept_ra_pinfo        = 1,
        /*
         * Router-preference fields are only set when the option is built
         * in; the route-information prefix-length limits additionally
         * require CONFIG_IPV6_ROUTE_INFO.
         */
#ifdef CONFIG_IPV6_ROUTER_PREF
        .accept_ra_rtr_pref        = 1,
        .rtr_probe_interval        = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
        .accept_ra_rt_info_min_plen = 0,
        .accept_ra_rt_info_max_plen = 0,
#endif
#endif
        .proxy_ndp                = 0,
        .accept_source_route        = 0,        /* we do not accept RH0 by default. */
        .disable_ipv6                = 0,
        /* The only field that differs from ipv6_devconf_dflt (1 there). */
        .accept_dad                = 0,
        .suppress_frag_ndisc        = 1,
        .accept_ra_mtu                = 1,
        /* No stable secret configured initially. */
        .stable_secret                = {
                .initialized = false,
        },
        .use_oif_addrs_only        = 0,
        .ignore_routes_with_linkdown = 0,
        .keep_addr_on_down        = 0,
        .seg6_enabled                = 0,
#ifdef CONFIG_IPV6_SEG6_HMAC
        .seg6_require_hmac        = 0,
#endif
        .enhanced_dad           = 1,
        .addr_gen_mode                = IN6_ADDR_GEN_MODE_EUI64,
        .disable_policy                = 0,
        .rpl_seg_enabled        = 0,
        .ioam6_enabled                = 0,
        .ioam6_id               = IOAM6_DEFAULT_IF_ID,
        .ioam6_id_wide                = IOAM6_DEFAULT_IF_ID_WIDE,
        .ndisc_evict_nocarrier        = 1,
        .ra_honor_pio_life        = 0,
        .ra_honor_pio_pflag        = 0,
};

/*
 * Statically initialised default IPv6 device configuration.
 *
 * The values are the same as in ipv6_devconf above, except for accept_dad,
 * which is 1 here and 0 there.
 *
 * NOTE(review): presumably this is the template behind
 * net->ipv6.devconf_dflt, which ipv6_add_dev() copies into every new
 * inet6_dev - confirm against the per-net init code, which is outside this
 * part of the file.
 */
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
        .forwarding                = 0,
        .hop_limit                = IPV6_DEFAULT_HOPLIMIT,
        .mtu6                        = IPV6_MIN_MTU,
        .accept_ra                = 1,
        .accept_redirects        = 1,
        .autoconf                = 1,
        .force_mld_version        = 0,
        /* Values built from HZ are expressed in jiffies. */
        .mldv1_unsolicited_report_interval = 10 * HZ,
        .mldv2_unsolicited_report_interval = HZ,
        .dad_transmits                = 1,
        .rtr_solicits                = MAX_RTR_SOLICITATIONS,
        .rtr_solicit_interval        = RTR_SOLICITATION_INTERVAL,
        .rtr_solicit_max_interval = RTR_SOLICITATION_MAX_INTERVAL,
        .rtr_solicit_delay        = MAX_RTR_SOLICITATION_DELAY,
        .use_tempaddr                = 0,
        .temp_valid_lft                = TEMP_VALID_LIFETIME,
        .temp_prefered_lft        = TEMP_PREFERRED_LIFETIME,
        .regen_min_advance        = REGEN_MIN_ADVANCE,
        .regen_max_retry        = REGEN_MAX_RETRY,
        .max_desync_factor        = MAX_DESYNC_FACTOR,
        .max_addresses                = IPV6_MAX_ADDRESSES,
        .accept_ra_defrtr        = 1,
        .ra_defrtr_metric        = IP6_RT_PRIO_USER,
        .accept_ra_from_local        = 0,
        .accept_ra_min_hop_limit= 1,
        .accept_ra_min_lft        = 0,
        .accept_ra_pinfo        = 1,
        /*
         * Router-preference fields are only set when the option is built
         * in; the route-information prefix-length limits additionally
         * require CONFIG_IPV6_ROUTE_INFO.
         */
#ifdef CONFIG_IPV6_ROUTER_PREF
        .accept_ra_rtr_pref        = 1,
        .rtr_probe_interval        = 60 * HZ,
#ifdef CONFIG_IPV6_ROUTE_INFO
        .accept_ra_rt_info_min_plen = 0,
        .accept_ra_rt_info_max_plen = 0,
#endif
#endif
        .proxy_ndp                = 0,
        .accept_source_route        = 0,        /* we do not accept RH0 by default. */
        .disable_ipv6                = 0,
        /*
         * The only field that differs from ipv6_devconf (0 there).
         * ipv6_add_dev() overrides it with -1 on IFF_NOARP and
         * IFF_LOOPBACK devices.
         */
        .accept_dad                = 1,
        .suppress_frag_ndisc        = 1,
        .accept_ra_mtu                = 1,
        /*
         * No stable secret configured initially. When a copy of the
         * defaults has it initialised, ipv6_add_dev() switches
         * addr_gen_mode to IN6_ADDR_GEN_MODE_STABLE_PRIVACY.
         */
        .stable_secret                = {
                .initialized = false,
        },
        .use_oif_addrs_only        = 0,
        .ignore_routes_with_linkdown = 0,
        .keep_addr_on_down        = 0,
        .seg6_enabled                = 0,
#ifdef CONFIG_IPV6_SEG6_HMAC
        .seg6_require_hmac        = 0,
#endif
        .enhanced_dad           = 1,
        .addr_gen_mode                = IN6_ADDR_GEN_MODE_EUI64,
        .disable_policy                = 0,
        .rpl_seg_enabled        = 0,
        .ioam6_enabled                = 0,
        .ioam6_id               = IOAM6_DEFAULT_IF_ID,
        .ioam6_id_wide                = IOAM6_DEFAULT_IF_ID_WIDE,
        .ndisc_evict_nocarrier        = 1,
        .ra_honor_pio_life        = 0,
        .ra_honor_pio_pflag        = 0,
};

/* Check if link is ready: is it up and is a valid qdisc available */
static inline bool addrconf_link_ready(const struct net_device *dev)
{
        return netif_oper_up(dev) && !qdisc_tx_is_noop(dev);
}

/*
 * Stop the router solicitation timer of @idev. A timer that was still
 * pending held a reference on @idev (taken in addrconf_mod_rs_timer()),
 * which is dropped here.
 */
static void addrconf_del_rs_timer(struct inet6_dev *idev)
{
        if (!timer_delete(&idev->rs_timer))
                return;

        __in6_dev_put(idev);
}

/*
 * Cancel the delayed DAD work of @ifp. If the work was still pending, the
 * reference it held on @ifp (taken in addrconf_mod_dad_work()) is dropped.
 */
static void addrconf_del_dad_work(struct inet6_ifaddr *ifp)
{
        if (!cancel_delayed_work(&ifp->dad_work))
                return;

        __in6_ifa_put(ifp);
}

static void addrconf_mod_rs_timer(struct inet6_dev *idev,
                                  unsigned long when)
{
        if (!mod_timer(&idev->rs_timer, jiffies + when))
                in6_dev_hold(idev);
}

static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
                                   unsigned long delay)
{
        in6_ifa_hold(ifp);
        if (mod_delayed_work(addrconf_wq, &ifp->dad_work, delay))
                in6_ifa_put(ifp);
}

/*
 * Allocate the statistics blocks of @idev: the per-CPU IPv6 counters, the
 * ICMPv6 counters and the ICMPv6 per-message counters.
 *
 * Returns 0 on success. On failure returns -ENOMEM after freeing whatever
 * had been allocated so far; the freed pointers in @idev->stats are not
 * reset.
 */
static int snmp6_alloc_dev(struct inet6_dev *idev)
{
        int cpu;

        idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT);
        if (!idev->stats.ipv6)
                return -ENOMEM;

        /* Each CPU's copy carries its own u64 stats sync object. */
        for_each_possible_cpu(cpu) {
                struct ipstats_mib *mib = per_cpu_ptr(idev->stats.ipv6, cpu);

                u64_stats_init(&mib->syncp);
        }

        /*
         * NOTE(review): this allocation uses plain GFP_KERNEL while the
         * other two use GFP_KERNEL_ACCOUNT - confirm that is intentional.
         */
        idev->stats.icmpv6dev = kzalloc(sizeof(*idev->stats.icmpv6dev),
                                        GFP_KERNEL);
        if (!idev->stats.icmpv6dev)
                goto free_ipv6;

        idev->stats.icmpv6msgdev = kzalloc(sizeof(*idev->stats.icmpv6msgdev),
                                           GFP_KERNEL_ACCOUNT);
        if (!idev->stats.icmpv6msgdev)
                goto free_icmpv6;

        return 0;

free_icmpv6:
        kfree(idev->stats.icmpv6dev);
free_ipv6:
        free_percpu(idev->stats.ipv6);
        return -ENOMEM;
}

/*
 * Allocate and initialise the IPv6 per-device state (struct inet6_dev)
 * for @dev and publish it through dev->ip6_ptr.
 *
 * Asserts that RTNL is held and that the netdev ops lock requirement is
 * satisfied (netdev_ops_assert_locked()).
 *
 * For blackhole_netdev the MTU check, snmp6_register_dev(),
 * addrconf_sysctl_register() and the multicast group joins are skipped.
 *
 * Returns the new inet6_dev with its refcount set to 1 (the reference
 * owned by the device), or an ERR_PTR():
 *   -EINVAL  dev->mtu is below IPV6_MIN_MTU
 *   -ENOMEM  an allocation or snmp6_register_dev() failed
 *   otherwise the error returned by addrconf_sysctl_register().
 */
static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
{
        struct inet6_dev *ndev;
        int err = -ENOMEM;

        ASSERT_RTNL();
        netdev_ops_assert_locked(dev);

        if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
                return ERR_PTR(-EINVAL);

        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT);
        if (!ndev)
                return ERR_PTR(err);

        rwlock_init(&ndev->lock);
        ndev->dev = dev;
        INIT_LIST_HEAD(&ndev->addr_list);
        timer_setup(&ndev->rs_timer, addrconf_rs_timer, 0);
        /* Start from this namespace's default per-device configuration */
        memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));

        if (ndev->cnf.stable_secret.initialized)
                ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;

        ndev->cnf.mtu6 = dev->mtu;
        ndev->ra_mtu = 0;
        ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
        if (!ndev->nd_parms) {
                /* Only ndev itself has been allocated so far */
                kfree(ndev);
                return ERR_PTR(err);
        }
        if (ndev->cnf.forwarding)
                netif_disable_lro(dev);
        /* We refer to the device */
        netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL);

        if (snmp6_alloc_dev(ndev) < 0) {
                netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
                           __func__);
                /* Undo by hand: nd_parms, the device hold, then ndev */
                neigh_parms_release(&nd_tbl, ndev->nd_parms);
                netdev_put(dev, &ndev->dev_tracker);
                kfree(ndev);
                return ERR_PTR(err);
        }

        /* From here on, failures unwind through the err_release label */
        if (dev != blackhole_netdev) {
                if (snmp6_register_dev(ndev) < 0) {
                        netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
                                   __func__, dev->name);
                        goto err_release;
                }
        }
        /* One reference from device. */
        refcount_set(&ndev->refcnt, 1);

        if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
                ndev->cnf.accept_dad = -1;

#if IS_ENABLED(CONFIG_IPV6_SIT)
        if (dev->type == ARPHRD_SIT && (dev->priv_flags & IFF_ISATAP)) {
                pr_info("%s: Disabled Multicast RS\n", dev->name);
                ndev->cnf.rtr_solicits = 0;
        }
#endif

        INIT_LIST_HEAD(&ndev->tempaddr_list);
        ndev->desync_factor = U32_MAX;
        /* use_tempaddr is forced to -1 on loopback and these tunnel types */
        if ((dev->flags&IFF_LOOPBACK) ||
            dev->type == ARPHRD_TUNNEL ||
            dev->type == ARPHRD_TUNNEL6 ||
            dev->type == ARPHRD_SIT ||
            dev->type == ARPHRD_NONE) {
                ndev->cnf.use_tempaddr = -1;
        }

        ndev->token = in6addr_any;

        if (netif_running(dev) && addrconf_link_ready(dev))
                ndev->if_flags |= IF_READY;

        ipv6_mc_init_dev(ndev);
        ndev->tstamp = jiffies;
        if (dev != blackhole_netdev) {
                err = addrconf_sysctl_register(ndev);
                if (err) {
                        /* Also undo the multicast init and snmp6 registration */
                        ipv6_mc_destroy_dev(ndev);
                        snmp6_unregister_dev(ndev);
                        goto err_release;
                }
        }
        /* protected by rtnl_lock */
        rcu_assign_pointer(dev->ip6_ptr, ndev);

        if (dev != blackhole_netdev) {
                /* Join interface-local all-node multicast group */
                ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);

                /* Join all-node multicast group */
                ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);

                /* Join all-router multicast group if forwarding is set */
                if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
                        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
        }
        return ndev;

err_release:
        /* Release nd_parms, mark ndev dead, then hand the remaining
         * teardown to in6_dev_finish_destroy() (defined elsewhere).
         */
        neigh_parms_release(&nd_tbl, ndev->nd_parms);
        ndev->dead = 1;
        in6_dev_finish_destroy(ndev);
        return ERR_PTR(err);
}

/* Return the inet6_dev attached to @dev, creating it with ipv6_add_dev()
 * when none exists yet.  Asserts that RTNL is held.
 *
 * If the device has IFF_UP set, ipv6_mc_up() is called on the result.
 * A creation failure is returned as the ERR_PTR from ipv6_add_dev().
 */
static struct inet6_dev *ipv6_find_idev(struct net_device *dev)
{
        struct inet6_dev *in6;

        ASSERT_RTNL();

        in6 = __in6_dev_get(dev);
        if (!in6)
                in6 = ipv6_add_dev(dev);
        if (IS_ERR(in6))
                return in6;

        if (dev->flags & IFF_UP)
                ipv6_mc_up(in6);

        return in6;
}

/* Size of a netconf message that carries the attribute selected by
 * @type, or every known attribute when @type is NETCONFA_ALL.
 */
static int inet6_netconf_msgsize_devconf(int type)
{
        const bool all = (type == NETCONFA_ALL);
        /* Header plus NETCONFA_IFINDEX, which is always counted */
        int size = NLMSG_ALIGN(sizeof(struct netconfmsg)) + nla_total_size(4);

        if (all || type == NETCONFA_FORWARDING)
                size += nla_total_size(4);

#ifdef CONFIG_IPV6_MROUTE
        if (all || type == NETCONFA_MC_FORWARDING)
                size += nla_total_size(4);
#endif

        if (all || type == NETCONFA_PROXY_NEIGH)
                size += nla_total_size(4);

        if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
                size += nla_total_size(4);

        return size;
}

/* Append one netconf message to @skb.
 *
 * The message always carries NETCONFA_IFINDEX.  When @devconf is
 * non-NULL it also carries the attribute selected by @type (all of them
 * for NETCONFA_ALL), read from @devconf.
 *
 * Returns 0 on success.  Returns -EMSGSIZE when the header or an
 * attribute does not fit; a partially built message is cancelled.
 */
static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
                                      struct ipv6_devconf *devconf, u32 portid,
                                      u32 seq, int event, unsigned int flags,
                                      int type)
{
        const bool all = (type == NETCONFA_ALL);
        struct netconfmsg *ncm;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
                        flags);
        if (!nlh)
                return -EMSGSIZE;

        ncm = nlmsg_data(nlh);
        ncm->ncm_family = AF_INET6;

        if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
                goto cancel;

        /* Without a devconf the message stops after the ifindex */
        if (!devconf)
                goto finish;

        if (all || type == NETCONFA_FORWARDING) {
                if (nla_put_s32(skb, NETCONFA_FORWARDING,
                                READ_ONCE(devconf->forwarding)) < 0)
                        goto cancel;
        }
#ifdef CONFIG_IPV6_MROUTE
        if (all || type == NETCONFA_MC_FORWARDING) {
                if (nla_put_s32(skb, NETCONFA_MC_FORWARDING,
                                atomic_read(&devconf->mc_forwarding)) < 0)
                        goto cancel;
        }
#endif
        if (all || type == NETCONFA_PROXY_NEIGH) {
                if (nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
                                READ_ONCE(devconf->proxy_ndp)) < 0)
                        goto cancel;
        }

        if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) {
                if (nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0)
                        goto cancel;
        }

finish:
        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/* Build a netconf message for (@ifindex, @devconf, @type) and send it
 * to the RTNLGRP_IPV6_NETCONF group of @net.
 *
 * If the message cannot be allocated or filled, the error is recorded
 * on the group with rtnl_set_sk_err() instead.
 */
void inet6_netconf_notify_devconf(struct net *net, int event, int type,
                                  int ifindex, struct ipv6_devconf *devconf)
{
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_netconf_msgsize_devconf(type), GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto report;
        }

        err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0,
                                         event, 0, type);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_IPV6_NETCONF, NULL,
                            GFP_KERNEL);
                return;
        }

        /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
report:
        rtnl_set_sk_err(net, RTNLGRP_IPV6_NETCONF, err);
}

/* Attribute policy handed to the nlmsg_parse_deprecated*() calls that
 * parse netconf get requests; each listed attribute sets .len to
 * sizeof(int).
 */
static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX + 1] = {
        [NETCONFA_IFINDEX]                     = { .len = sizeof(int) },
        [NETCONFA_FORWARDING]                  = { .len = sizeof(int) },
        [NETCONFA_PROXY_NEIGH]                 = { .len = sizeof(int) },
        [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};

/* Validate and parse a netconf get request into @tb.
 *
 * The message must be at least as long as a netconfmsg header.  When
 * netlink_strict_get_check() is false the attributes are parsed with
 * nlmsg_parse_deprecated() and that result is returned directly.  In
 * strict mode they are parsed with nlmsg_parse_deprecated_strict() and
 * any attribute other than NETCONFA_IFINDEX is rejected.
 *
 * Returns 0 on success or a negative errno, with a message set in
 * @extack for the checks done here.
 */
static int inet6_netconf_valid_get_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        int i, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
                                              tb, NETCONFA_MAX,
                                              devconf_ipv6_policy, extack);

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
                                            tb, NETCONFA_MAX,
                                            devconf_ipv6_policy, extack);
        if (err)
                return err;

        for (i = 0; i <= NETCONFA_MAX; i++) {
                /* Absent attributes and the ifindex are acceptable */
                if (!tb[i] || i == NETCONFA_IFINDEX)
                        continue;

                NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request");
                return -EINVAL;
        }

        return 0;
}

/* Handle a netconf get request: look up the devconf selected by the
 * NETCONFA_IFINDEX attribute and unicast a RTM_NEWNETCONF reply with
 * all attributes to the requester.
 *
 * NETCONFA_IFINDEX_ALL and NETCONFA_IFINDEX_DEFAULT select the
 * namespace-wide "all" and "default" configurations; any other value is
 * treated as a device index.
 *
 * Returns the rtnl_unicast() result on success, -EINVAL for a missing
 * attribute, unknown device or device without inet6_dev, -ENOBUFS when
 * the reply cannot be allocated, or the validation/fill error.
 */
static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
                                     struct nlmsghdr *nlh,
                                     struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[NETCONFA_MAX+1];
        struct inet6_dev *in6_dev = NULL;
        struct net_device *dev = NULL;
        struct ipv6_devconf *devconf;
        struct sk_buff *reply;
        int ifindex;
        int err;

        err = inet6_netconf_valid_get_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        if (!tb[NETCONFA_IFINDEX])
                return -EINVAL;

        ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
        if (ifindex == NETCONFA_IFINDEX_ALL) {
                devconf = net->ipv6.devconf_all;
        } else if (ifindex == NETCONFA_IFINDEX_DEFAULT) {
                devconf = net->ipv6.devconf_dflt;
        } else {
                dev = dev_get_by_index(net, ifindex);
                if (!dev)
                        return -EINVAL;

                in6_dev = in6_dev_get(dev);
                if (!in6_dev) {
                        err = -EINVAL;
                        goto errout;
                }
                devconf = &in6_dev->cnf;
        }

        reply = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL),
                          GFP_KERNEL);
        if (!reply) {
                err = -ENOBUFS;
                goto errout;
        }

        err = inet6_netconf_fill_devconf(reply, ifindex, devconf,
                                         NETLINK_CB(in_skb).portid,
                                         nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
                                         NETCONFA_ALL);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(reply);
                goto errout;
        }

        err = rtnl_unicast(reply, net, NETLINK_CB(in_skb).portid);
errout:
        /* Drop whatever references the device lookup took */
        if (in6_dev)
                in6_dev_put(in6_dev);
        dev_put(dev);
        return err;
}

/* Fold dev_addr_genid and dev_base_seq into one sequence number used to
 * detect changes.
 *
 * The result is never 0 (see nl_dump_check_consistent()); a sum of 0 is
 * replaced by 0x80000000, a value far away from 0.
 */
static u32 inet6_base_seq(const struct net *net)
{
        u32 seq;

        seq = atomic_read(&net->ipv6.dev_addr_genid);
        seq += READ_ONCE(net->dev_base_seq);

        return seq ? seq : 0x80000000;
}

/* Netlink dump callback for netconf.
 *
 * Emits one RTM_NEWNETCONF message per device that has an inet6_dev,
 * followed by the namespace-wide "all" entry and then the "default"
 * entry.  Progress is kept in cb->ctx (device cursor plus a counter of
 * how many of the two trailing entries were already emitted), so a
 * later invocation continues where a failed fill stopped.
 *
 * With strict checking the request must consist of exactly a
 * netconfmsg header.
 *
 * Returns 0, -EINVAL for a malformed strict request, or the first fill
 * error.
 */
static int inet6_netconf_dump_devconf(struct sk_buff *skb,
                                      struct netlink_callback *cb)
{
        const struct nlmsghdr *nlh = cb->nlh;
        struct net *net = sock_net(skb->sk);
        struct {
                unsigned long ifindex;
                unsigned int all_default;
        } *ctx = (void *)cb->ctx;
        struct inet6_dev *idev;
        struct net_device *dev;
        int err = 0;

        if (cb->strict_check) {
                struct netlink_ext_ack *extack = cb->extack;
                struct netconfmsg *ncm;

                if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
                        NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
                        return -EINVAL;
                }

                if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
                        NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
                        return -EINVAL;
                }
        }

        rcu_read_lock();

        /* Per-device entries */
        for_each_netdev_dump(net, dev, ctx->ifindex) {
                idev = __in6_dev_get(dev);
                if (idev) {
                        err = inet6_netconf_fill_devconf(skb, dev->ifindex,
                                                         &idev->cnf,
                                                         NETLINK_CB(cb->skb).portid,
                                                         nlh->nlmsg_seq,
                                                         RTM_NEWNETCONF,
                                                         NLM_F_MULTI,
                                                         NETCONFA_ALL);
                        if (err < 0)
                                goto unlock;
                }
        }

        /* "all" entry, emitted once */
        if (ctx->all_default == 0) {
                err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
                                                 net->ipv6.devconf_all,
                                                 NETLINK_CB(cb->skb).portid,
                                                 nlh->nlmsg_seq,
                                                 RTM_NEWNETCONF, NLM_F_MULTI,
                                                 NETCONFA_ALL);
                if (err < 0)
                        goto unlock;
                ctx->all_default++;
        }

        /* "default" entry, emitted once after "all" */
        if (ctx->all_default == 1) {
                err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
                                                 net->ipv6.devconf_dflt,
                                                 NETLINK_CB(cb->skb).portid,
                                                 nlh->nlmsg_seq,
                                                 RTM_NEWNETCONF, NLM_F_MULTI,
                                                 NETCONFA_ALL);
                if (err < 0)
                        goto unlock;
                ctx->all_default++;
        }

unlock:
        rcu_read_unlock();
        return err;
}

#ifdef CONFIG_SYSCTL
/*
 * Apply the current value of idev->cnf.forwarding to one interface.
 * A NULL @idev is ignored.
 *
 * In order:
 *  - LRO is disabled on the device when forwarding is on;
 *  - on IFF_MULTICAST devices the link-local, interface-local and
 *    site-local all-routers groups are joined (forwarding on) or left
 *    (forwarding off);
 *  - every non-tentative address joins or leaves its anycast group;
 *  - a RTM_NEWNETCONF / NETCONFA_FORWARDING notification is sent.
 */
static void dev_forward_change(struct inet6_dev *idev)
{
        struct net_device *dev;
        struct inet6_ifaddr *ifa;
        LIST_HEAD(tmp_addr_list);

        if (!idev)
                return;
        dev = idev->dev;
        if (idev->cnf.forwarding)
                dev_disable_lro(dev);
        if (dev->flags & IFF_MULTICAST) {
                if (idev->cnf.forwarding) {
                        ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_inc(dev, &in6addr_sitelocal_allrouters);
                } else {
                        ipv6_dev_mc_dec(dev, &in6addr_linklocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_interfacelocal_allrouters);
                        ipv6_dev_mc_dec(dev, &in6addr_sitelocal_allrouters);
                }
        }

        /* Snapshot the non-tentative addresses onto a private list (linked
         * through if_list_aux) while holding idev->lock for reading.
         */
        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                if (ifa->flags&IFA_F_TENTATIVE)
                        continue;
                list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
        }
        read_unlock_bh(&idev->lock);

        /* Process the snapshot with idev->lock released.
         * NOTE(review): presumably the anycast helpers must not run under
         * idev->lock - confirm.
         */
        while (!list_empty(&tmp_addr_list)) {
                ifa = list_first_entry(&tmp_addr_list,
                                       struct inet6_ifaddr, if_list_aux);
                list_del(&ifa->if_list_aux);
                if (idev->cnf.forwarding)
                        addrconf_join_anycast(ifa);
                else
                        addrconf_leave_anycast(ifa);
        }

        inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
                                     NETCONFA_FORWARDING,
                                     dev->ifindex, &idev->cnf);
}


/* Set cnf.forwarding to @newf on every device of @net that has an
 * inet6_dev.  dev_forward_change() runs only for devices whose
 * forwarding state actually flipped between zero and non-zero.
 */
static void addrconf_forward_change(struct net *net, __s32 newf)
{
        struct net_device *dev;

        for_each_netdev(net, dev) {
                struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);
                int changed;

                if (!idev)
                        continue;

                /* Compare on/off state, not the raw values */
                changed = (!idev->cnf.forwarding) ^ (!newf);

                WRITE_ONCE(idev->cnf.forwarding, newf);
                if (changed)
                        dev_forward_change(idev);
        }
}

/* Propagate a write of @newf to the forwarding sysctl at @p.
 *
 * The namespace is taken from table->extra2.  The RTNL lock for that
 * namespace is try-locked; on contention the syscall is restarted.
 *
 *  - "default" entry: store the value and notify if the on/off state
 *    changed; returns 0.
 *  - "all" entry: also store the value into "default" and into every
 *    device, with the matching notifications.
 *  - any other entry: treated as a per-device one, with the inet6_dev
 *    taken from table->extra1; dev_forward_change() runs if the on/off
 *    state changed.
 *
 * For the last two cases default routers are purged when @newf is
 * non-zero, and 1 is returned.
 */
static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf)
{
        struct net *net = (struct net *)table->extra2;
        int flipped;
        int old;

        if (!rtnl_net_trylock(net))
                return restart_syscall();

        old = *p;
        WRITE_ONCE(*p, newf);
        flipped = (!newf) ^ (!old);

        if (p == &net->ipv6.devconf_dflt->forwarding) {
                if (flipped)
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_FORWARDING,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);
                rtnl_net_unlock(net);
                return 0;
        }

        if (p != &net->ipv6.devconf_all->forwarding) {
                if (flipped)
                        dev_forward_change((struct inet6_dev *)table->extra1);
        } else {
                int old_dflt = net->ipv6.devconf_dflt->forwarding;

                WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf);
                if ((!newf) ^ (!old_dflt))
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_FORWARDING,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);

                addrconf_forward_change(net, newf);
                if (flipped)
                        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                                     NETCONFA_FORWARDING,
                                                     NETCONFA_IFINDEX_ALL,
                                                     net->ipv6.devconf_all);
        }
        rtnl_net_unlock(net);

        if (newf)
                rt6_purge_dflt_routers(net);
        return 1;
}

/* Set cnf.ignore_routes_with_linkdown to @newf on every device of @net
 * that has an inet6_dev, sending a RTM_NEWNETCONF notification for each
 * device whose on/off state flipped.
 */
static void addrconf_linkdown_change(struct net *net, __s32 newf)
{
        struct net_device *dev;

        for_each_netdev(net, dev) {
                struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);
                int changed;

                if (!idev)
                        continue;

                changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);

                WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf);
                if (!changed)
                        continue;

                inet6_netconf_notify_devconf(dev_net(dev),
                                             RTM_NEWNETCONF,
                                             NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                             dev->ifindex,
                                             &idev->cnf);
        }
}

/* Propagate a write of @newf to the ignore_routes_with_linkdown sysctl
 * at @p.
 *
 * The namespace is taken from table->extra2.  The RTNL lock for that
 * namespace is try-locked; on contention the syscall is restarted.
 *
 *  - "default" entry: store, notify if the on/off state changed,
 *    return 0.
 *  - "all" entry: also store into "default" and into every device, then
 *    notify for "all" if its on/off state changed.
 *  - any other entry: only the store is done.
 *
 * Returns 1 for the last two cases.
 */
static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf)
{
        struct net *net = (struct net *)table->extra2;
        int flipped;
        int old;

        if (!rtnl_net_trylock(net))
                return restart_syscall();

        old = *p;
        WRITE_ONCE(*p, newf);
        flipped = (!newf) ^ (!old);

        if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
                if (flipped)
                        inet6_netconf_notify_devconf(net,
                                                     RTM_NEWNETCONF,
                                                     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                                     NETCONFA_IFINDEX_DEFAULT,
                                                     net->ipv6.devconf_dflt);
                rtnl_net_unlock(net);
                return 0;
        }

        if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
                WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf);
                addrconf_linkdown_change(net, newf);
                if (flipped)
                        inet6_netconf_notify_devconf(net,
                                                     RTM_NEWNETCONF,
                                                     NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
                                                     NETCONFA_IFINDEX_ALL,
                                                     net->ipv6.devconf_all);
        }

        rtnl_net_unlock(net);

        return 1;
}

#endif

/* Final teardown of an ifaddr that nobody refers to any more.
 *
 * Warns if the address is still hashed, drops the ifaddr's reference on
 * its inet6_dev and cancels any DAD work that is still pending (logging
 * a notice if there was some).  The structure is freed through RCU only
 * if its state is INET6_IFADDR_STATE_DEAD; otherwise a warning is
 * printed and the memory is left alone.
 */
void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp)
{
        WARN_ON(!hlist_unhashed(&ifp->addr_lst));

#ifdef NET_REFCNT_DEBUG
        pr_debug("%s\n", __func__);
#endif

        in6_dev_put(ifp->idev);

        if (cancel_delayed_work(&ifp->dad_work))
                pr_notice("delayed DAD work was pending while freeing ifa=%p\n",
                          ifp);

        if (ifp->state == INET6_IFADDR_STATE_DEAD) {
                kfree_rcu(ifp, rcu);
                return;
        }

        pr_warn("Freeing alive inet6 address %p\n", ifp);
}

/* Link @ifp into idev->addr_list.
 *
 * The list is kept sorted by source scope, global before link-local:
 * the new entry goes in front of the first existing address whose scope
 * is not larger than its own, or at the tail if there is none.
 */
static void
ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
{
        const int new_scope = ipv6_addr_src_scope(&ifp->addr);
        struct list_head *pos;

        list_for_each(pos, &idev->addr_list) {
                const struct inet6_ifaddr *cur =
                        list_entry(pos, struct inet6_ifaddr, if_list);

                if (ipv6_addr_src_scope(&cur->addr) <= new_scope)
                        break;
        }

        /* Adding "at the tail" of pos places ifp right before it */
        list_add_tail_rcu(&ifp->if_list, pos);
}

/* Bucket index for @addr in the per-namespace address hash table: a
 * jhash of the address mixed with the namespace's hash mix, reduced to
 * IN6_ADDR_HSIZE_SHIFT bits.
 */
static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr)
{
        return hash_32(__ipv6_addr_jhash(addr, net_hash_mix(net)),
                       IN6_ADDR_HSIZE_SHIFT);
}

/* Walk hash bucket @hash of @net looking for @addr.
 *
 * With a NULL @dev any matching address counts; otherwise the address
 * must belong to @dev.  Returns true when such an entry exists.
 */
static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
                               struct net_device *dev, unsigned int hash)
{
        struct inet6_ifaddr *ifp;

        hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (!ipv6_addr_equal(&ifp->addr, addr))
                        continue;

                if (!dev || ifp->idev->dev == dev)
                        return true;
        }

        return false;
}

/* Insert @ifa into the namespace address hash table, under
 * addrconf_hash_lock.
 *
 * Returns 0 on success, or -EEXIST (nothing inserted) when the same
 * address is already present on @dev.
 */
static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
{
        struct net *net = dev_net(dev);
        unsigned int hash = inet6_addr_hash(net, &ifa->addr);
        int err = 0;

        spin_lock_bh(&net->ipv6.addrconf_hash_lock);

        if (!ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
                hlist_add_head_rcu(&ifa->addr_lst,
                                   &net->ipv6.inet6_addr_lst[hash]);
        } else {
                /* Ignore adding duplicate addresses on an interface */
                netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
                err = -EEXIST;
        }

        spin_unlock_bh(&net->ipv6.addrconf_hash_lock);

        return err;
}

/*
 * Create a new address described by @cfg on @idev.
 *
 * @can_block selects GFP_KERNEL (true) or GFP_ATOMIC (false) for the
 * allocations and decides whether the blocking validator notifier chain
 * is consulted.  @extack receives an error message for the checks done
 * here.
 *
 * On success it returns ifp with increased reference count: the initial
 * reference belongs to the caller, one more is held for the device's
 * address list, and a further one for tempaddr_list when the address
 * has IFA_F_TEMPORARY set.
 *
 * On failure an ERR_PTR() is returned:
 *   -EADDRNOTAVAIL  the address is unspecified, is multicast without
 *                   IFA_F_MCAUTOJOIN, or is a loopback address on a
 *                   device that is neither loopback nor an L3 master
 *   -ENODEV         @idev is marked dead
 *   -EACCES         IPv6 is disabled on the device
 *   -ENOBUFS        the ifaddr could not be allocated
 *   -EEXIST         the address is already present on the device
 *   or the error reported by a validator or by addrconf_f6i_alloc().
 */

static struct inet6_ifaddr *
ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
              bool can_block, struct netlink_ext_ack *extack)
{
        gfp_t gfp_flags = can_block ? GFP_KERNEL : GFP_ATOMIC;
        int addr_type = ipv6_addr_type(cfg->pfx);
        struct net *net = dev_net(idev->dev);
        struct inet6_ifaddr *ifa = NULL;
        struct fib6_info *f6i = NULL;
        int err = 0;

        if (addr_type == IPV6_ADDR_ANY) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid address");
                return ERR_PTR(-EADDRNOTAVAIL);
        } else if (addr_type & IPV6_ADDR_MULTICAST &&
                   !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) {
                NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag");
                return ERR_PTR(-EADDRNOTAVAIL);
        } else if (!(idev->dev->flags & IFF_LOOPBACK) &&
                   !netif_is_l3_master(idev->dev) &&
                   addr_type & IPV6_ADDR_LOOPBACK) {
                NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device");
                return ERR_PTR(-EADDRNOTAVAIL);
        }

        if (idev->dead) {
                NL_SET_ERR_MSG_MOD(extack, "device is going away");
                err = -ENODEV;
                goto out;
        }

        if (idev->cnf.disable_ipv6) {
                NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
                err = -EACCES;
                goto out;
        }

        /* validator notifier needs to be blocking;
         * do not call in atomic context
         */
        if (can_block) {
                struct in6_validator_info i6vi = {
                        .i6vi_addr = *cfg->pfx,
                        .i6vi_dev = idev,
                        .extack = extack,
                };

                err = inet6addr_validator_notifier_call_chain(NETDEV_UP, &i6vi);
                err = notifier_to_errno(err);
                if (err < 0)
                        goto out;
        }

        /* Zeroed allocation: ifa->idev stays NULL until it is set below,
         * which the error path at "out" relies on.
         */
        ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT);
        if (!ifa) {
                err = -ENOBUFS;
                goto out;
        }

        f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack);
        if (IS_ERR(f6i)) {
                err = PTR_ERR(f6i);
                /* Reset so the error path does not release an ERR_PTR */
                f6i = NULL;
                goto out;
        }

        neigh_parms_data_state_setall(idev->nd_parms);

        ifa->addr = *cfg->pfx;
        if (cfg->peer_pfx)
                ifa->peer_addr = *cfg->peer_pfx;

        spin_lock_init(&ifa->lock);
        INIT_DELAYED_WORK(&ifa->dad_work, addrconf_dad_work);
        INIT_HLIST_NODE(&ifa->addr_lst);
        ifa->scope = cfg->scope;
        ifa->prefix_len = cfg->plen;
        ifa->rt_priority = cfg->rt_priority;
        ifa->flags = cfg->ifa_flags;
        ifa->ifa_proto = cfg->ifa_proto;
        /* No need to add the TENTATIVE flag for addresses with NODAD */
        if (!(cfg->ifa_flags & IFA_F_NODAD))
                ifa->flags |= IFA_F_TENTATIVE;
        ifa->valid_lft = cfg->valid_lft;
        ifa->prefered_lft = cfg->preferred_lft;
        ifa->cstamp = ifa->tstamp = jiffies;
        ifa->tokenized = false;

        ifa->rt = f6i;

        /* The ifaddr keeps a reference on its inet6_dev */
        ifa->idev = idev;
        in6_dev_hold(idev);

        /* For caller */
        refcount_set(&ifa->refcnt, 1);

        rcu_read_lock();

        /* Fails with -EEXIST if the address is already on this device */
        err = ipv6_add_addr_hash(idev->dev, ifa);
        if (err < 0) {
                rcu_read_unlock();
                goto out;
        }

        write_lock_bh(&idev->lock);

        /* Add to inet6_dev unicast addr list. */
        ipv6_link_dev_addr(idev, ifa);

        if (ifa->flags&IFA_F_TEMPORARY) {
                list_add(&ifa->tmp_list, &idev->tempaddr_list);
                /* Reference held by tempaddr_list */
                in6_ifa_hold(ifa);
        }

        /* Reference held by the unicast address list */
        in6_ifa_hold(ifa);
        write_unlock_bh(&idev->lock);

        rcu_read_unlock();

        inet6addr_notifier_call_chain(NETDEV_UP, ifa);
out:
        if (unlikely(err < 0)) {
                /* f6i is NULL here unless the route was allocated */
                fib6_info_release(f6i);

                if (ifa) {
                        /* Drop the idev hold only if it was taken */
                        if (ifa->idev)
                                in6_dev_put(ifa->idev);
                        kfree(ifa);
                }
                ifa = ERR_PTR(err);
        }

        return ifa;
}

/* Action to take on a prefix route when an address using it is deleted */
enum cleanup_prefix_rt_t {
        /* no cleanup action for prefix route */
        CLEANUP_PREFIX_RT_NOP,
        /* delete the prefix route */
        CLEANUP_PREFIX_RT_DEL,
        /* update the lifetime of the prefix route */
        CLEANUP_PREFIX_RT_EXPIRE,
};

/*
 * Decide what should happen to ifp's prefix route once ifp is deleted.
 * Returns one of the CLEANUP_PREFIX_RT_* constants; for
 * CLEANUP_PREFIX_RT_EXPIRE, *expires holds the new expiry in jiffies.
 *
 * Rules:
 * 1) the prefix is not purged if the address was not permanent; it is
 *    managed by its own lifetime.
 * 2) it is not purged either if the address was IFA_F_NOPREFIXROUTE.
 *    (The caller in this file, ipv6_del_addr(), checks 1 and 2 before
 *    calling.)
 * 3) if no other address shares the prefix, delete the prefix route.
 * 4) if another permanent address shares it, the prefix stays permanent.
 * 5) if another address with IFA_F_NOPREFIXROUTE shares it, do not
 *    purge; assume user space is managing the prefix.
 * 6) otherwise, update the prefix lifetime to the longest valid
 *    lifetime among the remaining addresses on the device.
 *    Note: a subsequent RA will update the lifetime.
 */
static enum cleanup_prefix_rt_t
check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
{
        enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_DEL;
        struct inet6_dev *idev = ifp->idev;
        struct inet6_ifaddr *ifa;

        *expires = jiffies;

        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                unsigned long lifetime;
                unsigned long until;

                /* Only other addresses within the same prefix matter */
                if (ifa == ifp)
                        continue;
                if (ifa->prefix_len != ifp->prefix_len)
                        continue;
                if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr,
                                       ifp->prefix_len))
                        continue;

                if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE))
                        return CLEANUP_PREFIX_RT_NOP;

                action = CLEANUP_PREFIX_RT_EXPIRE;

                spin_lock(&ifa->lock);

                lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ);
                /*
                 * Note: Because this address is
                 * not permanent, lifetime <
                 * LONG_MAX / HZ here.
                 */
                until = ifa->tstamp + lifetime * HZ;
                if (time_before(*expires, until))
                        *expires = until;

                spin_unlock(&ifa->lock);
        }

        return action;
}

/* Delete or expire the prefix route that belongs to @ifp.
 *
 * The route is looked up for ifp's peer address when @del_peer is set,
 * otherwise for its own address.  If no route is found nothing happens.
 * With @del_rt the route is deleted; otherwise, unless it already has
 * RTF_EXPIRES set, its expiry is set to @expires and it is added to the
 * gc list under the table lock.
 */
static void
cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
                     bool del_rt, bool del_peer)
{
        const struct in6_addr *pfx = del_peer ? &ifp->peer_addr : &ifp->addr;
        struct fib6_table *table;
        struct fib6_info *f6i;

        f6i = addrconf_get_prefix_route(pfx, ifp->prefix_len,
                                        ifp->idev->dev, 0, RTF_DEFAULT, true);
        if (!f6i)
                return;

        if (del_rt) {
                /* NOTE(review): no fib6_info_release() on this path;
                 * presumably ip6_del_rt() consumes the reference - confirm.
                 */
                ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
                return;
        }

        if (!(f6i->fib6_flags & RTF_EXPIRES)) {
                table = f6i->fib6_table;
                spin_lock_bh(&table->tb6_lock);

                fib6_set_expires(f6i, expires);
                fib6_add_gc_list(f6i);

                spin_unlock_bh(&table->tb6_lock);
        }
        fib6_info_release(f6i);
}


/*
 * Remove an address from its device.
 *
 * This function wants to get referenced ifp and releases it before
 * return: the caller's reference is dropped by the in6_ifa_put() at
 * "out" on every path.  Asserts that RTNL is held.
 *
 * If the address was already in state INET6_IFADDR_STATE_DEAD nothing
 * else is done.
 */

static void ipv6_del_addr(struct inet6_ifaddr *ifp)
{
        enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
        struct net *net = dev_net(ifp->idev->dev);
        unsigned long expires;
        int state;

        ASSERT_RTNL();

        /* Mark the address dead, remembering the state it had before */
        spin_lock_bh(&ifp->lock);
        state = ifp->state;
        ifp->state = INET6_IFADDR_STATE_DEAD;
        spin_unlock_bh(&ifp->lock);

        if (state == INET6_IFADDR_STATE_DEAD)
                goto out;

        /* Unhash from the per-namespace address table */
        spin_lock_bh(&net->ipv6.addrconf_hash_lock);
        hlist_del_init_rcu(&ifp->addr_lst);
        spin_unlock_bh(&net->ipv6.addrconf_hash_lock);

        write_lock_bh(&ifp->idev->lock);

        if (ifp->flags&IFA_F_TEMPORARY) {
                list_del(&ifp->tmp_list);
                if (ifp->ifpub) {
                        in6_ifa_put(ifp->ifpub);
                        ifp->ifpub = NULL;
                }
                /* Drops the reference ipv6_add_addr() took for tempaddr_list */
                __in6_ifa_put(ifp);
        }

        /* Only permanent addresses without IFA_F_NOPREFIXROUTE may need
         * their prefix route cleaned up; action stays NOP otherwise.
         */
        if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE))
                action = check_cleanup_prefix_route(ifp, &expires);

        list_del_rcu(&ifp->if_list);
        /* Drops the reference ipv6_add_addr() took for the address list */
        __in6_ifa_put(ifp);

        write_unlock_bh(&ifp->idev->lock);

        addrconf_del_dad_work(ifp);

        ipv6_ifa_notify(RTM_DELADDR, ifp);

        inet6addr_notifier_call_chain(NETDEV_DOWN, ifp);

        /* expires is only meaningful when check_cleanup_prefix_route() ran */
        if (action != CLEANUP_PREFIX_RT_NOP) {
                cleanup_prefix_route(ifp, expires,
                        action == CLEANUP_PREFIX_RT_DEL, false);
        }

        /* clean up prefsrc entries */
        rt6_remove_prefsrc(ifp);
out:
        in6_ifa_put(ifp);
}

/*
 * Compute the regeneration advance for temporary addresses on @idev:
 *
 *   regen_min_advance +
 *   regen_max_retry * dad_transmits * max(RETRANS_TIME, HZ/100) / HZ
 *
 * The retransmit time is floored at HZ/100 before the product is divided
 * by HZ (presumably converting jiffies to seconds - confirm units of
 * RETRANS_TIME).
 */
static unsigned long ipv6_get_regen_advance(const struct inet6_dev *idev)
{
        int min_advance = READ_ONCE(idev->cnf.regen_min_advance);
        int max_retry = READ_ONCE(idev->cnf.regen_max_retry);
        int dad_xmits = READ_ONCE(idev->cnf.dad_transmits);
        int retrans = max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100);

        return min_advance + max_retry * dad_xmits * retrans / HZ;
}

/*
 * Create a temporary (IFA_F_TEMPORARY) address that shares the first
 * 8 bytes of the public address @ifp, and start DAD on it.
 *
 * @ifp:   public address the temporary address is derived from
 * @block: passed through unchanged to ipv6_add_addr()
 *
 * Returns 0 on success. Returns -1 when use_tempaddr is not enabled, when
 * ifp->regen_count has reached regen_max_retry (temporary addresses are
 * then switched off for the device), or when the computed lifetimes do not
 * allow a usable address. A failing ipv6_add_addr() restarts the procedure
 * from the "retry" label.
 */
static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block)
{
        struct inet6_dev *idev = ifp->idev;
        unsigned long tmp_tstamp, age;
        unsigned long regen_advance;
        unsigned long now = jiffies;
        u32 if_public_preferred_lft;
        s32 cnf_temp_preferred_lft;
        struct inet6_ifaddr *ift;
        struct ifa6_config cfg;
        long max_desync_factor;
        struct in6_addr addr;
        int ret = 0;

        write_lock_bh(&idev->lock);

        /* Entered with idev->lock write-held, both initially and on retry. */
retry:
        in6_dev_hold(idev);
        if (READ_ONCE(idev->cnf.use_tempaddr) <= 0) {
                write_unlock_bh(&idev->lock);
                pr_info("%s: use_tempaddr is disabled\n", __func__);
                in6_dev_put(idev);
                ret = -1;
                goto out;
        }
        spin_lock_bh(&ifp->lock);
        /* Every pass counts as a regeneration attempt against the limit. */
        if (ifp->regen_count++ >= READ_ONCE(idev->cnf.regen_max_retry)) {
                WRITE_ONCE(idev->cnf.use_tempaddr, -1);        /*XXX*/
                spin_unlock_bh(&ifp->lock);
                write_unlock_bh(&idev->lock);
                pr_warn("%s: regeneration time exceeded - disabled temporary address support\n",
                        __func__);
                in6_dev_put(idev);
                ret = -1;
                goto out;
        }
        in6_ifa_hold(ifp);
        /* Keep the first 8 bytes of the public address; the rest is randomised. */
        memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
        ipv6_gen_rnd_iid(&addr);

        /* Seconds since the public address's tstamp. */
        age = (now - ifp->tstamp) / HZ;

        regen_advance = ipv6_get_regen_advance(idev);

        /* recalculate max_desync_factor each time and update
         * idev->desync_factor if it's larger
         */
        cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft);
        max_desync_factor = min_t(long,
                                  READ_ONCE(idev->cnf.max_desync_factor),
                                  cnf_temp_preferred_lft - regen_advance);

        if (unlikely(idev->desync_factor > max_desync_factor)) {
                if (max_desync_factor > 0) {
                        get_random_bytes(&idev->desync_factor,
                                         sizeof(idev->desync_factor));
                        idev->desync_factor %= max_desync_factor;
                } else {
                        idev->desync_factor = 0;
                }
        }

        if_public_preferred_lft = ifp->prefered_lft;

        /*
         * Lifetimes are expressed relative to the public address's tstamp
         * (which the new address inherits below), hence the "+ age".
         * Neither lifetime may exceed that of the public address, and the
         * preferred lifetime may not exceed the valid lifetime.
         */
        memset(&cfg, 0, sizeof(cfg));
        cfg.valid_lft = min_t(__u32, ifp->valid_lft,
                              READ_ONCE(idev->cnf.temp_valid_lft) + age);
        cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
        cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft);
        cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft);

        cfg.plen = ifp->prefix_len;
        tmp_tstamp = ifp->tstamp;
        spin_unlock_bh(&ifp->lock);

        write_unlock_bh(&idev->lock);

        /* From RFC 4941:
         *
         *     A temporary address is created only if this calculated Preferred
         *     Lifetime is greater than REGEN_ADVANCE time units.  In
         *     particular, an implementation must not create a temporary address
         *     with a zero Preferred Lifetime.
         *
         *     ...
         *
         *     When creating a temporary address, the lifetime values MUST be
         *     derived from the corresponding prefix as follows:
         *
         *     ...
         *
         *     *  Its Preferred Lifetime is the lower of the Preferred Lifetime
         *        of the public address or TEMP_PREFERRED_LIFETIME -
         *        DESYNC_FACTOR.
         *
         * To comply with the RFC's requirements, clamp the preferred lifetime
         * to a minimum of regen_advance, unless that would exceed valid_lft or
         * ifp->prefered_lft.
         *
         * Use age calculation as in addrconf_verify to avoid unnecessary
         * temporary addresses being generated.
         */
        age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
        if (cfg.preferred_lft <= regen_advance + age) {
                cfg.preferred_lft = regen_advance + age + 1;
                if (cfg.preferred_lft > cfg.valid_lft ||
                    cfg.preferred_lft > if_public_preferred_lft) {
                        /* Clamping is not possible: give up and undo both holds. */
                        in6_ifa_put(ifp);
                        in6_dev_put(idev);
                        ret = -1;
                        goto out;
                }
        }

        cfg.ifa_flags = IFA_F_TEMPORARY;
        /* set in addrconf_prefix_rcv() */
        if (ifp->flags & IFA_F_OPTIMISTIC)
                cfg.ifa_flags |= IFA_F_OPTIMISTIC;

        cfg.pfx = &addr;
        cfg.scope = ipv6_addr_scope(cfg.pfx);

        ift = ipv6_add_addr(idev, &cfg, block, NULL);
        if (IS_ERR(ift)) {
                /*
                 * Undo this pass's holds and start over; the retry label
                 * expects idev->lock to be write-held again.
                 */
                in6_ifa_put(ifp);
                in6_dev_put(idev);
                pr_info("%s: retry temporary address regeneration\n", __func__);
                write_lock_bh(&idev->lock);
                goto retry;
        }

        /*
         * Link the new address to its public address. The in6_ifa_hold(ifp)
         * taken above is not dropped on this path, so it stays with
         * ift->ifpub.
         */
        spin_lock_bh(&ift->lock);
        ift->ifpub = ifp;
        ift->cstamp = now;
        ift->tstamp = tmp_tstamp;
        spin_unlock_bh(&ift->lock);

        addrconf_dad_start(ift);
        in6_ifa_put(ift);
        in6_dev_put(idev);
out:
        return ret;
}

/*
 *        Choose an appropriate source address (RFC3484)
 *
 *        The enumerators below index the selection rules in the order in
 *        which they are evaluated; they are also used as bit positions in
 *        ipv6_saddr_score.scorebits. The rule numbers quoted here are the
 *        ones used in the comments of ipv6_get_saddr_eval().
 */
enum {
        IPV6_SADDR_RULE_INIT = 0,        /* Rule 0: is a candidate present at all */
        IPV6_SADDR_RULE_LOCAL,           /* Rule 1: prefer same address */
        IPV6_SADDR_RULE_SCOPE,           /* Rule 2: prefer appropriate scope */
        IPV6_SADDR_RULE_PREFERRED,       /* Rule 3: avoid deprecated/optimistic */
#ifdef CONFIG_IPV6_MIP6
        IPV6_SADDR_RULE_HOA,             /* Rule 4: prefer home address */
#endif
        IPV6_SADDR_RULE_OIF,             /* Rule 5: prefer outgoing interface */
        IPV6_SADDR_RULE_LABEL,           /* Rule 6: prefer matching label */
        IPV6_SADDR_RULE_PRIVACY,         /* Rule 7: public vs. temporary address */
        IPV6_SADDR_RULE_ORCHID,          /* Rule 8-: ORCHID-ness must match */
        IPV6_SADDR_RULE_PREFIX,          /* Rule 8: longest matching prefix */
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        IPV6_SADDR_RULE_NOT_OPTIMISTIC,  /* optimistic addresses rank last */
#endif
        IPV6_SADDR_RULE_MAX              /* number of rules; not a rule itself */
};

/*
 * Per-candidate state for source address selection. Rule results are
 * cached here by ipv6_get_saddr_eval() so each rule is computed at most
 * once per candidate.
 */
struct ipv6_saddr_score {
        int                        rule;        /* highest rule index evaluated so far, -1 if none */
        int                        addr_type;   /* __ipv6_addr_type() of ifa->addr */
        struct inet6_ifaddr        *ifa;        /* candidate address; NULL while no candidate is set */
        DECLARE_BITMAP(scorebits, IPV6_SADDR_RULE_MAX);        /* bit i set when rule i returned non-zero */
        int                        scopedist;   /* cached result of IPV6_SADDR_RULE_SCOPE */
        int                        matchlen;    /* cached result of IPV6_SADDR_RULE_PREFIX */
};

/*
 * Properties of the destination that the selection rules compare
 * candidates against; filled in once by ipv6_dev_get_saddr().
 */
struct ipv6_saddr_dst {
        const struct in6_addr *addr;    /* destination address */
        int ifindex;                    /* ifindex of the outgoing device, 0 if none given */
        int scope;                      /* __ipv6_addr_src_scope() of the destination type */
        int label;                      /* ipv6_addr_label() of the destination */
        unsigned int prefs;             /* caller's IPV6_PREFER_SRC_* preference flags */
};

/*
 * Return 1 when the address type @type has any of the MAPPED, COMPATv4
 * or LOOPBACK bits set, 0 otherwise.
 */
static inline int ipv6_saddr_preferred(int type)
{
        return !!(type & (IPV6_ADDR_MAPPED|IPV6_ADDR_COMPATv4|IPV6_ADDR_LOOPBACK));
}

/*
 * Tell whether optimistic addresses on @idev may be used.
 *
 * True only when both the optimistic_dad and the use_optimistic knobs are
 * set, each either in the netns-wide "all" devconf or on the device.
 * Always false for a NULL @idev or without CONFIG_IPV6_OPTIMISTIC_DAD.
 */
static bool ipv6_use_optimistic_addr(const struct net *net,
                                     const struct inet6_dev *idev)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        bool dad_on, use_on;

        if (!idev)
                return false;

        dad_on = READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
                 READ_ONCE(idev->cnf.optimistic_dad);
        if (!dad_on)
                return false;

        use_on = READ_ONCE(net->ipv6.devconf_all->use_optimistic) ||
                 READ_ONCE(idev->cnf.use_optimistic);

        return use_on;
#else
        return false;
#endif
}

/*
 * Tell whether optimistic DAD is enabled for @idev: the optimistic_dad
 * knob must be set in the netns-wide "all" devconf or on the device.
 * Always false for a NULL @idev or without CONFIG_IPV6_OPTIMISTIC_DAD.
 */
static bool ipv6_allow_optimistic_dad(const struct net *net,
                                      const struct inet6_dev *idev)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        if (!idev)
                return false;

        return READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
               READ_ONCE(idev->cnf.optimistic_dad);
#else
        return false;
#endif
}

/*
 * Evaluate selection rule @i for the candidate described by @score.
 *
 * @net:   network namespace (used for devconf and address labels)
 * @score: candidate being scored; also caches earlier results
 * @dst:   destination properties the candidate is compared against
 * @i:     rule index (IPV6_SADDR_RULE_*)
 *
 * Results are memoised. When rule @i has already been evaluated for this
 * candidate (i <= score->rule) the stored value is returned: scopedist for
 * the scope rule, matchlen for the prefix rule, the scorebits bit for all
 * others. Otherwise the rule is computed, a non-zero result is recorded in
 * scorebits, and score->rule is set to @i.
 *
 * Returns the rule's value for this candidate; the caller compares the
 * values of two candidates and the larger one wins.
 */
static int ipv6_get_saddr_eval(struct net *net,
                               struct ipv6_saddr_score *score,
                               struct ipv6_saddr_dst *dst,
                               int i)
{
        int ret;

        /* Already evaluated: hand back the cached result. */
        if (i <= score->rule) {
                switch (i) {
                case IPV6_SADDR_RULE_SCOPE:
                        ret = score->scopedist;
                        break;
                case IPV6_SADDR_RULE_PREFIX:
                        ret = score->matchlen;
                        break;
                default:
                        ret = !!test_bit(i, score->scorebits);
                }
                goto out;
        }

        switch (i) {
        case IPV6_SADDR_RULE_INIT:
                /* Rule 0: remember if hiscore is not ready yet */
                ret = !!score->ifa;
                break;
        case IPV6_SADDR_RULE_LOCAL:
                /* Rule 1: Prefer same address */
                ret = ipv6_addr_equal(&score->ifa->addr, dst->addr);
                break;
        case IPV6_SADDR_RULE_SCOPE:
                /* Rule 2: Prefer appropriate scope
                 *
                 *      ret
                 *       ^
                 *    -1 |  d 15
                 *    ---+--+-+---> scope
                 *       |
                 *       |             d is scope of the destination.
                 *  B-d  |  \
                 *       |   \      <- smaller scope is better
                 *  B-15 |    \        if scope is enough for destination.
                 *       |             ret = B - scope (-1 <= d <= scope <= 15).
                 * d-C-1 | /
                 *       |/         <- greater is better
                 *   -C  /             if scope is not enough for destination.
                 *      /|             ret = scope - C (-1 <= scope < d <= 15).
                 *
                 * d - C - 1 < B -15 (for all -1 <= d <= 15).
                 * C > d + 14 - B >= 15 + 14 - B = 29 - B.
                 * Assume B = 0 and we get C > 29.
                 *
                 * The code below uses B = 0 and C = 128.
                 */
                ret = __ipv6_addr_src_scope(score->addr_type);
                if (ret >= dst->scope)
                        ret = -ret;
                else
                        ret -= 128;        /* 30 is enough */
                score->scopedist = ret;
                break;
        case IPV6_SADDR_RULE_PREFERRED:
            {
                /* Rule 3: Avoid deprecated and optimistic addresses */
                u8 avoid = IFA_F_DEPRECATED;

                /* Optimistic addresses are only avoided when their use is not enabled. */
                if (!ipv6_use_optimistic_addr(net, score->ifa->idev))
                        avoid |= IFA_F_OPTIMISTIC;
                ret = ipv6_saddr_preferred(score->addr_type) ||
                      !(score->ifa->flags & avoid);
                break;
            }
#ifdef CONFIG_IPV6_MIP6
        case IPV6_SADDR_RULE_HOA:
            {
                /* Rule 4: Prefer home address */
                int prefhome = !(dst->prefs & IPV6_PREFER_SRC_COA);
                ret = !(score->ifa->flags & IFA_F_HOMEADDRESS) ^ prefhome;
                break;
            }
#endif
        case IPV6_SADDR_RULE_OIF:
                /* Rule 5: Prefer outgoing interface */
                ret = (!dst->ifindex ||
                       dst->ifindex == score->ifa->idev->dev->ifindex);
                break;
        case IPV6_SADDR_RULE_LABEL:
                /* Rule 6: Prefer matching label */
                ret = ipv6_addr_label(net,
                                      &score->ifa->addr, score->addr_type,
                                      score->ifa->idev->dev->ifindex) == dst->label;
                break;
        case IPV6_SADDR_RULE_PRIVACY:
            {
                /* Rule 7: Prefer public address
                 * Note: prefer temporary address if use_tempaddr >= 2
                 *
                 * An explicit PUBLIC/TMP preference in dst->prefs takes
                 * precedence over the device's use_tempaddr setting.
                 */
                int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
                                !!(dst->prefs & IPV6_PREFER_SRC_TMP) :
                                READ_ONCE(score->ifa->idev->cnf.use_tempaddr) >= 2;
                ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
                break;
            }
        case IPV6_SADDR_RULE_ORCHID:
                /* Rule 8-: Prefer ORCHID vs ORCHID or
                 *            non-ORCHID vs non-ORCHID
                 */
                ret = !(ipv6_addr_orchid(&score->ifa->addr) ^
                        ipv6_addr_orchid(dst->addr));
                break;
        case IPV6_SADDR_RULE_PREFIX:
                /* Rule 8: Use longest matching prefix.
                 * The match length is capped at the candidate's prefix length.
                 */
                ret = ipv6_addr_diff(&score->ifa->addr, dst->addr);
                if (ret > score->ifa->prefix_len)
                        ret = score->ifa->prefix_len;
                score->matchlen = ret;
                break;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        case IPV6_SADDR_RULE_NOT_OPTIMISTIC:
                /* Optimistic addresses still have lower precedence than other
                 * preferred addresses.
                 */
                ret = !(score->ifa->flags & IFA_F_OPTIMISTIC);
                break;
#endif
        default:
                ret = 0;
        }

        /* Record the outcome so later calls for rule <= i hit the cache. */
        if (ret)
                __set_bit(i, score->scorebits);
        score->rule = i;
out:
        return ret;
}

/*
 * Score every address on @idev against the best candidate found so far.
 *
 * @net:         network namespace
 * @dst:         destination properties
 * @idev:        device whose addr_list is walked (RCU list traversal)
 * @scores:      two score slots; one holds the current best, the other
 *               is scratch space for the candidate under evaluation
 * @hiscore_idx: index (0 or 1) of the slot holding the current best
 *
 * Returns the index of the slot that holds the best candidate after the
 * walk; it differs from @hiscore_idx when the two slots were swapped an
 * odd number of times.
 */
static int __ipv6_dev_get_saddr(struct net *net,
                                struct ipv6_saddr_dst *dst,
                                struct inet6_dev *idev,
                                struct ipv6_saddr_score *scores,
                                int hiscore_idx)
{
        struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx];

        /* Note: the list cursor is score->ifa itself, not a separate local. */
        list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) {
                int i;

                /*
                 * - Tentative Address (RFC2462 section 5.4)
                 *  - A tentative address is not considered
                 *    "assigned to an interface" in the traditional
                 *    sense, unless it is also flagged as optimistic.
                 * - Candidate Source Address (section 4)
                 *  - In any case, anycast addresses, multicast
                 *    addresses, and the unspecified address MUST
                 *    NOT be included in a candidate set.
                 */
                if ((score->ifa->flags & IFA_F_TENTATIVE) &&
                    (!(score->ifa->flags & IFA_F_OPTIMISTIC)))
                        continue;

                score->addr_type = __ipv6_addr_type(&score->ifa->addr);

                if (unlikely(score->addr_type == IPV6_ADDR_ANY ||
                             score->addr_type & IPV6_ADDR_MULTICAST)) {
                        net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s",
                                            idev->dev->name);
                        continue;
                }

                /* Fresh candidate: forget all cached rule results. */
                score->rule = -1;
                bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX);

                /*
                 * Compare rule by rule; the first rule on which the two
                 * differ decides. Ties on every rule keep the current best.
                 */
                for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) {
                        int minihiscore, miniscore;

                        minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i);
                        miniscore = ipv6_get_saddr_eval(net, score, dst, i);

                        if (minihiscore > miniscore) {
                                if (i == IPV6_SADDR_RULE_SCOPE &&
                                    score->scopedist > 0) {
                                        /*
                                         * special case:
                                         * each remaining entry
                                         * has too small (not enough)
                                         * scope, because ifa entries
                                         * are sorted by their scope
                                         * values.
                                         */
                                        goto out;
                                }
                                break;
                        } else if (minihiscore < miniscore) {
                                /* Candidate wins: it becomes the new best. */
                                swap(hiscore, score);
                                hiscore_idx = 1 - hiscore_idx;

                                /* restore our iterator */
                                score->ifa = hiscore->ifa;

                                break;
                        }
                }
        }
out:
        return hiscore_idx;
}

/*
 * Score the addresses of @dst_dev first and of its L3 @master second.
 * A device without an inet6_dev is skipped.
 *
 * Returns the index of the score slot holding the best candidate.
 */
static int ipv6_get_saddr_master(struct net *net,
                                 const struct net_device *dst_dev,
                                 const struct net_device *master,
                                 struct ipv6_saddr_dst *dst,
                                 struct ipv6_saddr_score *scores,
                                 int hiscore_idx)
{
        const struct net_device *order[] = { dst_dev, master };
        unsigned int k;

        for (k = 0; k < sizeof(order) / sizeof(order[0]); k++) {
                struct inet6_dev *in6 = __in6_dev_get(order[k]);

                if (!in6)
                        continue;

                hiscore_idx = __ipv6_dev_get_saddr(net, dst, in6,
                                                   scores, hiscore_idx);
        }

        return hiscore_idx;
}

/*
 * Select a source address for traffic to @daddr.
 *
 * @net:     network namespace
 * @dst_dev: outgoing device, may be NULL
 * @daddr:   destination address
 * @prefs:   IPV6_PREFER_SRC_* preference flags
 * @saddr:   filled with the selected address on success
 *
 * Returns 0 when an address was stored in @saddr, -EADDRNOTAVAIL when no
 * candidate was found. The whole selection runs under rcu_read_lock().
 */
int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
                       const struct in6_addr *daddr, unsigned int prefs,
                       struct in6_addr *saddr)
{
        struct ipv6_saddr_score scores[2], *hiscore;
        struct ipv6_saddr_dst dst;
        struct inet6_dev *idev;
        struct net_device *dev;
        int dst_type;
        bool use_oif_addr = false;
        int hiscore_idx = 0;
        int ret = 0;

        dst_type = __ipv6_addr_type(daddr);
        dst.addr = daddr;
        dst.ifindex = dst_dev ? dst_dev->ifindex : 0;
        dst.scope = __ipv6_addr_src_scope(dst_type);
        dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex);
        dst.prefs = prefs;

        /* Start with an empty "best" slot: no rule evaluated, no candidate. */
        scores[hiscore_idx].rule = -1;
        scores[hiscore_idx].ifa = NULL;

        rcu_read_lock();

        /* Candidate Source Address (section 4)
         *  - multicast and link-local destination address,
         *    the set of candidate source address MUST only
         *    include addresses assigned to interfaces
         *    belonging to the same link as the outgoing
         *    interface.
         * (- For site-local destination addresses, the
         *    set of candidate source addresses MUST only
         *    include addresses assigned to interfaces
         *    belonging to the same site as the outgoing
         *    interface.)
         *  - "It is RECOMMENDED that the candidate source addresses
         *    be the set of unicast addresses assigned to the
         *    interface that will be used to send to the destination
         *    (the 'outgoing' interface)." (RFC 6724)
         */
        if (dst_dev) {
                idev = __in6_dev_get(dst_dev);
                if ((dst_type & IPV6_ADDR_MULTICAST) ||
                    dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
                    (idev && READ_ONCE(idev->cnf.use_oif_addrs_only))) {
                        use_oif_addr = true;
                }
        }

        /*
         * use_oif_addr can only be true when dst_dev was non-NULL, so idev
         * has been assigned whenever it is read in the first branch.
         */
        if (use_oif_addr) {
                if (idev)
                        hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
        } else {
                const struct net_device *master;
                int master_idx = 0;

                /* if dst_dev exists and is enslaved to an L3 device, then
                 * prefer addresses from dst_dev and then the master over
                 * any other enslaved devices in the L3 domain.
                 */
                master = l3mdev_master_dev_rcu(dst_dev);
                if (master) {
                        master_idx = master->ifindex;

                        hiscore_idx = ipv6_get_saddr_master(net, dst_dev,
                                                            master, &dst,
                                                            scores, hiscore_idx);

                        /*
                         * Stop here if a candidate was found whose cached
                         * scope result is non-negative.
                         */
                        if (scores[hiscore_idx].ifa &&
                            scores[hiscore_idx].scopedist >= 0)
                                goto out;
                }

                for_each_netdev_rcu(net, dev) {
                        /* only consider addresses on devices in the
                         * same L3 domain
                         */
                        if (l3mdev_master_ifindex_rcu(dev) != master_idx)
                                continue;
                        idev = __in6_dev_get(dev);
                        if (!idev)
                                continue;
                        hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx);
                }
        }

out:
        hiscore = &scores[hiscore_idx];
        if (!hiscore->ifa)
                ret = -EADDRNOTAVAIL;
        else
                *saddr = hiscore->ifa->addr;

        rcu_read_unlock();
        return ret;
}
EXPORT_SYMBOL(ipv6_dev_get_saddr);

/*
 * Find a link-scope address on @idev that has none of @banned_flags set.
 *
 * The address list is walked from the tail; the walk stops at the first
 * entry whose scope is greater than IFA_LINK.
 *
 * Returns 0 and copies the address to @addr on success, -EADDRNOTAVAIL
 * otherwise. No locking is done here.
 */
static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
                              u32 banned_flags)
{
        struct inet6_ifaddr *cur;

        list_for_each_entry_reverse(cur, &idev->addr_list, if_list) {
                if (cur->scope > IFA_LINK)
                        return -EADDRNOTAVAIL;

                if (cur->scope != IFA_LINK)
                        continue;

                if (cur->flags & banned_flags)
                        continue;

                *addr = cur->addr;
                return 0;
        }

        return -EADDRNOTAVAIL;
}

/*
 * Locked wrapper around __ipv6_get_lladdr(): looks up the inet6_dev of
 * @dev under RCU and searches its address list with idev->lock read-held.
 *
 * Returns 0 and fills @addr on success, -EADDRNOTAVAIL when @dev has no
 * inet6_dev or no suitable link-scope address.
 */
int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
                    u32 banned_flags)
{
        struct inet6_dev *in6;
        int rc;

        rcu_read_lock();

        in6 = __in6_dev_get(dev);
        if (!in6) {
                rc = -EADDRNOTAVAIL;
                goto unlock;
        }

        read_lock_bh(&in6->lock);
        rc = __ipv6_get_lladdr(in6, addr, banned_flags);
        read_unlock_bh(&in6->lock);

unlock:
        rcu_read_unlock();
        return rc;
}

static int ipv6_count_addresses(const struct inet6_dev *idev)
{
        const struct inet6_ifaddr *ifp;
        int cnt = 0;

        rcu_read_lock();
        list_for_each_entry_rcu(ifp, &idev->addr_list, if_list)
                cnt++;
        rcu_read_unlock();
        return cnt;
}

int ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
                  const struct net_device *dev, int strict)
{
        return ipv6_chk_addr_and_flags(net, addr, dev, !dev,
                                       strict, IFA_F_TENTATIVE);
}
EXPORT_SYMBOL(ipv6_chk_addr);

/* Look @addr up in the per-netns address hash.
 *
 * @dev selects the L3 domain: only addresses on devices with the same
 * L3 master as @dev are considered. What happens next depends on
 * @skip_dev_check:
 *   - true:  any device in that L3 domain may carry the address;
 *   - false: the address must be on @dev itself, unless it is neither
 *            link- nor host-scoped and @strict is 0.
 *
 * Addresses having any of @banned_flags set are ignored. For this test
 * an optimistic address is treated as not tentative, so optimistic
 * addresses have to be banned explicitly.
 *
 * Returns the device carrying the address, or NULL. The pointer is
 * returned after rcu_read_unlock(), so the caller is presumably expected
 * to provide its own protection if it dereferences it.
 */
static struct net_device *
__ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                          const struct net_device *dev, bool skip_dev_check,
                          int strict, u32 banned_flags)
{
        unsigned int hash = inet6_addr_hash(net, addr);
        struct net_device *l3mdev, *ndev;
        struct net_device *found = NULL;
        struct inet6_ifaddr *ifp;
        u32 ifp_flags;

        rcu_read_lock();

        l3mdev = l3mdev_master_dev_rcu(dev);
        if (skip_dev_check)
                dev = NULL;

        hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                ndev = ifp->idev->dev;

                /* Wrong L3 domain. */
                if (l3mdev_master_dev_rcu(ndev) != l3mdev)
                        continue;

                /* Decouple optimistic from tentative for evaluation here.
                 * Ban optimistic addresses explicitly, when required.
                 */
                if (ifp->flags & IFA_F_OPTIMISTIC)
                        ifp_flags = ifp->flags & ~IFA_F_TENTATIVE;
                else
                        ifp_flags = ifp->flags;

                if (!ipv6_addr_equal(&ifp->addr, addr))
                        continue;

                if (ifp_flags & banned_flags)
                        continue;

                /* A specific device was requested and this is another one:
                 * accept only non-link/non-host scope in non-strict mode.
                 */
                if (dev && ndev != dev &&
                    ((ifp->scope & (IFA_LINK|IFA_HOST)) || strict))
                        continue;

                found = ndev;
                break;
        }

        rcu_read_unlock();
        return found;
}

/*
 * Boolean front end for __ipv6_chk_addr_and_flags(): returns 1 when a
 * matching address was found, 0 otherwise. All arguments are passed
 * through unchanged.
 */
int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
                            const struct net_device *dev, bool skip_dev_check,
                            int strict, u32 banned_flags)
{
        struct net_device *owner;

        owner = __ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check,
                                          strict, banned_flags);
        if (owner)
                return 1;

        return 0;
}
EXPORT_SYMBOL(ipv6_chk_addr_and_flags);


/* Compares an address/prefix_len with addresses on device @dev.
 * If one is found it returns true.
 */
bool ipv6_chk_custom_prefix(const struct in6_addr *addr,
        const unsigned int prefix_len, struct net_device *dev)
{
        const struct inet6_ifaddr *ifa;
        const struct inet6_dev *idev;
        bool ret = false;

        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
                list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len);
                        if (ret)
                                break;
                }
        }
        rcu_read_unlock();

        return ret;
}
EXPORT_SYMBOL(ipv6_chk_custom_prefix);

int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
{
        const struct inet6_ifaddr *ifa;
        const struct inet6_dev *idev;
        int        onlink;

        onlink = 0;
        rcu_read_lock();
        idev = __in6_dev_get(dev);
        if (idev) {
                list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        onlink = ipv6_prefix_equal(addr, &ifa->addr,
                                                   ifa->prefix_len);
                        if (onlink)
                                break;
                }
        }
        rcu_read_unlock();
        return onlink;
}
EXPORT_SYMBOL(ipv6_chk_prefix);

/**
 * ipv6_dev_find - find the first device with a given source address.
 * @net: the net namespace
 * @addr: the source address
 * @dev: used to find the L3 domain of interest
 *
 * The caller should be protected by RCU, or RTNL.
 */
struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
                                 struct net_device *dev)
{
        return __ipv6_chk_addr_and_flags(net, addr, dev, !dev, 1,
                                         IFA_F_TENTATIVE);
}
EXPORT_SYMBOL(ipv6_dev_find);

/*
 * Look up the inet6_ifaddr for @addr and return it with a reference held.
 *
 * With a non-NULL @dev the address must be on that device, unless it is
 * neither link- nor host-scoped and @strict is 0. An entry on which
 * in6_ifa_hold_safe() fails is skipped and the search continues.
 *
 * Returns the referenced address, or NULL if nothing matched.
 */
struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
                                     struct net_device *dev, int strict)
{
        unsigned int hash = inet6_addr_hash(net, addr);
        struct inet6_ifaddr *cur, *result = NULL;

        rcu_read_lock();
        hlist_for_each_entry_rcu(cur, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (!ipv6_addr_equal(&cur->addr, addr))
                        continue;

                /* Specific device requested, and this entry is elsewhere. */
                if (dev && cur->idev->dev != dev &&
                    ((cur->scope & (IFA_LINK|IFA_HOST)) || strict))
                        continue;

                if (!in6_ifa_hold_safe(cur))
                        continue;

                result = cur;
                break;
        }
        rcu_read_unlock();

        return result;
}

/*
 * Stop DAD on @ifp. Gets a referenced address; the reference is consumed
 * on every path, either by ipv6_del_addr() or by in6_ifa_put().
 *
 * @dad_failed: non-zero when DAD failed; the address is then flagged
 *              IFA_F_DADFAILED.
 *
 * Three outcomes:
 *  - temporary address: a replacement is requested from its public
 *    address (if any) and the address is deleted;
 *  - permanent address, or DAD did not fail: the address is kept but
 *    marked tentative again;
 *  - otherwise: the address is deleted.
 */

static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
{
        if (dad_failed)
                ifp->flags |= IFA_F_DADFAILED;

        if (ifp->flags&IFA_F_TEMPORARY) {
                struct inet6_ifaddr *ifpub;
                spin_lock_bh(&ifp->lock);
                ifpub = ifp->ifpub;
                if (ifpub) {
                        /*
                         * Pin the public address before dropping the lock
                         * so it can be used outside of it.
                         */
                        in6_ifa_hold(ifpub);
                        spin_unlock_bh(&ifp->lock);
                        ipv6_create_tempaddr(ifpub, true);
                        in6_ifa_put(ifpub);
                } else {
                        spin_unlock_bh(&ifp->lock);
                }
                ipv6_del_addr(ifp);
        } else if (ifp->flags&IFA_F_PERMANENT || !dad_failed) {
                spin_lock_bh(&ifp->lock);
                addrconf_del_dad_work(ifp);
                ifp->flags |= IFA_F_TENTATIVE;
                /* A failed address must no longer be treated as optimistic. */
                if (dad_failed)
                        ifp->flags &= ~IFA_F_OPTIMISTIC;
                spin_unlock_bh(&ifp->lock);
                if (dad_failed)
                        ipv6_ifa_notify(0, ifp);
                in6_ifa_put(ifp);
        } else {
                ipv6_del_addr(ifp);
        }
}

/*
 * Move @ifp from the DAD state to the POSTDAD state, under ifp->lock.
 *
 * Returns 0 when the transition was made, -ENOENT when the address was
 * not in INET6_IFADDR_STATE_DAD (its state is then left unchanged).
 */
static int addrconf_dad_end(struct inet6_ifaddr *ifp)
{
        int rc;

        spin_lock_bh(&ifp->lock);
        if (ifp->state != INET6_IFADDR_STATE_DAD) {
                rc = -ENOENT;
        } else {
                ifp->state = INET6_IFADDR_STATE_POSTDAD;
                rc = 0;
        }
        spin_unlock_bh(&ifp->lock);

        return rc;
}

/*
 * Handle a DAD failure reported for @ifp; @skb is the packet that revealed
 * the duplicate (its Ethernet source address is only used for logging).
 *
 * The reference on @ifp passed in by the caller is dropped before returning.
 * If @ifp is not in the DAD state (addrconf_dad_end() fails) nothing else is
 * done. Otherwise, for IFA_F_STABLE_PRIVACY addresses a replacement address
 * is generated with an incremented retry counter and queued for DAD, and in
 * all cases @ifp is moved to INET6_IFADDR_STATE_ERRDAD and its DAD work is
 * scheduled immediately.
 */
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        struct net *net = dev_net(idev->dev);
        int max_addresses;

        /* Only proceed if this call performed the DAD -> POSTDAD transition. */
        if (addrconf_dad_end(ifp)) {
                in6_ifa_put(ifp);
                return;
        }

        net_info_ratelimited("%s: IPv6 duplicate address %pI6c used by %pM detected!\n",
                             ifp->idev->dev->name, &ifp->addr, eth_hdr(skb)->h_source);

        spin_lock_bh(&ifp->lock);

        if (ifp->flags & IFA_F_STABLE_PRIVACY) {
                struct in6_addr new_addr;
                struct inet6_ifaddr *ifp2;
                int retries = ifp->stable_privacy_retry + 1;
                /* The replacement inherits prefix length, flags, lifetimes
                 * and scope from the failed address (snapshot taken under
                 * ifp->lock).
                 */
                struct ifa6_config cfg = {
                        .pfx = &new_addr,
                        .plen = ifp->prefix_len,
                        .ifa_flags = ifp->flags,
                        .valid_lft = ifp->valid_lft,
                        .preferred_lft = ifp->prefered_lft,
                        .scope = ifp->scope,
                };

                /* Give up once the idgen_retries sysctl limit is exceeded. */
                if (retries > net->ipv6.sysctl.idgen_retries) {
                        net_info_ratelimited("%s: privacy stable address generation failed because of DAD conflicts!\n",
                                             ifp->idev->dev->name);
                        goto errdad;
                }

                new_addr = ifp->addr;
                if (ipv6_generate_stable_address(&new_addr, retries,
                                                 idev))
                        goto errdad;

                /* ifp->lock is released while the new address is added;
                 * every exit from here on goes through lock_errdad to
                 * re-take it.
                 */
                spin_unlock_bh(&ifp->lock);

                /* max_addresses == 0 means no limit is enforced here. */
                max_addresses = READ_ONCE(idev->cnf.max_addresses);
                if (max_addresses &&
                    ipv6_count_addresses(idev) >= max_addresses)
                        goto lock_errdad;

                net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
                                     ifp->idev->dev->name);

                ifp2 = ipv6_add_addr(idev, &cfg, false, NULL);
                if (IS_ERR(ifp2))
                        goto lock_errdad;

                spin_lock_bh(&ifp2->lock);
                ifp2->stable_privacy_retry = retries;
                ifp2->state = INET6_IFADDR_STATE_PREDAD;
                spin_unlock_bh(&ifp2->lock);

                /* Queue DAD for the new address after the idgen_delay sysctl
                 * interval, then drop the reference held on ifp2.
                 */
                addrconf_mod_dad_work(ifp2, net->ipv6.sysctl.idgen_delay);
                in6_ifa_put(ifp2);
lock_errdad:
                spin_lock_bh(&ifp->lock);
        }

errdad:
        /* transition from _POSTDAD to _ERRDAD */
        ifp->state = INET6_IFADDR_STATE_ERRDAD;
        spin_unlock_bh(&ifp->lock);

        /* Run the DAD work for the failed address without delay. */
        addrconf_mod_dad_work(ifp, 0);
        in6_ifa_put(ifp);
}

/* Join to solicited addr multicast group.
 * caller must hold RTNL */
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
{
        struct in6_addr maddr;

        if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
                return;

        addrconf_addr_solict_mult(addr, &maddr);
        ipv6_dev_mc_inc(dev, &maddr);
}

/* caller must hold RTNL */
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
{
        struct in6_addr maddr;

        if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
                return;

        addrconf_addr_solict_mult(addr, &maddr);
        __ipv6_dev_mc_dec(idev, &maddr);
}

/* caller must hold RTNL */
static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
        struct in6_addr addr;

        if (ifp->prefix_len >= 127) /* RFC 6164 */
                return;
        ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
        if (ipv6_addr_any(&addr))
                return;
        __ipv6_dev_ac_inc(ifp->idev, &addr);
}

/* caller must hold RTNL */
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
        struct in6_addr addr;

        if (ifp->prefix_len >= 127) /* RFC 6164 */
                return;
        ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len);
        if (ipv6_addr_any(&addr))
                return;
        __ipv6_dev_ac_dec(ifp->idev, &addr);
}

/* Build an 8-byte interface identifier in @eui from the hardware address of
 * a 6LoWPAN device.
 *
 * ETH_ALEN-sized addresses are expanded by inserting FF:FE between their
 * first and last three bytes. EUI64_ADDR_LEN-sized addresses are copied
 * verbatim and bit 0x02 of the first byte is inverted.
 *
 * Returns 0 on success, -1 for any other address length.
 */
static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len == ETH_ALEN) {
                memcpy(eui, dev->dev_addr, 3);
                memcpy(eui + 5, dev->dev_addr + 3, 3);
                eui[3] = 0xFF;
                eui[4] = 0xFE;
                return 0;
        }

        if (dev->addr_len == EUI64_ADDR_LEN) {
                memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN);
                eui[0] ^= 2;
                return 0;
        }

        return -1;
}

/* Build an interface identifier in @eui from the uniq_id field of an
 * IEEE 1394 hardware address, inverting bit 0x02 of the first byte.
 *
 * Returns 0 on success, -1 when the device address length is not FWNET_ALEN.
 */
static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
{
        const union fwnet_hwaddr *hwaddr;

        if (dev->addr_len == FWNET_ALEN) {
                hwaddr = (const union fwnet_hwaddr *)dev->dev_addr;
                memcpy(eui, &hwaddr->uc.uniq_id, sizeof(hwaddr->uc.uniq_id));
                eui[0] ^= 2;
                return 0;
        }

        return -1;
}

/* Build an interface identifier in @eui for an ARCnet device: seven zero
 * bytes followed by the first byte of the hardware address.
 *
 * Returns 0 on success, -1 when the device address length is not ARCNET_ALEN.
 */
static int addrconf_ifid_arcnet(u8 *eui, struct net_device *dev)
{
        /* XXX: inherit EUI-64 from other interface -- yoshfuji */
        if (dev->addr_len == ARCNET_ALEN) {
                eui[7] = *(u8 *)dev->dev_addr;
                memset(eui, 0, 7);
                return 0;
        }

        return -1;
}

/* Build an interface identifier in @eui from bytes 12..19 of an InfiniBand
 * hardware address, with bit 0x02 of the first byte forced on.
 *
 * Returns 0 on success, -1 when the device address length is not
 * INFINIBAND_ALEN.
 */
static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev)
{
        if (dev->addr_len == INFINIBAND_ALEN) {
                memcpy(eui, dev->dev_addr + 12, 8);
                eui[0] |= 2;
                return 0;
        }

        return -1;
}

/* Build an ISATAP-style interface identifier in @eui from IPv4 address @addr.
 *
 * Layout: eui[0] is 0x00 when @addr matches one of the special IPv4 classes
 * tested below and 0x02 otherwise; eui[1..3] are 00:5E:FE; eui[4..7] hold
 * @addr as stored (network byte order).
 *
 * Returns 0 on success, -1 when @addr is zero (@eui is then untouched).
 */
static int __ipv6_isatap_ifid(u8 *eui, __be32 addr)
{
        u8 first = 0x02;

        if (!addr)
                return -1;

        if (ipv4_is_zeronet(addr) || ipv4_is_private_10(addr) ||
            ipv4_is_loopback(addr) || ipv4_is_linklocal_169(addr) ||
            ipv4_is_private_172(addr) || ipv4_is_test_192(addr) ||
            ipv4_is_anycast_6to4(addr) || ipv4_is_private_192(addr) ||
            ipv4_is_test_198(addr) || ipv4_is_multicast(addr) ||
            ipv4_is_lbcast(addr))
                first = 0x00;

        eui[0] = first;
        eui[1] = 0;
        eui[2] = 0x5E;
        eui[3] = 0xFE;
        memcpy(eui + 4, &addr, 4);

        return 0;
}

/* Interface identifier for a SIT device: only devices with IFF_ISATAP set in
 * priv_flags get one, derived from the first four bytes of the device
 * address via __ipv6_isatap_ifid(). Returns -1 otherwise.
 */
static int addrconf_ifid_sit(u8 *eui, struct net_device *dev)
{
        if (!(dev->priv_flags & IFF_ISATAP))
                return -1;

        return __ipv6_isatap_ifid(eui, *(__be32 *)dev->dev_addr);
}

/* Interface identifier for a GRE/IPIP-style device: treat the first four
 * bytes of the device address as an IPv4 address and delegate to
 * __ipv6_isatap_ifid(), whose result is returned unchanged.
 */
static int addrconf_ifid_gre(u8 *eui, struct net_device *dev)
{
        __be32 v4addr = *(__be32 *)dev->dev_addr;

        return __ipv6_isatap_ifid(eui, v4addr);
}

/* Build an interface identifier in @eui from the first six bytes of the
 * device's perm_addr: FF:FE is inserted between the two three-byte halves
 * and bit 0x02 of the first byte is inverted. Always returns 0.
 */
static int addrconf_ifid_ip6tnl(u8 *eui, struct net_device *dev)
{
        memcpy(eui, dev->perm_addr, 3);
        eui[0] ^= 2;
        eui[3] = 0xFF;
        eui[4] = 0xFE;
        memcpy(eui + 5, dev->perm_addr + 3, 3);

        return 0;
}

/* Generate an interface identifier for @dev into @eui by dispatching on the
 * device's ARPHRD_* type to the matching addrconf_ifid_*() helper.
 *
 * Returns the helper's result, or -1 for device types not listed here.
 */
static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
{
        int ret = -1;

        switch (dev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_FDDI:
                ret = addrconf_ifid_eui48(eui, dev);
                break;
        case ARPHRD_ARCNET:
                ret = addrconf_ifid_arcnet(eui, dev);
                break;
        case ARPHRD_INFINIBAND:
                ret = addrconf_ifid_infiniband(eui, dev);
                break;
        case ARPHRD_SIT:
                ret = addrconf_ifid_sit(eui, dev);
                break;
        case ARPHRD_IPGRE:
        case ARPHRD_TUNNEL:
                ret = addrconf_ifid_gre(eui, dev);
                break;
        case ARPHRD_6LOWPAN:
                ret = addrconf_ifid_6lowpan(eui, dev);
                break;
        case ARPHRD_IEEE1394:
                ret = addrconf_ifid_ieee1394(eui, dev);
                break;
        case ARPHRD_TUNNEL6:
        case ARPHRD_IP6GRE:
        case ARPHRD_RAWIP:
                ret = addrconf_ifid_ip6tnl(eui, dev);
                break;
        }

        return ret;
}

/* Copy the low 8 bytes of an existing link-scope, non-tentative address of
 * @idev into @eui.
 *
 * The address list is walked in reverse under idev->lock (read side); the
 * walk stops at the first entry whose scope is greater than IFA_LINK.
 * Returns 0 when an identifier was copied, -1 otherwise.
 */
static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
{
        struct inet6_ifaddr *cur;
        int ret = -1;

        read_lock_bh(&idev->lock);
        list_for_each_entry_reverse(cur, &idev->addr_list, if_list) {
                if (cur->scope > IFA_LINK)
                        break;
                if (cur->scope != IFA_LINK || (cur->flags & IFA_F_TENTATIVE))
                        continue;

                memcpy(eui, cur->addr.s6_addr + 8, 8);
                ret = 0;
                break;
        }
        read_unlock_bh(&idev->lock);

        return ret;
}

/* Generation of a randomized Interface Identifier
 * draft-ietf-6man-rfc4941bis, Section 3.3.1
 *
 * Fills the low 8 bytes of @addr with random data and retries until the
 * result is none of the reserved identifiers checked below. The upper
 * 8 bytes of @addr are not modified.
 */

static void ipv6_gen_rnd_iid(struct in6_addr *addr)
{
        unsigned int hi, lo;

        for (;;) {
                get_random_bytes(&addr->s6_addr[8], 8);

                /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1:
                 * check if generated address is not inappropriate:
                 *
                 * - Reserved IPv6 Interface Identifiers
                 * - XXX: already assigned to an address on the device
                 */
                hi = ntohl(addr->s6_addr32[2]);
                lo = ntohl(addr->s6_addr32[3]);

                /* Subnet-router anycast: 0000:0000:0000:0000 */
                if (!(hi | lo))
                        continue;

                /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212
                 * Proxy Mobile IPv6:   0200:5EFF:FE00:5213
                 * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF
                 */
                if (hi == 0x02005eff && (lo & 0xff000000) == 0xfe000000)
                        continue;

                /* Reserved subnet anycast addresses */
                if (hi == 0xfdffffff && lo >= 0xffffff80)
                        continue;

                return;
        }
}

/*
 *        Add prefix route.
 *
 *        Installs a unicast route for @pfx/@plen on @dev. The table is the
 *        device's l3mdev table when there is one, else RT6_TABLE_PREFIX.
 *        A zero @metric selects IP6_RT_PRIO_ADDRCONF. @flags are OR-ed with
 *        RTF_UP. The result of ip6_route_add() is not reported to the caller.
 */

static void
addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
                      struct net_device *dev, unsigned long expires,
                      u32 flags, gfp_t gfp_flags)
{
        struct fib6_config cfg = {
                .fc_dst = *pfx,
                .fc_dst_len = plen,
                .fc_ifindex = dev->ifindex,
                .fc_expires = expires,
                .fc_flags = RTF_UP | flags,
                .fc_type = RTN_UNICAST,
                .fc_protocol = RTPROT_KERNEL,
                .fc_nlinfo.nl_net = dev_net(dev),
        };
        u32 l3_table = l3mdev_fib_table(dev);

        cfg.fc_table = l3_table ? l3_table : RT6_TABLE_PREFIX;
        cfg.fc_metric = metric ? metric : IP6_RT_PRIO_ADDRCONF;

        /* Prevent useless cloning on PtP SIT.
           This thing is done here expecting that the whole
           class of non-broadcast devices need not cloning.
         */
#if IS_ENABLED(CONFIG_IPV6_SIT)
        if ((dev->flags & IFF_POINTOPOINT) && dev->type == ARPHRD_SIT)
                cfg.fc_flags |= RTF_NONEXTHOP;
#endif

        ip6_route_add(&cfg, gfp_flags, NULL);
}


/* Look up the route for @pfx/@plen on @dev.
 *
 * The lookup uses the device's l3mdev table when there is one, else
 * RT6_TABLE_PREFIX. A route matches when it uses the builtin nexthop, its
 * nexthop device has the same ifindex as @dev, all bits of @flags are set
 * and no bit of @noflags is set in its fib6_flags, and (when @no_gw) it has
 * no gateway family set.
 *
 * Returns the first matching route on which fib6_info_hold_safe() succeeded,
 * or NULL when the table, the node or a matching route is missing.
 */
static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
                                                  int plen,
                                                  const struct net_device *dev,
                                                  u32 flags, u32 noflags,
                                                  bool no_gw)
{
        u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
        struct fib6_info *rt = NULL;
        struct fib6_table *table;
        struct fib6_node *fn;

        table = fib6_get_table(dev_net(dev), tb_id);
        if (!table)
                return NULL;

        rcu_read_lock();
        fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
        if (fn) {
                for_each_fib6_node_rt_rcu(fn) {
                        /* prefix routes only use builtin fib6_nh */
                        if (rt->nh)
                                continue;

                        if (rt->fib6_nh->fib_nh_dev->ifindex == dev->ifindex &&
                            !(no_gw && rt->fib6_nh->fib_nh_gw_family) &&
                            (rt->fib6_flags & flags) == flags &&
                            (rt->fib6_flags & noflags) == 0 &&
                            fib6_info_hold_safe(rt))
                                break;
                }
        }
        rcu_read_unlock();

        return rt;
}


/* Create "default" multicast route to the interface */

static void addrconf_add_mroute(struct net_device *dev)
{
        struct fib6_config cfg = {
                .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_LOCAL,
                .fc_metric = IP6_RT_PRIO_ADDRCONF,
                .fc_ifindex = dev->ifindex,
                .fc_dst_len = 8,
                .fc_flags = RTF_UP,
                .fc_type = RTN_MULTICAST,
                .fc_nlinfo.nl_net = dev_net(dev),
                .fc_protocol = RTPROT_KERNEL,
        };

        ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);

        ip6_route_add(&cfg, GFP_KERNEL, NULL);
}

/* Get the inet6_dev for @dev via ipv6_find_idev() and make sure the default
 * multicast route exists. Requires RTNL (asserted).
 *
 * Returns the inet6_dev, the ERR_PTR from ipv6_find_idev(), or
 * ERR_PTR(-EACCES) when disable_ipv6 is set for the device. The multicast
 * route is not added for loopback or l3-master devices.
 */
static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
{
        struct inet6_dev *in6;

        ASSERT_RTNL();

        in6 = ipv6_find_idev(dev);
        if (IS_ERR(in6))
                return in6;

        if (in6->cnf.disable_ipv6)
                return ERR_PTR(-EACCES);

        if ((dev->flags & IFF_LOOPBACK) || netif_is_l3_master(dev))
                return in6;

        /* Add default multicast route */
        addrconf_add_mroute(dev);

        return in6;
}

/* Delete every temporary address of @idev whose public address is @ifp.
 *
 * idev->lock (write side) protects the list walk; it is dropped around each
 * ipv6_del_addr() call, for which a reference on the entry is taken first.
 */
static void delete_tempaddrs(struct inet6_dev *idev,
                             struct inet6_ifaddr *ifp)
{
        struct inet6_ifaddr *cur, *next;

        write_lock_bh(&idev->lock);
        list_for_each_entry_safe(cur, next, &idev->tempaddr_list, tmp_list) {
                if (cur->ifpub == ifp) {
                        in6_ifa_hold(cur);
                        write_unlock_bh(&idev->lock);
                        ipv6_del_addr(cur);
                        write_lock_bh(&idev->lock);
                }
        }
        write_unlock_bh(&idev->lock);
}

/*
 * Refresh the lifetimes of all temporary addresses of @idev that belong to
 * public address @ifp, and optionally create a new temporary address.
 *
 * @valid_lft/@prefered_lft: lifetimes to apply; they are clamped per
 *      temporary address by its age and the temp_valid_lft /
 *      temp_prefered_lft settings. Note that the clamped values are written
 *      back to the local parameters, so they carry over to later list
 *      entries and to the "create" decision at the end.
 * @create: request creation of a temporary address; also forced on when the
 *      temporary list is empty and either lifetime is non-zero. Creation
 *      only happens when use_tempaddr is greater than zero.
 * @now: timestamp (jiffies) used for age computation and stored as tstamp.
 *
 * idev->lock is held for reading during the walk and released before
 * ipv6_create_tempaddr() is called.
 */
static void manage_tempaddrs(struct inet6_dev *idev,
                             struct inet6_ifaddr *ifp,
                             __u32 valid_lft, __u32 prefered_lft,
                             bool create, unsigned long now)
{
        u32 flags;
        struct inet6_ifaddr *ift;

        read_lock_bh(&idev->lock);
        /* update all temporary addresses in the list */
        list_for_each_entry(ift, &idev->tempaddr_list, tmp_list) {
                int age, max_valid, max_prefered;

                if (ifp != ift->ifpub)
                        continue;

                /* RFC 4941 section 3.3:
                 * If a received option will extend the lifetime of a public
                 * address, the lifetimes of temporary addresses should
                 * be extended, subject to the overall constraint that no
                 * temporary addresses should ever remain "valid" or "preferred"
                 * for a time longer than (TEMP_VALID_LIFETIME) or
                 * (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively.
                 */
                /* Age in seconds since the temporary address was created. */
                age = (now - ift->cstamp) / HZ;
                max_valid = READ_ONCE(idev->cnf.temp_valid_lft) - age;
                if (max_valid < 0)
                        max_valid = 0;

                max_prefered = READ_ONCE(idev->cnf.temp_prefered_lft) -
                               idev->desync_factor - age;
                if (max_prefered < 0)
                        max_prefered = 0;

                if (valid_lft > max_valid)
                        valid_lft = max_valid;

                if (prefered_lft > max_prefered)
                        prefered_lft = max_prefered;

                spin_lock(&ift->lock);
                /* Snapshot the flags before they are modified below. */
                flags = ift->flags;
                ift->valid_lft = valid_lft;
                ift->prefered_lft = prefered_lft;
                ift->tstamp = now;
                if (prefered_lft > 0)
                        ift->flags &= ~IFA_F_DEPRECATED;

                spin_unlock(&ift->lock);
                /* No notification for addresses that were still tentative. */
                if (!(flags&IFA_F_TENTATIVE))
                        ipv6_ifa_notify(0, ift);
        }

        /* Also create a temporary address if it's enabled but no temporary
         * address currently exists.
         * However, we get called with valid_lft == 0, prefered_lft == 0, create == false
         * as part of cleanup (ie. deleting the mngtmpaddr).
         * We don't want that to result in creating a new temporary ip address.
         */
        if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft))
                create = true;

        if (create && READ_ONCE(idev->cnf.use_tempaddr) > 0) {
                /* When a new public address is created as described
                 * in [ADDRCONF], also create a new temporary address.
                 */
                read_unlock_bh(&idev->lock);
                ipv6_create_tempaddr(ifp, false);
        } else {
                read_unlock_bh(&idev->lock);
        }
}

/* Tell whether @idev's addr_gen_mode is one of the two modes
 * (IN6_ADDR_GEN_MODE_STABLE_PRIVACY, IN6_ADDR_GEN_MODE_RANDOM) for which
 * stable addresses are generated.
 */
static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
{
        const int mode = idev->cnf.addr_gen_mode;

        return mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
               mode == IN6_ADDR_GEN_MODE_RANDOM;
}

/*
 * Create or refresh the autoconfigured address @addr on @dev in response to
 * a received prefix information option (@pinfo).
 *
 * If the address does not exist yet and @valid_lft is non-zero, it is added
 * (subject to the max_addresses limit), flagged IFA_F_MANAGETEMPADDR and DAD
 * is started. If the address already existed and still has remaining
 * lifetime, its valid/preferred lifetimes are updated. In both cases the
 * temporary addresses attached to it are managed and address verification is
 * triggered.
 *
 * Returns 0 on success (including "nothing to do"), -1 when a new address
 * was needed but could not be created.
 */
int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
                                 const struct prefix_info *pinfo,
                                 struct inet6_dev *in6_dev,
                                 const struct in6_addr *addr, int addr_type,
                                 u32 addr_flags, bool sllao, bool tokenized,
                                 __u32 valid_lft, u32 prefered_lft)
{
        struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1);
        int create = 0, update_lft = 0;

        if (!ifp && valid_lft) {
                int max_addresses = READ_ONCE(in6_dev->cnf.max_addresses);
                struct ifa6_config cfg = {
                        .pfx = addr,
                        .plen = pinfo->prefix_len,
                        .ifa_flags = addr_flags,
                        .valid_lft = valid_lft,
                        .preferred_lft = prefered_lft,
                        .scope = addr_type & IPV6_ADDR_SCOPE_MASK,
                        .ifa_proto = IFAPROT_KERNEL_RA
                };

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
                /* Optimistic DAD needs the sysctl enabled (globally or per
                 * device), forwarding disabled and @sllao set.
                 */
                if ((READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
                     READ_ONCE(in6_dev->cnf.optimistic_dad)) &&
                    !net->ipv6.devconf_all->forwarding && sllao)
                        cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif

                /* Do not allow to create too much of autoconfigured
                 * addresses; this would be too easy way to crash kernel.
                 */
                if (!max_addresses ||
                    ipv6_count_addresses(in6_dev) < max_addresses)
                        ifp = ipv6_add_addr(in6_dev, &cfg, false, NULL);

                /* ifp is still NULL here when the limit was hit, or an
                 * ERR_PTR when ipv6_add_addr() failed.
                 */
                if (IS_ERR_OR_NULL(ifp))
                        return -1;

                create = 1;
                spin_lock_bh(&ifp->lock);
                ifp->flags |= IFA_F_MANAGETEMPADDR;
                ifp->cstamp = jiffies;
                ifp->tokenized = tokenized;
                spin_unlock_bh(&ifp->lock);
                addrconf_dad_start(ifp);
        }

        if (ifp) {
                u32 flags;
                unsigned long now;
                u32 stored_lft;

                /* update lifetime (RFC2462 5.5.3 e) */
                spin_lock_bh(&ifp->lock);
                now = jiffies;
                /* Remaining valid lifetime in seconds, floored at zero. */
                if (ifp->valid_lft > (now - ifp->tstamp) / HZ)
                        stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
                else
                        stored_lft = 0;

                /* RFC4862 Section 5.5.3e:
                 * "Note that the preferred lifetime of the
                 *  corresponding address is always reset to
                 *  the Preferred Lifetime in the received
                 *  Prefix Information option, regardless of
                 *  whether the valid lifetime is also reset or
                 *  ignored."
                 *
                 * So we should always update prefered_lft here.
                 */
                /* Only pre-existing addresses with lifetime left are updated. */
                update_lft = !create && stored_lft;

                /* Unless ra_honor_pio_life is set, do not let the new valid
                 * lifetime drop below min(remaining, MIN_VALID_LIFETIME).
                 */
                if (update_lft && !READ_ONCE(in6_dev->cnf.ra_honor_pio_life)) {
                        const u32 minimum_lft = min_t(u32,
                                stored_lft, MIN_VALID_LIFETIME);
                        valid_lft = max(valid_lft, minimum_lft);
                }

                if (update_lft) {
                        ifp->valid_lft = valid_lft;
                        ifp->prefered_lft = prefered_lft;
                        WRITE_ONCE(ifp->tstamp, now);
                        /* Snapshot taken before IFA_F_DEPRECATED is cleared. */
                        flags = ifp->flags;
                        ifp->flags &= ~IFA_F_DEPRECATED;
                        spin_unlock_bh(&ifp->lock);

                        if (!(flags&IFA_F_TENTATIVE))
                                ipv6_ifa_notify(0, ifp);
                } else
                        spin_unlock_bh(&ifp->lock);

                manage_tempaddrs(in6_dev, ifp, valid_lft, prefered_lft,
                                 create, now);

                /* Drop the reference obtained from ipv6_get_ifaddr() or
                 * ipv6_add_addr().
                 */
                in6_ifa_put(ifp);
                addrconf_verify(net);
        }

        return 0;
}
EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);

/*
 * Process a prefix information option received on @dev.
 *
 * @opt/@len: the raw option and its length; options shorter than
 *            struct prefix_info are rejected.
 * @sllao:    passed through to the address-creation helpers.
 *
 * After validation the function (1) adds, refreshes or deletes the on-link
 * prefix route when the option's on-link flag is set, and (2) when the
 * autoconf flag is set and autoconfiguration is enabled on the device,
 * derives an address for the prefix and hands it to
 * addrconf_prefix_rcv_add_addr(). A prefix notification is sent unless an
 * error path jumped to "put".
 */
void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
        struct prefix_info *pinfo;
        struct fib6_table *table;
        __u32 valid_lft;
        __u32 prefered_lft;
        int addr_type, err;
        u32 addr_flags = 0;
        struct inet6_dev *in6_dev;
        struct net *net = dev_net(dev);
        bool ignore_autoconf = false;

        pinfo = (struct prefix_info *) opt;

        if (len < sizeof(struct prefix_info)) {
                netdev_dbg(dev, "addrconf: prefix option too short\n");
                return;
        }

        /*
         * Validation checks ([ADDRCONF], page 19)
         */

        addr_type = ipv6_addr_type(&pinfo->prefix);

        /* Multicast and link-local prefixes are silently ignored. */
        if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL))
                return;

        valid_lft = ntohl(pinfo->valid);
        prefered_lft = ntohl(pinfo->prefered);

        if (prefered_lft > valid_lft) {
                net_warn_ratelimited("addrconf: prefix option has invalid lifetime\n");
                return;
        }

        /* Takes a reference that is released at the "put" label. */
        in6_dev = in6_dev_get(dev);

        if (!in6_dev) {
                net_dbg_ratelimited("addrconf: device %s not configured\n",
                                    dev->name);
                return;
        }

        /* Ignore non-zero lifetimes below the accept_ra_min_lft setting. */
        if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft)
                goto put;

        /*
         * Two things going on here:
         * 1) Add routes for on-link prefixes
         * 2) Configure prefixes with the auto flag set
         */

        if (pinfo->onlink) {
                struct fib6_info *rt;
                unsigned long rt_expires;

                /* Avoid arithmetic overflow. Really, we could
                 * save rt_expires in seconds, like valid_lft,
                 * but it would require division in fib gc, which is
                 * not good.
                 */
                if (HZ > USER_HZ)
                        rt_expires = addrconf_timeout_fixup(valid_lft, HZ);
                else
                        rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ);

                /* Finite timeouts are converted from seconds to jiffies. */
                if (addrconf_finite_timeout(rt_expires))
                        rt_expires *= HZ;

                rt = addrconf_get_prefix_route(&pinfo->prefix,
                                               pinfo->prefix_len,
                                               dev,
                                               RTF_ADDRCONF | RTF_PREFIX_RT,
                                               RTF_DEFAULT, true);

                if (rt) {
                        /* Autoconf prefix route */
                        if (valid_lft == 0) {
                                /* rt is cleared after deletion so the
                                 * release below is passed NULL.
                                 */
                                ip6_del_rt(net, rt, false);
                                rt = NULL;
                        } else {
                                table = rt->fib6_table;
                                spin_lock_bh(&table->tb6_lock);

                                if (addrconf_finite_timeout(rt_expires)) {
                                        /* not infinity */
                                        fib6_set_expires(rt, jiffies + rt_expires);
                                        fib6_add_gc_list(rt);
                                } else {
                                        fib6_clean_expires(rt);
                                        fib6_remove_gc_list(rt);
                                }

                                spin_unlock_bh(&table->tb6_lock);
                        }
                } else if (valid_lft) {
                        /* No existing route: create one. */
                        clock_t expires = 0;
                        int flags = RTF_ADDRCONF | RTF_PREFIX_RT;
                        if (addrconf_finite_timeout(rt_expires)) {
                                /* not infinity */
                                flags |= RTF_EXPIRES;
                                expires = jiffies_to_clock_t(rt_expires);
                        }
                        addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len,
                                              0, dev, expires, flags,
                                              GFP_ATOMIC);
                }
                /* rt may be NULL here. */
                fib6_info_release(rt);
        }

        /* Try to figure out our local address for this prefix */

        /* With ra_honor_pio_pflag set, an option carrying the preferpd flag
         * suppresses address autoconfiguration.
         */
        ignore_autoconf = READ_ONCE(in6_dev->cnf.ra_honor_pio_pflag) && pinfo->preferpd;
        if (pinfo->autoconf && in6_dev->cnf.autoconf && !ignore_autoconf) {
                struct in6_addr addr;
                bool tokenized = false, dev_addr_generated = false;

                /* Only /64 prefixes are autoconfigured. The interface
                 * identifier comes from, in order of preference: the
                 * configured token, a stable-privacy address, an EUI-64
                 * generated from the device, or one inherited from an
                 * existing link-local address.
                 */
                if (pinfo->prefix_len == 64) {
                        memcpy(&addr, &pinfo->prefix, 8);

                        if (!ipv6_addr_any(&in6_dev->token)) {
                                read_lock_bh(&in6_dev->lock);
                                memcpy(addr.s6_addr + 8,
                                       in6_dev->token.s6_addr + 8, 8);
                                read_unlock_bh(&in6_dev->lock);
                                tokenized = true;
                        } else if (is_addr_mode_generate_stable(in6_dev) &&
                                   !ipv6_generate_stable_address(&addr, 0,
                                                                 in6_dev)) {
                                addr_flags |= IFA_F_STABLE_PRIVACY;
                                goto ok;
                        } else if (ipv6_generate_eui64(addr.s6_addr + 8, dev) &&
                                   ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) {
                                /* Both identifier sources failed. */
                                goto put;
                        } else {
                                dev_addr_generated = true;
                        }
                        goto ok;
                }
                net_dbg_ratelimited("IPv6 addrconf: prefix with wrong length %d\n",
                                    pinfo->prefix_len);
                goto put;

ok:
                err = addrconf_prefix_rcv_add_addr(net, dev, pinfo, in6_dev,
                                                   &addr, addr_type,
                                                   addr_flags, sllao,
                                                   tokenized, valid_lft,
                                                   prefered_lft);
                if (err)
                        goto put;

                /* Ignore error case here because previous prefix add addr was
                 * successful which will be notified.
                 */
                ndisc_ops_prefix_rcv_add_addr(net, dev, pinfo, in6_dev, &addr,
                                              addr_type, addr_flags, sllao,
                                              tokenized, valid_lft,
                                              prefered_lft,
                                              dev_addr_generated);
        }
        inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
put:
        in6_dev_put(in6_dev);
}

/* Create a tunnel through @dev's ndo_tunnel_ctl(SIOCADDTUNNEL) towards the
 * IPv4 address embedded in the last 32 bits of ireq->ifr6_addr, then open
 * the device named by the returned tunnel parameters.
 *
 * Returns -EADDRNOTAVAIL when the address is not IPv4-compatible,
 * -EOPNOTSUPP when the device has no ndo_tunnel_ctl, the error from
 * ndo_tunnel_ctl, -ENOBUFS when the resulting device cannot be found by
 * name, or the result of dev_open().
 */
static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev,
                struct in6_ifreq *ireq)
{
        struct ip_tunnel_parm_kern p = { };
        struct net_device *tunnel;
        int err;

        if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4))
                return -EADDRNOTAVAIL;

        p.iph.version = 4;
        p.iph.ihl = 5;
        p.iph.ttl = 64;
        p.iph.protocol = IPPROTO_IPV6;
        p.iph.daddr = ireq->ifr6_addr.s6_addr32[3];

        if (!dev->netdev_ops->ndo_tunnel_ctl)
                return -EOPNOTSUPP;

        err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL);
        if (err)
                return err;

        tunnel = __dev_get_by_name(net, p.name);
        if (!tunnel)
                return -ENOBUFS;

        return dev_open(tunnel, NULL);
}

/*
 *        Set destination address.
 *        Special case for SIT interfaces where we create a new "virtual"
 *        device.
 *
 *        @arg is a user pointer to a struct in6_ifreq. Returns -ENODEV when
 *        SIT support is not built in or the interface index does not name a
 *        SIT device, -EFAULT when the request cannot be copied in, otherwise
 *        the result of addrconf_set_sit_dstaddr().
 */
int addrconf_set_dstaddr(struct net *net, void __user *arg)
{
        struct in6_ifreq ireq;
        struct net_device *dev;
        int err;

        if (!IS_ENABLED(CONFIG_IPV6_SIT))
                return -ENODEV;
        if (copy_from_user(&ireq, arg, sizeof(ireq)))
                return -EFAULT;

        rtnl_net_lock(net);
        dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
        if (!dev || dev->type != ARPHRD_SIT)
                err = -ENODEV;
        else
                err = addrconf_set_sit_dstaddr(net, dev, &ireq);
        rtnl_net_unlock(net);

        return err;
}

/* Join (@join true) or drop (@join false) multicast group @addr on interface
 * @ifindex through socket @sk. The socket is locked around the operation and
 * RTNL must be held (asserted). Returns the result of the join/drop call.
 */
static int ipv6_mc_config(struct sock *sk, bool join,
                          const struct in6_addr *addr, int ifindex)
{
        int err;

        ASSERT_RTNL();

        lock_sock(sk);
        err = join ? ipv6_sock_mc_join(sk, ifindex, addr)
                   : ipv6_sock_mc_drop(sk, ifindex, addr);
        release_sock(sk);

        return err;
}

/*
 *        Manual configuration of address on an interface
 *
 *        Validates @cfg, optionally joins the multicast group for
 *        IFA_F_MCAUTOJOIN, adds the address, installs its prefix route
 *        (unless IFA_F_NOPREFIXROUTE), starts DAD and, for
 *        IFA_F_MANAGETEMPADDR, sets up temporary addresses.
 *
 *        Returns 0 on success or a negative errno; on failure to add the
 *        address an earlier multicast auto-join is undone.
 */
static int inet6_addr_add(struct net *net, struct net_device *dev,
                          struct ifa6_config *cfg, clock_t expires, u32 flags,
                          struct netlink_ext_ack *extack)
{
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifp;
        int err;

        ASSERT_RTNL_NET(net);

        if (cfg->plen > 128) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
                return -EINVAL;
        }

        if ((cfg->ifa_flags & IFA_F_MANAGETEMPADDR) && cfg->plen != 64) {
                NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64");
                return -EINVAL;
        }

        idev = addrconf_add_dev(dev);
        if (IS_ERR(idev)) {
                NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
                return PTR_ERR(idev);
        }

        if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
                err = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
                                     true, cfg->pfx, dev->ifindex);
                if (err < 0) {
                        NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed");
                        return err;
                }
        }

        cfg->scope = ipv6_addr_scope(cfg->pfx);

        ifp = ipv6_add_addr(idev, cfg, true, extack);
        if (IS_ERR(ifp)) {
                /* Undo the auto-join performed above. */
                if (cfg->ifa_flags & IFA_F_MCAUTOJOIN)
                        ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
                                       cfg->pfx, dev->ifindex);
                return PTR_ERR(ifp);
        }

        if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE))
                addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
                                      ifp->rt_priority, dev, expires,
                                      flags, GFP_KERNEL);

        /* Send a netlink notification if DAD is enabled and
         * optimistic flag is not set
         */
        if (!(ifp->flags & (IFA_F_OPTIMISTIC | IFA_F_NODAD)))
                ipv6_ifa_notify(0, ifp);

        /*
         * Note that section 3.1 of RFC 4429 indicates
         * that the Optimistic flag should not be set for
         * manually configured addresses
         */
        addrconf_dad_start(ifp);

        if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR)
                manage_tempaddrs(idev, ifp, cfg->valid_lft,
                                 cfg->preferred_lft, true, jiffies);

        in6_ifa_put(ifp);
        addrconf_verify_rtnl(net);

        return 0;
}

/*
 * Delete the address @pfx/@plen from the interface with index @ifindex.
 *
 * Returns 0 once a matching address has been removed, -EINVAL when @plen
 * exceeds 128, -ENODEV when no such interface exists in @net, -ENXIO when
 * the device carries no inet6_dev, and -EADDRNOTAVAIL when no address on
 * the device matches. An explanatory message is stored in @extack on error.
 * @ifa_flags is part of the signature but is not consulted in this body.
 */
static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
                          const struct in6_addr *pfx, unsigned int plen,
                          struct netlink_ext_ack *extack)
{
        struct net_device *dev;
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifp;

        if (plen > 128) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
                return -EINVAL;
        }

        dev = __dev_get_by_index(net, ifindex);
        if (!dev) {
                NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
                return -ENODEV;
        }

        idev = __in6_dev_get_rtnl_net(dev);
        if (!idev) {
                NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
                return -ENXIO;
        }

        read_lock_bh(&idev->lock);
        list_for_each_entry(ifp, &idev->addr_list, if_list) {
                /* Skip everything that is not an exact prefix-length/address match. */
                if (ifp->prefix_len != plen ||
                    !ipv6_addr_equal(pfx, &ifp->addr))
                        continue;

                /* Take a reference before giving up the list lock. */
                in6_ifa_hold(ifp);
                read_unlock_bh(&idev->lock);

                ipv6_del_addr(ifp);

                /* A non-temporary address that manages temporaries takes them along. */
                if (!(ifp->flags & IFA_F_TEMPORARY) &&
                    (ifp->flags & IFA_F_MANAGETEMPADDR))
                        delete_tempaddrs(idev, ifp);

                addrconf_verify_rtnl(net);

                if (ipv6_addr_is_multicast(pfx))
                        ipv6_mc_config(net->ipv6.mc_autojoin_sk,
                                       false, pfx, dev->ifindex);

                return 0;
        }
        read_unlock_bh(&idev->lock);

        NL_SET_ERR_MSG_MOD(extack, "address not found");
        return -EADDRNOTAVAIL;
}


/*
 * Add a permanent address with infinite lifetimes, as described by the
 * struct in6_ifreq that user space passed in @arg.
 *
 * Returns -EPERM without CAP_NET_ADMIN in the namespace's user namespace,
 * -EFAULT when the request cannot be copied in, -ENODEV when the requested
 * ifindex does not exist, otherwise the result of inet6_addr_add().
 */
int addrconf_add_ifaddr(struct net *net, void __user *arg)
{
        struct in6_ifreq ireq;
        struct ifa6_config cfg = {
                .ifa_flags = IFA_F_PERMANENT,
                .valid_lft = INFINITY_LIFE_TIME,
                .preferred_lft = INFINITY_LIFE_TIME,
        };
        struct net_device *dev;
        int err = -ENODEV;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (copy_from_user(&ireq, arg, sizeof(ireq)))
                return -EFAULT;

        cfg.plen = ireq.ifr6_prefixlen;
        cfg.pfx = &ireq.ifr6_addr;

        rtnl_net_lock(net);

        dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
        if (dev) {
                /* The add runs with the device's ops lock held as well. */
                netdev_lock_ops(dev);
                err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL);
                netdev_unlock_ops(dev);
        }

        rtnl_net_unlock(net);

        return err;
}

/*
 * Delete the address described by the struct in6_ifreq that user space
 * passed in @arg.
 *
 * Returns -EPERM without CAP_NET_ADMIN in the namespace's user namespace,
 * -EFAULT when the request cannot be copied in, otherwise the result of
 * inet6_addr_del(), which is called under the per-netns RTNL lock.
 */
int addrconf_del_ifaddr(struct net *net, void __user *arg)
{
        struct in6_ifreq req;
        int ret;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                return -EPERM;

        if (copy_from_user(&req, arg, sizeof(req)))
                return -EFAULT;

        rtnl_net_lock(net);
        ret = inet6_addr_del(net, req.ifr6_ifindex, 0, &req.ifr6_addr,
                             req.ifr6_prefixlen, NULL);
        rtnl_net_unlock(net);

        return ret;
}

/*
 * Install @addr/@plen on @idev as a permanent address with infinite
 * lifetimes, the given @scope and protocol tag @proto. On success the
 * tentative flag is cleared right away and RTM_NEWADDR is announced.
 * A failure of ipv6_add_addr() is silently ignored.
 */
static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
                     int plen, int scope, u8 proto)
{
        struct ifa6_config cfg = {
                .pfx = addr,
                .plen = plen,
                .ifa_flags = IFA_F_PERMANENT,
                .valid_lft = INFINITY_LIFE_TIME,
                .preferred_lft = INFINITY_LIFE_TIME,
                .scope = scope,
                .ifa_proto = proto
        };
        struct inet6_ifaddr *ifa;

        ifa = ipv6_add_addr(idev, &cfg, true, NULL);
        if (IS_ERR(ifa))
                return;

        spin_lock_bh(&ifa->lock);
        ifa->flags &= ~IFA_F_TENTATIVE;
        spin_unlock_bh(&ifa->lock);

        rt_genid_bump_ipv6(dev_net(idev->dev));
        ipv6_ifa_notify(RTM_NEWADDR, ifa);

        /* Release the reference obtained from ipv6_add_addr(). */
        in6_ifa_put(ifa);
}

#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
static void add_v4_addrs(struct inet6_dev *idev)
{
        struct in6_addr addr;
        struct net_device *dev;
        struct net *net = dev_net(idev->dev);
        int scope, plen, offset = 0;
        u32 pflags = 0;

        ASSERT_RTNL();

        memset(&addr, 0, sizeof(struct in6_addr));
        /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */
        if (idev->dev->addr_len == sizeof(struct in6_addr))
                offset = sizeof(struct in6_addr) - 4;
        memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4);

        if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) {
                scope = IPV6_ADDR_COMPATv4;
                plen = 96;
                pflags |= RTF_NONEXTHOP;
        } else {
                if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE)
                        return;

                addr.s6_addr32[0] = htonl(0xfe800000);
                scope = IFA_LINK;
                plen = 64;
        }

        if (addr.s6_addr32[3]) {
                add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC);
                addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
                                      GFP_KERNEL);
                return;
        }

        for_each_netdev(net, dev) {
                struct in_device *in_dev = __in_dev_get_rtnl(dev);
                if (in_dev && (dev->flags & IFF_UP)) {
                        struct in_ifaddr *ifa;
                        int flag = scope;

                        in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                                addr.s6_addr32[3] = ifa->ifa_local;

                                if (ifa->ifa_scope == RT_SCOPE_LINK)
                                        continue;
                                if (ifa->ifa_scope >= RT_SCOPE_HOST) {
                                        if (idev->dev->flags&IFF_POINTOPOINT)
                                                continue;
                                        flag |= IFA_HOST;
                                }

                                add_addr(idev, &addr, plen, flag,
                                         IFAPROT_UNSPEC);
                                addrconf_prefix_route(&addr, plen, 0, idev->dev,
                                                      0, pflags, GFP_KERNEL);
                        }
                }
        }
}
#endif

/*
 * Configure the loopback device: look up (or create) its inet6_dev and
 * install ::1/128 as a host-scope address. Must run under RTNL.
 */
static void init_loopback(struct net_device *dev)
{
        struct inet6_dev *lo_idev;

        ASSERT_RTNL();

        lo_idev = ipv6_find_idev(dev);
        if (!IS_ERR(lo_idev)) {
                /* ::1 */
                add_addr(lo_idev, &in6addr_loopback, 128, IFA_HOST,
                         IFAPROT_KERNEL_LO);
                return;
        }

        pr_debug("%s: add_dev failed\n", __func__);
}

/*
 * Add @addr as a permanent /64 link-scope address on @idev, install its
 * prefix route and start duplicate address detection on it.
 *
 * @flags is OR-ed into the address flags. With CONFIG_IPV6_OPTIMISTIC_DAD,
 * IFA_F_OPTIMISTIC is added when optimistic DAD is enabled (globally or on
 * the device) and global forwarding is off. A failure of ipv6_add_addr()
 * is silently ignored.
 */
void addrconf_add_linklocal(struct inet6_dev *idev,
                            const struct in6_addr *addr, u32 flags)
{
        struct inet6_ifaddr *ifp;
        struct ifa6_config cfg = {
                .pfx = addr,
                .plen = 64,
                .ifa_flags = flags | IFA_F_PERMANENT,
                .valid_lft = INFINITY_LIFE_TIME,
                .preferred_lft = INFINITY_LIFE_TIME,
                .scope = IFA_LINK,
                .ifa_proto = IFAPROT_KERNEL_LL
        };

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        if (READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad) ||
            READ_ONCE(idev->cnf.optimistic_dad)) {
                if (!dev_net(idev->dev)->ipv6.devconf_all->forwarding)
                        cfg.ifa_flags |= IFA_F_OPTIMISTIC;
        }
#endif

        ifp = ipv6_add_addr(idev, &cfg, true, NULL);
        if (IS_ERR(ifp))
                return;

        addrconf_prefix_route(&ifp->addr, ifp->prefix_len, 0, idev->dev,
                              0, 0, GFP_ATOMIC);
        addrconf_dad_start(ifp);

        /* Release the reference obtained from ipv6_add_addr(). */
        in6_ifa_put(ifp);
}
EXPORT_SYMBOL_GPL(addrconf_add_linklocal);

static bool ipv6_reserved_interfaceid(struct in6_addr address)
{
        if ((address.s6_addr32[2] | address.s6_addr32[3]) == 0)
                return true;

        if (address.s6_addr32[2] == htonl(0x02005eff) &&
            ((address.s6_addr32[3] & htonl(0xfe000000)) == htonl(0xfe000000)))
                return true;

        if (address.s6_addr32[2] == htonl(0xfdffffff) &&
            ((address.s6_addr32[3] & htonl(0xffffff80)) == htonl(0xffffff80)))
                return true;

        return false;
}

/*
 * Fill the last two 32-bit words of @address with an interface identifier
 * derived from a secret, the first two words of @address (the prefix), the
 * device's permanent hardware address and @dad_count.
 *
 * The secret is taken from the device's configuration when it has been
 * initialized there, otherwise from the namespace's default configuration.
 *
 * Returns 0 on success with *@address updated. Returns -1, leaving
 * *@address untouched, when no secret is initialized or when every attempt
 * allowed by the idgen_retries sysctl produced a reserved identifier.
 */
static int ipv6_generate_stable_address(struct in6_addr *address,
                                        u8 dad_count,
                                        const struct inet6_dev *idev)
{
        /* The static scratch buffers below are shared by all callers and
         * are only touched while this lock is held.
         */
        static DEFINE_SPINLOCK(lock);
        static __u32 digest[SHA1_DIGEST_WORDS];
        static __u32 workspace[SHA1_WORKSPACE_WORDS];

        /* Hash input, laid out to occupy exactly one SHA-1 block. */
        static union {
                char __data[SHA1_BLOCK_SIZE];
                struct {
                        struct in6_addr secret;
                        __be32 prefix[2];
                        unsigned char hwaddr[MAX_ADDR_LEN];
                        u8 dad_count;
                } __packed;
        } data;

        struct in6_addr secret;
        struct in6_addr temp;
        struct net *net = dev_net(idev->dev);

        /* The packed struct must not be larger than the single block hashed. */
        BUILD_BUG_ON(sizeof(data.__data) != sizeof(data));

        if (idev->cnf.stable_secret.initialized)
                secret = idev->cnf.stable_secret.secret;
        else if (net->ipv6.devconf_dflt->stable_secret.initialized)
                secret = net->ipv6.devconf_dflt->stable_secret.secret;
        else
                return -1;

retry:
        spin_lock_bh(&lock);

        sha1_init(digest);
        memset(&data, 0, sizeof(data));
        memset(workspace, 0, sizeof(workspace));
        memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
        data.prefix[0] = address->s6_addr32[0];
        data.prefix[1] = address->s6_addr32[1];
        data.secret = secret;
        data.dad_count = dad_count;

        sha1_transform(digest, data.__data, workspace);

        /* Keep the caller's prefix; the first two digest words become the
         * interface identifier.
         */
        temp = *address;
        temp.s6_addr32[2] = (__force __be32)digest[0];
        temp.s6_addr32[3] = (__force __be32)digest[1];

        spin_unlock_bh(&lock);

        /* A reserved identifier is rejected; bumping dad_count changes the
         * hash input so the next attempt yields a different value.
         */
        if (ipv6_reserved_interfaceid(temp)) {
                dad_count++;
                if (dad_count > dev_net(idev->dev)->ipv6.sysctl.idgen_retries)
                        return -1;
                goto retry;
        }

        *address = temp;
        return 0;
}

/*
 * Make sure @idev has a stable secret: if none has been initialized yet,
 * fill it with random bytes and mark it initialized. An existing secret is
 * left untouched.
 */
static void ipv6_gen_mode_random_init(struct inet6_dev *idev)
{
        struct ipv6_stable_secret *secret = &idev->cnf.stable_secret;

        if (!secret->initialized) {
                get_random_bytes(&secret->secret, sizeof(secret->secret));
                secret->initialized = true;
        }
}

/*
 * Generate and install a link-local address on @idev according to the
 * device's addr_gen_mode.
 *
 * Nothing is done for L3 master devices, for devices flagged
 * IFF_NO_ADDRCONF, or when the mode is IN6_ADDR_GEN_MODE_NONE or unknown.
 * When an interface identifier cannot be generated and @prefix_route is
 * true, only the fe80::/64 prefix route is installed.
 */
static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
{
        struct in6_addr addr;
        u32 ll_flags;
        int err;

        /* no link local addresses on L3 master devices */
        if (netif_is_l3_master(idev->dev))
                return;

        /* no link local addresses on devices flagged as slaves */
        if (idev->dev->priv_flags & IFF_NO_ADDRCONF)
                return;

        ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);

        /* Pick the identifier generator and the flags for the new address. */
        switch (idev->cnf.addr_gen_mode) {
        case IN6_ADDR_GEN_MODE_RANDOM:
                ipv6_gen_mode_random_init(idev);
                fallthrough;
        case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
                err = ipv6_generate_stable_address(&addr, 0, idev);
                ll_flags = IFA_F_STABLE_PRIVACY;
                break;
        case IN6_ADDR_GEN_MODE_EUI64:
                err = ipv6_generate_eui64(addr.s6_addr + 8, idev->dev);
                ll_flags = 0;
                break;
        case IN6_ADDR_GEN_MODE_NONE:
        default:
                /* will not add any link local address */
                return;
        }

        /* addrconf_add_linklocal() also adds a prefix route, so the route
         * only needs separate handling when no identifier was generated.
         */
        if (!err)
                addrconf_add_linklocal(idev, &addr, ll_flags);
        else if (prefix_route)
                addrconf_prefix_route(&addr, 64, 0, idev->dev,
                                      0, 0, GFP_KERNEL);
}

/*
 * Default auto-configuration for a device. Must run under RTNL.
 *
 * For the link types listed in the switch below a link-local address is
 * generated. For any other type only multicast is brought up, and only if
 * the device already has an inet6_dev and is both up and
 * multicast-capable.
 */
static void addrconf_dev_config(struct net_device *dev)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        switch (dev->type) {
        case ARPHRD_ETHER:
        case ARPHRD_FDDI:
        case ARPHRD_ARCNET:
        case ARPHRD_INFINIBAND:
        case ARPHRD_IEEE1394:
        case ARPHRD_TUNNEL6:
        case ARPHRD_6LOWPAN:
        case ARPHRD_TUNNEL:
        case ARPHRD_NONE:
        case ARPHRD_RAWIP:
                break;
        default:
                /* Alas, we support only Ethernet autoconfiguration. */
                idev = __in6_dev_get(dev);
                if (!IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP &&
                    dev->flags & IFF_MULTICAST)
                        ipv6_mc_up(idev);
                return;
        }

        idev = addrconf_add_dev(dev);
        if (IS_ERR(idev))
                return;

        /* this device type has no EUI support */
        if (dev->type == ARPHRD_NONE &&
            idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
                WRITE_ONCE(idev->cnf.addr_gen_mode,
                           IN6_ADDR_GEN_MODE_RANDOM);

        addrconf_addr_gen(idev, false);
}

#if IS_ENABLED(CONFIG_IPV6_SIT)
/*
 * Auto-configure a SIT tunnel device. Must run under RTNL.
 *
 * ISATAP devices get a generated link-local address. All other SIT devices
 * get addresses derived from IPv4 addresses, and point-to-point tunnels
 * additionally get the multicast route.
 */
static void addrconf_sit_config(struct net_device *dev)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        idev = ipv6_find_idev(dev);
        if (IS_ERR(idev)) {
                pr_debug("%s: add_dev failed\n", __func__);
                return;
        }

        if (!(dev->priv_flags & IFF_ISATAP)) {
                /*
                 * Configure the tunnel with one of our IPv4
                 * addresses... we should configure all of
                 * our v4 addrs in the tunnel
                 */
                add_v4_addrs(idev);

                if (dev->flags & IFF_POINTOPOINT)
                        addrconf_add_mroute(dev);
                return;
        }

        addrconf_addr_gen(idev, false);
}
#endif

#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
/*
 * Auto-configure a GRE device. Must run under RTNL.
 *
 * A device of type ARPHRD_ETHER gets a generated link-local address, with
 * a prefix-route fallback. Any other device gets addresses derived from
 * IPv4 addresses, and a point-to-point tunnel additionally gets the
 * multicast route.
 */
static void addrconf_gre_config(struct net_device *dev)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        idev = ipv6_find_idev(dev);
        if (IS_ERR(idev)) {
                pr_debug("%s: add_dev failed\n", __func__);
                return;
        }

        if (dev->type != ARPHRD_ETHER) {
                add_v4_addrs(idev);

                if (dev->flags & IFF_POINTOPOINT)
                        addrconf_add_mroute(dev);
                return;
        }

        addrconf_addr_gen(idev, true);
}
#endif

/*
 * Dispatch automatic address configuration by link type. SIT, GRE and
 * loopback devices have dedicated handlers (the first two only when the
 * matching config options are enabled); everything else goes through
 * addrconf_dev_config().
 */
static void addrconf_init_auto_addrs(struct net_device *dev)
{
        switch (dev->type) {
#if IS_ENABLED(CONFIG_IPV6_SIT)
        case ARPHRD_SIT:
                addrconf_sit_config(dev);
                return;
#endif
#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE)
        case ARPHRD_IP6GRE:
        case ARPHRD_IPGRE:
                addrconf_gre_config(dev);
                return;
#endif
        case ARPHRD_LOOPBACK:
                init_loopback(dev);
                return;
        }

        /* No dedicated handler for this link type. */
        addrconf_dev_config(dev);
}

/*
 * Restore the routing state of the permanent address @ifp on @idev:
 * re-create its host route if it is missing, re-add the prefix route
 * unless IFA_F_NOPREFIXROUTE is set, and restart DAD when the address is
 * in the PREDAD state.
 *
 * Returns 0 on success or the error from addrconf_f6i_alloc().
 */
static int fixup_permanent_addr(struct net *net,
                                struct inet6_dev *idev,
                                struct inet6_ifaddr *ifp)
{
        struct fib6_info *fresh, *stale;

        /* A missing fib6_node means the host route was removed from the
         * FIB, for example when the 'lo' device is taken down. Build a new
         * host route in that case.
         */
        if (!ifp->rt || !ifp->rt->fib6_node) {
                fresh = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
                                           GFP_ATOMIC, NULL);
                if (IS_ERR(fresh))
                        return PTR_ERR(fresh);

                /* ifp->rt can be accessed outside of rtnl */
                spin_lock(&ifp->lock);
                stale = ifp->rt;
                ifp->rt = fresh;
                spin_unlock(&ifp->lock);

                fib6_info_release(stale);
        }

        if (!(ifp->flags & IFA_F_NOPREFIXROUTE))
                addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
                                      ifp->rt_priority, idev->dev, 0, 0,
                                      GFP_ATOMIC);

        if (ifp->state == INET6_IFADDR_STATE_PREDAD)
                addrconf_dad_start(ifp);

        return 0;
}

/*
 * Walk the addresses of @dev and restore the routes of every permanent
 * one. An address whose routes cannot be restored is deleted and a
 * rate-limited message is logged. Does nothing if the device has no
 * inet6_dev.
 */
static void addrconf_permanent_addr(struct net *net, struct net_device *dev)
{
        struct inet6_dev *idev = __in6_dev_get(dev);
        struct inet6_ifaddr *cur, *next;

        if (!idev)
                return;

        write_lock_bh(&idev->lock);

        list_for_each_entry_safe(cur, next, &idev->addr_list, if_list) {
                if (!(cur->flags & IFA_F_PERMANENT))
                        continue;

                if (fixup_permanent_addr(net, idev, cur) >= 0)
                        continue;

                /* The deletion runs with the device lock released. */
                write_unlock_bh(&idev->lock);
                in6_ifa_hold(cur);
                ipv6_del_addr(cur);
                write_lock_bh(&idev->lock);

                net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
                                     idev->dev->name, &cur->addr);
        }

        write_unlock_bh(&idev->lock);
}

/*
 * Netdevice notifier callback: creates, configures or tears down the
 * per-device IPv6 state in response to device events.
 *
 * @this:  the notifier block (unused here)
 * @event: NETDEV_* event code
 * @ptr:   notifier info; the affected device is extracted from it, and for
 *         NETDEV_CHANGE / NETDEV_CHANGEUPPER it is also read as the
 *         event-specific info structure
 *
 * Returns NOTIFY_OK, or an errno-derived notifier value when
 * ipv6_add_dev() fails on NETDEV_REGISTER or when re-registration fails on
 * NETDEV_CHANGENAME.
 */
static int addrconf_notify(struct notifier_block *this, unsigned long event,
                           void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);
        struct netdev_notifier_change_info *change_info;
        struct netdev_notifier_changeupper_info *info;
        struct inet6_dev *idev = __in6_dev_get(dev);
        struct net *net = dev_net(dev);
        int run_pending = 0;
        int err;

        switch (event) {
        case NETDEV_REGISTER:
                /* Only devices whose MTU can carry IPv6 get an inet6_dev. */
                if (!idev && dev->mtu >= IPV6_MIN_MTU) {
                        idev = ipv6_add_dev(dev);
                        if (IS_ERR(idev))
                                return notifier_from_errno(PTR_ERR(idev));
                }
                break;

        case NETDEV_CHANGEMTU:
                /* if MTU under IPV6_MIN_MTU stop IPv6 on this interface. */
                if (dev->mtu < IPV6_MIN_MTU) {
                        addrconf_ifdown(dev, dev != net->loopback_dev);
                        break;
                }

                if (idev) {
                        rt6_mtu_change(dev, dev->mtu);
                        WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
                        break;
                }

                /* allocate new idev */
                idev = ipv6_add_dev(dev);
                if (IS_ERR(idev))
                        break;

                /* device is still not ready */
                if (!(idev->if_flags & IF_READY))
                        break;

                /* A freshly created idev that is already IF_READY continues
                 * with the common bring-up path below; the event value is
                 * still NETDEV_CHANGEMTU there.
                 */
                run_pending = 1;
                fallthrough;
        case NETDEV_UP:
        case NETDEV_CHANGE:
                if (idev && idev->cnf.disable_ipv6)
                        break;

                /* No address configuration on such devices; only multicast
                 * is brought up, and only on NETDEV_UP.
                 */
                if (dev->priv_flags & IFF_NO_ADDRCONF) {
                        if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) &&
                            dev->flags & IFF_UP && dev->flags & IFF_MULTICAST)
                                ipv6_mc_up(idev);
                        break;
                }

                if (event == NETDEV_UP) {
                        /* restore routes for permanent addresses */
                        addrconf_permanent_addr(net, dev);

                        if (!addrconf_link_ready(dev)) {
                                /* device is not ready yet. */
                                pr_debug("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
                                         dev->name);
                                break;
                        }

                        if (!idev && dev->mtu >= IPV6_MIN_MTU)
                                idev = ipv6_add_dev(dev);

                        if (!IS_ERR_OR_NULL(idev)) {
                                idev->if_flags |= IF_READY;
                                run_pending = 1;
                        }
                } else if (event == NETDEV_CHANGE) {
                        if (!addrconf_link_ready(dev)) {
                                /* device is still not ready. */
                                rt6_sync_down_dev(dev, event);
                                break;
                        }

                        if (!IS_ERR_OR_NULL(idev)) {
                                if (idev->if_flags & IF_READY) {
                                        /* device is already configured -
                                         * but resend MLD reports, we might
                                         * have roamed and need to update
                                         * multicast snooping switches
                                         */
                                        ipv6_mc_up(idev);
                                        change_info = ptr;
                                        if (change_info->flags_changed & IFF_NOARP)
                                                addrconf_dad_run(idev, true);
                                        rt6_sync_up(dev, RTNH_F_LINKDOWN);
                                        break;
                                }
                                idev->if_flags |= IF_READY;
                        }

                        pr_debug("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n",
                                 dev->name);

                        run_pending = 1;
                }

                addrconf_init_auto_addrs(dev);

                if (!IS_ERR_OR_NULL(idev)) {
                        if (run_pending)
                                addrconf_dad_run(idev, false);

                        /* Device has an address by now */
                        rt6_sync_up(dev, RTNH_F_DEAD);

                        /*
                         * If the MTU changed while the interface was down,
                         * the new MTU must be reflected in the idev as well
                         * as in the routes once the interface comes up.
                         */
                        if (idev->cnf.mtu6 != dev->mtu &&
                            dev->mtu >= IPV6_MIN_MTU) {
                                rt6_mtu_change(dev, dev->mtu);
                                WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
                        }
                        WRITE_ONCE(idev->tstamp, jiffies);
                        inet6_ifinfo_notify(RTM_NEWLINK, idev);

                        /*
                         * If the MTU set while the interface was down is
                         * lower than IPV6_MIN_MTU, stop IPv6 on this
                         * interface.
                         */
                        if (dev->mtu < IPV6_MIN_MTU)
                                addrconf_ifdown(dev, dev != net->loopback_dev);
                }
                break;

        case NETDEV_DOWN:
        case NETDEV_UNREGISTER:
                /*
                 * Remove all addresses from this interface. Every event
                 * other than NETDEV_DOWN is treated as an unregistration.
                 */
                addrconf_ifdown(dev, event != NETDEV_DOWN);
                break;

        case NETDEV_CHANGENAME:
                /* Drop the snmp6 and sysctl entries and register them again
                 * (they are registered per idev; presumably keyed by the
                 * device name - confirm against the register helpers).
                 */
                if (idev) {
                        snmp6_unregister_dev(idev);
                        addrconf_sysctl_unregister(idev);
                        err = addrconf_sysctl_register(idev);
                        if (err)
                                return notifier_from_errno(err);
                        err = snmp6_register_dev(idev);
                        if (err) {
                                addrconf_sysctl_unregister(idev);
                                return notifier_from_errno(err);
                        }
                }
                break;

        case NETDEV_PRE_TYPE_CHANGE:
        case NETDEV_POST_TYPE_CHANGE:
                if (idev)
                        addrconf_type_change(dev, event);
                break;

        case NETDEV_CHANGEUPPER:
                info = ptr;

                /* flush all routes if dev is linked to or unlinked from
                 * an L3 master device (e.g., VRF)
                 */
                if (info->upper_dev && netif_is_l3_master(info->upper_dev))
                        addrconf_ifdown(dev, false);
        }

        return NOTIFY_OK;
}

/*
 *        Notifier block through which addrconf receives netdevice events
 *        (device going up, down, being renamed, ...). Events are handled
 *        by addrconf_notify() at priority ADDRCONF_NOTIFY_PRIORITY.
 */
static struct notifier_block ipv6_dev_notf = {
        .notifier_call = addrconf_notify,
        .priority = ADDRCONF_NOTIFY_PRIORITY,
};

/*
 * Handle a change of the device's link type. Must run under RTNL.
 *
 * Multicast state is unmapped on NETDEV_PRE_TYPE_CHANGE and remapped on
 * NETDEV_POST_TYPE_CHANGE. Any other event is ignored.
 */
static void addrconf_type_change(struct net_device *dev, unsigned long event)
{
        struct inet6_dev *idev;

        ASSERT_RTNL();

        idev = __in6_dev_get(dev);

        switch (event) {
        case NETDEV_POST_TYPE_CHANGE:
                ipv6_mc_remap(idev);
                break;
        case NETDEV_PRE_TYPE_CHANGE:
                ipv6_mc_unmap(idev);
                break;
        default:
                break;
        }
}

/* True when @addr is classified as a link-local or loopback address. */
static bool addr_is_local(const struct in6_addr *addr)
{
        const int local_types = IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK;

        return (ipv6_addr_type(addr) & local_types) != 0;
}

/*
 * Tear down the IPv6 addressing state of @dev. Must run under RTNL.
 *
 * @dev:        device being brought down or unregistered
 * @unregister: true when the device is being unregistered; the inet6_dev
 *              is then detached from the device, marked dead and released
 *              at the end. false for a plain "down", in which case
 *              permanent, non-local addresses may be kept, depending on
 *              the keep_addr_on_down settings.
 *
 * Returns 0, or -ENODEV when the device has no inet6_dev.
 */
static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
        unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
        struct net *net = dev_net(dev);
        struct inet6_dev *idev;
        struct inet6_ifaddr *ifa;
        LIST_HEAD(tmp_addr_list);
        bool keep_addr = false;
        bool was_ready;
        int state, i;

        ASSERT_RTNL();

        /* Route handling happens first, even for a device without idev. */
        rt6_disable_ip(dev, event);

        idev = __in6_dev_get(dev);
        if (!idev)
                return -ENODEV;

        /*
         * Step 1: remove reference to ipv6 device from parent device.
         *           Do not dev_put!
         */
        if (unregister) {
                idev->dead = 1;

                /* protected by rtnl_lock */
                RCU_INIT_POINTER(dev->ip6_ptr, NULL);

                /* Step 1.5: remove snmp6 entry */
                snmp6_unregister_dev(idev);

        }

        /* combine the user config with event to determine if permanent
         * addresses are to be removed from address hash table
         */
        if (!unregister && !idev->cnf.disable_ipv6) {
                /* aggregate the system setting and interface setting:
                 * a non-zero "all" value takes precedence over the
                 * per-interface value
                 */
                int _keep_addr = READ_ONCE(net->ipv6.devconf_all->keep_addr_on_down);

                if (!_keep_addr)
                        _keep_addr = READ_ONCE(idev->cnf.keep_addr_on_down);

                keep_addr = (_keep_addr > 0);
        }

        /* Step 2: clear hash table */
        for (i = 0; i < IN6_ADDR_HSIZE; i++) {
                struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];

                spin_lock_bh(&net->ipv6.addrconf_hash_lock);
restart:
                /* After every removal the walk of this bucket starts over. */
                hlist_for_each_entry_rcu(ifa, h, addr_lst) {
                        if (ifa->idev == idev) {
                                addrconf_del_dad_work(ifa);
                                /* combined flag + permanent flag decide if
                                 * address is retained on a down event
                                 */
                                if (!keep_addr ||
                                    !(ifa->flags & IFA_F_PERMANENT) ||
                                    addr_is_local(&ifa->addr)) {
                                        hlist_del_init_rcu(&ifa->addr_lst);
                                        goto restart;
                                }
                        }
                }
                spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
        }

        write_lock_bh(&idev->lock);

        addrconf_del_rs_timer(idev);

        /* Step 2b: clear flags for stateless addrconf, repeated down
         *          detection. The previous IF_READY state is remembered
         *          for the multicast handling below.
         */
        was_ready = idev->if_flags & IF_READY;
        if (!unregister)
                idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);

        /* Step 3: clear tempaddr list. The device lock is dropped around
         *         the per-address work and re-taken for the next entry.
         */
        while (!list_empty(&idev->tempaddr_list)) {
                ifa = list_first_entry(&idev->tempaddr_list,
                                       struct inet6_ifaddr, tmp_list);
                list_del(&ifa->tmp_list);
                write_unlock_bh(&idev->lock);
                spin_lock_bh(&ifa->lock);

                if (ifa->ifpub) {
                        in6_ifa_put(ifa->ifpub);
                        ifa->ifpub = NULL;
                }
                spin_unlock_bh(&ifa->lock);
                in6_ifa_put(ifa);
                write_lock_bh(&idev->lock);
        }

        /* Step 4: collect every address on a private list so that the
         *         entries can be processed without holding idev->lock.
         */
        list_for_each_entry(ifa, &idev->addr_list, if_list)
                list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
        write_unlock_bh(&idev->lock);

        while (!list_empty(&tmp_addr_list)) {
                struct fib6_info *rt = NULL;
                bool keep;

                ifa = list_first_entry(&tmp_addr_list,
                                       struct inet6_ifaddr, if_list_aux);
                list_del(&ifa->if_list_aux);

                addrconf_del_dad_work(ifa);

                /* Same retention rule as used for the hash table above. */
                keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
                        !addr_is_local(&ifa->addr);

                spin_lock_bh(&ifa->lock);

                if (keep) {
                        /* set state to skip the notifier below */
                        state = INET6_IFADDR_STATE_DEAD;
                        ifa->state = INET6_IFADDR_STATE_PREDAD;
                        if (!(ifa->flags & IFA_F_NODAD))
                                ifa->flags |= IFA_F_TENTATIVE;

                        /* Detach the host route; it is deleted below,
                         * outside the address lock.
                         */
                        rt = ifa->rt;
                        ifa->rt = NULL;
                } else {
                        state = ifa->state;
                        ifa->state = INET6_IFADDR_STATE_DEAD;
                }

                spin_unlock_bh(&ifa->lock);

                if (rt)
                        ip6_del_rt(net, rt, false);

                /* Addresses that were not already dead get the full
                 * delete notification; kept or already-dead ones only
                 * leave their anycast / solicited-node groups.
                 */
                if (state != INET6_IFADDR_STATE_DEAD) {
                        __ipv6_ifa_notify(RTM_DELADDR, ifa);
                        inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
                } else {
                        if (idev->cnf.forwarding)
                                addrconf_leave_anycast(ifa);
                        addrconf_leave_solict(ifa->idev, &ifa->addr);
                }

                if (!keep) {
                        write_lock_bh(&idev->lock);
                        list_del_rcu(&ifa->if_list);
                        write_unlock_bh(&idev->lock);
                        in6_ifa_put(ifa);
                }
        }

        /* Step 5: Discard anycast and multicast list */
        if (unregister) {
                ipv6_ac_destroy_dev(idev);
                ipv6_mc_destroy_dev(idev);
        } else if (was_ready) {
                ipv6_mc_down(idev);
        }

        WRITE_ONCE(idev->tstamp, jiffies);
        idev->ra_mtu = 0;

        /* Last: release the device state (only when unregistering) */
        if (unregister) {
                addrconf_sysctl_unregister(idev);
                neigh_parms_release(&nd_tbl, idev->nd_parms);
                neigh_ifdown(&nd_tbl, dev);
                in6_dev_put(idev);
        }
        return 0;
}

/*
 * Router Solicitation retransmit timer callback.
 *
 * While the interface is alive, ready, accepts RAs and has not yet seen
 * one, send another RS and re-arm the timer with an updated back-off
 * interval. Sending stops once rs_probes reaches cnf.rtr_solicits; a
 * negative rtr_solicits never stops it.
 *
 * Every path ends with in6_dev_put(idev); presumably this balances a
 * reference taken when the timer was armed - confirm against
 * addrconf_mod_rs_timer().
 */
static void addrconf_rs_timer(struct timer_list *t)
{
        struct inet6_dev *idev = from_timer(idev, t, rs_timer);
        struct net_device *dev = idev->dev;
        struct in6_addr lladdr;
        int rtr_solicits;

        write_lock(&idev->lock);
        /* Nothing to do on a dead or not-ready interface. */
        if (idev->dead || !(idev->if_flags & IF_READY))
                goto out;

        if (!ipv6_accept_ra(idev))
                goto out;

        /* Announcement received after solicitation was sent */
        if (idev->if_flags & IF_RA_RCVD)
                goto out;

        rtr_solicits = READ_ONCE(idev->cnf.rtr_solicits);

        /* rs_probes is incremented on every expiry that gets this far. */
        if (idev->rs_probes++ < rtr_solicits || rtr_solicits < 0) {
                /*
                 * idev->lock is released around the address lookup and the
                 * transmit. If the lookup fails we leave through "put",
                 * which skips the unlock at "out" because the lock is no
                 * longer held.
                 */
                write_unlock(&idev->lock);
                if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
                        ndisc_send_rs(dev, &lladdr,
                                      &in6addr_linklocal_allrouters);
                else
                        goto put;

                /* Re-take the lock to update the back-off state and re-arm. */
                write_lock(&idev->lock);
                idev->rs_interval = rfc3315_s14_backoff_update(
                                idev->rs_interval,
                                READ_ONCE(idev->cnf.rtr_solicit_max_interval));
                /* The wait after the last probe can be shorter */
                addrconf_mod_rs_timer(idev, (idev->rs_probes ==
                                             READ_ONCE(idev->cnf.rtr_solicits)) ?
                                      READ_ONCE(idev->cnf.rtr_solicit_delay) :
                                      idev->rs_interval);
        } else {
                /*
                 * Note: we do not support deprecated "all on-link"
                 * assumption any longer.
                 */
                pr_debug("%s: no IPv6 routers present\n", idev->dev->name);
        }

out:
        write_unlock(&idev->lock);
put:
        in6_dev_put(idev);
}

/*
 *        Duplicate Address Detection
 */
static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        unsigned long rand_num;
        u64 nonce;

        if (ifp->flags & IFA_F_OPTIMISTIC)
                rand_num = 0;
        else
                rand_num = get_random_u32_below(
                                READ_ONCE(idev->cnf.rtr_solicit_delay) ? : 1);

        nonce = 0;
        if (READ_ONCE(idev->cnf.enhanced_dad) ||
            READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad)) {
                do
                        get_random_bytes(&nonce, 6);
                while (nonce == 0);
        }
        ifp->dad_nonce = nonce;
        ifp->dad_probes = READ_ONCE(idev->cnf.dad_transmits);
        addrconf_mod_dad_work(ifp, rand_num);
}

/*
 * Begin Duplicate Address Detection for @ifp.
 *
 * After calling addrconf_join_solict() for the address, one of three
 * things happens under idev->lock (read) and ifp->lock:
 *  - DAD does not apply: the tentative/optimistic/dadfailed flags are
 *    cleared and addrconf_dad_completed() is called right away;
 *  - the device is not IF_READY: addrconf_dad_stop() is called;
 *  - otherwise the DAD work is kicked via addrconf_dad_kick().
 * An address already in the DEAD state is left untouched.
 */
static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        struct net_device *dev = idev->dev;
        bool bump_id, notify = false;
        struct net *net;

        addrconf_join_solict(dev, &ifp->addr);

        read_lock_bh(&idev->lock);
        spin_lock(&ifp->lock);
        if (ifp->state == INET6_IFADDR_STATE_DEAD)
                goto out;

        net = dev_net(dev);
        /*
         * Skip DAD when the device is NOARP or loopback, when accept_dad is
         * below 1 both globally and on this device, when the address is not
         * tentative, or when IFA_F_NODAD is set.
         */
        if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
            (READ_ONCE(net->ipv6.devconf_all->accept_dad) < 1 &&
             READ_ONCE(idev->cnf.accept_dad) < 1) ||
            !(ifp->flags&IFA_F_TENTATIVE) ||
            ifp->flags & IFA_F_NODAD) {
                bool send_na = false;

                /* Sample the flags before they are cleared below. */
                if (ifp->flags & IFA_F_TENTATIVE &&
                    !(ifp->flags & IFA_F_OPTIMISTIC))
                        send_na = true;
                bump_id = ifp->flags & IFA_F_TENTATIVE;
                ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
                spin_unlock(&ifp->lock);
                read_unlock_bh(&idev->lock);

                addrconf_dad_completed(ifp, bump_id, send_na);
                return;
        }

        if (!(idev->if_flags & IF_READY)) {
                spin_unlock(&ifp->lock);
                read_unlock_bh(&idev->lock);
                /*
                 * If the device is not ready:
                 * - keep it tentative if it is a permanent address.
                 * - otherwise, kill it.
                 */
                /* NOTE(review): presumably addrconf_dad_stop() drops this hold - confirm. */
                in6_ifa_hold(ifp);
                addrconf_dad_stop(ifp, 0);
                return;
        }

        /*
         * Optimistic nodes can start receiving
         * Frames right away
         */
        if (ifp->flags & IFA_F_OPTIMISTIC) {
                ip6_ins_rt(net, ifp->rt);
                if (ipv6_use_optimistic_addr(net, idev)) {
                        /* Because optimistic nodes can use this address,
                         * notify listeners. If DAD fails, RTM_DELADDR is sent.
                         */
                        notify = true;
                }
        }

        addrconf_dad_kick(ifp);
out:
        spin_unlock(&ifp->lock);
        read_unlock_bh(&idev->lock);
        /* The notification is sent only after both locks are released. */
        if (notify)
                ipv6_ifa_notify(RTM_NEWADDR, ifp);
}

/*
 * Request DAD for @ifp: unless the address is already DEAD, move it to
 * the PREDAD state under ifp->lock and schedule the DAD work with no
 * delay.
 */
static void addrconf_dad_start(struct inet6_ifaddr *ifp)
{
        bool dead;

        spin_lock_bh(&ifp->lock);
        dead = (ifp->state == INET6_IFADDR_STATE_DEAD);
        if (!dead)
                ifp->state = INET6_IFADDR_STATE_PREDAD;
        spin_unlock_bh(&ifp->lock);

        if (dead)
                return;

        /* Scheduling happens outside the spinlock. */
        addrconf_mod_dad_work(ifp, 0);
}

/*
 * Delayed-work handler that drives Duplicate Address Detection for the
 * address embedding this work item.
 *
 * Runs with rtnl_net_lock() held for the device's netns. Depending on
 * ifp->state it starts DAD (PREDAD), aborts it after a failure (ERRDAD),
 * or processes one probe step: either declare success when no probes
 * remain, or send one more neighbour solicitation and re-arm itself.
 *
 * Every path ends with in6_ifa_put(ifp); presumably this balances a
 * reference taken when the work was queued - confirm against
 * addrconf_mod_dad_work().
 */
static void addrconf_dad_work(struct work_struct *w)
{
        struct inet6_ifaddr *ifp = container_of(to_delayed_work(w),
                                                struct inet6_ifaddr,
                                                dad_work);
        struct inet6_dev *idev = ifp->idev;
        bool bump_id, disable_ipv6 = false;
        struct in6_addr mcaddr;
        struct net *net;

        /* What to do once ifp->lock has been dropped. */
        enum {
                DAD_PROCESS,
                DAD_BEGIN,
                DAD_ABORT,
        } action = DAD_PROCESS;

        net = dev_net(idev->dev);

        rtnl_net_lock(net);

        spin_lock_bh(&ifp->lock);
        if (ifp->state == INET6_IFADDR_STATE_PREDAD) {
                action = DAD_BEGIN;
                ifp->state = INET6_IFADDR_STATE_DAD;
        } else if (ifp->state == INET6_IFADDR_STATE_ERRDAD) {
                action = DAD_ABORT;
                ifp->state = INET6_IFADDR_STATE_POSTDAD;

                /*
                 * With accept_dad > 1 (globally or on this device), IPv6 not
                 * already disabled and a non stable-privacy address, check
                 * whether the failed address is the EUI-64 based link-local
                 * one; if so IPv6 gets disabled on the interface.
                 */
                if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 ||
                     READ_ONCE(idev->cnf.accept_dad) > 1) &&
                    !idev->cnf.disable_ipv6 &&
                    !(ifp->flags & IFA_F_STABLE_PRIVACY)) {
                        struct in6_addr addr;

                        /* fe80::/64 prefix; the lower 64 bits are filled in below. */
                        addr.s6_addr32[0] = htonl(0xfe800000);
                        addr.s6_addr32[1] = 0;

                        if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
                            ipv6_addr_equal(&ifp->addr, &addr)) {
                                /* DAD failed for link-local based on MAC */
                                WRITE_ONCE(idev->cnf.disable_ipv6, 1);

                                pr_info("%s: IPv6 being disabled!\n",
                                        ifp->idev->dev->name);
                                disable_ipv6 = true;
                        }
                }
        }
        spin_unlock_bh(&ifp->lock);

        if (action == DAD_BEGIN) {
                addrconf_dad_begin(ifp);
                goto out;
        } else if (action == DAD_ABORT) {
                /* NOTE(review): presumably addrconf_dad_stop() drops this hold - confirm. */
                in6_ifa_hold(ifp);
                addrconf_dad_stop(ifp, 1);
                if (disable_ipv6)
                        addrconf_ifdown(idev->dev, false);
                goto out;
        }

        /* DAD_PROCESS: no probes left and addrconf_dad_end() says stop. */
        if (!ifp->dad_probes && addrconf_dad_end(ifp))
                goto out;

        write_lock_bh(&idev->lock);
        if (idev->dead || !(idev->if_flags & IF_READY)) {
                write_unlock_bh(&idev->lock);
                goto out;
        }

        spin_lock(&ifp->lock);
        if (ifp->state == INET6_IFADDR_STATE_DEAD) {
                spin_unlock(&ifp->lock);
                write_unlock_bh(&idev->lock);
                goto out;
        }

        if (ifp->dad_probes == 0) {
                bool send_na = false;

                /*
                 * DAD was successful
                 */

                /* Sample the flags before they are cleared below. */
                if (ifp->flags & IFA_F_TENTATIVE &&
                    !(ifp->flags & IFA_F_OPTIMISTIC))
                        send_na = true;
                bump_id = ifp->flags & IFA_F_TENTATIVE;
                ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
                spin_unlock(&ifp->lock);
                write_unlock_bh(&idev->lock);

                addrconf_dad_completed(ifp, bump_id, send_na);

                goto out;
        }

        /* One more probe: re-arm after RETRANS_TIME, but at least HZ/100. */
        ifp->dad_probes--;
        addrconf_mod_dad_work(ifp,
                              max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
                                  HZ/100));
        spin_unlock(&ifp->lock);
        write_unlock_bh(&idev->lock);

        /* send a neighbour solicitation for our addr */
        addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
        ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any,
                      ifp->dad_nonce);
out:
        in6_ifa_put(ifp);
        rtnl_net_unlock(net);
}

/*
 * Return true when no address other than @ifp on the same device is
 * link-scoped, permanent and free of the TENTATIVE, OPTIMISTIC and
 * DADFAILED flags. The address list is scanned from its tail and the scan
 * ends at the first entry whose scope is greater than IFA_LINK.
 *
 * ifp->idev must be at least read locked.
 */
static bool ipv6_lonely_lladdr(struct inet6_ifaddr *ifp)
{
        struct inet6_dev *idev = ifp->idev;
        struct inet6_ifaddr *other;

        list_for_each_entry_reverse(other, &idev->addr_list, if_list) {
                if (other->scope > IFA_LINK)
                        break;

                if (other == ifp || other->scope != IFA_LINK)
                        continue;

                if ((other->flags & (IFA_F_PERMANENT|IFA_F_TENTATIVE|
                                     IFA_F_OPTIMISTIC|IFA_F_DADFAILED)) ==
                    IFA_F_PERMANENT)
                        return false;
        }

        return true;
}

/*
 * Finish DAD for @ifp once the address is considered valid.
 *
 * @bump_id: when true, rt_genid_bump_ipv6() is called for the netns.
 * @send_na: when true and ndisc_notify is enabled (per device or
 *           globally), an unsolicited NA is sent to the all-nodes group.
 *
 * Cancels the DAD work, announces the address with RTM_NEWADDR, and for
 * the first usable link-local address on the device also triggers
 * ipv6_mc_dad_complete() and, when RAs are accepted, the first Router
 * Solicitation plus the RS retransmit timer.
 */
static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
                                   bool send_na)
{
        struct net_device *dev = ifp->idev->dev;
        struct in6_addr lladdr;
        bool send_rs, send_mld;

        addrconf_del_dad_work(ifp);

        /*
         *        Configure the address for reception. Now it is valid.
         */

        ipv6_ifa_notify(RTM_NEWADDR, ifp);

        /* If added prefix is link local and we are prepared to process
           router advertisements, start sending router solicitations.
         */

        read_lock_bh(&ifp->idev->lock);
        send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
        /* No RS with rtr_solicits == 0, on loopback, ARPHRD_TUNNEL or team ports. */
        send_rs = send_mld &&
                  ipv6_accept_ra(ifp->idev) &&
                  READ_ONCE(ifp->idev->cnf.rtr_solicits) != 0 &&
                  (dev->flags & IFF_LOOPBACK) == 0 &&
                  (dev->type != ARPHRD_TUNNEL) &&
                  !netif_is_team_port(dev);
        read_unlock_bh(&ifp->idev->lock);

        /* While DAD is in progress the MLD report's source address is in6addr_any.
         * Resend with the proper link-local address now.
         */
        if (send_mld)
                ipv6_mc_dad_complete(ifp->idev);

        /* send unsolicited NA if enabled */
        if (send_na &&
            (READ_ONCE(ifp->idev->cnf.ndisc_notify) ||
             READ_ONCE(dev_net(dev)->ipv6.devconf_all->ndisc_notify))) {
                ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr,
                              /*router=*/ !!ifp->idev->cnf.forwarding,
                              /*solicited=*/ false, /*override=*/ true,
                              /*inc_opt=*/ true);
        }

        if (send_rs) {
                /*
                 *        If a host has already performed a random delay
                 *        [...] as part of DAD [...] there is no need
                 *        to delay again before sending the first RS
                 */
                /*
                 * NOTE(review): this early return also skips the bump_id and
                 * IFA_F_TEMPORARY handling at the end of the function -
                 * confirm that is intended.
                 */
                if (ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
                        return;
                ndisc_send_rs(dev, &lladdr, &in6addr_linklocal_allrouters);

                /* Record the first probe and start the RS retransmit timer. */
                write_lock_bh(&ifp->idev->lock);
                spin_lock(&ifp->lock);
                ifp->idev->rs_interval = rfc3315_s14_backoff_init(
                        READ_ONCE(ifp->idev->cnf.rtr_solicit_interval));
                ifp->idev->rs_probes = 1;
                ifp->idev->if_flags |= IF_RS_SENT;
                addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
                spin_unlock(&ifp->lock);
                write_unlock_bh(&ifp->idev->lock);
        }

        if (bump_id)
                rt_genid_bump_ipv6(dev_net(dev));

        /* Make sure that a new temporary address will be created
         * before this temporary address becomes deprecated.
         */
        if (ifp->flags & IFA_F_TEMPORARY)
                addrconf_verify_rtnl(dev_net(dev));
}

/*
 * Kick DAD for the addresses of @idev.
 *
 * With @restart set, every address is put back into PREDAD and kicked.
 * Otherwise only addresses that are tentative and currently in the DAD
 * state are kicked. The list is walked under idev->lock (read) and each
 * address is handled under its own ifp->lock.
 */
static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
{
        struct inet6_ifaddr *ifa;

        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                bool kick;

                spin_lock(&ifa->lock);
                if (restart) {
                        ifa->state = INET6_IFADDR_STATE_PREDAD;
                        kick = true;
                } else {
                        kick = (ifa->flags & IFA_F_TENTATIVE) &&
                               ifa->state == INET6_IFADDR_STATE_DAD;
                }
                if (kick)
                        addrconf_dad_kick(ifa);
                spin_unlock(&ifa->lock);
        }
        read_unlock_bh(&idev->lock);
}

#ifdef CONFIG_PROC_FS
/* Per-open iteration cursor for the "if_inet6" seq_file. */
struct if6_iter_state {
        struct seq_net_private p;
        /* Index of the inet6_addr_lst hash bucket currently being walked. */
        int bucket;
        /* Number of entries to skip inside that bucket when resuming. */
        int offset;
};

/*
 * Position the iterator and return the entry it points at.
 *
 * A @pos of 0 rewinds the cursor to bucket 0, offset 0. Otherwise the
 * walk resumes at the saved bucket, skipping state->offset entries in it.
 * Buckets that run out are stepped over with the offset reset to 0.
 * Returns NULL once all IN6_ADDR_HSIZE buckets are exhausted.
 */
static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
{
        struct if6_iter_state *state = seq->private;
        struct net *net = seq_file_net(seq);
        struct inet6_ifaddr *ifa = NULL;
        int skip;

        /* initial bucket if pos is 0 */
        if (pos == 0) {
                state->bucket = 0;
                state->offset = 0;
        }

        /* Entries still to be skipped in the current bucket. */
        skip = state->offset;

        while (state->bucket < IN6_ADDR_HSIZE) {
                hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket],
                                         addr_lst) {
                        if (skip > 0) {
                                skip--;
                                continue;
                        }
                        return ifa;
                }

                /* Bucket exhausted: the next one starts from its head. */
                state->offset = 0;
                skip = 0;
                ++state->bucket;
        }
        return NULL;
}

/*
 * Return the entry following @ifa, or NULL at the end of the table.
 *
 * If @ifa has a successor in its own bucket, the saved offset is advanced
 * and that successor is returned. Otherwise the offset is reset and the
 * first entry of the next non-empty bucket is returned.
 */
static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
                                         struct inet6_ifaddr *ifa)
{
        struct if6_iter_state *state = seq->private;
        struct net *net = seq_file_net(seq);

        /* Same bucket: only the first successor is ever looked at. */
        hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
                state->offset++;
                return ifa;
        }

        state->offset = 0;
        for (state->bucket++; state->bucket < IN6_ADDR_HSIZE; state->bucket++) {
                hlist_for_each_entry_rcu(ifa,
                                     &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
                        return ifa;
                }
        }

        return NULL;
}

/*
 * seq_file ->start: enter the RCU read-side section and return the entry
 * for position *pos (NULL when there is none). The section is left again
 * in if6_seq_stop().
 */
static void *if6_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(rcu)
{
        rcu_read_lock();
        return if6_get_first(seq, *pos);
}

/*
 * seq_file ->next: step to the entry after @v and advance *pos by one.
 * Returns the next entry, or NULL at the end of the table.
 */
static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct inet6_ifaddr *next = if6_get_next(seq, v);

        (*pos)++;
        return next;
}

/* seq_file ->stop: leave the RCU read-side section entered in if6_seq_start(). */
static void if6_seq_stop(struct seq_file *seq, void *v)
        __releases(rcu)
{
        rcu_read_unlock();
}

static int if6_seq_show(struct seq_file *seq, void *v)
{
        struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v;
        seq_printf(seq, "%pi6 %02x %02x %02x %02x %8s\n",
                   &ifp->addr,
                   ifp->idev->dev->ifindex,
                   ifp->prefix_len,
                   ifp->scope,
                   (u8) ifp->flags,
                   ifp->idev->dev->name);
        return 0;
}

/* seq_file callbacks used for the "if_inet6" proc entry. */
static const struct seq_operations if6_seq_ops = {
        .start        = if6_seq_start,
        .next        = if6_seq_next,
        .show        = if6_seq_show,
        .stop        = if6_seq_stop,
};

/*
 * Per-netns init: create the read-only (0444) "if_inet6" entry under the
 * namespace's proc_net directory. Returns 0 on success, -ENOMEM when the
 * entry could not be created.
 */
static int __net_init if6_proc_net_init(struct net *net)
{
        if (proc_create_net("if_inet6", 0444, net->proc_net, &if6_seq_ops,
                        sizeof(struct if6_iter_state)))
                return 0;

        return -ENOMEM;
}

/* Per-netns exit: remove the "if_inet6" entry created by if6_proc_net_init(). */
static void __net_exit if6_proc_net_exit(struct net *net)
{
        remove_proc_entry("if_inet6", net->proc_net);
}

/* Per-netns hooks that create and remove the "if_inet6" proc entry. */
static struct pernet_operations if6_proc_net_ops = {
        .init = if6_proc_net_init,
        .exit = if6_proc_net_exit,
};

/*
 * Register if6_proc_net_ops as a pernet subsystem.
 * Returns the result of register_pernet_subsys().
 */
int __init if6_proc_init(void)
{
        return register_pernet_subsys(&if6_proc_net_ops);
}

/* Unregister the pernet subsystem registered by if6_proc_init(). */
void if6_proc_exit(void)
{
        unregister_pernet_subsys(&if6_proc_net_ops);
}
#endif        /* CONFIG_PROC_FS */

#if IS_ENABLED(CONFIG_IPV6_MIP6)
/*
 * Check whether @addr is configured in @net with IFA_F_HOMEADDRESS set.
 * Only the hash bucket for @addr is searched, under rcu_read_lock().
 * Returns 1 when such an address exists, 0 otherwise.
 */
int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
{
        unsigned int hash = inet6_addr_hash(net, addr);
        struct inet6_ifaddr *ifp = NULL;
        int ret = 0;

        rcu_read_lock();
        hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                if (!ipv6_addr_equal(&ifp->addr, addr))
                        continue;
                if (!(ifp->flags & IFA_F_HOMEADDRESS))
                        continue;

                ret = 1;
                break;
        }
        rcu_read_unlock();

        return ret;
}
#endif

/* RFC6554 describes a check to avoid loops in source routing: look
 * whether the segment list contains addresses of local interfaces.
 *
 * Quote:
 *
 * To detect loops in the SRH, a router MUST determine if the SRH
 * includes multiple addresses assigned to any interface on that router.
 * If such addresses appear more than once and are separated by at least
 * one address not assigned to that router.
 *
 * Walks the @nsegs entries of @segs in order. Returns 1 when a segment
 * matching a local address is reached after more than one earlier match
 * and with a non-local segment seen since the previous match, 0 otherwise.
 *
 * NOTE(review): because of the "found > 1" test, the sequence local,
 * foreign, local is not reported as a loop - confirm this is intended.
 */
int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
                          unsigned char nsegs)
{
        const struct in6_addr *addr;
        int i, ret = 0, found = 0;
        struct inet6_ifaddr *ifp;
        bool separated = false;
        unsigned int hash;
        bool hash_found;

        rcu_read_lock();
        for (i = 0; i < nsegs; i++) {
                addr = &segs[i];
                hash = inet6_addr_hash(net, addr);

                /* Is this segment one of our own addresses? */
                hash_found = false;
                hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
                        if (!ipv6_addr_equal(&ifp->addr, addr))
                                continue;

                        hash_found = true;
                        break;
                }

                if (!hash_found) {
                        separated = true;
                        continue;
                }

                if (found > 1 && separated) {
                        ret = 1;
                        break;
                }

                separated = false;
                found++;
        }
        rcu_read_unlock();

        return ret;
}

/*
 *        Periodic address status verification
 */

/*
 * Walk every address in the netns hash table and act on its lifetimes:
 * regenerate temporary addresses that are about to lose their preferred
 * lifetime, delete addresses whose valid lifetime has elapsed, and mark
 * addresses past their preferred lifetime as deprecated. Finally
 * re-schedule addr_chk_work for the earliest upcoming event.
 *
 * Must be called with RTNL held (see ASSERT_RTNL()). The walk runs in an
 * RCU-bh read-side section; whenever that section is left, or a
 * deprecation notification is sent, the current bucket is rescanned from
 * its head ("goto restart").
 */
static void addrconf_verify_rtnl(struct net *net)
{
        unsigned long now, next, next_sec, next_sched;
        struct inet6_ifaddr *ifp;
        int i;

        ASSERT_RTNL();

        rcu_read_lock_bh();
        now = jiffies;
        /* Default: check again after ADDR_CHECK_FREQUENCY; lowered below. */
        next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);

        cancel_delayed_work(&net->ipv6.addr_chk_work);

        for (i = 0; i < IN6_ADDR_HSIZE; i++) {
restart:
                hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
                        unsigned long age;

                        /* When setting preferred_lft to a value not zero or
                         * infinity, while valid_lft is infinity
                         * IFA_F_PERMANENT has a non-infinity life time.
                         */
                        if ((ifp->flags & IFA_F_PERMANENT) &&
                            (ifp->prefered_lft == INFINITY_LIFE_TIME))
                                continue;

                        spin_lock(&ifp->lock);
                        /* We try to batch several events at once. */
                        /* Age in seconds since ifp->tstamp, with a small fuzz added. */
                        age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;

                        if ((ifp->flags&IFA_F_TEMPORARY) &&
                            !(ifp->flags&IFA_F_TENTATIVE) &&
                            ifp->prefered_lft != INFINITY_LIFE_TIME &&
                            !ifp->regen_count && ifp->ifpub) {
                                /* This is a non-regenerated temporary addr. */

                                unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev);

                                if (age + regen_advance >= ifp->prefered_lft) {
                                        struct inet6_ifaddr *ifpub = ifp->ifpub;
                                        if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
                                                next = ifp->tstamp + ifp->prefered_lft * HZ;

                                        /*
                                         * Pin both addresses, then leave the
                                         * RCU section to create the
                                         * replacement temporary address.
                                         */
                                        ifp->regen_count++;
                                        in6_ifa_hold(ifp);
                                        in6_ifa_hold(ifpub);
                                        spin_unlock(&ifp->lock);

                                        spin_lock(&ifpub->lock);
                                        ifpub->regen_count = 0;
                                        spin_unlock(&ifpub->lock);
                                        rcu_read_unlock_bh();
                                        ipv6_create_tempaddr(ifpub, true);
                                        in6_ifa_put(ifpub);
                                        in6_ifa_put(ifp);
                                        rcu_read_lock_bh();
                                        goto restart;
                                } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
                                        next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
                        }

                        if (ifp->valid_lft != INFINITY_LIFE_TIME &&
                            age >= ifp->valid_lft) {
                                /* Valid lifetime elapsed: delete the address. */
                                /* NOTE(review): presumably ipv6_del_addr() drops this hold - confirm. */
                                spin_unlock(&ifp->lock);
                                in6_ifa_hold(ifp);
                                rcu_read_unlock_bh();
                                ipv6_del_addr(ifp);
                                rcu_read_lock_bh();
                                goto restart;
                        } else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
                                spin_unlock(&ifp->lock);
                                continue;
                        } else if (age >= ifp->prefered_lft) {
                                /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
                                int deprecate = 0;

                                if (!(ifp->flags&IFA_F_DEPRECATED)) {
                                        deprecate = 1;
                                        ifp->flags |= IFA_F_DEPRECATED;
                                }

                                if ((ifp->valid_lft != INFINITY_LIFE_TIME) &&
                                    (time_before(ifp->tstamp + ifp->valid_lft * HZ, next)))
                                        next = ifp->tstamp + ifp->valid_lft * HZ;

                                spin_unlock(&ifp->lock);

                                /* Notify only on the transition to deprecated. */
                                if (deprecate) {
                                        in6_ifa_hold(ifp);

                                        ipv6_ifa_notify(0, ifp);
                                        in6_ifa_put(ifp);
                                        goto restart;
                                }
                        } else {
                                /* ifp->prefered_lft <= ifp->valid_lft */
                                if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
                                        next = ifp->tstamp + ifp->prefered_lft * HZ;
                                spin_unlock(&ifp->lock);
                        }
                }
        }

        next_sec = round_jiffies_up(next);
        next_sched = next;

        /* If rounded timeout is accurate enough, accept it. */
        if (time_before(next_sec, next + ADDRCONF_TIMER_FUZZ))
                next_sched = next_sec;

        /* And minimum interval is ADDRCONF_TIMER_FUZZ_MAX. */
        if (time_before(next_sched, jiffies + ADDRCONF_TIMER_FUZZ_MAX))
                next_sched = jiffies + ADDRCONF_TIMER_FUZZ_MAX;

        pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
                 now, next, next_sec, next_sched);
        /* The delay is relative to "now" as sampled at function entry. */
        mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
        rcu_read_unlock_bh();
}

/*
 * Work handler for the per-netns addr_chk_work item: recover the owning
 * struct net from the work pointer and run addrconf_verify_rtnl() under
 * rtnl_net_lock().
 */
static void addrconf_verify_work(struct work_struct *w)
{
        struct net *net = container_of(to_delayed_work(w), struct net,
                                       ipv6.addr_chk_work);

        rtnl_net_lock(net);
        addrconf_verify_rtnl(net);
        rtnl_net_unlock(net);
}

/*
 * Request an address lifetime check for @net by (re)scheduling
 * addr_chk_work on addrconf_wq with a delay of 0.
 */
static void addrconf_verify(struct net *net)
{
        mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
}

/*
 * Pick the address to operate on from the IFA_ADDRESS (@addr) and
 * IFA_LOCAL (@local) attributes, either of which may be NULL.
 *
 * Returns the payload of @local when present, else the payload of @addr,
 * else NULL. *@peer_pfx is set to the payload of @addr only when both
 * attributes are present and differ; otherwise it is set to NULL.
 */
static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
                                     struct in6_addr **peer_pfx)
{
        struct in6_addr *remote = addr ? nla_data(addr) : NULL;

        *peer_pfx = NULL;

        if (!local)
                return remote;

        /* A differing IFA_ADDRESS alongside IFA_LOCAL is the peer address. */
        if (remote && nla_memcmp(local, remote, sizeof(*remote)))
                *peer_pfx = remote;

        return nla_data(local);
}

/*
 * Netlink attribute policy used when parsing IPv6 address messages;
 * entries give either the attribute's length or its type.
 */
static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
        [IFA_ADDRESS]                = { .len = sizeof(struct in6_addr) },
        [IFA_LOCAL]                = { .len = sizeof(struct in6_addr) },
        [IFA_CACHEINFO]                = { .len = sizeof(struct ifa_cacheinfo) },
        [IFA_FLAGS]                = { .len = sizeof(u32) },
        [IFA_RT_PRIORITY]        = { .len = sizeof(u32) },
        [IFA_TARGET_NETNSID]        = { .type = NLA_S32 },
        [IFA_PROTO]                = { .type = NLA_U8 },
};

/*
 * Netlink handler that deletes an IPv6 address.
 *
 * Parses the message against ifa_ipv6_policy, picks the address via
 * extract_addr() and calls inet6_addr_del() under rtnl_net_lock().
 * Returns the parse error if parsing fails, -EINVAL when no address
 * attribute is present, otherwise the result of inet6_addr_del().
 */
static int
inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
                  struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct in6_addr *pfx, *peer_pfx;
        struct nlattr *tb[IFA_MAX+1];
        struct ifaddrmsg *ifm;
        u32 ifa_flags;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                     ifa_ipv6_policy, extack);
        if (err < 0)
                return err;

        pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
        if (!pfx)
                return -EINVAL;

        ifm = nlmsg_data(nlh);

        /*
         * Flags come from IFA_FLAGS when present, else from the header.
         * We ignore all flags but IFA_F_MANAGETEMPADDR so far.
         */
        ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags) &
                    IFA_F_MANAGETEMPADDR;

        rtnl_net_lock(net);
        err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
                             ifm->ifa_prefixlen, extack);
        rtnl_net_unlock(net);

        return err;
}

/*
 * Bring the prefix route of @ifp in line with new settings.
 *
 * @expires:     expiry passed on to the route.
 * @flags:       route flags; RTF_EXPIRES selects whether an expiry is set.
 * @modify_peer: operate on the route for ifp->peer_addr instead of
 *               ifp->addr.
 *
 * If the existing route's metric differs from the wanted priority
 * (ifp->rt_priority, or IP6_RT_PRIO_ADDRCONF when that is 0) the route is
 * deleted and re-added; otherwise only its expiry is updated.
 *
 * Returns 0 on success, -ENOENT when no matching prefix route exists.
 */
static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp,
                               unsigned long expires, u32 flags,
                               bool modify_peer)
{
        struct fib6_table *table;
        struct fib6_info *f6i;
        u32 prio;

        f6i = addrconf_get_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
                                        ifp->prefix_len,
                                        ifp->idev->dev, 0, RTF_DEFAULT, true);
        if (!f6i)
                return -ENOENT;

        prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
        if (f6i->fib6_metric != prio) {
                /* delete old one */
                /*
                 * NOTE(review): this branch returns without
                 * fib6_info_release(); presumably ip6_del_rt() consumes
                 * the reference - confirm.
                 */
                ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);

                /* add new one */
                addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
                                      ifp->prefix_len,
                                      ifp->rt_priority, ifp->idev->dev,
                                      expires, flags, GFP_KERNEL);
                return 0;
        }
        /* Same metric: adjust the expiry in place, except on the null entry. */
        if (f6i != net->ipv6.fib6_null_entry) {
                table = f6i->fib6_table;
                spin_lock_bh(&table->tb6_lock);

                if (!(flags & RTF_EXPIRES)) {
                        fib6_clean_expires(f6i);
                        fib6_remove_gc_list(f6i);
                } else {
                        fib6_set_expires(f6i, expires);
                        fib6_add_gc_list(f6i);
                }

                spin_unlock_bh(&table->tb6_lock);
        }
        fib6_info_release(f6i);

        return 0;
}

/*
 * Apply a new configuration @cfg to the existing address @ifp.
 *
 * Updates flags, lifetimes, protocol, route priority and peer address
 * under ifp->lock, then fixes up the prefix routes, the managed temporary
 * addresses, and finally re-runs the lifetime verification.
 *
 * @expires and @flags are passed through to the prefix route helpers.
 * Note that cfg->ifa_flags may be modified (IFA_F_OPTIMISTIC is cleared
 * when the address is not tentative or has failed DAD).
 *
 * Must be called with the netns RTNL held (see ASSERT_RTNL_NET()).
 * Returns 0 on success, -EINVAL when IFA_F_MANAGETEMPADDR is requested
 * for a temporary address or for a prefix length other than 64.
 */
static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
                             struct ifa6_config *cfg, clock_t expires,
                             u32 flags)
{
        bool was_managetempaddr;
        bool new_peer = false;
        bool had_prefixroute;

        ASSERT_RTNL_NET(net);

        if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
            (ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
                return -EINVAL;

        if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
                cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;

        /* Peer changed: clean up the old peer's prefix route, if there was one. */
        if (cfg->peer_pfx &&
            memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) {
                if (!ipv6_addr_any(&ifp->peer_addr))
                        cleanup_prefix_route(ifp, expires, true, true);
                new_peer = true;
        }

        spin_lock_bh(&ifp->lock);
        /* Remember the old state before the flags are rewritten. */
        was_managetempaddr = ifp->flags & IFA_F_MANAGETEMPADDR;
        had_prefixroute = ifp->flags & IFA_F_PERMANENT &&
                          !(ifp->flags & IFA_F_NOPREFIXROUTE);
        ifp->flags &= ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD |
                        IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
                        IFA_F_NOPREFIXROUTE);
        ifp->flags |= cfg->ifa_flags;
        WRITE_ONCE(ifp->tstamp, jiffies);
        WRITE_ONCE(ifp->valid_lft, cfg->valid_lft);
        WRITE_ONCE(ifp->prefered_lft, cfg->preferred_lft);
        WRITE_ONCE(ifp->ifa_proto, cfg->ifa_proto);

        /* A zero rt_priority in cfg leaves the current priority untouched. */
        if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
                WRITE_ONCE(ifp->rt_priority, cfg->rt_priority);

        if (new_peer)
                ifp->peer_addr = *cfg->peer_pfx;

        spin_unlock_bh(&ifp->lock);
        if (!(ifp->flags&IFA_F_TENTATIVE))
                ipv6_ifa_notify(0, ifp);

        if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
                int rc = -ENOENT;

                if (had_prefixroute)
                        rc = modify_prefix_route(net, ifp, expires, flags, false);

                /* prefix route could have been deleted; if so restore it */
                if (rc == -ENOENT) {
                        addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
                                              ifp->rt_priority, ifp->idev->dev,
                                              expires, flags, GFP_KERNEL);
                }

                /* Same treatment for the peer address route, when a peer is set. */
                if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr))
                        rc = modify_prefix_route(net, ifp, expires, flags, true);

                if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) {
                        addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len,
                                              ifp->rt_priority, ifp->idev->dev,
                                              expires, flags, GFP_KERNEL);
                }
        } else if (had_prefixroute) {
                /* IFA_F_NOPREFIXROUTE newly requested: drop or expire the old route. */
                enum cleanup_prefix_rt_t action;
                unsigned long rt_expires;

                write_lock_bh(&ifp->idev->lock);
                action = check_cleanup_prefix_route(ifp, &rt_expires);
                write_unlock_bh(&ifp->idev->lock);

                if (action != CLEANUP_PREFIX_RT_NOP) {
                        cleanup_prefix_route(ifp, rt_expires,
                                action == CLEANUP_PREFIX_RT_DEL, false);
                }
        }

        /* Temporary addresses: delete when the flag was removed, else refresh. */
        if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
                if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
                        delete_tempaddrs(ifp->idev, ifp);
                else
                        manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
                                         cfg->preferred_lft, !was_managetempaddr,
                                         jiffies);
        }

        addrconf_verify_rtnl(net);

        return 0;
}

/*
 * RTM_NEWADDR handler: add an IPv6 address to an interface, or modify it
 * when it already exists and the request carries NLM_F_REPLACE without
 * NLM_F_EXCL.
 *
 * The message is fully parsed and validated before any lock is taken; the
 * device lookup and the add/modify step run under rtnl_net_lock() and
 * netdev_lock_ops().
 *
 * Returns 0 on success or a negative errno: the attribute parse error,
 * -EINVAL (no address, invalid lifetime, or NODAD combined with OPTIMISTIC),
 * -ENODEV (unknown ifindex), the error from ipv6_find_idev(), -EEXIST (the
 * address exists and replacement was not requested), or the result of
 * inet6_addr_add() / inet6_addr_modify().
 */
static int
inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
                  struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(skb->sk);
        struct nlattr *tb[IFA_MAX+1];
        struct in6_addr *peer_pfx;
        struct inet6_ifaddr *ifa;
        struct net_device *dev;
        struct inet6_dev *idev;
        struct ifa6_config cfg;
        struct ifaddrmsg *ifm;
        unsigned long timeout;
        clock_t expires;
        u32 flags;
        int err;

        err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                     ifa_ipv6_policy, extack);
        if (err < 0)
                return err;

        /* Start from an all-zero config; optional attributes fill it in. */
        memset(&cfg, 0, sizeof(cfg));

        ifm = nlmsg_data(nlh);
        cfg.pfx = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer_pfx);
        if (!cfg.pfx)
                return -EINVAL;

        cfg.peer_pfx = peer_pfx;
        cfg.plen = ifm->ifa_prefixlen;
        if (tb[IFA_RT_PRIORITY])
                cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);

        if (tb[IFA_PROTO])
                cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]);

        /* IFA_FLAGS, when present, overrides the flags byte of the header. */
        cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);

        /* We ignore other flags so far. */
        cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
                         IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
                         IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;

        /*
         * Defaults when no IFA_CACHEINFO is supplied: a permanent address
         * with infinite lifetimes and a prefix route that does not expire.
         */
        cfg.ifa_flags |= IFA_F_PERMANENT;
        cfg.valid_lft = INFINITY_LIFE_TIME;
        cfg.preferred_lft = INFINITY_LIFE_TIME;
        expires = 0;
        flags = 0;

        if (tb[IFA_CACHEINFO]) {
                struct ifa_cacheinfo *ci;

                ci = nla_data(tb[IFA_CACHEINFO]);
                cfg.valid_lft = ci->ifa_valid;
                cfg.preferred_lft = ci->ifa_prefered;

                /* A zero valid lifetime or preferred > valid is rejected. */
                if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) {
                        NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid");
                        return -EINVAL;
                }

                /*
                 * A finite valid lifetime makes the address non-permanent
                 * and gives the route an expiry (RTF_EXPIRES).
                 */
                timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ);
                if (addrconf_finite_timeout(timeout)) {
                        cfg.ifa_flags &= ~IFA_F_PERMANENT;
                        cfg.valid_lft = timeout;
                        expires = jiffies_to_clock_t(timeout * HZ);
                        flags = RTF_EXPIRES;
                }

                /* A preferred lifetime of zero marks the address deprecated. */
                timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ);
                if (addrconf_finite_timeout(timeout)) {
                        if (timeout == 0)
                                cfg.ifa_flags |= IFA_F_DEPRECATED;

                        cfg.preferred_lft = timeout;
                }
        }

        /* All early returns are above; from here errors go through the labels. */
        rtnl_net_lock(net);

        dev =  __dev_get_by_index(net, ifm->ifa_index);
        if (!dev) {
                NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
                err = -ENODEV;
                goto unlock_rtnl;
        }

        netdev_lock_ops(dev);
        idev = ipv6_find_idev(dev);
        if (IS_ERR(idev)) {
                err = PTR_ERR(idev);
                goto unlock;
        }

        /* Silently drop OPTIMISTIC when it is not allowed on this device. */
        if (!ipv6_allow_optimistic_dad(net, idev))
                cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;

        if (cfg.ifa_flags & IFA_F_NODAD &&
            cfg.ifa_flags & IFA_F_OPTIMISTIC) {
                NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
                err = -EINVAL;
                goto unlock;
        }

        ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
        if (!ifa) {
                /*
                 * It would be best to check for !NLM_F_CREATE here but
                 * userspace already relies on not having to provide this.
                 */
                err = inet6_addr_add(net, dev, &cfg, expires, flags, extack);
                goto unlock;
        }

        /* The address exists: only an explicit, non-exclusive replace may touch it. */
        if (nlh->nlmsg_flags & NLM_F_EXCL ||
            !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
                NL_SET_ERR_MSG_MOD(extack, "address already assigned");
                err = -EEXIST;
        } else {
                err = inet6_addr_modify(net, ifa, &cfg, expires, flags);
        }

        /* NOTE(review): presumably drops the reference ipv6_get_ifaddr() returned. */
        in6_ifa_put(ifa);
unlock:
        netdev_unlock_ops(dev);
unlock_rtnl:
        rtnl_net_unlock(net);

        return err;
}

/*
 * Write the ifaddrmsg header found at the start of @nlh's payload.  The
 * family is always AF_INET6; the remaining fields come from the arguments.
 */
static void put_ifaddrmsg(struct nlmsghdr *nlh, u8 prefixlen, u32 flags,
                          u8 scope, int ifindex)
{
        struct ifaddrmsg *hdr = nlmsg_data(nlh);

        hdr->ifa_index = ifindex;
        hdr->ifa_scope = scope;
        hdr->ifa_flags = flags;
        hdr->ifa_prefixlen = prefixlen;
        hdr->ifa_family = AF_INET6;
}

/*
 * Append an IFA_CACHEINFO attribute to @skb.  Both timestamps are passed
 * through cstamp_delta() before being stored; the lifetimes are stored as
 * given.  Returns whatever nla_put() returns.
 */
static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
                         unsigned long tstamp, u32 preferred, u32 valid)
{
        struct ifa_cacheinfo info;

        info.ifa_valid = valid;
        info.ifa_prefered = preferred;
        info.tstamp = cstamp_delta(tstamp);
        info.cstamp = cstamp_delta(cstamp);

        return nla_put(skb, IFA_CACHEINFO, sizeof(info), &info);
}

/*
 * Translate an address scope bitmask into an RT_SCOPE_* value.  The bits
 * are tested in the order host, link, site; anything else is reported as
 * universe scope.
 */
static inline int rt_scope(int ifa_scope)
{
        if (ifa_scope & IFA_HOST)
                return RT_SCOPE_HOST;

        if (ifa_scope & IFA_LINK)
                return RT_SCOPE_LINK;

        if (ifa_scope & IFA_SITE)
                return RT_SCOPE_SITE;

        return RT_SCOPE_UNIVERSE;
}

/*
 * Upper bound on the payload of one address message as produced by
 * inet6_fill_ifaddr(): the ifaddrmsg header plus every attribute that
 * function may emit.
 *
 * IFA_TARGET_NETNSID is included because inet6_fill_ifaddr() adds it
 * whenever the fill arguments carry a non-negative netnsid; leaving it out
 * would make the estimate too small for such replies.
 */
static inline int inet6_ifaddr_msgsize(void)
{
        return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
               + nla_total_size(4)  /* IFA_TARGET_NETNSID */
               + nla_total_size(16) /* IFA_LOCAL */
               + nla_total_size(16) /* IFA_ADDRESS */
               + nla_total_size(sizeof(struct ifa_cacheinfo))
               + nla_total_size(4)  /* IFA_FLAGS */
               + nla_total_size(1)  /* IFA_PROTO */
               + nla_total_size(4)  /* IFA_RT_PRIORITY */;
}

/*
 * Append one address message describing the unicast address @ifa to @skb.
 *
 * @args supplies the netlink header fields (portid, seq, event, flags) and
 * an optional target netns id, emitted as IFA_TARGET_NETNSID when >= 0.
 *
 * Attributes written: IFA_ADDRESS (plus IFA_LOCAL when a peer address is
 * set), IFA_RT_PRIORITY and IFA_PROTO when non-zero, IFA_CACHEINFO and
 * IFA_FLAGS.
 *
 * Returns 0 on success.  When @skb runs out of room the partially built
 * message is cancelled and -EMSGSIZE is returned.
 */
static int inet6_fill_ifaddr(struct sk_buff *skb,
                             const struct inet6_ifaddr *ifa,
                             struct inet6_fill_args *args)
{
        struct nlmsghdr *nlh;
        u32 preferred, valid;
        u32 flags, priority;
        u8 proto;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        /*
         * Read the flags once and use that single snapshot for the header,
         * the lifetime logic and the IFA_FLAGS attribute, so the message is
         * self-consistent even if ifa->flags changes while it is built.
         */
        flags = READ_ONCE(ifa->flags);
        put_ifaddrmsg(nlh, ifa->prefix_len, flags, rt_scope(ifa->scope),
                      ifa->idev->dev->ifindex);

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto error;

        preferred = READ_ONCE(ifa->prefered_lft);
        valid = READ_ONCE(ifa->valid_lft);

        if (!((flags & IFA_F_PERMANENT) &&
              (preferred == INFINITY_LIFE_TIME))) {
                if (preferred != INFINITY_LIFE_TIME) {
                        /* Seconds elapsed since the address was last updated. */
                        long tval = (jiffies - READ_ONCE(ifa->tstamp)) / HZ;

                        /* Report remaining lifetimes, clamped at zero. */
                        if (preferred > tval)
                                preferred -= tval;
                        else
                                preferred = 0;
                        if (valid != INFINITY_LIFE_TIME) {
                                if (valid > tval)
                                        valid -= tval;
                                else
                                        valid = 0;
                        }
                }
        } else {
                /* Permanent address with infinite preferred lifetime. */
                preferred = INFINITY_LIFE_TIME;
                valid = INFINITY_LIFE_TIME;
        }

        /* With a peer, IFA_LOCAL is our address and IFA_ADDRESS the peer's. */
        if (!ipv6_addr_any(&ifa->peer_addr)) {
                if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 ||
                    nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0)
                        goto error;
        } else {
                if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
                        goto error;
        }

        priority = READ_ONCE(ifa->rt_priority);
        if (priority && nla_put_u32(skb, IFA_RT_PRIORITY, priority))
                goto error;

        if (put_cacheinfo(skb, ifa->cstamp, READ_ONCE(ifa->tstamp),
                          preferred, valid) < 0)
                goto error;

        if (nla_put_u32(skb, IFA_FLAGS, flags) < 0)
                goto error;

        proto = READ_ONCE(ifa->ifa_proto);
        if (proto && nla_put_u8(skb, IFA_PROTO, proto))
                goto error;

        nlmsg_end(skb, nlh);
        return 0;

error:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Append one message describing the multicast address @ifmca to @skb.
 *
 * The scope is reported as site scope only when the address has the
 * IFA_SITE bit and @args->force_rt_scope_universe is not set; otherwise it
 * is universe scope.  The address is always reported as a permanent /128
 * with infinite lifetimes.
 *
 * Returns 0 on success, or -EMSGSIZE (after cancelling the partial message)
 * when @skb has no room.
 */
int inet6_fill_ifmcaddr(struct sk_buff *skb,
                        const struct ifmcaddr6 *ifmca,
                        struct inet6_fill_args *args)
{
        int ifindex = ifmca->idev->dev->ifindex;
        struct nlmsghdr *nlh;
        u8 scope;

        if (args->force_rt_scope_universe)
                scope = RT_SCOPE_UNIVERSE;
        else if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
                scope = RT_SCOPE_SITE;
        else
                scope = RT_SCOPE_UNIVERSE;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto cancel;

        put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);

        if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0)
                goto cancel;

        if (put_cacheinfo(skb, ifmca->mca_cstamp, READ_ONCE(ifmca->mca_tstamp),
                          INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Append one message describing the anycast address @ifaca to @skb.
 *
 * The interface index comes from the device of the address's route
 * (fib6_info_nh_dev()); when that yields no device, index 1 is reported.
 * The address is always reported as a permanent /128 with infinite
 * lifetimes, site scope if it has the IFA_SITE bit and universe otherwise.
 *
 * Returns 0 on success, or -EMSGSIZE (after cancelling the partial message)
 * when @skb has no room.
 */
int inet6_fill_ifacaddr(struct sk_buff *skb,
                        const struct ifacaddr6 *ifaca,
                        struct inet6_fill_args *args)
{
        struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
        struct nlmsghdr *nlh;
        int ifindex;
        u8 scope;

        if (dev)
                ifindex = dev->ifindex;
        else
                ifindex = 1;

        if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
                scope = RT_SCOPE_SITE;
        else
                scope = RT_SCOPE_UNIVERSE;

        nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
                        sizeof(struct ifaddrmsg), args->flags);
        if (!nlh)
                return -EMSGSIZE;

        if (args->netnsid >= 0 &&
            nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
                goto cancel;

        put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);

        if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0)
                goto cancel;

        if (put_cacheinfo(skb, ifaca->aca_cstamp, READ_ONCE(ifaca->aca_tstamp),
                          INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0)
                goto cancel;

        nlmsg_end(skb, nlh);
        return 0;

cancel:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

/*
 * Dump the addresses of one inet6 device into @skb; called with
 * rcu_read_lock() held.
 *
 * @fillargs->type selects the unicast, multicast or anycast list and this
 * function sets @fillargs->event to match.  Entries with a position below
 * *@s_ip_idx are skipped so an interrupted dump can resume.
 *
 * On return *@s_ip_idx holds the position of the entry that failed to fit
 * when an error occurred, or 0 when the list was walked completely.
 * Returns 0 or the error from the fill helper.
 */
static int in6_dump_addrs(const struct inet6_dev *idev, struct sk_buff *skb,
                          struct netlink_callback *cb, int *s_ip_idx,
                          struct inet6_fill_args *fillargs)
{
        const struct ifmcaddr6 *mc;
        const struct ifacaddr6 *ac;
        int pos = 0;
        int err = 0;

        switch (fillargs->type) {
        case UNICAST_ADDR: {
                const struct inet6_ifaddr *ifa;

                fillargs->event = RTM_NEWADDR;

                /* unicast address incl. temp addr */
                list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
                        if (pos >= *s_ip_idx) {
                                err = inet6_fill_ifaddr(skb, ifa, fillargs);
                                if (err < 0)
                                        break;
                                nl_dump_check_consistent(cb, nlmsg_hdr(skb));
                        }
                        pos++;
                }
                break;
        }
        case MULTICAST_ADDR:
                fillargs->event = RTM_GETMULTICAST;

                /* multicast address */
                for (mc = rcu_dereference(idev->mc_list); mc;
                     mc = rcu_dereference(mc->next), pos++) {
                        if (pos >= *s_ip_idx) {
                                err = inet6_fill_ifmcaddr(skb, mc, fillargs);
                                if (err < 0)
                                        break;
                        }
                }
                break;
        case ANYCAST_ADDR:
                fillargs->event = RTM_GETANYCAST;

                /* anycast address */
                for (ac = rcu_dereference(idev->ac_list); ac;
                     ac = rcu_dereference(ac->aca_next), pos++) {
                        if (pos >= *s_ip_idx) {
                                err = inet6_fill_ifacaddr(skb, ac, fillargs);
                                if (err < 0)
                                        break;
                        }
                }
                break;
        default:
                break;
        }

        /* Remember where to resume only if the dump was cut short. */
        if (err)
                *s_ip_idx = pos;
        else
                *s_ip_idx = 0;

        return err;
}

/*
 * Strict validation of an address dump request.
 *
 * Rejects a short header and non-zero prefixlen/flags/scope.  A non-zero
 * ifa_index is copied to @fillargs->ifindex and marks the dump as filtered.
 * The only attribute accepted is IFA_TARGET_NETNSID; it is stored in
 * @fillargs->netnsid and the namespace it resolves to is returned through
 * @tgt_net.
 *
 * Returns 0 on success, otherwise a negative errno with @cb->extack set
 * for the errors detected here.
 */
static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
                                       struct inet6_fill_args *fillargs,
                                       struct net **tgt_net, struct sock *sk,
                                       struct netlink_callback *cb)
{
        struct netlink_ext_ack *extack = cb->extack;
        struct nlattr *tb[IFA_MAX+1];
        struct ifaddrmsg *ifm;
        int attr;
        int err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
                return -EINVAL;
        }

        fillargs->ifindex = ifm->ifa_index;
        if (fillargs->ifindex) {
                cb->answer_flags |= NLM_F_DUMP_FILTERED;
                fillargs->flags |= NLM_F_DUMP_FILTERED;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
                                            ifa_ipv6_policy, extack);
        if (err < 0)
                return err;

        for (attr = 0; attr <= IFA_MAX; attr++) {
                struct net *ns;

                if (!tb[attr])
                        continue;

                if (attr != IFA_TARGET_NETNSID) {
                        NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
                        return -EINVAL;
                }

                fillargs->netnsid = nla_get_s32(tb[attr]);
                ns = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
                if (IS_ERR(ns)) {
                        /* Reset so the caller does not treat tgt_net as held. */
                        fillargs->netnsid = -1;
                        NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id");
                        return PTR_ERR(ns);
                }
                *tgt_net = ns;
        }

        return 0;
}

/*
 * Common netlink dump callback for unicast, multicast and anycast IPv6
 * addresses; @type selects which list in6_dump_addrs() walks.
 *
 * Resume state lives in cb->ctx: the interface index used by the device
 * iterator and the position inside the current device's address list.
 *
 * With strict checking the request is validated first; it may select a
 * target namespace and/or a single interface, in which case only that
 * interface is dumped.  Otherwise all devices of the namespace are walked.
 *
 * Returns 0, or a negative errno from validation, -ENODEV for an unknown
 * filter interface, or the error from in6_dump_addrs().
 */
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
                           enum addr_type_t type)
{
        struct net *tgt_net = sock_net(skb->sk);
        const struct nlmsghdr *nlh = cb->nlh;
        struct inet6_fill_args fillargs = {
                .portid = NETLINK_CB(cb->skb).portid,
                .seq = cb->nlh->nlmsg_seq,
                .flags = NLM_F_MULTI,
                .netnsid = -1,
                .type = type,
                .force_rt_scope_universe = false,
        };
        /* Dump cursor overlaid on the callback's scratch area. */
        struct {
                unsigned long ifindex;
                int ip_idx;
        } *ctx = (void *)cb->ctx;
        struct net_device *dev;
        struct inet6_dev *idev;
        int err = 0;

        rcu_read_lock();
        if (cb->strict_check) {
                err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
                                                  skb->sk, cb);
                if (err < 0)
                        goto done;

                err = 0;
                /* Filtered dump: handle the single requested interface only. */
                if (fillargs.ifindex) {
                        dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
                        if (!dev) {
                                err = -ENODEV;
                                goto done;
                        }
                        idev = __in6_dev_get(dev);
                        if (idev)
                                err = in6_dump_addrs(idev, skb, cb,
                                                     &ctx->ip_idx,
                                                     &fillargs);
                        goto done;
                }
        }

        cb->seq = inet6_base_seq(tgt_net);
        for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
                /* Devices without IPv6 state have nothing to dump. */
                idev = __in6_dev_get(dev);
                if (!idev)
                        continue;
                err = in6_dump_addrs(idev, skb, cb, &ctx->ip_idx,
                                     &fillargs);
                if (err < 0)
                        goto done;
        }
done:
        rcu_read_unlock();
        /* netnsid >= 0 only when validation resolved a target namespace. */
        if (fillargs.netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/* Dump callback for unicast addresses; forwards to inet6_dump_addr(). */
static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet6_dump_addr(skb, cb, UNICAST_ADDR);
}

/* Dump callback for multicast addresses; forwards to inet6_dump_addr(). */
static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet6_dump_addr(skb, cb, MULTICAST_ADDR);
}


/* Dump callback for anycast addresses; forwards to inet6_dump_addr(). */
static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
        return inet6_dump_addr(skb, cb, ANYCAST_ADDR);
}

/*
 * Validate a get-address request and parse its attributes into @tb.
 *
 * A header that is too short is always rejected.  Without strict checking
 * on the socket the attributes are parsed leniently and that result is
 * returned.  With strict checking, prefixlen/flags/scope must be zero and
 * only IFA_TARGET_NETNSID, IFA_ADDRESS and IFA_LOCAL are accepted.
 *
 * Returns 0 on success or a negative errno.
 */
static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct ifaddrmsg *ifm;
        int attr;
        int err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
                                              ifa_ipv6_policy, extack);

        ifm = nlmsg_data(nlh);
        if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
                                            ifa_ipv6_policy, extack);
        if (err)
                return err;

        for (attr = 0; attr <= IFA_MAX; attr++) {
                if (!tb[attr])
                        continue;

                if (attr == IFA_TARGET_NETNSID ||
                    attr == IFA_ADDRESS ||
                    attr == IFA_LOCAL)
                        continue;

                NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get address request");
                return -EINVAL;
        }

        return 0;
}

/*
 * RTM_GETADDR handler: look up a single IPv6 address and unicast an
 * RTM_NEWADDR message describing it back to the requester.
 *
 * The request may name a target namespace (IFA_TARGET_NETNSID) and an
 * interface (ifa_index).
 *
 * Returns the result of rtnl_unicast() on success, or a negative errno:
 * the validation error, the error resolving the target namespace, -EINVAL
 * when no address attribute is present, -EADDRNOTAVAIL when the address is
 * not found, -ENOBUFS when the reply cannot be allocated, or the error from
 * inet6_fill_ifaddr().
 */
static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *tgt_net = sock_net(in_skb->sk);
        struct inet6_fill_args fillargs = {
                .portid = NETLINK_CB(in_skb).portid,
                .seq = nlh->nlmsg_seq,
                .event = RTM_NEWADDR,
                .flags = 0,
                .netnsid = -1,
                .force_rt_scope_universe = false,
        };
        struct ifaddrmsg *ifm;
        struct nlattr *tb[IFA_MAX+1];
        struct in6_addr *addr = NULL, *peer;
        struct net_device *dev = NULL;
        struct inet6_ifaddr *ifa;
        struct sk_buff *skb;
        int err;

        err = inet6_rtm_valid_getaddr_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        if (tb[IFA_TARGET_NETNSID]) {
                fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);

                tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
                                                  fillargs.netnsid);
                /* Nothing is held yet, so a plain return is fine here. */
                if (IS_ERR(tgt_net))
                        return PTR_ERR(tgt_net);
        }

        addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
        if (!addr) {
                err = -EINVAL;
                goto errout;
        }
        ifm = nlmsg_data(nlh);
        /*
         * dev stays NULL when no ifindex was given or the lookup found
         * nothing; it is passed on to ipv6_get_ifaddr() either way.
         */
        if (ifm->ifa_index)
                dev = dev_get_by_index(tgt_net, ifm->ifa_index);

        ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
        if (!ifa) {
                err = -EADDRNOTAVAIL;
                goto errout;
        }

        skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_KERNEL);
        if (!skb) {
                err = -ENOBUFS;
                goto errout_ifa;
        }

        err = inet6_fill_ifaddr(skb, ifa, &fillargs);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout_ifa;
        }
        err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
errout_ifa:
        in6_ifa_put(ifa);
errout:
        /* dev may still be NULL at this point. */
        dev_put(dev);
        /* netnsid >= 0 only when a target namespace was resolved above. */
        if (fillargs.netnsid >= 0)
                put_net(tgt_net);

        return err;
}

/*
 * Send an @event notification for @ifa to the RTNLGRP_IPV6_IFADDR group.
 *
 * The message is allocated with GFP_ATOMIC.  If allocation or filling
 * fails, the error is reported to the group's listeners through
 * rtnl_set_sk_err() instead.
 */
static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
{
        struct net *net = dev_net(ifa->idev->dev);
        struct inet6_fill_args fillargs = {
                .portid = 0,
                .seq = 0,
                .event = event,
                .flags = 0,
                .netnsid = -1,
                .force_rt_scope_universe = false,
        };
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
        if (!skb) {
                err = -ENOBUFS;
                goto report;
        }

        err = inet6_fill_ifaddr(skb, ifa, &fillargs);
        if (err >= 0) {
                rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
                return;
        }

        /* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
        WARN_ON(err == -EMSGSIZE);
        kfree_skb(skb);
report:
        rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
}

/*
 * Export the IPv6 device configuration @cnf into @array, one 32-bit slot
 * per DEVCONF_* index.
 *
 * @bytes is the size of @array in bytes and must cover at least
 * DEVCONF_MAX slots (BUG otherwise).  The whole buffer is zeroed first, so
 * slots that are compiled out or deliberately skipped read as 0.  Fields
 * are read with READ_ONCE(); the solicit, probe and MLD report interval
 * fields are passed through jiffies_to_msecs() before being stored.
 */
static void ipv6_store_devconf(const struct ipv6_devconf *cnf,
                               __s32 *array, int bytes)
{
        BUG_ON(bytes < (DEVCONF_MAX * 4));

        memset(array, 0, bytes);
        array[DEVCONF_FORWARDING] = READ_ONCE(cnf->forwarding);
        array[DEVCONF_HOPLIMIT] = READ_ONCE(cnf->hop_limit);
        array[DEVCONF_MTU6] = READ_ONCE(cnf->mtu6);
        array[DEVCONF_ACCEPT_RA] = READ_ONCE(cnf->accept_ra);
        array[DEVCONF_ACCEPT_REDIRECTS] = READ_ONCE(cnf->accept_redirects);
        array[DEVCONF_AUTOCONF] = READ_ONCE(cnf->autoconf);
        array[DEVCONF_DAD_TRANSMITS] = READ_ONCE(cnf->dad_transmits);
        array[DEVCONF_RTR_SOLICITS] = READ_ONCE(cnf->rtr_solicits);
        array[DEVCONF_RTR_SOLICIT_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_interval));
        array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_max_interval));
        array[DEVCONF_RTR_SOLICIT_DELAY] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_delay));
        array[DEVCONF_FORCE_MLD_VERSION] = READ_ONCE(cnf->force_mld_version);
        array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->mldv1_unsolicited_report_interval));
        array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->mldv2_unsolicited_report_interval));
        array[DEVCONF_USE_TEMPADDR] = READ_ONCE(cnf->use_tempaddr);
        array[DEVCONF_TEMP_VALID_LFT] = READ_ONCE(cnf->temp_valid_lft);
        array[DEVCONF_TEMP_PREFERED_LFT] = READ_ONCE(cnf->temp_prefered_lft);
        array[DEVCONF_REGEN_MAX_RETRY] = READ_ONCE(cnf->regen_max_retry);
        array[DEVCONF_MAX_DESYNC_FACTOR] = READ_ONCE(cnf->max_desync_factor);
        array[DEVCONF_MAX_ADDRESSES] = READ_ONCE(cnf->max_addresses);
        array[DEVCONF_ACCEPT_RA_DEFRTR] = READ_ONCE(cnf->accept_ra_defrtr);
        array[DEVCONF_RA_DEFRTR_METRIC] = READ_ONCE(cnf->ra_defrtr_metric);
        array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] =
                READ_ONCE(cnf->accept_ra_min_hop_limit);
        array[DEVCONF_ACCEPT_RA_PINFO] = READ_ONCE(cnf->accept_ra_pinfo);
        /* Router-preference entries; route-info limits nest inside them. */
#ifdef CONFIG_IPV6_ROUTER_PREF
        array[DEVCONF_ACCEPT_RA_RTR_PREF] = READ_ONCE(cnf->accept_ra_rtr_pref);
        array[DEVCONF_RTR_PROBE_INTERVAL] =
                jiffies_to_msecs(READ_ONCE(cnf->rtr_probe_interval));
#ifdef CONFIG_IPV6_ROUTE_INFO
        array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] =
                READ_ONCE(cnf->accept_ra_rt_info_min_plen);
        array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] =
                READ_ONCE(cnf->accept_ra_rt_info_max_plen);
#endif
#endif
        array[DEVCONF_PROXY_NDP] = READ_ONCE(cnf->proxy_ndp);
        array[DEVCONF_ACCEPT_SOURCE_ROUTE] =
                READ_ONCE(cnf->accept_source_route);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        array[DEVCONF_OPTIMISTIC_DAD] = READ_ONCE(cnf->optimistic_dad);
        array[DEVCONF_USE_OPTIMISTIC] = READ_ONCE(cnf->use_optimistic);
#endif
#ifdef CONFIG_IPV6_MROUTE
        /* mc_forwarding is an atomic counter, hence atomic_read(). */
        array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
#endif
        array[DEVCONF_DISABLE_IPV6] = READ_ONCE(cnf->disable_ipv6);
        array[DEVCONF_ACCEPT_DAD] = READ_ONCE(cnf->accept_dad);
        array[DEVCONF_FORCE_TLLAO] = READ_ONCE(cnf->force_tllao);
        array[DEVCONF_NDISC_NOTIFY] = READ_ONCE(cnf->ndisc_notify);
        array[DEVCONF_SUPPRESS_FRAG_NDISC] =
                READ_ONCE(cnf->suppress_frag_ndisc);
        array[DEVCONF_ACCEPT_RA_FROM_LOCAL] =
                READ_ONCE(cnf->accept_ra_from_local);
        array[DEVCONF_ACCEPT_RA_MTU] = READ_ONCE(cnf->accept_ra_mtu);
        array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] =
                READ_ONCE(cnf->ignore_routes_with_linkdown);
        /* we omit DEVCONF_STABLE_SECRET for now */
        array[DEVCONF_USE_OIF_ADDRS_ONLY] = READ_ONCE(cnf->use_oif_addrs_only);
        array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] =
                READ_ONCE(cnf->drop_unicast_in_l2_multicast);
        array[DEVCONF_DROP_UNSOLICITED_NA] = READ_ONCE(cnf->drop_unsolicited_na);
        array[DEVCONF_KEEP_ADDR_ON_DOWN] = READ_ONCE(cnf->keep_addr_on_down);
        array[DEVCONF_SEG6_ENABLED] = READ_ONCE(cnf->seg6_enabled);
#ifdef CONFIG_IPV6_SEG6_HMAC
        array[DEVCONF_SEG6_REQUIRE_HMAC] = READ_ONCE(cnf->seg6_require_hmac);
#endif
        array[DEVCONF_ENHANCED_DAD] = READ_ONCE(cnf->enhanced_dad);
        array[DEVCONF_ADDR_GEN_MODE] = READ_ONCE(cnf->addr_gen_mode);
        array[DEVCONF_DISABLE_POLICY] = READ_ONCE(cnf->disable_policy);
        array[DEVCONF_NDISC_TCLASS] = READ_ONCE(cnf->ndisc_tclass);
        array[DEVCONF_RPL_SEG_ENABLED] = READ_ONCE(cnf->rpl_seg_enabled);
        array[DEVCONF_IOAM6_ENABLED] = READ_ONCE(cnf->ioam6_enabled);
        array[DEVCONF_IOAM6_ID] = READ_ONCE(cnf->ioam6_id);
        array[DEVCONF_IOAM6_ID_WIDE] = READ_ONCE(cnf->ioam6_id_wide);
        array[DEVCONF_NDISC_EVICT_NOCARRIER] =
                READ_ONCE(cnf->ndisc_evict_nocarrier);
        array[DEVCONF_ACCEPT_UNTRACKED_NA] =
                READ_ONCE(cnf->accept_untracked_na);
        array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft);
}

/*
 * Space needed for the full set of IFLA_INET6_* attributes, including the
 * statistics attributes.
 */
static inline size_t inet6_ifla6_size(void)
{
        size_t len = 0;

        len += nla_total_size(4);                       /* IFLA_INET6_FLAGS */
        len += nla_total_size(sizeof(struct ifla_cacheinfo));
        len += nla_total_size(DEVCONF_MAX * 4);         /* IFLA_INET6_CONF */
        len += nla_total_size(IPSTATS_MIB_MAX * 8);     /* IFLA_INET6_STATS */
        len += nla_total_size(ICMP6_MIB_MAX * 8);       /* IFLA_INET6_ICMP6STATS */
        len += nla_total_size(sizeof(struct in6_addr)); /* IFLA_INET6_TOKEN */
        len += nla_total_size(1);                       /* IFLA_INET6_ADDR_GEN_MODE */
        len += nla_total_size(4);                       /* IFLA_INET6_RA_MTU */

        return len;
}

/*
 * Space needed for an interface info message: the ifinfomsg header, the
 * basic link attributes and an IFLA_PROTINFO nest sized by
 * inet6_ifla6_size().
 */
static inline size_t inet6_if_nlmsg_size(void)
{
        size_t len = NLMSG_ALIGN(sizeof(struct ifinfomsg));

        len += nla_total_size(IFNAMSIZ);           /* IFLA_IFNAME */
        len += nla_total_size(MAX_ADDR_LEN);       /* IFLA_ADDRESS */
        len += nla_total_size(4);                  /* IFLA_MTU */
        len += nla_total_size(4);                  /* IFLA_LINK */
        len += nla_total_size(1);                  /* IFLA_OPERSTATE */
        len += nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */

        return len;
}

/*
 * Copy the ICMPv6 device counters in @mib into @stats, a buffer of @bytes
 * bytes.  Slot 0 receives ICMP6_MIB_MAX, slots 1..ICMP6_MIB_MAX-1 the
 * counter values, and any room beyond that is zeroed.  BUGs if the buffer
 * is smaller than ICMP6_MIB_MAX entries.
 */
static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib,
                                        int bytes)
{
        int pad = bytes - sizeof(u64) * ICMP6_MIB_MAX;
        int idx = 1;

        BUG_ON(pad < 0);

        /* Use put_unaligned() because stats may not be aligned for u64. */
        put_unaligned(ICMP6_MIB_MAX, &stats[0]);
        while (idx < ICMP6_MIB_MAX) {
                put_unaligned(atomic_long_read(&mib[idx]), &stats[idx]);
                idx++;
        }

        memset(&stats[ICMP6_MIB_MAX], 0, pad);
}

/*
 * Sum the per-CPU IP counters in @mib over all possible CPUs and copy the
 * totals into @stats, a buffer of @bytes bytes.  Slot 0 receives
 * IPSTATS_MIB_MAX; room beyond IPSTATS_MIB_MAX entries is zeroed.  BUGs if
 * the buffer is smaller than IPSTATS_MIB_MAX entries.
 */
static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib,
                                        int bytes, size_t syncpoff)
{
        int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX;
        u64 totals[IPSTATS_MIB_MAX];
        int field, cpu;

        BUG_ON(pad < 0);

        /* Accumulate in a local array; the destination is written once below. */
        memset(totals, 0, sizeof(totals));
        totals[0] = IPSTATS_MIB_MAX;

        for_each_possible_cpu(cpu) {
                field = 1;
                while (field < IPSTATS_MIB_MAX) {
                        totals[field] += snmp_get_cpu_field64(mib, cpu, field,
                                                              syncpoff);
                        field++;
                }
        }

        memcpy(stats, totals, IPSTATS_MIB_MAX * sizeof(u64));
        memset(&stats[IPSTATS_MIB_MAX], 0, pad);
}

/*
 * Fill @stats (@bytes bytes) with the statistics selected by @attrtype:
 * the IP counters for IFLA_INET6_STATS or the ICMPv6 device counters for
 * IFLA_INET6_ICMP6STATS.  Any other attribute type leaves @stats untouched.
 */
static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
                             int bytes)
{
        if (attrtype == IFLA_INET6_STATS) {
                __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes,
                                     offsetof(struct ipstats_mib, syncp));
        } else if (attrtype == IFLA_INET6_ICMP6STATS) {
                __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, bytes);
        }
}

/*
 * Reserve and fill the IFLA_INET6_STATS and IFLA_INET6_ICMP6STATS
 * attributes in @skb.  Returns 0, or -EMSGSIZE when either attribute does
 * not fit.
 */
static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb,
                                        struct inet6_dev *idev)
{
        struct nlattr *attr;

        attr = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
        if (!attr)
                return -EMSGSIZE;
        snmp6_fill_stats(nla_data(attr), idev, IFLA_INET6_STATS, nla_len(attr));

        attr = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
        if (!attr)
                return -EMSGSIZE;
        snmp6_fill_stats(nla_data(attr), idev, IFLA_INET6_ICMP6STATS, nla_len(attr));

        return 0;
}

/*
 * Emit the IFLA_INET6_* attributes describing @idev into @skb.
 *
 * Attributes are written in this order: FLAGS, CACHEINFO, CONF, the
 * statistics attributes (skipped when @ext_filter_mask has
 * RTEXT_FILTER_SKIP_STATS set), TOKEN, ADDR_GEN_MODE and, only when it is
 * non-zero, RA_MTU.
 *
 * Returns 0 on success, or -EMSGSIZE as soon as one attribute does not fit.
 */
static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
                                  u32 ext_filter_mask)
{
        struct ifla_cacheinfo ci;
        struct nlattr *nla;
        u32 ra_mtu;

        if (nla_put_u32(skb, IFLA_INET6_FLAGS, READ_ONCE(idev->if_flags)))
                goto nla_put_failure;
        /* Timer values are converted from jiffies to milliseconds. */
        ci.max_reasm_len = IPV6_MAXPLEN;
        ci.tstamp = cstamp_delta(READ_ONCE(idev->tstamp));
        ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
        ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME));
        if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci))
                goto nla_put_failure;
        /* Reserve room first, then let ipv6_store_devconf() fill it in place. */
        nla = nla_reserve(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(s32));
        if (!nla)
                goto nla_put_failure;
        ipv6_store_devconf(&idev->cnf, nla_data(nla), nla_len(nla));

        /* XXX - MC not implemented */

        if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) {
                if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0)
                        goto nla_put_failure;
        }

        nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
        if (!nla)
                goto nla_put_failure;
        /* The token is copied under idev->lock (read side). */
        read_lock_bh(&idev->lock);
        memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla));
        read_unlock_bh(&idev->lock);

        if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE,
                       READ_ONCE(idev->cnf.addr_gen_mode)))
                goto nla_put_failure;

        /* RA_MTU is only reported when a non-zero value is recorded. */
        ra_mtu = READ_ONCE(idev->ra_mtu);
        if (ra_mtu && nla_put_u32(skb, IFLA_INET6_RA_MTU, ra_mtu))
                goto nla_put_failure;

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}

/*
 * Size of the AF_INET6 link attribute payload for @dev: zero when the
 * device has no inet6_dev, otherwise whatever inet6_ifla6_size() reports.
 * @ext_filter_mask is not consulted.
 */
static size_t inet6_get_link_af_size(const struct net_device *dev,
                                     u32 ext_filter_mask)
{
        return __in6_dev_get(dev) ? inet6_ifla6_size() : 0;
}

/*
 * Fill the AF_INET6 link attributes of @dev into @skb.
 *
 * Returns 0 on success, -ENODATA when @dev has no inet6_dev and -EMSGSIZE
 * when the attributes do not fit.
 */
static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
                              u32 ext_filter_mask)
{
        struct inet6_dev *idev;

        idev = __in6_dev_get(dev);
        if (!idev)
                return -ENODATA;

        return inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0 ?
               -EMSGSIZE : 0;
}

/*
 * Install a new interface token on @idev.
 *
 * Must be called with RTNL held (see ASSERT_RTNL() below).  The request is
 * rejected with -EINVAL when @token is NULL, when the device is loopback or
 * NOARP, when ipv6_accept_ra() reports that RAs are not accepted, or when
 * rtr_solicits is 0; except for the NULL case an explanation is stored in
 * @extack.
 *
 * Only the low-order 8 bytes of @token are copied into idev->token.  Unless
 * the new token is all-zero, a router solicitation is sent when the device
 * is alive, IF_READY is set and a link-local address can be obtained.
 * Finally every tokenized address on the device gets its lifetimes zeroed,
 * an RTM_NEWLINK notification is sent and addrconf_verify_rtnl() is run.
 *
 * Returns 0 on success.
 */
static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token,
                             struct netlink_ext_ack *extack)
{
        struct inet6_ifaddr *ifp;
        struct net_device *dev = idev->dev;
        bool clear_token, update_rs = false;
        struct in6_addr ll_addr;

        ASSERT_RTNL();

        if (!token)
                return -EINVAL;

        if (dev->flags & IFF_LOOPBACK) {
                NL_SET_ERR_MSG_MOD(extack, "Device is loopback");
                return -EINVAL;
        }

        if (dev->flags & IFF_NOARP) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Device does not do neighbour discovery");
                return -EINVAL;
        }

        if (!ipv6_accept_ra(idev)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Router advertisement is disabled on device");
                return -EINVAL;
        }

        if (READ_ONCE(idev->cnf.rtr_solicits) == 0) {
                NL_SET_ERR_MSG(extack,
                               "Router solicitation is disabled on device");
                return -EINVAL;
        }

        write_lock_bh(&idev->lock);

        /* Only the last 8 of the 16 address bytes are taken from @token. */
        BUILD_BUG_ON(sizeof(token->s6_addr) != 16);
        memcpy(idev->token.s6_addr + 8, token->s6_addr + 8, 8);

        write_unlock_bh(&idev->lock);

        /* An all-zero token clears it; no router solicitation is sent then. */
        clear_token = ipv6_addr_any(token);
        if (clear_token)
                goto update_lft;

        if (!idev->dead && (idev->if_flags & IF_READY) &&
            !ipv6_get_lladdr(dev, &ll_addr, IFA_F_TENTATIVE |
                             IFA_F_OPTIMISTIC)) {
                /* If we're not ready, then normal ifup will take care
                 * of this. Otherwise, we need to request our rs here.
                 */
                ndisc_send_rs(dev, &ll_addr, &in6addr_linklocal_allrouters);
                update_rs = true;
        }

update_lft:
        write_lock_bh(&idev->lock);

        /* An RS was sent above: record that and (re)arm the RS timer. */
        if (update_rs) {
                idev->if_flags |= IF_RS_SENT;
                idev->rs_interval = rfc3315_s14_backoff_init(
                        READ_ONCE(idev->cnf.rtr_solicit_interval));
                idev->rs_probes = 1;
                addrconf_mod_rs_timer(idev, idev->rs_interval);
        }

        /* Well, that's kinda nasty ... */
        /* Zero the lifetimes of every address that was built from a token. */
        list_for_each_entry(ifp, &idev->addr_list, if_list) {
                spin_lock(&ifp->lock);
                if (ifp->tokenized) {
                        ifp->valid_lft = 0;
                        ifp->prefered_lft = 0;
                }
                spin_unlock(&ifp->lock);
        }

        write_unlock_bh(&idev->lock);
        inet6_ifinfo_notify(RTM_NEWLINK, idev);
        /* NOTE(review): presumably this expires the addresses zeroed above - confirm. */
        addrconf_verify_rtnl(dev_net(dev));
        return 0;
}

/*
 * Netlink policy for the nested IFLA_INET6_* attributes, used by
 * inet6_validate_link_af().  ADDR_GEN_MODE is a u8, TOKEN has a length of
 * sizeof(struct in6_addr), and RA_MTU is rejected with an explanatory
 * message.
 */
static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
        [IFLA_INET6_ADDR_GEN_MODE]        = { .type = NLA_U8 },
        [IFLA_INET6_TOKEN]                = { .len = sizeof(struct in6_addr) },
        [IFLA_INET6_RA_MTU]                = { .type = NLA_REJECT,
                                            .reject_message =
                                                "IFLA_INET6_RA_MTU can not be set" },
};

/*
 * Validate an address generation mode.
 *
 * Returns 1 when @mode is one of the known IN6_ADDR_GEN_MODE_* values,
 * -EINVAL otherwise.
 */
static int check_addr_gen_mode(int mode)
{
        switch (mode) {
        case IN6_ADDR_GEN_MODE_EUI64:
        case IN6_ADDR_GEN_MODE_NONE:
        case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
        case IN6_ADDR_GEN_MODE_RANDOM:
                return 1;
        default:
                return -EINVAL;
        }
}

/*
 * Stable-privacy mode needs a stable secret: either the device's own or
 * the namespace default must be initialized.
 *
 * Returns 1 when @mode is acceptable for @idev, -EINVAL when stable-privacy
 * is requested and neither secret is initialized.
 */
static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
                                int mode)
{
        if (mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY)
                return 1;

        if (idev->cnf.stable_secret.initialized)
                return 1;

        if (net->ipv6.devconf_dflt->stable_secret.initialized)
                return 1;

        return -EINVAL;
}

/*
 * Validate a nested AF_INET6 link attribute before it is applied.
 *
 * @dev may be NULL; when it is given it must have an inet6_dev, otherwise
 * -EAFNOSUPPORT is returned.  The attribute must parse against
 * inet6_af_policy and carry at least a TOKEN or an ADDR_GEN_MODE.  A
 * supplied ADDR_GEN_MODE must be a known mode and, when @dev is given,
 * must pass the stable-privacy check.
 *
 * Returns 0 when the attribute is acceptable, a negative errno otherwise.
 */
static int inet6_validate_link_af(const struct net_device *dev,
                                  const struct nlattr *nla,
                                  struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[IFLA_INET6_MAX + 1];
        const struct nlattr *mode_attr;
        struct inet6_dev *idev = NULL;
        int rc;
        u8 mode;

        if (dev) {
                idev = __in6_dev_get(dev);
                if (!idev)
                        return -EAFNOSUPPORT;
        }

        rc = nla_parse_nested_deprecated(attrs, IFLA_INET6_MAX, nla,
                                         inet6_af_policy, extack);
        if (rc)
                return rc;

        mode_attr = attrs[IFLA_INET6_ADDR_GEN_MODE];
        if (!mode_attr) {
                /* Without a mode, a token is the only thing left to set. */
                return attrs[IFLA_INET6_TOKEN] ? 0 : -EINVAL;
        }

        mode = nla_get_u8(mode_attr);
        if (check_addr_gen_mode(mode) < 0)
                return -EINVAL;
        if (dev && check_stable_privacy(idev, dev_net(dev), mode) < 0)
                return -EINVAL;

        return 0;
}

/*
 * Apply a nested AF_INET6 link attribute to @dev.
 *
 * A TOKEN attribute is handed to inet6_set_iftoken(); an ADDR_GEN_MODE
 * attribute is stored into the device configuration.  The attribute is
 * parsed here without a policy.
 *
 * Returns 0 on success, -EAFNOSUPPORT when @dev has no inet6_dev, -EINVAL
 * when parsing fails, or the error reported by inet6_set_iftoken().
 */
static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla,
                             struct netlink_ext_ack *extack)
{
        struct nlattr *attrs[IFLA_INET6_MAX + 1];
        struct inet6_dev *idev;
        int rc;

        idev = __in6_dev_get(dev);
        if (!idev)
                return -EAFNOSUPPORT;

        rc = nla_parse_nested_deprecated(attrs, IFLA_INET6_MAX, nla, NULL,
                                         NULL);
        if (rc < 0)
                return -EINVAL;

        if (attrs[IFLA_INET6_TOKEN]) {
                rc = inet6_set_iftoken(idev,
                                       nla_data(attrs[IFLA_INET6_TOKEN]),
                                       extack);
                if (rc)
                        return rc;
        }

        if (attrs[IFLA_INET6_ADDR_GEN_MODE]) {
                u8 gen_mode = nla_get_u8(attrs[IFLA_INET6_ADDR_GEN_MODE]);

                WRITE_ONCE(idev->cnf.addr_gen_mode, gen_mode);
        }

        return 0;
}

/*
 * Build one AF_INET6 link message for @idev in @skb.
 *
 * The message consists of an ifinfomsg header followed by IFLA_IFNAME,
 * IFLA_ADDRESS (only when the device has a hardware address length),
 * IFLA_MTU, IFLA_LINK (only when it differs from the ifindex),
 * IFLA_OPERSTATE and a nested IFLA_PROTINFO holding the IFLA_INET6_*
 * attributes.
 *
 * Returns 0 on success.  On -EMSGSIZE the partially built message is
 * cancelled, except when the header itself could not be added.
 */
static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
                             u32 portid, u32 seq, int event, unsigned int flags)
{
        struct net_device *dev = idev->dev;
        struct ifinfomsg *hdr;
        struct nlmsghdr *nlh;
        int ifindex, iflink;
        void *protoinfo;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
        if (!nlh)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->ifi_family = AF_INET6;
        hdr->__ifi_pad = 0;
        hdr->ifi_type = dev->type;
        ifindex = READ_ONCE(dev->ifindex);
        hdr->ifi_index = ifindex;
        hdr->ifi_flags = dev_get_flags(dev);
        hdr->ifi_change = 0;

        iflink = dev_get_iflink(dev);
        /* A device that is not running is always reported as IF_OPER_DOWN. */
        if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
            (dev->addr_len &&
             nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
            nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
            (ifindex != iflink &&
             nla_put_u32(skb, IFLA_LINK, iflink)) ||
            nla_put_u8(skb, IFLA_OPERSTATE,
                       netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN))
                goto nla_put_failure;
        protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO);
        if (!protoinfo)
                goto nla_put_failure;

        /* Filter mask 0: statistics are always included here. */
        if (inet6_fill_ifla6_attrs(skb, idev, 0) < 0)
                goto nla_put_failure;

        nla_nest_end(skb, protoinfo);
        nlmsg_end(skb, nlh);
        return 0;

nla_put_failure:
        nlmsg_cancel(skb, nlh);
        return -EMSGSIZE;
}

static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
                                   struct netlink_ext_ack *extack)
{
        struct ifinfomsg *ifm;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
                return -EINVAL;
        }

        if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid data after header");
                return -EINVAL;
        }

        ifm = nlmsg_data(nlh);
        if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
            ifm->ifi_change || ifm->ifi_index) {
                NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
                return -EINVAL;
        }

        return 0;
}

/*
 * Netlink dump callback: emit one RTM_NEWLINK message per device in the
 * socket's namespace that has an inet6_dev.  The position is kept in
 * cb->ctx as the ifindex to continue from, and the device walk runs under
 * rcu_read_lock().
 *
 * Returns 0 when the walk finished, or the negative error that stopped it.
 */
static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct {
                unsigned long ifindex;
        } *ctx = (void *)cb->ctx;
        struct net *net = sock_net(skb->sk);
        struct net_device *dev;
        int err = 0;

        /* only requests using strict checking can pass data to
         * influence the dump
         */
        if (cb->strict_check) {
                int invalid = inet6_valid_dump_ifinfo(cb->nlh, cb->extack);

                if (invalid < 0)
                        return invalid;
        }

        rcu_read_lock();
        for_each_netdev_dump(net, dev, ctx->ifindex) {
                struct inet6_dev *idev = __in6_dev_get(dev);

                if (idev) {
                        err = inet6_fill_ifinfo(skb, idev,
                                                NETLINK_CB(cb->skb).portid,
                                                cb->nlh->nlmsg_seq,
                                                RTM_NEWLINK, NLM_F_MULTI);
                        if (err < 0)
                                break;
                }
        }
        rcu_read_unlock();

        return err;
}

/*
 * Send an @event link notification for @idev to the RTNLGRP_IPV6_IFINFO
 * multicast group.  If the message cannot be allocated or built, the error
 * is recorded on the group's listeners via rtnl_set_sk_err() instead.
 */
void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
{
        struct net *net = dev_net(idev->dev);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_if_nlmsg_size(), GFP_ATOMIC);
        if (!skb) {
                err = -ENOBUFS;
        } else {
                err = inet6_fill_ifinfo(skb, idev, 0, 0, event, 0);
                if (err >= 0) {
                        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL,
                                    GFP_ATOMIC);
                        return;
                }

                /* -EMSGSIZE implies BUG in inet6_if_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
        }

        rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
}

/*
 * Payload size needed for a prefix notification: the aligned prefixmsg
 * header plus one in6_addr attribute and one prefix_cacheinfo attribute.
 */
static inline size_t inet6_prefix_nlmsg_size(void)
{
        size_t total = NLMSG_ALIGN(sizeof(struct prefixmsg));

        total += nla_total_size(sizeof(struct in6_addr));
        total += nla_total_size(sizeof(struct prefix_cacheinfo));

        return total;
}

/*
 * Build a prefix message for @pinfo as seen on @idev: a prefixmsg header
 * followed by PREFIX_ADDRESS and PREFIX_CACHEINFO.  The lifetimes in
 * @pinfo are converted with ntohl() before being reported.
 *
 * Returns 0 on success, -EMSGSIZE when the message does not fit (a
 * partially built message is cancelled).
 */
static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
                             struct prefix_info *pinfo, u32 portid, u32 seq,
                             int event, unsigned int flags)
{
        struct prefix_cacheinfo lifetimes;
        struct prefixmsg *hdr;
        struct nlmsghdr *nlh;

        nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
        if (!nlh)
                return -EMSGSIZE;

        hdr = nlmsg_data(nlh);
        hdr->prefix_family = AF_INET6;
        hdr->prefix_ifindex = idev->dev->ifindex;
        hdr->prefix_len = pinfo->prefix_len;
        hdr->prefix_type = pinfo->type;
        hdr->prefix_flags = pinfo->flags;
        /* Padding is cleared explicitly. */
        hdr->prefix_pad1 = 0;
        hdr->prefix_pad2 = 0;
        hdr->prefix_pad3 = 0;

        lifetimes.preferred_time = ntohl(pinfo->prefered);
        lifetimes.valid_time = ntohl(pinfo->valid);

        if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix),
                    &pinfo->prefix) ||
            nla_put(skb, PREFIX_CACHEINFO, sizeof(lifetimes), &lifetimes)) {
                nlmsg_cancel(skb, nlh);
                return -EMSGSIZE;
        }

        nlmsg_end(skb, nlh);
        return 0;
}

/*
 * Send an @event prefix notification for @pinfo on @idev to the
 * RTNLGRP_IPV6_PREFIX multicast group.  If the message cannot be allocated
 * or built, the error is recorded via rtnl_set_sk_err() instead.
 */
static void inet6_prefix_notify(int event, struct inet6_dev *idev,
                         struct prefix_info *pinfo)
{
        struct net *net = dev_net(idev->dev);
        struct sk_buff *skb;
        int err;

        skb = nlmsg_new(inet6_prefix_nlmsg_size(), GFP_ATOMIC);
        if (!skb) {
                err = -ENOBUFS;
        } else {
                err = inet6_fill_prefix(skb, idev, pinfo, 0, 0, event, 0);
                if (err >= 0) {
                        rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL,
                                    GFP_ATOMIC);
                        return;
                }

                /* -EMSGSIZE implies BUG in inet6_prefix_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
        }

        rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
}

/*
 * Announce an address event for @ifp and perform the matching side effects.
 *
 * A netlink notification is always sent; an @event of 0 is reported as
 * RTM_NEWADDR but triggers none of the per-event work below and does not
 * require RTNL.  For RTM_NEWADDR the host route is inserted if it is not in
 * the FIB yet, anycast is joined when forwarding is enabled, and a /128
 * route to the peer address is added if one is set.  For RTM_DELADDR the
 * reverse is done and the IPv6 route generation id is bumped.  In every
 * case the namespace's dev_addr_genid is incremented.
 */
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
        struct net *net = dev_net(ifp->idev->dev);

        if (event)
                ASSERT_RTNL();

        inet6_ifa_notify(event ? : RTM_NEWADDR, ifp);

        switch (event) {
        case RTM_NEWADDR:
                /*
                 * If the address was optimistic we inserted the route at the
                 * start of our DAD process, so we don't need to do it again.
                 * If the device was taken down in the middle of the DAD
                 * cycle there is a race where we could get here without a
                 * host route, so nothing to insert. That will be fixed when
                 * the device is brought up.
                 */
                if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) {
                        ip6_ins_rt(net, ifp->rt);
                } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) {
                        pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n",
                                &ifp->addr, ifp->idev->dev->name);
                }

                if (ifp->idev->cnf.forwarding)
                        addrconf_join_anycast(ifp);
                /* A configured peer address gets its own /128 prefix route. */
                if (!ipv6_addr_any(&ifp->peer_addr))
                        addrconf_prefix_route(&ifp->peer_addr, 128,
                                              ifp->rt_priority, ifp->idev->dev,
                                              0, 0, GFP_ATOMIC);
                break;
        case RTM_DELADDR:
                if (ifp->idev->cnf.forwarding)
                        addrconf_leave_anycast(ifp);
                addrconf_leave_solict(ifp->idev, &ifp->addr);
                /* Remove the /128 peer route, if one can be found. */
                if (!ipv6_addr_any(&ifp->peer_addr)) {
                        struct fib6_info *rt;

                        rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
                                                       ifp->idev->dev, 0, 0,
                                                       false);
                        if (rt)
                                ip6_del_rt(net, rt, false);
                }
                /* Drop the host route and forget the pointer to it. */
                if (ifp->rt) {
                        ip6_del_rt(net, ifp->rt, false);
                        ifp->rt = NULL;
                }
                rt_genid_bump_ipv6(net);
                break;
        }
        atomic_inc(&net->ipv6.dev_addr_genid);
}

/*
 * Wrapper around __ipv6_ifa_notify() that does nothing once the owning
 * inet6_dev is marked dead.
 */
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
        if (unlikely(ifp->idev->dead))
                return;

        __ipv6_ifa_notify(event, ifp);
}

#ifdef CONFIG_SYSCTL

/*
 * sysctl handler for the "forwarding" knob.
 *
 * The value is parsed into a local copy so that ctl->data is only changed
 * by addrconf_fixup_forwarding().  The fixup runs only when the write was
 * parsed successfully, so a malformed write is reported to the caller
 * instead of being silently turned into a no-op update.
 *
 * Returns the result of proc_dointvec() for reads and failed writes, or the
 * result of addrconf_fixup_forwarding() for successfully parsed writes.
 * *@ppos is restored whenever a non-zero value is returned.
 */
static int addrconf_sysctl_forward(const struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *valp = ctl->data;
        int val = *valp;
        loff_t pos = *ppos;
        struct ctl_table lctl;
        int ret;

        /*
         * ctl->data points to idev->cnf.forwarding, we should
         * not modify it until we get the rtnl lock.
         */
        lctl = *ctl;
        lctl.data = &val;

        ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);

        /* Do not overwrite a parse error with the fixup's return value. */
        if (write && !ret)
                ret = addrconf_fixup_forwarding(ctl, valp, val);
        if (ret)
                *ppos = pos;
        return ret;
}

/*
 * sysctl handler for the IPv6 "mtu" knob.  The value is bounded below by
 * IPV6_MIN_MTU and, when the entry belongs to a device (ctl->extra1), above
 * by that device's current MTU; otherwise there is no upper bound.
 */
static int addrconf_sysctl_mtu(const struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table bounded = *ctl;
        struct inet6_dev *idev = ctl->extra1;
        int lower = IPV6_MIN_MTU;

        bounded.extra1 = &lower;
        if (idev)
                bounded.extra2 = &idev->dev->mtu;
        else
                bounded.extra2 = NULL;

        return proc_dointvec_minmax(&bounded, write, buffer, lenp, ppos);
}

/*
 * React to a change of disable_ipv6 on @idev by replaying a netdev event
 * through addrconf_notify(): NETDEV_DOWN when IPv6 is now disabled,
 * NETDEV_UP when it is enabled.  Does nothing without a device.
 */
static void dev_disable_change(struct inet6_dev *idev)
{
        struct netdev_notifier_info info;
        unsigned long event;

        if (!idev || !idev->dev)
                return;

        netdev_notifier_info_init(&info, idev->dev);
        event = idev->cnf.disable_ipv6 ? NETDEV_DOWN : NETDEV_UP;
        addrconf_notify(NULL, event, &info);
}

/*
 * Store @newf as disable_ipv6 on every device in @net that has an
 * inet6_dev, and run dev_disable_change() for each device whose
 * enabled/disabled state actually flipped.
 */
static void addrconf_disable_change(struct net *net, __s32 newf)
{
        struct net_device *dev;

        for_each_netdev(net, dev) {
                struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);
                bool flipped;

                if (!idev)
                        continue;

                /* Compare truthiness only; 1 -> 2 is not a state change. */
                flipped = !idev->cnf.disable_ipv6 != !newf;

                WRITE_ONCE(idev->cnf.disable_ipv6, newf);
                if (flipped)
                        dev_disable_change(idev);
        }
}

/*
 * Commit a new disable_ipv6 value @newf to *@p and propagate it.
 *
 * The "default" entry is simply stored.  Any other entry is updated under
 * the per-namespace RTNL lock (the syscall is restarted if the lock cannot
 * be taken): the "all" entry also updates the default and every device,
 * while a per-device entry triggers dev_disable_change() only when its
 * enabled/disabled state flips.
 *
 * Returns 0, or the value of restart_syscall().
 */
static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf)
{
        struct net *net = (struct net *)table->extra2;
        int prev;

        if (p == &net->ipv6.devconf_dflt->disable_ipv6) {
                WRITE_ONCE(*p, newf);
                return 0;
        }

        if (!rtnl_net_trylock(net))
                return restart_syscall();

        prev = *p;
        WRITE_ONCE(*p, newf);

        if (p == &net->ipv6.devconf_all->disable_ipv6) {
                WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf);
                addrconf_disable_change(net, newf);
        } else if (!newf != !prev) {
                dev_disable_change((struct inet6_dev *)table->extra1);
        }

        rtnl_net_unlock(net);
        return 0;
}

/*
 * sysctl handler for the "disable_ipv6" knob.
 *
 * The value is parsed into a local copy so that ctl->data is only changed
 * by addrconf_disable_ipv6().  That commit step runs only when the write
 * was parsed successfully, so a malformed write is reported to the caller
 * instead of being silently turned into a no-op update.
 *
 * Returns the result of proc_dointvec() for reads and failed writes, or the
 * result of addrconf_disable_ipv6() for successfully parsed writes.
 * *@ppos is restored whenever a non-zero value is returned.
 */
static int addrconf_sysctl_disable(const struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        int *valp = ctl->data;
        int val = *valp;
        loff_t pos = *ppos;
        struct ctl_table lctl;
        int ret;

        /*
         * ctl->data points to idev->cnf.disable_ipv6, we should
         * not modify it until we get the rtnl lock.
         */
        lctl = *ctl;
        lctl.data = &val;

        ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);

        /* Do not overwrite a parse error with the commit's return value. */
        if (write && !ret)
                ret = addrconf_disable_ipv6(ctl, valp, val);
        if (ret)
                *ppos = pos;
        return ret;
}

/*
 * sysctl handler for the "proxy_ndp" knob.
 *
 * The value is parsed into a local copy and only committed to ctl->data
 * once the per-namespace RTNL lock is held.  Committing before taking the
 * lock would lose the netconf notification whenever the trylock fails: the
 * restarted syscall would find the value already changed and skip it.
 *
 * A changed value is announced with RTM_NEWNETCONF / NETCONFA_PROXY_NEIGH
 * for the "default" entry, the "all" entry, or the owning device
 * (ctl->extra1), depending on which variable ctl->data points at.
 *
 * Returns the result of proc_dointvec(), or the value of restart_syscall()
 * when the lock could not be taken (*@ppos is restored in that case).
 */
static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net = ctl->extra2;
        int *valp = ctl->data;
        int old = *valp;
        int val = old;
        loff_t pos = *ppos;
        struct ctl_table lctl;
        int ret;

        /* Parse into a local copy; ctl->data is updated under the lock. */
        lctl = *ctl;
        lctl.data = &val;

        ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);
        if (ret || !write || val == old)
                return ret;

        if (!rtnl_net_trylock(net)) {
                /* Nothing was committed, so the restart sees the old value. */
                *ppos = pos;
                return restart_syscall();
        }

        WRITE_ONCE(*valp, val);

        if (valp == &net->ipv6.devconf_dflt->proxy_ndp) {
                inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                             NETCONFA_PROXY_NEIGH,
                                             NETCONFA_IFINDEX_DEFAULT,
                                             net->ipv6.devconf_dflt);
        } else if (valp == &net->ipv6.devconf_all->proxy_ndp) {
                inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                             NETCONFA_PROXY_NEIGH,
                                             NETCONFA_IFINDEX_ALL,
                                             net->ipv6.devconf_all);
        } else {
                struct inet6_dev *idev = ctl->extra1;

                inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
                                             NETCONFA_PROXY_NEIGH,
                                             idev->dev->ifindex,
                                             &idev->cnf);
        }
        rtnl_net_unlock(net);

        return ret;
}

/*
 * sysctl handler for the "addr_gen_mode" knob.
 *
 * Runs entirely under the per-namespace RTNL lock (the syscall is restarted
 * if the lock cannot be taken).  The value is parsed into a local u32 and
 * must be a known mode.  For a per-device entry (ctl->extra1 set) the
 * stable-privacy check must also pass; a changed mode is stored and
 * addrconf_init_auto_addrs() is run for the device.  For the "all" entry
 * the default is updated and every device whose mode differs is handled
 * the same way.  Finally the value is stored in ctl->data.
 *
 * Returns 0 on success, -EINVAL for a rejected mode, or the error from
 * proc_douintvec().
 */
static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write,
                                         void *buffer, size_t *lenp,
                                         loff_t *ppos)
{
        int ret = 0;
        u32 new_val;
        struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1;
        struct net *net = (struct net *)ctl->extra2;
        /* Temporary table so proc_douintvec() works on the local copy. */
        struct ctl_table tmp = {
                .data = &new_val,
                .maxlen = sizeof(new_val),
                .mode = ctl->mode,
        };

        if (!rtnl_net_trylock(net))
                return restart_syscall();

        /* Seed with the current value so that reads report it. */
        new_val = *((u32 *)ctl->data);

        ret = proc_douintvec(&tmp, write, buffer, lenp, ppos);
        if (ret != 0)
                goto out;

        if (write) {
                if (check_addr_gen_mode(new_val) < 0) {
                        ret = -EINVAL;
                        goto out;
                }

                if (idev) {
                        if (check_stable_privacy(idev, net, new_val) < 0) {
                                ret = -EINVAL;
                                goto out;
                        }

                        if (idev->cnf.addr_gen_mode != new_val) {
                                WRITE_ONCE(idev->cnf.addr_gen_mode, new_val);
                                netdev_lock_ops(idev->dev);
                                addrconf_init_auto_addrs(idev->dev);
                                netdev_unlock_ops(idev->dev);
                        }
                } else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
                        struct net_device *dev;

                        /* "all": update the default and every device that differs. */
                        WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val);
                        for_each_netdev(net, dev) {
                                idev = __in6_dev_get_rtnl_net(dev);
                                if (idev &&
                                    idev->cnf.addr_gen_mode != new_val) {
                                        WRITE_ONCE(idev->cnf.addr_gen_mode,
                                                  new_val);
                                        netdev_lock_ops(idev->dev);
                                        addrconf_init_auto_addrs(idev->dev);
                                        netdev_unlock_ops(idev->dev);
                                }
                        }
                }

                WRITE_ONCE(*((u32 *)ctl->data), new_val);
        }

out:
        rtnl_net_unlock(net);

        return ret;
}

/*
 * sysctl handler for the "stable_secret" knob.
 *
 * The secret is read and written as text in IPv6 address notation.  The
 * "all" entry is not supported, and reading a secret that was never
 * initialized fails; both return -EIO, as does input that does not parse
 * as an IPv6 address.  A successful write stores the secret, marks it
 * initialized and switches addr_gen_mode to stable-privacy: on every device
 * of the namespace for the "default" entry, otherwise on the owning device
 * (ctl->extra1).
 *
 * Everything after the "all" check runs under the per-namespace RTNL lock;
 * the syscall is restarted if the lock cannot be taken.
 */
static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write,
                                         void *buffer, size_t *lenp,
                                         loff_t *ppos)
{
        int err;
        struct in6_addr addr;
        char str[IPV6_MAX_STRLEN];
        struct ctl_table lctl = *ctl;
        struct net *net = ctl->extra2;
        struct ipv6_stable_secret *secret = ctl->data;

        if (&net->ipv6.devconf_all->stable_secret == ctl->data)
                return -EIO;

        /* proc_dostring() operates on the local text buffer, not the secret. */
        lctl.maxlen = IPV6_MAX_STRLEN;
        lctl.data = str;

        if (!rtnl_net_trylock(net))
                return restart_syscall();

        if (!write && !secret->initialized) {
                err = -EIO;
                goto out;
        }

        /* Render the current secret so that a read returns it. */
        err = snprintf(str, sizeof(str), "%pI6", &secret->secret);
        if (err >= sizeof(str)) {
                err = -EIO;
                goto out;
        }

        err = proc_dostring(&lctl, write, buffer, lenp, ppos);
        if (err || !write)
                goto out;

        /* Write path: str now holds the user's text; parse it as an address. */
        if (in6_pton(str, -1, addr.in6_u.u6_addr8, -1, NULL) != 1) {
                err = -EIO;
                goto out;
        }

        secret->initialized = true;
        secret->secret = addr;

        if (&net->ipv6.devconf_dflt->stable_secret == ctl->data) {
                struct net_device *dev;

                for_each_netdev(net, dev) {
                        struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);

                        if (idev) {
                                WRITE_ONCE(idev->cnf.addr_gen_mode,
                                           IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
                        }
                }
        } else {
                struct inet6_dev *idev = ctl->extra1;

                WRITE_ONCE(idev->cnf.addr_gen_mode,
                           IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
        }

out:
        rtnl_net_unlock(net);

        return err;
}

/*
 * sysctl handler for the "ignore_routes_with_linkdown" knob.
 *
 * The value is parsed into a local copy so that ctl->data is only changed
 * by addrconf_fixup_linkdown().  The fixup runs only when the write was
 * parsed successfully, so a malformed write is reported to the caller
 * instead of being silently turned into a no-op update.
 *
 * Returns the result of proc_dointvec() for reads and failed writes, or the
 * result of addrconf_fixup_linkdown() for successfully parsed writes.
 * *@ppos is restored whenever a non-zero value is returned.
 */
static
int addrconf_sysctl_ignore_routes_with_linkdown(const struct ctl_table *ctl,
                                                int write, void *buffer,
                                                size_t *lenp,
                                                loff_t *ppos)
{
        int *valp = ctl->data;
        int val = *valp;
        loff_t pos = *ppos;
        struct ctl_table lctl;
        int ret;

        /* ctl->data points to idev->cnf.ignore_routes_when_linkdown
         * we should not modify it until we get the rtnl lock.
         */
        lctl = *ctl;
        lctl.data = &val;

        ret = proc_dointvec(&lctl, write, buffer, lenp, ppos);

        /* Do not overwrite a parse error with the fixup's return value. */
        if (write && !ret)
                ret = addrconf_fixup_linkdown(ctl, valp, val);
        if (ret)
                *ppos = pos;
        return ret;
}

/*
 * Set (non-zero @action) or clear (zero @action) DST_NOPOLICY on @rt.
 * A NULL @rt is ignored.
 */
static
void addrconf_set_nopolicy(struct rt6_info *rt, int action)
{
        if (!rt)
                return;

        if (action)
                rt->dst.flags |= DST_NOPOLICY;
        else
                rt->dst.flags &= ~DST_NOPOLICY;
}

/*
 * Apply the disable_policy setting @val to every address of @idev that has
 * a host route: the route's dst_nopolicy flag is updated and DST_NOPOLICY
 * is set or cleared on each per-CPU route entry of its nexthop, when the
 * nexthop has a per-CPU table.
 *
 * The address list is walked under idev->lock (read side, BH disabled),
 * each address under its own spinlock, and the per-CPU entries under
 * rcu_read_lock().
 */
static
void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
{
        struct inet6_ifaddr *ifa;

        read_lock_bh(&idev->lock);
        list_for_each_entry(ifa, &idev->addr_list, if_list) {
                spin_lock(&ifa->lock);
                if (ifa->rt) {
                        /* host routes only use builtin fib6_nh */
                        struct fib6_nh *nh = ifa->rt->fib6_nh;
                        int cpu;

                        rcu_read_lock();
                        ifa->rt->dst_nopolicy = val ? true : false;
                        if (nh->rt6i_pcpu) {
                                /* Per-CPU slots may be NULL; the helper tolerates that. */
                                for_each_possible_cpu(cpu) {
                                        struct rt6_info **rtp;

                                        rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
                                        addrconf_set_nopolicy(*rtp, val);
                                }
                        }
                        rcu_read_unlock();
                }
                spin_unlock(&ifa->lock);
        }
        read_unlock_bh(&idev->lock);
}

/*
 * Commit a new disable_policy value @val to *@valp and propagate it.
 *
 * The "default" entry is simply stored.  Any other entry is stored under
 * the per-namespace RTNL lock (the syscall is restarted if the lock cannot
 * be taken) and then applied to the host routes of every device for the
 * "all" entry, or of the owning device (ctl->extra1) otherwise.
 *
 * Returns 0, or the value of restart_syscall().
 */
static
int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
{
        struct net *net = (struct net *)ctl->extra2;
        struct net_device *dev;

        if (valp == &net->ipv6.devconf_dflt->disable_policy) {
                WRITE_ONCE(*valp, val);
                return 0;
        }

        if (!rtnl_net_trylock(net))
                return restart_syscall();

        WRITE_ONCE(*valp, val);

        if (valp != &net->ipv6.devconf_all->disable_policy) {
                addrconf_disable_policy_idev((struct inet6_dev *)ctl->extra1,
                                             val);
        } else {
                for_each_netdev(net, dev) {
                        struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);

                        if (idev)
                                addrconf_disable_policy_idev(idev, val);
                }
        }

        rtnl_net_unlock(net);
        return 0;
}

/*
 * sysctl handler for the "disable_policy" knob.  The value is parsed into a
 * local copy; only a write that actually changes the value is committed
 * through addrconf_disable_policy().  *@ppos is restored whenever a
 * non-zero value is returned.
 */
static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write,
                                   void *buffer, size_t *lenp, loff_t *ppos)
{
        struct ctl_table local = *ctl;
        int *valp = ctl->data;
        loff_t saved_pos = *ppos;
        int val = *valp;
        int ret;

        local.data = &val;
        ret = proc_dointvec(&local, write, buffer, lenp, ppos);

        if (write && val != *valp)
                ret = addrconf_disable_policy(ctl, valp, val);

        if (ret)
                *ppos = saved_pos;

        return ret;
}

/*
 * Limit values for sysctl table entries.  two_five_five is the upper bound
 * (extra2) of "hop_limit" in addrconf_sysctl[].
 * NOTE(review): minus_one and ioam6_if_id_max are presumably bounds for
 * other entries of that table; their users are not visible here - confirm.
 */
static int minus_one = -1;
static const int two_five_five = 255;
static u32 ioam6_if_id_max = U16_MAX;

/*
 * Template sysctl table for the IPv6 per-device configuration knobs that
 * are registered under "net/ipv6/conf/<name>".
 *
 * Every .data pointer here refers to a field of the global ipv6_devconf.
 * __addrconf_sysctl_register() duplicates this table and shifts each .data
 * by the distance between its target struct ipv6_devconf and ipv6_devconf,
 * so the copy addresses the same field inside the target instance.
 *
 * Entries that leave both .extra1 and .extra2 unset get them filled in at
 * registration time with the inet6_dev and the struct net; entries that
 * set either of them (the *_minmax bounds) keep their own values.
 */
static const struct ctl_table addrconf_sysctl[] = {
        {
                .procname        = "forwarding",
                .data                = &ipv6_devconf.forwarding,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_forward,
        },
        {
                .procname        = "hop_limit",
                .data                = &ipv6_devconf.hop_limit,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = (void *)SYSCTL_ONE,
                .extra2                = (void *)&two_five_five,
        },
        {
                .procname        = "mtu",
                .data                = &ipv6_devconf.mtu6,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_mtu,
        },
        {
                .procname        = "accept_ra",
                .data                = &ipv6_devconf.accept_ra,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_redirects",
                .data                = &ipv6_devconf.accept_redirects,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "autoconf",
                .data                = &ipv6_devconf.autoconf,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "dad_transmits",
                .data                = &ipv6_devconf.dad_transmits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "router_solicitations",
                .data                = &ipv6_devconf.rtr_solicits,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = &minus_one,
        },
        {
                .procname        = "router_solicitation_interval",
                .data                = &ipv6_devconf.rtr_solicit_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "router_solicitation_max_interval",
                .data                = &ipv6_devconf.rtr_solicit_max_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "router_solicitation_delay",
                .data                = &ipv6_devconf.rtr_solicit_delay,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
        {
                .procname        = "force_mld_version",
                .data                = &ipv6_devconf.force_mld_version,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "mldv1_unsolicited_report_interval",
                .data                =
                        &ipv6_devconf.mldv1_unsolicited_report_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_ms_jiffies,
        },
        {
                .procname        = "mldv2_unsolicited_report_interval",
                .data                =
                        &ipv6_devconf.mldv2_unsolicited_report_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_ms_jiffies,
        },
        {
                .procname        = "use_tempaddr",
                .data                = &ipv6_devconf.use_tempaddr,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "temp_valid_lft",
                .data                = &ipv6_devconf.temp_valid_lft,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "temp_prefered_lft",
                .data                = &ipv6_devconf.temp_prefered_lft,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname       = "regen_min_advance",
                .data           = &ipv6_devconf.regen_min_advance,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname        = "regen_max_retry",
                .data                = &ipv6_devconf.regen_max_retry,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "max_desync_factor",
                .data                = &ipv6_devconf.max_desync_factor,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "max_addresses",
                .data                = &ipv6_devconf.max_addresses,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_defrtr",
                .data                = &ipv6_devconf.accept_ra_defrtr,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ra_defrtr_metric",
                .data                = &ipv6_devconf.ra_defrtr_metric,
                .maxlen                = sizeof(u32),
                .mode                = 0644,
                .proc_handler        = proc_douintvec_minmax,
                .extra1                = (void *)SYSCTL_ONE,
        },
        {
                .procname        = "accept_ra_min_hop_limit",
                .data                = &ipv6_devconf.accept_ra_min_hop_limit,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_min_lft",
                .data                = &ipv6_devconf.accept_ra_min_lft,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_pinfo",
                .data                = &ipv6_devconf.accept_ra_pinfo,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ra_honor_pio_life",
                .data                = &ipv6_devconf.ra_honor_pio_life,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname        = "ra_honor_pio_pflag",
                .data                = &ipv6_devconf.ra_honor_pio_pflag,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
#ifdef CONFIG_IPV6_ROUTER_PREF
        {
                .procname        = "accept_ra_rtr_pref",
                .data                = &ipv6_devconf.accept_ra_rtr_pref,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "router_probe_interval",
                .data                = &ipv6_devconf.rtr_probe_interval,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_jiffies,
        },
#ifdef CONFIG_IPV6_ROUTE_INFO
        {
                .procname        = "accept_ra_rt_info_min_plen",
                .data                = &ipv6_devconf.accept_ra_rt_info_min_plen,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_rt_info_max_plen",
                .data                = &ipv6_devconf.accept_ra_rt_info_max_plen,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
#endif
        {
                .procname        = "proxy_ndp",
                .data                = &ipv6_devconf.proxy_ndp,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_proxy_ndp,
        },
        {
                .procname        = "accept_source_route",
                .data                = &ipv6_devconf.accept_source_route,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        {
                .procname        = "optimistic_dad",
                .data                = &ipv6_devconf.optimistic_dad,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname        = "use_optimistic",
                .data                = &ipv6_devconf.use_optimistic,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
#ifdef CONFIG_IPV6_MROUTE
        /* The only entry exposed read-only (mode 0444). */
        {
                .procname        = "mc_forwarding",
                .data                = &ipv6_devconf.mc_forwarding,
                .maxlen                = sizeof(int),
                .mode                = 0444,
                .proc_handler        = proc_dointvec,
        },
#endif
        {
                .procname        = "disable_ipv6",
                .data                = &ipv6_devconf.disable_ipv6,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_disable,
        },
        {
                .procname        = "accept_dad",
                .data                = &ipv6_devconf.accept_dad,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "force_tllao",
                .data                = &ipv6_devconf.force_tllao,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
        {
                .procname        = "ndisc_notify",
                .data                = &ipv6_devconf.ndisc_notify,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
        {
                .procname        = "suppress_frag_ndisc",
                .data                = &ipv6_devconf.suppress_frag_ndisc,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec
        },
        {
                .procname        = "accept_ra_from_local",
                .data                = &ipv6_devconf.accept_ra_from_local,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "accept_ra_mtu",
                .data                = &ipv6_devconf.accept_ra_mtu,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        /* String-valued entry; accessible to the owner only (mode 0600). */
        {
                .procname        = "stable_secret",
                .data                = &ipv6_devconf.stable_secret,
                .maxlen                = IPV6_MAX_STRLEN,
                .mode                = 0600,
                .proc_handler        = addrconf_sysctl_stable_secret,
        },
        {
                .procname        = "use_oif_addrs_only",
                .data                = &ipv6_devconf.use_oif_addrs_only,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ignore_routes_with_linkdown",
                .data                = &ipv6_devconf.ignore_routes_with_linkdown,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_ignore_routes_with_linkdown,
        },
        {
                .procname        = "drop_unicast_in_l2_multicast",
                .data                = &ipv6_devconf.drop_unicast_in_l2_multicast,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "drop_unsolicited_na",
                .data                = &ipv6_devconf.drop_unsolicited_na,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "keep_addr_on_down",
                .data                = &ipv6_devconf.keep_addr_on_down,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,

        },
        {
                .procname        = "seg6_enabled",
                .data                = &ipv6_devconf.seg6_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#ifdef CONFIG_IPV6_SEG6_HMAC
        {
                .procname        = "seg6_require_hmac",
                .data                = &ipv6_devconf.seg6_require_hmac,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
#endif
        {
                .procname       = "enhanced_dad",
                .data           = &ipv6_devconf.enhanced_dad,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname        = "addr_gen_mode",
                .data                = &ipv6_devconf.addr_gen_mode,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = addrconf_sysctl_addr_gen_mode,
        },
        {
                .procname       = "disable_policy",
                .data           = &ipv6_devconf.disable_policy,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = addrconf_sysctl_disable_policy,
        },
        {
                .procname        = "ndisc_tclass",
                .data                = &ipv6_devconf.ndisc_tclass,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)&two_five_five,
        },
        {
                .procname        = "rpl_seg_enabled",
                .data                = &ipv6_devconf.rpl_seg_enabled,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec,
        },
        {
                .procname        = "ioam6_enabled",
                .data                = &ipv6_devconf.ioam6_enabled,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)SYSCTL_ONE,
        },
        {
                .procname        = "ioam6_id",
                .data                = &ipv6_devconf.ioam6_id,
                .maxlen                = sizeof(u32),
                .mode                = 0644,
                .proc_handler        = proc_douintvec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)&ioam6_if_id_max,
        },
        {
                .procname        = "ioam6_id_wide",
                .data                = &ipv6_devconf.ioam6_id_wide,
                .maxlen                = sizeof(u32),
                .mode                = 0644,
                .proc_handler        = proc_douintvec,
        },
        {
                .procname        = "ndisc_evict_nocarrier",
                .data                = &ipv6_devconf.ndisc_evict_nocarrier,
                .maxlen                = sizeof(u8),
                .mode                = 0644,
                .proc_handler        = proc_dou8vec_minmax,
                .extra1                = (void *)SYSCTL_ZERO,
                .extra2                = (void *)SYSCTL_ONE,
        },
        {
                .procname        = "accept_untracked_na",
                .data                = &ipv6_devconf.accept_untracked_na,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

static int __addrconf_sysctl_register(struct net *net, char *dev_name,
                struct inet6_dev *idev, struct ipv6_devconf *p)
{
        size_t table_size = ARRAY_SIZE(addrconf_sysctl);
        int i, ifindex;
        struct ctl_table *table;
        char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];

        table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT);
        if (!table)
                goto out;

        for (i = 0; i < table_size; i++) {
                table[i].data += (char *)p - (char *)&ipv6_devconf;
                /* If one of these is already set, then it is not safe to
                 * overwrite either of them: this makes proc_dointvec_minmax
                 * usable.
                 */
                if (!table[i].extra1 && !table[i].extra2) {
                        table[i].extra1 = idev; /* embedded; no ref */
                        table[i].extra2 = net;
                }
        }

        snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);

        p->sysctl_header = register_net_sysctl_sz(net, path, table,
                                                  table_size);
        if (!p->sysctl_header)
                goto free;

        if (!strcmp(dev_name, "all"))
                ifindex = NETCONFA_IFINDEX_ALL;
        else if (!strcmp(dev_name, "default"))
                ifindex = NETCONFA_IFINDEX_DEFAULT;
        else
                ifindex = idev->dev->ifindex;
        inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL,
                                     ifindex, p);
        return 0;

free:
        kfree(table);
out:
        return -ENOBUFS;
}

static void __addrconf_sysctl_unregister(struct net *net,
                                         struct ipv6_devconf *p, int ifindex)
{
        const struct ctl_table *table;

        if (!p->sysctl_header)
                return;

        table = p->sysctl_header->ctl_table_arg;
        unregister_net_sysctl_table(p->sysctl_header);
        p->sysctl_header = NULL;
        kfree(table);

        inet6_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL);
}

static int addrconf_sysctl_register(struct inet6_dev *idev)
{
        int err;

        if (!sysctl_dev_name_is_allowed(idev->dev->name))
                return -EINVAL;

        err = neigh_sysctl_register(idev->dev, idev->nd_parms,
                                    &ndisc_ifinfo_sysctl_change);
        if (err)
                return err;
        err = __addrconf_sysctl_register(dev_net(idev->dev), idev->dev->name,
                                         idev, &idev->cnf);
        if (err)
                neigh_sysctl_unregister(idev->nd_parms);

        return err;
}

static void addrconf_sysctl_unregister(struct inet6_dev *idev)
{
        __addrconf_sysctl_unregister(dev_net(idev->dev), &idev->cnf,
                                     idev->dev->ifindex);
        neigh_sysctl_unregister(idev->nd_parms);
}


#endif

/*
 * Per-netns initialisation: set up the address hash table and the "all" /
 * "default" ipv6_devconf instances, and (with CONFIG_SYSCTL) register
 * their sysctl directories.
 *
 * Returns 0 on success.  On failure everything allocated so far is
 * released and -ENOMEM or the sysctl registration error is returned.
 */
static int __net_init addrconf_init_net(struct net *net)
{
        struct ipv6_devconf *cnf_all, *cnf_dflt;
        int err = -ENOMEM;

        spin_lock_init(&net->ipv6.addrconf_hash_lock);
        INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);

        net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
                                           sizeof(struct hlist_head),
                                           GFP_KERNEL);
        if (!net->ipv6.inet6_addr_lst)
                goto fail_addr_lst;

        /* Start both instances from the compiled-in templates. */
        cnf_all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
        if (!cnf_all)
                goto fail_all;

        cnf_dflt = kmemdup(&ipv6_devconf_dflt, sizeof(ipv6_devconf_dflt), GFP_KERNEL);
        if (!cnf_dflt)
                goto fail_dflt;

        if (!net_eq(net, &init_net)) {
                struct net *src = NULL;

                switch (net_inherit_devconf()) {
                case 1:  /* copy from init_net */
                        src = &init_net;
                        break;
                case 3: /* copy from the current netns */
                        src = current->nsproxy->net_ns;
                        break;
                case 0:
                case 2:
                        /* use compiled values */
                        break;
                }

                if (src) {
                        memcpy(cnf_all, src->ipv6.devconf_all,
                               sizeof(ipv6_devconf));
                        memcpy(cnf_dflt, src->ipv6.devconf_dflt,
                               sizeof(ipv6_devconf_dflt));
                }
        }

        /* these will be inherited by all namespaces */
        cnf_dflt->autoconf = ipv6_defaults.autoconf;
        cnf_dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;

        /* Both instances start with the stable secret marked unset. */
        cnf_dflt->stable_secret.initialized = false;
        cnf_all->stable_secret.initialized = false;

        net->ipv6.devconf_all = cnf_all;
        net->ipv6.devconf_dflt = cnf_dflt;

#ifdef CONFIG_SYSCTL
        err = __addrconf_sysctl_register(net, "all", NULL, cnf_all);
        if (err < 0)
                goto fail_reg_all;

        err = __addrconf_sysctl_register(net, "default", NULL, cnf_dflt);
        if (err < 0)
                goto fail_reg_dflt;
#endif
        return 0;

        /* Unwind in reverse order of setup. */
#ifdef CONFIG_SYSCTL
fail_reg_dflt:
        __addrconf_sysctl_unregister(net, cnf_all, NETCONFA_IFINDEX_ALL);
fail_reg_all:
        kfree(cnf_dflt);
        net->ipv6.devconf_dflt = NULL;
#endif
fail_dflt:
        kfree(cnf_all);
        net->ipv6.devconf_all = NULL;
fail_all:
        kfree(net->ipv6.inet6_addr_lst);
fail_addr_lst:
        return err;
}

/*
 * Per-netns teardown: drop the "default" and "all" sysctl tables, free
 * both ipv6_devconf instances, stop the address check work and free the
 * address hash table, warning once if any bucket is still populated.
 */
static void __net_exit addrconf_exit_net(struct net *net)
{
        int bucket;

#ifdef CONFIG_SYSCTL
        __addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
                                     NETCONFA_IFINDEX_DEFAULT);
        __addrconf_sysctl_unregister(net, net->ipv6.devconf_all,
                                     NETCONFA_IFINDEX_ALL);
#endif
        kfree(net->ipv6.devconf_dflt);
        net->ipv6.devconf_dflt = NULL;
        kfree(net->ipv6.devconf_all);
        net->ipv6.devconf_all = NULL;

        cancel_delayed_work_sync(&net->ipv6.addr_chk_work);

        /* Every hash bucket must be empty by now; then free the table. */
        for (bucket = 0; bucket < IN6_ADDR_HSIZE; bucket++) {
                struct hlist_head *head = &net->ipv6.inet6_addr_lst[bucket];

                WARN_ON_ONCE(!hlist_empty(head));
        }

        kfree(net->ipv6.inet6_addr_lst);
        net->ipv6.inet6_addr_lst = NULL;
}

/*
 * Per-network-namespace hooks for addrconf; registered with
 * register_pernet_subsys() in addrconf_init().
 */
static struct pernet_operations addrconf_ops = {
        .init = addrconf_init_net,
        .exit = addrconf_exit_net,
};

/*
 * AF_INET6 link-attribute callbacks; registered with rtnl_af_register()
 * in addrconf_init() and removed again in addrconf_cleanup().
 */
static struct rtnl_af_ops inet6_ops __read_mostly = {
        .family                  = AF_INET6,
        .fill_link_af          = inet6_fill_link_af,
        .get_link_af_size = inet6_get_link_af_size,
        .validate_link_af = inet6_validate_link_af,
        .set_link_af          = inet6_set_link_af,
};

/*
 * PF_INET6 rtnetlink message handlers provided by addrconf; registered in
 * one go with rtnl_register_many() from addrconf_init().  Each entry maps
 * a message type to its doit and/or dumpit callback plus handler flags.
 */
static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or_module = {
        /* Link info: dump only. */
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK,
         .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED},
        /* Address add / delete / get. */
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR,
         .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET},
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR,
         .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET},
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR,
         .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr,
         .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
        /* Multicast and anycast addresses: dump only. */
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETMULTICAST,
         .dumpit = inet6_dump_ifmcaddr,
         .flags = RTNL_FLAG_DUMP_UNLOCKED},
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETANYCAST,
         .dumpit = inet6_dump_ifacaddr,
         .flags = RTNL_FLAG_DUMP_UNLOCKED},
        /* Netconf (devconf) get and dump. */
        {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETNETCONF,
         .doit = inet6_netconf_get_devconf, .dumpit = inet6_netconf_dump_devconf,
         .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
};

/*
 *        Init / cleanup code
 */

/*
 * Bring up the addrconf subsystem.
 *
 * Steps, in order: address label table, per-netns operations, the
 * "ipv6_addrconf" workqueue, an inet6_dev for blackhole_netdev, special
 * route entries, the netdevice notifier, the first address verification
 * run for init_net, and finally the rtnetlink AF ops and message handlers.
 *
 * Returns 0 on success or a negative errno.  On failure the error labels
 * at the bottom undo the steps that had completed, in reverse order.
 */
int __init addrconf_init(void)
{
        struct inet6_dev *idev;
        int err;

        err = ipv6_addr_label_init();
        if (err < 0) {
                pr_crit("%s: cannot initialize default policy table: %d\n",
                        __func__, err);
                goto out;
        }

        err = register_pernet_subsys(&addrconf_ops);
        if (err < 0)
                goto out_addrlabel;

        /* All works using addrconf_wq need to lock rtnl. */
        addrconf_wq = create_singlethread_workqueue("ipv6_addrconf");
        if (!addrconf_wq) {
                err = -ENOMEM;
                goto out_nowq;
        }

        /* ipv6_add_dev() is called with init_net's RTNL lock held. */
        rtnl_net_lock(&init_net);
        idev = ipv6_add_dev(blackhole_netdev);
        rtnl_net_unlock(&init_net);
        if (IS_ERR(idev)) {
                err = PTR_ERR(idev);
                goto errlo;
        }

        ip6_route_init_special_entries();

        register_netdevice_notifier(&ipv6_dev_notf);

        addrconf_verify(&init_net);

        err = rtnl_af_register(&inet6_ops);
        if (err)
                goto erraf;

        err = rtnl_register_many(addrconf_rtnl_msg_handlers);
        if (err)
                goto errout;

        err = ipv6_addr_label_rtnl_register();
        if (err < 0)
                goto errout;

        return 0;
        /* Error unwind: each label falls through to the ones below it. */
errout:
        rtnl_unregister_all(PF_INET6);
        rtnl_af_unregister(&inet6_ops);
erraf:
        unregister_netdevice_notifier(&ipv6_dev_notf);
errlo:
        destroy_workqueue(addrconf_wq);
out_nowq:
        unregister_pernet_subsys(&addrconf_ops);
out_addrlabel:
        ipv6_addr_label_cleanup();
out:
        return err;
}

/*
 * Tear down the addrconf subsystem: unregister the notifier, per-netns
 * operations, address labels and rtnetlink AF ops, take down IPv6 on every
 * init_net device that has an inet6_dev (and on the loopback device), then
 * destroy the addrconf workqueue.
 */
void addrconf_cleanup(void)
{
        struct net_device *netdev;

        unregister_netdevice_notifier(&ipv6_dev_notf);
        unregister_pernet_subsys(&addrconf_ops);
        ipv6_addr_label_cleanup();

        rtnl_af_unregister(&inet6_ops);

        rtnl_net_lock(&init_net);

        /* Walk the device list; only devices with an inet6_dev matter. */
        for_each_netdev(&init_net, netdev) {
                if (__in6_dev_get_rtnl_net(netdev))
                        addrconf_ifdown(netdev, true);
        }
        addrconf_ifdown(init_net.loopback_dev, true);

        rtnl_net_unlock(&init_net);

        destroy_workqueue(addrconf_wq);
}





























































   22 



















   22 







   22 

















   22 



   22 
   22 




   22 

   22 





   22 

































































































































































   12 
   12 









   12 





























1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 *
 * Provides a framework for enqueueing and running callbacks from hardirq
 * context. The enqueueing is NMI-safe.
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>

#include <trace/events/ipi.h>

/*
 * Per-CPU lock-less lists of queued irq_work items:
 *  - lazy_list holds IRQ_WORK_LAZY items and, on PREEMPT_RT, every item that
 *    is not marked IRQ_WORK_HARD_IRQ;
 *  - raised_list holds everything else.
 */
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
/* Per-CPU thread slot filled in through irqwork_threads.store. */
static DEFINE_PER_CPU(struct task_struct *, irq_workd);

/*
 * Wake this CPU's irq_work thread when lazy work is queued for it.
 *
 * Nothing happens if the lazy list is empty or if no thread has been stored
 * in irq_workd for this CPU.
 */
static void wake_irq_workd(void)
{
        struct task_struct *worker = __this_cpu_read(irq_workd);

        if (llist_empty(this_cpu_ptr(&lazy_list)) || !worker)
                return;

        wake_up_process(worker);
}

#ifdef CONFIG_SMP
/*
 * Callback of the per-CPU irq_work_wakeup item: wakes the irq_work thread of
 * the CPU it runs on. @entry is not used.
 */
static void irq_work_wake(struct irq_work *entry)
{
        wake_irq_workd();
}

/*
 * Per-CPU irq_work whose callback is irq_work_wake(). On PREEMPT_RT,
 * irq_work_queue_on() sends it to a remote CPU after adding a non-HARD item
 * to that CPU's lazy list, so that the remote irq_work thread gets woken.
 */
static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
        IRQ_WORK_INIT_HARD(irq_work_wake);
#endif

/*
 * thread_should_run callback of irqwork_threads: non-zero when the lazy list
 * of the executing CPU has entries. @cpu is not used.
 */
static int irq_workd_should_run(unsigned int cpu)
{
        struct llist_head *lazy = this_cpu_ptr(&lazy_list);

        return !llist_empty(lazy);
}

/*
 * Claim @work so that no one else will poke at it.
 *
 * Sets IRQ_WORK_CLAIMED and CSD_TYPE_IRQ_WORK in the flags word. Returns
 * true when the caller has to queue the entry, false when it was already
 * pending.
 */
static bool irq_work_claim(struct irq_work *work)
{
        int prev;

        prev = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK,
                               &work->node.a_flags);

        /*
         * If the work is already pending, no need to raise the IPI.
         * The pairing smp_mb() in irq_work_single() makes sure
         * everything we did before is visible.
         */
        return !(prev & IRQ_WORK_PENDING);
}

/*
 * Weak default of the hook called by irq_work_raise(). It does nothing; an
 * architecture may provide its own definition.
 */
void __weak arch_irq_work_raise(void)
{
        /*
         * Lame architectures will get the timer tick callback
         */
}

/*
 * Kick irq_work processing on the current CPU through the arch hook.
 *
 * The ipi_send_cpu tracepoint is emitted first, but only when it is enabled
 * and the architecture reports having an irq_work interrupt.
 */
static __always_inline void irq_work_raise(struct irq_work *work)
{
        if (trace_ipi_send_cpu_enabled()) {
                if (arch_irq_work_has_interrupt())
                        trace_ipi_send_cpu(smp_processor_id(), _RET_IP_,
                                           work->func);
        }

        arch_irq_work_raise();
}

/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
        struct llist_head *list;
        bool rt_lazy_work = false;
        bool lazy_work = false;
        int work_flags;

        work_flags = atomic_read(&work->node.a_flags);
        if (work_flags & IRQ_WORK_LAZY)
                lazy_work = true;
        else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
                 !(work_flags & IRQ_WORK_HARD_IRQ))
                rt_lazy_work = true;

        if (lazy_work || rt_lazy_work)
                list = this_cpu_ptr(&lazy_list);
        else
                list = this_cpu_ptr(&raised_list);

        if (!llist_add(&work->node.llist, list))
                return;

        /* If the work is "lazy", handle it from next tick if any */
        if (!lazy_work || tick_nohz_tick_stopped())
                irq_work_raise(work);
}

/*
 * Enqueue the irq work @work on the current CPU.
 *
 * Returns false if @work was already pending (nothing is queued), true once
 * it has been queued.
 */
bool irq_work_queue(struct irq_work *work)
{
        bool claimed = irq_work_claim(work);

        /* Only queue if not already pending */
        if (claimed) {
                /* Queue the entry and raise the IPI if needed. */
                preempt_disable();
                __irq_work_queue_local(work);
                preempt_enable();
        }

        return claimed;
}
EXPORT_SYMBOL_GPL(irq_work_queue);

/*
 * Enqueue the irq_work @work on @cpu unless it's already pending
 * somewhere.
 *
 * Can be re-enqueued while the callback is still in progress.
 *
 * Returns false if @work was already pending (nothing is queued), true
 * otherwise. Without CONFIG_SMP this forwards to irq_work_queue().
 */
bool irq_work_queue_on(struct irq_work *work, int cpu)
{
#ifndef CONFIG_SMP
        return irq_work_queue(work);

#else /* CONFIG_SMP: */
        /* All work should have been flushed before going offline */
        WARN_ON_ONCE(cpu_is_offline(cpu));

        /* Only queue if not already pending */
        if (!irq_work_claim(work))
                return false;

        kasan_record_aux_stack(work);

        /* Keep smp_processor_id() stable while deciding local vs. remote. */
        preempt_disable();
        if (cpu != smp_processor_id()) {
                /* Arch remote IPI send/receive backend aren't NMI safe */
                WARN_ON_ONCE(in_nmi());

                /*
                 * On PREEMPT_RT the items which are not marked as
                 * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
                 * item is used on the remote CPU to wake the thread.
                 */
                if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
                    !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {

                        /* Queue the caller's item on the remote lazy list. */
                        if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
                                goto out;

                        /*
                         * From here on @work is the remote CPU's wakeup
                         * item; skip the send if it is already pending.
                         */
                        work = &per_cpu(irq_work_wakeup, cpu);
                        if (!irq_work_claim(work))
                                goto out;
                }

                __smp_call_single_queue(cpu, &work->node.llist);
        } else {
                __irq_work_queue_local(work);
        }
out:
        /* Every path reaching this label has already queued the caller's item. */
        preempt_enable();

        return true;
#endif /* CONFIG_SMP */
}

/*
 * Report whether the current CPU still has irq_work waiting to be run.
 *
 * Entries on the raised list are ignored when the architecture has an
 * irq_work interrupt; entries on the lazy list always count.
 */
bool irq_work_needs_cpu(void)
{
        struct llist_head *raised = this_cpu_ptr(&raised_list);
        struct llist_head *lazy = this_cpu_ptr(&lazy_list);
        bool raised_ignored;

        raised_ignored = llist_empty(raised) || arch_irq_work_has_interrupt();
        if (raised_ignored && llist_empty(lazy))
                return false;

        /* All work should have been flushed before going offline */
        WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));

        return true;
}

/*
 * Run one irq_work item.
 *
 * @arg is the struct irq_work to run, passed as an untyped pointer. The
 * PENDING bit is cleared before the callback is invoked, the BUSY bit is
 * cleared afterwards unless the flags changed in the meantime, and waiters
 * on work->irqwait are woken for the cases irq_work_sync() waits on.
 */
void irq_work_single(void *arg)
{
        struct irq_work *work = arg;
        int flags;

        /*
         * Clear the PENDING bit, after this point the @work can be re-used.
         * The PENDING bit acts as a lock, and we own it, so we can clear it
         * without atomic ops.
         */
        flags = atomic_read(&work->node.a_flags);
        flags &= ~IRQ_WORK_PENDING;
        atomic_set(&work->node.a_flags, flags);

        /*
         * See irq_work_claim().
         */
        smp_mb();

        lockdep_irq_work_enter(flags);
        work->func(work);
        lockdep_irq_work_exit(flags);

        /*
         * Clear the BUSY bit, if set, and return to the free state if no-one
         * else claimed it meanwhile.
         */
        (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);

        /* Same condition under which irq_work_sync() sleeps on irqwait. */
        if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
            !arch_irq_work_has_interrupt())
                rcuwait_wake_up(&work->irqwait);
}

static void irq_work_run_list(struct llist_head *list)
{
        struct irq_work *work, *tmp;
        struct llist_node *llnode;

        /*
         * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
         * in a per-CPU thread in preemptible context. Only the items which are
         * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
         */
        BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));

        if (llist_empty(list))
                return;

        llnode = llist_del_all(list);
        llist_for_each_entry_safe(work, tmp, llnode, node.llist)
                irq_work_single(work);
}

/*
 * Run the raised work of this CPU, then deal with the lazy work: run it
 * directly, or on PREEMPT_RT wake the irq_work thread instead.
 *
 * hotplug calls this through:
 *  hotplug_cfd() -> flush_smp_call_function_queue()
 */
void irq_work_run(void)
{
        irq_work_run_list(this_cpu_ptr(&raised_list));

        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                wake_irq_workd();
                return;
        }

        irq_work_run_list(this_cpu_ptr(&lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);

/*
 * Process irq_work of this CPU.
 *
 * Raised work is run here only when the architecture has no irq_work
 * interrupt. Lazy work is run directly, or on PREEMPT_RT handed to the
 * irq_work thread by waking it.
 */
void irq_work_tick(void)
{
        struct llist_head *raised = this_cpu_ptr(&raised_list);

        if (!(llist_empty(raised) || arch_irq_work_has_interrupt()))
                irq_work_run_list(raised);

        if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
                wake_irq_workd();
                return;
        }

        irq_work_run_list(this_cpu_ptr(&lazy_list));
}

/*
 * Synchronize against the irq_work @entry, ensures the entry is not
 * currently in use.
 */
void irq_work_sync(struct irq_work *work)
{
        lockdep_assert_irqs_enabled();
        might_sleep();

        if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
            !arch_irq_work_has_interrupt()) {
                rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
                                   TASK_UNINTERRUPTIBLE);
                return;
        }

        while (irq_work_is_busy(work))
                cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);

/*
 * thread_fn callback of irqwork_threads: runs everything queued on the lazy
 * list of the executing CPU. @cpu is not used.
 */
static void run_irq_workd(unsigned int cpu)
{
        struct llist_head *lazy = this_cpu_ptr(&lazy_list);

        irq_work_run_list(lazy);
}

/*
 * setup callback of irqwork_threads: applies sched_set_fifo_low() to the
 * calling task. @cpu is not used.
 */
static void irq_workd_setup(unsigned int cpu)
{
        sched_set_fifo_low(current);
}

/*
 * Descriptor of the per-CPU "irq_work/%u" threads. The thread pointer is
 * stored in irq_workd; the thread runs run_irq_workd() whenever
 * irq_workd_should_run() reports pending lazy work. Registered from
 * irq_work_init_threads() on PREEMPT_RT only.
 */
static struct smp_hotplug_thread irqwork_threads = {
        .store                  = &irq_workd,
        .setup                        = irq_workd_setup,
        .thread_should_run      = irq_workd_should_run,
        .thread_fn              = run_irq_workd,
        .thread_comm            = "irq_work/%u",
};

/*
 * Early initcall: on PREEMPT_RT register the per-CPU irq_work threads; a
 * registration failure is fatal. Always returns 0.
 */
static __init int irq_work_init_threads(void)
{
        if (!IS_ENABLED(CONFIG_PREEMPT_RT))
                return 0;

        BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
        return 0;
}
early_initcall(irq_work_init_threads);









































































































































  220 




  221 


















  215 

    5 





    1 










  215 

































  215 































































  214 






  215 











  215 








































































































  213 
  204 

  276 








  213 



















































  444 





  445 



















    9 
    1 







    9 
    9 





















  444 




  279 
















  445 



   68 



















  445 


  445 















   68 



  444 



  444 
  163 













  204 

  203 










    3 










































  215 


  214 







  215 












  214 

  215 





























    1 








  214 







  215 


  215 



  214 
  214 



  213 

  213 
    7 



    5 












    1 













    1 



  213 






  213 













  212 




























  213 






    1 

  212 


  213 























    2 







    2 










































  185 
















  184 



    7 






  181 

  181 



  184 




















  181 


    7 













  176 














  176 

  176 
  176 

    3 






  175 


  175 




















    3 


































































































































































































































































   40 



























































































































































    3 




    3 


    3 










    3 


    3 




















    3 

    3 












   12 

















































  178 










  172 












  172 






















   12 

  182 








   12 
   12 



    3 

   12 

    5 



    2 










  274 
    4 



















































































  188 











    9 











  181 












    3 














  277 
























  277 
  276 
    1 









    3 















  212 






























































































































  186 


  186 

    5 


  182 






  181 



    7 
    3 





















    2 

    1 

















    1 



    1 







    1 




































    7 
    5 





    2 



    2 


    2 





    2 

















    9 







    3 






  173 

    3 


    6 















































































































 1257 



 1254 








 1107 



































































   25 




   46 
























































































  215 






   11 

  175 

    5 










  212 




   46 

  173 

    2 












  209 






  207 







    4 




  211 









  174 



    4 





  173 











  215 










  215 

    3 




  215 
  215 























  175 











   47 


   47 
   47 









    2 


    3 







    3 





    1 







   48 













    8 
    1 














    9 



    9 





    4 
    9 

    8 






    3 
    5 












































































































































































































  276 


  276 



  276 











  276 
















































  274 




  274 





  274 


















































  246 








  246 
































    2 
  244 



































  246 
  246 












































































































































































































































































   36 















































































  200 







  163 



    1 

    2 




  163 









   38 













  199 







  199 









    8 





    8 



    8 


























   34 

























































































    4 

   38 


   38 











    4 




    4 
   34 










   36 
    2 

    4 


   34 

   34 
















   38 
   36 

    2 








    3 

   33 







   33 


   36 



















  170 
    1 



   40 



  176 
    2 


  172 




    7 




















    4 





  204 



  173 


    2 
    4 


    2 
   32 




    2 
   36 







   34 

    4 
   38 
   38 
    7 
   33 
   36 



   34 

    4 

    2 
   36 

    2 


    7 
   33 





  179 


  176 
    3 















   33 

  177 


   33 
  177 

  166 
   41 

    1 

   40 




  199 






   33 

  165 

    8 




    1 

  197 

  194 
    8 
  198 



  196 
    8 





















    2 

















    2 



    2 








    2 











































    4 



    2 







    2 








    5 


    3 
    3 








  215 








    4 

    5 


   15 
  205 


  200 


   25 
  200 
















  215 




  215 

  215 
    3 
  215 

  215 


























































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
3027
3028
3029
3030
3031
3032
3033
3034
3035
3036
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
3978
3979
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102
4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121
4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
4133
4134
4135
4136
4137
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
4155
4156
4157
4158
4159
4160
4161
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204
4205
4206
4207
4208
4209
4210
4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
4270
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
4457
4458
4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
4632
4633
4634
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
4732
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
4789
4790
4791
4792
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
4808
4809
4810
4811
4812
4813
4814
4815
4816
4817
4818
4819
4820
4821
4822
4823
4824
4825
4826
4827
4828
4829
4830
4831
4832
4833
4834
4835
4836
4837
4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
4855
4856
4857
4858
4859
4860
4861
4862
4863
4864
4865
4866
4867
4868
4869
4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069
5070
5071
5072
5073
5074
5075
5076
5077
5078
5079
5080
5081
5082
5083
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
5117
5118
5119
5120
5121
5122
5123
5124
5125
5126
5127
5128
5129
5130
5131
5132
5133
5134
5135
5136
5137
5138
5139
5140
5141
5142
5143
5144
5145
5146
5147
5148
5149
5150
5151
5152
5153
5154
5155
5156
5157
5158
5159
5160
5161
5162
5163
5164
5165
5166
5167
5168
5169
5170
5171
5172
5173
5174
5175
5176
5177
5178
5179
5180
5181
5182
5183
5184
5185
5186
5187
5188
5189
5190
5191
5192
5193
5194
5195
5196
5197
5198
5199
5200
5201
5202
5203
5204
5205
5206
5207
5208
5209
5210
5211
5212
5213
5214
5215
5216
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
5255
5256
5257
5258
5259
5260
5261
5262
5263
5264
5265
5266
5267
5268
5269
5270
5271
5272
5273
5274
5275
5276
5277
5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289
5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301
5302
5303
5304
5305
5306
5307
5308
5309
5310
5311
5312
5313
5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/namei.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/wordpart.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr.  The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains).  It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now 
 * replaced with a single function lookup_dentry() that can handle all 
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive.  As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm.  Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().  Many virtual
 * filesystems (including /proc) exhibit this behavior.
 */

/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation:
 * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL
 * and the name already exists in form of a symlink, try to create the new
 * name indicated by the symlink. The old code always complained that the
 * name already exists, due to not following the symlink even if its target
 * is nonexistent.  The new semantics affects also mknod() and link() when
 * the name is a symlink pointing to a non-existent name.
 *
 * I don't know which semantics is the right one, since I have no access
 * to standards. But I found by trial that HP-UX 9.0 has the full "new"
 * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the
 * "old" one. Personally, I think the new semantics is much more logical.
 * Note that "ln old new" where "new" is a symlink pointing to a non-existing
 * file does succeed in both HP-UX and SunOS, but not in Solaris
 * and in the old Linux semantics.
 */

/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink
 * semantics.  See the comments in "open_namei" and "do_link" below.
 *
 * [10-Sep-98 Alan Modra] Another symlink change.
 */

/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks:
 *        inside the path - always follow.
 *        in the last component in creation/removal/renaming - never follow.
 *        if LOOKUP_FOLLOW passed - follow.
 *        if the pathname has trailing slashes - follow.
 *        otherwise - don't follow.
 * (applied in that order).
 *
 * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT
 * restored for 2.4. This is the last surviving part of old 4.2BSD bug.
 * During the 2.4 we need to fix the userland stuff depending on it -
 * hopefully we will be able to get rid of that wart in 2.5. So far only
 * XEmacs seems to be relying on it...
 */
/*
 * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland)
 * implemented.  Let's see if raised priority of ->s_vfs_rename_mutex gives
 * any extra contention...
 */

/* In order to reduce some races, while at the same time doing additional
 * checking and hopefully speeding things up, we copy filenames to the
 * kernel data space before using them.
 *
 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
 * PATH_MAX includes the nul terminator --RR.
 */

/*
 * Size limit (nul terminator included) for a name that is kept in ->iname,
 * i.e. in the same allocation as its struct filename: PATH_MAX less the
 * bytes that sit in front of ->iname.
 * NOTE(review): presumably this matches a PATH_MAX-sized __getname()
 * buffer -- confirm against the names_cache definition.
 */
#define EMBEDDED_NAME_MAX        (PATH_MAX - offsetof(struct filename, iname))

/*
 * Fill in the bookkeeping fields of a newly allocated struct filename:
 * a single reference, no ->aname, and the userspace pointer the name was
 * copied from (getname_kernel() passes NULL).  ->name is left to the caller.
 */
static inline void initname(struct filename *name, const char __user *uptr)
{
        atomic_set(&name->refcnt, 1);
        name->aname = NULL;
        name->uptr = uptr;
}

/*
 * getname_flags - copy a pathname from userspace into a struct filename
 * @filename: userspace pointer to the pathname
 * @flags: LOOKUP_EMPTY allows an empty pathname, otherwise it is -ENOENT
 *
 * Returns whatever audit_reusename() hands back if that is non-NULL.
 * Otherwise returns a new struct filename, or an ERR_PTR(): -ENOMEM on
 * allocation failure, the negative value from strncpy_from_user() if the
 * copy fails, -ENOENT for a disallowed empty path and -ENAMETOOLONG when
 * the name fills all of PATH_MAX.
 */
struct filename *
getname_flags(const char __user *filename, int flags)
{
        struct filename *name;
        char *buf;
        int copied;
        int err;

        name = audit_reusename(filename);
        if (name)
                return name;

        name = __getname();
        if (unlikely(!name))
                return ERR_PTR(-ENOMEM);

        /*
         * Optimistic attempt: keep the struct filename and the string
         * together in the one names_cache allocation.
         */
        buf = (char *)name->iname;
        name->name = buf;

        copied = strncpy_from_user(buf, filename, EMBEDDED_NAME_MAX);
        if (unlikely(copied < 0)) {
                err = copied;
                goto out_put_embedded;
        }
        /* An empty path is only acceptable when the caller asked for it. */
        if (unlikely(!copied) && !(flags & LOOKUP_EMPTY)) {
                err = -ENOENT;
                goto out_put_embedded;
        }

        if (likely(copied != EMBEDDED_NAME_MAX))
                goto done;

        /*
         * The name is getting close to PATH_MAX.  Hand the whole
         * names_cache buffer over to the string, allocate the struct
         * filename separately and copy from userland a second time.
         */
        buf = (char *)name;

        /*
         * The allocation reaches just past iname[0], so that
         * name->iname is part of the new object and can never compare
         * equal to buf.
         */
        name = kzalloc(offsetof(struct filename, iname[1]), GFP_KERNEL);
        if (unlikely(!name)) {
                __putname(buf);
                return ERR_PTR(-ENOMEM);
        }
        name->name = buf;

        copied = strncpy_from_user(buf, filename, PATH_MAX);
        if (unlikely(copied < 0))
                err = copied;
        else if (unlikely(!copied) && !(flags & LOOKUP_EMPTY))
                err = -ENOENT;        /* the empty path is special */
        else if (unlikely(copied == PATH_MAX))
                err = -ENAMETOOLONG;
        else
                goto done;

        __putname(buf);
        kfree(name);
        return ERR_PTR(err);

done:
        initname(name, filename);
        audit_getname(name);
        return name;

out_put_embedded:
        __putname(name);
        return ERR_PTR(err);
}

/*
 * Wrapper around getname_flags() for callers holding AT_* flags:
 * AT_EMPTY_PATH in @uflags becomes LOOKUP_EMPTY, every other bit is ignored.
 */
struct filename *getname_uflags(const char __user *filename, int uflags)
{
        if (uflags & AT_EMPTY_PATH)
                return getname_flags(filename, LOOKUP_EMPTY);

        return getname_flags(filename, 0);
}

/*
 * Like getname_flags(pathname, LOOKUP_EMPTY), except that an empty
 * pathname yields NULL instead of a struct filename.  Returns
 * ERR_PTR(-EFAULT) if the first byte cannot be read from userspace, or
 * whatever error getname_flags() reports.
 */
struct filename *__getname_maybe_null(const char __user *pathname)
{
        struct filename *name;
        char first;

        /* try to save on allocations; loss on um, though */
        if (get_user(first, pathname))
                return ERR_PTR(-EFAULT);
        if (!first)
                return NULL;

        name = getname_flags(pathname, LOOKUP_EMPTY);
        if (IS_ERR(name))
                return name;

        /* The full copy is checked again: it may still come out empty. */
        if (name->name[0])
                return name;

        putname(name);
        return NULL;
}

/*
 * getname_kernel - build a struct filename from a kernel-space string
 * @filename: nul-terminated pathname in kernel memory
 *
 * Short names are stored in ->iname of the names_cache buffer.  Longer
 * ones (up to PATH_MAX including the terminator) get the whole buffer for
 * the string and a separately allocated struct filename.
 *
 * Returns the new struct filename, ERR_PTR(-ENOMEM) if an allocation
 * fails or ERR_PTR(-ENAMETOOLONG) if the name does not fit in PATH_MAX.
 */
struct filename *getname_kernel(const char * filename)
{
        struct filename *name;
        int len = strlen(filename) + 1;

        name = __getname();
        if (unlikely(!name))
                return ERR_PTR(-ENOMEM);

        if (len > EMBEDDED_NAME_MAX) {
                struct filename *hdr;

                if (len > PATH_MAX) {
                        __putname(name);
                        return ERR_PTR(-ENAMETOOLONG);
                }

                /* Header only: the string takes over the names_cache buffer. */
                hdr = kmalloc(offsetof(struct filename, iname[1]), GFP_KERNEL);
                if (unlikely(!hdr)) {
                        __putname(name);
                        return ERR_PTR(-ENOMEM);
                }
                hdr->name = (char *)name;
                name = hdr;
        } else {
                name->name = (char *)name->iname;
        }

        memcpy((char *)name->name, filename, len);
        initname(name, NULL);
        audit_getname(name);
        return name;
}
EXPORT_SYMBOL(getname_kernel);

/*
 * putname - drop a reference to a struct filename
 * @name: name to release; NULL and ERR_PTR() values are ignored
 *
 * Frees the name once the last reference is gone.  A reference count that
 * reads as zero triggers a one-time warning and nothing is freed.
 */
void putname(struct filename *name)
{
        int refcnt;

        if (IS_ERR_OR_NULL(name))
                return;

        /*
         * A count that reads as exactly 1 goes straight to freeing without
         * the atomic decrement; any other value is decremented and only
         * the final put falls through.
         */
        refcnt = atomic_read(&name->refcnt);
        if (refcnt != 1 &&
            (WARN_ON_ONCE(!refcnt) || !atomic_dec_and_test(&name->refcnt)))
                return;

        /* Embedded name: the one names_cache buffer is all there is. */
        if (name->name == name->iname) {
                __putname(name);
                return;
        }

        /* Separate string buffer and struct filename. */
        __putname(name->name);
        kfree(name);
}
EXPORT_SYMBOL(putname);

/**
 * check_acl - perform ACL permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * Checks @mask against the inode's access ACL.  Because fetching a POSIX
 * ACL may block, the function honours the MAY_NOT_BLOCK bit: with it set
 * only an already cached ACL is consulted.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: the result of posix_acl_permission() when an ACL was found,
 * -EAGAIN when there is no ACL to check (or ACL support is compiled out),
 * -ECHILD when MAY_NOT_BLOCK is set but the ACL is not cached, or the
 * error returned by get_inode_acl().
 */
static int check_acl(struct mnt_idmap *idmap,
                     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl *acl;
        int ret;

        if (!(mask & MAY_NOT_BLOCK)) {
                acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
                if (IS_ERR(acl))
                        return PTR_ERR(acl);
                if (!acl)
                        return -EAGAIN;

                ret = posix_acl_permission(idmap, inode, acl, mask);
                posix_acl_release(acl);
                return ret;
        }

        acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
        if (!acl)
                return -EAGAIN;
        /* no ->get_inode_acl() calls in RCU mode... */
        if (is_uncached_acl(acl))
                return -ECHILD;
        return posix_acl_permission(idmap, inode, acl, mask);
#else
        return -EAGAIN;
#endif
}

/*
 * Cheap, optimistic test for "this inode certainly has no ACLs".
 *
 * Only ACL_TYPE_ACCESS is considered, and only the case where the absence
 * of an ACL has already been cached on the inode.
 *
 * A true result means there are no ACLs.  A false result is inconclusive:
 * the inode may still turn out to have none (for instance in the
 * is_uncached_acl() case).
 */
static inline bool no_acl_inode(struct inode *inode)
{
#ifndef CONFIG_FS_POSIX_ACL
        return true;
#else
        return likely(!READ_ONCE(inode->i_acl));
#endif
}

/**
 * acl_permission_check - perform basic UNIX permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the basic UNIX permission checking. Since this
 * function may retrieve POSIX acls it needs to know whether it is called from a
 * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if the requested access is granted, -EACCES if a required
 * mode bit is missing, or any result of check_acl() other than -EAGAIN
 * (which means "no ACL verdict, fall back to the mode bits").
 */
static int acl_permission_check(struct mnt_idmap *idmap,
                                struct inode *inode, int mask)
{
        unsigned int mode = inode->i_mode;
        vfsuid_t vfsuid;

        /*
         * Common cheap case: everybody has the requested
         * rights, and there are no ACLs to check. No need
         * to do any owner/group checks in that case.
         *
         *  - 'mask&7' is the requested permission bit set
         *  - multiplying by 0111 spreads them out to all of ugo
         *  - '& ~mode' looks for missing inode permission bits
         *  - the '!' is for "no missing permissions"
         *
         * After that, we just need to check that there are no
         * ACL's on the inode - do the 'IS_POSIXACL()' check last
         * because it will dereference the ->i_sb pointer and we
         * want to avoid that if at all possible.
         */
        if (!((mask & 7) * 0111 & ~mode)) {
                if (no_acl_inode(inode))
                        return 0;
                if (!IS_POSIXACL(inode))
                        return 0;
        }

        /* Are we the owner? If so, ACL's don't matter */
        vfsuid = i_uid_into_vfsuid(idmap, inode);
        if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
                mask &= 7;
                /* Bring the owner rwx bits down to the low three bits. */
                mode >>= 6;
                return (mask & ~mode) ? -EACCES : 0;
        }

        /*
         * Do we have ACL's?  They are only consulted when at least one
         * group-class bit is set in the mode.
         * NOTE(review): presumably because the group bits act as the ACL
         * mask, so an all-clear mask cannot grant anything -- confirm.
         */
        if (IS_POSIXACL(inode) && (mode & S_IRWXG)) {
                int error = check_acl(idmap, inode, mask);
                if (error != -EAGAIN)
                        return error;
        }

        /* Only RWX matters for group/other mode bits */
        mask &= 7;

        /*
         * Are the group permissions different from
         * the other permissions in the bits we care
         * about? Need to check group ownership if so.
         */
        if (mask & (mode ^ (mode >> 3))) {
                vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
                /* Group member: judge by the group bits, not the other bits. */
                if (vfsgid_in_group_p(vfsgid))
                        mode >>= 3;
        }

        /* Bits in 'mode' clear that we require? */
        return (mask & ~mode) ? -EACCES : 0;
}

/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check access rights for
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *                %MAY_NOT_BLOCK ...)
 *
 * Checks read/write/execute permission on a file.  The "fsuid" is what
 * counts here, so filesystem access can be given arbitrary permissions
 * without touching the "normal" uids that are used for everything else.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 if access is granted, -EACCES if it is denied and no
 * capability overrides the denial, or any other result of
 * acl_permission_check() unchanged.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
                       int mask)
{
        /* Start with the basic mode/ACL checks. */
        int err = acl_permission_check(idmap, inode, mask);

        /* Only a plain denial can be overridden by capabilities. */
        if (err != -EACCES)
                return err;

        if (S_ISDIR(inode->i_mode)) {
                /* DACs are overridable for directories */
                if (!(mask & MAY_WRITE) &&
                    capable_wrt_inode_uidgid(idmap, inode,
                                             CAP_DAC_READ_SEARCH))
                        return 0;
                if (capable_wrt_inode_uidgid(idmap, inode,
                                             CAP_DAC_OVERRIDE))
                        return 0;
                return -EACCES;
        }

        /*
         * Searching includes executable on directories, else just read.
         */
        mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
        if (mask == MAY_READ &&
            capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH))
                return 0;

        /*
         * Read/write DACs are always overridable.
         * Executable DACs are overridable when there is
         * at least one exec bit set.
         */
        if ((!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) &&
            capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE))
                return 0;

        return -EACCES;
}
EXPORT_SYMBOL(generic_permission);

/**
 * do_inode_permission - UNIX permission checking
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        inode to check permissions on
 * @mask:        right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to go straight to generic_permission() without even
 * looking at the inode->i_op values.  So inode->i_opflags carries a cache
 * flag, IOP_FASTPERM, that says "this inode has no special permission
 * function, use the fast case".
 *
 * Return: the result of the inode's ->permission() method if it has one,
 * otherwise the result of generic_permission().
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
                                      struct inode *inode, int mask)
{
        if (likely(inode->i_opflags & IOP_FASTPERM))
                return generic_permission(idmap, inode, mask);

        if (likely(inode->i_op->permission))
                return inode->i_op->permission(idmap, inode, mask);

        /* This gets set once for the inode lifetime */
        spin_lock(&inode->i_lock);
        inode->i_opflags |= IOP_FASTPERM;
        spin_unlock(&inode->i_lock);

        return generic_permission(idmap, inode, mask);
}

/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Keeps the file-system wide checks apart from the inode-specific
 * permission checks.
 *
 * Return: -EROFS when write access is requested to a regular file,
 * directory or symlink on a read-only superblock, 0 otherwise.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
        umode_t mode;

        if (likely(!(mask & MAY_WRITE)))
                return 0;

        mode = inode->i_mode;

        /* Nobody gets write access to a read-only fs. */
        if (!sb_rdonly(sb))
                return 0;
        if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))
                return -EROFS;

        return 0;
}

/**
 * inode_permission - Check for access rights to a given inode
 * @idmap:        idmap of the mount the inode was found from
 * @inode:        Inode to check permission on
 * @mask:        Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode.  We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 *
 * Return: 0 if every check passes, otherwise the error of the first check
 * that fails (superblock, immutable/unmapped-id, inode, device cgroup and
 * security module checks, in that order).
 */
int inode_permission(struct mnt_idmap *idmap,
                     struct inode *inode, int mask)
{
        int err;

        err = sb_permission(inode->i_sb, inode, mask);
        if (err)
                return err;

        if (unlikely(mask & MAY_WRITE)) {
                /*
                 * Nobody gets write access to an immutable file.
                 */
                if (IS_IMMUTABLE(inode))
                        return -EPERM;

                /*
                 * Updating mtime will likely cause i_uid and i_gid to be
                 * written back improperly if their true value is unknown
                 * to the vfs.
                 */
                if (HAS_UNMAPPED_ID(idmap, inode))
                        return -EACCES;
        }

        /* Each later check only runs if everything before it passed. */
        err = do_inode_permission(idmap, inode, mask);
        if (!err)
                err = devcgroup_inode_permission(inode, mask);
        if (!err)
                err = security_inode_permission(inode, mask);

        return err;
}
EXPORT_SYMBOL(inode_permission);

/**
 * path_get - get a reference to a path
 * @path: path to get the reference to
 *
 * Takes one reference on @path->mnt (mntget()) and one on @path->dentry
 * (dget()), in that order.
 */
void path_get(const struct path *path)
{
        struct vfsmount *mnt = path->mnt;
        struct dentry *dentry = path->dentry;

        mntget(mnt);
        dget(dentry);
}
EXPORT_SYMBOL(path_get);

/**
 * path_put - put a reference to a path
 * @path: path to put the reference to
 *
 * Drops one reference on @path->dentry (dput()) and then one on @path->mnt
 * (mntput()), i.e. in the reverse order of path_get().
 */
void path_put(const struct path *path)
{
        struct dentry *dentry = path->dentry;
        struct vfsmount *mnt = path->mnt;

        dput(dentry);
        mntput(mnt);
}
EXPORT_SYMBOL(path_put);

/* Number of symlink-stack slots embedded directly in struct nameidata. */
#define EMBEDDED_LEVELS 2
/*
 * State of one pathname walk.  Initialised by __set_nameidata() /
 * set_nameidata() and chained through current->nameidata.
 */
struct nameidata {
        struct path        path;
        struct qstr        last;
        struct path        root;
        struct inode        *inode; /* path.dentry.d_inode */
        /* flags: LOOKUP_* bits (e.g. LOOKUP_RCU); state: ND_* bits below */
        unsigned int        flags, state;
        /*
         * seq: d_seq sample checked against path.dentry when leaving RCU mode;
         * next_seq: d_seq sample for the next dentry (try_to_unlazy_next());
         * m_seq: sequence value handed to __legitimize_mnt().
         */
        unsigned        seq, next_seq, m_seq, r_seq;
        int                last_type;
        /* number of entries currently in use in stack[] */
        unsigned        depth;
        /* inherited from, and written back to, the outer nameidata (if any) */
        int                total_link_count;
        /*
         * One entry per symlink being traversed.  stack initially points at
         * internal[]; nd_alloc_stack() replaces it with a MAXSYMLINKS-sized
         * heap array that restore_nameidata() frees.
         */
        struct saved {
                struct path link;
                struct delayed_call done;
                const char *name;
                unsigned seq;
        } *stack, internal[EMBEDDED_LEVELS];
        struct filename        *name;
        /* name->name, or "" when name is NULL */
        const char *pathname;
        /* previous value of current->nameidata, restored by restore_nameidata() */
        struct nameidata *saved;
        /* d_seq sample of root.dentry taken by set_root() in RCU mode */
        unsigned        root_seq;
        int                dfd;
        /*
         * Owner and mode consulted by may_follow_link() and
         * may_create_in_sticky() as those of the parent directory.
         */
        vfsuid_t        dir_vfsuid;
        umode_t                dir_mode;
} __randomize_layout;

/* Bits for nameidata.state */
/* nd->root was supplied by the caller of set_nameidata(); never grabbed or dropped here */
#define ND_ROOT_PRESET 1
/* a reference on nd->root is held and is dropped by terminate_walk() */
#define ND_ROOT_GRABBED 2
/* set by nd_jump_root()/nd_jump_link(); makes complete_walk() consider ->d_weak_revalidate() */
#define ND_JUMPED 4

/*
 * Initialise @p for a new walk of @name relative to @dfd and make it the
 * task's current nameidata, remembering the previous one in p->saved.
 */
static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
        struct nameidata *prev = current->nameidata;

        /* Start on the embedded link stack with nothing pushed. */
        p->stack = p->internal;
        p->depth = 0;

        p->dfd = dfd;
        p->name = name;
        if (likely(name))
                p->pathname = name->name;
        else
                p->pathname = "";

        p->path.mnt = NULL;
        p->path.dentry = NULL;

        /* A nested walk continues from the outer walk's link count. */
        if (prev)
                p->total_link_count = prev->total_link_count;
        else
                p->total_link_count = 0;

        p->saved = prev;
        current->nameidata = p;
}

/*
 * Like __set_nameidata(), but also initialises p->state and, when the caller
 * provides @root, installs it as a preset root (ND_ROOT_PRESET).
 */
static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name,
                          const struct path *root)
{
        __set_nameidata(p, dfd, name);

        if (likely(!root)) {
                p->state = 0;
                return;
        }

        p->state = ND_ROOT_PRESET;
        p->root = *root;
}

/*
 * Undo __set_nameidata(): make the saved nameidata current again, hand the
 * accumulated link count back to it, and free a heap-allocated link stack.
 */
static void restore_nameidata(void)
{
        struct nameidata *cur = current->nameidata;
        struct nameidata *prev = cur->saved;

        current->nameidata = prev;
        if (prev)
                prev->total_link_count = cur->total_link_count;

        /* Only a stack obtained from nd_alloc_stack() needs freeing. */
        if (cur->stack != cur->internal)
                kfree(cur->stack);
}

/*
 * Replace the embedded link stack with a heap array of MAXSYMLINKS entries,
 * carrying over the contents of nd->internal.  The allocation is atomic when
 * in RCU mode.  Returns false if the allocation fails.
 */
static bool nd_alloc_stack(struct nameidata *nd)
{
        struct saved *stack = kmalloc_array(MAXSYMLINKS, sizeof(*stack),
                         nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL);

        if (unlikely(!stack))
                return false;

        memcpy(stack, nd->internal, sizeof(nd->internal));
        nd->stack = stack;
        return true;
}

/**
 * path_connected - Verify that a dentry is below mnt.mnt_root
 * @mnt: The mountpoint to check.
 * @dentry: The dentry to check.
 *
 * Rename can sometimes move a file or directory outside of a bind
 * mount, path_connected allows those cases to be detected.
 *
 * Return: true if @mnt is rooted at its superblock's root, or if @dentry
 * is a subdirectory of @mnt's root; false otherwise.
 */
static bool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{
        /* Mounted at the superblock root: no further check is made. */
        if (mnt->mnt_root == mnt->mnt_sb->s_root)
                return true;

        /* Bind mounts can have disconnected paths */
        return is_subdir(dentry, mnt->mnt_root);
}

/*
 * Run and clear the delayed call of every entry on the link stack, from the
 * most recently pushed one down to the first.
 */
static void drop_links(struct nameidata *nd)
{
        int remaining = nd->depth;

        while (remaining) {
                struct saved *entry = &nd->stack[--remaining];

                do_delayed_call(&entry->done);
                clear_delayed_call(&entry->done);
        }
}

/*
 * Leave rcu-walk mode: clear LOOKUP_RCU, zero the dentry sequence samples
 * and drop the RCU read lock.
 */
static void leave_rcu(struct nameidata *nd)
{
        nd->flags &= ~LOOKUP_RCU;
        nd->next_seq = 0;
        nd->seq = 0;
        rcu_read_unlock();
}

/*
 * Tear down the state of a walk.  Pending delayed calls for link bodies are
 * run first.  In rcu-walk mode nothing is referenced, so we merely leave RCU
 * mode; in ref-walk mode the references on nd->path, on every stacked link
 * and (if grabbed) on nd->root are dropped.  nd->path and nd->depth are
 * reset either way.
 */
static void terminate_walk(struct nameidata *nd)
{
        drop_links(nd);

        if (nd->flags & LOOKUP_RCU) {
                leave_rcu(nd);
        } else {
                int i;

                path_put(&nd->path);
                for (i = 0; i < nd->depth; i++)
                        path_put(&nd->stack[i].link);

                if (nd->state & ND_ROOT_GRABBED) {
                        path_put(&nd->root);
                        nd->state &= ~ND_ROOT_GRABBED;
                }
        }

        nd->depth = 0;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
}

/*
 * Try to turn an RCU-sampled @path into a properly referenced one.
 * @seq is the d_seq sample taken for path->dentry; @mseq is passed on to
 * __legitimize_mnt() for path->mnt.
 *
 * On failure the fields for which no reference could be obtained are set to
 * NULL (presumably so that the caller's unconditional path_put() only drops
 * what was really taken).
 *
 * Returns true only if both references were obtained and the dentry's d_seq
 * has not changed since @seq.
 *
 * path_put is needed afterwards regardless of success or failure
 */
static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq)
{
        int res = __legitimize_mnt(path->mnt, mseq);
        if (unlikely(res)) {
                /* NOTE(review): res > 0 looks like "no mount reference held" - confirm */
                if (res > 0)
                        path->mnt = NULL;
                path->dentry = NULL;
                return false;
        }
        /* Mount is pinned; now try to pin the dentry unless it is dead. */
        if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) {
                path->dentry = NULL;
                return false;
        }
        /* Both references are held here, even when the seq check fails. */
        return !read_seqcount_retry(&path->dentry->d_seq, seq);
}

/*
 * __legitimize_path() using the mount sequence recorded in @nd.  As with
 * the underlying helper, path_put() is needed afterwards in either case.
 */
static inline bool legitimize_path(struct nameidata *nd,
                            struct path *path, unsigned seq)
{
        unsigned mseq = nd->m_seq;

        return __legitimize_path(path, seq, mseq);
}

/*
 * Grab references for every symlink on the stack while leaving rcu-walk.
 *
 * Returns true if all nd->depth entries were legitimized.  On failure the
 * delayed calls of all stacked links are run (drop_links()) and nd->depth is
 * trimmed so that it covers only the entries legitimize_path() has touched.
 */
static bool legitimize_links(struct nameidata *nd)
{
        int i;
        /* LOOKUP_CACHED: fail right away without grabbing anything. */
        if (unlikely(nd->flags & LOOKUP_CACHED)) {
                drop_links(nd);
                nd->depth = 0;
                return false;
        }
        for (i = 0; i < nd->depth; i++) {
                struct saved *last = nd->stack + i;
                if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
                        drop_links(nd);
                        /*
                         * Entries 0..i have been through legitimize_path()
                         * (entry i possibly only partially), so those are
                         * the ones terminate_walk() must path_put().
                         */
                        nd->depth = i + 1;
                        return false;
                }
        }
        return true;
}

/*
 * Grab a reference on nd->root while leaving rcu-walk, unless there is no
 * root recorded or the root is preset.  ND_ROOT_GRABBED is set before the
 * attempt, so it is set whether or not legitimize_path() succeeds.
 */
static bool legitimize_root(struct nameidata *nd)
{
        /* Nothing to do if nd->root is zero... */
        if (!nd->root.mnt)
                return true;

        /* ... or is managed by the VFS user. */
        if (nd->state & ND_ROOT_PRESET)
                return true;

        nd->state |= ND_ROOT_GRABBED;
        return legitimize_path(nd, &nd->root, nd->root_seq);
}

/*
 * Path walking has 2 modes, rcu-walk and ref-walk (see
 * Documentation/filesystems/path-lookup.txt).  In situations when we can't
 * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
 * normal reference counts on dentries and vfsmounts to transition to ref-walk
 * mode.  Refcounts are grabbed at the last known good point before rcu-walk
 * got stuck, so ref-walk may continue from there. If this is not successful
 * (eg. a seqcount has changed), then failure is returned and it's up to caller
 * to restart the path walk from the beginning in ref-walk mode.
 */

/**
 * try_to_unlazy - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * Returns: true on success, false on failure
 *
 * try_to_unlazy attempts to legitimize the stacked links, the current
 * nd->path and nd->root for ref-walk mode.
 * Must be called from rcu-walk context.
 * RCU mode is left (LOOKUP_RCU cleared) on both success and failure.
 * Nothing should touch nameidata between try_to_unlazy() failure and
 * terminate_walk().
 */
static bool try_to_unlazy(struct nameidata *nd)
{
        /* Sampled up front for the consistency check after leaving RCU mode. */
        struct dentry *parent = nd->path.dentry;

        BUG_ON(!(nd->flags & LOOKUP_RCU));

        if (unlikely(!legitimize_links(nd)))
                goto out1;
        if (unlikely(!legitimize_path(nd, &nd->path, nd->seq)))
                goto out;
        if (unlikely(!legitimize_root(nd)))
                goto out;
        leave_rcu(nd);
        BUG_ON(nd->inode != parent->d_inode);
        return true;

out1:
        /*
         * nd->path was never legitimized: clear it so that the ref-walk
         * branch of terminate_walk() has nothing to put for it.
         */
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        /* nd->path / nd->root are left as legitimize_path() set them. */
        leave_rcu(nd);
        return false;
}

/**
 * try_to_unlazy_next - try to switch to ref-walk mode.
 * @nd: nameidata pathwalk data
 * @dentry: next dentry to step into
 * Returns: true on success, false on failure
 *
 * Similar to try_to_unlazy(), but here we have the next dentry already
 * picked by rcu-walk and want to legitimize that in addition to the current
 * nd->path and nd->root for ref-walk mode.  Must be called from rcu-walk context.
 * On success a reference on @dentry is held as well; on failure it is not.
 * Nothing should touch nameidata between try_to_unlazy_next() failure and
 * terminate_walk().
 */
static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{
        int res;
        BUG_ON(!(nd->flags & LOOKUP_RCU));

        if (unlikely(!legitimize_links(nd)))
                goto out2;
        /* Open-coded first half of __legitimize_path() for nd->path. */
        res = __legitimize_mnt(nd->path.mnt, nd->m_seq);
        if (unlikely(res)) {
                if (res > 0)
                        goto out2;
                goto out1;
        }
        if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref)))
                goto out1;

        /*
         * We need to move both the parent and the dentry from the RCU domain
         * to be properly refcounted. And the sequence number in the dentry
         * validates *both* dentry counters, since we checked the sequence
         * number of the parent after we got the child sequence number. So we
         * know the parent must still be valid if the child sequence number is
         * still valid.
         */
        if (unlikely(!lockref_get_not_dead(&dentry->d_lockref)))
                goto out;
        if (read_seqcount_retry(&dentry->d_seq, nd->next_seq))
                goto out_dput;
        /*
         * Sequence counts matched. Now make sure that the root is
         * still valid and get it if required.
         */
        if (unlikely(!legitimize_root(nd)))
                goto out_dput;
        leave_rcu(nd);
        return true;

out2:
        /* the mount of nd->path is not to be put by terminate_walk() */
        nd->path.mnt = NULL;
out1:
        /* no reference on nd->path.dentry was obtained */
        nd->path.dentry = NULL;
out:
        /* nd->path is fully referenced here, but @dentry is not */
        leave_rcu(nd);
        return false;
out_dput:
        /* a reference on @dentry was obtained: drop it after leaving RCU mode */
        leave_rcu(nd);
        dput(dentry);
        return false;
}

/*
 * Call the dentry's ->d_revalidate() method if DCACHE_OP_REVALIDATE is set
 * in its flags; dentries without that flag yield 1 without any call.
 */
static inline int d_revalidate(struct inode *dir, const struct qstr *name,
                               struct dentry *dentry, unsigned int flags)
{
        if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
                return 1;

        return dentry->d_op->d_revalidate(dir, name, dentry, flags);
}

/**
 * complete_walk - successful completion of path walk
 * @nd:  pointer nameidata
 *
 * If we had been in RCU mode, drop out of it and legitimize nd->path.
 * Revalidate the final result, unless we'd already done that during
 * the path walk or the filesystem doesn't ask for it.  Return 0 on
 * success, -error on failure.  In case of failure caller does not
 * need to drop nd->path.
 *
 * Errors produced here: -ECHILD if leaving RCU mode fails, -EXDEV if a
 * scoped lookup ended up outside nd->root, -ESTALE if ->d_weak_revalidate()
 * returns 0; a negative value from ->d_weak_revalidate() is passed through.
 */
static int complete_walk(struct nameidata *nd)
{
        /* Sampled before try_to_unlazy(); used for the weak revalidation below. */
        struct dentry *dentry = nd->path.dentry;
        int status;

        if (nd->flags & LOOKUP_RCU) {
                /*
                 * We don't want to zero nd->root for scoped-lookups or
                 * externally-managed nd->root.
                 */
                if (!(nd->state & ND_ROOT_PRESET))
                        if (!(nd->flags & LOOKUP_IS_SCOPED))
                                nd->root.mnt = NULL;
                /* Cleared first: legitimize_links() fails outright while it is set. */
                nd->flags &= ~LOOKUP_CACHED;
                if (!try_to_unlazy(nd))
                        return -ECHILD;
        }

        if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
                /*
                 * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't
                 * ever step outside the root during lookup" and should already
                 * be guaranteed by the rest of namei, we want to avoid a namei
                 * BUG resulting in userspace being given a path that was not
                 * scoped within the root at some point during the lookup.
                 *
                 * So, do a final sanity-check to make sure that in the
                 * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED)
                 * we won't silently return an fd completely outside of the
                 * requested root to userspace.
                 *
                 * Userspace could move the path outside the root after this
                 * check, but as discussed elsewhere this is not a concern (the
                 * resolved file was inside the root at some point).
                 */
                if (!path_is_under(&nd->path, &nd->root))
                        return -EXDEV;
        }

        /* Weak revalidation is only considered after a jump (ND_JUMPED). */
        if (likely(!(nd->state & ND_JUMPED)))
                return 0;

        /* ... and only if the dentry is flagged as providing the method. */
        if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE)))
                return 0;

        status = dentry->d_op->d_weak_revalidate(dentry, nd->flags);
        if (status > 0)
                return 0;

        /* 0 is turned into -ESTALE; negative values are returned as they are */
        if (!status)
                status = -ESTALE;

        return status;
}

/*
 * Fill in nd->root from the root of current->fs.
 *
 * In rcu-walk mode the root is copied without being referenced: the copy is
 * retried until fs->seq is stable, and the d_seq of the root dentry is
 * recorded in nd->root_seq for later validation.  In ref-walk mode the root
 * is obtained via get_fs_root() and marked ND_ROOT_GRABBED so that
 * terminate_walk() puts it.
 *
 * Returns 0 on success, -ENOTRECOVERABLE (with a warning) if called for a
 * scoped lookup.
 */
static int set_root(struct nameidata *nd)
{
        struct fs_struct *fs = current->fs;

        /*
         * Jumping to the real root in a scoped-lookup is a BUG in namei, but we
         * still have to ensure it doesn't happen because it will cause a breakout
         * from the dirfd.
         */
        if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED))
                return -ENOTRECOVERABLE;

        if (nd->flags & LOOKUP_RCU) {
                unsigned seq;

                /* Retry until fs->root and its d_seq were read consistently. */
                do {
                        seq = read_seqcount_begin(&fs->seq);
                        nd->root = fs->root;
                        nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
                } while (read_seqcount_retry(&fs->seq, seq));
        } else {
                get_fs_root(fs, &nd->root);
                nd->state |= ND_ROOT_GRABBED;
        }
        return 0;
}

/*
 * Move the walk position (nd->path / nd->inode) to nd->root, setting the
 * root up via set_root() first if none is recorded yet, and mark the walk
 * as ND_JUMPED.
 *
 * Returns 0 on success, -EXDEV if LOOKUP_BENEATH is set or if LOOKUP_NO_XDEV
 * is set and the jump would change mounts, -ECHILD if in rcu-walk mode and
 * the root dentry's sequence count has changed, or the error from set_root().
 */
static int nd_jump_root(struct nameidata *nd)
{
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                /* Absolute path arguments to path_init() are allowed. */
                if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt)
                        return -EXDEV;
        }
        if (!nd->root.mnt) {
                int error = set_root(nd);
                if (error)
                        return error;
        }
        if (nd->flags & LOOKUP_RCU) {
                struct dentry *d;
                /* rcu-walk: plain copy, validated against the recorded root_seq */
                nd->path = nd->root;
                d = nd->path.dentry;
                nd->inode = d->d_inode;
                nd->seq = nd->root_seq;
                if (read_seqcount_retry(&d->d_seq, nd->seq))
                        return -ECHILD;
        } else {
                /* ref-walk: swap the reference on the old position for one on root */
                path_put(&nd->path);
                nd->path = nd->root;
                path_get(&nd->path);
                nd->inode = nd->path.dentry->d_inode;
        }
        nd->state |= ND_JUMPED;
        return 0;
}

/*
 * Helper to directly jump to a known parsed path from ->get_link,
 * caller must have taken a reference to path beforehand.
 */
int nd_jump_link(const struct path *path)
{
        int error = -ELOOP;
        struct nameidata *nd = current->nameidata;

        if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS))
                goto err;

        error = -EXDEV;
        if (unlikely(nd->flags & LOOKUP_NO_XDEV)) {
                if (nd->path.mnt != path->mnt)
                        goto err;
        }
        /* Not currently safe for scoped-lookups. */
        if (unlikely(nd->flags & LOOKUP_IS_SCOPED))
                goto err;

        path_put(&nd->path);
        nd->path = *path;
        nd->inode = nd->path.dentry->d_inode;
        nd->state |= ND_JUMPED;
        return 0;

err:
        path_put(path);
        return error;
}

/*
 * Pop the topmost entry off the link stack: run its delayed call and, in
 * ref-walk mode, drop the reference held on the link's path.
 */
static inline void put_link(struct nameidata *nd)
{
        struct saved *top;

        nd->depth--;
        top = &nd->stack[nd->depth];

        do_delayed_call(&top->done);
        if (nd->flags & LOOKUP_RCU)
                return;
        path_put(&top->link);
}

/*
 * Link and creation hardening knobs.  They have static storage and no
 * initialiser, so each starts out as 0.  Readers in this file:
 * may_follow_link() (symlinks), may_linkat() (hardlinks) and
 * may_create_in_sticky() (fifos, regular).
 */
static int sysctl_protected_symlinks __read_mostly;
static int sysctl_protected_hardlinks __read_mostly;
static int sysctl_protected_fifos __read_mostly;
static int sysctl_protected_regular __read_mostly;

#ifdef CONFIG_SYSCTL
/*
 * sysctl entries for the knobs above, registered under "fs".  All are ints
 * handled by proc_dointvec_minmax, with extra1/extra2 giving the accepted
 * range: 0..1 for the symlink/hardlink knobs, 0..2 for the fifo/regular
 * knobs.
 */
static const struct ctl_table namei_sysctls[] = {
        {
                .procname        = "protected_symlinks",
                .data                = &sysctl_protected_symlinks,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname        = "protected_hardlinks",
                .data                = &sysctl_protected_hardlinks,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_ONE,
        },
        {
                .procname        = "protected_fifos",
                .data                = &sysctl_protected_fifos,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
        {
                .procname        = "protected_regular",
                .data                = &sysctl_protected_regular,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = SYSCTL_TWO,
        },
};

/* Register the table above at fs_initcall time; always returns 0. */
static int __init init_fs_namei_sysctls(void)
{
        register_sysctl_init("fs", namei_sysctls);
        return 0;
}
fs_initcall(init_fs_namei_sysctls);

#endif /* CONFIG_SYSCTL */

/**
 * may_follow_link - Check symlink following for unsafe situations
 * @nd: nameidata pathwalk data
 * @inode: Used for idmapping.
 *
 * In the case of the sysctl_protected_symlinks sysctl being enabled,
 * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is
 * in a sticky world-writable directory. This is to protect privileged
 * processes from failing races against path names that may change out
 * from under them by way of other users creating malicious symlinks.
 * It will permit symlinks to be followed only when outside a sticky
 * world-writable directory, or when the uid of the symlink and follower
 * match, or when the directory owner matches the symlink's owner.
 *
 * Returns 0 if following the symlink is allowed, -ve on error.
 */
static inline int may_follow_link(struct nameidata *nd, const struct inode *inode)
{
        struct mnt_idmap *idmap;
        vfsuid_t vfsuid;

        if (!sysctl_protected_symlinks)
                return 0;

        idmap = mnt_idmap(nd->path.mnt);
        vfsuid = i_uid_into_vfsuid(idmap, inode);
        /* Allowed if owner and follower match. */
        if (vfsuid_eq_kuid(vfsuid, current_fsuid()))
                return 0;

        /* Allowed if parent directory not sticky and world-writable. */
        if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH))
                return 0;

        /* Allowed if parent directory and link owner match. */
        if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid))
                return 0;

        if (nd->flags & LOOKUP_RCU)
                return -ECHILD;

        audit_inode(nd->name, nd->stack[0].link.dentry, 0);
        audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link");
        return -EACCES;
}

/**
 * safe_hardlink_source - Check for safe hardlink conditions
 * @idmap: idmap of the mount the inode was found from
 * @inode: the source inode to hardlink from
 *
 * Return false if at least one of the following conditions:
 *    - inode is not a regular file
 *    - inode is setuid
 *    - inode is setgid and group-exec
 *    - access failure for read and write
 *
 * Otherwise returns true.
 */
static bool safe_hardlink_source(struct mnt_idmap *idmap,
                                 struct inode *inode)
{
        const umode_t mode = inode->i_mode;
        const umode_t sgid_exec = S_ISGID | S_IXGRP;

        /* Special files should not get pinned to the filesystem. */
        if (!S_ISREG(mode))
                return false;

        /* Setuid files should not get pinned to the filesystem. */
        if (mode & S_ISUID)
                return false;

        /* Executable setgid files should not get pinned to the filesystem. */
        if ((mode & sgid_exec) == sgid_exec)
                return false;

        /* Hardlinking to unreadable or unwritable sources is dangerous. */
        return inode_permission(idmap, inode, MAY_READ | MAY_WRITE) == 0;
}

/**
 * may_linkat - Check permissions for creating a hardlink
 * @idmap: idmap of the mount the inode was found from
 * @link:  the source to hardlink from
 *
 * Block hardlink when all of:
 *  - sysctl_protected_hardlinks enabled
 *  - fsuid does not match inode
 *  - hardlink source is unsafe (see safe_hardlink_source() above)
 *  - not CAP_FOWNER in a namespace with the inode owner uid mapped
 *
 * Independently of the sysctl, a source whose uid or gid has no valid
 * mapping in @idmap is refused with -EOVERFLOW.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if successful, -ve on error.
 */
int may_linkat(struct mnt_idmap *idmap, const struct path *link)
{
        struct inode *inode = link->dentry->d_inode;

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return -EOVERFLOW;
        if (!vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        if (!sysctl_protected_hardlinks)
                return 0;

        /* A safe source may be hardlinked by anybody. */
        if (safe_hardlink_source(idmap, inode))
                return 0;

        /* Source inode owner (or CAP_FOWNER) can hardlink all they like. */
        if (inode_owner_or_capable(idmap, inode))
                return 0;

        audit_log_path_denied(AUDIT_ANOM_LINK, "linkat");
        return -EPERM;
}

/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *                          should be allowed, or not, on files that already
 *                          exist.
 * @idmap: idmap of the mount the inode was found from
 * @nd: nameidata pathwalk data
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * The directory's mode and owner are taken from @nd (dir_mode, dir_vfsuid).
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
                                struct inode *const inode)
{
        const umode_t parent_mode = nd->dir_mode;
        const vfsuid_t parent_owner = nd->dir_vfsuid;
        vfsuid_t file_owner;

        /* Only sticky directories are of interest. */
        if (likely(!(parent_mode & S_ISVTX)))
                return 0;

        /* Protection switched off for this file type. */
        if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
                return 0;
        if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
                return 0;

        /* Fine if the file belongs to the directory owner or to us. */
        file_owner = i_uid_into_vfsuid(idmap, inode);
        if (vfsuid_eq(file_owner, parent_owner) ||
            vfsuid_eq_kuid(file_owner, current_fsuid()))
                return 0;

        /* World-writable sticky directory: always refused. */
        if (likely(parent_mode & 0002)) {
                audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
                return -EACCES;
        }

        /* Group-writable only matters at protection level 2. */
        if (!(parent_mode & 0020))
                return 0;

        if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
                audit_log_path_denied(AUDIT_ANOM_CREAT,
                                      "sticky_create_fifo");
                return -EACCES;
        }

        if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
                audit_log_path_denied(AUDIT_ANOM_CREAT,
                                      "sticky_create_regular");
                return -EACCES;
        }

        return 0;
}

/*
 * follow_up - Find the mountpoint of path's vfsmount
 *
 * Given a path, find the mountpoint of its source file system.
 * Replace @path with the path of the mountpoint in the parent mount.
 * Up is towards /.
 *
 * The references held on the old @path->dentry and @path->mnt are dropped
 * and replaced by references on the new ones.
 *
 * Return 1 if we went up a level and 0 if we were already at the
 * root.
 */
int follow_up(struct path *path)
{
        struct mount *mnt = real_mount(path->mnt);
        struct mount *parent;
        struct dentry *mountpoint;

        read_seqlock_excl(&mount_lock);
        parent = mnt->mnt_parent;
        /* A mount that is its own parent has nothing above it. */
        if (parent == mnt) {
                read_sequnlock_excl(&mount_lock);
                return 0;
        }
        /* Pin the parent mount and the mountpoint while mount_lock is held. */
        mntget(&parent->mnt);
        mountpoint = dget(mnt->mnt_mountpoint);
        read_sequnlock_excl(&mount_lock);
        /* Old references are dropped only after the lock has been released. */
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
        path->mnt = &parent->mnt;
        return 1;
}
EXPORT_SYMBOL(follow_up);

/*
 * Walk up the mount tree from @m looking for the first mountpoint that is
 * not the root of the mount it sits in.  The search stops unsuccessfully
 * when @root is reached or when there are no more parents.
 *
 * On success @path is set to that mountpoint (no references are taken here)
 * and *@seqp to a d_seq sample of its dentry.
 */
static bool choose_mountpoint_rcu(struct mount *m, const struct path *root,
                                  struct path *path, unsigned *seqp)
{
        while (mnt_has_parent(m)) {
                struct dentry *mountpoint = m->mnt_mountpoint;

                m = m->mnt_parent;

                /* Do not go above the given root. */
                if (unlikely(root->dentry == mountpoint &&
                             root->mnt == &m->mnt))
                        return false;

                /* Mounted on the parent's own root: keep climbing. */
                if (mountpoint == m->mnt.mnt_root)
                        continue;

                path->mnt = &m->mnt;
                path->dentry = mountpoint;
                *seqp = read_seqcount_begin(&mountpoint->d_seq);
                return true;
        }
        return false;
}

/*
 * Reference-taking wrapper around choose_mountpoint_rcu().
 *
 * Runs the lockless search under rcu_read_lock() and repeats it until either
 * a "not found" result is confirmed against mount_lock, or a found @path has
 * been successfully legitimized (i.e. is properly referenced).
 *
 * Returns true with @path set if a mountpoint was found, false otherwise.
 */
static bool choose_mountpoint(struct mount *m, const struct path *root,
                              struct path *path)
{
        bool found;

        rcu_read_lock();
        while (1) {
                unsigned seq, mseq = read_seqbegin(&mount_lock);

                found = choose_mountpoint_rcu(m, root, path, &seq);
                if (unlikely(!found)) {
                        /* Trust "not found" only if mount_lock did not change. */
                        if (!read_seqretry(&mount_lock, mseq))
                                break;
                } else {
                        if (likely(__legitimize_path(path, seq, mseq)))
                                break;
                        /*
                         * Legitimization failed: path_put() is still needed
                         * (see __legitimize_path()), and it is done outside
                         * the RCU read-side section before retrying.
                         */
                        rcu_read_unlock();
                        path_put(path);
                        rcu_read_lock();
                }
        }
        rcu_read_unlock();
        return found;
}

/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 * - return -ELOOP once *@count (when @count is given) has reached
 *   MAXSYMLINKS; the counter is incremented on every attempt.
 * - otherwise return whatever finish_automount() makes of the result of the
 *   dentry's ->d_automount() method.
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
        const unsigned wants_mount = LOOKUP_PARENT | LOOKUP_DIRECTORY |
                                     LOOKUP_OPEN | LOOKUP_CREATE |
                                     LOOKUP_AUTOMOUNT;
        struct dentry *dentry = path->dentry;

        /* We don't want to mount if someone's just doing a stat -
         * unless they're stat'ing a directory and appended a '/' to
         * the name.
         *
         * We do, however, want to mount if someone wants to open or
         * create a file of any type under the mountpoint, wants to
         * traverse through the mountpoint or wants to open the
         * mounted directory.  Also, autofs may mark negative dentries
         * as being automount points.  These will need the attentions
         * of the daemon to instantiate them before they can be used.
         */
        if (!(lookup_flags & wants_mount) && dentry->d_inode)
                return -EISDIR;

        if (count) {
                if ((*count)++ >= MAXSYMLINKS)
                        return -ELOOP;
        }

        return finish_automount(dentry->d_op->d_automount(path), path);
}

/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 *
 * @path:         position to start from; updated in place as mounts are crossed
 * @flags:        ->d_flags of path->dentry as sampled by the caller
 * @jumped:       set to true if at least one mount was crossed
 * @count:        automount counter passed on to follow_automount() (may be NULL)
 * @lookup_flags: LOOKUP_* flags passed on to follow_automount()
 *
 * Returns 0 on success, -ENOENT if the final dentry is negative, or a
 * negative error from ->d_manage() / follow_automount() (except -EISDIR,
 * which is turned into 0).
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
                             int *count, unsigned lookup_flags)
{
        struct vfsmount *mnt = path->mnt;
        /* true once path->mnt holds a mount obtained from lookup_mnt() below */
        bool need_mntput = false;
        int ret = 0;

        while (flags & DCACHE_MANAGED_DENTRY) {
                /* Allow the filesystem to manage the transit without i_mutex
                 * being held. */
                if (flags & DCACHE_MANAGE_TRANSIT) {
                        ret = path->dentry->d_op->d_manage(path, false);
                        flags = smp_load_acquire(&path->dentry->d_flags);
                        if (ret < 0)
                                break;
                }

                if (flags & DCACHE_MOUNTED) {        // something's mounted on it..
                        struct vfsmount *mounted = lookup_mnt(path);
                        if (mounted) {                // ... in our namespace
                                dput(path->dentry);
                                // only put mounts we switched to ourselves,
                                // never the one the caller passed in
                                if (need_mntput)
                                        mntput(path->mnt);
                                path->mnt = mounted;
                                path->dentry = dget(mounted->mnt_root);
                                // here we know it's positive
                                flags = path->dentry->d_flags;
                                need_mntput = true;
                                continue;
                        }
                }

                if (!(flags & DCACHE_NEED_AUTOMOUNT))
                        break;

                // uncovered automount point
                ret = follow_automount(path, count, lookup_flags);
                flags = smp_load_acquire(&path->dentry->d_flags);
                if (ret < 0)
                        break;
        }

        // -EISDIR means "stop here and use the path as it is"
        if (ret == -EISDIR)
                ret = 0;
        // possible if you race with several mount --move
        if (need_mntput && path->mnt == mnt)
                mntput(path->mnt);
        if (!ret && unlikely(d_flags_negative(flags)))
                ret = -ENOENT;
        *jumped = need_mntput;
        return ret;
}

/*
 * Follow whatever is stacked on top of @path, if anything.
 *
 * path->dentry->d_flags is sampled once with acquire semantics.  If none of
 * the DCACHE_MANAGED_DENTRY bits are set there is nothing to follow: *jumped
 * is cleared and the result is 0, or -ENOENT when the sampled flags describe
 * a negative dentry.  Otherwise the job (and the sampled flags) is handed to
 * __traverse_mounts(), whose result is returned as is.
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
                                  int *count, unsigned lookup_flags)
{
        unsigned d_flags = smp_load_acquire(&path->dentry->d_flags);

        /* slowpath: managed dentry */
        if (unlikely(d_flags & DCACHE_MANAGED_DENTRY))
                return __traverse_mounts(path, d_flags, jumped, count,
                                         lookup_flags);

        /* fastpath */
        *jumped = false;
        return unlikely(d_flags_negative(d_flags)) ? -ENOENT : 0;
}

/*
 * Step from @path onto the root of the mount found on it by lookup_mnt().
 *
 * Returns 1 if such a mount was found; in that case dput()/mntput() are
 * called on the old dentry/vfsmount and @path is switched to the found mount
 * and a dget() of its root.  Returns 0, leaving @path untouched, if
 * lookup_mnt() found nothing.
 */
int follow_down_one(struct path *path)
{
        struct vfsmount *covering = lookup_mnt(path);

        if (!covering)
                return 0;

        dput(path->dentry);
        mntput(path->mnt);
        path->mnt = covering;
        path->dentry = dget(covering->mnt_root);
        return 1;
}
EXPORT_SYMBOL(follow_down_one);

/*
 * Follow down to the covering mount currently visible to userspace.  At each
 * point, the filesystem owning that dentry may be queried as to whether the
 * caller is permitted to proceed or not.
 *
 * The return value is whatever traverse_mounts() produced.  If the walk left
 * @path on a different vfsmount than it started on, mntput() is called on
 * the starting one.
 */
int follow_down(struct path *path, unsigned int flags)
{
        struct vfsmount *start_mnt = path->mnt;
        bool crossed;        /* demanded by traverse_mounts(), unused here */
        int err;

        err = traverse_mounts(path, &crossed, NULL, flags);
        if (start_mnt != path->mnt)
                mntput(start_mnt);
        return err;
}
EXPORT_SYMBOL(follow_down);

/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * Returns true if the walk can carry on in RCU mode with *path (which may
 * have been moved onto a mounted root, with nd->next_seq resampled to
 * match).  Returns false if the caller has to give up on doing this step
 * in RCU mode; *path and nd->next_seq may have been modified by then.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
        struct dentry *dentry = path->dentry;
        unsigned int flags = dentry->d_flags;

        /* nothing managed here - the common case */
        if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
                return true;

        /* managed dentry with LOOKUP_NO_XDEV set: not handled here */
        if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                return false;

        for (;;) {
                /*
                 * Don't forget we might have a non-mountpoint managed dentry
                 * that wants to block transit.
                 */
                if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
                        int res = dentry->d_op->d_manage(path, true);
                        /*
                         * Any non-zero answer ends the walk down; only
                         * -EISDIR is reported as success.
                         */
                        if (res)
                                return res == -EISDIR;
                        /* ->d_manage() returned 0; resample the flags */
                        flags = dentry->d_flags;
                }

                if (flags & DCACHE_MOUNTED) {
                        struct mount *mounted = __lookup_mnt(path->mnt, dentry);
                        if (mounted) {
                                /* step onto the root of what's mounted here */
                                path->mnt = &mounted->mnt;
                                dentry = path->dentry = mounted->mnt.mnt_root;
                                nd->state |= ND_JUMPED;
                                nd->next_seq = read_seqcount_begin(&dentry->d_seq);
                                flags = dentry->d_flags;
                                // makes sure that non-RCU pathwalk could reach
                                // this state.
                                if (read_seqretry(&mount_lock, nd->m_seq))
                                        return false;
                                /* the new root may itself be managed */
                                continue;
                        }
                        /*
                         * Flag set but nothing found; a mount_lock change
                         * since nd->m_seq was sampled means bail out.
                         */
                        if (read_seqretry(&mount_lock, nd->m_seq))
                                return false;
                }
                /* an automount point can't be dealt with here */
                return !(flags & DCACHE_NEED_AUTOMOUNT);
        }
}

/*
 * Deal with anything mounted on (or otherwise managed at) @dentry, starting
 * from the pair (nd->path.mnt, @dentry), and leave the resulting location
 * in *path.
 *
 * In RCU mode __follow_mount_rcu() is tried first.  If it fails, *path and
 * nd->next_seq are put back to their starting values and the walk has to
 * leave RCU mode via try_to_unlazy_next() before traverse_mounts() is used.
 *
 * Returns 0 on success, -ECHILD if try_to_unlazy_next() failed, -EXDEV if
 * traverse_mounts() reported a jump while LOOKUP_NO_XDEV is set, otherwise
 * the result of traverse_mounts().  When the traverse_mounts() path ends
 * with a non-zero result, dput() is called on path->dentry and mntput() on
 * path->mnt if that differs from nd->path.mnt.
 */
static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
                          struct path *path)
{
        bool jumped;
        int ret;

        path->mnt = nd->path.mnt;
        path->dentry = dentry;
        if (nd->flags & LOOKUP_RCU) {
                /* saved so that a failed RCU attempt can be undone */
                unsigned int seq = nd->next_seq;
                if (likely(__follow_mount_rcu(nd, path)))
                        return 0;
                // *path and nd->next_seq might've been clobbered
                path->mnt = nd->path.mnt;
                path->dentry = dentry;
                nd->next_seq = seq;
                if (!try_to_unlazy_next(nd, dentry))
                        return -ECHILD;
        }
        ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
        if (jumped) {
                /* a jump under LOOKUP_NO_XDEV overrides ret with -EXDEV */
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        ret = -EXDEV;
                else
                        nd->state |= ND_JUMPED;
        }
        if (unlikely(ret)) {
                /* failure: release what *path refers to */
                dput(path->dentry);
                if (path->mnt != nd->path.mnt)
                        mntput(path->mnt);
        }
        return ret;
}

/*
 * This looks up the name in dcache and possibly revalidates the found dentry.
 * NULL is returned if the dentry does not exist in the cache.
 */
static struct dentry *lookup_dcache(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry = d_lookup(dir, name);
        if (dentry) {
                int error = d_revalidate(dir->d_inode, name, dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error)
                                d_invalidate(dentry);
                        dput(dentry);
                        return ERR_PTR(error);
                }
        }
        return dentry;
}

/*
 * Find the child of @base called @name: first via lookup_dcache(), and if
 * that yields NULL, by allocating a new dentry and asking the directory's
 * ->lookup() method.
 *
 * Anything non-NULL from lookup_dcache() (ERR_PTR included) is returned
 * unchanged.  Otherwise returns ERR_PTR(-ENOENT) for an IS_DEADDIR()
 * directory, ERR_PTR(-ENOMEM) if d_alloc() fails, the value returned by
 * ->lookup() if that is non-NULL (the freshly allocated dentry is dput()
 * then), or else the freshly allocated dentry itself.
 */
static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name,
                                               struct dentry *base,
                                               unsigned int flags)
{
        struct inode *dir_inode;
        struct dentry *fresh, *replacement;
        struct dentry *cached = lookup_dcache(name, base, flags);

        if (cached)
                return cached;

        /* Don't create child dentry for a dead directory. */
        dir_inode = base->d_inode;
        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        fresh = d_alloc(base, name);
        if (unlikely(!fresh))
                return ERR_PTR(-ENOMEM);

        replacement = dir_inode->i_op->lookup(dir_inode, fresh, flags);
        if (likely(!replacement))
                return fresh;

        /* ->lookup() handed back something else; use that instead */
        dput(fresh);
        return replacement;
}

/*
 * Parent directory has inode locked exclusive.  This is the one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as a matter of fact, this only gets called
 * when the directory is guaranteed to have no in-lookup children
 * at all.
 *
 * Errors from lookup_one_qstr_excl_raw() are passed through.
 * Returns ERR_PTR(-ENOENT) if the name isn't found (negative dentry) and
 * LOOKUP_CREATE wasn't passed.
 * Returns ERR_PTR(-EEXIST) if the name is found (positive dentry) and
 * LOOKUP_EXCL was passed.
 * In both of those cases the dentry is dput() before returning.
 */
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
                                    struct dentry *base, unsigned int flags)
{
        struct dentry *found = lookup_one_qstr_excl_raw(name, base, flags);
        int err = 0;

        if (IS_ERR(found))
                return found;

        if (d_is_negative(found) && !(flags & LOOKUP_CREATE))
                err = -ENOENT;
        else if (d_is_positive(found) && (flags & LOOKUP_EXCL))
                err = -EEXIST;

        if (!err)
                return found;

        dput(found);
        return ERR_PTR(err);
}
EXPORT_SYMBOL(lookup_one_qstr_excl);

/**
 * lookup_fast - do fast lockless (but racy) lookup of a dentry
 * @nd: current nameidata
 *
 * Do a fast, but racy lookup in the dcache for the given dentry, and
 * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
 * found. On error, an ERR_PTR will be returned.
 *
 * If this function returns a valid dentry and the walk is no longer
 * lazy, the dentry will carry a reference that must later be put. If
 * RCU mode is still in force, then this is not the case and the dentry
 * must be legitimized before use. If this returns NULL, then the walk
 * will no longer be in RCU mode.
 *
 * The name looked up is nd->last, in the directory nd->path.dentry.
 * ERR_PTR(-ECHILD) is returned when leaving RCU mode fails or when the
 * parent's d_seq no longer matches nd->seq.  A d_revalidate() result of 0
 * gets the dentry invalidated and yields ERR_PTR(0), i.e. NULL.
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
        struct dentry *dentry, *parent = nd->path.dentry;
        int status = 1;

        /*
         * Rename seqlock is not required here because in the off chance
         * of a false negative due to a concurrent rename, the caller is
         * going to fall back to non-racy lookup.
         */
        if (nd->flags & LOOKUP_RCU) {
                dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
                if (unlikely(!dentry)) {
                        /* cache miss: NULL may only be returned outside RCU mode */
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                        return NULL;
                }

                /*
                 * This sequence count validates that the parent had no
                 * changes while we did the lookup of the dentry above.
                 */
                if (read_seqcount_retry(&parent->d_seq, nd->seq))
                        return ERR_PTR(-ECHILD);

                status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
                if (likely(status > 0))
                        return dentry;
                /*
                 * Revalidation did not succeed in RCU mode; leave RCU mode
                 * (taking @dentry along) before deciding what to do with it.
                 */
                if (!try_to_unlazy_next(nd, dentry))
                        return ERR_PTR(-ECHILD);
                if (status == -ECHILD)
                        /* we'd been told to redo it in non-rcu mode */
                        status = d_revalidate(nd->inode, &nd->last,
                                              dentry, nd->flags);
        } else {
                dentry = __d_lookup(parent, &nd->last);
                if (unlikely(!dentry))
                        return NULL;
                status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
        }
        /* reached only outside RCU mode */
        if (unlikely(status <= 0)) {
                if (!status)
                        d_invalidate(dentry);
                dput(dentry);
                return ERR_PTR(status);
        }
        return dentry;
}

/*
 * Fast lookup failed, do it the slow way.
 *
 * Looks @name up in directory @dir using d_alloc_parallel() and, for a
 * dentry that is still in-lookup, the directory's ->lookup() method.
 * lookup_slow() calls this with the directory inode locked shared.
 *
 * Returns the dentry, or an ERR_PTR: -ENOENT for an IS_DEADDIR() directory,
 * the error from d_alloc_parallel(), a negative d_revalidate() result, or
 * whatever ->lookup() returned if that was non-NULL.
 */
static struct dentry *__lookup_slow(const struct qstr *name,
                                    struct dentry *dir,
                                    unsigned int flags)
{
        struct dentry *dentry, *old;
        struct inode *inode = dir->d_inode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        /* Don't go there if it's already dead */
        if (unlikely(IS_DEADDIR(inode)))
                return ERR_PTR(-ENOENT);
again:
        dentry = d_alloc_parallel(dir, name, &wq);
        if (IS_ERR(dentry))
                return dentry;
        if (unlikely(!d_in_lookup(dentry))) {
                /* got a dentry that is not in-lookup; revalidate it */
                int error = d_revalidate(inode, name, dentry, flags);
                if (unlikely(error <= 0)) {
                        if (!error) {
                                /* stale: throw it away and start over */
                                d_invalidate(dentry);
                                dput(dentry);
                                goto again;
                        }
                        dput(dentry);
                        dentry = ERR_PTR(error);
                }
        } else {
                /* in-lookup dentry: it is up to us to call ->lookup() */
                old = inode->i_op->lookup(inode, dentry, flags);
                d_lookup_done(dentry);
                if (unlikely(old)) {
                        /* ->lookup() returned something else (or an error) */
                        dput(dentry);
                        dentry = old;
                }
        }
        return dentry;
}

/*
 * Locked wrapper around __lookup_slow(): the inode of @dir is locked shared
 * for the duration of the call and unlocked again before returning.  The
 * result of __lookup_slow() is passed through unchanged.
 */
static struct dentry *lookup_slow(const struct qstr *name,
                                  struct dentry *dir,
                                  unsigned int flags)
{
        /* sampled once so that lock and unlock hit the same inode */
        struct inode *dir_inode = dir->d_inode;
        struct dentry *found;

        inode_lock_shared(dir_inode);
        found = __lookup_slow(name, dir, flags);
        inode_unlock_shared(dir_inode);

        return found;
}

/*
 * Check MAY_EXEC permission on nd->inode, i.e. the directory about to be
 * searched.
 *
 * In RCU mode the check is first made with MAY_NOT_BLOCK.  If that fails,
 * the walk leaves RCU mode; a failure other than -ECHILD is then returned
 * as is, while -ECHILD causes the check to be repeated without
 * MAY_NOT_BLOCK.
 *
 * Returns 0 if permitted, -ECHILD if leaving RCU mode failed, otherwise the
 * error from inode_permission().
 */
static inline int may_lookup(struct mnt_idmap *idmap,
                             struct nameidata *restrict nd)
{
        int rcu_mask = 0;
        int err;

        if (nd->flags & LOOKUP_RCU)
                rcu_mask = MAY_NOT_BLOCK;

        err = inode_permission(idmap, nd->inode, rcu_mask | MAY_EXEC);
        if (likely(!err))
                return 0;

        // A failure outside of LOOKUP_RCU is final
        if (!(nd->flags & LOOKUP_RCU))
                return err;

        // Drop out of RCU mode to make sure it wasn't transient;
        // if that fails, everything gets redone non-lazy
        if (!try_to_unlazy(nd))
                return -ECHILD;

        // Anything but -ECHILD is a hard error; -ECHILD means "ask again
        // now that blocking is allowed"
        if (err == -ECHILD)
                err = inode_permission(idmap, nd->inode, MAY_EXEC);
        return err;
}

/*
 * Account for one more link being followed and make sure nd->stack can take
 * another entry.
 *
 * Returns 0 on success, -ELOOP if nd->total_link_count had already reached
 * MAXSYMLINKS (the counter is incremented either way), -ECHILD if the walk
 * could not leave RCU mode or @link could not be legitimized, and -ENOMEM
 * if nd_alloc_stack() failed outside of RCU mode.
 */
static int reserve_stack(struct nameidata *nd, struct path *link)
{
        if (unlikely(nd->total_link_count++ >= MAXSYMLINKS))
                return -ELOOP;

        /* NOTE(review): presumably "embedded array not yet full" - confirm */
        if (likely(nd->depth != EMBEDDED_LEVELS))
                return 0;
        /* stack already points somewhere other than nd->internal */
        if (likely(nd->stack != nd->internal))
                return 0;
        if (likely(nd_alloc_stack(nd)))
                return 0;

        if (nd->flags & LOOKUP_RCU) {
                // we need to grab link before we do unlazy.  And we can't skip
                // unlazy even if we fail to grab the link - cleanup needs it
                bool grabbed_link = legitimize_path(nd, link, nd->next_seq);

                if (!try_to_unlazy(nd) || !grabbed_link)
                        return -ECHILD;

                /* second attempt, now outside of RCU mode */
                if (nd_alloc_stack(nd))
                        return 0;
        }
        return -ENOMEM;
}

/*
 * Bit flags for the 'flags' argument of walk_component(), step_into() and
 * pick_link():
 *   WALK_TRAILING - step_into() follows a symlink only if LOOKUP_FOLLOW is
 *                   set; pick_link() additionally calls may_follow_link()
 *   WALK_MORE     - walk_component() does not put_link() the current link
 *   WALK_NOFOLLOW - step_into() never follows a symlink
 */
enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4};

/*
 * Push symlink @link (with inode @inode) onto nd->stack and fetch its body.
 *
 * Returns a pointer to the remaining link body to be walked (leading
 * slashes already consumed, after nd_jump_root()), NULL if there is nothing
 * left to walk (->get_link() returned NULL or the body was nothing but
 * slashes; the stack entry is dropped again with put_link()), or an ERR_PTR
 * on failure.
 *
 * If reserve_stack() fails outside of RCU mode, path_put() is called on
 * @link here.
 */
static const char *pick_link(struct nameidata *nd, struct path *link,
                     struct inode *inode, int flags)
{
        struct saved *last;
        const char *res;
        int error = reserve_stack(nd, link);

        if (unlikely(error)) {
                if (!(nd->flags & LOOKUP_RCU))
                        path_put(link);
                return ERR_PTR(error);
        }
        /* claim the next stack slot and record the link in it */
        last = nd->stack + nd->depth++;
        last->link = *link;
        clear_delayed_call(&last->done);
        last->seq = nd->next_seq;

        if (flags & WALK_TRAILING) {
                error = may_follow_link(nd, inode);
                if (unlikely(error))
                        return ERR_PTR(error);
        }

        /* symlink following disabled for this lookup or for this mount */
        if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) ||
                        unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW))
                return ERR_PTR(-ELOOP);

        if (!(nd->flags & LOOKUP_RCU)) {
                touch_atime(&last->link);
                cond_resched();
        } else if (atime_needs_update(&last->link, inode)) {
                /* atime update is only done after leaving RCU mode */
                if (!try_to_unlazy(nd))
                        return ERR_PTR(-ECHILD);
                touch_atime(&last->link);
        }

        error = security_inode_follow_link(link->dentry, inode,
                                           nd->flags & LOOKUP_RCU);
        if (unlikely(error))
                return ERR_PTR(error);

        /* use inode->i_link if set, otherwise ask ->get_link() */
        res = READ_ONCE(inode->i_link);
        if (!res) {
                const char * (*get)(struct dentry *, struct inode *,
                                struct delayed_call *);
                get = inode->i_op->get_link;
                if (nd->flags & LOOKUP_RCU) {
                        /* RCU mode: ->get_link() is called with a NULL dentry */
                        res = get(NULL, inode, &last->done);
                        /* on -ECHILD retry with the dentry once out of RCU mode */
                        if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
                                res = get(link->dentry, inode, &last->done);
                } else {
                        res = get(link->dentry, inode, &last->done);
                }
                if (!res)
                        goto all_done;
                if (IS_ERR(res))
                        return res;
        }
        if (*res == '/') {
                /* absolute link body: restart from root, skip the slashes */
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                while (unlikely(*++res == '/'))
                        ;
        }
        if (*res)
                return res;
all_done: // pure jump
        put_link(nd);
        return NULL;
}

/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * Move the walk onto @dentry: handle_mounts() is applied first, then either
 * nd->path/nd->inode/nd->seq are updated to the result (returning NULL), or,
 * if the result is a symlink that should be followed, the job is passed to
 * pick_link() and its result returned.  Errors come back as ERR_PTR.
 *
 * A symlink is not followed when WALK_NOFOLLOW is set, or when WALK_TRAILING
 * is set and LOOKUP_FOLLOW is not.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 */
static const char *step_into(struct nameidata *nd, int flags,
                     struct dentry *dentry)
{
        struct path path;
        struct inode *inode;
        int err = handle_mounts(nd, dentry, &path);

        if (err < 0)
                return ERR_PTR(err);
        inode = path.dentry->d_inode;
        if (likely(!d_is_symlink(path.dentry)) ||
           ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
           (flags & WALK_NOFOLLOW)) {
                /* not a symlink or should not follow */
                if (nd->flags & LOOKUP_RCU) {
                        /* check that the dentry hasn't changed under us */
                        if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
                                return ERR_PTR(-ECHILD);
                        if (unlikely(!inode))
                                return ERR_PTR(-ENOENT);
                } else {
                        /* drop the old location; path replaces it below */
                        dput(nd->path.dentry);
                        if (nd->path.mnt != path.mnt)
                                mntput(nd->path.mnt);
                }
                nd->path = path;
                nd->inode = inode;
                nd->seq = nd->next_seq;
                return NULL;
        }
        if (nd->flags & LOOKUP_RCU) {
                /* make sure that d_is_symlink above matches inode */
                if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
                        return ERR_PTR(-ECHILD);
        } else {
                /*
                 * Same vfsmount as nd->path: take an extra mntget() on it
                 * before the path is handed to pick_link().
                 */
                if (path.mnt == nd->path.mnt)
                        mntget(path.mnt);
        }
        return pick_link(nd, &path, inode, flags);
}

/*
 * RCU-mode handling of "..": work out which dentry ".." leads to from
 * nd->path and sample its d_seq into nd->next_seq.
 *
 * Returns the parent dentry, or nd->path.dentry itself when already at
 * nd->root (or when choose_mountpoint_rcu() finds nothing to go up to).
 * Every failure in this function is reported as ERR_PTR(-ECHILD).
 */
static struct dentry *follow_dotdot_rcu(struct nameidata *nd)
{
        struct dentry *parent, *old;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* at the root of a mount: step to its mountpoint first */
                struct path path;
                unsigned seq;
                if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
                                           &nd->root, &path, &seq))
                        goto in_root;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-ECHILD);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                nd->seq = seq;
                // makes sure that non-RCU pathwalk could reach this state
                if (read_seqretry(&mount_lock, nd->m_seq))
                        return ERR_PTR(-ECHILD);
                /* we know that mountpoint was pinned */
        }
        old = nd->path.dentry;
        parent = old->d_parent;
        nd->next_seq = read_seqcount_begin(&parent->d_seq);
        // makes sure that non-RCU pathwalk could reach this state
        if (read_seqcount_retry(&old->d_seq, nd->seq))
                return ERR_PTR(-ECHILD);
        if (unlikely(!path_connected(nd->path.mnt, parent)))
                return ERR_PTR(-ECHILD);
        return parent;
in_root:
        /* ".." at the root stays where it is, unless LOOKUP_BENEATH is set */
        if (read_seqretry(&mount_lock, nd->m_seq))
                return ERR_PTR(-ECHILD);
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-ECHILD);
        nd->next_seq = nd->seq;
        return nd->path.dentry;
}

/*
 * Non-RCU handling of "..": work out which dentry ".." leads to from
 * nd->path.
 *
 * Returns the parent obtained with dget_parent(), or a dget() of
 * nd->path.dentry when already at nd->root (or when choose_mountpoint()
 * finds nothing to go up to).  Errors: ERR_PTR(-EXDEV) when a mount boundary
 * was crossed with LOOKUP_NO_XDEV set or when at the root with
 * LOOKUP_BENEATH set; ERR_PTR(-ENOENT) when path_connected() fails.
 */
static struct dentry *follow_dotdot(struct nameidata *nd)
{
        struct dentry *parent;

        if (path_equal(&nd->path, &nd->root))
                goto in_root;
        if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) {
                /* at the root of a mount: step to its mountpoint first */
                struct path path;

                if (!choose_mountpoint(real_mount(nd->path.mnt),
                                       &nd->root, &path))
                        goto in_root;
                /* nd->path is replaced before the LOOKUP_NO_XDEV check */
                path_put(&nd->path);
                nd->path = path;
                nd->inode = path.dentry->d_inode;
                if (unlikely(nd->flags & LOOKUP_NO_XDEV))
                        return ERR_PTR(-EXDEV);
        }
        /* rare case of legitimate dget_parent()... */
        parent = dget_parent(nd->path.dentry);
        if (unlikely(!path_connected(nd->path.mnt, parent))) {
                dput(parent);
                return ERR_PTR(-ENOENT);
        }
        return parent;

in_root:
        /* ".." at the root stays where it is, unless LOOKUP_BENEATH is set */
        if (unlikely(nd->flags & LOOKUP_BENEATH))
                return ERR_PTR(-EXDEV);
        return dget(nd->path.dentry);
}

/*
 * Handle a "." or ".." component.  Anything other than LAST_DOTDOT needs no
 * work and yields NULL.
 *
 * For "..": nd->root is set up via set_root() if it has no mnt yet, the
 * parent is found by follow_dotdot_rcu() or follow_dotdot() depending on
 * LOOKUP_RCU, and the walk steps into it with WALK_NOFOLLOW.
 *
 * Returns NULL on success or an ERR_PTR; for LOOKUP_IS_SCOPED walks that
 * includes ERR_PTR(-EAGAIN) when mount_lock or rename_lock have moved on
 * from nd->m_seq / nd->r_seq.
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
        struct dentry *parent;
        const char *res;

        if (type != LAST_DOTDOT)
                return NULL;

        if (!nd->root.mnt) {
                res = ERR_PTR(set_root(nd));
                if (res)
                        return res;
        }

        parent = (nd->flags & LOOKUP_RCU) ? follow_dotdot_rcu(nd)
                                          : follow_dotdot(nd);
        if (IS_ERR(parent))
                return ERR_CAST(parent);

        res = step_into(nd, WALK_NOFOLLOW, parent);
        if (unlikely(res))
                return res;

        if (likely(!(nd->flags & LOOKUP_IS_SCOPED)))
                return NULL;

        /*
         * If there was a racing rename or mount along our path, then we
         * can't be sure that ".." hasn't jumped above nd->root (and so
         * userspace should retry or use some fallback).
         */
        smp_rmb();
        if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
                return ERR_PTR(-EAGAIN);
        if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
                return ERR_PTR(-EAGAIN);
        return NULL;
}

static const char *walk_component(struct nameidata *nd, int flags)
{
        struct dentry *dentry;
        /*
         * "." and ".." are special - ".." especially so because it has
         * to be able to know about the current root directory and
         * parent relationships.
         */
        if (unlikely(nd->last_type != LAST_NORM)) {
                if (!(flags & WALK_MORE) && nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }
        dentry = lookup_fast(nd);
        if (IS_ERR(dentry))
                return ERR_CAST(dentry);
        if (unlikely(!dentry)) {
                dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
                if (IS_ERR(dentry))
                        return ERR_CAST(dentry);
        }
        if (!(flags & WALK_MORE) && nd->depth)
                put_link(nd);
        return step_into(nd, flags, dentry);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 */
/*
 * HASH_MIX(x, y, a): mix input word @a into the two state words @x and @y.
 * @x and @y must be modifiable lvalues; both are updated in place and each
 * appears several times in the expansion, so pass plain variables only.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol64(x,12),\
        x += y,        y = rol64(y,45),\
        y *= 9                        )

/*
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
 *
 * The result is the upper 32 bits of (y ^ x * GOLDEN_RATIO_64) *
 * GOLDEN_RATIO_64, computed in unsigned long.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        unsigned long folded = y ^ (x * GOLDEN_RATIO_64);

        folded *= GOLDEN_RATIO_64;
        return folded >> 32;
}

#else        /* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit      2-bit
 * 1 round:     330.3     9201.6
 * 2 rounds:   1246.4    25475.4
 * 3 rounds:   1907.1    31295.1
 * 4 rounds:   2042.3    31718.6
 * Perfect:    2048      31744
 *            (32*64)   (32*31/2 * 64)
 *
 * HASH_MIX(x, y, a): mix input word @a into the two state words @x and @y,
 * using 32-bit rotates by 7 and 20.  @x and @y must be modifiable lvalues;
 * both are updated in place and each appears several times in the
 * expansion, so pass plain variables only.
 */
#define HASH_MIX(x, y, a)        \
        (        x ^= (a),        \
        y ^= x,        x = rol32(x, 7),\
        x += y,        y = rol32(y,20),\
        y *= 9                        )

/*
 * Fold two longs into one 32-bit hash value (32-bit variant): hash @x, XOR
 * the result into @y and hash once more.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
        /* Use arch-optimized multiply if one exists */
        unsigned long mixed = y ^ __hash_32(x);

        return __hash_32(mixed);
}

#endif

/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 *
 * @salt is used (as an integer) to seed the 'y' half of the hash state;
 * @name need not be NUL-terminated, exactly @len bytes contribute.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        unsigned long a, x = 0, y = (unsigned long)salt;

        for (;;) {
                /* nothing left: fold without loading another word */
                if (!len)
                        goto done;
                a = load_unaligned_zeropad(name);
                /* less than a full word left: handled after the loop */
                if (len < sizeof(unsigned long))
                        break;
                HASH_MIX(x, y, a);
                name += sizeof(unsigned long);
                len -= sizeof(unsigned long);
        }
        /* final partial word: keep only the @len payload bytes, no mixing */
        x ^= a & bytemask_from_count(len);
done:
        return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string.
 *
 * @salt seeds the 'y' half of the hash state.  The string is consumed one
 * unsigned long at a time until a word containing a zero byte is seen.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long a = 0, x = 0, y = (unsigned long)salt;
        unsigned long adata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        len = 0;
        /* enter the loop at the load, skipping the first HASH_MIX round */
        goto inside;

        do {
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
inside:
                a = load_unaligned_zeropad(name+len);
        } while (!has_zero(a, &adata, &constants));

        /* last word: mask off everything from the terminating NUL onwards */
        adata = prep_zero_mask(a, adata, &constants);
        mask = create_zero_mask(adata);
        x ^= a & zero_bytemask(mask);

        return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Calculate the length and hash of the path component starting at @name
 * and store them in nd->last.len and nd->last.hash.  The component ends at
 * the first '/' or NUL byte; the hash is seeded with nd->path.dentry.
 *
 * Returns a pointer to that terminating byte (name + length).
 *
 * *lastword is set to the zero-masked first word when the component ends
 * within the first word loaded, and to 0 otherwise.
 */
static inline const char *hash_name(struct nameidata *nd,
                                    const char *name,
                                    unsigned long *lastword)
{
        unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
        unsigned long adata, bdata, mask, len;
        const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

        /*
         * The first iteration is special, because it can result in
         * '.' and '..' and has no mixing other than the final fold.
         */
        a = load_unaligned_zeropad(name);
        /* b has a zero byte wherever a has a '/' */
        b = a ^ REPEAT_BYTE('/');
        if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
                adata = prep_zero_mask(a, adata, &constants);
                bdata = prep_zero_mask(b, bdata, &constants);
                mask = create_zero_mask(adata | bdata);
                a &= zero_bytemask(mask);
                *lastword = a;
                len = find_zero(mask);
                nd->last.hash = fold_hash(a, y);
                nd->last.len = len;
                return name + len;
        }

        len = 0;
        x = 0;
        /* mix in full words until one contains a NUL or a '/' */
        do {
                HASH_MIX(x, y, a);
                len += sizeof(unsigned long);
                a = load_unaligned_zeropad(name+len);
                b = a ^ REPEAT_BYTE('/');
        } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));

        /* final word: keep only the bytes before the delimiter */
        adata = prep_zero_mask(a, adata, &constants);
        bdata = prep_zero_mask(b, bdata, &constants);
        mask = create_zero_mask(adata | bdata);
        a &= zero_bytemask(mask);
        x ^= a;
        len += find_zero(mask);
        *lastword = 0;                // Multi-word components cannot be DOT or DOTDOT

        nd->last.hash = fold_hash(x, y);
        nd->last.len = len;
        return name + len;
}

/*
 * Note that the 'last' word is always zero-masked, but
 * was loaded as a possibly big-endian word.
 *
 * 0x2e is ASCII '.'.  For big-endian the byte(s) are shifted up to the
 * most significant end of the word.
 */
#ifdef __BIG_ENDIAN
  #define LAST_WORD_IS_DOT        (0x2eul << (BITS_PER_LONG-8))
  #define LAST_WORD_IS_DOTDOT        (0x2e2eul << (BITS_PER_LONG-16))
#endif

#else        /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */

/*
 * Return the hash of a string of known length (byte-at-a-time version).
 * The hash is seeded from @salt and exactly @len bytes of @name are fed in,
 * each as an unsigned char.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
        const unsigned char *cur = (const unsigned char *)name;
        const unsigned char *end = cur + len;
        unsigned long hash = init_name_hash(salt);

        for (; cur != end; cur++)
                hash = partial_name_hash(*cur, hash);

        return end_name_hash(hash);
}
EXPORT_SYMBOL(full_name_hash);

/*
 * Return the "hash_len" (hash and length) of a null-terminated string
 * (byte-at-a-time version).  The hash is seeded from @salt; the terminating
 * NUL is neither hashed nor counted.
 */
u64 hashlen_string(const void *salt, const char *name)
{
        unsigned long hash = init_name_hash(salt);
        unsigned long len, c;

        for (len = 0; (c = (unsigned char)name[len]) != 0; len++)
                hash = partial_name_hash(c, hash);

        return hashlen_create(end_name_hash(hash), len);
}
EXPORT_SYMBOL(hashlen_string);

/*
 * We know there's a real path component here of at least
 * one character.
 *
 * Hash the component starting at @name (seeded with nd->path.dentry), store
 * hash and length in nd->last, and return a pointer to the '/' or NUL that
 * ends it.  The first byte is consumed unconditionally.  *lastword receives
 * the component's bytes packed eight bits at a time, earlier bytes higher.
 */
static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword)
{
        unsigned long hash = init_name_hash(nd->path.dentry);
        unsigned long n = 0, packed = 0;
        unsigned long ch = (unsigned char)name[0];

        for (;;) {
                packed = (packed << 8) + ch;
                hash = partial_name_hash(ch, hash);
                ch = (unsigned char)name[++n];
                if (!ch || ch == '/')
                        break;
        }

        // This is reliable for DOT or DOTDOT, since the component
        // cannot contain NUL characters - top bits being zero means
        // we cannot have had any other pathnames.
        *lastword = packed;
        nd->last.len = n;
        nd->last.hash = end_name_hash(hash);
        return name + n;
}

#endif

/*
 * Default encodings, used when LAST_WORD_IS_DOT has not been defined
 * already: '.' (0x2e) and '..' (0x2e2e) in the low bits of the word.
 */
#ifndef LAST_WORD_IS_DOT
  #define LAST_WORD_IS_DOT        0x2e
  #define LAST_WORD_IS_DOTDOT        0x2e2e
#endif

/*
 * Name resolution.
 * This is the basic name resolution function, turning a pathname into
 * the final dentry. We expect 'base' to be positive and a directory.
 *
 * Returns 0 and nd will have valid dentry and mnt on success.
 * Returns error and drops reference to input namei data on failure.
 */
static int link_path_walk(const char *name, struct nameidata *nd)
{
        /* Number of nd->stack[] slots this call currently has in use. */
        int depth = 0; // depth <= nd->depth
        int err;

        nd->last_type = LAST_ROOT;
        nd->flags |= LOOKUP_PARENT;
        /* Callers may pass an ERR_PTR() in place of a path; hand the error back. */
        if (IS_ERR(name))
                return PTR_ERR(name);
        /* Skip any leading slashes. */
        while (*name=='/')
                name++;
        /*
         * Nothing left to walk: last_type stays LAST_ROOT and
         * LOOKUP_PARENT stays set.
         */
        if (!*name) {
                nd->dir_mode = 0; // short-circuit the 'hardening' idiocy
                return 0;
        }

        /* At this point we know we have a real path component. */
        for(;;) {
                struct mnt_idmap *idmap;
                const char *link;
                unsigned long lastword;

                /* Bail out unless may_lookup() lets us continue from nd->path. */
                idmap = mnt_idmap(nd->path.mnt);
                err = may_lookup(idmap, nd);
                if (err)
                        return err;

                /*
                 * Describe the component in nd->last: hash_name() fills in
                 * hash and length and returns a pointer just past it.
                 */
                nd->last.name = name;
                name = hash_name(nd, name, &lastword);

                /* Classify the component as "..", "." or an ordinary name. */
                switch(lastword) {
                case LAST_WORD_IS_DOTDOT:
                        nd->last_type = LAST_DOTDOT;
                        nd->state |= ND_JUMPED;
                        break;

                case LAST_WORD_IS_DOT:
                        nd->last_type = LAST_DOT;
                        break;

                default:
                        nd->last_type = LAST_NORM;
                        nd->state &= ~ND_JUMPED;

                        /*
                         * If the parent is flagged DCACHE_OP_HASH, let its
                         * ->d_hash() process nd->last; a negative return
                         * aborts the walk.
                         */
                        struct dentry *parent = nd->path.dentry;
                        if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
                                err = parent->d_op->d_hash(parent, &nd->last);
                                if (err < 0)
                                        return err;
                        }
                }

                /* The component ended at the terminating NUL: it was the last one. */
                if (!*name)
                        goto OK;
                /*
                 * If it wasn't NUL, we know it was '/'. Skip that
                 * slash, and continue until no more slashes.
                 */
                do {
                        name++;
                } while (unlikely(*name == '/'));
                if (unlikely(!*name)) {
OK:
                        /* pathname or trailing symlink, done */
                        if (!depth) {
                                /*
                                 * Outermost path: record the directory's
                                 * owner and mode, clear LOOKUP_PARENT and
                                 * return without walking the last component.
                                 */
                                nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode);
                                nd->dir_mode = nd->inode->i_mode;
                                nd->flags &= ~LOOKUP_PARENT;
                                return 0;
                        }
                        /* last component of nested symlink */
                        /* Pop the saved remainder of the enclosing path first. */
                        name = nd->stack[--depth].name;
                        link = walk_component(nd, 0);
                } else {
                        /* not the last component */
                        link = walk_component(nd, WALK_MORE);
                }
                if (unlikely(link)) {
                        if (IS_ERR(link))
                                return PTR_ERR(link);
                        /* a symlink to follow */
                        /* Save where to resume, then restart on the link body. */
                        nd->stack[depth++].name = name;
                        name = link;
                        continue;
                }
                /* More components follow, so what we reached must allow lookups. */
                if (unlikely(!d_can_lookup(nd->path.dentry))) {
                        /* In RCU mode try_to_unlazy() must succeed first, else -ECHILD. */
                        if (nd->flags & LOOKUP_RCU) {
                                if (!try_to_unlazy(nd))
                                        return -ECHILD;
                        }
                        return -ENOTDIR;
                }
        }
}

/*
 * Prepare @nd for walking nd->pathname with lookup @flags and choose the
 * starting point: the preset root (ND_ROOT_PRESET), the root reached via
 * nd_jump_root() for an absolute pathname, the current working directory
 * (AT_FDCWD), or the file that nd->dfd refers to.
 *
 * Returns the pathname to walk on success or an ERR_PTR() on failure.
 *
 * must be paired with terminate_walk()
 */
static const char *path_init(struct nameidata *nd, unsigned flags)
{
        int error;
        const char *s = nd->pathname;

        /* LOOKUP_CACHED requires RCU, ask caller to retry */
        if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
                return ERR_PTR(-EAGAIN);

        /* An empty pathname is never walked in RCU mode. */
        if (!*s)
                flags &= ~LOOKUP_RCU;
        if (flags & LOOKUP_RCU)
                rcu_read_lock();
        else
                nd->seq = nd->next_seq = 0;

        nd->flags = flags;
        nd->state |= ND_JUMPED;

        /* Sample the mount and rename sequence counters before picking a start. */
        nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount);
        nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount);
        smp_rmb();

        /* Root already set: start the walk at nd->root. */
        if (nd->state & ND_ROOT_PRESET) {
                struct dentry *root = nd->root.dentry;
                struct inode *inode = root->d_inode;
                /* A non-empty pathname needs a starting point that allows lookups. */
                if (*s && unlikely(!d_can_lookup(root)))
                        return ERR_PTR(-ENOTDIR);
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
                        /* RCU mode: no reference taken, remember d_seq instead. */
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                        nd->root_seq = nd->seq;
                } else {
                        path_get(&nd->path);
                }
                return s;
        }

        nd->root.mnt = NULL;

        /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
        if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
                error = nd_jump_root(nd);
                if (unlikely(error))
                        return ERR_PTR(error);
                return s;
        }

        /* Relative pathname -- get the starting-point it is relative to. */
        if (nd->dfd == AT_FDCWD) {
                if (flags & LOOKUP_RCU) {
                        struct fs_struct *fs = current->fs;
                        unsigned seq;

                        /* Retry until fs->pwd was read without fs->seq changing. */
                        do {
                                seq = read_seqcount_begin(&fs->seq);
                                nd->path = fs->pwd;
                                nd->inode = nd->path.dentry->d_inode;
                                nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                        } while (read_seqcount_retry(&fs->seq, seq));
                } else {
                        get_fs_pwd(current->fs, &nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        } else {
                /* Caller must check execute permissions on the starting path component */
                /*
                 * NOTE(review): no explicit release of f appears in this
                 * function; presumably the CLASS() guard drops it at scope
                 * exit -- confirm against the fd_raw class definition.
                 */
                CLASS(fd_raw, f)(nd->dfd);
                struct dentry *dentry;

                if (fd_empty(f))
                        return ERR_PTR(-EBADF);

                /*
                 * LOOKUP_LINKAT_EMPTY: the file must have been opened with
                 * the caller's current credentials, or the caller needs
                 * CAP_DAC_READ_SEARCH in the user namespace of the file's
                 * credentials.
                 */
                if (flags & LOOKUP_LINKAT_EMPTY) {
                        if (fd_file(f)->f_cred != current_cred() &&
                            !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
                                return ERR_PTR(-ENOENT);
                }

                dentry = fd_file(f)->f_path.dentry;

                /* A non-empty pathname needs a starting point that allows lookups. */
                if (*s && unlikely(!d_can_lookup(dentry)))
                        return ERR_PTR(-ENOTDIR);

                nd->path = fd_file(f)->f_path;
                if (flags & LOOKUP_RCU) {
                        nd->inode = nd->path.dentry->d_inode;
                        nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
                        path_get(&nd->path);
                        nd->inode = nd->path.dentry->d_inode;
                }
        }

        /* For scoped-lookups we need to set the root to the dirfd as well. */
        if (flags & LOOKUP_IS_SCOPED) {
                nd->root = nd->path;
                if (flags & LOOKUP_RCU) {
                        nd->root_seq = nd->seq;
                } else {
                        /* Non-RCU: pin the root and record that we hold a reference. */
                        path_get(&nd->root);
                        nd->state |= ND_ROOT_GRABBED;
                }
        }
        return s;
}

/*
 * Walk the final component described by nd->last.  Returns whatever
 * walk_component() returns for a WALK_TRAILING step.
 */
static inline const char *lookup_last(struct nameidata *nd)
{
        /*
         * A normal name whose component is not immediately followed by
         * NUL had something (a slash) after it: set LOOKUP_FOLLOW and
         * LOOKUP_DIRECTORY for the final step.
         */
        if (nd->last_type == LAST_NORM) {
                if (nd->last.name[nd->last.len] != '\0')
                        nd->flags |= LOOKUP_DIRECTORY | LOOKUP_FOLLOW;
        }

        return walk_component(nd, WALK_TRAILING);
}

/*
 * Step into nd->path.dentry itself with WALK_NOFOLLOW and return the
 * result of step_into() converted with PTR_ERR().
 */
static int handle_lookup_down(struct nameidata *nd)
{
        const char *res;

        /* Outside RCU mode an extra dentry reference is taken for step_into(). */
        if ((nd->flags & LOOKUP_RCU) == 0)
                dget(nd->path.dentry);

        nd->next_seq = nd->seq;
        res = step_into(nd, WALK_NOFOLLOW, nd->path.dentry);

        return PTR_ERR(res);
}

/*
 * Resolve nd->pathname completely.
 *
 * Returns 0 with the result moved into @path on success, an error
 * otherwise.  terminate_walk() is called on @nd in either case.
 */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
        const char *s = path_init(nd, flags);
        int err;

        if (!IS_ERR(s) && unlikely(flags & LOOKUP_DOWN)) {
                err = handle_lookup_down(nd);
                if (unlikely(err < 0))
                        s = ERR_PTR(err);
        }

        /*
         * Walk everything but the last component, then the last one;
         * repeat for as long as lookup_last() hands back more to walk.
         */
        for (;;) {
                err = link_path_walk(s, nd);
                if (err)
                        break;
                s = lookup_last(nd);
                if (s == NULL)
                        break;
        }
        if (err)
                goto out;

        if (unlikely(nd->flags & LOOKUP_MOUNTPOINT)) {
                err = handle_lookup_down(nd);
                nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please...
                if (err)
                        goto out;
        }

        err = complete_walk(nd);
        if (err)
                goto out;

        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) {
                err = -ENOTDIR;
                goto out;
        }

        /* Hand the result to the caller and clear nd->path. */
        *path = nd->path;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        terminate_walk(nd);
        return err;
}

/*
 * Look up @name relative to @dfd (or @root) and store the result in @path.
 * An ERR_PTR() @name is turned into its error code without any lookup.
 * Returns 0 on success or a negative error.
 */
int filename_lookup(int dfd, struct filename *name, unsigned flags,
                    struct path *path, struct path *root)
{
        struct nameidata nd;
        int rc;

        if (IS_ERR(name))
                return PTR_ERR(name);

        set_nameidata(&nd, dfd, name, root);

        /* First attempt: RCU mode. */
        rc = path_lookupat(&nd, flags | LOOKUP_RCU, path);
        /* -ECHILD: repeat the walk without LOOKUP_RCU. */
        if (unlikely(rc == -ECHILD))
                rc = path_lookupat(&nd, flags, path);
        /* -ESTALE: one more attempt with LOOKUP_REVAL. */
        if (unlikely(rc == -ESTALE))
                rc = path_lookupat(&nd, flags | LOOKUP_REVAL, path);

        if (likely(rc == 0)) {
                if (flags & LOOKUP_MOUNTPOINT)
                        audit_inode(name, path->dentry, AUDIT_INODE_NOEVAL);
                else
                        audit_inode(name, path->dentry, 0);
        }

        restore_nameidata();
        return rc;
}

/*
 * Walk nd->pathname up to, but not including, its last component.
 *
 * Returns 0 with the reached path moved into @parent on success, an error
 * otherwise.  terminate_walk() is called on @nd in either case.
 */
static int path_parentat(struct nameidata *nd, unsigned flags,
                                struct path *parent)
{
        int err;

        err = link_path_walk(path_init(nd, flags), nd);
        if (err)
                goto out;

        err = complete_walk(nd);
        if (err)
                goto out;

        /* Hand the result to the caller and clear nd->path. */
        *parent = nd->path;
        nd->path.mnt = NULL;
        nd->path.dentry = NULL;
out:
        terminate_walk(nd);
        return err;
}

/*
 * Resolve the parent of @name's last component into @parent and report
 * that last component through @last and @type.  An ERR_PTR() @name is
 * turned into its error code without any lookup.
 *
 * Note: this does not consume "name"
 */
static int __filename_parentat(int dfd, struct filename *name,
                               unsigned int flags, struct path *parent,
                               struct qstr *last, int *type,
                               const struct path *root)
{
        struct nameidata nd;
        int rc;

        if (IS_ERR(name))
                return PTR_ERR(name);

        set_nameidata(&nd, dfd, name, root);

        /* RCU mode first; retry without it on -ECHILD, with LOOKUP_REVAL on -ESTALE. */
        rc = path_parentat(&nd, flags | LOOKUP_RCU, parent);
        if (unlikely(rc == -ECHILD))
                rc = path_parentat(&nd, flags, parent);
        if (unlikely(rc == -ESTALE))
                rc = path_parentat(&nd, flags | LOOKUP_REVAL, parent);

        if (likely(rc == 0)) {
                *type = nd.last_type;
                *last = nd.last;
                audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
        }

        restore_nameidata();
        return rc;
}

/* __filename_parentat() without a preset root. */
static int filename_parentat(int dfd, struct filename *name,
                             unsigned int flags, struct path *parent,
                             struct qstr *last, int *type)
{
        int rc;

        rc = __filename_parentat(dfd, name, flags, parent, last, type, NULL);
        return rc;
}

/* does lookup, returns the object with parent locked */
static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
        struct path parent_path __free(path_put) = {};
        struct dentry *d;
        struct qstr last;
        int type, error;

        error = filename_parentat(dfd, name, 0, &parent_path, &last, &type);
        if (error)
                return ERR_PTR(error);
        if (unlikely(type != LAST_NORM))
                return ERR_PTR(-EINVAL);
        inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
        d = lookup_one_qstr_excl(&last, parent_path.dentry, 0);
        if (IS_ERR(d)) {
                inode_unlock(parent_path.dentry->d_inode);
                return d;
        }
        path->dentry = no_free_ptr(parent_path.dentry);
        path->mnt = no_free_ptr(parent_path.mnt);
        return d;
}

struct dentry *kern_path_locked_negative(const char *name, struct path *path)
{
        struct path parent_path __free(path_put) = {};
        struct filename *filename __free(putname) = getname_kernel(name);
        struct dentry *d;
        struct qstr last;
        int type, error;

        error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type);
        if (error)
                return ERR_PTR(error);
        if (unlikely(type != LAST_NORM))
                return ERR_PTR(-EINVAL);
        inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT);
        d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0);
        if (IS_ERR(d)) {
                inode_unlock(parent_path.dentry->d_inode);
                return d;
        }
        path->dentry = no_free_ptr(parent_path.dentry);
        path->mnt = no_free_ptr(parent_path.mnt);
        return d;
}

/* __kern_path_locked() for a kernel-space @name, relative to AT_FDCWD. */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
        struct filename *fname;
        struct dentry *found;

        fname = getname_kernel(name);
        found = __kern_path_locked(AT_FDCWD, fname, path);
        putname(fname);

        return found;
}

/* __kern_path_locked() for a user-space @name, relative to @dfd. */
struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
{
        struct filename *fname;
        struct dentry *found;

        fname = getname(name);
        found = __kern_path_locked(dfd, fname, path);
        putname(fname);

        return found;
}
EXPORT_SYMBOL(user_path_locked_at);

/*
 * Look up the kernel-space pathname @name relative to AT_FDCWD with the
 * given lookup @flags and store the result in @path.  Returns the value
 * of filename_lookup().
 */
int kern_path(const char *name, unsigned int flags, struct path *path)
{
        struct filename *fname;
        int err;

        fname = getname_kernel(name);
        err = filename_lookup(AT_FDCWD, fname, flags, path, NULL);
        putname(fname);

        return err;
}
EXPORT_SYMBOL(kern_path);

/**
 * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair
 * @filename: filename structure
 * @flags: lookup flags
 * @parent: pointer to struct path to fill
 * @last: last component
 * @type: type of the last component
 * @root: pointer to struct path of the base directory
 *
 * Exported wrapper around __filename_parentat(), which is always called
 * with AT_FDCWD as the directory file descriptor.
 *
 * Return: the value returned by __filename_parentat().
 */
int vfs_path_parent_lookup(struct filename *filename, unsigned int flags,
                           struct path *parent, struct qstr *last, int *type,
                           const struct path *root)
{
        int err;

        err = __filename_parentat(AT_FDCWD, filename, flags, parent, last,
                                  type, root);
        return err;
}
EXPORT_SYMBOL(vfs_path_parent_lookup);

/**
 * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair
 * @dentry:  pointer to dentry of the base directory
 * @mnt: pointer to vfs mount of the base directory
 * @name: pointer to file name
 * @flags: lookup flags
 * @path: pointer to struct path to fill
 */
int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
                    const char *name, unsigned int flags,
                    struct path *path)
{
        struct path root = { .dentry = dentry, .mnt = mnt };
        struct filename *fname = getname_kernel(name);
        int err;

        /* the first argument of filename_lookup() is ignored with root */
        err = filename_lookup(AT_FDCWD, fname, flags, path, &root);
        putname(fname);

        return err;
}
EXPORT_SYMBOL(vfs_path_lookup);

/*
 * Shared front end of the single-component lookup helpers.
 *
 * Fills @this with @name, @len and the default hash salted with @base,
 * then rejects (with -EACCES) empty names, "." and "..", and names that
 * contain '/' or NUL.  If @base is flagged DCACHE_OP_HASH its ->d_hash()
 * is called on @this.  The final result is that of inode_permission()
 * with MAY_EXEC on @base's inode.
 */
static int lookup_one_common(struct mnt_idmap *idmap,
                             const char *name, struct dentry *base, int len,
                             struct qstr *this)
{
        this->name = name;
        this->len = len;
        this->hash = full_name_hash(base, name, len);

        if (!len || is_dot_dotdot(name, len))
                return -EACCES;

        /* A single component may contain neither a separator nor a NUL. */
        for (; len--; name++) {
                unsigned int c = (unsigned char)*name;

                if (c == '\0' || c == '/')
                        return -EACCES;
        }

        /* Give the low-level filesystem the chance to use its own hash. */
        if (base->d_flags & DCACHE_OP_HASH) {
                int err = base->d_op->d_hash(base, this);

                if (err < 0)
                        return err;
        }

        return inode_permission(idmap, base->d_inode, MAY_EXEC);
}

/**
 * try_lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Look up a dentry by name in the dcache, returning NULL if it does not
 * currently exist.  The function does not try to create a dentry.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * No locks need be held - only a counted reference to @base is needed.
 *
 */
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        int err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);

        /* Name rejected or no permission: report it as an ERR_PTR(). */
        if (err)
                return ERR_PTR(err);

        /* Dcache only; nothing is created on a miss. */
        return lookup_dcache(&this, base, 0);
}
EXPORT_SYMBOL(try_lookup_one_len);

/**
 * lookup_one_len - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
{
        struct qstr this;
        struct dentry *dentry;
        int err;

        /* Complain (once) if the caller did not lock the base directory. */
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* Try the dcache first; fall back to __lookup_slow() on a miss. */
        dentry = lookup_dcache(&this, base, 0);
        if (!dentry)
                dentry = __lookup_slow(&this, base, 0);
        return dentry;
}
EXPORT_SYMBOL(lookup_one_len);

/**
 * lookup_one - filesystem helper to lookup single pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The caller must hold base->i_mutex.
 */
struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name,
                          struct dentry *base, int len)
{
        struct qstr this;
        struct dentry *dentry;
        int err;

        /* Complain (once) if the caller did not lock the base directory. */
        WARN_ON_ONCE(!inode_is_locked(base->d_inode));

        err = lookup_one_common(idmap, name, base, len, &this);
        if (err)
                return ERR_PTR(err);

        /* Try the dcache first; fall back to __lookup_slow() on a miss. */
        dentry = lookup_dcache(&this, base, 0);
        if (!dentry)
                dentry = __lookup_slow(&this, base, 0);
        return dentry;
}
EXPORT_SYMBOL(lookup_one);

/**
 * lookup_one_unlocked - filesystem helper to lookup single pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap,
                                   const char *name, struct dentry *base,
                                   int len)
{
        struct qstr this;
        struct dentry *found;
        int err = lookup_one_common(idmap, name, base, len, &this);

        if (err)
                return ERR_PTR(err);

        /* Anything the dcache returns is final; only a miss goes to lookup_slow(). */
        found = lookup_dcache(&this, base, 0);
        if (found)
                return found;

        return lookup_slow(&this, base, 0);
}
EXPORT_SYMBOL(lookup_one_unlocked);

/**
 * lookup_one_positive_unlocked - filesystem helper to lookup single
 *                                  pathname component
 * @idmap:        idmap of the mount the lookup is performed from
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * The helper should be called without i_mutex held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
                                            const char *name,
                                            struct dentry *base, int len)
{
        struct dentry *ret = lookup_one_unlocked(idmap, name, base, len);

        /* Errors are passed through untouched. */
        if (IS_ERR(ret))
                return ret;

        /* A negative dentry is dropped and reported as -ENOENT. */
        if (d_flags_negative(smp_load_acquire(&ret->d_flags))) {
                dput(ret);
                return ERR_PTR(-ENOENT);
        }

        return ret;
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);

/**
 * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
 * @name:        pathname component to lookup
 * @base:        base directory to lookup from
 * @len:        maximum length @len should be interpreted to
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code.
 *
 * Unlike lookup_one_len, it should be called without the parent
 * i_mutex held, and will take the i_mutex itself if necessary.
 */
struct dentry *lookup_one_len_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        struct dentry *found;

        /* Same as lookup_one_unlocked(), always using nop_mnt_idmap. */
        found = lookup_one_unlocked(&nop_mnt_idmap, name, base, len);
        return found;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);

/*
 * Variant of lookup_one_len_unlocked() that turns a negative result into
 * ERR_PTR(-ENOENT): the caller gets either a known positive dentry or an
 * ERR_PTR(), which is what most users want.  A pinned negative dentry
 * whose parent is not locked _can_ become positive at any time, so users
 * of lookup_one_len_unlocked() have to be very careful; pinned positives
 * have ->d_inode stable, so this helper avoids such problems.
 */
struct dentry *lookup_positive_unlocked(const char *name,
                                       struct dentry *base, int len)
{
        struct dentry *found;

        found = lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len);
        return found;
}
EXPORT_SYMBOL(lookup_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
        /* Find something mounted on "pts" in the same directory as
         * the input path.
         */
        struct dentry *parent = dget_parent(path->dentry);
        struct dentry *child;
        struct qstr this = QSTR_INIT("pts", 3);

        if (unlikely(!path_connected(path->mnt, parent))) {
                dput(parent);
                return -ENOENT;
        }
        dput(path->dentry);
        path->dentry = parent;
        child = d_hash_and_lookup(parent, &this);
        if (IS_ERR_OR_NULL(child))
                return -ENOENT;

        path->dentry = child;
        dput(parent);
        follow_down(path, 0);
        return 0;
}
#endif

/*
 * Look up the user-space pathname @name relative to @dfd with the given
 * lookup @flags and store the result in @path.  Returns the value of
 * filename_lookup().
 */
int user_path_at(int dfd, const char __user *name, unsigned flags,
                 struct path *path)
{
        struct filename *fname;
        int err;

        fname = getname_flags(name, flags);
        err = filename_lookup(dfd, fname, flags, path, NULL);
        putname(fname);

        return err;
}
EXPORT_SYMBOL(user_path_at);

/*
 * Returns 0 when the caller's fsuid owns @inode or @dir (as mapped through
 * @idmap) or when the caller has CAP_FOWNER with respect to @inode;
 * returns non-zero otherwise.
 */
int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
                   struct inode *inode)
{
        kuid_t fsuid = current_fsuid();

        /* Ownership of the victim is checked before ownership of the directory. */
        if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid) ||
            vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
                return 0;

        return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);

/*
 *        Check whether we can remove a link victim from directory dir, check
 *  whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *        a. be owner of dir, or
 *        b. be owner of victim, or
 *        c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
                      struct dentry *victim, bool isdir)
{
        struct inode *inode = d_backing_inode(victim);
        int err;

        /* There is nothing to remove. */
        if (d_is_negative(victim))
                return -ENOENT;

        /* A positive victim has an inode and @dir is its parent's inode. */
        BUG_ON(!inode);
        BUG_ON(victim->d_parent->d_inode != dir);

        /* Inode writeback is not safe when the uid or gid are invalid. */
        if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)))
                return -EOVERFLOW;
        if (!vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
                return -EOVERFLOW;

        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

        /* Write and exec permission on the directory are required. */
        err = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (err)
                return err;

        /* Every check in this group fails with -EPERM. */
        if (IS_APPEND(dir))
                return -EPERM;
        if (check_sticky(idmap, dir, inode))
                return -EPERM;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
                return -EPERM;
        if (HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;

        /* The victim's type has to match what the caller asked to remove. */
        if (!isdir) {
                if (d_is_dir(victim))
                        return -EISDIR;
        } else {
                if (!d_is_dir(victim))
                        return -ENOTDIR;
                if (IS_ROOT(victim))
                        return -EBUSY;
        }

        if (IS_DEADDIR(dir))
                return -ENOENT;
        if (victim->d_flags & DCACHE_NFSFS_RENAMED)
                return -EBUSY;

        return 0;
}

/*        Check whether we can create an object with dentry child in directory
 *  dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
                             struct inode *dir, struct dentry *child)
{
        int err = 0;

        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);

        /* The first failing check decides the error. */
        if (child->d_inode)
                err = -EEXIST;
        else if (IS_DEADDIR(dir))
                err = -ENOENT;
        else if (!fsuidgid_has_mapping(dir->i_sb, idmap))
                err = -EOVERFLOW;
        if (err)
                return err;

        /* Finally, write and exec permission on the directory are required. */
        return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}

/*
 * Lock the inodes of two distinct directories for a rename.
 *
 * Whichever directory turns out to be an ancestor of the other is locked
 * first with I_MUTEX_PARENT, the other one with I_MUTEX_PARENT2; if
 * neither is an ancestor of the other, p1 is locked first.
 *
 * Returns:
 *  - the child of p2 that is p1 or an ancestor of p1, if p2 is above p1;
 *  - the child of p1 that is p2 or an ancestor of p2, if p1 is above p2;
 *  - NULL if the two only share a common root;
 *  - ERR_PTR(-EXDEV) if no common ancestor was found; in that case nothing
 *    is locked and ->s_vfs_rename_mutex has been dropped.
 */
// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
        struct dentry *p = p1, *q = p2, *r;

        /* Climb from p1 until the parent is p2 or p is its own parent. */
        while ((r = p->d_parent) != p2 && r != p)
                p = r;
        if (r == p2) {
                // p is a child of p2 and an ancestor of p1 or p1 itself
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
                return p;
        }
        // p is the root of connected component that contains p1
        // p2 does not occur on the path from p to p1
        /* Climb from p2 until the parent is p1, the root p, or q is its own parent. */
        while ((r = q->d_parent) != p1 && r != p && r != q)
                q = r;
        if (r == p1) {
                // q is a child of p1 and an ancestor of p2 or p2 itself
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
                return q;
        } else if (likely(r == p)) {
                // both p2 and p1 are descendents of p
                inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
                return NULL;
        } else { // no common ancestor at the time we'd been called
                /* Fail without taking any inode lock; the rename mutex is released here. */
                mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
                return ERR_PTR(-EXDEV);
        }
}

/*
 * Take the locks needed to rename between directories p1 and p2, which
 * should be directories on the same fs.
 *
 * If both are the same directory only its inode is locked and NULL is
 * returned; otherwise ->s_vfs_rename_mutex is taken and the result of
 * lock_two_directories() is returned.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
        if (p1 != p2) {
                mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
                return lock_two_directories(p1, p2);
        }

        /* Same directory: a single inode lock is all that is needed. */
        inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
        return NULL;
}
EXPORT_SYMBOL(lock_rename);

/*
 * Variant of lock_rename() that is given the child c1 to be renamed rather
 * than its parent directory; c1's parent is read here.
 *
 * c1 and p2 should be on the same fs.
 *
 * If c1's parent is p2, only p2's inode is locked, ->s_vfs_rename_mutex is
 * not held on return and NULL is returned.  Otherwise
 * ->s_vfs_rename_mutex is taken and the result of
 * lock_two_directories(c1->d_parent, p2) is returned.
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
        /* Unlocked peek at the parent; rechecked below once p2 is locked. */
        if (READ_ONCE(c1->d_parent) == p2) {
                /*
                 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
                 */
                inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
                /*
                 * now that p2 is locked, nobody can move in or out of it,
                 * so the test below is safe.
                 */
                if (likely(c1->d_parent == p2))
                        return NULL;

                /*
                 * c1 got moved out of p2 while we'd been taking locks;
                 * unlock and fall back to slow case.
                 */
                inode_unlock(p2->d_inode);
        }

        mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
        /*
         * nobody can move out of any directories on this fs.
         */
        if (likely(c1->d_parent != p2))
                return lock_two_directories(c1->d_parent, p2);

        /*
         * c1 got moved into p2 while we were taking locks;
         * we need p2 locked and ->s_vfs_rename_mutex unlocked,
         * for consistency with lock_rename().
         */
        inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
        mutex_unlock(&c1->d_sb->s_vfs_rename_mutex);
        return NULL;
}
EXPORT_SYMBOL(lock_rename_child);

/*
 * Drop the locks on @p1 and @p2.  When the two dentries are the same, only
 * the single inode lock is released; otherwise both inode locks and
 * ->s_vfs_rename_mutex are released.
 */
void unlock_rename(struct dentry *p1, struct dentry *p2)
{
        inode_unlock(p1->d_inode);
        if (p1 == p2)
                return;

        inode_unlock(p2->d_inode);
        mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
}
EXPORT_SYMBOL(unlock_rename);

/**
 * vfs_prepare_mode - prepare the mode to be used for a new inode
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        parent directory of the new inode
 * @mode:        mode of the new inode
 * @mask_perms:        allowed permission by the vfs
 * @type:        type of file to be created
 *
 * Single place where the vfs restrictions on the @mode of a new object are
 * applied: setgid stripping, then umask stripping, then the vfs permission
 * mask, and finally the file type bits.
 *
 * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
 * the kernel documentation for mode_strip_umask()). Doing it after setgid
 * stripping gives the same ordering for both non-POSIX ACL and POSIX ACL
 * supporting filesystems.
 *
 * Note that it's currently valid for @type to be 0 if a directory is created.
 * Filesystems raise that flag individually and we need to check whether each
 * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
 * non-zero type.
 *
 * Returns: mode to be passed to the filesystem
 */
static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
                                       const struct inode *dir, umode_t mode,
                                       umode_t mask_perms, umode_t type)
{
        /* only non-type bits of the mask and only type bits of @type count */
        const umode_t perm_mask = mask_perms & ~S_IFMT;
        const umode_t file_type = type & S_IFMT;

        /* setgid stripping first, umask stripping second */
        mode = mode_strip_umask(dir, mode_strip_sgid(idmap, dir, mode));

        return (mode & perm_mask) | file_type;
}

/**
 * vfs_create - create new file
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the child file
 * @mode:        mode of the child file
 * @want_excl:        whether the file must not yet exist
 *
 * Create a new regular file via the ->create() method of @dir.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: 0 on success (a create notification is then sent), -EACCES if
 * @dir has no ->create() method, or the error reported by may_create(),
 * security_inode_create() or the filesystem.
 */
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
               struct dentry *dentry, umode_t mode, bool want_excl)
{
        int error = may_create(idmap, dir, dentry);

        if (error)
                return error;
        if (!dir->i_op->create)
                return -EACCES;        /* shouldn't it be ENOSYS? */

        /* permission bits limited to S_IALLUGO, type forced to S_IFREG */
        mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG);

        error = security_inode_create(dir, dentry, mode);
        if (!error)
                error = dir->i_op->create(idmap, dir, dentry, mode, want_excl);
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_create);

/*
 * Create an object for @dentry using the caller-supplied constructor @f
 * instead of the directory's ->create() method.
 *
 * Permission is checked with may_create() against nop_mnt_idmap, @mode is
 * reduced to its S_IALLUGO bits with S_IFREG as the type, and @f is called
 * as f(dentry, mode, arg) once the security hook agrees.  A create
 * notification is sent on success.
 *
 * Returns 0 on success or the first error encountered.
 */
int vfs_mkobj(struct dentry *dentry, umode_t mode,
                int (*f)(struct dentry *, umode_t, void *),
                void *arg)
{
        struct inode *dir = dentry->d_parent->d_inode;
        int error;

        error = may_create(&nop_mnt_idmap, dir, dentry);
        if (error)
                return error;

        mode = (mode & S_IALLUGO) | S_IFREG;

        error = security_inode_create(dir, dentry, mode);
        if (!error)
                error = f(dentry, mode, arg);
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_mkobj);

bool may_open_dev(const struct path *path)
{
        return !(path->mnt->mnt_flags & MNT_NODEV) &&
                !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
}

static int may_open(struct mnt_idmap *idmap, const struct path *path,
                    int acc_mode, int flag)
{
        struct dentry *dentry = path->dentry;
        struct inode *inode = dentry->d_inode;
        int error;

        if (!inode)
                return -ENOENT;

        switch (inode->i_mode & S_IFMT) {
        case S_IFLNK:
                return -ELOOP;
        case S_IFDIR:
                if (acc_mode & MAY_WRITE)
                        return -EISDIR;
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                break;
        case S_IFBLK:
        case S_IFCHR:
                if (!may_open_dev(path))
                        return -EACCES;
                fallthrough;
        case S_IFIFO:
        case S_IFSOCK:
                if (acc_mode & MAY_EXEC)
                        return -EACCES;
                flag &= ~O_TRUNC;
                break;
        case S_IFREG:
                if ((acc_mode & MAY_EXEC) && path_noexec(path))
                        return -EACCES;
                break;
        default:
                VFS_BUG_ON_INODE(1, inode);
        }

        error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
        if (error)
                return error;

        /*
         * An append-only file must be opened in append mode for writing.
         */
        if (IS_APPEND(inode)) {
                if  ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND))
                        return -EPERM;
                if (flag & O_TRUNC)
                        return -EPERM;
        }

        /* O_NOATIME can only be set by the owner or superuser */
        if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode))
                return -EPERM;

        return 0;
}

static int handle_truncate(struct mnt_idmap *idmap, struct file *filp)
{
        const struct path *path = &filp->f_path;
        struct inode *inode = path->dentry->d_inode;
        int error = get_write_access(inode);
        if (error)
                return error;

        error = security_file_truncate(filp);
        if (!error) {
                error = do_truncate(idmap, path->dentry, 0,
                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                                    filp);
        }
        put_write_access(inode);
        return error;
}

/*
 * Convert open flags to the form passed to ->atomic_open(): an access mode
 * of 3 (both O_ACCMODE bits set) is mapped down to 2, every other value is
 * returned unchanged.
 */
static inline int open_to_namei_flags(int flag)
{
        return ((flag & O_ACCMODE) == 3) ? flag - 1 : flag;
}

/*
 * Check whether an O_CREAT open is allowed to create @dentry in the
 * directory @dir with the given @mode.
 *
 * Returns 0 if creation is permitted, -EOVERFLOW if the caller's fsuid/fsgid
 * have no mapping on the filesystem, or the error from the security hooks or
 * the directory permission check.
 */
static int may_o_create(struct mnt_idmap *idmap,
                        const struct path *dir, struct dentry *dentry,
                        umode_t mode)
{
        struct dentry *parent = dir->dentry;
        int err;

        err = security_path_mknod(dir, dentry, mode, 0);
        if (err)
                return err;

        if (!fsuidgid_has_mapping(parent->d_sb, idmap))
                return -EOVERFLOW;

        /* creating an entry needs write and search permission on the parent */
        err = inode_permission(idmap, parent->d_inode, MAY_WRITE | MAY_EXEC);
        if (err)
                return err;

        return security_inode_create(parent->d_inode, dentry, mode);
}

/*
 * Attempt to atomically look up, create and open a file from a negative
 * dentry.
 *
 * Returns the dentry to continue with if successful.  If the filesystem
 * completed the open, the file will have been attached to @file by the
 * filesystem calling finish_open(), FMODE_OPENED is set in file->f_mode and
 * the returned dentry is the one the file was opened on.
 *
 * If the file was looked up only or didn't need creating, FMODE_OPENED won't
 * be set.  The caller will need to perform the open themselves.  The dentry
 * returned is the one the filesystem left in file->f_path.dentry, or the
 * original @dentry if that was left NULL; a negative result is reported as
 * -ENOENT.
 *
 * Returns an ERR_PTR() otherwise; the dentry reference has already been
 * dropped in that case.
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
                                  struct file *file,
                                  int open_flag, umode_t mode)
{
        /* sentinel used to detect that ->atomic_open() never set the dentry */
        struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
        struct inode *dir =  nd->path.dentry->d_inode;
        int error;

        if (nd->flags & LOOKUP_DIRECTORY)
                open_flag |= O_DIRECTORY;

        file->f_path.dentry = DENTRY_NOT_SET;
        file->f_path.mnt = nd->path.mnt;
        error = dir->i_op->atomic_open(dir, dentry, file,
                                       open_to_namei_flags(open_flag), mode);
        d_lookup_done(dentry);
        if (!error) {
                if (file->f_mode & FMODE_OPENED) {
                        /* opened on a different dentry: switch to that one */
                        if (unlikely(dentry != file->f_path.dentry)) {
                                dput(dentry);
                                dentry = dget(file->f_path.dentry);
                        }
                } else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
                        /* neither opened nor a dentry handed back */
                        error = -EIO;
                } else {
                        /* lookup only: use the dentry handed back, if any */
                        if (file->f_path.dentry) {
                                dput(dentry);
                                dentry = file->f_path.dentry;
                        }
                        if (unlikely(d_is_negative(dentry)))
                                error = -ENOENT;
                }
        }
        if (error) {
                dput(dentry);
                dentry = ERR_PTR(error);
        }
        return dentry;
}

/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the dentry of the last component on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the dentry returned might be negative if O_CREAT
 * hadn't been specified.
 *
 * FMODE_CREATED is set on file->f_mode when the ->create() path is taken
 * here.
 *
 * An ERR_PTR() is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
                                  const struct open_flags *op,
                                  bool got_write)
{
        struct mnt_idmap *idmap;
        struct dentry *dir = nd->path.dentry;
        struct inode *dir_inode = dir->d_inode;
        int open_flag = op->open_flag;
        struct dentry *dentry;
        int error, create_error = 0;
        umode_t mode = op->mode;
        DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

        if (unlikely(IS_DEADDIR(dir_inode)))
                return ERR_PTR(-ENOENT);

        file->f_mode &= ~FMODE_CREATED;
        dentry = d_lookup(dir, &nd->last);
        /*
         * Loop until we hold either a dentry that is still in lookup or one
         * that passed revalidation; dentries that fail revalidation are
         * invalidated, dropped and allocated afresh.
         */
        for (;;) {
                if (!dentry) {
                        dentry = d_alloc_parallel(dir, &nd->last, &wq);
                        if (IS_ERR(dentry))
                                return dentry;
                }
                if (d_in_lookup(dentry))
                        break;

                error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags);
                if (likely(error > 0))
                        break;
                if (error)
                        goto out_dput;
                d_invalidate(dentry);
                dput(dentry);
                dentry = NULL;
        }
        if (dentry->d_inode) {
                /* Cached positive dentry: will open in f_op->open */
                return dentry;
        }

        if (open_flag & O_CREAT)
                audit_inode(nd->name, dir, AUDIT_INODE_PARENT);

        /*
         * Checking write permission is tricky, because we don't know if we are
         * going to actually need it: O_CREAT opens should work as long as the
         * file exists.  But checking existence breaks atomicity.  The trick is
         * to check access and if not granted clear O_CREAT from the flags.
         *
         * Another problem is returning the "right" error value (e.g. for an
         * O_EXCL open we want to return EEXIST not EROFS).
         */
        if (unlikely(!got_write))
                open_flag &= ~O_TRUNC;
        idmap = mnt_idmap(nd->path.mnt);
        if (open_flag & O_CREAT) {
                if (open_flag & O_EXCL)
                        open_flag &= ~O_TRUNC;
                mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
                if (likely(got_write))
                        create_error = may_o_create(idmap, &nd->path,
                                                    dentry, mode);
                else
                        create_error = -EROFS;
        }
        /* not allowed to create: carry on as a plain lookup/open */
        if (create_error)
                open_flag &= ~O_CREAT;
        if (dir_inode->i_op->atomic_open) {
                dentry = atomic_open(nd, dentry, file, open_flag, mode);
                /* report why creation was refused rather than plain -ENOENT */
                if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
                        dentry = ERR_PTR(create_error);
                return dentry;
        }

        if (d_in_lookup(dentry)) {
                struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
                                                             nd->flags);
                d_lookup_done(dentry);
                if (unlikely(res)) {
                        if (IS_ERR(res)) {
                                error = PTR_ERR(res);
                                goto out_dput;
                        }
                        /* ->lookup() handed back a different dentry to use */
                        dput(dentry);
                        dentry = res;
                }
        }

        /* Negative dentry, just create the file */
        if (!dentry->d_inode && (open_flag & O_CREAT)) {
                file->f_mode |= FMODE_CREATED;
                audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
                if (!dir_inode->i_op->create) {
                        error = -EACCES;
                        goto out_dput;
                }

                error = dir_inode->i_op->create(idmap, dir_inode, dentry,
                                                mode, open_flag & O_EXCL);
                if (error)
                        goto out_dput;
        }
        /* still negative and creation had been refused: report that error */
        if (unlikely(create_error) && !dentry->d_inode) {
                error = create_error;
                goto out_dput;
        }
        return dentry;

out_dput:
        dput(dentry);
        return ERR_PTR(error);
}

static inline bool trailing_slashes(struct nameidata *nd)
{
        return (bool)nd->last.name[nd->last.len];
}

static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag)
{
        struct dentry *dentry;

        if (open_flag & O_CREAT) {
                if (trailing_slashes(nd))
                        return ERR_PTR(-EISDIR);

                /* Don't bother on an O_EXCL create */
                if (open_flag & O_EXCL)
                        return NULL;
        }

        if (trailing_slashes(nd))
                nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

        dentry = lookup_fast(nd);
        if (IS_ERR_OR_NULL(dentry))
                return dentry;

        if (open_flag & O_CREAT) {
                /* Discard negative dentries. Need inode_lock to do the create */
                if (!dentry->d_inode) {
                        if (!(nd->flags & LOOKUP_RCU))
                                dput(dentry);
                        dentry = NULL;
                }
        }
        return dentry;
}

/*
 * Handle the last component of an open: try the fast lookup first and fall
 * back to lookup_open() under the parent's inode lock.
 *
 * Returns NULL when the walk is finished (the caller then proceeds to
 * do_open()), an ERR_PTR() on failure, or a non-NULL string handed back by
 * handle_dots()/step_into() that the caller feeds to link_path_walk() again.
 */
static const char *open_last_lookups(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct dentry *dir = nd->path.dentry;
        int open_flag = op->open_flag;
        bool got_write = false;
        struct dentry *dentry;
        const char *res;

        nd->flags |= op->intent;

        /* last component is not a normal name: let handle_dots() deal with it */
        if (nd->last_type != LAST_NORM) {
                if (nd->depth)
                        put_link(nd);
                return handle_dots(nd, nd->last_type);
        }

        /* We _can_ be in RCU mode here */
        dentry = lookup_fast_for_open(nd, open_flag);
        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        if (likely(dentry))
                goto finish_lookup;

        /* slow path from here on; it cannot be done with LOOKUP_RCU set */
        if (!(open_flag & O_CREAT)) {
                if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
                        return ERR_PTR(-ECHILD);
        } else {
                if (nd->flags & LOOKUP_RCU) {
                        if (!try_to_unlazy(nd))
                                return ERR_PTR(-ECHILD);
                }
        }

        if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
                got_write = !mnt_want_write(nd->path.mnt);
                /*
                 * do _not_ fail yet - we might not need that or fail with
                 * a different error; let lookup_open() decide; we'll be
                 * dropping this one anyway.
                 */
        }
        /* exclusive lock if we may create, shared lock for a pure lookup */
        if (open_flag & O_CREAT)
                inode_lock(dir->d_inode);
        else
                inode_lock_shared(dir->d_inode);
        dentry = lookup_open(nd, file, op, got_write);
        if (!IS_ERR(dentry)) {
                if (file->f_mode & FMODE_CREATED)
                        fsnotify_create(dir->d_inode, dentry);
                if (file->f_mode & FMODE_OPENED)
                        fsnotify_open(file);
        }
        if (open_flag & O_CREAT)
                inode_unlock(dir->d_inode);
        else
                inode_unlock_shared(dir->d_inode);

        if (got_write)
                mnt_drop_write(nd->path.mnt);

        if (IS_ERR(dentry))
                return ERR_CAST(dentry);

        /* created or already opened: make nd->path point at the result */
        if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
                dput(nd->path.dentry);
                nd->path.dentry = dentry;
                return NULL;
        }

finish_lookup:
        if (nd->depth)
                put_link(nd);
        res = step_into(nd, WALK_TRAILING, dentry);
        /* another round of walking follows: drop the open/create intent */
        if (unlikely(res))
                nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
        return res;
}

/*
 * Handle the last step of open()
 *
 * Called once the walk has reached the final object in nd->path.  Performs
 * the remaining checks (O_EXCL, directory vs. O_CREAT, sticky-directory
 * rules, LOOKUP_DIRECTORY), the permission check, the actual vfs_open() if
 * the file has not been opened yet, and O_TRUNC handling.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_open(struct nameidata *nd,
                   struct file *file, const struct open_flags *op)
{
        struct mnt_idmap *idmap;
        int open_flag = op->open_flag;
        bool do_truncate;
        int acc_mode;
        int error;

        /* complete_walk() is skipped when the file was opened or created */
        if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
                error = complete_walk(nd);
                if (error)
                        return error;
        }
        if (!(file->f_mode & FMODE_CREATED))
                audit_inode(nd->name, nd->path.dentry, 0);
        idmap = mnt_idmap(nd->path.mnt);
        if (open_flag & O_CREAT) {
                /* O_EXCL demands that this very open created the file */
                if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
                        return -EEXIST;
                if (d_is_dir(nd->path.dentry))
                        return -EISDIR;
                error = may_create_in_sticky(idmap, nd,
                                             d_backing_inode(nd->path.dentry));
                if (unlikely(error))
                        return error;
        }
        if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
                return -ENOTDIR;

        do_truncate = false;
        acc_mode = op->acc_mode;
        if (file->f_mode & FMODE_CREATED) {
                /* Don't check for write permission, don't truncate */
                open_flag &= ~O_TRUNC;
                acc_mode = 0;
        } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
                /* write access to the mount is held until after the truncate */
                error = mnt_want_write(nd->path.mnt);
                if (error)
                        return error;
                do_truncate = true;
        }
        error = may_open(idmap, &nd->path, acc_mode, open_flag);
        if (!error && !(file->f_mode & FMODE_OPENED))
                error = vfs_open(&nd->path, file);
        if (!error)
                error = security_file_post_open(file, op->acc_mode);
        if (!error && do_truncate)
                error = handle_truncate(idmap, file);
        /* a positive value is not a valid result here; warn and sanitize */
        if (unlikely(error > 0)) {
                WARN_ON(1);
                error = -EINVAL;
        }
        if (do_truncate)
                mnt_drop_write(nd->path.mnt);
        return error;
}

/**
 * vfs_tmpfile - create tmpfile
 * @idmap:        idmap of the mount the inode was found from
 * @parentpath:        pointer to the path of the base directory
 * @file:        file descriptor of the new tmpfile
 * @mode:        mode of the new tmpfile
 *
 * Create a temporary file.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Unless O_EXCL is set in @file's flags the new inode is marked I_LINKABLE.
 *
 * Returns: 0 on success, -EOPNOTSUPP if the directory has no ->tmpfile()
 * method, -ENOMEM if the dentry cannot be allocated, or the error from the
 * permission checks or the filesystem.
 */
int vfs_tmpfile(struct mnt_idmap *idmap,
                const struct path *parentpath,
                struct file *file, umode_t mode)
{
        struct dentry *child;
        struct inode *dir = d_inode(parentpath->dentry);
        struct inode *inode;
        int error;
        int open_flag = file->f_flags;

        /* we want directory to be writable */
        error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (!dir->i_op->tmpfile)
                return -EOPNOTSUPP;
        child = d_alloc(parentpath->dentry, &slash_name);
        if (unlikely(!child))
                return -ENOMEM;
        file->f_path.mnt = parentpath->mnt;
        file->f_path.dentry = child;
        mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);
        error = dir->i_op->tmpfile(idmap, dir, file, mode);
        /* drop the reference obtained from d_alloc() above */
        dput(child);
        /* the open notification is sent even if ->tmpfile() returned an error */
        if (file->f_mode & FMODE_OPENED)
                fsnotify_open(file);
        if (error)
                return error;
        /* Don't check for other permissions, the inode was just created */
        error = may_open(idmap, &file->f_path, 0, file->f_flags);
        if (error)
                return error;
        inode = file_inode(file);
        if (!(open_flag & O_EXCL)) {
                spin_lock(&inode->i_lock);
                inode->i_state |= I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
        security_inode_post_create_tmpfile(idmap, inode);
        return 0;
}

/**
 * kernel_tmpfile_open - open a tmpfile for kernel internal use
 * @idmap:        idmap of the mount the inode was found from
 * @parentpath:        path of the base directory
 * @mode:        mode of the new tmpfile
 * @open_flag:        flags
 * @cred:        credentials for open
 *
 * Create and open a temporary file.  The file is not accounted in nr_files,
 * hence this is only for kernel internal use, and must not be installed into
 * file tables or such.
 *
 * Returns: the new file on success, an ERR_PTR() otherwise.
 */
struct file *kernel_tmpfile_open(struct mnt_idmap *idmap,
                                 const struct path *parentpath,
                                 umode_t mode, int open_flag,
                                 const struct cred *cred)
{
        struct file *file = alloc_empty_file_noaccount(open_flag, cred);
        int error;

        if (IS_ERR(file))
                return file;

        error = vfs_tmpfile(idmap, parentpath, file, mode);
        if (!error)
                return file;

        /* creation failed: release the file and hand back the error */
        fput(file);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(kernel_tmpfile_open);

static int do_tmpfile(struct nameidata *nd, unsigned flags,
                const struct open_flags *op,
                struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path);

        if (unlikely(error))
                return error;
        error = mnt_want_write(path.mnt);
        if (unlikely(error))
                goto out;
        error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode);
        if (error)
                goto out2;
        audit_inode(nd->name, file->f_path.dentry, 0);
out2:
        mnt_drop_write(path.mnt);
out:
        path_put(&path);
        return error;
}

static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
{
        struct path path;
        int error = path_lookupat(nd, flags, &path);
        if (!error) {
                audit_inode(nd->name, path.dentry, 0);
                error = vfs_open(&path, file);
                path_put(&path);
        }
        return error;
}

/*
 * Perform one open attempt for the name described by @nd using lookup flags
 * @flags.
 *
 * Returns the opened file on success or an ERR_PTR().  -EOPENSTALE is
 * translated to -ECHILD when LOOKUP_RCU was set in @flags and to -ESTALE
 * otherwise.
 */
static struct file *path_openat(struct nameidata *nd,
                        const struct open_flags *op, unsigned flags)
{
        struct file *file;
        int error;

        file = alloc_empty_file(op->open_flag, current_cred());
        if (IS_ERR(file))
                return file;

        if (unlikely(file->f_flags & __O_TMPFILE)) {
                error = do_tmpfile(nd, flags, op, file);
        } else if (unlikely(file->f_flags & O_PATH)) {
                error = do_o_path(nd, flags, file);
        } else {
                const char *s = path_init(nd, flags);
                /*
                 * Walk the path; open_last_lookups() returns non-NULL when
                 * there is more to walk, in which case we go around again.
                 */
                while (!(error = link_path_walk(s, nd)) &&
                       (s = open_last_lookups(nd, file, op)) != NULL)
                        ;
                if (!error)
                        error = do_open(nd, file, op);
                terminate_walk(nd);
        }
        if (likely(!error)) {
                if (likely(file->f_mode & FMODE_OPENED))
                        return file;
                /* success reported, yet the file was never opened */
                WARN_ON(1);
                error = -EINVAL;
        }
        fput_close(file);
        if (error == -EOPENSTALE) {
                if (flags & LOOKUP_RCU)
                        error = -ECHILD;
                else
                        error = -ESTALE;
        }
        return ERR_PTR(error);
}

/*
 * Open @pathname relative to @dfd as described by @op.
 *
 * Up to three attempts are made with path_openat(): first with LOOKUP_RCU
 * set, again without it if that attempt returned -ECHILD, and once more with
 * LOOKUP_REVAL if the result is -ESTALE.
 *
 * Returns the opened file or an ERR_PTR().
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
                const struct open_flags *op)
{
        const int lookup_flags = op->lookup_flags;
        struct nameidata nd;
        struct file *file;

        set_nameidata(&nd, dfd, pathname, NULL);

        file = path_openat(&nd, op, lookup_flags | LOOKUP_RCU);
        if (unlikely(file == ERR_PTR(-ECHILD)))
                file = path_openat(&nd, op, lookup_flags);
        if (unlikely(file == ERR_PTR(-ESTALE)))
                file = path_openat(&nd, op, lookup_flags | LOOKUP_REVAL);

        restore_nameidata();
        return file;
}

/*
 * Open the kernel-supplied @name with @root passed to set_nameidata() as the
 * root of the lookup.
 *
 * Fails with -ELOOP if @root itself is a symlink and the intent includes
 * LOOKUP_OPEN.  Otherwise path_openat() is tried with LOOKUP_RCU, again
 * without it on -ECHILD, and once more with LOOKUP_REVAL on -ESTALE.
 *
 * Returns the opened file or an ERR_PTR().
 */
struct file *do_file_open_root(const struct path *root,
                const char *name, const struct open_flags *op)
{
        const int lookup_flags = op->lookup_flags;
        struct filename *fname;
        struct nameidata nd;
        struct file *result;

        if ((op->intent & LOOKUP_OPEN) && d_is_symlink(root->dentry))
                return ERR_PTR(-ELOOP);

        fname = getname_kernel(name);
        if (IS_ERR(fname))
                return ERR_CAST(fname);

        set_nameidata(&nd, -1, fname, root);

        result = path_openat(&nd, op, lookup_flags | LOOKUP_RCU);
        if (unlikely(result == ERR_PTR(-ECHILD)))
                result = path_openat(&nd, op, lookup_flags);
        if (unlikely(result == ERR_PTR(-ESTALE)))
                result = path_openat(&nd, op, lookup_flags | LOOKUP_REVAL);

        restore_nameidata();
        putname(fname);
        return result;
}

/*
 * Look up the parent of @name and prepare for creating its last component.
 *
 * On success the dentry for the last component is returned with the parent
 * directory's inode locked (I_MUTEX_PARENT class), write access to the mount
 * held and @path referencing the parent; done_path_create() undoes all of
 * that.
 *
 * On failure an ERR_PTR() is returned and everything acquired here has been
 * released again.  A last component that is not a normal name yields
 * -EEXIST.
 */
static struct dentry *filename_create(int dfd, struct filename *name,
                                      struct path *path, unsigned int lookup_flags)
{
        struct dentry *dentry = ERR_PTR(-EEXIST);
        struct qstr last;
        bool want_dir = lookup_flags & LOOKUP_DIRECTORY;
        unsigned int reval_flag = lookup_flags & LOOKUP_REVAL;
        unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL;
        int type;
        int err2;
        int error;

        error = filename_parentat(dfd, name, reval_flag, path, &last, &type);
        if (error)
                return ERR_PTR(error);

        /*
         * Yucky last component or no last component at all?
         * (foo/., foo/.., /////)
         */
        if (unlikely(type != LAST_NORM))
                goto out;

        /* don't fail immediately if it's r/o, at least try to report other errors */
        err2 = mnt_want_write(path->mnt);
        /*
         * Do the final lookup.  Suppress 'create' if there is a trailing
         * '/', and a directory wasn't requested.
         */
        if (last.name[last.len] && !want_dir)
                create_flags &= ~LOOKUP_CREATE;
        inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path->dentry,
                                      reval_flag | create_flags);
        if (IS_ERR(dentry))
                goto unlock;

        /* the lookup succeeded; now report a failed mnt_want_write(), if any */
        if (unlikely(err2)) {
                error = err2;
                goto fail;
        }
        return dentry;
fail:
        dput(dentry);
        dentry = ERR_PTR(error);
unlock:
        inode_unlock(path->dentry->d_inode);
        /* write access is dropped only if it was actually obtained */
        if (!err2)
                mnt_drop_write(path->mnt);
out:
        path_put(path);
        return dentry;
}

/*
 * filename_create() for a pathname that lives in kernel memory.  The
 * temporary struct filename is released before returning; the result of
 * filename_create() is passed back unchanged.
 */
struct dentry *kern_path_create(int dfd, const char *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname_kernel(pathname);
        struct dentry *dentry;

        dentry = filename_create(dfd, name, path, lookup_flags);
        putname(name);
        return dentry;
}
EXPORT_SYMBOL(kern_path_create);

void done_path_create(struct path *path, struct dentry *dentry)
{
        if (!IS_ERR(dentry))
                dput(dentry);
        inode_unlock(path->dentry->d_inode);
        mnt_drop_write(path->mnt);
        path_put(path);
}
EXPORT_SYMBOL(done_path_create);

/*
 * filename_create() for a pathname supplied by userspace.  The temporary
 * struct filename is released before returning; the result of
 * filename_create() is passed back unchanged.
 */
inline struct dentry *user_path_create(int dfd, const char __user *pathname,
                                struct path *path, unsigned int lookup_flags)
{
        struct filename *name = getname(pathname);
        struct dentry *dentry;

        dentry = filename_create(dfd, name, path, lookup_flags);
        putname(name);
        return dentry;
}
EXPORT_SYMBOL(user_path_create);

/**
 * vfs_mknod - create device node or file
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the child device node
 * @mode:        mode of the child device node
 * @dev:        device number of device to create
 *
 * Create a device node or file via the ->mknod() method of @dir.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns: 0 on success (a create notification is then sent), -EPERM if the
 * caller lacks CAP_MKNOD for a non-whiteout device node or @dir has no
 * ->mknod() method, or the error from one of the checks or the filesystem.
 */
int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
              struct dentry *dentry, umode_t mode, dev_t dev)
{
        const bool is_dev_node = S_ISCHR(mode) || S_ISBLK(mode);
        const bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV;
        int error;

        error = may_create(idmap, dir, dentry);
        if (error)
                return error;

        /* device nodes other than whiteouts require CAP_MKNOD */
        if (is_dev_node && !is_whiteout && !capable(CAP_MKNOD))
                return -EPERM;

        if (!dir->i_op->mknod)
                return -EPERM;

        mode = vfs_prepare_mode(idmap, dir, mode, mode, mode);

        error = devcgroup_inode_mknod(mode, dev);
        if (!error)
                error = security_inode_mknod(dir, dentry, mode, dev);
        if (!error)
                error = dir->i_op->mknod(idmap, dir, dentry, mode, dev);
        if (!error)
                fsnotify_create(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_mknod);

/*
 * Check the file type requested from mknod.
 *
 * Returns 0 for regular files, character and block devices, FIFOs, sockets
 * and a zero type; -EPERM for directories; -EINVAL for anything else.
 */
static int may_mknod(umode_t mode)
{
        switch (mode & S_IFMT) {
        case S_IFDIR:
                return -EPERM;
        case 0: /* zero mode translates to S_IFREG */
        case S_IFREG:
        case S_IFIFO:
        case S_IFSOCK:
        case S_IFCHR:
        case S_IFBLK:
                return 0;
        }

        return -EINVAL;
}

/*
 * Shared implementation behind the mknod and mknodat syscalls.
 *
 * Validates the requested file type, looks up/creates the target dentry via
 * filename_create(), runs the path-based security hook and then dispatches
 * to vfs_create() (regular files) or vfs_mknod() (device nodes, FIFOs,
 * sockets).  The whole sequence is repeated with LOOKUP_REVAL when
 * retry_estale() requests it.  @name is released with putname() on every
 * exit path.
 */
static int do_mknodat(int dfd, struct filename *name, umode_t mode,
                unsigned int dev)
{
        const unsigned int type = mode & S_IFMT;
        unsigned int lookup_flags = 0;
        struct mnt_idmap *idmap;
        struct dentry *dentry;
        struct path path;
        int error;

        error = may_mknod(mode);
        if (error)
                goto out_putname;
retry:
        dentry = filename_create(dfd, name, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out_putname;
        }

        error = security_path_mknod(&path, dentry,
                        mode_strip_umask(path.dentry->d_inode, mode), dev);
        if (error)
                goto out_done;

        idmap = mnt_idmap(path.mnt);
        if (type == 0 || type == S_IFREG) {
                error = vfs_create(idmap, path.dentry->d_inode,
                                   dentry, mode, true);
                if (!error)
                        security_path_post_mknod(idmap, dentry);
        } else if (type == S_IFCHR || type == S_IFBLK) {
                /* Only device nodes carry a meaningful device number. */
                error = vfs_mknod(idmap, path.dentry->d_inode,
                                  dentry, mode, new_decode_dev(dev));
        } else if (type == S_IFIFO || type == S_IFSOCK) {
                error = vfs_mknod(idmap, path.dentry->d_inode,
                                  dentry, mode, 0);
        }
out_done:
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out_putname:
        putname(name);
        return error;
}

/*
 * mknodat syscall entry point: import the user-space path with getname() and
 * hand it to do_mknodat(), which releases it.
 */
SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
                unsigned int, dev)
{
        struct filename *name = getname(filename);

        return do_mknodat(dfd, name, mode, dev);
}

/*
 * mknod syscall entry point: same as mknodat with AT_FDCWD as the directory
 * file descriptor.
 */
SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev)
{
        struct filename *name = getname(filename);

        return do_mknodat(AT_FDCWD, name, mode, dev);
}

/**
 * vfs_mkdir - create directory returning correct dentry if possible
 * @idmap:      idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:     dentry of the child directory
 * @mode:       mode of the child directory
 *
 * Create a directory.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * In the event that the filesystem does not use the *@dentry but leaves it
 * negative or unhashes it and possibly splices a different one returning it,
 * the original dentry is dput() and the alternate is returned.
 *
 * In case of an error the dentry is dput() and an ERR_PTR() is returned.
 *
 * Return: the dentry of the new directory, or an ERR_PTR() on failure.
 */
struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                         struct dentry *dentry, umode_t mode)
{
        unsigned max_links = dir->i_sb->s_max_links;
        struct dentry *alt;
        int error;

        error = may_create(idmap, dir, dentry);
        if (error)
                goto err;

        if (!dir->i_op->mkdir) {
                error = -EPERM;
                goto err;
        }

        mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0);
        error = security_inode_mkdir(dir, dentry, mode);
        if (error)
                goto err;

        /* A zero s_max_links means the superblock imposes no limit. */
        if (max_links && dir->i_nlink >= max_links) {
                error = -EMLINK;
                goto err;
        }

        alt = dir->i_op->mkdir(idmap, dir, dentry, mode);
        if (IS_ERR(alt)) {
                error = PTR_ERR(alt);
                goto err;
        }
        if (alt) {
                /* The filesystem supplied a different dentry; switch to it. */
                dput(dentry);
                dentry = alt;
        }
        fsnotify_mkdir(dir, dentry);
        return dentry;

err:
        dput(dentry);
        return ERR_PTR(error);
}
EXPORT_SYMBOL(vfs_mkdir);

/*
 * Shared implementation behind the mkdir and mkdirat syscalls.
 *
 * Creates the target dentry with filename_create(), runs the path-based
 * security hook and calls vfs_mkdir().  The sequence is repeated with
 * LOOKUP_REVAL when retry_estale() requests it.  @name is released with
 * putname() on every exit path.
 */
int do_mkdirat(int dfd, struct filename *name, umode_t mode)
{
        unsigned int lookup_flags = LOOKUP_DIRECTORY;
        struct dentry *dentry;
        struct path path;
        int error;

retry:
        dentry = filename_create(dfd, name, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out_putname;
        }

        error = security_path_mkdir(&path, dentry,
                        mode_strip_umask(path.dentry->d_inode, mode));
        if (error)
                goto out_done;

        /* vfs_mkdir() may hand back a different dentry or an ERR_PTR(). */
        dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode,
                           dentry, mode);
        if (IS_ERR(dentry))
                error = PTR_ERR(dentry);
out_done:
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out_putname:
        putname(name);
        return error;
}

/*
 * mkdirat syscall entry point: import the user-space path with getname() and
 * hand it to do_mkdirat(), which releases it.
 */
SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
{
        struct filename *name = getname(pathname);

        return do_mkdirat(dfd, name, mode);
}

/*
 * mkdir syscall entry point: same as mkdirat with AT_FDCWD as the directory
 * file descriptor.
 */
SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
{
        struct filename *name = getname(pathname);

        return do_mkdirat(AT_FDCWD, name, mode);
}

/**
 * vfs_rmdir - remove directory
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:        dentry of the child directory
 *
 * Remove a directory.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
                     struct dentry *dentry)
{
        int error = may_delete(idmap, dir, dentry, 1);

        if (error)
                return error;

        if (!dir->i_op->rmdir)
                return -EPERM;

        dget(dentry);
        inode_lock(dentry->d_inode);

        error = -EBUSY;
        if (is_local_mountpoint(dentry) ||
            (dentry->d_inode->i_flags & S_KERNEL_FILE))
                goto out;

        error = security_inode_rmdir(dir, dentry);
        if (error)
                goto out;

        error = dir->i_op->rmdir(dir, dentry);
        if (error)
                goto out;

        shrink_dcache_parent(dentry);
        dentry->d_inode->i_flags |= S_DEAD;
        dont_mount(dentry);
        detach_mounts(dentry);

out:
        inode_unlock(dentry->d_inode);
        dput(dentry);
        if (!error)
                d_delete_notify(dir, dentry);
        return error;
}
EXPORT_SYMBOL(vfs_rmdir);

int do_rmdir(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        unsigned int lookup_flags = 0;
retry:
        error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;

        switch (type) {
        case LAST_DOTDOT:
                error = -ENOTEMPTY;
                goto exit2;
        case LAST_DOT:
                error = -EINVAL;
                goto exit2;
        case LAST_ROOT:
                error = -EBUSY;
                goto exit2;
        }

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit2;

        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (IS_ERR(dentry))
                goto exit3;
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit4;
        error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry);
exit4:
        dput(dentry);
exit3:
        inode_unlock(path.dentry->d_inode);
        mnt_drop_write(path.mnt);
exit2:
        path_put(&path);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
exit1:
        putname(name);
        return error;
}

/*
 * rmdir syscall entry point: import the user-space path with getname() and
 * hand it to do_rmdir() with AT_FDCWD; do_rmdir() releases the name.
 */
SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
{
        struct filename *name = getname(pathname);

        return do_rmdir(AT_FDCWD, name);
}

/**
 * vfs_unlink - unlink a filesystem object
 * @idmap:        idmap of the mount the inode was found from
 * @dir:        parent directory
 * @dentry:        victim
 * @delegated_inode: returns victim inode, if the inode is delegated.
 *
 * The caller must hold dir->i_mutex.
 *
 * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and
 * return a reference to the inode in delegated_inode.  The caller
 * should then break the delegation on that inode and retry.  Because
 * breaking a delegation may take a long time, the caller should drop
 * dir->i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
               struct dentry *dentry, struct inode **delegated_inode)
{
        struct inode *target = dentry->d_inode;
        int error = may_delete(idmap, dir, dentry, 0);

        if (error)
                return error;

        if (!dir->i_op->unlink)
                return -EPERM;

        inode_lock(target);
        if (IS_SWAPFILE(target))
                error = -EPERM;
        else if (is_local_mountpoint(dentry))
                error = -EBUSY;
        else {
                error = security_inode_unlink(dir, dentry);
                if (!error) {
                        error = try_break_deleg(target, delegated_inode);
                        if (error)
                                goto out;
                        error = dir->i_op->unlink(dir, dentry);
                        if (!error) {
                                dont_mount(dentry);
                                detach_mounts(dentry);
                        }
                }
        }
out:
        inode_unlock(target);

        /* We don't d_delete() NFS sillyrenamed files--they still exist. */
        if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) {
                fsnotify_unlink(dir, dentry);
        } else if (!error) {
                fsnotify_link_count(target);
                d_delete_notify(dir, dentry);
        }

        return error;
}
EXPORT_SYMBOL(vfs_unlink);

/*
 * Shared implementation behind unlink and unlinkat (without AT_REMOVEDIR).
 * Resolves the parent of @name starting from @dfd, looks up the last
 * component under the parent's inode lock and removes it with vfs_unlink().
 * @name is released with putname() on every exit path.
 *
 * Make sure that the actual truncation of the file will occur outside its
 * directory's i_mutex.  Truncate can take a long time if there is a lot of
 * writeout happening, and we don't want to prevent access to the directory
 * while waiting on the I/O.
 */
int do_unlinkat(int dfd, struct filename *name)
{
        int error;
        struct dentry *dentry;
        struct path path;
        struct qstr last;
        int type;
        struct inode *inode = NULL;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0;
retry:
        error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type);
        if (error)
                goto exit1;

        /* Anything other than a normal last component is refused. */
        error = -EISDIR;
        if (type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(path.mnt);
        if (error)
                goto exit2;
retry_deleg:
        inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
        dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags);
        error = PTR_ERR(dentry);
        if (!IS_ERR(dentry)) {

                /* Why not before? Because we want correct error value */
                if (last.name[last.len])
                        goto slashes;
                /*
                 * Take an extra inode reference so that the final iput()
                 * below runs only after the parent directory is unlocked.
                 */
                inode = dentry->d_inode;
                ihold(inode);
                error = security_path_unlink(&path, dentry);
                if (error)
                        goto exit3;
                error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode,
                                   dentry, &delegated_inode);
exit3:
                dput(dentry);
        }
        inode_unlock(path.dentry->d_inode);
        if (inode)
                iput(inode);        /* truncate the inode here */
        inode = NULL;
        /*
         * vfs_unlink() handed back an inode with a delegation: wait for the
         * break and, if that succeeded, redo the locked lookup and unlink.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(path.mnt);
exit2:
        path_put(&path);
        /* Redo the whole lookup with LOOKUP_REVAL if retry_estale() says so. */
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                inode = NULL;
                goto retry;
        }
exit1:
        putname(name);
        return error;

        /*
         * Reached when the last component is not terminated right after
         * last.len characters: report -EISDIR for a directory and -ENOTDIR
         * otherwise, then rejoin the common cleanup at exit3.
         */
slashes:
        if (d_is_dir(dentry))
                error = -EISDIR;
        else
                error = -ENOTDIR;
        goto exit3;
}

/*
 * unlinkat syscall entry point.  AT_REMOVEDIR is the only accepted flag
 * (anything else yields -EINVAL); it selects do_rmdir() instead of
 * do_unlinkat().  The imported name is released by the callee.
 */
SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag)
{
        struct filename *name;

        if (flag & ~AT_REMOVEDIR)
                return -EINVAL;

        /* Import the path only once the flags are known to be valid. */
        name = getname(pathname);
        if (flag & AT_REMOVEDIR)
                return do_rmdir(dfd, name);

        return do_unlinkat(dfd, name);
}

/*
 * unlink syscall entry point: import the user-space path with getname() and
 * hand it to do_unlinkat() with AT_FDCWD; do_unlinkat() releases the name.
 */
SYSCALL_DEFINE1(unlink, const char __user *, pathname)
{
        struct filename *name = getname(pathname);

        return do_unlinkat(AT_FDCWD, name);
}

/**
 * vfs_symlink - create symlink
 * @idmap:      idmap of the mount the inode was found from
 * @dir:        inode of the parent directory
 * @dentry:     dentry of the child symlink file
 * @oldname:    name of the file to link to
 *
 * Create a symlink.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success; -EPERM if the parent has no ->symlink() method;
 * otherwise the error from the failing check or from the filesystem.
 */
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
                struct dentry *dentry, const char *oldname)
{
        int err = may_create(idmap, dir, dentry);

        if (err)
                return err;

        if (!dir->i_op->symlink)
                return -EPERM;

        err = security_inode_symlink(dir, dentry, oldname);
        if (err)
                return err;

        err = dir->i_op->symlink(idmap, dir, dentry, oldname);
        if (err)
                return err;

        /* Only a successful creation is reported to fsnotify. */
        fsnotify_create(dir, dentry);
        return 0;
}
EXPORT_SYMBOL(vfs_symlink);

/*
 * Shared implementation behind the symlink and symlinkat syscalls.
 *
 * @from carries the link target text and @to the path of the symlink to
 * create (resolved starting from @newdfd).  An ERR_PTR() in @from is turned
 * into the return value.  The creation is repeated with LOOKUP_REVAL when
 * retry_estale() requests it.  Both names are released with putname() on
 * every exit path.
 */
int do_symlinkat(struct filename *from, int newdfd, struct filename *to)
{
        unsigned int lookup_flags = 0;
        struct dentry *dentry;
        struct path path;
        int error;

        if (IS_ERR(from)) {
                error = PTR_ERR(from);
                goto out_putnames;
        }
retry:
        dentry = filename_create(newdfd, to, &path, lookup_flags);
        if (IS_ERR(dentry)) {
                error = PTR_ERR(dentry);
                goto out_putnames;
        }

        error = security_path_symlink(&path, dentry, from->name);
        if (error)
                goto out_done;

        error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode,
                            dentry, from->name);
out_done:
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
out_putnames:
        putname(to);
        putname(from);
        return error;
}

/*
 * symlinkat syscall entry point: import both user-space strings with
 * getname() and hand them to do_symlinkat(), which releases them with
 * putname().  Both imports happen inside the argument list of the call.
 */
SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_symlinkat(getname(oldname), newdfd, getname(newname));
}

/*
 * symlink syscall entry point: same as symlinkat with AT_FDCWD as the
 * directory file descriptor for the new name.  do_symlinkat() releases both
 * imported names.
 */
SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname)
{
        return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname));
}

/**
 * vfs_link - create a new link
 * @old_dentry: object to be linked
 * @idmap:      idmap of the mount
 * @dir:        new parent
 * @new_dentry: where to create the new link
 * @delegated_inode: returns inode needing a delegation break
 *
 * The caller must hold dir->i_mutex
 *
 * If vfs_link discovers a delegation on the to-be-linked file in need
 * of breaking, it will return -EWOULDBLOCK and return a reference to the
 * inode in delegated_inode.  The caller should then break the delegation
 * and retry.  Because breaking a delegation may take a long time, the
 * caller should drop the i_mutex before doing so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Return: 0 on success, a negative errno otherwise.
 */
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
             struct inode *dir, struct dentry *new_dentry,
             struct inode **delegated_inode)
{
        struct inode *inode = old_dentry->d_inode;
        unsigned max_links = dir->i_sb->s_max_links;
        int error;

        if (!inode)
                return -ENOENT;

        error = may_create(idmap, dir, new_dentry);
        if (error)
                return error;

        /* Source and destination must live on the same superblock. */
        if (dir->i_sb != inode->i_sb)
                return -EXDEV;

        /*
         * A link to an append-only or immutable file cannot be created.
         */
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return -EPERM;

        /*
         * Updating the link count will likely cause i_uid and i_gid to
         * be written back improperly if their true value is unknown to
         * the vfs.
         */
        if (HAS_UNMAPPED_ID(idmap, inode))
                return -EPERM;

        /* No ->link() method, or the source is a directory: refuse. */
        if (!dir->i_op->link || S_ISDIR(inode->i_mode))
                return -EPERM;

        error = security_inode_link(old_dentry, dir, new_dentry);
        if (error)
                return error;

        inode_lock(inode);

        /* Make sure we don't allow creating hardlink to an unlinked file */
        if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) {
                error = -ENOENT;
                goto out_unlock;
        }
        if (max_links && inode->i_nlink >= max_links) {
                error = -EMLINK;
                goto out_unlock;
        }

        error = try_break_deleg(inode, delegated_inode);
        if (error)
                goto out_unlock;

        error = dir->i_op->link(old_dentry, dir, new_dentry);
        if (error)
                goto out_unlock;

        /* The link succeeded, so I_LINKABLE is cleared under i_lock. */
        if (inode->i_state & I_LINKABLE) {
                spin_lock(&inode->i_lock);
                inode->i_state &= ~I_LINKABLE;
                spin_unlock(&inode->i_lock);
        }
out_unlock:
        inode_unlock(inode);
        if (!error)
                fsnotify_link(dir, inode, new_dentry);
        return error;
}
EXPORT_SYMBOL(vfs_link);

/*
 * Shared implementation behind the link and linkat syscalls.
 *
 * Looks up @old (starting from @olddfd), creates the dentry for @new
 * (starting from @newdfd) and links the two with vfs_link().  Only
 * AT_SYMLINK_FOLLOW and AT_EMPTY_PATH are accepted in @flags; anything else
 * yields -EINVAL.  Both names are released with putname() on every exit
 * path.
 *
 * Hardlinks are often used in delicate situations.  We avoid
 * security-related surprises by not following symlinks on the
 * newname.  --KAB
 *
 * We don't follow them on the oldname either to be compatible
 * with linux 2.0, and to avoid hard-linking to directories
 * and other special files.  --ADM
 */
int do_linkat(int olddfd, struct filename *old, int newdfd,
              struct filename *new, int flags)
{
        struct mnt_idmap *idmap;
        struct dentry *new_dentry;
        struct path old_path, new_path;
        struct inode *delegated_inode = NULL;
        int how = 0;
        int error;

        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) {
                error = -EINVAL;
                goto out_putnames;
        }
        /*
         * To use null names we require CAP_DAC_READ_SEARCH or
         * that the open-time creds of the dfd matches current.
         * This ensures that not everyone will be able to create
         * a hardlink using the passed file descriptor.
         */
        if (flags & AT_EMPTY_PATH)
                how |= LOOKUP_LINKAT_EMPTY;

        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
retry:
        error = filename_lookup(olddfd, old, how, &old_path, NULL);
        if (error)
                goto out_putnames;

        /* Only LOOKUP_REVAL is carried over to the creation of the new name. */
        new_dentry = filename_create(newdfd, new, &new_path,
                                        (how & LOOKUP_REVAL));
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto out_putpath;

        /* Both names must be on the same mount. */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto out_dput;
        idmap = mnt_idmap(new_path.mnt);
        error = may_linkat(idmap, &old_path);
        if (unlikely(error))
                goto out_dput;
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
                goto out_dput;
        error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode,
                         new_dentry, &delegated_inode);
out_dput:
        done_path_create(&new_path, new_dentry);
        /*
         * vfs_link() handed back an inode with a delegation: wait for the
         * break and, if that succeeded, start over from the lookup of @old.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error) {
                        path_put(&old_path);
                        goto retry;
                }
        }
        /* Start over with LOOKUP_REVAL if retry_estale() says so. */
        if (retry_estale(error, how)) {
                path_put(&old_path);
                how |= LOOKUP_REVAL;
                goto retry;
        }
out_putpath:
        path_put(&old_path);
out_putnames:
        putname(old);
        putname(new);

        return error;
}

/*
 * linkat syscall entry point: import the old name with getname_uflags()
 * (which is given @flags) and the new name with getname(), then hand both
 * to do_linkat(), which validates @flags and releases the names.
 */
SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, int, flags)
{
        return do_linkat(olddfd, getname_uflags(oldname, flags),
                newdfd, getname(newname), flags);
}

/*
 * link syscall entry point: same as linkat with AT_FDCWD for both directory
 * file descriptors and no flags.  do_linkat() releases both imported names.
 */
SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname)
{
        return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0);
}

/**
 * vfs_rename - rename a filesystem object
 * @rd:                pointer to &struct renamedata info
 *
 * The caller must hold multiple mutexes--see lock_rename().
 *
 * If vfs_rename discovers a delegation in need of breaking at either
 * the source or destination, it will return -EWOULDBLOCK and return a
 * reference to the inode in delegated_inode.  The caller should then
 * break the delegation and retry.  Because breaking a delegation may
 * take a long time, the caller should drop all locks before doing
 * so.
 *
 * Alternatively, a caller may pass NULL for delegated_inode.  This may
 * be appropriate for callers that expect the underlying filesystem not
 * to be NFS exported.
 *
 * The worst of all namespace operations - renaming directory. "Perverted"
 * doesn't even start to describe it. Somebody in UCB had a heck of a trip...
 * Problems:
 *
 *        a) we can get into loop creation.
 *        b) race potential - two innocent renames can create a loop together.
 *           That's where 4.4BSD screws up. Current fix: serialization on
 *           sb->s_vfs_rename_mutex. We might be more accurate, but that's another
 *           story.
 *        c) we may have to lock up to _four_ objects - parents and victim (if it exists),
 *           and source (if it's a non-directory or a subdirectory that moves to
 *           different parent).
 *           And that - after we got ->i_mutex on parents (until then we don't know
 *           whether the target exists).  Solution: try to be smart with locking
 *           order for inodes.  We rely on the fact that tree topology may change
 *           only under ->s_vfs_rename_mutex _and_ that parent of the object we
 *           move will be locked.  Thus we can rank directories by the tree
 *           (ancestors first) and rank all non-directories after them.
 *           That works since everybody except rename does "lock parent, lookup,
 *           lock child" and rename is under ->s_vfs_rename_mutex.
 *           HOWEVER, it relies on the assumption that any object with ->lookup()
 *           has no more than 1 dentry.  If "hybrid" objects will ever appear,
 *           we'd better make sure that there's no link(2) for them.
 *        d) conversion from fhandle to dentry may come in the wrong moment - when
 *           we are removing the target. Solution: we will have to grab ->i_mutex
 *           in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on
 *           ->i_mutex on parents, which works but leads to some truly excessive
 *           locking].
 *
 * Return: 0 on success (including the case where source and target are the
 * same inode), a negative errno otherwise.
 */
int vfs_rename(struct renamedata *rd)
{
        int error;
        struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
        struct dentry *old_dentry = rd->old_dentry;
        struct dentry *new_dentry = rd->new_dentry;
        struct inode **delegated_inode = rd->delegated_inode;
        unsigned int flags = rd->flags;
        bool is_dir = d_is_dir(old_dentry);
        struct inode *source = old_dentry->d_inode;
        struct inode *target = new_dentry->d_inode;
        bool new_is_dir = false;
        unsigned max_links = new_dir->i_sb->s_max_links;
        struct name_snapshot old_name;
        bool lock_old_subdir, lock_new_subdir;

        /* Renaming an object onto itself is a successful no-op. */
        if (source == target)
                return 0;

        error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir);
        if (error)
                return error;

        /*
         * No existing target: this is a creation in new_dir.  Otherwise the
         * target is checked with may_delete(); a plain rename passes the
         * source's is_dir, an exchange passes the target's own new_is_dir.
         */
        if (!target) {
                error = may_create(rd->new_mnt_idmap, new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);

                if (!(flags & RENAME_EXCHANGE))
                        error = may_delete(rd->new_mnt_idmap, new_dir,
                                           new_dentry, is_dir);
                else
                        error = may_delete(rd->new_mnt_idmap, new_dir,
                                           new_dentry, new_is_dir);
        }
        if (error)
                return error;

        if (!old_dir->i_op->rename)
                return -EPERM;

        /*
         * If we are going to change the parent - check write permissions,
         * we'll need to flip '..'.
         */
        if (new_dir != old_dir) {
                if (is_dir) {
                        error = inode_permission(rd->old_mnt_idmap, source,
                                                 MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
                        error = inode_permission(rd->new_mnt_idmap, target,
                                                 MAY_WRITE);
                        if (error)
                                return error;
                }
        }

        error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry,
                                      flags);
        if (error)
                return error;

        /* The old name is snapshotted for the fsnotify_move() call below. */
        take_dentry_name_snapshot(&old_name, old_dentry);
        dget(new_dentry);
        /*
         * Lock children.
         * The source subdirectory needs to be locked on cross-directory
         * rename or cross-directory exchange since its parent changes.
         * The target subdirectory needs to be locked on cross-directory
         * exchange due to parent change and on any rename due to becoming
         * a victim.
         * Non-directories need locking in all cases (for NFS reasons);
         * they get locked after any subdirectories (in inode address order).
         *
         * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
         * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
         */
        lock_old_subdir = new_dir != old_dir;
        lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
        if (is_dir) {
                if (lock_old_subdir)
                        inode_lock_nested(source, I_MUTEX_CHILD);
                if (target && (!new_is_dir || lock_new_subdir))
                        inode_lock(target);
        } else if (new_is_dir) {
                if (lock_new_subdir)
                        inode_lock_nested(target, I_MUTEX_CHILD);
                inode_lock(source);
        } else {
                lock_two_nondirectories(source, target);
        }

        error = -EPERM;
        if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
                goto out;

        error = -EBUSY;
        if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
                goto out;

        /*
         * Link-count limit, checked only for cross-directory moves: a
         * directory arriving in new_dir without replacing a directory, or
         * (on exchange) a directory arriving in old_dir in place of a
         * non-directory.
         */
        if (max_links && new_dir != old_dir) {
                error = -EMLINK;
                if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links)
                        goto out;
                if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir &&
                    old_dir->i_nlink >= max_links)
                        goto out;
        }
        /* Delegations are only broken on non-directories. */
        if (!is_dir) {
                error = try_break_deleg(source, delegated_inode);
                if (error)
                        goto out;
        }
        if (target && !new_is_dir) {
                error = try_break_deleg(target, delegated_inode);
                if (error)
                        goto out;
        }
        error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry,
                                      new_dir, new_dentry, flags);
        if (error)
                goto out;

        /* On a plain rename an existing target has just been replaced. */
        if (!(flags & RENAME_EXCHANGE) && target) {
                if (is_dir) {
                        shrink_dcache_parent(new_dentry);
                        target->i_flags |= S_DEAD;
                }
                dont_mount(new_dentry);
                detach_mounts(new_dentry);
        }
        /* Update the dcache here unless the filesystem type opts out. */
        if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) {
                if (!(flags & RENAME_EXCHANGE))
                        d_move(old_dentry, new_dentry);
                else
                        d_exchange(old_dentry, new_dentry);
        }
out:
        /* These conditions mirror the ones under which the locks were taken. */
        if (!is_dir || lock_old_subdir)
                inode_unlock(source);
        if (target && (!new_is_dir || lock_new_subdir))
                inode_unlock(target);
        dput(new_dentry);
        if (!error) {
                fsnotify_move(old_dir, new_dir, &old_name.name, is_dir,
                              !(flags & RENAME_EXCHANGE) ? target : NULL, old_dentry);
                if (flags & RENAME_EXCHANGE) {
                        fsnotify_move(new_dir, old_dir, &old_dentry->d_name,
                                      new_is_dir, NULL, new_dentry);
                }
        }
        release_dentry_name_snapshot(&old_name);

        return error;
}
EXPORT_SYMBOL(vfs_rename);

/*
 * do_renameat2 - common implementation behind the rename family of syscalls
 * @olddfd: passed to filename_parentat() together with @from
 * @from:   source name; released with putname() on every exit path
 * @newdfd: passed to filename_parentat() together with @to
 * @to:     target name; released with putname() on every exit path
 * @flags:  any of RENAME_NOREPLACE, RENAME_EXCHANGE, RENAME_WHITEOUT
 *
 * Resolves both parent directories, takes the rename lock on them, looks up
 * the two final components and hands everything to vfs_rename().
 *
 * Returns the result of vfs_rename() when it is reached, otherwise the error
 * of the step that failed (-EINVAL for unsupported flags or flag
 * combinations).
 */
int do_renameat2(int olddfd, struct filename *from, int newdfd,
                 struct filename *to, unsigned int flags)
{
        struct renamedata rd;
        struct dentry *old_dentry, *new_dentry;
        struct dentry *trap;
        struct path old_path, new_path;
        struct qstr old_last, new_last;
        int old_type, new_type;
        struct inode *delegated_inode = NULL;
        unsigned int lookup_flags = 0, target_flags =
                LOOKUP_RENAME_TARGET | LOOKUP_CREATE;
        bool should_retry = false;
        int error = -EINVAL;

        /* reject any flag bit outside the three supported ones */
        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                goto put_names;

        /* RENAME_EXCHANGE cannot be combined with NOREPLACE or WHITEOUT */
        if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) &&
            (flags & RENAME_EXCHANGE))
                goto put_names;

        /* an exchange looks the target up without the create/rename-target flags */
        if (flags & RENAME_EXCHANGE)
                target_flags = 0;
        if (flags & RENAME_NOREPLACE)
                target_flags |= LOOKUP_EXCL;

        /* re-entered with LOOKUP_REVAL added when retry_estale() asks for it */
retry:
        error = filename_parentat(olddfd, from, lookup_flags, &old_path,
                                  &old_last, &old_type);
        if (error)
                goto put_names;

        error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last,
                                  &new_type);
        if (error)
                goto exit1;

        /* both parents must be on the same mount */
        error = -EXDEV;
        if (old_path.mnt != new_path.mnt)
                goto exit2;

        /* a source whose last component is not LAST_NORM yields -EBUSY */
        error = -EBUSY;
        if (old_type != LAST_NORM)
                goto exit2;

        /* same check for the target; -EEXIST instead when RENAME_NOREPLACE is set */
        if (flags & RENAME_NOREPLACE)
                error = -EEXIST;
        if (new_type != LAST_NORM)
                goto exit2;

        error = mnt_want_write(old_path.mnt);
        if (error)
                goto exit2;

        /* re-entered after break_deleg_wait() succeeded further down */
retry_deleg:
        trap = lock_rename(new_path.dentry, old_path.dentry);
        if (IS_ERR(trap)) {
                error = PTR_ERR(trap);
                goto exit_lock_rename;
        }

        old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
                                          lookup_flags);
        error = PTR_ERR(old_dentry);
        if (IS_ERR(old_dentry))
                goto exit3;
        new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry,
                                          lookup_flags | target_flags);
        error = PTR_ERR(new_dentry);
        if (IS_ERR(new_dentry))
                goto exit4;
        /* exchange: a non-directory target with a trailing slash gives -ENOTDIR */
        if (flags & RENAME_EXCHANGE) {
                if (!d_is_dir(new_dentry)) {
                        error = -ENOTDIR;
                        if (new_last.name[new_last.len])
                                goto exit5;
                }
        }
        /* unless the source is a directory trailing slashes give -ENOTDIR */
        if (!d_is_dir(old_dentry)) {
                error = -ENOTDIR;
                if (old_last.name[old_last.len])
                        goto exit5;
                if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len])
                        goto exit5;
        }
        /* source should not be ancestor of target */
        error = -EINVAL;
        if (old_dentry == trap)
                goto exit5;
        /* target should not be an ancestor of source */
        if (!(flags & RENAME_EXCHANGE))
                error = -ENOTEMPTY;
        if (new_dentry == trap)
                goto exit5;

        error = security_path_rename(&old_path, old_dentry,
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;

        rd.old_dir           = old_path.dentry->d_inode;
        rd.old_dentry           = old_dentry;
        rd.old_mnt_idmap   = mnt_idmap(old_path.mnt);
        rd.new_dir           = new_path.dentry->d_inode;
        rd.new_dentry           = new_dentry;
        rd.new_mnt_idmap   = mnt_idmap(new_path.mnt);
        rd.delegated_inode = &delegated_inode;
        rd.flags           = flags;
        error = vfs_rename(&rd);
        /* cleanup labels release resources in reverse order of acquisition */
exit5:
        dput(new_dentry);
exit4:
        dput(old_dentry);
exit3:
        unlock_rename(new_path.dentry, old_path.dentry);
exit_lock_rename:
        /*
         * delegated_inode is only reachable through rd.delegated_inode; when
         * it has been set, wait with break_deleg_wait() and redo the locked
         * part if that wait succeeds.
         */
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
                if (!error)
                        goto retry_deleg;
        }
        mnt_drop_write(old_path.mnt);
exit2:
        /* remember the retry decision; the paths must be dropped before retrying */
        if (retry_estale(error, lookup_flags))
                should_retry = true;
        path_put(&new_path);
exit1:
        path_put(&old_path);
        if (should_retry) {
                should_retry = false;
                lookup_flags |= LOOKUP_REVAL;
                goto retry;
        }
put_names:
        putname(from);
        putname(to);
        return error;
}

/*
 * renameat2 system call: turns both user-space path pointers into
 * struct filename objects with getname() and forwards them, together with
 * the two directory fds and @flags, to do_renameat2(), which releases the
 * names with putname().
 */
SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname, unsigned int, flags)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                flags);
}

/*
 * renameat system call: same as renameat2 with flags fixed to 0; both
 * names go through getname() and are handed to do_renameat2().
 */
SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname,
                int, newdfd, const char __user *, newname)
{
        return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname),
                                0);
}

/*
 * rename system call: calls do_renameat2() with AT_FDCWD for both
 * directory fds and flags fixed to 0.
 */
SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname)
{
        return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD,
                                getname(newname), 0);
}

/*
 * Copy at most @buflen bytes of the @linklen-byte string @link to the user
 * buffer @buffer.
 *
 * Returns the number of bytes handed to copy_to_user(), or -EFAULT if that
 * copy reports a failure.
 */
int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen)
{
        int nbytes = linklen;

        /* the comparison is deliberately carried out in unsigned arithmetic */
        if (unlikely(nbytes > (unsigned) buflen))
                nbytes = buflen;

        return copy_to_user(buffer, link, nbytes) ? -EFAULT : nbytes;
}

/**
 * vfs_readlink - copy symlink body into userspace buffer
 * @dentry: dentry on which to get symbolic link
 * @buffer: user memory pointer
 * @buflen: size of buffer
 *
 * Does not touch atime.  That's up to the caller if necessary
 *
 * Does not call security hook.
 *
 * Return: the result of readlink_copy() (bytes copied or -EFAULT), the
 * result of the filesystem's ->readlink() when one is provided, -EINVAL if
 * @dentry is not a symlink, or the error encoded in the pointer returned by
 * ->get_link().
 */
int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        struct inode *inode = d_inode(dentry);
        DEFINE_DELAYED_CALL(done);
        const char *link;
        int res;

        /* cached link: copy straight from i_link using the stored length */
        if (inode->i_opflags & IOP_CACHED_LINK)
                return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen);

        /*
         * IOP_DEFAULT_READLINK not set yet: use the filesystem's own
         * ->readlink() if there is one; otherwise require a symlink and set
         * the flag under i_lock so that later calls skip this block.
         */
        if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) {
                if (unlikely(inode->i_op->readlink))
                        return inode->i_op->readlink(dentry, buffer, buflen);

                if (!d_is_symlink(dentry))
                        return -EINVAL;

                spin_lock(&inode->i_lock);
                inode->i_opflags |= IOP_DEFAULT_READLINK;
                spin_unlock(&inode->i_lock);
        }

        /* fall back to ->get_link() only when i_link is not populated */
        link = READ_ONCE(inode->i_link);
        if (!link) {
                link = inode->i_op->get_link(dentry, inode, &done);
                if (IS_ERR(link))
                        return PTR_ERR(link);
        }
        res = readlink_copy(buffer, buflen, link, strlen(link));
        /* run whatever cleanup ->get_link() registered in 'done' */
        do_delayed_call(&done);
        return res;
}
EXPORT_SYMBOL(vfs_readlink);

/**
 * vfs_get_link - get symlink body
 * @dentry: dentry on which to get symbolic link
 * @done: caller needs to free returned data with this
 *
 * Runs security_inode_readlink() and, if that allows it, the inode's
 * i_op->get_link().
 *
 * It does not touch atime.  That's up to the caller if necessary.
 *
 * Does not work on "special" symlinks like /proc/$$/fd/N
 *
 * Return: the pointer produced by ->get_link(), ERR_PTR(-EINVAL) when
 * @dentry is not a symlink, or the security hook's error as an ERR_PTR.
 */
const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done)
{
        struct inode *inode = d_inode(dentry);
        int err;

        if (!d_is_symlink(dentry))
                return ERR_PTR(-EINVAL);

        err = security_inode_readlink(dentry);
        if (err)
                return ERR_PTR(err);

        return inode->i_op->get_link(dentry, inode, done);
}
EXPORT_SYMBOL(vfs_get_link);

/*
 * Get the link contents into pagecache and return their kernel address.
 *
 * With a dentry, page 0 of the inode's mapping is read via
 * read_mapping_page().  Without one, only a page that is already cached and
 * up to date is accepted; anything else yields ERR_PTR(-ECHILD).  On
 * success page_put_link() is registered in @callback to drop the page
 * reference later.
 */
static char *__page_get_link(struct dentry *dentry, struct inode *inode,
                             struct delayed_call *callback)
{
        struct address_space *mapping = inode->i_mapping;
        struct page *page;

        if (dentry) {
                page = read_mapping_page(mapping, 0, NULL);
                if (IS_ERR(page))
                        return (char*)page;
        } else {
                page = find_get_page(mapping, 0);
                if (!page)
                        return ERR_PTR(-ECHILD);
                if (!PageUptodate(page)) {
                        put_page(page);
                        return ERR_PTR(-ECHILD);
                }
        }

        set_delayed_call(callback, page_put_link, page);
        BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
        return page_address(page);
}

/*
 * Exported form of __page_get_link(): returns the page address (or an
 * ERR_PTR) exactly as that helper produced it, without the
 * nd_terminate_link() step that page_get_link() applies.
 */
const char *page_get_link_raw(struct dentry *dentry, struct inode *inode,
                              struct delayed_call *callback)
{
        char *kaddr = __page_get_link(dentry, inode, callback);

        return kaddr;
}
EXPORT_SYMBOL_GPL(page_get_link_raw);

/*
 * ->get_link() implementation for symlinks whose body lives in page 0 of
 * the inode's mapping.  A successfully obtained buffer is passed through
 * nd_terminate_link() with inode->i_size and PAGE_SIZE - 1 before it is
 * returned; an ERR_PTR from __page_get_link() is returned unchanged.
 */
const char *page_get_link(struct dentry *dentry, struct inode *inode,
                                        struct delayed_call *callback)
{
        char *kaddr = __page_get_link(dentry, inode, callback);

        if (IS_ERR(kaddr))
                return kaddr;

        nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
        return kaddr;
}

EXPORT_SYMBOL(page_get_link);

/*
 * Delayed-call callback registered by __page_get_link(): @arg is the page
 * whose reference is dropped here.
 */
void page_put_link(void *arg)
{
        struct page *page = arg;

        put_page(page);
}
EXPORT_SYMBOL(page_put_link);

/*
 * Read the symlink body of @dentry through page_get_link() and copy it to
 * the user buffer.
 *
 * Returns the result of readlink_copy(), or the error carried by the
 * pointer page_get_link() returned.  The delayed call is run in both cases.
 */
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
{
        DEFINE_DELAYED_CALL(done);
        const char *body;
        int ret;

        body = page_get_link(dentry, d_inode(dentry), &done);
        if (IS_ERR(body))
                ret = PTR_ERR(body);
        else
                ret = readlink_copy(buffer, buflen, body, strlen(body));

        do_delayed_call(&done);
        return ret;
}
EXPORT_SYMBOL(page_readlink);

/*
 * Store the first @len - 1 bytes of @symname at offset 0 of @inode's mapping
 * using the mapping's ->write_begin()/->write_end() pair.
 * (@len presumably counts the terminating NUL - TODO confirm with callers.)
 *
 * When the mapping's gfp constraint lacks __GFP_FS, ->write_begin() runs
 * inside a memalloc_nofs_save()/memalloc_nofs_restore() section.  A short
 * ->write_end() restarts the whole sequence.
 *
 * Returns 0 after marking the inode dirty, otherwise the non-zero result of
 * ->write_begin() or the negative result of ->write_end().
 */
int page_symlink(struct inode *inode, const char *symname, int len)
{
        struct address_space *mapping = inode->i_mapping;
        const struct address_space_operations *aops = mapping->a_ops;
        bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS);
        struct folio *folio;
        void *fsdata = NULL;
        unsigned int nofs_flags;
        int err;

        for (;;) {
                if (nofs)
                        nofs_flags = memalloc_nofs_save();
                err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata);
                if (nofs)
                        memalloc_nofs_restore(nofs_flags);
                if (err)
                        return err;

                memcpy(folio_address(folio), symname, len - 1);

                err = aops->write_end(NULL, mapping, 0, len - 1, len - 1,
                                                        folio, fsdata);
                if (err < 0)
                        return err;
                /* everything written: done; otherwise start over */
                if (err >= len-1)
                        break;
        }

        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(page_symlink);

/*
 * Exported inode_operations for symlinks that use page_get_link() as their
 * ->get_link(); no other operation is set here.
 */
const struct inode_operations page_symlink_inode_operations = {
        .get_link        = page_get_link,
};
EXPORT_SYMBOL(page_symlink_inode_operations);














    8 
    8 
















  131 
  131 





















































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
/*
 * Compatibility functions which bloat the callers too much to make inline.
 * All of the callers of these functions should be converted to use folios
 * eventually.
 */

#include <linux/migrate.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include "internal.h"

/* Page-based compatibility wrapper: folio_unlock() on page_folio(@page). */
void unlock_page(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_unlock(folio);
}
EXPORT_SYMBOL(unlock_page);

/* Page-based compatibility wrapper: folio_end_writeback() on page_folio(@page). */
void end_page_writeback(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_end_writeback(folio);
}
EXPORT_SYMBOL(end_page_writeback);

/* Page-based compatibility wrapper: folio_wait_writeback() on page_folio(@page). */
void wait_on_page_writeback(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_wait_writeback(folio);
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);

/* Page-based compatibility wrapper: folio_mark_accessed() on page_folio(@page). */
void mark_page_accessed(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_mark_accessed(folio);
}
EXPORT_SYMBOL(mark_page_accessed);

/* Page-based compatibility wrapper: folio_start_writeback() on page_folio(@page). */
void set_page_writeback(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_start_writeback(folio);
}
EXPORT_SYMBOL(set_page_writeback);

/*
 * Page-based compatibility wrapper: returns folio_mark_dirty() for
 * page_folio(@page).
 */
bool set_page_dirty(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_mark_dirty(folio);
}
EXPORT_SYMBOL(set_page_dirty);

/*
 * Page-based compatibility wrapper: returns folio_mark_dirty_lock() for
 * page_folio(@page).
 */
int set_page_dirty_lock(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_mark_dirty_lock(folio);
}
EXPORT_SYMBOL(set_page_dirty_lock);

/*
 * Page-based compatibility wrapper: returns folio_clear_dirty_for_io() for
 * page_folio(@page).
 */
bool clear_page_dirty_for_io(struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_clear_dirty_for_io(folio);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);

/*
 * Page-based compatibility wrapper: returns folio_redirty_for_writepage()
 * for @wbc and page_folio(@page).
 */
bool redirty_page_for_writepage(struct writeback_control *wbc,
                struct page *page)
{
        struct folio *folio = page_folio(page);

        return folio_redirty_for_writepage(wbc, folio);
}
EXPORT_SYMBOL(redirty_page_for_writepage);

/*
 * Page-based compatibility wrapper: returns filemap_add_folio() for
 * page_folio(@page) with the given @mapping, @index and @gfp.
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                pgoff_t index, gfp_t gfp)
{
        struct folio *folio = page_folio(page);

        return filemap_add_folio(mapping, folio, index, gfp);
}
EXPORT_SYMBOL(add_to_page_cache_lru);

/*
 * Page-based front end to __filemap_get_folio().
 *
 * Returns folio_file_page() of the folio found for @index, or NULL when
 * __filemap_get_folio() returned an error pointer (the specific error is
 * not passed on to the caller).
 */
noinline
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
                fgf_t fgp_flags, gfp_t gfp)
{
        struct folio *folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);

        return IS_ERR(folio) ? NULL : folio_file_page(folio, index);
}
EXPORT_SYMBOL(pagecache_get_page);





























































































































































































   34 


   34 


























































































   33 
   34 


    1 
   34 




   34 
   34 




   34 





   34 
   34 




   34 





    1 

   34 













































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/stat.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/hash.h>
#include <linux/kmemleak.h>
#include <linux/user_namespace.h>

/*
 * Statically defined ucounts entry for GLOBAL_ROOT_UID in init_user_ns.
 * Its reference count starts at 1; user_namespace_sysctl_init() inserts it
 * into the hash table with hlist_add_ucounts().
 */
struct ucounts init_ucounts = {
        .ns    = &init_user_ns,
        .uid   = GLOBAL_ROOT_UID,
        .count = RCUREF_INIT(1),
};

/*
 * Hash table of all ucounts entries: 2^UCOUNTS_HASHTABLE_BITS nulls-list
 * heads.  Insertions and removals in this file are done under ucounts_lock;
 * lookups (find_ucounts()) walk a chain under guard(rcu)().
 */
#define UCOUNTS_HASHTABLE_BITS 10
#define UCOUNTS_HASHTABLE_ENTRIES (1 << UCOUNTS_HASHTABLE_BITS)
static struct hlist_nulls_head ucounts_hashtable[UCOUNTS_HASHTABLE_ENTRIES] = {
        [0 ... UCOUNTS_HASHTABLE_ENTRIES - 1] = HLIST_NULLS_HEAD_INIT(0)
};
static DEFINE_SPINLOCK(ucounts_lock);

/*
 * Bucket selection: the numeric uid and the namespace pointer value are
 * added and hashed down to UCOUNTS_HASHTABLE_BITS bits; ucounts_hashentry()
 * yields the address of the corresponding list head.
 */
#define ucounts_hashfn(ns, uid)                                                \
        hash_long((unsigned long)__kuid_val(uid) + (unsigned long)(ns), \
                  UCOUNTS_HASHTABLE_BITS)
#define ucounts_hashentry(ns, uid)        \
        (ucounts_hashtable + ucounts_hashfn(ns, uid))

#ifdef CONFIG_SYSCTL
static struct ctl_table_set *
set_lookup(struct ctl_table_root *root)
{
        return &current_user_ns()->set;
}

/*
 * Returns non-zero only when @set is the sysctl set of the caller's current
 * user namespace.
 */
static int set_is_seen(struct ctl_table_set *set)
{
        struct ctl_table_set *own = &current_user_ns()->set;

        return own == set;
}

/*
 * ->permissions callback of set_root.
 *
 * Computes a three-bit rwx value and replicates it into the user, group and
 * other positions of the returned mode:
 *  - a caller with CAP_SYS_RESOURCE in the owning user namespace gets the
 *    table's owner bits;
 *  - everyone else gets at most the table's "other read" bit.
 */
static int set_permissions(struct ctl_table_header *head,
                           const struct ctl_table *table)
{
        struct user_namespace *owner =
                container_of(head->set, struct user_namespace, set);
        int bits;

        bits = ns_capable(owner, CAP_SYS_RESOURCE)
                ? (table->mode & S_IRWXU) >> 6
                : table->mode & S_IROTH;

        return (bits << 6) | (bits << 3) | bits;
}

/*
 * ctl_table_root handed to setup_sysctl_set() for every user namespace;
 * wires up the per-namespace lookup and permission callbacks defined above.
 */
static struct ctl_table_root set_root = {
        .lookup = set_lookup,
        .permissions = set_permissions,
};

/*
 * Values referenced by .extra1/.extra2 of every entry below (presumably the
 * lower and upper bound enforced by proc_doulongvec_minmax - TODO confirm).
 */
static long ue_zero = 0;
static long ue_int_max = INT_MAX;

/*
 * Template for one sysctl entry: a long-sized value with mode 0644 handled
 * by proc_doulongvec_minmax.  .data is left unset here and filled in per
 * namespace by setup_userns_sysctls().
 */
#define UCOUNT_ENTRY(name)                                        \
        {                                                        \
                .procname        = name,                                \
                .maxlen                = sizeof(long),                        \
                .mode                = 0644,                                \
                .proc_handler        = proc_doulongvec_minmax,        \
                .extra1                = &ue_zero,                        \
                .extra2                = &ue_int_max,                        \
        }
/*
 * Entry i is bound to ns->ucount_max[i] by setup_userns_sysctls(), so the
 * order here must follow the ucount index order, and the number of entries
 * must equal UCOUNT_COUNTS (checked there with BUILD_BUG_ON).
 */
static const struct ctl_table user_table[] = {
        UCOUNT_ENTRY("max_user_namespaces"),
        UCOUNT_ENTRY("max_pid_namespaces"),
        UCOUNT_ENTRY("max_uts_namespaces"),
        UCOUNT_ENTRY("max_ipc_namespaces"),
        UCOUNT_ENTRY("max_net_namespaces"),
        UCOUNT_ENTRY("max_mnt_namespaces"),
        UCOUNT_ENTRY("max_cgroup_namespaces"),
        UCOUNT_ENTRY("max_time_namespaces"),
#ifdef CONFIG_INOTIFY_USER
        UCOUNT_ENTRY("max_inotify_instances"),
        UCOUNT_ENTRY("max_inotify_watches"),
#endif
#ifdef CONFIG_FANOTIFY
        UCOUNT_ENTRY("max_fanotify_groups"),
        UCOUNT_ENTRY("max_fanotify_marks"),
#endif
};
#endif /* CONFIG_SYSCTL */

/*
 * Create the "user" sysctl table of user namespace @ns.
 *
 * With CONFIG_SYSCTL, user_table is duplicated, each copy entry is pointed
 * at the matching ns->ucount_max[] slot and the copy is registered in the
 * namespace's own sysctl set.  If the duplication or the registration fails
 * the copy is freed, the set is retired and false is returned.
 *
 * Returns true on success and whenever CONFIG_SYSCTL is disabled.
 */
bool setup_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
        struct ctl_table *copy;
        int idx;

        BUILD_BUG_ON(ARRAY_SIZE(user_table) != UCOUNT_COUNTS);
        setup_sysctl_set(&ns->set, &set_root, set_is_seen);
        copy = kmemdup(user_table, sizeof(user_table), GFP_KERNEL);
        if (copy) {
                for (idx = 0; idx < UCOUNT_COUNTS; idx++)
                        copy[idx].data = &ns->ucount_max[idx];
                ns->sysctls = __register_sysctl_table(&ns->set, "user", copy,
                                                      ARRAY_SIZE(user_table));
        }
        if (!ns->sysctls) {
                /* kfree(NULL) is fine when the duplication itself failed */
                kfree(copy);
                retire_sysctl_set(&ns->set);
                return false;
        }
#endif
        return true;
}

/*
 * Undo setup_userns_sysctls() for @ns: unregister the table, retire the
 * sysctl set and free the duplicated table.  No-op without CONFIG_SYSCTL.
 */
void retire_userns_sysctls(struct user_namespace *ns)
{
#ifdef CONFIG_SYSCTL
        const struct ctl_table *tbl;

        /*
         * Fetch the table pointer from the header before unregistering;
         * NOTE(review): presumably the header must not be dereferenced
         * afterwards - confirm against unregister_sysctl_table().
         */
        tbl = ns->sysctls->ctl_table_arg;
        unregister_sysctl_table(ns->sysctls);
        retire_sysctl_set(&ns->set);
        kfree(tbl);
#endif
}

/*
 * Look up the ucounts entry for (@ns, @uid) in the hash chain @hashent.
 *
 * The chain is walked under guard(rcu)().  A matching entry is returned only
 * if rcuref_get() succeeds on its count; a match on which rcuref_get() fails
 * is skipped and the walk continues.
 *
 * Returns the entry (with the reference taken by rcuref_get()) or NULL.
 */
static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid,
                                    struct hlist_nulls_head *hashent)
{
        struct ucounts *ucounts;
        struct hlist_nulls_node *pos;

        guard(rcu)();
        hlist_nulls_for_each_entry_rcu(ucounts, pos, hashent, node) {
                if (uid_eq(ucounts->uid, uid) && (ucounts->ns == ns)) {
                        if (rcuref_get(&ucounts->count))
                                return ucounts;
                }
        }
        return NULL;
}

/*
 * Insert @ucounts at the head of the hash chain selected by its own
 * (ns, uid) pair, holding ucounts_lock with interrupts disabled.
 */
static void hlist_add_ucounts(struct ucounts *ucounts)
{
        struct hlist_nulls_head *bucket;

        bucket = ucounts_hashentry(ucounts->ns, ucounts->uid);

        spin_lock_irq(&ucounts_lock);
        hlist_nulls_add_head_rcu(&ucounts->node, bucket);
        spin_unlock_irq(&ucounts_lock);
}

/*
 * Return the ucounts entry for (@ns, @uid), creating it if necessary.
 *
 * An existing entry is returned with the reference taken by find_ucounts().
 * Otherwise a zeroed entry is allocated with GFP_KERNEL before ucounts_lock
 * is taken, and the lookup is repeated under the lock: if another entry for
 * the same pair has appeared meanwhile, that one is returned and the fresh
 * allocation is freed.  A newly inserted entry starts with a count of 1 and
 * holds a reference on @ns (get_user_ns()).
 *
 * Returns NULL only when the allocation fails.
 */
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
        struct hlist_nulls_head *hashent = ucounts_hashentry(ns, uid);
        struct ucounts *ucounts, *new;

        /* fast path: no lock taken */
        ucounts = find_ucounts(ns, uid, hashent);
        if (ucounts)
                return ucounts;

        new = kzalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return NULL;

        new->ns = ns;
        new->uid = uid;
        rcuref_init(&new->count, 1);

        spin_lock_irq(&ucounts_lock);
        /* re-check under the lock so that only one entry per pair is inserted */
        ucounts = find_ucounts(ns, uid, hashent);
        if (ucounts) {
                spin_unlock_irq(&ucounts_lock);
                kfree(new);
                return ucounts;
        }

        hlist_nulls_add_head_rcu(&new->node, hashent);
        get_user_ns(new->ns);
        spin_unlock_irq(&ucounts_lock);
        return new;
}

/*
 * Drop one reference on @ucounts.
 *
 * When rcuref_put() reports that this was the final reference, the entry is
 * unlinked from the hash table under ucounts_lock, its reference on the
 * user namespace is dropped and the entry is freed via kfree_rcu().
 */
void put_ucounts(struct ucounts *ucounts)
{
        unsigned long irqflags;

        if (!rcuref_put(&ucounts->count))
                return;

        spin_lock_irqsave(&ucounts_lock, irqflags);
        hlist_nulls_del_rcu(&ucounts->node);
        spin_unlock_irqrestore(&ucounts_lock, irqflags);

        put_user_ns(ucounts->ns);
        kfree_rcu(ucounts, rcu);
}

/*
 * atomic_long_inc_below - increment @v unless it has reached the bound @u
 * @v: counter to increment
 * @u: exclusive upper bound
 *
 * Adds one to @v only if its current value is strictly below @u.  The
 * compare-and-exchange is retried with the freshly observed value whenever
 * @v changed between the read and the exchange.
 *
 * @u is a long, matching both the counter and the long limit the caller
 * reads from ucount_max[]; declaring it as int would silently truncate a
 * limit above INT_MAX before the comparison.
 *
 * Returns true if the increment was performed, false if @v >= @u.
 */
static inline bool atomic_long_inc_below(atomic_long_t *v, long u)
{
        long c, old;

        c = atomic_long_read(v);
        for (;;) {
                if (unlikely(c >= u))
                        return false;
                old = atomic_long_cmpxchg(v, c, c+1);
                if (likely(old == c))
                        return true;
                /* lost the race: retry with the value actually found */
                c = old;
        }
}

/*
 * Charge one unit of @type to (@ns, @uid) and to every ancestor reached
 * through ns->ucounts.
 *
 * Each level is incremented only while it is below that namespace's
 * ucount_max[@type].  If any level refuses, the levels already incremented
 * are decremented again, the reference obtained from alloc_ucounts() is
 * dropped and NULL is returned.
 *
 * Returns the ucounts of (@ns, @uid) on success; NULL on failure, which
 * includes alloc_ucounts() returning NULL.
 */
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
                           enum ucount_type type)
{
        struct ucounts *head = alloc_ucounts(ns, uid);
        struct ucounts *cur, *undo;

        for (cur = head; cur; cur = cur->ns->ucounts) {
                long limit = READ_ONCE(cur->ns->ucount_max[type]);

                if (!atomic_long_inc_below(&cur->ucount[type], limit))
                        goto rollback;
        }
        return head;

rollback:
        /* cur is the level that refused; everything before it was charged */
        for (undo = head; undo != cur; undo = undo->ns->ucounts)
                atomic_long_dec(&undo->ucount[type]);

        put_ucounts(head);
        return NULL;
}

/*
 * Uncharge one unit of @type from @ucounts and every ancestor, then drop
 * one reference on @ucounts.  A counter that would go negative triggers a
 * one-time warning.
 */
void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{
        struct ucounts *cur = ucounts;

        while (cur) {
                long left = atomic_long_dec_if_positive(&cur->ucount[type]);

                WARN_ON_ONCE(left < 0);
                cur = cur->ns->ucounts;
        }
        put_ucounts(ucounts);
}

/*
 * Add @v to the @type rlimit counter of @ucounts and of every ancestor.
 *
 * The first level is checked against LONG_MAX, each further level against
 * the limit of the namespace below it (get_userns_rlimit_max()).  All levels
 * are always updated; nothing is rolled back here.
 *
 * Returns the new value of the first level, or LONG_MAX if any level went
 * negative or above its limit.
 */
long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
        struct ucounts *cur;
        long limit = LONG_MAX;
        long result = 0;

        for (cur = ucounts; cur; cur = cur->ns->ucounts) {
                long sum = atomic_long_add_return(v, &cur->rlimit[type]);

                if (sum < 0 || sum > limit)
                        result = LONG_MAX;
                else if (cur == ucounts)
                        result = sum;
                /* limit that applies to the next (parent) level */
                limit = get_userns_rlimit_max(cur->ns, type);
        }
        return result;
}

/*
 * Subtract @v from the @type rlimit counter of @ucounts and of every
 * ancestor, warning once if a counter drops below zero.
 *
 * Returns true when the counter of @ucounts itself reached zero.
 */
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v)
{
        struct ucounts *cur = ucounts;
        long first = -1; /* only there to keep the compiler quiet */

        while (cur) {
                long left = atomic_long_sub_return(v, &cur->rlimit[type]);

                WARN_ON_ONCE(left < 0);
                if (cur == ucounts)
                        first = left;
                cur = cur->ns->ucounts;
        }
        return first == 0;
}

/*
 * Walk from @ucounts up the ancestor chain, stopping before @last (NULL
 * means the whole chain), and decrement the @type rlimit counter by one at
 * each level.  A level whose counter reaches zero also loses one ucounts
 * reference.
 */
static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
                                struct ucounts *last, enum rlimit_type type)
{
        struct ucounts *cur = ucounts;

        while (cur != last) {
                struct ucounts *parent;
                long left = atomic_long_sub_return(1, &cur->rlimit[type]);

                WARN_ON_ONCE(left < 0);
                /* read the parent link first: put_ucounts() may release cur */
                parent = cur->ns->ucounts;
                if (left == 0)
                        put_ucounts(cur);
                cur = parent;
        }
}

/*
 * Decrement the @type rlimit counter by one on @ucounts and on every
 * ancestor (last == NULL walks the whole chain); do_dec_rlimit_put_ucounts()
 * drops a ucounts reference at each level whose counter reaches zero.
 */
void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type)
{
        do_dec_rlimit_put_ucounts(ucounts, NULL, type);
}

/*
 * Increment the @type rlimit counter by one on @ucounts and on every
 * ancestor, taking an extra ucounts reference at each level whose counter
 * goes from 0 to 1.
 *
 * The first level is checked against LONG_MAX; further levels are checked
 * against the limit of the namespace below them unless @override_rlimit is
 * set, in which case the bound stays LONG_MAX throughout.
 *
 * Returns the new counter value of @ucounts, or 0 after undoing all
 * increments when a level exceeds its bound, overflows, or get_ucounts()
 * fails.
 */
long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type,
                            bool override_rlimit)
{
        /* Caller must hold a reference to ucounts */
        struct ucounts *iter;
        long max = LONG_MAX;
        long dec, ret = 0;

        for (iter = ucounts; iter; iter = iter->ns->ucounts) {
                long new = atomic_long_add_return(1, &iter->rlimit[type]);
                if (new < 0 || new > max)
                        goto dec_unwind;
                if (iter == ucounts)
                        ret = new;
                /* bound that applies to the next (parent) level */
                if (!override_rlimit)
                        max = get_userns_rlimit_max(iter->ns, type);
                /*
                 * Grab an extra ucount reference for the caller when
                 * the rlimit count was previously 0.
                 */
                if (new != 1)
                        continue;
                if (!get_ucounts(iter))
                        goto dec_unwind;
        }
        return ret;
dec_unwind:
        /*
         * Undo the increment on the failing level without dropping a
         * reference, then undo all levels below it, which do drop the
         * reference taken when their counter hit 1.
         */
        dec = atomic_long_sub_return(1, &iter->rlimit[type]);
        WARN_ON_ONCE(dec < 0);
        do_dec_rlimit_put_ucounts(ucounts, iter, type);
        return 0;
}

/*
 * Check whether any level of the @ucounts chain is over its limit for
 * @type.
 *
 * The first level is compared against @rlimit (clamped to LONG_MAX), each
 * further level against the limit of the namespace below it.  A negative
 * counter value also counts as over the limit.
 */
bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit)
{
        struct ucounts *cur;
        long limit = rlimit > LONG_MAX ? LONG_MAX : rlimit;

        for (cur = ucounts; cur; cur = cur->ns->ucounts) {
                long used = get_rlimit_value(cur, type);

                if (used < 0 || used > limit)
                        return true;
                limit = get_userns_rlimit_max(cur->ns, type);
        }
        return false;
}

/*
 * Boot-time setup, run as a subsys_initcall.
 *
 * With CONFIG_SYSCTL it registers an empty "user" directory in the default
 * sysctl set and creates the sysctl table of init_user_ns; failure of either
 * step is fatal (BUG_ON).  It then inserts init_ucounts into the hash table
 * and adds 1 to its UCOUNT_RLIMIT_NPROC counter.
 */
static __init int user_namespace_sysctl_init(void)
{
#ifdef CONFIG_SYSCTL
        static struct ctl_table_header *user_header;
        static struct ctl_table empty[1];
        /*
         * It is necessary to register the user directory in the
         * default set so that registrations in the child sets work
         * properly.
         */
        user_header = register_sysctl_sz("user", empty, 0);
        /* the header is kept for the lifetime of the system; hide it from kmemleak */
        kmemleak_ignore(user_header);
        BUG_ON(!user_header);
        BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
        hlist_add_ucounts(&init_ucounts);
        /* NOTE(review): presumably accounts for the initial task - confirm */
        inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
        return 0;
}
subsys_initcall(user_namespace_sysctl_init);








































































    1 




   68 


   45 
    1 
   45 






































   68 





   68 




   68 
































  143 



















   68 

   68 














  134 






  134 









  245 



  246 




   24 
   24 























  135 
  135 








   14 


   14 
   14 
   14 



   14 
   14 



  134 




  136 
  136 



  135 
  135 



  135 
  135 








































   45 



   45 
   45 
   45 





































































































































































































  432 




  432 


























   32 



   32 











































































































































































































































































    1 

    1 

















    1 






















    1 






    2 




  245 





























  248 





















  248 








  246 













  246 








  246 












   24 
   24 
























































































   24 









































































































   68 




























   68 




















   68 


   67 
































































  130 
































  123 






























  127 







  127 
    1 




    1 




















  128 


















































































  143 






  143 
   11 
  118 



  143 










    4 



  143 













  143 


   14 








    8 



  128 


    6 










    5 














  140 































  139 





















  139 







    7 



















    8 


  134 













  134 







  134 

    5 








    8 







  127 





  135 





















  135 














    7 

  134 





  135 
  135 



  121 

  130 




























  156 


































  156 







  156 










  156 



  155 
   13 























  156 
























  127 
  143 






   26 




















  104 

























  143 




  130 




  156 




   31 
   31 































































































































































   78 
   10 






   63 










    1 






   68 
   68 
















   84 



   84 







    5 





   75 















    6 


   74 




   74 









   75 




   14 



   82 




   18 












































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2012 - Virtual Open Systems and Columbia University
 * Author: Christoffer Dall <c.dall@virtualopensystems.com>
 */

#include <linux/mman.h>
#include <linux/kvm_host.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <linux/sched/signal.h>
#include <trace/events/kvm.h>
#include <asm/pgalloc.h>
#include <asm/cacheflush.h>
#include <asm/kvm_arm.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_pgtable.h>
#include <asm/kvm_pkvm.h>
#include <asm/kvm_ras.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>

#include "trace.h"

/*
 * Hyp page-table handle used by the hyp mapping helpers in this file.
 * kvm_hyp_pgd_mutex is taken around updates to it and around changes to
 * io_map_base.
 */
static struct kvm_pgtable *hyp_pgtable;
static DEFINE_MUTEX(kvm_hyp_pgd_mutex);

/*
 * NOTE(review): presumably the bounds and vector address of the hyp identity
 * map; these are not referenced in this part of the file - confirm against
 * the code that initialises them.
 */
static unsigned long __ro_after_init hyp_idmap_start;
static unsigned long __ro_after_init hyp_idmap_end;
static phys_addr_t __ro_after_init hyp_idmap_vector;

u32 __ro_after_init __hyp_va_bits;

/* Private hyp VA ranges are carved out downwards from this address. */
static unsigned long __ro_after_init io_map_base;

/*
 * Select the page-table helper to call: @fn itself when protected KVM is not
 * enabled, otherwise the identifier formed by prefixing @fn with 'p'
 * (e.g. kvm_pgtable_stage2_unmap -> pkvm_pgtable_stage2_unmap).
 */
#define KVM_PGT_FN(fn)                (!is_protected_kvm_enabled() ? fn : p ## fn)

/*
 * Compute where the current step of a walk over [@addr, @end) stops when the
 * range is processed in @size-aligned pieces: the next @size boundary above
 * @addr, or @end if that comes first.
 */
static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end,
                                           phys_addr_t size)
{
        phys_addr_t next = ALIGN_DOWN(addr + size, size);

        /*
         * Both sides are decremented before comparing; presumably this makes
         * a value that wrapped to 0 compare as the largest address (assumes
         * phys_addr_t is unsigned).
         */
        if (next - 1 < end - 1)
                return next;

        return end;
}

/*
 * Same as __stage2_range_addr_end(), with the step size fixed to the granule
 * size of KVM_PGTABLE_MIN_BLOCK_LEVEL.
 */
static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end)
{
        return __stage2_range_addr_end(addr, end,
                                       kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL));
}

/*
 * Apply @fn to [@addr, @end) one chunk at a time, where each chunk ends at
 * the boundary returned by stage2_range_addr_end().
 *
 * When @resched is set, kvm_mmu_lock is released periodically between chunks
 * if the memory region is large. Otherwise, we may see kernel panics with
 * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, CONFIG_LOCKDEP.
 * Additionally, holding the lock too long will also starve other vCPUs. We
 * have to also make sure that the page tables are not freed while we
 * released the lock, which is why mmu->pgt is re-read on every iteration.
 *
 * Returns 0 once the whole range has been processed, -EINVAL if the page
 * table has gone away, or the first non-zero value returned by @fn.
 */
static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr,
                              phys_addr_t end,
                              int (*fn)(struct kvm_pgtable *, u64, u64),
                              bool resched)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        u64 next;
        int err;

        for (;;) {
                struct kvm_pgtable *pgt = mmu->pgt;

                if (!pgt)
                        return -EINVAL;

                next = stage2_range_addr_end(addr, end);
                err = fn(pgt, addr, next - addr);
                if (err)
                        return err;

                /* Last chunk done: no need to offer the lock again. */
                if (next == end)
                        return 0;

                if (resched)
                        cond_resched_rwlock_write(&kvm->mmu_lock);

                addr = next;
        }
}

/* stage2_apply_range() with rescheduling between chunks enabled. */
#define stage2_apply_range_resched(mmu, addr, end, fn)                        \
        stage2_apply_range(mmu, addr, end, fn, true)

/*
 * Get the maximum number of page-tables pages needed to split a range
 * of blocks into PAGE_SIZE PTEs. It assumes the range is already
 * mapped at level 2, or at level 1 if allowed.
 *
 * One table is counted per PMD_SIZE of @range, plus one per PUD_SIZE when
 * KVM_PGTABLE_MIN_BLOCK_LEVEL is below 2.
 */
static int kvm_mmu_split_nr_page_tables(u64 range)
{
        int nr_tables = DIV_ROUND_UP(range, PMD_SIZE);

        if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2)
                nr_tables += DIV_ROUND_UP(range, PUD_SIZE);

        return nr_tables;
}

/*
 * Tell the page-splitting loop whether it should drop mmu_lock: either
 * because a reschedule or lock break is pending, or because the split page
 * cache holds fewer objects than one chunk's worth of page tables.
 */
static bool need_split_memcache_topup_or_resched(struct kvm *kvm)
{
        u64 needed;

        if (need_resched())
                return true;

        if (rwlock_needbreak(&kvm->mmu_lock))
                return true;

        needed = kvm_mmu_split_nr_page_tables(kvm->arch.mmu.split_page_chunk_size);

        return kvm_mmu_memory_cache_nr_free_objects(&kvm->arch.mmu.split_page_cache) < needed;
}

/*
 * Split the block mappings covering [@addr, @end) in steps of the VM's
 * split_page_chunk_size. Must be called with mmu_lock held for write; the
 * lock is dropped and re-taken whenever the split page cache needs topping
 * up or a reschedule is due.
 *
 * Returns 0 on completion (or when the chunk size is 0), -EINVAL if the
 * stage-2 page table is gone, or the error from the cache top-up or the
 * split operation.
 */
static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr,
                                    phys_addr_t end)
{
        struct kvm_mmu_memory_cache *cache;
        u64 chunk_size, next;
        int nr_tables, err;

        lockdep_assert_held_write(&kvm->mmu_lock);

        chunk_size = kvm->arch.mmu.split_page_chunk_size;
        if (!chunk_size)
                return 0;

        nr_tables = kvm_mmu_split_nr_page_tables(chunk_size);
        cache = &kvm->arch.mmu.split_page_cache;

        for (;;) {
                struct kvm_pgtable *pgt;

                if (need_split_memcache_topup_or_resched(kvm)) {
                        write_unlock(&kvm->mmu_lock);
                        cond_resched();
                        /* Eager page splitting is best-effort. */
                        err = __kvm_mmu_topup_memory_cache(cache, nr_tables,
                                                           nr_tables);
                        write_lock(&kvm->mmu_lock);
                        if (err)
                                return err;
                }

                /* Re-read after the lock may have been dropped above. */
                pgt = kvm->arch.mmu.pgt;
                if (!pgt)
                        return -EINVAL;

                next = __stage2_range_addr_end(addr, end, chunk_size);
                err = KVM_PGT_FN(kvm_pgtable_stage2_split)(pgt, addr,
                                                           next - addr, cache);
                if (err)
                        return err;

                if (next == end)
                        return 0;

                addr = next;
        }
}

static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
        return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
}

/**
 * kvm_arch_flush_remote_tlbs() - flush all TLB entries of a VM
 * @kvm:        pointer to kvm structure.
 *
 * Calls into HYP to flush the VM's TLB entries: by pKVM handle when
 * protected KVM is enabled, by stage-2 MMU otherwise.
 *
 * Return: always 0.
 */
int kvm_arch_flush_remote_tlbs(struct kvm *kvm)
{
        if (!is_protected_kvm_enabled()) {
                kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu);
                return 0;
        }

        kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
        return 0;
}

/**
 * kvm_arch_flush_remote_tlbs_range() - flush TLB entries for a range of pages
 * @kvm:        pointer to kvm structure.
 * @gfn:        first guest frame number of the range.
 * @nr_pages:   number of pages in the range.
 *
 * Return: always 0.
 */
int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm,
                                      gfn_t gfn, u64 nr_pages)
{
        u64 ipa = gfn << PAGE_SHIFT;
        u64 len = nr_pages << PAGE_SHIFT;

        if (!is_protected_kvm_enabled()) {
                kvm_tlb_flush_vmid_range(&kvm->arch.mmu, ipa, len);
                return 0;
        }

        /*
         * Same hypercall as the non-ranged flush; the range itself is not
         * passed to it.
         */
        kvm_call_hyp_nvhe(__pkvm_tlb_flush_vmid, kvm->arch.pkvm.handle);
        return 0;
}

static bool kvm_is_device_pfn(unsigned long pfn)
{
        return !pfn_is_map_memory(pfn);
}

static void *stage2_memcache_zalloc_page(void *arg)
{
        struct kvm_mmu_memory_cache *mc = arg;
        void *virt;

        /* Allocated with __GFP_ZERO, so no need to zero */
        virt = kvm_mmu_memory_cache_alloc(mc);
        if (virt)
                kvm_account_pgtable_pages(virt, 1);
        return virt;
}

/* Allocate @size bytes of zeroed, accounted pages; returns NULL on failure. */
static void *kvm_host_zalloc_pages_exact(size_t size)
{
        void *pages;

        pages = alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO);

        return pages;
}

/*
 * Stage-2 flavour of kvm_host_zalloc_pages_exact(): on success the pages
 * are additionally accounted as page-table pages.
 */
static void *kvm_s2_zalloc_pages_exact(size_t size)
{
        void *pages = kvm_host_zalloc_pages_exact(size);

        if (!pages)
                return NULL;

        kvm_account_pgtable_pages(pages, (size >> PAGE_SHIFT));
        return pages;
}

/*
 * Undo kvm_s2_zalloc_pages_exact(): remove the pages from the page-table
 * accounting, then free them.
 */
static void kvm_s2_free_pages_exact(void *virt, size_t size)
{
        size_t nr_pages = size >> PAGE_SHIFT;

        kvm_account_pgtable_pages(virt, -nr_pages);
        free_pages_exact(virt, size);
}

/*
 * Declared ahead of its use in stage2_free_unlinked_table_rcu_cb().
 * NOTE(review): the initialised definition is presumably further down the
 * file - confirm.
 */
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops;

/*
 * RCU callback queued by stage2_free_unlinked_table(). Recovers the page
 * from its embedded rcu_head, reads back the level stashed in the page's
 * private field and frees the unlinked table.
 */
static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head)
{
        struct page *pg = container_of(head, struct page, rcu_head);
        s8 lvl = page_private(pg);

        KVM_PGT_FN(kvm_pgtable_stage2_free_unlinked)(&kvm_s2_mm_ops,
                                                     page_to_virt(pg), lvl);
}

/*
 * Defer freeing of an unlinked table to an RCU callback. The table's @level
 * is stored in the page's private field so the callback can retrieve it.
 */
static void stage2_free_unlinked_table(void *addr, s8 level)
{
        struct page *pg = virt_to_page(addr);

        set_page_private(pg, (unsigned long)level);

        call_rcu(&pg->rcu_head, stage2_free_unlinked_table_rcu_cb);
}

/* Take a reference on the page backing kernel address @addr. */
static void kvm_host_get_page(void *addr)
{
        struct page *pg = virt_to_page(addr);

        get_page(pg);
}

/* Drop a reference on the page backing kernel address @addr. */
static void kvm_host_put_page(void *addr)
{
        struct page *pg = virt_to_page(addr);

        put_page(pg);
}

static void kvm_s2_put_page(void *addr)
{
        struct page *p = virt_to_page(addr);
        /* Dropping last refcount, the page will be freed */
        if (page_count(p) == 1)
                kvm_account_pgtable_pages(addr, -1);
        put_page(p);
}

/* Return the reference count of the page backing kernel address @addr. */
static int kvm_host_page_count(void *addr)
{
        struct page *pg = virt_to_page(addr);

        return page_count(pg);
}

/* Translate a kernel virtual address to a physical address via __pa(). */
static phys_addr_t kvm_host_pa(void *addr)
{
        phys_addr_t pa = __pa(addr);

        return pa;
}

/* Translate a physical address to a kernel virtual address via __va(). */
static void *kvm_host_va(phys_addr_t phys)
{
        void *va = __va(phys);

        return va;
}

/*
 * Function-typed wrapper around __clean_dcache_guest_page() for @size bytes
 * at @va.
 */
static void clean_dcache_guest_page(void *va, size_t size)
{
        __clean_dcache_guest_page(va, size);
}

/*
 * Function-typed wrapper around __invalidate_icache_guest_page() for @size
 * bytes at @va.
 */
static void invalidate_icache_guest_page(void *va, size_t size)
{
        __invalidate_icache_guest_page(va, size);
}

/*
 * Unmapping vs dcache management:
 *
 * If a guest maps certain memory pages as uncached, all writes will
 * bypass the data cache and go directly to RAM.  However, the CPUs
 * can still speculate reads (not writes) and fill cache lines with
 * data.
 *
 * Those cache lines will be *clean* cache lines though, so a
 * clean+invalidate operation is equivalent to an invalidate
 * operation, because no cache lines are marked dirty.
 *
 * Those clean cache lines could be filled prior to an uncached write
 * by the guest, and the cache coherent IO subsystem would therefore
 * end up writing old data to disk.
 *
 * This is why right after unmapping a page/section and invalidating
 * the corresponding TLBs, we flush to make sure the IO subsystem will
 * never hit in the cache.
 *
 * This is all avoided on systems that have ARM64_HAS_STAGE2_FWB, as
 * we then fully enforce cacheability of RAM, no matter what the guest
 * does.
 */
/**
 * __unmap_stage2_range -- Clear stage2 page table entries to unmap a range
 * @mmu:   The KVM stage-2 MMU pointer
 * @start: The intermediate physical base address of the range to unmap
 * @size:  The size of the area to unmap
 * @may_block: Whether or not we are permitted to block
 *
 * Clear a range of stage-2 mappings, lowering the various ref-counts.  Must
 * be called while holding mmu_lock (unless for freeing the stage2 pgd before
 * destroying the VM), otherwise another faulting VCPU may come in and mess
 * with things behind our backs.
 *
 * A @size that is not page aligned, or a failure of the unmap walk, only
 * triggers a warning; nothing is reported to the caller.
 */
static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size,
                                 bool may_block)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        phys_addr_t end = start + size;
        int err;

        lockdep_assert_held_write(&kvm->mmu_lock);
        WARN_ON(size & ~PAGE_MASK);

        err = stage2_apply_range(mmu, start, end,
                                 KVM_PGT_FN(kvm_pgtable_stage2_unmap),
                                 may_block);
        WARN_ON(err);
}

/*
 * Non-static entry point for unmapping @size bytes of stage-2 mappings
 * starting at @start; forwards to __unmap_stage2_range() and shares its
 * locking requirements.
 */
void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start,
                            u64 size, bool may_block)
{
        __unmap_stage2_range(mmu, start, size, may_block);
}

/*
 * Run the stage-2 flush helper over [@addr, @end), allowing the walk to
 * reschedule between chunks. The result of the walk is not reported.
 */
void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
        stage2_apply_range(mmu, addr, end,
                           KVM_PGT_FN(kvm_pgtable_stage2_flush), true);
}

/* Flush the stage-2 range covering every page of @memslot. */
static void stage2_flush_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot)
{
        phys_addr_t first = memslot->base_gfn << PAGE_SHIFT;
        phys_addr_t limit = first + PAGE_SIZE * memslot->npages;

        kvm_stage2_flush_range(&kvm->arch.mmu, first, limit);
}

/**
 * stage2_flush_vm - Invalidate cache for pages mapped in stage 2
 * @kvm: The struct kvm pointer
 *
 * Go through the stage 2 page tables and invalidate any cache lines
 * backing memory already mapped to the VM. Every memslot is flushed,
 * followed by the nested stage-2 state, all under the SRCU read lock and
 * with mmu_lock held for write.
 */
static void stage2_flush_vm(struct kvm *kvm)
{
        struct kvm_memory_slot *slot;
        struct kvm_memslots *all_slots;
        int srcu_idx, bkt;

        srcu_idx = srcu_read_lock(&kvm->srcu);
        write_lock(&kvm->mmu_lock);

        all_slots = kvm_memslots(kvm);
        kvm_for_each_memslot(slot, bkt, all_slots)
                stage2_flush_memslot(kvm, slot);

        kvm_nested_s2_flush(kvm);

        write_unlock(&kvm->mmu_lock);
        srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/**
 * free_hyp_pgds - free Hyp-mode page tables
 *
 * Destroys and frees hyp_pgtable if it exists, then clears the pointer.
 * Serialised by kvm_hyp_pgd_mutex.
 */
void __init free_hyp_pgds(void)
{
        mutex_lock(&kvm_hyp_pgd_mutex);

        if (!hyp_pgtable)
                goto out;

        kvm_pgtable_hyp_destroy(hyp_pgtable);
        kfree(hyp_pgtable);
        hyp_pgtable = NULL;
out:
        mutex_unlock(&kvm_hyp_pgd_mutex);
}

/*
 * Tell whether the host manages the hyp mappings itself. This is not the
 * case when the kernel runs in hyp mode, nor once protected mode has been
 * initialised.
 */
static bool kvm_host_owns_hyp_mappings(void)
{
        if (is_kernel_in_hyp_mode())
                return false;

        if (static_branch_likely(&kvm_protected_mode_initialized))
                return false;

        if (!hyp_pgtable) {
                /*
                 * This can happen at boot time when __create_hyp_mappings()
                 * is called after the hyp protection has been enabled, but
                 * the static key has not been flipped yet.
                 */
                if (is_protected_kvm_enabled())
                        return false;
        }

        /* From here on the host is the owner; a missing table is a bug. */
        WARN_ON(!hyp_pgtable);

        return true;
}

/*
 * Map @size bytes of physical memory at @phys to hyp VA @start with
 * protection @prot, under kvm_hyp_pgd_mutex.
 *
 * Warns and returns -EINVAL when the host does not own the hyp mappings;
 * otherwise returns the result of kvm_pgtable_hyp_map().
 */
int __create_hyp_mappings(unsigned long start, unsigned long size,
                          unsigned long phys, enum kvm_pgtable_prot prot)
{
        int ret;

        if (WARN_ON(!kvm_host_owns_hyp_mappings()))
                return -EINVAL;

        mutex_lock(&kvm_hyp_pgd_mutex);
        ret = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot);
        mutex_unlock(&kvm_hyp_pgd_mutex);

        return ret;
}

/*
 * Resolve a kernel virtual address to its physical address. vmalloc
 * addresses are looked up page by page; any other address must satisfy
 * virt_addr_valid() (BUG otherwise) and is converted with __pa().
 */
static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
{
        if (is_vmalloc_addr(kaddr))
                return page_to_phys(vmalloc_to_page(kaddr)) +
                       offset_in_page(kaddr);

        BUG_ON(!virt_addr_valid(kaddr));
        return __pa(kaddr);
}

/*
 * Tracking entry for one page frame shared with the hypervisor.
 *
 * @pfn is the lookup key, @count is the number of outstanding shares of that
 * pfn (incremented by share_pfn_hyp(), decremented by unshare_pfn_hyp()) and
 * @node links the entry into the hyp_shared_pfns red-black tree.
 */
struct hyp_shared_pfn {
        u64 pfn;
        int count;
        struct rb_node node;
};

/* hyp_shared_pfns_lock is held around every lookup and update of the tree. */
static DEFINE_MUTEX(hyp_shared_pfns_lock);
static struct rb_root hyp_shared_pfns = RB_ROOT;

/*
 * Look up @pfn in the hyp_shared_pfns tree.
 *
 * Returns the matching entry, or NULL when there is none. In the NULL case
 * *@node and *@parent describe the empty link and its parent, ready to be
 * passed to rb_link_node(). On a hit, *@node is the link pointing at the
 * entry and *@parent is the entry's own node.
 */
static struct hyp_shared_pfn *find_shared_pfn(u64 pfn, struct rb_node ***node,
                                              struct rb_node **parent)
{
        struct rb_node **link = &hyp_shared_pfns.rb_node;
        struct hyp_shared_pfn *found = NULL;
        struct rb_node *last = NULL;

        while (*link) {
                struct hyp_shared_pfn *cur;

                cur = container_of(*link, struct hyp_shared_pfn, node);
                last = *link;

                /* Entries with a smaller pfn than @pfn send us left. */
                if (cur->pfn < pfn) {
                        link = &(*link)->rb_left;
                } else if (cur->pfn > pfn) {
                        link = &(*link)->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }

        *node = link;
        *parent = last;

        return found;
}

/*
 * Share one page frame with the hypervisor, or bump its share count if it
 * is already tracked. The hypercall is only issued for the first share.
 *
 * Returns 0 on success, -ENOMEM if the tracking entry cannot be allocated,
 * or the result of the __pkvm_host_share_hyp hypercall.
 *
 * NOTE(review): the new entry is inserted before the hypercall and stays in
 * the tree even if the hypercall fails - confirm this is intended.
 */
static int share_pfn_hyp(u64 pfn)
{
        struct rb_node **link, *parent;
        struct hyp_shared_pfn *entry;
        int ret = 0;

        mutex_lock(&hyp_shared_pfns_lock);

        entry = find_shared_pfn(pfn, &link, &parent);
        if (entry) {
                entry->count++;
        } else {
                entry = kzalloc(sizeof(*entry), GFP_KERNEL);
                if (entry) {
                        entry->pfn = pfn;
                        entry->count = 1;
                        rb_link_node(&entry->node, parent, link);
                        rb_insert_color(&entry->node, &hyp_shared_pfns);
                        ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, pfn, 1);
                } else {
                        ret = -ENOMEM;
                }
        }

        mutex_unlock(&hyp_shared_pfns_lock);

        return ret;
}

/*
 * Drop one share of @pfn. When the count reaches zero the tracking entry is
 * removed and freed, and the page is unshared from the hypervisor.
 *
 * Returns 0 while shares remain, -ENOENT (with a warning) if @pfn is not
 * tracked, or the result of the __pkvm_host_unshare_hyp hypercall.
 */
static int unshare_pfn_hyp(u64 pfn)
{
        struct rb_node **link, *parent;
        struct hyp_shared_pfn *entry;
        int ret = 0;

        mutex_lock(&hyp_shared_pfns_lock);

        entry = find_shared_pfn(pfn, &link, &parent);
        if (WARN_ON(!entry)) {
                ret = -ENOENT;
        } else if (--entry->count == 0) {
                /* Last share gone: forget the pfn and tell the hypervisor. */
                rb_erase(&entry->node, &hyp_shared_pfns);
                kfree(entry);
                ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_hyp, pfn, 1);
        }

        mutex_unlock(&hyp_shared_pfns_lock);

        return ret;
}

/*
 * Make the kernel memory in [@from, @to) accessible to the hypervisor.
 *
 * Nothing to do when the kernel itself runs in hyp mode. If the host still
 * owns the hyp mappings, the range is mapped with create_hyp_mappings();
 * otherwise every page frame touched by the range is shared individually.
 *
 * Returns 0 on success, -EINVAL for vmalloc/module addresses, or the first
 * error from the mapping or sharing step. Pages shared before a failure are
 * left shared.
 */
int kvm_share_hyp(void *from, void *to)
{
        phys_addr_t pa, limit;
        int err;

        if (is_kernel_in_hyp_mode())
                return 0;

        /*
         * The share hcall maps things in the 'fixed-offset' region of the hyp
         * VA space, so we can only share physically contiguous data-structures
         * for now.
         */
        if (is_vmalloc_or_module_addr(from) || is_vmalloc_or_module_addr(to))
                return -EINVAL;

        if (kvm_host_owns_hyp_mappings())
                return create_hyp_mappings(from, to, PAGE_HYP);

        pa = ALIGN_DOWN(__pa(from), PAGE_SIZE);
        limit = PAGE_ALIGN(__pa(to));

        while (pa < limit) {
                err = share_pfn_hyp(__phys_to_pfn(pa));
                if (err)
                        return err;

                pa += PAGE_SIZE;
        }

        return 0;
}

/*
 * Undo kvm_share_hyp() for [@from, @to): drop one share for every page frame
 * touched by the range, warning on each failure.
 *
 * Does nothing when the kernel runs in hyp mode, when the host owns the hyp
 * mappings, or when @from is NULL.
 */
void kvm_unshare_hyp(void *from, void *to)
{
        phys_addr_t pa, limit;

        if (is_kernel_in_hyp_mode())
                return;

        if (kvm_host_owns_hyp_mappings())
                return;

        if (!from)
                return;

        pa = ALIGN_DOWN(__pa(from), PAGE_SIZE);
        limit = PAGE_ALIGN(__pa(to));

        while (pa < limit) {
                WARN_ON(unshare_pfn_hyp(__phys_to_pfn(pa)));
                pa += PAGE_SIZE;
        }
}

/**
 * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
 * @from:        The virtual kernel start address of the range
 * @to:                The virtual kernel end address of the range (exclusive)
 * @prot:        The protection to be applied to this range
 *
 * The same virtual address as the kernel virtual address is also used
 * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
 * physical pages.
 *
 * The range is widened to page boundaries and mapped one page at a time.
 *
 * Return: 0 on success or when the kernel runs in hyp mode, -EPERM when the
 * host does not own the hyp mappings, or the first mapping error.
 */
int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot)
{
        unsigned long start = kern_hyp_va((unsigned long)from);
        unsigned long end = kern_hyp_va((unsigned long)to);
        unsigned long va;
        int err;

        if (is_kernel_in_hyp_mode())
                return 0;

        if (!kvm_host_owns_hyp_mappings())
                return -EPERM;

        start &= PAGE_MASK;
        end = PAGE_ALIGN(end);

        va = start;
        while (va < end) {
                phys_addr_t pa = kvm_kaddr_to_phys(from + va - start);

                err = __create_hyp_mappings(va, PAGE_SIZE, pa, prot);
                if (err)
                        return err;

                va += PAGE_SIZE;
        }

        return 0;
}

/*
 * Commit @base as the new bottom of the private hyp VA area. The caller
 * must hold kvm_hyp_pgd_mutex.
 *
 * Returns 0 on success, -EINVAL if @base is not page aligned, or -ENOMEM if
 * moving io_map_base to @base would flip BIT(VA_BITS - 1).
 */
static int __hyp_alloc_private_va_range(unsigned long base)
{
        unsigned long changed_bits;

        lockdep_assert_held(&kvm_hyp_pgd_mutex);

        if (!PAGE_ALIGNED(base))
                return -EINVAL;

        /*
         * Verify that BIT(VA_BITS - 1) hasn't been flipped by
         * allocating the new area, as it would indicate we've
         * overflowed the idmap/IO address range.
         */
        changed_bits = base ^ io_map_base;
        if (changed_bits & BIT(VA_BITS - 1))
                return -ENOMEM;

        io_map_base = base;

        return 0;
}

/**
 * hyp_alloc_private_va_range - Allocates a private VA range.
 * @size:        The size of the VA range to reserve.
 * @haddr:        The hypervisor virtual start address of the allocation.
 *
 * The private virtual address (VA) range is allocated directly below
 * io_map_base, with @size rounded up to a multiple of PAGE_SIZE.
 * @haddr is only written on success.
 *
 * Return: 0 on success or negative error code on failure.
 */
int hyp_alloc_private_va_range(size_t size, unsigned long *haddr)
{
        unsigned long va;
        int err;

        mutex_lock(&kvm_hyp_pgd_mutex);

        /*
         * This assumes that we have enough space below the idmap
         * page to allocate our VAs. If not, the check in
         * __hyp_alloc_private_va_range() will kick. A potential
         * alternative would be to detect that overflow and switch
         * to an allocation above the idmap.
         *
         * The allocated size is always a multiple of PAGE_SIZE.
         */
        va = io_map_base - PAGE_ALIGN(size);
        err = __hyp_alloc_private_va_range(va);

        mutex_unlock(&kvm_hyp_pgd_mutex);

        if (err)
                return err;

        *haddr = va;
        return 0;
}

/*
 * Map @size bytes at @phys_addr into the private hyp VA area with @prot and
 * return the resulting hyp address through @haddr.
 *
 * When the host does not own the hyp mappings the work is delegated to the
 * __pkvm_create_private_mapping hypercall. Otherwise a VA range large
 * enough for the page-rounded region is reserved and mapped here, and
 * @haddr keeps the sub-page offset of @phys_addr.
 *
 * Returns 0 on success or a negative error code; @haddr is untouched on
 * failure.
 */
static int __create_hyp_private_mapping(phys_addr_t phys_addr, size_t size,
                                        unsigned long *haddr,
                                        enum kvm_pgtable_prot prot)
{
        unsigned long va;
        int err;

        if (!kvm_host_owns_hyp_mappings()) {
                va = kvm_call_hyp_nvhe(__pkvm_create_private_mapping,
                                       phys_addr, size, prot);
                if (IS_ERR_VALUE(va))
                        return va;

                *haddr = va;
                return 0;
        }

        size = PAGE_ALIGN(size + offset_in_page(phys_addr));

        err = hyp_alloc_private_va_range(size, &va);
        if (!err)
                err = __create_hyp_mappings(va, size, phys_addr, prot);
        if (err)
                return err;

        *haddr = va + offset_in_page(phys_addr);
        return 0;
}

/*
 * Reserve a private hyp VA window of twice NVHE_STACK_SIZE and map the
 * stack at @phys_addr into its upper half, leaving the lower half unbacked
 * as a guard.
 *
 * On return from the mapping step *@haddr is set to the top of the window,
 * even if the mapping failed. It is left untouched if the VA reservation
 * fails. Returns 0 on success or a negative error code.
 */
int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr)
{
        unsigned long va;
        size_t span;
        int err;

        mutex_lock(&kvm_hyp_pgd_mutex);

        /*
         * Efficient stack verification using the NVHE_STACK_SHIFT bit implies
         * an alignment of our allocation on the order of the size.
         */
        span = NVHE_STACK_SIZE * 2;
        va = ALIGN_DOWN(io_map_base - span, span);
        err = __hyp_alloc_private_va_range(va);

        mutex_unlock(&kvm_hyp_pgd_mutex);

        if (err) {
                kvm_err("Cannot allocate hyp stack guard page\n");
                return err;
        }

        /*
         * Since the stack grows downwards, map the stack to the page
         * at the higher address and leave the lower guard page
         * unbacked.
         *
         * Any valid stack address now has the NVHE_STACK_SHIFT bit as 1
         * and addresses corresponding to the guard page have the
         * NVHE_STACK_SHIFT bit as 0 - this is used for overflow detection.
         */
        err = __create_hyp_mappings(va + NVHE_STACK_SIZE, NVHE_STACK_SIZE,
                                    phys_addr, PAGE_HYP);
        if (err)
                kvm_err("Cannot map hyp stack\n");

        *haddr = va + span;

        return err;
}

/**
 * create_hyp_io_mappings - Map IO into both kernel and HYP
 * @phys_addr:        The physical start address which gets mapped
 * @size:        Size of the region being mapped
 * @kaddr:        Kernel VA for this mapping
 * @haddr:        HYP VA for this mapping
 *
 * When the kernel runs in hyp mode the kernel mapping doubles as the HYP
 * mapping. If the HYP mapping cannot be created, the kernel mapping is torn
 * down again and both outputs are set to NULL.
 *
 * Return: 0 on success, -EPERM when protected KVM is enabled (outputs
 * untouched), -ENOMEM if ioremap() fails, or the private mapping error.
 */
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
                           void __iomem **kaddr,
                           void __iomem **haddr)
{
        unsigned long hyp_va;
        int err;

        if (is_protected_kvm_enabled())
                return -EPERM;

        *kaddr = ioremap(phys_addr, size);
        if (!*kaddr)
                return -ENOMEM;

        if (is_kernel_in_hyp_mode()) {
                *haddr = *kaddr;
                return 0;
        }

        err = __create_hyp_private_mapping(phys_addr, size,
                                           &hyp_va, PAGE_HYP_DEVICE);
        if (!err) {
                *haddr = (void __iomem *)hyp_va;
                return 0;
        }

        /* Roll back the kernel side so the caller sees no half mapping. */
        iounmap(*kaddr);
        *kaddr = NULL;
        *haddr = NULL;

        return err;
}

/**
 * create_hyp_exec_mappings - Map an executable range into HYP
 * @phys_addr:        The physical start address which gets mapped
 * @size:        Size of the region being mapped
 * @haddr:        HYP VA for this mapping (output, NULL on failure)
 *
 * Must not be called when the kernel itself runs in hyp mode.
 * Returns 0 on success or the error from the private HYP mapping.
 */
int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size,
                             void **haddr)
{
        unsigned long hyp_va;
        int err;

        BUG_ON(is_kernel_in_hyp_mode());

        err = __create_hyp_private_mapping(phys_addr, size,
                                           &hyp_va, PAGE_HYP_EXEC);
        if (!err) {
                *haddr = (void *)hyp_va;
                return 0;
        }

        *haddr = NULL;
        return err;
}

/*
 * Page-table callbacks used by get_user_mapping_size() when it walks the
 * userspace page tables with the generic KVM page-table walker.
 */
static struct kvm_pgtable_mm_ops kvm_user_mm_ops = {
        /* We shouldn't need any other callback to walk the PT */
        .phys_to_virt                = kvm_host_va,
};

/*
 * Look up the leaf entry covering @addr in the page tables of @kvm->mm and
 * return the size in bytes of the mapping at that level.
 *
 * Returns a negative error code on failure; -EAGAIN indicates that the
 * leaf entry was not valid and that the fault should be replayed.
 */
static int get_user_mapping_size(struct kvm *kvm, u64 addr)
{
        struct kvm_pgtable pgt = {
                .pgd                = (kvm_pteref_t)kvm->mm->pgd,
                .ia_bits        = vabits_actual,
                .start_level        = (KVM_PGTABLE_LAST_LEVEL -
                                   ARM64_HW_PGTABLE_LEVELS(pgt.ia_bits) + 1),
                .mm_ops                = &kvm_user_mm_ops,
        };
        s8 leaf_level = S8_MAX;
        kvm_pte_t leaf = 0;        /* Keep GCC quiet... */
        unsigned long irqflags;
        int err;

        /*
         * Walk with IRQs disabled: teardown of the userspace page tables
         * relies on IPI-ing threads, so this hazards against a concurrent
         * teardown.
         */
        local_irq_save(irqflags);
        err = kvm_pgtable_get_leaf(&pgt, addr, &leaf, &leaf_level);
        local_irq_restore(irqflags);

        if (err)
                return err;

        /*
         * The walk reported success but left the level outside the valid
         * range? Something went deeply wrong...
         */
        if (WARN_ON(leaf_level > KVM_PGTABLE_LAST_LEVEL) ||
            WARN_ON(leaf_level < KVM_PGTABLE_FIRST_LEVEL))
                return -EFAULT;

        /* Oops, the userspace PTs are gone... Replay the fault */
        if (!kvm_pte_valid(leaf))
                return -EAGAIN;

        return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(leaf_level));
}

/*
 * Memory-management callbacks handed to kvm_pgtable_stage2_init() by
 * kvm_init_stage2_mmu() for the stage-2 page tables.
 */
static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = {
        .zalloc_page                = stage2_memcache_zalloc_page,
        .zalloc_pages_exact        = kvm_s2_zalloc_pages_exact,
        .free_pages_exact        = kvm_s2_free_pages_exact,
        .free_unlinked_table        = stage2_free_unlinked_table,
        .get_page                = kvm_host_get_page,
        .put_page                = kvm_s2_put_page,
        .page_count                = kvm_host_page_count,
        .phys_to_virt                = kvm_host_va,
        .virt_to_phys                = kvm_host_pa,
        .dcache_clean_inval_poc        = clean_dcache_guest_page,
        .icache_inval_pou        = invalidate_icache_guest_page,
};

/*
 * Validate the IPA size encoded in the VM @type and program @mmu->vtcr
 * accordingly.
 *
 * With protected KVM the IPA limit is always used. Otherwise a non-zero
 * requested size must lie within [ARM64_MIN_PARANGE_BITS, IPA limit], and
 * a zero size selects the KVM_PHYS_SHIFT default (rejected if it exceeds
 * the limit).
 *
 * Returns 0 on success or -EINVAL for an unsupported @type.
 */
static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
{
        u32 ipa_limit = get_kvm_ipa_limit();
        u32 phys_shift;
        u64 mmfr0, mmfr1;

        /* Only the IPA size bits may be set in the VM type */
        if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
                return -EINVAL;

        if (is_protected_kvm_enabled()) {
                phys_shift = ipa_limit;
        } else {
                phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);

                if (!phys_shift) {
                        /* No explicit size requested: fall back to the default */
                        phys_shift = KVM_PHYS_SHIFT;
                        if (phys_shift > ipa_limit) {
                                pr_warn_once("%s using unsupported default IPA limit, upgrade your VMM\n",
                                             current->comm);
                                return -EINVAL;
                        }
                } else if (phys_shift > ipa_limit ||
                           phys_shift < ARM64_MIN_PARANGE_BITS) {
                        return -EINVAL;
                }
        }

        mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
        mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
        mmu->vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);

        return 0;
}

/**
 * kvm_init_stage2_mmu - Initialise a S2 MMU structure
 * @kvm:        The pointer to the KVM structure
 * @mmu:        The pointer to the s2 MMU structure
 * @type:        The machine type of the virtual machine
 *
 * Allocates only the stage-2 HW PGD level table(s).
 * Note we don't need locking here as this is only called in two cases:
 *
 * - when the VM is created, which can't race against anything
 *
 * - when secondary kvm_s2_mmu structures are initialised for NV
 *   guests, and the caller must hold kvm->lock as this is called on a
 *   per-vcpu basis.
 *
 * Return: 0 on success (including the case of an already initialised
 * nested MMU), -EINVAL if the canonical MMU is already initialised or
 * @type is invalid, -ENOMEM on allocation failure, or the error returned
 * by the page-table initialisation. On failure @mmu->pgt is left NULL.
 */
int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type)
{
        int cpu, err;
        struct kvm_pgtable *pgt;

        /*
         * If we already have our page tables in place, and that the
         * MMU context is the canonical one, we have a bug somewhere,
         * as this is only supposed to ever happen once per VM.
         *
         * Otherwise, we're building nested page tables, and that's
         * probably because userspace called KVM_ARM_VCPU_INIT more
         * than once on the same vcpu. Since that's actually legal,
         * don't kick a fuss and leave gracefully.
         */
        if (mmu->pgt != NULL) {
                if (kvm_is_nested_s2_mmu(kvm, mmu))
                        return 0;

                kvm_err("kvm_arch already initialized?\n");
                return -EINVAL;
        }

        err = kvm_init_ipa_range(mmu, type);
        if (err)
                return err;

        pgt = kzalloc(sizeof(*pgt), GFP_KERNEL_ACCOUNT);
        if (!pgt)
                return -ENOMEM;

        mmu->arch = &kvm->arch;
        err = KVM_PGT_FN(kvm_pgtable_stage2_init)(pgt, mmu, &kvm_s2_mm_ops);
        if (err)
                goto out_free_pgtable;

        mmu->pgt = pgt;
        /* Nothing below is set up when protected KVM is enabled */
        if (is_protected_kvm_enabled())
                return 0;

        mmu->last_vcpu_ran = alloc_percpu(typeof(*mmu->last_vcpu_ran));
        if (!mmu->last_vcpu_ran) {
                err = -ENOMEM;
                goto out_destroy_pgtable;
        }

        for_each_possible_cpu(cpu)
                *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1;

         /* The eager page splitting is disabled by default */
        mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT;
        mmu->split_page_cache.gfp_zero = __GFP_ZERO;

        mmu->pgd_phys = __pa(pgt->pgd);

        if (kvm_is_nested_s2_mmu(kvm, mmu))
                kvm_init_nested_s2_mmu(mmu);

        return 0;

out_destroy_pgtable:
        /*
         * mmu->pgt was already published above. Clear it so that the MMU
         * does not keep a dangling pointer to the table freed below; a
         * later kvm_free_stage2_pgd() on this MMU would otherwise destroy
         * and free it a second time.
         */
        mmu->pgt = NULL;
        KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
out_free_pgtable:
        kfree(pgt);
        return err;
}

void kvm_uninit_stage2_mmu(struct kvm *kvm)
{
        kvm_free_stage2_pgd(&kvm->arch.mmu);
        kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
}

/*
 * Unmap from stage-2 every part of @memslot that is backed by a VMA
 * without VM_PFNMAP in the current process.
 */
static void stage2_unmap_memslot(struct kvm *kvm,
                                 struct kvm_memory_slot *memslot)
{
        phys_addr_t slot_gpa = memslot->base_gfn << PAGE_SHIFT;
        phys_addr_t slot_bytes = PAGE_SIZE * memslot->npages;
        hva_t cursor = memslot->userspace_addr;
        hva_t reg_end = cursor + slot_bytes;

        /*
         * A memory region could potentially cover multiple VMAs, and any holes
         * between them, so iterate over all of them to find out if we should
         * unmap any of them.
         *
         *     +--------------------------------------------+
         * +---------------+----------------+   +----------------+
         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
         * +---------------+----------------+   +----------------+
         *     |               memory region                |
         *     +--------------------------------------------+
         */
        for (;;) {
                struct vm_area_struct *vma;
                hva_t lo, hi;

                vma = find_vma_intersection(current->mm, cursor, reg_end);
                if (!vma)
                        return;

                /* Clamp this VMA to the part overlapping the memory region */
                lo = max(cursor, vma->vm_start);
                hi = min(reg_end, vma->vm_end);

                if (!(vma->vm_flags & VM_PFNMAP)) {
                        gpa_t gpa = slot_gpa + (lo - memslot->userspace_addr);

                        kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, hi - lo, true);
                }

                /* Resume the search right after this VMA */
                cursor = hi;
                if (cursor >= reg_end)
                        return;
        }
}

/**
 * stage2_unmap_vm - Unmap Stage-2 RAM mappings
 * @kvm: The struct kvm pointer
 *
 * Walk all memslots and unmap any regular RAM backing memory that is
 * already mapped to the VM, then unmap the nested stage-2 MMUs as well.
 *
 * Takes, in this order, the kvm->srcu read lock, the mmap read lock of
 * the current process and kvm->mmu_lock for writing.
 */
void stage2_unmap_vm(struct kvm *kvm)
{
        struct mm_struct *mm = current->mm;
        struct kvm_memory_slot *memslot;
        struct kvm_memslots *slots;
        int srcu_idx, bkt;

        srcu_idx = srcu_read_lock(&kvm->srcu);
        mmap_read_lock(mm);
        write_lock(&kvm->mmu_lock);

        slots = kvm_memslots(kvm);
        kvm_for_each_memslot(memslot, bkt, slots)
                stage2_unmap_memslot(kvm, memslot);

        kvm_nested_s2_unmap(kvm, true);

        /* Drop the locks in the reverse order of acquisition */
        write_unlock(&kvm->mmu_lock);
        mmap_read_unlock(mm);
        srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/*
 * Detach the stage-2 page table from @mmu and free it.
 *
 * The table is unhooked from @mmu (and the per-CPU last_vcpu_ran data
 * freed) with kvm->mmu_lock held for writing; the page table itself is
 * destroyed after the lock has been dropped. Does nothing if @mmu has no
 * page table.
 */
void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
{
        struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
        struct kvm_pgtable *old_pgt;

        write_lock(&kvm->mmu_lock);
        old_pgt = mmu->pgt;
        if (old_pgt != NULL) {
                mmu->pgd_phys = 0;
                mmu->pgt = NULL;
                free_percpu(mmu->last_vcpu_ran);
        }
        write_unlock(&kvm->mmu_lock);

        if (old_pgt == NULL)
                return;

        KVM_PGT_FN(kvm_pgtable_stage2_destroy)(old_pgt);
        kfree(old_pgt);
}

static void hyp_mc_free_fn(void *addr, void *mc)
{
        struct kvm_hyp_memcache *memcache = mc;

        if (memcache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
                kvm_account_pgtable_pages(addr, -1);

        free_page((unsigned long)addr);
}

/*
 * Allocation callback for the hyp memcache: allocate one page and, when
 * the memcache (@mc) has HYP_MEMCACHE_ACCOUNT_STAGE2 set, account it as
 * a page-table page.
 *
 * Returns the page address, or NULL if the allocation failed.
 */
static void *hyp_mc_alloc_fn(void *mc)
{
        struct kvm_hyp_memcache *cache = mc;
        void *page_va = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);

        if (!page_va)
                return page_va;

        if (cache->flags & HYP_MEMCACHE_ACCOUNT_STAGE2)
                kvm_account_pgtable_pages(page_va, 1);

        return page_va;
}

/*
 * Release everything held by the hyp memcache @mc: its pkvm mapping
 * buffer and all cached pages. Only does anything when protected KVM is
 * enabled.
 */
void free_hyp_memcache(struct kvm_hyp_memcache *mc)
{
        if (is_protected_kvm_enabled()) {
                kfree(mc->mapping);
                __free_hyp_memcache(mc, hyp_mc_free_fn, kvm_host_va, mc);
        }
}

int topup_hyp_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages)
{
        if (!is_protected_kvm_enabled())
                return 0;

        if (!mc->mapping) {
                mc->mapping = kzalloc(sizeof(struct pkvm_mapping), GFP_KERNEL_ACCOUNT);
                if (!mc->mapping)
                        return -ENOMEM;
        }

        return __topup_hyp_memcache(mc, min_pages, hyp_mc_alloc_fn,
                                    kvm_host_pa, mc);
}

/**
 * kvm_phys_addr_ioremap - map a device range to guest IPA
 *
 * @kvm:        The KVM pointer
 * @guest_ipa:        The IPA at which to insert the mapping
 * @pa:                The physical address of the device
 * @size:        The size of the mapping
 * @writable:   Whether or not to create a writable mapping
 *
 * The range is mapped one page at a time with device attributes; the
 * memory cache is topped up before each page and kvm->mmu_lock is only
 * held for writing around each individual page mapping.
 *
 * Return: 0 on success, -EPERM when protected KVM is enabled, or the
 * first error hit while topping up the cache or installing a mapping.
 */
int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
                          phys_addr_t pa, unsigned long size, bool writable)
{
        struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
        struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
        struct kvm_pgtable *pgt = mmu->pgt;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_DEVICE |
                                     KVM_PGTABLE_PROT_R;
        phys_addr_t ipa, ipa_end;
        int ret = 0;

        if (is_protected_kvm_enabled())
                return -EPERM;

        if (writable)
                prot |= KVM_PGTABLE_PROT_W;

        /* Widen the range so that it starts on a page boundary */
        size += offset_in_page(guest_ipa);
        guest_ipa &= PAGE_MASK;
        ipa_end = guest_ipa + size;

        for (ipa = guest_ipa; ipa < ipa_end; ipa += PAGE_SIZE, pa += PAGE_SIZE) {
                ret = kvm_mmu_topup_memory_cache(&cache,
                                                 kvm_mmu_cache_min_pages(mmu));
                if (ret)
                        break;

                write_lock(&kvm->mmu_lock);
                ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, ipa, PAGE_SIZE,
                                 pa, prot, &cache, 0);
                write_unlock(&kvm->mmu_lock);
                if (ret)
                        break;
        }

        kvm_mmu_free_memory_cache(&cache);
        return ret;
}

/**
 * kvm_stage2_wp_range() - write protect stage2 memory region range
 * @mmu:        The KVM stage-2 MMU pointer
 * @addr:        Start address of range
 * @end:        End address of range
 *
 * Thin wrapper that applies the stage-2 write-protect page-table
 * operation to the range via stage2_apply_range_resched(). The callers
 * visible in this file hold kvm->mmu_lock for writing around the call.
 */
void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end)
{
        stage2_apply_range_resched(mmu, addr, end, KVM_PGT_FN(kvm_pgtable_stage2_wrprotect));
}

/**
 * kvm_mmu_wp_memory_region() - write protect stage 2 entries for memory slot
 * @kvm:        The KVM pointer
 * @slot:        The memory slot to write protect
 *
 * Called to start logging dirty pages after memory region
 * KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
 * all present PUD, PMD and PTEs are write protected in the memory region.
 * Afterwards read of dirty page log can be called.
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot)
{
        struct kvm_memory_slot *memslot;
        phys_addr_t start, end;

        memslot = id_to_memslot(kvm_memslots(kvm), slot);
        if (WARN_ON_ONCE(!memslot))
                return;

        /* IPA range covered by the memslot */
        start = memslot->base_gfn << PAGE_SHIFT;
        end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

        write_lock(&kvm->mmu_lock);
        kvm_stage2_wp_range(&kvm->arch.mmu, start, end);
        kvm_nested_s2_wp(kvm);
        write_unlock(&kvm->mmu_lock);

        /* The TLB flush is issued once the lock has been dropped */
        kvm_flush_remote_tlbs_memslot(kvm, memslot);
}

/**
 * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE
 *                                   pages for memory slot
 * @kvm:        The KVM pointer
 * @slot:        The memory slot to split
 *
 * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired,
 * serializing operations for VM memory regions.
 */
static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot)
{
        struct kvm_memory_slot *memslot;
        phys_addr_t ipa_start, ipa_end;

        lockdep_assert_held(&kvm->slots_lock);

        memslot = id_to_memslot(kvm_memslots(kvm), slot);

        /* IPA range covered by the memslot */
        ipa_start = memslot->base_gfn << PAGE_SHIFT;
        ipa_end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;

        write_lock(&kvm->mmu_lock);
        kvm_mmu_split_huge_pages(kvm, ipa_start, ipa_end);
        write_unlock(&kvm->mmu_lock);
}

/*
 * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages.
 * @kvm:        The KVM pointer
 * @slot:        The memory slot associated with mask
 * @gfn_offset:        The gfn offset in memory slot
 * @mask:        The mask of pages at offset 'gfn_offset' in this memory
 *                slot to enable dirty logging on
 *
 * Write-protects the range spanning the lowest to the highest page set
 * in @mask to enable dirty logging, and then, if required, splits that
 * range to PAGE_SIZE. Caller must hold kvm->mmu_lock for writing.
 */
void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
                struct kvm_memory_slot *slot,
                gfn_t gfn_offset, unsigned long mask)
{
        phys_addr_t first_gfn = slot->base_gfn + gfn_offset;
        phys_addr_t ipa_start = (first_gfn + __ffs(mask)) << PAGE_SHIFT;
        phys_addr_t ipa_end = (first_gfn + __fls(mask) + 1) << PAGE_SHIFT;

        lockdep_assert_held_write(&kvm->mmu_lock);

        kvm_stage2_wp_range(&kvm->arch.mmu, ipa_start, ipa_end);

        /*
         * Eager-splitting is done when manual-protect is set.  We
         * also check for initially-all-set because we can avoid
         * eager-splitting if initially-all-set is false.
         * Initially-all-set equal false implies that huge-pages were
         * already split when enabling dirty logging: no need to do it
         * again.
         */
        if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                kvm_mmu_split_huge_pages(kvm, ipa_start, ipa_end);

        kvm_nested_s2_wp(kvm);
}

static void kvm_send_hwpoison_signal(unsigned long address, short lsb)
{
        send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}

/*
 * Tell whether the block of @map_size bytes containing @hva may be mapped
 * at stage-2 with a single entry of that size, given the layout of
 * @memslot.
 */
static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot,
                                               unsigned long hva,
                                               unsigned long map_size)
{
        unsigned long offset_mask = map_size - 1;
        hva_t uaddr_start, uaddr_end, block_start;
        gpa_t gpa_start;

        /* The memslot and the VMA are guaranteed to be aligned to PAGE_SIZE */
        if (map_size == PAGE_SIZE)
                return true;

        /* pKVM only supports PMD_SIZE huge-mappings */
        if (is_protected_kvm_enabled() && map_size != PMD_SIZE)
                return false;

        gpa_start = memslot->base_gfn << PAGE_SHIFT;
        uaddr_start = memslot->userspace_addr;
        uaddr_end = uaddr_start + memslot->npages * PAGE_SIZE;

        /*
         * Pages belonging to memslots that don't have the same alignment
         * within a PMD/PUD for userspace and IPA cannot be mapped with stage-2
         * PMD/PUD entries, because we'll end up mapping the wrong pages.
         *
         * Consider a layout like the following:
         *
         *    memslot->userspace_addr:
         *    +-----+--------------------+--------------------+---+
         *    |abcde|fgh  Stage-1 block  |    Stage-1 block tv|xyz|
         *    +-----+--------------------+--------------------+---+
         *
         *    memslot->base_gfn << PAGE_SHIFT:
         *      +---+--------------------+--------------------+-----+
         *      |abc|def  Stage-2 block  |    Stage-2 block   |tvxyz|
         *      +---+--------------------+--------------------+-----+
         *
         * If we create those stage-2 blocks, we'll end up with this incorrect
         * mapping:
         *   d -> f
         *   e -> g
         *   f -> h
         */
        if ((gpa_start & offset_mask) != (uaddr_start & offset_mask))
                return false;

        /*
         * Next, let's make sure we're not trying to map anything not covered
         * by the memslot. This means we have to prohibit block size mappings
         * for the beginning and end of a non-block aligned and non-block sized
         * memory slot (illustrated by the head and tail parts of the
         * userspace view above containing pages 'abcde' and 'xyz',
         * respectively).
         *
         * Note that it doesn't matter if we do the check using the
         * userspace_addr or the base_gfn, as both are equally aligned (per
         * the check above) and equally sized.
         */
        block_start = hva & ~offset_mask;

        return block_start >= uaddr_start &&
               block_start + map_size <= uaddr_end;
}

/*
 * Check whether @hva is backed by a transparent huge page (THP) that can
 * be mapped with a stage-2 block mapping. If so, round the stage-2 PFN
 * (*@pfnp) and IPA (*@ipap) down to the PMD block. Only PMD_SIZE THPs are
 * currently supported; this needs updating to support other THP sizes.
 *
 * Returns the size of the mapping to use (PMD_SIZE or PAGE_SIZE), or a
 * negative error code from get_user_mapping_size().
 */
static long
transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot,
                            unsigned long hva, kvm_pfn_t *pfnp,
                            phys_addr_t *ipap)
{
        int sz;

        /*
         * Only adjust when the HVA and IPA are sufficiently aligned and the
         * block is contained within the memslot. Otherwise fall back to a
         * page mapping.
         */
        if (!fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
                return PAGE_SIZE;

        /* ... and only when userspace really maps the range with a block */
        sz = get_user_mapping_size(kvm, hva);
        if (sz < 0)
                return sz;

        if (sz < PMD_SIZE)
                return PAGE_SIZE;

        *ipap &= PMD_MASK;
        *pfnp &= ~(PTRS_PER_PMD - 1);

        return PMD_SIZE;
}

/*
 * Return the shift of the largest mapping size that @vma can provide
 * for @hva.
 *
 * VMAs without VM_PFNMAP report the hugetlb page shift when they are
 * hugetlb VMAs and PAGE_SHIFT otherwise. For VM_PFNMAP VMAs, a PUD- or
 * PMD-sized block is reported only if @hva and the physical address
 * derived from vm_pgoff share the same offset within the block and the
 * block lies within the VMA.
 */
static int get_vma_page_shift(struct vm_area_struct *vma, unsigned long hva)
{
        unsigned long pa;

        if (!(vma->vm_flags & VM_PFNMAP)) {
                if (is_vm_hugetlb_page(vma))
                        return huge_page_shift(hstate_vma(vma));

                return PAGE_SHIFT;
        }

        VM_BUG_ON(is_vm_hugetlb_page(vma));

        pa = (vma->vm_pgoff << PAGE_SHIFT) + (hva - vma->vm_start);

#ifndef __PAGETABLE_PMD_FOLDED
        if ((hva & (PUD_SIZE - 1)) == (pa & (PUD_SIZE - 1)) &&
            ALIGN_DOWN(hva, PUD_SIZE) >= vma->vm_start &&
            ALIGN(hva, PUD_SIZE) <= vma->vm_end)
                return PUD_SHIFT;
#endif

        if ((hva & (PMD_SIZE - 1)) == (pa & (PMD_SIZE - 1)) &&
            ALIGN_DOWN(hva, PMD_SIZE) >= vma->vm_start &&
            ALIGN(hva, PMD_SIZE) <= vma->vm_end)
                return PMD_SHIFT;

        return PAGE_SHIFT;
}

/*
 * The page will be mapped in stage 2 as Normal Cacheable, so the VM will be
 * able to see the page's tags and therefore they must be initialised first. If
 * PG_mte_tagged is set, tags have already been initialised.
 *
 * The race in the test/set of the PG_mte_tagged flag is handled by:
 * - preventing VM_SHARED mappings in a memslot with MTE preventing two VMs
 *   racing to sanitise the same page
 * - mmap_lock protects between a VM faulting a page in and the VMM performing
 *   an mprotect() to add VM_MTE
 *
 * Does nothing when the VM does not have MTE.
 */
static void sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
                              unsigned long size)
{
        unsigned long idx, nr_pages = size >> PAGE_SHIFT;
        struct page *page = pfn_to_page(pfn);
        struct folio *folio = page_folio(page);

        if (!kvm_has_mte(kvm))
                return;

        if (folio_test_hugetlb(folio)) {
                /* Hugetlb has MTE flags set on head page only */
                if (!folio_try_hugetlb_mte_tagging(folio))
                        return;

                for (idx = 0; idx < nr_pages; idx++, page++)
                        mte_clear_page_tags(page_address(page));
                folio_set_hugetlb_mte_tagged(folio);
                return;
        }

        /* Non-hugetlb: the tagged state is handled page by page */
        for (idx = 0; idx < nr_pages; idx++, page++) {
                if (!try_page_mte_tagging(page))
                        continue;

                mte_clear_page_tags(page_address(page));
                set_page_mte_tagged(page);
        }
}

/* Report whether @vma has the VM_MTE_ALLOWED flag set. */
static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_MTE_ALLOWED) != 0;
}

/*
 * Handle a stage-2 fault on memory backed by @memslot.
 *
 * @vcpu:          the faulting vCPU
 * @fault_ipa:     the IPA that faulted
 * @nested:        the guest's own stage-2 translation when a shadow
 *                 stage-2 entry is being created, NULL otherwise
 * @memslot:       the memslot covering the fault
 * @hva:           the host userspace address backing the faulting IPA
 * @fault_is_perm: true if this is a permission fault
 *
 * Looks up the VMA to pick the mapping size, faults the page in, and then
 * either relaxes the permissions of the existing stage-2 entry or installs
 * a new mapping.
 *
 * Returns 0 on success, and also when the fault merely needs to be
 * replayed (-EAGAIN is turned into 0) or a hwpoison signal was sent;
 * a negative error code otherwise.
 */
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
                          struct kvm_s2_trans *nested,
                          struct kvm_memory_slot *memslot, unsigned long hva,
                          bool fault_is_perm)
{
        int ret = 0;
        bool write_fault, writable, force_pte = false;
        bool exec_fault, mte_allowed;
        bool device = false, vfio_allow_any_uc = false;
        unsigned long mmu_seq;
        phys_addr_t ipa = fault_ipa;
        struct kvm *kvm = vcpu->kvm;
        struct vm_area_struct *vma;
        short vma_shift;
        void *memcache;
        gfn_t gfn;
        kvm_pfn_t pfn;
        bool logging_active = memslot_is_logging(memslot);
        long vma_pagesize, fault_granule;
        enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
        struct kvm_pgtable *pgt;
        struct page *page;
        enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;

        /* fault_granule is only meaningful (and only used) for permission faults */
        if (fault_is_perm)
                fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
        write_fault = kvm_is_write_fault(vcpu);
        exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
        VM_BUG_ON(write_fault && exec_fault);

        if (fault_is_perm && !write_fault && !exec_fault) {
                kvm_err("Unexpected L2 read permission error\n");
                return -EFAULT;
        }

        /*
         * Permission faults just need to update the existing leaf entry,
         * and so normally don't require allocations from the memcache. The
         * only exception to this is when dirty logging is enabled at runtime
         * and a write fault needs to collapse a block entry into a table.
         */
        if (!fault_is_perm || (logging_active && write_fault)) {
                int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);

                if (!is_protected_kvm_enabled()) {
                        memcache = &vcpu->arch.mmu_page_cache;
                        ret = kvm_mmu_topup_memory_cache(memcache, min_pages);
                } else {
                        memcache = &vcpu->arch.pkvm_memcache;
                        ret = topup_hyp_memcache(memcache, min_pages);
                }
                if (ret)
                        return ret;
        }

        /*
         * Let's check if we will get back a huge page backed by hugetlbfs, or
         * get block mapping for device MMIO region.
         */
        mmap_read_lock(current->mm);
        vma = vma_lookup(current->mm, hva);
        if (unlikely(!vma)) {
                kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
                mmap_read_unlock(current->mm);
                return -EFAULT;
        }

        /*
         * logging_active is guaranteed to never be true for VM_PFNMAP
         * memslots.
         */
        if (logging_active) {
                force_pte = true;
                vma_shift = PAGE_SHIFT;
        } else {
                vma_shift = get_vma_page_shift(vma, hva);
        }

        /*
         * Degrade the mapping size step by step until the memslot layout
         * supports it.
         */
        switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
        case PUD_SHIFT:
                if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
                        break;
                fallthrough;
#endif
        case CONT_PMD_SHIFT:
                vma_shift = PMD_SHIFT;
                fallthrough;
        case PMD_SHIFT:
                if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
                        break;
                fallthrough;
        case CONT_PTE_SHIFT:
                vma_shift = PAGE_SHIFT;
                force_pte = true;
                fallthrough;
        case PAGE_SHIFT:
                break;
        default:
                WARN_ONCE(1, "Unknown vma_shift %d", vma_shift);
        }

        vma_pagesize = 1UL << vma_shift;

        if (nested) {
                unsigned long max_map_size;

                max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;

                ipa = kvm_s2_trans_output(nested);

                /*
                 * If we're about to create a shadow stage 2 entry, then we
                 * can only create a block mapping if the guest stage 2 page
                 * table uses at least as big a mapping.
                 */
                max_map_size = min(kvm_s2_trans_size(nested), max_map_size);

                /*
                 * Be careful that if the mapping size falls between
                 * two host sizes, take the smallest of the two.
                 */
                if (max_map_size >= PMD_SIZE && max_map_size < PUD_SIZE)
                        max_map_size = PMD_SIZE;
                else if (max_map_size >= PAGE_SIZE && max_map_size < PMD_SIZE)
                        max_map_size = PAGE_SIZE;

                force_pte = (max_map_size == PAGE_SIZE);
                vma_pagesize = min(vma_pagesize, (long)max_map_size);
        }

        /*
         * Both the canonical IPA and fault IPA must be hugepage-aligned to
         * ensure we find the right PFN and lay down the mapping in the right
         * place.
         */
        if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
                fault_ipa &= ~(vma_pagesize - 1);
                ipa &= ~(vma_pagesize - 1);
        }

        gfn = ipa >> PAGE_SHIFT;
        mte_allowed = kvm_vma_mte_allowed(vma);

        vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

        /* Don't use the VMA after the unlock -- it may have vanished */
        vma = NULL;

        /*
         * Read mmu_invalidate_seq so that KVM can detect if the results of
         * vma_lookup() or __kvm_faultin_pfn() become stale prior to
         * acquiring kvm->mmu_lock.
         *
         * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
         * with the smp_wmb() in kvm_mmu_invalidate_end().
         */
        mmu_seq = vcpu->kvm->mmu_invalidate_seq;
        mmap_read_unlock(current->mm);

        pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
                                &writable, &page);
        if (pfn == KVM_PFN_ERR_HWPOISON) {
                kvm_send_hwpoison_signal(hva, vma_shift);
                return 0;
        }
        if (is_error_noslot_pfn(pfn))
                return -EFAULT;

        if (kvm_is_device_pfn(pfn)) {
                /*
                 * If the page was identified as device early by looking at
                 * the VMA flags, vma_pagesize is already representing the
                 * largest quantity we can map.  If instead it was mapped
                 * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
                 * and must not be upgraded.
                 *
                 * In both cases, we don't let transparent_hugepage_adjust()
                 * change things at the last minute.
                 */
                device = true;
        } else if (logging_active && !write_fault) {
                /*
                 * Only actually map the page as writable if this was a write
                 * fault.
                 */
                writable = false;
        }

        if (exec_fault && device) {
                /*
                 * The fault-in above succeeded and may have handed us a
                 * page that has to be released, as every later exit does
                 * through out_unlock. Release it as unused here, since
                 * nothing is going to be mapped, instead of leaking it.
                 */
                kvm_release_faultin_page(kvm, page, true, false);
                return -ENOEXEC;
        }

        /*
         * Potentially reduce shadow S2 permissions to match the guest's own
         * S2. For exec faults, we'd only reach this point if the guest
         * actually allowed it (see kvm_s2_handle_perm_fault).
         *
         * Also encode the level of the original translation in the SW bits
         * of the leaf entry as a proxy for the span of that translation.
         * This will be retrieved on TLB invalidation from the guest and
         * used to limit the invalidation scope if a TTL hint or a range
         * isn't provided.
         */
        if (nested) {
                writable &= kvm_s2_trans_writable(nested);
                if (!kvm_s2_trans_readable(nested))
                        prot &= ~KVM_PGTABLE_PROT_R;

                prot |= kvm_encode_nested_level(nested);
        }

        kvm_fault_lock(kvm);
        pgt = vcpu->arch.hw_mmu->pgt;
        if (mmu_invalidate_retry(kvm, mmu_seq)) {
                ret = -EAGAIN;
                goto out_unlock;
        }

        /*
         * If we are not forced to use page mapping, check if we are
         * backed by a THP and thus use block mapping if possible.
         */
        if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
                if (fault_is_perm && fault_granule > PAGE_SIZE)
                        vma_pagesize = fault_granule;
                else
                        vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
                                                                   hva, &pfn,
                                                                   &fault_ipa);

                if (vma_pagesize < 0) {
                        ret = vma_pagesize;
                        goto out_unlock;
                }
        }

        if (!fault_is_perm && !device && kvm_has_mte(kvm)) {
                /* Check the VMM hasn't introduced a new disallowed VMA */
                if (mte_allowed) {
                        sanitise_mte_tags(kvm, pfn, vma_pagesize);
                } else {
                        ret = -EFAULT;
                        goto out_unlock;
                }
        }

        if (writable)
                prot |= KVM_PGTABLE_PROT_W;

        if (exec_fault)
                prot |= KVM_PGTABLE_PROT_X;

        if (device) {
                if (vfio_allow_any_uc)
                        prot |= KVM_PGTABLE_PROT_NORMAL_NC;
                else
                        prot |= KVM_PGTABLE_PROT_DEVICE;
        } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
                   (!nested || kvm_s2_trans_executable(nested))) {
                prot |= KVM_PGTABLE_PROT_X;
        }

        /*
         * Under the premise of getting a FSC_PERM fault, we just need to relax
         * permissions only if vma_pagesize equals fault_granule. Otherwise,
         * kvm_pgtable_stage2_map() should be called to change block size.
         */
        if (fault_is_perm && vma_pagesize == fault_granule) {
                /*
                 * Drop the SW bits in favour of those stored in the
                 * PTE, which will be preserved.
                 */
                prot &= ~KVM_NV_GUEST_MAP_SZ;
                ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
        } else {
                ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
                                             __pfn_to_phys(pfn), prot,
                                             memcache, flags);
        }

out_unlock:
        /* On failure the page is released as unused */
        kvm_release_faultin_page(kvm, page, !!ret, writable);
        kvm_fault_unlock(kvm);

        /* Mark the page dirty only if the fault is handled successfully */
        if (writable && !ret)
                mark_page_dirty_in_slot(kvm, memslot, gfn);

        /* -EAGAIN means "replay the fault": not an error for the caller */
        return ret != -EAGAIN ? ret : 0;
}

/*
 * Resolve an access fault at @fault_ipa by making the page young again in
 * the vCPU's current stage-2 page table. kvm->mmu_lock is held for reading
 * around the page-table update.
 */
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
        enum kvm_pgtable_walk_flags walk_flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
        struct kvm *kvm = vcpu->kvm;
        struct kvm_pgtable *pgt;

        trace_kvm_access_fault(fault_ipa);

        read_lock(&kvm->mmu_lock);
        /* hw_mmu is only sampled once the lock is held */
        pgt = vcpu->arch.hw_mmu->pgt;
        KVM_PGT_FN(kvm_pgtable_stage2_mkyoung)(pgt, fault_ipa, walk_flags);
        read_unlock(&kvm->mmu_lock);
}

/**
 * kvm_handle_guest_abort - handles all 2nd stage aborts
 * @vcpu:        the VCPU pointer
 *
 * Any abort that gets to the host is almost guaranteed to be caused by a
 * missing second stage translation table entry, which can mean that either the
 * guest simply needs more memory and we must allocate an appropriate page or it
 * can mean that the guest tried to access I/O memory, which is emulated by user
 * space. The distinction is based on the IPA causing the fault and whether this
 * memory region has been registered as standard RAM by user space.
 *
 * Return: 1 when the abort was dealt with here (external abort, exception
 * injected into the guest, skipped cache maintenance instruction, access flag
 * fault, or a successful user_mem_abort()); -EFAULT for an invalid fault IPA
 * or an unsupported fault status code; otherwise the value produced by
 * kvm_walk_nested_s2(), kvm_s2_handle_perm_fault(), io_mem_abort() or
 * user_mem_abort() is passed through unchanged. (1 presumably means "resume
 * the guest" as for other exit handlers -- confirm against the caller.)
 */
int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
{
        struct kvm_s2_trans nested_trans, *nested = NULL;
        unsigned long esr;
        phys_addr_t fault_ipa; /* The address we faulted on */
        phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
        struct kvm_memory_slot *memslot;
        unsigned long hva;
        bool is_iabt, write_fault, writable;
        gfn_t gfn;
        int ret, idx;

        /* Synchronous External Abort? */
        if (kvm_vcpu_abt_issea(vcpu)) {
                /*
                 * For RAS the host kernel may handle this abort.
                 * There is no need to pass the error into the guest.
                 */
                if (kvm_handle_guest_sea())
                        kvm_inject_vabt(vcpu);

                return 1;
        }

        esr = kvm_vcpu_get_esr(vcpu);

        /*
         * The fault IPA should be reliable at this point as we're not dealing
         * with an SEA.
         */
        ipa = fault_ipa = kvm_vcpu_get_fault_ipa(vcpu);
        if (KVM_BUG_ON(ipa == INVALID_GPA, vcpu->kvm))
                return -EFAULT;

        is_iabt = kvm_vcpu_trap_is_iabt(vcpu);

        if (esr_fsc_is_translation_fault(esr)) {
                /* Beyond sanitised PARange (which is the IPA limit) */
                if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) {
                        kvm_inject_size_fault(vcpu);
                        return 1;
                }

                /* Falls between the IPA range and the PARange? */
                if (fault_ipa >= BIT_ULL(VTCR_EL2_IPA(vcpu->arch.hw_mmu->vtcr))) {
                        /* Complete the address with the low 12 bits of HFAR */
                        fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);

                        if (is_iabt)
                                kvm_inject_pabt(vcpu, fault_ipa);
                        else
                                kvm_inject_dabt(vcpu, fault_ipa);
                        return 1;
                }
        }

        trace_kvm_guest_fault(*vcpu_pc(vcpu), kvm_vcpu_get_esr(vcpu),
                              kvm_vcpu_get_hfar(vcpu), fault_ipa);

        /*
         * Only translation, permission and access flag faults are handled
         * here; anything else is reported and rejected.
         */
        if (!esr_fsc_is_translation_fault(esr) &&
            !esr_fsc_is_permission_fault(esr) &&
            !esr_fsc_is_access_flag_fault(esr)) {
                kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
                        kvm_vcpu_trap_get_class(vcpu),
                        (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
                        (unsigned long)kvm_vcpu_get_esr(vcpu));
                return -EFAULT;
        }

        /* Everything from here on runs inside an SRCU read-side section */
        idx = srcu_read_lock(&vcpu->kvm->srcu);

        /*
         * We may have faulted on a shadow stage 2 page table if we are
         * running a nested guest.  In this case, we have to resolve the L2
         * IPA to the L1 IPA first, before knowing what kind of memory should
         * back the L1 IPA.
         *
         * If the shadow stage 2 page table walk faults, then we simply inject
         * this to the guest and carry on.
         *
         * If there are no shadow S2 PTs because S2 is disabled, there is
         * nothing to walk and we treat it as a 1:1 before going through the
         * canonical translation.
         */
        if (kvm_is_nested_s2_mmu(vcpu->kvm,vcpu->arch.hw_mmu) &&
            vcpu->arch.hw_mmu->nested_stage2_enabled) {
                /* NOTE(review): shadows the function-scope 'esr' above */
                u32 esr;

                ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
                if (ret) {
                        esr = kvm_s2_trans_esr(&nested_trans);
                        kvm_inject_s2_fault(vcpu, esr);
                        goto out_unlock;
                }

                ret = kvm_s2_handle_perm_fault(vcpu, &nested_trans);
                if (ret) {
                        esr = kvm_s2_trans_esr(&nested_trans);
                        kvm_inject_s2_fault(vcpu, esr);
                        goto out_unlock;
                }

                /* From here on, 'ipa' is the output of the nested walk */
                ipa = kvm_s2_trans_output(&nested_trans);
                nested = &nested_trans;
        }

        gfn = ipa >> PAGE_SHIFT;
        memslot = gfn_to_memslot(vcpu->kvm, gfn);
        hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
        write_fault = kvm_is_write_fault(vcpu);
        if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
                /*
                 * The guest has put either its instructions or its page-tables
                 * somewhere it shouldn't have. Userspace won't be able to do
                 * anything about this (there's no syndrome for a start), so
                 * re-inject the abort back into the guest.
                 */
                if (is_iabt) {
                        /* Turned into an injected prefetch abort at 'out' */
                        ret = -ENOEXEC;
                        goto out;
                }

                if (kvm_vcpu_abt_iss1tw(vcpu)) {
                        kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                        ret = 1;
                        goto out_unlock;
                }

                /*
                 * Check for a cache maintenance operation. Since we
                 * ended-up here, we know it is outside of any memory
                 * slot. But we can't find out if that is for a device,
                 * or if the guest is just being stupid. The only thing
                 * we know for sure is that this range cannot be cached.
                 *
                 * So let's assume that the guest is just being
                 * cautious, and skip the instruction.
                 */
                if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
                        kvm_incr_pc(vcpu);
                        ret = 1;
                        goto out_unlock;
                }

                /*
                 * The IPA is reported as [MAX:12], so we need to
                 * complement it with the bottom 12 bits from the
                 * faulting VA. This is always 12 bits, irrespective
                 * of the page size.
                 */
                ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
                ret = io_mem_abort(vcpu, ipa);
                goto out_unlock;
        }

        /* Userspace should not be able to register out-of-bounds IPAs */
        VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));

        if (esr_fsc_is_access_flag_fault(esr)) {
                handle_access_fault(vcpu, fault_ipa);
                ret = 1;
                goto out_unlock;
        }

        ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
                             esr_fsc_is_permission_fault(esr));
        /* A successful user_mem_abort() (0) is reported to the caller as 1 */
        if (ret == 0)
                ret = 1;
out:
        /* -ENOEXEC never reaches the caller: inject a prefetch abort instead */
        if (ret == -ENOEXEC) {
                kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
                ret = 1;
        }
out_unlock:
        srcu_read_unlock(&vcpu->kvm->srcu, idx);
        return ret;
}

bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
        if (!kvm->arch.mmu.pgt)
                return false;

        __unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
                             (range->end - range->start) << PAGE_SHIFT,
                             range->may_block);

        kvm_nested_s2_unmap(kvm, range->may_block);
        return false;
}

bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                   range->start << PAGE_SHIFT,
                                                   size, true);
        /*
         * TODO: Handle nested_mmu structures here using the reverse mapping in
         * a later version of patch series.
         */
}

bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
{
        u64 size = (range->end - range->start) << PAGE_SHIFT;

        if (!kvm->arch.mmu.pgt)
                return false;

        return KVM_PGT_FN(kvm_pgtable_stage2_test_clear_young)(kvm->arch.mmu.pgt,
                                                   range->start << PAGE_SHIFT,
                                                   size, false);
}

/* Return the physical address, as computed by __pa(), of the hyp page table's pgd. */
phys_addr_t kvm_mmu_get_httbr(void)
{
        phys_addr_t pgd_phys = __pa(hyp_pgtable->pgd);

        return pgd_phys;
}

/* Return the current value of hyp_idmap_vector. */
phys_addr_t kvm_get_idmap_vector(void)
{
        phys_addr_t vector = hyp_idmap_vector;

        return vector;
}

/*
 * Create an identity (VA == PA) hyp mapping of the range
 * [hyp_idmap_start, hyp_idmap_end) with PAGE_HYP_EXEC protection.
 *
 * Returns 0 on success, or the error from __create_hyp_mappings() after
 * logging it.
 */
static int kvm_map_idmap_text(void)
{
        unsigned long len = hyp_idmap_end - hyp_idmap_start;
        int ret;

        ret = __create_hyp_mappings(hyp_idmap_start, len, hyp_idmap_start,
                                    PAGE_HYP_EXEC);
        if (!ret)
                return 0;

        kvm_err("Failed to idmap %lx-%lx\n", hyp_idmap_start, hyp_idmap_end);
        return ret;
}

/*
 * Page allocator callback for the hyp page table: hand back one zeroed page
 * obtained with GFP_KERNEL. @arg is unused.
 */
static void *kvm_hyp_zalloc_page(void *arg)
{
        unsigned long page = get_zeroed_page(GFP_KERNEL);

        return (void *)page;
}

/*
 * Memory-management callbacks for the hyp page table; passed to
 * kvm_pgtable_hyp_init() by kvm_mmu_init().
 */
static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = {
        .zalloc_page                = kvm_hyp_zalloc_page,
        .get_page                = kvm_host_get_page,
        .put_page                = kvm_host_put_page,
        .phys_to_virt                = kvm_host_va,
        .virt_to_phys                = kvm_host_pa,
};

/**
 * kvm_mmu_init - set up the hyp page table and identity-map the hyp init text
 * @hyp_va_bits: output; set to the larger of IDMAP_VA_BITS and vabits_actual
 *
 * Records the page-aligned physical range of the hyp idmap text and the
 * physical address of __kvm_hyp_init, allocates and initialises hyp_pgtable,
 * and maps the idmap text into it.
 *
 * Return: 0 on success; -EINVAL if the idmap page intersects the hyp VA
 * range; -ENOMEM if hyp_pgtable cannot be allocated; otherwise the error
 * returned by kvm_pgtable_hyp_init() or kvm_map_idmap_text().
 */
int __init kvm_mmu_init(u32 *hyp_va_bits)
{
        int err;
        u32 idmap_bits;
        u32 kernel_bits;

        /* Widen the idmap text range outwards to page boundaries */
        hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start);
        hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE);
        hyp_idmap_end = __pa_symbol(__hyp_idmap_text_end);
        hyp_idmap_end = ALIGN(hyp_idmap_end, PAGE_SIZE);
        hyp_idmap_vector = __pa_symbol(__kvm_hyp_init);

        /*
         * We rely on the linker script to ensure at build time that the HYP
         * init code does not cross a page boundary.
         */
        BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);

        /*
         * The ID map is always configured for 48 bits of translation, which
         * may be fewer than the number of VA bits used by the regular kernel
         * stage 1, when VA_BITS=52.
         *
         * At EL2, there is only one TTBR register, and we can't switch between
         * translation tables *and* update TCR_EL2.T0SZ at the same time. Bottom
         * line: we need to use the extended range with *both* our translation
         * tables.
         *
         * So use the maximum of the idmap VA bits and the regular kernel stage
         * 1 VA bits to assure that the hypervisor can both ID map its code page
         * and map any kernel memory.
         */
        idmap_bits = IDMAP_VA_BITS;
        kernel_bits = vabits_actual;
        *hyp_va_bits = max(idmap_bits, kernel_bits);

        kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits);
        kvm_debug("IDMAP page: %lx\n", hyp_idmap_start);
        kvm_debug("HYP VA range: %lx:%lx\n",
                  kern_hyp_va(PAGE_OFFSET),
                  kern_hyp_va((unsigned long)high_memory - 1));

        if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
            hyp_idmap_start <  kern_hyp_va((unsigned long)high_memory - 1) &&
            hyp_idmap_start != (unsigned long)__hyp_idmap_text_start) {
                /*
                 * The idmap page is intersecting with the VA space,
                 * it is not safe to continue further.
                 */
                kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
                err = -EINVAL;
                goto out;
        }

        hyp_pgtable = kzalloc(sizeof(*hyp_pgtable), GFP_KERNEL);
        if (!hyp_pgtable) {
                kvm_err("Hyp mode page-table not allocated\n");
                err = -ENOMEM;
                goto out;
        }

        err = kvm_pgtable_hyp_init(hyp_pgtable, *hyp_va_bits, &kvm_hyp_mm_ops);
        if (err)
                goto out_free_pgtable;

        err = kvm_map_idmap_text();
        if (err)
                goto out_destroy_pgtable;

        /* Only reached once every setup step above has succeeded */
        io_map_base = hyp_idmap_start;
        __hyp_va_bits = *hyp_va_bits;
        return 0;

        /* Error unwinding, in reverse order of setup */
out_destroy_pgtable:
        kvm_pgtable_hyp_destroy(hyp_pgtable);
out_free_pgtable:
        kfree(hyp_pgtable);
        hyp_pgtable = NULL;
out:
        return err;
}

/*
 * Arch-side work once a memslot change has been committed.
 *
 * When the new slot does not have dirty logging enabled (including when there
 * is no new slot), the eager page splitting cache is freed. When it does,
 * the slot is write-protected and split in one shot, unless the slot is being
 * deleted or kvm_dirty_log_manual_protect_and_init_set() reports that this is
 * done gradually through the CLEAR ioctls.
 */
void kvm_arch_commit_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *old,
                                   const struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES;

        if (!log_dirty_pages) {
                /*
                 * Free any leftovers from the eager page splitting cache. Do
                 * this when deleting, moving, disabling dirty logging, or
                 * creating the memslot (a nop). Doing it for deletes makes
                 * sure we don't leak memory, and there's no need to keep the
                 * cache around for any of the other cases.
                 */
                kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
                return;
        }

        /*
         * At this point the memslot has been committed and there is an
         * allocated dirty_bitmap[]; dirty pages will be tracked while the
         * memory slot is write protected.
         */
        if (change == KVM_MR_DELETE)
                return;

        /*
         * Huge and normal pages are write-protected and split in one of two
         * ways. With initial-all-set this happens gradually, driven by the
         * CLEAR ioctls, so there is nothing to do now.
         */
        if (kvm_dirty_log_manual_protect_and_init_set(kvm))
                return;

        /*
         * Without initial-all-set it all happens in one shot when dirty
         * logging is enabled.
         */
        kvm_mmu_wp_memory_region(kvm, new->id);
        kvm_mmu_split_memory_region(kvm, new->id);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   const struct kvm_memory_slot *old,
                                   struct kvm_memory_slot *new,
                                   enum kvm_mr_change change)
{
        hva_t hva, reg_end;
        int ret = 0;

        if (change != KVM_MR_CREATE && change != KVM_MR_MOVE &&
                        change != KVM_MR_FLAGS_ONLY)
                return 0;

        /*
         * Prevent userspace from creating a memory region outside of the IPA
         * space addressable by the KVM guest IPA space.
         */
        if ((new->base_gfn + new->npages) > (kvm_phys_size(&kvm->arch.mmu) >> PAGE_SHIFT))
                return -EFAULT;

        hva = new->userspace_addr;
        reg_end = hva + (new->npages << PAGE_SHIFT);

        mmap_read_lock(current->mm);
        /*
         * A memory region could potentially cover multiple VMAs, and any holes
         * between them, so iterate over all of them.
         *
         *     +--------------------------------------------+
         * +---------------+----------------+   +----------------+
         * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
         * +---------------+----------------+   +----------------+
         *     |               memory region                |
         *     +--------------------------------------------+
         */
        do {
                struct vm_area_struct *vma;

                vma = find_vma_intersection(current->mm, hva, reg_end);
                if (!vma)
                        break;

                if (kvm_has_mte(kvm) && !kvm_vma_mte_allowed(vma)) {
                        ret = -EINVAL;
                        break;
                }

                if (vma->vm_flags & VM_PFNMAP) {
                        /* IO region dirty page logging not allowed */
                        if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
                                ret = -EINVAL;
                                break;
                        }
                }
                hva = min(reg_end, vma->vm_end);
        } while (hva < reg_end);

        mmap_read_unlock(current->mm);
        return ret;
}

/* Memslot free hook (going by its name); this implementation has an empty body. */
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
}

/* Memslots-updated hook (going by its name); this implementation has an empty body. */
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
}

/*
 * Unmap the whole IPA range covered by @slot from the VM's stage-2 page
 * table and call kvm_nested_s2_unmap(), both under the MMU write lock.
 */
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
                                   struct kvm_memory_slot *slot)
{
        gpa_t base;
        phys_addr_t nbytes;

        base = slot->base_gfn << PAGE_SHIFT;
        nbytes = slot->npages << PAGE_SHIFT;

        write_lock(&kvm->mmu_lock);
        kvm_stage2_unmap_range(&kvm->arch.mmu, base, nbytes, true);
        kvm_nested_s2_unmap(kvm, true);
        write_unlock(&kvm->mmu_lock);
}

/*
 * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized).
 *
 * Main problems:
 * - S/W ops are local to a CPU (not broadcast)
 * - We have line migration behind our back (speculation)
 * - System caches don't support S/W at all (damn!)
 *
 * In the face of the above, the best we can do is to try and convert
 * S/W ops to VA ops. Because the guest is not allowed to infer the
 * S/W to PA mapping, it can only use S/W to nuke the whole cache,
 * which is a rather good thing for us.
 *
 * Also, it is only used when turning caches on/off ("The expected
 * usage of the cache maintenance instructions that operate by set/way
 * is associated with the cache maintenance instructions associated
 * with the powerdown and powerup of caches, if this is required by
 * the implementation.").
 *
 * We use the following policy:
 *
 * - If we trap a S/W operation, we enable VM trapping to detect
 *   caches being turned on/off, and do a full clean.
 *
 * - We flush the caches on both caches being turned on and off.
 *
 * - Once the caches are enabled, we stop trapping VM ops.
 */
void kvm_set_way_flush(struct kvm_vcpu *vcpu)
{
        unsigned long hcr = *vcpu_hcr(vcpu);

        /*
         * HCR_TVM already set: this is not the first S/W operation. Rely on
         * the VM trapping to wait for the MMU + caches to be turned off; at
         * that point the caches can be cleaned again.
         */
        if (hcr & HCR_TVM)
                return;

        /*
         * First S/W operation (HCR_TVM not set): flush the whole memory and
         * turn on VM trapping.
         */
        trace_kvm_set_way_flush(*vcpu_pc(vcpu), vcpu_has_cache_enabled(vcpu));
        stage2_flush_vm(vcpu->kvm);
        *vcpu_hcr(vcpu) = hcr | HCR_TVM;
}

/*
 * React to a possible change of the vCPU's cache enable state.
 * @was_enabled is the state before the change; the current state is read
 * with vcpu_has_cache_enabled().
 */
void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled)
{
        const bool now_enabled = vcpu_has_cache_enabled(vcpu);
        const bool state_changed = was_enabled != now_enabled;

        /*
         * Turning the MMU+caches on calls for an invalidate, turning them off
         * calls for a clean. Clean + invalidate covers both directions.
         */
        if (state_changed)
                stage2_flush_vm(vcpu->kvm);

        /* With caches on, VM ops no longer need trapping (until a S/W op) */
        if (now_enabled)
                *vcpu_hcr(vcpu) &= ~HCR_TVM;

        trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled);
}




















































   57 

   58 




































   23 

   23 



























































































































































   58 




   31 




   31 
   31 













































































   38 


   13 


   13 




































































   24 

   23 




   24 









































   27 




   27 














   27 
   27 
   27 









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

/*
 * Iterator step: copy @len bytes from the kernel buffer @from, offset by
 * @progress, to the user address @iter_to.
 *
 * Returns the number of bytes NOT copied. That is all of @len when a usercopy
 * failure is being simulated or when access_ok() rejects the destination.
 * @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        const void *src;

        if (should_fail_usercopy() || !access_ok(iter_to, len))
                return len;

        src = from + progress;
        instrument_copy_to_user(iter_to, src, len);
        return raw_copy_to_user(iter_to, src, len);
}

/*
 * Iterator step: like copy_to_user_iter(), but goes through
 * copy_to_user_nofault().
 *
 * Returns @len when a usercopy failure is being simulated or when
 * copy_to_user_nofault() reports a negative error; otherwise returns that
 * helper's non-negative result. @priv2 is unused.
 */
static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
                                 size_t len, void *from, void *priv2)
{
        ssize_t rc;

        if (should_fail_usercopy())
                return len;

        rc = copy_to_user_nofault(iter_to, from + progress, len);
        if (rc < 0)
                return len;

        return rc;
}

/*
 * Step function for user-backed sources: copy @len bytes from @iter_from
 * into the kernel buffer @to (offset by @progress).
 *
 * Returns @len when nothing was attempted (injected failure or failed
 * access_ok() check), otherwise whatever raw_copy_from_user() returns.
 */
static __always_inline
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
                           size_t len, void *to, void *priv2)
{
        void *dst = to + progress;
        size_t left;

        if (should_fail_usercopy() || !access_ok(iter_from, len))
                return len;

        instrument_copy_from_user_before(dst, iter_from, len);
        left = raw_copy_from_user(dst, iter_from, len);
        instrument_copy_from_user_after(dst, iter_from, len, left);
        return left;
}

/*
 * Step function for kernel-memory destinations: copy @len bytes from
 * @from + @progress to @iter_to.  Always returns 0 (nothing left over).
 */
static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
                      size_t len, void *from, void *priv2)
{
        const void *src = from + progress;

        memcpy(iter_to, src, len);
        return 0;
}

/*
 * Step function for kernel-memory sources: copy @len bytes from @iter_from
 * to @to + @progress.  Always returns 0 (nothing left over).
 */
static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
                        size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy(dst, iter_from, len);
        return 0;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size.  For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
 */
size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
{
        if (iter_is_ubuf(i)) {
                size_t want = min(size, iov_iter_count(i));
                size_t failed = fault_in_readable(i->ubuf + i->iov_offset, want);

                return size - (want - failed);
        }

        if (iter_is_iovec(i)) {
                size_t left = min(size, iov_iter_count(i));
                /* Part of @size that lies beyond the iterator's contents. */
                size_t beyond = size - left;
                size_t skip = i->iov_offset;
                const struct iovec *seg = iter_iov(i);

                while (left) {
                        size_t len = min(left, seg->iov_len - skip);

                        /* Empty segments are simply stepped over. */
                        if (likely(len)) {
                                size_t failed;

                                failed = fault_in_readable(seg->iov_base + skip, len);
                                left -= len - failed;
                                if (failed)
                                        break;
                        }
                        seg++;
                        skip = 0;
                }
                return left + beyond;
        }

        return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_readable);

/*
 * fault_in_iov_iter_writeable - fault in iov iterator for writing
 * @i: iterator
 * @size: maximum length
 *
 * Faults in the iterator using get_user_pages(), i.e., without triggering
 * hardware page faults.  This is primarily useful when we already know that
 * some or all of the pages in @i aren't in memory.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 *
 * Always returns 0 for non-user-space iterators.
 */
size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
{
        if (iter_is_ubuf(i)) {
                size_t want = min(size, iov_iter_count(i));
                size_t failed = fault_in_safe_writeable(i->ubuf + i->iov_offset, want);

                return size - (want - failed);
        }

        if (iter_is_iovec(i)) {
                size_t left = min(size, iov_iter_count(i));
                /* Part of @size that lies beyond the iterator's contents. */
                size_t beyond = size - left;
                size_t skip = i->iov_offset;
                const struct iovec *seg = iter_iov(i);

                while (left) {
                        size_t len = min(left, seg->iov_len - skip);

                        /* Empty segments are simply stepped over. */
                        if (likely(len)) {
                                size_t failed;

                                failed = fault_in_safe_writeable(seg->iov_base + skip, len);
                                left -= len - failed;
                                if (failed)
                                        break;
                        }
                        seg++;
                        skip = 0;
                }
                return left + beyond;
        }

        return 0;
}
EXPORT_SYMBOL(fault_in_iov_iter_writeable);

/*
 * Initialise @i as an ITER_IOVEC iterator over the @nr_segs entries of
 * @iov, covering @count bytes and positioned at offset 0.
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
                        const struct iovec *iov, unsigned long nr_segs,
                        size_t count)
{
        /* Only the READ and WRITE bits are expected in @direction. */
        WARN_ON(direction & ~(READ | WRITE));

        *i = (struct iov_iter) {
                .iter_type = ITER_IOVEC,
                .data_source = direction,
                .nofault = false,
                .iov_offset = 0,
                .count = count,
                .__iov = iov,
                .nr_segs = nr_segs,
        };
}
EXPORT_SYMBOL(iov_iter_init);

/*
 * Copy @bytes bytes from the kernel buffer @addr into the iterator @i,
 * advancing it.  Returns what iterate_and_advance() reports, or 0 (with a
 * one-time warning) if @i is flagged as a data source.
 */
size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        void *src = (void *)addr;

        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* User-backed destinations may take page faults. */
        if (user_backed_iter(i))
                might_fault();

        return iterate_and_advance(i, bytes, src,
                                   copy_to_user_iter, memcpy_to_iter);
}
EXPORT_SYMBOL(_copy_to_iter);

#ifdef CONFIG_ARCH_HAS_COPY_MC
/*
 * Step function for user-backed destinations using copy_mc_to_user().
 * Returns @len untouched when access_ok() rejects the destination range.
 */
static __always_inline
size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress,
                            size_t len, void *from, void *priv2)
{
        void *src;

        if (!access_ok(iter_to, len))
                return len;

        src = from + progress;
        instrument_copy_to_user(iter_to, src, len);
        return copy_mc_to_user(iter_to, src, len);
}

/*
 * Step function for kernel-memory destinations using copy_mc_to_kernel();
 * its return value is passed straight back to the caller.
 */
static __always_inline
size_t memcpy_to_iter_mc(void *iter_to, size_t progress,
                         size_t len, void *from, void *priv2)
{
        void *src = from + progress;

        return copy_mc_to_kernel(iter_to, src, len);
}

/**
 * _copy_mc_to_iter - copy to iter with source memory error exception handling
 * @addr: source kernel address
 * @bytes: total transfer length
 * @i: destination iterator
 *
 * The pmem driver deploys this for the dax operation
 * (dax_copy_to_iter()) for dax reads (bypass page-cache and the
 * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes
 * successfully copied.
 *
 * The main differences between this and the typical _copy_to_iter() are:
 *
 * * Typical tail/residue handling after a fault retries the copy
 *   byte-by-byte until the fault happens again. Re-triggering machine
 *   checks is potentially fatal so the implementation uses source
 *   alignment and poison alignment assumptions to avoid re-triggering
 *   hardware exceptions.
 *
 * * ITER_KVEC and ITER_BVEC can return short copies.  Compare to
 *   copy_to_iter() where only ITER_IOVEC attempts might return a short copy.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
        void *src = (void *)addr;

        /* The iterator must be a destination, not a data source. */
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        if (user_backed_iter(i))
                might_fault();

        return iterate_and_advance(i, bytes, src,
                                   copy_to_user_iter_mc, memcpy_to_iter_mc);
}
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */

/*
 * Copy @bytes bytes out of iterator @i into the kernel buffer @addr via
 * iterate_and_advance(), with no direction check and no might_fault().
 */
static __always_inline
size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        size_t done;

        done = iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter, memcpy_from_iter);
        return done;
}

/*
 * Copy @bytes bytes from iterator @i into the kernel buffer @addr.
 * Returns 0 (with a one-time warning) if @i is not flagged as a data
 * source; otherwise returns the result of __copy_from_iter().
 */
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
        const bool is_source = i->data_source;

        if (WARN_ON_ONCE(!is_source))
                return 0;

        /* User-backed sources may take page faults. */
        if (user_backed_iter(i))
                might_fault();

        return __copy_from_iter(addr, bytes, i);
}
EXPORT_SYMBOL(_copy_from_iter);

/*
 * Step function for user-backed sources that forwards to
 * __copy_from_user_inatomic_nocache() and returns its result.
 */
static __always_inline
size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_inatomic_nocache(dst, iter_from, len);
}

/*
 * Like _copy_from_iter(), but user-backed segments go through
 * copy_from_user_iter_nocache(); other segments use memcpy_from_iter().
 * Returns 0 (with a one-time warning) if @i is not a data source.
 */
size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
        const bool is_source = i->data_source;

        if (WARN_ON_ONCE(!is_source))
                return 0;

        return iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter_nocache,
                                   memcpy_from_iter);
}
EXPORT_SYMBOL(_copy_from_iter_nocache);

#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/*
 * Step function for user-backed sources that forwards to
 * __copy_from_user_flushcache() and returns its result.
 */
static __always_inline
size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress,
                                      size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        return __copy_from_user_flushcache(dst, iter_from, len);
}

/*
 * Step function for kernel-memory sources that copies with
 * memcpy_flushcache().  Always returns 0 (nothing left over).
 */
static __always_inline
size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress,
                                   size_t len, void *to, void *priv2)
{
        void *dst = to + progress;

        memcpy_flushcache(dst, iter_from, len);
        return 0;
}

/**
 * _copy_from_iter_flushcache - write destination through cpu cache
 * @addr: destination kernel address
 * @bytes: total transfer length
 * @i: source iterator
 *
 * The pmem driver arranges for filesystem-dax to use this facility via
 * dax_copy_from_iter() for ensuring that writes to persistent memory
 * are flushed through the CPU cache. It is differentiated from
 * _copy_from_iter_nocache() in that it guarantees all data is flushed for
 * all iterator types. The _copy_from_iter_nocache() only attempts to
 * bypass the cache for the ITER_IOVEC case, and on some archs may use
 * instructions that strand dirty-data in the cache.
 *
 * Return: number of bytes copied (may be %0)
 */
size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i)
{
        const bool is_source = i->data_source;

        /* The iterator must be a data source. */
        if (WARN_ON_ONCE(!is_source))
                return 0;

        return iterate_and_advance(i, bytes, addr,
                                   copy_from_user_iter_flushcache,
                                   memcpy_from_iter_flushcache);
}
EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache);
#endif

/*
 * Check that the byte range [@offset, @offset + @n) is acceptable for a
 * copy involving @page.  Returns true if so; otherwise warns and returns
 * false.
 */
static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
{
        size_t end = n + offset;
        struct page *head;

        /*
         * Fast path: a range that does not wrap and ends within PAGE_SIZE
         * fits every page order, so the compound head (and the possible
         * cache line miss of looking up its order) can be skipped.
         */
        if (end >= n && end <= PAGE_SIZE)
                return true;

        /* General case: measure the range from the start of the head page. */
        head = compound_head(page);
        end += (page - head) << PAGE_SHIFT;

        return !WARN_ON(n > end || end > page_size(head));
}

/*
 * Copy @bytes bytes starting @offset bytes into @page to the iterator @i,
 * one PAGE_SIZE-bounded chunk at a time.  Returns the number of bytes
 * copied; 0 if the range fails page_copy_sane() or @i is a data source.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Step to the subpage that contains @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                void *map = kmap_local_page(page);
                size_t done = _copy_to_iter(map + offset, chunk, i);

                kunmap_local(map);
                total += done;
                bytes -= done;
                /* Finished, or the iterator took nothing more. */
                if (!bytes || !done)
                        break;

                offset += done;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return total;
}
EXPORT_SYMBOL(copy_page_to_iter);

/*
 * Variant of copy_page_to_iter() whose user-backed step is
 * copy_to_user_iter_nofault().  Returns the number of bytes copied; 0 if
 * the range fails page_copy_sane() or @i is a data source.
 */
size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes,
                                 struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(i->data_source))
                return 0;

        /* Step to the subpage that contains @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                void *map = kmap_local_page(page);
                size_t done;

                done = iterate_and_advance(i, chunk, map + offset,
                                           copy_to_user_iter_nofault,
                                           memcpy_to_iter);
                kunmap_local(map);
                total += done;
                bytes -= done;
                /* Finished, or the iterator took nothing more. */
                if (!bytes || !done)
                        break;

                offset += done;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return total;
}
EXPORT_SYMBOL(copy_page_to_iter_nofault);

/*
 * Copy @bytes bytes from the iterator @i into @page starting @offset bytes
 * in, one PAGE_SIZE-bounded chunk at a time.  Returns the number of bytes
 * copied; 0 if the range fails page_copy_sane().
 */
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
                         struct iov_iter *i)
{
        size_t total = 0;

        if (!page_copy_sane(page, offset, bytes))
                return 0;

        /* Step to the subpage that contains @offset. */
        page += offset / PAGE_SIZE;
        offset %= PAGE_SIZE;

        for (;;) {
                size_t chunk = min(bytes, (size_t)PAGE_SIZE - offset);
                void *map = kmap_local_page(page);
                size_t done = _copy_from_iter(map + offset, chunk, i);

                kunmap_local(map);
                total += done;
                bytes -= done;
                /* Finished, or the iterator supplied nothing more. */
                if (!bytes || !done)
                        break;

                offset += done;
                if (offset == PAGE_SIZE) {
                        page++;
                        offset = 0;
                }
        }
        return total;
}
EXPORT_SYMBOL(copy_page_from_iter);

/*
 * Step function for user-backed destinations: clear @len bytes at
 * @iter_to with clear_user() and return its result.
 */
static __always_inline
size_t zero_to_user_iter(void __user *iter_to, size_t progress,
                         size_t len, void *priv, void *priv2)
{
        size_t left = clear_user(iter_to, len);

        return left;
}

/*
 * Step function for kernel-memory destinations: zero @len bytes at
 * @iter_to.  Always returns 0 (nothing left over).
 */
static __always_inline
size_t zero_to_iter(void *iter_to, size_t progress,
                    size_t len, void *priv, void *priv2)
{
        const int fill = 0;

        memset(iter_to, fill, len);
        return 0;
}

/*
 * Zero @bytes bytes of the iterator @i, advancing it, and return what
 * iterate_and_advance() reports.
 */
size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
        size_t done;

        /* No source buffer is needed, hence the NULL private pointer. */
        done = iterate_and_advance(i, bytes, NULL,
                                   zero_to_user_iter, zero_to_iter);
        return done;
}
EXPORT_SYMBOL(iov_iter_zero);

/*
 * Copy up to @bytes bytes from the source iterator @i into @page, starting
 * @offset bytes in, using kmap_atomic() mappings.
 *
 * Returns the number of bytes copied, or 0 if page_copy_sane() rejects the
 * range or @i is not flagged as a data source.
 */
size_t copy_page_from_iter_atomic(struct page *page, size_t offset,
                size_t bytes, struct iov_iter *i)
{
        size_t n, copied = 0;
        /*
         * When set, each pass is limited to the remainder of a single
         * PAGE_SIZE page and the loop below repeats; otherwise the whole
         * request is copied in one pass.
         */
        bool uses_kmap = IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) ||
                         PageHighMem(page);

        if (!page_copy_sane(page, offset, bytes))
                return 0;
        if (WARN_ON_ONCE(!i->data_source))
                return 0;

        do {
                char *p;

                n = bytes - copied;
                if (uses_kmap) {
                        /* Renormalise to the subpage holding @offset. */
                        page += offset / PAGE_SIZE;
                        offset %= PAGE_SIZE;
                        n = min_t(size_t, n, PAGE_SIZE - offset);
                }

                p = kmap_atomic(page) + offset;
                n = __copy_from_iter(p, n, i);
                kunmap_atomic(p);
                copied += n;
                offset += n;
                /* Stop once everything is copied or a pass copied nothing. */
        } while (uses_kmap && copied != bytes && n > 0);

        return copied;
}
EXPORT_SYMBOL(copy_page_from_iter_atomic);

/*
 * Advance a bvec-backed iterator by @size bytes: reduce the count, then
 * step over every fully consumed bio_vec and record the offset into the
 * one that remains current.  Does nothing if the iterator is empty.
 */
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
{
        const struct bio_vec *cur = i->bvec;
        const struct bio_vec *end = cur + i->nr_segs;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure from the beginning of the current segment. */
        size += i->iov_offset;

        while (cur < end && unlikely(size >= cur->bv_len)) {
                size -= cur->bv_len;
                cur++;
        }

        i->iov_offset = size;
        i->nr_segs -= cur - i->bvec;
        i->bvec = cur;
}

/*
 * Advance an iovec-style iterator by @size bytes: reduce the count, then
 * step over every fully consumed segment and record the offset into the
 * one that remains current.  Does nothing if the iterator is empty.
 */
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
{
        const struct iovec *cur = iter_iov(i);
        const struct iovec *end = cur + i->nr_segs;

        if (!i->count)
                return;
        i->count -= size;

        /* Measure from the beginning of the current segment. */
        size += i->iov_offset;

        while (cur < end && unlikely(size >= cur->iov_len)) {
                size -= cur->iov_len;
                cur++;
        }

        i->iov_offset = size;
        i->nr_segs -= cur - iter_iov(i);
        i->__iov = cur;
}

/*
 * Advance a folio-queue iterator by @size bytes: reduce the count, then
 * walk forward over whole folios (and on to following queue segments)
 * until the remaining distance falls inside a folio or reaches zero.
 * Does nothing if the iterator is empty.
 */
static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
{
        const struct folio_queue *folioq = i->folioq;
        unsigned int slot = i->folioq_slot;

        if (!i->count)
                return;
        i->count -= size;

        /* The stored slot may sit just past the end of its segment. */
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = folioq->next;
                slot = 0;
        }

        size += i->iov_offset; /* From beginning of current segment. */
        do {
                size_t fsize = folioq_folio_size(folioq, slot);

                /* Target position lies within this folio. */
                if (likely(size < fsize))
                        break;
                size -= fsize;
                slot++;
                /* Only hop to the next segment if there is one. */
                if (slot >= folioq_nr_slots(folioq) && folioq->next) {
                        folioq = folioq->next;
                        slot = 0;
                }
        } while (size);

        i->iov_offset = size;
        i->folioq_slot = slot;
        i->folioq = folioq;
}

/*
 * Advance the iterator @i by @size bytes, clamped to the bytes remaining,
 * dispatching on the iterator type.
 */
void iov_iter_advance(struct iov_iter *i, size_t size)
{
        /* Never move past the end of the iterator. */
        if (unlikely(size > i->count))
                size = i->count;

        if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) {
                /* No segment array to walk: just slide the offset. */
                i->iov_offset += size;
                i->count -= size;
                return;
        }
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) {
                /* iovec and kvec have identical layouts */
                iov_iter_iovec_advance(i, size);
                return;
        }
        if (iov_iter_is_bvec(i)) {
                iov_iter_bvec_advance(i, size);
                return;
        }
        if (iov_iter_is_folioq(i)) {
                iov_iter_folioq_advance(i, size);
                return;
        }
        if (iov_iter_is_discard(i))
                i->count -= size;
}
EXPORT_SYMBOL(iov_iter_advance);

/*
 * Move a folio-queue iterator backwards by @unroll bytes, counted from the
 * start of the current slot: step to the previous folio (crossing into the
 * previous queue segment when at slot 0) until one is large enough to
 * absorb the rest of @unroll.  The caller adjusts i->count.
 */
static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll)
{
        const struct folio_queue *folioq = i->folioq;
        unsigned int slot = i->folioq_slot;

        for (;;) {
                size_t fsize;

                /* At the first slot: continue from the end of the previous segment. */
                if (slot == 0) {
                        folioq = folioq->prev;
                        slot = folioq_nr_slots(folioq);
                }
                slot--;

                fsize = folioq_folio_size(folioq, slot);
                if (unroll <= fsize) {
                        /* The new position lies inside this folio. */
                        i->iov_offset = fsize - unroll;
                        break;
                }
                unroll -= fsize;
        }

        i->folioq_slot = slot;
        i->folioq = folioq;
}

/*
 * Move the iterator @i backwards by @unroll bytes, undoing an earlier
 * advance.  A zero @unroll is a no-op; a value above MAX_RW_COUNT warns
 * and is ignored.
 */
void iov_iter_revert(struct iov_iter *i, size_t unroll)
{
        if (!unroll)
                return;
        if (WARN_ON(unroll > MAX_RW_COUNT))
                return;
        i->count += unroll;
        /* A discard iterator has no position beyond its count. */
        if (unlikely(iov_iter_is_discard(i)))
                return;
        /* Easy case: the new position is still inside the current segment. */
        if (unroll <= i->iov_offset) {
                i->iov_offset -= unroll;
                return;
        }
        /* Otherwise consume the current offset and walk back over earlier segments. */
        unroll -= i->iov_offset;
        if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
                BUG(); /* We should never go beyond the start of the specified
                        * range since we might then be straying into pages that
                        * aren't pinned.
                        */
        } else if (iov_iter_is_bvec(i)) {
                const struct bio_vec *bvec = i->bvec;
                while (1) {
                        /* Step to the previous bio_vec and take its length. */
                        size_t n = (--bvec)->bv_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->bvec = bvec;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        } else if (iov_iter_is_folioq(i)) {
                i->iov_offset = 0;
                iov_iter_folioq_revert(i, unroll);
        } else { /* same logic for iovec and kvec */
                const struct iovec *iov = iter_iov(i);
                while (1) {
                        /* Step to the previous segment and take its length. */
                        size_t n = (--iov)->iov_len;
                        i->nr_segs++;
                        if (unroll <= n) {
                                i->__iov = iov;
                                i->iov_offset = n - unroll;
                                return;
                        }
                        unroll -= n;
                }
        }
}
EXPORT_SYMBOL(iov_iter_revert);

/*
 * Return the count of just the current iov_iter segment.
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
        if (i->nr_segs > 1) {
                if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                        return min(i->count, iter_iov(i)->iov_len - i->iov_offset);
                if (iov_iter_is_bvec(i))
                        return min(i->count, i->bvec->bv_len - i->iov_offset);
        }
        if (unlikely(iov_iter_is_folioq(i)))
                return !i->count ? 0 :
                        umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count);
        return i->count;
}
EXPORT_SYMBOL(iov_iter_single_seg_count);

/*
 * Initialise @i as an ITER_KVEC iterator over the @nr_segs entries of
 * @kvec, covering @count bytes and positioned at offset 0.
 */
void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
                        const struct kvec *kvec, unsigned long nr_segs,
                        size_t count)
{
        /* Only the READ and WRITE bits are expected in @direction. */
        WARN_ON(direction & ~(READ | WRITE));

        *i = (struct iov_iter) {
                .iter_type = ITER_KVEC,
                .data_source = direction,
                .iov_offset = 0,
                .count = count,
                .kvec = kvec,
                .nr_segs = nr_segs,
        };
}
EXPORT_SYMBOL(iov_iter_kvec);

/*
 * Initialise @i as an ITER_BVEC iterator over the @nr_segs entries of
 * @bvec, covering @count bytes and positioned at offset 0.
 */
void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
                        const struct bio_vec *bvec, unsigned long nr_segs,
                        size_t count)
{
        /* Only the READ and WRITE bits are expected in @direction. */
        WARN_ON(direction & ~(READ | WRITE));

        *i = (struct iov_iter) {
                .iter_type = ITER_BVEC,
                .data_source = direction,
                .iov_offset = 0,
                .count = count,
                .bvec = bvec,
                .nr_segs = nr_segs,
        };
}
EXPORT_SYMBOL(iov_iter_bvec);

/**
 * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @folioq: The starting point in the folio queue.
 * @first_slot: The first slot in the folio queue to use
 * @offset: The offset into the folio in the first slot to start at
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
                          const struct folio_queue *folioq, unsigned int first_slot,
                          unsigned int offset, size_t count)
{
        /* Any bit other than bit 0 in @direction is a fatal caller error. */
        BUG_ON(direction & ~1);

        *i = (struct iov_iter) {
                .iter_type = ITER_FOLIOQ,
                .data_source = direction,
                .iov_offset = offset,
                .count = count,
                .folioq = folioq,
                .folioq_slot = first_slot,
        };
}
EXPORT_SYMBOL(iov_iter_folio_queue);

/**
 * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @xarray: The xarray to access.
 * @start: The start file position.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator to either draw data out of the pages attached to an
 * inode or to inject data into those pages.  The pages *must* be prevented
 * from evaporation, either by taking a ref on them or locking them by the
 * caller.
 */
void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
                     struct xarray *xarray, loff_t start, size_t count)
{
        /* Any bit other than bit 0 in @direction is a fatal caller error. */
        BUG_ON(direction & ~1);

        *i = (struct iov_iter) {
                .iter_type = ITER_XARRAY,
                .data_source = direction,
                .iov_offset = 0,
                .count = count,
                .xarray = xarray,
                .xarray_start = start,
        };
}
EXPORT_SYMBOL(iov_iter_xarray);

/**
 * iov_iter_discard - Initialise an I/O iterator that discards data
 * @i: The iterator to initialise.
 * @direction: The direction of the transfer.
 * @count: The size of the I/O buffer in bytes.
 *
 * Set up an I/O iterator that just discards everything that's written to it.
 * It's only available as a READ iterator.
 */
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
{
        /* A discard iterator can only be a READ destination. */
        BUG_ON(direction != READ);

        *i = (struct iov_iter) {
                .iter_type = ITER_DISCARD,
                .data_source = false,
                .iov_offset = 0,
                .count = count,
        };
}
EXPORT_SYMBOL(iov_iter_discard);

/*
 * Walk the remaining iovec segments of @i and return false as soon as a
 * segment's (clamped) length has a bit in @len_mask or its start address
 * has a bit in @addr_mask; return true if none does.
 */
static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask,
                                   unsigned len_mask)
{
        const struct iovec *seg = iter_iov(i);
        size_t remaining = i->count;
        size_t skip = i->iov_offset;

        do {
                /* Usable part of this segment, capped at what is left. */
                size_t len = seg->iov_len - skip;
                unsigned long start = (unsigned long)(seg->iov_base + skip);

                if (len > remaining)
                        len = remaining;
                if ((len & len_mask) || (start & addr_mask))
                        return false;

                remaining -= len;
                seg++;
                skip = 0;
        } while (remaining);

        return true;
}

/*
 * Walk the remaining bio_vec segments of @i and return false as soon as a
 * segment's (clamped) length has a bit in @len_mask or its starting offset
 * has a bit in @addr_mask; return true if none does.
 */
static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
                                  unsigned len_mask)
{
        const struct bio_vec *bvec = i->bvec;
        unsigned skip = i->iov_offset;
        size_t size = i->count;

        do {
                /*
                 * Only the part of the first segment past iov_offset belongs
                 * to the iterator.  Without subtracting @skip the length
                 * check tests the wrong value and @size drains too early, so
                 * later segments could escape the checks.  This matches
                 * iov_iter_aligned_iovec() and iov_iter_alignment_bvec().
                 */
                size_t len = bvec->bv_len - skip;

                if (len > size)
                        len = size;
                if (len & len_mask)
                        return false;
                if ((unsigned long)(bvec->bv_offset + skip) & addr_mask)
                        return false;

                bvec++;
                size -= len;
                skip = 0;
        } while (size);

        return true;
}

/**
 * iov_iter_is_aligned() - Check if the addresses and lengths of each segments
 *         are aligned to the parameters.
 *
 * @i: &struct iov_iter to check
 * @addr_mask: bit mask to check against the iov element's addresses
 * @len_mask: bit mask to check against the iov element's lengths
 *
 * Return: false if any addresses or lengths intersect with the provided masks
 */
bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
                         unsigned len_mask)
{
        if (likely(iter_is_ubuf(i)))
                return !(i->count & len_mask) &&
                       !((unsigned long)(i->ubuf + i->iov_offset) & addr_mask);

        /* iovec and kvec share one walker; bvec has its own. */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_aligned_iovec(i, addr_mask, len_mask);

        if (iov_iter_is_bvec(i))
                return iov_iter_aligned_bvec(i, addr_mask, len_mask);

        /* With both xarray and folioq types, we're dealing with whole folios. */
        if (iov_iter_is_xarray(i) &&
            ((i->count & len_mask) ||
             ((i->xarray_start + i->iov_offset) & addr_mask)))
                return false;

        if (iov_iter_is_folioq(i) &&
            ((i->count & len_mask) || (i->iov_offset & addr_mask)))
                return false;

        return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);

/*
 * OR together the start address and (clamped) length of every non-empty
 * remaining iovec segment of @i and return the accumulated bits.
 */
static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i)
{
        const struct iovec *seg = iter_iov(i);
        size_t remaining = i->count;
        size_t skip = i->iov_offset;
        unsigned long bits = 0;

        do {
                size_t len = seg->iov_len - skip;

                /* Empty segments contribute nothing. */
                if (len) {
                        bits |= (unsigned long)seg->iov_base + skip;
                        if (len > remaining)
                                len = remaining;
                        bits |= len;
                        remaining -= len;
                }
                seg++;
                skip = 0;
        } while (remaining);

        return bits;
}

/*
 * OR together the starting offset and (clamped) length of every remaining
 * bio_vec segment of @i and return the accumulated bits.
 */
static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i)
{
        const struct bio_vec *seg = i->bvec;
        size_t remaining = i->count;
        unsigned skip = i->iov_offset;
        unsigned bits = 0;

        do {
                size_t len = seg->bv_len - skip;

                if (len > remaining)
                        len = remaining;
                bits |= (unsigned long)seg->bv_offset + skip;
                bits |= len;

                remaining -= len;
                seg++;
                skip = 0;
        } while (remaining);

        return bits;
}

/*
 * Return the OR of the addresses/offsets and lengths that make up the
 * remaining part of @i, dispatching on the iterator type.  Types not
 * handled below yield 0.
 */
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
        if (likely(iter_is_ubuf(i))) {
                size_t size = i->count;

                /* An empty buffer imposes no alignment at all. */
                return size ? ((unsigned long)i->ubuf + i->iov_offset) | size : 0;
        }

        /* iovec and kvec have identical layouts */
        if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                return iov_iter_alignment_iovec(i);

        if (iov_iter_is_bvec(i))
                return iov_iter_alignment_bvec(i);

        /* With both xarray and folioq types, we're dealing with whole folios. */
        if (iov_iter_is_folioq(i))
                return i->iov_offset | i->count;
        if (iov_iter_is_xarray(i))
                return (i->xarray_start + i->iov_offset) | i->count;

        return 0;
}
EXPORT_SYMBOL(iov_iter_alignment);

unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
{
        unsigned long res = 0;
        unsigned long v = 0;
        size_t size = i->count;
        unsigned k;

        if (iter_is_ubuf(i))
                return 0;

        if (WARN_ON(!iter_is_iovec(i)))
                return ~0U;

        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *iov = iter_iov(i) + k;
                if (iov->iov_len) {
                        unsigned long base = (unsigned long)iov->iov_base;
                        if (v) // if not the first one
                                res |= base | v; // this start | previous end
                        v = base + iov->iov_len;
                        if (size <= iov->iov_len)
                                break;
                        size -= iov->iov_len;
                }
        }
        return res;
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

/*
 * Work out how many page pointers are needed to cover @size bytes starting
 * @start bytes into the first page, capped at @maxpages.  If *@res is NULL
 * an array of that many pointers is allocated with kvmalloc_array().
 *
 * Returns the page count, or 0 if the allocation failed.
 */
static int want_pages_array(struct page ***res, size_t size,
                            size_t start, unsigned int maxpages)
{
        unsigned int npages = DIV_ROUND_UP(size + start, PAGE_SIZE);

        if (npages > maxpages)
                npages = maxpages;
        WARN_ON(!npages);       // caller should've prevented that

        /* Caller supplied its own array: nothing to allocate. */
        if (*res)
                return npages;

        *res = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
        return *res ? npages : 0;
}

/*
 * Collect pages from an ITER_FOLIOQ iterator, taking a reference on each
 * page with get_page(), and advance the iterator past the bytes covered.
 *
 * @iter: the folio-queue iterator to take pages from
 * @ppages: page array; allocated by want_pages_array() if *@ppages is NULL
 * @maxsize: maximum number of bytes to cover
 * @maxpages: maximum number of pages to return
 * @_start_offset: set to the offset of the data within the first page
 *
 * Returns the number of bytes covered, -EIO if the iterator sits past the
 * last slot of its segment with a non-zero offset, or -ENOMEM if the page
 * array could not be obtained.
 */
static ssize_t iter_folioq_get_pages(struct iov_iter *iter,
                                     struct page ***ppages, size_t maxsize,
                                     unsigned maxpages, size_t *_start_offset)
{
        const struct folio_queue *folioq = iter->folioq;
        struct page **pages;
        unsigned int slot = iter->folioq_slot;
        size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset;

        /*
         * The saved slot may point one past the end of the current segment;
         * in that case start from slot 0 of the next one.
         * NOTE(review): folioq->next is not checked for NULL here - presumably
         * callers guarantee a following segment exists; confirm.
         */
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = folioq->next;
                slot = 0;
                if (WARN_ON(iov_offset != 0))
                        return -EIO;
        }

        maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages);
        if (!maxpages)
                return -ENOMEM;
        *_start_offset = iov_offset & ~PAGE_MASK;
        pages = *ppages;

        /* Each pass emits at most one page of the folio in the current slot */
        for (;;) {
                struct folio *folio = folioq_folio(folioq, slot);
                size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot);
                /* bytes from offset up to the end of the page containing it */
                size_t part = PAGE_SIZE - offset % PAGE_SIZE;

                if (offset < fsize) {
                        /* also bounded by what is left of maxsize and of the folio */
                        part = umin(part, umin(maxsize - extracted, fsize - offset));
                        count -= part;
                        iov_offset += part;
                        extracted += part;

                        *pages = folio_page(folio, offset / PAGE_SIZE);
                        get_page(*pages);
                        pages++;
                        maxpages--;
                }

                if (maxpages == 0 || extracted >= maxsize)
                        break;

                /* Folio used up: step to the next slot, or the next segment */
                if (iov_offset >= fsize) {
                        iov_offset = 0;
                        slot++;
                        if (slot == folioq_nr_slots(folioq) && folioq->next) {
                                folioq = folioq->next;
                                slot = 0;
                        }
                }
        }

        /* Only now commit the new position back to the iterator */
        iter->count = count;
        iter->iov_offset = iov_offset;
        iter->folioq = folioq;
        iter->folioq_slot = slot;
        return extracted;
}

/*
 * Fill @pages with up to @nr_pages pages found in @xa starting at @index,
 * taking a reference on each with get_page().  The walk is done under
 * rcu_read_lock() and ends at the first empty entry or once @nr_pages
 * pages have been stored.
 *
 * Returns the number of entries written to @pages.
 */
static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa,
                                          pgoff_t index, unsigned int nr_pages)
{
        XA_STATE(xas, xa, index);
        struct page *page;
        unsigned int ret = 0;

        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                if (xas_retry(&xas, page))
                        continue;

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                /* Store the subpage that corresponds to the current index */
                pages[ret] = find_subpage(page, xas.xa_index);
                get_page(pages[ret]);
                if (++ret == nr_pages)
                        break;
        }
        rcu_read_unlock();
        return ret;
}

/*
 * Collect referenced pages from an ITER_XARRAY iterator and advance it.
 *
 * *@_start_offset receives the offset of the data within the first page.
 * Returns the number of bytes covered by the pages found, 0 if no page
 * was found at the current position, or -ENOMEM if the page array could
 * not be obtained.
 */
static ssize_t iter_xarray_get_pages(struct iov_iter *i,
                                     struct page ***pages, size_t maxsize,
                                     unsigned maxpages, size_t *_start_offset)
{
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t index = pos >> PAGE_SHIFT;
        unsigned offset = pos & ~PAGE_MASK;
        unsigned wanted, found;

        *_start_offset = offset;

        wanted = want_pages_array(pages, maxsize, offset, maxpages);
        if (!wanted)
                return -ENOMEM;

        found = iter_xarray_populate_pages(*pages, i->xarray, index, wanted);
        if (!found)
                return 0;

        /* The pages found may cover less than what was asked for */
        maxsize = min_t(size_t, found * PAGE_SIZE - offset, maxsize);
        i->count -= maxsize;
        i->iov_offset += maxsize;
        return maxsize;
}

/*
 * Return the user address at the current position of a non-empty ITER_UBUF
 * or ITER_IOVEC iterator.  For ITER_IOVEC, *@size is additionally clamped
 * to what is left of the first non-empty segment.
 */
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
{
        size_t skip;
        long k;

        if (iter_is_ubuf(i))
                return (unsigned long)i->ubuf + i->iov_offset;

        /* iov_offset only applies to the very first segment */
        skip = i->iov_offset;
        for (k = 0; k < i->nr_segs; k++) {
                const struct iovec *iov = iter_iov(i) + k;
                size_t avail = iov->iov_len - skip;

                if (likely(avail)) {
                        if (avail < *size)
                                *size = avail;
                        return (unsigned long)iov->iov_base + skip;
                }
                skip = 0;
        }
        BUG(); // if it had been empty, we wouldn't get called
}

/*
 * Locate the current position of a non-empty ITER_BVEC iterator.
 *
 * *@size is clamped to what is left of the current bio_vec, *@start gets
 * the offset within the returned page, and the page that contains the
 * current position is returned.
 */
static struct page *first_bvec_segment(const struct iov_iter *i,
                                       size_t *size, size_t *start)
{
        const struct bio_vec *bv = i->bvec;
        size_t left = bv->bv_len - i->iov_offset;
        size_t pos = i->iov_offset + bv->bv_offset;

        if (left < *size)
                *size = left;

        *start = pos % PAGE_SIZE;
        return bv->bv_page + pos / PAGE_SIZE;
}

/*
 * Common worker for iov_iter_get_pages2() and iov_iter_get_pages_alloc2():
 * collect referenced pages from the current position of @i and advance it.
 *
 * @i: iterator to take pages from
 * @pages: page array; allocated by want_pages_array() if *@pages is NULL
 * @maxsize: maximum number of bytes to cover (also clamped to i->count and
 *           MAX_RW_COUNT)
 * @maxpages: maximum number of pages to return
 * @start: set to the offset of the data within the first page
 *
 * Returns the number of bytes covered, 0 if there is nothing to do, or a
 * negative error code (-ENOMEM if no page array could be obtained, -EFAULT
 * for an iterator type that is not handled here).
 */
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
                   struct page ***pages, size_t maxsize,
                   unsigned int maxpages, size_t *start)
{
        unsigned int n, gup_flags = 0;

        if (maxsize > i->count)
                maxsize = i->count;
        if (!maxsize)
                return 0;
        if (maxsize > MAX_RW_COUNT)
                maxsize = MAX_RW_COUNT;

        if (likely(user_backed_iter(i))) {
                unsigned long addr;
                int res;

                /* Ask for writable pages unless the iterator direction is WRITE */
                if (iov_iter_rw(i) != WRITE)
                        gup_flags |= FOLL_WRITE;
                if (i->nofault)
                        gup_flags |= FOLL_NOFAULT;

                /* Split the user address into page base and in-page offset */
                addr = first_iovec_segment(i, &maxsize);
                *start = addr % PAGE_SIZE;
                addr &= PAGE_MASK;
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                res = get_user_pages_fast(addr, n, gup_flags, *pages);
                if (unlikely(res <= 0))
                        return res;
                /* Fewer pages than requested may have been obtained */
                maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start);
                iov_iter_advance(i, maxsize);
                return maxsize;
        }
        if (iov_iter_is_bvec(i)) {
                struct page **p;
                struct page *page;

                page = first_bvec_segment(i, &maxsize, start);
                n = want_pages_array(pages, maxsize, *start, maxpages);
                if (!n)
                        return -ENOMEM;
                p = *pages;
                /* Take a folio reference per page, except for slab folios */
                for (int k = 0; k < n; k++) {
                        struct folio *folio = page_folio(page + k);
                        p[k] = page + k;
                        if (!folio_test_slab(folio))
                                folio_get(folio);
                }
                maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start);
                i->count -= maxsize;
                i->iov_offset += maxsize;
                /* Current bio_vec fully consumed: move on to the next one */
                if (i->iov_offset == i->bvec->bv_len) {
                        i->iov_offset = 0;
                        i->bvec++;
                        i->nr_segs--;
                }
                return maxsize;
        }
        if (iov_iter_is_folioq(i))
                return iter_folioq_get_pages(i, pages, maxsize, maxpages, start);
        if (iov_iter_is_xarray(i))
                return iter_xarray_get_pages(i, pages, maxsize, maxpages, start);
        return -EFAULT;
}

/*
 * Collect pages from @i into the caller-supplied array @pages, which must
 * not be NULL.  Returns 0 straight away when @maxpages is 0; otherwise the
 * result of __iov_iter_get_pages_alloc() is passed through.
 */
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
                size_t maxsize, unsigned maxpages, size_t *start)
{
        if (maxpages == 0)
                return 0;

        /* A NULL array would ask the worker to allocate one; not allowed here */
        BUG_ON(pages == NULL);

        return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
}
EXPORT_SYMBOL(iov_iter_get_pages2);

/*
 * Collect pages from @i into a freshly allocated array returned via *@pages,
 * with no limit on the number of pages.  If nothing was collected (the
 * result is zero or negative) any array that got allocated is freed again
 * and *@pages is set to NULL.
 */
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
                struct page ***pages, size_t maxsize, size_t *start)
{
        ssize_t len;

        *pages = NULL;

        len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
        if (len > 0)
                return len;

        /* Nothing to hand back: do not leak the page array */
        kvfree(*pages);
        *pages = NULL;
        return len;
}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);

/*
 * Count the pages spanned by the i->count bytes remaining in an iovec-layout
 * iterator, one segment at a time.  As soon as the running total exceeds
 * @maxpages, @maxpages is returned.
 */
static int iov_npages(const struct iov_iter *i, int maxpages)
{
        const struct iovec *iov = iter_iov(i);
        size_t skip = i->iov_offset;
        size_t left = i->count;
        int total = 0;

        while (left) {
                unsigned offs = offset_in_page(iov->iov_base + skip);
                size_t len = min(iov->iov_len - skip, left);

                if (len) {
                        left -= len;
                        total += DIV_ROUND_UP(offs + len, PAGE_SIZE);
                        if (unlikely(total > maxpages))
                                return maxpages;
                }

                /* iov_offset only applies to the first segment */
                skip = 0;
                iov++;
        }
        return total;
}

/*
 * Count the pages spanned by the i->count bytes remaining in an ITER_BVEC
 * iterator, one bio_vec at a time.  As soon as the running total exceeds
 * @maxpages, @maxpages is returned.
 */
static int bvec_npages(const struct iov_iter *i, int maxpages)
{
        const struct bio_vec *bv = i->bvec;
        size_t skip = i->iov_offset;
        size_t left = i->count;
        int total = 0;

        while (left) {
                unsigned offs = (bv->bv_offset + skip) % PAGE_SIZE;
                size_t len = min(bv->bv_len - skip, left);

                left -= len;
                total += DIV_ROUND_UP(offs + len, PAGE_SIZE);
                if (unlikely(total > maxpages))
                        return maxpages;

                /* iov_offset only applies to the first bio_vec */
                skip = 0;
                bv++;
        }
        return total;
}

/*
 * Return the number of pages spanned by the data remaining in @i, capped
 * at @maxpages.  An empty iterator, or one of a type not handled here,
 * yields 0.
 */
int iov_iter_npages(const struct iov_iter *i, int maxpages)
{
        unsigned offs;
        int npages;

        if (unlikely(!i->count))
                return 0;

        /*
         * Segmented iterators are counted segment by segment; for the other
         * types the in-page offset of the current position is enough.
         */
        if (likely(iter_is_ubuf(i)))
                offs = offset_in_page(i->ubuf + i->iov_offset);
        else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i)))
                /* iovec and kvec have identical layouts */
                return iov_npages(i, maxpages);
        else if (iov_iter_is_bvec(i))
                return bvec_npages(i, maxpages);
        else if (iov_iter_is_folioq(i))
                offs = i->iov_offset % PAGE_SIZE;
        else if (iov_iter_is_xarray(i))
                offs = (i->xarray_start + i->iov_offset) % PAGE_SIZE;
        else
                return 0;

        npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE);
        return min(npages, maxpages);
}
EXPORT_SYMBOL(iov_iter_npages);

/*
 * Copy @old into @new.  For ITER_BVEC, ITER_KVEC and ITER_IOVEC iterators
 * the segment array is duplicated with kmemdup() using @flags and @new is
 * pointed at the copy.
 *
 * Returns the duplicated array (NULL if kmemdup() failed), or NULL for
 * iterator types that have no segment array to duplicate.
 */
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
        *new = *old;

        if (iov_iter_is_bvec(new)) {
                new->bvec = kmemdup(new->bvec,
                                    new->nr_segs * sizeof(struct bio_vec),
                                    flags);
                return new->bvec;
        }

        if (iov_iter_is_kvec(new) || iter_is_iovec(new)) {
                /* iovec and kvec have identical layout */
                new->__iov = kmemdup(new->__iov,
                                     new->nr_segs * sizeof(struct iovec),
                                     flags);
                return new->__iov;
        }

        return NULL;
}
EXPORT_SYMBOL(dup_iter);

/*
 * Read @nr_segs compat-layout iovec entries from userspace at @uvec and
 * store them, converted to native struct iovec, in @iov.
 *
 * Returns 0 on success, -EFAULT if user_access_begin() refuses the range
 * or a read does not complete, or -EINVAL if an entry's length is negative
 * when read as compat_ssize_t.
 */
static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uvec, u32 nr_segs)
{
        const struct compat_iovec __user *uiov =
                (const struct compat_iovec __user *)uvec;
        /* stays -EFAULT unless the whole array was processed */
        int ret = -EFAULT;
        u32 i;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        for (i = 0; i < nr_segs; i++) {
                compat_uptr_t buf;
                compat_ssize_t len;

                /* uaccess_end is the label handed to unsafe_get_user() for failures */
                unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);

                /* check for compat_size_t not fitting in compat_ssize_t .. */
                if (len < 0) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov[i].iov_base = compat_ptr(buf);
                iov[i].iov_len = len;
        }

        ret = 0;
uaccess_end:
        /* every exit after user_access_begin() goes through here */
        user_access_end();
        return ret;
}

/*
 * Read @nr_segs native iovec entries from userspace at @uiov into @iov.
 *
 * The do/while loop runs at least once and decrements @nr_segs before
 * testing it, so @nr_segs must not be 0 (the callers visible in this file
 * pass either 1 or a value already checked to be non-zero).
 *
 * Returns 0 on success, -EFAULT if user_access_begin() refuses the range
 * or a read does not complete, or -EINVAL if an entry's length is negative
 * when read as ssize_t.
 */
static __noclone int copy_iovec_from_user(struct iovec *iov,
                const struct iovec __user *uiov, unsigned long nr_segs)
{
        /* stays -EFAULT unless the whole array was processed */
        int ret = -EFAULT;

        if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
                return -EFAULT;

        do {
                void __user *buf;
                ssize_t len;

                /* uaccess_end is the label handed to unsafe_get_user() for failures */
                unsafe_get_user(len, &uiov->iov_len, uaccess_end);
                unsafe_get_user(buf, &uiov->iov_base, uaccess_end);

                /* check for size_t not fitting in ssize_t .. */
                if (unlikely(len < 0)) {
                        ret = -EINVAL;
                        goto uaccess_end;
                }
                iov->iov_base = buf;
                iov->iov_len = len;

                uiov++; iov++;
        } while (--nr_segs);

        ret = 0;
uaccess_end:
        /* every exit after user_access_begin() goes through here */
        user_access_end();
        return ret;
}

/*
 * Copy an array of @nr_segs iovec entries from userspace.
 *
 * @fast_iov (holding @fast_segs entries) is used if it is large enough,
 * otherwise an array is allocated with kmalloc_array().  @compat selects
 * the compat iovec layout for the userspace array.
 *
 * Returns the array that was filled in (which is @fast_iov itself when
 * @nr_segs is 0 or fits), or an ERR_PTR(): -EINVAL if @nr_segs exceeds
 * UIO_MAXIOV, -ENOMEM on allocation failure, or the error from the copy.
 */
struct iovec *iovec_from_user(const struct iovec __user *uvec,
                unsigned long nr_segs, unsigned long fast_segs,
                struct iovec *fast_iov, bool compat)
{
        struct iovec *iov = fast_iov;
        int ret;

        /*
         * SuS says "The readv() function *may* fail if the iovcnt argument
         * was less than or equal to 0, or greater than {IOV_MAX}".  Linux has
         * traditionally returned zero for zero segments, so...
         */
        if (!nr_segs)
                return iov;
        if (nr_segs > UIO_MAXIOV)
                return ERR_PTR(-EINVAL);

        if (nr_segs > fast_segs) {
                iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL);
                if (!iov)
                        return ERR_PTR(-ENOMEM);
        }

        if (unlikely(compat))
                ret = copy_compat_iovec_from_user(iov, uvec, nr_segs);
        else
                ret = copy_iovec_from_user(iov, uvec, nr_segs);
        if (!ret)
                return iov;

        /* Copy failed: release the array, but only if we allocated it */
        if (iov != fast_iov)
                kfree(iov);
        return ERR_PTR(ret);
}

/*
 * Import a single-segment iovec supplied by the user as ITER_UBUF.
 *
 * The one entry is read into the array *@iovp points at, and *@iovp is set
 * to NULL before anything else is done.  Returns the resulting iterator
 * length on success, or the negative error from the copy or from
 * import_ubuf().
 */
static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
                                   struct iovec **iovp, struct iov_iter *i,
                                   bool compat)
{
        struct iovec *seg = *iovp;
        ssize_t err;

        *iovp = NULL;

        if (compat)
                err = copy_compat_iovec_from_user(seg, uvec, 1);
        else
                err = copy_iovec_from_user(seg, uvec, 1);
        if (unlikely(err))
                return err;

        err = import_ubuf(type, seg->iov_base, seg->iov_len, i);
        if (unlikely(err))
                return err;

        return i->count;
}

/*
 * Copy an iovec array from userspace, validate it and initialise @i over it.
 *
 * A single-segment array is imported as ITER_UBUF instead.  On return *@iovp
 * is NULL if the caller's array was used (or on error), otherwise it points
 * at the array that was allocated.
 *
 * Returns the total number of bytes imported, or a negative error code.
 */
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs, struct iovec **iovp,
                 struct iov_iter *i, bool compat)
{
        ssize_t total_len = 0;
        unsigned long seg;
        struct iovec *iov;

        if (nr_segs == 1)
                return __import_iovec_ubuf(type, uvec, iovp, i, compat);

        iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat);
        if (IS_ERR(iov)) {
                *iovp = NULL;
                return PTR_ERR(iov);
        }

        /*
         * According to the Single Unix Specification we should return EINVAL
         * if an element length is < 0 when cast to ssize_t or if the total
         * length would overflow the ssize_t return value of the system call.
         *
         * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
         * overflow case.
         */
        for (seg = 0; seg < nr_segs; seg++) {
                struct iovec *cur = &iov[seg];
                ssize_t len = (ssize_t)cur->iov_len;

                if (!access_ok(cur->iov_base, len))
                        goto efault;

                /* Truncate the segment that would cross MAX_RW_COUNT */
                if (len > MAX_RW_COUNT - total_len) {
                        len = MAX_RW_COUNT - total_len;
                        cur->iov_len = len;
                }
                total_len += len;
        }

        iov_iter_init(i, type, iov, nr_segs, total_len);
        /* Only report an array back if it is one the caller has to free */
        if (iov == *iovp)
                *iovp = NULL;
        else
                *iovp = iov;
        return total_len;

efault:
        if (iov != *iovp)
                kfree(iov);
        *iovp = NULL;
        return -EFAULT;
}

/**
 * import_iovec() - Copy an array of &struct iovec from userspace
 *     into the kernel, check that it is valid, and initialize a new
 *     &struct iov_iter iterator to access it.
 *
 * @type: One of %READ or %WRITE.
 * @uvec: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in the array *@iovp points at.
 * @iovp: (input and output parameter) Pointer to pointer to (usually small
 *     on-stack) kernel array.
 * @i: Pointer to iterator that will be initialized on success.
 *
 * If the array pointed to by *@iovp is large enough to hold all @nr_segs,
 * then this function places %NULL in *@iovp on return. Otherwise, a new
 * array will be allocated and the result placed in *@iovp. This means that
 * the caller may call kfree() on *@iovp regardless of whether the small
 * on-stack array was used or not (and regardless of whether this function
 * returns an error or not).
 *
 * The compat iovec layout is used when in_compat_syscall() says so.
 *
 * Return: Negative error code on error, bytes imported on success
 */
ssize_t import_iovec(int type, const struct iovec __user *uvec,
                 unsigned nr_segs, unsigned fast_segs,
                 struct iovec **iovp, struct iov_iter *i)
{
        bool compat = in_compat_syscall();

        return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, compat);
}
EXPORT_SYMBOL(import_iovec);

/*
 * Initialise @i as an ITER_UBUF iterator over the user buffer @buf.
 *
 * @len is capped at MAX_RW_COUNT.  Returns 0 on success or -EFAULT if
 * access_ok() rejects the (capped) range.
 */
int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
        len = min_t(size_t, len, MAX_RW_COUNT);

        if (unlikely(!access_ok(buf, len)))
                return -EFAULT;

        iov_iter_ubuf(i, rw, buf, len);
        return 0;
}
EXPORT_SYMBOL_GPL(import_ubuf);

/**
 * iov_iter_restore() - Restore a &struct iov_iter to the same state as when
 *     iov_iter_save_state() was called.
 *
 * @i: &struct iov_iter to restore
 * @state: state to restore from
 *
 * Used after iov_iter_save_state() to restore @i, if operations may
 * have advanced it.
 *
 * Note: only works on ITER_IOVEC, ITER_BVEC, ITER_KVEC and ITER_UBUF. Any
 * other iterator type triggers a one-time warning and @i is left untouched.
 */
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
        /*
         * All four type checks belong inside the WARN_ON_ONCE(): with the
         * kvec check outside of it, a perfectly valid ITER_KVEC iterator
         * would raise the warning even though it is then restored normally.
         */
        if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
                         !iter_is_ubuf(i) && !iov_iter_is_kvec(i)))
                return;
        i->iov_offset = state->iov_offset;
        i->count = state->count;
        /* ITER_UBUF has no segment array to rewind */
        if (iter_is_ubuf(i))
                return;
        /*
         * For the *vec iters, nr_segs + iov is constant - if we increment
         * the vec, then we also decrement the nr_segs count. Hence we don't
         * need to track both of these, just one is enough and we can deduce
         * the other from that. ITER_KVEC and ITER_IOVEC are the same struct
         * size, so we can just adjust the iov pointer as they are unionized.
         * ITER_BVEC _may_ be the same size on some archs, but on others it is
         * not. Be safe and handle it separately.
         */
        BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
        if (iov_iter_is_bvec(i))
                i->bvec -= state->nr_segs - i->nr_segs;
        else
                i->__iov -= state->nr_segs - i->nr_segs;
        i->nr_segs = state->nr_segs;
}

/*
 * Extract a list of contiguous pages from an ITER_FOLIOQ iterator.  This does
 * not get references on the pages, nor does it get a pin on them.
 *
 * The iterator is advanced as pages are collected.  *@offset0 receives the
 * offset of the data within the first page.  @extraction_flags is not used
 * by this function.
 *
 * Returns the number of bytes covered, -EIO if the iterator sits past the
 * last slot of its segment with a non-zero offset, or -ENOMEM if the page
 * array could not be obtained.
 */
static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i,
                                             struct page ***pages, size_t maxsize,
                                             unsigned int maxpages,
                                             iov_iter_extraction_t extraction_flags,
                                             size_t *offset0)
{
        const struct folio_queue *folioq = i->folioq;
        struct page **p;
        unsigned int nr = 0;
        size_t extracted = 0, offset, slot = i->folioq_slot;

        /*
         * The saved slot may point one past the end of the current segment;
         * in that case start from slot 0 of the next one.
         * NOTE(review): folioq->next is not checked for NULL here - presumably
         * callers guarantee a following segment exists; confirm.
         */
        if (slot >= folioq_nr_slots(folioq)) {
                folioq = folioq->next;
                slot = 0;
                if (WARN_ON(i->iov_offset != 0))
                        return -EIO;
        }

        offset = i->iov_offset & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* Each pass emits at most one page of the folio in the current slot */
        for (;;) {
                struct folio *folio = folioq_folio(folioq, slot);
                /* this 'offset' (position within the folio) shadows the outer one */
                size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot);
                /* bytes from offset up to the end of the page containing it */
                size_t part = PAGE_SIZE - offset % PAGE_SIZE;

                if (offset < fsize) {
                        /* also bounded by what is left of maxsize and of the folio */
                        part = umin(part, umin(maxsize - extracted, fsize - offset));
                        i->count -= part;
                        i->iov_offset += part;
                        extracted += part;

                        p[nr++] = folio_page(folio, offset / PAGE_SIZE);
                }

                if (nr >= maxpages || extracted >= maxsize)
                        break;

                /* Folio used up: step to the next slot, or the next segment */
                if (i->iov_offset >= fsize) {
                        i->iov_offset = 0;
                        slot++;
                        if (slot == folioq_nr_slots(folioq) && folioq->next) {
                                folioq = folioq->next;
                                slot = 0;
                        }
                }
        }

        i->folioq = folioq;
        i->folioq_slot = slot;
        return extracted;
}

/*
 * Extract a list of contiguous pages from an ITER_XARRAY iterator.  This does not
 * get references on the pages, nor does it get a pin on them.
 *
 * The xarray is walked under rcu_read_lock() from the page containing the
 * current position until @maxpages pages were collected or an empty entry
 * is reached.  *@offset0 receives the offset of the data within the first
 * page.  @extraction_flags is not used by this function.
 *
 * Returns the number of bytes covered (the iterator is advanced by as
 * much), 0 if no page was found at the current position, or -ENOMEM if the
 * page array could not be obtained.
 */
static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i,
                                             struct page ***pages, size_t maxsize,
                                             unsigned int maxpages,
                                             iov_iter_extraction_t extraction_flags,
                                             size_t *offset0)
{
        struct page *page, **p;
        unsigned int nr = 0, offset;
        loff_t pos = i->xarray_start + i->iov_offset;
        pgoff_t index = pos >> PAGE_SHIFT;
        XA_STATE(xas, i->xarray, index);

        offset = pos & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, maxsize, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        rcu_read_lock();
        for (page = xas_load(&xas); page; page = xas_next(&xas)) {
                if (xas_retry(&xas, page))
                        continue;

                /* Has the page moved or been split? */
                if (unlikely(page != xas_reload(&xas))) {
                        xas_reset(&xas);
                        continue;
                }

                p[nr++] = find_subpage(page, xas.xa_index);
                if (nr == maxpages)
                        break;
        }
        rcu_read_unlock();

        /*
         * Nothing was found.  Bail out here: with a non-zero offset the
         * subtraction below would wrap, and the iterator would be advanced
         * by maxsize bytes without a single page having been returned.
         */
        if (!nr)
                return 0;

        maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_BVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 *
 * Pages are taken from consecutive bio_vecs for as long as they join up
 * without a hole.  *@offset0 receives the offset into the first page.
 * @extraction_flags is not used by this function.
 *
 * Returns the number of bytes covered (the iterator is advanced by as
 * much), 0 if the iterator has no segments left, or -ENOMEM if the page
 * array could not be obtained.
 */
static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        size_t skip = i->iov_offset, size = 0;
        struct bvec_iter bi;
        int k = 0;

        if (i->nr_segs == 0)
                return 0;

        /* Current bio_vec already fully consumed: start at the next one */
        if (i->iov_offset == i->bvec->bv_len) {
                i->iov_offset = 0;
                i->nr_segs--;
                i->bvec++;
                skip = 0;
        }
        bi.bi_idx = 0;
        bi.bi_size = maxsize;
        bi.bi_bvec_done = skip;

        /*
         * want_pages_array() returns 0 when it cannot allocate the array, in
         * which case *pages is still NULL and must not be written through.
         */
        maxpages = want_pages_array(pages, maxsize, skip, maxpages);
        if (!maxpages)
                return -ENOMEM;

        while (bi.bi_size && bi.bi_idx < i->nr_segs) {
                struct bio_vec bv = bvec_iter_bvec(i->bvec, bi);

                /*
                 * The iov_iter_extract_pages interface only allows an offset
                 * into the first page.  Break out of the loop if we see an
                 * offset into subsequent pages, the caller will have to call
                 * iov_iter_extract_pages again for the remainder.
                 */
                if (k) {
                        if (bv.bv_offset)
                                break;
                } else {
                        *offset0 = bv.bv_offset;
                }

                (*pages)[k++] = bv.bv_page;
                size += bv.bv_len;

                if (k >= maxpages)
                        break;

                /*
                 * We are done when the end of the bvec doesn't align to a page
                 * boundary as that would create a hole in the returned space.
                 * The caller will handle this with another call to
                 * iov_iter_extract_pages.
                 */
                if (bv.bv_offset + bv.bv_len != PAGE_SIZE)
                        break;

                bvec_iter_advance_single(i->bvec, &bi, bv.bv_len);
        }

        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of virtually contiguous pages from an ITER_KVEC iterator.
 * This does not get references on the pages, nor does it get a pin on them.
 *
 * Only the first segment that still has data in it is looked at.
 * *@offset0 receives the offset of the data within the first page.
 * @extraction_flags is not used by this function.
 *
 * Returns the number of bytes covered (the iterator is advanced by as
 * much), 0 if the iterator has no segments left, or -ENOMEM if the page
 * array could not be obtained.
 */
static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i,
                                           struct page ***pages, size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        size_t skip = i->iov_offset, offset, size;
        const void *kaddr;
        struct page **p;
        int k;

        /* Step over segments that have nothing left in them */
        while (1) {
                if (!i->nr_segs)
                        return 0;
                size = min(maxsize, i->kvec->iov_len - skip);
                if (size)
                        break;
                i->iov_offset = 0;
                i->nr_segs--;
                i->kvec++;
                skip = 0;
        }

        kaddr = i->kvec->iov_base + skip;
        offset = (unsigned long)kaddr & ~PAGE_MASK;
        *offset0 = offset;

        maxpages = want_pages_array(pages, size, offset, maxpages);
        if (!maxpages)
                return -ENOMEM;
        p = *pages;

        /* Walk page by page, starting from the base of the first page */
        for (k = 0, kaddr -= offset; k < maxpages; k++, kaddr += PAGE_SIZE)
                p[k] = is_vmalloc_or_module_addr(kaddr) ?
                        vmalloc_to_page(kaddr) : virt_to_page(kaddr);

        size = min_t(size_t, size, maxpages * PAGE_SIZE - offset);
        iov_iter_advance(i, size);
        return size;
}

/*
 * Extract a list of contiguous pages from a user iterator and get a pin on
 * each of them.  This should only be used if the iterator is user-backed
 * (IOBUF/UBUF).
 *
 * It does not get refs on the pages, but the pages must be unpinned by the
 * caller once the transfer is complete.
 *
 * This is safe to be used where background IO/DMA *is* going to be modifying
 * the buffer; using a pin rather than a ref makes forces fork() to give the
 * child a copy of the page.
 */
static ssize_t iov_iter_extract_user_pages(struct iov_iter *i,
                                           struct page ***pages,
                                           size_t maxsize,
                                           unsigned int maxpages,
                                           iov_iter_extraction_t extraction_flags,
                                           size_t *offset0)
{
        unsigned int gup_flags = 0;
        unsigned long start;
        size_t first_off;
        int pinned;

        /* Derive the GUP flags from the iterator state and the request. */
        if (i->nofault)
                gup_flags |= FOLL_NOFAULT;
        if (extraction_flags & ITER_ALLOW_P2PDMA)
                gup_flags |= FOLL_PCI_P2PDMA;
        if (i->data_source == ITER_DEST)
                gup_flags |= FOLL_WRITE;

        /* Split the starting address into a page base and an in-page offset. */
        start = first_iovec_segment(i, &maxsize);
        first_off = start % PAGE_SIZE;
        *offset0 = first_off;
        start &= PAGE_MASK;

        /* A zero return means no page array is available to fill. */
        maxpages = want_pages_array(pages, maxsize, first_off, maxpages);
        if (!maxpages)
                return -ENOMEM;

        pinned = pin_user_pages_fast(start, maxpages, gup_flags, *pages);
        if (unlikely(pinned <= 0))
                return pinned;

        /* Only advance over the bytes covered by the pages actually pinned. */
        maxsize = min_t(size_t, maxsize, pinned * PAGE_SIZE - first_off);
        iov_iter_advance(i, maxsize);
        return maxsize;
}

/**
 * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator
 * @i: The iterator to extract from
 * @pages: Where to return the list of pages
 * @maxsize: The maximum amount of iterator to extract
 * @maxpages: The maximum size of the list of pages
 * @extraction_flags: Flags to qualify request
 * @offset0: Where to return the starting offset into (*@pages)[0]
 *
 * Extract a list of contiguous pages from the current point of the iterator,
 * advancing the iterator.  The maximum number of pages and the maximum amount
 * of page contents can be set.
 *
 * If *@pages is NULL, a page list will be allocated to the required size and
 * *@pages will be set to its base.  If *@pages is not NULL, it will be assumed
 * that the caller allocated a page list at least @maxpages in size and this
 * will be filled in.
 *
 * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA
 * be allowed on the pages extracted.
 *
 * The iov_iter_extract_will_pin() function can be used to query how cleanup
 * should be performed.
 *
 * Extra refs or pins on the pages may be obtained as follows:
 *
 *  (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be
 *      added to the pages, but refs will not be taken.
 *      iov_iter_extract_will_pin() will return true.
 *
 *  (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the
 *      pages are merely listed; no extra refs or pins are obtained.
 *      iov_iter_extract_will_pin() will return 0.
 *
 * Note also:
 *
 *  (*) Use with ITER_DISCARD is not supported as that has no content.
 *
 * On success, the function sets *@pages to the new pagelist, if allocated, and
 * sets *offset0 to the offset into the first page.
 *
 * It may also return -ENOMEM and -EFAULT.
 */
ssize_t iov_iter_extract_pages(struct iov_iter *i,
                               struct page ***pages,
                               size_t maxsize,
                               unsigned int maxpages,
                               iov_iter_extraction_t extraction_flags,
                               size_t *offset0)
{
        /* Clamp the request to what the iterator holds and to MAX_RW_COUNT. */
        maxsize = min_t(size_t, maxsize, i->count);
        maxsize = min_t(size_t, maxsize, MAX_RW_COUNT);
        if (maxsize == 0)
                return 0;

        /* Hand off to the extractor matching the iterator type. */
        if (likely(user_backed_iter(i)))
                return iov_iter_extract_user_pages(i, pages, maxsize, maxpages,
                                                   extraction_flags, offset0);

        if (iov_iter_is_kvec(i))
                return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages,
                                                   extraction_flags, offset0);

        if (iov_iter_is_bvec(i))
                return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages,
                                                   extraction_flags, offset0);

        if (iov_iter_is_folioq(i))
                return iov_iter_extract_folioq_pages(i, pages, maxsize, maxpages,
                                                     extraction_flags, offset0);

        if (iov_iter_is_xarray(i))
                return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages,
                                                     extraction_flags, offset0);

        /* No extractor for this iterator type. */
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(iov_iter_extract_pages);





























































  179 



  179 
    3 


  179 




  179 

















































































  159 










    3 





    3 
    3 











    3 




  159 



    3 




  206 






    3 







  159 

















  206 









  206 




  206 


  206 







  206 




   67 




  179 







  179 



































    3 








    3 
























    3 

























































































































































































































   67 









   68 



   68 














    6 




    6 





    6 




   67 






   68 

   68 
    6 




   67 





   68 

    5 
   68 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
// SPDX-License-Identifier: GPL-2.0
/*
 * Implementation of the SID table type.
 *
 * Original author: Stephen Smalley, <stephen.smalley.work@gmail.com>
 * Author: Ondrej Mosnacek, <omosnacek@gmail.com>
 *
 * Copyright (C) 2018 Red Hat, Inc.
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <asm/barrier.h>
#include "flask.h"
#include "security.h"
#include "sidtab.h"
#include "services.h"

/*
 * One cached string for a sidtab entry.  Instances are linked into the
 * sidtab's cache_lru_list and pointed to by their owning entry's ->cache.
 */
struct sidtab_str_cache {
        /* handed to kfree_rcu() when the cache entry is evicted */
        struct rcu_head rcu_member;
        /* linkage in the sidtab's cache_lru_list */
        struct list_head lru_member;
        /* the sidtab entry whose ->cache points at this object */
        struct sidtab_entry *parent;
        /* number of bytes stored in str[] */
        u32 len;
        char str[] __counted_by(len);
};

/*
 * SIDs 1..SECINITSID_NUM are initial SIDs kept in the isids[] array; entries
 * in the tree are numbered from index 0, which maps to SID SECINITSID_NUM + 1.
 */
#define index_to_sid(index) ((index) + SECINITSID_NUM + 1)
#define sid_to_index(sid)   ((sid) - (SECINITSID_NUM + 1))

/*
 * Put @s into its empty state: no tree roots, no initial SIDs set, zero
 * entries, not frozen, no conversion in progress, and an empty
 * context_to_sid hashtable.  The string cache is reset as well when it is
 * compiled in.
 *
 * Always returns 0.
 */
int sidtab_init(struct sidtab *s)
{
        u32 isid;

        s->count = 0;
        s->frozen = false;
        s->convert = NULL;

        memset(s->roots, 0, sizeof(s->roots));

        /* mark every initial SID slot as unset */
        for (isid = 0; isid < SECINITSID_NUM; isid++)
                s->isids[isid].set = 0;

        hash_init(s->context_to_sid);
        spin_lock_init(&s->lock);

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        INIT_LIST_HEAD(&s->cache_lru_list);
        spin_lock_init(&s->cache_lock);
        s->cache_free_slots = CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE;
#endif

        return 0;
}

/*
 * Look up @context in the context_to_sid hashtable under the RCU read lock.
 * @hash is the caller-computed hash of @context.
 *
 * Returns the SID of the matching entry, or 0 if there is none.
 */
static u32 context_to_sid(struct sidtab *s, struct context *context, u32 hash)
{
        struct sidtab_entry *cur;
        u32 found = 0;

        rcu_read_lock();
        hash_for_each_possible_rcu(s->context_to_sid, cur, list, hash) {
                /* compare the cheap hash first, the full context second */
                if (cur->hash == hash &&
                    context_equal(&cur->context, context)) {
                        found = cur->sid;
                        break;
                }
        }
        rcu_read_unlock();

        return found;
}

/*
 * Install @context as the context of initial SID @sid.
 *
 * Returns -EINVAL if @sid is 0 or greater than SECINITSID_NUM, the error from
 * context_cpy() if the copy fails, and 0 otherwise.
 */
int sidtab_set_initial(struct sidtab *s, u32 sid, struct context *context)
{
        struct sidtab_isid_entry *isid;
        u32 hash;
        int rc;

        if (!sid || sid > SECINITSID_NUM)
                return -EINVAL;

        isid = &s->isids[sid - 1];

        rc = context_cpy(&isid->entry.context, context);
        if (rc)
                return rc;

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        isid->entry.cache = NULL;
#endif
        isid->set = 1;

        hash = context_compute_hash(context);

        /*
         * Several initial SIDs may share one context.  If this context is
         * already in the context_to_sid hashtable, do not add it again: that
         * would create duplicate entries and lengthen the collision chains.
         */
        if (context_to_sid(s, context, hash))
                return 0;

        isid->entry.sid = sid;
        isid->entry.hash = hash;
        hash_add(s->context_to_sid, &isid->entry.list, hash);

        return 0;
}

/*
 * Write occupancy statistics for the context_to_sid hashtable into @page:
 * the number of entries, the number of non-empty buckets and the length of
 * the longest bucket chain.  At most PAGE_SIZE bytes are written.
 *
 * Returns the value returned by scnprintf().
 */
int sidtab_hash_stats(struct sidtab *sidtab, char *page)
{
        unsigned int i;
        int chain_len = 0;
        int slots_used = 0;
        int entries = 0;
        int max_chain_len = 0;
        unsigned int cur_bucket = 0;
        struct sidtab_entry *entry;

        rcu_read_lock();
        hash_for_each_rcu(sidtab->context_to_sid, i, entry, list) {
                entries++;
                if (i != cur_bucket) {
                        /* moved to another bucket: close out the old chain */
                        if (chain_len > max_chain_len)
                                max_chain_len = chain_len;
                        cur_bucket = i;
                        chain_len = 0;
                }
                /*
                 * Count this entry in the current chain.  The first entry
                 * seen in a bucket is also what marks that bucket as used,
                 * so it must be counted for every bucket, not just bucket 0.
                 */
                if (++chain_len == 1)
                        slots_used++;
        }
        rcu_read_unlock();

        /* account for the chain of the last bucket visited */
        if (chain_len > max_chain_len)
                max_chain_len = chain_len;

        return scnprintf(page, PAGE_SIZE,
                         "entries: %d\nbuckets used: %d/%d\n"
                         "longest chain: %d\n",
                         entries, slots_used, SIDTAB_HASH_BUCKETS,
                         max_chain_len);
}

/*
 * Return the smallest tree level whose capacity is at least @count entries.
 * Level 0 holds SIDTAB_LEAF_ENTRIES; each further level multiplies the
 * capacity by 2^SIDTAB_INNER_SHIFT.
 */
static u32 sidtab_level_from_count(u32 count)
{
        u32 level, capacity;

        for (level = 0, capacity = SIDTAB_LEAF_ENTRIES; count > capacity;
             level++)
                capacity <<= SIDTAB_INNER_SHIFT;

        return level;
}

/*
 * Make sure the root nodes for levels 0..@level exist, allocating any that
 * are missing with GFP_ATOMIC.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails.
 */
static int sidtab_alloc_roots(struct sidtab *s, u32 level)
{
        u32 depth;

        if (!s->roots[0].ptr_leaf) {
                s->roots[0].ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE,
                                               GFP_ATOMIC);
                if (!s->roots[0].ptr_leaf)
                        return -ENOMEM;
        }

        for (depth = 1; depth <= level; depth++) {
                union sidtab_entry_inner *root = &s->roots[depth];

                if (root->ptr_inner)
                        continue;

                root->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC);
                if (!root->ptr_inner)
                        return -ENOMEM;

                /* the root one level down becomes this root's first child */
                root->ptr_inner->entries[0] = s->roots[depth - 1];
        }

        return 0;
}

/*
 * Walk the tree to the slot for @index.
 *
 * When @alloc is non-zero, any missing root, inner node or leaf node on the
 * path is allocated with GFP_ATOMIC; when it is zero, a missing node makes
 * the lookup fail.
 *
 * Returns a pointer to the entry slot, or NULL if a node on the path is
 * missing and was not (or could not be) allocated.
 */
static struct sidtab_entry *sidtab_do_lookup(struct sidtab *s, u32 index,
                                             int alloc)
{
        union sidtab_entry_inner *entry;
        u32 level, capacity_shift, leaf_index = index / SIDTAB_LEAF_ENTRIES;

        /* find the level of the subtree we need */
        level = sidtab_level_from_count(index + 1);
        capacity_shift = level * SIDTAB_INNER_SHIFT;

        /* allocate roots if needed */
        if (alloc && sidtab_alloc_roots(s, level) != 0)
                return NULL;

        /* lookup inside the subtree */
        entry = &s->roots[level];
        while (level != 0) {
                /* descend one level: each step consumes SIDTAB_INNER_SHIFT bits */
                capacity_shift -= SIDTAB_INNER_SHIFT;
                --level;

                /* high bits select the child, low bits remain for lower levels */
                entry = &entry->ptr_inner->entries[leaf_index >> capacity_shift];
                leaf_index &= ((u32)1 << capacity_shift) - 1;

                if (!entry->ptr_inner) {
                        if (alloc)
                                entry->ptr_inner = kzalloc(
                                        SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC);
                        /* still NULL: either not allocating or allocation failed */
                        if (!entry->ptr_inner)
                                return NULL;
                }
        }
        if (!entry->ptr_leaf) {
                if (alloc)
                        entry->ptr_leaf =
                                kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_ATOMIC);
                /* still NULL: either not allocating or allocation failed */
                if (!entry->ptr_leaf)
                        return NULL;
        }
        /* position of the entry inside its leaf node */
        return &entry->ptr_leaf->entries[index % SIDTAB_LEAF_ENTRIES];
}

/*
 * Lockless lookup of the entry at @index.
 *
 * Returns NULL when @index is not below the current entry count, otherwise
 * the result of a non-allocating tree lookup.
 */
static struct sidtab_entry *sidtab_lookup(struct sidtab *s, u32 index)
{
        /* load count with acquire semantics before touching any entries */
        u32 limit = smp_load_acquire(&s->count);

        return index < limit ? sidtab_do_lookup(s, index, 0) : NULL;
}

/*
 * Return the entry for initial SID @sid, or NULL if that initial SID has not
 * been set.  @sid is used as a 1-based index into isids[] without any range
 * check.
 */
static struct sidtab_entry *sidtab_lookup_initial(struct sidtab *s, u32 sid)
{
        struct sidtab_isid_entry *isid = &s->isids[sid - 1];

        if (!isid->set)
                return NULL;

        return &isid->entry;
}

/*
 * Resolve @sid to its entry.
 *
 * SIDs above SECINITSID_NUM are looked up in the tree, the others (except 0)
 * in the initial SID array.  An entry whose context.len is non-zero is only
 * returned when @force is set.  In every other case the entry for
 * SECINITSID_UNLABELED is returned instead (which may itself be NULL if that
 * initial SID is unset).
 */
static struct sidtab_entry *sidtab_search_core(struct sidtab *s, u32 sid,
                                               int force)
{
        struct sidtab_entry *entry = NULL;

        if (sid > SECINITSID_NUM)
                entry = sidtab_lookup(s, sid_to_index(sid));
        else if (sid != 0)
                entry = sidtab_lookup_initial(s, sid);

        if (entry && (force || !entry->context.len))
                return entry;

        return sidtab_lookup_initial(s, SECINITSID_UNLABELED);
}

/*
 * Look up the entry for @sid without forcing: entries whose context.len is
 * non-zero are replaced by the SECINITSID_UNLABELED entry.
 */
struct sidtab_entry *sidtab_search_entry(struct sidtab *s, u32 sid)
{
        return sidtab_search_core(s, sid, 0);
}

/*
 * Look up the entry for @sid with force set: an existing entry is returned
 * even if its context.len is non-zero.
 */
struct sidtab_entry *sidtab_search_entry_force(struct sidtab *s, u32 sid)
{
        return sidtab_search_core(s, sid, 1);
}

/*
 * Map @context to a SID, adding a new entry if the context is not yet known.
 *
 * On success returns 0 with *@sid set.  Otherwise returns:
 *   -ESTALE     the sidtab is frozen,
 *   -EOVERFLOW  the table already holds SIDTAB_MAX entries,
 *   -ENOMEM     a tree node could not be allocated,
 * or the error returned by context_cpy() / services_convert_context().
 * On those error paths *@sid is left at 0 from the failed lookup.
 */
int sidtab_context_to_sid(struct sidtab *s, struct context *context, u32 *sid)
{
        unsigned long flags;
        u32 count, hash = context_compute_hash(context);
        struct sidtab_convert_params *convert;
        struct sidtab_entry *dst, *dst_convert;
        int rc;

        /* fast path: lock-free hashtable lookup */
        *sid = context_to_sid(s, context, hash);
        if (*sid)
                return 0;

        /* lock-free search failed: lock, re-search, and insert if not found */
        spin_lock_irqsave(&s->lock, flags);

        rc = 0;
        *sid = context_to_sid(s, context, hash);
        if (*sid)
                goto out_unlock;

        if (unlikely(s->frozen)) {
                /*
                 * This sidtab is now frozen - tell the caller to abort and
                 * get the new one.
                 */
                rc = -ESTALE;
                goto out_unlock;
        }

        count = s->count;

        /* bail out if we already reached max entries */
        rc = -EOVERFLOW;
        if (count >= SIDTAB_MAX)
                goto out_unlock;

        /* insert context into new entry */
        rc = -ENOMEM;
        dst = sidtab_do_lookup(s, count, 1);
        if (!dst)
                goto out_unlock;

        dst->sid = index_to_sid(count);
        dst->hash = hash;

        rc = context_cpy(&dst->context, context);
        if (rc)
                goto out_unlock;

        /*
         * if we are building a new sidtab, we need to convert the context
         * and insert it there as well
         */
        convert = s->convert;
        if (convert) {
                struct sidtab *target = convert->target;

                rc = -ENOMEM;
                dst_convert = sidtab_do_lookup(target, count, 1);
                if (!dst_convert) {
                        /* undo the copy made into this table's slot */
                        context_destroy(&dst->context);
                        goto out_unlock;
                }

                rc = services_convert_context(convert->args, context,
                                              &dst_convert->context,
                                              GFP_ATOMIC);
                if (rc) {
                        /* undo the copy made into this table's slot */
                        context_destroy(&dst->context);
                        goto out_unlock;
                }
                dst_convert->sid = index_to_sid(count);
                dst_convert->hash = context_compute_hash(&dst_convert->context);
                target->count = count + 1;

                hash_add_rcu(target->context_to_sid, &dst_convert->list,
                             dst_convert->hash);
        }

        /* a non-zero context->len marks a context kept only in string form */
        if (context->len)
                pr_info("SELinux:  Context %s is not valid (left unmapped).\n",
                        context->str);

        *sid = index_to_sid(count);

        /* write entries before updating count */
        smp_store_release(&s->count, count + 1);
        hash_add_rcu(s->context_to_sid, &dst->list, dst->hash);

        rc = 0;
out_unlock:
        spin_unlock_irqrestore(&s->lock, flags);
        return rc;
}

/*
 * Fill in the sid and hash of the first @count tree entries of @s and add
 * each of them to the context_to_sid hashtable.  The entries are expected to
 * exist already: the lookup result is dereferenced without a NULL check.
 */
static void sidtab_convert_hashtable(struct sidtab *s, u32 count)
{
        u32 idx = 0;

        while (idx < count) {
                struct sidtab_entry *entry = sidtab_do_lookup(s, idx, 0);

                entry->sid = index_to_sid(idx);
                entry->hash = context_compute_hash(&entry->context);
                hash_add_rcu(s->context_to_sid, &entry->list, entry->hash);

                idx++;
        }
}

/*
 * Recursively convert the contexts of the subtree @esrc into the subtree
 * @edst, allocating destination nodes with GFP_KERNEL as needed.
 *
 * @pos counts the entries converted so far and is advanced once per leaf
 * entry; conversion stops when *@pos reaches @count.  @level is the level of
 * both subtrees (0 means leaf).
 *
 * Returns 0 on success, -ENOMEM if a node allocation fails, or the error
 * returned by services_convert_context().
 */
static int sidtab_convert_tree(union sidtab_entry_inner *edst,
                               union sidtab_entry_inner *esrc, u32 *pos,
                               u32 count, u32 level,
                               struct sidtab_convert_params *convert)
{
        u32 slot;
        int err;

        if (level == 0) {
                /* leaf: convert the contexts slot by slot */
                if (!edst->ptr_leaf) {
                        edst->ptr_leaf = kzalloc(SIDTAB_NODE_ALLOC_SIZE,
                                                 GFP_KERNEL);
                        if (!edst->ptr_leaf)
                                return -ENOMEM;
                }

                for (slot = 0;
                     slot < SIDTAB_LEAF_ENTRIES && *pos < count;
                     slot++) {
                        err = services_convert_context(
                                convert->args,
                                &esrc->ptr_leaf->entries[slot].context,
                                &edst->ptr_leaf->entries[slot].context,
                                GFP_KERNEL);
                        if (err)
                                return err;
                        (*pos)++;
                }

                /* give the scheduler a chance after each completed leaf */
                cond_resched();
                return 0;
        }

        /* inner node: recurse into each child until @count is reached */
        if (!edst->ptr_inner) {
                edst->ptr_inner = kzalloc(SIDTAB_NODE_ALLOC_SIZE, GFP_KERNEL);
                if (!edst->ptr_inner)
                        return -ENOMEM;
        }

        for (slot = 0; slot < SIDTAB_INNER_ENTRIES && *pos < count; slot++) {
                err = sidtab_convert_tree(&edst->ptr_inner->entries[slot],
                                          &esrc->ptr_inner->entries[slot],
                                          pos, count, level - 1, convert);
                if (err)
                        return err;
        }

        return 0;
}

/*
 * Convert the entries of @s into params->target and turn on live conversion
 * of entries added to @s from now on.
 *
 * Returns 0 on success, -EBUSY if a conversion is already set on @s, -ENOMEM
 * if the last leaf of the target cannot be allocated, or the error returned
 * by sidtab_convert_tree().  If the tree conversion fails, live conversion is
 * switched off again before returning.
 */
int sidtab_convert(struct sidtab *s, struct sidtab_convert_params *params)
{
        unsigned long flags;
        u32 count, level, pos;
        int rc;

        spin_lock_irqsave(&s->lock, flags);

        /* concurrent policy loads are not allowed */
        if (s->convert) {
                spin_unlock_irqrestore(&s->lock, flags);
                return -EBUSY;
        }

        /* snapshot the entry count; entries added later are live-converted */
        count = s->count;
        level = sidtab_level_from_count(count);

        /* allocate last leaf in the new sidtab (to avoid race with
         * live convert)
         */
        rc = sidtab_do_lookup(params->target, count - 1, 1) ? 0 : -ENOMEM;
        if (rc) {
                spin_unlock_irqrestore(&s->lock, flags);
                return rc;
        }

        /* set count in case no new entries are added during conversion */
        params->target->count = count;

        /* enable live convert of new entries */
        s->convert = params;

        /* we can safely convert the tree outside the lock */
        spin_unlock_irqrestore(&s->lock, flags);

        pr_info("SELinux:  Converting %u SID table entries...\n", count);

        /* convert all entries not covered by live convert */
        pos = 0;
        rc = sidtab_convert_tree(&params->target->roots[level],
                                 &s->roots[level], &pos, count, level, params);
        if (rc) {
                /* we need to keep the old table - disable live convert */
                spin_lock_irqsave(&s->lock, flags);
                s->convert = NULL;
                spin_unlock_irqrestore(&s->lock, flags);
                return rc;
        }
        /*
         * The hashtable can also be modified in sidtab_context_to_sid()
         * so we must re-acquire the lock here.
         */
        spin_lock_irqsave(&s->lock, flags);
        sidtab_convert_hashtable(params->target, count);
        spin_unlock_irqrestore(&s->lock, flags);

        return 0;
}

/*
 * Stop live conversion on @s by clearing s->convert under s->lock, so that
 * sidtab_context_to_sid() no longer mirrors new entries into a target table.
 */
void sidtab_cancel_convert(struct sidtab *s)
{
        unsigned long flags;

        /* cancelling policy load - disable live convert of sidtab */
        spin_lock_irqsave(&s->lock, flags);
        s->convert = NULL;
        spin_unlock_irqrestore(&s->lock, flags);
}

/*
 * Freeze @s: take s->lock (saving the IRQ state in *@flags), mark the table
 * frozen and clear s->convert.  The function returns with s->lock still held;
 * sidtab_freeze_end() releases it.  Once frozen, sidtab_context_to_sid()
 * returns -ESTALE instead of adding entries.
 */
void sidtab_freeze_begin(struct sidtab *s, unsigned long *flags)
        __acquires(&s->lock)
{
        spin_lock_irqsave(&s->lock, *flags);
        s->frozen = true;
        s->convert = NULL;
}
/*
 * Release s->lock taken by sidtab_freeze_begin(), restoring the IRQ state
 * saved in *@flags.  The frozen flag is left set.
 */
void sidtab_freeze_end(struct sidtab *s, unsigned long *flags)
        __releases(&s->lock)
{
        spin_unlock_irqrestore(&s->lock, *flags);
}

/*
 * Release what @entry owns: its context and, when the string cache is
 * compiled in, its cached string.  The entry itself is not freed here.
 */
static void sidtab_destroy_entry(struct sidtab_entry *entry)
{
        context_destroy(&entry->context);
#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0
        /* NOTE(review): presumably no RCU readers remain at teardown - confirm */
        kfree(rcu_dereference_raw(entry->cache));
#endif
}

/*
 * Free the subtree referenced by @entry, which sits at @level (0 means
 * leaf).  Leaf entries are destroyed before their node is freed; inner nodes
 * are freed after all of their children.  A NULL node pointer is ignored.
 */
static void sidtab_destroy_tree(union sidtab_entry_inner entry, u32 level)
{
        u32 slot;

        if (level == 0) {
                struct sidtab_node_leaf *leaf = entry.ptr_leaf;

                if (leaf) {
                        for (slot = 0; slot < SIDTAB_LEAF_ENTRIES; slot++)
                                sidtab_destroy_entry(&leaf->entries[slot]);
                        kfree(leaf);
                }
                return;
        }

        {
                struct sidtab_node_inner *inner = entry.ptr_inner;

                if (inner) {
                        for (slot = 0; slot < SIDTAB_INNER_ENTRIES; slot++)
                                sidtab_destroy_tree(inner->entries[slot],
                                                    level - 1);
                        kfree(inner);
                }
        }
}

/*
 * Release everything owned by @s: the contents of every initial SID that is
 * set, and the whole entry tree.  @s itself is not freed.
 */
void sidtab_destroy(struct sidtab *s)
{
        u32 isid, top;

        for (isid = 0; isid < SECINITSID_NUM; isid++) {
                if (!s->isids[isid].set)
                        continue;
                sidtab_destroy_entry(&s->isids[isid].entry);
        }

        /* find the highest allocated root, falling back to level 0 */
        for (top = SIDTAB_MAX_LEVEL; top != 0; top--) {
                if (s->roots[top].ptr_inner)
                        break;
        }

        sidtab_destroy_tree(s->roots[top], top);
        /*
         * Nothing to do for the context_to_sid hashtable: all of its
         * objects live in the isids array or in the context tree, both of
         * which were cleaned up above.
         */
}

#if CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0

/*
 * Cache the string @str of length @str_len for @entry.
 *
 * Entries with a non-zero context.len are never cached.  If @entry already
 * has a cached string it is only moved to the head of the LRU list.
 * Otherwise a new cache object is allocated with GFP_ATOMIC; when no free
 * slots remain, the entry at the tail of the LRU list is evicted to make
 * room.  Allocation failure is silent: the string is simply not cached.
 */
void sidtab_sid2str_put(struct sidtab *s, struct sidtab_entry *entry,
                        const char *str, u32 str_len)
{
        struct sidtab_str_cache *cache, *victim = NULL;
        unsigned long flags;

        /* do not cache invalid contexts */
        if (entry->context.len)
                return;

        spin_lock_irqsave(&s->cache_lock, flags);

        cache = rcu_dereference_protected(entry->cache,
                                          lockdep_is_held(&s->cache_lock));
        if (cache) {
                /* entry in cache - just bump to the head of LRU list */
                list_move(&cache->lru_member, &s->cache_lru_list);
                goto out_unlock;
        }

        cache = kmalloc(struct_size(cache, str, str_len), GFP_ATOMIC);
        if (!cache)
                goto out_unlock;

        if (s->cache_free_slots == 0) {
                /* pop a cache entry from the tail and free it */
                victim = container_of(s->cache_lru_list.prev,
                                      struct sidtab_str_cache, lru_member);
                list_del(&victim->lru_member);
                /* detach the victim from its owner before it is freed */
                rcu_assign_pointer(victim->parent->cache, NULL);
        } else {
                s->cache_free_slots--;
        }
        cache->parent = entry;
        cache->len = str_len;
        memcpy(cache->str, str, str_len);
        list_add(&cache->lru_member, &s->cache_lru_list);

        /* publish only after the cache object is fully initialised */
        rcu_assign_pointer(entry->cache, cache);

out_unlock:
        spin_unlock_irqrestore(&s->cache_lock, flags);
        /* victim stays NULL unless an entry was evicted above */
        kfree_rcu(victim, rcu_member);
}

/*
 * Fetch the cached string for @entry.
 *
 * On a hit, *@out_len receives the cached length and, if @out is non-NULL,
 * *@out receives a GFP_ATOMIC copy of the string.  When a copy was handed
 * out, the string is passed back to sidtab_sid2str_put(), which moves the
 * existing cache entry to the head of the LRU list.
 *
 * Returns 0 on a hit, -ENOENT if the entry has a non-zero context.len or
 * nothing is cached for it, and -ENOMEM if the copy cannot be allocated.
 */
int sidtab_sid2str_get(struct sidtab *s, struct sidtab_entry *entry, char **out,
                       u32 *out_len)
{
        struct sidtab_str_cache *cache;
        int rc = 0;

        /* do not cache invalid contexts */
        if (entry->context.len)
                return -ENOENT;

        rcu_read_lock();

        cache = rcu_dereference(entry->cache);
        if (cache) {
                *out_len = cache->len;
                if (out) {
                        *out = kmemdup(cache->str, cache->len, GFP_ATOMIC);
                        if (!*out)
                                rc = -ENOMEM;
                }
        } else {
                rc = -ENOENT;
        }

        rcu_read_unlock();

        if (rc || !out)
                return rc;

        sidtab_sid2str_put(s, entry, *out, *out_len);
        return 0;
}

#endif /* CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE > 0 */
































































  284 









  270 

    5 



   24 











   14 
   13 





















   21 







   21 



    9 
   13 



   21 












  255 
   13 



















  231 
   19 
    5 

















  278 
   19 
   19 








   75 
  169 












   32 
   74 

























  177 








  177 








  258 
  254 







    1 






   13 




  100 

  172 
  172 



  254 


   13 
















  258 
  254 
   13 






  259 



   21 





  256 



  101 
  172 
  172 




  253 
   13 






  258 


   13 




    5 
    8 

   13 

   13 






   13 
















  256 








  301 
   43 
  278 











  301 
  100 
  231 












   14 








  318 
  303 
   95 
  318 










   63 




   63 



   63 
   63 












  257 




  258 
  257 

  255 

    5 








  188 

  242 
















  252 
   13 

  256 


  256 
  256 

  256 





  188 


  240 








  188 

  241 









































   11 


    2 


    2 










  335 






  334 












  334 












  334 

  335 















  334 















   21 


















   21 








   21 





















   21 

   21 










   13 

























































































































































































































































































  302 





  301 
  302 

  301 

















  288 
    5 








   13 





   17 







  285 
   21 
























   21 





   21 

   21 
    4 




   13 
   13 




   13 





    5 


   20 



    9 
   13 

   21 
























































  318 


  280 








  317 
















   43 




  280 
  280 













  328 






  299 
   86 

   40 




  318 



  318 
  318 
  317 






  317 
  318 



  279 
   86 











    3 



    3 
    2 
















  315 







  177 






  189 












  189 




  316 













  157 

  242 


  318 









  311 
   47 

  145 
  303 


    4 

   31 
  137 
  298 













  317 




  318 





  318 









  317 

  318 




  317 




  318 

































































   89 


   89 






    3 







   86 




























   91 



   91 










    1 



   89 






































































































































  315 




   76 





   76 
   76 
   76 








  314 
  305 


   76 

   20 






  315 
  288 
   76 







   81 


   62 



   24 









   54 


  215 





























































































































   26 
   70 
   46 
   29 





























   94 


   14 














   94 






   27 
   67 









   88 
































   98 






























  269 

  198 



   97 




















   98 






  246 














  246 







  246 









  246 

  246 










































  246 




  246 









  246 


  246 



  246 


  246 

    3 



  246 


  245 

  246 



  246 


  246 
  246 
  246 











  246 















  246 





  246 





  246 










  246 




  246 


  246 
  246 
  246 
  246 
  246 
  246 















  194 





















    3 










   60 




























  279 





    2 





  280 


   43 







  301 


   25 

   34 













  302 











  194 
   63 












  147 




  147 













































  194 



   54 

   89 








  253 
  253 

  253 





   54 
  215 





























  179 
  117 


   72 

   50 












  270 














  302 




  302 






  302 
  285 





  270 









  270 





   62 
    1 

   63 





























  300 



  301 






  302 



  210 
  131 






  302 


  211 
  131 
















































































































































































































    1 



    2 


















































































































   13 








   13 









   10 
    3 












   12 





   12 












    3 

    1 
    8 

    9 




    9 












   91 





   91 



   87 
    4 




1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013
3014
3015
3016
3017
3018
3019
// SPDX-License-Identifier: GPL-2.0-or-later

/*
 * VMA-specific functions.
 */

#include "vma_internal.h"
#include "vma.h"

/*
 * Working state for a single mapping request, built by MMAP_STATE() and read
 * by VMG_MMAP_STATE() when constructing a struct vma_merge_struct.
 *
 * MMAP_STATE() fills in mm, vmi, addr, end, pgoff, pglen, flags and file; as
 * it uses a designated initializer, every other member starts out zeroed.
 */
struct mmap_state {
        /* The mm being operated on and the iterator used to walk its VMAs. */
        struct mm_struct *mm;
        struct vma_iterator *vmi;

        /* Range [addr, end): MMAP_STATE() sets end = addr + len. */
        unsigned long addr;
        unsigned long end;
        /* Page offset for the range, and its length in pages (PHYS_PFN(len)). */
        pgoff_t pgoff;
        unsigned long pglen;
        /* Flags for the mapping; passed on as vma_merge_struct->flags. */
        unsigned long flags;
        /* File for the mapping; passed on as vma_merge_struct->file. */
        struct file *file;

        /* NOTE(review): presumably the amount charged for accounting - confirm. */
        unsigned long charged;
        /* NOTE(review): presumably requests a further merge attempt - confirm. */
        bool retry_merge;

        /* Neighbouring VMAs, passed on as vma_merge_struct->prev / ->next. */
        struct vm_area_struct *prev;
        struct vm_area_struct *next;

        /* Unmapping state. */
        struct vma_munmap_struct vms;
        struct ma_state mas_detach;
        struct maple_tree mt_detach;
};

/*
 * Declare and initialise a struct mmap_state called @name covering the range
 * [addr_, addr_ + len_).  pglen is derived from len_ via PHYS_PFN(); members
 * not listed in the initializer are zero-initialised.
 *
 * addr_ and len_ are each expanded twice, so arguments with side effects
 * should be avoided.
 */
#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \
        struct mmap_state name = {                                        \
                .mm = mm_,                                                \
                .vmi = vmi_,                                                \
                .addr = addr_,                                                \
                .end = (addr_) + (len_),                                \
                .pgoff = pgoff_,                                        \
                .pglen = PHYS_PFN(len_),                                \
                .flags = flags_,                                        \
                .file = file_,                                                \
        }

/*
 * Declare and initialise a struct vma_merge_struct called @name from the
 * struct mmap_state pointed to by map_.
 *
 * vma_ becomes the ->middle VMA.  When vma_ is non-NULL, ->next is left NULL
 * rather than being taken from map_->next.  The merge state starts out as
 * VMA_MERGE_START; members not listed in the initializer are zero-initialised.
 */
#define VMG_MMAP_STATE(name, map_, vma_)                                \
        struct vma_merge_struct name = {                                \
                .mm = (map_)->mm,                                        \
                .vmi = (map_)->vmi,                                        \
                .start = (map_)->addr,                                        \
                .end = (map_)->end,                                        \
                .flags = (map_)->flags,                                        \
                .pgoff = (map_)->pgoff,                                        \
                .file = (map_)->file,                                        \
                .prev = (map_)->prev,                                        \
                .middle = vma_,                                                \
                .next = (vma_) ? NULL : (map_)->next,                        \
                .state = VMA_MERGE_START,                                \
        }

/*
 * Do the attributes described by @vmg match those of the neighbouring VMA,
 * i.e. vmg->next when @merge_next is true and vmg->prev otherwise?
 *
 * The memory policy, flags, file, userfaultfd context and anon VMA name must
 * all agree for the answer to be yes.
 */
static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
{
        struct vm_area_struct *neighbour;

        if (merge_next)
                neighbour = vmg->next;
        else
                neighbour = vmg->prev;

        if (!mpol_equal(vmg->policy, vma_policy(neighbour)))
                return false;

        /*
         * VM_SOFTDIRTY is masked out of the flag comparison: a mismatch in
         * that bit alone must not block a merge, and the caller is expected
         * to mark the merged VMA dirty.  Were the bit included, VMAs that
         * could simply have been extended would instead be created afresh,
         * putting needless pressure on the memory system.
         */
        return !((neighbour->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) &&
               neighbour->vm_file == vmg->file &&
               is_mergeable_vm_userfaultfd_ctx(neighbour, vmg->uffd_ctx) &&
               anon_vma_name_eq(anon_vma_name(neighbour), vmg->anon_name);
}

/*
 * Decide whether two anon_vma pointers permit a merge.
 *
 * Two assigned (non-NULL) anon_vmas are mergeable only if they are the same
 * object.  If at least one is unassigned, the merge is permitted provided
 * either no @vma was supplied or @vma's anon_vma_chain has a single entry.
 */
static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
                 struct anon_vma *anon_vma2, struct vm_area_struct *vma)
{
        /* Both assigned: they must be identical. */
        if (anon_vma1 && anon_vma2)
                return anon_vma1 == anon_vma2;

        /* At least one side is unassigned and there is no VMA to examine. */
        if (!vma)
                return true;

        /*
         * The list_is_singular() test avoids merging a VMA cloned from a
         * parent, which can improve scalability with respect to the anon_vma
         * lock.
         */
        if (list_is_singular(&vma->anon_vma_chain))
                return true;

        return anon_vma1 == anon_vma2;
}

/*
 * Report whether the anon_vma of @vma1 and the anon_vma of @vma2 are
 * compatible with one another for merging purposes.
 */
static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
                                            struct vm_area_struct *vma2)
{
        struct anon_vma *first = vma1->anon_vma;
        struct anon_vma *second = vma2->anon_vma;

        /* No VMA is passed, so the anon_vma_chain test is not applied. */
        return is_mergeable_anon_vma(first, second, NULL);
}

/*
 * init_multi_vma_prep() - Initializer for struct vma_prepare
 * @vp: The vma_prepare struct, fully overwritten by this call
 * @vma: The vma that will be altered once locked
 * @vmg: The merge state used to determine which VMAs are to be removed and
 *       which one is to be adjusted.  May be NULL, in which case nothing is
 *       queued for removal or adjustment.
 */
static void init_multi_vma_prep(struct vma_prepare *vp,
                                struct vm_area_struct *vma,
                                struct vma_merge_struct *vmg)
{
        struct vm_area_struct *adjust = NULL;

        memset(vp, 0, sizeof(*vp));
        vp->vma = vma;
        vp->anon_vma = vma->anon_vma;

        vp->file = vma->vm_file;
        if (vp->file)
                vp->mapping = vp->file->f_mapping;

        if (vmg) {
                /*
                 * Fill the removal slots in order: middle (if requested) goes
                 * first, and next takes whichever slot is then free.
                 */
                if (vmg->__remove_middle) {
                        vp->remove = vmg->middle;
                        if (vmg->__remove_next)
                                vp->remove2 = vmg->next;
                } else if (vmg->__remove_next) {
                        vp->remove = vmg->next;
                }

                /* At most one VMA is adjusted; middle takes precedence. */
                if (vmg->__adjust_middle_start)
                        adjust = vmg->middle;
                else if (vmg->__adjust_next_start)
                        adjust = vmg->next;
        }

        vp->adj_next = adjust;

        /* Fall back to the adjusted VMA's anon_vma if @vma has none. */
        if (adjust && !vp->anon_vma)
                vp->anon_vma = adjust->anon_vma;

        VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma &&
                   vp->anon_vma != adjust->anon_vma);
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * in front of (at a lower virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We don't check here for the merged mmap wrapping around the end of pagecache
 * indices (16TB on ia32) because do_mmap() does not permit mmap's which
 * wrap, nor mmaps which cover the final page at index -1UL.
 *
 * We assume the vma may be removed as part of the merge.
 */
static bool can_vma_merge_before(struct vma_merge_struct *vmg)
{
        pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);

        if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
            is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
                if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
                        return true;
        }

        return false;
}

/*
 * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
 * beyond (at a higher virtual address and file offset than) the vma.
 *
 * We cannot merge two vmas if they have differently assigned (non-NULL)
 * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
 *
 * We assume that vma is not removed as part of the merge.
 */
static bool can_vma_merge_after(struct vma_merge_struct *vmg)
{
        if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
            is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
                if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
                        return true;
        }
        return false;
}

/*
 * Insert @vma into the i_mmap interval tree of @mapping.
 *
 * If vma_is_shared_maywrite() holds for @vma, mapping_allow_writable() is
 * called on @mapping first.  The tree insertion itself is bracketed by
 * flush_dcache_mmap_lock() / flush_dcache_mmap_unlock().
 *
 * NOTE(review): vma_prepare() calls this after taking an i_mmap write lock;
 * presumably that lock is a requirement here - confirm.
 */
static void __vma_link_file(struct vm_area_struct *vma,
                            struct address_space *mapping)
{
        if (vma_is_shared_maywrite(vma))
                mapping_allow_writable(mapping);

        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_insert(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * Remove @vma from the i_mmap interval tree of @mapping.
 *
 * If vma_is_shared_maywrite() holds for @vma, mapping_unmap_writable() is
 * called on @mapping first.  The tree removal itself is bracketed by
 * flush_dcache_mmap_lock() / flush_dcache_mmap_unlock().
 *
 * Requires inode->i_mapping->i_mmap_rwsem
 */
static void __remove_shared_vm_struct(struct vm_area_struct *vma,
                                      struct address_space *mapping)
{
        if (vma_is_shared_maywrite(vma))
                mapping_unmap_writable(mapping);

        flush_dcache_mmap_lock(mapping);
        vma_interval_tree_remove(vma, &mapping->i_mmap);
        flush_dcache_mmap_unlock(mapping);
}

/*
 * vma has some anon_vma assigned, and is already inserted on that
 * anon_vma's interval trees.
 *
 * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
 * vma must be removed from the anon_vma's interval trees using
 * anon_vma_interval_tree_pre_update_vma().
 *
 * After the update, the vma will be reinserted using
 * anon_vma_interval_tree_post_update_vma().
 *
 * The entire update must be protected by exclusive mmap_lock and by
 * the root anon_vma's mutex.
 */
static void
anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        /*
         * Walk every chain entry linked to this vma and remove it from the
         * interval tree of the anon_vma that entry refers to.
         */
        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
}

/*
 * Reinsert @vma into the anon_vma interval trees once its
 * vm_start / vm_end / vm_pgoff fields have been updated; the counterpart of
 * anon_vma_interval_tree_pre_update_vma().
 */
static void
anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
{
        struct anon_vma_chain *avc;

        /*
         * Walk every chain entry linked to this vma and insert it into the
         * interval tree of the anon_vma that entry refers to.
         */
        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}

/*
 * vma_prepare() - Helper function for handling locking VMAs prior to altering
 * @vp: The initialized vma_prepare struct
 *
 * For a file-backed @vp->vma this takes the i_mmap write lock and the
 * flush_dcache_mmap lock on @vp->mapping; if @vp->anon_vma is set it takes the
 * anon_vma write lock.  @vp->vma and @vp->adj_next (if any) are removed from
 * the file and anon_vma interval trees.
 *
 * None of these locks is released here; vma_complete() reinserts the VMAs and
 * drops the locks.
 */
static void vma_prepare(struct vma_prepare *vp)
{
        if (vp->file) {
                uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);

                if (vp->adj_next)
                        uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
                                      vp->adj_next->vm_end);

                i_mmap_lock_write(vp->mapping);
                if (vp->insert && vp->insert->vm_file) {
                        /*
                         * Put into interval tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
                         * throughout; but we cannot insert into address
                         * space until vma start or end is updated.
                         */
                        __vma_link_file(vp->insert,
                                        vp->insert->vm_file->f_mapping);
                }
        }

        if (vp->anon_vma) {
                anon_vma_lock_write(vp->anon_vma);
                /* Take the VMAs out of the anon_vma trees while they change. */
                anon_vma_interval_tree_pre_update_vma(vp->vma);
                if (vp->adj_next)
                        anon_vma_interval_tree_pre_update_vma(vp->adj_next);
        }

        if (vp->file) {
                /* Likewise take the VMAs out of the file's i_mmap tree. */
                flush_dcache_mmap_lock(vp->mapping);
                vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
                if (vp->adj_next)
                        vma_interval_tree_remove(vp->adj_next,
                                                 &vp->mapping->i_mmap);
        }

}

/*
 * vma_complete() - Helper function for handling the unlocking after altering
 * VMAs, or for inserting a VMA.
 *
 * @vp: The vma_prepare struct
 * @vmi: The vma iterator
 * @mm: The mm_struct
 *
 * Counterpart of vma_prepare(): reinserts @vp->vma and @vp->adj_next into the
 * file and anon_vma interval trees, drops the locks taken by vma_prepare(),
 * and then detaches and frees any VMAs queued in @vp->remove / @vp->remove2,
 * decrementing @mm->map_count for each.
 */
static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
                         struct mm_struct *mm)
{
        if (vp->file) {
                /* Put the (now updated) VMAs back into the i_mmap tree. */
                if (vp->adj_next)
                        vma_interval_tree_insert(vp->adj_next,
                                                 &vp->mapping->i_mmap);
                vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
                flush_dcache_mmap_unlock(vp->mapping);
        }

        if (vp->remove && vp->file) {
                /* Unlink the VMAs being removed from the file's mapping. */
                __remove_shared_vm_struct(vp->remove, vp->mapping);
                if (vp->remove2)
                        __remove_shared_vm_struct(vp->remove2, vp->mapping);
        } else if (vp->insert) {
                /*
                 * split_vma has split insert from vma, and needs
                 * us to insert it before dropping the locks
                 * (it may either follow vma or precede it).
                 */
                vma_iter_store_new(vmi, vp->insert);
                mm->map_count++;
        }

        if (vp->anon_vma) {
                /* Put the VMAs back into the anon_vma trees, then unlock. */
                anon_vma_interval_tree_post_update_vma(vp->vma);
                if (vp->adj_next)
                        anon_vma_interval_tree_post_update_vma(vp->adj_next);
                anon_vma_unlock_write(vp->anon_vma);
        }

        if (vp->file) {
                i_mmap_unlock_write(vp->mapping);
                uprobe_mmap(vp->vma);

                if (vp->adj_next)
                        uprobe_mmap(vp->adj_next);
        }

        if (vp->remove) {
                /* Runs once for vp->remove and, if set, again for vp->remove2. */
again:
                vma_mark_detached(vp->remove);
                if (vp->file) {
                        uprobe_munmap(vp->remove, vp->remove->vm_start,
                                      vp->remove->vm_end);
                        /* One reference on vp->file is dropped per removed VMA. */
                        fput(vp->file);
                }
                if (vp->remove->anon_vma)
                        anon_vma_merge(vp->vma, vp->remove);
                mm->map_count--;
                mpol_put(vma_policy(vp->remove));
                /*
                 * When this is the last VMA to remove, vp->vma is expected
                 * to extend at least to its end.
                 */
                if (!vp->remove2)
                        WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
                vm_area_free(vp->remove);

                /*
                 * In mprotect's case 6 (see comments on vma_merge),
                 * we are removing both mid and next vmas
                 */
                if (vp->remove2) {
                        vp->remove = vp->remove2;
                        vp->remove2 = NULL;
                        goto again;
                }
        }
        if (vp->insert && vp->file)
                uprobe_mmap(vp->insert);
}

/*
 * init_vma_prep() - Initializer wrapper for vma_prepare struct
 * @vp: The vma_prepare struct
 * @vma: The vma that will be altered once locked
 *
 * Calls init_multi_vma_prep() with no merge state, so no VMA is queued for
 * removal and no adjacent VMA is marked for adjustment.
 */
static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
{
        init_multi_vma_prep(vp, vma, NULL);
}

/*
 * Can the proposed VMA be merged with the left (previous) VMA taking into
 * account the start position of the proposed range.
 */
static bool can_vma_merge_left(struct vma_merge_struct *vmg)

{
        return vmg->prev && vmg->prev->vm_end == vmg->start &&
                can_vma_merge_after(vmg);
}

/*
 * Is the proposed range mergeable with the next (right) VMA?  That VMA must
 * exist, must start exactly where the proposed range ends, and must pass
 * can_vma_merge_before().
 *
 * When @can_merge_left is set, the anon_vma's of the left and right VMAs must
 * additionally be compatible with each other.
 */
static bool can_vma_merge_right(struct vma_merge_struct *vmg,
                                bool can_merge_left)
{
        struct vm_area_struct *next = vmg->next;

        if (!next)
                return false;

        /* The two ranges must be immediately adjacent. */
        if (next->vm_start != vmg->end)
                return false;

        if (!can_vma_merge_before(vmg))
                return false;

        /*
         * Being able to merge with both prev (left) and next (right) shows
         * that each of them has an anon_vma compatible with the proposed
         * anon_vma; it does NOT show that prev and next are compatible with
         * EACH OTHER.  That is therefore checked separately whenever a merge
         * on the left is also possible.
         */
        return !can_merge_left || are_anon_vmas_compatible(vmg->prev, next);
}

/*
 * Close a vm structure and free it.
 *
 * Calls vma_close() on @vma, drops the reference on its file (if it has one)
 * and on its memory policy, then frees the structure itself.  @vma must not
 * be used after this returns.  Annotated with might_sleep(), so it must be
 * called from a context that is allowed to sleep.
 */
void remove_vma(struct vm_area_struct *vma)
{
        might_sleep();
        vma_close(vma);
        if (vma->vm_file)
                fput(vma->vm_file);
        mpol_put(vma_policy(vma));
        vm_area_free(vma);
}

/*
 * Get rid of page table information in the indicated region.
 *
 * @mas: Maple state passed to unmap_vmas() and free_pgtables(); it is reset to
 *       @vma->vm_end between the two calls.
 * @vma: The VMA whose range [vm_start, vm_end) is unmapped.
 * @prev: The VMA before @vma, or NULL.
 * @next: The VMA after @vma, or NULL.
 *
 * Called with the mm semaphore held.
 */
void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
                struct vm_area_struct *prev, struct vm_area_struct *next)
{
        struct mm_struct *mm = vma->vm_mm;
        struct mmu_gather tlb;

        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
                   /* mm_wr_locked = */ true);
        mas_set(mas, vma->vm_end);
        /*
         * NOTE(review): the two address arguments are presumably the lower
         * and upper bounds for page-table freeing - the end of @prev (or
         * FIRST_USER_ADDRESS) and the start of @next (or
         * USER_PGTABLES_CEILING); confirm against free_pgtables().
         */
        free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
                      next ? next->vm_start : USER_PGTABLES_CEILING,
                      /* mm_wr_locked = */ true);
        tlb_finish_mmu(&tlb);
}

/*
 * __split_vma() - Split @vma in two at @addr without checking the map count
 * @vmi: The vma iterator
 * @vma: The VMA to split; @addr is expected to lie strictly inside it
 * @addr: The address at which to split
 * @new_below: If non-zero, the newly allocated VMA covers the part below
 *             @addr; otherwise it covers the part from @addr upwards
 *
 * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
 * has already been checked or doesn't make sense to fail.
 * VMA Iterator will point to the original VMA.
 *
 * Returns: 0 on success. Otherwise the error returned by the VMA's may_split()
 * hook, by vma_dup_policy() or by anon_vma_clone(), or -ENOMEM if allocating
 * the new VMA or preallocating for the iterator fails.
 */
static __must_check int
__split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
            unsigned long addr, int new_below)
{
        struct vma_prepare vp;
        struct vm_area_struct *new;
        int err;

        /* The split point must fall strictly inside the VMA. */
        WARN_ON(vma->vm_start >= addr);
        WARN_ON(vma->vm_end <= addr);

        /* Let the VMA's owner veto the split before anything is allocated. */
        if (vma->vm_ops && vma->vm_ops->may_split) {
                err = vma->vm_ops->may_split(vma, addr);
                if (err)
                        return err;
        }

        new = vm_area_dup(vma);
        if (!new)
                return -ENOMEM;

        if (new_below) {
                /* new becomes the lower piece, ending at addr. */
                new->vm_end = addr;
        } else {
                /*
                 * new becomes the upper piece, starting at addr; advance its
                 * page offset by the number of pages it now skips.
                 */
                new->vm_start = addr;
                new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
        }

        /* Preallocate for storing new before any state is changed. */
        err = -ENOMEM;
        vma_iter_config(vmi, new->vm_start, new->vm_end);
        if (vma_iter_prealloc(vmi, new))
                goto out_free_vma;

        err = vma_dup_policy(vma, new);
        if (err)
                goto out_free_vmi;

        err = anon_vma_clone(new, vma);
        if (err)
                goto out_free_mpol;

        /* No failure is possible past this point. */
        if (new->vm_file)
                get_file(new->vm_file);

        if (new->vm_ops && new->vm_ops->open)
                new->vm_ops->open(new);

        vma_start_write(vma);
        vma_start_write(new);

        init_vma_prep(&vp, vma);
        vp.insert = new;
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL);

        /* Shrink the original VMA to the piece that new does not cover. */
        if (new_below) {
                vma->vm_start = addr;
                vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
        } else {
                vma->vm_end = addr;
        }

        /* vma_complete stores the new vma */
        vma_complete(&vp, vmi, vma->vm_mm);
        validate_mm(vma->vm_mm);

        /*
         * Success. Step the iterator from the new VMA towards the original
         * one: forwards if new is the lower piece, backwards otherwise.
         */
        if (new_below)
                vma_next(vmi);
        else
                vma_prev(vmi);

        return 0;

        /* Error unwinding, in reverse order of acquisition. */
out_free_mpol:
        mpol_put(vma_policy(new));
out_free_vmi:
        vma_iter_free(vmi);
out_free_vma:
        vm_area_free(new);
        return err;
}

/*
 * Split a vma into two pieces at address 'addr', a new vma is allocated
 * either for the first part or the tail.
 */
static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
                     unsigned long addr, int new_below)
{
        if (vma->vm_mm->map_count >= sysctl_max_map_count)
                return -ENOMEM;

        return __split_vma(vmi, vma, addr, new_below);
}

/*
 * dup_anon_vma() - Give @dst the anon_vma of @src if it has none of its own
 * @dst: The destination VMA
 * @src: The source VMA
 * @dup: Set to @dst when an anon_vma was actually duplicated into it
 *
 * Nothing is done unless @src has an anon_vma and @dst does not.
 *
 * Returns: 0 on success, otherwise the error from anon_vma_clone().
 */
static int dup_anon_vma(struct vm_area_struct *dst,
                        struct vm_area_struct *src, struct vm_area_struct **dup)
{
        int ret;

        /* Nothing to copy, or dst is already set up. */
        if (!src->anon_vma || dst->anon_vma)
                return 0;

        /*
         * Easily overlooked: when mprotect shifts the boundary, the expanding
         * vma must have anon_vma set if the shrinking vma had one, so that any
         * anon pages imported are covered.
         */
        vma_assert_write_locked(dst);
        dst->anon_vma = src->anon_vma;
        ret = anon_vma_clone(dst, src);
        if (ret)
                return ret;

        /* Tell the caller which VMA received the duplicate. */
        *dup = dst;
        return 0;
}

#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
/*
 * validate_mm() - Debug consistency check of an mm's VMA tree
 * @mm: The mm_struct to check
 *
 * Validates the maple tree, then walks every VMA checking that its vm_start
 * and vm_end match the range the iterator reports for it, and finally that the
 * number of VMAs walked equals mm->map_count. With CONFIG_DEBUG_VM_RB each
 * VMA's anon_vma chain entries are verified as well.
 */
void validate_mm(struct mm_struct *mm)
{
        int bug = 0;
        int i = 0;
        struct vm_area_struct *vma;
        VMA_ITERATOR(vmi, mm, 0);

        mt_validate(&mm->mm_mt);
        for_each_vma(vmi, vma) {
#ifdef CONFIG_DEBUG_VM_RB
                struct anon_vma *anon_vma = vma->anon_vma;
                struct anon_vma_chain *avc;
#endif
                unsigned long vmi_start, vmi_end;
                bool warn = 0;

                /* The range held in the tree must agree with the VMA itself. */
                vmi_start = vma_iter_addr(&vmi);
                vmi_end = vma_iter_end(&vmi);
                if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
                        warn = 1;

                if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
                        warn = 1;

                /* On a mismatch, dump everything that might help debugging. */
                if (warn) {
                        pr_emerg("issue in %s\n", current->comm);
                        dump_stack();
                        dump_vma(vma);
                        pr_emerg("tree range: %px start %lx end %lx\n", vma,
                                 vmi_start, vmi_end - 1);
                        vma_iter_dump_tree(&vmi);
                }

#ifdef CONFIG_DEBUG_VM_RB
                if (anon_vma) {
                        anon_vma_lock_read(anon_vma);
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                anon_vma_interval_tree_verify(avc);
                        anon_vma_unlock_read(anon_vma);
                }
#endif
                /*
                 * Check for an infinite loop: bail out once clearly more VMAs
                 * than map_count have been seen. Setting i to -1 makes the
                 * count comparison below fail.
                 */
                if (++i > mm->map_count + 10) {
                        i = -1;
                        break;
                }
        }
        /* The number of VMAs walked must equal the recorded map_count. */
        if (i != mm->map_count) {
                pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
                bug = 1;
        }
        VM_BUG_ON_MM(bug, mm);
}
#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */

/*
 * When a merge moves the start of either the middle or the next VMA (as
 * flagged in @vmg), that VMA must now begin where the merged range ends.
 * Work out the page offset matching its new start and apply the new range to
 * it. Does nothing if neither flag is set.
 */
static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
{
        struct vm_area_struct *vma;
        unsigned long new_start = vmg->end;
        pgoff_t new_pgoff;

        if (!vmg->__adjust_middle_start && !vmg->__adjust_next_start)
                return;

        if (vmg->__adjust_middle_start) {
                /* middle's start moves up, so its page offset grows. */
                vma = vmg->middle;
                new_pgoff = vma->vm_pgoff + PHYS_PFN(new_start - vma->vm_start);
        } else {
                /* next's start moves down, so its page offset shrinks. */
                vma = vmg->next;
                new_pgoff = vma->vm_pgoff - PHYS_PFN(vma->vm_start - new_start);
        }

        vma_set_range(vma, new_start, vma->vm_end, new_pgoff);
}

/*
 * Actually perform the VMA merge operation.
 *
 * @vmg: The merge descriptor; its start, end, pgoff, target, middle and next
 *       fields and its __adjust_* flags are read here.
 *
 * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will not
 * modify any VMAs or cause inconsistent state should an OOM condition arise.
 *
 * Returns 0 on success, or an error value on failure.
 */
static int commit_merge(struct vma_merge_struct *vmg)
{
        struct vm_area_struct *vma;
        struct vma_prepare vp;

        if (vmg->__adjust_next_start) {
                /* We manipulate middle and adjust next, which is the target. */
                vma = vmg->middle;
                vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end);
        } else {
                vma = vmg->target;
                /* Note: vma iterator must be pointing to 'start'. */
                vma_iter_config(vmg->vmi, vmg->start, vmg->end);
        }

        init_multi_vma_prep(&vp, vma, vmg);

        /*
         * If vmg->give_up_on_oom is set, we're safe, because we don't actually
         * manipulate any VMAs until we succeed at preallocation.
         *
         * Past this point, we will not return an error.
         */
        if (vma_iter_prealloc(vmg->vmi, vma))
                return -ENOMEM;

        vma_prepare(&vp);
        /*
         * THP pages may need to do additional splits if we increase
         * middle->vm_start.
         */
        vma_adjust_trans_huge(vma, vmg->start, vmg->end,
                              vmg->__adjust_middle_start ? vmg->middle : NULL);
        /* Resize the manipulated VMA, then any neighbour whose start moves. */
        vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff);
        vmg_adjust_set_range(vmg);
        /* Store the target over the range configured on the iterator above. */
        vma_iter_store_overwrite(vmg->vmi, vmg->target);

        vma_complete(&vp, vmg->vmi, vma->vm_mm);

        return 0;
}

/* We can only remove VMAs when merging if they do not have a close hook. */
static bool can_merge_remove_vma(struct vm_area_struct *vma)
{
        return !vma->vm_ops || !vma->vm_ops->close;
}

/*
 * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
 * attributes modified.
 *
 * @vmg: Describes the modifications being made to a VMA and associated
 *       metadata.
 *
 * When the attributes of a range within a VMA change, then it might be possible
 * for immediately adjacent VMAs to be merged into that VMA due to having
 * identical properties.
 *
 * This function checks for the existence of any such mergeable VMAs and updates
 * the maple tree describing the @vmg->middle->vm_mm address space to account
 * for this, as well as any VMAs shrunk/expanded/deleted as a result of this
 * merge.
 *
 * As part of this operation, if a merge occurs, the @vmg object will have its
 * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
 * calls to this function should reset these fields.
 *
 * Returns: The merged VMA if merge succeeds, or NULL otherwise.
 *
 * ASSUMPTIONS:
 * - The caller must assign the VMA to be modified to @vmg->middle.
 * - The caller must have set @vmg->prev to the previous VMA, if there is one.
 * - The caller must not set @vmg->next, as we determine this.
 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
 * - vmi must be positioned within [@vmg->middle->vm_start, @vmg->middle->vm_end).
 */
static __must_check struct vm_area_struct *vma_merge_existing_range(
                struct vma_merge_struct *vmg)
{
        struct vm_area_struct *middle = vmg->middle;
        struct vm_area_struct *prev = vmg->prev;
        struct vm_area_struct *next;
        struct vm_area_struct *anon_dup = NULL;
        /* Snapshot the requested range; vmg->start/end are rewritten below. */
        unsigned long start = vmg->start;
        unsigned long end = vmg->end;
        /* Does the range touch the left and/or right edge of middle? */
        bool left_side = middle && start == middle->vm_start;
        bool right_side = middle && end == middle->vm_end;
        int err = 0;
        bool merge_left, merge_right, merge_both;

        mmap_assert_write_locked(vmg->mm);
        VM_WARN_ON_VMG(!middle, vmg); /* We are modifying a VMA, so caller must specify. */
        VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */
        VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg);
        VM_WARN_ON_VMG(start >= end, vmg);

        /*
         * If middle == prev, then we are offset into a VMA. Otherwise, if we are
         * not, we must span a portion of the VMA.
         */
        VM_WARN_ON_VMG(middle &&
                       ((middle != prev && vmg->start != middle->vm_start) ||
                        vmg->end > middle->vm_end), vmg);
        /* The vmi must be positioned within vmg->middle. */
        VM_WARN_ON_VMG(middle &&
                       !(vma_iter_addr(vmg->vmi) >= middle->vm_start &&
                         vma_iter_addr(vmg->vmi) < middle->vm_end), vmg);

        /* Default outcome unless a merge succeeds or an OOM is reported. */
        vmg->state = VMA_MERGE_NOMERGE;

        /*
         * If a special mapping or if the range being modified is neither at the
         * furthermost left or right side of the VMA, then we have no chance of
         * merging and should abort.
         */
        if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
                return NULL;

        if (left_side)
                merge_left = can_vma_merge_left(vmg);
        else
                merge_left = false;

        if (right_side) {
                /*
                 * Look up the range following middle and record it as next,
                 * then step the iterator back again.
                 */
                next = vmg->next = vma_iter_next_range(vmg->vmi);
                vma_iter_prev_range(vmg->vmi);

                merge_right = can_vma_merge_right(vmg, merge_left);
        } else {
                merge_right = false;
                next = NULL;
        }

        if (merge_left)                /* If merging prev, position iterator there. */
                vma_prev(vmg->vmi);
        else if (!merge_right)        /* If we have nothing to merge, abort. */
                return NULL;

        merge_both = merge_left && merge_right;
        /* If we span the entire VMA, a merge implies it will be deleted. */
        vmg->__remove_middle = left_side && right_side;

        /*
         * If we need to remove middle in its entirety but are unable to do so,
         * we have no sensible recourse but to abort the merge.
         */
        if (vmg->__remove_middle && !can_merge_remove_vma(middle))
                return NULL;

        /*
         * If we merge both VMAs, then next is also deleted. This implies
         * merge_will_delete_vma also.
         */
        vmg->__remove_next = merge_both;

        /*
         * If we cannot delete next, then we can reduce the operation to merging
         * prev and middle (thereby deleting middle).
         */
        if (vmg->__remove_next && !can_merge_remove_vma(next)) {
                vmg->__remove_next = false;
                merge_right = false;
                merge_both = false;
        }

        /* No matter what happens, we will be adjusting middle. */
        vma_start_write(middle);

        /*
         * Pick the VMA that survives and grows. prev is assigned last, so it
         * takes precedence over next when both sides merge.
         */
        if (merge_right) {
                vma_start_write(next);
                vmg->target = next;
        }

        if (merge_left) {
                vma_start_write(prev);
                vmg->target = prev;
        }

        if (merge_both) {
                /*
                 * |<-------------------->|
                 * |-------********-------|
                 *   prev   middle   next
                 *  extend  delete  delete
                 */

                vmg->start = prev->vm_start;
                vmg->end = next->vm_end;
                vmg->pgoff = prev->vm_pgoff;

                /*
                 * We already ensured anon_vma compatibility above, so now it's
                 * simply a case of, if prev has no anon_vma object, which of
                 * next or middle contains the anon_vma we must duplicate.
                 */
                err = dup_anon_vma(prev, next->anon_vma ? next : middle,
                                   &anon_dup);
        } else if (merge_left) {
                /*
                 * |<------------>|      OR
                 * |<----------------->|
                 * |-------*************
                 *   prev     middle
                 *  extend shrink/delete
                 */

                vmg->start = prev->vm_start;
                vmg->pgoff = prev->vm_pgoff;

                /* middle survives in part, so its start must be moved up. */
                if (!vmg->__remove_middle)
                        vmg->__adjust_middle_start = true;

                err = dup_anon_vma(prev, middle, &anon_dup);
        } else { /* merge_right */
                /*
                 *     |<------------->| OR
                 * |<----------------->|
                 * *************-------|
                 *    middle     next
                 * shrink/delete extend
                 */

                /* Number of pages in the range being modified. */
                pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);

                VM_WARN_ON_VMG(!merge_right, vmg);
                /* If we are offset into a VMA, then prev must be middle. */
                VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg);

                if (vmg->__remove_middle) {
                        /* next grows downwards over the whole of middle. */
                        vmg->end = next->vm_end;
                        vmg->pgoff = next->vm_pgoff - pglen;
                } else {
                        /* We shrink middle and expand next. */
                        vmg->__adjust_next_start = true;
                        /*
                         * vmg now describes what remains of middle, which ends
                         * where the modified range began.
                         */
                        vmg->start = middle->vm_start;
                        vmg->end = start;
                        vmg->pgoff = middle->vm_pgoff;
                }

                err = dup_anon_vma(next, middle, &anon_dup);
        }

        if (err)
                goto abort;

        err = commit_merge(vmg);
        if (err) {
                VM_WARN_ON(err != -ENOMEM);

                /* Undo the anon_vma duplication performed above, if any. */
                if (anon_dup)
                        unlink_anon_vmas(anon_dup);

                /*
                 * We've cleaned up any cloned anon_vma's, no VMAs have been
                 * modified, no harm no foul if the user requests that we not
                 * report this and just give up, leaving the VMAs unmerged.
                 */
                if (!vmg->give_up_on_oom)
                        vmg->state = VMA_MERGE_ERROR_NOMEM;
                return NULL;
        }

        khugepaged_enter_vma(vmg->target, vmg->flags);
        vmg->state = VMA_MERGE_SUCCESS;
        return vmg->target;

abort:
        /* Put the iterator back at the start of the originally requested range. */
        vma_iter_set(vmg->vmi, start);
        vma_iter_load(vmg->vmi);

        /*
         * This means we have failed to clone anon_vma's correctly, but no
         * actual changes to VMAs have occurred, so no harm no foul - if the
         * user doesn't want this reported and instead just wants to give up on
         * the merge, allow it.
         */
        if (!vmg->give_up_on_oom)
                vmg->state = VMA_MERGE_ERROR_NOMEM;
        return NULL;
}

/*
 * vma_merge_new_range - Attempt to merge a new VMA into address space
 *
 * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
 *       (exclusive), which we try to merge with any adjacent VMAs if possible.
 *
 * We are about to add a VMA to the address space starting at @vmg->start and
 * ending at @vmg->end. There are three different possible scenarios:
 *
 * 1. There is a VMA with identical properties immediately adjacent to the
 *    proposed new VMA [@vmg->start, @vmg->end) either before or after it -
 *    EXPAND that VMA:
 *
 * Proposed:       |-----|  or  |-----|
 * Existing:  |----|                  |----|
 *
 * 2. There are VMAs with identical properties immediately adjacent to the
 *    proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
 *    EXPAND the former and REMOVE the latter:
 *
 * Proposed:       |-----|
 * Existing:  |----|     |----|
 *
 * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
 *    VMAs do not have identical attributes - NO MERGE POSSIBLE.
 *
 * In instances where we can merge, this function returns the expanded VMA which
 * will have its range adjusted accordingly and the underlying maple tree also
 * adjusted.
 *
 * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
 *          to the VMA we expanded.
 *
 * This function adjusts @vmg to provide @vmg->next if not already specified,
 * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
 *
 * ASSUMPTIONS:
 * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
 * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
 *   other than VMAs that will be unmapped should the operation succeed.
 * - The caller must have specified the previous vma in @vmg->prev.
 * - The caller must have specified the next vma in @vmg->next.
 * - The caller must have positioned the vmi at or before the gap.
 */
struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
{
        struct vm_area_struct *prev = vmg->prev;
        struct vm_area_struct *next = vmg->next;
        /* Remember the proposed end; vmg->end may be extended over next below. */
        unsigned long end = vmg->end;
        bool can_merge_left, can_merge_right;

        mmap_assert_write_locked(vmg->mm);
        /* A new range has no existing VMA of its own; we set middle ourselves. */
        VM_WARN_ON_VMG(vmg->middle, vmg);
        /* vmi must point at or before the gap. */
        VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg);

        vmg->state = VMA_MERGE_NOMERGE;

        /* Special VMAs are unmergeable, also if no prev/next. */
        if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
                return NULL;

        can_merge_left = can_vma_merge_left(vmg);
        /* In the expand-only case a merge with next is never attempted. */
        can_merge_right = !vmg->just_expand && can_vma_merge_right(vmg, can_merge_left);

        /* If we can merge with the next VMA, adjust vmg accordingly. */
        if (can_merge_right) {
                vmg->end = next->vm_end;
                vmg->middle = next;
        }

        /*
         * If we can merge with the previous VMA, adjust vmg accordingly. This
         * overrides the middle chosen above, so prev is the VMA expanded when
         * both sides merge.
         */
        if (can_merge_left) {
                vmg->start = prev->vm_start;
                vmg->middle = prev;
                vmg->pgoff = prev->vm_pgoff;

                /*
                 * If this merge would result in removal of the next VMA but we
                 * are not permitted to do so, reduce the operation to merging
                 * prev and vma.
                 */
                if (can_merge_right && !can_merge_remove_vma(next))
                        vmg->end = end;

                /* In expand-only case we are already positioned at prev. */
                if (!vmg->just_expand) {
                        /* Equivalent to going to the previous range. */
                        vma_prev(vmg->vmi);
                }
        }

        /*
         * Now try to expand adjacent VMA(s). This takes care of removing the
         * following VMA if we have VMAs on both sides.
         */
        if (vmg->middle && !vma_expand(vmg)) {
                khugepaged_enter_vma(vmg->middle, vmg->flags);
                vmg->state = VMA_MERGE_SUCCESS;
                return vmg->middle;
        }

        /* Either nothing was mergeable or the expansion failed. */
        return NULL;
}

/*
 * vma_expand - Expand an existing VMA
 *
 * @vmg: Describes a VMA expansion operation.
 *
 * Grows vmg->middle to span vmg->start to vmg->end; the new range may extend
 * off both the old start and the old end.  If vmg->next is a different VMA
 * from vmg->middle and vmg->end == vmg->next->vm_end, vmg->next is expanded
 * over and removed.  Checking that vmg->middle can expand and merge with
 * vmg->next needs to be handled by the caller.
 *
 * Returns: 0 on success, otherwise the error from a failed anon_vma
 * duplication, or -ENOMEM if the merge could not be committed.
 *
 * ASSUMPTIONS:
 * - The caller must hold a WRITE lock on the mmap_lock of vmg->mm.
 * - The caller must have set @vmg->middle and @vmg->next.
 */
int vma_expand(struct vma_merge_struct *vmg)
{
        struct vm_area_struct *middle = vmg->middle;
        struct vm_area_struct *next = vmg->next;
        struct vm_area_struct *anon_dup = NULL;
        bool remove_next;

        mmap_assert_write_locked(vmg->mm);

        vma_start_write(middle);

        /* next is swallowed only when the new end lands exactly on its end. */
        remove_next = next && (middle != next) && (vmg->end == next->vm_end);
        if (remove_next) {
                int ret;

                /* This should already have been checked by this point. */
                VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
                vma_start_write(next);
                /*
                 * In this case we don't report OOM, so vmg->give_up_on_oom is
                 * safe.
                 */
                ret = dup_anon_vma(middle, next, &anon_dup);
                if (ret)
                        return ret;
        }

        /* Not merging but overwriting any part of next is not handled. */
        VM_WARN_ON_VMG(next && !remove_next &&
                       next != middle && vmg->end > next->vm_start, vmg);
        /* Only handles expanding */
        VM_WARN_ON_VMG(middle->vm_start < vmg->start ||
                       middle->vm_end > vmg->end, vmg);

        vmg->target = middle;
        if (remove_next)
                vmg->__remove_next = true;

        if (!commit_merge(vmg))
                return 0;

        /* The commit failed: drop any anon_vma duplication done above. */
        if (anon_dup)
                unlink_anon_vmas(anon_dup);
        /*
         * If the user requests that we just give up on OOM, we are safe to do
         * so here, as commit_merge() provides this contract to us. Nothing has
         * been changed - no harm no foul, just don't report it.
         */
        if (!vmg->give_up_on_oom)
                vmg->state = VMA_MERGE_ERROR_NOMEM;
        return -ENOMEM;
}

/*
 * vma_shrink() - Reduce an existing VMAs memory area
 * @vmi: The vma iterator
 * @vma: The VMA to modify
 * @start: The new start
 * @end: The new end
 * @pgoff: The page offset to set on the VMA along with its new range
 *
 * Only one side is expected to move: a warning is emitted if both @start and
 * @end differ from the VMA's current bounds.
 *
 * Returns: 0 on success, -ENOMEM otherwise
 */
int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
               unsigned long start, unsigned long end, pgoff_t pgoff)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vma_prepare vp;

        WARN_ON((vma->vm_start != start) && (vma->vm_end != end));

        /* Configure the iterator over whichever end of the VMA is cut off. */
        if (vma->vm_start >= start)
                vma_iter_config(vmi, end, vma->vm_end);
        else
                vma_iter_config(vmi, vma->vm_start, start);

        /* Nothing has been modified yet, so failing here is harmless. */
        if (vma_iter_prealloc(vmi, NULL))
                return -ENOMEM;

        vma_start_write(vma);

        init_vma_prep(&vp, vma);
        vma_prepare(&vp);
        vma_adjust_trans_huge(vma, start, end, NULL);

        /* Clear the iterator's configured range, then resize the VMA. */
        vma_iter_clear(vmi);
        vma_set_range(vma, start, end, pgoff);
        vma_complete(&vp, vmi, mm);
        validate_mm(mm);
        return 0;
}

/*
 * Unmap the detached VMAs described by @vms and free their page tables.
 * Does nothing unless vms->clear_ptes is set, and clears that flag when done
 * so that the work is only performed once.
 */
static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
                    struct ma_state *mas_detach, bool mm_wr_locked)
{
        struct vm_area_struct *first;
        struct mm_struct *mm;
        struct mmu_gather tlb;

        if (!vms->clear_ptes) /* Nothing to do */
                return;

        /*
         * We can free page tables without write-locking mmap_lock because VMAs
         * were isolated before we downgraded mmap_lock.
         */
        mas_set(mas_detach, 1);
        first = vms->vma;
        mm = first->vm_mm;
        tlb_gather_mmu(&tlb, mm);
        update_hiwater_rss(mm);
        unmap_vmas(&tlb, mas_detach, first, vms->start, vms->end,
                   vms->vma_count, mm_wr_locked);

        mas_set(mas_detach, 1);
        /* start and end may be different if there is no prev or next vma. */
        free_pgtables(&tlb, mas_detach, first, vms->unmap_start,
                      vms->unmap_end, mm_wr_locked);
        tlb_finish_mmu(&tlb);

        vms->clear_ptes = false;
}

static void vms_clean_up_area(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach)
{
        struct vm_area_struct *vma;

        if (!vms->nr_pages)
                return;

        vms_clear_ptes(vms, mas_detach, true);
        mas_set(mas_detach, 0);
        mas_for_each(mas_detach, vma, ULONG_MAX)
                vma_close(vma);
}

/*
 * vms_complete_munmap_vmas() - Finish the munmap() operation
 * @vms: The vma munmap struct
 * @mas_detach: The maple state of the detached vmas
 *
 * This updates the mm_struct, unmaps the region, frees the resources
 * used for the munmap() and may downgrade the lock - if requested.  Everything
 * needed to be done once the vma maple tree is updated.
 */
static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach)
{
        struct vm_area_struct *vma;
        struct mm_struct *mm;

        mm = current->mm;
        mm->map_count -= vms->vma_count;
        mm->locked_vm -= vms->locked_vm;
        if (vms->unlock)
                mmap_write_downgrade(mm);

        if (!vms->nr_pages)
                return;

        vms_clear_ptes(vms, mas_detach, !vms->unlock);
        /* Update high watermark before we lower total_vm */
        update_hiwater_vm(mm);
        /* Stat accounting */
        WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
        /* Paranoid bookkeeping */
        VM_WARN_ON(vms->exec_vm > mm->exec_vm);
        VM_WARN_ON(vms->stack_vm > mm->stack_vm);
        VM_WARN_ON(vms->data_vm > mm->data_vm);
        mm->exec_vm -= vms->exec_vm;
        mm->stack_vm -= vms->stack_vm;
        mm->data_vm -= vms->data_vm;

        /* Remove and clean up vmas */
        mas_set(mas_detach, 0);
        mas_for_each(mas_detach, vma, ULONG_MAX)
                remove_vma(vma);

        vm_unacct_memory(vms->nr_accounted);
        validate_mm(mm);
        if (vms->unlock)
                mmap_read_unlock(mm);

        __mt_destroy(mas_detach->tree);
}

/*
 * reattach_vmas() - Undo any munmap work and free resources
 * @mas_detach: The maple state with the detached maple tree
 *
 * Reattach any detached vmas and free up the maple tree used to track the vmas.
 */
static void reattach_vmas(struct ma_state *mas_detach)
{
        struct vm_area_struct *vma;

        mas_set(mas_detach, 0);
        mas_for_each(mas_detach, vma, ULONG_MAX)
                vma_mark_attached(vma);

        __mt_destroy(mas_detach->tree);
}

/*
 * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
 * for removal at a later date.  Handles splitting first and last if necessary
 * and marking the vmas as isolated.
 *
 * @vms: The vma munmap struct
 * @mas_detach: The maple state tracking the detached tree
 *
 * Return: 0 on success, error otherwise
 */
static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach)
{
        struct vm_area_struct *next = NULL;
        int error;

        /*
         * If we need to split any vma, do it now to save pain later.
         * Does it split the first one?
         */
        if (vms->start > vms->vma->vm_start) {

                /*
                 * Make sure that map_count on return from munmap() will
                 * not exceed its limit; but let map_count go just above
                 * its limit temporarily, to help free resources as expected.
                 */
                if (vms->end < vms->vma->vm_end &&
                    vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
                        error = -ENOMEM;
                        goto map_count_exceeded;
                }

                /* Don't bother splitting the VMA if we can't unmap it anyway */
                if (!can_modify_vma(vms->vma)) {
                        error = -EPERM;
                        goto start_split_failed;
                }

                error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
                if (error)
                        goto start_split_failed;
        }
        /* Record the VMA before the range; page tables are freed from its end. */
        vms->prev = vma_prev(vms->vmi);
        if (vms->prev)
                vms->unmap_start = vms->prev->vm_end;

        /*
         * Detach a range of VMAs from the mm. Using next as a temp variable as
         * it is always overwritten.
         */
        for_each_vma_range(*(vms->vmi), next, vms->end) {
                long nrpages;

                if (!can_modify_vma(next)) {
                        error = -EPERM;
                        goto modify_vma_failed;
                }
                /* Does it split the end? */
                if (next->vm_end > vms->end) {
                        error = __split_vma(vms->vmi, next, vms->end, 0);
                        if (error)
                                goto end_split_failed;
                }
                vma_start_write(next);
                /* Store the VMA in the detached tree at the next free index. */
                mas_set(mas_detach, vms->vma_count++);
                error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
                if (error)
                        goto munmap_gather_failed;

                vma_mark_detached(next);
                nrpages = vma_pages(next);

                /* Accumulate the accounting applied when the munmap completes. */
                vms->nr_pages += nrpages;
                if (next->vm_flags & VM_LOCKED)
                        vms->locked_vm += nrpages;

                if (next->vm_flags & VM_ACCOUNT)
                        vms->nr_accounted += nrpages;

                if (is_exec_mapping(next->vm_flags))
                        vms->exec_vm += nrpages;
                else if (is_stack_mapping(next->vm_flags))
                        vms->stack_vm += nrpages;
                else if (is_data_mapping(next->vm_flags))
                        vms->data_vm += nrpages;

                if (vms->uf) {
                        /*
                         * If userfaultfd_unmap_prep returns an error the vmas
                         * will remain split, but userland will get a
                         * highly unexpected error anyway. This is no
                         * different than the case where the first of the two
                         * __split_vma fails, but we don't undo the first
                         * split, despite we could. This is unlikely enough
                         * failure that it's not worth optimizing it for.
                         */
                        error = userfaultfd_unmap_prep(next, vms->start,
                                                       vms->end, vms->uf);
                        if (error)
                                goto userfaultfd_error;
                }
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
                BUG_ON(next->vm_start < vms->start);
                BUG_ON(next->vm_start > vms->end);
#endif
        }

        /* Record the VMA after the range; page tables are freed up to its start. */
        vms->next = vma_next(vms->vmi);
        if (vms->next)
                vms->unmap_end = vms->next->vm_start;

#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
        /* Make sure no VMAs are about to be lost. */
        {
                MA_STATE(test, mas_detach->tree, 0, 0);
                struct vm_area_struct *vma_mas, *vma_test;
                int test_count = 0;

                /*
                 * Walk the mm's tree over the range in step with the detached
                 * tree: both must yield the same VMAs, in the same order and
                 * in the same number.
                 */
                vma_iter_set(vms->vmi, vms->start);
                rcu_read_lock();
                vma_test = mas_find(&test, vms->vma_count - 1);
                for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
                        BUG_ON(vma_mas != vma_test);
                        test_count++;
                        vma_test = mas_next(&test, vms->vma_count - 1);
                }
                rcu_read_unlock();
                BUG_ON(vms->vma_count != test_count);
        }
#endif

        /* Walk the iterator back until it is no longer past vms->start. */
        while (vma_iter_addr(vms->vmi) > vms->start)
                vma_iter_prev_range(vms->vmi);

        /* Tell vms_clear_ptes() that there is page table work to do. */
        vms->clear_ptes = true;
        return 0;

        /*
         * Failures after VMAs may have been detached reattach them. Failures
         * before that point have nothing to undo.
         */
userfaultfd_error:
munmap_gather_failed:
end_split_failed:
modify_vma_failed:
        reattach_vmas(mas_detach);
start_split_failed:
map_count_exceeded:
        return error;
}

/*
 * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
 * @vms: The vma munmap struct
 * @vmi: The vma iterator
 * @vma: The first vm_area_struct to munmap
 * @start: The aligned start address to munmap
 * @end: The aligned end address to munmap
 * @uf: The userfaultfd list_head
 * @unlock: Unlock after the operation.  Only unlocked on success
 */
static void init_vma_munmap(struct vma_munmap_struct *vms,
                struct vma_iterator *vmi, struct vm_area_struct *vma,
                unsigned long start, unsigned long end, struct list_head *uf,
                bool unlock)
{
        /* Caller-supplied context. */
        vms->vmi = vmi;
        vms->vma = vma;
        vms->uf = uf;
        vms->unlock = unlock;

        /* Without a first VMA the range to unmap is recorded as empty. */
        vms->start = vma ? start : 0;
        vms->end = vma ? end : 0;

        /* All gather-time counters start from zero. */
        vms->vma_count = 0;
        vms->nr_pages = 0;
        vms->locked_vm = 0;
        vms->nr_accounted = 0;
        vms->exec_vm = 0;
        vms->stack_vm = 0;
        vms->data_vm = 0;

        /* Default unmap window and PTE-clearing state. */
        vms->unmap_start = FIRST_USER_ADDRESS;
        vms->unmap_end = USER_PGTABLES_CEILING;
        vms->clear_ptes = false;
}

/*
 * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
 * @vmi: The vma iterator
 * @vma: The starting vm_area_struct
 * @mm: The mm_struct
 * @start: The aligned start address to munmap.
 * @end: The aligned end address to munmap.
 * @uf: The userfaultfd list_head
 * @unlock: Set to true to drop the mmap_lock.  unlocking only happens on
 * success.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
                struct mm_struct *mm, unsigned long start, unsigned long end,
                struct list_head *uf, bool unlock)
{
        struct vma_munmap_struct vms;
        struct maple_tree mt_detach;
        int error;
        MA_STATE(mas_detach, &mt_detach, 0, 0);

        /* On-stack side tree, using the same lock flags as the mm's tree. */
        mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
        mt_on_stack(mt_detach);

        init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);

        error = vms_gather_munmap_vmas(&vms, &mas_detach);
        if (!error) {
                error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
                if (!error) {
                        /* Point of no return */
                        vms_complete_munmap_vmas(&vms, &mas_detach);
                        return 0;
                }

                /* Clearing the range failed: put the gathered VMAs back. */
                reattach_vmas(&mas_detach);
        }

        /* Both failure paths end up here. */
        validate_mm(mm);
        return error;
}

/*
 * do_vmi_munmap() - munmap a given range.
 * @vmi: The vma iterator
 * @mm: The mm_struct
 * @start: The start address to munmap
 * @len: The length of the range to munmap
 * @uf: The userfaultfd list_head
 * @unlock: set to true if the user wants to drop the mmap_lock on success
 *
 * This function takes a @vmi that is either pointing to the previous VMA or set
 * to MA_START and sets it up to remove the mapping(s).  The @len will be
 * page aligned.
 *
 * Return: 0 on success and drops the lock if so directed, error and leaves the
 * lock held otherwise.
 */
int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
                  unsigned long start, size_t len, struct list_head *uf,
                  bool unlock)
{
        struct vm_area_struct *first;
        unsigned long end;

        /* The start address must be page aligned. */
        if (offset_in_page(start))
                return -EINVAL;

        /* The range must lie within TASK_SIZE. */
        if (start > TASK_SIZE || len > TASK_SIZE - start)
                return -EINVAL;

        /* An empty page-aligned range is invalid. */
        end = start + PAGE_ALIGN(len);
        if (end == start)
                return -EINVAL;

        /* Find the first overlapping VMA */
        first = vma_find(vmi, end);
        if (first)
                return do_vmi_align_munmap(vmi, first, mm, start, end, uf,
                                           unlock);

        /* Nothing to unmap: succeed, dropping the lock if requested. */
        if (unlock)
                mmap_write_unlock(mm);
        return 0;
}

/*
 * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
 * context and anonymous VMA name within the range [start, end).
 *
 * As a result, we might be able to merge the newly modified VMA range with an
 * adjacent VMA with identical properties.
 *
 * If no merge is possible and the range does not span the entirety of the VMA,
 * we then need to split the VMA to accommodate the change.
 *
 * The function returns either the merged VMA, the original VMA if a split was
 * required instead, or an error if the split failed.
 */
static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
{
        /* Snapshot the request before the merge attempt touches @vmg. */
        struct vm_area_struct *target = vmg->middle;
        const unsigned long range_start = vmg->start;
        const unsigned long range_end = vmg->end;
        struct vm_area_struct *merged;
        int err;

        /* A successful merge means there is nothing left to do. */
        merged = vma_merge_existing_range(vmg);
        if (merged)
                return merged;
        if (vmg_nomem(vmg))
                return ERR_PTR(-ENOMEM);

        /*
         * Split can fail for reasons other than OOM, so if the user requests
         * this it's probably a mistake.
         */
        VM_WARN_ON(vmg->give_up_on_oom &&
                   (target->vm_start != range_start ||
                    target->vm_end != range_end));

        /* Cut off whatever part of the VMA precedes the range. */
        if (target->vm_start < range_start) {
                err = split_vma(vmg->vmi, target, range_start, 1);
                if (err)
                        return ERR_PTR(err);
        }

        /* Cut off whatever part of the VMA follows the range. */
        if (target->vm_end > range_end) {
                err = split_vma(vmg->vmi, target, range_end, 0);
                if (err)
                        return ERR_PTR(err);
        }

        return target;
}

struct vm_area_struct *vma_modify_flags(
        struct vma_iterator *vmi, struct vm_area_struct *prev,
        struct vm_area_struct *vma, unsigned long start, unsigned long end,
        unsigned long new_flags)
{
        VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

        vmg.flags = new_flags;

        return vma_modify(&vmg);
}

struct vm_area_struct
*vma_modify_flags_name(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start,
                       unsigned long end,
                       unsigned long new_flags,
                       struct anon_vma_name *new_name)
{
        VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

        vmg.flags = new_flags;
        vmg.anon_name = new_name;

        return vma_modify(&vmg);
}

struct vm_area_struct
*vma_modify_policy(struct vma_iterator *vmi,
                   struct vm_area_struct *prev,
                   struct vm_area_struct *vma,
                   unsigned long start, unsigned long end,
                   struct mempolicy *new_pol)
{
        VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);

        vmg.policy = new_pol;

        return vma_modify(&vmg);
}

/*
 * Prepare [@start, @end) of @vma for a change of flags to @new_flags and of
 * userfaultfd context to @new_ctx by handing the request to vma_modify(),
 * whose result is returned unchanged.  @give_up_on_oom, when true, is
 * recorded in the merge state before the call.
 */
struct vm_area_struct
*vma_modify_flags_uffd(struct vma_iterator *vmi,
                       struct vm_area_struct *prev,
                       struct vm_area_struct *vma,
                       unsigned long start, unsigned long end,
                       unsigned long new_flags,
                       struct vm_userfaultfd_ctx new_ctx,
                       bool give_up_on_oom)
{
        VMG_VMA_STATE(merge_state, vmi, prev, vma, start, end);

        merge_state.uffd_ctx = new_ctx;
        merge_state.flags = new_flags;

        /* Only ever switch the flag on; leave it as initialised otherwise. */
        if (give_up_on_oom)
                merge_state.give_up_on_oom = true;

        return vma_modify(&merge_state);
}

/*
 * Expand vma by delta bytes, potentially merging with an immediately adjacent
 * VMA with identical properties.
 */
struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
                                        struct vm_area_struct *vma,
                                        unsigned long delta)
{
        VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);

        vmg.next = vma_iter_next_rewind(vmi, NULL);
        vmg.middle = NULL; /* We use the VMA to populate VMG fields only. */

        return vma_merge_new_range(&vmg);
}

/* Reset @vb so that it holds no queued VMAs. */
void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
{
        vb->count = 0;
}

static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
{
        struct address_space *mapping;
        int i;

        mapping = vb->vmas[0]->vm_file->f_mapping;
        i_mmap_lock_write(mapping);
        for (i = 0; i < vb->count; i++) {
                VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
                __remove_shared_vm_struct(vb->vmas[i], mapping);
        }
        i_mmap_unlock_write(mapping);

        unlink_file_vma_batch_init(vb);
}

void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
                               struct vm_area_struct *vma)
{
        if (vma->vm_file == NULL)
                return;

        if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
            vb->count == ARRAY_SIZE(vb->vmas))
                unlink_file_vma_batch_process(vb);

        vb->vmas[vb->count] = vma;
        vb->count++;
}

/* Process whatever is still queued in @vb; a no-op for an empty batch. */
void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
{
        if (vb->count <= 0)
                return;

        unlink_file_vma_batch_process(vb);
}

/*
 * Unlink a file-based vm structure from its interval tree, to hide
 * vma from rmap and vmtruncate before freeing its page tables.
 */
void unlink_file_vma(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        /* Only file-backed VMAs need unlinking. */
        if (!file)
                return;

        mapping = file->f_mapping;
        i_mmap_lock_write(mapping);
        __remove_shared_vm_struct(vma, mapping);
        i_mmap_unlock_write(mapping);
}

/*
 * Link a file-backed @vma into its file's address_space under the i_mmap
 * write lock.  VMAs without a file are left untouched.
 */
void vma_link_file(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct address_space *mapping;

        if (!file)
                return;

        mapping = file->f_mapping;
        i_mmap_lock_write(mapping);
        __vma_link_file(vma, mapping);
        i_mmap_unlock_write(mapping);
}

int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
{
        VMA_ITERATOR(vmi, mm, 0);

        vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        vma_start_write(vma);
        vma_iter_store_new(&vmi, vma);
        vma_link_file(vma);
        mm->map_count++;
        validate_mm(mm);
        return 0;
}

/*
 * Copy the vma structure to a new location in the same mm,
 * prior to moving page table entries, to effect an mremap move.
 */
struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
        unsigned long addr, unsigned long len, pgoff_t pgoff,
        bool *need_rmap_locks)
{
        struct vm_area_struct *vma = *vmap;
        /* Remember the source start: used below to detect a merge with self. */
        unsigned long vma_start = vma->vm_start;
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *new_vma;
        bool faulted_in_anon_vma = true;
        VMA_ITERATOR(vmi, mm, addr);
        VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);

        /*
         * If anonymous vma has not yet been faulted, update new pgoff
         * to match new location, to increase its chance of merging.
         */
        if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
                pgoff = addr >> PAGE_SHIFT;
                faulted_in_anon_vma = false;
        }

        /* Refuse if an existing VMA starts before the end of the new range. */
        new_vma = find_vma_prev(mm, addr, &vmg.prev);
        if (new_vma && new_vma->vm_start < addr + len)
                return NULL;        /* should never get here */

        /* First try to merge the new range with its neighbours. */
        vmg.middle = NULL; /* New VMA range. */
        vmg.pgoff = pgoff;
        vmg.next = vma_iter_next_rewind(&vmi, NULL);
        new_vma = vma_merge_new_range(&vmg);

        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
                 */
                if (unlikely(vma_start >= new_vma->vm_start &&
                             vma_start < new_vma->vm_end)) {
                        /*
                         * The only way we can get a vma_merge with
                         * self during an mremap is if the vma hasn't
                         * been faulted in yet and we were allowed to
                         * reset the dst vma->vm_pgoff to the
                         * destination address of the mremap to allow
                         * the merge to happen. mremap must change the
                         * vm_pgoff linearity between src and dst vmas
                         * (in turn preventing a vma_merge) to be
                         * safe. It is only safe to keep the vm_pgoff
                         * linear if there are no pages mapped yet.
                         */
                        VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
                        /* Tell the caller its source VMA is now new_vma. */
                        *vmap = vma = new_vma;
                }
                *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
        } else {
                /* No merge: duplicate the source VMA and set it up by hand. */
                new_vma = vm_area_dup(vma);
                if (!new_vma)
                        goto out;
                vma_set_range(new_vma, addr, addr + len, pgoff);
                if (vma_dup_policy(vma, new_vma))
                        goto out_free_vma;
                if (anon_vma_clone(new_vma, vma))
                        goto out_free_mempol;
                if (new_vma->vm_file)
                        get_file(new_vma->vm_file);
                if (new_vma->vm_ops && new_vma->vm_ops->open)
                        new_vma->vm_ops->open(new_vma);
                if (vma_link(mm, new_vma))
                        goto out_vma_link;
                *need_rmap_locks = false;
        }
        return new_vma;

        /*
         * Error unwinding: each label undoes the setup steps that had
         * completed before the failure, then falls through to the next.
         */
out_vma_link:
        /* vma_link() failed: undo open(), the file reference and the clone. */
        vma_close(new_vma);

        if (new_vma->vm_file)
                fput(new_vma->vm_file);

        unlink_anon_vmas(new_vma);
out_free_mempol:
        mpol_put(vma_policy(new_vma));
out_free_vma:
        vm_area_free(new_vma);
out:
        return NULL;
}

/*
 * Rough compatibility check to quickly see if it's even worth looking
 * at sharing an anon_vma.
 *
 * They need to have the same vm_file, and the flags can only differ
 * in things that mprotect may change.
 *
 * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
 * we can merge the two vma's. For example, we refuse to merge a vma if
 * there is a vm_ops->close() function, because that indicates that the
 * driver is doing some kind of reference counting. But that doesn't
 * really matter for the anon_vma sharing case.
 */
static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
{
        /* @b must start exactly where @a ends. */
        if (a->vm_end != b->vm_start)
                return 0;

        /* Same memory policy and same backing file. */
        if (!mpol_equal(vma_policy(a), vma_policy(b)))
                return 0;
        if (a->vm_file != b->vm_file)
                return 0;

        /* Flags may differ only in the access bits and soft-dirty. */
        if ((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY))
                return 0;

        /* Page offsets must continue linearly from @a into @b. */
        return b->vm_pgoff ==
                a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
}

/*
 * Do some basic sanity checking to see if we can re-use the anon_vma
 * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
 * the same as 'old', the other will be the new one that is trying
 * to share the anon_vma.
 *
 * NOTE! This runs with mmap_lock held for reading, so it is possible that
 * the anon_vma of 'old' is concurrently in the process of being set up
 * by another page fault trying to merge _that_. But that's ok: if it
 * is being set up, that automatically means that it will be a singleton
 * acceptable for merging, so we can do all of this optimistically. But
 * we do that READ_ONCE() to make sure that we never re-load the pointer.
 *
 * IOW: that the "list_is_singular()" test on the anon_vma_chain only
 * matters for the 'stable anon_vma' case (ie the thing we want to avoid
 * is to return an anon_vma that is "complex" due to having gone through
 * a fork).
 *
 * We also make sure that the two vma's are compatible (adjacent,
 * and with the same memory policies). That's all stable, even with just
 * a read lock on the mmap_lock.
 */
static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
                                          struct vm_area_struct *a,
                                          struct vm_area_struct *b)
{
        struct anon_vma *candidate;

        if (!anon_vma_compatible(a, b))
                return NULL;

        /* Load the pointer exactly once; see the comment above. */
        candidate = READ_ONCE(old->anon_vma);
        if (!candidate || !list_is_singular(&old->anon_vma_chain))
                return NULL;

        return candidate;
}

/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
 * sequence of mprotects and faults may otherwise lead to distinct
 * anon_vmas being allocated, preventing vma merge in subsequent
 * mprotect.
 */
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
        struct vm_area_struct *neighbour;
        struct anon_vma *found;
        VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

        /* Try the VMA found at vma->vm_end first. */
        neighbour = vma_iter_load(&vmi);
        if (neighbour) {
                found = reusable_anon_vma(neighbour, vma, neighbour);
                if (found)
                        return found;
        }

        /* The first step backwards must land on @vma itself. */
        neighbour = vma_prev(&vmi);
        VM_BUG_ON_VMA(neighbour != vma, vma);

        /* The second step backwards yields the VMA before @vma, if any. */
        neighbour = vma_prev(&vmi);
        if (!neighbour)
                return NULL;

        /*
         * The result may still be NULL if no reusable anon_vma exists.
         * There's no absolute need to look only at touching neighbours:
         * we could search further afield for "compatible" anon_vmas.
         * But it would probably just be a waste of time searching,
         * or lead to too many vmas hanging off the same anon_vma.
         * We're trying to allow mprotect remerging later on,
         * not trying to minimize memory used for anon_vmas.
         */
        return reusable_anon_vma(neighbour, neighbour, vma);
}

static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
{
        return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
}

static bool vma_is_shared_writable(struct vm_area_struct *vma)
{
        return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
                (VM_WRITE | VM_SHARED);
}

static bool vma_fs_can_writeback(struct vm_area_struct *vma)
{
        /* No managed pages to writeback. */
        if (vma->vm_flags & VM_PFNMAP)
                return false;

        return vma->vm_file && vma->vm_file->f_mapping &&
                mapping_can_writeback(vma->vm_file->f_mapping);
}

/*
 * Does this VMA require the underlying folios to have their dirty state
 * tracked?
 */
bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
{
        /*
         * Only shared, writable VMAs require dirty tracking, and then only
         * if the filesystem asks to be notified of writes or, failing that,
         * is able to write back.
         */
        return vma_is_shared_writable(vma) &&
                (vm_ops_needs_writenotify(vma->vm_ops) ||
                 vma_fs_can_writeback(vma));
}

/*
 * Some shared mappings will want the pages marked read-only
 * to track write events. If so, we'll downgrade vm_page_prot
 * to the private version (using protection_map[] without the
 * VM_SHARED bit).
 */
bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
{
        pgprot_t modified;

        /* If it was private or non-writable, the write bit is already clear */
        if (!vma_is_shared_writable(vma))
                return false;

        /* The backer wishes to know when pages are first written to? */
        if (vm_ops_needs_writenotify(vma->vm_ops))
                return true;

        /*
         * The open routine did something to the protections that
         * pgprot_modify won't preserve?
         */
        modified = vm_pgprot_modify(vm_page_prot, vma->vm_flags);
        if (pgprot_val(vm_page_prot) != pgprot_val(modified))
                return false;

        /*
         * Write faults are needed for softdirty tracking (which hugetlb
         * does not support yet) and for uffd-wp tracking.
         */
        if ((vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) ||
            userfaultfd_wp(vma))
                return true;

        /* Can the mapping track the dirty pages? */
        return vma_fs_can_writeback(vma);
}

/*
 * Taken by mm_take_all_locks() and released by mm_drop_all_locks(); it is
 * held for as long as the per-mm locks taken in between are held.
 */
static DEFINE_MUTEX(mm_all_locks_mutex);

/*
 * Write-lock the root of @anon_vma unless it is already marked as locked.
 * Bit 0 of the root's rb_root.rb_root.rb_node word serves as the marker.
 */
static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
{
        struct anon_vma *root = anon_vma->root;
        unsigned long *marker =
                (unsigned long *)&root->rb_root.rb_root.rb_node;

        /*
         * The marker bit can't change from under us because we hold the
         * mm_all_locks_mutex.  If it is set, some other vma in this mm
         * shares the same anon_vma and we must not take the lock again.
         */
        if (test_bit(0, marker))
                return;

        down_write_nest_lock(&root->rwsem, &mm->mmap_lock);

        /*
         * We can safely modify the marker after taking the root's rwsem.
         * No need of atomic instructions here: the word can't change from
         * under us thanks to that rwsem.
         */
        if (__test_and_set_bit(0, marker))
                BUG();
}

/*
 * Write-lock @mapping's i_mmap_rwsem unless AS_MM_ALL_LOCKS shows that it
 * has already been locked.
 */
static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
{
        /*
         * AS_MM_ALL_LOCKS can't change from under us because
         * we hold the mm_all_locks_mutex.
         */
        if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                return;

        /*
         * Operations on ->flags have to be atomic because
         * even if AS_MM_ALL_LOCKS is stable thanks to the
         * mm_all_locks_mutex, there may be other cpus
         * changing other bitflags in parallel to us.
         */
        if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                BUG();

        down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
}

/*
 * This operation locks against the VM for all pte/vma/mm related
 * operations that could ever happen on a certain mm. This includes
 * vmtruncate, try_to_unmap, and all page faults.
 *
 * The caller must take the mmap_lock in write mode before calling
 * mm_take_all_locks(). The caller isn't allowed to release the
 * mmap_lock until mm_drop_all_locks() returns.
 *
 * mmap_lock in write mode is required in order to block all operations
 * that could modify pagetables and free pages without need of
 * altering the vma layout. It's also needed in write mode to avoid new
 * anon_vmas to be associated with existing vmas.
 *
 * A single task can't take more than one mm_take_all_locks() in a row
 * or it would deadlock.
 *
 * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
 * mapping->flags prevent taking the same lock twice, if more than one
 * vma in this mm is backed by the same anon_vma or address_space.
 *
 * We take locks in the following order, according to the comment at the
 * beginning of mm/rmap.c:
 *   - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
 *     hugetlb mapping);
 *   - all vmas marked locked
 *   - all i_mmap_rwsem locks;
 *   - all anon_vma->rwsem locks
 *
 * We can take all locks within these types randomly because the VM code
 * doesn't nest them and we protected from parallel mm_take_all_locks() by
 * mm_all_locks_mutex.
 *
 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
 * that may have to take thousands of locks.
 *
 * mm_take_all_locks() can fail if it's interrupted by signals.
 */
int mm_take_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);

        /* Held until mm_drop_all_locks() releases it. */
        mutex_lock(&mm_all_locks_mutex);

        /*
         * vma_start_write() does not have a complement in mm_drop_all_locks()
         * because vma_start_write() is always asymmetrical; it marks a VMA as
         * being written to until mmap_write_unlock() or mmap_write_downgrade()
         * is reached.
         */
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                vma_start_write(vma);
        }

        /* Next pass: lock the mappings of hugetlb VMAs only. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Next pass: lock the mappings of all remaining file-backed VMAs. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->vm_file && vma->vm_file->f_mapping &&
                                !is_vm_hugetlb_page(vma))
                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
        }

        /* Final pass: lock every anon_vma on each VMA's anon_vma_chain. */
        vma_iter_init(&vmi, mm, 0);
        for_each_vma(vmi, vma) {
                if (signal_pending(current))
                        goto out_unlock;
                if (vma->anon_vma)
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_lock_anon_vma(mm, avc->anon_vma);
        }

        return 0;

out_unlock:
        /*
         * Interrupted by a signal: release whatever was taken so far.
         * mm_drop_all_locks() also drops mm_all_locks_mutex.
         */
        mm_drop_all_locks(mm);
        return -EINTR;
}

/*
 * Undo vm_lock_anon_vma(): if the root of @anon_vma carries the marker bit
 * (bit 0 of its rb_root.rb_root.rb_node word), clear it and drop the write
 * lock.
 */
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
{
        unsigned long *marker =
                (unsigned long *)&anon_vma->root->rb_root.rb_root.rb_node;

        /*
         * The marker bit can't change to 0 from under us because we hold
         * the mm_all_locks_mutex.  If it is clear there is nothing to undo.
         */
        if (!test_bit(0, marker))
                return;

        /*
         * We must clear the bitflag before unlocking so the users of
         * anon_vma->rb_root will never see it.
         *
         * No need of atomic instructions here: the word can't change from
         * under us until we release the anon_vma->root->rwsem.
         */
        if (!__test_and_clear_bit(0, marker))
                BUG();

        anon_vma_unlock_write(anon_vma);
}

/*
 * Undo vm_lock_mapping(): if @mapping is flagged AS_MM_ALL_LOCKS, drop its
 * i_mmap write lock and clear the flag.
 */
static void vm_unlock_mapping(struct address_space *mapping)
{
        /*
         * AS_MM_ALL_LOCKS can't change to 0 from under us
         * because we hold the mm_all_locks_mutex.
         */
        if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                return;

        i_mmap_unlock_write(mapping);

        if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                BUG();
}

/*
 * The mmap_lock cannot be released by the caller until
 * mm_drop_all_locks() returns.
 */
void mm_drop_all_locks(struct mm_struct *mm)
{
        struct vm_area_struct *vma;
        struct anon_vma_chain *avc;
        struct file *file;
        VMA_ITERATOR(vmi, mm, 0);

        mmap_assert_write_locked(mm);
        BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

        for_each_vma(vmi, vma) {
                /* Release every anon_vma reachable from this VMA's chain. */
                if (vma->anon_vma) {
                        list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
                                vm_unlock_anon_vma(avc->anon_vma);
                }

                /* Then the file mapping, if there is one. */
                file = vma->vm_file;
                if (file && file->f_mapping)
                        vm_unlock_mapping(file->f_mapping);
        }

        mutex_unlock(&mm_all_locks_mutex);
}

/*
 * We account for memory if it's a private writeable mapping,
 * not hugepages and VM_NORESERVE wasn't set.
 */
static bool accountable_mapping(struct file *file, vm_flags_t vm_flags)
{
        /* Of these three flags, only VM_WRITE may be set. */
        const vm_flags_t relevant = VM_NORESERVE | VM_SHARED | VM_WRITE;

        /*
         * hugetlb has its own accounting separate from the core VM
         * VM_HUGETLB may not be set yet so we cannot check for that flag.
         */
        if (file && is_file_hugepages(file))
                return false;

        return (vm_flags & relevant) == VM_WRITE;
}

/*
 * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
 * operation.
 * @vms: The vma unmap structure
 * @mas_detach: The maple state with the detached maple tree
 *
 * Does nothing if no pages were gathered.  Otherwise reattach any detached
 * vmas while @vms->clear_ptes is still set.  If that's not possible because
 * the ptes are cleared (and vm_ops->close() may have been called), then a
 * NULL is written over the vmas and the vmas are removed (munmap() completed).
 */
static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
                struct ma_state *mas_detach)
{
        struct ma_state *mas = &vms->vmi->mas;

        /* Nothing was gathered, so there is nothing to undo. */
        if (!vms->nr_pages)
                return;

        /* PTEs are still pending a clear: the VMAs can simply be put back. */
        if (vms->clear_ptes) {
                reattach_vmas(mas_detach);
                return;
        }

        /*
         * Aborting cannot just call the vm_ops open() because they are often
         * not symmetrical and state data has been lost.  Resort to the old
         * failure method of leaving a gap where the MAP_FIXED mapping failed.
         */
        mas_set_range(mas, vms->start, vms->end - 1);
        mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL);
        /* Clean up the insertion of the unfortunate gap */
        vms_complete_munmap_vmas(vms, mas_detach);
}

/*
 * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be
 * unmapped once the map operation is completed, check limits, account mapping
 * and clean up any pre-existing VMAs.
 *
 * @map: Mapping state.
 * @uf:  Userfaultfd context list.
 *
 * On success map->prev and map->next have been populated.  For accountable
 * mappings map->charged holds the page count handed to
 * security_vm_enough_memory_mm() and VM_ACCOUNT has been added to map->flags.
 *
 * Returns: 0 on success, error code otherwise.
 */
static int __mmap_prepare(struct mmap_state *map, struct list_head *uf)
{
        int error;
        struct vma_iterator *vmi = map->vmi;
        struct vma_munmap_struct *vms = &map->vms;

        /* Find the first overlapping VMA and initialise unmap state. */
        vms->vma = vma_find(vmi, map->end);
        init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf,
                        /* unlock = */ false);

        /* OK, we have overlapping VMAs - prepare to unmap them. */
        if (vms->vma) {
                /* Set up the on-stack maple tree that tracks detached VMAs. */
                mt_init_flags(&map->mt_detach,
                              vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
                mt_on_stack(map->mt_detach);
                mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0);
                /* Prepare to unmap any existing mapping in the area */
                error = vms_gather_munmap_vmas(vms, &map->mas_detach);
                if (error) {
                        /*
                         * On error VMAs will already have been reattached.
                         * Zeroing nr_pages makes vms_abort_munmap_vmas()
                         * return without doing anything.
                         */
                        vms->nr_pages = 0;
                        return error;
                }

                map->next = vms->next;
                map->prev = vms->prev;
        } else {
                /* No overlap: just record the neighbouring VMAs. */
                map->next = vma_iter_next_rewind(vmi, &map->prev);
        }

        /*
         * Check against address space limit.  Pages about to be unmapped
         * (vms->nr_pages) are discounted from the requested length.
         */
        if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
                return -ENOMEM;

        /* Private writable mapping: check memory availability. */
        if (accountable_mapping(map->file, map->flags)) {
                /* Only charge for pages the old VMAs had not accounted. */
                map->charged = map->pglen;
                map->charged -= vms->nr_accounted;
                if (map->charged) {
                        error = security_vm_enough_memory_mm(map->mm, map->charged);
                        if (error)
                                return error;
                }

                /*
                 * NOTE(review): presumably reset so that completing the unmap
                 * does not un-account pages now carried by this mapping -
                 * confirm against vms_complete_munmap_vmas().
                 */
                vms->nr_accounted = 0;
                map->flags |= VM_ACCOUNT;
        }

        /*
         * Clear PTEs while the vma is still in the tree so that rmap
         * cannot race with the freeing later in the truncate scenario.
         * This is also needed for mmap_file(), which is why vm_ops
         * close function is called.
         */
        vms_clean_up_area(vms, &map->mas_detach);

        return 0;
}


static int __mmap_new_file_vma(struct mmap_state *map,
                               struct vm_area_struct *vma)
{
        struct vma_iterator *vmi = map->vmi;
        int error;

        vma->vm_file = get_file(map->file);
        error = mmap_file(vma->vm_file, vma);
        if (error) {
                fput(vma->vm_file);
                vma->vm_file = NULL;

                vma_iter_set(vmi, vma->vm_end);
                /* Undo any partial mapping done by a device driver. */
                unmap_region(&vmi->mas, vma, map->prev, map->next);

                return error;
        }

        /* Drivers cannot alter the address of the VMA. */
        WARN_ON_ONCE(map->addr != vma->vm_start);
        /*
         * Drivers should not permit writability when previously it was
         * disallowed.
         */
        VM_WARN_ON_ONCE(map->flags != vma->vm_flags &&
                        !(map->flags & VM_MAYWRITE) &&
                        (vma->vm_flags & VM_MAYWRITE));

        /* If the flags change (and are mergeable), let's retry later. */
        map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL);
        map->flags = vma->vm_flags;

        return 0;
}

/*
 * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not
 * possible.
 *
 * @map:  Mapping state.
 * @vmap: Output pointer for the new VMA.  Only written on success.
 *
 * Returns: Zero on success, or an error.  -ENOMEM is returned when the VMA or
 * the iterator preallocation cannot be obtained; otherwise the error comes
 * from __mmap_new_file_vma() or shmem_zero_setup().
 */
static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap)
{
        struct vma_iterator *vmi = map->vmi;
        int error = 0;
        struct vm_area_struct *vma;

        /*
         * Determine the object being mapped and call the appropriate
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
        vma = vm_area_alloc(map->mm);
        if (!vma)
                return -ENOMEM;

        /* Describe the new VMA from the mapping state. */
        vma_iter_config(vmi, map->addr, map->end);
        vma_set_range(vma, map->addr, map->end, map->pgoff);
        vm_flags_init(vma, map->flags);
        vma->vm_page_prot = vm_get_page_prot(map->flags);

        /* Preallocation failed, so the unwind skips vma_iter_free(). */
        if (vma_iter_prealloc(vmi, vma)) {
                error = -ENOMEM;
                goto free_vma;
        }

        /* File-backed, shared anonymous, or private anonymous mapping. */
        if (map->file)
                error = __mmap_new_file_vma(map, vma);
        else if (map->flags & VM_SHARED)
                error = shmem_zero_setup(vma);
        else
                vma_set_anonymous(vma);

        if (error)
                goto free_iter_vma;

#ifdef CONFIG_SPARC64
        /* TODO: Fix SPARC ADI! */
        WARN_ON_ONCE(!arch_validate_flags(map->flags));
#endif

        /* Lock the VMA since it is modified after insertion into VMA tree */
        vma_start_write(vma);
        vma_iter_store_new(vmi, vma);
        map->mm->map_count++;
        vma_link_file(vma);

        /*
         * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
         * call covers the non-merge case.
         */
        if (!vma_is_anonymous(vma))
                khugepaged_enter_vma(vma, map->flags);
        ksm_add_vma(vma);
        *vmap = vma;
        return 0;

        /* Error unwind: release the preallocation (if any), then the VMA. */
free_iter_vma:
        vma_iter_free(vmi);
free_vma:
        vm_area_free(vma);
        return error;
}

/*
 * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping
 *                     statistics, handle locking and finalise the VMA.
 *
 * @map: Mapping state.
 * @vma: Merged or newly allocated VMA for the mmap()'d region.
 */
static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma)
{
        struct mm_struct *mm = map->mm;
        unsigned long vm_flags = vma->vm_flags;

        perf_event_mmap(vma);

        /* Unmap any existing mapping in the area. */
        vms_complete_munmap_vmas(&map->vms, &map->mas_detach);

        vm_stat_account(mm, vma->vm_flags, map->pglen);
        if (vm_flags & VM_LOCKED) {
                if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
                                        is_vm_hugetlb_page(vma) ||
                                        vma == get_gate_vma(mm))
                        vm_flags_clear(vma, VM_LOCKED_MASK);
                else
                        mm->locked_vm += map->pglen;
        }

        if (vma->vm_file)
                uprobe_mmap(vma);

        /*
         * New (or expanded) vma always get soft dirty status.
         * Otherwise user-space soft-dirty page tracker won't
         * be able to distinguish situation when vma area unmapped,
         * then new mapped in-place (which must be aimed as
         * a completely new data area).
         */
        vm_flags_set(vma, VM_SOFTDIRTY);

        vma_set_page_prot(vma);
}

/*
 * __mmap_region() - Carry out the mapping described by the arguments in
 * current->mm.
 *
 * The steps are: prepare (gather overlapping VMAs, check limits, account),
 * try to merge with a neighbouring VMA, otherwise allocate a new VMA,
 * optionally retry the merge if the flags changed, then complete the mapping.
 *
 * Returns: @addr on success, otherwise the (negative) error code converted to
 * unsigned long.
 */
static unsigned long __mmap_region(struct file *file, unsigned long addr,
                unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                struct list_head *uf)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma = NULL;
        int error;
        VMA_ITERATOR(vmi, mm, addr);
        MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file);

        /* Nothing has been charged yet if this fails, so skip un-accounting. */
        error = __mmap_prepare(&map, uf);
        if (error)
                goto abort_munmap;

        /* Attempt to merge with adjacent VMAs... */
        if (map.prev || map.next) {
                VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL);

                vma = vma_merge_new_range(&vmg);
        }

        /* ...but if we can't, allocate a new VMA. */
        if (!vma) {
                error = __mmap_new_vma(&map, &vma);
                if (error)
                        goto unacct_error;
        }

        /* If flags changed, we might be able to merge, so try again. */
        if (map.retry_merge) {
                struct vm_area_struct *merged;
                VMG_MMAP_STATE(vmg, &map, vma);

                vma_iter_config(map.vmi, map.addr, map.end);
                merged = vma_merge_existing_range(&vmg);
                /* Keep the unmerged VMA if the second attempt fails too. */
                if (merged)
                        vma = merged;
        }

        __mmap_complete(&map, vma);

        return addr;

        /* Accounting was done by __mmap_prepare(). */
unacct_error:
        if (map.charged)
                vm_unacct_memory(map.charged);
abort_munmap:
        vms_abort_munmap_vmas(&map.vms, &map.mas_detach);
        return error;
}

/**
 * mmap_region() - Map a VMA into current->mm for userland, given a known,
 * aligned and overflow-checked @addr and @len, together with already
 * determined VMA flags @vm_flags and page offset @pgoff.
 *
 * This is an internal memory management function, and should not be used
 * directly.
 *
 * The caller must write-lock current->mm->mmap_lock.
 *
 * @file: For a file-backed mapping, the struct file describing the file to be
 * mapped; NULL otherwise.
 * @addr: The page-aligned address at which to perform the mapping.
 * @len: The page-aligned, non-zero, length of the mapping.
 * @vm_flags: The VMA flags which should be applied to the mapping.
 * @pgoff: The page offset into @file if one is given, otherwise the virtual
 * page offset in memory of the anonymous mapping.
 * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap
 * events.
 *
 * Returns: Either an error, or the address at which the requested mapping has
 * been performed.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
                          unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
                          struct list_head *uf)
{
        bool holds_writable = false;
        unsigned long ret;

        mmap_assert_write_locked(current->mm);

        /* Refuse the mapping if MDWE applies to it. */
        if (map_deny_write_exec(vm_flags, vm_flags))
                return -EACCES;

        /* Give the architecture a chance to reject the vm_flags. */
        if (!arch_validate_flags(vm_flags))
                return -EINVAL;

        /* Map writable and ensure this isn't a sealed memfd. */
        if (file && is_shared_maywrite(vm_flags)) {
                ret = mapping_map_writable(file->f_mapping);
                if (ret)
                        return ret;
                holds_writable = true;
        }

        ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);

        /* The write mapping is dropped again whether or not that failed. */
        if (holds_writable)
                mapping_unmap_writable(file->f_mapping);

        validate_mm(current->mm);
        return ret;
}

/*
 * do_brk_flags() - Increase the brk vma if the flags match.
 * @vmi: The vma iterator
 * @vma: The vma to extend, or NULL
 * @addr: The start address
 * @len: The length of the increase
 * @flags: The VMA Flags
 *
 * Extend the brk VMA from addr to addr + len.  If @vma is NULL, does not end
 * at @addr, or cannot be expanded by merging, a new anonymous VMA is created
 * instead.  Eventually we may be able to do some brk-specific accounting here.
 *
 * Returns: 0 on success, -ENOMEM on failure.
 */
int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
                 unsigned long addr, unsigned long len, unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        const unsigned long nr_pages = len >> PAGE_SHIFT;

        /*
         * Check against address space limits by the changed size
         * Note: This happens *after* clearing old mappings in some code paths.
         */
        flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
        if (!may_expand_vm(mm, flags, nr_pages))
                return -ENOMEM;

        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;

        /* From here on every failure has to un-account nr_pages again. */
        if (security_vm_enough_memory_mm(mm, nr_pages))
                return -ENOMEM;

        /*
         * Expand the existing vma if possible; Note that singular lists do not
         * occur after forking, so the expand will only happen on new VMAs.
         */
        if (vma && vma->vm_end == addr) {
                VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));

                vmg.prev = vma;
                /* vmi is positioned at prev, which this mode expects. */
                vmg.just_expand = true;

                if (vma_merge_new_range(&vmg))
                        goto out;
                if (vmg_nomem(&vmg))
                        goto unacct_fail;
        }

        if (vma)
                vma_iter_next_range(vmi);

        /* Merging was not possible: build a fresh anonymous VMA. */
        vma = vm_area_alloc(mm);
        if (!vma)
                goto unacct_fail;

        vma_set_anonymous(vma);
        vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT);
        vm_flags_init(vma, flags);
        vma->vm_page_prot = vm_get_page_prot(flags);
        vma_start_write(vma);
        if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL))
                goto mas_store_fail;

        mm->map_count++;
        validate_mm(mm);
        ksm_add_vma(vma);
out:
        /* Common accounting for both the expanded and the new VMA case. */
        perf_event_mmap(vma);
        mm->total_vm += nr_pages;
        mm->data_vm += nr_pages;
        if (flags & VM_LOCKED)
                mm->locked_vm += nr_pages;
        vm_flags_set(vma, VM_SOFTDIRTY);
        return 0;

mas_store_fail:
        vm_area_free(vma);
unacct_fail:
        vm_unacct_memory(nr_pages);
        return -ENOMEM;
}

/**
 * unmapped_area() - Find an area between the low_limit and the high_limit with
 * the correct alignment and offset, all from @info. Note: current->mm is used
 * for the search.
 *
 * @info: The unmapped area information including the range [low_limit -
 * high_limit), the alignment offset and mask.
 *
 * Return: A memory address or -ENOMEM.
 */
unsigned long unmapped_area(struct vm_unmapped_area_info *info)
{
        unsigned long length, gap;
        unsigned long low_limit, high_limit;
        struct vm_area_struct *tmp;
        VMA_ITERATOR(vmi, current->mm, 0);

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask + info->start_gap;
        /* The sum wrapped around: no such area can exist. */
        if (length < info->length)
                return -ENOMEM;

        /* Never search below mmap_min_addr. */
        low_limit = info->low_limit;
        if (low_limit < mmap_min_addr)
                low_limit = mmap_min_addr;
        high_limit = info->high_limit;
retry:
        if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
                return -ENOMEM;

        /*
         * Adjust for the gap first so it doesn't interfere with the
         * later alignment. The first step is the minimum needed to
         * fulfill the start gap, the next step is the minimum to align
         * that. It is the minimum needed to fulfill both.
         */
        gap = vma_iter_addr(&vmi) + info->start_gap;
        gap += (info->align_offset - gap) & info->align_mask;
        tmp = vma_next(&vmi);
        if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
                /* Too close to the next VMA's start gap: search above it. */
                if (vm_start_gap(tmp) < gap + length - 1) {
                        low_limit = tmp->vm_end;
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        } else {
                /* Overlaps the previous VMA's end gap: search above that. */
                tmp = vma_prev(&vmi);
                if (tmp && vm_end_gap(tmp) > gap) {
                        low_limit = vm_end_gap(tmp);
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        }

        return gap;
}

/**
 * unmapped_area_topdown() - Find an area between the low_limit and the
 * high_limit with the correct alignment and offset at the highest available
 * address, all from @info. Note: current->mm is used for the search.
 *
 * @info: The unmapped area information including the range [low_limit -
 * high_limit), the alignment offset and mask.
 *
 * Return: A memory address or -ENOMEM.
 */
unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
{
        unsigned long length, gap, gap_end;
        unsigned long low_limit, high_limit;
        struct vm_area_struct *tmp;
        VMA_ITERATOR(vmi, current->mm, 0);

        /* Adjust search length to account for worst case alignment overhead */
        length = info->length + info->align_mask + info->start_gap;
        /* The sum wrapped around: no such area can exist. */
        if (length < info->length)
                return -ENOMEM;

        /* Never search below mmap_min_addr. */
        low_limit = info->low_limit;
        if (low_limit < mmap_min_addr)
                low_limit = mmap_min_addr;
        high_limit = info->high_limit;
retry:
        if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
                return -ENOMEM;

        /*
         * Place the mapping at the top of the range found, then move it down
         * until (gap - align_offset) has no bits set in align_mask.
         */
        gap = vma_iter_end(&vmi) - info->length;
        gap -= (gap - info->align_offset) & info->align_mask;
        gap_end = vma_iter_end(&vmi);
        tmp = vma_next(&vmi);
        if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
                /* Runs into the next VMA's start gap: search below it. */
                if (vm_start_gap(tmp) < gap_end) {
                        high_limit = vm_start_gap(tmp);
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        } else {
                /* Overlaps the previous VMA's end gap: search below that VMA. */
                tmp = vma_prev(&vmi);
                if (tmp && vm_end_gap(tmp) > gap) {
                        high_limit = tmp->vm_start;
                        vma_iter_reset(&vmi);
                        goto retry;
                }
        }

        return gap;
}

/*
 * Check whether growing a stack VMA is permitted and, if so, charge the
 * growth.  Used by both the grow-up and the grow-down paths.
 *
 * @vma:  The stack VMA being grown.
 * @size: The size in bytes the VMA would have after growing.
 * @grow: The number of pages being added.
 *
 * Returns: 0 if the growth is acceptable, -ENOMEM if a limit would be
 * exceeded, -EFAULT if the grown range would be a hugepage-only range.
 */
static int acct_stack_growth(struct vm_area_struct *vma,
                             unsigned long size, unsigned long grow)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long new_start;

        /* Address space limit. */
        if (!may_expand_vm(mm, vma->vm_flags, grow))
                return -ENOMEM;

        /* Stack size limit. */
        if (size > rlimit(RLIMIT_STACK))
                return -ENOMEM;

        /* mlock limit. */
        if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT))
                return -ENOMEM;

        /* Check to ensure the stack will not grow into a hugetlb-only region */
        if (vma->vm_flags & VM_GROWSUP)
                new_start = vma->vm_start;
        else
                new_start = vma->vm_end - size;
        if (is_hugepage_only_range(mm, new_start, size))
                return -EFAULT;

        /*
         * Overcommit check.  It has to come last because it updates the
         * security statistics.
         */
        return security_vm_enough_memory_mm(mm, grow) ? -ENOMEM : 0;
}

#if defined(CONFIG_STACK_GROWSUP)
/*
 * PA-RISC uses this for its stack.
 * vma is the last one with address > vma->vm_end.  Have to extend vma.
 *
 * The mmap lock must be held for writing (asserted below).
 *
 * Returns 0 on success (including when the VMA already covers the address),
 * -EFAULT if the VMA is not VM_GROWSUP, -ENOMEM on limit or allocation
 * failures, or the error from acct_stack_growth().
 */
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *next;
        unsigned long gap_addr;
        int error = 0;
        VMA_ITERATOR(vmi, mm, vma->vm_start);

        if (!(vma->vm_flags & VM_GROWSUP))
                return -EFAULT;

        mmap_assert_write_locked(mm);

        /* Guard against exceeding limits of the address space. */
        address &= PAGE_MASK;
        if (address >= (TASK_SIZE & PAGE_MASK))
                return -ENOMEM;
        /* address is now the end of the page containing the request. */
        address += PAGE_SIZE;

        /* Enforce stack_guard_gap */
        gap_addr = address + stack_guard_gap;

        /* Guard against overflow */
        if (gap_addr < address || gap_addr > TASK_SIZE)
                gap_addr = TASK_SIZE;

        /* Refuse to grow into the guard gap of an accessible non-stack VMA. */
        next = find_vma_intersection(mm, vma->vm_end, gap_addr);
        if (next && vma_is_accessible(next)) {
                if (!(next->vm_flags & VM_GROWSUP))
                        return -ENOMEM;
                /* Check that both stack segments have the same anon_vma? */
        }

        if (next)
                vma_iter_prev_range_limit(&vmi, address);

        /* Preallocate for storing the VMA over its enlarged range. */
        vma_iter_config(&vmi, vma->vm_start, address);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma))) {
                vma_iter_free(&vmi);
                return -ENOMEM;
        }

        /* Lock the VMA before expanding to prevent concurrent page faults */
        vma_start_write(vma);
        /* We update the anon VMA tree. */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address > vma->vm_end) {
                unsigned long size, grow;

                size = address - vma->vm_start;
                grow = (address - vma->vm_end) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* Reject growth that would make the page offset wrap. */
                if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_end = address;
                                /* Overwrite old entry in mtree. */
                                vma_iter_store_overwrite(&vmi, vma);
                                anon_vma_interval_tree_post_update_vma(vma);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        /* Release any preallocation that was not consumed. */
        vma_iter_free(&vmi);
        validate_mm(mm);
        return error;
}
#endif /* CONFIG_STACK_GROWSUP */

/*
 * vma is the first one with address < vma->vm_start.  Have to extend vma.
 * mmap_lock held for writing.
 *
 * Returns 0 on success (including when the VMA already covers the address),
 * -EFAULT if the VMA is not VM_GROWSDOWN, -EPERM if the address is below
 * mmap_min_addr or FIRST_USER_ADDRESS, -ENOMEM on limit or allocation
 * failures, or the error from acct_stack_growth().
 */
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
        struct mm_struct *mm = vma->vm_mm;
        struct vm_area_struct *prev;
        int error = 0;
        VMA_ITERATOR(vmi, mm, vma->vm_start);

        if (!(vma->vm_flags & VM_GROWSDOWN))
                return -EFAULT;

        mmap_assert_write_locked(mm);

        /* Round down to the page that will become the new vm_start. */
        address &= PAGE_MASK;
        if (address < mmap_min_addr || address < FIRST_USER_ADDRESS)
                return -EPERM;

        /* Enforce stack_guard_gap */
        prev = vma_prev(&vmi);
        /* Check that both stack segments have the same anon_vma? */
        if (prev) {
                /* An accessible non-stack VMA must stay a guard gap away. */
                if (!(prev->vm_flags & VM_GROWSDOWN) &&
                    vma_is_accessible(prev) &&
                    (address - prev->vm_end < stack_guard_gap))
                        return -ENOMEM;
        }

        if (prev)
                vma_iter_next_range_limit(&vmi, vma->vm_start);

        /* Preallocate for storing the VMA over its enlarged range. */
        vma_iter_config(&vmi, address, vma->vm_end);
        if (vma_iter_prealloc(&vmi, vma))
                return -ENOMEM;

        /* We must make sure the anon_vma is allocated. */
        if (unlikely(anon_vma_prepare(vma))) {
                vma_iter_free(&vmi);
                return -ENOMEM;
        }

        /* Lock the VMA before expanding to prevent concurrent page faults */
        vma_start_write(vma);
        /* We update the anon VMA tree. */
        anon_vma_lock_write(vma->anon_vma);

        /* Somebody else might have raced and expanded it already */
        if (address < vma->vm_start) {
                unsigned long size, grow;

                size = vma->vm_end - address;
                grow = (vma->vm_start - address) >> PAGE_SHIFT;

                error = -ENOMEM;
                /* vm_pgoff is reduced by grow below; it must not underflow. */
                if (grow <= vma->vm_pgoff) {
                        error = acct_stack_growth(vma, size, grow);
                        if (!error) {
                                if (vma->vm_flags & VM_LOCKED)
                                        mm->locked_vm += grow;
                                vm_stat_account(mm, vma->vm_flags, grow);
                                anon_vma_interval_tree_pre_update_vma(vma);
                                vma->vm_start = address;
                                vma->vm_pgoff -= grow;
                                /* Overwrite old entry in mtree. */
                                vma_iter_store_overwrite(&vmi, vma);
                                anon_vma_interval_tree_post_update_vma(vma);

                                perf_event_mmap(vma);
                        }
                }
        }
        anon_vma_unlock_write(vma->anon_vma);
        /* Release any preallocation that was not consumed. */
        vma_iter_free(&vmi);
        validate_mm(mm);
        return error;
}

/*
 * __vm_munmap() - Unmap @len bytes starting at @start from current->mm.
 *
 * Takes the mmap write lock (killable), calls do_vmi_munmap() and then
 * completes any userfaultfd unmap notifications collected along the way.
 *
 * @start:  Start address of the range.
 * @len:    Length of the range.
 * @unlock: Passed through to do_vmi_munmap().
 *
 * Returns: -EINTR if taking the lock was interrupted, otherwise the result of
 * do_vmi_munmap().
 */
int __vm_munmap(unsigned long start, size_t len, bool unlock)
{
        struct mm_struct *mm = current->mm;
        bool skip_unlock;
        int ret;
        LIST_HEAD(uf);
        VMA_ITERATOR(vmi, mm, start);

        if (mmap_write_lock_killable(mm))
                return -EINTR;

        ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock);

        /*
         * The write lock is released here unless the unmap succeeded with
         * @unlock set; presumably do_vmi_munmap() has already dropped the
         * lock in that case - confirm against do_vmi_munmap().
         */
        skip_unlock = unlock && !ret;
        if (!skip_unlock)
                mmap_write_unlock(mm);

        userfaultfd_unmap_complete(mm, &uf);
        return ret;
}















































    5 




























































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_GENERIC_LOCAL64_H
#define _ASM_GENERIC_LOCAL64_H

#include <linux/percpu.h>
#include <asm/types.h>

/*
 * A signed long type for operations which are atomic for a single CPU.
 * Usually used in combination with per-cpu variables.
 *
 * This is the default implementation, which uses atomic64_t.  Which is
 * rather pointless.  The whole point behind local64_t is that some processors
 * can perform atomic adds and subtracts in a manner which is atomic wrt IRQs
 * running on this CPU.  local64_t allows exploitation of such capabilities.
 */

/* Implement in terms of atomics. */

#if BITS_PER_LONG == 64

#include <asm/local.h>

/*
 * With 64-bit longs, local64_t is a thin wrapper around local_t and every
 * operation below forwards to the local_*() operation of the same name.
 */
typedef struct {
        local_t a;
} local64_t;

/* Static initializer for a local64_t holding the value i. */
#define LOCAL64_INIT(i)        { LOCAL_INIT(i) }

/* Basic read/modify operations; l is a local64_t pointer, i the operand. */
#define local64_read(l)                local_read(&(l)->a)
#define local64_set(l,i)        local_set((&(l)->a),(i))
#define local64_inc(l)                local_inc(&(l)->a)
#define local64_dec(l)                local_dec(&(l)->a)
#define local64_add(i,l)        local_add((i),(&(l)->a))
#define local64_sub(i,l)        local_sub((i),(&(l)->a))

/* Variants yielding the result of the corresponding local_*() operation. */
#define local64_sub_and_test(i, l) local_sub_and_test((i), (&(l)->a))
#define local64_dec_and_test(l) local_dec_and_test(&(l)->a)
#define local64_inc_and_test(l) local_inc_and_test(&(l)->a)
#define local64_add_negative(i, l) local_add_negative((i), (&(l)->a))
#define local64_add_return(i, l) local_add_return((i), (&(l)->a))
#define local64_sub_return(i, l) local_sub_return((i), (&(l)->a))
#define local64_inc_return(l)        local_inc_return(&(l)->a)

/*
 * Compare-and-exchange on the wrapped local_t: forwards to local_cmpxchg()
 * and returns its result.
 */
static inline s64 local64_cmpxchg(local64_t *l, s64 old, s64 new)
{
        local_t *inner = &l->a;

        return local_cmpxchg(inner, old, new);
}

/*
 * Try-compare-and-exchange on the wrapped local_t: forwards to
 * local_try_cmpxchg() and returns its result.  @old is handed over as a
 * long pointer, which is what local_try_cmpxchg() is called with here.
 */
static inline bool local64_try_cmpxchg(local64_t *l, s64 *old, s64 new)
{
        long *expected = (long *)old;

        return local_try_cmpxchg(&l->a, expected, new);
}

/* Exchange and conditional-update operations, forwarded to local_*(). */
#define local64_xchg(l, n)        local_xchg((&(l)->a), (n))
#define local64_add_unless(l, _a, u) local_add_unless((&(l)->a), (_a), (u))
#define local64_inc_not_zero(l)        local_inc_not_zero(&(l)->a)

/* Non-atomic variants, ie. preemption disabled and won't be touched
 * in interrupt, etc.  Some archs can optimize this case well.
 * Here they are a plain local64_read() followed by a local64_set(). */
#define __local64_inc(l)        local64_set((l), local64_read(l) + 1)
#define __local64_dec(l)        local64_set((l), local64_read(l) - 1)
#define __local64_add(i,l)        local64_set((l), local64_read(l) + (i))
#define __local64_sub(i,l)        local64_set((l), local64_read(l) - (i))

#else /* BITS_PER_LONG != 64 */

#include <linux/atomic.h>

/*
 * Without 64-bit longs, local64_t wraps an atomic64_t and every operation
 * below forwards to the atomic64_*() operation of the same name.  The value
 * is kept inside a struct so that it cannot be mixed up with atomic64_t's.
 */
typedef struct {
        atomic64_t a;
} local64_t;

/*
 * Static initializer for a local64_t holding the value i.  The member is an
 * atomic64_t, so it is initialised with ATOMIC64_INIT() rather than the
 * atomic_long_t initializer.
 */
#define LOCAL64_INIT(i)        { ATOMIC64_INIT(i) }

/* Basic read/modify operations; l is a local64_t pointer, i the operand. */
#define local64_read(l)                atomic64_read(&(l)->a)
#define local64_set(l,i)        atomic64_set((&(l)->a),(i))
#define local64_inc(l)                atomic64_inc(&(l)->a)
#define local64_dec(l)                atomic64_dec(&(l)->a)
#define local64_add(i,l)        atomic64_add((i),(&(l)->a))
#define local64_sub(i,l)        atomic64_sub((i),(&(l)->a))

/* Variants yielding the result of the corresponding atomic64_*() operation. */
#define local64_sub_and_test(i, l) atomic64_sub_and_test((i), (&(l)->a))
#define local64_dec_and_test(l) atomic64_dec_and_test(&(l)->a)
#define local64_inc_and_test(l) atomic64_inc_and_test(&(l)->a)
#define local64_add_negative(i, l) atomic64_add_negative((i), (&(l)->a))
#define local64_add_return(i, l) atomic64_add_return((i), (&(l)->a))
#define local64_sub_return(i, l) atomic64_sub_return((i), (&(l)->a))
#define local64_inc_return(l)        atomic64_inc_return(&(l)->a)

/* Exchange and conditional-update operations, forwarded to atomic64_*(). */
#define local64_cmpxchg(l, o, n) atomic64_cmpxchg((&(l)->a), (o), (n))
#define local64_try_cmpxchg(l, po, n) atomic64_try_cmpxchg((&(l)->a), (po), (n))
#define local64_xchg(l, n)        atomic64_xchg((&(l)->a), (n))
#define local64_add_unless(l, _a, u) atomic64_add_unless((&(l)->a), (_a), (u))
#define local64_inc_not_zero(l)        atomic64_inc_not_zero(&(l)->a)

/* Non-atomic variants, ie. preemption disabled and won't be touched
 * in interrupt, etc.  Some archs can optimize this case well.
 * Here they are a plain local64_read() followed by a local64_set(). */
#define __local64_inc(l)        local64_set((l), local64_read(l) + 1)
#define __local64_dec(l)        local64_set((l), local64_read(l) - 1)
#define __local64_add(i,l)        local64_set((l), local64_read(l) + (i))
#define __local64_sub(i,l)        local64_set((l), local64_read(l) - (i))

#endif /* BITS_PER_LONG != 64 */

#endif /* _ASM_GENERIC_LOCAL64_H */











  319 












1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mm.h>

/* Report whether @vma has VM_HUGETLB set in its vm_flags. */
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        return (vma->vm_flags & VM_HUGETLB) != 0;
}

#else

/* Stub for builds without CONFIG_HUGETLB_PAGE: always reports false. */
static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
        return false;
}

#endif

#endif
























































































































































  229 






  228 
  229 






























































   73 






   26 



























































  225 




  254 


















































































































































































  128 
































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGE_MM_H
#define _LINUX_HUGE_MM_H

#include <linux/mm_types.h>

#include <linux/fs.h> /* only for vma_is_dax() */
#include <linux/kobject.h>

/*
 * Prototypes only: the definitions of these huge PMD/PUD helpers are not in
 * this header.
 */
vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
                  struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma);
void huge_pmd_set_accessed(struct vm_fault *vmf);
int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
                  struct vm_area_struct *vma);

/*
 * huge_pud_set_accessed() is an out-of-line function only with
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD; otherwise it is an empty inline.
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud);
#else
static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
}
#endif

/* Prototypes only: huge PMD/PUD operations defined outside this header. */
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf);
bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                           pmd_t *pmd, unsigned long addr, unsigned long next);
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr);
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
                 unsigned long addr);
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
                   unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags);

/* Insertion helpers: pfn_t and folio variants, each at PMD and PUD level. */
vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio,
                                bool write);
vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
                                bool write);

/*
 * Bit numbers (not masks) within transparent_hugepage_flags: users in this
 * header test them as "transparent_hugepage_flags & (1 << FLAG)".
 */
enum transparent_hugepage_flag {
        TRANSPARENT_HUGEPAGE_UNSUPPORTED,
        TRANSPARENT_HUGEPAGE_FLAG,
        TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
        TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
};

/* Forward declarations; only pointers to these types are used below. */
struct kobject;
struct kobj_attribute;

/*
 * Store/show helpers taking the usual kobject attribute arguments plus the
 * transparent_hugepage_flag they operate on.  Defined outside this header.
 */
ssize_t single_hugepage_flag_store(struct kobject *kobj,
                                   struct kobj_attribute *attr,
                                   const char *buf, size_t count,
                                   enum transparent_hugepage_flag flag);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
                                  struct kobj_attribute *attr, char *buf,
                                  enum transparent_hugepage_flag flag);
/* Attribute objects defined outside this header. */
extern struct kobj_attribute shmem_enabled_attr;
extern struct kobj_attribute thpsize_shmem_enabled_attr;

/*
 * Mask of all large folio orders supported for anonymous THP; all orders up to
 * and including PMD_ORDER, except order-0 (which is not "huge") and order-1
 * (which is a limitation of the THP implementation).
 */
#define THP_ORDERS_ALL_ANON        ((BIT(PMD_ORDER + 1) - 1) & ~(BIT(0) | BIT(1)))

/*
 * Mask of all large folio orders supported for file THP. Folios in a DAX
 * file are never split and the MAX_PAGECACHE_ORDER limit does not apply to
 * them.  The same goes for PFNMAPs, where there's neither page* nor pagecache.
 */
/* Exactly two orders: PMD_ORDER and PUD_ORDER. */
#define THP_ORDERS_ALL_SPECIAL                \
        (BIT(PMD_ORDER) | BIT(PUD_ORDER))
/* Every order from 1 up to and including MAX_PAGECACHE_ORDER (bit 0 cleared). */
#define THP_ORDERS_ALL_FILE_DEFAULT        \
        ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))

/*
 * Mask of all large folio orders supported for THP.
 */
#define THP_ORDERS_ALL        \
        (THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_SPECIAL | THP_ORDERS_ALL_FILE_DEFAULT)

/* Bits for the tva_flags argument of thp_vma_allowable_orders(). */
#define TVA_SMAPS                (1 << 0)        /* Will be used for procfs */
#define TVA_IN_PF                (1 << 1)        /* Page fault handler */
#define TVA_ENFORCE_SYSFS        (1 << 2)        /* Obey sysfs configuration */

/*
 * Single-order form of thp_vma_allowable_orders(): passes BIT(order) as the
 * order mask and normalises the result to 0 or 1.
 */
#define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \
        (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order)))

/* split_folio_to_list() with NULL passed as the list argument. */
#define split_folio(f) split_folio_to_list(f, NULL)

/*
 * Huge page geometry.  Without CONFIG_PGTABLE_HAS_HUGE_LEAVES the two shift
 * macros expand to BUILD_BUG(), and every macro below is built on them.
 */
#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
#define HPAGE_PMD_SHIFT PMD_SHIFT
#define HPAGE_PUD_SHIFT PUD_SHIFT
#else
#define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
#define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; })
#endif

/* PMD level: order, pages per huge page, address mask, size in bytes. */
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
#define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
#define HPAGE_PMD_MASK        (~(HPAGE_PMD_SIZE - 1))
#define HPAGE_PMD_SIZE        ((1UL) << HPAGE_PMD_SHIFT)

/* PUD level: same four quantities. */
#define HPAGE_PUD_ORDER (HPAGE_PUD_SHIFT-PAGE_SHIFT)
#define HPAGE_PUD_NR (1<<HPAGE_PUD_ORDER)
#define HPAGE_PUD_MASK        (~(HPAGE_PUD_SIZE - 1))
#define HPAGE_PUD_SIZE        ((1UL) << HPAGE_PUD_SHIFT)

/*
 * Per-order statistics items; each value indexes the second dimension of
 * struct mthp_stat's stats[][] array.  __MTHP_STAT_COUNT sizes that
 * dimension and therefore has to remain the last enumerator.
 */
enum mthp_stat_item {
        MTHP_STAT_ANON_FAULT_ALLOC,
        MTHP_STAT_ANON_FAULT_FALLBACK,
        MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
        MTHP_STAT_ZSWPOUT,
        MTHP_STAT_SWPIN,
        MTHP_STAT_SWPIN_FALLBACK,
        MTHP_STAT_SWPIN_FALLBACK_CHARGE,
        MTHP_STAT_SWPOUT,
        MTHP_STAT_SWPOUT_FALLBACK,
        MTHP_STAT_SHMEM_ALLOC,
        MTHP_STAT_SHMEM_FALLBACK,
        MTHP_STAT_SHMEM_FALLBACK_CHARGE,
        MTHP_STAT_SPLIT,
        MTHP_STAT_SPLIT_FAILED,
        MTHP_STAT_SPLIT_DEFERRED,
        MTHP_STAT_NR_ANON,
        MTHP_STAT_NR_ANON_PARTIALLY_MAPPED,
        __MTHP_STAT_COUNT
};

#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
/*
 * Counter table indexed as stats[order][item], with
 * ilog2(MAX_PTRS_PER_PTE) + 1 rows for the order dimension.
 */
struct mthp_stat {
        unsigned long stats[ilog2(MAX_PTRS_PER_PTE) + 1][__MTHP_STAT_COUNT];
};

/* One table per CPU; updated with this_cpu_add() in mod_mthp_stat(). */
DECLARE_PER_CPU(struct mthp_stat, mthp_stats);

/*
 * Add @delta to this CPU's counter for (@order, @item).  Orders outside the
 * range 1..PMD_ORDER are ignored without any accounting.
 */
static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
{
        if (order > 0 && order <= PMD_ORDER)
                this_cpu_add(mthp_stats.stats[order][item], delta);
}

/* Increment the (@order, @item) counter by one via mod_mthp_stat(). */
static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
        mod_mthp_stat(order, item, 1);
}

#else
/* Without both CONFIG_TRANSPARENT_HUGEPAGE and CONFIG_SYSFS: no accounting. */
static inline void mod_mthp_stat(int order, enum mthp_stat_item item, int delta)
{
}

static inline void count_mthp_stat(int order, enum mthp_stat_item item)
{
}
#endif

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

/*
 * Global THP state, defined outside this header.  transparent_hugepage_flags
 * is tested with (1 << enum transparent_hugepage_flag) masks; the
 * huge_anon_orders_* words are order bitmasks combined in
 * thp_vma_allowable_orders().
 */
extern unsigned long transparent_hugepage_flags;
extern unsigned long huge_anon_orders_always;
extern unsigned long huge_anon_orders_madvise;
extern unsigned long huge_anon_orders_inherit;

/*
 * True when either the TRANSPARENT_HUGEPAGE_FLAG bit or the
 * TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG bit is set in transparent_hugepage_flags.
 */
static inline bool hugepage_global_enabled(void)
{
        const unsigned long enabled_bits =
                (1<<TRANSPARENT_HUGEPAGE_FLAG) |
                (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);

        return transparent_hugepage_flags & enabled_bits;
}

/* True when the TRANSPARENT_HUGEPAGE_FLAG bit is set in the global flags. */
static inline bool hugepage_global_always(void)
{
        const unsigned long always_bit = 1<<TRANSPARENT_HUGEPAGE_FLAG;

        return transparent_hugepage_flags & always_bit;
}

/*
 * Return fls_long(@orders) - 1, i.e. the highest order whose bit is set in
 * the @orders bitfield.
 * NOTE(review): for @orders == 0 this presumably yields -1 (assuming
 * fls_long(0) == 0) — confirm before relying on it.
 */
static inline int highest_order(unsigned long orders)
{
        return fls_long(orders) - 1;
}

/*
 * Remove order @prev from the bitfield *@orders and return the highest order
 * that is still set afterwards (see highest_order()).
 */
static inline int next_order(unsigned long *orders, int prev)
{
        const unsigned long remaining = *orders & ~BIT(prev);

        *orders = remaining;
        return highest_order(remaining);
}

/*
 * Decide whether a hugepage of @order containing @addr can be mapped in @vma:
 *   - Non-anonymous vmas only: the vma start expressed in pages, minus
 *     vm_pgoff, must be a multiple of the hugepage size in pages.  The
 *     hugepage is order-aligned within the file, so order-aligned addresses
 *     in the vma have to correspond to order-aligned file offsets or the
 *     hugepage is not mappable.
 *   - All vmas: the order-aligned range around @addr must lie completely
 *     within [vm_start, vm_end).
 */
static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
                unsigned long addr, int order)
{
        const unsigned long nr_bytes = PAGE_SIZE << order;
        const unsigned long nr_pages = nr_bytes >> PAGE_SHIFT;
        unsigned long start;

        /* Anonymous vmas have no pgoff alignment to respect. */
        if (!vma_is_anonymous(vma) &&
            !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, nr_pages))
                return false;

        start = ALIGN_DOWN(addr, nr_bytes);

        return start >= vma->vm_start && start + nr_bytes <= vma->vm_end;
}

/*
 * Reduce the @orders bitfield to the orders that thp_vma_suitable_order()
 * accepts for @addr in @vma, and return the reduced bitfield.
 */
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
                unsigned long addr, unsigned long orders)
{
        int order;

        /*
         * Walk from the highest order downwards, dropping each order that
         * fails the check.  The walk stops at the first order that passes:
         * every lower order must pass as well, so the rest of the set is
         * kept as is.
         */
        for (order = highest_order(orders); orders;
             order = next_order(&orders, order)) {
                if (thp_vma_suitable_order(vma, addr, order))
                        break;
        }

        return orders;
}

/*
 * Out-of-line part of thp_vma_allowable_orders(); takes the same arguments
 * and is defined outside this header.
 */
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
                                         unsigned long vm_flags,
                                         unsigned long tva_flags,
                                         unsigned long orders);

/**
 * thp_vma_allowable_orders - determine hugepage orders that are allowed for vma
 * @vma:  the vm area to check
 * @vm_flags: use these vm_flags instead of vma->vm_flags
 * @tva_flags: Which TVA flags to honour
 * @orders: bitfield of all orders to consider
 *
 * Calculates the intersection of the requested hugepage orders and the allowed
 * hugepage orders for the provided vma. Permitted orders are encoded as a set
 * bit at the corresponding bit position (bit-2 corresponds to order-2, bit-3
 * corresponds to order-3, etc). Order-0 is never considered a hugepage order.
 *
 * Return: bitfield of orders allowed for hugepage in the vma. 0 if no hugepage
 * orders are allowed.
 */
static inline
unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                                       unsigned long vm_flags,
                                       unsigned long tva_flags,
                                       unsigned long orders)
{
        /* Optimization to check if required orders are enabled early. */
        if ((tva_flags & TVA_ENFORCE_SYSFS) && vma_is_anonymous(vma)) {
                unsigned long mask = READ_ONCE(huge_anon_orders_always);

                if (vm_flags & VM_HUGEPAGE)
                        mask |= READ_ONCE(huge_anon_orders_madvise);
                if (hugepage_global_always() ||
                    ((vm_flags & VM_HUGEPAGE) && hugepage_global_enabled()))
                        mask |= READ_ONCE(huge_anon_orders_inherit);

                orders &= mask;
                if (!orders)
                        return 0;
        }

        return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
}

/* A kobject paired with a list node and the order it stands for. */
struct thpsize {
        struct kobject kobj;
        struct list_head node;
        int order;
};

/* Recover the enclosing struct thpsize from a pointer to its kobj member. */
#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)

/* Non-zero when the USE_ZERO_PAGE bit is set in transparent_hugepage_flags. */
#define transparent_hugepage_use_zero_page()                                \
        (transparent_hugepage_flags &                                        \
         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))

static inline bool vma_thp_disabled(struct vm_area_struct *vma,
                unsigned long vm_flags)
{
        /*
         * Explicitly disabled through madvise or prctl, or some
         * architectures may disable THP for some mappings, for
         * example, s390 kvm.
         */
        return (vm_flags & VM_NOHUGEPAGE) ||
               test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags);
}

/* True when the hardware/firmware marked hugepage support as disabled. */
static inline bool thp_disabled_by_hw(void)
{
        const unsigned long unsupported = 1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED;

        return transparent_hugepage_flags & unsupported;
}

/*
 * Unmapped-area lookup; the _vmflags variant takes an additional vm_flags
 * argument.  Defined outside this header.
 */
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
unsigned long thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags,
                vm_flags_t vm_flags);

/* Folio/page splitting interface, defined outside this header. */
bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order);
int min_order_for_split(struct folio *folio);
int split_folio_to_list(struct folio *folio, struct list_head *list);
bool uniform_split_supported(struct folio *folio, unsigned int new_order,
                bool warns);
bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
                bool warns);
int folio_split(struct folio *folio, unsigned int new_order, struct page *page,
                struct list_head *list);
/*
 * try_folio_split - try to split a @folio at @page using non uniform split.
 * @folio: folio to be split
 * @page: page at which the non uniform split is done
 * @list: store the after-split folios
 *
 * The target order is the value returned by min_order_for_split() for
 * @folio; a negative value from that call is returned unchanged.  If non
 * uniform split is not supported, fall back to a uniform split to the same
 * order.
 *
 * NOTE(review): support is queried for order 0 while the split itself uses
 * the minimum order — confirm that this is intended.
 *
 * Return: 0: split is successful, otherwise split failed.
 */
static inline int try_folio_split(struct folio *folio, struct page *page,
                struct list_head *list)
{
        const int min_order = min_order_for_split(folio);

        if (min_order < 0)
                return min_order;

        if (non_uniform_split_supported(folio, 0, false))
                return folio_split(folio, min_order, page, list);

        return split_huge_page_to_list_to_order(&folio->page, list, min_order);
}
/*
 * Split the huge page containing @page down to the order reported by
 * min_order_for_split() for its folio.  A negative value from that call is
 * returned as is; otherwise the result of the split is returned.
 */
static inline int split_huge_page(struct page *page)
{
        const int min_order = min_order_for_split(page_folio(page));

        /*
         * split_huge_page() locks the page before splitting and
         * expects the same page that has been split to be locked when
         * returned. split_folio(page_folio(page)) cannot be used here
         * because it converts the page to folio and passes the head
         * page to be split.
         */
        return min_order < 0 ? min_order :
                split_huge_page_to_list_to_order(page, NULL, min_order);
}
/* Prototypes only; defined outside this header. */
void deferred_split_folio(struct folio *folio, bool partially_mapped);

void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio);

/*
 * split_huge_pmd - call __split_huge_pmd() (freeze == false, folio == NULL)
 * when *@__pmd is a swap, transparent-huge or devmap entry; otherwise do
 * nothing.
 *
 * @__pmd is expanded exactly once (into ____pmd), so passing an expression
 * with side effects is safe.
 */
#define split_huge_pmd(__vma, __pmd, __address)                                \
        do {                                                                \
                pmd_t *____pmd = (__pmd);                                \
                if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd)        \
                                        || pmd_devmap(*____pmd))        \
                        __split_huge_pmd(__vma, ____pmd, __address,        \
                                                false, NULL);                \
        }  while (0)


/* Prototypes only; defined outside this header. */
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
                bool freeze, struct folio *folio);

void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                unsigned long address);

/*
 * change_huge_pud() is an out-of-line function only with
 * CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD; otherwise it is an inline that
 * does nothing and returns 0.
 */
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
                    pud_t *pudp, unsigned long addr, pgprot_t newprot,
                    unsigned long cp_flags);
#else
static inline int
change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
                pud_t *pudp, unsigned long addr, pgprot_t newprot,
                unsigned long cp_flags) { return 0; }
#endif

/*
 * split_huge_pud - call __split_huge_pud() when *@__pud is a transparent-huge
 * or devmap entry; otherwise do nothing.
 *
 * @__pud is expanded exactly once (into ____pud), so passing an expression
 * with side effects is safe.
 */
#define split_huge_pud(__vma, __pud, __address)                                \
        do {                                                                \
                pud_t *____pud = (__pud);                                \
                if (pud_trans_huge(*____pud)                                \
                                        || pud_devmap(*____pud))        \
                        __split_huge_pud(__vma, ____pud, __address);        \
        }  while (0)

/* Prototypes only; defined outside this header. */
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
                     int advice);
int madvise_collapse(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, struct vm_area_struct *next);
/* Out-of-line parts of pmd_trans_huge_lock() / pud_trans_huge_lock(). */
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma);

/* Return 1 when @pmd is neither none nor present, 0 otherwise. */
static inline int is_swap_pmd(pmd_t pmd)
{
        if (pmd_none(pmd))
                return 0;

        return !pmd_present(pmd);
}

/*
 * mmap_lock must be held on entry.
 *
 * Returns NULL unless *@pmd is a swap, transparent-huge or devmap entry; in
 * those cases the result of __pmd_trans_huge_lock() is returned.
 */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))
                return NULL;

        return __pmd_trans_huge_lock(pmd, vma);
}
/*
 * Returns NULL unless *@pud is a transparent-huge or devmap entry; in those
 * cases the result of __pud_trans_huge_lock() is returned.
 */
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        if (!pud_trans_huge(*pud) && !pud_devmap(*pud))
                return NULL;

        return __pud_trans_huge_lock(pud, vma);
}

/**
 * folio_test_pmd_mappable - Can we map this folio with a PMD?
 * @folio: The folio to test
 *
 * Return: true if the order of @folio is at least HPAGE_PMD_ORDER.
 */
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
        return folio_order(folio) >= HPAGE_PMD_ORDER;
}

/* Prototypes only; defined outside this header. */
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
                pmd_t *pmd, int flags, struct dev_pagemap **pgmap);

vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);

/* Compared against by is_huge_zero_folio() and is_huge_zero_pmd(). */
extern struct folio *huge_zero_folio;
extern unsigned long huge_zero_pfn;

/* True when @folio is the folio currently stored in huge_zero_folio. */
static inline bool is_huge_zero_folio(const struct folio *folio)
{
        return folio == READ_ONCE(huge_zero_folio);
}

/* True when @pmd is present and its pfn equals huge_zero_pfn. */
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        if (!pmd_present(pmd))
                return false;

        return pmd_pfn(pmd) == READ_ONCE(huge_zero_pfn);
}

/* Prototypes only; defined outside this header. */
struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
void mm_put_huge_zero_folio(struct mm_struct *mm);

/* Build a pmd from @page and @prot with mk_pmd(), then apply pmd_mkhuge(). */
#define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot))

/* Compile-time answer: true iff CONFIG_ARCH_ENABLE_THP_MIGRATION is enabled. */
static inline bool thp_migration_supported(void)
{
        return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION);
}

/* Prototypes only; defined outside this header. */
void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmd, bool freeze, struct folio *folio);
bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
                           pmd_t *pmdp, struct folio *folio);

#else /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Stubs used when CONFIG_TRANSPARENT_HUGEPAGE is not set.  They keep the
 * same names and signatures as the real versions and do nothing.
 */
static inline bool folio_test_pmd_mappable(struct folio *folio)
{
        return false;
}

static inline bool thp_vma_suitable_order(struct vm_area_struct *vma,
                unsigned long addr, int order)
{
        return false;
}

/* No order is ever suitable or allowable: both return an empty bitfield. */
static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma,
                unsigned long addr, unsigned long orders)
{
        return 0;
}

static inline unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
                                        unsigned long vm_flags,
                                        unsigned long tva_flags,
                                        unsigned long orders)
{
        return 0;
}

/* A constant here, so flag tests written against it evaluate to 0. */
#define transparent_hugepage_flags 0UL

/* A macro rather than a function in this configuration: expands to NULL. */
#define thp_get_unmapped_area        NULL

static inline unsigned long
thp_get_unmapped_area_vmflags(struct file *filp, unsigned long addr,
                              unsigned long len, unsigned long pgoff,
                              unsigned long flags, vm_flags_t vm_flags)
{
        return 0;
}

static inline bool
can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
        return false;
}
/* The split stubs below all return 0 without doing any work. */
static inline int
split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
                unsigned int new_order)
{
        return 0;
}
static inline int split_huge_page(struct page *page)
{
        return 0;
}

static inline int split_folio_to_list(struct folio *folio, struct list_head *list)
{
        return 0;
}

static inline int try_folio_split(struct folio *folio, struct page *page,
                struct list_head *list)
{
        return 0;
}

static inline void deferred_split_folio(struct folio *folio, bool partially_mapped) {}
/* Expands to an empty statement. */
#define split_huge_pmd(__vma, __pmd, __address)        \
        do { } while (0)

static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                unsigned long address, bool freeze, struct folio *folio) {}
static inline void split_huge_pmd_address(struct vm_area_struct *vma,
                unsigned long address, bool freeze, struct folio *folio) {}
static inline void split_huge_pmd_locked(struct vm_area_struct *vma,
                                         unsigned long address, pmd_t *pmd,
                                         bool freeze, struct folio *folio) {}

static inline bool unmap_huge_pmd_locked(struct vm_area_struct *vma,
                                         unsigned long addr, pmd_t *pmdp,
                                         struct folio *folio)
{
        return false;
}

/*
 * CONFIG_TRANSPARENT_HUGEPAGE is not set: expands to an empty statement.
 * The second parameter is named __pud to match the PUD-level macro it
 * stands in for.
 */
#define split_huge_pud(__vma, __pud, __address)        \
        do { } while (0)

/*
 * Further stubs for builds without CONFIG_TRANSPARENT_HUGEPAGE.  The two
 * madvise entry points fail with -EINVAL; the rest return 0, NULL or false,
 * or do nothing.
 */
static inline int hugepage_madvise(struct vm_area_struct *vma,
                                   unsigned long *vm_flags, int advice)
{
        return -EINVAL;
}

static inline int madvise_collapse(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev,
                                   unsigned long start, unsigned long end)
{
        return -EINVAL;
}

static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long start,
                                         unsigned long end,
                                         struct vm_area_struct *next)
{
}
static inline int is_swap_pmd(pmd_t pmd)
{
        return 0;
}
/* No lock is ever taken: both lock helpers return NULL. */
static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd,
                struct vm_area_struct *vma)
{
        return NULL;
}
static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
                struct vm_area_struct *vma)
{
        return NULL;
}

static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
        return 0;
}

static inline bool is_huge_zero_folio(const struct folio *folio)
{
        return false;
}

static inline bool is_huge_zero_pmd(pmd_t pmd)
{
        return false;
}

static inline void mm_put_huge_zero_folio(struct mm_struct *mm)
{
        return;
}

static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
        unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
        return NULL;
}

static inline bool thp_migration_supported(void)
{
        return false;
}

/* Both order helpers return 0 regardless of their arguments. */
static inline int highest_order(unsigned long orders)
{
        return 0;
}

static inline int next_order(unsigned long *orders, int prev)
{
        return 0;
}

static inline void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
                                    unsigned long address)
{
}

static inline int change_huge_pud(struct mmu_gather *tlb,
                                  struct vm_area_struct *vma, pud_t *pudp,
                                  unsigned long addr, pgprot_t newprot,
                                  unsigned long cp_flags)
{
        return 0;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Folio-based wrapper: forwards to split_huge_page_to_list_to_order() with
 * &folio->page as the page argument.
 */
static inline int split_folio_to_list_to_order(struct folio *folio,
                struct list_head *list, int new_order)
{
        return split_huge_page_to_list_to_order(&folio->page, list, new_order);
}

/* Same as split_folio_to_list_to_order() with NULL as the list argument. */
static inline int split_folio_to_order(struct folio *folio, int new_order)
{
        return split_folio_to_list_to_order(folio, NULL, new_order);
}

#endif /* _LINUX_HUGE_MM_H */













  660 



  644 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BSEARCH_H
#define _LINUX_BSEARCH_H

#include <linux/types.h>

/*
 * Binary search over the @num elements of @size bytes each that start at
 * @base.  @cmp is called as cmp(key, element): a zero result ends the search
 * with that element, a positive result continues among the elements after
 * it and a negative result among the elements before it.  The array is
 * expected to be sorted consistently with @cmp.
 *
 * Returns a pointer to the matching element, or NULL if none matched.
 */
static __always_inline
void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
{
        for (; num > 0; num >>= 1) {
                const char *mid = (const char *)base + (num >> 1) * size;
                int order = cmp(key, mid);

                if (!order)
                        return (void *)mid;

                if (order > 0) {
                        /* Continue after the probed element, which is dropped too. */
                        base = mid + size;
                        num--;
                }
        }

        return NULL;
}

/* Out-of-line variant with the same signature as __inline_bsearch(). */
extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);

#endif /* _LINUX_BSEARCH_H */











































































  193 
   66 
  139 
  140 
  140 













   65 




   65 










   65 
   64 
   65 
   65 






  220 
  219 
  220 














  220 
  220 


   13 
   13 









   13 


  220 
  220 



  220 




  217 


  220 


  220 



  220 






  305 


  304 


  306 




  305 

  303 






  305 






















































































































  178 


































































   16 




  243 




   30 
   25 
   12 




  194 













































  293 



  293 








































  304 
  306 





   25 
  284 














  254 



   24 

  233 
















































































































  179 



   70 
  177 



  179 









  179 



  179 



  179 


  178 




























































  179 


  179 


















































































































































































































  259 




  225 

  260 














  260 











  140 


  134 
   41 


  230 
  140 
  231 





  140 























































   46 
    5 
   46 
















   46 


   13 


   46 

   46 

































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/admin-guide/sysctl/vm.rst.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/mm_inline.h>
#include <linux/percpu_counter.h>
#include <linux/memremap.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>

#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/pagemap.h>

/* How many pages do we try to swap or page in/out together? As a power of 2 */
int page_cluster;
/*
 * NOTE(review): presumably the largest value page_cluster may be set to;
 * the code enforcing it is not in this part of the file - confirm.
 */
static const int page_cluster_max = 31;

/*
 * Per-CPU folio batches used to queue LRU operations so that they can be
 * applied in bulk by folio_batch_move_lru().
 *
 * The order of the members is significant: the folio_batch_add_and_move()
 * macro chooses between @lock and @lock_irq by comparing offsetof() of the
 * batch with offsetof(lock_irq), so every batch declared after @lock_irq is
 * handled with interrupts disabled.
 */
struct cpu_fbatches {
        /*
         * The following folio batches are grouped together because they are protected
         * by disabling preemption (and interrupts remain enabled).
         */
        local_lock_t lock;
        struct folio_batch lru_add;             /* filled by folio_add_lru() */
        struct folio_batch lru_deactivate_file; /* filled by deactivate_file_folio() */
        struct folio_batch lru_deactivate;      /* filled by folio_deactivate() */
        struct folio_batch lru_lazyfree;        /* filled by folio_mark_lazyfree() */
#ifdef CONFIG_SMP
        struct folio_batch lru_activate;        /* filled by folio_activate() */
#endif
        /* Protecting the following batches which require disabling interrupts */
        local_lock_t lock_irq;
        struct folio_batch lru_move_tail;       /* filled by folio_rotate_reclaimable() */
};

/*
 * The per-CPU instance of the batches. Only the two local locks need an
 * explicit initialiser; the batches themselves start out zero-initialised.
 */
static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
        .lock = INIT_LOCAL_LOCK(lock),
        .lock_irq = INIT_LOCAL_LOCK(lock_irq),
};

/*
 * Take @folio off its LRU list, if it is on one.
 *
 * @lruvecp and @flagsp describe the lruvec lock currently held by the caller
 * (if any) and are handed to folio_lruvec_relock_irqsave().  No unlock is
 * done here: the caller must release *@lruvecp when it is non-NULL.
 */
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
                unsigned long *flagsp)
{
        /* Not on an LRU list: nothing to remove, and no lock is taken */
        if (!folio_test_lru(folio))
                return;

        folio_lruvec_relock_irqsave(folio, lruvecp, flagsp);
        lruvec_del_folio(*lruvecp, folio);
        __folio_clear_lru_flags(folio);
}

/*
 * This path almost never happens for VM activity - pages are normally freed
 * in batches.  But it gets used by networking - and for compound pages.
 *
 * Single-folio wrapper around __page_cache_release(): starts without any
 * lruvec lock held and drops the lock again if the helper had to take one.
 */
static void page_cache_release(struct folio *folio)
{
        unsigned long irq_flags;
        struct lruvec *locked = NULL;

        __page_cache_release(folio, &locked, &irq_flags);
        if (!locked)
                return;

        unlock_page_lruvec_irqrestore(locked, irq_flags);
}

/*
 * Free @folio.  The reference count is not examined here.
 * NOTE(review): presumably only called once the last reference has been
 * dropped - confirm against the callers.
 *
 * Zone-device and hugetlb folios are handed to their own free routines.
 * Everything else is removed from the LRU, unqueued from the deferred split
 * queue, uncharged from its memcg and returned to the page allocator.
 */
void __folio_put(struct folio *folio)
{
        if (unlikely(folio_is_zone_device(folio))) {
                free_zone_device_folio(folio);
        } else if (folio_test_hugetlb(folio)) {
                free_huge_folio(folio);
        } else {
                page_cache_release(folio);
                folio_unqueue_deferred_split(folio);
                mem_cgroup_uncharge(folio);
                free_frozen_pages(&folio->page, folio_order(folio));
        }
}
EXPORT_SYMBOL(__folio_put);

/*
 * Per-folio LRU operation applied by folio_batch_move_lru(), which calls it
 * after (re)locking the folio's lruvec.
 */
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);

/*
 * Insert @folio into @lruvec; used as the move_fn of the per-CPU lru_add
 * batch.  The folio must not already have its LRU flag set.
 *
 * The unevictable flag is cleared up front and then re-derived from
 * folio_evictable(): a folio that is not evictable is marked unevictable,
 * has its active flag cleared and its mlock_count reset to zero.  The
 * UNEVICTABLE_PGRESCUED / UNEVICTABLE_PGCULLED events are only counted when
 * the folio's evictability differs from what the flag said before.
 */
static void lru_add(struct lruvec *lruvec, struct folio *folio)
{
        /* Remember the old state; the flag is set again below if still needed */
        int was_unevictable = folio_test_clear_unevictable(folio);
        long nr_pages = folio_nr_pages(folio);

        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /*
         * Is an smp_mb__after_atomic() still required here, before
         * folio_evictable() tests the mlocked flag, to rule out the possibility
         * of stranding an evictable folio on an unevictable LRU?  I think
         * not, because __munlock_folio() only clears the mlocked flag
         * while the LRU lock is held.
         *
         * (That is not true of __page_cache_release(), and not necessarily
         * true of folios_put(): but those only clear the mlocked flag after
         * folio_put_testzero() has excluded any other users of the folio.)
         */
        if (folio_evictable(folio)) {
                if (was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
        } else {
                folio_clear_active(folio);
                folio_set_unevictable(folio);
                /*
                 * folio->mlock_count = !!folio_test_mlocked(folio)?
                 * But that leaves __mlock_folio() in doubt whether another
                 * actor has already counted the mlock or not.  Err on the
                 * safe side, underestimate, let page reclaim fix it, rather
                 * than leaving a page on the unevictable LRU indefinitely.
                 */
                folio->mlock_count = 0;
                if (!was_unevictable)
                        __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
        }

        lruvec_add_folio(lruvec, folio);
        trace_mm_lru_insertion(folio);
}

/*
 * Apply @move_fn to every folio queued in @fbatch.
 *
 * Each folio is processed with its lruvec locked (the lock is carried over
 * from one folio to the next through folio_lruvec_relock_irqsave()) and gets
 * its LRU flag set afterwards.  Once the lock has been dropped the whole
 * batch is passed to folios_put().
 */
static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
        struct lruvec *locked = NULL;
        unsigned long irq_flags = 0;
        int idx = 0;

        while (idx < folio_batch_count(fbatch)) {
                struct folio *folio = fbatch->folios[idx];

                folio_lruvec_relock_irqsave(folio, &locked, &irq_flags);
                move_fn(locked, folio);

                folio_set_lru(folio);
                idx++;
        }

        if (locked)
                unlock_page_lruvec_irqrestore(locked, irq_flags);
        folios_put(fbatch);
}

/*
 * Queue @folio on the per-CPU batch @fbatch and flush the batch through
 * @move_fn when needed.
 *
 * @on_lru:      the folio is expected to have its LRU flag set; the flag is
 *               test-and-cleared and nothing is done if it was not set.
 * @disable_irq: protect the batch with cpu_fbatches.lock_irq (interrupts
 *               disabled) instead of cpu_fbatches.lock.
 *
 * A reference is taken on the folio before it is queued.  The batch is
 * flushed immediately when folio_batch_add() returns zero, when the folio is
 * large, or when lru_cache_disabled() says so.
 */
static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
                struct folio *folio, move_fn_t move_fn,
                bool on_lru, bool disable_irq)
{
        unsigned long flags;
        struct folio_batch *batch;
        bool flush;

        if (on_lru && !folio_test_clear_lru(folio))
                return;

        folio_get(folio);

        if (disable_irq)
                local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
        else
                local_lock(&cpu_fbatches.lock);

        /* Resolved only after the lock is taken, like the original accesses */
        batch = this_cpu_ptr(fbatch);
        flush = !folio_batch_add(batch, folio) || folio_test_large(folio) ||
                lru_cache_disabled();
        if (flush)
                folio_batch_move_lru(batch, move_fn);

        if (disable_irq)
                local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
        else
                local_unlock(&cpu_fbatches.lock);
}

/*
 * Queue @folio on the per-CPU batch named @op and use the function of the
 * same name as its move_fn.  @op must therefore be both a member of
 * struct cpu_fbatches and a move_fn_t function.
 *
 * The last argument evaluates to true when the batch is declared at or after
 * lock_irq in struct cpu_fbatches, which selects the interrupt-disabling lock
 * in __folio_batch_add_and_move().
 */
#define folio_batch_add_and_move(folio, op, on_lru)                                                \
        __folio_batch_add_and_move(                                                                \
                &cpu_fbatches.op,                                                                \
                folio,                                                                                \
                op,                                                                                \
                on_lru,                                                                                \
                offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq)        \
        )

/*
 * move_fn for the lru_move_tail batch: re-insert @folio at the tail of its
 * list with the active flag cleared, and account it as PGROTATED.
 * Unevictable folios are left alone.
 */
static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
{
        if (!folio_test_unevictable(folio)) {
                lruvec_del_folio(lruvec, folio);
                folio_clear_active(folio);
                lruvec_add_folio_tail(lruvec, folio);
                __count_vm_events(PGROTATED, folio_nr_pages(folio));
        }
}

/*
 * Writeback is about to end against a folio which has been marked for
 * immediate reclaim.  If it still appears to be reclaimable, move it
 * to the tail of the inactive list.
 *
 * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
 * That is handled by queueing on the lru_move_tail batch, which is protected
 * by cpu_fbatches.lock_irq.
 */
void folio_rotate_reclaimable(struct folio *folio)
{
        /* A locked, dirty or unevictable folio is not worth rotating */
        if (folio_test_locked(folio))
                return;
        if (folio_test_dirty(folio))
                return;
        if (folio_test_unevictable(folio))
                return;

        folio_batch_add_and_move(folio, lru_move_tail, true);
}

/*
 * Record reclaim cost against @lruvec and every one of its ancestors.
 *
 * @lruvec:     lruvec the cost was incurred on
 * @file:       true to charge the file cost, false to charge the anon cost
 * @nr_io:      number of IO events; each one is weighted by SWAP_CLUSTER_MAX
 * @nr_rotated: number of rotations; each one has weight 1
 */
void lru_note_cost(struct lruvec *lruvec, bool file,
                   unsigned int nr_io, unsigned int nr_rotated)
{
        /*
         * Reflect the relative cost of incurring IO and spending CPU
         * time on rotations. This doesn't attempt to make a precise
         * comparison, it just says: if reloads are about comparable
         * between the LRU lists, or rotations are overwhelmingly
         * different between them, adjust scan balance for CPU work.
         */
        unsigned long cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated;

        do {
                unsigned long lrusize;

                /*
                 * Holding lruvec->lru_lock is safe here, because this is
                 * either
                 * 1) the pinned lruvec in reclaim, or
                 * 2) from a pre-LRU page during refault (which also holds the
                 *    rcu lock, so would be safe even if the page was on the LRU
                 *    and could move simultaneously to a new lruvec).
                 */
                spin_lock_irq(&lruvec->lru_lock);

                /* Record cost event */
                if (file)
                        lruvec->file_cost += cost;
                else
                        lruvec->anon_cost += cost;

                /*
                 * Decay previous events
                 *
                 * Because workloads change over time (and to avoid
                 * overflow) we keep these statistics as a floating
                 * average, which ends up weighing recent refaults
                 * more than old ones.
                 */
                lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON);
                lrusize += lruvec_page_state(lruvec, NR_ACTIVE_ANON);
                lrusize += lruvec_page_state(lruvec, NR_INACTIVE_FILE);
                lrusize += lruvec_page_state(lruvec, NR_ACTIVE_FILE);

                /* Halve both costs once their sum exceeds a quarter of the LRU size */
                if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
                        lruvec->file_cost /= 2;
                        lruvec->anon_cost /= 2;
                }

                spin_unlock_irq(&lruvec->lru_lock);

                lruvec = parent_lruvec(lruvec);
        } while (lruvec);
}

void lru_note_cost_refault(struct folio *folio)
{
        lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
                      folio_nr_pages(folio), 0);
}

/*
 * move_fn for activation: set the active flag on @folio and re-insert it
 * into @lruvec.  Folios that are already active or that are unevictable are
 * left untouched.  PGACTIVATE is accounted globally and for the memcg.
 */
static void lru_activate(struct lruvec *lruvec, struct folio *folio)
{
        long nr_pages = folio_nr_pages(folio);

        if (folio_test_active(folio))
                return;
        if (folio_test_unevictable(folio))
                return;

        lruvec_del_folio(lruvec, folio);
        folio_set_active(folio);
        lruvec_add_folio(lruvec, folio);
        trace_mm_lru_activate(folio);

        __count_vm_events(PGACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}

#ifdef CONFIG_SMP
/* Flush the lru_activate batch of @cpu, if it holds any folios. */
static void folio_activate_drain(int cpu)
{
        struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu);

        if (!folio_batch_count(fbatch))
                return;

        folio_batch_move_lru(fbatch, lru_activate);
}

/*
 * SMP variant: queue @folio on this CPU's lru_activate batch unless it is
 * already active or is unevictable.
 */
void folio_activate(struct folio *folio)
{
        if (!folio_test_active(folio) && !folio_test_unevictable(folio))
                folio_batch_add_and_move(folio, lru_activate, true);
}

#else
/*
 * !CONFIG_SMP: struct cpu_fbatches has no lru_activate batch, so there is
 * nothing to drain.
 */
static inline void folio_activate_drain(int cpu)
{
}

/*
 * !CONFIG_SMP variant: activate @folio right away under its lruvec lock
 * instead of batching.  Does nothing if the LRU flag was not set; otherwise
 * the flag is cleared for the duration of the move and set again afterwards.
 */
void folio_activate(struct folio *folio)
{
        struct lruvec *lruvec;

        if (folio_test_clear_lru(folio)) {
                lruvec = folio_lruvec_lock_irq(folio);
                lru_activate(lruvec, folio);
                unlock_page_lruvec_irq(lruvec);
                folio_set_lru(folio);
        }
}
#endif

/*
 * Mark @folio active if it is currently sitting in this CPU's lru_add batch.
 * Does nothing when the folio is not found there.
 */
static void __lru_cache_activate_folio(struct folio *folio)
{
        struct folio_batch *fbatch;
        int i;

        local_lock(&cpu_fbatches.lock);
        fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);

        /*
         * Search backwards on the optimistic assumption that the folio being
         * activated has just been added to this batch. Note that only
         * the local batch is examined as a !LRU folio could be in the
         * process of being released, reclaimed, migrated or on a remote
         * batch that is currently being drained. Furthermore, marking
         * a remote batch's folio active potentially hits a race where
         * a folio is marked active just after it is added to the inactive
         * list causing accounting errors and BUG_ON checks to trigger.
         */
        i = folio_batch_count(fbatch);
        while (i-- > 0) {
                if (fbatch->folios[i] != folio)
                        continue;

                folio_set_active(folio);
                break;
        }

        local_unlock(&cpu_fbatches.lock);
}

#ifdef CONFIG_LRU_GEN

/*
 * Multi-gen LRU counterpart of marking a folio accessed (called from
 * folio_mark_accessed() when lru_gen_enabled()).
 *
 * Unevictable folios are ignored.  A folio without PG_referenced gets it set
 * via set_mask_bits().  Otherwise the counter held in the LRU_REFS_MASK bits
 * of folio->flags is incremented with a cmpxchg loop; once the counter is
 * saturated the folio is marked workingset instead.
 */
static void lru_gen_inc_refs(struct folio *folio)
{
        /* Snapshot used as the expected value of the cmpxchg loop below */
        unsigned long new_flags, old_flags = READ_ONCE(folio->flags);

        if (folio_test_unevictable(folio))
                return;

        /* see the comment on LRU_REFS_FLAGS */
        if (!folio_test_referenced(folio)) {
                set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced));
                return;
        }

        do {
                /* All counter bits already set: cannot count any higher */
                if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) {
                        if (!folio_test_workingset(folio))
                                folio_set_workingset(folio);
                        return;
                }

                /* Add one to the counter field, which starts at bit LRU_REFS_PGOFF */
                new_flags = old_flags + BIT(LRU_REFS_PGOFF);
        } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
}

/*
 * Reset the multi-gen LRU reference state of @folio (the LRU_REFS_FLAGS bits
 * and PG_workingset).
 *
 * Returns true when the caller has nothing more to do: either the folio has
 * no generation (folio_lru_gen() < 0), or its generation already equals the
 * one derived from the lruvec's min_seq for its type.  Returns false when the
 * folio still has to be moved under the LRU lock.
 */
static bool lru_gen_clear_refs(struct folio *folio)
{
        int gen = folio_lru_gen(folio);
        int type = folio_is_file_lru(folio);

        if (gen < 0)
                return true;

        set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0);

        /* whether can do without shuffling under the LRU lock */
        return lru_gen_from_seq(READ_ONCE(folio_lruvec(folio)->lrugen.min_seq[type])) == gen;
}

#else /* !CONFIG_LRU_GEN */

/* !CONFIG_LRU_GEN stub: there is no multi-gen reference state to update. */
static void lru_gen_inc_refs(struct folio *folio)
{
}

/*
 * !CONFIG_LRU_GEN stub: always returns false, i.e. never tells the caller
 * that the deactivation can be skipped.
 */
static bool lru_gen_clear_refs(struct folio *folio)
{
        return false;
}

#endif /* CONFIG_LRU_GEN */

/**
 * folio_mark_accessed - Mark a folio as having seen activity.
 * @folio: The folio to mark.
 *
 * This function will perform one of the following transitions:
 *
 * * inactive,unreferenced        ->        inactive,referenced
 * * inactive,referenced        ->        active,unreferenced
 * * active,unreferenced        ->        active,referenced
 *
 * Dropbehind folios are ignored entirely, and with the multi-gen LRU enabled
 * the access is recorded by lru_gen_inc_refs() instead.
 *
 * When a newly allocated folio is not yet visible, so safe for non-atomic ops,
 * __folio_set_referenced() may be substituted for folio_mark_accessed().
 */
void folio_mark_accessed(struct folio *folio)
{
        if (folio_test_dropbehind(folio))
                return;

        if (lru_gen_enabled()) {
                lru_gen_inc_refs(folio);
                return;
        }

        if (!folio_test_referenced(folio)) {
                folio_set_referenced(folio);
        } else if (!folio_test_unevictable(folio) &&
                   !folio_test_active(folio)) {
                /*
                 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
                 * this list is never rotated or maintained, so marking an
                 * unevictable page accessed has no effect; hence the
                 * unevictable test above.
                 *
                 * If the folio is on the LRU, queue it for activation via
                 * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a
                 * folio_batch, mark it active and it'll be moved to the active
                 * LRU on the next drain.
                 */
                if (folio_test_lru(folio))
                        folio_activate(folio);
                else
                        __lru_cache_activate_folio(folio);
                folio_clear_referenced(folio);
                workingset_activation(folio);
        }

        if (folio_test_idle(folio))
                folio_clear_idle(folio);
}
EXPORT_SYMBOL(folio_mark_accessed);

/**
 * folio_add_lru - Add a folio to an LRU list.
 * @folio: The folio to be added to the LRU.
 *
 * Queue the folio for addition to the LRU. The decision on whether
 * to add the page to the [in]active [file|anon] list is deferred until the
 * folio_batch is drained. This gives the caller of folio_add_lru() a chance
 * to have the folio added to the active list using folio_mark_accessed().
 *
 * The folio must not already be on an LRU list, and must not be both active
 * and unevictable.
 */
void folio_add_lru(struct folio *folio)
{
        VM_BUG_ON_FOLIO(folio_test_active(folio) &&
                        folio_test_unevictable(folio), folio);
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /* see the comment in lru_gen_folio_seq() */
        if (lru_gen_enabled() && !folio_test_unevictable(folio)) {
                if (lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
                        folio_set_active(folio);
        }

        folio_batch_add_and_move(folio, lru_add, false);
}
EXPORT_SYMBOL(folio_add_lru);

/**
 * folio_add_lru_vma() - Add a folio to the appropriate LRU list for this VMA.
 * @folio: The folio to be added to the LRU.
 * @vma: VMA in which the folio is mapped.
 *
 * If the VMA is mlocked (VM_LOCKED set and VM_SPECIAL clear), @folio is
 * added to the unevictable list.
 * Otherwise, it is treated the same way as folio_add_lru().
 */
void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
{
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) {
                mlock_new_folio(folio);
                return;
        }

        folio_add_lru(folio);
}

/*
 * If the folio cannot be invalidated, it is moved to the
 * inactive list to speed up its reclaim.  It is moved to the
 * head of the list, rather than the tail, to give the flusher
 * threads some time to write it out, as this is much more
 * effective than the single-page writeout from reclaim.
 *
 * If the folio isn't mapped and dirty/writeback, the folio
 * could be reclaimed asap using the reclaim flag.
 *
 * 1. active, mapped folio -> none
 * 2. active, dirty/writeback folio -> inactive, head, reclaim
 * 3. inactive, mapped folio -> none
 * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
 * 5. inactive, clean -> inactive, tail
 * 6. Others -> none
 *
 * In 4, it moves to the head of the inactive list so the folio is
 * written out by flusher threads as this is much more efficient
 * than the single-page writeout from reclaim.
 *
 * PGDEACTIVATE is only accounted when the folio was active on entry, or when
 * the multi-gen LRU is enabled.
 */
static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
{
        bool was_active = folio_test_active(folio) || lru_gen_enabled();
        long nr_pages = folio_nr_pages(folio);

        /* Skip unevictable folios and folios some process still has mapped */
        if (folio_test_unevictable(folio) || folio_mapped(folio))
                return;

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);

        if (!folio_test_writeback(folio) && !folio_test_dirty(folio)) {
                /*
                 * The folio's writeback ended while it was in the batch.
                 * We move that folio to the tail of the inactive list.
                 */
                lruvec_add_folio_tail(lruvec, folio);
                __count_vm_events(PGROTATED, nr_pages);
        } else {
                /*
                 * Setting the reclaim flag could race with
                 * folio_end_writeback() and confuse readahead.  But the
                 * race window is _really_ small and  it's not a critical
                 * problem.
                 */
                lruvec_add_folio(lruvec, folio);
                folio_set_reclaim(folio);
        }

        if (!was_active)
                return;

        __count_vm_events(PGDEACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}

/*
 * move_fn for the lru_deactivate batch: clear the active and referenced
 * flags of @folio and re-insert it into @lruvec, accounting PGDEACTIVATE.
 *
 * Unevictable folios are skipped.  Without the multi-gen LRU, folios that
 * are not active are skipped as well.
 */
static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
{
        long nr_pages = folio_nr_pages(folio);

        if (folio_test_unevictable(folio))
                return;
        if (!folio_test_active(folio) && !lru_gen_enabled())
                return;

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);
        folio_clear_referenced(folio);
        lruvec_add_folio(lruvec, folio);

        __count_vm_events(PGDEACTIVATE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}

/*
 * move_fn for the lru_lazyfree batch: turn an anonymous, swap-backed folio
 * that is neither in the swap cache nor unevictable into a lazyfree folio.
 * Any other folio is left untouched.  PGLAZYFREE is accounted globally and
 * for the memcg.
 */
static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
{
        long nr_pages = folio_nr_pages(folio);

        if (!folio_test_anon(folio))
                return;
        if (!folio_test_swapbacked(folio))
                return;
        if (folio_test_swapcache(folio))
                return;
        if (folio_test_unevictable(folio))
                return;

        lruvec_del_folio(lruvec, folio);
        folio_clear_active(folio);

        if (lru_gen_enabled())
                lru_gen_clear_refs(folio);
        else
                folio_clear_referenced(folio);

        /*
         * Lazyfree folios are clean anonymous folios.  They have
         * the swapbacked flag cleared, to distinguish them from normal
         * anonymous folios
         */
        folio_clear_swapbacked(folio);
        lruvec_add_folio(lruvec, folio);

        __count_vm_events(PGLAZYFREE, nr_pages);
        __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}

/*
 * Drain pages out of the cpu's folio_batch.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 *
 * Each non-empty batch of @cpu is flushed with its matching move function.
 * Only the lru_move_tail batch needs cpu_fbatches.lock_irq taken here.
 */
void lru_add_drain_cpu(int cpu)
{
        struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);

        if (folio_batch_count(&fbatches->lru_add))
                folio_batch_move_lru(&fbatches->lru_add, lru_add);

        /* Disabling interrupts below acts as a compiler barrier. */
        if (data_race(folio_batch_count(&fbatches->lru_move_tail))) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
                folio_batch_move_lru(&fbatches->lru_move_tail, lru_move_tail);
                local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
        }

        if (folio_batch_count(&fbatches->lru_deactivate_file))
                folio_batch_move_lru(&fbatches->lru_deactivate_file,
                                     lru_deactivate_file);

        if (folio_batch_count(&fbatches->lru_deactivate))
                folio_batch_move_lru(&fbatches->lru_deactivate, lru_deactivate);

        if (folio_batch_count(&fbatches->lru_lazyfree))
                folio_batch_move_lru(&fbatches->lru_lazyfree, lru_lazyfree);

        folio_activate_drain(cpu);
}

/**
 * deactivate_file_folio() - Deactivate a file folio.
 * @folio: Folio to deactivate.
 *
 * This function hints to the VM that @folio is a good reclaim candidate,
 * for example if its invalidation fails due to the folio being dirty
 * or under writeback.
 *
 * Context: Caller holds a reference on the folio.
 */
void deactivate_file_folio(struct folio *folio)
{
        /* Deactivating an unevictable folio will not accelerate reclaim */
        if (folio_test_unevictable(folio))
                return;

        /* With the multi-gen LRU, clearing the refs may already be enough */
        if (lru_gen_enabled()) {
                if (lru_gen_clear_refs(folio))
                        return;
        }

        folio_batch_add_and_move(folio, lru_deactivate_file, true);
}

/*
 * folio_deactivate - deactivate a folio
 * @folio: folio to deactivate
 *
 * folio_deactivate() moves @folio to the inactive list if @folio was on the
 * active list and was not unevictable. This is done to accelerate the
 * reclaim of @folio.
 *
 * With the multi-gen LRU enabled, the decision is made by
 * lru_gen_clear_refs() instead of by the active flag.
 */
void folio_deactivate(struct folio *folio)
{
        if (folio_test_unevictable(folio))
                return;

        if (lru_gen_enabled()) {
                if (lru_gen_clear_refs(folio))
                        return;
        } else if (!folio_test_active(folio)) {
                return;
        }

        folio_batch_add_and_move(folio, lru_deactivate, true);
}

/**
 * folio_mark_lazyfree - make an anon folio lazyfree
 * @folio: folio to deactivate
 *
 * folio_mark_lazyfree() moves @folio to the inactive file list.
 * This is done to accelerate the reclaim of @folio.
 *
 * Only anonymous, swap-backed folios that are neither in the swap cache nor
 * unevictable are queued; any other folio is ignored.
 */
void folio_mark_lazyfree(struct folio *folio)
{
        if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
            !folio_test_swapcache(folio) && !folio_test_unevictable(folio))
                folio_batch_add_and_move(folio, lru_lazyfree, true);
}

/*
 * Drain the LRU batches of the current CPU, then its mlock batch.
 * cpu_fbatches.lock is held while the LRU batches are flushed.
 */
void lru_add_drain(void)
{
        int cpu;

        local_lock(&cpu_fbatches.lock);
        cpu = smp_processor_id();
        lru_add_drain_cpu(cpu);
        local_unlock(&cpu_fbatches.lock);

        mlock_drain_local();
}

/*
 * It's called from per-cpu workqueue context in SMP case so
 * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
 * the same cpu. It shouldn't be a problem in !SMP case since
 * the core is only one and the locks will disable preemption.
 *
 * Same as lru_add_drain(), plus invalidation of this CPU's buffer-head LRU.
 */
static void lru_add_and_bh_lrus_drain(void)
{
        int cpu;

        local_lock(&cpu_fbatches.lock);
        cpu = smp_processor_id();
        lru_add_drain_cpu(cpu);
        local_unlock(&cpu_fbatches.lock);

        invalidate_bh_lrus_cpu();
        mlock_drain_local();
}

/*
 * Drain the LRU batches of the current CPU and call drain_local_pages() for
 * @zone, both under cpu_fbatches.lock; then drain the local mlock batch.
 */
void lru_add_drain_cpu_zone(struct zone *zone)
{
        int cpu;

        local_lock(&cpu_fbatches.lock);
        cpu = smp_processor_id();
        lru_add_drain_cpu(cpu);
        drain_local_pages(zone);
        local_unlock(&cpu_fbatches.lock);

        mlock_drain_local();
}

#ifdef CONFIG_SMP

/* One work item per CPU, queued on mm_percpu_wq by __lru_add_drain_all(). */
static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);

/*
 * Work function of lru_add_drain_work: drains the LRU, buffer-head and mlock
 * batches of the CPU it runs on.  The work_struct argument is unused.
 */
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_and_bh_lrus_drain();
}

static bool cpu_needs_drain(unsigned int cpu)
{
        struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);

        /* Check these in order of likelihood that they're not zero */
        return folio_batch_count(&fbatches->lru_add) ||
                folio_batch_count(&fbatches->lru_move_tail) ||
                folio_batch_count(&fbatches->lru_deactivate_file) ||
                folio_batch_count(&fbatches->lru_deactivate) ||
                folio_batch_count(&fbatches->lru_lazyfree) ||
                folio_batch_count(&fbatches->lru_activate) ||
                need_mlock_drain(cpu) ||
                has_bh_in_lru(cpu, NULL);
}

/*
 * Queue a drain work item on every online CPU that has something to drain
 * and wait for all of them to finish.  Callers are serialised by a mutex;
 * a caller that finds a newer drain generation already scheduled returns
 * early unless @force_all_cpus is set.
 *
 * @force_all_cpus: skip the early exit at (C).  Note that it does not change
 * the per-CPU selection: a CPU is still only queued when cpu_needs_drain()
 * is true for it.
 *
 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 * kworkers being shut down before our page_alloc_cpu_dead callback is
 * executed on the offlined cpu.
 * Calling this function with cpu hotplug locks held can actually lead
 * to obscure indirect dependencies via WQ context.
 */
static inline void __lru_add_drain_all(bool force_all_cpus)
{
        /*
         * lru_drain_gen - Global pages generation number
         *
         * (A) Definition: global lru_drain_gen = x implies that all generations
         *     0 < n <= x are already *scheduled* for draining.
         *
         * This is an optimization for the highly-contended use case where a
         * user space workload keeps constantly generating a flow of pages for
         * each CPU.
         */
        static unsigned int lru_drain_gen;
        /* CPUs that had work queued in this round; protected by 'lock' */
        static struct cpumask has_work;
        static DEFINE_MUTEX(lock);
        unsigned cpu, this_gen;

        /*
         * Make sure nobody triggers this path before mm_percpu_wq is fully
         * initialized.
         */
        if (WARN_ON(!mm_percpu_wq))
                return;

        /*
         * Guarantee folio_batch counter stores visible by this CPU
         * are visible to other CPUs before loading the current drain
         * generation.
         */
        smp_mb();

        /*
         * (B) Locally cache global LRU draining generation number
         *
         * The read barrier ensures that the counter is loaded before the mutex
         * is taken. It pairs with smp_mb() inside the mutex critical section
         * at (D).
         */
        this_gen = smp_load_acquire(&lru_drain_gen);

        mutex_lock(&lock);

        /*
         * (C) Exit the draining operation if a newer generation, from another
         * lru_add_drain_all(), was already scheduled for draining. Check (A).
         */
        if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
                goto done;

        /*
         * (D) Increment global generation number
         *
         * Pairs with smp_load_acquire() at (B), outside of the critical
         * section. Use a full memory barrier to guarantee that the
         * new global drain generation number is stored before loading
         * folio_batch counters.
         *
         * This pairing must be done here, before the for_each_online_cpu loop
         * below which drains the page vectors.
         *
         * Let x, y, and z represent some system CPU numbers, where x < y < z.
         * Assume CPU #z is in the middle of the for_each_online_cpu loop
         * below and has already reached CPU #y's per-cpu data. CPU #x comes
         * along, adds some pages to its per-cpu vectors, then calls
         * lru_add_drain_all().
         *
         * If the paired barrier is done at any later step, e.g. after the
         * loop, CPU #x will just exit at (C) and miss flushing out all of its
         * added pages.
         */
        WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
        smp_mb();

        /* Queue the drain work on every online CPU that has something pending */
        cpumask_clear(&has_work);
        for_each_online_cpu(cpu) {
                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);

                if (cpu_needs_drain(cpu)) {
                        INIT_WORK(work, lru_add_drain_per_cpu);
                        queue_work_on(cpu, mm_percpu_wq, work);
                        __cpumask_set_cpu(cpu, &has_work);
                }
        }

        /* Wait for every work item queued above to complete */
        for_each_cpu(cpu, &has_work)
                flush_work(&per_cpu(lru_add_drain_work, cpu));

done:
        mutex_unlock(&lock);
}

/*
 * SMP: drain the batches of all online CPUs that need it.  May return
 * without doing anything if a newer drain generation was already scheduled
 * (force_all_cpus is false).
 */
void lru_add_drain_all(void)
{
        __lru_add_drain_all(false);
}
#else
/* !SMP: there is only one CPU, so draining it locally drains everything. */
void lru_add_drain_all(void)
{
        lru_add_drain();
}
#endif /* CONFIG_SMP */

/*
 * Incremented by lru_cache_disable().
 * NOTE(review): presumably decremented by lru_cache_enable() and read by
 * lru_cache_disabled(), both defined outside this file section - confirm.
 */
atomic_t lru_disable_count = ATOMIC_INIT(0);

/*
 * lru_cache_disable() needs to be called before we start compiling
 * a list of folios to be migrated using folio_isolate_lru().
 * It drains the folios held in the per-CPU LRU caches and then keeps the
 * caches disabled on all CPUs until lru_cache_enable() is called.
 *
 * Must be paired with a call to lru_cache_enable().
 */
void lru_cache_disable(void)
{
        atomic_inc(&lru_disable_count);
        /*
         * Readers of lru_disable_count are protected by either disabling
         * preemption or rcu_read_lock:
         *
         * preempt_disable, local_irq_disable  [bh_lru_lock()]
         * rcu_read_lock                       [rt_spin_lock CONFIG_PREEMPT_RT]
         * preempt_disable                       [local_lock !CONFIG_PREEMPT_RT]
         *
         * Since v5.1 kernel, synchronize_rcu() is guaranteed to wait on
         * preempt_disable() regions of code. So any CPU which sees
         * lru_disable_count = 0 will have exited the critical
         * section when synchronize_rcu() returns.
         */
        synchronize_rcu_expedited();
        /*
         * Drain only after the grace period above, so that anything queued
         * by a CPU that still saw lru_disable_count == 0 gets flushed too.
         */
#ifdef CONFIG_SMP
        __lru_add_drain_all(true);
#else
        lru_add_and_bh_lrus_drain();
#endif
}

/**
 * folios_put_refs - Reduce the reference count on a batch of folios.
 * @folios: The folios.
 * @refs: The number of refs to subtract from each folio.
 *
 * Like folio_put(), but for a batch of folios.  This is more efficient
 * than writing the loop yourself as it will optimise the locks which need
 * to be taken if the folios are freed.  The folios batch is returned
 * empty and ready to be reused for another batch; there is no need
 * to reinitialise it.  If @refs is NULL, we subtract one from each
 * folio refcount.
 *
 * Context: May be called in process or interrupt context, but not in NMI
 * context.  May be called while holding a spinlock.
 */
void folios_put_refs(struct folio_batch *folios, unsigned int *refs)
{
        /* i walks the batch; j counts the folios kept for the bulk free below */
        int i, j;
        struct lruvec *lruvec = NULL;
        unsigned long flags = 0;

        for (i = 0, j = 0; i < folios->nr; i++) {
                struct folio *folio = folios->folios[i];
                unsigned int nr_refs = refs ? refs[i] : 1;

                /* The huge zero folio is skipped without touching its refcount */
                if (is_huge_zero_folio(folio))
                        continue;

                if (folio_is_zone_device(folio)) {
                        /* Drop any lruvec lock before handling a zone-device folio */
                        if (lruvec) {
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
                        if (folio_ref_sub_and_test(folio, nr_refs))
                                free_zone_device_folio(folio);
                        continue;
                }

                /* Other references remain: nothing to free for this folio */
                if (!folio_ref_sub_and_test(folio, nr_refs))
                        continue;

                /* hugetlb has its own memcg */
                if (folio_test_hugetlb(folio)) {
                        if (lruvec) {
                                unlock_page_lruvec_irqrestore(lruvec, flags);
                                lruvec = NULL;
                        }
                        free_huge_folio(folio);
                        continue;
                }
                folio_unqueue_deferred_split(folio);
                __page_cache_release(folio, &lruvec, &flags);

                /* Compact the folios that are to be freed to the front of the batch */
                if (j != i)
                        folios->folios[j] = folio;
                j++;
        }
        if (lruvec)
                unlock_page_lruvec_irqrestore(lruvec, flags);
        /* Nothing left to free in bulk: just hand back an empty batch */
        if (!j) {
                folio_batch_reinit(folios);
                return;
        }

        folios->nr = j;
        mem_cgroup_uncharge_folios(folios);
        free_unref_folios(folios);
}
EXPORT_SYMBOL(folios_put_refs);

/**
 * release_pages - batched put_page()
 * @arg: array of pages to release
 * @nr: number of entries in @arg (an entry that carries a count for the
 *      preceding page occupies a slot of its own)
 *
 * Decrement the reference count on all the pages in @arg.  If it
 * fell to zero, remove the page from the LRU and free it.
 *
 * Note that the argument can be an array of pages, encoded pages,
 * or folio pointers. We ignore any encoded bits, and turn any of
 * them into just a folio that gets free'd.
 */
void release_pages(release_pages_arg arg, int nr)
{
        struct folio_batch fbatch;
        /* Same element type as the @refs array folios_put_refs() takes. */
        unsigned int refs[PAGEVEC_SIZE];
        struct encoded_page **encoded = arg.encoded_pages;
        int i;

        folio_batch_init(&fbatch);
        for (i = 0; i < nr; i++) {
                /* Turn any of the argument types into a folio */
                struct folio *folio = page_folio(encoded_page_ptr(encoded[i]));

                /* Is our next entry actually "nr_pages" -> "nr_refs" ? */
                refs[fbatch.nr] = 1;
                if (unlikely(encoded_page_flags(encoded[i]) &
                             ENCODED_PAGE_BIT_NR_PAGES_NEXT))
                        /* The following slot holds the count; consume it here. */
                        refs[fbatch.nr] = encoded_nr_pages(encoded[++i]);

                /* Keep batching while folio_batch_add() reports a positive value. */
                if (folio_batch_add(&fbatch, folio) > 0)
                        continue;
                /* Flush the batch; folios_put_refs() hands it back empty. */
                folios_put_refs(&fbatch, refs);
        }

        /* Flush whatever is left in a partially filled batch. */
        if (fbatch.nr)
                folios_put_refs(&fbatch, refs);
}
EXPORT_SYMBOL(release_pages);

/*
 * The folios which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those folios may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __folio_batch_release() will drain those queues here.
 * folio_batch_move_lru() calls folios_put() directly to avoid
 * mutual recursion.
 */
void __folio_batch_release(struct folio_batch *fbatch)
{
        /* Drain at most once per batch; the flag records that it was done. */
        if (!fbatch->percpu_pvec_drained) {
                lru_add_drain();
                fbatch->percpu_pvec_drained = true;
        }
        folios_put(fbatch);
}
EXPORT_SYMBOL(__folio_batch_release);

/**
 * folio_batch_remove_exceptionals() - Prune non-folios from a batch.
 * @fbatch: The batch to prune
 *
 * find_get_entries() fills a batch with both folios and shadow/swap/DAX
 * entries.  Every entry for which xa_is_value() is true is dropped and the
 * remaining folios are packed towards the front, so that the batch has no
 * holes and can be passed on to folio-only batch operations.
 */
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
        unsigned int count = folio_batch_count(fbatch);
        unsigned int src, dst = 0;

        for (src = 0; src < count; src++) {
                struct folio *entry = fbatch->folios[src];

                /* Value entries are not folios: leave them out. */
                if (xa_is_value(entry))
                        continue;

                fbatch->folios[dst] = entry;
                dst++;
        }
        fbatch->nr = dst;
}

/*
 * sysctl table registered under "vm" by swap_setup().  It exposes the int
 * page_cluster as "page-cluster" (mode 0644), handled by
 * proc_dointvec_minmax with SYSCTL_ZERO and page_cluster_max as the bounds.
 */
static const struct ctl_table swap_sysctl_table[] = {
        {
                .procname        = "page-cluster",
                .data                = &page_cluster,
                .maxlen                = sizeof(int),
                .mode                = 0644,
                .proc_handler        = proc_dointvec_minmax,
                .extra1                = SYSCTL_ZERO,
                .extra2                = (void *)&page_cluster_max,
        }
};

/*
 * Perform any setup for the swap system: pick the default page_cluster from
 * the amount of RAM and register the "vm" sysctl table.
 */
void __init swap_setup(void)
{
        /* Total RAM in megabytes. */
        const unsigned long ram_mb = totalram_pages() >> (20 - PAGE_SHIFT);

        /*
         * Use a smaller cluster for small-memory machines.  Right now other
         * parts of the system means that we _really_ don't want to cluster
         * much more.
         */
        page_cluster = (ram_mb < 16) ? 2 : 3;

        register_sysctl_init("vm", swap_sysctl_table);
}



























































    9 




    9 








1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM csd

#if !defined(_TRACE_CSD_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_CSD_H

#include <linux/tracepoint.h>

/*
 * Tracepoint csd_queue_cpu: records the cpu number, the call site, the
 * function and the csd pointer it is given, and prints them as
 * "cpu=%u callsite=%pS func=%ps csd=%p".
 * NOTE(review): presumably emitted when a csd is queued for @cpu - confirm
 * against the call sites.
 */
TRACE_EVENT(csd_queue_cpu,

        TP_PROTO(const unsigned int cpu,
                unsigned long callsite,
                smp_call_func_t func,
                call_single_data_t *csd),

        TP_ARGS(cpu, callsite, func, csd),

        TP_STRUCT__entry(
                __field(unsigned int, cpu)
                __field(void *, callsite)
                __field(void *, func)
                __field(void *, csd)
                ),

            TP_fast_assign(
                __entry->cpu = cpu;
                __entry->callsite = (void *)callsite;
                __entry->func = func;
                __entry->csd  = csd;
                ),

        TP_printk("cpu=%u callsite=%pS func=%ps csd=%p",
                __entry->cpu, __entry->callsite, __entry->func, __entry->csd)
        );

/*
 * Tracepoints for a function which is called as an effect of
 * smp_call_function.*
 *
 * Event class shared by csd_function_entry and csd_function_exit: each event
 * stores the function pointer and the csd pointer and prints
 * "func=%ps, csd=%p".
 */
DECLARE_EVENT_CLASS(csd_function,

        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),

        TP_ARGS(func, csd),

        TP_STRUCT__entry(
                __field(void *,        func)
                __field(void *,        csd)
        ),

        TP_fast_assign(
                __entry->func        = func;
                __entry->csd        = csd;
        ),

        TP_printk("func=%ps, csd=%p", __entry->func, __entry->csd)
);

/*
 * Event of class csd_function.
 * NOTE(review): presumably fired just before @func is invoked - confirm.
 */
DEFINE_EVENT(csd_function, csd_function_entry,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

/*
 * Event of class csd_function.
 * NOTE(review): presumably fired right after @func returns - confirm.
 */
DEFINE_EVENT(csd_function, csd_function_exit,
        TP_PROTO(smp_call_func_t func, call_single_data_t *csd),
        TP_ARGS(func, csd)
);

#endif /* _TRACE_CSD_H */

/* This part must be outside protection */
#include <trace/define_trace.h>




























  871 






  869 





  870 























  651 




  572 











  354 












































  246 


  246 














  353 


  354 
  354 













  570 




    5 
  540 
  542 




    9 
   94 
   94 





  377 

  376 



   33 


  160 

  160 




   25 


  331 

  330 




  178 
  641 















1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
#ifndef _LINUX_MMAP_LOCK_H
#define _LINUX_MMAP_LOCK_H

#include <linux/lockdep.h>
#include <linux/mm_types.h>
#include <linux/mmdebug.h>
#include <linux/rwsem.h>
#include <linux/tracepoint-defs.h>
#include <linux/types.h>

/*
 * Designated-initializer fragment that initialises the .mmap_lock rwsem of
 * the object called @name.  The trailing comma is part of the expansion, so
 * no extra comma should follow the macro inside an initializer list.
 */
#define MMAP_LOCK_INITIALIZER(name) \
        .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),

DECLARE_TRACEPOINT(mmap_lock_start_locking);
DECLARE_TRACEPOINT(mmap_lock_acquire_returned);
DECLARE_TRACEPOINT(mmap_lock_released);

#ifdef CONFIG_TRACING

void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);
void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
                                           bool success);
void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write);

/*
 * Emit the mmap_lock_start_locking trace event for @mm.  The out-of-line
 * helper is only called when that tracepoint is enabled.
 * @write: true for a write-lock attempt, false for a read-lock attempt.
 */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
        if (tracepoint_enabled(mmap_lock_start_locking))
                __mmap_lock_do_trace_start_locking(mm, write);
}

/*
 * Emit the mmap_lock_acquire_returned trace event for @mm.  The out-of-line
 * helper is only called when that tracepoint is enabled.
 * @write:   true for a write lock, false for a read lock.
 * @success: whether the acquisition attempt succeeded.
 */
static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
        if (tracepoint_enabled(mmap_lock_acquire_returned))
                __mmap_lock_do_trace_acquire_returned(mm, write, success);
}

/*
 * Emit the mmap_lock_released trace event for @mm.  The out-of-line helper
 * is only called when that tracepoint is enabled.
 * @write: true when a write lock is being released, false for a read lock.
 */
static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
        if (tracepoint_enabled(mmap_lock_released))
                __mmap_lock_do_trace_released(mm, write);
}

#else /* !CONFIG_TRACING */

/* !CONFIG_TRACING: the three trace hooks below are empty stubs. */
static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm,
                                                   bool write)
{
}

static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm,
                                                      bool write, bool success)
{
}

static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write)
{
}

#endif /* CONFIG_TRACING */

/* Assert, via rwsem_assert_held(), that @mm's mmap_lock is held. */
static inline void mmap_assert_locked(const struct mm_struct *mm)
{
        rwsem_assert_held(&mm->mmap_lock);
}

/* Assert, via rwsem_assert_held_write(), that @mm's mmap_lock is held for write. */
static inline void mmap_assert_write_locked(const struct mm_struct *mm)
{
        rwsem_assert_held_write(&mm->mmap_lock);
}

#ifdef CONFIG_PER_VMA_LOCK

/* Initialise @mm's mm_lock_seq sequence counter. */
static inline void mm_lock_seqcount_init(struct mm_struct *mm)
{
        seqcount_init(&mm->mm_lock_seq);
}

/*
 * Open a write section on @mm's mm_lock_seq.  The mmap_write_lock*() helpers
 * in this header call it right after taking mmap_lock for writing.
 */
static inline void mm_lock_seqcount_begin(struct mm_struct *mm)
{
        do_raw_write_seqcount_begin(&mm->mm_lock_seq);
}

/*
 * Close the write section on @mm's mm_lock_seq.
 * NOTE(review): ASSERT_EXCLUSIVE_WRITER presumably flags any concurrent
 * writer to mm_lock_seq in instrumented builds - confirm.
 */
static inline void mm_lock_seqcount_end(struct mm_struct *mm)
{
        ASSERT_EXCLUSIVE_WRITER(mm->mm_lock_seq);
        do_raw_write_seqcount_end(&mm->mm_lock_seq);
}

/*
 * Try to start a speculative section against @mm's mm_lock_seq.
 * @seq: out parameter handed to raw_seqcount_try_begin(); pass the value
 *       it receives to mmap_lock_speculate_retry() afterwards.
 *
 * Returns the result of raw_seqcount_try_begin(); false means the caller
 * should take the slow path described below.
 */
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
        /*
         * Since mmap_lock is a sleeping lock, and waiting for it to become
         * unlocked is more or less equivalent with taking it ourselves, don't
         * bother with the speculative path if mmap_lock is already write-locked
         * and take the slow path, which takes the lock.
         */
        return raw_seqcount_try_begin(&mm->mm_lock_seq, *seq);
}

/*
 * Returns the result of read_seqcount_retry() for @seq on mm_lock_seq: true
 * means the speculative section must be retried or abandoned.
 */
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
        return read_seqcount_retry(&mm->mm_lock_seq, seq);
}

#else /* CONFIG_PER_VMA_LOCK */

/* !CONFIG_PER_VMA_LOCK: the sequence-count helpers are empty stubs. */
static inline void mm_lock_seqcount_init(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_begin(struct mm_struct *mm) {}
static inline void mm_lock_seqcount_end(struct mm_struct *mm) {}

/* !CONFIG_PER_VMA_LOCK: speculation never starts; @seq is left untouched. */
static inline bool mmap_lock_speculate_try_begin(struct mm_struct *mm, unsigned int *seq)
{
        return false;
}

/* !CONFIG_PER_VMA_LOCK: always reports that a retry is required. */
static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int seq)
{
        return true;
}

#endif /* CONFIG_PER_VMA_LOCK */

/*
 * Take @mm's mmap_lock for writing, then open the mm_lock_seq write section.
 * Trace events are emitted before the attempt and after the lock is held.
 */
static inline void mmap_write_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write(&mm->mmap_lock);
        mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * Same as mmap_write_lock(), but acquires the rwsem with
 * down_write_nested() using @subclass.
 */
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass)
{
        __mmap_lock_trace_start_locking(mm, true);
        down_write_nested(&mm->mmap_lock, subclass);
        mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, true);
}

/*
 * Write-lock @mm's mmap_lock with down_write_killable().
 *
 * Returns the value of down_write_killable(): 0 when the lock was taken,
 * non-zero otherwise.  The mm_lock_seq write section is opened only on
 * success, and the trace event reports success or failure accordingly.
 */
static inline int mmap_write_lock_killable(struct mm_struct *mm)
{
        int ret;

        __mmap_lock_trace_start_locking(mm, true);
        ret = down_write_killable(&mm->mmap_lock);
        if (!ret)
                mm_lock_seqcount_begin(mm);
        __mmap_lock_trace_acquire_returned(mm, true, ret == 0);
        return ret;
}

/*
 * Drop all currently-held per-VMA locks.
 * This is called from the mmap_lock implementation directly before releasing
 * a write-locked mmap_lock (or downgrading it to read-locked).
 * This should normally NOT be called manually from other places.
 * If you want to call this manually anyway, keep in mind that this will release
 * *all* VMA write locks, including ones from further up the stack.
 */
static inline void vma_end_write_all(struct mm_struct *mm)
{
        /* The caller must hold mmap_lock for writing. */
        mmap_assert_write_locked(mm);
        /* Closing the mm_lock_seq write section is the whole operation. */
        mm_lock_seqcount_end(mm);
}

/*
 * Release a write-locked mmap_lock: emit the released trace event, end all
 * VMA write locks, then drop the rwsem.
 */
static inline void mmap_write_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, true);
        vma_end_write_all(mm);
        up_write(&mm->mmap_lock);
}

/*
 * Downgrade a write-locked mmap_lock to read-locked.  The trace event is
 * emitted as a successful read acquisition (write == false, success == true).
 * VMA write locks are ended before the rwsem is downgraded.
 */
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
        __mmap_lock_trace_acquire_returned(mm, false, true);
        vma_end_write_all(mm);
        downgrade_write(&mm->mmap_lock);
}

/*
 * Take @mm's mmap_lock for reading, with trace events before the attempt
 * and after the lock is held.
 */
static inline void mmap_read_lock(struct mm_struct *mm)
{
        __mmap_lock_trace_start_locking(mm, false);
        down_read(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, true);
}

/*
 * Read-lock @mm's mmap_lock with down_read_killable().
 *
 * Returns the value of down_read_killable(): 0 when the lock was taken,
 * non-zero otherwise.  The trace event reports whether it succeeded.
 */
static inline int mmap_read_lock_killable(struct mm_struct *mm)
{
        int ret;

        __mmap_lock_trace_start_locking(mm, false);
        ret = down_read_killable(&mm->mmap_lock);
        __mmap_lock_trace_acquire_returned(mm, false, ret == 0);
        return ret;
}

/*
 * Try to read-lock @mm's mmap_lock via down_read_trylock().
 *
 * Returns true if the lock was taken, false otherwise; the result is also
 * reported through the acquire_returned trace event.
 */
static inline bool mmap_read_trylock(struct mm_struct *mm)
{
        bool ret;

        __mmap_lock_trace_start_locking(mm, false);
        /* Normalise the int result of down_read_trylock() to bool. */
        ret = down_read_trylock(&mm->mmap_lock) != 0;
        __mmap_lock_trace_acquire_returned(mm, false, ret);
        return ret;
}

/* Emit the released trace event, then drop the read lock on @mm's mmap_lock. */
static inline void mmap_read_unlock(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read(&mm->mmap_lock);
}

/*
 * Like mmap_read_unlock(), but releases through up_read_non_owner().
 * NOTE(review): presumably for a context other than the one that took the
 * read lock - confirm against the callers.
 */
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
{
        __mmap_lock_trace_released(mm, false);
        up_read_non_owner(&mm->mmap_lock);
}

/* Returns the result of rwsem_is_contended() for @mm's mmap_lock. */
static inline int mmap_lock_is_contended(struct mm_struct *mm)
{
        return rwsem_is_contended(&mm->mmap_lock);
}

#endif /* _LINUX_MMAP_LOCK_H */



















































   35 

























































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
/* SPDX-License-Identifier: GPL-2.0 */
/*
  File: linux/posix_acl.h

  (C) 2002 Andreas Gruenbacher, <a.gruenbacher@computer.org>
*/


#ifndef __LINUX_POSIX_ACL_H
#define __LINUX_POSIX_ACL_H

#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/refcount.h>
#include <uapi/linux/posix_acl.h>

struct user_namespace;

/*
 * One entry of a POSIX ACL.
 * NOTE(review): e_tag presumably holds one of the ACL tag values from
 * <uapi/linux/posix_acl.h> and selects which union member is meaningful;
 * e_perm presumably holds the permission bits - confirm.
 */
struct posix_acl_entry {
        short                        e_tag;
        unsigned short                e_perm;
        /* Anonymous union: an entry stores either a uid or a gid. */
        union {
                kuid_t                e_uid;
                kgid_t                e_gid;
        };
};

/*
 * A reference-counted ACL: a fixed header followed by a_count entries in a
 * flexible array.
 */
struct posix_acl {
        /* New members MUST be added within the struct_group() macro below. */
        struct_group_tagged(posix_acl_hdr, hdr,
                /* Taken by posix_acl_dup(), dropped by posix_acl_release(). */
                refcount_t                a_refcount;
                /* Number of elements in a_entries[]. */
                unsigned int                a_count;
                /* Used by posix_acl_release() to free the ACL via kfree_rcu(). */
                struct rcu_head                a_rcu;
        );
        struct posix_acl_entry        a_entries[] __counted_by(a_count);
};
/* Build-time check that the entries start right after the grouped header. */
static_assert(offsetof(struct posix_acl, a_entries) == sizeof(struct posix_acl_hdr),
              "struct member likely outside of struct_group_tagged()");

/*
 * Iterate over every entry of @acl.
 * @pa:  entry-pointer lvalue used as the cursor; points at the current entry
 * @acl: pointer to the ACL to walk (evaluated twice)
 * @pe:  entry-pointer lvalue set to one past the last entry
 *
 * Expands to a bare for-statement header; the loop body follows the macro.
 * Every argument is parenthesized so that lvalue expressions such as `*pp`
 * can be passed as the cursor without precedence surprises.
 */
#define FOREACH_ACL_ENTRY(pa, acl, pe) \
        for ((pa) = (acl)->a_entries, (pe) = (pa) + (acl)->a_count; \
             (pa) < (pe); (pa)++)


/*
 * Duplicate an ACL handle.
 */
static inline struct posix_acl *
posix_acl_dup(struct posix_acl *acl)
{
        if (acl)
                refcount_inc(&acl->a_refcount);
        return acl;
}

/*
 * Drop one reference on @acl.  When the last reference goes away the ACL is
 * freed through kfree_rcu().  A NULL @acl is ignored.
 */
static inline void
posix_acl_release(struct posix_acl *acl)
{
        if (!acl)
                return;

        /* Other holders remain: nothing to free yet. */
        if (!refcount_dec_and_test(&acl->a_refcount))
                return;

        kfree_rcu(acl, a_rcu);
}


/* posix_acl.c */

extern void posix_acl_init(struct posix_acl *, int);
extern struct posix_acl *posix_acl_alloc(unsigned int count, gfp_t flags);
extern struct posix_acl *posix_acl_from_mode(umode_t, gfp_t);
extern int posix_acl_equiv_mode(const struct posix_acl *, umode_t *);
extern int __posix_acl_create(struct posix_acl **, gfp_t, umode_t *);
extern int __posix_acl_chmod(struct posix_acl **, gfp_t, umode_t);

extern struct posix_acl *get_posix_acl(struct inode *, int);
int set_posix_acl(struct mnt_idmap *, struct dentry *, int,
                  struct posix_acl *);

struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type);
struct posix_acl *posix_acl_clone(const struct posix_acl *acl, gfp_t flags);

#ifdef CONFIG_FS_POSIX_ACL
int posix_acl_chmod(struct mnt_idmap *, struct dentry *, umode_t);
extern int posix_acl_create(struct inode *, umode_t *, struct posix_acl **,
                struct posix_acl **);
int posix_acl_update_mode(struct mnt_idmap *, struct inode *, umode_t *,
                          struct posix_acl **);

int simple_set_acl(struct mnt_idmap *, struct dentry *,
                   struct posix_acl *, int);
extern int simple_acl_create(struct inode *, struct inode *);

struct posix_acl *get_cached_acl(struct inode *inode, int type);
void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl);
void forget_cached_acl(struct inode *inode, int type);
void forget_all_cached_acls(struct inode *inode);
int posix_acl_valid(struct user_namespace *, const struct posix_acl *);
int posix_acl_permission(struct mnt_idmap *, struct inode *,
                         const struct posix_acl *, int);

/*
 * Set both ACL pointers of @inode (i_acl and i_default_acl) to NULL.
 * NOTE(review): presumably NULL records "known to have no ACL" rather than
 * "not cached yet" - confirm against the ACL cache helpers.
 */
static inline void cache_no_acl(struct inode *inode)
{
        inode->i_acl = NULL;
        inode->i_default_acl = NULL;
}

int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                const char *acl_name, struct posix_acl *kacl);
struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *acl_name);
int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
                   const char *acl_name);
int posix_acl_listxattr(struct inode *inode, char **buffer,
                        ssize_t *remaining_size);
#else
/*
 * !CONFIG_FS_POSIX_ACL stubs.  The chmod/create/listxattr helpers report
 * success without doing anything, and the vfs_*_acl() entry points fail
 * with -EOPNOTSUPP.
 */
static inline int posix_acl_chmod(struct mnt_idmap *idmap,
                                  struct dentry *dentry, umode_t mode)
{
        return 0;
}

/* No set_acl implementation is provided in this configuration. */
#define simple_set_acl                NULL

static inline int simple_acl_create(struct inode *dir, struct inode *inode)
{
        return 0;
}
static inline void cache_no_acl(struct inode *inode)
{
}

/* Reports "no ACLs": both output pointers are set to NULL. */
static inline int posix_acl_create(struct inode *inode, umode_t *mode,
                struct posix_acl **default_acl, struct posix_acl **acl)
{
        *default_acl = *acl = NULL;
        return 0;
}

static inline void forget_all_cached_acls(struct inode *inode)
{
}

static inline int vfs_set_acl(struct mnt_idmap *idmap,
                              struct dentry *dentry, const char *name,
                              struct posix_acl *acl)
{
        return -EOPNOTSUPP;
}

/* The error is returned encoded in the pointer via ERR_PTR(). */
static inline struct posix_acl *vfs_get_acl(struct mnt_idmap *idmap,
                                            struct dentry *dentry,
                                            const char *acl_name)
{
        return ERR_PTR(-EOPNOTSUPP);
}

static inline int vfs_remove_acl(struct mnt_idmap *idmap,
                                 struct dentry *dentry, const char *acl_name)
{
        return -EOPNOTSUPP;
}
static inline int posix_acl_listxattr(struct inode *inode, char **buffer,
                                      ssize_t *remaining_size)
{
        return 0;
}
#endif /* CONFIG_FS_POSIX_ACL */

struct posix_acl *get_inode_acl(struct inode *inode, int type);

#endif  /* __LINUX_POSIX_ACL_H */

















































































































































































































    4 



    4 














































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
// SPDX-License-Identifier: GPL-2.0
/*
 * This file contains the procedures for the handling of select and poll
 *
 * Created for Linux based loosely upon Mathius Lattner's minix
 * patches by Peter MacDonald. Heavily edited by Linus.
 *
 *  4 February 1994
 *     COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
 *     flag set in its personality we do *not* modify the given timeout
 *     parameter to reflect time remaining.
 *
 *  24 January 2000
 *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
 *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
 */

#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/sched/rt.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/personality.h> /* for STICKY_TIMEOUTS */
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/fs.h>
#include <linux/rcupdate.h>
#include <linux/hrtimer.h>
#include <linux/freezer.h>
#include <net/busy_poll.h>
#include <linux/vmalloc.h>

#include <linux/uaccess.h>


/*
 * Estimate expected accuracy in ns from a timespec64.
 *
 * After quite a bit of churning around, we've settled on
 * a simple thing of taking 0.1% of the timeout as the
 * slack, with a cap of 100 msec.
 * "nice" tasks get a 0.5% slack instead.
 *
 * Consider this comment an open invitation to come up with even
 * better solutions.
 */

/* Upper bound on the estimated slack: 100 milliseconds, in nanoseconds. */
#define MAX_SLACK        (100 * NSEC_PER_MSEC)

/*
 * Compute the slack, in nanoseconds, for the relative timeout @tv.
 *
 * The slack is the timeout divided by 1000, or by 200 when the current
 * task has a nice value above zero, and is capped at MAX_SLACK.  A timeout
 * with negative seconds yields 0.
 */
static long __estimate_accuracy(struct timespec64 *tv)
{
        int divisor = 1000;
        long ns_share_per_sec;
        long estimate;

        if (tv->tv_sec < 0)
                return 0;

        if (task_nice(current) > 0)
                divisor /= 5;

        /* Slack contributed by each whole second of timeout. */
        ns_share_per_sec = NSEC_PER_SEC / divisor;

        /* Long timeouts hit the cap without needing the multiplication. */
        if (tv->tv_sec > MAX_SLACK / ns_share_per_sec)
                return MAX_SLACK;

        estimate = tv->tv_nsec / divisor;
        estimate += tv->tv_sec * ns_share_per_sec;

        return estimate > MAX_SLACK ? MAX_SLACK : estimate;
}

u64 select_estimate_accuracy(struct timespec64 *tv)
{
        u64 ret;
        struct timespec64 now;
        u64 slack = current->timer_slack_ns;

        if (slack == 0)
                return 0;

        ktime_get_ts64(&now);
        now = timespec64_sub(*tv, now);
        ret = __estimate_accuracy(&now);
        if (ret < slack)
                return slack;
        return ret;
}



/*
 * One page worth of overflow poll table entries, used once the inline
 * entries of a poll_wqueues are exhausted. Pages are obtained with
 * __get_free_page() in poll_get_entry() and chained through ->next.
 */
struct poll_table_page {
        struct poll_table_page * next;          /* previously allocated page, or NULL */
        struct poll_table_entry * entry;        /* next unused slot in entries[] */
        struct poll_table_entry entries[];      /* slots filling the rest of the page */
};

/*
 * True when one more entry past (table)->entry would extend beyond
 * PAGE_SIZE bytes from the start of the page, i.e. no free slot is left.
 */
#define POLL_TABLE_FULL(table) \
        ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))

/*
 * Ok, Peter made a complicated, but straightforward multiple_wait() function.
 * I have rewritten this, taking some shortcuts: This code may not be easy to
 * follow, but it should be free of race-conditions, and it's practical. If you
 * understand what I'm doing here, then you understand how the linux
 * sleep/wakeup mechanism works.
 *
 * Two very simple procedures, poll_wait() and poll_freewait(), do all the
 * work.  poll_wait() is an inline function defined in <linux/poll.h>,
 * as all select/poll functions have to call it to add an entry to the
 * poll table.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p);

/*
 * Reset @pwq for a fresh poll/select pass: no overflow pages, no inline
 * entries in use, no error, not triggered, owned by the current task, and
 * with __pollwait() installed as the queueing callback.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
        pwq->table = NULL;
        pwq->inline_index = 0;
        pwq->error = 0;
        pwq->triggered = 0;
        pwq->polling_task = current;
        init_poll_funcptr(&pwq->pt, __pollwait);
}
EXPORT_SYMBOL(poll_initwait);

/*
 * Undo what __pollwait() set up for one entry: take it off its wait queue,
 * then drop the file reference it holds.
 */
static void free_poll_entry(struct poll_table_entry *entry)
{
        struct file *held = entry->filp;

        remove_wait_queue(entry->wait_address, &entry->wait);
        fput(held);
}

/*
 * Release every entry registered through @pwq: first the inline entries,
 * then each overflow page (newest entry first), freeing the pages as they
 * are emptied.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
        struct poll_table_page *page, *next_page;
        int idx;

        for (idx = 0; idx < pwq->inline_index; idx++)
                free_poll_entry(&pwq->inline_entries[idx]);

        for (page = pwq->table; page; page = next_page) {
                struct poll_table_entry *cur = page->entry;

                /* Walk back from the last used slot to entries[0]. */
                do {
                        cur--;
                        free_poll_entry(cur);
                } while (cur > page->entries);

                next_page = page->next;
                free_page((unsigned long) page);
        }
}
EXPORT_SYMBOL(poll_freewait);

/*
 * Hand out the next free poll table entry of @p.
 *
 * Inline entries are used first. After that entries come from the newest
 * overflow page; a new page is allocated when there is none yet or the
 * newest one is full. On allocation failure p->error is set to -ENOMEM
 * and NULL is returned.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
        struct poll_table_page *cur = p->table;
        struct poll_table_page *fresh;

        if (p->inline_index < N_INLINE_POLL_ENTRIES)
                return &p->inline_entries[p->inline_index++];

        if (cur && !POLL_TABLE_FULL(cur))
                return cur->entry++;

        fresh = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!fresh) {
                p->error = -ENOMEM;
                return NULL;
        }

        /* Push the new page onto the front of the list. */
        fresh->entry = fresh->entries;
        fresh->next = cur;
        p->table = fresh;

        return fresh->entry++;
}

/*
 * Wake-up helper used by pollwake(): marks the owning poll_wqueues as
 * triggered and wakes its polling task.
 *
 * @wait->private holds the poll_wqueues (stored there by __pollwait()).
 * Returns whatever default_wake_function() returns.
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct poll_wqueues *pwq = wait->private;
        /* On-stack wait entry that targets the polling task. */
        DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

        /*
         * Although this function is called under waitqueue lock, LOCK
         * doesn't imply write barrier and the users expect write
         * barrier semantics on wakeup functions.  The following
         * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
         * and is paired with smp_store_mb() in poll_schedule_timeout.
         */
        smp_wmb();
        pwq->triggered = 1;

        /*
         * Perform the default wake up operation using a dummy
         * waitqueue.
         *
         * TODO: This is hacky but there currently is no interface to
         * pass in @sync.  @sync is scheduled to be removed and once
         * that happens, wake_up_process() can be used directly.
         */
        return default_wake_function(&dummy_wait, mode, sync, key);
}

/*
 * Wait queue callback installed by __pollwait().
 *
 * A wake-up that carries a key is ignored (returns 0) unless the key
 * overlaps the events this entry was registered for; a wake-up without a
 * key always goes through to __pollwake().
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
        struct poll_table_entry *entry =
                container_of(wait, struct poll_table_entry, wait);

        if (!key || (key_to_poll(key) & entry->key))
                return __pollwake(wait, mode, sync, key);

        return 0;
}

/*
 * Queueing callback: register the polling task on @wait_address.
 *
 * Takes a poll table entry from the poll_wqueues that embeds @p, pins
 * @filp with get_file(), records the wait queue and the requested event
 * key, and adds the entry to the wait queue with pollwake() as its wake
 * function. If no entry can be obtained nothing is queued.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                                poll_table *p)
{
        struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
        struct poll_table_entry *slot = poll_get_entry(pwq);

        if (slot == NULL)
                return;

        init_waitqueue_func_entry(&slot->wait, pollwake);
        slot->wait.private = pwq;
        slot->key = p->_key;
        slot->wait_address = wait_address;
        slot->filp = get_file(filp);

        /* The entry must be fully set up before it becomes visible. */
        add_wait_queue(wait_address, &slot->wait);
}

/*
 * Put the current task to sleep in @state unless a wake-up has already
 * set pwq->triggered.
 *
 * @expires and @slack are handed to schedule_hrtimeout_range() with
 * HRTIMER_MODE_ABS.
 *
 * Returns -EINTR when the sleep was skipped because pwq->triggered was
 * already set, otherwise the return value of schedule_hrtimeout_range().
 * The callers in this file treat a return of 0 as "timed out".
 * pwq->triggered is cleared before returning.
 */
static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
                          ktime_t *expires, unsigned long slack)
{
        int rc = -EINTR;

        /* Set the task state before testing ->triggered. */
        set_current_state(state);
        if (!pwq->triggered)
                rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);
        __set_current_state(TASK_RUNNING);

        /*
         * Prepare for the next iteration.
         *
         * The following smp_store_mb() serves two purposes.  First, it's
         * the counterpart rmb of the wmb in pollwake() such that data
         * written before wake up is always visible after wake up.
         * Second, the full barrier guarantees that triggered clearing
         * doesn't pass event check of the next iteration.  Note that
         * this problem doesn't exist for the first iteration as
         * add_wait_queue() has full barrier semantics.
         */
        smp_store_mb(pwq->triggered, 0);

        return rc;
}

/**
 * poll_select_set_timeout - helper function to set up the timeout value
 * @to:                pointer to timespec64 variable for the final timeout
 * @sec:        seconds (from user space)
 * @nsec:        nanoseconds (from user space)
 *
 * Note, we do not use a timespec for the user space value here. That
 * way we can use the function for timeval and compat interfaces as well.
 *
 * A zero @sec/@nsec pair is stored in @to as zero; any other valid value
 * is added to the current time so that @to holds an absolute expiry.
 *
 * Returns -EINVAL if sec/nsec are not normalized. Otherwise 0.
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
        struct timespec64 rel = {.tv_sec = sec, .tv_nsec = nsec};
        struct timespec64 now;

        if (!timespec64_valid(&rel))
                return -EINVAL;

        /* Optimize for the zero timeout value here */
        if (sec == 0 && nsec == 0) {
                to->tv_sec = 0;
                to->tv_nsec = 0;
                return 0;
        }

        ktime_get_ts64(&now);
        *to = timespec64_add_safe(now, rel);
        return 0;
}

/*
 * Layout of the user-space timeout object that poll_select_finish()
 * writes the remaining time back to.
 */
enum poll_time_type {
        PT_TIMEVAL = 0,         /* struct __kernel_old_timeval */
        PT_OLD_TIMEVAL = 1,     /* struct old_timeval32 */
        PT_TIMESPEC = 2,        /* written with put_timespec64() */
        PT_OLD_TIMESPEC = 3,    /* written with put_old_timespec32() */
};

/*
 * Common tail of the select/poll family: restore the saved signal mask,
 * optionally write the remaining time back to the user's timeout object
 * and translate the return code.
 *
 * @end_time: absolute expiry of the call (only read when @p is non-NULL)
 * @p:        user pointer to the timeout object, or NULL if there is none
 * @pt_type:  layout of the object behind @p
 * @ret:      result of the core select/poll operation
 *
 * Returns @ret, except that -ERESTARTNOHAND is turned into -EINTR when the
 * remaining time was not written back (STICKY_TIMEOUTS personality, or the
 * copy to user space failed).
 */
static int poll_select_finish(struct timespec64 *end_time,
                              void __user *p,
                              enum poll_time_type pt_type, int ret)
{
        struct timespec64 rts;

        /* The saved mask is restored unless ret is -ERESTARTNOHAND. */
        restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);

        /* No timeout object: nothing to update, ret is passed through. */
        if (!p)
                return ret;

        if (current->personality & STICKY_TIMEOUTS)
                goto sticky;

        /* No update for zero timeout */
        if (!end_time->tv_sec && !end_time->tv_nsec)
                return ret;

        /* Remaining time = expiry - now, clamped to zero once expired. */
        ktime_get_ts64(&rts);
        rts = timespec64_sub(*end_time, rts);
        if (rts.tv_sec < 0)
                rts.tv_sec = rts.tv_nsec = 0;


        /*
         * Each case returns ret directly when the copy succeeded; a failed
         * copy breaks out of the switch and falls through to "sticky".
         */
        switch (pt_type) {
        case PT_TIMEVAL:
                {
                        struct __kernel_old_timeval rtv;

                        /*
                         * If the struct is larger than its two members,
                         * zero it first so the extra bytes copied to user
                         * space below are not uninitialized.
                         */
                        if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
                                memset(&rtv, 0, sizeof(rtv));
                        rtv.tv_sec = rts.tv_sec;
                        rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
                        if (!copy_to_user(p, &rtv, sizeof(rtv)))
                                return ret;
                }
                break;
        case PT_OLD_TIMEVAL:
                {
                        struct old_timeval32 rtv;

                        rtv.tv_sec = rts.tv_sec;
                        rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
                        if (!copy_to_user(p, &rtv, sizeof(rtv)))
                                return ret;
                }
                break;
        case PT_TIMESPEC:
                if (!put_timespec64(&rts, p))
                        return ret;
                break;
        case PT_OLD_TIMESPEC:
                if (!put_old_timespec32(&rts, p))
                        return ret;
                break;
        default:
                BUG();
        }
        /*
         * If an application puts its timeval in read-only memory, we
         * don't want the Linux-specific update to the timeval to
         * cause a fault after the select has completed
         * successfully. However, because we're not updating the
         * timeval, we can't restart the system call.
         */

sticky:
        /* The timeout was not updated, so report EINTR instead of restarting. */
        if (ret == -ERESTARTNOHAND)
                ret = -EINTR;
        return ret;
}

/*
 * Scalable version of the fd_set.
 */

/*
 * The six bitmaps select works on. core_sys_select() carves them out of
 * one buffer, each FDS_BYTES(n) bytes long.
 */
typedef struct {
        unsigned long *in, *out, *ex;                   /* requested sets, filled by get_fd_set() */
        unsigned long *res_in, *res_out, *res_ex;       /* result sets, written by do_select() */
} fd_set_bits;

/*
 * Sizing helpers for a bitmap covering "nr" descriptors: the number of
 * bits in a long, the number of longs needed (rounded up), and the same
 * amount expressed in bytes.
 */
#define FDS_BITPERLONG        (sizeof(long) * 8)
#define FDS_LONGS(nr)        (((nr) + (FDS_BITPERLONG - 1)) / FDS_BITPERLONG)
#define FDS_BYTES(nr)        (sizeof(long) * FDS_LONGS(nr))

/*
 * Fill the kernel bitmap @fdset for @nr descriptors.
 *
 * With a user pointer the bitmap is copied in whole longs (this lets
 * user-mode fd_set's be long-aligned); a NULL @ufdset yields an all-zero
 * bitmap instead.
 *
 * Returns 0 on success or -EFAULT if the copy from user space fails.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
        unsigned long nbytes = FDS_BYTES(nr);

        if (!ufdset) {
                memset(fdset, 0, nbytes);
                return 0;
        }

        if (copy_from_user(fdset, ufdset, nbytes))
                return -EFAULT;

        return 0;
}

/*
 * Copy the kernel bitmap @fdset for @nr descriptors back to user space.
 * A NULL @ufdset is skipped. Returns the result of __copy_to_user(),
 * or 0 when there was nothing to copy.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
        if (!ufdset)
                return 0;

        return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}

/* Clear the FDS_BYTES(nr) bytes of the kernel bitmap @fdset. */
static inline
void zero_fd_set(unsigned long nr, unsigned long *fdset)
{
        unsigned long nbytes = FDS_BYTES(nr);

        memset(fdset, 0, nbytes);
}

/*
 * Address of the n-th long of the in/out/ex bitmap of an fd_set_bits
 * pointer. Arguments are parenthesized so that expressions such as
 * "&set" or "a ? b : c" expand correctly.
 */
#define FDS_IN(fds, n)                ((fds)->in + (n))
#define FDS_OUT(fds, n)                ((fds)->out + (n))
#define FDS_EX(fds, n)                ((fds)->ex + (n))

/* Union of the n-th long of all three requested bitmaps. */
#define BITS(fds, n)        (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * Check the requested descriptor sets against the open-fd bitmap of the
 * current task and find the highest requested descriptor.
 *
 * @n:   number of descriptors covered by the bitmaps in @fds
 * @fds: the in/out/ex bitmaps to examine
 *
 * Returns -EBADF if a requested bit is not set in fdt->open_fds;
 * otherwise one more than the highest requested descriptor number, or 0
 * if no bit is requested. do_select() calls this under rcu_read_lock().
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
        unsigned long *open_fds;
        unsigned long set;
        int max;
        struct fdtable *fdt;

        /* handle last in-complete long-word first */
        /* Mask covering the (n % BITS_PER_LONG) low bits; 0 if n is word-aligned. */
        set = ~(~0UL << (n & (BITS_PER_LONG-1)));
        n /= BITS_PER_LONG;     /* from here on n counts whole longs */
        fdt = files_fdtable(current->files);
        open_fds = fdt->open_fds + n;
        max = 0;
        if (set) {
                set &= BITS(fds, n);
                if (set) {
                        /* All requested bits open: compute max from this word. */
                        if (!(set & ~*open_fds))
                                goto get_max;
                        return -EBADF;
                }
        }
        /* Walk the complete words from the highest down to word 0. */
        while (n) {
                open_fds--;
                n--;
                set = BITS(fds, n);
                if (!set)
                        continue;
                if (set & ~*open_fds)
                        return -EBADF;
                /* max is already known; keep going only to validate. */
                if (max)
                        continue;
get_max:
                /* Position of the highest set bit in this word, plus one. */
                do {
                        max++;
                        set >>= 1;
                } while (set);
                max += n * BITS_PER_LONG;
        }

        return max;
}

/*
 * Poll event masks that count as "ready" for the select read, write and
 * exception sets respectively; used by select_poll_one() and do_select().
 */
#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
                        EPOLLNVAL)
#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
                         EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)

/*
 * Poll a single descriptor on behalf of select.
 *
 * The wait key always contains POLLEX_SET and @ll_flag; POLLIN_SET and
 * POLLOUT_SET are added when @bit is set in @in and @out respectively.
 *
 * Returns EPOLLNVAL if @fd does not refer to an open file, otherwise the
 * mask reported by vfs_poll().
 */
static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in,
                                unsigned long out, unsigned long bit,
                                __poll_t ll_flag)
{
        __poll_t key = POLLEX_SET | ll_flag;
        CLASS(fd, f)(fd);

        if (fd_empty(f))
                return EPOLLNVAL;

        if (in & bit)
                key |= POLLIN_SET;
        if (out & bit)
                key |= POLLOUT_SET;
        wait->_key = key;

        return vfs_poll(fd_file(f), wait);
}

/*
 * Core of select(): poll every requested descriptor, sleeping between
 * passes, until something is ready, the timeout expires, a signal is
 * pending or an error was recorded in the poll table.
 *
 * @n:        number of descriptors covered by the bitmaps
 * @fds:      requested bitmaps (in/out/ex) and result bitmaps (res_*)
 * @end_time: absolute expiry; NULL means no timeout, an all-zero value
 *            means poll once without waiting
 *
 * Returns the number of ready bits recorded in the result bitmaps (a
 * descriptor ready in several sets is counted once per set), 0 on timeout
 * or pending signal, or a negative error from max_select_fd() or
 * table.error.
 */
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
        ktime_t expire, *to = NULL;
        struct poll_wqueues table;
        poll_table *wait;
        int retval, i, timed_out = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Validate the sets and shrink n to highest requested fd + 1. */
        rcu_read_lock();
        retval = max_select_fd(n, fds);
        rcu_read_unlock();

        if (retval < 0)
                return retval;
        n = retval;

        poll_initwait(&table);
        wait = &table.pt;
        /* Zero timeout: don't register on any wait queue, just poll once. */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                wait->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        retval = 0;
        for (;;) {
                unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
                bool can_busy_loop = false;

                inp = fds->in; outp = fds->out; exp = fds->ex;
                rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

                /* One iteration per long of the bitmaps; i is the fd number. */
                for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
                        unsigned long in, out, ex, all_bits, bit = 1, j;
                        unsigned long res_in = 0, res_out = 0, res_ex = 0;
                        __poll_t mask;

                        in = *inp++; out = *outp++; ex = *exp++;
                        all_bits = in | out | ex;
                        /* Nothing requested in this word: skip it entirely. */
                        if (all_bits == 0) {
                                i += BITS_PER_LONG;
                                continue;
                        }

                        for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
                                if (i >= n)
                                        break;
                                if (!(bit & all_bits))
                                        continue;
                                mask = select_poll_one(i, wait, in, out, bit,
                                                       busy_flag);
                                /*
                                 * Once anything is ready, _qproc is cleared so
                                 * later descriptors are polled without being
                                 * added to their wait queues.
                                 */
                                if ((mask & POLLIN_SET) && (in & bit)) {
                                        res_in |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                if ((mask & POLLOUT_SET) && (out & bit)) {
                                        res_out |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                if ((mask & POLLEX_SET) && (ex & bit)) {
                                        res_ex |= bit;
                                        retval++;
                                        wait->_qproc = NULL;
                                }
                                /* got something, stop busy polling */
                                if (retval) {
                                        can_busy_loop = false;
                                        busy_flag = 0;

                                /*
                                 * only remember a returned
                                 * POLL_BUSY_LOOP if we asked for it
                                 */
                                } else if (busy_flag & mask)
                                        can_busy_loop = true;

                        }
                        /* Result words are only written when non-zero. */
                        if (res_in)
                                *rinp = res_in;
                        if (res_out)
                                *routp = res_out;
                        if (res_ex)
                                *rexp = res_ex;
                        cond_resched();
                }
                /* The first pass is over: no further wait queue registration. */
                wait->_qproc = NULL;
                if (retval || timed_out || signal_pending(current))
                        break;
                if (table.error) {
                        retval = table.error;
                        break;
                }

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* A return of 0 is treated as timeout: poll once more, then stop. */
                if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                                           to, slack))
                        timed_out = 1;
        }

        poll_freewait(&table);

        return retval;
}

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND, which restarts only when you want it to.
 *
 * core_sys_select() copies the three user fd_sets into kernel bitmaps,
 * runs do_select() on them and copies the result bitmaps back.
 *
 * @n:              number of descriptors; clamped to the fdtable's max_fds
 * @inp/@outp/@exp: user fd_sets, any of which may be NULL
 * @end_time:       absolute expiry, or NULL for no timeout
 *
 * Returns the do_select() result, or -EINVAL for negative @n, -ENOMEM if
 * the bitmaps cannot be allocated, -EFAULT if a user copy fails, or
 * -ERESTARTNOHAND if nothing was ready and a signal is pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
                           fd_set __user *exp, struct timespec64 *end_time)
{
        fd_set_bits fds;
        void *bits;
        int ret, max_fds;
        size_t size, alloc_size;
        struct fdtable *fdt;
        /* Allocate small arguments on the stack to save memory and be faster */
        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

        ret = -EINVAL;
        if (n < 0)
                goto out_nofds;

        /* max_fds can increase, so grab it once to avoid race */
        rcu_read_lock();
        fdt = files_fdtable(current->files);
        max_fds = fdt->max_fds;
        rcu_read_unlock();
        if (n > max_fds)
                n = max_fds;

        /*
         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
         * since we used fdset we need to allocate memory in units of
         * long-words.
         */
        size = FDS_BYTES(n);
        bits = stack_fds;
        if (size > sizeof(stack_fds) / 6) {
                /* Not enough space in on-stack array; must use kmalloc */
                ret = -ENOMEM;
                /* Guard the 6 * size multiplication against overflow. */
                if (size > (SIZE_MAX / 6))
                        goto out_nofds;

                alloc_size = 6 * size;
                bits = kvmalloc(alloc_size, GFP_KERNEL);
                if (!bits)
                        goto out_nofds;
        }
        /* Carve the buffer into six consecutive bitmaps of "size" bytes. */
        fds.in      = bits;
        fds.out     = bits +   size;
        fds.ex      = bits + 2*size;
        fds.res_in  = bits + 3*size;
        fds.res_out = bits + 4*size;
        fds.res_ex  = bits + 5*size;

        if ((ret = get_fd_set(n, inp, fds.in)) ||
            (ret = get_fd_set(n, outp, fds.out)) ||
            (ret = get_fd_set(n, exp, fds.ex)))
                goto out;
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, end_time);

        if (ret < 0)
                goto out;
        if (!ret) {
                /* Nothing ready: distinguish a pending signal from a timeout. */
                ret = -ERESTARTNOHAND;
                if (signal_pending(current))
                        goto out;
                ret = 0;
        }

        if (set_fd_set(n, inp, fds.res_in) ||
            set_fd_set(n, outp, fds.res_out) ||
            set_fd_set(n, exp, fds.res_ex))
                ret = -EFAULT;

out:
        /* Only free the buffer if it did not come from the stack array. */
        if (bits != stack_fds)
                kvfree(bits);
out_nofds:
        return ret;
}

/*
 * select() with a struct __kernel_old_timeval timeout.
 *
 * When @tvp is given, the timeval is read from user space and its
 * microsecond part is folded into seconds plus nanoseconds before the
 * absolute expiry is computed. Returns -EFAULT if the timeval cannot be
 * read, -EINVAL if poll_select_set_timeout() rejects it, otherwise the
 * result of core_sys_select() as post-processed by poll_select_finish().
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, struct __kernel_old_timeval __user *tvp)
{
        struct timespec64 end_time, *to = NULL;
        struct __kernel_old_timeval tv;
        int ret;

        if (tvp) {
                time64_t sec;
                long nsec;

                if (copy_from_user(&tv, tvp, sizeof(tv)))
                        return -EFAULT;

                /* tv_usec may hold whole seconds; carry them over. */
                sec = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
                nsec = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

                to = &end_time;
                if (poll_select_set_timeout(to, sec, nsec))
                        return -EINVAL;
        }

        ret = core_sys_select(n, inp, outp, exp, to);

        return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret);
}

/* select() system call entry point; all work is done by kern_select(). */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct __kernel_old_timeval __user *, tvp)
{
        return kern_select(n, inp, outp, exp, tvp);
}

/*
 * Common implementation of the pselect6 variants.
 *
 * @tsp:  user timeout, read according to @type (PT_TIMESPEC or
 *        PT_OLD_TIMESPEC; any other type is a BUG), or NULL
 * @sigmask/@sigsetsize: passed to set_user_sigmask()
 *
 * Returns -EFAULT if the timeout cannot be read, -EINVAL if it is
 * rejected by poll_select_set_timeout(), the error of set_user_sigmask()
 * if that fails, otherwise the result of core_sys_select() as
 * post-processed by poll_select_finish().
 */
static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
                       fd_set __user *exp, void __user *tsp,
                       const sigset_t __user *sigmask, size_t sigsetsize,
                       enum poll_time_type type)
{
        struct timespec64 rel, end_time, *to = NULL;
        int ret;

        if (tsp) {
                int fetch_err = 0;

                switch (type) {
                case PT_OLD_TIMESPEC:
                        fetch_err = get_old_timespec32(&rel, tsp);
                        break;
                case PT_TIMESPEC:
                        fetch_err = get_timespec64(&rel, tsp);
                        break;
                default:
                        BUG();
                }
                if (fetch_err)
                        return -EFAULT;

                to = &end_time;
                if (poll_select_set_timeout(to, rel.tv_sec, rel.tv_nsec))
                        return -EINVAL;
        }

        ret = set_user_sigmask(sigmask, sigsetsize);
        if (ret)
                return ret;

        ret = core_sys_select(n, inp, outp, exp, to);

        return poll_select_finish(&end_time, tsp, type, ret);
}

/*
 * Most architectures can't handle 7-argument syscalls. So we provide a
 * 6-argument version where the sixth argument is a pointer to a structure
 * which has a pointer to the sigset_t itself followed by a size_t containing
 * the sigset size.
 */
/* Kernel-side copy of the sixth pselect6 argument. */
struct sigset_argpack {
        sigset_t __user *p;     /* user pointer to the signal mask */
        size_t size;            /* size of the signal mask */
};

/*
 * Read a struct sigset_argpack from user space into @to.
 *
 * A NULL @from is accepted: @to is left untouched and 0 is returned.
 * Returns 0 on success, -EFAULT if the user memory cannot be accessed.
 */
static inline int get_sigset_argpack(struct sigset_argpack *to,
                                     struct sigset_argpack __user *from)
{
        // the path is hot enough for overhead of copy_from_user() to matter
        if (from) {
                /* Open a user read-access section, masked if available. */
                if (can_do_masked_user_access())
                        from = masked_user_access_begin(from);
                else if (!user_read_access_begin(from, sizeof(*from)))
                        return -EFAULT;
                /* Both fetches jump to Efault on a fault. */
                unsafe_get_user(to->p, &from->p, Efault);
                unsafe_get_user(to->size, &from->size, Efault);
                user_read_access_end();
        }
        return 0;
Efault:
        /* The access section must be closed on the error path as well. */
        user_read_access_end();
        return -EFAULT;
}

/*
 * pselect6() system call with a struct __kernel_timespec timeout.
 * @sig points to a struct sigset_argpack (or is NULL, in which case a NULL
 * mask pointer and size 0 are passed on). Returns -EFAULT if @sig cannot
 * be read, otherwise the result of do_pselect().
 */
SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct __kernel_timespec __user *, tsp,
                void __user *, sig)
{
        struct sigset_argpack x = {NULL, 0};

        if (get_sigset_argpack(&x, sig))
                return -EFAULT;

        return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/*
 * pselect6() variant taking a struct old_timespec32 timeout; otherwise
 * identical to pselect6 (the timeout is handled as PT_OLD_TIMESPEC).
 */
SYSCALL_DEFINE6(pselect6_time32, int, n, fd_set __user *, inp, fd_set __user *, outp,
                fd_set __user *, exp, struct old_timespec32 __user *, tsp,
                void __user *, sig)
{
        struct sigset_argpack x = {NULL, 0};

        if (get_sigset_argpack(&x, sig))
                return -EFAULT;

        return do_pselect(n, inp, outp, exp, tsp, x.p, x.size, PT_OLD_TIMESPEC);
}

#endif

#ifdef __ARCH_WANT_SYS_OLD_SELECT
/* Argument block of old_select(): the five select arguments in memory. */
struct sel_arg_struct {
        unsigned long n;                        /* number of descriptors */
        fd_set __user *inp, *outp, *exp;        /* read / write / exception sets */
        struct __kernel_old_timeval __user *tvp;        /* timeout, may be NULL */
};

/*
 * old_select() system call: the select arguments are passed indirectly
 * through a user-space struct sel_arg_struct. Returns -EFAULT if the
 * struct cannot be read, otherwise the result of kern_select().
 */
SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
{
        struct sel_arg_struct a;

        if (copy_from_user(&a, arg, sizeof(a)))
                return -EFAULT;
        /* NOTE(review): a.n is unsigned long but kern_select() takes int. */
        return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp);
}
#endif

/*
 * One chunk of pollfd entries copied in from user space; do_sys_poll()
 * chains chunks through ->next.
 */
struct poll_list {
        struct poll_list *next;         /* next chunk, or NULL for the last one */
        unsigned int len;               /* number of valid entries[] */
        struct pollfd entries[] __counted_by(len);
};

/* Number of pollfd entries that fit in one page after the poll_list header. */
#define POLLFD_PER_PAGE  ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd))

/*
 * Poll the file behind pollfd->fd for the events requested in
 * pollfd->events. EPOLLERR and EPOLLHUP are always part of the filter.
 * The pwait poll_table is handed to the file's poll handler, which uses it
 * for waiting if pwait->_qproc is non-NULL.
 *
 * Returns 0 for a negative fd, EPOLLNVAL if the fd is not open, otherwise
 * the reported events restricted to the filter. *can_busy_poll is set to
 * true when the file reports @busy_flag.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        int fd = pollfd->fd;
        __poll_t wanted, ready;

        if (fd < 0)
                return 0;

        CLASS(fd, f)(fd);
        if (fd_empty(f))
                return EPOLLNVAL;

        /* userland u16 ->events contains POLL... bitmap */
        wanted = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
        pwait->_key = wanted | busy_flag;

        ready = vfs_poll(fd_file(f), pwait);
        if (ready & busy_flag)
                *can_busy_poll = true;

        /* Mask out unneeded events. */
        return ready & wanted;
}

/*
 * Core of poll(): poll every pollfd in @list, sleeping between passes,
 * until an event is found, the timeout expires, a signal is pending or an
 * error was recorded in @wait.
 *
 * @list:     chain of pollfd chunks; ->revents of every entry is updated
 * @wait:     poll table set up by the caller with poll_initwait()
 * @end_time: absolute expiry; NULL means no timeout, an all-zero value
 *            means poll once without waiting
 *
 * Returns the number of entries with a non-zero event mask. If there are
 * none: -ERESTARTNOHAND when a signal is pending, else wait->error if it
 * is set, else 0 on timeout.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);

        for (;;) {
                struct poll_list *walk;
                bool can_busy_loop = false;

                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {
                                __poll_t mask;
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag);
                                pfd->revents = mangle_poll(mask);
                                if (mask) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                /* No events: report a recorded error or a pending signal. */
                if (!count) {
                        count = wait->error;
                        if (signal_pending(current))
                                count = -ERESTARTNOHAND;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                /* A return of 0 is treated as timeout: poll once more, then stop. */
                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))
                        timed_out = 1;
        }
        return count;
}

/*
 * Number of pollfd entries that fit in the on-stack stack_pps buffer of
 * do_sys_poll() after the poll_list header. Only usable where a variable
 * named stack_pps is in scope.
 */
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list))  / \
                        sizeof(struct pollfd))

/*
 * Copy @nfds pollfd entries in from user space, run do_poll() on them and
 * write the resulting ->revents back.
 *
 * The first N_STACK_PPS entries live in an on-stack buffer; the rest are
 * copied into kmalloc'ed chunks of at most POLLFD_PER_PAGE entries.
 *
 * Returns the do_poll() result, or -EINVAL if @nfds exceeds
 * RLIMIT_NOFILE, -EFAULT if user memory cannot be accessed, or -ENOMEM
 * if a chunk cannot be allocated.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                struct timespec64 *end_time)
{
        struct poll_wqueues table;
        int err = -EFAULT, fdcount;
        /* Allocate small arguments on the stack to save memory and be
           faster - use long to make sure the buffer is aligned properly
           on 64 bit archs to avoid unaligned access */
        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
        struct poll_list *const head = (struct poll_list *)stack_pps;
         struct poll_list *walk = head;
        unsigned int todo = nfds;       /* entries not yet copied in */
        unsigned int len;

        if (nfds > rlimit(RLIMIT_NOFILE))
                return -EINVAL;

        len = min_t(unsigned int, nfds, N_STACK_PPS);
        for (;;) {
                walk->next = NULL;
                walk->len = len;
                if (!len)
                        break;

                /* nfds - todo entries have already been consumed. */
                if (copy_from_user(walk->entries, ufds + nfds-todo,
                                        sizeof(struct pollfd) * walk->len))
                        goto out_fds;

                if (walk->len >= todo)
                        break;
                todo -= walk->len;

                len = min(todo, POLLFD_PER_PAGE);
                walk = walk->next = kmalloc(struct_size(walk, entries, len),
                                            GFP_KERNEL);
                if (!walk) {
                        err = -ENOMEM;
                        goto out_fds;
                }
        }

        poll_initwait(&table);
        fdcount = do_poll(head, &table, end_time);
        poll_freewait(&table);

        /* err is still -EFAULT here if the write access check fails. */
        if (!user_write_access_begin(ufds, nfds * sizeof(*ufds)))
                goto out_fds;

        /* Only ->revents is written back; ufds advances across chunks. */
        for (walk = head; walk; walk = walk->next) {
                struct pollfd *fds = walk->entries;
                unsigned int j;

                for (j = walk->len; j; fds++, ufds++, j--)
                        unsafe_put_user(fds->revents, &ufds->revents, Efault);
          }
        user_write_access_end();

        err = fdcount;
out_fds:
        /* head itself is on the stack; free only the chunks after it. */
        walk = head->next;
        while (walk) {
                struct poll_list *pos = walk;
                walk = walk->next;
                kfree(pos);
        }

        return err;

Efault:
        /* Fault inside the write-access section: close it, then clean up. */
        user_write_access_end();
        err = -EFAULT;
        goto out_fds;
}

static long do_restart_poll(struct restart_block *restart_block)
{
        struct pollfd __user *ufds = restart_block->poll.ufds;
        int nfds = restart_block->poll.nfds;
        struct timespec64 *to = NULL, end_time;
        int ret;

        if (restart_block->poll.has_timeout) {
                end_time.tv_sec = restart_block->poll.tv_sec;
                end_time.tv_nsec = restart_block->poll.tv_nsec;
                to = &end_time;
        }

        ret = do_sys_poll(ufds, nfds, to);

        if (ret == -ERESTARTNOHAND)
                ret = set_restart_fn(restart_block, do_restart_poll);

        return ret;
}

/*
 * poll() entry point. A negative @timeout_msecs means "no timeout"; otherwise
 * the millisecond value is split into seconds and nanoseconds for
 * poll_select_set_timeout(). When do_sys_poll() reports -ERESTARTNOHAND the
 * arguments (and the end time, if any) are saved in the task's restart block
 * so do_restart_poll() can replay the call.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 deadline, *timeout = NULL;
        struct restart_block *rb;
        int rc;

        if (timeout_msecs >= 0) {
                timeout = &deadline;
                poll_select_set_timeout(timeout, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        rc = do_sys_poll(ufds, nfds, timeout);
        if (rc != -ERESTARTNOHAND)
                return rc;

        /* Interrupted: record everything needed to restart the call. */
        rb = &current->restart_block;
        rb->poll.ufds = ufds;
        rb->poll.nfds = nfds;

        if (timeout) {
                rb->poll.tv_sec = deadline.tv_sec;
                rb->poll.tv_nsec = deadline.tv_nsec;
                rb->poll.has_timeout = 1;
        } else {
                rb->poll.has_timeout = 0;
        }

        rc = set_restart_fn(rb, do_restart_poll);
        return rc;
}

/*
 * ppoll() entry point. Reads the optional timeout from @tsp, installs the
 * caller's signal mask via set_user_sigmask(), runs do_sys_poll() and hands
 * its result to poll_select_finish(), which produces the final return value.
 *
 * Returns -EFAULT if @tsp cannot be read and -EINVAL if
 * poll_select_set_timeout() rejects the value.
 */
SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
                struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 requested;
        struct timespec64 deadline;
        int rc;

        if (tsp) {
                if (get_timespec64(&requested, tsp))
                        return -EFAULT;

                timeout = &deadline;
                if (poll_select_set_timeout(timeout, requested.tv_sec,
                                            requested.tv_nsec))
                        return -EINVAL;
        }

        rc = set_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = do_sys_poll(ufds, nfds, timeout);

        return poll_select_finish(&deadline, tsp, PT_TIMESPEC, rc);
}

#if defined(CONFIG_COMPAT_32BIT_TIME) && !defined(CONFIG_64BIT)

/*
 * ppoll() variant whose timeout is a struct old_timespec32. Apart from how
 * the timeout is read (get_old_timespec32()) and the PT_OLD_TIMESPEC type
 * passed to poll_select_finish(), the flow matches ppoll.
 */
SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
                struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
                size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 requested;
        struct timespec64 deadline;
        int rc;

        if (tsp) {
                if (get_old_timespec32(&requested, tsp))
                        return -EFAULT;

                timeout = &deadline;
                if (poll_select_set_timeout(timeout, requested.tv_sec,
                                            requested.tv_nsec))
                        return -EINVAL;
        }

        rc = set_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = do_sys_poll(ufds, nfds, timeout);

        return poll_select_finish(&deadline, tsp, PT_OLD_TIMESPEC, rc);
}
#endif

#ifdef CONFIG_COMPAT
#define __COMPAT_NFDBITS       (8 * sizeof(compat_ulong_t))

/*
 * Ooo, nasty.  We need here to frob 32-bit unsigned longs to
 * 64-bit unsigned longs.
 */
/*
 * Fill @fdset with @nr bits: taken from the compat bitmap at @ufdset when one
 * is supplied, otherwise zeroed. Returns whatever compat_get_bitmap() returns,
 * or 0 when there was nothing to copy.
 */
static
int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                        unsigned long *fdset)
{
        if (!ufdset) {
                zero_fd_set(nr, fdset);
                return 0;
        }

        return compat_get_bitmap(fdset, ufdset, nr);
}

/*
 * Write @nr bits of @fdset to the compat bitmap at @ufdset. A NULL @ufdset
 * means the caller did not ask for this set, so nothing is written and 0 is
 * returned.
 */
static
int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset,
                      unsigned long *fdset)
{
        return ufdset ? compat_put_bitmap(ufdset, fdset, nr) : 0;
}


/*
 * This is a virtual copy of sys_select from fs/select.c and probably
 * should be compared to it from time to time
 */

/*
 * We can actually return ERESTARTSYS instead of EINTR, but I'd
 * like to be certain this leads to no problems. So I return
 * EINTR just for safety.
 *
 * Update: ERESTARTSYS breaks at least the xview clock binary, so
 * I'm trying ERESTARTNOHAND which restart only when you want to.
 *
 * compat_core_sys_select - run do_select() on compat-format fd sets.
 * @n:        number of descriptors covered by each set; a negative value is
 *            rejected and a value above the fd table size is clamped to it
 * @inp:      compat bitmap read for fds.in and written from fds.res_in (may be NULL)
 * @outp:     compat bitmap read for fds.out and written from fds.res_out (may be NULL)
 * @exp:      compat bitmap read for fds.ex and written from fds.res_ex (may be NULL)
 * @end_time: passed through unchanged to do_select()
 *
 * Returns the do_select() result on success. Errors: -EINVAL for negative @n,
 * -ENOMEM if the bitmap buffer cannot be allocated, the error from
 * compat_get_fd_set() if reading a set fails, -ERESTARTNOHAND if do_select()
 * returned 0 while a signal is pending, -EFAULT if writing a result set fails.
 */
static int compat_core_sys_select(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct timespec64 *end_time)
{
        fd_set_bits fds;
        void *bits;
        int size, max_fds, ret = -EINVAL;
        struct fdtable *fdt;
        long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

        if (n < 0)
                goto out_nofds;

        /* max_fds can increase, so grab it once to avoid race */
        rcu_read_lock();
        fdt = files_fdtable(current->files);
        max_fds = fdt->max_fds;
        rcu_read_unlock();
        if (n > max_fds)
                n = max_fds;

        /*
         * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
         * since we used fdset we need to allocate memory in units of
         * long-words.
         */
        size = FDS_BYTES(n);
        bits = stack_fds;
        /* Fall back to the heap when six bitmaps do not fit in stack_fds. */
        if (size > sizeof(stack_fds) / 6) {
                bits = kmalloc_array(6, size, GFP_KERNEL);
                ret = -ENOMEM;
                if (!bits)
                        goto out_nofds;
        }
        /* Carve the single buffer into six consecutive bitmaps of 'size' bytes. */
        fds.in      = (unsigned long *)  bits;
        fds.out     = (unsigned long *) (bits +   size);
        fds.ex      = (unsigned long *) (bits + 2*size);
        fds.res_in  = (unsigned long *) (bits + 3*size);
        fds.res_out = (unsigned long *) (bits + 4*size);
        fds.res_ex  = (unsigned long *) (bits + 5*size);

        /* The first failing copy-in sets ret and skips the remaining ones. */
        if ((ret = compat_get_fd_set(n, inp, fds.in)) ||
            (ret = compat_get_fd_set(n, outp, fds.out)) ||
            (ret = compat_get_fd_set(n, exp, fds.ex)))
                goto out;
        zero_fd_set(n, fds.res_in);
        zero_fd_set(n, fds.res_out);
        zero_fd_set(n, fds.res_ex);

        ret = do_select(n, &fds, end_time);

        if (ret < 0)
                goto out;
        if (!ret) {
                /* Nothing ready: report a restart only if a signal is pending. */
                ret = -ERESTARTNOHAND;
                if (signal_pending(current))
                        goto out;
                ret = 0;
        }

        if (compat_set_fd_set(n, inp, fds.res_in) ||
            compat_set_fd_set(n, outp, fds.res_out) ||
            compat_set_fd_set(n, exp, fds.res_ex))
                ret = -EFAULT;
out:
        /* Only free when the heap fallback was taken. */
        if (bits != stack_fds)
                kfree(bits);
out_nofds:
        return ret;
}

/*
 * Common body of the compat select entry points. Reads the optional
 * struct old_timeval32 at @tvp, normalises it into seconds and nanoseconds,
 * runs compat_core_sys_select() and lets poll_select_finish() produce the
 * final return value.
 *
 * Returns -EFAULT if @tvp cannot be read and -EINVAL if
 * poll_select_set_timeout() rejects the value.
 */
static int do_compat_select(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        struct old_timeval32 __user *tvp)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 deadline;
        struct old_timeval32 tv;
        int rc;

        if (tvp) {
                time64_t secs;
                long nsecs;

                if (copy_from_user(&tv, tvp, sizeof(tv)))
                        return -EFAULT;

                /* Fold whole seconds held in tv_usec into the seconds part. */
                secs = tv.tv_sec + (tv.tv_usec / USEC_PER_SEC);
                nsecs = (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC;

                timeout = &deadline;
                if (poll_select_set_timeout(timeout, secs, nsecs))
                        return -EINVAL;
        }

        rc = compat_core_sys_select(n, inp, outp, exp, timeout);

        return poll_select_finish(&deadline, tvp, PT_OLD_TIMEVAL, rc);
}

/*
 * Compat select() entry point: forwards all five arguments unchanged to
 * do_compat_select() and returns its result.
 */
COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct old_timeval32 __user *, tvp)
{
        return do_compat_select(n, inp, outp, exp, tvp);
}

/*
 * Argument block read from userspace by the compat old_select syscall, which
 * receives one pointer to this structure instead of separate arguments.
 */
struct compat_sel_arg_struct {
        compat_ulong_t n;       /* passed as the 'n' argument of do_compat_select() */
        compat_uptr_t inp;      /* compat user pointers; old_select converts each */
        compat_uptr_t outp;     /* with compat_ptr() before use */
        compat_uptr_t exp;
        compat_uptr_t tvp;
};

/*
 * Compat old_select() entry point: copies the packed argument structure from
 * userspace, converts its compat pointers and calls do_compat_select().
 * Returns -EFAULT if the structure cannot be read.
 */
COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg)
{
        struct compat_sel_arg_struct args;

        if (copy_from_user(&args, arg, sizeof(args)))
                return -EFAULT;

        return do_compat_select(args.n,
                                compat_ptr(args.inp),
                                compat_ptr(args.outp),
                                compat_ptr(args.exp),
                                compat_ptr(args.tvp));
}

/*
 * Common body of the compat pselect6 entry points. @type selects how the
 * optional timeout at @tsp is read (PT_OLD_TIMESPEC or PT_TIMESPEC; any other
 * value hits BUG()) and is also passed on to poll_select_finish(). The
 * caller's signal mask is installed with set_compat_user_sigmask() before
 * compat_core_sys_select() runs.
 *
 * Returns -EFAULT if @tsp cannot be read and -EINVAL if
 * poll_select_set_timeout() rejects the value.
 */
static long do_compat_pselect(int n, compat_ulong_t __user *inp,
        compat_ulong_t __user *outp, compat_ulong_t __user *exp,
        void __user *tsp, compat_sigset_t __user *sigmask,
        compat_size_t sigsetsize, enum poll_time_type type)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 requested;
        struct timespec64 deadline;
        int rc;

        if (tsp) {
                if (type == PT_OLD_TIMESPEC) {
                        if (get_old_timespec32(&requested, tsp))
                                return -EFAULT;
                } else if (type == PT_TIMESPEC) {
                        if (get_timespec64(&requested, tsp))
                                return -EFAULT;
                } else {
                        BUG();
                }

                timeout = &deadline;
                if (poll_select_set_timeout(timeout, requested.tv_sec,
                                            requested.tv_nsec))
                        return -EINVAL;
        }

        rc = set_compat_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = compat_core_sys_select(n, inp, outp, exp, timeout);

        return poll_select_finish(&deadline, tsp, type, rc);
}

/*
 * Packed signal-mask argument of the compat pselect6 syscalls, read from
 * userspace by get_compat_sigset_argpack().
 */
struct compat_sigset_argpack {
        compat_uptr_t p;        /* compat user pointer; callers convert it with compat_ptr() */
        compat_size_t size;     /* passed on as the sigsetsize argument */
};
/*
 * Read both fields of the user-space argpack at @from into @to.
 *
 * A NULL @from is not an error: @to is left untouched and 0 is returned.
 * Returns -EFAULT if the user range cannot be opened for reading or either
 * field read faults.
 */
static inline int get_compat_sigset_argpack(struct compat_sigset_argpack *to,
                                            struct compat_sigset_argpack __user *from)
{
        if (!from)
                return 0;

        if (!user_read_access_begin(from, sizeof(*from)))
                return -EFAULT;

        unsafe_get_user(to->p, &from->p, fault);
        unsafe_get_user(to->size, &from->size, fault);
        user_read_access_end();

        return 0;

fault:
        /* The access window is still open when a read faults; close it. */
        user_read_access_end();
        return -EFAULT;
}

/*
 * Compat pselect6 with a struct __kernel_timespec timeout. Unpacks the
 * optional signal-mask argpack at @sig (defaults to a zero pointer and size
 * when @sig is NULL) and defers to do_compat_pselect() with PT_TIMESPEC.
 */
COMPAT_SYSCALL_DEFINE6(pselect6_time64, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct __kernel_timespec __user *, tsp, void __user *, sig)
{
        struct compat_sigset_argpack pack = { .p = 0, .size = 0 };
        int err;

        err = get_compat_sigset_argpack(&pack, sig);
        if (err)
                return -EFAULT;

        return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(pack.p),
                                 pack.size, PT_TIMESPEC);
}

#if defined(CONFIG_COMPAT_32BIT_TIME)

/*
 * Compat pselect6 with a struct old_timespec32 timeout. Unpacks the optional
 * signal-mask argpack at @sig (defaults to a zero pointer and size when @sig
 * is NULL) and defers to do_compat_pselect() with PT_OLD_TIMESPEC.
 */
COMPAT_SYSCALL_DEFINE6(pselect6_time32, int, n, compat_ulong_t __user *, inp,
        compat_ulong_t __user *, outp, compat_ulong_t __user *, exp,
        struct old_timespec32 __user *, tsp, void __user *, sig)
{
        struct compat_sigset_argpack pack = { .p = 0, .size = 0 };
        int err;

        err = get_compat_sigset_argpack(&pack, sig);
        if (err)
                return -EFAULT;

        return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(pack.p),
                                 pack.size, PT_OLD_TIMESPEC);
}

#endif

#if defined(CONFIG_COMPAT_32BIT_TIME)
/*
 * Compat ppoll with a struct old_timespec32 timeout and a compat signal
 * mask. Reads the optional timeout, installs the mask through
 * set_compat_user_sigmask(), polls, and lets poll_select_finish() produce
 * the final return value.
 */
COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
        unsigned int,  nfds, struct old_timespec32 __user *, tsp,
        const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 requested;
        struct timespec64 deadline;
        int rc;

        if (tsp) {
                if (get_old_timespec32(&requested, tsp))
                        return -EFAULT;

                timeout = &deadline;
                if (poll_select_set_timeout(timeout, requested.tv_sec,
                                            requested.tv_nsec))
                        return -EINVAL;
        }

        rc = set_compat_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = do_sys_poll(ufds, nfds, timeout);

        return poll_select_finish(&deadline, tsp, PT_OLD_TIMESPEC, rc);
}
#endif

/* New compat syscall for 64-bit time_t */
/*
 * Compat ppoll with a struct __kernel_timespec timeout and a compat signal
 * mask. Reads the optional timeout, installs the mask through
 * set_compat_user_sigmask(), polls, and lets poll_select_finish() produce
 * the final return value.
 */
COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
        unsigned int,  nfds, struct __kernel_timespec __user *, tsp,
        const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
        struct timespec64 *timeout = NULL;
        struct timespec64 requested;
        struct timespec64 deadline;
        int rc;

        if (tsp) {
                if (get_timespec64(&requested, tsp))
                        return -EFAULT;

                timeout = &deadline;
                if (poll_select_set_timeout(timeout, requested.tv_sec,
                                            requested.tv_nsec))
                        return -EINVAL;
        }

        rc = set_compat_user_sigmask(sigmask, sigsetsize);
        if (rc)
                return rc;

        rc = do_sys_poll(ufds, nfds, timeout);

        return poll_select_finish(&deadline, tsp, PT_TIMESPEC, rc);
}

#endif






























































  141 
   15 









































































































































  166 
  166 


  165 
  166 
  165 

  166 


  166 






















  126 
  126 
  126 
  127 
  126 

  126 


  126 













  165 















  127 

























  164 

  166 







  163 








  166 
  165 











  126 
  126 



  127 

  127 
  127 




  157 


  157 

  156 
    1 










  155 
    1 






  156 























































  157 




















  157 




















































































































































































































   23 




   23 































































   23 



   23 



   23 



   23 






    2 
    2 








  154 
  156 








  126 
  127 


  127 


































   33 














  157 
















  121 
  156 

  156 





















  122 



  156 








































1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2015 - ARM Ltd
 * Author: Marc Zyngier <marc.zyngier@arm.com>
 */

#ifndef __ARM64_KVM_HYP_SWITCH_H__
#define __ARM64_KVM_HYP_SWITCH_H__

#include <hyp/adjust_pc.h>
#include <hyp/fault.h>

#include <linux/arm-smccc.h>
#include <linux/kvm_host.h>
#include <linux/types.h>
#include <linux/jump_label.h>
#include <uapi/linux/psci.h>

#include <kvm/arm_psci.h>

#include <asm/barrier.h>
#include <asm/cpufeature.h>
#include <asm/extable.h>
#include <asm/kprobes.h>
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
#include <asm/kvm_nested.h>
#include <asm/fpsimd.h>
#include <asm/debug-monitors.h>
#include <asm/processor.h>
#include <asm/traps.h>

/*
 * One entry of the table delimited by __start___kvm_ex_table and
 * __stop___kvm_ex_table.
 * NOTE(review): insn/fixup are presumably relative offsets to a faulting
 * instruction and its fixup handler - confirm against the code that walks
 * the table.
 */
struct kvm_exception_table_entry {
        int insn, fixup;
};

extern struct kvm_exception_table_entry __start___kvm_ex_table;
extern struct kvm_exception_table_entry __stop___kvm_ex_table;

/*
 * Copy the hardware FPEXC32_EL2 value into the vcpu's saved system
 * registers. Does nothing unless the guest's EL1 is 32-bit.
 */
static inline void __fpsimd_save_fpexc32(struct kvm_vcpu *vcpu)
{
        if (vcpu_el1_is_32bit(vcpu))
                __vcpu_sys_reg(vcpu, FPEXC32_EL2) = read_sysreg(fpexc32_el2);
}

/*
 * Set FPEXC.EN (bit 30 of FPEXC32_EL2) for 32-bit guests.
 *
 * CPTR_EL2.TFP is about to be set so that every floating point register
 * access traps to EL2. Per the ARM ARM, such a trap is only taken to EL2 when
 * the access would not have trapped to EL1 anyway, so FPEXC.EN has to be set
 * to keep the traps from going to EL1 instead. FPEXC is UNDEFINED when
 * FP/ASIMD is not implemented and touching it would raise an exception,
 * hence the system_supports_fpsimd() check.
 */
static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu)
{
        if (!vcpu_el1_is_32bit(vcpu) || !system_supports_fpsimd())
                return;

        write_sysreg(1 << 30, fpexc32_el2);
        isb();
}

/*
 * Evaluate to a pointer to the struct fgt_masks that belongs to the
 * fine-grained trap register @reg. @reg must be one of the register
 * identifiers listed below; any other value reaches BUILD_BUG_ON(1).
 */
#define reg_to_fgt_masks(reg)                                                \
        ({                                                                \
                struct fgt_masks *m;                                        \
                switch(reg) {                                                \
                case HFGRTR_EL2:                                        \
                        m = &hfgrtr_masks;                                \
                        break;                                                \
                case HFGWTR_EL2:                                        \
                        m = &hfgwtr_masks;                                \
                        break;                                                \
                case HFGITR_EL2:                                        \
                        m = &hfgitr_masks;                                \
                        break;                                                \
                case HDFGRTR_EL2:                                        \
                        m = &hdfgrtr_masks;                                \
                        break;                                                \
                case HDFGWTR_EL2:                                        \
                        m = &hdfgwtr_masks;                                \
                        break;                                                \
                case HAFGRTR_EL2:                                        \
                        m = &hafgrtr_masks;                                \
                        break;                                                \
                case HFGRTR2_EL2:                                        \
                        m = &hfgrtr2_masks;                                \
                        break;                                                \
                case HFGWTR2_EL2:                                        \
                        m = &hfgwtr2_masks;                                \
                        break;                                                \
                case HFGITR2_EL2:                                        \
                        m = &hfgitr2_masks;                                \
                        break;                                                \
                case HDFGRTR2_EL2:                                        \
                        m = &hdfgrtr2_masks;                                \
                        break;                                                \
                case HDFGWTR2_EL2:                                        \
                        m = &hdfgwtr2_masks;                                \
                        break;                                                \
                default:                                                \
                        BUILD_BUG_ON(1);                                \
                }                                                        \
                                                                        \
                m;                                                        \
        })

/*
 * Fold the vcpu's own value of @reg into the caller's @clr and @set
 * accumulators (both must be lvalues):
 *  - bits that are 1 in the vcpu's value and covered by m->mask go to @set;
 *  - bits that are 0 in the vcpu's value and covered by m->nmask go to @clr.
 * NOTE(review): mask/nmask presumably separate positive- and
 * negative-polarity trap bits - confirm against struct fgt_masks.
 */
#define compute_clr_set(vcpu, reg, clr, set)                                \
        do {                                                                \
                u64 hfg = __vcpu_sys_reg(vcpu, reg);                        \
                struct fgt_masks *m = reg_to_fgt_masks(reg);                \
                set |= hfg & m->mask;                                        \
                clr |= ~hfg & m->nmask;                                        \
        } while(0)

/*
 * Evaluate to the enum fgt_group_id of the fine-grained trap register @reg.
 * The read and write registers of a pair (e.g. HFGRTR_EL2/HFGWTR_EL2) map to
 * the same group. Any register not listed reaches BUILD_BUG_ON(1).
 */
#define reg_to_fgt_group_id(reg)                                        \
        ({                                                                \
                enum fgt_group_id id;                                        \
                switch(reg) {                                                \
                case HFGRTR_EL2:                                        \
                case HFGWTR_EL2:                                        \
                        id = HFGRTR_GROUP;                                \
                        break;                                                \
                case HFGITR_EL2:                                        \
                        id = HFGITR_GROUP;                                \
                        break;                                                \
                case HDFGRTR_EL2:                                        \
                case HDFGWTR_EL2:                                        \
                        id = HDFGRTR_GROUP;                                \
                        break;                                                \
                case HAFGRTR_EL2:                                        \
                        id = HAFGRTR_GROUP;                                \
                        break;                                                \
                case HFGRTR2_EL2:                                        \
                case HFGWTR2_EL2:                                        \
                        id = HFGRTR2_GROUP;                                \
                        break;                                                \
                case HFGITR2_EL2:                                        \
                        id = HFGITR2_GROUP;                                \
                        break;                                                \
                case HDFGRTR2_EL2:                                        \
                case HDFGWTR2_EL2:                                        \
                        id = HDFGRTR2_GROUP;                                \
                        break;                                                \
                default:                                                \
                        BUILD_BUG_ON(1);                                \
                }                                                        \
                                                                        \
                id;                                                        \
        })

/*
 * Fold kvm->arch.fgu[] for @reg's group into the caller's @clr and @set
 * accumulators (both must be lvalues): bits covered by m->mask go to @set,
 * bits covered by m->nmask go to @clr. @vcpu is not used by the body.
 */
#define compute_undef_clr_set(vcpu, kvm, reg, clr, set)                        \
        do {                                                                \
                u64 hfg = kvm->arch.fgu[reg_to_fgt_group_id(reg)];        \
                struct fgt_masks *m = reg_to_fgt_masks(reg);                \
                set |= hfg & m->mask;                                        \
                clr |= hfg & m->nmask;                                        \
        } while(0)

/*
 * Save the current hardware value of @reg into @hctxt, then program @reg for
 * the guest.
 *
 * The new value starts from m->nmask, has the accumulated "set" bits ORed in
 * and the accumulated "clr" bits removed. The accumulators start from the
 * caller's @clr/@set, gain the vcpu's own register value when the vcpu has
 * nested virt and is not in hyp context, and always gain the kvm->arch.fgu[]
 * contribution.
 */
#define update_fgt_traps_cs(hctxt, vcpu, kvm, reg, clr, set)                \
        do {                                                                \
                struct fgt_masks *m = reg_to_fgt_masks(reg);                \
                u64 c = clr, s = set;                                        \
                u64 val;                                                \
                                                                        \
                ctxt_sys_reg(hctxt, reg) = read_sysreg_s(SYS_ ## reg);        \
                if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))                \
                        compute_clr_set(vcpu, reg, c, s);                \
                                                                        \
                compute_undef_clr_set(vcpu, kvm, reg, c, s);                \
                                                                        \
                val = m->nmask;                                                \
                val |= s;                                                \
                val &= ~c;                                                \
                write_sysreg_s(val, SYS_ ## reg);                        \
        } while(0)

/* update_fgt_traps_cs() with no extra bits to clear or set. */
#define update_fgt_traps(hctxt, vcpu, kvm, reg)                \
        update_fgt_traps_cs(hctxt, vcpu, kvm, reg, 0, 0)

/*
 * Report whether the AMU field of ID_AA64PFR0_EL1, read from the current
 * CPU, is non-zero.
 */
static inline bool cpu_has_amu(void)
{
        return cpuid_feature_extract_unsigned_field(read_sysreg_s(SYS_ID_AA64PFR0_EL1),
                                                    ID_AA64PFR0_EL1_AMU_SHIFT);
}

/*
 * Save the host's fine-grained trap registers and program them for @vcpu.
 *
 * Nothing is done without ARM64_HAS_FGT. HAFGRTR_EL2 is only touched when
 * the CPU reports an AMU, and the *2_EL2 registers only with
 * ARM64_HAS_FGT2. On CPUs with the ARM64_WORKAROUND_AMPERE_AC03_CPU_38
 * capability, HFGWTR_EL2 additionally gets HFGWTR_EL2_TCR_EL1_MASK set.
 */
static inline void __activate_traps_hfgxtr(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *hctxt;
        struct kvm *kvm;

        if (!cpus_have_final_cap(ARM64_HAS_FGT))
                return;

        hctxt = host_data_ptr(host_ctxt);
        kvm = kern_hyp_va(vcpu->kvm);

        update_fgt_traps(hctxt, vcpu, kvm, HFGRTR_EL2);

        if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38))
                update_fgt_traps_cs(hctxt, vcpu, kvm, HFGWTR_EL2, 0,
                                    HFGWTR_EL2_TCR_EL1_MASK);
        else
                update_fgt_traps(hctxt, vcpu, kvm, HFGWTR_EL2);

        update_fgt_traps(hctxt, vcpu, kvm, HFGITR_EL2);
        update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR_EL2);
        update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR_EL2);

        if (cpu_has_amu())
                update_fgt_traps(hctxt, vcpu, kvm, HAFGRTR_EL2);

        if (cpus_have_final_cap(ARM64_HAS_FGT2)) {
                update_fgt_traps(hctxt, vcpu, kvm, HFGRTR2_EL2);
                update_fgt_traps(hctxt, vcpu, kvm, HFGWTR2_EL2);
                update_fgt_traps(hctxt, vcpu, kvm, HFGITR2_EL2);
                update_fgt_traps(hctxt, vcpu, kvm, HDFGRTR2_EL2);
                update_fgt_traps(hctxt, vcpu, kvm, HDFGWTR2_EL2);
        }
}

/*
 * Write the copy of @reg saved in @hctxt back to the hardware register.
 *
 * @hctxt: context holding the saved value of @reg
 * @vcpu:  unused; kept so existing call sites need no change
 * @reg:   register name; it indexes the saved context and, prefixed with
 *         SYS_, names the system register that is written
 *
 * The first parameter used to be spelled "htcxt" while the body referred to
 * "hctxt", so the argument was ignored and the macro only worked when the
 * caller happened to have a local variable named hctxt.
 */
#define __deactivate_fgt(hctxt, vcpu, reg)                                \
        do {                                                                \
                write_sysreg_s(ctxt_sys_reg(hctxt, reg),                \
                               SYS_ ## reg);                                \
        } while(0)

/*
 * Put the host's saved fine-grained trap register values back.
 *
 * Nothing is done without ARM64_HAS_FGT. HAFGRTR_EL2 is only restored when
 * the CPU reports an AMU, and the *2_EL2 registers only with
 * ARM64_HAS_FGT2.
 */
static inline void __deactivate_traps_hfgxtr(struct kvm_vcpu *vcpu)
{
        struct kvm_cpu_context *hctxt;

        if (!cpus_have_final_cap(ARM64_HAS_FGT))
                return;

        hctxt = host_data_ptr(host_ctxt);

        __deactivate_fgt(hctxt, vcpu, HFGRTR_EL2);
        __deactivate_fgt(hctxt, vcpu, HFGWTR_EL2);
        __deactivate_fgt(hctxt, vcpu, HFGITR_EL2);
        __deactivate_fgt(hctxt, vcpu, HDFGRTR_EL2);
        __deactivate_fgt(hctxt, vcpu, HDFGWTR_EL2);

        if (cpu_has_amu())
                __deactivate_fgt(hctxt, vcpu, HAFGRTR_EL2);

        if (cpus_have_final_cap(ARM64_HAS_FGT2)) {
                __deactivate_fgt(hctxt, vcpu, HFGRTR2_EL2);
                __deactivate_fgt(hctxt, vcpu, HFGWTR2_EL2);
                __deactivate_fgt(hctxt, vcpu, HFGITR2_EL2);
                __deactivate_fgt(hctxt, vcpu, HDFGRTR2_EL2);
                __deactivate_fgt(hctxt, vcpu, HDFGWTR2_EL2);
        }
}

/*
 * Program the MPAM trap controls; a no-op when the system has no MPAM.
 *
 * MPAM2_EL2 always gets TRAPMPAM0EL1 and TRAPMPAM1EL1. Guest access to
 * MPAMIDR_EL1 is trapped through MPAMHCR_EL2 when that register is
 * available; otherwise MPAM2_EL2.TIDR is used instead (from v1.1 TIDR can
 * trap MPAMIDR, so it is set unconditionally in that case).
 */
static inline void  __activate_traps_mpam(struct kvm_vcpu *vcpu)
{
        u64 traps;

        if (!system_supports_mpam())
                return;

        traps = MPAM2_EL2_TRAPMPAM0EL1 | MPAM2_EL2_TRAPMPAM1EL1;

        if (!system_supports_mpam_hcr())
                traps |= MPAM2_EL2_TIDR;
        else
                write_sysreg_s(MPAMHCR_EL2_TRAP_MPAMIDR_EL1, SYS_MPAMHCR_EL2);

        write_sysreg_s(traps, SYS_MPAM2_EL2);
}

/*
 * Undo __activate_traps_mpam(): zero MPAM2_EL2 and, when MPAMHCR_EL2 is
 * available, write MPAMHCR_HOST_FLAGS to it. A no-op when the system has no
 * MPAM.
 */
static inline void __deactivate_traps_mpam(void)
{
        if (system_supports_mpam()) {
                write_sysreg_s(0, SYS_MPAM2_EL2);

                if (system_supports_mpam_hcr())
                        write_sysreg_s(MPAMHCR_HOST_FLAGS, SYS_MPAMHCR_EL2);
        }
}

/*
 * Program the trap controls for running @vcpu, saving the host values that
 * __deactivate_traps_common() restores later.
 *
 * In order: HSTR_EL2, the PMU user-access registers (when PMUv3 is
 * supported), MDCR_EL2, HCRX_EL2 (when ARM64_HAS_HCX), then the
 * fine-grained trap and MPAM helpers.
 */
static inline void __activate_traps_common(struct kvm_vcpu *vcpu)
{
        /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */
        write_sysreg(1 << 15, hstr_el2);

        /*
         * Make sure we trap PMU access from EL0 to EL2. Also sanitize
         * PMSELR_EL0 to make sure it never contains the cycle
         * counter, which could make a PMXEVCNTR_EL0 access UNDEF at
         * EL1 instead of being trapped to EL2.
         */
        if (system_supports_pmuv3()) {
                struct kvm_cpu_context *hctxt;

                write_sysreg(0, pmselr_el0);

                /* Remember the host's PMUSERENR_EL0 before overwriting it. */
                hctxt = host_data_ptr(host_ctxt);
                ctxt_sys_reg(hctxt, PMUSERENR_EL0) = read_sysreg(pmuserenr_el0);
                write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
                vcpu_set_flag(vcpu, PMUSERENR_ON_CPU);
        }

        /* Save the host's MDCR_EL2, then install the vcpu's value. */
        *host_data_ptr(host_debug_state.mdcr_el2) = read_sysreg(mdcr_el2);
        write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);

        if (cpus_have_final_cap(ARM64_HAS_HCX)) {
                u64 hcrx = vcpu->arch.hcrx_el2;
                /*
                 * For a nested-virt vcpu outside hyp context, merge in its own
                 * HCRX_EL2: set the bits it has set within __HCRX_EL2_MASK and
                 * clear the bits it has clear within __HCRX_EL2_nMASK.
                 */
                if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) {
                        u64 val = __vcpu_sys_reg(vcpu, HCRX_EL2);
                        hcrx |= val & __HCRX_EL2_MASK;
                        hcrx &= ~(~val & __HCRX_EL2_nMASK);
                }

                write_sysreg_s(hcrx, SYS_HCRX_EL2);
        }

        __activate_traps_hfgxtr(vcpu);
        __activate_traps_mpam(vcpu);
}

/*
 * Counterpart of __activate_traps_common(): put back the MDCR_EL2 and
 * PMUSERENR_EL0 values stashed in the per-CPU host data, clear HSTR_EL2,
 * load HCRX_EL2 with HCRX_HOST_FLAGS when the CPU has it, and tear down
 * the fine-grained and MPAM traps.
 */
static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu)
{
        write_sysreg(*host_data_ptr(host_debug_state.mdcr_el2), mdcr_el2);

        write_sysreg(0, hstr_el2);

        if (system_supports_pmuv3()) {
                struct kvm_cpu_context *host = host_data_ptr(host_ctxt);

                write_sysreg(ctxt_sys_reg(host, PMUSERENR_EL0), pmuserenr_el0);
                vcpu_clear_flag(vcpu, PMUSERENR_ON_CPU);
        }

        if (cpus_have_final_cap(ARM64_HAS_HCX)) {
                write_sysreg_s(HCRX_HOST_FLAGS, SYS_HCRX_EL2);
        }

        __deactivate_traps_hfgxtr(vcpu);
        __deactivate_traps_mpam();
}

/*
 * Install @hcr as the hypervisor configuration for @vcpu. HCR_TVM is
 * forced on when the Cavium TX2 #219 TVM workaround is active. If the
 * resulting value has HCR_VSE set and the CPU has the RAS extension,
 * VSESR_EL2 is loaded from the vCPU as well.
 */
static inline void ___activate_traps(struct kvm_vcpu *vcpu, u64 hcr)
{
        u64 effective = hcr;

        if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM)) {
                effective |= HCR_TVM;
        }

        write_sysreg_hcr(effective);

        if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN) && (effective & HCR_VSE)) {
                write_sysreg_s(vcpu->arch.vsesr_el2, SYS_VSESR_EL2);
        }
}

static inline void ___deactivate_traps(struct kvm_vcpu *vcpu)
{
        /*
         * If we pended a virtual abort, preserve it until it gets
         * cleared. See D1.14.3 (Virtual Interrupts) for details, but
         * the crucial bit is "On taking a vSError interrupt,
         * HCR_EL2.VSE is cleared to 0."
         */
        if (vcpu->arch.hcr_el2 & HCR_VSE) {
                vcpu->arch.hcr_el2 &= ~HCR_VSE;
                vcpu->arch.hcr_el2 |= read_sysreg(hcr_el2) & HCR_VSE;
        }
}

static inline bool __populate_fault_info(struct kvm_vcpu *vcpu)
{
        return __get_fault_info(vcpu->arch.fault.esr_el2, &vcpu->arch.fault);
}

/*
 * Exit handler for a MOPS exception: reset the vCPU's registers through
 * arm64_mops_reset_regs(), write the resulting PC back to ELR_EL2 and
 * clear the single-step bit in the saved PSTATE. Always returns true,
 * i.e. the exit is considered handled.
 */
static inline bool kvm_hyp_handle_mops(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        unsigned long *pc = vcpu_pc(vcpu);
        unsigned long *pstate = vcpu_cpsr(vcpu);

        /* Sync the PC, let the helper adjust the GPRs/PC, then write it back */
        *pc = read_sysreg_el2(SYS_ELR);
        arm64_mops_reset_regs(vcpu_gp_regs(vcpu), vcpu->arch.fault.esr_el2);
        write_sysreg_el2(*pc, SYS_ELR);

        /*
         * Finish potential single step before executing the prologue
         * instruction.
         */
        *pstate &= ~DBG_SPSR_SS;
        write_sysreg_el2(*pstate, SYS_SPSR);

        return true;
}

static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu)
{
        /*
         * The vCPU's saved SVE state layout always matches the max VL of the
         * vCPU. Start off with the max VL so we can load the SVE state.
         */
        sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2);
        __sve_restore_state(vcpu_sve_pffr(vcpu),
                            &vcpu->arch.ctxt.fp_regs.fpsr,
                            true);

        /*
         * The effective VL for a VM could differ from the max VL when running a
         * nested guest, as the guest hypervisor could select a smaller VL. Slap
         * that into hardware before wrapping up.
         */
        if (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu))
                sve_cond_update_zcr_vq(__vcpu_sys_reg(vcpu, ZCR_EL2), SYS_ZCR_EL2);

        write_sysreg_el1(__vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)), SYS_ZCR);
}

static inline void __hyp_sve_save_host(void)
{
        struct cpu_sve_state *sve_state = *host_data_ptr(sve_state);

        sve_state->zcr_el1 = read_sysreg_el1(SYS_ZCR);
        write_sysreg_s(sve_vq_from_vl(kvm_host_sve_max_vl) - 1, SYS_ZCR_EL2);
        __sve_save_state(sve_state->sve_regs + sve_ffr_offset(kvm_host_sve_max_vl),
                         &sve_state->fpsr,
                         true);
}

/*
 * If the guest currently owns the FP registers and @vcpu has SVE, load
 * the ZCR_EL2 and ZCR_EL1 values the guest is expected to run with.
 * Otherwise nothing is written.
 */
static inline void fpsimd_lazy_switch_to_guest(struct kvm_vcpu *vcpu)
{
        u64 el1_val, el2_val;

        if (!guest_owns_fp_regs() || !vcpu_has_sve(vcpu))
                return;

        /* A guest hypervisor may restrict the effective max VL. */
        el2_val = (vcpu_has_nv(vcpu) && !is_hyp_ctxt(vcpu)) ?
                  __vcpu_sys_reg(vcpu, ZCR_EL2) :
                  vcpu_sve_max_vq(vcpu) - 1;
        write_sysreg_el2(el2_val, SYS_ZCR);

        el1_val = __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu));
        write_sysreg_el1(el1_val, SYS_ZCR);
}

/*
 * If the guest currently owns the FP registers and @vcpu has SVE, save
 * the guest's ZCR_EL1 into the vCPU and program ZCR so that the guest's
 * state can later be saved at the guest's max VL.
 */
static inline void fpsimd_lazy_switch_to_host(struct kvm_vcpu *vcpu)
{
        u64 el1_val, el2_val;

        /*
         * When the guest owns the FP regs, we know that guest+hyp traps for
         * any FPSIMD/SVE/SME features exposed to the guest have been disabled
         * by either fpsimd_lazy_switch_to_guest() or kvm_hyp_handle_fpsimd()
         * prior to __guest_entry(). As __guest_entry() guarantees a context
         * synchronization event, we don't need an ISB here to avoid taking
         * traps for anything that was exposed to the guest.
         */
        if (!guest_owns_fp_regs() || !vcpu_has_sve(vcpu))
                return;

        el1_val = read_sysreg_el1(SYS_ZCR);
        __vcpu_sys_reg(vcpu, vcpu_sve_zcr_elx(vcpu)) = el1_val;

        /*
         * The guest's state is always saved using the guest's max VL.
         * Ensure that the host has the guest's max VL active such that
         * the host can save the guest's state lazily, but don't
         * artificially restrict the host to the guest's max VL.
         */
        if (!has_vhe()) {
                el2_val = sve_vq_from_vl(kvm_host_sve_max_vl) - 1;
                write_sysreg_el2(el2_val, SYS_ZCR);

                el1_val = vcpu_sve_max_vq(vcpu) - 1;
                write_sysreg_el1(el1_val, SYS_ZCR);
                return;
        }

        el2_val = vcpu_sve_max_vq(vcpu) - 1;
        write_sysreg_el2(el2_val, SYS_ZCR);
}

/*
 * Save the host's FP/SIMD (or SVE) register state, plus FPMR when the VM
 * has it, into the per-CPU host data.
 */
static void kvm_hyp_save_fpsimd_host(struct kvm_vcpu *vcpu)
{
        /*
         * Non-protected kvm relies on the host restoring its sve state.
         * Protected kvm restores the host's sve state as not to reveal that
         * fpsimd was used by a guest nor leak upper sve bits.
         */
        if (!system_supports_sve()) {
                __fpsimd_save_state(host_data_ptr(host_ctxt.fp_regs));
        } else {
                __hyp_sve_save_host();

                /* A vCPU without SVE must keep taking SVE traps. */
                if (!vcpu_has_sve(vcpu))
                        cpacr_clear_set(CPACR_EL1_ZEN, 0);
        }

        if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm))) {
                *host_data_ptr(fpmr) = read_sysreg_s(SYS_FPMR);
        }
}


/*
 * We trap the first access to the FP/SIMD to save the host context and
 * restore the guest context lazily.
 * If FP/SIMD is not implemented, handle the trap and inject an undefined
 * instruction exception to the guest. Similarly for trapped SVE accesses.
 *
 * Returns true when the trap was consumed here (the guest's FP state is
 * now loaded and fp_owner is FP_STATE_GUEST_OWNED), false when the exit
 * must be handled elsewhere: no FP/SIMD on the system, an exception class
 * this handler does not cover, an SVE trap for a vCPU without SVE, or a
 * trap that the guest hypervisor asked to receive itself.
 * @exit_code is not used by this handler.
 */
static inline bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        bool sve_guest;
        u8 esr_ec;

        if (!system_supports_fpsimd())
                return false;

        sve_guest = vcpu_has_sve(vcpu);
        esr_ec = kvm_vcpu_trap_get_class(vcpu);

        /* Only handle traps the vCPU can support here: */
        switch (esr_ec) {
        case ESR_ELx_EC_FP_ASIMD:
                /* Forward traps to the guest hypervisor as required */
                if (guest_hyp_fpsimd_traps_enabled(vcpu))
                        return false;
                break;
        case ESR_ELx_EC_SYS64:
                /* A sysreg trap is only expected here from hyp context */
                if (WARN_ON_ONCE(!is_hyp_ctxt(vcpu)))
                        return false;
                /* ...and is then subject to the same checks as an SVE trap */
                fallthrough;
        case ESR_ELx_EC_SVE:
                if (!sve_guest)
                        return false;
                if (guest_hyp_sve_traps_enabled(vcpu))
                        return false;
                break;
        default:
                return false;
        }

        /* Valid trap.  Switch the context: */

        /*
         * First disable enough traps to allow us to update the registers.
         * SVE access is also enabled when protected KVM may have to save
         * the host's SVE state below, even if the vCPU itself has no SVE.
         */
        if (sve_guest || (is_protected_kvm_enabled() && system_supports_sve()))
                cpacr_clear_set(0, CPACR_EL1_FPEN | CPACR_EL1_ZEN);
        else
                cpacr_clear_set(0, CPACR_EL1_FPEN);
        /* The new trap configuration must be in effect before FP regs are touched */
        isb();

        /* Write out the host state if it's in the registers */
        if (is_protected_kvm_enabled() && host_owns_fp_regs())
                kvm_hyp_save_fpsimd_host(vcpu);

        /* Restore the guest state */
        if (sve_guest)
                __hyp_sve_restore_guest(vcpu);
        else
                __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs);

        if (kvm_has_fpmr(kern_hyp_va(vcpu->kvm)))
                write_sysreg_s(__vcpu_sys_reg(vcpu, FPMR), SYS_FPMR);

        /* Skip restoring fpexc32 for AArch64 guests */
        if (!(read_sysreg(hcr_el2) & HCR_RW))
                write_sysreg(__vcpu_sys_reg(vcpu, FPEXC32_EL2), fpexc32_el2);

        /* From here on the FP registers hold guest state */
        *host_data_ptr(fp_owner) = FP_STATE_GUEST_OWNED;

        return true;
}

/*
 * Handle a trapped system-register access for one of the EL1 registers
 * listed below by writing the source GPR of the trapped instruction
 * straight into that register and skipping the instruction.
 *
 * Returns true when the access was handled that way, false when the
 * vCPU itself has HCR_TVM set or the register is not in the list.
 */
static inline bool handle_tx2_tvm(struct kvm_vcpu *vcpu)
{
        u32 reg;
        u64 data;

        /*
         * The normal sysreg handling code expects to see the traps,
         * let's not do anything here.
         */
        if (vcpu->arch.hcr_el2 & HCR_TVM)
                return false;

        reg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));
        data = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));

        switch (reg) {
        case SYS_SCTLR_EL1:
                write_sysreg_el1(data, SYS_SCTLR);
                break;
        case SYS_TTBR0_EL1:
                write_sysreg_el1(data, SYS_TTBR0);
                break;
        case SYS_TTBR1_EL1:
                write_sysreg_el1(data, SYS_TTBR1);
                break;
        case SYS_TCR_EL1:
                write_sysreg_el1(data, SYS_TCR);
                break;
        case SYS_ESR_EL1:
                write_sysreg_el1(data, SYS_ESR);
                break;
        case SYS_FAR_EL1:
                write_sysreg_el1(data, SYS_FAR);
                break;
        case SYS_AFSR0_EL1:
                write_sysreg_el1(data, SYS_AFSR0);
                break;
        case SYS_AFSR1_EL1:
                write_sysreg_el1(data, SYS_AFSR1);
                break;
        case SYS_MAIR_EL1:
                write_sysreg_el1(data, SYS_MAIR);
                break;
        case SYS_AMAIR_EL1:
                write_sysreg_el1(data, SYS_AMAIR);
                break;
        case SYS_CONTEXTIDR_EL1:
                write_sysreg_el1(data, SYS_CONTEXTIDR);
                break;
        default:
                /* not one of ours */
                return false;
        }

        __kvm_skip_instr(vcpu);
        return true;
}

/*
 * Open-coded version of timer_get_offset() to allow for kern_hyp_va().
 *
 * Returns the sum of the VM-wide and per-vCPU offsets referenced by @ctxt;
 * an offset whose pointer is NULL contributes zero.
 */
static inline u64 hyp_timer_get_offset(struct arch_timer_context *ctxt)
{
        u64 vm_part = ctxt->offset.vm_offset ?
                      *kern_hyp_va(ctxt->offset.vm_offset) : 0;
        u64 vcpu_part = ctxt->offset.vcpu_offset ?
                        *kern_hyp_va(ctxt->offset.vcpu_offset) : 0;

        return vm_part + vcpu_part;
}

/*
 * Counter value for timer context @ctxt: the value returned by
 * arch_timer_read_cntpct_el0() minus the context's combined offset.
 */
static inline u64 compute_counter_value(struct arch_timer_context *ctxt)
{
        u64 now = arch_timer_read_cntpct_el0();
        u64 delta = hyp_timer_get_offset(ctxt);

        return now - delta;
}

/*
 * Emulate a trapped read of CNTPCT(SS)_EL0 or CNTVCT(SS)_EL0 by storing
 * the offset-adjusted counter into the destination register and skipping
 * the instruction.
 *
 * Returns true if the access was emulated here, false if it has to take
 * the regular handling path (hyp context, a different register, or a trap
 * the guest hypervisor has configured for itself).
 */
static bool kvm_handle_cntxct(struct kvm_vcpu *vcpu)
{
        struct arch_timer_context *timer;
        u64 cnthctl, counter;
        u32 reg;

        /*
         * We only get here for 64bit guests, 32bit guests will hit
         * the long and winding road all the way to the standard
         * handling. Yes, it sucks to be irrelevant.
         *
         * Also, we only deal with non-hypervisor context here (either
         * an EL1 guest, or a non-HYP context of an EL2 guest).
         */
        if (is_hyp_ctxt(vcpu))
                return false;

        reg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu));

        switch (reg) {
        case SYS_CNTPCT_EL0:
        case SYS_CNTPCTSS_EL0:
                if (vcpu_has_nv(vcpu)) {
                        /* Check for guest hypervisor trapping */
                        cnthctl = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);

                        /* without E2H the enable bit sits 10 bits lower */
                        if (!vcpu_el2_e2h_is_set(vcpu))
                                cnthctl = (cnthctl & CNTHCTL_EL1PCTEN) << 10;

                        if (!(cnthctl & (CNTHCTL_EL1PCTEN << 10)))
                                return false;
                }

                timer = vcpu_ptimer(vcpu);
                break;

        case SYS_CNTVCT_EL0:
        case SYS_CNTVCTSS_EL0:
                if (vcpu_has_nv(vcpu)) {
                        /* Check for guest hypervisor trapping */
                        cnthctl = __vcpu_sys_reg(vcpu, CNTHCTL_EL2);

                        if (cnthctl & CNTHCTL_EL1TVCT)
                                return false;
                }

                timer = vcpu_vtimer(vcpu);
                break;

        default:
                return false;
        }

        counter = compute_counter_value(timer);
        vcpu_set_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu), counter);

        __kvm_skip_instr(vcpu);
        return true;
}

/*
 * Handle a trapped write to TCR_EL1: the value is written to the register
 * with the HA and HD bits forced to zero and the instruction is skipped.
 * Returns false for any other register, true once the write is done.
 */
static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu)
{
        u64 tcr;

        if (esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)) != SYS_TCR_EL1)
                return false;

        tcr = vcpu_get_reg(vcpu, kvm_vcpu_sys_get_rt(vcpu));

        /*
         * Affected parts do not advertise support for hardware Access Flag /
         * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying
         * control bits are still functional. The architecture requires these be
         * RES0 on systems that do not implement FEAT_HAFDBS.
         *
         * Uphold the requirements of the architecture by masking guest writes
         * to TCR_EL1.{HA,HD} here.
         */
        tcr &= ~(TCR_HD | TCR_HA);
        write_sysreg_el1(tcr, SYS_TCR);

        __kvm_skip_instr(vcpu);
        return true;
}

/*
 * Exit handler for trapped system-register accesses. The candidates are
 * tried in a fixed order and the first one that handles the access wins:
 * the TX2 TVM workaround, the AmpereOne TCR workaround, the GICv3 CPU
 * interface emulation, and finally the counter read emulation.
 * Returns true if one of them handled the access. @exit_code is unused.
 */
static inline bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        return (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) &&
                handle_tx2_tvm(vcpu)) ||
               (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) &&
                handle_ampere1_tcr(vcpu)) ||
               (static_branch_unlikely(&vgic_v3_cpuif_trap) &&
                __vgic_v3_perform_cpuif_access(vcpu) == 1) ||
               kvm_handle_cntxct(vcpu);
}

/*
 * Exit handler for trapped 32-bit cp15 accesses: returns true only when
 * GICv3 CPU interface trapping is enabled and the emulation reports the
 * access as handled (return value 1). @exit_code is unused.
 */
static inline bool kvm_hyp_handle_cp15_32(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        return static_branch_unlikely(&vgic_v3_cpuif_trap) &&
               __vgic_v3_perform_cpuif_access(vcpu) == 1;
}

/*
 * Common handler for guest memory faults: gather the fault information
 * for @vcpu. Returns true (go back to the guest) when
 * __populate_fault_info() fails, false when the fault information is
 * available. @exit_code is unused.
 */
static inline bool kvm_hyp_handle_memory_fault(struct kvm_vcpu *vcpu,
                                               u64 *exit_code)
{
        const bool populated = __populate_fault_info(vcpu);

        return !populated;
}
/* Instruction aborts and watchpoints share the generic memory fault handler */
#define kvm_hyp_handle_iabt_low                kvm_hyp_handle_memory_fault
#define kvm_hyp_handle_watchpt_low        kvm_hyp_handle_memory_fault

/*
 * Exit handler for data aborts. After the generic memory fault handling,
 * and only when GICv2 CPU interface trapping is enabled, a translation
 * fault with valid syndrome that is neither an external abort nor a
 * stage-1 table walk fault is offered to the GICv2 CPU interface
 * emulation.
 *
 * Returns true when the guest can be resumed. On an illegal access
 * (emulation returns -1) *@exit_code is changed to
 * ARM_EXCEPTION_EL1_SERROR and false is returned.
 */
static inline bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        int ret;

        if (kvm_hyp_handle_memory_fault(vcpu, exit_code))
                return true;

        if (!static_branch_unlikely(&vgic_v2_cpuif_trap))
                return false;

        /* only a plain, fully described translation fault qualifies */
        if (!kvm_vcpu_trap_is_translation_fault(vcpu) ||
            !kvm_vcpu_dabt_isvalid(vcpu) ||
            kvm_vcpu_abt_issea(vcpu) ||
            kvm_vcpu_abt_iss1tw(vcpu))
                return false;

        ret = __vgic_v2_perform_cpuif_access(vcpu);
        if (ret == 1)
                return true;

        /* Promote an illegal access to an SError.*/
        if (ret == -1)
                *exit_code = ARM_EXCEPTION_EL1_SERROR;

        return false;
}

/* Signature shared by all hyp exit handlers: (vcpu, exit code) -> handled? */
typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);

/*
 * Allow the hypervisor to handle the exit with an exit handler if it has one.
 * The handler is looked up in @handlers by the exception class of the trap.
 *
 * Returns true if the hypervisor handled the exit, and control should go back
 * to the guest, or false if it hasn't (including when no handler is
 * registered for that class).
 */
static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
                                       const exit_handler_fn *handlers)
{
        exit_handler_fn handler = handlers[kvm_vcpu_trap_get_class(vcpu)];

        return handler ? handler(vcpu, exit_code) : false;
}

/*
 * Refresh the vCPU's saved PSTATE from SPSR_EL2, first repairing SPSR_EL2
 * from the vCPU's copy when the Cortex-A510 #2077057 conditions are met.
 * @exit_code is unused.
 */
static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code)
{
        bool spsr_untrusted;

        /*
         * Check for the conditions of Cortex-A510's #2077057. When these occur
         * SPSR_EL2 can't be trusted, but isn't needed either as it is
         * unchanged from the value in vcpu_gp_regs(vcpu)->pstate.
         * Are we single-stepping the guest, and took a PAC exception from the
         * active-not-pending state?
         */
        spsr_untrusted = cpus_have_final_cap(ARM64_WORKAROUND_2077057) &&
                         (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) &&
                         (*vcpu_cpsr(vcpu) & DBG_SPSR_SS) &&
                         ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC;

        if (spsr_untrusted)
                write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR);

        vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
}

/*
 * Return true when we were able to fixup the guest exit and should return to
 * the guest, false when we should restore the host state and return to the
 * main run loop.
 *
 * @handlers is the per-exception-class handler table consulted for traps.
 */
static inline bool __fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code,
                                      const exit_handler_fn *handlers)
{
        if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) {
                /* anything but an IRQ comes with a syndrome worth keeping */
                vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);

                if (ARM_SERROR_PENDING(*exit_code)) {
                        u8 ec = kvm_vcpu_trap_get_class(vcpu);

                        /*
                         * HVC already have an adjusted PC, which we need to
                         * correct in order to return to after having injected
                         * the SError.
                         *
                         * SMC, on the other hand, is *trapped*, meaning its
                         * preferred return address is the SMC itself.
                         */
                        if (ec == ESR_ELx_EC_HVC32 || ec == ESR_ELx_EC_HVC64)
                                write_sysreg_el2(read_sysreg_el2(SYS_ELR) - 4, SYS_ELR);
                }
        }

        /*
         * We're using the raw exception code in order to only process
         * the trap if no SError is pending. We will come back to the
         * same PC once the SError has been injected, and replay the
         * trapping instruction.
         */
        if (*exit_code != ARM_EXCEPTION_TRAP)
                return false;   /* back to the host kernel */

        /* Check if there's an exit handler and allow it to handle the exit. */
        if (!kvm_hyp_handle_exit(vcpu, exit_code, handlers))
                return false;   /* back to the host kernel */

        /* Re-enter the guest */
        asm(ALTERNATIVE("nop", "dmb sy", ARM64_WORKAROUND_1508412));
        return true;
}

static inline void __kvm_unexpected_el2_exception(void)
{
        extern char __guest_exit_restore_elr_and_panic[];
        unsigned long addr, fixup;
        struct kvm_exception_table_entry *entry, *end;
        unsigned long elr_el2 = read_sysreg(elr_el2);

        entry = &__start___kvm_ex_table;
        end = &__stop___kvm_ex_table;

        while (entry < end) {
                addr = (unsigned long)&entry->insn + entry->insn;
                fixup = (unsigned long)&entry->fixup + entry->fixup;

                if (addr != elr_el2) {
                        entry++;
                        continue;
                }

                write_sysreg(fixup, elr_el2);
                return;
        }

        /* Trigger a panic after restoring the hyp context. */
        this_cpu_ptr(&kvm_hyp_ctxt)->sys_regs[ELR_EL2] = elr_el2;
        write_sysreg(__guest_exit_restore_elr_and_panic, elr_el2);
}

#endif /* __ARM64_KVM_HYP_SWITCH_H__ */





























  179 







  179 












    3 






    3 











































































































































































































   39 


    7 



   34 
   38 
    7 











































































  179 






  179 















































































































































































































  179 




  179 










1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Implementation of the extensible bitmap type.
 *
 * Author : Stephen Smalley, <stephen.smalley.work@gmail.com>
 */
/*
 * Updated: Hewlett-Packard <paul@paul-moore.com>
 *          Added support to import/export the NetLabel category bitmap
 *          (c) Copyright Hewlett-Packard Development Company, L.P., 2006
 *
 * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com>
 *          Applied standard bit operations to improve bitmap scanning.
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/jhash.h>
#include <net/netlabel.h>
#include "ebitmap.h"
#include "policydb.h"

/* Number of bits in a u64, expressed as a u32. */
#define BITS_PER_U64 ((u32)(sizeof(u64) * 8))

/* Slab cache that the struct ebitmap_node allocations in this file come from. */
static struct kmem_cache *ebitmap_node_cachep __ro_after_init;

/*
 * Compare two ebitmaps. They are equal when their highbit values match
 * and their node lists have the same length with pairwise identical
 * startbit and map contents.
 */
bool ebitmap_equal(const struct ebitmap *e1, const struct ebitmap *e2)
{
        const struct ebitmap_node *a, *b;

        if (e1->highbit != e2->highbit)
                return false;

        for (a = e1->node, b = e2->node; a && b; a = a->next, b = b->next) {
                if (a->startbit != b->startbit)
                        return false;
                if (memcmp(a->maps, b->maps, EBITMAP_SIZE / 8))
                        return false;
        }

        /* equal only if both lists ran out together */
        return !a && !b;
}

/*
 * Make @dst a node-by-node copy of @src. @dst is initialised first, so
 * any previous contents are not released here.
 *
 * Returns 0 on success. If a node allocation fails, the partial copy is
 * destroyed and -ENOMEM is returned.
 */
int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src)
{
        const struct ebitmap_node *from;
        struct ebitmap_node **link;
        struct ebitmap_node *copy;

        ebitmap_init(dst);

        /* @link always points at the slot the next copied node goes into */
        link = &dst->node;
        for (from = src->node; from; from = from->next) {
                copy = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC);
                if (!copy) {
                        ebitmap_destroy(dst);
                        return -ENOMEM;
                }

                copy->startbit = from->startbit;
                memcpy(copy->maps, from->maps, EBITMAP_SIZE / 8);
                copy->next = NULL;

                *link = copy;
                link = &copy->next;
        }

        dst->highbit = src->highbit;
        return 0;
}

/*
 * Build in @dst the intersection of @e1 and @e2: every bit set in @e1
 * that is also set in @e2 is set in @dst. @dst is initialised first, so
 * any previous contents are not released here.
 *
 * Returns 0 on success or the negative error from ebitmap_set_bit(). On
 * failure the partially built @dst is released with ebitmap_destroy(),
 * matching what ebitmap_cpy() does, so a caller that just propagates the
 * error does not leak the nodes allocated so far.
 */
int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1,
                const struct ebitmap *e2)
{
        struct ebitmap_node *n;
        u32 bit;
        int rc;

        ebitmap_init(dst);

        ebitmap_for_each_positive_bit(e1, n, bit)
        {
                if (!ebitmap_get_bit(e2, bit))
                        continue;

                rc = ebitmap_set_bit(dst, bit, 1);
                if (rc < 0) {
                        /* do not hand a half-built bitmap back to the caller */
                        ebitmap_destroy(dst);
                        return rc;
                }
        }
        return 0;
}

#ifdef CONFIG_NETLABEL
/**
 * ebitmap_netlbl_export - Export an ebitmap into a NetLabel category bitmap
 * @ebmap: the ebitmap to export
 * @catmap: the NetLabel category bitmap
 *
 * Description:
 * Export a SELinux extensible bitmap into a NetLabel category bitmap.
 * An empty @ebmap yields a NULL @catmap. Otherwise any category bitmap
 * already present in @catmap is freed and a new one is built from the
 * non-zero map words of @ebmap.
 * Returns zero on success, negative values on error. On error the
 * partially built category bitmap is freed and @catmap is reset to NULL
 * so the caller is never left holding a pointer to freed memory.
 *
 */
int ebitmap_netlbl_export(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap **catmap)
{
        struct ebitmap_node *e_iter = ebmap->node;
        unsigned long e_map;
        u32 offset;
        unsigned int iter;
        int rc;

        if (e_iter == NULL) {
                *catmap = NULL;
                return 0;
        }

        if (*catmap != NULL)
                netlbl_catmap_free(*catmap);
        *catmap = NULL;

        while (e_iter) {
                /* walk the node one map word at a time */
                offset = e_iter->startbit;
                for (iter = 0; iter < EBITMAP_UNIT_NUMS; iter++) {
                        e_map = e_iter->maps[iter];
                        if (e_map != 0) {
                                rc = netlbl_catmap_setlong(catmap, offset,
                                                           e_map, GFP_ATOMIC);
                                if (rc != 0)
                                        goto netlbl_export_failure;
                        }
                        offset += EBITMAP_UNIT_SIZE;
                }
                e_iter = e_iter->next;
        }

        return 0;

netlbl_export_failure:
        netlbl_catmap_free(*catmap);
        /* the memory is gone; make sure the caller's pointer says so */
        *catmap = NULL;
        return -ENOMEM;
}

/**
 * ebitmap_netlbl_import - Import a NetLabel category bitmap into an ebitmap
 * @ebmap: the ebitmap to import
 * @catmap: the NetLabel category bitmap
 *
 * Description:
 * Import a NetLabel category bitmap into a SELinux extensible bitmap.
 * The category bitmap is read one unsigned long at a time; all-zero words
 * are skipped and a new ebitmap node is appended whenever a word falls
 * beyond the node currently being filled.
 * Returns zero on success, negative values on error. On error @ebmap is
 * destroyed and -ENOMEM is returned.
 *
 */
int ebitmap_netlbl_import(struct ebitmap *ebmap,
                          struct netlbl_lsm_catmap *catmap)
{
        struct ebitmap_node *cur = NULL;
        struct ebitmap_node *last;
        unsigned long word;
        u32 pos = 0;

        for (;;) {
                if (netlbl_catmap_getlong(catmap, &pos, &word) < 0)
                        goto netlbl_import_failure;

                /* an offset of (u32)-1 ends the walk */
                if (pos == (u32)-1)
                        return 0;

                /* don't waste ebitmap space if the netlabel bitmap is empty */
                if (word != 0) {
                        if (!cur || pos >= cur->startbit + EBITMAP_SIZE) {
                                last = cur;
                                cur = kmem_cache_zalloc(ebitmap_node_cachep,
                                                        GFP_ATOMIC);
                                if (!cur)
                                        goto netlbl_import_failure;

                                cur->startbit = pos - (pos % EBITMAP_SIZE);
                                if (last)
                                        last->next = cur;
                                else
                                        ebmap->node = cur;
                                ebmap->highbit = cur->startbit + EBITMAP_SIZE;
                        }

                        /* offset will always be aligned to an unsigned long */
                        cur->maps[EBITMAP_NODE_INDEX(cur, pos)] = word;
                }

                /* next */
                pos += EBITMAP_UNIT_SIZE;
        }

netlbl_import_failure:
        ebitmap_destroy(ebmap);
        return -ENOMEM;
}
#endif /* CONFIG_NETLABEL */

/*
 * Check to see if all the bits set in e2 are also set in e1. Optionally,
 * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed
 * last_e2bit.
 *
 * Returns 1 when e1 contains e2 (and the optional limit holds), 0 otherwise.
 */
int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2,
                     u32 last_e2bit)
{
        const struct ebitmap_node *sup, *sub;
        int i;

        if (e1->highbit < e2->highbit)
                return 0;

        sup = e1->node;
        sub = e2->node;

        while (sup && sub && sup->startbit <= sub->startbit) {
                /* e1 nodes that start before the current e2 node are irrelevant */
                if (sup->startbit < sub->startbit) {
                        sup = sup->next;
                        continue;
                }

                /* Skip trailing NULL map entries */
                i = EBITMAP_UNIT_NUMS - 1;
                while (i >= 0 && !sub->maps[i])
                        i--;

                if (last_e2bit && i >= 0) {
                        u32 highest = sub->startbit + i * EBITMAP_UNIT_SIZE +
                                      __fls(sub->maps[i]);

                        if (highest > last_e2bit)
                                return 0;
                }

                /* any bit of e2 that e1 lacks disqualifies */
                for (; i >= 0; i--) {
                        if (sub->maps[i] & ~sup->maps[i])
                                return 0;
                }

                sup = sup->next;
                sub = sub->next;
        }

        /* leftover e2 nodes have no counterpart in e1 */
        return sub ? 0 : 1;
}

/*
 * Return the value of @bit in @e as reported by ebitmap_node_get_bit()
 * for the node covering it, or 0 when @bit lies beyond the bitmap's
 * highbit or no node covers it.
 */
int ebitmap_get_bit(const struct ebitmap *e, u32 bit)
{
        const struct ebitmap_node *node;

        if (bit > e->highbit)
                return 0;

        /* walk the nodes whose range starts at or before @bit */
        for (node = e->node; node && node->startbit <= bit; node = node->next) {
                if (bit < node->startbit + EBITMAP_SIZE)
                        return ebitmap_node_get_bit(node, bit);
        }

        return 0;
}

/*
 * Set (value != 0) or clear (value == 0) one bit in the ebitmap.
 *
 * Setting a bit that no existing node covers allocates a new node with
 * GFP_ATOMIC and links it into the list. Clearing the last set bit of a
 * node unlinks and frees that node. Clearing a bit that no node covers is
 * a no-op.
 *
 * Returns 0 on success or -ENOMEM if a new node cannot be allocated.
 */
int ebitmap_set_bit(struct ebitmap *e, u32 bit, int value)
{
        struct ebitmap_node *n, *prev, *new;

        prev = NULL;
        n = e->node;
        /* walk nodes whose startbit does not exceed bit; prev trails n */
        while (n && n->startbit <= bit) {
                if ((n->startbit + EBITMAP_SIZE) > bit) {
                        /* n covers the requested bit */
                        if (value) {
                                ebitmap_node_set_bit(n, bit);
                        } else {
                                u32 s;

                                ebitmap_node_clr_bit(n, bit);

                                /* keep the node while any bit in it is still set */
                                s = find_first_bit(n->maps, EBITMAP_SIZE);
                                if (s < EBITMAP_SIZE)
                                        return 0;

                                /* drop this node from the bitmap */
                                if (!n->next) {
                                        /*
                                         * this was the highest map
                                         * within the bitmap
                                         */
                                        if (prev)
                                                e->highbit = prev->startbit +
                                                             EBITMAP_SIZE;
                                        else
                                                e->highbit = 0;
                                }
                                /* unlink n before handing it back to the cache */
                                if (prev)
                                        prev->next = n->next;
                                else
                                        e->node = n->next;
                                kmem_cache_free(ebitmap_node_cachep, n);
                        }
                        return 0;
                }
                prev = n;
                n = n->next;
        }

        /* no node covers bit: nothing to clear */
        if (!value)
                return 0;

        new = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC);
        if (!new)
                return -ENOMEM;

        /* align the new node's start down to a multiple of EBITMAP_SIZE */
        new->startbit = bit - (bit % EBITMAP_SIZE);
        ebitmap_node_set_bit(new, bit);

        if (!n)
                /* this node will be the highest map within the bitmap */
                e->highbit = new->startbit + EBITMAP_SIZE;

        /* insert after the last node visited, or at the head of the list */
        if (prev) {
                new->next = prev->next;
                prev->next = new;
        } else {
                new->next = e->node;
                e->node = new;
        }

        return 0;
}

/*
 * Free every node of the ebitmap and reset it to the empty state.
 * A NULL ebitmap pointer is accepted and ignored.
 */
void ebitmap_destroy(struct ebitmap *e)
{
        struct ebitmap_node *cur, *following;

        if (!e)
                return;

        for (cur = e->node; cur; cur = following) {
                /* fetch the successor before the node is freed */
                following = cur->next;
                kmem_cache_free(ebitmap_node_cachep, cur);
        }

        e->highbit = 0;
        e->node = NULL;
}

int ebitmap_read(struct ebitmap *e, struct policy_file *fp)
{
        struct ebitmap_node *n = NULL;
        u32 mapunit, count, startbit, index, i;
        __le32 ebitmap_start;
        u64 map;
        __le64 mapbits;
        __le32 buf[3];
        int rc;

        ebitmap_init(e);

        rc = next_entry(buf, fp, sizeof buf);
        if (rc < 0)
                goto out;

        mapunit = le32_to_cpu(buf[0]);
        e->highbit = le32_to_cpu(buf[1]);
        count = le32_to_cpu(buf[2]);

        if (mapunit != BITS_PER_U64) {
                pr_err("SELinux: ebitmap: map size %u does not "
                       "match my size %u (high bit was %u)\n",
                       mapunit, BITS_PER_U64, e->highbit);
                goto bad;
        }

        /* round up e->highbit */
        e->highbit += EBITMAP_SIZE - 1;
        e->highbit -= (e->highbit % EBITMAP_SIZE);

        if (!e->highbit) {
                e->node = NULL;
                goto ok;
        }

        if (e->highbit && !count)
                goto bad;

        for (i = 0; i < count; i++) {
                rc = next_entry(&ebitmap_start, fp, sizeof(u32));
                if (rc < 0) {
                        pr_err("SELinux: ebitmap: truncated map\n");
                        goto bad;
                }
                startbit = le32_to_cpu(ebitmap_start);

                if (startbit & (mapunit - 1)) {
                        pr_err("SELinux: ebitmap start bit (%u) is "
                               "not a multiple of the map unit size (%u)\n",
                               startbit, mapunit);
                        goto bad;
                }
                if (startbit > e->highbit - mapunit) {
                        pr_err("SELinux: ebitmap start bit (%u) is "
                               "beyond the end of the bitmap (%u)\n",
                               startbit, (e->highbit - mapunit));
                        goto bad;
                }

                if (!n || startbit >= n->startbit + EBITMAP_SIZE) {
                        struct ebitmap_node *tmp;
                        tmp = kmem_cache_zalloc(ebitmap_node_cachep,
                                                GFP_KERNEL);
                        if (!tmp) {
                                pr_err("SELinux: ebitmap: out of memory\n");
                                rc = -ENOMEM;
                                goto bad;
                        }
                        /* round down */
                        tmp->startbit = startbit - (startbit % EBITMAP_SIZE);
                        if (n)
                                n->next = tmp;
                        else
                                e->node = tmp;
                        n = tmp;
                } else if (startbit <= n->startbit) {
                        pr_err("SELinux: ebitmap: start bit %u"
                               " comes after start bit %u\n",
                               startbit, n->startbit);
                        goto bad;
                }

                rc = next_entry(&mapbits, fp, sizeof(u64));
                if (rc < 0) {
                        pr_err("SELinux: ebitmap: truncated map\n");
                        goto bad;
                }
                map = le64_to_cpu(mapbits);
                if (!map) {
                        pr_err("SELinux: ebitmap: empty map\n");
                        goto bad;
                }

                index = (startbit - n->startbit) / EBITMAP_UNIT_SIZE;
                while (map) {
                        n->maps[index++] = map & (-1UL);
                        map = EBITMAP_SHIFT_UNIT_SIZE(map);
                }
        }

        if (n && n->startbit + EBITMAP_SIZE != e->highbit) {
                pr_err("SELinux: ebitmap: high bit %u is not equal to the expected value %zu\n",
                       e->highbit, n->startbit + EBITMAP_SIZE);
                goto bad;
        }

ok:
        rc = 0;
out:
        return rc;
bad:
        if (!rc)
                rc = -EINVAL;
        ebitmap_destroy(e);
        goto out;
}

/*
 * Write an ebitmap to a policy file.
 *
 * The output is a header of three little-endian u32 values (BITS_PER_U64,
 * the highest set bit + 1 rounded up to a multiple of BITS_PER_U64, and the
 * number of 64-bit units that contain at least one set bit), followed by
 * one (u32 start bit, u64 map) pair per such unit.
 *
 * Returns 0 on success or the non-zero value returned by put_entry().
 */
int ebitmap_write(const struct ebitmap *e, struct policy_file *fp)
{
        struct ebitmap_node *n;
        u32 bit, count, last_bit, last_startbit;
        __le32 buf[3];
        u64 map;
        int rc;

        buf[0] = cpu_to_le32(BITS_PER_U64);

        /* first pass: count the 64-bit units in use and find the high bit */
        count = 0;
        last_bit = 0;
        last_startbit = U32_MAX;
        ebitmap_for_each_positive_bit(e, n, bit)
        {
                /* U32_MAX marks "no unit seen yet" */
                if (last_startbit == U32_MAX ||
                    rounddown(bit, BITS_PER_U64) > last_startbit) {
                        count++;
                        last_startbit = rounddown(bit, BITS_PER_U64);
                }
                last_bit = roundup(bit + 1, BITS_PER_U64);
        }
        buf[1] = cpu_to_le32(last_bit);
        buf[2] = cpu_to_le32(count);

        rc = put_entry(buf, sizeof(u32), 3, fp);
        if (rc)
                return rc;

        /* second pass: accumulate each unit's bits and emit completed units */
        map = 0;
        last_startbit = U32_MAX;
        ebitmap_for_each_positive_bit(e, n, bit)
        {
                if (last_startbit == U32_MAX ||
                    rounddown(bit, BITS_PER_U64) > last_startbit) {
                        __le64 buf64[1];

                        /* this is the very first bit */
                        if (!map) {
                                last_startbit = rounddown(bit, BITS_PER_U64);
                                map = (u64)1 << (bit - last_startbit);
                                continue;
                        }

                        /* write the last node */
                        buf[0] = cpu_to_le32(last_startbit);
                        rc = put_entry(buf, sizeof(u32), 1, fp);
                        if (rc)
                                return rc;

                        buf64[0] = cpu_to_le64(map);
                        rc = put_entry(buf64, sizeof(u64), 1, fp);
                        if (rc)
                                return rc;

                        /* set up for the next node */
                        map = 0;
                        last_startbit = rounddown(bit, BITS_PER_U64);
                }
                map |= (u64)1 << (bit - last_startbit);
        }
        /* write the last node */
        if (map) {
                __le64 buf64[1];

                /* write the last node */
                buf[0] = cpu_to_le32(last_startbit);
                rc = put_entry(buf, sizeof(u32), 1, fp);
                if (rc)
                        return rc;

                buf64[0] = cpu_to_le64(map);
                rc = put_entry(buf64, sizeof(u64), 1, fp);
                if (rc)
                        return rc;
        }
        return 0;
}

/*
 * Fold an ebitmap into a running jhash value and return the new hash.
 * The high bit, then each node's start bit and map words, are mixed in
 * list order.
 */
u32 ebitmap_hash(const struct ebitmap *e, u32 hash)
{
        struct ebitmap_node *cur = e->node;

        /* highbit goes in first so an empty ebitmap still alters the hash */
        hash = jhash_1word(e->highbit, hash);

        while (cur) {
                hash = jhash_1word(cur->startbit, hash);
                hash = jhash(cur->maps, sizeof(cur->maps), hash);
                cur = cur->next;
        }

        return hash;
}

/*
 * Create the slab cache that ebitmap nodes are allocated from.
 * The cache is created with SLAB_PANIC and the result is not checked here.
 * NOTE(review): presumably creation failure is fatal at init time - confirm.
 */
void __init ebitmap_cache_init(void)
{
        ebitmap_node_cachep = KMEM_CACHE(ebitmap_node, SLAB_PANIC);
}













































































































































    2 








    2 














1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2012 ARM Ltd.
 */
#ifndef __ASM_HW_BREAKPOINT_H
#define __ASM_HW_BREAKPOINT_H

#include <asm/cputype.h>
#include <asm/cpufeature.h>
#include <asm/sysreg.h>
#include <asm/virt.h>

/*
 * Decoded fields of a breakpoint/watchpoint control value.
 *
 * encode_ctrl_reg() packs them into a u32 as: enabled at bit 0, privilege
 * at bits 2:1, type at bits 4:3 and len at bits 12:5. decode_ctrl_reg()
 * performs the reverse extraction.
 */
struct arch_hw_breakpoint_ctrl {
        u32 __reserved        : 19,
        len                : 8,
        type                : 2,
        privilege        : 2,
        enabled                : 1;
};

/*
 * Architecture-specific breakpoint state: a 64-bit address, a 64-bit
 * trigger value and the decoded control fields.
 * NOTE(review): presumably `trigger` records the address that fired the
 * breakpoint - confirm against the users of this struct.
 */
struct arch_hw_breakpoint {
        u64 address;
        u64 trigger;
        struct arch_hw_breakpoint_ctrl ctrl;
};

/* Privilege Levels */
#define AARCH64_BREAKPOINT_EL1        1
#define AARCH64_BREAKPOINT_EL0        2

#define DBG_HMC_HYP                (1 << 13)

/*
 * Pack the control fields into a register value: enabled at bit 0,
 * privilege at bits 2:1, type at bits 4:3, len from bit 5 upwards.
 * DBG_HMC_HYP is OR-ed in for EL1 breakpoints when the kernel runs in
 * hyp mode.
 */
static inline u32 encode_ctrl_reg(struct arch_hw_breakpoint_ctrl ctrl)
{
        u32 val;

        val = ctrl.enabled;
        val |= ctrl.privilege << 1;
        val |= ctrl.type << 3;
        val |= ctrl.len << 5;

        if (is_kernel_in_hyp_mode() && ctrl.privilege == AARCH64_BREAKPOINT_EL1)
                val |= DBG_HMC_HYP;

        return val;
}

/*
 * Unpack a control register value into *ctrl: enabled from bit 0,
 * privilege from bits 2:1, type from bits 4:3, len from bits 12:5.
 * Higher bits of reg are ignored.
 */
static inline void decode_ctrl_reg(u32 reg,
                                   struct arch_hw_breakpoint_ctrl *ctrl)
{
        ctrl->enabled        = reg & 0x1;
        ctrl->privilege        = (reg >> 1) & 0x3;
        ctrl->type        = (reg >> 3) & 0x3;
        ctrl->len        = (reg >> 5) & 0xff;
}

/* Breakpoint */
#define ARM_BREAKPOINT_EXECUTE        0

/* Watchpoints */
#define ARM_BREAKPOINT_LOAD        1
#define ARM_BREAKPOINT_STORE        2

/* Lengths */
#define ARM_BREAKPOINT_LEN_1        0x1
#define ARM_BREAKPOINT_LEN_2        0x3
#define ARM_BREAKPOINT_LEN_3        0x7
#define ARM_BREAKPOINT_LEN_4        0xf
#define ARM_BREAKPOINT_LEN_5        0x1f
#define ARM_BREAKPOINT_LEN_6        0x3f
#define ARM_BREAKPOINT_LEN_7        0x7f
#define ARM_BREAKPOINT_LEN_8        0xff

/* Kernel stepping */
#define ARM_KERNEL_STEP_NONE        0
#define ARM_KERNEL_STEP_ACTIVE        1
#define ARM_KERNEL_STEP_SUSPEND        2

/*
 * Limits.
 * Changing these will require modifications to the register accessors.
 */
#define ARM_MAX_BRP                16
#define ARM_MAX_WRP                16

/* Virtual debug register bases. */
#define AARCH64_DBG_REG_BVR        0
#define AARCH64_DBG_REG_BCR        (AARCH64_DBG_REG_BVR + ARM_MAX_BRP)
#define AARCH64_DBG_REG_WVR        (AARCH64_DBG_REG_BCR + ARM_MAX_BRP)
#define AARCH64_DBG_REG_WCR        (AARCH64_DBG_REG_WVR + ARM_MAX_WRP)

/* Debug register names. */
#define AARCH64_DBG_REG_NAME_BVR        bvr
#define AARCH64_DBG_REG_NAME_BCR        bcr
#define AARCH64_DBG_REG_NAME_WVR        wvr
#define AARCH64_DBG_REG_NAME_WCR        wcr

/*
 * Accessor macros for the debug registers.
 *
 * The system register name is formed by token pasting as dbg<REG><N>_el1,
 * so REG and N are pasted exactly as written at the call site. VAL is the
 * lvalue assigned by the read macro and the value passed by the write macro.
 */
#define AARCH64_DBG_READ(N, REG, VAL) do {\
        VAL = read_sysreg(dbg##REG##N##_el1);\
} while (0)

#define AARCH64_DBG_WRITE(N, REG, VAL) do {\
        write_sysreg(VAL, dbg##REG##N##_el1);\
} while (0)

struct task_struct;
struct notifier_block;
struct perf_event_attr;
struct perf_event;
struct pmu;

extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl,
                                  int *gen_len, int *gen_type, int *offset);
extern int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw);
extern int hw_breakpoint_arch_parse(struct perf_event *bp,
                                    const struct perf_event_attr *attr,
                                    struct arch_hw_breakpoint *hw);
extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
                                           unsigned long val, void *data);

extern int arch_install_hw_breakpoint(struct perf_event *bp);
extern void arch_uninstall_hw_breakpoint(struct perf_event *bp);
extern void hw_breakpoint_pmu_read(struct perf_event *bp);
extern int hw_breakpoint_slots(int type);

/*
 * With CONFIG_HAVE_HW_BREAKPOINT these two hooks are defined elsewhere;
 * without it they are empty inline stubs, so callers need no #ifdef.
 */
#ifdef CONFIG_HAVE_HW_BREAKPOINT
extern void hw_breakpoint_thread_switch(struct task_struct *next);
extern void ptrace_hw_copy_thread(struct task_struct *task);
#else
static inline void hw_breakpoint_thread_switch(struct task_struct *next)
{
}
static inline void ptrace_hw_copy_thread(struct task_struct *task)
{
}
#endif

/*
 * Number of BRP registers available: the BRPs field of the sanitised
 * ID_AA64DFR0_EL1 value, plus one.
 */
static inline int get_num_brps(void)
{
        u64 dfr0;

        dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);

        return cpuid_feature_extract_unsigned_field(dfr0,
                                                ID_AA64DFR0_EL1_BRPs_SHIFT) + 1;
}

/*
 * Number of WRP registers available: the WRPs field of the sanitised
 * ID_AA64DFR0_EL1 value, plus one.
 */
static inline int get_num_wrps(void)
{
        u64 dfr0;

        dfr0 = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1);

        return cpuid_feature_extract_unsigned_field(dfr0,
                                                ID_AA64DFR0_EL1_WRPs_SHIFT) + 1;
}

/*
 * Register a callback taking an unsigned int and returning int.
 * With CONFIG_CPU_PM the function is defined elsewhere; without it the
 * inline stub discards the callback.
 * NOTE(review): presumably the callback restores debug state on resume and
 * its argument is a CPU number - confirm against the implementation.
 */
#ifdef CONFIG_CPU_PM
extern void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int));
#else
static inline void cpu_suspend_set_dbg_restorer(int (*hw_bp_restore)(unsigned int))
{
}
#endif

#endif        /* __ASM_BREAKPOINT_H */

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r3, 0x4068aea3, &(0x7f0000000180)={0xdf, 0x0, 0x1000000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x80800)
r3 = eventfd2(0x7, 0x800)
write$eventfd(r2, 0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={0xffffffffffffffff, 0xffff, 0x1, r3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013c038, &(0x7f0000000000)=0xff})

      
      r0 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r0, &(0x7f0000c00000/0x400000)=nil)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x40000, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x6, 0x8032, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)

      
      mmap$KVM_VCPU(&(0x7f0000581000/0x1000)=nil, 0x930, 0x0, 0x4020131, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="820000000000000028000000000000000200000000002200040000000000000001"], 0x28}, 0x0, 0x0)
r4 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r5=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r5, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x4, 0x3, 0x0})

      
      openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
r0 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
ioctl$KVM_SET_ONE_REG(r0, 0x4010aeac, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r5, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r5, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000180)=@x86={0xac, 0xd, 0x1, 0x0, 0xe4a, 0xa, 0x9, 0x6, 0x9, 0x3, 0x1, 0x9, 0x0, 0x6, 0x7fff, 0x6, 0x1, 0xfd, 0x1, '\x00', 0xd0, 0x5})
ioctl$KVM_IOEVENTFD(r4, 0x4040ae79, &(0x7f0000000140)={0xb5d6, 0x10000, 0x0, 0xffffffffffffffff, 0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x2, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000140)={0x0, 0x80000000, 0x1}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(0xffffffffffffffff, 0x4018aee1, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r2, 0x100000c, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
r3 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GUEST_DEBUG(0xffffffffffffffff, 0x4208ae9b, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
munmap(&(0x7f0000f12000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, 0xffffffffffffffff, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(0xffffffffffffffff, 0x4018aee3, &(0x7f0000000180)=@attr_pmu_irq={0x0, 0x0, 0x0, 0x0})
ioctl$KVM_IOEVENTFD(r3, 0x5452, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x0, 0x4003831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x2, 0x4003831, 0xffffffffffffffff, 0x0)
ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x3000007, 0x2012, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x2000007, 0x2012, r2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000000)={0xffffffffffffffff, 0x1, 0xea12157bff932e6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=[@msr={0x14, 0x20, {0x603000000013c4f1, 0x8000}}, @msr={0x14, 0x20, {0x603000000013c4f2, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce0, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce1, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce2, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce3, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce4, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce5, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce8, 0x8000}}, @msr={0x14, 0x20, {0x603000000013dce9, 0x8000}}], 0x140}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r5, 0xffffffffffffffff)
syz_kvm_assert_reg(r3, 0x603000000013c4f1, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013c4f2, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce0, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce1, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce2, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce3, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce4, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce5, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce8, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce9, 0x8000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xe1, 0x0, 0x2000})
ioctl$KVM_GET_DIRTY_LOG(r1, 0x4010ae42, &(0x7f0000000040)={0x10200, 0x0, &(0x7f0000d1e000/0x2000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x1, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xe})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_sys={0x603000000013804c, &(0x7f00000000c0)=0x8})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000340), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x88)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
ioctl$KVM_CREATE_VM(r1, 0x541b, 0x10000000000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000d46000/0x3000)=nil, r1, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000001c0)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeb5, &(0x7f00000000c0)={0x6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000b40), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bff000/0x400000)=nil)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000300)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0xc0000000, 0x10000, 0x2}})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x401054d5, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000100)={0xdf, 0x0, 0x4000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xdf, 0x0, 0x8000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0xa000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_fp_extra={0x60200000001000d9, &(0x7f00000000c0)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100012, 0xfffffffffffffffe})

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
r0 = mmap$KVM_VCPU(&(0x7f00006b5000/0x2000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, r2, 0x300000f, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, 0x930, 0x8, 0x8032, 0xffffffffffffffff, 0x0)
r2 = eventfd2(0x0, 0x80000)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0xd000, 0x0, 0x0, r2})
close(0x5)
close(0x4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2b, 0x8, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x4, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0xffff8001, 0x1, 0x0, 0x1, 0x1, '\x00', 0xe, 0x200})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x4b47, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000004c0), 0x22100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f0000000700)={0x1fe, 0x3, 0x1, 0x2000, &(0x7f0000f06000/0x2000)=nil, 0x7})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2c000020fbff675200006f8f1f449a7a835673312b54ebb2aa76bf69d22627e6fffffffffffffb00", 0x0, 0xfffffffffffffe57)
r4 = openat$kvm(0x0, &(0x7f0000000000), 0x101001, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r4, 0xae04)
mmap$KVM_VCPU(&(0x7f000002a000/0x2000)=nil, r5, 0xe, 0x4003831, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_core={0x603000000010001e, &(0x7f00000000c0)=0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4020ae46, &(0x7f00000000c0)=ANY=[@ANYBLOB="010000000100000000000001000000000010", @ANYRESDEC])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x0, 0x200000e, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_core={0x6030000000100036, &(0x7f00000001c0)=0x86c4})

      
      munmap(&(0x7f0000ce0000/0x3000)=nil, 0x3000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x1000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000880)=[{0x0, &(0x7f00000006c0)=[@smc={0x1e, 0x40, {0xc5000021, [0x7d, 0x5, 0x1ff, 0x154, 0xb8d5]}}, @svc={0x122, 0x40, {0x86000000, [0x3, 0x0, 0x7fffffffffffffff, 0x458, 0x100]}}], 0x80}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xef000000, 0x1000, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@smc={0x1e, 0x40, {0x80007fff, [0x0, 0x1, 0x2, 0x4, 0x4]}}], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_ccsidr={0x6020000000110005, &(0x7f0000000040)=0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000001c0)={0x5, 0x1f})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_core={0x603000000010003e, &(0x7f0000000180)=0xfff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x103a00, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000300)=@arm64_fw={0x6030000000140002, &(0x7f00000002c0)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_other={0x0, 0x8, 0xfffffffffffffffd, 0xfffffffffffffffe})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x603000000010003a, &(0x7f0000000100)=0x7})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_sys={0x603000000013e099, &(0x7f0000000140)=0xfffffffffffffff9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[], 0xe0}, 0x0, 0x0)
ioctl$KVM_GET_REG_LIST(r3, 0xc008aeb0, &(0x7f0000000000))

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x2)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x7, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x6030000000140002, &(0x7f0000000100)=0x100000000000a})

      
      ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
r0 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, &(0x7f0000000000)={0x1fe, 0x0, 0x8080000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
r3 = syz_kvm_add_vcpu$arm64(r0, &(0x7f0000000080)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="aa00000000000000280000000000000003"], 0x28}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_extra={0x603000000013df11, &(0x7f0000000040)})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x8, 0x4, &(0x7f00000001c0)=0xe5c5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(0xffffffffffffffff, 0x4018aee1, &(0x7f00000002c0)=@attr_pmu_init)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f00000001c0)={0x8, <r4=>0xffffffffffffffff})
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = syz_kvm_setup_syzos_vm$arm64(r6, &(0x7f0000c00000/0x400000)=nil)
r8 = syz_kvm_add_vcpu$arm64(r7, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@its_setup={0x82, 0x28, {0x1, 0x2001, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r6, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r6, 0xc00caee0, &(0x7f0000000100)={0x8, <r9=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r9, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, 0x0})
ioctl$KVM_RUN(r8, 0xae80, 0x0)
r10 = openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r11 = ioctl$KVM_CREATE_VM(r10, 0xae01, 0x0)
r12 = syz_kvm_setup_syzos_vm$arm64(r11, &(0x7f0000c00000/0x400000)=nil)
r13 = syz_kvm_add_vcpu$arm64(r12, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[], 0xe0}, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_REG_LIST(r13, 0xc008aeb0, &(0x7f0000000000))
ioctl$KVM_SIGNAL_MSI(r6, 0x4020aea5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x128, &(0x7f0000000340)=0x8000000000000000})
close(0x4)
close(0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x4, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_arm64={0x0, 0x1, 0x5, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f00000001c0)={0x8, <r4=>0xffffffffffffffff})
syz_kvm_setup_syzos_vm$arm64(r3, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0x80111500, 0x20000000)
write$eventfd(r2, &(0x7f0000000000), 0xfffffdef)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000000)={0x8, <r5=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r5, 0x541b, 0x0)

      
      close(0x3)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(0xffffffffffffffff, 0x4018aee1, &(0x7f00000002c0)=@attr_pmu_init)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_add_vcpu$arm64(r5, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@its_setup={0x82, 0x28, {0x1, 0x2001, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r4, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000100)={0x8, <r7=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r7, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, 0x0})
ioctl$KVM_RUN(r6, 0xae80, 0x0)
r8 = openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
r10 = syz_kvm_setup_syzos_vm$arm64(r9, &(0x7f0000c00000/0x400000)=nil)
r11 = syz_kvm_add_vcpu$arm64(r10, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[], 0xe0}, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_REG_LIST(r11, 0xc008aeb0, &(0x7f0000000000))
ioctl$KVM_SIGNAL_MSI(r4, 0x4020aea5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x128, &(0x7f0000000340)=0x8000000000000000})
close(0x4)
close(0x5)

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000f82000/0x3000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f00000000c0)={0x1fd, 0x0, 0xdddda000, 0x2000, &(0x7f0000ffc000/0x2000)=nil, 0x6})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000100)="e20d12caf4bcbfb01044d0101019522def6e7908b76e28fdd765d0f8eaa16f41e9b2a4df6d542391c0b464bd97afcf854e2d2067ac3ac884975660f5ded4cca6c85e2d5a3259dd65", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r4, 0x0)
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000040)={0xfffffffffffff001, 0x2000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x88141, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100024, &(0x7f00000000c0)=0xf55})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0x8000000, 0x104000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0xffff1000, 0x8000, 0x1})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000100)={0x80a0000})

      
      munmap(&(0x7f0000ec1000/0x3000)=nil, 0x3000)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0xf, 0x9032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x0, 0x6})
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, &(0x7f0000000040)={0x0, 0x101})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe6)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
ioctl$KVM_GET_API_VERSION(r0, 0xae03, 0x42)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ffa000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e87000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_ccsidr={0x606000000011000d, 0x0})

      
      openat$kvm(0x0, &(0x7f0000000280), 0x505001, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, &(0x7f0000000240)=ANY=[])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x3, 0xa0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000200)=@attr_other={0x0, 0x3, 0x612, 0x0})

      
      ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x3)
r0 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r1 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x3ff})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
r4 = syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000c00000/0x400000)=nil)
r5 = syz_kvm_add_vcpu$arm64(r4, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="14000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc1300000030d11b"], 0x60}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r5, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r5, 0xae80, 0x0)
r6 = syz_kvm_add_vcpu$arm64(r0, &(0x7f0000000180)={0x0, &(0x7f0000000380)=ANY=[@ANYBLOB="14000000000000002000000000000000f1c4130000003060008000000000000014000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc130000003060c7"], 0x140}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r6, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r6, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_DEVICE_ATTR_vm(r1, 0x4018aee2, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r3 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_add_vcpu$arm64(r5, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="14000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc1300000030d11b"], 0x60}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r6, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r6, 0xae80, 0x0)
r7 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000180)={0x0, &(0x7f0000000380)=ANY=[@ANYBLOB="14000000000000002000000000000000f1c4130000003060008000000000000014000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc130000003060c7"], 0x140}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r7, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r7, 0xae80, 0x0)
r8 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r9 = syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=[@msr={0x14, 0x20, {0x6030000000138012, 0x8000}}], 0x20}, 0x0, 0x0)
ioctl$KVM_RUN(r9, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f00000002c0)=@arm64={0x0, 0x5, 0x1, '\x00', 0x100000000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r5, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df19, &(0x7f0000000100)=0x1})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000005c0), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeab, &(0x7f0000001280)={0x6, 0x1000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xe1, 0x0, 0x2000})
ioctl$KVM_CLEAR_DIRTY_LOG(r1, 0xc018aec0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000080)={0x6, 0xffffffffffffffff, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000080)=[{0x600000000000000, 0x0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000080)=@arm64_sys={0x603000000013c807, &(0x7f0000000280)=0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
write$eventfd(r1, &(0x7f0000000000), 0xfffffdef)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x408)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000001, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x1, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x0, 0x10000000000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000000)={0x8, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x5760, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xc0189436, 0x172)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000000)=@attr_pvtime_ipa={0x0, 0x2, 0x0, 0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000001100000000000000aa00000000000000280000000000000009", @ANYRESOCT=r1], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0)=0xc, 0xfffffdd9)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, 0x0, 0x80180, 0x0)
ioctl$KVM_CHECK_EXTENSION(r2, 0x5450, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r3=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x3, 0x4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(0xffffffffffffffff, 0xae03, 0x83)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0x8, 0xbc, 0x2}})
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000080)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000000)={0x8, 0x401, 0x2}})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd7, 0x80000001})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
r5 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x1800002, 0x11, r5, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000380)="f30138dd033be3ac4ac4a29ea6ab08004b584bd92e2e0000000000000f0000000000010001000000000000000300000000000000040a00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x40305828, &(0x7f0000000040)=@attr_arm64={0x0, 0x0, 0x100000000000000, 0x0})

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000200), 0x22c00, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r3=>0xffffffffffffffff})
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x1, 0x2, &(0x7f0000000000)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x3, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000000)=ANY=[], 0x40}, 0x0, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x2082, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, &(0x7f0000000300)={0xb6, 0x0, 0x7c65dec2})
r6 = syz_kvm_setup_syzos_vm$arm64(r5, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r6, &(0x7f0000000080)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="0a0000000000000018000000000000007f2003d5c0"], 0x18}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000100)={0x0, 0x7f})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000240)=@attr_arm64={0x0, 0x5, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fp={0x6040000000100086, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f000000e000/0x3000)=nil, r1, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000c8e000/0x2000)=nil, r1, 0x1000006, 0x1010, r2, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x2901, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x65a480, 0x0)
ioctl$KVM_CHECK_EXTENSION(r7, 0xae03, 0xef)
r8 = syz_kvm_setup_syzos_vm$arm64(r6, &(0x7f0000c00000/0x400000)=nil)
r9 = syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000080)={0x0, &(0x7f0000000380)=ANY=[@ANYBLOB="820000"], 0x28}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r6, 0xc00caee0, &(0x7f00000000c0)={0x8, <r10=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r10, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
syz_kvm_vgic_v3_setup(r6, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r6, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_RUN(r9, 0xae80, 0x0)
ioctl$KVM_SIGNAL_MSI(r6, 0x4020aea5, &(0x7f0000000200)={0x8090040, 0x0, 0x0, 0x1, 0x5})
ioctl$KVM_SET_DEVICE_ATTR_vm(r4, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0x3, 0x1000, 0x2}})
r11 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r4, r11, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="1e0000000000000040000000000000005200008400000000fc54e74f000000007802000000000000f8ffffffffffffff00000000000000000a00000000000000"], 0x40}], 0x1, 0x0, 0x0, 0x0)
r12 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r13 = ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)
r14 = syz_kvm_setup_syzos_vm$arm64(r13, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_setup_syzos_vm$arm64(r13, &(0x7f0000c00000/0x400000)=nil)
r15 = syz_kvm_add_vcpu$arm64(r14, &(0x7f0000000080)={0x0, &(0x7f0000000180)=ANY=[@ANYBLOB='n\x00\x00\x00\x00\x00\x00\x000\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\b\x00\x00\x00\x00\x00i'], 0x30}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r13, 0xfffffbffffffffff, 0x240)
ioctl$KVM_RUN(r15, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c801, &(0x7f00000000c0)=0x4e0045a2})

      
      munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x2, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x5, 0x3, 0xffff1000, 0x1000, &(0x7f0000000000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000380)={0x10200, 0x0, 0xdddd1000, 0x1000, &(0x7f0000ffe000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000180)=ANY=[@ANYBLOB='n\x00\x00\x00\x00\x00\x00\x000\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\b\x00\x00\x00\x00\x00i'], 0x30}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x60)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454ca, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x100000c, 0x16831, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)

      
      munmap(&(0x7f0000ce0000/0x3000)=nil, 0x3000)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_sys={0x603000000013e088, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee8000, 0x0, r2, 0x2})
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r3})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0x8, 0x0, 0x0, r3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x3, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x2, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x3, 0x0, 0x1, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x6)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000100)={0x5})
r3 = syz_kvm_vgic_v3_setup(r1, 0x2, 0x40)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x6, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ff8000/0x8000)=nil, r1, 0x1000000, 0xe637a22295c143f8, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000100), 0x183a42, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000180)=@arm64_ccsidr={0x6020000000110007, 0xfffffffffffffffe})

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x7, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB="1e000000000000004000000000000000040000c400000000", @ANYBLOB="26d83423"], 0x40}, &(0x7f0000000240)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r2, 0x40fff, 0x0, r2})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000140)={r2, 0xc8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x1000)=nil, 0x930, 0x100000f, 0x10, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df1a, &(0x7f0000000100)=0xfffffffffffffffe})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000200)={0x5, 0x2, 0x0, 0x1000, &(0x7f0000d00000/0x1000)=nil})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x0, 0x23ac5f9b426e84b2, 0xffffffffffffffff, 0x0)

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000daf000/0x3000)=nil, 0x930, 0x3000007, 0x8a031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x200000e, 0x30d2a4fbfbea96b8, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0xa)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x15})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000240)=@arm64_sve_vls={0x606000000015ffff, &(0x7f0000000280)=0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, &(0x7f0000000200)={0xfffff76a, 0xe})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x80111500, 0xfffffffffffff000)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
munmap(&(0x7f00007df000/0x1000)=nil, 0x1000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f00000000c0)={0x0, &(0x7f0000000240)=ANY=[], 0x50}, 0x0, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000080)=ANY=[@ANYBLOB="02000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001"])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_GET_API_VERSION(r0, 0xae00, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x2, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x5452, 0x2000fdfd)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x46)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x40086602, 0x8000000400000004)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x7a)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f00000000c0)={0xdf, 0x0, 0x10000})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000bfd000/0x400000)=nil)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000001000/0x400000)=nil, &(0x7f0000000000)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000140)=0x1, 0x8)
write$eventfd(r4, &(0x7f0000000100), 0x8)
write$eventfd(r4, &(0x7f0000000800)=0x3, 0x8)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000200)=@arm64_core={0x603000000010002c, &(0x7f0000000140)=0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f00004f0000/0x2000)=nil, 0x930, 0x0, 0x11, r2, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r3, 0xae04)
mmap$KVM_VCPU(&(0x7f0000dee000/0x3000)=nil, r4, 0x100000e, 0x8a031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, r4, 0x2000002, 0x4f832, 0xffffffffffffffff, 0x0)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52b6e7127a837f0c00003345de6a2d8dc85e000000d533f3f64e44c40400002885848ad900000800000000f3ffffffffffffff00", 0x0, 0x48)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="0100000001000000000000000800"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x2})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_core={0x603000000010001e, &(0x7f0000000180)=0x23})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x40, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000bff000/0x400000)=nil, &(0x7f0000000000)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000bfd000/0x400000)=nil)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fw={0x6030000000140002})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x1, 0x5000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
close(r1)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, &(0x7f0000000940)={0xb6, 0x0, 0x79b3e5a9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="1e000000000000004000000000000000040000c40000"], 0x40}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_GUEST_MEMFD(0xffffffffffffffff, 0xc040aed4, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r2, 0x4040ae79, &(0x7f0000000040)={0x3, 0xf000, 0x0, r3, 0x8})
ioctl$KVM_IOEVENTFD(r2, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x8, r3, 0x1})
ioctl$KVM_IOEVENTFD(r2, 0x4040ae79, &(0x7f0000000080)={0xffffffffffffffff, 0x0, 0x1, r3, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x20)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x29031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e0c000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ad4000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f00000001c0)=@attr_arm64={0x0, 0x8, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x1f9, 0x0, 0xeeee8000, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xe1, 0x0, 0x2000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x60000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
syz_kvm_setup_cpu$arm64(0xffffffffffffffff, 0xffffffffffffffff, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000080)=ANY=[@ANYBLOB="1e000000000000202300000068106a0a0e345817"], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f0000000100), 0x183a42, 0x0)
close(0x5)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, &(0x7f0000000040)={0x0, 0x9})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x0, 0x6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000380)="f30149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a3ff7fbc51869be2e2e0000000000000f000000000000000001000000000000000000000000000e00", 0x0, 0x34)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
close(r1)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r4, 0xae04)
mmap$KVM_VCPU(&(0x7f0000521000/0x2000)=nil, r5, 0x2, 0x810, r1, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = eventfd2(0x0, 0x0)
close(r1)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000004000/0x4000)=nil, r2, 0x680000a, 0x11, r1, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x408)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000001, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000700)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x0, 0x0})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000340)=@attr_other={0x0, 0x1, 0x648c, &(0x7f0000000000)=0x80000000})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x10, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x7, <r3=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x801054db, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000038000/0x1000)=nil, 0x930, 0x1, 0x30, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x109000, 0x0)
r7 = ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
r8 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r9 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r8, 0xae04)
r10 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r9, 0x100000a, 0x12, r10, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, r9, 0x1, 0x11, r10, 0x0)
r11 = ioctl$KVM_CREATE_VCPU(r7, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r5, 0x0, 0x12, r11, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r5, 0x0, 0x2012, r11, 0x0)
r12 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ff9000/0x4000)=nil, 0x930, 0x280000f, 0x11, r12, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r12, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = eventfd2(0x0, 0x0)
close(r1)
openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000004000/0x4000)=nil, r2, 0x3000004, 0x11, r1, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0xc0480, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f00000000c0)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="0700000000000000280000000000000000000000000001"], 0x28}, 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfff02000000ffffff00000d00e6ffea000000002000", 0x0, 0xffffffffffffff98)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x39, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x5, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@irq_setup={0x46, 0x18, {0x1, 0x20}}], 0x18}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000100)={0x0, &(0x7f0000000140)=[@irq_setup={0x46, 0x18, {0x1, 0x20}}], 0x18}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x100)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r5, 0x3, 0x11, r3, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r7 = mmap$KVM_VCPU(&(0x7f000000a000/0x1000)=nil, r5, 0x3, 0x11, r4, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
r8 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r9 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e85000/0x2000)=nil, 0x2000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
syz_kvm_setup_syzos_vm$arm64(r10, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r10, 0x4020ae46, &(0x7f0000000000)={0x1fe, 0x0, 0x8080000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
r11 = syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000080)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="aa00000000000000280000000000000003"], 0x28}, 0x0, 0x0)
ioctl$KVM_RUN(r11, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r7, 0xffffffffffffffff)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000180)={0x1010020, 0x1})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r6, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c801, &(0x7f00000000c0)=0x80000004a0045a6})

      
      munmap(&(0x7f0000ffa000/0x3000)=nil, 0x3000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x0, 0x4003831, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, r1, 0x5000003, 0x80031, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x139040, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000002000000000002000000000000000000aa00000000000000280000000000000009"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f0000000140)=@attr_pvtime_ipa={0x0, 0x2, 0x0, 0xffffffffffffffff})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4020ae46, &(0x7f0000000180)=ANY=[@ANYBLOB="010000000100000000000001000000000010000002"])
ioctl$KVM_CLEAR_DIRTY_LOG(r1, 0xc018aec0, &(0x7f0000000100)={0x1, 0x1ffc01, 0x400, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x8600, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(r1, 0x4068aea3, &(0x7f0000001c00)={0xa8, 0x0, 0x3})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CAP_HALT_POLL(0xffffffffffffffff, 0x4068aea3, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100014, &(0x7f0000000040)=0x5})

      
      munmap(&(0x7f00000be000/0x1000)=nil, 0xffffffffdff41fff)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x7, 0x4f832, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x801c581f, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vm(r1, 0x4018aee3, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0x8, 0x0, 0x0, r2, 0x4})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x102, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000ac0)=ANY=[@ANYBLOB="01000000000020"])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000100), 0x76b200, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
r5 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)
ioctl$KVM_CREATE_VM(r5, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x3ff})
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r4 = syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="14000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc1300000030d11b"], 0x60}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
r5 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0xfffffffffffffffa)
r6 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x40000, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
ioctl$KVM_ARM_VCPU_FINALIZE(r5, 0x4004aec2, &(0x7f0000000040)=0x4)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r8, 0xae03, 0xaa)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r4, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
syz_kvm_setup_cpu$arm64(r8, r4, &(0x7f0000ba4000/0x400000)=nil, &(0x7f0000000080)=[{0x0, &(0x7f0000000380)=ANY=[@ANYBLOB="1e000000000000004000000000000000010000860000000007000000000000002080000000000000e85700000000000002000000000000007c08000000000000460000000000000018000000000000000100000035000000220100000000000040000000000000000600008400000000050000000000000006000000000000000200000000000000830c00000000000003000000000000006e0000000000000030000000000000000000000000000000d80600000000000004000000000000006d4237ff2f501c34e600000000001b001800000000000000f8ffffffffffffff82000000000000002800000000000000030000000000000004000000000000004603000000000000460000000000000018000000000000000400000098010000220100000000000040000000000000000040000000000000feffffffffffffff080000000000000009000000000000005e0f0000000000000100000001000000e600000000000000180000000000000001000000000000000a000000000000009c00000000000000c0b384d200c0b8f2210180d2020080d2c30080d2040080d2020000d4008008d5008008d5007008d500f987d20060b0f2010180d2e20180d2630180d2e40180d2020000d4007008d50080209ba0b69fd20000b8f2a10080d2a20180d2c30080d2840080d2020000d400d0000f205883d20000b8f2610180d2220180d2230180d2c40080d2020000d4c0035fd62201000000000000400000000000000000000004000000000000000000000000cf00000000000000ff01000000000000080000000000000007000000000000000a000000000000009c00000000000000008008d5007008d5000008d5801686d200c0b0f2610080d2820180d2630180d2040080d2020000d4204f96d200e0b8f2210180d2620080d2230080d2440080d2020000d4007008d5004887d200c0b0f2a10080d2620180d2430080d2a40180d2020000d40100a0d4000b93d200e0b8f2a10080d2020180d2430180d2440080d2020000d4008008d5c0035fd60a000000000000005400000000000000007008d5000008d5007008d5007008d5c0648cd20020b8f2010180d2c20080d2030180d2840080d2020000d40040400d007008d50068203c0004000f003c202ec0035fd61e0000000000000040000000000000000f0000840000000001000000000000000300000000000000080000000000000001000000000000000900000000000000e60000000000000018000000000000000f000000000000001e0000000000000040000000000000
000000003200000000010000000000008002000000000000000100000000000000060000000000000000000100000000001400000000000000200000000000000000000000000000000c00000000000000aa0000000000000028000000000000000301000000000a00000004000000060000000300000000000a00000000000000b400000000000000c0ea87d20080b0f2610080d2220180d2030180d2c40080d2020000d40000219e00800008007008d5a03588d200a0b0f2c10180d2c20080d2630080d2240180d2020000d40060400da07691d20020b8f2410080d2a20180d2030180d2c40180d2020000d4000028d5402982d20040b8f2010080d2e20080d2830080d2040180d2020000d400d791d20020b8f2810180d2420080d2230180d2840080d2020000d4c0035fd614000000000000002000000000000000abc213000000306001000100000000001400000000000000200000000000000062e6130000003060000000000000000022010000000000004000000000000000ff7f0080000000000100000000000000020000000000000000010000000000000800000000000000ffffffff00000000e6000000000000001800000000000000ff070000000000001e0000000000000040000000000000005000008400000000a1160000000000000a0000000000000002000000000000000000000000000000ff000000000000000a00000000000000b400000000000000403e9dd20060b0f2010080d2e20080d2830080d2c40080d2020000d4000028d5000008d5802985d200e0b8f2610180d2820180d2430080d2e40080d2020000d4607885d200e0b8f2c10080d2a20080d2430080d2a40080d2020000d400809f0d804d93d200a0b8f2a10080d2020080d2c30180d2a40080d2020000d4000028d50000229ea04d9ed200c0b0f2410080d2420080d2c30180d2a40080d2020000d4c0035fd6000000000000000018000000000000001000000000000000e6000000000000001800000000000000010100000000000046000000000000001800000000000000000000009c0000006e0000000000000030000000000000000000080800000000040000000000000000080000000000000c00000000000000"], 0x6dc}], 0x1, 0x0, &(0x7f0000000100)=[@featur1={0x1, 0x80}], 0x1)
r9 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x802, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r10, 0x4010ae67, &(0x7f0000000140)={0x8000000, 0x4000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r10, 0x4010ae67, &(0x7f0000000080)={0xeeef0000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r10, 0x4010ae68, &(0x7f0000000240)={0x22224000, 0x118000, 0xffffffff})
r11 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r11, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000300)=@arm64={0xc, 0x0, 0x5, '\x00', 0x8000000000000000})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
eventfd2(0x5, 0x80801)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r7 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
r8 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r7, 0x100000a, 0x12, r8, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, r7, 0x1, 0x11, r8, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r3, 0x0, 0x2012, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000ead000/0x3000)=nil, r3, 0x2800007, 0x11, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x40000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x66)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000600)=@arm64_core={0x603000000010002a, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0x83)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000040)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000003, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1b17f2, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xae)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x3, 0xeeef0000, 0x2000, &(0x7f0000239000/0x2000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r7 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
r8 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r7, 0x100000a, 0x12, r8, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, r7, 0x1, 0x11, r8, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, r3, 0x100000f, 0x12, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r3, 0x0, 0x2012, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000ead000/0x3000)=nil, r3, 0x2800007, 0x11, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_ONE_REG(r1, 0x4010ae42, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_fw={0x6030000000140001, &(0x7f00000000c0)=0x6})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4008ae6a, &(0x7f0000000100)={0x5, 0x0, [{0x9, 0x1, 0x0, 0x0, @adapter={0x1, 0x3, 0x4, 0xd7, 0xf055}}, {0xc, 0x1, 0x0, 0x0, @adapter={0x1, 0x8c, 0x1170, 0x4, 0x8}}, {0x1f, 0x6, 0x0, 0x0, @sint={0x7, 0x3}}, {0x7, 0x2, 0x0, 0x0, @sint={0xfffffffc, 0x8001}}, {0x2, 0x4, 0x0, 0x0, @sint={0x2, 0xdb6b}}]})
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0)=0x10000000000001, 0xe80)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x1f01)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x69)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x58)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x8, 0x5c1fd1b6565d2f2, 0xffffffffffffffff, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r3, 0x40a0ae49, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_GET_REG_LIST(r3, 0xc008aeb0, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
close(r1)
r4 = eventfd2(0x0, 0x0)
close(r1)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffe09)
r5 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x13, r1, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r5, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x2400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000280)={0x1fe, 0x1, 0x6000, 0x1000, &(0x7f0000000000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x2400, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r3, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="76b92cfb97422a99b188adac74647aa1221e4d8e6da62d5f533e7f6120be5a845d77658c900fa608d72c085a1f4e5203df5e7728260b7ab522076295a9cbeeae01832398e92fc7bc", 0x0, 0x48)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r4, 0xae80, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r5 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r5, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r5, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x6, 0x8032, 0xffffffffffffffff, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000daf000/0x3000)=nil, 0x930, 0x3000007, 0x8a031, 0xffffffffffffffff, 0x0)
r0 = mmap$KVM_VCPU(&(0x7f0000f82000/0x3000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
r3 = mmap$KVM_VCPU(&(0x7f0000f82000/0x1000)=nil, r2, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r4 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = syz_kvm_setup_syzos_vm$arm64(r5, &(0x7f0000c00000/0x400000)=nil)
r7 = syz_kvm_add_vcpu$arm64(r6, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=[@msr={0x14, 0x20, {0x603000000013dce9, 0x8000}}], 0x20}, 0x0, 0x0)
ioctl$KVM_RUN(r7, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_core={0x603000000010001c, &(0x7f00000001c0)=0x3})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee0000, 0x2, r2, 0x8})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0xffffffffffffffff, 0x0, 0x1, r2, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_RESET_DIRTY_RINGS(r1, 0xaec7)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f000000e000/0x3000)=nil, r1, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
munmap(&(0x7f0000ce0000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)

      
      mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
close(0x4)
mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xaece, 0x0)

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
munmap(&(0x7f0000c07000/0x1000)=nil, 0x1000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x100000f, 0x9032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000005c0), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x5421, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x109000, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r5, 0xae04)
r7 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r2, 0x0, 0x12, r7, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r2, 0x0, 0x2012, r7, 0x0)
mmap$KVM_VCPU(&(0x7f0000f4f000/0x2000)=nil, r6, 0x0, 0x11, r7, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x3, 0xe0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x9, 0x9, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0x3, 0x1000, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[], 0x40}], 0x1, 0x0, 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(r3, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="32000000000000004000000000000000530000c4"], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000180)=@arm64_fw={0x6030000000140001, 0xfffffffffffffffe})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x5, 0x3, &(0x7f0000000200)=0xf})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="547a80816119d6f740eba70939b4dd3c67cc8ef30267b6e351ec92609ea1772af89374b2c24ae764125ca82e671b267d8980f7f7061632c7b88459ab6c0154d4086903dedbfdd6fb", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a13f2, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000140)={0x5, 0x0, 0x5000, 0x2000, &(0x7f0000002000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x4, 0x3, 0xdddd1000, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x10000, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f00000001c0)={0x1, 0x1, 0x4, 0x1000, &(0x7f000054a000/0x1000)=nil, 0x200})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac3bc4a22332fdaa8de0518df242008031d1dfd92f0000000001fff9ffdc9610fbff77521ce30d8f00", 0x0, 0xfcf7)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, &(0x7f0000000680)=ANY=[@ANYBLOB="4bda"])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x800, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x5)
r3 = openat$kvm(0x0, 0x0, 0x2002, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
r4 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4008ae6a, &(0x7f0000000100)=ANY=[@ANYBLOB="050000000000000009000000010000000000000000000000010000000000000003000000000000000400000000000000d700000055f000001400000001000000000000000000000001000000000000008c00000000000000701100000000000004000000080000001f000000060000000000000000000000070000000300000000000000000000000000000000000000000000000000000007000000020000000000000000000000fcffffff01800000000000000000000000000000000000000000000000000000020000000400"])
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000140)={0x4, <r5=>0xffffffffffffffff, 0x1})
write$eventfd(r5, &(0x7f00000001c0)=0xfffffffffffffffd, 0x4e)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_core={0x6030000000100006})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0x1, 0x5000, 0x8, r2, 0x2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000000c0)={0x1000, 0x0, 0x1, r2, 0x5})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x8a031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r0, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac3bc4a22332fdaa8de0518df242008031d1dfd92f0000000001fff9ffdc9610fbff77521ce30d8f00", 0x0, 0xfcf7)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f000000e000/0x3000)=nil, r1, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r3, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r4, 0x100000c, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x7f)

      
      r0 = eventfd2(0x1, 0x1)
r1 = openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r2, 0x1, 0x100)
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f00000000c0)={r0, 0x4, 0x0, r0})
ioctl$KVM_SET_GSI_ROUTING(r2, 0x4008ae6a, &(0x7f0000000240)=ANY=[@ANYBLOB="01000000000000000300000002"])
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f0000000040)={r0, 0x3, 0x2, r0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x603000000010004c, &(0x7f0000000280)=0x7})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_sve_vls={0x606000000015ffff, &(0x7f00000000c0)=0xd1e})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000200)={0x75c, 0x300, 0x0, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000000)=[@mrs={0xbe, 0x18, {0x603000000013808c}}], 0x18}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, 0x0, 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_core={0x603000000010003a, &(0x7f0000000100)=0x1})

      
      munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x0, 0x7d7b465c1d30afba, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@msr={0x14, 0x20, {0x603000000013c522, 0x7}}], 0x20}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0xca680, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f0000000180)={0x8})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
r3 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r5, 0x0)
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000140)=@arm64_fp_extra={0x60200000001000d4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="1400000000000000200000000000000002"], 0x20}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f0000000140)=@attr_irq_timer={0x0, 0x1, 0x1, &(0x7f00000000c0)=0x10f})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x1}}, @its_send_cmd={0xaa, 0x28, {0xb, 0x0, 0x0, 0x4, 0x8000e}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x140, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x9})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_sys={0x603000000013c028, &(0x7f0000000140)=0x6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_IRQCHIP(r1, 0xae60)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="140000000000000020000000000000005dc6"], 0x20}], 0x1, 0x0, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x40)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      write$eventfd(0xffffffffffffffff, 0x0, 0x0)
r0 = eventfd2(0x1, 0x1)
r1 = openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r2, 0x1, 0x100)
syz_kvm_setup_cpu$arm64(0xffffffffffffffff, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="32001400000000004000000000000000200000c5000000000300000000000000060000000000000001000100000000000000002001000000000000000000000046000000000000001800000000000000040000000c030000e60000000000080000000000000000000200000000000000aa000000000000002800e6df000000000f01040000000600000007000000001000000400000000000a000000000000005400000000000000000028d50000c028008c200e0020200e007008d5000800b800f8302e000000e0dfc488d200a0b8f2810080d2828380d2030080d2c40080d2020000d4003c004ec0035fd632000000000000004000000000000000200000000000000005000000000000004000000000000000030000000000000001000000000000001000"], 0x14c}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f00000000c0)={r0, 0x4, 0x0, r0})
ioctl$KVM_SET_GSI_ROUTING(r2, 0x4008ae6a, 0x0)
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f0000000040)={r0, 0x3, 0x2, r0})

      
      munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x0, 0x80031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0xf, 0xffffffffffffffff, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x88000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_GUEST_DEBUG(r2, 0x4208ae9b, &(0x7f0000000040)={0x2, 0x0, [0x3, 0x6, 0x1849ece3, 0x2, 0x4c, 0x2, 0x2, 0x7]})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x3, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000007, 0x23ac5f9b426eccb2, 0xffffffffffffffff, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_ONE_REG(r2, 0x4000ae8d, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000200)=@arm64_core={0x603000000010002e, &(0x7f0000000140)=0x2})

      
      r0 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r0, 0x4020aeae, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x400000, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r5 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x40, 0x0)
ioctl$KVM_CHECK_EXTENSION(r5, 0x4030582b, 0x1)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xa8)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x400000f, 0x80031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000680)="38ce8347fc1e86008cfc72bb352c8659dcc9225b48cb5cb00c73b0b33018748e73f7f1f493e89c859e17625ad1b19ca88da9c227db3473a7fd4ce992bfc316bd22ccc646cd69c728", 0x0, 0x48)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_fp_extra={0x60200000001000d0, 0x0})

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(0xffffffffffffffff, 0xae03, 0xe1)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r3, 0x100000c, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_MP_STATE(r4, 0x4004ae99, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000000200000000000000aa0000000000000028000000000000000100000000000600000000000000020000"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r6 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = eventfd2(0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r3 = eventfd2(0xffffffff, 0x80001)
ioctl$KVM_IOEVENTFD(r2, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r1})
ioctl$KVM_IOEVENTFD(r2, 0x4040ae79, &(0x7f0000001a40)={0xfffffffffffffffe, 0x300, 0x0, r3, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="820000000000000028000000000000000200000000002200040000000000000001"], 0x28}, 0x0, 0x0)
r4 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r5=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r5, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
munmap(&(0x7f0000800000/0x800000)=nil, 0x800000)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x4, 0x3, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x14})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_bitmap={0x6030000000160003, &(0x7f0000000180)=0x2000000000000037})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fd, 0x0, 0x0, 0x1000, &(0x7f000000d000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000180)="fb0149dd033be3073da85cac1648f1e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76d869d2855c7f3200", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x80086601, 0x2000fdfd)

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x7, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, 0xffffffffffffffff)

      
      syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x4, 0x2, 0xd000, 0x1000, &(0x7f0000001000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_arm64={0x0, 0x2})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100020, &(0x7f0000000180)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000200), 0x9e9483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_bitmap={0x6030000000160003, &(0x7f0000000140)=0x7})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f00008a0000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000e00)=ANY=[], 0x630}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x2, 0x0, &(0x7f0000000000)=0x80})
syz_kvm_vgic_v3_setup(r1, 0x4, 0x300)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f000000a000/0x400000)=nil)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x5, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
syz_kvm_setup_syzos_vm$arm64(r0, &(0x7f0000c00000/0x400000)=nil)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_RESET_DIRTY_RINGS(r1, 0xaec7)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f00000000c0)={0x0, 0x9, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@hvc={0x32, 0x40, {0x86000000, [0x3, 0x8, 0x6, 0x0, 0xfffffffffffff509]}}], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000200), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000100)={0xf000, 0x117800})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000040)={0x10000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000080)={0xdddd1000, 0x0, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, r3, 0x300000e, 0x13, r2, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000100)="c2f09feefa7f61dfd678c0827fcde6607c27f8b6ba8f96142f3520a4eb0565eba4d8e6a48909911f398c5defc24101c05e06ebab5bebac6170497931f113876e11ab48e78261db36", 0x0, 0x48)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(0xffffffffffffffff, 0x4018aee1, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_other={0x0, 0x0, 0x8000000000000007, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bff000/0x400000)=nil)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xfffffffffffffffa})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x812})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000240)=@arm64_core={0x6030000000100048, &(0x7f0000000180)=0xb99b})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1d})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, 0x0)
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000600)=@arm64_core={0x603000000010004e, &(0x7f00000001c0)=0x9c90})

      
      mmap$KVM_VCPU(&(0x7f00006b5000/0x2000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, r1, 0x300000f, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x3, 0x380)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000000)={0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x909483, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_other={0x0, 0x8, 0x20000000100, &(0x7f0000000200)=0x403})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xc0189436, 0x100000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xac)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100018, &(0x7f0000000040)=0x8})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x83)

      
      close(0xffffffffffffffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000240)=@attr_arm64={0x0, 0x5, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x0, 0x0)
r3 = eventfd2(0x8001, 0x0)
write$eventfd(r3, &(0x7f0000000000)=0xfffffffffffffffb, 0x8)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000000c0)={r3, 0x1, 0x2, r2})
close(0x4)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r3, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x602002, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
close(r0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee2, 0x0)
ioctl$KVM_GET_MP_STATE(0xffffffffffffffff, 0x8004ae98, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000440)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000004c0)=@attr_arm64={0x0, 0x0, 0x2, &(0x7f0000000480)=0x8000000000000001})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f00000000c0)=[{0x0, 0x0}], 0x1, 0x0, &(0x7f0000000100)=[@featur2={0x1, 0x4}], 0x1)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="1e000000000000004000000000000000030000c4"], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000800000/0x800000)=nil, 0x800000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43033, 0xffffffffffffffff, 0x0)

      
      r0 = eventfd2(0x0, 0x0)
close(r0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r2, 0x1, 0x40)
ioctl$KVM_IRQFD(r2, 0x4020ae76, 0x0)
r4 = eventfd2(0x0, 0x801)
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f00000000c0)={r4, 0x1, 0x2, r0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x143041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CLEAR_DIRTY_LOG(r1, 0xc018aec0, &(0x7f0000000040)={0xa02d11a4906d870c, 0x140, 0x300, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0xe2a00, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x5452, 0x2000fdfd)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f00000000c0)=@attr_other={0x0, 0x0, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000040)={0x5})
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x603000000010003c, &(0x7f0000000100)})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r1, 0x0, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x2, 0x23ac5f9b426ec4b1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="82000000"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x0, 0x1, 0x1000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4020ae46, &(0x7f0000000180)=ANY=[@ANYBLOB="010000000100000000400000000000000010000002"])
munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000300)=ANY=[], 0x28}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000280)=@arm64_bitmap={0x6030000000160001, &(0x7f0000000240)=0x618})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000000c0)={0x8, 0x0, 0x8, 0xffffffffffffffff, 0x20})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r6 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x2, 0x4003831, 0xffffffffffffffff, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x3, 0xa0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4020ae46, &(0x7f0000000200)=ANY=[@ANYBLOB="0100000001000000000000010000000000103f5c4bd284"])

      
      r0 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, 0x0, 0x2000, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000f82000/0x3000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r4, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, r5, 0x5000003, 0x80031, 0xffffffffffffffff, 0x0)
r6 = mmap$KVM_VCPU(&(0x7f0000f82000/0x1000)=nil, r5, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
ioctl$KVM_SET_DEVICE_ATTR_vm(r2, 0x4018aee1, &(0x7f0000000200)=@attr_other={0x0, 0x0, 0x8000000000000007, 0x0})
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)

      
      r0 = openat$kvm(0xffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xcb)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x80, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000200), 0x22c00, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000002c0)={0x2, 0x0, 0x0, 0x1000, &(0x7f0000ff9000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000340)={0x2, 0x1, 0x10000000000, 0x1000, &(0x7f0000ff9000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x1000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000180)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_sve={0x6080000000150011, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="140000000000000020000000000000005dc613"], 0x20}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
close(r1)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1c1cf2, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x11, r1, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454da, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, &(0x7f0000000080)={0xb6, 0x0, 0xed6})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000040)=[@smc={0x1e, 0x40, {0x84000001, [0x99a, 0xb, 0xaca, 0x101, 0x1]}}], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SET_REGS(r3, 0x4360ae82, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xab)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x0, 0x0, 0x3d70000000, &(0x7f0000ffe000/0x2000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x40a0ae49, &(0x7f00000002c0))

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x802, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0x8000000, 0x4000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000080)={0xeeef0000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f00000000c0)={0x6000, 0x7000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_core={0x603000000010000e, &(0x7f0000000000)=0xffffffffffffffff})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1ff, 0x2, 0x10000, 0x2000, &(0x7f0000440000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x3, 0xd000, 0x1000, &(0x7f0000009000/0x1000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac3bc4a22332fdaa8de0518df242008031d1dfd92f0000000001fff9ffdc9610fbff77521ce30d8f00", 0x0, 0xfcf7)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE(r1, 0x4068aea3, &(0x7f0000000140)={0xb1, 0x0, 0x5})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454e1, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r3, 0x4020ae46, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r3, 0x4020ae46, &(0x7f0000000100)={0x0, 0x3, 0x10000000000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000100)={0x2010040, 0x1000c53})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, 0x0, 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_core={0x6030000000100038, &(0x7f0000000100)=0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x19)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="547a80816119d6f740eba70939b4dd3c67cc8ef30267b6e351ec92609ea1772af89374b2c24ae764125ca82e671b267d8980f7f7061632c7b88459ab6c0154d4086903dedbfdd6fb", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000200), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r2, 0x4010ae67, 0x0)
r3 = openat$kvm(0x0, 0x0, 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r4, 0x4018aee1, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r2, 0x4010ae67, &(0x7f0000000040)={0x10000})
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = ioctl$KVM_CREATE_VCPU(r6, 0xae41, 0x0)
r8 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r7, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r8, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r7, 0x0)
r9 = eventfd2(0x0, 0x0)
close(r9)
r10 = eventfd2(0x0, 0x0)
close(r10)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r10, &(0x7f0000000180)=0x5, 0xfffffde3)
write$eventfd(r9, 0x0, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20203, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000200), 0x22c00, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x1, 0x2, &(0x7f0000000000)=0x9})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454d8, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x3000000, 0x4f831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f00006b5000/0x2000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x4020940d, 0x20)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000200)=@attr_other={0x0, 0x8, 0x80, &(0x7f0000000040)=0xc})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000080)=@attr_other={0x0, 0x9, 0x7, &(0x7f0000000000)=0x8})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x109901, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000140)={0x0, 0x0}, 0x0, 0x13)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x401, 0x3c0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0xa001, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_GUEST_DEBUG(r3, 0x4208ae9b, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000340)=ANY=[@ANYBLOB="076fde5a55"], 0x40}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000100)=@attr_other={0x0, 0x4, 0x3, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x29)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_add_vcpu$arm64(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x8})
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000300)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f00000002c0)=0x64})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fw={0x6030000004140003, &(0x7f00000001c0)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x402, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="320000000000000040000000000000005300008440df97e8ea"], 0x40}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000001c0)={0x5, 0x2})
ioctl$KVM_SET_GUEST_DEBUG(r2, 0x4208ae9b, &(0x7f0000000000)={0x30001, 0x0, [0x11, 0x2, 0x7c, 0xb, 0x2, 0xffff, 0xfffffffffffffff9, 0x3c7]})
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeb5, &(0x7f00000000c0)={0x6})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f00000001c0)="fb4149dd033be3ac2cc4a22332a77b23b08986814d7bb14c94a6ab8031d1dfd92f00000000010000005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa7fc869d22627e7", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000009, 0x11, r2, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
close(r4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x3000000, 0x4f831, 0xffffffffffffffff, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_REGS(0xffffffffffffffff, 0x4360ae82, &(0x7f0000000280)={[0x734, 0x200, 0x0, 0x7, 0x5, 0x8, 0xffff, 0x7, 0x5, 0x7f, 0xd, 0x6, 0xfffffffffffffffb, 0x800, 0xf0fa5ad], 0x5000, 0x200})
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000000100000000000000aa00000000000000280000000000000008"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x140)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x181fc, 0x0, 0x100000, 0x1000, &(0x7f0000858000/0x1000)=nil})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0xc00000)=nil, 0x930, 0x0, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_fw={0x6030000000140003, &(0x7f00000000c0)=0x4})

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r2, 0x100000c, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x8001, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000000c0)={r2, 0x1, 0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4020940d, &(0x7f0000000080)={0xec70, 0x0, 0x1, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0x2000, 0x1a000})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0xffff1000, 0x8000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000100)={0x80a0000, 0x2000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
munmap(&(0x7f0000000000/0x2000)=nil, 0x2000)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, &(0x7f0000000000)={0x3, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0) (async)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000700)={0x7, 0x0}) (async)
ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
ioctl$KVM_IRQ_LINE(r2, 0x4008ae61, &(0x7f0000000240)={0x200002f}) (async)
r3 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r4 = syz_kvm_setup_syzos_vm$arm64(r3, &(0x7f0000c00000/0x400000)=nil)
r5 = syz_kvm_add_vcpu$arm64(r4, &(0x7f0000000080)={0x0, &(0x7f0000000000)=ANY=[@ANYBLOB="054c0f0000000060b47892001800000000003b67ed394d2d53cb"], 0x18}, 0x0, 0x0) (async, rerun: 64)
syz_kvm_vgic_v3_setup(r3, 0x1, 0x100)
ioctl$KVM_RUN(r5, 0xae80, 0x0) (async)
ioctl$KVM_IRQ_LINE(r3, 0x4008ae61, &(0x7f0000000100)={0x1000020, 0x1}) (async, rerun: 32)
ioctl$KVM_RUN(r5, 0xae80, 0x0) (rerun: 32)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfff02000000ffffff00000d00e6ffea000000002000", 0x0, 0xffffffffffffff98)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x0, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x4, 0x0, 0x5000, 0x1000, &(0x7f0000003000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000140)={0x1fd, 0x0, 0x2000, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x3, 0x8000000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x10000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x101900, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="a55afac482ae9086510a1cfeebb372c746b69b695f50f0fe4a42e0db94adb9afe18edc51d30da60113b8f98bcdfe68bbc48c525a1b3867d3b43108ff914877781493d36fc97b8d2f", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x408)

      
      r0 = eventfd2(0x0, 0x80000)
write$eventfd(r0, 0x0, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000080)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r3, 0x4018aee3, &(0x7f00000000c0)=@attr_other={0x0, 0x5f7, 0x7, 0x0})
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
close(r4)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r8 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r9, 0x4008ae6a, &(0x7f0000000180)={0x1, 0x0, [{0x3, 0x3, 0x0, 0x0, @sint={0x8, 0x80000001}}]})
r10 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
r11 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x80900, 0x0)
ioctl$KVM_CHECK_EXTENSION(r11, 0xae03, 0x9)
r12 = mmap$KVM_VCPU(&(0x7f0000d39000/0x3000)=nil, 0x0, 0x0, 0x10, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r12, 0x20, &(0x7f00000003c0)="1eb7eeff4689d86f9d654aedea282e95e5128d28b50504927210aea6a1e033ee3b50b254f86fd58858006aae157c3537ffc5a526ea75bc32ea829b043346d02a41ded9f41b39a228", 0x0, 0x48)
r13 = syz_kvm_setup_syzos_vm$arm64(r10, &(0x7f0000c00000/0x400000)=nil)
r14 = syz_kvm_setup_syzos_vm$arm64(r10, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r14, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r15 = syz_kvm_add_vcpu$arm64(r13, &(0x7f0000000080)={0x0, &(0x7f00000001c0)=[@memwrite={0x6e, 0x30, @vgic_gits={0x8080000, 0x200b0, 0x49ea, 0x3}}, @mrs={0xbe, 0x18, {0x603000000013c000}}, @hvc={0x32, 0x40, {0xc4000053, [0x8, 0x1, 0x942, 0x4, 0x6]}}, @irq_setup={0x46, 0x18, {0x4, 0x1a2}}, @mrs={0xbe, 0x18, {0x603000000013e108}}, @uexit={0x0, 0x18, 0x466d}, @its_setup={0x82, 0x28, {0x2, 0x1, 0x3c3}}, @code={0xa, 0xb4, {"805380d20060b8f2e10080d2820180d2c30180d2840080d2020000d4000008d5607695d200a0b0f2a10080d2220180d2a30180d2440180d2020000d4007008d50078207e008008d5c0e79ed20020b0f2610180d2820180d2c30080d2040180d2020000d400000039c00a82d200a0b8f2c10180d2620080d2c30180d2a40080d2020000d4400780d20040b0f2010180d2220180d2830180d2240080d2020000d4"}}, @smc={0x1e, 0x40, {0x84000012, [0x5, 0x9255, 0xff, 0x4, 0x9]}}], 0x1ec}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r10, 0x2, 0x3c0)
ioctl$KVM_RUN(r15, 0xae80, 0x0)
syz_kvm_setup_syzos_vm$arm64(r6, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r14, &(0x7f0000000080)={0x0, &(0x7f00000000c0)}, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000280)={0x0, &(0x7f0000000300)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x2}}, @memwrite={0x6e, 0x30, @vgic_gits={0x8080000, 0x90, 0x1ff, 0x1}}], 0x58}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x2, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f00008a0000/0x400000)=nil)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, 0x0)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000e00)=ANY=[], 0x630}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x2, 0x0, &(0x7f0000000000)=0x80})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xef000000, 0x1000, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@smc={0x1e, 0x40, {0xef000000, [0x0, 0x1, 0x2, 0x3, 0x4]}}, @mrs={0xbe, 0x18, {0x603000000013c807}}], 0x58}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f0000000140)={0x1, 0xffffffffffffffff, 0x1})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="820000000000000028000000000000000100000000000000040000000000000002000000000000008200000000000000280000000000000004"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100016, &(0x7f0000000100)=0x105})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000180)=@attr_set_pmu={0x0, 0x0, 0x3, &(0x7f0000000140)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454cc, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0xfffffffffffffffc)
r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xb2)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000bfd000/0x400000)=nil)
r4 = syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000b80)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB], 0x40}, &(0x7f0000000240)=[@featur1={0x1, 0x4}], 0x1)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013d801, &(0x7f00000000c0)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x3c0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r3, 0xae04)
r5 = mmap$KVM_VCPU(&(0x7f0000d3f000/0xd000)=nil, r4, 0x3000007, 0x13, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x5421, 0xfffffffefffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb4000/0x3000)=nil, 0x930, 0x200000c, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_ONE_REG(r1, 0x4010aeab, &(0x7f00000002c0)=@arm64_ccsidr={0x6020000000110011, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r6, 0x4020ae46, &(0x7f0000000040)={0x1, 0x1, 0x5000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r6, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r6, 0xc018aec0, &(0x7f0000000040)={0x1})
r7 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000bfd000/0x400000)=nil)
r8 = syz_kvm_add_vcpu$arm64(r7, &(0x7f0000000b80)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="1e000000000000004000000000000000040000c4000000009a"], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
syz_kvm_add_vcpu$arm64(r7, &(0x7f00000000c0)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_RUN(r8, 0xae80, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
r10 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r9, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r10, 0x20, &(0x7f0000000380)="f30149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a3ff7fbc51869be2e2e0000000000000f000000000000000001000000000000000000000000000e00", 0x0, 0x34)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r9, 0x0)
close(r2)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
openat$kvm(0x6, &(0x7f0000000040), 0x565e02, 0x0)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f00000001c0)={0xffffffffffffffff, 0x3, 0x2})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0xffffff7f, 0xff25)

      
      r0 = openat$kvm(0x0, &(0x7f00000002c0), 0x1ab801, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x60081, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r4 = syz_kvm_vgic_v3_setup(r2, 0x1, 0x100)
ioctl$KVM_CAP_DIRTY_LOG_RING(r2, 0x4068aea3, &(0x7f0000000100)={0xc0, 0x0, 0x3000})
ioctl$KVM_HAS_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_other={0x0, 0x2, 0xba, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x40802, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x3, 0xa0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000200)=@attr_other={0x0, 0x0, 0x5, &(0x7f00000001c0)=0x6d})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x43ff})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_sys={0x603000000013df43, &(0x7f0000000080)=0x3})

      
      ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4008ae6a, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000080)=@attr_pmu_filter={0x0, 0x0, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fw={0x6030000000140000, &(0x7f00000001c0)=0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xdf, 0x0, 0x800})

      
      r0 = eventfd2(0x8001, 0x0)
write$eventfd(r0, &(0x7f0000000000)=0xfffffffffffffffb, 0x8)
write$eventfd(r0, &(0x7f0000000000)=0x89ef, 0x8)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x1, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xe})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_extra={0x6030000000140000, &(0x7f00000001c0)=0x10000})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5, 0x11})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000080)=@arm64_bitmap={0x6030000000160000, &(0x7f0000000100)})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000500), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xa2)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000380)=ANY=[], 0x28}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8})
syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000200)={0x8000000, 0x0, 0x0, 0x1, 0x5})

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
eventfd2(0x45, 0x800)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_core={0x60300000001000d7, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x109000, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r7 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
r8 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r3, 0x0, 0x12, r8, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r3, 0x0, 0x2012, r8, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000f4f000/0x2000)=nil, r7, 0x0, 0x11, r8, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r9, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e0c000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e0b000/0x1000)=nil, 0x1000)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000200)=@arm64_core={0x6030000000100032, &(0x7f0000000140)=0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x5d)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0xec, 0x0, 0x8, r1, 0x6})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd8, 0x1})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ad4000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x7, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305839, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x60100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_core={0x603000000010001c, &(0x7f0000000000)=0x8000000000800004})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="375aa1bde04fceeb33743b07d73b3e9aac00", 0x0, 0x18)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="e20d12caf4bcbfb01044d0101019522def6e7908b76e28fdd765d0f8eaa16f41e9b2a4df6d542391c0b464bd97afcf854e2d2067ac3ac884975660f5ded4cca6c85e2d5a3259dd65", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20203, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_bitmap={0x6030000000160002, &(0x7f0000000080)=0x7fffffff})

      
      munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff2000/0xd000)=nil, 0xd000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43133, 0xffffffffffffffff, 0xfffffffff0000000)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000db0000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000fff000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8040aeb6, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sve_vls={0x606000000015ffff, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x2710, 0x0, 0x100080ae000, 0x2000, &(0x7f0000ffe000/0x2000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0xa)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x1f})
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c00a, &(0x7f0000000040)=0x3})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_ccsidr={0x602000000011000f, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x52)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0xfffffffffffffffe, 0x300, 0x0, r2, 0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x51)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x1, 0x800)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r2, 0x3, 0x0, r2})
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r3, 0x40fff, 0x0, r3})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000000c0)={r3, 0x7, 0x3, r2})

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000380), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000b40)=@attr_irq_timer={0x0, 0x1, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000001c0)=[@memwrite={0x6e, 0x30, @vgic_gits={0x8080000, 0x200b0, 0x49ea, 0x3}}, @mrs={0xbe, 0x18, {0x603000000013c000}}, @smc={0x1e, 0x40, {0x84000012, [0x5, 0x9255, 0xff, 0x4, 0x9]}}], 0x88}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x3c0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_arm64={0x0, 0x4, 0x4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000100)=@attr_other={0x0, 0x3cc27b60, 0x3, 0x0})

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x4b49, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000ffa000/0x3000)=nil, 0x930, 0x0, 0x7f09bd658b282731, 0xffffffffffffffff, 0x0)

      
      munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000040)="4df74d20cd04ee4ce2aa8a0797d68e953766cd7a4855880c9bf8c2b7cf738dc33732698d631778d116a24fd82e39c234c499eff943378c8ca92835aac201b216e92cae0faa84392b", 0x0, 0x48)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305829, &(0x7f0000000100)=@attr_other={0x0, 0xb, 0x9f01, &(0x7f0000000180)=0xfffffffffffffffc})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0xe5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x101001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0xe6)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000bfe000/0x400000)=nil, &(0x7f00000008c0)=[{0x0, &(0x7f0000000640)=ANY=[@ANYBLOB="820000000000000028000000000000000200000000000000030000000000000046"], 0x248}], 0x1, 0x0, 0x0, 0x0)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r4 = syz_kvm_add_vcpu$arm64(r3, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x120)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r5=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r5, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r4, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r2, 0x4018aee3, &(0x7f0000000040)=@attr_irq_timer={0x0, 0x1, 0x1, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, r1, 0x100000e, 0x8a031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f00000000c0)="e51b9ce9a032a1ca7079bce9b3cf3ba9c7fbc2e7ab457eacc044b677d9d49c274b8d12fb382e0520cadbc6763409ffdb41911831b85a42b40c1689a8bf14be81eda4bae2d8c28ef8", 0x0, 0x48)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
close(r1)
r4 = eventfd2(0x0, 0x0)
close(r1)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffe09)
r5 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x12, r1, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r5, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
munmap(&(0x7f0000001000/0x3000)=nil, 0x3000)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r4, 0x0)
r6 = eventfd2(0x0, 0x0)
close(r6)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x1f01)
write$eventfd(r6, &(0x7f0000000180)=0x5, 0xfffffde3)
r7 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r9 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r7, 0xae04)
mmap$KVM_VCPU(&(0x7f0000002000/0x3000)=nil, r9, 0xa, 0x11, r8, 0x0)
r10 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r8, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r10, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r8, 0x0)
r11 = eventfd2(0x0, 0x0)
close(r11)
r12 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x200, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000200)={0x7, <r5=>0xffffffffffffffff})
syz_kvm_setup_cpu$arm64(r3, r4, &(0x7f0000bfe000/0x400000)=nil, &(0x7f0000000100)=[{0x0, 0x0, 0x30c}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR(r5, 0x4018aee3, &(0x7f00000000c0)=@attr_arm64={0x0, 0x6, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000280)=@arm64_core={0x6030000000100032, &(0x7f00000001c0)=0x8})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r4, 0x3000008, 0x11, r3, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, 0x930, 0x0, 0x11, r3, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x4, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_arm64={0x0, 0x1, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_REGS(r2, 0x8360ae81, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="0100000001000000000000000806"])
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, 0x0, 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0)=0x3, 0x10)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xc0045878, 0x20000000)

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1, 0x1, 0x5000, 0x1000, &(0x7f0000fa2000/0x1000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x202, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r3 = syz_kvm_vgic_v3_setup(r1, 0x2, 0x200)
syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000bff000/0x400000)=nil)
syz_kvm_vgic_v3_setup(r2, 0x2, 0x280)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000580)=@attr_other={0x0, 0x4, 0xc, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4020ae46, &(0x7f0000000180)=ANY=[@ANYBLOB="010000000100000000000001000000000010000002"])
ioctl$KVM_CLEAR_DIRTY_LOG(r1, 0xc018aec0, &(0x7f0000000100)={0x1, 0x1ffc01, 0x400, &(0x7f00000003c0)=[0x7fff, 0x6, 0xfffffffffffffff8, 0x2, 0xffffffffffffffff, 0x80000000, 0x6f7, 0x81, 0x7, 0x6, 0x10000, 0x5, 0x9, 0x9, 0x8, 0x9, 0x0, 0xd07, 0x1000, 0x4, 0x0, 0x4, 0x2, 0x8001, 0xfffffffffffffff9, 0xffffffffffff0000, 0x9, 0x81, 0x6, 0x400, 0x0, 0x1, 0x5, 0x98, 0x3, 0xc, 0x8, 0x2f, 0x5, 0x7fffffff, 0x0, 0x9, 0x7, 0x0, 0x800, 0x6, 0xd, 0x3fe00, 0x1, 0xf4, 0xff, 0x9, 0x1, 0x8, 0x9, 0x7, 0x7fff, 0xfff, 0x95, 0x6, 0x7, 0x1, 0x4, 0x1ff, 0xffffffffffffffff, 0x2, 0xf, 0x6, 0x6, 0x4, 0x400, 0x42, 0xa8a6, 0x0, 0x8, 0x1, 0x155, 0x2, 0x4, 0x1, 0x1, 0x100000001, 0x1, 0x10000, 0x1, 0x10001, 0x7, 0x6, 0xfffffffffffffffa, 0xffffffff, 0xffffffffffffff00, 0x0, 0x67, 0x2, 0x7, 0x7f0d, 0x4, 0x5, 0xff4, 0x2f4, 0x1f8, 0xb, 0x8001, 0x6, 0xe, 0x3, 0x5, 0x5, 0x0, 0x3c4, 0x5, 0x0, 0x7, 0x0, 0x8, 0x1, 0x800, 0x3, 0xc98e, 0x2, 0x3ff, 0x8, 0x401, 0xfffffffffffffffd, 0xe39, 0x9, 0x5, 0x7]})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x80020009, 0x1})

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x3, 0xa0)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0) (async)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x0, 0x0}) (async)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000140)={0x4, <r3=>0xffffffffffffffff, 0x1})
r4 = ioctl$KVM_CREATE_VM(r3, 0x894c, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = ioctl$KVM_CREATE_VCPU(r6, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r7, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1}) (async)
ioctl$KVM_SET_ONE_REG(r7, 0x4010aeac, &(0x7f00000001c0)=@arm64_sys={0x6030000000138064, &(0x7f00000000c0)=0x8000})
ioctl$KVM_CREATE_VCPU(r4, 0xb702, 0x0) (async)
openat$kvm(0x0, &(0x7f0000000080), 0x141001, 0x0)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, &(0x7f0000000100))
r8 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r9 = syz_kvm_setup_syzos_vm$arm64(r8, &(0x7f0000c00000/0x400000)=nil)
r10 = syz_kvm_add_vcpu$arm64(r9, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@irq_setup={0x5, 0x18, {0x1, 0x20}}], 0x18}, 0x0, 0x0) (async)
r11 = syz_kvm_add_vcpu$arm64(r9, &(0x7f0000000100)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="050000"], 0x18}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r8, 0x2, 0x100) (async)
ioctl$KVM_RUN(r11, 0xae80, 0x0) (async)
ioctl$KVM_RUN(r10, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0xb703, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x62)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x808683, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, 0x0)
ioctl$KVM_CHECK_EXTENSION(r2, 0x541b, 0x1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r3 = eventfd2(0x0, 0x0)
r4 = eventfd2(0x0, 0x1)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r3, 0x1, 0x2, r4})
r5 = eventfd2(0x0, 0x801)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000000c0)={r5, 0x1, 0x2, r4})
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000080)=ANY=[])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x9)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x0)
ioctl$KVM_SET_SIGNAL_MASK(r3, 0x4004ae8b, &(0x7f0000000100)={0x6, "e5ccd16738ea"})
r4 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r4, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_sys={0x603000000013c03a, &(0x7f0000000140)=0x40000000026})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee0000, 0x2, r2, 0x8})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_extra={0x6030000000140000, &(0x7f00000001c0)=0x10002})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x8004b707, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
r3 = syz_kvm_vgic_v3_setup(r1, 0x2, 0x40)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x4, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454e2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_REG_LIST(0xffffffffffffffff, 0x4020aeae, &(0x7f0000000000)=ANY=[@ANYBLOB="05000000000000000000000000000082"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000140)=@arm64_core={0x6030000000100028, &(0x7f0000000100)=0xfffffffffffffbff})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0xc0189436, &(0x7f0000000080)={0x0, 0x0, 0x1, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000000100000000000000aa00000000000000280000000000000008"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x140)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000dc0)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000fc0)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0xa)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x1f})
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013e110, &(0x7f0000000040)=0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000040)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100006, &(0x7f0000000100)})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r0, 0x20, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, &(0x7f00000000c0)={0x0, 0x0, 0xd000, 0x1000, &(0x7f0000ffd000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, &(0x7f0000000040)={0x4, 0x0, 0xdddd1000, 0x1000, &(0x7f0000012000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r2, 0x40a0ae49, &(0x7f00000001c0)={0x1, 0x1, 0x4, 0x1000, &(0x7f000054a000/0x1000)=nil, 0x200})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x40, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x1000)=nil, r1, 0x100000a, 0x13, r0, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454df, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_sve={0x6080000000150220, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_IRQCHIP(r1, 0xae60)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000380)={0x2, 0x0, [{0x0, 0x1, 0x0, 0x0, @msi={0x0, 0x5, 0x3f0, 0x9}}, {0x0, 0x2, 0x1, 0x0, @sint={0x1000, 0x3}}]})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000080)=@attr_arm64={0x0, 0x8, 0x4, 0x0})

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4008ae6a, &(0x7f0000000100)=ANY=[@ANYBLOB="05000000000000000900000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000c00000004000000000000000000000001000000000000008c00000000000000701100000000000004000000080000001f0000000600000000000000000000000800000000000000fec61c1200000000c200000000000000ff0700000100000007000000020000000000000000000000fcffffff01800000000000000000000000000000000000000000000000000000020000e5c40d"])
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0x10000000000001, 0xe80)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0)=0x100, 0x1bed)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4020ae46, &(0x7f0000000180)=ANY=[@ANYBLOB="0100000001000000000000fe7f0000000010"])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r4, 0xae03, 0xb6)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)

      
      r0 = openat$kvm(0x0, &(0x7f00000002c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE(r1, 0x4068aea3, &(0x7f00000001c0)={0xef, 0x0, 0x6})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f0000000140)={0x1, 0xffffffffffffffff, 0x1})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="820000000000000028000000000000000100000000000000040000000000000002000000000000008200000000000000280000000000000004"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x8004b708, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x2000007, 0x30d2a4fbfbea96b8, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000140)=0x1, 0x8)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0xc0e00, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x80087601, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_extra={0x603000000013c026, &(0x7f00000000c0)=0x1ff})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x40000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xe3)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee8000, 0x0, r2, 0x2})
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x1, r3})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0x8, 0x0, 0x0, r3})

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x6, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, &(0x7f0000000000)=@x86={0x6, 0x6f, 0x5, 0x0, 0x3, 0x5, 0x6, 0xd, 0x40, 0x3, 0x4, 0xa, 0x0, 0x5, 0x6, 0x0, 0x0, 0x2b, 0x6, '\x00', 0x0, 0x3})
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfff02000000ffffff00000d00e6ffea000000002000", 0x0, 0xffffffffffffff98)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(r1, 0x4068aea3, &(0x7f0000000100)={0xa8, 0x0, 0x1})
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x39, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x5, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_sys={0x603000000013c024, &(0x7f00000000c0)=0xa5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x60100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x2, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_extra={0x6030000000160002, &(0x7f0000000140)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000040)=@attr_pmu_init)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000000)=@attr_pmu_irq={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_bitmap={0x6030000000160001, 0x0})

      
      r0 = eventfd2(0x80005ff, 0x1)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f00000001c0)={0xffffffffffffffff, 0x3, 0x2, r0})
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000140)={0x4, <r3=>0xffffffffffffffff, 0x1})
write$eventfd(r3, &(0x7f00000001c0)=0xffffff7f, 0xff25)

      
      r0 = eventfd2(0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, 0x0, 0x1000004, 0x11, r0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_MTE(r1, 0x4068aea3, &(0x7f0000000140))

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_ONE_REG(r2, 0xc018ae85, &(0x7f0000000100)=@arm64_sys={0x603000000013804c, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000f82000/0x3000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
r3 = mmap$KVM_VCPU(&(0x7f0000f82000/0x1000)=nil, r2, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
eventfd2(0x5, 0x1)
r4 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = syz_kvm_setup_syzos_vm$arm64(r5, &(0x7f0000c00000/0x400000)=nil)
r7 = syz_kvm_add_vcpu$arm64(r6, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=[@msr={0x14, 0x20, {0x603000000013dce9, 0x8000}}], 0x20}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r7, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r7, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0x8000000, 0x104000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000080)={0x0, 0x6000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000100)={0x80a0000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x0, 0x12eeff265b2ad0b8, 0xffffffffffffffff, 0x1000000)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_GUEST_DEBUG(r2, 0x4208ae9b, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000ded000/0x2000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000000)="448e05c3a2b26fea6b71868f82b61311d51a93b36691d240c2e3f7b1e3bb9d2d983856d617a19358b48b226ea5c5a11c61e64751a1899c060b14820c32b96cd45dc21afa0f76b768", 0x0, 0x48)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, &(0x7f0000000000)=@arm64={0x6, 0x8, 0x77, '\x00', 0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_arm64={0x0, 0x5, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x300, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000880)=@arm64={0xae, 0x5, 0x9, '\x00', 0x6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000100)=@arm64_extra={0x603000000013df02, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x40000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f00000000c0)={0x1, 0x0, [{0x3, 0x2, 0x0, 0x0, @sint={0x7fff, 0x5}}]})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xc6)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x80000)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0xd000, 0x0, 0x0, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x10001, 0xf000, 0x0, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000080)=@attr_other={0x0, 0x8, 0x8, &(0x7f0000000040)})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_core={0x6030000000100036, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe3)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000100), 0x76b200, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
r5 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)
ioctl$KVM_CREATE_VM(r5, 0x401c5820, 0x20000000)
write$eventfd(r4, &(0x7f0000000000)=0x25d9, 0x8)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000080)=@attr_arm64={0x0, 0x8, 0x4, &(0x7f0000000240)=0x8})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000040)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_fp_extra={0x60200000001000d5, &(0x7f0000000300)=0x8})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x894c, 0x0)
close(0x5)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000100)="fb0149dd033be3ac2cc4a29ea667521ce16f8f1f449a7a835600000000000000000000000000ffffffffffffffde00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
ioctl$KVM_SET_REGS(r4, 0x4360ae82, &(0x7f0000000200)={[0x6, 0x1e2c0000, 0xbb71, 0x8001, 0x4, 0x8, 0x40ae, 0x0, 0xfffffffffffffffe, 0x401, 0xfffffffffffffff8, 0x100000001, 0x3, 0x3ff, 0x101, 0x401], 0xdddd0000, 0x420})
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000000)=@x86={0x6, 0xef, 0x5, 0x0, 0x3, 0x5, 0x6, 0xd, 0x40, 0x3, 0x4, 0xa, 0x0, 0x5, 0x6, 0x0, 0x0, 0x2b, 0x6, '\x00', 0x0, 0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x3, 0x40)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={0xffffffffffffffff, 0x1, 0x2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x40)
close(0x5)
close(0x4)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x6, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = syz_kvm_vgic_v3_setup(r1, 0x4, 0x80)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000840)=@attr_other={0x0, 0x1, 0xc, &(0x7f0000000040)=0x4002})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r3, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r5 = eventfd2(0x0, 0x0)
close(r0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
r6 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x8600, 0x0)
r7 = ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(r7, 0x4068aea3, &(0x7f0000001c00)={0xa8, 0x0, 0x3})
syz_kvm_setup_syzos_vm$arm64(r7, &(0x7f0000c00000/0x400000)=nil)
write$eventfd(r5, 0x0, 0x0)
r8 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r8, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r9 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r10, 0x4020ae46, 0x0)
close(0x3)
r11 = ioctl$KVM_CREATE_VCPU(r10, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r11, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r11, 0x4018aee1, &(0x7f0000000240)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000280)={0x6, 0x2538}})
r12 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52456012ab8ba1286bf6cd8100cdc404000000006abf47d90000000000000000000000000000000000000000fff900000000000700", 0x0, 0x48)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_core={0x6030000000100022, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x220)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000000)={0x5, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x0, 0x8000000, 0x2000, &(0x7f0000ffd000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x0, 0x5000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x5000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x1, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x0, 0xeeee8000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x402, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f00000001c0)={0x2, 0x0, [{0x5, 0x3, 0x0, 0x0, @adapter={0x9, 0x3, 0x4, 0x1, 0x800}}, {0x8, 0x5, 0x1, 0x0, @irqchip={0x3, 0x2}}]})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
r2 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
syz_kvm_add_vcpu$arm64(0x0, 0x0, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(0x0, 0x0, 0x0, 0x0)
close(0x4)
close(0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_core={0x6030000000100050, &(0x7f0000000240)=0xff})

      
      r0 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000bfd000/0x400000)=nil)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r0, &(0x7f0000000b80)={0x0, &(0x7f0000000280)=ANY=[], 0x40}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000140)=@arm64_sve={0x608000000015009d, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5, 0x11})
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, &(0x7f0000000040)=0x4)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(0xffffffffffffffff, 0xae03, 0xaa)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, 0x0)
close(0x5)
close(0x4)

      
      munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000340))
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000040)={0xfffffffffffff001, 0x2000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x402, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000600)=@arm64_core={0x6030000000100008, &(0x7f00000005c0)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_sve={0x6080000000150465, 0x0})

      
      munmap(&(0x7f0000e85000/0x2000)=nil, 0x2000)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x2})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_core={0x603000000010001e, 0x0})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000040)=@arm64_fw={0x6030000000140003, &(0x7f0000000000)=0xa2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x80086601, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
close(r1)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x11, r1, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r4, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x603000000010001a, &(0x7f00000000c0)=0x8})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x400000f, 0x80031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000040)="4df74d20cd04ee4ce2aa8a0797d68e953766cd7a4855880c9bf8c2b7cf738dc33732698d631778d116a24fd82e39c234c499eff943378c8ca92835aac201b216e92cae0faa84392b", 0x0, 0x48)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x101900, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="a55afac482ae9086510a1cfeebb372c746b69b695f50f0fe4a42e0db94adb9afe18edc51d30da60113b8f98bcdfe68bbc48c525a1b3867d3b43108ff914877781493d36fc97b8d2f", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x8600, 0x408)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0xa)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x1f})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_sys={0x603000000013c090, &(0x7f0000000000)=0x8000000000000001})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x2, &(0x7f00000000c0)=0xde5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100004, &(0x7f00000000c0)=0xa83})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000480)=ANY=[@ANYBLOB="0100000000000000020000000200000008"])

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r2 = eventfd2(0x80005ff, 0x1)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f00000001c0)={0xffffffffffffffff, 0x3, 0x2, r2})
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000140)={0x4, <r5=>0xffffffffffffffff, 0x1})
write$eventfd(r5, &(0x7f00000001c0)=0xffffff7f, 0xff25)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(0xffffffffffffffff, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xeeffbffd, 0xffd, 0x1}})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r2, 0xefaf, 0x2, r2})
write$eventfd(r2, &(0x7f0000000100)=0x7, 0x8)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x103a00, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000100)=@arm64={0xdc, 0x1, 0x0, '\x00', 0xffff})

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
munmap(&(0x7f00006e2000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffb000/0x3000)=nil, 0x3000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x48)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000040)=0xc000000000001000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd7, 0x80000001})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
r5 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x1800002, 0x11, r5, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000380)="f30138dd033be3ac4ac4a29ea6ab08004b584bd92e2e0000000000000f0000000000010001000000000000000300000000000000040a00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x40305829, &(0x7f0000000040)=@attr_arm64={0x0, 0x0, 0x100000000000000, 0x0})

      
      ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x1, 0x1)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000040)={r2, 0x2, 0x2, r2})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1, 0x1, 0x5000, 0x1000, &(0x7f0000fa2000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x1000000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x40)
r2 = eventfd2(0x1, 0x800)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r2, 0x3, 0x0, r2})
ioctl$KVM_IRQFD(r1, 0x4020ae76, 0x0)
write$eventfd(r2, &(0x7f0000000000)=0xa5, 0x8)

      
      r0 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae03, 0xc3)

      
      munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x1000000)
mmap$KVM_VCPU(&(0x7f0000bb8000/0x400000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fp_extra={0x60200000001000d5, &(0x7f00000001c0)=0xfffffffffffffff9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000000)={0xa})

      
      openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f0000000080)={0x0, &(0x7f00000002c0)=ANY=[@ANYBLOB="07000000000000002800000000000000010000000000000001000000000000000100000000000000080000000000000028000000000000000f00000000000200000000000000000000000000009adcfcde6736eba15eb2465242a62674ccd1a572fa3670bcf0e6c1444f690b150175e882a36c8d05328fcfbd9bdf21ba1a433c176b967333ab16f216e2a0d077e00629e2f58c73a630506c2ce779312b4d3d46958f330be2a1c6f707e2cc84563d2ae7acebe26cf05fd903931955256b5b384e25b2dba0bd593d0adb68dae235929f1f64ecc30d129721cc0cf72a7f6fd3c910555b0a125b8cd67a2ee6d4e4744d157a53b506504e18c50772c7384475d6583918d0f54e960b2a3af2f2db831a450d7d7e4ddf0a95dc83d14105ca149f452ef169055a1d28e47d847d6c7f56e330320c0ec19344020aa516fdb6280e4024dbe4ca6bc1256012c5a8ae7886813e5147cd6d20ab88ab4b975e17037004bd3ab4db"], 0x50}, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc018aec0, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x843, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_arm64={0x0, 0x1, 0x1, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x4, <r2=>0xffffffffffffffff, 0x932d82b1a9412f16})
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x0, 0x40032, 0xffffffffffffffff, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, 0x0)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000240), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000280)=@arm64_sys={0x603000000013c000, &(0x7f00000000c0)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000040)=ANY=[@ANYBLOB="1e00000000000000400000000000000008000084"], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x0)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r5, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r5, 0x0)
close(r4)
r7 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
write$eventfd(r7, &(0x7f0000000180)=0x5, 0xfffffe09)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r1, 0x2, 0xa0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0x2, 0x8000, 0x2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(0xffffffffffffffff, 0x4010aeab, &(0x7f0000000600)=@arm64_core={0x6030000000100008, 0x0})
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, &(0x7f0000000600)=ANY=[])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000006c0), 0x8000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, 0xffffffffffffffff)

      
      r0 = eventfd2(0x1, 0x1)
r1 = openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
eventfd2(0x10001, 0x0)
syz_kvm_vgic_v3_setup(r2, 0x1, 0x100)
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f00000000c0)={r0, 0x4, 0x0, r0})
ioctl$KVM_SET_GSI_ROUTING(r2, 0x4008ae6a, &(0x7f0000000240)=ANY=[@ANYBLOB="01000000000000000300000002"])
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f0000000040)={r0, 0x3, 0x2, r0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x7, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x0, 0x40032, 0xffffffffffffffff, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vm(r1, 0x4018aee3, 0x0)

      
      munmap(&(0x7f0000eb0000/0x3000)=nil, 0x3000)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, r1, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xac)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x140, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000580)=[{0x0, &(0x7f0000000140)=[@smc={0x1e, 0x40, {0x84000012, [0x38000000000, 0x9, 0x4, 0x8001, 0x6]}}], 0x40}], 0x1, 0x0, 0x0, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000780)={0x0, &(0x7f0000000680)=[@code={0xa, 0x84, {"00fc209b000008d5e0888dd20060b8f2c10080d2620080d2a30180d2440080d2020000d4000000130000002b000008d50084202ea0a483d20000b8f2010080d2020080d2630180d2040180d2020000d4004d8fd200c0b0f2e10080d2e20080d2030180d2040180d2020000d4007008d5"}}], 0x84}, &(0x7f00000007c0)=[@featur2={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df19, &(0x7f0000000100)=0x1})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_bitmap={0x6030000000160002, &(0x7f00000001c0)})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_ccsidr={0x6020000000110005, &(0x7f00000000c0)=0x7})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="0100000001000000000000000806"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0xffffff7f, 0xe80)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000480)={0x2, 0x0, [{0x2, 0x2, 0x0, 0x0, @irqchip={0x8, 0x5}}, {0x2, 0x2, 0x0, 0x0, @sint={0x5, 0x4a}}]})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x0, 0xb4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x40086602, 0x110e02ffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
syz_kvm_vgic_v3_setup(r1, 0x1, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000080)=@attr_pmu_init)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000002000/0x3000)=nil, r4, 0xa, 0x11, r3, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r6 = eventfd2(0x0, 0x0)
close(r6)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f000000c000/0xe000)=nil, 0xe000)
write$eventfd(r6, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x8, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x10000000000, 0xa00, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_sve_vls={0x606000000015ffff, &(0x7f00000000c0)=0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100030, &(0x7f0000000300)=0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1ff, 0x1, 0x100000, 0x2000, &(0x7f0000f5c000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x0, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
munmap(&(0x7f0000fe8000/0x3000)=nil, 0x3000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0xfffffffe, 0x1, 0x2}})

      
      r0 = openat$kvm(0x0, &(0x7f00000002c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r3 = mmap$KVM_VCPU(&(0x7f0000004000/0x2000)=nil, 0x930, 0x2800002, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f00000001c0)="fb4149dd033be3ac2cc4a22332a77b23b08986814d7bb14c94a6ab8031d1dfd92f00000000010000005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa7fc869d22627e7", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x401c5820, &(0x7f0000000000)=@attr_arm64={0x0, 0x5, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={0xffffffffffffffff, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_other={0x0, 0x8, 0x9, &(0x7f00000001c0)=0xffffffffffffff80})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000080)=@attr_arm64={0x0, 0x8, 0x0, &(0x7f0000000240)=0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x78)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x2}}, @its_send_cmd={0xaa, 0x28, {0xf, 0x5, 0x0, 0x6, 0xfffffffe, 0x2}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x2000007, 0x30d2a4fbfbea96b8, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x42)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454d0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x401c5820, 0x8000000000000001)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000080)=@attr_irq_timer={0x0, 0x1, 0x0, &(0x7f0000000000)=0x1f})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5, 0x8})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5, 0x8})
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_SET_ONE_REG(0xffffffffffffffff, 0x4010aeac, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0xa)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x15})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c00a, &(0x7f0000000040)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x541b, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000180)=ANY=[@ANYBLOB="02000000000000000000000001"])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
close(0x5)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x200)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
close(r2)
close(0x4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB="8200"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee0000, 0x2, r2, 0x8})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f00000000c0)={0x1})
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000200)={0x8090040})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x2})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0)=0x3, 0x10)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100008, &(0x7f0000000180)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r3 = openat$kvm(0x0, &(0x7f0000000000), 0x2002, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r6, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
r7 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r4, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, r7, 0x300000e, 0x13, r6, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(0xffffffffffffffff, 0x4010ae67, &(0x7f0000000000)={0xd000, 0x19000})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r6, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r6, 0xae80, 0x0)
r8 = openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
r10 = ioctl$KVM_CREATE_VCPU(r9, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r10, 0x4020aeae, &(0x7f0000000080)={0x5, 0x85})
ioctl$KVM_SET_ONE_REG(r10, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x1b})
r11 = eventfd2(0x5, 0x800)
write$eventfd(r11, &(0x7f0000000100), 0xfffffffffffffe8a)
r12 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r12, 0xae03, 0xcd)
close(r1)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(r1, 0x4068aea3, &(0x7f0000000040))
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0xaece, 0x0)
close(0x4)

      
      mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52456012ab8ba1286bf6cd8100cd30f00515f86636544e44c404000000006abf47d90000000000000000000000000000000000000000fff900", 0x0, 0xfffffffffffffe49)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x1b})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x2, 0x4003831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x3000)=nil, 0x930, 0x3000007, 0x2012, r2, 0x2000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x1c1040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43033, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000f48000/0x3000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a13f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x2, 0x10000000000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x101001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0xe4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xc020660b, 0xe1)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454c9, 0x0)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x8004b709, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, &(0x7f0000000080)={0xb6, 0x0, 0x8000000000000001})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1, 0x1, 0x5000, 0x1000, &(0x7f0000fa2000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x29)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000040)=ANY=[@ANYBLOB="1e00000000000000400000000000000004000084"], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x8008b705, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100034, &(0x7f0000000100)=0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x3, 0x6, 0xfffffffffffffffe})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xb2)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000100), 0x76b200, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
munmap(&(0x7f0000000000/0x2000)=nil, 0x2000)
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0xdddd0000, 0x104000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0x100000, 0x8000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000100)={0x80a0000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x501080, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000040)={0x2, 0x0, [{0x0, 0x1, 0x0, 0x0, @msi={0x2, 0x40000, 0x2, 0x9}}, {0x9, 0x3, 0x0, 0x0, @sint={0x1, 0x3}}]})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x77)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x800})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000080)=@attr_pmu_init)

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000380)="f30149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a3ff7fbc51869be2e2e0000000000000f000000000000000001000000000000000000000000000e00", 0x0, 0x34)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0x6, &(0x7f0000000040), 0x565e02, 0x0)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r3, 0x1000007, 0x2012, r4, 0x0)

      
      r0 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x109000, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r2, 0x0, 0x12, r5, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r2, 0x0, 0x2012, r5, 0x0)
ioctl$KVM_CREATE_VCPU(r0, 0xae41, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_ccsidr={0x602000000011000b, &(0x7f0000000180)=0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x8200, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x6)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x8933, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000100), 0x82001, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x3000007, 0x2012, r0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, r1, 0x100000f, 0x12, r4, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x3ff})
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r4 = syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="14000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc1300000030d11b"], 0x60}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r4, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r4, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE(r1, 0x4068aea3, &(0x7f0000000140)={0xe4, 0x0, 0x80000000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="37d3116035d7513e9a000200018000", 0x0, 0x43)
ioctl$KVM_CREATE_VM(r0, 0x40086602, 0x20000000)

      
      ioctl$KVM_SET_USER_MEMORY_REGION2(0xffffffffffffffff, 0x40a0ae49, &(0x7f0000000240)={0x1, 0x0, 0x0, 0x1000, &(0x7f0000e97000/0x1000)=nil, 0x40})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfff02000000ffffff00000d00e6ffea000000002000", 0x0, 0xffffffffffffff98)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x81f})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x600040, 0x0)
close(r0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x2801, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xb1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r2, 0x4018aee3, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x101900, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="a55afac482ae9086510a1cfeebb372c746b69b695f50f0fe4a42e0db94adb9afe18edc51d30da60113b8f98bcdfe68bbc48c525a1b3867d3b43108ff914877781493d36fc97b8d2f", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x733140, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x1, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x0)
eventfd2(0x10, 0x80001)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x80)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xc6)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x400000f, 0x80031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x5452, 0xa00000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000200)=@arm64={0x4, 0x0, 0x0, '\x00', 0x101})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000000100000000000000aa0000000000000028000000000000000c00000000000400000000000000dfff00000000000000007209399ad3dc974d4bbac91caca7728a1e6cef247ec862b51948cc"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      munmap(&(0x7f000049b000/0x400000)=nil, 0x400000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x2, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000007c0)=ANY=[@ANYBLOB="be0000000000000018000000000000001ac0"], 0x18}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r4, 0x3000000, 0x11, r3, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x0, 0x11, r3, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0xa2d40, 0x0)
ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f000000e000/0x3000)=nil, r1, 0x1000002, 0x12, r5, 0x0)
r6 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000040)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000100)={0xa, 0x8000, 0xf33d48fdd23b28c7}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_arm64={0x0, 0x4, 0x4, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r2, 0x4018aee3, &(0x7f0000000180)=@attr_other={0x0, 0x0, 0x5, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000001100000000000000aa00000000000000280000000000000009"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000200)={0x1, 0x0, [{0x9c2e, 0x1, 0x0, 0x0, @adapter={0x3, 0x2, 0x4, 0x10001, 0x3}}]})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, 0xfffffffffffffffe)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r4, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r5})
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f0000000240)={r5, 0xfb, 0x2, r5})
ioctl$KVM_CHECK_EXTENSION(r3, 0x40086602, 0x110e20ffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000dee000/0x3000)=nil, 0x0, 0x0, 0x10, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, 0x0, 0x1a2943, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0xcd)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454cf, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000e95000/0x4000)=nil, 0x0, 0x1000006, 0x2010, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000000)={0x4})
close(0x5)
r3 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r3, 0x1, 0x100)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e7000000000000000000000000100800", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r5 = eventfd2(0x0, 0x0)
close(r5)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
write$eventfd(r5, &(0x7f0000000100), 0x8)
mmap$KVM_VCPU(&(0x7f0000004000/0x4000)=nil, r6, 0x467af21e7c8bde04, 0x8013, r5, 0x0)
write$eventfd(r5, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      mmap$KVM_VCPU(&(0x7f0000c17000/0x3000)=nil, 0x930, 0x0, 0x10, 0xffffffffffffffff, 0x20)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x4, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x60100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x2, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_extra={0x6030000000160000, &(0x7f0000000140)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000003c0)=@attr_arm64={0x0, 0x3, 0x2, &(0x7f0000000340)=0x100})

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000f78000/0x3000)=nil, 0x0, 0x0, 0x6efcf3a2fd459e36, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ce0000/0x3000)=nil, 0x3000)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8040aeb6, &(0x7f0000000200)=@attr_other={0x0, 0x8, 0x9, 0x0})
ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x4, 0x80)
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000140)=@attr_other={0x0, 0x4, 0x5, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x401054d6, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_ONE_REG(r1, 0x4010aeab, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="6e000000000000000000000000000000000008080000000090"], 0x19}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
r5 = syz_kvm_add_vcpu$arm64(r2, 0x0, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r5, 0x4010aeab, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x101080, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x40086602, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xf)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40c02, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_GET_VCPU_EVENTS(r2, 0x8040ae9f, &(0x7f00000000c0)=@arm64)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0xff, 0x0, 0x1, r2, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = syz_kvm_vgic_v3_setup(r1, 0x5, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_arm64={0x0, 0x6, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x88141, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_sys={0x603000000013c4f6, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x8, 0x4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, 0x0, 0x0)

      
      openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB="82000000000000000000000000000000010000000000000001000000000000000173"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000240)=ANY=[], 0x40}, 0x0, 0x0)
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f00000000c0)=@arm64_fw={0x6030000000140000, &(0x7f0000000080)=0x10003})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x5, &(0x7f0000000100)=0x8010000001000001})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x200)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
close(r2)
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x2400, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r2, 0xc0189436, 0x100000000000000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f00000001c0))

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000000000/0x400000)=nil)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0xc0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000cc0)=@attr_other={0x0, 0x1, 0x480, &(0x7f0000000000)=0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfff02000000ffffff00000d00e6ffea0000000020000000000000000000000000000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xa)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x5460, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x8000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x3ff})
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000040)=@arm64={0x3, 0x7f, 0xe, '\x00', 0x80ae})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, r3, 0x300000e, 0x13, r2, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(0xffffffffffffffff, 0x4010ae67, &(0x7f0000000000)={0xd000, 0x19000})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x80005ff, 0x1)
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r3, 0x27, 0x2, r2})
r4 = eventfd2(0x0, 0x0)
r5 = eventfd2(0xffff, 0x80801)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r4, 0x40fff, 0x2, r5})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000001c0)={r3, 0x3, 0x3, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r3 = mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x6, 0x40a8012, r2, 0x2000)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000380)="fb4109000000e3ac2cc4a22332fdaa8de0418df242000000008e700900d1dfd92f0000000001fffffdff2627e700", 0x0, 0x48)

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x400, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, &(0x7f0000000000)=@arm64={0x9, 0x3d, 0x1, '\x00', 0x8})
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x1000002, 0xaf832, 0xffffffffffffffff, 0x0)
syz_kvm_setup_cpu$arm64(0xffffffffffffffff, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000700)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x40086602, 0x110e22ffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(0xffffffffffffffff, r1, &(0x7f0000000000/0x400000)=nil, &(0x7f0000000000)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)

      
      syz_kvm_add_vcpu$arm64(0x0, &(0x7f0000000080)={0x0, &(0x7f00000002c0)=ANY=[@ANYBLOB="07000000000000002800000000000000010000000000000001000000000000000100000000000000080000000000000028000000000000000f00000000000200000000000000000000000000009adcfcde6736eba15eb2465242a62674ccd1a572fa3670bcf0e6c1444f690b150175e882a36c8d05328fcfbd9bdf21ba1a433c176b967333ab16f216e2a0d077e00629e2f58c73a630506c2ce779312b4d3d46958f330be2a1c6f707e2cc84563d2ae7acebe26cf05fd903931955256b5b384e25b2dba0bd593d0adb68dae235929f1f64ecc30d129721cc0cf72a7f6fd3c910555b0a125b8cd67a2ee6d4e4744d157a53b506504e18c50772c7384475d6583918d0f54e960b2a3af2f2db831a450d7d7e4ddf0a95dc83d14105ca149f452ef169055a1d28e47d847d6c7f56e330320c0ec19344020aa516fdb6280e4024dbe4ca6bc1256012c5a8ae7886813e5147cd6d20ab88ab4b975e17037004bd3ab4db"], 0x50}, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df19, &(0x7f0000000100)=0x1})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_bitmap={0x6030000000160000, &(0x7f00000001c0)=0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x222000, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_other={0x0, 0x1, 0x14000fc, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000c40)={0x2, 0x0, [{0x40, 0x3, 0x1, 0x0, @sint={0x7fffffff, 0xf}}, {0xb4c700d5, 0x3, 0x1, 0x0, @sint={0x6, 0x4}}]})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x24)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB='F'], 0x40}, 0x0, 0x0)
r4 = eventfd2(0x836, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000000)={0x1006, 0x8000000, 0x4, r4})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x1fd, 0x1, 0x8080000, 0x2000, &(0x7f0000fa3000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x0, 0x0, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x0, 0x1000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000380))

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r3, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r5 = eventfd2(0x0, 0x0)
close(r0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x8600, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(0xffffffffffffffff, 0x4068aea3, &(0x7f0000001c00)={0xa8, 0x0, 0x3})
syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
write$eventfd(r5, &(0x7f0000000000), 0xfffffe1e)
r6 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r7 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r8, 0x4020ae46, 0x0)
close(0x3)
r9 = ioctl$KVM_CREATE_VCPU(r8, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r9, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r9, 0x4018aee1, &(0x7f0000000240)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000280)={0x6, 0x2538}})
r10 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r10, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x14})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_fw={0x6030000000140000, &(0x7f0000000100)=0x5})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd7, 0x80000001})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
r5 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x1800002, 0x11, r5, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000380)="f30138dd033be3ac4ac4a29ea6ab08004b584bd92e2e0000000000000f0000000000010001000000000000000300000000000000040a00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x40305839, &(0x7f0000000040)=@attr_arm64={0x0, 0x0, 0x100000000000000, 0x0})

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000fa8000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000080)=[{0x600000000000000, 0x0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000080)=@arm64_sys={0x603000000013c807, &(0x7f0000000280)=0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xb6)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000180)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000000)={0x9, 0x8002, 0x1}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xa5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_REG_LIST(0xffffffffffffffff, 0x4020aeae, &(0x7f0000000000)=ANY=[@ANYBLOB="05000000000000000000000000000082"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000040)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x800454d2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r5 = eventfd2(0x0, 0x0)
close(r5)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r5, &(0x7f0000000140)=0x1, 0x8)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
write$eventfd(r5, &(0x7f0000000100), 0x8)
mmap$KVM_VCPU(&(0x7f0000004000/0x4000)=nil, r6, 0x467af21e7e8bde02, 0x11, r5, 0x0)
write$eventfd(r5, &(0x7f0000000180)=0x5, 0xfffffde3)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0xc0e00, 0x2000)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd8, 0x1})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ad4000/0x1000)=nil, 0x1000)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x7, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4030582a, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000200)=@arm64_core={0x6030000000100026, &(0x7f0000000140)=0x2})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
syz_kvm_setup_cpu$arm64(r2, r3, &(0x7f00009c4000/0x400000)=nil, &(0x7f0000000500)=[{0x0, &(0x7f0000000c00)=ANY=[], 0x2c7}], 0x1, 0x0, &(0x7f0000000540)=[@featur1={0x1, 0x3}], 0x1)

      
      munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)

      
      syz_kvm_add_vcpu$arm64(0x0, &(0x7f00000000c0)={0x0, &(0x7f00000002c0)=ANY=[@ANYBLOB="07000000000000002800000000000000010000000000000001000000000000000100000000000000080000000000000028000000000000000f000000000002000000000000000000000000000074ccd1a572fa3670bcf0e6c1444f690b150175e882a36c8d05328fcfbd9bdf21ba1a433c176b967333ab16f216e2a0d077e00629e2f58c73a630506c2ce779312b4d3d46958f330be2a1c6f707e2cc84563d2ae7acebe26cf05fd903931955256b5b384e25b2dba0bd593d0adb68dae235929f1f64ecc30d129721cc0cf72a7f6fd3c910555b0a125b8cd67a2ee6d4e4744d157a53b506504e18c50772c7384475d6583918d0f54e960b2a3af2f2db831a450d7d7e4ddf0a95dc83d14105ca149f452ef169055a1d28e47d847d6c7f56e330320c0ec19344020aa516fdb6280e4024dbe4ca6bc1256012c5a8ae7886813e5147cd6d20ab88ab4b975e17037004bd3ab4dbc05115180e96555ac26050194217c9476fad9f77849b8b2e273a47c4f7624d2070bcb5c13c3597e53435ce0b4318fbc5323fdf572f61"], 0x50}, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x145480, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x84001, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000380)={0x10200, 0x0, 0xdddd1000, 0x1000, &(0x7f0000ffe000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_core={0x603000000010002e, &(0x7f00000001c0)=0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000240)={0x3, 0x80000003}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@mrs={0xbe, 0x18, {0x6030000000138007}}], 0x18}, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x0, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@irq_setup={0x46, 0x18, {0x1, 0x20}}], 0x18}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000100)={0x0, &(0x7f0000000140)=[@irq_setup={0x46, 0x18, {0x1, 0x20}}], 0x18}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x100)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000180)={0x1010020, 0x1})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(0x0, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x1, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xe})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_extra={0x6030000000140000, &(0x7f00000001c0)=0x10001})

      
      r0 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, 0x0, 0x2000, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(0xffffffffffffffff, 0x4018aee1, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000f82000/0x3000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r4, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, r5, 0x5000003, 0x80031, 0xffffffffffffffff, 0x0)
r6 = mmap$KVM_VCPU(&(0x7f0000f82000/0x1000)=nil, r5, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
ioctl$KVM_SET_DEVICE_ATTR_vm(r2, 0x4018aee1, &(0x7f0000000200)=@attr_other={0x0, 0x0, 0x8000000000000007, 0x0})
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, 0xffffffffffffffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0xfffffffffffffffd, 0x40)
r2 = eventfd2(0x0, 0x0)
r3 = eventfd2(0xffff, 0x80801)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r2, 0x40fff, 0x2, r3})
close(r2)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x0, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x10001, 0x0, 0x1, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
write$eventfd(r1, &(0x7f0000000000), 0xfffffdef)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0x80111500, 0x20000000)
write$eventfd(r3, &(0x7f0000000000), 0x26d07478)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454d9, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe5)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000d47000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
munmap(&(0x7f00006e2000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffc000/0x4000)=nil, 0x4000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x80811501, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000780)={0x0, &(0x7f0000000040)=ANY=[@ANYBLOB="1e0000000000000040000000000000000000008400000006"], 0x40}, &(0x7f00000007c0)=[@featur2={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r4, 0x40305829, 0x10000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x80000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000300)=@arm64_fp_extra={0x60200000001000d6, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x80, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_core={0x6030000000100026, &(0x7f00000000c0)=0x3})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x8, 0x5c1fd1b6565d2f2, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_sve={0x6080000000150500, &(0x7f0000000080)=0x40})
openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x100, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000340), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x8, <r2=>0xffffffffffffffff, 0x1})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x3000003, 0x30, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x0)
r3 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, 0x0, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305839, &(0x7f0000000100)=@attr_other={0x0, 0x3, 0x7fffffffffffffff, &(0x7f0000000300)=0x1a})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000180)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000000)={0xf5fe, 0xff, 0x1}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee8000, 0x0, r2})
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0x2, 0x0, 0x0, r3})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0x8, 0x0, 0x0, r3, 0x4})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x80020009, 0x1})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x3, 0xe0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000040)={0x5})
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000080)=@arm64_extra={0x603000000013c513, 0x0})

      
      syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="7c5aa1bde04fceeb33743b07d73b3e9eac00", 0x0, 0x18)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f00000001c0)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x8, 0x1, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000040)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000100)={0x403, 0x6, 0x1}})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x43ff})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x1000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x5)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000040)=@attr_arm64={0x0, 0x2, 0x0, &(0x7f0000000000)=0xc0000000000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000040)={0xffffffffffffffff, 0x2})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1, 0x1, 0x1000, 0x2000, &(0x7f0000c98000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000d47000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000140)=@arm64_sys={0x603000000013df5a, &(0x7f00000000c0)=0xfffffffffffffffc})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x100000c, 0x6832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r6 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000800000/0x800000)=nil, 0x800000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43033, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000e31000/0x2000)=nil, 0x930, 0x5, 0x2012, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x29031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e0c000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ad4000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x400, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000000)=ANY=[], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000040)={0xdddd0000, 0x108000})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = eventfd2(0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)

      
      munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000f71000/0x6000)=nil, 0x6000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x4, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000080)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f00000000c0)})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52b604127a837f0400003345de6a2d8dc85e000000f86636544e44c404000000006abf47d900", 0x0, 0x48)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="0100000001000000000000000800"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, &(0x7f0000000180)=0x4)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18}) (async)
syz_kvm_vgic_v3_setup(r1, 0x3, 0x180)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="0100000001000000000000000806"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, &(0x7f0000000000)=@x86={0xd, 0x10, 0xe, 0x0, 0x7, 0x40, 0x40, 0x7e, 0x8, 0x81, 0x7, 0xd4, 0x0, 0x23, 0x3, 0xc6, 0x1, 0xd3, 0x0, '\x00', 0x7, 0x6})
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fw={0x6030000000140002, &(0x7f0000000080)=0x6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, 0x0)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c88000/0x2000)=nil, 0x930, 0x1, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0x8000000, 0x4000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000080)={0x2000, 0x10000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000180)={0x2, 0x5834d4dbb6893c4c})
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x8440, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000002c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_other={0x0, 0x4, 0xfffffffffffffffe, 0x0})

      
      syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000bfd000/0x400000)=nil)
r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, 0x0})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
r4 = syz_kvm_setup_syzos_vm$arm64(r3, &(0x7f0000c00000/0x400000)=nil)
r5 = syz_kvm_add_vcpu$arm64(r4, &(0x7f0000000080)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB="82000000000000000000000000000000010000000000000001000000000000000173"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r3, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000100)={0x8, <r6=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r6, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r5, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r6, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      syz_kvm_add_vcpu$arm64(0x0, &(0x7f0000000080)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="460000000000000018000000000000000100000020"], 0x18}, 0x0, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="010000000100000000000000080009"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xe80)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100048, &(0x7f0000000100)=0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x2000007, 0x30d2a4fbfbea96b8, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x0, 0x0, 0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r3, 0xae03, 0xac)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(0xffffffffffffffff, 0x4010aeab, &(0x7f0000000600)=@arm64_core={0x6030000000100008, 0x0})
ioctl$KVM_GET_REG_LIST(r4, 0xc008aeb0, &(0x7f0000000600)=ANY=[])
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x80040, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r6, 0xae41, 0x0)
r7 = syz_kvm_vgic_v3_setup(r6, 0x3, 0x40)
r8 = ioctl$KVM_CREATE_GUEST_MEMFD(0xffffffffffffffff, 0xc040aed4, &(0x7f0000000000)={0x3, 0x100000001})
ioctl$KVM_SET_USER_MEMORY_REGION2(r6, 0x40a0ae49, &(0x7f0000000240)={0x10000, 0x7, 0xffff1000, 0x1000, &(0x7f0000e91000/0x1000)=nil, 0x400, r8})
r9 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
r11 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r12 = ioctl$KVM_CREATE_VM(r11, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x30282, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r12, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0x1}})
openat$kvm(0xffffffffffffff9c, 0x0, 0x400, 0x0)
r13 = ioctl$KVM_CREATE_VCPU(r10, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r10, r4, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000180)=[{0x0, &(0x7f0000000640)=[@hvc={0x32, 0x40, {0xc4000053, [0x9, 0x39, 0x1, 0x4, 0x1]}}, @its_send_cmd={0xaa, 0x28, {0xc, 0x1, 0x3, 0xe, 0x5, 0x7, 0x1}}, @code={0xa, 0x6c, {"000028d5000008d50000006d600a81d20000b0f2e10180d2620180d2e30180d2a40180d2020000d400000053e06691d20060b0f2010180d2620180d2630180d2640180d2020000d4000028d5007008d50064002f007008d5"}}, @mrs={0xbe, 0x18, {0x3844}}, @memwrite={0x6e, 0x30, @vgic_gits={0x8080000, 0x4, 0x2278, 0xa}}, @its_setup={0x82, 0x28, {0x1, 0x4, 0x39a}}, @its_send_cmd={0xaa, 0x28, {0xe, 0x1, 0x3, 0x7, 0xa0000000, 0x1ff, 0x4}}, @mrs={0xbe, 0x18, {0x603000000013debb}}, @its_setup={0x82, 0x28, {0x4, 0x1, 0x22}}, @msr={0x14, 0x20, {0x603000000013c4cf, 0x4}}, @hvc={0x32, 0x40, {0x40, [0x4, 0x9, 0x80000001, 0x4, 0x6]}}, @irq_setup={0x46, 0x18, {0x3, 0x101}}, @irq_setup={0x46, 0x18, {0x0, 0xd9}}, @smc={0x1e, 0x40, {0x84000002, [0x5, 0x3ff, 0x3, 0x4, 0x9]}}, @msr={0x14, 0x20, {0x603000000013e6c1, 0x7}}, @its_send_cmd={0xaa, 0x28, {0x9, 0x1, 0x0, 0xa, 0x8, 0x400, 0x2}}, @svc={0x122, 0x40, {0x84000004, [0xec, 0x81, 0x5, 0x0, 0x8f]}}, @uexit={0x0, 0x18, 0x9}], 0x31c}], 0x1, 0x0, &(0x7f0000000300)=[@featur1={0x1, 0x10}], 0x1)
syz_kvm_setup_cpu$arm64(r10, r13, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000000)=ANY=[], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r10, 0x4010ae67, &(0x7f0000000040)={0xdddd0000, 0x119000, 0x1})
ioctl$KVM_RUN(r13, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r7, 0x4018aee1, &(0x7f00000001c0)=@attr_other={0x0, 0x7, 0x0, &(0x7f0000000200)=0x105b7})
ioctl$KVM_GET_DEVICE_ATTR(r7, 0x4018aee2, &(0x7f0000000100)=@attr_arm64={0x0, 0x7, 0x0, &(0x7f00000000c0)=0x401})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r3, 0x4068aea3, &(0x7f0000000180))

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x102, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r3, 0x4018aee3, &(0x7f00000001c0)=@attr_other={0x0, 0x5, 0x7fffffff, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_sys={0x603000000013c006, &(0x7f0000000100)=0xc})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0xd, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, 0x0, 0x2041, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r3=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r3, 0x4018aee2, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x80000)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0xd000, 0x10000, 0x0, r2})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f00000000c0)={0x5000, 0x5000})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x10001, 0xf000, 0x0, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x260002, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae03, 0xbb)
ioctl$KVM_IOEVENTFD(r1, 0xc0189436, &(0x7f0000000080)={0x0, 0x0, 0x1, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x800, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x90)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r2, 0x4018aee3, &(0x7f0000000000)=@attr_pmu_init)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r3 = eventfd2(0x3ff, 0x80001)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r3, 0x0, 0x0, r3})
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000080)=ANY=[])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x80040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x3, 0x40)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f0000000240)={0x10000, 0x7, 0xffff1000, 0x1000, &(0x7f0000e91000/0x1000)=nil, 0x400})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000100)=@attr_arm64={0x0, 0x7, 0x0, &(0x7f00000000c0)=0x401})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r5 = eventfd2(0x0, 0x0)
close(r5)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r5, &(0x7f0000000140)=0x1, 0x8)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000004000/0x4000)=nil, r6, 0x467af21e7e8bde02, 0x11, r5, 0x0)
write$eventfd(r5, &(0x7f0000000180)=0x5, 0xfffffde3)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0xc0e00, 0x2000)

      
      r0 = eventfd2(0xfffffffa, 0x0)
close(r0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8440000429610fbff65521ce16f8f1f447d69835673312654ebb20176c85cd204000000000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000280)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000040)={0x7fff, 0xb, 0x1}})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000180)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000100)={0x9, 0x81}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x1, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x0, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_other={0x0, 0x0, 0x973, &(0x7f0000000100)=0x10001})

      
      syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, 0xfffffffffffffffe, 0x0, 0xfffffffffffffee9)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x2}}, @its_send_cmd={0xaa, 0x28, {0xf, 0x5, 0x0, 0x6, 0x0, 0x2}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x40087602, 0x0)
r3 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r3, 0x4040ae79, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r4 = eventfd2(0x836, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000000)={0x1006, 0x8000000, 0x4, r4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x10001, 0x5, 0x2000, 0x2000, &(0x7f0000ea3000/0x2000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r3, 0xae04)
r5 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r4, 0x100000a, 0x12, r5, 0x100000)
r6 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x6, 0x40a8012, r5, 0x2000)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r2, 0x3000000, 0x12, r6, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x1000007, 0x2012, r5, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x6)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000100)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_fp={0x60400000001010b6, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x80000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000140)=@arm64_sve={0x6080000000150159, &(0x7f00000001c0)=0xe11})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r3, 0x4018aee2, &(0x7f0000000040)=@attr_arm64={0x0, 0x1, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5, 0x3})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x18b400, 0x0)
munmap(&(0x7f0000ffd000/0x3000)=nil, 0x3000)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x20001, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xc0189436, 0x20004000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_other={0x0, 0x1, 0xfc, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x9)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x2, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r3 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_SIGNAL_MASK(r4, 0x4004ae8b, &(0x7f0000000100)={0x8, "e5ccd16738eaa59c"})
r5 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r5, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r5, 0xae80, 0x0)

      
      ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df19, &(0x7f0000000100)=0x3ff})
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000180)=[{0x0, 0x0}], 0x1, 0x0, &(0x7f00000001c0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7})
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000000)={0x100000, 0xeeee0000, 0xff, 0x2, 0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x3, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000001c0)=@attr_other={0x0, 0x6, 0x382bc648, &(0x7f0000000200)=0x5ba})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x4020940d, 0xfffffffffffff4c2)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20203, 0x0)

      
      munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)

      
      munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100014, 0xfffffffffffffffe})
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000d47000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
r3 = eventfd2(0x1, 0x80000)
write$eventfd(r3, &(0x7f0000000000)=0x8000, 0x8)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000db0000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x300, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000880)=@arm64={0xae, 0x5, 0x9, '\x00', 0x6})
ioctl$KVM_GET_VCPU_EVENTS(r2, 0x8040ae9f, &(0x7f00000008c0))

      
      r0 = eventfd2(0x0, 0x0)
write$eventfd(r0, 0xffffffffffffffff, 0x0)

      
      r0 = eventfd2(0xfffffffa, 0x80001)
write$eventfd(r0, &(0x7f0000000200)=0x8, 0x8)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000480)=ANY=[@ANYBLOB="02000000000000000200000002000000000000000000000008000000050000000000000000000000000000000000000000000000000000000200"])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r2, 0x80111500, 0xfffffffffffff000)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_add_vcpu$arm64(r5, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB="820000000000000000000000000000000100000000000000010000000000000001"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r4, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000100)={0x8, <r7=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r7, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
r8 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
r10 = syz_kvm_setup_syzos_vm$arm64(r9, &(0x7f0000c00000/0x400000)=nil)
r11 = syz_kvm_add_vcpu$arm64(r10, &(0x7f0000000080)={0x0, &(0x7f0000000580)=ANY=[@ANYBLOB="320000000000000040000000000000001200008400000000000000000000000080ffffffffffffff080000000000000000000000000000000400000000000000be00000000000000180000000000000030c0"], 0x208}, 0x0, 0x0)
ioctl$KVM_RUN(r11, 0xae80, 0x0)
ioctl$KVM_RUN(r6, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
r12 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r13 = ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r13, &(0x7f0000c00000/0x400000)=nil)
r14 = syz_kvm_vgic_v3_setup(r13, 0x1000001, 0x100)
ioctl$KVM_SET_DEVICE_ATTR(r14, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x5, &(0x7f0000000100)=0x8010000001000001})
ioctl$KVM_SET_DEVICE_ATTR(r7, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x0, 0x8000000, 0x2000, &(0x7f0000ffd000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x0, 0x5000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000dfb000/0x3000)=nil, 0x0, 0x200000c, 0x13, r11, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x5000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x4, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0x1, 0x5000, 0x8, r2, 0x2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000000c0)={0x1000, 0x0, 0x1, r2, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x402, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0x3, 0x3, 0x1}})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000080)=ANY=[@ANYBLOB="320000000000000040000000000000005300008400"], 0x40}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE(r1, 0x4068aea3, &(0x7f0000000180))

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f00008a0000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000e00)=ANY=[], 0x630}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x2, 0x0, &(0x7f0000000000)=0x80})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_ONE_REG(r1, 0x4010aeab, &(0x7f0000000100)=@other={0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_kvm_setup_cpu$arm64(0xffffffffffffffff, 0xffffffffffffffff, &(0x7f0000bff000/0x400000)=nil, &(0x7f0000000000)=[{0x0, 0x0, 0x18}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000380), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_other={0x0, 0x1, 0x304, &(0x7f00000000c0)=0x82})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x5450, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x402, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000080)=ANY=[], 0x40}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000180)=@arm64_core={0x6030000000100030, &(0x7f0000000140)=0x838})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000240)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000280)={0x6, 0x2538}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x804)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      eventfd2(0x0, 0x0)
r0 = eventfd2(0x0, 0x0)
close(r0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee8000, 0x0, r2})
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000180)={0x2, 0x0, 0x2, r3})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0x8, 0x0, 0x0, r3, 0x4})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x3)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_sys={0x603000000013808c, &(0x7f0000000000)=0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2c00, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000700)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x0, 0x0})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x5, 0x0, &(0x7f0000000240)=0x104})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000000200000000000000aa00000000000000280000000000000001"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0), 0xf001)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r5=>0xffffffffffffffff, 0x1})
write$eventfd(r5, &(0x7f00000000c0)=0xffffffffffff8000, 0x8)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x8, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305839, &(0x7f0000000100)=@attr_arm64={0x0, 0x1, 0x100000000000000, &(0x7f0000000180)=0x10001})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x1, 0x40)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
close(r1)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x0)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r5, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r6, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r5, 0x0)
close(r4)
r7 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
write$eventfd(r7, &(0x7f0000000180)=0x5, 0xfffffe09)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r1, 0x2, 0xa0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, 0x0, 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r0, 0x20, 0x0, 0x0, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000100)={0x4})
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000140)={0x4})

      
      ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x3)
r0 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r1 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x8000, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r5, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r5, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013dce0, &(0x7f0000000000)=0x3ff})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r5, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
ioctl$KVM_RUN(r5, 0xae80, 0x0)
r6 = syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000c00000/0x400000)=nil)
r7 = syz_kvm_add_vcpu$arm64(r6, &(0x7f0000000180)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="14000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc1300000030d11b"], 0x60}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r7, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r7, 0xae80, 0x0)
r8 = syz_kvm_add_vcpu$arm64(r0, &(0x7f0000000180)={0x0, &(0x7f0000000380)=ANY=[@ANYBLOB="14000000000000002000000000000000f1c4130000003060008000000000000014000000000000002000000000000000f2c4130000003060008000000000000014000000000000002000000000000000e0dc130000003060c7"], 0x140}, &(0x7f0000000300)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r8, 0x4018aee1, &(0x7f0000000340)=@attr_pmu_init)
ioctl$KVM_RUN(r8, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = syz_kvm_vgic_v3_setup(r1, 0x9, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x5, 0x8, &(0x7f0000000000)=0x7})

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454d1, 0x0)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000000)={0xffffffffffffffff, 0x1, 0xea12157bff932e6})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x40086602, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013c000, 0xfffffffffffffffe})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000200)=@arm64_core={0x603000000010000c, &(0x7f0000000140)=0x2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x28a43, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f00000001c0)=ANY=[@ANYBLOB="0002"])

      
      munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x3)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, r3, 0x3000000, 0x12, r4, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r3, 0x1000007, 0x2012, r4, 0x0)
r5 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r6, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r6, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1ff, 0x0, 0x6000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000140)={0x1fd, 0x2, 0xf000, 0x1000, &(0x7f0000d6a000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfe000/0x400000)=nil)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x1, 0x800)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r2, 0x3, 0x0, r2})
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r3, 0x40fff, 0x0, r3})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000000)={r2, 0x3, 0x1, r2})

      
      r0 = ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x4)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r0, 0x1000004, 0x32e7851d6de9e532, r4, 0x0)
ioctl$KVM_GET_SREGS(r4, 0x8000ae83, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_sys={0x603000000013c090, &(0x7f0000000300)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000180)=@arm64_core={0x603000000010003e, &(0x7f0000000100)=0xc74d})

      
      ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r0 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = openat$kvm(0x0, 0x0, 0x60100, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, 0x0)
ioctl$KVM_GET_ONE_REG(r4, 0x4010aeab, 0x0)
r5 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r5, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r5, 0x4020ae46, &(0x7f0000000000)={0x1fe, 0x3, 0x8080000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
r6 = syz_kvm_add_vcpu$arm64(r0, &(0x7f0000000080)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="aa00000000000000280000000000000003"], 0x28}, 0x0, 0x0)
ioctl$KVM_RUN(r6, 0xae80, 0x0)

      
      r0 = eventfd2(0x101, 0x800)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x2, 0x120)
ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x1)
r3 = syz_kvm_vgic_v3_setup(r2, 0x5, 0x140)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x101900, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x0)
r7 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r6, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r7, 0x20, &(0x7f0000000100)="a55afac482ae9086510a1cfeebb372c746b69b695f50f0fe4a42e0db94adb9afe18edc51d30da60113b8f98bcdfe68bbc48c525a1b3867d3b43108ff914877781493d36fc97b8d2f", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r6, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x8600, 0x408)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r3, 0x4018aee2, &(0x7f0000000080)=@attr_arm64={0x0, 0x7, 0x3, &(0x7f0000000000)=0x100000001})
write$eventfd(r0, &(0x7f0000000080)=0xfffffffffffffff7, 0x8)
r8 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r9 = syz_kvm_add_vcpu$arm64(r8, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x1, 0x100)
ioctl$KVM_RUN(r9, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r9, 0x4018aee1, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x200000c, 0x4069831, 0xffffffffffffffff, 0x1000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000001c0)=@attr_pmu_irq={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ce0000/0x3000)=nil, 0x3000)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@other={0x0, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454d7, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000bfe000/0x400000)=nil, &(0x7f00000008c0)=[{0x0, &(0x7f0000000640)=ANY=[@ANYBLOB="820000000000000028000000000000000200000000000000030000000000000046"], 0x248}], 0x1, 0x0, 0x0, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x120)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000240)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x5, 0x4, 0x0})

      
      r0 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
syz_kvm_setup_syzos_vm$arm64(r0, &(0x7f0000bfd000/0x400000)=nil)
r1 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, 0x0})
mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
r4 = syz_kvm_setup_syzos_vm$arm64(r3, &(0x7f0000c00000/0x400000)=nil)
r5 = syz_kvm_add_vcpu$arm64(r4, &(0x7f0000000080)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB="82000000000000000000000000000000010000000000000001000000000000000173"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r3, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000100)={0x8, <r6=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r6, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r5, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r6, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7})
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000240)={0x0, 0x10000, 0x0, 0x0, 0x8d})

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x2, 0x0)
r2 = openat$kvm(0x0, 0x0, 0x22c00, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x1, 0x1, &(0x7f0000000000)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x3c0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x140)

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = eventfd2(0x1, 0x1)
r1 = openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r2, 0x1, 0x100)
ioctl$KVM_SET_GSI_ROUTING(r2, 0x4008ae6a, &(0x7f0000000240)=ANY=[@ANYBLOB="01000000000000000300000002"])
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f0000000040)={r0, 0x3, 0x2, r0})
write$eventfd(r0, &(0x7f0000000000), 0x8)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x40049409, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x8440, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, 0x930, 0x9, 0x10, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x2, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000002c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_other={0x0, 0x4, 0x0, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0xdc032, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa7fc869d22627e700", 0x0, 0x48)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_REG_LIST(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000040)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, 0xffffffffffffffff, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x68bc0, 0x0)
openat$kvm(0x0, 0x0, 0x457140, 0x0)
openat$kvm(0xffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000c00000/0x400000)=nil)
openat$kvm(0xffffffffffffff9c, 0x0, 0x40000, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ca9000/0x1000)=nil, 0x930, 0x2, 0x2010, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x140)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x100)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f0000000240)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000200)=0x6})

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000180)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_assert_syzos_uexit$arm64(0x0, 0xffffffffffffffff)
syz_kvm_assert_reg(r3, 0x603000000013c4f2, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce2, 0x8000)
syz_kvm_assert_reg(r3, 0x603000000013dce3, 0x8000)
ioctl$KVM_GET_MP_STATE(r3, 0x8004ae98, &(0x7f0000000000))

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x800000000003, 0xeeee0000, 0x2, r2, 0x8})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0xfffffffffffffffc, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_sys={0x603000000013c807, &(0x7f0000000300)=0x37b7})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100020, &(0x7f0000000100)=0x39})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x20)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000000)={0x1, 0x0, [{0xf, 0x2, 0x1, 0x0, @adapter={0x4, 0x582, 0x9, 0x6, 0x8}}]})
close(0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xcd)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x4000, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f00000003c0)={0x2710, 0x2, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil, 0x8})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67301ce16f8f1f449a7a835673312b54ebb2aa8cc869d22627e7000000000000000000001f000e00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x121218, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r4, 0x40305828, 0x10000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0x3, 0x1000, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="1e0000000000000040000000000000000100008600000000fc4ce74f000000007802000000000000f8ffffff020000000000000000"], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0xc0081, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000640)=@attr_arm64={0x0, 0x0, 0x3, &(0x7f00000000c0)=0xde5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x7e)

      
      r0 = eventfd2(0x2, 0x80000)
syz_kvm_setup_syzos_vm$arm64(r0, &(0x7f0000c00000/0x400000)=nil)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x80)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000180)=@attr_set_pmu={0x0, 0x0, 0x3, &(0x7f0000000040)=0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xef000000, 0x1000, 0x2}})
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000140)={0x0, &(0x7f0000000180)=[@smc={0x1e, 0x40, {0xef000000, [0x0, 0x1, 0x2, 0x3, 0x4]}}, @hvc={0x32, 0x40, {0xef000000, [0x0, 0x1, 0x2, 0x3, 0x4]}}], 0x80}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r5, 0xffffffffffffffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x1c1040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43033, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000f48000/0x3000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a13f2, 0x0)
ioctl$KVM_CHECK_EXTENSION(r4, 0x541b, 0x6b2)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013c020, &(0x7f0000000140)=0x7})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000780), 0x1, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x10)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_extra={0x603000000013df19, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000f33000/0x2000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
munmap(&(0x7f0000ed8000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000380)=ANY=[], 0x28}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x320)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000080)=@attr_arm64={0x0, 0x8, 0x4, &(0x7f0000000040)=0xffffffffffffff80})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x80005ff, 0x1)
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r3, 0x27, 0x2, r3})
r4 = eventfd2(0x0, 0x0)
r5 = eventfd2(0xffff, 0x80801)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r4, 0x40fff, 0x2, r5})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000001c0)={r2, 0x6d, 0x0, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@smc={0x1e, 0x40, {0x80000000, [0x0, 0x2, 0x2, 0x4, 0x4]}}], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0xffffff7f, 0xff25)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_GET_API_VERSION(r0, 0x2, 0x1000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r5 = eventfd2(0x0, 0x0)
close(r5)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000004000/0x4000)=nil, r6, 0x467af21e7e8bde02, 0x11, r5, 0x0)
write$eventfd(r5, &(0x7f0000000180)=0x5, 0xfffffde3)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0xc0e00, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_irq={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe7)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="cfd589f46c597945b5a5074263bcb4f10e2f9419d690ac1c53be4bec8b529135783816c48a673916fc7d6ec77f0eae1c5f5140880eedf99393eb1764158dd0b178b76c5c6162b6a7", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)

      
      openat$kvm(0x0, &(0x7f0000000040), 0x4080, 0x0)
mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000001, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f00000001c0), 0xc40, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5})
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f00000001c0)={0x8})
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, &(0x7f00000001c0)=ANY=[])
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_sys={0x603000000013dcf3, &(0x7f0000000000)=0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8440000429610fbff65521ce16f8f1f447d69835673312654ebb20176c85cd204000000000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r4, 0x0)
close(r1)
openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, 0xffffffffffffffff, 0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000040)=@arm64_core={0x6030000000100042, &(0x7f0000000000)=0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000200)=@attr_other={0x0, 0x8, 0x3d74, 0x0})

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x2, 0x23ac5f9b426ec4b1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_other={0x0, 0x0, 0x4, &(0x7f00000000c0)=0x1a1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x14})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_sys={0x603000000013c000, &(0x7f0000000100)=0x100000000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x5460, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfe000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000e00)=ANY=[], 0x630}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x2, 0x0, &(0x7f0000000000)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x30282, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0x1}})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52456012ab8ba1286bf6cd81002000d300447c7a837fc869cba6cd30f0050003000000d0020000ffffff000000f86636544e44c404000000006abf47d900", 0x0, 0x48)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454cb, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x401c5820, 0xffff98600fff)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000280)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000040)={0x7fff, 0xb, 0x1}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df19, &(0x7f0000000100)=0x1})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_extra={0x603000000013c02b, &(0x7f0000000000)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000080)=[{0x0, &(0x7f00000000c0)=[@irq_setup={0x46, 0x18, {0x1, 0x20}}], 0x18}], 0x1, 0x0, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000100)={0x1000020, 0x1})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
r0 = mmap$KVM_VCPU(&(0x7f00006b5000/0x2000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
munmap(&(0x7f0000ffe000/0x1000)=nil, 0x1000)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, r2, 0x300000f, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x8, 0x88, &(0x7f00000001c0)=0xe5c5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x200000e, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_fw={0x6030000000140001, &(0x7f00000000c0)=0x5})

      
      ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, &(0x7f0000000200)={0x5})
r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f00000002c0)=ANY=[@ANYBLOB="46000000000000001800000000000000010000002000000082000000000000002800000000000000010000000000000001000000000000000100000000000000aa000000000000002800000000000000030000000000000000fdff"], 0x68}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xa})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f00000000c0)={0x4, 0xbc}})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x3000000, 0x4f831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x8, 0x5c1fd1b6565d2f2, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_arm64={0x0, 0x4, 0x1, 0x0}) (rerun: 64)
mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x200000e, 0x30d2a4fbfbea96b8, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r6 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_other={0x0, 0x1, 0x180, &(0x7f00000000c0)=0x80})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xc00000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000080)={0x5, 0xffffffffffffffff, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df02, &(0x7f0000000100)=0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000040)=@arm64_sve_vls={0x606000000015ffff, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000000)={0x401, 0x5000, 0x8, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0x1000, 0x5000, 0x8, r2, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b8540000429610fbff67521ce16f8f1f447d69835673312b54ebb20176c869d22627e700000000000000000000000000000900", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r4, 0x40305839, 0x10000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r5, 0x3, 0x11, r3, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
mmap$KVM_VCPU(&(0x7f000000a000/0x1000)=nil, r5, 0x3, 0x11, r4, 0x0)
r6 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
syz_kvm_setup_syzos_vm$arm64(r8, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r8, 0x4020ae46, &(0x7f0000000000)={0x1fe, 0x0, 0x8080000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
r9 = syz_kvm_add_vcpu$arm64(r6, &(0x7f0000000080)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="aa00000000000000280000000000000003"], 0x28}, 0x0, 0x0)
ioctl$KVM_RUN(r9, 0xae80, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, 0x0, 0xe00)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305828, &(0x7f0000000100)=@attr_other={0x0, 0xb, 0x9f01, &(0x7f0000000180)=0xfffffffffffffffc})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x66)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x2})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)
write$eventfd(r2, &(0x7f00000001c0)=0x3, 0x10)

      
      munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x0, 0x12eeff265b2ad0b8, 0xffffffffffffffff, 0x1000000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000f9f000/0x1000)=nil, 0x930, 0x100000f, 0x50011, r2, 0x0)

      
      r0 = eventfd2(0x0, 0x80000)
write$eventfd(r0, 0x0, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000080)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r3, 0x4018aee3, &(0x7f00000000c0)=@attr_other={0x0, 0x5f7, 0x7, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c17000/0x3000)=nil, 0x930, 0x19, 0x8032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x202, 0x1, 0x1000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x0, 0x0, 0xd000, 0x1000, &(0x7f0000ffd000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f00002cd000/0x400000)=nil, &(0x7f0000000100)=[{0x0, 0x0}], 0x1, 0x0, &(0x7f0000000180)=[@featur2={0x1, 0x24}], 0x1)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f00000001c0)={0x1, 0x1, 0x4, 0x1000, &(0x7f000054a000/0x1000)=nil, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f00000000c0)={0xd000})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="76b92cfb97422a99b188adac74647aa1221e4d8e6da62d5f533e7f6120be5a845d77658c900fa608d72c085a1f4e5203df5e7728260b7ab522076295a9cbeeae01832398e92fc7bc", 0x0, 0x48)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0xc0800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000a75000/0x400000)=nil, 0x0, 0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000880)=@attr_arm64={0x0, 0x2, 0x0, &(0x7f0000000840)=0x3})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_sys={0x603000000013dce0, &(0x7f0000000140)=0xfffffffffffffffa})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_ONE_REG(0xffffffffffffffff, 0x4010aeac, &(0x7f0000000100)=@arm64_ccsidr={0x6020000000110004, 0x0})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000080)={0x4000, 0x10a000})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
r3 = openat$kvm(0x0, 0x0, 0x800, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x400454d8, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x6, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r7 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
r8 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r7, 0x100000a, 0x12, r8, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, r7, 0x1, 0x11, r8, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, r3, 0x100000f, 0x12, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r3, 0x0, 0x2012, r9, 0x0)
mmap$KVM_VCPU(&(0x7f0000ead000/0x3000)=nil, r3, 0x2800007, 0x11, r9, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000140)={0x5, 0x8}) (async, rerun: 32)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305839, &(0x7f0000000100)=@attr_other={0x0, 0xb, 0x9f01, &(0x7f0000000180)=0xfffffffffffffffc})

      
      r0 = eventfd2(0x1, 0x1)
r1 = openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r5 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r5, 0xc00caee0, &(0x7f00000000c0)={0x4, <r6=>0xffffffffffffffff})
ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r8, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r9, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r9, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013df19, &(0x7f0000000100)=0x3ff})
syz_kvm_setup_cpu$arm64(r8, r9, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000180)=[{0x0, 0x0}], 0x1, 0x0, &(0x7f00000001c0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r9, 0xae80, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r6, 0x4018aee2, 0x0)
syz_kvm_vgic_v3_setup(r2, 0x1, 0x100)
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f00000000c0)={r0, 0x4, 0x0, r0})
r10 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r11 = ioctl$KVM_CREATE_VM(r10, 0xae01, 0x0)
r12 = ioctl$KVM_CREATE_VCPU(r11, 0xae41, 0x1)
ioctl$KVM_SET_SREGS(r12, 0x4000ae84, &(0x7f00000004c0)={{0x0, 0x100000, 0x8, 0x3, 0x2, 0xb, 0x4, 0xff, 0x81, 0x4, 0x2, 0x7}, {0x4000, 0x10000, 0x3, 0xf, 0x5, 0x1, 0x3, 0x4, 0xb7, 0x3, 0x7, 0x8}, {0x1000, 0x2, 0x3, 0x44, 0x5, 0x9, 0x8, 0xfd, 0x0, 0xb0, 0x57}, {0x1000, 0xeeee0000, 0x3, 0x8, 0x1, 0x9, 0xff, 0x4, 0x3, 0x4, 0x18}, {0x2000, 0x4, 0x4, 0x7e, 0x5, 0x1, 0x6, 0x1, 0x2, 0x6, 0x9f, 0x7}, {0x8080000, 0xf7ff1000, 0x10, 0x1, 0x5, 0x41, 0x0, 0x9, 0x0, 0x7, 0x9a, 0x92}, {0xdddd0000, 0xeeef0000, 0x9, 0x2, 0x9, 0x4, 0x9, 0x9, 0x6, 0x5, 0x3, 0xc3}, {0x1000, 0x1000, 0x3, 0x4, 0x2, 0x4, 0x2, 0x4, 0x7b, 0xff, 0x6d, 0xe}, {0x4, 0x7}, {0xdddd1000, 0x8}, 0x8, 0x0, 0x8000000, 0x4, 0x3, 0x4000, 0x2, [0x3fa, 0x0, 0x3, 0x8]})
ioctl$KVM_SET_GSI_ROUTING(r2, 0x4008ae6a, &(0x7f0000000240)=ANY=[@ANYBLOB="01000000000000000300000002"])
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_IRQFD(r2, 0x4020ae76, &(0x7f0000000000)={r0, 0x2, 0x2, r0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000080)={0x10000, 0x10f000})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f00000000c0)={0x6000, 0x107000})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="76b92cfb97422a99b188adac74647aa1221e4d8e6da62d5f533e7f6120be5a845d77658c900fa608d72c085a1f4e5203df5e7728260b7ab522076295a9cbeeae01832398e92fc7bc", 0x0, 0x48)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, &(0x7f00000000c0)={0x3, 0x0, 0x2, 0xffffffffffffffff, 0xb})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB='F'], 0x40}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
r4 = eventfd2(0xc0, 0x80000)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000000)={0x3, 0x2, 0x0, r4})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x40049409, 0x0)

      
      munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000eb0000/0x3000)=nil, 0x3000)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
munmap(&(0x7f00007df000/0x1000)=nil, 0x1000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000080)={0x8, <r3=>0xffffffffffffffff})
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x3000)=nil, r4, 0x3000003, 0x12, r3, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000100)={0x2010040, 0x1000c53})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8030aeb4, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_arm64={0x0, 0x3, 0x1, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r3, 0x1000007, 0x2012, r4, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000200)={0x1fe, 0x2, 0x0, 0x2000, &(0x7f0000ffd000/0x2000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000340)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, 0x0)
ioctl$KVM_SET_GUEST_DEBUG(r2, 0x4208ae9b, &(0x7f0000000180)={0x8002, 0x0, [0x8, 0x8000, 0x400, 0x1000000000, 0x2, 0x7e8, 0x6, 0xffffffffffffffff]})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="fb0149dd033be3ac2cc4a29ea6ab10fbff67301ce16f8f1f449a7a836a73312b54ebb2aa8cc876d226275c000000000000002003001f004e00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x2c00, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x7, <r4=>0xffffffffffffffff, 0x1})
r5 = ioctl$KVM_CREATE_VM(r4, 0x894c, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r5, 0xb701, 0x0)
ioctl$KVM_CREATE_VCPU(r6, 0xb704, 0x20000002)
r7 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r7, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r7, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100016, &(0x7f0000000040)=0x8})
r8 = openat$kvm(0x0, &(0x7f0000000080), 0x101000, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
r10 = syz_kvm_setup_syzos_vm$arm64(r9, &(0x7f0000b60000/0x400000)=nil)
r11 = syz_kvm_add_vcpu$arm64(r10, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="3200000000000000400000000000000050000084"], 0x80}, 0x0, 0x0)
ioctl$KVM_RUN(r11, 0xae80, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454c8, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x40, &(0x7f0000000080)=0x800})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000180)=ANY=[@ANYBLOB='n\x00\x00\x00\x00\x00\x00\x000\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\b\x00\x00\x00\x00\x00i'], 0x30}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0xfffffbffffffffff, 0x240)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52456012ab8ba1286bf6cd81002000d300447c7a837fc869cba6cd30f0050003000000d0020000ffffff000000f86636544e44c404000000006abf47d900", 0x0, 0x48)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="010000000100000000000000080009"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000e31000/0x2000)=nil, r3, 0x3000011, 0x2012, r2, 0x0)
close(0x5)
close(0x4)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
close(0x5)
close(r3)
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000380)=ANY=[@ANYBLOB="820000"], 0x28}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r5=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r5, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000200)={0x8090040, 0x0, 0x0, 0x1, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_other={0x0, 0x1, 0xc, &(0x7f00000000c0)=0x80})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000200)={0x7})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x5000, 0x10000, 0x4, 0xffffffffffffffff, 0x20})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
syz_kvm_setup_cpu$arm64(r1, r3, &(0x7f0000b4a000/0x400000)=nil, &(0x7f0000000000)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, &(0x7f0000000040)={0x0, 0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_MP_STATE(r2, 0x4004ae99, &(0x7f0000000000)=0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x6)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000100)={0x5})
r3 = syz_kvm_vgic_v3_setup(r1, 0x2, 0x40)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x5, 0x1, 0x0})

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x4000)=nil, 0x930, 0x4, 0x4f833, 0xffffffffffffffff, 0x0)

      
      ioctl$KVM_SET_ONE_REG(0xffffffffffffffff, 0x4010aeac, &(0x7f0000000000)=@arm64_fp_extra={0x60200000001000d4, 0x0})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000140)={0x8, 0x0, 0x0, 0xffffffffffffffff, 0x20})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f000075f000/0x2000)=nil, 0x930, 0x80000e, 0x12, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)

      
      r0 = openat$kvm(0x0, 0x0, 0x40000, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52456012ab8ba1286bf6cd8100cd30f00515f86636544e44c404000000006abf47d90000000000000000000000000000000000000000fff900", 0x0, 0xfffffffffffffe49)
r1 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
write$eventfd(0xffffffffffffffff, 0x0, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x2})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
write$eventfd(r4, &(0x7f00000001c0)=0x3, 0x10)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x80, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000080)=@attr_arm64={0x0, 0x0, 0x3, &(0x7f00000000c0)=0xffffffffffffffbe})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000180)="fb0149dd033be3073da85cac1648f1e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76d869d2855c7f3200", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x80086601, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="1e00000000000000400000000000000004000084806d21c355a0bea6"], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_bitmap={0x6030000000162000, &(0x7f00000000c0)=0x8906})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(0xffffffffffffffff, 0x4018aee1, &(0x7f00000002c0)=@attr_pmu_init)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_add_vcpu$arm64(r5, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@its_setup={0x82, 0x28, {0x1, 0x2001, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r4, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000100)={0x8, <r7=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r7, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, 0x0})
ioctl$KVM_RUN(r6, 0xae80, 0x0)
r8 = openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
r10 = syz_kvm_setup_syzos_vm$arm64(r9, &(0x7f0000c00000/0x400000)=nil)
r11 = syz_kvm_add_vcpu$arm64(r10, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[], 0xe0}, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_REG_LIST(r11, 0xc008aeb0, &(0x7f0000000000))
ioctl$KVM_SIGNAL_MSI(r4, 0x4020aea5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
close(0x4)
close(0x5)

      
      mmap$KVM_VCPU(&(0x7f0000daf000/0x3000)=nil, 0x930, 0x3000007, 0x8a031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, 0x930, 0x5, 0x2012, r2, 0x0)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_INTERRUPT(r2, 0x4004ae86, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="76b92cfb97422a99b188adac74647aa1221e4d8e6da62d5f533e7f6120be5a845d77658c900fa608d72c085a1f4e5203df5e7728260b7ab522076295a9cbeeae01832398e92fc7bc", 0x0, 0x48)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_fp={0x6040000000100060, &(0x7f0000000300)=0x8000000000000000})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000380)={0x10200, 0x0, 0xdddd1000, 0x1000, &(0x7f0000ffe000/0x1000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000300)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f00000002c0)=0x64})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000080)=0x64})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x400000a, 0x12, 0xffffffffffffffff, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x101001, 0x0)
ioctl$KVM_GET_DIRTY_LOG(0xffffffffffffffff, 0x4010ae42, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x801c581f, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x402, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000080)=ANY=[], 0x40}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000180)=@arm64_core={0x6030000000100038, &(0x7f0000000080)=0x838})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_ASSIGN_SET_MSIX_NR(r1, 0x4008ae73, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_SET_MP_STATE(r2, 0x4004ae99, 0xfffffffffffffffe)

      
      openat$kvm(0xffffffffffffff9c, 0x0, 0xc0c0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x8440, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000002c0)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r3, 0x4018aee3, &(0x7f0000000340)=@attr_arm64={0x0, 0x6, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xd7)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013808c, &(0x7f00000000c0)=0x274})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_ONE_REG(r2, 0x8000ae8c, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8040aeb6, &(0x7f00000004c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000480)={0x0, 0x20}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000040)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x0, 0x3, 0x0})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1) (async)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x8a031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r0, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f00000000c0)={0x1fd, 0x0, 0xdddda000, 0x2000, &(0x7f0000ffc000/0x2000)=nil, 0x6})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x2, 0x8032, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000e31000/0x2000)=nil, 0x930, 0x5, 0x2012, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000f31000/0x3000)=nil, 0x930, 0x100000a, 0x213011, r2, 0x0)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)
mmap$KVM_VCPU(&(0x7f0000000000/0x4000)=nil, 0x930, 0x4, 0x4f833, 0xffffffffffffffff, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454e0, 0x0)

      
      openat$kvm(0x0, 0x0, 0x457140, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xb})
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000140)={0x5, 0xa})

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0xcb3993e4c7433bb8, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x102, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
ioctl$KVM_CAP_ARM_USER_IRQ(r1, 0x4068aea3, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
close(0x3)
eventfd2(0x0, 0x80000)
ioctl$KVM_CHECK_EXTENSION(r0, 0x541b, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000180)=[@msr={0x14, 0x20, {0x6030000000138084, 0x101}}], 0x20}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x0, 0x6})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xe1, 0x0, 0x2000})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)

      
      ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(0xffffffffffffffff, 0x4068aea3, &(0x7f0000000000)={0xa8, 0x0, 0x1})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x40086602, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xef)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@mrs={0xbe, 0x18, {0x6030000000138010}}], 0x18}, 0x0, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r5, 0xffffffffffffffff)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1d})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000600)=@arm64_core={0x6030000000100000, &(0x7f00000001c0)=0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
write$eventfd(0xffffffffffffffff, &(0x7f0000000000)=0xe9c5, 0x8)
ioctl$KVM_CREATE_VM(r0, 0x40086602, 0x20000000)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x2, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454dd, 0x0)

      
      openat$kvm(0x0, 0xfffffffffffffffe, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x3, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r3 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x51)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000340), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xc0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x2400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000040)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000000)={0x7, 0x4, 0x2}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000300)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000004000000000000000200000000000000aa000000000000002800000000000000040100000000010000000000000002"], 0x61}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, r2, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8030aeb4, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_fp={0x60400000001000d4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000873000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, &(0x7f00000000c0)=ANY=[@ANYRES64=r2, @ANYRES8=r2], 0xfffffd06}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x101000, 0x0)
eventfd2(0x7, 0x100001)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xc6)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x1, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x3, 0x10000000000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000140)={0x4100001d, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x5, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_arm64={0x0, 0x5, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x24)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x0, 0x200000e, 0x4000030, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, 0x0, 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f0000000140)={0x4, <r3=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x400454ce, 0x0)
r4 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r5 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x2)
syz_kvm_add_vcpu$arm64(r6, 0x0, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(r5, 0x0, 0x0, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_other={0x0, 0x0, 0x4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = syz_kvm_vgic_v3_setup(r1, 0x9, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x100)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_arm64={0x0, 0x5, 0x4, &(0x7f0000000080)})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x80020009, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x10)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000280)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000040)={0x7fff, 0x200b, 0x1}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000500), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x7e)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xb2)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x15)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffde3)
close(r4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x88000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000000)={r1, 0x29c, 0x3})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x54e3, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000140)={0x4, <r5=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r5, 0x894c, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000d36000/0x4000)=nil, 0x930, 0x100000c, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000200)=@arm64_sys={0x603000000013c10a, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x20)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0xa)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x1f})
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c00a, &(0x7f0000000040)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0xb704, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000080)=@arm64_core={0x6030000000100046, &(0x7f0000000040)=0x4})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x57)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x4})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fw={0x6030000000140000, &(0x7f00000001c0)=0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x88800, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae01, 0x200440000004)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000300)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f00000002c0)=0x64})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000080)=0x31})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000200)=@arm64_sys={0x603000000013804c, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_fw={0x6030000000140000, &(0x7f0000000080)=0x7fffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000240)=ANY=[@ANYBLOB="6e00000000000000300000000000000000000008000000000008000000000000090000000000000001"], 0x30}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000080)=@attr_arm64={0x0, 0x8, 0x0, &(0x7f0000000240)=0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000100)={0x1, 0x0, [{0x0, 0x4, 0x1, 0x0, @sint={0xffff, 0x1}}]})

      
      r0 = mmap$KVM_VCPU(&(0x7f00006b5000/0x2000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, r2, 0x300000f, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x4d)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000bff000/0x400000)=nil, &(0x7f0000000000)=[{0x0, &(0x7f00000000c0)=[@mrs={0xbe, 0x18, {0x603000000013c522}}], 0x18}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1fe, 0x0, 0x8080000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee0000, 0x2, 0xffffffffffffffff, 0x8})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1, 0x1, 0x5000, 0x1000, &(0x7f0000fa2000/0x1000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_core={0x6030000000100042, &(0x7f0000000000)=0xffffffffffff692c})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000f71000/0x6000)=nil, 0x6000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x800000000000)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, &(0x7f0000000400)={0xb6, 0x0, 0x3000000000000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeb5, &(0x7f00000000c0)={0x8})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x5452, 0x2000fdfd)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xcd)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
ioctl$KVM_CREATE_VM(r1, 0x5761, 0x10000000000000)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000800000/0x800000)=nil, 0x800000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43033, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x8, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40186366, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454d4, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
close(r2)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x300)
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_ONE_REG(0xffffffffffffffff, 0x4010aeac, &(0x7f0000000000)=@arm64_sys={0x603000000013c807, &(0x7f00000000c0)})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@mrs={0xbe, 0x18, {0x603000000013dff6}}], 0x18}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000080)=0x7fffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x7fffffff})

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2000002, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x29031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000ad4000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      mmap$KVM_VCPU(&(0x7f0000daf000/0x3000)=nil, 0x930, 0x2000009, 0x8a031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="140000000000000020000000000000005dc613000020306005fd0747c17f"], 0x20}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x40)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x313040, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ffa000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e87000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x2)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x20)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r7 = ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r7, 0xae41, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
close(0x5)
close(0x4)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)
r8 = eventfd2(0x1, 0x1)
openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x1, 0x100)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f00000000c0)={r8, 0x4, 0x0, r8})
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4008ae6a, &(0x7f0000000240)=ANY=[])
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000040)={r8, 0x3, 0x2, r8})
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000100)="547a80816119d6f740eba70939b4dd3c67cc8ef30267b6e351ec92609ea1772af89374b2c24ae764125ca82e671b267d8980f7f7061632c7b88459ab6c0154d4086903dedbfdd6fb", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r4, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a13f2, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
ioctl$KVM_CREATE_IRQCHIP(r1, 0xae60)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454dc, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x843, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x77)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x600000)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x80000, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5, 0x11})
munmap(&(0x7f0000db1000/0x1000)=nil, 0x1000)
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, &(0x7f0000000040)=0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x2, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, &(0x7f0000000680)=ANY=[@ANYBLOB="4bdaef"])
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)

      
      ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1ff, 0x3, 0xffff1000, 0x1000, &(0x7f0000000000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000d93000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x0, 0x3000000, 0x12, 0xffffffffffffffff, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x6)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x603000000010003c, &(0x7f0000000100)=0x1b})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x57)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x1, 0x5000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5})
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f00000001c0)={0x8})
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, &(0x7f00000001c0)=ANY=[])
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee0000, 0x2, r2, 0x8})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee3, &(0x7f00000001c0)=@attr_arm64={0x0, 0x6, 0x3, &(0x7f0000000100)=0x5})
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0x28)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r5, 0x1, 0x140)
ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r7 = ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
r8 = ioctl$KVM_CREATE_VCPU(r7, 0xae41, 0x1)
r9 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r10, 0xae41, 0x1)
r11 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r1, 0xae04)
mmap$KVM_VCPU(&(0x7f0000f5d000/0x2000)=nil, r11, 0x2, 0x12, r8, 0x0)
ioctl$KVM_IRQ_LINE(r10, 0x4008ae61, &(0x7f0000000080)={0x80020007, 0x11})
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, &(0x7f00000000c0)={0x1ff, 0x1, 0x100000, 0x2000, &(0x7f0000f5c000/0x2000)=nil})
r12 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r13 = ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r13, 0xc00caee0, &(0x7f0000000140)={0x4, <r14=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r14, 0x400454e2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000002, 0x11, r4, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r3, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x0, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r5 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r5, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_RUN(r5, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000b00), 0x1401, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000b52000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000001580)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_setup_cpu$arm64(0xffffffffffffffff, r3, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000001840)=[{0x0, 0x0}], 0x1, 0x0, &(0x7f0000001880)=[@featur1={0x1, 0x39}], 0x1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f000000e000/0x1000)=nil, 0x930, 0x1000001, 0x4013, r2, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x400454de, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f0000000080)=@attr_set_pmu={0x0, 0x0, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_IRQ_LINE_STATUS(r1, 0xc008ae67, &(0x7f0000000040)={0x0, 0x9})
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000080)={0x0, 0x6})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
close(0x4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x2a60, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0xb701, 0x0)
close(0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@msr={0x14, 0x20, {0x603000000013df40, 0x8000}}], 0x20}, &(0x7f0000000100)=[@featur1={0x1, 0x8}], 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000140)=@attr_pmu_init)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x0, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r4, 0xffffffffffffffff)
syz_kvm_assert_reg(r3, 0x603000000013df40, 0x8000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x2, 0x0, 0x0, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000001a40)={0xfffffffffffffffe, 0x0, 0x0, r2, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x3000)=nil, 0x930, 0x3000007, 0x2012, r2, 0x2000000)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2c00, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x7, <r2=>0xffffffffffffffff, 0x1})
r3 = ioctl$KVM_CREATE_VM(r2, 0x894c, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xb701, 0x0)
ioctl$KVM_CREATE_VCPU(r4, 0xb704, 0x20000002)

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_bitmap={0x6030000000160001, &(0x7f0000000000)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x9, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x100)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_arm64={0x0, 0x3, 0x3, &(0x7f0000000080)=0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_other={0x0, 0x8, 0x8, &(0x7f00000002c0)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(0xffffffffffffffff, 0x4018aee1, &(0x7f00000002c0)=@attr_pmu_init)
openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = syz_kvm_setup_syzos_vm$arm64(r3, &(0x7f0000c00000/0x400000)=nil)
r5 = syz_kvm_add_vcpu$arm64(r4, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@its_setup={0x82, 0x28, {0x1, 0x2001, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r3, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000100)={0x8, <r6=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r6, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, 0x0})
ioctl$KVM_RUN(r5, 0xae80, 0x0)
r7 = openat$kvm(0x0, &(0x7f0000000100), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
r9 = syz_kvm_setup_syzos_vm$arm64(r8, &(0x7f0000c00000/0x400000)=nil)
r10 = syz_kvm_add_vcpu$arm64(r9, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[], 0xe0}, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_REG_LIST(r10, 0xc008aeb0, &(0x7f0000000000))
ioctl$KVM_SIGNAL_MSI(r3, 0x4020aea5, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x128, &(0x7f0000000340)=0x8000000000000000})
close(0x4)
close(0x5)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1ff, 0x1, 0x100000, 0x2000, &(0x7f0000f5c000/0x2000)=nil})
ioctl$KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE(r1, 0x4068aea3, &(0x7f0000000140)={0xe4, 0x0, 0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
close(r1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r1, 0x2, 0xa0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000000)={0xffffffffffffffff, 0x1, 0xea12157bff932e6})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0xc0189436, &(0x7f0000000040)={0xbffffffffffffffd})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x2, &(0x7f0000000080)=0xe})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5, 0x11})
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, &(0x7f0000000180)=0x4)
openat$kvm(0xffffffffffffff9c, 0x0, 0x100, 0x0)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, &(0x7f0000000280)=ANY=[@ANYRES16, @ANYBLOB])

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f932, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0x8000000, 0x104000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0xffff1000, 0x8000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000100)={0x80a0000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x80, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x200000b, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x6, 0x3, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x30)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeb5, &(0x7f0000000180)={0x8, 0x7ff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, r1, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r1, 0x0, 0x23ac5f9b426ec4b1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x100000a, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x2)
r5 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r5, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r4, 0x4018aee3, &(0x7f00000001c0)=@attr_pvtime_ipa={0x0, 0x2, 0x0, 0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeb5, &(0x7f0000000180)={0x4f, 0x400000000000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000100)="fb010401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002d0000000000000000000000000000080000fa00", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x6, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x5, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f00000001c0)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000)=0x8001, 0x8)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ff9000/0x4000)=nil, r5, 0x2000004, 0x2812, r4, 0x0)
write$eventfd(r4, &(0x7f0000000080)=0x8, 0x8)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f000000e000/0x3000)=nil, r1, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r3, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r4, 0x100000c, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000140)=@arm64_core={0x603000000010002c, &(0x7f0000000100)=0x7})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x400, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000000)=@x86={0x6, 0xef, 0x5, 0x0, 0x3, 0x5, 0x6, 0xd, 0x40, 0x3, 0x4, 0xa, 0x0, 0x5, 0x6, 0x0, 0x0, 0x2b, 0x6, '\x00', 0x0, 0x3})
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, &(0x7f0000000040)=@attr_arm64={0x0, 0x0, 0x3})
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000100)={0x2010040, 0x1000c53})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f00000001c0), 0xc40, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5})
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f00000001c0)={0x8})
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, &(0x7f00000001c0)=ANY=[])
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000700)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x0, 0x0})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000340)=@attr_other={0x0, 0x1, 0x6488, &(0x7f0000000080)})

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x4, 0x40a8012, 0xffffffffffffffff, 0x2000)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x8})

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f0000000b80)={0x0, &(0x7f0000000040)=[@smc={0x1e, 0x40, {0x84000001, [0x99a, 0xb, 0xaca, 0x101, 0x1]}}], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000b80)={0x0, &(0x7f0000000040)=ANY=[@ANYBLOB="1e0000000000000040000000000000000400"], 0x40}, &(0x7f0000000bc0)=[@featur1={0x1, 0x4}], 0x1)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x400, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = syz_kvm_setup_syzos_vm$arm64(r5, &(0x7f0000c00000/0x400000)=nil)
r7 = syz_kvm_add_vcpu$arm64(r6, &(0x7f0000000080)={0x0, &(0x7f0000000000)=[@mrs={0xbe, 0x18, {0x603000000013808c}}], 0x18}, 0x0, 0x0)
ioctl$KVM_RUN(r7, 0xae80, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r3, 0xae03, 0xb2)
r8 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r9 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r8, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r9, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r8, 0x0)
r10 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r8, 0x4020aeae, &(0x7f0000000140)={0x4})
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, &(0x7f0000000000)=@x86={0x6, 0x6f, 0x5, 0x0, 0x3, 0x5, 0x6, 0xd, 0x40, 0x3, 0x4, 0xa, 0x0, 0x5, 0x6, 0x0, 0x0, 0x2b, 0x6, '\x00', 0x0, 0x3})
ioctl$KVM_CREATE_VM(r10, 0x401c5820, 0x20000000)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r11 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r12 = ioctl$KVM_CREATE_VM(r11, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
eventfd2(0x45, 0x800)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
r13 = ioctl$KVM_CREATE_VCPU(r12, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r13, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r13, 0x4010aeac, &(0x7f0000000100)=@arm64_core={0x60300000001000d7, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x8000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000bfe000/0x400000)=nil, &(0x7f0000001080)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, 0xffffffffffffffff)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfd000/0x400000)=nil)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f00000001c0)="fb4149dd033be3ac2cc4a22332a77b23b08986814d7bb14c94a6ab8031d1dfd92f00000000010000005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa7fc869d22627e7", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000009, 0x11, r2, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x5bdb03, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x6000007, 0x4f833, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r6, 0x1, 0x11, r4, 0x0)
r7 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_MP_STATE(r7, 0x4004ae99, &(0x7f0000000080)=0x5)
ioctl$KVM_SET_VCPU_EVENTS(r7, 0x4040aea0, &(0x7f0000000040)=@x86={0x2b, 0x8, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x4, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0xffff8001, 0x1, 0x0, 0x1, 0x1, '\x00', 0xe, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, 0xfffffffffffffffe)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, 0x930, 0x0, 0x4003831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x3000)=nil, 0x930, 0x3000007, 0x2012, r2, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x2000007, 0x2012, r2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, r1, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_fw={0x6030000000140001, &(0x7f0000000100)=0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = syz_kvm_vgic_v3_setup(r1, 0x2, 0xa0)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x1, 0x400, &(0x7f0000000080)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, r3, 0x300000e, 0x13, r2, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000100)="7f61dfd678c763aea4eb000000000000000809911f398c5defc24101c0bdddd2285bebac6170497931f113870311ab48e78261db3600", 0x0, 0x48)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
syz_kvm_setup_cpu$arm64(r3, r4, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000080)=[{0x600000000000000, 0x0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r5, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r5, 0x4010aeab, &(0x7f0000000080)=@arm64_sys={0x603000000013c006, 0xffffffffffffffff})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000000c0)={0x3, 0x0, 0x2, r2, 0xb})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0x0, 0x2, r2, 0xf})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r6 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x18})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x0, 0x100000000, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000380)=[@its_setup={0x82, 0x28, {0x0, 0x1, 0x308}}, @its_send_cmd={0xaa, 0x28, {0xf, 0x8, 0x3, 0xb, 0xc80c, 0x0, 0x2}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5, 0x2})
ioctl$KVM_SET_GUEST_DEBUG(r2, 0x4208ae9b, &(0x7f0000000000)={0x10003, 0x0, [0x7ff, 0x80, 0x4, 0x2, 0x8, 0x5, 0xfffffffffffffff9, 0x3]})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f00001e3000/0x400000)=nil, &(0x7f0000000280)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5, 0x11})
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, &(0x7f0000000040)=0x4)
ioctl$KVM_CHECK_EXTENSION_VM(0xffffffffffffffff, 0xae03, 0xaa)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000080)=@arm64_sve_vls={0x606000000015ffff, &(0x7f0000000000)=0x81})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x120)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000240)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f0000000240)=ANY=[@ANYBLOB="01000000000000000000000000000002"])
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(0xffffffffffffffff, 0xae04)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f00000000c0)=@arm64={0x27, 0xf4, 0x2, '\x00', 0x8000000000000000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013c03f, &(0x7f00000000c0)=0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000040)=@arm64_sys={0x603000000013e090, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x3, 0x220)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000100)="e739e84ace16da742404c008d1c444752f9cf53101c23680", 0x0, 0x18)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x8, 0x5c1fd1b6565d2f2, 0xffffffffffffffff, 0x0)
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x801c581f, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, 0xfffffffffffffffe})

      
      openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2c00, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x7, <r2=>0xffffffffffffffff, 0x1})
r3 = ioctl$KVM_CREATE_VM(r2, 0x894c, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xb701, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r7 = ioctl$KVM_CREATE_VCPU(r6, 0xae41, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r7, 0x0)
openat$kvm(0xffffff9c, 0x0, 0x1a17f2, 0x0)
r8 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r8, 0x40305839, 0x10000000000000)
ioctl$KVM_CREATE_VCPU(r4, 0xb704, 0x20000002)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2000002, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
ioctl$KVM_CREATE_VM(r1, 0x40049409, 0x10000000000000)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
close(0x4)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0xaece, 0x0)
close(0x4)

      
      r0 = eventfd2(0x0, 0x0)
close(r0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
write$eventfd(r0, 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000038000/0x1000)=nil, 0x930, 0x1, 0x30, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(0xffffffffffffffff, 0xae03, 0x80)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x5452, &(0x7f0000000080)={0xfdfdffff, 0x8016000, 0x1, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x28)

      
      munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x3, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r3 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
r9 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
r10 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r9, 0xae04)
r11 = ioctl$KVM_CREATE_VCPU(r8, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x4000)=nil, r10, 0x1000007, 0x2012, r11, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x2, 0x120)
ioctl$KVM_CREATE_VCPU(r6, 0xae41, 0x1)
r12 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r13 = ioctl$KVM_CREATE_VM(r12, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r13, &(0x7f0000c00000/0x400000)=nil)
r14 = syz_kvm_vgic_v3_setup(r13, 0x15, 0x180)
ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r14, 0x4018aee2, &(0x7f0000000080)=@attr_arm64={0x0, 0x7, 0x3, &(0x7f0000000000)=0x100000001})
ioctl$KVM_IOEVENTFD(r8, 0x4040ae79, &(0x7f0000000140)={0x800, 0x10000, 0x4, 0xffffffffffffffff, 0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f00000000c0)={0x0, &(0x7f0000000240)=[@its_setup={0x7, 0x28, {0x0, 0x1, 0x17}}], 0x28}, 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfff02000000ffffff00000d00e6ffea000000002000", 0x0, 0xffffffffffffff98)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r4, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x39, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x5, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = eventfd2(0x101, 0x800)
write$eventfd(r0, &(0x7f0000000080)=0xfffffffffffffff7, 0x8)

      
      r0 = openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_sys={0x603000000013d801, &(0x7f0000000140)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0, 0x28}, 0x0, 0x0)
r3 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_GET_DEVICE_ATTR(r3, 0x4018aee2, &(0x7f0000000240)=@attr_arm64={0x0, 0x1, 0x4, &(0x7f0000000200)=0x8000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1d})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c090, &(0x7f0000000000)=0x5})

      
      r0 = ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x1)
r1 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_ARM_VCPU_FINALIZE(r0, 0x4004aec2, &(0x7f0000000040)=0x4)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r3, 0xae03, 0xaa)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000180)="fb0149dd033be3073da85cac1648f1e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76d869d2855c7f3200", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x80086601, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000440)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000140)={0x6, 0x800003a, 0x2}})
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000004c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000480)={0x0, 0x20}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x17)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x4008ae6a, &(0x7f0000000080)=ANY=[@ANYBLOB="02000000000000000000000001000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000001"])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x800, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x53033, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004ff000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
munmap(&(0x7f00006e2000/0x2000)=nil, 0x2000)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x380000f, 0x11, r3, 0x0)
mmap$KVM_VCPU(&(0x7f000000d000/0x3000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000300)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_ccsidr={0x6020000000110004, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, 0x0, 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x8)
r2 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f00000001c0)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_USER_MEMORY_REGION(r3, 0x4020ae46, &(0x7f00000000c0)={0x1fd, 0x1, 0xdddd1000, 0x2000, &(0x7f0000fb5000/0x2000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0xc000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x800454d3, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x8e, 0x80000)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000000c0)={r2, 0x4, 0x1})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x80000, 0x0)
openat$kvm(0x0, &(0x7f00000000c0), 0x100, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100024, &(0x7f0000000000)=0x9})
r4 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r4, 0x4018aee1, &(0x7f0000000440)=@attr_other={0x0, 0x8, 0x0, &(0x7f0000000200)=0x200})
r5 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
r9 = openat$kvm(0xffffffffffffff9c, &(0x7f00000002c0), 0x300, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
r11 = ioctl$KVM_CREATE_VCPU(r10, 0xae41, 0x2)
ioctl$KVM_SET_VCPU_EVENTS(r11, 0x4040aea0, &(0x7f0000000880)=@arm64={0xae, 0x5, 0x9, '\x00', 0x6})
ioctl$KVM_SET_DEVICE_ATTR_vm(0xffffffffffffffff, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xeeffbffd, 0xffd, 0x1}})
ioctl$KVM_SET_USER_MEMORY_REGION(r8, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r12 = ioctl$KVM_CREATE_VCPU(r8, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r12, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r12, 0xae80, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r6, 0x4020ae46, &(0x7f0000000080)={0x0, 0x1, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r6, 0x4020ae46, &(0x7f0000000100)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR_vm(r4, 0x4018aee1, &(0x7f00000004c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000480)={0x0, 0x20}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x1e)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000300)=ANY=[@ANYBLOB="8200000008"], 0x28}, 0x0, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000000c0)={0x7})

      
      syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0xffffffffffffff94)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x40086602, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_ONE_REG(r2, 0xc018ae85, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x121e82, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
r3 = syz_kvm_vgic_v3_setup(r1, 0x2, 0x40)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_other={0x0, 0x9, 0x200ff, &(0x7f0000000180)=0x2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0x9, 0x3, 0x2}})
close(0x4)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
r2 = eventfd2(0x1, 0x1)
r3 = openat$kvm(0x0, &(0x7f0000000080), 0x40000, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r4, 0x1, 0x100)
ioctl$KVM_SET_GSI_ROUTING(r4, 0x4008ae6a, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000000)={r2, 0x0, 0x2, r2})
ioctl$KVM_IRQFD(r4, 0x4020ae76, &(0x7f0000000040)={r2, 0x2, 0x2, r2})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x52)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e85000/0x2000)=nil, 0x2000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x100000f, 0x9032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000001c0), 0xc40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000200)=@attr_arm64={0x0, 0x8, 0x0, &(0x7f00000003c0)=0x7ff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r2, 0xc00caee0, &(0x7f00000001c0)={0x8})
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_add_vcpu$arm64(r5, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@its_setup={0x82, 0x28, {0x1, 0x2001, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r4, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r4, 0xc00caee0, &(0x7f0000000100)={0x8, <r7=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r7, 0x4018aee1, &(0x7f0000000140)=@attr_other={0x0, 0x2, 0x2, &(0x7f0000000000)=0xe})
ioctl$KVM_RUN(r6, 0xae80, 0x0)
r8 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r9 = ioctl$KVM_CREATE_VM(r8, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r9, 0xc00caee0, &(0x7f00000001c0)={0x8, <r10=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r10, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
syz_kvm_setup_syzos_vm$arm64(r9, &(0x7f000000a000/0x400000)=nil)
ioctl$KVM_SET_DEVICE_ATTR(r10, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r10, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x8800, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xf1)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac3bc4a22332fdaa8de0518df242008031d1dfd92f0000000001fff9ffdc9610fbff77521ce30d8f00", 0x0, 0xfcf7)

      
      openat$kvm(0x0, &(0x7f0000000180), 0x0, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x2, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100042, &(0x7f0000000000)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@its_setup={0x82, 0x28, {0x1, 0x2001, 0x1}}], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000200)={0x8090040, 0x0, 0x100000, 0x1, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100042, &(0x7f0000000000)=0x71})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000000)={0x401, 0x5000, 0x8, r2})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000000c0)={0x1000, 0x0, 0x1, r2, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000011c020, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000040)={0x3, 0xeeee0000, 0x2, r2, 0x8})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000001, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x3000000, 0x4f831, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_cpu$arm64(r1, 0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000880)=[{0x0, &(0x7f00000006c0)=[@smc={0x1e, 0x40, {0xc5000021, [0x79, 0x5, 0x1ff, 0x154, 0xb8d5]}}, @svc={0x122, 0x40, {0x86000000, [0x3, 0x0, 0x7fffffffffffffff, 0x458, 0x100]}}], 0x80}], 0x1, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@smc={0x1e, 0x40, {0x80007fff, [0x0, 0x1, 0x2, 0x4, 0x4]}}], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000000)=[{0x0, &(0x7f0000000840)=ANY=[@ANYBLOB="0000000000000000180000000000000002000000000000006e0000000000000030000000000000000000000800000000000000000000000004000000000000000600000000000000be000000000000001800000000000000ce831300000030601400000000000000200000000000000018c51300000030609501000000000000000000000000000018000000000000000900000000000000aa000000000000002800000000000000030104000000020000000600000001040000000000000000be0000000000000018000000000000005bc613000000306046000000000000001800000000000000030000004100000082000000000000002800000000000000030000000000000003000000000000008a0100"], 0x36c}], 0x1, 0x0, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_RUN(r2, 0xae80, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x28000, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000fcc000/0x4000)=nil, r1, 0x2, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000580)=ANY=[@ANYBLOB="320000000000000040000000000000001200008400000000000000000000000080ffffffffffffff080000000000000000000000000000000400000000000000be00000000000000180000000000000030c0"], 0x208}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
ioctl$KVM_IOEVENTFD(0xffffffffffffffff, 0x4040ae79, 0x0)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, &(0x7f0000000140)={0xb6, 0x1000000, 0x8})

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x1, 0x1, 0x5000, 0x1000, &(0x7f0000fa2000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000100), 0x42080, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013df12, &(0x7f0000000040)=0xfffffffffffffff4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x109901, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x3c0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x1, 0x100)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
syz_kvm_add_vcpu$arm64(0x0, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[], 0xe0}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_other={0x0, 0x8, 0x128, &(0x7f0000000340)=0x8000000000000000})
close(0x4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x5, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_arm64={0x0, 0x1, 0x4, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, r1, 0x200000e, 0x4000030, 0xffffffffffffffff, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce181000000d22627e700", 0x0, 0x48)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r4 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r5, 0xc00caee0, &(0x7f0000000140)={0x4, <r6=>0xffffffffffffffff, 0x1})
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r7 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r7, 0x4020ae46, &(0x7f0000000040)={0x1, 0x0, 0x8000000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r7, 0xc018aec0, &(0x7f00000000c0)={0x1})
ioctl$KVM_SET_DEVICE_ATTR(r6, 0x400454ce, 0x0)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
r8 = mmap$KVM_VCPU(&(0x7f00006b5000/0x2000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r8, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r9 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
r11 = ioctl$KVM_CREATE_VCPU(r10, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r11, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r11, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100014, 0xfffffffffffffffe})
munmap(&(0x7f0000ffe000/0x1000)=nil, 0x1000)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r12 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r9, 0xae04)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, r12, 0x300000f, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x1, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
r3 = openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r3, 0xae04)
close(0x4)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000080)=@arm64_sys={0x603000000013df42, &(0x7f0000000040)})

      
      r0 = syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000c00000/0x400000)=nil)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r4, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r2, 0x4020ae46, &(0x7f0000000000)={0x1fe, 0x0, 0x8080000, 0x1000, &(0x7f0000ffc000/0x1000)=nil})
r5 = syz_kvm_add_vcpu$arm64(r0, &(0x7f0000000080)={0x0, &(0x7f0000000280)=ANY=[@ANYBLOB="aa00000000000000280000000000000003"], 0x28}, 0x0, 0x0)
ioctl$KVM_RUN(r5, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r3, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x0, 0x0, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000140)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000040)={0x7ffd, 0xb}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000140)={0x2000, 0x1a000})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000000)={0xffff1000, 0x8000})
ioctl$KVM_UNREGISTER_COALESCED_MMIO(r1, 0x4010ae68, &(0x7f0000000100)={0x80a0000, 0x2000})

      
      munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(0xffffffffffffffff, 0x2, 0x120)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = syz_kvm_vgic_v3_setup(r1, 0x5, 0x140)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000080)=@attr_arm64={0x0, 0x7, 0x3, &(0x7f0000000000)=0x100000001})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(r1, 0x4068aea3, &(0x7f0000000000)={0xa8, 0x0, 0x1})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x0, 0x1, 0x0, 0x2000, &(0x7f0000000000/0x2000)=nil})
ioctl$KVM_GET_DIRTY_LOG(r1, 0x4010ae42, &(0x7f0000000080)={0x0, 0x0, &(0x7f00007a4000/0x4000)=nil})

      
      r0 = eventfd2(0x5, 0x0)
close(r0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x141, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, 0x930, 0x0, 0x11, r0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x40000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xf1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x300000c, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000000)={0x4})
close(0x5)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x109901, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8030aeb4, &(0x7f0000000140)=@attr_other={0x0, 0x8, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f00008a0000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000e00)=ANY=[], 0x630}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x2, 0x0, &(0x7f0000000000)=0x80})
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r6, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      munmap(&(0x7f0000e0c000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e0b000/0x1000)=nil, 0x1000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_fp={0x6040000000100090, &(0x7f0000000000)=0xb})

      
      eventfd2(0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_ONE_REG(r1, 0x4010aeab, &(0x7f0000000140)=@arm64_sys={0x603000000013df46, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r2 = eventfd2(0x0, 0x0)
write$eventfd(r2, &(0x7f0000000000)=0xffffffffffffffff, 0x8)
ioctl$KVM_IRQ_LINE_STATUS(r2, 0xc008ae67, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x3000)=nil, 0x0, 0x1000001, 0x11, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000240))

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xe1, 0x0, 0x2000})
ioctl$KVM_CLEAR_DIRTY_LOG(r1, 0xc018aec0, &(0x7f0000000040)={0xa02d11a4906d870c, 0x140, 0x300, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_bitmap={0x6030000000160000, &(0x7f00000000c0)=0x8906})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x8000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x320)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x400000000, 0x0, 0x1, r2, 0x5})

      
      munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000010000/0x1000)=nil, 0x930, 0x100000f, 0x9032, 0xffffffffffffffff, 0x0)

      
      munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000fde000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000eb3000/0x1000)=nil, 0x930, 0x0, 0x20031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f0f000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000f2a000/0x2000)=nil, 0x2000)
munmap(&(0x7f00004a0000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
mmap$KVM_VCPU(&(0x7f0000f1a000/0x4000)=nil, 0x930, 0x0, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x3, 0x9032, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000400)=ANY=[@ANYBLOB="8200"], 0x28}, 0x0, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8090000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305839, &(0x7f0000000100)=@attr_other={0x1000000, 0xb, 0x9f01, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
close(r0)
r1 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r5, 0x4008ae6a, &(0x7f0000000180)={0x1, 0x0, [{0x3, 0x3, 0x0, 0x0, @sint={0x8, 0x80000001}}]})
r6 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r7 = syz_kvm_setup_syzos_vm$arm64(r6, &(0x7f0000c00000/0x400000)=nil)
r8 = syz_kvm_setup_syzos_vm$arm64(r6, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r9 = syz_kvm_add_vcpu$arm64(r7, &(0x7f0000000080)={0x0, &(0x7f00000001c0)=[@memwrite={0x6e, 0x30, @vgic_gits={0x8080000, 0x200b0, 0x49ea, 0x3}}, @mrs={0xbe, 0x18, {0x603000000013c000}}, @hvc={0x32, 0x40, {0xc4000053, [0x8, 0x1, 0x942, 0x4, 0x6]}}, @irq_setup={0x46, 0x18, {0x4, 0x1a2}}, @mrs={0xbe, 0x18, {0x603000000013e108}}, @uexit={0x0, 0x18, 0x466d}, @its_setup={0x82, 0x28, {0x2, 0x1, 0x3c3}}, @code={0xa, 0xb4, {"805380d20060b8f2e10080d2820180d2c30180d2840080d2020000d4000008d5607695d200a0b0f2a10080d2220180d2a30180d2440180d2020000d4007008d50078207e008008d5c0e79ed20020b0f2610180d2820180d2c30080d2040180d2020000d400000039c00a82d200a0b8f2c10180d2620080d2c30180d2a40080d2020000d4400780d20040b0f2010180d2220180d2830180d2240080d2020000d4"}}, @smc={0x1e, 0x40, {0x84000012, [0x5, 0x9255, 0xff, 0x4, 0x9]}}], 0x1ec}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r6, 0x2, 0x3c0)
ioctl$KVM_RUN(r9, 0xae80, 0x0)
syz_kvm_setup_syzos_vm$arm64(r2, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000080)={0x0, &(0x7f00000000c0)}, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000000)={0x102091e})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, &(0x7f0000000100)={0x8, "e5ccd16738eaa59c"})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r2, 0x4020ae46, &(0x7f0000000000)=ANY=[@ANYBLOB="010000e40000000000000001040000000010", @ANYRES64=r1, @ANYRESOCT=r2])

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, &(0x7f0000000600)=ANY=[])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x5, 0x3, 0xffff1000, 0x1000, &(0x7f0000000000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x10200, 0x0, 0xdddd1000, 0x1000, &(0x7f0000ffe000/0x1000)=nil})
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000482000/0x2000)=nil, 0x2000)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x2000)=nil, 0x930, 0x0, 0x24132, 0xffffffffffffffff, 0x0)
r2 = eventfd2(0x1, 0x80000)
write$eventfd(r2, 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = mmap$KVM_VCPU(&(0x7f0000dee000/0x3000)=nil, r1, 0x100000e, 0x8a031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f00000000c0)="e51b9ce9a032a1ca7079bce9b3cf3ba9c7fbc2e7ab457eacc044b677d9d49c274b8d12fb382e0520cadbc6763409ffdb41911831b85a42b40c1689a8bf14be81eda4bae2d8c28ef8", 0x0, 0x48)
r3 = mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, 0x930, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, r1, 0x2000002, 0x4f832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000040)=@x86={0x2e, 0x9, 0x2, 0x0, 0x2, 0x9e, 0x5, 0x0, 0x4, 0xc, 0x8, 0xf8, 0x0, 0x0, 0x9e, 0x1, 0x0, 0x1, 0x1, '\x00', 0xf, 0x200})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xae)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454cd, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(0xffffffffffffffff, 0xae03, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4018aee1, &(0x7f00000000c0)={0x0, 0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE(0xffffffffffffffff, 0x4068aea3, &(0x7f0000000140)={0xb1, 0x0, 0x5})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="820000000000000028000000000000000100000000000000040000000000000002000000000000008200000000000000280000000000000004"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x480, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xdf)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000380)={0x10200, 0x0, 0xdddd1000, 0x1000, &(0x7f0000ffe000/0x1000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x78)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000240)={0x5, 0x11})
ioctl$KVM_GET_REG_LIST(r2, 0xc008aeb0, 0x0)

      
      munmap(&(0x7f0000c07000/0x1000)=nil, 0x1000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x100000f, 0x9032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x9)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5, 0x2})
ioctl$KVM_SET_GUEST_DEBUG(r2, 0x4208ae9b, &(0x7f0000000000)={0x10003, 0x0, [0x7ff, 0x80, 0x4, 0x2, 0x8, 0x5, 0xfffffffffffffff9, 0x3]})
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f00000000c0)=@arm64={0xa, 0x33, 0x4, '\x00', 0x401})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f00001e3000/0x400000)=nil, &(0x7f0000000280)=[{0x0, 0x0}], 0x1, 0x0, 0x0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000140)=@attr_arm64={0x0, 0x7, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x17)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4018aee2, &(0x7f0000000040)={0x4, 0x3})

      
      syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0xffffffffffffff94)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x5, &(0x7f0000000080)=0x6})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xaa)

      
      mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x0, 0x1000002, 0x13, 0xffffffffffffffff, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000200), 0x22c00, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x1, 0x2, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)
r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac3bc4a22332fdaa8de0518df242008031d1dfd92f0000000001fff9ffdc9610fbff77521ce30d8f00", 0x0, 0xfcf7)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b6565d2f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="01000000010000000000000008"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0x3, 0x10)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x541b, 0x0)
openat$kvm(0x0, 0x0, 0x9c481, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_SET_GSI_ROUTING(r1, 0x40a0ae49, &(0x7f0000000080)=ANY=[@ANYRESOCT, @ANYRES16])

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x1, 0x2}}, @its_send_cmd={0xaa, 0x28, {0x3, 0x5, 0x0, 0x4, 0x0, 0x2}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
eventfd2(0x0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x6030000000100004, &(0x7f0000000040)=0x8})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x400, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, &(0x7f0000000000)=@x86={0x6, 0xef, 0x5, 0x0, 0x3, 0x5, 0x6, 0xd, 0x40, 0x3, 0x4, 0xa, 0x0, 0x5, 0x6, 0x0, 0x0, 0x2b, 0x6, '\x00', 0x0, 0x3})
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      munmap(&(0x7f0000800000/0x800000)=nil, 0x800000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x43033, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000140)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_fw={0x6030000000140000, &(0x7f0000000080)=0x2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="375ae04fceeb298d3b07d73b3e9aac00", 0x0, 0x18)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
r3 = eventfd2(0xfffffffb, 0x80800)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000002c0)={0x6, 0xeeef0000, 0x4, r3, 0x6})
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000200)=ANY=[@ANYBLOB="14000000000000002000000000000000e3dc"], 0x20}, 0x0, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r7 = ioctl$KVM_CREATE_VCPU(r6, 0xae41, 0x0)
r8 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x3800003, 0x11, r7, 0x0)
r9 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r10 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r9, 0xae04)
r11 = mmap$KVM_VCPU(&(0x7f0000d10000/0xa000)=nil, r10, 0x3000006, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r11, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r8, 0x20, 0x0, 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r7, 0x0)
r12 = eventfd2(0x0, 0x0)
close(r12)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
close(r2)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
close(r2)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f00000000c0)=[{0x0, 0x0}], 0x1, 0x0, &(0x7f0000000100)=[@featur2={0x1, 0x4}], 0x1)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB="1e000000000000004000000000000000030000c400000000fc4ce74f"], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR_vcpu(r2, 0x4018aee3, &(0x7f0000000200)=@attr_pvtime_ipa={0x0, 0x2, 0x0, 0x2b2f7ee2})

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000100)="a2eef5bcde22df43ff0c9fcab949df446182aed04775ca3527bf61af95d446e5ceee43605ccbe21b5839696b3b50ef6a01e79fecaf2362817a96d1ba168e2dc4ece7639168efcf53", 0x0, 0x48)
munmap(&(0x7f0000ce0000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000c8f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000647000/0x1000)=nil, 0x1000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x2, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x1, 0x4, &(0x7f0000000040)=0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfe000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f0000000200)=ANY=[], 0x518}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r4, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013d801, &(0x7f0000000000)=0x8005})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, r1, 0x100000c, 0x23ac5f9b426ec4b2, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x4, 0x2, 0xd000, 0x1000, &(0x7f0000001000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_arm64={0x0, 0x2})

      
      munmap(&(0x7f0000ffd000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f00000000c0)=@attr_arm64={0x0, 0x6, 0x1, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000f72000/0x1000)=nil, 0x930, 0x0, 0xe832, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x40000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0xe)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xd8)

      
      ioctl$KVM_HAS_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0xc02, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000500), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r2, 0xae03, 0xa2)
munmap(&(0x7f00007c2000/0x3000)=nil, 0x3000)
ioctl$KVM_CAP_HALT_POLL(0xffffffffffffffff, 0x4068aea3, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(r1, 0x4068aea3, &(0x7f00000001c0)={0xa8, 0x0, 0x2})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100022, &(0x7f0000000180)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000ff5000/0x3000)=nil, 0x930, 0x100000f, 0x24132, 0xffffffffffffffff, 0x0)
write$eventfd(0xffffffffffffffff, &(0x7f0000000000), 0x8)
openat$kvm(0x0, 0x0, 0x48b04, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000100)=@arm64_extra={0x603000000013df1a, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x5, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000080)=@attr_arm64={0x0, 0x2, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r3=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000000)=@attr_arm64={0x0, 0x0, 0x3, &(0x7f0000000180)=0x6})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
syz_kvm_vgic_v3_setup(r1, 0x5, 0x60)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000080)=0x31})
r3 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r5, 0x0)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f0000000240)=@attr_pmu_irq={0x0, 0x0, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x9e)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c028, &(0x7f00000000c0)})

      
      openat$kvm(0x0, 0x0, 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xd9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x8, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305828, &(0x7f0000000100)=@attr_arm64={0x0, 0x1, 0x0, &(0x7f0000000180)=0x10001})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x5d)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_SET_ONE_REG(r3, 0x4010aeac, &(0x7f0000000100)=@arm64_sys={0x603000000013df01, &(0x7f0000000140)=0x26})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xa5)

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000240), 0x2, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000200), 0x22c00, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x1, 0x1, &(0x7f0000000000)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000100)={0xdf, 0x0, 0x1000})
ioctl$KVM_GET_DIRTY_LOG(r1, 0x4010ae42, &(0x7f0000000040)={0x5, 0x0, &(0x7f0000e4d000/0x2000)=nil})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000000)={0x2710, 0x2, 0xdddd0000, 0x1000, &(0x7f0000dd2000/0x1000)=nil})
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x1000009, 0x16831, 0xffffffffffffffff, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000000000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000380)={0x10200, 0x0, 0xdddd1000, 0x1000, &(0x7f0000ffe000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_core={0x6030000000100012, 0xffffffffffffffff})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x40305829, &(0x7f0000000100)=@attr_other={0x1000000, 0xb, 0x9f01, 0x0})

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r4, 0x4020aeae, &(0x7f00000000c0)={0x5, 0x10})
ioctl$KVM_SET_ONE_REG(r4, 0x4010aeac, &(0x7f0000000000)=@arm64_extra={0x6030000000140000, &(0x7f00000001c0)=0x10000})
r5 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r6 = syz_kvm_add_vcpu$arm64(r5, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=ANY=[@ANYBLOB="820000000000000000000000000000000100000000000000010000000000000001"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r8=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r8, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r6, 0xae80, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR(r8, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x1, 0x0})

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000e1f000/0x4000)=nil, 0x4000)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000000)={0x8, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x541b, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4020940d, &(0x7f00000000c0)={0x4, 0x0, 0x1, 0xffffffffffffffff, 0x5})

      
      r0 = eventfd2(0x0, 0x80000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x2000)=nil, 0x930, 0x0, 0x110, r0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_GET_DEVICE_ATTR_vcpu(r2, 0x4018aee2, &(0x7f0000000100)=@attr_other={0x0, 0x1, 0x4, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_irq_timer={0x0, 0x1, 0x0, &(0x7f0000000340)=0x1c})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0xc0189436, &(0x7f0000000040)={0x0, 0x0, 0x841e06913f3012dd, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f0000000240)=@attr_arm64={0x0, 0x0, 0x2, 0x0})

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
munmap(&(0x7f0000ed8000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x4, 0x40a8012, 0xffffffffffffffff, 0x2000)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x8, 0x4f832, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x10})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f00000001c0), 0xc40, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x3})
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, &(0x7f00000001c0)={0x8})
ioctl$KVM_SET_SIGNAL_MASK(r2, 0x4004ae8b, &(0x7f00000001c0)=ANY=[])
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, r1, 0x100000b, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)

      
      munmap(&(0x7f0000ad4000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x1000000)
munmap(&(0x7f0000584000/0x800000)=nil, 0x800000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0xdc032, 0xffffffffffffffff, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x2041, 0x0)
ioctl$KVM_IRQ_LINE(r1, 0x4008ae61, &(0x7f0000000100)={0x2010040, 0x1000c53})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x894c, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1c})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_extra={0x6030000000140000, &(0x7f00000001c0)=0x2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x541b, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000080)=0x80000000})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_sve={0x60800000001503ec, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac3bc4a22332fdaa8de0518df242008031d1dfd92f0000000001fff9ffdc9610fbff77521ce30d8f00", 0x0, 0xfcf7)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x20)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r7 = ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
r8 = ioctl$KVM_CREATE_VCPU(r7, 0xae41, 0x0)
r9 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
mmap$KVM_VCPU(&(0x7f0000e31000/0x2000)=nil, r9, 0x3000011, 0x2012, r8, 0x0)
close(0x5)
close(0x4)
munmap(&(0x7f0000c00000/0x400000)=nil, 0x400000)
r10 = eventfd2(0x1, 0x1)
r11 = openat$kvm(0x0, &(0x7f0000000080), 0x20200, 0x0)
r12 = ioctl$KVM_CREATE_VM(r11, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r12, 0x1, 0x100)
ioctl$KVM_IRQFD(r12, 0x4020ae76, &(0x7f00000000c0)={r10, 0x4, 0x0, r10})
ioctl$KVM_SET_GSI_ROUTING(r12, 0x4008ae6a, &(0x7f0000000240)=ANY=[])
ioctl$KVM_IRQFD(r12, 0x4020ae76, &(0x7f0000000040)={r10, 0x3, 0x2, r10})
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000100)="547a80816119d6f740eba70939b4dd3c67cc8ef30267b6e351ec92609ea1772af89374b2c24ae764125ca82e671b267d8980f7f7061632c7b88459ab6c0154d4086903dedbfdd6fb", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r4, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a13f2, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
ioctl$KVM_CREATE_IRQCHIP(r1, 0xae60)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x4, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x1, 0x3, 0x0})

      
      munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x1000000)
mmap$KVM_VCPU(&(0x7f00008c9000/0x1000)=nil, 0x930, 0x0, 0x4030031, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000080), 0x300, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
close(0x4)
syz_kvm_vgic_v3_setup(r3, 0x1, 0x100)
ioctl$KVM_ARM_VCPU_INIT(r1, 0x4018aee3, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_ARM_VCPU_FINALIZE(r2, 0x4004aec2, &(0x7f00000000c0)=0x4)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000380), 0x20000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000b40)=@attr_pvtime_ipa={0x0, 0x2, 0x0, 0x9})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r2, 0x27, 0x2, r2})
r3 = eventfd2(0x0, 0x0)
r4 = eventfd2(0xffff, 0x80801)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r3, 0x40fff, 0x2, r4})
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x80005ff, 0x1)
r3 = eventfd2(0x0, 0x0)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000180)={r3, 0x27, 0x2, r3})
r4 = eventfd2(0x0, 0x0)
r5 = eventfd2(0xffff, 0x80801)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r4, 0x40fff, 0x2, r5})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f0000000000)={r4, 0x6d, 0x2, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000380)})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x108, &(0x7f0000000000)=0x1f87583d})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x0, 0x1}}, @its_send_cmd={0xaa, 0x28, {0xc, 0x0, 0x0, 0x4}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x100000d, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
ioctl$KVM_ARM_SET_COUNTER_OFFSET(r1, 0x4010aeb5, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(r1, 0xae03, 0xc3)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
openat$kvm(0x0, 0x0, 0x909483, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000002c0)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
close(r2)
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(0xffffffffffffffff, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xeefffffc, 0x1000, 0x2}})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x2})
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000080)=@arm64_core={0x603000000010001a, &(0x7f0000000040)=0x8})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_GET_DIRTY_LOG(r1, 0x4010ae42, &(0x7f0000000080)={0xfdfd, 0x0, &(0x7f00007a4000/0x4000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_extra={0x603000000013c103, &(0x7f0000000100)=0xfffffffffffffffe})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r2, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r3=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x3, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f000000a000/0x400000)=nil)
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r3, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x5, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x100, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r3, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x40842, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
r2 = eventfd2(0x0, 0x0)
r3 = eventfd2(0x0, 0x1)
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000002c0)={r2, 0x1, 0x2, r3})
ioctl$KVM_IRQFD(r1, 0x4020ae76, &(0x7f00000000c0)={r2, 0x1, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000e15000/0x4000)=nil, 0x0, 0x2000002, 0x30, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_ARM_VCPU_INIT(r1, 0x4018aee3, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x100, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0xb5, 0x0, 0x4, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x5, 0x0, 0x0, r2})

      
      munmap(&(0x7f0000002000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0xc00000)=nil, 0x930, 0xf, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x6030000000100042, &(0x7f0000000100)=0x13})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_ARM_SYSTEM_SUSPEND(r1, 0x4068aea3, &(0x7f0000000b40))

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r7 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r6, 0xae04)
r8 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r7, 0x100000a, 0x12, r8, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, r7, 0x1, 0x11, r8, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, r3, 0x100000f, 0x12, r9, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0xc3033, 0xffffffffffffffff, 0x0)

      
      openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x480b40, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x100, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f00000000c0), 0x40480, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_IOEVENTFD(r2, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x7, 0xffffffffffffffff, 0x1})

      
      r0 = eventfd2(0x0, 0x0)
close(r0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x121801, 0x0)
write$eventfd(r0, 0x0, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0xe, 0x16831, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0) (async)
r1 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
r3 = ioctl$KVM_CREATE_VM(r2, 0x894c, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
r6 = ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r6, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1}) (async)
ioctl$KVM_SET_ONE_REG(r6, 0x4010aeac, &(0x7f00000001c0)=@arm64_sys={0x6030000000138064, 0x0})
ioctl$KVM_CREATE_VCPU(r3, 0xb702, 0x0) (async)
r7 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r8 = syz_kvm_setup_syzos_vm$arm64(r7, &(0x7f0000c00000/0x400000)=nil)
r9 = syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0) (async)
r10 = syz_kvm_add_vcpu$arm64(r8, &(0x7f0000000100)={0x0, &(0x7f00000001c0)=ANY=[@ANYBLOB], 0x18}, 0x0, 0x0)
ioctl$KVM_RUN(r10, 0xae80, 0x0) (async)
ioctl$KVM_RUN(r9, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x18})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000100)=@arm64_extra={0x6030000000140000, &(0x7f00000001c0)=0x10002})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x400, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(0xffffffffffffffff, 0x4020aeae, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, 0x930, 0x0, 0x12eeff265b2ad0b8, 0xffffffffffffffff, 0x1000000)
ioctl$KVM_SET_GUEST_DEBUG(0xffffffffffffffff, 0x4208ae9b, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x5452, &(0x7f0000000080)={0x0, 0x8016000, 0x1, 0xffffffffffffffff, 0x5})

      
      r0 = openat$kvm(0x0, &(0x7f00000002c0), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r3 = mmap$KVM_VCPU(&(0x7f0000004000/0x2000)=nil, 0x930, 0x2800002, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f00000001c0)="fb4149dd033be3ac2cc4a22332a77b23b08986814d7bb14c94a6ab8031d1dfd92f00000000010000005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa7fc869d22627e7", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x12, r2, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r4, 0x40086602, 0x110caafffd)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="010000000100000000000000080009"])
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xe80)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r3, 0x4018aee1, &(0x7f0000000100)=@attr_other={0x0, 0x8, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000100)={0x0, 0x0}, 0x0, 0x0)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000000)={0x0, 0x0}, &(0x7f00000000c0)=[@featur2={0x1, 0x45}], 0x1)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x3c0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000440)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000000)={0xfff, 0x800003a}})
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000004c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000480)={0x0, 0x20}})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8, 0x1})
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f00000001c0)="fb52226012ab8b78286bf6cd81002000d3d9639c0810000000000005ff0f26ea4849dcfd69bf47d9000000000000000000000000000000000000000000000000cd9100", 0x0, 0x48)
openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee3, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0), 0xff3c)

      
      syz_memcpy_off$KVM_EXIT_MMIO(0x0, 0x20, &(0x7f0000000000)="b7b9ffff09fd10000000bde04fceebac00", 0x0, 0xfffffffffffffc7e)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
ioctl$KVM_SET_ONE_REG(0xffffffffffffffff, 0x4010aeac, &(0x7f0000000000)=@arm64_core={0x6030000000100014, 0x0})
ioctl$KVM_CREATE_VM(r0, 0x40086602, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x4, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8})
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, &(0x7f0000000200)={0x8090040, 0x0, 0x0, 0x1, 0x204})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000080)={0xe1, 0x900, 0x2000})

      
      r0 = eventfd2(0x0, 0x0)
close(r0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, 0x930, 0x0, 0x11, r0, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x400454d4, 0x0)

      
      mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x930, 0x0, 0x40032, 0xffffffffffffffff, 0x0)
r0 = eventfd2(0x8001, 0x0)
write$eventfd(r0, &(0x7f0000000000)=0xfffffffffffffffb, 0x8)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x8})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100042, &(0x7f0000000000)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000380), 0x101000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_other={0x0, 0x1, 0x1000304, &(0x7f00000000c0)})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_sys={0x603000000013c024, &(0x7f00000000c0)})

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
r4 = mmap$KVM_VCPU(&(0x7f0000000000/0x1000)=nil, 0x0, 0x300000e, 0x13, r3, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_init)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, 0x0, 0x0, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000240), 0x580, 0x0)
r6 = ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r6, 0x4020ae46, &(0x7f00000000c0)={0x5, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r7 = syz_kvm_setup_syzos_vm$arm64(r6, &(0x7f00008a0000/0x400000)=nil)
r8 = syz_kvm_add_vcpu$arm64(r7, &(0x7f00000000c0)={0x0, &(0x7f0000000e00)=ANY=[], 0x630}, 0x0, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r8, 0x4018aee1, &(0x7f0000000080)=@attr_other={0x0, 0x2, 0x0, &(0x7f0000000000)=0x80})
r9 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r10 = ioctl$KVM_CREATE_VM(r9, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
r11 = ioctl$KVM_CREATE_VCPU(r10, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffc000/0x3000)=nil, 0x930, 0x1000001, 0x11, r11, 0x0)
ioctl$KVM_RUN(r8, 0xae80, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
r12 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GSI_ROUTING(0xffffffffffffffff, 0x4020ae46, &(0x7f00000001c0)=ANY=[@ANYBLOB="0100000001000000000000000806"])
ioctl$KVM_CREATE_DEVICE(r12, 0xc00caee0, &(0x7f0000000140)={0x4, <r13=>0xffffffffffffffff, 0x1})
write$eventfd(r13, &(0x7f00000001c0)=0xffffff7f, 0xe80)

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f00000000c0)={0x0, 0x9, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@hvc={0x32, 0x40, {0xc5000020, [0x3, 0x6, 0x10001, 0x100000000]}}, @irq_setup={0x46, 0x18, {0x4, 0x30c}}, @eret={0xe6, 0x18, 0x2}, @its_send_cmd={0xaa, 0x28, {0xf, 0x1, 0x4, 0x6, 0x7, 0x1000, 0x4}}, @code={0xa, 0x54, {"000028d50000c028008c200e0020200e007008d5000800b800f8302e0000009c40c488d200a0b8f2810080d2820180d2030080d2c40080d2020000d4003c004e"}}, @hvc={0x32, 0x40, {0x20, [0x5, 0x40, 0x3, 0x1, 0x10]}}, @msr={0x14, 0x20, {0x603000000013e6d4, 0x8}}], 0x14c}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0xfffffffffffffffe)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r3, 0x4020aeae, &(0x7f0000000200)={0x5, 0x8})
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000080)=0x31})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f00000000c0)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f0000000040)=0x19})

      
      munmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x2000)=nil, 0x930, 0x100000c, 0x16831, 0xffffffffffffffff, 0x0)
munmap(&(0x7f0000ffd000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f00000000c0)=@arm64_fp={0x6040000000100014, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, &(0x7f00000000c0)=[@msr={0x14, 0x20, {0x603000000013c600, 0xfefefee0}}], 0x20}, 0x0, 0x0)
r4 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, r4, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r5, 0xffffffffffffffff)
syz_kvm_assert_reg(r3, 0x603000000013c600, 0xfefefee0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
ioctl$KVM_SET_MP_STATE(r2, 0x4004ae99, &(0x7f00000001c0)=0xa)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000280), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x3)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000000)={0x5, 0x8})
ioctl$KVM_RUN(r2, 0xae80, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x5, 0x1e0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000300)=@attr_pmu_irq={0x0, 0x0, 0x0, &(0x7f00000002c0)=0x64})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000100)=@attr_pmu_init)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1, 0x2, 0x5000, 0x2000, &(0x7f0000ee8000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
ioctl$KVM_CREATE_DEVICE(r1, 0xc018aec0, &(0x7f0000000040)={0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x101001, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000003c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000380)={0x9, 0xc5}})
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000bfe000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000140)={0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="1e00000000000000400000000000000001ff0086"], 0x40}, 0x0, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x8100, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_HAS_DEVICE_ATTR(r1, 0x4018aee3, &(0x7f0000000080)=@attr_other={0x0, 0x4, 0x100, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x5451, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f932, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000180)=@arm64_core={0x603000000010000a, &(0x7f0000000100)=0xc74d})

      
      r0 = eventfd2(0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0x4020940d, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
r4 = openat$kvm(0x0, 0x0, 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r5, 0xae41, 0x1)
r6 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x0, 0x3, 0x11, r3, 0x0)
ioctl$KVM_RUN(r3, 0xae80, 0x0)
syz_kvm_assert_syzos_uexit$arm64(r6, 0xffffffffffffffff)
r7 = openat$kvm(0x0, 0x0, 0x200, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x2, 0x4f832, 0xffffffffffffffff, 0x0)
ioctl$KVM_CREATE_VCPU(r8, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0xdc032, 0xffffffffffffffff, 0x0)
close(0x5)
close(0x4)
syz_kvm_assert_reg(r3, 0x603000000013c600, 0xfefefee0)

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
openat$kvm(0x0, &(0x7f0000000040), 0x2c00, 0x0)
ioctl$KVM_CREATE_DEVICE(0xffffffffffffffff, 0xc00caee0, 0x0)
r0 = ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x894c, 0x0)
r1 = ioctl$KVM_CREATE_VCPU(r0, 0xb701, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xb704, 0x20000002)
ioctl$KVM_GET_ONE_REG(0xffffffffffffffff, 0x4010aeab, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000080), 0x101000, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = syz_kvm_setup_syzos_vm$arm64(r3, &(0x7f0000b60000/0x400000)=nil)
r5 = syz_kvm_add_vcpu$arm64(r4, &(0x7f00000000c0)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_GET_ONE_REG(r5, 0x4010aeab, &(0x7f0000000200)=@arm64_core={0x603000000010004a, &(0x7f00000001c0)=0x2a})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_core={0x603000000010004a, &(0x7f0000000100)=0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0xef000000, 0x1000, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000e8a000/0x18000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=[@hvc={0x32, 0x40, {0x84000051, [0x28e1e71d, 0x8, 0x47, 0x0, 0x22]}}, @memwrite={0x6e, 0x30, @generic={0x4000, 0x29b, 0x5, 0xe}}, @smc={0x1e, 0x40, {0xbb000000, [0x8001, 0x8, 0x2, 0x9, 0x4]}}, @smc={0x1e, 0x40, {0xc4000014, [0x7, 0xce, 0x9, 0x3, 0x8]}}], 0xf0}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f0000000080)={0x4, 0x2000})
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
r2 = syz_kvm_vgic_v3_setup(r1, 0x4, 0x40)
close(0x5)
close(r2)
close(0x4)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x4, 0x8)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x2800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x5, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000900)={0x0, 0x0, 0x1, r2, 0x1})
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000080)={0x0, 0x0, 0x1, r2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, 0xffffffffffffffff)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc9})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x8004b706, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
ioctl$KVM_GET_ONE_REG(r3, 0x4010aeab, &(0x7f00000000c0)=@arm64_core={0x603000000010000c, &(0x7f0000000000)=0x400000000000008})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0xae01, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x2000, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000f82000/0x3000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x2000)=nil, r3, 0x5000003, 0x80031, 0xffffffffffffffff, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000f82000/0x1000)=nil, r3, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x3000)=nil, 0x930, 0x0, 0x8032, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000200)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_HAS_DEVICE_ATTR(r2, 0x4018aee3, &(0x7f0000000000)=@attr_arm64={0x0, 0x1, 0x1, 0x0})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x40842, 0x0)
ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0xc, 0x5c1fd1b65647af1, 0xffffffffffffffff, 0x20000000)
mmap$KVM_VCPU(&(0x7f0000eb2000/0x3000)=nil, 0x930, 0x6, 0x40a8012, 0xffffffffffffffff, 0x2000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000440), 0x200, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xef)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000340)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000140)=@arm64_ccsidr={0x602000000011000c, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x0, 0x4f831, 0xffffffffffffffff, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x9)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000000)=@arm64_bitmap={0x6030000000160002, &(0x7f0000000100)})

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000280)={0x7, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x6, 0xfffffffffffffffe})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xe1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x1c1040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000f48000/0x3000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000100)="d4ece25438ac761d768f5c3f54d9506333a3efeda6b20c676f2c855f9505e66570fef4c314d949f94d16402868c2c64a1e54a0541230b4183257337f2ffb4f655500672bee04cb71", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x2f46b2, 0x0)

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r3, 0xc00caee0, &(0x7f0000000140)={0x4, <r4=>0xffffffffffffffff, 0x1})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x8927, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000000c0)=@arm64_sys={0x603000000013c801, &(0x7f0000000180)=0x2})

      
      ioctl$KVM_RUN(0xffffffffffffffff, 0xae80, 0x0)
openat$kvm(0x0, 0x0, 0x2002, 0x0)
syz_kvm_setup_syzos_vm$arm64(0xffffffffffffffff, &(0x7f0000873000/0x400000)=nil)
r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(0xffffffffffffffff, 0x4020ae46, &(0x7f0000000180)={0x1fe, 0x1, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f0000000180)=ANY=[@ANYBLOB="32000000000000004000000000000000530000c4"], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000200)={0x1, 0x5000, 0x2, 0xffffffffffffffff, 0x50f5b1735c3712b})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f00000001c0)=@arm64_core={0x6030000000100010, &(0x7f0000000180)=0x9})

      
      r0 = openat$kvm(0xffffffffffffff9c, 0x0, 0x2041, 0x0)
ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000a40), 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x0)
close(r3)
syz_kvm_vgic_v3_setup(r2, 0x2, 0xc0)
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r3, 0x4018aee1, &(0x7f0000000cc0)=@attr_other={0x0, 0x1, 0x480, &(0x7f0000000000)=0x2})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_VCPU_EVENTS(0xffffffffffffffff, 0x4040aea0, &(0x7f0000000000)=@arm64={0x6, 0x8, 0x77, '\x00', 0xffffffffffffffff})
ioctl$KVM_CREATE_VM(r4, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r2 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r3 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r4 = ioctl$KVM_CREATE_VM(r1, 0xae01, 0x0)
r5 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r6 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r5, 0xae04)
r7 = ioctl$KVM_CREATE_VCPU(r2, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r6, 0x100000a, 0x12, r7, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffb000/0x1000)=nil, r6, 0x3000004, 0x11, r7, 0x0)
r8 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, r3, 0x100000f, 0x12, r8, 0x0)
munmap(&(0x7f0000ffa000/0x4000)=nil, 0x4000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(0xffffffffffffffff, 0xae41, 0x2)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, 0xffffffffffffffff)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2000, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000040)={0xdf, 0x0, 0x4000})
ioctl$KVM_IRQFD(r1, 0x4020ae76, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xd2)
ioctl$KVM_CAP_HALT_POLL(r1, 0x4068aea3, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x3000)=nil, 0x0, 0x0, 0x80010, 0xffffffffffffffff, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
munmap(&(0x7f0000e76000/0x1000)=nil, 0x1000)
ioctl$KVM_SET_DEVICE_ATTR(0xffffffffffffffff, 0x4018aee1, 0x0)
ioctl$KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2(0xffffffffffffffff, 0x4068aea3, &(0x7f00000004c0)={0xa8, 0x0, 0x1})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x1})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000140)=@arm64_ccsidr={0x6020000000110006, &(0x7f0000000000)=0x2})

      
      r0 = mmap$KVM_VCPU(&(0x7f0000007000/0x1000)=nil, 0x930, 0x1000002, 0x28031, 0xffffffffffffffff, 0x0)
r1 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r1, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000001c0)={0x8, <r2=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000100)=@attr_arm64={0x0, 0x0, 0x4})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f00000000c0)={0x1fe, 0x3, 0x0, 0x1000, &(0x7f0000000000/0x1000)=nil})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000040)=@attr_other={0x0, 0x8, 0x100, &(0x7f0000000080)=0x8000000000000000})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000280)=@attr_arm64={0x0, 0x4, 0x2, 0x0})
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000300)=@attr_other={0x0, 0x4, 0x1, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000c00000/0x400000)=nil, 0x930, 0x1, 0x4f832, 0xffffffffffffffff, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xe})
ioctl$KVM_SET_DEVICE_ATTR_vcpu(r2, 0x4018aee1, &(0x7f0000000180)=@attr_pmu_filter={0x0, 0x0, 0x2, &(0x7f0000000140)={0x8, 0xffff, 0x1}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x8800, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0xf0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x4, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x4, 0x0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0x80111500, 0x20000000)
close(r1)

      
      r0 = openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000200)={0x1fe, 0x0, 0xdddd0000, 0x2000, &(0x7f0000e41000/0x2000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CLEAR_DIRTY_LOG(r1, 0xc018aec0, &(0x7f00000001c0)={0x10000, 0x1c0, 0x0, 0x0})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0xffffff7f, 0xfdef)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000200), 0x0, 0x0)
close(0x3)
eventfd2(0x0, 0x80000)
ioctl$KVM_CREATE_VM(r0, 0x401c5820, 0x20000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SIGNAL_MSI(r1, 0x4020aea5, 0x0)
ioctl$KVM_CHECK_EXTENSION_VM(r1, 0xae03, 0x58)

      
      ioctl$KVM_IRQFD(0xffffffffffffffff, 0x4020ae76, &(0x7f0000000140)={0xffffffffffffffff, 0xc8})
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_GUEST_DEBUG(0xffffffffffffffff, 0x4208ae9b, &(0x7f00000001c0)={0x38003, 0x0, [0x4000100000000, 0x4, 0xfffffffffffffffe, 0x4, 0x37e7, 0x6, 0x100005, 0xf369]})
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000140)={0x4, <r2=>0xffffffffffffffff, 0x1})
write$eventfd(r2, &(0x7f00000001c0)=0xffffff7f, 0xe80)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2901, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f00000000c0)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000100)={0x3, 0x1000, 0x2}})
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
syz_kvm_setup_cpu$arm64(r1, r2, &(0x7f0000c00000/0x400000)=nil, &(0x7f0000000140)=[{0x0, &(0x7f00000001c0)=[@smc={0x1e, 0x40, {0x84000052, [0x4fe74cfc, 0x278, 0xfffffffffffffff8, 0x0, 0xa]}}], 0x40}], 0x1, 0x0, 0x0, 0x0)
ioctl$KVM_RUN(r2, 0xae80, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x20200, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x360)
syz_kvm_vgic_v3_setup(r1, 0x0, 0x80)
ioctl$KVM_SET_VCPU_EVENTS(r2, 0x4040aea0, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_GET_REG_LIST(r2, 0x4020aeae, &(0x7f00000003c0)=ANY=[@ANYBLOB="05000000000000000000000000000082"])

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000180), 0x80180, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0x5450, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
close(r1)
r4 = eventfd2(0x0, 0x0)
close(r1)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
write$eventfd(r4, &(0x7f0000000180)=0x5, 0xfffffe09)
munmap(&(0x7f0000470000/0x400000)=nil, 0xe06500)
r5 = mmap$KVM_VCPU(&(0x7f0000fed000/0x3000)=nil, 0x930, 0x3000002, 0x12, r1, 0x0)
syz_memcpy_off$KVM_EXIT_MMIO(r5, 0x20, &(0x7f0000000000)="7cfaa2bfd6dd76375aa1bde04fceeb33743b07d73b3e9aac", 0x0, 0x18)

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=[@its_setup={0x82, 0x28, {0x1, 0x3, 0x2}}, @its_send_cmd={0xaa, 0x28, {0x5, 0x5, 0x0, 0x10000004, 0x0, 0x2}}], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x101041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION2(r1, 0x40a0ae49, &(0x7f0000000240)={0x10000, 0x7, 0xffff1000, 0x1000, &(0x7f0000e91000/0x1000)=nil, 0x400})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xa})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000040)=@arm64_ccsidr={0x6020000000110005, 0xffffffffffffffff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = eventfd2(0x0, 0x1)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f0000000100)={0xd000, 0x10000, 0x0, r2})
ioctl$KVM_REGISTER_COALESCED_MMIO(r1, 0x4010ae67, &(0x7f00000000c0)={0x5000, 0x5000})
close(0x4)

      
      openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
munmap(&(0x7f0000ec1000/0x3000)=nil, 0x3000)
munmap(&(0x7f000000f000/0x2000)=nil, 0x2000)
munmap(&(0x7f000075a000/0xb000)=nil, 0xb000)
munmap(&(0x7f0000c90000/0x1000)=nil, 0x1000)
munmap(&(0x7f0000ece000/0x2000)=nil, 0x2000)
munmap(&(0x7f0000e76000/0x12000)=nil, 0x12000)
munmap(&(0x7f0000f40000/0x5000)=nil, 0x5000)
munmap(&(0x7f0000ff5000/0x1000)=nil, 0x1000)
mmap$KVM_VCPU(&(0x7f0000ec1000/0x1000)=nil, 0x930, 0x100000f, 0x9032, 0xffffffffffffffff, 0x0)

      
      syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000200)="fb014401ac2cc4a2c0a6000000faff00bfffffffffffffffffffde00000000faffffff00000d00e6ffea000000002000", 0x0, 0xfffffffffffffe78)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r2, 0x0)
openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x44200, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000140), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000040)={0x1, 0x1, 0x5000, 0x2000, &(0x7f0000fa2000/0x2000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x26e8, 0x0, 0x0, 0x2000, &(0x7f0000ffe000/0x2000)=nil})
syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000936000/0x400000)=nil)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f00000002c0)={0x7, <r2=>0xffffffffffffffff, 0x1})
mmap$KVM_VCPU(&(0x7f00009fb000/0x4000)=nil, 0x0, 0x1, 0x12, r2, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f00000001c0), 0x100, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000000, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, &(0x7f0000000140), 0x0, 0x0)
r4 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
r5 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r4, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r4, 0x0)
r6 = eventfd2(0x0, 0x0)
mmap$KVM_VCPU(&(0x7f000002d000/0x1000)=nil, r1, 0x0, 0x32e7851d6de9e532, 0xffffffffffffffff, 0x0)
close(r6)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
write$eventfd(r6, &(0x7f0000000180)=0x5, 0xfffffde3)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r4 = syz_kvm_add_vcpu$arm64(r2, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x2, 0x3c0)
mmap$KVM_VCPU(&(0x7f0000c5b000/0x1000)=nil, 0x0, 0x2000006, 0x110, 0xffffffffffffffff, 0x0)
ioctl$KVM_RUN(r4, 0xae80, 0x0)
syz_kvm_add_vcpu$arm64(r3, &(0x7f0000000080)={0x0, 0x0}, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)

      
      ioctl$KVM_REGISTER_COALESCED_MMIO(0xffffffffffffffff, 0x4010ae67, 0x0)
r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_SET_REGS(r3, 0x4360ae82, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r0, 0xae04)
mmap$KVM_VCPU(&(0x7f00006b4000/0x3000)=nil, r1, 0x300000f, 0x32, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
r2 = syz_kvm_vgic_v3_setup(r1, 0x1, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000180)=@attr_arm64={0x0, 0x1, 0x0, &(0x7f0000000200)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000080), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r2 = syz_kvm_vgic_v3_setup(r1, 0x4, 0x40)
ioctl$KVM_SET_DEVICE_ATTR(r2, 0x4018aee1, &(0x7f0000000000)=@attr_other={0x0, 0x1, 0x104, &(0x7f0000000180)=0x3})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0xbffffffe, 0x2, 0x2}})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x3, 0x4102932, 0xffffffffffffffff, 0x0)
openat$kvm(0x0, 0x0, 0x0, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000180)="fb0149dd033be3073da85cac1648f1e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76d869d2855c7f3200", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a17f2, 0x0)
ioctl$KVM_CREATE_VM(r4, 0x80086601, 0x10000000000000)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000080)={0x0, 0x1, 0xcccc0000, 0x1000, &(0x7f0000fff000/0x1000)=nil})
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000100)={0x0, 0x1, 0x10000000000, 0x1000, &(0x7f0000fff000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x2000005, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)
mmap$KVM_VCPU(&(0x7f0000027000/0x13000)=nil, 0x930, 0x2000009, 0x4102932, 0xffffffffffffffff, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x8040aeb6, &(0x7f0000000140)=@attr_other={0x0, 0x8, 0x2, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x40c02, 0x0)
r3 = ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r4 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r5 = ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r4, 0xae01, 0x0)
ioctl$KVM_CHECK_EXTENSION(r5, 0xae03, 0xc3)
r6 = ioctl$KVM_CREATE_VCPU(r3, 0xae41, 0x0)
ioctl$KVM_GET_VCPU_EVENTS(r6, 0x8040ae9f, 0xffffffffffffffff)
ioctl$KVM_SET_VCPU_EVENTS(r6, 0x4040aea0, &(0x7f0000000000)=@arm64={0x7, 0x5, 0x0, '\x00', 0x1})
munmap(&(0x7f00000be000/0x1000)=nil, 0xffffffffdff41fff)
r7 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r8 = ioctl$KVM_CREATE_VM(r7, 0xae01, 0x0)
r9 = ioctl$KVM_CREATE_VCPU(r8, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r9, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_SET_ONE_REG(r9, 0x4010aeac, &(0x7f0000000140)=@arm64_sys={0x603000000013c2b0, &(0x7f0000000000)=0x8000000000000000})
r10 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r11 = syz_kvm_add_vcpu$arm64(r10, &(0x7f0000000080)={0x0, &(0x7f00000001c0)=ANY=[@ANYRESOCT=r1, @ANYRESHEX=r10, @ANYBLOB="f82cf9188e6de57527c965f7ff8f4a460f3bd22df7d852642d7078b99478b4566febae30c8dfb66690644ef3e477c6470cb2502547b9d7a45a269ebc72a7f928663a064fb9eaac6817f26c986a400bb3df605e329271b8297bbe0b1d3f7723bafd99accd92d24859c002731614b84ced72f6089b887530c8a3704928173ec2898708598f406287fedd0222c370b5a3eaaa1b6db2206dfad65ce09a64aa0908a73fadd81d124f0512171a88a687fa71354453118a31e8d89c09efbeb5"], 0x28}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000100)={0x8, <r12=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r12, 0x4018aee1, &(0x7f0000000140)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000180)=0x8080000})
ioctl$KVM_RUN(r11, 0xae80, 0x0)
munmap(&(0x7f0000800000/0x800000)=nil, 0x800000)

      
      mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x0, 0x3c2a1c3178cda732, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x2002, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="82000000000000002800000000000000010000000000000001000000000000001100000000000000aa00000000000000280000000000000009"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x1)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000000)=@arm64_fw={0x6030000000140000, &(0x7f0000000100)=0x5})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_DEVICE_ATTR_vm(r1, 0x4018aee1, &(0x7f0000000200)=@attr_arm64={0x0, 0x0, 0x0, &(0x7f0000000080)={0xc0000000, 0x10001, 0x2}})

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x0, 0x0)
ioctl$KVM_CHECK_EXTENSION(r0, 0xae03, 0x59)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CAP_DIRTY_LOG_RING_ACQ_REL(r1, 0x4068aea3, &(0x7f0000000300)={0xdf, 0x0, 0x8000})
ioctl$KVM_RESET_DIRTY_RINGS(r1, 0xaec7)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000100), 0x101080, 0x0)
ioctl$KVM_CREATE_VM(r2, 0xae01, 0x0)
r3 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r4 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r3, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000080)="fb0149dd033be3ac2cc4a29ea6abf4e7454e37c4b85400005a9610fbff67521ce16f8f1f449a7a835673312b54ebb2aa76c869d22627e700", 0x0, 0x29)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1, 0x11, r3, 0x0)
r5 = openat$kvm(0xffffff9c, &(0x7f0000000040), 0x1a3ef2, 0x0)
ioctl$KVM_CREATE_VM(r5, 0x40086602, 0x20000000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
close(0xffffffffffffffff)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(0xffffffffffffffff, &(0x7f0000000000), 0xfffffe1e)
r4 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r4, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r5 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
ioctl$KVM_CREATE_VM(r5, 0xae01, 0x0)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000040), 0x20040, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_SET_USER_MEMORY_REGION(r1, 0x4020ae46, &(0x7f0000000400)={0x0, 0x0, 0x0, 0x20002000, &(0x7f0000000000/0x2000)=nil})
ioctl$KVM_GET_DIRTY_LOG(r1, 0x4010ae42, &(0x7f0000000300)={0x0, 0x0, &(0x7f0000010000/0x1000)=nil})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0x19})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000140)=@arm64_fp={0x60400000001000b0, 0x0})

      
      r0 = openat$kvm(0x0, &(0x7f0000000240), 0xca680, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = syz_kvm_setup_syzos_vm$arm64(r1, &(0x7f0000c00000/0x400000)=nil)
r3 = syz_kvm_add_vcpu$arm64(r2, &(0x7f00000000c0)={0x0, &(0x7f0000000100)=ANY=[@ANYBLOB="820000000000000028000000000000000100000000000000040000000000000002000000000000008200000000000000280000000000000004"], 0x50}, 0x0, 0x0)
syz_kvm_vgic_v3_setup(r1, 0x1, 0x100)
ioctl$KVM_CREATE_DEVICE(r1, 0xc00caee0, &(0x7f0000000180)={0x8, <r4=>0xffffffffffffffff})
ioctl$KVM_SET_DEVICE_ATTR(r4, 0x4018aee1, &(0x7f00000001c0)=@attr_arm64={0x0, 0x0, 0x4, &(0x7f0000000200)=0x8080000})
ioctl$KVM_RUN(r3, 0xae80, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x800, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
r2 = syz_kvm_vgic_v3_setup(r1, 0x9, 0x140)
ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x100)
ioctl$KVM_GET_DEVICE_ATTR(r2, 0x4018aee2, &(0x7f00000000c0)=@attr_other={0x0, 0x1, 0xfff, &(0x7f0000000000)=0x7ff})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
mmap$KVM_VCPU(&(0x7f0000001000/0x2000)=nil, 0x930, 0x2000003, 0x4120932, 0xffffffffffffffff, 0x0)
ioctl$KVM_IOEVENTFD(r1, 0x4040ae79, &(0x7f00000000c0)={0x7, 0x2000, 0x1, 0xffffffffffffffff, 0xe6598e131c2c8028})

      
      r0 = openat$kvm(0x0, &(0x7f00000000c0), 0x909483, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x4)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5, 0xb})
ioctl$KVM_SET_ONE_REG(r2, 0x4010aeac, &(0x7f0000000180)=@arm64_fw={0x6030000000140003, &(0x7f0000000000)=0x4})

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x0, 0x0)
r3 = openat$kvm(0xffffffffffffff9c, &(0x7f0000000000), 0x2041, 0x0)
r4 = ioctl$KVM_CREATE_VM(r3, 0xae01, 0x0)
r5 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r2, 0xae04)
r6 = openat$kvm(0x0, &(0x7f0000000040), 0x20000, 0x0)
r7 = ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
r8 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r9 = ioctl$KVM_GET_VCPU_MMAP_SIZE(r8, 0xae04)
r10 = ioctl$KVM_CREATE_VCPU(r4, 0xae41, 0x2)
mmap$KVM_VCPU(&(0x7f0000000000/0x3000)=nil, r9, 0x100000a, 0x12, r10, 0x100000)
mmap$KVM_VCPU(&(0x7f0000ffe000/0x1000)=nil, r9, 0x1, 0x11, r10, 0x0)
r11 = ioctl$KVM_CREATE_VCPU(r7, 0xae41, 0x1)
mmap$KVM_VCPU(&(0x7f0000ffd000/0x2000)=nil, r5, 0x100000f, 0x12, r11, 0x0)
mmap$KVM_VCPU(&(0x7f0000ffa000/0x4000)=nil, r5, 0x0, 0x2012, r11, 0x0)
r12 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
mmap$KVM_VCPU(&(0x7f0000ead000/0x3000)=nil, r5, 0x2800007, 0x11, r11, 0x0)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r12, 0x0)
munmap(&(0x7f0000ffb000/0x2000)=nil, 0x2000)

      
      r0 = openat$kvm(0x0, &(0x7f0000000040), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
munmap(&(0x7f0000ff9000/0x3000)=nil, 0x3000)
ioctl$KVM_CREATE_VM(0xffffffffffffffff, 0x5452, 0x2000fdfd)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x0)
r3 = mmap$KVM_VCPU(&(0x7f0000009000/0x1000)=nil, 0x930, 0x280000f, 0x11, r2, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r3, 0x20, &(0x7f0000000240)="fb0149dd033be3ac2cc4a29ea6ab8031d1dfd92f00000000010000005a9610fbff67521cd66f8f1f447d3570707cd24b7eebb20700000000000000000000000100", 0x0, 0x48)
mmap$KVM_VCPU(&(0x7f0000000000/0xa000)=nil, 0x930, 0x1000001, 0x11, r2, 0x0)
r4 = eventfd2(0x0, 0x0)
close(r4)
openat$kvm(0xffffff9c, &(0x7f0000000040), 0xa00f2, 0x0)
write$eventfd(r4, &(0x7f0000000000), 0xfffffe1e)
r5 = mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x3000003, 0x28031, 0xffffffffffffffff, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(r5, 0x20, &(0x7f0000000240)="fb4149dd033be3ac2cc4a22332fdaa8de0418df24200000000a6ab8031d1dfd92f0000000001ffffffff9610fbff77521ce10d8f6b69d22627e700", 0x0, 0xffffffffffffffca)
r6 = openat$kvm(0xffffffffffffff9c, 0x0, 0x0, 0x0)
ioctl$KVM_CREATE_VM(r6, 0xae01, 0x0)
syz_memcpy_off$KVM_EXIT_HYPERCALL(0x0, 0x20, 0x0, 0x0, 0x0)
munmap(&(0x7f0000d83000/0x4000)=nil, 0x4000)
mmap$KVM_VCPU(&(0x7f0000000000/0x14000)=nil, 0x930, 0x0, 0x5c1fd1b656592f1, 0xffffffffffffffff, 0x0)

      
      r0 = openat$kvm(0x0, &(0x7f0000000000), 0x0, 0x0)
r1 = ioctl$KVM_CREATE_VM(r0, 0xae01, 0x0)
r2 = ioctl$KVM_CREATE_VCPU(r1, 0xae41, 0x2)
ioctl$KVM_ARM_VCPU_INIT(r2, 0x4020aeae, &(0x7f0000000080)={0x5})
ioctl$KVM_GET_ONE_REG(r2, 0x4010aeab, &(0x7f0000000180)=@arm64_core={0x6030000000100046, &(0x7f0000000100)=0xc74d})

srcu_lock_acquire---of 2
srcu_lock_release---of 2
tomoyo_bprm_check_security---of 4
tomoyo_bprm_committed_creds---of 3
tomoyo_cred_prepare---of 5
tomoyo_domain40%of 5
tomoyo_file_fcntl---of 7
tomoyo_file_ioctl100%of 1
tomoyo_file_open34%of 6
tomoyo_file_truncate100%of 1
tomoyo_inode_getattr---of 1
tomoyo_path_chmod---of 1
tomoyo_path_chown---of 6
tomoyo_path_chroot---of 1
tomoyo_path_link---of 1
tomoyo_path_mkdir---of 1
tomoyo_path_mknod50%of 4
tomoyo_path_rename---of 4
tomoyo_path_rmdir---of 1
tomoyo_path_symlink---of 1
tomoyo_path_truncate---of 1
tomoyo_path_unlink---of 1
tomoyo_sb_mount---of 1
tomoyo_sb_pivotroot---of 1
tomoyo_sb_umount---of 1
tomoyo_socket_bind---of 1
tomoyo_socket_connect---of 1
tomoyo_socket_listen---of 1
tomoyo_socket_sendmsg---of 1
tomoyo_task_alloc---of 3
tomoyo_task_free---of 7
-----------
SUMMARY48%of 17

interval_tree_augment_rotate100%of 5
interval_tree_insert100%of 7
interval_tree_iter_first100%of 10
interval_tree_iter_next100%of 14
interval_tree_remove78%of 36
interval_tree_span_iter_advance---of 8
interval_tree_span_iter_first---of 48
interval_tree_span_iter_next---of 25
-----------
SUMMARY89%of 72

-----------
SUMMARY---of 0

__post_watch_notification---of 38
add_watch_to_object---of 39
free_watch---of 8
get_watch_queue---of 11
init_watch---of 1
post_one_notification---of 48
put_watch_queue---of 12
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remove_watch_from_object---of 56
watch_queue_clear---of 47
watch_queue_init---of 3
watch_queue_pipe_buf_release---of 3
watch_queue_set_filter7%of 31
watch_queue_set_size11%of 19
-----------
SUMMARY8%of 50

NF_HOOK---of 20
__in_dev_get_rcu---of 6
ip_call_ra_chain---of 26
ip_list_rcv---of 14
ip_local_deliver---of 4
ip_local_deliver_finish---of 19
ip_protocol_deliver_rcu---of 44
ip_rcv67%of 3
ip_rcv_core20%of 31
ip_rcv_finish---of 13
ip_rcv_finish_core---of 85
ip_sublist_rcv---of 59
rcu_lock_acquire---of 2
rcu_lock_release---of 2
skb_dst---of 5
-----------
SUMMARY24%of 34

dsa_bridge_mtu_normalization---of 23
dsa_bridge_prechangelower_sanity_check---of 15
dsa_enqueue_skb---of 5
dsa_foreign_dev_check---of 14
dsa_hw_port_list_set_mtu---of 12
dsa_port_phylink_mac_change---of 9
dsa_user_add_cls_matchall_mirred---of 30
dsa_user_change_conduit---of 26
dsa_user_change_mtu---of 20
dsa_user_change_rx_flags---of 15
dsa_user_changeupper---of 21
dsa_user_clear_vlan---of 3
dsa_user_close---of 1
dsa_user_create---of 52
dsa_user_dcbnl_get_apptrust---of 3
dsa_user_dcbnl_ieee_delapp---of 28
dsa_user_dcbnl_ieee_setapp---of 29
dsa_user_dcbnl_set_apptrust---of 3
dsa_user_destroy---of 4
dsa_user_dev_check---of 1
dsa_user_fdb_dump---of 1
dsa_user_fdb_event---of 33
dsa_user_fill_forward_path---of 4
dsa_user_get_drvinfo---of 1
dsa_user_get_eee---of 5
dsa_user_get_eeprom---of 3
dsa_user_get_eeprom_len---of 6
dsa_user_get_eth_ctrl_stats---of 3
dsa_user_get_eth_mac_stats---of 3
dsa_user_get_eth_phy_stats---of 3
dsa_user_get_ethtool_stats---of 6
dsa_user_get_iflink---of 4
dsa_user_get_link_ksettings---of 1
dsa_user_get_mm---of 3
dsa_user_get_mm_stats---of 3
dsa_user_get_pause_stats---of 3
dsa_user_get_pauseparam---of 1
dsa_user_get_regs---of 3
dsa_user_get_regs_len---of 3
dsa_user_get_rmon_stats---of 3
dsa_user_get_rxnfc---of 3
dsa_user_get_sset_count---of 6
dsa_user_get_stats64---of 3
dsa_user_get_strings---of 5
dsa_user_get_ts_info---of 3
dsa_user_get_ts_stats---of 3
dsa_user_get_wol---of 3
dsa_user_host_uc_install---of 15
dsa_user_host_uc_uninstall---of 10
dsa_user_ioctl---of 6
dsa_user_lag_prechangeupper---of 17
dsa_user_manage_vlan_filtering---of 5
dsa_user_mii_bus_init---of 1
dsa_user_net_selftest---of 3
dsa_user_netdevice_event3%of 139
dsa_user_netpoll_cleanup---of 3
dsa_user_netpoll_setup---of 7
dsa_user_nway_reset---of 1
dsa_user_open---of 8
dsa_user_phy_read---of 3
dsa_user_phy_write---of 3
dsa_user_phylink_fixed_state---of 1
dsa_user_poll_controller---of 1
dsa_user_port_attr_set---of 43
dsa_user_port_fdb_do_dump---of 11
dsa_user_port_obj_add---of 33
dsa_user_port_obj_del---of 31
dsa_user_prechangeupper---of 8
dsa_user_register_notifier---of 5
dsa_user_restore_vlan---of 3
dsa_user_resume---of 3
dsa_user_set_eee---of 13
dsa_user_set_eeprom---of 3
dsa_user_set_link_ksettings---of 1
dsa_user_set_mac_address---of 9
dsa_user_set_mm---of 3
dsa_user_set_pauseparam---of 1
dsa_user_set_rx_mode---of 1
dsa_user_set_rxnfc---of 3
dsa_user_set_wol---of 3
dsa_user_setup_tagger---of 8
dsa_user_setup_tc---of 31
dsa_user_setup_tc_block_cb---of 43
dsa_user_setup_tc_block_cb_eg---of 1
dsa_user_setup_tc_block_cb_ig---of 1
dsa_user_standalone_event_work---of 10
dsa_user_suspend---of 3
dsa_user_switchdev_blocking_event---of 5
dsa_user_switchdev_event---of 5
dsa_user_switchdev_event_work---of 12
dsa_user_sync_ha---of 19
dsa_user_sync_mc---of 15
dsa_user_sync_uc---of 15
dsa_user_unregister_notifier---of 7
dsa_user_unsync_ha---of 19
dsa_user_unsync_mc---of 15
dsa_user_unsync_uc---of 15
dsa_user_vlan_rx_add_vid---of 34
dsa_user_vlan_rx_kill_vid---of 32
dsa_user_xmit---of 19
local_bh_disable---of 2
-----------
SUMMARY3%of 139

__arm64_compat_sys_fcntl---of 8
__arm64_compat_sys_fcntl64---of 1
__arm64_sys_fcntl---of 9
__f_setown---of 11
_inline_copy_from_user---of 8
_inline_copy_to_user---of 7
check_fcntl_cmd---of 9
do_compat_fcntl64---of 31
do_fcntl---of 115
f_delown---of 3
f_getown---of 14
f_setown---of 18
fasync_alloc---of 1
fasync_free---of 1
fasync_helper---of 5
fasync_insert_entry---of 5
fasync_remove_entry---of 6
file_f_owner_allocate---of 7
file_f_owner_release67%of 3
get_compat_flock---of 3
kill_fasync7%of 31
put_compat_flock---of 7
put_compat_flock64---of 7
rcu_lock_acquire---of 2
rcu_lock_release---of 2
send_sigio---of 17
send_sigio_to_task---of 9
send_sigurg---of 21
sigio_perm---of 21
-----------
SUMMARY12%of 34

fsnotify_change25%of 12
inode_newsize_ok---of 7
may_setattr---of 9
notify_change34%of 50
setattr_copy30%of 24
setattr_prepare19%of 38
setattr_should_drop_sgid---of 4
setattr_should_drop_suidgid78%of 9
try_break_deleg29%of 7
-----------
SUMMARY31%of 140

__delete_from_swap_cache---of 17
__read_swap_cache_async---of 21
add_to_swap_cache---of 41
clear_shadow_from_swap_cache---of 22
delete_from_swap_cache---of 6
exit_swap_address_space---of 6
filemap_get_incore_folio---of 7
folio_large_mapcount---of 4
free_page_and_swap_cache---of 9
free_pages_and_swap_cache75%of 12
free_swap_cache20%of 15
get_shadow_from_swap_cache---of 1
init_swap_address_space---of 7
put_swap_device---of 18
rcu_lock_acquire---of 2
rcu_lock_release---of 2
read_swap_cache_async---of 7
show_swap_cache_info---of 1
swap_cache_get_folio---of 17
swap_cluster_readahead---of 27
swapin_readahead---of 52
vma_ra_enabled_show---of 1
vma_ra_enabled_store---of 1
xas_next---of 10
-----------
SUMMARY45%of 27

arch_cpu_is_hotpluggable---of 4
arch_match_cpu_phys_id---of 1
arm64_panic_block_dump---of 1
cpu_logical_map100%of 1
cpu_uninstall_idmap---of 6
efi_enabled---of 1
local_daif_restore---of 1
-----------
SUMMARY100%of 1

__arm64_compat_sys_rt_sigaction---of 48
__arm64_compat_sys_rt_sigpending---of 10
__arm64_compat_sys_rt_sigprocmask---of 15
__arm64_compat_sys_rt_sigqueueinfo---of 5
__arm64_compat_sys_rt_sigsuspend---of 4
__arm64_compat_sys_rt_sigtimedwait_time32---of 8
__arm64_compat_sys_rt_sigtimedwait_time64---of 8
__arm64_compat_sys_rt_tgsigqueueinfo---of 6
__arm64_compat_sys_sigaction---of 53
__arm64_compat_sys_sigaltstack---of 1
__arm64_compat_sys_sigpending---of 7
__arm64_sys_kill---of 15
__arm64_sys_pause---of 5
__arm64_sys_pidfd_send_signal---of 17
__arm64_sys_restart_syscall---of 1
__arm64_sys_rt_sigaction---of 14
__arm64_sys_rt_sigpending---of 10
__arm64_sys_rt_sigprocmask---of 16
__arm64_sys_rt_sigqueueinfo---of 18
__arm64_sys_rt_sigsuspend---of 4
__arm64_sys_rt_sigtimedwait---of 8
__arm64_sys_rt_sigtimedwait_time32---of 8
__arm64_sys_rt_tgsigqueueinfo---of 19
__arm64_sys_sigaltstack---of 11
__arm64_sys_sigpending---of 7
__arm64_sys_sigprocmask---of 15
__arm64_sys_sigsuspend---of 1
__arm64_sys_tgkill---of 3
__arm64_sys_tkill---of 3
__compat_save_altstack---of 19
__copy_siginfo_to_user32---of 7
__dequeue_signal---of 15
__kill_pgrp_info---of 5
__lock_task_sighand---of 19
__probestub_signal_deliver---of 1
__probestub_signal_generate---of 1
__save_altstack37%of 19
__send_signal_locked18%of 46
__set_current_blocked50%of 6
__sigqueue_free---of 4
__traceiter_signal_deliver---of 4
__traceiter_signal_generate---of 4
_inline_copy_from_user---of 8
arch_vma_name---of 1
calculate_sigpending---of 3
cgroup_threadgroup_change_end---of 14
check_kill_permission---of 21
class_rcu_constructor---of 6
class_rcu_destructor---of 7
clear_siginfo---of 1
clear_ti_thread_flag67%of 3
compat_restore_altstack---of 6
complete_signal13%of 58
copy_siginfo100%of 1
copy_siginfo_from_user---of 15
copy_siginfo_from_user32---of 3
copy_siginfo_to_external32---of 26
copy_siginfo_to_user34%of 12
dequeue_signal---of 13
do_compat_sigaltstack---of 11
do_freezer_trap---of 10
do_jobctl_trap---of 8
do_no_restart_syscall---of 1
do_notify_parent---of 61
do_notify_parent_cldstop---of 50
do_notify_pidfd---of 3
do_pidfd_send_signal---of 19
do_send_sig_info---of 3
do_send_specific---of 20
do_sigaction---of 33
do_sigaltstack---of 18
do_signal_stop---of 48
do_sigtimedwait---of 13
exit_signals---of 20
flush_itimer_signals---of 15
flush_signal_handlers---of 6
flush_signals---of 3
flush_sigqueue---of 9
flush_sigqueue_mask---of 15
force_exit_sig---of 1
force_fatal_sig---of 1
force_sig---of 1
force_sig_bnderr---of 1
force_sig_fault100%of 1
force_sig_fault_to_task---of 1
force_sig_fault_trapno---of 1
force_sig_info---of 1
force_sig_info_to_task24%of 17
force_sig_mceerr---of 3
force_sig_pkuerr---of 1
force_sig_ptrace_errno_trap---of 1
force_sig_seccomp---of 1
force_sigsegv---of 3
get_signal18%of 80
group_send_sig_info---of 15
ignore_signals---of 4
kernel_sigaction---of 3
kill_pgrp---of 5
kill_pid---of 1
kill_pid_info---of 1
kill_pid_info_type---of 15
kill_pid_usb_asyncio---of 26
kill_proc_info---of 11
lockdep_assert_task_sighand_held---of 20
next_signal---of 3
perf_trace_signal_deliver---of 8
perf_trace_signal_generate---of 8
posixtimer_get_target---of 5
posixtimer_init_sigqueue---of 3
posixtimer_putref---of 16
posixtimer_queue_sigqueue---of 7
posixtimer_send_sigqueue---of 46
posixtimer_sig_ignore---of 5
post_copy_siginfo_from_user32---of 26
prepare_kill_siginfo---of 1
prepare_signal16%of 33
print_dropped_signal---of 4
print_fatal_signal---of 5
ptrace_notify---of 4
ptrace_signal---of 32
ptrace_stop---of 37
ptrace_trap_notify---of 20
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
recalc_sigpending34%of 12
restore_altstack---of 13
retarget_shared_pending---of 18
send_sig---of 4
send_sig_fault---of 4
send_sig_fault_trapno---of 4
send_sig_info---of 4
send_sig_mceerr---of 5
send_sig_perf---of 3
send_signal_locked5%of 46
set_compat_user_sigmask---of 6
set_current_blocked---of 1
set_user_sigmask---of 6
sig_get_ucounts25%of 29
sigaction_compat_abi---of 1
siginfo_layout---of 10
signal_setup_done50%of 10
signal_wake_up_state---of 8
sigprocmask58%of 7
sigsuspend---of 7
task_clear_jobctl_pending---of 4
task_clear_jobctl_trapping---of 3
task_join_group_stop---of 10
task_participate_group_stop---of 12
task_set_jobctl_pending---of 9
trace_event_raw_event_signal_deliver---of 9
trace_event_raw_event_signal_generate---of 9
trace_raw_output_signal_deliver---of 3
trace_raw_output_signal_generate---of 3
trace_signal_deliver24%of 17
trace_signal_generate24%of 17
unhandled_signal---of 6
zap_other_threads---of 17
-----------
SUMMARY22%of 418

__probestub_notifier_register---of 1
__probestub_notifier_run---of 1
__probestub_notifier_unregister---of 1
__traceiter_notifier_register---of 4
__traceiter_notifier_run---of 4
__traceiter_notifier_unregister---of 4
atomic_notifier_call_chain---of 11
atomic_notifier_call_chain_is_empty---of 1
atomic_notifier_chain_register---of 1
atomic_notifier_chain_register_unique_prio---of 1
atomic_notifier_chain_unregister---of 1
blocking_notifier_call_chain---of 3
blocking_notifier_call_chain_robust---of 4
blocking_notifier_chain_register---of 3
blocking_notifier_chain_register_unique_prio---of 3
blocking_notifier_chain_unregister---of 3
notifier_call_chain29%of 25
notifier_chain_register---of 23
notifier_chain_unregister---of 22
notify_die---of 6
perf_trace_notifier_info---of 6
raw_notifier_call_chain100%of 1
raw_notifier_call_chain_robust---of 3
raw_notifier_chain_register---of 1
raw_notifier_chain_unregister---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
register_die_notifier---of 1
srcu_init_notifier_head---of 3
srcu_lock_acquire---of 2
srcu_lock_release---of 2
srcu_notifier_call_chain---of 3
srcu_notifier_chain_register---of 3
srcu_notifier_chain_unregister---of 3
trace_event_raw_event_notifier_info---of 7
trace_raw_output_notifier_info---of 3
unregister_die_notifier---of 1
-----------
SUMMARY31%of 26

NF_HOOK---of 25
__ipv6_dev_mc_dec---of 13
__ipv6_dev_mc_inc---of 41
__ipv6_sock_mc_close---of 24
__ipv6_sock_mc_join---of 34
add_grec---of 98
add_grhead---of 6
dst_output---of 5
igmp6_cleanup---of 1
igmp6_event_query---of 16
igmp6_event_report---of 16
igmp6_group_added45%of 18
igmp6_group_dropped---of 53
igmp6_group_queried---of 16
igmp6_join_group---of 22
igmp6_late_cleanup---of 1
igmp6_mc_get_next---of 27
igmp6_mc_seq_next---of 1
igmp6_mc_seq_show---of 3
igmp6_mc_seq_start---of 26
igmp6_mc_seq_stop---of 8
igmp6_mcf_get_first---of 24
igmp6_mcf_get_next---of 40
igmp6_mcf_seq_next---of 3
igmp6_mcf_seq_show---of 3
igmp6_mcf_seq_start---of 12
igmp6_mcf_seq_stop---of 10
igmp6_net_exit---of 5
igmp6_net_init---of 12
igmp6_send---of 65
in6_dev_get---of 22
inet6_ifmcaddr_notify---of 5
inet6_mc_check---of 43
ip6_mc_add_src---of 58
ip6_mc_clear_src---of 27
ip6_mc_del1_src---of 37
ip6_mc_del_src---of 39
ip6_mc_find_dev_rtnl---of 13
ip6_mc_leave_src---of 14
ip6_mc_msfget---of 40
ip6_mc_msfilter---of 43
ip6_mc_source---of 59
ipv6_chk_mcast_addr---of 48
ipv6_dev_mc_dec---of 12
ipv6_dev_mc_inc---of 1
ipv6_mc_dad_complete---of 18
ipv6_mc_destroy_dev---of 20
ipv6_mc_down---of 39
ipv6_mc_init_dev---of 10
ipv6_mc_netdev_event10%of 33
ipv6_mc_remap---of 1
ipv6_mc_unmap---of 14
ipv6_mc_up35%of 23
ipv6_sock_mc_close---of 5
ipv6_sock_mc_drop---of 29
ipv6_sock_mc_join---of 1
ipv6_sock_mc_join_ssm---of 1
is_in---of 22
lockdep_sock_is_held---of 3
ma_put---of 12
mld_clear_delrec---of 45
mld_clear_zeros---of 25
mld_dad_work---of 23
mld_del_delrec6%of 57
mld_gq_start_work---of 7
mld_gq_work---of 6
mld_ifc_event44%of 16
mld_ifc_work---of 70
mld_in_v1_mode---of 10
mld_mca_work---of 10
mld_newpack---of 26
mld_query_work---of 160
mld_report_work---of 53
mld_send_initial_cr---of 24
mld_send_report---of 19
mld_sendpack---of 52
pskb_may_pull---of 4
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sf_markstate---of 17
sf_setstate---of 66
-----------
SUMMARY20%of 147

___ptep_set_wrprotect---of 5
__access_remote_vm---of 40
__apply_to_page_range28%of 121
__check_safe_pte_update15%of 14
__clear_bit_unlock---of 1
__do_fault43%of 19
__folio_large_mapcount_sanity_checks---of 33
__folio_rmap_sanity_checks---of 15
__get_locked_pte---of 5
__might_fault50%of 4
__p4d_alloc---of 22
__pagetable_ctor46%of 11
__pmd_alloc29%of 14
__pte_alloc29%of 7
__pte_alloc_kernel---of 9
__pud_alloc---of 11
__set_ptes60%of 10
__vma_enter_locked59%of 12
__vma_exit_locked---of 6
__vma_start_write58%of 7
__vmf_anon_prepare86%of 7
access_process_vm---of 3
access_remote_vm---of 1
add_mm_counter100%of 1
add_mm_rss_vec89%of 9
apply_to_existing_page_range---of 1
apply_to_page_range100%of 1
clear_gigantic_page---of 7
clear_subpage---of 5
copy_folio_from_user---of 22
copy_page_range---of 70
copy_pmd_range---of 415
copy_present_page---of 21
copy_remote_vm_str---of 42
copy_user_gigantic_page---of 10
copy_user_large_folio---of 14
count_memcg_event_mm28%of 22
count_mthp_stat40%of 5
do_page_mkwrite---of 10
do_set_pmd---of 32
do_swap_page---of 332
do_wp_page16%of 330
fault_around_bytes_fops_open---of 1
fault_around_bytes_get---of 1
fault_around_bytes_set---of 3
fault_dirty_shared_page43%of 26
finish_fault38%of 56
finish_mkwrite_fault---of 12
flush_tlb_page---of 5
folio_dup_file_rmap_ptes---of 38
folio_get---of 4
folio_large_mapcount---of 4
folio_lock43%of 7
folio_needs_cow_for_dma---of 6
folio_nr_pages---of 4
folio_order50%of 4
folio_prealloc39%of 13
folio_pte_batch---of 24
folio_put67%of 6
folio_ref_add67%of 3
folio_ref_sub---of 3
folio_test_large---of 3
folio_try_dup_anon_rmap_ptes---of 109
folio_trylock---of 5
folio_zero_user---of 5
follow_pfnmap_end---of 9
follow_pfnmap_start---of 53
free_pgd_range27%of 76
free_pgtables74%of 26
generic_access_phys---of 14
get_page---of 7
handle_mm_fault31%of 399
insert_page---of 20
insert_page_into_pte_locked---of 32
insert_pfn---of 34
kmap_local_page---of 5
lock_mm_and_find_vma58%of 21
lock_vma_under_rcu33%of 34
mm_counter---of 4
mm_counter_file---of 3
mm_trace_rss_stat29%of 14
mmap_read_lock_killable60%of 5
mmap_read_trylock60%of 5
mmap_read_unlock67%of 3
mmap_write_downgrade50%of 6
mmap_write_unlock50%of 6
mmu_notifier_invalidate_range_start100%of 3
numa_migrate_check---of 43
page_mapped---of 6
page_pgmap---of 6
pagetable_dtor_free47%of 15
pfn_swap_entry_folio---of 11
pfn_swap_entry_to_page---of 14
pfn_valid20%of 31
pmd_install45%of 9
print_bad_pte---of 25
print_vma_addr---of 11
process_huge_page---of 12
pte_range_none---of 6
pte_unmap34%of 6
ptep_set_access_flags---of 3
ptlock_alloc---of 3
ptlock_free---of 3
put_page---of 9
put_swap_device---of 18
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
remap_pfn_range---of 1
remap_pfn_range_notrack---of 67
remove_device_exclusive_entry---of 51
restore_exclusive_pte---of 24
set_pte_range70%of 23
test_and_set_bit_lock---of 4
tlb_flush_mmu_tlbonly30%of 58
try_restore_exclusive_pte---of 22
trylock_page---of 7
unmap_mapping_folio---of 7
unmap_mapping_pages---of 3
unmap_mapping_range100%of 3
unmap_mapping_range_tree50%of 4
unmap_page_range24%of 228
unmap_vmas78%of 18
upgrade_mmap_lock_carefully54%of 13
vm_insert_page---of 15
vm_insert_pages---of 56
vm_iomap_memory---of 8
vm_map_pages---of 6
vm_map_pages_zero---of 6
vm_mixed_zeropage_allowed---of 10
vm_normal_folio---of 4
vm_normal_folio_pmd---of 4
vm_normal_page40%of 10
vm_normal_page_pmd---of 13
vma_end_read50%of 8
vma_mark_detached34%of 15
vma_pgtable_walk_begin100%of 3
vma_pgtable_walk_end100%of 3
vma_refcount_put50%of 8
vmf_insert_mixed---of 10
vmf_insert_mixed_mkwrite---of 6
vmf_insert_page_mkwrite---of 5
vmf_insert_pfn---of 10
vmf_insert_pfn_prot---of 10
vmf_pte_changed60%of 5
walk_to_pmd---of 30
wp_page_reuse---of 27
zap_page_range_single39%of 18
zap_vma_ptes---of 6
-----------
SUMMARY32%of 1789

-----------
SUMMARY---of 0

__cpu_replace_ttbr1---of 12
__create_pgd_mapping---of 1
__pagetable_ctor---of 11
__pgd_pgtable_alloc---of 3
arch_add_memory---of 9
arch_get_mappable_range---of 3
arch_remove_memory---of 6
arch_set_user_pkey_access---of 4
create_kpti_ng_temp_pgd---of 103
free_empty_tables---of 79
free_hotplug_page_range---of 39
lpa2_is_enabled---of 1
mark_rodata_ro---of 1
p4d_clear_huge100%of 1
pgattr_change_is_safe---of 8
pgd_pgtable_alloc---of 9
phys_mem_access_prot---of 4
pmd_clear_huge50%of 4
pmd_free_pte_page---of 7
pmd_set_huge---of 13
prevent_bootmem_remove_notifier---of 15
ptep_modify_prot_commit---of 34
ptep_modify_prot_start---of 10
pud_clear_huge50%of 4
pud_free_pmd_page---of 19
pud_set_huge---of 13
rcu_lock_acquire---of 2
rcu_lock_release---of 2
unmap_hotplug_range---of 105
update_mapping_prot---of 17
vmemmap_check_pmd---of 1
vmemmap_free---of 3
vmemmap_populate---of 6
vmemmap_set_pmd---of 1
-----------
SUMMARY56%of 9

-----------
SUMMARY---of 0

ipvtap_count_rx_dropped---of 1
ipvtap_count_tx_dropped---of 4
ipvtap_dellink---of 1
ipvtap_device_event19%of 11
ipvtap_net_namespace---of 1
ipvtap_newlink---of 4
ipvtap_setup---of 1
ipvtap_update_features---of 1
-----------
SUMMARY19%of 11

-----------
SUMMARY---of 0

__cleanup_nmi---of 5
__disable_irq---of 3
__enable_irq---of 5
__free_percpu_irq---of 8
__irq_apply_affinity_hint---of 5
__irq_set_trigger---of 17
__request_percpu_irq---of 11
__setup_irq---of 85
__synchronize_irq---of 14
can_request_irq---of 6
disable_hardirq---of 10
disable_irq---of 6
disable_irq_nosync---of 4
disable_nmi_nosync---of 4
disable_percpu_irq---of 3
disable_percpu_nmi---of 3
enable_irq---of 7
enable_nmi---of 1
enable_percpu_irq---of 7
enable_percpu_nmi---of 1
free_irq---of 33
free_nmi---of 8
free_percpu_irq---of 7
free_percpu_nmi---of 5
irq_affinity_notify---of 9
irq_can_set_affinity---of 5
irq_can_set_affinity_usr---of 6
irq_check_status_bit---of 13
irq_default_primary_handler---of 1
irq_do_set_affinity---of 26
irq_finalize_oneshot---of 13
irq_force_affinity---of 3
irq_forced_secondary_handler---of 1
irq_forced_thread_fn---of 6
irq_get_irqchip_state---of 6
irq_has_action---of 13
irq_nested_primary_handler---of 1
irq_percpu_is_enabled---of 4
irq_set_affinity---of 3
irq_set_affinity_locked---of 18
irq_set_affinity_notifier---of 17
irq_set_irq_wake---of 14
irq_set_irqchip_state34%of 6
irq_set_parent---of 3
irq_set_vcpu_affinity---of 7
irq_setup_affinity---of 11
irq_thread---of 16
irq_thread_check_affinity---of 10
irq_thread_dtor---of 9
irq_thread_fn---of 4
irq_update_affinity_desc---of 9
irq_wait_for_interrupt---of 14
irq_wake_thread---of 8
local_bh_disable---of 2
local_bh_enable---of 2
prepare_percpu_nmi---of 9
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remove_percpu_irq---of 4
request_any_context_irq---of 5
request_nmi---of 16
request_percpu_nmi---of 13
request_threaded_irq---of 13
setup_irq_thread---of 9
setup_percpu_irq---of 6
synchronize_hardirq---of 7
synchronize_irq---of 3
teardown_percpu_nmi---of 8
wake_threads_waitq---of 5
wake_up_and_wait_for_irq_thread_ready---of 7
-----------
SUMMARY34%of 6

___ratelimit53%of 17
-----------
SUMMARY53%of 17

__find_rr_leaf---of 20
__ip6_del_rt---of 10
__ip6_del_rt_siblings---of 33
__ip6_route_redirect---of 42
__ip6_rt_update_pmtu---of 65
__ipv6_neigh_lookup_noref---of 12
__neigh_lookup---of 3
__probestub_fib6_table_lookup---of 1
__rt6_find_exception_rcu---of 16
__rt6_find_exception_spinlock---of 14
__rt6_nh_dev_match---of 6
__traceiter_fib6_table_lookup---of 4
addrconf_f6i_alloc---of 6
dst_discard---of 1
fib6_add_gc_list---of 10
fib6_backtrack---of 11
fib6_clean_tohost---of 21
fib6_ifdown---of 18
fib6_ifup80%of 5
fib6_info_hw_flags_set---of 14
fib6_info_nh_uses_dev---of 1
fib6_info_release---of 8
fib6_nh_age_exceptions---of 35
fib6_nh_del_cached_rt---of 10
fib6_nh_find_match---of 7
fib6_nh_flush_exceptions---of 24
fib6_nh_init---of 122
fib6_nh_mtu_change---of 31
fib6_nh_redirect_match---of 8
fib6_nh_release---of 24
fib6_nh_release_dsts---of 7
fib6_nh_remove_exception---of 9
fib6_remove_prefsrc---of 6
fib6_rt_update---of 7
fib6_select_path---of 47
fib6_table_lookup---of 35
find_match---of 54
icmp6_dst_alloc---of 12
in6_dev_get---of 22
in6_dev_put---of 6
inet6_rt_notify---of 7
inet6_rtm_delroute---of 18
inet6_rtm_getroute---of 110
inet6_rtm_newroute---of 115
ip6_blackhole_route---of 9
ip6_confirm_neigh---of 22
ip6_create_rt_rcu---of 22
ip6_default_advmss---of 16
ip6_del_rt---of 1
ip6_dst_alloc---of 4
ip6_dst_check---of 42
ip6_dst_destroy---of 23
ip6_dst_gc---of 6
ip6_dst_ifdown---of 16
ip6_dst_neigh_lookup---of 4
ip6_hold_safe---of 11
ip6_ins_rt---of 1
ip6_link_failure---of 31
ip6_mtu---of 24
ip6_mtu_from_fib6---of 18
ip6_multipath_l3_keys---of 12
ip6_negative_advice---of 14
ip6_neigh_lookup---of 23
ip6_pkt_discard---of 1
ip6_pkt_discard_out---of 5
ip6_pkt_drop---of 36
ip6_pkt_prohibit---of 1
ip6_pkt_prohibit_out---of 5
ip6_pol_route---of 81
ip6_pol_route_input---of 1
ip6_pol_route_lookup---of 56
ip6_pol_route_output---of 1
ip6_redirect---of 1
ip6_redirect_no_header---of 3
ip6_route_add---of 9
ip6_route_cleanup---of 1
ip6_route_del---of 87
ip6_route_dev_notify9%of 23
ip6_route_get_saddr---of 22
ip6_route_info_create---of 50
ip6_route_input---of 32
ip6_route_input_lookup---of 3
ip6_route_lookup---of 1
ip6_route_mpath_notify---of 17
ip6_route_net_exit---of 1
ip6_route_net_exit_late---of 1
ip6_route_net_init---of 7
ip6_route_net_init_late---of 4
ip6_route_output_flags---of 34
ip6_rt_cache_alloc---of 33
ip6_rt_copy_init---of 26
ip6_rt_update_pmtu---of 3
ip6_sk_dst_store_flow---of 29
ip6_sk_redirect---of 1
ip6_sk_update_pmtu---of 37
ip6_update_pmtu---of 6
ip_fib_metrics_put---of 7
ipv6_addr_prefix---of 6
ipv6_inetpeer_exit---of 1
ipv6_inetpeer_init---of 3
ipv6_route_ioctl---of 6
ipv6_route_sysctl_init---of 3
ipv6_route_sysctl_table_size---of 1
ipv6_sysctl_rtcache_flush---of 4
l3mdev_fib_table---of 11
local_bh_disable---of 2
local_bh_enable---of 2
neigh_release---of 6
nexthop_fib6_nh---of 17
nexthop_get---of 7
nexthop_is_blackhole---of 16
nexthop_path_fib6_result---of 9
nlmsg_parse_deprecated_strict---of 4
perf_trace_fib6_table_lookup---of 16
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
rt6_add_dflt_router---of 4
rt6_add_route_info---of 3
rt6_age_exceptions---of 3
rt6_check_expired---of 12
rt6_clean_tohost---of 1
rt6_disable_ip---of 36
rt6_do_redirect---of 56
rt6_do_update_pmtu---of 26
rt6_dump_route---of 43
rt6_fill_node---of 76
rt6_fill_node_nexthop---of 28
rt6_find_cached_rt---of 12
rt6_flush_exceptions---of 3
rt6_get_dflt_router---of 35
rt6_get_route_info---of 39
rt6_insert_exception---of 57
rt6_is_dead---of 12
rt6_lookup---of 5
rt6_mtu_change---of 1
rt6_mtu_change_route---of 11
rt6_multipath_dead_count---of 10
rt6_multipath_hash---of 61
rt6_multipath_nh_flags_set---of 8
rt6_multipath_rebalance6%of 39
rt6_nh_age_exceptions---of 1
rt6_nh_dump_exceptions---of 16
rt6_nh_find_match---of 1
rt6_nh_flush_exceptions---of 1
rt6_nh_nlmsg_size---of 3
rt6_nh_remove_exception_rt---of 1
rt6_nlmsg_size---of 20
rt6_probe_deferred---of 6
rt6_purge_dflt_routers---of 65
rt6_remove_exception---of 6
rt6_remove_exception_rt---of 10
rt6_remove_prefsrc---of 1
rt6_route_rcv---of 22
rt6_score_route---of 21
rt6_stats_seq_show---of 1
rt6_sync_down_dev---of 3
rt6_sync_up50%of 4
rt6_uncached_list_add---of 3
rt6_uncached_list_del---of 4
rtm_to_fib6_config---of 55
skb_header_pointer---of 3
trace_event_raw_event_fib6_table_lookup---of 17
trace_fib6_table_lookup---of 17
trace_raw_output_fib6_table_lookup---of 3
-----------
SUMMARY15%of 71

__activate_cptr_traps7%of 66
__deactivate_traps35%of 29
__kvm_vcpu_run10%of 265
compute_emulated_cntx_ctl_el0---of 42
kvm_hyp_handle_cp15_3250%of 4
kvm_hyp_handle_dabt_low18%of 52
kvm_hyp_handle_eret---of 51
kvm_hyp_handle_fpsimd---of 180
kvm_hyp_handle_impdef---of 4
kvm_hyp_handle_memory_fault17%of 30
kvm_hyp_handle_mops---of 7
kvm_hyp_handle_sysreg_vhe4%of 287
kvm_unexpected_el2_exception---of 5
kvm_vcpu_load_vhe6%of 373
kvm_vcpu_put_vhe22%of 69
vcpu_el2_e2h_is_set---of 11
-----------
SUMMARY9%of 1175

context_compute_hash67%of 3
-----------
SUMMARY67%of 3

__arm64_compat_sys_lseek---of 9
__arm64_compat_sys_preadv---of 8
__arm64_compat_sys_preadv2---of 9
__arm64_compat_sys_pwritev---of 8
__arm64_compat_sys_pwritev2---of 9
__arm64_compat_sys_sendfile---of 13
__arm64_compat_sys_sendfile64---of 9
__arm64_sys_copy_file_range---of 29
__arm64_sys_llseek---of 15
__arm64_sys_lseek---of 9
__arm64_sys_pread64---of 7
__arm64_sys_preadv---of 8
__arm64_sys_preadv2---of 9
__arm64_sys_pwrite64---of 7
__arm64_sys_pwritev---of 8
__arm64_sys_pwritev2---of 9
__arm64_sys_read---of 1
__arm64_sys_readv---of 1
__arm64_sys_sendfile---of 13
__arm64_sys_sendfile64---of 9
__arm64_sys_write100%of 1
__arm64_sys_writev---of 1
__kernel_read---of 22
__kernel_write---of 1
__kernel_write_iter---of 21
_inline_copy_from_user---of 8
default_llseek---of 14
do_iter_readv_writev---of 27
do_readv---of 12
do_sendfile---of 24
do_writev---of 12
fixed_size_llseek---of 3
fsnotify_access---of 9
fsnotify_modify---of 9
generic_atomic_write_valid---of 4
generic_file_llseek---of 1
generic_file_llseek_size---of 24
generic_file_rw_checks29%of 7
generic_llseek_cookie---of 20
generic_write_check_limits---of 6
generic_write_checks75%of 4
generic_write_checks_count54%of 13
kernel_read---of 3
kernel_write---of 6
kiocb_end_write---of 2
kiocb_start_write---of 2
ksys_pread64---of 7
ksys_pwrite64---of 7
ksys_read---of 10
ksys_write90%of 10
no_seek_end_llseek---of 3
no_seek_end_llseek_size---of 3
noop_llseek---of 1
rw_verify_area24%of 26
sb_end_write29%of 14
sb_start_write34%of 12
vfs_copy_file_range---of 53
vfs_iocb_iter_read---of 16
vfs_iocb_iter_write---of 16
vfs_iter_read---of 16
vfs_iter_write---of 19
vfs_llseek---of 3
vfs_read---of 28
vfs_readv---of 28
vfs_setpos---of 7
vfs_write63%of 32
vfs_writev---of 31
warn_unsupported---of 3
-----------
SUMMARY48%of 119

-----------
SUMMARY---of 0

__destroy_inode23%of 44
__iget---of 3
__inode_add_lru17%of 12
__insert_inode_hash---of 3
__probestub_ctime_ns_xchg---of 1
__probestub_ctime_xchg_skip---of 1
__probestub_fill_mg_cmtime---of 1
__probestub_inode_set_ctime_to_ts---of 1
__remove_inode_hash---of 4
__traceiter_ctime_ns_xchg---of 4
__traceiter_ctime_xchg_skip---of 4
__traceiter_fill_mg_cmtime---of 4
__traceiter_inode_set_ctime_to_ts---of 4
__wait_on_freeing_inode---of 15
address_space_init_once---of 1
alloc_inode40%of 10
atime_needs_update25%of 20
bmap---of 3
clear_inode29%of 7
clear_nlink50%of 4
current_time42%of 12
dentry_needs_remove_privs75%of 4
destroy_inode---of 5
discard_new_inode---of 6
drop_nlink67%of 6
dump_mapping---of 12
evict33%of 37
evict_inodes---of 26
file_modified---of 1
file_modified_flags---of 17
file_remove_privs100%of 1
file_remove_privs_flags40%of 15
file_update_time72%of 14
find_inode---of 26
find_inode_by_ino_rcu---of 13
find_inode_fast---of 26
find_inode_nowait---of 8
find_inode_rcu---of 13
free_inode_nonrcu---of 1
generic_delete_inode100%of 1
generic_update_time---of 3
get_next_ino75%of 8
get_nr_dirty_inodes---of 7
i_callback---of 3
iget5_locked---of 5
iget5_locked_rcu---of 13
iget_locked---of 24
igrab---of 4
ihold60%of 5
ilookup---of 9
ilookup5---of 8
ilookup5_nowait---of 1
in_group_or_capable67%of 3
inc_nlink34%of 6
init_once---of 1
init_special_inode---of 6
inode_add_lru100%of 1
inode_bit_waitqueue---of 1
inode_dio_finished---of 1
inode_dio_wait---of 5
inode_dio_wait_interruptible---of 6
inode_init_always_gfp70%of 10
inode_init_once100%of 1
inode_init_owner75%of 4
inode_insert5---of 14
inode_lru_isolate---of 21
inode_needs_sync---of 6
inode_nohighmem---of 1
inode_owner_or_capable40%of 5
inode_pin_lru_isolating---of 6
inode_sb_list_add---of 3
inode_set_ctime_current24%of 73
inode_set_ctime_deleg---of 23
inode_set_ctime_to_ts24%of 17
inode_set_flags58%of 7
inode_unpin_lru_isolating---of 3
inode_update_time---of 4
inode_update_timestamps54%of 13
insert_inode_locked---of 12
insert_inode_locked4---of 3
iput23%of 40
iunique---of 17
kiocb_modified---of 1
lock_two_nondirectories---of 11
lockdep_annotate_inode_mutex_key50%of 4
mgts_open---of 1
mgts_show---of 10
mode_strip_sgid43%of 7
new_inode50%of 4
no_open---of 1
perf_trace_ctime---of 6
perf_trace_ctime_ns_xchg---of 6
perf_trace_fill_mg_cmtime---of 6
proc_nr_inodes---of 7
prune_icache_sb---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
set_nlink---of 7
timestamp_truncate---of 8
touch_atime38%of 32
trace_event_raw_event_ctime---of 7
trace_event_raw_event_ctime_ns_xchg---of 7
trace_event_raw_event_fill_mg_cmtime---of 7
trace_raw_output_ctime---of 3
trace_raw_output_ctime_ns_xchg---of 3
trace_raw_output_fill_mg_cmtime---of 3
unlock_new_inode---of 6
unlock_two_nondirectories---of 7
wait_on_inode---of 5
-----------
SUMMARY37%of 427

-----------
SUMMARY---of 0

__kthread_cancel_work_sync---of 11
__kthread_create_on_node---of 8
__kthread_init_worker---of 1
__kthread_parkme---of 7
__kthread_queue_delayed_work---of 13
free_kthread_struct---of 6
get_kthread_comm---of 9
kthread---of 29
kthread_affine_preferred---of 18
kthread_associate_blkcg---of 39
kthread_bind---of 7
kthread_bind_mask---of 7
kthread_blkcg50%of 4
kthread_cancel_delayed_work_sync---of 1
kthread_cancel_work_sync---of 1
kthread_complete_and_exit---of 3
kthread_create_on_cpu---of 10
kthread_create_on_node---of 1
kthread_create_worker_on_cpu---of 8
kthread_create_worker_on_node---of 4
kthread_data---of 3
kthread_delayed_work_timer_fn---of 10
kthread_destroy_worker---of 6
kthread_exit---of 8
kthread_flush_work---of 7
kthread_flush_work_fn---of 1
kthread_flush_worker---of 7
kthread_freezable_should_stop---of 8
kthread_func---of 4
kthread_insert_work---of 26
kthread_is_per_cpu50%of 4
kthread_mod_delayed_work---of 8
kthread_park---of 10
kthread_parkme---of 3
kthread_probe_data---of 4
kthread_queue_delayed_work---of 7
kthread_queue_work---of 7
kthread_set_per_cpu---of 10
kthread_should_park---of 3
kthread_should_stop---of 3
kthread_should_stop_or_park---of 4
kthread_stop---of 38
kthread_stop_put---of 1
kthread_unpark---of 9
kthread_unuse_mm---of 13
kthread_use_mm---of 16
kthread_worker_fn---of 52
kthreadd---of 24
kthreads_init---of 1
kthreads_online_cpu---of 15
put_task_struct---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
set_kthread_struct---of 7
tsk_fork_get_node---of 3
-----------
SUMMARY50%of 8

-----------
SUMMARY---of 0

bsearch100%of 5
-----------
SUMMARY100%of 5

-----------
SUMMARY---of 0

__vcpu_read_sys_reg_from_cpu16%of 32
__vcpu_write_sys_reg_to_cpu16%of 32
aa32_id_visibility34%of 9
access_actlr---of 5
access_arch_timer---of 151
access_ccsidr---of 6
access_clidr---of 13
access_cntkctl_el12---of 13
access_csselr---of 3
access_ctr---of 4
access_dcgsw---of 8
access_dcsw---of 6
access_elr---of 3
access_gic_eisr---of 4
access_gic_elrsr---of 4
access_gic_misr---of 4
access_gic_sgi50%of 6
access_gic_sre---of 5
access_gic_vtr---of 4
access_hv_timer---of 13
access_id_reg23%of 9
access_imp_id_reg29%of 7
access_mdcr---of 19
access_pmceid---of 14
access_pmcnten29%of 28
access_pmcr38%of 16
access_pminten27%of 26
access_pmovs22%of 28
access_pmselr28%of 22
access_pmswinc34%of 12
access_pmu_evcntr17%of 59
access_pmu_evtyper27%of 30
access_pmuserenr---of 14
access_rw---of 3
access_sp_el1---of 17
access_spsr---of 17
access_vm_reg---of 5
access_zcr_el2---of 34
bad_redir_trap---of 3
bad_vncr_trap---of 3
check_sysreg_table---of 20
el2_visibility50%of 4
emulate_sys_reg---of 5
fp8_visibility50%of 4
get_dbg_wb_reg40%of 5
get_el2_to_el1_mapping7%of 43
get_id_reg62%of 13
get_pmcr100%of 1
get_pmreg28%of 11
get_pmu_evcntr60%of 5
get_raz_reg100%of 1
get_reg_by_id50%of 6
handle_alle1is---of 7
handle_at_s12---of 1
handle_at_s1e01---of 1
handle_at_s1e2---of 4
handle_ipas2e1is---of 18
handle_ripas2e1is---of 18
handle_tlbi_el1---of 42
handle_tlbi_el2---of 16
handle_vmalls12e1is---of 7
id_to_sys_reg_desc84%of 12
id_visibility80%of 5
idregs_debug_next---of 13
idregs_debug_open---of 4
idregs_debug_show---of 19
idregs_debug_start---of 14
idregs_debug_stop---of 3
is_hyp_ctxt15%of 14
kvm_arm_copy_sys_reg_indices67%of 9
kvm_arm_num_sys_reg_descs100%of 1
kvm_arm_sys_reg_get_reg75%of 12
kvm_arm_sys_reg_set_reg64%of 19
kvm_calculate_traps38%of 35
kvm_emulate_cp15_id_reg---of 8
kvm_finalize_sys_regs40%of 10
kvm_handle_cp10_id---of 12
kvm_handle_cp14_32---of 1
kvm_handle_cp14_64---of 1
kvm_handle_cp14_load_store---of 1
kvm_handle_cp15_3229%of 7
kvm_handle_cp15_64---of 1
kvm_handle_cp_3250%of 12
kvm_handle_cp_64---of 18
kvm_handle_sys_reg40%of 20
kvm_read_sanitised_id_reg43%of 42
kvm_reset_sys_regs62%of 36
kvm_set_vm_id_reg62%of 13
kvm_sys_reg_get_user48%of 23
kvm_sys_reg_set_user44%of 23
kvm_sys_regs_create_debugfs100%of 1
kvm_vm_ioctl_get_reg_writable_masks19%of 59
mte_visibility50%of 4
perform_access44%of 23
pmu_visibility100%of 1
print_sys_reg_msg100%of 3
ptrauth_visibility25%of 8
raz_visibility100%of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
read_sanitised_id_dfr0_el160%of 5
reset_actlr100%of 1
reset_amair_el1100%of 1
reset_clidr27%of 26
reset_dbg_wb_reg40%of 5
reset_hcr29%of 14
reset_imp_id_reg67%of 6
reset_mdcr28%of 11
reset_mpidr100%of 1
reset_pmcr27%of 26
reset_pmevcntr24%of 21
reset_pmevtyper28%of 22
reset_pmselr24%of 21
reset_pmu_reg24%of 21
reset_unknown22%of 23
reset_val22%of 23
s1pie_el2_visibility40%of 5
s1pie_visibility100%of 1
s1poe_el2_visibility40%of 5
s1poe_visibility100%of 1
s2_mmu_tlbi_s1e1---of 3
s2_mmu_unmap_ipa---of 1
s2_mmu_unmap_range---of 1
set_clidr38%of 16
set_ctr_el0100%of 3
set_dbg_wb_reg40%of 5
set_id_aa64dfr0_el1100%of 3
set_id_aa64mmfr0_el134%of 6
set_id_aa64mmfr2_el1100%of 1
set_id_aa64pfr0_el1100%of 1
set_id_aa64pfr1_el1---of 1
set_id_dfr0_el1100%of 3
set_id_reg72%of 28
set_imp_id_reg42%of 12
set_oslsr_el134%of 12
set_pmcr36%of 25
set_pmreg31%of 13
set_pmu_evcntr40%of 5
set_wi_reg---of 1
sme_visibility100%of 1
sve_el2_visibility29%of 7
sve_visibility75%of 4
tcr2_el2_visibility40%of 5
tcr2_visibility100%of 1
translate_cptr_el2_to_cpacr_el1---of 1
translate_sctlr_el2_to_sctlr_el1---of 1
translate_tcr_el2_to_tcr_el1---of 1
translate_ttbr0_el2_to_ttbr0_el1---of 1
trap_dbg_wb_reg20%of 10
trap_dbgauthstatus_el1---of 3
trap_dbgdidr---of 3
trap_debug_regs100%of 3
trap_loregion50%of 6
trap_oslar_el150%of 4
trap_oslsr_el124%of 13
trap_raz_wi67%of 3
undef_access100%of 1
vcpu_el2_e2h_is_set---of 11
vcpu_read_sys_reg18%of 45
vcpu_write_sys_reg16%of 45
vncr_el2_visibility34%of 6
walk_sys_regs82%of 16
-----------
SUMMARY36%of 1348

kvm_arm_setup_mdcr_el262%of 13
kvm_debug_handle_oslar27%of 15
kvm_debug_set_guest_ownership67%of 3
kvm_disable_trbe---of 13
kvm_enable_trbe---of 13
kvm_init_host_debug_data---of 35
kvm_tracing_set_el1_configuration---of 21
kvm_vcpu_load_debug58%of 21
kvm_vcpu_put_debug67%of 6
-----------
SUMMARY52%of 58

cond_bools_copy---of 3
cond_bools_destroy---of 1
cond_bools_index---of 1
cond_compute_av15%of 14
cond_compute_xperms---of 7
cond_destroy_bool---of 1
cond_dup_av_list---of 6
cond_index_bool---of 4
cond_init_bool_indexes---of 1
cond_insertf---of 13
cond_policydb_destroy---of 4
cond_policydb_destroy_dup---of 1
cond_policydb_dup---of 14
cond_policydb_init---of 1
cond_read_av_list---of 7
cond_read_bool---of 7
cond_read_list---of 20
cond_write_av_list---of 6
cond_write_bool---of 4
cond_write_list---of 13
evaluate_cond_nodes---of 31
-----------
SUMMARY15%of 14

-----------
SUMMARY---of 0

__radix_tree_delete---of 30
__radix_tree_lookup---of 14
__radix_tree_preload---of 9
__radix_tree_replace---of 12
delete_node---of 43
idr_destroy---of 10
idr_get_free---of 26
idr_preload---of 3
local_lock_acquire---of 6
local_lock_release---of 7
radix_tree_cpu_dead---of 4
radix_tree_delete---of 1
radix_tree_delete_item---of 20
radix_tree_extend---of 20
radix_tree_gang_lookup---of 12
radix_tree_gang_lookup_tag---of 17
radix_tree_gang_lookup_tag_slot---of 10
radix_tree_insert---of 24
radix_tree_iter_delete---of 3
radix_tree_iter_replace---of 1
radix_tree_iter_resume---of 1
radix_tree_iter_tag_clear---of 8
radix_tree_lookup25%of 8
radix_tree_lookup_slot---of 10
radix_tree_maybe_preload---of 3
radix_tree_next_chunk16%of 32
radix_tree_node_alloc---of 9
radix_tree_node_ctor100%of 1
radix_tree_node_rcu_free---of 1
radix_tree_preload---of 3
radix_tree_replace_slot---of 7
radix_tree_tag_clear---of 13
radix_tree_tag_get---of 9
radix_tree_tag_set---of 11
radix_tree_tagged---of 1
-----------
SUMMARY20%of 41

-----------
SUMMARY---of 0

hsr_add_port---of 29
hsr_del_port---of 8
hsr_handle_frame---of 30
hsr_invalid_dan_ingress_frame---of 1
hsr_port_exists100%of 1
-----------
SUMMARY100%of 1

bdi_alloc---of 4
bdi_debug_stats_open---of 1
bdi_debug_stats_show---of 18
bdi_dev_name---of 3
bdi_get_by_id---of 15
bdi_init---of 3
bdi_put---of 11
bdi_register---of 1
bdi_register_va---of 33
bdi_set_owner---of 3
bdi_unregister---of 19
cgwb_debug_stats_open---of 1
cgwb_debug_stats_show---of 29
cgwb_free_rcu---of 1
cgwb_kill---of 12
cgwb_release---of 1
cgwb_release_workfn---of 11
cleanup_offline_cgwbs_workfn---of 19
collect_wb_stats---of 15
css_get---of 17
inode_to_bdi50%of 4
max_bytes_show---of 1
max_bytes_store---of 3
max_ratio_fine_show---of 1
max_ratio_fine_store---of 3
max_ratio_show---of 1
max_ratio_store---of 3
min_bytes_show---of 1
min_bytes_store---of 3
min_ratio_fine_show---of 1
min_ratio_fine_store---of 3
min_ratio_show---of 1
min_ratio_store---of 3
percpu_ref_put_many---of 18
rcu_lock_acquire---of 2
rcu_lock_release---of 2
read_ahead_kb_show---of 1
read_ahead_kb_store---of 3
stable_pages_required_show---of 3
strict_limit_show---of 1
strict_limit_store---of 3
wb_blkcg_offline---of 4
wb_get_create---of 36
wb_get_lookup---of 18
wb_init---of 4
wb_memcg_offline---of 4
wb_shutdown---of 8
wb_tryget---of 20
wb_update_bandwidth_workfn---of 1
-----------
SUMMARY50%of 4

nft_chain_offload_cmd---of 10
nft_chain_offload_support---of 10
nft_flow_offload_unbind---of 12
nft_flow_rule_create---of 25
nft_flow_rule_destroy---of 10
nft_flow_rule_offload_commit---of 70
nft_flow_rule_set_addr_type---of 3
nft_flow_rule_stats---of 9
nft_indr_block_cleanup---of 7
nft_offload_exit---of 1
nft_offload_init---of 1
nft_offload_netdev_event11%of 19
nft_offload_set_dependency---of 1
nft_offload_update_dependency---of 6
nft_pernet---of 16
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY11%of 19

__clear_user---of 6
__kvm_arm_vcpu_get_events60%of 5
__kvm_arm_vcpu_set_events78%of 9
_inline_copy_from_user63%of 8
copy_core_reg_indices67%of 15
core_reg_addr55%of 87
core_reg_size_from_offset99%of 84
folio_set_hugetlb_mte_tagged---of 5
folio_try_hugetlb_mte_tagging---of 9
kvm_arch_vcpu_ioctl_get_fpu100%of 1
kvm_arch_vcpu_ioctl_get_regs100%of 1
kvm_arch_vcpu_ioctl_get_sregs100%of 1
kvm_arch_vcpu_ioctl_set_fpu100%of 1
kvm_arch_vcpu_ioctl_set_guest_debug43%of 19
kvm_arch_vcpu_ioctl_set_regs100%of 1
kvm_arch_vcpu_ioctl_set_sregs100%of 1
kvm_arch_vcpu_ioctl_translate100%of 1
kvm_arm_copy_reg_indices36%of 37
kvm_arm_get_reg42%of 62
kvm_arm_num_regs60%of 5
kvm_arm_set_reg66%of 67
kvm_arm_vcpu_arch_get_attr100%of 5
kvm_arm_vcpu_arch_has_attr100%of 5
kvm_arm_vcpu_arch_set_attr100%of 5
kvm_target_cpu---of 9
kvm_vm_ioctl_mte_copy_tags3%of 71
page_mte_tagged---of 7
rcu_lock_acquire---of 2
rcu_lock_release---of 2
set_page_mte_tagged---of 7
sve_reg_to_region50%of 12
try_page_mte_tagging---of 11
-----------
SUMMARY56%of 503

__mas_set_range---of 14
__mt_destroy39%of 13
__mt_dup---of 4
__probestub_ma_op---of 1
__probestub_ma_read---of 1
__probestub_ma_write---of 1
__traceiter_ma_op---of 4
__traceiter_ma_read---of 4
__traceiter_ma_write---of 4
mab_mas_cp85%of 19
mab_no_null_split40%of 5
mab_shift_right67%of 3
mas_adopt_children43%of 19
mas_alloc_cyclic24%of 25
mas_alloc_nodes30%of 27
mas_ascend13%of 31
mas_descend43%of 21
mas_destroy7%of 101
mas_dump---of 7
mas_dup_build---of 39
mas_dup_free---of 32
mas_empty_area21%of 69
mas_empty_area_rev---of 97
mas_erase---of 13
mas_expected_entries---of 7
mas_find36%of 48
mas_find_child48%of 21
mas_find_range---of 48
mas_find_range_rev---of 3
mas_find_rev---of 3
mas_find_rev_setup---of 35
mas_leaf_max_gap50%of 22
mas_mab_cp81%of 21
mas_max_gap29%of 7
mas_new_root---of 26
mas_next100%of 3
mas_next_node29%of 60
mas_next_range67%of 3
mas_next_setup10%of 32
mas_next_slot38%of 45
mas_nomem---of 8
mas_pause---of 1
mas_preallocate38%of 29
mas_prev67%of 3
mas_prev_node22%of 61
mas_prev_range67%of 3
mas_prev_setup10%of 50
mas_prev_slot32%of 44
mas_push_data66%of 32
mas_replace_node22%of 14
mas_root_expand34%of 24
mas_set_height50%of 6
mas_set_parent46%of 11
mas_skip_node---of 21
mas_spanning_rebalance56%of 109
mas_split_final_node42%of 12
mas_start67%of 15
mas_state_walk---of 3
mas_store---of 39
mas_store_b_node60%of 27
mas_store_gfp24%of 13
mas_store_prealloc41%of 47
mas_update_gap34%of 27
mas_walk34%of 9
mas_wmb_replace40%of 117
mas_wr_dump---of 1
mas_wr_preallocate70%of 26
mas_wr_spanning_store35%of 43
mas_wr_store_entry37%of 147
mas_wr_store_type77%of 52
mas_wr_walk_descend79%of 19
mas_wr_walk_index34%of 12
mast_ascend36%of 25
mast_fill_bnode79%of 14
mast_spanning_rebalance11%of 28
mast_split_data30%of 17
mt_cache_shrink---of 1
mt_destroy_walk19%of 37
mt_dump---of 13
mt_dump_entry---of 10
mt_dump_node---of 139
mt_find34%of 33
mt_find_after---of 3
mt_free_rcu---of 1
mt_free_walk---of 25
mt_next---of 13
mt_prev---of 13
mt_validate35%of 292
mt_write_locked---of 4
mte_dead_leaves31%of 13
mte_dead_walk---of 24
mte_destroy_descend19%of 27
mtree_alloc_cyclic50%of 4
mtree_alloc_range---of 22
mtree_alloc_rrange---of 22
mtree_destroy100%of 1
mtree_dup---of 4
mtree_erase---of 1
mtree_insert---of 1
mtree_insert_range39%of 21
mtree_load42%of 39
mtree_range_walk74%of 30
mtree_store---of 3
mtree_store_range---of 4
perf_trace_ma_op---of 6
perf_trace_ma_read---of 6
perf_trace_ma_write---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
trace_event_raw_event_ma_op---of 7
trace_event_raw_event_ma_read---of 7
trace_event_raw_event_ma_write---of 7
trace_ma_op24%of 17
trace_ma_read24%of 17
trace_ma_write24%of 17
trace_raw_output_ma_op---of 3
trace_raw_output_ma_read---of 3
trace_raw_output_ma_write---of 3
-----------
SUMMARY37%of 2183

-----------
SUMMARY---of 0

get_kvm_ipa_limit100%of 1
kvm_arm_vcpu_destroy75%of 4
kvm_arm_vcpu_finalize46%of 11
kvm_arm_vcpu_is_finalized80%of 5
kvm_get_pa_bits---of 1
kvm_reset_vcpu46%of 24
-----------
SUMMARY54%of 45

-----------
SUMMARY---of 0

_smc_pnet_dump---of 26
_smc_pnet_find_roce_by_pnetid---of 28
net_generic---of 16
pnet_find_base_ndev---of 8
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sk_dst_get---of 22
smc_pnet_add---of 74
smc_pnet_apply_ib---of 3
smc_pnet_apply_smcd---of 3
smc_pnet_del---of 3
smc_pnet_dump---of 1
smc_pnet_dump_start---of 1
smc_pnet_exit---of 1
smc_pnet_find_alt_roce---of 1
smc_pnet_find_ism_resource---of 12
smc_pnet_find_ndev_pnetid_by_table---of 7
smc_pnet_find_roce_resource---of 28
smc_pnet_flush---of 1
smc_pnet_get---of 7
smc_pnet_is_ndev_pnetid---of 4
smc_pnet_is_pnetid_set---of 1
smc_pnet_match---of 112
smc_pnet_net_exit---of 6
smc_pnet_net_init---of 12
smc_pnet_netdev_event5%of 44
smc_pnet_remove_by_pnetid---of 45
smc_pnetid_by_table_ib---of 8
smc_pnetid_by_table_smcd---of 9
-----------
SUMMARY5%of 44

-----------
SUMMARY---of 0

__pkvm_pgtable_stage2_unmap---of 64
host_s2_pgtable_pages---of 4
hyp_s1_pgtable_pages---of 4
hyp_vmemmap_pages---of 8
pkvm_create_hyp_vcpu---of 8
pkvm_create_hyp_vm---of 14
pkvm_destroy_hyp_vm---of 6
pkvm_init_host_vm100%of 1
pkvm_mapping_augment_rotate---of 5
pkvm_pgtable_stage2_create_unlinked---of 1
pkvm_pgtable_stage2_destroy---of 1
pkvm_pgtable_stage2_flush---of 32
pkvm_pgtable_stage2_free_unlinked---of 1
pkvm_pgtable_stage2_init---of 1
pkvm_pgtable_stage2_map---of 28
pkvm_pgtable_stage2_mkyoung---of 5
pkvm_pgtable_stage2_relax_perms---of 3
pkvm_pgtable_stage2_split---of 1
pkvm_pgtable_stage2_test_clear_young---of 30
pkvm_pgtable_stage2_unmap---of 4
pkvm_pgtable_stage2_wrprotect---of 31
-----------
SUMMARY100%of 1

__arm64_sys_brk---of 39
__arm64_sys_mmap_pgoff---of 1
__arm64_sys_munmap100%of 1
__arm64_sys_remap_file_pages---of 29
__get_unmapped_area59%of 17
__probestub_exit_mmap---of 1
__probestub_vm_unmapped_area---of 1
__probestub_vma_mas_szero---of 1
__probestub_vma_store---of 1
__traceiter_exit_mmap---of 4
__traceiter_vm_unmapped_area---of 4
__traceiter_vma_mas_szero---of 4
__traceiter_vma_store---of 4
_install_special_mapping---of 7
arch_get_unmapped_area---of 1
arch_get_unmapped_area_topdown---of 1
do_mmap60%of 85
do_munmap---of 1
exit_mmap---of 63
expand_stack---of 26
expand_stack_locked100%of 1
file_mmap_ok58%of 7
find_extend_vma_locked---of 9
find_vma50%of 4
find_vma_intersection50%of 4
find_vma_prev---of 3
generic_get_unmapped_area---of 22
generic_get_unmapped_area_topdown11%of 28
get_file---of 6
init_admin_reserve---of 1
init_reserve_notifier---of 3
init_user_reserve---of 1
insert_vm_struct---of 14
ksys_mmap_pgoff67%of 15
may_expand_vm37%of 11
mlock_future_ok75%of 4
mm_get_unmapped_area67%of 3
mm_get_unmapped_area_vmflags67%of 3
mmap_read_lock_maybe_expand---of 22
mmap_read_unlock---of 3
mmap_write_lock_killable---of 7
mmap_write_unlock---of 6
perf_trace_exit_mmap---of 6
perf_trace_vm_unmapped_area---of 6
perf_trace_vma_mas_szero---of 6
perf_trace_vma_store---of 6
relocate_vma_down---of 9
reserve_mem_notifier---of 10
special_mapping_close---of 3
special_mapping_fault---of 13
special_mapping_mremap---of 4
special_mapping_name---of 1
special_mapping_split---of 1
trace_event_raw_event_exit_mmap---of 7
trace_event_raw_event_vm_unmapped_area---of 7
trace_event_raw_event_vma_mas_szero---of 7
trace_event_raw_event_vma_store---of 7
trace_raw_output_exit_mmap---of 3
trace_raw_output_vm_unmapped_area---of 4
trace_raw_output_vma_mas_szero---of 3
trace_raw_output_vma_store---of 3
vm_brk_flags---of 24
vm_munmap---of 1
vm_stat_account100%of 5
vm_unmapped_area---of 19
vma_is_special_mapping---of 3
vma_set_page_prot67%of 3
-----------
SUMMARY54%of 191

failover_event11%of 28
failover_get_bymac---of 13
failover_register---of 17
failover_slave_register---of 16
failover_slave_unregister---of 13
failover_unregister---of 13
-----------
SUMMARY11%of 28

_add_netdev_ips---of 98
add_default_gids---of 3
add_netdev_ips---of 1
add_netdev_upper_ips---of 1
addr_event---of 15
callback_for_addr_gid_device_scan---of 12
del_default_gids---of 3
del_netdev_default_ips_join---of 46
del_netdev_ips---of 1
del_netdev_upper_ips---of 1
enum_all_gids_of_dev_cb---of 12
handle_netdev_upper---of 21
inet6addr_event---of 1
inetaddr_event---of 1
is_eth_active_slave_of_bonding_rcu---of 17
is_eth_port_inactive_slave_filter---of 12
is_eth_port_of_netdev_filter---of 17
is_ndev_for_default_gid_filter---of 19
is_upper_ndev_bond_master_filter---of 15
netdev_upper_walk---of 10
netdevice_event7%of 32
netdevice_event_work_handler---of 14
pass_all_filter---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rdma_roce_rescan_device---of 1
rdma_roce_rescan_port---of 7
roce_del_all_netdev_gids---of 1
roce_gid_type_mask_support---of 3
update_gid_event_work_handler---of 6
upper_device_filter---of 13
-----------
SUMMARY7%of 32

__arm64_compat_sys_wait4---of 4
__arm64_compat_sys_waitid---of 20
__arm64_sys_wait4---of 9
__arm64_sys_waitid---of 25
__do_wait---of 33
__wake_up_parent---of 1
child_wait_callback---of 9
coredump_task_exit---of 13
delayed_put_task_struct---of 14
do_exit---of 104
do_group_exit---of 6
do_wait---of 22
exit_mm---of 15
get_task_struct---of 6
is_current_pgrp_orphaned---of 13
kernel_wait---of 4
kernel_wait4---of 14
kernel_waitid_prepare---of 14
kill_orphaned_pgrp---of 21
make_task_dead---of 12
mm_update_next_owner---of 35
oops_count_show---of 1
pid_child_should_wake---of 9
put_task_struct---of 6
put_task_struct_rcu_user67%of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
rcuwait_wake_up34%of 18
refcount_inc---of 6
release_task---of 76
release_thread---of 1
trace_sched_process_exit---of 14
wait_consider_task---of 119
-----------
SUMMARY50%of 28

__jump_label_update34%of 12
__static_key_deferred_flush---of 3
__static_key_slow_dec_cpuslocked50%of 8
__static_key_slow_dec_deferred---of 5
jump_label_cmp---of 5
jump_label_del_module---of 27
jump_label_init_ro---of 8
jump_label_init_type---of 1
jump_label_lock---of 1
jump_label_module_notify---of 40
jump_label_rate_limit---of 3
jump_label_swap---of 1
jump_label_text_reserved---of 30
jump_label_unlock---of 1
jump_label_update27%of 23
jump_label_update_timeout---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
static_key_count100%of 1
static_key_dec_not_one56%of 9
static_key_disable---of 1
static_key_disable_cpuslocked---of 9
static_key_enable100%of 1
static_key_enable_cpuslocked43%of 7
static_key_fast_inc_not_disabled75%of 8
static_key_slow_dec67%of 3
static_key_slow_dec_cpuslocked---of 3
static_key_slow_inc100%of 1
static_key_slow_inc_cpuslocked58%of 7
-----------
SUMMARY49%of 84

__csd_lock_record---of 3
__csd_lock_wait---of 39
__flush_smp_call_function_queue---of 49
__probestub_csd_function_entry---of 1
__probestub_csd_function_exit---of 1
__probestub_csd_queue_cpu---of 1
__smp_call_single_queue---of 7
__traceiter_csd_function_entry---of 4
__traceiter_csd_function_exit---of 4
__traceiter_csd_queue_cpu---of 4
csd_lock_is_stuck---of 1
do_nothing---of 1
flush_smp_call_function_queue---of 10
generic_exec_single35%of 20
generic_smp_call_function_single_interrupt---of 1
kick_all_cpus_sync100%of 3
on_each_cpu_cond_mask---of 3
perf_trace_csd_function---of 6
perf_trace_csd_queue_cpu---of 6
smp_call_function---of 3
smp_call_function_any---of 15
smp_call_function_many---of 1
smp_call_function_many_cond15%of 87
smp_call_function_single39%of 21
smp_call_function_single_async---of 5
smp_call_on_cpu---of 6
smp_call_on_cpu_callback---of 1
smpcfd_dead_cpu---of 1
smpcfd_dying_cpu---of 1
smpcfd_prepare_cpu---of 5
trace_csd_function_entry24%of 17
trace_csd_function_exit24%of 17
trace_csd_queue_cpu---of 17
trace_event_raw_event_csd_function---of 7
trace_event_raw_event_csd_queue_cpu---of 7
trace_ipi_send_cpu---of 14
trace_raw_output_csd_function---of 3
trace_raw_output_csd_queue_cpu---of 3
wake_up_all_idle_cpus---of 9
-----------
SUMMARY24%of 165

-----------
SUMMARY---of 0

__probestub_cap_capable---of 1
__traceiter_cap_capable---of 4
cap_bprm_creds_from_file---of 62
cap_capable23%of 22
cap_capget---of 16
cap_capset---of 9
cap_convert_nscap---of 16
cap_inode_getsecurity---of 25
cap_inode_killpriv---of 1
cap_inode_need_killpriv100%of 1
cap_inode_removexattr---of 5
cap_inode_setxattr---of 4
cap_mmap_addr50%of 4
cap_ptrace_access_check---of 19
cap_ptrace_traceme---of 19
cap_safe_nice---of 22
cap_settime---of 1
cap_task_fix_setuid---of 28
cap_task_prctl---of 35
cap_task_setioprio---of 1
cap_task_setnice---of 1
cap_task_setscheduler---of 1
cap_vm_enough_memory100%of 1
get_vfs_caps_from_disk---of 17
perf_trace_cap_capable---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rootid_owns_currentns---of 6
trace_event_raw_event_cap_capable---of 7
trace_raw_output_cap_capable---of 3
-----------
SUMMARY33%of 28

-----------
SUMMARY---of 0

macvtap_count_rx_dropped---of 4
macvtap_count_tx_dropped---of 4
macvtap_dellink---of 1
macvtap_device_event19%of 11
macvtap_link_net---of 1
macvtap_net_namespace---of 1
macvtap_newlink---of 4
macvtap_setup---of 1
macvtap_update_features---of 1
-----------
SUMMARY19%of 11

-----------
SUMMARY---of 0

io_mem_abort45%of 40
kvm_handle_mmio_return49%of 31
kvm_mmio_read_buf67%of 6
kvm_mmio_write_buf50%of 6
trace_kvm_mmio29%of 14
-----------
SUMMARY46%of 97

-----------
SUMMARY---of 0

srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
vcpu_set_ich_hcr60%of 10
vgic_v3_check_base34%of 15
vgic_v3_clear_lr100%of 1
vgic_v3_enable67%of 3
vgic_v3_fold_lr_state40%of 15
vgic_v3_get_vmcr100%of 1
vgic_v3_load31%of 13
vgic_v3_lpi_sync_pending_status34%of 12
vgic_v3_map_resources36%of 14
vgic_v3_populate_lr30%of 30
vgic_v3_probe---of 27
vgic_v3_put31%of 13
vgic_v3_rdist_free_slot80%of 5
vgic_v3_rdist_overlap45%of 9
vgic_v3_rdist_region_from_index75%of 4
vgic_v3_save_pending_tables38%of 32
vgic_v3_set_underflow---of 1
vgic_v3_set_vmcr---of 3
-----------
SUMMARY41%of 181

__arm64_compat_sys_ftruncate---of 5
__arm64_compat_sys_open---of 5
__arm64_compat_sys_openat---of 5
__arm64_compat_sys_truncate---of 1
__arm64_sys_access---of 1
__arm64_sys_chdir---of 7
__arm64_sys_chmod---of 1
__arm64_sys_chown---of 1
__arm64_sys_chroot---of 12
__arm64_sys_close38%of 8
__arm64_sys_creat---of 1
__arm64_sys_faccessat---of 1
__arm64_sys_faccessat2---of 1
__arm64_sys_fallocate---of 4
__arm64_sys_fchdir---of 7
__arm64_sys_fchmod---of 7
__arm64_sys_fchmodat---of 1
__arm64_sys_fchmodat2---of 1
__arm64_sys_fchown---of 1
__arm64_sys_fchownat---of 1
__arm64_sys_ftruncate---of 5
__arm64_sys_lchown---of 1
__arm64_sys_open---of 5
__arm64_sys_openat100%of 5
__arm64_sys_openat2---of 19
__arm64_sys_truncate---of 1
__arm64_sys_vhangup---of 3
break_lease---of 5
build_open_flags65%of 20
build_open_how---of 5
chmod_common---of 11
chown_common---of 18
dentry_create---of 5
dentry_open40%of 5
dentry_open_nonotify---of 4
do_dentry_open33%of 88
do_faccessat---of 44
do_fchmodat---of 6
do_fchownat---of 9
do_ftruncate---of 11
do_sys_ftruncate---of 5
do_sys_open---of 5
do_sys_openat284%of 6
do_sys_truncate---of 7
do_truncate43%of 7
file_open_name---of 7
file_open_root---of 7
file_path---of 1
filp_close---of 1
filp_flush58%of 7
filp_open---of 8
finish_no_open---of 1
finish_open67%of 3
fsnotify_file_area_perm34%of 9
fsnotify_modify---of 9
generic_file_open50%of 4
get_write_access---of 6
kernel_file_open---of 11
ksys_fallocate---of 4
ksys_fchown---of 8
nonseekable_open---of 1
put_write_access---of 3
sb_end_write---of 14
sb_start_write---of 12
stream_open---of 1
vfs_fallocate38%of 35
vfs_fchmod---of 4
vfs_fchown---of 5
vfs_open60%of 10
vfs_truncate---of 15
-----------
SUMMARY44%of 207

-----------
SUMMARY---of 0

__arm64_sys_rt_sigreturn---of 227
clear_ti_thread_flag---of 3
do_signal21%of 333
set_ti_thread_flag---of 3
setup_sigframe_layout27%of 53
sigframe_alloc---of 7
-----------
SUMMARY22%of 386

__arm64_compat_sys_epoll_pwait---of 4
__arm64_compat_sys_epoll_pwait2---of 5
__arm64_sys_epoll_create---of 3
__arm64_sys_epoll_create1---of 1
__arm64_sys_epoll_ctl---of 4
__arm64_sys_epoll_pwait---of 4
__arm64_sys_epoll_pwait2---of 5
__arm64_sys_epoll_wait---of 4
__ep_eventpoll_poll---of 23
__ep_remove---of 39
_inline_copy_from_user---of 8
do_compat_epoll_pwait---of 9
do_epoll_create---of 11
do_epoll_ctl---of 75
do_epoll_pwait---of 9
do_epoll_wait---of 11
ep_autoremove_wake_function67%of 3
ep_busy_loop_end---of 7
ep_clear_and_put---of 19
ep_create_wakeup_source---of 6
ep_destroy_wakeup_source---of 7
ep_done_scan---of 20
ep_eventpoll_ioctl---of 19
ep_eventpoll_poll---of 1
ep_eventpoll_release---of 3
ep_insert---of 103
ep_loop_check_proc---of 10
ep_modify---of 37
ep_pm_stay_awake_rcu28%of 18
ep_poll---of 55
ep_poll_callback33%of 37
ep_ptable_queue_proc---of 5
ep_show_fdinfo---of 5
ep_start_scan---of 13
ep_try_send_events---of 62
ep_unregister_pollwait---of 16
epoll_mutex_lock---of 3
epoll_sendevents---of 11
eventpoll_release_file---of 9
get_epoll_tfile_raw_ptr---of 8
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
reverse_path_check_proc---of 10
-----------
SUMMARY38%of 62

-----------
SUMMARY---of 0

bcm_can_tx---of 19
bcm_connect---of 16
bcm_delete_rx_op---of 18
bcm_delete_tx_op---of 9
bcm_free_op_rcu---of 5
bcm_init---of 3
bcm_notifier8%of 27
bcm_proc_getifname---of 16
bcm_proc_show---of 16
bcm_read_op---of 9
bcm_recvmsg---of 12
bcm_release---of 38
bcm_remove_op---of 1
bcm_rx_cmp_to_index---of 9
bcm_rx_handler---of 27
bcm_rx_setup---of 62
bcm_rx_thr_flush---of 12
bcm_rx_thr_handler---of 3
bcm_rx_timeout_handler---of 4
bcm_rx_update_and_send---of 9
bcm_send_to_user---of 12
bcm_sendmsg---of 22
bcm_sock_no_ioctlcmd---of 1
bcm_tx_send---of 20
bcm_tx_setup---of 70
bcm_tx_timeout_handler---of 13
canbcm_pernet_exit---of 3
canbcm_pernet_init---of 1
dev_put---of 5
hrtimer_dummy_timeout---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY8%of 27

__phys_addr_symbol67%of 3
__virt_to_phys50%of 4
-----------
SUMMARY58%of 7

llist_add_batch60%of 5
llist_del_first50%of 6
llist_del_first_this---of 6
llist_reverse_order---of 4
-----------
SUMMARY55%of 11

net_generic25%of 16
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
tee_net_init---of 1
tee_netdev_event17%of 12
tee_tg4---of 3
tee_tg6---of 3
tee_tg_check---of 12
tee_tg_destroy---of 4
-----------
SUMMARY32%of 32

page_counter_calculate_protection---of 12
page_counter_cancel63%of 8
page_counter_charge64%of 11
page_counter_memparse---of 4
page_counter_set_low---of 4
page_counter_set_max---of 5
page_counter_set_min---of 4
page_counter_try_charge39%of 18
page_counter_uncharge75%of 4
propagate_protected_usage20%of 10
-----------
SUMMARY48%of 51

-----------
SUMMARY---of 0

jiffy_sched_clock_read---of 1
sched_clock40%of 5
sched_clock_poll---of 1
sched_clock_read_begin---of 1
sched_clock_read_retry---of 1
sched_clock_resume---of 1
sched_clock_suspend---of 1
suspended_sched_clock_read---of 1
update_sched_clock---of 1
-----------
SUMMARY40%of 5

-----------
SUMMARY---of 0

timerqueue_add58%of 7
timerqueue_del60%of 5
timerqueue_iterate_next---of 3
-----------
SUMMARY59%of 12

-----------
SUMMARY---of 0

j1939_can_recv---of 30
j1939_netdev_notify13%of 16
j1939_netdev_start---of 60
j1939_netdev_stop---of 4
j1939_priv_get---of 6
j1939_priv_put---of 18
j1939_send_one---of 6
-----------
SUMMARY13%of 16

-----------
SUMMARY---of 0

tomoyo_encode67%of 15
tomoyo_encode2---of 15
tomoyo_get_local_path23%of 22
tomoyo_realpath_from_path65%of 20
tomoyo_realpath_nofollow---of 4
-----------
SUMMARY50%of 57

hsr_announce---of 14
hsr_change_rx_flags---of 6
hsr_check_carrier_and_operstate---of 26
hsr_del_ports---of 9
hsr_dev_change_mtu---of 8
hsr_dev_close---of 6
hsr_dev_finalize---of 19
hsr_dev_open---of 15
hsr_dev_setup---of 1
hsr_dev_xmit---of 6
hsr_fix_features---of 4
hsr_get_max_mtu---of 6
hsr_get_port_ndev---of 4
hsr_init_skb---of 13
hsr_ndo_vlan_rx_add_vid---of 12
hsr_ndo_vlan_rx_kill_vid---of 6
hsr_proxy_announce---of 21
hsr_set_rx_mode---of 6
is_hsr_master100%of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
send_hsr_supervision_frame---of 15
send_prp_supervision_frame---of 7
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__fsnotify_inode_delete100%of 1
__fsnotify_mntns_delete---of 1
__fsnotify_parent27%of 15
__fsnotify_vfsmount_delete---of 1
file_set_fsnotify_mode_from_watchers28%of 11
fsnotify---of 181
fsnotify_clear_child_dentry_flag---of 3
fsnotify_data_inode---of 7
fsnotify_event_needs_parent---of 3
fsnotify_first_mark---of 14
fsnotify_mnt---of 4
fsnotify_pre_content---of 15
fsnotify_sb_delete---of 22
fsnotify_sb_free---of 1
fsnotify_set_children_dentry_flags---of 10
srcu_lock_acquire---of 2
srcu_lock_release---of 2
-----------
SUMMARY30%of 27

__read_sysreg_by_encoding---of 46
__verify_local_elf_hwcaps---of 13
aarch32_el0_show---of 9
arm64_ftr_safe_value43%of 7
arm64_get_meltdown_state50%of 4
bti_enable---of 3
check_local_cpu_capabilities---of 44
check_update_ftr_reg---of 17
compat_has_neon---of 10
cpu_amu_enable---of 22
cpu_clear_disr---of 1
cpu_copy_el2regs---of 3
cpu_emulate_effective_ctr---of 4
cpu_enable_cnp---of 3
cpu_enable_dit---of 1
cpu_enable_e0pd---of 8
cpu_enable_gcs---of 1
cpu_enable_hw_dbm---of 17
cpu_enable_kpti---of 4
cpu_enable_mops---of 3
cpu_enable_mpam---of 1
cpu_enable_mte---of 19
cpu_enable_non_boot_scope_capabilities---of 8
cpu_enable_pan---of 5
cpu_enable_poe---of 5
cpu_get_elf_hwcap---of 1
cpu_get_elf_hwcap2---of 1
cpu_get_elf_hwcap3---of 1
cpu_has_amu_feat---of 3
cpu_have_feature---of 1
cpu_set_feature---of 3
cpu_show_meltdown---of 4
cpu_trap_el0_impdef---of 3
cpucap_multi_entry_cap_matches---of 5
do_emulate_mrs---of 15
dump_cpu_features---of 1
enable_mismatched_32bit_el0---of 17
get_arm64_ftr_reg50%of 4
get_cpu_with_amu_feat---of 1
has_32bit_el0---of 4
has_address_auth_cpucap---of 8
has_address_auth_metacap---of 1
has_always---of 1
has_amu---of 1
has_cache_dic---of 3
has_cache_idc---of 4
has_cpuid_feature---of 15
has_generic_auth---of 7
has_hw_dbm---of 1
has_lpa2---of 5
has_nested_virt_support---of 6
has_nv1---of 5
has_pmuv3---of 4
has_sve_feature---of 3
has_useable_cnp---of 4
has_useable_gicv3_cpuif---of 6
has_user_cpuid_feature---of 18
hvhe_possible---of 1
init_32bit_cpu_features---of 1
init_cpu_ftr_reg---of 20
is_kvm_protected_mode---of 1
read_sanitised_ftr_reg50%of 4
runs_at_el2---of 1
search_cmp_ftr_reg100%of 1
setup_elf_hwcaps---of 12
system_32bit_el0_cpumask---of 9
task_cpu_fallback_mask---of 11
test_has_mpam---of 3
test_has_mpam_hcr---of 4
this_cpu_has_cap---of 7
try_emulate_mrs---of 3
unmap_kernel_at_el0---of 15
update_cpu_capabilities---of 16
update_cpu_features---of 76
verify_local_cpu_caps---of 16
-----------
SUMMARY50%of 20

__xsk_map_flush---of 4
__xsk_map_redirect---of 39
__xsk_rcv---of 40
__xsk_rcv_zc---of 8
_inline_copy_to_user---of 7
copy_from_sockptr---of 11
dev_put---of 6
netdev_unlock_ops---of 5
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
xsk_bind---of 43
xsk_clear_pool_at_qid---of 5
xsk_clear_rx_need_wakeup---of 3
xsk_clear_tx_need_wakeup---of 15
xsk_create---of 21
xsk_destruct---of 4
xsk_destruct_skb---of 5
xsk_generic_rcv---of 9
xsk_generic_xmit---of 102
xsk_get_pool_from_qid---of 4
xsk_getsockopt---of 51
xsk_init_queue---of 5
xsk_lookup_xsk_from_fd---of 4
xsk_mmap---of 12
xsk_net_exit---of 3
xsk_net_init---of 1
xsk_notifier25%of 8
xsk_poll---of 24
xsk_recvmsg---of 26
xsk_reg_pool_at_qid---of 6
xsk_release---of 25
xsk_sendmsg---of 28
xsk_set_rx_need_wakeup---of 3
xsk_set_tx_need_wakeup---of 15
xsk_setsockopt---of 22
xsk_tx_completed---of 1
xsk_tx_peek_desc---of 32
xsk_tx_peek_release_desc_batch---of 50
xsk_tx_release---of 16
xsk_unbind_dev---of 7
xsk_uses_need_wakeup---of 1
xskq_cons_peek_desc---of 17
-----------
SUMMARY25%of 8

__ima_inode_hash---of 16
ima_bprm_check---of 3
ima_bprm_creds_for_exec---of 3
ima_file_check100%of 1
ima_file_free12%of 17
ima_file_hash---of 3
ima_file_mmap86%of 7
ima_file_mprotect---of 7
ima_get_current_hash_algo---of 1
ima_inode_hash---of 3
ima_kernel_module_request---of 1
ima_kexec_cmdline---of 5
ima_load_data---of 9
ima_measure_critical_data---of 3
ima_post_create_tmpfile29%of 7
ima_post_load_data---of 6
ima_post_path_mknod---of 6
ima_post_read_file---of 4
ima_read_file---of 3
integrity_inode_attrs_changed---of 4
mmap_violation_check---of 5
process_buffer_measurement---of 23
process_measurement2%of 116
-----------
SUMMARY9%of 148

-----------
SUMMARY---of 0

tomoyo_convert_time---of 1
tomoyo_correct_domain---of 9
tomoyo_correct_path---of 4
tomoyo_correct_word---of 1
tomoyo_correct_word2---of 32
tomoyo_domain_def---of 6
tomoyo_domain_quota_is_ok12%of 18
tomoyo_file_matches_pattern2---of 62
tomoyo_fill_path_info67%of 15
tomoyo_find_domain---of 8
tomoyo_get_domainname---of 11
tomoyo_get_exe---of 4
tomoyo_get_mode40%of 5
tomoyo_init_request_info58%of 7
tomoyo_normalize_line---of 11
tomoyo_parse_name_union---of 4
tomoyo_parse_number_union---of 17
tomoyo_parse_ulong---of 7
tomoyo_path_matches_pattern67%of 6
tomoyo_path_matches_pattern2---of 42
tomoyo_permstr---of 4
tomoyo_print_ulong60%of 5
tomoyo_read_token---of 3
tomoyo_str_starts---of 3
-----------
SUMMARY45%of 56

_inline_copy_from_user63%of 8
get_kernel_wa_level23%of 9
kvm_arm_copy_fw_reg_indices50%of 8
kvm_arm_get_fw_num_regs100%of 1
kvm_arm_get_fw_reg59%of 24
kvm_arm_init_hypercalls100%of 1
kvm_arm_set_fw_reg83%of 35
kvm_arm_teardown_hypercalls100%of 1
kvm_ptp_get_time40%of 5
kvm_smccc_call_handler55%of 59
kvm_smccc_filter_insert_reserved50%of 4
kvm_vm_smccc_has_attr100%of 1
kvm_vm_smccc_set_attr35%of 43
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY56%of 203

-----------
SUMMARY---of 0

llc_sap_close---of 7
llc_sap_find23%of 22
llc_sap_open---of 9
local_bh_disable100%of 2
local_bh_enable100%of 2
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY44%of 30

-----------
SUMMARY---of 0

bstats_update---of 7
dev_is_mac_header_xmit---of 12
dev_queue_xmit---of 1
mirred_device_event16%of 19
mirred_exit_net---of 4
mirred_init_net---of 3
net_generic---of 16
netdev_tracker_alloc---of 5
rcu_lock_acquire---of 2
rcu_lock_release---of 2
tcf_action_inc_overlimit_qstats---of 3
tcf_mirred_act---of 69
tcf_mirred_dev_put---of 6
tcf_mirred_dump---of 19
tcf_mirred_get_dev---of 22
tcf_mirred_get_fill_size---of 1
tcf_mirred_init---of 36
tcf_mirred_offload_act_setup---of 27
tcf_mirred_release---of 8
tcf_mirred_replace_dev---of 11
tcf_mirred_to_dev---of 43
tcf_stats_update---of 1
-----------
SUMMARY16%of 19

-----------
SUMMARY---of 0

__kvm_flush_cpu_context100%of 1
__kvm_flush_vm_context---of 1
__kvm_tlb_flush_vmid---of 1
__kvm_tlb_flush_vmid_ipa50%of 4
__kvm_tlb_flush_vmid_ipa_nsh50%of 4
__kvm_tlb_flush_vmid_range59%of 17
__kvm_tlbi_s1e2---of 96
enter_vmid_context58%of 14
exit_vmid_context50%of 14
-----------
SUMMARY56%of 54

-----------
SUMMARY---of 0

vgic_compute_mi_state---of 13
vgic_state_is_nested7%of 30
vgic_v3_get_eisr---of 1
vgic_v3_get_elrsr---of 1
vgic_v3_get_misr---of 29
vgic_v3_handle_nested_maint_irq---of 3
vgic_v3_load_nested---of 54
vgic_v3_nested_update_mi---of 11
vgic_v3_put_nested---of 61
vgic_v3_sync_nested---of 15
-----------
SUMMARY7%of 30

dev_exception_add---of 15
dev_exception_clean---of 11
dev_exception_rm---of 15
dev_exceptions_copy---of 14
dev_exceptions_move---of 11
devcgroup_access_write---of 84
devcgroup_check_permission13%of 49
devcgroup_css_alloc---of 3
devcgroup_css_free---of 8
devcgroup_offline---of 1
devcgroup_online---of 4
devcgroup_seq_show---of 27
parent_allows_removal---of 16
parent_has_perm---of 35
propagate_exception---of 35
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY19%of 53

-----------
SUMMARY---of 0

strncpy_from_user40%of 20
-----------
SUMMARY40%of 20

-----------
SUMMARY---of 0

__hsiphash_unaligned---of 6
__siphash_unaligned---of 6
hsiphash_1u32---of 1
hsiphash_2u32---of 1
hsiphash_3u32---of 1
hsiphash_4u32---of 1
siphash_1u32---of 1
siphash_1u64100%of 1
siphash_2u64---of 1
siphash_3u32---of 1
siphash_3u64---of 1
siphash_4u64---of 1
-----------
SUMMARY100%of 1

__check_al---of 1
__check_cc---of 1
__check_cs---of 1
__check_eq---of 1
__check_ge---of 1
__check_gt---of 1
__check_hi---of 1
__check_le---of 1
__check_ls---of 1
__check_lt---of 1
__check_mi---of 1
__check_ne---of 1
__check_pl---of 1
__check_vc---of 1
__check_vs---of 1
__die---of 13
arm64_force_sig_fault67%of 3
arm64_force_sig_fault_pkey---of 1
arm64_force_sig_mceerr---of 1
arm64_force_sig_ptrace_errno_trap---of 1
arm64_is_fatal_ras_serror---of 9
arm64_notify_die---of 6
arm64_notify_segfault---of 7
arm64_serror_panic---of 3
arm64_show_signal34%of 6
arm64_skip_faulting_instruction---of 6
bad_el0_sync---of 1
bug_handler---of 9
cntfrq_read_handler---of 9
cntvct_read_handler---of 9
compat_cntfrq_read_handler---of 8
compat_cntvct_read_handler---of 10
ctr_read_handler---of 11
die---of 8
do_el0_bti---of 1
do_el0_cp15---of 18
do_el0_fpac---of 1
do_el0_gcs---of 1
do_el0_mops---of 7
do_el0_sys---of 9
do_el0_undef---of 22
do_el1_bti---of 3
do_el1_fpac---of 1
do_el1_gcs---of 1
do_el1_mops---of 7
do_el1_undef---of 4
do_serror---of 14
esr_get_class_string---of 1
force_signal_inject---of 10
is_valid_bugaddr---of 1
kasan_handler---of 8
mrs_handler---of 3
panic_bad_stack---of 4
reserved_fault_handler---of 1
user_cache_maint_handler---of 22
wfi_handler---of 6
-----------
SUMMARY45%of 9

__preempt_count_add---of 1
__preempt_count_sub---of 1
arch_irqs_disabled_flags100%of 1
arch_local_save_flags100%of 1
preempt_count100%of 1
-----------
SUMMARY100%of 3

-----------
SUMMARY---of 0

should_fail_alloc_page40%of 5
-----------
SUMMARY40%of 5

-----------
SUMMARY---of 0

kvm_register_vgic_device---of 5
kvm_set_legacy_vgic_v2_addr34%of 12
kvm_vgic_addr61%of 38
vgic_check_iorange80%of 5
vgic_create100%of 1
vgic_destroy100%of 1
vgic_get_common_attr40%of 15
vgic_set_common_attr82%of 16
vgic_v2_attr_regs_access---of 21
vgic_v2_get_attr---of 3
vgic_v2_has_attr---of 9
vgic_v2_parse_attr---of 10
vgic_v2_set_attr---of 3
vgic_v3_attr_regs_access70%of 33
vgic_v3_get_attr100%of 7
vgic_v3_has_attr84%of 12
vgic_v3_parse_attr100%of 6
vgic_v3_set_attr100%of 7
-----------
SUMMARY69%of 153

-----------
SUMMARY---of 0

__sysreg_restore_vel2_state---of 193
__sysreg_save_vel2_state---of 162
__vcpu_load_switch_sysregs23%of 175
__vcpu_put_switch_sysregs23%of 154
sysreg_restore_guest_state_vhe34%of 21
sysreg_restore_host_state_vhe31%of 13
sysreg_save_guest_state_vhe34%of 18
sysreg_save_host_state_vhe31%of 13
-----------
SUMMARY25%of 394

-----------
SUMMARY---of 0

__page_frag_alloc_align---of 11
__page_frag_cache_drain---of 7
__page_frag_cache_refill---of 38
page_frag_cache_drain---of 3
page_frag_free45%of 9
page_ref_sub_and_test---of 3
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY45%of 9

-----------
SUMMARY---of 0

__arm64_sys_eventfd---of 1
__arm64_sys_eventfd2100%of 1
do_eventfd38%of 8
eventfd_ctx_do_read58%of 7
eventfd_ctx_fdget60%of 10
eventfd_ctx_fileget58%of 7
eventfd_ctx_put58%of 7
eventfd_ctx_remove_wait_queue---of 12
eventfd_fget---of 4
eventfd_poll50%of 4
eventfd_read---of 22
eventfd_release58%of 7
eventfd_show_fdinfo---of 1
eventfd_signal_mask50%of 6
eventfd_write63%of 24
-----------
SUMMARY57%of 81

-----------
SUMMARY---of 0

list_sort66%of 26
-----------
SUMMARY66%of 26

-----------
SUMMARY---of 0

kvm_arm_pmu_get_max_counters50%of 4
kvm_arm_pmu_get_pmuver_limit40%of 5
kvm_arm_pmu_v3_enable63%of 8
kvm_arm_pmu_v3_get_attr55%of 11
kvm_arm_pmu_v3_has_attr100%of 4
kvm_arm_pmu_v3_set_attr43%of 78
kvm_arm_set_default_pmu34%of 12
kvm_arm_set_nr_counters10%of 20
kvm_host_pmu_init---of 9
kvm_pmc_counts_at_el2---of 31
kvm_pmc_has_64bit_overflow22%of 23
kvm_pmu_accessible_counter_mask12%of 17
kvm_pmu_counter_increment4%of 50
kvm_pmu_counter_is_enabled28%of 29
kvm_pmu_counter_is_hyp---of 13
kvm_pmu_create_perf_event27%of 83
kvm_pmu_evtyper_mask34%of 6
kvm_pmu_flush_hwstate60%of 5
kvm_pmu_get_counter_value100%of 1
kvm_pmu_get_pmc_value47%of 15
kvm_pmu_get_pmceid---of 11
kvm_pmu_handle_pmcr40%of 35
kvm_pmu_implemented_counter_mask---of 1
kvm_pmu_nested_transition---of 24
kvm_pmu_overflow_status21%of 39
kvm_pmu_perf_overflow---of 20
kvm_pmu_perf_overflow_notify_vcpu---of 1
kvm_pmu_reprogram_counter_mask91%of 11
kvm_pmu_set_counter_event_type27%of 15
kvm_pmu_set_counter_value100%of 1
kvm_pmu_set_counter_value_user36%of 14
kvm_pmu_set_pmc_value23%of 22
kvm_pmu_should_notify_user67%of 3
kvm_pmu_software_increment100%of 1
kvm_pmu_sync_hwstate40%of 5
kvm_pmu_update_run100%of 3
kvm_pmu_vcpu_destroy80%of 5
kvm_pmu_vcpu_init100%of 1
kvm_supports_guest_pmuv3100%of 1
kvm_vcpu_read_pmcr23%of 18
kvm_vcpu_reload_pmu37%of 19
pmu_irq_is_valid70%of 10
-----------
SUMMARY33%of 574

kvm_condition_valid3229%of 14
kvm_skip_instr3250%of 6
-----------
SUMMARY35%of 20

kvm_arm_vmid_clear_active100%of 1
kvm_arm_vmid_update25%of 29
-----------
SUMMARY27%of 30

__page_table_check_pmd_clear---of 4
__page_table_check_pmd_set---of 12
__page_table_check_pte_clear100%of 3
__page_table_check_pte_clear_range---of 13
__page_table_check_ptes_set42%of 17
__page_table_check_pud_clear---of 4
__page_table_check_pud_set---of 6
__page_table_check_zero32%of 22
page_table_check_clear35%of 32
page_table_check_set35%of 32
pfn_valid20%of 30
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY35%of 140

-----------
SUMMARY---of 0

____do_softirq100%of 1
do_softirq_own_stack100%of 1
-----------
SUMMARY100%of 2

__anon_inode_getfile---of 13
anon_inode_create_getfd---of 4
anon_inode_create_getfile---of 1
anon_inode_getfd38%of 8
anon_inode_getfile43%of 7
anon_inode_getfile_fmode43%of 7
anon_inodefs_dname100%of 1
anon_inodefs_init_fs_context---of 3
-----------
SUMMARY44%of 23

-----------
SUMMARY---of 0

__free_page_ext---of 10
init_section_page_ext---of 9
page_ext_callback---of 32
page_ext_get---of 20
page_ext_lookup45%of 9
page_ext_put---of 7
pgdat_page_ext_init---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY45%of 9

__probestub_selinux_audited---of 1
__traceiter_selinux_audited---of 4
avc_alloc_node9%of 37
avc_audit_post_callback28%of 33
avc_audit_pre_callback63%of 8
avc_compute_av36%of 17
avc_copy_xperms_decision---of 7
avc_denied40%of 5
avc_get_cache_threshold---of 1
avc_get_hash_stats---of 16
avc_has_extended_perms21%of 58
avc_has_perm72%of 7
avc_has_perm_noaudit40%of 25
avc_node_free---of 1
avc_node_kill---of 3
avc_perm_nonode100%of 3
avc_policy_seqno100%of 1
avc_set_cache_threshold---of 1
avc_ss_reset---of 26
avc_update_node26%of 31
avc_xperms_allow_perm---of 9
avc_xperms_decision_alloc---of 17
avc_xperms_free---of 13
avc_xperms_populate23%of 9
perf_trace_selinux_audited---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
selinux_avc_init---of 3
slow_avc_audit50%of 4
trace_event_get_offsets_selinux_audited---of 1
trace_event_raw_event_selinux_audited---of 7
trace_raw_output_selinux_audited---of 3
-----------
SUMMARY30%of 242

-----------
SUMMARY---of 0

__arm64_compat_sys_gettimeofday---of 21
__arm64_compat_sys_settimeofday---of 29
__arm64_sys_adjtimex---of 8
__arm64_sys_adjtimex_time32---of 3
__arm64_sys_gettimeofday---of 16
__arm64_sys_settimeofday---of 29
__msecs_to_jiffies---of 3
__put_old_timespec32---of 7
__usecs_to_jiffies---of 3
_inline_copy_from_user---of 8
_inline_copy_to_user---of 7
clock_t_to_jiffies---of 1
do_sys_settimeofday64---of 13
get_itimerspec64---of 7
get_old_itimerspec32---of 4
get_old_timespec32---of 3
get_old_timex32---of 3
get_timespec64---of 4
jiffies64_to_msecs---of 1
jiffies64_to_nsecs---of 1
jiffies_64_to_clock_t---of 1
jiffies_to_clock_t---of 1
jiffies_to_msecs100%of 1
jiffies_to_timespec64---of 1
jiffies_to_usecs---of 1
mktime64---of 1
ns_to_kernel_old_timeval---of 4
ns_to_timespec64---of 4
nsec_to_clock_t---of 1
nsecs_to_jiffies---of 1
nsecs_to_jiffies64---of 1
put_itimerspec64---of 3
put_old_itimerspec32---of 3
put_old_timespec32---of 1
put_old_timex32---of 7
put_timespec64---of 7
set_normalized_timespec6443%of 7
timespec64_add_safe---of 9
timespec64_to_jiffies---of 1
-----------
SUMMARY50%of 8

__is_visible_gfn_locked67%of 3
handle_l1_dte100%of 3
its_free_ite43%of 7
its_mmio_write_wi100%of 1
kvm_arch_allow_write_without_running_vcpu100%of 1
kvm_get_vcpu_by_id84%of 12
kvm_vgic_register_its_device---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
scan_its_table75%of 8
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
update_affinity25%of 8
update_lpi_config55%of 11
vgic_add_lpi37%of 19
vgic_enable_lpis16%of 13
vgic_its_alloc_collection---of 4
vgic_its_attr_regs_access100%of 8
vgic_its_check_id60%of 15
vgic_its_cmd_handle_mapi42%of 36
vgic_its_commit_v0---of 1
vgic_its_create50%of 6
vgic_its_destroy72%of 7
vgic_its_device_cmp---of 1
vgic_its_free_collection---of 15
vgic_its_free_collection_list28%of 18
vgic_its_free_device54%of 15
vgic_its_get_abi---of 1
vgic_its_get_attr57%of 16
vgic_its_has_attr100%of 8
vgic_its_inject_cached_translation29%of 28
vgic_its_inject_msi50%of 14
vgic_its_inv_lpi---of 1
vgic_its_invalidate_all_caches---of 18
vgic_its_invall75%of 8
vgic_its_ite_cmp100%of 1
vgic_its_process_commands38%of 124
vgic_its_resolve_lpi21%of 29
vgic_its_restore_dte67%of 9
vgic_its_restore_ite25%of 16
vgic_its_restore_tables_v075%of 27
vgic_its_save_tables_v052%of 29
vgic_its_set_attr72%of 28
vgic_mmio_read_its_baser100%of 4
vgic_mmio_read_its_cbaser100%of 1
vgic_mmio_read_its_creadr---of 1
vgic_mmio_read_its_ctlr100%of 1
vgic_mmio_read_its_cwriter100%of 1
vgic_mmio_read_its_idregs---of 3
vgic_mmio_read_its_iidr100%of 1
vgic_mmio_read_its_typer---of 1
vgic_mmio_uaccess_write_its_creadr---of 4
vgic_mmio_uaccess_write_its_iidr100%of 3
vgic_mmio_write_its_baser73%of 11
vgic_mmio_write_its_cbaser100%of 3
vgic_mmio_write_its_ctlr56%of 9
vgic_mmio_write_its_cwriter75%of 4
vgic_msi_to_its---of 7
vgic_register_its_iodev67%of 3
-----------
SUMMARY51%of 577

__anon_vma_prepare47%of 15
__check_safe_pte_update---of 14
__flush_tlb_range_nosync---of 41
__folio_large_mapcount_sanity_checks---of 33
__folio_mod_stat50%of 8
__folio_rmap_sanity_checks---of 22
__probestub_mm_migrate_pages---of 1
__probestub_mm_migrate_pages_start---of 1
__probestub_remove_migration_pte---of 1
__probestub_set_migration_pte---of 1
__probestub_tlb_flush---of 1
__put_anon_vma28%of 11
__rmap_walk_file---of 33
__set_ptes---of 10
__traceiter_mm_migrate_pages---of 4
__traceiter_mm_migrate_pages_start---of 4
__traceiter_remove_migration_pte---of 4
__traceiter_set_migration_pte---of 4
__traceiter_tlb_flush---of 4
anon_vma_clone44%of 23
anon_vma_ctor100%of 1
anon_vma_fork---of 13
flush_tlb_batched_pending50%of 4
folio_add_anon_rmap_pmd---of 91
folio_add_anon_rmap_ptes---of 110
folio_add_file_rmap_pmd---of 32
folio_add_file_rmap_ptes26%of 47
folio_add_file_rmap_pud---of 1
folio_add_new_anon_rmap17%of 73
folio_get_anon_vma---of 36
folio_large_mapcount---of 4
folio_lock_anon_vma_read---of 46
folio_mkclean---of 9
folio_move_anon_rmap---of 5
folio_not_mapped---of 4
folio_referenced---of 16
folio_referenced_one---of 93
folio_remove_rmap_pmd---of 35
folio_remove_rmap_ptes25%of 54
folio_remove_rmap_pud---of 1
folio_test_pmd_mappable---of 4
folio_try_share_anon_rmap_pte---of 59
hugetlb_add_anon_rmap---of 36
hugetlb_add_new_anon_rmap---of 20
hugetlb_remove_rmap---of 7
invalid_folio_referenced_vma---of 28
invalid_migration_vma---of 1
invalid_mkclean_vma---of 1
make_device_exclusive---of 57
mapping_wrprotect_range---of 3
mapping_wrprotect_range_one---of 1
mm_find_pmd37%of 19
mmu_notifier_invalidate_range_end---of 5
mmu_notifier_invalidate_range_start---of 3
page_address_in_vma---of 11
page_mkclean_one---of 4
page_vma_mkclean_one---of 39
perf_trace_migration_pte---of 6
perf_trace_mm_migrate_pages---of 6
perf_trace_mm_migrate_pages_start---of 6
perf_trace_tlb_flush---of 6
pfn_mkclean_range---of 9
pte_clear---of 3
pte_unmap---of 6
put_anon_vma---of 5
put_page---of 9
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
rmap_walk---of 8
rmap_walk_anon---of 33
rmap_walk_locked---of 8
set_tlb_ubc_flush_pending---of 16
swp_offset_pfn---of 5
trace_event_raw_event_migration_pte---of 7
trace_event_raw_event_mm_migrate_pages---of 7
trace_event_raw_event_mm_migrate_pages_start---of 7
trace_event_raw_event_tlb_flush---of 7
trace_raw_output_migration_pte---of 3
trace_raw_output_mm_migrate_pages---of 3
trace_raw_output_mm_migrate_pages_start---of 3
trace_raw_output_tlb_flush---of 3
try_to_migrate---of 14
try_to_migrate_one---of 238
try_to_unmap---of 3
try_to_unmap_flush---of 3
try_to_unmap_flush_dirty---of 4
try_to_unmap_one---of 256
unlink_anon_vmas63%of 27
-----------
SUMMARY32%of 282

ipvlan_add_addr---of 6
ipvlan_addr4_event---of 9
ipvlan_addr4_validator_event---of 7
ipvlan_addr6_event---of 9
ipvlan_addr6_validator_event---of 7
ipvlan_change_rx_flags---of 3
ipvlan_del_addr---of 4
ipvlan_device_event5%of 40
ipvlan_ethtool_get_drvinfo---of 1
ipvlan_ethtool_get_link_ksettings---of 1
ipvlan_ethtool_get_msglevel---of 1
ipvlan_ethtool_set_msglevel---of 1
ipvlan_fix_features---of 1
ipvlan_get_iflink---of 1
ipvlan_get_link_net---of 1
ipvlan_get_stats64---of 5
ipvlan_hard_header---of 6
ipvlan_init---of 20
ipvlan_link_delete---of 10
ipvlan_link_new---of 33
ipvlan_link_register---of 1
ipvlan_link_setup---of 1
ipvlan_nl_changelink---of 13
ipvlan_nl_fillinfo---of 9
ipvlan_nl_getsize---of 1
ipvlan_nl_validate---of 8
ipvlan_open---of 14
ipvlan_set_multicast_mac_filter---of 5
ipvlan_set_port_mode---of 14
ipvlan_start_xmit---of 10
ipvlan_stop---of 14
ipvlan_uninit---of 27
ipvlan_vlan_rx_add_vid---of 1
ipvlan_vlan_rx_kill_vid---of 1
list_add_tail_rcu---of 3
netdev_lock_cmp_fn---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY5%of 40

-----------
SUMMARY---of 0

__arm64_sys_capget---of 32
__arm64_sys_capset---of 28
cap_validate_magic---of 17
capable50%of 4
capable_wrt_inode_uidgid40%of 5
file_ns_capable---of 3
has_capability_noaudit---of 1
has_ns_capability---of 16
has_ns_capability_noaudit---of 16
ns_capable50%of 4
ns_capable_noaudit---of 4
ns_capable_setid---of 4
privileged_wrt_inode_uidgid---of 3
ptracer_capable---of 18
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY47%of 13

__cgroup_account_cputime67%of 3
__cgroup_account_cputime_field---of 9
__cgroup_rstat_lock---of 30
__cgroup_rstat_unlock---of 14
bpf_rstat_flush---of 1
cgroup_base_stat_cputime_show---of 5
cgroup_rstat_exit---of 6
cgroup_rstat_flush---of 85
cgroup_rstat_init---of 7
cgroup_rstat_updated26%of 51
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY28%of 54

call_blocking_lsm_notifier---of 1
inode_free_by_rcu---of 19
lsm_append---of 8
lsm_fill_user_ctx---of 14
register_blocking_lsm_notifier---of 1
security_audit_rule_free---of 19
security_audit_rule_init---of 27
security_audit_rule_known---of 27
security_audit_rule_match---of 27
security_bdev_alloc---of 31
security_bdev_free---of 20
security_bdev_setintegrity---of 27
security_binder_set_context_mgr---of 27
security_binder_transaction---of 27
security_binder_transfer_binder---of 27
security_binder_transfer_file---of 27
security_bpf---of 27
security_bpf_map---of 27
security_bpf_map_create---of 27
security_bpf_map_free---of 19
security_bpf_prog---of 27
security_bpf_prog_free---of 19
security_bpf_prog_load---of 27
security_bpf_token_capable---of 27
security_bpf_token_cmd---of 27
security_bpf_token_create---of 27
security_bpf_token_free---of 19
security_bprm_check---of 27
security_bprm_committed_creds---of 19
security_bprm_committing_creds---of 19
security_bprm_creds_for_exec---of 27
security_bprm_creds_from_file---of 27
security_capable38%of 27
security_capget---of 27
security_capset---of 27
security_create_user_ns---of 27
security_cred_alloc_blank---of 31
security_cred_free---of 20
security_cred_getlsmprop---of 19
security_cred_getsecid---of 19
security_current_getlsmprop_subj53%of 19
security_d_instantiate55%of 22
security_dentry_create_files_as---of 27
security_dentry_init_security---of 27
security_file_alloc36%of 31
security_file_fcntl---of 27
security_file_free53%of 21
security_file_ioctl38%of 27
security_file_ioctl_compat---of 27
security_file_lock---of 27
security_file_mprotect---of 27
security_file_open38%of 27
security_file_permission38%of 27
security_file_post_open38%of 27
security_file_receive---of 27
security_file_release53%of 19
security_file_send_sigiotask---of 27
security_file_set_fowner---of 19
security_file_truncate38%of 27
security_free_mnt_opts---of 20
security_fs_context_dup---of 27
security_fs_context_parse_param---of 35
security_fs_context_submount---of 27
security_getprocattr---of 41
security_getselfattr---of 39
security_ib_alloc_security---of 31
security_ib_endport_manage_subnet---of 27
security_ib_free_security---of 1
security_ib_pkey_access---of 27
security_inet_conn_established---of 19
security_inet_conn_request---of 27
security_inet_csk_clone---of 19
security_initramfs_populated---of 19
security_inode_alloc36%of 31
security_inode_copy_up---of 27
security_inode_copy_up_xattr---of 27
security_inode_create36%of 28
security_inode_follow_link---of 28
security_inode_free53%of 21
security_inode_get_acl---of 28
security_inode_getattr---of 28
security_inode_getlsmprop---of 19
security_inode_getsecctx---of 27
security_inode_getsecurity---of 28
security_inode_getxattr---of 28
security_inode_init_security28%of 48
security_inode_init_security_anon---of 27
security_inode_invalidate_secctx---of 19
security_inode_killpriv---of 27
security_inode_link---of 28
security_inode_listsecurity---of 28
security_inode_listxattr---of 28
security_inode_mkdir---of 28
security_inode_mknod---of 28
security_inode_need_killpriv38%of 27
security_inode_notifysecctx---of 27
security_inode_permission36%of 28
security_inode_post_create_tmpfile50%of 20
security_inode_post_remove_acl---of 20
security_inode_post_removexattr---of 20
security_inode_post_set_acl---of 20
security_inode_post_setattr50%of 20
security_inode_post_setxattr---of 20
security_inode_readlink---of 28
security_inode_remove_acl---of 28
security_inode_removexattr---of 57
security_inode_rename---of 59
security_inode_rmdir---of 28
security_inode_set_acl---of 28
security_inode_setattr36%of 28
security_inode_setintegrity---of 27
security_inode_setsecctx---of 27
security_inode_setsecurity---of 28
security_inode_setxattr---of 57
security_inode_symlink---of 28
security_inode_unlink---of 28
security_ipc_getlsmprop---of 19
security_ipc_permission---of 27
security_ismaclabel---of 27
security_kernel_act_as---of 27
security_kernel_create_files_as---of 27
security_kernel_load_data---of 27
security_kernel_module_request---of 27
security_kernel_post_load_data---of 27
security_kernel_post_read_file---of 27
security_kernel_read_file---of 27
security_kernfs_init_security---of 27
security_key_alloc---of 31
security_key_free---of 1
security_key_getsecurity---of 27
security_key_permission---of 27
security_key_post_create_or_update---of 19
security_locked_down---of 27
security_lsmprop_to_secctx---of 27
security_mmap_addr38%of 27
security_mmap_file23%of 62
security_move_mount---of 27
security_mptcp_add_subflow---of 27
security_msg_msg_alloc---of 31
security_msg_msg_free---of 19
security_msg_queue_alloc---of 31
security_msg_queue_associate---of 27
security_msg_queue_free---of 19
security_msg_queue_msgctl---of 27
security_msg_queue_msgrcv---of 27
security_msg_queue_msgsnd---of 27
security_netlink_send---of 27
security_path_chmod---of 28
security_path_chown---of 28
security_path_chroot---of 27
security_path_link---of 28
security_path_mkdir---of 28
security_path_mknod36%of 28
security_path_notify---of 27
security_path_post_mknod---of 20
security_path_rename---of 31
security_path_rmdir---of 28
security_path_symlink---of 28
security_path_truncate---of 28
security_path_unlink---of 28
security_perf_event_alloc36%of 31
security_perf_event_free---of 1
security_perf_event_open---of 27
security_perf_event_read---of 27
security_perf_event_write---of 27
security_post_notification---of 27
security_prepare_creds---of 31
security_ptrace_access_check---of 27
security_ptrace_traceme---of 27
security_quota_on---of 27
security_quotactl---of 27
security_release_secctx---of 19
security_req_classify_flow---of 19
security_sb_alloc---of 31
security_sb_clone_mnt_opts---of 27
security_sb_delete---of 19
security_sb_eat_lsm_opts---of 27
security_sb_free---of 19
security_sb_kern_mount---of 27
security_sb_mnt_opts_compat---of 27
security_sb_mount---of 27
security_sb_pivotroot---of 27
security_sb_remount---of 27
security_sb_set_mnt_opts---of 27
security_sb_show_options---of 27
security_sb_statfs---of 27
security_sb_umount---of 27
security_sctp_assoc_established---of 27
security_sctp_assoc_request---of 27
security_sctp_bind_connect---of 27
security_sctp_sk_clone---of 19
security_secctx_to_secid---of 27
security_secid_to_secctx---of 27
security_secmark_refcount_dec---of 19
security_secmark_refcount_inc---of 19
security_secmark_relabel_packet---of 27
security_sem_alloc---of 31
security_sem_associate---of 27
security_sem_free---of 19
security_sem_semctl---of 27
security_sem_semop---of 27
security_setprocattr---of 41
security_setselfattr---of 34
security_settime64---of 27
security_shm_alloc---of 31
security_shm_associate---of 27
security_shm_free---of 19
security_shm_shmat---of 27
security_shm_shmctl---of 27
security_sk_alloc---of 31
security_sk_classify_flow---of 19
security_sk_clone---of 19
security_sk_free---of 19
security_skb_classify_flow---of 28
security_sock_graft---of 19
security_sock_rcv_skb38%of 27
security_socket_accept---of 27
security_socket_bind---of 27
security_socket_connect---of 27
security_socket_create---of 27
security_socket_getpeername---of 27
security_socket_getpeersec_dgram---of 27
security_socket_getpeersec_stream---of 27
security_socket_getsockname---of 27
security_socket_getsockopt---of 27
security_socket_listen---of 27
security_socket_post_create---of 27
security_socket_recvmsg---of 27
security_socket_sendmsg---of 27
security_socket_setsockopt---of 27
security_socket_shutdown---of 27
security_socket_socketpair---of 27
security_syslog---of 27
security_task_alloc---of 31
security_task_fix_setgid---of 27
security_task_fix_setgroups---of 27
security_task_fix_setuid---of 27
security_task_free---of 19
security_task_getioprio---of 27
security_task_getlsmprop_obj---of 19
security_task_getpgid---of 27
security_task_getscheduler---of 27
security_task_getsid---of 27
security_task_kill---of 27
security_task_movememory---of 27
security_task_prctl---of 35
security_task_prlimit---of 27
security_task_setioprio---of 27
security_task_setnice---of 27
security_task_setpgid---of 27
security_task_setrlimit---of 27
security_task_setscheduler---of 27
security_task_to_inode---of 19
security_transfer_creds---of 19
security_tun_dev_alloc_security---of 31
security_tun_dev_attach---of 27
security_tun_dev_attach_queue---of 27
security_tun_dev_create---of 27
security_tun_dev_free_security---of 1
security_tun_dev_open---of 27
security_unix_may_send---of 27
security_unix_stream_connect---of 27
security_uring_allowed---of 27
security_uring_cmd---of 27
security_uring_override_creds---of 27
security_uring_sqpoll---of 27
security_vm_enough_memory_mm36%of 28
security_watch_key---of 27
security_xfrm_decode_session---of 27
security_xfrm_policy_alloc---of 27
security_xfrm_policy_clone---of 27
security_xfrm_policy_delete---of 27
security_xfrm_policy_free---of 19
security_xfrm_policy_lookup---of 27
security_xfrm_state_alloc---of 27
security_xfrm_state_alloc_acquire---of 27
security_xfrm_state_delete---of 27
security_xfrm_state_free---of 19
security_xfrm_state_pol_flow_match---of 11
unregister_blocking_lsm_notifier---of 1
-----------
SUMMARY38%of 728

-----------
SUMMARY---of 0

rcu_lock_acquire---of 2
rcu_lock_release---of 2
srcu_lock_acquire---of 2
srcu_lock_release---of 2
tomoyo_addprintf---of 1
tomoyo_check_profile---of 9
tomoyo_close_control---of 6
tomoyo_find_yesno---of 7
tomoyo_flush---of 28
tomoyo_init_policy_namespace---of 4
tomoyo_io_printf---of 5
tomoyo_numscan---of 5
tomoyo_open_control---of 25
tomoyo_parse_policy---of 10
tomoyo_poll_control---of 3
tomoyo_poll_query---of 5
tomoyo_print_name_union---of 9
tomoyo_print_number_union---of 3
tomoyo_print_number_union_nospace---of 7
tomoyo_profile100%of 1
tomoyo_read_control---of 20
tomoyo_read_domain---of 50
tomoyo_read_domain2---of 175
tomoyo_read_exception---of 119
tomoyo_read_manager---of 24
tomoyo_read_pid---of 22
tomoyo_read_profile---of 50
tomoyo_read_query---of 14
tomoyo_read_stat---of 19
tomoyo_read_version---of 3
tomoyo_same_manager---of 1
tomoyo_same_task_acl---of 1
tomoyo_set_group---of 9
tomoyo_supervisor6%of 77
tomoyo_update_stat---of 3
tomoyo_write_answer---of 10
tomoyo_write_control---of 77
tomoyo_write_domain---of 30
tomoyo_write_domain2---of 7
tomoyo_write_exception---of 15
tomoyo_write_manager---of 9
tomoyo_write_pid---of 1
tomoyo_write_profile---of 45
tomoyo_write_stat---of 8
tomoyo_write_task---of 6
-----------
SUMMARY7%of 78

-----------
SUMMARY---of 0

_inline_copy_to_user---of 7
copy_from_sockptr---of 11
memdup_sockptr_noprof---of 4
netdev_hold---of 6
netdev_put---of 6
raw_bind---of 24
raw_disable_allfilters---of 6
raw_enable_allfilters---of 12
raw_enable_filters---of 7
raw_getname---of 3
raw_getsockopt---of 53
raw_init---of 4
raw_notifier7%of 29
raw_rcv---of 32
raw_recvmsg---of 15
raw_release---of 33
raw_sendmsg---of 45
raw_setsockopt---of 57
raw_sock_no_ioctlcmd---of 1
-----------
SUMMARY7%of 29

collect_domain_accesses---of 10
current_check_access_path25%of 8
current_check_refer_path---of 32
find_rule---of 17
hook_file_alloc_security100%of 1
hook_file_free_security100%of 1
hook_file_ioctl12%of 18
hook_file_ioctl_compat---of 18
hook_file_open14%of 15
hook_file_set_fowner---of 28
hook_file_truncate67%of 3
hook_inode_free_security_rcu---of 3
hook_move_mount---of 7
hook_path_link---of 1
hook_path_mkdir---of 1
hook_path_mknod67%of 3
hook_path_rename---of 1
hook_path_rmdir---of 1
hook_path_symlink---of 1
hook_path_truncate---of 1
hook_path_unlink---of 1
hook_sb_delete---of 41
hook_sb_mount---of 7
hook_sb_pivotroot---of 7
hook_sb_remount---of 7
hook_sb_umount---of 7
is_access_to_paths_allowed---of 56
is_layer_masks_allowed---of 1
landlock_append_fs_rule---of 39
rcu_lock_acquire---of 2
rcu_lock_release---of 2
release_inode---of 8
scope_to_request---of 5
-----------
SUMMARY25%of 49

-----------
SUMMARY---of 0

_inline_copy_from_user---of 8
_inline_copy_to_user---of 7
_ip6mr_fill_mroute---of 1
copy_from_sockptr---of 4
copy_to_sockptr---of 11
dev_put---of 5
ip6_mr_forward---of 38
ip6_mr_input---of 15
ip6_mroute_getsockopt---of 14
ip6_mroute_setsockopt---of 60
ip6mr_cache_free_rcu---of 1
ip6mr_cache_report---of 52
ip6mr_cache_unresolved---of 16
ip6mr_compat_ioctl---of 38
ip6mr_destroy_unres---of 8
ip6mr_device_event20%of 10
ip6mr_dump---of 1
ip6mr_fib_lookup---of 3
ip6mr_fill_mroute---of 11
ip6mr_forward2---of 39
ip6mr_forward2_finish---of 16
ip6mr_free_table---of 4
ip6mr_get_route---of 53
ip6mr_get_table---of 14
ip6mr_hash_cmp---of 3
ip6mr_ioctl---of 34
ip6mr_mfc_add---of 144
ip6mr_mfc_delete---of 22
ip6mr_mr_table_iter---of 1
ip6mr_net_exit---of 1
ip6mr_net_exit_batch---of 4
ip6mr_net_init---of 17
ip6mr_new_table---of 5
ip6mr_new_table_set---of 3
ip6mr_rtm_dumproute---of 13
ip6mr_rtm_getroute---of 51
ip6mr_rule_action---of 8
ip6mr_rule_compare---of 1
ip6mr_rule_configure---of 1
ip6mr_rule_default---of 5
ip6mr_rule_fill---of 1
ip6mr_rule_match---of 1
ip6mr_rules_dump---of 1
ip6mr_rules_exit---of 9
ip6mr_seq_read---of 1
ip6mr_sk_done---of 14
ip6mr_vif_seq_show---of 7
ip6mr_vif_seq_start---of 16
ip6mr_vif_seq_stop---of 6
ipmr_do_expire_process---of 9
ipmr_expire_process---of 4
ipmr_mfc_seq_show---of 9
ipmr_mfc_seq_start---of 4
mif6_add---of 42
mif6_delete---of 33
mr6_netlink_event---of 4
mr_mfc_seq_stop---of 8
mroute6_is_socket---of 3
mroute_clean_tables---of 30
pim6_rcv---of 22
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
reg_vif_get_iflink---of 1
reg_vif_setup---of 1
reg_vif_xmit---of 24
rhltable_remove---of 78
rht_assign_unlock---of 7
rht_lock---of 12
rht_unlock---of 10
skb_tunnel_rx---of 7
vif_dev_read---of 6
-----------
SUMMARY20%of 10

__arm64_sys_close_range---of 28
__arm64_sys_dup---of 4
__arm64_sys_dup2---of 9
__arm64_sys_dup3---of 1
__f_unlock_pos100%of 1
__fget_files34%of 21
__fget_files_rcu---of 11
__file_ref_put50%of 6
__file_ref_put_badval---of 5
__free_fdtable---of 1
__get_unused_fd_flags---of 1
__put_unused_fd46%of 11
alloc_fd39%of 21
alloc_fdtable---of 8
close_fd---of 3
copy_fd_bitmaps---of 3
do_close_on_exec---of 14
do_dup2---of 22
dup_fd---of 36
exit_files---of 3
expand_files---of 26
f_dupfd---of 8
fd_install24%of 25
fdget60%of 5
fdget_pos50%of 10
fdget_raw75%of 4
fget100%of 1
fget_raw---of 1
fget_task---of 3
fget_task_next---of 28
file_close_fd100%of 1
file_close_fd_locked47%of 13
file_seek_cur_needs_f_lock---of 4
free_fdtable_rcu---of 1
get_close_on_exec---of 17
get_file_active---of 17
get_file_rcu---of 9
get_unused_fd_flags100%of 1
iterate_fd---of 20
ksys_dup3---of 12
put_files_struct---of 15
put_unused_fd100%of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
rcu_read_unlock_sched---of 8
receive_fd---of 15
receive_fd_replace---of 4
replace_fd---of 6
sane_fdtable_size---of 7
set_close_on_exec50%of 10
-----------
SUMMARY45%of 135

-----------
SUMMARY---of 0

__bpf_getsockopt---of 17
__bpf_redirect---of 41
__bpf_setsockopt---of 13
__bpf_skb_change_head---of 14
__bpf_skb_change_tail---of 23
__bpf_skb_load_bytes---of 9
__bpf_skb_store_bytes---of 15
__bpf_skc_lookup---of 20
__bpf_tx_skb---of 5
__bpf_xdp_load_bytes---of 14
__bpf_xdp_store_bytes---of 14
__get_filter---of 10
__ipv4_neigh_lookup_noref---of 12
__ipv6_neigh_lookup_noref_stub---of 12
__sk_attach_prog---of 14
_inline_copy_from_user---of 8
bpf_bind---of 7
bpf_clone_redirect---of 8
bpf_convert_ctx_access---of 89
bpf_convert_filter---of 141
bpf_convert_tstamp_read---of 3
bpf_convert_tstamp_write---of 3
bpf_csum_diff---of 5
bpf_csum_level---of 13
bpf_csum_update---of 3
bpf_dispatcher_xdp_func---of 1
bpf_dynptr_from_skb---of 3
bpf_dynptr_from_skb_rdonly---of 3
bpf_dynptr_from_xdp---of 4
bpf_flow_dissector_load_bytes---of 9
bpf_gen_ld_abs---of 8
bpf_get_cgroup_classid---of 9
bpf_get_cgroup_classid_curr---of 1
bpf_get_hash_recalc---of 3
bpf_get_listener_sock---of 8
bpf_get_netns_cookie---of 4
bpf_get_netns_cookie_sk_msg---of 4
bpf_get_netns_cookie_sock---of 3
bpf_get_netns_cookie_sock_addr---of 4
bpf_get_netns_cookie_sock_ops---of 4
bpf_get_route_realm---of 7
bpf_get_skb_set_tunnel_proto---of 7
bpf_get_socket_cookie---of 3
bpf_get_socket_cookie_sock---of 1
bpf_get_socket_cookie_sock_addr---of 1
bpf_get_socket_cookie_sock_ops---of 1
bpf_get_socket_ptr_cookie---of 4
bpf_get_socket_uid---of 8
bpf_helper_changes_pkt_data---of 25
bpf_ipv4_fib_lookup---of 69
bpf_ipv6_fib_lookup---of 41
bpf_l3_csum_replace---of 9
bpf_l4_csum_replace---of 15
bpf_lwt_in_push_encap---of 5
bpf_lwt_seg6_action---of 42
bpf_lwt_seg6_adjust_srh---of 16
bpf_lwt_seg6_store_bytes---of 15
bpf_lwt_xmit_push_encap---of 3
bpf_msg_apply_bytes---of 1
bpf_msg_cork_bytes---of 1
bpf_msg_pop_data---of 67
bpf_msg_pull_data---of 40
bpf_msg_push_data---of 63
bpf_noop_prologue---of 1
bpf_prepare_filter---of 81
bpf_prog_change_xdp---of 1
bpf_prog_create---of 5
bpf_prog_create_from_user---of 12
bpf_prog_destroy---of 4
bpf_push_seg6_encap---of 9
bpf_redirect---of 5
bpf_redirect_neigh---of 6
bpf_redirect_peer---of 5
bpf_run_sk_reuseport---of 8
bpf_search_tcp_opt---of 14
bpf_set_hash---of 1
bpf_set_hash_invalid---of 1
bpf_sk_ancestor_cgroup_id---of 11
bpf_sk_assign---of 19
bpf_sk_assign_tcp_reqsk---of 30
bpf_sk_base_func_proto---of 10
bpf_sk_cgroup_id---of 8
bpf_sk_fullsock---of 1
bpf_sk_getsockopt---of 6
bpf_sk_lookup---of 12
bpf_sk_lookup_assign---of 22
bpf_sk_lookup_tcp---of 1
bpf_sk_lookup_udp---of 1
bpf_sk_release---of 5
bpf_sk_setsockopt---of 6
bpf_skb_adjust_room---of 76
bpf_skb_ancestor_cgroup_id---of 11
bpf_skb_cgroup_classid---of 8
bpf_skb_cgroup_id---of 8
bpf_skb_change_head---of 1
bpf_skb_change_proto---of 23
bpf_skb_change_tail---of 1
bpf_skb_change_type---of 3
bpf_skb_check_mtu---of 14
bpf_skb_copy---of 8
bpf_skb_ecn_set_ce---of 46
bpf_skb_event_output---of 5
bpf_skb_fib_lookup---of 7
bpf_skb_get_nlattr---of 6
bpf_skb_get_nlattr_nest---of 8
bpf_skb_get_pay_offset---of 1
bpf_skb_get_tunnel_key---of 33
bpf_skb_get_tunnel_opt---of 22
bpf_skb_get_xfrm_state---of 6
bpf_skb_is_valid_access---of 119
bpf_skb_load_bytes---of 9
bpf_skb_load_bytes_relative---of 8
bpf_skb_load_helper_16---of 11
bpf_skb_load_helper_16_no_cache---of 11
bpf_skb_load_helper_32---of 11
bpf_skb_load_helper_32_no_cache---of 11
bpf_skb_load_helper_8---of 11
bpf_skb_load_helper_8_no_cache---of 11
bpf_skb_net_hdr_pop---of 16
bpf_skb_net_hdr_push---of 3
bpf_skb_pull_data---of 5
bpf_skb_set_tstamp---of 10
bpf_skb_set_tunnel_key---of 22
bpf_skb_set_tunnel_opt---of 22
bpf_skb_store_bytes---of 15
bpf_skb_under_cgroup---of 12
bpf_skb_vlan_pop---of 13
bpf_skb_vlan_push---of 15
bpf_skc_lookup_tcp---of 3
bpf_skc_to_mptcp_sock---of 1
bpf_skc_to_tcp6_sock---of 5
bpf_skc_to_tcp_request_sock---of 4
bpf_skc_to_tcp_sock---of 4
bpf_skc_to_tcp_timewait_sock---of 4
bpf_skc_to_udp6_sock---of 6
bpf_skc_to_unix_sock---of 4
bpf_sock_addr_getsockopt---of 6
bpf_sock_addr_set_sun_path---of 3
bpf_sock_addr_setsockopt---of 6
bpf_sock_addr_sk_lookup_tcp---of 10
bpf_sock_addr_sk_lookup_udp---of 10
bpf_sock_addr_skc_lookup_tcp---of 1
bpf_sock_common_is_valid_access---of 3
bpf_sock_convert_ctx_access---of 49
bpf_sock_destroy---of 5
bpf_sock_from_file---of 1
bpf_sock_is_valid_access---of 58
bpf_sock_ops_cb_flags_set---of 4
bpf_sock_ops_enable_tx_tstamp---of 4
bpf_sock_ops_get_syn---of 17
bpf_sock_ops_getsockopt---of 9
bpf_sock_ops_load_hdr_opt---of 22
bpf_sock_ops_reserve_hdr_opt---of 5
bpf_sock_ops_setsockopt---of 7
bpf_sock_ops_store_hdr_opt---of 12
bpf_sol_tcp_getsockopt---of 6
bpf_sol_tcp_setsockopt---of 14
bpf_tc_sk_lookup_tcp---of 10
bpf_tc_sk_lookup_udp---of 10
bpf_tc_skc_lookup_tcp---of 1
bpf_tcp_check_syncookie---of 23
bpf_tcp_gen_syncookie---of 16
bpf_tcp_raw_check_syncookie_ipv4---of 1
bpf_tcp_raw_check_syncookie_ipv6---of 1
bpf_tcp_raw_gen_syncookie_ipv4---of 4
bpf_tcp_raw_gen_syncookie_ipv6---of 4
bpf_tcp_sock---of 3
bpf_tcp_sock_convert_ctx_access---of 4
bpf_tcp_sock_is_valid_access---of 6
bpf_unlocked_sk_getsockopt---of 1
bpf_unlocked_sk_setsockopt---of 1
bpf_update_srh_state---of 3
bpf_warn_invalid_xdp_action---of 3
bpf_xdp_adjust_head---of 5
bpf_xdp_adjust_meta---of 5
bpf_xdp_adjust_tail---of 7
bpf_xdp_check_mtu---of 6
bpf_xdp_copy---of 1
bpf_xdp_copy_buf---of 6
bpf_xdp_event_output---of 7
bpf_xdp_fib_lookup---of 5
bpf_xdp_frags_increase_tail---of 10
bpf_xdp_frags_shrink_tail---of 14
bpf_xdp_get_buff_len---of 3
bpf_xdp_load_bytes---of 14
bpf_xdp_pointer---of 10
bpf_xdp_redirect---of 5
bpf_xdp_redirect_map---of 1
bpf_xdp_sk_lookup_tcp---of 10
bpf_xdp_sk_lookup_udp---of 10
bpf_xdp_skc_lookup_tcp---of 1
bpf_xdp_sock_convert_ctx_access---of 3
bpf_xdp_sock_is_valid_access---of 3
bpf_xdp_store_bytes---of 14
btf_id_cmp_func---of 1
cg_skb_func_proto---of 18
cg_skb_is_valid_access---of 66
convert_bpf_ld_abs---of 16
copy_bpf_fprog_from_user---of 9
flow_dissector_convert_ctx_access---of 5
flow_dissector_func_proto---of 11
flow_dissector_is_valid_access---of 25
init_subsystem---of 1
ip_neigh_gw4---of 12
ip_neigh_gw6---of 3
local_bh_disable---of 2
local_bh_enable---of 2
lwt_in_func_proto---of 3
lwt_is_valid_access---of 113
lwt_out_func_proto---of 19
lwt_seg6local_func_proto---of 3
lwt_xmit_func_proto---of 27
neigh_output---of 19
nexthop_num_path---of 10
pfn_valid---of 31
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
sk_attach_bpf---of 5
sk_attach_filter---of 6
sk_detach_filter30%of 10
sk_filter_charge---of 17
sk_filter_func_proto---of 16
sk_filter_is_valid_access---of 91
sk_filter_release_rcu---of 4
sk_filter_trim_cap18%of 41
sk_filter_uncharge---of 8
sk_get_filter---of 22
sk_lookup---of 12
sk_lookup_convert_ctx_access---of 43
sk_lookup_func_proto---of 13
sk_lookup_is_valid_access---of 69
sk_msg_convert_ctx_access---of 37
sk_msg_func_proto---of 23
sk_msg_is_valid_access---of 68
sk_reuseport_attach_bpf---of 16
sk_reuseport_attach_filter---of 8
sk_reuseport_convert_ctx_access---of 11
sk_reuseport_func_proto---of 7
sk_reuseport_is_valid_access---of 29
sk_reuseport_load_bytes---of 9
sk_reuseport_load_bytes_relative---of 8
sk_reuseport_prog_free---of 6
sk_select_reuseport---of 19
sk_skb_adjust_room---of 19
sk_skb_change_head---of 1
sk_skb_change_tail---of 1
sk_skb_convert_ctx_access---of 28
sk_skb_func_proto---of 25
sk_skb_is_valid_access---of 52
sk_skb_prologue---of 3
sk_skb_pull_data---of 5
skb_do_redirect---of 130
skb_frag_address---of 6
skb_postpull_rcsum---of 5
sock_addr_convert_ctx_access---of 49
sock_addr_func_proto---of 50
sock_addr_is_valid_access---of 119
sock_filter_func_proto---of 9
sock_filter_is_valid_access---of 36
sock_ops_convert_ctx_access---of 204
sock_ops_func_proto---of 25
sock_ops_is_valid_access---of 38
sol_socket_sockopt---of 23
sol_tcp_sockopt---of 34
tc_cls_act_btf_struct_access---of 3
tc_cls_act_convert_ctx_access---of 3
tc_cls_act_func_proto---of 61
tc_cls_act_is_valid_access---of 117
tc_cls_act_prologue---of 3
trace_xdp_redirect---of 14
trace_xdp_redirect_err---of 14
tracing_iter_filter---of 4
xdp_btf_struct_access---of 3
xdp_convert_ctx_access---of 8
xdp_do_check_flushed---of 16
xdp_do_flush---of 13
xdp_do_generic_redirect---of 21
xdp_do_redirect---of 22
xdp_do_redirect_frame---of 18
xdp_func_proto---of 33
xdp_is_valid_access---of 13
xdp_master_redirect---of 6
-----------
SUMMARY26%of 55

-----------
SUMMARY---of 0

__in_dev_get_rtnl_net---of 6
__inet_dev_addr_type---of 22
fib_add_ifaddr---of 17
fib_compute_spec_dst---of 19
fib_del_ifaddr---of 75
fib_disable_ip---of 8
fib_flush---of 7
fib_get_table---of 4
fib_gw_from_via---of 11
fib_inetaddr_event---of 8
fib_info_nh_uses_dev---of 10
fib_lookup---of 29
fib_magic---of 4
fib_modify_prefix_metric---of 6
fib_net_exit---of 1
fib_net_exit_batch---of 7
fib_net_init---of 10
fib_netdev_event10%of 31
fib_new_table---of 12
fib_unmerge---of 13
fib_validate_source---of 49
inet_addr_type---of 1
inet_addr_type_dev_table---of 1
inet_addr_type_table---of 1
inet_dev_addr_type---of 1
inet_dump_fib---of 33
inet_rtm_delroute---of 10
inet_rtm_newroute---of 6
ip_fib_net_exit---of 10
ip_rt_ioctl---of 68
ip_valid_fib_dump_req---of 33
l3mdev_fib_table---of 11
local_bh_disable---of 2
local_bh_enable---of 2
nexthop_fib_nhc---of 18
nexthop_num_path---of 10
nexthop_uses_dev---of 24
nl_fib_input---of 20
nlmsg_parse_deprecated_strict---of 4
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rtm_to_fib_config---of 47
-----------
SUMMARY10%of 31

__fsnotify_recalc_mask---of 16
fsnotify_add_mark---of 1
fsnotify_add_mark_locked---of 56
fsnotify_clear_marks_by_group---of 23
fsnotify_compare_groups---of 7
fsnotify_conn_mask---of 3
fsnotify_connector_destroy_workfn---of 4
fsnotify_destroy_mark---of 4
fsnotify_destroy_marks10%of 22
fsnotify_detach_connector_from_object---of 14
fsnotify_detach_mark---of 12
fsnotify_find_mark15%of 21
fsnotify_finish_user_wait---of 9
fsnotify_free_mark---of 4
fsnotify_get_mark---of 8
fsnotify_grab_connector31%of 13
fsnotify_init_mark---of 1
fsnotify_mark_destroy_workfn---of 8
fsnotify_prepare_user_wait---of 24
fsnotify_put_mark---of 31
fsnotify_recalc_mask---of 8
fsnotify_update_sb_watchers---of 21
fsnotify_wait_marks_destroyed---of 1
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
-----------
SUMMARY22%of 60

__kvm_timer_set_cntvoff100%of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__arm64_sys_clone---of 1
__arm64_sys_clone3---of 14
__arm64_sys_fork---of 1
__arm64_sys_set_tid_address---of 1
__arm64_sys_unshare---of 1
__arm64_sys_vfork---of 1
__cleanup_sighand---of 6
__delayed_free_task---of 1
__mas_set_range---of 14
__mmdrop---of 27
__mmput---of 19
__pidfd_prepare---of 4
__probestub_task_newtask---of 1
__probestub_task_prctl_unknown---of 1
__probestub_task_rename---of 1
__put_task_struct---of 14
__put_task_struct_rcu_cb---of 1
__traceiter_task_newtask---of 4
__traceiter_task_prctl_unknown---of 4
__traceiter_task_rename---of 4
_inline_copy_from_user---of 8
account_kernel_stack---of 9
clear_ti_thread_flag---of 3
copy_clone_args_from_user---of 22
copy_files---of 7
copy_fs---of 4
copy_mm---of 123
copy_oom_score_adj---of 4
copy_process---of 154
copy_seccomp---of 8
copy_sighand---of 9
copy_signal---of 4
create_io_thread---of 1
dup_task_struct---of 22
exec_mm_release---of 1
exit_mm_release---of 1
exit_task_stack_account---of 17
free_signal_struct---of 8
free_task---of 7
free_vm_stack_cache---of 5
get_mm_exe_file---of 11
get_task_exe_file---of 4
get_task_mm---of 5
get_user_ns---of 7
idle_dummy---of 1
kernel_clone---of 41
kernel_thread---of 1
ksys_unshare---of 47
list_add_tail_rcu---of 3
lockdep_tasklist_lock_is_held---of 1
memcg_charge_kernel_stack---of 46
mm_access---of 16
mm_alloc---of 3
mm_init---of 14
mm_release---of 12
mmdrop_async_fn---of 1
mmput---of 5
mmput_async---of 5
mmput_async_fn---of 1
nr_processes---of 4
perf_trace_task_newtask---of 6
perf_trace_task_prctl_unknown---of 6
perf_trace_task_rename---of 8
pidfd_prepare---of 6
posix_cputimers_init---of 1
ptrace_event_pid---of 21
ptrace_init_task---of 5
put_cred---of 5
put_task_stack50%of 8
put_task_struct---of 6
rcu_copy_process---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
refcount_inc---of 6
replace_mm_exe_file---of 41
set_mm_exe_file---of 17
set_task_stack_end_magic---of 1
sighand_ctor---of 1
syscall_tracepoint_update---of 5
sysctl_max_threads---of 3
task_io_accounting_init---of 1
thread_stack_free_rcu---of 3
trace_event_raw_event_task_newtask---of 7
trace_event_raw_event_task_prctl_unknown---of 7
trace_event_raw_event_task_rename---of 9
trace_raw_output_task_newtask---of 3
trace_raw_output_task_prctl_unknown---of 3
trace_raw_output_task_rename---of 3
trace_task_newtask---of 17
try_release_thread_stack_to_cache50%of 12
tty_kref_get---of 7
unshare_files---of 6
user_mode_thread---of 1
vm_area_alloc67%of 3
vm_area_dup23%of 9
vm_area_free34%of 9
walk_process_tree---of 8
-----------
SUMMARY42%of 41

__probestub_hugetlbfs_alloc_inode---of 1
__probestub_hugetlbfs_evict_inode---of 1
__probestub_hugetlbfs_fallocate---of 1
__probestub_hugetlbfs_free_inode---of 1
__probestub_hugetlbfs_setattr---of 1
__traceiter_hugetlbfs_alloc_inode---of 4
__traceiter_hugetlbfs_evict_inode---of 4
__traceiter_hugetlbfs_fallocate---of 4
__traceiter_hugetlbfs_free_inode---of 4
__traceiter_hugetlbfs_setattr---of 4
folio_put---of 6
hugetlb_file_setup27%of 19
hugetlb_get_unmapped_area60%of 5
hugetlb_vma_maps_pfn---of 8
hugetlb_vmdelete_list---of 7
hugetlbfs_alloc_inode50%of 6
hugetlbfs_create---of 4
hugetlbfs_destroy_inode67%of 3
hugetlbfs_error_remove_folio---of 1
hugetlbfs_evict_inode27%of 19
hugetlbfs_fallocate---of 55
hugetlbfs_file_mmap60%of 15
hugetlbfs_fill_super---of 11
hugetlbfs_free_inode---of 17
hugetlbfs_fs_context_free---of 1
hugetlbfs_get_inode22%of 32
hugetlbfs_get_tree---of 9
hugetlbfs_inc_free_inodes---of 3
hugetlbfs_init_fs_context---of 3
hugetlbfs_migrate_folio---of 4
hugetlbfs_mkdir---of 4
hugetlbfs_mknod---of 4
hugetlbfs_parse_param---of 17
hugetlbfs_put_super---of 4
hugetlbfs_read_iter---of 31
hugetlbfs_setattr---of 29
hugetlbfs_show_options---of 14
hugetlbfs_statfs---of 4
hugetlbfs_symlink---of 5
hugetlbfs_tmpfile---of 3
hugetlbfs_write_begin---of 1
hugetlbfs_zero_partial_page---of 50
init_once---of 1
perf_trace_hugetlbfs__inode---of 6
perf_trace_hugetlbfs_alloc_inode---of 7
perf_trace_hugetlbfs_fallocate---of 6
perf_trace_hugetlbfs_setattr---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remove_inode_hugepages5%of 61
trace_event_raw_event_hugetlbfs__inode---of 7
trace_event_raw_event_hugetlbfs_alloc_inode---of 8
trace_event_raw_event_hugetlbfs_fallocate---of 7
trace_event_raw_event_hugetlbfs_setattr---of 7
trace_raw_output_hugetlbfs__inode---of 3
trace_raw_output_hugetlbfs_alloc_inode---of 3
trace_raw_output_hugetlbfs_fallocate---of 3
trace_raw_output_hugetlbfs_setattr---of 3
-----------
SUMMARY24%of 160

-----------
SUMMARY---of 0

alloc_offload_ctx_tx---of 3
destroy_record---of 14
dev_put---of 6
get_netdev_for_sock---of 29
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sk_stream_moderate_sndbuf---of 4
tcp_inq---of 9
tls_append_frag---of 10
tls_dev_event4%of 66
tls_device_attach---of 9
tls_device_decrypted---of 52
tls_device_free_resources_tx---of 1
tls_device_offload_cleanup_rx---of 13
tls_device_push_pending_record---of 1
tls_device_reencrypt---of 50
tls_device_resync_rx---of 34
tls_device_rx_resync_async---of 19
tls_device_rx_resync_new_rec---of 19
tls_device_sendmsg---of 6
tls_device_sk_destruct---of 27
tls_device_splice_eof---of 4
tls_device_tx_del_task---of 15
tls_device_write_space---of 4
tls_get_record---of 24
tls_offload_tx_resync_request---of 18
tls_push_data---of 126
tls_set_device_offload---of 26
tls_set_device_offload_rx---of 14
tls_tcp_clean_acked---of 11
trace_tls_device_offload_set---of 14
trace_tls_device_rx_resync_nh_delay---of 14
-----------
SUMMARY4%of 66

__forget_cached_acl---of 7
__get_acl8%of 25
__posix_acl_chmod---of 26
__posix_acl_create---of 15
do_get_acl---of 16
do_set_acl---of 9
forget_all_cached_acls---of 1
forget_cached_acl---of 4
get_cached_acl19%of 33
get_cached_acl_rcu---of 11
get_inode_acl---of 1
posix_acl_alloc---of 3
posix_acl_chmod---of 12
posix_acl_clone---of 4
posix_acl_create12%of 17
posix_acl_create_masq---of 13
posix_acl_equiv_mode---of 14
posix_acl_from_mode---of 6
posix_acl_from_xattr---of 23
posix_acl_init---of 1
posix_acl_listxattr---of 8
posix_acl_permission---of 21
posix_acl_release---of 7
posix_acl_to_xattr---of 9
posix_acl_update_mode---of 13
posix_acl_valid---of 22
posix_acl_xattr_list---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
set_cached_acl32%of 16
set_posix_acl---of 10
simple_acl_create22%of 14
simple_set_acl---of 6
vfs_get_acl---of 8
vfs_remove_acl---of 35
vfs_set_acl---of 46
-----------
SUMMARY21%of 109

__arm64_sys_readahead---of 9
file_ra_state_init100%of 1
force_page_cache_ra---of 10
ksys_readahead---of 9
page_cache_async_ra---of 27
page_cache_ra_order13%of 31
page_cache_ra_unbounded34%of 24
page_cache_sync_ra---of 26
rcu_lock_acquire---of 2
rcu_lock_release---of 2
read_pages23%of 22
readahead_expand---of 38
readahead_folio16%of 13
-----------
SUMMARY22%of 91

ida_alloc_range17%of 30
ida_destroy---of 20
ida_find_first_range---of 8
ida_free30%of 10
idr_alloc---of 6
idr_alloc_cyclic---of 12
idr_alloc_u32---of 5
idr_find100%of 1
idr_for_each23%of 9
idr_get_next---of 13
idr_get_next_ul---of 11
idr_remove---of 1
idr_replace---of 4
-----------
SUMMARY22%of 50

rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
rxe_find_route---of 68
rxe_init_packet---of 18
rxe_net_add---of 4
rxe_net_exit---of 5
rxe_net_init---of 15
rxe_notify15%of 14
rxe_parent_name---of 6
rxe_port_down---of 3
rxe_port_up---of 1
rxe_prepare---of 30
rxe_set_port_state---of 9
rxe_skb_tx_dtor---of 16
rxe_udp_encap_recv---of 10
rxe_xmit_packet---of 43
-----------
SUMMARY15%of 14

-----------
SUMMARY---of 0

__cpu_disable---of 10
__cpu_try_die---of 4
__cpu_up---of 19
acpi_cpu_get_madt_gicc---of 1
arch_cpuhp_cleanup_dead_cpu---of 6
arch_irq_work_raise100%of 1
arch_register_cpu---of 9
arch_send_call_function_ipi_mask---of 1
arch_send_call_function_single_ipi---of 1
arch_show_interrupts---of 8
arch_smp_send_reschedule---of 1
arch_trigger_cpumask_backtrace---of 1
arch_unregister_cpu---of 10
arm64_backtrace_ipi---of 1
cpu_die_early---of 5
cpus_are_stuck_in_kernel---of 9
crash_smp_send_stop---of 3
get_irq_regs---of 1
ipi_cpu_crash_stop---of 1
ipi_handler---of 40
ipi_setup---of 5
local_cpu_stop---of 1
local_daif_mask---of 1
local_daif_restore---of 1
secondary_start_kernel---of 7
smp_crash_stop_failed---of 3
smp_cross_call29%of 14
smp_send_stop---of 25
tick_broadcast---of 1
-----------
SUMMARY34%of 15

-----------
SUMMARY---of 0

__memblock_dump_all---of 1
__next_mem_pfn_range---of 10
__next_mem_range---of 38
__next_mem_range_rev---of 41
memblock_add---of 3
memblock_add_node---of 3
memblock_add_range---of 31
memblock_addrs_overlap---of 1
memblock_clear_hotplug---of 5
memblock_clear_nomap---of 5
memblock_debug_open---of 1
memblock_debug_show---of 13
memblock_double_array---of 25
memblock_dump---of 6
memblock_dump_all---of 3
memblock_end_of_DRAM---of 1
memblock_find_in_range_node---of 13
memblock_free---of 3
memblock_get_current_limit---of 1
memblock_has_mirror---of 1
memblock_is_map_memory34%of 6
memblock_is_memory---of 6
memblock_is_region_memory---of 6
memblock_is_region_reserved---of 4
memblock_is_reserved---of 6
memblock_isolate_range---of 15
memblock_mark_hotplug---of 5
memblock_mark_mirror---of 6
memblock_mark_nomap---of 5
memblock_merge_regions---of 10
memblock_overlaps_region---of 4
memblock_phys_free---of 7
memblock_phys_mem_size---of 1
memblock_remove---of 7
memblock_remove_range---of 5
memblock_remove_region---of 4
memblock_reserve---of 3
memblock_reserved_mark_noinit---of 5
memblock_reserved_size---of 1
memblock_search_pfn_nid---of 6
memblock_set_current_limit---of 1
memblock_set_node---of 5
memblock_start_of_DRAM---of 1
memblock_trim_memory---of 7
memblock_validate_numa_coverage---of 12
reserve_mem_find_by_name---of 8
reserve_mem_release_by_name---of 9
-----------
SUMMARY34%of 6

-----------
SUMMARY---of 0

__bitmap_and---of 6
__bitmap_andnot---of 6
__bitmap_clear91%of 11
__bitmap_complement---of 4
__bitmap_equal---of 8
__bitmap_intersects---of 8
__bitmap_or---of 4
__bitmap_or_equal---of 7
__bitmap_replace---of 4
__bitmap_set91%of 11
__bitmap_shift_left---of 8
__bitmap_shift_right---of 8
__bitmap_subset---of 8
__bitmap_weight---of 6
__bitmap_weight_and---of 6
__bitmap_weight_andnot---of 6
__bitmap_xor---of 4
bitmap_alloc100%of 1
bitmap_alloc_node---of 1
bitmap_bitremap---of 16
bitmap_cut---of 11
bitmap_find_next_zero_area_off---of 4
bitmap_fold---of 5
bitmap_free100%of 1
bitmap_from_arr32---of 8
bitmap_onto---of 6
bitmap_remap---of 21
bitmap_to_arr32---of 8
bitmap_zalloc---of 1
bitmap_zalloc_node---of 1
devm_bitmap_alloc---of 4
devm_bitmap_free---of 1
devm_bitmap_zalloc---of 4
-----------
SUMMARY92%of 24

-----------
SUMMARY---of 0

nsim_dev_hwstats_do_write---of 36
nsim_dev_hwstats_exit---of 11
nsim_dev_hwstats_init---of 5
nsim_dev_hwstats_traffic_work---of 6
nsim_dev_netdevice_event8%of 28
-----------
SUMMARY8%of 28

alloc_etherdev_mqs---of 1
arch_get_platform_mac_address---of 1
device_get_ethdev_address---of 3
device_get_mac_address---of 1
eth_commit_mac_addr_change---of 1
eth_get_headlen---of 4
eth_gro_complete---of 6
eth_gro_receive---of 21
eth_header---of 9
eth_header_cache---of 3
eth_header_cache_update---of 1
eth_header_parse---of 3
eth_header_parse_protocol---of 3
eth_mac_addr---of 7
eth_platform_get_mac_address---of 4
eth_prepare_mac_addr_change---of 6
eth_type_trans48%of 19
eth_validate_addr---of 3
ether_setup---of 1
fwnode_get_mac_address---of 13
nvmem_get_mac_address---of 7
platform_get_ethdev_address---of 4
skb_header_pointer50%of 4
sysfs_format_mac---of 1
-----------
SUMMARY48%of 23

__get_intid_range3%of 100
gic_check_rdist---of 3
gic_cpu_init---of 20
gic_cpu_pm_notifier---of 11
gic_cpu_sys_reg_init---of 32
gic_dist_wait_for_rwp---of 7
gic_enable_quirk_arm64_2941627---of 1
gic_enable_quirk_asr8601---of 1
gic_enable_quirk_cavium_38539---of 1
gic_enable_quirk_hip06_07---of 3
gic_enable_quirk_msm8996---of 1
gic_enable_quirk_nvidia_t241---of 22
gic_enable_quirk_rk3399---of 3
gic_enable_redist---of 12
gic_eoi_irq---of 9
gic_eoimode1_eoi_irq---of 11
gic_eoimode1_mask_irq---of 3
gic_handle_irq---of 12
gic_ipi_send_mask37%of 19
gic_irq_domain_alloc---of 16
gic_irq_domain_free---of 4
gic_irq_domain_select---of 13
gic_irq_domain_translate---of 23
gic_irq_get_irqchip_state---of 6
gic_irq_nmi_setup---of 1
gic_irq_nmi_teardown---of 1
gic_irq_set_irqchip_state38%of 8
gic_irq_set_vcpu_affinity---of 3
gic_mask_irq---of 10
gic_of_iomap---of 5
gic_peek_irq17%of 18
gic_poke_irq18%of 17
gic_redist_wait_for_rwp---of 7
gic_request_region---of 4
gic_retrigger---of 3
gic_set_affinity---of 35
gic_set_type---of 25
gic_starting_cpu---of 7
gic_unmask_irq---of 1
gic_v3_get_gsi_domain_id---of 1
partition_domain_translate---of 6
rd_set_non_coherent---of 1
-----------
SUMMARY12%of 162

-----------
SUMMARY---of 0

__tlb_remove_folio_pages100%of 1
__tlb_remove_folio_pages_size58%of 14
__tlb_remove_page_size---of 1
__tlb_remove_table---of 13
tlb_finish_mmu75%of 8
tlb_flush_mmu44%of 16
tlb_flush_mmu_tlbonly21%of 58
tlb_flush_rmaps69%of 19
tlb_gather_mmu60%of 5
tlb_gather_mmu_fullmm---of 3
tlb_remove_table67%of 6
tlb_remove_table_rcu---of 4
tlb_remove_table_smp_sync---of 1
tlb_remove_table_sync_one---of 1
-----------
SUMMARY43%of 127

__arm64_sys_getcwd---of 31
__d_path---of 6
__dentry_path39%of 26
d_absolute_path67%of 6
d_path34%of 27
dentry_path---of 6
dentry_path_raw67%of 3
dynamic_dname67%of 3
prepend34%of 6
prepend_path37%of 49
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
seqcount_lockdep_reader_access60%of 5
simple_dname67%of 3
-----------
SUMMARY43%of 132

-----------
SUMMARY---of 0

list_add_tail_rcu---of 3
rcu_lock_acquire---of 2
rcu_lock_release---of 2
tomoyo_assign_domain---of 26
tomoyo_assign_namespace---of 28
tomoyo_check_acl62%of 13
tomoyo_dump_page---of 55
tomoyo_find_next_domain---of 79
tomoyo_same_transition_control---of 5
tomoyo_update_domain---of 24
tomoyo_update_policy---of 12
tomoyo_write_aggregator---of 23
tomoyo_write_transition_control---of 22
-----------
SUMMARY62%of 13

__arm64_compat_sys_newfstat---of 8
__arm64_compat_sys_newfstatat---of 3
__arm64_compat_sys_newlstat---of 3
__arm64_compat_sys_newstat---of 3
__arm64_sys_fstat64---of 8
__arm64_sys_fstatat64---of 3
__arm64_sys_lstat64---of 3
__arm64_sys_newfstat---of 8
__arm64_sys_newfstatat---of 3
__arm64_sys_newlstat---of 3
__arm64_sys_newstat---of 3
__arm64_sys_readlink---of 1
__arm64_sys_readlinkat---of 1
__arm64_sys_stat64---of 3
__arm64_sys_statx---of 8
__inode_add_bytes---of 3
__inode_sub_bytes---of 3
cp_compat_stat---of 14
cp_new_stat---of 7
cp_new_stat64---of 7
cp_statx---of 7
do_readlinkat---of 15
do_statx---of 4
do_statx_fd---of 13
fill_mg_cmtime---of 18
generic_fill_statx_atomic_writes---of 3
generic_fill_statx_attr---of 5
generic_fillattr---of 6
inode_add_bytes67%of 3
inode_get_bytes100%of 1
inode_set_bytes---of 1
inode_sub_bytes67%of 3
vfs_fstat---of 6
vfs_fstatat---of 10
vfs_getattr---of 3
vfs_getattr_nosec---of 12
vfs_statx---of 17
-----------
SUMMARY72%of 7

-----------
SUMMARY---of 0

__kvm_pgtable_stage2_init67%of 3
__kvm_pgtable_walk67%of 39
hyp_free_walker---of 6
hyp_map_walker---of 16
hyp_unmap_walker---of 14
kvm_get_vtcr40%of 10
kvm_pgtable_get_leaf50%of 6
kvm_pgtable_hyp_destroy---of 9
kvm_pgtable_hyp_init---of 4
kvm_pgtable_hyp_map---of 16
kvm_pgtable_hyp_pte_prot---of 9
kvm_pgtable_hyp_unmap---of 3
kvm_pgtable_stage2_create_unlinked---of 9
kvm_pgtable_stage2_destroy34%of 9
kvm_pgtable_stage2_flush---of 5
kvm_pgtable_stage2_free_unlinked---of 5
kvm_pgtable_stage2_map50%of 6
kvm_pgtable_stage2_mkyoung---of 3
kvm_pgtable_stage2_pgd_size---of 1
kvm_pgtable_stage2_pte_prot---of 3
kvm_pgtable_stage2_relax_perms34%of 9
kvm_pgtable_stage2_set_owner---of 3
kvm_pgtable_stage2_split---of 1
kvm_pgtable_stage2_test_clear_young---of 3
kvm_pgtable_stage2_unmap19%of 16
kvm_pgtable_stage2_wrprotect100%of 1
kvm_pgtable_walk43%of 19
kvm_tlb_flush_vmid_range34%of 12
leaf_walker100%of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
stage2_age_walker---of 8
stage2_attr_walker47%of 13
stage2_flush_walker---of 10
stage2_free_walker34%of 6
stage2_make_pte---of 5
stage2_map_walker29%of 25
stage2_map_walker_try_leaf26%of 39
stage2_set_prot_attr19%of 16
stage2_split_walker---of 14
stage2_try_break_pte18%of 29
stage2_unmap_walker34%of 36
-----------
SUMMARY37%of 299

-----------
SUMMARY---of 0

create_rule---of 16
free_ruleset---of 16
free_ruleset_work---of 1
inherit_tree---of 11
insert_rule---of 25
landlock_create_ruleset---of 12
landlock_find_rule---of 9
landlock_get_fs_access_mask---of 1
landlock_get_net_access_mask---of 1
landlock_init_layer_masks---of 10
landlock_insert_rule---of 1
landlock_merge_ruleset---of 34
landlock_put_ruleset---of 7
landlock_put_ruleset_deferred29%of 7
landlock_unmask_layers---of 11
merge_tree---of 13
-----------
SUMMARY29%of 7

-----------
SUMMARY---of 0

__percpu_counter_compare---of 7
__percpu_counter_init_many---of 8
__percpu_counter_limited_add31%of 33
__percpu_counter_sum---of 4
compute_batch_value---of 1
percpu_counter_add_batch84%of 12
percpu_counter_cpu_dead---of 4
percpu_counter_destroy_many---of 16
percpu_counter_fixup_free---of 3
percpu_counter_set---of 4
percpu_counter_sync---of 1
-----------
SUMMARY45%of 45

mls_compute_context_len10%of 21
mls_compute_sid8%of 26
mls_context_cpy---of 4
mls_context_cpy_high---of 4
mls_context_cpy_low---of 4
mls_context_glblub---of 5
mls_context_isvalid19%of 11
mls_context_to_sid---of 34
mls_convert_context---of 19
mls_export_netlbl_cat---of 5
mls_export_netlbl_lvl---of 3
mls_from_string---of 4
mls_import_netlbl_cat---of 4
mls_import_netlbl_lvl---of 3
mls_level_isvalid---of 5
mls_range_isvalid---of 11
mls_range_set---of 3
mls_setup_user_range---of 23
mls_sid_to_context10%of 21
-----------
SUMMARY11%of 79

__text_poke---of 4
aarch64_insn_patch_text---of 3
aarch64_insn_patch_text_cb---of 11
aarch64_insn_patch_text_nosync50%of 4
aarch64_insn_read---of 3
aarch64_insn_write---of 1
patch_map6%of 36
patch_unmap---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
text_poke_memcpy---of 1
text_poke_memset---of 1
-----------
SUMMARY10%of 40

llc_add_pack---of 3
llc_rcv25%of 57
llc_remove_pack---of 3
llc_set_station_handler---of 3
-----------
SUMMARY25%of 57

__shmem_file_setup37%of 11
casefold_show---of 1
cond_resched_rcu---of 11
folio_address---of 5
folio_large_mapcount---of 4
folio_nr_pages---of 4
folio_put---of 6
folio_swap---of 8
folio_test_uptodate---of 4
pfn_valid23%of 31
put_swap_device---of 18
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
shmem_add_to_page_cache42%of 34
shmem_alloc_and_add_folio13%of 74
shmem_alloc_inode100%of 1
shmem_allowable_huge_orders38%of 16
shmem_charge---of 3
shmem_create100%of 1
shmem_destroy_inode60%of 5
shmem_enabled_show---of 1
shmem_enabled_store---of 9
shmem_encode_fh---of 5
shmem_error_remove_folio---of 1
shmem_evict_inode34%of 24
shmem_falloc_wait---of 22
shmem_fallocate---of 39
shmem_fault46%of 11
shmem_fh_to_dentry---of 5
shmem_file_llseek---of 5
shmem_file_open100%of 1
shmem_file_read_iter---of 42
shmem_file_setup---of 1
shmem_file_setup_with_mnt---of 1
shmem_file_splice_read---of 37
shmem_file_write_iter60%of 5
shmem_fileattr_get100%of 1
shmem_fileattr_set90%of 10
shmem_fill_super---of 27
shmem_free_fc---of 4
shmem_free_in_core_inode---of 3
shmem_free_swap---of 3
shmem_get_dquots100%of 1
shmem_get_folio---of 1
shmem_get_folio_gfp29%of 100
shmem_get_inode36%of 37
shmem_get_link---of 25
shmem_get_offset_ctx---of 1
shmem_get_parent---of 1
shmem_get_partial_folio14%of 15
shmem_get_policy100%of 1
shmem_get_tree---of 1
shmem_get_unmapped_area14%of 30
shmem_getattr---of 13
shmem_hpage_pmd_enabled---of 6
shmem_huge_global_enabled12%of 17
shmem_init_fs_context---of 3
shmem_init_inode---of 1
shmem_initxattrs19%of 16
shmem_inode_acct_blocks46%of 11
shmem_kernel_file_setup---of 1
shmem_link---of 13
shmem_listxattr---of 1
shmem_lock---of 8
shmem_mapping100%of 1
shmem_mapping_size_orders---of 7
shmem_match---of 3
shmem_mfill_atomic_pte---of 48
shmem_mkdir---of 3
shmem_mknod32%of 16
shmem_mmap100%of 3
shmem_next_opt---of 5
shmem_parse_huge---of 9
shmem_parse_monolithic---of 1
shmem_parse_one---of 48
shmem_parse_opt_casefold---of 7
shmem_partial_swap_usage---of 21
shmem_put_link---of 6
shmem_put_super---of 5
shmem_read_folio_gfp---of 3
shmem_read_mapping_page_gfp---of 14
shmem_recalc_inode100%of 6
shmem_reconfigure---of 50
shmem_rename2---of 13
shmem_replace_folio---of 51
shmem_rmdir---of 3
shmem_set_inode_flags20%of 10
shmem_set_policy---of 1
shmem_setattr38%of 45
shmem_show_options---of 38
shmem_split_large_entry---of 12
shmem_statfs---of 5
shmem_swap_usage---of 5
shmem_swapin_folio---of 130
shmem_symlink---of 17
shmem_tmpfile38%of 8
shmem_truncate_range100%of 1
shmem_uncharge---of 1
shmem_undo_range16%of 76
shmem_unlink---of 7
shmem_unlock_mapping---of 7
shmem_unuse---of 49
shmem_unused_huge_count---of 1
shmem_unused_huge_scan---of 3
shmem_unused_huge_shrink---of 53
shmem_write_begin17%of 18
shmem_write_end53%of 21
shmem_writepage---of 72
shmem_xattr_handler_get100%of 1
shmem_xattr_handler_set---of 12
shmem_zero_setup50%of 4
synchronous_wake_function---of 3
thpsize_shmem_enabled_show---of 5
thpsize_shmem_enabled_store---of 45
vma_is_anon_shmem100%of 1
vma_is_shmem---of 1
xas_next_entry---of 15
zero_pipe_buf_get---of 1
zero_pipe_buf_release---of 1
zero_pipe_buf_try_steal---of 1
zero_user_segments50%of 18
-----------
SUMMARY32%of 686

__ksm_enter---of 31
__ksm_exit---of 46
__probestub_ksm_advisor---of 1
__probestub_ksm_enter---of 1
__probestub_ksm_exit---of 1
__probestub_ksm_merge_one_page---of 1
__probestub_ksm_merge_with_ksm_page---of 1
__probestub_ksm_remove_ksm_page---of 1
__probestub_ksm_remove_rmap_item---of 1
__probestub_ksm_start_scan---of 1
__probestub_ksm_stop_scan---of 1
__set_ptes---of 23
__stable_node_chain---of 50
__traceiter_ksm_advisor---of 4
__traceiter_ksm_enter---of 4
__traceiter_ksm_exit---of 4
__traceiter_ksm_merge_one_page---of 4
__traceiter_ksm_merge_with_ksm_page---of 4
__traceiter_ksm_remove_ksm_page---of 4
__traceiter_ksm_remove_rmap_item---of 4
__traceiter_ksm_start_scan---of 4
__traceiter_ksm_stop_scan---of 4
advisor_max_cpu_show---of 1
advisor_max_cpu_store---of 3
advisor_max_pages_to_scan_show---of 1
advisor_max_pages_to_scan_store---of 3
advisor_min_pages_to_scan_show---of 1
advisor_min_pages_to_scan_store---of 3
advisor_mode_show---of 1
advisor_mode_store---of 6
advisor_target_scan_time_show---of 1
advisor_target_scan_time_store---of 4
break_cow---of 16
break_ksm---of 23
calc_checksum---of 35
collect_procs_ksm---of 24
folio_get---of 4
folio_large_mapcount---of 4
folio_migrate_ksm---of 20
folio_set_stable_node---of 13
full_scans_show---of 1
general_profit_show---of 1
ksm_add_vma19%of 11
ksm_add_vmas---of 13
ksm_del_vmas---of 18
ksm_disable---of 10
ksm_disable_merge_any---of 5
ksm_do_scan---of 498
ksm_enable_merge_any---of 7
ksm_get_folio---of 54
ksm_madvise---of 17
ksm_memory_callback---of 28
ksm_might_need_to_copy---of 28
ksm_process_mergeable---of 7
ksm_process_profit---of 1
ksm_scan_thread---of 28
ksm_zero_pages_show---of 1
max_page_sharing_show---of 1
max_page_sharing_store---of 8
merge_across_nodes_show---of 1
merge_across_nodes_store---of 11
mmu_notifier_invalidate_range_start---of 3
pages_scanned_show---of 1
pages_shared_show---of 1
pages_sharing_show---of 1
pages_skipped_show---of 1
pages_to_scan_show---of 1
pages_to_scan_store---of 4
pages_unshared_show---of 1
pages_volatile_show---of 1
perf_trace_ksm_advisor---of 6
perf_trace_ksm_enter_exit_template---of 6
perf_trace_ksm_merge_one_page---of 6
perf_trace_ksm_merge_with_ksm_page---of 6
perf_trace_ksm_remove_ksm_page---of 6
perf_trace_ksm_remove_rmap_item---of 6
perf_trace_ksm_scan_template---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remove_all_stable_nodes---of 22
remove_node_from_stable_tree---of 52
remove_rmap_item_from_tree---of 18
remove_stable_node---of 25
replace_page---of 73
rmap_walk_ksm---of 26
run_show---of 1
run_store---of 48
sleep_millisecs_show---of 1
sleep_millisecs_store---of 3
smart_scan_show---of 1
smart_scan_store---of 3
split_huge_page---of 5
stable_node_chains_prune_millisecs_show---of 1
stable_node_chains_prune_millisecs_store---of 3
stable_node_chains_show---of 1
stable_node_dups_show---of 1
trace_event_raw_event_ksm_advisor---of 7
trace_event_raw_event_ksm_enter_exit_template---of 7
trace_event_raw_event_ksm_merge_one_page---of 7
trace_event_raw_event_ksm_merge_with_ksm_page---of 7
trace_event_raw_event_ksm_remove_ksm_page---of 7
trace_event_raw_event_ksm_remove_rmap_item---of 7
trace_event_raw_event_ksm_scan_template---of 7
trace_raw_output_ksm_advisor---of 3
trace_raw_output_ksm_enter_exit_template---of 3
trace_raw_output_ksm_merge_one_page---of 3
trace_raw_output_ksm_merge_with_ksm_page---of 3
trace_raw_output_ksm_remove_ksm_page---of 3
trace_raw_output_ksm_remove_rmap_item---of 3
trace_raw_output_ksm_scan_template---of 3
try_to_merge_one_page---of 140
try_to_merge_with_ksm_page---of 30
use_zero_pages_show---of 1
use_zero_pages_store---of 3
wait_while_offlining---of 6
-----------
SUMMARY19%of 11

-----------
SUMMARY---of 0

__collapse_huge_page_isolate---of 85
__khugepaged_enter---of 15
__khugepaged_exit---of 23
__probestub_mm_collapse_huge_page---of 1
__probestub_mm_collapse_huge_page_isolate---of 1
__probestub_mm_collapse_huge_page_swapin---of 1
__probestub_mm_khugepaged_collapse_file---of 1
__probestub_mm_khugepaged_scan_file---of 1
__probestub_mm_khugepaged_scan_pmd---of 1
__traceiter_mm_collapse_huge_page---of 4
__traceiter_mm_collapse_huge_page_isolate---of 4
__traceiter_mm_collapse_huge_page_swapin---of 4
__traceiter_mm_khugepaged_collapse_file---of 4
__traceiter_mm_khugepaged_scan_file---of 4
__traceiter_mm_khugepaged_scan_pmd---of 4
add_mm_counter---of 1
alloc_charge_folio---of 33
alloc_sleep_millisecs_show---of 1
alloc_sleep_millisecs_store---of 3
collapse_pte_mapped_thp---of 66
collect_mm_slot---of 14
count_vm_event---of 4
current_is_khugepaged---of 1
defrag_show---of 1
defrag_store---of 1
filemap_nr_thps_dec---of 7
find_pmd_or_thp_or_none---of 7
flush_tlb_mm---of 5
folio_large_mapcount---of 4
folio_order---of 4
folio_put---of 6
folio_ref_sub---of 3
full_scans_show---of 1
hpage_collapse_scan_file---of 299
hpage_collapse_scan_pmd---of 280
hugepage_madvise---of 4
hugepage_vma_revalidate---of 21
is_refcount_suitable---of 14
khugepaged---of 111
khugepaged_alloc_sleep---of 3
khugepaged_enter_vma38%of 16
khugepaged_min_free_kbytes_update---of 8
madvise_collapse---of 74
max_ptes_none_show---of 1
max_ptes_none_store---of 4
max_ptes_shared_show---of 1
max_ptes_shared_store---of 4
max_ptes_swap_show---of 1
max_ptes_swap_store---of 4
mm_counter_file---of 3
mm_dec_nr_ptes---of 3
mmu_notifier_invalidate_range_end---of 5
mmu_notifier_invalidate_range_start---of 3
pages_collapsed_show---of 1
pages_to_scan_show---of 1
pages_to_scan_store---of 4
perf_trace_mm_collapse_huge_page---of 6
perf_trace_mm_collapse_huge_page_isolate---of 7
perf_trace_mm_collapse_huge_page_swapin---of 6
perf_trace_mm_khugepaged_collapse_file---of 7
perf_trace_mm_khugepaged_scan_file---of 7
perf_trace_mm_khugepaged_scan_pmd---of 7
pfn_valid---of 31
pmd_lock---of 1
pmd_populate---of 6
pte_unmap---of 6
ptep_clear---of 7
rcu_lock_acquire---of 2
rcu_lock_release---of 2
release_pte_folio---of 6
scan_sleep_millisecs_show---of 1
scan_sleep_millisecs_store---of 3
set_huge_pmd---of 14
set_recommended_min_free_kbytes---of 14
start_stop_khugepaged---of 13
trace_event_raw_event_mm_collapse_huge_page---of 7
trace_event_raw_event_mm_collapse_huge_page_isolate---of 8
trace_event_raw_event_mm_collapse_huge_page_swapin---of 7
trace_event_raw_event_mm_khugepaged_collapse_file---of 8
trace_event_raw_event_mm_khugepaged_scan_file---of 8
trace_event_raw_event_mm_khugepaged_scan_pmd---of 8
trace_mm_collapse_huge_page_isolate---of 17
trace_raw_output_mm_collapse_huge_page---of 3
trace_raw_output_mm_collapse_huge_page_isolate---of 3
trace_raw_output_mm_collapse_huge_page_swapin---of 3
trace_raw_output_mm_khugepaged_collapse_file---of 3
trace_raw_output_mm_khugepaged_scan_file---of 3
trace_raw_output_mm_khugepaged_scan_pmd---of 3
xas_next---of 10
xas_next_entry---of 15
-----------
SUMMARY38%of 16

__arm64_sys_getrandom---of 9
__get_random_u32_below---of 6
_credit_init_bits---of 13
_get_random_bytes38%of 8
add_device_randomness---of 1
add_disk_randomness---of 4
add_hwgenerator_randomness---of 12
add_input_randomness---of 3
add_interrupt_randomness---of 9
add_timer_randomness---of 7
crng_fast_key_erasure100%of 1
crng_make_state40%of 15
crng_reseed---of 7
crng_set_ready---of 1
entropy_timer---of 7
execute_with_initialized_rng---of 4
extract_entropy---of 31
get_random_bytes---of 1
get_random_bytes_user---of 10
get_random_u16---of 13
get_random_u3254%of 13
get_random_u6447%of 13
get_random_u8---of 13
local_lock_acquire34%of 6
local_lock_release43%of 7
mix_interrupt_randomness---of 7
mix_pool_bytes---of 1
proc_do_rointvec---of 3
proc_do_uuid---of 5
rand_initialize_disk---of 3
random_fasync---of 1
random_ioctl---of 42
random_online_cpu---of 1
random_pm_notification---of 6
random_poll---of 6
random_prepare_cpu---of 1
random_read_iter---of 8
random_write_iter---of 1
rng_is_initialized---of 3
try_to_generate_entropy---of 20
urandom_read_iter---of 10
wait_for_random_bytes---of 13
write_pool_user---of 7
-----------
SUMMARY45%of 63

-----------
SUMMARY---of 0

__mctp_dev_get20%of 15
mctp_add_dev---of 11
mctp_addr_notify---of 5
mctp_dev_get_rtnl---of 6
mctp_dev_hold---of 6
mctp_dev_notify10%of 22
mctp_dev_put---of 11
mctp_dev_release_key---of 5
mctp_dev_set_key---of 6
mctp_dump_addrinfo---of 22
mctp_fill_addrinfo---of 9
mctp_fill_link_af34%of 9
mctp_get_link_af_size67%of 3
mctp_register_netdev---of 3
mctp_rtm_deladdr---of 17
mctp_rtm_newaddr---of 19
mctp_set_link_af---of 11
mctp_unregister_netdev---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY21%of 49

llc_station_exit---of 1
llc_station_rcv12%of 26
-----------
SUMMARY12%of 26

-----------
SUMMARY---of 0

get_timer_map18%of 34
kvm_arch_timer_get_input_level---of 23
kvm_arch_timer_handler---of 9
kvm_arm_timer_get_attr63%of 8
kvm_arm_timer_get_reg39%of 26
kvm_arm_timer_has_attr100%of 1
kvm_arm_timer_read---of 21
kvm_arm_timer_read_sysreg---of 5
kvm_arm_timer_set_attr55%of 11
kvm_arm_timer_set_reg43%of 14
kvm_arm_timer_write---of 10
kvm_arm_timer_write_sysreg---of 5
kvm_bg_timer_expire---of 3
kvm_cpu_has_pending_timer40%of 5
kvm_hrtimer_expire---of 22
kvm_phys_timer_read---of 1
kvm_timer_cpu_down---of 3
kvm_timer_cpu_up---of 3
kvm_timer_earliest_exp---of 20
kvm_timer_enable45%of 20
kvm_timer_init_vhe---of 5
kvm_timer_init_vm100%of 1
kvm_timer_should_fire58%of 14
kvm_timer_should_notify_user75%of 4
kvm_timer_sync_nested---of 4
kvm_timer_sync_user50%of 4
kvm_timer_update_irq38%of 16
kvm_timer_update_run80%of 5
kvm_timer_update_status12%of 17
kvm_timer_vcpu_init58%of 7
kvm_timer_vcpu_load16%of 110
kvm_timer_vcpu_put16%of 33
kvm_timer_vcpu_reset26%of 50
kvm_timer_vcpu_terminate100%of 1
kvm_vm_ioctl_set_counter_offset80%of 5
timer_emulate---of 26
timer_get_ctl17%of 30
timer_get_cval17%of 30
timer_irq_ack---of 3
timer_irq_domain_alloc---of 1
timer_irq_domain_free---of 1
timer_irq_eoi---of 3
timer_irq_set_irqchip_state---of 5
timer_irq_set_vcpu_affinity---of 1
timer_restore_state30%of 51
timer_save_state30%of 48
timer_set_ctl17%of 30
timer_set_cval17%of 30
unmask_vtimer_irq_user60%of 5
wfit_delay_ns---of 23
-----------
SUMMARY28%of 610

__get_user_pages11%of 249
__gup_longterm_locked---of 103
__mm_populate50%of 18
check_and_migrate_movable_pages_or_folios---of 76
check_vma_flags33%of 28
fault_in_readable60%of 15
fault_in_safe_writeable---of 11
fault_in_subpage_writeable---of 18
fault_in_writeable---of 15
faultin_page_range---of 46
fixup_user_fault---of 36
folio_add_pin---of 13
folio_add_pins---of 6
folio_put_refs---of 5
follow_page_pte15%of 96
follow_pfn_pte---of 31
get_dump_page---of 15
get_user_pages---of 10
get_user_pages_fast---of 5
get_user_pages_fast_only40%of 5
get_user_pages_remote---of 59
get_user_pages_unlocked22%of 42
gup_fast_fallback15%of 249
gup_fast_folio_allowed34%of 21
gup_fast_undo_dev_pagemap---of 9
gup_put_folio34%of 12
memfd_pin_folios---of 60
no_page_table---of 7
pin_user_pages---of 6
pin_user_pages_fast---of 6
pin_user_pages_remote---of 9
pin_user_pages_unlocked---of 6
populate_vma_page_range37%of 11
put_dev_pagemap---of 19
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
sanity_check_pinned_pages---of 44
try_get_folio34%of 24
try_grab_folio20%of 15
try_grab_folio_fast12%of 27
unpin_folio---of 1
unpin_folios---of 10
unpin_user_folio---of 1
unpin_user_page---of 3
unpin_user_page_range_dirty_lock---of 18
unpin_user_pages---of 13
unpin_user_pages_dirty_lock---of 31
-----------
SUMMARY18%of 816

-----------
SUMMARY---of 0

__get_vm_area_caller---of 1
__get_vm_area_node34%of 9
__probestub_alloc_vmap_area---of 1
__probestub_free_vmap_area_noflush---of 1
__probestub_purge_vmap_area_lazy---of 1
__purge_vmap_area_lazy---of 54
__set_ptes27%of 23
__traceiter_alloc_vmap_area---of 4
__traceiter_free_vmap_area_noflush---of 4
__traceiter_purge_vmap_area_lazy---of 4
__vmalloc_node_noprof---of 1
__vmalloc_node_range_noprof39%of 52
__vmalloc_noprof100%of 1
__vmap_pages_range_noflush29%of 60
__vunmap_range_noflush33%of 52
_vm_unmap_aliases---of 43
aligned_vread_iter---of 8
alloc_vmap_area48%of 135
decay_va_pool_node---of 40
delayed_vfree_work---of 4
drain_vmap_area_work---of 1
find_unlink_vmap_area42%of 12
find_vm_area---of 11
find_vmap_area---of 11
find_vmap_area_exceed_addr_lock---of 23
free_unmap_vmap_area---of 1
free_vm_area---of 3
free_vmap_area---of 67
free_vmap_area_noflush29%of 28
free_vmap_area_rb_augment_cb_propagate---of 9
free_vmap_area_rb_augment_cb_rotate100%of 5
free_vmap_block---of 14
get_vm_area---of 1
get_vm_area_caller---of 1
get_vm_area_page_order---of 1
insert_vmap_area62%of 13
insert_vmap_area_augment---of 23
ioremap_page_range---of 12
is_vmalloc_addr100%of 3
is_vmalloc_or_module_addr50%of 4
mod_memcg_page_state25%of 28
pcpu_free_vm_areas---of 5
pcpu_get_vm_areas---of 249
perf_trace_alloc_vmap_area---of 6
perf_trace_free_vmap_area_noflush---of 6
perf_trace_purge_vmap_area_lazy---of 6
pfn_valid23%of 31
purge_fragmented_block---of 6
purge_vmap_node---of 26
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
reclaim_and_purge_vmap_areas---of 24
reclaim_list_global---of 68
register_vmap_purge_notifier---of 1
remap_vmalloc_range---of 1
remap_vmalloc_range_partial---of 10
remove_vm_area40%of 5
trace_event_raw_event_alloc_vmap_area---of 7
trace_event_raw_event_free_vmap_area_noflush---of 7
trace_event_raw_event_purge_vmap_area_lazy---of 7
trace_raw_output_alloc_vmap_area---of 3
trace_raw_output_free_vmap_area_noflush---of 3
trace_raw_output_purge_vmap_area_lazy---of 3
unregister_vmap_purge_notifier---of 1
vfree44%of 16
vfree_atomic---of 5
vm_area_map_pages---of 8
vm_area_unmap_pages---of 8
vm_flags_set---of 6
vm_map_ram---of 43
vm_reset_perms---of 18
vm_unmap_aliases---of 1
vm_unmap_ram---of 17
vmalloc_32_noprof---of 1
vmalloc_32_user_noprof---of 1
vmalloc_dump_obj---of 10
vmalloc_huge_noprof---of 1
vmalloc_info_show---of 47
vmalloc_node_noprof---of 1
vmalloc_noprof---of 1
vmalloc_nr_pages---of 1
vmalloc_to_page27%of 38
vmalloc_to_pfn---of 1
vmalloc_user_noprof---of 1
vmap---of 15
vmap_node_shrink_count---of 6
vmap_node_shrink_scan---of 4
vmap_page_range---of 4
vmap_pages_range---of 1
vmap_pages_range_noflush---of 1
vmap_pfn---of 8
vmap_pfn_apply---of 6
vmap_range_noflush---of 48
vread_iter---of 55
vrealloc_noprof---of 11
vunmap---of 5
vunmap_range---of 16
vunmap_range_noflush---of 1
vzalloc_node_noprof---of 1
vzalloc_noprof100%of 1
-----------
SUMMARY38%of 520

-----------
SUMMARY---of 0

__futex_wait---of 10
__futex_wake_mark---of 4
futex_unqueue_multiple---of 4
futex_wait8%of 14
futex_wait_multiple---of 24
futex_wait_multiple_setup---of 33
futex_wait_queue30%of 10
futex_wait_restart---of 3
futex_wait_setup23%of 18
futex_wake12%of 17
futex_wake_mark---of 9
futex_wake_op---of 62
put_task_struct---of 6
-----------
SUMMARY17%of 59

__sync_icache_dcache50%of 12
arch_invalidate_pmem---of 1
arch_wb_cache_pmem---of 1
copy_to_user_page---of 5
flush_dcache_folio75%of 4
flush_dcache_page67%of 6
folio_address23%of 35
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
sync_icache_aliases---of 4
-----------
SUMMARY41%of 61

-----------
SUMMARY---of 0

__sk_queue_drop_skb---of 11
__skb_datagram_iter---of 34
__skb_recv_datagram---of 8
__skb_try_recv_datagram---of 13
__skb_try_recv_from_queue---of 25
__skb_wait_for_more_packets---of 13
__zerocopy_sg_from_iter---of 15
_inline_copy_to_user---of 7
csum_and_copy_to_iter---of 70
datagram_poll---of 14
folio_size---of 4
hash_and_copy_to_iter---of 4
kmap_local_folio---of 5
kmap_local_page---of 5
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
receiver_wake_function---of 3
simple_copy_to_iter---of 4
skb_copy_and_csum_datagram_msg---of 10
skb_copy_and_hash_datagram_iter---of 1
skb_copy_datagram_from_iter36%of 28
skb_copy_datagram_iter---of 14
skb_free_datagram---of 1
skb_kill_datagram---of 1
skb_recv_datagram---of 8
xas_next_entry---of 15
zerocopy_fill_skb_from_iter---of 29
zerocopy_sg_from_iter---of 7
-----------
SUMMARY36%of 28

-----------
SUMMARY---of 0

__futex_queue100%of 1
__futex_unqueue---of 8
compat_exit_robust_list---of 33
exit_pi_state_list---of 18
exit_robust_list---of 33
fault_in_user_writeable---of 7
folio_lock---of 7
folio_put---of 6
futex_exec_release---of 7
futex_exit_recursive---of 3
futex_exit_release---of 7
futex_hash100%of 1
futex_q_lock67%of 3
futex_q_unlock---of 3
futex_setup_timer67%of 3
futex_top_waiter---of 9
futex_unqueue---of 12
futex_unqueue_pi---of 11
get_futex_key5%of 69
handle_futex_death---of 21
put_task_struct---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
should_fail_futex---of 3
wait_for_owner_exiting---of 5
-----------
SUMMARY12%of 77

-----------
SUMMARY---of 0

aarch32_break_handler---of 26
brk_handler---of 9
clear_os_lock---of 1
create_debug_debugfs_entry---of 1
debug_monitors_arch---of 1
disable_debug_monitors---of 14
enable_debug_monitors---of 17
kernel_active_single_step---of 3
kernel_disable_single_step---of 5
kernel_enable_single_step---of 5
kernel_fastforward_single_step---of 1
kernel_rewind_single_step---of 1
register_kernel_break_hook---of 3
register_kernel_step_hook---of 3
register_user_break_hook---of 3
register_user_step_hook---of 3
single_step_handler---of 12
unregister_kernel_break_hook---of 3
unregister_kernel_step_hook---of 3
unregister_user_break_hook---of 3
unregister_user_step_hook---of 3
user_disable_single_step---of 3
user_enable_single_step---of 5
user_fastforward_single_step---of 3
user_regs_reset_single_step100%of 1
user_rewind_single_step---of 3
-----------
SUMMARY100%of 1

__arm64_sys_memfd_secret---of 14
folio_put---of 6
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
secretmem_active---of 1
secretmem_fault---of 26
secretmem_free_folio---of 16
secretmem_init_fs_context---of 1
secretmem_migrate_folio---of 1
secretmem_mmap---of 8
secretmem_release---of 3
secretmem_setattr---of 4
vma_is_secretmem100%of 1
-----------
SUMMARY100%of 1

__probestub_kvm_arm_set_dreg32---of 1
__probestub_kvm_handle_sys_reg---of 1
__probestub_kvm_hvc_arm64---of 1
__probestub_kvm_set_guest_debug---of 1
__probestub_kvm_sys_access---of 1
__probestub_kvm_wfx_arm64---of 1
__traceiter_kvm_arm_set_dreg32---of 4
__traceiter_kvm_handle_sys_reg---of 4
__traceiter_kvm_hvc_arm64---of 4
__traceiter_kvm_set_guest_debug---of 4
__traceiter_kvm_sys_access---of 4
__traceiter_kvm_wfx_arm64---of 4
handle_exit29%of 14
handle_exit_early12%of 17
handle_hvc23%of 22
handle_other---of 54
handle_smc50%of 8
handle_svc---of 1
handle_sve---of 32
kvm_handle_eret---of 24
kvm_handle_fpasimd---of 32
kvm_handle_gcs---of 3
kvm_handle_guest_debug58%of 7
kvm_handle_ptrauth---of 26
kvm_handle_unknown_ec---of 3
kvm_handle_wfx29%of 32
nvhe_hyp_panic_handler---of 9
perf_trace_kvm_arm_set_dreg32---of 6
perf_trace_kvm_handle_sys_reg---of 6
perf_trace_kvm_hvc_arm64---of 6
perf_trace_kvm_set_guest_debug---of 6
perf_trace_kvm_sys_access---of 6
perf_trace_kvm_wfx_arm64---of 6
trace_event_raw_event_kvm_arm_set_dreg32---of 7
trace_event_raw_event_kvm_handle_sys_reg---of 7
trace_event_raw_event_kvm_hvc_arm64---of 7
trace_event_raw_event_kvm_set_guest_debug---of 7
trace_event_raw_event_kvm_sys_access---of 7
trace_event_raw_event_kvm_wfx_arm64---of 7
trace_kvm_wfx_arm6424%of 17
trace_raw_output_kvm_arm_set_dreg32---of 3
trace_raw_output_kvm_handle_sys_reg---of 3
trace_raw_output_kvm_hvc_arm64---of 3
trace_raw_output_kvm_set_guest_debug---of 3
trace_raw_output_kvm_sys_access---of 3
trace_raw_output_kvm_wfx_arm64---of 3
-----------
SUMMARY28%of 117

-----------
SUMMARY---of 0

__ext4_expand_extra_isize---of 8
__ext4_get_inode_loc---of 41
__ext4_iget---of 124
__ext4_journalled_invalidate_folio---of 21
__ext4_mark_inode_dirty---of 31
_ext4_get_block---of 14
check_igot_inode---of 10
do_journal_get_write_access---of 4
ext4_alloc_da_blocks---of 16
ext4_begin_ordered_truncate---of 16
ext4_block_truncate_page---of 4
ext4_block_write_begin---of 57
ext4_block_zero_page_range---of 38
ext4_bmap---of 8
ext4_bread---of 10
ext4_bread_batch---of 29
ext4_break_layouts---of 3
ext4_buffer_uptodate---of 5
ext4_can_truncate---of 10
ext4_change_inode_journal_flag---of 25
ext4_chunk_trans_blocks---of 6
ext4_da_get_block_prep---of 65
ext4_da_release_space---of 17
ext4_da_reserve_space---of 16
ext4_da_update_reserve_space---of 21
ext4_da_write_begin---of 37
ext4_da_write_end---of 47
ext4_dax_writepages---of 4
ext4_dio_alignment---of 9
ext4_dirty_folio---of 7
ext4_dirty_inode---of 3
ext4_do_writepages---of 151
ext4_es_is_delayed---of 1
ext4_es_is_mapped---of 1
ext4_evict_inode---of 65
ext4_expand_extra_isize---of 16
ext4_file_getattr---of 4
ext4_fill_raw_inode---of 47
ext4_get_block---of 1
ext4_get_block_unwritten---of 6
ext4_get_fc_inode_loc---of 1
ext4_get_inode_loc---of 3
ext4_get_projid---of 3
ext4_get_reserved_space---of 1
ext4_getattr---of 31
ext4_getblk---of 24
ext4_iget_extra_inode---of 11
ext4_inode_attach_jinode---of 7
ext4_inode_blocks---of 4
ext4_inode_csum---of 4
ext4_inode_csum_set---of 6
ext4_inode_csum_verify---of 6
ext4_inode_is_fast_symlink---of 10
ext4_invalidate_folio---of 17
ext4_iomap_begin---of 28
ext4_iomap_begin_report---of 12
ext4_iomap_end---of 1
ext4_iomap_overwrite_begin---of 4
ext4_iomap_swap_activate---of 1
ext4_issue_zeroout---of 4
ext4_journal_folio_buffers---of 15
ext4_journalled_dirty_folio---of 11
ext4_journalled_invalidate_folio---of 3
ext4_journalled_write_end---of 64
ext4_journalled_zero_new_buffers---of 9
ext4_map_blocks8%of 64
ext4_map_query_blocks---of 6
ext4_mark_iloc_dirty---of 81
ext4_normal_submit_inode_data_buffers---of 1
ext4_page_mkwrite---of 55
ext4_punch_hole---of 39
ext4_read_folio---of 18
ext4_readahead50%of 4
ext4_release_folio---of 18
ext4_reserve_inode_write---of 8
ext4_set_aops---of 8
ext4_set_inode_flags---of 15
ext4_set_inode_state---of 3
ext4_set_iomap---of 25
ext4_setattr---of 80
ext4_should_dioread_nolock---of 6
ext4_truncate---of 75
ext4_truncate_folio---of 10
ext4_truncate_page_cache_block_range---of 7
ext4_update_disksize_before_punch---of 11
ext4_update_inode_fsync_trans---of 7
ext4_wait_dax_page---of 1
ext4_wait_for_tail_page_commit---of 17
ext4_walk_page_buffers---of 8
ext4_write_begin---of 80
ext4_write_end---of 52
ext4_write_inode---of 18
ext4_writepage_trans_blocks---of 8
ext4_writepages---of 6
ext4_zero_partial_blocks---of 8
folio_large_mapcount---of 4
folio_put---of 6
folio_test_uptodate---of 4
i_gid_needs_update---of 3
lock_buffer---of 6
mpage_prepare_extent_to_map---of 73
mpage_process_page_bufs---of 28
mpage_release_unused_pages---of 30
mpage_submit_folio---of 10
percpu_down_read---of 12
percpu_up_read---of 14
put_bh---of 3
rcu_lock_acquire---of 2
rcu_lock_release---of 2
trace_ext4_load_inode---of 14
trace_ext4_writepages---of 14
trace_ext4_writepages_result---of 14
wait_on_buffer---of 3
write_end_fn---of 10
zero_user_segments---of 48
-----------
SUMMARY11%of 68

rcu_lock_acquire---of 2
rcu_lock_release---of 2
tomoyo_commit_condition---of 20
tomoyo_condition2%of 187
tomoyo_get_attributes---of 13
tomoyo_get_condition---of 83
tomoyo_get_dqword---of 7
tomoyo_put_name---of 4
-----------
SUMMARY2%of 187

__arm64_sys_get_mempolicy---of 97
__arm64_sys_mbind---of 71
__arm64_sys_migrate_pages---of 57
__arm64_sys_set_mempolicy---of 17
__arm64_sys_set_mempolicy_home_node---of 35
__get_vma_policy---of 4
__mpol_dup---of 9
__mpol_equal---of 15
__mpol_put---of 5
alloc_frozen_pages_noprof34%of 6
alloc_migration_target_by_mpol---of 9
alloc_pages_bulk_mempolicy_noprof5%of 74
alloc_pages_mpol16%of 19
alloc_pages_noprof30%of 10
apply_policy_zone---of 3
change_prot_numa---of 26
do_migrate_pages---of 36
do_set_mempolicy---of 29
folio_alloc_mpol_noprof23%of 9
folio_alloc_noprof29%of 7
folio_large_mapcount---of 4
folio_order---of 4
get_bitmap---of 15
get_il_weight---of 18
get_task_policy40%of 5
get_vma_policy---of 12
huge_node34%of 12
hugetlb_pmd_shared---of 3
init_nodemask_of_mempolicy---of 11
interleave_nid---of 8
lru_cache_enable---of 3
mbind_range---of 41
mempolicy_in_oom_domain---of 5
mempolicy_kobj_release---of 6
mempolicy_slab_node11%of 19
migrate_folio_add---of 21
mmap_read_unlock---of 3
mmap_write_unlock---of 6
mpol_cond_put---of 7
mpol_free_shared_policy20%of 10
mpol_get---of 4
mpol_misplaced---of 41
mpol_new_nodemask---of 3
mpol_new_preferred---of 4
mpol_parse_str---of 51
mpol_put_task_policy---of 6
mpol_rebind_default---of 1
mpol_rebind_mm---of 23
mpol_rebind_nodemask---of 6
mpol_rebind_preferred---of 1
mpol_rebind_task---of 6
mpol_set_nodemask---of 6
mpol_set_shared_policy---of 53
mpol_shared_policy_init6%of 35
mpol_shared_policy_lookup16%of 13
mpol_to_str---of 20
nearest_node_nodemask---of 6
node_show---of 1
node_store---of 14
numa_default_policy---of 1
numa_nearest_node---of 9
policy_nodemask9%of 24
put_task_struct---of 6
queue_folios_hugetlb---of 39
queue_folios_pte_range---of 69
queue_pages_range---of 3
queue_pages_test_walk---of 26
rcu_lock_acquire---of 2
rcu_lock_release---of 2
read_mems_allowed_begin---of 9
sysfs_wi_release---of 6
vma_alloc_folio_noprof39%of 18
vma_dup_policy50%of 4
vma_iter_init---of 1
vma_migratable---of 12
vma_policy_mof---of 14
weighted_interleave_nid---of 31
weighted_interleave_nodes---of 12
-----------
SUMMARY16%of 265

dump_mem_limit---of 3
free_initmem---of 7
pfn_is_map_memory67%of 3
-----------
SUMMARY67%of 3

of_device_get_match_data---of 6
of_device_make_bus_id---of 14
of_device_modalias---of 7
of_device_uevent16%of 13
of_device_uevent_modalias---of 8
of_dma_configure_id---of 34
of_match_device---of 5
-----------
SUMMARY16%of 13

-----------
SUMMARY---of 0

nf_tables_netdev_event6%of 38
nft_chain_filter_fini---of 1
nft_do_chain_arp---of 1
nft_do_chain_inet---of 6
nft_do_chain_inet_ingress---of 40
nft_do_chain_ipv4---of 1
nft_do_chain_ipv6---of 4
nft_do_chain_netdev---of 24
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY6%of 38

__alloc_workqueue---of 78
__cancel_work---of 13
__flush_work12%of 43
__flush_workqueue17%of 65
__init_work67%of 3
__probestub_workqueue_activate_work---of 1
__probestub_workqueue_execute_end---of 1
__probestub_workqueue_execute_start---of 1
__probestub_workqueue_queue_work---of 1
__pwq_activate_work---of 11
__queue_delayed_work43%of 19
__queue_work19%of 86
__traceiter_workqueue_activate_work---of 4
__traceiter_workqueue_execute_end---of 4
__traceiter_workqueue_execute_start---of 4
__traceiter_workqueue_queue_work---of 4
__warn_flushing_systemwide_wq---of 1
alloc_unbound_pwq---of 37
alloc_workqueue---of 3
alloc_workqueue_attrs---of 5
alloc_workqueue_lockdep_map---of 3
apply_workqueue_attrs---of 4
apply_wqattrs_cleanup---of 19
apply_wqattrs_commit---of 6
apply_wqattrs_prepare---of 32
assign_work---of 17
bh_pool_kick_highpri---of 1
bh_pool_kick_normal---of 1
bh_worker---of 31
cancel_delayed_work---of 1
cancel_delayed_work_sync---of 6
cancel_work---of 1
cancel_work_sync---of 6
check_flush_dependency34%of 15
cpumask_isolated_show---of 1
cpumask_requested_show---of 1
cpumask_show---of 1
cpumask_store---of 7
create_worker---of 14
current_is_workqueue_rescuer---of 5
current_work---of 5
delayed_work_timer_fn---of 1
destroy_delayed_work_on_stack---of 1
destroy_work_on_stack---of 1
destroy_workqueue---of 56
disable_delayed_work---of 1
disable_delayed_work_sync---of 6
disable_work---of 1
disable_work_sync---of 6
drain_dead_softirq_workfn---of 5
drain_workqueue---of 11
enable_delayed_work---of 1
enable_work---of 12
execute_in_process_context---of 3
flush_delayed_work60%of 5
flush_rcu_work---of 3
flush_work100%of 1
flush_workqueue_prep_pwqs44%of 25
format_worker_id---of 5
free_workqueue_attrs---of 3
freeze_workqueues_begin---of 6
freeze_workqueues_busy---of 25
get_pwq---of 6
get_work_pool40%of 10
idle_cull_fn---of 19
idle_worker_timeout---of 6
init_pwq---of 3
init_rescuer---of 11
init_worker_pool---of 4
insert_work50%of 10
install_unbound_pwq---of 15
jhash---of 17
kick_pool24%of 17
local_bh_disable---of 2
local_bh_enable---of 2
max_active_show---of 1
max_active_store---of 4
mod_delayed_work_on58%of 7
move_linked_works---of 8
parse_affn_scope---of 7
per_cpu_show---of 1
perf_trace_workqueue_activate_work---of 6
perf_trace_workqueue_execute_end---of 6
perf_trace_workqueue_execute_start---of 6
perf_trace_workqueue_queue_work---of 6
pool_mayday_timeout---of 19
pr_cont_pool_info---of 5
pr_cont_work---of 16
pr_cont_worker_id---of 3
print_worker_info---of 8
process_one_work---of 74
put_unbound_pool---of 40
pwq_dec_nr_in_flight---of 51
pwq_release_workfn---of 26
pwq_tryinc_nr_active16%of 26
queue_delayed_work_on50%of 12
queue_rcu_work---of 6
queue_work_node---of 17
queue_work_on67%of 12
rcu_free_pool---of 3
rcu_free_wq---of 12
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_work_rcufn---of 3
rescuer_thread---of 53
schedule_on_each_cpu---of 8
set_work_pool_and_clear_pending---of 3
set_worker_desc---of 5
set_worker_dying---of 19
show_all_workqueues---of 32
show_cpu_pools_hogs---of 28
show_freezable_workqueues---of 16
show_one_workqueue---of 11
show_pwq---of 36
thaw_workqueues---of 5
touch_work_lockdep_map---of 5
touch_wq_lockdep_map50%of 6
trace_event_raw_event_workqueue_activate_work---of 7
trace_event_raw_event_workqueue_execute_end---of 7
trace_event_raw_event_workqueue_execute_start---of 7
trace_event_raw_event_workqueue_queue_work---of 7
trace_raw_output_workqueue_activate_work---of 3
trace_raw_output_workqueue_execute_end---of 3
trace_raw_output_workqueue_execute_start---of 3
trace_raw_output_workqueue_queue_work---of 3
trace_workqueue_activate_work24%of 17
unbind_worker---of 8
unbound_pwq---of 10
unbound_wq_update_pwq---of 34
work_busy---of 18
work_debug_hint---of 1
work_fixup_free---of 7
work_fixup_init---of 7
work_for_cpu_fn---of 1
work_grab_pending15%of 42
work_is_static_object---of 1
work_on_cpu_key---of 1
work_on_cpu_safe_key---of 5
worker_attach_to_pool---of 10
worker_enter_idle---of 15
worker_set_flags---of 7
worker_thread---of 50
workqueue_apply_unbound_cpumask---of 18
workqueue_congested---of 15
workqueue_offline_cpu---of 38
workqueue_online_cpu---of 69
workqueue_prepare_cpu---of 7
workqueue_set_max_active---of 7
workqueue_set_min_active---of 3
workqueue_softirq_action---of 4
workqueue_softirq_dead---of 7
workqueue_sysfs_register---of 9
workqueue_unbound_exclude_cpumask---of 7
wq_adjust_max_active---of 20
wq_affinity_strict_show---of 1
wq_affinity_strict_store---of 9
wq_affn_dfl_get---of 1
wq_affn_dfl_set---of 9
wq_affn_scope_show---of 3
wq_affn_scope_store---of 9
wq_barrier_func---of 1
wq_cpumask_show---of 1
wq_cpumask_store---of 9
wq_device_release---of 1
wq_nice_show---of 1
wq_nice_store---of 10
wq_update_node_max_active---of 26
wq_watchdog_param_set_thresh---of 7
wq_watchdog_timer_fn---of 39
wq_watchdog_touch---of 6
wq_worker_comm---of 6
wq_worker_last_func---of 1
wq_worker_running---of 6
wq_worker_sleeping---of 6
wq_worker_tick---of 13
-----------
SUMMARY27%of 425

vgic_debug_destroy100%of 1
vgic_debug_init100%of 1
vgic_debug_next---of 16
vgic_debug_open---of 4
vgic_debug_show---of 26
vgic_debug_start---of 33
vgic_debug_stop---of 5
vgic_its_debug_destroy100%of 1
vgic_its_debug_init67%of 3
vgic_its_debug_next---of 8
vgic_its_debug_open---of 4
vgic_its_debug_show---of 7
vgic_its_debug_start---of 13
vgic_its_debug_stop---of 3
-----------
SUMMARY84%of 6

-----------
SUMMARY---of 0

__rtnl_register_many---of 37
__rtnl_unlock50%of 6
__rtnl_unregister_many---of 19
do_set_master---of 27
do_set_proto_down---of 16
do_setlink---of 221
fdb_vid_parse---of 7
if_nlmsg_size29%of 49
if_nlmsg_stats_size---of 35
local_bh_disable---of 2
lockdep_rtnl_is_held100%of 1
ndo_dflt_bridge_getlink---of 64
ndo_dflt_fdb_add---of 9
ndo_dflt_fdb_del---of 5
ndo_dflt_fdb_dump---of 14
netif_set_operstate---of 6
nla_put_ifalias40%of 5
nla_put_string100%of 1
nla_put_uint67%of 3
nlmsg_parse_deprecated_strict---of 4
nlmsg_populate_fdb_fill---of 10
put_master_ifindex31%of 13
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock34%of 6
refcount_dec_and_rtnl_lock---of 1
rtmsg_ifinfo40%of 5
rtmsg_ifinfo_build_skb38%of 8
rtmsg_ifinfo_newnet---of 4
rtmsg_ifinfo_send---of 3
rtnetlink_bind---of 4
rtnetlink_event34%of 6
rtnetlink_net_exit---of 1
rtnetlink_net_init---of 3
rtnetlink_put_metrics---of 18
rtnetlink_rcv---of 1
rtnetlink_rcv_msg---of 55
rtnetlink_send---of 1
rtnl_af_lookup---of 17
rtnl_af_register---of 4
rtnl_af_unregister---of 3
rtnl_bridge_dellink---of 23
rtnl_bridge_getlink---of 48
rtnl_bridge_notify---of 8
rtnl_bridge_setlink---of 27
rtnl_calcit---of 23
rtnl_configure_link---of 11
rtnl_create_link---of 45
rtnl_delete_link---of 4
rtnl_dellink---of 31
rtnl_dellinkprop---of 1
rtnl_dump_all---of 23
rtnl_dump_ifinfo---of 58
rtnl_dumpit---of 8
rtnl_fdb_add---of 30
rtnl_fdb_del---of 39
rtnl_fdb_dump---of 56
rtnl_fdb_get---of 51
rtnl_fdb_notify---of 4
rtnl_fill_devlink_port34%of 6
rtnl_fill_dpll_pin67%of 3
rtnl_fill_ifinfo16%of 100
rtnl_fill_link_af46%of 11
rtnl_fill_link_ifmap100%of 1
rtnl_fill_link_netnsid38%of 8
rtnl_fill_prop_list34%of 9
rtnl_fill_proto_down29%of 7
rtnl_fill_stats50%of 4
rtnl_fill_statsinfo---of 110
rtnl_fill_vf16%of 13
rtnl_fill_vfinfo---of 38
rtnl_get_link---of 20
rtnl_get_net_ns_capable---of 8
rtnl_getlink---of 49
rtnl_group_dellink---of 15
rtnl_have_link_slave_info---of 13
rtnl_is_locked100%of 1
rtnl_kfree_skbs---of 3
rtnl_link_fill19%of 22
rtnl_link_get_net---of 11
rtnl_link_get_net_capable---of 17
rtnl_link_ops_get---of 14
rtnl_link_register---of 13
rtnl_link_unregister---of 19
rtnl_linkprop---of 31
rtnl_lock100%of 1
rtnl_lock_interruptible---of 1
rtnl_lock_killable---of 1
rtnl_mdb_add---of 13
rtnl_mdb_del---of 20
rtnl_mdb_dump---of 16
rtnl_mdb_get---of 13
rtnl_newlink---of 107
rtnl_newlink_create---of 37
rtnl_newlinkprop---of 1
rtnl_nla_parse_ifinfomsg---of 4
rtnl_notify---of 3
rtnl_offload_xstats_notify---of 7
rtnl_phys_port_id_fill50%of 4
rtnl_phys_port_name_fill34%of 6
rtnl_phys_switch_id_fill50%of 4
rtnl_port_fill8%of 25
rtnl_prop_list_size25%of 12
rtnl_put_cacheinfo---of 5
rtnl_set_sk_err---of 1
rtnl_setlink---of 37
rtnl_stats_dump---of 24
rtnl_stats_get---of 20
rtnl_stats_get_parse---of 25
rtnl_stats_set---of 26
rtnl_trylock---of 1
rtnl_unicast---of 1
rtnl_unlock100%of 1
rtnl_unregister_all---of 17
rtnl_validate_mdb_entry---of 24
rtnl_validate_mdb_entry_del_bulk---of 12
rtnl_validate_mdb_entry_get---of 16
rtnl_xdp_fill27%of 34
set_operstate---of 15
srcu_lock_acquire---of 2
srcu_lock_release---of 2
validate_linkmsg---of 41
-----------
SUMMARY28%of 379

INET_ECN_decapsulate---of 45
dev_dstats_rx_dropped---of 3
dev_dstats_tx_dropped---of 3
geneve6_lookup---of 7
geneve_build_skb---of 13
geneve_change_mtu---of 3
geneve_changelink---of 27
geneve_configure---of 33
geneve_dellink---of 3
geneve_dev_create_fb---of 8
geneve_exit_batch_rtnl---of 9
geneve_exit_net---of 3
geneve_fill_info---of 24
geneve_fill_metadata_dst---of 52
geneve_get_drvinfo---of 1
geneve_get_dsfield---of 19
geneve_get_size---of 1
geneve_gro_complete---of 6
geneve_gro_receive---of 34
geneve_init---of 6
geneve_init_net---of 1
geneve_link_config---of 19
geneve_lookup---of 7
geneve_netdevice_event50%of 4
geneve_newlink---of 6
geneve_nl2info---of 63
geneve_offload_rx_ports---of 16
geneve_open---of 8
geneve_setup---of 1
geneve_sock_add---of 25
geneve_sock_release---of 19
geneve_stop---of 7
geneve_udp_encap_err_lookup---of 26
geneve_udp_encap_recv---of 113
geneve_uninit---of 1
geneve_validate---of 23
geneve_xmit---of 173
ip4_dst_hoplimit---of 17
ip_tunnel_get_ttl---of 17
jhash---of 1
net_generic---of 16
netdev_lock_cmp_fn---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
skb_tunnel_info_unclone---of 17
skb_vlan_inet_prepare---of 25
-----------
SUMMARY50%of 4

__kern_my_cpu_offset---of 1
__probestub_kvm_access_fault---of 1
__probestub_kvm_entry---of 1
__probestub_kvm_exit---of 1
__probestub_kvm_forward_sysreg_trap---of 1
__probestub_kvm_get_timer_map---of 1
__probestub_kvm_guest_fault---of 1
__probestub_kvm_inject_nested_exception---of 1
__probestub_kvm_irq_line---of 1
__probestub_kvm_mmio_emulate---of 1
__probestub_kvm_mmio_nisv---of 1
__probestub_kvm_nested_eret---of 1
__probestub_kvm_set_way_flush---of 1
__probestub_kvm_timer_emulate---of 1
__probestub_kvm_timer_hrtimer_expire---of 1
__probestub_kvm_timer_restore_state---of 1
__probestub_kvm_timer_save_state---of 1
__probestub_kvm_timer_update_irq---of 1
__probestub_kvm_toggle_cache---of 1
__traceiter_kvm_access_fault---of 4
__traceiter_kvm_entry---of 4
__traceiter_kvm_exit---of 4
__traceiter_kvm_forward_sysreg_trap---of 4
__traceiter_kvm_get_timer_map---of 4
__traceiter_kvm_guest_fault---of 4
__traceiter_kvm_inject_nested_exception---of 4
__traceiter_kvm_irq_line---of 4
__traceiter_kvm_mmio_emulate---of 4
__traceiter_kvm_mmio_nisv---of 4
__traceiter_kvm_nested_eret---of 4
__traceiter_kvm_set_way_flush---of 4
__traceiter_kvm_timer_emulate---of 4
__traceiter_kvm_timer_hrtimer_expire---of 4
__traceiter_kvm_timer_restore_state---of 4
__traceiter_kvm_timer_save_state---of 4
__traceiter_kvm_timer_update_irq---of 4
__traceiter_kvm_toggle_cache---of 4
_inline_copy_from_user63%of 8
_inline_copy_to_user58%of 7
cpu_hyp_init---of 4
cpu_hyp_init_context---of 15
cpu_hyp_init_features---of 10
cpu_hyp_uninit---of 4
finalize_init_hyp_mode---of 7
hyp_init_cpu_pm_notifier---of 10
init_pkvm_host_sve_state---of 11
is_kvm_arm_initialised---of 1
kvm_arch_alloc_vm50%of 4
kvm_arch_create_vm_debugfs100%of 1
kvm_arch_destroy_vm50%of 4
kvm_arch_dev_ioctl100%of 1
kvm_arch_disable_virtualization_cpu---of 6
kvm_arch_enable_virtualization_cpu---of 6
kvm_arch_init_vm43%of 7
kvm_arch_intc_initialized100%of 1
kvm_arch_irq_bypass_add_producer---of 3
kvm_arch_irq_bypass_del_producer---of 3
kvm_arch_irq_bypass_start---of 1
kvm_arch_irq_bypass_stop---of 1
kvm_arch_irqchip_in_kernel100%of 1
kvm_arch_sync_dirty_log100%of 1
kvm_arch_vcpu_blocking100%of 1
kvm_arch_vcpu_create50%of 6
kvm_arch_vcpu_destroy50%of 4
kvm_arch_vcpu_fault100%of 1
kvm_arch_vcpu_get_ip---of 1
kvm_arch_vcpu_in_kernel---of 1
kvm_arch_vcpu_ioctl60%of 128
kvm_arch_vcpu_ioctl_get_mpstate100%of 1
kvm_arch_vcpu_ioctl_run40%of 161
kvm_arch_vcpu_ioctl_set_mpstate58%of 7
kvm_arch_vcpu_load14%of 109
kvm_arch_vcpu_postcreate100%of 1
kvm_arch_vcpu_precreate100%of 4
kvm_arch_vcpu_put28%of 18
kvm_arch_vcpu_run_pid_change50%of 26
kvm_arch_vcpu_runnable67%of 6
kvm_arch_vcpu_should_kick67%of 3
kvm_arch_vcpu_unblocking100%of 1
kvm_arch_vm_ioctl77%of 26
kvm_arm_halt_guest60%of 5
kvm_arm_resume_guest60%of 5
kvm_arm_vcpu_power_off---of 3
kvm_arm_vcpu_stopped100%of 1
kvm_destroy_mpidr_data50%of 8
kvm_get_mode---of 1
kvm_get_vcpu_by_id82%of 11
kvm_init_mpidr_data49%of 27
kvm_init_vector_slots---of 13
kvm_mpidr_to_vcpu39%of 39
kvm_vcpu_sleep43%of 14
kvm_vcpu_wfi64%of 11
kvm_vm_ioctl_check_extension61%of 86
kvm_vm_ioctl_enable_cap33%of 34
kvm_vm_ioctl_irq_line50%of 36
lock_all_vcpus70%of 13
perf_trace_kvm_access_fault---of 6
perf_trace_kvm_entry---of 6
perf_trace_kvm_exit---of 6
perf_trace_kvm_forward_sysreg_trap---of 6
perf_trace_kvm_get_timer_map---of 11
perf_trace_kvm_guest_fault---of 6
perf_trace_kvm_inject_nested_exception---of 13
perf_trace_kvm_irq_line---of 6
perf_trace_kvm_mmio_emulate---of 6
perf_trace_kvm_mmio_nisv---of 6
perf_trace_kvm_nested_eret---of 13
perf_trace_kvm_set_way_flush---of 6
perf_trace_kvm_timer_emulate---of 6
perf_trace_kvm_timer_hrtimer_expire---of 6
perf_trace_kvm_timer_restore_state---of 6
perf_trace_kvm_timer_save_state---of 6
perf_trace_kvm_timer_update_irq---of 6
perf_trace_kvm_toggle_cache---of 6
pfn_valid---of 31
pkvm_hyp_init_ptrauth---of 4
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_virt_note_context_switch100%of 1
trace_event_raw_event_kvm_access_fault---of 7
trace_event_raw_event_kvm_entry---of 7
trace_event_raw_event_kvm_exit---of 7
trace_event_raw_event_kvm_forward_sysreg_trap---of 7
trace_event_raw_event_kvm_get_timer_map---of 12
trace_event_raw_event_kvm_guest_fault---of 7
trace_event_raw_event_kvm_inject_nested_exception---of 14
trace_event_raw_event_kvm_irq_line---of 7
trace_event_raw_event_kvm_mmio_emulate---of 7
trace_event_raw_event_kvm_mmio_nisv---of 7
trace_event_raw_event_kvm_nested_eret---of 14
trace_event_raw_event_kvm_set_way_flush---of 7
trace_event_raw_event_kvm_timer_emulate---of 7
trace_event_raw_event_kvm_timer_hrtimer_expire---of 7
trace_event_raw_event_kvm_timer_restore_state---of 7
trace_event_raw_event_kvm_timer_save_state---of 7
trace_event_raw_event_kvm_timer_update_irq---of 7
trace_event_raw_event_kvm_toggle_cache---of 7
trace_raw_output_kvm_access_fault---of 3
trace_raw_output_kvm_entry---of 3
trace_raw_output_kvm_exit---of 3
trace_raw_output_kvm_forward_sysreg_trap---of 3
trace_raw_output_kvm_get_timer_map---of 3
trace_raw_output_kvm_guest_fault---of 3
trace_raw_output_kvm_inject_nested_exception---of 3
trace_raw_output_kvm_irq_line---of 5
trace_raw_output_kvm_mmio_emulate---of 3
trace_raw_output_kvm_mmio_nisv---of 3
trace_raw_output_kvm_nested_eret---of 3
trace_raw_output_kvm_set_way_flush---of 3
trace_raw_output_kvm_timer_emulate---of 3
trace_raw_output_kvm_timer_hrtimer_expire---of 3
trace_raw_output_kvm_timer_restore_state---of 3
trace_raw_output_kvm_timer_save_state---of 3
trace_raw_output_kvm_timer_update_irq---of 3
trace_raw_output_kvm_toggle_cache---of 3
unlock_all_vcpus67%of 9
-----------
SUMMARY48%of 842

-----------
SUMMARY---of 0

__xa_alloc---of 7
__xa_alloc_cyclic---of 8
__xa_clear_mark---of 3
__xa_cmpxchg43%of 7
__xa_erase100%of 1
__xa_insert43%of 7
__xa_set_mark---of 3
__xa_store50%of 6
__xas_next---of 23
__xas_nomem20%of 10
__xas_prev---of 23
node_set_marks---of 29
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
xa_clear_mark---of 3
xa_delete_node---of 6
xa_destroy42%of 12
xa_erase100%of 1
xa_extract---of 64
xa_find36%of 17
xa_find_after40%of 23
xa_get_mark---of 38
xa_get_order---of 13
xa_load24%of 13
xa_parent29%of 7
xa_set_mark---of 3
xa_store---of 1
xa_store_range---of 18
xas_clear_mark36%of 14
xas_create45%of 85
xas_create_range---of 19
xas_destroy---of 4
xas_find60%of 25
xas_find_conflict18%of 52
xas_find_marked6%of 51
xas_free_nodes---of 21
xas_get_mark---of 4
xas_get_order---of 11
xas_init_marks67%of 3
xas_load32%of 22
xas_nomem34%of 6
xas_pause---of 14
xas_set_mark24%of 13
xas_split---of 16
xas_split_alloc---of 12
xas_start26%of 31
xas_store44%of 76
xas_try_split---of 54
xas_try_split_min_order---of 1
-----------
SUMMARY35%of 486

__nf_tables_commit_chain_free_rules---of 1
__nf_tables_dump_rules---of 15
__nf_tables_unregister_hook---of 16
__nft_obj_notify---of 8
__nft_reg_track_cancel---of 1
__nft_release_hook---of 14
__nft_release_table---of 60
__nft_set_elem_destroy---of 19
__nft_trans_set_add---of 7
jhash---of 17
list_add_rcu---of 3
list_add_tail_rcu---of 3
lockdep_commit_lock_is_held---of 1
nf_jiffies64_to_msecs---of 1
nf_msecs_to_jiffies64---of 3
nf_tables_abort---of 391
nf_tables_activate_set---of 6
nf_tables_bind_chain---of 13
nf_tables_bind_check_setelem---of 3
nf_tables_bind_set---of 32
nf_tables_chain_destroy---of 20
nf_tables_chain_notify---of 8
nf_tables_commit---of 380
nf_tables_deactivate_flowtable---of 8
nf_tables_deactivate_set---of 35
nf_tables_delchain---of 49
nf_tables_delflowtable---of 66
nf_tables_delobj---of 39
nf_tables_delrule---of 60
nf_tables_delset---of 42
nf_tables_delsetelem---of 81
nf_tables_deltable---of 50
nf_tables_destroy_set---of 4
nf_tables_dump_chains---of 28
nf_tables_dump_flowtable---of 31
nf_tables_dump_flowtable_done---of 3
nf_tables_dump_flowtable_start---of 5
nf_tables_dump_obj---of 37
nf_tables_dump_obj_done---of 1
nf_tables_dump_obj_start---of 6
nf_tables_dump_rules---of 28
nf_tables_dump_rules_done---of 1
nf_tables_dump_rules_start---of 7
nf_tables_dump_set---of 44
nf_tables_dump_set_done---of 1
nf_tables_dump_set_start---of 1
nf_tables_dump_setelem---of 9
nf_tables_dump_sets---of 29
nf_tables_dump_sets_done---of 1
nf_tables_dump_sets_start---of 3
nf_tables_dump_tables---of 25
nf_tables_dumpreset_obj---of 1
nf_tables_dumpreset_obj_start---of 6
nf_tables_dumpreset_rules---of 1
nf_tables_dumpreset_rules_start---of 7
nf_tables_dumpreset_set---of 4
nf_tables_exit_batch---of 1
nf_tables_exit_net---of 29
nf_tables_expr_parse---of 40
nf_tables_fill_chain_info---of 28
nf_tables_fill_expr_info---of 6
nf_tables_fill_flowtable_info---of 18
nf_tables_fill_gen_info---of 16
nf_tables_fill_obj_info---of 15
nf_tables_fill_rule_info---of 23
nf_tables_fill_set---of 50
nf_tables_fill_set_concat---of 7
nf_tables_fill_setelem---of 50
nf_tables_fill_setelem_info---of 11
nf_tables_fill_table_info---of 16
nf_tables_flowtable_destroy---of 8
nf_tables_flowtable_event15%of 14
nf_tables_flowtable_notify---of 8
nf_tables_getchain---of 17
nf_tables_getflowtable---of 24
nf_tables_getgen---of 4
nf_tables_getobj---of 4
nf_tables_getobj_reset---of 16
nf_tables_getobj_single---of 18
nf_tables_getrule---of 4
nf_tables_getrule_reset---of 16
nf_tables_getrule_single---of 22
nf_tables_getset---of 29
nf_tables_getsetelem---of 12
nf_tables_getsetelem_reset---of 35
nf_tables_gettable---of 15
nf_tables_init_net---of 1
nf_tables_newchain---of 104
nf_tables_newflowtable---of 70
nf_tables_newobj---of 42
nf_tables_newrule---of 111
nf_tables_newset---of 95
nf_tables_newsetelem---of 208
nf_tables_newtable---of 67
nf_tables_parse_netdev_hooks---of 25
nf_tables_pre_exit_net---of 6
nf_tables_register_hook---of 14
nf_tables_rule_destroy---of 7
nf_tables_rule_notify---of 12
nf_tables_set_alloc_name---of 24
nf_tables_set_desc_parse---of 23
nf_tables_set_elem_destroy---of 9
nf_tables_set_notify---of 8
nf_tables_setelem_notify---of 8
nf_tables_table_enable---of 9
nf_tables_table_notify---of 8
nf_tables_trans_destroy_flush_work---of 1
nf_tables_trans_destroy_work---of 81
nf_tables_unbind_chain---of 10
nf_tables_updchain---of 80
nf_tables_updobj---of 4
nf_tables_valid_genid---of 4
nft_chain_add---of 4
nft_chain_del---of 5
nft_chain_hash---of 1
nft_chain_hash_cmp---of 1
nft_chain_hash_obj---of 1
nft_chain_lookup---of 19
nft_chain_parse_hook---of 80
nft_chain_release_hook---of 6
nft_chain_validate---of 15
nft_chain_validate_dependency---of 4
nft_chain_validate_hooks---of 3
nft_data_dump---of 5
nft_data_hold---of 5
nft_data_init---of 39
nft_data_release---of 6
nft_delchain---of 5
nft_delchain_hook---of 28
nft_delflowtable---of 5
nft_delrule---of 26
nft_delset---of 6
nft_dump_basechain_hook---of 15
nft_dump_register---of 1
nft_dump_stats---of 9
nft_expr_clone---of 4
nft_expr_destroy---of 3
nft_expr_dump---of 4
nft_expr_inner_parse---of 27
nft_flowtable_lookup---of 7
nft_flowtable_parse_hook---of 20
nft_flowtable_type_get---of 26
nft_flush_table---of 40
nft_get_set_elem---of 31
nft_hooks_destroy---of 8
nft_map_activate---of 14
nft_map_deactivate---of 15
nft_mapelem_activate---of 11
nft_mapelem_deactivate---of 12
nft_netdev_register_hooks---of 7
nft_netlink_dump_start_rcu---of 12
nft_obj_init---of 12
nft_obj_lookup---of 24
nft_obj_notify---of 3
nft_obj_type_get---of 29
nft_object_dump---of 4
nft_objname_hash---of 1
nft_objname_hash_cmp---of 3
nft_objname_hash_obj---of 1
nft_parse_register_load---of 7
nft_parse_register_store---of 5
nft_parse_u32_check---of 3
nft_pernet---of 16
nft_rcv_nl_event---of 17
nft_reg_track_cancel---of 7
nft_reg_track_update---of 7
nft_register_chain_type---of 4
nft_register_expr---of 6
nft_register_flowtable_net_hooks---of 22
nft_register_flowtable_type---of 3
nft_register_obj---of 4
nft_request_module---of 9
nft_rule_expr_activate---of 7
nft_rule_expr_deactivate---of 7
nft_rule_lookup_byid---of 8
nft_select_set_ops---of 22
nft_set_catchall_dump---of 13
nft_set_catchall_lookup---of 12
nft_set_catchall_validate---of 7
nft_set_destroy---of 27
nft_set_dump_ctx_init---of 22
nft_set_elem_destroy---of 1
nft_set_elem_expr_alloc---of 16
nft_set_elem_expr_clone---of 12
nft_set_elem_expr_setup---of 21
nft_set_elem_init---of 18
nft_set_expr_alloc---of 21
nft_set_is_same---of 15
nft_set_lookup---of 8
nft_set_lookup_global---of 16
nft_setelem_data_deactivate---of 11
nft_setelem_deactivate---of 6
nft_setelem_flush---of 13
nft_setelem_insert---of 7
nft_setelem_remove---of 10
nft_setelem_validate---of 21
nft_stats_alloc---of 7
nft_table_disable---of 18
nft_table_lookup---of 10
nft_table_validate---of 19
nft_trans_alloc---of 3
nft_trans_commit_list_add_elem---of 19
nft_trans_commit_list_add_tail---of 14
nft_trans_destroy---of 9
nft_trans_elem_alloc---of 3
nft_trans_flowtable_add---of 4
nft_trans_gc_alloc---of 17
nft_trans_gc_catchall_async---of 16
nft_trans_gc_catchall_sync---of 28
nft_trans_gc_destroy---of 11
nft_trans_gc_elem_add---of 1
nft_trans_gc_queue_async---of 4
nft_trans_gc_queue_async_done---of 4
nft_trans_gc_queue_sync---of 4
nft_trans_gc_queue_sync_done---of 5
nft_trans_gc_trans_free---of 18
nft_trans_gc_work---of 32
nft_trans_obj_add---of 4
nft_trans_rule_add---of 5
nft_trans_table_add---of 4
nft_unregister_chain_type---of 1
nft_unregister_expr---of 3
nft_unregister_flowtable_type---of 3
nft_unregister_obj---of 3
nft_validate_register_store---of 23
nft_verdict_dump---of 6
nla_memdup_noprof---of 1
nla_put_string---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rhltable_insert_key---of 69
rhltable_lookup---of 30
rhltable_remove---of 78
rht_assign_unlock---of 7
rht_lock---of 12
rht_unlock---of 10
-----------
SUMMARY15%of 14

__clear_pending---of 15
__read_pending---of 15
__set_pending---of 14
dispatch_mmio_read53%of 19
dispatch_mmio_write74%of 19
match_region100%of 3
vgic_data_host_to_mmio_bus---of 4
vgic_data_mmio_bus_to_host---of 4
vgic_find_mmio_region100%of 1
vgic_get_mmio_region46%of 11
vgic_get_vmcr67%of 3
vgic_mmio_change_active62%of 13
vgic_mmio_read_active---of 12
vgic_mmio_read_config---of 4
vgic_mmio_read_enable---of 4
vgic_mmio_read_group---of 4
vgic_mmio_read_pending---of 1
vgic_mmio_read_priority---of 4
vgic_mmio_read_rao---of 1
vgic_mmio_read_raz100%of 1
vgic_mmio_uaccess_write_cactive---of 3
vgic_mmio_uaccess_write_sactive100%of 3
vgic_mmio_uaccess_write_wi---of 1
vgic_mmio_write_cactive60%of 10
vgic_mmio_write_cenable67%of 6
vgic_mmio_write_config---of 6
vgic_mmio_write_cpending---of 1
vgic_mmio_write_group63%of 8
vgic_mmio_write_priority63%of 8
vgic_mmio_write_sactive---of 10
vgic_mmio_write_senable30%of 10
vgic_mmio_write_spending---of 1
vgic_mmio_write_wi100%of 1
vgic_read_irq_line_level_info86%of 7
vgic_register_dist_iodev50%of 4
vgic_set_vmcr---of 3
vgic_uaccess74%of 15
vgic_uaccess_read_active---of 4
vgic_uaccess_read_pending---of 1
vgic_uaccess_write_cenable---of 3
vgic_uaccess_write_cpending---of 1
vgic_uaccess_write_senable100%of 3
vgic_uaccess_write_spending---of 1
vgic_write_irq_line_level_info100%of 6
-----------
SUMMARY66%of 151

__path_add---of 9
__path_find---of 7
create_child_store---of 4
delete_child_store---of 4
dev_id_show---of 5
ipoib_add_one---of 30
ipoib_add_pkey_attr---of 1
ipoib_add_umcast_attr---of 1
ipoib_change_mtu---of 15
ipoib_del_neighs_by_gid---of 23
ipoib_dev_init_default---of 6
ipoib_dev_uninit_default---of 1
ipoib_fix_features---of 1
ipoib_flush_paths---of 22
ipoib_get_iflink---of 3
ipoib_get_net_dev_by_params---of 26
ipoib_get_stats---of 3
ipoib_get_vf_config---of 5
ipoib_get_vf_guid---of 1
ipoib_get_vf_stats---of 1
ipoib_hard_header---of 1
ipoib_ib_tx_timeout_work---of 7
ipoib_intf_alloc---of 7
ipoib_intf_free---of 3
ipoib_intf_init---of 7
ipoib_ioctl---of 3
ipoib_is_dev_match_addr_rcu---of 33
ipoib_mark_paths_invalid---of 10
ipoib_match_gid_pkey_addr---of 55
ipoib_ndo_init---of 41
ipoib_ndo_uninit---of 17
ipoib_neigh_alloc---of 41
ipoib_neigh_dtor---of 19
ipoib_neigh_free---of 29
ipoib_neigh_get---of 39
ipoib_neigh_hash_uninit---of 27
ipoib_neigh_reclaim---of 6
ipoib_netdev_event34%of 6
ipoib_open---of 21
ipoib_path_iter_init---of 4
ipoib_path_iter_next---of 5
ipoib_path_iter_read---of 1
ipoib_reap_neigh---of 23
ipoib_remove_one---of 7
ipoib_set_mac---of 7
ipoib_set_mcast_list---of 4
ipoib_set_mode---of 16
ipoib_set_umcast---of 7
ipoib_set_vf_guid---of 3
ipoib_set_vf_link_state---of 1
ipoib_setup_common---of 1
ipoib_start_xmit---of 57
ipoib_stop---of 15
ipoib_timeout---of 7
ipoib_upper_walk---of 8
local_bh_disable---of 2
local_bh_enable---of 2
neigh_add_path---of 49
neigh_hash_free_rcu---of 1
path_rec_completion---of 59
path_rec_create---of 4
path_rec_start---of 6
pkey_show---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
set_base_guid---of 7
umcast_show---of 1
umcast_store---of 7
-----------
SUMMARY34%of 6

fixup_exception32%of 16
insn_may_access_user75%of 4
-----------
SUMMARY40%of 20

-----------
SUMMARY---of 0

__arch_timer_setup---of 14
__kern_my_cpu_offset---of 1
arch_counter_get_cntpct_stable---of 7
arch_counter_get_cntvct_stable---of 7
arch_counter_read100%of 1
arch_counter_read_cc100%of 1
arch_timer_check_ool_workaround---of 39
arch_timer_cpu_pm_notify---of 13
arch_timer_dying_cpu---of 3
arch_timer_evtstrm_available---of 3
arch_timer_evtstrm_dying_cpu---of 5
arch_timer_evtstrm_starting_cpu---of 10
arch_timer_get_kvm_info---of 1
arch_timer_get_rate---of 1
arch_timer_handler_phys---of 3
arch_timer_handler_phys_mem---of 3
arch_timer_handler_virt---of 3
arch_timer_handler_virt_mem---of 3
arch_timer_read_cntpct_el0---of 1
arch_timer_read_cntvct_el0---of 1
arch_timer_set_next_event_phys---of 1
arch_timer_set_next_event_phys_mem---of 3
arch_timer_set_next_event_virt---of 1
arch_timer_set_next_event_virt_mem---of 3
arch_timer_shutdown_phys---of 1
arch_timer_shutdown_phys_mem---of 1
arch_timer_shutdown_virt---of 1
arch_timer_shutdown_virt_mem---of 1
arch_timer_starting_cpu---of 8
arm64_858921_read_cntpct_el0---of 1
arm64_858921_read_cntvct_el0---of 1
erratum_set_next_event_phys---of 7
erratum_set_next_event_virt---of 7
fsl_a008585_read_cntpct_el0---of 5
fsl_a008585_read_cntvct_el0---of 5
hisi_161010101_read_cntpct_el0---of 5
hisi_161010101_read_cntvct_el0---of 5
kvm_arch_ptp_get_crosststamp---of 9
-----------
SUMMARY100%of 2

__netlink_change_ngroups---of 11
__netlink_clear_multicast_users---of 9
__netlink_create---of 3
__netlink_deliver_tap---of 33
__netlink_dump_start---of 22
__netlink_kernel_create---of 20
__netlink_lookup---of 40
__netlink_ns_capable---of 4
__netlink_sendskb---of 1
__netlink_seq_next---of 7
__nlmsg_put100%of 1
__probestub_netlink_extack---of 1
__traceiter_netlink_extack---of 4
_inline_copy_to_user---of 7
deferred_put_nlk_sk---of 6
do_trace_netlink_extack---of 17
net_generic25%of 16
netlink_ack---of 19
netlink_ack_tlv_fill---of 23
netlink_ack_tlv_len---of 8
netlink_add_tap---of 4
netlink_alloc_large_skb---of 6
netlink_allowed---of 3
netlink_attachskb---of 41
netlink_autobind---of 16
netlink_bind---of 49
netlink_broadcast100%of 1
netlink_broadcast_filtered30%of 79
netlink_capable---of 4
netlink_change_ngroups---of 1
netlink_compare---of 3
netlink_connect---of 14
netlink_create---of 24
netlink_deliver_tap31%of 13
netlink_detachskb---of 6
netlink_dump---of 46
netlink_getname---of 10
netlink_getsockbyfd---of 11
netlink_getsockopt---of 37
netlink_has_listeners25%of 20
netlink_hash---of 1
netlink_insert---of 73
netlink_ioctl---of 1
netlink_kernel_release---of 4
netlink_lock_table---of 3
netlink_lookup---of 17
netlink_net_capable---of 4
netlink_net_exit---of 1
netlink_net_init---of 1
netlink_ns_capable---of 4
netlink_rcv_skb---of 12
netlink_realloc_groups---of 6
netlink_recvmsg---of 37
netlink_register_notifier---of 1
netlink_release---of 102
netlink_remove_tap---of 5
netlink_sendmsg---of 33
netlink_sendskb---of 6
netlink_seq_next---of 1
netlink_seq_show---of 6
netlink_seq_start---of 4
netlink_seq_stop---of 6
netlink_set_err---of 12
netlink_setsockopt---of 47
netlink_skb_destructor---of 9
netlink_sock_destruct---of 8
netlink_strict_get_check---of 1
netlink_table_grab---of 7
netlink_table_ungrab---of 1
netlink_tap_init_net---of 1
netlink_trim39%of 13
netlink_undo_bind---of 6
netlink_unicast---of 35
netlink_unlock_table---of 5
netlink_unregister_notifier---of 1
netlink_update_listeners---of 15
netlink_update_socket_mc---of 8
nlmsg_notify30%of 10
perf_trace_netlink_extack---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
refcount_inc---of 6
rht_assign_unlock---of 7
rht_lock---of 12
rht_unlock---of 10
trace_event_raw_event_netlink_extack---of 7
trace_raw_output_netlink_extack---of 3
-----------
SUMMARY32%of 157

kvm_clr_pmu_events50%of 4
kvm_get_pmu_events---of 1
kvm_set_pmu_events42%of 12
kvm_set_pmuserenr24%of 13
kvm_vcpu_pmu_read_evtype_direct15%of 34
kvm_vcpu_pmu_restore_guest39%of 13
kvm_vcpu_pmu_restore_host37%of 11
kvm_vcpu_pmu_resync_el029%of 7
kvm_vcpu_pmu_write_evtype_direct15%of 34
-----------
SUMMARY25%of 128

-----------
SUMMARY---of 0

__fanout_link---of 3
__fanout_set_data_bpf---of 8
__packet_get_status---of 8
__packet_rcv_has_room---of 7
__packet_set_status---of 8
__register_prot_hook---of 12
__unregister_prot_hook---of 30
__vlan_get_protocol_offset---of 16
_inline_copy_from_user---of 8
bpf_prog_run_clear_cb---of 8
copy_from_sockptr---of 5
fanout_add---of 50
fanout_demux_rollover---of 43
fanout_init_data---of 5
fanout_set_data---of 13
free_pg_vec---of 7
match_fanout_group---of 3
nf_hook_direct_egress---of 34
packet_bind---of 4
packet_bind_spkt---of 3
packet_cached_dev_get---of 22
packet_create---of 22
packet_do_bind---of 69
packet_extra_vlan_len_allowed---of 6
packet_getname---of 16
packet_getname_spkt---of 15
packet_getsockopt---of 47
packet_increment_rx_head---of 3
packet_ioctl---of 30
packet_mc_add---of 26
packet_mc_drop---of 17
packet_mm_close---of 4
packet_mm_open---of 4
packet_mmap---of 20
packet_net_exit---of 3
packet_net_init---of 1
packet_notifier10%of 42
packet_parse_headers---of 39
packet_poll---of 12
packet_rcv---of 52
packet_rcv_fanout---of 38
packet_rcv_spkt---of 22
packet_recvmsg---of 62
packet_release---of 52
packet_sendmsg---of 214
packet_sendmsg_spkt---of 54
packet_seq_next---of 1
packet_seq_show---of 3
packet_seq_start---of 6
packet_seq_stop---of 6
packet_set_ring---of 66
packet_setsockopt---of 84
packet_sock_destruct---of 7
packet_sock_flag_set---of 5
packet_xmit---of 11
prb_dispatch_next_block---of 3
prb_fill_curr_block---of 20
prb_retire_current_block---of 13
prb_retire_rx_blk_timer_expired---of 10
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
run_filter---of 18
skb_clear_delivery_time---of 4
skb_csum_unnecessary---of 5
skb_get---of 6
skb_set_delivery_type_by_clockid---of 7
skb_set_owner_r---of 8
skb_setup_tx_timestamp---of 8
tpacket_destruct_skb---of 18
tpacket_get_timestamp---of 8
tpacket_rcv---of 92
virtio_net_hdr_from_skb---of 11
virtio_net_hdr_set_proto---of 5
virtio_net_hdr_to_skb---of 70
vlan_get_protocol_dgram---of 6
vlan_get_tci---of 12
-----------
SUMMARY18%of 46

-----------
SUMMARY---of 0

cbs_change---of 13
cbs_child_dequeue---of 8
cbs_dequeue---of 1
cbs_dequeue_offload---of 1
cbs_dequeue_soft---of 11
cbs_destroy---of 8
cbs_dev_notifier40%of 10
cbs_dump---of 5
cbs_dump_class---of 4
cbs_enqueue---of 1
cbs_enqueue_offload---of 3
cbs_enqueue_soft---of 6
cbs_find---of 1
cbs_graft---of 25
cbs_init---of 8
cbs_leaf---of 1
cbs_set_port_rate---of 7
cbs_walk---of 5
qdisc_peek_dequeued---of 4
qdisc_reset_queue---of 6
-----------
SUMMARY40%of 10

-----------
SUMMARY---of 0

__es_find_extent_range---of 25
__es_insert_extent---of 59
__es_remove_extent---of 81
count_rsvd---of 17
es_do_reclaim_extents---of 18
ext4_clear_inode_es---of 8
ext4_es_cache_extent---of 31
ext4_es_count---of 14
ext4_es_find_extent_range---of 28
ext4_es_free_extent---of 9
ext4_es_init_tree---of 1
ext4_es_insert_delayed_extent---of 58
ext4_es_insert_extent---of 197
ext4_es_lookup_extent31%of 43
ext4_es_register_shrinker---of 7
ext4_es_remove_extent---of 22
ext4_es_scan---of 66
ext4_es_scan_clu---of 5
ext4_es_scan_range---of 5
ext4_es_unregister_shrinker---of 1
ext4_exit_es---of 1
ext4_exit_pending---of 1
ext4_init_pending_tree---of 1
ext4_is_pending---of 7
ext4_remove_pending---of 7
ext4_seq_es_shrinker_info_show---of 9
-----------
SUMMARY31%of 43

__ipv6_neigh_lookup_noref_stub---of 12
call_fib_nh_notifiers---of 21
fib4_semantics_exit---of 1
fib4_semantics_init---of 3
fib_add_multipath---of 14
fib_add_nexthop---of 8
fib_check_nh---of 105
fib_create_info---of 125
fib_detect_death---of 17
fib_dump_info---of 40
fib_find_info---of 40
fib_get_nhs---of 51
fib_info_hash_bucket---of 6
fib_info_update_nhc_saddr---of 3
fib_metrics_match---of 13
fib_nexthop_info---of 48
fib_nh_common_init---of 15
fib_nh_common_release---of 29
fib_nh_init---of 8
fib_nh_match---of 61
fib_nh_release---of 4
fib_nhc_update_mtu---of 11
fib_nlmsg_size---of 11
fib_rebalance9%of 35
fib_release_info---of 28
fib_result_prefsrc---of 6
fib_select_default---of 33
fib_select_multipath---of 37
fib_select_path---of 17
fib_sync_down_addr---of 19
fib_sync_down_dev---of 34
fib_sync_mtu---of 16
fib_sync_up25%of 33
fib_valid_prefsrc---of 8
free_fib_info---of 3
free_fib_info_rcu---of 18
hlist_add_head_rcu---of 3
ip_fib_check_default---of 9
ipv6_addr_cmp---of 1
netdev_tracker_alloc---of 5
nexthop_fib_nhc---of 18
nexthop_get---of 7
nexthop_is_blackhole---of 16
nexthop_mpath_fill_node---of 20
nexthop_num_path---of 10
nexthop_path_fib_result---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
refcount_inc---of 6
rtmsg_fib---of 7
-----------
SUMMARY17%of 68

irq_bypass_register_consumer19%of 27
irq_bypass_register_producer---of 25
irq_bypass_unregister_consumer---of 19
irq_bypass_unregister_producer---of 19
-----------
SUMMARY19%of 27

-----------
SUMMARY---of 0

__dquot_alloc_space5%of 72
__dquot_drop---of 22
__dquot_free_space7%of 43
__dquot_initialize5%of 46
__dquot_transfer---of 117
__quota_error---of 3
add_dquot_ref---of 17
clear_dquot_dirty---of 8
do_get_dqblk---of 1
do_proc_dqstats---of 1
dqcache_shrink_count---of 1
dqcache_shrink_scan---of 16
dqget---of 34
dqput---of 15
dquot_acquire---of 15
dquot_add_inodes---of 22
dquot_add_space---of 28
dquot_alloc---of 1
dquot_alloc_inode3%of 67
dquot_claim_space_nodirty---of 35
dquot_commit---of 4
dquot_commit_info---of 1
dquot_destroy---of 1
dquot_disable---of 88
dquot_drop34%of 6
dquot_file_open---of 4
dquot_free_inode7%of 32
dquot_get_dqblk---of 3
dquot_get_next_dqblk---of 5
dquot_get_next_id---of 4
dquot_get_state---of 12
dquot_initialize100%of 1
dquot_initialize_needed---of 12
dquot_load_quota_inode---of 13
dquot_load_quota_sb---of 36
dquot_mark_dquot_dirty---of 10
dquot_quota_disable---of 24
dquot_quota_enable---of 20
dquot_quota_off---of 1
dquot_quota_on---of 4
dquot_quota_on_mount---of 4
dquot_quota_sync---of 21
dquot_reclaim_space_nodirty---of 35
dquot_release---of 8
dquot_resume---of 26
dquot_scan_active---of 13
dquot_set_dqblk---of 63
dquot_set_dqinfo---of 13
dquot_transfer---of 13
dquot_writeback_dquots---of 37
mark_all_dquot_dirty---of 25
mark_info_dirty---of 1
prepare_warning---of 7
quota_release_workfn---of 19
register_quota_format---of 1
srcu_lock_acquire---of 2
srcu_lock_release---of 2
srcu_read_lock_held---of 3
unregister_quota_format---of 4
-----------
SUMMARY6%of 267

__nla_parse---of 1
__nla_put100%of 1
__nla_put_64bit100%of 1
__nla_put_nohdr---of 1
__nla_reserve---of 1
__nla_reserve_64bit---of 1
__nla_reserve_nohdr---of 1
__nla_validate---of 1
__nla_validate_parse---of 174
nla_append---of 5
nla_find---of 6
nla_get_range_signed---of 8
nla_get_range_unsigned---of 13
nla_memcmp---of 3
nla_memcpy---of 3
nla_policy_len---of 8
nla_put60%of 5
nla_put_64bit60%of 5
nla_put_nohdr---of 5
nla_reserve60%of 5
nla_reserve_64bit60%of 5
nla_reserve_nohdr---of 5
nla_strcmp---of 7
nla_strdup---of 5
nla_strscpy---of 5
nla_validate_array---of 10
-----------
SUMMARY64%of 22

__udp_tunnel_nic_add_port---of 42
__udp_tunnel_nic_del_port---of 12
__udp_tunnel_nic_device_sync---of 64
__udp_tunnel_nic_dump_size---of 7
__udp_tunnel_nic_dump_write---of 13
__udp_tunnel_nic_get_port---of 3
__udp_tunnel_nic_reset_ntf---of 15
__udp_tunnel_nic_set_port_priv---of 1
udp_tunnel_get_rx_info---of 6
udp_tunnel_nic_device_sync_work---of 30
udp_tunnel_nic_flush---of 19
udp_tunnel_nic_is_empty---of 9
udp_tunnel_nic_netdevice_event3%of 81
udp_tunnel_nic_try_existing---of 22
-----------
SUMMARY3%of 81

hsr_get_version---of 1
hsr_netdev_notify9%of 47
hsr_port_get_hsr---of 4
-----------
SUMMARY9%of 47

__vfio_register_dev---of 14
_inline_copy_from_user---of 8
_inline_copy_to_user---of 7
_vfio_alloc_device---of 14
vfio_allocate_device_file---of 3
vfio_assign_device_set---of 9
vfio_combine_iova_ranges---of 15
vfio_device_fops_cdev_open---of 1
vfio_device_fops_mmap---of 4
vfio_device_fops_read---of 4
vfio_device_fops_release---of 8
vfio_device_fops_unl_ioctl---of 34
vfio_device_fops_write---of 4
vfio_device_get_kvm_safe---of 9
vfio_device_log_read_and_clear---of 1
vfio_device_put_kvm---of 7
vfio_device_put_registration---of 6
vfio_device_release---of 3
vfio_device_set_open_count---of 7
vfio_device_try_get_registration---of 7
vfio_df_close---of 14
vfio_df_open---of 17
vfio_dma_rw---of 6
vfio_file_enforced_coherent---of 5
vfio_file_is_valid50%of 4
vfio_file_set_kvm---of 6
vfio_find_device_in_devset---of 7
vfio_fs_init_fs_context---of 1
vfio_info_add_capability---of 3
vfio_info_cap_add---of 6
vfio_info_cap_shift---of 6
vfio_ioct_mig_return_fd---of 9
vfio_ioctl_device_feature_logging_report---of 10
vfio_ioctl_device_feature_logging_start---of 18
vfio_ioctl_device_feature_mig_device_state---of 18
vfio_ioctl_device_feature_migration---of 11
vfio_ioctl_device_feature_migration_data_size---of 12
vfio_mig_get_next_state---of 7
vfio_pin_pages---of 7
vfio_register_emulated_iommu_dev---of 1
vfio_register_group_dev---of 1
vfio_release_device_set---of 6
vfio_set_irqs_validate_and_prepare---of 15
vfio_unpin_pages---of 6
vfio_unregister_group_dev---of 14
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

__account_locked_vm---of 11
__vcalloc_noprof67%of 3
__vm_enough_memory15%of 14
__vmalloc_array_noprof---of 3
_inline_copy_from_user50%of 8
account_locked_vm---of 21
arch_mmap_rnd---of 1
arch_pick_mmap_layout---of 15
arch_randomize_brk---of 5
folio_anon_vma---of 1
folio_copy---of 6
folio_mapping50%of 6
folio_mc_copy---of 6
get_cmdline---of 9
kfree_const---of 3
kmap_local_page---of 35
kmemdup_array---of 3
kmemdup_noprof67%of 3
kmemdup_nul---of 5
kstrdup---of 5
kstrdup_const---of 3
kstrndup---of 6
kvmemdup---of 3
mem_dump_obj---of 7
memdup_user58%of 7
memdup_user_nul---of 7
overcommit_kbytes_handler---of 3
overcommit_policy_handler---of 6
overcommit_ratio_handler---of 3
page_offline_begin---of 1
page_offline_end---of 1
page_offline_freeze---of 1
page_offline_thaw---of 1
randomize_page---of 3
randomize_stack_top---of 3
rcu_lock_acquire---of 2
rcu_lock_release---of 2
strndup_user---of 10
sync_overcommit_as---of 1
vcalloc_noprof---of 3
vm_commit_limit---of 3
vm_memory_committed---of 1
vm_mmap---of 3
vm_mmap_pgoff50%of 20
vma_is_stack_for_current---of 3
vma_set_file---of 6
vmalloc_array_noprof---of 3
vmemdup_user58%of 7
-----------
SUMMARY46%of 68

__debug_restore_state9%of 68
__debug_save_state9%of 68
__debug_switch_to_guest67%of 6
__debug_switch_to_host67%of 6
-----------
SUMMARY14%of 148

-----------
SUMMARY---of 0

kobj_lookup24%of 13
kobj_map---of 9
kobj_map_init---of 4
kobj_unmap---of 10
-----------
SUMMARY24%of 13

-----------
SUMMARY---of 0

__arm64_sys_madvise---of 4
__arm64_sys_process_madvise---of 27
anon_vma_name50%of 4
anon_vma_name_alloc---of 4
anon_vma_name_free---of 1
do_madvise---of 4
folio_get---of 4
folio_large_mapcount---of 4
folio_lock---of 7
folio_put---of 6
guard_install_pmd_entry---of 4
guard_install_pte_entry---of 5
guard_install_pud_entry---of 1
guard_install_set_pte---of 1
guard_remove_pmd_entry---of 6
guard_remove_pte_entry---of 6
guard_remove_pud_entry---of 1
madvise_cold_or_pageout_pte_range---of 142
madvise_do_behavior---of 236
madvise_folio_pte_batch---of 32
madvise_free_pte_range---of 106
madvise_lock---of 26
madvise_set_anon_name---of 21
madvise_should_skip---of 33
madvise_unlock---of 21
madvise_update_vma---of 47
mmu_notifier_invalidate_range_start---of 3
pte_unmap---of 6
put_task_struct---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
set_pmd_at---of 23
swapin_walk_pmd_entry---of 29
tlb_end_vma---of 53
-----------
SUMMARY50%of 4

__ib_get_client_nl_info---of 11
__ib_get_global_client_nl_info---of 9
__ib_unregister_device---of 15
__ibdev_printk---of 8
_ib_alloc_device---of 5
add_client_context---of 40
add_one_compat_dev---of 16
alloc_port_data---of 9
compatdev_release---of 1
disable_device---of 16
enable_device_and_get---of 15
free_netdevs---of 18
ib_add_sub_device---of 17
ib_dealloc_device---of 11
ib_del_sub_device_and_put---of 14
ib_device_get_by_index---of 11
ib_device_get_by_name---of 15
ib_device_get_by_netdev16%of 26
ib_device_get_netdev---of 18
ib_device_notify_register---of 12
ib_device_put---of 6
ib_device_release---of 10
ib_device_rename---of 26
ib_device_set_dim---of 3
ib_device_set_netdev---of 31
ib_device_set_netns_put---of 25
ib_device_uevent---of 3
ib_dispatch_event_clients---of 4
ib_dispatch_port_state_event---of 6
ib_dma_virt_map_sg---of 38
ib_enum_all_devs---of 9
ib_enum_all_roce_netdevs---of 4
ib_enum_roce_netdev---of 12
ib_find_gid---of 12
ib_find_pkey---of 14
ib_get_client_nl_info---of 9
ib_get_device_fw_str---of 3
ib_get_net_dev_by_params---of 9
ib_modify_device---of 3
ib_modify_port---of 10
ib_netdevice_event14%of 29
ib_policy_change_task---of 7
ib_port_immutable_read---of 6
ib_query_netdev_port---of 14
ib_query_pkey---of 7
ib_query_port---of 37
ib_register_client---of 11
ib_register_device---of 68
ib_register_event_handler---of 3
ib_security_change---of 3
ib_set_client_data---of 5
ib_set_device_ops---of 408
ib_unregister_client---of 45
ib_unregister_device---of 1
ib_unregister_device_and_put---of 8
ib_unregister_device_queued---of 7
ib_unregister_driver---of 7
ib_unregister_event_handler---of 3
ib_unregister_work---of 1
ibdev_alert---of 1
ibdev_crit---of 1
ibdev_emerg---of 1
ibdev_err---of 1
ibdev_info---of 1
ibdev_notice---of 1
ibdev_warn---of 1
net_namespace---of 1
prevent_dealloc_device---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rdma_compatdev_set---of 12
rdma_dev_access_netns---of 3
rdma_dev_change_netns---of 14
rdma_dev_exit_net---of 8
rdma_dev_init_net---of 8
rdma_init_coredev---of 4
rdma_net_to_dev_net---of 16
remove_all_compat_devs---of 9
remove_client_context---of 14
xan_find_marked---of 19
-----------
SUMMARY21%of 59

-----------
SUMMARY---of 0

clear_shadow_entries---of 13
folio_invalidate---of 3
folio_large_mapcount---of 4
folio_test_large---of 3
folio_unmap_invalidate---of 28
generic_error_remove_folio---of 5
invalidate_inode_pages2---of 1
invalidate_inode_pages2_range---of 41
invalidate_mapping_pages---of 1
mapping_evict_folio---of 11
mapping_try_invalidate---of 14
pagecache_isize_extended---of 17
rcu_lock_acquire---of 2
rcu_lock_release---of 2
truncate_cleanup_folio32%of 19
truncate_folio_batch_exceptionals---of 22
truncate_inode_folio67%of 3
truncate_inode_pages---of 1
truncate_inode_pages_final60%of 5
truncate_inode_pages_range4%of 64
truncate_inode_partial_folio---of 57
truncate_pagecache---of 1
truncate_pagecache_range---of 3
truncate_setsize---of 3
try_folio_split---of 4
xas_next_entry---of 15
zero_user_segments---of 43
-----------
SUMMARY15%of 91

arch_prctl_spec_ctrl_get---of 18
arch_prctl_spec_ctrl_set---of 41
arm64_get_spectre_bhb_state100%of 1
arm64_get_spectre_v2_state100%of 1
arm64_get_spectre_v4_state100%of 1
cpu_show_spec_store_bypass---of 4
cpu_show_spectre_v1---of 1
cpu_show_spectre_v2---of 8
has_spectre_bhb_fw_mitigation---of 4
has_spectre_v2---of 10
has_spectre_v3a---of 5
has_spectre_v4---of 10
is_spectre_bhb_affected---of 15
spectre_bhb_enable_mitigation---of 31
spectre_bhb_loop_affected---of 7
spectre_v2_enable_mitigation---of 27
spectre_v3a_enable_mitigation---of 3
spectre_v4_enable_mitigation---of 42
spectre_v4_enable_task_mitigation---of 11
spectre_v4_mitigations_dynamic---of 6
spectre_v4_mitigations_off---of 6
ssbd_prctl_enable_mitigation---of 7
this_cpu_set_vectors---of 3
try_emulate_el1_ssbs---of 3
unpriv_ebpf_notify---of 4
-----------
SUMMARY100%of 3

-----------
SUMMARY---of 0

__bio_queue_enter---of 23
__blk_flush_plug22%of 14
__probestub_block_bio_backmerge---of 1
__probestub_block_bio_bounce---of 1
__probestub_block_bio_complete---of 1
__probestub_block_bio_frontmerge---of 1
__probestub_block_bio_queue---of 1
__probestub_block_bio_remap---of 1
__probestub_block_dirty_buffer---of 1
__probestub_block_getrq---of 1
__probestub_block_io_done---of 1
__probestub_block_io_start---of 1
__probestub_block_plug---of 1
__probestub_block_rq_complete---of 1
__probestub_block_rq_error---of 1
__probestub_block_rq_insert---of 1
__probestub_block_rq_issue---of 1
__probestub_block_rq_merge---of 1
__probestub_block_rq_remap---of 1
__probestub_block_rq_requeue---of 1
__probestub_block_split---of 1
__probestub_block_touch_buffer---of 1
__probestub_block_unplug---of 1
__submit_bio---of 15
__traceiter_block_bio_backmerge---of 4
__traceiter_block_bio_bounce---of 4
__traceiter_block_bio_complete---of 4
__traceiter_block_bio_frontmerge---of 4
__traceiter_block_bio_queue---of 4
__traceiter_block_bio_remap---of 4
__traceiter_block_dirty_buffer---of 4
__traceiter_block_getrq---of 4
__traceiter_block_io_done---of 4
__traceiter_block_io_start---of 4
__traceiter_block_plug---of 4
__traceiter_block_rq_complete---of 4
__traceiter_block_rq_error---of 4
__traceiter_block_rq_insert---of 4
__traceiter_block_rq_issue---of 4
__traceiter_block_rq_merge---of 4
__traceiter_block_rq_remap---of 4
__traceiter_block_rq_requeue---of 4
__traceiter_block_split---of 4
__traceiter_block_touch_buffer---of 4
__traceiter_block_unplug---of 4
bdev_end_io_acct---of 11
bdev_set_flag---of 3
bdev_start_io_acct---of 5
bio_end_io_acct_remapped---of 1
bio_poll---of 28
bio_start_io_acct---of 1
blk_alloc_queue---of 7
blk_check_plugged---of 12
blk_clear_pm_only---of 6
blk_finish_plug67%of 3
blk_free_queue_rcu---of 1
blk_get_queue---of 7
blk_io_schedule---of 3
blk_lld_busy---of 4
blk_op_str---of 4
blk_put_queue---of 7
blk_queue_enter---of 28
blk_queue_exit---of 18
blk_queue_flag_clear---of 3
blk_queue_flag_set---of 3
blk_queue_start_drain---of 3
blk_queue_usage_counter_release---of 1
blk_rq_timed_out_timer---of 1
blk_set_pm_only---of 3
blk_start_plug67%of 3
blk_start_plug_nr_ios---of 3
blk_status_to_errno---of 3
blk_status_to_str---of 3
blk_sync_queue---of 1
blk_timeout_work---of 1
blk_try_enter_queue---of 31
errno_to_blk_status---of 20
iocb_bio_iopoll---of 13
kblockd_mod_delayed_work_on---of 1
kblockd_schedule_work---of 1
perf_trace_block_bio---of 6
perf_trace_block_bio_complete---of 7
perf_trace_block_bio_remap---of 6
perf_trace_block_buffer---of 6
perf_trace_block_plug---of 6
perf_trace_block_rq---of 13
perf_trace_block_rq_completion---of 11
perf_trace_block_rq_remap---of 8
perf_trace_block_rq_requeue---of 13
perf_trace_block_split---of 6
perf_trace_block_unplug---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
should_fail_bio---of 3
should_fail_request---of 3
submit_bio---of 13
submit_bio_noacct---of 88
submit_bio_noacct_nocheck---of 46
trace_event_raw_event_block_bio---of 7
trace_event_raw_event_block_bio_complete---of 8
trace_event_raw_event_block_bio_remap---of 7
trace_event_raw_event_block_buffer---of 7
trace_event_raw_event_block_plug---of 7
trace_event_raw_event_block_rq---of 14
trace_event_raw_event_block_rq_completion---of 12
trace_event_raw_event_block_rq_remap---of 9
trace_event_raw_event_block_rq_requeue---of 14
trace_event_raw_event_block_split---of 7
trace_event_raw_event_block_unplug---of 7
trace_raw_output_block_bio---of 3
trace_raw_output_block_bio_complete---of 3
trace_raw_output_block_bio_remap---of 3
trace_raw_output_block_buffer---of 3
trace_raw_output_block_plug---of 3
trace_raw_output_block_rq---of 3
trace_raw_output_block_rq_completion---of 3
trace_raw_output_block_rq_remap---of 3
trace_raw_output_block_rq_requeue---of 3
trace_raw_output_block_split---of 3
trace_raw_output_block_unplug---of 3
update_io_ticks---of 10
-----------
SUMMARY35%of 20

-----------
SUMMARY---of 0

plist_add32%of 22
plist_check_list56%of 9
plist_del---of 14
plist_requeue---of 15
-----------
SUMMARY39%of 31

__check_safe_pte_update---of 14
__folio_rmap_sanity_checks---of 17
__folio_split---of 62
__folio_unqueue_deferred_split---of 24
__pmd_trans_huge_lock---of 7
__probestub_hugepage_set_pmd---of 1
__probestub_hugepage_set_pud---of 1
__probestub_hugepage_update_pmd---of 1
__probestub_hugepage_update_pud---of 1
__probestub_remove_migration_pmd---of 1
__probestub_set_migration_pmd---of 1
__pud_trans_huge_lock---of 1
__split_huge_pmd100%of 5
__split_huge_pud---of 1
__split_unmapped_folio---of 161
__thp_vma_allowable_orders35%of 35
__traceiter_hugepage_set_pmd---of 4
__traceiter_hugepage_set_pud---of 4
__traceiter_hugepage_update_pmd---of 4
__traceiter_hugepage_update_pud---of 4
__traceiter_remove_migration_pmd---of 4
__traceiter_set_migration_pmd---of 4
add_mm_counter---of 1
anon_enabled_show---of 4
anon_enabled_store---of 28
anon_fault_alloc_show---of 4
anon_fault_fallback_charge_show---of 4
anon_fault_fallback_show---of 4
can_change_pmd_writable---of 21
can_split_folio---of 14
change_huge_pmd---of 56
const_folio_flags---of 5
copy_huge_pmd---of 121
count_mthp_stat---of 5
count_vm_event---of 4
current_gfp_context---of 4
deferred_split_count---of 1
deferred_split_folio---of 47
deferred_split_scan---of 76
defrag_show---of 5
defrag_store---of 42
do_huge_pmd_anonymous_page---of 35
do_huge_pmd_numa_page---of 25
do_huge_pmd_wp_page---of 87
enabled_show---of 3
enabled_store---of 16
file_thp_enabled---of 4
filemap_nr_thps_dec---of 7
folio_flags---of 5
folio_large_mapcount---of 4
folio_memcg---of 10
folio_nr_pages---of 4
folio_order---of 4
folio_put---of 6
folio_ref_freeze---of 3
folio_split---of 1
folio_test_pmd_mappable---of 4
folio_try_get---of 16
folio_try_share_anon_rmap_pmd---of 47
follow_devmap_pmd---of 15
hpage_pmd_size_show---of 1
huge_pmd_set_accessed---of 4
insert_pfn_pmd---of 20
madvise_free_huge_pmd---of 62
map_anon_folio_pmd---of 37
maybe_pmd_mkwrite---of 1
min_order_for_split---of 9
mm_get_huge_zero_folio---of 22
mm_inc_nr_ptes---of 3
mm_put_huge_zero_folio---of 6
mmu_notifier_invalidate_range_start100%of 3
mod_mthp_stat---of 5
move_huge_pmd---of 57
move_pages_huge_pmd---of 112
non_uniform_split_supported---of 12
nr_anon_partially_mapped_show---of 4
nr_anon_show---of 4
perf_trace_hugepage_set---of 6
perf_trace_hugepage_update---of 6
perf_trace_migration_pmd---of 6
pfn_swap_entry_folio---of 11
pfn_swap_entry_to_page---of 14
pfn_valid---of 31
pmd_lock---of 1
pte_alloc_one_noprof---of 15
pte_free---of 13
put_anon_vma---of 5
put_task_struct---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remap_page---of 10
remove_migration_pmd---of 57
set_huge_zero_folio---of 12
set_pmd_at---of 10
set_pmd_migration_entry---of 59
shmem_alloc_show---of 4
shmem_fallback_charge_show---of 4
shmem_fallback_show---of 4
shrink_huge_zero_page_count---of 1
shrink_huge_zero_page_scan---of 10
single_hugepage_flag_show---of 1
single_hugepage_flag_store---of 7
split_deferred_show---of 4
split_failed_show---of 4
split_folio_to_list---of 4
split_huge_page_to_list_to_order---of 3
split_huge_pages_all---of 43
split_huge_pages_in_file---of 32
split_huge_pages_write---of 103
split_huge_pmd_address---of 3
split_huge_pmd_locked4%of 187
split_show---of 4
split_underused_thp_show---of 1
split_underused_thp_store---of 1
swpin_fallback_charge_show---of 4
swpin_fallback_show---of 4
swpin_show---of 4
swpout_fallback_show---of 4
swpout_show---of 4
sysfs_add_group---of 4
thp_get_unmapped_area---of 1
thp_get_unmapped_area_vmflags---of 10
thpsize_release---of 1
touch_pmd---of 3
trace_event_raw_event_hugepage_set---of 7
trace_event_raw_event_hugepage_update---of 7
trace_event_raw_event_migration_pmd---of 7
trace_raw_output_hugepage_set---of 3
trace_raw_output_hugepage_update---of 3
trace_raw_output_migration_pmd---of 3
uniform_split_supported---of 13
unmap_folio---of 7
unmap_huge_pmd_locked---of 63
use_zero_page_show---of 1
use_zero_page_store---of 7
vma_adjust_trans_huge50%of 18
vma_alloc_anon_folio_pmd---of 35
vma_end_read---of 8
vma_thp_gfp_mask---of 7
vmf_insert_folio_pmd---of 12
vmf_insert_pfn_pmd---of 9
zap_huge_pmd---of 50
zswpout_show---of 4
-----------
SUMMARY15%of 248

__arm64_sys_pidfd_getfd---of 13
__arm64_sys_pidfd_open---of 10
__change_pid---of 17
__task_pid_nr_ns---of 25
alloc_pid---of 37
attach_pid---of 8
change_pid---of 8
delayed_put_pid---of 1
detach_pid---of 1
disable_pid_allocation---of 1
exchange_tids---of 8
find_ge_pid---of 1
find_get_pid---of 19
find_get_task_by_vpid---of 19
find_pid_ns---of 1
find_task_by_pid_ns---of 13
find_task_by_vpid---of 3
find_vpid---of 3
free_pid---of 11
free_pids---of 9
get_pid_task---of 24
get_task_pid30%of 24
local_lock_release---of 7
pid_nr_ns---of 5
pid_table_root_lookup---of 3
pid_table_root_permissions---of 5
pid_table_root_set_ownership---of 5
pid_task---of 8
pid_vnr---of 7
pidfd_get_pid---of 11
pidfd_get_task---of 7
put_pid58%of 7
put_task_struct---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
register_pidns_sysctls---of 4
set_is_seen---of 3
task_active_pid_ns67%of 3
transfer_pid---of 8
unregister_pidns_sysctls---of 1
-----------
SUMMARY45%of 38

__armv8_pmuv3_map_event42%of 17
__armv8pmu_probe_pmu---of 10
arch_perf_update_userpage---of 7
armv8_a53_map_event---of 1
armv8_a57_map_event---of 1
armv8_a73_map_event---of 1
armv8_brcm_vulcan_pmu_init---of 1
armv8_cavium_thunder_pmu_init---of 1
armv8_cortex_a34_pmu_init---of 1
armv8_cortex_a35_pmu_init---of 1
armv8_cortex_a53_pmu_init---of 1
armv8_cortex_a55_pmu_init---of 1
armv8_cortex_a57_pmu_init---of 1
armv8_cortex_a65_pmu_init---of 1
armv8_cortex_a72_pmu_init---of 1
armv8_cortex_a73_pmu_init---of 1
armv8_cortex_a75_pmu_init---of 1
armv8_cortex_a76_pmu_init---of 1
armv8_cortex_a77_pmu_init---of 1
armv8_cortex_a78_pmu_init---of 1
armv8_cortex_x1_pmu_init---of 1
armv8_neoverse_e1_pmu_init---of 1
armv8_neoverse_n1_pmu_init---of 1
armv8_neoverse_v1_pmu_init---of 1
armv8_neoverse_v2_pmu_init---of 1
armv8_neoverse_v3_pmu_init---of 1
armv8_neoverse_v3ae_pmu_init---of 1
armv8_nvidia_carmel_pmu_init---of 1
armv8_nvidia_denver_pmu_init---of 1
armv8_pmu_device_probe---of 1
armv8_pmu_init---of 7
armv8_pmuv3_map_event100%of 1
armv8_pmuv3_pmu_init---of 1
armv8_rainier_pmu_init---of 1
armv8_samsung_mongoose_pmu_init---of 1
armv8_thunder_map_event---of 1
armv8_vulcan_map_event---of 1
armv8pmu_clear_event_idx38%of 8
armv8pmu_disable_event38%of 8
armv8pmu_disable_user_access_ipi---of 1
armv8pmu_enable_event40%of 15
armv8pmu_event_attr_is_visible---of 7
armv8pmu_events_sysfs_show---of 1
armv8pmu_get_event_idx28%of 37
armv8pmu_handle_irq---of 10
armv8pmu_proc_user_access_handler---of 4
armv8pmu_read_counter45%of 9
armv8pmu_read_evcntr22%of 33
armv8pmu_reset---of 1
armv8pmu_set_event_filter34%of 9
armv8pmu_start17%of 12
armv8pmu_stop100%of 1
armv8pmu_user_event_idx---of 4
armv8pmu_write_counter50%of 8
armv8pmu_write_evcntr22%of 33
armv8pmu_write_evtype22%of 33
armv9_cortex_a510_pmu_init---of 1
armv9_cortex_a520_pmu_init---of 1
armv9_cortex_a710_pmu_init---of 1
armv9_cortex_a715_pmu_init---of 1
armv9_cortex_a720_pmu_init---of 1
armv9_cortex_a725_pmu_init---of 1
armv9_cortex_x2_pmu_init---of 1
armv9_cortex_x3_pmu_init---of 1
armv9_cortex_x4_pmu_init---of 1
armv9_cortex_x925_pmu_init---of 1
armv9_neoverse_n2_pmu_init---of 1
armv9_neoverse_n3_pmu_init---of 1
bus_slots_show---of 1
bus_width_show---of 1
event_show---of 1
long_show---of 1
rdpmc_show---of 1
slots_show---of 1
threshold_compare_show---of 1
threshold_count_show---of 1
threshold_max_show---of 1
threshold_show---of 1
update_pmuserenr39%of 13
-----------
SUMMARY30%of 237

__devlink_port_type_set---of 27
devl_port_fn_devlink_set---of 6
devl_port_register_with_ops---of 10
devl_port_unregister---of 12
devlink_compat_phys_port_name_get12%of 27
devlink_compat_switch_id_get50%of 4
devlink_nl_port_attrs_put---of 29
devlink_nl_port_del_doit---of 3
devlink_nl_port_fill---of 25
devlink_nl_port_function_attrs_put---of 52
devlink_nl_port_get_doit---of 4
devlink_nl_port_get_dump_one---of 5
devlink_nl_port_get_dumpit---of 1
devlink_nl_port_handle_fill---of 3
devlink_nl_port_handle_size---of 3
devlink_nl_port_new_doit---of 18
devlink_nl_port_set_doit---of 76
devlink_nl_port_split_doit---of 13
devlink_nl_port_unsplit_doit---of 3
devlink_nl_put_handle---of 4
devlink_port_attrs_pci_pf_set---of 6
devlink_port_attrs_pci_sf_set---of 6
devlink_port_attrs_pci_vf_set---of 6
devlink_port_attrs_set---of 8
devlink_port_fini---of 3
devlink_port_get_by_index---of 1
devlink_port_get_from_attrs---of 3
devlink_port_get_from_info---of 3
devlink_port_init---of 3
devlink_port_linecard_set---of 3
devlink_port_netdevice_event10%of 21
devlink_port_notify---of 11
devlink_port_register_with_ops---of 1
devlink_port_rel_cleanup_cb---of 4
devlink_port_rel_notify_cb---of 3
devlink_port_type_clear---of 9
devlink_port_type_eth_set---of 7
devlink_port_type_ib_set---of 7
devlink_port_type_warn---of 1
devlink_port_unregister---of 1
devlink_ports_notify_register---of 4
devlink_ports_notify_unregister---of 4
nla_put_string---of 1
-----------
SUMMARY14%of 52

copy_from_sockptr---of 11
dev_put---of 5
isotp_bind---of 18
isotp_create_fframe---of 8
isotp_fill_dataframe---of 12
isotp_getname---of 3
isotp_getsockopt---of 27
isotp_init---of 3
isotp_notifier11%of 19
isotp_poll---of 6
isotp_rcv---of 97
isotp_rcv_echo---of 9
isotp_rcv_sf---of 26
isotp_rcv_skb---of 3
isotp_recvmsg---of 18
isotp_release---of 38
isotp_rx_timer_handler---of 4
isotp_send_cframe---of 28
isotp_send_fc---of 23
isotp_sendmsg---of 44
isotp_setsockopt---of 32
isotp_sock_no_ioctlcmd---of 1
isotp_tx_timer_handler---of 5
isotp_txfr_timer_handler---of 4
memcpy_from_msg---of 6
skb_put_zero---of 1
sock_error---of 3
-----------
SUMMARY11%of 19

-----------
SUMMARY---of 0

rcu_lock_acquire---of 2
rcu_lock_release---of 2
siw_destroy_cpulist---of 4
siw_device_cleanup---of 1
siw_get_base_qp---of 24
siw_get_tx_cpu---of 20
siw_netdev_event34%of 9
siw_newlink---of 16
siw_put_tx_cpu---of 3
-----------
SUMMARY34%of 9

__arm64_sys_userfaultfd---of 5
__wake_userfault---of 5
_inline_copy_from_user---of 8
dup_userfaultfd---of 20
dup_userfaultfd_complete---of 6
dup_userfaultfd_fail---of 9
handle_userfault---of 110
init_once_userfaultfd_ctx---of 1
mmget_not_zero---of 6
mremap_userfaultfd_complete---of 4
mremap_userfaultfd_prep---of 10
new_userfaultfd---of 8
rcu_lock_release---of 2
release_fault_lock---of 10
seqcount_lockdep_reader_access---of 5
userfaultfd_ctx_put---of 17
userfaultfd_ctx_read---of 57
userfaultfd_dev_ioctl---of 3
userfaultfd_event_wait_completion---of 26
userfaultfd_ioctl---of 305
userfaultfd_poll---of 8
userfaultfd_read_iter---of 14
userfaultfd_release---of 1
userfaultfd_remove---of 12
userfaultfd_show_fdinfo---of 6
userfaultfd_unmap_complete34%of 6
userfaultfd_unmap_prep10%of 20
userfaultfd_wake_function---of 7
userfaultfd_wp_async---of 3
userfaultfd_wp_unpopulated---of 3
vma_iter_set---of 5
-----------
SUMMARY16%of 26

evm_file_release63%of 8
evm_inode_alloc_security67%of 3
evm_inode_copy_up_xattr---of 4
evm_inode_init_security19%of 11
evm_inode_post_remove_acl---of 1
evm_inode_post_removexattr---of 13
evm_inode_post_set_acl---of 1
evm_inode_post_setattr20%of 10
evm_inode_post_setxattr---of 15
evm_inode_remove_acl---of 1
evm_inode_removexattr---of 3
evm_inode_set_acl---of 18
evm_inode_setattr8%of 26
evm_inode_setxattr---of 7
evm_metadata_changed---of 8
evm_post_path_mknod---of 4
evm_protect_xattr---of 42
evm_protected_xattr---of 1
evm_protected_xattr_common---of 12
evm_protected_xattr_if_enabled---of 1
evm_read_protected_xattrs---of 15
evm_revalidate_status---of 7
evm_verify_hmac---of 44
evm_verifyxattr---of 4
is_unsupported_hmac_fs---of 4
-----------
SUMMARY23%of 58

kvm_cpu_dirty_log_size100%of 1
kvm_dirty_ring_alloc67%of 3
kvm_dirty_ring_check_request10%of 20
kvm_dirty_ring_free100%of 1
kvm_dirty_ring_get_page---of 1
kvm_dirty_ring_get_rsvd_entries100%of 1
kvm_dirty_ring_push---of 19
kvm_dirty_ring_reset18%of 29
kvm_reset_dirty_gfn13%of 16
kvm_use_dirty_bitmap67%of 6
-----------
SUMMARY24%of 77

-----------
SUMMARY---of 0

ima_appraise_measurement---of 81
ima_check_blacklist---of 4
ima_get_cache_status---of 8
ima_get_hash_algo---of 14
ima_inode_post_setattr17%of 12
ima_inode_remove_acl---of 3
ima_inode_removexattr---of 6
ima_inode_set_acl---of 3
ima_inode_setxattr---of 16
ima_must_appraise---of 3
ima_read_xattr---of 1
ima_reset_appraise_flags---of 10
ima_update_xattr---of 10
is_ima_appraise_enabled---of 1
-----------
SUMMARY17%of 12

-----------
SUMMARY---of 0

__arm64_sys_swapoff---of 35
__arm64_sys_swapon---of 83
__folio_swap_cache_index---of 1
__folio_throttle_swaprate23%of 9
__swap_count---of 3
__swap_duplicate---of 21
__try_to_reclaim_swap---of 46
add_swap_count_continuation---of 21
add_swap_extent---of 8
add_to_avail_list---of 19
alloc_swap_scan_cluster---of 49
arch_max_swapfile_size---of 1
claim_swapfile---of 5
cluster_alloc_swap_entry---of 103
cluster_swap_free_nr---of 11
count_swap_pages---of 5
del_from_avail_list---of 15
destroy_swap_extents---of 7
discard_swap---of 8
drain_mmlist---of 10
enable_swap_info---of 17
find_first_swap---of 5
flush_percpu_swap_cluster---of 7
folio_alloc_swap---of 40
folio_free_swap---of 20
folio_large_mapcount---of 4
folio_order---of 4
free_cluster---of 11
free_swap_and_cache_nr---of 33
free_swap_count_continuations---of 10
generic_max_swapfile_size---of 1
get_swap_device---of 7
get_swap_device_info---of 24
get_swap_page_of_type---of 8
inode_drain_writes---of 1
kmap_local_folio---of 5
local_lock_acquire---of 6
local_lock_release---of 7
move_cluster---of 17
percpu_ref_put---of 18
pfn_valid---of 31
put_swap_folio---of 19
rcu_lock_acquire---of 2
rcu_lock_release---of 2
read_swap_header---of 16
reinsert_swap_info---of 17
relocate_cluster---of 11
setup_clusters---of 30
setup_swap_map_and_extents---of 30
si_swapinfo---of 6
swap_count_continued---of 41
swap_discard_work---of 1
swap_do_scheduled_discard---of 17
swap_duplicate---of 4
swap_entry_range_free---of 35
swap_entry_swapped---of 1
swap_folio_sector---of 9
swap_free_nr---of 10
swap_next---of 10
swap_page_trans_huge_swapped---of 4
swap_reclaim_full_clusters---of 25
swap_reclaim_work---of 1
swap_shmem_alloc---of 1
swap_show---of 3
swap_start---of 9
swap_stop---of 1
swap_type_of---of 9
swap_users_ref_free---of 1
swapcache_clear---of 6
swapcache_mapping---of 3
swapcache_prepare---of 1
swapdev_block---of 10
swaps_open---of 3
swaps_poll---of 6
swp_swap_info---of 3
swp_swapcount---of 14
try_to_unuse---of 221
wait_for_allocation---of 5
-----------
SUMMARY23%of 9

__arm64_sys_arm64_personality---of 6
__arm64_sys_mmap100%of 3
__arm64_sys_ni_syscall---of 1
-----------
SUMMARY100%of 3

__dev_flush---of 4
__dev_map_alloc_node---of 14
__dev_map_entry_free---of 8
bq_enqueue---of 8
bq_xmit_all---of 40
dev_hash_map_redirect---of 11
dev_map_alloc---of 15
dev_map_alloc_check---of 8
dev_map_delete_elem---of 5
dev_map_enqueue---of 10
dev_map_enqueue_multi---of 49
dev_map_free---of 29
dev_map_generic_redirect---of 11
dev_map_get_next_key---of 5
dev_map_hash_delete_elem---of 7
dev_map_hash_get_next_key---of 11
dev_map_hash_lookup_elem---of 5
dev_map_hash_update_elem---of 18
dev_map_lookup_elem---of 8
dev_map_mem_usage---of 1
dev_map_notification5%of 45
dev_map_redirect---of 16
dev_map_redirect_multi---of 35
dev_map_update_elem---of 12
dev_xdp_enqueue---of 10
rcu_lock_acquire---of 2
rcu_lock_release---of 2
trace_xdp_exception---of 14
-----------
SUMMARY5%of 45

-----------
SUMMARY---of 0

__arm64_sys_getgroups---of 12
__arm64_sys_setgroups---of 27
gid_cmp---of 1
groups_alloc---of 3
groups_free---of 1
groups_search---of 6
groups_sort---of 1
in_egroup_p---of 7
in_group_p29%of 7
may_setgroups---of 3
set_current_groups---of 4
set_groups---of 11
-----------
SUMMARY29%of 7

__rdma_create_id---of 9
__rdma_create_kernel_id---of 3
_cma_attach_to_dev---of 21
_cma_cancel_listens---of 13
_destroy_id---of 64
addr_handler---of 16
cma_acquire_dev_by_src_ip---of 15
cma_add_one---of 31
cma_addr_cmp---of 7
cma_alloc_port---of 20
cma_check_port---of 35
cma_cm_event_handler---of 17
cma_dev_get---of 6
cma_dev_put---of 6
cma_enum_devices_by_ibdev---of 9
cma_exit_net---of 9
cma_get_default_gid_type---of 6
cma_get_default_roce_tos---of 6
cma_get_ib_dev---of 1
cma_ib_handler---of 30
cma_ib_mc_handler---of 7
cma_ib_req_handler---of 250
cma_iboe_join_work_handler---of 7
cma_iboe_set_mgid---of 10
cma_iboe_set_path_rec_l2_fields---of 13
cma_igmp_send---of 9
cma_init_net---of 1
cma_iw_handler---of 12
cma_listen_handler---of 3
cma_listen_on_dev---of 16
cma_make_mc_event---of 11
cma_modify_qp_rtr---of 8
cma_netdev_callback11%of 19
cma_netevent_callback---of 25
cma_netevent_work_handler---of 14
cma_pernet---of 16
cma_process_remove---of 49
cma_query_handler---of 16
cma_release_dev---of 10
cma_remove_one---of 16
cma_rep_recv---of 26
cma_save_ib_info---of 7
cma_save_net_info---of 17
cma_send_sidr_rep---of 22
cma_set_default_gid_type---of 9
cma_set_default_roce_tos---of 6
cma_sidr_rep_handler---of 15
cma_translate_addr---of 3
cma_validate_port---of 45
cma_work_handler---of 25
compare_netdev_and_ip---of 6
destroy_id_handler_unlock---of 4
destroy_mc---of 12
enqueue_resolve_addr_work---of 6
get_lower_vlan_dev_tc---of 8
iw_conn_req_handler---of 22
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rdma_accept---of 36
rdma_accept_ece---of 1
rdma_addr_set_sgid---of 1
rdma_bind_addr---of 1
rdma_bind_addr_dst---of 180
rdma_connect---of 1
rdma_connect_ece---of 1
rdma_connect_locked---of 91
rdma_consumer_reject_data---of 6
rdma_create_qp---of 11
rdma_create_user_id---of 3
rdma_destroy_id---of 1
rdma_destroy_qp---of 14
rdma_disconnect---of 47
rdma_event_msg---of 3
rdma_get_service_id---of 5
rdma_init_qp_attr---of 24
rdma_iw_cm_id---of 3
rdma_join_multicast---of 71
rdma_leave_multicast---of 7
rdma_listen---of 32
rdma_lock_handler---of 1
rdma_notify---of 4
rdma_read_gids---of 17
rdma_reject---of 6
rdma_reject_msg---of 4
rdma_res_to_id---of 1
rdma_resolve_addr---of 66
rdma_resolve_route---of 95
rdma_set_ack_timeout---of 4
rdma_set_afonly---of 4
rdma_set_ib_path---of 11
rdma_set_min_rnr_timer---of 4
rdma_set_reuseaddr---of 6
rdma_set_service_type---of 1
rdma_unlock_handler---of 1
trace_cm_event_handler---of 14
trace_cm_id_destroy---of 14
trace_cm_qp_create---of 14
trace_cm_send_mra---of 14
trace_cm_send_rej---of 14
-----------
SUMMARY11%of 19

__dev_printk---of 8
__device_link_del---of 13
__device_links_no_driver---of 14
__device_links_queue_sync_state---of 18
__fw_devlink_link_to_consumers---of 31
__fw_devlink_link_to_suppliers---of 14
__fw_devlink_pickup_dangling_consumers---of 16
__fw_devlink_relax_cycles---of 53
__fwnode_link_add---of 10
__root_device_register---of 9
_dev_alert---of 1
_dev_crit---of 1
_dev_emerg---of 1
_dev_err---of 1
_dev_info---of 1
_dev_notice---of 1
_dev_printk---of 1
_dev_warn---of 1
auto_remove_on_show---of 1
class_dir_child_ns_type100%of 1
class_dir_release---of 1
cleanup_glue_dir---of 11
dev_attr_show---of 4
dev_attr_store---of 3
dev_driver_string---of 5
dev_err_probe---of 5
dev_printk_emit---of 1
dev_set_name---of 1
dev_show---of 1
dev_uevent28%of 33
dev_uevent_filter40%of 5
dev_uevent_name50%of 4
dev_vprintk_emit---of 10
dev_warn_probe---of 5
device_add---of 58
device_add_attrs---of 47
device_add_class_symlinks---of 14
device_add_groups---of 1
device_add_of_node---of 6
device_change_owner---of 22
device_check_offline---of 11
device_create---of 1
device_create_bin_file---of 3
device_create_file---of 8
device_create_groups_vargs---of 6
device_create_release---of 4
device_create_sys_dev_entry---of 3
device_create_with_groups---of 1
device_del---of 37
device_destroy---of 5
device_find_child---of 7
device_for_each_child---of 7
device_for_each_child_reverse---of 7
device_for_each_child_reverse_from---of 9
device_get_devnode25%of 16
device_get_ownership---of 4
device_initialize---of 3
device_is_dependent---of 17
device_link_add---of 60
device_link_del---of 1
device_link_init_status---of 8
device_link_put_kref---of 8
device_link_release_fn---of 11
device_link_remove---of 5
device_link_wait_removal---of 1
device_links_busy---of 7
device_links_check_suppliers---of 35
device_links_driver_bound---of 63
device_links_driver_cleanup---of 18
device_links_flush_sync_list---of 16
device_links_force_bind---of 11
device_links_no_driver---of 7
device_links_read_lock---of 1
device_links_read_lock_held---of 3
device_links_read_unlock---of 3
device_links_supplier_sync_state_pause---of 1
device_links_supplier_sync_state_resume---of 8
device_links_unbind_consumers---of 11
device_match_acpi_dev---of 3
device_match_acpi_handle---of 4
device_match_any---of 1
device_match_devt---of 1
device_match_fwnode---of 3
device_match_name---of 3
device_match_of_node---of 3
device_match_type---of 1
device_move---of 36
device_namespace---of 4
device_offline---of 14
device_online---of 7
device_pm_move_to_tail---of 3
device_register---of 1
device_release---of 10
device_remove_attrs---of 9
device_remove_bin_file---of 3
device_remove_class_symlinks---of 10
device_remove_file---of 3
device_remove_file_self---of 3
device_remove_groups---of 1
device_remove_of_node---of 6
device_rename---of 13
device_reorder_to_tail---of 16
device_set_node---of 3
device_set_of_node_from_dev---of 1
device_show_bool---of 1
device_show_int---of 1
device_show_string---of 1
device_show_ulong---of 1
device_shutdown---of 32
device_store_bool---of 1
device_store_int---of 4
device_store_ulong---of 3
device_unregister---of 6
devices_kset_move_after---of 11
devices_kset_move_before---of 11
devices_kset_move_last---of 9
devlink_add_symlinks---of 22
devlink_dev_release---of 1
devlink_remove_symlinks---of 20
devm_attr_group_remove---of 3
devm_device_add_group---of 4
fw_devlink_create_devlink---of 58
fw_devlink_dev_sync_state---of 15
fw_devlink_drivers_done---of 1
fw_devlink_is_strict---of 3
fw_devlink_link_device---of 3
fw_devlink_no_driver---of 6
fw_devlink_parse_fwtree---of 9
fw_devlink_probing_done---of 1
fw_devlink_purge_absent_suppliers---of 5
fw_devlink_unblock_consumers---of 10
fwnode_link_add---of 1
fwnode_links_purge---of 1
fwnode_links_purge_consumers---of 10
fwnode_links_purge_suppliers---of 10
get_device---of 3
get_device_parent---of 17
kill_device---of 6
klist_children_get---of 3
klist_children_put---of 3
kref_get---of 6
list_add_tail_rcu---of 3
lock_device_hotplug---of 1
lock_device_hotplug_sysfs---of 4
online_show---of 1
online_store---of 11
pm_runtime_put_noidle---of 6
put_device---of 3
refcount_inc---of 6
removable_show---of 1
root_device_release---of 1
root_device_unregister---of 8
runtime_pm_show---of 1
set_primary_fwnode---of 12
set_secondary_fwnode---of 6
srcu_lock_acquire---of 2
srcu_lock_release---of 2
status_show---of 3
sync_state_only_show---of 1
sync_state_resume_initcall---of 1
uevent_show---of 13
uevent_store---of 3
unlock_device_hotplug---of 1
virtual_device_parent---of 3
waiting_for_supplier_show---of 6
-----------
SUMMARY31%of 59

__fscrypt_encrypt_symlink---of 11
__fscrypt_prepare_link---of 3
__fscrypt_prepare_lookup---of 8
__fscrypt_prepare_readdir---of 1
__fscrypt_prepare_rename---of 11
__fscrypt_prepare_setattr---of 5
fscrypt_file_open---of 26
fscrypt_get_symlink---of 14
fscrypt_prepare_lookup_partial---of 7
fscrypt_prepare_setflags25%of 8
fscrypt_prepare_symlink---of 5
fscrypt_symlink_getattr---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY25%of 8

-----------
SUMMARY---of 0

__arm64_sys_memfd_create---of 40
folio_large_mapcount---of 4
mapping_allow_writable---of 3
memfd_alloc_folio---of 9
memfd_check_seals_mmap50%of 12
memfd_fcntl---of 29
memfd_folio_has_extra_refs---of 7
memfd_wait_for_pins---of 48
-----------
SUMMARY50%of 12

extract_bytes100%of 1
update_64bit_reg100%of 1
vgic_has_its67%of 3
vgic_lpis_enabled---of 1
vgic_mmio_read_irouter75%of 4
vgic_mmio_read_pendbase---of 1
vgic_mmio_read_propbase---of 1
vgic_mmio_read_sync---of 1
vgic_mmio_read_v3_idregs---of 1
vgic_mmio_read_v3_misc50%of 8
vgic_mmio_read_v3r_ctlr100%of 1
vgic_mmio_read_v3r_iidr100%of 1
vgic_mmio_read_v3r_typer34%of 18
vgic_mmio_uaccess_write_v3_misc58%of 7
vgic_mmio_write_invall29%of 7
vgic_mmio_write_invlpi---of 10
vgic_mmio_write_irouter50%of 4
vgic_mmio_write_pendbase67%of 6
vgic_mmio_write_propbase67%of 6
vgic_mmio_write_v3_misc63%of 16
vgic_mmio_write_v3r_ctlr64%of 11
vgic_register_redist_iodev45%of 9
vgic_sanitise_field100%of 1
vgic_sanitise_inner_cacheability100%of 1
vgic_sanitise_outer_cacheability100%of 1
vgic_sanitise_shareability100%of 1
vgic_supports_direct_msis40%of 5
vgic_unregister_redist_iodev100%of 1
vgic_v3_dispatch_sgi50%of 12
vgic_v3_dist_uaccess100%of 1
vgic_v3_free_redist_region50%of 12
vgic_v3_has_attr_regs88%of 8
vgic_v3_init_dist_iodev100%of 1
vgic_v3_line_level_info_uaccess100%of 4
vgic_v3_queue_sgi34%of 6
vgic_v3_redist_uaccess100%of 1
vgic_v3_set_redist_base52%of 27
vgic_v3_uaccess_write_pending---of 3
-----------
SUMMARY57%of 185

-----------
SUMMARY---of 0

__do_kernel_fault6%of 35
__ptep_set_access_flags20%of 10
data_abort_decode---of 3
do_alignment_fault---of 1
do_bad---of 1
do_bad_area38%of 8
do_debug_exception---of 15
do_mem_abort50%of 4
do_page_fault31%of 117
do_sea---of 4
do_sp_pc_abort---of 1
do_tag_check_fault---of 1
do_translation_fault80%of 5
fault_signal_pending38%of 8
mem_abort_decode---of 3
pte_unmap---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
show_pte---of 36
tag_clear_highpage---of 50
vma_alloc_zeroed_movable_folio100%of 1
vma_end_read63%of 8
-----------
SUMMARY30%of 196

-----------
SUMMARY---of 0

can_stop_idle_tick---of 15
get_cpu_idle_time_us---of 1
get_cpu_iowait_time_us---of 1
get_cpu_sleep_time_us---of 11
get_jiffies_update---of 6
seqcount_lockdep_reader_access---of 5
tick_check_oneshot_change---of 11
tick_clock_notify---of 6
tick_do_update_jiffies64---of 8
tick_get_tick_sched---of 1
tick_irq_enter---of 9
tick_nohz_get_idle_calls_cpu---of 1
tick_nohz_get_next_hrtimer---of 1
tick_nohz_get_sleep_length---of 6
tick_nohz_handler---of 15
tick_nohz_idle_enter---of 25
tick_nohz_idle_exit---of 26
tick_nohz_idle_got_tick---of 3
tick_nohz_idle_restart_tick---of 7
tick_nohz_idle_retain_tick---of 1
tick_nohz_idle_stop_tick---of 64
tick_nohz_irq_exit---of 3
tick_nohz_lowres_handler---of 3
tick_nohz_next_event---of 20
tick_nohz_restart_sched_tick---of 13
tick_nohz_start_idle---of 22
tick_nohz_stop_idle---of 23
tick_nohz_tick_stopped100%of 1
tick_nohz_tick_stopped_cpu---of 1
tick_oneshot_notify---of 3
tick_sched_timer_dying---of 3
tick_setup_sched_timer---of 37
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__gfn_to_page---of 17
__kvm_faultin_pfn34%of 9
__kvm_gfn_to_hva_cache_init---of 19
__kvm_io_bus_write58%of 19
__kvm_memslots23%of 9
__kvm_mmu_topup_memory_cache35%of 20
__kvm_read_guest_page38%of 8
__kvm_vcpu_map---of 23
__kvm_write_guest_page60%of 10
__probestub_kvm_ack_irq---of 1
__probestub_kvm_age_hva---of 1
__probestub_kvm_dirty_ring_exit---of 1
__probestub_kvm_dirty_ring_push---of 1
__probestub_kvm_dirty_ring_reset---of 1
__probestub_kvm_fpu---of 1
__probestub_kvm_halt_poll_ns---of 1
__probestub_kvm_iocsr---of 1
__probestub_kvm_mmio---of 1
__probestub_kvm_set_irq---of 1
__probestub_kvm_test_age_hva---of 1
__probestub_kvm_unmap_hva_range---of 1
__probestub_kvm_userspace_exit---of 1
__probestub_kvm_vcpu_wakeup---of 1
__traceiter_kvm_ack_irq---of 4
__traceiter_kvm_age_hva---of 4
__traceiter_kvm_dirty_ring_exit---of 4
__traceiter_kvm_dirty_ring_push---of 4
__traceiter_kvm_dirty_ring_reset---of 4
__traceiter_kvm_fpu---of 4
__traceiter_kvm_halt_poll_ns---of 4
__traceiter_kvm_iocsr---of 4
__traceiter_kvm_mmio---of 4
__traceiter_kvm_set_irq---of 4
__traceiter_kvm_test_age_hva---of 4
__traceiter_kvm_unmap_hva_range---of 4
__traceiter_kvm_userspace_exit---of 4
__traceiter_kvm_vcpu_wakeup---of 4
_inline_copy_from_user63%of 8
_inline_copy_to_user43%of 7
access_ok50%of 4
ack_kick---of 1
create_vcpu_fd100%of 1
file_is_kvm---of 3
gfn_to_hva80%of 15
gfn_to_hva_memslot---of 5
gfn_to_hva_memslot_prot80%of 5
gfn_to_hva_prot---of 15
gfn_to_memslot92%of 12
hva_to_pfn38%of 59
id_to_memslot100%of 5
kvm_arch_disable_virtualization---of 1
kvm_arch_dy_has_pending_interrupt---of 1
kvm_arch_dy_runnable---of 1
kvm_arch_enable_virtualization---of 1
kvm_arch_guest_memory_reclaimed100%of 1
kvm_arch_pre_destroy_vm100%of 1
kvm_arch_vcpu_preempted_in_kernel---of 1
kvm_are_all_memslots_empty25%of 12
kvm_check_memslot_overlap87%of 15
kvm_clear_guest---of 10
kvm_clear_stat_per_vcpu---of 5
kvm_create_vm_debugfs30%of 20
kvm_debugfs_release---of 1
kvm_destroy_vcpus56%of 9
kvm_dev_ioctl36%of 48
kvm_device_from_filp---of 3
kvm_device_ioctl72%of 14
kvm_device_mmap67%of 3
kvm_device_release75%of 4
kvm_disable_virtualization---of 3
kvm_disable_virtualization_cpu---of 3
kvm_enable_virtualization40%of 5
kvm_exit---of 7
kvm_flush_remote_tlbs---of 4
kvm_flush_remote_tlbs_memslot38%of 8
kvm_flush_remote_tlbs_range---of 5
kvm_get_kvm50%of 6
kvm_get_kvm_safe---of 7
kvm_get_running_vcpu67%of 3
kvm_get_running_vcpus---of 1
kvm_get_stat_per_vcpu---of 5
kvm_get_vcpu---of 3
kvm_get_vcpu_by_id80%of 10
kvm_gfn_to_hva_cache_init---of 1
kvm_guest_get_ip---of 5
kvm_guest_state---of 5
kvm_host_page_size---of 12
kvm_init---of 22
kvm_io_bus_get_dev45%of 18
kvm_io_bus_read50%of 24
kvm_io_bus_register_dev47%of 26
kvm_io_bus_sort_cmp100%of 4
kvm_io_bus_unregister_dev34%of 27
kvm_io_bus_write34%of 9
kvm_io_bus_write_cookie---of 15
kvm_ioctl_create_device35%of 26
kvm_is_visible_gfn72%of 14
kvm_make_all_cpus_request59%of 12
kvm_make_vcpu_request39%of 13
kvm_make_vcpus_request_mask---of 14
kvm_memslots23%of 9
kvm_mmu_free_memory_cache84%of 6
kvm_mmu_invalidate_begin---of 6
kvm_mmu_invalidate_end---of 9
kvm_mmu_invalidate_range_add---of 8
kvm_mmu_memory_cache_alloc60%of 5
kvm_mmu_memory_cache_nr_free_objects---of 1
kvm_mmu_notifier_clear_flush_young---of 20
kvm_mmu_notifier_clear_young---of 16
kvm_mmu_notifier_invalidate_range_end44%of 25
kvm_mmu_notifier_invalidate_range_start42%of 51
kvm_mmu_notifier_release67%of 3
kvm_mmu_notifier_test_young---of 32
kvm_mmu_topup_memory_cache100%of 1
kvm_mmu_unmap_gfn_range---of 8
kvm_no_compat_ioctl---of 1
kvm_no_compat_open100%of 1
kvm_offline_cpu---of 3
kvm_online_cpu---of 4
kvm_prefetch_pages---of 6
kvm_put_kvm53%of 63
kvm_put_kvm_no_destroy---of 6
kvm_read_guest43%of 7
kvm_read_guest_cached---of 1
kvm_read_guest_offset_cached---of 10
kvm_read_guest_page92%of 12
kvm_register_device_ops---of 4
kvm_register_perf_callbacks---of 1
kvm_release_page_clean47%of 13
kvm_release_page_dirty67%of 3
kvm_replace_memslot80%of 25
kvm_resume---of 17
kvm_sched_in100%of 1
kvm_sched_out---of 5
kvm_set_internal_memslot---of 4
kvm_set_memory_region63%of 37
kvm_set_memslot69%of 57
kvm_set_page_dirty37%of 11
kvm_shutdown---of 1
kvm_sigset_activate100%of 3
kvm_sigset_deactivate100%of 3
kvm_stat_data_clear---of 5
kvm_stat_data_get---of 4
kvm_stat_data_open---of 4
kvm_suspend---of 16
kvm_swap_active_memslots75%of 8
kvm_uevent_notify_change64%of 11
kvm_uninit_virtualization---of 4
kvm_unregister_device_ops---of 3
kvm_unregister_perf_callbacks---of 1
kvm_vcpu_block50%of 12
kvm_vcpu_check_block40%of 10
kvm_vcpu_fault59%of 12
kvm_vcpu_gfn_to_hva---of 5
kvm_vcpu_gfn_to_hva_prot---of 5
kvm_vcpu_gfn_to_memslot---of 22
kvm_vcpu_halt30%of 41
kvm_vcpu_ioctl64%of 44
kvm_vcpu_ioctl_get_stats_fd38%of 8
kvm_vcpu_is_visible_gfn---of 4
kvm_vcpu_kick20%of 25
kvm_vcpu_mark_page_dirty---of 1
kvm_vcpu_mmap56%of 9
kvm_vcpu_on_spin---of 23
kvm_vcpu_read_guest---of 7
kvm_vcpu_read_guest_atomic---of 7
kvm_vcpu_read_guest_page---of 1
kvm_vcpu_release100%of 1
kvm_vcpu_stats_read---of 1
kvm_vcpu_stats_release100%of 1
kvm_vcpu_unmap---of 12
kvm_vcpu_wake_up---of 3
kvm_vcpu_write_guest---of 7
kvm_vcpu_write_guest_page---of 1
kvm_vcpu_yield_to---of 5
kvm_vm_ioctl93%of 51
kvm_vm_ioctl_check_extension_generic100%of 24
kvm_vm_ioctl_clear_dirty_log65%of 31
kvm_vm_ioctl_create_vcpu43%of 19
kvm_vm_ioctl_enable_cap_generic96%of 21
kvm_vm_ioctl_get_dirty_log33%of 37
kvm_vm_ioctl_get_stats_fd38%of 8
kvm_vm_ioctl_reset_dirty_pages56%of 9
kvm_vm_ioctl_set_memory_region100%of 3
kvm_vm_release100%of 1
kvm_vm_stats_read---of 1
kvm_vm_stats_release---of 1
kvm_write_guest43%of 7
kvm_write_guest_cached---of 1
kvm_write_guest_offset_cached---of 12
kvm_write_guest_page84%of 12
mark_page_dirty17%of 12
mark_page_dirty_in_slot47%of 13
mmu_memory_cache_alloc_obj---of 5
perf_trace_kvm_ack_irq---of 6
perf_trace_kvm_age_hva---of 6
perf_trace_kvm_dirty_ring_exit---of 6
perf_trace_kvm_dirty_ring_push---of 6
perf_trace_kvm_dirty_ring_reset---of 6
perf_trace_kvm_fpu---of 6
perf_trace_kvm_halt_poll_ns---of 6
perf_trace_kvm_iocsr---of 7
perf_trace_kvm_mmio---of 7
perf_trace_kvm_set_irq---of 6
perf_trace_kvm_test_age_hva---of 6
perf_trace_kvm_unmap_hva_range---of 6
perf_trace_kvm_userspace_exit---of 6
perf_trace_kvm_vcpu_wakeup---of 6
pfn_valid23%of 31
put_task_struct---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
trace_event_raw_event_kvm_ack_irq---of 7
trace_event_raw_event_kvm_age_hva---of 7
trace_event_raw_event_kvm_dirty_ring_exit---of 7
trace_event_raw_event_kvm_dirty_ring_push---of 7
trace_event_raw_event_kvm_dirty_ring_reset---of 7
trace_event_raw_event_kvm_fpu---of 7
trace_event_raw_event_kvm_halt_poll_ns---of 7
trace_event_raw_event_kvm_iocsr---of 8
trace_event_raw_event_kvm_mmio---of 8
trace_event_raw_event_kvm_set_irq---of 7
trace_event_raw_event_kvm_test_age_hva---of 7
trace_event_raw_event_kvm_unmap_hva_range---of 7
trace_event_raw_event_kvm_userspace_exit---of 7
trace_event_raw_event_kvm_vcpu_wakeup---of 7
trace_kvm_age_hva---of 17
trace_kvm_halt_poll_ns24%of 17
trace_kvm_userspace_exit24%of 17
trace_raw_output_kvm_ack_irq---of 3
trace_raw_output_kvm_age_hva---of 3
trace_raw_output_kvm_dirty_ring_exit---of 3
trace_raw_output_kvm_dirty_ring_push---of 3
trace_raw_output_kvm_dirty_ring_reset---of 3
trace_raw_output_kvm_fpu---of 3
trace_raw_output_kvm_halt_poll_ns---of 3
trace_raw_output_kvm_iocsr---of 3
trace_raw_output_kvm_mmio---of 3
trace_raw_output_kvm_set_irq---of 3
trace_raw_output_kvm_test_age_hva---of 3
trace_raw_output_kvm_unmap_hva_range---of 3
trace_raw_output_kvm_userspace_exit---of 5
trace_raw_output_kvm_vcpu_wakeup---of 3
vcpu_load100%of 3
vcpu_put100%of 3
vcpu_stat_clear---of 5
vcpu_stat_fops_open---of 1
vcpu_stat_get---of 4
vcpu_stat_readonly_fops_open---of 1
vm_stat_clear---of 5
vm_stat_fops_open---of 1
vm_stat_get---of 4
vm_stat_readonly_fops_open---of 1
xa_insert100%of 1
-----------
SUMMARY54%of 1369

-----------
SUMMARY---of 0

__inode_attach_wb12%of 36
__mark_inode_dirty19%of 90
__probestub_balance_dirty_pages---of 1
__probestub_bdi_dirty_ratelimit---of 1
__probestub_flush_foreign---of 1
__probestub_folio_wait_writeback---of 1
__probestub_global_dirty_state---of 1
__probestub_inode_foreign_history---of 1
__probestub_inode_switch_wbs---of 1
__probestub_sb_clear_inode_writeback---of 1
__probestub_sb_mark_inode_writeback---of 1
__probestub_track_foreign_dirty---of 1
__probestub_wbc_writepage---of 1
__probestub_writeback_bdi_register---of 1
__probestub_writeback_dirty_folio---of 1
__probestub_writeback_dirty_inode---of 1
__probestub_writeback_dirty_inode_enqueue---of 1
__probestub_writeback_dirty_inode_start---of 1
__probestub_writeback_exec---of 1
__probestub_writeback_lazytime---of 1
__probestub_writeback_lazytime_iput---of 1
__probestub_writeback_mark_inode_dirty---of 1
__probestub_writeback_pages_written---of 1
__probestub_writeback_queue---of 1
__probestub_writeback_queue_io---of 1
__probestub_writeback_sb_inodes_requeue---of 1
__probestub_writeback_single_inode---of 1
__probestub_writeback_single_inode_start---of 1
__probestub_writeback_start---of 1
__probestub_writeback_wait---of 1
__probestub_writeback_wake_background---of 1
__probestub_writeback_write_inode---of 1
__probestub_writeback_write_inode_start---of 1
__probestub_writeback_written---of 1
__traceiter_balance_dirty_pages---of 4
__traceiter_bdi_dirty_ratelimit---of 4
__traceiter_flush_foreign---of 4
__traceiter_folio_wait_writeback---of 4
__traceiter_global_dirty_state---of 4
__traceiter_inode_foreign_history---of 4
__traceiter_inode_switch_wbs---of 4
__traceiter_sb_clear_inode_writeback---of 4
__traceiter_sb_mark_inode_writeback---of 4
__traceiter_track_foreign_dirty---of 4
__traceiter_wbc_writepage---of 4
__traceiter_writeback_bdi_register---of 4
__traceiter_writeback_dirty_folio---of 4
__traceiter_writeback_dirty_inode---of 4
__traceiter_writeback_dirty_inode_enqueue---of 4
__traceiter_writeback_dirty_inode_start---of 4
__traceiter_writeback_exec---of 4
__traceiter_writeback_lazytime---of 4
__traceiter_writeback_lazytime_iput---of 4
__traceiter_writeback_mark_inode_dirty---of 4
__traceiter_writeback_pages_written---of 4
__traceiter_writeback_queue---of 4
__traceiter_writeback_queue_io---of 4
__traceiter_writeback_sb_inodes_requeue---of 4
__traceiter_writeback_single_inode---of 4
__traceiter_writeback_single_inode_start---of 4
__traceiter_writeback_start---of 4
__traceiter_writeback_wait---of 4
__traceiter_writeback_wake_background---of 4
__traceiter_writeback_write_inode---of 4
__traceiter_writeback_write_inode_start---of 4
__traceiter_writeback_written---of 4
__wakeup_flusher_threads_bdi---of 12
__writeback_inodes_sb_nr---of 4
__writeback_inodes_wb---of 9
__writeback_single_inode---of 97
bdi_split_work_to_wbs---of 41
cgroup_writeback_by_id---of 25
cgroup_writeback_umount---of 4
cleanup_offline_cgwb---of 26
dirtytime_interval_handler---of 3
folio_memcg---of 10
inode_cgwb_move_to_attached---of 11
inode_io_list_del---of 3
inode_io_list_move_locked---of 17
inode_prepare_wbs_switch---of 13
inode_sleep_on_writeback---of 4
inode_switch_wbs---of 32
inode_switch_wbs_work_fn---of 62
inode_wait_for_writeback34%of 6
locked_inode_to_wb_and_lock_list24%of 13
move_expired_inodes---of 22
percpu_ref_put_many---of 18
percpu_ref_tryget---of 19
perf_trace_balance_dirty_pages---of 9
perf_trace_bdi_dirty_ratelimit---of 7
perf_trace_flush_foreign---of 7
perf_trace_global_dirty_state---of 6
perf_trace_inode_foreign_history---of 9
perf_trace_inode_switch_wbs---of 7
perf_trace_track_foreign_dirty---of 11
perf_trace_wbc_class---of 9
perf_trace_writeback_bdi_register---of 7
perf_trace_writeback_class---of 7
perf_trace_writeback_dirty_inode_template---of 7
perf_trace_writeback_folio_template---of 12
perf_trace_writeback_inode_template---of 6
perf_trace_writeback_pages_written---of 6
perf_trace_writeback_queue_io---of 7
perf_trace_writeback_sb_inodes_requeue---of 13
perf_trace_writeback_single_inode_template---of 9
perf_trace_writeback_work_class---of 9
perf_trace_writeback_write_inode_template---of 9
queue_io---of 30
rcu_lock_acquire---of 2
rcu_lock_release---of 2
redirty_tail_locked---of 7
sb_clear_inode_writeback---of 21
sb_mark_inode_writeback---of 21
sync_inode_metadata---of 1
sync_inodes_sb---of 39
trace_event_raw_event_balance_dirty_pages---of 10
trace_event_raw_event_bdi_dirty_ratelimit---of 8
trace_event_raw_event_flush_foreign---of 8
trace_event_raw_event_global_dirty_state---of 7
trace_event_raw_event_inode_foreign_history---of 10
trace_event_raw_event_inode_switch_wbs---of 8
trace_event_raw_event_track_foreign_dirty---of 12
trace_event_raw_event_wbc_class---of 10
trace_event_raw_event_writeback_bdi_register---of 8
trace_event_raw_event_writeback_class---of 8
trace_event_raw_event_writeback_dirty_inode_template---of 8
trace_event_raw_event_writeback_folio_template---of 13
trace_event_raw_event_writeback_inode_template---of 7
trace_event_raw_event_writeback_pages_written---of 7
trace_event_raw_event_writeback_queue_io---of 8
trace_event_raw_event_writeback_sb_inodes_requeue---of 14
trace_event_raw_event_writeback_single_inode_template---of 10
trace_event_raw_event_writeback_work_class---of 10
trace_event_raw_event_writeback_write_inode_template---of 10
trace_raw_output_balance_dirty_pages---of 3
trace_raw_output_bdi_dirty_ratelimit---of 3
trace_raw_output_flush_foreign---of 3
trace_raw_output_global_dirty_state---of 3
trace_raw_output_inode_foreign_history---of 3
trace_raw_output_inode_switch_wbs---of 3
trace_raw_output_track_foreign_dirty---of 3
trace_raw_output_wbc_class---of 3
trace_raw_output_writeback_bdi_register---of 3
trace_raw_output_writeback_class---of 3
trace_raw_output_writeback_dirty_inode_template---of 3
trace_raw_output_writeback_folio_template---of 3
trace_raw_output_writeback_inode_template---of 3
trace_raw_output_writeback_pages_written---of 3
trace_raw_output_writeback_queue_io---of 3
trace_raw_output_writeback_sb_inodes_requeue---of 3
trace_raw_output_writeback_single_inode_template---of 3
trace_raw_output_writeback_work_class---of 3
trace_raw_output_writeback_write_inode_template---of 3
trace_writeback_pages_written---of 17
try_to_writeback_inodes_sb---of 3
wakeup_dirtytime_writeback---of 20
wakeup_flusher_threads---of 16
wakeup_flusher_threads_bdi---of 11
wb_get12%of 17
wb_io_lists_depopulated---of 11
wb_queue_work---of 29
wb_start_background_writeback---of 19
wb_wait_for_completion---of 7
wb_wakeup_delayed---of 3
wb_workfn---of 58
wb_writeback---of 65
wbc_account_cgroup_owner---of 9
wbc_attach_and_unlock_inode---of 14
wbc_attach_fdatawrite_inode---of 3
wbc_detach_inode---of 28
write_inode_now---of 3
writeback_inodes_sb---of 1
writeback_inodes_sb_nr---of 1
writeback_inodes_wb---of 3
writeback_sb_inodes---of 54
writeback_single_inode---of 22
xas_next_marked---of 13
-----------
SUMMARY18%of 162

-----------
SUMMARY---of 0

__arm64_sys_fsmount---of 25
__arm64_sys_listmount---of 21
__arm64_sys_mount---of 23
__arm64_sys_mount_setattr---of 15
__arm64_sys_move_mount---of 27
__arm64_sys_open_tree---of 5
__arm64_sys_open_tree_attr---of 19
__arm64_sys_pivot_root---of 32
__arm64_sys_statmount---of 115
__arm64_sys_umount---of 4
__cleanup_mnt---of 1
__detach_mounts---of 13
__do_loopback---of 18
__is_local_mountpoint---of 4
__legitimize_mnt50%of 10
__lookup_mnt29%of 7
__mnt_is_readonly---of 3
__put_mountpoint---of 5
_inline_copy_from_user---of 8
_inline_copy_to_user---of 7
alloc_mnt_ns---of 13
alloc_vfsmnt---of 10
attach_mnt---of 11
attach_recursive_mnt---of 108
can_move_mount_beneath---of 11
check_for_nsfs_mounts---of 12
cleanup_mnt---of 15
clone_mnt---of 35
clone_private_mount---of 16
collect_mounts---of 3
commit_tree---of 15
copy_mnt_id_req---of 17
copy_mnt_ns---of 44
copy_tree---of 39
count_mounts---of 12
current_chrooted---of 6
delayed_free_vfsmnt---of 1
delayed_mntput---of 4
dissolve_on_fput---of 18
do_change_type---of 14
do_listmount---of 49
do_lock_mount---of 20
do_loopback---of 13
do_mount---of 3
do_mount_setattr---of 85
do_move_mount---of 39
do_move_mount_old---of 5
do_new_mount---of 37
drop_collected_mounts---of 1
fc_mount---of 3
finish_automount---of 32
free_mnt_ns---of 6
from_mnt_ns---of 1
get_mountpoint---of 17
get_sequential_mnt_ns---of 31
get_user_ns---of 7
grab_requested_mnt_ns---of 37
grab_requested_root---of 13
graft_tree---of 4
has_locked_children---of 7
invent_group_ids---of 22
is_path_reachable---of 5
iterate_mounts---of 5
kern_mount---of 3
kern_unmount---of 5
kern_unmount_array---of 10
lock_mnt_tree---of 8
lock_mount_hash100%of 1
lookup_mnt16%of 26
m_next---of 3
m_show---of 1
m_start---of 7
m_stop---of 1
mark_mounts_for_expiry---of 16
may_mount---of 1
may_umount---of 1
may_umount_tree---of 12
mnt_add_count50%of 4
mnt_add_to_ns---of 15
mnt_change_mountpoint---of 25
mnt_clone_internal---of 3
mnt_drop_write100%of 1
mnt_drop_write_file100%of 3
mnt_get_count75%of 4
mnt_get_write_access38%of 16
mnt_get_write_access_file60%of 5
mnt_make_shortterm---of 3
mnt_may_suid---of 4
mnt_notify_add---of 9
mnt_ns_from_dentry---of 4
mnt_ns_release---of 11
mnt_ns_release_rcu---of 1
mnt_ns_tree_add---of 13
mnt_put_write_access67%of 6
mnt_put_write_access_file100%of 3
mnt_release_group_id---of 1
mnt_set_expiry---of 3
mnt_set_mountpoint---of 6
mnt_set_mountpoint_beneath---of 6
mnt_want_write67%of 3
mnt_want_write_file50%of 6
mnt_warn_timestamp_expiry---of 9
mntget80%of 5
mntns_get---of 7
mntns_install---of 13
mntns_owner---of 1
mntns_put---of 1
mntput75%of 4
mntput_no_expire20%of 30
mount_subtree---of 10
mount_too_revealing---of 20
namespace_unlock---of 25
our_mnt---of 1
path_is_mountpoint---of 19
path_is_under---of 5
path_mount---of 51
path_overmounted---of 17
path_umount---of 49
put_mnt_ns---of 6
put_mountpoint---of 5
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock34%of 6
read_seqbegin50%of 8
sb_end_write36%of 14
sb_prepare_remount_readonly---of 18
sb_start_write42%of 12
set_mount_attributes---of 3
show_path---of 3
statmount_string---of 54
touch_mnt_namespace---of 3
tree_contains_unbindable---of 9
umount_mnt---of 13
umount_tree---of 59
unhash_mnt---of 9
unlock_mount---of 5
unlock_mount_hash100%of 1
vfs_create_mount---of 11
vfs_kern_mount---of 9
vfs_move_mount---of 23
vfs_open_tree---of 26
vfs_submount---of 3
wants_mount_setattr---of 36
-----------
SUMMARY43%of 179

-----------
SUMMARY---of 0

avtab_alloc---of 6
avtab_alloc_dup---of 4
avtab_destroy---of 10
avtab_init---of 1
avtab_insert_nonunique---of 20
avtab_insertf---of 21
avtab_read---of 20
avtab_read_item---of 36
avtab_search_node60%of 15
avtab_search_node_next29%of 14
avtab_write---of 9
avtab_write_item---of 8
-----------
SUMMARY45%of 29

-----------
SUMMARY---of 0

__copy_overflow---of 1
copy_from_kernel_nofault59%of 17
copy_from_kernel_nofault_allowed100%of 1
copy_from_user_nofault---of 4
copy_to_kernel_nofault32%of 16
copy_to_user_nofault---of 8
strncpy_from_kernel_nofault---of 6
strncpy_from_user_nofault---of 4
strnlen_user_nofault---of 1
-----------
SUMMARY48%of 34

__read_end_io---of 31
bio_post_read_processing---of 7
decrypt_work---of 3
ext4_exit_post_read_processing---of 1
ext4_mpage_readpages13%of 101
folio_zero_segment28%of 43
mpage_end_io---of 4
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
verity_work---of 1
-----------
SUMMARY20%of 148

__sw_hweight16---of 1
__sw_hweight32100%of 1
__sw_hweight64100%of 1
__sw_hweight8---of 1
-----------
SUMMARY100%of 2

audit_log_lsm_data11%of 55
common_lsm_audit37%of 11
ipv4_skb_to_auditdata---of 13
ipv6_skb_to_auditdata---of 25
print_ipv4_addr---of 5
print_ipv6_addr---of 5
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY20%of 70

-----------
SUMMARY---of 0

__nfqnl_enqueue_packet---of 116
instance_create---of 7
instance_destroy---of 3
instance_destroy_rcu---of 16
local_bh_disable---of 2
local_bh_enable---of 2
nfnl_queue_net_exit---of 33
nfnl_queue_net_init---of 1
nfnl_queue_pernet---of 16
nfqnl_enqueue_packet---of 23
nfqnl_nf_hook_drop---of 12
nfqnl_put_packet_info---of 3
nfqnl_put_sk_classid---of 6
nfqnl_put_sk_uidgid---of 7
nfqnl_rcv_dev_event8%of 27
nfqnl_rcv_nl_event---of 11
nfqnl_recv_config---of 44
nfqnl_recv_unsupp---of 1
nfqnl_recv_verdict---of 56
nfqnl_recv_verdict_batch---of 22
nfqnl_reinject---of 68
rcu_lock_acquire---of 2
rcu_lock_release---of 2
seq_next---of 5
seq_show---of 1
seq_start---of 26
seq_stop---of 1
-----------
SUMMARY8%of 27

__put_net---of 3
cleanup_net---of 51
copy_net_ns---of 25
free_exit_list---of 13
get_net_ns43%of 7
get_net_ns_by_fd---of 11
get_net_ns_by_id---of 19
get_net_ns_by_pid---of 18
net_drop_ns---of 3
net_eq_idr---of 1
net_ns_barrier---of 1
net_ns_get_ownership---of 6
net_ns_net_exit---of 1
net_ns_net_init---of 1
net_passive_dec---of 6
netns_get---of 7
netns_install---of 14
netns_owner100%of 1
netns_put43%of 7
ops_free_list---of 20
ops_init---of 22
peernet2id28%of 11
peernet2id_alloc---of 18
peernet_has_id---of 1
preinit_net---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
ref_tracker_dir_init---of 4
register_pernet_device---of 4
register_pernet_operations---of 19
register_pernet_subsys---of 1
rtnl_net_dumpid---of 36
rtnl_net_dumpid_one---of 6
rtnl_net_fill---of 10
rtnl_net_getid---of 43
rtnl_net_newid---of 25
rtnl_net_notifyid---of 6
setup_net---of 42
unregister_pernet_device---of 3
unregister_pernet_operations---of 10
unregister_pernet_subsys---of 1
-----------
SUMMARY47%of 30

-----------
SUMMARY---of 0

__arm64_compat_sys_ioctl---of 15
__arm64_sys_ioctl75%of 8
_inline_copy_from_user63%of 8
compat_ptr_ioctl---of 3
copy_fsxattr_to_user43%of 7
do_vfs_ioctl44%of 158
fiemap_fill_next_extent---of 10
fiemap_prep---of 8
fileattr_fill_flags100%of 15
fileattr_fill_xflags100%of 15
vfs_fileattr_get---of 3
vfs_fileattr_set64%of 33
vfs_ioctl---of 3
-----------
SUMMARY55%of 244

misc_deregister---of 7
misc_devnode50%of 6
misc_open34%of 15
misc_register---of 24
misc_seq_next---of 1
misc_seq_show---of 1
misc_seq_start---of 1
misc_seq_stop---of 1
-----------
SUMMARY39%of 21

-----------
SUMMARY---of 0

ethnl_bcastmsg_put---of 1
ethnl_default_doit---of 40
ethnl_default_done---of 1
ethnl_default_dumpit---of 53
ethnl_default_notify---of 27
ethnl_default_set_doit---of 31
ethnl_default_start---of 16
ethnl_dump_put---of 1
ethnl_fill_reply_header---of 8
ethnl_multicast---of 3
ethnl_netdev_event34%of 6
ethnl_ops_begin---of 19
ethnl_ops_complete---of 5
ethnl_parse_header_dev_get---of 29
ethnl_reply_init---of 9
ethnl_req_get_phydev---of 12
ethnl_sock_priv_destroy---of 3
ethnl_sock_priv_set---of 3
ethnl_unicast_put---of 1
ethtool_notify---of 9
genlmsg_reply---of 1
netdev_put---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY34%of 6

___d_drop36%of 17
__d_add22%of 23
__d_alloc65%of 14
__d_drop---of 3
__d_free---of 1
__d_free_external---of 1
__d_instantiate55%of 22
__d_lookup36%of 31
__d_lookup_rcu53%of 19
__d_lookup_rcu_op_compare---of 17
__d_lookup_unhash34%of 18
__d_lookup_unhash_wake100%of 1
__d_move---of 69
__d_obtain_alias---of 38
__d_rehash32%of 16
__d_unalias---of 16
__dentry_kill76%of 29
d_add67%of 3
d_add_ci---of 11
d_alloc50%of 4
d_alloc_anon100%of 1
d_alloc_cursor---of 4
d_alloc_name---of 4
d_alloc_parallel20%of 75
d_alloc_pseudo58%of 7
d_ancestor---of 4
d_delete---of 6
d_drop---of 3
d_exchange---of 9
d_find_alias---of 8
d_find_alias_rcu25%of 8
d_find_any_alias---of 3
d_genocide---of 1
d_genocide_kill---of 6
d_hash_and_lookup---of 10
d_instantiate50%of 4
d_instantiate_new---of 5
d_invalidate25%of 8
d_lookup72%of 7
d_lru_add---of 12
d_lru_del---of 12
d_make_root---of 5
d_mark_dontcache---of 4
d_mark_tmpfile34%of 6
d_move---of 1
d_obtain_alias---of 1
d_obtain_root---of 1
d_parent_ino---of 16
d_prune_aliases---of 6
d_rehash---of 1
d_same_name---of 6
d_set_d_op65%of 20
d_set_mounted---of 10
d_splice_alias---of 17
d_tmpfile50%of 4
d_wait_lookup---of 6
d_walk17%of 49
dentry_free70%of 10
dentry_lru_isolate---of 18
dentry_lru_isolate_shrink---of 8
dentry_unlink_inode47%of 15
dget_parent---of 27
do_one_tree---of 3
dput23%of 22
dput_to_list---of 12
fast_dput32%of 16
find_submount67%of 3
hlist_bl_unlock---of 4
is_subdir---of 22
lock_for_kill34%of 9
path_check_mount---of 5
path_has_submounts---of 1
proc_nr_dentry---of 10
prune_dcache_sb---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock34%of 6
read_word_at_a_time100%of 1
release_dentry_name_snapshot---of 7
retain_dentry---of 13
select_collect25%of 8
select_collect2---of 11
seqcount_lockdep_reader_access60%of 5
shrink_dcache_for_umount---of 9
shrink_dcache_parent17%of 12
shrink_dcache_sb---of 13
shrink_dentry_list---of 30
shrink_kill---of 22
start_dir_add30%of 17
take_dentry_name_snapshot---of 26
to_shrink_list---of 11
umount_check---of 7
vfs_pressure_ratio---of 1
write_seqlock---of 1
write_sequnlock---of 1
-----------
SUMMARY38%of 514

-----------
SUMMARY---of 0

_setid_policy_lookup---of 14
rcu_lock_acquire---of 2
rcu_lock_release---of 2
safesetid_security_capable34%of 6
safesetid_task_fix_setgid---of 22
safesetid_task_fix_setgroups---of 24
safesetid_task_fix_setuid---of 22
setid_policy_lookup---of 34
-----------
SUMMARY34%of 6

-----------
SUMMARY---of 0

__vlan_find_dev_deep_rcu---of 13
vlan_dev_real_dev---of 3
vlan_dev_vlan_id---of 1
vlan_dev_vlan_proto---of 1
vlan_do_receive7%of 49
vlan_filter_drop_vids---of 12
vlan_filter_push_vids---of 24
vlan_for_each---of 18
vlan_gro_complete---of 3
vlan_gro_receive---of 21
vlan_info_rcu_free---of 1
vlan_uses_dev---of 11
vlan_vid_add---of 34
vlan_vid_del---of 31
vlan_vids_add_by_dev---of 27
vlan_vids_del_by_dev---of 18
-----------
SUMMARY7%of 49

__debugfs_create_file37%of 11
debugfs_alloc_inode100%of 1
debugfs_automount---of 1
debugfs_change_name---of 22
debugfs_create_automount---of 10
debugfs_create_dir30%of 10
debugfs_create_file_full100%of 1
debugfs_create_file_short---of 1
debugfs_create_file_size---of 3
debugfs_create_file_unsafe---of 1
debugfs_create_symlink---of 5
debugfs_fill_super---of 3
debugfs_free_fc---of 1
debugfs_free_inode---of 3
debugfs_get_tree---of 3
debugfs_init_fs_context---of 3
debugfs_initialized100%of 1
debugfs_lookup40%of 5
debugfs_lookup_and_remove---of 4
debugfs_parse_param---of 8
debugfs_reconfigure---of 7
debugfs_release_dentry50%of 4
debugfs_remove67%of 3
debugfs_setattr---of 4
debugfs_show_options---of 7
failed_creating---of 1
init_once100%of 1
remove_one19%of 16
start_creating47%of 13
-----------
SUMMARY40%of 66

__bforget---of 9
__bh_read---of 8
__bh_read_batch---of 15
__block_write_begin---of 1
__block_write_begin_int---of 98
__block_write_full_folio---of 69
__bread_gfp---of 14
__breadahead---of 15
__brelse---of 4
__find_get_block---of 1
__find_get_block_nonatomic---of 1
__lock_buffer---of 5
__sync_dirty_buffer---of 16
__wait_on_buffer---of 3
alloc_buffer_head---of 8
alloc_page_buffers---of 3
bdev_getblk---of 27
bh_read---of 3
bh_uptodate_or_lock---of 5
block_commit_write---of 14
block_dirty_folio---of 12
block_invalidate_folio---of 25
block_is_partially_uptodate---of 16
block_page_mkwrite---of 16
block_read_full_folio---of 36
block_truncate_page---of 32
block_write_begin---of 4
block_write_end---of 3
block_write_full_folio---of 10
buffer_check_dirty_writeback---of 12
buffer_exit_cpu_dead---of 9
clean_bdev_aliases---of 28
cont_write_begin---of 32
create_empty_buffers---of 25
decrypt_bh---of 7
drop_buffers---of 18
end_bio_bh_io_sync---of 4
end_buffer_async_read---of 18
end_buffer_async_read_io---of 10
end_buffer_async_write---of 18
end_buffer_read_sync---of 10
end_buffer_write_sync---of 13
find_get_block_common---of 76
folio_alloc_buffers---of 33
folio_init_buffers---of 17
folio_put---of 6
folio_set_bh---of 9
folio_test_uptodate---of 4
folio_zero_new_buffers---of 14
free_buffer_head---of 8
generic_block_bmap---of 1
generic_buffers_fsync---of 7
generic_buffers_fsync_noflush---of 5
generic_cont_expand_simple---of 5
generic_write_end---of 14
has_bh_in_lru---of 17
inode_has_buffers100%of 1
invalidate_bh_lru---of 8
invalidate_bh_lrus---of 1
invalidate_bh_lrus_cpu---of 9
invalidate_inode_buffers---of 9
lock_buffer---of 10
mark_buffer_async_write---of 4
mark_buffer_dirty---of 28
mark_buffer_dirty_inode---of 9
mark_buffer_write_io_error---of 15
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remove_inode_buffers---of 10
submit_bh---of 1
submit_bh_wbc---of 16
sync_dirty_buffer---of 1
sync_mapping_buffers---of 52
touch_buffer---of 14
try_to_free_buffers---of 12
unlock_buffer---of 3
verify_bh---of 3
write_boundary_block---of 6
write_dirty_buffer---of 7
zero_user_segments---of 18
-----------
SUMMARY100%of 1

msleep---of 4
msleep_interruptible---of 6
process_timeout---of 1
schedule_hrtimeout---of 1
schedule_hrtimeout_range---of 1
schedule_hrtimeout_range_clock---of 8
schedule_timeout34%of 6
schedule_timeout_idle---of 3
schedule_timeout_interruptible---of 3
schedule_timeout_killable---of 3
schedule_timeout_uninterruptible---of 3
usleep_range_state---of 9
-----------
SUMMARY34%of 6

-----------
SUMMARY---of 0

_inline_copy_to_user---of 7
net_generic---of 16
rcu_lock_acquire---of 2
rcu_lock_release---of 2
register_vlan_dev---of 32
register_vlan_device---of 10
unregister_vlan_dev---of 19
vlan_check_real_dev---of 19
vlan_device_event5%of 104
vlan_exit_net---of 1
vlan_init_net---of 1
vlan_ioctl_handler---of 39
-----------
SUMMARY5%of 104

cat_destroy---of 1
cat_index---of 5
cat_read---of 6
cat_write---of 4
class_index---of 4
class_read---of 24
class_write---of 23
cls_destroy---of 18
common_destroy---of 3
common_index---of 4
common_read---of 11
common_write---of 4
context_read_and_validate---of 7
ebitmap_init---of 1
filename_trans_read---of 51
filename_trans_write---of 6
filename_write_helper---of 9
filename_write_helper_compat---of 17
filenametr_cmp---of 4
filenametr_destroy---of 3
filenametr_hash---of 1
genfs_read---of 34
genfs_write---of 22
hashtab_insert---of 9
mls_read_level---of 4
mls_read_range_helper---of 10
mls_write_level---of 3
mls_write_range_helper---of 9
next_entry---of 3
ocontext_destroy---of 7
ocontext_read---of 48
ocontext_write---of 52
perm_destroy---of 1
perm_read---of 6
perm_write---of 4
policydb_bounds_sanity_check---of 5
policydb_class_isvalid---of 3
policydb_context_isvalid16%of 13
policydb_destroy---of 18
policydb_filenametr_search---of 11
policydb_index---of 16
policydb_load_isids---of 11
policydb_rangetr_search---of 10
policydb_read---of 92
policydb_role_isvalid---of 3
policydb_roletr_search20%of 10
policydb_type_isvalid---of 3
policydb_write---of 28
put_entry---of 4
range_read---of 33
range_tr_destroy---of 1
range_write---of 3
range_write_helper---of 5
read_cons_helper---of 35
role_allow_write---of 9
role_bounds_sanity_check---of 17
role_destroy---of 3
role_index---of 5
role_read---of 12
role_tr_destroy---of 1
role_trans_cmp---of 4
role_trans_hash---of 1
role_trans_write---of 3
role_trans_write_one---of 5
role_write---of 7
sens_destroy---of 3
sens_index---of 5
sens_read---of 7
sens_write---of 5
str_read---of 6
string_to_av_perm---of 8
string_to_security_class---of 3
type_bounds_sanity_check---of 12
type_destroy---of 1
type_index---of 6
type_read---of 11
type_write---of 6
user_bounds_sanity_check---of 17
user_destroy---of 3
user_index---of 5
user_read---of 13
user_write---of 8
write_cons_helper---of 19
-----------
SUMMARY18%of 23

__ipi_send_mask25%of 8
__ipi_send_single---of 5
ipi_get_hwirq---of 10
ipi_send_mask---of 19
ipi_send_single---of 15
irq_destroy_ipi---of 8
irq_reserve_ipi---of 21
-----------
SUMMARY25%of 8

ioeventfd_destructor67%of 3
ioeventfd_write16%of 13
irqfd_inject---of 3
irqfd_ptable_queue_proc100%of 1
irqfd_resampler_ack---of 6
irqfd_resampler_shutdown56%of 9
irqfd_shutdown---of 3
irqfd_update67%of 6
irqfd_wakeup53%of 19
kvm_arch_irqfd_allowed100%of 1
kvm_arch_irqfd_route_changed---of 1
kvm_arch_update_irqfd_routing---of 1
kvm_assign_ioeventfd_idx65%of 28
kvm_deassign_ioeventfd_idx66%of 23
kvm_eventfd_init100%of 1
kvm_ioeventfd94%of 15
kvm_irq_has_notifier---of 8
kvm_irq_routing_update63%of 8
kvm_irqfd78%of 44
kvm_irqfd_exit---of 1
kvm_irqfd_init---of 1
kvm_irqfd_release72%of 7
kvm_notify_acked_gsi---of 6
kvm_notify_acked_irq---of 22
kvm_notify_irqfd_resampler---of 11
kvm_register_irq_ack_notifier100%of 3
kvm_unregister_irq_ack_notifier---of 4
list_add_rcu67%of 3
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
srcu_read_lock100%of 1
srcu_read_unlock67%of 3
-----------
SUMMARY68%of 192

adjust_historical_crosststamp---of 17
change_clocksource---of 8
delta_to_ns_safe---of 1
do_adjtimex---of 40
do_settimeofday64---of 16
do_timer---of 1
dummy_clock_read---of 3
get_device_system_crosststamp---of 28
getboottime64---of 1
ktime_get46%of 11
ktime_get_boot_fast_ns---of 6
ktime_get_coarse_real_ts6450%of 6
ktime_get_coarse_real_ts64_mg58%of 7
ktime_get_coarse_ts64---of 6
ktime_get_coarse_with_offset---of 8
ktime_get_mono_fast_ns84%of 6
ktime_get_raw---of 9
ktime_get_raw_fast_ns---of 6
ktime_get_raw_ts64---of 11
ktime_get_real_fast_ns---of 6
ktime_get_real_seconds100%of 1
ktime_get_real_ts64---of 13
ktime_get_real_ts64_mg---of 17
ktime_get_resolution_ns---of 8
ktime_get_seconds---of 3
ktime_get_snapshot43%of 14
ktime_get_tai_fast_ns---of 6
ktime_get_ts64---of 13
ktime_get_update_offsets_now---of 13
ktime_get_with_offset---of 11
ktime_mono_to_any---of 1
ktime_real_to_base_clock---of 11
pvclock_gtod_register_notifier---of 1
pvclock_gtod_unregister_notifier---of 1
random_get_entropy_fallback---of 3
read_persistent_clock64---of 1
seqcount_lockdep_reader_access80%of 5
timekeeper_lock_irqsave---of 1
timekeeper_unlock_irqrestore---of 1
timekeeping_advance---of 38
timekeeping_clocksource_has_base---of 3
timekeeping_forward_now---of 10
timekeeping_inject_offset---of 19
timekeeping_max_deferment---of 6
timekeeping_notify---of 3
timekeeping_resume---of 16
timekeeping_suspend---of 7
timekeeping_update_from_shadow---of 13
timekeeping_valid_for_hres---of 6
timekeeping_warp_clock---of 3
timespec64_sub---of 1
tk_set_wall_to_mono---of 3
tk_setup_internals---of 4
update_fast_timekeeper---of 1
update_wall_time---of 3
-----------
SUMMARY57%of 50

-----------
SUMMARY---of 0

compute_fgu62%of 49
compute_hcr_e2h---of 3
compute_hcr_rw---of 3
feat_aderr---of 1
feat_anerr---of 1
feat_csv2_2_csv2_1p250%of 4
feat_ebep_pmuv3_ss67%of 3
feat_nv2---of 4
feat_nv2_e2h0_ni---of 6
feat_pauth100%of 1
feat_pauth_lr---of 1
feat_rasv1p150%of 4
feat_sme_smps---of 3
feat_spe_fds67%of 3
feat_trbe_mpam50%of 4
get_reg_fixed_bits---of 75
not_feat_aa64el3---of 1
-----------
SUMMARY61%of 68

kvm_arch_set_irq_inatomic45%of 9
kvm_set_msi100%of 4
kvm_set_routing_entry100%of 5
kvm_vgic_setup_default_irq_routing30%of 10
vgic_irqfd_set_irq50%of 4
-----------
SUMMARY57%of 32

__compute_trap_behaviour---of 82
__forward_traps8%of 28
check_cnthctl_el1nvpct---of 17
check_cnthctl_el1nvvct---of 17
check_cnthctl_el1pcten---of 1
check_cnthctl_el1pten---of 1
check_cptr_tta---of 17
check_mdcr_hpmn---of 12
encoding_next---of 6
forward_debug_exception---of 1
forward_smc_trap100%of 1
get_sanitized_cnthctl---of 20
kvm_emulate_nested_eret---of 43
kvm_inject_el2_exception---of 28
kvm_inject_nested---of 26
kvm_inject_nested_irq---of 17
kvm_inject_nested_sync---of 1
print_sys_reg_msg67%of 3
triage_sysreg_trap7%of 120
-----------
SUMMARY9%of 152

___pskb_trim21%of 39
__alloc_skb46%of 11
__build_skb---of 3
__build_skb_around34%of 6
__consume_stateless_skb---of 1
__copy_skb_header22%of 19
__kfree_skb---of 3
__kunmap_atomic---of 3
__napi_alloc_frag_align---of 10
__napi_kfree_skb---of 3
__netdev_alloc_frag_align---of 4
__netdev_alloc_skb---of 20
__pskb_copy_fclone---of 30
__pskb_pull_tail---of 52
__skb_checksum---of 26
__skb_checksum_complete---of 7
__skb_checksum_complete_head---of 7
__skb_clone60%of 5
__skb_complete_tx_timestamp---of 10
__skb_ext_alloc---of 3
__skb_ext_del---of 12
__skb_ext_put---of 16
__skb_ext_set---of 3
__skb_frag_ref---of 7
__skb_frag_unref---of 14
__skb_pad---of 19
__skb_send_sock---of 18
__skb_splice_bits---of 17
__skb_to_sgvec---of 35
__skb_tstamp_tx---of 41
__skb_unclone_keeptruesize---of 6
__skb_vlan_pop---of 19
__skb_warn_lro_forwarding---of 3
__skb_zcopy_downgrade_managed---of 10
__splice_segment---of 28
_inline_copy_from_user---of 8
alloc_skb_for_msg---of 3
alloc_skb_with_frags56%of 20
build_skb43%of 7
build_skb_around---of 7
consume_skb63%of 8
csum_and_copy_from_iter_full---of 67
csum_block_add_ext---of 1
csum_partial_ext---of 1
drop_reasons_register_subsys---of 3
drop_reasons_unregister_subsys---of 3
folio_size---of 4
kfree_skb_list_reason---of 17
kfree_skb_partial---of 4
kfree_skbmem20%of 10
kmalloc_reserve67%of 9
kmap_local_folio---of 5
kmap_local_page---of 5
local_bh_disable---of 2
local_bh_enable---of 2
local_lock_acquire---of 6
local_lock_release---of 7
mm_account_pinned_pages---of 15
mm_unaccount_pinned_pages---of 4
msg_zerocopy_complete---of 21
msg_zerocopy_put_abort---of 5
msg_zerocopy_realloc---of 25
napi_alloc_skb---of 20
napi_build_skb---of 7
napi_consume_skb---of 13
napi_pp_put_page---of 11
napi_skb_cache_get---of 13
napi_skb_cache_get_bulk---of 18
napi_skb_cache_put---of 14
napi_skb_free_stolen_head---of 10
net_zcopy_get---of 6
nf_reset_ct---of 7
page_ref_inc---of 3
pfn_valid---of 31
pskb_carve---of 72
pskb_expand_head14%of 46
pskb_extract---of 7
pskb_put---of 6
pskb_trim_rcsum_slow58%of 7
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
sendmsg_locked---of 4
sendmsg_unlocked---of 3
sk_skb_reason_drop46%of 11
skb_abort_seq_read---of 4
skb_add_rx_frag_netmem---of 8
skb_append---of 1
skb_append_pagefrags---of 17
skb_attempt_defer_free---of 21
skb_checksum---of 1
skb_checksum_setup---of 35
skb_checksum_setup_ip---of 20
skb_checksum_trimmed---of 24
skb_clone34%of 12
skb_clone_sk---of 15
skb_cloned---of 3
skb_coalesce_rx_frag---of 3
skb_complete_tx_timestamp---of 18
skb_complete_wifi_ack---of 15
skb_condense---of 8
skb_copy---of 8
skb_copy_and_csum_bits---of 29
skb_copy_and_csum_dev---of 8
skb_copy_bits7%of 31
skb_copy_expand---of 11
skb_copy_header---of 1
skb_copy_seq_read---of 5
skb_copy_ubufs---of 74
skb_cow_data---of 42
skb_cow_data_for_xdp---of 3
skb_dequeue---of 3
skb_dequeue_tail---of 3
skb_dump---of 45
skb_ensure_writable---of 15
skb_ensure_writable_head_tail---of 10
skb_errqueue_purge---of 9
skb_eth_pop---of 13
skb_eth_push---of 16
skb_expand_head---of 17
skb_ext_add---of 17
skb_fill_page_desc---of 6
skb_find_text---of 3
skb_frag_unref40%of 15
skb_free_head67%of 6
skb_head_is_locked---of 4
skb_headers_offset_update---of 5
skb_morph---of 3
skb_mpls_dec_ttl---of 17
skb_mpls_pop---of 20
skb_mpls_push---of 30
skb_mpls_update_lse---of 7
skb_partial_csum_set---of 6
skb_pp_cow_data---of 47
skb_pp_frag_ref---of 12
skb_prepare_for_shift---of 4
skb_prepare_seq_read---of 1
skb_pull50%of 6
skb_pull_data---of 6
skb_pull_rcsum34%of 9
skb_push---of 3
skb_put50%of 4
skb_queue_head---of 1
skb_queue_purge_reason---of 7
skb_queue_tail100%of 1
skb_rbtree_purge---of 4
skb_realloc_headroom---of 5
skb_release_data45%of 34
skb_release_head_state40%of 15
skb_scrub_packet---of 15
skb_segment---of 134
skb_segment_list---of 43
skb_send_sock---of 1
skb_send_sock_locked---of 1
skb_seq_read---of 31
skb_shift---of 47
skb_splice_bits---of 3
skb_splice_from_iter---of 20
skb_split---of 19
skb_store_bits---of 31
skb_to_sgvec---of 3
skb_to_sgvec_nomark---of 1
skb_trim50%of 4
skb_try_coalesce---of 37
skb_ts_finish---of 4
skb_ts_get_next_block---of 1
skb_tstamp_tx---of 1
skb_tx_error---of 8
skb_unlink---of 1
skb_vlan_pop---of 12
skb_vlan_push---of 16
skb_vlan_untag43%of 33
skb_zerocopy---of 43
skb_zerocopy_clone---of 19
skb_zerocopy_headlen---of 7
skb_zerocopy_iter_stream---of 20
slab_build_skb---of 6
sock_dequeue_err_skb---of 10
sock_queue_err_skb---of 25
sock_rmem_free---of 3
sock_spd_release---of 9
trace_consume_skb29%of 14
trace_kfree_skb29%of 14
virt_to_head_page---of 3
warn_crc32c_csum_combine---of 3
warn_crc32c_csum_update---of 3
xas_next_entry---of 15
-----------
SUMMARY34%of 396

__devinet_sysctl_register---of 5
__inet_del_ifa---of 69
__inet_insert_ifa---of 31
__ip_dev_find---of 23
check_lifetime---of 43
confirm_addr_indev---of 33
devinet_conf_proc---of 45
devinet_exit_net---of 5
devinet_init_net---of 18
devinet_ioctl---of 93
devinet_sysctl_forward---of 22
devinet_sysctl_register---of 6
in_dev_dump_addr---of 38
in_dev_finish_destroy---of 12
in_dev_free_rcu---of 1
in_dev_select_addr---of 17
inet_abc_len---of 7
inet_addr_onlink---of 33
inet_alloc_ifa---of 7
inet_confirm_addr---of 23
inet_dump_addr---of 60
inet_dump_ifaddr---of 1
inet_dump_ifmcaddr---of 1
inet_fill_ifaddr---of 34
inet_fill_link_af30%of 10
inet_get_link_af_size29%of 7
inet_gifconf---of 35
inet_ifa_byprefix---of 20
inet_insert_ifa---of 3
inet_lookup_ifaddr_rcu---of 4
inet_netconf_dump_devconf---of 34
inet_netconf_fill_devconf---of 35
inet_netconf_get_devconf---of 64
inet_netconf_notify_devconf---of 20
inet_rcu_free_ifa---of 6
inet_rtm_deladdr---of 27
inet_rtm_newaddr---of 81
inet_select_addr---of 68
inet_set_link_af---of 15
inet_validate_link_af---of 17
inetdev_by_index---of 18
inetdev_event5%of 91
inetdev_init---of 22
ipv4_doint_and_flush---of 4
put_cacheinfo---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
register_inetaddr_notifier---of 1
register_inetaddr_validator_notifier---of 1
rtmsg_ifa---of 7
unregister_inetaddr_notifier---of 1
unregister_inetaddr_validator_notifier---of 1
-----------
SUMMARY9%of 108

-----------
SUMMARY---of 0

aarch32_insn_extract_reg_num---of 1
aarch32_insn_is_wide---of 1
aarch32_insn_mcr_extract_crm---of 1
aarch32_insn_mcr_extract_opc2---of 1
aarch64_get_branch_offset---of 9
aarch64_insn_adrp_get_offset---of 3
aarch64_insn_adrp_set_offset---of 4
aarch64_insn_decode_immediate---of 14
aarch64_insn_decode_register---of 3
aarch64_insn_encode_immediate---of 15
aarch64_insn_extract_system_reg---of 1
aarch64_insn_gen_add_sub_imm---of 14
aarch64_insn_gen_add_sub_shifted_reg---of 14
aarch64_insn_gen_adr---of 7
aarch64_insn_gen_atomic_ld_op---of 8
aarch64_insn_gen_bitfield---of 13
aarch64_insn_gen_branch_imm34%of 6
aarch64_insn_gen_branch_reg---of 4
aarch64_insn_gen_cas---of 7
aarch64_insn_gen_comp_branch_imm---of 12
aarch64_insn_gen_cond_branch_imm---of 6
aarch64_insn_gen_data1---of 13
aarch64_insn_gen_data2---of 11
aarch64_insn_gen_data3---of 15
aarch64_insn_gen_dmb---of 3
aarch64_insn_gen_extr---of 10
aarch64_insn_gen_load_acq_store_rel---of 8
aarch64_insn_gen_load_literal---of 5
aarch64_insn_gen_load_store_ex---of 8
aarch64_insn_gen_load_store_imm---of 10
aarch64_insn_gen_load_store_pair---of 11
aarch64_insn_gen_load_store_reg---of 10
aarch64_insn_gen_logical_immediate---of 22
aarch64_insn_gen_logical_shifted_reg---of 14
aarch64_insn_gen_move_reg---of 7
aarch64_insn_gen_movewide---of 13
aarch64_insn_gen_mrs---of 3
aarch64_set_branch_offset---of 9
-----------
SUMMARY34%of 6

copy_highpage11%of 59
copy_user_highpage100%of 1
pfn_valid20%of 31
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY18%of 95

____fput100%of 1
__fput61%of 33
__fput_deferred38%of 8
__fput_sync---of 9
alloc_empty_backing_file---of 4
alloc_empty_file34%of 9
alloc_empty_file_noaccount---of 4
alloc_file_clone---of 3
alloc_file_pseudo50%of 4
alloc_file_pseudo_noaccount---of 7
backing_file_user_path---of 1
class_preempt_destructor50%of 4
delayed_fput---of 4
file_free50%of 10
file_init_path86%of 14
flush_delayed_fput---of 4
fput89%of 9
fput_close28%of 11
fput_close_sync55%of 11
get_max_files---of 1
init_file50%of 6
proc_nr_files---of 1
put_cred---of 6
-----------
SUMMARY57%of 120

-----------
SUMMARY---of 0

__tun_build_skb40%of 10
__tun_chr_ioctl36%of 100
__tun_detach---of 66
__tun_set_ebpf---of 11
_inline_copy_from_user38%of 8
_inline_copy_to_user43%of 7
bpf_prog_run_clear_cb---of 8
dev_sw_netstats_rx_add60%of 5
group_show---of 3
local_bh_disable100%of 2
local_bh_enable100%of 2
owner_show---of 3
pfn_valid20%of 31
pskb_may_pull---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock34%of 6
set_offload34%of 9
tun_attach---of 64
tun_attach_filter---of 10
tun_chr_close---of 14
tun_chr_compat_ioctl---of 1
tun_chr_fasync---of 6
tun_chr_ioctl100%of 1
tun_chr_open---of 4
tun_chr_poll---of 22
tun_chr_read_iter---of 12
tun_chr_show_fdinfo---of 9
tun_chr_write_iter40%of 10
tun_detach_filter34%of 9
tun_device_event5%of 47
tun_do_read---of 57
tun_fill_info38%of 16
tun_flags_show---of 1
tun_flow_cleanup---of 12
tun_flow_create---of 6
tun_flow_uninit---of 10
tun_flow_update---of 41
tun_free_netdev---of 3
tun_get27%of 23
tun_get_addr_len---of 17
tun_get_channels---of 1
tun_get_coalesce---of 1
tun_get_drvinfo---of 4
tun_get_iff50%of 4
tun_get_link_ksettings---of 1
tun_get_msglevel---of 1
tun_get_size100%of 1
tun_get_socket---of 4
tun_get_tx_ring---of 4
tun_get_user19%of 200
tun_napi_poll---of 12
tun_net_change_carrier---of 4
tun_net_close---of 1
tun_net_fix_features100%of 1
tun_net_get_stats64---of 1
tun_net_init---of 4
tun_net_initialize---of 4
tun_net_mclist---of 1
tun_net_open---of 6
tun_net_uninit---of 46
tun_net_xmit---of 83
tun_not_capable---of 7
tun_peek_len---of 11
tun_prog_free---of 1
tun_ptr_free---of 4
tun_queue_purge---of 13
tun_recvmsg---of 18
tun_ring_recv---of 26
tun_rx_batched19%of 11
tun_select_queue---of 28
tun_sendmsg---of 101
tun_set_coalesce---of 1
tun_set_ebpf40%of 5
tun_set_headroom---of 1
tun_set_iff---of 37
tun_set_link_ksettings---of 1
tun_set_msglevel---of 1
tun_set_sndbuf---of 9
tun_setup---of 1
tun_sock_write_space38%of 8
tun_validate---of 3
tun_vnet_hdr_to_skb7%of 74
tun_vnet_ioctl36%of 39
tun_xdp---of 20
tun_xdp_act---of 35
tun_xdp_xmit---of 33
update_filter8%of 26
-----------
SUMMARY25%of 659

-----------
SUMMARY---of 0

copy_ns_info_to_user---of 7
ns_dname100%of 1
ns_get_name---of 4
ns_get_path---of 3
ns_get_path_cb---of 3
ns_ioctl33%of 71
ns_match---of 3
nsfs_evict100%of 1
nsfs_init_fs_context---of 3
nsfs_init_inode100%of 1
nsfs_put_data100%of 1
nsfs_show_path---of 1
open_namespace40%of 5
open_related_ns67%of 3
proc_ns_file---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY38%of 83

netconsole_netdev_event6%of 39
write_ext_msg---of 26
write_msg24%of 13
-----------
SUMMARY10%of 52

-----------
SUMMARY---of 0

task_work_add25%of 24
task_work_cancel---of 10
task_work_cancel_func---of 10
task_work_cancel_match---of 10
task_work_run73%of 11
task_work_set_notify_irq---of 3
-----------
SUMMARY40%of 35

device_cmp---of 7
inet_cmp---of 8
iterate_cleanup_work---of 8
masq_device_event67%of 3
masq_inet6_event---of 3
masq_inet_event---of 4
nf_nat_masq_schedule---of 23
nf_nat_masquerade_inet_register_notifiers---of 7
nf_nat_masquerade_inet_unregister_notifiers---of 3
nf_nat_masquerade_ipv4---of 16
nf_nat_masquerade_ipv6---of 9
-----------
SUMMARY67%of 3

netlbl_unlabel_accept---of 5
netlbl_unlabel_getattr---of 34
netlbl_unlabel_list---of 5
netlbl_unlabel_staticadd---of 19
netlbl_unlabel_staticadddef---of 18
netlbl_unlabel_staticlist---of 75
netlbl_unlabel_staticlist_gen---of 22
netlbl_unlabel_staticlistdef---of 66
netlbl_unlabel_staticremove---of 16
netlbl_unlabel_staticremovedef---of 15
netlbl_unlhsh_add---of 63
netlbl_unlhsh_free_iface---of 31
netlbl_unlhsh_netdev_handler34%of 6
netlbl_unlhsh_remove---of 71
netlbl_unlhsh_search_iface---of 19
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY34%of 6

add_uevent_var50%of 4
alloc_uevent_skb67%of 3
cleanup_uevent_env---of 1
init_uevent_argv---of 3
kobj_usermode_filter---of 3
kobject_synth_uevent---of 61
kobject_uevent---of 1
kobject_uevent_env24%of 47
kobject_uevent_net_broadcast30%of 34
uevent_net_exit---of 4
uevent_net_init---of 6
uevent_net_rcv---of 1
uevent_net_rcv_skb---of 13
zap_modalias_env---of 9
-----------
SUMMARY29%of 88

net_generic---of 16
netdev_debug_event10%of 55
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rtnl_net_debug_net_exit---of 1
rtnl_net_debug_net_init---of 1
-----------
SUMMARY10%of 55

-----------
SUMMARY---of 0

__dcbnl_pg_getcfg---of 47
__dcbnl_pg_setcfg---of 34
dcb_delrewr---of 11
dcb_doit---of 21
dcb_getapp---of 9
dcb_getrewr---of 9
dcb_getrewr_prio_dscp_mask_map---of 9
dcb_getrewr_prio_pcp_mask_map---of 9
dcb_ieee_delapp---of 13
dcb_ieee_getapp_default_prio_mask---of 9
dcb_ieee_getapp_dscp_prio_mask_map---of 9
dcb_ieee_getapp_mask---of 9
dcb_ieee_getapp_prio_dscp_mask_map---of 9
dcb_ieee_setapp---of 15
dcb_setapp---of 17
dcb_setrewr---of 13
dcbnl_app_table_setdel---of 12
dcbnl_bcn_getcfg---of 21
dcbnl_bcn_setcfg---of 25
dcbnl_build_peer_app---of 14
dcbnl_cee_fill---of 60
dcbnl_cee_get---of 3
dcbnl_cee_notify---of 1
dcbnl_cee_pg_fill---of 17
dcbnl_getapp---of 16
dcbnl_getapptrust---of 11
dcbnl_getcap---of 14
dcbnl_getdcbx---of 3
dcbnl_getfeatcfg---of 28
dcbnl_getnumtcs---of 19
dcbnl_getperm_hwaddr---of 3
dcbnl_getpfccfg---of 13
dcbnl_getpfcstate---of 3
dcbnl_getstate---of 3
dcbnl_ieee_del---of 28
dcbnl_ieee_fill---of 66
dcbnl_ieee_get---of 3
dcbnl_ieee_notify---of 1
dcbnl_ieee_set---of 62
dcbnl_netdevice_event23%of 9
dcbnl_notify---of 10
dcbnl_pgrx_getcfg---of 1
dcbnl_pgrx_setcfg---of 1
dcbnl_pgtx_getcfg---of 1
dcbnl_pgtx_setcfg---of 1
dcbnl_setall---of 4
dcbnl_setapp---of 10
dcbnl_setdcbx---of 4
dcbnl_setfeatcfg---of 13
dcbnl_setnumtcs---of 10
dcbnl_setpfccfg---of 20
dcbnl_setpfcstate---of 4
dcbnl_setstate---of 4
-----------
SUMMARY23%of 9

_prb_commit56%of 9
_prb_read_valid28%of 44
data_alloc38%of 8
data_push_tail10%of 20
desc_read55%of 11
desc_update_last_finalized34%of 9
prb_commit---of 6
prb_final_commit100%of 1
prb_first_seq50%of 4
prb_first_valid_seq---of 3
prb_init---of 1
prb_next_reserve_seq---of 9
prb_next_seq---of 4
prb_read_valid100%of 1
prb_read_valid_info---of 1
prb_record_text_space---of 1
prb_reserve25%of 49
prb_reserve_in_last---of 54
-----------
SUMMARY31%of 156

-----------
SUMMARY---of 0

audit_inode_permission67%of 3
copy_to_sockptr---of 11
delayed_superblock_init---of 1
file_has_perm47%of 13
file_map_prot_check90%of 10
has_cap_mac_admin---of 10
inode_doinit_use_xattr---of 12
inode_doinit_with_dentry26%of 39
inode_mode_to_security_class---of 3
ioctl_has_perm50%of 12
match_file---of 4
may_context_mount_inode_relabel---of 3
may_context_mount_sb_relabel---of 3
may_create42%of 12
may_link---of 17
ptrace_parent_sid---of 19
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sb_finish_set_opts---of 30
selinux_add_opt---of 20
selinux_binder_set_context_mgr---of 1
selinux_binder_transaction---of 4
selinux_binder_transfer_binder---of 1
selinux_binder_transfer_file---of 20
selinux_bpf---of 4
selinux_bpf_map---of 1
selinux_bpf_map_create---of 3
selinux_bpf_map_free---of 1
selinux_bpf_prog---of 1
selinux_bpf_prog_free---of 1
selinux_bpf_prog_load---of 3
selinux_bpf_token_create---of 3
selinux_bpf_token_free---of 1
selinux_bprm_committed_creds---of 6
selinux_bprm_committing_creds---of 15
selinux_bprm_creds_for_exec---of 35
selinux_capable34%of 12
selinux_capget---of 1
selinux_capset---of 1
selinux_complete_init---of 5
selinux_cred_getlsmprop---of 1
selinux_cred_getsecid---of 1
selinux_cred_prepare---of 1
selinux_cred_transfer---of 1
selinux_current_getlsmprop_subj100%of 1
selinux_d_instantiate67%of 3
selinux_dentry_create_files_as---of 5
selinux_dentry_init_security---of 6
selinux_determine_inode_label43%of 14
selinux_file_alloc_security100%of 1
selinux_file_fcntl---of 16
selinux_file_ioctl71%of 24
selinux_file_ioctl_compat---of 6
selinux_file_lock---of 1
selinux_file_mprotect---of 17
selinux_file_open57%of 16
selinux_file_permission44%of 16
selinux_file_receive---of 4
selinux_file_send_sigiotask---of 5
selinux_file_set_fowner---of 1
selinux_free_mnt_opts---of 1
selinux_fs_context_dup---of 3
selinux_fs_context_parse_param---of 3
selinux_fs_context_submount---of 9
selinux_genfs_get_sid34%of 9
selinux_getprocattr---of 4
selinux_getselfattr---of 3
selinux_ib_alloc_security---of 1
selinux_ib_endport_manage_subnet---of 3
selinux_ib_pkey_access---of 3
selinux_inet_conn_established---of 7
selinux_inet_conn_request---of 7
selinux_inet_csk_clone---of 1
selinux_inet_sys_rcv_skb---of 5
selinux_inode_alloc_security67%of 3
selinux_inode_copy_up---of 6
selinux_inode_copy_up_xattr---of 4
selinux_inode_create100%of 1
selinux_inode_follow_link---of 12
selinux_inode_free_security25%of 8
selinux_inode_get_acl---of 9
selinux_inode_getattr---of 9
selinux_inode_getlsmprop---of 3
selinux_inode_getsecctx---of 3
selinux_inode_getsecurity---of 16
selinux_inode_getxattr---of 9
selinux_inode_init_security36%of 14
selinux_inode_init_security_anon---of 12
selinux_inode_invalidate_secctx---of 3
selinux_inode_link---of 1
selinux_inode_listsecurity---of 4
selinux_inode_listxattr---of 9
selinux_inode_mkdir---of 1
selinux_inode_mknod---of 3
selinux_inode_notifysecctx---of 1
selinux_inode_permission61%of 23
selinux_inode_post_setxattr---of 13
selinux_inode_readlink---of 9
selinux_inode_remove_acl---of 9
selinux_inode_removexattr---of 10
selinux_inode_rename---of 38
selinux_inode_rmdir---of 1
selinux_inode_set_acl---of 9
selinux_inode_setattr28%of 22
selinux_inode_setsecctx---of 1
selinux_inode_setsecurity---of 9
selinux_inode_setxattr---of 30
selinux_inode_symlink---of 1
selinux_inode_unlink---of 1
selinux_inode_xattr_skipcap---of 1
selinux_ip_forward---of 24
selinux_ip_output---of 9
selinux_ip_postroute---of 62
selinux_ipc_getlsmprop---of 1
selinux_ipc_permission---of 3
selinux_ismaclabel---of 1
selinux_kernel_act_as---of 3
selinux_kernel_create_files_as---of 10
selinux_kernel_load_data---of 8
selinux_kernel_load_from_file---of 12
selinux_kernel_module_request---of 1
selinux_kernel_read_file---of 3
selinux_kernfs_init_security---of 11
selinux_key_alloc---of 3
selinux_key_getsecurity---of 3
selinux_key_permission---of 12
selinux_lsm_getattr---of 34
selinux_lsm_notifier_avc_callback---of 3
selinux_lsm_setattr---of 32
selinux_lsmprop_to_secctx---of 5
selinux_mmap_addr67%of 3
selinux_mmap_file72%of 7
selinux_mount---of 10
selinux_move_mount---of 9
selinux_mptcp_add_subflow---of 1
selinux_msg_msg_alloc_security---of 1
selinux_msg_queue_alloc_security---of 1
selinux_msg_queue_associate---of 1
selinux_msg_queue_msgctl---of 9
selinux_msg_queue_msgrcv---of 3
selinux_msg_queue_msgsnd---of 7
selinux_netcache_avc_callback---of 3
selinux_netlink_send---of 20
selinux_nf_register---of 1
selinux_nf_unregister---of 1
selinux_parse_skb---of 70
selinux_path_notify---of 15
selinux_perf_event_alloc100%of 1
selinux_perf_event_open---of 3
selinux_perf_event_read---of 1
selinux_perf_event_write---of 1
selinux_ptrace_access_check---of 3
selinux_ptrace_traceme---of 1
selinux_quota_on---of 9
selinux_quotactl---of 18
selinux_release_secctx---of 3
selinux_req_classify_flow---of 1
selinux_sb_alloc_security---of 1
selinux_sb_clone_mnt_opts---of 63
selinux_sb_eat_lsm_opts---of 41
selinux_sb_kern_mount---of 1
selinux_sb_mnt_opts_compat---of 26
selinux_sb_remount---of 25
selinux_sb_show_options---of 24
selinux_sb_statfs---of 1
selinux_sctp_assoc_established---of 3
selinux_sctp_assoc_request---of 6
selinux_sctp_bind_connect---of 22
selinux_sctp_process_new_assoc---of 16
selinux_sctp_sk_clone---of 3
selinux_secctx_to_secid---of 1
selinux_secid_to_secctx---of 5
selinux_secmark_refcount_dec---of 3
selinux_secmark_refcount_inc---of 3
selinux_secmark_relabel_packet---of 1
selinux_sem_alloc_security---of 1
selinux_sem_associate---of 1
selinux_sem_semctl---of 16
selinux_sem_semop---of 1
selinux_set_mnt_opts---of 81
selinux_setprocattr---of 3
selinux_setselfattr---of 1
selinux_shm_alloc_security---of 1
selinux_shm_associate---of 1
selinux_shm_shmat---of 1
selinux_shm_shmctl---of 11
selinux_sk_alloc_security---of 1
selinux_sk_clone_security---of 1
selinux_sk_free_security---of 1
selinux_sk_getsecid---of 3
selinux_sock_graft---of 7
selinux_socket_accept---of 9
selinux_socket_bind---of 28
selinux_socket_connect---of 3
selinux_socket_connect_helper---of 23
selinux_socket_create---of 5
selinux_socket_getpeername---of 4
selinux_socket_getpeersec_dgram---of 14
selinux_socket_getpeersec_stream---of 9
selinux_socket_getsockname---of 4
selinux_socket_getsockopt---of 4
selinux_socket_listen---of 4
selinux_socket_post_create---of 10
selinux_socket_recvmsg---of 4
selinux_socket_sendmsg---of 4
selinux_socket_setsockopt---of 5
selinux_socket_shutdown---of 4
selinux_socket_sock_rcv_skb8%of 28
selinux_socket_socketpair---of 1
selinux_socket_unix_may_send---of 1
selinux_socket_unix_stream_connect---of 4
selinux_syslog---of 7
selinux_task_alloc---of 1
selinux_task_getioprio---of 1
selinux_task_getlsmprop_obj---of 1
selinux_task_getpgid---of 1
selinux_task_getscheduler---of 1
selinux_task_getsid---of 1
selinux_task_kill---of 8
selinux_task_movememory---of 1
selinux_task_prlimit---of 3
selinux_task_setioprio---of 1
selinux_task_setnice---of 1
selinux_task_setpgid---of 1
selinux_task_setrlimit---of 3
selinux_task_setscheduler---of 1
selinux_task_to_inode---of 5
selinux_tun_dev_alloc_security---of 1
selinux_tun_dev_attach---of 1
selinux_tun_dev_attach_queue---of 1
selinux_tun_dev_create---of 1
selinux_tun_dev_open---of 4
selinux_umount---of 1
selinux_uring_allowed---of 1
selinux_uring_cmd---of 3
selinux_uring_override_creds---of 1
selinux_uring_sqpoll---of 1
selinux_userns_create---of 1
selinux_vm_enough_memory100%of 1
selinux_watch_key---of 1
show_sid---of 4
socket_type_to_security_class---of 27
task_sid_obj---of 16
-----------
SUMMARY44%of 296

-----------
SUMMARY---of 0

__cpu_down_maps_locked---of 1
__cpuhp_remove_state---of 1
__cpuhp_remove_state_cpuslocked---of 15
__cpuhp_setup_state---of 1
__cpuhp_setup_state_cpuslocked---of 34
__cpuhp_state_add_instance---of 1
__cpuhp_state_add_instance_cpuslocked---of 22
__cpuhp_state_remove_instance---of 12
__probestub_cpuhp_enter---of 1
__probestub_cpuhp_exit---of 1
__probestub_cpuhp_multi_enter---of 1
__traceiter_cpuhp_enter---of 4
__traceiter_cpuhp_exit---of 4
__traceiter_cpuhp_multi_enter---of 4
_cpu_down---of 35
_cpu_up---of 31
active_show---of 3
add_cpu---of 1
arch_cpuhp_sync_state_poll---of 1
arch_smt_update---of 1
arch_thaw_secondary_cpus_begin---of 1
arch_thaw_secondary_cpus_end---of 1
bringup_cpu---of 15
bringup_hibernate_cpu---of 6
clear_tasks_mm_cpumask---of 23
control_show---of 4
control_store---of 16
cpu_device_down---of 1
cpu_device_up---of 1
cpu_down---of 7
cpu_hotplug_disable---of 1
cpu_hotplug_disable_offlining---of 1
cpu_hotplug_enable---of 4
cpu_hotplug_pm_callback---of 9
cpu_maps_update_begin---of 1
cpu_maps_update_done---of 1
cpu_mitigations_auto_nosmt---of 1
cpu_mitigations_off---of 1
cpu_smt_possible---of 1
cpu_up---of 14
cpuhp_ap_report_dead---of 1
cpuhp_complete_idle_dead---of 1
cpuhp_invoke_callback---of 24
cpuhp_issue_call---of 12
cpuhp_kick_ap---of 19
cpuhp_kick_ap_work---of 1
cpuhp_lock_acquire---of 2
cpuhp_lock_release---of 2
cpuhp_online_idle---of 3
cpuhp_report_idle_dead---of 3
cpuhp_reset_state---of 15
cpuhp_should_run---of 1
cpuhp_smt_disable---of 12
cpuhp_smt_enable---of 8
cpuhp_thread_fun---of 21
cpus_read_lock42%of 12
cpus_read_trylock---of 14
cpus_read_unlock36%of 14
cpus_write_lock---of 1
cpus_write_unlock---of 1
fail_show---of 1
fail_store---of 10
finish_cpu---of 7
freeze_secondary_cpus---of 22
init_cpu_possible---of 1
init_cpu_present---of 1
lockdep_acquire_cpus_lock---of 2
lockdep_assert_cpus_held40%of 5
lockdep_is_cpus_held---of 1
lockdep_release_cpus_lock---of 2
notify_cpu_starting---of 9
perf_trace_cpuhp_enter---of 6
perf_trace_cpuhp_exit---of 6
perf_trace_cpuhp_multi_enter---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remove_cpu---of 1
set_cpu_online---of 15
smp_shutdown_nonboot_cpus---of 18
state_show---of 1
states_show---of 5
take_cpu_down---of 8
takedown_cpu---of 18
target_show---of 1
target_store---of 12
thaw_secondary_cpus---of 10
trace_cpuhp_enter---of 17
trace_cpuhp_exit---of 17
trace_cpuhp_multi_enter---of 17
trace_event_raw_event_cpuhp_enter---of 7
trace_event_raw_event_cpuhp_exit---of 7
trace_event_raw_event_cpuhp_multi_enter---of 7
trace_raw_output_cpuhp_enter---of 3
trace_raw_output_cpuhp_exit---of 3
trace_raw_output_cpuhp_multi_enter---of 3
trace_suspend_resume---of 14
-----------
SUMMARY39%of 31

do_el0_svc100%of 1
do_el0_svc_compat---of 1
el0_svc_common25%of 8
invoke_syscall45%of 9
-----------
SUMMARY39%of 18

-----------
SUMMARY---of 0

__arm64_sys_mlock---of 1
__arm64_sys_mlock2---of 3
__arm64_sys_mlockall---of 24
__arm64_sys_munlock---of 13
__arm64_sys_munlockall---of 13
apply_mlockall_flags---of 11
apply_vma_lock_flags---of 11
can_do_mlock67%of 3
do_mlock---of 30
folio_evictable38%of 16
folio_is_file_lru---of 3
folio_lruvec_relock_irq36%of 17
folio_nr_pages---of 4
local_lock_acquire34%of 6
local_lock_release43%of 7
lru_gen_add_folio28%of 22
lru_gen_del_folio42%of 17
lru_gen_update_size22%of 32
mlock_drain_local100%of 5
mlock_drain_remote---of 7
mlock_fixup---of 9
mlock_folio45%of 20
mlock_folio_batch26%of 188
mlock_new_folio39%of 18
mlock_pte_range---of 69
mlock_vma_pages_range---of 16
munlock_folio46%of 11
need_mlock_drain---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
user_shm_lock---of 10
user_shm_unlock---of 1
vm_flags_reset---of 6
vma_start_write---of 6
-----------
SUMMARY32%of 366

-----------
SUMMARY---of 0

__blkcg_rstat_flush---of 21
__blkg_prfill_u64---of 4
__blkg_release---of 8
bio_associate_blkg---of 15
bio_associate_blkg_from_css---of 74
bio_blkcg_css---of 4
bio_clone_blkg_association---of 4
blk_cgroup_bio_start---of 9
blk_cgroup_congested34%of 15
blkcg_activate_policy---of 36
blkcg_add_delay---of 4
blkcg_css19%of 11
blkcg_css_alloc---of 50
blkcg_css_free---of 15
blkcg_css_offline---of 1
blkcg_css_online---of 7
blkcg_deactivate_policy---of 14
blkcg_exit---of 3
blkcg_exit_disk---of 20
blkcg_get_cgwb_list---of 1
blkcg_init_disk---of 8
blkcg_maybe_throttle_blkg---of 20
blkcg_maybe_throttle_current7%of 33
blkcg_pin_online---of 6
blkcg_policy_register---of 26
blkcg_policy_unregister---of 12
blkcg_print_blkgs---of 19
blkcg_print_stat---of 47
blkcg_reset_stats---of 27
blkcg_rstat_flush---of 3
blkcg_scale_delay---of 11
blkcg_schedule_throttle---of 12
blkcg_unpin_online---of 13
blkg_alloc---of 21
blkg_conf_exit---of 5
blkg_conf_exit_frozen---of 6
blkg_conf_init---of 1
blkg_conf_open_bdev---of 8
blkg_conf_open_bdev_frozen---of 4
blkg_conf_prep---of 62
blkg_create---of 92
blkg_destroy---of 37
blkg_dev_name---of 3
blkg_free_workfn---of 17
blkg_get---of 16
blkg_init_queue---of 1
blkg_release---of 1
blkg_tryget---of 19
local_lock_release---of 7
percpu_ref_put---of 18
radix_tree_preload_end---of 3
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY21%of 63

-----------
SUMMARY---of 0

should_fail_usercopy100%of 1
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

count_shadow_nodes---of 8
folio_memcg---of 10
rcu_lock_acquire---of 2
rcu_lock_release---of 2
scan_shadow_nodes---of 1
shadow_lru_isolate---of 12
workingset_activation---of 26
workingset_age_nonresident---of 10
workingset_eviction---of 49
workingset_refault---of 96
workingset_test_recent---of 82
workingset_update_node40%of 10
-----------
SUMMARY40%of 10

rcu_lock_acquire---of 2
rcu_lock_release---of 2
tomoyo_init_log---of 77
tomoyo_poll_log---of 5
tomoyo_read_log---of 7
tomoyo_write_log---of 1
tomoyo_write_log210%of 22
-----------
SUMMARY10%of 22

__pim_rcv---of 21
_inline_copy_from_user---of 8
_inline_copy_to_user---of 7
_ipmr_fill_mroute---of 1
copy_from_sockptr---of 4
copy_to_sockptr---of 11
dev_hold---of 6
dev_put---of 6
ip_encap---of 12
ip_mr_forward---of 48
ip_mr_input---of 37
ip_mroute_getsockopt---of 14
ip_mroute_setsockopt---of 67
ipmr_cache_free_rcu---of 1
ipmr_cache_report---of 34
ipmr_cache_unresolved---of 18
ipmr_compat_ioctl---of 38
ipmr_destroy_unres---of 8
ipmr_device_event20%of 10
ipmr_dump---of 1
ipmr_expire_process---of 9
ipmr_fill_mroute---of 11
ipmr_forward_finish---of 10
ipmr_free_table---of 4
ipmr_get_route---of 47
ipmr_get_table---of 14
ipmr_hash_cmp---of 3
ipmr_init_vif_indev---of 11
ipmr_ioctl---of 34
ipmr_mfc_add---of 139
ipmr_mfc_delete---of 22
ipmr_mfc_seq_show---of 9
ipmr_mfc_seq_start---of 4
ipmr_mr_table_iter---of 1
ipmr_net_exit---of 1
ipmr_net_exit_batch---of 4
ipmr_net_init---of 17
ipmr_new_table---of 6
ipmr_new_table_set---of 3
ipmr_queue_xmit---of 66
ipmr_rt_fib_lookup---of 9
ipmr_rtm_dumplink---of 55
ipmr_rtm_dumproute---of 13
ipmr_rtm_getroute---of 63
ipmr_rtm_route---of 39
ipmr_rule_action---of 8
ipmr_rule_compare---of 1
ipmr_rule_configure---of 1
ipmr_rule_default---of 3
ipmr_rule_fill---of 1
ipmr_rule_match---of 1
ipmr_rules_dump---of 1
ipmr_rules_exit---of 9
ipmr_seq_read---of 1
ipmr_sk_ioctl---of 4
ipmr_vif_seq_show---of 7
ipmr_vif_seq_start---of 16
ipmr_vif_seq_stop---of 6
mr_mfc_seq_stop---of 8
mroute_clean_tables---of 30
mroute_netlink_event---of 4
mrtsock_destruct---of 11
nf_reset_ct---of 7
nlmsg_parse_deprecated_strict---of 4
pim_rcv---of 15
pim_rcv_v1---of 13
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
reg_vif_get_iflink---of 1
reg_vif_setup---of 1
reg_vif_xmit---of 16
rhltable_remove---of 78
rht_assign_unlock---of 7
rht_lock---of 12
rht_unlock---of 10
vif_add---of 67
vif_delete---of 31
-----------
SUMMARY20%of 10

__hugetlb_cgroup_charge_cgroup19%of 55
__hugetlb_cgroup_commit_charge---of 11
__hugetlb_cgroup_uncharge_folio---of 13
css_put27%of 19
hugetlb_cgroup_charge_cgroup100%of 1
hugetlb_cgroup_charge_cgroup_rsvd100%of 1
hugetlb_cgroup_commit_charge---of 1
hugetlb_cgroup_commit_charge_rsvd---of 8
hugetlb_cgroup_css_alloc---of 30
hugetlb_cgroup_css_free---of 6
hugetlb_cgroup_css_offline---of 20
hugetlb_cgroup_migrate---of 12
hugetlb_cgroup_read_numa_stat---of 28
hugetlb_cgroup_read_u64---of 10
hugetlb_cgroup_read_u64_max---of 8
hugetlb_cgroup_reset---of 6
hugetlb_cgroup_uncharge_cgroup50%of 4
hugetlb_cgroup_uncharge_cgroup_rsvd50%of 4
hugetlb_cgroup_uncharge_counter---of 6
hugetlb_cgroup_uncharge_file_region---of 8
hugetlb_cgroup_uncharge_folio---of 1
hugetlb_cgroup_uncharge_folio_rsvd---of 8
hugetlb_cgroup_write---of 6
hugetlb_cgroup_write_dfl---of 1
hugetlb_cgroup_write_legacy---of 1
hugetlb_events_local_show---of 1
hugetlb_events_show---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY29%of 88

-----------
SUMMARY---of 0

vfio_create_group---of 12
vfio_device_block_group---of 3
vfio_device_group_register---of 3
vfio_device_group_unregister---of 3
vfio_device_group_unuse_iommu---of 6
vfio_device_group_use_iommu---of 6
vfio_device_has_container---of 1
vfio_device_remove_group---of 10
vfio_device_set_group---of 22
vfio_device_unblock_group---of 1
vfio_devnode---of 3
vfio_df_group_close---of 3
vfio_file_has_dev---of 4
vfio_file_iommu_group---of 1
vfio_file_is_group---of 3
vfio_group_cleanup---of 3
vfio_group_enforced_coherent---of 4
vfio_group_fops_open---of 8
vfio_group_fops_release---of 5
vfio_group_fops_unl_ioctl---of 94
vfio_group_from_file67%of 3
vfio_group_release---of 5
vfio_group_set_kvm---of 1
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__ip_do_redirect---of 47
__ip_rt_update_pmtu---of 31
__ip_select_ident---of 10
__ipv4_neigh_lookup---of 27
__ipv6_neigh_lookup_noref_stub---of 12
dst_discard---of 1
fib_dump_info_fnhe---of 60
fib_lookup---of 29
fib_multipath_hash---of 75
find_exception---of 53
fnhe_flush_routes---of 15
inet_iif---of 8
inet_rtm_getroute---of 131
ip_do_redirect---of 4
ip_error---of 38
ip_handle_martian_source---of 8
ip_mc_validate_source---of 13
ip_mkroute_input---of 61
ip_mtu_from_fib_result---of 14
ip_neigh_gw4---of 12
ip_neigh_gw6---of 3
ip_route_input_noref---of 11
ip_route_input_rcu---of 142
ip_route_output_flow---of 4
ip_route_output_key_hash---of 11
ip_route_output_key_hash_rcu---of 107
ip_route_use_hint---of 25
ip_rt_bug---of 3
ip_rt_do_proc_exit---of 1
ip_rt_do_proc_init---of 5
ip_rt_get_source---of 14
ip_rt_multicast_event---of 3
ip_rt_send_redirect---of 46
ip_rt_update_pmtu---of 29
ipv4_blackhole_route---of 11
ipv4_confirm_neigh---of 40
ipv4_cow_metrics---of 1
ipv4_default_advmss---of 16
ipv4_dst_check---of 3
ipv4_dst_destroy---of 10
ipv4_inetpeer_exit---of 1
ipv4_inetpeer_init---of 3
ipv4_link_failure---of 39
ipv4_mtu---of 27
ipv4_negative_advice---of 5
ipv4_neigh_lookup---of 22
ipv4_redirect---of 3
ipv4_sk_redirect---of 6
ipv4_sk_update_pmtu---of 59
ipv4_sysctl_rtcache_flush---of 6
ipv4_update_pmtu---of 5
l3mdev_master_dev_rcu---of 5
lwtstate_get---of 4
neigh_event_send---of 5
neigh_release---of 6
netns_ip_rt_init---of 1
nexthop_fib_nhc---of 18
nexthop_num_path---of 10
nlmsg_parse_deprecated_strict---of 4
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rt_acct_proc_show---of 6
rt_add_uncached_list---of 3
rt_bind_exception---of 21
rt_cache_flush67%of 3
rt_cache_route---of 14
rt_cache_seq_next---of 1
rt_cache_seq_show---of 3
rt_cache_seq_start---of 1
rt_cache_seq_stop---of 1
rt_cpu_seq_next---of 7
rt_cpu_seq_show---of 3
rt_cpu_seq_start---of 8
rt_cpu_seq_stop---of 1
rt_del_uncached_list---of 4
rt_dst_alloc---of 4
rt_dst_clone---of 8
rt_fill_info---of 60
rt_flush_dev---of 26
rt_genid_init---of 1
rt_is_expired---of 16
rt_set_nexthop---of 31
skb_dst_set_noref---of 4
skb_header_pointer---of 3
sysctl_route_net_exit---of 3
sysctl_route_net_init---of 9
update_or_create_fnhe---of 79
-----------
SUMMARY67%of 3

__iterate_supers---of 24
alloc_super---of 18
bdev_read_only---of 4
bdev_super_lock---of 16
deactivate_locked_super---of 5
deactivate_super---of 6
destroy_super_rcu---of 1
destroy_super_work---of 7
do_emergency_remount---of 1
do_emergency_remount_callback---of 8
do_thaw_all---of 1
do_thaw_all_callback---of 7
drop_super---of 1
drop_super_exclusive---of 1
emergency_remount---of 3
emergency_thaw_all---of 3
free_anon_bdev---of 1
freeze_inc---of 9
freeze_super---of 58
fs_bdev_freeze---of 18
fs_bdev_mark_dead---of 6
fs_bdev_sync---of 3
fs_bdev_thaw---of 17
generic_shutdown_super---of 11
get_anon_bdev---of 3
get_tree_bdev---of 1
get_tree_bdev_flags---of 13
get_tree_keyed---of 6
get_tree_nodev---of 6
get_tree_single---of 6
grab_super---of 11
iterate_supers---of 26
iterate_supers_type---of 25
kill_anon_super---of 1
kill_block_super---of 3
kill_litter_super---of 3
kill_super_notify---of 8
lockdep_sb_freeze_acquire---of 3
lockdep_sb_freeze_release---of 3
mount_bdev---of 10
mount_capable---of 3
mount_nodev---of 5
put_super---of 10
reconfigure_super---of 27
retire_super---of 5
sb_init_dio_done_wq---of 6
set_anon_super---of 3
set_anon_super_fc---of 3
set_bdev_super---of 1
setup_bdev_super---of 18
sget---of 19
sget_dev---of 1
sget_fc---of 27
super_cache_count---of 7
super_cache_scan---of 9
super_lock34%of 15
super_s_dev_set---of 1
super_s_dev_test---of 3
super_setup_bdi---of 3
super_setup_bdi_name---of 5
super_trylock_shared---of 5
test_bdev_super---of 3
test_keyed_super---of 1
test_single_super---of 1
thaw_super67%of 3
thaw_super_locked12%of 17
user_get_super---of 16
vfs_get_tree---of 9
-----------
SUMMARY26%of 35

-----------
SUMMARY---of 0

kvm_prepare_system_event40%of 5
kvm_psci_0_2_call49%of 29
kvm_psci_1_x_call13%of 41
kvm_psci_call44%of 16
kvm_psci_vcpu_on28%of 11
-----------
SUMMARY31%of 102

-----------
SUMMARY---of 0

__efi_fpsimd_begin---of 14
__efi_fpsimd_end---of 6
cpu_enable_fpmr---of 1
cpu_enable_fpsimd---of 1
cpu_enable_sve---of 1
do_fpsimd_acc---of 3
do_fpsimd_exc---of 7
do_sme_acc---of 1
do_sve_acc---of 15
find_supported_vector_length---of 5
fpsimd_bind_state_to_cpu---of 6
fpsimd_bind_task_to_cpu50%of 8
fpsimd_cpu_dead---of 1
fpsimd_cpu_pm_notifier---of 4
fpsimd_flush_task_state---of 4
fpsimd_flush_thread---of 8
fpsimd_flush_thread_vl---of 12
fpsimd_force_sync_to_sve---of 4
fpsimd_preserve_current_state---of 3
fpsimd_release_task---of 1
fpsimd_restore_current_state43%of 7
fpsimd_save_and_flush_cpu_state47%of 15
fpsimd_save_user_state38%of 16
fpsimd_signal_preserve_current_state43%of 7
fpsimd_sync_to_sve---of 5
fpsimd_thread_switch---of 21
fpsimd_update_current_state---of 8
kernel_neon_begin---of 20
kernel_neon_end---of 9
local_bh_disable100%of 2
local_bh_enable100%of 2
sve_alloc---of 6
sve_get_current_vl---of 4
sve_set_current_vl---of 5
sve_state_size---of 3
sve_sync_from_fpsimd_zeropad---of 4
sve_sync_to_fpsimd---of 5
task_fpsimd_load28%of 29
task_get_vl100%of 1
task_get_vl_onexec---of 1
task_set_vl---of 1
task_set_vl_onexec---of 1
vec_probe_vqs---of 10
vec_proc_do_default_vl---of 7
vec_set_vector_length---of 31
vec_update_vq_map---of 1
vec_verify_vq_map---of 9
-----------
SUMMARY42%of 87

-----------
SUMMARY---of 0

__anon_vma_interval_tree_augment_rotate100%of 5
anon_vma_interval_tree_insert100%of 7
anon_vma_interval_tree_iter_first---of 10
anon_vma_interval_tree_iter_next---of 14
anon_vma_interval_tree_remove92%of 36
anon_vma_interval_tree_verify60%of 5
vma_interval_tree_augment_rotate100%of 5
vma_interval_tree_insert100%of 7
vma_interval_tree_insert_after---of 10
vma_interval_tree_iter_first30%of 10
vma_interval_tree_iter_next22%of 14
vma_interval_tree_remove98%of 36
-----------
SUMMARY81%of 125

__security_genfs_sid27%of 19
aurule_avc_callback---of 3
constraint_expr_eval22%of 55
context_destroy---of 1
context_struct_compute_av41%of 59
context_struct_to_string34%of 9
dump_masked_av_helper---of 3
get_classes_callback---of 1
get_permissions_callback---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
security_bounded_transition---of 31
security_change_sid---of 1
security_compute_av32%of 45
security_compute_av_user---of 26
security_compute_sid22%of 82
security_compute_validatetrans---of 41
security_compute_xperms_decision---of 52
security_context_str_to_sid---of 1
security_context_to_sid---of 1
security_context_to_sid_core---of 37
security_context_to_sid_default---of 1
security_context_to_sid_force---of 1
security_dump_masked_av---of 13
security_fs_use---of 35
security_genfs_sid27%of 19
security_get_allow_unknown---of 17
security_get_bool_value---of 19
security_get_bools---of 12
security_get_classes---of 6
security_get_initial_sid_context---of 3
security_get_permissions---of 11
security_get_reject_unknown---of 17
security_get_user_sids---of 60
security_ib_endport_sid---of 30
security_ib_pkey_sid---of 31
security_is_socket_class4%of 61
security_load_policy---of 45
security_load_policycaps---of 14
security_member_sid---of 1
security_mls_enabled---of 17
security_net_peersid_resolve---of 27
security_netif_sid---of 32
security_netlbl_secattr_to_sid---of 34
security_netlbl_sid_to_secattr---of 20
security_node_sid---of 40
security_policycap_supported---of 17
security_port_sid---of 31
security_read_policy---of 10
security_read_state_kernel---of 10
security_set_bools---of 15
security_sid_mls_copy---of 33
security_sid_to_context100%of 1
security_sid_to_context_core35%of 32
security_sid_to_context_force100%of 1
security_sid_to_context_inval100%of 1
security_sidtab_hash_stats---of 17
security_transition_sid100%of 3
security_transition_sid_user---of 1
security_validate_transition---of 1
security_validate_transition_user---of 1
selinux_audit_rule_free---of 3
selinux_audit_rule_init---of 60
selinux_audit_rule_known---of 7
selinux_audit_rule_match---of 59
selinux_notify_policy_change---of 8
selinux_policy_cancel---of 8
selinux_policy_commit---of 15
selinux_policy_genfs_sid---of 1
services_compute_xperms_decision---of 17
services_compute_xperms_drivers---of 5
services_convert_context---of 23
string_to_context_struct---of 15
update_xperms_extended_data---of 5
-----------
SUMMARY27%of 391

arch_bpf_unwind_consume_entry---of 1
arch_irqs_disabled_flags100%of 1
arch_local_save_flags100%of 1
arch_stack_walk_user---of 20
dump_backtrace---of 64
on_task_stack---of 1
preempt_count100%of 1
show_stack---of 1
stackinfo_get_efi---of 1
stackinfo_get_irq100%of 1
stackinfo_get_overflow100%of 1
stackinfo_get_sdei_critical---of 1
stackinfo_get_sdei_normal---of 1
stackinfo_get_task100%of 1
unwind_consume_stack100%of 1
unwind_find_stack37%of 11
unwind_init_common100%of 1
-----------
SUMMARY64%of 19

__arm64_compat_sys_execve---of 1
__arm64_compat_sys_execveat---of 1
__arm64_sys_execve---of 1
__arm64_sys_execveat---of 1
__register_binfmt---of 5
__set_task_comm---of 14
acct_arg_size---of 3
alloc_bprm---of 41
begin_new_exec---of 56
bprm_change_interp---of 3
bprm_execve---of 79
cgroup_threadgroup_change_begin---of 12
cgroup_threadgroup_change_end---of 14
copy_string_kernel---of 9
copy_strings---of 31
de_thread---of 23
do_execveat_common---of 34
do_open_execat---of 18
exec_mmap---of 26
finalize_exec---of 1
free_bprm---of 14
get_arg_page---of 8
get_user_arg_ptr---of 13
kernel_execve---of 39
kmap_local_page---of 35
list_replace_rcu---of 1
open_exec---of 3
path_noexec67%of 3
proc_dointvec_minmax_coredump---of 3
put_arg_page---of 9
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
remove_arg_zero---of 11
set_binfmt---of 5
set_dumpable---of 6
setup_arg_pages---of 30
setup_new_exec---of 3
unregister_binfmt---of 3
unshare_sighand---of 4
would_dump---of 19
-----------
SUMMARY67%of 3

-----------
SUMMARY---of 0

__xfrm_mode_beet_prep---of 18
__xfrm_mode_tunnel_prep---of 16
__xfrm_transport_prep---of 18
dev_hold---of 7
dev_put---of 6
netdev_put---of 6
netdev_tracker_alloc---of 5
rcu_lock_acquire---of 2
rcu_lock_release---of 2
validate_xmit_xfrm---of 49
xfrm_dev_backlog---of 9
xfrm_dev_event12%of 18
xfrm_dev_offload_ok---of 28
xfrm_dev_policy_add---of 33
xfrm_dev_resume---of 22
xfrm_dev_state_add---of 35
xfrm_outer_mode_prep---of 13
xmit_xfrm_check_overflow---of 6
-----------
SUMMARY12%of 18

-----------
SUMMARY---of 0

__arm64_sys_fgetxattr---of 1
__arm64_sys_flistxattr---of 7
__arm64_sys_fremovexattr---of 1
__arm64_sys_fsetxattr---of 1
__arm64_sys_getxattr---of 3
__arm64_sys_getxattrat---of 9
__arm64_sys_lgetxattr---of 3
__arm64_sys_listxattr---of 1
__arm64_sys_listxattrat---of 1
__arm64_sys_llistxattr---of 1
__arm64_sys_lremovexattr---of 1
__arm64_sys_lsetxattr---of 1
__arm64_sys_removexattr---of 1
__arm64_sys_removexattrat---of 1
__arm64_sys_setxattr---of 1
__arm64_sys_setxattrat---of 8
__vfs_getxattr25%of 20
__vfs_removexattr---of 20
__vfs_removexattr_locked---of 12
__vfs_setxattr---of 20
__vfs_setxattr_locked---of 10
__vfs_setxattr_noperm---of 22
_inline_copy_from_user---of 8
do_getxattr---of 21
file_getxattr---of 4
file_setxattr---of 8
filename_getxattr---of 5
filename_setxattr---of 12
fsnotify_xattr---of 8
generic_listxattr---of 11
import_xattr_name---of 1
listxattr---of 23
may_write_xattr---of 5
path_getxattrat---of 13
path_listxattrat---of 16
path_removexattrat---of 28
path_setxattrat---of 16
setxattr_copy---of 7
simple_xattr_add50%of 4
simple_xattr_alloc50%of 4
simple_xattr_free---of 3
simple_xattr_get34%of 9
simple_xattr_list---of 10
simple_xattr_set---of 19
simple_xattr_space100%of 1
simple_xattrs_free38%of 8
simple_xattrs_init100%of 1
vfs_getxattr---of 10
vfs_getxattr_alloc---of 22
vfs_listxattr---of 5
vfs_removexattr---of 8
vfs_setxattr---of 14
xattr_full_name67%of 3
xattr_list_one---of 4
xattr_permission---of 14
xattr_supports_user_prefix---of 8
-----------
SUMMARY38%of 50

-----------
SUMMARY---of 0

kvm_trng_call62%of 18
-----------
SUMMARY62%of 18

-----------
SUMMARY---of 0

cmp_ex_search100%of 1
cmp_ex_sort---of 1
search_extable100%of 1
sort_extable---of 1
swap_ex---of 1
trim_init_extable---of 14
-----------
SUMMARY100%of 2

-----------
SUMMARY---of 0

cgrp_css_alloc---of 1
cgrp_css_free---of 1
cgrp_css_online---of 15
net_prio_attach---of 4
netprio_device_event23%of 9
netprio_set_prio---of 29
rcu_lock_acquire---of 2
rcu_lock_release---of 2
read_prioidx---of 1
read_priomap---of 23
update_netprio---of 3
write_priomap---of 7
-----------
SUMMARY23%of 9

____napi_schedule---of 22
__dev_change_flags---of 19
__dev_change_net_namespace---of 109
__dev_close_many---of 35
__dev_direct_xmit---of 20
__dev_forward_skb---of 1
__dev_forward_skb2---of 18
__dev_get_by_flags---of 7
__dev_get_by_index---of 5
__dev_get_by_name---of 3
__dev_notify_flags---of 20
__dev_open---of 41
__dev_queue_xmit---of 231
__dev_remove_pack---of 14
__dev_set_mtu---of 3
__dev_set_promiscuity---of 14
__dev_set_rx_mode---of 10
__get_xps_queue_idx---of 14
__napi_busy_loop---of 55
__napi_poll---of 17
__napi_schedule---of 5
__napi_schedule_irqoff---of 1
__netdev_adjacent_dev_insert---of 31
__netdev_adjacent_dev_remove---of 22
__netdev_adjacent_dev_unlink_neighbour---of 1
__netdev_notify_peers---of 14
__netdev_printk---of 14
__netdev_put_lock---of 11
__netdev_update_features16%of 144
__netdev_update_lower_level---of 15
__netdev_update_upper_level---of 8
__netdev_upper_dev_link---of 45
__netdev_upper_dev_unlink---of 75
__netdev_walk_all_lower_dev---of 15
__netdev_walk_all_upper_dev---of 10
__netif_napi_del_locked---of 19
__netif_receive_skb58%of 7
__netif_receive_skb_core14%of 271
__netif_receive_skb_list_core---of 23
__netif_rx---of 4
__netif_schedule---of 10
__netif_set_xps_queue---of 141
alloc_netdev_dummy---of 1
alloc_netdev_mqs---of 30
backlog_napi_setup---of 6
backlog_napi_should_run---of 1
bpf_prog_run_generic_xdp---of 42
bpf_xdp_link_attach---of 33
bpf_xdp_link_dealloc---of 1
bpf_xdp_link_detach---of 1
bpf_xdp_link_fill_link_info---of 3
bpf_xdp_link_release---of 23
bpf_xdp_link_show_fdinfo---of 3
bpf_xdp_link_update---of 23
busy_poll_stop---of 23
call_netdevice_notifiers---of 6
call_netdevice_notifiers_info---of 6
call_netdevice_notifiers_mtu---of 6
call_netdevice_register_net_notifiers---of 19
clean_xps_maps---of 25
default_device_exit_batch---of 42
deliver_ptype_list_skb---of 16
dev_add_pack---of 12
dev_alloc_name---of 1
dev_change_xdp_fd---of 17
dev_close_many---of 18
dev_cpu_dead---of 31
dev_fetch_sw_netstats75%of 4
dev_fill_forward_path---of 10
dev_fill_metadata_dst---of 22
dev_forward_skb---of 3
dev_forward_skb_nomtu---of 3
dev_get_alias28%of 18
dev_get_by_index---of 19
dev_get_by_index_rcu---of 5
dev_get_by_name---of 17
dev_get_by_name_rcu---of 5
dev_get_by_napi_id---of 8
dev_get_flags67%of 3
dev_get_iflink50%of 4
dev_get_mac_address---of 14
dev_get_min_mp_channel_count---of 12
dev_get_phys_port_id67%of 3
dev_get_phys_port_name50%of 4
dev_get_port_parent_id22%of 14
dev_get_stats34%of 12
dev_get_tstats64---of 1
dev_getbyhwaddr---of 10
dev_getbyhwaddr_rcu---of 7
dev_getfirstbyhwtype---of 19
dev_hard_start_xmit---of 41
dev_index_reserve---of 6
dev_ingress_queue_create---of 11
dev_kfree_skb_any_reason---of 4
dev_kfree_skb_irq_reason---of 11
dev_loopback_xmit---of 14
dev_nit_active_rcu---of 6
dev_pick_tx_zero---of 1
dev_pre_changeaddr_notify---of 6
dev_prep_valid_name---of 26
dev_qdisc_enqueue---of 15
dev_queue_xmit_nit---of 55
dev_remove_pack---of 4
dev_set_promiscuity---of 4
dev_set_rx_mode---of 10
dev_set_threaded---of 21
dev_valid_name---of 15
dev_validate_mtu---of 8
dev_xdp_attach---of 88
dev_xdp_install---of 20
dev_xdp_prog_count---of 10
dev_xdp_prog_id67%of 3
dev_xdp_sb_prog_count---of 7
do_netdev_rx_csum_fault---of 1
do_xdp_generic---of 42
enqueue_to_backlog---of 40
flush_backlog---of 23
free_netdev---of 25
generic_xdp_install---of 15
generic_xdp_tx---of 11
get_rps_cpu---of 44
init_dummy_netdev---of 5
is_skb_forwardable---of 4
kick_defer_list_purge---of 7
list_netdevice---of 13
local_bh_disable---of 2
local_bh_enable---of 2
local_lock_acquire---of 6
local_lock_release---of 7
napi_busy_loop---of 11
napi_busy_loop_rcu---of 1
napi_complete_done---of 28
napi_disable---of 1
napi_disable_locked---of 20
napi_enable---of 1
napi_enable_locked---of 19
napi_hash_add---of 10
napi_resume_irqs---of 21
napi_schedule---of 7
napi_schedule_prep---of 6
napi_suspend_irqs---of 16
napi_thread_wait---of 14
napi_threaded_poll---of 4
napi_threaded_poll_loop---of 32
napi_watchdog---of 7
net_dec_egress_queue---of 1
net_dec_ingress_queue---of 1
net_disable_timestamp---of 7
net_enable_timestamp---of 7
net_inc_egress_queue---of 1
net_inc_ingress_queue---of 1
net_rx_action---of 52
net_tx_action---of 49
netdev_adjacent_change_abort---of 11
netdev_adjacent_change_commit---of 10
netdev_adjacent_change_prepare---of 20
netdev_adjacent_get_private---of 1
netdev_adjacent_rename_links---of 11
netdev_alert---of 1
netdev_bind_sb_channel_queue---of 9
netdev_bonding_info_change---of 6
netdev_change_features---of 1
netdev_change_proto_down_reason_locked---of 5
netdev_cmd_to_name---of 3
netdev_copy_name46%of 11
netdev_core_pick_tx---of 11
netdev_core_stats_alloc---of 6
netdev_core_stats_inc43%of 7
netdev_crit---of 1
netdev_drivername---of 5
netdev_emerg---of 1
netdev_err---of 1
netdev_exit---of 4
netdev_features_change---of 6
netdev_get_by_index---of 6
netdev_get_by_index_lock---of 3
netdev_get_by_name---of 6
netdev_get_name---of 15
netdev_get_xmit_slave---of 3
netdev_has_any_upper_dev---of 4
netdev_has_upper_dev---of 14
netdev_has_upper_dev_all_rcu---of 11
netdev_hold---of 7
netdev_increment_features---of 1
netdev_info---of 1
netdev_init---of 4
netdev_is_rx_handler_busy---of 10
netdev_lock_ops---of 5
netdev_lower_dev_get_private---of 6
netdev_lower_get_first_private_rcu---of 4
netdev_lower_get_next---of 3
netdev_lower_get_next_private---of 3
netdev_lower_get_next_private_rcu---of 6
netdev_lower_state_changed---of 9
netdev_master_upper_dev_get43%of 7
netdev_master_upper_dev_get_rcu40%of 5
netdev_master_upper_dev_link---of 1
netdev_name_in_use---of 1
netdev_name_node_add---of 4
netdev_name_node_alt_create---of 5
netdev_name_node_alt_destroy---of 8
netdev_name_node_alt_free---of 1
netdev_name_node_lookup---of 5
netdev_napi_by_id_lock---of 47
netdev_next_lower_dev_rcu---of 3
netdev_notice---of 1
netdev_notify_peers---of 1
netdev_offload_xstats_disable---of 14
netdev_offload_xstats_enable---of 15
netdev_offload_xstats_enabled---of 6
netdev_offload_xstats_get---of 22
netdev_offload_xstats_push_delta---of 7
netdev_offload_xstats_report_delta---of 1
netdev_offload_xstats_report_used---of 1
netdev_pick_tx---of 51
netdev_port_same_parent_id---of 6
netdev_printk---of 1
netdev_refcnt_read---of 1
netdev_reg_state---of 4
netdev_reset_tc---of 9
netdev_run_todo9%of 61
netdev_rx_csum_fault---of 3
netdev_rx_handler_register---of 4
netdev_rx_handler_unregister---of 7
netdev_set_default_ethtool_ops---of 3
netdev_set_num_tc---of 10
netdev_set_sb_channel---of 4
netdev_set_tc_queue---of 5
netdev_sk_get_lowest_dev---of 6
netdev_stats_to_stats64100%of 1
netdev_sw_irq_coalesce_default_on---of 15
netdev_txq_to_tc---of 18
netdev_unbind_sb_channel---of 9
netdev_unlock_ops---of 5
netdev_update_features67%of 3
netdev_upper_dev_link---of 1
netdev_upper_dev_unlink---of 1
netdev_upper_get_next_dev_rcu---of 6
netdev_walk_all_lower_dev---of 9
netdev_walk_all_lower_dev_rcu---of 9
netdev_walk_all_upper_dev_rcu---of 12
netdev_warn---of 1
netdev_xa_find_lock---of 25
netdev_xmit_skip_txqueue---of 1
netif_change_carrier---of 4
netif_change_flags---of 3
netif_change_name---of 24
netif_change_proto_down---of 5
netif_change_tx_queue_len---of 11
netif_close---of 6
netif_device_attach---of 8
netif_device_detach---of 10
netif_disable_lro---of 19
netif_enable_cpu_rmap---of 4
netif_get_num_default_rss_queues---of 8
netif_inherit_tso_max---of 8
netif_napi_add_weight_locked---of 31
netif_napi_affinity_release---of 10
netif_napi_irq_notify---of 7
netif_napi_set_irq_locked---of 24
netif_open---of 8
netif_queue_set_napi---of 17
netif_receive_skb23%of 48
netif_receive_skb_core---of 13
netif_receive_skb_list---of 32
netif_receive_skb_list_internal---of 47
netif_reset_xps_queues_gt---of 4
netif_rx---of 5
netif_rx_internal---of 31
netif_schedule_queue---of 17
netif_set_affinity_auto---of 8
netif_set_alias---of 12
netif_set_allmulti---of 12
netif_set_group---of 1
netif_set_mac_address---of 13
netif_set_mtu---of 5
netif_set_mtu_ext---of 21
netif_set_real_num_queues---of 17
netif_set_real_num_rx_queues---of 17
netif_set_real_num_tx_queues---of 39
netif_set_tso_max_segs---of 3
netif_set_tso_max_size---of 6
netif_set_xps_queue---of 1
netif_skb_features---of 38
netif_stacked_transfer_operstate---of 19
netif_state_change32%of 16
netif_tx_stop_all_queues---of 6
netif_tx_wake_queue---of 19
netif_xdp_propagate---of 11
netstamp_clear---of 5
passthru_features_check---of 1
process_backlog---of 58
qdisc_run---of 12
qdisc_run_end---of 4
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
refcount_dec_and_test---of 6
register_netdev---of 3
register_netdevice---of 68
register_netdevice_notifier---of 13
register_netdevice_notifier_dev_net---of 7
register_netdevice_notifier_net---of 5
remove_xps_queue---of 12
rps_may_expire_flow---of 21
rps_trigger_softirq---of 1
rtnl_net_dev_lock---of 23
run_backlog_napi---of 1
set_rps_cpu---of 17
skb_checksum_help---of 18
skb_crc32c_csum_help---of 10
skb_csum_hwoffload_help---of 33
skb_dst_force---of 17
skb_header_pointer---of 4
skb_network_protocol---of 28
skb_warn_bad_offload---of 6
synchronize_net---of 4
tc_run---of 23
tcx_dec---of 1
tcx_inc---of 1
trace_kfree_skb---of 14
trace_napi_poll---of 14
trace_netif_rx_entry---of 14
trace_netif_rx_exit---of 14
trace_xdp_exception---of 14
trigger_rx_softirq---of 1
unlist_netdevice---of 15
unregister_netdev---of 6
unregister_netdevice_many---of 1
unregister_netdevice_many_notify---of 145
unregister_netdevice_notifier---of 10
unregister_netdevice_notifier_dev_net---of 9
unregister_netdevice_notifier_net---of 7
unregister_netdevice_queue---of 10
validate_xmit_skb---of 49
validate_xmit_skb_list---of 7
write_seqlock_bh---of 1
write_sequnlock_bh---of 1
-----------
SUMMARY20%of 650

-----------
SUMMARY---of 0

__get_next_timer_interrupt---of 37
__mod_timer18%of 39
__probestub_hrtimer_cancel---of 1
__probestub_hrtimer_expire_entry---of 1
__probestub_hrtimer_expire_exit---of 1
__probestub_hrtimer_setup---of 1
__probestub_hrtimer_start---of 1
__probestub_itimer_expire---of 1
__probestub_itimer_state---of 1
__probestub_tick_stop---of 1
__probestub_timer_base_idle---of 1
__probestub_timer_cancel---of 1
__probestub_timer_expire_entry---of 1
__probestub_timer_expire_exit---of 1
__probestub_timer_init---of 1
__probestub_timer_start---of 1
__round_jiffies---of 1
__round_jiffies_relative---of 1
__round_jiffies_up---of 1
__round_jiffies_up_relative---of 1
__run_timer_base---of 27
__timer_delete56%of 9
__timer_delete_sync62%of 13
__traceiter_hrtimer_cancel---of 4
__traceiter_hrtimer_expire_entry---of 4
__traceiter_hrtimer_expire_exit---of 4
__traceiter_hrtimer_setup---of 4
__traceiter_hrtimer_start---of 4
__traceiter_itimer_expire---of 4
__traceiter_itimer_state---of 4
__traceiter_tick_stop---of 4
__traceiter_timer_base_idle---of 4
__traceiter_timer_cancel---of 4
__traceiter_timer_expire_entry---of 4
__traceiter_timer_expire_exit---of 4
__traceiter_timer_init---of 4
__traceiter_timer_start---of 4
__try_to_del_timer_sync34%of 9
add_timer---of 3
add_timer_global67%of 3
add_timer_local---of 3
add_timer_on---of 11
calc_wheel_index30%of 10
call_timer_fn---of 36
destroy_timer_on_stack---of 1
detach_timer34%of 21
enqueue_timer32%of 25
fetch_next_timer_interrupt---of 15
fetch_next_timer_interrupt_remote---of 7
get_next_timer_interrupt---of 1
init_timer_key27%of 19
init_timer_on_stack_key---of 3
mod_timer---of 1
mod_timer_pending---of 1
perf_trace_hrtimer_class---of 6
perf_trace_hrtimer_expire_entry---of 6
perf_trace_hrtimer_setup---of 6
perf_trace_hrtimer_start---of 6
perf_trace_itimer_expire---of 7
perf_trace_itimer_state---of 6
perf_trace_tick_stop---of 6
perf_trace_timer_base_idle---of 6
perf_trace_timer_class---of 6
perf_trace_timer_expire_entry---of 6
perf_trace_timer_start---of 6
round_jiffies---of 1
round_jiffies_relative---of 1
round_jiffies_up---of 1
round_jiffies_up_relative---of 1
run_timer_softirq---of 3
stub_timer---of 1
timer_base_is_idle---of 1
timer_base_try_to_set_idle---of 3
timer_clear_idle---of 1
timer_debug_hint---of 4
timer_delete100%of 1
timer_delete_sync100%of 1
timer_expire_remote---of 1
timer_fixup_activate---of 4
timer_fixup_assert_init---of 3
timer_fixup_free---of 3
timer_fixup_init---of 3
timer_is_static_object---of 3
timer_lock_remote_bases---of 11
timer_migration_handler---of 5
timer_recalc_next_expiry---of 15
timer_reduce---of 1
timer_shutdown---of 1
timer_shutdown_sync---of 1
timer_unlock_remote_bases---of 1
timer_update_keys---of 4
timers_dead_cpu---of 15
timers_prepare_cpu---of 1
timers_update_nohz---of 1
trace_event_raw_event_hrtimer_class---of 7
trace_event_raw_event_hrtimer_expire_entry---of 7
trace_event_raw_event_hrtimer_setup---of 7
trace_event_raw_event_hrtimer_start---of 7
trace_event_raw_event_itimer_expire---of 8
trace_event_raw_event_itimer_state---of 7
trace_event_raw_event_tick_stop---of 7
trace_event_raw_event_timer_base_idle---of 7
trace_event_raw_event_timer_class---of 7
trace_event_raw_event_timer_expire_entry---of 7
trace_event_raw_event_timer_start---of 7
trace_raw_output_hrtimer_class---of 3
trace_raw_output_hrtimer_expire_entry---of 3
trace_raw_output_hrtimer_setup---of 3
trace_raw_output_hrtimer_start---of 3
trace_raw_output_itimer_expire---of 3
trace_raw_output_itimer_state---of 3
trace_raw_output_tick_stop---of 3
trace_raw_output_timer_base_idle---of 3
trace_raw_output_timer_class---of 3
trace_raw_output_timer_expire_entry---of 3
trace_raw_output_timer_start---of 3
trace_timer_base_idle---of 17
try_to_del_timer_sync---of 1
update_process_times---of 20
-----------
SUMMARY34%of 150

__arm64_sys_nanosleep---of 5
__arm64_sys_nanosleep_time32---of 5
__hrtimer_get_next_event---of 23
__hrtimer_get_remaining---of 4
__hrtimer_run_queues---of 48
__hrtimer_setup38%of 8
clock_was_set---of 22
clock_was_set_delayed---of 1
clock_was_set_work---of 1
debug_deactivate29%of 14
destroy_hrtimer_on_stack---of 1
do_nanosleep---of 22
enqueue_hrtimer32%of 16
hrtimer_active---of 9
hrtimer_cancel50%of 4
hrtimer_debug_hint---of 1
hrtimer_dummy_timeout---of 1
hrtimer_fixup_activate---of 3
hrtimer_fixup_free---of 5
hrtimer_fixup_init---of 5
hrtimer_forward---of 6
hrtimer_get_next_event---of 3
hrtimer_interrupt---of 15
hrtimer_nanosleep---of 4
hrtimer_nanosleep_restart---of 6
hrtimer_next_event_without---of 23
hrtimer_reprogram---of 13
hrtimer_run_queues---of 6
hrtimer_run_softirq---of 13
hrtimer_setup100%of 1
hrtimer_setup_on_stack---of 1
hrtimer_setup_sleeper_on_stack---of 1
hrtimer_sleeper_start_expires---of 1
hrtimer_start_range_ns18%of 39
hrtimer_try_to_cancel34%of 21
hrtimer_update_next_event---of 24
hrtimer_wakeup---of 3
hrtimers_cpu_dying---of 20
hrtimers_cpu_starting---of 1
hrtimers_prepare_cpu---of 1
hrtimers_resume_local---of 11
ktime_add_safe---of 1
ktime_get_boottime---of 1
ktime_get_clocktai---of 1
ktime_get_real---of 1
nanosleep_copyout---of 6
raise_timer_softirq---of 13
retrigger_next_event---of 7
trace_hrtimer_setup29%of 14
-----------
SUMMARY29%of 117

-----------
SUMMARY---of 0

__flow_hash_from_keys---of 12
__get_hash_from_flowi6---of 4
__skb_flow_dissect12%of 349
__skb_get_hash_net---of 6
__skb_get_hash_symmetric_net---of 4
__skb_get_poff---of 15
bpf_flow_dissect---of 6
flow_dissector_bpf_prog_attach_check---of 9
flow_get_u32_dst---of 4
flow_get_u32_src---of 5
flow_hash_from_keys---of 4
flow_hash_from_keys_seed---of 1
make_flow_keys_digest---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
skb_flow_dissect_ct---of 10
skb_flow_dissect_hash---of 3
skb_flow_dissect_meta---of 6
skb_flow_dissect_tunnel_info---of 36
skb_flow_dissector_init---of 9
skb_flow_get_icmp_tci---of 13
skb_flow_get_ports---of 17
skb_get_hash_perturb---of 1
skb_get_poff---of 3
skb_metadata_dst---of 8
-----------
SUMMARY13%of 353

advance_sched---of 49
dump_schedule---of 17
find_entry_to_transmit---of 15
parse_taprio_schedule---of 58
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
setup_first_end_time---of 9
taprio_attach---of 14
taprio_change---of 161
taprio_dequeue---of 45
taprio_dequeue_from_txq---of 25
taprio_destroy---of 24
taprio_dev_notifier10%of 42
taprio_disable_offload---of 12
taprio_dump---of 49
taprio_dump_class---of 3
taprio_dump_class_stats---of 10
taprio_dump_stats---of 1
taprio_dump_xstats---of 13
taprio_enable_offload---of 43
taprio_enqueue---of 12
taprio_enqueue_one---of 85
taprio_find---of 4
taprio_free_sched_cb---of 6
taprio_get_start_time---of 6
taprio_graft---of 23
taprio_init---of 15
taprio_leaf---of 3
taprio_offload_config_changed---of 13
taprio_offload_free---of 6
taprio_offload_get---of 6
taprio_parse_clockid---of 22
taprio_peek---of 3
taprio_reset---of 7
taprio_select_queue---of 4
taprio_set_picos_per_byte---of 7
taprio_skb_exceeds_queue_max_sdu---of 18
taprio_start_sched---of 3
taprio_update_queue_max_sdu---of 9
taprio_walk---of 8
-----------
SUMMARY10%of 42

-----------
SUMMARY---of 0

arm_perf_starting_cpu---of 8
arm_perf_teardown_cpu---of 6
arm_pmu_hp_init---of 3
arm_pmu_irq_is_nmi---of 1
armpmu_add56%of 9
armpmu_alloc---of 6
armpmu_del67%of 3
armpmu_disable60%of 5
armpmu_disable_percpu_pmunmi---of 1
armpmu_dispatch_irq---of 3
armpmu_enable80%of 5
armpmu_enable_percpu_pmuirq---of 1
armpmu_enable_percpu_pmunmi---of 3
armpmu_event_init17%of 53
armpmu_event_set_period75%of 8
armpmu_event_update46%of 11
armpmu_filter67%of 3
armpmu_free---of 1
armpmu_free_irq---of 4
armpmu_free_percpu_pmuirq---of 6
armpmu_free_percpu_pmunmi---of 6
armpmu_free_pmuirq---of 1
armpmu_free_pmunmi---of 1
armpmu_map_event17%of 12
armpmu_read100%of 1
armpmu_register---of 7
armpmu_request_irq---of 20
armpmu_start---of 4
armpmu_stop---of 3
cpu_pm_pmu_notify---of 28
cpus_show---of 1
-----------
SUMMARY36%of 110

netdev_nl_page_pool_event---of 12
netdev_nl_page_pool_get_do---of 7
netdev_nl_page_pool_get_doit---of 5
netdev_nl_page_pool_get_dump---of 12
netdev_nl_page_pool_get_dumpit---of 1
netdev_nl_page_pool_stats_get_doit---of 12
netdev_nl_page_pool_stats_get_dumpit---of 1
nla_put_uint---of 3
page_pool_check_memory_provider---of 9
page_pool_detached---of 1
page_pool_list---of 5
page_pool_netdevice_event16%of 13
page_pool_nl_fill---of 25
page_pool_nl_stats_fill---of 29
page_pool_unlist---of 4
-----------
SUMMARY16%of 13

__irq_alloc_descs---of 29
__irq_get_desc_lock30%of 10
__irq_put_desc_unlock50%of 4
__irq_set_lockdep_class---of 3
actions_show---of 7
alloc_desc---of 10
chip_name_show---of 4
delayed_free_desc---of 1
generic_handle_domain_irq---of 5
generic_handle_domain_irq_safe---of 11
generic_handle_domain_nmi---of 7
generic_handle_irq---of 5
generic_handle_irq_safe---of 11
handle_irq_desc---of 5
hwirq_show---of 3
irq_free_descs---of 7
irq_get_next_irq---of 11
irq_get_nr_irqs---of 1
irq_get_percpu_devid_partition---of 5
irq_insert_desc---of 3
irq_kobj_release---of 1
irq_lock_sparse---of 1
irq_set_nr_irqs---of 1
irq_set_percpu_devid---of 5
irq_set_percpu_devid_partition---of 5
irq_to_desc100%of 1
irq_unlock_sparse---of 1
kstat_incr_irq_this_cpu---of 1
kstat_irqs_cpu---of 4
kstat_irqs_usr---of 19
name_show---of 3
per_cpu_count_show---of 10
rcu_lock_acquire---of 2
rcu_lock_release---of 2
type_show---of 1
wakeup_show---of 1
-----------
SUMMARY40%of 15

__contpte_try_fold42%of 12
__contpte_try_unfold---of 3
__flush_tlb_range_nosync---of 25
__set_ptes---of 25
contpte_clear_full_ptes---of 13
contpte_clear_young_dirty_ptes---of 9
contpte_convert---of 7
contpte_get_and_clear_full_ptes---of 18
contpte_ptep_clear_flush_young---of 7
contpte_ptep_get---of 5
contpte_ptep_get_lockless---of 10
contpte_ptep_set_access_flags---of 8
contpte_ptep_test_and_clear_young---of 6
contpte_set_ptes---of 6
contpte_wrprotect_ptes---of 15
-----------
SUMMARY42%of 12

add_probe_files---of 4
bind_store---of 12
bus_add_device---of 17
bus_add_driver---of 23
bus_attr_show---of 3
bus_attr_store---of 3
bus_create_file---of 8
bus_find_device---of 15
bus_find_device_by_name---of 13
bus_for_each_dev---of 15
bus_for_each_drv---of 15
bus_get_dev_root---of 8
bus_get_kset---of 8
bus_is_registered---of 8
bus_notify---of 8
bus_probe_device---of 14
bus_put---of 8
bus_register---of 11
bus_register_notifier---of 8
bus_release---of 1
bus_remove_device---of 19
bus_remove_driver---of 11
bus_remove_file---of 8
bus_rescan_devices---of 1
bus_rescan_devices_helper---of 8
bus_sort_breadthfirst---of 23
bus_to_subsys25%of 8
bus_uevent_filter---of 1
bus_uevent_store---of 8
bus_unregister---of 11
bus_unregister_notifier---of 8
device_reprobe---of 10
driver_find---of 9
driver_release---of 3
drivers_autoprobe_show---of 8
drivers_autoprobe_store---of 8
drivers_probe_store---of 9
drv_attr_show---of 3
drv_attr_store---of 3
klist_devices_get---of 1
klist_devices_put---of 1
remove_probe_files---of 1
subsys_interface_register---of 17
subsys_interface_unregister---of 17
subsys_register---of 12
subsys_system_register---of 1
subsys_virtual_register---of 3
system_root_device_release---of 1
uevent_store---of 1
unbind_store---of 11
-----------
SUMMARY25%of 8

__register_chrdev---of 14
__register_chrdev_region---of 43
__unregister_chrdev---of 9
alloc_chrdev_region---of 3
base_probe---of 3
cd_forget---of 3
cdev_add---of 4
cdev_alloc---of 3
cdev_default_release---of 6
cdev_del---of 1
cdev_device_add---of 10
cdev_device_del---of 3
cdev_dynamic_release---of 6
cdev_init---of 1
cdev_put67%of 3
cdev_set_parent---of 3
chrdev_open30%of 17
chrdev_show---of 6
exact_lock50%of 4
exact_match100%of 1
register_chrdev_region---of 13
unregister_chrdev_region---of 11
-----------
SUMMARY40%of 25

-----------
SUMMARY---of 0

__arm64_sys_cachestat---of 60
__filemap_add_folio26%of 71
__filemap_fdatawait_range---of 8
__filemap_fdatawrite_range---of 4
__filemap_get_folio11%of 66
__filemap_remove_folio47%of 13
__filemap_set_wb_err---of 17
__folio_lock100%of 1
__folio_lock_killable---of 1
__folio_lock_or_retry---of 11
__generic_file_write_iter---of 8
__probestub_file_check_and_advance_wb_err---of 1
__probestub_filemap_set_wb_err---of 1
__probestub_mm_filemap_add_to_page_cache---of 1
__probestub_mm_filemap_delete_from_page_cache---of 1
__probestub_mm_filemap_fault---of 1
__probestub_mm_filemap_get_pages---of 1
__probestub_mm_filemap_map_pages---of 1
__traceiter_file_check_and_advance_wb_err---of 4
__traceiter_filemap_set_wb_err---of 4
__traceiter_mm_filemap_add_to_page_cache---of 4
__traceiter_mm_filemap_delete_from_page_cache---of 4
__traceiter_mm_filemap_fault---of 4
__traceiter_mm_filemap_get_pages---of 4
__traceiter_mm_filemap_map_pages---of 4
count_memcg_event_mm28%of 22
count_vm_event50%of 4
delete_from_page_cache_batch---of 29
do_read_cache_folio---of 33
do_sync_mmap_readahead12%of 18
file_check_and_advance_wb_err---of 22
file_fdatawait_range---of 1
file_write_and_wait_range---of 7
filemap_add_folio27%of 15
filemap_alloc_folio_noprof12%of 18
filemap_check_errors---of 7
filemap_fault18%of 90
filemap_fault_recheck_pte_none13%of 16
filemap_fdatawait_keep_errors---of 9
filemap_fdatawait_range---of 1
filemap_fdatawait_range_keep_errors---of 3
filemap_fdatawrite---of 4
filemap_fdatawrite_range---of 4
filemap_fdatawrite_range_kick---of 4
filemap_fdatawrite_wbc---of 4
filemap_flush---of 4
filemap_free_folio50%of 10
filemap_get_entry28%of 18
filemap_get_folios100%of 1
filemap_get_folios_contig---of 33
filemap_get_folios_tag21%of 24
filemap_get_pages---of 105
filemap_get_read_batch---of 29
filemap_invalidate_inode---of 8
filemap_invalidate_lock_two---of 5
filemap_invalidate_pages---of 5
filemap_invalidate_unlock_two---of 5
filemap_map_pages23%of 86
filemap_page_mkwrite---of 35
filemap_range_has_page---of 15
filemap_range_has_writeback---of 23
filemap_read---of 60
filemap_read_folio---of 13
filemap_release_folio---of 12
filemap_remove_folio50%of 6
filemap_splice_read---of 25
filemap_unaccount_folio22%of 28
filemap_write_and_wait_range---of 7
find_get_entries19%of 27
find_lock_entries23%of 48
folio_end_private_2---of 10
folio_end_read37%of 11
folio_end_writeback---of 32
folio_large_mapcount---of 4
folio_put---of 6
folio_try_get32%of 16
folio_unlock58%of 7
folio_wait_bit---of 1
folio_wait_bit_common30%of 44
folio_wait_bit_killable---of 1
folio_wait_private_2---of 5
folio_wait_private_2_killable---of 5
folio_wake_bit43%of 7
generic_file_direct_write---of 12
generic_file_mmap---of 4
generic_file_read_iter---of 18
generic_file_readonly_mmap---of 5
generic_file_write_iter---of 12
generic_perform_write58%of 26
inode_to_wb---of 7
kiocb_invalidate_pages---of 5
kiocb_invalidate_post_direct_write---of 5
kiocb_write_and_wait---of 5
mapping_read_folio_gfp---of 1
mapping_seek_hole_data---of 71
maybe_unlock_mmap_for_io38%of 8
migration_entry_wait_on_locked---of 32
next_uptodate_folio28%of 29
page_cache_next_miss---of 5
page_cache_prev_miss---of 14
perf_trace_file_check_and_advance_wb_err---of 6
perf_trace_filemap_set_wb_err---of 6
perf_trace_mm_filemap_fault---of 6
perf_trace_mm_filemap_op_page_cache---of 8
perf_trace_mm_filemap_op_page_cache_range---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
read_cache_folio---of 1
read_cache_page---of 5
read_cache_page_gfp---of 5
release_fault_lock20%of 10
replace_page_cache_folio---of 32
splice_folio_into_pipe---of 18
trace_event_raw_event_file_check_and_advance_wb_err---of 7
trace_event_raw_event_filemap_set_wb_err---of 7
trace_event_raw_event_mm_filemap_fault---of 7
trace_event_raw_event_mm_filemap_op_page_cache---of 9
trace_event_raw_event_mm_filemap_op_page_cache_range---of 7
trace_mm_filemap_delete_from_page_cache24%of 17
trace_raw_output_file_check_and_advance_wb_err---of 3
trace_raw_output_filemap_set_wb_err---of 3
trace_raw_output_mm_filemap_fault---of 3
trace_raw_output_mm_filemap_op_page_cache---of 3
trace_raw_output_mm_filemap_op_page_cache_range---of 3
wake_page_function25%of 12
xas_next---of 10
xas_next_entry40%of 15
xas_reload20%of 20
-----------
SUMMARY26%of 808

__bond_release_one---of 64
__bond_xmit_hash---of 44
_inline_copy_from_user---of 8
block_netpoll_tx---of 3
bond_advance_esn_state---of 17
bond_arp_monitor---of 250
bond_arp_send---of 7
bond_change_active_slave---of 150
bond_change_mtu---of 16
bond_change_rx_flags---of 31
bond_check_dev_link---of 21
bond_close---of 29
bond_compute_features---of 5
bond_confirm_addr---of 18
bond_confirm_addr6---of 1
bond_create---of 4
bond_destructor---of 3
bond_dev_queue_xmit---of 3
bond_do_ioctl---of 36
bond_enslave---of 164
bond_eth_ioctl---of 9
bond_ether_setup---of 1
bond_ethtool_get_drvinfo---of 1
bond_ethtool_get_link_ksettings---of 23
bond_ethtool_get_ts_info---of 49
bond_fix_features---of 4
bond_flow_ip---of 18
bond_get_num_tx_queues---of 1
bond_get_slave_by_id---of 19
bond_get_stats---of 19
bond_handle_frame---of 33
bond_handle_vlan---of 20
bond_hw_addr_copy---of 3
bond_hw_addr_flush---of 3
bond_hwtstamp_get---of 15
bond_hwtstamp_set---of 16
bond_init---of 12
bond_ipsec_add_sa---of 40
bond_ipsec_del_sa---of 42
bond_ipsec_dev---of 12
bond_ipsec_free_sa---of 38
bond_ipsec_offload_ok---of 20
bond_lower_state_changed---of 1
bond_master_upper_dev_link---of 9
bond_mii_monitor---of 133
bond_mode_name---of 3
bond_neigh_init---of 16
bond_neigh_setup---of 3
bond_net_exit_batch---of 4
bond_net_exit_batch_rtnl---of 7
bond_net_init---of 1
bond_net_pre_exit---of 1
bond_netdev_event5%of 62
bond_netdev_notify_work---of 5
bond_netpoll_cleanup---of 8
bond_netpoll_setup---of 13
bond_ns_send---of 7
bond_open---of 45
bond_poll_controller---of 13
bond_queue_slave_event---of 1
bond_rcv_validate---of 66
bond_release---of 1
bond_resend_igmp_join_requests_delayed---of 4
bond_rr_gen_slave_id---of 8
bond_select_active_slave---of 52
bond_select_queue---of 4
bond_send_validate---of 45
bond_set_active_slave---of 3
bond_set_carrier---of 10
bond_set_dev_addr---of 5
bond_set_mac_address---of 23
bond_set_phc_index_flag---of 10
bond_set_rx_mode---of 24
bond_set_slave_arr---of 15
bond_set_slave_inactive_flags---of 10
bond_set_slave_link_state---of 4
bond_setup---of 3
bond_setup_by_slave---of 8
bond_should_notify_peers---of 14
bond_siocdevprivate---of 8
bond_sk_get_lower_dev---of 28
bond_slave_arr_handler---of 5
bond_slave_arr_work_rearm---of 1
bond_slave_link_status---of 3
bond_start_xmit---of 82
bond_time_in_interval---of 3
bond_uninit---of 13
bond_update_slave_arr---of 44
bond_update_speed_duplex---of 6
bond_upper_dev_walk---of 1
bond_validate_arp---of 21
bond_validate_na---of 23
bond_verify_device_path---of 10
bond_vlan_rx_add_vid---of 7
bond_vlan_rx_kill_vid---of 6
bond_work_init_all---of 1
bond_xdp---of 36
bond_xdp_check---of 8
bond_xdp_get_xmit_slave---of 34
bond_xdp_set_features---of 15
bond_xdp_xmit---of 16
bond_xfrm_update_stats---of 17
bond_xmit_get_slave---of 32
bond_xmit_hash---of 4
bond_xmit_roundrobin_slave_get---of 18
eth_hw_addr_random---of 1
local_bh_disable---of 2
net_generic---of 16
netdev_lock_cmp_fn---of 1
netdev_lock_ops---of 5
netdev_unlock_ops---of 5
pskb_may_pull---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
slave_enable_netpoll---of 4
slave_kobj_release---of 3
slave_last_rx---of 18
unblock_netpoll_tx---of 3
-----------
SUMMARY5%of 62

-----------
SUMMARY---of 0

kvm_arm_pvtime_get_attr63%of 8
kvm_arm_pvtime_has_attr100%of 1
kvm_arm_pvtime_set_attr62%of 13
kvm_arm_pvtime_supported100%of 1
kvm_hypercall_pv_features67%of 3
kvm_init_stolen_time50%of 4
kvm_update_stolen_time42%of 17
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
-----------
SUMMARY59%of 51

__arm64_sys_pipe---of 1
__arm64_sys_pipe2---of 1
__do_pipe_flags---of 8
account_pipe_buffers---of 3
alloc_pipe_info---of 25
anon_pipe_buf_release---of 1
anon_pipe_buf_try_steal---of 13
anon_pipe_put_page20%of 15
anon_pipe_read---of 34
anon_pipe_write57%of 44
create_pipe_files---of 10
do_pipe2---of 9
do_pipe_flags---of 3
do_proc_dopipe_max_size_conv---of 4
fifo_open---of 29
fifo_pipe_read---of 4
fifo_pipe_write---of 27
free_pipe_info---of 16
generic_pipe_buf_get---of 6
generic_pipe_buf_release---of 9
generic_pipe_buf_try_steal---of 12
get_pipe_info---of 6
pipe_double_lock---of 6
pipe_fasync---of 7
pipe_fcntl---of 27
pipe_ioctl59%of 17
pipe_is_unprivileged_user---of 3
pipe_lock---of 3
pipe_lock_cmp_fn---of 1
pipe_poll---of 14
pipe_release---of 9
pipe_resize_ring---of 14
pipe_unlock---of 3
pipe_wait_readable---of 10
pipe_wait_writable---of 13
pipefs_dname100%of 1
pipefs_init_fs_context---of 3
proc_dopipe_max_size---of 1
round_pipe_size---of 4
too_many_pipe_buffers_hard---of 1
too_many_pipe_buffers_soft---of 1
wait_for_partner---of 9
-----------
SUMMARY51%of 77

-----------
SUMMARY---of 0

___pte_offset_map36%of 17
__flush_tlb_range---of 27
__pte_offset_map_lock40%of 5
p4d_clear_bad---of 3
pgd_clear_bad---of 7
pgtable_trans_huge_deposit---of 5
pgtable_trans_huge_withdraw---of 5
pmd_clear_bad---of 3
pmdp_clear_flush_young---of 11
pmdp_collapse_flush---of 7
pmdp_huge_clear_flush---of 8
pmdp_invalidate---of 5
pmdp_invalidate_ad---of 7
pte_free_defer---of 1
pte_free_now---of 13
pte_offset_map_ro_nolock---of 3
pte_offset_map_rw_nolock60%of 5
pte_unmap---of 6
ptep_clear_flush62%of 13
pud_clear_bad---of 3
rcu_lock_acquire100%of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
-----------
SUMMARY50%of 42

__fib6_clean_all40%of 15
__fib6_drop_pcpu_from---of 24
__fib6_update_sernum_upto_root36%of 14
call_fib6_entry_notifiers---of 1
call_fib6_entry_notifiers_replace---of 1
call_fib6_multipath_entry_notifiers---of 1
fib6_add---of 248
fib6_add_1---of 74
fib6_clean_all100%of 1
fib6_clean_all_skip_notify---of 1
fib6_clean_node32%of 16
fib6_del---of 81
fib6_dump_done---of 3
fib6_dump_end---of 5
fib6_dump_node---of 7
fib6_dump_table---of 8
fib6_find_prefix---of 25
fib6_flush_trees---of 5
fib6_force_start_gc---of 3
fib6_gc_cleanup---of 1
fib6_gc_timer_cb---of 1
fib6_get_table---of 17
fib6_info_alloc---of 3
fib6_info_destroy_rcu---of 15
fib6_locate---of 11
fib6_locate_1---of 31
fib6_metric_set---of 5
fib6_net_exit---of 7
fib6_net_init---of 10
fib6_new_table---of 5
fib6_nh_drop_pcpu_from---of 1
fib6_node_dump---of 5
fib6_node_lookup---of 4
fib6_node_lookup_1---of 43
fib6_purge_rt---of 37
fib6_repair_tree---of 88
fib6_run_gc---of 30
fib6_tables_dump---of 6
fib6_tables_seq_read---of 15
fib6_update_sernum---of 11
fib6_update_sernum_stub---of 1
fib6_update_sernum_upto_root60%of 5
fib6_walk50%of 6
fib6_walk_continue58%of 28
inet6_dump_fib---of 37
ipv6_route_seq_next---of 19
ipv6_route_seq_next_table---of 15
ipv6_route_seq_setup_walk---of 3
ipv6_route_seq_show---of 21
ipv6_route_seq_start---of 14
ipv6_route_seq_stop---of 15
ipv6_route_yield---of 10
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY49%of 89

attach_dn---of 7
dnotify_flush19%of 16
dnotify_free_mark---of 3
dnotify_handle_event---of 13
dnotify_recalc_inode_mask---of 7
fcntl_dirnotify---of 20
-----------
SUMMARY19%of 16

-----------
SUMMARY---of 0

call_fib_rule_notifiers---of 4
dump_rules---of 19
fib_default_rule_add---of 4
fib_delrule---of 115
fib_newrule---of 54
fib_nl2rule---of 86
fib_nl2rule_port_mask---of 12
fib_nl2rule_rtnl---of 15
fib_nl_delrule---of 1
fib_nl_dumprule---of 38
fib_nl_fill_rule---of 66
fib_nl_newrule---of 1
fib_rule_get---of 6
fib_rule_matchall---of 15
fib_rule_put---of 7
fib_rules_dump---of 5
fib_rules_event7%of 46
fib_rules_lookup---of 59
fib_rules_net_exit---of 3
fib_rules_net_init---of 1
fib_rules_register---of 12
fib_rules_seq_read---of 3
fib_rules_unregister---of 18
list_add_rcu---of 3
lookup_rules_ops---of 17
nla_get_kuid_range---of 1
nla_put_string---of 1
nla_put_uid_range---of 1
notify_rule_change---of 7
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rule_exists---of 27
-----------
SUMMARY7%of 46

__call_nexthop_res_bucket_notifiers---of 37
__remove_nexthop---of 93
__unregister_nexthop_notifier---of 7
call_nexthop_notifiers---of 8
fib6_check_nexthop---of 25
fib_check_nexthop---of 32
l3mdev_fib_table---of 11
nexthop_bucket_set_hw_flags---of 31
nexthop_find_by_id---of 7
nexthop_find_group_resilient---of 16
nexthop_for_each_fib6_nh---of 25
nexthop_free_rcu---of 18
nexthop_net_exit---of 1
nexthop_net_exit_batch_rtnl---of 10
nexthop_net_init---of 3
nexthop_notify---of 22
nexthop_put---of 6
nexthop_res_grp_activity_update---of 33
nexthop_select_path---of 76
nexthop_set_hw_flags---of 18
nh_check_attr_group---of 29
nh_dump_filtered---of 24
nh_fill_node---of 126
nh_fill_res_bucket---of 14
nh_grp_entry_stats_inc---of 5
nh_grp_hw_stats_report_delta---of 1
nh_netdev_event8%of 25
nh_notifier_info_fini---of 10
nh_notifier_info_init---of 39
nh_notifier_mpath_info_init---of 13
nh_res_table_upkeep---of 34
nh_res_table_upkeep_dw---of 1
nh_rt_cache_flush---of 22
rcu_lock_acquire---of 2
rcu_lock_release---of 2
register_nexthop_notifier---of 6
remove_nexthop---of 8
replace_nexthop_grp_res---of 38
replace_nexthop_single_notify---of 33
rtm_del_nexthop---of 20
rtm_dump_nexthop---of 29
rtm_dump_nexthop_bucket---of 46
rtm_dump_nexthop_bucket_nh---of 25
rtm_get_nexthop---of 25
rtm_get_nexthop_bucket---of 38
rtm_new_nexthop---of 425
rtm_to_nh_config_grp_res---of 18
unregister_nexthop_notifier---of 1
-----------
SUMMARY8%of 25

__arm64_sys_flock---of 29
__break_lease---of 88
__locks_delete_block---of 12
__locks_wake_up_blocks---of 16
__probestub_break_lease_block---of 1
__probestub_break_lease_noblock---of 1
__probestub_break_lease_unblock---of 1
__probestub_fcntl_setlk---of 1
__probestub_flock_lock_inode---of 1
__probestub_generic_add_lease---of 1
__probestub_generic_delete_lease---of 1
__probestub_leases_conflict---of 1
__probestub_locks_get_lock_context---of 1
__probestub_locks_remove_posix---of 1
__probestub_posix_lock_inode---of 1
__probestub_time_out_leases---of 1
__traceiter_break_lease_block---of 4
__traceiter_break_lease_noblock---of 4
__traceiter_break_lease_unblock---of 4
__traceiter_fcntl_setlk---of 4
__traceiter_flock_lock_inode---of 4
__traceiter_generic_add_lease---of 4
__traceiter_generic_delete_lease---of 4
__traceiter_leases_conflict---of 4
__traceiter_locks_get_lock_context---of 4
__traceiter_locks_remove_posix---of 4
__traceiter_posix_lock_inode---of 4
__traceiter_time_out_leases---of 4
check_conflicting_open---of 6
fcntl_getlease---of 10
fcntl_getlk---of 26
fcntl_setlease---of 10
fcntl_setlk---of 58
files_lookup_fd_locked---of 6
flock_lock_inode---of 67
flock_locks_conflict---of 4
flock_to_posix_lock---of 14
generic_setlease---of 61
kernel_setlease---of 5
lease_break_callback---of 1
lease_get_mtime---of 8
lease_modify---of 10
lease_register_notifier---of 1
lease_setup---of 3
lease_unregister_notifier---of 1
leases_conflict---of 24
lock_get_status---of 19
locks_alloc_lease---of 3
locks_alloc_lock---of 3
locks_check_ctx_lists---of 5
locks_copy_conflock---of 4
locks_copy_lock---of 9
locks_delete_block---of 1
locks_delete_lock_ctx---of 14
locks_dispose_list---of 18
locks_dump_ctx_list---of 4
locks_free_lease---of 1
locks_free_lock---of 12
locks_free_lock_context67%of 3
locks_get_lock_context---of 24
locks_init_lease---of 1
locks_init_lock---of 1
locks_insert_block---of 12
locks_insert_lock_ctx---of 8
locks_lock_inode_wait---of 18
locks_next---of 1
locks_owner_has_blockers---of 7
locks_release_private---of 12
locks_remove_file7%of 29
locks_remove_posix9%of 24
locks_show---of 16
locks_start---of 1
locks_stop---of 1
locks_translate_pid---of 13
locks_unlink_lock_ctx---of 12
percpu_down_read---of 12
percpu_up_read---of 14
perf_trace_filelock_lease---of 7
perf_trace_filelock_lock---of 7
perf_trace_generic_add_lease---of 6
perf_trace_leases_conflict---of 6
perf_trace_locks_get_lock_context---of 6
posix_lock_file---of 1
posix_lock_inode---of 175
posix_test_lock---of 22
rcu_lock_acquire---of 2
rcu_lock_release---of 2
show_fd_locks---of 20
time_out_leases---of 30
trace_event_raw_event_filelock_lease---of 8
trace_event_raw_event_filelock_lock---of 8
trace_event_raw_event_generic_add_lease---of 7
trace_event_raw_event_leases_conflict---of 7
trace_event_raw_event_locks_get_lock_context---of 7
trace_generic_delete_lease---of 17
trace_raw_output_filelock_lease---of 3
trace_raw_output_filelock_lock---of 3
trace_raw_output_generic_add_lease---of 3
trace_raw_output_leases_conflict---of 3
trace_raw_output_locks_get_lock_context---of 3
vfs_cancel_lock---of 5
vfs_inode_has_locks---of 4
vfs_lock_file---of 5
vfs_setlease---of 10
vfs_test_lock---of 5
-----------
SUMMARY11%of 56

-----------
SUMMARY---of 0

dev_change_carrier---of 9
dev_change_flags---of 9
dev_change_name---of 9
dev_change_net_namespace---of 1
dev_change_proto_down---of 9
dev_change_tx_queue_len---of 9
dev_close---of 9
dev_disable_lro---of 9
dev_eth_ioctl---of 12
dev_open---of 9
dev_set_alias---of 9
dev_set_allmulti---of 9
dev_set_group---of 9
dev_set_mac_address---of 9
dev_set_mac_address_user---of 9
dev_set_mtu---of 9
dev_xdp_propagate---of 9
netdev_state_change56%of 9
-----------
SUMMARY56%of 9

alloc_cpumask_var_node50%of 6
cpumask_any_and_distribute---of 10
cpumask_any_distribute---of 10
cpumask_local_spread---of 3
free_cpumask_var100%of 1
-----------
SUMMARY58%of 7

-----------
SUMMARY---of 0

arch_jump_label_transform_apply100%of 1
arch_jump_label_transform_queue100%of 3
-----------
SUMMARY100%of 4

___perf_sw_event---of 32
__arm64_sys_perf_event_open---of 126
__ctx_time_update20%of 10
__detach_global_ctx_data---of 31
__free_event---of 32
__free_perf_ctx_data_rcu---of 1
__perf_cgroup_move---of 3
__perf_event_account_interrupt---of 8
__perf_event_disable---of 24
__perf_event_enable32%of 38
__perf_event_exit_context---of 4
__perf_event_header__init_id---of 14
__perf_event_output_stop---of 11
__perf_event_overflow---of 56
__perf_event_period---of 10
__perf_event_read22%of 37
__perf_event_read_cpu17%of 18
__perf_event_read_value50%of 4
__perf_event_stop---of 5
__perf_event_task_sched_in---of 82
__perf_event_task_sched_out---of 35
__perf_install_in_context44%of 32
__perf_pmu_install_event---of 8
__perf_pmu_output_stop---of 38
__perf_pmu_remove---of 42
__perf_read_group_add---of 33
__perf_remove_from_context---of 57
__perf_sw_event---of 6
__perf_tp_event_target_task---of 14
__pmu_ctx_sched_in100%of 3
__pmu_ctx_sched_out45%of 18
__static_call_return0---of 1
__update_context_time50%of 6
_free_event---of 53
_inline_copy_from_user---of 8
_perf_event_disable---of 3
_perf_event_enable---of 6
_perf_event_refresh---of 10
_perf_event_reset---of 1
account_event35%of 49
add_event_to_ctx27%of 49
alloc_perf_context38%of 8
attach_task_ctx_data---of 16
class_percpu_read_constructor---of 12
class_percpu_read_destructor---of 15
context_equiv---of 18
cpu_clock_event_add---of 5
cpu_clock_event_del---of 1
cpu_clock_event_init---of 7
cpu_clock_event_read---of 3
cpu_clock_event_start---of 4
cpu_clock_event_stop---of 5
cpumask_show---of 4
ctx_event_to_rotate---of 54
ctx_resched31%of 43
ctx_sched_in49%of 31
ctx_sched_out40%of 23
detach_task_ctx_data---of 30
event_function36%of 28
event_function_call14%of 30
event_sched_in34%of 42
event_sched_out42%of 31
exclusive_event_init17%of 12
exclusive_event_installable28%of 11
find_get_context18%of 23
find_get_pmu_context24%of 21
find_lively_task_by_vpid---of 23
free_cpc_rcu---of 1
free_ctx---of 1
free_epc_rcu---of 1
free_event---of 5
free_event_rcu---of 3
get_ctx---of 6
get_pmu_ctx---of 6
get_uid---of 6
group_sched_out32%of 19
inherit_event---of 24
inherit_task_group---of 31
ktime_get_boottime_ns---of 1
ktime_get_clocktai_ns---of 1
ktime_get_real_ns---of 1
list_del_event---of 31
local_clock---of 1
map_range---of 6
merge_sched_in20%of 85
nr_addr_filters_show---of 1
perf_addr_filters_splice---of 13
perf_adjust_freq_unthr_context---of 14
perf_adjust_freq_unthr_events---of 22
perf_adjust_period---of 23
perf_allow_kernel---of 5
perf_bp_event---of 11
perf_callchain---of 5
perf_cgroup_attach---of 7
perf_cgroup_connect---of 17
perf_cgroup_css_alloc---of 4
perf_cgroup_css_free---of 1
perf_cgroup_css_online---of 14
perf_cgroup_from_task---of 11
perf_cgroup_set_timestamp34%of 6
perf_cgroup_switch---of 31
perf_check_permission---of 23
perf_compat_ioctl---of 8
perf_copy_attr---of 48
perf_cpu_task_ctx37%of 11
perf_cpu_time_max_percent_handler---of 5
perf_ctx_disable---of 9
perf_ctx_enable---of 9
perf_ctx_sched_task_cb---of 7
perf_detach_cgroup---of 19
perf_duration_warn---of 3
perf_event__header_size---of 1
perf_event__id_header_size---of 1
perf_event__output_id_sample---of 14
perf_event_account_interrupt---of 1
perf_event_addr_filters_apply---of 28
perf_event_addr_filters_sync---of 4
perf_event_alloc17%of 154
perf_event_attrs---of 1
perf_event_aux_event---of 5
perf_event_aux_pause---of 20
perf_event_bpf_event---of 8
perf_event_bpf_output---of 6
perf_event_cgroup_output---of 9
perf_event_comm---of 7
perf_event_comm_output---of 13
perf_event_create_kernel_counter25%of 20
perf_event_ctx_lock_nested24%of 25
perf_event_delayed_put---of 3
perf_event_disable67%of 3
perf_event_disable_inatomic---of 1
perf_event_disable_local---of 31
perf_event_enable50%of 6
perf_event_exec---of 83
perf_event_exit_cpu---of 37
perf_event_exit_event---of 3
perf_event_exit_task---of 28
perf_event_fork---of 48
perf_event_free_bpf_prog---of 4
perf_event_free_task---of 12
perf_event_get---of 4
perf_event_groups_insert40%of 20
perf_event_header__init_id---of 3
perf_event_idx_default---of 1
perf_event_init_cpu---of 54
perf_event_init_task---of 16
perf_event_init_userpage---of 18
perf_event_itrace_started---of 1
perf_event_ksymbol---of 16
perf_event_ksymbol_output---of 9
perf_event_max_sample_rate_handler---of 6
perf_event_mmap4%of 64
perf_event_mmap_output---of 32
perf_event_modify_breakpoint---of 9
perf_event_mux_interval_ms_show---of 1
perf_event_mux_interval_ms_store---of 7
perf_event_namespaces---of 16
perf_event_namespaces_output---of 9
perf_event_nop_int---of 1
perf_event_output---of 18
perf_event_output_backward---of 18
perf_event_output_forward---of 18
perf_event_overflow---of 1
perf_event_pause---of 7
perf_event_period---of 7
perf_event_print_debug---of 1
perf_event_read15%of 20
perf_event_read_local---of 31
perf_event_read_value100%of 1
perf_event_refresh---of 1
perf_event_release_kernel---of 27
perf_event_set_bpf_prog---of 22
perf_event_set_clock---of 9
perf_event_set_output---of 24
perf_event_switch_output---of 11
perf_event_sync_stat---of 16
perf_event_sysfs_show---of 3
perf_event_task_disable---of 13
perf_event_task_enable---of 19
perf_event_task_output---of 17
perf_event_task_tick---of 28
perf_event_text_poke---of 3
perf_event_text_poke_output---of 17
perf_event_update_sibling_time24%of 21
perf_event_update_time---of 6
perf_event_update_userpage17%of 31
perf_event_validate_size---of 19
perf_event_wakeup28%of 22
perf_exclude_event---of 9
perf_fasync---of 1
perf_get_aux_event---of 19
perf_get_event---of 3
perf_get_page_size---of 37
perf_group_detach---of 63
perf_install_in_context38%of 32
perf_instruction_pointer---of 4
perf_ioctl---of 154
perf_iterate_sb---of 59
perf_lock_task_context28%of 48
perf_log_lost_samples---of 5
perf_log_throttle---of 7
perf_misc_flags---of 4
perf_mmap---of 65
perf_mmap_close---of 71
perf_mmap_open---of 10
perf_mmap_pfn_mkwrite---of 1
perf_mux_hrtimer_handler---of 50
perf_mux_hrtimer_restart_ipi---of 3
perf_output_read---of 80
perf_output_sample---of 111
perf_pending_disable---of 9
perf_pending_irq---of 7
perf_pending_task---of 11
perf_pin_task_context---of 3
perf_pmu_cancel_txn50%of 4
perf_pmu_commit_txn50%of 4
perf_pmu_disable---of 3
perf_pmu_enable---of 3
perf_pmu_free---of 13
perf_pmu_migrate_context---of 13
perf_pmu_nop_int---of 1
perf_pmu_nop_txn---of 1
perf_pmu_nop_void---of 1
perf_pmu_register---of 35
perf_pmu_resched---of 10
perf_pmu_sched_task---of 21
perf_pmu_start_txn50%of 4
perf_pmu_unregister---of 3
perf_poll---of 12
perf_prepare_header---of 6
perf_prepare_sample---of 118
perf_read---of 37
perf_reboot---of 4
perf_register_guest_info_callbacks---of 4
perf_release---of 1
perf_remove_from_context---of 6
perf_remove_from_owner---of 24
perf_report_aux_output_id---of 5
perf_sample_event_took---of 5
perf_sched_cb_dec---of 7
perf_sched_cb_inc---of 7
perf_sched_delayed---of 5
perf_swevent_add---of 15
perf_swevent_del---of 3
perf_swevent_event---of 25
perf_swevent_get_recursion_context---of 3
perf_swevent_hrtimer---of 13
perf_swevent_init---of 9
perf_swevent_put_recursion_context---of 1
perf_swevent_read---of 1
perf_swevent_set_period---of 6
perf_swevent_start---of 1
perf_swevent_stop---of 1
perf_tp_event---of 99
perf_tp_event_init---of 5
perf_trace_run_bpf_submit---of 4
perf_try_init_event21%of 29
perf_unpin_context100%of 1
perf_unregister_guest_info_callbacks---of 3
pmu_dev_alloc---of 7
pmu_dev_is_visible---of 6
pmu_dev_release---of 1
put_ctx30%of 10
put_event---of 6
put_pmu_ctx---of 10
put_task_struct---of 6
rb_free_rcu---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
remote_function40%of 5
ring_buffer_attach---of 24
ring_buffer_get---of 23
ring_buffer_put---of 7
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
sw_perf_event_destroy---of 6
swevent_hlist_get---of 16
swevent_hlist_put_cpu---of 9
task_clock_event_add---of 5
task_clock_event_del---of 1
task_clock_event_init---of 7
task_clock_event_read---of 3
task_clock_event_start---of 4
task_clock_event_stop---of 5
task_ctx_sched_out---of 4
tp_perf_event_destroy---of 1
type_show---of 1
unclone_ctx50%of 6
visit_groups_merge25%of 99
-----------
SUMMARY28%of 1325

valid_mmap_phys_addr_range---of 1
valid_phys_addr_range---of 3
vm_get_page_prot50%of 6
-----------
SUMMARY50%of 6

-----------
SUMMARY---of 0

kvm_vgic_v4_set_forwarding---of 12
kvm_vgic_v4_unset_forwarding---of 10
rcu_lock_acquire---of 2
rcu_lock_release---of 2
vgic_v4_commit---of 3
vgic_v4_configure_vsgis---of 21
vgic_v4_doorbell_handler---of 6
vgic_v4_get_vlpi_state---of 35
vgic_v4_init15%of 20
vgic_v4_load25%of 8
vgic_v4_put25%of 8
vgic_v4_request_vpe_irq---of 1
vgic_v4_teardown---of 10
-----------
SUMMARY20%of 36

inject_abt3262%of 13
kvm_inject_dabt100%of 3
kvm_inject_pabt100%of 3
kvm_inject_size_fault---of 6
kvm_inject_undefined63%of 8
kvm_inject_vabt100%of 1
kvm_set_sei_esr100%of 1
pend_sync_exception17%of 43
-----------
SUMMARY39%of 72

__gic_v3_get_lr---of 17
__vgic_v3_activate_traps63%of 8
__vgic_v3_clear_active_lr---of 19
__vgic_v3_deactivate_traps78%of 9
__vgic_v3_get_gic_config---of 3
__vgic_v3_highest_priority_lr---of 25
__vgic_v3_init_lrs---of 19
__vgic_v3_perform_cpuif_access---of 132
__vgic_v3_read_apxr0---of 5
__vgic_v3_read_apxr1---of 5
__vgic_v3_read_apxr2---of 5
__vgic_v3_read_apxr3---of 5
__vgic_v3_read_bpr0---of 3
__vgic_v3_read_bpr1---of 5
__vgic_v3_read_ctlr---of 3
__vgic_v3_read_hppir---of 5
__vgic_v3_read_iar---of 47
__vgic_v3_read_igrpen0---of 3
__vgic_v3_read_igrpen1---of 3
__vgic_v3_read_pmr---of 3
__vgic_v3_read_rpr---of 11
__vgic_v3_restore_state19%of 22
__vgic_v3_restore_vmcr_aprs56%of 9
__vgic_v3_save_state18%of 41
__vgic_v3_save_vmcr_aprs56%of 9
__vgic_v3_write_apxr0---of 5
__vgic_v3_write_apxr1---of 5
__vgic_v3_write_apxr2---of 5
__vgic_v3_write_apxr3---of 5
__vgic_v3_write_bpr0---of 3
__vgic_v3_write_bpr1---of 5
__vgic_v3_write_ctlr---of 3
__vgic_v3_write_dir---of 25
__vgic_v3_write_eoir---of 47
__vgic_v3_write_igrpen0---of 3
__vgic_v3_write_igrpen1---of 3
__vgic_v3_write_pmr---of 3
-----------
SUMMARY34%of 98

-----------
SUMMARY---of 0

__kvm_vgic_vcpu_destroy67%of 15
kvm_vgic_cpu_down---of 1
kvm_vgic_cpu_up---of 1
kvm_vgic_create57%of 25
kvm_vgic_destroy75%of 16
kvm_vgic_early_init100%of 1
kvm_vgic_hyp_init---of 17
kvm_vgic_init_cpu_hardware---of 9
kvm_vgic_map_resources56%of 9
kvm_vgic_vcpu_destroy100%of 1
kvm_vgic_vcpu_init60%of 5
kvm_vgic_vcpu_nv_init---of 3
vgic_allocate_private_irqs_locked45%of 18
vgic_init60%of 27
vgic_lazy_init75%of 4
vgic_maintenance_handler---of 4
-----------
SUMMARY61%of 121

-----------
SUMMARY---of 0

errseq_check---of 3
errseq_check_and_advance---of 5
errseq_sample100%of 1
errseq_set---of 7
-----------
SUMMARY100%of 1

list_add_tail_rcu---of 3
macvlan_broadcast---of 16
macvlan_change_mtu---of 3
macvlan_change_rx_flags---of 6
macvlan_changelink---of 35
macvlan_changelink_sources---of 36
macvlan_common_newlink---of 62
macvlan_common_setup---of 1
macvlan_compute_filter---of 8
macvlan_count_rx---of 13
macvlan_dellink---of 15
macvlan_dev_free---of 6
macvlan_dev_get_iflink---of 1
macvlan_dev_get_stats64---of 5
macvlan_dev_netpoll_cleanup---of 3
macvlan_dev_netpoll_setup---of 4
macvlan_dev_poll_controller---of 1
macvlan_device_event6%of 37
macvlan_ethtool_get_drvinfo---of 1
macvlan_ethtool_get_link_ksettings---of 1
macvlan_ethtool_get_ts_info---of 1
macvlan_fdb_add---of 7
macvlan_fdb_del---of 6
macvlan_fill_info---of 19
macvlan_fix_features---of 1
macvlan_forward_source---of 11
macvlan_get_link_net---of 1
macvlan_get_size---of 1
macvlan_handle_frame---of 46
macvlan_hard_header---of 6
macvlan_hash_add_source---of 10
macvlan_hwtstamp_get---of 1
macvlan_hwtstamp_set---of 3
macvlan_init---of 11
macvlan_link_register---of 1
macvlan_newlink---of 1
macvlan_open---of 27
macvlan_port_destroy---of 19
macvlan_process_broadcast---of 25
macvlan_set_mac_address---of 12
macvlan_set_mac_lists---of 3
macvlan_setup---of 1
macvlan_start_xmit---of 19
macvlan_stop---of 13
macvlan_sync_address---of 18
macvlan_uninit---of 10
macvlan_validate---of 34
macvlan_vlan_rx_add_vid---of 1
macvlan_vlan_rx_kill_vid---of 1
netdev_lock_cmp_fn---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
update_port_bc_cutoff---of 4
-----------
SUMMARY6%of 37

__generic_file_fsync---of 5
_inline_copy_from_user---of 8
alloc_anon_inode---of 3
always_delete_dentry100%of 1
dcache_dir_close---of 1
dcache_dir_lseek---of 14
dcache_dir_open---of 1
dcache_readdir---of 16
direct_write_fallback---of 4
empty_dir_listxattr---of 1
empty_dir_llseek---of 1
empty_dir_lookup---of 1
empty_dir_readdir---of 7
empty_dir_setattr---of 1
find_positive_dentry---of 15
generic_check_addressable---of 4
generic_ci_d_compare---of 8
generic_ci_d_hash---of 6
generic_ci_match---of 15
generic_encode_ino32_fh---of 5
generic_fh_to_dentry---of 4
generic_fh_to_parent---of 5
generic_file_fsync---of 3
generic_read_dir---of 1
generic_set_sb_d_ops---of 4
init_pseudo---of 3
inode_maybe_inc_iversion50%of 10
inode_query_iversion---of 7
is_empty_dir_inode---of 3
kfree_link---of 1
make_empty_dir_inode---of 1
memory_read_from_buffer---of 4
noop_direct_IO---of 1
noop_fsync---of 1
offset_dir_llseek---of 6
offset_dir_lookup---of 12
offset_readdir---of 15
path_from_stashed35%of 35
pseudo_fs_fill_super---of 4
pseudo_fs_free---of 1
pseudo_fs_get_tree---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
scan_positives---of 18
simple_attr_open---of 3
simple_attr_read---of 10
simple_attr_release---of 1
simple_attr_write---of 1
simple_attr_write_signed---of 1
simple_attr_write_xsigned---of 10
simple_empty---of 7
simple_fill_super---of 10
simple_get_link---of 1
simple_getattr---of 1
simple_inode_init_ts100%of 1
simple_link---of 3
simple_lookup84%of 6
simple_nosetlease---of 1
simple_offset_add50%of 4
simple_offset_destroy---of 1
simple_offset_init---of 1
simple_offset_remove---of 3
simple_offset_rename---of 8
simple_offset_rename_exchange---of 13
simple_open---of 3
simple_pin_fs40%of 5
simple_read_folio---of 7
simple_read_from_buffer---of 13
simple_recursive_removal47%of 32
simple_release_fs67%of 3
simple_rename---of 12
simple_rename_exchange---of 7
simple_rename_timestamp---of 5
simple_rmdir---of 3
simple_setattr---of 4
simple_statfs---of 1
simple_transaction_get---of 6
simple_transaction_read---of 3
simple_transaction_release---of 1
simple_transaction_set---of 3
simple_unlink---of 1
simple_write_begin---of 12
simple_write_end---of 17
simple_write_to_buffer---of 8
stashed_dentry_get34%of 18
stashed_dentry_prune40%of 5
zero_user_segments---of 48
-----------
SUMMARY46%of 124

-----------
SUMMARY---of 0

__kvm_make_request---of 3
__probestub_vgic_update_irq_pending---of 1
__traceiter_vgic_update_irq_pending---of 4
kvm_vgic_flush_hwstate39%of 77
kvm_vgic_get_map---of 6
kvm_vgic_inject_irq46%of 31
kvm_vgic_load45%of 9
kvm_vgic_map_is_active58%of 7
kvm_vgic_map_phys_irq38%of 8
kvm_vgic_put45%of 9
kvm_vgic_reset_mapped_irq50%of 6
kvm_vgic_set_owner55%of 11
kvm_vgic_sync_hwstate28%of 62
kvm_vgic_unmap_phys_irq---of 7
kvm_vgic_vcpu_pending_irq22%of 14
perf_trace_vgic_update_irq_pending---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
trace_event_raw_event_vgic_update_irq_pending---of 7
trace_raw_output_vgic_update_irq_pending---of 3
vgic_flush_pending_lpis29%of 7
vgic_get_irq38%of 24
vgic_get_phys_line_level---of 7
vgic_get_vcpu_irq75%of 4
vgic_irq_cmp---of 12
vgic_irq_handle_resampling9%of 24
vgic_irq_set_phys_active50%of 4
vgic_irq_set_phys_pending---of 3
vgic_kick_vcpus50%of 8
vgic_put_irq63%of 8
vgic_queue_irq_unlock34%of 48
-----------
SUMMARY37%of 365

__arm64_compat_sys_get_robust_list---of 31
__arm64_compat_sys_set_robust_list---of 3
__arm64_sys_futex18%of 17
__arm64_sys_futex_requeue---of 4
__arm64_sys_futex_time32---of 16
__arm64_sys_futex_wait---of 6
__arm64_sys_futex_waitv---of 11
__arm64_sys_futex_wake---of 4
__arm64_sys_get_robust_list---of 31
__arm64_sys_set_robust_list---of 3
do_futex16%of 19
futex2_setup_timeout---of 8
futex_parse_waitv---of 16
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY17%of 36

address_val---of 8
bdev_name---of 13
bitmap_list_string---of 17
bitmap_string---of 13
bstr_printf---of 44
clock---of 8
date_str---of 5
default_pointer4%of 53
dentry_name---of 35
device_node_string---of 66
err_ptr---of 8
escaped_string---of 16
file_dentry_name---of 8
fill_ptr_key---of 1
flags_string---of 54
format_decode50%of 28
fourcc_string---of 38
fwnode_full_name_string---of 9
fwnode_string---of 27
hex_range---of 4
hex_string---of 21
ip4_addr_string---of 7
ip4_addr_string_sa---of 17
ip4_string---of 32
ip6_addr_string---of 10
ip6_addr_string_sa---of 27
ip6_compressed_string---of 20
ip6_string---of 15
ip_addr_string---of 41
mac_address_string---of 26
move_right40%of 5
netdev_bits---of 26
num_to_str---of 21
number58%of 50
pointer4%of 85
ptr_to_hashval---of 3
put_dec75%of 4
put_dec_full8100%of 1
put_dec_trunc880%of 5
range_string---of 13
rcu_lock_acquire---of 2
rcu_lock_release---of 2
resource_or_range---of 4
resource_string---of 108
restricted_pointer---of 34
rtc_str---of 20
scnprintf---of 4
simple_strntoll---of 3
simple_strntoul---of 1
simple_strntoull---of 5
simple_strtol---of 3
simple_strtoll---of 1
simple_strtoul---of 1
simple_strtoull---of 1
snprintf100%of 1
special_hex_number---of 1
sprintf100%of 1
sscanf---of 1
string39%of 13
string_nocheck---of 7
symbol_string---of 15
time64_str---of 1
time_and_date---of 21
time_str---of 5
uuid_string---of 23
vbin_printf---of 57
vscnprintf50%of 4
vsnprintf37%of 52
vsprintf---of 1
vsscanf---of 128
widen_string43%of 7
-----------
SUMMARY29%of 309

return_address67%of 3
save_return_addr100%of 3
-----------
SUMMARY84%of 6

-----------
SUMMARY---of 0

lockref_get100%of 1
lockref_get_not_dead67%of 3
lockref_get_not_zero---of 3
lockref_mark_dead67%of 3
lockref_put_or_lock---of 3
lockref_put_return100%of 1
-----------
SUMMARY75%of 8

rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
sel_netif_flush---of 10
sel_netif_netdev_notifier_handler10%of 20
sel_netif_sid---of 17
sel_netif_sid_slow---of 20
-----------
SUMMARY10%of 20

-----------
SUMMARY---of 0

__kobject_del---of 9
dynamic_kobj_release---of 3
kobj_attr_show---of 3
kobj_attr_store---of 3
kobj_child_ns_ops---of 5
kobj_kset_leave---of 4
kobj_ns_current_may_mount---of 4
kobj_ns_drop---of 5
kobj_ns_grab_current---of 4
kobj_ns_ops40%of 5
kobj_ns_type_register---of 4
kobj_ns_type_registered---of 3
kobject_add---of 5
kobject_add_internal---of 59
kobject_create_and_add---of 6
kobject_del---of 3
kobject_get---of 9
kobject_get_ownership---of 3
kobject_get_path42%of 12
kobject_get_unless_zero38%of 8
kobject_init---of 5
kobject_init_and_add---of 3
kobject_move---of 34
kobject_namespace---of 7
kobject_put17%of 24
kobject_rename---of 21
kobject_set_name---of 1
kobject_set_name_vargs---of 6
kset_create_and_add---of 5
kset_find_obj---of 7
kset_get_ownership---of 4
kset_init---of 1
kset_register---of 5
kset_release---of 3
kset_unregister---of 3
-----------
SUMMARY29%of 49

__ndisc_fill_addr_option---of 3
__neigh_lookup---of 3
accept_untracked_na---of 10
dst_output---of 5
fib6_add_gc_list---of 10
fib6_info_release---of 8
in6_dev_get28%of 22
ndisc_alloc_skb---of 19
ndisc_allow_add---of 12
ndisc_cleanup---of 1
ndisc_constructor---of 29
ndisc_error_report---of 9
ndisc_fill_redirect_addr_option---of 4
ndisc_fill_redirect_hdr_option---of 1
ndisc_hash---of 1
ndisc_ifinfo_sysctl_change---of 23
ndisc_is_multicast---of 1
ndisc_key_eq---of 1
ndisc_late_cleanup---of 1
ndisc_mc_map---of 11
ndisc_net_exit---of 3
ndisc_net_init---of 7
ndisc_netdev_event22%of 28
ndisc_ns_create---of 13
ndisc_parse_options---of 30
ndisc_rcv---of 28
ndisc_recv_na---of 48
ndisc_recv_ns---of 68
ndisc_recv_rs---of 29
ndisc_redirect_opt_addr_space---of 4
ndisc_redirect_rcv---of 18
ndisc_router_discovery---of 153
ndisc_send_na---of 19
ndisc_send_ns---of 6
ndisc_send_redirect---of 27
ndisc_send_rs---of 19
ndisc_send_skb---of 100
ndisc_send_unsol_na---of 12
ndisc_solicit---of 19
ndisc_update---of 4
neigh_release---of 6
neigh_var_set---of 3
pndisc_constructor---of 10
pndisc_destructor---of 10
pndisc_is_router---of 3
pndisc_redo---of 1
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
skb_dst---of 5
-----------
SUMMARY30%of 54

-----------
SUMMARY---of 0

__nbcon_atomic_flush_pending---of 20
__nbcon_atomic_flush_pending_con---of 8
__nbcon_context_update_unsafe---of 15
nbcon_alloc---of 7
nbcon_atomic_flush_pending---of 1
nbcon_atomic_flush_unsafe---of 1
nbcon_can_proceed---of 5
nbcon_context_release---of 6
nbcon_context_try_acquire---of 47
nbcon_context_try_acquire_requested---of 11
nbcon_cpu_emergency_enter---of 1
nbcon_cpu_emergency_exit---of 5
nbcon_device_release---of 19
nbcon_device_try_acquire---of 3
nbcon_emit_next_record---of 32
nbcon_enter_unsafe---of 3
nbcon_exit_unsafe---of 3
nbcon_free---of 6
nbcon_get_cpu_emergency_nesting67%of 3
nbcon_get_default_prio67%of 3
nbcon_irq_work---of 1
nbcon_kthread_create---of 4
nbcon_kthread_func---of 32
nbcon_kthread_stop---of 3
nbcon_kthreads_wake60%of 10
nbcon_legacy_emit_next_record---of 14
nbcon_reacquire_nobuf---of 4
nbcon_seq_force---of 1
nbcon_seq_read---of 1
printk_get_console_flush_type---of 25
-----------
SUMMARY63%of 16

-----------
SUMMARY---of 0

NF_HOOK---of 20
__neigh_lookup---of 3
_inline_copy_to_user---of 7
arp_accept---of 4
arp_constructor---of 44
arp_create---of 13
arp_error_report---of 9
arp_filter---of 4
arp_fwd_proxy---of 14
arp_fwd_pvlan---of 4
arp_hash---of 1
arp_ifdown---of 1
arp_ignore---of 6
arp_invalidate---of 14
arp_ioctl---of 30
arp_is_garp---of 6
arp_is_multicast---of 1
arp_key_eq---of 1
arp_mc_map---of 8
arp_net_exit---of 1
arp_net_init---of 1
arp_netdev_event32%of 16
arp_process---of 81
arp_rcv40%of 15
arp_req_delete---of 5
arp_req_dev---of 9
arp_req_get---of 20
arp_req_set---of 24
arp_req_set_proxy---of 15
arp_send---of 4
arp_send_dst---of 9
arp_seq_show---of 14
arp_seq_start---of 1
arp_solicit---of 52
arp_xmit---of 16
arp_xmit_finish---of 1
neigh_release---of 6
parp_redo---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
skb_metadata_dst---of 8
skb_rtable---of 5
-----------
SUMMARY36%of 31

-----------
SUMMARY---of 0

chroot_fs_refs---of 22
copy_fs_struct---of 3
current_umask100%of 1
exit_fs---of 4
free_fs_struct---of 1
set_fs_pwd---of 6
set_fs_root---of 6
unshare_fs_struct---of 4
-----------
SUMMARY100%of 1

-----------
SUMMARY---of 0

__probestub_sys_enter---of 1
__probestub_sys_exit---of 1
__traceiter_sys_enter---of 4
__traceiter_sys_exit---of 4
arch_ptrace---of 3
clear_tsk_thread_flag---of 3
compat_arch_ptrace---of 80
compat_gpr_get---of 10
compat_gpr_set---of 25
compat_tls_get---of 4
compat_tls_set---of 3
compat_vfp_get---of 8
compat_vfp_set---of 5
flush_ptrace_hw_breakpoint---of 65
fpmr_get---of 7
fpmr_set---of 4
fpr_active---of 3
fpr_get---of 6
fpr_set---of 4
gcs_get---of 6
gcs_set---of 5
gpr_get---of 3
gpr_set---of 8
hw_break_get---of 34
hw_break_set---of 19
pac_address_keys_get---of 5
pac_address_keys_set---of 5
pac_enabled_keys_get---of 4
pac_enabled_keys_set---of 4
pac_generic_keys_get---of 4
pac_generic_keys_set---of 4
pac_mask_get---of 5
perf_trace_sys_enter---of 9
perf_trace_sys_exit---of 9
poe_get---of 4
poe_set---of 4
ptrace_disable---of 1
ptrace_hbp_get_initialised_bp---of 9
ptrace_hbp_set_ctrl---of 9
ptrace_hbptriggered---of 34
ptrace_hw_copy_thread---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
regs_get_kernel_stack_nth---of 5
regs_query_register_offset---of 4
set_tsk_thread_flag---of 3
sve_get---of 17
sve_set---of 19
syscall_trace_enter---of 26
syscall_trace_exit---of 19
system_call_get---of 4
system_call_set---of 3
tagged_addr_ctrl_get---of 4
tagged_addr_ctrl_set---of 4
task_user_regset_view---of 3
tls_get---of 7
tls_set---of 3
trace_event_raw_event_sys_enter---of 10
trace_event_raw_event_sys_exit---of 10
trace_raw_output_sys_enter---of 3
trace_raw_output_sys_exit---of 3
user_regset_copyin---of 10
valid_user_regs34%of 6
-----------
SUMMARY34%of 6

kvm_arch_vcpu_ctxflush_fp40%of 5
kvm_arch_vcpu_ctxsync_fp17%of 24
kvm_arch_vcpu_load_fp34%of 6
kvm_arch_vcpu_put_fp50%of 12
kvm_arch_vcpu_run_map_fp50%of 4
-----------
SUMMARY32%of 51

INET_ECN_decapsulate---of 45
__in6_dev_get---of 7
__skb_pull---of 5
__vxlan_dev_create---of 28
__vxlan_fdb_delete---of 31
__vxlan_fdb_free---of 10
__vxlan_sock_add---of 47
__vxlan_sock_release_prep---of 8
dev_dstats_rx_add---of 5
dev_dstats_rx_dropped---of 3
dev_dstats_tx_dropped---of 3
encap_bypass_if_local---of 25
eth_vni_hash---of 1
fdb_head_index---of 3
ip4_dst_hoplimit---of 17
ip_tunnel_ecn_encap---of 17
jhash---of 17
neigh_reduce---of 69
neigh_release---of 6
net_generic25%of 16
netdev_lock_cmp_fn---of 1
nexthop_has_v4---of 8
nexthop_is_fdb---of 13
pskb_inet_may_pull_reason---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
route_shortcircuit---of 40
skb_postpull_rcsum---of 5
skb_tunnel_info_unclone---of 17
vxlan_build_skb---of 29
vxlan_change_mtu---of 4
vxlan_changelink---of 37
vxlan_cleanup---of 12
vxlan_config_apply---of 12
vxlan_config_validate---of 43
vxlan_dellink---of 5
vxlan_dev_create---of 8
vxlan_dst_free---of 1
vxlan_ecn_decapsulate---of 39
vxlan_encap_bypass---of 31
vxlan_err_lookup---of 29
vxlan_exit_batch_rtnl---of 14
vxlan_exit_net---of 5
vxlan_fdb_add---of 12
vxlan_fdb_append---of 18
vxlan_fdb_clear_offload---of 12
vxlan_fdb_create---of 11
vxlan_fdb_delete---of 6
vxlan_fdb_delete_bulk---of 25
vxlan_fdb_destroy---of 12
vxlan_fdb_dump---of 33
vxlan_fdb_find_uc---of 24
vxlan_fdb_free---of 1
vxlan_fdb_get---of 26
vxlan_fdb_info---of 68
vxlan_fdb_insert---of 5
vxlan_fdb_miss---of 1
vxlan_fdb_nh_update---of 54
vxlan_fdb_notify---of 10
vxlan_fdb_offloaded_set---of 25
vxlan_fdb_parse---of 35
vxlan_fdb_replay---of 12
vxlan_fdb_switchdev_call_notifiers---of 1
vxlan_fdb_update---of 85
vxlan_fill_info---of 57
vxlan_fill_metadata_dst---of 42
vxlan_find_mac---of 12
vxlan_flush---of 54
vxlan_get_drvinfo---of 1
vxlan_get_link_ksettings---of 3
vxlan_get_link_net---of 1
vxlan_get_size---of 1
vxlan_gpe_gro_complete---of 5
vxlan_gpe_gro_receive---of 18
vxlan_gro_complete---of 1
vxlan_gro_prepare_receive---of 37
vxlan_gro_receive---of 15
vxlan_init---of 11
vxlan_init_net---of 1
vxlan_ip_miss---of 1
vxlan_netdevice_event17%of 12
vxlan_newlink---of 5
vxlan_nexthop_event---of 32
vxlan_nl2conf---of 140
vxlan_nl2flag---of 35
vxlan_offload_rx_ports---of 8
vxlan_open---of 11
vxlan_parse_gbp_hdr---of 16
vxlan_parse_gpe_proto---of 8
vxlan_rcv---of 68
vxlan_remcsum---of 14
vxlan_set_mac---of 17
vxlan_set_multicast_list---of 1
vxlan_setup---of 3
vxlan_snoop---of 31
vxlan_sock_release---of 22
vxlan_stop---of 1
vxlan_switchdev_event---of 21
vxlan_uninit---of 14
vxlan_validate---of 27
vxlan_vni_in_use---of 14
vxlan_vnifilter_lookup---of 49
vxlan_xmit---of 182
vxlan_xmit_one---of 194
-----------
SUMMARY32%of 32

-----------
SUMMARY---of 0

genlmsg_reply---of 1
netdev_genl_dev_notify---of 8
netdev_genl_netdevice_event40%of 5
netdev_nl_bind_rx_doit---of 38
netdev_nl_dev_fill---of 16
netdev_nl_dev_get_doit---of 7
netdev_nl_dev_get_dumpit---of 5
netdev_nl_napi_fill_one---of 18
netdev_nl_napi_get_doit---of 9
netdev_nl_napi_get_dumpit---of 27
netdev_nl_napi_set_doit---of 13
netdev_nl_qstats_get_dump_one---of 117
netdev_nl_qstats_get_dumpit---of 15
netdev_nl_queue_fill_one---of 22
netdev_nl_queue_get_doit---of 17
netdev_nl_queue_get_dumpit---of 25
netdev_nl_sock_priv_destroy---of 4
netdev_nl_sock_priv_init---of 1
netdev_nl_stats_queue---of 17
netdev_nl_stats_write_rx---of 31
netdev_nl_stats_write_tx---of 30
netdev_stat_put---of 4
nla_put_napi_id---of 4
nla_put_uint---of 3
-----------
SUMMARY40%of 5

-----------
SUMMARY---of 0

clear_rx_sa---of 6
copy_rx_sa_stats---of 6
copy_rx_sc_stats---of 11
copy_secy_stats---of 9
copy_tx_sa_stats---of 3
copy_tx_sc_stats---of 5
count_rx---of 5
del_rx_sc---of 18
dev_to_sci---of 1
dump_secy---of 151
find_rx_sc---of 15
find_rx_sc_rtnl---of 15
free_rx_sc---of 18
free_rx_sc_rcu---of 1
free_rxsa---of 1
free_txsa---of 1
get_rx_sa_stats---of 10
get_rx_sc_stats---of 10
get_rxsa_from_nl---of 17
get_secy_stats---of 10
get_tx_sa_stats---of 10
get_tx_sc_stats---of 10
get_txsa_from_nl---of 13
init_rx_sa---of 8
init_tx_sa---of 8
local_bh_disable---of 2
local_bh_enable---of 2
macsec_add_dev---of 6
macsec_add_rxsa---of 55
macsec_add_rxsc---of 29
macsec_add_txsa---of 47
macsec_change_mtu---of 3
macsec_changelink---of 15
macsec_changelink_common---of 33
macsec_common_dellink---of 10
macsec_count_tx---of 7
macsec_decrypt---of 29
macsec_decrypt_done---of 47
macsec_del_dev---of 22
macsec_del_rxsa---of 12
macsec_del_rxsc---of 13
macsec_del_txsa---of 15
macsec_dellink---of 8
macsec_dev_change_rx_flags---of 6
macsec_dev_init---of 24
macsec_dev_open---of 21
macsec_dev_set_rx_mode---of 1
macsec_dev_stop---of 12
macsec_dev_uninit---of 1
macsec_dump_txsc---of 8
macsec_encrypt_done---of 32
macsec_fill_info---of 19
macsec_finalize_skb---of 5
macsec_fix_features---of 1
macsec_free_netdev---of 6
macsec_get_iflink---of 1
macsec_get_link_net---of 1
macsec_get_ops---of 14
macsec_get_real_dev---of 1
macsec_get_size---of 1
macsec_get_stats64---of 3
macsec_handle_frame---of 158
macsec_netdev_is_offloaded---of 1
macsec_newlink---of 54
macsec_notify8%of 27
macsec_offload---of 6
macsec_pn_wrapped---of 5
macsec_post_decrypt---of 29
macsec_reset_skb---of 8
macsec_rxsa_get---of 14
macsec_rxsa_put---of 6
macsec_rxsc_put---of 6
macsec_set_mac_address---of 19
macsec_setup---of 1
macsec_start_xmit---of 142
macsec_txsa_put---of 6
macsec_upd_offload---of 10
macsec_upd_rxsa---of 30
macsec_upd_rxsc---of 22
macsec_upd_txsa---of 30
macsec_update_offload---of 44
macsec_validate_attr---of 57
macsec_validate_skb---of 7
netdev_lock_cmp_fn---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
register_macsec_dev---of 12
sci_exists---of 9
-----------
SUMMARY8%of 27

-----------
SUMMARY---of 0

dev_dstats_rx_dropped---of 3
dst_output---of 5
ip_neigh_gw4---of 12
ip_neigh_gw6---of 12
local_bh_disable---of 2
local_bh_enable---of 2
neigh_output---of 19
net_generic---of 16
netdev_lock_cmp_fn---of 1
netdev_ref_replace---of 16
nf_hook---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
vrf_add_fib_rules---of 6
vrf_add_mac_header_if_unset---of 19
vrf_add_slave---of 11
vrf_del_slave---of 5
vrf_dellink---of 8
vrf_dev_init---of 10
vrf_dev_uninit---of 8
vrf_device_event29%of 7
vrf_fib_rule---of 15
vrf_fib_table---of 1
vrf_fill_slave_info---of 1
vrf_fillinfo---of 1
vrf_finish_direct---of 15
vrf_finish_output---of 36
vrf_finish_output6---of 50
vrf_get_drvinfo---of 1
vrf_get_slave_size---of 1
vrf_ifindex_lookup_by_table_id---of 6
vrf_ip6_local_out---of 12
vrf_ip6_out_direct_finish---of 4
vrf_ip_local_out---of 12
vrf_ip_out_direct_finish---of 4
vrf_l3_out---of 76
vrf_l3_rcv---of 36
vrf_link_scope_lookup---of 3
vrf_local_xmit---of 9
vrf_map_unregister_dev---of 10
vrf_netns_exit---of 1
vrf_netns_init---of 4
vrf_newlink---of 24
vrf_nl_getsize---of 1
vrf_output---of 14
vrf_output6---of 8
vrf_output6_direct_finish---of 1
vrf_output_direct_finish---of 1
vrf_rcv_finish---of 1
vrf_rtable_release---of 8
vrf_setup---of 1
vrf_shared_table_handler---of 7
vrf_validate---of 9
vrf_xmit---of 50
-----------
SUMMARY29%of 7

-----------
SUMMARY---of 0

____ip_mc_inc_group---of 69
__igmp_group_dropped---of 21
__ip_mc_dec_group---of 33
__ip_mc_inc_group---of 1
__ip_mc_join_group---of 25
add_grec---of 72
add_grhead---of 6
copy_to_sockptr_offset---of 11
igmp_gq_start_timer---of 10
igmp_gq_timer_expire---of 6
igmp_group_added---of 22
igmp_heard_report---of 29
igmp_ifc_event---of 19
igmp_ifc_timer_expire---of 79
igmp_mc_get_first---of 17
igmp_mc_get_next---of 26
igmp_mc_seq_next---of 3
igmp_mc_seq_show---of 15
igmp_mc_seq_start---of 12
igmp_mc_seq_stop---of 6
igmp_mcf_get_first---of 18
igmp_mcf_get_next---of 24
igmp_mcf_seq_next---of 3
igmp_mcf_seq_show---of 3
igmp_mcf_seq_start---of 12
igmp_mcf_seq_stop---of 8
igmp_net_exit---of 3
igmp_net_init---of 5
igmp_netdev_event6%of 38
igmp_rcv---of 134
igmp_send_report---of 14
igmp_start_timer---of 10
igmp_stop_timer---of 6
igmp_timer_expire---of 14
igmpv3_add_delrec---of 16
igmpv3_clear_delrec---of 47
igmpv3_del_delrec---of 27
igmpv3_newpack---of 42
igmpv3_send_report---of 31
igmpv3_sendpack---of 10
inet_fill_ifmcaddr---of 9
inet_ifmcaddr_notify---of 5
ip_check_mc_rcu---of 39
ip_ma_put---of 12
ip_mc_add_src---of 64
ip_mc_check_igmp---of 76
ip_mc_del1_src---of 23
ip_mc_del_src---of 52
ip_mc_destroy_dev---of 16
ip_mc_down---of 27
ip_mc_drop_socket---of 12
ip_mc_find_dev---of 12
ip_mc_gsfget---of 32
ip_mc_inc_group---of 1
ip_mc_init_dev---of 4
ip_mc_join_group---of 1
ip_mc_join_group_ssm---of 1
ip_mc_leave_group---of 23
ip_mc_leave_src---of 9
ip_mc_msfget---of 33
ip_mc_msfilter---of 40
ip_mc_remap---of 17
ip_mc_sf_allow---of 42
ip_mc_source---of 58
ip_mc_unmap---of 17
ip_mc_up---of 17
ip_mc_validate_checksum---of 11
is_in---of 22
pskb_may_pull---of 6
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sf_setstate---of 19
unsolicited_report_interval---of 11
-----------
SUMMARY6%of 38

__printk_deferred_enter50%of 4
__printk_deferred_exit50%of 4
__printk_safe_enter50%of 4
__printk_safe_exit50%of 4
is_printk_force_console100%of 1
is_printk_legacy_deferred43%of 7
printk_force_console_enter---of 3
printk_force_console_exit---of 3
vprintk100%of 1
-----------
SUMMARY52%of 25

-----------
SUMMARY---of 0

chacha_block_generic100%of 1
chacha_permute58%of 7
hchacha_block_generic---of 1
-----------
SUMMARY63%of 8

-----------
SUMMARY---of 0

__set_ptes---of 23
arch_hugetlb_migration_supported67%of 6
arch_make_huge_pte---of 6
find_num_contig---of 12
flush_tlb_range---of 25
get_clear_contig---of 11
huge_pte_alloc40%of 51
huge_pte_clear---of 8
huge_pte_offset40%of 25
huge_ptep_clear_flush---of 3
huge_ptep_get16%of 13
huge_ptep_get_and_clear---of 6
huge_ptep_modify_prot_commit---of 1
huge_ptep_modify_prot_start---of 10
huge_ptep_set_access_flags---of 14
huge_ptep_set_wrprotect---of 7
hugetlb_mask_last_page67%of 6
pte_offset_huge67%of 3
set_huge_pte_at---of 14
-----------
SUMMARY41%of 104

__ref_tracker_dir_pr_ostream---of 30
ref_tracker_alloc27%of 15
ref_tracker_dir_exit---of 16
ref_tracker_dir_print---of 1
ref_tracker_dir_print_locked---of 1
ref_tracker_dir_snprint---of 1
ref_tracker_free19%of 22
refcount_inc---of 6
-----------
SUMMARY22%of 37

-----------
SUMMARY---of 0

kasprintf100%of 1
kvasprintf50%of 4
kvasprintf_const---of 8
-----------
SUMMARY60%of 5

-----------
SUMMARY---of 0

__folio_cancel_dirty9%of 23
__folio_end_writeback---of 38
__folio_mark_dirty---of 50
__folio_start_writeback---of 40
__wb_calc_thresh---of 9
__wb_update_bandwidth---of 21
__wb_writeout_add---of 6
balance_dirty_pages---of 50
balance_dirty_pages_ratelimited100%of 1
balance_dirty_pages_ratelimited_flags4%of 57
balance_wb_limits---of 22
bdi_get_max_bytes---of 1
bdi_get_min_bytes---of 1
bdi_set_max_bytes---of 7
bdi_set_max_ratio---of 4
bdi_set_max_ratio_no_scale---of 4
bdi_set_min_bytes---of 8
bdi_set_min_ratio---of 5
bdi_set_min_ratio_no_scale---of 5
bdi_set_strict_limit---of 3
cgwb_calc_thresh---of 1
dirty_background_bytes_handler---of 4
dirty_background_ratio_handler---of 3
dirty_bytes_handler---of 5
dirty_ratio_handler---of 4
dirty_writeback_centisecs_handler---of 4
do_writepages---of 25
domain_dirty_avail---of 5
domain_dirty_limits---of 22
domain_over_bg_thresh---of 4
filemap_dirty_folio---of 8
folio_account_cleaned---of 10
folio_clear_dirty_for_io---of 37
folio_mark_dirty50%of 6
folio_mark_dirty_lock---of 7
folio_redirty_for_writepage---of 21
folio_wait_stable---of 6
folio_wait_writeback---of 5
folio_wait_writeback_killable---of 5
global_dirty_limits---of 1
laptop_io_completion---of 1
laptop_mode_timer_fn---of 1
laptop_sync_completion---of 14
node_dirty_ok---of 13
noop_dirty_folio60%of 5
page_writeback_cpu_online---of 1
percpu_ref_put_many---of 18
percpu_ref_tryget---of 19
rcu_lock_acquire---of 2
rcu_lock_release---of 2
tag_pages_for_writeback---of 21
task_get_css---of 23
trace_balance_dirty_pages---of 14
trace_folio_wait_writeback---of 14
wb_calc_thresh---of 1
wb_dirty_limits---of 5
wb_domain_exit---of 1
wb_domain_init---of 1
wb_over_bg_thresh---of 4
wb_update_bandwidth---of 1
wb_update_dirty_ratelimit---of 20
wb_writeout_inc---of 5
write_cache_pages---of 6
writeback_iter---of 65
writeback_set_ratelimit---of 1
writeout_period---of 3
-----------
SUMMARY12%of 92

alloc_mnt_idmap---of 16
from_vfsgid34%of 6
from_vfsuid34%of 6
make_vfsgid29%of 7
make_vfsuid29%of 7
mnt_idmap_get---of 7
mnt_idmap_put---of 10
statmount_mnt_idmap---of 12
vfsgid_in_group_p100%of 1
-----------
SUMMARY34%of 27

-----------
SUMMARY---of 0

audit_ctl_lock---of 1
audit_ctl_unlock---of 1
audit_free_reply---of 8
audit_get_tty---of 8
audit_log---of 3
audit_log_common_recv_msg---of 4
audit_log_config_change---of 3
audit_log_d_path45%of 9
audit_log_d_path_exe---of 4
audit_log_end34%of 6
audit_log_format67%of 3
audit_log_key---of 3
audit_log_lost---of 13
audit_log_multicast---of 12
audit_log_n_hex27%of 15
audit_log_n_string17%of 12
audit_log_n_untrustedstring---of 5
audit_log_path_denied---of 6
audit_log_session_info---of 1
audit_log_start17%of 31
audit_log_task_context---of 8
audit_log_task_info---of 27
audit_log_untrustedstring80%of 5
audit_log_vformat20%of 20
audit_make_reply---of 7
audit_multicast_bind---of 1
audit_multicast_unbind---of 1
audit_net_exit---of 1
audit_net_init---of 6
audit_panic---of 5
audit_put_tty---of 1
audit_receive---of 162
audit_send_list_thread---of 11
audit_send_reply---of 10
audit_send_reply_thread---of 3
audit_serial---of 3
audit_set_backlog_wait_time---of 4
audit_set_loginuid---of 31
audit_signal_info---of 7
audit_string_contains_control---of 4
auditd_conn_free---of 6
auditd_pid_vnr---of 19
auditd_reset---of 12
auditd_set---of 22
auditd_test_task28%of 18
kauditd_hold_skb---of 11
kauditd_retry_skb---of 6
kauditd_send_multicast_skb---of 4
kauditd_send_queue---of 23
kauditd_thread---of 65
net_generic---of 16
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY30%of 123

-----------
SUMMARY---of 0

coalesced_mmio_destructor67%of 3
coalesced_mmio_write30%of 10
kvm_coalesced_mmio_free67%of 3
kvm_coalesced_mmio_init25%of 36
kvm_vm_ioctl_register_coalesced_mmio50%of 6
kvm_vm_ioctl_unregister_coalesced_mmio100%of 12
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY48%of 74

-----------
SUMMARY---of 0

__mmu_interval_notifier_insert---of 17
__mmu_notifier_arch_invalidate_secondary_tlbs75%of 8
__mmu_notifier_clear_flush_young---of 8
__mmu_notifier_clear_young---of 8
__mmu_notifier_invalidate_range_end54%of 15
__mmu_notifier_invalidate_range_start30%of 30
__mmu_notifier_register41%of 22
__mmu_notifier_release---of 21
__mmu_notifier_subscriptions_destroy---of 3
__mmu_notifier_test_young---of 9
mmu_interval_notifier_insert---of 6
mmu_interval_notifier_insert_locked---of 9
mmu_interval_notifier_remove---of 19
mmu_interval_read_begin---of 6
mmu_notifier_free_rcu---of 5
mmu_notifier_get_locked---of 12
mmu_notifier_put---of 6
mmu_notifier_register50%of 10
mmu_notifier_synchronize---of 1
mmu_notifier_unregister47%of 15
mn_itree_inv_end---of 13
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
-----------
SUMMARY47%of 104

-----------
SUMMARY---of 0

__put_user_ns---of 1
cmp_extents_forward---of 1
cmp_extents_reverse---of 1
cmp_map_id---of 1
create_user_ns---of 37
current_in_userns---of 4
enforced_nproc_rlimit---of 4
free_user_ns---of 12
from_kgid20%of 15
from_kgid_munged19%of 16
from_kprojid---of 15
from_kprojid_munged---of 16
from_kuid20%of 15
from_kuid_munged19%of 16
gid_m_show---of 17
gid_m_start---of 4
in_userns---of 4
m_next---of 1
m_stop---of 1
make_kgid20%of 15
make_kprojid27%of 15
make_kuid20%of 15
map_id_down---of 15
map_id_range_up---of 15
map_id_up---of 15
map_write---of 61
new_idmap_permitted---of 54
ns_get_owner34%of 9
proc_gid_map_write---of 4
proc_projid_map_write---of 4
proc_setgroups_show---of 1
proc_setgroups_write---of 22
proc_uid_map_write---of 4
projid_m_show---of 17
projid_m_start---of 4
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sort_idmaps---of 4
uid_m_show---of 17
uid_m_start---of 4
unshare_userns---of 8
userns_get---of 22
userns_install---of 20
userns_may_setgroups---of 3
userns_owner---of 1
userns_put43%of 7
-----------
SUMMARY23%of 123

__list_lru_init---of 13
__list_lru_walk_one---of 24
list_lru_add---of 9
list_lru_add_obj---of 13
list_lru_count_node---of 1
list_lru_count_one---of 19
list_lru_del---of 7
list_lru_del_obj---of 13
list_lru_destroy---of 23
list_lru_isolate---of 3
list_lru_isolate_move---of 5
list_lru_walk_node---of 63
list_lru_walk_one---of 1
list_lru_walk_one_irq---of 1
lock_list_lru_of_memcg---of 23
memcg_list_lru_alloc34%of 24
memcg_reparent_list_lrus---of 16
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
-----------
SUMMARY34%of 24

check_nested_vcpu_requests7%of 58
compute_tlb_inval_range---of 38
kvm_arch_flush_shadow_all34%of 6
kvm_handle_s1e2_tlbi---of 122
kvm_handle_vncr_abort---of 77
kvm_init_nested100%of 1
kvm_init_nested_s2_mmu---of 1
kvm_init_nv_sysregs---of 74
kvm_inject_s2_fault---of 1
kvm_invalidate_vncr_ipa14%of 22
kvm_nested_s2_flush---of 9
kvm_nested_s2_unmap34%of 9
kvm_nested_s2_wp34%of 9
kvm_s2_handle_perm_fault---of 8
kvm_s2_mmu_iterate_by_vmid---of 7
kvm_vcpu_allocate_vncr_tlb---of 3
kvm_vcpu_apply_reg_masks---of 3
kvm_vcpu_init_nested---of 19
kvm_vcpu_load_hw_mmu---of 49
kvm_vcpu_put_hw_mmu---of 35
kvm_walk_nested_s2---of 42
limit_nv_id_reg---of 11
lookup_s2_mmu---of 14
srcu_lock_acquire---of 2
srcu_lock_release---of 2
-----------
SUMMARY16%of 105

-----------
SUMMARY---of 0

srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
tomoyo_add_slash---of 6
tomoyo_check_mkdev_acl---of 20
tomoyo_check_open_permission43%of 14
tomoyo_check_path2_acl---of 9
tomoyo_check_path_acl60%of 5
tomoyo_check_path_number_acl70%of 10
tomoyo_compare_name_union---of 4
tomoyo_compare_number_union---of 4
tomoyo_execute_permission---of 6
tomoyo_merge_mkdev_acl---of 1
tomoyo_merge_path2_acl---of 1
tomoyo_merge_path_acl---of 1
tomoyo_merge_path_number_acl---of 1
tomoyo_mkdev_perm---of 6
tomoyo_path2_perm---of 14
tomoyo_path_number_perm45%of 9
tomoyo_path_perm34%of 15
tomoyo_put_name_union---of 7
tomoyo_put_number_union---of 4
tomoyo_same_mkdev_acl---of 18
tomoyo_same_mount_acl---of 12
tomoyo_same_path2_acl---of 5
tomoyo_same_path_acl---of 3
tomoyo_same_path_number_acl---of 8
tomoyo_update_mount_acl---of 27
tomoyo_write_file---of 58
-----------
SUMMARY51%of 57

-----------
SUMMARY---of 0

__flush_tlb_range---of 43
__hugetlb_zap_begin36%of 17
__hugetlb_zap_end27%of 23
__nr_hugepages_store_common---of 37
__unmap_hugepage_range16%of 75
__update_and_free_hugetlb_folio---of 13
__vma_private_lock---of 5
__vma_reservation_common30%of 24
adjust_pool_surplus---of 39
adjust_range_if_pmd_sharing_possible---of 8
alloc_and_dissolve_hugetlb_folio---of 20
alloc_buddy_hugetlb_folio---of 26
alloc_gigantic_folio---of 24
alloc_hugetlb_folio19%of 65
alloc_hugetlb_folio_nodemask---of 19
alloc_hugetlb_folio_reserve---of 4
alloc_hugetlb_folio_vma---of 6
alloc_pool_huge_folio---of 19
alloc_surplus_hugetlb_folio25%of 12
allocate_file_region_entries12%of 18
clear_vma_resv_huge_pages---of 24
copy_hugetlb_page_range---of 144
css_get---of 17
css_put---of 19
demote_size_show---of 12
demote_size_store---of 19
demote_store---of 95
dequeue_hugetlb_folio_nodemask---of 60
dissolve_free_hugetlb_folio---of 17
dissolve_free_hugetlb_folios---of 10
enqueue_hugetlb_folio---of 12
folio_isolate_hugetlb---of 10
folio_large_mapcount---of 4
folio_lock---of 7
folio_put---of 6
folio_putback_hugetlb---of 20
folio_try_get---of 16
free_hpage_workfn---of 10
free_huge_folio---of 68
free_hugepages_show---of 14
get_huge_page_for_hwpoison---of 1
get_hwpoison_hugetlb_folio---of 5
hstate_next_node_to_alloc---of 13
huge_pmd_share---of 27
huge_pmd_unshare29%of 21
hugepage_new_subpool---of 5
hugepage_put_subpool---of 10
hugepage_subpool_put_pages15%of 14
hugetlb_acct_memory17%of 73
hugetlb_add_file_rmap---of 9
hugetlb_add_to_page_cache---of 6
hugetlb_change_protection---of 70
hugetlb_dup_vma_private---of 6
hugetlb_fault8%of 144
hugetlb_fault_mutex_hash---of 1
hugetlb_fix_reserve_counts---of 12
hugetlb_folio_mapping_lock_write---of 3
hugetlb_free_folio---of 7
hugetlb_handle_userfault---of 7
hugetlb_mempolicy_sysctl_handler---of 4
hugetlb_mfill_atomic_pte---of 76
hugetlb_overcommit_handler---of 6
hugetlb_pte_stable60%of 5
hugetlb_register_node---of 8
hugetlb_report_meminfo---of 6
hugetlb_report_node_meminfo---of 1
hugetlb_report_usage---of 1
hugetlb_reserve_pages33%of 64
hugetlb_resv_map_add8%of 28
hugetlb_show_meminfo_node---of 4
hugetlb_sysctl_handler---of 4
hugetlb_sysfs_add_hstate---of 6
hugetlb_total_pages---of 4
hugetlb_unregister_node---of 8
hugetlb_unreserve_pages50%of 4
hugetlb_unshare_all_pmds---of 1
hugetlb_unshare_pmds---of 33
hugetlb_vm_op_close20%of 25
hugetlb_vm_op_open---of 23
hugetlb_vm_op_pagesize---of 1
hugetlb_vm_op_split34%of 6
hugetlb_vma_assert_locked28%of 11
hugetlb_vma_lock_free50%of 10
hugetlb_vma_lock_read43%of 7
hugetlb_vma_lock_release---of 1
hugetlb_vma_lock_write---of 7
hugetlb_vma_trylock_write---of 7
hugetlb_vma_unlock_read43%of 7
hugetlb_vma_unlock_write---of 7
hugetlb_wp---of 123
hugetlbfs_pagecache_present---of 7
is_hugetlb_entry_hwpoisoned---of 3
is_hugetlb_entry_migration---of 3
isolate_or_dissolve_huge_page---of 17
mmu_notifier_invalidate_range_start---of 3
move_hugetlb_page_tables---of 61
move_hugetlb_state---of 33
nr_hugepages_mempolicy_show---of 14
nr_hugepages_mempolicy_store---of 1
nr_hugepages_show---of 14
nr_hugepages_store---of 1
nr_hugepages_store_common---of 13
nr_overcommit_hugepages_show---of 12
nr_overcommit_hugepages_store---of 15
only_alloc_fresh_hugetlb_folio---of 9
pfn_swap_entry_to_page---of 14
prep_and_add_allocated_folios---of 8
rcu_lock_acquire---of 2
rcu_lock_release---of 2
region_add---of 22
region_chg36%of 17
region_del9%of 24
remove_hugetlb_folio---of 19
remove_pool_hugetlb_folio---of 24
replace_free_hugepage_folios---of 17
restore_reserve_on_error---of 12
resv_hugepages_show---of 12
resv_map_alloc50%of 4
resv_map_release50%of 8
size_to_hstate80%of 5
surplus_hugepages_show---of 14
tlb_flush_mmu_tlbonly13%of 16
unmap_hugepage_range---of 10
vma_end_read---of 8
vma_kernel_pagesize---of 4
vma_mmu_pagesize---of 4
wait_for_freed_hugetlb_folios---of 3
want_pmd_share60%of 5
-----------
SUMMARY21%of 732

_atomic_dec_and_lock56%of 9
_atomic_dec_and_lock_irqsave---of 9
_atomic_dec_and_raw_lock---of 9
_atomic_dec_and_raw_lock_irqsave---of 9
-----------
SUMMARY56%of 9

flow_offload_ct_tcp---of 1
flow_offload_netdev_event67%of 3
nft_dev_forward_path---of 61
nft_flow_offload_activate---of 3
nft_flow_offload_deactivate---of 1
nft_flow_offload_destroy---of 1
nft_flow_offload_dump---of 1
nft_flow_offload_eval---of 39
nft_flow_offload_init---of 5
nft_flow_offload_validate---of 5
nft_flow_route---of 17
-----------
SUMMARY67%of 3

can_can_gw_rcv---of 44
cangw_pernet_exit_batch---of 4
cangw_pernet_init---of 1
cgw_create_job---of 29
cgw_csum_crc8_neg---of 8
cgw_csum_crc8_pos---of 8
cgw_csum_crc8_rel---of 12
cgw_csum_xor_neg---of 8
cgw_csum_xor_pos---of 8
cgw_csum_xor_rel---of 14
cgw_dump_jobs---of 74
cgw_job_free_rcu---of 1
cgw_notifier16%of 13
cgw_parse_attr---of 98
cgw_register_filter---of 1
cgw_remove_all_jobs---of 9
cgw_remove_job---of 26
hlist_add_head_rcu---of 3
local_bh_disable---of 2
local_bh_enable---of 2
mod_and_ccdlc---of 6
mod_and_data---of 1
mod_and_fddata---of 1
mod_and_flags---of 1
mod_and_id---of 1
mod_and_len---of 1
mod_or_ccdlc---of 6
mod_or_data---of 1
mod_or_fddata---of 1
mod_or_flags---of 1
mod_or_id---of 1
mod_or_len---of 1
mod_set_ccdlc---of 3
mod_set_data---of 1
mod_set_fddata---of 1
mod_set_flags---of 1
mod_set_id---of 1
mod_set_len---of 1
mod_xor_ccdlc---of 6
mod_xor_data---of 1
mod_xor_fddata---of 1
mod_xor_flags---of 1
mod_xor_id---of 1
mod_xor_len---of 1
rcu_lock_acquire---of 2
rcu_lock_release---of 2
-----------
SUMMARY16%of 13

-----------
SUMMARY---of 0

kvm_vfio_create63%of 8
kvm_vfio_has_attr100%of 4
kvm_vfio_ops_exit---of 1
kvm_vfio_ops_init---of 1
kvm_vfio_release20%of 15
kvm_vfio_set_attr24%of 51
-----------
SUMMARY31%of 78

__is_kernel_percpu_address50%of 6
__probestub_percpu_alloc_percpu---of 1
__probestub_percpu_alloc_percpu_fail---of 1
__probestub_percpu_create_chunk---of 1
__probestub_percpu_destroy_chunk---of 1
__probestub_percpu_free_percpu---of 1
__traceiter_percpu_alloc_percpu---of 4
__traceiter_percpu_alloc_percpu_fail---of 4
__traceiter_percpu_create_chunk---of 4
__traceiter_percpu_destroy_chunk---of 4
__traceiter_percpu_free_percpu---of 4
free_percpu10%of 71
is_kernel_percpu_address---of 5
pcpu_alloc_area58%of 26
pcpu_alloc_noprof18%of 110
pcpu_balance_free---of 46
pcpu_balance_workfn---of 64
pcpu_block_refresh_hint100%of 6
pcpu_block_update92%of 24
pcpu_block_update_hint_alloc68%of 31
pcpu_chunk_refresh_hint83%of 23
pcpu_chunk_relocate20%of 15
pcpu_create_chunk---of 28
pcpu_depopulate_chunk---of 28
pcpu_dump_alloc_info---of 21
pcpu_find_block_fit56%of 9
pcpu_free_area47%of 30
pcpu_memcg_post_alloc_hook6%of 37
pcpu_next_fit_region74%of 15
pcpu_nr_pages---of 1
pcpu_obj_full_size---of 1
pcpu_populate_chunk---of 44
pcpu_post_unmap_tlb_flush---of 16
per_cpu_ptr_to_phys---of 12
perf_trace_percpu_alloc_percpu---of 6
perf_trace_percpu_alloc_percpu_fail---of 6
perf_trace_percpu_create_chunk---of 6
perf_trace_percpu_destroy_chunk---of 6
perf_trace_percpu_free_percpu---of 6
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
trace_event_raw_event_percpu_alloc_percpu---of 7
trace_event_raw_event_percpu_alloc_percpu_fail---of 7
trace_event_raw_event_percpu_create_chunk---of 7
trace_event_raw_event_percpu_destroy_chunk---of 7
trace_event_raw_event_percpu_free_percpu---of 7
trace_percpu_create_chunk---of 17
trace_percpu_free_percpu24%of 17
trace_raw_output_percpu_alloc_percpu---of 4
trace_raw_output_percpu_alloc_percpu_fail---of 3
trace_raw_output_percpu_create_chunk---of 3
trace_raw_output_percpu_destroy_chunk---of 3
trace_raw_output_percpu_free_percpu---of 3
-----------
SUMMARY36%of 420

__set_fixmap50%of 10
-----------
SUMMARY50%of 10

__blk_freeze_queue_start---of 10
__blk_mq_alloc_disk---of 10
__blk_mq_alloc_driver_tag---of 22
__blk_mq_alloc_requests---of 40
__blk_mq_complete_request_remote---of 1
__blk_mq_end_request---of 21
__blk_mq_free_request---of 16
__blk_mq_realloc_hw_ctxs---of 20
__blk_mq_requeue_request---of 25
__blk_mq_unfreeze_queue---of 10
blk_account_io_completion---of 6
blk_account_io_done---of 23
blk_account_io_start---of 30
blk_add_rq_to_plug---of 14
blk_done_softirq---of 4
blk_dump_rq_flags---of 7
blk_end_sync_rq---of 1
blk_execute_rq---of 12
blk_execute_rq_nowait---of 7
blk_freeze_queue_start---of 6
blk_freeze_queue_start_non_owner---of 4
blk_hctx_poll---of 18
blk_insert_cloned_request---of 32
blk_mq_alloc_and_init_hctx---of 29
blk_mq_alloc_disk_for_queue---of 4
blk_mq_alloc_map_and_rqs---of 63
blk_mq_alloc_queue---of 6
blk_mq_alloc_request---of 18
blk_mq_alloc_request_hctx---of 24
blk_mq_alloc_set_map_and_rqs---of 23
blk_mq_alloc_sq_tag_set---of 1
blk_mq_alloc_tag_set---of 34
blk_mq_cancel_work_sync---of 4
blk_mq_check_expired---of 6
blk_mq_check_inflight---of 7
blk_mq_complete_request---of 3
blk_mq_complete_request_remote---of 22
blk_mq_delay_kick_requeue_list---of 1
blk_mq_delay_run_hw_queue---of 19
blk_mq_delay_run_hw_queues---of 15
blk_mq_dequeue_from_ctx---of 16
blk_mq_destroy_queue---of 8
blk_mq_dispatch_rq_list---of 84
blk_mq_dispatch_wake---of 9
blk_mq_end_request---of 3
blk_mq_end_request_batch---of 66
blk_mq_exit_hctx---of 20
blk_mq_exit_queue---of 18
blk_mq_flush_busy_ctxs---of 10
blk_mq_flush_plug_list3%of 79
blk_mq_free_map_and_rqs---of 3
blk_mq_free_plug_rqs---of 6
blk_mq_free_request---of 17
blk_mq_free_rq_map---of 1
blk_mq_free_rqs---of 34
blk_mq_free_tag_set---of 15
blk_mq_freeze_queue_nomemsave---of 6
blk_mq_freeze_queue_wait---of 5
blk_mq_freeze_queue_wait_timeout---of 5
blk_mq_handle_expired---of 9
blk_mq_has_request---of 3
blk_mq_hctx_notify_dead---of 13
blk_mq_hctx_notify_offline---of 14
blk_mq_hctx_notify_online---of 4
blk_mq_hw_queue_need_run---of 27
blk_mq_in_flight---of 1
blk_mq_in_flight_rw---of 1
blk_mq_inc_active_requests---of 6
blk_mq_init_allocated_queue---of 25
blk_mq_insert_request---of 18
blk_mq_kick_requeue_list---of 1
blk_mq_map_swqueue---of 45
blk_mq_plug_issue_direct---of 20
blk_mq_poll---of 4
blk_mq_put_rq_ref---of 9
blk_mq_queue_inflight---of 1
blk_mq_quiesce_queue---of 6
blk_mq_quiesce_queue_nowait---of 3
blk_mq_quiesce_tagset---of 9
blk_mq_realloc_hw_ctxs---of 32
blk_mq_release---of 11
blk_mq_request_issue_directly---of 20
blk_mq_requeue_request---of 8
blk_mq_requeue_work---of 18
blk_mq_rq_cpu---of 1
blk_mq_rq_ctx_init---of 6
blk_mq_rq_inflight---of 7
blk_mq_run_hw_queue---of 25
blk_mq_run_hw_queues---of 14
blk_mq_run_work_fn---of 13
blk_mq_start_hw_queue---of 3
blk_mq_start_hw_queues---of 6
blk_mq_start_request---of 29
blk_mq_start_stopped_hw_queue---of 5
blk_mq_start_stopped_hw_queues---of 10
blk_mq_stop_hw_queue---of 3
blk_mq_stop_hw_queues---of 6
blk_mq_submit_bio---of 135
blk_mq_timeout_work---of 13
blk_mq_try_issue_directly---of 27
blk_mq_try_issue_list_directly---of 18
blk_mq_unfreeze_queue_nomemrestore---of 11
blk_mq_unfreeze_queue_non_owner---of 8
blk_mq_unquiesce_queue---of 4
blk_mq_unquiesce_tagset---of 8
blk_mq_update_nr_hw_queues---of 83
blk_mq_update_nr_requests---of 25
blk_mq_update_queue_map---of 12
blk_mq_update_tag_set_shared---of 18
blk_mq_wait_quiesce_done---of 3
blk_mq_wake_waiters---of 7
blk_print_req_error---of 4
blk_rq_cur_bytes---of 7
blk_rq_init---of 5
blk_rq_is_poll---of 3
blk_rq_poll---of 5
blk_rq_prep_clone---of 19
blk_rq_unprep_clone---of 4
blk_softirq_cpu_dead---of 4
blk_steal_bios---of 3
blk_update_request---of 44
percpu_ref_get_many---of 16
percpu_ref_put_many---of 18
percpu_ref_tryget---of 19
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
srcu_lock_acquire---of 2
srcu_lock_release---of 2
srcu_read_lock---of 1
srcu_read_unlock---of 3
trace_block_plug---of 14
trace_block_rq_complete---of 14
trace_block_rq_error---of 14
trace_block_rq_insert---of 14
trace_block_unplug---of 14
-----------
SUMMARY3%of 79

-----------
SUMMARY---of 0

kvm_arch_can_set_irq_routing100%of 1
kvm_arch_irq_routing_update100%of 1
kvm_free_irq_routing56%of 9
kvm_init_irq_routing67%of 3
kvm_irq_map_chip_pin---of 7
kvm_irq_map_gsi47%of 13
kvm_send_userspace_msi100%of 4
kvm_set_irq37%of 19
kvm_set_irq_routing76%of 33
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
-----------
SUMMARY64%of 87

-----------
SUMMARY---of 0

__kvm_adjust_pc62%of 18
enter_exception3236%of 17
enter_exception6413%of 54
-----------
SUMMARY27%of 89

__lock_sock---of 7
__lock_sock_fast---of 3
__receive_sock---of 3
__release_sock---of 15
__sk_backlog_rcv---of 3
__sk_destruct---of 39
__sk_dst_check---of 12
__sk_flush_backlog---of 3
__sk_free---of 20
__sk_mem_raise_allocated---of 62
__sk_mem_reclaim---of 1
__sk_mem_reduce_allocated---of 16
__sk_mem_schedule---of 3
__sk_receive_skb---of 32
__sock_cmsg_send---of 31
__sock_i_ino---of 3
__sock_queue_rcv_skb---of 33
__sock_set_mark---of 3
__sock_wfree---of 6
_inline_copy_from_user---of 8
bpf_skops_tx_timestamping---of 1
copy_from_sockptr---of 4
copy_to_sockptr_offset---of 11
cred_to_ucred---of 3
dst_negative_advice---of 14
get_pid---of 7
groups_to_user---of 5
local_bh_disable---of 2
local_bh_enable---of 2
lock_sock_nested67%of 3
net_passive_inc---of 6
proto_exit_net---of 1
proto_init_net---of 1
proto_memory_pcpu_drain---of 7
proto_register---of 27
proto_seq_next---of 1
proto_seq_show---of 9
proto_seq_start---of 1
proto_seq_stop---of 1
proto_unregister---of 10
put_cred---of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
release_sock56%of 9
sk_alloc---of 11
sk_busy_loop_end---of 9
sk_capable---of 3
sk_clear_memalloc---of 6
sk_clone_lock---of 47
sk_common_release---of 16
sk_destruct---of 4
sk_dst_check---of 26
sk_dst_reset---of 1
sk_error_report---of 17
sk_free---of 6
sk_free_unlock_clone---of 6
sk_get_meminfo---of 1
sk_get_peer_cred---of 4
sk_getsockopt---of 114
sk_init_common---of 1
sk_ioctl---of 24
sk_mc_loop---of 9
sk_net_capable---of 3
sk_net_refcnt_upgrade---of 11
sk_ns_capable---of 3
sk_page_frag_refill---of 4
sk_prot_alloc---of 12
sk_reset_timer---of 7
sk_send_sigurg---of 20
sk_set_memalloc---of 1
sk_set_peek_off---of 1
sk_set_prio_allowed---of 6
sk_setsockopt---of 204
sk_setup_caps---of 25
sk_stop_timer---of 6
sk_stop_timer_sync---of 6
sk_stream_moderate_sndbuf---of 4
sk_wait_data---of 13
skb_dst_force---of 17
skb_orphan_partial---of 18
skb_page_frag_refill18%of 17
skb_set_owner_edemux---of 12
skb_set_owner_w42%of 12
sock_alloc_send_pskb18%of 29
sock_bind_add---of 3
sock_bindtoindex---of 12
sock_bindtoindex_locked---of 7
sock_cmsg_send---of 10
sock_common_getsockopt---of 1
sock_common_recvmsg---of 3
sock_common_setsockopt---of 1
sock_copy_user_timeval---of 13
sock_def_destruct---of 1
sock_def_error_report---of 25
sock_def_readable24%of 38
sock_def_wakeup---of 19
sock_def_write_space---of 26
sock_devmem_dontneed---of 59
sock_efree---of 10
sock_enable_timestamp---of 4
sock_enable_timestamps---of 6
sock_error---of 3
sock_gen_cookie---of 3
sock_get_timeout---of 6
sock_getbindtodevice---of 8
sock_gettstamp---of 11
sock_i_ino---of 3
sock_i_uid---of 3
sock_init_data---of 3
sock_init_data_uid---of 3
sock_inuse_exit_net---of 1
sock_inuse_get---of 4
sock_inuse_init_net---of 1
sock_ioctl_inout---of 14
sock_kfree_s---of 4
sock_kmalloc---of 8
sock_kmemdup---of 3
sock_kzfree_s---of 4
sock_load_diag_module---of 6
sock_no_accept---of 1
sock_no_bind---of 1
sock_no_connect---of 1
sock_no_getname---of 1
sock_no_ioctl---of 1
sock_no_linger---of 3
sock_no_listen---of 1
sock_no_mmap---of 1
sock_no_recvmsg---of 1
sock_no_sendmsg---of 1
sock_no_sendmsg_locked---of 1
sock_no_shutdown---of 1
sock_no_socketpair---of 1
sock_ofree---of 3
sock_omalloc---of 5
sock_pfree---of 18
sock_prot_inuse_get---of 4
sock_queue_rcv_skb_reason---of 7
sock_recv_errqueue---of 12
sock_release_reserved_memory---of 8
sock_reserve_memory---of 17
sock_rfree---of 9
sock_set_keepalive---of 5
sock_set_mark---of 5
sock_set_priority---of 1
sock_set_rcvbuf---of 3
sock_set_reuseaddr---of 3
sock_set_reuseport---of 3
sock_set_sndtimeo---of 3
sock_set_timeout---of 8
sock_set_timestamp---of 18
sock_set_timestamping---of 29
sock_setsockopt---of 1
sock_update_classid---of 21
sock_update_netprioidx---of 21
sock_wfree14%of 46
sock_wmalloc---of 6
sockopt_capable---of 3
sockopt_lock_sock---of 4
sockopt_ns_capable---of 3
sockopt_release_sock---of 3
-----------
SUMMARY25%of 158

audit_comparator---of 10
audit_compare_dname_path---of 7
audit_compare_rule---of 51
audit_data_to_entry---of 192
audit_del_rule---of 27
audit_dupe_rule---of 46
audit_filter6%of 78
audit_free_rule_rcu---of 18
audit_gid_comparator---of 8
audit_list_rules_send---of 43
audit_log_rule_change---of 4
audit_match_class---of 4
audit_match_signal---of 18
audit_pack_string---of 1
audit_rule_change---of 58
audit_uid_comparator---of 8
audit_unpack_string---of 6
audit_update_lsm_rules---of 19
parent_len---of 5
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
-----------
SUMMARY10%of 82

__add_preferred_console---of 29
__arm64_sys_syslog---of 1
__console_rewind_all---of 11
__console_unlock60%of 5
__down_trylock_console_sem67%of 9
__pr_flush---of 37
__printk_cpu_sync_put---of 4
__printk_cpu_sync_try_get---of 7
__printk_cpu_sync_wait---of 3
__printk_ratelimit---of 1
__probestub_console---of 1
__traceiter_console---of 4
__wake_up_klogd63%of 8
_printk100%of 1
_printk_deferred100%of 1
access_ok---of 4
add_preferred_console---of 1
console_conditional_schedule---of 3
console_cpu_notify---of 10
console_device---of 9
console_flush_all44%of 46
console_flush_on_panic---of 7
console_force_preferred_locked---of 15
console_list_lock---of 5
console_list_unlock---of 1
console_lock---of 5
console_lock_spinning_disable_and_check34%of 6
console_lock_spinning_enable67%of 3
console_prepend_dropped---of 1
console_prepend_message---of 4
console_prepend_replay---of 1
console_resume---of 13
console_resume_all---of 19
console_srcu_read_lock100%of 1
console_srcu_read_lock_is_held67%of 3
console_srcu_read_unlock67%of 3
console_suspend---of 8
console_suspend_all---of 15
console_try_replay_all---of 11
console_trylock---of 6
console_trylock_spinning35%of 23
console_unblank---of 30
console_unlock28%of 11
console_verbose---of 4
defer_console_output---of 1
devkmsg_emit---of 1
devkmsg_llseek---of 7
devkmsg_open---of 10
devkmsg_poll---of 6
devkmsg_read---of 21
devkmsg_release---of 4
devkmsg_sysctl_set_loglvl---of 18
devkmsg_write---of 14
do_syslog---of 34
find_first_fitting_seq---of 11
get_init_console_seq---of 16
hlist_add_behind_rcu---of 3
hlist_add_head_rcu---of 3
info_print_prefix60%of 5
is_console_locked---of 1
is_printk_cpu_sync_owner100%of 1
kmsg_dump_desc---of 18
kmsg_dump_get_buffer---of 15
kmsg_dump_get_line---of 10
kmsg_dump_reason_str---of 3
kmsg_dump_register---of 5
kmsg_dump_rewind---of 3
kmsg_dump_unregister---of 4
lockdep_assert_console_list_lock_held---of 4
log_buf_addr_get---of 1
log_buf_len_get---of 1
log_buf_vmcoreinfo_setup---of 1
match_devname_and_update_preferred_console---of 41
msg_add_dict_text---of 18
other_cpu_in_panic---of 3
perf_trace_console---of 7
pr_flush---of 1
printk_get_console_flush_type14%of 30
printk_get_next_message27%of 15
printk_kthreads_check_locked---of 14
printk_kthreads_shutdown---of 14
printk_legacy_allow_panic_sync---of 7
printk_legacy_allow_spinlock_enter100%of 2
printk_legacy_allow_spinlock_exit100%of 2
printk_parse_prefix---of 28
printk_percpu_data_ready100%of 1
printk_sprint28%of 36
printk_timed_ratelimit---of 4
printk_trigger_flush---of 1
rcu_lock_acquire---of 2
rcu_lock_release100%of 2
rcu_try_lock_acquire100%of 2
record_print_text59%of 12
register_console---of 59
syslog_print---of 29
syslog_print_all---of 20
this_cpu_in_panic100%of 1
trace_event_raw_event_console---of 8
trace_raw_output_console---of 3
try_enable_preferred_console---of 21
unregister_console---of 5
unregister_console_locked---of 64
vprintk_default100%of 1
vprintk_deferred---of 1
vprintk_emit58%of 21
vprintk_store40%of 50
wake_up_klogd---of 1
wake_up_klogd_work_func---of 12
-----------
SUMMARY43%of 301

-----------
SUMMARY---of 0

get_gic_ap0r---of 3
get_gic_ap1r---of 3
get_gic_bpr0---of 1
get_gic_bpr1---of 3
get_gic_ctlr---of 1
get_gic_grpen0---of 1
get_gic_grpen1---of 1
get_gic_pmr---of 1
get_gic_sre---of 1
set_gic_ap0r---of 3
set_gic_ap1r67%of 3
set_gic_bpr0---of 1
set_gic_bpr1---of 3
set_gic_ctlr---of 6
set_gic_grpen0---of 1
set_gic_grpen1---of 1
set_gic_pmr---of 1
set_gic_sre---of 1
vgic_v3_cpu_sysregs_uaccess100%of 3
vgic_v3_has_cpu_sysregs_attr100%of 1
-----------
SUMMARY86%of 7

-----------
SUMMARY---of 0

__generic_remap_file_range_prep---of 40
fsnotify_access---of 9
fsnotify_modify---of 9
generic_remap_check_len---of 5
generic_remap_file_range_prep---of 1
kmap_local_folio---of 35
rcu_lock_acquire---of 2
rcu_lock_release---of 2
remap_verify_area---of 20
vfs_clone_file_range9%of 45
vfs_dedupe_file_range19%of 22
vfs_dedupe_file_range_compare---of 61
vfs_dedupe_file_range_one---of 25
-----------
SUMMARY12%of 67

-----------
SUMMARY---of 0

__addrconf_sysctl_register---of 10
__ipv6_chk_addr_and_flags---of 32
__ipv6_dev_get_saddr---of 14
__ipv6_ifa_notify---of 33
__ipv6_isatap_ifid---of 9
_inline_copy_from_user---of 8
add_addr---of 11
add_v4_addrs---of 42
addrconf_add_dev---of 9
addrconf_add_ifaddr---of 12
addrconf_add_linklocal---of 17
addrconf_addr_gen---of 14
addrconf_cleanup---of 11
addrconf_dad_completed---of 39
addrconf_dad_failure---of 25
addrconf_dad_kick---of 8
addrconf_dad_run---of 9
addrconf_dad_stop---of 28
addrconf_dad_work---of 52
addrconf_del_ifaddr---of 4
addrconf_disable_policy_idev---of 21
addrconf_exit_net---of 9
addrconf_get_prefix_route---of 40
addrconf_ifdown---of 103
addrconf_init_auto_addrs---of 51
addrconf_init_net---of 12
addrconf_join_anycast---of 6
addrconf_join_solict---of 3
addrconf_leave_anycast---of 6
addrconf_leave_solict---of 3
addrconf_link_ready43%of 7
addrconf_mod_dad_work---of 12
addrconf_mod_rs_timer---of 7
addrconf_notify7%of 72
addrconf_permanent_addr---of 37
addrconf_prefix_rcv---of 53
addrconf_prefix_rcv_add_addr---of 23
addrconf_prefix_route---of 4
addrconf_rs_timer---of 14
addrconf_set_dstaddr---of 9
addrconf_sysctl_addr_gen_mode---of 31
addrconf_sysctl_disable---of 22
addrconf_sysctl_disable_policy---of 19
addrconf_sysctl_forward---of 26
addrconf_sysctl_ignore_routes_with_linkdown---of 21
addrconf_sysctl_mtu---of 3
addrconf_sysctl_proxy_ndp---of 8
addrconf_sysctl_register---of 6
addrconf_sysctl_stable_secret---of 21
addrconf_sysctl_unregister---of 3
addrconf_verify_rtnl---of 104
addrconf_verify_work---of 1
check_cleanup_prefix_route---of 16
cleanup_prefix_route---of 19
delete_tempaddrs---of 10
dev_forward_change---of 20
fib6_add_gc_list---of 10
fib6_info_release---of 8
if6_proc_exit---of 1
if6_proc_net_exit---of 1
if6_proc_net_init---of 1
if6_seq_next---of 5
if6_seq_show---of 1
if6_seq_start---of 15
if6_seq_stop---of 6
in6_dev_get---of 22
in6_dev_hold---of 6
in6_dev_put---of 6
in6_dump_addrs---of 42
in6_ifa_hold---of 6
in6_ifa_put---of 6
inet6_addr_add---of 37
inet6_addr_del---of 30
inet6_addr_modify---of 46
inet6_dump_addr---of 72
inet6_dump_ifacaddr---of 1
inet6_dump_ifaddr---of 1
inet6_dump_ifinfo---of 36
inet6_dump_ifmcaddr---of 1
inet6_fill_ifacaddr---of 32
inet6_fill_ifaddr---of 28
inet6_fill_ifinfo---of 20
inet6_fill_ifla6_attrs22%of 14
inet6_fill_ifmcaddr---of 15
inet6_fill_link_af34%of 9
inet6_get_link_af_size29%of 7
inet6_ifa_finish_destroy---of 13
inet6_ifinfo_notify---of 5
inet6_netconf_dump_devconf---of 35
inet6_netconf_fill_devconf---of 25
inet6_netconf_get_devconf---of 44
inet6_netconf_notify_devconf---of 14
inet6_rtm_deladdr---of 11
inet6_rtm_getaddr---of 58
inet6_rtm_newaddr---of 50
inet6_set_link_af---of 35
inet6_validate_link_af---of 19
ipv6_add_addr---of 36
ipv6_add_addr_hash---of 12
ipv6_add_dev---of 67
ipv6_chk_addr---of 1
ipv6_chk_addr_and_flags---of 1
ipv6_chk_custom_prefix---of 28
ipv6_chk_home_addr---of 17
ipv6_chk_prefix---of 28
ipv6_chk_rpl_srh_loop---of 19
ipv6_count_addresses---of 12
ipv6_create_tempaddr---of 53
ipv6_del_addr---of 43
ipv6_dev_find---of 1
ipv6_dev_get_saddr---of 62
ipv6_find_idev---of 15
ipv6_generate_eui64---of 23
ipv6_generate_stable_address---of 14
ipv6_get_ifaddr---of 27
ipv6_get_lladdr---of 25
ipv6_get_saddr_eval---of 30
ipv6_inherit_eui64---of 8
ipv6_link_dev_addr---of 6
l3mdev_fib_table---of 11
local_bh_disable---of 2
local_bh_enable---of 2
manage_tempaddrs---of 16
modify_prefix_route---of 24
netdev_lock_ops---of 5
netdev_unlock_ops---of 5
nlmsg_parse_deprecated_strict---of 4
rcu_lock_acquire---of 2
rcu_lock_release---of 2
rcu_read_unlock---of 6
rfc3315_s14_backoff_update---of 5
snmp6_fill_stats56%of 9
-----------
SUMMARY18%of 118

__irq_work_queue_local20%of 20
irq_work_needs_cpu---of 6
irq_work_queue67%of 6
irq_work_queue_on---of 13
irq_work_run---of 11
irq_work_single---of 5
irq_work_sync36%of 14
irq_work_tick---of 6
-----------
SUMMARY33%of 40

HAS_UNMAPPED_ID67%of 3
__arm64_sys_link---of 1
__arm64_sys_linkat---of 1
__arm64_sys_mkdir---of 1
__arm64_sys_mkdirat---of 1
__arm64_sys_mknod---of 1
__arm64_sys_mknodat---of 1
__arm64_sys_rename---of 1
__arm64_sys_renameat---of 1
__arm64_sys_renameat2---of 1
__arm64_sys_rmdir---of 1
__arm64_sys_symlink---of 1
__arm64_sys_symlinkat---of 1
__arm64_sys_unlink---of 1
__arm64_sys_unlinkat---of 4
__check_sticky---of 4
__filename_parentat---of 24
__getname_maybe_null---of 10
__kern_path_locked---of 6
__lookup_slow42%of 12
__page_get_link---of 21
__traverse_mounts29%of 28
check_acl---of 11
choose_mountpoint---of 34
choose_mountpoint_rcu---of 9
complete_walk62%of 13
d_delete_notify---of 4
do_file_open_root---of 19
do_filp_open62%of 13
do_linkat---of 19
do_mkdirat---of 20
do_mknodat---of 25
do_o_path60%of 5
do_renameat2---of 41
do_rmdir---of 16
do_symlinkat---of 9
do_tmpfile43%of 7
do_unlinkat---of 20
done_path_create---of 3
dont_mount---of 1
filename_create---of 10
filename_lookup---of 20
follow_down---of 5
follow_down_one---of 4
follow_up---of 4
fsnotify_create---of 7
fsnotify_link---of 10
fsnotify_link_count---of 4
fsnotify_move---of 20
full_name_hash80%of 5
generic_permission39%of 26
getname_flags35%of 20
getname_kernel---of 9
getname_uflags---of 1
handle_dots27%of 42
handle_lookup_down---of 4
hashlen_string---of 4
inode_permission45%of 20
kern_path---of 1
kern_path_create---of 1
kern_path_locked---of 1
kern_path_locked_negative---of 7
kernel_tmpfile_open---of 4
leave_rcu34%of 6
legitimize_links12%of 18
legitimize_path---of 5
link_path_walk53%of 34
lock_rename---of 3
lock_rename_child---of 6
lock_two_directories---of 7
lookup_fast25%of 16
lookup_one---of 9
lookup_one_common44%of 16
lookup_one_len45%of 9
lookup_one_len_unlocked---of 1
lookup_one_positive_unlocked---of 4
lookup_one_qstr_excl---of 5
lookup_one_qstr_excl_raw---of 10
lookup_one_unlocked43%of 7
lookup_positive_unlocked75%of 4
lookup_slow100%of 1
may_create---of 11
may_delete---of 25
may_linkat---of 9
may_open35%of 26
may_open_dev---of 3
nd_alloc_stack---of 3
nd_jump_link---of 6
nd_jump_root39%of 13
page_get_link---of 3
page_get_link_raw---of 1
page_put_link---of 9
page_readlink---of 5
page_symlink---of 10
path_get67%of 3
path_init30%of 57
path_lookupat65%of 17
path_openat49%of 183
path_parentat---of 4
path_pts---of 9
path_put100%of 1
pfn_valid---of 31
pick_link---of 45
put_link---of 5
putname30%of 10
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
readlink_copy---of 12
seqcount_lockdep_reader_access60%of 5
set_root42%of 12
step_into48%of 44
terminate_walk50%of 12
try_break_deleg---of 7
try_lookup_one_len---of 7
try_to_unlazy32%of 16
try_to_unlazy_next14%of 15
unlock_rename---of 3
user_path_at---of 1
user_path_create---of 1
user_path_locked_at---of 1
vfs_create---of 14
vfs_get_link---of 4
vfs_link---of 18
vfs_mkdir---of 19
vfs_mknod---of 18
vfs_mkobj---of 10
vfs_path_lookup---of 1
vfs_path_parent_lookup---of 1
vfs_readlink---of 11
vfs_rename---of 63
vfs_rmdir---of 17
vfs_symlink---of 11
vfs_tmpfile20%of 20
vfs_unlink---of 23
walk_component36%of 14
-----------
SUMMARY42%of 757

add_to_page_cache_lru---of 3
clear_page_dirty_for_io---of 3
end_page_writeback---of 3
mark_page_accessed67%of 3
pagecache_get_page---of 5
redirty_page_for_writepage---of 3
set_page_dirty---of 3
set_page_dirty_lock---of 3
set_page_writeback---of 3
unlock_page67%of 3
wait_on_page_writeback---of 3
-----------
SUMMARY67%of 6

alloc_ucounts---of 12
dec_rlimit_put_ucounts56%of 9
dec_rlimit_ucounts---of 8
dec_ucount---of 8
find_ucounts---of 21
inc_rlimit_get_ucounts26%of 27
inc_rlimit_ucounts---of 6
inc_ucount---of 12
is_rlimit_overlimit---of 5
put_ucounts24%of 25
rcu_lock_acquire---of 2
rcu_lock_release---of 2
retire_userns_sysctls---of 1
set_is_seen---of 1
set_lookup---of 1
set_permissions---of 1
setup_userns_sysctls---of 5
-----------
SUMMARY30%of 61

__create_hyp_mappings---of 8
__create_hyp_private_mapping---of 26
__unmap_stage2_range54%of 15
clean_dcache_guest_page50%of 4
create_hyp_exec_mappings---of 4
create_hyp_io_mappings---of 7
create_hyp_mappings---of 25
create_hyp_stack---of 14
free_hyp_memcache---of 14
hyp_alloc_private_va_range---of 7
invalidate_icache_guest_page34%of 6
kvm_age_gfn---of 5
kvm_arch_commit_memory_region41%of 22
kvm_arch_flush_remote_tlbs---of 8
kvm_arch_flush_remote_tlbs_range40%of 5
kvm_arch_flush_shadow_memslot100%of 1
kvm_arch_free_memslot100%of 1
kvm_arch_memslots_updated100%of 1
kvm_arch_mmu_enable_log_dirty_pt_masked---of 13
kvm_arch_prepare_memory_region64%of 22
kvm_free_stage2_pgd---of 5
kvm_get_idmap_vector---of 1
kvm_handle_guest_abort34%of 214
kvm_host_get_page43%of 7
kvm_host_pa100%of 1
kvm_host_page_count67%of 3
kvm_host_put_page---of 9
kvm_host_va67%of 3
kvm_hyp_zalloc_page---of 1
kvm_init_stage2_mmu35%of 26
kvm_memslots23%of 9
kvm_mmu_get_httbr---of 1
kvm_mmu_split_huge_pages18%of 17
kvm_pgtable_stage2_init100%of 1
kvm_phys_addr_ioremap---of 11
kvm_s2_free_pages_exact58%of 7
kvm_s2_put_page56%of 18
kvm_s2_zalloc_pages_exact50%of 8
kvm_set_way_flush---of 15
kvm_share_hyp8%of 26
kvm_stage2_flush_range---of 8
kvm_stage2_unmap_range---of 1
kvm_stage2_wp_range---of 8
kvm_test_age_gfn---of 5
kvm_toggle_cache---of 18
kvm_uninit_stage2_mmu40%of 5
kvm_unmap_gfn_range100%of 3
kvm_unshare_hyp9%of 24
mmap_read_unlock67%of 3
pfn_valid---of 31
rcu_lock_acquire---of 2
rcu_lock_release---of 2
sanitise_mte_tags---of 41
srcu_lock_acquire100%of 2
srcu_lock_release100%of 2
stage2_flush_vm---of 16
stage2_free_unlinked_table---of 1
stage2_free_unlinked_table_rcu_cb---of 7
stage2_memcache_zalloc_page50%of 8
stage2_unmap_vm---of 20
topup_hyp_memcache---of 17
transparent_hugepage_adjust45%of 18
-----------
SUMMARY38%of 482

__import_iovec---of 36
__iov_iter_get_pages_alloc---of 28
_copy_from_iter11%of 64
_copy_from_iter_flushcache---of 51
_copy_from_iter_nocache---of 56
_copy_to_iter---of 70
access_ok---of 4
bvec_npages---of 7
copy_compat_iovec_from_user---of 10
copy_page_from_iter58%of 14
copy_page_from_iter_atomic11%of 75
copy_page_to_iter---of 15
copy_page_to_iter_nofault---of 69
dup_iter---of 3
fault_in_iov_iter_readable16%of 13
fault_in_iov_iter_writeable---of 13
folio_get---of 4
folio_size---of 4
import_iovec---of 1
import_ubuf---of 7
iov_iter_advance---of 11
iov_iter_aligned_bvec---of 8
iov_iter_aligned_iovec---of 10
iov_iter_alignment---of 9
iov_iter_alignment_bvec---of 4
iov_iter_alignment_iovec---of 10
iov_iter_bvec---of 3
iov_iter_bvec_advance---of 6
iov_iter_discard---of 3
iov_iter_extract_bvec_pages---of 17
iov_iter_extract_folioq_pages---of 16
iov_iter_extract_kvec_pages---of 17
iov_iter_extract_pages---of 22
iov_iter_extract_xarray_pages---of 30
iov_iter_folio_queue---of 3
iov_iter_folioq_advance---of 9
iov_iter_gap_alignment---of 9
iov_iter_get_pages2---of 4
iov_iter_get_pages_alloc2---of 3
iov_iter_init---of 3
iov_iter_iovec_advance---of 10
iov_iter_is_aligned---of 13
iov_iter_kvec---of 3
iov_iter_npages---of 9
iov_iter_restore---of 7
iov_iter_revert---of 15
iov_iter_single_seg_count---of 9
iov_iter_xarray---of 3
iov_iter_zero---of 64
iov_npages---of 13
iovec_from_user---of 19
iter_folioq_get_pages---of 21
iter_xarray_get_pages---of 38
kmap_local_folio---of 5
kmap_local_page---of 5
pfn_valid23%of 31
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
rcu_read_unlock---of 6
want_pages_array---of 6
xas_next---of 10
xas_next_entry---of 15
xas_reload---of 20
-----------
SUMMARY18%of 201

context_destroy---of 1
context_to_sid19%of 27
hlist_add_head_rcu67%of 3
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
sidtab_cancel_convert---of 1
sidtab_context_to_sid30%of 17
sidtab_convert---of 8
sidtab_convert_hashtable---of 14
sidtab_convert_tree---of 15
sidtab_destroy---of 7
sidtab_destroy_entry---of 1
sidtab_destroy_tree---of 7
sidtab_do_lookup30%of 27
sidtab_freeze_begin---of 1
sidtab_freeze_end---of 1
sidtab_hash_stats---of 17
sidtab_init---of 1
sidtab_search_core36%of 17
sidtab_search_entry100%of 1
sidtab_search_entry_force100%of 1
sidtab_set_initial---of 10
sidtab_sid2str_get37%of 22
sidtab_sid2str_put37%of 19
-----------
SUMMARY35%of 138

__mas_set_range36%of 14
__split_vma63%of 37
__vm_munmap50%of 14
can_vma_merge_left50%of 22
can_vma_merge_right50%of 22
commit_merge45%of 9
copy_vma---of 29
do_brk_flags---of 33
do_vmi_align_munmap67%of 6
do_vmi_munmap50%of 12
expand_downwards46%of 33
find_mergeable_anon_vma58%of 26
get_file50%of 6
init_multi_vma_prep44%of 16
mm_drop_all_locks60%of 22
mm_take_all_locks50%of 52
mmap_region57%of 104
mmap_write_unlock50%of 6
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
reattach_vmas60%of 10
remove_vma---of 8
unlink_file_vma80%of 5
unlink_file_vma_batch_add100%of 5
unlink_file_vma_batch_final100%of 3
unlink_file_vma_batch_init100%of 1
unlink_file_vma_batch_process67%of 9
unmap_region72%of 7
unmapped_area---of 14
unmapped_area_topdown---of 14
validate_mm45%of 18
vma_complete47%of 43
vma_expand50%of 38
vma_iter_store_overwrite40%of 23
vma_link---of 17
vma_link_file---of 5
vma_merge_existing_range---of 100
vma_merge_extend---of 3
vma_merge_new_range48%of 23
vma_modify---of 15
vma_modify_flags---of 1
vma_modify_flags_name---of 1
vma_modify_flags_uffd---of 3
vma_modify_policy---of 1
vma_needs_dirty_tracking---of 10
vma_prepare56%of 18
vma_shrink---of 14
vma_wants_writenotify34%of 12
vms_clear_ptes100%of 4
vms_complete_munmap_vmas69%of 29
vms_gather_munmap_vmas58%of 54
-----------
SUMMARY55%of 707

-----------
SUMMARY---of 0

__folio_batch_add_and_move34%of 24
__folio_batch_release100%of 3
__folio_put48%of 17
__lru_add_drain_all---of 18
__lru_cache_activate_folio---of 8
__page_cache_release29%of 32
__probestub_mm_lru_activate---of 1
__probestub_mm_lru_insertion---of 1
__traceiter_mm_lru_activate---of 4
__traceiter_mm_lru_insertion---of 4
deactivate_file_folio---of 6
folio_activate---of 5
folio_add_lru31%of 13
folio_add_lru_vma60%of 5
folio_batch_move_lru54%of 15
folio_batch_remove_exceptionals67%of 6
folio_deactivate---of 8
folio_large_mapcount---of 4
folio_mark_accessed28%of 33
folio_mark_lazyfree---of 9
folio_memcg20%of 10
folio_rotate_reclaimable---of 6
folios_put_refs42%of 31
local_lock_acquire34%of 6
local_lock_release43%of 7
lru_activate---of 61
lru_add36%of 67
lru_add_drain100%of 3
lru_add_drain_all---of 1
lru_add_drain_cpu45%of 18
lru_add_drain_cpu_zone---of 3
lru_add_drain_per_cpu---of 3
lru_cache_disable---of 3
lru_deactivate---of 51
lru_deactivate_file---of 79
lru_gen_add_folio36%of 25
lru_gen_clear_refs---of 16
lru_gen_del_folio36%of 17
lru_gen_update_size22%of 32
lru_lazyfree---of 56
lru_move_tail---of 42
lru_note_cost---of 10
lru_note_cost_refault---of 14
perf_trace_mm_lru_activate---of 6
perf_trace_mm_lru_insertion---of 17
rcu_lock_acquire100%of 2
rcu_lock_release100%of 2
release_pages---of 12
trace_event_raw_event_mm_lru_activate---of 7
trace_event_raw_event_mm_lru_insertion---of 18
trace_raw_output_mm_lru_activate---of 3
trace_raw_output_mm_lru_insertion---of 3
-----------
SUMMARY38%of 368

-----------
SUMMARY---of 0

__arm64_compat_sys_old_select---of 3
__arm64_compat_sys_ppoll_time32---of 8
__arm64_compat_sys_ppoll_time64---of 8
__arm64_compat_sys_pselect6_time32---of 9
__arm64_compat_sys_pselect6_time64---of 9
__arm64_compat_sys_select---of 1
__arm64_sys_poll---of 7
__arm64_sys_ppoll---of 8
__arm64_sys_pselect6---of 16
__arm64_sys_select---of 6
__pollwait---of 13
_inline_copy_from_user---of 8
compat_core_sys_select---of 48
core_sys_select---of 52
do_compat_pselect---of 11
do_compat_select---of 6
do_restart_poll---of 5
do_select---of 73
do_sys_poll---of 54
poll_freewait---of 8
poll_initwait---of 1
poll_schedule_timeout---of 7
poll_select_finish---of 33
poll_select_set_timeout---of 4
pollwake50%of 4
rcu_lock_acquire---of 2
rcu_lock_release---of 2
select_estimate_accuracy---of 5
set_fd_set---of 5
-----------
SUMMARY50%of 4

-----------
SUMMARY---of 0

ebitmap_and---of 30
ebitmap_contains---of 30
ebitmap_cpy25%of 8
ebitmap_destroy40%of 5
ebitmap_equal34%of 6
ebitmap_get_bit72%of 7
ebitmap_hash50%of 4
ebitmap_netlbl_export---of 28
ebitmap_netlbl_import---of 16
ebitmap_read---of 23
ebitmap_set_bit---of 18
ebitmap_write---of 31
-----------
SUMMARY44%of 30

-----------
SUMMARY---of 0